ia64/xen-unstable

view xen/arch/x86/setup.c @ 16856:cff4c8a1aa28

New XEN_DOMCTL_set_target
Stubdomains (and probably other domain disaggregation elements too)
need to be able to tinker with another domain. This adds IS_PRIV_FOR,
which extends IS_PRIV by allowing domains to have privileges over a
given "target" domain. XEN_DOMCTL_set_target permits setting this
"target". A new 'target' configuration option makes the domain builder
use it.

Signed-off-by: Samuel Thibault <samuel.thibault@eu.citrix.com>
author Keir Fraser <keir.fraser@citrix.com>
date Wed Jan 23 13:21:44 2008 +0000 (2008-01-23)
parents 257ca4017b41
children 76601c290fa9
line source
1 #include <xen/config.h>
2 #include <xen/init.h>
3 #include <xen/lib.h>
4 #include <xen/sched.h>
5 #include <xen/domain.h>
6 #include <xen/serial.h>
7 #include <xen/softirq.h>
8 #include <xen/acpi.h>
9 #include <xen/console.h>
10 #include <xen/serial.h>
11 #include <xen/trace.h>
12 #include <xen/multiboot.h>
13 #include <xen/domain_page.h>
14 #include <xen/version.h>
15 #include <xen/gdbstub.h>
16 #include <xen/percpu.h>
17 #include <xen/hypercall.h>
18 #include <xen/keyhandler.h>
19 #include <xen/numa.h>
20 #include <xen/rcupdate.h>
21 #include <xen/vga.h>
22 #include <xen/dmi.h>
23 #include <public/version.h>
24 #ifdef CONFIG_COMPAT
25 #include <compat/platform.h>
26 #include <compat/xen.h>
27 #endif
28 #include <asm/bitops.h>
29 #include <asm/smp.h>
30 #include <asm/processor.h>
31 #include <asm/mpspec.h>
32 #include <asm/apic.h>
33 #include <asm/desc.h>
34 #include <asm/paging.h>
35 #include <asm/e820.h>
36 #include <xsm/acm/acm_hooks.h>
37 #include <xen/kexec.h>
38 #include <asm/edd.h>
39 #include <xsm/xsm.h>
40 #include <asm/tboot.h>
/*
 * Extent of the low direct-map usable before the boot allocator is up,
 * and how a machine address is converted into it.
 */
#if defined(CONFIG_X86_64)
#define BOOTSTRAP_DIRECTMAP_END (1UL << 32) /* 4GB */
#define maddr_to_bootstrap_virt(m) maddr_to_virt(m)
#else
#define BOOTSTRAP_DIRECTMAP_END (1UL << 30) /* 1GB */
#define maddr_to_bootstrap_virt(m) ((void *)(long)(m))
#endif

extern void generic_apic_probe(void);
extern void numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn);

/* Filled in by the real-mode boot code (VESA/EDID probing). */
extern u16 boot_edid_caps;
extern u8 boot_edid_info[128];
extern struct boot_video_info boot_vid_info;

/*
 * opt_xenheap_megabytes: Size of Xen heap in megabytes, excluding the
 * page_info table and allocation bitmap.
 */
static unsigned int opt_xenheap_megabytes = XENHEAP_DEFAULT_MB;
#if defined(CONFIG_X86_64)
integer_param("xenheap_megabytes", opt_xenheap_megabytes);
#endif

/* opt_nosmp: If true, secondary processors are ignored. */
static int opt_nosmp = 0;
boolean_param("nosmp", opt_nosmp);

/* maxcpus: maximum number of CPUs to activate. */
static unsigned int max_cpus = NR_CPUS;
integer_param("maxcpus", max_cpus);

/* opt_watchdog: If true, run a watchdog NMI on each processor. */
static int opt_watchdog = 0;
boolean_param("watchdog", opt_watchdog);

/* **** Linux config option: propagated to domain0. */
/* "acpi=off": Disables both ACPI table parsing and interpreter. */
/* "acpi=force": Override the disable blacklist. */
/* "acpi=strict": Disables out-of-spec workarounds. */
/* "acpi=ht": Limit ACPI just to boot-time to enable HT. */
/* "acpi=noirq": Disables ACPI interrupt routing. */
static void parse_acpi_param(char *s);
custom_param("acpi", parse_acpi_param);

/* **** Linux config option: propagated to domain0. */
/* acpi_skip_timer_override: Skip IRQ0 overrides. */
extern int acpi_skip_timer_override;
boolean_param("acpi_skip_timer_override", acpi_skip_timer_override);

/* **** Linux config option: propagated to domain0. */
/* noapic: Disable IOAPIC setup. */
extern int skip_ioapic_setup;
boolean_param("noapic", skip_ioapic_setup);

/* Cleared in __start_xen() once the boot allocator is initialised. */
int early_boot = 1;

cpumask_t cpu_present_map;

/* Physical address Xen was relocated to (x86/64); 0 until relocation. */
unsigned long xen_phys_start;

/* Limits of Xen heap, used to initialise the allocator. */
unsigned long xenheap_phys_start, xenheap_phys_end;

extern void arch_init_memory(void);
extern void init_IRQ(void);
extern void early_time_init(void);
extern void early_cpu_init(void);
extern void vesa_init(void);
extern void vesa_mtrr_init(void);

/* Per-CPU TSS array, indexed by CPU number. */
struct tss_struct init_tss[NR_CPUS];

/* Statically-allocated, section-aligned stack for the boot CPU. */
char __attribute__ ((__section__(".bss.stack_aligned"))) cpu0_stack[STACK_SIZE];

struct cpuinfo_x86 boot_cpu_data = { 0, 0, 0, 0, -1 };

#if CONFIG_PAGING_LEVELS > 2
unsigned long mmu_cr4_features = X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE;
#else
unsigned long mmu_cr4_features = X86_CR4_PSE;
#endif
EXPORT_SYMBOL(mmu_cr4_features);

int acpi_disabled;

int acpi_force;
/* Saved "acpi=" value, later appended to dom0's command line. */
char acpi_param[10] = "";
130 static void __init parse_acpi_param(char *s)
131 {
132 /* Save the parameter so it can be propagated to domain0. */
133 safe_strcpy(acpi_param, s);
135 /* Interpret the parameter for use within Xen. */
136 if ( !strcmp(s, "off") )
137 {
138 disable_acpi();
139 }
140 else if ( !strcmp(s, "force") )
141 {
142 acpi_force = 1;
143 acpi_ht = 1;
144 acpi_disabled = 0;
145 }
146 else if ( !strcmp(s, "strict") )
147 {
148 acpi_strict = 1;
149 }
150 else if ( !strcmp(s, "ht") )
151 {
152 if ( !acpi_force )
153 disable_acpi();
154 acpi_ht = 1;
155 }
156 else if ( !strcmp(s, "noirq") )
157 {
158 acpi_noirq_set();
159 }
160 }
162 static void __init do_initcalls(void)
163 {
164 initcall_t *call;
165 for ( call = &__initcall_start; call < &__initcall_end; call++ )
166 (*call)();
167 }
/*
 * Print a message and halt forever.  For fatal errors hit before enough
 * of the system is up to handle a regular panic().
 */
#define EARLY_FAIL(f, a...) do { \
    printk( f , ## a ); \
    for ( ; ; ) halt(); \
} while (0)

/* Physical extent of the relocated multiboot modules (page-aligned). */
static unsigned long __initdata initial_images_start, initial_images_end;
176 unsigned long __init initial_images_nrpages(void)
177 {
178 ASSERT(!(initial_images_start & ~PAGE_MASK));
179 ASSERT(!(initial_images_end & ~PAGE_MASK));
180 return ((initial_images_end >> PAGE_SHIFT) -
181 (initial_images_start >> PAGE_SHIFT));
182 }
/* Hand the (no longer needed) module area back to the domain heap. */
void __init discard_initial_images(void)
{
    init_domheap_pages(initial_images_start, initial_images_end);
}
extern char __per_cpu_start[], __per_cpu_data_end[], __per_cpu_end[];

/*
 * Copy CPU0's initialised per-cpu data template into the slot of every
 * possible secondary CPU, then guard (and, where applicable, free) the
 * slots of CPUs that can never come online.
 */
static void __init percpu_init_areas(void)
{
    unsigned int i, data_size = __per_cpu_data_end - __per_cpu_start;
    unsigned int first_unused;

    /* The initialised template must fit inside one per-cpu slot. */
    BUG_ON(data_size > PERCPU_SIZE);

    /* Initialise per-cpu data area for all possible secondary CPUs. */
    for ( i = 1; (i < NR_CPUS) && cpu_possible(i); i++ )
        memcpy(__per_cpu_start + (i << PERCPU_SHIFT),
               __per_cpu_start,
               data_size);
    first_unused = i;

    /* Check that there are no holes in cpu_possible_map. */
    for ( ; i < NR_CPUS; i++ )
        BUG_ON(cpu_possible(i));

#ifndef MEMORY_GUARD
    /* No guard pages: return the unused slots to the Xen heap instead. */
    init_xenheap_pages(__pa(__per_cpu_start) + (first_unused << PERCPU_SHIFT),
                       __pa(__per_cpu_end));
#endif
    memguard_guard_range(&__per_cpu_start[first_unused << PERCPU_SHIFT],
                         (NR_CPUS - first_unused) << PERCPU_SHIFT);
#if defined(CONFIG_X86_64)
    /* Also zap the mapping in the 1:1 area. */
    memguard_guard_range(__va(__pa(__per_cpu_start)) +
                         (first_unused << PERCPU_SHIFT),
                         (NR_CPUS - first_unused) << PERCPU_SHIFT);
#endif
}
/*
 * Create the idle domain and its vcpu0, and install that vcpu as
 * "current" on the boot CPU so the remainder of start-of-day runs in a
 * well-defined execution context.
 */
static void __init init_idle_domain(void)
{
    struct domain *idle_domain;

    /* Domain creation requires that scheduler structures are initialised. */
    scheduler_init();

    idle_domain = domain_create(IDLE_DOMAIN_ID, 0, 0);
    if ( (idle_domain == NULL) || (alloc_vcpu(idle_domain, 0, 0) == NULL) )
        BUG();

    set_current(idle_domain->vcpu[0]);
    idle_vcpu[0] = this_cpu(curr_vcpu) = current;

    setup_idle_pagetable();
}
240 static void __init srat_detect_node(int cpu)
241 {
242 unsigned node;
243 u8 apicid = x86_cpu_to_apicid[cpu];
245 node = apicid_to_node[apicid];
246 if ( node == NUMA_NO_NODE )
247 node = 0;
248 numa_set_node(cpu, node);
250 if ( acpi_numa > 0 )
251 printk(KERN_INFO "CPU %d APIC %d -> Node %d\n", cpu, apicid, node);
252 }
254 /*
255 * Ensure a given physical memory range is present in the bootstrap mappings.
256 * Use superpage mappings to ensure that pagetable memory needn't be allocated.
257 */
258 static void __init bootstrap_map(unsigned long start, unsigned long end)
259 {
260 unsigned long mask = (1UL << L2_PAGETABLE_SHIFT) - 1;
261 start = start & ~mask;
262 end = (end + mask) & ~mask;
263 if ( end > BOOTSTRAP_DIRECTMAP_END )
264 panic("Cannot access memory beyond end of "
265 "bootstrap direct-map area\n");
266 map_pages_to_xen(
267 (unsigned long)maddr_to_bootstrap_virt(start),
268 start >> PAGE_SHIFT, (end-start) >> PAGE_SHIFT, PAGE_HYPERVISOR);
269 }
271 static void __init move_memory(
272 unsigned long dst, unsigned long src_start, unsigned long src_end)
273 {
274 bootstrap_map(src_start, src_end);
275 bootstrap_map(dst, dst + src_end - src_start);
276 memmove(maddr_to_bootstrap_virt(dst),
277 maddr_to_bootstrap_virt(src_start),
278 src_end - src_start);
279 }
/* A temporary copy of the e820 map that we can mess with during bootstrap. */
static struct e820map __initdata boot_e820;

/*
 * Layout of the video data filled in by the real-mode boot code.  The
 * hex comments give each field's byte offset, which must match the
 * assembly side exactly.
 */
struct boot_video_info {
    u8  orig_x;             /* 0x00 */
    u8  orig_y;             /* 0x01 */
    u8  orig_video_mode;    /* 0x02 */
    u8  orig_video_cols;    /* 0x03 */
    u8  orig_video_lines;   /* 0x04 */
    u8  orig_video_isVGA;   /* 0x05 */
    u16 orig_video_points;  /* 0x06 */

    /* VESA graphic mode -- linear frame buffer */
    u32 capabilities;       /* 0x08 */
    u16 lfb_linelength;     /* 0x0c */
    u16 lfb_width;          /* 0x0e */
    u16 lfb_height;         /* 0x10 */
    u16 lfb_depth;          /* 0x12 */
    u32 lfb_base;           /* 0x14 */
    u32 lfb_size;           /* 0x18 */
    u8  red_size;           /* 0x1c */
    u8  red_pos;            /* 0x1d */
    u8  green_size;         /* 0x1e */
    u8  green_pos;          /* 0x1f */
    u8  blue_size;          /* 0x20 */
    u8  blue_pos;           /* 0x21 */
    u8  rsvd_size;          /* 0x22 */
    u8  rsvd_pos;           /* 0x23 */
    u16 vesapm_seg;         /* 0x24 */
    u16 vesapm_off;         /* 0x26 */
    u16 vesa_attrib;        /* 0x28 */
};
/*
 * Repackage the boot-time video data into vga_console_info.  Mode
 * discovery already happened in real mode; this only copies fields.
 */
static void __init parse_video_info(void)
{
    struct boot_video_info *bvi = &bootsym(boot_vid_info);

    /* isVGA == 1 with mode 3 selects the VGA text console path. */
    if ( (bvi->orig_video_isVGA == 1) && (bvi->orig_video_mode == 3) )
    {
        vga_console_info.video_type = XEN_VGATYPE_TEXT_MODE_3;
        vga_console_info.u.text_mode_3.font_height = bvi->orig_video_points;
        vga_console_info.u.text_mode_3.cursor_x = bvi->orig_x;
        vga_console_info.u.text_mode_3.cursor_y = bvi->orig_y;
        vga_console_info.u.text_mode_3.rows = bvi->orig_video_lines;
        vga_console_info.u.text_mode_3.columns = bvi->orig_video_cols;
    }
    /* isVGA == 0x23 selects the VESA linear-frame-buffer path. */
    else if ( bvi->orig_video_isVGA == 0x23 )
    {
        vga_console_info.video_type = XEN_VGATYPE_VESA_LFB;
        vga_console_info.u.vesa_lfb.width = bvi->lfb_width;
        vga_console_info.u.vesa_lfb.height = bvi->lfb_height;
        vga_console_info.u.vesa_lfb.bytes_per_line = bvi->lfb_linelength;
        vga_console_info.u.vesa_lfb.bits_per_pixel = bvi->lfb_depth;
        vga_console_info.u.vesa_lfb.lfb_base = bvi->lfb_base;
        vga_console_info.u.vesa_lfb.lfb_size = bvi->lfb_size;
        vga_console_info.u.vesa_lfb.red_pos = bvi->red_pos;
        vga_console_info.u.vesa_lfb.red_size = bvi->red_size;
        vga_console_info.u.vesa_lfb.green_pos = bvi->green_pos;
        vga_console_info.u.vesa_lfb.green_size = bvi->green_size;
        vga_console_info.u.vesa_lfb.blue_pos = bvi->blue_pos;
        vga_console_info.u.vesa_lfb.blue_size = bvi->blue_size;
        vga_console_info.u.vesa_lfb.rsvd_pos = bvi->rsvd_pos;
        vga_console_info.u.vesa_lfb.rsvd_size = bvi->rsvd_size;
        vga_console_info.u.vesa_lfb.gbl_caps = bvi->capabilities;
        vga_console_info.u.vesa_lfb.mode_attrs = bvi->vesa_attrib;
    }
}
349 void __init kexec_reserve_area(struct e820map *e820)
350 {
351 unsigned long kdump_start = kexec_crash_area.start;
352 unsigned long kdump_size = kexec_crash_area.size;
353 static int is_reserved = 0;
355 kdump_size = (kdump_size + PAGE_SIZE - 1) & PAGE_MASK;
357 if ( (kdump_start == 0) || (kdump_size == 0) || is_reserved )
358 return;
360 is_reserved = 1;
362 if ( !reserve_e820_ram(e820, kdump_start, kdump_size) )
363 {
364 printk("Kdump: DISABLED (failed to reserve %luMB (%lukB) at 0x%lx)"
365 "\n", kdump_size >> 20, kdump_size >> 10, kdump_start);
366 kexec_crash_area.start = kexec_crash_area.size = 0;
367 }
368 else
369 {
370 printk("Kdump: %luMB (%lukB) at 0x%lx\n",
371 kdump_size >> 20, kdump_size >> 10, kdump_start);
372 }
373 }
/*
 * Final step of boot: release (or page-protect) the .init sections and
 * drop into the idle loop.  Does not return.
 */
void init_done(void)
{
    extern char __init_begin[], __init_end[];

    /* Free (or page-protect) the init areas. */
#ifndef MEMORY_GUARD
    init_xenheap_pages(__pa(__init_begin), __pa(__init_end));
#endif
    memguard_guard_range(__init_begin, __init_end - __init_begin);
#if defined(CONFIG_X86_64)
    /* Also zap the mapping in the 1:1 area. */
    memguard_guard_range(__va(__pa(__init_begin)), __init_end - __init_begin);
#endif
    printk("Freed %ldkB init memory.\n", (long)(__init_end-__init_begin)>>10);

    startup_cpu_idle_loop();
}
393 static char * __init cmdline_cook(char *p)
394 {
395 p = p ? : "";
396 while ( *p == ' ' )
397 p++;
398 while ( (*p != ' ') && (*p != '\0') )
399 p++;
400 while ( *p == ' ' )
401 p++;
402 return p;
403 }
405 void __init __start_xen(unsigned long mbi_p)
406 {
407 char *memmap_type = NULL;
408 char *cmdline, *kextra;
409 unsigned long _initrd_start = 0, _initrd_len = 0;
410 unsigned int initrdidx = 1;
411 multiboot_info_t *mbi = __va(mbi_p);
412 module_t *mod = (module_t *)__va(mbi->mods_addr);
413 unsigned long nr_pages, modules_length;
414 int i, e820_warn = 0, bytes = 0;
415 struct ns16550_defaults ns16550 = {
416 .data_bits = 8,
417 .parity = 'n',
418 .stop_bits = 1
419 };
421 extern void early_page_fault(void);
422 set_intr_gate(TRAP_page_fault, &early_page_fault);
424 /* Parse the command-line options. */
425 cmdline = cmdline_cook((mbi->flags & MBI_CMDLINE) ?
426 __va(mbi->cmdline) : NULL);
427 if ( (kextra = strstr(cmdline, " -- ")) != NULL )
428 {
429 /*
430 * Options after ' -- ' separator belong to dom0.
431 * 1. Orphan dom0's options from Xen's command line.
432 * 2. Skip all but final leading space from dom0's options.
433 */
434 *kextra = '\0';
435 kextra += 3;
436 while ( kextra[1] == ' ' ) kextra++;
437 }
438 cmdline_parse(cmdline);
440 parse_video_info();
442 set_current((struct vcpu *)0xfffff000); /* debug sanity */
443 idle_vcpu[0] = current;
444 set_processor_id(0); /* needed early, for smp_processor_id() */
445 if ( cpu_has_efer )
446 rdmsrl(MSR_EFER, this_cpu(efer));
447 asm volatile ( "mov %%cr4,%0" : "=r" (this_cpu(cr4)) );
449 smp_prepare_boot_cpu();
451 /* We initialise the serial devices very early so we can get debugging. */
452 ns16550.io_base = 0x3f8;
453 ns16550.irq = 4;
454 ns16550_init(0, &ns16550);
455 ns16550.io_base = 0x2f8;
456 ns16550.irq = 3;
457 ns16550_init(1, &ns16550);
458 serial_init_preirq();
460 init_console();
462 printk("Command line: %s\n", cmdline);
464 printk("Video information:\n");
466 /* Print VGA display mode information. */
467 switch ( vga_console_info.video_type )
468 {
469 case XEN_VGATYPE_TEXT_MODE_3:
470 printk(" VGA is text mode %dx%d, font 8x%d\n",
471 vga_console_info.u.text_mode_3.columns,
472 vga_console_info.u.text_mode_3.rows,
473 vga_console_info.u.text_mode_3.font_height);
474 break;
475 case XEN_VGATYPE_VESA_LFB:
476 printk(" VGA is graphics mode %dx%d, %d bpp\n",
477 vga_console_info.u.vesa_lfb.width,
478 vga_console_info.u.vesa_lfb.height,
479 vga_console_info.u.vesa_lfb.bits_per_pixel);
480 break;
481 default:
482 printk(" No VGA detected\n");
483 break;
484 }
486 /* Print VBE/DDC EDID information. */
487 if ( bootsym(boot_edid_caps) != 0x1313 )
488 {
489 u16 caps = bootsym(boot_edid_caps);
490 printk(" VBE/DDC methods:%s%s%s; ",
491 (caps & 1) ? " V1" : "",
492 (caps & 2) ? " V2" : "",
493 !(caps & 3) ? " none" : "");
494 printk("EDID transfer time: %d seconds\n", caps >> 8);
495 if ( *(u32 *)bootsym(boot_edid_info) == 0x13131313 )
496 {
497 printk(" EDID info not retrieved because ");
498 if ( !(caps & 3) )
499 printk("no DDC retrieval method detected\n");
500 else if ( (caps >> 8) > 5 )
501 printk("takes longer than 5 seconds\n");
502 else
503 printk("of reasons unknown\n");
504 }
505 }
507 printk("Disc information:\n");
508 printk(" Found %d MBR signatures\n",
509 bootsym(boot_mbr_signature_nr));
510 printk(" Found %d EDD information structures\n",
511 bootsym(boot_edd_info_nr));
513 /* Check that we have at least one Multiboot module. */
514 if ( !(mbi->flags & MBI_MODULES) || (mbi->mods_count == 0) )
515 EARLY_FAIL("dom0 kernel not specified. "
516 "Check bootloader configuration.\n");
518 if ( ((unsigned long)cpu0_stack & (STACK_SIZE-1)) != 0 )
519 EARLY_FAIL("Misaligned CPU0 stack.\n");
521 /*
522 * Since there are some stubs getting built on the stacks which use
523 * direct calls/jumps, the heap must be confined to the lower 2G so
524 * that those branches can reach their targets.
525 */
526 if ( opt_xenheap_megabytes > 2048 )
527 opt_xenheap_megabytes = 2048;
529 if ( e820_raw_nr != 0 )
530 {
531 memmap_type = "Xen-e820";
532 }
533 else if ( bootsym(lowmem_kb) )
534 {
535 memmap_type = "Xen-e801";
536 e820_raw[0].addr = 0;
537 e820_raw[0].size = bootsym(lowmem_kb) << 10;
538 e820_raw[0].type = E820_RAM;
539 e820_raw[1].addr = 0x100000;
540 e820_raw[1].size = bootsym(highmem_kb) << 10;
541 e820_raw[1].type = E820_RAM;
542 e820_raw_nr = 2;
543 }
544 else if ( mbi->flags & MBI_MEMMAP )
545 {
546 memmap_type = "Multiboot-e820";
547 while ( (bytes < mbi->mmap_length) && (e820_raw_nr < E820MAX) )
548 {
549 memory_map_t *map = __va(mbi->mmap_addr + bytes);
551 /*
552 * This is a gross workaround for a BIOS bug. Some bootloaders do
553 * not write e820 map entries into pre-zeroed memory. This is
554 * okay if the BIOS fills in all fields of the map entry, but
555 * some broken BIOSes do not bother to write the high word of
556 * the length field if the length is smaller than 4GB. We
557 * detect and fix this by flagging sections below 4GB that
558 * appear to be larger than 4GB in size.
559 */
560 if ( (map->base_addr_high == 0) && (map->length_high != 0) )
561 {
562 if ( !e820_warn )
563 {
564 printk("WARNING: Buggy e820 map detected and fixed "
565 "(truncated length fields).\n");
566 e820_warn = 1;
567 }
568 map->length_high = 0;
569 }
571 e820_raw[e820_raw_nr].addr =
572 ((u64)map->base_addr_high << 32) | (u64)map->base_addr_low;
573 e820_raw[e820_raw_nr].size =
574 ((u64)map->length_high << 32) | (u64)map->length_low;
575 e820_raw[e820_raw_nr].type = map->type;
576 e820_raw_nr++;
578 bytes += map->size + 4;
579 }
580 }
581 else if ( mbi->flags & MBI_MEMLIMITS )
582 {
583 memmap_type = "Multiboot-e801";
584 e820_raw[0].addr = 0;
585 e820_raw[0].size = mbi->mem_lower << 10;
586 e820_raw[0].type = E820_RAM;
587 e820_raw[1].addr = 0x100000;
588 e820_raw[1].size = mbi->mem_upper << 10;
589 e820_raw[1].type = E820_RAM;
590 e820_raw_nr = 2;
591 }
592 else
593 {
594 EARLY_FAIL("Bootloader provided no memory information.\n");
595 }
597 /* Sanitise the raw E820 map to produce a final clean version. */
598 max_page = init_e820(memmap_type, e820_raw, &e820_raw_nr);
600 /* Create a temporary copy of the E820 map. */
601 memcpy(&boot_e820, &e820, sizeof(e820));
603 /* Early kexec reservation (explicit static start address). */
604 kexec_reserve_area(&boot_e820);
606 /*
607 * Iterate backwards over all superpage-aligned RAM regions.
608 *
609 * We require superpage alignment because the boot allocator is not yet
610 * initialised. Hence we can only map superpages in the address range
611 * 0 to BOOTSTRAP_DIRECTMAP_END, as this is guaranteed not to require
612 * dynamic allocation of pagetables.
613 *
614 * As well as mapping superpages in that range, in preparation for
615 * initialising the boot allocator, we also look for a region to which
616 * we can relocate the dom0 kernel and other multiboot modules. Also, on
617 * x86/64, we relocate Xen to higher memory.
618 */
619 modules_length = mod[mbi->mods_count-1].mod_end - mod[0].mod_start;
620 for ( i = boot_e820.nr_map-1; i >= 0; i-- )
621 {
622 uint64_t s, e, mask = (1UL << L2_PAGETABLE_SHIFT) - 1;
624 /* Superpage-aligned chunks from 16MB to BOOTSTRAP_DIRECTMAP_END. */
625 s = (boot_e820.map[i].addr + mask) & ~mask;
626 e = (boot_e820.map[i].addr + boot_e820.map[i].size) & ~mask;
627 s = max_t(uint64_t, s, 16 << 20);
628 e = min_t(uint64_t, e, BOOTSTRAP_DIRECTMAP_END);
629 if ( (boot_e820.map[i].type != E820_RAM) || (s >= e) )
630 continue;
632 /* Map the chunk. No memory will need to be allocated to do this. */
633 map_pages_to_xen(
634 (unsigned long)maddr_to_bootstrap_virt(s),
635 s >> PAGE_SHIFT, (e-s) >> PAGE_SHIFT, PAGE_HYPERVISOR);
637 #if defined(CONFIG_X86_64)
638 /* Is the region suitable for relocating Xen? */
639 if ( !xen_phys_start && (((e-s) >> 20) >= opt_xenheap_megabytes) )
640 {
641 extern l2_pgentry_t l2_xenmap[];
642 l4_pgentry_t *pl4e;
643 l3_pgentry_t *pl3e;
644 l2_pgentry_t *pl2e;
645 int i, j;
647 /* Select relocation address. */
648 e = (e - (opt_xenheap_megabytes << 20)) & ~mask;
649 xen_phys_start = e;
650 bootsym(trampoline_xen_phys_start) = e;
652 /*
653 * Perform relocation to new physical address.
654 * Before doing so we must sync static/global data with main memory
655 * with a barrier(). After this we must *not* modify static/global
656 * data until after we have switched to the relocated pagetables!
657 */
658 barrier();
659 move_memory(e, 0, __pa(&_end) - xen_phys_start);
661 /* Poison low 1MB to detect stray pointers to physical 0-1MB. */
662 memset(maddr_to_bootstrap_virt(e), 0x55, 1U<<20);
664 /* Walk initial pagetables, relocating page directory entries. */
665 pl4e = __va(__pa(idle_pg_table));
666 for ( i = 0 ; i < L4_PAGETABLE_ENTRIES; i++, pl4e++ )
667 {
668 if ( !(l4e_get_flags(*pl4e) & _PAGE_PRESENT) )
669 continue;
670 *pl4e = l4e_from_intpte(l4e_get_intpte(*pl4e) +
671 xen_phys_start);
672 pl3e = l4e_to_l3e(*pl4e);
673 for ( j = 0; j < L3_PAGETABLE_ENTRIES; j++, pl3e++ )
674 {
675 /* Not present or already relocated? */
676 if ( !(l3e_get_flags(*pl3e) & _PAGE_PRESENT) ||
677 (l3e_get_pfn(*pl3e) > 0x1000) )
678 continue;
679 *pl3e = l3e_from_intpte(l3e_get_intpte(*pl3e) +
680 xen_phys_start);
681 }
682 }
684 /* The only data mappings to be relocated are in the Xen area. */
685 pl2e = __va(__pa(l2_xenmap));
686 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++, pl2e++ )
687 {
688 if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) )
689 continue;
690 *pl2e = l2e_from_intpte(l2e_get_intpte(*pl2e) +
691 xen_phys_start);
692 }
694 /* Re-sync the stack and then switch to relocated pagetables. */
695 asm volatile (
696 "rep movsb ; " /* re-sync the stack */
697 "movq %%cr4,%%rsi ; "
698 "andb $0x7f,%%sil ; "
699 "movq %%rsi,%%cr4 ; " /* CR4.PGE == 0 */
700 "movq %0,%%cr3 ; " /* CR3 == new pagetables */
701 "orb $0x80,%%sil ; "
702 "movq %%rsi,%%cr4 " /* CR4.PGE == 1 */
703 : : "r" (__pa(idle_pg_table)), "S" (cpu0_stack),
704 "D" (__va(__pa(cpu0_stack))), "c" (STACK_SIZE) : "memory" );
705 }
706 #endif
708 /* Is the region suitable for relocating the multiboot modules? */
709 if ( !initial_images_start && (s < e) && ((e-s) >= modules_length) )
710 {
711 initial_images_end = e;
712 e = (e - modules_length) & PAGE_MASK;
713 initial_images_start = e;
714 move_memory(initial_images_start,
715 mod[0].mod_start, mod[mbi->mods_count-1].mod_end);
716 }
718 if ( !kexec_crash_area.start && (s < e) &&
719 ((e-s) >= kexec_crash_area.size) )
720 {
721 e = (e - kexec_crash_area.size) & PAGE_MASK;
722 kexec_crash_area.start = e;
723 }
724 }
726 if ( !initial_images_start )
727 EARLY_FAIL("Not enough memory to relocate the dom0 kernel image.\n");
728 reserve_e820_ram(&boot_e820, initial_images_start, initial_images_end);
730 /* Initialise Xen heap and boot heap. */
731 xenheap_phys_start = init_boot_allocator(__pa(&_end));
732 xenheap_phys_end = opt_xenheap_megabytes << 20;
733 #if defined(CONFIG_X86_64)
734 if ( !xen_phys_start )
735 EARLY_FAIL("Not enough memory to relocate Xen.\n");
736 xenheap_phys_end += xen_phys_start;
737 reserve_e820_ram(&boot_e820, xen_phys_start,
738 xen_phys_start + (opt_xenheap_megabytes<<20));
739 #endif
741 /* Late kexec reservation (dynamic start address). */
742 kexec_reserve_area(&boot_e820);
744 /*
745 * With the boot allocator now initialised, we can walk every RAM region
746 * and map it in its entirety (on x86/64, at least) and notify it to the
747 * boot allocator.
748 */
749 for ( i = 0; i < boot_e820.nr_map; i++ )
750 {
751 uint64_t s, e, map_s, map_e, mask = PAGE_SIZE - 1;
753 /* Only page alignment required now. */
754 s = (boot_e820.map[i].addr + mask) & ~mask;
755 e = (boot_e820.map[i].addr + boot_e820.map[i].size) & ~mask;
756 #if defined(CONFIG_X86_32)
757 s = max_t(uint64_t, s, xenheap_phys_end);
758 #else
759 s = max_t(uint64_t, s, 1<<20);
760 #endif
761 if ( (boot_e820.map[i].type != E820_RAM) || (s >= e) )
762 continue;
764 /* Need to create mappings above 16MB. */
765 map_s = max_t(uint64_t, s, 16<<20);
766 map_e = e;
767 #if defined(CONFIG_X86_32) /* mappings are truncated on x86_32 */
768 map_e = min_t(uint64_t, map_e, BOOTSTRAP_DIRECTMAP_END);
769 #endif
771 /* Pass mapped memory to allocator /before/ creating new mappings. */
772 init_boot_pages(s, min_t(uint64_t, map_s, e));
774 /* Create new mappings /before/ passing memory to the allocator. */
775 if ( map_s < map_e )
776 map_pages_to_xen(
777 (unsigned long)maddr_to_bootstrap_virt(map_s),
778 map_s >> PAGE_SHIFT, (map_e-map_s) >> PAGE_SHIFT,
779 PAGE_HYPERVISOR);
781 /* Pass remainder of this memory chunk to the allocator. */
782 init_boot_pages(map_s, e);
783 }
785 memguard_init();
787 nr_pages = 0;
788 for ( i = 0; i < e820.nr_map; i++ )
789 if ( e820.map[i].type == E820_RAM )
790 nr_pages += e820.map[i].size >> PAGE_SHIFT;
791 printk("System RAM: %luMB (%lukB)\n",
792 nr_pages >> (20 - PAGE_SHIFT),
793 nr_pages << (PAGE_SHIFT - 10));
794 total_pages = nr_pages;
796 /* Sanity check for unwanted bloat of certain hypercall structures. */
797 BUILD_BUG_ON(sizeof(((struct xen_platform_op *)0)->u) !=
798 sizeof(((struct xen_platform_op *)0)->u.pad));
799 BUILD_BUG_ON(sizeof(((struct xen_domctl *)0)->u) !=
800 sizeof(((struct xen_domctl *)0)->u.pad));
801 BUILD_BUG_ON(sizeof(((struct xen_sysctl *)0)->u) !=
802 sizeof(((struct xen_sysctl *)0)->u.pad));
804 BUILD_BUG_ON(sizeof(start_info_t) > PAGE_SIZE);
805 BUILD_BUG_ON(sizeof(shared_info_t) > PAGE_SIZE);
806 BUILD_BUG_ON(sizeof(struct vcpu_info) != 64);
808 #ifdef CONFIG_COMPAT
809 BUILD_BUG_ON(sizeof(((struct compat_platform_op *)0)->u) !=
810 sizeof(((struct compat_platform_op *)0)->u.pad));
811 BUILD_BUG_ON(sizeof(start_info_compat_t) > PAGE_SIZE);
812 BUILD_BUG_ON(sizeof(struct compat_vcpu_info) != 64);
813 #endif
815 /* Check definitions in public headers match internal defs. */
816 BUILD_BUG_ON(__HYPERVISOR_VIRT_START != HYPERVISOR_VIRT_START);
817 #ifdef HYPERVISOR_VIRT_END
818 BUILD_BUG_ON(__HYPERVISOR_VIRT_END != HYPERVISOR_VIRT_END);
819 #endif
820 BUILD_BUG_ON(MACH2PHYS_VIRT_START != RO_MPT_VIRT_START);
821 BUILD_BUG_ON(MACH2PHYS_VIRT_END != RO_MPT_VIRT_END);
823 init_frametable();
825 acpi_boot_table_init();
827 acpi_numa_init();
829 numa_initmem_init(0, max_page);
831 /* Initialise the Xen heap, skipping RAM holes. */
832 init_xenheap_pages(xenheap_phys_start, xenheap_phys_end);
833 nr_pages = (xenheap_phys_end - xenheap_phys_start) >> PAGE_SHIFT;
834 #ifdef __x86_64__
835 init_xenheap_pages(xen_phys_start, __pa(&_start));
836 nr_pages += (__pa(&_start) - xen_phys_start) >> PAGE_SHIFT;
837 vesa_init();
838 #endif
839 xenheap_phys_start = xen_phys_start;
840 printk("Xen heap: %luMB (%lukB)\n",
841 nr_pages >> (20 - PAGE_SHIFT),
842 nr_pages << (PAGE_SHIFT - 10));
844 end_boot_allocator();
846 early_boot = 0;
848 early_cpu_init();
850 paging_init();
852 tboot_probe();
854 /* Unmap the first page of CPU0's stack. */
855 memguard_guard_stack(cpu0_stack);
857 open_softirq(NEW_TLBFLUSH_CLOCK_PERIOD_SOFTIRQ, new_tlbflush_clock_period);
859 if ( opt_watchdog )
860 nmi_watchdog = NMI_LOCAL_APIC;
862 sort_exception_tables();
864 find_smp_config();
866 dmi_scan_machine();
868 generic_apic_probe();
870 acpi_boot_init();
872 init_cpu_to_node();
874 if ( smp_found_config )
875 get_smp_config();
877 #ifdef CONFIG_X86_64
878 /* Low mappings were only needed for some BIOS table parsing. */
879 zap_low_mappings();
880 #endif
882 init_apic_mappings();
884 init_IRQ();
886 percpu_init_areas();
888 xsm_init(&initrdidx, mbi, initial_images_start);
890 init_idle_domain();
892 trap_init();
894 rcu_init();
896 timer_init();
898 early_time_init();
900 arch_init_memory();
902 identify_cpu(&boot_cpu_data);
903 if ( cpu_has_fxsr )
904 set_in_cr4(X86_CR4_OSFXSR);
905 if ( cpu_has_xmm )
906 set_in_cr4(X86_CR4_OSXMMEXCPT);
907 #ifdef CONFIG_X86_64
908 vesa_mtrr_init();
909 #endif
911 if ( opt_nosmp )
912 max_cpus = 0;
914 smp_prepare_cpus(max_cpus);
916 /*
917 * Initialise higher-level timer functions. We do this fairly late
918 * (post-SMP) because the time bases and scale factors need to be updated
919 * regularly, and SMP initialisation can cause a long delay with
920 * interrupts not yet enabled.
921 */
922 init_xen_time();
924 initialize_keytable();
926 serial_init_postirq();
928 BUG_ON(!local_irq_is_enabled());
930 for_each_present_cpu ( i )
931 {
932 if ( num_online_cpus() >= max_cpus )
933 break;
934 if ( !cpu_online(i) )
935 {
936 rcu_online_cpu(i);
937 __cpu_up(i);
938 }
940 /* Set up cpu_to_node[]. */
941 srat_detect_node(i);
942 /* Set up node_to_cpumask based on cpu_to_node[]. */
943 numa_add_cpu(i);
944 }
946 printk("Brought up %ld CPUs\n", (long)num_online_cpus());
947 smp_cpus_done(max_cpus);
949 initialise_gdb(); /* could be moved earlier */
951 do_initcalls();
953 if ( opt_watchdog )
954 watchdog_enable();
956 /* Create initial domain 0. */
957 dom0 = domain_create(0, 0, DOM0_SSIDREF);
958 if ( (dom0 == NULL) || (alloc_vcpu(dom0, 0, 0) == NULL) )
959 panic("Error creating domain 0\n");
961 dom0->is_privileged = 1;
962 dom0->target = NULL;
964 /* Grab the DOM0 command line. */
965 cmdline = (char *)(mod[0].string ? __va(mod[0].string) : NULL);
966 if ( (cmdline != NULL) || (kextra != NULL) )
967 {
968 static char dom0_cmdline[MAX_GUEST_CMDLINE];
970 cmdline = cmdline_cook(cmdline);
971 safe_strcpy(dom0_cmdline, cmdline);
973 if ( kextra != NULL )
974 /* kextra always includes exactly one leading space. */
975 safe_strcat(dom0_cmdline, kextra);
977 /* Append any extra parameters. */
978 if ( skip_ioapic_setup && !strstr(dom0_cmdline, "noapic") )
979 safe_strcat(dom0_cmdline, " noapic");
980 if ( acpi_skip_timer_override &&
981 !strstr(dom0_cmdline, "acpi_skip_timer_override") )
982 safe_strcat(dom0_cmdline, " acpi_skip_timer_override");
983 if ( (strlen(acpi_param) == 0) && acpi_disabled )
984 {
985 printk("ACPI is disabled, notifying Domain 0 (acpi=off)\n");
986 safe_strcpy(acpi_param, "off");
987 }
988 if ( (strlen(acpi_param) != 0) && !strstr(dom0_cmdline, "acpi=") )
989 {
990 safe_strcat(dom0_cmdline, " acpi=");
991 safe_strcat(dom0_cmdline, acpi_param);
992 }
994 cmdline = dom0_cmdline;
995 }
997 if ( (initrdidx > 0) && (initrdidx < mbi->mods_count) )
998 {
999 _initrd_start = initial_images_start +
1000 (mod[initrdidx].mod_start - mod[0].mod_start);
1001 _initrd_len = mod[initrdidx].mod_end - mod[initrdidx].mod_start;
1004 iommu_setup();
1006 amd_iommu_detect();
1008 /*
1009 * We're going to setup domain0 using the module(s) that we stashed safely
1010 * above our heap. The second module, if present, is an initrd ramdisk.
1011 */
1012 if ( construct_dom0(dom0,
1013 initial_images_start,
1014 mod[0].mod_end-mod[0].mod_start,
1015 _initrd_start,
1016 _initrd_len,
1017 cmdline) != 0)
1018 panic("Could not set up DOM0 guest OS\n");
1020 /* Scrub RAM that is still free and so may go to an unprivileged domain. */
1021 scrub_heap_pages();
1023 init_trace_bufs();
1025 console_endboot();
1027 /* Hide UART from DOM0 if we're using it */
1028 serial_endboot();
1030 domain_unpause_by_systemcontroller(dom0);
1032 reset_stack_and_jump(init_done);
1035 void arch_get_xen_caps(xen_capabilities_info_t *info)
1037 /* Interface name is always xen-3.0-* for Xen-3.x. */
1038 int major = 3, minor = 0;
1039 char s[32];
1041 (*info)[0] = '\0';
1043 #if defined(CONFIG_X86_32) && !defined(CONFIG_X86_PAE)
1045 snprintf(s, sizeof(s), "xen-%d.%d-x86_32 ", major, minor);
1046 safe_strcat(*info, s);
1047 if ( hvm_enabled )
1049 snprintf(s, sizeof(s), "hvm-%d.%d-x86_32 ", major, minor);
1050 safe_strcat(*info, s);
1053 #elif defined(CONFIG_X86_32) && defined(CONFIG_X86_PAE)
1055 snprintf(s, sizeof(s), "xen-%d.%d-x86_32p ", major, minor);
1056 safe_strcat(*info, s);
1057 if ( hvm_enabled )
1059 snprintf(s, sizeof(s), "hvm-%d.%d-x86_32 ", major, minor);
1060 safe_strcat(*info, s);
1061 snprintf(s, sizeof(s), "hvm-%d.%d-x86_32p ", major, minor);
1062 safe_strcat(*info, s);
1065 #elif defined(CONFIG_X86_64)
1067 snprintf(s, sizeof(s), "xen-%d.%d-x86_64 ", major, minor);
1068 safe_strcat(*info, s);
1069 #ifdef CONFIG_COMPAT
1070 snprintf(s, sizeof(s), "xen-%d.%d-x86_32p ", major, minor);
1071 safe_strcat(*info, s);
1072 #endif
1073 if ( hvm_enabled )
1075 snprintf(s, sizeof(s), "hvm-%d.%d-x86_32 ", major, minor);
1076 safe_strcat(*info, s);
1077 snprintf(s, sizeof(s), "hvm-%d.%d-x86_32p ", major, minor);
1078 safe_strcat(*info, s);
1079 snprintf(s, sizeof(s), "hvm-%d.%d-x86_64 ", major, minor);
1080 safe_strcat(*info, s);
1083 #endif
1086 /*
1087 * Local variables:
1088 * mode: C
1089 * c-set-style: "BSD"
1090 * c-basic-offset: 4
1091 * tab-width: 4
1092 * indent-tabs-mode: nil
1093 * End:
1094 */