ia64/xen-unstable

view xen/arch/x86/setup.c @ 8402:47d947e07205

Work around a nasty BIOS/GRUB bug which causes fields in
the e820 map to not be initialized to zero when they should be.

Signed-off-by: Keir Fraser <keir@xensource.com>
author kaf24@firebug.cl.cam.ac.uk
date Fri Dec 16 04:11:41 2005 +0100 (2005-12-16)
parents 379921f2259d
children 3dc1c23dd508
line source
2 #include <xen/config.h>
3 #include <xen/init.h>
4 #include <xen/lib.h>
5 #include <xen/sched.h>
6 #include <xen/domain.h>
7 #include <xen/serial.h>
8 #include <xen/softirq.h>
9 #include <xen/acpi.h>
10 #include <xen/console.h>
11 #include <xen/serial.h>
12 #include <xen/trace.h>
13 #include <xen/multiboot.h>
14 #include <xen/domain_page.h>
15 #include <xen/compile.h>
16 #include <public/version.h>
17 #include <asm/bitops.h>
18 #include <asm/smp.h>
19 #include <asm/processor.h>
20 #include <asm/mpspec.h>
21 #include <asm/apic.h>
22 #include <asm/desc.h>
23 #include <asm/shadow.h>
24 #include <asm/e820.h>
25 #include <acm/acm_hooks.h>
/* Probe routines defined in other files; both are called from start_of_day(). */
27 extern void dmi_scan_machine(void);
28 extern void generic_apic_probe(void);
30 /*
31 * opt_xenheap_megabytes: Size of Xen heap in megabytes, excluding the
32 * pfn_info table and allocation bitmap.
33 */
34 static unsigned int opt_xenheap_megabytes = XENHEAP_DEFAULT_MB;
/*
 * The "xenheap_megabytes" boot option is only registered on x86_64 builds;
 * nothing else in this file modifies opt_xenheap_megabytes, so other builds
 * keep XENHEAP_DEFAULT_MB. __start_xen() converts the value to bytes to set
 * xenheap_phys_end.
 */
35 #if defined(CONFIG_X86_64)
36 integer_param("xenheap_megabytes", opt_xenheap_megabytes);
37 #endif
39 /* opt_nosmp: If true, secondary processors are ignored. */
/* When set, start_of_day() forces max_cpus to 0 before SMP preparation. */
40 static int opt_nosmp = 0;
41 boolean_param("nosmp", opt_nosmp);
43 /* maxcpus: maximum number of CPUs to activate. */
/* Passed to smp_prepare_cpus() and used as the cap in the CPU bring-up loop. */
44 static unsigned int max_cpus = NR_CPUS;
45 integer_param("maxcpus", max_cpus);
47 /* opt_watchdog: If true, run a watchdog NMI on each processor. */
/* When set, start_of_day() selects NMI_LOCAL_APIC as the nmi_watchdog mode. */
48 static int opt_watchdog = 0;
49 boolean_param("watchdog", opt_watchdog);
51 /* **** Linux config option: propagated to domain0. */
52 /* "acpi=off": Disables both ACPI table parsing and interpreter. */
53 /* "acpi=force": Override the disable blacklist. */
54 /* "acpi=strict": Disables out-of-spec workarounds. */
55 /* "acpi=ht": Limit ACPI just to boot-time to enable HT. */
56 /* "acpi=noirq": Disables ACPI interrupt routing. */
/* "acpi=<value>" is handled by parse_acpi_param(), defined later in this file. */
57 static void parse_acpi_param(char *s);
58 custom_param("acpi", parse_acpi_param);
60 /* **** Linux config option: propagated to domain0. */
61 /* acpi_skip_timer_override: Skip IRQ0 overrides. */
62 extern int acpi_skip_timer_override;
63 boolean_param("acpi_skip_timer_override", acpi_skip_timer_override);
65 /* **** Linux config option: propagated to domain0. */
66 /* noapic: Disable IOAPIC setup. */
67 extern int skip_ioapic_setup;
68 boolean_param("noapic", skip_ioapic_setup);
/* Starts at 1; __start_xen() clears it just before calling start_of_day(). */
70 int early_boot = 1;
/* Populated in start_of_day() from for_each_cpu(); asserted empty beforehand. */
72 cpumask_t cpu_present_map;
74 /* Limits of Xen heap, used to initialise the allocator. */
75 unsigned long xenheap_phys_start, xenheap_phys_end;
/* Initialisation routines defined in other files and called from start_of_day(). */
77 extern void arch_init_memory(void);
78 extern void init_IRQ(void);
79 extern void trap_init(void);
80 extern void early_time_init(void);
81 extern void initialize_keytable(void);
82 extern void early_cpu_init(void);
/* CPU0's stack; __start_xen() refuses to boot unless it is STACK_SIZE-aligned. */
84 extern unsigned long cpu0_stack[];
/*
 * NOTE(review): positional initialiser -- the meaning of each value depends on
 * the field order of struct cpuinfo_x86, which is not visible in this file.
 * start_of_day() later passes this structure to identify_cpu().
 */
86 struct cpuinfo_x86 boot_cpu_data = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
/* Initial CR4 feature mask; PGE and PAE are added when CONFIG_PAGING_LEVELS > 2. */
88 #if CONFIG_PAGING_LEVELS > 2
89 unsigned long mmu_cr4_features = X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE;
90 #else
91 unsigned long mmu_cr4_features = X86_CR4_PSE;
92 #endif
93 EXPORT_SYMBOL(mmu_cr4_features);
/* Only slot 0 is given an explicit value; the remaining slots are zero-initialised. */
95 struct vcpu *idle_task[NR_CPUS] = { &idle0_vcpu };
97 int acpi_disabled;
99 int acpi_force;
/*
 * Copy of the "acpi=" boot option value (truncated to fit, always
 * NUL-terminated by parse_acpi_param()); __start_xen() appends it to the
 * DOM0 command line when that line has no "acpi=" of its own.
 */
100 char acpi_param[10] = "";
101 static void parse_acpi_param(char *s)
102 {
103 /* Save the parameter so it can be propagated to domain0. */
104 strncpy(acpi_param, s, sizeof(acpi_param));
105 acpi_param[sizeof(acpi_param)-1] = '\0';
107 /* Interpret the parameter for use within Xen. */
108 if ( !strcmp(s, "off") )
109 {
110 disable_acpi();
111 }
112 else if ( !strcmp(s, "force") )
113 {
114 acpi_force = 1;
115 acpi_ht = 1;
116 acpi_disabled = 0;
117 }
118 else if ( !strcmp(s, "strict") )
119 {
120 acpi_strict = 1;
121 }
122 else if ( !strcmp(s, "ht") )
123 {
124 if ( !acpi_force )
125 disable_acpi();
126 acpi_ht = 1;
127 }
128 else if ( !strcmp(s, "noirq") )
129 {
130 acpi_noirq_set();
131 }
132 }
134 static void __init do_initcalls(void)
135 {
136 initcall_t *call;
137 for ( call = &__initcall_start; call < &__initcall_end; call++ )
138 (*call)();
139 }
/*
 * Second-stage boot sequence, called once from __start_xen() right after
 * early_boot has been cleared. It runs the subsystem initialisers below in a
 * fixed order, brings up secondary CPUs (subject to "nosmp"/"maxcpus"), runs
 * the registered initcalls and finally starts the schedulers and watchdog.
 *
 * NOTE(review): the ordering of the calls is presumably significant (several
 * depend on paging, IRQs or timers being set up) -- do not reorder without
 * checking each callee.
 */
141 static void __init start_of_day(void)
142 {
143 int i;
144 unsigned long vgdt, gdt_pfn;
146 early_cpu_init();
148 paging_init();
150 /* Unmap the first page of CPU0's stack. */
151 memguard_guard_stack(cpu0_stack);
153 open_softirq(NEW_TLBFLUSH_CLOCK_PERIOD_SOFTIRQ, new_tlbflush_clock_period);
/* "watchdog" boot option: select the local-APIC NMI watchdog mode. */
155 if ( opt_watchdog )
156 nmi_watchdog = NMI_LOCAL_APIC;
158 sort_exception_tables();
160 arch_do_createdomain(current);
162 /*
163 * Map default GDT into its final positions in the idle page table. As
164 * noted in arch_do_createdomain(), we must map for every possible VCPU#.
165 */
166 vgdt = GDT_VIRT_START(current) + FIRST_RESERVED_GDT_BYTE;
167 gdt_pfn = virt_to_phys(gdt_table) >> PAGE_SHIFT;
/* One single-page mapping per VCPU slot, each (1 << PDPT_VCPU_VA_SHIFT) bytes apart. */
168 for ( i = 0; i < MAX_VIRT_CPUS; i++ )
169 {
170 map_pages_to_xen(vgdt, gdt_pfn, 1, PAGE_HYPERVISOR);
171 vgdt += 1 << PDPT_VCPU_VA_SHIFT;
172 }
174 find_smp_config();
176 smp_alloc_memory();
178 dmi_scan_machine();
180 generic_apic_probe();
182 acpi_boot_table_init();
183 acpi_boot_init();
/* Only read the SMP configuration if one was found (smp_found_config set). */
185 if ( smp_found_config )
186 get_smp_config();
188 init_apic_mappings();
190 init_IRQ();
192 trap_init();
194 ac_timer_init();
196 early_time_init();
198 arch_init_memory();
200 scheduler_init();
/* Fill in boot_cpu_data, then enable the CR4 bits matching detected features. */
202 identify_cpu(&boot_cpu_data);
203 if ( cpu_has_fxsr )
204 set_in_cr4(X86_CR4_OSFXSR);
205 if ( cpu_has_xmm )
206 set_in_cr4(X86_CR4_OSXMMEXCPT);
/*
 * "nosmp": with max_cpus == 0 the bring-up loop below breaks on its first
 * iteration, so __cpu_up() is never called. Sibling and core counts are
 * forced to 1 as well.
 */
208 if ( opt_nosmp )
209 {
210 max_cpus = 0;
211 smp_num_siblings = 1;
212 boot_cpu_data.x86_num_cores = 1;
213 }
215 smp_prepare_cpus(max_cpus);
217 /* We aren't hotplug-capable yet. */
/* The present map must still be empty here; it is then filled from for_each_cpu(). */
218 BUG_ON(!cpus_empty(cpu_present_map));
219 for_each_cpu ( i )
220 cpu_set(i, cpu_present_map);
222 /*
223 * Initialise higher-level timer functions. We do this fairly late
224 * (post-SMP) because the time bases and scale factors need to be updated
225 * regularly, and SMP initialisation can cause a long delay with
226 * interrupts not yet enabled.
227 */
228 init_xen_time();
230 initialize_keytable();
232 serial_init_postirq();
/* Interrupts are required to be enabled from here on (enabled by an earlier callee -- TODO confirm which). */
234 BUG_ON(!local_irq_is_enabled());
/* Bring present-but-offline CPUs online until max_cpus are online. */
236 for_each_present_cpu ( i )
237 {
238 if ( num_online_cpus() >= max_cpus )
239 break;
240 if ( !cpu_online(i) )
241 __cpu_up(i);
242 }
244 printk("Brought up %ld CPUs\n", (long)num_online_cpus());
245 smp_cpus_done(max_cpus);
247 do_initcalls();
249 schedulers_start();
251 watchdog_enable();
252 }
254 #define EARLY_FAIL() for ( ; ; ) __asm__ __volatile__ ( "hlt" )
256 static struct e820entry e820_raw[E820MAX];
258 void __init __start_xen(multiboot_info_t *mbi)
259 {
260 char *cmdline;
261 module_t *mod = (module_t *)__va(mbi->mods_addr);
262 unsigned long nr_pages, modules_length;
263 unsigned long initial_images_start, initial_images_end;
264 unsigned long _initrd_start = 0, _initrd_len = 0;
265 unsigned int initrdidx = 1;
266 physaddr_t s, e;
267 int i, e820_warn = 0, e820_raw_nr = 0, bytes = 0;
268 struct ns16550_defaults ns16550 = {
269 .data_bits = 8,
270 .parity = 'n',
271 .stop_bits = 1
272 };
274 /* Parse the command-line options. */
275 if ( (mbi->flags & MBI_CMDLINE) && (mbi->cmdline != 0) )
276 cmdline_parse(__va(mbi->cmdline));
278 /* Must do this early -- e.g., spinlocks rely on get_current(). */
279 set_current(&idle0_vcpu);
280 set_processor_id(0);
282 smp_prepare_boot_cpu();
284 /* We initialise the serial devices very early so we can get debugging. */
285 ns16550.io_base = 0x3f8;
286 ns16550.irq = 4;
287 ns16550_init(0, &ns16550);
288 ns16550.io_base = 0x2f8;
289 ns16550.irq = 3;
290 ns16550_init(1, &ns16550);
291 serial_init_preirq();
293 init_console();
295 /* Check that we have at least one Multiboot module. */
296 if ( !(mbi->flags & MBI_MODULES) || (mbi->mods_count == 0) )
297 {
298 printk("FATAL ERROR: dom0 kernel not specified."
299 " Check bootloader configuration.\n");
300 EARLY_FAIL();
301 }
303 if ( ((unsigned long)cpu0_stack & (STACK_SIZE-1)) != 0 )
304 {
305 printk("FATAL ERROR: Misaligned CPU0 stack.\n");
306 EARLY_FAIL();
307 }
309 xenheap_phys_end = opt_xenheap_megabytes << 20;
311 if ( mbi->flags & MBI_MEMMAP )
312 {
313 while ( bytes < mbi->mmap_length )
314 {
315 memory_map_t *map = __va(mbi->mmap_addr + bytes);
317 /*
318 * This is a gross workaround for a BIOS/GRUB bug. GRUB does
319 * not write e820 map entries into pre-zeroed memory. This is
320 * okay if the BIOS fills in all fields of the map entry, but
321 * some broken BIOSes do not bother to write the high word of
322 * the length field if the length is smaller than 4GB. We
323 * detect and fix this by flagging sections below 4GB that
324 * appear to be larger than 4GB in size. We disable this check
325 * for mbootpack and syslinux (which we can detect because they
326 * place the mmap_addr list above 1MB in memory).
327 */
328 if ( (mbi->mmap_addr < 0x100000) &&
329 (map->base_addr_high == 0) &&
330 (map->length_high != 0) )
331 {
332 e820_warn = 1;
333 map->length_high = 0;
334 }
336 e820_raw[e820_raw_nr].addr =
337 ((u64)map->base_addr_high << 32) | (u64)map->base_addr_low;
338 e820_raw[e820_raw_nr].size =
339 ((u64)map->length_high << 32) | (u64)map->length_low;
340 e820_raw[e820_raw_nr].type =
341 (map->type > E820_SHARED_PAGE) ? E820_RESERVED : map->type;
342 e820_raw_nr++;
344 bytes += map->size + 4;
345 }
346 }
347 else if ( mbi->flags & MBI_MEMLIMITS )
348 {
349 e820_raw[0].addr = 0;
350 e820_raw[0].size = mbi->mem_lower << 10;
351 e820_raw[0].type = E820_RAM;
352 e820_raw[1].addr = 0x100000;
353 e820_raw[1].size = mbi->mem_upper << 10;
354 e820_raw[1].type = E820_RAM;
355 e820_raw_nr = 2;
356 }
357 else
358 {
359 printk("FATAL ERROR: Bootloader provided no memory information.\n");
360 for ( ; ; ) ;
361 }
363 if ( e820_warn )
364 printk("WARNING: Buggy e820 map detected and fixed "
365 "(truncated length fields).\n");
367 max_page = init_e820(e820_raw, &e820_raw_nr);
369 modules_length = mod[mbi->mods_count-1].mod_end - mod[0].mod_start;
371 /* Find a large enough RAM extent to stash the DOM0 modules. */
372 for ( i = 0; ; i++ )
373 {
374 if ( i == e820.nr_map )
375 {
376 printk("Not enough memory to stash the DOM0 kernel image.\n");
377 for ( ; ; ) ;
378 }
380 if ( (e820.map[i].type == E820_RAM) &&
381 (e820.map[i].size >= modules_length) &&
382 ((e820.map[i].addr + e820.map[i].size) >=
383 (xenheap_phys_end + modules_length)) )
384 break;
385 }
387 /* Stash as near as possible to the beginning of the RAM extent. */
388 initial_images_start = e820.map[i].addr;
389 if ( initial_images_start < xenheap_phys_end )
390 initial_images_start = xenheap_phys_end;
391 initial_images_end = initial_images_start + modules_length;
393 #if defined(CONFIG_X86_32)
394 memmove((void *)initial_images_start, /* use low mapping */
395 (void *)mod[0].mod_start, /* use low mapping */
396 mod[mbi->mods_count-1].mod_end - mod[0].mod_start);
397 #elif defined(CONFIG_X86_64)
398 memmove(__va(initial_images_start),
399 __va(mod[0].mod_start),
400 mod[mbi->mods_count-1].mod_end - mod[0].mod_start);
401 #endif
403 /* Initialise boot-time allocator with all RAM situated after modules. */
404 xenheap_phys_start = init_boot_allocator(__pa(&_end));
405 nr_pages = 0;
406 for ( i = 0; i < e820.nr_map; i++ )
407 {
408 if ( e820.map[i].type != E820_RAM )
409 continue;
411 nr_pages += e820.map[i].size >> PAGE_SHIFT;
413 /* Initialise boot heap, skipping Xen heap and dom0 modules. */
414 s = e820.map[i].addr;
415 e = s + e820.map[i].size;
416 if ( s < xenheap_phys_end )
417 s = xenheap_phys_end;
418 if ( (s < initial_images_end) && (e > initial_images_start) )
419 s = initial_images_end;
420 init_boot_pages(s, e);
422 #if defined (CONFIG_X86_64)
423 /*
424 * x86/64 maps all registered RAM. Points to note:
425 * 1. The initial pagetable already maps low 64MB, so skip that.
426 * 2. We must map *only* RAM areas, taking care to avoid I/O holes.
427 * Failure to do this can cause coherency problems and deadlocks
428 * due to cache-attribute mismatches (e.g., AMD/AGP Linux bug).
429 */
430 {
431 /* Calculate page-frame range, discarding partial frames. */
432 unsigned long start, end;
433 start = PFN_UP(e820.map[i].addr);
434 end = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
435 /* Clip the range to above 64MB. */
436 if ( end < (64UL << (20-PAGE_SHIFT)) )
437 continue;
438 if ( start < (64UL << (20-PAGE_SHIFT)) )
439 start = 64UL << (20-PAGE_SHIFT);
440 /* Request the mapping. */
441 map_pages_to_xen(
442 PAGE_OFFSET + (start << PAGE_SHIFT),
443 start, end-start, PAGE_HYPERVISOR);
444 }
445 #endif
446 }
448 memguard_init();
450 printk("System RAM: %luMB (%lukB)\n",
451 nr_pages >> (20 - PAGE_SHIFT),
452 nr_pages << (PAGE_SHIFT - 10));
453 total_pages = nr_pages;
455 /* Sanity check for unwanted bloat of dom0_op_t structure. */
456 BUG_ON(sizeof(((dom0_op_t *)0)->u) != sizeof(((dom0_op_t *)0)->u.pad));
458 BUG_ON(sizeof(start_info_t) > PAGE_SIZE);
459 BUG_ON(sizeof(shared_info_t) > PAGE_SIZE);
460 BUG_ON(sizeof(vcpu_info_t) != 64);
462 init_frametable();
464 end_boot_allocator();
466 /* Initialise the Xen heap, skipping RAM holes. */
467 nr_pages = 0;
468 for ( i = 0; i < e820.nr_map; i++ )
469 {
470 if ( e820.map[i].type != E820_RAM )
471 continue;
473 s = e820.map[i].addr;
474 e = s + e820.map[i].size;
475 if ( s < xenheap_phys_start )
476 s = xenheap_phys_start;
477 if ( e > xenheap_phys_end )
478 e = xenheap_phys_end;
480 if ( s < e )
481 {
482 nr_pages += (e - s) >> PAGE_SHIFT;
483 init_xenheap_pages(s, e);
484 }
485 }
487 printk("Xen heap: %luMB (%lukB)\n",
488 nr_pages >> (20 - PAGE_SHIFT),
489 nr_pages << (PAGE_SHIFT - 10));
491 early_boot = 0;
493 start_of_day();
495 grant_table_init();
497 shadow_mode_init();
499 /* initialize access control security module */
500 acm_init(&initrdidx, mbi, initial_images_start);
502 /* Create initial domain 0. */
503 dom0 = do_createdomain(0, 0);
504 if ( dom0 == NULL )
505 panic("Error creating domain 0\n");
507 set_bit(_DOMF_privileged, &dom0->domain_flags);
508 /* post-create hooks sets security label */
509 acm_post_domain0_create(dom0->domain_id);
511 /* Grab the DOM0 command line. */
512 cmdline = (char *)(mod[0].string ? __va(mod[0].string) : NULL);
513 if ( cmdline != NULL )
514 {
515 static char dom0_cmdline[MAX_GUEST_CMDLINE];
517 /* Skip past the image name and copy to a local buffer. */
518 while ( *cmdline == ' ' ) cmdline++;
519 if ( (cmdline = strchr(cmdline, ' ')) != NULL )
520 {
521 while ( *cmdline == ' ' ) cmdline++;
522 strcpy(dom0_cmdline, cmdline);
523 }
525 cmdline = dom0_cmdline;
527 /* Append any extra parameters. */
528 if ( skip_ioapic_setup && !strstr(cmdline, "noapic") )
529 strcat(cmdline, " noapic");
530 if ( acpi_skip_timer_override &&
531 !strstr(cmdline, "acpi_skip_timer_override") )
532 strcat(cmdline, " acpi_skip_timer_override");
533 if ( (strlen(acpi_param) != 0) && !strstr(cmdline, "acpi=") )
534 {
535 strcat(cmdline, " acpi=");
536 strcat(cmdline, acpi_param);
537 }
538 }
540 if ( (initrdidx > 0) && (initrdidx < mbi->mods_count) )
541 {
542 _initrd_start = initial_images_start +
543 (mod[initrdidx].mod_start - mod[0].mod_start);
544 _initrd_len = mod[initrdidx].mod_end - mod[initrdidx].mod_start;
545 }
547 /*
548 * We're going to setup domain0 using the module(s) that we stashed safely
549 * above our heap. The second module, if present, is an initrd ramdisk.
550 */
551 if ( construct_dom0(dom0,
552 initial_images_start,
553 mod[0].mod_end-mod[0].mod_start,
554 _initrd_start,
555 _initrd_len,
556 cmdline) != 0)
557 panic("Could not set up DOM0 guest OS\n");
559 /* Scrub RAM that is still free and so may go to an unprivileged domain. */
560 scrub_heap_pages();
562 init_trace_bufs();
564 /* Give up the VGA console if DOM0 is configured to grab it. */
565 console_endboot(cmdline && strstr(cmdline, "tty0"));
567 /* Hide UART from DOM0 if we're using it */
568 serial_endboot();
570 domain_unpause_by_systemcontroller(dom0);
572 startup_cpu_idle_loop();
573 }
575 void arch_get_xen_caps(xen_capabilities_info_t info)
576 {
577 char *p = info;
579 #if defined(CONFIG_X86_32) && !defined(CONFIG_X86_PAE)
581 p += sprintf(p, "xen-%d.%d-x86_32 ", XEN_VERSION, XEN_SUBVERSION);
582 if ( hvm_enabled )
583 p += sprintf(p, "hvm-%d.%d-x86_32 ", XEN_VERSION, XEN_SUBVERSION);
585 #elif defined(CONFIG_X86_32) && defined(CONFIG_X86_PAE)
587 p += sprintf(p, "xen-%d.%d-x86_32p ", XEN_VERSION, XEN_SUBVERSION);
588 if ( hvm_enabled )
589 {
590 //p += sprintf(p, "hvm-%d.%d-x86_32 ", XEN_VERSION, XEN_SUBVERSION);
591 //p += sprintf(p, "hvm-%d.%d-x86_32p ", XEN_VERSION, XEN_SUBVERSION);
592 }
594 #elif defined(CONFIG_X86_64)
596 p += sprintf(p, "xen-%d.%d-x86_64 ", XEN_VERSION, XEN_SUBVERSION);
597 if ( hvm_enabled )
598 {
599 p += sprintf(p, "hvm-%d.%d-x86_32 ", XEN_VERSION, XEN_SUBVERSION);
600 //p += sprintf(p, "hvm-%d.%d-x86_32p ", XEN_VERSION, XEN_SUBVERSION);
601 p += sprintf(p, "hvm-%d.%d-x86_64 ", XEN_VERSION, XEN_SUBVERSION);
602 }
604 #else
606 p++;
608 #endif
610 *(p-1) = 0;
612 BUG_ON((p - info) > sizeof(xen_capabilities_info_t));
613 }
615 /*
616 * Local variables:
617 * mode: C
618 * c-set-style: "BSD"
619 * c-basic-offset: 4
620 * tab-width: 4
621 * indent-tabs-mode: nil
622 * End:
623 */