ia64/xen-unstable

view xen/arch/x86/setup.c @ 19545:34dca01addc9

x86: Disable cpuidle by default unless hpet broadcast is available.
Signed-off-by: Keir Fraser <keir.fraser@citrix.com>
author Keir Fraser <keir.fraser@citrix.com>
date Wed Apr 15 08:40:12 2009 +0100 (2009-04-15)
parents 94ffd85005c5
children 4dd8ed253ee0
line source
1 #include <xen/config.h>
2 #include <xen/init.h>
3 #include <xen/lib.h>
4 #include <xen/sched.h>
5 #include <xen/domain.h>
6 #include <xen/serial.h>
7 #include <xen/softirq.h>
8 #include <xen/acpi.h>
9 #include <xen/console.h>
10 #include <xen/serial.h>
11 #include <xen/trace.h>
12 #include <xen/multiboot.h>
13 #include <xen/domain_page.h>
14 #include <xen/version.h>
15 #include <xen/gdbstub.h>
16 #include <xen/percpu.h>
17 #include <xen/hypercall.h>
18 #include <xen/keyhandler.h>
19 #include <xen/numa.h>
20 #include <xen/rcupdate.h>
21 #include <xen/vga.h>
22 #include <xen/dmi.h>
23 #include <public/version.h>
24 #ifdef CONFIG_COMPAT
25 #include <compat/platform.h>
26 #include <compat/xen.h>
27 #endif
28 #include <asm/bitops.h>
29 #include <asm/smp.h>
30 #include <asm/processor.h>
31 #include <asm/mpspec.h>
32 #include <asm/apic.h>
33 #include <asm/desc.h>
34 #include <asm/paging.h>
35 #include <asm/e820.h>
36 #include <xsm/acm/acm_hooks.h>
37 #include <xen/kexec.h>
38 #include <asm/edd.h>
39 #include <xsm/xsm.h>
40 #include <asm/tboot.h>
/* Returns bytes of scratch space the dom0 bzImage needs in front of itself. */
42 int __init bzimage_headroom(char *image_start, unsigned long image_length);
/*
 * Bootstrap direct-map window: the region we can address before the boot
 * allocator exists.  4GB on x86/64 (real direct map), 1GB identity window
 * on x86/32 (plain cast, no translation).
 */
44 #if defined(CONFIG_X86_64)
45 #define BOOTSTRAP_DIRECTMAP_END (1UL << 32) /* 4GB */
46 #define maddr_to_bootstrap_virt(m) maddr_to_virt(m)
47 #else
48 #define BOOTSTRAP_DIRECTMAP_END (1UL << 30) /* 1GB */
49 #define maddr_to_bootstrap_virt(m) ((void *)(long)(m))
50 #endif
52 extern void generic_apic_probe(void);
53 extern void numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn);
/* VBE/DDC data captured by the real-mode boot trampoline (see bootsym()). */
55 extern u16 boot_edid_caps;
56 extern u8 boot_edid_info[128];
57 extern struct boot_video_info boot_vid_info;
59 /* opt_nosmp: If true, secondary processors are ignored. */
60 static int opt_nosmp = 0;
61 boolean_param("nosmp", opt_nosmp);
63 /* maxcpus: maximum number of CPUs to activate. */
64 static unsigned int max_cpus = NR_CPUS;
65 integer_param("maxcpus", max_cpus);
67 /* opt_watchdog: If true, run a watchdog NMI on each processor. */
68 static int opt_watchdog = 0;
69 boolean_param("watchdog", opt_watchdog);
71 /* **** Linux config option: propagated to domain0. */
72 /* "acpi=off": Disables both ACPI table parsing and interpreter. */
73 /* "acpi=force": Override the disable blacklist. */
74 /* "acpi=strict": Disables out-of-spec workarounds. */
75 /* "acpi=ht": Limit ACPI just to boot-time to enable HT. */
76 /* "acpi=noirq": Disables ACPI interrupt routing. */
77 static void parse_acpi_param(char *s);
78 custom_param("acpi", parse_acpi_param);
80 /* **** Linux config option: propagated to domain0. */
81 /* acpi_skip_timer_override: Skip IRQ0 overrides. */
82 extern int acpi_skip_timer_override;
83 boolean_param("acpi_skip_timer_override", acpi_skip_timer_override);
85 /* **** Linux config option: propagated to domain0. */
86 /* noapic: Disable IOAPIC setup. */
87 extern int skip_ioapic_setup;
88 boolean_param("noapic", skip_ioapic_setup);
90 /* **** Linux config option: propagated to domain0. */
91 /* xen_cpuidle: xen control cstate. */
/* -1 means "not set on the command line"; later boot code decides the
 * default (per the changeset description: off unless HPET broadcast works). */
92 /*static*/ int xen_cpuidle = -1;
93 boolean_param("cpuidle", xen_cpuidle);
/* Cleared by __start_xen() once the boot allocator is fully initialised. */
95 int early_boot = 1;
97 cpumask_t cpu_present_map;
/* Physical address Xen was relocated to; 0 until x86/64 relocation runs. */
99 unsigned long xen_phys_start;
/* End of the boot-allocator bitmap, set in __start_xen(). */
100 unsigned long allocator_bitmap_end;
102 #ifdef CONFIG_X86_32
103 /* Limits of Xen heap, used to initialise the allocator. */
104 unsigned long xenheap_initial_phys_start, xenheap_phys_end;
105 #endif
/* Early-init entry points implemented elsewhere in the arch tree. */
107 extern void arch_init_memory(void);
108 extern void init_IRQ(void);
109 extern void early_time_init(void);
110 extern void early_cpu_init(void);
111 extern void vesa_init(void);
112 extern void vesa_mtrr_init(void);
/* Per-CPU GDT pointers; secondary CPUs start with the boot CPU's tables. */
114 DEFINE_PER_CPU(struct desc_struct *, gdt_table) = boot_cpu_gdt_table;
115 #ifdef CONFIG_COMPAT
116 DEFINE_PER_CPU(struct desc_struct *, compat_gdt_table)
117 = boot_cpu_compat_gdt_table;
118 #endif
120 struct tss_struct init_tss[NR_CPUS];
/* CPU0 boot stack; placed in its own section so the linker can align it. */
122 char __attribute__ ((__section__(".bss.stack_aligned"))) cpu0_stack[STACK_SIZE];
124 struct cpuinfo_x86 boot_cpu_data = { 0, 0, 0, 0, -1 };
/* CR4 bits Xen keeps enabled on every CPU (more may be OR-ed in later). */
126 unsigned long mmu_cr4_features = X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE;
127 EXPORT_SYMBOL(mmu_cr4_features);
129 int acpi_disabled;
131 int acpi_force;
/* Raw "acpi=" option string, saved so it can be propagated to dom0. */
132 char acpi_param[10] = "";
133 static void __init parse_acpi_param(char *s)
134 {
135 /* Save the parameter so it can be propagated to domain0. */
136 safe_strcpy(acpi_param, s);
138 /* Interpret the parameter for use within Xen. */
139 if ( !strcmp(s, "off") )
140 {
141 disable_acpi();
142 }
143 else if ( !strcmp(s, "force") )
144 {
145 acpi_force = 1;
146 acpi_ht = 1;
147 acpi_disabled = 0;
148 }
149 else if ( !strcmp(s, "strict") )
150 {
151 acpi_strict = 1;
152 }
153 else if ( !strcmp(s, "ht") )
154 {
155 if ( !acpi_force )
156 disable_acpi();
157 acpi_ht = 1;
158 }
159 else if ( !strcmp(s, "noirq") )
160 {
161 acpi_noirq_set();
162 }
163 }
165 static void __init do_initcalls(void)
166 {
167 initcall_t *call;
168 for ( call = &__initcall_start; call < &__initcall_end; call++ )
169 (*call)();
170 }
/* Print a message and halt forever: used before panic() is usable. */
172 #define EARLY_FAIL(f, a...) do { \
173 printk( f , ## a ); \
174 for ( ; ; ) halt(); \
175 } while (0)
/* Physical range holding the relocated multiboot modules:
 * base = start of bzimage headroom, start = first module byte, end = last. */
177 static unsigned long __initdata initial_images_base;
178 static unsigned long __initdata initial_images_start;
179 static unsigned long __initdata initial_images_end;
181 unsigned long __init initial_images_nrpages(void)
182 {
183 ASSERT(!(initial_images_base & ~PAGE_MASK));
184 ASSERT(!(initial_images_end & ~PAGE_MASK));
185 return ((initial_images_end >> PAGE_SHIFT) -
186 (initial_images_base >> PAGE_SHIFT));
187 }
/* Hand the (no longer needed) module area back to the domain heap. */
189 void __init discard_initial_images(void)
190 {
191 init_domheap_pages(initial_images_base, initial_images_end);
192 }
194 extern char __per_cpu_start[], __per_cpu_data_end[], __per_cpu_end[];
/*
 * Copy the boot CPU's per-CPU data template into the slot of every other
 * possible CPU, then free or guard the slots of CPUs that can never come up.
 */
196 static void __init percpu_init_areas(void)
197 {
198 unsigned int i, data_size = __per_cpu_data_end - __per_cpu_start;
199 unsigned int first_unused;
/* Each CPU slot is PERCPU_SIZE bytes; the template must fit. */
201 BUG_ON(data_size > PERCPU_SIZE);
203 /* Initialise per-cpu data area for all possible secondary CPUs. */
204 for ( i = 1; (i < NR_CPUS) && cpu_possible(i); i++ )
205 memcpy(__per_cpu_start + (i << PERCPU_SHIFT),
206 __per_cpu_start,
207 data_size);
208 first_unused = i;
210 /* Check that there are no holes in cpu_possible_map. */
211 for ( ; i < NR_CPUS; i++ )
212 BUG_ON(cpu_possible(i));
/* Without MEMORY_GUARD the unused slots are simply given to the heap;
 * with it they are left in place and guard-mapped below. */
214 #ifndef MEMORY_GUARD
215 init_xenheap_pages(__pa(__per_cpu_start) + (first_unused << PERCPU_SHIFT),
216 __pa(__per_cpu_end));
217 #endif
218 memguard_guard_range(&__per_cpu_start[first_unused << PERCPU_SHIFT],
219 (NR_CPUS - first_unused) << PERCPU_SHIFT);
220 #if defined(CONFIG_X86_64)
221 /* Also zap the mapping in the 1:1 area. */
222 memguard_guard_range(__va(__pa(__per_cpu_start)) +
223 (first_unused << PERCPU_SHIFT),
224 (NR_CPUS - first_unused) << PERCPU_SHIFT);
225 #endif
226 }
228 static void __init init_idle_domain(void)
229 {
230 struct domain *idle_domain;
232 /* Domain creation requires that scheduler structures are initialised. */
233 scheduler_init();
235 idle_domain = domain_create(IDLE_DOMAIN_ID, 0, 0);
236 if ( (idle_domain == NULL) || (alloc_vcpu(idle_domain, 0, 0) == NULL) )
237 BUG();
239 set_current(idle_domain->vcpu[0]);
240 idle_vcpu[0] = this_cpu(curr_vcpu) = current;
242 setup_idle_pagetable();
243 }
245 static void __init srat_detect_node(int cpu)
246 {
247 unsigned node;
248 u32 apicid = x86_cpu_to_apicid[cpu];
250 node = apicid_to_node[apicid];
251 if ( node == NUMA_NO_NODE )
252 node = 0;
253 numa_set_node(cpu, node);
255 if ( acpi_numa > 0 )
256 printk(KERN_INFO "CPU %d APIC %d -> Node %d\n", cpu, apicid, node);
257 }
259 /*
260 * Ensure a given physical memory range is present in the bootstrap mappings.
261 * Use superpage mappings to ensure that pagetable memory needn't be allocated.
262 */
263 static void __init bootstrap_map(unsigned long start, unsigned long end)
264 {
265 unsigned long mask = (1UL << L2_PAGETABLE_SHIFT) - 1;
266 start = max_t(unsigned long, start & ~mask, 16UL << 20);
267 end = (end + mask) & ~mask;
268 if ( start >= end )
269 return;
270 if ( end > BOOTSTRAP_DIRECTMAP_END )
271 panic("Cannot access memory beyond end of "
272 "bootstrap direct-map area\n");
273 map_pages_to_xen(
274 (unsigned long)maddr_to_bootstrap_virt(start),
275 start >> PAGE_SHIFT, (end-start) >> PAGE_SHIFT, PAGE_HYPERVISOR);
276 }
278 static void __init move_memory(
279 unsigned long dst, unsigned long src_start, unsigned long src_end)
280 {
281 bootstrap_map(src_start, src_end);
282 bootstrap_map(dst, dst + src_end - src_start);
283 memmove(maddr_to_bootstrap_virt(dst),
284 maddr_to_bootstrap_virt(src_start),
285 src_end - src_start);
286 }
288 /* A temporary copy of the e820 map that we can mess with during bootstrap. */
289 static struct e820map __initdata boot_e820;
/*
 * Video state captured by the real-mode boot code.  The hex comments give
 * each field's byte offset; the layout must match the assembly side, so do
 * not reorder or resize fields.
 */
291 struct boot_video_info {
292 u8 orig_x; /* 0x00 */
293 u8 orig_y; /* 0x01 */
294 u8 orig_video_mode; /* 0x02 */
295 u8 orig_video_cols; /* 0x03 */
296 u8 orig_video_lines; /* 0x04 */
297 u8 orig_video_isVGA; /* 0x05 */
298 u16 orig_video_points; /* 0x06 */
300 /* VESA graphic mode -- linear frame buffer */
301 u32 capabilities; /* 0x08 */
302 u16 lfb_linelength; /* 0x0c */
303 u16 lfb_width; /* 0x0e */
304 u16 lfb_height; /* 0x10 */
305 u16 lfb_depth; /* 0x12 */
306 u32 lfb_base; /* 0x14 */
307 u32 lfb_size; /* 0x18 */
308 u8 red_size; /* 0x1c */
309 u8 red_pos; /* 0x1d */
310 u8 green_size; /* 0x1e */
311 u8 green_pos; /* 0x1f */
312 u8 blue_size; /* 0x20 */
313 u8 blue_pos; /* 0x21 */
314 u8 rsvd_size; /* 0x22 */
315 u8 rsvd_pos; /* 0x23 */
316 u16 vesapm_seg; /* 0x24 */
317 u16 vesapm_off; /* 0x26 */
318 u16 vesa_attrib; /* 0x28 */
319 };
321 static void __init parse_video_info(void)
322 {
323 struct boot_video_info *bvi = &bootsym(boot_vid_info);
325 if ( (bvi->orig_video_isVGA == 1) && (bvi->orig_video_mode == 3) )
326 {
327 vga_console_info.video_type = XEN_VGATYPE_TEXT_MODE_3;
328 vga_console_info.u.text_mode_3.font_height = bvi->orig_video_points;
329 vga_console_info.u.text_mode_3.cursor_x = bvi->orig_x;
330 vga_console_info.u.text_mode_3.cursor_y = bvi->orig_y;
331 vga_console_info.u.text_mode_3.rows = bvi->orig_video_lines;
332 vga_console_info.u.text_mode_3.columns = bvi->orig_video_cols;
333 }
334 else if ( bvi->orig_video_isVGA == 0x23 )
335 {
336 vga_console_info.video_type = XEN_VGATYPE_VESA_LFB;
337 vga_console_info.u.vesa_lfb.width = bvi->lfb_width;
338 vga_console_info.u.vesa_lfb.height = bvi->lfb_height;
339 vga_console_info.u.vesa_lfb.bytes_per_line = bvi->lfb_linelength;
340 vga_console_info.u.vesa_lfb.bits_per_pixel = bvi->lfb_depth;
341 vga_console_info.u.vesa_lfb.lfb_base = bvi->lfb_base;
342 vga_console_info.u.vesa_lfb.lfb_size = bvi->lfb_size;
343 vga_console_info.u.vesa_lfb.red_pos = bvi->red_pos;
344 vga_console_info.u.vesa_lfb.red_size = bvi->red_size;
345 vga_console_info.u.vesa_lfb.green_pos = bvi->green_pos;
346 vga_console_info.u.vesa_lfb.green_size = bvi->green_size;
347 vga_console_info.u.vesa_lfb.blue_pos = bvi->blue_pos;
348 vga_console_info.u.vesa_lfb.blue_size = bvi->blue_size;
349 vga_console_info.u.vesa_lfb.rsvd_pos = bvi->rsvd_pos;
350 vga_console_info.u.vesa_lfb.rsvd_size = bvi->rsvd_size;
351 vga_console_info.u.vesa_lfb.gbl_caps = bvi->capabilities;
352 vga_console_info.u.vesa_lfb.mode_attrs = bvi->vesa_attrib;
353 }
354 }
356 void __init kexec_reserve_area(struct e820map *e820)
357 {
358 unsigned long kdump_start = kexec_crash_area.start;
359 unsigned long kdump_size = kexec_crash_area.size;
360 static int is_reserved = 0;
362 kdump_size = (kdump_size + PAGE_SIZE - 1) & PAGE_MASK;
364 if ( (kdump_start == 0) || (kdump_size == 0) || is_reserved )
365 return;
367 is_reserved = 1;
369 if ( !reserve_e820_ram(e820, kdump_start, kdump_start + kdump_size) )
370 {
371 printk("Kdump: DISABLED (failed to reserve %luMB (%lukB) at 0x%lx)"
372 "\n", kdump_size >> 20, kdump_size >> 10, kdump_start);
373 kexec_crash_area.start = kexec_crash_area.size = 0;
374 }
375 else
376 {
377 printk("Kdump: %luMB (%lukB) at 0x%lx\n",
378 kdump_size >> 20, kdump_size >> 10, kdump_start);
379 }
380 }
/* Final boot step: release the .init sections and enter the idle loop. */
382 void init_done(void)
383 {
384 extern char __init_begin[], __init_end[];
386 /* Free (or page-protect) the init areas. */
387 memset(__init_begin, 0xcc, __init_end - __init_begin); /* int3 poison */
388 #ifndef MEMORY_GUARD
389 init_xenheap_pages(__pa(__init_begin), __pa(__init_end));
390 #endif
391 memguard_guard_range(__init_begin, __init_end - __init_begin);
392 #if defined(CONFIG_X86_64)
393 /* Also zap the mapping in the 1:1 area. */
394 memguard_guard_range(__va(__pa(__init_begin)), __init_end - __init_begin);
395 #endif
396 printk("Freed %ldkB init memory.\n", (long)(__init_end-__init_begin)>>10);
/* Does not return. */
398 startup_cpu_idle_loop();
399 }
401 static char * __init cmdline_cook(char *p)
402 {
403 p = p ? : "";
404 while ( *p == ' ' )
405 p++;
406 while ( (*p != ' ') && (*p != '\0') )
407 p++;
408 while ( *p == ' ' )
409 p++;
410 return p;
411 }
/*
 * Main C entry point, called from the assembly boot path with the physical
 * address of the multiboot info block.  Phases: command-line parsing,
 * early console, E820 discovery, Xen/module relocation, boot allocator,
 * subsystem bring-up, AP boot, and finally dom0 construction.
 *
 * NOTE(review): this dump has dropped blank lines throughout, and from
 * content line ~1008 onwards it has also dropped brace-only lines; the
 * tail of this function is therefore missing '{'/'}' lines as rendered.
 */
413 void __init __start_xen(unsigned long mbi_p)
414 {
415 char *memmap_type = NULL;
416 char *cmdline, *kextra;
417 unsigned long _initrd_start = 0, _initrd_len = 0;
418 unsigned int initrdidx = 1;
419 multiboot_info_t *mbi = __va(mbi_p);
420 module_t *mod = (module_t *)__va(mbi->mods_addr);
421 unsigned long nr_pages, modules_length, modules_headroom;
422 int i, j, e820_warn = 0, bytes = 0;
423 struct ns16550_defaults ns16550 = {
424 .data_bits = 8,
425 .parity = 'n',
426 .stop_bits = 1
427 };
/* Catch page faults before the real trap tables are installed. */
429 extern void early_page_fault(void);
430 set_intr_gate(TRAP_page_fault, &early_page_fault);
432 /* Parse the command-line options. */
433 cmdline = cmdline_cook((mbi->flags & MBI_CMDLINE) ?
434 __va(mbi->cmdline) : NULL);
435 if ( (kextra = strstr(cmdline, " -- ")) != NULL )
436 {
437 /*
438 * Options after ' -- ' separator belong to dom0.
439 * 1. Orphan dom0's options from Xen's command line.
440 * 2. Skip all but final leading space from dom0's options.
441 */
442 *kextra = '\0';
443 kextra += 3;
444 while ( kextra[1] == ' ' ) kextra++;
445 }
446 cmdline_parse(cmdline);
448 parse_video_info();
450 set_current((struct vcpu *)0xfffff000); /* debug sanity */
451 idle_vcpu[0] = current;
452 set_processor_id(0); /* needed early, for smp_processor_id() */
453 if ( cpu_has_efer )
454 rdmsrl(MSR_EFER, this_cpu(efer));
455 asm volatile ( "mov %%cr4,%0" : "=r" (this_cpu(cr4)) );
457 smp_prepare_boot_cpu();
459 /* We initialise the serial devices very early so we can get debugging. */
460 ns16550.io_base = 0x3f8;
461 ns16550.irq = 4;
462 ns16550_init(0, &ns16550);
463 ns16550.io_base = 0x2f8;
464 ns16550.irq = 3;
465 ns16550_init(1, &ns16550);
466 console_init_preirq();
468 printk("Command line: %s\n", cmdline);
470 printk("Video information:\n");
472 /* Print VGA display mode information. */
473 switch ( vga_console_info.video_type )
474 {
475 case XEN_VGATYPE_TEXT_MODE_3:
476 printk(" VGA is text mode %dx%d, font 8x%d\n",
477 vga_console_info.u.text_mode_3.columns,
478 vga_console_info.u.text_mode_3.rows,
479 vga_console_info.u.text_mode_3.font_height);
480 break;
481 case XEN_VGATYPE_VESA_LFB:
482 printk(" VGA is graphics mode %dx%d, %d bpp\n",
483 vga_console_info.u.vesa_lfb.width,
484 vga_console_info.u.vesa_lfb.height,
485 vga_console_info.u.vesa_lfb.bits_per_pixel);
486 break;
487 default:
488 printk(" No VGA detected\n");
489 break;
490 }
492 /* Print VBE/DDC EDID information. */
493 if ( bootsym(boot_edid_caps) != 0x1313 )
494 {
495 u16 caps = bootsym(boot_edid_caps);
496 printk(" VBE/DDC methods:%s%s%s; ",
497 (caps & 1) ? " V1" : "",
498 (caps & 2) ? " V2" : "",
499 !(caps & 3) ? " none" : "");
500 printk("EDID transfer time: %d seconds\n", caps >> 8);
501 if ( *(u32 *)bootsym(boot_edid_info) == 0x13131313 )
502 {
503 printk(" EDID info not retrieved because ");
504 if ( !(caps & 3) )
505 printk("no DDC retrieval method detected\n");
506 else if ( (caps >> 8) > 5 )
507 printk("takes longer than 5 seconds\n");
508 else
509 printk("of reasons unknown\n");
510 }
511 }
513 printk("Disc information:\n");
514 printk(" Found %d MBR signatures\n",
515 bootsym(boot_mbr_signature_nr));
516 printk(" Found %d EDD information structures\n",
517 bootsym(boot_edd_info_nr));
519 /* Check that we have at least one Multiboot module. */
520 if ( !(mbi->flags & MBI_MODULES) || (mbi->mods_count == 0) )
521 EARLY_FAIL("dom0 kernel not specified. "
522 "Check bootloader configuration.\n");
524 if ( ((unsigned long)cpu0_stack & (STACK_SIZE-1)) != 0 )
525 EARLY_FAIL("Misaligned CPU0 stack.\n");
/* Build the raw E820 map, preferring the richest source available. */
527 if ( e820_raw_nr != 0 )
528 {
529 memmap_type = "Xen-e820";
530 }
531 else if ( bootsym(lowmem_kb) )
532 {
533 memmap_type = "Xen-e801";
534 e820_raw[0].addr = 0;
535 e820_raw[0].size = bootsym(lowmem_kb) << 10;
536 e820_raw[0].type = E820_RAM;
537 e820_raw[1].addr = 0x100000;
538 e820_raw[1].size = bootsym(highmem_kb) << 10;
539 e820_raw[1].type = E820_RAM;
540 e820_raw_nr = 2;
541 }
542 else if ( mbi->flags & MBI_MEMMAP )
543 {
544 memmap_type = "Multiboot-e820";
545 while ( (bytes < mbi->mmap_length) && (e820_raw_nr < E820MAX) )
546 {
547 memory_map_t *map = __va(mbi->mmap_addr + bytes);
549 /*
550 * This is a gross workaround for a BIOS bug. Some bootloaders do
551 * not write e820 map entries into pre-zeroed memory. This is
552 * okay if the BIOS fills in all fields of the map entry, but
553 * some broken BIOSes do not bother to write the high word of
554 * the length field if the length is smaller than 4GB. We
555 * detect and fix this by flagging sections below 4GB that
556 * appear to be larger than 4GB in size.
557 */
558 if ( (map->base_addr_high == 0) && (map->length_high != 0) )
559 {
560 if ( !e820_warn )
561 {
562 printk("WARNING: Buggy e820 map detected and fixed "
563 "(truncated length fields).\n");
564 e820_warn = 1;
565 }
566 map->length_high = 0;
567 }
569 e820_raw[e820_raw_nr].addr =
570 ((u64)map->base_addr_high << 32) | (u64)map->base_addr_low;
571 e820_raw[e820_raw_nr].size =
572 ((u64)map->length_high << 32) | (u64)map->length_low;
573 e820_raw[e820_raw_nr].type = map->type;
574 e820_raw_nr++;
/* Multiboot's 'size' field excludes itself; +4 to reach the next entry. */
576 bytes += map->size + 4;
577 }
578 }
579 else if ( mbi->flags & MBI_MEMLIMITS )
580 {
581 memmap_type = "Multiboot-e801";
582 e820_raw[0].addr = 0;
583 e820_raw[0].size = mbi->mem_lower << 10;
584 e820_raw[0].type = E820_RAM;
585 e820_raw[1].addr = 0x100000;
586 e820_raw[1].size = mbi->mem_upper << 10;
587 e820_raw[1].type = E820_RAM;
588 e820_raw_nr = 2;
589 }
590 else
591 {
592 EARLY_FAIL("Bootloader provided no memory information.\n");
593 }
595 /* Sanitise the raw E820 map to produce a final clean version. */
596 max_page = init_e820(memmap_type, e820_raw, &e820_raw_nr);
598 /* Create a temporary copy of the E820 map. */
599 memcpy(&boot_e820, &e820, sizeof(e820));
601 /* Early kexec reservation (explicit static start address). */
602 kexec_reserve_area(&boot_e820);
604 /*
605 * Iterate backwards over all superpage-aligned RAM regions.
606 *
607 * We require superpage alignment because the boot allocator is not yet
608 * initialised. Hence we can only map superpages in the address range
609 * 0 to BOOTSTRAP_DIRECTMAP_END, as this is guaranteed not to require
610 * dynamic allocation of pagetables.
611 *
612 * As well as mapping superpages in that range, in preparation for
613 * initialising the boot allocator, we also look for a region to which
614 * we can relocate the dom0 kernel and other multiboot modules. Also, on
615 * x86/64, we relocate Xen to higher memory.
616 */
617 modules_length = 0;
618 for ( i = 0; i < mbi->mods_count; i++ )
619 modules_length += mod[i].mod_end - mod[i].mod_start;
621 /* ensure mod[0] is mapped before parsing */
622 bootstrap_map(mod[0].mod_start, mod[0].mod_end);
623 modules_headroom = bzimage_headroom(
624 (char *)(unsigned long)mod[0].mod_start,
625 (unsigned long)(mod[0].mod_end - mod[0].mod_start));
627 for ( i = boot_e820.nr_map-1; i >= 0; i-- )
628 {
629 uint64_t s, e, mask = (1UL << L2_PAGETABLE_SHIFT) - 1;
631 /* Superpage-aligned chunks from 16MB to BOOTSTRAP_DIRECTMAP_END. */
632 s = (boot_e820.map[i].addr + mask) & ~mask;
633 e = (boot_e820.map[i].addr + boot_e820.map[i].size) & ~mask;
634 s = max_t(uint64_t, s, 16 << 20);
635 e = min_t(uint64_t, e, BOOTSTRAP_DIRECTMAP_END);
636 if ( (boot_e820.map[i].type != E820_RAM) || (s >= e) )
637 continue;
639 /* Map the chunk. No memory will need to be allocated to do this. */
640 map_pages_to_xen(
641 (unsigned long)maddr_to_bootstrap_virt(s),
642 s >> PAGE_SHIFT, (e-s) >> PAGE_SHIFT, PAGE_HYPERVISOR);
644 #if defined(CONFIG_X86_64)
645 /* Relocate Xen image, allocation bitmap, and one page of padding. */
646 #define reloc_size ((__pa(&_end) + max_page/8 + PAGE_SIZE + mask) & ~mask)
647 /* Is the region suitable for relocating Xen? */
648 if ( !xen_phys_start && ((e-s) >= reloc_size) )
649 {
650 extern l2_pgentry_t l2_xenmap[];
651 l4_pgentry_t *pl4e;
652 l3_pgentry_t *pl3e;
653 l2_pgentry_t *pl2e;
654 int i, j, k;
656 /* Select relocation address. */
657 e -= reloc_size;
658 xen_phys_start = e;
659 bootsym(trampoline_xen_phys_start) = e;
661 /*
662 * Perform relocation to new physical address.
663 * Before doing so we must sync static/global data with main memory
664 * with a barrier(). After this we must *not* modify static/global
665 * data until after we have switched to the relocated pagetables!
666 */
667 barrier();
668 move_memory(e, 0, __pa(&_end) - xen_phys_start);
670 /* Poison low 1MB to detect stray pointers to physical 0-1MB. */
671 memset(maddr_to_bootstrap_virt(e), 0x55, 1U<<20);
673 /* Walk initial pagetables, relocating page directory entries. */
674 pl4e = __va(__pa(idle_pg_table));
675 for ( i = 0 ; i < L4_PAGETABLE_ENTRIES; i++, pl4e++ )
676 {
677 if ( !(l4e_get_flags(*pl4e) & _PAGE_PRESENT) )
678 continue;
679 *pl4e = l4e_from_intpte(l4e_get_intpte(*pl4e) +
680 xen_phys_start);
681 pl3e = l4e_to_l3e(*pl4e);
682 for ( j = 0; j < L3_PAGETABLE_ENTRIES; j++, pl3e++ )
683 {
684 /* Not present, 1GB mapping, or already relocated? */
685 if ( !(l3e_get_flags(*pl3e) & _PAGE_PRESENT) ||
686 (l3e_get_flags(*pl3e) & _PAGE_PSE) ||
687 (l3e_get_pfn(*pl3e) > 0x1000) )
688 continue;
689 *pl3e = l3e_from_intpte(l3e_get_intpte(*pl3e) +
690 xen_phys_start);
691 pl2e = l3e_to_l2e(*pl3e);
692 for ( k = 0; k < L2_PAGETABLE_ENTRIES; k++, pl2e++ )
693 {
694 /* Not present, PSE, or already relocated? */
695 if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) ||
696 (l2e_get_flags(*pl2e) & _PAGE_PSE) ||
697 (l2e_get_pfn(*pl2e) > 0x1000) )
698 continue;
699 *pl2e = l2e_from_intpte(l2e_get_intpte(*pl2e) +
700 xen_phys_start);
701 }
702 }
703 }
705 /* The only data mappings to be relocated are in the Xen area. */
706 pl2e = __va(__pa(l2_xenmap));
707 *pl2e++ = l2e_from_pfn(xen_phys_start >> PAGE_SHIFT,
708 PAGE_HYPERVISOR | _PAGE_PSE);
709 for ( i = 1; i < L2_PAGETABLE_ENTRIES; i++, pl2e++ )
710 {
711 if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) )
712 continue;
713 *pl2e = l2e_from_intpte(l2e_get_intpte(*pl2e) +
714 xen_phys_start);
715 }
717 /* Re-sync the stack and then switch to relocated pagetables. */
718 asm volatile (
719 "rep movsb ; " /* re-sync the stack */
720 "movq %%cr4,%%rsi ; "
721 "andb $0x7f,%%sil ; "
722 "movq %%rsi,%%cr4 ; " /* CR4.PGE == 0 */
723 "movq %0,%%cr3 ; " /* CR3 == new pagetables */
724 "orb $0x80,%%sil ; "
725 "movq %%rsi,%%cr4 " /* CR4.PGE == 1 */
726 : : "r" (__pa(idle_pg_table)), "S" (cpu0_stack),
727 "D" (__va(__pa(cpu0_stack))), "c" (STACK_SIZE) : "memory" );
728 }
729 #endif
731 /* Is the region suitable for relocating the multiboot modules? */
732 if ( !initial_images_start && (s < e) &&
733 ((e-s) >= (modules_length+modules_headroom)) )
734 {
735 initial_images_end = e;
736 e = (e - modules_length) & PAGE_MASK;
737 initial_images_start = e;
738 e -= modules_headroom;
739 initial_images_base = e;
740 e += modules_length + modules_headroom;
/* Copy modules highest-first so in-place overlaps cannot corrupt them. */
741 for ( j = mbi->mods_count-1; j >= 0; j-- )
742 {
743 e -= mod[j].mod_end - mod[j].mod_start;
744 move_memory(e, mod[j].mod_start, mod[j].mod_end);
745 mod[j].mod_end += e - mod[j].mod_start;
746 mod[j].mod_start = e;
747 }
748 }
750 if ( !kexec_crash_area.start && (s < e) &&
751 ((e-s) >= kexec_crash_area.size) )
752 {
753 e = (e - kexec_crash_area.size) & PAGE_MASK;
754 kexec_crash_area.start = e;
755 }
756 }
758 if ( !initial_images_start )
759 EARLY_FAIL("Not enough memory to relocate the dom0 kernel image.\n")
760 reserve_e820_ram(&boot_e820, initial_images_base, initial_images_end);
762 /* Initialise boot heap. */
763 allocator_bitmap_end = init_boot_allocator(__pa(&_end));
764 #if defined(CONFIG_X86_32)
765 xenheap_initial_phys_start = allocator_bitmap_end;
766 xenheap_phys_end = DIRECTMAP_MBYTES << 20;
767 #else
768 if ( !xen_phys_start )
769 EARLY_FAIL("Not enough memory to relocate Xen.\n");
770 reserve_e820_ram(&boot_e820, __pa(&_start), allocator_bitmap_end);
771 #endif
773 /* Late kexec reservation (dynamic start address). */
774 kexec_reserve_area(&boot_e820);
776 /*
777 * With the boot allocator now initialised, we can walk every RAM region
778 * and map it in its entirety (on x86/64, at least) and notify it to the
779 * boot allocator.
780 */
781 for ( i = 0; i < boot_e820.nr_map; i++ )
782 {
783 uint64_t s, e, map_s, map_e, mask = PAGE_SIZE - 1;
785 /* Only page alignment required now. */
786 s = (boot_e820.map[i].addr + mask) & ~mask;
787 e = (boot_e820.map[i].addr + boot_e820.map[i].size) & ~mask;
788 #if defined(CONFIG_X86_32)
789 s = max_t(uint64_t, s, xenheap_phys_end);
790 #else
791 s = max_t(uint64_t, s, 1<<20);
792 #endif
793 if ( (boot_e820.map[i].type != E820_RAM) || (s >= e) )
794 continue;
796 /* Need to create mappings above 16MB. */
797 map_s = max_t(uint64_t, s, 16<<20);
798 map_e = e;
799 #if defined(CONFIG_X86_32) /* mappings are truncated on x86_32 */
800 map_e = min_t(uint64_t, map_e, BOOTSTRAP_DIRECTMAP_END);
801 #endif
803 /* Pass mapped memory to allocator /before/ creating new mappings. */
804 init_boot_pages(s, min_t(uint64_t, map_s, e));
806 /* Create new mappings /before/ passing memory to the allocator. */
807 if ( map_s < map_e )
808 map_pages_to_xen(
809 (unsigned long)maddr_to_bootstrap_virt(map_s),
810 map_s >> PAGE_SHIFT, (map_e-map_s) >> PAGE_SHIFT,
811 PAGE_HYPERVISOR);
813 /* Pass remainder of this memory chunk to the allocator. */
814 init_boot_pages(map_s, e);
815 }
817 memguard_init();
819 nr_pages = 0;
820 for ( i = 0; i < e820.nr_map; i++ )
821 if ( e820.map[i].type == E820_RAM )
822 nr_pages += e820.map[i].size >> PAGE_SHIFT;
823 printk("System RAM: %luMB (%lukB)\n",
824 nr_pages >> (20 - PAGE_SHIFT),
825 nr_pages << (PAGE_SHIFT - 10));
826 total_pages = nr_pages;
828 /* Sanity check for unwanted bloat of certain hypercall structures. */
829 BUILD_BUG_ON(sizeof(((struct xen_platform_op *)0)->u) !=
830 sizeof(((struct xen_platform_op *)0)->u.pad));
831 BUILD_BUG_ON(sizeof(((struct xen_domctl *)0)->u) !=
832 sizeof(((struct xen_domctl *)0)->u.pad));
833 BUILD_BUG_ON(sizeof(((struct xen_sysctl *)0)->u) !=
834 sizeof(((struct xen_sysctl *)0)->u.pad));
836 BUILD_BUG_ON(sizeof(start_info_t) > PAGE_SIZE);
837 BUILD_BUG_ON(sizeof(shared_info_t) > PAGE_SIZE);
838 BUILD_BUG_ON(sizeof(struct vcpu_info) != 64);
840 #ifdef CONFIG_COMPAT
841 BUILD_BUG_ON(sizeof(((struct compat_platform_op *)0)->u) !=
842 sizeof(((struct compat_platform_op *)0)->u.pad));
843 BUILD_BUG_ON(sizeof(start_info_compat_t) > PAGE_SIZE);
844 BUILD_BUG_ON(sizeof(struct compat_vcpu_info) != 64);
845 #endif
847 /* Check definitions in public headers match internal defs. */
848 BUILD_BUG_ON(__HYPERVISOR_VIRT_START != HYPERVISOR_VIRT_START);
849 #ifdef HYPERVISOR_VIRT_END
850 BUILD_BUG_ON(__HYPERVISOR_VIRT_END != HYPERVISOR_VIRT_END);
851 #endif
852 BUILD_BUG_ON(MACH2PHYS_VIRT_START != RO_MPT_VIRT_START);
853 BUILD_BUG_ON(MACH2PHYS_VIRT_END != RO_MPT_VIRT_END);
/* Bring up memory management, NUMA, and the remaining core subsystems. */
855 init_frametable();
857 acpi_boot_table_init();
859 acpi_numa_init();
861 numa_initmem_init(0, max_page);
863 #if defined(CONFIG_X86_32)
864 /* Initialise the Xen heap. */
865 init_xenheap_pages(xenheap_initial_phys_start, xenheap_phys_end);
866 nr_pages = (xenheap_phys_end - xenheap_initial_phys_start) >> PAGE_SHIFT;
867 printk("Xen heap: %luMB (%lukB)\n",
868 nr_pages >> (20 - PAGE_SHIFT),
869 nr_pages << (PAGE_SHIFT - 10));
870 #endif
872 end_boot_allocator();
873 early_boot = 0;
875 #if defined(CONFIG_X86_64)
876 vesa_init();
877 #endif
879 softirq_init();
881 early_cpu_init();
883 paging_init();
885 tboot_probe();
887 /* Unmap the first page of CPU0's stack. */
888 memguard_guard_stack(cpu0_stack);
890 open_softirq(NEW_TLBFLUSH_CLOCK_PERIOD_SOFTIRQ, new_tlbflush_clock_period);
892 if ( opt_watchdog )
893 nmi_watchdog = NMI_LOCAL_APIC;
895 sort_exception_tables();
897 find_smp_config();
899 dmi_scan_machine();
901 generic_apic_probe();
903 if ( x2apic_is_available() )
904 enable_x2apic();
906 acpi_boot_init();
908 init_cpu_to_node();
910 if ( smp_found_config )
911 get_smp_config();
913 #ifdef CONFIG_X86_64
914 /* Low mappings were only needed for some BIOS table parsing. */
915 zap_low_mappings();
916 #endif
918 init_apic_mappings();
920 init_IRQ();
922 percpu_init_areas();
924 xsm_init(&initrdidx, mbi, initial_images_start);
926 init_idle_domain();
928 trap_init();
930 rcu_init();
932 timer_init();
934 early_time_init();
936 arch_init_memory();
938 identify_cpu(&boot_cpu_data);
939 if ( cpu_has_fxsr )
940 set_in_cr4(X86_CR4_OSFXSR);
941 if ( cpu_has_xmm )
942 set_in_cr4(X86_CR4_OSXMMEXCPT);
944 local_irq_enable();
946 #ifdef CONFIG_X86_64
947 vesa_mtrr_init();
948 #endif
950 if ( opt_nosmp )
951 max_cpus = 0;
953 smp_prepare_cpus(max_cpus);
955 spin_debug_enable();
957 /*
958 * Initialise higher-level timer functions. We do this fairly late
959 * (post-SMP) because the time bases and scale factors need to be updated
960 * regularly, and SMP initialisation can cause a long delay with
961 * interrupts not yet enabled.
962 */
963 init_xen_time();
965 initialize_keytable();
967 console_init_postirq();
/* Bring secondary CPUs online, up to the max_cpus limit. */
969 for_each_present_cpu ( i )
970 {
971 if ( num_online_cpus() >= max_cpus )
972 break;
973 if ( !cpu_online(i) )
974 {
975 rcu_online_cpu(i);
976 __cpu_up(i);
977 }
979 /* Set up cpu_to_node[]. */
980 srat_detect_node(i);
981 /* Set up node_to_cpumask based on cpu_to_node[]. */
982 numa_add_cpu(i);
983 }
985 printk("Brought up %ld CPUs\n", (long)num_online_cpus());
986 smp_cpus_done(max_cpus);
988 initialise_gdb(); /* could be moved earlier */
990 do_initcalls();
992 if ( opt_watchdog )
993 watchdog_enable();
995 if ( !tboot_protect_mem_regions() )
996 panic("Could not protect TXT memory regions\n");
998 /* Create initial domain 0. */
999 dom0 = domain_create(0, DOMCRF_s3_integrity, DOM0_SSIDREF);
1000 if ( (dom0 == NULL) || (alloc_vcpu(dom0, 0, 0) == NULL) )
1001 panic("Error creating domain 0\n");
1003 dom0->is_privileged = 1;
1004 dom0->target = NULL;
/* NOTE(review): from here on the dump has dropped brace-only lines
 * (e.g. original lines 1009, 1026, 1029, ...) — compare against the
 * repository before editing this region. */
1006 /* Grab the DOM0 command line. */
1007 cmdline = (char *)(mod[0].string ? __va(mod[0].string) : NULL);
1008 if ( (cmdline != NULL) || (kextra != NULL) )
1010 static char dom0_cmdline[MAX_GUEST_CMDLINE];
1012 cmdline = cmdline_cook(cmdline);
1013 safe_strcpy(dom0_cmdline, cmdline);
1015 if ( kextra != NULL )
1016 /* kextra always includes exactly one leading space. */
1017 safe_strcat(dom0_cmdline, kextra);
1019 /* Append any extra parameters. */
1020 if ( skip_ioapic_setup && !strstr(dom0_cmdline, "noapic") )
1021 safe_strcat(dom0_cmdline, " noapic");
1022 if ( acpi_skip_timer_override &&
1023 !strstr(dom0_cmdline, "acpi_skip_timer_override") )
1024 safe_strcat(dom0_cmdline, " acpi_skip_timer_override");
1025 if ( (strlen(acpi_param) == 0) && acpi_disabled )
1027 printk("ACPI is disabled, notifying Domain 0 (acpi=off)\n");
1028 safe_strcpy(acpi_param, "off");
1030 if ( (strlen(acpi_param) != 0) && !strstr(dom0_cmdline, "acpi=") )
1032 safe_strcat(dom0_cmdline, " acpi=");
1033 safe_strcat(dom0_cmdline, acpi_param);
1036 cmdline = dom0_cmdline;
1039 if ( (initrdidx > 0) && (initrdidx < mbi->mods_count) )
1041 _initrd_start = mod[initrdidx].mod_start;
1042 _initrd_len = mod[initrdidx].mod_end - mod[initrdidx].mod_start;
1045 if ( xen_cpuidle )
1046 xen_processor_pmbits |= XEN_PROCESSOR_PM_CX;
1048 /*
1049 * We're going to setup domain0 using the module(s) that we stashed safely
1050 * above our heap. The second module, if present, is an initrd ramdisk.
1051 */
1052 if ( construct_dom0(dom0,
1053 initial_images_base,
1054 initial_images_start,
1055 mod[0].mod_end-mod[0].mod_start,
1056 _initrd_start,
1057 _initrd_len,
1058 cmdline) != 0)
1059 panic("Could not set up DOM0 guest OS\n");
1061 /* Scrub RAM that is still free and so may go to an unprivileged domain. */
1062 scrub_heap_pages();
1064 init_trace_bufs();
1066 console_endboot();
1068 /* Hide UART from DOM0 if we're using it */
1069 serial_endboot();
1071 domain_unpause_by_systemcontroller(dom0);
/* Discard the boot stack frame and continue in init_done(). */
1073 reset_stack_and_jump(init_done);
/*
 * Fill *info with the space-separated list of guest ABIs this hypervisor
 * can host ("xen-3.0-*" plus "hvm-3.0-*" variants when HVM is enabled).
 * NOTE(review): brace-only lines were dropped from this dump; the function
 * and if-statement bodies are missing their '{'/'}' lines as rendered.
 */
1076 void arch_get_xen_caps(xen_capabilities_info_t *info)
1078 /* Interface name is always xen-3.0-* for Xen-3.x. */
1079 int major = 3, minor = 0;
1080 char s[32];
1082 (*info)[0] = '\0';
1084 #if defined(CONFIG_X86_32)
1086 snprintf(s, sizeof(s), "xen-%d.%d-x86_32p ", major, minor);
1087 safe_strcat(*info, s);
1088 if ( hvm_enabled )
1090 snprintf(s, sizeof(s), "hvm-%d.%d-x86_32 ", major, minor);
1091 safe_strcat(*info, s);
1092 snprintf(s, sizeof(s), "hvm-%d.%d-x86_32p ", major, minor);
1093 safe_strcat(*info, s);
1096 #elif defined(CONFIG_X86_64)
1098 snprintf(s, sizeof(s), "xen-%d.%d-x86_64 ", major, minor);
1099 safe_strcat(*info, s);
1100 #ifdef CONFIG_COMPAT
1101 snprintf(s, sizeof(s), "xen-%d.%d-x86_32p ", major, minor);
1102 safe_strcat(*info, s);
1103 #endif
1104 if ( hvm_enabled )
1106 snprintf(s, sizeof(s), "hvm-%d.%d-x86_32 ", major, minor);
1107 safe_strcat(*info, s);
1108 snprintf(s, sizeof(s), "hvm-%d.%d-x86_32p ", major, minor);
1109 safe_strcat(*info, s);
1110 snprintf(s, sizeof(s), "hvm-%d.%d-x86_64 ", major, minor);
1111 safe_strcat(*info, s);
1114 #endif
/*
 * Return 1 if [start, end) overlaps any memory region owned by Xen itself
 * (trampoline, text/data, per-cpu area, bss + allocator bitmap, frametable),
 * else 0.  The region table is computed lazily on first call.
 * NOTE(review): brace-only lines were dropped from this dump; the function,
 * if, and for bodies are missing their '{'/'}' lines as rendered.
 */
1117 int xen_in_range(paddr_t start, paddr_t end)
1119 int i;
1120 static struct {
1121 paddr_t s, e;
1122 } xen_regions[5];
1124 /* initialize first time */
1125 if ( !xen_regions[0].s )
1127 extern char __init_begin[], __per_cpu_start[], __per_cpu_end[],
1128 __bss_start[];
1129 extern unsigned long allocator_bitmap_end;
1131 /* S3 resume code (and other real mode trampoline code) */
1132 xen_regions[0].s = bootsym_phys(trampoline_start);
1133 xen_regions[0].e = bootsym_phys(trampoline_end);
1134 /* hypervisor code + data */
1135 xen_regions[1].s =__pa(&_stext);
1136 xen_regions[1].e = __pa(&__init_begin);
1137 /* per-cpu data */
1138 xen_regions[2].s = __pa(&__per_cpu_start);
1139 xen_regions[2].e = __pa(&__per_cpu_end);
1140 /* bss + boot allocator bitmap */
1141 xen_regions[3].s = __pa(&__bss_start);
1142 xen_regions[3].e = allocator_bitmap_end;
1143 /* frametable */
1144 xen_regions[4].s = (unsigned long)frame_table;
1145 xen_regions[4].e = (unsigned long)frame_table +
1146 PFN_UP(max_page * sizeof(*frame_table));
/* Half-open interval overlap test against each region. */
1149 for ( i = 0; i < ARRAY_SIZE(xen_regions); i++ )
1151 if ( (start < xen_regions[i].e) && (end > xen_regions[i].s) )
1152 return 1;
1155 return 0;
1158 /*
1159 * Local variables:
1160 * mode: C
1161 * c-set-style: "BSD"
1162 * c-basic-offset: 4
1163 * tab-width: 4
1164 * indent-tabs-mode: nil
1165 * End:
1166 */