ia64/xen-unstable

xen/arch/x86/setup.c @ 19788:2f9e1348aa98

x86_64: allow more vCPU-s per guest

Since the shared info layout is fixed, guests are required to use
VCPUOP_register_vcpu_info prior to booting any vCPU beyond the
traditional limit of 32.
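
For illustration, a guest would register a vcpu_info area along these
lines before bringing up such a vCPU (a minimal sketch against the
public vcpu.h interface; the virt_to_mfn() helper and the
HYPERVISOR_vcpu_op() hypercall wrapper are assumed to be provided by
the guest environment):

    /* Sketch only: register a vcpu_info area for 'vcpu' before it is
     * brought online.  Mandatory for vCPUs >= 32, which have no slot
     * in the fixed shared_info layout. */
    static int register_vcpu_info(unsigned int vcpu, struct vcpu_info *v)
    {
        struct vcpu_register_vcpu_info info;

        info.mfn    = virt_to_mfn(v);                     /* frame holding v */
        info.offset = (unsigned long)v & (PAGE_SIZE - 1); /* offset within it */
        info.rsvd   = 0;

        return HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_info, vcpu, &info);
    }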

MAX_VIRT_CPUS, being an implementation detail of the hypervisor, is no
longer exposed in the public headers.

The tools changes are clearly incomplete (done only so things would
build again), and the current state of the tools (using scalar
variables all over the place to represent vCPU bitmaps) very likely
doesn't permit booting DomU-s with more than the traditional number of
vCPU-s. Testing of the extended functionality was done with Dom0 (96
vCPU-s, as well as 128 vCPU-s of which the kernel elected - by way of
a simple kernel-side patch - to use only some, resulting in a sparse
bitmap).

The ia64 changes are only there to keep things building, and are
build-tested only (the tools part only as far as the build would go
before hitting unrelated problems in the blktap code).

Signed-off-by: Jan Beulich <jbeulich@novell.com>
author Keir Fraser <keir.fraser@citrix.com>
date Thu Jun 18 10:14:16 2009 +0100 (2009-06-18)
parents f210a633571c
#include <xen/config.h>
#include <xen/init.h>
#include <xen/lib.h>
#include <xen/sched.h>
#include <xen/domain.h>
#include <xen/serial.h>
#include <xen/softirq.h>
#include <xen/acpi.h>
#include <xen/console.h>
#include <xen/serial.h>
#include <xen/trace.h>
#include <xen/multiboot.h>
#include <xen/domain_page.h>
#include <xen/version.h>
#include <xen/gdbstub.h>
#include <xen/percpu.h>
#include <xen/hypercall.h>
#include <xen/keyhandler.h>
#include <xen/numa.h>
#include <xen/rcupdate.h>
#include <xen/vga.h>
#include <xen/dmi.h>
#include <public/version.h>
#ifdef CONFIG_COMPAT
#include <compat/platform.h>
#include <compat/xen.h>
#endif
#include <asm/bitops.h>
#include <asm/smp.h>
#include <asm/processor.h>
#include <asm/mpspec.h>
#include <asm/apic.h>
#include <asm/desc.h>
#include <asm/paging.h>
#include <asm/e820.h>
#include <xsm/acm/acm_hooks.h>
#include <xen/kexec.h>
#include <asm/edd.h>
#include <xsm/xsm.h>
#include <asm/tboot.h>

int __init bzimage_headroom(char *image_start, unsigned long image_length);

#if defined(CONFIG_X86_64)
#define BOOTSTRAP_DIRECTMAP_END (1UL << 32) /* 4GB */
#define maddr_to_bootstrap_virt(m) maddr_to_virt(m)
#else
#define BOOTSTRAP_DIRECTMAP_END (1UL << 30) /* 1GB */
#define maddr_to_bootstrap_virt(m) ((void *)(long)(m))
#endif

extern void generic_apic_probe(void);
extern void numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn);

extern u16 boot_edid_caps;
extern u8 boot_edid_info[128];
extern struct boot_video_info boot_vid_info;

/* opt_nosmp: If true, secondary processors are ignored. */
static int opt_nosmp = 0;
boolean_param("nosmp", opt_nosmp);

/* maxcpus: maximum number of CPUs to activate. */
static unsigned int max_cpus = NR_CPUS;
integer_param("maxcpus", max_cpus);

/* opt_watchdog: If true, run a watchdog NMI on each processor. */
static int opt_watchdog = 0;
boolean_param("watchdog", opt_watchdog);

/* **** Linux config option: propagated to domain0. */
/* "acpi=off":    Disables both ACPI table parsing and interpreter. */
/* "acpi=force":  Override the disable blacklist. */
/* "acpi=strict": Disables out-of-spec workarounds. */
/* "acpi=ht":     Limit ACPI just to boot-time to enable HT. */
/* "acpi=noirq":  Disables ACPI interrupt routing. */
static void parse_acpi_param(char *s);
custom_param("acpi", parse_acpi_param);

/* **** Linux config option: propagated to domain0. */
/* acpi_skip_timer_override: Skip IRQ0 overrides. */
extern int acpi_skip_timer_override;
boolean_param("acpi_skip_timer_override", acpi_skip_timer_override);

/* **** Linux config option: propagated to domain0. */
/* noapic: Disable IOAPIC setup. */
extern int skip_ioapic_setup;
boolean_param("noapic", skip_ioapic_setup);

/* **** Linux config option: propagated to domain0. */
/* xen_cpuidle: xen control cstate. */
/*static*/ int xen_cpuidle = -1;
boolean_param("cpuidle", xen_cpuidle);

int early_boot = 1;

cpumask_t cpu_present_map;

unsigned long xen_phys_start;
unsigned long allocator_bitmap_end;

#ifdef CONFIG_X86_32
/* Limits of Xen heap, used to initialise the allocator. */
unsigned long xenheap_initial_phys_start, xenheap_phys_end;
#endif

extern void arch_init_memory(void);
extern void init_IRQ(void);
extern void early_time_init(void);
extern void early_cpu_init(void);
extern void vesa_init(void);
extern void vesa_mtrr_init(void);
extern void init_tmem(void);

DEFINE_PER_CPU(struct desc_struct *, gdt_table) = boot_cpu_gdt_table;
#ifdef CONFIG_COMPAT
DEFINE_PER_CPU(struct desc_struct *, compat_gdt_table)
    = boot_cpu_compat_gdt_table;
#endif

struct tss_struct init_tss[NR_CPUS];

char __attribute__ ((__section__(".bss.stack_aligned"))) cpu0_stack[STACK_SIZE];

struct cpuinfo_x86 boot_cpu_data = { 0, 0, 0, 0, -1 };

unsigned long mmu_cr4_features = X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE;
EXPORT_SYMBOL(mmu_cr4_features);

int acpi_disabled;

int acpi_force;
char acpi_param[10] = "";
static void __init parse_acpi_param(char *s)
{
    /* Save the parameter so it can be propagated to domain0. */
    safe_strcpy(acpi_param, s);

    /* Interpret the parameter for use within Xen. */
    if ( !strcmp(s, "off") )
    {
        disable_acpi();
    }
    else if ( !strcmp(s, "force") )
    {
        acpi_force = 1;
        acpi_ht = 1;
        acpi_disabled = 0;
    }
    else if ( !strcmp(s, "strict") )
    {
        acpi_strict = 1;
    }
    else if ( !strcmp(s, "ht") )
    {
        if ( !acpi_force )
            disable_acpi();
        acpi_ht = 1;
    }
    else if ( !strcmp(s, "noirq") )
    {
        acpi_noirq_set();
    }
}

static void __init do_initcalls(void)
{
    initcall_t *call;
    for ( call = &__initcall_start; call < &__initcall_end; call++ )
        (*call)();
}

#define EARLY_FAIL(f, a...) do {                \
    printk( f , ## a );                         \
    for ( ; ; ) halt();                         \
} while (0)

static unsigned long __initdata initial_images_base;
static unsigned long __initdata initial_images_start;
static unsigned long __initdata initial_images_end;

unsigned long __init initial_images_nrpages(void)
{
    ASSERT(!(initial_images_base & ~PAGE_MASK));
    ASSERT(!(initial_images_end & ~PAGE_MASK));
    return ((initial_images_end >> PAGE_SHIFT) -
            (initial_images_base >> PAGE_SHIFT));
}

void __init discard_initial_images(void)
{
    init_domheap_pages(initial_images_base, initial_images_end);
}

extern char __per_cpu_start[], __per_cpu_data_end[], __per_cpu_end[];

static void __init percpu_init_areas(void)
{
    unsigned int i, data_size = __per_cpu_data_end - __per_cpu_start;
    unsigned int first_unused;

    BUG_ON(data_size > PERCPU_SIZE);

    /* Initialise per-cpu data area for all possible secondary CPUs. */
    for ( i = 1; (i < NR_CPUS) && cpu_possible(i); i++ )
        memcpy(__per_cpu_start + (i << PERCPU_SHIFT),
               __per_cpu_start,
               data_size);
    first_unused = i;

    /* Check that there are no holes in cpu_possible_map. */
    for ( ; i < NR_CPUS; i++ )
        BUG_ON(cpu_possible(i));

#ifndef MEMORY_GUARD
    init_xenheap_pages(__pa(__per_cpu_start) + (first_unused << PERCPU_SHIFT),
                       __pa(__per_cpu_end));
#endif
    memguard_guard_range(&__per_cpu_start[first_unused << PERCPU_SHIFT],
                         (NR_CPUS - first_unused) << PERCPU_SHIFT);
#if defined(CONFIG_X86_64)
    /* Also zap the mapping in the 1:1 area. */
    memguard_guard_range(__va(__pa(__per_cpu_start)) +
                         (first_unused << PERCPU_SHIFT),
                         (NR_CPUS - first_unused) << PERCPU_SHIFT);
#endif
}

static void __init init_idle_domain(void)
{
    struct domain *idle_domain;

    /* Domain creation requires that scheduler structures are initialised. */
    scheduler_init();

    idle_domain = domain_create(IDLE_DOMAIN_ID, 0, 0);
    if ( idle_domain == NULL )
        BUG();
    idle_domain->vcpu = idle_vcpu;
    idle_domain->max_vcpus = NR_CPUS;
    if ( alloc_vcpu(idle_domain, 0, 0) == NULL )
        BUG();

    set_current(idle_vcpu[0]);
    this_cpu(curr_vcpu) = current;

    setup_idle_pagetable();
}

static void __init srat_detect_node(int cpu)
{
    unsigned node;
    u32 apicid = x86_cpu_to_apicid[cpu];

    node = apicid_to_node[apicid];
    if ( node == NUMA_NO_NODE )
        node = 0;
    numa_set_node(cpu, node);

    if ( acpi_numa > 0 )
        printk(KERN_INFO "CPU %d APIC %d -> Node %d\n", cpu, apicid, node);
}

/*
 * Ensure a given physical memory range is present in the bootstrap mappings.
 * Use superpage mappings to ensure that pagetable memory needn't be allocated.
 */
static void __init bootstrap_map(unsigned long start, unsigned long end)
{
    unsigned long mask = (1UL << L2_PAGETABLE_SHIFT) - 1;
    start = max_t(unsigned long, start & ~mask, 16UL << 20);
    end = (end + mask) & ~mask;
    if ( start >= end )
        return;
    if ( end > BOOTSTRAP_DIRECTMAP_END )
        panic("Cannot access memory beyond end of "
              "bootstrap direct-map area\n");
    map_pages_to_xen(
        (unsigned long)maddr_to_bootstrap_virt(start),
        start >> PAGE_SHIFT, (end-start) >> PAGE_SHIFT, PAGE_HYPERVISOR);
}

static void __init move_memory(
    unsigned long dst, unsigned long src_start, unsigned long src_end)
{
    bootstrap_map(src_start, src_end);
    bootstrap_map(dst, dst + src_end - src_start);
    memmove(maddr_to_bootstrap_virt(dst),
            maddr_to_bootstrap_virt(src_start),
            src_end - src_start);
}

/* A temporary copy of the e820 map that we can mess with during bootstrap. */
static struct e820map __initdata boot_e820;

struct boot_video_info {
    u8  orig_x;             /* 0x00 */
    u8  orig_y;             /* 0x01 */
    u8  orig_video_mode;    /* 0x02 */
    u8  orig_video_cols;    /* 0x03 */
    u8  orig_video_lines;   /* 0x04 */
    u8  orig_video_isVGA;   /* 0x05 */
    u16 orig_video_points;  /* 0x06 */

    /* VESA graphic mode -- linear frame buffer */
    u32 capabilities;       /* 0x08 */
    u16 lfb_linelength;     /* 0x0c */
    u16 lfb_width;          /* 0x0e */
    u16 lfb_height;         /* 0x10 */
    u16 lfb_depth;          /* 0x12 */
    u32 lfb_base;           /* 0x14 */
    u32 lfb_size;           /* 0x18 */
    u8  red_size;           /* 0x1c */
    u8  red_pos;            /* 0x1d */
    u8  green_size;         /* 0x1e */
    u8  green_pos;          /* 0x1f */
    u8  blue_size;          /* 0x20 */
    u8  blue_pos;           /* 0x21 */
    u8  rsvd_size;          /* 0x22 */
    u8  rsvd_pos;           /* 0x23 */
    u16 vesapm_seg;         /* 0x24 */
    u16 vesapm_off;         /* 0x26 */
    u16 vesa_attrib;        /* 0x28 */
};

static void __init parse_video_info(void)
{
    struct boot_video_info *bvi = &bootsym(boot_vid_info);

    if ( (bvi->orig_video_isVGA == 1) && (bvi->orig_video_mode == 3) )
    {
        vga_console_info.video_type = XEN_VGATYPE_TEXT_MODE_3;
        vga_console_info.u.text_mode_3.font_height = bvi->orig_video_points;
        vga_console_info.u.text_mode_3.cursor_x = bvi->orig_x;
        vga_console_info.u.text_mode_3.cursor_y = bvi->orig_y;
        vga_console_info.u.text_mode_3.rows = bvi->orig_video_lines;
        vga_console_info.u.text_mode_3.columns = bvi->orig_video_cols;
    }
    else if ( bvi->orig_video_isVGA == 0x23 )
    {
        vga_console_info.video_type = XEN_VGATYPE_VESA_LFB;
        vga_console_info.u.vesa_lfb.width = bvi->lfb_width;
        vga_console_info.u.vesa_lfb.height = bvi->lfb_height;
        vga_console_info.u.vesa_lfb.bytes_per_line = bvi->lfb_linelength;
        vga_console_info.u.vesa_lfb.bits_per_pixel = bvi->lfb_depth;
        vga_console_info.u.vesa_lfb.lfb_base = bvi->lfb_base;
        vga_console_info.u.vesa_lfb.lfb_size = bvi->lfb_size;
        vga_console_info.u.vesa_lfb.red_pos = bvi->red_pos;
        vga_console_info.u.vesa_lfb.red_size = bvi->red_size;
        vga_console_info.u.vesa_lfb.green_pos = bvi->green_pos;
        vga_console_info.u.vesa_lfb.green_size = bvi->green_size;
        vga_console_info.u.vesa_lfb.blue_pos = bvi->blue_pos;
        vga_console_info.u.vesa_lfb.blue_size = bvi->blue_size;
        vga_console_info.u.vesa_lfb.rsvd_pos = bvi->rsvd_pos;
        vga_console_info.u.vesa_lfb.rsvd_size = bvi->rsvd_size;
        vga_console_info.u.vesa_lfb.gbl_caps = bvi->capabilities;
        vga_console_info.u.vesa_lfb.mode_attrs = bvi->vesa_attrib;
    }
}

void __init kexec_reserve_area(struct e820map *e820)
{
    unsigned long kdump_start = kexec_crash_area.start;
    unsigned long kdump_size = kexec_crash_area.size;
    static int is_reserved = 0;

    kdump_size = (kdump_size + PAGE_SIZE - 1) & PAGE_MASK;

    if ( (kdump_start == 0) || (kdump_size == 0) || is_reserved )
        return;

    is_reserved = 1;

    if ( !reserve_e820_ram(e820, kdump_start, kdump_start + kdump_size) )
    {
        printk("Kdump: DISABLED (failed to reserve %luMB (%lukB) at 0x%lx)"
               "\n", kdump_size >> 20, kdump_size >> 10, kdump_start);
        kexec_crash_area.start = kexec_crash_area.size = 0;
    }
    else
    {
        printk("Kdump: %luMB (%lukB) at 0x%lx\n",
               kdump_size >> 20, kdump_size >> 10, kdump_start);
    }
}

void init_done(void)
{
    extern char __init_begin[], __init_end[];

    /* Free (or page-protect) the init areas. */
    memset(__init_begin, 0xcc, __init_end - __init_begin); /* int3 poison */
#ifndef MEMORY_GUARD
    init_xenheap_pages(__pa(__init_begin), __pa(__init_end));
#endif
    memguard_guard_range(__init_begin, __init_end - __init_begin);
#if defined(CONFIG_X86_64)
    /* Also zap the mapping in the 1:1 area. */
    memguard_guard_range(__va(__pa(__init_begin)), __init_end - __init_begin);
#endif
    printk("Freed %ldkB init memory.\n", (long)(__init_end-__init_begin)>>10);

    startup_cpu_idle_loop();
}

static char * __init cmdline_cook(char *p)
{
    p = p ? : "";
    while ( *p == ' ' )
        p++;
    while ( (*p != ' ') && (*p != '\0') )
        p++;
    while ( *p == ' ' )
        p++;
    return p;
}

void __init __start_xen(unsigned long mbi_p)
{
    char *memmap_type = NULL;
    char *cmdline, *kextra;
    unsigned long _initrd_start = 0, _initrd_len = 0;
    unsigned int initrdidx = 1;
    multiboot_info_t *mbi = __va(mbi_p);
    module_t *mod = (module_t *)__va(mbi->mods_addr);
    unsigned long nr_pages, modules_length, modules_headroom;
    int i, j, e820_warn = 0, bytes = 0;
    struct ns16550_defaults ns16550 = {
        .data_bits = 8,
        .parity    = 'n',
        .stop_bits = 1
    };

    extern void early_page_fault(void);
    set_intr_gate(TRAP_page_fault, &early_page_fault);

    /* Parse the command-line options. */
    cmdline = cmdline_cook((mbi->flags & MBI_CMDLINE) ?
                           __va(mbi->cmdline) : NULL);
    if ( (kextra = strstr(cmdline, " -- ")) != NULL )
    {
        /*
         * Options after ' -- ' separator belong to dom0.
         *  1. Orphan dom0's options from Xen's command line.
         *  2. Skip all but final leading space from dom0's options.
         */
        *kextra = '\0';
        kextra += 3;
        while ( kextra[1] == ' ' ) kextra++;
    }
    cmdline_parse(cmdline);

    parse_video_info();

    set_current((struct vcpu *)0xfffff000); /* debug sanity */
    idle_vcpu[0] = current;
    set_processor_id(0); /* needed early, for smp_processor_id() */
    if ( cpu_has_efer )
        rdmsrl(MSR_EFER, this_cpu(efer));
    asm volatile ( "mov %%cr4,%0" : "=r" (this_cpu(cr4)) );

    smp_prepare_boot_cpu();

    /* We initialise the serial devices very early so we can get debugging. */
    ns16550.io_base = 0x3f8;
    ns16550.irq     = 4;
    ns16550_init(0, &ns16550);
    ns16550.io_base = 0x2f8;
    ns16550.irq     = 3;
    ns16550_init(1, &ns16550);
    console_init_preirq();

    printk("Command line: %s\n", cmdline);

    printk("Video information:\n");

    /* Print VGA display mode information. */
    switch ( vga_console_info.video_type )
    {
    case XEN_VGATYPE_TEXT_MODE_3:
        printk(" VGA is text mode %dx%d, font 8x%d\n",
               vga_console_info.u.text_mode_3.columns,
               vga_console_info.u.text_mode_3.rows,
               vga_console_info.u.text_mode_3.font_height);
        break;
    case XEN_VGATYPE_VESA_LFB:
        printk(" VGA is graphics mode %dx%d, %d bpp\n",
               vga_console_info.u.vesa_lfb.width,
               vga_console_info.u.vesa_lfb.height,
               vga_console_info.u.vesa_lfb.bits_per_pixel);
        break;
    default:
        printk(" No VGA detected\n");
        break;
    }

    /* Print VBE/DDC EDID information. */
    if ( bootsym(boot_edid_caps) != 0x1313 )
    {
        u16 caps = bootsym(boot_edid_caps);
        printk(" VBE/DDC methods:%s%s%s; ",
               (caps & 1) ? " V1" : "",
               (caps & 2) ? " V2" : "",
               !(caps & 3) ? " none" : "");
        printk("EDID transfer time: %d seconds\n", caps >> 8);
        if ( *(u32 *)bootsym(boot_edid_info) == 0x13131313 )
        {
            printk(" EDID info not retrieved because ");
            if ( !(caps & 3) )
                printk("no DDC retrieval method detected\n");
            else if ( (caps >> 8) > 5 )
                printk("takes longer than 5 seconds\n");
            else
                printk("of reasons unknown\n");
        }
    }

    printk("Disc information:\n");
    printk(" Found %d MBR signatures\n",
           bootsym(boot_mbr_signature_nr));
    printk(" Found %d EDD information structures\n",
           bootsym(boot_edd_info_nr));

    /* Check that we have at least one Multiboot module. */
    if ( !(mbi->flags & MBI_MODULES) || (mbi->mods_count == 0) )
        EARLY_FAIL("dom0 kernel not specified. "
                   "Check bootloader configuration.\n");

    if ( ((unsigned long)cpu0_stack & (STACK_SIZE-1)) != 0 )
        EARLY_FAIL("Misaligned CPU0 stack.\n");

    if ( e820_raw_nr != 0 )
    {
        memmap_type = "Xen-e820";
    }
    else if ( bootsym(lowmem_kb) )
    {
        memmap_type = "Xen-e801";
        e820_raw[0].addr = 0;
        e820_raw[0].size = bootsym(lowmem_kb) << 10;
        e820_raw[0].type = E820_RAM;
        e820_raw[1].addr = 0x100000;
        e820_raw[1].size = bootsym(highmem_kb) << 10;
        e820_raw[1].type = E820_RAM;
        e820_raw_nr = 2;
    }
    else if ( mbi->flags & MBI_MEMMAP )
    {
        memmap_type = "Multiboot-e820";
        while ( (bytes < mbi->mmap_length) && (e820_raw_nr < E820MAX) )
        {
            memory_map_t *map = __va(mbi->mmap_addr + bytes);

            /*
             * This is a gross workaround for a BIOS bug. Some bootloaders do
             * not write e820 map entries into pre-zeroed memory. This is
             * okay if the BIOS fills in all fields of the map entry, but
             * some broken BIOSes do not bother to write the high word of
             * the length field if the length is smaller than 4GB. We
             * detect and fix this by flagging sections below 4GB that
             * appear to be larger than 4GB in size.
             */
            if ( (map->base_addr_high == 0) && (map->length_high != 0) )
            {
                if ( !e820_warn )
                {
                    printk("WARNING: Buggy e820 map detected and fixed "
                           "(truncated length fields).\n");
                    e820_warn = 1;
                }
                map->length_high = 0;
            }

            e820_raw[e820_raw_nr].addr =
                ((u64)map->base_addr_high << 32) | (u64)map->base_addr_low;
            e820_raw[e820_raw_nr].size =
                ((u64)map->length_high << 32) | (u64)map->length_low;
            e820_raw[e820_raw_nr].type = map->type;
            e820_raw_nr++;

            bytes += map->size + 4;
        }
    }
    else if ( mbi->flags & MBI_MEMLIMITS )
    {
        memmap_type = "Multiboot-e801";
        e820_raw[0].addr = 0;
        e820_raw[0].size = mbi->mem_lower << 10;
        e820_raw[0].type = E820_RAM;
        e820_raw[1].addr = 0x100000;
        e820_raw[1].size = mbi->mem_upper << 10;
        e820_raw[1].type = E820_RAM;
        e820_raw_nr = 2;
    }
    else
    {
        EARLY_FAIL("Bootloader provided no memory information.\n");
    }

    /* Sanitise the raw E820 map to produce a final clean version. */
    max_page = init_e820(memmap_type, e820_raw, &e820_raw_nr);

    /* Create a temporary copy of the E820 map. */
    memcpy(&boot_e820, &e820, sizeof(e820));

    /* Early kexec reservation (explicit static start address). */
    kexec_reserve_area(&boot_e820);

    /*
     * Iterate backwards over all superpage-aligned RAM regions.
     *
     * We require superpage alignment because the boot allocator is not yet
     * initialised. Hence we can only map superpages in the address range
     * 0 to BOOTSTRAP_DIRECTMAP_END, as this is guaranteed not to require
     * dynamic allocation of pagetables.
     *
     * As well as mapping superpages in that range, in preparation for
     * initialising the boot allocator, we also look for a region to which
     * we can relocate the dom0 kernel and other multiboot modules. Also, on
     * x86/64, we relocate Xen to higher memory.
     */
    modules_length = 0;
    for ( i = 0; i < mbi->mods_count; i++ )
        modules_length += mod[i].mod_end - mod[i].mod_start;

    /* ensure mod[0] is mapped before parsing */
    bootstrap_map(mod[0].mod_start, mod[0].mod_end);
    modules_headroom = bzimage_headroom(
        (char *)(unsigned long)mod[0].mod_start,
        (unsigned long)(mod[0].mod_end - mod[0].mod_start));

    for ( i = boot_e820.nr_map-1; i >= 0; i-- )
    {
        uint64_t s, e, mask = (1UL << L2_PAGETABLE_SHIFT) - 1;

        /* Superpage-aligned chunks from 16MB to BOOTSTRAP_DIRECTMAP_END. */
        s = (boot_e820.map[i].addr + mask) & ~mask;
        e = (boot_e820.map[i].addr + boot_e820.map[i].size) & ~mask;
        s = max_t(uint64_t, s, 16 << 20);
        e = min_t(uint64_t, e, BOOTSTRAP_DIRECTMAP_END);
        if ( (boot_e820.map[i].type != E820_RAM) || (s >= e) )
            continue;

        /* Map the chunk. No memory will need to be allocated to do this. */
        map_pages_to_xen(
            (unsigned long)maddr_to_bootstrap_virt(s),
            s >> PAGE_SHIFT, (e-s) >> PAGE_SHIFT, PAGE_HYPERVISOR);

#if defined(CONFIG_X86_64)
        /* Relocate Xen image, allocation bitmap, and one page of padding. */
#define reloc_size ((__pa(&_end) + max_page/8 + PAGE_SIZE + mask) & ~mask)
        /* Is the region suitable for relocating Xen? */
        if ( !xen_phys_start && ((e-s) >= reloc_size) )
        {
            extern l2_pgentry_t l2_xenmap[];
            l4_pgentry_t *pl4e;
            l3_pgentry_t *pl3e;
            l2_pgentry_t *pl2e;
            int i, j, k;

            /* Select relocation address. */
            e -= reloc_size;
            xen_phys_start = e;
            bootsym(trampoline_xen_phys_start) = e;

            /*
             * Perform relocation to new physical address.
             * Before doing so we must sync static/global data with main memory
             * with a barrier(). After this we must *not* modify static/global
             * data until after we have switched to the relocated pagetables!
             */
            barrier();
            move_memory(e, 0, __pa(&_end) - xen_phys_start);

            /* Poison low 1MB to detect stray pointers to physical 0-1MB. */
            memset(maddr_to_bootstrap_virt(e), 0x55, 1U<<20);

            /* Walk initial pagetables, relocating page directory entries. */
            pl4e = __va(__pa(idle_pg_table));
            for ( i = 0 ; i < L4_PAGETABLE_ENTRIES; i++, pl4e++ )
            {
                if ( !(l4e_get_flags(*pl4e) & _PAGE_PRESENT) )
                    continue;
                *pl4e = l4e_from_intpte(l4e_get_intpte(*pl4e) +
                                        xen_phys_start);
                pl3e = l4e_to_l3e(*pl4e);
                for ( j = 0; j < L3_PAGETABLE_ENTRIES; j++, pl3e++ )
                {
                    /* Not present, 1GB mapping, or already relocated? */
                    if ( !(l3e_get_flags(*pl3e) & _PAGE_PRESENT) ||
                         (l3e_get_flags(*pl3e) & _PAGE_PSE) ||
                         (l3e_get_pfn(*pl3e) > 0x1000) )
                        continue;
                    *pl3e = l3e_from_intpte(l3e_get_intpte(*pl3e) +
                                            xen_phys_start);
                    pl2e = l3e_to_l2e(*pl3e);
                    for ( k = 0; k < L2_PAGETABLE_ENTRIES; k++, pl2e++ )
                    {
                        /* Not present, PSE, or already relocated? */
                        if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) ||
                             (l2e_get_flags(*pl2e) & _PAGE_PSE) ||
                             (l2e_get_pfn(*pl2e) > 0x1000) )
                            continue;
                        *pl2e = l2e_from_intpte(l2e_get_intpte(*pl2e) +
                                                xen_phys_start);
                    }
                }
            }

            /* The only data mappings to be relocated are in the Xen area. */
            pl2e = __va(__pa(l2_xenmap));
            *pl2e++ = l2e_from_pfn(xen_phys_start >> PAGE_SHIFT,
                                   PAGE_HYPERVISOR | _PAGE_PSE);
            for ( i = 1; i < L2_PAGETABLE_ENTRIES; i++, pl2e++ )
            {
                if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) )
                    continue;
                *pl2e = l2e_from_intpte(l2e_get_intpte(*pl2e) +
                                        xen_phys_start);
            }

            /* Re-sync the stack and then switch to relocated pagetables. */
            asm volatile (
                "rep movsb        ; " /* re-sync the stack */
                "movq %%cr4,%%rsi ; "
                "andb $0x7f,%%sil ; "
                "movq %%rsi,%%cr4 ; " /* CR4.PGE == 0 */
                "movq %0,%%cr3    ; " /* CR3 == new pagetables */
                "orb $0x80,%%sil  ; "
                "movq %%rsi,%%cr4   " /* CR4.PGE == 1 */
                : : "r" (__pa(idle_pg_table)), "S" (cpu0_stack),
                    "D" (__va(__pa(cpu0_stack))), "c" (STACK_SIZE) : "memory" );
        }
#endif

        /* Is the region suitable for relocating the multiboot modules? */
        if ( !initial_images_start && (s < e) &&
             ((e-s) >= (modules_length+modules_headroom)) )
        {
            initial_images_end = e;
            e = (e - modules_length) & PAGE_MASK;
            initial_images_start = e;
            e -= modules_headroom;
            initial_images_base = e;
            e += modules_length + modules_headroom;
            for ( j = mbi->mods_count-1; j >= 0; j-- )
            {
                e -= mod[j].mod_end - mod[j].mod_start;
                move_memory(e, mod[j].mod_start, mod[j].mod_end);
                mod[j].mod_end += e - mod[j].mod_start;
                mod[j].mod_start = e;
            }
        }

        if ( !kexec_crash_area.start && (s < e) &&
             ((e-s) >= kexec_crash_area.size) )
        {
            e = (e - kexec_crash_area.size) & PAGE_MASK;
            kexec_crash_area.start = e;
        }
    }

    if ( !initial_images_start )
        EARLY_FAIL("Not enough memory to relocate the dom0 kernel image.\n");
    reserve_e820_ram(&boot_e820, initial_images_base, initial_images_end);

    /* Initialise boot heap. */
    allocator_bitmap_end = init_boot_allocator(__pa(&_end));
#if defined(CONFIG_X86_32)
    xenheap_initial_phys_start = allocator_bitmap_end;
    xenheap_phys_end = DIRECTMAP_MBYTES << 20;
#else
    if ( !xen_phys_start )
        EARLY_FAIL("Not enough memory to relocate Xen.\n");
    reserve_e820_ram(&boot_e820, __pa(&_start), allocator_bitmap_end);
#endif

    /* Late kexec reservation (dynamic start address). */
    kexec_reserve_area(&boot_e820);

    /*
     * With the boot allocator now initialised, we can walk every RAM region
     * and map it in its entirety (on x86/64, at least) and notify it to the
     * boot allocator.
     */
    for ( i = 0; i < boot_e820.nr_map; i++ )
    {
        uint64_t s, e, map_s, map_e, mask = PAGE_SIZE - 1;

        /* Only page alignment required now. */
        s = (boot_e820.map[i].addr + mask) & ~mask;
        e = (boot_e820.map[i].addr + boot_e820.map[i].size) & ~mask;
#if defined(CONFIG_X86_32)
        s = max_t(uint64_t, s, xenheap_phys_end);
#else
        s = max_t(uint64_t, s, 1<<20);
#endif
        if ( (boot_e820.map[i].type != E820_RAM) || (s >= e) )
            continue;

        /* Need to create mappings above 16MB. */
        map_s = max_t(uint64_t, s, 16<<20);
        map_e = e;
#if defined(CONFIG_X86_32) /* mappings are truncated on x86_32 */
        map_e = min_t(uint64_t, map_e, BOOTSTRAP_DIRECTMAP_END);
#endif

        /* Pass mapped memory to allocator /before/ creating new mappings. */
        init_boot_pages(s, min_t(uint64_t, map_s, e));

        /* Create new mappings /before/ passing memory to the allocator. */
        if ( map_s < map_e )
            map_pages_to_xen(
                (unsigned long)maddr_to_bootstrap_virt(map_s),
                map_s >> PAGE_SHIFT, (map_e-map_s) >> PAGE_SHIFT,
                PAGE_HYPERVISOR);

        /* Pass remainder of this memory chunk to the allocator. */
        init_boot_pages(map_s, e);
    }

    memguard_init();

    nr_pages = 0;
    for ( i = 0; i < e820.nr_map; i++ )
        if ( e820.map[i].type == E820_RAM )
            nr_pages += e820.map[i].size >> PAGE_SHIFT;
    printk("System RAM: %luMB (%lukB)\n",
           nr_pages >> (20 - PAGE_SHIFT),
           nr_pages << (PAGE_SHIFT - 10));
    total_pages = nr_pages;

    /* Sanity check for unwanted bloat of certain hypercall structures. */
    BUILD_BUG_ON(sizeof(((struct xen_platform_op *)0)->u) !=
                 sizeof(((struct xen_platform_op *)0)->u.pad));
    BUILD_BUG_ON(sizeof(((struct xen_domctl *)0)->u) !=
                 sizeof(((struct xen_domctl *)0)->u.pad));
    BUILD_BUG_ON(sizeof(((struct xen_sysctl *)0)->u) !=
                 sizeof(((struct xen_sysctl *)0)->u.pad));

    BUILD_BUG_ON(sizeof(start_info_t) > PAGE_SIZE);
    BUILD_BUG_ON(sizeof(shared_info_t) > PAGE_SIZE);
    BUILD_BUG_ON(sizeof(struct vcpu_info) != 64);

#ifdef CONFIG_COMPAT
    BUILD_BUG_ON(sizeof(((struct compat_platform_op *)0)->u) !=
                 sizeof(((struct compat_platform_op *)0)->u.pad));
    BUILD_BUG_ON(sizeof(start_info_compat_t) > PAGE_SIZE);
    BUILD_BUG_ON(sizeof(struct compat_vcpu_info) != 64);
#endif

    /* Check definitions in public headers match internal defs. */
    BUILD_BUG_ON(__HYPERVISOR_VIRT_START != HYPERVISOR_VIRT_START);
#ifdef HYPERVISOR_VIRT_END
    BUILD_BUG_ON(__HYPERVISOR_VIRT_END != HYPERVISOR_VIRT_END);
#endif
    BUILD_BUG_ON(MACH2PHYS_VIRT_START != RO_MPT_VIRT_START);
    BUILD_BUG_ON(MACH2PHYS_VIRT_END != RO_MPT_VIRT_END);

    init_frametable();

    acpi_boot_table_init();

    acpi_numa_init();

    numa_initmem_init(0, max_page);

#if defined(CONFIG_X86_32)
    /* Initialise the Xen heap. */
    init_xenheap_pages(xenheap_initial_phys_start, xenheap_phys_end);
    nr_pages = (xenheap_phys_end - xenheap_initial_phys_start) >> PAGE_SHIFT;
    printk("Xen heap: %luMB (%lukB)\n",
           nr_pages >> (20 - PAGE_SHIFT),
           nr_pages << (PAGE_SHIFT - 10));
#endif

    end_boot_allocator();
    early_boot = 0;

#if defined(CONFIG_X86_64)
    vesa_init();
#endif

    softirq_init();

    early_cpu_init();

    paging_init();

    tboot_probe();

    /* Unmap the first page of CPU0's stack. */
    memguard_guard_stack(cpu0_stack);

    open_softirq(NEW_TLBFLUSH_CLOCK_PERIOD_SOFTIRQ, new_tlbflush_clock_period);

    if ( opt_watchdog )
        nmi_watchdog = NMI_LOCAL_APIC;

    sort_exception_tables();

    find_smp_config();

    dmi_scan_machine();

    generic_apic_probe();

    if ( x2apic_is_available() )
        enable_x2apic();

    acpi_boot_init();

    init_cpu_to_node();

    if ( smp_found_config )
        get_smp_config();

#ifdef CONFIG_X86_64
    /* Low mappings were only needed for some BIOS table parsing. */
    zap_low_mappings();
#endif

    init_apic_mappings();

    init_IRQ();

    percpu_init_areas();

    xsm_init(&initrdidx, mbi, initial_images_start);

    init_idle_domain();

    trap_init();

    rcu_init();

    timer_init();

    early_time_init();

    arch_init_memory();

    identify_cpu(&boot_cpu_data);
    if ( cpu_has_fxsr )
        set_in_cr4(X86_CR4_OSFXSR);
    if ( cpu_has_xmm )
        set_in_cr4(X86_CR4_OSXMMEXCPT);

    local_irq_enable();

#ifdef CONFIG_X86_64
    vesa_mtrr_init();
#endif

    if ( opt_nosmp )
        max_cpus = 0;

    smp_prepare_cpus(max_cpus);

    spin_debug_enable();

    /*
     * Initialise higher-level timer functions. We do this fairly late
     * (post-SMP) because the time bases and scale factors need to be updated
     * regularly, and SMP initialisation can cause a long delay with
     * interrupts not yet enabled.
     */
    init_xen_time();

    initialize_keytable();

    console_init_postirq();

    for_each_present_cpu ( i )
    {
        if ( num_online_cpus() >= max_cpus )
            break;
        if ( !cpu_online(i) )
        {
            rcu_online_cpu(i);
            __cpu_up(i);
        }

        /* Set up cpu_to_node[]. */
        srat_detect_node(i);
        /* Set up node_to_cpumask based on cpu_to_node[]. */
        numa_add_cpu(i);
    }

    printk("Brought up %ld CPUs\n", (long)num_online_cpus());
    smp_cpus_done(max_cpus);

    initialise_gdb(); /* could be moved earlier */

    do_initcalls();

    if ( opt_watchdog )
        watchdog_enable();

    if ( !tboot_protect_mem_regions() )
        panic("Could not protect TXT memory regions\n");

    /* Create initial domain 0. */
    dom0 = domain_create(0, DOMCRF_s3_integrity, DOM0_SSIDREF);
    if ( (dom0 == NULL) || (alloc_dom0_vcpu0() == NULL) )
        panic("Error creating domain 0\n");

    dom0->is_privileged = 1;
    dom0->target = NULL;

    /* Grab the DOM0 command line. */
    cmdline = (char *)(mod[0].string ? __va(mod[0].string) : NULL);
    if ( (cmdline != NULL) || (kextra != NULL) )
    {
        static char dom0_cmdline[MAX_GUEST_CMDLINE];

        cmdline = cmdline_cook(cmdline);
        safe_strcpy(dom0_cmdline, cmdline);

        if ( kextra != NULL )
            /* kextra always includes exactly one leading space. */
            safe_strcat(dom0_cmdline, kextra);

        /* Append any extra parameters. */
        if ( skip_ioapic_setup && !strstr(dom0_cmdline, "noapic") )
            safe_strcat(dom0_cmdline, " noapic");
        if ( acpi_skip_timer_override &&
             !strstr(dom0_cmdline, "acpi_skip_timer_override") )
            safe_strcat(dom0_cmdline, " acpi_skip_timer_override");
        if ( (strlen(acpi_param) == 0) && acpi_disabled )
        {
            printk("ACPI is disabled, notifying Domain 0 (acpi=off)\n");
            safe_strcpy(acpi_param, "off");
        }
        if ( (strlen(acpi_param) != 0) && !strstr(dom0_cmdline, "acpi=") )
        {
            safe_strcat(dom0_cmdline, " acpi=");
            safe_strcat(dom0_cmdline, acpi_param);
        }

        cmdline = dom0_cmdline;
    }

    if ( (initrdidx > 0) && (initrdidx < mbi->mods_count) )
    {
        _initrd_start = mod[initrdidx].mod_start;
        _initrd_len = mod[initrdidx].mod_end - mod[initrdidx].mod_start;
    }

    if ( xen_cpuidle )
        xen_processor_pmbits |= XEN_PROCESSOR_PM_CX;

    /*
     * We're going to setup domain0 using the module(s) that we stashed safely
     * above our heap. The second module, if present, is an initrd ramdisk.
     */
    if ( construct_dom0(dom0,
                        initial_images_base,
                        initial_images_start,
                        mod[0].mod_end-mod[0].mod_start,
                        _initrd_start,
                        _initrd_len,
                        cmdline) != 0)
        panic("Could not set up DOM0 guest OS\n");

    /* Scrub RAM that is still free and so may go to an unprivileged domain. */
    scrub_heap_pages();

    init_trace_bufs();

    init_tmem();

    console_endboot();

    /* Hide UART from DOM0 if we're using it */
    serial_endboot();

    domain_unpause_by_systemcontroller(dom0);

    reset_stack_and_jump(init_done);
}

void arch_get_xen_caps(xen_capabilities_info_t *info)
{
    /* Interface name is always xen-3.0-* for Xen-3.x. */
    int major = 3, minor = 0;
    char s[32];

    (*info)[0] = '\0';

#if defined(CONFIG_X86_32)

    snprintf(s, sizeof(s), "xen-%d.%d-x86_32p ", major, minor);
    safe_strcat(*info, s);
    if ( hvm_enabled )
    {
        snprintf(s, sizeof(s), "hvm-%d.%d-x86_32 ", major, minor);
        safe_strcat(*info, s);
        snprintf(s, sizeof(s), "hvm-%d.%d-x86_32p ", major, minor);
        safe_strcat(*info, s);
    }

#elif defined(CONFIG_X86_64)

    snprintf(s, sizeof(s), "xen-%d.%d-x86_64 ", major, minor);
    safe_strcat(*info, s);
#ifdef CONFIG_COMPAT
    snprintf(s, sizeof(s), "xen-%d.%d-x86_32p ", major, minor);
    safe_strcat(*info, s);
#endif
    if ( hvm_enabled )
    {
        snprintf(s, sizeof(s), "hvm-%d.%d-x86_32 ", major, minor);
        safe_strcat(*info, s);
        snprintf(s, sizeof(s), "hvm-%d.%d-x86_32p ", major, minor);
        safe_strcat(*info, s);
        snprintf(s, sizeof(s), "hvm-%d.%d-x86_64 ", major, minor);
        safe_strcat(*info, s);
    }

#endif
}

int xen_in_range(paddr_t start, paddr_t end)
{
    int i;
    static struct {
        paddr_t s, e;
    } xen_regions[4];

    /* initialize first time */
    if ( !xen_regions[0].s )
    {
        extern char __init_begin[], __bss_start[];
        extern unsigned long allocator_bitmap_end;

        /* S3 resume code (and other real mode trampoline code) */
        xen_regions[0].s = bootsym_phys(trampoline_start);
        xen_regions[0].e = bootsym_phys(trampoline_end);
        /* hypervisor code + data */
        xen_regions[1].s = __pa(&_stext);
        xen_regions[1].e = __pa(&__init_begin);
        /* per-cpu data */
        xen_regions[2].s = __pa(&__per_cpu_start);
        xen_regions[2].e = xen_regions[2].s +
            (((paddr_t)last_cpu(cpu_possible_map) + 1) << PERCPU_SHIFT);
        /* bss + boot allocator bitmap */
        xen_regions[3].s = __pa(&__bss_start);
        xen_regions[3].e = allocator_bitmap_end;
    }

    for ( i = 0; i < ARRAY_SIZE(xen_regions); i++ )
    {
        if ( (start < xen_regions[i].e) && (end > xen_regions[i].s) )
            return 1;
    }

    return 0;
}

/*
 * Local variables:
 * mode: C
 * c-set-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */