direct-io.hg

view xen/arch/x86/setup.c @ 15402:799b3e4bfeac

kernel command line extension

In order to allow appending to the dom0 command line even with boot
loaders that only allow editing the kernel (i.e. Xen in our case)
command line, support a '--' separator option.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Keir Fraser <keir@xensource.com>
author kfraser@localhost.localdomain
date Wed Jun 20 16:52:01 2007 +0100 (2007-06-20)
parents 499bab040137
children 296fd2598e00
line source
1 #include <xen/config.h>
2 #include <xen/init.h>
3 #include <xen/lib.h>
4 #include <xen/sched.h>
5 #include <xen/domain.h>
6 #include <xen/serial.h>
7 #include <xen/softirq.h>
8 #include <xen/acpi.h>
9 #include <xen/console.h>
10 #include <xen/serial.h>
11 #include <xen/trace.h>
12 #include <xen/multiboot.h>
13 #include <xen/domain_page.h>
14 #include <xen/version.h>
15 #include <xen/gdbstub.h>
16 #include <xen/percpu.h>
17 #include <xen/hypercall.h>
18 #include <xen/keyhandler.h>
19 #include <xen/numa.h>
20 #include <xen/rcupdate.h>
21 #include <xen/vga.h>
22 #include <public/version.h>
23 #ifdef CONFIG_COMPAT
24 #include <compat/platform.h>
25 #include <compat/xen.h>
26 #endif
27 #include <asm/bitops.h>
28 #include <asm/smp.h>
29 #include <asm/processor.h>
30 #include <asm/mpspec.h>
31 #include <asm/apic.h>
32 #include <asm/desc.h>
33 #include <asm/paging.h>
34 #include <asm/e820.h>
35 #include <acm/acm_hooks.h>
36 #include <xen/kexec.h>
37 #include <asm/edd.h>
39 #if defined(CONFIG_X86_64)
40 #define BOOTSTRAP_DIRECTMAP_END (1UL << 32)
41 #define maddr_to_bootstrap_virt(m) maddr_to_virt(m)
42 #else
43 #define BOOTSTRAP_DIRECTMAP_END HYPERVISOR_VIRT_START
44 #define maddr_to_bootstrap_virt(m) ((void *)(long)(m))
45 #endif
47 extern void dmi_scan_machine(void);
48 extern void generic_apic_probe(void);
49 extern void numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn);
51 extern u16 boot_edid_caps;
52 extern u8 boot_edid_info[128];
53 extern struct boot_video_info boot_vid_info;
55 /*
56 * opt_xenheap_megabytes: Size of Xen heap in megabytes, excluding the
57 * page_info table and allocation bitmap.
58 */
59 static unsigned int opt_xenheap_megabytes = XENHEAP_DEFAULT_MB;
60 #if defined(CONFIG_X86_64)
61 integer_param("xenheap_megabytes", opt_xenheap_megabytes);
62 #endif
64 /* opt_nosmp: If true, secondary processors are ignored. */
65 static int opt_nosmp = 0;
66 boolean_param("nosmp", opt_nosmp);
68 /* maxcpus: maximum number of CPUs to activate. */
69 static unsigned int max_cpus = NR_CPUS;
70 integer_param("maxcpus", max_cpus);
72 /* opt_watchdog: If true, run a watchdog NMI on each processor. */
73 static int opt_watchdog = 0;
74 boolean_param("watchdog", opt_watchdog);
76 /* **** Linux config option: propagated to domain0. */
77 /* "acpi=off": Disables both ACPI table parsing and interpreter. */
78 /* "acpi=force": Override the disable blacklist. */
79 /* "acpi=strict": Disables out-of-spec workarounds. */
80 /* "acpi=ht": Limit ACPI just to boot-time to enable HT. */
81 /* "acpi=noirq": Disables ACPI interrupt routing. */
82 static void parse_acpi_param(char *s);
83 custom_param("acpi", parse_acpi_param);
85 /* **** Linux config option: propagated to domain0. */
86 /* acpi_skip_timer_override: Skip IRQ0 overrides. */
87 extern int acpi_skip_timer_override;
88 boolean_param("acpi_skip_timer_override", acpi_skip_timer_override);
90 /* **** Linux config option: propagated to domain0. */
91 /* noapic: Disable IOAPIC setup. */
92 extern int skip_ioapic_setup;
93 boolean_param("noapic", skip_ioapic_setup);
95 int early_boot = 1;
97 cpumask_t cpu_present_map;
99 unsigned long xen_phys_start;
101 /* Limits of Xen heap, used to initialise the allocator. */
102 unsigned long xenheap_phys_start, xenheap_phys_end;
104 extern void arch_init_memory(void);
105 extern void init_IRQ(void);
106 extern void trap_init(void);
107 extern void early_time_init(void);
108 extern void early_cpu_init(void);
110 struct tss_struct init_tss[NR_CPUS];
112 char __attribute__ ((__section__(".bss.page_aligned"))) cpu0_stack[STACK_SIZE];
114 struct cpuinfo_x86 boot_cpu_data = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
116 #if CONFIG_PAGING_LEVELS > 2
117 unsigned long mmu_cr4_features = X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE;
118 #else
119 unsigned long mmu_cr4_features = X86_CR4_PSE;
120 #endif
121 EXPORT_SYMBOL(mmu_cr4_features);
123 int acpi_disabled;
125 int acpi_force;
126 char acpi_param[10] = "";
127 static void __init parse_acpi_param(char *s)
128 {
129 /* Save the parameter so it can be propagated to domain0. */
130 safe_strcpy(acpi_param, s);
132 /* Interpret the parameter for use within Xen. */
133 if ( !strcmp(s, "off") )
134 {
135 disable_acpi();
136 }
137 else if ( !strcmp(s, "force") )
138 {
139 acpi_force = 1;
140 acpi_ht = 1;
141 acpi_disabled = 0;
142 }
143 else if ( !strcmp(s, "strict") )
144 {
145 acpi_strict = 1;
146 }
147 else if ( !strcmp(s, "ht") )
148 {
149 if ( !acpi_force )
150 disable_acpi();
151 acpi_ht = 1;
152 }
153 else if ( !strcmp(s, "noirq") )
154 {
155 acpi_noirq_set();
156 }
157 }
159 static void __init do_initcalls(void)
160 {
161 initcall_t *call;
162 for ( call = &__initcall_start; call < &__initcall_end; call++ )
163 (*call)();
164 }
/*
 * Print a printk()-style message and then spin forever executing "hlt".
 * Wrapped in do { } while (0) so that it can be used as a single statement.
 */
#define EARLY_FAIL(fmt, args...) do {           \
    printk(fmt, ## args);                       \
    while ( 1 )                                 \
        __asm__ __volatile__ ( "hlt" );         \
} while (0)
171 static unsigned long __initdata initial_images_start, initial_images_end;
173 unsigned long __init initial_images_nrpages(void)
174 {
175 unsigned long s = initial_images_start + PAGE_SIZE - 1;
176 unsigned long e = initial_images_end;
177 return ((e >> PAGE_SHIFT) - (s >> PAGE_SHIFT));
178 }
180 void __init discard_initial_images(void)
181 {
182 init_domheap_pages(initial_images_start, initial_images_end);
183 }
185 extern char __per_cpu_start[], __per_cpu_data_end[], __per_cpu_end[];
187 static void __init percpu_init_areas(void)
188 {
189 unsigned int i, data_size = __per_cpu_data_end - __per_cpu_start;
190 unsigned int first_unused;
192 BUG_ON(data_size > PERCPU_SIZE);
194 /* Initialise per-cpu data area for all possible secondary CPUs. */
195 for ( i = 1; (i < NR_CPUS) && cpu_possible(i); i++ )
196 memcpy(__per_cpu_start + (i << PERCPU_SHIFT),
197 __per_cpu_start,
198 data_size);
199 first_unused = i;
201 /* Check that there are no holes in cpu_possible_map. */
202 for ( ; i < NR_CPUS; i++ )
203 BUG_ON(cpu_possible(i));
205 #ifndef MEMORY_GUARD
206 init_xenheap_pages(__pa(__per_cpu_start) + (first_unused << PERCPU_SHIFT),
207 __pa(__per_cpu_end));
208 #endif
209 memguard_guard_range(&__per_cpu_start[first_unused << PERCPU_SHIFT],
210 (NR_CPUS - first_unused) << PERCPU_SHIFT);
211 #if defined(CONFIG_X86_64)
212 /* Also zap the mapping in the 1:1 area. */
213 memguard_guard_range(__va(__pa(__per_cpu_start)) +
214 (first_unused << PERCPU_SHIFT),
215 (NR_CPUS - first_unused) << PERCPU_SHIFT);
216 #endif
217 }
219 /* Fetch acm policy module from multiboot modules. */
220 static void __init extract_acm_policy(
221 multiboot_info_t *mbi,
222 unsigned int *initrdidx,
223 char **_policy_start,
224 unsigned long *_policy_len)
225 {
226 int i;
227 module_t *mod = (module_t *)__va(mbi->mods_addr);
228 unsigned long start, policy_len;
229 char *policy_start;
231 /*
232 * Try all modules and see whichever could be the binary policy.
233 * Adjust the initrdidx if module[1] is the binary policy.
234 */
235 for ( i = mbi->mods_count-1; i >= 1; i-- )
236 {
237 start = initial_images_start + (mod[i].mod_start-mod[0].mod_start);
238 policy_start = maddr_to_bootstrap_virt(start);
239 policy_len = mod[i].mod_end - mod[i].mod_start;
240 if ( acm_is_policy(policy_start, policy_len) )
241 {
242 printk("Policy len 0x%lx, start at %p - module %d.\n",
243 policy_len, policy_start, i);
244 *_policy_start = policy_start;
245 *_policy_len = policy_len;
246 if ( i == 1 )
247 *initrdidx = (mbi->mods_count > 2) ? 2 : 0;
248 break;
249 }
250 }
251 }
253 static void __init init_idle_domain(void)
254 {
255 struct domain *idle_domain;
257 /* Domain creation requires that scheduler structures are initialised. */
258 scheduler_init();
260 idle_domain = domain_create(IDLE_DOMAIN_ID, 0, 0);
261 if ( (idle_domain == NULL) || (alloc_vcpu(idle_domain, 0, 0) == NULL) )
262 BUG();
264 set_current(idle_domain->vcpu[0]);
265 idle_vcpu[0] = this_cpu(curr_vcpu) = current;
267 setup_idle_pagetable();
268 }
270 static void __init srat_detect_node(int cpu)
271 {
272 unsigned node;
273 u8 apicid = x86_cpu_to_apicid[cpu];
275 node = apicid_to_node[apicid];
276 if ( node == NUMA_NO_NODE )
277 node = 0;
278 numa_set_node(cpu, node);
280 if ( acpi_numa > 0 )
281 printk(KERN_INFO "CPU %d APIC %d -> Node %d\n", cpu, apicid, node);
282 }
284 static void __init move_memory(
285 unsigned long dst, unsigned long src_start, unsigned long src_end)
286 {
287 memmove(maddr_to_bootstrap_virt(dst),
288 maddr_to_bootstrap_virt(src_start),
289 src_end - src_start);
290 }
292 /* A temporary copy of the e820 map that we can mess with during bootstrap. */
293 static struct e820map __initdata boot_e820;
295 /* Reserve area (@s,@e) in the temporary bootstrap e820 map. */
296 static void __init reserve_in_boot_e820(unsigned long s, unsigned long e)
297 {
298 uint64_t rs, re;
299 int i;
301 for ( i = 0; i < boot_e820.nr_map; i++ )
302 {
303 /* Have we found the e820 region that includes the specified range? */
304 rs = boot_e820.map[i].addr;
305 re = rs + boot_e820.map[i].size;
306 if ( (s < rs) || (e > re) )
307 continue;
309 /* Start fragment. */
310 boot_e820.map[i].size = s - rs;
312 /* End fragment. */
313 if ( e < re )
314 {
315 memmove(&boot_e820.map[i+1], &boot_e820.map[i],
316 (boot_e820.nr_map-i) * sizeof(boot_e820.map[0]));
317 boot_e820.nr_map++;
318 i++;
319 boot_e820.map[i].addr = e;
320 boot_e820.map[i].size = re - e;
321 }
322 }
323 }
325 struct boot_video_info {
326 u8 orig_x; /* 0x00 */
327 u8 orig_y; /* 0x01 */
328 u8 orig_video_mode; /* 0x02 */
329 u8 orig_video_cols; /* 0x03 */
330 u8 orig_video_lines; /* 0x04 */
331 u8 orig_video_isVGA; /* 0x05 */
332 u16 orig_video_points; /* 0x06 */
334 /* VESA graphic mode -- linear frame buffer */
335 u32 capabilities; /* 0x08 */
336 u16 lfb_linelength; /* 0x0c */
337 u16 lfb_width; /* 0x0e */
338 u16 lfb_height; /* 0x10 */
339 u16 lfb_depth; /* 0x12 */
340 u32 lfb_base; /* 0x14 */
341 u32 lfb_size; /* 0x18 */
342 u8 red_size; /* 0x1c */
343 u8 red_pos; /* 0x1d */
344 u8 green_size; /* 0x1e */
345 u8 green_pos; /* 0x1f */
346 u8 blue_size; /* 0x20 */
347 u8 blue_pos; /* 0x21 */
348 u8 rsvd_size; /* 0x22 */
349 u8 rsvd_pos; /* 0x23 */
350 u16 vesapm_seg; /* 0x24 */
351 u16 vesapm_off; /* 0x26 */
352 };
354 static void __init parse_video_info(void)
355 {
356 struct boot_video_info *bvi = &bootsym(boot_vid_info);
358 if ( (bvi->orig_video_isVGA == 1) && (bvi->orig_video_mode == 3) )
359 {
360 vga_console_info.video_type = XEN_VGATYPE_TEXT_MODE_3;
361 vga_console_info.u.text_mode_3.font_height = bvi->orig_video_points;
362 vga_console_info.u.text_mode_3.cursor_x = bvi->orig_x;
363 vga_console_info.u.text_mode_3.cursor_y = bvi->orig_y;
364 vga_console_info.u.text_mode_3.rows = bvi->orig_video_lines;
365 vga_console_info.u.text_mode_3.columns = bvi->orig_video_cols;
366 }
367 else if ( bvi->orig_video_isVGA == 0x23 )
368 {
369 vga_console_info.video_type = XEN_VGATYPE_VESA_LFB;
370 vga_console_info.u.vesa_lfb.width = bvi->lfb_width;
371 vga_console_info.u.vesa_lfb.height = bvi->lfb_height;
372 vga_console_info.u.vesa_lfb.bytes_per_line = bvi->lfb_linelength;
373 vga_console_info.u.vesa_lfb.bits_per_pixel = bvi->lfb_depth;
374 vga_console_info.u.vesa_lfb.lfb_base = bvi->lfb_base;
375 vga_console_info.u.vesa_lfb.lfb_size = bvi->lfb_size;
376 vga_console_info.u.vesa_lfb.red_pos = bvi->red_pos;
377 vga_console_info.u.vesa_lfb.red_size = bvi->red_size;
378 vga_console_info.u.vesa_lfb.green_pos = bvi->green_pos;
379 vga_console_info.u.vesa_lfb.green_size = bvi->green_size;
380 vga_console_info.u.vesa_lfb.blue_pos = bvi->blue_pos;
381 vga_console_info.u.vesa_lfb.blue_size = bvi->blue_size;
382 vga_console_info.u.vesa_lfb.rsvd_pos = bvi->rsvd_pos;
383 vga_console_info.u.vesa_lfb.rsvd_size = bvi->rsvd_size;
384 }
385 }
/*
 * Final step of boot: free (or page-protect) the region between
 * __init_begin and __init_end, report its size, and enter the idle loop.
 */
void init_done(void)
{
    extern char __init_begin[], __init_end[];
    long init_bytes = __init_end - __init_begin;

    /* Free (or page-protect) the init areas. */
#ifndef MEMORY_GUARD
    init_xenheap_pages(__pa(__init_begin), __pa(__init_end));
#endif
    memguard_guard_range(__init_begin, init_bytes);
#if defined(CONFIG_X86_64)
    /* Also zap the mapping in the 1:1 area. */
    memguard_guard_range(__va(__pa(__init_begin)), init_bytes);
#endif
    printk("Freed %ldkB init memory.\n", init_bytes >> 10);

    startup_cpu_idle_loop();
}
405 void __init __start_xen(unsigned long mbi_p)
406 {
407 char *memmap_type = NULL;
408 char __cmdline[] = "", *cmdline = __cmdline, *kextra;
409 unsigned long _initrd_start = 0, _initrd_len = 0;
410 unsigned int initrdidx = 1;
411 char *_policy_start = NULL;
412 unsigned long _policy_len = 0;
413 multiboot_info_t *mbi = __va(mbi_p);
414 module_t *mod = (module_t *)__va(mbi->mods_addr);
415 unsigned long nr_pages, modules_length;
416 int i, e820_warn = 0, bytes = 0;
417 struct ns16550_defaults ns16550 = {
418 .data_bits = 8,
419 .parity = 'n',
420 .stop_bits = 1
421 };
423 extern void early_page_fault(void);
424 set_intr_gate(TRAP_page_fault, &early_page_fault);
426 /* Parse the command-line options. */
427 if ( (mbi->flags & MBI_CMDLINE) && (mbi->cmdline != 0) )
428 cmdline = __va(mbi->cmdline);
429 if ( (kextra = strstr(cmdline, " -- ")) != NULL )
430 {
431 /*
432 * Options after ' -- ' separator belong to dom0.
433 * 1. Orphan dom0's options from Xen's command line.
434 * 2. Skip all but final leading space from dom0's options.
435 */
436 *kextra = '\0';
437 kextra += 3;
438 while ( kextra[1] == ' ' ) kextra++;
439 }
440 cmdline_parse(cmdline);
442 parse_video_info();
444 set_current((struct vcpu *)0xfffff000); /* debug sanity */
445 idle_vcpu[0] = current;
446 set_processor_id(0); /* needed early, for smp_processor_id() */
448 smp_prepare_boot_cpu();
450 /* We initialise the serial devices very early so we can get debugging. */
451 ns16550.io_base = 0x3f8;
452 ns16550.irq = 4;
453 ns16550_init(0, &ns16550);
454 ns16550.io_base = 0x2f8;
455 ns16550.irq = 3;
456 ns16550_init(1, &ns16550);
457 serial_init_preirq();
459 init_console();
461 printk("Command line: %s\n", cmdline);
463 printk("Video information:\n");
465 /* Print VGA display mode information. */
466 switch ( vga_console_info.video_type )
467 {
468 case XEN_VGATYPE_TEXT_MODE_3:
469 printk(" VGA is text mode %dx%d, font 8x%d\n",
470 vga_console_info.u.text_mode_3.columns,
471 vga_console_info.u.text_mode_3.rows,
472 vga_console_info.u.text_mode_3.font_height);
473 break;
474 case XEN_VGATYPE_VESA_LFB:
475 printk(" VGA is graphics mode %dx%d, %d bpp\n",
476 vga_console_info.u.vesa_lfb.width,
477 vga_console_info.u.vesa_lfb.height,
478 vga_console_info.u.vesa_lfb.bits_per_pixel);
479 break;
480 default:
481 printk(" No VGA detected\n");
482 break;
483 }
485 /* Print VBE/DDC EDID information. */
486 if ( bootsym(boot_edid_caps) != 0x1313 )
487 {
488 u16 caps = bootsym(boot_edid_caps);
489 printk(" VBE/DDC methods:%s%s%s; ",
490 (caps & 1) ? " V1" : "",
491 (caps & 2) ? " V2" : "",
492 !(caps & 3) ? " none" : "");
493 printk("EDID transfer time: %d seconds\n", caps >> 8);
494 if ( *(u32 *)bootsym(boot_edid_info) == 0x13131313 )
495 {
496 printk(" EDID info not retrieved because ");
497 if ( !(caps & 3) )
498 printk("no DDC retrieval method detected\n");
499 else if ( (caps >> 8) > 5 )
500 printk("takes longer than 5 seconds\n");
501 else
502 printk("of reasons unknown\n");
503 }
504 }
506 printk("Disc information:\n");
507 printk(" Found %d MBR signatures\n",
508 bootsym(boot_edd_signature_nr));
509 printk(" Found %d EDD information structures\n",
510 bootsym(boot_edd_info_nr));
512 /* Check that we have at least one Multiboot module. */
513 if ( !(mbi->flags & MBI_MODULES) || (mbi->mods_count == 0) )
514 EARLY_FAIL("dom0 kernel not specified. "
515 "Check bootloader configuration.\n");
517 if ( ((unsigned long)cpu0_stack & (STACK_SIZE-1)) != 0 )
518 EARLY_FAIL("Misaligned CPU0 stack.\n");
520 /*
521 * Since there are some stubs getting built on the stacks which use
522 * direct calls/jumps, the heap must be confined to the lower 2G so
523 * that those branches can reach their targets.
524 */
525 if ( opt_xenheap_megabytes > 2048 )
526 opt_xenheap_megabytes = 2048;
528 if ( e820_raw_nr != 0 )
529 {
530 memmap_type = "Xen-e820";
531 }
532 else if ( bootsym(lowmem_kb) )
533 {
534 memmap_type = "Xen-e801";
535 e820_raw[0].addr = 0;
536 e820_raw[0].size = bootsym(lowmem_kb) << 10;
537 e820_raw[0].type = E820_RAM;
538 e820_raw[1].addr = 0x100000;
539 e820_raw[1].size = bootsym(highmem_kb) << 10;
540 e820_raw[1].type = E820_RAM;
541 e820_raw_nr = 2;
542 }
543 else if ( mbi->flags & MBI_MEMMAP )
544 {
545 memmap_type = "Multiboot-e820";
546 while ( bytes < mbi->mmap_length )
547 {
548 memory_map_t *map = __va(mbi->mmap_addr + bytes);
550 /*
551 * This is a gross workaround for a BIOS bug. Some bootloaders do
552 * not write e820 map entries into pre-zeroed memory. This is
553 * okay if the BIOS fills in all fields of the map entry, but
554 * some broken BIOSes do not bother to write the high word of
555 * the length field if the length is smaller than 4GB. We
556 * detect and fix this by flagging sections below 4GB that
557 * appear to be larger than 4GB in size.
558 */
559 if ( (map->base_addr_high == 0) && (map->length_high != 0) )
560 {
561 if ( !e820_warn )
562 {
563 printk("WARNING: Buggy e820 map detected and fixed "
564 "(truncated length fields).\n");
565 e820_warn = 1;
566 }
567 map->length_high = 0;
568 }
570 e820_raw[e820_raw_nr].addr =
571 ((u64)map->base_addr_high << 32) | (u64)map->base_addr_low;
572 e820_raw[e820_raw_nr].size =
573 ((u64)map->length_high << 32) | (u64)map->length_low;
574 e820_raw[e820_raw_nr].type =
575 (map->type > E820_NVS) ? E820_RESERVED : map->type;
576 e820_raw_nr++;
578 bytes += map->size + 4;
579 }
580 }
581 else if ( mbi->flags & MBI_MEMLIMITS )
582 {
583 memmap_type = "Multiboot-e801";
584 e820_raw[0].addr = 0;
585 e820_raw[0].size = mbi->mem_lower << 10;
586 e820_raw[0].type = E820_RAM;
587 e820_raw[1].addr = 0x100000;
588 e820_raw[1].size = mbi->mem_upper << 10;
589 e820_raw[1].type = E820_RAM;
590 e820_raw_nr = 2;
591 }
592 else
593 {
594 EARLY_FAIL("Bootloader provided no memory information.\n");
595 }
597 /* Ensure that all E820 RAM regions are page-aligned and -sized. */
598 for ( i = 0; i < e820_raw_nr; i++ )
599 {
600 uint64_t s, e;
602 if ( e820_raw[i].type != E820_RAM )
603 continue;
604 s = PFN_UP(e820_raw[i].addr);
605 e = PFN_DOWN(e820_raw[i].addr + e820_raw[i].size);
606 e820_raw[i].size = 0; /* discarded later */
607 if ( s < e )
608 {
609 e820_raw[i].addr = s << PAGE_SHIFT;
610 e820_raw[i].size = (e - s) << PAGE_SHIFT;
611 }
612 }
614 /* Sanitise the raw E820 map to produce a final clean version. */
615 max_page = init_e820(memmap_type, e820_raw, &e820_raw_nr);
617 /*
618 * Create a temporary copy of the E820 map. Truncate it to above 16MB
619 * as anything below that is already mapped and has a statically-allocated
620 * purpose.
621 */
622 memcpy(&boot_e820, &e820, sizeof(e820));
623 for ( i = 0; i < boot_e820.nr_map; i++ )
624 {
625 uint64_t s, e, min = 16 << 20; /* 16MB */
626 s = boot_e820.map[i].addr;
627 e = boot_e820.map[i].addr + boot_e820.map[i].size;
628 if ( s >= min )
629 continue;
630 if ( e > min )
631 {
632 boot_e820.map[i].addr = min;
633 boot_e820.map[i].size = e - min;
634 }
635 else
636 boot_e820.map[i].type = E820_RESERVED;
637 }
639 /*
640 * Iterate backwards over all superpage-aligned RAM regions.
641 *
642 * We require superpage alignment because the boot allocator is not yet
643 * initialised. Hence we can only map superpages in the address range
644 * 0 to BOOTSTRAP_DIRECTMAP_END, as this is guaranteed not to require
645 * dynamic allocation of pagetables.
646 *
647 * As well as mapping superpages in that range, in preparation for
648 * initialising the boot allocator, we also look for a region to which
649 * we can relocate the dom0 kernel and other multiboot modules. Also, on
650 * x86/64, we relocate Xen to higher memory.
651 */
652 modules_length = mod[mbi->mods_count-1].mod_end - mod[0].mod_start;
653 for ( i = boot_e820.nr_map-1; i >= 0; i-- )
654 {
655 uint64_t s, e, mask = (1UL << L2_PAGETABLE_SHIFT) - 1;
657 /* Superpage-aligned chunks up to BOOTSTRAP_DIRECTMAP_END, please. */
658 s = (boot_e820.map[i].addr + mask) & ~mask;
659 e = (boot_e820.map[i].addr + boot_e820.map[i].size) & ~mask;
660 e = min_t(uint64_t, e, BOOTSTRAP_DIRECTMAP_END);
661 if ( (boot_e820.map[i].type != E820_RAM) || (s >= e) )
662 continue;
664 /* Map the chunk. No memory will need to be allocated to do this. */
665 map_pages_to_xen(
666 (unsigned long)maddr_to_bootstrap_virt(s),
667 s >> PAGE_SHIFT, (e-s) >> PAGE_SHIFT, PAGE_HYPERVISOR);
669 #if defined(CONFIG_X86_64)
670 /* Is the region suitable for relocating Xen? */
671 if ( !xen_phys_start && (((e-s) >> 20) >= opt_xenheap_megabytes) )
672 {
673 extern l2_pgentry_t l2_xenmap[];
674 l4_pgentry_t *pl4e;
675 l3_pgentry_t *pl3e;
676 l2_pgentry_t *pl2e;
677 int i, j;
679 /* Select relocation address. */
680 e = (e - (opt_xenheap_megabytes << 20)) & ~mask;
681 xen_phys_start = e;
682 bootsym(trampoline_xen_phys_start) = e;
684 /*
685 * Perform relocation to new physical address.
686 * Before doing so we must sync static/global data with main memory
687 * with a barrier(). After this we must *not* modify static/global
688 * data until after we have switched to the relocated pagetables!
689 */
690 barrier();
691 move_memory(e, 0, __pa(&_end) - xen_phys_start);
693 /* Poison low 1MB to detect stray pointers to physical 0-1MB. */
694 memset(maddr_to_bootstrap_virt(e), 0x55, 1U<<20);
696 /* Walk initial pagetables, relocating page directory entries. */
697 pl4e = __va(__pa(idle_pg_table));
698 for ( i = 0 ; i < L4_PAGETABLE_ENTRIES; i++, pl4e++ )
699 {
700 if ( !(l4e_get_flags(*pl4e) & _PAGE_PRESENT) )
701 continue;
702 *pl4e = l4e_from_intpte(l4e_get_intpte(*pl4e) +
703 xen_phys_start);
704 pl3e = l4e_to_l3e(*pl4e);
705 for ( j = 0; j < L3_PAGETABLE_ENTRIES; j++, pl3e++ )
706 {
707 /* Not present or already relocated? */
708 if ( !(l3e_get_flags(*pl3e) & _PAGE_PRESENT) ||
709 (l3e_get_pfn(*pl3e) > 0x1000) )
710 continue;
711 *pl3e = l3e_from_intpte(l3e_get_intpte(*pl3e) +
712 xen_phys_start);
713 }
714 }
716 /* The only data mappings to be relocated are in the Xen area. */
717 pl2e = __va(__pa(l2_xenmap));
718 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++, pl2e++ )
719 {
720 if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) )
721 continue;
722 *pl2e = l2e_from_intpte(l2e_get_intpte(*pl2e) +
723 xen_phys_start);
724 }
726 /* Re-sync the stack and then switch to relocated pagetables. */
727 asm volatile (
728 "rep movsb ; " /* re-sync the stack */
729 "movq %%cr4,%%rsi ; "
730 "andb $0x7f,%%sil ; "
731 "movq %%rsi,%%cr4 ; " /* CR4.PGE == 0 */
732 "movq %0,%%cr3 ; " /* CR3 == new pagetables */
733 "orb $0x80,%%sil ; "
734 "movq %%rsi,%%cr4 " /* CR4.PGE == 1 */
735 : : "r" (__pa(idle_pg_table)), "S" (cpu0_stack),
736 "D" (__va(__pa(cpu0_stack))), "c" (STACK_SIZE) : "memory" );
737 }
738 #endif
740 /* Is the region suitable for relocating the multiboot modules? */
741 if ( !initial_images_start && (s < e) && ((e-s) >= modules_length) )
742 {
743 e -= modules_length;
744 initial_images_start = e;
745 initial_images_end = initial_images_start + modules_length;
746 move_memory(initial_images_start,
747 mod[0].mod_start, mod[mbi->mods_count-1].mod_end);
748 }
749 }
751 if ( !initial_images_start )
752 EARLY_FAIL("Not enough memory to relocate the dom0 kernel image.\n");
753 reserve_in_boot_e820(initial_images_start, initial_images_end);
755 /*
756 * With modules (and Xen itself, on x86/64) relocated out of the way, we
757 * can now initialise the boot allocator with some memory.
758 */
759 xenheap_phys_start = init_boot_allocator(__pa(&_end));
760 xenheap_phys_end = opt_xenheap_megabytes << 20;
761 #if defined(CONFIG_X86_64)
762 if ( !xen_phys_start )
763 EARLY_FAIL("Not enough memory to relocate Xen.\n");
764 xenheap_phys_end += xen_phys_start;
765 reserve_in_boot_e820(xen_phys_start,
766 xen_phys_start + (opt_xenheap_megabytes<<20));
767 init_boot_pages(1<<20, 16<<20); /* Initial seed: 15MB */
768 #else
769 init_boot_pages(xenheap_phys_end, 16<<20); /* Initial seed: 4MB */
770 #endif
772 /*
773 * With the boot allocator now seeded, we can walk every RAM region and
774 * map it in its entirety (on x86/64, at least) and notify it to the
775 * boot allocator.
776 */
777 for ( i = 0; i < boot_e820.nr_map; i++ )
778 {
779 uint64_t s, e, map_e, mask = PAGE_SIZE - 1;
781 /* Only page alignment required now. */
782 s = (boot_e820.map[i].addr + mask) & ~mask;
783 e = (boot_e820.map[i].addr + boot_e820.map[i].size) & ~mask;
784 if ( (boot_e820.map[i].type != E820_RAM) || (s >= e) )
785 continue;
787 /* Perform the mapping (truncated in 32-bit mode). */
788 map_e = e;
789 #if defined(CONFIG_X86_32)
790 map_e = min_t(uint64_t, map_e, BOOTSTRAP_DIRECTMAP_END);
791 #endif
792 if ( s < map_e )
793 map_pages_to_xen(
794 (unsigned long)maddr_to_bootstrap_virt(s),
795 s >> PAGE_SHIFT, (map_e-s) >> PAGE_SHIFT, PAGE_HYPERVISOR);
797 init_boot_pages(s, e);
798 }
800 if ( (kexec_crash_area.size > 0) && (kexec_crash_area.start > 0) )
801 {
802 unsigned long kdump_start, kdump_size, k;
804 /* Mark images pages as free for now. */
805 init_boot_pages(initial_images_start, initial_images_end);
807 kdump_start = kexec_crash_area.start;
808 kdump_size = kexec_crash_area.size;
810 printk("Kdump: %luMB (%lukB) at 0x%lx\n",
811 kdump_size >> 20,
812 kdump_size >> 10,
813 kdump_start);
815 if ( (kdump_start & ~PAGE_MASK) || (kdump_size & ~PAGE_MASK) )
816 panic("Kdump parameters not page aligned\n");
818 kdump_start >>= PAGE_SHIFT;
819 kdump_size >>= PAGE_SHIFT;
821 /* Allocate pages for Kdump memory area. */
822 if ( !reserve_boot_pages(kdump_start, kdump_size) )
823 panic("Unable to reserve Kdump memory\n");
825 /* Allocate pages for relocated initial images. */
826 k = ((initial_images_end - initial_images_start) & ~PAGE_MASK) ? 1 : 0;
827 k += (initial_images_end - initial_images_start) >> PAGE_SHIFT;
829 #if defined(CONFIG_X86_32)
830 /* Must allocate within bootstrap 1:1 limits. */
831 k = alloc_boot_low_pages(k, 1); /* 0x0 - BOOTSTRAP_DIRECTMAP_END */
832 #else
833 k = alloc_boot_pages(k, 1);
834 #endif
835 if ( k == 0 )
836 panic("Unable to allocate initial images memory\n");
838 move_memory(k << PAGE_SHIFT, initial_images_start, initial_images_end);
840 initial_images_end -= initial_images_start;
841 initial_images_start = k << PAGE_SHIFT;
842 initial_images_end += initial_images_start;
843 }
845 memguard_init();
847 nr_pages = 0;
848 for ( i = 0; i < e820.nr_map; i++ )
849 if ( e820.map[i].type == E820_RAM )
850 nr_pages += e820.map[i].size >> PAGE_SHIFT;
851 printk("System RAM: %luMB (%lukB)\n",
852 nr_pages >> (20 - PAGE_SHIFT),
853 nr_pages << (PAGE_SHIFT - 10));
854 total_pages = nr_pages;
856 /* Sanity check for unwanted bloat of certain hypercall structures. */
857 BUILD_BUG_ON(sizeof(((struct xen_platform_op *)0)->u) !=
858 sizeof(((struct xen_platform_op *)0)->u.pad));
859 BUILD_BUG_ON(sizeof(((struct xen_domctl *)0)->u) !=
860 sizeof(((struct xen_domctl *)0)->u.pad));
861 BUILD_BUG_ON(sizeof(((struct xen_sysctl *)0)->u) !=
862 sizeof(((struct xen_sysctl *)0)->u.pad));
864 BUILD_BUG_ON(sizeof(start_info_t) > PAGE_SIZE);
865 BUILD_BUG_ON(sizeof(shared_info_t) > PAGE_SIZE);
866 BUILD_BUG_ON(sizeof(struct vcpu_info) != 64);
868 #ifdef CONFIG_COMPAT
869 BUILD_BUG_ON(sizeof(((struct compat_platform_op *)0)->u) !=
870 sizeof(((struct compat_platform_op *)0)->u.pad));
871 BUILD_BUG_ON(sizeof(start_info_compat_t) > PAGE_SIZE);
872 BUILD_BUG_ON(sizeof(struct compat_vcpu_info) != 64);
873 #endif
875 /* Check definitions in public headers match internal defs. */
876 BUILD_BUG_ON(__HYPERVISOR_VIRT_START != HYPERVISOR_VIRT_START);
877 #ifdef HYPERVISOR_VIRT_END
878 BUILD_BUG_ON(__HYPERVISOR_VIRT_END != HYPERVISOR_VIRT_END);
879 #endif
880 BUILD_BUG_ON(MACH2PHYS_VIRT_START != RO_MPT_VIRT_START);
881 BUILD_BUG_ON(MACH2PHYS_VIRT_END != RO_MPT_VIRT_END);
883 init_frametable();
885 acpi_boot_table_init();
887 acpi_numa_init();
889 numa_initmem_init(0, max_page);
891 /* Initialise the Xen heap, skipping RAM holes. */
892 init_xenheap_pages(xenheap_phys_start, xenheap_phys_end);
893 nr_pages = (xenheap_phys_end - xenheap_phys_start) >> PAGE_SHIFT;
894 #ifdef __x86_64__
895 init_xenheap_pages(xen_phys_start, __pa(&_start));
896 nr_pages += (__pa(&_start) - xen_phys_start) >> PAGE_SHIFT;
897 #endif
898 xenheap_phys_start = xen_phys_start;
899 printk("Xen heap: %luMB (%lukB)\n",
900 nr_pages >> (20 - PAGE_SHIFT),
901 nr_pages << (PAGE_SHIFT - 10));
903 end_boot_allocator();
905 early_boot = 0;
907 early_cpu_init();
909 paging_init();
911 /* Unmap the first page of CPU0's stack. */
912 memguard_guard_stack(cpu0_stack);
914 open_softirq(NEW_TLBFLUSH_CLOCK_PERIOD_SOFTIRQ, new_tlbflush_clock_period);
916 if ( opt_watchdog )
917 nmi_watchdog = NMI_LOCAL_APIC;
919 sort_exception_tables();
921 find_smp_config();
923 dmi_scan_machine();
925 generic_apic_probe();
927 acpi_boot_init();
929 init_cpu_to_node();
931 if ( smp_found_config )
932 get_smp_config();
934 #ifdef CONFIG_X86_64
935 /* Low mappings were only needed for some BIOS table parsing. */
936 zap_low_mappings();
937 #endif
939 init_apic_mappings();
941 init_IRQ();
943 percpu_init_areas();
945 init_idle_domain();
947 trap_init();
949 rcu_init();
951 timer_init();
953 early_time_init();
955 arch_init_memory();
957 identify_cpu(&boot_cpu_data);
958 if ( cpu_has_fxsr )
959 set_in_cr4(X86_CR4_OSFXSR);
960 if ( cpu_has_xmm )
961 set_in_cr4(X86_CR4_OSXMMEXCPT);
963 if ( opt_nosmp )
964 max_cpus = 0;
966 smp_prepare_cpus(max_cpus);
968 /*
969 * Initialise higher-level timer functions. We do this fairly late
970 * (post-SMP) because the time bases and scale factors need to be updated
971 * regularly, and SMP initialisation can cause a long delay with
972 * interrupts not yet enabled.
973 */
974 init_xen_time();
976 initialize_keytable();
978 serial_init_postirq();
980 BUG_ON(!local_irq_is_enabled());
982 for_each_present_cpu ( i )
983 {
984 if ( num_online_cpus() >= max_cpus )
985 break;
986 if ( !cpu_online(i) )
987 {
988 rcu_online_cpu(i);
989 __cpu_up(i);
990 }
992 /* Set up cpu_to_node[]. */
993 srat_detect_node(i);
994 /* Set up node_to_cpumask based on cpu_to_node[]. */
995 numa_add_cpu(i);
996 }
998 printk("Brought up %ld CPUs\n", (long)num_online_cpus());
999 smp_cpus_done(max_cpus);
1001 initialise_gdb(); /* could be moved earlier */
1003 do_initcalls();
1005 if ( opt_watchdog )
1006 watchdog_enable();
1008 /* Extract policy from multiboot. */
1009 extract_acm_policy(mbi, &initrdidx, &_policy_start, &_policy_len);
1011 /* initialize access control security module */
1012 acm_init(_policy_start, _policy_len);
1014 /* Create initial domain 0. */
1015 dom0 = domain_create(0, 0, DOM0_SSIDREF);
1016 if ( (dom0 == NULL) || (alloc_vcpu(dom0, 0, 0) == NULL) )
1017 panic("Error creating domain 0\n");
1019 dom0->is_privileged = 1;
1021 /* Grab the DOM0 command line. */
1022 cmdline = (char *)(mod[0].string ? __va(mod[0].string) : NULL);
1023 if ( (cmdline != NULL) || (kextra != NULL) )
1025 static char dom0_cmdline[MAX_GUEST_CMDLINE];
1027 dom0_cmdline[0] = '\0';
1029 if ( cmdline != NULL )
1031 /* Skip past the image name and copy to a local buffer. */
1032 while ( *cmdline == ' ' ) cmdline++;
1033 if ( (cmdline = strchr(cmdline, ' ')) != NULL )
1035 while ( *cmdline == ' ' ) cmdline++;
1036 safe_strcpy(dom0_cmdline, cmdline);
1040 if ( kextra != NULL )
1041 /* kextra always includes exactly one leading space. */
1042 safe_strcat(dom0_cmdline, kextra);
1044 /* Append any extra parameters. */
1045 if ( skip_ioapic_setup && !strstr(dom0_cmdline, "noapic") )
1046 safe_strcat(dom0_cmdline, " noapic");
1047 if ( acpi_skip_timer_override &&
1048 !strstr(dom0_cmdline, "acpi_skip_timer_override") )
1049 safe_strcat(dom0_cmdline, " acpi_skip_timer_override");
1050 if ( (strlen(acpi_param) != 0) && !strstr(dom0_cmdline, "acpi=") )
1052 safe_strcat(dom0_cmdline, " acpi=");
1053 safe_strcat(dom0_cmdline, acpi_param);
1056 cmdline = dom0_cmdline;
1059 if ( (initrdidx > 0) && (initrdidx < mbi->mods_count) )
1061 _initrd_start = initial_images_start +
1062 (mod[initrdidx].mod_start - mod[0].mod_start);
1063 _initrd_len = mod[initrdidx].mod_end - mod[initrdidx].mod_start;
1066 /*
1067 * We're going to setup domain0 using the module(s) that we stashed safely
1068 * above our heap. The second module, if present, is an initrd ramdisk.
1069 */
1070 if ( construct_dom0(dom0,
1071 initial_images_start,
1072 mod[0].mod_end-mod[0].mod_start,
1073 _initrd_start,
1074 _initrd_len,
1075 cmdline) != 0)
1076 panic("Could not set up DOM0 guest OS\n");
1078 /* Scrub RAM that is still free and so may go to an unprivileged domain. */
1079 scrub_heap_pages();
1081 init_trace_bufs();
1083 console_endboot();
1085 /* Hide UART from DOM0 if we're using it */
1086 serial_endboot();
1088 domain_unpause_by_systemcontroller(dom0);
1090 reset_stack_and_jump(init_done);
1093 void arch_get_xen_caps(xen_capabilities_info_t *info)
1095 /* Interface name is always xen-3.0-* for Xen-3.x. */
1096 int major = 3, minor = 0;
1097 char s[32];
1099 (*info)[0] = '\0';
1101 #if defined(CONFIG_X86_32) && !defined(CONFIG_X86_PAE)
1103 snprintf(s, sizeof(s), "xen-%d.%d-x86_32 ", major, minor);
1104 safe_strcat(*info, s);
1105 if ( hvm_enabled )
1107 snprintf(s, sizeof(s), "hvm-%d.%d-x86_32 ", major, minor);
1108 safe_strcat(*info, s);
1111 #elif defined(CONFIG_X86_32) && defined(CONFIG_X86_PAE)
1113 snprintf(s, sizeof(s), "xen-%d.%d-x86_32p ", major, minor);
1114 safe_strcat(*info, s);
1115 if ( hvm_enabled )
1117 snprintf(s, sizeof(s), "hvm-%d.%d-x86_32 ", major, minor);
1118 safe_strcat(*info, s);
1119 snprintf(s, sizeof(s), "hvm-%d.%d-x86_32p ", major, minor);
1120 safe_strcat(*info, s);
1123 #elif defined(CONFIG_X86_64)
1125 snprintf(s, sizeof(s), "xen-%d.%d-x86_64 ", major, minor);
1126 safe_strcat(*info, s);
1127 #ifdef CONFIG_COMPAT
1128 snprintf(s, sizeof(s), "xen-%d.%d-x86_32p ", major, minor);
1129 safe_strcat(*info, s);
1130 #endif
1131 if ( hvm_enabled )
1133 snprintf(s, sizeof(s), "hvm-%d.%d-x86_32 ", major, minor);
1134 safe_strcat(*info, s);
1135 snprintf(s, sizeof(s), "hvm-%d.%d-x86_32p ", major, minor);
1136 safe_strcat(*info, s);
1137 snprintf(s, sizeof(s), "hvm-%d.%d-x86_64 ", major, minor);
1138 safe_strcat(*info, s);
1141 #endif
1144 /*
1145 * Local variables:
1146 * mode: C
1147 * c-set-style: "BSD"
1148 * c-basic-offset: 4
1149 * tab-width: 4
1150 * indent-tabs-mode: nil
1151 * End:
1152 */