ia64/xen-unstable

annotate xen/arch/x86/setup.c @ 19646:f210a633571c

Transcendent memory ("tmem") for Xen.

Tmem, when called from a tmem-capable (paravirtualized) guest, makes
use of otherwise unutilized ("fallow") memory to create and manage
pools of pages that can be accessed from the guest either as
"ephemeral" pages or as "persistent" pages. In either case, the pages
are not directly addressable by the guest; they can only be copied in and
out via the tmem interface. Ephemeral pages are a good place for a guest to
put recently evicted clean pages that it might need again; these pages
can be reclaimed synchronously by Xen for other guests or other uses.
Persistent pages are a nice place for a guest to put "swap" pages to
avoid sending them to disk. These pages retain data as long as the
guest lives, but count against the guest memory allocation.
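
To make the ephemeral contract concrete, here is a minimal, self-contained
C sketch (a toy in-memory model, not the real tmem hypercall interface;
the pool, the keys, and the function names are all illustrative
assumptions): a put may be dropped at any time, so a later get must be
prepared to miss and fall back to disk.

    /*
     * Toy model of an ephemeral tmem pool, simulated with a tiny local
     * array instead of hypervisor "fallow" memory.  Not the real tmem
     * hypercall ABI; all names here are illustrative.
     */
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    #define PAGE_SIZE  4096
    #define POOL_SLOTS 4                 /* stand-in for scarce fallow memory */

    struct slot { int used; uint64_t key; char data[PAGE_SIZE]; };
    static struct slot pool[POOL_SLOTS];

    /* Put a copy of an evicted clean page; may be silently dropped. */
    static void eph_put(uint64_t key, const void *page)
    {
        for (int i = 0; i < POOL_SLOTS; i++) {
            if (!pool[i].used) {
                pool[i].used = 1;
                pool[i].key  = key;
                memcpy(pool[i].data, page, PAGE_SIZE);
                return;
            }
        }
        /* No free slot: drop the put.  The guest must not rely on it. */
    }

    /* Copy the page back if it is still there; 0 on hit, -1 on miss. */
    static int eph_get(uint64_t key, void *page)
    {
        for (int i = 0; i < POOL_SLOTS; i++) {
            if (pool[i].used && pool[i].key == key) {
                memcpy(page, pool[i].data, PAGE_SIZE);
                pool[i].used = 0;        /* this toy model drops pages on get */
                return 0;
            }
        }
        return -1;                       /* reclaimed: re-read from disk */
    }

    int main(void)
    {
        char evicted[PAGE_SIZE] = "clean page-cache data";
        char refill[PAGE_SIZE];

        eph_put(42, evicted);            /* on page-cache eviction */
        if (eph_get(42, refill) == 0)    /* on a later refault     */
            printf("refault served from tmem: %s\n", refill);
        else
            printf("page was reclaimed; fall back to disk\n");
        return 0;
    }

A persistent pool differs only in the guarantee: once a put succeeds, the
data must remain retrievable until the guest flushes it or dies, which is
why persistent pages count against the guest's memory allocation.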

Tmem pages may optionally be compressed and, in certain cases, can be
shared between guests. Tmem also handles concurrency nicely and
provides limited QoS settings to combat malicious DoS attempts.
Save/restore and live migration support is not yet provided.

Tmem is primarily targeted for an x86 64-bit hypervisor. On a 32-bit
x86 hypervisor, it has limited functionality and testing due to
limitations of the xen heap. Nearly all of tmem is
architecture-independent; three routines remain to be ported to ia64,
after which it should work on that architecture as well. It is also structured to
be portable to non-Xen environments.

Tmem is off by default (for now) and must be enabled with the "tmem" xen
boot option (it does nothing unless a tmem-capable guest is running). The
"tmem_compress" boot option enables compression, which costs roughly 10x
more CPU but approximately doubles the number of pages that can be
stored.
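
For example, with a GRUB bootloader the options go on the Xen command
line (a hedged sketch; the kernel/initrd paths below are illustrative and
depend on the installation):

    title Xen (tmem enabled)
        kernel /boot/xen.gz tmem tmem_compress
        module /boot/vmlinuz-xen root=/dev/sda1 ro
        module /boot/initrd-xen.img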

Tmem can be controlled via several "xm" commands, and many interesting
tmem statistics can be obtained. A README and an internal specification
will follow, but lots of useful prose about tmem, as well as Linux
patches, can be found at http://oss.oracle.com/projects/tmem .
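
For example (assuming the tmem control subcommands added to xm alongside
this work; the exact names and options may differ, so treat these as
illustrative):

    xm tmem-list            # dump tmem pool and client statistics
    xm tmem-freeze <domid>  # temporarily stop accepting puts from a domain
    xm tmem-thaw <domid>    # resume accepting puts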

Signed-off-by: Dan Magenheimer <dan.magenheimer@oracle.com>
author Keir Fraser <keir.fraser@citrix.com>
date Tue May 26 11:05:04 2009 +0100 (2009-05-26)
parents 7809e0941b38
children 2f9e1348aa98
rev   line source
kaf24@1452 1 #include <xen/config.h>
kaf24@1452 2 #include <xen/init.h>
kaf24@1452 3 #include <xen/lib.h>
kaf24@1452 4 #include <xen/sched.h>
cl349@5247 5 #include <xen/domain.h>
kaf24@1452 6 #include <xen/serial.h>
kaf24@1506 7 #include <xen/softirq.h>
kaf24@1452 8 #include <xen/acpi.h>
kaf24@3338 9 #include <xen/console.h>
iap10@4287 10 #include <xen/serial.h>
kaf24@3338 11 #include <xen/trace.h>
kaf24@3338 12 #include <xen/multiboot.h>
kaf24@5356 13 #include <xen/domain_page.h>
kfraser@10890 14 #include <xen/version.h>
kaf24@9117 15 #include <xen/gdbstub.h>
kaf24@9818 16 #include <xen/percpu.h>
kfraser@11296 17 #include <xen/hypercall.h>
kfraser@11601 18 #include <xen/keyhandler.h>
kfraser@11971 19 #include <xen/numa.h>
kaf24@13662 20 #include <xen/rcupdate.h>
keir@15298 21 #include <xen/vga.h>
keir@15988 22 #include <xen/dmi.h>
iap10@6721 23 #include <public/version.h>
ack@13291 24 #ifdef CONFIG_COMPAT
ack@13291 25 #include <compat/platform.h>
ack@13291 26 #include <compat/xen.h>
ack@13291 27 #endif
kaf24@1452 28 #include <asm/bitops.h>
kaf24@1452 29 #include <asm/smp.h>
kaf24@1452 30 #include <asm/processor.h>
kaf24@1452 31 #include <asm/mpspec.h>
kaf24@1452 32 #include <asm/apic.h>
kaf24@1452 33 #include <asm/desc.h>
Tim@13909 34 #include <asm/paging.h>
kaf24@3344 35 #include <asm/e820.h>
kfraser@15819 36 #include <xsm/acm/acm_hooks.h>
ian@12677 37 #include <xen/kexec.h>
kfraser@15336 38 #include <asm/edd.h>
kfraser@15815 39 #include <xsm/xsm.h>
keir@16274 40 #include <asm/tboot.h>
kaf24@3338 41
keir@19076 42 int __init bzimage_headroom(char *image_start, unsigned long image_length);
keir@19076 43
kfraser@15074 44 #if defined(CONFIG_X86_64)
kfraser@15597 45 #define BOOTSTRAP_DIRECTMAP_END (1UL << 32) /* 4GB */
kfraser@15074 46 #define maddr_to_bootstrap_virt(m) maddr_to_virt(m)
kfraser@15074 47 #else
kfraser@15597 48 #define BOOTSTRAP_DIRECTMAP_END (1UL << 30) /* 1GB */
kfraser@15074 49 #define maddr_to_bootstrap_virt(m) ((void *)(long)(m))
kfraser@15074 50 #endif
kfraser@15074 51
kaf24@5211 52 extern void generic_apic_probe(void);
kfraser@11971 53 extern void numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn);
kaf24@5157 54
kfraser@15330 55 extern u16 boot_edid_caps;
kfraser@15330 56 extern u8 boot_edid_info[128];
kfraser@15330 57 extern struct boot_video_info boot_vid_info;
kfraser@15330 58
kaf24@5146 59 /* opt_nosmp: If true, secondary processors are ignored. */
kaf24@5900 60 static int opt_nosmp = 0;
kaf24@5146 61 boolean_param("nosmp", opt_nosmp);
kaf24@5146 62
kaf24@5146 63 /* maxcpus: maximum number of CPUs to activate. */
kaf24@5146 64 static unsigned int max_cpus = NR_CPUS;
shand@11156 65 integer_param("maxcpus", max_cpus);
kaf24@5146 66
kaf24@3334 67 /* opt_watchdog: If true, run a watchdog NMI on each processor. */
kaf24@3334 68 static int opt_watchdog = 0;
kaf24@3334 69 boolean_param("watchdog", opt_watchdog);
kaf24@3334 70
kaf24@4850 71 /* **** Linux config option: propagated to domain0. */
kaf24@4850 72 /* "acpi=off": Sisables both ACPI table parsing and interpreter. */
kaf24@4850 73 /* "acpi=force": Override the disable blacklist. */
kaf24@4850 74 /* "acpi=strict": Disables out-of-spec workarounds. */
kaf24@4850 75 /* "acpi=ht": Limit ACPI just to boot-time to enable HT. */
kaf24@4850 76 /* "acpi=noirq": Disables ACPI interrupt routing. */
kaf24@4850 77 static void parse_acpi_param(char *s);
kaf24@4850 78 custom_param("acpi", parse_acpi_param);
kaf24@4850 79
kaf24@4850 80 /* **** Linux config option: propagated to domain0. */
kaf24@4850 81 /* acpi_skip_timer_override: Skip IRQ0 overrides. */
kaf24@4850 82 extern int acpi_skip_timer_override;
kaf24@4850 83 boolean_param("acpi_skip_timer_override", acpi_skip_timer_override);
kaf24@4850 84
kaf24@4850 85 /* **** Linux config option: propagated to domain0. */
kaf24@4850 86 /* noapic: Disable IOAPIC setup. */
kaf24@4850 87 extern int skip_ioapic_setup;
kaf24@4850 88 boolean_param("noapic", skip_ioapic_setup);
kaf24@4850 89
keir@17546 90 /* **** Linux config option: propagated to domain0. */
keir@17657 91 /* xen_cpuidle: Xen controls CPU C-states (cpuidle). */
keir@19545 92 /*static*/ int xen_cpuidle = -1;
keir@17657 93 boolean_param("cpuidle", xen_cpuidle);
keir@17546 94
kaf24@3594 95 int early_boot = 1;
kaf24@3594 96
kaf24@5146 97 cpumask_t cpu_present_map;
kaf24@5146 98
kfraser@15074 99 unsigned long xen_phys_start;
keir@19266 100 unsigned long allocator_bitmap_end;
kfraser@15074 101
keir@19055 102 #ifdef CONFIG_X86_32
kaf24@5003 103 /* Limits of Xen heap, used to initialise the allocator. */
keir@19061 104 unsigned long xenheap_initial_phys_start, xenheap_phys_end;
keir@19055 105 #endif
kaf24@3338 106
kaf24@2298 107 extern void arch_init_memory(void);
kaf24@1589 108 extern void init_IRQ(void);
kaf24@5604 109 extern void early_time_init(void);
kaf24@5167 110 extern void early_cpu_init(void);
kfraser@15747 111 extern void vesa_init(void);
kfraser@15747 112 extern void vesa_mtrr_init(void);
keir@19646 113 extern void init_tmem(void);
kaf24@1589 114
keir@18523 115 DEFINE_PER_CPU(struct desc_struct *, gdt_table) = boot_cpu_gdt_table;
keir@18523 116 #ifdef CONFIG_COMPAT
keir@18523 117 DEFINE_PER_CPU(struct desc_struct *, compat_gdt_table)
keir@18523 118 = boot_cpu_compat_gdt_table;
keir@18523 119 #endif
keir@18523 120
kaf24@8533 121 struct tss_struct init_tss[NR_CPUS];
kaf24@8533 122
kfraser@15490 123 char __attribute__ ((__section__(".bss.stack_aligned"))) cpu0_stack[STACK_SIZE];
kaf24@5011 124
keir@16144 125 struct cpuinfo_x86 boot_cpu_data = { 0, 0, 0, 0, -1 };
kaf24@1452 126
kaf24@1670 127 unsigned long mmu_cr4_features = X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE;
kaf24@1452 128 EXPORT_SYMBOL(mmu_cr4_features);
kaf24@1452 129
kaf24@4818 130 int acpi_disabled;
kaf24@1452 131
kaf24@4850 132 int acpi_force;
kaf24@4850 133 char acpi_param[10] = "";
kfraser@15074 134 static void __init parse_acpi_param(char *s)
kaf24@4850 135 {
kaf24@4850 136 /* Save the parameter so it can be propagated to domain0. */
kfraser@13689 137 safe_strcpy(acpi_param, s);
kaf24@4850 138
kaf24@4850 139 /* Interpret the parameter for use within Xen. */
kaf24@4850 140 if ( !strcmp(s, "off") )
kaf24@4850 141 {
kaf24@4850 142 disable_acpi();
kaf24@4850 143 }
kaf24@4850 144 else if ( !strcmp(s, "force") )
kaf24@4850 145 {
kaf24@4850 146 acpi_force = 1;
kaf24@4850 147 acpi_ht = 1;
kaf24@4850 148 acpi_disabled = 0;
kaf24@4850 149 }
kaf24@4850 150 else if ( !strcmp(s, "strict") )
kaf24@4850 151 {
kaf24@4850 152 acpi_strict = 1;
kaf24@4850 153 }
kaf24@4850 154 else if ( !strcmp(s, "ht") )
kaf24@4850 155 {
kaf24@4850 156 if ( !acpi_force )
kaf24@4850 157 disable_acpi();
kaf24@4850 158 acpi_ht = 1;
kaf24@4850 159 }
kaf24@4850 160 else if ( !strcmp(s, "noirq") )
kaf24@4850 161 {
kaf24@4850 162 acpi_noirq_set();
kaf24@4850 163 }
kaf24@4850 164 }
kaf24@4850 165
kaf24@1452 166 static void __init do_initcalls(void)
kaf24@1452 167 {
kaf24@1452 168 initcall_t *call;
kaf24@1452 169 for ( call = &__initcall_start; call < &__initcall_end; call++ )
kaf24@1452 170 (*call)();
kaf24@1452 171 }
kaf24@1452 172
kfraser@15074 173 #define EARLY_FAIL(f, a...) do { \
kfraser@15074 174 printk( f , ## a ); \
kfraser@15871 175 for ( ; ; ) halt(); \
kfraser@15074 176 } while (0)
kaf24@8459 177
keir@19076 178 static unsigned long __initdata initial_images_base;
keir@19076 179 static unsigned long __initdata initial_images_start;
keir@19076 180 static unsigned long __initdata initial_images_end;
kaf24@9067 181
kfraser@15074 182 unsigned long __init initial_images_nrpages(void)
kaf24@9067 183 {
keir@19076 184 ASSERT(!(initial_images_base & ~PAGE_MASK));
kfraser@15489 185 ASSERT(!(initial_images_end & ~PAGE_MASK));
kfraser@15489 186 return ((initial_images_end >> PAGE_SHIFT) -
keir@19076 187 (initial_images_base >> PAGE_SHIFT));
kaf24@9067 188 }
kaf24@9067 189
kfraser@15074 190 void __init discard_initial_images(void)
kaf24@9067 191 {
keir@19076 192 init_domheap_pages(initial_images_base, initial_images_end);
kaf24@9067 193 }
kaf24@9067 194
kaf24@9818 195 extern char __per_cpu_start[], __per_cpu_data_end[], __per_cpu_end[];
kaf24@9818 196
kfraser@11241 197 static void __init percpu_init_areas(void)
kaf24@9818 198 {
kaf24@9818 199 unsigned int i, data_size = __per_cpu_data_end - __per_cpu_start;
kfraser@15074 200 unsigned int first_unused;
kaf24@9818 201
kaf24@9818 202 BUG_ON(data_size > PERCPU_SIZE);
kaf24@9818 203
kfraser@15074 204 /* Initialise per-cpu data area for all possible secondary CPUs. */
kfraser@15074 205 for ( i = 1; (i < NR_CPUS) && cpu_possible(i); i++ )
kfraser@15074 206 memcpy(__per_cpu_start + (i << PERCPU_SHIFT),
kfraser@15074 207 __per_cpu_start,
kfraser@15074 208 data_size);
kaf24@9818 209 first_unused = i;
kaf24@9818 210
kfraser@14340 211 /* Check that there are no holes in cpu_possible_map. */
kaf24@9818 212 for ( ; i < NR_CPUS; i++ )
kfraser@14340 213 BUG_ON(cpu_possible(i));
kaf24@9818 214
kfraser@11241 215 #ifndef MEMORY_GUARD
kaf24@9818 216 init_xenheap_pages(__pa(__per_cpu_start) + (first_unused << PERCPU_SHIFT),
kaf24@9818 217 __pa(__per_cpu_end));
kfraser@11241 218 #endif
keir@15082 219 memguard_guard_range(&__per_cpu_start[first_unused << PERCPU_SHIFT],
keir@15082 220 (NR_CPUS - first_unused) << PERCPU_SHIFT);
keir@15082 221 #if defined(CONFIG_X86_64)
keir@15082 222 /* Also zap the mapping in the 1:1 area. */
keir@15082 223 memguard_guard_range(__va(__pa(__per_cpu_start)) +
keir@15082 224 (first_unused << PERCPU_SHIFT),
keir@15082 225 (NR_CPUS - first_unused) << PERCPU_SHIFT);
keir@15082 226 #endif
kaf24@9818 227 }
kaf24@9818 228
kfraser@11241 229 static void __init init_idle_domain(void)
kfraser@11240 230 {
kfraser@11240 231 struct domain *idle_domain;
kfraser@11240 232
kfraser@11240 233 /* Domain creation requires that scheduler structures are initialised. */
kfraser@11240 234 scheduler_init();
kfraser@11240 235
kfraser@14911 236 idle_domain = domain_create(IDLE_DOMAIN_ID, 0, 0);
kfraser@11240 237 if ( (idle_domain == NULL) || (alloc_vcpu(idle_domain, 0, 0) == NULL) )
kfraser@11240 238 BUG();
kfraser@11240 239
kfraser@11240 240 set_current(idle_domain->vcpu[0]);
kfraser@11240 241 idle_vcpu[0] = this_cpu(curr_vcpu) = current;
kfraser@11240 242
kfraser@11240 243 setup_idle_pagetable();
kfraser@11240 244 }
kfraser@11240 245
kfraser@15074 246 static void __init srat_detect_node(int cpu)
kfraser@11971 247 {
kfraser@11998 248 unsigned node;
keir@17551 249 u32 apicid = x86_cpu_to_apicid[cpu];
kfraser@11971 250
kfraser@11998 251 node = apicid_to_node[apicid];
kfraser@11998 252 if ( node == NUMA_NO_NODE )
kfraser@11998 253 node = 0;
kfraser@11998 254 numa_set_node(cpu, node);
kfraser@11971 255
kfraser@11998 256 if ( acpi_numa > 0 )
kfraser@11998 257 printk(KERN_INFO "CPU %d APIC %d -> Node %d\n", cpu, apicid, node);
kfraser@11971 258 }
kfraser@11971 259
kfraser@15740 260 /*
kfraser@15740 261 * Ensure a given physical memory range is present in the bootstrap mappings.
kfraser@15740 262 * Use superpage mappings to ensure that pagetable memory needn't be allocated.
kfraser@15740 263 */
kfraser@15740 264 static void __init bootstrap_map(unsigned long start, unsigned long end)
kfraser@15740 265 {
kfraser@15740 266 unsigned long mask = (1UL << L2_PAGETABLE_SHIFT) - 1;
keir@16888 267 start = max_t(unsigned long, start & ~mask, 16UL << 20);
kfraser@15740 268 end = (end + mask) & ~mask;
keir@16888 269 if ( start >= end )
keir@16888 270 return;
kfraser@15740 271 if ( end > BOOTSTRAP_DIRECTMAP_END )
kfraser@15740 272 panic("Cannot access memory beyond end of "
kfraser@15740 273 "bootstrap direct-map area\n");
kfraser@15740 274 map_pages_to_xen(
kfraser@15740 275 (unsigned long)maddr_to_bootstrap_virt(start),
kfraser@15740 276 start >> PAGE_SHIFT, (end-start) >> PAGE_SHIFT, PAGE_HYPERVISOR);
kfraser@15740 277 }
kfraser@15740 278
kfraser@15074 279 static void __init move_memory(
kfraser@15074 280 unsigned long dst, unsigned long src_start, unsigned long src_end)
ian@12677 281 {
kfraser@15740 282 bootstrap_map(src_start, src_end);
kfraser@15740 283 bootstrap_map(dst, dst + src_end - src_start);
kfraser@15074 284 memmove(maddr_to_bootstrap_virt(dst),
kfraser@15074 285 maddr_to_bootstrap_virt(src_start),
ian@12677 286 src_end - src_start);
kfraser@15074 287 }
kfraser@15074 288
kfraser@15074 289 /* A temporary copy of the e820 map that we can mess with during bootstrap. */
kfraser@15074 290 static struct e820map __initdata boot_e820;
kfraser@15074 291
keir@15298 292 struct boot_video_info {
keir@15298 293 u8 orig_x; /* 0x00 */
keir@15298 294 u8 orig_y; /* 0x01 */
keir@15298 295 u8 orig_video_mode; /* 0x02 */
keir@15298 296 u8 orig_video_cols; /* 0x03 */
keir@15298 297 u8 orig_video_lines; /* 0x04 */
keir@15298 298 u8 orig_video_isVGA; /* 0x05 */
keir@15298 299 u16 orig_video_points; /* 0x06 */
keir@15298 300
keir@15298 301 /* VESA graphic mode -- linear frame buffer */
keir@15298 302 u32 capabilities; /* 0x08 */
keir@15298 303 u16 lfb_linelength; /* 0x0c */
keir@15298 304 u16 lfb_width; /* 0x0e */
keir@15298 305 u16 lfb_height; /* 0x10 */
keir@15298 306 u16 lfb_depth; /* 0x12 */
keir@15298 307 u32 lfb_base; /* 0x14 */
keir@15298 308 u32 lfb_size; /* 0x18 */
keir@15298 309 u8 red_size; /* 0x1c */
keir@15298 310 u8 red_pos; /* 0x1d */
keir@15298 311 u8 green_size; /* 0x1e */
keir@15298 312 u8 green_pos; /* 0x1f */
keir@15298 313 u8 blue_size; /* 0x20 */
keir@15298 314 u8 blue_pos; /* 0x21 */
keir@15298 315 u8 rsvd_size; /* 0x22 */
keir@15298 316 u8 rsvd_pos; /* 0x23 */
keir@15298 317 u16 vesapm_seg; /* 0x24 */
keir@15298 318 u16 vesapm_off; /* 0x26 */
keir@16124 319 u16 vesa_attrib; /* 0x28 */
keir@15298 320 };
keir@15298 321
keir@15298 322 static void __init parse_video_info(void)
keir@15298 323 {
keir@15298 324 struct boot_video_info *bvi = &bootsym(boot_vid_info);
keir@15298 325
keir@15298 326 if ( (bvi->orig_video_isVGA == 1) && (bvi->orig_video_mode == 3) )
keir@15298 327 {
keir@15298 328 vga_console_info.video_type = XEN_VGATYPE_TEXT_MODE_3;
keir@15298 329 vga_console_info.u.text_mode_3.font_height = bvi->orig_video_points;
keir@15298 330 vga_console_info.u.text_mode_3.cursor_x = bvi->orig_x;
keir@15298 331 vga_console_info.u.text_mode_3.cursor_y = bvi->orig_y;
keir@15298 332 vga_console_info.u.text_mode_3.rows = bvi->orig_video_lines;
keir@15298 333 vga_console_info.u.text_mode_3.columns = bvi->orig_video_cols;
keir@15298 334 }
keir@15298 335 else if ( bvi->orig_video_isVGA == 0x23 )
keir@15298 336 {
keir@15298 337 vga_console_info.video_type = XEN_VGATYPE_VESA_LFB;
keir@15298 338 vga_console_info.u.vesa_lfb.width = bvi->lfb_width;
keir@15298 339 vga_console_info.u.vesa_lfb.height = bvi->lfb_height;
keir@15298 340 vga_console_info.u.vesa_lfb.bytes_per_line = bvi->lfb_linelength;
keir@15298 341 vga_console_info.u.vesa_lfb.bits_per_pixel = bvi->lfb_depth;
keir@15298 342 vga_console_info.u.vesa_lfb.lfb_base = bvi->lfb_base;
keir@15298 343 vga_console_info.u.vesa_lfb.lfb_size = bvi->lfb_size;
keir@15298 344 vga_console_info.u.vesa_lfb.red_pos = bvi->red_pos;
keir@15298 345 vga_console_info.u.vesa_lfb.red_size = bvi->red_size;
keir@15298 346 vga_console_info.u.vesa_lfb.green_pos = bvi->green_pos;
keir@15298 347 vga_console_info.u.vesa_lfb.green_size = bvi->green_size;
keir@15298 348 vga_console_info.u.vesa_lfb.blue_pos = bvi->blue_pos;
keir@15298 349 vga_console_info.u.vesa_lfb.blue_size = bvi->blue_size;
keir@15298 350 vga_console_info.u.vesa_lfb.rsvd_pos = bvi->rsvd_pos;
keir@15298 351 vga_console_info.u.vesa_lfb.rsvd_size = bvi->rsvd_size;
keir@16124 352 vga_console_info.u.vesa_lfb.gbl_caps = bvi->capabilities;
keir@16124 353 vga_console_info.u.vesa_lfb.mode_attrs = bvi->vesa_attrib;
keir@15298 354 }
keir@15298 355 }
keir@15298 356
keir@16563 357 void __init kexec_reserve_area(struct e820map *e820)
keir@16563 358 {
keir@16563 359 unsigned long kdump_start = kexec_crash_area.start;
keir@16563 360 unsigned long kdump_size = kexec_crash_area.size;
keir@16563 361 static int is_reserved = 0;
keir@16563 362
keir@16563 363 kdump_size = (kdump_size + PAGE_SIZE - 1) & PAGE_MASK;
keir@16563 364
keir@16563 365 if ( (kdump_start == 0) || (kdump_size == 0) || is_reserved )
keir@16563 366 return;
keir@16563 367
keir@16563 368 is_reserved = 1;
keir@16563 369
keir@17674 370 if ( !reserve_e820_ram(e820, kdump_start, kdump_start + kdump_size) )
keir@16563 371 {
keir@16563 372 printk("Kdump: DISABLED (failed to reserve %luMB (%lukB) at 0x%lx)"
keir@16563 373 "\n", kdump_size >> 20, kdump_size >> 10, kdump_start);
keir@16563 374 kexec_crash_area.start = kexec_crash_area.size = 0;
keir@16563 375 }
keir@16563 376 else
keir@16563 377 {
keir@16563 378 printk("Kdump: %luMB (%lukB) at 0x%lx\n",
keir@16563 379 kdump_size >> 20, kdump_size >> 10, kdump_start);
keir@16563 380 }
keir@16563 381 }
keir@16563 382
keir@15082 383 void init_done(void)
keir@15082 384 {
keir@15082 385 extern char __init_begin[], __init_end[];
keir@15082 386
keir@15082 387 /* Free (or page-protect) the init areas. */
keir@18988 388 memset(__init_begin, 0xcc, __init_end - __init_begin); /* int3 poison */
keir@15082 389 #ifndef MEMORY_GUARD
keir@15082 390 init_xenheap_pages(__pa(__init_begin), __pa(__init_end));
keir@15082 391 #endif
keir@15082 392 memguard_guard_range(__init_begin, __init_end - __init_begin);
keir@15082 393 #if defined(CONFIG_X86_64)
keir@15082 394 /* Also zap the mapping in the 1:1 area. */
keir@15082 395 memguard_guard_range(__va(__pa(__init_begin)), __init_end - __init_begin);
keir@15082 396 #endif
keir@15082 397 printk("Freed %ldkB init memory.\n", (long)(__init_end-__init_begin)>>10);
keir@15082 398
keir@15082 399 startup_cpu_idle_loop();
keir@15082 400 }
keir@15082 401
kfraser@15796 402 static char * __init cmdline_cook(char *p)
kfraser@15796 403 {
kfraser@15796 404 p = p ? : "";
kfraser@15796 405 while ( *p == ' ' )
kfraser@15796 406 p++;
kfraser@15796 407 while ( (*p != ' ') && (*p != '\0') )
kfraser@15796 408 p++;
kfraser@15796 409 while ( *p == ' ' )
kfraser@15796 410 p++;
kfraser@15796 411 return p;
kfraser@15796 412 }
kfraser@15796 413
kfraser@15379 414 void __init __start_xen(unsigned long mbi_p)
kaf24@1452 415 {
kfraser@15293 416 char *memmap_type = NULL;
kfraser@15796 417 char *cmdline, *kextra;
kaf24@8457 418 unsigned long _initrd_start = 0, _initrd_len = 0;
kaf24@8457 419 unsigned int initrdidx = 1;
kfraser@15379 420 multiboot_info_t *mbi = __va(mbi_p);
kaf24@8457 421 module_t *mod = (module_t *)__va(mbi->mods_addr);
keir@19135 422 unsigned long nr_pages, modules_length, modules_headroom;
keir@19544 423 int i, j, e820_warn = 0, bytes = 0;
kaf24@5776 424 struct ns16550_defaults ns16550 = {
kaf24@5776 425 .data_bits = 8,
kaf24@5776 426 .parity = 'n',
kaf24@5776 427 .stop_bits = 1
kaf24@5776 428 };
kaf24@3338 429
kfraser@12853 430 extern void early_page_fault(void);
kfraser@12853 431 set_intr_gate(TRAP_page_fault, &early_page_fault);
kfraser@12853 432
kaf24@3338 433 /* Parse the command-line options. */
kfraser@15796 434 cmdline = cmdline_cook((mbi->flags & MBI_CMDLINE) ?
kfraser@15796 435 __va(mbi->cmdline) : NULL);
kfraser@15426 436 if ( (kextra = strstr(cmdline, " -- ")) != NULL )
kfraser@15426 437 {
kfraser@15426 438 /*
kfraser@15426 439 * Options after ' -- ' separator belong to dom0.
kfraser@15426 440 * 1. Orphan dom0's options from Xen's command line.
kfraser@15426 441 * 2. Skip all but final leading space from dom0's options.
kfraser@15426 442 */
kfraser@15426 443 *kextra = '\0';
kfraser@15426 444 kextra += 3;
kfraser@15426 445 while ( kextra[1] == ' ' ) kextra++;
kfraser@15426 446 }
kaf24@9823 447 cmdline_parse(cmdline);
kaf24@3338 448
keir@15298 449 parse_video_info();
keir@15298 450
kaf24@8534 451 set_current((struct vcpu *)0xfffff000); /* debug sanity */
keir@18790 452 idle_vcpu[0] = current;
kaf24@8534 453 set_processor_id(0); /* needed early, for smp_processor_id() */
keir@16378 454 if ( cpu_has_efer )
keir@16378 455 rdmsrl(MSR_EFER, this_cpu(efer));
keir@16267 456 asm volatile ( "mov %%cr4,%0" : "=r" (this_cpu(cr4)) );
kaf24@3338 457
kaf24@5146 458 smp_prepare_boot_cpu();
kaf24@5146 459
kaf24@3338 460 /* We initialise the serial devices very early so we can get debugging. */
kaf24@5776 461 ns16550.io_base = 0x3f8;
kaf24@5776 462 ns16550.irq = 4;
kaf24@5776 463 ns16550_init(0, &ns16550);
kaf24@5776 464 ns16550.io_base = 0x2f8;
kaf24@5776 465 ns16550.irq = 3;
kaf24@5776 466 ns16550_init(1, &ns16550);
keir@19543 467 console_init_preirq();
kaf24@3338 468
kfraser@11947 469 printk("Command line: %s\n", cmdline);
kaf24@9823 470
kfraser@15330 471 printk("Video information:\n");
kfraser@15330 472
kfraser@15330 473 /* Print VGA display mode information. */
keir@15298 474 switch ( vga_console_info.video_type )
keir@15298 475 {
keir@15298 476 case XEN_VGATYPE_TEXT_MODE_3:
kfraser@15330 477 printk(" VGA is text mode %dx%d, font 8x%d\n",
keir@15298 478 vga_console_info.u.text_mode_3.columns,
keir@15298 479 vga_console_info.u.text_mode_3.rows,
keir@15298 480 vga_console_info.u.text_mode_3.font_height);
keir@15298 481 break;
keir@15298 482 case XEN_VGATYPE_VESA_LFB:
kfraser@15330 483 printk(" VGA is graphics mode %dx%d, %d bpp\n",
keir@15298 484 vga_console_info.u.vesa_lfb.width,
keir@15298 485 vga_console_info.u.vesa_lfb.height,
keir@15298 486 vga_console_info.u.vesa_lfb.bits_per_pixel);
keir@15298 487 break;
kfraser@15330 488 default:
kfraser@15330 489 printk(" No VGA detected\n");
kfraser@15330 490 break;
kfraser@15330 491 }
kfraser@15330 492
kfraser@15330 493 /* Print VBE/DDC EDID information. */
kfraser@15330 494 if ( bootsym(boot_edid_caps) != 0x1313 )
kfraser@15330 495 {
kfraser@15330 496 u16 caps = bootsym(boot_edid_caps);
kfraser@15330 497 printk(" VBE/DDC methods:%s%s%s; ",
kfraser@15330 498 (caps & 1) ? " V1" : "",
kfraser@15330 499 (caps & 2) ? " V2" : "",
kfraser@15330 500 !(caps & 3) ? " none" : "");
kfraser@15330 501 printk("EDID transfer time: %d seconds\n", caps >> 8);
kfraser@15330 502 if ( *(u32 *)bootsym(boot_edid_info) == 0x13131313 )
kfraser@15330 503 {
kfraser@15330 504 printk(" EDID info not retrieved because ");
kfraser@15330 505 if ( !(caps & 3) )
kfraser@15330 506 printk("no DDC retrieval method detected\n");
kfraser@15330 507 else if ( (caps >> 8) > 5 )
kfraser@15330 508 printk("takes longer than 5 seconds\n");
kfraser@15330 509 else
kfraser@15330 510 printk("of reasons unknown\n");
kfraser@15330 511 }
keir@15298 512 }
keir@15298 513
kfraser@15336 514 printk("Disc information:\n");
kfraser@15336 515 printk(" Found %d MBR signatures\n",
kfraser@15430 516 bootsym(boot_mbr_signature_nr));
kfraser@15336 517 printk(" Found %d EDD information structures\n",
kfraser@15336 518 bootsym(boot_edd_info_nr));
kfraser@15336 519
kaf24@3344 520 /* Check that we have at least one Multiboot module. */
kaf24@3344 521 if ( !(mbi->flags & MBI_MODULES) || (mbi->mods_count == 0) )
kfraser@15074 522 EARLY_FAIL("dom0 kernel not specified. "
kfraser@15074 523 "Check bootloader configuration.\n");
kaf24@5011 524
kaf24@5011 525 if ( ((unsigned long)cpu0_stack & (STACK_SIZE-1)) != 0 )
kfraser@15074 526 EARLY_FAIL("Misaligned CPU0 stack.\n");
kaf24@3338 527
kfraser@15293 528 if ( e820_raw_nr != 0 )
kfraser@15292 529 {
kfraser@15293 530 memmap_type = "Xen-e820";
kfraser@15292 531 }
kfraser@15293 532 else if ( bootsym(lowmem_kb) )
kfraser@15292 533 {
kfraser@15293 534 memmap_type = "Xen-e801";
kfraser@15292 535 e820_raw[0].addr = 0;
kfraser@15293 536 e820_raw[0].size = bootsym(lowmem_kb) << 10;
kfraser@15292 537 e820_raw[0].type = E820_RAM;
kfraser@15292 538 e820_raw[1].addr = 0x100000;
kfraser@15293 539 e820_raw[1].size = bootsym(highmem_kb) << 10;
kfraser@15292 540 e820_raw[1].type = E820_RAM;
kfraser@15292 541 e820_raw_nr = 2;
kfraser@15292 542 }
kfraser@15292 543 else if ( mbi->flags & MBI_MEMMAP )
kaf24@3344 544 {
kfraser@15293 545 memmap_type = "Multiboot-e820";
keir@15988 546 while ( (bytes < mbi->mmap_length) && (e820_raw_nr < E820MAX) )
kaf24@3344 547 {
kaf24@3344 548 memory_map_t *map = __va(mbi->mmap_addr + bytes);
kaf24@8402 549
kaf24@8402 550 /*
kaf24@8403 551 * This is a gross workaround for a BIOS bug. Some bootloaders do
kaf24@8402 552 * not write e820 map entries into pre-zeroed memory. This is
kaf24@8402 553 * okay if the BIOS fills in all fields of the map entry, but
kaf24@8402 554 * some broken BIOSes do not bother to write the high word of
kaf24@8402 555 * the length field if the length is smaller than 4GB. We
kaf24@8402 556 * detect and fix this by flagging sections below 4GB that
kaf24@8403 557 * appear to be larger than 4GB in size.
kaf24@8402 558 */
kaf24@8403 559 if ( (map->base_addr_high == 0) && (map->length_high != 0) )
kaf24@8402 560 {
kfraser@15292 561 if ( !e820_warn )
kfraser@15292 562 {
kfraser@15292 563 printk("WARNING: Buggy e820 map detected and fixed "
kfraser@15292 564 "(truncated length fields).\n");
kfraser@15292 565 e820_warn = 1;
kfraser@15292 566 }
kaf24@8402 567 map->length_high = 0;
kaf24@8402 568 }
kaf24@8402 569
kaf24@3344 570 e820_raw[e820_raw_nr].addr =
kaf24@3344 571 ((u64)map->base_addr_high << 32) | (u64)map->base_addr_low;
kaf24@3344 572 e820_raw[e820_raw_nr].size =
kaf24@3344 573 ((u64)map->length_high << 32) | (u64)map->length_low;
kfraser@15799 574 e820_raw[e820_raw_nr].type = map->type;
kaf24@3344 575 e820_raw_nr++;
kaf24@8402 576
kaf24@3344 577 bytes += map->size + 4;
kaf24@3344 578 }
kaf24@3344 579 }
kaf24@3344 580 else if ( mbi->flags & MBI_MEMLIMITS )
kaf24@3344 581 {
kfraser@15293 582 memmap_type = "Multiboot-e801";
kaf24@3344 583 e820_raw[0].addr = 0;
kaf24@3344 584 e820_raw[0].size = mbi->mem_lower << 10;
kaf24@3344 585 e820_raw[0].type = E820_RAM;
kaf24@3354 586 e820_raw[1].addr = 0x100000;
kaf24@3354 587 e820_raw[1].size = mbi->mem_upper << 10;
kaf24@3354 588 e820_raw[1].type = E820_RAM;
kaf24@3344 589 e820_raw_nr = 2;
kaf24@3344 590 }
kaf24@3344 591 else
kaf24@3344 592 {
kfraser@15074 593 EARLY_FAIL("Bootloader provided no memory information.\n");
kaf24@3344 594 }
kaf24@3344 595
kaf24@13427 596 /* Sanitise the raw E820 map to produce a final clean version. */
kfraser@15293 597 max_page = init_e820(memmap_type, e820_raw, &e820_raw_nr);
kaf24@3338 598
keir@16563 599 /* Create a temporary copy of the E820 map. */
kfraser@15074 600 memcpy(&boot_e820, &e820, sizeof(e820));
keir@16563 601
keir@16563 602 /* Early kexec reservation (explicit static start address). */
keir@16563 603 kexec_reserve_area(&boot_e820);
kaf24@6111 604
kfraser@15074 605 /*
keir@15077 606 * Iterate backwards over all superpage-aligned RAM regions.
kfraser@15074 607 *
kfraser@15074 608 * We require superpage alignment because the boot allocator is not yet
kfraser@15074 609 * initialised. Hence we can only map superpages in the address range
kfraser@15074 610 * 0 to BOOTSTRAP_DIRECTMAP_END, as this is guaranteed not to require
kfraser@15074 611 * dynamic allocation of pagetables.
kfraser@15074 612 *
kfraser@15074 613 * As well as mapping superpages in that range, in preparation for
kfraser@15074 614 * initialising the boot allocator, we also look for a region to which
kfraser@15074 615 * we can relocate the dom0 kernel and other multiboot modules. Also, on
kfraser@15074 616 * x86/64, we relocate Xen to higher memory.
kfraser@15074 617 */
keir@19544 618 modules_length = 0;
keir@19544 619 for ( i = 0; i < mbi->mods_count; i++ )
keir@19544 620 modules_length += mod[i].mod_end - mod[i].mod_start;
keir@19076 621
keir@19135 622 /* ensure mod[0] is mapped before parsing */
keir@19135 623 bootstrap_map(mod[0].mod_start, mod[0].mod_end);
keir@19135 624 modules_headroom = bzimage_headroom(
keir@19135 625 (char *)(unsigned long)mod[0].mod_start,
keir@19135 626 (unsigned long)(mod[0].mod_end - mod[0].mod_start));
keir@19135 627
keir@15077 628 for ( i = boot_e820.nr_map-1; i >= 0; i-- )
kfraser@15074 629 {
kfraser@15074 630 uint64_t s, e, mask = (1UL << L2_PAGETABLE_SHIFT) - 1;
kaf24@6134 631
keir@16563 632 /* Superpage-aligned chunks from 16MB to BOOTSTRAP_DIRECTMAP_END. */
kfraser@15074 633 s = (boot_e820.map[i].addr + mask) & ~mask;
kfraser@15074 634 e = (boot_e820.map[i].addr + boot_e820.map[i].size) & ~mask;
keir@16563 635 s = max_t(uint64_t, s, 16 << 20);
kfraser@15074 636 e = min_t(uint64_t, e, BOOTSTRAP_DIRECTMAP_END);
kfraser@15074 637 if ( (boot_e820.map[i].type != E820_RAM) || (s >= e) )
kaf24@3354 638 continue;
kaf24@6111 639
kfraser@15074 640 /* Map the chunk. No memory will need to be allocated to do this. */
kfraser@15074 641 map_pages_to_xen(
kfraser@15074 642 (unsigned long)maddr_to_bootstrap_virt(s),
kfraser@15074 643 s >> PAGE_SHIFT, (e-s) >> PAGE_SHIFT, PAGE_HYPERVISOR);
kaf24@6111 644
kfraser@14084 645 #if defined(CONFIG_X86_64)
keir@19190 646 /* Relocate Xen image, allocation bitmap, and one page of padding. */
keir@19190 647 #define reloc_size ((__pa(&_end) + max_page/8 + PAGE_SIZE + mask) & ~mask)
kfraser@15074 648 /* Is the region suitable for relocating Xen? */
keir@19055 649 if ( !xen_phys_start && ((e-s) >= reloc_size) )
kaf24@5003 650 {
kfraser@15074 651 extern l2_pgentry_t l2_xenmap[];
kfraser@15074 652 l4_pgentry_t *pl4e;
kfraser@15074 653 l3_pgentry_t *pl3e;
kfraser@15074 654 l2_pgentry_t *pl2e;
keir@16888 655 int i, j, k;
kfraser@15074 656
kfraser@15074 657 /* Select relocation address. */
keir@19055 658 e -= reloc_size;
kfraser@15074 659 xen_phys_start = e;
kfraser@15292 660 bootsym(trampoline_xen_phys_start) = e;
kfraser@15074 661
kfraser@15074 662 /*
kfraser@15074 663 * Perform relocation to new physical address.
kfraser@15074 664 * Before doing so we must sync static/global data with main memory
kfraser@15074 665 * with a barrier(). After this we must *not* modify static/global
kfraser@15074 666 * data until after we have switched to the relocated pagetables!
kfraser@15074 667 */
kfraser@15074 668 barrier();
kfraser@15074 669 move_memory(e, 0, __pa(&_end) - xen_phys_start);
kfraser@15074 670
kfraser@15379 671 /* Poison low 1MB to detect stray pointers to physical 0-1MB. */
kfraser@15379 672 memset(maddr_to_bootstrap_virt(e), 0x55, 1U<<20);
kfraser@15379 673
kfraser@15074 674 /* Walk initial pagetables, relocating page directory entries. */
kfraser@15074 675 pl4e = __va(__pa(idle_pg_table));
kfraser@15074 676 for ( i = 0 ; i < L4_PAGETABLE_ENTRIES; i++, pl4e++ )
kfraser@15074 677 {
kfraser@15074 678 if ( !(l4e_get_flags(*pl4e) & _PAGE_PRESENT) )
kfraser@15074 679 continue;
kfraser@15074 680 *pl4e = l4e_from_intpte(l4e_get_intpte(*pl4e) +
kfraser@15074 681 xen_phys_start);
kfraser@15074 682 pl3e = l4e_to_l3e(*pl4e);
kfraser@15074 683 for ( j = 0; j < L3_PAGETABLE_ENTRIES; j++, pl3e++ )
kfraser@15074 684 {
keir@16921 685 /* Not present, 1GB mapping, or already relocated? */
kfraser@15074 686 if ( !(l3e_get_flags(*pl3e) & _PAGE_PRESENT) ||
keir@16921 687 (l3e_get_flags(*pl3e) & _PAGE_PSE) ||
kfraser@15074 688 (l3e_get_pfn(*pl3e) > 0x1000) )
kfraser@15074 689 continue;
kfraser@15074 690 *pl3e = l3e_from_intpte(l3e_get_intpte(*pl3e) +
kfraser@15074 691 xen_phys_start);
keir@16888 692 pl2e = l3e_to_l2e(*pl3e);
keir@16888 693 for ( k = 0; k < L2_PAGETABLE_ENTRIES; k++, pl2e++ )
keir@16888 694 {
keir@16888 695 /* Not present, PSE, or already relocated? */
keir@16888 696 if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) ||
keir@16888 697 (l2e_get_flags(*pl2e) & _PAGE_PSE) ||
keir@16888 698 (l2e_get_pfn(*pl2e) > 0x1000) )
keir@16888 699 continue;
keir@16888 700 *pl2e = l2e_from_intpte(l2e_get_intpte(*pl2e) +
keir@16888 701 xen_phys_start);
keir@16888 702 }
kfraser@15074 703 }
kfraser@15074 704 }
kfraser@15074 705
kfraser@15074 706 /* The only data mappings to be relocated are in the Xen area. */
kfraser@15074 707 pl2e = __va(__pa(l2_xenmap));
keir@16888 708 *pl2e++ = l2e_from_pfn(xen_phys_start >> PAGE_SHIFT,
keir@16888 709 PAGE_HYPERVISOR | _PAGE_PSE);
keir@16888 710 for ( i = 1; i < L2_PAGETABLE_ENTRIES; i++, pl2e++ )
kfraser@15074 711 {
kfraser@15074 712 if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) )
kfraser@15074 713 continue;
kfraser@15074 714 *pl2e = l2e_from_intpte(l2e_get_intpte(*pl2e) +
kfraser@15074 715 xen_phys_start);
kfraser@15074 716 }
kfraser@15074 717
kfraser@15074 718 /* Re-sync the stack and then switch to relocated pagetables. */
kfraser@15074 719 asm volatile (
kfraser@15074 720 "rep movsb ; " /* re-sync the stack */
kfraser@15074 721 "movq %%cr4,%%rsi ; "
kfraser@15074 722 "andb $0x7f,%%sil ; "
kfraser@15074 723 "movq %%rsi,%%cr4 ; " /* CR4.PGE == 0 */
kfraser@15074 724 "movq %0,%%cr3 ; " /* CR3 == new pagetables */
kfraser@15074 725 "orb $0x80,%%sil ; "
kfraser@15074 726 "movq %%rsi,%%cr4 " /* CR4.PGE == 1 */
kfraser@15074 727 : : "r" (__pa(idle_pg_table)), "S" (cpu0_stack),
kfraser@15074 728 "D" (__va(__pa(cpu0_stack))), "c" (STACK_SIZE) : "memory" );
kaf24@5003 729 }
kaf24@5003 730 #endif
keir@15077 731
keir@15077 732 /* Is the region suitable for relocating the multiboot modules? */
keir@19076 733 if ( !initial_images_start && (s < e) &&
keir@19076 734 ((e-s) >= (modules_length+modules_headroom)) )
keir@15077 735 {
kfraser@15489 736 initial_images_end = e;
kfraser@15489 737 e = (e - modules_length) & PAGE_MASK;
keir@15077 738 initial_images_start = e;
keir@19076 739 e -= modules_headroom;
keir@19076 740 initial_images_base = e;
keir@19544 741 e += modules_length + modules_headroom;
keir@19544 742 for ( j = mbi->mods_count-1; j >= 0; j-- )
keir@19544 743 {
keir@19544 744 e -= mod[j].mod_end - mod[j].mod_start;
keir@19544 745 move_memory(e, mod[j].mod_start, mod[j].mod_end);
keir@19544 746 mod[j].mod_end += e - mod[j].mod_start;
keir@19544 747 mod[j].mod_start = e;
keir@19544 748 }
keir@15077 749 }
kfraser@15489 750
kfraser@15489 751 if ( !kexec_crash_area.start && (s < e) &&
kfraser@15489 752 ((e-s) >= kexec_crash_area.size) )
kfraser@15489 753 {
kfraser@15489 754 e = (e - kexec_crash_area.size) & PAGE_MASK;
kfraser@15489 755 kexec_crash_area.start = e;
kfraser@15489 756 }
kaf24@3354 757 }
kaf24@3354 758
kfraser@15074 759 if ( !initial_images_start )
kfraser@15074 760 EARLY_FAIL("Not enough memory to relocate the dom0 kernel image.\n");
keir@19076 761 reserve_e820_ram(&boot_e820, initial_images_base, initial_images_end);
kfraser@15074 762
keir@19055 763 /* Initialise boot heap. */
keir@19055 764 allocator_bitmap_end = init_boot_allocator(__pa(&_end));
keir@19055 765 #if defined(CONFIG_X86_32)
keir@19061 766 xenheap_initial_phys_start = allocator_bitmap_end;
keir@19061 767 xenheap_phys_end = DIRECTMAP_MBYTES << 20;
keir@19055 768 #else
kfraser@15074 769 if ( !xen_phys_start )
kfraser@15074 770 EARLY_FAIL("Not enough memory to relocate Xen.\n");
keir@19055 771 reserve_e820_ram(&boot_e820, __pa(&_start), allocator_bitmap_end);
kfraser@15074 772 #endif
kfraser@15074 773
keir@16563 774 /* Late kexec reservation (dynamic start address). */
keir@16563 775 kexec_reserve_area(&boot_e820);
kfraser@15489 776
kfraser@15074 777 /*
keir@16624 778 * With the boot allocator now initialised, we can walk every RAM region
keir@16624 779 * and map it in its entirety (on x86/64, at least) and notify it to the
kfraser@15074 780 * boot allocator.
kfraser@15074 781 */
kfraser@15074 782 for ( i = 0; i < boot_e820.nr_map; i++ )
kfraser@15074 783 {
keir@16563 784 uint64_t s, e, map_s, map_e, mask = PAGE_SIZE - 1;
kfraser@15074 785
kfraser@15074 786 /* Only page alignment required now. */
kfraser@15074 787 s = (boot_e820.map[i].addr + mask) & ~mask;
kfraser@15074 788 e = (boot_e820.map[i].addr + boot_e820.map[i].size) & ~mask;
keir@16563 789 #if defined(CONFIG_X86_32)
keir@16563 790 s = max_t(uint64_t, s, xenheap_phys_end);
keir@16563 791 #else
keir@16563 792 s = max_t(uint64_t, s, 1<<20);
keir@16563 793 #endif
kfraser@15074 794 if ( (boot_e820.map[i].type != E820_RAM) || (s >= e) )
kfraser@15074 795 continue;
kfraser@15074 796
keir@16563 797 /* Need to create mappings above 16MB. */
keir@16563 798 map_s = max_t(uint64_t, s, 16<<20);
kfraser@15074 799 map_e = e;
keir@16563 800 #if defined(CONFIG_X86_32) /* mappings are truncated on x86_32 */
kfraser@15074 801 map_e = min_t(uint64_t, map_e, BOOTSTRAP_DIRECTMAP_END);
kfraser@15074 802 #endif
keir@16563 803
keir@16563 804 /* Pass mapped memory to allocator /before/ creating new mappings. */
keir@16624 805 init_boot_pages(s, min_t(uint64_t, map_s, e));
keir@16563 806
keir@16563 807 /* Create new mappings /before/ passing memory to the allocator. */
keir@16563 808 if ( map_s < map_e )
kfraser@15074 809 map_pages_to_xen(
keir@16563 810 (unsigned long)maddr_to_bootstrap_virt(map_s),
keir@16563 811 map_s >> PAGE_SHIFT, (map_e-map_s) >> PAGE_SHIFT,
keir@16563 812 PAGE_HYPERVISOR);
kfraser@15074 813
keir@16563 814 /* Pass remainder of this memory chunk to the allocator. */
keir@16624 815 init_boot_pages(map_s, e);
kfraser@15074 816 }
kfraser@15074 817
kaf24@5003 818 memguard_init();
kaf24@4950 819
kfraser@15074 820 nr_pages = 0;
kfraser@15074 821 for ( i = 0; i < e820.nr_map; i++ )
kfraser@15074 822 if ( e820.map[i].type == E820_RAM )
kfraser@15074 823 nr_pages += e820.map[i].size >> PAGE_SHIFT;
ian@12681 824 printk("System RAM: %luMB (%lukB)\n",
kaf24@3354 825 nr_pages >> (20 - PAGE_SHIFT),
kaf24@3354 826 nr_pages << (PAGE_SHIFT - 10));
kaf24@7220 827 total_pages = nr_pages;
kaf24@3354 828
kfraser@11296 829 /* Sanity check for unwanted bloat of certain hypercall structures. */
kfraser@11296 830 BUILD_BUG_ON(sizeof(((struct xen_platform_op *)0)->u) !=
kfraser@11296 831 sizeof(((struct xen_platform_op *)0)->u.pad));
kfraser@11296 832 BUILD_BUG_ON(sizeof(((struct xen_domctl *)0)->u) !=
kfraser@11296 833 sizeof(((struct xen_domctl *)0)->u.pad));
kfraser@11296 834 BUILD_BUG_ON(sizeof(((struct xen_sysctl *)0)->u) !=
kfraser@11296 835 sizeof(((struct xen_sysctl *)0)->u.pad));
kaf24@7388 836
kaf24@9878 837 BUILD_BUG_ON(sizeof(start_info_t) > PAGE_SIZE);
kaf24@9878 838 BUILD_BUG_ON(sizeof(shared_info_t) > PAGE_SIZE);
ack@13292 839 BUILD_BUG_ON(sizeof(struct vcpu_info) != 64);
kaf24@7744 840
ack@13291 841 #ifdef CONFIG_COMPAT
ack@13291 842 BUILD_BUG_ON(sizeof(((struct compat_platform_op *)0)->u) !=
ack@13291 843 sizeof(((struct compat_platform_op *)0)->u.pad));
ack@13291 844 BUILD_BUG_ON(sizeof(start_info_compat_t) > PAGE_SIZE);
ack@13292 845 BUILD_BUG_ON(sizeof(struct compat_vcpu_info) != 64);
ack@13291 846 #endif
ack@13291 847
kfraser@10492 848 /* Check definitions in public headers match internal defs. */
kaf24@9878 849 BUILD_BUG_ON(__HYPERVISOR_VIRT_START != HYPERVISOR_VIRT_START);
kaf24@8521 850 #ifdef HYPERVISOR_VIRT_END
kaf24@9878 851 BUILD_BUG_ON(__HYPERVISOR_VIRT_END != HYPERVISOR_VIRT_END);
kaf24@8521 852 #endif
kfraser@10492 853 BUILD_BUG_ON(MACH2PHYS_VIRT_START != RO_MPT_VIRT_START);
kfraser@10492 854 BUILD_BUG_ON(MACH2PHYS_VIRT_END != RO_MPT_VIRT_END);
kaf24@8521 855
kaf24@3354 856 init_frametable();
kaf24@3338 857
kfraser@11971 858 acpi_boot_table_init();
kfraser@11971 859
kfraser@11971 860 acpi_numa_init();
kfraser@11971 861
kfraser@11971 862 numa_initmem_init(0, max_page);
kfraser@11971 863
keir@19055 864 #if defined(CONFIG_X86_32)
keir@19055 865 /* Initialise the Xen heap. */
keir@19061 866 init_xenheap_pages(xenheap_initial_phys_start, xenheap_phys_end);
keir@19061 867 nr_pages = (xenheap_phys_end - xenheap_initial_phys_start) >> PAGE_SHIFT;
kaf24@6111 868 printk("Xen heap: %luMB (%lukB)\n",
kaf24@6111 869 nr_pages >> (20 - PAGE_SHIFT),
kaf24@6111 870 nr_pages << (PAGE_SHIFT - 10));
keir@19055 871 #endif
kaf24@3338 872
keir@14680 873 end_boot_allocator();
keir@19055 874 early_boot = 0;
keir@14680 875
keir@19055 876 #if defined(CONFIG_X86_64)
keir@19055 877 vesa_init();
keir@19055 878 #endif
kaf24@3338 879
keir@17444 880 softirq_init();
keir@17444 881
kaf24@8459 882 early_cpu_init();
kaf24@8459 883
kaf24@8459 884 paging_init();
kaf24@8459 885
keir@16274 886 tboot_probe();
keir@16274 887
kaf24@8459 888 /* Unmap the first page of CPU0's stack. */
kaf24@8459 889 memguard_guard_stack(cpu0_stack);
kaf24@8459 890
kaf24@8459 891 open_softirq(NEW_TLBFLUSH_CLOCK_PERIOD_SOFTIRQ, new_tlbflush_clock_period);
kaf24@8459 892
kaf24@8459 893 if ( opt_watchdog )
kaf24@8459 894 nmi_watchdog = NMI_LOCAL_APIC;
kaf24@8459 895
kaf24@8459 896 sort_exception_tables();
kaf24@8459 897
kaf24@8459 898 find_smp_config();
kaf24@8459 899
kaf24@8459 900 dmi_scan_machine();
kaf24@8459 901
kaf24@8459 902 generic_apic_probe();
kaf24@8459 903
keir@17552 904 if ( x2apic_is_available() )
keir@17552 905 enable_x2apic();
keir@17552 906
kaf24@8459 907 acpi_boot_init();
kaf24@8459 908
kfraser@11971 909 init_cpu_to_node();
kfraser@11971 910
kfraser@11241 911 if ( smp_found_config )
kaf24@8459 912 get_smp_config();
kaf24@8459 913
keir@15083 914 #ifdef CONFIG_X86_64
keir@15083 915 /* Low mappings were only needed for some BIOS table parsing. */
keir@15083 916 zap_low_mappings();
keir@15083 917 #endif
keir@15083 918
kaf24@8459 919 init_apic_mappings();
kaf24@8459 920
kaf24@8459 921 init_IRQ();
kaf24@8459 922
kfraser@11241 923 percpu_init_areas();
kfraser@11241 924
kfraser@15815 925 xsm_init(&initrdidx, mbi, initial_images_start);
kfraser@15815 926
kfraser@11240 927 init_idle_domain();
kfraser@11240 928
kaf24@8459 929 trap_init();
kaf24@8459 930
kaf24@13662 931 rcu_init();
kaf24@13662 932
kaf24@8586 933 timer_init();
kaf24@8459 934
kaf24@8459 935 early_time_init();
kaf24@8459 936
kaf24@8459 937 arch_init_memory();
kaf24@8459 938
kaf24@8459 939 identify_cpu(&boot_cpu_data);
kaf24@8459 940 if ( cpu_has_fxsr )
kaf24@8459 941 set_in_cr4(X86_CR4_OSFXSR);
kaf24@8459 942 if ( cpu_has_xmm )
kaf24@8459 943 set_in_cr4(X86_CR4_OSXMMEXCPT);
keir@18920 944
keir@18920 945 local_irq_enable();
keir@18920 946
kfraser@15747 947 #ifdef CONFIG_X86_64
kfraser@15747 948 vesa_mtrr_init();
kfraser@15747 949 #endif
kaf24@8459 950
kaf24@8459 951 if ( opt_nosmp )
kaf24@8459 952 max_cpus = 0;
kaf24@8459 953
kaf24@8459 954 smp_prepare_cpus(max_cpus);
kaf24@8459 955
keir@18920 956 spin_debug_enable();
keir@18920 957
kaf24@8459 958 /*
kaf24@8459 959 * Initialise higher-level timer functions. We do this fairly late
kaf24@8459 960 * (post-SMP) because the time bases and scale factors need to be updated
kaf24@8459 961 * regularly, and SMP initialisation can cause a long delay with
kaf24@8459 962 * interrupts not yet enabled.
kaf24@8459 963 */
kaf24@8459 964 init_xen_time();
kaf24@8459 965
kaf24@8459 966 initialize_keytable();
kaf24@8459 967
keir@19543 968 console_init_postirq();
kaf24@8459 969
kaf24@8459 970 for_each_present_cpu ( i )
kaf24@8459 971 {
kaf24@8459 972 if ( num_online_cpus() >= max_cpus )
kaf24@8459 973 break;
kaf24@8459 974 if ( !cpu_online(i) )
kaf24@13662 975 {
kaf24@13662 976 rcu_online_cpu(i);
kaf24@8459 977 __cpu_up(i);
kaf24@13662 978 }
kfraser@11971 979
kfraser@11998 980 /* Set up cpu_to_node[]. */
kfraser@11971 981 srat_detect_node(i);
kfraser@11998 982 /* Set up node_to_cpumask based on cpu_to_node[]. */
kfraser@11971 983 numa_add_cpu(i);
kaf24@8459 984 }
kaf24@8459 985
kaf24@8459 986 printk("Brought up %ld CPUs\n", (long)num_online_cpus());
kaf24@8459 987 smp_cpus_done(max_cpus);
kaf24@8459 988
kaf24@9117 989 initialise_gdb(); /* could be moved earlier */
kaf24@9117 990
kaf24@8459 991 do_initcalls();
kaf24@8459 992
kaf24@8594 993 if ( opt_watchdog )
kaf24@8594 994 watchdog_enable();
keir@19259 995
keir@19259 996 if ( !tboot_protect_mem_regions() )
keir@19259 997 panic("Could not protect TXT memory regions\n");
kaf24@8459 998
kaf24@8459 999 /* Create initial domain 0. */
keir@19266 1000 dom0 = domain_create(0, DOMCRF_s3_integrity, DOM0_SSIDREF);
kfraser@10655 1001 if ( (dom0 == NULL) || (alloc_vcpu(dom0, 0, 0) == NULL) )
kaf24@8459 1002 panic("Error creating domain 0\n");
kaf24@8459 1003
kfraser@12210 1004 dom0->is_privileged = 1;
keir@16856 1005 dom0->target = NULL;
kfraser@12210 1006
kaf24@8459 1007 /* Grab the DOM0 command line. */
kaf24@8459 1008 cmdline = (char *)(mod[0].string ? __va(mod[0].string) : NULL);
kfraser@15426 1009 if ( (cmdline != NULL) || (kextra != NULL) )
kaf24@8459 1010 {
kaf24@8459 1011 static char dom0_cmdline[MAX_GUEST_CMDLINE];
kaf24@8459 1012
kfraser@15796 1013 cmdline = cmdline_cook(cmdline);
kfraser@15796 1014 safe_strcpy(dom0_cmdline, cmdline);
kaf24@8459 1015
kfraser@15426 1016 if ( kextra != NULL )
kfraser@15426 1017 /* kextra always includes exactly one leading space. */
kfraser@15426 1018 safe_strcat(dom0_cmdline, kextra);
kfraser@15426 1019
kaf24@8459 1020 /* Append any extra parameters. */
kfraser@13691 1021 if ( skip_ioapic_setup && !strstr(dom0_cmdline, "noapic") )
kfraser@13691 1022 safe_strcat(dom0_cmdline, " noapic");
kaf24@8459 1023 if ( acpi_skip_timer_override &&
kfraser@13691 1024 !strstr(dom0_cmdline, "acpi_skip_timer_override") )
kfraser@13691 1025 safe_strcat(dom0_cmdline, " acpi_skip_timer_override");
keir@16165 1026 if ( (strlen(acpi_param) == 0) && acpi_disabled )
keir@16165 1027 {
keir@16165 1028 printk("ACPI is disabled, notifying Domain 0 (acpi=off)\n");
keir@16165 1029 safe_strcpy(acpi_param, "off");
keir@16165 1030 }
kfraser@13691 1031 if ( (strlen(acpi_param) != 0) && !strstr(dom0_cmdline, "acpi=") )
kaf24@8459 1032 {
kfraser@13691 1033 safe_strcat(dom0_cmdline, " acpi=");
kfraser@13691 1034 safe_strcat(dom0_cmdline, acpi_param);
kaf24@8459 1035 }
kfraser@13691 1036
kfraser@13691 1037 cmdline = dom0_cmdline;
kaf24@8459 1038 }
kaf24@8459 1039
kaf24@8459 1040 if ( (initrdidx > 0) && (initrdidx < mbi->mods_count) )
kaf24@8459 1041 {
keir@19544 1042 _initrd_start = mod[initrdidx].mod_start;
kaf24@8459 1043 _initrd_len = mod[initrdidx].mod_end - mod[initrdidx].mod_start;
kaf24@8459 1044 }
kaf24@8459 1045
keir@18180 1046 if ( xen_cpuidle )
keir@18180 1047 xen_processor_pmbits |= XEN_PROCESSOR_PM_CX;
keir@18180 1048
kaf24@8459 1049 /*
kaf24@8459 1050 * We're going to setup domain0 using the module(s) that we stashed safely
kaf24@8459 1051 * above our heap. The second module, if present, is an initrd ramdisk.
kaf24@8459 1052 */
kaf24@8459 1053 if ( construct_dom0(dom0,
keir@19076 1054 initial_images_base,
keir@19076 1055 initial_images_start,
kaf24@8459 1056 mod[0].mod_end-mod[0].mod_start,
kaf24@8459 1057 _initrd_start,
kaf24@8459 1058 _initrd_len,
kaf24@8459 1059 cmdline) != 0)
kaf24@8459 1060 panic("Could not set up DOM0 guest OS\n");
kaf24@8459 1061
kaf24@8459 1062 /* Scrub RAM that is still free and so may go to an unprivileged domain. */
kaf24@8459 1063 scrub_heap_pages();
kaf24@8459 1064
kaf24@8459 1065 init_trace_bufs();
kaf24@8459 1066
keir@19646 1067 init_tmem();
keir@19646 1068
kaf24@10502 1069 console_endboot();
kaf24@8459 1070
kaf24@8459 1071 /* Hide UART from DOM0 if we're using it */
kaf24@8459 1072 serial_endboot();
kaf24@8459 1073
kaf24@8459 1074 domain_unpause_by_systemcontroller(dom0);
kaf24@8459 1075
keir@15082 1076 reset_stack_and_jump(init_done);
kaf24@8459 1077 }
kaf24@8459 1078
ian@13763 1079 void arch_get_xen_caps(xen_capabilities_info_t *info)
iap10@6721 1080 {
kfraser@14997 1081 /* Interface name is always xen-3.0-* for Xen-3.x. */
kfraser@14997 1082 int major = 3, minor = 0;
keir@13754 1083 char s[32];
keir@13754 1084
ian@13763 1085 (*info)[0] = '\0';
iap10@6721 1086
keir@17618 1087 #if defined(CONFIG_X86_32)
kaf24@6725 1088
keir@13754 1089 snprintf(s, sizeof(s), "xen-%d.%d-x86_32p ", major, minor);
ian@13763 1090 safe_strcat(*info, s);
kaf24@6725 1091 if ( hvm_enabled )
iap10@6721 1092 {
keir@13754 1093 snprintf(s, sizeof(s), "hvm-%d.%d-x86_32 ", major, minor);
ian@13763 1094 safe_strcat(*info, s);
keir@13754 1095 snprintf(s, sizeof(s), "hvm-%d.%d-x86_32p ", major, minor);
ian@13763 1096 safe_strcat(*info, s);
iap10@6721 1097 }
iap10@6721 1098
kaf24@6725 1099 #elif defined(CONFIG_X86_64)
iap10@6721 1100
keir@13754 1101 snprintf(s, sizeof(s), "xen-%d.%d-x86_64 ", major, minor);
ian@13763 1102 safe_strcat(*info, s);
ack@13288 1103 #ifdef CONFIG_COMPAT
keir@13754 1104 snprintf(s, sizeof(s), "xen-%d.%d-x86_32p ", major, minor);
ian@13763 1105 safe_strcat(*info, s);
ack@13288 1106 #endif
kaf24@6725 1107 if ( hvm_enabled )
iap10@6721 1108 {
keir@13754 1109 snprintf(s, sizeof(s), "hvm-%d.%d-x86_32 ", major, minor);
ian@13763 1110 safe_strcat(*info, s);
keir@13754 1111 snprintf(s, sizeof(s), "hvm-%d.%d-x86_32p ", major, minor);
ian@13763 1112 safe_strcat(*info, s);
keir@13754 1113 snprintf(s, sizeof(s), "hvm-%d.%d-x86_64 ", major, minor);
ian@13763 1114 safe_strcat(*info, s);
iap10@6721 1115 }
kaf24@6725 1116
iap10@6721 1117 #endif
iap10@6721 1118 }
iap10@6721 1119
keir@17738 1120 int xen_in_range(paddr_t start, paddr_t end)
keir@17729 1121 {
keir@19283 1122 int i;
keir@19283 1123 static struct {
keir@19283 1124 paddr_t s, e;
keir@19568 1125 } xen_regions[4];
keir@19055 1126
keir@19283 1127 /* initialize first time */
keir@19283 1128 if ( !xen_regions[0].s )
keir@19283 1129 {
keir@19577 1130 extern char __init_begin[], __bss_start[];
keir@19283 1131 extern unsigned long allocator_bitmap_end;
keir@19283 1132
keir@19283 1133 /* S3 resume code (and other real mode trampoline code) */
keir@19283 1134 xen_regions[0].s = bootsym_phys(trampoline_start);
keir@19283 1135 xen_regions[0].e = bootsym_phys(trampoline_end);
keir@19283 1136 /* hypervisor code + data */
keir@19283 1137 xen_regions[1].s =__pa(&_stext);
keir@19283 1138 xen_regions[1].e = __pa(&__init_begin);
keir@19283 1139 /* per-cpu data */
keir@19283 1140 xen_regions[2].s = __pa(&__per_cpu_start);
keir@19577 1141 xen_regions[2].e = xen_regions[2].s +
keir@19577 1142 (((paddr_t)last_cpu(cpu_possible_map) + 1) << PERCPU_SHIFT);
keir@19283 1143 /* bss + boot allocator bitmap */
keir@19283 1144 xen_regions[3].s = __pa(&__bss_start);
keir@19283 1145 xen_regions[3].e = allocator_bitmap_end;
keir@19283 1146 }
keir@19283 1147
keir@19283 1148 for ( i = 0; i < ARRAY_SIZE(xen_regions); i++ )
keir@19283 1149 {
keir@19283 1150 if ( (start < xen_regions[i].e) && (end > xen_regions[i].s) )
keir@19283 1151 return 1;
keir@19283 1152 }
keir@19283 1153
keir@19283 1154 return 0;
keir@17729 1155 }
keir@17729 1156
kaf24@3914 1157 /*
kaf24@3914 1158 * Local variables:
kaf24@3914 1159 * mode: C
kaf24@3914 1160 * c-set-style: "BSD"
kaf24@3914 1161 * c-basic-offset: 4
kaf24@3914 1162 * tab-width: 4
kaf24@3914 1163 * indent-tabs-mode: nil
kaf24@3988 1164 * End:
kaf24@3914 1165 */