xen/arch/x86/domain_build.c @ 16535:c67d024fdd2d (ia64/xen-unstable)

x86_64: Do not leak trampoline mapping into dom0's initial address space.

Author: Keir Fraser <keir.fraser@citrix.com>
Date:   Wed Dec 05 13:49:19 2007 +0000
Signed-off-by: Keir Fraser <keir.fraser@citrix.com>
/******************************************************************************
 * domain_build.c
 *
 * Copyright (c) 2002-2005, K A Fraser
 */

#include <xen/config.h>
#include <xen/init.h>
#include <xen/lib.h>
#include <xen/ctype.h>
#include <xen/sched.h>
#include <xen/smp.h>
#include <xen/delay.h>
#include <xen/event.h>
#include <xen/console.h>
#include <xen/kernel.h>
#include <xen/domain.h>
#include <xen/version.h>
#include <xen/iocap.h>
#include <xen/bitops.h>
#include <xen/compat.h>
#include <asm/regs.h>
#include <asm/system.h>
#include <asm/io.h>
#include <asm/processor.h>
#include <asm/desc.h>
#include <asm/i387.h>
#include <asm/paging.h>
#include <asm/e820.h>

#include <public/version.h>
#include <public/libelf.h>

extern unsigned long initial_images_nrpages(void);
extern void discard_initial_images(void);

static long __initdata dom0_nrpages;
static long __initdata dom0_min_nrpages;
static long __initdata dom0_max_nrpages = LONG_MAX;

/*
 * dom0_mem=[min:<min_amt>,][max:<max_amt>,][<amt>]
 *
 * <min_amt>: The minimum amount of memory which should be allocated for dom0.
 * <max_amt>: The maximum amount of memory which should be allocated for dom0.
 * <amt>:     The precise amount of memory to allocate for dom0.
 *
 * Notes:
 *  1. <amt> is clamped from below by <min_amt> and from above by available
 *     memory and <max_amt>
 *  2. <min_amt> is clamped from above by available memory and <max_amt>
 *  3. <min_amt> is ignored if it is greater than <max_amt>
 *  4. If <amt> is not specified, it is calculated as follows:
 *     "All of memory is allocated to domain 0, minus 1/16th which is reserved
 *      for uses such as DMA buffers (the reservation is clamped to 128MB)."
 *
 * Each value can be specified as positive or negative:
 *  If +ve: The specified amount is an absolute value.
 *  If -ve: The specified amount is subtracted from total available memory.
 */
static long __init parse_amt(const char *s, const char **ps)
{
    long pages = parse_size_and_unit((*s == '-') ? s+1 : s, ps) >> PAGE_SHIFT;
    return (*s == '-') ? -pages : pages;
}
static void __init parse_dom0_mem(const char *s)
{
    do {
        if ( !strncmp(s, "min:", 4) )
            dom0_min_nrpages = parse_amt(s+4, &s);
        else if ( !strncmp(s, "max:", 4) )
            dom0_max_nrpages = parse_amt(s+4, &s);
        else
            dom0_nrpages = parse_amt(s, &s);
        if ( *s != ',' )
            break;
    } while ( *s++ == ',' );
}
custom_param("dom0_mem", parse_dom0_mem);
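
/*
 * Illustrative example (hypothetical values, not from the original source):
 * given the grammar above, a boot line of
 *     dom0_mem=min:128M,max:1G,-256M
 * requests "all memory minus 256MB", which compute_dom0_nr_pages() below
 * then clamps to the range [128MB, 1GB].
 */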
static unsigned int opt_dom0_max_vcpus;
integer_param("dom0_max_vcpus", opt_dom0_max_vcpus);

static unsigned int opt_dom0_shadow;
boolean_param("dom0_shadow", opt_dom0_shadow);

static char opt_dom0_ioports_disable[200] = "";
string_param("dom0_ioports_disable", opt_dom0_ioports_disable);
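
/*
 * Illustrative example (hypothetical values): a boot line of
 *     dom0_ioports_disable=02f8-02ff,04d0
 * would hide the second UART's port range and port 0x4d0 from dom0. Ranges
 * are hexadecimal and inclusive; see process_dom0_ioports_disable() below.
 */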
#if defined(__i386__)
/* No ring-3 access in initial leaf page tables. */
#define L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED)
#define L2_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
#define L3_PROT (_PAGE_PRESENT)
#elif defined(__x86_64__)
/* Allow ring-3 access in long mode as guest cannot use ring 1 ... */
#define BASE_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_USER)
#define L1_PROT (BASE_PROT|_PAGE_GUEST_KERNEL)
/* ... except for compatibility mode guests. */
#define COMPAT_L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED)
#define L2_PROT (BASE_PROT|_PAGE_DIRTY)
#define L3_PROT (BASE_PROT|_PAGE_DIRTY)
#define L4_PROT (BASE_PROT|_PAGE_DIRTY)
#endif

#define round_pgup(_p)    (((_p)+(PAGE_SIZE-1))&PAGE_MASK)
#define round_pgdown(_p)  ((_p)&PAGE_MASK)

static struct page_info * __init alloc_chunk(
    struct domain *d, unsigned long max_pages)
{
    struct page_info *page;
    unsigned int order;
    /*
     * Allocate up to 2MB at a time: It prevents allocating very large chunks
     * from DMA pools before the >4GB pool is fully depleted.
     */
    if ( max_pages > (2UL << (20 - PAGE_SHIFT)) )
        max_pages = 2UL << (20 - PAGE_SHIFT);
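    /*
     * get_order_from_pages() rounds max_pages up to the next power of two,
     * which can exceed the caller's limit. If max_pages is not already a
     * power of two, drop one order so the allocation never overshoots
     * (e.g. 300 pages rounds up to order 9 = 512 pages; order 8 = 256
     * pages stays within bounds).
     */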
    order = get_order_from_pages(max_pages);
    if ( (max_pages & (max_pages-1)) != 0 )
        order--;
    while ( (page = alloc_domheap_pages(d, order, 0)) == NULL )
        if ( order-- == 0 )
            break;
    return page;
}

static unsigned long __init compute_dom0_nr_pages(void)
{
    unsigned long avail = avail_domheap_pages() + initial_images_nrpages();

    /*
     * If domain 0 allocation isn't specified, reserve 1/16th of available
     * memory for things like DMA buffers. This reservation is clamped to
     * a maximum of 128MB.
     */
    if ( dom0_nrpages == 0 )
    {
        dom0_nrpages = avail;
        dom0_nrpages = min(dom0_nrpages / 16, 128L << (20 - PAGE_SHIFT));
        dom0_nrpages = -dom0_nrpages;
    }

    /* Negative memory specification means "all memory - specified amount". */
    if ( dom0_nrpages     < 0 ) dom0_nrpages     += avail;
    if ( dom0_min_nrpages < 0 ) dom0_min_nrpages += avail;
    if ( dom0_max_nrpages < 0 ) dom0_max_nrpages += avail;

    /* Clamp dom0 memory according to min/max limits and available memory. */
    dom0_nrpages = max(dom0_nrpages, dom0_min_nrpages);
    dom0_nrpages = min(dom0_nrpages, dom0_max_nrpages);
    dom0_nrpages = min(dom0_nrpages, (long)avail);

    return dom0_nrpages;
}
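
/*
 * Worked example (hypothetical machine): with 4GB available (avail =
 * 1048576 pages of 4kB) and no dom0_mem= option, the reservation is
 * min(1048576/16 pages, 128MB) = 32768 pages = 128MB, so dom0 receives
 * roughly 4GB - 128MB.
 */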
static void __init process_dom0_ioports_disable(void)
{
    unsigned long io_from, io_to;
    char *t, *s = opt_dom0_ioports_disable;
    const char *u;

    if ( *s == '\0' )
        return;

    while ( (t = strsep(&s, ",")) != NULL )
    {
        io_from = simple_strtoul(t, &u, 16);
        if ( u == t )
        {
        parse_error:
            printk("Invalid ioport range <%s> "
                   "in dom0_ioports_disable, skipping\n", t);
            continue;
        }

        if ( *u == '\0' )
            io_to = io_from;
        else if ( *u == '-' )
            io_to = simple_strtoul(u + 1, &u, 16);
        else
            goto parse_error;

        if ( (*u != '\0') || (io_to < io_from) || (io_to >= 65536) )
            goto parse_error;

        printk("Disabling dom0 access to ioport range %04lx-%04lx\n",
               io_from, io_to);

        if ( ioports_deny_access(dom0, io_from, io_to) != 0 )
            BUG();
    }
}

int __init construct_dom0(
    struct domain *d,
    unsigned long _image_start, unsigned long image_len,
    unsigned long _initrd_start, unsigned long initrd_len,
    char *cmdline)
{
    int i, rc, compatible, compat32, order, machine;
    struct cpu_user_regs *regs;
    unsigned long pfn, mfn;
    unsigned long nr_pages;
    unsigned long nr_pt_pages;
    unsigned long alloc_spfn;
    unsigned long alloc_epfn;
    unsigned long count;
    struct page_info *page = NULL;
    start_info_t *si;
    struct vcpu *v = d->vcpu[0];
    unsigned long long value;
#if defined(__i386__)
    char *image_start  = (char *)_image_start;  /* use lowmem mappings */
    char *initrd_start = (char *)_initrd_start; /* use lowmem mappings */
#elif defined(__x86_64__)
    char *image_start  = __va(_image_start);
    char *initrd_start = __va(_initrd_start);
#endif
#if CONFIG_PAGING_LEVELS >= 4
    l4_pgentry_t *l4tab = NULL, *l4start = NULL;
#endif
#if CONFIG_PAGING_LEVELS >= 3
    l3_pgentry_t *l3tab = NULL, *l3start = NULL;
#endif
    l2_pgentry_t *l2tab = NULL, *l2start = NULL;
    l1_pgentry_t *l1tab = NULL, *l1start = NULL;

    /*
     * This fully describes the memory layout of the initial domain. All
     * *_start addresses are page-aligned, except v_start (and v_end) which
     * are superpage-aligned.
     */
    struct elf_binary elf;
    struct elf_dom_parms parms;
    unsigned long vkern_start;
    unsigned long vkern_end;
    unsigned long vinitrd_start;
    unsigned long vinitrd_end;
    unsigned long vphysmap_start;
    unsigned long vphysmap_end;
    unsigned long vstartinfo_start;
    unsigned long vstartinfo_end;
    unsigned long vstack_start;
    unsigned long vstack_end;
    unsigned long vpt_start;
    unsigned long vpt_end;
    unsigned long v_start;
    unsigned long v_end;
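    /*
     * Layout summary (low to high virtual addresses, per the assignments
     * further down): kernel image, initial ramdisk, phys->machine map,
     * start info, page tables, boot stack, then padding up to a 4MB
     * boundary.
     */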
    /* Machine address of next candidate page-table page. */
    unsigned long mpt_alloc;

    /* Features supported. */
    uint32_t dom0_features_supported[XENFEAT_NR_SUBMAPS] = { 0 };
    uint32_t dom0_features_required[XENFEAT_NR_SUBMAPS] = { 0 };

    /* Sanity! */
    BUG_ON(d->domain_id != 0);
    BUG_ON(d->vcpu[0] == NULL);
    BUG_ON(v->is_initialised);

    printk("*** LOADING DOMAIN 0 ***\n");

    d->max_pages = ~0U;

    nr_pages = compute_dom0_nr_pages();

    if ( (rc = elf_init(&elf, image_start, image_len)) != 0 )
        return rc;
#ifdef VERBOSE
    elf_set_verbose(&elf);
#endif
    elf_parse_binary(&elf);
    if ( (rc = elf_xen_parse(&elf, &parms)) != 0 )
        return rc;

    /* compatibility check */
    compatible = 0;
    compat32 = 0;
    machine = elf_uval(&elf, elf.ehdr, e_machine);
    switch (CONFIG_PAGING_LEVELS) {
    case 2: /* x86_32 */
        if (parms.pae == PAEKERN_bimodal)
            parms.pae = PAEKERN_no;
        printk(" Xen  kernel: 32-bit, lsb\n");
        if (elf_32bit(&elf) && !parms.pae && machine == EM_386)
            compatible = 1;
        break;
    case 3: /* x86_32p */
        if (parms.pae == PAEKERN_bimodal)
            parms.pae = PAEKERN_extended_cr3;
        printk(" Xen  kernel: 32-bit, PAE, lsb\n");
        if (elf_32bit(&elf) && parms.pae && machine == EM_386)
            compatible = 1;
        break;
    case 4: /* x86_64 */
#ifndef CONFIG_COMPAT
        printk(" Xen  kernel: 64-bit, lsb\n");
#else
        printk(" Xen  kernel: 64-bit, lsb, compat32\n");
        if (elf_32bit(&elf) && parms.pae == PAEKERN_bimodal)
            parms.pae = PAEKERN_extended_cr3;
        if (elf_32bit(&elf) && parms.pae && machine == EM_386)
        {
            compat32 = 1;
            compatible = 1;
        }
#endif
        if (elf_64bit(&elf) && machine == EM_X86_64)
            compatible = 1;
        break;
    }
    printk(" Dom0 kernel: %s%s, %s, paddr 0x%" PRIx64 " -> 0x%" PRIx64 "\n",
           elf_64bit(&elf) ? "64-bit" : "32-bit",
           parms.pae       ? ", PAE"  : "",
           elf_msb(&elf)   ? "msb"    : "lsb",
           elf.pstart, elf.pend);
    if ( elf.bsd_symtab_pstart )
        printk(" Dom0 symbol map 0x%" PRIx64 " -> 0x%" PRIx64 "\n",
               elf.bsd_symtab_pstart, elf.bsd_symtab_pend);

    if ( !compatible )
    {
        printk("Mismatch between Xen and DOM0 kernel\n");
        return -EINVAL;
    }

#ifdef CONFIG_COMPAT
    if ( compat32 )
    {
        l1_pgentry_t gdt_l1e;

        d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 1;
        v->vcpu_info = (void *)&d->shared_info->compat.vcpu_info[0];

        if ( nr_pages != (unsigned int)nr_pages )
            nr_pages = UINT_MAX;

        /*
         * Map compatibility Xen segments into every VCPU's GDT. See
         * arch_domain_create() for further comments.
         */
        gdt_l1e = l1e_from_page(virt_to_page(compat_gdt_table),
                                PAGE_HYPERVISOR);
        for ( i = 0; i < MAX_VIRT_CPUS; i++ )
            d->arch.mm_perdomain_pt[((i << GDT_LDT_VCPU_SHIFT) +
                                     FIRST_RESERVED_GDT_PAGE)] = gdt_l1e;
        flush_tlb_one_local(GDT_LDT_VIRT_START + FIRST_RESERVED_GDT_BYTE);
    }
#endif
    if ( parms.pae == PAEKERN_extended_cr3 )
        set_bit(VMASST_TYPE_pae_extended_cr3, &d->vm_assist);

    if ( UNSET_ADDR != parms.virt_hv_start_low && elf_32bit(&elf) )
    {
#if CONFIG_PAGING_LEVELS < 4
        unsigned long mask = (1UL << L2_PAGETABLE_SHIFT) - 1;
#else
        unsigned long mask = is_pv_32bit_domain(d)
                             ? (1UL << L2_PAGETABLE_SHIFT) - 1
                             : (1UL << L4_PAGETABLE_SHIFT) - 1;
#endif

        value = (parms.virt_hv_start_low + mask) & ~mask;
#ifdef CONFIG_COMPAT
        HYPERVISOR_COMPAT_VIRT_START(d) =
            max_t(unsigned int, m2p_compat_vstart, value);
        d->arch.physaddr_bitsize = !is_pv_32on64_domain(d) ? 64 :
            fls((1UL << 32) - HYPERVISOR_COMPAT_VIRT_START(d)) - 1
            + (PAGE_SHIFT - 2);
        if ( value > (!is_pv_32on64_domain(d) ?
                      HYPERVISOR_VIRT_START :
                      __HYPERVISOR_COMPAT_VIRT_START) )
#else
        if ( value > HYPERVISOR_VIRT_START )
#endif
            panic("Domain 0 expects too high a hypervisor start address.\n");
    }

    /*
     * Why do we need this? The number of page-table frames depends on the
     * size of the bootstrap address space. But the size of the address space
     * depends on the number of page-table frames (since each one is mapped
     * read-only). We have a pair of simultaneous equations in two unknowns,
     * which we solve by exhaustive search.
     */
    v_start          = parms.virt_base;
    vkern_start      = parms.virt_kstart;
    vkern_end        = parms.virt_kend;
    vinitrd_start    = round_pgup(vkern_end);
    vinitrd_end      = vinitrd_start + initrd_len;
    vphysmap_start   = round_pgup(vinitrd_end);
    vphysmap_end     = vphysmap_start + (nr_pages * (!is_pv_32on64_domain(d) ?
                                                     sizeof(unsigned long) :
                                                     sizeof(unsigned int)));
    vstartinfo_start = round_pgup(vphysmap_end);
    vstartinfo_end   = (vstartinfo_start +
                        sizeof(struct start_info) +
                        sizeof(struct dom0_vga_console_info));
    vpt_start        = round_pgup(vstartinfo_end);
    for ( nr_pt_pages = 2; ; nr_pt_pages++ )
    {
        vpt_end      = vpt_start + (nr_pt_pages * PAGE_SIZE);
        vstack_start = vpt_end;
        vstack_end   = vstack_start + PAGE_SIZE;
        v_end        = (vstack_end + (1UL<<22)-1) & ~((1UL<<22)-1);
        if ( (v_end - vstack_end) < (512UL << 10) )
            v_end += 1UL << 22; /* Add extra 4MB to get >= 512kB padding. */
#if defined(__i386__) && !defined(CONFIG_X86_PAE)
        if ( (((v_end - v_start + ((1UL<<L2_PAGETABLE_SHIFT)-1)) >>
               L2_PAGETABLE_SHIFT) + 1) <= nr_pt_pages )
            break;
#elif defined(__i386__) && defined(CONFIG_X86_PAE)
        /* 5 pages: 1x 3rd + 4x 2nd level */
        if ( (((v_end - v_start + ((1UL<<L2_PAGETABLE_SHIFT)-1)) >>
               L2_PAGETABLE_SHIFT) + 5) <= nr_pt_pages )
            break;
#elif defined(__x86_64__)
#define NR(_l,_h,_s) \
    (((((_h) + ((1UL<<(_s))-1)) & ~((1UL<<(_s))-1)) - \
       ((_l) & ~((1UL<<(_s))-1))) >> (_s))
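        /*
         * NR(l, h, s) counts how many (1UL << s)-byte-aligned slots the
         * range [l, h) touches, i.e. how many page tables are needed at
         * the paging level whose shift is s.
         */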
        if ( (1 + /* # L4 */
              NR(v_start, v_end, L4_PAGETABLE_SHIFT) + /* # L3 */
              (!is_pv_32on64_domain(d) ?
               NR(v_start, v_end, L3_PAGETABLE_SHIFT) : /* # L2 */
               4) + /* # compat L2 */
              NR(v_start, v_end, L2_PAGETABLE_SHIFT))  /* # L1 */
             <= nr_pt_pages )
            break;
#endif
    }

    order = get_order_from_bytes(v_end - v_start);
    if ( (1UL << order) > nr_pages )
        panic("Domain 0 allocation is too small for kernel image.\n");

#ifdef __i386__
    /* Ensure that our low-memory 1:1 mapping covers the allocation. */
    page = alloc_domheap_pages(d, order, MEMF_bits(30));
#else
    page = alloc_domheap_pages(d, order, 0);
#endif
    if ( page == NULL )
        panic("Not enough RAM for domain 0 allocation.\n");
    alloc_spfn = page_to_mfn(page);
    alloc_epfn = alloc_spfn + d->tot_pages;

    printk("PHYSICAL MEMORY ARRANGEMENT:\n"
           " Dom0 alloc.:   %"PRIpaddr"->%"PRIpaddr,
           pfn_to_paddr(alloc_spfn), pfn_to_paddr(alloc_epfn));
    if ( d->tot_pages < nr_pages )
        printk(" (%lu pages to be allocated)",
               nr_pages - d->tot_pages);
    printk("\nVIRTUAL MEMORY ARRANGEMENT:\n"
           " Loaded kernel: %p->%p\n"
           " Init. ramdisk: %p->%p\n"
           " Phys-Mach map: %p->%p\n"
           " Start info:    %p->%p\n"
           " Page tables:   %p->%p\n"
           " Boot stack:    %p->%p\n"
           " TOTAL:         %p->%p\n",
           _p(vkern_start), _p(vkern_end),
           _p(vinitrd_start), _p(vinitrd_end),
           _p(vphysmap_start), _p(vphysmap_end),
           _p(vstartinfo_start), _p(vstartinfo_end),
           _p(vpt_start), _p(vpt_end),
           _p(vstack_start), _p(vstack_end),
           _p(v_start), _p(v_end));
    printk(" ENTRY ADDRESS: %p\n", _p(parms.virt_entry));

    if ( ((v_end - v_start)>>PAGE_SHIFT) > nr_pages )
    {
        printk("Initial guest OS requires too much space\n"
               "(%luMB is greater than %luMB limit)\n",
               (v_end-v_start)>>20, nr_pages>>(20-PAGE_SHIFT));
        return -ENOMEM;
    }

    mpt_alloc = (vpt_start - v_start) +
        (unsigned long)pfn_to_paddr(alloc_spfn);

#if defined(__i386__)
    /*
     * Protect the lowest 1GB of memory. We use a temporary mapping there
     * from which we copy the kernel and ramdisk images.
     */
    if ( v_start < (1UL<<30) )
    {
        printk("Initial loading isn't allowed to lowest 1GB of memory.\n");
        return -EINVAL;
    }

    /* WARNING: The new domain must have its 'processor' field filled in! */
#if CONFIG_PAGING_LEVELS == 3
    l3start = l3tab = (l3_pgentry_t *)mpt_alloc; mpt_alloc += PAGE_SIZE;
    l2start = l2tab = (l2_pgentry_t *)mpt_alloc; mpt_alloc += 4*PAGE_SIZE;
    memcpy(l2tab, idle_pg_table_l2, 4*PAGE_SIZE);
    for (i = 0; i < 4; i++) {
        l3tab[i] = l3e_from_paddr((u32)l2tab + i*PAGE_SIZE, L3_PROT);
        l2tab[(LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT)+i] =
            l2e_from_paddr((u32)l2tab + i*PAGE_SIZE, __PAGE_HYPERVISOR);
    }
    v->arch.guest_table = pagetable_from_paddr((unsigned long)l3start);
#else
    l2start = l2tab = (l2_pgentry_t *)mpt_alloc; mpt_alloc += PAGE_SIZE;
    copy_page(l2tab, idle_pg_table);
    l2tab[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] =
        l2e_from_paddr((unsigned long)l2start, __PAGE_HYPERVISOR);
    v->arch.guest_table = pagetable_from_paddr((unsigned long)l2start);
#endif

    for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
        l2tab[l2_linear_offset(PERDOMAIN_VIRT_START) + i] =
            l2e_from_page(virt_to_page(d->arch.mm_perdomain_pt) + i,
                          __PAGE_HYPERVISOR);

    l2tab += l2_linear_offset(v_start);
    mfn = alloc_spfn;
    for ( count = 0; count < ((v_end-v_start)>>PAGE_SHIFT); count++ )
    {
        if ( !((unsigned long)l1tab & (PAGE_SIZE-1)) )
        {
            l1start = l1tab = (l1_pgentry_t *)mpt_alloc;
            mpt_alloc += PAGE_SIZE;
            *l2tab = l2e_from_paddr((unsigned long)l1start, L2_PROT);
            l2tab++;
            clear_page(l1tab);
            if ( count == 0 )
                l1tab += l1_table_offset(v_start);
        }
        *l1tab = l1e_from_pfn(mfn, L1_PROT);
        l1tab++;

        page = mfn_to_page(mfn);
        if ( !get_page_and_type(page, d, PGT_writable_page) )
            BUG();

        mfn++;
    }

    /* Pages that are part of page tables must be read only. */
    l2tab = l2start + l2_linear_offset(vpt_start);
    l1start = l1tab = (l1_pgentry_t *)(u32)l2e_get_paddr(*l2tab);
    l1tab += l1_table_offset(vpt_start);
    for ( count = 0; count < nr_pt_pages; count++ )
    {
        page = mfn_to_page(l1e_get_pfn(*l1tab));
        if ( !opt_dom0_shadow )
            l1e_remove_flags(*l1tab, _PAGE_RW);
        else
            if ( !get_page_type(page, PGT_writable_page) )
                BUG();

#if CONFIG_PAGING_LEVELS == 3
        switch (count) {
        case 0:
            page->u.inuse.type_info &= ~PGT_type_mask;
            page->u.inuse.type_info |= PGT_l3_page_table;
            get_page(page, d); /* an extra ref because of readable mapping */

            /* Get another ref to L3 page so that it can be pinned. */
            if ( !get_page_and_type(page, d, PGT_l3_page_table) )
                BUG();
            set_bit(_PGT_pinned, &page->u.inuse.type_info);
            break;
        case 1 ... 4:
            page->u.inuse.type_info &= ~PGT_type_mask;
            page->u.inuse.type_info |= PGT_l2_page_table;
            if ( count == 4 )
                page->u.inuse.type_info |= PGT_pae_xen_l2;
            get_page(page, d); /* an extra ref because of readable mapping */
            break;
        default:
            page->u.inuse.type_info &= ~PGT_type_mask;
            page->u.inuse.type_info |= PGT_l1_page_table;
            get_page(page, d); /* an extra ref because of readable mapping */
            break;
        }
#else
        if ( count == 0 )
        {
            page->u.inuse.type_info &= ~PGT_type_mask;
            page->u.inuse.type_info |= PGT_l2_page_table;

            /*
             * No longer writable: decrement the type_count.
             * Installed as CR3: increment both the ref_count and type_count.
             * Net: just increment the ref_count.
             */
            get_page(page, d); /* an extra ref because of readable mapping */

            /* Get another ref to L2 page so that it can be pinned. */
            if ( !get_page_and_type(page, d, PGT_l2_page_table) )
                BUG();
            set_bit(_PGT_pinned, &page->u.inuse.type_info);
        }
        else
        {
            page->u.inuse.type_info &= ~PGT_type_mask;
            page->u.inuse.type_info |= PGT_l1_page_table;

            /*
             * No longer writable: decrement the type_count.
             * This is an L1 page, installed in a validated L2 page:
             * increment both the ref_count and type_count.
             * Net: just increment the ref_count.
             */
            get_page(page, d); /* an extra ref because of readable mapping */
        }
#endif
        if ( !((unsigned long)++l1tab & (PAGE_SIZE - 1)) )
            l1start = l1tab = (l1_pgentry_t *)(u32)l2e_get_paddr(*++l2tab);
    }

#elif defined(__x86_64__)

    /* Overlap with Xen protected area? */
    if ( !is_pv_32on64_domain(d) ?
         ((v_start < HYPERVISOR_VIRT_END) &&
          (v_end > HYPERVISOR_VIRT_START)) :
         (v_end > HYPERVISOR_COMPAT_VIRT_START(d)) )
    {
        printk("DOM0 image overlaps with Xen private area.\n");
        return -EINVAL;
    }

    if ( is_pv_32on64_domain(d) )
    {
        v->arch.guest_context.failsafe_callback_cs = FLAT_COMPAT_KERNEL_CS;
        v->arch.guest_context.event_callback_cs    = FLAT_COMPAT_KERNEL_CS;
    }

    /* WARNING: The new domain must have its 'processor' field filled in! */
    if ( !is_pv_32on64_domain(d) )
    {
        maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l4_page_table;
        l4start = l4tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
    }
    else
    {
        page = alloc_domheap_page(NULL);
        if ( !page )
            panic("Not enough RAM for domain 0 PML4.\n");
        l4start = l4tab = page_to_virt(page);
    }
    copy_page(l4tab, idle_pg_table);
    l4tab[0] = l4e_empty(); /* zap trampoline mapping */
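    /*
     * The line above is the fix described in the changeset header: the idle
     * page table just copied carries a low 1:1 trampoline mapping in slot 0,
     * and clearing that slot keeps the mapping from leaking into dom0's
     * initial address space.
     */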
    l4tab[l4_table_offset(LINEAR_PT_VIRT_START)] =
        l4e_from_paddr(__pa(l4start), __PAGE_HYPERVISOR);
    l4tab[l4_table_offset(PERDOMAIN_VIRT_START)] =
        l4e_from_paddr(__pa(d->arch.mm_perdomain_l3), __PAGE_HYPERVISOR);
    v->arch.guest_table = pagetable_from_paddr(__pa(l4start));
    if ( is_pv_32on64_domain(d) )
    {
        v->arch.guest_table_user = v->arch.guest_table;
        if ( setup_arg_xlat_area(v, l4start) < 0 )
            panic("Not enough RAM for domain 0 hypercall argument translation.\n");
    }
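
    /*
     * Build the bootstrap mappings page by page, allocating fresh L1/L2/L3
     * tables on demand: whenever a table pointer crosses a page boundary,
     * the previous table is full (or was never allocated), so a new page is
     * taken from mpt_alloc and hooked into the level above.
     */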
    l4tab += l4_table_offset(v_start);
    mfn = alloc_spfn;
    for ( count = 0; count < ((v_end-v_start)>>PAGE_SHIFT); count++ )
    {
        if ( !((unsigned long)l1tab & (PAGE_SIZE-1)) )
        {
            maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l1_page_table;
            l1start = l1tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
            clear_page(l1tab);
            if ( count == 0 )
                l1tab += l1_table_offset(v_start);
            if ( !((unsigned long)l2tab & (PAGE_SIZE-1)) )
            {
                maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l2_page_table;
                l2start = l2tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
                clear_page(l2tab);
                if ( count == 0 )
                    l2tab += l2_table_offset(v_start);
                if ( !((unsigned long)l3tab & (PAGE_SIZE-1)) )
                {
                    maddr_to_page(mpt_alloc)->u.inuse.type_info =
                        PGT_l3_page_table;
                    l3start = l3tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
                    clear_page(l3tab);
                    if ( count == 0 )
                        l3tab += l3_table_offset(v_start);
                    *l4tab = l4e_from_paddr(__pa(l3start), L4_PROT);
                    l4tab++;
                }
                *l3tab = l3e_from_paddr(__pa(l2start), L3_PROT);
                l3tab++;
            }
            *l2tab = l2e_from_paddr(__pa(l1start), L2_PROT);
            l2tab++;
        }
        *l1tab = l1e_from_pfn(mfn, (!is_pv_32on64_domain(d) ?
                                    L1_PROT : COMPAT_L1_PROT));
        l1tab++;

        page = mfn_to_page(mfn);
        if ( (page->u.inuse.type_info == 0) &&
             !get_page_and_type(page, d, PGT_writable_page) )
            BUG();

        mfn++;
    }

#ifdef CONFIG_COMPAT
    if ( is_pv_32on64_domain(d) )
    {
        /* Ensure the first four L3 entries are all populated. */
        for ( i = 0, l3tab = l3start; i < 4; ++i, ++l3tab )
        {
            if ( !l3e_get_intpte(*l3tab) )
            {
                maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l2_page_table;
                l2tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
                clear_page(l2tab);
                *l3tab = l3e_from_paddr(__pa(l2tab), L3_PROT);
            }
            if ( i == 3 )
                l3e_get_page(*l3tab)->u.inuse.type_info |= PGT_pae_xen_l2;
        }
        /* Install read-only guest visible MPT mapping. */
        l2tab = l3e_to_l2e(l3start[3]);
        memcpy(&l2tab[COMPAT_L2_PAGETABLE_FIRST_XEN_SLOT(d)],
               &compat_idle_pg_table_l2[l2_table_offset(HIRO_COMPAT_MPT_VIRT_START)],
               COMPAT_L2_PAGETABLE_XEN_SLOTS(d) * sizeof(*l2tab));
    }
#endif

    /* Pages that are part of page tables must be read only. */
    l4tab = l4start + l4_table_offset(vpt_start);
    l3start = l3tab = l4e_to_l3e(*l4tab);
    l3tab += l3_table_offset(vpt_start);
    l2start = l2tab = l3e_to_l2e(*l3tab);
    l2tab += l2_table_offset(vpt_start);
    l1start = l1tab = l2e_to_l1e(*l2tab);
    l1tab += l1_table_offset(vpt_start);
    for ( count = 0; count < nr_pt_pages; count++ )
    {
        l1e_remove_flags(*l1tab, _PAGE_RW);
        page = mfn_to_page(l1e_get_pfn(*l1tab));

        /* Read-only mapping + PGC_allocated + page-table page. */
        page->count_info         = PGC_allocated | 3;
        page->u.inuse.type_info |= PGT_validated | 1;

        /* Top-level p.t. is pinned. */
        if ( (page->u.inuse.type_info & PGT_type_mask) ==
             (!is_pv_32on64_domain(d) ?
              PGT_l4_page_table : PGT_l3_page_table) )
        {
            page->count_info        += 1;
            page->u.inuse.type_info += 1 | PGT_pinned;
        }

        /* Iterate. */
        if ( !((unsigned long)++l1tab & (PAGE_SIZE - 1)) )
        {
            if ( !((unsigned long)++l2tab & (PAGE_SIZE - 1)) )
            {
                if ( !((unsigned long)++l3tab & (PAGE_SIZE - 1)) )
                    l3start = l3tab = l4e_to_l3e(*++l4tab);
                l2start = l2tab = l3e_to_l2e(*l3tab);
            }
            l1start = l1tab = l2e_to_l1e(*l2tab);
        }
    }

#endif /* __x86_64__ */

    /* Mask all upcalls... */
    for ( i = 0; i < MAX_VIRT_CPUS; i++ )
        shared_info(d, vcpu_info[i].evtchn_upcall_mask) = 1;

    if ( opt_dom0_max_vcpus == 0 )
        opt_dom0_max_vcpus = num_online_cpus();
    if ( opt_dom0_max_vcpus > num_online_cpus() )
        opt_dom0_max_vcpus = num_online_cpus();
    if ( opt_dom0_max_vcpus > MAX_VIRT_CPUS )
        opt_dom0_max_vcpus = MAX_VIRT_CPUS;
    if ( opt_dom0_max_vcpus > BITS_PER_GUEST_LONG(d) )
        opt_dom0_max_vcpus = BITS_PER_GUEST_LONG(d);
    printk("Dom0 has maximum %u VCPUs\n", opt_dom0_max_vcpus);

    for ( i = 1; i < opt_dom0_max_vcpus; i++ )
        (void)alloc_vcpu(d, i, i);

    /* Set up CR3 value for write_ptbase */
    if ( paging_mode_enabled(v->domain) )
        paging_update_paging_modes(v);
    else
        update_cr3(v);

    /* Install the new page tables. */
    local_irq_disable();
    write_ptbase(v);

    /* Copy the OS image and free temporary buffer. */
    elf.dest = (void*)vkern_start;
    elf_load_binary(&elf);

    if ( UNSET_ADDR != parms.virt_hypercall )
    {
        if ( (parms.virt_hypercall < v_start) ||
             (parms.virt_hypercall >= v_end) )
        {
            write_ptbase(current);
            local_irq_enable();
            printk("Invalid HYPERCALL_PAGE field in ELF notes.\n");
            return -1;
        }
        hypercall_page_initialise(d, (void *)(unsigned long)parms.virt_hypercall);
    }

    /* Copy the initial ramdisk. */
    if ( initrd_len != 0 )
        memcpy((void *)vinitrd_start, initrd_start, initrd_len);

    /* Free temporary buffers. */
    discard_initial_images();

    /* Set up start info area. */
    si = (start_info_t *)vstartinfo_start;
    clear_page(si);
    si->nr_pages = nr_pages;

    si->shared_info = virt_to_maddr(d->shared_info);

    si->flags        = SIF_PRIVILEGED | SIF_INITDOMAIN;
    si->pt_base      = vpt_start + 2 * PAGE_SIZE * !!is_pv_32on64_domain(d);
    si->nr_pt_frames = nr_pt_pages;
    si->mfn_list     = vphysmap_start;
    snprintf(si->magic, sizeof(si->magic), "xen-3.0-x86_%d%s",
             elf_64bit(&elf) ? 64 : 32, parms.pae ? "p" : "");

    /* Write the phys->machine and machine->phys table entries. */
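    /*
     * Debug builds deliberately hand out the tail of the allocation in
     * reverse order (see REVERSE_START below), so a guest that wrongly
     * assumes a linear pfn->mfn layout beyond its initial image fails
     * quickly rather than subtly.
     */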
    for ( pfn = 0; pfn < d->tot_pages; pfn++ )
    {
        mfn = pfn + alloc_spfn;
#ifndef NDEBUG
#define REVERSE_START ((v_end - v_start) >> PAGE_SHIFT)
        if ( pfn > REVERSE_START )
            mfn = alloc_epfn - (pfn - REVERSE_START);
#endif
        if ( !is_pv_32on64_domain(d) )
            ((unsigned long *)vphysmap_start)[pfn] = mfn;
        else
            ((unsigned int *)vphysmap_start)[pfn] = mfn;
        set_gpfn_from_mfn(mfn, pfn);
    }
    while ( pfn < nr_pages )
    {
        if ( (page = alloc_chunk(d, nr_pages - d->tot_pages)) == NULL )
            panic("Not enough RAM for DOM0 reservation.\n");
        while ( pfn < d->tot_pages )
        {
            mfn = page_to_mfn(page);
#ifndef NDEBUG
#define pfn (nr_pages - 1 - (pfn - (alloc_epfn - alloc_spfn)))
#endif
            if ( !is_pv_32on64_domain(d) )
                ((unsigned long *)vphysmap_start)[pfn] = mfn;
            else
                ((unsigned int *)vphysmap_start)[pfn] = mfn;
            set_gpfn_from_mfn(mfn, pfn);
#undef pfn
            page++; pfn++;
        }
    }

    if ( initrd_len != 0 )
    {
        si->mod_start = vinitrd_start;
        si->mod_len   = initrd_len;
        printk("Initrd len 0x%lx, start at 0x%lx\n",
               si->mod_len, si->mod_start);
    }

    memset(si->cmd_line, 0, sizeof(si->cmd_line));
    if ( cmdline != NULL )
        strlcpy((char *)si->cmd_line, cmdline, sizeof(si->cmd_line));

    if ( fill_console_start_info((void *)(si + 1)) )
    {
        si->console.dom0.info_off  = sizeof(struct start_info);
        si->console.dom0.info_size = sizeof(struct dom0_vga_console_info);
    }

#ifdef CONFIG_COMPAT
    if ( is_pv_32on64_domain(d) )
        xlat_start_info(si, XLAT_start_info_console_dom0);
#endif

    /* Reinstate the caller's page tables. */
    write_ptbase(current);
    local_irq_enable();

#if defined(__i386__)
    /* Destroy low mappings - they were only for our convenience. */
    zap_low_mappings(l2start);
#endif

    update_domain_wallclock_time(d);

    v->is_initialised = 1;
    clear_bit(_VPF_down, &v->pause_flags);

    /*
     * Initial register values:
     *  DS,ES,FS,GS = FLAT_KERNEL_DS
     *       CS:EIP = FLAT_KERNEL_CS:start_pc
     *       SS:ESP = FLAT_KERNEL_SS:start_stack
     *          ESI = start_info
     *  [EAX,EBX,ECX,EDX,EDI,EBP are zero]
     */
    regs = &v->arch.guest_context.user_regs;
    regs->ds = regs->es = regs->fs = regs->gs =
        !is_pv_32on64_domain(d) ? FLAT_KERNEL_DS : FLAT_COMPAT_KERNEL_DS;
    regs->ss = (!is_pv_32on64_domain(d) ?
                FLAT_KERNEL_SS : FLAT_COMPAT_KERNEL_SS);
    regs->cs = (!is_pv_32on64_domain(d) ?
                FLAT_KERNEL_CS : FLAT_COMPAT_KERNEL_CS);
    regs->eip = parms.virt_entry;
    regs->esp = vstack_end;
    regs->esi = vstartinfo_start;
    regs->eflags = X86_EFLAGS_IF;

    if ( opt_dom0_shadow )
        if ( paging_enable(d, PG_SH_enable) == 0 )
            paging_update_paging_modes(v);

    if ( supervisor_mode_kernel )
    {
        v->arch.guest_context.kernel_ss &= ~3;
        v->arch.guest_context.user_regs.ss &= ~3;
        v->arch.guest_context.user_regs.es &= ~3;
        v->arch.guest_context.user_regs.ds &= ~3;
        v->arch.guest_context.user_regs.fs &= ~3;
        v->arch.guest_context.user_regs.gs &= ~3;
        printk("Dom0 runs in ring 0 (supervisor mode)\n");
        if ( !test_bit(XENFEAT_supervisor_mode_kernel,
                       dom0_features_supported) )
            panic("Dom0 does not support supervisor-mode execution\n");
    }
    else
    {
        if ( test_bit(XENFEAT_supervisor_mode_kernel, dom0_features_required) )
            panic("Dom0 requires supervisor-mode execution\n");
    }

    rc = 0;

    /* DOM0 is permitted full I/O capabilities. */
    rc |= ioports_permit_access(dom0, 0, 0xFFFF);
    rc |= iomem_permit_access(dom0, 0UL, ~0UL);
    rc |= irqs_permit_access(dom0, 0, NR_IRQS-1);

    /*
     * Modify I/O port access permissions.
     */
    /* Master Interrupt Controller (PIC). */
    rc |= ioports_deny_access(dom0, 0x20, 0x21);
    /* Slave Interrupt Controller (PIC). */
    rc |= ioports_deny_access(dom0, 0xA0, 0xA1);
    /* Interval Timer (PIT). */
    rc |= ioports_deny_access(dom0, 0x40, 0x43);
    /* PIT Channel 2 / PC Speaker Control. */
    rc |= ioports_deny_access(dom0, 0x61, 0x61);
    /* Command-line I/O ranges. */
    process_dom0_ioports_disable();

    /*
     * Modify I/O memory access permissions.
     */
    /* Local APIC. */
    if ( mp_lapic_addr != 0 )
    {
        mfn = paddr_to_pfn(mp_lapic_addr);
        rc |= iomem_deny_access(dom0, mfn, mfn);
    }
    /* I/O APICs. */
    for ( i = 0; i < nr_ioapics; i++ )
    {
        mfn = paddr_to_pfn(mp_ioapics[i].mpc_apicaddr);
        if ( smp_found_config )
            rc |= iomem_deny_access(dom0, mfn, mfn);
    }

    /* Remove access to E820_UNUSABLE I/O regions above 1MB. */
    for ( i = 0; i < e820.nr_map; i++ )
    {
        unsigned long sfn, efn;
        sfn = max_t(unsigned long, paddr_to_pfn(e820.map[i].addr), 0x100ul);
        efn = paddr_to_pfn(e820.map[i].addr + e820.map[i].size - 1);
        if ( (e820.map[i].type == E820_UNUSABLE) &&
             (e820.map[i].size != 0) &&
             (sfn <= efn) )
            rc |= iomem_deny_access(dom0, sfn, efn);
    }

    BUG_ON(rc != 0);

    return 0;
}
/*
 * Local variables:
 * mode: C
 * c-set-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */