ia64/xen-unstable: xen/arch/x86/domain_build.c @ 18432:1e98ea5c8604

x86: Fix guest_handle_okay/guest_handle_subrange_okay

The guest handle checks should use paging_* predicates, not shadow_*.
Also tidy up a few places where p2m definitions were being imported
via asm/guest_access.h -> asm/shadow.h -> asm/p2m.h

Signed-off-by: Tim Deegan <Tim.Deegan@citrix.com>
Author:   Keir Fraser <keir.fraser@citrix.com>
Date:     Wed Sep 03 14:16:35 2008 +0100
Parents:  d31546a3883e
Children: 7f1a36b834e1
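
As context for the change summarised above, a minimal sketch of the kind of
predicate swap involved (illustrative only; the real definitions live in
xen/include/asm-x86/guest_access.h and may differ in detail):

    /* Before: the range check was keyed off a shadow-specific predicate. */
    #define guest_handle_okay(hnd, nr)                      \
        (shadow_mode_external(current->domain) ||           \
         array_access_ok((hnd).p, (nr), sizeof(*(hnd).p)))

    /* After: keyed off the generic paging predicate instead. */
    #define guest_handle_okay(hnd, nr)                      \
        (paging_mode_external(current->domain) ||           \
         array_access_ok((hnd).p, (nr), sizeof(*(hnd).p)))

The full file at this changeset follows.
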
/******************************************************************************
 * domain_build.c
 *
 * Copyright (c) 2002-2005, K A Fraser
 */

#include <xen/config.h>
#include <xen/init.h>
#include <xen/lib.h>
#include <xen/ctype.h>
#include <xen/sched.h>
#include <xen/smp.h>
#include <xen/delay.h>
#include <xen/event.h>
#include <xen/console.h>
#include <xen/kernel.h>
#include <xen/domain.h>
#include <xen/version.h>
#include <xen/iocap.h>
#include <xen/bitops.h>
#include <xen/compat.h>
#include <asm/regs.h>
#include <asm/system.h>
#include <asm/io.h>
#include <asm/processor.h>
#include <asm/desc.h>
#include <asm/i387.h>
#include <asm/paging.h>
#include <asm/p2m.h>
#include <asm/e820.h>

#include <public/version.h>
#include <public/libelf.h>

extern unsigned long initial_images_nrpages(void);
extern void discard_initial_images(void);

static long __initdata dom0_nrpages;
static long __initdata dom0_min_nrpages;
static long __initdata dom0_max_nrpages = LONG_MAX;

/*
 * dom0_mem=[min:<min_amt>,][max:<max_amt>,][<amt>]
 *
 * <min_amt>: The minimum amount of memory which should be allocated for dom0.
 * <max_amt>: The maximum amount of memory which should be allocated for dom0.
 * <amt>:     The precise amount of memory to allocate for dom0.
 *
 * Notes:
 *  1. <amt> is clamped from below by <min_amt> and from above by available
 *     memory and <max_amt>
 *  2. <min_amt> is clamped from above by available memory and <max_amt>
 *  3. <min_amt> is ignored if it is greater than <max_amt>
 *  4. If <amt> is not specified, it is calculated as follows:
 *     "All of memory is allocated to domain 0, minus 1/16th which is reserved
 *      for uses such as DMA buffers (the reservation is clamped to 128MB)."
 *
 * Each value can be specified as positive or negative:
 *  If +ve: The specified amount is an absolute value.
 *  If -ve: The specified amount is subtracted from total available memory.
 */
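/*
 * Examples (illustrative values only):
 *   dom0_mem=min:128M,512M   allocate 512MB, but never less than 128MB
 *   dom0_mem=-256M           all available memory minus 256MB
 */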
static long __init parse_amt(const char *s, const char **ps)
{
    long pages = parse_size_and_unit((*s == '-') ? s+1 : s, ps) >> PAGE_SHIFT;
    return (*s == '-') ? -pages : pages;
}
static void __init parse_dom0_mem(const char *s)
{
    do {
        if ( !strncmp(s, "min:", 4) )
            dom0_min_nrpages = parse_amt(s+4, &s);
        else if ( !strncmp(s, "max:", 4) )
            dom0_max_nrpages = parse_amt(s+4, &s);
        else
            dom0_nrpages = parse_amt(s, &s);
        if ( *s != ',' )
            break;
    } while ( *s++ == ',' );
}
custom_param("dom0_mem", parse_dom0_mem);

static unsigned int opt_dom0_max_vcpus;
integer_param("dom0_max_vcpus", opt_dom0_max_vcpus);

static unsigned int opt_dom0_shadow;
boolean_param("dom0_shadow", opt_dom0_shadow);

static char opt_dom0_ioports_disable[200] = "";
string_param("dom0_ioports_disable", opt_dom0_ioports_disable);
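
/*
 * Example (illustrative values only): dom0_ioports_disable=02f8-02ff,e9
 * would deny dom0 the COM2 range and port 0xe9. See
 * process_dom0_ioports_disable() below for the accepted syntax.
 */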

#if defined(__i386__)
/* No ring-3 access in initial leaf page tables. */
#define L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED)
#define L2_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
#define L3_PROT (_PAGE_PRESENT)
#elif defined(__x86_64__)
/* Allow ring-3 access in long mode as guest cannot use ring 1 ... */
#define BASE_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_USER)
#define L1_PROT (BASE_PROT|_PAGE_GUEST_KERNEL)
/* ... except for compatibility mode guests. */
#define COMPAT_L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED)
#define L2_PROT (BASE_PROT|_PAGE_DIRTY)
#define L3_PROT (BASE_PROT|_PAGE_DIRTY)
#define L4_PROT (BASE_PROT|_PAGE_DIRTY)
#endif

#define round_pgup(_p)   (((_p)+(PAGE_SIZE-1))&PAGE_MASK)
#define round_pgdown(_p) ((_p)&PAGE_MASK)

static struct page_info * __init alloc_chunk(
    struct domain *d, unsigned long max_pages)
{
    struct page_info *page;
    unsigned int order;
    /*
     * Allocate up to 2MB at a time: It prevents allocating very large chunks
     * from DMA pools before the >4GB pool is fully depleted.
     */
    if ( max_pages > (2UL << (20 - PAGE_SHIFT)) )
        max_pages = 2UL << (20 - PAGE_SHIFT);
    order = get_order_from_pages(max_pages);
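    /*
     * get_order_from_pages() rounds up to the next power of two, so when
     * max_pages is not itself a power of two, step down one order rather
     * than hand back more than was asked for.
     */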
    if ( (max_pages & (max_pages-1)) != 0 )
        order--;
    while ( (page = alloc_domheap_pages(d, order, 0)) == NULL )
        if ( order-- == 0 )
            break;
    return page;
}

static unsigned long __init compute_dom0_nr_pages(void)
{
    unsigned long avail = avail_domheap_pages() + initial_images_nrpages();

    /*
     * If domain 0 allocation isn't specified, reserve 1/16th of available
     * memory for things like DMA buffers. This reservation is clamped to
     * a maximum of 128MB.
     */
    if ( dom0_nrpages == 0 )
    {
        dom0_nrpages = avail;
        dom0_nrpages = min(dom0_nrpages / 16, 128L << (20 - PAGE_SHIFT));
        dom0_nrpages = -dom0_nrpages;
    }

    /* Negative memory specification means "all memory - specified amount". */
    if ( dom0_nrpages < 0 ) dom0_nrpages += avail;
    if ( dom0_min_nrpages < 0 ) dom0_min_nrpages += avail;
    if ( dom0_max_nrpages < 0 ) dom0_max_nrpages += avail;

    /* Clamp dom0 memory according to min/max limits and available memory. */
    dom0_nrpages = max(dom0_nrpages, dom0_min_nrpages);
    dom0_nrpages = min(dom0_nrpages, dom0_max_nrpages);
    dom0_nrpages = min(dom0_nrpages, (long)avail);

    return dom0_nrpages;
}

static void __init process_dom0_ioports_disable(void)
{
    unsigned long io_from, io_to;
    char *t, *s = opt_dom0_ioports_disable;
    const char *u;

    if ( *s == '\0' )
        return;

    while ( (t = strsep(&s, ",")) != NULL )
    {
        io_from = simple_strtoul(t, &u, 16);
        if ( u == t )
        {
        parse_error:
            printk("Invalid ioport range <%s> "
                   "in dom0_ioports_disable, skipping\n", t);
            continue;
        }

        if ( *u == '\0' )
            io_to = io_from;
        else if ( *u == '-' )
            io_to = simple_strtoul(u + 1, &u, 16);
        else
            goto parse_error;

        if ( (*u != '\0') || (io_to < io_from) || (io_to >= 65536) )
            goto parse_error;

        printk("Disabling dom0 access to ioport range %04lx-%04lx\n",
               io_from, io_to);

        if ( ioports_deny_access(dom0, io_from, io_to) != 0 )
            BUG();
    }
}

int __init construct_dom0(
    struct domain *d,
    unsigned long _image_start, unsigned long image_len,
    unsigned long _initrd_start, unsigned long initrd_len,
    char *cmdline)
{
    int i, rc, compatible, compat32, order, machine;
    struct cpu_user_regs *regs;
    unsigned long pfn, mfn;
    unsigned long nr_pages;
    unsigned long nr_pt_pages;
    unsigned long alloc_spfn;
    unsigned long alloc_epfn;
    unsigned long count;
    struct page_info *page = NULL;
    start_info_t *si;
    struct vcpu *v = d->vcpu[0];
    unsigned long long value;
#if defined(__i386__)
    char *image_start  = (char *)_image_start;  /* use lowmem mappings */
    char *initrd_start = (char *)_initrd_start; /* use lowmem mappings */
#elif defined(__x86_64__)
    char *image_start  = __va(_image_start);
    char *initrd_start = __va(_initrd_start);
#endif
#if CONFIG_PAGING_LEVELS >= 4
    l4_pgentry_t *l4tab = NULL, *l4start = NULL;
#endif
    l3_pgentry_t *l3tab = NULL, *l3start = NULL;
    l2_pgentry_t *l2tab = NULL, *l2start = NULL;
    l1_pgentry_t *l1tab = NULL, *l1start = NULL;

    /*
     * This fully describes the memory layout of the initial domain. All
     * *_start addresses are page-aligned, except v_start (and v_end) which
     * are superpage-aligned.
     */
    struct elf_binary elf;
    struct elf_dom_parms parms;
    unsigned long vkern_start;
    unsigned long vkern_end;
    unsigned long vinitrd_start;
    unsigned long vinitrd_end;
    unsigned long vphysmap_start;
    unsigned long vphysmap_end;
    unsigned long vstartinfo_start;
    unsigned long vstartinfo_end;
    unsigned long vstack_start;
    unsigned long vstack_end;
    unsigned long vpt_start;
    unsigned long vpt_end;
    unsigned long v_start;
    unsigned long v_end;

    /* Machine address of next candidate page-table page. */
    unsigned long mpt_alloc;

    /* Sanity! */
    BUG_ON(d->domain_id != 0);
    BUG_ON(d->vcpu[0] == NULL);
    BUG_ON(v->is_initialised);

    printk("*** LOADING DOMAIN 0 ***\n");

    d->max_pages = ~0U;

    nr_pages = compute_dom0_nr_pages();

    if ( (rc = elf_init(&elf, image_start, image_len)) != 0 )
        return rc;
#ifdef VERBOSE
    elf_set_verbose(&elf);
#endif
    elf_parse_binary(&elf);
    if ( (rc = elf_xen_parse(&elf, &parms)) != 0 )
        return rc;

    /* compatibility check */
    compatible = 0;
    compat32 = 0;
    machine = elf_uval(&elf, elf.ehdr, e_machine);
    switch (CONFIG_PAGING_LEVELS) {
    case 3: /* x86_32p */
        if (parms.pae == PAEKERN_bimodal)
            parms.pae = PAEKERN_extended_cr3;
        printk(" Xen kernel: 32-bit, PAE, lsb\n");
        if (elf_32bit(&elf) && parms.pae && machine == EM_386)
            compatible = 1;
        break;
    case 4: /* x86_64 */
        printk(" Xen kernel: 64-bit, lsb, compat32\n");
        if (elf_32bit(&elf) && parms.pae == PAEKERN_bimodal)
            parms.pae = PAEKERN_extended_cr3;
        if (elf_32bit(&elf) && parms.pae && machine == EM_386)
        {
            compat32 = 1;
            compatible = 1;
        }
        if (elf_64bit(&elf) && machine == EM_X86_64)
            compatible = 1;
        break;
    }
    printk(" Dom0 kernel: %s%s, %s, paddr 0x%" PRIx64 " -> 0x%" PRIx64 "\n",
           elf_64bit(&elf) ? "64-bit" : "32-bit",
           parms.pae ? ", PAE" : "",
           elf_msb(&elf) ? "msb" : "lsb",
           elf.pstart, elf.pend);
    if ( elf.bsd_symtab_pstart )
        printk(" Dom0 symbol map 0x%" PRIx64 " -> 0x%" PRIx64 "\n",
               elf.bsd_symtab_pstart, elf.bsd_symtab_pend);

    if ( !compatible )
    {
        printk("Mismatch between Xen and DOM0 kernel\n");
        return -EINVAL;
    }

#if defined(__x86_64__)
    if ( compat32 )
    {
        l1_pgentry_t gdt_l1e;

        d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 1;
        v->vcpu_info = (void *)&d->shared_info->compat.vcpu_info[0];

        if ( nr_pages != (unsigned int)nr_pages )
            nr_pages = UINT_MAX;

        /*
         * Map compatibility Xen segments into every VCPU's GDT. See
         * arch_domain_create() for further comments.
         */
        gdt_l1e = l1e_from_page(virt_to_page(compat_gdt_table),
                                PAGE_HYPERVISOR);
        for ( i = 0; i < MAX_VIRT_CPUS; i++ )
            d->arch.mm_perdomain_pt[((i << GDT_LDT_VCPU_SHIFT) +
                                     FIRST_RESERVED_GDT_PAGE)] = gdt_l1e;
        flush_tlb_one_local(GDT_LDT_VIRT_START + FIRST_RESERVED_GDT_BYTE);
    }
#endif

    if ( parms.pae == PAEKERN_extended_cr3 )
        set_bit(VMASST_TYPE_pae_extended_cr3, &d->vm_assist);

    if ( (parms.virt_hv_start_low != UNSET_ADDR) && elf_32bit(&elf) )
    {
        unsigned long mask = (1UL << L2_PAGETABLE_SHIFT) - 1;
        value = (parms.virt_hv_start_low + mask) & ~mask;
        BUG_ON(!is_pv_32bit_domain(d));
#if defined(__i386__)
        if ( value > HYPERVISOR_VIRT_START )
            panic("Domain 0 expects too high a hypervisor start address.\n");
#else
        if ( value > __HYPERVISOR_COMPAT_VIRT_START )
            panic("Domain 0 expects too high a hypervisor start address.\n");
        HYPERVISOR_COMPAT_VIRT_START(d) =
            max_t(unsigned int, m2p_compat_vstart, value);
#endif
    }

    domain_set_alloc_bitsize(d);

    /*
     * Why do we need this? The number of page-table frames depends on the
     * size of the bootstrap address space. But the size of the address space
     * depends on the number of page-table frames (since each one is mapped
     * read-only). We have a pair of simultaneous equations in two unknowns,
     * which we solve by exhaustive search.
     */
    v_start = parms.virt_base;
    vkern_start = parms.virt_kstart;
    vkern_end = parms.virt_kend;
    vinitrd_start = round_pgup(vkern_end);
    vinitrd_end = vinitrd_start + initrd_len;
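    /*
     * Guest-visible phys-to-machine array: compat (32-on-64) guests use
     * 4-byte entries, native-width guests use unsigned-long-sized entries.
     */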
    vphysmap_start = round_pgup(vinitrd_end);
    vphysmap_end = vphysmap_start + (nr_pages * (!is_pv_32on64_domain(d) ?
                                                 sizeof(unsigned long) :
                                                 sizeof(unsigned int)));
    vstartinfo_start = round_pgup(vphysmap_end);
    vstartinfo_end = (vstartinfo_start +
                      sizeof(struct start_info) +
                      sizeof(struct dom0_vga_console_info));
    vpt_start = round_pgup(vstartinfo_end);
    for ( nr_pt_pages = 2; ; nr_pt_pages++ )
    {
        vpt_end = vpt_start + (nr_pt_pages * PAGE_SIZE);
        vstack_start = vpt_end;
        vstack_end = vstack_start + PAGE_SIZE;
        v_end = (vstack_end + (1UL<<22)-1) & ~((1UL<<22)-1);
        if ( (v_end - vstack_end) < (512UL << 10) )
            v_end += 1UL << 22; /* Add extra 4MB to get >= 512kB padding. */
#if defined(__i386__)
        /* 5 pages: 1x 3rd + 4x 2nd level */
        if ( (((v_end - v_start + ((1UL<<L2_PAGETABLE_SHIFT)-1)) >>
               L2_PAGETABLE_SHIFT) + 5) <= nr_pt_pages )
            break;
#elif defined(__x86_64__)
#define NR(_l,_h,_s) \
    (((((_h) + ((1UL<<(_s))-1)) & ~((1UL<<(_s))-1)) - \
      ((_l) & ~((1UL<<(_s))-1))) >> (_s))
        if ( (1 + /* # L4 */
              NR(v_start, v_end, L4_PAGETABLE_SHIFT) + /* # L3 */
              (!is_pv_32on64_domain(d) ?
               NR(v_start, v_end, L3_PAGETABLE_SHIFT) : /* # L2 */
               4) + /* # compat L2 */
              NR(v_start, v_end, L2_PAGETABLE_SHIFT)) /* # L1 */
             <= nr_pt_pages )
            break;
#endif
    }

    order = get_order_from_bytes(v_end - v_start);
    if ( (1UL << order) > nr_pages )
        panic("Domain 0 allocation is too small for kernel image.\n");

#ifdef __i386__
    /* Ensure that our low-memory 1:1 mapping covers the allocation. */
    page = alloc_domheap_pages(d, order, MEMF_bits(30));
#else
    page = alloc_domheap_pages(d, order, 0);
#endif
    if ( page == NULL )
        panic("Not enough RAM for domain 0 allocation.\n");
    alloc_spfn = page_to_mfn(page);
    alloc_epfn = alloc_spfn + d->tot_pages;

    printk("PHYSICAL MEMORY ARRANGEMENT:\n"
           " Dom0 alloc.: %"PRIpaddr"->%"PRIpaddr,
           pfn_to_paddr(alloc_spfn), pfn_to_paddr(alloc_epfn));
    if ( d->tot_pages < nr_pages )
        printk(" (%lu pages to be allocated)",
               nr_pages - d->tot_pages);
    printk("\nVIRTUAL MEMORY ARRANGEMENT:\n"
           " Loaded kernel: %p->%p\n"
           " Init. ramdisk: %p->%p\n"
           " Phys-Mach map: %p->%p\n"
           " Start info: %p->%p\n"
           " Page tables: %p->%p\n"
           " Boot stack: %p->%p\n"
           " TOTAL: %p->%p\n",
           _p(vkern_start), _p(vkern_end),
           _p(vinitrd_start), _p(vinitrd_end),
           _p(vphysmap_start), _p(vphysmap_end),
           _p(vstartinfo_start), _p(vstartinfo_end),
           _p(vpt_start), _p(vpt_end),
           _p(vstack_start), _p(vstack_end),
           _p(v_start), _p(v_end));
    printk(" ENTRY ADDRESS: %p\n", _p(parms.virt_entry));

    if ( ((v_end - v_start)>>PAGE_SHIFT) > nr_pages )
    {
        printk("Initial guest OS requires too much space\n"
               "(%luMB is greater than %luMB limit)\n",
               (v_end-v_start)>>20, nr_pages>>(20-PAGE_SHIFT));
        return -ENOMEM;
    }

    mpt_alloc = (vpt_start - v_start) +
                (unsigned long)pfn_to_paddr(alloc_spfn);

#if defined(__i386__)
    /*
     * Protect the lowest 1GB of memory. We use a temporary mapping there
     * from which we copy the kernel and ramdisk images.
     */
    if ( v_start < (1UL<<30) )
    {
        printk("Initial loading isn't allowed to lowest 1GB of memory.\n");
        return -EINVAL;
    }

    /* WARNING: The new domain must have its 'processor' field filled in! */
    l3start = l3tab = (l3_pgentry_t *)mpt_alloc; mpt_alloc += PAGE_SIZE;
    l2start = l2tab = (l2_pgentry_t *)mpt_alloc; mpt_alloc += 4*PAGE_SIZE;
    memcpy(l2tab, idle_pg_table_l2, 4*PAGE_SIZE);
    for (i = 0; i < 4; i++) {
        l3tab[i] = l3e_from_paddr((u32)l2tab + i*PAGE_SIZE, L3_PROT);
        l2tab[(LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT)+i] =
            l2e_from_paddr((u32)l2tab + i*PAGE_SIZE, __PAGE_HYPERVISOR);
    }
    v->arch.guest_table = pagetable_from_paddr((unsigned long)l3start);

    for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
        l2tab[l2_linear_offset(PERDOMAIN_VIRT_START) + i] =
            l2e_from_page(virt_to_page(d->arch.mm_perdomain_pt) + i,
                          __PAGE_HYPERVISOR);

    l2tab += l2_linear_offset(v_start);
    mfn = alloc_spfn;
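    /*
     * Map the whole bootstrap region [v_start, v_end) with 4kB pages,
     * pulling a fresh L1 table from mpt_alloc each time the current one
     * fills (i.e. at each new L2 slot).
     */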
    for ( count = 0; count < ((v_end-v_start)>>PAGE_SHIFT); count++ )
    {
        if ( !((unsigned long)l1tab & (PAGE_SIZE-1)) )
        {
            l1start = l1tab = (l1_pgentry_t *)mpt_alloc;
            mpt_alloc += PAGE_SIZE;
            *l2tab = l2e_from_paddr((unsigned long)l1start, L2_PROT);
            l2tab++;
            clear_page(l1tab);
            if ( count == 0 )
                l1tab += l1_table_offset(v_start);
        }
        *l1tab = l1e_from_pfn(mfn, L1_PROT);
        l1tab++;

        page = mfn_to_page(mfn);
        if ( !get_page_and_type(page, d, PGT_writable_page) )
            BUG();

        mfn++;
    }

    /* Pages that are part of page tables must be read only. */
    l2tab = l2start + l2_linear_offset(vpt_start);
    l1start = l1tab = (l1_pgentry_t *)(u32)l2e_get_paddr(*l2tab);
    l1tab += l1_table_offset(vpt_start);
    for ( count = 0; count < nr_pt_pages; count++ )
    {
        page = mfn_to_page(l1e_get_pfn(*l1tab));
        if ( !opt_dom0_shadow )
            l1e_remove_flags(*l1tab, _PAGE_RW);
        else
            if ( !get_page_type(page, PGT_writable_page) )
                BUG();

        switch ( count )
        {
        case 0:
            page->u.inuse.type_info &= ~PGT_type_mask;
            page->u.inuse.type_info |= PGT_l3_page_table;
            get_page(page, d); /* an extra ref because of readable mapping */

            /* Get another ref to L3 page so that it can be pinned. */
            page->u.inuse.type_info++;
            page->count_info++;
            set_bit(_PGT_pinned, &page->u.inuse.type_info);
            break;
        case 1 ... 4:
            page->u.inuse.type_info &= ~PGT_type_mask;
            page->u.inuse.type_info |= PGT_l2_page_table;
            if ( count == 4 )
                page->u.inuse.type_info |= PGT_pae_xen_l2;
            get_page(page, d); /* an extra ref because of readable mapping */
            break;
        default:
            page->u.inuse.type_info &= ~PGT_type_mask;
            page->u.inuse.type_info |= PGT_l1_page_table;
            get_page(page, d); /* an extra ref because of readable mapping */
            break;
        }
        if ( !((unsigned long)++l1tab & (PAGE_SIZE - 1)) )
            l1start = l1tab = (l1_pgentry_t *)(u32)l2e_get_paddr(*++l2tab);
    }

#elif defined(__x86_64__)

    /* Overlap with Xen protected area? */
    if ( !is_pv_32on64_domain(d) ?
         ((v_start < HYPERVISOR_VIRT_END) &&
          (v_end > HYPERVISOR_VIRT_START)) :
         (v_end > HYPERVISOR_COMPAT_VIRT_START(d)) )
    {
        printk("DOM0 image overlaps with Xen private area.\n");
        return -EINVAL;
    }

    if ( is_pv_32on64_domain(d) )
    {
        v->arch.guest_context.failsafe_callback_cs = FLAT_COMPAT_KERNEL_CS;
        v->arch.guest_context.event_callback_cs = FLAT_COMPAT_KERNEL_CS;
    }

    /* WARNING: The new domain must have its 'processor' field filled in! */
    if ( !is_pv_32on64_domain(d) )
    {
        maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l4_page_table;
        l4start = l4tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
    }
    else
    {
        page = alloc_domheap_page(NULL, 0);
        if ( !page )
            panic("Not enough RAM for domain 0 PML4.\n");
        page->u.inuse.type_info = PGT_l4_page_table|PGT_validated|1;
        l4start = l4tab = page_to_virt(page);
    }
    copy_page(l4tab, idle_pg_table);
    l4tab[0] = l4e_empty(); /* zap trampoline mapping */
    l4tab[l4_table_offset(LINEAR_PT_VIRT_START)] =
        l4e_from_paddr(__pa(l4start), __PAGE_HYPERVISOR);
    l4tab[l4_table_offset(PERDOMAIN_VIRT_START)] =
        l4e_from_paddr(__pa(d->arch.mm_perdomain_l3), __PAGE_HYPERVISOR);
    v->arch.guest_table = pagetable_from_paddr(__pa(l4start));
    if ( is_pv_32on64_domain(d) )
        v->arch.guest_table_user = v->arch.guest_table;

    l4tab += l4_table_offset(v_start);
    mfn = alloc_spfn;
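    /*
     * Map the whole bootstrap region [v_start, v_end) with 4kB mappings,
     * allocating fresh L1/L2/L3 tables from mpt_alloc whenever the walk
     * crosses into a new slot of the level above.
     */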
    for ( count = 0; count < ((v_end-v_start)>>PAGE_SHIFT); count++ )
    {
        if ( !((unsigned long)l1tab & (PAGE_SIZE-1)) )
        {
            maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l1_page_table;
            l1start = l1tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
            clear_page(l1tab);
            if ( count == 0 )
                l1tab += l1_table_offset(v_start);
            if ( !((unsigned long)l2tab & (PAGE_SIZE-1)) )
            {
                maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l2_page_table;
                l2start = l2tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
                clear_page(l2tab);
                if ( count == 0 )
                    l2tab += l2_table_offset(v_start);
                if ( !((unsigned long)l3tab & (PAGE_SIZE-1)) )
                {
                    maddr_to_page(mpt_alloc)->u.inuse.type_info =
                        PGT_l3_page_table;
                    l3start = l3tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
                    clear_page(l3tab);
                    if ( count == 0 )
                        l3tab += l3_table_offset(v_start);
                    *l4tab = l4e_from_paddr(__pa(l3start), L4_PROT);
                    l4tab++;
                }
                *l3tab = l3e_from_paddr(__pa(l2start), L3_PROT);
                l3tab++;
            }
            *l2tab = l2e_from_paddr(__pa(l1start), L2_PROT);
            l2tab++;
        }
        *l1tab = l1e_from_pfn(mfn, (!is_pv_32on64_domain(d) ?
                                    L1_PROT : COMPAT_L1_PROT));
        l1tab++;

        page = mfn_to_page(mfn);
        if ( (page->u.inuse.type_info == 0) &&
             !get_page_and_type(page, d, PGT_writable_page) )
            BUG();

        mfn++;
    }

    if ( is_pv_32on64_domain(d) )
    {
        /* Ensure the first four L3 entries are all populated. */
        for ( i = 0, l3tab = l3start; i < 4; ++i, ++l3tab )
        {
            if ( !l3e_get_intpte(*l3tab) )
            {
                maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l2_page_table;
                l2tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
                clear_page(l2tab);
                *l3tab = l3e_from_paddr(__pa(l2tab), L3_PROT);
            }
            if ( i == 3 )
                l3e_get_page(*l3tab)->u.inuse.type_info |= PGT_pae_xen_l2;
        }
        /* Install read-only guest visible MPT mapping. */
        l2tab = l3e_to_l2e(l3start[3]);
        memcpy(&l2tab[COMPAT_L2_PAGETABLE_FIRST_XEN_SLOT(d)],
               &compat_idle_pg_table_l2[l2_table_offset(HIRO_COMPAT_MPT_VIRT_START)],
               COMPAT_L2_PAGETABLE_XEN_SLOTS(d) * sizeof(*l2tab));
    }

    /* Pages that are part of page tables must be read only. */
    l4tab = l4start + l4_table_offset(vpt_start);
    l3start = l3tab = l4e_to_l3e(*l4tab);
    l3tab += l3_table_offset(vpt_start);
    l2start = l2tab = l3e_to_l2e(*l3tab);
    l2tab += l2_table_offset(vpt_start);
    l1start = l1tab = l2e_to_l1e(*l2tab);
    l1tab += l1_table_offset(vpt_start);
    for ( count = 0; count < nr_pt_pages; count++ )
    {
        l1e_remove_flags(*l1tab, _PAGE_RW);
        page = mfn_to_page(l1e_get_pfn(*l1tab));

        /* Read-only mapping + PGC_allocated + page-table page. */
        page->count_info = PGC_allocated | 3;
        page->u.inuse.type_info |= PGT_validated | 1;

        /* Top-level p.t. is pinned. */
        if ( (page->u.inuse.type_info & PGT_type_mask) ==
             (!is_pv_32on64_domain(d) ?
              PGT_l4_page_table : PGT_l3_page_table) )
        {
            page->count_info += 1;
            page->u.inuse.type_info += 1 | PGT_pinned;
        }

        /* Iterate. */
        if ( !((unsigned long)++l1tab & (PAGE_SIZE - 1)) )
        {
            if ( !((unsigned long)++l2tab & (PAGE_SIZE - 1)) )
            {
                if ( !((unsigned long)++l3tab & (PAGE_SIZE - 1)) )
                    l3start = l3tab = l4e_to_l3e(*++l4tab);
                l2start = l2tab = l3e_to_l2e(*l3tab);
            }
            l1start = l1tab = l2e_to_l1e(*l2tab);
        }
    }

#endif /* __x86_64__ */

    /* Mask all upcalls... */
    for ( i = 0; i < MAX_VIRT_CPUS; i++ )
        shared_info(d, vcpu_info[i].evtchn_upcall_mask) = 1;

    if ( opt_dom0_max_vcpus == 0 )
        opt_dom0_max_vcpus = num_online_cpus();
    if ( opt_dom0_max_vcpus > MAX_VIRT_CPUS )
        opt_dom0_max_vcpus = MAX_VIRT_CPUS;
    printk("Dom0 has maximum %u VCPUs\n", opt_dom0_max_vcpus);

    for ( i = 1; i < opt_dom0_max_vcpus; i++ )
        (void)alloc_vcpu(d, i, i % num_online_cpus());

    /* Set up CR3 value for write_ptbase */
    if ( paging_mode_enabled(v->domain) )
        paging_update_paging_modes(v);
    else
        update_cr3(v);

    /* Install the new page tables. */
    local_irq_disable();
    write_ptbase(v);

    /* Copy the OS image and free temporary buffer. */
    elf.dest = (void*)vkern_start;
    elf_load_binary(&elf);

    if ( UNSET_ADDR != parms.virt_hypercall )
    {
        if ( (parms.virt_hypercall < v_start) ||
             (parms.virt_hypercall >= v_end) )
        {
            write_ptbase(current);
            local_irq_enable();
            printk("Invalid HYPERCALL_PAGE field in ELF notes.\n");
            return -1;
        }
        hypercall_page_initialise(d, (void *)(unsigned long)parms.virt_hypercall);
    }

    /* Copy the initial ramdisk. */
    if ( initrd_len != 0 )
        memcpy((void *)vinitrd_start, initrd_start, initrd_len);

    /* Free temporary buffers. */
    discard_initial_images();

    /* Set up start info area. */
    si = (start_info_t *)vstartinfo_start;
    clear_page(si);
    si->nr_pages = nr_pages;

    si->shared_info = virt_to_maddr(d->shared_info);

    si->flags = SIF_PRIVILEGED | SIF_INITDOMAIN;
    si->flags |= (xen_processor_pmbits << 8) & SIF_PM_MASK;
    si->pt_base = vpt_start + 2 * PAGE_SIZE * !!is_pv_32on64_domain(d);
    si->nr_pt_frames = nr_pt_pages;
    si->mfn_list = vphysmap_start;
    snprintf(si->magic, sizeof(si->magic), "xen-3.0-x86_%d%s",
             elf_64bit(&elf) ? 64 : 32, parms.pae ? "p" : "");

    /* Write the phys->machine and machine->phys table entries. */
    for ( pfn = 0; pfn < d->tot_pages; pfn++ )
    {
        mfn = pfn + alloc_spfn;
#ifndef NDEBUG
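        /*
         * Debug builds map the pages beyond the loaded image in reverse
         * order, which helps catch guests that wrongly assume the
         * pseudo-physical layout mirrors the machine layout.
         */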
#define REVERSE_START ((v_end - v_start) >> PAGE_SHIFT)
        if ( pfn > REVERSE_START )
            mfn = alloc_epfn - (pfn - REVERSE_START);
#endif
        if ( !is_pv_32on64_domain(d) )
            ((unsigned long *)vphysmap_start)[pfn] = mfn;
        else
            ((unsigned int *)vphysmap_start)[pfn] = mfn;
        set_gpfn_from_mfn(mfn, pfn);
    }
    while ( pfn < nr_pages )
    {
        if ( (page = alloc_chunk(d, nr_pages - d->tot_pages)) == NULL )
            panic("Not enough RAM for DOM0 reservation.\n");
        while ( pfn < d->tot_pages )
        {
            mfn = page_to_mfn(page);
#ifndef NDEBUG
#define pfn (nr_pages - 1 - (pfn - (alloc_epfn - alloc_spfn)))
#endif
            if ( !is_pv_32on64_domain(d) )
                ((unsigned long *)vphysmap_start)[pfn] = mfn;
            else
                ((unsigned int *)vphysmap_start)[pfn] = mfn;
            set_gpfn_from_mfn(mfn, pfn);
#undef pfn
            page++; pfn++;
        }
    }

    if ( initrd_len != 0 )
    {
        si->mod_start = vinitrd_start;
        si->mod_len = initrd_len;
    }

    memset(si->cmd_line, 0, sizeof(si->cmd_line));
    if ( cmdline != NULL )
        strlcpy((char *)si->cmd_line, cmdline, sizeof(si->cmd_line));

    if ( fill_console_start_info((void *)(si + 1)) )
    {
        si->console.dom0.info_off = sizeof(struct start_info);
        si->console.dom0.info_size = sizeof(struct dom0_vga_console_info);
    }

#if defined(__x86_64__)
    if ( is_pv_32on64_domain(d) )
        xlat_start_info(si, XLAT_start_info_console_dom0);
#endif

    /* Reinstate the caller's page tables. */
    write_ptbase(current);
    local_irq_enable();

#if defined(__i386__)
    /* Destroy low mappings - they were only for our convenience. */
    zap_low_mappings(l2start);
#endif

    update_domain_wallclock_time(d);

    v->is_initialised = 1;
    clear_bit(_VPF_down, &v->pause_flags);

    /*
     * Initial register values:
     *  DS,ES,FS,GS = FLAT_KERNEL_DS
     *       CS:EIP = FLAT_KERNEL_CS:start_pc
     *       SS:ESP = FLAT_KERNEL_SS:start_stack
     *          ESI = start_info
     *  [EAX,EBX,ECX,EDX,EDI,EBP are zero]
     */
    regs = &v->arch.guest_context.user_regs;
    regs->ds = regs->es = regs->fs = regs->gs =
        !is_pv_32on64_domain(d) ? FLAT_KERNEL_DS : FLAT_COMPAT_KERNEL_DS;
    regs->ss = (!is_pv_32on64_domain(d) ?
                FLAT_KERNEL_SS : FLAT_COMPAT_KERNEL_SS);
    regs->cs = (!is_pv_32on64_domain(d) ?
                FLAT_KERNEL_CS : FLAT_COMPAT_KERNEL_CS);
    regs->eip = parms.virt_entry;
    regs->esp = vstack_end;
    regs->esi = vstartinfo_start;
    regs->eflags = X86_EFLAGS_IF;

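    /*
     * If "dom0_shadow" was given on the Xen command line, switch dom0 to
     * shadow paging now that its initial page tables are in place.
     */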
    if ( opt_dom0_shadow )
        if ( paging_enable(d, PG_SH_enable) == 0 )
            paging_update_paging_modes(v);

    if ( supervisor_mode_kernel )
    {
        v->arch.guest_context.kernel_ss &= ~3;
        v->arch.guest_context.user_regs.ss &= ~3;
        v->arch.guest_context.user_regs.es &= ~3;
        v->arch.guest_context.user_regs.ds &= ~3;
        v->arch.guest_context.user_regs.fs &= ~3;
        v->arch.guest_context.user_regs.gs &= ~3;
        printk("Dom0 runs in ring 0 (supervisor mode)\n");
        if ( !test_bit(XENFEAT_supervisor_mode_kernel,
                       parms.f_supported) )
            panic("Dom0 does not support supervisor-mode execution\n");
    }
    else
    {
        if ( test_bit(XENFEAT_supervisor_mode_kernel, parms.f_required) )
            panic("Dom0 requires supervisor-mode execution\n");
    }

    rc = 0;

    /* DOM0 is permitted full I/O capabilities. */
    rc |= ioports_permit_access(dom0, 0, 0xFFFF);
    rc |= iomem_permit_access(dom0, 0UL, ~0UL);
    rc |= irqs_permit_access(dom0, 0, NR_IRQS-1);

    /*
     * Modify I/O port access permissions.
     */
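    /*
     * The ranges denied below are for devices Xen itself manages (the PICs,
     * the PIT, the PCI config data port), so dom0 must not drive them
     * directly.
     */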
    /* Master Interrupt Controller (PIC). */
    rc |= ioports_deny_access(dom0, 0x20, 0x21);
    /* Slave Interrupt Controller (PIC). */
    rc |= ioports_deny_access(dom0, 0xA0, 0xA1);
    /* Interval Timer (PIT). */
    rc |= ioports_deny_access(dom0, 0x40, 0x43);
    /* PIT Channel 2 / PC Speaker Control. */
    rc |= ioports_deny_access(dom0, 0x61, 0x61);
    /* PCI configuration space (NB. 0xcf8 has special treatment). */
    rc |= ioports_deny_access(dom0, 0xcfc, 0xcff);
    /* Command-line I/O ranges. */
    process_dom0_ioports_disable();

    /*
     * Modify I/O memory access permissions.
     */
    /* Local APIC. */
    if ( mp_lapic_addr != 0 )
    {
        mfn = paddr_to_pfn(mp_lapic_addr);
        rc |= iomem_deny_access(dom0, mfn, mfn);
    }
    /* I/O APICs. */
    for ( i = 0; i < nr_ioapics; i++ )
    {
        mfn = paddr_to_pfn(mp_ioapics[i].mpc_apicaddr);
        if ( smp_found_config )
            rc |= iomem_deny_access(dom0, mfn, mfn);
    }

    /* Remove access to E820_UNUSABLE I/O regions above 1MB. */
    for ( i = 0; i < e820.nr_map; i++ )
    {
        unsigned long sfn, efn;
        sfn = max_t(unsigned long, paddr_to_pfn(e820.map[i].addr), 0x100ul);
        efn = paddr_to_pfn(e820.map[i].addr + e820.map[i].size - 1);
        if ( (e820.map[i].type == E820_UNUSABLE) &&
             (e820.map[i].size != 0) &&
             (sfn <= efn) )
            rc |= iomem_deny_access(dom0, sfn, efn);
    }

    BUG_ON(rc != 0);

    return 0;
}

/*
 * Local variables:
 * mode: C
 * c-set-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */