ia64/xen-unstable

view xen/arch/x86/domain_build.c @ 19787:cecc76506afc

x86_64: don't allocate L1 per-domain page table pages in a single chunk

Instead, allocate them on demand, and adjust the consumer to no longer
assume the allocated space is contiguous.

This is another prerequisite for extending the number of vCPUs the
hypervisor can support per guest.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
author Keir Fraser <keir.fraser@citrix.com>
date Thu Jun 18 10:05:23 2009 +0100 (2009-06-18)
parents 6705898f768d
children 2f9e1348aa98
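
Editor's note: the change described above replaces a single contiguous allocation of the L1 per-domain page-table pages with on-demand allocation, and adjusts consumers so they no longer assume the pages are contiguous. The fragment below is only a minimal sketch of that pattern, not the actual patch; the helper name and the perdomain_l1_pg field are invented for illustration, while alloc_domheap_page(), clear_page() and page_to_virt() are calls this file itself uses.

    /* Sketch only: fetch one per-domain L1 page-table page, allocating it
     * the first time it is needed rather than assuming it came out of one
     * chunk set up at domain creation. */
    static l1_pgentry_t *get_perdomain_l1(struct domain *d, unsigned int idx)
    {
        struct page_info *pg = d->arch.perdomain_l1_pg[idx]; /* hypothetical field */

        if ( pg == NULL )
        {
            pg = alloc_domheap_page(NULL, 0);       /* allocate on demand */
            if ( pg == NULL )
                return NULL;
            clear_page(page_to_virt(pg));
            d->arch.perdomain_l1_pg[idx] = pg;
        }
        return page_to_virt(pg);
    }
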
line source
1 /******************************************************************************
2 * domain_build.c
3 *
4 * Copyright (c) 2002-2005, K A Fraser
5 */
7 #include <xen/config.h>
8 #include <xen/init.h>
9 #include <xen/lib.h>
10 #include <xen/ctype.h>
11 #include <xen/sched.h>
12 #include <xen/smp.h>
13 #include <xen/delay.h>
14 #include <xen/event.h>
15 #include <xen/console.h>
16 #include <xen/kernel.h>
17 #include <xen/domain.h>
18 #include <xen/version.h>
19 #include <xen/iocap.h>
20 #include <xen/bitops.h>
21 #include <xen/compat.h>
22 #include <xen/libelf.h>
23 #include <asm/regs.h>
24 #include <asm/system.h>
25 #include <asm/io.h>
26 #include <asm/processor.h>
27 #include <asm/desc.h>
28 #include <asm/i387.h>
29 #include <asm/paging.h>
30 #include <asm/p2m.h>
31 #include <asm/e820.h>
33 #include <public/version.h>
35 int __init bzimage_parse(
36 char *output, char **image_start, unsigned long *image_len);
38 extern unsigned long initial_images_nrpages(void);
39 extern void discard_initial_images(void);
41 static long __initdata dom0_nrpages;
42 static long __initdata dom0_min_nrpages;
43 static long __initdata dom0_max_nrpages = LONG_MAX;
45 /*
46 * dom0_mem=[min:<min_amt>,][max:<max_amt>,][<amt>]
47 *
48 * <min_amt>: The minimum amount of memory which should be allocated for dom0.
49 * <max_amt>: The maximum amount of memory which should be allocated for dom0.
50 * <amt>: The precise amount of memory to allocate for dom0.
51 *
52 * Notes:
53 * 1. <amt> is clamped from below by <min_amt> and from above by available
54 * memory and <max_amt>
55 * 2. <min_amt> is clamped from above by available memory and <max_amt>
56 * 3. <min_amt> is ignored if it is greater than <max_amt>
57 * 4. If <amt> is not specified, it is calculated as follows:
58 * "All of memory is allocated to domain 0, minus 1/16th which is reserved
59 * for uses such as DMA buffers (the reservation is clamped to 128MB)."
60 *
61 * Each value can be specified as positive or negative:
62 * If +ve: The specified amount is an absolute value.
63 * If -ve: The specified amount is subtracted from total available memory.
64 */
65 static long __init parse_amt(const char *s, const char **ps)
66 {
67 long pages = parse_size_and_unit((*s == '-') ? s+1 : s, ps) >> PAGE_SHIFT;
68 return (*s == '-') ? -pages : pages;
69 }
70 static void __init parse_dom0_mem(const char *s)
71 {
72 do {
73 if ( !strncmp(s, "min:", 4) )
74 dom0_min_nrpages = parse_amt(s+4, &s);
75 else if ( !strncmp(s, "max:", 4) )
76 dom0_max_nrpages = parse_amt(s+4, &s);
77 else
78 dom0_nrpages = parse_amt(s, &s);
79 if ( *s != ',' )
80 break;
81 } while ( *s++ == ',' );
82 }
83 custom_param("dom0_mem", parse_dom0_mem);
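/*
 * Worked example (editor's note, not part of the original file): with
 * "dom0_mem=min:128M,max:-256M,1G" and 4kB pages, parse_dom0_mem() leaves
 * dom0_min_nrpages = 32768, dom0_nrpages = 262144 and dom0_max_nrpages =
 * -65536; the negative maximum is later rebased by compute_dom0_nr_pages()
 * to "available pages - 65536".
 */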
85 static unsigned int opt_dom0_max_vcpus;
86 integer_param("dom0_max_vcpus", opt_dom0_max_vcpus);
88 static unsigned int opt_dom0_shadow;
89 boolean_param("dom0_shadow", opt_dom0_shadow);
91 static char opt_dom0_ioports_disable[200] = "";
92 string_param("dom0_ioports_disable", opt_dom0_ioports_disable);
94 #if defined(__i386__)
95 /* No ring-3 access in initial leaf page tables. */
96 #define L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED)
97 #define L2_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
98 #define L3_PROT (_PAGE_PRESENT)
99 #elif defined(__x86_64__)
100 /* Allow ring-3 access in long mode as guest cannot use ring 1 ... */
101 #define BASE_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_USER)
102 #define L1_PROT (BASE_PROT|_PAGE_GUEST_KERNEL)
103 /* ... except for compatibility mode guests. */
104 #define COMPAT_L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED)
105 #define L2_PROT (BASE_PROT|_PAGE_DIRTY)
106 #define L3_PROT (BASE_PROT|_PAGE_DIRTY)
107 #define L4_PROT (BASE_PROT|_PAGE_DIRTY)
108 #endif
110 #define round_pgup(_p) (((_p)+(PAGE_SIZE-1))&PAGE_MASK)
111 #define round_pgdown(_p) ((_p)&PAGE_MASK)
113 static struct page_info * __init alloc_chunk(
114 struct domain *d, unsigned long max_pages)
115 {
116 struct page_info *page;
117 unsigned int order;
118 /*
119 * Allocate up to 2MB at a time: It prevents allocating very large chunks
120 * from DMA pools before the >4GB pool is fully depleted.
121 */
122 if ( max_pages > (2UL << (20 - PAGE_SHIFT)) )
123 max_pages = 2UL << (20 - PAGE_SHIFT);
124 order = get_order_from_pages(max_pages);
125 if ( (max_pages & (max_pages-1)) != 0 )
126 order--;
127 while ( (page = alloc_domheap_pages(d, order, 0)) == NULL )
128 if ( order-- == 0 )
129 break;
130 return page;
131 }
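/*
 * Example (editor's note): a request for 300 pages stays below the 2MB cap
 * (512 pages); get_order_from_pages(300) yields order 9, and because 300 is
 * not a power of two the order is dropped to 8, so at most 256 pages come
 * back from this call and the caller simply asks again for the remainder.
 */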
133 static unsigned long __init compute_dom0_nr_pages(void)
134 {
135 unsigned long avail = avail_domheap_pages() + initial_images_nrpages();
137 /*
138 * If domain 0 allocation isn't specified, reserve 1/16th of available
139 * memory for things like DMA buffers. This reservation is clamped to
140 * a maximum of 128MB.
141 */
142 if ( dom0_nrpages == 0 )
143 {
144 dom0_nrpages = avail;
145 dom0_nrpages = min(dom0_nrpages / 16, 128L << (20 - PAGE_SHIFT));
146 dom0_nrpages = -dom0_nrpages;
147 }
149 /* Negative memory specification means "all memory - specified amount". */
150 if ( dom0_nrpages < 0 ) dom0_nrpages += avail;
151 if ( dom0_min_nrpages < 0 ) dom0_min_nrpages += avail;
152 if ( dom0_max_nrpages < 0 ) dom0_max_nrpages += avail;
154 /* Clamp dom0 memory according to min/max limits and available memory. */
155 dom0_nrpages = max(dom0_nrpages, dom0_min_nrpages);
156 dom0_nrpages = min(dom0_nrpages, dom0_max_nrpages);
157 dom0_nrpages = min(dom0_nrpages, (long)avail);
159 return dom0_nrpages;
160 }
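/*
 * Example (editor's note): with no dom0_mem= option and 2GB of available
 * memory (avail = 524288 pages of 4kB), the reservation is min(524288/16,
 * 32768) = 32768 pages, i.e. the full 128MB clamp, so dom0 is sized at
 * 524288 - 32768 = 491520 pages.
 */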
162 static void __init process_dom0_ioports_disable(void)
163 {
164 unsigned long io_from, io_to;
165 char *t, *s = opt_dom0_ioports_disable;
166 const char *u;
168 if ( *s == '\0' )
169 return;
171 while ( (t = strsep(&s, ",")) != NULL )
172 {
173 io_from = simple_strtoul(t, &u, 16);
174 if ( u == t )
175 {
176 parse_error:
177 printk("Invalid ioport range <%s> "
178 "in dom0_ioports_disable, skipping\n", t);
179 continue;
180 }
182 if ( *u == '\0' )
183 io_to = io_from;
184 else if ( *u == '-' )
185 io_to = simple_strtoul(u + 1, &u, 16);
186 else
187 goto parse_error;
189 if ( (*u != '\0') || (io_to < io_from) || (io_to >= 65536) )
190 goto parse_error;
192 printk("Disabling dom0 access to ioport range %04lx-%04lx\n",
193 io_from, io_to);
195 if ( ioports_deny_access(dom0, io_from, io_to) != 0 )
196 BUG();
197 }
198 }
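/*
 * Example (editor's note): "dom0_ioports_disable=02f8-02ff,04d0" revokes
 * dom0's access to the COM2 range 0x2f8-0x2ff and to the single port 0x4d0;
 * values are parsed as hex, and a range whose end precedes its start or
 * reaches beyond 0xffff is reported and skipped.
 */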
200 int __init construct_dom0(
201 struct domain *d,
202 unsigned long _image_base,
203 unsigned long _image_start, unsigned long image_len,
204 unsigned long _initrd_start, unsigned long initrd_len,
205 char *cmdline)
206 {
207 int i, rc, compatible, compat32, order, machine;
208 struct cpu_user_regs *regs;
209 unsigned long pfn, mfn;
210 unsigned long nr_pages;
211 unsigned long nr_pt_pages;
212 unsigned long alloc_spfn;
213 unsigned long alloc_epfn;
214 unsigned long count;
215 struct page_info *page = NULL;
216 start_info_t *si;
217 struct vcpu *v = d->vcpu[0];
218 unsigned long long value;
219 #if defined(__i386__)
220 char *image_base = (char *)_image_base; /* use lowmem mappings */
221 char *image_start = (char *)_image_start; /* use lowmem mappings */
222 char *initrd_start = (char *)_initrd_start; /* use lowmem mappings */
223 #elif defined(__x86_64__)
224 char *image_base = __va(_image_base);
225 char *image_start = __va(_image_start);
226 char *initrd_start = __va(_initrd_start);
227 #endif
228 #if CONFIG_PAGING_LEVELS >= 4
229 l4_pgentry_t *l4tab = NULL, *l4start = NULL;
230 #endif
231 l3_pgentry_t *l3tab = NULL, *l3start = NULL;
232 l2_pgentry_t *l2tab = NULL, *l2start = NULL;
233 l1_pgentry_t *l1tab = NULL, *l1start = NULL;
235 /*
236 * This fully describes the memory layout of the initial domain. All
237 * *_start addresses are page-aligned, except v_start (and v_end) which are
238 * superpage-aligned.
239 */
240 struct elf_binary elf;
241 struct elf_dom_parms parms;
242 unsigned long vkern_start;
243 unsigned long vkern_end;
244 unsigned long vinitrd_start;
245 unsigned long vinitrd_end;
246 unsigned long vphysmap_start;
247 unsigned long vphysmap_end;
248 unsigned long vstartinfo_start;
249 unsigned long vstartinfo_end;
250 unsigned long vstack_start;
251 unsigned long vstack_end;
252 unsigned long vpt_start;
253 unsigned long vpt_end;
254 unsigned long v_start;
255 unsigned long v_end;
257 /* Machine address of next candidate page-table page. */
258 unsigned long mpt_alloc;
260 /* Sanity! */
261 BUG_ON(d->domain_id != 0);
262 BUG_ON(d->vcpu[0] == NULL);
263 BUG_ON(v->is_initialised);
265 printk("*** LOADING DOMAIN 0 ***\n");
267 d->max_pages = ~0U;
269 nr_pages = compute_dom0_nr_pages();
271 if ( (rc = bzimage_parse(image_base, &image_start, &image_len)) != 0 )
272 return rc;
274 if ( (rc = elf_init(&elf, image_start, image_len)) != 0 )
275 return rc;
276 #ifdef VERBOSE
277 elf_set_verbose(&elf);
278 #endif
279 elf_parse_binary(&elf);
280 if ( (rc = elf_xen_parse(&elf, &parms)) != 0 )
281 return rc;
283 /* compatibility check */
284 compatible = 0;
285 compat32 = 0;
286 machine = elf_uval(&elf, elf.ehdr, e_machine);
287 switch (CONFIG_PAGING_LEVELS) {
288 case 3: /* x86_32p */
289 if (parms.pae == PAEKERN_bimodal)
290 parms.pae = PAEKERN_extended_cr3;
291 printk(" Xen kernel: 32-bit, PAE, lsb\n");
292 if (elf_32bit(&elf) && parms.pae && machine == EM_386)
293 compatible = 1;
294 break;
295 case 4: /* x86_64 */
296 printk(" Xen kernel: 64-bit, lsb, compat32\n");
297 if (elf_32bit(&elf) && parms.pae == PAEKERN_bimodal)
298 parms.pae = PAEKERN_extended_cr3;
299 if (elf_32bit(&elf) && parms.pae && machine == EM_386)
300 {
301 compat32 = 1;
302 compatible = 1;
303 }
304 if (elf_64bit(&elf) && machine == EM_X86_64)
305 compatible = 1;
306 break;
307 }
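/*
 * Editor's note on the switch above: a 64-bit (4-level) hypervisor accepts
 * either a 64-bit EM_X86_64 dom0 kernel or a 32-bit PAE EM_386 kernel run
 * as a compat32 guest; a 32-bit PAE (3-level) hypervisor accepts only a
 * 32-bit PAE kernel. Anything else fails the compatibility check below.
 */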
308 printk(" Dom0 kernel: %s%s, %s, paddr 0x%" PRIx64 " -> 0x%" PRIx64 "\n",
309 elf_64bit(&elf) ? "64-bit" : "32-bit",
310 parms.pae ? ", PAE" : "",
311 elf_msb(&elf) ? "msb" : "lsb",
312 elf.pstart, elf.pend);
313 if ( elf.bsd_symtab_pstart )
314 printk(" Dom0 symbol map 0x%" PRIx64 " -> 0x%" PRIx64 "\n",
315 elf.bsd_symtab_pstart, elf.bsd_symtab_pend);
317 if ( !compatible )
318 {
319 printk("Mismatch between Xen and DOM0 kernel\n");
320 return -EINVAL;
321 }
323 #if defined(__x86_64__)
324 if ( compat32 )
325 {
326 d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 1;
327 v->vcpu_info = (void *)&d->shared_info->compat.vcpu_info[0];
329 if ( nr_pages != (unsigned int)nr_pages )
330 nr_pages = UINT_MAX;
331 }
332 #endif
334 if ( parms.pae == PAEKERN_extended_cr3 )
335 set_bit(VMASST_TYPE_pae_extended_cr3, &d->vm_assist);
337 if ( (parms.virt_hv_start_low != UNSET_ADDR) && elf_32bit(&elf) )
338 {
339 unsigned long mask = (1UL << L2_PAGETABLE_SHIFT) - 1;
340 value = (parms.virt_hv_start_low + mask) & ~mask;
341 BUG_ON(!is_pv_32bit_domain(d));
342 #if defined(__i386__)
343 if ( value > HYPERVISOR_VIRT_START )
344 panic("Domain 0 expects too high a hypervisor start address.\n");
345 #else
346 if ( value > __HYPERVISOR_COMPAT_VIRT_START )
347 panic("Domain 0 expects too high a hypervisor start address.\n");
348 HYPERVISOR_COMPAT_VIRT_START(d) =
349 max_t(unsigned int, m2p_compat_vstart, value);
350 #endif
351 }
353 if ( (parms.p2m_base != UNSET_ADDR) && elf_32bit(&elf) )
354 {
355 printk(XENLOG_WARNING "P2M table base ignored\n");
356 parms.p2m_base = UNSET_ADDR;
357 }
359 domain_set_alloc_bitsize(d);
361 /*
362 * Why do we need this? The number of page-table frames depends on the
363 * size of the bootstrap address space. But the size of the address space
364 * depends on the number of page-table frames (since each one is mapped
365 * read-only). We have a pair of simultaneous equations in two unknowns,
366 * which we solve by exhaustive search.
367 */
368 v_start = parms.virt_base;
369 vkern_start = parms.virt_kstart;
370 vkern_end = parms.virt_kend;
371 vinitrd_start = round_pgup(vkern_end);
372 vinitrd_end = vinitrd_start + initrd_len;
373 vphysmap_start = round_pgup(vinitrd_end);
374 vphysmap_end = vphysmap_start + (nr_pages * (!is_pv_32on64_domain(d) ?
375 sizeof(unsigned long) :
376 sizeof(unsigned int)));
377 if ( parms.p2m_base != UNSET_ADDR )
378 vphysmap_end = vphysmap_start;
379 vstartinfo_start = round_pgup(vphysmap_end);
380 vstartinfo_end = (vstartinfo_start +
381 sizeof(struct start_info) +
382 sizeof(struct dom0_vga_console_info));
383 vpt_start = round_pgup(vstartinfo_end);
384 for ( nr_pt_pages = 2; ; nr_pt_pages++ )
385 {
386 vpt_end = vpt_start + (nr_pt_pages * PAGE_SIZE);
387 vstack_start = vpt_end;
388 vstack_end = vstack_start + PAGE_SIZE;
389 v_end = (vstack_end + (1UL<<22)-1) & ~((1UL<<22)-1);
390 if ( (v_end - vstack_end) < (512UL << 10) )
391 v_end += 1UL << 22; /* Add extra 4MB to get >= 512kB padding. */
392 #if defined(__i386__)
393 /* 5 pages: 1x 3rd + 4x 2nd level */
394 if ( (((v_end - v_start + ((1UL<<L2_PAGETABLE_SHIFT)-1)) >>
395 L2_PAGETABLE_SHIFT) + 5) <= nr_pt_pages )
396 break;
397 #elif defined(__x86_64__)
398 #define NR(_l,_h,_s) \
399 (((((_h) + ((1UL<<(_s))-1)) & ~((1UL<<(_s))-1)) - \
400 ((_l) & ~((1UL<<(_s))-1))) >> (_s))
401 if ( (1 + /* # L4 */
402 NR(v_start, v_end, L4_PAGETABLE_SHIFT) + /* # L3 */
403 (!is_pv_32on64_domain(d) ?
404 NR(v_start, v_end, L3_PAGETABLE_SHIFT) : /* # L2 */
405 4) + /* # compat L2 */
406 NR(v_start, v_end, L2_PAGETABLE_SHIFT)) /* # L1 */
407 <= nr_pt_pages )
408 break;
409 #endif
410 }
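/*
 * Worked example (editor's note), 64-bit non-compat case: for a 12MB
 * bootstrap region NR() counts the page-table pages each level needs to
 * cover [v_start, v_end): 1 L4 page, 1 L3 page (one 512GB slot), 1 L2 page
 * (one 1GB slot) and 6 L1 pages (one per 2MB), i.e. 9 frames in total; the
 * loop grows nr_pt_pages until it is at least this estimate for the
 * correspondingly enlarged address range.
 */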
412 order = get_order_from_bytes(v_end - v_start);
413 if ( (1UL << order) > nr_pages )
414 panic("Domain 0 allocation is too small for kernel image.\n");
416 #ifdef __i386__
417 /* Ensure that our low-memory 1:1 mapping covers the allocation. */
418 page = alloc_domheap_pages(d, order, MEMF_bits(30));
419 #else
420 if ( parms.p2m_base != UNSET_ADDR )
421 {
422 vphysmap_start = parms.p2m_base;
423 vphysmap_end = vphysmap_start + nr_pages * sizeof(unsigned long);
424 }
425 page = alloc_domheap_pages(d, order, 0);
426 #endif
427 if ( page == NULL )
428 panic("Not enough RAM for domain 0 allocation.\n");
429 alloc_spfn = page_to_mfn(page);
430 alloc_epfn = alloc_spfn + d->tot_pages;
432 printk("PHYSICAL MEMORY ARRANGEMENT:\n"
433 " Dom0 alloc.: %"PRIpaddr"->%"PRIpaddr,
434 pfn_to_paddr(alloc_spfn), pfn_to_paddr(alloc_epfn));
435 if ( d->tot_pages < nr_pages )
436 printk(" (%lu pages to be allocated)",
437 nr_pages - d->tot_pages);
438 printk("\nVIRTUAL MEMORY ARRANGEMENT:\n"
439 " Loaded kernel: %p->%p\n"
440 " Init. ramdisk: %p->%p\n"
441 " Phys-Mach map: %p->%p\n"
442 " Start info: %p->%p\n"
443 " Page tables: %p->%p\n"
444 " Boot stack: %p->%p\n"
445 " TOTAL: %p->%p\n",
446 _p(vkern_start), _p(vkern_end),
447 _p(vinitrd_start), _p(vinitrd_end),
448 _p(vphysmap_start), _p(vphysmap_end),
449 _p(vstartinfo_start), _p(vstartinfo_end),
450 _p(vpt_start), _p(vpt_end),
451 _p(vstack_start), _p(vstack_end),
452 _p(v_start), _p(v_end));
453 printk(" ENTRY ADDRESS: %p\n", _p(parms.virt_entry));
455 mpt_alloc = (vpt_start - v_start) +
456 (unsigned long)pfn_to_paddr(alloc_spfn);
458 #if defined(__i386__)
459 /*
460 * Protect the lowest 1GB of memory. We use a temporary mapping there
461 * from which we copy the kernel and ramdisk images.
462 */
463 if ( v_start < (1UL<<30) )
464 {
465 printk("Initial loading isn't allowed to lowest 1GB of memory.\n");
466 return -EINVAL;
467 }
469 /* WARNING: The new domain must have its 'processor' field filled in! */
470 l3start = l3tab = (l3_pgentry_t *)mpt_alloc; mpt_alloc += PAGE_SIZE;
471 l2start = l2tab = (l2_pgentry_t *)mpt_alloc; mpt_alloc += 4*PAGE_SIZE;
472 for (i = 0; i < L3_PAGETABLE_ENTRIES; i++) {
473 copy_page(l2tab + i * L2_PAGETABLE_ENTRIES,
474 idle_pg_table_l2 + i * L2_PAGETABLE_ENTRIES);
475 l3tab[i] = l3e_from_paddr((u32)l2tab + i*PAGE_SIZE, L3_PROT);
476 l2tab[(LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT)+i] =
477 l2e_from_paddr((u32)l2tab + i*PAGE_SIZE, __PAGE_HYPERVISOR);
478 }
479 v->arch.guest_table = pagetable_from_paddr((unsigned long)l3start);
481 for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
482 l2tab[l2_linear_offset(PERDOMAIN_VIRT_START) + i] =
483 l2e_from_page(perdomain_pt_page(d, i), __PAGE_HYPERVISOR);
485 l2tab += l2_linear_offset(v_start);
486 mfn = alloc_spfn;
487 for ( count = 0; count < ((v_end-v_start)>>PAGE_SHIFT); count++ )
488 {
489 if ( !((unsigned long)l1tab & (PAGE_SIZE-1)) )
490 {
491 l1start = l1tab = (l1_pgentry_t *)mpt_alloc;
492 mpt_alloc += PAGE_SIZE;
493 *l2tab = l2e_from_paddr((unsigned long)l1start, L2_PROT);
494 l2tab++;
495 clear_page(l1tab);
496 if ( count == 0 )
497 l1tab += l1_table_offset(v_start);
498 }
499 *l1tab = l1e_from_pfn(mfn, L1_PROT);
500 l1tab++;
502 page = mfn_to_page(mfn);
503 if ( !get_page_and_type(page, d, PGT_writable_page) )
504 BUG();
506 mfn++;
507 }
509 /* Pages that are part of page tables must be read only. */
510 l2tab = l2start + l2_linear_offset(vpt_start);
511 l1start = l1tab = (l1_pgentry_t *)(u32)l2e_get_paddr(*l2tab);
512 l1tab += l1_table_offset(vpt_start);
513 for ( count = 0; count < nr_pt_pages; count++ )
514 {
515 page = mfn_to_page(l1e_get_pfn(*l1tab));
516 if ( !opt_dom0_shadow )
517 l1e_remove_flags(*l1tab, _PAGE_RW);
518 else
519 if ( !get_page_type(page, PGT_writable_page) )
520 BUG();
522 switch ( count )
523 {
524 case 0:
525 page->u.inuse.type_info &= ~PGT_type_mask;
526 page->u.inuse.type_info |= PGT_l3_page_table;
527 get_page(page, d); /* an extra ref because of readable mapping */
529 /* Get another ref to L3 page so that it can be pinned. */
530 page->u.inuse.type_info++;
531 page->count_info++;
532 set_bit(_PGT_pinned, &page->u.inuse.type_info);
533 break;
534 case 1 ... 4:
535 page->u.inuse.type_info &= ~PGT_type_mask;
536 page->u.inuse.type_info |= PGT_l2_page_table;
537 if ( count == 4 )
538 page->u.inuse.type_info |= PGT_pae_xen_l2;
539 get_page(page, d); /* an extra ref because of readable mapping */
540 break;
541 default:
542 page->u.inuse.type_info &= ~PGT_type_mask;
543 page->u.inuse.type_info |= PGT_l1_page_table;
544 get_page(page, d); /* an extra ref because of readable mapping */
545 break;
546 }
547 if ( !((unsigned long)++l1tab & (PAGE_SIZE - 1)) )
548 l1start = l1tab = (l1_pgentry_t *)(u32)l2e_get_paddr(*++l2tab);
549 }
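/*
 * Editor's note on the loop above (32-bit PAE case): iteration 0 retypes
 * the L3 page and pins it, iterations 1-4 retype the four L2 pages (the
 * fourth additionally gets PGT_pae_xen_l2 for the Xen-owned slots), and
 * every later iteration retypes an L1 page; each also takes an extra
 * reference for the read-only mapping just established.
 */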
551 #elif defined(__x86_64__)
553 /* Overlap with Xen protected area? */
554 if ( !is_pv_32on64_domain(d) ?
555 ((v_start < HYPERVISOR_VIRT_END) &&
556 (v_end > HYPERVISOR_VIRT_START)) :
557 (v_end > HYPERVISOR_COMPAT_VIRT_START(d)) )
558 {
559 printk("DOM0 image overlaps with Xen private area.\n");
560 return -EINVAL;
561 }
563 if ( is_pv_32on64_domain(d) )
564 {
565 v->arch.guest_context.failsafe_callback_cs = FLAT_COMPAT_KERNEL_CS;
566 v->arch.guest_context.event_callback_cs = FLAT_COMPAT_KERNEL_CS;
567 }
569 /* WARNING: The new domain must have its 'processor' field filled in! */
570 if ( !is_pv_32on64_domain(d) )
571 {
572 maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l4_page_table;
573 l4start = l4tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
574 }
575 else
576 {
577 page = alloc_domheap_page(NULL, 0);
578 if ( !page )
579 panic("Not enough RAM for domain 0 PML4.\n");
580 page->u.inuse.type_info = PGT_l4_page_table|PGT_validated|1;
581 l4start = l4tab = page_to_virt(page);
582 }
583 copy_page(l4tab, idle_pg_table);
584 l4tab[0] = l4e_empty(); /* zap trampoline mapping */
585 l4tab[l4_table_offset(LINEAR_PT_VIRT_START)] =
586 l4e_from_paddr(__pa(l4start), __PAGE_HYPERVISOR);
587 l4tab[l4_table_offset(PERDOMAIN_VIRT_START)] =
588 l4e_from_paddr(__pa(d->arch.mm_perdomain_l3), __PAGE_HYPERVISOR);
589 v->arch.guest_table = pagetable_from_paddr(__pa(l4start));
590 if ( is_pv_32on64_domain(d) )
591 v->arch.guest_table_user = v->arch.guest_table;
593 l4tab += l4_table_offset(v_start);
594 mfn = alloc_spfn;
595 for ( count = 0; count < ((v_end-v_start)>>PAGE_SHIFT); count++ )
596 {
597 if ( !((unsigned long)l1tab & (PAGE_SIZE-1)) )
598 {
599 maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l1_page_table;
600 l1start = l1tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
601 clear_page(l1tab);
602 if ( count == 0 )
603 l1tab += l1_table_offset(v_start);
604 if ( !((unsigned long)l2tab & (PAGE_SIZE-1)) )
605 {
606 maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l2_page_table;
607 l2start = l2tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
608 clear_page(l2tab);
609 if ( count == 0 )
610 l2tab += l2_table_offset(v_start);
611 if ( !((unsigned long)l3tab & (PAGE_SIZE-1)) )
612 {
613 maddr_to_page(mpt_alloc)->u.inuse.type_info =
614 PGT_l3_page_table;
615 l3start = l3tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
616 clear_page(l3tab);
617 if ( count == 0 )
618 l3tab += l3_table_offset(v_start);
619 *l4tab = l4e_from_paddr(__pa(l3start), L4_PROT);
620 l4tab++;
621 }
622 *l3tab = l3e_from_paddr(__pa(l2start), L3_PROT);
623 l3tab++;
624 }
625 *l2tab = l2e_from_paddr(__pa(l1start), L2_PROT);
626 l2tab++;
627 }
628 *l1tab = l1e_from_pfn(mfn, (!is_pv_32on64_domain(d) ?
629 L1_PROT : COMPAT_L1_PROT));
630 l1tab++;
632 page = mfn_to_page(mfn);
633 if ( (page->u.inuse.type_info == 0) &&
634 !get_page_and_type(page, d, PGT_writable_page) )
635 BUG();
637 mfn++;
638 }
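/*
 * Editor's note on the loop above: one L1 entry is written per page of the
 * bootstrap region, and whenever the l1/l2/l3 cursor crosses a page
 * boundary a fresh table page is taken from mpt_alloc and wired into the
 * level above, so L1, L2 and L3 tables (and L4 slots) are created lazily
 * exactly when the virtual cursor crosses a 2MB, 1GB or 512GB boundary.
 */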
640 if ( is_pv_32on64_domain(d) )
641 {
642 /* Ensure the first four L3 entries are all populated. */
643 for ( i = 0, l3tab = l3start; i < 4; ++i, ++l3tab )
644 {
645 if ( !l3e_get_intpte(*l3tab) )
646 {
647 maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l2_page_table;
648 l2tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
649 clear_page(l2tab);
650 *l3tab = l3e_from_paddr(__pa(l2tab), L3_PROT);
651 }
652 if ( i == 3 )
653 l3e_get_page(*l3tab)->u.inuse.type_info |= PGT_pae_xen_l2;
654 }
655 /* Install read-only guest visible MPT mapping. */
656 l2tab = l3e_to_l2e(l3start[3]);
657 memcpy(&l2tab[COMPAT_L2_PAGETABLE_FIRST_XEN_SLOT(d)],
658 &compat_idle_pg_table_l2[l2_table_offset(HIRO_COMPAT_MPT_VIRT_START)],
659 COMPAT_L2_PAGETABLE_XEN_SLOTS(d) * sizeof(*l2tab));
660 }
662 /* Pages that are part of page tables must be read only. */
663 l4tab = l4start + l4_table_offset(vpt_start);
664 l3start = l3tab = l4e_to_l3e(*l4tab);
665 l3tab += l3_table_offset(vpt_start);
666 l2start = l2tab = l3e_to_l2e(*l3tab);
667 l2tab += l2_table_offset(vpt_start);
668 l1start = l1tab = l2e_to_l1e(*l2tab);
669 l1tab += l1_table_offset(vpt_start);
670 for ( count = 0; count < nr_pt_pages; count++ )
671 {
672 l1e_remove_flags(*l1tab, _PAGE_RW);
673 page = mfn_to_page(l1e_get_pfn(*l1tab));
675 /* Read-only mapping + PGC_allocated + page-table page. */
676 page->count_info = PGC_allocated | 3;
677 page->u.inuse.type_info |= PGT_validated | 1;
679 /* Top-level p.t. is pinned. */
680 if ( (page->u.inuse.type_info & PGT_type_mask) ==
681 (!is_pv_32on64_domain(d) ?
682 PGT_l4_page_table : PGT_l3_page_table) )
683 {
684 page->count_info += 1;
685 page->u.inuse.type_info += 1 | PGT_pinned;
686 }
688 /* Iterate. */
689 if ( !((unsigned long)++l1tab & (PAGE_SIZE - 1)) )
690 {
691 if ( !((unsigned long)++l2tab & (PAGE_SIZE - 1)) )
692 {
693 if ( !((unsigned long)++l3tab & (PAGE_SIZE - 1)) )
694 l3start = l3tab = l4e_to_l3e(*++l4tab);
695 l2start = l2tab = l3e_to_l2e(*l3tab);
696 }
697 l1start = l1tab = l2e_to_l1e(*l2tab);
698 }
699 }
701 #endif /* __x86_64__ */
703 /* Mask all upcalls... */
704 for ( i = 0; i < MAX_VIRT_CPUS; i++ )
705 shared_info(d, vcpu_info[i].evtchn_upcall_mask) = 1;
707 if ( opt_dom0_max_vcpus == 0 )
708 opt_dom0_max_vcpus = num_online_cpus();
709 if ( opt_dom0_max_vcpus > MAX_VIRT_CPUS )
710 opt_dom0_max_vcpus = MAX_VIRT_CPUS;
711 printk("Dom0 has maximum %u VCPUs\n", opt_dom0_max_vcpus);
713 for ( i = 1; i < opt_dom0_max_vcpus; i++ )
714 (void)alloc_vcpu(d, i, i % num_online_cpus());
716 /* Set up CR3 value for write_ptbase */
717 if ( paging_mode_enabled(d) )
718 paging_update_paging_modes(v);
719 else
720 update_cr3(v);
722 /* We run on dom0's page tables for the final part of the build process. */
723 write_ptbase(v);
725 /* Copy the OS image and free temporary buffer. */
726 elf.dest = (void*)vkern_start;
727 elf_load_binary(&elf);
729 if ( UNSET_ADDR != parms.virt_hypercall )
730 {
731 if ( (parms.virt_hypercall < v_start) ||
732 (parms.virt_hypercall >= v_end) )
733 {
734 write_ptbase(current);
735 printk("Invalid HYPERCALL_PAGE field in ELF notes.\n");
736 return -1;
737 }
738 hypercall_page_initialise(
739 d, (void *)(unsigned long)parms.virt_hypercall);
740 }
742 /* Copy the initial ramdisk. */
743 if ( initrd_len != 0 )
744 memcpy((void *)vinitrd_start, initrd_start, initrd_len);
746 /* Free temporary buffers. */
747 discard_initial_images();
749 /* Set up start info area. */
750 si = (start_info_t *)vstartinfo_start;
751 clear_page(si);
752 si->nr_pages = nr_pages;
754 si->shared_info = virt_to_maddr(d->shared_info);
756 si->flags = SIF_PRIVILEGED | SIF_INITDOMAIN;
757 si->flags |= (xen_processor_pmbits << 8) & SIF_PM_MASK;
758 si->pt_base = vpt_start + 2 * PAGE_SIZE * !!is_pv_32on64_domain(d);
759 si->nr_pt_frames = nr_pt_pages;
760 si->mfn_list = vphysmap_start;
761 snprintf(si->magic, sizeof(si->magic), "xen-3.0-x86_%d%s",
762 elf_64bit(&elf) ? 64 : 32, parms.pae ? "p" : "");
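/*
 * Example (editor's note): a 64-bit dom0 ends up with si->magic =
 * "xen-3.0-x86_64" and a 32-bit PAE dom0 with "xen-3.0-x86_32p"; guest
 * kernels typically sanity-check this string before trusting the rest of
 * the start_info layout.
 */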
764 count = d->tot_pages;
765 #ifdef __x86_64__
766 /* Set up the phys->machine table if not part of the initial mapping. */
767 if ( parms.p2m_base != UNSET_ADDR )
768 {
769 unsigned long va = vphysmap_start;
771 if ( v_start <= vphysmap_end && vphysmap_start <= v_end )
772 panic("DOM0 P->M table overlaps initial mapping");
774 while ( va < vphysmap_end )
775 {
776 if ( d->tot_pages + ((round_pgup(vphysmap_end) - va)
777 >> PAGE_SHIFT) + 3 > nr_pages )
778 panic("Dom0 allocation too small for initial P->M table.\n");
780 l4tab = l4start + l4_table_offset(va);
781 if ( !l4e_get_intpte(*l4tab) )
782 {
783 page = alloc_domheap_page(d, 0);
784 if ( !page )
785 break;
786 /* No mapping, PGC_allocated + page-table page. */
787 page->count_info = PGC_allocated | 2;
788 page->u.inuse.type_info =
789 PGT_l3_page_table | PGT_validated | 1;
790 clear_page(page_to_virt(page));
791 *l4tab = l4e_from_page(page, L4_PROT);
792 }
793 l3tab = page_to_virt(l4e_get_page(*l4tab));
794 l3tab += l3_table_offset(va);
795 if ( !l3e_get_intpte(*l3tab) )
796 {
797 if ( cpu_has_page1gb &&
798 !(va & ((1UL << L3_PAGETABLE_SHIFT) - 1)) &&
799 vphysmap_end >= va + (1UL << L3_PAGETABLE_SHIFT) &&
800 (page = alloc_domheap_pages(d,
801 L3_PAGETABLE_SHIFT -
802 PAGE_SHIFT,
803 0)) != NULL )
804 {
805 *l3tab = l3e_from_page(page,
806 L1_PROT|_PAGE_DIRTY|_PAGE_PSE);
807 va += 1UL << L3_PAGETABLE_SHIFT;
808 continue;
809 }
810 if ( (page = alloc_domheap_page(d, 0)) == NULL )
811 break;
812 else
813 {
814 /* No mapping, PGC_allocated + page-table page. */
815 page->count_info = PGC_allocated | 2;
816 page->u.inuse.type_info =
817 PGT_l2_page_table | PGT_validated | 1;
818 clear_page(page_to_virt(page));
819 *l3tab = l3e_from_page(page, L3_PROT);
820 }
821 }
822 l2tab = page_to_virt(l3e_get_page(*l3tab));
823 l2tab += l2_table_offset(va);
824 if ( !l2e_get_intpte(*l2tab) )
825 {
826 if ( !(va & ((1UL << L2_PAGETABLE_SHIFT) - 1)) &&
827 vphysmap_end >= va + (1UL << L2_PAGETABLE_SHIFT) &&
828 (page = alloc_domheap_pages(d,
829 L2_PAGETABLE_SHIFT -
830 PAGE_SHIFT,
831 0)) != NULL )
832 {
833 *l2tab = l2e_from_page(page,
834 L1_PROT|_PAGE_DIRTY|_PAGE_PSE);
835 va += 1UL << L2_PAGETABLE_SHIFT;
836 continue;
837 }
838 if ( (page = alloc_domheap_page(d, 0)) == NULL )
839 break;
840 else
841 {
842 /* No mapping, PGC_allocated + page-table page. */
843 page->count_info = PGC_allocated | 2;
844 page->u.inuse.type_info =
845 PGT_l1_page_table | PGT_validated | 1;
846 clear_page(page_to_virt(page));
847 *l2tab = l2e_from_page(page, L2_PROT);
848 }
849 }
850 l1tab = page_to_virt(l2e_get_page(*l2tab));
851 l1tab += l1_table_offset(va);
852 BUG_ON(l1e_get_intpte(*l1tab));
853 page = alloc_domheap_page(d, 0);
854 if ( !page )
855 break;
856 *l1tab = l1e_from_page(page, L1_PROT|_PAGE_DIRTY);
857 va += PAGE_SIZE;
858 va &= PAGE_MASK;
859 }
860 if ( !page )
861 panic("Not enough RAM for DOM0 P->M table.\n");
862 }
863 #endif
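/*
 * Editor's note on the block above: the standalone P->M table is populated
 * front to back; wherever the remaining range is suitably aligned and large
 * enough, a 1GB (if cpu_has_page1gb) or 2MB superpage mapping with
 * _PAGE_PSE is installed, and only where that is not possible are
 * intermediate table pages and individual 4kB leaf pages allocated one at
 * a time.
 */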
865 /* Write the phys->machine and machine->phys table entries. */
866 for ( pfn = 0; pfn < count; pfn++ )
867 {
868 mfn = pfn + alloc_spfn;
869 #ifndef NDEBUG
870 #define REVERSE_START ((v_end - v_start) >> PAGE_SHIFT)
871 if ( pfn > REVERSE_START )
872 mfn = alloc_epfn - (pfn - REVERSE_START);
873 #endif
874 if ( !is_pv_32on64_domain(d) )
875 ((unsigned long *)vphysmap_start)[pfn] = mfn;
876 else
877 ((unsigned int *)vphysmap_start)[pfn] = mfn;
878 set_gpfn_from_mfn(mfn, pfn);
879 }
880 si->first_p2m_pfn = pfn;
881 si->nr_p2m_frames = d->tot_pages - count;
882 page_list_for_each ( page, &d->page_list )
883 {
884 mfn = page_to_mfn(page);
885 if ( get_gpfn_from_mfn(mfn) >= count )
886 {
887 BUG_ON(is_pv_32bit_domain(d));
888 if ( !page->u.inuse.type_info &&
889 !get_page_and_type(page, d, PGT_writable_page) )
890 BUG();
891 ((unsigned long *)vphysmap_start)[pfn] = mfn;
892 set_gpfn_from_mfn(mfn, pfn);
893 ++pfn;
894 #ifndef NDEBUG
895 ++alloc_epfn;
896 #endif
897 }
898 }
899 BUG_ON(pfn != d->tot_pages);
900 while ( pfn < nr_pages )
901 {
902 if ( (page = alloc_chunk(d, nr_pages - d->tot_pages)) == NULL )
903 panic("Not enough RAM for DOM0 reservation.\n");
904 while ( pfn < d->tot_pages )
905 {
906 mfn = page_to_mfn(page);
907 #ifndef NDEBUG
908 #define pfn (nr_pages - 1 - (pfn - (alloc_epfn - alloc_spfn)))
909 #endif
910 if ( !is_pv_32on64_domain(d) )
911 ((unsigned long *)vphysmap_start)[pfn] = mfn;
912 else
913 ((unsigned int *)vphysmap_start)[pfn] = mfn;
914 set_gpfn_from_mfn(mfn, pfn);
915 #undef pfn
916 page++; pfn++;
917 }
918 }
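/*
 * Editor's note: in debug (!NDEBUG) builds the loops above deliberately
 * scramble the pfn-to-mfn relationship (pages past REVERSE_START are mapped
 * downwards from alloc_epfn, and the temporary "#define pfn" fills the tail
 * of the P->M table in reverse order), presumably to flush out guests that
 * wrongly assume a linear pseudo-physical to machine mapping.
 */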
920 if ( initrd_len != 0 )
921 {
922 si->mod_start = vinitrd_start;
923 si->mod_len = initrd_len;
924 }
926 memset(si->cmd_line, 0, sizeof(si->cmd_line));
927 if ( cmdline != NULL )
928 strlcpy((char *)si->cmd_line, cmdline, sizeof(si->cmd_line));
930 if ( fill_console_start_info((void *)(si + 1)) )
931 {
932 si->console.dom0.info_off = sizeof(struct start_info);
933 si->console.dom0.info_size = sizeof(struct dom0_vga_console_info);
934 }
936 #if defined(__x86_64__)
937 if ( is_pv_32on64_domain(d) )
938 xlat_start_info(si, XLAT_start_info_console_dom0);
939 #endif
941 /* Return to idle domain's page tables. */
942 write_ptbase(current);
944 #if defined(__i386__)
945 /* Destroy low mappings - they were only for our convenience. */
946 zap_low_mappings(l2start);
947 #endif
949 update_domain_wallclock_time(d);
951 v->is_initialised = 1;
952 clear_bit(_VPF_down, &v->pause_flags);
954 /*
955 * Initial register values:
956 * DS,ES,FS,GS = FLAT_KERNEL_DS
957 * CS:EIP = FLAT_KERNEL_CS:start_pc
958 * SS:ESP = FLAT_KERNEL_SS:start_stack
959 * ESI = start_info
960 * [EAX,EBX,ECX,EDX,EDI,EBP are zero]
961 */
962 regs = &v->arch.guest_context.user_regs;
963 regs->ds = regs->es = regs->fs = regs->gs =
964 !is_pv_32on64_domain(d) ? FLAT_KERNEL_DS : FLAT_COMPAT_KERNEL_DS;
965 regs->ss = (!is_pv_32on64_domain(d) ?
966 FLAT_KERNEL_SS : FLAT_COMPAT_KERNEL_SS);
967 regs->cs = (!is_pv_32on64_domain(d) ?
968 FLAT_KERNEL_CS : FLAT_COMPAT_KERNEL_CS);
969 regs->eip = parms.virt_entry;
970 regs->esp = vstack_end;
971 regs->esi = vstartinfo_start;
972 regs->eflags = X86_EFLAGS_IF;
974 if ( opt_dom0_shadow )
975 if ( paging_enable(d, PG_SH_enable) == 0 )
976 paging_update_paging_modes(v);
978 if ( supervisor_mode_kernel )
979 {
980 v->arch.guest_context.kernel_ss &= ~3;
981 v->arch.guest_context.user_regs.ss &= ~3;
982 v->arch.guest_context.user_regs.es &= ~3;
983 v->arch.guest_context.user_regs.ds &= ~3;
984 v->arch.guest_context.user_regs.fs &= ~3;
985 v->arch.guest_context.user_regs.gs &= ~3;
986 printk("Dom0 runs in ring 0 (supervisor mode)\n");
987 if ( !test_bit(XENFEAT_supervisor_mode_kernel,
988 parms.f_supported) )
989 panic("Dom0 does not support supervisor-mode execution\n");
990 }
991 else
992 {
993 if ( test_bit(XENFEAT_supervisor_mode_kernel, parms.f_required) )
994 panic("Dom0 requires supervisor-mode execution\n");
995 }
997 rc = 0;
999 /* DOM0 is permitted full I/O capabilities. */
1000 rc |= ioports_permit_access(dom0, 0, 0xFFFF);
1001 rc |= iomem_permit_access(dom0, 0UL, ~0UL);
1002 rc |= irqs_permit_access(dom0, 0, d->nr_pirqs - 1);
1004 /*
1005 * Modify I/O port access permissions.
1006 */
1007 /* Master Interrupt Controller (PIC). */
1008 rc |= ioports_deny_access(dom0, 0x20, 0x21);
1009 /* Slave Interrupt Controller (PIC). */
1010 rc |= ioports_deny_access(dom0, 0xA0, 0xA1);
1011 /* Interval Timer (PIT). */
1012 rc |= ioports_deny_access(dom0, 0x40, 0x43);
1013 /* PIT Channel 2 / PC Speaker Control. */
1014 rc |= ioports_deny_access(dom0, 0x61, 0x61);
1015 /* PCI configuration space (NB. 0xcf8 has special treatment). */
1016 rc |= ioports_deny_access(dom0, 0xcfc, 0xcff);
1017 /* Command-line I/O ranges. */
1018 process_dom0_ioports_disable();
1020 /*
1021 * Modify I/O memory access permissions.
1022 */
1023 /* Local APIC. */
1024 if ( mp_lapic_addr != 0 )
1025 {
1026 mfn = paddr_to_pfn(mp_lapic_addr);
1027 rc |= iomem_deny_access(dom0, mfn, mfn);
1028 }
1029 /* I/O APICs. */
1030 for ( i = 0; i < nr_ioapics; i++ )
1031 {
1032 mfn = paddr_to_pfn(mp_ioapics[i].mpc_apicaddr);
1033 if ( smp_found_config )
1034 rc |= iomem_deny_access(dom0, mfn, mfn);
1035 }
1037 /* Remove access to E820_UNUSABLE I/O regions above 1MB. */
1038 for ( i = 0; i < e820.nr_map; i++ )
1039 {
1040 unsigned long sfn, efn;
1041 sfn = max_t(unsigned long, paddr_to_pfn(e820.map[i].addr), 0x100ul);
1042 efn = paddr_to_pfn(e820.map[i].addr + e820.map[i].size - 1);
1043 if ( (e820.map[i].type == E820_UNUSABLE) &&
1044 (e820.map[i].size != 0) &&
1045 (sfn <= efn) )
1046 rc |= iomem_deny_access(dom0, sfn, efn);
1047 }
1049 BUG_ON(rc != 0);
1051 return 0;
1052 }
1054 /*
1055 * Local variables:
1056 * mode: C
1057 * c-set-style: "BSD"
1058 * c-basic-offset: 4
1059 * tab-width: 4
1060 * indent-tabs-mode: nil
1061 * End:
1062 */