ia64/xen-unstable

view xen/arch/x86/domain_build.c @ 14103:ee4850bc895b

xen memory allocator: remove bit width restrictions

Hide the (default or user-specified) DMA width from anything outside
the heap allocator. I/O-capable guests can now request any width for
the memory they want exchanged/added.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
author kfraser@localhost.localdomain
date Fri Feb 23 17:02:58 2007 +0000 (2007-02-23)
parents 96d08345f1c5
children 1e5a83fb928b
line source
1 /******************************************************************************
2 * domain_build.c
3 *
4 * Copyright (c) 2002-2005, K A Fraser
5 */
7 #include <xen/config.h>
8 #include <xen/init.h>
9 #include <xen/lib.h>
10 #include <xen/ctype.h>
11 #include <xen/sched.h>
12 #include <xen/smp.h>
13 #include <xen/delay.h>
14 #include <xen/event.h>
15 #include <xen/console.h>
16 #include <xen/kernel.h>
17 #include <xen/domain.h>
18 #include <xen/version.h>
19 #include <xen/iocap.h>
20 #include <xen/bitops.h>
21 #include <xen/compat.h>
22 #include <asm/regs.h>
23 #include <asm/system.h>
24 #include <asm/io.h>
25 #include <asm/processor.h>
26 #include <asm/desc.h>
27 #include <asm/i387.h>
28 #include <asm/paging.h>
30 #include <public/version.h>
31 #include <public/libelf.h>
33 extern unsigned long initial_images_nrpages(void);
34 extern void discard_initial_images(void);
36 static long dom0_nrpages, dom0_min_nrpages, dom0_max_nrpages = LONG_MAX;
38 /*
39 * dom0_mem=[min:<min_amt>,][max:<max_amt>,][<amt>]
40 *
41 * <min_amt>: The minimum amount of memory which should be allocated for dom0.
42 * <max_amt>: The maximum amount of memory which should be allocated for dom0.
43 * <amt>: The precise amount of memory to allocate for dom0.
44 *
45 * Notes:
46 * 1. <amt> is clamped from below by <min_amt> and from above by available
47 * memory and <max_amt>
48 * 2. <min_amt> is clamped from above by available memory and <max_amt>
49 * 3. <min_amt> is ignored if it is greater than <max_amt>
50 * 4. If <amt> is not specified, it is calculated as follows:
51 * "All of memory is allocated to domain 0, minus 1/16th which is reserved
52 * for uses such as DMA buffers (the reservation is clamped to 128MB)."
53 *
54 * Each value can be specified as positive or negative:
55 * If +ve: The specified amount is an absolute value.
56 * If -ve: The specified amount is subtracted from total available memory.
57 */
58 static long parse_amt(const char *s, const char **ps)
59 {
60 long pages = parse_size_and_unit((*s == '-') ? s+1 : s, ps) >> PAGE_SHIFT;
61 return (*s == '-') ? -pages : pages;
62 }
63 static void parse_dom0_mem(const char *s)
64 {
65 do {
66 if ( !strncmp(s, "min:", 4) )
67 dom0_min_nrpages = parse_amt(s+4, &s);
68 else if ( !strncmp(s, "max:", 4) )
69 dom0_max_nrpages = parse_amt(s+4, &s);
70 else
71 dom0_nrpages = parse_amt(s, &s);
72 if ( *s != ',' )
73 break;
74 } while ( *s++ == ',' );
75 }
76 custom_param("dom0_mem", parse_dom0_mem);
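/*
 * Editorial note (illustrative, not part of the original changeset), with
 * 4kB pages:
 *
 *   dom0_mem=512M            -> dom0_nrpages     =  131072 pages
 *   dom0_mem=-256M           -> dom0_nrpages     = -65536 pages, i.e.
 *                               "all available memory minus 256MB"
 *   dom0_mem=min:128M,max:1G -> dom0_min_nrpages =  32768 pages,
 *                               dom0_max_nrpages = 262144 pages; the amount
 *                               itself then defaults as described in
 *                               compute_dom0_nr_pages() below.
 *
 * parse_amt() converts each value with parse_size_and_unit(); a leading '-'
 * merely negates the resulting page count.
 */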
78 static unsigned int opt_dom0_max_vcpus;
79 integer_param("dom0_max_vcpus", opt_dom0_max_vcpus);
81 static unsigned int opt_dom0_shadow;
82 boolean_param("dom0_shadow", opt_dom0_shadow);
84 static char opt_dom0_ioports_disable[200] = "";
85 string_param("dom0_ioports_disable", opt_dom0_ioports_disable);
87 #if defined(__i386__)
88 /* No ring-3 access in initial leaf page tables. */
89 #define L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED)
90 #define L2_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
91 #define L3_PROT (_PAGE_PRESENT)
92 #elif defined(__x86_64__)
93 /* Allow ring-3 access in long mode as guest cannot use ring 1 ... */
94 #define BASE_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_USER)
95 #define L1_PROT (BASE_PROT|_PAGE_GUEST_KERNEL)
96 /* ... except for compatibility mode guests. */
97 #define COMPAT_L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED)
98 #define L2_PROT (BASE_PROT|_PAGE_DIRTY)
99 #define L3_PROT (BASE_PROT|_PAGE_DIRTY)
100 #define L4_PROT (BASE_PROT|_PAGE_DIRTY)
101 #endif
103 #define round_pgup(_p) (((_p)+(PAGE_SIZE-1))&PAGE_MASK)
104 #define round_pgdown(_p) ((_p)&PAGE_MASK)
106 static struct page_info *alloc_chunk(struct domain *d, unsigned long max_pages)
107 {
108 struct page_info *page;
109 unsigned int order;
110 /*
111 * Allocate up to 2MB at a time: It prevents allocating very large chunks
112 * from DMA pools before the >4GB pool is fully depleted.
113 */
114 if ( max_pages > (2UL << (20 - PAGE_SHIFT)) )
115 max_pages = 2UL << (20 - PAGE_SHIFT);
116 order = get_order_from_pages(max_pages);
117 if ( (max_pages & (max_pages-1)) != 0 )
118 order--;
119 while ( (page = alloc_domheap_pages(d, order, 0)) == NULL )
120 if ( order-- == 0 )
121 break;
122 return page;
123 }
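/*
 * Editorial note: worked example of the order computation in alloc_chunk().
 * For max_pages = 300 the 2MB cap does not apply (300 < 512 pages),
 * get_order_from_pages(300) returns 9 (512 pages), and because 300 is not a
 * power of two the order is decremented to 8 (256 pages), keeping the chunk
 * within the caller's budget. On allocation failure the order keeps
 * dropping, one step at a time, down to a single page before giving up and
 * returning NULL.
 */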
125 static unsigned long compute_dom0_nr_pages(void)
126 {
127 unsigned long avail = avail_domheap_pages() + initial_images_nrpages();
129 /*
130 * If domain 0 allocation isn't specified, reserve 1/16th of available
131 * memory for things like DMA buffers. This reservation is clamped to
132 * a maximum of 128MB.
133 */
134 if ( dom0_nrpages == 0 )
135 {
136 dom0_nrpages = avail;
137 dom0_nrpages = min(dom0_nrpages / 16, 128L << (20 - PAGE_SHIFT));
138 dom0_nrpages = -dom0_nrpages;
139 }
141 /* Negative memory specification means "all memory - specified amount". */
142 if ( dom0_nrpages < 0 ) dom0_nrpages += avail;
143 if ( dom0_min_nrpages < 0 ) dom0_min_nrpages += avail;
144 if ( dom0_max_nrpages < 0 ) dom0_max_nrpages += avail;
146 /* Clamp dom0 memory according to min/max limits and available memory. */
147 dom0_nrpages = max(dom0_nrpages, dom0_min_nrpages);
148 dom0_nrpages = min(dom0_nrpages, dom0_max_nrpages);
149 dom0_nrpages = min(dom0_nrpages, (long)avail);
151 return dom0_nrpages;
152 }
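/*
 * Editorial note: worked example of the default policy above. On a host
 * with 4GB available, avail/16 is 256MB, clamped to the 128MB maximum, so
 * dom0 is offered avail - 128MB. On a 1GB host, avail/16 is 64MB, below the
 * cap, so dom0 is offered avail - 64MB. The explicit min/max options and
 * the amount of memory actually available are applied on top of this by
 * the clamping at the end of the function.
 */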
154 static void process_dom0_ioports_disable(void)
155 {
156 unsigned long io_from, io_to;
157 char *t, *s = opt_dom0_ioports_disable;
158 const char *u;
160 if ( *s == '\0' )
161 return;
163 while ( (t = strsep(&s, ",")) != NULL )
164 {
165 io_from = simple_strtoul(t, &u, 16);
166 if ( u == t )
167 {
168 parse_error:
169 printk("Invalid ioport range <%s> "
170 "in dom0_ioports_disable, skipping\n", t);
171 continue;
172 }
174 if ( *u == '\0' )
175 io_to = io_from;
176 else if ( *u == '-' )
177 io_to = simple_strtoul(u + 1, &u, 16);
178 else
179 goto parse_error;
181 if ( (*u != '\0') || (io_to < io_from) || (io_to >= 65536) )
182 goto parse_error;
184 printk("Disabling dom0 access to ioport range %04lx-%04lx\n",
185 io_from, io_to);
187 if ( ioports_deny_access(dom0, io_from, io_to) != 0 )
188 BUG();
189 }
190 }
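/*
 * Editorial note: the option parsed above is a comma-separated list of
 * hexadecimal ports or port ranges, for example
 *
 *   dom0_ioports_disable=02f8-02ff,04d0
 *
 * which would deny dom0 access to the COM2 register window and to port
 * 0x4d0. Each range must lie within 0x0000-0xffff and must not be
 * descending; malformed entries are reported and skipped.
 */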
192 int construct_dom0(struct domain *d,
193 unsigned long _image_start, unsigned long image_len,
194 unsigned long _initrd_start, unsigned long initrd_len,
195 char *cmdline)
196 {
197 int i, rc, compatible, compat32, order, machine;
198 struct cpu_user_regs *regs;
199 unsigned long pfn, mfn;
200 unsigned long nr_pages;
201 unsigned long nr_pt_pages;
202 unsigned long alloc_spfn;
203 unsigned long alloc_epfn;
204 unsigned long count;
205 struct page_info *page = NULL;
206 start_info_t *si;
207 struct vcpu *v = d->vcpu[0];
208 unsigned long long value;
209 #if defined(__i386__)
210 char *image_start = (char *)_image_start; /* use lowmem mappings */
211 char *initrd_start = (char *)_initrd_start; /* use lowmem mappings */
212 #elif defined(__x86_64__)
213 char *image_start = __va(_image_start);
214 char *initrd_start = __va(_initrd_start);
215 #endif
216 #if CONFIG_PAGING_LEVELS >= 4
217 l4_pgentry_t *l4tab = NULL, *l4start = NULL;
218 #endif
219 #if CONFIG_PAGING_LEVELS >= 3
220 l3_pgentry_t *l3tab = NULL, *l3start = NULL;
221 #endif
222 l2_pgentry_t *l2tab = NULL, *l2start = NULL;
223 l1_pgentry_t *l1tab = NULL, *l1start = NULL;
225 /*
226 * This fully describes the memory layout of the initial domain. All
227 * *_start address are page-aligned, except v_start (and v_end) which are
228 * superpage-aligned.
229 */
230 struct elf_binary elf;
231 struct elf_dom_parms parms;
232 unsigned long vkern_start;
233 unsigned long vkern_end;
234 unsigned long vinitrd_start;
235 unsigned long vinitrd_end;
236 unsigned long vphysmap_start;
237 unsigned long vphysmap_end;
238 unsigned long vstartinfo_start;
239 unsigned long vstartinfo_end;
240 unsigned long vstack_start;
241 unsigned long vstack_end;
242 unsigned long vpt_start;
243 unsigned long vpt_end;
244 unsigned long v_start;
245 unsigned long v_end;
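/*
 * Editorial sketch of the bootstrap layout assembled below, from low to
 * high virtual addresses:
 *
 *   v_start (4MB-aligned)
 *     vkern_start      .. vkern_end        kernel image
 *     vinitrd_start    .. vinitrd_end      initial ramdisk
 *     vphysmap_start   .. vphysmap_end     phys->machine map, one entry per page
 *     vstartinfo_start .. vstartinfo_end   start_info + VGA console info
 *     vpt_start        .. vpt_end          bootstrap page tables
 *     vstack_start     .. vstack_end       one page of boot stack
 *   v_end (4MB-aligned, with at least 512kB of padding above the stack)
 *
 * Every *_start after the kernel is rounded up to the next page boundary.
 */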
247 /* Machine address of next candidate page-table page. */
248 unsigned long mpt_alloc;
250 /* Features supported. */
251 uint32_t dom0_features_supported[XENFEAT_NR_SUBMAPS] = { 0 };
252 uint32_t dom0_features_required[XENFEAT_NR_SUBMAPS] = { 0 };
254 /* Sanity! */
255 BUG_ON(d->domain_id != 0);
256 BUG_ON(d->vcpu[0] == NULL);
257 BUG_ON(test_bit(_VCPUF_initialised, &v->vcpu_flags));
259 printk("*** LOADING DOMAIN 0 ***\n");
261 d->max_pages = ~0U;
263 nr_pages = compute_dom0_nr_pages();
265 if ( (rc = elf_init(&elf, image_start, image_len)) != 0 )
266 return rc;
267 #ifdef VERBOSE
268 elf_set_verbose(&elf);
269 #endif
270 elf_parse_binary(&elf);
271 if ( (rc = elf_xen_parse(&elf, &parms)) != 0 )
272 return rc;
274 /* compatibility check */
275 compatible = 0;
276 compat32 = 0;
277 machine = elf_uval(&elf, elf.ehdr, e_machine);
278 switch (CONFIG_PAGING_LEVELS) {
279 case 2: /* x86_32 */
280 if (parms.pae == PAEKERN_bimodal)
281 parms.pae = PAEKERN_no;
282 printk(" Xen kernel: 32-bit, lsb\n");
283 if (elf_32bit(&elf) && !parms.pae && machine == EM_386)
284 compatible = 1;
285 break;
286 case 3: /* x86_32p */
287 if (parms.pae == PAEKERN_bimodal)
288 parms.pae = PAEKERN_extended_cr3;
289 printk(" Xen kernel: 32-bit, PAE, lsb\n");
290 if (elf_32bit(&elf) && parms.pae && machine == EM_386)
291 compatible = 1;
292 break;
293 case 4: /* x86_64 */
294 #ifndef CONFIG_COMPAT
295 printk(" Xen kernel: 64-bit, lsb\n");
296 #else
297 printk(" Xen kernel: 64-bit, lsb, compat32\n");
298 if (elf_32bit(&elf) && parms.pae == PAEKERN_bimodal)
299 parms.pae = PAEKERN_extended_cr3;
300 if (elf_32bit(&elf) && parms.pae && machine == EM_386)
301 {
302 compat32 = 1;
303 compatible = 1;
304 }
305 #endif
306 if (elf_64bit(&elf) && machine == EM_X86_64)
307 compatible = 1;
308 break;
309 }
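/*
 * Editorial summary of the checks above: a 2-level (x86_32) hypervisor
 * accepts a 32-bit non-PAE EM_386 kernel, a 3-level (x86_32p) hypervisor a
 * 32-bit PAE EM_386 kernel, and a 4-level (x86_64) hypervisor a 64-bit
 * EM_X86_64 kernel and, when built with CONFIG_COMPAT, also a 32-bit PAE
 * EM_386 kernel run as a compat32 guest. Bimodal PAE kernels are resolved
 * to whichever PAE mode the hypervisor supports.
 */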
310 printk(" Dom0 kernel: %s%s, %s, paddr 0x%" PRIx64 " -> 0x%" PRIx64 "\n",
311 elf_64bit(&elf) ? "64-bit" : "32-bit",
312 parms.pae ? ", PAE" : "",
313 elf_msb(&elf) ? "msb" : "lsb",
314 elf.pstart, elf.pend);
316 if ( !compatible )
317 {
318 printk("Mismatch between Xen and DOM0 kernel\n");
319 return -EINVAL;
320 }
322 #ifdef CONFIG_COMPAT
323 if (compat32)
324 {
325 l1_pgentry_t gdt_l1e;
327 set_bit(_DOMF_compat, &d->domain_flags);
328 v->vcpu_info = (void *)&d->shared_info->compat.vcpu_info[0];
330 if ( nr_pages != (unsigned int)nr_pages )
331 nr_pages = UINT_MAX;
333 /*
334 * Map compatibility Xen segments into every VCPU's GDT. See
335 * arch_domain_create() for further comments.
336 */
337 gdt_l1e = l1e_from_page(virt_to_page(compat_gdt_table),
338 PAGE_HYPERVISOR);
339 for ( i = 0; i < MAX_VIRT_CPUS; i++ )
340 d->arch.mm_perdomain_pt[((i << GDT_LDT_VCPU_SHIFT) +
341 FIRST_RESERVED_GDT_PAGE)] = gdt_l1e;
342 local_flush_tlb_one(GDT_LDT_VIRT_START + FIRST_RESERVED_GDT_BYTE);
343 }
344 #endif
345 if ( parms.pae == PAEKERN_extended_cr3 )
346 set_bit(VMASST_TYPE_pae_extended_cr3, &d->vm_assist);
348 if ( UNSET_ADDR != parms.virt_hv_start_low && elf_32bit(&elf) )
349 {
350 #if CONFIG_PAGING_LEVELS < 4
351 unsigned long mask = (1UL << L2_PAGETABLE_SHIFT) - 1;
352 #else
353 unsigned long mask = !IS_COMPAT(d)
354 ? (1UL << L4_PAGETABLE_SHIFT) - 1
355 : (1UL << L2_PAGETABLE_SHIFT) - 1;
356 #endif
358 value = (parms.virt_hv_start_low + mask) & ~mask;
359 #ifdef CONFIG_COMPAT
360 HYPERVISOR_COMPAT_VIRT_START(d) = max_t(unsigned int, m2p_compat_vstart, value);
361 if ( value > (!IS_COMPAT(d) ?
362 HYPERVISOR_VIRT_START :
363 __HYPERVISOR_COMPAT_VIRT_START) )
364 #else
365 if ( value > HYPERVISOR_VIRT_START )
366 #endif
367 panic("Domain 0 expects too high a hypervisor start address.\n");
368 }
370 if ( parms.f_required[0] /* Huh? -- kraxel */ )
371 panic("Domain 0 requires an unsupported hypervisor feature.\n");
373 /* Align load address to 4MB boundary. */
374 v_start = parms.virt_base & ~((1UL<<22)-1);
376 /*
377 * Why do we need this? The number of page-table frames depends on the
378 * size of the bootstrap address space. But the size of the address space
379 * depends on the number of page-table frames (since each one is mapped
380 * read-only). We have a pair of simultaneous equations in two unknowns,
381 * which we solve by exhaustive search.
382 */
383 vkern_start = parms.virt_kstart;
384 vkern_end = parms.virt_kend;
385 vinitrd_start = round_pgup(vkern_end);
386 vinitrd_end = vinitrd_start + initrd_len;
387 vphysmap_start = round_pgup(vinitrd_end);
388 vphysmap_end = vphysmap_start + (nr_pages * (!IS_COMPAT(d) ?
389 sizeof(unsigned long) :
390 sizeof(unsigned int)));
391 vstartinfo_start = round_pgup(vphysmap_end);
392 vstartinfo_end = (vstartinfo_start +
393 sizeof(struct start_info) +
394 sizeof(struct dom0_vga_console_info));
395 vpt_start = round_pgup(vstartinfo_end);
396 for ( nr_pt_pages = 2; ; nr_pt_pages++ )
397 {
398 vpt_end = vpt_start + (nr_pt_pages * PAGE_SIZE);
399 vstack_start = vpt_end;
400 vstack_end = vstack_start + PAGE_SIZE;
401 v_end = (vstack_end + (1UL<<22)-1) & ~((1UL<<22)-1);
402 if ( (v_end - vstack_end) < (512UL << 10) )
403 v_end += 1UL << 22; /* Add extra 4MB to get >= 512kB padding. */
404 #if defined(__i386__) && !defined(CONFIG_X86_PAE)
405 if ( (((v_end - v_start + ((1UL<<L2_PAGETABLE_SHIFT)-1)) >>
406 L2_PAGETABLE_SHIFT) + 1) <= nr_pt_pages )
407 break;
408 #elif defined(__i386__) && defined(CONFIG_X86_PAE)
409 /* 5 pages: 1x 3rd + 4x 2nd level */
410 if ( (((v_end - v_start + ((1UL<<L2_PAGETABLE_SHIFT)-1)) >>
411 L2_PAGETABLE_SHIFT) + 5) <= nr_pt_pages )
412 break;
413 #elif defined(__x86_64__)
414 #define NR(_l,_h,_s) \
415 (((((_h) + ((1UL<<(_s))-1)) & ~((1UL<<(_s))-1)) - \
416 ((_l) & ~((1UL<<(_s))-1))) >> (_s))
417 if ( (1 + /* # L4 */
418 NR(v_start, v_end, L4_PAGETABLE_SHIFT) + /* # L3 */
419 (!IS_COMPAT(d) ?
420 NR(v_start, v_end, L3_PAGETABLE_SHIFT) : /* # L2 */
421 4) + /* # compat L2 */
422 NR(v_start, v_end, L2_PAGETABLE_SHIFT)) /* # L1 */
423 <= nr_pt_pages )
424 break;
425 #endif
426 }
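/*
 * Editorial note: the loop above is the "exhaustive search" promised in the
 * earlier comment. Starting at nr_pt_pages = 2, each pass places the
 * page-table area and the single-page boot stack after the start-info page,
 * rounds v_end up to a 4MB boundary (adding another 4MB if that would leave
 * less than 512kB of slack above the stack), and then checks whether
 * nr_pt_pages frames suffice to map all of [v_start, v_end) at every paging
 * level of the current build. If not, nr_pt_pages grows by one and the
 * layout is recomputed, so the loop terminates at the smallest
 * self-consistent value.
 */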
428 order = get_order_from_bytes(v_end - v_start);
429 if ( (1UL << order) > nr_pages )
430 panic("Domain 0 allocation is too small for kernel image.\n");
432 #ifdef __i386__
433 /* Ensure that our low-memory 1:1 mapping covers the allocation. */
434 page = alloc_domheap_pages(d, order,
435 MEMF_bits(30 + (v_start >> 31)));
436 #else
437 page = alloc_domheap_pages(d, order, 0);
438 #endif
439 if ( page == NULL )
440 panic("Not enough RAM for domain 0 allocation.\n");
441 alloc_spfn = page_to_mfn(page);
442 alloc_epfn = alloc_spfn + d->tot_pages;
444 printk("PHYSICAL MEMORY ARRANGEMENT:\n"
445 " Dom0 alloc.: %"PRIpaddr"->%"PRIpaddr,
446 pfn_to_paddr(alloc_spfn), pfn_to_paddr(alloc_epfn));
447 if ( d->tot_pages < nr_pages )
448 printk(" (%lu pages to be allocated)",
449 nr_pages - d->tot_pages);
450 printk("\nVIRTUAL MEMORY ARRANGEMENT:\n"
451 " Loaded kernel: %p->%p\n"
452 " Init. ramdisk: %p->%p\n"
453 " Phys-Mach map: %p->%p\n"
454 " Start info: %p->%p\n"
455 " Page tables: %p->%p\n"
456 " Boot stack: %p->%p\n"
457 " TOTAL: %p->%p\n",
458 _p(vkern_start), _p(vkern_end),
459 _p(vinitrd_start), _p(vinitrd_end),
460 _p(vphysmap_start), _p(vphysmap_end),
461 _p(vstartinfo_start), _p(vstartinfo_end),
462 _p(vpt_start), _p(vpt_end),
463 _p(vstack_start), _p(vstack_end),
464 _p(v_start), _p(v_end));
465 printk(" ENTRY ADDRESS: %p\n", _p(parms.virt_entry));
467 if ( ((v_end - v_start)>>PAGE_SHIFT) > nr_pages )
468 {
469 printk("Initial guest OS requires too much space\n"
470 "(%luMB is greater than %luMB limit)\n",
471 (v_end-v_start)>>20, nr_pages>>(20-PAGE_SHIFT));
472 return -ENOMEM;
473 }
475 mpt_alloc = (vpt_start - v_start) +
476 (unsigned long)pfn_to_paddr(alloc_spfn);
478 #if defined(__i386__)
479 /*
480 * Protect the lowest 1GB of memory. We use a temporary mapping there
481 * from which we copy the kernel and ramdisk images.
482 */
483 if ( v_start < (1UL<<30) )
484 {
485 printk("Initial loading isn't allowed to lowest 1GB of memory.\n");
486 return -EINVAL;
487 }
489 /* WARNING: The new domain must have its 'processor' field filled in! */
490 #if CONFIG_PAGING_LEVELS == 3
491 l3start = l3tab = (l3_pgentry_t *)mpt_alloc; mpt_alloc += PAGE_SIZE;
492 l2start = l2tab = (l2_pgentry_t *)mpt_alloc; mpt_alloc += 4*PAGE_SIZE;
493 memcpy(l2tab, idle_pg_table_l2, 4*PAGE_SIZE);
494 for (i = 0; i < 4; i++) {
495 l3tab[i] = l3e_from_paddr((u32)l2tab + i*PAGE_SIZE, L3_PROT);
496 l2tab[(LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT)+i] =
497 l2e_from_paddr((u32)l2tab + i*PAGE_SIZE, __PAGE_HYPERVISOR);
498 }
499 v->arch.guest_table = pagetable_from_paddr((unsigned long)l3start);
500 #else
501 l2start = l2tab = (l2_pgentry_t *)mpt_alloc; mpt_alloc += PAGE_SIZE;
502 memcpy(l2tab, idle_pg_table, PAGE_SIZE);
503 l2tab[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] =
504 l2e_from_paddr((unsigned long)l2start, __PAGE_HYPERVISOR);
505 v->arch.guest_table = pagetable_from_paddr((unsigned long)l2start);
506 #endif
508 for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
509 l2tab[l2_linear_offset(PERDOMAIN_VIRT_START) + i] =
510 l2e_from_page(virt_to_page(d->arch.mm_perdomain_pt) + i,
511 __PAGE_HYPERVISOR);
513 l2tab += l2_linear_offset(v_start);
514 mfn = alloc_spfn;
515 for ( count = 0; count < ((v_end-v_start)>>PAGE_SHIFT); count++ )
516 {
517 if ( !((unsigned long)l1tab & (PAGE_SIZE-1)) )
518 {
519 l1start = l1tab = (l1_pgentry_t *)mpt_alloc;
520 mpt_alloc += PAGE_SIZE;
521 *l2tab = l2e_from_paddr((unsigned long)l1start, L2_PROT);
522 l2tab++;
523 clear_page(l1tab);
524 if ( count == 0 )
525 l1tab += l1_table_offset(v_start);
526 }
527 *l1tab = l1e_from_pfn(mfn, L1_PROT);
528 l1tab++;
530 page = mfn_to_page(mfn);
531 if ( !get_page_and_type(page, d, PGT_writable_page) )
532 BUG();
534 mfn++;
535 }
537 /* Pages that are part of page tables must be read only. */
538 l2tab = l2start + l2_linear_offset(vpt_start);
539 l1start = l1tab = (l1_pgentry_t *)(u32)l2e_get_paddr(*l2tab);
540 l1tab += l1_table_offset(vpt_start);
541 for ( count = 0; count < nr_pt_pages; count++ )
542 {
543 page = mfn_to_page(l1e_get_pfn(*l1tab));
544 if ( !opt_dom0_shadow )
545 l1e_remove_flags(*l1tab, _PAGE_RW);
546 else
547 if ( !get_page_type(page, PGT_writable_page) )
548 BUG();
550 #if CONFIG_PAGING_LEVELS == 3
551 switch (count) {
552 case 0:
553 page->u.inuse.type_info &= ~PGT_type_mask;
554 page->u.inuse.type_info |= PGT_l3_page_table;
555 get_page(page, d); /* an extra ref because of readable mapping */
557 /* Get another ref to L3 page so that it can be pinned. */
558 if ( !get_page_and_type(page, d, PGT_l3_page_table) )
559 BUG();
560 set_bit(_PGT_pinned, &page->u.inuse.type_info);
561 break;
562 case 1 ... 4:
563 page->u.inuse.type_info &= ~PGT_type_mask;
564 page->u.inuse.type_info |= PGT_l2_page_table;
565 if ( count == 4 )
566 page->u.inuse.type_info |= PGT_pae_xen_l2;
567 get_page(page, d); /* an extra ref because of readable mapping */
568 break;
569 default:
570 page->u.inuse.type_info &= ~PGT_type_mask;
571 page->u.inuse.type_info |= PGT_l1_page_table;
572 get_page(page, d); /* an extra ref because of readable mapping */
573 break;
574 }
575 #else
576 if ( count == 0 )
577 {
578 page->u.inuse.type_info &= ~PGT_type_mask;
579 page->u.inuse.type_info |= PGT_l2_page_table;
581 /*
582 * No longer writable: decrement the type_count.
583 * Installed as CR3: increment both the ref_count and type_count.
584 * Net: just increment the ref_count.
585 */
586 get_page(page, d); /* an extra ref because of readable mapping */
588 /* Get another ref to L2 page so that it can be pinned. */
589 if ( !get_page_and_type(page, d, PGT_l2_page_table) )
590 BUG();
591 set_bit(_PGT_pinned, &page->u.inuse.type_info);
592 }
593 else
594 {
595 page->u.inuse.type_info &= ~PGT_type_mask;
596 page->u.inuse.type_info |= PGT_l1_page_table;
598 /*
599 * No longer writable: decrement the type_count.
600 * This is an L1 page, installed in a validated L2 page:
601 * increment both the ref_count and type_count.
602 * Net: just increment the ref_count.
603 */
604 get_page(page, d); /* an extra ref because of readable mapping */
605 }
606 #endif
607 if ( !((unsigned long)++l1tab & (PAGE_SIZE - 1)) )
608 l1start = l1tab = (l1_pgentry_t *)(u32)l2e_get_paddr(*++l2tab);
609 }
611 #elif defined(__x86_64__)
613 /* Overlap with Xen protected area? */
614 if ( !IS_COMPAT(d) ?
615 ((v_start < HYPERVISOR_VIRT_END) &&
616 (v_end > HYPERVISOR_VIRT_START)) :
617 (v_end > HYPERVISOR_COMPAT_VIRT_START(d)) )
618 {
619 printk("DOM0 image overlaps with Xen private area.\n");
620 return -EINVAL;
621 }
623 if ( IS_COMPAT(d) )
624 {
625 v->arch.guest_context.failsafe_callback_cs = FLAT_COMPAT_KERNEL_CS;
626 v->arch.guest_context.event_callback_cs = FLAT_COMPAT_KERNEL_CS;
627 }
629 /* WARNING: The new domain must have its 'processor' field filled in! */
630 if ( !IS_COMPAT(d) )
631 {
632 maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l4_page_table;
633 l4start = l4tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
634 }
635 else
636 {
637 page = alloc_domheap_page(NULL);
638 if ( !page )
639 panic("Not enough RAM for domain 0 PML4.\n");
640 l4start = l4tab = page_to_virt(page);
641 }
642 memcpy(l4tab, idle_pg_table, PAGE_SIZE);
643 l4tab[l4_table_offset(LINEAR_PT_VIRT_START)] =
644 l4e_from_paddr(__pa(l4start), __PAGE_HYPERVISOR);
645 l4tab[l4_table_offset(PERDOMAIN_VIRT_START)] =
646 l4e_from_paddr(__pa(d->arch.mm_perdomain_l3), __PAGE_HYPERVISOR);
647 v->arch.guest_table = pagetable_from_paddr(__pa(l4start));
648 if ( IS_COMPAT(d) )
649 {
650 v->arch.guest_table_user = v->arch.guest_table;
651 if ( setup_arg_xlat_area(v, l4start) < 0 )
652 panic("Not enough RAM for domain 0 hypercall argument translation.\n");
653 }
655 l4tab += l4_table_offset(v_start);
656 mfn = alloc_spfn;
657 for ( count = 0; count < ((v_end-v_start)>>PAGE_SHIFT); count++ )
658 {
659 if ( !((unsigned long)l1tab & (PAGE_SIZE-1)) )
660 {
661 maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l1_page_table;
662 l1start = l1tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
663 clear_page(l1tab);
664 if ( count == 0 )
665 l1tab += l1_table_offset(v_start);
666 if ( !((unsigned long)l2tab & (PAGE_SIZE-1)) )
667 {
668 maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l2_page_table;
669 l2start = l2tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
670 clear_page(l2tab);
671 if ( count == 0 )
672 l2tab += l2_table_offset(v_start);
673 if ( !((unsigned long)l3tab & (PAGE_SIZE-1)) )
674 {
675 maddr_to_page(mpt_alloc)->u.inuse.type_info =
676 PGT_l3_page_table;
677 l3start = l3tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
678 clear_page(l3tab);
679 if ( count == 0 )
680 l3tab += l3_table_offset(v_start);
681 *l4tab = l4e_from_paddr(__pa(l3start), L4_PROT);
682 l4tab++;
683 }
684 *l3tab = l3e_from_paddr(__pa(l2start), L3_PROT);
685 l3tab++;
686 }
687 *l2tab = l2e_from_paddr(__pa(l1start), L2_PROT);
688 l2tab++;
689 }
690 *l1tab = l1e_from_pfn(mfn, !IS_COMPAT(d) ? L1_PROT : COMPAT_L1_PROT);
691 l1tab++;
693 page = mfn_to_page(mfn);
694 if ( (page->u.inuse.type_info == 0) &&
695 !get_page_and_type(page, d, PGT_writable_page) )
696 BUG();
698 mfn++;
699 }
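/*
 * Editorial note on the loop above: the "page boundary" tests form a
 * cascade. Whenever the L1 pointer has just crossed onto a page boundary a
 * fresh L1 frame is taken from mpt_alloc and hooked into the current L2
 * slot; if that also exhausts the current L2 (or L3) table, a fresh L2 (or
 * L3) frame is allocated and hooked in one level up, with new L3 frames
 * being entered into the L4 table. Only frames not already claimed as
 * page-table pages (type_info == 0) are granted the writable-page type.
 */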
701 #ifdef CONFIG_COMPAT
702 if ( IS_COMPAT(d) )
703 {
704 /* Ensure the first four L3 entries are all populated. */
705 for ( i = 0, l3tab = l3start; i < 4; ++i, ++l3tab )
706 {
707 if ( !l3e_get_intpte(*l3tab) )
708 {
709 maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l2_page_table;
710 l2tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
711 clear_page(l2tab);
712 *l3tab = l3e_from_paddr(__pa(l2tab), L3_PROT);
713 }
714 if ( i == 3 )
715 l3e_get_page(*l3tab)->u.inuse.type_info |= PGT_pae_xen_l2;
716 }
717 /* Install read-only guest visible MPT mapping. */
718 l2tab = l3e_to_l2e(l3start[3]);
719 memcpy(&l2tab[COMPAT_L2_PAGETABLE_FIRST_XEN_SLOT(d)],
720 &compat_idle_pg_table_l2[l2_table_offset(HIRO_COMPAT_MPT_VIRT_START)],
721 COMPAT_L2_PAGETABLE_XEN_SLOTS(d) * sizeof(*l2tab));
722 }
723 #endif
725 /* Pages that are part of page tables must be read only. */
726 l4tab = l4start + l4_table_offset(vpt_start);
727 l3start = l3tab = l4e_to_l3e(*l4tab);
728 l3tab += l3_table_offset(vpt_start);
729 l2start = l2tab = l3e_to_l2e(*l3tab);
730 l2tab += l2_table_offset(vpt_start);
731 l1start = l1tab = l2e_to_l1e(*l2tab);
732 l1tab += l1_table_offset(vpt_start);
733 for ( count = 0; count < nr_pt_pages; count++ )
734 {
735 l1e_remove_flags(*l1tab, _PAGE_RW);
736 page = mfn_to_page(l1e_get_pfn(*l1tab));
738 /* Read-only mapping + PGC_allocated + page-table page. */
739 page->count_info = PGC_allocated | 3;
740 page->u.inuse.type_info |= PGT_validated | 1;
742 /* Top-level p.t. is pinned. */
743 if ( (page->u.inuse.type_info & PGT_type_mask) ==
744 (!IS_COMPAT(d) ? PGT_l4_page_table : PGT_l3_page_table) )
745 {
746 page->count_info += 1;
747 page->u.inuse.type_info += 1 | PGT_pinned;
748 }
750 /* Iterate. */
751 if ( !((unsigned long)++l1tab & (PAGE_SIZE - 1)) )
752 {
753 if ( !((unsigned long)++l2tab & (PAGE_SIZE - 1)) )
754 {
755 if ( !((unsigned long)++l3tab & (PAGE_SIZE - 1)) )
756 l3start = l3tab = l4e_to_l3e(*++l4tab);
757 l2start = l2tab = l3e_to_l2e(*l3tab);
758 }
759 l1start = l1tab = l2e_to_l1e(*l2tab);
760 }
761 }
763 #endif /* __x86_64__ */
765 /* Mask all upcalls... */
766 for ( i = 0; i < MAX_VIRT_CPUS; i++ )
767 shared_info(d, vcpu_info[i].evtchn_upcall_mask) = 1;
769 if ( opt_dom0_max_vcpus == 0 )
770 opt_dom0_max_vcpus = num_online_cpus();
771 if ( opt_dom0_max_vcpus > num_online_cpus() )
772 opt_dom0_max_vcpus = num_online_cpus();
773 if ( opt_dom0_max_vcpus > MAX_VIRT_CPUS )
774 opt_dom0_max_vcpus = MAX_VIRT_CPUS;
775 if ( opt_dom0_max_vcpus > BITS_PER_GUEST_LONG(d) )
776 opt_dom0_max_vcpus = BITS_PER_GUEST_LONG(d);
777 printk("Dom0 has maximum %u VCPUs\n", opt_dom0_max_vcpus);
779 for ( i = 1; i < opt_dom0_max_vcpus; i++ )
780 (void)alloc_vcpu(d, i, i);
782 /* Set up CR3 value for write_ptbase */
783 if ( paging_mode_enabled(v->domain) )
784 paging_update_paging_modes(v);
785 else
786 update_cr3(v);
788 /* Install the new page tables. */
789 local_irq_disable();
790 write_ptbase(v);
792 /* Copy the OS image and free temporary buffer. */
793 elf.dest = (void*)vkern_start;
794 elf_load_binary(&elf);
796 if ( UNSET_ADDR != parms.virt_hypercall )
797 {
798 if ( (parms.virt_hypercall < v_start) ||
799 (parms.virt_hypercall >= v_end) )
800 {
801 write_ptbase(current);
802 local_irq_enable();
803 printk("Invalid HYPERCALL_PAGE field in ELF notes.\n");
804 return -1;
805 }
806 hypercall_page_initialise(d, (void *)(unsigned long)parms.virt_hypercall);
807 }
809 /* Copy the initial ramdisk. */
810 if ( initrd_len != 0 )
811 memcpy((void *)vinitrd_start, initrd_start, initrd_len);
813 /* Free temporary buffers. */
814 discard_initial_images();
816 /* Set up start info area. */
817 si = (start_info_t *)vstartinfo_start;
818 memset(si, 0, PAGE_SIZE);
819 si->nr_pages = nr_pages;
821 si->shared_info = virt_to_maddr(d->shared_info);
823 si->flags = SIF_PRIVILEGED | SIF_INITDOMAIN;
824 si->pt_base = vpt_start + 2 * PAGE_SIZE * !!IS_COMPAT(d);
825 si->nr_pt_frames = nr_pt_pages;
826 si->mfn_list = vphysmap_start;
827 snprintf(si->magic, sizeof(si->magic), "xen-%i.%i-x86_%d%s",
828 xen_major_version(), xen_minor_version(),
829 elf_64bit(&elf) ? 64 : 32,
830 parms.pae ? "p" : "");
832 /* Write the phys->machine and machine->phys table entries. */
833 for ( pfn = 0; pfn < d->tot_pages; pfn++ )
834 {
835 mfn = pfn + alloc_spfn;
836 #ifndef NDEBUG
837 #define REVERSE_START ((v_end - v_start) >> PAGE_SHIFT)
838 if ( pfn > REVERSE_START )
839 mfn = alloc_epfn - (pfn - REVERSE_START);
840 #endif
841 if ( !IS_COMPAT(d) )
842 ((unsigned long *)vphysmap_start)[pfn] = mfn;
843 else
844 ((unsigned int *)vphysmap_start)[pfn] = mfn;
845 set_gpfn_from_mfn(mfn, pfn);
846 }
847 while ( pfn < nr_pages )
848 {
849 if ( (page = alloc_chunk(d, nr_pages - d->tot_pages)) == NULL )
850 panic("Not enough RAM for DOM0 reservation.\n");
851 while ( pfn < d->tot_pages )
852 {
853 mfn = page_to_mfn(page);
854 #ifndef NDEBUG
855 #define pfn (nr_pages - 1 - (pfn - (alloc_epfn - alloc_spfn)))
856 #endif
857 if ( !IS_COMPAT(d) )
858 ((unsigned long *)vphysmap_start)[pfn] = mfn;
859 else
860 ((unsigned int *)vphysmap_start)[pfn] = mfn;
861 set_gpfn_from_mfn(mfn, pfn);
862 #undef pfn
863 page++; pfn++;
864 }
865 }
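/*
 * Editorial note: in debug (!NDEBUG) builds the two loops above deliberately
 * reverse the pfn->mfn ordering for pages beyond the initial image area,
 * presumably to flush out guests that rely on a particular pfn-to-mfn
 * layout. Pages beyond dom0's initial contiguous allocation are obtained in
 * chunks via alloc_chunk() and entered both into the guest-visible
 * phys-to-machine array at vphysmap_start and into the global
 * machine-to-phys table via set_gpfn_from_mfn().
 */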
867 if ( initrd_len != 0 )
868 {
869 si->mod_start = vinitrd_start;
870 si->mod_len = initrd_len;
871 printk("Initrd len 0x%lx, start at 0x%lx\n",
872 si->mod_len, si->mod_start);
873 }
875 memset(si->cmd_line, 0, sizeof(si->cmd_line));
876 if ( cmdline != NULL )
877 strlcpy((char *)si->cmd_line, cmdline, sizeof(si->cmd_line));
879 if ( fill_console_start_info((void *)(si + 1)) )
880 {
881 si->console.dom0.info_off = sizeof(struct start_info);
882 si->console.dom0.info_size = sizeof(struct dom0_vga_console_info);
883 }
885 #ifdef CONFIG_COMPAT
886 if ( IS_COMPAT(d) )
887 xlat_start_info(si, XLAT_start_info_console_dom0);
888 #endif
890 /* Reinstate the caller's page tables. */
891 write_ptbase(current);
892 local_irq_enable();
894 #if defined(__i386__)
895 /* Destroy low mappings - they were only for our convenience. */
896 zap_low_mappings(l2start);
897 zap_low_mappings(idle_pg_table_l2);
898 #endif
900 update_domain_wallclock_time(d);
902 set_bit(_VCPUF_initialised, &v->vcpu_flags);
904 /*
905 * Initial register values:
906 * DS,ES,FS,GS = FLAT_KERNEL_DS
907 * CS:EIP = FLAT_KERNEL_CS:start_pc
908 * SS:ESP = FLAT_KERNEL_SS:start_stack
909 * ESI = start_info
910 * [EAX,EBX,ECX,EDX,EDI,EBP are zero]
911 */
912 regs = &v->arch.guest_context.user_regs;
913 regs->ds = regs->es = regs->fs = regs->gs = !IS_COMPAT(d)
914 ? FLAT_KERNEL_DS
915 : FLAT_COMPAT_KERNEL_DS;
916 regs->ss = !IS_COMPAT(d) ? FLAT_KERNEL_SS : FLAT_COMPAT_KERNEL_SS;
917 regs->cs = !IS_COMPAT(d) ? FLAT_KERNEL_CS : FLAT_COMPAT_KERNEL_CS;
918 regs->eip = parms.virt_entry;
919 regs->esp = vstack_end;
920 regs->esi = vstartinfo_start;
921 regs->eflags = X86_EFLAGS_IF;
923 if ( opt_dom0_shadow )
924 if ( paging_enable(d, PG_SH_enable) == 0 )
925 paging_update_paging_modes(v);
927 if ( supervisor_mode_kernel )
928 {
929 v->arch.guest_context.kernel_ss &= ~3;
930 v->arch.guest_context.user_regs.ss &= ~3;
931 v->arch.guest_context.user_regs.es &= ~3;
932 v->arch.guest_context.user_regs.ds &= ~3;
933 v->arch.guest_context.user_regs.fs &= ~3;
934 v->arch.guest_context.user_regs.gs &= ~3;
935 printk("Dom0 runs in ring 0 (supervisor mode)\n");
936 if ( !test_bit(XENFEAT_supervisor_mode_kernel,
937 dom0_features_supported) )
938 panic("Dom0 does not support supervisor-mode execution\n");
939 }
940 else
941 {
942 if ( test_bit(XENFEAT_supervisor_mode_kernel, dom0_features_required) )
943 panic("Dom0 requires supervisor-mode execution\n");
944 }
946 rc = 0;
948 /* DOM0 is permitted full I/O capabilities. */
949 rc |= ioports_permit_access(dom0, 0, 0xFFFF);
950 rc |= iomem_permit_access(dom0, 0UL, ~0UL);
951 rc |= irqs_permit_access(dom0, 0, NR_IRQS-1);
953 /*
954 * Modify I/O port access permissions.
955 */
956 /* Master Interrupt Controller (PIC). */
957 rc |= ioports_deny_access(dom0, 0x20, 0x21);
958 /* Slave Interrupt Controller (PIC). */
959 rc |= ioports_deny_access(dom0, 0xA0, 0xA1);
960 /* Interval Timer (PIT). */
961 rc |= ioports_deny_access(dom0, 0x40, 0x43);
962 /* PIT Channel 2 / PC Speaker Control. */
963 rc |= ioports_deny_access(dom0, 0x61, 0x61);
964 /* Command-line I/O ranges. */
965 process_dom0_ioports_disable();
967 /*
968 * Modify I/O memory access permissions.
969 */
970 /* Local APIC. */
971 if ( mp_lapic_addr != 0 )
972 {
973 mfn = paddr_to_pfn(mp_lapic_addr);
974 rc |= iomem_deny_access(dom0, mfn, mfn);
975 }
976 /* I/O APICs. */
977 for ( i = 0; i < nr_ioapics; i++ )
978 {
979 mfn = paddr_to_pfn(mp_ioapics[i].mpc_apicaddr);
980 if ( smp_found_config )
981 rc |= iomem_deny_access(dom0, mfn, mfn);
982 }
984 BUG_ON(rc != 0);
986 return 0;
987 }
989 /*
990 * Local variables:
991 * mode: C
992 * c-set-style: "BSD"
993 * c-basic-offset: 4
994 * tab-width: 4
995 * indent-tabs-mode: nil
996 * End:
997 */