ia64/xen-unstable

xen/arch/x86/domain_build.c @ 6315:0f69e0adddb0

Fix dom0 memory allocation.
Signed-off-by: Keir Fraser <keir@xensource.com>

author   kaf24@firebug.cl.cam.ac.uk
date     Sun Aug 21 08:14:36 2005 +0000 (2005-08-21)
parents  19ef6202d75f
children 6721abf6b16d
/******************************************************************************
 * domain_build.c
 *
 * Copyright (c) 2002-2005, K A Fraser
 */

#include <xen/config.h>
#include <xen/init.h>
#include <xen/lib.h>
#include <xen/ctype.h>
#include <xen/sched.h>
#include <xen/smp.h>
#include <xen/delay.h>
#include <xen/event.h>
#include <xen/elf.h>
#include <xen/kernel.h>
#include <asm/regs.h>
#include <asm/system.h>
#include <asm/io.h>
#include <asm/processor.h>
#include <asm/desc.h>
#include <asm/i387.h>
#include <asm/shadow.h>

static long dom0_nrpages;

/*
 * dom0_mem:
 *  If +ve:
 *   * The specified amount of memory is allocated to domain 0.
 *  If -ve:
 *   * All of memory is allocated to domain 0, minus the specified amount.
 *  If not specified:
 *   * All of memory is allocated to domain 0, minus 1/16th which is reserved
 *     for uses such as DMA buffers (the reservation is clamped to 128MB).
 */
static void parse_dom0_mem(char *s)
{
    unsigned long long bytes;
    char *t = s;
    if ( *s == '-' )
        t++;
    bytes = parse_size_and_unit(t);
    dom0_nrpages = bytes >> PAGE_SHIFT;
    if ( *s == '-' )
        dom0_nrpages = -dom0_nrpages;
}
custom_param("dom0_mem", parse_dom0_mem);
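
/*
 * Worked example (illustrative): "dom0_mem=-256M" parses bytes = 256MB,
 * giving dom0_nrpages = -(256MB >> PAGE_SHIFT) = -65536 with 4kB pages,
 * i.e. "all memory minus 65536 pages"; "dom0_mem=256M" gives +65536,
 * a fixed 256MB allocation.
 */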

static unsigned int opt_dom0_shadow = 0;
boolean_param("dom0_shadow", opt_dom0_shadow);

static unsigned int opt_dom0_translate = 0;
boolean_param("dom0_translate", opt_dom0_translate);

#if defined(__i386__)
/* No ring-3 access in initial leaf page tables. */
#define L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED)
#define L2_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
#define L3_PROT (_PAGE_PRESENT)
#elif defined(__x86_64__)
/* Allow ring-3 access in long mode as guest cannot use ring 1. */
#define L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_USER)
#define L2_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
#define L3_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
#define L4_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
#endif

#define round_pgup(_p)    (((_p)+(PAGE_SIZE-1))&PAGE_MASK)
#define round_pgdown(_p)  ((_p)&PAGE_MASK)
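
/*
 * Worked example (illustrative): with 4kB pages, round_pgup(0x1234) ==
 * 0x2000 and round_pgdown(0x1234) == 0x1000; page-aligned values pass
 * through both macros unchanged.
 */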

static struct pfn_info *alloc_chunk(struct domain *d, unsigned long max_pages)
{
    struct pfn_info *page;
    unsigned int order;
    /*
     * Allocate up to 2MB at a time:
     *  1. This prevents overflow of get_order() when allocating more than
     *     4GB to domain 0 on a PAE machine.
     *  2. It prevents allocating very large chunks from DMA pools before
     *     the >4GB pool is fully depleted.
     */
    if ( max_pages > (2UL << (20 - PAGE_SHIFT)) )
        max_pages = 2UL << (20 - PAGE_SHIFT);
    order = get_order(max_pages << PAGE_SHIFT);
    if ( (max_pages & (max_pages-1)) != 0 )
        order--;
    while ( (page = alloc_domheap_pages(d, order, 0)) == NULL )
        if ( order-- == 0 )
            break;
    return page;
}
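
/*
 * Worked example (illustrative): a request for 300 pages rounds up to
 * order 9 (512 pages) in get_order(); 300 is not a power of two, so the
 * order drops to 8 (256 pages) and the chunk cannot overshoot the
 * request. If the order-8 allocation fails, the loop retries at order
 * 7, 6, ... down to order 0 before giving up.
 */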

int construct_dom0(struct domain *d,
                   unsigned long _image_start, unsigned long image_len,
                   unsigned long _initrd_start, unsigned long initrd_len,
                   char *cmdline)
{
    int i, rc, dom0_pae, xen_pae, order;
    unsigned long pfn, mfn;
    unsigned long nr_pages;
    unsigned long nr_pt_pages;
    unsigned long alloc_spfn;
    unsigned long alloc_epfn;
    unsigned long count;
    struct pfn_info *page = NULL;
    start_info_t *si;
    struct vcpu *v = d->vcpu[0];
#if defined(__i386__)
    char *image_start  = (char *)_image_start;  /* use lowmem mappings */
    char *initrd_start = (char *)_initrd_start; /* use lowmem mappings */
#elif defined(__x86_64__)
    char *image_start  = __va(_image_start);
    char *initrd_start = __va(_initrd_start);
#endif
#if CONFIG_PAGING_LEVELS >= 4
    l4_pgentry_t *l4tab = NULL, *l4start = NULL;
#endif
#if CONFIG_PAGING_LEVELS >= 3
    l3_pgentry_t *l3tab = NULL, *l3start = NULL;
#endif
    l2_pgentry_t *l2tab = NULL, *l2start = NULL;
    l1_pgentry_t *l1tab = NULL, *l1start = NULL;

    /*
     * This fully describes the memory layout of the initial domain. All
     * *_start addresses are page-aligned, except v_start (and v_end) which
     * are superpage-aligned.
     */
    struct domain_setup_info dsi;
    unsigned long vinitrd_start;
    unsigned long vinitrd_end;
    unsigned long vphysmap_start;
    unsigned long vphysmap_end;
    unsigned long vstartinfo_start;
    unsigned long vstartinfo_end;
    unsigned long vstack_start;
    unsigned long vstack_end;
    unsigned long vpt_start;
    unsigned long vpt_end;
    unsigned long v_end;

    /* Machine address of next candidate page-table page. */
    unsigned long mpt_alloc;

    extern void physdev_init_dom0(struct domain *);
    extern void translate_l2pgtable(
        struct domain *d, l1_pgentry_t *p2m, unsigned long l2mfn);

    /* Sanity! */
    if ( d->domain_id != 0 )
        BUG();
    if ( test_bit(_DOMF_constructed, &d->domain_flags) )
        BUG();

    memset(&dsi, 0, sizeof(struct domain_setup_info));
    dsi.image_addr = (unsigned long)image_start;
    dsi.image_len  = image_len;

    printk("*** LOADING DOMAIN 0 ***\n");

    d->max_pages = ~0U;

    /*
     * If domain 0 allocation isn't specified, reserve 1/16th of available
     * memory for things like DMA buffers. This reservation is clamped to
     * a maximum of 128MB.
     */
    if ( dom0_nrpages == 0 )
    {
        dom0_nrpages = avail_domheap_pages() +
            ((initrd_len + PAGE_SIZE - 1) >> PAGE_SHIFT) +
            ((image_len  + PAGE_SIZE - 1) >> PAGE_SHIFT);
        dom0_nrpages = min(dom0_nrpages / 16, 128L << (20 - PAGE_SHIFT));
        dom0_nrpages = -dom0_nrpages;
    }
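
    /*
     * Worked example (illustrative): with 1GB available (262144 pages of
     * 4kB), 262144/16 = 16384 pages (64MB) are reserved. With 4GB, 1/16th
     * would be 256MB, so the clamp of 128L << (20 - PAGE_SHIFT) = 32768
     * pages (128MB with 4kB pages) applies instead.
     */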

    /* Negative memory specification means "all memory - specified amount". */
    if ( dom0_nrpages < 0 )
        nr_pages = avail_domheap_pages() +
            ((initrd_len + PAGE_SIZE - 1) >> PAGE_SHIFT) +
            ((image_len  + PAGE_SIZE - 1) >> PAGE_SHIFT) +
            dom0_nrpages;
    else
        nr_pages = dom0_nrpages;

    if ( (rc = parseelfimage(&dsi)) != 0 )
        return rc;

    if ( dsi.xen_section_string == NULL )
    {
        printk("Not a Xen-ELF image: '__xen_guest' section not found.\n");
        return -EINVAL;
    }

    dom0_pae = !!strstr(dsi.xen_section_string, "PAE=yes");
    xen_pae  = (CONFIG_PAGING_LEVELS == 3);
    if ( dom0_pae != xen_pae )
    {
        printk("PAE mode mismatch between Xen and DOM0 (xen=%s, dom0=%s)\n",
               xen_pae ? "yes" : "no", dom0_pae ? "yes" : "no");
        return -EINVAL;
    }
    if ( strstr(dsi.xen_section_string, "SHADOW=translate") )
        opt_dom0_translate = 1;

    /* Align load address to 4MB boundary. */
    dsi.v_start &= ~((1UL<<22)-1);

    /*
     * Why do we need this? The number of page-table frames depends on the
     * size of the bootstrap address space. But the size of the address space
     * depends on the number of page-table frames (since each one is mapped
     * read-only). We have a pair of simultaneous equations in two unknowns,
     * which we solve by exhaustive search.
     */
    vinitrd_start    = round_pgup(dsi.v_end);
    vinitrd_end      = vinitrd_start + initrd_len;
    vphysmap_start   = round_pgup(vinitrd_end);
    vphysmap_end     = vphysmap_start + (nr_pages * sizeof(u32));
    vpt_start        = round_pgup(vphysmap_end);
    for ( nr_pt_pages = 2; ; nr_pt_pages++ )
    {
        vpt_end          = vpt_start + (nr_pt_pages * PAGE_SIZE);
        vstartinfo_start = vpt_end;
        vstartinfo_end   = vstartinfo_start + PAGE_SIZE;
        vstack_start     = vstartinfo_end;
        vstack_end       = vstack_start + PAGE_SIZE;
        v_end            = (vstack_end + (1UL<<22)-1) & ~((1UL<<22)-1);
        if ( (v_end - vstack_end) < (512UL << 10) )
            v_end += 1UL << 22; /* Add extra 4MB to get >= 512kB padding. */
#if defined(__i386__) && !defined(CONFIG_X86_PAE)
        if ( (((v_end - dsi.v_start + ((1UL<<L2_PAGETABLE_SHIFT)-1)) >>
               L2_PAGETABLE_SHIFT) + 1) <= nr_pt_pages )
            break;
#elif defined(__i386__) && defined(CONFIG_X86_PAE)
        /* 5 pages: 1x 3rd + 4x 2nd level */
        if ( (((v_end - dsi.v_start + ((1UL<<L2_PAGETABLE_SHIFT)-1)) >>
               L2_PAGETABLE_SHIFT) + 5) <= nr_pt_pages )
            break;
#elif defined(__x86_64__)
#define NR(_l,_h,_s) \
    (((((_h) + ((1UL<<(_s))-1)) & ~((1UL<<(_s))-1)) - \
      ((_l) & ~((1UL<<(_s))-1))) >> (_s))
        if ( (1 + /* # L4 */
              NR(dsi.v_start, v_end, L4_PAGETABLE_SHIFT) + /* # L3 */
              NR(dsi.v_start, v_end, L3_PAGETABLE_SHIFT) + /* # L2 */
              NR(dsi.v_start, v_end, L2_PAGETABLE_SHIFT))  /* # L1 */
             <= nr_pt_pages )
            break;
#endif
    }
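
    /*
     * Worked example (illustrative): on non-PAE i386 (4MB superpages,
     * L2_PAGETABLE_SHIFT == 22), a 16MB bootstrap region needs 4 L1 pages
     * plus 1 L2 page, so the loop exits once nr_pt_pages reaches 5 and
     * the region (which itself grows with nr_pt_pages) still fits. The
     * x86_64 variant counts the spanned L3/L2/L1 pages with NR() plus a
     * single L4 page in the same way.
     */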

    order = get_order(v_end - dsi.v_start);
    if ( (1UL << order) > nr_pages )
        panic("Domain 0 allocation is too small for kernel image.\n");

    /* Allocate from DMA pool: PAE L3 table must be below 4GB boundary. */
    if ( (page = alloc_domheap_pages(d, order, ALLOC_DOM_DMA)) == NULL )
        panic("Not enough RAM for domain 0 allocation.\n");
    alloc_spfn = page_to_pfn(page);
    alloc_epfn = alloc_spfn + d->tot_pages;

    printk("PHYSICAL MEMORY ARRANGEMENT:\n"
           " Dom0 alloc.:   %"PRIphysaddr"->%"PRIphysaddr,
           pfn_to_phys(alloc_spfn), pfn_to_phys(alloc_epfn));
    if ( d->tot_pages < nr_pages )
        printk(" (%lu pages to be allocated)",
               nr_pages - d->tot_pages);
    printk("\nVIRTUAL MEMORY ARRANGEMENT:\n"
           " Loaded kernel: %p->%p\n"
           " Init. ramdisk: %p->%p\n"
           " Phys-Mach map: %p->%p\n"
           " Page tables:   %p->%p\n"
           " Start info:    %p->%p\n"
           " Boot stack:    %p->%p\n"
           " TOTAL:         %p->%p\n",
           _p(dsi.v_kernstart), _p(dsi.v_kernend),
           _p(vinitrd_start), _p(vinitrd_end),
           _p(vphysmap_start), _p(vphysmap_end),
           _p(vpt_start), _p(vpt_end),
           _p(vstartinfo_start), _p(vstartinfo_end),
           _p(vstack_start), _p(vstack_end),
           _p(dsi.v_start), _p(v_end));
    printk(" ENTRY ADDRESS: %p\n", _p(dsi.v_kernentry));

    if ( (v_end - dsi.v_start) > (nr_pages * PAGE_SIZE) )
    {
        printk("Initial guest OS requires too much space\n"
               "(%luMB is greater than %luMB limit)\n",
               (v_end-dsi.v_start)>>20, (nr_pages<<PAGE_SHIFT)>>20);
        return -ENOMEM;
    }

    mpt_alloc = (vpt_start - dsi.v_start) +
        (unsigned long)pfn_to_phys(alloc_spfn);

    /*
     * We're basically forcing default RPLs to 1, so that our "what privilege
     * level are we returning to?" logic works.
     */
    v->arch.guest_context.kernel_ss = FLAT_KERNEL_SS;
    for ( i = 0; i < 256; i++ )
        v->arch.guest_context.trap_ctxt[i].cs = FLAT_KERNEL_CS;

#if defined(__i386__)

    v->arch.guest_context.failsafe_callback_cs = FLAT_KERNEL_CS;
    v->arch.guest_context.event_callback_cs    = FLAT_KERNEL_CS;

    /*
     * Protect the lowest 1GB of memory. We use a temporary mapping there
     * from which we copy the kernel and ramdisk images.
     */
    if ( dsi.v_start < (1UL<<30) )
    {
        printk("Initial loading isn't allowed in the lowest 1GB of memory.\n");
        return -EINVAL;
    }

    /* WARNING: The new domain must have its 'processor' field filled in! */

#if CONFIG_PAGING_LEVELS == 3
    l3start = l3tab = (l3_pgentry_t *)mpt_alloc; mpt_alloc += PAGE_SIZE;
    l2start = l2tab = (l2_pgentry_t *)mpt_alloc; mpt_alloc += 4*PAGE_SIZE;
    memcpy(l2tab, idle_pg_table_l2, 4*PAGE_SIZE);
    for ( i = 0; i < 4; i++ )
    {
        l3tab[i] = l3e_from_paddr((u32)l2tab + i*PAGE_SIZE, L3_PROT);
        l2tab[(LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT)+i] =
            l2e_from_paddr((u32)l2tab + i*PAGE_SIZE, __PAGE_HYPERVISOR);
    }
    {
        unsigned long va;
        for ( va = PERDOMAIN_VIRT_START; va < PERDOMAIN_VIRT_END;
              va += (1 << L2_PAGETABLE_SHIFT) )
        {
            l2tab[va >> L2_PAGETABLE_SHIFT] =
                l2e_from_paddr(__pa(d->arch.mm_perdomain_pt) +
                               (va-PERDOMAIN_VIRT_START),
                               __PAGE_HYPERVISOR);
        }
    }
    v->arch.guest_table = mk_pagetable((unsigned long)l3start);
#else
    l2start = l2tab = (l2_pgentry_t *)mpt_alloc; mpt_alloc += PAGE_SIZE;
    memcpy(l2tab, &idle_pg_table[0], PAGE_SIZE);
    l2tab[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] =
        l2e_from_paddr((unsigned long)l2start, __PAGE_HYPERVISOR);
    l2tab[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] =
        l2e_from_paddr(__pa(d->arch.mm_perdomain_pt), __PAGE_HYPERVISOR);
    v->arch.guest_table = mk_pagetable((unsigned long)l2start);
#endif

    l2tab += l2_linear_offset(dsi.v_start);
    mfn = alloc_spfn;
    for ( count = 0; count < ((v_end-dsi.v_start)>>PAGE_SHIFT); count++ )
    {
        if ( !((unsigned long)l1tab & (PAGE_SIZE-1)) )
        {
            l1start = l1tab = (l1_pgentry_t *)mpt_alloc;
            mpt_alloc += PAGE_SIZE;
            *l2tab = l2e_from_paddr((unsigned long)l1start, L2_PROT);
            l2tab++;
            clear_page(l1tab);
            if ( count == 0 )
                l1tab += l1_table_offset(dsi.v_start);
        }
        *l1tab = l1e_from_pfn(mfn, L1_PROT);
        l1tab++;

        page = &frame_table[mfn];
        if ( !get_page_and_type(page, d, PGT_writable_page) )
            BUG();

        mfn++;
    }

    /* Pages that are part of page tables must be read only. */
    l2tab = l2start + l2_linear_offset(vpt_start);
    l1start = l1tab = (l1_pgentry_t *)(u32)l2e_get_paddr(*l2tab);
    l1tab += l1_table_offset(vpt_start);
    for ( count = 0; count < nr_pt_pages; count++ )
    {
        page = &frame_table[l1e_get_pfn(*l1tab)];
        if ( !opt_dom0_shadow )
            l1e_remove_flags(*l1tab, _PAGE_RW);
        else
            if ( !get_page_type(page, PGT_writable_page) )
                BUG();

#if CONFIG_PAGING_LEVELS == 3
        switch ( count )
        {
        case 0:
            page->u.inuse.type_info &= ~PGT_type_mask;
            page->u.inuse.type_info |= PGT_l3_page_table;
            get_page(page, d); /* an extra ref because of readable mapping */

            /* Get another ref to L3 page so that it can be pinned. */
            if ( !get_page_and_type(page, d, PGT_l3_page_table) )
                BUG();
            set_bit(_PGT_pinned, &page->u.inuse.type_info);
            break;
        case 1 ... 4:
            page->u.inuse.type_info &= ~PGT_type_mask;
            page->u.inuse.type_info |= PGT_l2_page_table;
            page->u.inuse.type_info |=
                (count-1) << PGT_va_shift;
            get_page(page, d); /* an extra ref because of readable mapping */
            break;
        default:
            page->u.inuse.type_info &= ~PGT_type_mask;
            page->u.inuse.type_info |= PGT_l1_page_table;
            page->u.inuse.type_info |=
                ((dsi.v_start>>L2_PAGETABLE_SHIFT)+(count-5))<<PGT_va_shift;
            get_page(page, d); /* an extra ref because of readable mapping */
            break;
        }
#else
        if ( count == 0 )
        {
            page->u.inuse.type_info &= ~PGT_type_mask;
            page->u.inuse.type_info |= PGT_l2_page_table;

            /*
             * No longer writable: decrement the type_count.
             * Installed as CR3: increment both the ref_count and type_count.
             * Net: just increment the ref_count.
             */
            get_page(page, d); /* an extra ref because of readable mapping */

            /* Get another ref to L2 page so that it can be pinned. */
            if ( !get_page_and_type(page, d, PGT_l2_page_table) )
                BUG();
            set_bit(_PGT_pinned, &page->u.inuse.type_info);
        }
        else
        {
            page->u.inuse.type_info &= ~PGT_type_mask;
            page->u.inuse.type_info |= PGT_l1_page_table;
            page->u.inuse.type_info |=
                ((dsi.v_start>>L2_PAGETABLE_SHIFT)+(count-1))<<PGT_va_shift;

            /*
             * No longer writable: decrement the type_count.
             * This is an L1 page, installed in a validated L2 page:
             * increment both the ref_count and type_count.
             * Net: just increment the ref_count.
             */
            get_page(page, d); /* an extra ref because of readable mapping */
        }
#endif
        if ( !((unsigned long)++l1tab & (PAGE_SIZE - 1)) )
            l1start = l1tab = (l1_pgentry_t *)(u32)l2e_get_paddr(*++l2tab);
    }

#elif defined(__x86_64__)

    /* Overlap with Xen protected area? */
    if ( (dsi.v_start < HYPERVISOR_VIRT_END) &&
         (v_end > HYPERVISOR_VIRT_START) )
    {
        printk("DOM0 image overlaps with Xen private area.\n");
        return -EINVAL;
    }

    /* WARNING: The new domain must have its 'processor' field filled in! */
    phys_to_page(mpt_alloc)->u.inuse.type_info = PGT_l4_page_table;
    l4start = l4tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
    memcpy(l4tab, &idle_pg_table[0], PAGE_SIZE);
    l4tab[l4_table_offset(LINEAR_PT_VIRT_START)] =
        l4e_from_paddr(__pa(l4start), __PAGE_HYPERVISOR);
    l4tab[l4_table_offset(PERDOMAIN_VIRT_START)] =
        l4e_from_paddr(__pa(d->arch.mm_perdomain_l3), __PAGE_HYPERVISOR);
    v->arch.guest_table = mk_pagetable(__pa(l4start));

    l4tab += l4_table_offset(dsi.v_start);
    mfn = alloc_spfn;
    for ( count = 0; count < ((v_end-dsi.v_start)>>PAGE_SHIFT); count++ )
    {
        if ( !((unsigned long)l1tab & (PAGE_SIZE-1)) )
        {
            phys_to_page(mpt_alloc)->u.inuse.type_info = PGT_l1_page_table;
            l1start = l1tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
            clear_page(l1tab);
            if ( count == 0 )
                l1tab += l1_table_offset(dsi.v_start);
            if ( !((unsigned long)l2tab & (PAGE_SIZE-1)) )
            {
                phys_to_page(mpt_alloc)->u.inuse.type_info = PGT_l2_page_table;
                l2start = l2tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
                clear_page(l2tab);
                if ( count == 0 )
                    l2tab += l2_table_offset(dsi.v_start);
                if ( !((unsigned long)l3tab & (PAGE_SIZE-1)) )
                {
                    phys_to_page(mpt_alloc)->u.inuse.type_info =
                        PGT_l3_page_table;
                    l3start = l3tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
                    clear_page(l3tab);
                    if ( count == 0 )
                        l3tab += l3_table_offset(dsi.v_start);
                    *l4tab = l4e_from_paddr(__pa(l3start), L4_PROT);
                    l4tab++;
                }
                *l3tab = l3e_from_paddr(__pa(l2start), L3_PROT);
                l3tab++;
            }
            *l2tab = l2e_from_paddr(__pa(l1start), L2_PROT);
            l2tab++;
        }
        *l1tab = l1e_from_pfn(mfn, L1_PROT);
        l1tab++;

        page = &frame_table[mfn];
        if ( (page->u.inuse.type_info == 0) &&
             !get_page_and_type(page, d, PGT_writable_page) )
            BUG();

        mfn++;
    }

    /* Pages that are part of page tables must be read only. */
    l4tab = l4start + l4_table_offset(vpt_start);
    l3start = l3tab = l4e_to_l3e(*l4tab);
    l3tab += l3_table_offset(vpt_start);
    l2start = l2tab = l3e_to_l2e(*l3tab);
    l2tab += l2_table_offset(vpt_start);
    l1start = l1tab = l2e_to_l1e(*l2tab);
    l1tab += l1_table_offset(vpt_start);
    for ( count = 0; count < nr_pt_pages; count++ )
    {
        l1e_remove_flags(*l1tab, _PAGE_RW);
        page = &frame_table[l1e_get_pfn(*l1tab)];

        /* Read-only mapping + PGC_allocated + page-table page. */
        page->count_info         = PGC_allocated | 3;
        page->u.inuse.type_info |= PGT_validated | 1;

        /* Top-level p.t. is pinned. */
        if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_l4_page_table )
        {
            page->count_info        += 1;
            page->u.inuse.type_info += 1 | PGT_pinned;
        }

        /* Iterate. */
        if ( !((unsigned long)++l1tab & (PAGE_SIZE - 1)) )
        {
            if ( !((unsigned long)++l2tab & (PAGE_SIZE - 1)) )
            {
                if ( !((unsigned long)++l3tab & (PAGE_SIZE - 1)) )
                    l3start = l3tab = l4e_to_l3e(*++l4tab);
                l2start = l2tab = l3e_to_l2e(*l3tab);
            }
            l1start = l1tab = l2e_to_l1e(*l2tab);
        }
    }

#endif /* __x86_64__ */

    /* Mask all upcalls... */
    for ( i = 0; i < MAX_VIRT_CPUS; i++ )
        d->shared_info->vcpu_data[i].evtchn_upcall_mask = 1;
    d->shared_info->n_vcpu = num_online_cpus();

    /* Set up monitor table. */
    update_pagetables(v);

    /* Install the new page tables. */
    local_irq_disable();
    write_ptbase(v);

    /* Copy the OS image and free temporary buffer. */
    (void)loadelfimage(&dsi);

    init_domheap_pages(
        _image_start, (_image_start+image_len+PAGE_SIZE-1) & PAGE_MASK);

    /* Copy the initial ramdisk and free temporary buffer. */
    if ( initrd_len != 0 )
    {
        memcpy((void *)vinitrd_start, initrd_start, initrd_len);
        init_domheap_pages(
            _initrd_start, (_initrd_start+initrd_len+PAGE_SIZE-1) & PAGE_MASK);
    }

    d->next_io_page = max_page;

    /* Set up start info area. */
    si = (start_info_t *)vstartinfo_start;
    memset(si, 0, PAGE_SIZE);
    si->nr_pages = nr_pages;

    if ( opt_dom0_translate )
    {
        si->shared_info = d->next_io_page << PAGE_SHIFT;
        set_machinetophys(virt_to_phys(d->shared_info) >> PAGE_SHIFT,
                          d->next_io_page);
        d->next_io_page++;
    }
    else
        si->shared_info = virt_to_phys(d->shared_info);

    si->flags        = SIF_PRIVILEGED | SIF_INITDOMAIN;
    si->pt_base      = vpt_start;
    si->nr_pt_frames = nr_pt_pages;
    si->mfn_list     = vphysmap_start;

    /* Write the phys->machine and machine->phys table entries. */
    for ( pfn = 0; pfn < d->tot_pages; pfn++ )
    {
        mfn = pfn + alloc_spfn;
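        /*
         * Note: in debug builds the pfns beyond the bootstrap image are
         * mapped to machine frames in *reverse* order, presumably so a
         * guest that wrongly assumes contiguous machine memory fails
         * quickly rather than subtly.
         */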
#ifndef NDEBUG
#define REVERSE_START ((v_end - dsi.v_start) >> PAGE_SHIFT)
        if ( !opt_dom0_translate && (pfn > REVERSE_START) )
            mfn = alloc_epfn - (pfn - REVERSE_START);
#endif
        ((u32 *)vphysmap_start)[pfn] = mfn;
        machine_to_phys_mapping[mfn] = pfn;
    }
    while ( pfn < nr_pages )
    {
        if ( (page = alloc_chunk(d, nr_pages - d->tot_pages)) == NULL )
            panic("Not enough RAM for DOM0 reservation.\n");
        while ( pfn < d->tot_pages )
        {
            mfn = page_to_pfn(page);
#ifndef NDEBUG
#define pfn (nr_pages - 1 - (pfn - (alloc_epfn - alloc_spfn)))
#endif
            ((u32 *)vphysmap_start)[pfn] = mfn;
            machine_to_phys_mapping[mfn] = pfn;
#undef pfn
            page++; pfn++;
        }
    }

    if ( initrd_len != 0 )
    {
        si->mod_start = vinitrd_start;
        si->mod_len   = initrd_len;
        printk("Initrd len 0x%lx, start at 0x%lx\n",
               si->mod_len, si->mod_start);
    }

    memset(si->cmd_line, 0, sizeof(si->cmd_line));
    if ( cmdline != NULL )
        strncpy((char *)si->cmd_line, cmdline, sizeof(si->cmd_line)-1);

    /* Reinstate the caller's page tables. */
    write_ptbase(current);
    local_irq_enable();

#if defined(__i386__)
    /* Destroy low mappings - they were only for our convenience. */
    zap_low_mappings(l2start);
    zap_low_mappings(idle_pg_table_l2);
#endif

    /* DOM0 gets access to everything. */
    physdev_init_dom0(d);

    init_domain_time(d);

    set_bit(_DOMF_constructed, &d->domain_flags);

    new_thread(v, dsi.v_kernentry, vstack_end, vstartinfo_start);

    if ( opt_dom0_shadow || opt_dom0_translate )
    {
        printk("dom0: shadow enable\n");
        shadow_mode_enable(d, (opt_dom0_translate
                               ? SHM_enable | SHM_refcounts | SHM_translate
                               : SHM_enable));
        if ( opt_dom0_translate )
        {
            printk("dom0: shadow translate\n");
#if defined(__i386__) && defined(CONFIG_X86_PAE)
            printk("FIXME: PAE code needed here: %s:%d (%s)\n",
                   __FILE__, __LINE__, __FUNCTION__);
            for ( ; ; )
                __asm__ __volatile__ ( "hlt" );
#else
            /* Hmm, what does this do? It doesn't look portable across
               32/64-bit and PAE/non-PAE ...
               -- kraxel */

            /* mafetter: This code is mostly a hack in order to be able to
             * test with dom0's which are running with shadow translate.
             * I expect we'll rip this out once we have a stable set of
             * domU clients which use the various shadow modes, but it's
             * useful to leave this here for now...
             */

            // Map this domain's p2m table into the current page table,
            // so that we can easily access it.
            //
            ASSERT( root_get_intpte(idle_pg_table[1]) == 0 );
            ASSERT( pagetable_get_paddr(d->arch.phys_table) );
            idle_pg_table[1] = root_from_paddr(
                pagetable_get_paddr(d->arch.phys_table), __PAGE_HYPERVISOR);
            translate_l2pgtable(d, (l1_pgentry_t *)(1u << L2_PAGETABLE_SHIFT),
                                pagetable_get_pfn(v->arch.guest_table));
            idle_pg_table[1] = root_empty();
            local_flush_tlb();
#endif
        }

        update_pagetables(v); /* XXX SMP */
        printk("dom0: shadow setup done\n");
    }

    return 0;
}

int elf_sanity_check(Elf_Ehdr *ehdr)
{
    if ( !IS_ELF(*ehdr) ||
#if defined(__i386__)
         (ehdr->e_ident[EI_CLASS] != ELFCLASS32) ||
         (ehdr->e_machine != EM_386) ||
#elif defined(__x86_64__)
         (ehdr->e_ident[EI_CLASS] != ELFCLASS64) ||
         (ehdr->e_machine != EM_X86_64) ||
#endif
         (ehdr->e_ident[EI_DATA] != ELFDATA2LSB) ||
         (ehdr->e_type != ET_EXEC) )
    {
        printk("DOM0 image is not a Xen-compatible Elf image.\n");
        return 0;
    }

    return 1;
}

/*
 * Local variables:
 * mode: C
 * c-set-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */