ia64/xen-unstable

view xen/arch/x86/domain_build.c @ 6702:e3fd0fa58364

Rename get_order() to get_order_from_bytes() and add new function
get_order_from_pages(). Fix HYPERVISOR_memory_op(), properly this time.

Signed-off-by: Keir Fraser <keir@xensource.com>
author   kaf24@firebug.cl.cam.ac.uk
date     Thu Sep 08 17:25:52 2005 +0000 (2005-09-08)
parents  8db9c5873b9b
children 3bde4219c681 aa0990ef260f

line source
/******************************************************************************
 * domain_build.c
 *
 * Copyright (c) 2002-2005, K A Fraser
 */

#include <xen/config.h>
#include <xen/init.h>
#include <xen/lib.h>
#include <xen/ctype.h>
#include <xen/sched.h>
#include <xen/smp.h>
#include <xen/delay.h>
#include <xen/event.h>
#include <xen/elf.h>
#include <xen/kernel.h>
#include <asm/regs.h>
#include <asm/system.h>
#include <asm/io.h>
#include <asm/processor.h>
#include <asm/desc.h>
#include <asm/i387.h>
#include <asm/physdev.h>
#include <asm/shadow.h>

static long dom0_nrpages;

/*
 * dom0_mem:
 *  If +ve:
 *   * The specified amount of memory is allocated to domain 0.
 *  If -ve:
 *   * All of memory is allocated to domain 0, minus the specified amount.
 *  If not specified:
 *   * All of memory is allocated to domain 0, minus 1/16th which is reserved
 *     for uses such as DMA buffers (the reservation is clamped to 128MB).
 */
static void parse_dom0_mem(char *s)
{
    unsigned long long bytes;
    char *t = s;
    if ( *s == '-' )
        t++;
    bytes = parse_size_and_unit(t);
    dom0_nrpages = bytes >> PAGE_SHIFT;
    if ( *s == '-' )
        dom0_nrpages = -dom0_nrpages;
}
custom_param("dom0_mem", parse_dom0_mem);
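
/*
 * Worked example (illustrative, assuming 4kB pages so PAGE_SHIFT == 12):
 *   dom0_mem=192M  -> bytes = 0x0c000000, dom0_nrpages =  49152
 *   dom0_mem=-192M -> bytes = 0x0c000000, dom0_nrpages = -49152,
 *                     i.e. "all of memory except 192MB".
 */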

static unsigned int opt_dom0_shadow = 0;
boolean_param("dom0_shadow", opt_dom0_shadow);

static unsigned int opt_dom0_translate = 0;
boolean_param("dom0_translate", opt_dom0_translate);

#if defined(__i386__)
/* No ring-3 access in initial leaf page tables. */
#define L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED)
#define L2_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
#define L3_PROT (_PAGE_PRESENT)
#elif defined(__x86_64__)
/* Allow ring-3 access in long mode as guest cannot use ring 1. */
#define L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_USER)
#define L2_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
#define L3_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
#define L4_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
#endif

#define round_pgup(_p)   (((_p)+(PAGE_SIZE-1))&PAGE_MASK)
#define round_pgdown(_p) ((_p)&PAGE_MASK)
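
/*
 * E.g. with 4kB pages (PAGE_MASK == ~0xfff):
 *   round_pgup(0x1234)   == 0x2000
 *   round_pgdown(0x1234) == 0x1000
 */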

static struct pfn_info *alloc_chunk(struct domain *d, unsigned long max_pages)
{
    struct pfn_info *page;
    unsigned int order;
    /*
     * Allocate up to 2MB at a time: this prevents allocating very large
     * chunks from DMA pools before the >4GB pool is fully depleted.
     */
    if ( max_pages > (2UL << (20 - PAGE_SHIFT)) )
        max_pages = 2UL << (20 - PAGE_SHIFT);
    order = get_order_from_pages(max_pages);
    if ( (max_pages & (max_pages-1)) != 0 )
        order--;
    while ( (page = alloc_domheap_pages(d, order, 0)) == NULL )
        if ( order-- == 0 )
            break;
    return page;
}
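
/*
 * Allocation sketch (illustrative): for max_pages == 300,
 * get_order_from_pages(300) returns 9 (2^9 = 512 >= 300); 300 is not a
 * power of two (300 & 299 != 0), so order drops to 8 to avoid
 * over-allocating, and we then try progressively smaller chunks of
 * 256, 128, ... 1 page, giving up only when a single page fails.
 */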

int construct_dom0(struct domain *d,
                   unsigned long _image_start, unsigned long image_len,
                   unsigned long _initrd_start, unsigned long initrd_len,
                   char *cmdline)
{
    int i, rc, dom0_pae, xen_pae, order;
    unsigned long pfn, mfn;
    unsigned long nr_pages;
    unsigned long nr_pt_pages;
    unsigned long alloc_spfn;
    unsigned long alloc_epfn;
    unsigned long count;
    struct pfn_info *page = NULL;
    start_info_t *si;
    struct vcpu *v = d->vcpu[0];
#if defined(__i386__)
    char *image_start  = (char *)_image_start;  /* use lowmem mappings */
    char *initrd_start = (char *)_initrd_start; /* use lowmem mappings */
#elif defined(__x86_64__)
    char *image_start  = __va(_image_start);
    char *initrd_start = __va(_initrd_start);
#endif
#if CONFIG_PAGING_LEVELS >= 4
    l4_pgentry_t *l4tab = NULL, *l4start = NULL;
#endif
#if CONFIG_PAGING_LEVELS >= 3
    l3_pgentry_t *l3tab = NULL, *l3start = NULL;
#endif
    l2_pgentry_t *l2tab = NULL, *l2start = NULL;
    l1_pgentry_t *l1tab = NULL, *l1start = NULL;

    /*
     * This fully describes the memory layout of the initial domain. All
     * *_start addresses are page-aligned, except v_start (and v_end) which
     * are superpage-aligned.
     */
    struct domain_setup_info dsi;
    unsigned long vinitrd_start;
    unsigned long vinitrd_end;
    unsigned long vphysmap_start;
    unsigned long vphysmap_end;
    unsigned long vstartinfo_start;
    unsigned long vstartinfo_end;
    unsigned long vstack_start;
    unsigned long vstack_end;
    unsigned long vpt_start;
    unsigned long vpt_end;
    unsigned long v_end;

    /* Machine address of next candidate page-table page. */
    unsigned long mpt_alloc;

    extern void physdev_init_dom0(struct domain *);
    extern void translate_l2pgtable(
        struct domain *d, l1_pgentry_t *p2m, unsigned long l2mfn);

    /* Sanity! */
    if ( d->domain_id != 0 )
        BUG();
    if ( test_bit(_DOMF_constructed, &d->domain_flags) )
        BUG();

    memset(&dsi, 0, sizeof(struct domain_setup_info));
    dsi.image_addr = (unsigned long)image_start;
    dsi.image_len  = image_len;

    printk("*** LOADING DOMAIN 0 ***\n");

    d->max_pages = ~0U;

    /*
     * If domain 0 allocation isn't specified, reserve 1/16th of available
     * memory for things like DMA buffers. This reservation is clamped to
     * a maximum of 128MB.
     */
    if ( dom0_nrpages == 0 )
    {
        dom0_nrpages = avail_domheap_pages() +
            ((initrd_len + PAGE_SIZE - 1) >> PAGE_SHIFT) +
            ((image_len  + PAGE_SIZE - 1) >> PAGE_SHIFT);
        dom0_nrpages = min(dom0_nrpages / 16, 128L << (20 - PAGE_SHIFT));
        dom0_nrpages = -dom0_nrpages;
    }
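
    /*
     * Illustrative numbers, assuming 4kB pages: with 1GB of free domheap
     * and small kernel/initrd images, this yields roughly 256k pages;
     * 1/16th of that is 16k pages (64MB), below the 128MB clamp, so dom0
     * receives everything except ~64MB.
     */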

    /* Negative memory specification means "all memory - specified amount". */
    if ( dom0_nrpages < 0 )
        nr_pages = avail_domheap_pages() +
            ((initrd_len + PAGE_SIZE - 1) >> PAGE_SHIFT) +
            ((image_len  + PAGE_SIZE - 1) >> PAGE_SHIFT) +
            dom0_nrpages;
    else
        nr_pages = dom0_nrpages;

    if ( (rc = parseelfimage(&dsi)) != 0 )
        return rc;

    if ( dsi.xen_section_string == NULL )
    {
        printk("Not a Xen-ELF image: '__xen_guest' section not found.\n");
        return -EINVAL;
    }

    dom0_pae = !!strstr(dsi.xen_section_string, "PAE=yes");
    xen_pae  = (CONFIG_PAGING_LEVELS == 3);
    if ( dom0_pae != xen_pae )
    {
        printk("PAE mode mismatch between Xen and DOM0 (xen=%s, dom0=%s)\n",
               xen_pae ? "yes" : "no", dom0_pae ? "yes" : "no");
        return -EINVAL;
    }
    if ( strstr(dsi.xen_section_string, "SHADOW=translate") )
        opt_dom0_translate = 1;

    /* Align load address to 4MB boundary. */
    dsi.v_start &= ~((1UL<<22)-1);

    /*
     * Why do we need this? The number of page-table frames depends on the
     * size of the bootstrap address space. But the size of the address space
     * depends on the number of page-table frames (since each one is mapped
     * read-only). We have a pair of simultaneous equations in two unknowns,
     * which we solve by exhaustive search.
     */
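    /*
     * Concretely (non-PAE x86-32 case below): each L1 page maps 4MB, so
     * covering (v_end - v_start) bytes needs ceil(size/4MB) + 1 frames;
     * but growing nr_pt_pages also grows v_end, so we simply bump
     * nr_pt_pages until the estimate covers the address space that it
     * itself enlarges.
     */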
    vinitrd_start    = round_pgup(dsi.v_end);
    vinitrd_end      = vinitrd_start + initrd_len;
    vphysmap_start   = round_pgup(vinitrd_end);
    vphysmap_end     = vphysmap_start + (nr_pages * sizeof(unsigned long));
    vstartinfo_start = round_pgup(vphysmap_end);
    vstartinfo_end   = vstartinfo_start + PAGE_SIZE;
    vpt_start        = vstartinfo_end;
    for ( nr_pt_pages = 2; ; nr_pt_pages++ )
    {
        vpt_end      = vpt_start + (nr_pt_pages * PAGE_SIZE);
        vstack_start = vpt_end;
        vstack_end   = vstack_start + PAGE_SIZE;
        v_end        = (vstack_end + (1UL<<22)-1) & ~((1UL<<22)-1);
        if ( (v_end - vstack_end) < (512UL << 10) )
            v_end += 1UL << 22; /* Add extra 4MB to get >= 512kB padding. */
#if defined(__i386__) && !defined(CONFIG_X86_PAE)
        if ( (((v_end - dsi.v_start + ((1UL<<L2_PAGETABLE_SHIFT)-1)) >>
               L2_PAGETABLE_SHIFT) + 1) <= nr_pt_pages )
            break;
#elif defined(__i386__) && defined(CONFIG_X86_PAE)
        /* 5 pages: 1x 3rd + 4x 2nd level */
        if ( (((v_end - dsi.v_start + ((1UL<<L2_PAGETABLE_SHIFT)-1)) >>
               L2_PAGETABLE_SHIFT) + 5) <= nr_pt_pages )
            break;
#elif defined(__x86_64__)
#define NR(_l,_h,_s) \
    (((((_h) + ((1UL<<(_s))-1)) & ~((1UL<<(_s))-1)) - \
       ((_l) & ~((1UL<<(_s))-1))) >> (_s))
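/* NR(l,h,s) counts the (1UL<<s)-byte slots spanned by [l,h): round h up
 * and l down to slot boundaries, subtract, and shift. That is exactly the
 * number of page-table pages needed at the level whose shift is s. */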
        if ( (1 + /* # L4 */
              NR(dsi.v_start, v_end, L4_PAGETABLE_SHIFT) + /* # L3 */
              NR(dsi.v_start, v_end, L3_PAGETABLE_SHIFT) + /* # L2 */
              NR(dsi.v_start, v_end, L2_PAGETABLE_SHIFT))  /* # L1 */
             <= nr_pt_pages )
            break;
#endif
    }

    order = get_order_from_bytes(v_end - dsi.v_start);
    if ( (1UL << order) > nr_pages )
        panic("Domain 0 allocation is too small for kernel image.\n");

    /* Allocate from DMA pool: PAE L3 table must be below 4GB boundary. */
    if ( (page = alloc_domheap_pages(d, order, ALLOC_DOM_DMA)) == NULL )
        panic("Not enough RAM for domain 0 allocation.\n");
    alloc_spfn = page_to_pfn(page);
    alloc_epfn = alloc_spfn + d->tot_pages;

    printk("PHYSICAL MEMORY ARRANGEMENT:\n"
           " Dom0 alloc.:   %"PRIphysaddr"->%"PRIphysaddr,
           pfn_to_phys(alloc_spfn), pfn_to_phys(alloc_epfn));
    if ( d->tot_pages < nr_pages )
        printk(" (%lu pages to be allocated)",
               nr_pages - d->tot_pages);
    printk("\nVIRTUAL MEMORY ARRANGEMENT:\n"
           " Loaded kernel: %p->%p\n"
           " Init. ramdisk: %p->%p\n"
           " Phys-Mach map: %p->%p\n"
           " Start info:    %p->%p\n"
           " Page tables:   %p->%p\n"
           " Boot stack:    %p->%p\n"
           " TOTAL:         %p->%p\n",
           _p(dsi.v_kernstart), _p(dsi.v_kernend),
           _p(vinitrd_start), _p(vinitrd_end),
           _p(vphysmap_start), _p(vphysmap_end),
           _p(vstartinfo_start), _p(vstartinfo_end),
           _p(vpt_start), _p(vpt_end),
           _p(vstack_start), _p(vstack_end),
           _p(dsi.v_start), _p(v_end));
    printk(" ENTRY ADDRESS: %p\n", _p(dsi.v_kernentry));

    if ( (v_end - dsi.v_start) > (nr_pages * PAGE_SIZE) )
    {
        printk("Initial guest OS requires too much space\n"
               "(%luMB is greater than %luMB limit)\n",
               (v_end-dsi.v_start)>>20, (nr_pages<<PAGE_SHIFT)>>20);
        return -ENOMEM;
    }

    mpt_alloc = (vpt_start - dsi.v_start) +
        (unsigned long)pfn_to_phys(alloc_spfn);

    /*
     * We're basically forcing default RPLs to 1, so that our "what privilege
     * level are we returning to?" logic works.
     */
    v->arch.guest_context.kernel_ss = FLAT_KERNEL_SS;
    for ( i = 0; i < 256; i++ )
        v->arch.guest_context.trap_ctxt[i].cs = FLAT_KERNEL_CS;

#if defined(__i386__)

    v->arch.guest_context.failsafe_callback_cs = FLAT_KERNEL_CS;
    v->arch.guest_context.event_callback_cs    = FLAT_KERNEL_CS;

    /*
     * Protect the lowest 1GB of memory. We use a temporary mapping there
     * from which we copy the kernel and ramdisk images.
     */
    if ( dsi.v_start < (1UL<<30) )
    {
        printk("Initial loading isn't allowed in the lowest 1GB of memory.\n");
        return -EINVAL;
    }

    /* WARNING: The new domain must have its 'processor' field filled in! */
#if CONFIG_PAGING_LEVELS == 3
    l3start = l3tab = (l3_pgentry_t *)mpt_alloc; mpt_alloc += PAGE_SIZE;
    l2start = l2tab = (l2_pgentry_t *)mpt_alloc; mpt_alloc += 4*PAGE_SIZE;
    memcpy(l2tab, idle_pg_table_l2, 4*PAGE_SIZE);
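    /*
     * PAE note: the L3 has just four entries, each covering 1GB via one of
     * the four contiguous L2 pages allocated above. Installing the L2 pages
     * into the LINEAR_PT_VIRT_START slots is the usual linear-pagetable
     * trick, exposing the page tables themselves at fixed virtual addresses.
     */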
    for ( i = 0; i < 4; i++ )
    {
        l3tab[i] = l3e_from_paddr((u32)l2tab + i*PAGE_SIZE, L3_PROT);
        l2tab[(LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT)+i] =
            l2e_from_paddr((u32)l2tab + i*PAGE_SIZE, __PAGE_HYPERVISOR);
    }
    {
        unsigned long va;
        for ( va = PERDOMAIN_VIRT_START; va < PERDOMAIN_VIRT_END;
              va += (1 << L2_PAGETABLE_SHIFT) )
        {
            l2tab[va >> L2_PAGETABLE_SHIFT] =
                l2e_from_paddr(__pa(d->arch.mm_perdomain_pt) +
                               (va-PERDOMAIN_VIRT_START),
                               __PAGE_HYPERVISOR);
        }
    }
    v->arch.guest_table = mk_pagetable((unsigned long)l3start);
#else
    l2start = l2tab = (l2_pgentry_t *)mpt_alloc; mpt_alloc += PAGE_SIZE;
    memcpy(l2tab, &idle_pg_table[0], PAGE_SIZE);
    l2tab[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] =
        l2e_from_paddr((unsigned long)l2start, __PAGE_HYPERVISOR);
    l2tab[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] =
        l2e_from_paddr(__pa(d->arch.mm_perdomain_pt), __PAGE_HYPERVISOR);
    v->arch.guest_table = mk_pagetable((unsigned long)l2start);
#endif

    l2tab += l2_linear_offset(dsi.v_start);
    mfn = alloc_spfn;
    for ( count = 0; count < ((v_end-dsi.v_start)>>PAGE_SHIFT); count++ )
    {
        if ( !((unsigned long)l1tab & (PAGE_SIZE-1)) )
        {
            l1start = l1tab = (l1_pgentry_t *)mpt_alloc;
            mpt_alloc += PAGE_SIZE;
            *l2tab = l2e_from_paddr((unsigned long)l1start, L2_PROT);
            l2tab++;
            clear_page(l1tab);
            if ( count == 0 )
                l1tab += l1_table_offset(dsi.v_start);
        }
        *l1tab = l1e_from_pfn(mfn, L1_PROT);
        l1tab++;

        page = &frame_table[mfn];
        if ( !get_page_and_type(page, d, PGT_writable_page) )
            BUG();

        mfn++;
    }

    /* Pages that are part of page tables must be read only. */
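    /*
     * A paravirtualized guest may read its page tables directly but must
     * go through mmu_update hypercalls to modify them, so every frame that
     * ended up in the bootstrap page tables is remapped read-only here.
     */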
    l2tab = l2start + l2_linear_offset(vpt_start);
    l1start = l1tab = (l1_pgentry_t *)(u32)l2e_get_paddr(*l2tab);
    l1tab += l1_table_offset(vpt_start);
    for ( count = 0; count < nr_pt_pages; count++ )
    {
        page = &frame_table[l1e_get_pfn(*l1tab)];
        if ( !opt_dom0_shadow )
            l1e_remove_flags(*l1tab, _PAGE_RW);
        else
            if ( !get_page_type(page, PGT_writable_page) )
                BUG();

#if CONFIG_PAGING_LEVELS == 3
        switch ( count )
        {
        case 0:
            page->u.inuse.type_info &= ~PGT_type_mask;
            page->u.inuse.type_info |= PGT_l3_page_table;
            get_page(page, d); /* an extra ref because of readable mapping */

            /* Get another ref to L3 page so that it can be pinned. */
            if ( !get_page_and_type(page, d, PGT_l3_page_table) )
                BUG();
            set_bit(_PGT_pinned, &page->u.inuse.type_info);
            break;
        case 1 ... 4:
            page->u.inuse.type_info &= ~PGT_type_mask;
            page->u.inuse.type_info |= PGT_l2_page_table;
            page->u.inuse.type_info |=
                (count-1) << PGT_va_shift;
            get_page(page, d); /* an extra ref because of readable mapping */
            break;
        default:
            page->u.inuse.type_info &= ~PGT_type_mask;
            page->u.inuse.type_info |= PGT_l1_page_table;
            page->u.inuse.type_info |=
                ((dsi.v_start>>L2_PAGETABLE_SHIFT)+(count-5))<<PGT_va_shift;
            get_page(page, d); /* an extra ref because of readable mapping */
            break;
        }
#else
        if ( count == 0 )
        {
            page->u.inuse.type_info &= ~PGT_type_mask;
            page->u.inuse.type_info |= PGT_l2_page_table;

            /*
             * No longer writable: decrement the type_count.
             * Installed as CR3: increment both the ref_count and type_count.
             * Net: just increment the ref_count.
             */
            get_page(page, d); /* an extra ref because of readable mapping */

            /* Get another ref to L2 page so that it can be pinned. */
            if ( !get_page_and_type(page, d, PGT_l2_page_table) )
                BUG();
            set_bit(_PGT_pinned, &page->u.inuse.type_info);
        }
        else
        {
            page->u.inuse.type_info &= ~PGT_type_mask;
            page->u.inuse.type_info |= PGT_l1_page_table;
            page->u.inuse.type_info |=
                ((dsi.v_start>>L2_PAGETABLE_SHIFT)+(count-1))<<PGT_va_shift;

            /*
             * No longer writable: decrement the type_count.
             * This is an L1 page, installed in a validated L2 page:
             * increment both the ref_count and type_count.
             * Net: just increment the ref_count.
             */
            get_page(page, d); /* an extra ref because of readable mapping */
        }
#endif
        if ( !((unsigned long)++l1tab & (PAGE_SIZE - 1)) )
            l1start = l1tab = (l1_pgentry_t *)(u32)l2e_get_paddr(*++l2tab);
    }

#elif defined(__x86_64__)

    /* Overlap with Xen protected area? */
    if ( (dsi.v_start < HYPERVISOR_VIRT_END) &&
         (v_end > HYPERVISOR_VIRT_START) )
    {
        printk("DOM0 image overlaps with Xen private area.\n");
        return -EINVAL;
    }

    /* WARNING: The new domain must have its 'processor' field filled in! */
    phys_to_page(mpt_alloc)->u.inuse.type_info = PGT_l4_page_table;
    l4start = l4tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
    memcpy(l4tab, &idle_pg_table[0], PAGE_SIZE);
    l4tab[l4_table_offset(LINEAR_PT_VIRT_START)] =
        l4e_from_paddr(__pa(l4start), __PAGE_HYPERVISOR);
    l4tab[l4_table_offset(PERDOMAIN_VIRT_START)] =
        l4e_from_paddr(__pa(d->arch.mm_perdomain_l3), __PAGE_HYPERVISOR);
    v->arch.guest_table = mk_pagetable(__pa(l4start));

    l4tab += l4_table_offset(dsi.v_start);
    mfn = alloc_spfn;
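    /*
     * Walk the bootstrap virtual range, allocating page-table pages on
     * demand: whenever an l1/l2/l3 pointer crosses a page boundary the
     * current table is full, so a fresh frame is taken from mpt_alloc and
     * hooked into the level above.
     */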
    for ( count = 0; count < ((v_end-dsi.v_start)>>PAGE_SHIFT); count++ )
    {
        if ( !((unsigned long)l1tab & (PAGE_SIZE-1)) )
        {
            phys_to_page(mpt_alloc)->u.inuse.type_info = PGT_l1_page_table;
            l1start = l1tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
            clear_page(l1tab);
            if ( count == 0 )
                l1tab += l1_table_offset(dsi.v_start);
            if ( !((unsigned long)l2tab & (PAGE_SIZE-1)) )
            {
                phys_to_page(mpt_alloc)->u.inuse.type_info = PGT_l2_page_table;
                l2start = l2tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
                clear_page(l2tab);
                if ( count == 0 )
                    l2tab += l2_table_offset(dsi.v_start);
                if ( !((unsigned long)l3tab & (PAGE_SIZE-1)) )
                {
                    phys_to_page(mpt_alloc)->u.inuse.type_info =
                        PGT_l3_page_table;
                    l3start = l3tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
                    clear_page(l3tab);
                    if ( count == 0 )
                        l3tab += l3_table_offset(dsi.v_start);
                    *l4tab = l4e_from_paddr(__pa(l3start), L4_PROT);
                    l4tab++;
                }
                *l3tab = l3e_from_paddr(__pa(l2start), L3_PROT);
                l3tab++;
            }
            *l2tab = l2e_from_paddr(__pa(l1start), L2_PROT);
            l2tab++;
        }
        *l1tab = l1e_from_pfn(mfn, L1_PROT);
        l1tab++;

        page = &frame_table[mfn];
        if ( (page->u.inuse.type_info == 0) &&
             !get_page_and_type(page, d, PGT_writable_page) )
            BUG();

        mfn++;
    }

    /* Pages that are part of page tables must be read only. */
    l4tab = l4start + l4_table_offset(vpt_start);
    l3start = l3tab = l4e_to_l3e(*l4tab);
    l3tab += l3_table_offset(vpt_start);
    l2start = l2tab = l3e_to_l2e(*l3tab);
    l2tab += l2_table_offset(vpt_start);
    l1start = l1tab = l2e_to_l1e(*l2tab);
    l1tab += l1_table_offset(vpt_start);
    for ( count = 0; count < nr_pt_pages; count++ )
    {
        l1e_remove_flags(*l1tab, _PAGE_RW);
        page = &frame_table[l1e_get_pfn(*l1tab)];

        /* Read-only mapping + PGC_allocated + page-table page. */
        page->count_info = PGC_allocated | 3;
        page->u.inuse.type_info |= PGT_validated | 1;

        /* Top-level p.t. is pinned. */
        if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_l4_page_table )
        {
            page->count_info        += 1;
            page->u.inuse.type_info += 1 | PGT_pinned;
        }

        /* Iterate. */
        if ( !((unsigned long)++l1tab & (PAGE_SIZE - 1)) )
        {
            if ( !((unsigned long)++l2tab & (PAGE_SIZE - 1)) )
            {
                if ( !((unsigned long)++l3tab & (PAGE_SIZE - 1)) )
                    l3start = l3tab = l4e_to_l3e(*++l4tab);
                l2start = l2tab = l3e_to_l2e(*l3tab);
            }
            l1start = l1tab = l2e_to_l1e(*l2tab);
        }
    }

#endif /* __x86_64__ */

    /* Mask all upcalls... */
    for ( i = 0; i < MAX_VIRT_CPUS; i++ )
        d->shared_info->vcpu_data[i].evtchn_upcall_mask = 1;
    d->shared_info->n_vcpu = num_online_cpus();

    /* Set up monitor table */
    update_pagetables(v);

    /* Install the new page tables. */
    local_irq_disable();
    write_ptbase(v);

    /* Copy the OS image and free temporary buffer. */
    (void)loadelfimage(&dsi);

    init_domheap_pages(
        _image_start, (_image_start+image_len+PAGE_SIZE-1) & PAGE_MASK);

    /* Copy the initial ramdisk and free temporary buffer. */
    if ( initrd_len != 0 )
    {
        memcpy((void *)vinitrd_start, initrd_start, initrd_len);
        init_domheap_pages(
            _initrd_start, (_initrd_start+initrd_len+PAGE_SIZE-1) & PAGE_MASK);
    }

    d->next_io_page = max_page;

    /* Set up start info area. */
    si = (start_info_t *)vstartinfo_start;
    memset(si, 0, PAGE_SIZE);
    si->nr_pages = nr_pages;
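
    /*
     * In shadow-translate mode dom0 sees pseudophysical frame numbers, so
     * shared_info is advertised at a fresh pseudophysical page
     * (next_io_page) mapped onto its machine frame; otherwise dom0 is
     * handed the machine address directly.
     */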
    if ( opt_dom0_translate )
    {
        si->shared_info = d->next_io_page << PAGE_SHIFT;
        set_pfn_from_mfn(virt_to_phys(d->shared_info) >> PAGE_SHIFT,
                         d->next_io_page);
        d->next_io_page++;
    }
    else
        si->shared_info = virt_to_phys(d->shared_info);

    si->flags        = SIF_PRIVILEGED | SIF_INITDOMAIN;
    si->pt_base      = vpt_start;
    si->nr_pt_frames = nr_pt_pages;
    si->mfn_list     = vphysmap_start;

    /* Write the phys->machine and machine->phys table entries. */
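    /*
     * In debug builds the tail of the allocation is mapped in reverse
     * order (see REVERSE_START below); presumably this is to flush out
     * guest code that wrongly assumes the pseudophysical->machine mapping
     * is contiguous.
     */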
    for ( pfn = 0; pfn < d->tot_pages; pfn++ )
    {
        mfn = pfn + alloc_spfn;
#ifndef NDEBUG
#define REVERSE_START ((v_end - dsi.v_start) >> PAGE_SHIFT)
        if ( !opt_dom0_translate && (pfn > REVERSE_START) )
            mfn = alloc_epfn - (pfn - REVERSE_START);
#endif
        ((unsigned long *)vphysmap_start)[pfn] = mfn;
        set_pfn_from_mfn(mfn, pfn);
    }
    while ( pfn < nr_pages )
    {
        if ( (page = alloc_chunk(d, nr_pages - d->tot_pages)) == NULL )
            panic("Not enough RAM for DOM0 reservation.\n");
        while ( pfn < d->tot_pages )
        {
            mfn = page_to_pfn(page);
#ifndef NDEBUG
#define pfn (nr_pages - 1 - (pfn - (alloc_epfn - alloc_spfn)))
#endif
            ((unsigned long *)vphysmap_start)[pfn] = mfn;
            set_pfn_from_mfn(mfn, pfn);
#undef pfn
            page++; pfn++;
        }
    }

    if ( initrd_len != 0 )
    {
        si->mod_start = vinitrd_start;
        si->mod_len   = initrd_len;
        printk("Initrd len 0x%lx, start at 0x%lx\n",
               si->mod_len, si->mod_start);
    }

    memset(si->cmd_line, 0, sizeof(si->cmd_line));
    if ( cmdline != NULL )
        strncpy((char *)si->cmd_line, cmdline, sizeof(si->cmd_line)-1);

    /* Reinstate the caller's page tables. */
    write_ptbase(current);
    local_irq_enable();

#if defined(__i386__)
    /* Destroy low mappings - they were only for our convenience. */
    zap_low_mappings(l2start);
    zap_low_mappings(idle_pg_table_l2);
#endif

    /* DOM0 gets access to everything. */
    physdev_init_dom0(d);

    init_domain_time(d);

    set_bit(_DOMF_constructed, &d->domain_flags);

    new_thread(v, dsi.v_kernentry, vstack_end, vstartinfo_start);

    if ( opt_dom0_shadow || opt_dom0_translate )
    {
        printk("dom0: shadow enable\n");
        shadow_mode_enable(d, (opt_dom0_translate
                               ? SHM_enable | SHM_refcounts | SHM_translate
                               : SHM_enable));
        if ( opt_dom0_translate )
        {
            printk("dom0: shadow translate\n");
#if defined(__i386__) && defined(CONFIG_X86_PAE)
            printk("FIXME: PAE code needed here: %s:%d (%s)\n",
                   __FILE__, __LINE__, __FUNCTION__);
            for ( ; ; )
                __asm__ __volatile__ ( "hlt" );
#else
            /* Hmm, what does this do? It doesn't look portable across
               32/64-bit and PAE/non-PAE ...
               -- kraxel */

            /* mafetter: This code is mostly a hack in order to be able to
             * test with dom0's which are running with shadow translate.
             * I expect we'll rip this out once we have a stable set of
             * domU clients which use the various shadow modes, but it's
             * useful to leave this here for now...
             */

            // Map this domain's p2m table into the current page table,
            // so that we can easily access it.
            //
            ASSERT( root_get_intpte(idle_pg_table[1]) == 0 );
            ASSERT( pagetable_get_paddr(d->arch.phys_table) );
            idle_pg_table[1] = root_from_paddr(
                pagetable_get_paddr(d->arch.phys_table), __PAGE_HYPERVISOR);
            translate_l2pgtable(d, (l1_pgentry_t *)(1u << L2_PAGETABLE_SHIFT),
                                pagetable_get_pfn(v->arch.guest_table));
            idle_pg_table[1] = root_empty();
            local_flush_tlb();
#endif
        }

        update_pagetables(v); /* XXX SMP */
        printk("dom0: shadow setup done\n");
    }

    /*
     * Modify I/O port access permissions.
     */
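    /*
     * Here the zero second argument revokes access: dom0 is denied direct
     * access to the interrupt controllers and timer, since Xen itself
     * drives those devices.
     */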
    /* Master Interrupt Controller (PIC). */
    physdev_modify_ioport_access_range(dom0, 0, 0x20, 2);
    /* Slave Interrupt Controller (PIC). */
    physdev_modify_ioport_access_range(dom0, 0, 0xA0, 2);
    /* Interval Timer (PIT). */
    physdev_modify_ioport_access_range(dom0, 0, 0x40, 4);
    /* PIT Channel 2 / PC Speaker Control. */
    physdev_modify_ioport_access_range(dom0, 0, 0x61, 1);

    return 0;
}

int elf_sanity_check(Elf_Ehdr *ehdr)
{
    if ( !IS_ELF(*ehdr) ||
#if defined(__i386__)
         (ehdr->e_ident[EI_CLASS] != ELFCLASS32) ||
         (ehdr->e_machine != EM_386) ||
#elif defined(__x86_64__)
         (ehdr->e_ident[EI_CLASS] != ELFCLASS64) ||
         (ehdr->e_machine != EM_X86_64) ||
#endif
         (ehdr->e_ident[EI_DATA] != ELFDATA2LSB) ||
         (ehdr->e_type != ET_EXEC) )
    {
        printk("DOM0 image is not a Xen-compatible Elf image.\n");
        return 0;
    }

    return 1;
}

/*
 * Local variables:
 * mode: C
 * c-set-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */