
view xen/arch/x86/domain_build.c @ 6538:84ee014ebd41

Merge xen-vtx-unstable.hg
author adsharma@los-vmm.sc.intel.com
date Wed Aug 17 12:34:38 2005 -0800 (2005-08-17)
parents 23979fb12c49 10b395bc465e
children 99914b54f7bf
/******************************************************************************
 * domain_build.c
 *
 * Copyright (c) 2002-2005, K A Fraser
 */

#include <xen/config.h>
#include <xen/init.h>
#include <xen/lib.h>
#include <xen/ctype.h>
#include <xen/sched.h>
#include <xen/smp.h>
#include <xen/delay.h>
#include <xen/event.h>
#include <xen/elf.h>
#include <xen/kernel.h>
#include <asm/regs.h>
#include <asm/system.h>
#include <asm/io.h>
#include <asm/processor.h>
#include <asm/desc.h>
#include <asm/i387.h>
#include <asm/shadow.h>

/* opt_dom0_mem: memory allocated to domain 0. */
static unsigned int opt_dom0_mem;
static void parse_dom0_mem(char *s)
{
    unsigned long long bytes = parse_size_and_unit(s);
    /* If no unit is specified we default to kB units, not bytes. */
    if ( isdigit(s[strlen(s)-1]) )
        opt_dom0_mem = (unsigned int)bytes;
    else
        opt_dom0_mem = (unsigned int)(bytes >> 10);
}
custom_param("dom0_mem", parse_dom0_mem);
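
/*
 * Illustrative note (not from the original source): custom_param() registers
 * parse_dom0_mem() as the handler for the "dom0_mem" boot option, and
 * opt_dom0_mem ends up in kB either way: "dom0_mem=262144" (bare number,
 * taken as kB) and "dom0_mem=256M" (suffixed value, converted from bytes by
 * the >> 10) should both leave opt_dom0_mem == 262144, assuming
 * parse_size_and_unit() returns a byte count when a unit suffix is present.
 */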

static unsigned int opt_dom0_shadow = 0;
boolean_param("dom0_shadow", opt_dom0_shadow);

static unsigned int opt_dom0_translate = 0;
boolean_param("dom0_translate", opt_dom0_translate);

#if defined(__i386__)
/* No ring-3 access in initial leaf page tables. */
#define L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED)
#define L2_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
#define L3_PROT (_PAGE_PRESENT)
#elif defined(__x86_64__)
/* Allow ring-3 access in long mode as guest cannot use ring 1. */
#define L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_USER)
#define L2_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
#define L3_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
#define L4_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
#endif

#define round_pgup(_p) (((_p)+(PAGE_SIZE-1))&PAGE_MASK)
#define round_pgdown(_p) ((_p)&PAGE_MASK)
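
/*
 * Illustrative note (not from the original source): assuming 4kB pages
 * (PAGE_SIZE == 0x1000, PAGE_MASK == ~0xfffUL), round_pgup(0x12345) yields
 * 0x13000 and round_pgdown(0x12345) yields 0x12000.
 */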

static struct pfn_info *alloc_largest(struct domain *d, unsigned long max)
{
    struct pfn_info *page;
    unsigned int order = get_order(max * PAGE_SIZE);
    if ( (max & (max-1)) != 0 )
        order--;
    while ( (page = alloc_domheap_pages(d, order, 0)) == NULL )
        if ( order-- == 0 )
            break;
    return page;
}
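
/*
 * Illustrative note (not from the original source): get_order() picks the
 * smallest order whose 2^order pages cover the request, so for max == 5000
 * pages it gives order 13 (8192 pages); because 5000 is not a power of two
 * the order is dropped to 12 (4096 pages) so the chunk never exceeds 'max'.
 * If even that allocation fails, the loop retries with successively smaller
 * orders, down to a single page, before giving up and returning NULL.
 */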

int construct_dom0(struct domain *d,
                   unsigned long _image_start, unsigned long image_len,
                   unsigned long _initrd_start, unsigned long initrd_len,
                   char *cmdline)
{
    int i, rc, dom0_pae, xen_pae, order;
    unsigned long pfn, mfn;
    unsigned long nr_pages;
    unsigned long nr_pt_pages;
    unsigned long alloc_spfn;
    unsigned long alloc_epfn;
    unsigned long count;
    struct pfn_info *page = NULL;
    start_info_t *si;
    struct vcpu *v = d->vcpu[0];
#if defined(__i386__)
    char *image_start = (char *)_image_start; /* use lowmem mappings */
    char *initrd_start = (char *)_initrd_start; /* use lowmem mappings */
#elif defined(__x86_64__)
    char *image_start = __va(_image_start);
    char *initrd_start = __va(_initrd_start);
#endif
#if CONFIG_PAGING_LEVELS >= 4
    l4_pgentry_t *l4tab = NULL, *l4start = NULL;
#endif
#if CONFIG_PAGING_LEVELS >= 3
    l3_pgentry_t *l3tab = NULL, *l3start = NULL;
#endif
    l2_pgentry_t *l2tab = NULL, *l2start = NULL;
    l1_pgentry_t *l1tab = NULL, *l1start = NULL;

    /*
     * This fully describes the memory layout of the initial domain. All
     * *_start addresses are page-aligned, except v_start (and v_end) which
     * are superpage-aligned.
     */
    struct domain_setup_info dsi;
    unsigned long vinitrd_start;
    unsigned long vinitrd_end;
    unsigned long vphysmap_start;
    unsigned long vphysmap_end;
    unsigned long vstartinfo_start;
    unsigned long vstartinfo_end;
    unsigned long vstack_start;
    unsigned long vstack_end;
    unsigned long vpt_start;
    unsigned long vpt_end;
    unsigned long v_end;

    /* Machine address of next candidate page-table page. */
    unsigned long mpt_alloc;

    extern void physdev_init_dom0(struct domain *);
    extern void translate_l2pgtable(
        struct domain *d, l1_pgentry_t *p2m, unsigned long l2mfn);

    /* Sanity! */
    if ( d->domain_id != 0 )
        BUG();
    if ( test_bit(_DOMF_constructed, &d->domain_flags) )
        BUG();

    memset(&dsi, 0, sizeof(struct domain_setup_info));
    dsi.image_addr = (unsigned long)image_start;
    dsi.image_len = image_len;

    printk("*** LOADING DOMAIN 0 ***\n");

    /* By default DOM0 is allocated all available memory. */
    d->max_pages = ~0U;
    if ( (nr_pages = opt_dom0_mem >> (PAGE_SHIFT - 10)) == 0 )
        nr_pages = avail_domheap_pages() +
            ((initrd_len + PAGE_SIZE - 1) >> PAGE_SHIFT) +
            ((image_len + PAGE_SIZE - 1) >> PAGE_SHIFT);
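
    /*
     * Illustrative note (not from the original source): opt_dom0_mem is kept
     * in kB, so shifting by (PAGE_SHIFT - 10) converts kB to pages; with 4kB
     * pages that is a shift by 2, e.g. dom0_mem=262144 (256MB expressed in
     * kB) gives nr_pages == 65536. If no dom0_mem was given, dom0 is sized
     * to all remaining heap pages plus room for the image and initrd copies.
     */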

    if ( (rc = parseelfimage(&dsi)) != 0 )
        return rc;

    if ( dsi.xen_section_string == NULL )
    {
        printk("Not a Xen-ELF image: '__xen_guest' section not found.\n");
        return -EINVAL;
    }

    dom0_pae = !!strstr(dsi.xen_section_string, "PAE=yes");
    xen_pae = (CONFIG_PAGING_LEVELS == 3);
    if ( dom0_pae != xen_pae )
    {
        printk("PAE mode mismatch between Xen and DOM0 (xen=%s, dom0=%s)\n",
               xen_pae ? "yes" : "no", dom0_pae ? "yes" : "no");
        return -EINVAL;
    }
    if (strstr(dsi.xen_section_string, "SHADOW=translate"))
        opt_dom0_translate = 1;

    /* Align load address to 4MB boundary. */
    dsi.v_start &= ~((1UL<<22)-1);

    /*
     * Why do we need this? The number of page-table frames depends on the
     * size of the bootstrap address space. But the size of the address space
     * depends on the number of page-table frames (since each one is mapped
     * read-only). We have a pair of simultaneous equations in two unknowns,
     * which we solve by exhaustive search.
     */
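
    /*
     * Illustrative note (not from the original source): on non-PAE i386, a
     * bootstrap region spanning 20MB needs ceil(20MB / 4MB) = 5 L1 frames
     * plus 1 L2 frame, so the search below settles on nr_pt_pages == 6 --
     * unless accounting for those frames (plus the start-info page, boot
     * stack, and the >= 512kB of padding insisted on below) pushes v_end
     * across another 4MB boundary, in which case an extra L1 frame is needed
     * and the loop goes around again with a larger nr_pt_pages.
     */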
    vinitrd_start = round_pgup(dsi.v_end);
    vinitrd_end = vinitrd_start + initrd_len;
    vphysmap_start = round_pgup(vinitrd_end);
    vphysmap_end = vphysmap_start + (nr_pages * sizeof(u32));
    vpt_start = round_pgup(vphysmap_end);
    for ( nr_pt_pages = 2; ; nr_pt_pages++ )
    {
        vpt_end = vpt_start + (nr_pt_pages * PAGE_SIZE);
        vstartinfo_start = vpt_end;
        vstartinfo_end = vstartinfo_start + PAGE_SIZE;
        vstack_start = vstartinfo_end;
        vstack_end = vstack_start + PAGE_SIZE;
        v_end = (vstack_end + (1UL<<22)-1) & ~((1UL<<22)-1);
        if ( (v_end - vstack_end) < (512UL << 10) )
            v_end += 1UL << 22; /* Add extra 4MB to get >= 512kB padding. */
#if defined(__i386__) && !defined(CONFIG_X86_PAE)
        if ( (((v_end - dsi.v_start + ((1UL<<L2_PAGETABLE_SHIFT)-1)) >>
               L2_PAGETABLE_SHIFT) + 1) <= nr_pt_pages )
            break;
#elif defined(__i386__) && defined(CONFIG_X86_PAE)
        /* 5 pages: 1x 3rd + 4x 2nd level */
        if ( (((v_end - dsi.v_start + ((1UL<<L2_PAGETABLE_SHIFT)-1)) >>
               L2_PAGETABLE_SHIFT) + 5) <= nr_pt_pages )
            break;
#elif defined(__x86_64__)
#define NR(_l,_h,_s) \
    (((((_h) + ((1UL<<(_s))-1)) & ~((1UL<<(_s))-1)) - \
       ((_l) & ~((1UL<<(_s))-1))) >> (_s))
        if ( (1 + /* # L4 */
              NR(dsi.v_start, v_end, L4_PAGETABLE_SHIFT) + /* # L3 */
              NR(dsi.v_start, v_end, L3_PAGETABLE_SHIFT) + /* # L2 */
              NR(dsi.v_start, v_end, L2_PAGETABLE_SHIFT))  /* # L1 */
             <= nr_pt_pages )
            break;
#endif
    }

    order = get_order(v_end - dsi.v_start);
    if ( (1UL << order) > nr_pages )
        panic("Domain 0 allocation is too small for kernel image.\n");

    /* Allocate from DMA pool: PAE L3 table must be below 4GB boundary. */
    if ( (page = alloc_domheap_pages(d, order, ALLOC_DOM_DMA)) == NULL )
        panic("Not enough RAM for domain 0 allocation.\n");
    alloc_spfn = page_to_pfn(page);
    alloc_epfn = alloc_spfn + d->tot_pages;

    printk("PHYSICAL MEMORY ARRANGEMENT:\n"
           " Dom0 alloc.: %"PRIphysaddr"->%"PRIphysaddr,
           pfn_to_phys(alloc_spfn), pfn_to_phys(alloc_epfn));
    if ( d->tot_pages < nr_pages )
        printk(" (%lu pages to be allocated)",
               nr_pages - d->tot_pages);
    printk("\nVIRTUAL MEMORY ARRANGEMENT:\n"
           " Loaded kernel: %p->%p\n"
           " Init. ramdisk: %p->%p\n"
           " Phys-Mach map: %p->%p\n"
           " Page tables: %p->%p\n"
           " Start info: %p->%p\n"
           " Boot stack: %p->%p\n"
           " TOTAL: %p->%p\n",
           _p(dsi.v_kernstart), _p(dsi.v_kernend),
           _p(vinitrd_start), _p(vinitrd_end),
           _p(vphysmap_start), _p(vphysmap_end),
           _p(vpt_start), _p(vpt_end),
           _p(vstartinfo_start), _p(vstartinfo_end),
           _p(vstack_start), _p(vstack_end),
           _p(dsi.v_start), _p(v_end));
    printk(" ENTRY ADDRESS: %p\n", _p(dsi.v_kernentry));

    if ( (v_end - dsi.v_start) > (nr_pages * PAGE_SIZE) )
    {
        printk("Initial guest OS requires too much space\n"
               "(%luMB is greater than %luMB limit)\n",
               (v_end-dsi.v_start)>>20, (nr_pages<<PAGE_SHIFT)>>20);
        return -ENOMEM;
    }

    mpt_alloc = (vpt_start - dsi.v_start) +
        (unsigned long)pfn_to_phys(alloc_spfn);

    /*
     * We're basically forcing default RPLs to 1, so that our "what privilege
     * level are we returning to?" logic works.
     */
    v->arch.guest_context.kernel_ss = FLAT_KERNEL_SS;
    for ( i = 0; i < 256; i++ )
        v->arch.guest_context.trap_ctxt[i].cs = FLAT_KERNEL_CS;
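
    /*
     * Illustrative note (not from the original source): on 32-bit Xen the
     * guest kernel runs in ring 1, so FLAT_KERNEL_CS/SS carry RPL 1 (RPL 3
     * on x86-64, where the guest kernel runs in ring 3). Seeding every trap
     * vector and the kernel SS with these flat selectors means a saved CS/SS
     * with RPL 0 never appears, which is presumably what keeps the "what
     * privilege level are we returning to?" test above unambiguous.
     */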

#if defined(__i386__)

    v->arch.guest_context.failsafe_callback_cs = FLAT_KERNEL_CS;
    v->arch.guest_context.event_callback_cs = FLAT_KERNEL_CS;

    /*
     * Protect the lowest 1GB of memory. We use a temporary mapping there
     * from which we copy the kernel and ramdisk images.
     */
    if ( dsi.v_start < (1UL<<30) )
    {
        printk("Initial loading isn't allowed to lowest 1GB of memory.\n");
        return -EINVAL;
    }

    /* WARNING: The new domain must have its 'processor' field filled in! */
#if CONFIG_PAGING_LEVELS == 3
    l3start = l3tab = (l3_pgentry_t *)mpt_alloc; mpt_alloc += PAGE_SIZE;
    l2start = l2tab = (l2_pgentry_t *)mpt_alloc; mpt_alloc += 4*PAGE_SIZE;
    memcpy(l2tab, idle_pg_table_l2, 4*PAGE_SIZE);
    for (i = 0; i < 4; i++) {
        l3tab[i] = l3e_from_paddr((u32)l2tab + i*PAGE_SIZE, L3_PROT);
        l2tab[(LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT)+i] =
            l2e_from_paddr((u32)l2tab + i*PAGE_SIZE, __PAGE_HYPERVISOR);
    }
    {
        unsigned long va;
        for (va = PERDOMAIN_VIRT_START; va < PERDOMAIN_VIRT_END;
             va += (1 << L2_PAGETABLE_SHIFT)) {
            l2tab[va >> L2_PAGETABLE_SHIFT] =
                l2e_from_paddr(__pa(d->arch.mm_perdomain_pt) +
                               (va-PERDOMAIN_VIRT_START),
                               __PAGE_HYPERVISOR);
        }
    }
    v->arch.guest_table = mk_pagetable((unsigned long)l3start);
#else
    l2start = l2tab = (l2_pgentry_t *)mpt_alloc; mpt_alloc += PAGE_SIZE;
    memcpy(l2tab, &idle_pg_table[0], PAGE_SIZE);
    l2tab[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] =
        l2e_from_paddr((unsigned long)l2start, __PAGE_HYPERVISOR);
    l2tab[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] =
        l2e_from_paddr(__pa(d->arch.mm_perdomain_pt), __PAGE_HYPERVISOR);
    v->arch.guest_table = mk_pagetable((unsigned long)l2start);
#endif

    l2tab += l2_linear_offset(dsi.v_start);
    mfn = alloc_spfn;
    for ( count = 0; count < ((v_end-dsi.v_start)>>PAGE_SHIFT); count++ )
    {
        if ( !((unsigned long)l1tab & (PAGE_SIZE-1)) )
        {
            l1start = l1tab = (l1_pgentry_t *)mpt_alloc;
            mpt_alloc += PAGE_SIZE;
            *l2tab = l2e_from_paddr((unsigned long)l1start, L2_PROT);
            l2tab++;
            clear_page(l1tab);
            if ( count == 0 )
                l1tab += l1_table_offset(dsi.v_start);
        }
        *l1tab = l1e_from_pfn(mfn, L1_PROT);
        l1tab++;

        page = &frame_table[mfn];
        if ( !get_page_and_type(page, d, PGT_writable_page) )
            BUG();

        mfn++;
    }

    /* Pages that are part of page tables must be read only. */
    l2tab = l2start + l2_linear_offset(vpt_start);
    l1start = l1tab = (l1_pgentry_t *)(u32)l2e_get_paddr(*l2tab);
    l1tab += l1_table_offset(vpt_start);
    for ( count = 0; count < nr_pt_pages; count++ )
    {
        page = &frame_table[l1e_get_pfn(*l1tab)];
        if ( !opt_dom0_shadow )
            l1e_remove_flags(*l1tab, _PAGE_RW);
        else
            if ( !get_page_type(page, PGT_writable_page) )
                BUG();

#if CONFIG_PAGING_LEVELS == 3
        switch (count) {
        case 0:
            page->u.inuse.type_info &= ~PGT_type_mask;
            page->u.inuse.type_info |= PGT_l3_page_table;
            get_page(page, d); /* an extra ref because of readable mapping */

            /* Get another ref to L3 page so that it can be pinned. */
            if ( !get_page_and_type(page, d, PGT_l3_page_table) )
                BUG();
            set_bit(_PGT_pinned, &page->u.inuse.type_info);
            break;
        case 1 ... 4:
            page->u.inuse.type_info &= ~PGT_type_mask;
            page->u.inuse.type_info |= PGT_l2_page_table;
            page->u.inuse.type_info |=
                (count-1) << PGT_va_shift;
            get_page(page, d); /* an extra ref because of readable mapping */
            break;
        default:
            page->u.inuse.type_info &= ~PGT_type_mask;
            page->u.inuse.type_info |= PGT_l1_page_table;
            page->u.inuse.type_info |=
                ((dsi.v_start>>L2_PAGETABLE_SHIFT)+(count-5))<<PGT_va_shift;
            get_page(page, d); /* an extra ref because of readable mapping */
            break;
        }
#else
        if ( count == 0 )
        {
            page->u.inuse.type_info &= ~PGT_type_mask;
            page->u.inuse.type_info |= PGT_l2_page_table;

            /*
             * No longer writable: decrement the type_count.
             * Installed as CR3: increment both the ref_count and type_count.
             * Net: just increment the ref_count.
             */
            get_page(page, d); /* an extra ref because of readable mapping */

            /* Get another ref to L2 page so that it can be pinned. */
            if ( !get_page_and_type(page, d, PGT_l2_page_table) )
                BUG();
            set_bit(_PGT_pinned, &page->u.inuse.type_info);
        }
        else
        {
            page->u.inuse.type_info &= ~PGT_type_mask;
            page->u.inuse.type_info |= PGT_l1_page_table;
            page->u.inuse.type_info |=
                ((dsi.v_start>>L2_PAGETABLE_SHIFT)+(count-1))<<PGT_va_shift;

            /*
             * No longer writable: decrement the type_count.
             * This is an L1 page, installed in a validated L2 page:
             * increment both the ref_count and type_count.
             * Net: just increment the ref_count.
             */
            get_page(page, d); /* an extra ref because of readable mapping */
        }
#endif
        if ( !((unsigned long)++l1tab & (PAGE_SIZE - 1)) )
            l1start = l1tab = (l1_pgentry_t *)(u32)l2e_get_paddr(*++l2tab);
    }

#elif defined(__x86_64__)

    /* Overlap with Xen protected area? */
    if ( (dsi.v_start < HYPERVISOR_VIRT_END) &&
         (v_end > HYPERVISOR_VIRT_START) )
    {
        printk("DOM0 image overlaps with Xen private area.\n");
        return -EINVAL;
    }

    /* WARNING: The new domain must have its 'processor' field filled in! */
    phys_to_page(mpt_alloc)->u.inuse.type_info = PGT_l4_page_table;
    l4start = l4tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
    memcpy(l4tab, &idle_pg_table[0], PAGE_SIZE);
    l4tab[l4_table_offset(LINEAR_PT_VIRT_START)] =
        l4e_from_paddr(__pa(l4start), __PAGE_HYPERVISOR);
    l4tab[l4_table_offset(PERDOMAIN_VIRT_START)] =
        l4e_from_paddr(__pa(d->arch.mm_perdomain_l3), __PAGE_HYPERVISOR);
    v->arch.guest_table = mk_pagetable(__pa(l4start));

    l4tab += l4_table_offset(dsi.v_start);
    mfn = alloc_spfn;
    for ( count = 0; count < ((v_end-dsi.v_start)>>PAGE_SHIFT); count++ )
    {
        if ( !((unsigned long)l1tab & (PAGE_SIZE-1)) )
        {
            phys_to_page(mpt_alloc)->u.inuse.type_info = PGT_l1_page_table;
            l1start = l1tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
            clear_page(l1tab);
            if ( count == 0 )
                l1tab += l1_table_offset(dsi.v_start);
            if ( !((unsigned long)l2tab & (PAGE_SIZE-1)) )
            {
                phys_to_page(mpt_alloc)->u.inuse.type_info = PGT_l2_page_table;
                l2start = l2tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
                clear_page(l2tab);
                if ( count == 0 )
                    l2tab += l2_table_offset(dsi.v_start);
                if ( !((unsigned long)l3tab & (PAGE_SIZE-1)) )
                {
                    phys_to_page(mpt_alloc)->u.inuse.type_info =
                        PGT_l3_page_table;
                    l3start = l3tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
                    clear_page(l3tab);
                    if ( count == 0 )
                        l3tab += l3_table_offset(dsi.v_start);
                    *l4tab = l4e_from_paddr(__pa(l3start), L4_PROT);
                    l4tab++;
                }
                *l3tab = l3e_from_paddr(__pa(l2start), L3_PROT);
                l3tab++;
            }
            *l2tab = l2e_from_paddr(__pa(l1start), L2_PROT);
            l2tab++;
        }
        *l1tab = l1e_from_pfn(mfn, L1_PROT);
        l1tab++;

        page = &frame_table[mfn];
        if ( (page->u.inuse.type_info == 0) &&
             !get_page_and_type(page, d, PGT_writable_page) )
            BUG();

        mfn++;
    }

    /* Pages that are part of page tables must be read only. */
    l4tab = l4start + l4_table_offset(vpt_start);
    l3start = l3tab = l4e_to_l3e(*l4tab);
    l3tab += l3_table_offset(vpt_start);
    l2start = l2tab = l3e_to_l2e(*l3tab);
    l2tab += l2_table_offset(vpt_start);
    l1start = l1tab = l2e_to_l1e(*l2tab);
    l1tab += l1_table_offset(vpt_start);
    for ( count = 0; count < nr_pt_pages; count++ )
    {
        l1e_remove_flags(*l1tab, _PAGE_RW);
        page = &frame_table[l1e_get_pfn(*l1tab)];

        /* Read-only mapping + PGC_allocated + page-table page. */
        page->count_info = PGC_allocated | 3;
        page->u.inuse.type_info |= PGT_validated | 1;

        /* Top-level p.t. is pinned. */
        if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_l4_page_table )
        {
            page->count_info += 1;
            page->u.inuse.type_info += 1 | PGT_pinned;
        }

        /* Iterate. */
        if ( !((unsigned long)++l1tab & (PAGE_SIZE - 1)) )
        {
            if ( !((unsigned long)++l2tab & (PAGE_SIZE - 1)) )
            {
                if ( !((unsigned long)++l3tab & (PAGE_SIZE - 1)) )
                    l3start = l3tab = l4e_to_l3e(*++l4tab);
                l2start = l2tab = l3e_to_l2e(*l3tab);
            }
            l1start = l1tab = l2e_to_l1e(*l2tab);
        }
    }

#endif /* __x86_64__ */

    /* Mask all upcalls... */
    for ( i = 0; i < MAX_VIRT_CPUS; i++ )
        d->shared_info->vcpu_data[i].evtchn_upcall_mask = 1;
    d->shared_info->n_vcpu = num_online_cpus();

    /* Set up monitor table */
    update_pagetables(v);

    /* Install the new page tables. */
    local_irq_disable();
    write_ptbase(v);

    /* Copy the OS image and free temporary buffer. */
    (void)loadelfimage(&dsi);

    init_domheap_pages(
        _image_start, (_image_start+image_len+PAGE_SIZE-1) & PAGE_MASK);

    /* Copy the initial ramdisk and free temporary buffer. */
    if ( initrd_len != 0 )
    {
        memcpy((void *)vinitrd_start, initrd_start, initrd_len);
        init_domheap_pages(
            _initrd_start, (_initrd_start+initrd_len+PAGE_SIZE-1) & PAGE_MASK);
    }

    d->next_io_page = max_page;

    /* Set up start info area. */
    si = (start_info_t *)vstartinfo_start;
    memset(si, 0, PAGE_SIZE);
    si->nr_pages = nr_pages;

    if ( opt_dom0_translate )
    {
        si->shared_info = d->next_io_page << PAGE_SHIFT;
        set_machinetophys(virt_to_phys(d->shared_info) >> PAGE_SHIFT,
                          d->next_io_page);
        d->next_io_page++;
    }
    else
        si->shared_info = virt_to_phys(d->shared_info);
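
    /*
     * Illustrative note (not from the original source): d->next_io_page was
     * initialised to max_page above, so in shadow-translate mode shared_info
     * is advertised to dom0 at a pseudo-physical frame just beyond its RAM,
     * and set_machinetophys() appears to record the matching machine->phys
     * entry for the real shared_info frame. Without translation the guest is
     * simply handed the machine address of the shared_info page.
     */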

    si->flags = SIF_PRIVILEGED | SIF_INITDOMAIN;
    si->pt_base = vpt_start;
    si->nr_pt_frames = nr_pt_pages;
    si->mfn_list = vphysmap_start;

    /* Write the phys->machine and machine->phys table entries. */
    for ( pfn = 0; pfn < d->tot_pages; pfn++ )
    {
        mfn = pfn + alloc_spfn;
#ifndef NDEBUG
#define REVERSE_START ((v_end - dsi.v_start) >> PAGE_SHIFT)
        if ( !opt_dom0_translate && (pfn > REVERSE_START) )
            mfn = alloc_epfn - (pfn - REVERSE_START);
#endif
        ((u32 *)vphysmap_start)[pfn] = mfn;
        machine_to_phys_mapping[mfn] = pfn;
    }
    while ( pfn < nr_pages )
    {
        if ( (page = alloc_largest(d, nr_pages - d->tot_pages)) == NULL )
            panic("Not enough RAM for DOM0 reservation.\n");
        while ( pfn < d->tot_pages )
        {
            mfn = page_to_pfn(page);
#ifndef NDEBUG
#define pfn (nr_pages - 1 - (pfn - (alloc_epfn - alloc_spfn)))
#endif
            ((u32 *)vphysmap_start)[pfn] = mfn;
            machine_to_phys_mapping[mfn] = pfn;
#undef pfn
            page++; pfn++;
        }
    }
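
    /*
     * Illustrative note (not from the original source): in debug (!NDEBUG)
     * builds the two loops above deliberately scramble the phys->machine
     * mapping -- pfns beyond the bootstrap image region are given mfns taken
     * from the top of the initial allocation working downwards, and the
     * remaining pfns are filled in reverse order via the temporary
     * '#define pfn' trick. This appears intended to catch dom0 kernels that
     * wrongly assume the pseudo-physical and machine address spaces are
     * contiguous.
     */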

    if ( initrd_len != 0 )
    {
        si->mod_start = vinitrd_start;
        si->mod_len = initrd_len;
        printk("Initrd len 0x%lx, start at 0x%lx\n",
               si->mod_len, si->mod_start);
    }

    memset(si->cmd_line, 0, sizeof(si->cmd_line));
    if ( cmdline != NULL )
        strncpy((char *)si->cmd_line, cmdline, sizeof(si->cmd_line)-1);

    /* Reinstate the caller's page tables. */
    write_ptbase(current);
    local_irq_enable();

#if defined(__i386__)
    /* Destroy low mappings - they were only for our convenience. */
    zap_low_mappings(l2start);
    zap_low_mappings(idle_pg_table_l2);
#endif

    /* DOM0 gets access to everything. */
    physdev_init_dom0(d);

    init_domain_time(d);

    set_bit(_DOMF_constructed, &d->domain_flags);

    new_thread(v, dsi.v_kernentry, vstack_end, vstartinfo_start);

    if ( opt_dom0_shadow || opt_dom0_translate )
    {
        printk("dom0: shadow enable\n");
        shadow_mode_enable(d, (opt_dom0_translate
                               ? SHM_enable | SHM_refcounts | SHM_translate
                               : SHM_enable));
        if ( opt_dom0_translate )
        {
            printk("dom0: shadow translate\n");
#if defined(__i386__) && defined(CONFIG_X86_PAE)
            printk("FIXME: PAE code needed here: %s:%d (%s)\n",
                   __FILE__, __LINE__, __FUNCTION__);
            for ( ; ; )
                __asm__ __volatile__ ( "hlt" );
#else
            /* Hmm, what does this do?
               Looks like it isn't portable across 32/64-bit and PAE/non-PAE ...
               -- kraxel */

            /* mafetter: This code is mostly a hack in order to be able to
             * test with dom0's which are running with shadow translate.
             * I expect we'll rip this out once we have a stable set of
             * domU clients which use the various shadow modes, but it's
             * useful to leave this here for now...
             */

            // map this domain's p2m table into current page table,
            // so that we can easily access it.
            //
            ASSERT( root_get_intpte(idle_pg_table[1]) == 0 );
            ASSERT( pagetable_get_paddr(d->arch.phys_table) );
            idle_pg_table[1] = root_from_paddr(
                pagetable_get_paddr(d->arch.phys_table), __PAGE_HYPERVISOR);
            translate_l2pgtable(d, (l1_pgentry_t *)(1u << L2_PAGETABLE_SHIFT),
                                pagetable_get_pfn(v->arch.guest_table));
            idle_pg_table[1] = root_empty();
            local_flush_tlb();
#endif
        }

        update_pagetables(v); /* XXX SMP */
        printk("dom0: shadow setup done\n");
    }

    return 0;
}

int elf_sanity_check(Elf_Ehdr *ehdr)
{
    if ( !IS_ELF(*ehdr) ||
#if defined(__i386__)
         (ehdr->e_ident[EI_CLASS] != ELFCLASS32) ||
         (ehdr->e_machine != EM_386) ||
#elif defined(__x86_64__)
         (ehdr->e_ident[EI_CLASS] != ELFCLASS64) ||
         (ehdr->e_machine != EM_X86_64) ||
#endif
         (ehdr->e_ident[EI_DATA] != ELFDATA2LSB) ||
         (ehdr->e_type != ET_EXEC) )
    {
        printk("DOM0 image is not a Xen-compatible Elf image.\n");
        return 0;
    }

    return 1;
}

/*
 * Local variables:
 * mode: C
 * c-set-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */