direct-io.hg

view xen/arch/x86/domain_build.c @ 8736:8aeb417387ca

Fix some more pfn/mfn/gmfn/gpfn inconsistencies. Fix some direct
uses of the max_page variable to use the mfn_valid() predicate.

Signed-off-by: Keir Fraser <keir@xensource.com>
author kaf24@firebug.cl.cam.ac.uk
date Thu Feb 02 12:18:28 2006 +0100 (2006-02-02)
parents 0c94043f5c5b
children 5d9f4e6c9519
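
The changeset description refers to replacing open-coded comparisons against
max_page with the mfn_valid() predicate. As a rough illustration only (this is
not part of the file below, and it assumes mfn_valid() at this point in the
tree is essentially a bounds check against max_page), the pattern in a
hypothetical caller changes from:

    if ( mfn >= max_page )      /* direct use of max_page */
        return -EINVAL;

to:

    if ( !mfn_valid(mfn) )      /* same check behind the predicate */
        return -EINVAL;
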
line source
/******************************************************************************
 * domain_build.c
 *
 * Copyright (c) 2002-2005, K A Fraser
 */

#include <xen/config.h>
#include <xen/init.h>
#include <xen/lib.h>
#include <xen/ctype.h>
#include <xen/sched.h>
#include <xen/smp.h>
#include <xen/delay.h>
#include <xen/event.h>
#include <xen/elf.h>
#include <xen/kernel.h>
#include <xen/domain.h>
#include <xen/compile.h>
#include <xen/iocap.h>
#include <asm/regs.h>
#include <asm/system.h>
#include <asm/io.h>
#include <asm/processor.h>
#include <asm/desc.h>
#include <asm/i387.h>
#include <asm/shadow.h>

static long dom0_nrpages;

/*
 * dom0_mem:
 *  If +ve:
 *   * The specified amount of memory is allocated to domain 0.
 *  If -ve:
 *   * All of memory is allocated to domain 0, minus the specified amount.
 *  If not specified:
 *   * All of memory is allocated to domain 0, minus 1/16th which is reserved
 *     for uses such as DMA buffers (the reservation is clamped to 128MB).
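 *
 *  For example, "dom0_mem=512M" gives domain 0 exactly 512MB, while
 *  "dom0_mem=-256M" gives it all memory except 256MB.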
 */
static void parse_dom0_mem(char *s)
{
    unsigned long long bytes;
    char *t = s;
    if ( *s == '-' )
        t++;
    bytes = parse_size_and_unit(t);
    dom0_nrpages = bytes >> PAGE_SHIFT;
    if ( *s == '-' )
        dom0_nrpages = -dom0_nrpages;
}
custom_param("dom0_mem", parse_dom0_mem);

static unsigned int opt_dom0_shadow = 0;
boolean_param("dom0_shadow", opt_dom0_shadow);

static unsigned int opt_dom0_translate = 0;
boolean_param("dom0_translate", opt_dom0_translate);

static char opt_dom0_ioports_disable[200] = "";
string_param("dom0_ioports_disable", opt_dom0_ioports_disable);

#if defined(__i386__)
/* No ring-3 access in initial leaf page tables. */
#define L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED)
#define L2_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
#define L3_PROT (_PAGE_PRESENT)
#elif defined(__x86_64__)
/* Allow ring-3 access in long mode as guest cannot use ring 1. */
#define L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_USER)
#define L2_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
#define L3_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
#define L4_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
#endif

#define round_pgup(_p)    (((_p)+(PAGE_SIZE-1))&PAGE_MASK)
#define round_pgdown(_p)  ((_p)&PAGE_MASK)

static struct page_info *alloc_chunk(struct domain *d, unsigned long max_pages)
{
    struct page_info *page;
    unsigned int order;
    /*
     * Allocate up to 2MB at a time: It prevents allocating very large chunks
     * from DMA pools before the >4GB pool is fully depleted.
     */
    if ( max_pages > (2UL << (20 - PAGE_SHIFT)) )
        max_pages = 2UL << (20 - PAGE_SHIFT);
    order = get_order_from_pages(max_pages);
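    /*
     * get_order_from_pages() rounds up, so if max_pages is not a power of
     * two drop one order: the chunk must never exceed the caller's limit.
     */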
    if ( (max_pages & (max_pages-1)) != 0 )
        order--;
    while ( (page = alloc_domheap_pages(d, order, 0)) == NULL )
        if ( order-- == 0 )
            break;
    return page;
}

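/*
 * Parse "dom0_ioports_disable=<hex>[-<hex>][,...]" and deny domain 0
 * access to each I/O port range listed.
 */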
static void process_dom0_ioports_disable(void)
{
    unsigned long io_from, io_to;
    char *t, *u, *s = opt_dom0_ioports_disable;

    if ( *s == '\0' )
        return;

    while ( (t = strsep(&s, ",")) != NULL )
    {
        io_from = simple_strtoul(t, &u, 16);
        if ( u == t )
        {
        parse_error:
            printk("Invalid ioport range <%s> "
                   "in dom0_ioports_disable, skipping\n", t);
            continue;
        }

        if ( *u == '\0' )
            io_to = io_from;
        else if ( *u == '-' )
            io_to = simple_strtoul(u + 1, &u, 16);
        else
            goto parse_error;

        if ( (*u != '\0') || (io_to < io_from) || (io_to >= 65536) )
            goto parse_error;

        printk("Disabling dom0 access to ioport range %04lx-%04lx\n",
               io_from, io_to);

        if ( ioports_deny_access(dom0, io_from, io_to) != 0 )
            BUG();
    }
}

int construct_dom0(struct domain *d,
                   unsigned long _image_start, unsigned long image_len,
                   unsigned long _initrd_start, unsigned long initrd_len,
                   char *cmdline)
{
    int i, rc, dom0_pae, xen_pae, order;
    unsigned long pfn, mfn;
    unsigned long nr_pages;
    unsigned long nr_pt_pages;
    unsigned long alloc_spfn;
    unsigned long alloc_epfn;
    unsigned long count;
    struct page_info *page = NULL;
    start_info_t *si;
    struct vcpu *v = d->vcpu[0];
    char *p;
    unsigned long hypercall_page;
#if defined(__i386__)
    char *image_start  = (char *)_image_start;  /* use lowmem mappings */
    char *initrd_start = (char *)_initrd_start; /* use lowmem mappings */
#elif defined(__x86_64__)
    char *image_start  = __va(_image_start);
    char *initrd_start = __va(_initrd_start);
#endif
#if CONFIG_PAGING_LEVELS >= 4
    l4_pgentry_t *l4tab = NULL, *l4start = NULL;
#endif
#if CONFIG_PAGING_LEVELS >= 3
    l3_pgentry_t *l3tab = NULL, *l3start = NULL;
#endif
    l2_pgentry_t *l2tab = NULL, *l2start = NULL;
    l1_pgentry_t *l1tab = NULL, *l1start = NULL;

    /*
     * This fully describes the memory layout of the initial domain. All
     * *_start address are page-aligned, except v_start (and v_end) which are
     * superpage-aligned.
     */
    struct domain_setup_info dsi;
    unsigned long vinitrd_start;
    unsigned long vinitrd_end;
    unsigned long vphysmap_start;
    unsigned long vphysmap_end;
    unsigned long vstartinfo_start;
    unsigned long vstartinfo_end;
    unsigned long vstack_start;
    unsigned long vstack_end;
    unsigned long vpt_start;
    unsigned long vpt_end;
    unsigned long v_end;

    /* Machine address of next candidate page-table page. */
    unsigned long mpt_alloc;

    extern void translate_l2pgtable(
        struct domain *d, l1_pgentry_t *p2m, unsigned long l2mfn);

    /* Sanity! */
    BUG_ON(d->domain_id != 0);
    BUG_ON(d->vcpu[0] == NULL);
    BUG_ON(test_bit(_VCPUF_initialised, &v->vcpu_flags));

    memset(&dsi, 0, sizeof(struct domain_setup_info));
    dsi.image_addr = (unsigned long)image_start;
    dsi.image_len  = image_len;

    printk("*** LOADING DOMAIN 0 ***\n");

    d->max_pages = ~0U;

    /*
     * If domain 0 allocation isn't specified, reserve 1/16th of available
     * memory for things like DMA buffers. This reservation is clamped to
     * a maximum of 128MB.
     */
    if ( dom0_nrpages == 0 )
    {
        dom0_nrpages = avail_domheap_pages() +
            ((initrd_len + PAGE_SIZE - 1) >> PAGE_SHIFT) +
            ((image_len  + PAGE_SIZE - 1) >> PAGE_SHIFT);
        dom0_nrpages = min(dom0_nrpages / 16, 128L << (20 - PAGE_SHIFT));
        dom0_nrpages = -dom0_nrpages;
    }

    /* Negative memory specification means "all memory - specified amount". */
    if ( dom0_nrpages < 0 )
        nr_pages = avail_domheap_pages() +
            ((initrd_len + PAGE_SIZE - 1) >> PAGE_SHIFT) +
            ((image_len  + PAGE_SIZE - 1) >> PAGE_SHIFT) +
            dom0_nrpages;
    else
        nr_pages = dom0_nrpages;

    if ( (rc = parseelfimage(&dsi)) != 0 )
        return rc;

    if ( dsi.xen_section_string == NULL )
    {
        printk("Not a Xen-ELF image: '__xen_guest' section not found.\n");
        return -EINVAL;
    }

    dom0_pae = !!strstr(dsi.xen_section_string, "PAE=yes");
    xen_pae  = (CONFIG_PAGING_LEVELS == 3);
    if ( dom0_pae != xen_pae )
    {
        printk("PAE mode mismatch between Xen and DOM0 (xen=%s, dom0=%s)\n",
               xen_pae ? "yes" : "no", dom0_pae ? "yes" : "no");
        return -EINVAL;
    }

    if ( strstr(dsi.xen_section_string, "SHADOW=translate") )
        opt_dom0_translate = 1;

    /* Align load address to 4MB boundary. */
    dsi.v_start &= ~((1UL<<22)-1);

    /*
     * Why do we need this? The number of page-table frames depends on the
     * size of the bootstrap address space. But the size of the address space
     * depends on the number of page-table frames (since each one is mapped
     * read-only). We have a pair of simultaneous equations in two unknowns,
     * which we solve by exhaustive search.
     */
    vinitrd_start    = round_pgup(dsi.v_end);
    vinitrd_end      = vinitrd_start + initrd_len;
    vphysmap_start   = round_pgup(vinitrd_end);
    vphysmap_end     = vphysmap_start + (nr_pages * sizeof(unsigned long));
    vstartinfo_start = round_pgup(vphysmap_end);
    vstartinfo_end   = vstartinfo_start + PAGE_SIZE;
    vpt_start        = vstartinfo_end;
    for ( nr_pt_pages = 2; ; nr_pt_pages++ )
    {
        vpt_end      = vpt_start + (nr_pt_pages * PAGE_SIZE);
        vstack_start = vpt_end;
        vstack_end   = vstack_start + PAGE_SIZE;
        v_end        = (vstack_end + (1UL<<22)-1) & ~((1UL<<22)-1);
        if ( (v_end - vstack_end) < (512UL << 10) )
            v_end += 1UL << 22; /* Add extra 4MB to get >= 512kB padding. */
#if defined(__i386__) && !defined(CONFIG_X86_PAE)
        if ( (((v_end - dsi.v_start + ((1UL<<L2_PAGETABLE_SHIFT)-1)) >>
               L2_PAGETABLE_SHIFT) + 1) <= nr_pt_pages )
            break;
#elif defined(__i386__) && defined(CONFIG_X86_PAE)
        /* 5 pages: 1x 3rd + 4x 2nd level */
        if ( (((v_end - dsi.v_start + ((1UL<<L2_PAGETABLE_SHIFT)-1)) >>
               L2_PAGETABLE_SHIFT) + 5) <= nr_pt_pages )
            break;
#elif defined(__x86_64__)
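/* NR(_l,_h,_s): number of 2^(_s)-aligned chunks needed to cover [_l,_h). */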
#define NR(_l,_h,_s) \
    (((((_h) + ((1UL<<(_s))-1)) & ~((1UL<<(_s))-1)) - \
       ((_l) & ~((1UL<<(_s))-1))) >> (_s))
        if ( (1 + /* # L4 */
              NR(dsi.v_start, v_end, L4_PAGETABLE_SHIFT) + /* # L3 */
              NR(dsi.v_start, v_end, L3_PAGETABLE_SHIFT) + /* # L2 */
              NR(dsi.v_start, v_end, L2_PAGETABLE_SHIFT)) /* # L1 */
             <= nr_pt_pages )
            break;
#endif
    }

    order = get_order_from_bytes(v_end - dsi.v_start);
    if ( (1UL << order) > nr_pages )
        panic("Domain 0 allocation is too small for kernel image.\n");

    /* Allocate from DMA pool: PAE L3 table must be below 4GB boundary. */
    if ( (page = alloc_domheap_pages(d, order, ALLOC_DOM_DMA)) == NULL )
        panic("Not enough RAM for domain 0 allocation.\n");
    alloc_spfn = page_to_mfn(page);
    alloc_epfn = alloc_spfn + d->tot_pages;

    printk("PHYSICAL MEMORY ARRANGEMENT:\n"
           " Dom0 alloc.:   %"PRIpaddr"->%"PRIpaddr,
           pfn_to_paddr(alloc_spfn), pfn_to_paddr(alloc_epfn));
    if ( d->tot_pages < nr_pages )
        printk(" (%lu pages to be allocated)",
               nr_pages - d->tot_pages);
    printk("\nVIRTUAL MEMORY ARRANGEMENT:\n"
           " Loaded kernel: %p->%p\n"
           " Init. ramdisk: %p->%p\n"
           " Phys-Mach map: %p->%p\n"
           " Start info:    %p->%p\n"
           " Page tables:   %p->%p\n"
           " Boot stack:    %p->%p\n"
           " TOTAL:         %p->%p\n",
           _p(dsi.v_kernstart), _p(dsi.v_kernend),
           _p(vinitrd_start), _p(vinitrd_end),
           _p(vphysmap_start), _p(vphysmap_end),
           _p(vstartinfo_start), _p(vstartinfo_end),
           _p(vpt_start), _p(vpt_end),
           _p(vstack_start), _p(vstack_end),
           _p(dsi.v_start), _p(v_end));
    printk(" ENTRY ADDRESS: %p\n", _p(dsi.v_kernentry));

    if ( (v_end - dsi.v_start) > (nr_pages * PAGE_SIZE) )
    {
        printk("Initial guest OS requires too much space\n"
               "(%luMB is greater than %luMB limit)\n",
               (v_end-dsi.v_start)>>20, (nr_pages<<PAGE_SHIFT)>>20);
        return -ENOMEM;
    }

    mpt_alloc = (vpt_start - dsi.v_start) +
        (unsigned long)pfn_to_paddr(alloc_spfn);

    /*
     * We're basically forcing default RPLs to 1, so that our "what privilege
     * level are we returning to?" logic works.
     */
    v->arch.guest_context.kernel_ss = FLAT_KERNEL_SS;
    for ( i = 0; i < 256; i++ )
        v->arch.guest_context.trap_ctxt[i].cs = FLAT_KERNEL_CS;

#if defined(__i386__)

    v->arch.guest_context.failsafe_callback_cs = FLAT_KERNEL_CS;
    v->arch.guest_context.event_callback_cs    = FLAT_KERNEL_CS;

    /*
     * Protect the lowest 1GB of memory. We use a temporary mapping there
     * from which we copy the kernel and ramdisk images.
     */
    if ( dsi.v_start < (1UL<<30) )
    {
        printk("Initial loading isn't allowed to lowest 1GB of memory.\n");
        return -EINVAL;
    }

    /* WARNING: The new domain must have its 'processor' field filled in! */
#if CONFIG_PAGING_LEVELS == 3
    l3start = l3tab = (l3_pgentry_t *)mpt_alloc; mpt_alloc += PAGE_SIZE;
    l2start = l2tab = (l2_pgentry_t *)mpt_alloc; mpt_alloc += 4*PAGE_SIZE;
    memcpy(l2tab, idle_pg_table_l2, 4*PAGE_SIZE);
    for (i = 0; i < 4; i++) {
        l3tab[i] = l3e_from_paddr((u32)l2tab + i*PAGE_SIZE, L3_PROT);
        l2tab[(LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT)+i] =
            l2e_from_paddr((u32)l2tab + i*PAGE_SIZE, __PAGE_HYPERVISOR);
    }
    v->arch.guest_table = mk_pagetable((unsigned long)l3start);
#else
    l2start = l2tab = (l2_pgentry_t *)mpt_alloc; mpt_alloc += PAGE_SIZE;
    memcpy(l2tab, &idle_pg_table[0], PAGE_SIZE);
    l2tab[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] =
        l2e_from_paddr((unsigned long)l2start, __PAGE_HYPERVISOR);
    v->arch.guest_table = mk_pagetable((unsigned long)l2start);
#endif

    for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
        l2tab[l2_linear_offset(PERDOMAIN_VIRT_START) + i] =
            l2e_from_page(virt_to_page(d->arch.mm_perdomain_pt) + i,
                          __PAGE_HYPERVISOR);
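
    /*
     * Map the bootstrap region [v_start, v_end) onto the machine frames just
     * allocated, creating writable L1 mappings and pulling a fresh L1 table
     * from the page-table area (mpt_alloc) each time the current one fills.
     */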
    l2tab += l2_linear_offset(dsi.v_start);
    mfn = alloc_spfn;
    for ( count = 0; count < ((v_end-dsi.v_start)>>PAGE_SHIFT); count++ )
    {
        if ( !((unsigned long)l1tab & (PAGE_SIZE-1)) )
        {
            l1start = l1tab = (l1_pgentry_t *)mpt_alloc;
            mpt_alloc += PAGE_SIZE;
            *l2tab = l2e_from_paddr((unsigned long)l1start, L2_PROT);
            l2tab++;
            clear_page(l1tab);
            if ( count == 0 )
                l1tab += l1_table_offset(dsi.v_start);
        }
        *l1tab = l1e_from_pfn(mfn, L1_PROT);
        l1tab++;

        page = mfn_to_page(mfn);
        if ( !get_page_and_type(page, d, PGT_writable_page) )
            BUG();

        mfn++;
    }

    /* Pages that are part of page tables must be read only. */
    l2tab = l2start + l2_linear_offset(vpt_start);
    l1start = l1tab = (l1_pgentry_t *)(u32)l2e_get_paddr(*l2tab);
    l1tab += l1_table_offset(vpt_start);
    for ( count = 0; count < nr_pt_pages; count++ )
    {
        page = mfn_to_page(l1e_get_pfn(*l1tab));
        if ( !opt_dom0_shadow )
            l1e_remove_flags(*l1tab, _PAGE_RW);
        else
            if ( !get_page_type(page, PGT_writable_page) )
                BUG();

#if CONFIG_PAGING_LEVELS == 3
        switch (count) {
        case 0:
            page->u.inuse.type_info &= ~PGT_type_mask;
            page->u.inuse.type_info |= PGT_l3_page_table;
            get_page(page, d); /* an extra ref because of readable mapping */

            /* Get another ref to L3 page so that it can be pinned. */
            if ( !get_page_and_type(page, d, PGT_l3_page_table) )
                BUG();
            set_bit(_PGT_pinned, &page->u.inuse.type_info);
            break;
        case 1 ... 4:
            page->u.inuse.type_info &= ~PGT_type_mask;
            page->u.inuse.type_info |= PGT_l2_page_table;
            page->u.inuse.type_info |=
                (count-1) << PGT_va_shift;
            get_page(page, d); /* an extra ref because of readable mapping */
            break;
        default:
            page->u.inuse.type_info &= ~PGT_type_mask;
            page->u.inuse.type_info |= PGT_l1_page_table;
            page->u.inuse.type_info |=
                ((dsi.v_start>>L2_PAGETABLE_SHIFT)+(count-5))<<PGT_va_shift;
            get_page(page, d); /* an extra ref because of readable mapping */
            break;
        }
#else
        if ( count == 0 )
        {
            page->u.inuse.type_info &= ~PGT_type_mask;
            page->u.inuse.type_info |= PGT_l2_page_table;

            /*
             * No longer writable: decrement the type_count.
             * Installed as CR3: increment both the ref_count and type_count.
             * Net: just increment the ref_count.
             */
            get_page(page, d); /* an extra ref because of readable mapping */

            /* Get another ref to L2 page so that it can be pinned. */
            if ( !get_page_and_type(page, d, PGT_l2_page_table) )
                BUG();
            set_bit(_PGT_pinned, &page->u.inuse.type_info);
        }
        else
        {
            page->u.inuse.type_info &= ~PGT_type_mask;
            page->u.inuse.type_info |= PGT_l1_page_table;
            page->u.inuse.type_info |=
                ((dsi.v_start>>L2_PAGETABLE_SHIFT)+(count-1))<<PGT_va_shift;

            /*
             * No longer writable: decrement the type_count.
             * This is an L1 page, installed in a validated L2 page:
             * increment both the ref_count and type_count.
             * Net: just increment the ref_count.
             */
            get_page(page, d); /* an extra ref because of readable mapping */
        }
#endif
        if ( !((unsigned long)++l1tab & (PAGE_SIZE - 1)) )
            l1start = l1tab = (l1_pgentry_t *)(u32)l2e_get_paddr(*++l2tab);
    }

#elif defined(__x86_64__)

    /* Overlap with Xen protected area? */
    if ( (dsi.v_start < HYPERVISOR_VIRT_END) &&
         (v_end > HYPERVISOR_VIRT_START) )
    {
        printk("DOM0 image overlaps with Xen private area.\n");
        return -EINVAL;
    }

    /* WARNING: The new domain must have its 'processor' field filled in! */
    maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l4_page_table;
    l4start = l4tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
    memcpy(l4tab, &idle_pg_table[0], PAGE_SIZE);
    l4tab[l4_table_offset(LINEAR_PT_VIRT_START)] =
        l4e_from_paddr(__pa(l4start), __PAGE_HYPERVISOR);
    l4tab[l4_table_offset(PERDOMAIN_VIRT_START)] =
        l4e_from_paddr(__pa(d->arch.mm_perdomain_l3), __PAGE_HYPERVISOR);
    v->arch.guest_table = mk_pagetable(__pa(l4start));
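
    /*
     * Build the 64-bit bootstrap mappings for [v_start, v_end): walk the
     * region page by page, allocating L3/L2/L1 tables from mpt_alloc on
     * demand as each upper-level table fills.
     */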
    l4tab += l4_table_offset(dsi.v_start);
    mfn = alloc_spfn;
    for ( count = 0; count < ((v_end-dsi.v_start)>>PAGE_SHIFT); count++ )
    {
        if ( !((unsigned long)l1tab & (PAGE_SIZE-1)) )
        {
            maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l1_page_table;
            l1start = l1tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
            clear_page(l1tab);
            if ( count == 0 )
                l1tab += l1_table_offset(dsi.v_start);
            if ( !((unsigned long)l2tab & (PAGE_SIZE-1)) )
            {
                maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l2_page_table;
                l2start = l2tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
                clear_page(l2tab);
                if ( count == 0 )
                    l2tab += l2_table_offset(dsi.v_start);
                if ( !((unsigned long)l3tab & (PAGE_SIZE-1)) )
                {
                    maddr_to_page(mpt_alloc)->u.inuse.type_info =
                        PGT_l3_page_table;
                    l3start = l3tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
                    clear_page(l3tab);
                    if ( count == 0 )
                        l3tab += l3_table_offset(dsi.v_start);
                    *l4tab = l4e_from_paddr(__pa(l3start), L4_PROT);
                    l4tab++;
                }
                *l3tab = l3e_from_paddr(__pa(l2start), L3_PROT);
                l3tab++;
            }
            *l2tab = l2e_from_paddr(__pa(l1start), L2_PROT);
            l2tab++;
        }
        *l1tab = l1e_from_pfn(mfn, L1_PROT);
        l1tab++;

        page = mfn_to_page(mfn);
        if ( (page->u.inuse.type_info == 0) &&
             !get_page_and_type(page, d, PGT_writable_page) )
            BUG();

        mfn++;
    }

    /* Pages that are part of page tables must be read only. */
    l4tab = l4start + l4_table_offset(vpt_start);
    l3start = l3tab = l4e_to_l3e(*l4tab);
    l3tab += l3_table_offset(vpt_start);
    l2start = l2tab = l3e_to_l2e(*l3tab);
    l2tab += l2_table_offset(vpt_start);
    l1start = l1tab = l2e_to_l1e(*l2tab);
    l1tab += l1_table_offset(vpt_start);
    for ( count = 0; count < nr_pt_pages; count++ )
    {
        l1e_remove_flags(*l1tab, _PAGE_RW);
        page = mfn_to_page(l1e_get_pfn(*l1tab));

        /* Read-only mapping + PGC_allocated + page-table page. */
        page->count_info        = PGC_allocated | 3;
        page->u.inuse.type_info |= PGT_validated | 1;

        /* Top-level p.t. is pinned. */
        if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_l4_page_table )
        {
            page->count_info        += 1;
            page->u.inuse.type_info += 1 | PGT_pinned;
        }

        /* Iterate. */
        if ( !((unsigned long)++l1tab & (PAGE_SIZE - 1)) )
        {
            if ( !((unsigned long)++l2tab & (PAGE_SIZE - 1)) )
            {
                if ( !((unsigned long)++l3tab & (PAGE_SIZE - 1)) )
                    l3start = l3tab = l4e_to_l3e(*++l4tab);
                l2start = l2tab = l3e_to_l2e(*l3tab);
            }
            l1start = l1tab = l2e_to_l1e(*l2tab);
        }
    }

#endif /* __x86_64__ */

    /* Mask all upcalls... */
    for ( i = 0; i < MAX_VIRT_CPUS; i++ )
        d->shared_info->vcpu_info[i].evtchn_upcall_mask = 1;

    for ( i = 1; i < num_online_cpus(); i++ )
        (void)alloc_vcpu(d, i, i);

    /* Set up monitor table */
    update_pagetables(v);

    /* Install the new page tables. */
    local_irq_disable();
    write_ptbase(v);

    /* Copy the OS image and free temporary buffer. */
    (void)loadelfimage(&dsi);
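
    /*
     * If the guest's __xen_guest section carries a "HYPERCALL_PAGE=<pfn>"
     * field, validate the requested guest page and fill it with hypercall
     * transfer stubs.
     */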
    p = strstr(dsi.xen_section_string, "HYPERCALL_PAGE=");
    if ( p != NULL )
    {
        p += strlen("HYPERCALL_PAGE=");
        hypercall_page = simple_strtoul(p, NULL, 16);
        hypercall_page = dsi.v_start + (hypercall_page << PAGE_SHIFT);
        if ( (hypercall_page < dsi.v_start) || (hypercall_page >= v_end) )
        {
            write_ptbase(current);
            local_irq_enable();
            printk("Invalid HYPERCALL_PAGE field in guest header.\n");
            return -1;
        }

        hypercall_page_initialise((void *)hypercall_page);
    }

    init_domheap_pages(
        _image_start, (_image_start+image_len+PAGE_SIZE-1) & PAGE_MASK);

    /* Copy the initial ramdisk and free temporary buffer. */
    if ( initrd_len != 0 )
    {
        memcpy((void *)vinitrd_start, initrd_start, initrd_len);
        init_domheap_pages(
            _initrd_start, (_initrd_start+initrd_len+PAGE_SIZE-1) & PAGE_MASK);
    }

    /* Set up start info area. */
    si = (start_info_t *)vstartinfo_start;
    memset(si, 0, PAGE_SIZE);
    si->nr_pages = nr_pages;

    si->shared_info = virt_to_maddr(d->shared_info);
    if ( opt_dom0_translate )
    {
        si->shared_info = max_page << PAGE_SHIFT;
        set_gpfn_from_mfn(virt_to_maddr(d->shared_info) >> PAGE_SHIFT, max_page);
    }

    si->flags        = SIF_PRIVILEGED | SIF_INITDOMAIN;
    si->pt_base      = vpt_start;
    si->nr_pt_frames = nr_pt_pages;
    si->mfn_list     = vphysmap_start;
    sprintf(si->magic, "xen-%i.%i-x86_%d%s",
            XEN_VERSION, XEN_SUBVERSION, BITS_PER_LONG, xen_pae ? "p" : "");

    /* Write the phys->machine and machine->phys table entries. */
    for ( pfn = 0; pfn < d->tot_pages; pfn++ )
    {
        mfn = pfn + alloc_spfn;
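        /*
         * In debug builds the tail of dom0's allocation is handed out in
         * reverse machine-frame order, making the phys->machine map
         * deliberately non-contiguous there (this trips up guests that
         * wrongly assume pfn == mfn or a monotonic mapping).
         */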
#ifndef NDEBUG
#define REVERSE_START ((v_end - dsi.v_start) >> PAGE_SHIFT)
        if ( !opt_dom0_translate && (pfn > REVERSE_START) )
            mfn = alloc_epfn - (pfn - REVERSE_START);
#endif
        ((unsigned long *)vphysmap_start)[pfn] = mfn;
        set_gpfn_from_mfn(mfn, pfn);
    }
    while ( pfn < nr_pages )
    {
        if ( (page = alloc_chunk(d, nr_pages - d->tot_pages)) == NULL )
            panic("Not enough RAM for DOM0 reservation.\n");
        while ( pfn < d->tot_pages )
        {
            mfn = page_to_mfn(page);
#ifndef NDEBUG
#define pfn (nr_pages - 1 - (pfn - (alloc_epfn - alloc_spfn)))
#endif
            ((unsigned long *)vphysmap_start)[pfn] = mfn;
            set_gpfn_from_mfn(mfn, pfn);
#undef pfn
            page++; pfn++;
        }
    }

    if ( initrd_len != 0 )
    {
        si->mod_start = vinitrd_start;
        si->mod_len   = initrd_len;
        printk("Initrd len 0x%lx, start at 0x%lx\n",
               si->mod_len, si->mod_start);
    }

    memset(si->cmd_line, 0, sizeof(si->cmd_line));
    if ( cmdline != NULL )
        strncpy((char *)si->cmd_line, cmdline, sizeof(si->cmd_line)-1);

    /* Reinstate the caller's page tables. */
    write_ptbase(current);
    local_irq_enable();

#if defined(__i386__)
    /* Destroy low mappings - they were only for our convenience. */
    zap_low_mappings(l2start);
    zap_low_mappings(idle_pg_table_l2);
#endif

    init_domain_time(d);

    set_bit(_VCPUF_initialised, &v->vcpu_flags);

    new_thread(v, dsi.v_kernentry, vstack_end, vstartinfo_start);

    if ( opt_dom0_shadow || opt_dom0_translate )
    {
        printk("dom0: shadow enable\n");
        shadow_mode_enable(d, (opt_dom0_translate
                               ? SHM_enable | SHM_refcounts | SHM_translate
                               : SHM_enable));
        if ( opt_dom0_translate )
        {
            printk("dom0: shadow translate\n");
#if defined(__i386__) && defined(CONFIG_X86_PAE)
            printk("FIXME: PAE code needed here: %s:%d (%s)\n",
                   __FILE__, __LINE__, __FUNCTION__);
            for ( ; ; )
                __asm__ __volatile__ ( "hlt" );
#else
            /* Hmm, what does this?
               Looks like isn't portable across 32/64 bit and pae/non-pae ...
               -- kraxel */

            /* mafetter: This code is mostly a hack in order to be able to
             * test with dom0's which are running with shadow translate.
             * I expect we'll rip this out once we have a stable set of
             * domU clients which use the various shadow modes, but it's
             * useful to leave this here for now...
             */

            // map this domain's p2m table into current page table,
            // so that we can easily access it.
            //
            ASSERT( root_get_intpte(idle_pg_table[1]) == 0 );
            ASSERT( pagetable_get_paddr(d->arch.phys_table) );
            idle_pg_table[1] = root_from_paddr(
                pagetable_get_paddr(d->arch.phys_table), __PAGE_HYPERVISOR);
            translate_l2pgtable(d, (l1_pgentry_t *)(1u << L2_PAGETABLE_SHIFT),
                                pagetable_get_pfn(v->arch.guest_table));
            idle_pg_table[1] = root_empty();
            local_flush_tlb();
#endif
        }

        update_pagetables(v); /* XXX SMP */
        printk("dom0: shadow setup done\n");
    }

    i = 0;

    /* DOM0 is permitted full I/O capabilities. */
    i |= ioports_permit_access(dom0, 0, 0xFFFF);
    i |= iomem_permit_access(dom0, 0UL, ~0UL);
    i |= irqs_permit_access(dom0, 0, NR_PIRQS-1);

    /*
     * Modify I/O port access permissions.
     */
    /* Master Interrupt Controller (PIC). */
    i |= ioports_deny_access(dom0, 0x20, 0x21);
    /* Slave Interrupt Controller (PIC). */
    i |= ioports_deny_access(dom0, 0xA0, 0xA1);
    /* Interval Timer (PIT). */
    i |= ioports_deny_access(dom0, 0x40, 0x43);
    /* PIT Channel 2 / PC Speaker Control. */
    i |= ioports_deny_access(dom0, 0x61, 0x61);
    /* Command-line I/O ranges. */
    process_dom0_ioports_disable();

    BUG_ON(i != 0);

    return 0;
}
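
/*
 * Basic ELF header checks: class, machine type, endianness and object type
 * must match what this hypervisor build can load.
 */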
int elf_sanity_check(Elf_Ehdr *ehdr)
{
    if ( !IS_ELF(*ehdr) ||
#if defined(__i386__)
         (ehdr->e_ident[EI_CLASS] != ELFCLASS32) ||
         (ehdr->e_machine != EM_386) ||
#elif defined(__x86_64__)
         (ehdr->e_ident[EI_CLASS] != ELFCLASS64) ||
         (ehdr->e_machine != EM_X86_64) ||
#endif
         (ehdr->e_ident[EI_DATA] != ELFDATA2LSB) ||
         (ehdr->e_type != ET_EXEC) )
    {
        printk("DOM0 image is not a Xen-compatible Elf image.\n");
        return 0;
    }

    return 1;
}

/*
 * Local variables:
 * mode: C
 * c-set-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */