ia64/xen-unstable: xen/arch/x86/domain_build.c @ 8468:d966b7a00959

Allow non-privileged domains restricted access to I/O memory and physical
interrupts, under control of domain0. Capabilities are maintained as
rangesets in Xen.

Signed-off-by: Ryan Wilson <hap9@epoch.ncsc.mil>
Signed-off-by: Keir Fraser <keir@xensource.com>

Author:   kaf24@firebug.cl.cam.ac.uk
Date:     Sat Dec 31 14:15:22 2005 +0100 (2005-12-31)
Parents:  4369fd869f51
Children: 1572681e4e5a
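The rangeset-backed capability calls this changeset introduces are exercised near
the end of construct_dom0() in the source below. As a rough sketch of the calling
convention (error codes are OR-ed together and checked once; the calls are made
available via <xen/iocap.h>, which this file includes), dom0 is first granted
blanket access and sensitive ranges are then carved back out:

    /* Sketch only: condensed from the construct_dom0() code further down. */
    int rc = 0;
    rc |= ioports_permit_access(dom0, 0, 0xFFFF);   /* all I/O ports          */
    rc |= iomem_permit_access(dom0, 0UL, ~0UL);     /* all I/O memory         */
    rc |= irqs_permit_access(dom0, 0, NR_PIRQS-1);  /* all physical IRQs      */
    rc |= ioports_deny_access(dom0, 0x20, 0x21);    /* ...except, e.g., the PIC */
    BUG_ON(rc != 0);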
/******************************************************************************
 * domain_build.c
 *
 * Copyright (c) 2002-2005, K A Fraser
 */

#include <xen/config.h>
#include <xen/init.h>
#include <xen/lib.h>
#include <xen/ctype.h>
#include <xen/sched.h>
#include <xen/smp.h>
#include <xen/delay.h>
#include <xen/event.h>
#include <xen/elf.h>
#include <xen/kernel.h>
#include <xen/domain.h>
#include <xen/compile.h>
#include <xen/iocap.h>
#include <asm/regs.h>
#include <asm/system.h>
#include <asm/io.h>
#include <asm/processor.h>
#include <asm/desc.h>
#include <asm/i387.h>
#include <asm/shadow.h>

static long dom0_nrpages;

/*
 * dom0_mem:
 *  If +ve:
 *   * The specified amount of memory is allocated to domain 0.
 *  If -ve:
 *   * All of memory is allocated to domain 0, minus the specified amount.
 *  If not specified:
 *   * All of memory is allocated to domain 0, minus 1/16th which is reserved
 *     for uses such as DMA buffers (the reservation is clamped to 128MB).
 */
static void parse_dom0_mem(char *s)
{
    unsigned long long bytes;
    char *t = s;
    if ( *s == '-' )
        t++;
    bytes = parse_size_and_unit(t);
    dom0_nrpages = bytes >> PAGE_SHIFT;
    if ( *s == '-' )
        dom0_nrpages = -dom0_nrpages;
}
custom_param("dom0_mem", parse_dom0_mem);

static unsigned int opt_dom0_shadow = 0;
boolean_param("dom0_shadow", opt_dom0_shadow);

static unsigned int opt_dom0_translate = 0;
boolean_param("dom0_translate", opt_dom0_translate);

static char opt_dom0_ioports_disable[200] = "";
string_param("dom0_ioports_disable", opt_dom0_ioports_disable);

#if defined(__i386__)
/* No ring-3 access in initial leaf page tables. */
#define L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED)
#define L2_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
#define L3_PROT (_PAGE_PRESENT)
#elif defined(__x86_64__)
/* Allow ring-3 access in long mode as guest cannot use ring 1. */
#define L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_USER)
#define L2_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
#define L3_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
#define L4_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
#endif

#define round_pgup(_p)    (((_p)+(PAGE_SIZE-1))&PAGE_MASK)
#define round_pgdown(_p)  ((_p)&PAGE_MASK)

static struct pfn_info *alloc_chunk(struct domain *d, unsigned long max_pages)
{
    struct pfn_info *page;
    unsigned int order;
    /*
     * Allocate up to 2MB at a time: It prevents allocating very large chunks
     * from DMA pools before the >4GB pool is fully depleted.
     */
    if ( max_pages > (2UL << (20 - PAGE_SHIFT)) )
        max_pages = 2UL << (20 - PAGE_SHIFT);
    order = get_order_from_pages(max_pages);
    if ( (max_pages & (max_pages-1)) != 0 )
        order--;
    while ( (page = alloc_domheap_pages(d, order, 0)) == NULL )
        if ( order-- == 0 )
            break;
    return page;
}

static void process_dom0_ioports_disable(void)
{
    unsigned long io_from, io_to;
    char *t, *u, *s = opt_dom0_ioports_disable;

    if ( *s == '\0' )
        return;

    while ( (t = strsep(&s, ",")) != NULL )
    {
        io_from = simple_strtoul(t, &u, 16);
        if ( u == t )
        {
        parse_error:
            printk("Invalid ioport range <%s> "
                   "in dom0_ioports_disable, skipping\n", t);
            continue;
        }

        if ( *u == '\0' )
            io_to = io_from;
        else if ( *u == '-' )
            io_to = simple_strtoul(u + 1, &u, 16);
        else
            goto parse_error;

        if ( (*u != '\0') || (io_to < io_from) || (io_to >= 65536) )
            goto parse_error;

        printk("Disabling dom0 access to ioport range %04lx-%04lx\n",
               io_from, io_to);

        if ( ioports_deny_access(dom0, io_from, io_to) != 0 )
            BUG();
    }
}

int construct_dom0(struct domain *d,
                   unsigned long _image_start, unsigned long image_len,
                   unsigned long _initrd_start, unsigned long initrd_len,
                   char *cmdline)
{
    int i, rc, dom0_pae, xen_pae, order;
    unsigned long pfn, mfn;
    unsigned long nr_pages;
    unsigned long nr_pt_pages;
    unsigned long alloc_spfn;
    unsigned long alloc_epfn;
    unsigned long count;
    struct pfn_info *page = NULL;
    start_info_t *si;
    struct vcpu *v = d->vcpu[0];
#if defined(__i386__)
    char *image_start = (char *)_image_start;   /* use lowmem mappings */
    char *initrd_start = (char *)_initrd_start; /* use lowmem mappings */
#elif defined(__x86_64__)
    char *image_start = __va(_image_start);
    char *initrd_start = __va(_initrd_start);
#endif
#if CONFIG_PAGING_LEVELS >= 4
    l4_pgentry_t *l4tab = NULL, *l4start = NULL;
#endif
#if CONFIG_PAGING_LEVELS >= 3
    l3_pgentry_t *l3tab = NULL, *l3start = NULL;
#endif
    l2_pgentry_t *l2tab = NULL, *l2start = NULL;
    l1_pgentry_t *l1tab = NULL, *l1start = NULL;

    /*
     * This fully describes the memory layout of the initial domain. All
     * *_start addresses are page-aligned, except v_start (and v_end), which
     * are superpage-aligned.
     */
    struct domain_setup_info dsi;
    unsigned long vinitrd_start;
    unsigned long vinitrd_end;
    unsigned long vphysmap_start;
    unsigned long vphysmap_end;
    unsigned long vstartinfo_start;
    unsigned long vstartinfo_end;
    unsigned long vstack_start;
    unsigned long vstack_end;
    unsigned long vpt_start;
    unsigned long vpt_end;
    unsigned long v_end;

    /* Machine address of next candidate page-table page. */
    unsigned long mpt_alloc;

    extern void translate_l2pgtable(
        struct domain *d, l1_pgentry_t *p2m, unsigned long l2mfn);

    /* Sanity! */
    BUG_ON(d->domain_id != 0);
    BUG_ON(d->vcpu[0] == NULL);
    BUG_ON(test_bit(_VCPUF_initialised, &v->vcpu_flags));

    memset(&dsi, 0, sizeof(struct domain_setup_info));
    dsi.image_addr = (unsigned long)image_start;
    dsi.image_len = image_len;

    printk("*** LOADING DOMAIN 0 ***\n");

    d->max_pages = ~0U;

    /*
     * If domain 0 allocation isn't specified, reserve 1/16th of available
     * memory for things like DMA buffers. This reservation is clamped to
     * a maximum of 128MB.
     */
    if ( dom0_nrpages == 0 )
    {
        dom0_nrpages = avail_domheap_pages() +
            ((initrd_len + PAGE_SIZE - 1) >> PAGE_SHIFT) +
            ((image_len + PAGE_SIZE - 1) >> PAGE_SHIFT);
        dom0_nrpages = min(dom0_nrpages / 16, 128L << (20 - PAGE_SHIFT));
        dom0_nrpages = -dom0_nrpages;
    }

    /* Negative memory specification means "all memory - specified amount". */
    if ( dom0_nrpages < 0 )
        nr_pages = avail_domheap_pages() +
            ((initrd_len + PAGE_SIZE - 1) >> PAGE_SHIFT) +
            ((image_len + PAGE_SIZE - 1) >> PAGE_SHIFT) +
            dom0_nrpages;
    else
        nr_pages = dom0_nrpages;

    if ( (rc = parseelfimage(&dsi)) != 0 )
        return rc;

    if ( dsi.xen_section_string == NULL )
    {
        printk("Not a Xen-ELF image: '__xen_guest' section not found.\n");
        return -EINVAL;
    }

    dom0_pae = !!strstr(dsi.xen_section_string, "PAE=yes");
    xen_pae = (CONFIG_PAGING_LEVELS == 3);
    if ( dom0_pae != xen_pae )
    {
        printk("PAE mode mismatch between Xen and DOM0 (xen=%s, dom0=%s)\n",
               xen_pae ? "yes" : "no", dom0_pae ? "yes" : "no");
        return -EINVAL;
    }
    if (strstr(dsi.xen_section_string, "SHADOW=translate"))
        opt_dom0_translate = 1;

    /* Align load address to 4MB boundary. */
    dsi.v_start &= ~((1UL<<22)-1);

    /*
     * Why do we need this? The number of page-table frames depends on the
     * size of the bootstrap address space. But the size of the address space
     * depends on the number of page-table frames (since each one is mapped
     * read-only). We have a pair of simultaneous equations in two unknowns,
     * which we solve by exhaustive search.
     */
    vinitrd_start = round_pgup(dsi.v_end);
    vinitrd_end = vinitrd_start + initrd_len;
    vphysmap_start = round_pgup(vinitrd_end);
    vphysmap_end = vphysmap_start + (nr_pages * sizeof(unsigned long));
    vstartinfo_start = round_pgup(vphysmap_end);
    vstartinfo_end = vstartinfo_start + PAGE_SIZE;
    vpt_start = vstartinfo_end;
    for ( nr_pt_pages = 2; ; nr_pt_pages++ )
    {
        vpt_end = vpt_start + (nr_pt_pages * PAGE_SIZE);
        vstack_start = vpt_end;
        vstack_end = vstack_start + PAGE_SIZE;
        v_end = (vstack_end + (1UL<<22)-1) & ~((1UL<<22)-1);
        if ( (v_end - vstack_end) < (512UL << 10) )
            v_end += 1UL << 22; /* Add extra 4MB to get >= 512kB padding. */
#if defined(__i386__) && !defined(CONFIG_X86_PAE)
        if ( (((v_end - dsi.v_start + ((1UL<<L2_PAGETABLE_SHIFT)-1)) >>
               L2_PAGETABLE_SHIFT) + 1) <= nr_pt_pages )
            break;
#elif defined(__i386__) && defined(CONFIG_X86_PAE)
        /* 5 pages: 1x 3rd + 4x 2nd level */
        if ( (((v_end - dsi.v_start + ((1UL<<L2_PAGETABLE_SHIFT)-1)) >>
               L2_PAGETABLE_SHIFT) + 5) <= nr_pt_pages )
            break;
#elif defined(__x86_64__)
#define NR(_l,_h,_s) \
    (((((_h) + ((1UL<<(_s))-1)) & ~((1UL<<(_s))-1)) - \
       ((_l) & ~((1UL<<(_s))-1))) >> (_s))
        if ( (1 + /* # L4 */
              NR(dsi.v_start, v_end, L4_PAGETABLE_SHIFT) + /* # L3 */
              NR(dsi.v_start, v_end, L3_PAGETABLE_SHIFT) + /* # L2 */
              NR(dsi.v_start, v_end, L2_PAGETABLE_SHIFT))  /* # L1 */
             <= nr_pt_pages )
            break;
#endif
    }

    order = get_order_from_bytes(v_end - dsi.v_start);
    if ( (1UL << order) > nr_pages )
        panic("Domain 0 allocation is too small for kernel image.\n");

    /* Allocate from DMA pool: PAE L3 table must be below 4GB boundary. */
    if ( (page = alloc_domheap_pages(d, order, ALLOC_DOM_DMA)) == NULL )
        panic("Not enough RAM for domain 0 allocation.\n");
    alloc_spfn = page_to_pfn(page);
    alloc_epfn = alloc_spfn + d->tot_pages;

    printk("PHYSICAL MEMORY ARRANGEMENT:\n"
           " Dom0 alloc.: %"PRIphysaddr"->%"PRIphysaddr,
           pfn_to_phys(alloc_spfn), pfn_to_phys(alloc_epfn));
    if ( d->tot_pages < nr_pages )
        printk(" (%lu pages to be allocated)",
               nr_pages - d->tot_pages);
    printk("\nVIRTUAL MEMORY ARRANGEMENT:\n"
           " Loaded kernel: %p->%p\n"
           " Init. ramdisk: %p->%p\n"
           " Phys-Mach map: %p->%p\n"
           " Start info: %p->%p\n"
           " Page tables: %p->%p\n"
           " Boot stack: %p->%p\n"
           " TOTAL: %p->%p\n",
           _p(dsi.v_kernstart), _p(dsi.v_kernend),
           _p(vinitrd_start), _p(vinitrd_end),
           _p(vphysmap_start), _p(vphysmap_end),
           _p(vstartinfo_start), _p(vstartinfo_end),
           _p(vpt_start), _p(vpt_end),
           _p(vstack_start), _p(vstack_end),
           _p(dsi.v_start), _p(v_end));
    printk(" ENTRY ADDRESS: %p\n", _p(dsi.v_kernentry));

    if ( (v_end - dsi.v_start) > (nr_pages * PAGE_SIZE) )
    {
        printk("Initial guest OS requires too much space\n"
               "(%luMB is greater than %luMB limit)\n",
               (v_end-dsi.v_start)>>20, (nr_pages<<PAGE_SHIFT)>>20);
        return -ENOMEM;
    }

    mpt_alloc = (vpt_start - dsi.v_start) +
        (unsigned long)pfn_to_phys(alloc_spfn);

    /*
     * We're basically forcing default RPLs to 1, so that our "what privilege
     * level are we returning to?" logic works.
     */
    v->arch.guest_context.kernel_ss = FLAT_KERNEL_SS;
    for ( i = 0; i < 256; i++ )
        v->arch.guest_context.trap_ctxt[i].cs = FLAT_KERNEL_CS;

#if defined(__i386__)

    v->arch.guest_context.failsafe_callback_cs = FLAT_KERNEL_CS;
    v->arch.guest_context.event_callback_cs = FLAT_KERNEL_CS;

    /*
     * Protect the lowest 1GB of memory. We use a temporary mapping there
     * from which we copy the kernel and ramdisk images.
     */
    if ( dsi.v_start < (1UL<<30) )
    {
        printk("Initial loading isn't allowed in the lowest 1GB of memory.\n");
        return -EINVAL;
    }

    /* WARNING: The new domain must have its 'processor' field filled in! */
#if CONFIG_PAGING_LEVELS == 3
    l3start = l3tab = (l3_pgentry_t *)mpt_alloc; mpt_alloc += PAGE_SIZE;
    l2start = l2tab = (l2_pgentry_t *)mpt_alloc; mpt_alloc += 4*PAGE_SIZE;
    memcpy(l2tab, idle_pg_table_l2, 4*PAGE_SIZE);
    for (i = 0; i < 4; i++) {
        l3tab[i] = l3e_from_paddr((u32)l2tab + i*PAGE_SIZE, L3_PROT);
        l2tab[(LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT)+i] =
            l2e_from_paddr((u32)l2tab + i*PAGE_SIZE, __PAGE_HYPERVISOR);
    }
    {
        unsigned long va;
        for (va = PERDOMAIN_VIRT_START; va < PERDOMAIN_VIRT_END;
             va += (1 << L2_PAGETABLE_SHIFT)) {
            l2tab[va >> L2_PAGETABLE_SHIFT] =
                l2e_from_paddr(__pa(d->arch.mm_perdomain_pt) +
                               (va-PERDOMAIN_VIRT_START),
                               __PAGE_HYPERVISOR);
        }
    }
    v->arch.guest_table = mk_pagetable((unsigned long)l3start);
#else
    l2start = l2tab = (l2_pgentry_t *)mpt_alloc; mpt_alloc += PAGE_SIZE;
    memcpy(l2tab, &idle_pg_table[0], PAGE_SIZE);
    l2tab[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] =
        l2e_from_paddr((unsigned long)l2start, __PAGE_HYPERVISOR);
    l2tab[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] =
        l2e_from_paddr(__pa(d->arch.mm_perdomain_pt), __PAGE_HYPERVISOR);
    v->arch.guest_table = mk_pagetable((unsigned long)l2start);
#endif

    l2tab += l2_linear_offset(dsi.v_start);
    mfn = alloc_spfn;
    for ( count = 0; count < ((v_end-dsi.v_start)>>PAGE_SHIFT); count++ )
    {
        if ( !((unsigned long)l1tab & (PAGE_SIZE-1)) )
        {
            l1start = l1tab = (l1_pgentry_t *)mpt_alloc;
            mpt_alloc += PAGE_SIZE;
            *l2tab = l2e_from_paddr((unsigned long)l1start, L2_PROT);
            l2tab++;
            clear_page(l1tab);
            if ( count == 0 )
                l1tab += l1_table_offset(dsi.v_start);
        }
        *l1tab = l1e_from_pfn(mfn, L1_PROT);
        l1tab++;

        page = pfn_to_page(mfn);
        if ( !get_page_and_type(page, d, PGT_writable_page) )
            BUG();

        mfn++;
    }

    /* Pages that are part of page tables must be read only. */
    l2tab = l2start + l2_linear_offset(vpt_start);
    l1start = l1tab = (l1_pgentry_t *)(u32)l2e_get_paddr(*l2tab);
    l1tab += l1_table_offset(vpt_start);
    for ( count = 0; count < nr_pt_pages; count++ )
    {
        page = pfn_to_page(l1e_get_pfn(*l1tab));
        if ( !opt_dom0_shadow )
            l1e_remove_flags(*l1tab, _PAGE_RW);
        else
            if ( !get_page_type(page, PGT_writable_page) )
                BUG();

#if CONFIG_PAGING_LEVELS == 3
        switch (count) {
        case 0:
            page->u.inuse.type_info &= ~PGT_type_mask;
            page->u.inuse.type_info |= PGT_l3_page_table;
            get_page(page, d); /* an extra ref because of readable mapping */

            /* Get another ref to L3 page so that it can be pinned. */
            if ( !get_page_and_type(page, d, PGT_l3_page_table) )
                BUG();
            set_bit(_PGT_pinned, &page->u.inuse.type_info);
            break;
        case 1 ... 4:
            page->u.inuse.type_info &= ~PGT_type_mask;
            page->u.inuse.type_info |= PGT_l2_page_table;
            page->u.inuse.type_info |=
                (count-1) << PGT_va_shift;
            get_page(page, d); /* an extra ref because of readable mapping */
            break;
        default:
            page->u.inuse.type_info &= ~PGT_type_mask;
            page->u.inuse.type_info |= PGT_l1_page_table;
            page->u.inuse.type_info |=
                ((dsi.v_start>>L2_PAGETABLE_SHIFT)+(count-5))<<PGT_va_shift;
            get_page(page, d); /* an extra ref because of readable mapping */
            break;
        }
#else
        if ( count == 0 )
        {
            page->u.inuse.type_info &= ~PGT_type_mask;
            page->u.inuse.type_info |= PGT_l2_page_table;

            /*
             * No longer writable: decrement the type_count.
             * Installed as CR3: increment both the ref_count and type_count.
             * Net: just increment the ref_count.
             */
            get_page(page, d); /* an extra ref because of readable mapping */

            /* Get another ref to L2 page so that it can be pinned. */
            if ( !get_page_and_type(page, d, PGT_l2_page_table) )
                BUG();
            set_bit(_PGT_pinned, &page->u.inuse.type_info);
        }
        else
        {
            page->u.inuse.type_info &= ~PGT_type_mask;
            page->u.inuse.type_info |= PGT_l1_page_table;
            page->u.inuse.type_info |=
                ((dsi.v_start>>L2_PAGETABLE_SHIFT)+(count-1))<<PGT_va_shift;

            /*
             * No longer writable: decrement the type_count.
             * This is an L1 page, installed in a validated L2 page:
             * increment both the ref_count and type_count.
             * Net: just increment the ref_count.
             */
            get_page(page, d); /* an extra ref because of readable mapping */
        }
#endif
        if ( !((unsigned long)++l1tab & (PAGE_SIZE - 1)) )
            l1start = l1tab = (l1_pgentry_t *)(u32)l2e_get_paddr(*++l2tab);
    }

#elif defined(__x86_64__)

    /* Overlap with Xen protected area? */
    if ( (dsi.v_start < HYPERVISOR_VIRT_END) &&
         (v_end > HYPERVISOR_VIRT_START) )
    {
        printk("DOM0 image overlaps with Xen private area.\n");
        return -EINVAL;
    }

    /* WARNING: The new domain must have its 'processor' field filled in! */
    phys_to_page(mpt_alloc)->u.inuse.type_info = PGT_l4_page_table;
    l4start = l4tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
    memcpy(l4tab, &idle_pg_table[0], PAGE_SIZE);
    l4tab[l4_table_offset(LINEAR_PT_VIRT_START)] =
        l4e_from_paddr(__pa(l4start), __PAGE_HYPERVISOR);
    l4tab[l4_table_offset(PERDOMAIN_VIRT_START)] =
        l4e_from_paddr(__pa(d->arch.mm_perdomain_l3), __PAGE_HYPERVISOR);
    v->arch.guest_table = mk_pagetable(__pa(l4start));

    l4tab += l4_table_offset(dsi.v_start);
    mfn = alloc_spfn;
    for ( count = 0; count < ((v_end-dsi.v_start)>>PAGE_SHIFT); count++ )
    {
        if ( !((unsigned long)l1tab & (PAGE_SIZE-1)) )
        {
            phys_to_page(mpt_alloc)->u.inuse.type_info = PGT_l1_page_table;
            l1start = l1tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
            clear_page(l1tab);
            if ( count == 0 )
                l1tab += l1_table_offset(dsi.v_start);
            if ( !((unsigned long)l2tab & (PAGE_SIZE-1)) )
            {
                phys_to_page(mpt_alloc)->u.inuse.type_info = PGT_l2_page_table;
                l2start = l2tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
                clear_page(l2tab);
                if ( count == 0 )
                    l2tab += l2_table_offset(dsi.v_start);
                if ( !((unsigned long)l3tab & (PAGE_SIZE-1)) )
                {
                    phys_to_page(mpt_alloc)->u.inuse.type_info =
                        PGT_l3_page_table;
                    l3start = l3tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
                    clear_page(l3tab);
                    if ( count == 0 )
                        l3tab += l3_table_offset(dsi.v_start);
                    *l4tab = l4e_from_paddr(__pa(l3start), L4_PROT);
                    l4tab++;
                }
                *l3tab = l3e_from_paddr(__pa(l2start), L3_PROT);
                l3tab++;
            }
            *l2tab = l2e_from_paddr(__pa(l1start), L2_PROT);
            l2tab++;
        }
        *l1tab = l1e_from_pfn(mfn, L1_PROT);
        l1tab++;

        page = pfn_to_page(mfn);
        if ( (page->u.inuse.type_info == 0) &&
             !get_page_and_type(page, d, PGT_writable_page) )
            BUG();

        mfn++;
    }

    /* Pages that are part of page tables must be read only. */
    l4tab = l4start + l4_table_offset(vpt_start);
    l3start = l3tab = l4e_to_l3e(*l4tab);
    l3tab += l3_table_offset(vpt_start);
    l2start = l2tab = l3e_to_l2e(*l3tab);
    l2tab += l2_table_offset(vpt_start);
    l1start = l1tab = l2e_to_l1e(*l2tab);
    l1tab += l1_table_offset(vpt_start);
    for ( count = 0; count < nr_pt_pages; count++ )
    {
        l1e_remove_flags(*l1tab, _PAGE_RW);
        page = pfn_to_page(l1e_get_pfn(*l1tab));

        /* Read-only mapping + PGC_allocated + page-table page. */
        page->count_info = PGC_allocated | 3;
        page->u.inuse.type_info |= PGT_validated | 1;

        /* Top-level p.t. is pinned. */
        if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_l4_page_table )
        {
            page->count_info += 1;
            page->u.inuse.type_info += 1 | PGT_pinned;
        }

        /* Iterate. */
        if ( !((unsigned long)++l1tab & (PAGE_SIZE - 1)) )
        {
            if ( !((unsigned long)++l2tab & (PAGE_SIZE - 1)) )
            {
                if ( !((unsigned long)++l3tab & (PAGE_SIZE - 1)) )
                    l3start = l3tab = l4e_to_l3e(*++l4tab);
                l2start = l2tab = l3e_to_l2e(*l3tab);
            }
            l1start = l1tab = l2e_to_l1e(*l2tab);
        }
    }

#endif /* __x86_64__ */

    /* Mask all upcalls... */
    for ( i = 0; i < MAX_VIRT_CPUS; i++ )
        d->shared_info->vcpu_info[i].evtchn_upcall_mask = 1;

    for ( i = 1; i < num_online_cpus(); i++ )
        (void)alloc_vcpu(d, i, i);

    /* Set up monitor table */
    update_pagetables(v);

    /* Install the new page tables. */
    local_irq_disable();
    write_ptbase(v);

    /* Copy the OS image and free temporary buffer. */
    (void)loadelfimage(&dsi);

    init_domheap_pages(
        _image_start, (_image_start+image_len+PAGE_SIZE-1) & PAGE_MASK);

    /* Copy the initial ramdisk and free temporary buffer. */
    if ( initrd_len != 0 )
    {
        memcpy((void *)vinitrd_start, initrd_start, initrd_len);
        init_domheap_pages(
            _initrd_start, (_initrd_start+initrd_len+PAGE_SIZE-1) & PAGE_MASK);
    }

    /* Set up start info area. */
    si = (start_info_t *)vstartinfo_start;
    memset(si, 0, PAGE_SIZE);
    si->nr_pages = nr_pages;

    si->shared_info = virt_to_phys(d->shared_info);
    if ( opt_dom0_translate )
    {
        si->shared_info = max_page << PAGE_SHIFT;
        set_pfn_from_mfn(virt_to_phys(d->shared_info) >> PAGE_SHIFT, max_page);
    }

    si->flags = SIF_PRIVILEGED | SIF_INITDOMAIN;
    si->pt_base = vpt_start;
    si->nr_pt_frames = nr_pt_pages;
    si->mfn_list = vphysmap_start;
    sprintf(si->magic, "xen-%i.%i-x86_%d%s",
            XEN_VERSION, XEN_SUBVERSION, BITS_PER_LONG, xen_pae ? "p" : "");

    /* Write the phys->machine and machine->phys table entries. */
    for ( pfn = 0; pfn < d->tot_pages; pfn++ )
    {
        mfn = pfn + alloc_spfn;
#ifndef NDEBUG
#define REVERSE_START ((v_end - dsi.v_start) >> PAGE_SHIFT)
        if ( !opt_dom0_translate && (pfn > REVERSE_START) )
            mfn = alloc_epfn - (pfn - REVERSE_START);
#endif
        ((unsigned long *)vphysmap_start)[pfn] = mfn;
        set_pfn_from_mfn(mfn, pfn);
    }
    while ( pfn < nr_pages )
    {
        if ( (page = alloc_chunk(d, nr_pages - d->tot_pages)) == NULL )
            panic("Not enough RAM for DOM0 reservation.\n");
        while ( pfn < d->tot_pages )
        {
            mfn = page_to_pfn(page);
#ifndef NDEBUG
#define pfn (nr_pages - 1 - (pfn - (alloc_epfn - alloc_spfn)))
#endif
            ((unsigned long *)vphysmap_start)[pfn] = mfn;
            set_pfn_from_mfn(mfn, pfn);
#undef pfn
            page++; pfn++;
        }
    }

    if ( initrd_len != 0 )
    {
        si->mod_start = vinitrd_start;
        si->mod_len = initrd_len;
        printk("Initrd len 0x%lx, start at 0x%lx\n",
               si->mod_len, si->mod_start);
    }

    memset(si->cmd_line, 0, sizeof(si->cmd_line));
    if ( cmdline != NULL )
        strncpy((char *)si->cmd_line, cmdline, sizeof(si->cmd_line)-1);

    /* Reinstate the caller's page tables. */
    write_ptbase(current);
    local_irq_enable();

#if defined(__i386__)
    /* Destroy low mappings - they were only for our convenience. */
    zap_low_mappings(l2start);
    zap_low_mappings(idle_pg_table_l2);
#endif

    init_domain_time(d);

    set_bit(_VCPUF_initialised, &v->vcpu_flags);

    new_thread(v, dsi.v_kernentry, vstack_end, vstartinfo_start);

    if ( opt_dom0_shadow || opt_dom0_translate )
    {
        printk("dom0: shadow enable\n");
        shadow_mode_enable(d, (opt_dom0_translate
                               ? SHM_enable | SHM_refcounts | SHM_translate
                               : SHM_enable));
        if ( opt_dom0_translate )
        {
            printk("dom0: shadow translate\n");
#if defined(__i386__) && defined(CONFIG_X86_PAE)
            printk("FIXME: PAE code needed here: %s:%d (%s)\n",
                   __FILE__, __LINE__, __FUNCTION__);
            for ( ; ; )
                __asm__ __volatile__ ( "hlt" );
#else
            /* Hmm, what does this do?
               Looks like it isn't portable across 32/64-bit and PAE/non-PAE ...
               -- kraxel */

            /* mafetter: This code is mostly a hack in order to be able to
             * test with dom0's which are running with shadow translate.
             * I expect we'll rip this out once we have a stable set of
             * domU clients which use the various shadow modes, but it's
             * useful to leave this here for now...
             */

            // map this domain's p2m table into current page table,
            // so that we can easily access it.
            //
            ASSERT( root_get_intpte(idle_pg_table[1]) == 0 );
            ASSERT( pagetable_get_paddr(d->arch.phys_table) );
            idle_pg_table[1] = root_from_paddr(
                pagetable_get_paddr(d->arch.phys_table), __PAGE_HYPERVISOR);
            translate_l2pgtable(d, (l1_pgentry_t *)(1u << L2_PAGETABLE_SHIFT),
                                pagetable_get_pfn(v->arch.guest_table));
            idle_pg_table[1] = root_empty();
            local_flush_tlb();
#endif
        }

        update_pagetables(v); /* XXX SMP */
        printk("dom0: shadow setup done\n");
    }

    i = 0;

    /* DOM0 is permitted full I/O capabilities. */
    i |= ioports_permit_access(dom0, 0, 0xFFFF);
    i |= iomem_permit_access(dom0, 0UL, ~0UL);
    i |= irqs_permit_access(dom0, 0, NR_PIRQS-1);

    /*
     * Modify I/O port access permissions.
     */
    /* Master Interrupt Controller (PIC). */
    i |= ioports_deny_access(dom0, 0x20, 0x21);
    /* Slave Interrupt Controller (PIC). */
    i |= ioports_deny_access(dom0, 0xA0, 0xA1);
    /* Interval Timer (PIT). */
    i |= ioports_deny_access(dom0, 0x40, 0x43);
    /* PIT Channel 2 / PC Speaker Control. */
    i |= ioports_deny_access(dom0, 0x61, 0x61);
    /* Command-line I/O ranges. */
    process_dom0_ioports_disable();

    BUG_ON(i != 0);

    return 0;
}

int elf_sanity_check(Elf_Ehdr *ehdr)
{
    if ( !IS_ELF(*ehdr) ||
#if defined(__i386__)
         (ehdr->e_ident[EI_CLASS] != ELFCLASS32) ||
         (ehdr->e_machine != EM_386) ||
#elif defined(__x86_64__)
         (ehdr->e_ident[EI_CLASS] != ELFCLASS64) ||
         (ehdr->e_machine != EM_X86_64) ||
#endif
         (ehdr->e_ident[EI_DATA] != ELFDATA2LSB) ||
         (ehdr->e_type != ET_EXEC) )
    {
        printk("DOM0 image is not a Xen-compatible Elf image.\n");
        return 0;
    }

    return 1;
}

/*
 * Local variables:
 * mode: C
 * c-set-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */