ia64/xen-unstable

xen/arch/x86/domain_build.c @ 8798:6ba371536f5c

Add 'dom0_max_vcpus' Xen boot parameter.

Signed-off-by: Keir Fraser <keir@xensource.com>

author:   kaf24@firebug.cl.cam.ac.uk
date:     Wed Feb 08 17:25:14 2006 +0100
parents:  5d9f4e6c9519
children: 13e4df60caf1
/******************************************************************************
 * domain_build.c
 *
 * Copyright (c) 2002-2005, K A Fraser
 */

#include <xen/config.h>
#include <xen/init.h>
#include <xen/lib.h>
#include <xen/ctype.h>
#include <xen/sched.h>
#include <xen/smp.h>
#include <xen/delay.h>
#include <xen/event.h>
#include <xen/elf.h>
#include <xen/kernel.h>
#include <xen/domain.h>
#include <xen/compile.h>
#include <xen/iocap.h>
#include <asm/regs.h>
#include <asm/system.h>
#include <asm/io.h>
#include <asm/processor.h>
#include <asm/desc.h>
#include <asm/i387.h>
#include <asm/shadow.h>

static long dom0_nrpages;

/*
 * dom0_mem:
 *  If +ve:
 *   * The specified amount of memory is allocated to domain 0.
 *  If -ve:
 *   * All of memory is allocated to domain 0, minus the specified amount.
 *  If not specified:
 *   * All of memory is allocated to domain 0, minus 1/16th which is reserved
 *     for uses such as DMA buffers (the reservation is clamped to 128MB).
 */
static void parse_dom0_mem(char *s)
{
    unsigned long long bytes;
    char *t = s;
    if ( *s == '-' )
        t++;
    bytes = parse_size_and_unit(t);
    dom0_nrpages = bytes >> PAGE_SHIFT;
    if ( *s == '-' )
        dom0_nrpages = -dom0_nrpages;
}
custom_param("dom0_mem", parse_dom0_mem);
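
/*
 * For example, "dom0_mem=512M" gives domain 0 exactly 512MB, while
 * "dom0_mem=-512M" gives domain 0 all memory except 512MB
 * (parse_size_and_unit() handles the size suffix).
 */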

static unsigned int opt_dom0_max_vcpus;
integer_param("dom0_max_vcpus", opt_dom0_max_vcpus);
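
/*
 * E.g. booting with "dom0_max_vcpus=2" caps domain 0 at two VCPUs. When
 * left at 0 (the default), construct_dom0() below gives domain 0 one VCPU
 * per online physical CPU, clamped to MAX_VIRT_CPUS.
 */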

static unsigned int opt_dom0_shadow;
boolean_param("dom0_shadow", opt_dom0_shadow);

static unsigned int opt_dom0_translate;
boolean_param("dom0_translate", opt_dom0_translate);

static char opt_dom0_ioports_disable[200] = "";
string_param("dom0_ioports_disable", opt_dom0_ioports_disable);
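
/*
 * E.g. "dom0_ioports_disable=02f8-02ff,03f8-03ff" hides both legacy serial
 * ports from domain 0. Ranges are hexadecimal and comma-separated, and a
 * single port may be given without the "-<end>" part (see
 * process_dom0_ioports_disable() below).
 */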

#if defined(__i386__)
/* No ring-3 access in initial leaf page tables. */
#define L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED)
#define L2_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
#define L3_PROT (_PAGE_PRESENT)
#elif defined(__x86_64__)
/* Allow ring-3 access in long mode as guest cannot use ring 1. */
#define L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_USER)
#define L2_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
#define L3_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
#define L4_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
#endif

#define round_pgup(_p)    (((_p)+(PAGE_SIZE-1))&PAGE_MASK)
#define round_pgdown(_p)  ((_p)&PAGE_MASK)
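/* E.g. with 4kB pages, round_pgup(0x1001) == 0x2000 and
 * round_pgdown(0x1fff) == 0x1000. */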

static struct page_info *alloc_chunk(struct domain *d, unsigned long max_pages)
{
    struct page_info *page;
    unsigned int order;
    /*
     * Allocate up to 2MB at a time: It prevents allocating very large chunks
     * from DMA pools before the >4GB pool is fully depleted.
     */
    if ( max_pages > (2UL << (20 - PAGE_SHIFT)) )
        max_pages = 2UL << (20 - PAGE_SHIFT);
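    /*
     * get_order_from_pages() rounds up to the next power of two, so for a
     * non-power-of-two request drop back one order to stay within max_pages;
     * if the allocation still fails, keep halving the order until a chunk
     * (possibly a single page) can be allocated.
     */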
    order = get_order_from_pages(max_pages);
    if ( (max_pages & (max_pages-1)) != 0 )
        order--;
    while ( (page = alloc_domheap_pages(d, order, 0)) == NULL )
        if ( order-- == 0 )
            break;
    return page;
}

static void process_dom0_ioports_disable(void)
{
    unsigned long io_from, io_to;
    char *t, *u, *s = opt_dom0_ioports_disable;

    if ( *s == '\0' )
        return;

    while ( (t = strsep(&s, ",")) != NULL )
    {
        io_from = simple_strtoul(t, &u, 16);
        if ( u == t )
        {
        parse_error:
            printk("Invalid ioport range <%s> "
                   "in dom0_ioports_disable, skipping\n", t);
            continue;
        }

        if ( *u == '\0' )
            io_to = io_from;
        else if ( *u == '-' )
            io_to = simple_strtoul(u + 1, &u, 16);
        else
            goto parse_error;

        if ( (*u != '\0') || (io_to < io_from) || (io_to >= 65536) )
            goto parse_error;

        printk("Disabling dom0 access to ioport range %04lx-%04lx\n",
               io_from, io_to);

        if ( ioports_deny_access(dom0, io_from, io_to) != 0 )
            BUG();
    }
}

int construct_dom0(struct domain *d,
                   unsigned long _image_start, unsigned long image_len,
                   unsigned long _initrd_start, unsigned long initrd_len,
                   char *cmdline)
{
    int i, rc, dom0_pae, xen_pae, order;
    unsigned long pfn, mfn;
    unsigned long nr_pages;
    unsigned long nr_pt_pages;
    unsigned long alloc_spfn;
    unsigned long alloc_epfn;
    unsigned long count;
    struct page_info *page = NULL;
    start_info_t *si;
    struct vcpu *v = d->vcpu[0];
    char *p;
    unsigned long hypercall_page;
#if defined(__i386__)
    char *image_start  = (char *)_image_start;  /* use lowmem mappings */
    char *initrd_start = (char *)_initrd_start; /* use lowmem mappings */
#elif defined(__x86_64__)
    char *image_start  = __va(_image_start);
    char *initrd_start = __va(_initrd_start);
#endif
#if CONFIG_PAGING_LEVELS >= 4
    l4_pgentry_t *l4tab = NULL, *l4start = NULL;
#endif
#if CONFIG_PAGING_LEVELS >= 3
    l3_pgentry_t *l3tab = NULL, *l3start = NULL;
#endif
    l2_pgentry_t *l2tab = NULL, *l2start = NULL;
    l1_pgentry_t *l1tab = NULL, *l1start = NULL;

    /*
     * This fully describes the memory layout of the initial domain. All
     * *_start addresses are page-aligned, except v_start (and v_end) which
     * are superpage-aligned.
     */
    struct domain_setup_info dsi;
    unsigned long vinitrd_start;
    unsigned long vinitrd_end;
    unsigned long vphysmap_start;
    unsigned long vphysmap_end;
    unsigned long vstartinfo_start;
    unsigned long vstartinfo_end;
    unsigned long vstack_start;
    unsigned long vstack_end;
    unsigned long vpt_start;
    unsigned long vpt_end;
    unsigned long v_end;

    /* Machine address of next candidate page-table page. */
    unsigned long mpt_alloc;

    extern void translate_l2pgtable(
        struct domain *d, l1_pgentry_t *p2m, unsigned long l2mfn);

    /* Sanity! */
    BUG_ON(d->domain_id != 0);
    BUG_ON(d->vcpu[0] == NULL);
    BUG_ON(test_bit(_VCPUF_initialised, &v->vcpu_flags));

    memset(&dsi, 0, sizeof(struct domain_setup_info));
    dsi.image_addr = (unsigned long)image_start;
    dsi.image_len  = image_len;

    printk("*** LOADING DOMAIN 0 ***\n");

    d->max_pages = ~0U;

    /*
     * If domain 0 allocation isn't specified, reserve 1/16th of available
     * memory for things like DMA buffers. This reservation is clamped to
     * a maximum of 128MB.
     */
    if ( dom0_nrpages == 0 )
    {
        dom0_nrpages = avail_domheap_pages() +
            ((initrd_len + PAGE_SIZE - 1) >> PAGE_SHIFT) +
            ((image_len  + PAGE_SIZE - 1) >> PAGE_SHIFT);
        dom0_nrpages = min(dom0_nrpages / 16, 128L << (20 - PAGE_SHIFT));
        dom0_nrpages = -dom0_nrpages;
    }

    /* Negative memory specification means "all memory - specified amount". */
    if ( dom0_nrpages < 0 )
        nr_pages = avail_domheap_pages() +
            ((initrd_len + PAGE_SIZE - 1) >> PAGE_SHIFT) +
            ((image_len  + PAGE_SIZE - 1) >> PAGE_SHIFT) +
            dom0_nrpages;
    else
        nr_pages = dom0_nrpages;

    if ( (rc = parseelfimage(&dsi)) != 0 )
        return rc;

    if ( dsi.xen_section_string == NULL )
    {
        printk("Not a Xen-ELF image: '__xen_guest' section not found.\n");
        return -EINVAL;
    }

    dom0_pae = !!strstr(dsi.xen_section_string, "PAE=yes");
    xen_pae  = (CONFIG_PAGING_LEVELS == 3);
    if ( dom0_pae != xen_pae )
    {
        printk("PAE mode mismatch between Xen and DOM0 (xen=%s, dom0=%s)\n",
               xen_pae ? "yes" : "no", dom0_pae ? "yes" : "no");
        return -EINVAL;
    }

    if ( strstr(dsi.xen_section_string, "SHADOW=translate") )
        opt_dom0_translate = 1;

    /* Align load address to 4MB boundary. */
    dsi.v_start &= ~((1UL<<22)-1);

    /*
     * Why do we need this? The number of page-table frames depends on the
     * size of the bootstrap address space. But the size of the address space
     * depends on the number of page-table frames (since each one is mapped
     * read-only). We have a pair of simultaneous equations in two unknowns,
     * which we solve by exhaustive search.
     */
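    /*
     * For example, on non-PAE x86-32 each L1 frame maps 4MB, so the loop
     * below needs ceil(bootstrap_size / 4MB) L1 frames plus one L2 frame;
     * PAE and x86-64 additionally count the intermediate levels.
     */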
    vinitrd_start    = round_pgup(dsi.v_end);
    vinitrd_end      = vinitrd_start + initrd_len;
    vphysmap_start   = round_pgup(vinitrd_end);
    vphysmap_end     = vphysmap_start + (nr_pages * sizeof(unsigned long));
    vstartinfo_start = round_pgup(vphysmap_end);
    vstartinfo_end   = vstartinfo_start + PAGE_SIZE;
    vpt_start        = vstartinfo_end;
    for ( nr_pt_pages = 2; ; nr_pt_pages++ )
    {
        vpt_end          = vpt_start + (nr_pt_pages * PAGE_SIZE);
        vstack_start     = vpt_end;
        vstack_end       = vstack_start + PAGE_SIZE;
        v_end            = (vstack_end + (1UL<<22)-1) & ~((1UL<<22)-1);
        if ( (v_end - vstack_end) < (512UL << 10) )
            v_end += 1UL << 22; /* Add extra 4MB to get >= 512kB padding. */
#if defined(__i386__) && !defined(CONFIG_X86_PAE)
        if ( (((v_end - dsi.v_start + ((1UL<<L2_PAGETABLE_SHIFT)-1)) >>
               L2_PAGETABLE_SHIFT) + 1) <= nr_pt_pages )
            break;
#elif defined(__i386__) && defined(CONFIG_X86_PAE)
        /* 5 pages: 1x 3rd + 4x 2nd level */
        if ( (((v_end - dsi.v_start + ((1UL<<L2_PAGETABLE_SHIFT)-1)) >>
               L2_PAGETABLE_SHIFT) + 5) <= nr_pt_pages )
            break;
#elif defined(__x86_64__)
#define NR(_l,_h,_s) \
    (((((_h) + ((1UL<<(_s))-1)) & ~((1UL<<(_s))-1)) - \
       ((_l) & ~((1UL<<(_s))-1))) >> (_s))
        if ( (1 + /* # L4 */
              NR(dsi.v_start, v_end, L4_PAGETABLE_SHIFT) + /* # L3 */
              NR(dsi.v_start, v_end, L3_PAGETABLE_SHIFT) + /* # L2 */
              NR(dsi.v_start, v_end, L2_PAGETABLE_SHIFT))  /* # L1 */
             <= nr_pt_pages )
            break;
#endif
    }

    order = get_order_from_bytes(v_end - dsi.v_start);
    if ( (1UL << order) > nr_pages )
        panic("Domain 0 allocation is too small for kernel image.\n");

    /* Allocate from DMA pool: PAE L3 table must be below 4GB boundary. */
    if ( (page = alloc_domheap_pages(d, order, ALLOC_DOM_DMA)) == NULL )
        panic("Not enough RAM for domain 0 allocation.\n");
    alloc_spfn = page_to_mfn(page);
    alloc_epfn = alloc_spfn + d->tot_pages;

    printk("PHYSICAL MEMORY ARRANGEMENT:\n"
           " Dom0 alloc.:   %"PRIpaddr"->%"PRIpaddr,
           pfn_to_paddr(alloc_spfn), pfn_to_paddr(alloc_epfn));
    if ( d->tot_pages < nr_pages )
        printk(" (%lu pages to be allocated)",
               nr_pages - d->tot_pages);
    printk("\nVIRTUAL MEMORY ARRANGEMENT:\n"
           " Loaded kernel: %p->%p\n"
           " Init. ramdisk: %p->%p\n"
           " Phys-Mach map: %p->%p\n"
           " Start info:    %p->%p\n"
           " Page tables:   %p->%p\n"
           " Boot stack:    %p->%p\n"
           " TOTAL:         %p->%p\n",
           _p(dsi.v_kernstart), _p(dsi.v_kernend),
           _p(vinitrd_start), _p(vinitrd_end),
           _p(vphysmap_start), _p(vphysmap_end),
           _p(vstartinfo_start), _p(vstartinfo_end),
           _p(vpt_start), _p(vpt_end),
           _p(vstack_start), _p(vstack_end),
           _p(dsi.v_start), _p(v_end));
    printk(" ENTRY ADDRESS: %p\n", _p(dsi.v_kernentry));

    if ( (v_end - dsi.v_start) > (nr_pages * PAGE_SIZE) )
    {
        printk("Initial guest OS requires too much space\n"
               "(%luMB is greater than %luMB limit)\n",
               (v_end-dsi.v_start)>>20, (nr_pages<<PAGE_SHIFT)>>20);
        return -ENOMEM;
    }

    mpt_alloc = (vpt_start - dsi.v_start) +
        (unsigned long)pfn_to_paddr(alloc_spfn);

    /*
     * We're basically forcing default RPLs to 1, so that our "what privilege
     * level are we returning to?" logic works.
     */
    v->arch.guest_context.kernel_ss = FLAT_KERNEL_SS;
    for ( i = 0; i < 256; i++ )
        v->arch.guest_context.trap_ctxt[i].cs = FLAT_KERNEL_CS;

#if defined(__i386__)

    v->arch.guest_context.failsafe_callback_cs = FLAT_KERNEL_CS;
    v->arch.guest_context.event_callback_cs    = FLAT_KERNEL_CS;

    /*
     * Protect the lowest 1GB of memory. We use a temporary mapping there
     * from which we copy the kernel and ramdisk images.
     */
    if ( dsi.v_start < (1UL<<30) )
    {
        printk("Initial loading isn't allowed in the lowest 1GB of memory.\n");
        return -EINVAL;
    }

    /* WARNING: The new domain must have its 'processor' field filled in! */
#if CONFIG_PAGING_LEVELS == 3
    l3start = l3tab = (l3_pgentry_t *)mpt_alloc; mpt_alloc += PAGE_SIZE;
    l2start = l2tab = (l2_pgentry_t *)mpt_alloc; mpt_alloc += 4*PAGE_SIZE;
    memcpy(l2tab, idle_pg_table_l2, 4*PAGE_SIZE);
    for (i = 0; i < 4; i++) {
        l3tab[i] = l3e_from_paddr((u32)l2tab + i*PAGE_SIZE, L3_PROT);
        l2tab[(LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT)+i] =
            l2e_from_paddr((u32)l2tab + i*PAGE_SIZE, __PAGE_HYPERVISOR);
    }
    v->arch.guest_table = mk_pagetable((unsigned long)l3start);
#else
    l2start = l2tab = (l2_pgentry_t *)mpt_alloc; mpt_alloc += PAGE_SIZE;
    memcpy(l2tab, &idle_pg_table[0], PAGE_SIZE);
    l2tab[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] =
        l2e_from_paddr((unsigned long)l2start, __PAGE_HYPERVISOR);
    v->arch.guest_table = mk_pagetable((unsigned long)l2start);
#endif

    for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
        l2tab[l2_linear_offset(PERDOMAIN_VIRT_START) + i] =
            l2e_from_page(virt_to_page(d->arch.mm_perdomain_pt) + i,
                          __PAGE_HYPERVISOR);

    l2tab += l2_linear_offset(dsi.v_start);
    mfn = alloc_spfn;
    for ( count = 0; count < ((v_end-dsi.v_start)>>PAGE_SHIFT); count++ )
    {
        if ( !((unsigned long)l1tab & (PAGE_SIZE-1)) )
        {
            l1start = l1tab = (l1_pgentry_t *)mpt_alloc;
            mpt_alloc += PAGE_SIZE;
            *l2tab = l2e_from_paddr((unsigned long)l1start, L2_PROT);
            l2tab++;
            clear_page(l1tab);
            if ( count == 0 )
                l1tab += l1_table_offset(dsi.v_start);
        }
        *l1tab = l1e_from_pfn(mfn, L1_PROT);
        l1tab++;

        page = mfn_to_page(mfn);
        if ( !get_page_and_type(page, d, PGT_writable_page) )
            BUG();

        mfn++;
    }

    /* Pages that are part of page tables must be read only. */
    l2tab = l2start + l2_linear_offset(vpt_start);
    l1start = l1tab = (l1_pgentry_t *)(u32)l2e_get_paddr(*l2tab);
    l1tab += l1_table_offset(vpt_start);
    for ( count = 0; count < nr_pt_pages; count++ )
    {
        page = mfn_to_page(l1e_get_pfn(*l1tab));
        if ( !opt_dom0_shadow )
            l1e_remove_flags(*l1tab, _PAGE_RW);
        else
            if ( !get_page_type(page, PGT_writable_page) )
                BUG();

#if CONFIG_PAGING_LEVELS == 3
        switch (count) {
        case 0:
            page->u.inuse.type_info &= ~PGT_type_mask;
            page->u.inuse.type_info |= PGT_l3_page_table;
            get_page(page, d); /* an extra ref because of readable mapping */

            /* Get another ref to L3 page so that it can be pinned. */
            if ( !get_page_and_type(page, d, PGT_l3_page_table) )
                BUG();
            set_bit(_PGT_pinned, &page->u.inuse.type_info);
            break;
        case 1 ... 4:
            page->u.inuse.type_info &= ~PGT_type_mask;
            page->u.inuse.type_info |= PGT_l2_page_table;
            page->u.inuse.type_info |=
                (count-1) << PGT_va_shift;
            get_page(page, d); /* an extra ref because of readable mapping */
            break;
        default:
            page->u.inuse.type_info &= ~PGT_type_mask;
            page->u.inuse.type_info |= PGT_l1_page_table;
            page->u.inuse.type_info |=
                ((dsi.v_start>>L2_PAGETABLE_SHIFT)+(count-5))<<PGT_va_shift;
            get_page(page, d); /* an extra ref because of readable mapping */
            break;
        }
#else
        if ( count == 0 )
        {
            page->u.inuse.type_info &= ~PGT_type_mask;
            page->u.inuse.type_info |= PGT_l2_page_table;

            /*
             * No longer writable: decrement the type_count.
             * Installed as CR3: increment both the ref_count and type_count.
             * Net: just increment the ref_count.
             */
            get_page(page, d); /* an extra ref because of readable mapping */

            /* Get another ref to L2 page so that it can be pinned. */
            if ( !get_page_and_type(page, d, PGT_l2_page_table) )
                BUG();
            set_bit(_PGT_pinned, &page->u.inuse.type_info);
        }
        else
        {
            page->u.inuse.type_info &= ~PGT_type_mask;
            page->u.inuse.type_info |= PGT_l1_page_table;
            page->u.inuse.type_info |=
                ((dsi.v_start>>L2_PAGETABLE_SHIFT)+(count-1))<<PGT_va_shift;

            /*
             * No longer writable: decrement the type_count.
             * This is an L1 page, installed in a validated L2 page:
             * increment both the ref_count and type_count.
             * Net: just increment the ref_count.
             */
            get_page(page, d); /* an extra ref because of readable mapping */
        }
#endif
        if ( !((unsigned long)++l1tab & (PAGE_SIZE - 1)) )
            l1start = l1tab = (l1_pgentry_t *)(u32)l2e_get_paddr(*++l2tab);
    }

#elif defined(__x86_64__)

    /* Overlap with Xen protected area? */
    if ( (dsi.v_start < HYPERVISOR_VIRT_END) &&
         (v_end > HYPERVISOR_VIRT_START) )
    {
        printk("DOM0 image overlaps with Xen private area.\n");
        return -EINVAL;
    }

    /* WARNING: The new domain must have its 'processor' field filled in! */
    maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l4_page_table;
    l4start = l4tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
    memcpy(l4tab, &idle_pg_table[0], PAGE_SIZE);
    l4tab[l4_table_offset(LINEAR_PT_VIRT_START)] =
        l4e_from_paddr(__pa(l4start), __PAGE_HYPERVISOR);
    l4tab[l4_table_offset(PERDOMAIN_VIRT_START)] =
        l4e_from_paddr(__pa(d->arch.mm_perdomain_l3), __PAGE_HYPERVISOR);
    v->arch.guest_table = mk_pagetable(__pa(l4start));

    l4tab += l4_table_offset(dsi.v_start);
    mfn = alloc_spfn;
    for ( count = 0; count < ((v_end-dsi.v_start)>>PAGE_SHIFT); count++ )
    {
        if ( !((unsigned long)l1tab & (PAGE_SIZE-1)) )
        {
            maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l1_page_table;
            l1start = l1tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
            clear_page(l1tab);
            if ( count == 0 )
                l1tab += l1_table_offset(dsi.v_start);
            if ( !((unsigned long)l2tab & (PAGE_SIZE-1)) )
            {
                maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l2_page_table;
                l2start = l2tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
                clear_page(l2tab);
                if ( count == 0 )
                    l2tab += l2_table_offset(dsi.v_start);
                if ( !((unsigned long)l3tab & (PAGE_SIZE-1)) )
                {
                    maddr_to_page(mpt_alloc)->u.inuse.type_info =
                        PGT_l3_page_table;
                    l3start = l3tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
                    clear_page(l3tab);
                    if ( count == 0 )
                        l3tab += l3_table_offset(dsi.v_start);
                    *l4tab = l4e_from_paddr(__pa(l3start), L4_PROT);
                    l4tab++;
                }
                *l3tab = l3e_from_paddr(__pa(l2start), L3_PROT);
                l3tab++;
            }
            *l2tab = l2e_from_paddr(__pa(l1start), L2_PROT);
            l2tab++;
        }
        *l1tab = l1e_from_pfn(mfn, L1_PROT);
        l1tab++;

        page = mfn_to_page(mfn);
        if ( (page->u.inuse.type_info == 0) &&
             !get_page_and_type(page, d, PGT_writable_page) )
            BUG();

        mfn++;
    }

    /* Pages that are part of page tables must be read only. */
    l4tab = l4start + l4_table_offset(vpt_start);
    l3start = l3tab = l4e_to_l3e(*l4tab);
    l3tab += l3_table_offset(vpt_start);
    l2start = l2tab = l3e_to_l2e(*l3tab);
    l2tab += l2_table_offset(vpt_start);
    l1start = l1tab = l2e_to_l1e(*l2tab);
    l1tab += l1_table_offset(vpt_start);
    for ( count = 0; count < nr_pt_pages; count++ )
    {
        l1e_remove_flags(*l1tab, _PAGE_RW);
        page = mfn_to_page(l1e_get_pfn(*l1tab));

        /* Read-only mapping + PGC_allocated + page-table page. */
        page->count_info         = PGC_allocated | 3;
        page->u.inuse.type_info |= PGT_validated | 1;

        /* Top-level p.t. is pinned. */
        if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_l4_page_table )
        {
            page->count_info        += 1;
            page->u.inuse.type_info += 1 | PGT_pinned;
        }

        /* Iterate. */
        if ( !((unsigned long)++l1tab & (PAGE_SIZE - 1)) )
        {
            if ( !((unsigned long)++l2tab & (PAGE_SIZE - 1)) )
            {
                if ( !((unsigned long)++l3tab & (PAGE_SIZE - 1)) )
                    l3start = l3tab = l4e_to_l3e(*++l4tab);
                l2start = l2tab = l3e_to_l2e(*l3tab);
            }
            l1start = l1tab = l2e_to_l1e(*l2tab);
        }
    }

#endif /* __x86_64__ */

    /* Mask all upcalls... */
    for ( i = 0; i < MAX_VIRT_CPUS; i++ )
        d->shared_info->vcpu_info[i].evtchn_upcall_mask = 1;

    if ( opt_dom0_max_vcpus == 0 )
        opt_dom0_max_vcpus = num_online_cpus();
    if ( opt_dom0_max_vcpus > MAX_VIRT_CPUS )
        opt_dom0_max_vcpus = MAX_VIRT_CPUS;
    printk("Dom0 has maximum %u VCPUs\n", opt_dom0_max_vcpus);

    for ( i = 1; i < opt_dom0_max_vcpus; i++ )
        (void)alloc_vcpu(d, i, i);

    /* Set up monitor table */
    update_pagetables(v);

    /* Install the new page tables. */
    local_irq_disable();
    write_ptbase(v);

    /* Copy the OS image and free temporary buffer. */
    (void)loadelfimage(&dsi);

    p = strstr(dsi.xen_section_string, "HYPERCALL_PAGE=");
    if ( p != NULL )
    {
        p += strlen("HYPERCALL_PAGE=");
        hypercall_page = simple_strtoul(p, NULL, 16);
        hypercall_page = dsi.v_start + (hypercall_page << PAGE_SHIFT);
        if ( (hypercall_page < dsi.v_start) || (hypercall_page >= v_end) )
        {
            write_ptbase(current);
            local_irq_enable();
            printk("Invalid HYPERCALL_PAGE field in guest header.\n");
            return -1;
        }

        hypercall_page_initialise((void *)hypercall_page);
    }

    init_domheap_pages(
        _image_start, (_image_start+image_len+PAGE_SIZE-1) & PAGE_MASK);

    /* Copy the initial ramdisk and free temporary buffer. */
    if ( initrd_len != 0 )
    {
        memcpy((void *)vinitrd_start, initrd_start, initrd_len);
        init_domheap_pages(
            _initrd_start, (_initrd_start+initrd_len+PAGE_SIZE-1) & PAGE_MASK);
    }

    /* Set up start info area. */
    si = (start_info_t *)vstartinfo_start;
    memset(si, 0, PAGE_SIZE);
    si->nr_pages = nr_pages;

    si->shared_info = virt_to_maddr(d->shared_info);
    if ( opt_dom0_translate )
    {
        si->shared_info = max_page << PAGE_SHIFT;
        set_gpfn_from_mfn(virt_to_maddr(d->shared_info) >> PAGE_SHIFT, max_page);
    }

    si->flags        = SIF_PRIVILEGED | SIF_INITDOMAIN;
    si->pt_base      = vpt_start;
    si->nr_pt_frames = nr_pt_pages;
    si->mfn_list     = vphysmap_start;
    sprintf(si->magic, "xen-%i.%i-x86_%d%s",
            XEN_VERSION, XEN_SUBVERSION, BITS_PER_LONG, xen_pae ? "p" : "");
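    /* E.g. a 32-bit PAE hypervisor at version 3.0 produces "xen-3.0-x86_32p",
     * and a 64-bit one produces "xen-3.0-x86_64". */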

    /* Write the phys->machine and machine->phys table entries. */
    for ( pfn = 0; pfn < d->tot_pages; pfn++ )
    {
        mfn = pfn + alloc_spfn;
#ifndef NDEBUG
#define REVERSE_START ((v_end - dsi.v_start) >> PAGE_SHIFT)
        if ( !opt_dom0_translate && (pfn > REVERSE_START) )
            mfn = alloc_epfn - (pfn - REVERSE_START);
#endif
        ((unsigned long *)vphysmap_start)[pfn] = mfn;
        set_gpfn_from_mfn(mfn, pfn);
    }
    while ( pfn < nr_pages )
    {
        if ( (page = alloc_chunk(d, nr_pages - d->tot_pages)) == NULL )
            panic("Not enough RAM for DOM0 reservation.\n");
        while ( pfn < d->tot_pages )
        {
            mfn = page_to_mfn(page);
#ifndef NDEBUG
#define pfn (nr_pages - 1 - (pfn - (alloc_epfn - alloc_spfn)))
#endif
            ((unsigned long *)vphysmap_start)[pfn] = mfn;
            set_gpfn_from_mfn(mfn, pfn);
#undef pfn
            page++; pfn++;
        }
    }

    if ( initrd_len != 0 )
    {
        si->mod_start = vinitrd_start;
        si->mod_len   = initrd_len;
        printk("Initrd len 0x%lx, start at 0x%lx\n",
               si->mod_len, si->mod_start);
    }

    memset(si->cmd_line, 0, sizeof(si->cmd_line));
    if ( cmdline != NULL )
        strncpy((char *)si->cmd_line, cmdline, sizeof(si->cmd_line)-1);

    /* Reinstate the caller's page tables. */
    write_ptbase(current);
    local_irq_enable();

#if defined(__i386__)
    /* Destroy low mappings - they were only for our convenience. */
    zap_low_mappings(l2start);
    zap_low_mappings(idle_pg_table_l2);
#endif

    init_domain_time(d);

    set_bit(_VCPUF_initialised, &v->vcpu_flags);

    new_thread(v, dsi.v_kernentry, vstack_end, vstartinfo_start);

    if ( opt_dom0_shadow || opt_dom0_translate )
    {
        printk("dom0: shadow enable\n");
        shadow_mode_enable(d, (opt_dom0_translate
                               ? SHM_enable | SHM_refcounts | SHM_translate
                               : SHM_enable));
        if ( opt_dom0_translate )
        {
            printk("dom0: shadow translate\n");
#if defined(__i386__) && defined(CONFIG_X86_PAE)
            printk("FIXME: PAE code needed here: %s:%d (%s)\n",
                   __FILE__, __LINE__, __FUNCTION__);
            for ( ; ; )
                __asm__ __volatile__ ( "hlt" );
#else
            /* Hmm, what does this do?  It doesn't look portable across
             * 32/64-bit and PAE/non-PAE ...
             * -- kraxel */
            /* mafetter: This code is mostly a hack in order to be able to
             * test with dom0's which are running with shadow translate.
             * I expect we'll rip this out once we have a stable set of
             * domU clients which use the various shadow modes, but it's
             * useful to leave this here for now...
             */

            // map this domain's p2m table into current page table,
            // so that we can easily access it.
            //
            ASSERT( root_get_intpte(idle_pg_table[1]) == 0 );
            ASSERT( pagetable_get_paddr(d->arch.phys_table) );
            idle_pg_table[1] = root_from_paddr(
                pagetable_get_paddr(d->arch.phys_table), __PAGE_HYPERVISOR);
            translate_l2pgtable(d, (l1_pgentry_t *)(1u << L2_PAGETABLE_SHIFT),
                                pagetable_get_pfn(v->arch.guest_table));
            idle_pg_table[1] = root_empty();
            local_flush_tlb();
#endif
        }

        update_pagetables(v); /* XXX SMP */
        printk("dom0: shadow setup done\n");
    }

    rc = 0;

    /* DOM0 is permitted full I/O capabilities. */
    rc |= ioports_permit_access(dom0, 0, 0xFFFF);
    rc |= iomem_permit_access(dom0, 0UL, ~0UL);
    rc |= irqs_permit_access(dom0, 0, NR_PIRQS-1);

    /*
     * Modify I/O port access permissions.
     */
    /* Master Interrupt Controller (PIC). */
    rc |= ioports_deny_access(dom0, 0x20, 0x21);
    /* Slave Interrupt Controller (PIC). */
    rc |= ioports_deny_access(dom0, 0xA0, 0xA1);
    /* Interval Timer (PIT). */
    rc |= ioports_deny_access(dom0, 0x40, 0x43);
    /* PIT Channel 2 / PC Speaker Control. */
    rc |= ioports_deny_access(dom0, 0x61, 0x61);
    /* Command-line I/O ranges. */
    process_dom0_ioports_disable();

    /*
     * Modify I/O memory access permissions.
     */
    /* Local APIC. */
    if ( mp_lapic_addr != 0 )
    {
        mfn = paddr_to_pfn(mp_lapic_addr);
        rc |= iomem_deny_access(dom0, mfn, mfn);
    }
    /* I/O APICs. */
    for ( i = 0; i < nr_ioapics; i++ )
    {
        mfn = paddr_to_pfn(mp_ioapics[i].mpc_apicaddr);
        if ( smp_found_config )
            rc |= iomem_deny_access(dom0, mfn, mfn);
    }

    BUG_ON(rc != 0);

    return 0;
}

int elf_sanity_check(Elf_Ehdr *ehdr)
{
    if ( !IS_ELF(*ehdr) ||
#if defined(__i386__)
         (ehdr->e_ident[EI_CLASS] != ELFCLASS32) ||
         (ehdr->e_machine != EM_386) ||
#elif defined(__x86_64__)
         (ehdr->e_ident[EI_CLASS] != ELFCLASS64) ||
         (ehdr->e_machine != EM_X86_64) ||
#endif
         (ehdr->e_ident[EI_DATA] != ELFDATA2LSB) ||
         (ehdr->e_type != ET_EXEC) )
    {
        printk("DOM0 image is not a Xen-compatible Elf image.\n");
        return 0;
    }

    return 1;
}

/*
 * Local variables:
 * mode: C
 * c-set-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */