view xen/arch/x86/domain_build.c @ 9038:f1e5b29dd15a

Only panic() if the dom0 kernel requires an unknown feature flag.

It is acceptable for a guest kernel to support (but not require)
a feature which the hypervisor does not understand.

Signed-off-by: Ian Campbell <ian.campbell@xensource.com>
author Ian.Campbell@xensource.com
date Mon Feb 27 10:55:13 2006 +0000 (2006-02-27)
parents 13e4df60caf1
children ee8041b0ab86
line source
1 /******************************************************************************
2 * domain_build.c
3 *
4 * Copyright (c) 2002-2005, K A Fraser
5 */
7 #include <xen/config.h>
8 #include <xen/init.h>
9 #include <xen/lib.h>
10 #include <xen/ctype.h>
11 #include <xen/sched.h>
12 #include <xen/smp.h>
13 #include <xen/delay.h>
14 #include <xen/event.h>
15 #include <xen/elf.h>
16 #include <xen/kernel.h>
17 #include <xen/domain.h>
18 #include <xen/compile.h>
19 #include <xen/iocap.h>
20 #include <xen/bitops.h>
21 #include <asm/regs.h>
22 #include <asm/system.h>
23 #include <asm/io.h>
24 #include <asm/processor.h>
25 #include <asm/desc.h>
26 #include <asm/i387.h>
27 #include <asm/shadow.h>
29 #include <public/version.h>
31 static long dom0_nrpages;
33 /*
34 * dom0_mem:
35 * If +ve:
36 * * The specified amount of memory is allocated to domain 0.
37 * If -ve:
38 * * All of memory is allocated to domain 0, minus the specified amount.
39 * If not specified:
40 * * All of memory is allocated to domain 0, minus 1/16th which is reserved
41 * for uses such as DMA buffers (the reservation is clamped to 128MB).
42 */
43 static void parse_dom0_mem(char *s)
44 {
45 unsigned long long bytes;
46 char *t = s;
47 if ( *s == '-' )
48 t++;
49 bytes = parse_size_and_unit(t);
50 dom0_nrpages = bytes >> PAGE_SHIFT;
51 if ( *s == '-' )
52 dom0_nrpages = -dom0_nrpages;
53 }
54 custom_param("dom0_mem", parse_dom0_mem);
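/* Illustrative usage (not part of this changeset): "dom0_mem=512M" gives
 * domain 0 exactly 512MB, while "dom0_mem=-512M" gives it all memory except
 * 512MB. The suffix is handled by parse_size_and_unit(), so K/M/G forms
 * should all be accepted. */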
56 static unsigned int opt_dom0_max_vcpus;
57 integer_param("dom0_max_vcpus", opt_dom0_max_vcpus);
59 static unsigned int opt_dom0_shadow;
60 boolean_param("dom0_shadow", opt_dom0_shadow);
62 static char opt_dom0_ioports_disable[200] = "";
63 string_param("dom0_ioports_disable", opt_dom0_ioports_disable);
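/* Illustrative syntax: "dom0_ioports_disable=02f8-02ff,e9" -- a
 * comma-separated list of hexadecimal ports or port ranges, parsed by
 * process_dom0_ioports_disable() below. */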
65 #if defined(__i386__)
66 /* No ring-3 access in initial leaf page tables. */
67 #define L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED)
68 #define L2_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
69 #define L3_PROT (_PAGE_PRESENT)
70 #elif defined(__x86_64__)
71 /* Allow ring-3 access in long mode as guest cannot use ring 1. */
72 #define L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_USER)
73 #define L2_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
74 #define L3_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
75 #define L4_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
76 #endif
78 #define round_pgup(_p) (((_p)+(PAGE_SIZE-1))&PAGE_MASK)
79 #define round_pgdown(_p) ((_p)&PAGE_MASK)
81 static struct page_info *alloc_chunk(struct domain *d, unsigned long max_pages)
82 {
83 struct page_info *page;
84 unsigned int order;
85 /*
86 * Allocate up to 2MB at a time: It prevents allocating very large chunks
87 * from DMA pools before the >4GB pool is fully depleted.
88 */
89 if ( max_pages > (2UL << (20 - PAGE_SHIFT)) )
90 max_pages = 2UL << (20 - PAGE_SHIFT);
91 order = get_order_from_pages(max_pages);
92 if ( (max_pages & (max_pages-1)) != 0 )
93 order--;
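/* Illustrative example: a request for 300 pages yields order 9 (512 pages)
 * from get_order_from_pages(); since 300 is not a power of two we drop to
 * order 8 (256 pages), so a single chunk never exceeds the request. */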
94 while ( (page = alloc_domheap_pages(d, order, 0)) == NULL )
95 if ( order-- == 0 )
96 break;
97 return page;
98 }
100 static void process_dom0_ioports_disable(void)
101 {
102 unsigned long io_from, io_to;
103 char *t, *u, *s = opt_dom0_ioports_disable;
105 if ( *s == '\0' )
106 return;
108 while ( (t = strsep(&s, ",")) != NULL )
109 {
110 io_from = simple_strtoul(t, &u, 16);
111 if ( u == t )
112 {
113 parse_error:
114 printk("Invalid ioport range <%s> "
115 "in dom0_ioports_disable, skipping\n", t);
116 continue;
117 }
119 if ( *u == '\0' )
120 io_to = io_from;
121 else if ( *u == '-' )
122 io_to = simple_strtoul(u + 1, &u, 16);
123 else
124 goto parse_error;
126 if ( (*u != '\0') || (io_to < io_from) || (io_to >= 65536) )
127 goto parse_error;
129 printk("Disabling dom0 access to ioport range %04lx-%04lx\n",
130 io_from, io_to);
132 if ( ioports_deny_access(dom0, io_from, io_to) != 0 )
133 BUG();
134 }
135 }
137 static const char *feature_names[XENFEAT_NR_SUBMAPS*32] = {
138 [XENFEAT_writable_page_tables] = "writable_page_tables",
139 [XENFEAT_writable_descriptor_tables] = "writable_descriptor_tables",
140 [XENFEAT_auto_translated_physmap] = "auto_translated_physmap",
141 [XENFEAT_supervisor_mode_kernel] = "supervisor_mode_kernel",
142 [XENFEAT_pae_pgdir_above_4gb] = "pae_pgdir_above_4gb"
143 };
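/*
 * parse_features() is given the value of the FEATURES= key from the guest's
 * __xen_guest section. Illustrative example (not taken from a real kernel):
 *     FEATURES=writable_page_tables|!writable_descriptor_tables
 * Feature names are separated by '|'; a leading '!' marks a feature the
 * kernel requires rather than merely supports. The value ends at the next
 * ',' in the section string.
 */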
145 static void parse_features(
146 const char *feats,
147 uint32_t supported[XENFEAT_NR_SUBMAPS],
148 uint32_t required[XENFEAT_NR_SUBMAPS])
149 {
150 const char *end, *p;
151 int i, req;
153 if ( (end = strchr(feats, ',')) == NULL )
154 end = feats + strlen(feats);
156 while ( feats < end )
157 {
158 p = strchr(feats, '|');
159 if ( (p == NULL) || (p > end) )
160 p = end;
162 req = (*feats == '!');
163 if ( req )
164 feats++;
166 for ( i = 0; i < XENFEAT_NR_SUBMAPS*32; i++ )
167 {
168 if ( feature_names[i] == NULL )
169 continue;
171 if ( strncmp(feature_names[i], feats, p-feats) == 0 )
172 {
173 set_bit(i, supported);
174 if ( req )
175 set_bit(i, required);
176 break;
177 }
178 }
180 if ( i == XENFEAT_NR_SUBMAPS*32 )
181 {
182 printk("Unknown kernel feature \"%.*s\".\n",
183 (int)(p-feats), feats);
184 if ( req )
185 panic("Domain 0 requires an unknown hypervisor feature.\n");
186 }
188 feats = p;
189 if ( *feats == '|' )
190 feats++;
191 }
192 }
194 int construct_dom0(struct domain *d,
195 unsigned long _image_start, unsigned long image_len,
196 unsigned long _initrd_start, unsigned long initrd_len,
197 char *cmdline)
198 {
199 int i, rc, dom0_pae, xen_pae, order;
200 unsigned long pfn, mfn;
201 unsigned long nr_pages;
202 unsigned long nr_pt_pages;
203 unsigned long alloc_spfn;
204 unsigned long alloc_epfn;
205 unsigned long count;
206 struct page_info *page = NULL;
207 start_info_t *si;
208 struct vcpu *v = d->vcpu[0];
209 char *p;
210 unsigned long hypercall_page;
211 #if defined(__i386__)
212 char *image_start = (char *)_image_start; /* use lowmem mappings */
213 char *initrd_start = (char *)_initrd_start; /* use lowmem mappings */
214 #elif defined(__x86_64__)
215 char *image_start = __va(_image_start);
216 char *initrd_start = __va(_initrd_start);
217 #endif
218 #if CONFIG_PAGING_LEVELS >= 4
219 l4_pgentry_t *l4tab = NULL, *l4start = NULL;
220 #endif
221 #if CONFIG_PAGING_LEVELS >= 3
222 l3_pgentry_t *l3tab = NULL, *l3start = NULL;
223 #endif
224 l2_pgentry_t *l2tab = NULL, *l2start = NULL;
225 l1_pgentry_t *l1tab = NULL, *l1start = NULL;
227 /*
228 * This fully describes the memory layout of the initial domain. All
229 * *_start addresses are page-aligned, except v_start (and v_end), which are
230 * superpage-aligned.
231 */
232 struct domain_setup_info dsi;
233 unsigned long vinitrd_start;
234 unsigned long vinitrd_end;
235 unsigned long vphysmap_start;
236 unsigned long vphysmap_end;
237 unsigned long vstartinfo_start;
238 unsigned long vstartinfo_end;
239 unsigned long vstack_start;
240 unsigned long vstack_end;
241 unsigned long vpt_start;
242 unsigned long vpt_end;
243 unsigned long v_end;
245 /* Machine address of next candidate page-table page. */
246 unsigned long mpt_alloc;
248 /* Features supported. */
249 uint32_t dom0_features_supported[XENFEAT_NR_SUBMAPS] = { 0 };
250 uint32_t dom0_features_required[XENFEAT_NR_SUBMAPS] = { 0 };
252 extern void translate_l2pgtable(
253 struct domain *d, l1_pgentry_t *p2m, unsigned long l2mfn);
255 /* Sanity! */
256 BUG_ON(d->domain_id != 0);
257 BUG_ON(d->vcpu[0] == NULL);
258 BUG_ON(test_bit(_VCPUF_initialised, &v->vcpu_flags));
260 memset(&dsi, 0, sizeof(struct domain_setup_info));
261 dsi.image_addr = (unsigned long)image_start;
262 dsi.image_len = image_len;
264 printk("*** LOADING DOMAIN 0 ***\n");
266 d->max_pages = ~0U;
268 /*
269 * If domain 0 allocation isn't specified, reserve 1/16th of available
270 * memory for things like DMA buffers. This reservation is clamped to
271 * a maximum of 128MB.
272 */
273 if ( dom0_nrpages == 0 )
274 {
275 dom0_nrpages = avail_domheap_pages() +
276 ((initrd_len + PAGE_SIZE - 1) >> PAGE_SHIFT) +
277 ((image_len + PAGE_SIZE - 1) >> PAGE_SHIFT);
278 dom0_nrpages = min(dom0_nrpages / 16, 128L << (20 - PAGE_SHIFT));
279 dom0_nrpages = -dom0_nrpages;
280 }
282 /* Negative memory specification means "all memory - specified amount". */
283 if ( dom0_nrpages < 0 )
284 nr_pages = avail_domheap_pages() +
285 ((initrd_len + PAGE_SIZE - 1) >> PAGE_SHIFT) +
286 ((image_len + PAGE_SIZE - 1) >> PAGE_SHIFT) +
287 dom0_nrpages;
288 else
289 nr_pages = dom0_nrpages;
291 if ( (rc = parseelfimage(&dsi)) != 0 )
292 return rc;
294 if ( dsi.xen_section_string == NULL )
295 {
296 printk("Not a Xen-ELF image: '__xen_guest' section not found.\n");
297 return -EINVAL;
298 }
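/*
 * dsi.xen_section_string holds the contents of the __xen_guest section: a
 * single comma-separated string of KEY=VALUE pairs, e.g. (illustrative)
 *     "GUEST_OS=linux,XEN_VER=xen-3.0,VIRT_BASE=0xC0000000,PAE=yes,FEATURES=..."
 * The PAE=, FEATURES= and HYPERCALL_PAGE= keys are inspected below.
 */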
300 dom0_pae = !!strstr(dsi.xen_section_string, "PAE=yes");
301 xen_pae = (CONFIG_PAGING_LEVELS == 3);
302 if ( dom0_pae != xen_pae )
303 {
304 printk("PAE mode mismatch between Xen and DOM0 (xen=%s, dom0=%s)\n",
305 xen_pae ? "yes" : "no", dom0_pae ? "yes" : "no");
306 return -EINVAL;
307 }
309 if ( (p = strstr(dsi.xen_section_string, "FEATURES=")) != NULL )
310 {
311 parse_features(
312 p + strlen("FEATURES="),
313 dom0_features_supported,
314 dom0_features_required);
315 printk("Domain 0 kernel supports features = { %08x }.\n",
316 dom0_features_supported[0]);
317 printk("Domain 0 kernel requires features = { %08x }.\n",
318 dom0_features_required[0]);
319 if ( dom0_features_required[0] )
320 panic("Domain 0 requires an unsupported hypervisor feature.\n");
321 }
323 /* Align load address to 4MB boundary. */
324 dsi.v_start &= ~((1UL<<22)-1);
326 /*
327 * Why do we need this? The number of page-table frames depends on the
328 * size of the bootstrap address space. But the size of the address space
329 * depends on the number of page-table frames (since each one is mapped
330 * read-only). We have a pair of simultaneous equations in two unknowns,
331 * which we solve by exhaustive search.
332 */
333 vinitrd_start = round_pgup(dsi.v_end);
334 vinitrd_end = vinitrd_start + initrd_len;
335 vphysmap_start = round_pgup(vinitrd_end);
336 vphysmap_end = vphysmap_start + (nr_pages * sizeof(unsigned long));
337 vstartinfo_start = round_pgup(vphysmap_end);
338 vstartinfo_end = vstartinfo_start + PAGE_SIZE;
339 vpt_start = vstartinfo_end;
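/*
 * Worked example (illustrative, non-PAE i386): each page-table page is an L1
 * mapping 4MB, plus one page for the L2 itself, so the loop below stops once
 * nr_pt_pages >= ceil((v_end - dsi.v_start) / 4MB) + 1. Because v_end itself
 * depends on nr_pt_pages, we simply try successively larger values until the
 * inequality holds.
 */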
340 for ( nr_pt_pages = 2; ; nr_pt_pages++ )
341 {
342 vpt_end = vpt_start + (nr_pt_pages * PAGE_SIZE);
343 vstack_start = vpt_end;
344 vstack_end = vstack_start + PAGE_SIZE;
345 v_end = (vstack_end + (1UL<<22)-1) & ~((1UL<<22)-1);
346 if ( (v_end - vstack_end) < (512UL << 10) )
347 v_end += 1UL << 22; /* Add extra 4MB to get >= 512kB padding. */
348 #if defined(__i386__) && !defined(CONFIG_X86_PAE)
349 if ( (((v_end - dsi.v_start + ((1UL<<L2_PAGETABLE_SHIFT)-1)) >>
350 L2_PAGETABLE_SHIFT) + 1) <= nr_pt_pages )
351 break;
352 #elif defined(__i386__) && defined(CONFIG_X86_PAE)
353 /* 5 pages: 1x 3rd + 4x 2nd level */
354 if ( (((v_end - dsi.v_start + ((1UL<<L2_PAGETABLE_SHIFT)-1)) >>
355 L2_PAGETABLE_SHIFT) + 5) <= nr_pt_pages )
356 break;
357 #elif defined(__x86_64__)
358 #define NR(_l,_h,_s) \
359 (((((_h) + ((1UL<<(_s))-1)) & ~((1UL<<(_s))-1)) - \
360 ((_l) & ~((1UL<<(_s))-1))) >> (_s))
361 if ( (1 + /* # L4 */
362 NR(dsi.v_start, v_end, L4_PAGETABLE_SHIFT) + /* # L3 */
363 NR(dsi.v_start, v_end, L3_PAGETABLE_SHIFT) + /* # L2 */
364 NR(dsi.v_start, v_end, L2_PAGETABLE_SHIFT)) /* # L1 */
365 <= nr_pt_pages )
366 break;
367 #endif
368 }
370 order = get_order_from_bytes(v_end - dsi.v_start);
371 if ( (1UL << order) > nr_pages )
372 panic("Domain 0 allocation is too small for kernel image.\n");
374 /* Allocate from DMA pool: PAE L3 table must be below 4GB boundary. */
375 if ( (page = alloc_domheap_pages(d, order, ALLOC_DOM_DMA)) == NULL )
376 panic("Not enough RAM for domain 0 allocation.\n");
377 alloc_spfn = page_to_mfn(page);
378 alloc_epfn = alloc_spfn + d->tot_pages;
380 printk("PHYSICAL MEMORY ARRANGEMENT:\n"
381 " Dom0 alloc.: %"PRIpaddr"->%"PRIpaddr,
382 pfn_to_paddr(alloc_spfn), pfn_to_paddr(alloc_epfn));
383 if ( d->tot_pages < nr_pages )
384 printk(" (%lu pages to be allocated)",
385 nr_pages - d->tot_pages);
386 printk("\nVIRTUAL MEMORY ARRANGEMENT:\n"
387 " Loaded kernel: %p->%p\n"
388 " Init. ramdisk: %p->%p\n"
389 " Phys-Mach map: %p->%p\n"
390 " Start info: %p->%p\n"
391 " Page tables: %p->%p\n"
392 " Boot stack: %p->%p\n"
393 " TOTAL: %p->%p\n",
394 _p(dsi.v_kernstart), _p(dsi.v_kernend),
395 _p(vinitrd_start), _p(vinitrd_end),
396 _p(vphysmap_start), _p(vphysmap_end),
397 _p(vstartinfo_start), _p(vstartinfo_end),
398 _p(vpt_start), _p(vpt_end),
399 _p(vstack_start), _p(vstack_end),
400 _p(dsi.v_start), _p(v_end));
401 printk(" ENTRY ADDRESS: %p\n", _p(dsi.v_kernentry));
403 if ( (v_end - dsi.v_start) > (nr_pages * PAGE_SIZE) )
404 {
405 printk("Initial guest OS requires too much space\n"
406 "(%luMB is greater than %luMB limit)\n",
407 (v_end-dsi.v_start)>>20, (nr_pages<<PAGE_SHIFT)>>20);
408 return -ENOMEM;
409 }
411 mpt_alloc = (vpt_start - dsi.v_start) +
412 (unsigned long)pfn_to_paddr(alloc_spfn);
414 /*
415 * We're basically forcing default RPLs to 1, so that our "what privilege
416 * level are we returning to?" logic works.
417 */
418 v->arch.guest_context.kernel_ss = FLAT_KERNEL_SS;
419 for ( i = 0; i < 256; i++ )
420 v->arch.guest_context.trap_ctxt[i].cs = FLAT_KERNEL_CS;
422 #if defined(__i386__)
424 v->arch.guest_context.failsafe_callback_cs = FLAT_KERNEL_CS;
425 v->arch.guest_context.event_callback_cs = FLAT_KERNEL_CS;
427 /*
428 * Protect the lowest 1GB of memory. We use a temporary mapping there
429 * from which we copy the kernel and ramdisk images.
430 */
431 if ( dsi.v_start < (1UL<<30) )
432 {
433 printk("Initial loading isn't allowed to lowest 1GB of memory.\n");
434 return -EINVAL;
435 }
437 /* WARNING: The new domain must have its 'processor' field filled in! */
438 #if CONFIG_PAGING_LEVELS == 3
439 l3start = l3tab = (l3_pgentry_t *)mpt_alloc; mpt_alloc += PAGE_SIZE;
440 l2start = l2tab = (l2_pgentry_t *)mpt_alloc; mpt_alloc += 4*PAGE_SIZE;
441 memcpy(l2tab, idle_pg_table_l2, 4*PAGE_SIZE);
442 for (i = 0; i < 4; i++) {
443 l3tab[i] = l3e_from_paddr((u32)l2tab + i*PAGE_SIZE, L3_PROT);
444 l2tab[(LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT)+i] =
445 l2e_from_paddr((u32)l2tab + i*PAGE_SIZE, __PAGE_HYPERVISOR);
446 }
447 v->arch.guest_table = mk_pagetable((unsigned long)l3start);
448 #else
449 l2start = l2tab = (l2_pgentry_t *)mpt_alloc; mpt_alloc += PAGE_SIZE;
450 memcpy(l2tab, &idle_pg_table[0], PAGE_SIZE);
451 l2tab[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] =
452 l2e_from_paddr((unsigned long)l2start, __PAGE_HYPERVISOR);
453 v->arch.guest_table = mk_pagetable((unsigned long)l2start);
454 #endif
456 for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
457 l2tab[l2_linear_offset(PERDOMAIN_VIRT_START) + i] =
458 l2e_from_page(virt_to_page(d->arch.mm_perdomain_pt) + i,
459 __PAGE_HYPERVISOR);
461 l2tab += l2_linear_offset(dsi.v_start);
462 mfn = alloc_spfn;
463 for ( count = 0; count < ((v_end-dsi.v_start)>>PAGE_SHIFT); count++ )
464 {
465 if ( !((unsigned long)l1tab & (PAGE_SIZE-1)) )
466 {
467 l1start = l1tab = (l1_pgentry_t *)mpt_alloc;
468 mpt_alloc += PAGE_SIZE;
469 *l2tab = l2e_from_paddr((unsigned long)l1start, L2_PROT);
470 l2tab++;
471 clear_page(l1tab);
472 if ( count == 0 )
473 l1tab += l1_table_offset(dsi.v_start);
474 }
475 *l1tab = l1e_from_pfn(mfn, L1_PROT);
476 l1tab++;
478 page = mfn_to_page(mfn);
479 if ( !get_page_and_type(page, d, PGT_writable_page) )
480 BUG();
482 mfn++;
483 }
485 /* Pages that are part of page tables must be read only. */
486 l2tab = l2start + l2_linear_offset(vpt_start);
487 l1start = l1tab = (l1_pgentry_t *)(u32)l2e_get_paddr(*l2tab);
488 l1tab += l1_table_offset(vpt_start);
489 for ( count = 0; count < nr_pt_pages; count++ )
490 {
491 page = mfn_to_page(l1e_get_pfn(*l1tab));
492 if ( !opt_dom0_shadow )
493 l1e_remove_flags(*l1tab, _PAGE_RW);
494 else
495 if ( !get_page_type(page, PGT_writable_page) )
496 BUG();
498 #if CONFIG_PAGING_LEVELS == 3
499 switch (count) {
500 case 0:
501 page->u.inuse.type_info &= ~PGT_type_mask;
502 page->u.inuse.type_info |= PGT_l3_page_table;
503 get_page(page, d); /* an extra ref because of readable mapping */
505 /* Get another ref to L3 page so that it can be pinned. */
506 if ( !get_page_and_type(page, d, PGT_l3_page_table) )
507 BUG();
508 set_bit(_PGT_pinned, &page->u.inuse.type_info);
509 break;
510 case 1 ... 4:
511 page->u.inuse.type_info &= ~PGT_type_mask;
512 page->u.inuse.type_info |= PGT_l2_page_table;
513 page->u.inuse.type_info |=
514 (count-1) << PGT_va_shift;
515 get_page(page, d); /* an extra ref because of readable mapping */
516 break;
517 default:
518 page->u.inuse.type_info &= ~PGT_type_mask;
519 page->u.inuse.type_info |= PGT_l1_page_table;
520 page->u.inuse.type_info |=
521 ((dsi.v_start>>L2_PAGETABLE_SHIFT)+(count-5))<<PGT_va_shift;
522 get_page(page, d); /* an extra ref because of readable mapping */
523 break;
524 }
525 #else
526 if ( count == 0 )
527 {
528 page->u.inuse.type_info &= ~PGT_type_mask;
529 page->u.inuse.type_info |= PGT_l2_page_table;
531 /*
532 * No longer writable: decrement the type_count.
533 * Installed as CR3: increment both the ref_count and type_count.
534 * Net: just increment the ref_count.
535 */
536 get_page(page, d); /* an extra ref because of readable mapping */
538 /* Get another ref to L2 page so that it can be pinned. */
539 if ( !get_page_and_type(page, d, PGT_l2_page_table) )
540 BUG();
541 set_bit(_PGT_pinned, &page->u.inuse.type_info);
542 }
543 else
544 {
545 page->u.inuse.type_info &= ~PGT_type_mask;
546 page->u.inuse.type_info |= PGT_l1_page_table;
547 page->u.inuse.type_info |=
548 ((dsi.v_start>>L2_PAGETABLE_SHIFT)+(count-1))<<PGT_va_shift;
550 /*
551 * No longer writable: decrement the type_count.
552 * This is an L1 page, installed in a validated L2 page:
553 * increment both the ref_count and type_count.
554 * Net: just increment the ref_count.
555 */
556 get_page(page, d); /* an extra ref because of readable mapping */
557 }
558 #endif
559 if ( !((unsigned long)++l1tab & (PAGE_SIZE - 1)) )
560 l1start = l1tab = (l1_pgentry_t *)(u32)l2e_get_paddr(*++l2tab);
561 }
563 #elif defined(__x86_64__)
565 /* Overlap with Xen protected area? */
566 if ( (dsi.v_start < HYPERVISOR_VIRT_END) &&
567 (v_end > HYPERVISOR_VIRT_START) )
568 {
569 printk("DOM0 image overlaps with Xen private area.\n");
570 return -EINVAL;
571 }
573 /* WARNING: The new domain must have its 'processor' field filled in! */
574 maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l4_page_table;
575 l4start = l4tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
576 memcpy(l4tab, &idle_pg_table[0], PAGE_SIZE);
577 l4tab[l4_table_offset(LINEAR_PT_VIRT_START)] =
578 l4e_from_paddr(__pa(l4start), __PAGE_HYPERVISOR);
579 l4tab[l4_table_offset(PERDOMAIN_VIRT_START)] =
580 l4e_from_paddr(__pa(d->arch.mm_perdomain_l3), __PAGE_HYPERVISOR);
581 v->arch.guest_table = mk_pagetable(__pa(l4start));
583 l4tab += l4_table_offset(dsi.v_start);
584 mfn = alloc_spfn;
585 for ( count = 0; count < ((v_end-dsi.v_start)>>PAGE_SHIFT); count++ )
586 {
587 if ( !((unsigned long)l1tab & (PAGE_SIZE-1)) )
588 {
589 maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l1_page_table;
590 l1start = l1tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
591 clear_page(l1tab);
592 if ( count == 0 )
593 l1tab += l1_table_offset(dsi.v_start);
594 if ( !((unsigned long)l2tab & (PAGE_SIZE-1)) )
595 {
596 maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l2_page_table;
597 l2start = l2tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
598 clear_page(l2tab);
599 if ( count == 0 )
600 l2tab += l2_table_offset(dsi.v_start);
601 if ( !((unsigned long)l3tab & (PAGE_SIZE-1)) )
602 {
603 maddr_to_page(mpt_alloc)->u.inuse.type_info =
604 PGT_l3_page_table;
605 l3start = l3tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
606 clear_page(l3tab);
607 if ( count == 0 )
608 l3tab += l3_table_offset(dsi.v_start);
609 *l4tab = l4e_from_paddr(__pa(l3start), L4_PROT);
610 l4tab++;
611 }
612 *l3tab = l3e_from_paddr(__pa(l2start), L3_PROT);
613 l3tab++;
614 }
615 *l2tab = l2e_from_paddr(__pa(l1start), L2_PROT);
616 l2tab++;
617 }
618 *l1tab = l1e_from_pfn(mfn, L1_PROT);
619 l1tab++;
621 page = mfn_to_page(mfn);
622 if ( (page->u.inuse.type_info == 0) &&
623 !get_page_and_type(page, d, PGT_writable_page) )
624 BUG();
626 mfn++;
627 }
629 /* Pages that are part of page tables must be read only. */
630 l4tab = l4start + l4_table_offset(vpt_start);
631 l3start = l3tab = l4e_to_l3e(*l4tab);
632 l3tab += l3_table_offset(vpt_start);
633 l2start = l2tab = l3e_to_l2e(*l3tab);
634 l2tab += l2_table_offset(vpt_start);
635 l1start = l1tab = l2e_to_l1e(*l2tab);
636 l1tab += l1_table_offset(vpt_start);
637 for ( count = 0; count < nr_pt_pages; count++ )
638 {
639 l1e_remove_flags(*l1tab, _PAGE_RW);
640 page = mfn_to_page(l1e_get_pfn(*l1tab));
642 /* Read-only mapping + PGC_allocated + page-table page. */
643 page->count_info = PGC_allocated | 3;
644 page->u.inuse.type_info |= PGT_validated | 1;
646 /* Top-level p.t. is pinned. */
647 if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_l4_page_table )
648 {
649 page->count_info += 1;
650 page->u.inuse.type_info += 1 | PGT_pinned;
651 }
653 /* Iterate. */
654 if ( !((unsigned long)++l1tab & (PAGE_SIZE - 1)) )
655 {
656 if ( !((unsigned long)++l2tab & (PAGE_SIZE - 1)) )
657 {
658 if ( !((unsigned long)++l3tab & (PAGE_SIZE - 1)) )
659 l3start = l3tab = l4e_to_l3e(*++l4tab);
660 l2start = l2tab = l3e_to_l2e(*l3tab);
661 }
662 l1start = l1tab = l2e_to_l1e(*l2tab);
663 }
664 }
666 #endif /* __x86_64__ */
668 /* Mask all upcalls... */
669 for ( i = 0; i < MAX_VIRT_CPUS; i++ )
670 d->shared_info->vcpu_info[i].evtchn_upcall_mask = 1;
672 if ( opt_dom0_max_vcpus == 0 )
673 opt_dom0_max_vcpus = num_online_cpus();
674 if ( opt_dom0_max_vcpus > MAX_VIRT_CPUS )
675 opt_dom0_max_vcpus = MAX_VIRT_CPUS;
676 printk("Dom0 has maximum %u VCPUs\n", opt_dom0_max_vcpus);
678 for ( i = 1; i < opt_dom0_max_vcpus; i++ )
679 (void)alloc_vcpu(d, i, i);
681 /* Set up monitor table */
682 update_pagetables(v);
684 /* Install the new page tables. */
685 local_irq_disable();
686 write_ptbase(v);
688 /* Copy the OS image and free temporary buffer. */
689 (void)loadelfimage(&dsi);
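/*
 * Optionally initialise a hypercall transfer page. Illustrative example:
 * "HYPERCALL_PAGE=2" in the __xen_guest section (the value is hexadecimal)
 * places the page at dsi.v_start + 2*PAGE_SIZE, i.e. the third page of the
 * guest's virtual image.
 */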
691 p = strstr(dsi.xen_section_string, "HYPERCALL_PAGE=");
692 if ( p != NULL )
693 {
694 p += strlen("HYPERCALL_PAGE=");
695 hypercall_page = simple_strtoul(p, NULL, 16);
696 hypercall_page = dsi.v_start + (hypercall_page << PAGE_SHIFT);
697 if ( (hypercall_page < dsi.v_start) || (hypercall_page >= v_end) )
698 {
699 write_ptbase(current);
700 local_irq_enable();
701 printk("Invalid HYPERCALL_PAGE field in guest header.\n");
702 return -1;
703 }
705 hypercall_page_initialise((void *)hypercall_page);
706 }
708 init_domheap_pages(
709 _image_start, (_image_start+image_len+PAGE_SIZE-1) & PAGE_MASK);
711 /* Copy the initial ramdisk and free temporary buffer. */
712 if ( initrd_len != 0 )
713 {
714 memcpy((void *)vinitrd_start, initrd_start, initrd_len);
715 init_domheap_pages(
716 _initrd_start, (_initrd_start+initrd_len+PAGE_SIZE-1) & PAGE_MASK);
717 }
719 /* Set up start info area. */
720 si = (start_info_t *)vstartinfo_start;
721 memset(si, 0, PAGE_SIZE);
722 si->nr_pages = nr_pages;
724 si->shared_info = virt_to_maddr(d->shared_info);
726 si->flags = SIF_PRIVILEGED | SIF_INITDOMAIN;
727 si->pt_base = vpt_start;
728 si->nr_pt_frames = nr_pt_pages;
729 si->mfn_list = vphysmap_start;
730 sprintf(si->magic, "xen-%i.%i-x86_%d%s",
731 XEN_VERSION, XEN_SUBVERSION, BITS_PER_LONG, xen_pae ? "p" : "");
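/* e.g. (illustrative) a 32-bit PAE build of a 3.0 hypervisor produces the
 * magic string "xen-3.0-x86_32p"; a 64-bit build produces "xen-3.0-x86_64". */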
733 /* Write the phys->machine and machine->phys table entries. */
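/* In debug builds the tail of the allocation is mapped in reverse order (see
 * REVERSE_START below), presumably so that dom0 cannot get away with assuming
 * a simple linear pfn->mfn relationship. */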
734 for ( pfn = 0; pfn < d->tot_pages; pfn++ )
735 {
736 mfn = pfn + alloc_spfn;
737 #ifndef NDEBUG
738 #define REVERSE_START ((v_end - dsi.v_start) >> PAGE_SHIFT)
739 if ( pfn > REVERSE_START )
740 mfn = alloc_epfn - (pfn - REVERSE_START);
741 #endif
742 ((unsigned long *)vphysmap_start)[pfn] = mfn;
743 set_gpfn_from_mfn(mfn, pfn);
744 }
745 while ( pfn < nr_pages )
746 {
747 if ( (page = alloc_chunk(d, nr_pages - d->tot_pages)) == NULL )
748 panic("Not enough RAM for DOM0 reservation.\n");
749 while ( pfn < d->tot_pages )
750 {
751 mfn = page_to_mfn(page);
752 #ifndef NDEBUG
753 #define pfn (nr_pages - 1 - (pfn - (alloc_epfn - alloc_spfn)))
754 #endif
755 ((unsigned long *)vphysmap_start)[pfn] = mfn;
756 set_gpfn_from_mfn(mfn, pfn);
757 #undef pfn
758 page++; pfn++;
759 }
760 }
762 if ( initrd_len != 0 )
763 {
764 si->mod_start = vinitrd_start;
765 si->mod_len = initrd_len;
766 printk("Initrd len 0x%lx, start at 0x%lx\n",
767 si->mod_len, si->mod_start);
768 }
770 memset(si->cmd_line, 0, sizeof(si->cmd_line));
771 if ( cmdline != NULL )
772 strncpy((char *)si->cmd_line, cmdline, sizeof(si->cmd_line)-1);
774 /* Reinstate the caller's page tables. */
775 write_ptbase(current);
776 local_irq_enable();
778 #if defined(__i386__)
779 /* Destroy low mappings - they were only for our convenience. */
780 zap_low_mappings(l2start);
781 zap_low_mappings(idle_pg_table_l2);
782 #endif
784 init_domain_time(d);
786 set_bit(_VCPUF_initialised, &v->vcpu_flags);
788 new_thread(v, dsi.v_kernentry, vstack_end, vstartinfo_start);
790 if ( opt_dom0_shadow )
791 {
792 shadow_mode_enable(d, SHM_enable);
793 update_pagetables(v);
794 }
796 rc = 0;
798 /* DOM0 is permitted full I/O capabilities. */
799 rc |= ioports_permit_access(dom0, 0, 0xFFFF);
800 rc |= iomem_permit_access(dom0, 0UL, ~0UL);
801 rc |= irqs_permit_access(dom0, 0, NR_PIRQS-1);
803 /*
804 * Modify I/O port access permissions.
805 */
806 /* Master Interrupt Controller (PIC). */
807 rc |= ioports_deny_access(dom0, 0x20, 0x21);
808 /* Slave Interrupt Controller (PIC). */
809 rc |= ioports_deny_access(dom0, 0xA0, 0xA1);
810 /* Interval Timer (PIT). */
811 rc |= ioports_deny_access(dom0, 0x40, 0x43);
812 /* PIT Channel 2 / PC Speaker Control. */
813 rc |= ioports_deny_access(dom0, 0x61, 0x61);
814 /* Command-line I/O ranges. */
815 process_dom0_ioports_disable();
817 /*
818 * Modify I/O memory access permissions.
819 */
820 /* Local APIC. */
821 if ( mp_lapic_addr != 0 )
822 {
823 mfn = paddr_to_pfn(mp_lapic_addr);
824 rc |= iomem_deny_access(dom0, mfn, mfn);
825 }
826 /* I/O APICs. */
827 for ( i = 0; i < nr_ioapics; i++ )
828 {
829 mfn = paddr_to_pfn(mp_ioapics[i].mpc_apicaddr);
830 if ( smp_found_config )
831 rc |= iomem_deny_access(dom0, mfn, mfn);
832 }
834 BUG_ON(rc != 0);
836 return 0;
837 }
839 int elf_sanity_check(Elf_Ehdr *ehdr)
840 {
841 if ( !IS_ELF(*ehdr) ||
842 #if defined(__i386__)
843 (ehdr->e_ident[EI_CLASS] != ELFCLASS32) ||
844 (ehdr->e_machine != EM_386) ||
845 #elif defined(__x86_64__)
846 (ehdr->e_ident[EI_CLASS] != ELFCLASS64) ||
847 (ehdr->e_machine != EM_X86_64) ||
848 #endif
849 (ehdr->e_ident[EI_DATA] != ELFDATA2LSB) ||
850 (ehdr->e_type != ET_EXEC) )
851 {
852 printk("DOM0 image is not a Xen-compatible Elf image.\n");
853 return 0;
854 }
856 return 1;
857 }
859 /*
860 * Local variables:
861 * mode: C
862 * c-set-style: "BSD"
863 * c-basic-offset: 4
864 * tab-width: 4
865 * indent-tabs-mode: nil
866 * End:
867 */