xen/arch/x86/domain_build.c @ changeset 10173:954f4dea9da6

[PAE] Allow pgdirs above 4GB for paravirt guests.
NOTE: This obviates the need for lowmem_emergency_pool.
Unprivileged guests no longer need to be able to allocate memory
below 4GB for PAE PDPTs.

Signed-off-by: Keir Fraser <keir@xensource.com>
Author: kaf24@firebug.cl.cam.ac.uk
Date:   Fri May 26 17:22:30 2006 +0100
/******************************************************************************
 * domain_build.c
 *
 * Copyright (c) 2002-2005, K A Fraser
 */

#include <xen/config.h>
#include <xen/init.h>
#include <xen/lib.h>
#include <xen/ctype.h>
#include <xen/sched.h>
#include <xen/smp.h>
#include <xen/delay.h>
#include <xen/event.h>
#include <xen/elf.h>
#include <xen/kernel.h>
#include <xen/domain.h>
#include <xen/compile.h>
#include <xen/iocap.h>
#include <xen/bitops.h>
#include <asm/regs.h>
#include <asm/system.h>
#include <asm/io.h>
#include <asm/processor.h>
#include <asm/desc.h>
#include <asm/i387.h>
#include <asm/shadow.h>

#include <public/version.h>

extern unsigned long initial_images_nrpages(void);
extern void discard_initial_images(void);

static long dom0_nrpages;

/*
 * dom0_mem:
 *  If +ve:
 *   * The specified amount of memory is allocated to domain 0.
 *  If -ve:
 *   * All of memory is allocated to domain 0, minus the specified amount.
 *  If not specified:
 *   * All of memory is allocated to domain 0, minus 1/16th which is reserved
 *     for uses such as DMA buffers (the reservation is clamped to 128MB).
 */
static void parse_dom0_mem(char *s)
{
    unsigned long long bytes;
    char *t = s;
    if ( *s == '-' )
        t++;
    bytes = parse_size_and_unit(t);
    dom0_nrpages = bytes >> PAGE_SHIFT;
    if ( *s == '-' )
        dom0_nrpages = -dom0_nrpages;
}
custom_param("dom0_mem", parse_dom0_mem);
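/*
 * Worked example (hypothetical boot line, values chosen for illustration):
 * with "dom0_mem=-128M", parse_dom0_mem() sees the leading '-', parses
 * "128M" as 0x8000000 bytes, shifts by PAGE_SHIFT (12 on x86) to get
 * 32768 pages, and stores dom0_nrpages = -32768, i.e. "all memory minus
 * 128MB". A plain "dom0_mem=512M" would store dom0_nrpages = 131072.
 */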
static unsigned int opt_dom0_max_vcpus;
integer_param("dom0_max_vcpus", opt_dom0_max_vcpus);

static unsigned int opt_dom0_shadow;
boolean_param("dom0_shadow", opt_dom0_shadow);

static char opt_dom0_ioports_disable[200] = "";
string_param("dom0_ioports_disable", opt_dom0_ioports_disable);

#if defined(__i386__)
/* No ring-3 access in initial leaf page tables. */
#define L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED)
#define L2_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
#define L3_PROT (_PAGE_PRESENT)
#elif defined(__x86_64__)
/* Allow ring-3 access in long mode as guest cannot use ring 1. */
#define L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_USER)
#define L2_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
#define L3_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
#define L4_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
#endif

#define round_pgup(_p)   (((_p)+(PAGE_SIZE-1))&PAGE_MASK)
#define round_pgdown(_p) ((_p)&PAGE_MASK)
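/*
 * Example (illustrative, assuming 4kB pages so PAGE_SIZE == 0x1000):
 *   round_pgup(0x12345)   == 0x13000  (round up to the next page boundary)
 *   round_pgdown(0x12345) == 0x12000  (truncate to the current page boundary)
 */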
static struct page_info *alloc_chunk(struct domain *d, unsigned long max_pages)
{
    struct page_info *page;
    unsigned int order;
    /*
     * Allocate up to 2MB at a time: It prevents allocating very large chunks
     * from DMA pools before the >4GB pool is fully depleted.
     */
    if ( max_pages > (2UL << (20 - PAGE_SHIFT)) )
        max_pages = 2UL << (20 - PAGE_SHIFT);
    order = get_order_from_pages(max_pages);
    if ( (max_pages & (max_pages-1)) != 0 )
        order--;
    while ( (page = alloc_domheap_pages(d, order, 0)) == NULL )
        if ( order-- == 0 )
            break;
    return page;
}
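/*
 * Worked example (illustrative, assuming 4kB pages): a request for
 * max_pages = 300 stays below the 2MB cap (512 pages), then
 * get_order_from_pages(300) yields order 9 (512 pages); since 300 is not a
 * power of two the order is dropped to 8 (256 pages), so the chunk never
 * exceeds what was asked for. On allocation failure the order is decremented,
 * halving the chunk, until something is found or order 0 also fails.
 */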
static void process_dom0_ioports_disable(void)
{
    unsigned long io_from, io_to;
    char *t, *u, *s = opt_dom0_ioports_disable;

    if ( *s == '\0' )
        return;

    while ( (t = strsep(&s, ",")) != NULL )
    {
        io_from = simple_strtoul(t, &u, 16);
        if ( u == t )
        {
        parse_error:
            printk("Invalid ioport range <%s> "
                   "in dom0_ioports_disable, skipping\n", t);
            continue;
        }

        if ( *u == '\0' )
            io_to = io_from;
        else if ( *u == '-' )
            io_to = simple_strtoul(u + 1, &u, 16);
        else
            goto parse_error;

        if ( (*u != '\0') || (io_to < io_from) || (io_to >= 65536) )
            goto parse_error;

        printk("Disabling dom0 access to ioport range %04lx-%04lx\n",
               io_from, io_to);

        if ( ioports_deny_access(dom0, io_from, io_to) != 0 )
            BUG();
    }
}
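/*
 * Example (hypothetical command-line value): booting with
 * "dom0_ioports_disable=02f8-02ff,0e00" would deny dom0 access to the
 * hex port range 0x02f8-0x02ff and to the single port 0x0e00; a token
 * that does not parse as "<hex>" or "<hex>-<hex>" is reported and skipped.
 */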
static const char *feature_names[XENFEAT_NR_SUBMAPS*32] = {
    [XENFEAT_writable_page_tables]       = "writable_page_tables",
    [XENFEAT_writable_descriptor_tables] = "writable_descriptor_tables",
    [XENFEAT_auto_translated_physmap]    = "auto_translated_physmap",
    [XENFEAT_supervisor_mode_kernel]     = "supervisor_mode_kernel",
    [XENFEAT_pae_pgdir_above_4gb]        = "pae_pgdir_above_4gb"
};

static void parse_features(
    const char *feats,
    uint32_t supported[XENFEAT_NR_SUBMAPS],
    uint32_t required[XENFEAT_NR_SUBMAPS])
{
    const char *end, *p;
    int i, req;

    if ( (end = strchr(feats, ',')) == NULL )
        end = feats + strlen(feats);

    while ( feats < end )
    {
        p = strchr(feats, '|');
        if ( (p == NULL) || (p > end) )
            p = end;

        req = (*feats == '!');
        if ( req )
            feats++;

        for ( i = 0; i < XENFEAT_NR_SUBMAPS*32; i++ )
        {
            if ( feature_names[i] == NULL )
                continue;

            if ( strncmp(feature_names[i], feats, p-feats) == 0 )
            {
                set_bit(i, supported);
                if ( req )
                    set_bit(i, required);
                break;
            }
        }

        if ( i == XENFEAT_NR_SUBMAPS*32 )
        {
            printk("Unknown kernel feature \"%.*s\".\n",
                   (int)(p-feats), feats);
            if ( req )
                panic("Domain 0 requires an unknown hypervisor feature.\n");
        }

        feats = p;
        if ( *feats == '|' )
            feats++;
    }
}
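/*
 * Example (hypothetical __xen_guest fragment): parsing
 * "FEATURES=!writable_page_tables|pae_pgdir_above_4gb" sets both feature
 * bits in 'supported' and, because of the '!' prefix, also marks
 * writable_page_tables in 'required'. A name that is not in feature_names[]
 * is reported, and panics the boot if it was marked required.
 */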
int construct_dom0(struct domain *d,
                   unsigned long _image_start, unsigned long image_len,
                   unsigned long _initrd_start, unsigned long initrd_len,
                   char *cmdline)
{
    int i, rc, dom0_pae, xen_pae, order;
    unsigned long pfn, mfn;
    unsigned long nr_pages;
    unsigned long nr_pt_pages;
    unsigned long alloc_spfn;
    unsigned long alloc_epfn;
    unsigned long count;
    struct page_info *page = NULL;
    start_info_t *si;
    struct vcpu *v = d->vcpu[0];
    char *p;
    unsigned long hypercall_page;
#if defined(__i386__)
    char *image_start  = (char *)_image_start;  /* use lowmem mappings */
    char *initrd_start = (char *)_initrd_start; /* use lowmem mappings */
#elif defined(__x86_64__)
    char *image_start  = __va(_image_start);
    char *initrd_start = __va(_initrd_start);
#endif
#if CONFIG_PAGING_LEVELS >= 4
    l4_pgentry_t *l4tab = NULL, *l4start = NULL;
#endif
#if CONFIG_PAGING_LEVELS >= 3
    l3_pgentry_t *l3tab = NULL, *l3start = NULL;
#endif
    l2_pgentry_t *l2tab = NULL, *l2start = NULL;
    l1_pgentry_t *l1tab = NULL, *l1start = NULL;

    /*
     * This fully describes the memory layout of the initial domain. All
     * *_start addresses are page-aligned, except v_start (and v_end) which
     * are superpage-aligned.
     */
    struct domain_setup_info dsi;
    unsigned long vinitrd_start;
    unsigned long vinitrd_end;
    unsigned long vphysmap_start;
    unsigned long vphysmap_end;
    unsigned long vstartinfo_start;
    unsigned long vstartinfo_end;
    unsigned long vstack_start;
    unsigned long vstack_end;
    unsigned long vpt_start;
    unsigned long vpt_end;
    unsigned long v_end;

    /* Machine address of next candidate page-table page. */
    unsigned long mpt_alloc;

    /* Features supported. */
    uint32_t dom0_features_supported[XENFEAT_NR_SUBMAPS] = { 0 };
    uint32_t dom0_features_required[XENFEAT_NR_SUBMAPS] = { 0 };

    /* Sanity! */
    BUG_ON(d->domain_id != 0);
    BUG_ON(d->vcpu[0] == NULL);
    BUG_ON(test_bit(_VCPUF_initialised, &v->vcpu_flags));

    memset(&dsi, 0, sizeof(struct domain_setup_info));
    dsi.image_addr = (unsigned long)image_start;
    dsi.image_len  = image_len;

    printk("*** LOADING DOMAIN 0 ***\n");

    d->max_pages = ~0U;

    /*
     * If domain 0 allocation isn't specified, reserve 1/16th of available
     * memory for things like DMA buffers. This reservation is clamped to
     * a maximum of 128MB.
     */
    if ( dom0_nrpages == 0 )
    {
        dom0_nrpages = avail_domheap_pages() + initial_images_nrpages();
        dom0_nrpages = min(dom0_nrpages / 16, 128L << (20 - PAGE_SHIFT));
        dom0_nrpages = -dom0_nrpages;
    }

    /* Negative memory specification means "all memory - specified amount". */
    if ( dom0_nrpages < 0 )
        nr_pages = avail_domheap_pages() + initial_images_nrpages() +
                   dom0_nrpages;
    else
        nr_pages = dom0_nrpages;
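    /*
     * Worked example (illustrative figures): with roughly 4GB of domheap
     * memory available and no dom0_mem= option, 1/16th would be 256MB, so
     * the reservation is clamped to 128MB (32768 pages) and dom0_nrpages
     * becomes -32768; nr_pages then ends up as "everything minus 128MB".
     */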
    if ( (rc = parseelfimage(&dsi)) != 0 )
        return rc;

    if ( dsi.xen_section_string == NULL )
    {
        printk("Not a Xen-ELF image: '__xen_guest' section not found.\n");
        return -EINVAL;
    }

    dom0_pae = !!strstr(dsi.xen_section_string, "PAE=yes");
    xen_pae  = (CONFIG_PAGING_LEVELS == 3);
    if ( dom0_pae != xen_pae )
    {
        printk("PAE mode mismatch between Xen and DOM0 (xen=%s, dom0=%s)\n",
               xen_pae ? "yes" : "no", dom0_pae ? "yes" : "no");
        return -EINVAL;
    }

    if ( (p = strstr(dsi.xen_section_string, "FEATURES=")) != NULL )
    {
        parse_features(
            p + strlen("FEATURES="),
            dom0_features_supported,
            dom0_features_required);
        printk("Domain 0 kernel supports features = { %08x }.\n",
               dom0_features_supported[0]);
        printk("Domain 0 kernel requires features = { %08x }.\n",
               dom0_features_required[0]);
        if ( dom0_features_required[0] )
            panic("Domain 0 requires an unsupported hypervisor feature.\n");
    }

    /* Align load address to 4MB boundary. */
    dsi.v_start &= ~((1UL<<22)-1);

    /*
     * Why do we need this? The number of page-table frames depends on the
     * size of the bootstrap address space. But the size of the address space
     * depends on the number of page-table frames (since each one is mapped
     * read-only). We have a pair of simultaneous equations in two unknowns,
     * which we solve by exhaustive search.
     */
    vinitrd_start    = round_pgup(dsi.v_end);
    vinitrd_end      = vinitrd_start + initrd_len;
    vphysmap_start   = round_pgup(vinitrd_end);
    vphysmap_end     = vphysmap_start + (nr_pages * sizeof(unsigned long));
    vstartinfo_start = round_pgup(vphysmap_end);
    vstartinfo_end   = vstartinfo_start + PAGE_SIZE;
    vpt_start        = vstartinfo_end;
    for ( nr_pt_pages = 2; ; nr_pt_pages++ )
    {
        vpt_end      = vpt_start + (nr_pt_pages * PAGE_SIZE);
        vstack_start = vpt_end;
        vstack_end   = vstack_start + PAGE_SIZE;
        v_end        = (vstack_end + (1UL<<22)-1) & ~((1UL<<22)-1);
        if ( (v_end - vstack_end) < (512UL << 10) )
            v_end += 1UL << 22; /* Add extra 4MB to get >= 512kB padding. */
#if defined(__i386__) && !defined(CONFIG_X86_PAE)
        if ( (((v_end - dsi.v_start + ((1UL<<L2_PAGETABLE_SHIFT)-1)) >>
               L2_PAGETABLE_SHIFT) + 1) <= nr_pt_pages )
            break;
#elif defined(__i386__) && defined(CONFIG_X86_PAE)
        /* 5 pages: 1x 3rd + 4x 2nd level */
        if ( (((v_end - dsi.v_start + ((1UL<<L2_PAGETABLE_SHIFT)-1)) >>
               L2_PAGETABLE_SHIFT) + 5) <= nr_pt_pages )
            break;
#elif defined(__x86_64__)
#define NR(_l,_h,_s) \
    (((((_h) + ((1UL<<(_s))-1)) & ~((1UL<<(_s))-1)) - \
    ((_l) & ~((1UL<<(_s))-1))) >> (_s))
        if ( (1 + /* # L4 */
              NR(dsi.v_start, v_end, L4_PAGETABLE_SHIFT) + /* # L3 */
              NR(dsi.v_start, v_end, L3_PAGETABLE_SHIFT) + /* # L2 */
              NR(dsi.v_start, v_end, L2_PAGETABLE_SHIFT))  /* # L1 */
              <= nr_pt_pages )
            break;
#endif
    }
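    /*
     * Worked example (illustrative, PAE i386 case): for a bootstrap region of
     * about 12MB, each L1 page maps 2MB (L2_PAGETABLE_SHIFT == 21), so the
     * loop needs ceil(12MB / 2MB) = 6 L1 pages plus the fixed 5 (one L3 and
     * four L2 pages), i.e. it terminates once nr_pt_pages reaches 11. Growing
     * nr_pt_pages can also grow v_end, which is why the search is iterative.
     */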
    order = get_order_from_bytes(v_end - dsi.v_start);
    if ( (1UL << order) > nr_pages )
        panic("Domain 0 allocation is too small for kernel image.\n");

    /*
     * Allocate from DMA pool: on i386 this ensures that our low-memory 1:1
     * mapping covers the allocation.
     */
    if ( (page = alloc_domheap_pages(d, order, ALLOC_DOM_DMA)) == NULL )
        panic("Not enough RAM for domain 0 allocation.\n");
    alloc_spfn = page_to_mfn(page);
    alloc_epfn = alloc_spfn + d->tot_pages;

    printk("PHYSICAL MEMORY ARRANGEMENT:\n"
           " Dom0 alloc.: %"PRIpaddr"->%"PRIpaddr,
           pfn_to_paddr(alloc_spfn), pfn_to_paddr(alloc_epfn));
    if ( d->tot_pages < nr_pages )
        printk(" (%lu pages to be allocated)",
               nr_pages - d->tot_pages);
    printk("\nVIRTUAL MEMORY ARRANGEMENT:\n"
           " Loaded kernel: %p->%p\n"
           " Init. ramdisk: %p->%p\n"
           " Phys-Mach map: %p->%p\n"
           " Start info: %p->%p\n"
           " Page tables: %p->%p\n"
           " Boot stack: %p->%p\n"
           " TOTAL: %p->%p\n",
           _p(dsi.v_kernstart), _p(dsi.v_kernend),
           _p(vinitrd_start), _p(vinitrd_end),
           _p(vphysmap_start), _p(vphysmap_end),
           _p(vstartinfo_start), _p(vstartinfo_end),
           _p(vpt_start), _p(vpt_end),
           _p(vstack_start), _p(vstack_end),
           _p(dsi.v_start), _p(v_end));
    printk(" ENTRY ADDRESS: %p\n", _p(dsi.v_kernentry));

    if ( (v_end - dsi.v_start) > (nr_pages * PAGE_SIZE) )
    {
        printk("Initial guest OS requires too much space\n"
               "(%luMB is greater than %luMB limit)\n",
               (v_end-dsi.v_start)>>20, (nr_pages<<PAGE_SHIFT)>>20);
        return -ENOMEM;
    }

    mpt_alloc = (vpt_start - dsi.v_start) +
        (unsigned long)pfn_to_paddr(alloc_spfn);

    /*
     * We're basically forcing default RPLs to 1, so that our "what privilege
     * level are we returning to?" logic works.
     */
    v->arch.guest_context.kernel_ss = FLAT_KERNEL_SS;
    for ( i = 0; i < 256; i++ )
        v->arch.guest_context.trap_ctxt[i].cs = FLAT_KERNEL_CS;

#if defined(__i386__)

    v->arch.guest_context.failsafe_callback_cs = FLAT_KERNEL_CS;
    v->arch.guest_context.event_callback_cs    = FLAT_KERNEL_CS;

    /*
     * Protect the lowest 1GB of memory. We use a temporary mapping there
     * from which we copy the kernel and ramdisk images.
     */
    if ( dsi.v_start < (1UL<<30) )
    {
        printk("Initial loading isn't allowed to lowest 1GB of memory.\n");
        return -EINVAL;
    }

    /* WARNING: The new domain must have its 'processor' field filled in! */
#if CONFIG_PAGING_LEVELS == 3
    l3start = l3tab = (l3_pgentry_t *)mpt_alloc; mpt_alloc += PAGE_SIZE;
    l2start = l2tab = (l2_pgentry_t *)mpt_alloc; mpt_alloc += 4*PAGE_SIZE;
    memcpy(l2tab, idle_pg_table_l2, 4*PAGE_SIZE);
    for (i = 0; i < 4; i++) {
        l3tab[i] = l3e_from_paddr((u32)l2tab + i*PAGE_SIZE, L3_PROT);
        l2tab[(LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT)+i] =
            l2e_from_paddr((u32)l2tab + i*PAGE_SIZE, __PAGE_HYPERVISOR);
    }
    v->arch.guest_table = mk_pagetable((unsigned long)l3start);
#else
    l2start = l2tab = (l2_pgentry_t *)mpt_alloc; mpt_alloc += PAGE_SIZE;
    memcpy(l2tab, idle_pg_table, PAGE_SIZE);
    l2tab[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] =
        l2e_from_paddr((unsigned long)l2start, __PAGE_HYPERVISOR);
    v->arch.guest_table = mk_pagetable((unsigned long)l2start);
#endif
    for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
        l2tab[l2_linear_offset(PERDOMAIN_VIRT_START) + i] =
            l2e_from_page(virt_to_page(d->arch.mm_perdomain_pt) + i,
                          __PAGE_HYPERVISOR);

    l2tab += l2_linear_offset(dsi.v_start);
    mfn = alloc_spfn;
    for ( count = 0; count < ((v_end-dsi.v_start)>>PAGE_SHIFT); count++ )
    {
        if ( !((unsigned long)l1tab & (PAGE_SIZE-1)) )
        {
            l1start = l1tab = (l1_pgentry_t *)mpt_alloc;
            mpt_alloc += PAGE_SIZE;
            *l2tab = l2e_from_paddr((unsigned long)l1start, L2_PROT);
            l2tab++;
            clear_page(l1tab);
            if ( count == 0 )
                l1tab += l1_table_offset(dsi.v_start);
        }
        *l1tab = l1e_from_pfn(mfn, L1_PROT);
        l1tab++;

        page = mfn_to_page(mfn);
        if ( !get_page_and_type(page, d, PGT_writable_page) )
            BUG();

        mfn++;
    }

    /* Pages that are part of page tables must be read only. */
    l2tab = l2start + l2_linear_offset(vpt_start);
    l1start = l1tab = (l1_pgentry_t *)(u32)l2e_get_paddr(*l2tab);
    l1tab += l1_table_offset(vpt_start);
    for ( count = 0; count < nr_pt_pages; count++ )
    {
        page = mfn_to_page(l1e_get_pfn(*l1tab));
        if ( !opt_dom0_shadow )
            l1e_remove_flags(*l1tab, _PAGE_RW);
        else
            if ( !get_page_type(page, PGT_writable_page) )
                BUG();

#if CONFIG_PAGING_LEVELS == 3
        switch (count) {
        case 0:
            page->u.inuse.type_info &= ~PGT_type_mask;
            page->u.inuse.type_info |= PGT_l3_page_table;
            get_page(page, d); /* an extra ref because of readable mapping */

            /* Get another ref to L3 page so that it can be pinned. */
            if ( !get_page_and_type(page, d, PGT_l3_page_table) )
                BUG();
            set_bit(_PGT_pinned, &page->u.inuse.type_info);
            break;
        case 1 ... 4:
            page->u.inuse.type_info &= ~PGT_type_mask;
            page->u.inuse.type_info |= PGT_l2_page_table;
            page->u.inuse.type_info |=
                (count-1) << PGT_va_shift;
            get_page(page, d); /* an extra ref because of readable mapping */
            break;
        default:
            page->u.inuse.type_info &= ~PGT_type_mask;
            page->u.inuse.type_info |= PGT_l1_page_table;
            page->u.inuse.type_info |=
                ((dsi.v_start>>L2_PAGETABLE_SHIFT)+(count-5))<<PGT_va_shift;
            get_page(page, d); /* an extra ref because of readable mapping */
            break;
        }
#else
        if ( count == 0 )
        {
            page->u.inuse.type_info &= ~PGT_type_mask;
            page->u.inuse.type_info |= PGT_l2_page_table;

            /*
             * No longer writable: decrement the type_count.
             * Installed as CR3: increment both the ref_count and type_count.
             * Net: just increment the ref_count.
             */
            get_page(page, d); /* an extra ref because of readable mapping */

            /* Get another ref to L2 page so that it can be pinned. */
            if ( !get_page_and_type(page, d, PGT_l2_page_table) )
                BUG();
            set_bit(_PGT_pinned, &page->u.inuse.type_info);
        }
        else
        {
            page->u.inuse.type_info &= ~PGT_type_mask;
            page->u.inuse.type_info |= PGT_l1_page_table;
            page->u.inuse.type_info |=
                ((dsi.v_start>>L2_PAGETABLE_SHIFT)+(count-1))<<PGT_va_shift;

            /*
             * No longer writable: decrement the type_count.
             * This is an L1 page, installed in a validated L2 page:
             * increment both the ref_count and type_count.
             * Net: just increment the ref_count.
             */
            get_page(page, d); /* an extra ref because of readable mapping */
        }
#endif
        if ( !((unsigned long)++l1tab & (PAGE_SIZE - 1)) )
            l1start = l1tab = (l1_pgentry_t *)(u32)l2e_get_paddr(*++l2tab);
    }
#elif defined(__x86_64__)

    /* Overlap with Xen protected area? */
    if ( (dsi.v_start < HYPERVISOR_VIRT_END) &&
         (v_end > HYPERVISOR_VIRT_START) )
    {
        printk("DOM0 image overlaps with Xen private area.\n");
        return -EINVAL;
    }

    /* WARNING: The new domain must have its 'processor' field filled in! */
    maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l4_page_table;
    l4start = l4tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
    memcpy(l4tab, idle_pg_table, PAGE_SIZE);
    l4tab[l4_table_offset(LINEAR_PT_VIRT_START)] =
        l4e_from_paddr(__pa(l4start), __PAGE_HYPERVISOR);
    l4tab[l4_table_offset(PERDOMAIN_VIRT_START)] =
        l4e_from_paddr(__pa(d->arch.mm_perdomain_l3), __PAGE_HYPERVISOR);
    v->arch.guest_table = mk_pagetable(__pa(l4start));

    l4tab += l4_table_offset(dsi.v_start);
    mfn = alloc_spfn;
    for ( count = 0; count < ((v_end-dsi.v_start)>>PAGE_SHIFT); count++ )
    {
        if ( !((unsigned long)l1tab & (PAGE_SIZE-1)) )
        {
            maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l1_page_table;
            l1start = l1tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
            clear_page(l1tab);
            if ( count == 0 )
                l1tab += l1_table_offset(dsi.v_start);
            if ( !((unsigned long)l2tab & (PAGE_SIZE-1)) )
            {
                maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l2_page_table;
                l2start = l2tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
                clear_page(l2tab);
                if ( count == 0 )
                    l2tab += l2_table_offset(dsi.v_start);
                if ( !((unsigned long)l3tab & (PAGE_SIZE-1)) )
                {
                    maddr_to_page(mpt_alloc)->u.inuse.type_info =
                        PGT_l3_page_table;
                    l3start = l3tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
                    clear_page(l3tab);
                    if ( count == 0 )
                        l3tab += l3_table_offset(dsi.v_start);
                    *l4tab = l4e_from_paddr(__pa(l3start), L4_PROT);
                    l4tab++;
                }
                *l3tab = l3e_from_paddr(__pa(l2start), L3_PROT);
                l3tab++;
            }
            *l2tab = l2e_from_paddr(__pa(l1start), L2_PROT);
            l2tab++;
        }
        *l1tab = l1e_from_pfn(mfn, L1_PROT);
        l1tab++;

        page = mfn_to_page(mfn);
        if ( (page->u.inuse.type_info == 0) &&
             !get_page_and_type(page, d, PGT_writable_page) )
            BUG();

        mfn++;
    }

    /* Pages that are part of page tables must be read only. */
    l4tab = l4start + l4_table_offset(vpt_start);
    l3start = l3tab = l4e_to_l3e(*l4tab);
    l3tab += l3_table_offset(vpt_start);
    l2start = l2tab = l3e_to_l2e(*l3tab);
    l2tab += l2_table_offset(vpt_start);
    l1start = l1tab = l2e_to_l1e(*l2tab);
    l1tab += l1_table_offset(vpt_start);
    for ( count = 0; count < nr_pt_pages; count++ )
    {
        l1e_remove_flags(*l1tab, _PAGE_RW);
        page = mfn_to_page(l1e_get_pfn(*l1tab));

        /* Read-only mapping + PGC_allocated + page-table page. */
        page->count_info = PGC_allocated | 3;
        page->u.inuse.type_info |= PGT_validated | 1;

        /* Top-level p.t. is pinned. */
        if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_l4_page_table )
        {
            page->count_info += 1;
            page->u.inuse.type_info += 1 | PGT_pinned;
        }

        /* Iterate. */
        if ( !((unsigned long)++l1tab & (PAGE_SIZE - 1)) )
        {
            if ( !((unsigned long)++l2tab & (PAGE_SIZE - 1)) )
            {
                if ( !((unsigned long)++l3tab & (PAGE_SIZE - 1)) )
                    l3start = l3tab = l4e_to_l3e(*++l4tab);
                l2start = l2tab = l3e_to_l2e(*l3tab);
            }
            l1start = l1tab = l2e_to_l1e(*l2tab);
        }
    }
#endif /* __x86_64__ */

    /* Mask all upcalls... */
    for ( i = 0; i < MAX_VIRT_CPUS; i++ )
        d->shared_info->vcpu_info[i].evtchn_upcall_mask = 1;

    if ( opt_dom0_max_vcpus == 0 )
        opt_dom0_max_vcpus = num_online_cpus();
    if ( opt_dom0_max_vcpus > MAX_VIRT_CPUS )
        opt_dom0_max_vcpus = MAX_VIRT_CPUS;
    printk("Dom0 has maximum %u VCPUs\n", opt_dom0_max_vcpus);

    for ( i = 1; i < opt_dom0_max_vcpus; i++ )
        (void)alloc_vcpu(d, i, i);

    /* Set up monitor table */
    update_pagetables(v);

    /* Install the new page tables. */
    local_irq_disable();
    write_ptbase(v);

    /* Copy the OS image and free temporary buffer. */
    (void)loadelfimage(&dsi);

    p = strstr(dsi.xen_section_string, "HYPERCALL_PAGE=");
    if ( p != NULL )
    {
        p += strlen("HYPERCALL_PAGE=");
        hypercall_page = simple_strtoul(p, NULL, 16);
        hypercall_page = dsi.v_start + (hypercall_page << PAGE_SHIFT);
        if ( (hypercall_page < dsi.v_start) || (hypercall_page >= v_end) )
        {
            write_ptbase(current);
            local_irq_enable();
            printk("Invalid HYPERCALL_PAGE field in guest header.\n");
            return -1;
        }

        hypercall_page_initialise((void *)hypercall_page);
    }
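    /*
     * Example (hypothetical guest header value): "HYPERCALL_PAGE=2" is parsed
     * as hex page index 2, so the hypercall page is placed at
     * dsi.v_start + 0x2000 (with 4kB pages); a value that lands outside
     * [dsi.v_start, v_end) aborts domain construction.
     */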
    /* Copy the initial ramdisk. */
    if ( initrd_len != 0 )
        memcpy((void *)vinitrd_start, initrd_start, initrd_len);

    /* Free temporary buffers. */
    discard_initial_images();

    /* Set up start info area. */
    si = (start_info_t *)vstartinfo_start;
    memset(si, 0, PAGE_SIZE);
    si->nr_pages = nr_pages;

    si->shared_info = virt_to_maddr(d->shared_info);

    si->flags = SIF_PRIVILEGED | SIF_INITDOMAIN;
    si->pt_base = vpt_start;
    si->nr_pt_frames = nr_pt_pages;
    si->mfn_list = vphysmap_start;
    sprintf(si->magic, "xen-%i.%i-x86_%d%s",
            XEN_VERSION, XEN_SUBVERSION, BITS_PER_LONG, xen_pae ? "p" : "");
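    /*
     * Example (illustrative): on a PAE 32-bit build of Xen 3.0 the resulting
     * magic string would be "xen-3.0-x86_32p"; on x86_64 it would be
     * "xen-3.0-x86_64" (no "p" suffix, since xen_pae is false there).
     */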
    /* Write the phys->machine and machine->phys table entries. */
    for ( pfn = 0; pfn < d->tot_pages; pfn++ )
    {
        mfn = pfn + alloc_spfn;
#ifndef NDEBUG
#define REVERSE_START ((v_end - dsi.v_start) >> PAGE_SHIFT)
        if ( pfn > REVERSE_START )
            mfn = alloc_epfn - (pfn - REVERSE_START);
#endif
        ((unsigned long *)vphysmap_start)[pfn] = mfn;
        set_gpfn_from_mfn(mfn, pfn);
    }
    while ( pfn < nr_pages )
    {
        if ( (page = alloc_chunk(d, nr_pages - d->tot_pages)) == NULL )
            panic("Not enough RAM for DOM0 reservation.\n");
        while ( pfn < d->tot_pages )
        {
            mfn = page_to_mfn(page);
#ifndef NDEBUG
#define pfn (nr_pages - 1 - (pfn - (alloc_epfn - alloc_spfn)))
#endif
            ((unsigned long *)vphysmap_start)[pfn] = mfn;
            set_gpfn_from_mfn(mfn, pfn);
#undef pfn
            page++; pfn++;
        }
    }

    if ( initrd_len != 0 )
    {
        si->mod_start = vinitrd_start;
        si->mod_len = initrd_len;
        printk("Initrd len 0x%lx, start at 0x%lx\n",
               si->mod_len, si->mod_start);
    }

    memset(si->cmd_line, 0, sizeof(si->cmd_line));
    if ( cmdline != NULL )
        strncpy((char *)si->cmd_line, cmdline, sizeof(si->cmd_line)-1);

    /* Reinstate the caller's page tables. */
    write_ptbase(current);
    local_irq_enable();

#if defined(__i386__)
    /* Destroy low mappings - they were only for our convenience. */
    zap_low_mappings(l2start);
    zap_low_mappings(idle_pg_table_l2);
#endif

    update_domain_wallclock_time(d);

    set_bit(_VCPUF_initialised, &v->vcpu_flags);

    new_thread(v, dsi.v_kernentry, vstack_end, vstartinfo_start);

    if ( opt_dom0_shadow )
    {
        shadow_mode_enable(d, SHM_enable);
        update_pagetables(v);
    }

    if ( supervisor_mode_kernel )
    {
        v->arch.guest_context.kernel_ss &= ~3;
        v->arch.guest_context.user_regs.ss &= ~3;
        v->arch.guest_context.user_regs.es &= ~3;
        v->arch.guest_context.user_regs.ds &= ~3;
        v->arch.guest_context.user_regs.fs &= ~3;
        v->arch.guest_context.user_regs.gs &= ~3;
        printk("Dom0 runs in ring 0 (supervisor mode)\n");
        if ( !test_bit(XENFEAT_supervisor_mode_kernel,
                       dom0_features_supported) )
            panic("Dom0 does not support supervisor-mode execution\n");
    }
    else
    {
        if ( test_bit(XENFEAT_supervisor_mode_kernel, dom0_features_required) )
            panic("Dom0 requires supervisor-mode execution\n");
    }

    rc = 0;

    /* DOM0 is permitted full I/O capabilities. */
    rc |= ioports_permit_access(dom0, 0, 0xFFFF);
    rc |= iomem_permit_access(dom0, 0UL, ~0UL);
    rc |= irqs_permit_access(dom0, 0, NR_IRQS-1);

    /*
     * Modify I/O port access permissions.
     */
    /* Master Interrupt Controller (PIC). */
    rc |= ioports_deny_access(dom0, 0x20, 0x21);
    /* Slave Interrupt Controller (PIC). */
    rc |= ioports_deny_access(dom0, 0xA0, 0xA1);
    /* Interval Timer (PIT). */
    rc |= ioports_deny_access(dom0, 0x40, 0x43);
    /* PIT Channel 2 / PC Speaker Control. */
    rc |= ioports_deny_access(dom0, 0x61, 0x61);
    /* Command-line I/O ranges. */
    process_dom0_ioports_disable();

    /*
     * Modify I/O memory access permissions.
     */
    /* Local APIC. */
    if ( mp_lapic_addr != 0 )
    {
        mfn = paddr_to_pfn(mp_lapic_addr);
        rc |= iomem_deny_access(dom0, mfn, mfn);
    }
    /* I/O APICs. */
    for ( i = 0; i < nr_ioapics; i++ )
    {
        mfn = paddr_to_pfn(mp_ioapics[i].mpc_apicaddr);
        if ( smp_found_config )
            rc |= iomem_deny_access(dom0, mfn, mfn);
    }

    BUG_ON(rc != 0);

    return 0;
}
int elf_sanity_check(Elf_Ehdr *ehdr)
{
    if ( !IS_ELF(*ehdr) ||
#if defined(__i386__)
         (ehdr->e_ident[EI_CLASS] != ELFCLASS32) ||
         (ehdr->e_machine != EM_386) ||
#elif defined(__x86_64__)
         (ehdr->e_ident[EI_CLASS] != ELFCLASS64) ||
         (ehdr->e_machine != EM_X86_64) ||
#endif
         (ehdr->e_ident[EI_DATA] != ELFDATA2LSB) ||
         (ehdr->e_type != ET_EXEC) )
    {
        printk("DOM0 image is not a Xen-compatible Elf image.\n");
        return 0;
    }

    return 1;
}

/*
 * Local variables:
 * mode: C
 * c-set-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */