view direct-io.hg / xen/arch/x86/domain_build.c @ 11229:708b915caf78

[XEN] Revert changes to dom0 domain builder made by cset 11226.

We require that the dom0 domain builder still sets _PAGE_USER.

Signed-off-by: Ian Campbell <ian.campbell@xensource.com>
author Ian Campbell <ian.campbell@xensource.com>
date Tue Aug 22 18:47:29 2006 +0100 (2006-08-22)
parents fc5736e0a2eb
children 6aeec897da36
line source
1 /******************************************************************************
2 * domain_build.c
3 *
4 * Copyright (c) 2002-2005, K A Fraser
5 */
7 #include <xen/config.h>
8 #include <xen/init.h>
9 #include <xen/lib.h>
10 #include <xen/ctype.h>
11 #include <xen/sched.h>
12 #include <xen/smp.h>
13 #include <xen/delay.h>
14 #include <xen/event.h>
15 #include <xen/console.h>
16 #include <xen/elf.h>
17 #include <xen/kernel.h>
18 #include <xen/domain.h>
19 #include <xen/version.h>
20 #include <xen/iocap.h>
21 #include <xen/bitops.h>
22 #include <asm/regs.h>
23 #include <asm/system.h>
24 #include <asm/io.h>
25 #include <asm/processor.h>
26 #include <asm/desc.h>
27 #include <asm/i387.h>
28 #include <asm/shadow.h>
30 #include <public/version.h>
32 extern unsigned long initial_images_nrpages(void);
33 extern void discard_initial_images(void);
35 static long dom0_nrpages;
37 /*
38 * dom0_mem:
39 * If +ve:
40 * * The specified amount of memory is allocated to domain 0.
41 * If -ve:
42 * * All of memory is allocated to domain 0, minus the specified amount.
43 * If not specified:
44 * * All of memory is allocated to domain 0, minus 1/16th which is reserved
45 * for uses such as DMA buffers (the reservation is clamped to 128MB).
46 */
47 static void parse_dom0_mem(char *s)
48 {
49 unsigned long long bytes;
50 char *t = s;
51 if ( *s == '-' )
52 t++;
53 bytes = parse_size_and_unit(t);
54 dom0_nrpages = bytes >> PAGE_SHIFT;
55 if ( *s == '-' )
56 dom0_nrpages = -dom0_nrpages;
57 }
58 custom_param("dom0_mem", parse_dom0_mem);
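
To make the dom0_mem rule above concrete, here is a minimal standalone sketch (not Xen code) that maps a few example values to page counts. It assumes 4 KiB pages, and demo_parse_size() is a simplified stand-in for Xen's parse_size_and_unit().

    /* Standalone illustration only -- simplified stand-ins, not Xen code. */
    #include <stdio.h>
    #include <stdlib.h>

    #define PAGE_SHIFT 12   /* assume 4 KiB pages */

    /* Simplified stand-in for parse_size_and_unit(): bytes, optional K/M/G. */
    static unsigned long long demo_parse_size(const char *s)
    {
        char *end;
        unsigned long long n = strtoull(s, &end, 0);
        if ( *end == 'G' ) return n << 30;
        if ( *end == 'M' ) return n << 20;
        if ( *end == 'K' ) return n << 10;
        return n;
    }

    int main(void)
    {
        /* "dom0_mem=512M": exactly 512 MiB for domain 0. */
        long positive = demo_parse_size("512M") >> PAGE_SHIFT;

        /* "dom0_mem=-256M": all memory minus 256 MiB (kept as a negative count). */
        long negative = -(long)(demo_parse_size("256M") >> PAGE_SHIFT);

        /* Unspecified: reserve min(total/16, 128 MiB), e.g. on a 4 GiB machine. */
        long total   = demo_parse_size("4G") >> PAGE_SHIFT;
        long reserve = total / 16;
        long clamp   = demo_parse_size("128M") >> PAGE_SHIFT;
        if ( reserve > clamp )
            reserve = clamp;

        printf("512M    -> %ld pages\n", positive);                     /* 131072 */
        printf("-256M   -> %ld pages\n", negative);                     /* -65536 */
        printf("default -> total minus %ld reserved pages\n", reserve); /* 32768  */
        return 0;
    }
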
60 static unsigned int opt_dom0_max_vcpus;
61 integer_param("dom0_max_vcpus", opt_dom0_max_vcpus);
63 static unsigned int opt_dom0_shadow;
64 boolean_param("dom0_shadow", opt_dom0_shadow);
66 static char opt_dom0_ioports_disable[200] = "";
67 string_param("dom0_ioports_disable", opt_dom0_ioports_disable);
69 #if defined(__i386__)
70 /* No ring-3 access in initial leaf page tables. */
71 #define L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED)
72 #define L2_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
73 #define L3_PROT (_PAGE_PRESENT)
74 #elif defined(__x86_64__)
75 /* Allow ring-3 access in long mode as guest cannot use ring 1. */
76 #define L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_USER)
77 #define L2_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
78 #define L3_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
79 #define L4_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
80 #endif
82 #define round_pgup(_p) (((_p)+(PAGE_SIZE-1))&PAGE_MASK)
83 #define round_pgdown(_p) ((_p)&PAGE_MASK)
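
As a quick self-contained check of the two rounding macros (PAGE_SIZE assumed to be 4096, and the macros restated locally so the snippet compiles on its own):

    #include <assert.h>

    #define PAGE_SIZE 4096UL                       /* assumed page size */
    #define PAGE_MASK (~(PAGE_SIZE - 1))
    #define round_pgup(_p)   (((_p)+(PAGE_SIZE-1))&PAGE_MASK)
    #define round_pgdown(_p) ((_p)&PAGE_MASK)

    int main(void)
    {
        assert(round_pgup(0x12345UL)   == 0x13000UL); /* up to the next boundary   */
        assert(round_pgup(0x13000UL)   == 0x13000UL); /* already aligned: no-op    */
        assert(round_pgdown(0x12345UL) == 0x12000UL); /* down to the prev boundary */
        return 0;
    }
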
85 static struct page_info *alloc_chunk(struct domain *d, unsigned long max_pages)
86 {
87 struct page_info *page;
88 unsigned int order;
89 /*
90 * Allocate up to 2MB at a time: It prevents allocating very large chunks
91 * from DMA pools before the >4GB pool is fully depleted.
92 */
93 if ( max_pages > (2UL << (20 - PAGE_SHIFT)) )
94 max_pages = 2UL << (20 - PAGE_SHIFT);
95 order = get_order_from_pages(max_pages);
96 if ( (max_pages & (max_pages-1)) != 0 )
97 order--;
98 while ( (page = alloc_domheap_pages(d, order, 0)) == NULL )
99 if ( order-- == 0 )
100 break;
101 return page;
102 }
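
The order computed above can be illustrated with a standalone sketch (not Xen code). It assumes 4 KiB pages; demo_get_order_from_pages() stands in for Xen's get_order_from_pages(), i.e. the smallest order whose 2^order covers the request.

    #include <stdio.h>

    #define PAGE_SHIFT 12   /* assume 4 KiB pages */

    /* Stand-in for get_order_from_pages(): smallest order with 2^order >= nr. */
    static unsigned int demo_get_order_from_pages(unsigned long nr)
    {
        unsigned int order = 0;
        while ( (1UL << order) < nr )
            order++;
        return order;
    }

    static unsigned int chunk_order(unsigned long max_pages)
    {
        unsigned int order;

        /* Cap at 2 MiB (512 pages) per chunk, as alloc_chunk() does. */
        if ( max_pages > (2UL << (20 - PAGE_SHIFT)) )
            max_pages = 2UL << (20 - PAGE_SHIFT);

        order = demo_get_order_from_pages(max_pages);

        /* Round down when max_pages is not a power of two, so the chunk
         * never exceeds the number of pages still wanted. */
        if ( (max_pages & (max_pages-1)) != 0 )
            order--;

        return order;
    }

    int main(void)
    {
        printf("%u\n", chunk_order(512));  /* power of two: order 9 (2 MiB) */
        printf("%u\n", chunk_order(300));  /* order 9 rounded down to 8     */
        printf("%u\n", chunk_order(4096)); /* capped at 512 pages: order 9  */
        return 0;
    }

Note that alloc_chunk() then retries with ever smaller orders until alloc_domheap_pages() succeeds, giving up only after an order-0 allocation fails.
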
104 static void process_dom0_ioports_disable(void)
105 {
106 unsigned long io_from, io_to;
107 char *t, *u, *s = opt_dom0_ioports_disable;
109 if ( *s == '\0' )
110 return;
112 while ( (t = strsep(&s, ",")) != NULL )
113 {
114 io_from = simple_strtoul(t, &u, 16);
115 if ( u == t )
116 {
117 parse_error:
118 printk("Invalid ioport range <%s> "
119 "in dom0_ioports_disable, skipping\n", t);
120 continue;
121 }
123 if ( *u == '\0' )
124 io_to = io_from;
125 else if ( *u == '-' )
126 io_to = simple_strtoul(u + 1, &u, 16);
127 else
128 goto parse_error;
130 if ( (*u != '\0') || (io_to < io_from) || (io_to >= 65536) )
131 goto parse_error;
133 printk("Disabling dom0 access to ioport range %04lx-%04lx\n",
134 io_from, io_to);
136 if ( ioports_deny_access(dom0, io_from, io_to) != 0 )
137 BUG();
138 }
139 }
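
For reference, the parser above consumes the dom0_ioports_disable option declared a little earlier: a comma-separated list of hexadecimal port ranges, each entry either a single port or a from-to pair below 0x10000, with malformed entries reported and skipped. An illustrative (hypothetical) setting on the Xen command line:

    dom0_ioports_disable=02f8-02ff,0378

This would deny domain 0 access to ports 0x2f8-0x2ff and to the single port 0x378.
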
141 static const char *feature_names[XENFEAT_NR_SUBMAPS*32] = {
142 [XENFEAT_writable_page_tables] = "writable_page_tables",
143 [XENFEAT_writable_descriptor_tables] = "writable_descriptor_tables",
144 [XENFEAT_auto_translated_physmap] = "auto_translated_physmap",
145 [XENFEAT_supervisor_mode_kernel] = "supervisor_mode_kernel",
146 [XENFEAT_pae_pgdir_above_4gb] = "pae_pgdir_above_4gb"
147 };
149 static void parse_features(
150 const char *feats,
151 uint32_t supported[XENFEAT_NR_SUBMAPS],
152 uint32_t required[XENFEAT_NR_SUBMAPS])
153 {
154 const char *end, *p;
155 int i, req;
157 if ( (end = strchr(feats, ',')) == NULL )
158 end = feats + strlen(feats);
160 while ( feats < end )
161 {
162 p = strchr(feats, '|');
163 if ( (p == NULL) || (p > end) )
164 p = end;
166 req = (*feats == '!');
167 if ( req )
168 feats++;
170 for ( i = 0; i < XENFEAT_NR_SUBMAPS*32; i++ )
171 {
172 if ( feature_names[i] == NULL )
173 continue;
175 if ( strncmp(feature_names[i], feats, p-feats) == 0 )
176 {
177 set_bit(i, supported);
178 if ( req )
179 set_bit(i, required);
180 break;
181 }
182 }
184 if ( i == XENFEAT_NR_SUBMAPS*32 )
185 {
186 printk("Unknown kernel feature \"%.*s\".\n",
187 (int)(p-feats), feats);
188 if ( req )
189 panic("Domain 0 requires an unknown hypervisor feature.\n");
190 }
192 feats = p;
193 if ( *feats == '|' )
194 feats++;
195 }
196 }
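
The string handed to parse_features() comes from the FEATURES= key of the guest's __xen_guest section: names from the table above joined with '|' and terminated by the next comma, with a leading '!' marking a feature the kernel requires rather than merely supports (construct_dom0() below refuses to boot if any required feature ends up set). An illustrative value:

    FEATURES=writable_page_tables|pae_pgdir_above_4gb
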
198 int construct_dom0(struct domain *d,
199 unsigned long _image_start, unsigned long image_len,
200 unsigned long _initrd_start, unsigned long initrd_len,
201 char *cmdline)
202 {
203 int i, rc, dom0_pae, xen_pae, order;
204 unsigned long pfn, mfn;
205 unsigned long nr_pages;
206 unsigned long nr_pt_pages;
207 unsigned long alloc_spfn;
208 unsigned long alloc_epfn;
209 unsigned long count;
210 struct page_info *page = NULL;
211 start_info_t *si;
212 struct vcpu *v = d->vcpu[0];
213 char *p;
214 unsigned long hypercall_page;
215 #if defined(__i386__)
216 char *image_start = (char *)_image_start; /* use lowmem mappings */
217 char *initrd_start = (char *)_initrd_start; /* use lowmem mappings */
218 #elif defined(__x86_64__)
219 char *image_start = __va(_image_start);
220 char *initrd_start = __va(_initrd_start);
221 #endif
222 #if CONFIG_PAGING_LEVELS >= 4
223 l4_pgentry_t *l4tab = NULL, *l4start = NULL;
224 #endif
225 #if CONFIG_PAGING_LEVELS >= 3
226 l3_pgentry_t *l3tab = NULL, *l3start = NULL;
227 #endif
228 l2_pgentry_t *l2tab = NULL, *l2start = NULL;
229 l1_pgentry_t *l1tab = NULL, *l1start = NULL;
231 /*
232 * This fully describes the memory layout of the initial domain. All
233 * *_start addresses are page-aligned, except v_start (and v_end) which are
234 * superpage-aligned.
235 */
236 struct domain_setup_info dsi;
237 unsigned long vinitrd_start;
238 unsigned long vinitrd_end;
239 unsigned long vphysmap_start;
240 unsigned long vphysmap_end;
241 unsigned long vstartinfo_start;
242 unsigned long vstartinfo_end;
243 unsigned long vstack_start;
244 unsigned long vstack_end;
245 unsigned long vpt_start;
246 unsigned long vpt_end;
247 unsigned long v_end;
249 /* Machine address of next candidate page-table page. */
250 unsigned long mpt_alloc;
252 /* Features supported. */
253 uint32_t dom0_features_supported[XENFEAT_NR_SUBMAPS] = { 0 };
254 uint32_t dom0_features_required[XENFEAT_NR_SUBMAPS] = { 0 };
256 /* Sanity! */
257 BUG_ON(d->domain_id != 0);
258 BUG_ON(d->vcpu[0] == NULL);
259 BUG_ON(test_bit(_VCPUF_initialised, &v->vcpu_flags));
261 memset(&dsi, 0, sizeof(struct domain_setup_info));
262 dsi.image_addr = (unsigned long)image_start;
263 dsi.image_len = image_len;
265 printk("*** LOADING DOMAIN 0 ***\n");
267 d->max_pages = ~0U;
269 /*
270 * If domain 0 allocation isn't specified, reserve 1/16th of available
271 * memory for things like DMA buffers. This reservation is clamped to
272 * a maximum of 128MB.
273 */
274 if ( dom0_nrpages == 0 )
275 {
276 dom0_nrpages = avail_domheap_pages() + initial_images_nrpages();
277 dom0_nrpages = min(dom0_nrpages / 16, 128L << (20 - PAGE_SHIFT));
278 dom0_nrpages = -dom0_nrpages;
279 }
281 /* Negative memory specification means "all memory - specified amount". */
282 if ( dom0_nrpages < 0 )
283 nr_pages = avail_domheap_pages() + initial_images_nrpages() +
284 dom0_nrpages;
285 else
286 nr_pages = dom0_nrpages;
288 if ( (rc = parseelfimage(&dsi)) != 0 )
289 return rc;
291 if ( dsi.xen_section_string == NULL )
292 {
293 printk("Not a Xen-ELF image: '__xen_guest' section not found.\n");
294 return -EINVAL;
295 }
297 dom0_pae = !!strstr(dsi.xen_section_string, "PAE=yes");
298 xen_pae = (CONFIG_PAGING_LEVELS == 3);
299 if ( dom0_pae != xen_pae )
300 {
301 printk("PAE mode mismatch between Xen and DOM0 (xen=%s, dom0=%s)\n",
302 xen_pae ? "yes" : "no", dom0_pae ? "yes" : "no");
303 return -EINVAL;
304 }
306 if ( xen_pae && !!strstr(dsi.xen_section_string, "PAE=yes[extended-cr3]") )
307 set_bit(VMASST_TYPE_pae_extended_cr3, &d->vm_assist);
309 if ( (p = strstr(dsi.xen_section_string, "FEATURES=")) != NULL )
310 {
311 parse_features(
312 p + strlen("FEATURES="),
313 dom0_features_supported,
314 dom0_features_required);
315 printk("Domain 0 kernel supports features = { %08x }.\n",
316 dom0_features_supported[0]);
317 printk("Domain 0 kernel requires features = { %08x }.\n",
318 dom0_features_required[0]);
319 if ( dom0_features_required[0] )
320 panic("Domain 0 requires an unsupported hypervisor feature.\n");
321 }
323 /* Align load address to 4MB boundary. */
324 dsi.v_start &= ~((1UL<<22)-1);
326 /*
327 * Why do we need this? The number of page-table frames depends on the
328 * size of the bootstrap address space. But the size of the address space
329 * depends on the number of page-table frames (since each one is mapped
330 * read-only). We have a pair of simultaneous equations in two unknowns,
331 * which we solve by exhaustive search.
332 */
333 vinitrd_start = round_pgup(dsi.v_end);
334 vinitrd_end = vinitrd_start + initrd_len;
335 vphysmap_start = round_pgup(vinitrd_end);
336 vphysmap_end = vphysmap_start + (nr_pages * sizeof(unsigned long));
337 vstartinfo_start = round_pgup(vphysmap_end);
338 vstartinfo_end = (vstartinfo_start +
339 sizeof(struct start_info) +
340 sizeof(struct dom0_vga_console_info));
341 vpt_start = round_pgup(vstartinfo_end);
342 for ( nr_pt_pages = 2; ; nr_pt_pages++ )
343 {
344 vpt_end = vpt_start + (nr_pt_pages * PAGE_SIZE);
345 vstack_start = vpt_end;
346 vstack_end = vstack_start + PAGE_SIZE;
347 v_end = (vstack_end + (1UL<<22)-1) & ~((1UL<<22)-1);
348 if ( (v_end - vstack_end) < (512UL << 10) )
349 v_end += 1UL << 22; /* Add extra 4MB to get >= 512kB padding. */
350 #if defined(__i386__) && !defined(CONFIG_X86_PAE)
351 if ( (((v_end - dsi.v_start + ((1UL<<L2_PAGETABLE_SHIFT)-1)) >>
352 L2_PAGETABLE_SHIFT) + 1) <= nr_pt_pages )
353 break;
354 #elif defined(__i386__) && defined(CONFIG_X86_PAE)
355 /* 5 pages: 1x 3rd + 4x 2nd level */
356 if ( (((v_end - dsi.v_start + ((1UL<<L2_PAGETABLE_SHIFT)-1)) >>
357 L2_PAGETABLE_SHIFT) + 5) <= nr_pt_pages )
358 break;
359 #elif defined(__x86_64__)
360 #define NR(_l,_h,_s) \
361 (((((_h) + ((1UL<<(_s))-1)) & ~((1UL<<(_s))-1)) - \
362 ((_l) & ~((1UL<<(_s))-1))) >> (_s))
363 if ( (1 + /* # L4 */
364 NR(dsi.v_start, v_end, L4_PAGETABLE_SHIFT) + /* # L3 */
365 NR(dsi.v_start, v_end, L3_PAGETABLE_SHIFT) + /* # L2 */
366 NR(dsi.v_start, v_end, L2_PAGETABLE_SHIFT)) /* # L1 */
367 <= nr_pt_pages )
368 break;
369 #endif
370 }
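
As a worked illustration of the x86_64 branch (figures assume 4 KiB pages and the usual 21/30/39-bit L2/L3/L4 shifts): a 64 MiB bootstrap range that starts on a 2 MiB boundary and crosses no 1 GiB boundary needs one L4 page, one L3 page, one L2 page and 32 L1 pages (one per 2 MiB), so the loop settles at nr_pt_pages = 1 + 1 + 1 + 32 = 35. The iteration is needed because v_end itself grows with nr_pt_pages (the page tables are mapped read-only inside the bootstrap region), which is exactly the circular dependency the comment above describes.
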
372 order = get_order_from_bytes(v_end - dsi.v_start);
373 if ( (1UL << order) > nr_pages )
374 panic("Domain 0 allocation is too small for kernel image.\n");
376 /*
377 * Allocate from DMA pool: on i386 this ensures that our low-memory 1:1
378 * mapping covers the allocation.
379 */
380 if ( (page = alloc_domheap_pages(d, order, MEMF_dma)) == NULL )
381 panic("Not enough RAM for domain 0 allocation.\n");
382 alloc_spfn = page_to_mfn(page);
383 alloc_epfn = alloc_spfn + d->tot_pages;
385 printk("PHYSICAL MEMORY ARRANGEMENT:\n"
386 " Dom0 alloc.: %"PRIpaddr"->%"PRIpaddr,
387 pfn_to_paddr(alloc_spfn), pfn_to_paddr(alloc_epfn));
388 if ( d->tot_pages < nr_pages )
389 printk(" (%lu pages to be allocated)",
390 nr_pages - d->tot_pages);
391 printk("\nVIRTUAL MEMORY ARRANGEMENT:\n"
392 " Loaded kernel: %p->%p\n"
393 " Init. ramdisk: %p->%p\n"
394 " Phys-Mach map: %p->%p\n"
395 " Start info: %p->%p\n"
396 " Page tables: %p->%p\n"
397 " Boot stack: %p->%p\n"
398 " TOTAL: %p->%p\n",
399 _p(dsi.v_kernstart), _p(dsi.v_kernend),
400 _p(vinitrd_start), _p(vinitrd_end),
401 _p(vphysmap_start), _p(vphysmap_end),
402 _p(vstartinfo_start), _p(vstartinfo_end),
403 _p(vpt_start), _p(vpt_end),
404 _p(vstack_start), _p(vstack_end),
405 _p(dsi.v_start), _p(v_end));
406 printk(" ENTRY ADDRESS: %p\n", _p(dsi.v_kernentry));
408 if ( (v_end - dsi.v_start) > (nr_pages * PAGE_SIZE) )
409 {
410 printk("Initial guest OS requires too much space\n"
411 "(%luMB is greater than %luMB limit)\n",
412 (v_end-dsi.v_start)>>20, (nr_pages<<PAGE_SHIFT)>>20);
413 return -ENOMEM;
414 }
416 mpt_alloc = (vpt_start - dsi.v_start) +
417 (unsigned long)pfn_to_paddr(alloc_spfn);
419 /*
420 * We're basically forcing default RPLs to 1, so that our "what privilege
421 * level are we returning to?" logic works.
422 */
423 v->arch.guest_context.kernel_ss = FLAT_KERNEL_SS;
424 for ( i = 0; i < 256; i++ )
425 v->arch.guest_context.trap_ctxt[i].cs = FLAT_KERNEL_CS;
427 #if defined(__i386__)
429 v->arch.guest_context.failsafe_callback_cs = FLAT_KERNEL_CS;
430 v->arch.guest_context.event_callback_cs = FLAT_KERNEL_CS;
432 /*
433 * Protect the lowest 1GB of memory. We use a temporary mapping there
434 * from which we copy the kernel and ramdisk images.
435 */
436 if ( dsi.v_start < (1UL<<30) )
437 {
438 printk("Initial loading isn't allowed to lowest 1GB of memory.\n");
439 return -EINVAL;
440 }
442 /* WARNING: The new domain must have its 'processor' field filled in! */
443 #if CONFIG_PAGING_LEVELS == 3
444 l3start = l3tab = (l3_pgentry_t *)mpt_alloc; mpt_alloc += PAGE_SIZE;
445 l2start = l2tab = (l2_pgentry_t *)mpt_alloc; mpt_alloc += 4*PAGE_SIZE;
446 memcpy(l2tab, idle_pg_table_l2, 4*PAGE_SIZE);
447 for (i = 0; i < 4; i++) {
448 l3tab[i] = l3e_from_paddr((u32)l2tab + i*PAGE_SIZE, L3_PROT);
449 l2tab[(LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT)+i] =
450 l2e_from_paddr((u32)l2tab + i*PAGE_SIZE, __PAGE_HYPERVISOR);
451 }
452 v->arch.guest_table = pagetable_from_paddr((unsigned long)l3start);
453 #else
454 l2start = l2tab = (l2_pgentry_t *)mpt_alloc; mpt_alloc += PAGE_SIZE;
455 memcpy(l2tab, idle_pg_table, PAGE_SIZE);
456 l2tab[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] =
457 l2e_from_paddr((unsigned long)l2start, __PAGE_HYPERVISOR);
458 v->arch.guest_table = pagetable_from_paddr((unsigned long)l2start);
459 #endif
461 for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
462 l2tab[l2_linear_offset(PERDOMAIN_VIRT_START) + i] =
463 l2e_from_page(virt_to_page(d->arch.mm_perdomain_pt) + i,
464 __PAGE_HYPERVISOR);
466 l2tab += l2_linear_offset(dsi.v_start);
467 mfn = alloc_spfn;
468 for ( count = 0; count < ((v_end-dsi.v_start)>>PAGE_SHIFT); count++ )
469 {
470 if ( !((unsigned long)l1tab & (PAGE_SIZE-1)) )
471 {
472 l1start = l1tab = (l1_pgentry_t *)mpt_alloc;
473 mpt_alloc += PAGE_SIZE;
474 *l2tab = l2e_from_paddr((unsigned long)l1start, L2_PROT);
475 l2tab++;
476 clear_page(l1tab);
477 if ( count == 0 )
478 l1tab += l1_table_offset(dsi.v_start);
479 }
480 *l1tab = l1e_from_pfn(mfn, L1_PROT);
481 l1tab++;
483 page = mfn_to_page(mfn);
484 if ( !get_page_and_type(page, d, PGT_writable_page) )
485 BUG();
487 mfn++;
488 }
490 /* Pages that are part of page tables must be read only. */
491 l2tab = l2start + l2_linear_offset(vpt_start);
492 l1start = l1tab = (l1_pgentry_t *)(u32)l2e_get_paddr(*l2tab);
493 l1tab += l1_table_offset(vpt_start);
494 for ( count = 0; count < nr_pt_pages; count++ )
495 {
496 page = mfn_to_page(l1e_get_pfn(*l1tab));
497 if ( !opt_dom0_shadow )
498 l1e_remove_flags(*l1tab, _PAGE_RW);
499 else
500 if ( !get_page_type(page, PGT_writable_page) )
501 BUG();
503 #if CONFIG_PAGING_LEVELS == 3
504 switch (count) {
505 case 0:
506 page->u.inuse.type_info &= ~PGT_type_mask;
507 page->u.inuse.type_info |= PGT_l3_page_table;
508 get_page(page, d); /* an extra ref because of readable mapping */
510 /* Get another ref to L3 page so that it can be pinned. */
511 if ( !get_page_and_type(page, d, PGT_l3_page_table) )
512 BUG();
513 set_bit(_PGT_pinned, &page->u.inuse.type_info);
514 break;
515 case 1 ... 4:
516 page->u.inuse.type_info &= ~PGT_type_mask;
517 page->u.inuse.type_info |= PGT_l2_page_table;
518 page->u.inuse.type_info |=
519 (count-1) << PGT_va_shift;
520 get_page(page, d); /* an extra ref because of readable mapping */
521 break;
522 default:
523 page->u.inuse.type_info &= ~PGT_type_mask;
524 page->u.inuse.type_info |= PGT_l1_page_table;
525 page->u.inuse.type_info |=
526 ((dsi.v_start>>L2_PAGETABLE_SHIFT)+(count-5))<<PGT_va_shift;
527 get_page(page, d); /* an extra ref because of readable mapping */
528 break;
529 }
530 #else
531 if ( count == 0 )
532 {
533 page->u.inuse.type_info &= ~PGT_type_mask;
534 page->u.inuse.type_info |= PGT_l2_page_table;
536 /*
537 * No longer writable: decrement the type_count.
538 * Installed as CR3: increment both the ref_count and type_count.
539 * Net: just increment the ref_count.
540 */
541 get_page(page, d); /* an extra ref because of readable mapping */
543 /* Get another ref to L2 page so that it can be pinned. */
544 if ( !get_page_and_type(page, d, PGT_l2_page_table) )
545 BUG();
546 set_bit(_PGT_pinned, &page->u.inuse.type_info);
547 }
548 else
549 {
550 page->u.inuse.type_info &= ~PGT_type_mask;
551 page->u.inuse.type_info |= PGT_l1_page_table;
552 page->u.inuse.type_info |=
553 ((dsi.v_start>>L2_PAGETABLE_SHIFT)+(count-1))<<PGT_va_shift;
555 /*
556 * No longer writable: decrement the type_count.
557 * This is an L1 page, installed in a validated L2 page:
558 * increment both the ref_count and type_count.
559 * Net: just increment the ref_count.
560 */
561 get_page(page, d); /* an extra ref because of readable mapping */
562 }
563 #endif
564 if ( !((unsigned long)++l1tab & (PAGE_SIZE - 1)) )
565 l1start = l1tab = (l1_pgentry_t *)(u32)l2e_get_paddr(*++l2tab);
566 }
568 #elif defined(__x86_64__)
570 /* Overlap with Xen protected area? */
571 if ( (dsi.v_start < HYPERVISOR_VIRT_END) &&
572 (v_end > HYPERVISOR_VIRT_START) )
573 {
574 printk("DOM0 image overlaps with Xen private area.\n");
575 return -EINVAL;
576 }
578 /* WARNING: The new domain must have its 'processor' field filled in! */
579 maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l4_page_table;
580 l4start = l4tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
581 memcpy(l4tab, idle_pg_table, PAGE_SIZE);
582 l4tab[l4_table_offset(LINEAR_PT_VIRT_START)] =
583 l4e_from_paddr(__pa(l4start), __PAGE_HYPERVISOR);
584 l4tab[l4_table_offset(PERDOMAIN_VIRT_START)] =
585 l4e_from_paddr(__pa(d->arch.mm_perdomain_l3), __PAGE_HYPERVISOR);
586 v->arch.guest_table = pagetable_from_paddr(__pa(l4start));
588 l4tab += l4_table_offset(dsi.v_start);
589 mfn = alloc_spfn;
590 for ( count = 0; count < ((v_end-dsi.v_start)>>PAGE_SHIFT); count++ )
591 {
592 if ( !((unsigned long)l1tab & (PAGE_SIZE-1)) )
593 {
594 maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l1_page_table;
595 l1start = l1tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
596 clear_page(l1tab);
597 if ( count == 0 )
598 l1tab += l1_table_offset(dsi.v_start);
599 if ( !((unsigned long)l2tab & (PAGE_SIZE-1)) )
600 {
601 maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l2_page_table;
602 l2start = l2tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
603 clear_page(l2tab);
604 if ( count == 0 )
605 l2tab += l2_table_offset(dsi.v_start);
606 if ( !((unsigned long)l3tab & (PAGE_SIZE-1)) )
607 {
608 maddr_to_page(mpt_alloc)->u.inuse.type_info =
609 PGT_l3_page_table;
610 l3start = l3tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
611 clear_page(l3tab);
612 if ( count == 0 )
613 l3tab += l3_table_offset(dsi.v_start);
614 *l4tab = l4e_from_paddr(__pa(l3start), L4_PROT);
615 l4tab++;
616 }
617 *l3tab = l3e_from_paddr(__pa(l2start), L3_PROT);
618 l3tab++;
619 }
620 *l2tab = l2e_from_paddr(__pa(l1start), L2_PROT);
621 l2tab++;
622 }
623 *l1tab = l1e_from_pfn(mfn, L1_PROT);
624 l1tab++;
626 page = mfn_to_page(mfn);
627 if ( (page->u.inuse.type_info == 0) &&
628 !get_page_and_type(page, d, PGT_writable_page) )
629 BUG();
631 mfn++;
632 }
634 /* Pages that are part of page tables must be read only. */
635 l4tab = l4start + l4_table_offset(vpt_start);
636 l3start = l3tab = l4e_to_l3e(*l4tab);
637 l3tab += l3_table_offset(vpt_start);
638 l2start = l2tab = l3e_to_l2e(*l3tab);
639 l2tab += l2_table_offset(vpt_start);
640 l1start = l1tab = l2e_to_l1e(*l2tab);
641 l1tab += l1_table_offset(vpt_start);
642 for ( count = 0; count < nr_pt_pages; count++ )
643 {
644 l1e_remove_flags(*l1tab, _PAGE_RW);
645 page = mfn_to_page(l1e_get_pfn(*l1tab));
647 /* Read-only mapping + PGC_allocated + page-table page. */
648 page->count_info = PGC_allocated | 3;
649 page->u.inuse.type_info |= PGT_validated | 1;
651 /* Top-level p.t. is pinned. */
652 if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_l4_page_table )
653 {
654 page->count_info += 1;
655 page->u.inuse.type_info += 1 | PGT_pinned;
656 }
658 /* Iterate. */
659 if ( !((unsigned long)++l1tab & (PAGE_SIZE - 1)) )
660 {
661 if ( !((unsigned long)++l2tab & (PAGE_SIZE - 1)) )
662 {
663 if ( !((unsigned long)++l3tab & (PAGE_SIZE - 1)) )
664 l3start = l3tab = l4e_to_l3e(*++l4tab);
665 l2start = l2tab = l3e_to_l2e(*l3tab);
666 }
667 l1start = l1tab = l2e_to_l1e(*l2tab);
668 }
669 }
671 #endif /* __x86_64__ */
673 /* Mask all upcalls... */
674 for ( i = 0; i < MAX_VIRT_CPUS; i++ )
675 d->shared_info->vcpu_info[i].evtchn_upcall_mask = 1;
677 if ( opt_dom0_max_vcpus == 0 )
678 opt_dom0_max_vcpus = num_online_cpus();
679 if ( opt_dom0_max_vcpus > MAX_VIRT_CPUS )
680 opt_dom0_max_vcpus = MAX_VIRT_CPUS;
681 printk("Dom0 has maximum %u VCPUs\n", opt_dom0_max_vcpus);
683 for ( i = 1; i < opt_dom0_max_vcpus; i++ )
684 (void)alloc_vcpu(d, i, i);
686 /* Set up CR3 value for write_ptbase */
687 if ( shadow2_mode_enabled(v->domain) )
688 shadow2_update_paging_modes(v);
689 else
690 update_cr3(v);
692 /* Install the new page tables. */
693 local_irq_disable();
694 write_ptbase(v);
696 /* Copy the OS image and free temporary buffer. */
697 (void)loadelfimage(&dsi);
699 p = strstr(dsi.xen_section_string, "HYPERCALL_PAGE=");
700 if ( p != NULL )
701 {
702 p += strlen("HYPERCALL_PAGE=");
703 hypercall_page = simple_strtoul(p, NULL, 16);
704 printk("(1) hypercall page is %#lx\n", hypercall_page);
705 hypercall_page = dsi.v_start + (hypercall_page << PAGE_SHIFT);
706 printk("(2) hypercall page is %#lx dsi.v_start is %#lx\n", hypercall_page, dsi.v_start);
707 if ( (hypercall_page < dsi.v_start) || (hypercall_page >= v_end) )
708 {
709 write_ptbase(current);
710 local_irq_enable();
711 printk("Invalid HYPERCALL_PAGE field in guest header.\n");
712 return -1;
713 }
715 printk("(3) hypercall page is %#lx\n", hypercall_page);
716 hypercall_page_initialise(d, (void *)hypercall_page);
717 }
719 /* Copy the initial ramdisk. */
720 if ( initrd_len != 0 )
721 memcpy((void *)vinitrd_start, initrd_start, initrd_len);
723 /* Free temporary buffers. */
724 discard_initial_images();
726 /* Set up start info area. */
727 si = (start_info_t *)vstartinfo_start;
728 memset(si, 0, PAGE_SIZE);
729 si->nr_pages = nr_pages;
731 si->shared_info = virt_to_maddr(d->shared_info);
733 si->flags = SIF_PRIVILEGED | SIF_INITDOMAIN;
734 si->pt_base = vpt_start;
735 si->nr_pt_frames = nr_pt_pages;
736 si->mfn_list = vphysmap_start;
737 sprintf(si->magic, "xen-%i.%i-x86_%d%s",
738 xen_major_version(), xen_minor_version(),
739 BITS_PER_LONG, xen_pae ? "p" : "");
741 /* Write the phys->machine and machine->phys table entries. */
742 for ( pfn = 0; pfn < d->tot_pages; pfn++ )
743 {
744 mfn = pfn + alloc_spfn;
745 #ifndef NDEBUG
746 #define REVERSE_START ((v_end - dsi.v_start) >> PAGE_SHIFT)
747 if ( pfn > REVERSE_START )
748 mfn = alloc_epfn - (pfn - REVERSE_START);
749 #endif
750 ((unsigned long *)vphysmap_start)[pfn] = mfn;
751 set_gpfn_from_mfn(mfn, pfn);
752 }
753 while ( pfn < nr_pages )
754 {
755 if ( (page = alloc_chunk(d, nr_pages - d->tot_pages)) == NULL )
756 panic("Not enough RAM for DOM0 reservation.\n");
757 while ( pfn < d->tot_pages )
758 {
759 mfn = page_to_mfn(page);
760 #ifndef NDEBUG
761 #define pfn (nr_pages - 1 - (pfn - (alloc_epfn - alloc_spfn)))
762 #endif
763 ((unsigned long *)vphysmap_start)[pfn] = mfn;
764 set_gpfn_from_mfn(mfn, pfn);
765 #undef pfn
766 page++; pfn++;
767 }
768 }
770 if ( initrd_len != 0 )
771 {
772 si->mod_start = vinitrd_start;
773 si->mod_len = initrd_len;
774 printk("Initrd len 0x%lx, start at 0x%lx\n",
775 si->mod_len, si->mod_start);
776 }
778 memset(si->cmd_line, 0, sizeof(si->cmd_line));
779 if ( cmdline != NULL )
780 strncpy((char *)si->cmd_line, cmdline, sizeof(si->cmd_line)-1);
782 if ( fill_console_start_info((void *)(si + 1)) )
783 {
784 si->console.dom0.info_off = sizeof(struct start_info);
785 si->console.dom0.info_size = sizeof(struct dom0_vga_console_info);
786 }
788 /* Reinstate the caller's page tables. */
789 write_ptbase(current);
790 local_irq_enable();
792 #if defined(__i386__)
793 /* Destroy low mappings - they were only for our convenience. */
794 zap_low_mappings(l2start);
795 zap_low_mappings(idle_pg_table_l2);
796 #endif
798 update_domain_wallclock_time(d);
800 set_bit(_VCPUF_initialised, &v->vcpu_flags);
802 new_thread(v, dsi.v_kernentry, vstack_end, vstartinfo_start);
804 if ( opt_dom0_shadow )
805 if ( shadow2_test_enable(d) == 0 )
806 shadow2_update_paging_modes(v);
808 if ( supervisor_mode_kernel )
809 {
810 v->arch.guest_context.kernel_ss &= ~3;
811 v->arch.guest_context.user_regs.ss &= ~3;
812 v->arch.guest_context.user_regs.es &= ~3;
813 v->arch.guest_context.user_regs.ds &= ~3;
814 v->arch.guest_context.user_regs.fs &= ~3;
815 v->arch.guest_context.user_regs.gs &= ~3;
816 printk("Dom0 runs in ring 0 (supervisor mode)\n");
817 if ( !test_bit(XENFEAT_supervisor_mode_kernel,
818 dom0_features_supported) )
819 panic("Dom0 does not support supervisor-mode execution\n");
820 }
821 else
822 {
823 if ( test_bit(XENFEAT_supervisor_mode_kernel, dom0_features_required) )
824 panic("Dom0 requires supervisor-mode execution\n");
825 }
827 rc = 0;
829 /* DOM0 is permitted full I/O capabilities. */
830 rc |= ioports_permit_access(dom0, 0, 0xFFFF);
831 rc |= iomem_permit_access(dom0, 0UL, ~0UL);
832 rc |= irqs_permit_access(dom0, 0, NR_IRQS-1);
834 /*
835 * Modify I/O port access permissions.
836 */
837 /* Master Interrupt Controller (PIC). */
838 rc |= ioports_deny_access(dom0, 0x20, 0x21);
839 /* Slave Interrupt Controller (PIC). */
840 rc |= ioports_deny_access(dom0, 0xA0, 0xA1);
841 /* Interval Timer (PIT). */
842 rc |= ioports_deny_access(dom0, 0x40, 0x43);
843 /* PIT Channel 2 / PC Speaker Control. */
844 rc |= ioports_deny_access(dom0, 0x61, 0x61);
845 /* Command-line I/O ranges. */
846 process_dom0_ioports_disable();
848 /*
849 * Modify I/O memory access permissions.
850 */
851 /* Local APIC. */
852 if ( mp_lapic_addr != 0 )
853 {
854 mfn = paddr_to_pfn(mp_lapic_addr);
855 rc |= iomem_deny_access(dom0, mfn, mfn);
856 }
857 /* I/O APICs. */
858 for ( i = 0; i < nr_ioapics; i++ )
859 {
860 mfn = paddr_to_pfn(mp_ioapics[i].mpc_apicaddr);
861 if ( smp_found_config )
862 rc |= iomem_deny_access(dom0, mfn, mfn);
863 }
865 BUG_ON(rc != 0);
867 return 0;
868 }
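
For orientation, the PAE=, FEATURES= and HYPERCALL_PAGE= lookups in construct_dom0() above all scan the guest's __xen_guest ELF section, which carries comma-separated key=value pairs. A hypothetical example of such a string for an i386 PAE kernel (the exact keys vary by kernel; only the three named above are interpreted in this file):

    GUEST_OS=linux,XEN_VER=xen-3.0,VIRT_BASE=0xC0000000,PAE=yes[extended-cr3],FEATURES=writable_page_tables|pae_pgdir_above_4gb,HYPERCALL_PAGE=2

Here HYPERCALL_PAGE is a hexadecimal page index relative to the image's virtual start address, so page 2 of the loaded kernel would be turned into the hypercall transfer page.
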
870 int elf_sanity_check(Elf_Ehdr *ehdr)
871 {
872 if ( !IS_ELF(*ehdr) ||
873 #if defined(__i386__)
874 (ehdr->e_ident[EI_CLASS] != ELFCLASS32) ||
875 (ehdr->e_machine != EM_386) ||
876 #elif defined(__x86_64__)
877 (ehdr->e_ident[EI_CLASS] != ELFCLASS64) ||
878 (ehdr->e_machine != EM_X86_64) ||
879 #endif
880 (ehdr->e_ident[EI_DATA] != ELFDATA2LSB) ||
881 (ehdr->e_type != ET_EXEC) )
882 {
883 printk("DOM0 image is not a Xen-compatible Elf image.\n");
884 return 0;
885 }
887 return 1;
888 }
890 /*
891 * Local variables:
892 * mode: C
893 * c-set-style: "BSD"
894 * c-basic-offset: 4
895 * tab-width: 4
896 * indent-tabs-mode: nil
897 * End:
898 */