xen/arch/x86/domain_build.c @ 10892:0d2ba35c0cf2 (ia64/xen-unstable)

[XEN] Add hypercall support for HVM guests. This is
fairly useless at the moment: all of the hypercalls
fail, because copy_from_user doesn't work correctly in
HVM domains.

Signed-off-by: Steven Smith <ssmith@xensource.com>

Add a CPUID hypervisor platform interface at leaf
0x40000000. Allow hypercall transfer page to be filled
in via MSR 0x40000000.

Signed-off-by: Keir Fraser <keir@xensource.com>
author:   kfraser@localhost.localdomain
date:     Tue Aug 01 17:18:05 2006 +0100
parents:  b786bfb058eb
children: ae71e1d92d44
/******************************************************************************
 * domain_build.c
 *
 * Copyright (c) 2002-2005, K A Fraser
 */

#include <xen/config.h>
#include <xen/init.h>
#include <xen/lib.h>
#include <xen/ctype.h>
#include <xen/sched.h>
#include <xen/smp.h>
#include <xen/delay.h>
#include <xen/event.h>
#include <xen/elf.h>
#include <xen/kernel.h>
#include <xen/domain.h>
#include <xen/version.h>
#include <xen/iocap.h>
#include <xen/bitops.h>
#include <asm/regs.h>
#include <asm/system.h>
#include <asm/io.h>
#include <asm/processor.h>
#include <asm/desc.h>
#include <asm/i387.h>
#include <asm/shadow.h>

#include <public/version.h>

extern unsigned long initial_images_nrpages(void);
extern void discard_initial_images(void);

static long dom0_nrpages;

/*
 * dom0_mem:
 *  If +ve:
 *   * The specified amount of memory is allocated to domain 0.
 *  If -ve:
 *   * All of memory is allocated to domain 0, minus the specified amount.
 *  If not specified:
 *   * All of memory is allocated to domain 0, minus 1/16th which is reserved
 *     for uses such as DMA buffers (the reservation is clamped to 128MB).
 */
static void parse_dom0_mem(char *s)
{
    unsigned long long bytes;
    char *t = s;
    if ( *s == '-' )
        t++;
    bytes = parse_size_and_unit(t);
    dom0_nrpages = bytes >> PAGE_SHIFT;
    if ( *s == '-' )
        dom0_nrpages = -dom0_nrpages;
}
custom_param("dom0_mem", parse_dom0_mem);

static unsigned int opt_dom0_max_vcpus;
integer_param("dom0_max_vcpus", opt_dom0_max_vcpus);

static unsigned int opt_dom0_shadow;
boolean_param("dom0_shadow", opt_dom0_shadow);

static char opt_dom0_ioports_disable[200] = "";
string_param("dom0_ioports_disable", opt_dom0_ioports_disable);

#if defined(__i386__)
/* No ring-3 access in initial leaf page tables. */
#define L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED)
#define L2_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
#define L3_PROT (_PAGE_PRESENT)
#elif defined(__x86_64__)
/* Allow ring-3 access in long mode as guest cannot use ring 1. */
#define L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_USER)
#define L2_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
#define L3_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
#define L4_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
#endif

#define round_pgup(_p)    (((_p)+(PAGE_SIZE-1))&PAGE_MASK)
#define round_pgdown(_p)  ((_p)&PAGE_MASK)

static struct page_info *alloc_chunk(struct domain *d, unsigned long max_pages)
{
    struct page_info *page;
    unsigned int order;
    /*
     * Allocate up to 2MB at a time: It prevents allocating very large chunks
     * from DMA pools before the >4GB pool is fully depleted.
     */
    if ( max_pages > (2UL << (20 - PAGE_SHIFT)) )
        max_pages = 2UL << (20 - PAGE_SHIFT);
    order = get_order_from_pages(max_pages);
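    /*
     * get_order_from_pages() rounds up; if max_pages is not a power of two,
     * drop one order so we never hand back more than was asked for.
     */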
    if ( (max_pages & (max_pages-1)) != 0 )
        order--;
    while ( (page = alloc_domheap_pages(d, order, 0)) == NULL )
        if ( order-- == 0 )
            break;
    return page;
}

static void process_dom0_ioports_disable(void)
{
    unsigned long io_from, io_to;
    char *t, *u, *s = opt_dom0_ioports_disable;

    if ( *s == '\0' )
        return;

    while ( (t = strsep(&s, ",")) != NULL )
    {
        io_from = simple_strtoul(t, &u, 16);
        if ( u == t )
        {
        parse_error:
            printk("Invalid ioport range <%s> "
                   "in dom0_ioports_disable, skipping\n", t);
            continue;
        }

        if ( *u == '\0' )
            io_to = io_from;
        else if ( *u == '-' )
            io_to = simple_strtoul(u + 1, &u, 16);
        else
            goto parse_error;

        if ( (*u != '\0') || (io_to < io_from) || (io_to >= 65536) )
            goto parse_error;

        printk("Disabling dom0 access to ioport range %04lx-%04lx\n",
               io_from, io_to);

        if ( ioports_deny_access(dom0, io_from, io_to) != 0 )
            BUG();
    }
}

static const char *feature_names[XENFEAT_NR_SUBMAPS*32] = {
    [XENFEAT_writable_page_tables]       = "writable_page_tables",
    [XENFEAT_writable_descriptor_tables] = "writable_descriptor_tables",
    [XENFEAT_auto_translated_physmap]    = "auto_translated_physmap",
    [XENFEAT_supervisor_mode_kernel]     = "supervisor_mode_kernel",
    [XENFEAT_pae_pgdir_above_4gb]        = "pae_pgdir_above_4gb"
};
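
/*
 * Parse a FEATURES= string from the __xen_guest header. The format is
 * "flag|flag|...", terminated by ',' or end of string; a '!' prefix marks a
 * feature the kernel requires rather than merely supports.
 */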
static void parse_features(
    const char *feats,
    uint32_t supported[XENFEAT_NR_SUBMAPS],
    uint32_t required[XENFEAT_NR_SUBMAPS])
{
    const char *end, *p;
    int i, req;

    if ( (end = strchr(feats, ',')) == NULL )
        end = feats + strlen(feats);

    while ( feats < end )
    {
        p = strchr(feats, '|');
        if ( (p == NULL) || (p > end) )
            p = end;

        req = (*feats == '!');
        if ( req )
            feats++;

        for ( i = 0; i < XENFEAT_NR_SUBMAPS*32; i++ )
        {
            if ( feature_names[i] == NULL )
                continue;

            if ( strncmp(feature_names[i], feats, p-feats) == 0 )
            {
                set_bit(i, supported);
                if ( req )
                    set_bit(i, required);
                break;
            }
        }

        if ( i == XENFEAT_NR_SUBMAPS*32 )
        {
            printk("Unknown kernel feature \"%.*s\".\n",
                   (int)(p-feats), feats);
            if ( req )
                panic("Domain 0 requires an unknown hypervisor feature.\n");
        }

        feats = p;
        if ( *feats == '|' )
            feats++;
    }
}

int construct_dom0(struct domain *d,
                   unsigned long _image_start, unsigned long image_len,
                   unsigned long _initrd_start, unsigned long initrd_len,
                   char *cmdline)
{
    int i, rc, dom0_pae, xen_pae, order;
    unsigned long pfn, mfn;
    unsigned long nr_pages;
    unsigned long nr_pt_pages;
    unsigned long alloc_spfn;
    unsigned long alloc_epfn;
    unsigned long count;
    struct page_info *page = NULL;
    start_info_t *si;
    struct vcpu *v = d->vcpu[0];
    char *p;
    unsigned long hypercall_page;
#if defined(__i386__)
    char *image_start  = (char *)_image_start;  /* use lowmem mappings */
    char *initrd_start = (char *)_initrd_start; /* use lowmem mappings */
#elif defined(__x86_64__)
    char *image_start  = __va(_image_start);
    char *initrd_start = __va(_initrd_start);
#endif
#if CONFIG_PAGING_LEVELS >= 4
    l4_pgentry_t *l4tab = NULL, *l4start = NULL;
#endif
#if CONFIG_PAGING_LEVELS >= 3
    l3_pgentry_t *l3tab = NULL, *l3start = NULL;
#endif
    l2_pgentry_t *l2tab = NULL, *l2start = NULL;
    l1_pgentry_t *l1tab = NULL, *l1start = NULL;

    /*
     * This fully describes the memory layout of the initial domain. All
     * *_start addresses are page-aligned, except v_start (and v_end) which
     * are superpage-aligned.
     */
    struct domain_setup_info dsi;
    unsigned long vinitrd_start;
    unsigned long vinitrd_end;
    unsigned long vphysmap_start;
    unsigned long vphysmap_end;
    unsigned long vstartinfo_start;
    unsigned long vstartinfo_end;
    unsigned long vstack_start;
    unsigned long vstack_end;
    unsigned long vpt_start;
    unsigned long vpt_end;
    unsigned long v_end;

    /* Machine address of next candidate page-table page. */
    unsigned long mpt_alloc;

    /* Features supported. */
    uint32_t dom0_features_supported[XENFEAT_NR_SUBMAPS] = { 0 };
    uint32_t dom0_features_required[XENFEAT_NR_SUBMAPS] = { 0 };

    /* Sanity! */
    BUG_ON(d->domain_id != 0);
    BUG_ON(d->vcpu[0] == NULL);
    BUG_ON(test_bit(_VCPUF_initialised, &v->vcpu_flags));

    memset(&dsi, 0, sizeof(struct domain_setup_info));
    dsi.image_addr = (unsigned long)image_start;
    dsi.image_len  = image_len;

    printk("*** LOADING DOMAIN 0 ***\n");

    d->max_pages = ~0U;

    /*
     * If domain 0 allocation isn't specified, reserve 1/16th of available
     * memory for things like DMA buffers. This reservation is clamped to
     * a maximum of 128MB.
     */
    if ( dom0_nrpages == 0 )
    {
        dom0_nrpages = avail_domheap_pages() + initial_images_nrpages();
        dom0_nrpages = min(dom0_nrpages / 16, 128L << (20 - PAGE_SHIFT));
        dom0_nrpages = -dom0_nrpages;
    }

    /* Negative memory specification means "all memory - specified amount". */
    if ( dom0_nrpages < 0 )
        nr_pages = avail_domheap_pages() + initial_images_nrpages() +
                   dom0_nrpages;
    else
        nr_pages = dom0_nrpages;

    if ( (rc = parseelfimage(&dsi)) != 0 )
        return rc;

    if ( dsi.xen_section_string == NULL )
    {
        printk("Not a Xen-ELF image: '__xen_guest' section not found.\n");
        return -EINVAL;
    }

    dom0_pae = !!strstr(dsi.xen_section_string, "PAE=yes");
    xen_pae  = (CONFIG_PAGING_LEVELS == 3);
    if ( dom0_pae != xen_pae )
    {
        printk("PAE mode mismatch between Xen and DOM0 (xen=%s, dom0=%s)\n",
               xen_pae ? "yes" : "no", dom0_pae ? "yes" : "no");
        return -EINVAL;
    }

    if ( xen_pae && !!strstr(dsi.xen_section_string, "PAE=yes[extended-cr3]") )
        set_bit(VMASST_TYPE_pae_extended_cr3, &d->vm_assist);

    if ( (p = strstr(dsi.xen_section_string, "FEATURES=")) != NULL )
    {
        parse_features(
            p + strlen("FEATURES="),
            dom0_features_supported,
            dom0_features_required);
        printk("Domain 0 kernel supports features = { %08x }.\n",
               dom0_features_supported[0]);
        printk("Domain 0 kernel requires features = { %08x }.\n",
               dom0_features_required[0]);
        if ( dom0_features_required[0] )
            panic("Domain 0 requires an unsupported hypervisor feature.\n");
    }

    /* Align load address to 4MB boundary. */
    dsi.v_start &= ~((1UL<<22)-1);

    /*
     * Why do we need this? The number of page-table frames depends on the
     * size of the bootstrap address space. But the size of the address space
     * depends on the number of page-table frames (since each one is mapped
     * read-only). We have a pair of simultaneous equations in two unknowns,
     * which we solve by exhaustive search.
     */
    vinitrd_start    = round_pgup(dsi.v_end);
    vinitrd_end      = vinitrd_start + initrd_len;
    vphysmap_start   = round_pgup(vinitrd_end);
    vphysmap_end     = vphysmap_start + (nr_pages * sizeof(unsigned long));
    vstartinfo_start = round_pgup(vphysmap_end);
    vstartinfo_end   = vstartinfo_start + PAGE_SIZE;
    vpt_start        = vstartinfo_end;
    for ( nr_pt_pages = 2; ; nr_pt_pages++ )
    {
        vpt_end      = vpt_start + (nr_pt_pages * PAGE_SIZE);
        vstack_start = vpt_end;
        vstack_end   = vstack_start + PAGE_SIZE;
        v_end        = (vstack_end + (1UL<<22)-1) & ~((1UL<<22)-1);
        if ( (v_end - vstack_end) < (512UL << 10) )
            v_end += 1UL << 22; /* Add extra 4MB to get >= 512kB padding. */
#if defined(__i386__) && !defined(CONFIG_X86_PAE)
        if ( (((v_end - dsi.v_start + ((1UL<<L2_PAGETABLE_SHIFT)-1)) >>
               L2_PAGETABLE_SHIFT) + 1) <= nr_pt_pages )
            break;
#elif defined(__i386__) && defined(CONFIG_X86_PAE)
        /* 5 pages: 1x 3rd + 4x 2nd level */
        if ( (((v_end - dsi.v_start + ((1UL<<L2_PAGETABLE_SHIFT)-1)) >>
               L2_PAGETABLE_SHIFT) + 5) <= nr_pt_pages )
            break;
#elif defined(__x86_64__)
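/*
 * NR(_l,_h,_s): how many (1UL << _s)-byte aligned regions the virtual range
 * [_l,_h) spans; used to count the page-table pages needed at each level.
 */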
#define NR(_l,_h,_s) \
    (((((_h) + ((1UL<<(_s))-1)) & ~((1UL<<(_s))-1)) - \
       ((_l) & ~((1UL<<(_s))-1))) >> (_s))
        if ( (1 + /* # L4 */
              NR(dsi.v_start, v_end, L4_PAGETABLE_SHIFT) + /* # L3 */
              NR(dsi.v_start, v_end, L3_PAGETABLE_SHIFT) + /* # L2 */
              NR(dsi.v_start, v_end, L2_PAGETABLE_SHIFT))  /* # L1 */
             <= nr_pt_pages )
            break;
#endif
    }

    order = get_order_from_bytes(v_end - dsi.v_start);
    if ( (1UL << order) > nr_pages )
        panic("Domain 0 allocation is too small for kernel image.\n");

    /*
     * Allocate from DMA pool: on i386 this ensures that our low-memory 1:1
     * mapping covers the allocation.
     */
    if ( (page = alloc_domheap_pages(d, order, MEMF_dma)) == NULL )
        panic("Not enough RAM for domain 0 allocation.\n");
    alloc_spfn = page_to_mfn(page);
    alloc_epfn = alloc_spfn + d->tot_pages;

    printk("PHYSICAL MEMORY ARRANGEMENT:\n"
           " Dom0 alloc.:   %"PRIpaddr"->%"PRIpaddr,
           pfn_to_paddr(alloc_spfn), pfn_to_paddr(alloc_epfn));
    if ( d->tot_pages < nr_pages )
        printk(" (%lu pages to be allocated)",
               nr_pages - d->tot_pages);
    printk("\nVIRTUAL MEMORY ARRANGEMENT:\n"
           " Loaded kernel: %p->%p\n"
           " Init. ramdisk: %p->%p\n"
           " Phys-Mach map: %p->%p\n"
           " Start info:    %p->%p\n"
           " Page tables:   %p->%p\n"
           " Boot stack:    %p->%p\n"
           " TOTAL:         %p->%p\n",
           _p(dsi.v_kernstart), _p(dsi.v_kernend),
           _p(vinitrd_start), _p(vinitrd_end),
           _p(vphysmap_start), _p(vphysmap_end),
           _p(vstartinfo_start), _p(vstartinfo_end),
           _p(vpt_start), _p(vpt_end),
           _p(vstack_start), _p(vstack_end),
           _p(dsi.v_start), _p(v_end));
    printk(" ENTRY ADDRESS: %p\n", _p(dsi.v_kernentry));

    if ( (v_end - dsi.v_start) > (nr_pages * PAGE_SIZE) )
    {
        printk("Initial guest OS requires too much space\n"
               "(%luMB is greater than %luMB limit)\n",
               (v_end-dsi.v_start)>>20, (nr_pages<<PAGE_SHIFT)>>20);
        return -ENOMEM;
    }
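
    /*
     * The bootstrap page-table pages are carved out of dom0's own allocation:
     * mpt_alloc starts at the machine address corresponding to vpt_start.
     */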
    mpt_alloc = (vpt_start - dsi.v_start) +
        (unsigned long)pfn_to_paddr(alloc_spfn);

    /*
     * We're basically forcing default RPLs to 1, so that our "what privilege
     * level are we returning to?" logic works.
     */
    v->arch.guest_context.kernel_ss = FLAT_KERNEL_SS;
    for ( i = 0; i < 256; i++ )
        v->arch.guest_context.trap_ctxt[i].cs = FLAT_KERNEL_CS;

#if defined(__i386__)

    v->arch.guest_context.failsafe_callback_cs = FLAT_KERNEL_CS;
    v->arch.guest_context.event_callback_cs    = FLAT_KERNEL_CS;

    /*
     * Protect the lowest 1GB of memory. We use a temporary mapping there
     * from which we copy the kernel and ramdisk images.
     */
    if ( dsi.v_start < (1UL<<30) )
    {
        printk("Initial loading isn't allowed to lowest 1GB of memory.\n");
        return -EINVAL;
    }

    /* WARNING: The new domain must have its 'processor' field filled in! */
#if CONFIG_PAGING_LEVELS == 3
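    /*
     * PAE bootstrap tables: one L3 page followed by four contiguous L2 pages
     * copied from the idle tables. The LINEAR_PT slots are pointed back at
     * the L2 pages themselves, setting up the usual linear page-table
     * mapping.
     */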
    l3start = l3tab = (l3_pgentry_t *)mpt_alloc; mpt_alloc += PAGE_SIZE;
    l2start = l2tab = (l2_pgentry_t *)mpt_alloc; mpt_alloc += 4*PAGE_SIZE;
    memcpy(l2tab, idle_pg_table_l2, 4*PAGE_SIZE);
    for (i = 0; i < 4; i++) {
        l3tab[i] = l3e_from_paddr((u32)l2tab + i*PAGE_SIZE, L3_PROT);
        l2tab[(LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT)+i] =
            l2e_from_paddr((u32)l2tab + i*PAGE_SIZE, __PAGE_HYPERVISOR);
    }
    v->arch.guest_table = pagetable_from_paddr((unsigned long)l3start);
#else
    l2start = l2tab = (l2_pgentry_t *)mpt_alloc; mpt_alloc += PAGE_SIZE;
    memcpy(l2tab, idle_pg_table, PAGE_SIZE);
    l2tab[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] =
        l2e_from_paddr((unsigned long)l2start, __PAGE_HYPERVISOR);
    v->arch.guest_table = pagetable_from_paddr((unsigned long)l2start);
#endif

    for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
        l2tab[l2_linear_offset(PERDOMAIN_VIRT_START) + i] =
            l2e_from_page(virt_to_page(d->arch.mm_perdomain_pt) + i,
                          __PAGE_HYPERVISOR);

    l2tab += l2_linear_offset(dsi.v_start);
    mfn = alloc_spfn;
    for ( count = 0; count < ((v_end-dsi.v_start)>>PAGE_SHIFT); count++ )
    {
        if ( !((unsigned long)l1tab & (PAGE_SIZE-1)) )
        {
            l1start = l1tab = (l1_pgentry_t *)mpt_alloc;
            mpt_alloc += PAGE_SIZE;
            *l2tab = l2e_from_paddr((unsigned long)l1start, L2_PROT);
            l2tab++;
            clear_page(l1tab);
            if ( count == 0 )
                l1tab += l1_table_offset(dsi.v_start);
        }
        *l1tab = l1e_from_pfn(mfn, L1_PROT);
        l1tab++;

        page = mfn_to_page(mfn);
        if ( !get_page_and_type(page, d, PGT_writable_page) )
            BUG();

        mfn++;
    }

    /* Pages that are part of page tables must be read only. */
    l2tab = l2start + l2_linear_offset(vpt_start);
    l1start = l1tab = (l1_pgentry_t *)(u32)l2e_get_paddr(*l2tab);
    l1tab += l1_table_offset(vpt_start);
    for ( count = 0; count < nr_pt_pages; count++ )
    {
        page = mfn_to_page(l1e_get_pfn(*l1tab));
        if ( !opt_dom0_shadow )
            l1e_remove_flags(*l1tab, _PAGE_RW);
        else
            if ( !get_page_type(page, PGT_writable_page) )
                BUG();

#if CONFIG_PAGING_LEVELS == 3
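        /*
         * PAE layout of the page-table area: frame 0 is the L3, frames 1-4
         * are the four L2s, and all remaining frames are L1s.
         */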
        switch (count) {
        case 0:
            page->u.inuse.type_info &= ~PGT_type_mask;
            page->u.inuse.type_info |= PGT_l3_page_table;
            get_page(page, d); /* an extra ref because of readable mapping */

            /* Get another ref to L3 page so that it can be pinned. */
            if ( !get_page_and_type(page, d, PGT_l3_page_table) )
                BUG();
            set_bit(_PGT_pinned, &page->u.inuse.type_info);
            break;
        case 1 ... 4:
            page->u.inuse.type_info &= ~PGT_type_mask;
            page->u.inuse.type_info |= PGT_l2_page_table;
            page->u.inuse.type_info |=
                (count-1) << PGT_va_shift;
            get_page(page, d); /* an extra ref because of readable mapping */
            break;
        default:
            page->u.inuse.type_info &= ~PGT_type_mask;
            page->u.inuse.type_info |= PGT_l1_page_table;
            page->u.inuse.type_info |=
                ((dsi.v_start>>L2_PAGETABLE_SHIFT)+(count-5))<<PGT_va_shift;
            get_page(page, d); /* an extra ref because of readable mapping */
            break;
        }
#else
        if ( count == 0 )
        {
            page->u.inuse.type_info &= ~PGT_type_mask;
            page->u.inuse.type_info |= PGT_l2_page_table;

            /*
             * No longer writable: decrement the type_count.
             * Installed as CR3: increment both the ref_count and type_count.
             * Net: just increment the ref_count.
             */
            get_page(page, d); /* an extra ref because of readable mapping */

            /* Get another ref to L2 page so that it can be pinned. */
            if ( !get_page_and_type(page, d, PGT_l2_page_table) )
                BUG();
            set_bit(_PGT_pinned, &page->u.inuse.type_info);
        }
        else
        {
            page->u.inuse.type_info &= ~PGT_type_mask;
            page->u.inuse.type_info |= PGT_l1_page_table;
            page->u.inuse.type_info |=
                ((dsi.v_start>>L2_PAGETABLE_SHIFT)+(count-1))<<PGT_va_shift;

            /*
             * No longer writable: decrement the type_count.
             * This is an L1 page, installed in a validated L2 page:
             * increment both the ref_count and type_count.
             * Net: just increment the ref_count.
             */
            get_page(page, d); /* an extra ref because of readable mapping */
        }
#endif
        if ( !((unsigned long)++l1tab & (PAGE_SIZE - 1)) )
            l1start = l1tab = (l1_pgentry_t *)(u32)l2e_get_paddr(*++l2tab);
    }

#elif defined(__x86_64__)

    /* Overlap with Xen protected area? */
    if ( (dsi.v_start < HYPERVISOR_VIRT_END) &&
         (v_end > HYPERVISOR_VIRT_START) )
    {
        printk("DOM0 image overlaps with Xen private area.\n");
        return -EINVAL;
    }

    /* WARNING: The new domain must have its 'processor' field filled in! */
    maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l4_page_table;
    l4start = l4tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
    memcpy(l4tab, idle_pg_table, PAGE_SIZE);
    l4tab[l4_table_offset(LINEAR_PT_VIRT_START)] =
        l4e_from_paddr(__pa(l4start), __PAGE_HYPERVISOR);
    l4tab[l4_table_offset(PERDOMAIN_VIRT_START)] =
        l4e_from_paddr(__pa(d->arch.mm_perdomain_l3), __PAGE_HYPERVISOR);
    v->arch.guest_table = pagetable_from_paddr(__pa(l4start));

    l4tab += l4_table_offset(dsi.v_start);
    mfn = alloc_spfn;
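    /*
     * Map the bootstrap region one page at a time, pulling fresh L1/L2/L3
     * tables out of mpt_alloc whenever the current table pointer crosses a
     * page boundary.
     */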
    for ( count = 0; count < ((v_end-dsi.v_start)>>PAGE_SHIFT); count++ )
    {
        if ( !((unsigned long)l1tab & (PAGE_SIZE-1)) )
        {
            maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l1_page_table;
            l1start = l1tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
            clear_page(l1tab);
            if ( count == 0 )
                l1tab += l1_table_offset(dsi.v_start);
            if ( !((unsigned long)l2tab & (PAGE_SIZE-1)) )
            {
                maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l2_page_table;
                l2start = l2tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
                clear_page(l2tab);
                if ( count == 0 )
                    l2tab += l2_table_offset(dsi.v_start);
                if ( !((unsigned long)l3tab & (PAGE_SIZE-1)) )
                {
                    maddr_to_page(mpt_alloc)->u.inuse.type_info =
                        PGT_l3_page_table;
                    l3start = l3tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
                    clear_page(l3tab);
                    if ( count == 0 )
                        l3tab += l3_table_offset(dsi.v_start);
                    *l4tab = l4e_from_paddr(__pa(l3start), L4_PROT);
                    l4tab++;
                }
                *l3tab = l3e_from_paddr(__pa(l2start), L3_PROT);
                l3tab++;
            }
            *l2tab = l2e_from_paddr(__pa(l1start), L2_PROT);
            l2tab++;
        }
        *l1tab = l1e_from_pfn(mfn, L1_PROT);
        l1tab++;

        page = mfn_to_page(mfn);
        if ( (page->u.inuse.type_info == 0) &&
             !get_page_and_type(page, d, PGT_writable_page) )
            BUG();

        mfn++;
    }

    /* Pages that are part of page tables must be read only. */
    l4tab = l4start + l4_table_offset(vpt_start);
    l3start = l3tab = l4e_to_l3e(*l4tab);
    l3tab += l3_table_offset(vpt_start);
    l2start = l2tab = l3e_to_l2e(*l3tab);
    l2tab += l2_table_offset(vpt_start);
    l1start = l1tab = l2e_to_l1e(*l2tab);
    l1tab += l1_table_offset(vpt_start);
    for ( count = 0; count < nr_pt_pages; count++ )
    {
        l1e_remove_flags(*l1tab, _PAGE_RW);
        page = mfn_to_page(l1e_get_pfn(*l1tab));

        /* Read-only mapping + PGC_allocated + page-table page. */
        page->count_info         = PGC_allocated | 3;
        page->u.inuse.type_info |= PGT_validated | 1;

        /* Top-level p.t. is pinned. */
        if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_l4_page_table )
        {
            page->count_info        += 1;
            page->u.inuse.type_info += 1 | PGT_pinned;
        }

        /* Iterate. */
        if ( !((unsigned long)++l1tab & (PAGE_SIZE - 1)) )
        {
            if ( !((unsigned long)++l2tab & (PAGE_SIZE - 1)) )
            {
                if ( !((unsigned long)++l3tab & (PAGE_SIZE - 1)) )
                    l3start = l3tab = l4e_to_l3e(*++l4tab);
                l2start = l2tab = l3e_to_l2e(*l3tab);
            }
            l1start = l1tab = l2e_to_l1e(*l2tab);
        }
    }

#endif /* __x86_64__ */

    /* Mask all upcalls... */
    for ( i = 0; i < MAX_VIRT_CPUS; i++ )
        d->shared_info->vcpu_info[i].evtchn_upcall_mask = 1;

    if ( opt_dom0_max_vcpus == 0 )
        opt_dom0_max_vcpus = num_online_cpus();
    if ( opt_dom0_max_vcpus > MAX_VIRT_CPUS )
        opt_dom0_max_vcpus = MAX_VIRT_CPUS;
    printk("Dom0 has maximum %u VCPUs\n", opt_dom0_max_vcpus);

    for ( i = 1; i < opt_dom0_max_vcpus; i++ )
        (void)alloc_vcpu(d, i, i);

    /* Set up monitor table */
    update_pagetables(v);

    /* Install the new page tables. */
    local_irq_disable();
    write_ptbase(v);

    /* Copy the OS image and free temporary buffer. */
    (void)loadelfimage(&dsi);
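
    /*
     * If the guest's __xen_guest header names a HYPERCALL_PAGE (a pfn within
     * the image), fill that page with the hypercall transfer stubs.
     */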
    p = strstr(dsi.xen_section_string, "HYPERCALL_PAGE=");
    if ( p != NULL )
    {
        p += strlen("HYPERCALL_PAGE=");
        hypercall_page = simple_strtoul(p, NULL, 16);
        hypercall_page = dsi.v_start + (hypercall_page << PAGE_SHIFT);
        if ( (hypercall_page < dsi.v_start) || (hypercall_page >= v_end) )
        {
            write_ptbase(current);
            local_irq_enable();
            printk("Invalid HYPERCALL_PAGE field in guest header.\n");
            return -1;
        }

        hypercall_page_initialise(d, (void *)hypercall_page);
    }

    /* Copy the initial ramdisk. */
    if ( initrd_len != 0 )
        memcpy((void *)vinitrd_start, initrd_start, initrd_len);

    /* Free temporary buffers. */
    discard_initial_images();

    /* Set up start info area. */
    si = (start_info_t *)vstartinfo_start;
    memset(si, 0, PAGE_SIZE);
    si->nr_pages = nr_pages;

    si->shared_info = virt_to_maddr(d->shared_info);

    si->flags        = SIF_PRIVILEGED | SIF_INITDOMAIN;
    si->pt_base      = vpt_start;
    si->nr_pt_frames = nr_pt_pages;
    si->mfn_list     = vphysmap_start;
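    /* Magic ends up as e.g. "xen-3.0-x86_32p" (PAE) or "xen-3.0-x86_64". */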
    sprintf(si->magic, "xen-%i.%i-x86_%d%s",
            xen_major_version(), xen_minor_version(),
            BITS_PER_LONG, xen_pae ? "p" : "");

    /* Write the phys->machine and machine->phys table entries. */
    for ( pfn = 0; pfn < d->tot_pages; pfn++ )
    {
        mfn = pfn + alloc_spfn;
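        /*
         * Debug builds deliberately hand out part of the p2m in reverse
         * order, presumably to catch guests that assume a linear pfn->mfn
         * mapping.
         */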
#ifndef NDEBUG
#define REVERSE_START ((v_end - dsi.v_start) >> PAGE_SHIFT)
        if ( pfn > REVERSE_START )
            mfn = alloc_epfn - (pfn - REVERSE_START);
#endif
        ((unsigned long *)vphysmap_start)[pfn] = mfn;
        set_gpfn_from_mfn(mfn, pfn);
    }
    while ( pfn < nr_pages )
    {
        if ( (page = alloc_chunk(d, nr_pages - d->tot_pages)) == NULL )
            panic("Not enough RAM for DOM0 reservation.\n");
        while ( pfn < d->tot_pages )
        {
            mfn = page_to_mfn(page);
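            /*
             * Debug builds: fill the remaining p2m slots from the top of the
             * map downwards.
             */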
#ifndef NDEBUG
#define pfn (nr_pages - 1 - (pfn - (alloc_epfn - alloc_spfn)))
#endif
            ((unsigned long *)vphysmap_start)[pfn] = mfn;
            set_gpfn_from_mfn(mfn, pfn);
#undef pfn
            page++; pfn++;
        }
    }

    if ( initrd_len != 0 )
    {
        si->mod_start = vinitrd_start;
        si->mod_len   = initrd_len;
        printk("Initrd len 0x%lx, start at 0x%lx\n",
               si->mod_len, si->mod_start);
    }

    memset(si->cmd_line, 0, sizeof(si->cmd_line));
    if ( cmdline != NULL )
        strncpy((char *)si->cmd_line, cmdline, sizeof(si->cmd_line)-1);

    /* Reinstate the caller's page tables. */
    write_ptbase(current);
    local_irq_enable();

#if defined(__i386__)
    /* Destroy low mappings - they were only for our convenience. */
    zap_low_mappings(l2start);
    zap_low_mappings(idle_pg_table_l2);
#endif

    update_domain_wallclock_time(d);

    set_bit(_VCPUF_initialised, &v->vcpu_flags);

    new_thread(v, dsi.v_kernentry, vstack_end, vstartinfo_start);

    if ( opt_dom0_shadow )
    {
        shadow_mode_enable(d, SHM_enable);
        update_pagetables(v);
    }

    if ( supervisor_mode_kernel )
    {
        v->arch.guest_context.kernel_ss &= ~3;
        v->arch.guest_context.user_regs.ss &= ~3;
        v->arch.guest_context.user_regs.es &= ~3;
        v->arch.guest_context.user_regs.ds &= ~3;
        v->arch.guest_context.user_regs.fs &= ~3;
        v->arch.guest_context.user_regs.gs &= ~3;
        printk("Dom0 runs in ring 0 (supervisor mode)\n");
        if ( !test_bit(XENFEAT_supervisor_mode_kernel,
                       dom0_features_supported) )
            panic("Dom0 does not support supervisor-mode execution\n");
    }
    else
    {
        if ( test_bit(XENFEAT_supervisor_mode_kernel, dom0_features_required) )
            panic("Dom0 requires supervisor-mode execution\n");
    }

    rc = 0;

    /* DOM0 is permitted full I/O capabilities. */
    rc |= ioports_permit_access(dom0, 0, 0xFFFF);
    rc |= iomem_permit_access(dom0, 0UL, ~0UL);
    rc |= irqs_permit_access(dom0, 0, NR_IRQS-1);

    /*
     * Modify I/O port access permissions.
     */
    /* Master Interrupt Controller (PIC). */
    rc |= ioports_deny_access(dom0, 0x20, 0x21);
    /* Slave Interrupt Controller (PIC). */
    rc |= ioports_deny_access(dom0, 0xA0, 0xA1);
    /* Interval Timer (PIT). */
    rc |= ioports_deny_access(dom0, 0x40, 0x43);
    /* PIT Channel 2 / PC Speaker Control. */
    rc |= ioports_deny_access(dom0, 0x61, 0x61);
    /* Command-line I/O ranges. */
    process_dom0_ioports_disable();

    /*
     * Modify I/O memory access permissions.
     */
    /* Local APIC. */
    if ( mp_lapic_addr != 0 )
    {
        mfn = paddr_to_pfn(mp_lapic_addr);
        rc |= iomem_deny_access(dom0, mfn, mfn);
    }
    /* I/O APICs. */
    for ( i = 0; i < nr_ioapics; i++ )
    {
        mfn = paddr_to_pfn(mp_ioapics[i].mpc_apicaddr);
        if ( smp_found_config )
            rc |= iomem_deny_access(dom0, mfn, mfn);
    }

    BUG_ON(rc != 0);

    return 0;
}

int elf_sanity_check(Elf_Ehdr *ehdr)
{
    if ( !IS_ELF(*ehdr) ||
#if defined(__i386__)
         (ehdr->e_ident[EI_CLASS] != ELFCLASS32) ||
         (ehdr->e_machine != EM_386) ||
#elif defined(__x86_64__)
         (ehdr->e_ident[EI_CLASS] != ELFCLASS64) ||
         (ehdr->e_machine != EM_X86_64) ||
#endif
         (ehdr->e_ident[EI_DATA] != ELFDATA2LSB) ||
         (ehdr->e_type != ET_EXEC) )
    {
        printk("DOM0 image is not a Xen-compatible Elf image.\n");
        return 0;
    }

    return 1;
}

/*
 * Local variables:
 * mode: C
 * c-set-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */