ia64/xen-unstable

view xen/arch/x86/domain_build.c @ 9776:72f9c751d3ea

Replace &foo[0] with foo where the latter seems cleaner
(which is usually, and particularly when it's an argument
to one of the bitops functions).

Signed-off-by: Keir Fraser <keir@xensource.com>

author   kaf24@firebug.cl.cam.ac.uk
date     Wed Apr 19 18:32:20 2006 +0100 (2006-04-19)
parents  08aede767c63
children 0ee104bd6557
/******************************************************************************
 * domain_build.c
 *
 * Copyright (c) 2002-2005, K A Fraser
 */

#include <xen/config.h>
#include <xen/init.h>
#include <xen/lib.h>
#include <xen/ctype.h>
#include <xen/sched.h>
#include <xen/smp.h>
#include <xen/delay.h>
#include <xen/event.h>
#include <xen/elf.h>
#include <xen/kernel.h>
#include <xen/domain.h>
#include <xen/compile.h>
#include <xen/iocap.h>
#include <xen/bitops.h>
#include <asm/regs.h>
#include <asm/system.h>
#include <asm/io.h>
#include <asm/processor.h>
#include <asm/desc.h>
#include <asm/i387.h>
#include <asm/shadow.h>

#include <public/version.h>

extern unsigned long initial_images_nrpages(void);
extern void discard_initial_images(void);

static long dom0_nrpages;

/*
 * dom0_mem:
 *  If +ve:
 *   * The specified amount of memory is allocated to domain 0.
 *  If -ve:
 *   * All of memory is allocated to domain 0, minus the specified amount.
 *  If not specified:
 *   * All of memory is allocated to domain 0, minus 1/16th which is reserved
 *     for uses such as DMA buffers (the reservation is clamped to 128MB).
 */
static void parse_dom0_mem(char *s)
{
    unsigned long long bytes;
    char *t = s;
    if ( *s == '-' )
        t++;
    bytes = parse_size_and_unit(t);
    dom0_nrpages = bytes >> PAGE_SHIFT;
    if ( *s == '-' )
        dom0_nrpages = -dom0_nrpages;
}
custom_param("dom0_mem", parse_dom0_mem);

static unsigned int opt_dom0_max_vcpus;
integer_param("dom0_max_vcpus", opt_dom0_max_vcpus);

static unsigned int opt_dom0_shadow;
boolean_param("dom0_shadow", opt_dom0_shadow);

static char opt_dom0_ioports_disable[200] = "";
string_param("dom0_ioports_disable", opt_dom0_ioports_disable);

#if defined(__i386__)
/* No ring-3 access in initial leaf page tables. */
#define L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED)
#define L2_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
#define L3_PROT (_PAGE_PRESENT)
#elif defined(__x86_64__)
/* Allow ring-3 access in long mode as guest cannot use ring 1. */
#define L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_USER)
#define L2_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
#define L3_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
#define L4_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
#endif

#define round_pgup(_p) (((_p)+(PAGE_SIZE-1))&PAGE_MASK)
#define round_pgdown(_p) ((_p)&PAGE_MASK)

static struct page_info *alloc_chunk(struct domain *d, unsigned long max_pages)
{
    struct page_info *page;
    unsigned int order;
    /*
     * Allocate up to 2MB at a time: It prevents allocating very large chunks
     * from DMA pools before the >4GB pool is fully depleted.
     */
    if ( max_pages > (2UL << (20 - PAGE_SHIFT)) )
        max_pages = 2UL << (20 - PAGE_SHIFT);
    order = get_order_from_pages(max_pages);
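    /*
     * get_order_from_pages() rounds the request up to the next power of two;
     * if max_pages is not already a power of two, drop back one order so the
     * chunk handed out never exceeds the caller's limit.
     */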
    if ( (max_pages & (max_pages-1)) != 0 )
        order--;
    while ( (page = alloc_domheap_pages(d, order, 0)) == NULL )
        if ( order-- == 0 )
            break;
    return page;
}

static void process_dom0_ioports_disable(void)
{
    unsigned long io_from, io_to;
    char *t, *u, *s = opt_dom0_ioports_disable;

    if ( *s == '\0' )
        return;

    while ( (t = strsep(&s, ",")) != NULL )
    {
        io_from = simple_strtoul(t, &u, 16);
        if ( u == t )
        {
        parse_error:
            printk("Invalid ioport range <%s> "
                   "in dom0_ioports_disable, skipping\n", t);
            continue;
        }

        if ( *u == '\0' )
            io_to = io_from;
        else if ( *u == '-' )
            io_to = simple_strtoul(u + 1, &u, 16);
        else
            goto parse_error;

        if ( (*u != '\0') || (io_to < io_from) || (io_to >= 65536) )
            goto parse_error;

        printk("Disabling dom0 access to ioport range %04lx-%04lx\n",
               io_from, io_to);

        if ( ioports_deny_access(dom0, io_from, io_to) != 0 )
            BUG();
    }
}

static const char *feature_names[XENFEAT_NR_SUBMAPS*32] = {
    [XENFEAT_writable_page_tables] = "writable_page_tables",
    [XENFEAT_writable_descriptor_tables] = "writable_descriptor_tables",
    [XENFEAT_auto_translated_physmap] = "auto_translated_physmap",
    [XENFEAT_supervisor_mode_kernel] = "supervisor_mode_kernel",
    [XENFEAT_pae_pgdir_above_4gb] = "pae_pgdir_above_4gb"
};
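
/*
 * Parse the FEATURES= string from the guest kernel's __xen_guest section.
 * Features are '|'-separated; a leading '!' marks a feature the kernel
 * requires rather than merely supports. The list ends at the end of the
 * string or at a ',' introducing the next __xen_guest field.
 */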
static void parse_features(
    const char *feats,
    uint32_t supported[XENFEAT_NR_SUBMAPS],
    uint32_t required[XENFEAT_NR_SUBMAPS])
{
    const char *end, *p;
    int i, req;

    if ( (end = strchr(feats, ',')) == NULL )
        end = feats + strlen(feats);

    while ( feats < end )
    {
        p = strchr(feats, '|');
        if ( (p == NULL) || (p > end) )
            p = end;

        req = (*feats == '!');
        if ( req )
            feats++;

        for ( i = 0; i < XENFEAT_NR_SUBMAPS*32; i++ )
        {
            if ( feature_names[i] == NULL )
                continue;

            if ( strncmp(feature_names[i], feats, p-feats) == 0 )
            {
                set_bit(i, supported);
                if ( req )
                    set_bit(i, required);
                break;
            }
        }

        if ( i == XENFEAT_NR_SUBMAPS*32 )
        {
            printk("Unknown kernel feature \"%.*s\".\n",
                   (int)(p-feats), feats);
            if ( req )
                panic("Domain 0 requires an unknown hypervisor feature.\n");
        }

        feats = p;
        if ( *feats == '|' )
            feats++;
    }
}

int construct_dom0(struct domain *d,
                   unsigned long _image_start, unsigned long image_len,
                   unsigned long _initrd_start, unsigned long initrd_len,
                   char *cmdline)
{
    int i, rc, dom0_pae, xen_pae, order;
    unsigned long pfn, mfn;
    unsigned long nr_pages;
    unsigned long nr_pt_pages;
    unsigned long alloc_spfn;
    unsigned long alloc_epfn;
    unsigned long count;
    struct page_info *page = NULL;
    start_info_t *si;
    struct vcpu *v = d->vcpu[0];
    char *p;
    unsigned long hypercall_page;
#if defined(__i386__)
    char *image_start = (char *)_image_start; /* use lowmem mappings */
    char *initrd_start = (char *)_initrd_start; /* use lowmem mappings */
#elif defined(__x86_64__)
    char *image_start = __va(_image_start);
    char *initrd_start = __va(_initrd_start);
#endif
#if CONFIG_PAGING_LEVELS >= 4
    l4_pgentry_t *l4tab = NULL, *l4start = NULL;
#endif
#if CONFIG_PAGING_LEVELS >= 3
    l3_pgentry_t *l3tab = NULL, *l3start = NULL;
#endif
    l2_pgentry_t *l2tab = NULL, *l2start = NULL;
    l1_pgentry_t *l1tab = NULL, *l1start = NULL;

    /*
     * This fully describes the memory layout of the initial domain. All
     * *_start addresses are page-aligned, except v_start (and v_end) which
     * are superpage-aligned.
     */
    struct domain_setup_info dsi;
    unsigned long vinitrd_start;
    unsigned long vinitrd_end;
    unsigned long vphysmap_start;
    unsigned long vphysmap_end;
    unsigned long vstartinfo_start;
    unsigned long vstartinfo_end;
    unsigned long vstack_start;
    unsigned long vstack_end;
    unsigned long vpt_start;
    unsigned long vpt_end;
    unsigned long v_end;

    /* Machine address of next candidate page-table page. */
    unsigned long mpt_alloc;

    /* Features supported. */
    uint32_t dom0_features_supported[XENFEAT_NR_SUBMAPS] = { 0 };
    uint32_t dom0_features_required[XENFEAT_NR_SUBMAPS] = { 0 };

    /* Sanity! */
    BUG_ON(d->domain_id != 0);
    BUG_ON(d->vcpu[0] == NULL);
    BUG_ON(test_bit(_VCPUF_initialised, &v->vcpu_flags));

    memset(&dsi, 0, sizeof(struct domain_setup_info));
    dsi.image_addr = (unsigned long)image_start;
    dsi.image_len = image_len;

    printk("*** LOADING DOMAIN 0 ***\n");

    d->max_pages = ~0U;

    /*
     * If domain 0 allocation isn't specified, reserve 1/16th of available
     * memory for things like DMA buffers. This reservation is clamped to
     * a maximum of 128MB.
     */
    if ( dom0_nrpages == 0 )
    {
        dom0_nrpages = avail_domheap_pages() + initial_images_nrpages();
        dom0_nrpages = min(dom0_nrpages / 16, 128L << (20 - PAGE_SHIFT));
        dom0_nrpages = -dom0_nrpages;
    }

    /* Negative memory specification means "all memory - specified amount". */
    if ( dom0_nrpages < 0 )
        nr_pages = avail_domheap_pages() + initial_images_nrpages() +
            dom0_nrpages;
    else
        nr_pages = dom0_nrpages;

    if ( (rc = parseelfimage(&dsi)) != 0 )
        return rc;

    if ( dsi.xen_section_string == NULL )
    {
        printk("Not a Xen-ELF image: '__xen_guest' section not found.\n");
        return -EINVAL;
    }

    dom0_pae = !!strstr(dsi.xen_section_string, "PAE=yes");
    xen_pae = (CONFIG_PAGING_LEVELS == 3);
    if ( dom0_pae != xen_pae )
    {
        printk("PAE mode mismatch between Xen and DOM0 (xen=%s, dom0=%s)\n",
               xen_pae ? "yes" : "no", dom0_pae ? "yes" : "no");
        return -EINVAL;
    }

    if ( (p = strstr(dsi.xen_section_string, "FEATURES=")) != NULL )
    {
        parse_features(
            p + strlen("FEATURES="),
            dom0_features_supported,
            dom0_features_required);
        printk("Domain 0 kernel supports features = { %08x }.\n",
               dom0_features_supported[0]);
        printk("Domain 0 kernel requires features = { %08x }.\n",
               dom0_features_required[0]);
        if ( dom0_features_required[0] )
            panic("Domain 0 requires an unsupported hypervisor feature.\n");
    }

    /* Align load address to 4MB boundary. */
    dsi.v_start &= ~((1UL<<22)-1);

    /*
     * Why do we need this? The number of page-table frames depends on the
     * size of the bootstrap address space. But the size of the address space
     * depends on the number of page-table frames (since each one is mapped
     * read-only). We have a pair of simultaneous equations in two unknowns,
     * which we solve by exhaustive search.
     */
    vinitrd_start = round_pgup(dsi.v_end);
    vinitrd_end = vinitrd_start + initrd_len;
    vphysmap_start = round_pgup(vinitrd_end);
    vphysmap_end = vphysmap_start + (nr_pages * sizeof(unsigned long));
    vstartinfo_start = round_pgup(vphysmap_end);
    vstartinfo_end = vstartinfo_start + PAGE_SIZE;
    vpt_start = vstartinfo_end;
    for ( nr_pt_pages = 2; ; nr_pt_pages++ )
    {
        vpt_end = vpt_start + (nr_pt_pages * PAGE_SIZE);
        vstack_start = vpt_end;
        vstack_end = vstack_start + PAGE_SIZE;
        v_end = (vstack_end + (1UL<<22)-1) & ~((1UL<<22)-1);
        if ( (v_end - vstack_end) < (512UL << 10) )
            v_end += 1UL << 22; /* Add extra 4MB to get >= 512kB padding. */
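        /*
         * Each branch below counts the page-table frames needed to map
         * [v_start, v_end): one L1 frame per L2-mapped extent, plus the
         * upper-level frames needed for that paging mode. The loop stops
         * once nr_pt_pages is large enough to hold them all.
         */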
#if defined(__i386__) && !defined(CONFIG_X86_PAE)
        if ( (((v_end - dsi.v_start + ((1UL<<L2_PAGETABLE_SHIFT)-1)) >>
               L2_PAGETABLE_SHIFT) + 1) <= nr_pt_pages )
            break;
#elif defined(__i386__) && defined(CONFIG_X86_PAE)
        /* 5 pages: 1x 3rd + 4x 2nd level */
        if ( (((v_end - dsi.v_start + ((1UL<<L2_PAGETABLE_SHIFT)-1)) >>
               L2_PAGETABLE_SHIFT) + 5) <= nr_pt_pages )
            break;
#elif defined(__x86_64__)
#define NR(_l,_h,_s) \
    (((((_h) + ((1UL<<(_s))-1)) & ~((1UL<<(_s))-1)) - \
      ((_l) & ~((1UL<<(_s))-1))) >> (_s))
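        /* NR(l,h,s): number of (1UL<<s)-byte, (1UL<<s)-aligned slots needed
         * to cover the address range [l,h). */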
        if ( (1 + /* # L4 */
              NR(dsi.v_start, v_end, L4_PAGETABLE_SHIFT) + /* # L3 */
              NR(dsi.v_start, v_end, L3_PAGETABLE_SHIFT) + /* # L2 */
              NR(dsi.v_start, v_end, L2_PAGETABLE_SHIFT))  /* # L1 */
             <= nr_pt_pages )
            break;
#endif
    }

    order = get_order_from_bytes(v_end - dsi.v_start);
    if ( (1UL << order) > nr_pages )
        panic("Domain 0 allocation is too small for kernel image.\n");

    /* Allocate from DMA pool: PAE L3 table must be below 4GB boundary. */
    if ( (page = alloc_domheap_pages(d, order, ALLOC_DOM_DMA)) == NULL )
        panic("Not enough RAM for domain 0 allocation.\n");
    alloc_spfn = page_to_mfn(page);
    alloc_epfn = alloc_spfn + d->tot_pages;

    printk("PHYSICAL MEMORY ARRANGEMENT:\n"
           " Dom0 alloc.: %"PRIpaddr"->%"PRIpaddr,
           pfn_to_paddr(alloc_spfn), pfn_to_paddr(alloc_epfn));
    if ( d->tot_pages < nr_pages )
        printk(" (%lu pages to be allocated)",
               nr_pages - d->tot_pages);
    printk("\nVIRTUAL MEMORY ARRANGEMENT:\n"
           " Loaded kernel: %p->%p\n"
           " Init. ramdisk: %p->%p\n"
           " Phys-Mach map: %p->%p\n"
           " Start info: %p->%p\n"
           " Page tables: %p->%p\n"
           " Boot stack: %p->%p\n"
           " TOTAL: %p->%p\n",
           _p(dsi.v_kernstart), _p(dsi.v_kernend),
           _p(vinitrd_start), _p(vinitrd_end),
           _p(vphysmap_start), _p(vphysmap_end),
           _p(vstartinfo_start), _p(vstartinfo_end),
           _p(vpt_start), _p(vpt_end),
           _p(vstack_start), _p(vstack_end),
           _p(dsi.v_start), _p(v_end));
    printk(" ENTRY ADDRESS: %p\n", _p(dsi.v_kernentry));

    if ( (v_end - dsi.v_start) > (nr_pages * PAGE_SIZE) )
    {
        printk("Initial guest OS requires too much space\n"
               "(%luMB is greater than %luMB limit)\n",
               (v_end-dsi.v_start)>>20, (nr_pages<<PAGE_SHIFT)>>20);
        return -ENOMEM;
    }
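
    /*
     * Page-table frames are carved out of Dom0's own allocation, starting at
     * the machine address that corresponds to vpt_start within the bootstrap
     * region; mpt_alloc advances through them as tables are built below.
     */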
    mpt_alloc = (vpt_start - dsi.v_start) +
        (unsigned long)pfn_to_paddr(alloc_spfn);

    /*
     * We're basically forcing default RPLs to 1, so that our "what privilege
     * level are we returning to?" logic works.
     */
    v->arch.guest_context.kernel_ss = FLAT_KERNEL_SS;
    for ( i = 0; i < 256; i++ )
        v->arch.guest_context.trap_ctxt[i].cs = FLAT_KERNEL_CS;

#if defined(__i386__)

    v->arch.guest_context.failsafe_callback_cs = FLAT_KERNEL_CS;
    v->arch.guest_context.event_callback_cs = FLAT_KERNEL_CS;

    /*
     * Protect the lowest 1GB of memory. We use a temporary mapping there
     * from which we copy the kernel and ramdisk images.
     */
    if ( dsi.v_start < (1UL<<30) )
    {
        printk("Initial loading isn't allowed to lowest 1GB of memory.\n");
        return -EINVAL;
    }

    /* WARNING: The new domain must have its 'processor' field filled in! */
#if CONFIG_PAGING_LEVELS == 3
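    /*
     * PAE bootstrap tables: one L3 page plus four consecutive L2 pages
     * copied from the idle page table; the linear-pagetable slots in the
     * L2s are then pointed back at the L2 pages themselves.
     */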
    l3start = l3tab = (l3_pgentry_t *)mpt_alloc; mpt_alloc += PAGE_SIZE;
    l2start = l2tab = (l2_pgentry_t *)mpt_alloc; mpt_alloc += 4*PAGE_SIZE;
    memcpy(l2tab, idle_pg_table_l2, 4*PAGE_SIZE);
    for (i = 0; i < 4; i++) {
        l3tab[i] = l3e_from_paddr((u32)l2tab + i*PAGE_SIZE, L3_PROT);
        l2tab[(LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT)+i] =
            l2e_from_paddr((u32)l2tab + i*PAGE_SIZE, __PAGE_HYPERVISOR);
    }
    v->arch.guest_table = mk_pagetable((unsigned long)l3start);
#else
    l2start = l2tab = (l2_pgentry_t *)mpt_alloc; mpt_alloc += PAGE_SIZE;
    memcpy(l2tab, idle_pg_table, PAGE_SIZE);
    l2tab[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] =
        l2e_from_paddr((unsigned long)l2start, __PAGE_HYPERVISOR);
    v->arch.guest_table = mk_pagetable((unsigned long)l2start);
#endif

    for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
        l2tab[l2_linear_offset(PERDOMAIN_VIRT_START) + i] =
            l2e_from_page(virt_to_page(d->arch.mm_perdomain_pt) + i,
                          __PAGE_HYPERVISOR);

    l2tab += l2_linear_offset(dsi.v_start);
    mfn = alloc_spfn;
    for ( count = 0; count < ((v_end-dsi.v_start)>>PAGE_SHIFT); count++ )
    {
        if ( !((unsigned long)l1tab & (PAGE_SIZE-1)) )
        {
            l1start = l1tab = (l1_pgentry_t *)mpt_alloc;
            mpt_alloc += PAGE_SIZE;
            *l2tab = l2e_from_paddr((unsigned long)l1start, L2_PROT);
            l2tab++;
            clear_page(l1tab);
            if ( count == 0 )
                l1tab += l1_table_offset(dsi.v_start);
        }
        *l1tab = l1e_from_pfn(mfn, L1_PROT);
        l1tab++;

        page = mfn_to_page(mfn);
        if ( !get_page_and_type(page, d, PGT_writable_page) )
            BUG();

        mfn++;
    }

    /* Pages that are part of page tables must be read only. */
    l2tab = l2start + l2_linear_offset(vpt_start);
    l1start = l1tab = (l1_pgentry_t *)(u32)l2e_get_paddr(*l2tab);
    l1tab += l1_table_offset(vpt_start);
    for ( count = 0; count < nr_pt_pages; count++ )
    {
        page = mfn_to_page(l1e_get_pfn(*l1tab));
        if ( !opt_dom0_shadow )
            l1e_remove_flags(*l1tab, _PAGE_RW);
        else
            if ( !get_page_type(page, PGT_writable_page) )
                BUG();

#if CONFIG_PAGING_LEVELS == 3
        switch (count) {
        case 0:
            page->u.inuse.type_info &= ~PGT_type_mask;
            page->u.inuse.type_info |= PGT_l3_page_table;
            get_page(page, d); /* an extra ref because of readable mapping */

            /* Get another ref to L3 page so that it can be pinned. */
            if ( !get_page_and_type(page, d, PGT_l3_page_table) )
                BUG();
            set_bit(_PGT_pinned, &page->u.inuse.type_info);
            break;
        case 1 ... 4:
            page->u.inuse.type_info &= ~PGT_type_mask;
            page->u.inuse.type_info |= PGT_l2_page_table;
            page->u.inuse.type_info |=
                (count-1) << PGT_va_shift;
            get_page(page, d); /* an extra ref because of readable mapping */
            break;
        default:
            page->u.inuse.type_info &= ~PGT_type_mask;
            page->u.inuse.type_info |= PGT_l1_page_table;
            page->u.inuse.type_info |=
                ((dsi.v_start>>L2_PAGETABLE_SHIFT)+(count-5))<<PGT_va_shift;
            get_page(page, d); /* an extra ref because of readable mapping */
            break;
        }
#else
        if ( count == 0 )
        {
            page->u.inuse.type_info &= ~PGT_type_mask;
            page->u.inuse.type_info |= PGT_l2_page_table;

            /*
             * No longer writable: decrement the type_count.
             * Installed as CR3: increment both the ref_count and type_count.
             * Net: just increment the ref_count.
             */
            get_page(page, d); /* an extra ref because of readable mapping */

            /* Get another ref to L2 page so that it can be pinned. */
            if ( !get_page_and_type(page, d, PGT_l2_page_table) )
                BUG();
            set_bit(_PGT_pinned, &page->u.inuse.type_info);
        }
        else
        {
            page->u.inuse.type_info &= ~PGT_type_mask;
            page->u.inuse.type_info |= PGT_l1_page_table;
            page->u.inuse.type_info |=
                ((dsi.v_start>>L2_PAGETABLE_SHIFT)+(count-1))<<PGT_va_shift;

            /*
             * No longer writable: decrement the type_count.
             * This is an L1 page, installed in a validated L2 page:
             * increment both the ref_count and type_count.
             * Net: just increment the ref_count.
             */
            get_page(page, d); /* an extra ref because of readable mapping */
        }
#endif
        if ( !((unsigned long)++l1tab & (PAGE_SIZE - 1)) )
            l1start = l1tab = (l1_pgentry_t *)(u32)l2e_get_paddr(*++l2tab);
    }

#elif defined(__x86_64__)

    /* Overlap with Xen protected area? */
    if ( (dsi.v_start < HYPERVISOR_VIRT_END) &&
         (v_end > HYPERVISOR_VIRT_START) )
    {
        printk("DOM0 image overlaps with Xen private area.\n");
        return -EINVAL;
    }

    /* WARNING: The new domain must have its 'processor' field filled in! */
    maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l4_page_table;
    l4start = l4tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
    memcpy(l4tab, idle_pg_table, PAGE_SIZE);
    l4tab[l4_table_offset(LINEAR_PT_VIRT_START)] =
        l4e_from_paddr(__pa(l4start), __PAGE_HYPERVISOR);
    l4tab[l4_table_offset(PERDOMAIN_VIRT_START)] =
        l4e_from_paddr(__pa(d->arch.mm_perdomain_l3), __PAGE_HYPERVISOR);
    v->arch.guest_table = mk_pagetable(__pa(l4start));

    l4tab += l4_table_offset(dsi.v_start);
    mfn = alloc_spfn;
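    /*
     * Map the whole bootstrap region. Whenever an L1 table fills (the
     * running l1tab pointer crosses a page boundary), a fresh page-table
     * frame is taken from mpt_alloc; the same check cascades up through the
     * L2, L3 and L4 levels as each of those tables fills in turn.
     */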
    for ( count = 0; count < ((v_end-dsi.v_start)>>PAGE_SHIFT); count++ )
    {
        if ( !((unsigned long)l1tab & (PAGE_SIZE-1)) )
        {
            maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l1_page_table;
            l1start = l1tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
            clear_page(l1tab);
            if ( count == 0 )
                l1tab += l1_table_offset(dsi.v_start);
            if ( !((unsigned long)l2tab & (PAGE_SIZE-1)) )
            {
                maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l2_page_table;
                l2start = l2tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
                clear_page(l2tab);
                if ( count == 0 )
                    l2tab += l2_table_offset(dsi.v_start);
                if ( !((unsigned long)l3tab & (PAGE_SIZE-1)) )
                {
                    maddr_to_page(mpt_alloc)->u.inuse.type_info =
                        PGT_l3_page_table;
                    l3start = l3tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
                    clear_page(l3tab);
                    if ( count == 0 )
                        l3tab += l3_table_offset(dsi.v_start);
                    *l4tab = l4e_from_paddr(__pa(l3start), L4_PROT);
                    l4tab++;
                }
                *l3tab = l3e_from_paddr(__pa(l2start), L3_PROT);
                l3tab++;
            }
            *l2tab = l2e_from_paddr(__pa(l1start), L2_PROT);
            l2tab++;
        }
        *l1tab = l1e_from_pfn(mfn, L1_PROT);
        l1tab++;

        page = mfn_to_page(mfn);
        if ( (page->u.inuse.type_info == 0) &&
             !get_page_and_type(page, d, PGT_writable_page) )
            BUG();

        mfn++;
    }

    /* Pages that are part of page tables must be read only. */
    l4tab = l4start + l4_table_offset(vpt_start);
    l3start = l3tab = l4e_to_l3e(*l4tab);
    l3tab += l3_table_offset(vpt_start);
    l2start = l2tab = l3e_to_l2e(*l3tab);
    l2tab += l2_table_offset(vpt_start);
    l1start = l1tab = l2e_to_l1e(*l2tab);
    l1tab += l1_table_offset(vpt_start);
    for ( count = 0; count < nr_pt_pages; count++ )
    {
        l1e_remove_flags(*l1tab, _PAGE_RW);
        page = mfn_to_page(l1e_get_pfn(*l1tab));

        /* Read-only mapping + PGC_allocated + page-table page. */
        page->count_info = PGC_allocated | 3;
        page->u.inuse.type_info |= PGT_validated | 1;

        /* Top-level p.t. is pinned. */
        if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_l4_page_table )
        {
            page->count_info += 1;
            page->u.inuse.type_info += 1 | PGT_pinned;
        }

        /* Iterate. */
        if ( !((unsigned long)++l1tab & (PAGE_SIZE - 1)) )
        {
            if ( !((unsigned long)++l2tab & (PAGE_SIZE - 1)) )
            {
                if ( !((unsigned long)++l3tab & (PAGE_SIZE - 1)) )
                    l3start = l3tab = l4e_to_l3e(*++l4tab);
                l2start = l2tab = l3e_to_l2e(*l3tab);
            }
            l1start = l1tab = l2e_to_l1e(*l2tab);
        }
    }

#endif /* __x86_64__ */

    /* Mask all upcalls... */
    for ( i = 0; i < MAX_VIRT_CPUS; i++ )
        d->shared_info->vcpu_info[i].evtchn_upcall_mask = 1;

    if ( opt_dom0_max_vcpus == 0 )
        opt_dom0_max_vcpus = num_online_cpus();
    if ( opt_dom0_max_vcpus > MAX_VIRT_CPUS )
        opt_dom0_max_vcpus = MAX_VIRT_CPUS;
    printk("Dom0 has maximum %u VCPUs\n", opt_dom0_max_vcpus);

    for ( i = 1; i < opt_dom0_max_vcpus; i++ )
        (void)alloc_vcpu(d, i, i);

    /* Set up monitor table */
    update_pagetables(v);

    /* Install the new page tables. */
    local_irq_disable();
    write_ptbase(v);

    /* Copy the OS image and free temporary buffer. */
    (void)loadelfimage(&dsi);
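
    /*
     * The optional HYPERCALL_PAGE= field in __xen_guest gives the index (in
     * pages, relative to the load address) of a page that Xen fills with
     * hypercall trampoline stubs before the guest starts.
     */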
    p = strstr(dsi.xen_section_string, "HYPERCALL_PAGE=");
    if ( p != NULL )
    {
        p += strlen("HYPERCALL_PAGE=");
        hypercall_page = simple_strtoul(p, NULL, 16);
        hypercall_page = dsi.v_start + (hypercall_page << PAGE_SHIFT);
        if ( (hypercall_page < dsi.v_start) || (hypercall_page >= v_end) )
        {
            write_ptbase(current);
            local_irq_enable();
            printk("Invalid HYPERCALL_PAGE field in guest header.\n");
            return -1;
        }

        hypercall_page_initialise((void *)hypercall_page);
    }

    /* Copy the initial ramdisk. */
    if ( initrd_len != 0 )
        memcpy((void *)vinitrd_start, initrd_start, initrd_len);

    /* Free temporary buffers. */
    discard_initial_images();

    /* Set up start info area. */
    si = (start_info_t *)vstartinfo_start;
    memset(si, 0, PAGE_SIZE);
    si->nr_pages = nr_pages;

    si->shared_info = virt_to_maddr(d->shared_info);

    si->flags = SIF_PRIVILEGED | SIF_INITDOMAIN;
    si->pt_base = vpt_start;
    si->nr_pt_frames = nr_pt_pages;
    si->mfn_list = vphysmap_start;
    sprintf(si->magic, "xen-%i.%i-x86_%d%s",
            XEN_VERSION, XEN_SUBVERSION, BITS_PER_LONG, xen_pae ? "p" : "");

    /* Write the phys->machine and machine->phys table entries. */
    for ( pfn = 0; pfn < d->tot_pages; pfn++ )
    {
        mfn = pfn + alloc_spfn;
#ifndef NDEBUG
#define REVERSE_START ((v_end - dsi.v_start) >> PAGE_SHIFT)
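        /*
         * In debug builds the tail of the allocation is entered into the
         * physmap in reverse order, so that Dom0's pfn->mfn mapping is
         * deliberately non-identity; this helps catch code that wrongly
         * assumes pfn == mfn or a monotonic P2M.
         */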
        if ( pfn > REVERSE_START )
            mfn = alloc_epfn - (pfn - REVERSE_START);
#endif
        ((unsigned long *)vphysmap_start)[pfn] = mfn;
        set_gpfn_from_mfn(mfn, pfn);
    }
    while ( pfn < nr_pages )
    {
        if ( (page = alloc_chunk(d, nr_pages - d->tot_pages)) == NULL )
            panic("Not enough RAM for DOM0 reservation.\n");
        while ( pfn < d->tot_pages )
        {
            mfn = page_to_mfn(page);
#ifndef NDEBUG
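            /*
             * Same debug-build trick as above: the remaining physmap entries
             * are filled from the top of the p2m downwards via the pfn
             * redefinition below.
             */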
#define pfn (nr_pages - 1 - (pfn - (alloc_epfn - alloc_spfn)))
#endif
            ((unsigned long *)vphysmap_start)[pfn] = mfn;
            set_gpfn_from_mfn(mfn, pfn);
#undef pfn
            page++; pfn++;
        }
    }

    if ( initrd_len != 0 )
    {
        si->mod_start = vinitrd_start;
        si->mod_len = initrd_len;
        printk("Initrd len 0x%lx, start at 0x%lx\n",
               si->mod_len, si->mod_start);
    }

    memset(si->cmd_line, 0, sizeof(si->cmd_line));
    if ( cmdline != NULL )
        strncpy((char *)si->cmd_line, cmdline, sizeof(si->cmd_line)-1);

    /* Reinstate the caller's page tables. */
    write_ptbase(current);
    local_irq_enable();

#if defined(__i386__)
    /* Destroy low mappings - they were only for our convenience. */
    zap_low_mappings(l2start);
    zap_low_mappings(idle_pg_table_l2);
#endif

    update_domain_wallclock_time(d);

    set_bit(_VCPUF_initialised, &v->vcpu_flags);

    new_thread(v, dsi.v_kernentry, vstack_end, vstartinfo_start);
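
    /*
     * If dom0_shadow was given on the Xen command line, run domain 0 on
     * shadow page tables and refresh its pagetable state for the new mode.
     */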
    if ( opt_dom0_shadow )
    {
        shadow_mode_enable(d, SHM_enable);
        update_pagetables(v);
    }

    if ( supervisor_mode_kernel )
    {
        v->arch.guest_context.kernel_ss &= ~3;
        v->arch.guest_context.user_regs.ss &= ~3;
        v->arch.guest_context.user_regs.es &= ~3;
        v->arch.guest_context.user_regs.ds &= ~3;
        v->arch.guest_context.user_regs.fs &= ~3;
        v->arch.guest_context.user_regs.gs &= ~3;
        printk("Dom0 runs in ring 0 (supervisor mode)\n");
        if ( !test_bit(XENFEAT_supervisor_mode_kernel,
                       dom0_features_supported) )
            panic("Dom0 does not support supervisor-mode execution\n");
    }
    else
    {
        if ( test_bit(XENFEAT_supervisor_mode_kernel, dom0_features_required) )
            panic("Dom0 requires supervisor-mode execution\n");
    }

    rc = 0;

    /* DOM0 is permitted full I/O capabilities. */
    rc |= ioports_permit_access(dom0, 0, 0xFFFF);
    rc |= iomem_permit_access(dom0, 0UL, ~0UL);
    rc |= irqs_permit_access(dom0, 0, NR_PIRQS-1);

    /*
     * Modify I/O port access permissions.
     */
    /* Master Interrupt Controller (PIC). */
    rc |= ioports_deny_access(dom0, 0x20, 0x21);
    /* Slave Interrupt Controller (PIC). */
    rc |= ioports_deny_access(dom0, 0xA0, 0xA1);
    /* Interval Timer (PIT). */
    rc |= ioports_deny_access(dom0, 0x40, 0x43);
    /* PIT Channel 2 / PC Speaker Control. */
    rc |= ioports_deny_access(dom0, 0x61, 0x61);
    /* Command-line I/O ranges. */
    process_dom0_ioports_disable();

    /*
     * Modify I/O memory access permissions.
     */
    /* Local APIC. */
    if ( mp_lapic_addr != 0 )
    {
        mfn = paddr_to_pfn(mp_lapic_addr);
        rc |= iomem_deny_access(dom0, mfn, mfn);
    }
    /* I/O APICs. */
    for ( i = 0; i < nr_ioapics; i++ )
    {
        mfn = paddr_to_pfn(mp_ioapics[i].mpc_apicaddr);
        if ( smp_found_config )
            rc |= iomem_deny_access(dom0, mfn, mfn);
    }

    BUG_ON(rc != 0);

    return 0;
}

int elf_sanity_check(Elf_Ehdr *ehdr)
{
    if ( !IS_ELF(*ehdr) ||
#if defined(__i386__)
         (ehdr->e_ident[EI_CLASS] != ELFCLASS32) ||
         (ehdr->e_machine != EM_386) ||
#elif defined(__x86_64__)
         (ehdr->e_ident[EI_CLASS] != ELFCLASS64) ||
         (ehdr->e_machine != EM_X86_64) ||
#endif
         (ehdr->e_ident[EI_DATA] != ELFDATA2LSB) ||
         (ehdr->e_type != ET_EXEC) )
    {
        printk("DOM0 image is not a Xen-compatible Elf image.\n");
        return 0;
    }

    return 1;
}

/*
 * Local variables:
 * mode: C
 * c-set-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */