ia64/xen-unstable

view xen/arch/x86/domain_build.c @ 19797:8440fc9f7a25

x86-64: do not pass unmanageable amounts of memory to Dom0

Due to address space restrictions, it is not possible to pass more
than about 500GB to a Linux Dom0 unless its kernel specifies a
non-default phys-to-machine map location via XEN_ELFNOTE_INIT_P2M.

For non-Linux Dom0 kernels I can't say whether the limit could be set
close to 1TB, but since passing such huge amounts of memory isn't very
useful anyway (and a limit can be enforced via dom0_mem=), the patch
doesn't attempt to guess the kernel type and instead restricts the
memory amount in all cases.
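
As a rough sketch of where the figure comes from (assuming the
conventional Linux virt_base of 0xffffffff80000000, which is an
assumption and not part of this changeset): the virtual window between
virt_base and the end of the address space is about 2GB, and the
clipping in compute_dom0_nr_pages() charges every Dom0 page 8 bytes of
phys-to-machine table in the initial mapping plus roughly the same
again for the kernel's own bootstrap page tables, i.e. about 16 bytes
of that window per 4KB page:

    window   ~= 2^64 - 0xffffffff80000000          =  2GB
    nr_pages ~= (window - kernel/initrd) / (2 * 8) <  2^27 pages
    2^27 pages * 4KB/page                          = 512GB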

Signed-off-by: Jan Beulich <jbeulich@novell.com>
author Keir Fraser <keir.fraser@citrix.com>
date Thu Jun 18 10:31:17 2009 +0100 (2009-06-18)
parents 2f9e1348aa98
children
line source
1 /******************************************************************************
2 * domain_build.c
3 *
4 * Copyright (c) 2002-2005, K A Fraser
5 */
7 #include <xen/config.h>
8 #include <xen/init.h>
9 #include <xen/lib.h>
10 #include <xen/ctype.h>
11 #include <xen/sched.h>
12 #include <xen/smp.h>
13 #include <xen/delay.h>
14 #include <xen/event.h>
15 #include <xen/console.h>
16 #include <xen/kernel.h>
17 #include <xen/domain.h>
18 #include <xen/version.h>
19 #include <xen/iocap.h>
20 #include <xen/bitops.h>
21 #include <xen/compat.h>
22 #include <xen/libelf.h>
23 #include <asm/regs.h>
24 #include <asm/system.h>
25 #include <asm/io.h>
26 #include <asm/processor.h>
27 #include <asm/desc.h>
28 #include <asm/i387.h>
29 #include <asm/paging.h>
30 #include <asm/p2m.h>
31 #include <asm/e820.h>
33 #include <public/version.h>
35 int __init bzimage_parse(
36 char *output, char **image_start, unsigned long *image_len);
38 extern unsigned long initial_images_nrpages(void);
39 extern void discard_initial_images(void);
41 static long __initdata dom0_nrpages;
42 static long __initdata dom0_min_nrpages;
43 static long __initdata dom0_max_nrpages = LONG_MAX;
45 /*
46 * dom0_mem=[min:<min_amt>,][max:<max_amt>,][<amt>]
47 *
48 * <min_amt>: The minimum amount of memory which should be allocated for dom0.
49 * <max_amt>: The maximum amount of memory which should be allocated for dom0.
50 * <amt>: The precise amount of memory to allocate for dom0.
51 *
52 * Notes:
53 * 1. <amt> is clamped from below by <min_amt> and from above by available
54 * memory and <max_amt>
55 * 2. <min_amt> is clamped from above by available memory and <max_amt>
56 * 3. <min_amt> is ignored if it is greater than <max_amt>
57 * 4. If <amt> is not specified, it is calculated as follows:
58 * "All of memory is allocated to domain 0, minus 1/16th which is reserved
59 * for uses such as DMA buffers (the reservation is clamped to 128MB)."
60 *
61 * Each value can be specified as positive or negative:
62 * If +ve: The specified amount is an absolute value.
63 * If -ve: The specified amount is subtracted from total available memory.
64 */
65 static long __init parse_amt(const char *s, const char **ps)
66 {
67 long pages = parse_size_and_unit((*s == '-') ? s+1 : s, ps) >> PAGE_SHIFT;
68 return (*s == '-') ? -pages : pages;
69 }
70 static void __init parse_dom0_mem(const char *s)
71 {
72 do {
73 if ( !strncmp(s, "min:", 4) )
74 dom0_min_nrpages = parse_amt(s+4, &s);
75 else if ( !strncmp(s, "max:", 4) )
76 dom0_max_nrpages = parse_amt(s+4, &s);
77 else
78 dom0_nrpages = parse_amt(s, &s);
79 if ( *s != ',' )
80 break;
81 } while ( *s++ == ',' );
82 }
83 custom_param("dom0_mem", parse_dom0_mem);
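/*
 * Illustrative examples of the dom0_mem syntax documented above; the
 * values are hypothetical and assume 4KB pages (PAGE_SHIFT == 12):
 *
 *   dom0_mem=512M            -> dom0_nrpages     =  131072
 *   dom0_mem=min:256M,max:2G -> dom0_min_nrpages =   65536,
 *                               dom0_max_nrpages =  524288
 *   dom0_mem=-1G             -> dom0_nrpages     = -262144
 *                               (all memory minus 1GB)
 */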
85 static unsigned int __initdata opt_dom0_max_vcpus;
86 integer_param("dom0_max_vcpus", opt_dom0_max_vcpus);
88 struct vcpu *__init alloc_dom0_vcpu0(void)
89 {
90 if ( opt_dom0_max_vcpus == 0 )
91 opt_dom0_max_vcpus = num_online_cpus();
92 if ( opt_dom0_max_vcpus > MAX_VIRT_CPUS )
93 opt_dom0_max_vcpus = MAX_VIRT_CPUS;
95 dom0->vcpu = xmalloc_array(struct vcpu *, opt_dom0_max_vcpus);
96 if ( !dom0->vcpu )
97 return NULL;
98 memset(dom0->vcpu, 0, opt_dom0_max_vcpus * sizeof(*dom0->vcpu));
99 dom0->max_vcpus = opt_dom0_max_vcpus;
101 return alloc_vcpu(dom0, 0, 0);
102 }
104 static unsigned int opt_dom0_shadow;
105 boolean_param("dom0_shadow", opt_dom0_shadow);
107 static char opt_dom0_ioports_disable[200] = "";
108 string_param("dom0_ioports_disable", opt_dom0_ioports_disable);
110 #if defined(__i386__)
111 /* No ring-3 access in initial leaf page tables. */
112 #define L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED)
113 #define L2_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
114 #define L3_PROT (_PAGE_PRESENT)
115 #elif defined(__x86_64__)
116 /* Allow ring-3 access in long mode as guest cannot use ring 1 ... */
117 #define BASE_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_USER)
118 #define L1_PROT (BASE_PROT|_PAGE_GUEST_KERNEL)
119 /* ... except for compatibility mode guests. */
120 #define COMPAT_L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED)
121 #define L2_PROT (BASE_PROT|_PAGE_DIRTY)
122 #define L3_PROT (BASE_PROT|_PAGE_DIRTY)
123 #define L4_PROT (BASE_PROT|_PAGE_DIRTY)
124 #endif
126 #define round_pgup(_p) (((_p)+(PAGE_SIZE-1))&PAGE_MASK)
127 #define round_pgdown(_p) ((_p)&PAGE_MASK)
129 static struct page_info * __init alloc_chunk(
130 struct domain *d, unsigned long max_pages)
131 {
132 struct page_info *page;
133 unsigned int order;
134 /*
135 * Allocate up to 2MB at a time: It prevents allocating very large chunks
136 * from DMA pools before the >4GB pool is fully depleted.
137 */
138 if ( max_pages > (2UL << (20 - PAGE_SHIFT)) )
139 max_pages = 2UL << (20 - PAGE_SHIFT);
140 order = get_order_from_pages(max_pages);
141 if ( (max_pages & (max_pages-1)) != 0 )
142 order--;
143 while ( (page = alloc_domheap_pages(d, order, 0)) == NULL )
144 if ( order-- == 0 )
145 break;
146 return page;
147 }
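/*
 * A worked example (hypothetical request size) for alloc_chunk() above:
 * max_pages = 300 is below the 512-page (2MB) cap, get_order_from_pages(300)
 * yields order 9 (512 pages), and since 300 is not a power of two the order
 * is dropped to 8, so at most 256 pages are taken in one call; on failure
 * the loop retries with progressively smaller orders down to a single page.
 */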
149 static unsigned long __init compute_dom0_nr_pages(
150 #ifdef __x86_64__
151 unsigned long vstart, unsigned long vend, size_t sizeof_long)
152 #else
153 void)
154 #endif
155 {
156 unsigned long avail = avail_domheap_pages() + initial_images_nrpages();
157 unsigned long nr_pages = dom0_nrpages;
158 unsigned long min_pages = dom0_min_nrpages;
159 unsigned long max_pages = dom0_max_nrpages;
161 /*
162 * If domain 0 allocation isn't specified, reserve 1/16th of available
163 * memory for things like DMA buffers. This reservation is clamped to
164 * a maximum of 128MB.
165 */
166 if ( nr_pages == 0 )
167 nr_pages = -min(avail / 16, 128UL << (20 - PAGE_SHIFT));
169 /* Negative memory specification means "all memory - specified amount". */
170 if ( (long)nr_pages < 0 ) nr_pages += avail;
171 if ( (long)min_pages < 0 ) min_pages += avail;
172 if ( (long)max_pages < 0 ) max_pages += avail;
174 /* Clamp dom0 memory according to min/max limits and available memory. */
175 nr_pages = max(nr_pages, min_pages);
176 nr_pages = min(nr_pages, max_pages);
177 nr_pages = min(nr_pages, avail);
179 #ifdef __x86_64__
180 if ( vstart && dom0_nrpages <= 0 &&
181 (dom0_min_nrpages <= 0 || nr_pages > min_pages) )
182 {
183 /*
184 * Legacy Linux kernels (i.e. such without a XEN_ELFNOTE_INIT_P2M
185 * note) require that there is enough virtual space beyond the initial
186 * allocation to set up their initial page tables. This space is
187 * roughly the same size as the p2m table, so make sure the initial
188 * allocation doesn't consume more than about half the space that's
189 * available between params.virt_base and the address space end.
190 */
191 unsigned long end = vend + nr_pages * sizeof_long;
193 if ( end > vstart )
194 end += end - vstart;
195 if ( end <= vstart ||
196 (sizeof_long < sizeof(end) && end > (1UL << (8 * sizeof_long))) )
197 {
198 end = sizeof_long >= sizeof(end) ? 0 : 1UL << (8 * sizeof_long);
199 nr_pages = (end - vend) / (2 * sizeof_long);
200 if ( dom0_min_nrpages > 0 && nr_pages < min_pages )
201 nr_pages = min_pages;
202 printk("Dom0 memory clipped to %lu pages\n", nr_pages);
203 }
204 }
205 #endif
207 return nr_pages;
208 }
210 static void __init process_dom0_ioports_disable(void)
211 {
212 unsigned long io_from, io_to;
213 char *t, *s = opt_dom0_ioports_disable;
214 const char *u;
216 if ( *s == '\0' )
217 return;
219 while ( (t = strsep(&s, ",")) != NULL )
220 {
221 io_from = simple_strtoul(t, &u, 16);
222 if ( u == t )
223 {
224 parse_error:
225 printk("Invalid ioport range <%s> "
226 "in dom0_ioports_disable, skipping\n", t);
227 continue;
228 }
230 if ( *u == '\0' )
231 io_to = io_from;
232 else if ( *u == '-' )
233 io_to = simple_strtoul(u + 1, &u, 16);
234 else
235 goto parse_error;
237 if ( (*u != '\0') || (io_to < io_from) || (io_to >= 65536) )
238 goto parse_error;
240 printk("Disabling dom0 access to ioport range %04lx-%04lx\n",
241 io_from, io_to);
243 if ( ioports_deny_access(dom0, io_from, io_to) != 0 )
244 BUG();
245 }
246 }
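/*
 * A hypothetical setting for the option parsed above:
 *   dom0_ioports_disable=2f8-2ff,3f8-3ff
 * Ranges are hexadecimal and comma separated; a lone value such as "61"
 * denies a single port. The example would hide the legacy COM2 and COM1
 * I/O ranges from Dom0.
 */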
248 int __init construct_dom0(
249 struct domain *d,
250 unsigned long _image_base,
251 unsigned long _image_start, unsigned long image_len,
252 unsigned long _initrd_start, unsigned long initrd_len,
253 char *cmdline)
254 {
255 int i, rc, compatible, compat32, order, machine;
256 struct cpu_user_regs *regs;
257 unsigned long pfn, mfn;
258 unsigned long nr_pages;
259 unsigned long nr_pt_pages;
260 unsigned long alloc_spfn;
261 unsigned long alloc_epfn;
262 unsigned long count;
263 struct page_info *page = NULL;
264 start_info_t *si;
265 struct vcpu *v = d->vcpu[0];
266 unsigned long long value;
267 #if defined(__i386__)
268 char *image_base = (char *)_image_base; /* use lowmem mappings */
269 char *image_start = (char *)_image_start; /* use lowmem mappings */
270 char *initrd_start = (char *)_initrd_start; /* use lowmem mappings */
271 #elif defined(__x86_64__)
272 char *image_base = __va(_image_base);
273 char *image_start = __va(_image_start);
274 char *initrd_start = __va(_initrd_start);
275 #endif
276 #if CONFIG_PAGING_LEVELS >= 4
277 l4_pgentry_t *l4tab = NULL, *l4start = NULL;
278 #endif
279 l3_pgentry_t *l3tab = NULL, *l3start = NULL;
280 l2_pgentry_t *l2tab = NULL, *l2start = NULL;
281 l1_pgentry_t *l1tab = NULL, *l1start = NULL;
283 /*
284 * This fully describes the memory layout of the initial domain. All
285 * *_start address are page-aligned, except v_start (and v_end) which are
286 * superpage-aligned.
287 */
288 struct elf_binary elf;
289 struct elf_dom_parms parms;
290 unsigned long vkern_start;
291 unsigned long vkern_end;
292 unsigned long vinitrd_start;
293 unsigned long vinitrd_end;
294 unsigned long vphysmap_start;
295 unsigned long vphysmap_end;
296 unsigned long vstartinfo_start;
297 unsigned long vstartinfo_end;
298 unsigned long vstack_start;
299 unsigned long vstack_end;
300 unsigned long vpt_start;
301 unsigned long vpt_end;
302 unsigned long v_start;
303 unsigned long v_end;
305 /* Machine address of next candidate page-table page. */
306 unsigned long mpt_alloc;
308 /* Sanity! */
309 BUG_ON(d->domain_id != 0);
310 BUG_ON(d->vcpu[0] == NULL);
311 BUG_ON(v->is_initialised);
313 printk("*** LOADING DOMAIN 0 ***\n");
315 d->max_pages = ~0U;
317 if ( (rc = bzimage_parse(image_base, &image_start, &image_len)) != 0 )
318 return rc;
320 if ( (rc = elf_init(&elf, image_start, image_len)) != 0 )
321 return rc;
322 #ifdef VERBOSE
323 elf_set_verbose(&elf);
324 #endif
325 elf_parse_binary(&elf);
326 if ( (rc = elf_xen_parse(&elf, &parms)) != 0 )
327 return rc;
329 /* compatibility check */
330 compatible = 0;
331 compat32 = 0;
332 machine = elf_uval(&elf, elf.ehdr, e_machine);
333 switch (CONFIG_PAGING_LEVELS) {
334 case 3: /* x86_32p */
335 if (parms.pae == PAEKERN_bimodal)
336 parms.pae = PAEKERN_extended_cr3;
337 printk(" Xen kernel: 32-bit, PAE, lsb\n");
338 if (elf_32bit(&elf) && parms.pae && machine == EM_386)
339 compatible = 1;
340 break;
341 case 4: /* x86_64 */
342 printk(" Xen kernel: 64-bit, lsb, compat32\n");
343 if (elf_32bit(&elf) && parms.pae == PAEKERN_bimodal)
344 parms.pae = PAEKERN_extended_cr3;
345 if (elf_32bit(&elf) && parms.pae && machine == EM_386)
346 {
347 compat32 = 1;
348 compatible = 1;
349 }
350 if (elf_64bit(&elf) && machine == EM_X86_64)
351 compatible = 1;
352 break;
353 }
354 printk(" Dom0 kernel: %s%s, %s, paddr 0x%" PRIx64 " -> 0x%" PRIx64 "\n",
355 elf_64bit(&elf) ? "64-bit" : "32-bit",
356 parms.pae ? ", PAE" : "",
357 elf_msb(&elf) ? "msb" : "lsb",
358 elf.pstart, elf.pend);
359 if ( elf.bsd_symtab_pstart )
360 printk(" Dom0 symbol map 0x%" PRIx64 " -> 0x%" PRIx64 "\n",
361 elf.bsd_symtab_pstart, elf.bsd_symtab_pend);
363 if ( !compatible )
364 {
365 printk("Mismatch between Xen and DOM0 kernel\n");
366 return -EINVAL;
367 }
369 #if defined(__x86_64__)
370 if ( compat32 )
371 {
372 d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 1;
373 v->vcpu_info = (void *)&d->shared_info->compat.vcpu_info[0];
375 nr_pages = compute_dom0_nr_pages(parms.virt_base,
376 round_pgup(parms.virt_kend) + round_pgup(initrd_len),
377 sizeof(unsigned int));
378 }
379 else if (parms.p2m_base != UNSET_ADDR)
380 nr_pages = compute_dom0_nr_pages(0, 0, 0);
381 else
382 nr_pages = compute_dom0_nr_pages(parms.virt_base,
383 round_pgup(parms.virt_kend) + round_pgup(initrd_len),
384 sizeof(unsigned long));
385 #else
386 nr_pages = compute_dom0_nr_pages();
387 #endif
389 if ( parms.pae == PAEKERN_extended_cr3 )
390 set_bit(VMASST_TYPE_pae_extended_cr3, &d->vm_assist);
392 if ( (parms.virt_hv_start_low != UNSET_ADDR) && elf_32bit(&elf) )
393 {
394 unsigned long mask = (1UL << L2_PAGETABLE_SHIFT) - 1;
395 value = (parms.virt_hv_start_low + mask) & ~mask;
396 BUG_ON(!is_pv_32bit_domain(d));
397 #if defined(__i386__)
398 if ( value > HYPERVISOR_VIRT_START )
399 panic("Domain 0 expects too high a hypervisor start address.\n");
400 #else
401 if ( value > __HYPERVISOR_COMPAT_VIRT_START )
402 panic("Domain 0 expects too high a hypervisor start address.\n");
403 HYPERVISOR_COMPAT_VIRT_START(d) =
404 max_t(unsigned int, m2p_compat_vstart, value);
405 #endif
406 }
408 if ( (parms.p2m_base != UNSET_ADDR) && elf_32bit(&elf) )
409 {
410 printk(XENLOG_WARNING "P2M table base ignored\n");
411 parms.p2m_base = UNSET_ADDR;
412 }
414 domain_set_alloc_bitsize(d);
416 /*
417 * Why do we need this? The number of page-table frames depends on the
418 * size of the bootstrap address space. But the size of the address space
419 * depends on the number of page-table frames (since each one is mapped
420 * read-only). We have a pair of simultaneous equations in two unknowns,
421 * which we solve by exhaustive search.
422 */
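/*
 * A rough worked example (hypothetical sizes) for the 64-bit case: with
 * v_end - v_start around 64MB, the NR() terms below give 32 L1 tables (one
 * per 2MB), one L2 and one L3, plus the single L4, i.e. roughly 35
 * page-table pages. Since vpt_end (and hence v_end) grows with nr_pt_pages,
 * the value is found by the iterative search rather than computed directly.
 */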
423 v_start = parms.virt_base;
424 vkern_start = parms.virt_kstart;
425 vkern_end = parms.virt_kend;
426 vinitrd_start = round_pgup(vkern_end);
427 vinitrd_end = vinitrd_start + initrd_len;
428 vphysmap_start = round_pgup(vinitrd_end);
429 vphysmap_end = vphysmap_start + (nr_pages * (!is_pv_32on64_domain(d) ?
430 sizeof(unsigned long) :
431 sizeof(unsigned int)));
432 if ( parms.p2m_base != UNSET_ADDR )
433 vphysmap_end = vphysmap_start;
434 vstartinfo_start = round_pgup(vphysmap_end);
435 vstartinfo_end = (vstartinfo_start +
436 sizeof(struct start_info) +
437 sizeof(struct dom0_vga_console_info));
438 vpt_start = round_pgup(vstartinfo_end);
439 for ( nr_pt_pages = 2; ; nr_pt_pages++ )
440 {
441 vpt_end = vpt_start + (nr_pt_pages * PAGE_SIZE);
442 vstack_start = vpt_end;
443 vstack_end = vstack_start + PAGE_SIZE;
444 v_end = (vstack_end + (1UL<<22)-1) & ~((1UL<<22)-1);
445 if ( (v_end - vstack_end) < (512UL << 10) )
446 v_end += 1UL << 22; /* Add extra 4MB to get >= 512kB padding. */
447 #if defined(__i386__)
448 /* 5 pages: 1x 3rd + 4x 2nd level */
449 if ( (((v_end - v_start + ((1UL<<L2_PAGETABLE_SHIFT)-1)) >>
450 L2_PAGETABLE_SHIFT) + 5) <= nr_pt_pages )
451 break;
452 #elif defined(__x86_64__)
453 #define NR(_l,_h,_s) \
454 (((((_h) + ((1UL<<(_s))-1)) & ~((1UL<<(_s))-1)) - \
455 ((_l) & ~((1UL<<(_s))-1))) >> (_s))
456 if ( (1 + /* # L4 */
457 NR(v_start, v_end, L4_PAGETABLE_SHIFT) + /* # L3 */
458 (!is_pv_32on64_domain(d) ?
459 NR(v_start, v_end, L3_PAGETABLE_SHIFT) : /* # L2 */
460 4) + /* # compat L2 */
461 NR(v_start, v_end, L2_PAGETABLE_SHIFT)) /* # L1 */
462 <= nr_pt_pages )
463 break;
464 #endif
465 }
467 order = get_order_from_bytes(v_end - v_start);
468 if ( (1UL << order) > nr_pages )
469 panic("Domain 0 allocation is too small for kernel image.\n");
471 #ifdef __i386__
472 /* Ensure that our low-memory 1:1 mapping covers the allocation. */
473 page = alloc_domheap_pages(d, order, MEMF_bits(30));
474 #else
475 if ( parms.p2m_base != UNSET_ADDR )
476 {
477 vphysmap_start = parms.p2m_base;
478 vphysmap_end = vphysmap_start + nr_pages * sizeof(unsigned long);
479 }
480 page = alloc_domheap_pages(d, order, 0);
481 #endif
482 if ( page == NULL )
483 panic("Not enough RAM for domain 0 allocation.\n");
484 alloc_spfn = page_to_mfn(page);
485 alloc_epfn = alloc_spfn + d->tot_pages;
487 printk("PHYSICAL MEMORY ARRANGEMENT:\n"
488 " Dom0 alloc.: %"PRIpaddr"->%"PRIpaddr,
489 pfn_to_paddr(alloc_spfn), pfn_to_paddr(alloc_epfn));
490 if ( d->tot_pages < nr_pages )
491 printk(" (%lu pages to be allocated)",
492 nr_pages - d->tot_pages);
493 printk("\nVIRTUAL MEMORY ARRANGEMENT:\n"
494 " Loaded kernel: %p->%p\n"
495 " Init. ramdisk: %p->%p\n"
496 " Phys-Mach map: %p->%p\n"
497 " Start info: %p->%p\n"
498 " Page tables: %p->%p\n"
499 " Boot stack: %p->%p\n"
500 " TOTAL: %p->%p\n",
501 _p(vkern_start), _p(vkern_end),
502 _p(vinitrd_start), _p(vinitrd_end),
503 _p(vphysmap_start), _p(vphysmap_end),
504 _p(vstartinfo_start), _p(vstartinfo_end),
505 _p(vpt_start), _p(vpt_end),
506 _p(vstack_start), _p(vstack_end),
507 _p(v_start), _p(v_end));
508 printk(" ENTRY ADDRESS: %p\n", _p(parms.virt_entry));
510 mpt_alloc = (vpt_start - v_start) +
511 (unsigned long)pfn_to_paddr(alloc_spfn);
513 #if defined(__i386__)
514 /*
515 * Protect the lowest 1GB of memory. We use a temporary mapping there
516 * from which we copy the kernel and ramdisk images.
517 */
518 if ( v_start < (1UL<<30) )
519 {
520 printk("Initial loading isn't allowed to lowest 1GB of memory.\n");
521 return -EINVAL;
522 }
524 /* WARNING: The new domain must have its 'processor' field filled in! */
525 l3start = l3tab = (l3_pgentry_t *)mpt_alloc; mpt_alloc += PAGE_SIZE;
526 l2start = l2tab = (l2_pgentry_t *)mpt_alloc; mpt_alloc += 4*PAGE_SIZE;
527 for (i = 0; i < L3_PAGETABLE_ENTRIES; i++) {
528 copy_page(l2tab + i * L2_PAGETABLE_ENTRIES,
529 idle_pg_table_l2 + i * L2_PAGETABLE_ENTRIES);
530 l3tab[i] = l3e_from_paddr((u32)l2tab + i*PAGE_SIZE, L3_PROT);
531 l2tab[(LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT)+i] =
532 l2e_from_paddr((u32)l2tab + i*PAGE_SIZE, __PAGE_HYPERVISOR);
533 }
534 v->arch.guest_table = pagetable_from_paddr((unsigned long)l3start);
536 for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
537 l2tab[l2_linear_offset(PERDOMAIN_VIRT_START) + i] =
538 l2e_from_page(perdomain_pt_page(d, i), __PAGE_HYPERVISOR);
540 l2tab += l2_linear_offset(v_start);
541 mfn = alloc_spfn;
542 for ( count = 0; count < ((v_end-v_start)>>PAGE_SHIFT); count++ )
543 {
544 if ( !((unsigned long)l1tab & (PAGE_SIZE-1)) )
545 {
546 l1start = l1tab = (l1_pgentry_t *)mpt_alloc;
547 mpt_alloc += PAGE_SIZE;
548 *l2tab = l2e_from_paddr((unsigned long)l1start, L2_PROT);
549 l2tab++;
550 clear_page(l1tab);
551 if ( count == 0 )
552 l1tab += l1_table_offset(v_start);
553 }
554 *l1tab = l1e_from_pfn(mfn, L1_PROT);
555 l1tab++;
557 page = mfn_to_page(mfn);
558 if ( !get_page_and_type(page, d, PGT_writable_page) )
559 BUG();
561 mfn++;
562 }
564 /* Pages that are part of page tables must be read only. */
565 l2tab = l2start + l2_linear_offset(vpt_start);
566 l1start = l1tab = (l1_pgentry_t *)(u32)l2e_get_paddr(*l2tab);
567 l1tab += l1_table_offset(vpt_start);
568 for ( count = 0; count < nr_pt_pages; count++ )
569 {
570 page = mfn_to_page(l1e_get_pfn(*l1tab));
571 if ( !opt_dom0_shadow )
572 l1e_remove_flags(*l1tab, _PAGE_RW);
573 else
574 if ( !get_page_type(page, PGT_writable_page) )
575 BUG();
577 switch ( count )
578 {
579 case 0:
580 page->u.inuse.type_info &= ~PGT_type_mask;
581 page->u.inuse.type_info |= PGT_l3_page_table;
582 get_page(page, d); /* an extra ref because of readable mapping */
584 /* Get another ref to L3 page so that it can be pinned. */
585 page->u.inuse.type_info++;
586 page->count_info++;
587 set_bit(_PGT_pinned, &page->u.inuse.type_info);
588 break;
589 case 1 ... 4:
590 page->u.inuse.type_info &= ~PGT_type_mask;
591 page->u.inuse.type_info |= PGT_l2_page_table;
592 if ( count == 4 )
593 page->u.inuse.type_info |= PGT_pae_xen_l2;
594 get_page(page, d); /* an extra ref because of readable mapping */
595 break;
596 default:
597 page->u.inuse.type_info &= ~PGT_type_mask;
598 page->u.inuse.type_info |= PGT_l1_page_table;
599 get_page(page, d); /* an extra ref because of readable mapping */
600 break;
601 }
602 if ( !((unsigned long)++l1tab & (PAGE_SIZE - 1)) )
603 l1start = l1tab = (l1_pgentry_t *)(u32)l2e_get_paddr(*++l2tab);
604 }
606 #elif defined(__x86_64__)
608 /* Overlap with Xen protected area? */
609 if ( !is_pv_32on64_domain(d) ?
610 ((v_start < HYPERVISOR_VIRT_END) &&
611 (v_end > HYPERVISOR_VIRT_START)) :
612 (v_end > HYPERVISOR_COMPAT_VIRT_START(d)) )
613 {
614 printk("DOM0 image overlaps with Xen private area.\n");
615 return -EINVAL;
616 }
618 if ( is_pv_32on64_domain(d) )
619 {
620 v->arch.guest_context.failsafe_callback_cs = FLAT_COMPAT_KERNEL_CS;
621 v->arch.guest_context.event_callback_cs = FLAT_COMPAT_KERNEL_CS;
622 }
624 /* WARNING: The new domain must have its 'processor' field filled in! */
625 if ( !is_pv_32on64_domain(d) )
626 {
627 maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l4_page_table;
628 l4start = l4tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
629 }
630 else
631 {
632 page = alloc_domheap_page(NULL, 0);
633 if ( !page )
634 panic("Not enough RAM for domain 0 PML4.\n");
635 page->u.inuse.type_info = PGT_l4_page_table|PGT_validated|1;
636 l4start = l4tab = page_to_virt(page);
637 }
638 copy_page(l4tab, idle_pg_table);
639 l4tab[0] = l4e_empty(); /* zap trampoline mapping */
640 l4tab[l4_table_offset(LINEAR_PT_VIRT_START)] =
641 l4e_from_paddr(__pa(l4start), __PAGE_HYPERVISOR);
642 l4tab[l4_table_offset(PERDOMAIN_VIRT_START)] =
643 l4e_from_paddr(__pa(d->arch.mm_perdomain_l3), __PAGE_HYPERVISOR);
644 v->arch.guest_table = pagetable_from_paddr(__pa(l4start));
645 if ( is_pv_32on64_domain(d) )
646 v->arch.guest_table_user = v->arch.guest_table;
648 l4tab += l4_table_offset(v_start);
649 mfn = alloc_spfn;
650 for ( count = 0; count < ((v_end-v_start)>>PAGE_SHIFT); count++ )
651 {
652 if ( !((unsigned long)l1tab & (PAGE_SIZE-1)) )
653 {
654 maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l1_page_table;
655 l1start = l1tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
656 clear_page(l1tab);
657 if ( count == 0 )
658 l1tab += l1_table_offset(v_start);
659 if ( !((unsigned long)l2tab & (PAGE_SIZE-1)) )
660 {
661 maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l2_page_table;
662 l2start = l2tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
663 clear_page(l2tab);
664 if ( count == 0 )
665 l2tab += l2_table_offset(v_start);
666 if ( !((unsigned long)l3tab & (PAGE_SIZE-1)) )
667 {
668 maddr_to_page(mpt_alloc)->u.inuse.type_info =
669 PGT_l3_page_table;
670 l3start = l3tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
671 clear_page(l3tab);
672 if ( count == 0 )
673 l3tab += l3_table_offset(v_start);
674 *l4tab = l4e_from_paddr(__pa(l3start), L4_PROT);
675 l4tab++;
676 }
677 *l3tab = l3e_from_paddr(__pa(l2start), L3_PROT);
678 l3tab++;
679 }
680 *l2tab = l2e_from_paddr(__pa(l1start), L2_PROT);
681 l2tab++;
682 }
683 *l1tab = l1e_from_pfn(mfn, (!is_pv_32on64_domain(d) ?
684 L1_PROT : COMPAT_L1_PROT));
685 l1tab++;
687 page = mfn_to_page(mfn);
688 if ( (page->u.inuse.type_info == 0) &&
689 !get_page_and_type(page, d, PGT_writable_page) )
690 BUG();
692 mfn++;
693 }
695 if ( is_pv_32on64_domain(d) )
696 {
697 /* Ensure the first four L3 entries are all populated. */
698 for ( i = 0, l3tab = l3start; i < 4; ++i, ++l3tab )
699 {
700 if ( !l3e_get_intpte(*l3tab) )
701 {
702 maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l2_page_table;
703 l2tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
704 clear_page(l2tab);
705 *l3tab = l3e_from_paddr(__pa(l2tab), L3_PROT);
706 }
707 if ( i == 3 )
708 l3e_get_page(*l3tab)->u.inuse.type_info |= PGT_pae_xen_l2;
709 }
710 /* Install read-only guest visible MPT mapping. */
711 l2tab = l3e_to_l2e(l3start[3]);
712 memcpy(&l2tab[COMPAT_L2_PAGETABLE_FIRST_XEN_SLOT(d)],
713 &compat_idle_pg_table_l2[l2_table_offset(HIRO_COMPAT_MPT_VIRT_START)],
714 COMPAT_L2_PAGETABLE_XEN_SLOTS(d) * sizeof(*l2tab));
715 }
717 /* Pages that are part of page tables must be read only. */
718 l4tab = l4start + l4_table_offset(vpt_start);
719 l3start = l3tab = l4e_to_l3e(*l4tab);
720 l3tab += l3_table_offset(vpt_start);
721 l2start = l2tab = l3e_to_l2e(*l3tab);
722 l2tab += l2_table_offset(vpt_start);
723 l1start = l1tab = l2e_to_l1e(*l2tab);
724 l1tab += l1_table_offset(vpt_start);
725 for ( count = 0; count < nr_pt_pages; count++ )
726 {
727 l1e_remove_flags(*l1tab, _PAGE_RW);
728 page = mfn_to_page(l1e_get_pfn(*l1tab));
730 /* Read-only mapping + PGC_allocated + page-table page. */
731 page->count_info = PGC_allocated | 3;
732 page->u.inuse.type_info |= PGT_validated | 1;
734 /* Top-level p.t. is pinned. */
735 if ( (page->u.inuse.type_info & PGT_type_mask) ==
736 (!is_pv_32on64_domain(d) ?
737 PGT_l4_page_table : PGT_l3_page_table) )
738 {
739 page->count_info += 1;
740 page->u.inuse.type_info += 1 | PGT_pinned;
741 }
743 /* Iterate. */
744 if ( !((unsigned long)++l1tab & (PAGE_SIZE - 1)) )
745 {
746 if ( !((unsigned long)++l2tab & (PAGE_SIZE - 1)) )
747 {
748 if ( !((unsigned long)++l3tab & (PAGE_SIZE - 1)) )
749 l3start = l3tab = l4e_to_l3e(*++l4tab);
750 l2start = l2tab = l3e_to_l2e(*l3tab);
751 }
752 l1start = l1tab = l2e_to_l1e(*l2tab);
753 }
754 }
756 #endif /* __x86_64__ */
758 /* Mask all upcalls... */
759 for ( i = 0; i < XEN_LEGACY_MAX_VCPUS; i++ )
760 shared_info(d, vcpu_info[i].evtchn_upcall_mask) = 1;
762 printk("Dom0 has maximum %u VCPUs\n", opt_dom0_max_vcpus);
764 for ( i = 1; i < opt_dom0_max_vcpus; i++ )
765 (void)alloc_vcpu(d, i, i % num_online_cpus());
767 /* Set up CR3 value for write_ptbase */
768 if ( paging_mode_enabled(d) )
769 paging_update_paging_modes(v);
770 else
771 update_cr3(v);
773 /* We run on dom0's page tables for the final part of the build process. */
774 write_ptbase(v);
776 /* Copy the OS image and free temporary buffer. */
777 elf.dest = (void*)vkern_start;
778 elf_load_binary(&elf);
780 if ( UNSET_ADDR != parms.virt_hypercall )
781 {
782 if ( (parms.virt_hypercall < v_start) ||
783 (parms.virt_hypercall >= v_end) )
784 {
785 write_ptbase(current);
786 printk("Invalid HYPERCALL_PAGE field in ELF notes.\n");
787 return -1;
788 }
789 hypercall_page_initialise(
790 d, (void *)(unsigned long)parms.virt_hypercall);
791 }
793 /* Copy the initial ramdisk. */
794 if ( initrd_len != 0 )
795 memcpy((void *)vinitrd_start, initrd_start, initrd_len);
797 /* Free temporary buffers. */
798 discard_initial_images();
800 /* Set up start info area. */
801 si = (start_info_t *)vstartinfo_start;
802 clear_page(si);
803 si->nr_pages = nr_pages;
805 si->shared_info = virt_to_maddr(d->shared_info);
807 si->flags = SIF_PRIVILEGED | SIF_INITDOMAIN;
808 si->flags |= (xen_processor_pmbits << 8) & SIF_PM_MASK;
809 si->pt_base = vpt_start + 2 * PAGE_SIZE * !!is_pv_32on64_domain(d);
810 si->nr_pt_frames = nr_pt_pages;
811 si->mfn_list = vphysmap_start;
812 snprintf(si->magic, sizeof(si->magic), "xen-3.0-x86_%d%s",
813 elf_64bit(&elf) ? 64 : 32, parms.pae ? "p" : "");
815 count = d->tot_pages;
816 #ifdef __x86_64__
817 /* Set up the phys->machine table if not part of the initial mapping. */
818 if ( parms.p2m_base != UNSET_ADDR )
819 {
820 unsigned long va = vphysmap_start;
822 if ( v_start <= vphysmap_end && vphysmap_start <= v_end )
823 panic("DOM0 P->M table overlaps initial mapping");
825 while ( va < vphysmap_end )
826 {
827 if ( d->tot_pages + ((round_pgup(vphysmap_end) - va)
828 >> PAGE_SHIFT) + 3 > nr_pages )
829 panic("Dom0 allocation too small for initial P->M table.\n");
831 l4tab = l4start + l4_table_offset(va);
832 if ( !l4e_get_intpte(*l4tab) )
833 {
834 page = alloc_domheap_page(d, 0);
835 if ( !page )
836 break;
837 /* No mapping, PGC_allocated + page-table page. */
838 page->count_info = PGC_allocated | 2;
839 page->u.inuse.type_info =
840 PGT_l3_page_table | PGT_validated | 1;
841 clear_page(page_to_virt(page));
842 *l4tab = l4e_from_page(page, L4_PROT);
843 }
844 l3tab = page_to_virt(l4e_get_page(*l4tab));
845 l3tab += l3_table_offset(va);
846 if ( !l3e_get_intpte(*l3tab) )
847 {
848 if ( cpu_has_page1gb &&
849 !(va & ((1UL << L3_PAGETABLE_SHIFT) - 1)) &&
850 vphysmap_end >= va + (1UL << L3_PAGETABLE_SHIFT) &&
851 (page = alloc_domheap_pages(d,
852 L3_PAGETABLE_SHIFT -
853 PAGE_SHIFT,
854 0)) != NULL )
855 {
856 *l3tab = l3e_from_page(page,
857 L1_PROT|_PAGE_DIRTY|_PAGE_PSE);
858 va += 1UL << L3_PAGETABLE_SHIFT;
859 continue;
860 }
861 if ( (page = alloc_domheap_page(d, 0)) == NULL )
862 break;
863 else
864 {
865 /* No mapping, PGC_allocated + page-table page. */
866 page->count_info = PGC_allocated | 2;
867 page->u.inuse.type_info =
868 PGT_l2_page_table | PGT_validated | 1;
869 clear_page(page_to_virt(page));
870 *l3tab = l3e_from_page(page, L3_PROT);
871 }
872 }
873 l2tab = page_to_virt(l3e_get_page(*l3tab));
874 l2tab += l2_table_offset(va);
875 if ( !l2e_get_intpte(*l2tab) )
876 {
877 if ( !(va & ((1UL << L2_PAGETABLE_SHIFT) - 1)) &&
878 vphysmap_end >= va + (1UL << L2_PAGETABLE_SHIFT) &&
879 (page = alloc_domheap_pages(d,
880 L2_PAGETABLE_SHIFT -
881 PAGE_SHIFT,
882 0)) != NULL )
883 {
884 *l2tab = l2e_from_page(page,
885 L1_PROT|_PAGE_DIRTY|_PAGE_PSE);
886 va += 1UL << L2_PAGETABLE_SHIFT;
887 continue;
888 }
889 if ( (page = alloc_domheap_page(d, 0)) == NULL )
890 break;
891 else
892 {
893 /* No mapping, PGC_allocated + page-table page. */
894 page->count_info = PGC_allocated | 2;
895 page->u.inuse.type_info =
896 PGT_l1_page_table | PGT_validated | 1;
897 clear_page(page_to_virt(page));
898 *l2tab = l2e_from_page(page, L2_PROT);
899 }
900 }
901 l1tab = page_to_virt(l2e_get_page(*l2tab));
902 l1tab += l1_table_offset(va);
903 BUG_ON(l1e_get_intpte(*l1tab));
904 page = alloc_domheap_page(d, 0);
905 if ( !page )
906 break;
907 *l1tab = l1e_from_page(page, L1_PROT|_PAGE_DIRTY);
908 va += PAGE_SIZE;
909 va &= PAGE_MASK;
910 }
911 if ( !page )
912 panic("Not enough RAM for DOM0 P->M table.\n");
913 }
914 #endif
916 /* Write the phys->machine and machine->phys table entries. */
917 for ( pfn = 0; pfn < count; pfn++ )
918 {
919 mfn = pfn + alloc_spfn;
920 #ifndef NDEBUG
921 #define REVERSE_START ((v_end - v_start) >> PAGE_SHIFT)
922 if ( pfn > REVERSE_START )
923 mfn = alloc_epfn - (pfn - REVERSE_START);
924 #endif
925 if ( !is_pv_32on64_domain(d) )
926 ((unsigned long *)vphysmap_start)[pfn] = mfn;
927 else
928 ((unsigned int *)vphysmap_start)[pfn] = mfn;
929 set_gpfn_from_mfn(mfn, pfn);
930 }
931 si->first_p2m_pfn = pfn;
932 si->nr_p2m_frames = d->tot_pages - count;
933 page_list_for_each ( page, &d->page_list )
934 {
935 mfn = page_to_mfn(page);
936 if ( get_gpfn_from_mfn(mfn) >= count )
937 {
938 BUG_ON(is_pv_32bit_domain(d));
939 if ( !page->u.inuse.type_info &&
940 !get_page_and_type(page, d, PGT_writable_page) )
941 BUG();
942 ((unsigned long *)vphysmap_start)[pfn] = mfn;
943 set_gpfn_from_mfn(mfn, pfn);
944 ++pfn;
945 #ifndef NDEBUG
946 ++alloc_epfn;
947 #endif
948 }
949 }
950 BUG_ON(pfn != d->tot_pages);
951 while ( pfn < nr_pages )
952 {
953 if ( (page = alloc_chunk(d, nr_pages - d->tot_pages)) == NULL )
954 panic("Not enough RAM for DOM0 reservation.\n");
955 while ( pfn < d->tot_pages )
956 {
957 mfn = page_to_mfn(page);
958 #ifndef NDEBUG
959 #define pfn (nr_pages - 1 - (pfn - (alloc_epfn - alloc_spfn)))
960 #endif
961 if ( !is_pv_32on64_domain(d) )
962 ((unsigned long *)vphysmap_start)[pfn] = mfn;
963 else
964 ((unsigned int *)vphysmap_start)[pfn] = mfn;
965 set_gpfn_from_mfn(mfn, pfn);
966 #undef pfn
967 page++; pfn++;
968 }
969 }
971 if ( initrd_len != 0 )
972 {
973 si->mod_start = vinitrd_start;
974 si->mod_len = initrd_len;
975 }
977 memset(si->cmd_line, 0, sizeof(si->cmd_line));
978 if ( cmdline != NULL )
979 strlcpy((char *)si->cmd_line, cmdline, sizeof(si->cmd_line));
981 if ( fill_console_start_info((void *)(si + 1)) )
982 {
983 si->console.dom0.info_off = sizeof(struct start_info);
984 si->console.dom0.info_size = sizeof(struct dom0_vga_console_info);
985 }
987 #if defined(__x86_64__)
988 if ( is_pv_32on64_domain(d) )
989 xlat_start_info(si, XLAT_start_info_console_dom0);
990 #endif
992 /* Return to idle domain's page tables. */
993 write_ptbase(current);
995 #if defined(__i386__)
996 /* Destroy low mappings - they were only for our convenience. */
997 zap_low_mappings(l2start);
998 #endif
1000 update_domain_wallclock_time(d);
1002 v->is_initialised = 1;
1003 clear_bit(_VPF_down, &v->pause_flags);
1005 /*
1006 * Initial register values:
1007 * DS,ES,FS,GS = FLAT_KERNEL_DS
1008 * CS:EIP = FLAT_KERNEL_CS:start_pc
1009 * SS:ESP = FLAT_KERNEL_SS:start_stack
1010 * ESI = start_info
1011 * [EAX,EBX,ECX,EDX,EDI,EBP are zero]
1012 */
1013 regs = &v->arch.guest_context.user_regs;
1014 regs->ds = regs->es = regs->fs = regs->gs =
1015 !is_pv_32on64_domain(d) ? FLAT_KERNEL_DS : FLAT_COMPAT_KERNEL_DS;
1016 regs->ss = (!is_pv_32on64_domain(d) ?
1017 FLAT_KERNEL_SS : FLAT_COMPAT_KERNEL_SS);
1018 regs->cs = (!is_pv_32on64_domain(d) ?
1019 FLAT_KERNEL_CS : FLAT_COMPAT_KERNEL_CS);
1020 regs->eip = parms.virt_entry;
1021 regs->esp = vstack_end;
1022 regs->esi = vstartinfo_start;
1023 regs->eflags = X86_EFLAGS_IF;
1025 if ( opt_dom0_shadow )
1026 if ( paging_enable(d, PG_SH_enable) == 0 )
1027 paging_update_paging_modes(v);
1029 if ( supervisor_mode_kernel )
1030 {
1031 v->arch.guest_context.kernel_ss &= ~3;
1032 v->arch.guest_context.user_regs.ss &= ~3;
1033 v->arch.guest_context.user_regs.es &= ~3;
1034 v->arch.guest_context.user_regs.ds &= ~3;
1035 v->arch.guest_context.user_regs.fs &= ~3;
1036 v->arch.guest_context.user_regs.gs &= ~3;
1037 printk("Dom0 runs in ring 0 (supervisor mode)\n");
1038 if ( !test_bit(XENFEAT_supervisor_mode_kernel,
1039 parms.f_supported) )
1040 panic("Dom0 does not support supervisor-mode execution\n");
1041 }
1042 else
1043 {
1044 if ( test_bit(XENFEAT_supervisor_mode_kernel, parms.f_required) )
1045 panic("Dom0 requires supervisor-mode execution\n");
1046 }
1048 rc = 0;
1050 /* DOM0 is permitted full I/O capabilities. */
1051 rc |= ioports_permit_access(dom0, 0, 0xFFFF);
1052 rc |= iomem_permit_access(dom0, 0UL, ~0UL);
1053 rc |= irqs_permit_access(dom0, 0, d->nr_pirqs - 1);
1055 /*
1056 * Modify I/O port access permissions.
1057 */
1058 /* Master Interrupt Controller (PIC). */
1059 rc |= ioports_deny_access(dom0, 0x20, 0x21);
1060 /* Slave Interrupt Controller (PIC). */
1061 rc |= ioports_deny_access(dom0, 0xA0, 0xA1);
1062 /* Interval Timer (PIT). */
1063 rc |= ioports_deny_access(dom0, 0x40, 0x43);
1064 /* PIT Channel 2 / PC Speaker Control. */
1065 rc |= ioports_deny_access(dom0, 0x61, 0x61);
1066 /* PCI configuration space (NB. 0xcf8 has special treatment). */
1067 rc |= ioports_deny_access(dom0, 0xcfc, 0xcff);
1068 /* Command-line I/O ranges. */
1069 process_dom0_ioports_disable();
1071 /*
1072 * Modify I/O memory access permissions.
1073 */
1074 /* Local APIC. */
1075 if ( mp_lapic_addr != 0 )
1076 {
1077 mfn = paddr_to_pfn(mp_lapic_addr);
1078 rc |= iomem_deny_access(dom0, mfn, mfn);
1079 }
1080 /* I/O APICs. */
1081 for ( i = 0; i < nr_ioapics; i++ )
1082 {
1083 mfn = paddr_to_pfn(mp_ioapics[i].mpc_apicaddr);
1084 if ( smp_found_config )
1085 rc |= iomem_deny_access(dom0, mfn, mfn);
1086 }
1088 /* Remove access to E820_UNUSABLE I/O regions above 1MB. */
1089 for ( i = 0; i < e820.nr_map; i++ )
1090 {
1091 unsigned long sfn, efn;
1092 sfn = max_t(unsigned long, paddr_to_pfn(e820.map[i].addr), 0x100ul);
1093 efn = paddr_to_pfn(e820.map[i].addr + e820.map[i].size - 1);
1094 if ( (e820.map[i].type == E820_UNUSABLE) &&
1095 (e820.map[i].size != 0) &&
1096 (sfn <= efn) )
1097 rc |= iomem_deny_access(dom0, sfn, efn);
1098 }
1100 BUG_ON(rc != 0);
1102 return 0;
1103 }
1105 /*
1106 * Local variables:
1107 * mode: C
1108 * c-set-style: "BSD"
1109 * c-basic-offset: 4
1110 * tab-width: 4
1111 * indent-tabs-mode: nil
1112 * End:
1113 */