ia64/xen-unstable: xen/arch/x86/x86_64/mm.c @ 19098:20f94c2757b8

x86-64: also use 1G page mappings for M2P table

Also, specify the node for the respective page table allocations.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Author:   Keir Fraser <keir.fraser@citrix.com>
Date:     Tue Jan 27 11:23:30 2009 +0000
Parent:   39517e863cc8
Child:    67a0ffade665

/******************************************************************************
 * arch/x86/x86_64/mm.c
 *
 * Modifications to Linux original are copyright (c) 2004, K A Fraser
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the Free
 * Software Foundation; either version 2 of the License, or (at your option)
 * any later version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc., 59
 * Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

#include <xen/config.h>
#include <xen/lib.h>
#include <xen/init.h>
#include <xen/mm.h>
#include <xen/sched.h>
#include <xen/guest_access.h>
#include <asm/current.h>
#include <asm/asm_defns.h>
#include <asm/page.h>
#include <asm/flushtlb.h>
#include <asm/fixmap.h>
#include <asm/hypercall.h>
#include <asm/msr.h>
#include <asm/numa.h>
#include <public/memory.h>

#ifdef CONFIG_COMPAT
unsigned int m2p_compat_vstart = __HYPERVISOR_COMPAT_VIRT_START;
#endif

DEFINE_PER_CPU(char, compat_arg_xlat[COMPAT_ARG_XLAT_SIZE]);

/* Top-level master (and idle-domain) page directory. */
l4_pgentry_t __attribute__ ((__section__ (".bss.page_aligned")))
    idle_pg_table[L4_PAGETABLE_ENTRIES];

/* Enough page directories to map bottom 4GB of the memory map. */
l3_pgentry_t __attribute__ ((__section__ (".bss.page_aligned")))
    l3_identmap[L3_PAGETABLE_ENTRIES];
l2_pgentry_t __attribute__ ((__section__ (".bss.page_aligned")))
    l2_identmap[4*L2_PAGETABLE_ENTRIES];

/* Enough page directories to map the Xen text and static data. */
l3_pgentry_t __attribute__ ((__section__ (".bss.page_aligned")))
    l3_xenmap[L3_PAGETABLE_ENTRIES];
l2_pgentry_t __attribute__ ((__section__ (".bss.page_aligned")))
    l2_xenmap[L2_PAGETABLE_ENTRIES];
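
/*
 * Allocate one page for use as a Xen page table. After early boot the page
 * comes from the domheap; during early boot it is taken from the boot
 * allocator instead.
 */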
void *alloc_xen_pagetable(void)
{
    extern int early_boot;
    unsigned long mfn;

    if ( !early_boot )
    {
        struct page_info *pg = alloc_domheap_page(NULL, 0);
        BUG_ON(pg == NULL);
        return page_to_virt(pg);
    }

    mfn = alloc_boot_pages(1, 1);
    BUG_ON(mfn == 0);
    return mfn_to_virt(mfn);
}
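
/*
 * Return a pointer to the Xen L3 entry covering virtual address v in the
 * idle page tables, allocating (and hooking up) an intermediate L3 table
 * if none is present yet.
 */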
l3_pgentry_t *virt_to_xen_l3e(unsigned long v)
{
    l4_pgentry_t *pl4e;

    pl4e = &idle_pg_table[l4_table_offset(v)];
    if ( !(l4e_get_flags(*pl4e) & _PAGE_PRESENT) )
    {
        l3_pgentry_t *pl3e = alloc_xen_pagetable();
        clear_page(pl3e);
        l4e_write(pl4e, l4e_from_paddr(__pa(pl3e), __PAGE_HYPERVISOR));
    }

    return l4e_to_l3e(*pl4e) + l3_table_offset(v);
}
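
/*
 * Likewise for the L2 entry covering v, allocating a missing L2 table as
 * needed. 1GB (PSE) L3 entries are not expected on this path.
 */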
l2_pgentry_t *virt_to_xen_l2e(unsigned long v)
{
    l3_pgentry_t *pl3e;

    pl3e = virt_to_xen_l3e(v);
    if ( !(l3e_get_flags(*pl3e) & _PAGE_PRESENT) )
    {
        l2_pgentry_t *pl2e = alloc_xen_pagetable();
        clear_page(pl2e);
        l3e_write(pl3e, l3e_from_paddr(__pa(pl2e), __PAGE_HYPERVISOR));
    }

    BUG_ON(l3e_get_flags(*pl3e) & _PAGE_PSE);
    return l3e_to_l2e(*pl3e) + l2_table_offset(v);
}
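
/*
 * Build the machine-to-phys (M2P) mappings: the read/write Xen mapping,
 * the read-only mapping exposed to guests (using 1GB pages where the CPU
 * supports them), and the compatibility-mode table for 32-on-64 guests,
 * allocating the backing pages from the NUMA node they describe.
 */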
void __init paging_init(void)
{
    unsigned long i, mpt_size, va;
    unsigned int memflags;
    l3_pgentry_t *l3_ro_mpt;
    l2_pgentry_t *l2_ro_mpt = NULL;
    struct page_info *l1_pg, *l2_pg, *l3_pg;

    /* Create user-accessible L2 directory to map the MPT for guests. */
    if ( (l3_pg = alloc_domheap_page(NULL, 0)) == NULL )
        goto nomem;
    l3_ro_mpt = page_to_virt(l3_pg);
    clear_page(l3_ro_mpt);
    l4e_write(&idle_pg_table[l4_table_offset(RO_MPT_VIRT_START)],
              l4e_from_page(l3_pg, __PAGE_HYPERVISOR | _PAGE_USER));

    /*
     * Allocate and map the machine-to-phys table.
     * This also ensures L3 is present for fixmaps.
     */
    mpt_size = (max_page * BYTES_PER_LONG) + (1UL << L2_PAGETABLE_SHIFT) - 1;
    mpt_size &= ~((1UL << L2_PAGETABLE_SHIFT) - 1UL);
    for ( i = 0; i < (mpt_size >> L2_PAGETABLE_SHIFT); i++ )
    {
        BUILD_BUG_ON(RO_MPT_VIRT_START & ((1UL << L3_PAGETABLE_SHIFT) - 1));
        va = RO_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT);
        memflags = MEMF_node(phys_to_nid(i <<
                                 (L2_PAGETABLE_SHIFT - 3 + PAGE_SHIFT)));

        if ( cpu_has_page1gb &&
             !((unsigned long)l2_ro_mpt & ~PAGE_MASK) &&
             (mpt_size >> L3_PAGETABLE_SHIFT) > (i >> PAGETABLE_ORDER) &&
             (l1_pg = alloc_domheap_pages(NULL, 2 * PAGETABLE_ORDER,
                                          memflags)) != NULL )
        {
            map_pages_to_xen(
                RDWR_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT),
                page_to_mfn(l1_pg),
                1UL << (2 * PAGETABLE_ORDER),
                PAGE_HYPERVISOR);
            memset((void *)(RDWR_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT)),
                   0x77, 1UL << L3_PAGETABLE_SHIFT);

            ASSERT(!l2_table_offset(va));
            /* NB. Cannot be GLOBAL as shadow_mode_translate reuses this area. */
            l3e_write(&l3_ro_mpt[l3_table_offset(va)],
                      l3e_from_page(l1_pg,
                          /*_PAGE_GLOBAL|*/_PAGE_PSE|_PAGE_USER|_PAGE_PRESENT));
            i += (1UL << PAGETABLE_ORDER) - 1;
            continue;
        }

        if ( (l1_pg = alloc_domheap_pages(NULL, PAGETABLE_ORDER,
                                          memflags)) == NULL )
            goto nomem;
        map_pages_to_xen(
            RDWR_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT),
            page_to_mfn(l1_pg),
            1UL << PAGETABLE_ORDER,
            PAGE_HYPERVISOR);
        memset((void *)(RDWR_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT)), 0x55,
               1UL << L2_PAGETABLE_SHIFT);
        if ( !((unsigned long)l2_ro_mpt & ~PAGE_MASK) )
        {
            if ( (l2_pg = alloc_domheap_page(NULL, memflags)) == NULL )
                goto nomem;
            l2_ro_mpt = page_to_virt(l2_pg);
            clear_page(l2_ro_mpt);
            l3e_write(&l3_ro_mpt[l3_table_offset(va)],
                      l3e_from_page(l2_pg, __PAGE_HYPERVISOR | _PAGE_USER));
            ASSERT(!l2_table_offset(va));
        }
        /* NB. Cannot be GLOBAL as shadow_mode_translate reuses this area. */
        l2e_write(l2_ro_mpt, l2e_from_page(
            l1_pg, /*_PAGE_GLOBAL|*/_PAGE_PSE|_PAGE_USER|_PAGE_PRESENT));
        l2_ro_mpt++;
    }

    /* Create user-accessible L2 directory to map the MPT for compat guests. */
    BUILD_BUG_ON(l4_table_offset(RDWR_MPT_VIRT_START) !=
                 l4_table_offset(HIRO_COMPAT_MPT_VIRT_START));
    l3_ro_mpt = l4e_to_l3e(idle_pg_table[l4_table_offset(
        HIRO_COMPAT_MPT_VIRT_START)]);
    if ( (l2_pg = alloc_domheap_page(NULL, 0)) == NULL )
        goto nomem;
    compat_idle_pg_table_l2 = l2_ro_mpt = page_to_virt(l2_pg);
    clear_page(l2_ro_mpt);
    l3e_write(&l3_ro_mpt[l3_table_offset(HIRO_COMPAT_MPT_VIRT_START)],
              l3e_from_page(l2_pg, __PAGE_HYPERVISOR));
    l2_ro_mpt += l2_table_offset(HIRO_COMPAT_MPT_VIRT_START);
    /* Allocate and map the compatibility mode machine-to-phys table. */
    mpt_size = (mpt_size >> 1) + (1UL << (L2_PAGETABLE_SHIFT - 1));
    if ( mpt_size > RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START )
        mpt_size = RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START;
    mpt_size &= ~((1UL << L2_PAGETABLE_SHIFT) - 1UL);
    if ( (m2p_compat_vstart + mpt_size) < MACH2PHYS_COMPAT_VIRT_END )
        m2p_compat_vstart = MACH2PHYS_COMPAT_VIRT_END - mpt_size;
    for ( i = 0; i < (mpt_size >> L2_PAGETABLE_SHIFT); i++ )
    {
        memflags = MEMF_node(phys_to_nid(i <<
                                 (L2_PAGETABLE_SHIFT - 2 + PAGE_SHIFT)));
        if ( (l1_pg = alloc_domheap_pages(NULL, PAGETABLE_ORDER,
                                          memflags)) == NULL )
            goto nomem;
        map_pages_to_xen(
            RDWR_COMPAT_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT),
            page_to_mfn(l1_pg),
            1UL << PAGETABLE_ORDER,
            PAGE_HYPERVISOR);
        memset((void *)(RDWR_COMPAT_MPT_VIRT_START +
                        (i << L2_PAGETABLE_SHIFT)),
               0x55,
               1UL << L2_PAGETABLE_SHIFT);
        /* NB. Cannot be GLOBAL as the ptes get copied into per-VM space. */
        l2e_write(l2_ro_mpt, l2e_from_page(l1_pg, _PAGE_PSE|_PAGE_PRESENT));
        l2_ro_mpt++;
    }

    /* Set up linear page table mapping. */
    l4e_write(&idle_pg_table[l4_table_offset(LINEAR_PT_VIRT_START)],
              l4e_from_paddr(__pa(idle_pg_table), __PAGE_HYPERVISOR));
    return;

 nomem:
    panic("Not enough memory for m2p table\n");
}

void __init setup_idle_pagetable(void)
{
    /* Install per-domain mappings for idle domain. */
    l4e_write(&idle_pg_table[l4_table_offset(PERDOMAIN_VIRT_START)],
              l4e_from_page(
                  virt_to_page(idle_vcpu[0]->domain->arch.mm_perdomain_l3),
                  __PAGE_HYPERVISOR));
}
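
/*
 * Drop the transient low 1:1 alias (PML4 entry 0) once only the boot CPU
 * is online, leaving just the boot trampoline pages mapped there.
 */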
void __init zap_low_mappings(void)
{
    BUG_ON(num_online_cpus() != 1);

    /* Remove aliased mapping of first 1:1 PML4 entry. */
    l4e_write(&idle_pg_table[0], l4e_empty());
    flush_local(FLUSH_TLB_GLOBAL);

    /* Replace with mapping of the boot trampoline only. */
    map_pages_to_xen(BOOT_TRAMPOLINE, BOOT_TRAMPOLINE >> PAGE_SHIFT,
                     0x10, __PAGE_HYPERVISOR);
}
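
/*
 * Mark the pages backing the M2P tables (both the native and the compat
 * ranges) as shareable read-only with privileged guests.
 */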
void __init subarch_init_memory(void)
{
    unsigned long i, n, v, m2p_start_mfn;
    l3_pgentry_t l3e;
    l2_pgentry_t l2e;

    BUILD_BUG_ON(RDWR_MPT_VIRT_START & ((1UL << L3_PAGETABLE_SHIFT) - 1));
    BUILD_BUG_ON(RDWR_MPT_VIRT_END & ((1UL << L3_PAGETABLE_SHIFT) - 1));
    /* M2P table is mappable read-only by privileged domains. */
    for ( v = RDWR_MPT_VIRT_START;
          v != RDWR_MPT_VIRT_END;
          v += n << PAGE_SHIFT )
    {
        n = L2_PAGETABLE_ENTRIES * L1_PAGETABLE_ENTRIES;
        l3e = l4e_to_l3e(idle_pg_table[l4_table_offset(v)])[
            l3_table_offset(v)];
        if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
            continue;
        if ( !(l3e_get_flags(l3e) & _PAGE_PSE) )
        {
            n = L1_PAGETABLE_ENTRIES;
            l2e = l3e_to_l2e(l3e)[l2_table_offset(v)];
            if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
                continue;
            m2p_start_mfn = l2e_get_pfn(l2e);
        }
        else
        {
            m2p_start_mfn = l3e_get_pfn(l3e);
        }

        for ( i = 0; i < n; i++ )
        {
            struct page_info *page = mfn_to_page(m2p_start_mfn + i);
            share_xen_page_with_privileged_guests(page, XENSHARE_readonly);
        }
    }

    for ( v = RDWR_COMPAT_MPT_VIRT_START;
          v != RDWR_COMPAT_MPT_VIRT_END;
          v += 1 << L2_PAGETABLE_SHIFT )
    {
        l3e = l4e_to_l3e(idle_pg_table[l4_table_offset(v)])[
            l3_table_offset(v)];
        if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
            continue;
        l2e = l3e_to_l2e(l3e)[l2_table_offset(v)];
        if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
            continue;
        m2p_start_mfn = l2e_get_pfn(l2e);

        for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
        {
            struct page_info *page = mfn_to_page(m2p_start_mfn + i);
            share_xen_page_with_privileged_guests(page, XENSHARE_readonly);
        }
    }
}
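
/*
 * x86-64 specific memory ops. Currently only XENMEM_machphys_mfn_list is
 * handled here: it hands back the MFNs backing the M2P table, one
 * 2MB-aligned extent at a time.
 */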
long subarch_memory_op(int op, XEN_GUEST_HANDLE(void) arg)
{
    struct xen_machphys_mfn_list xmml;
    l3_pgentry_t l3e;
    l2_pgentry_t l2e;
    unsigned long v;
    xen_pfn_t mfn;
    unsigned int i;
    long rc = 0;

    switch ( op )
    {
    case XENMEM_machphys_mfn_list:
        if ( copy_from_guest(&xmml, arg, 1) )
            return -EFAULT;

        BUILD_BUG_ON(RDWR_MPT_VIRT_START & ((1UL << L3_PAGETABLE_SHIFT) - 1));
        BUILD_BUG_ON(RDWR_MPT_VIRT_END & ((1UL << L3_PAGETABLE_SHIFT) - 1));
        for ( i = 0, v = RDWR_MPT_VIRT_START;
              (i != xmml.max_extents) && (v != RDWR_MPT_VIRT_END);
              i++, v += 1UL << L2_PAGETABLE_SHIFT )
        {
            l3e = l4e_to_l3e(idle_pg_table[l4_table_offset(v)])[
                l3_table_offset(v)];
            if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
                break;
            if ( !(l3e_get_flags(l3e) & _PAGE_PSE) )
            {
                l2e = l3e_to_l2e(l3e)[l2_table_offset(v)];
                if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
                    break;
                mfn = l2e_get_pfn(l2e);
            }
            else
            {
                mfn = l3e_get_pfn(l3e)
                    + (l2_table_offset(v) << PAGETABLE_ORDER);
            }
            ASSERT(!l1_table_offset(v));
            if ( copy_to_guest_offset(xmml.extent_start, i, &mfn, 1) )
                return -EFAULT;
        }

        xmml.nr_extents = i;
        if ( copy_to_guest(arg, &xmml, 1) )
            return -EFAULT;

        break;

    default:
        rc = -ENOSYS;
        break;
    }

    return rc;
}
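
/*
 * HYPERVISOR_stack_switch: fix up and record the kernel stack selector and
 * stack pointer a PV guest wants used on entry to its kernel.
 */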
long do_stack_switch(unsigned long ss, unsigned long esp)
{
    fixup_guest_stack_selector(current->domain, ss);
    current->arch.guest_context.kernel_ss = ss;
    current->arch.guest_context.kernel_sp = esp;
    return 0;
}
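
/*
 * HYPERVISOR_set_segment_base: update the FS or GS base MSRs (or load a
 * new user GS selector) on behalf of a 64-bit PV guest; base values are
 * also mirrored into the vcpu's saved guest context.
 */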
long do_set_segment_base(unsigned int which, unsigned long base)
{
    struct vcpu *v = current;
    long ret = 0;

    switch ( which )
    {
    case SEGBASE_FS:
        if ( wrmsr_safe(MSR_FS_BASE, base, base>>32) )
            ret = -EFAULT;
        else
            v->arch.guest_context.fs_base = base;
        break;

    case SEGBASE_GS_USER:
        if ( wrmsr_safe(MSR_SHADOW_GS_BASE, base, base>>32) )
            ret = -EFAULT;
        else
            v->arch.guest_context.gs_base_user = base;
        break;

    case SEGBASE_GS_KERNEL:
        if ( wrmsr_safe(MSR_GS_BASE, base, base>>32) )
            ret = -EFAULT;
        else
            v->arch.guest_context.gs_base_kernel = base;
        break;

    case SEGBASE_GS_USER_SEL:
        __asm__ __volatile__ (
            "    swapgs               \n"
            "1:  movl %k0,%%gs        \n"
            "    "safe_swapgs"        \n"
            ".section .fixup,\"ax\"   \n"
            "2:  xorl %k0,%k0         \n"
            "    jmp 1b               \n"
            ".previous                \n"
            ".section __ex_table,\"a\"\n"
            "    .align 8             \n"
            "    .quad 1b,2b          \n"
            ".previous                "
            : : "r" (base&0xffff) );
        break;

    default:
        ret = -EINVAL;
        break;
    }

    return ret;
}


/* Returns TRUE if given descriptor is valid for GDT or LDT. */
int check_descriptor(const struct domain *dom, struct desc_struct *d)
{
    u32 a = d->a, b = d->b;
    u16 cs;
    unsigned int dpl;

    /* A not-present descriptor will always fault, so is safe. */
    if ( !(b & _SEGMENT_P) )
        goto good;

    /* Check and fix up the DPL. */
    dpl = (b >> 13) & 3;
    __fixup_guest_selector(dom, dpl);
    b = (b & ~_SEGMENT_DPL) | (dpl << 13);

    /* All code and data segments are okay. No base/limit checking. */
    if ( (b & _SEGMENT_S) )
    {
        if ( is_pv_32bit_domain(dom) )
        {
            unsigned long base, limit;

            if ( b & _SEGMENT_L )
                goto bad;

            /*
             * Older PAE Linux guests use segments which are limited to
             * 0xf6800000. Extend these to allow access to the larger
             * read-only M2P table available in 32on64 mode.
             */
            base = (b & (0xff << 24)) | ((b & 0xff) << 16) | (a >> 16);

            limit = (b & 0xf0000) | (a & 0xffff);
            limit++; /* We add one because limit is inclusive. */

            if ( (b & _SEGMENT_G) )
                limit <<= 12;

            if ( (base == 0) && (limit > HYPERVISOR_COMPAT_VIRT_START(dom)) )
            {
                a |= 0x0000ffff;
                b |= 0x000f0000;
            }
        }

        goto good;
    }

    /* Invalid type 0 is harmless. It is used for 2nd half of a call gate. */
    if ( (b & _SEGMENT_TYPE) == 0x000 )
        goto good;

    /* Everything but a call gate is discarded here. */
    if ( (b & _SEGMENT_TYPE) != 0xc00 )
        goto bad;

    /* Validate the target code selector. */
    cs = a >> 16;
    if ( !guest_gate_selector_okay(dom, cs) )
        goto bad;
    /*
     * Force DPL to zero, causing a GP fault with its error code indicating
     * the gate in use, allowing emulation. This is necessary because with
     * native guests (kernel in ring 3) call gates cannot be used directly
     * to transition from user to kernel mode (and whether a gate is used
     * to enter the kernel can only be determined when the gate is being
     * used), and with compat guests call gates cannot be used at all as
     * there are only 64-bit ones.
     * Store the original DPL in the selector's RPL field.
     */
    b &= ~_SEGMENT_DPL;
    cs = (cs & ~3) | dpl;
    a = (a & 0xffffU) | (cs << 16);

    /* Reserved bits must be zero. */
    if ( b & (is_pv_32bit_domain(dom) ? 0xe0 : 0xff) )
        goto bad;

 good:
    d->a = a;
    d->b = b;
    return 1;
 bad:
    return 0;
}
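
/*
 * For 32-on-64 domains whose compat M2P window covers less than all of
 * memory, record the number of physical address bits their allocations
 * must fit within.
 */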
void domain_set_alloc_bitsize(struct domain *d)
{
    if ( !is_pv_32on64_domain(d) ||
         (MACH2PHYS_COMPAT_NR_ENTRIES(d) >= max_page) ||
         d->arch.physaddr_bitsize > 0 )
        return;
    d->arch.physaddr_bitsize =
        /* 2^n entries can be contained in guest's p2m mapping space */
        fls(MACH2PHYS_COMPAT_NR_ENTRIES(d)) - 1
        /* 2^n pages -> 2^(n+PAGE_SHIFT) bits */
        + PAGE_SHIFT;
}
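
/* Clamp an allocation width request to the domain's recorded limit. */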
unsigned int domain_clamp_alloc_bitsize(struct domain *d, unsigned int bits)
{
    if ( (d == NULL) || (d->arch.physaddr_bitsize == 0) )
        return bits;
    return min(d->arch.physaddr_bitsize, bits);
}

#include "compat/mm.c"

/*
 * Local variables:
 * mode: C
 * c-set-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */