
view xen/arch/x86/x86_64/mm.c @ 18666:c003e5a23a4e

Clean up spinlock operations and compile as first-class functions.

This follows modern Linux, since apparently outlining spinlock
operations does not slow down execution. The cleanups will also allow
more convenient addition of diagnostic code.

Signed-off-by: Keir Fraser <keir.fraser@citrix.com>
author Keir Fraser <keir.fraser@citrix.com>
date Mon Oct 20 16:48:17 2008 +0100 (2008-10-20)
parents 7f1a36b834e1
children 8e18dd41c6c7
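
For context, "compile as first-class functions" means the lock operations become ordinary out-of-line functions that call sites branch to, instead of code expanded inline at every use. A minimal sketch of that shape, in C; the names spinlock_t, raw_spinlock_t and _raw_spin_lock below are illustrative assumptions, not quoted from this changeset:

    /* Outlined lock acquire: one copy of the locking code; every
     * spin_lock() call site becomes a plain function call, and any
     * diagnostic code need only be added here rather than at each use. */
    typedef struct spinlock {
        raw_spinlock_t raw;           /* arch-provided lock word (assumed) */
    } spinlock_t;

    void _spin_lock(spinlock_t *lock)
    {
        _raw_spin_lock(&lock->raw);   /* arch-provided raw acquire (assumed) */
    }

    #define spin_lock(l) _spin_lock(l)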
/******************************************************************************
 * arch/x86/x86_64/mm.c
 *
 * Modifications to Linux original are copyright (c) 2004, K A Fraser
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the Free
 * Software Foundation; either version 2 of the License, or (at your option)
 * any later version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc., 59
 * Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

#include <xen/config.h>
#include <xen/lib.h>
#include <xen/init.h>
#include <xen/mm.h>
#include <xen/numa.h>
#include <xen/sched.h>
#include <xen/guest_access.h>
#include <asm/current.h>
#include <asm/asm_defns.h>
#include <asm/page.h>
#include <asm/flushtlb.h>
#include <asm/fixmap.h>
#include <asm/hypercall.h>
#include <asm/msr.h>
#include <public/memory.h>

#ifdef CONFIG_COMPAT
unsigned int m2p_compat_vstart = __HYPERVISOR_COMPAT_VIRT_START;
#endif

DEFINE_PER_CPU(char, compat_arg_xlat[COMPAT_ARG_XLAT_SIZE]);

/* Top-level master (and idle-domain) page directory. */
l4_pgentry_t __attribute__ ((__section__ (".bss.page_aligned")))
    idle_pg_table[L4_PAGETABLE_ENTRIES];

/* Enough page directories to map bottom 4GB of the memory map. */
l3_pgentry_t __attribute__ ((__section__ (".bss.page_aligned")))
    l3_identmap[L3_PAGETABLE_ENTRIES];
l2_pgentry_t __attribute__ ((__section__ (".bss.page_aligned")))
    l2_identmap[4*L2_PAGETABLE_ENTRIES];

/* Enough page directories to map the Xen text and static data. */
l3_pgentry_t __attribute__ ((__section__ (".bss.page_aligned")))
    l3_xenmap[L3_PAGETABLE_ENTRIES];
l2_pgentry_t __attribute__ ((__section__ (".bss.page_aligned")))
    l2_xenmap[L2_PAGETABLE_ENTRIES];
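
/*
 * Allocate one page for Xen page-table use: from the boot allocator while
 * early_boot is still set, from the domain heap once it is available.
 */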
void *alloc_xen_pagetable(void)
{
    extern int early_boot;
    unsigned long mfn;

    if ( !early_boot )
    {
        struct page_info *pg = alloc_domheap_page(NULL, 0);
        BUG_ON(pg == NULL);
        return page_to_virt(pg);
    }

    mfn = alloc_boot_pages(1, 1);
    BUG_ON(mfn == 0);
    return mfn_to_virt(mfn);
}
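
/*
 * Return a pointer to the L3 entry mapping Xen virtual address 'v' in the
 * idle page table, allocating and installing an L3 table if the L4 entry
 * is not yet present.
 */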
l3_pgentry_t *virt_to_xen_l3e(unsigned long v)
{
    l4_pgentry_t *pl4e;

    pl4e = &idle_pg_table[l4_table_offset(v)];
    if ( !(l4e_get_flags(*pl4e) & _PAGE_PRESENT) )
    {
        l3_pgentry_t *pl3e = alloc_xen_pagetable();
        clear_page(pl3e);
        l4e_write(pl4e, l4e_from_paddr(__pa(pl3e), __PAGE_HYPERVISOR));
    }

    return l4e_to_l3e(*pl4e) + l3_table_offset(v);
}
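
/*
 * As above, but one level down: return a pointer to the L2 entry for 'v',
 * allocating an L2 table on demand.  A superpage (PSE) mapping at L3 is not
 * expected here and triggers a BUG.
 */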
l2_pgentry_t *virt_to_xen_l2e(unsigned long v)
{
    l3_pgentry_t *pl3e;

    pl3e = virt_to_xen_l3e(v);
    if ( !(l3e_get_flags(*pl3e) & _PAGE_PRESENT) )
    {
        l2_pgentry_t *pl2e = alloc_xen_pagetable();
        clear_page(pl2e);
        l3e_write(pl3e, l3e_from_paddr(__pa(pl2e), __PAGE_HYPERVISOR));
    }

    BUG_ON(l3e_get_flags(*pl3e) & _PAGE_PSE);
    return l3e_to_l2e(*pl3e) + l2_table_offset(v);
}
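
/*
 * Build the machine-to-phys (M2P) table: writable mappings for Xen itself,
 * read-only aliases (native and compat) that guests may map, and finally
 * the linear page-table slot in the idle L4.
 */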
void __init paging_init(void)
{
    unsigned long i, mpt_size, va;
    l3_pgentry_t *l3_ro_mpt;
    l2_pgentry_t *l2_ro_mpt = NULL;
    struct page_info *l1_pg, *l2_pg, *l3_pg;

    /* Create user-accessible L2 directory to map the MPT for guests. */
    if ( (l3_pg = alloc_domheap_page(NULL, 0)) == NULL )
        goto nomem;
    l3_ro_mpt = page_to_virt(l3_pg);
    clear_page(l3_ro_mpt);
    l4e_write(&idle_pg_table[l4_table_offset(RO_MPT_VIRT_START)],
              l4e_from_page(l3_pg, __PAGE_HYPERVISOR | _PAGE_USER));

    /*
     * Allocate and map the machine-to-phys table.
     * This also ensures L3 is present for fixmaps.
     */
    mpt_size = (max_page * BYTES_PER_LONG) + (1UL << L2_PAGETABLE_SHIFT) - 1;
    mpt_size &= ~((1UL << L2_PAGETABLE_SHIFT) - 1UL);
    for ( i = 0; i < (mpt_size >> L2_PAGETABLE_SHIFT); i++ )
    {
        if ( (l1_pg = alloc_domheap_pages(NULL, PAGETABLE_ORDER, 0)) == NULL )
            goto nomem;
        map_pages_to_xen(
            RDWR_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT),
            page_to_mfn(l1_pg),
            1UL << PAGETABLE_ORDER,
            PAGE_HYPERVISOR);
        memset((void *)(RDWR_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT)), 0x55,
               1UL << L2_PAGETABLE_SHIFT);
        if ( !((unsigned long)l2_ro_mpt & ~PAGE_MASK) )
        {
            if ( (l2_pg = alloc_domheap_page(NULL, 0)) == NULL )
                goto nomem;
            va = RO_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT);
            l2_ro_mpt = page_to_virt(l2_pg);
            clear_page(l2_ro_mpt);
            l3e_write(&l3_ro_mpt[l3_table_offset(va)],
                      l3e_from_page(l2_pg, __PAGE_HYPERVISOR | _PAGE_USER));
            l2_ro_mpt += l2_table_offset(va);
        }
        /* NB. Cannot be GLOBAL as shadow_mode_translate reuses this area. */
        l2e_write(l2_ro_mpt, l2e_from_page(
            l1_pg, /*_PAGE_GLOBAL|*/_PAGE_PSE|_PAGE_USER|_PAGE_PRESENT));
        l2_ro_mpt++;
    }

    /* Create user-accessible L2 directory to map the MPT for compat guests. */
    BUILD_BUG_ON(l4_table_offset(RDWR_MPT_VIRT_START) !=
                 l4_table_offset(HIRO_COMPAT_MPT_VIRT_START));
    l3_ro_mpt = l4e_to_l3e(idle_pg_table[l4_table_offset(
        HIRO_COMPAT_MPT_VIRT_START)]);
    if ( (l2_pg = alloc_domheap_page(NULL, 0)) == NULL )
        goto nomem;
    compat_idle_pg_table_l2 = l2_ro_mpt = page_to_virt(l2_pg);
    clear_page(l2_ro_mpt);
    l3e_write(&l3_ro_mpt[l3_table_offset(HIRO_COMPAT_MPT_VIRT_START)],
              l3e_from_page(l2_pg, __PAGE_HYPERVISOR));
    l2_ro_mpt += l2_table_offset(HIRO_COMPAT_MPT_VIRT_START);
    /* Allocate and map the compatibility mode machine-to-phys table. */
    mpt_size = (mpt_size >> 1) + (1UL << (L2_PAGETABLE_SHIFT - 1));
    if ( mpt_size > RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START )
        mpt_size = RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START;
    mpt_size &= ~((1UL << L2_PAGETABLE_SHIFT) - 1UL);
    if ( (m2p_compat_vstart + mpt_size) < MACH2PHYS_COMPAT_VIRT_END )
        m2p_compat_vstart = MACH2PHYS_COMPAT_VIRT_END - mpt_size;
    for ( i = 0; i < (mpt_size >> L2_PAGETABLE_SHIFT); i++ )
    {
        if ( (l1_pg = alloc_domheap_pages(NULL, PAGETABLE_ORDER, 0)) == NULL )
            goto nomem;
        map_pages_to_xen(
            RDWR_COMPAT_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT),
            page_to_mfn(l1_pg),
            1UL << PAGETABLE_ORDER,
            PAGE_HYPERVISOR);
        memset((void *)(RDWR_COMPAT_MPT_VIRT_START +
                        (i << L2_PAGETABLE_SHIFT)),
               0x55,
               1UL << L2_PAGETABLE_SHIFT);
        /* NB. Cannot be GLOBAL as the ptes get copied into per-VM space. */
        l2e_write(l2_ro_mpt, l2e_from_page(l1_pg, _PAGE_PSE|_PAGE_PRESENT));
        l2_ro_mpt++;
    }

    /* Set up linear page table mapping. */
    l4e_write(&idle_pg_table[l4_table_offset(LINEAR_PT_VIRT_START)],
              l4e_from_paddr(__pa(idle_pg_table), __PAGE_HYPERVISOR));
    return;

 nomem:
    panic("Not enough memory for m2p table\n");
}

void __init setup_idle_pagetable(void)
{
    /* Install per-domain mappings for idle domain. */
    l4e_write(&idle_pg_table[l4_table_offset(PERDOMAIN_VIRT_START)],
              l4e_from_page(
                  virt_to_page(idle_vcpu[0]->domain->arch.mm_perdomain_l3),
                  __PAGE_HYPERVISOR));
}
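
/*
 * Give a vcpu its own copy of the idle L4 table, with the per-domain slot
 * pointing at its domain's mappings.  Returns the physical address of the
 * new table, or 0 if the page allocation fails.
 */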
unsigned long clone_idle_pagetable(struct vcpu *v)
{
    struct domain *d = v->domain;
    struct page_info *page = alloc_domheap_page(NULL,
                                                MEMF_node(vcpu_to_node(v)));
    l4_pgentry_t *l4_table = page_to_virt(page);

    if ( !page )
        return 0;

    copy_page(l4_table, idle_pg_table);
    l4_table[l4_table_offset(PERDOMAIN_VIRT_START)] =
        l4e_from_page(virt_to_page(d->arch.mm_perdomain_l3),
                      __PAGE_HYPERVISOR);

    return __pa(l4_table);
}

void __init zap_low_mappings(void)
{
    BUG_ON(num_online_cpus() != 1);

    /* Remove aliased mapping of first 1:1 PML4 entry. */
    l4e_write(&idle_pg_table[0], l4e_empty());
    flush_local(FLUSH_TLB_GLOBAL);

    /* Replace with mapping of the boot trampoline only. */
    map_pages_to_xen(BOOT_TRAMPOLINE, BOOT_TRAMPOLINE >> PAGE_SHIFT,
                     0x10, __PAGE_HYPERVISOR);
}
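
/*
 * Subarch-specific memory setup: sanity-check the page_info layout, then
 * mark every frame of the M2P tables (native and compat) as shareable
 * read-only with privileged guests.
 */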
void __init subarch_init_memory(void)
{
    unsigned long i, v, m2p_start_mfn;
    l3_pgentry_t l3e;
    l2_pgentry_t l2e;

    /*
     * We are rather picky about the layout of 'struct page_info'. The
     * count_info and domain fields must be adjacent, as we perform atomic
     * 64-bit operations on them.
     */
    BUILD_BUG_ON(offsetof(struct page_info, u.inuse._domain) !=
                 (offsetof(struct page_info, count_info) + sizeof(u32)));
    BUILD_BUG_ON((offsetof(struct page_info, count_info) & 7) != 0);

    /* M2P table is mappable read-only by privileged domains. */
    for ( v = RDWR_MPT_VIRT_START;
          v != RDWR_MPT_VIRT_END;
          v += 1 << L2_PAGETABLE_SHIFT )
    {
        l3e = l4e_to_l3e(idle_pg_table[l4_table_offset(v)])[
            l3_table_offset(v)];
        if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
            continue;
        l2e = l3e_to_l2e(l3e)[l2_table_offset(v)];
        if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
            continue;
        m2p_start_mfn = l2e_get_pfn(l2e);

        for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
        {
            struct page_info *page = mfn_to_page(m2p_start_mfn + i);
            share_xen_page_with_privileged_guests(page, XENSHARE_readonly);
        }
    }

    for ( v = RDWR_COMPAT_MPT_VIRT_START;
          v != RDWR_COMPAT_MPT_VIRT_END;
          v += 1 << L2_PAGETABLE_SHIFT )
    {
        l3e = l4e_to_l3e(idle_pg_table[l4_table_offset(v)])[
            l3_table_offset(v)];
        if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
            continue;
        l2e = l3e_to_l2e(l3e)[l2_table_offset(v)];
        if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
            continue;
        m2p_start_mfn = l2e_get_pfn(l2e);

        for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
        {
            struct page_info *page = mfn_to_page(m2p_start_mfn + i);
            share_xen_page_with_privileged_guests(page, XENSHARE_readonly);
        }
    }
}
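
/*
 * x86_64-specific memory hypercall subops.  XENMEM_machphys_mfn_list hands
 * the guest the list of MFNs backing the read-only M2P mapping, one per
 * 2MB superpage.
 */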
long subarch_memory_op(int op, XEN_GUEST_HANDLE(void) arg)
{
    struct xen_machphys_mfn_list xmml;
    l3_pgentry_t l3e;
    l2_pgentry_t l2e;
    unsigned long v;
    xen_pfn_t mfn;
    unsigned int i;
    long rc = 0;

    switch ( op )
    {
    case XENMEM_machphys_mfn_list:
        if ( copy_from_guest(&xmml, arg, 1) )
            return -EFAULT;

        for ( i = 0, v = RDWR_MPT_VIRT_START;
              (i != xmml.max_extents) && (v != RDWR_MPT_VIRT_END);
              i++, v += 1 << 21 )
        {
            l3e = l4e_to_l3e(idle_pg_table[l4_table_offset(v)])[
                l3_table_offset(v)];
            if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
                break;
            l2e = l3e_to_l2e(l3e)[l2_table_offset(v)];
            if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
                break;
            mfn = l2e_get_pfn(l2e) + l1_table_offset(v);
            if ( copy_to_guest_offset(xmml.extent_start, i, &mfn, 1) )
                return -EFAULT;
        }

        xmml.nr_extents = i;
        if ( copy_to_guest(arg, &xmml, 1) )
            return -EFAULT;

        break;

    default:
        rc = -ENOSYS;
        break;
    }

    return rc;
}
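
/*
 * Hypercall: record the stack selector/pointer the guest wants loaded on
 * transitions into its kernel.  The selector is validated (and fixed up)
 * before being stored.
 */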
long do_stack_switch(unsigned long ss, unsigned long esp)
{
    fixup_guest_stack_selector(current->domain, ss);
    current->arch.guest_context.kernel_ss = ss;
    current->arch.guest_context.kernel_sp = esp;
    return 0;
}
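
/*
 * Hypercall: set the FS, user GS or kernel GS base via the corresponding
 * MSR, or load a new user GS selector (SEGBASE_GS_USER_SEL), for the
 * current vcpu.
 */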
long do_set_segment_base(unsigned int which, unsigned long base)
{
    struct vcpu *v = current;
    long ret = 0;

    switch ( which )
    {
    case SEGBASE_FS:
        if ( wrmsr_safe(MSR_FS_BASE, base, base>>32) )
            ret = -EFAULT;
        else
            v->arch.guest_context.fs_base = base;
        break;

    case SEGBASE_GS_USER:
        if ( wrmsr_safe(MSR_SHADOW_GS_BASE, base, base>>32) )
            ret = -EFAULT;
        else
            v->arch.guest_context.gs_base_user = base;
        break;

    case SEGBASE_GS_KERNEL:
        if ( wrmsr_safe(MSR_GS_BASE, base, base>>32) )
            ret = -EFAULT;
        else
            v->arch.guest_context.gs_base_kernel = base;
        break;

    case SEGBASE_GS_USER_SEL:
        __asm__ __volatile__ (
            "     swapgs              \n"
            "1:   movl %k0,%%gs       \n"
            "    "safe_swapgs"        \n"
            ".section .fixup,\"ax\"   \n"
            "2:   xorl %k0,%k0        \n"
            "     jmp  1b             \n"
            ".previous                \n"
            ".section __ex_table,\"a\"\n"
            "    .align 8             \n"
            "    .quad 1b,2b          \n"
            ".previous                  "
            : : "r" (base&0xffff) );
        break;

    default:
        ret = -EINVAL;
        break;
    }

    return ret;
}

/* Returns TRUE if given descriptor is valid for GDT or LDT. */
int check_descriptor(const struct domain *dom, struct desc_struct *d)
{
    u32 a = d->a, b = d->b;
    u16 cs;
    unsigned int dpl;

    /* A not-present descriptor will always fault, so is safe. */
    if ( !(b & _SEGMENT_P) )
        goto good;

    /* Check and fix up the DPL. */
    dpl = (b >> 13) & 3;
    __fixup_guest_selector(dom, dpl);
    b = (b & ~_SEGMENT_DPL) | (dpl << 13);

    /* All code and data segments are okay. No base/limit checking. */
    if ( (b & _SEGMENT_S) )
    {
        if ( is_pv_32bit_domain(dom) )
        {
            unsigned long base, limit;

            if ( b & _SEGMENT_L )
                goto bad;

            /*
             * Older PAE Linux guests use segments which are limited to
             * 0xf6800000. Extend these to allow access to the larger
             * read-only M2P table available in 32on64 mode.
             */
            base = (b & (0xff << 24)) | ((b & 0xff) << 16) | (a >> 16);

            limit = (b & 0xf0000) | (a & 0xffff);
            limit++; /* We add one because limit is inclusive. */

            if ( (b & _SEGMENT_G) )
                limit <<= 12;

            if ( (base == 0) && (limit > HYPERVISOR_COMPAT_VIRT_START(dom)) )
            {
                a |= 0x0000ffff;
                b |= 0x000f0000;
            }
        }

        goto good;
    }

    /* Invalid type 0 is harmless. It is used for 2nd half of a call gate. */
    if ( (b & _SEGMENT_TYPE) == 0x000 )
        goto good;

    /* Everything but a call gate is discarded here. */
    if ( (b & _SEGMENT_TYPE) != 0xc00 )
        goto bad;

    /* Validate the target code selector. */
    cs = a >> 16;
    if ( !guest_gate_selector_okay(dom, cs) )
        goto bad;
    /*
     * Force DPL to zero, causing a GP fault with its error code indicating
     * the gate in use, allowing emulation. This is necessary because with
     * native guests (kernel in ring 3) call gates cannot be used directly
     * to transition from user to kernel mode (and whether a gate is used
     * to enter the kernel can only be determined when the gate is being
     * used), and with compat guests call gates cannot be used at all as
     * there are only 64-bit ones.
     * Store the original DPL in the selector's RPL field.
     */
    b &= ~_SEGMENT_DPL;
    cs = (cs & ~3) | dpl;
    a = (a & 0xffffU) | (cs << 16);

    /* Reserved bits must be zero. */
    if ( b & (is_pv_32bit_domain(dom) ? 0xe0 : 0xff) )
        goto bad;

 good:
    d->a = a;
    d->b = b;
    return 1;
 bad:
    return 0;
}
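
/*
 * For 32-on-64 guests whose compat M2P window cannot cover all of memory,
 * record an address-width limit so that pages allocated to the guest stay
 * within the range its M2P mapping can describe.
 */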
void domain_set_alloc_bitsize(struct domain *d)
{
    if ( !is_pv_32on64_domain(d) ||
         (MACH2PHYS_COMPAT_NR_ENTRIES(d) >= max_page) ||
         d->arch.physaddr_bitsize > 0 )
        return;
    d->arch.physaddr_bitsize =
        /* 2^n entries can be contained in guest's p2m mapping space */
        fls(MACH2PHYS_COMPAT_NR_ENTRIES(d)) - 1
        /* 2^n pages -> 2^(n+PAGE_SHIFT) bits */
        + PAGE_SHIFT;
}
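
/* Clamp an allocation address width to the domain's limit, if it has one. */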
unsigned int domain_clamp_alloc_bitsize(struct domain *d, unsigned int bits)
{
    if ( (d == NULL) || (d->arch.physaddr_bitsize == 0) )
        return bits;
    return min(d->arch.physaddr_bitsize, bits);
}

#include "compat/mm.c"

/*
 * Local variables:
 * mode: C
 * c-set-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */