ia64/xen-unstable: xen/arch/x86/x86_64/mm.c @ 15425:79b180596baf

x86: introduce specialized clear_page()

More than doubles the performance of page clearing on reasonably
recent processors (those with SSE2 support).

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Keir Fraser <keir@xensource.com>
author kfraser@localhost.localdomain
date Wed Jun 20 16:18:03 2007 +0100 (2007-06-20)
parents 03a13457d993
children ecb89c6ce615
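The specialized clear_page() itself is introduced elsewhere in this changeset; mm.c below only calls it. For illustration, here is a minimal, hypothetical sketch of the technique the description refers to: zeroing a page with SSE2 non-temporal stores (movnti), which bypass the cache, followed by an sfence. The names clear_page_sse2_sketch, clear_page_sketch, the have_sse2 flag and the open-coded SKETCH_PAGE_SIZE are illustrative only and are not the changeset's actual code; the memset() call merely stands in for the non-SSE2 fallback path.

#include <string.h>           /* memset() fallback for the non-SSE2 path */

#define SKETCH_PAGE_SIZE 4096 /* illustrative; real code would use PAGE_SIZE */

/* Zero one 4kB page using SSE2 non-temporal stores (sketch only). */
static void clear_page_sse2_sketch(void *page)
{
    unsigned long *p = page;
    unsigned int i;

    for ( i = 0; i < SKETCH_PAGE_SIZE / sizeof(*p); i += 4 )
    {
        /* Four 8-byte movnti stores per iteration: 32 bytes, cache-bypassing. */
        __asm__ __volatile__ ( "movnti %1,%0" : "=m" (p[i+0]) : "r" (0UL) );
        __asm__ __volatile__ ( "movnti %1,%0" : "=m" (p[i+1]) : "r" (0UL) );
        __asm__ __volatile__ ( "movnti %1,%0" : "=m" (p[i+2]) : "r" (0UL) );
        __asm__ __volatile__ ( "movnti %1,%0" : "=m" (p[i+3]) : "r" (0UL) );
    }

    /* Order the non-temporal stores before the page is handed out. */
    __asm__ __volatile__ ( "sfence" ::: "memory" );
}

/* Hypothetical wrapper: fall back to memset() when SSE2 is unavailable. */
static void clear_page_sketch(void *page, int have_sse2)
{
    if ( have_sse2 )
        clear_page_sse2_sketch(page);
    else
        memset(page, 0, SKETCH_PAGE_SIZE);
}

Non-temporal stores avoid filling the cache with zeroes that the new owner of the page will overwrite anyway, which is where the speed-up over a cached memset() comes from.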
line source
/******************************************************************************
 * arch/x86/x86_64/mm.c
 *
 * Modifications to Linux original are copyright (c) 2004, K A Fraser.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the Free
 * Software Foundation; either version 2 of the License, or (at your option)
 * any later version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc., 59
 * Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

#include <xen/config.h>
#include <xen/lib.h>
#include <xen/init.h>
#include <xen/mm.h>
#include <xen/sched.h>
#include <xen/guest_access.h>
#include <asm/current.h>
#include <asm/asm_defns.h>
#include <asm/page.h>
#include <asm/flushtlb.h>
#include <asm/fixmap.h>
#include <asm/hypercall.h>
#include <asm/msr.h>
#include <public/memory.h>

#ifdef CONFIG_COMPAT
unsigned int m2p_compat_vstart = __HYPERVISOR_COMPAT_VIRT_START;
#endif

/* Top-level master (and idle-domain) page directory. */
l4_pgentry_t __attribute__ ((__section__ (".bss.page_aligned")))
    idle_pg_table[L4_PAGETABLE_ENTRIES];

/* Enough page directories to map bottom 4GB of the memory map. */
l3_pgentry_t __attribute__ ((__section__ (".bss.page_aligned")))
    l3_identmap[L3_PAGETABLE_ENTRIES];
l2_pgentry_t __attribute__ ((__section__ (".bss.page_aligned")))
    l2_identmap[4*L2_PAGETABLE_ENTRIES];

/* Enough page directories to map the Xen text and static data. */
l3_pgentry_t __attribute__ ((__section__ (".bss.page_aligned")))
    l3_xenmap[L3_PAGETABLE_ENTRIES];
l2_pgentry_t __attribute__ ((__section__ (".bss.page_aligned")))
    l2_xenmap[L2_PAGETABLE_ENTRIES];

void *alloc_xen_pagetable(void)
{
    extern int early_boot;
    unsigned long mfn;

    if ( !early_boot )
    {
        struct page_info *pg = alloc_domheap_page(NULL);
        BUG_ON(pg == NULL);
        return page_to_virt(pg);
    }

    /* Early pagetables must come from low 1GB of memory. */
    mfn = alloc_boot_low_pages(1, 1); /* 0x0 - 0x40000000 */
    BUG_ON(mfn == 0);
    return mfn_to_virt(mfn);
}

l2_pgentry_t *virt_to_xen_l2e(unsigned long v)
{
    l4_pgentry_t *pl4e;
    l3_pgentry_t *pl3e;
    l2_pgentry_t *pl2e;

    pl4e = &idle_pg_table[l4_table_offset(v)];
    if ( !(l4e_get_flags(*pl4e) & _PAGE_PRESENT) )
    {
        pl3e = alloc_xen_pagetable();
        clear_page(pl3e);
        l4e_write(pl4e, l4e_from_paddr(__pa(pl3e), __PAGE_HYPERVISOR));
    }

    pl3e = l4e_to_l3e(*pl4e) + l3_table_offset(v);
    if ( !(l3e_get_flags(*pl3e) & _PAGE_PRESENT) )
    {
        pl2e = alloc_xen_pagetable();
        clear_page(pl2e);
        l3e_write(pl3e, l3e_from_paddr(__pa(pl2e), __PAGE_HYPERVISOR));
    }

    pl2e = l3e_to_l2e(*pl3e) + l2_table_offset(v);
    return pl2e;
}

void __init paging_init(void)
{
    unsigned long i, mpt_size, va;
    l3_pgentry_t *l3_ro_mpt;
    l2_pgentry_t *l2_ro_mpt = NULL;
    struct page_info *l1_pg, *l2_pg;

    /* Create user-accessible L2 directory to map the MPT for guests. */
    if ( (l2_pg = alloc_domheap_page(NULL)) == NULL )
        goto nomem;
    l3_ro_mpt = page_to_virt(l2_pg);
    clear_page(l3_ro_mpt);
    l4e_write(&idle_pg_table[l4_table_offset(RO_MPT_VIRT_START)],
              l4e_from_page(l2_pg, __PAGE_HYPERVISOR | _PAGE_USER));

    /*
     * Allocate and map the machine-to-phys table.
     * This also ensures L3 is present for fixmaps.
     */
    mpt_size = (max_page * BYTES_PER_LONG) + (1UL << L2_PAGETABLE_SHIFT) - 1;
    mpt_size &= ~((1UL << L2_PAGETABLE_SHIFT) - 1UL);
    for ( i = 0; i < (mpt_size >> L2_PAGETABLE_SHIFT); i++ )
    {
        if ( (l1_pg = alloc_domheap_pages(NULL, PAGETABLE_ORDER, 0)) == NULL )
            goto nomem;
        map_pages_to_xen(
            RDWR_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT),
            page_to_mfn(l1_pg),
            1UL << PAGETABLE_ORDER,
            PAGE_HYPERVISOR);
        memset((void *)(RDWR_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT)), 0x55,
               1UL << L2_PAGETABLE_SHIFT);
        if ( !((unsigned long)l2_ro_mpt & ~PAGE_MASK) )
        {
            if ( (l2_pg = alloc_domheap_page(NULL)) == NULL )
                goto nomem;
            va = RO_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT);
            l2_ro_mpt = page_to_virt(l2_pg);
            clear_page(l2_ro_mpt);
            l3e_write(&l3_ro_mpt[l3_table_offset(va)],
                      l3e_from_page(l2_pg, __PAGE_HYPERVISOR | _PAGE_USER));
            l2_ro_mpt += l2_table_offset(va);
        }
        /* NB. Cannot be GLOBAL as shadow_mode_translate reuses this area. */
        l2e_write(l2_ro_mpt, l2e_from_page(
            l1_pg, /*_PAGE_GLOBAL|*/_PAGE_PSE|_PAGE_USER|_PAGE_PRESENT));
        l2_ro_mpt++;
    }

#ifdef CONFIG_COMPAT
    if ( !compat_disabled )
    {
        /* Create user-accessible L2 directory to map the MPT for compatibility guests. */
        BUILD_BUG_ON(l4_table_offset(RDWR_MPT_VIRT_START) !=
                     l4_table_offset(HIRO_COMPAT_MPT_VIRT_START));
        l3_ro_mpt = l4e_to_l3e(idle_pg_table[l4_table_offset(HIRO_COMPAT_MPT_VIRT_START)]);
        if ( (l2_pg = alloc_domheap_page(NULL)) == NULL )
            goto nomem;
        compat_idle_pg_table_l2 = l2_ro_mpt = page_to_virt(l2_pg);
        clear_page(l2_ro_mpt);
        l3e_write(&l3_ro_mpt[l3_table_offset(HIRO_COMPAT_MPT_VIRT_START)],
                  l3e_from_page(l2_pg, __PAGE_HYPERVISOR));
        l2_ro_mpt += l2_table_offset(HIRO_COMPAT_MPT_VIRT_START);
        /*
         * Allocate and map the compatibility mode machine-to-phys table.
         */
        mpt_size = (mpt_size >> 1) + (1UL << (L2_PAGETABLE_SHIFT - 1));
        if ( mpt_size > RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START )
            mpt_size = RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START;
        mpt_size &= ~((1UL << L2_PAGETABLE_SHIFT) - 1UL);
        if ( m2p_compat_vstart + mpt_size < MACH2PHYS_COMPAT_VIRT_END )
            m2p_compat_vstart = MACH2PHYS_COMPAT_VIRT_END - mpt_size;
        for ( i = 0; i < (mpt_size >> L2_PAGETABLE_SHIFT); i++ )
        {
            if ( (l1_pg = alloc_domheap_pages(NULL, PAGETABLE_ORDER, 0)) == NULL )
                goto nomem;
            map_pages_to_xen(
                RDWR_COMPAT_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT),
                page_to_mfn(l1_pg),
                1UL << PAGETABLE_ORDER,
                PAGE_HYPERVISOR);
            memset((void *)(RDWR_COMPAT_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT)),
                   0x55,
                   1UL << L2_PAGETABLE_SHIFT);
            /* NB. Cannot be GLOBAL as the pt entries get copied into per-VM space. */
            l2e_write(l2_ro_mpt, l2e_from_page(l1_pg, _PAGE_PSE|_PAGE_PRESENT));
            l2_ro_mpt++;
        }
    }
#endif

    /* Set up linear page table mapping. */
    l4e_write(&idle_pg_table[l4_table_offset(LINEAR_PT_VIRT_START)],
              l4e_from_paddr(__pa(idle_pg_table), __PAGE_HYPERVISOR));
    return;

 nomem:
    panic("Not enough memory for m2p table\n");
}

void __init setup_idle_pagetable(void)
{
    /* Install per-domain mappings for idle domain. */
    l4e_write(&idle_pg_table[l4_table_offset(PERDOMAIN_VIRT_START)],
              l4e_from_page(
                  virt_to_page(idle_vcpu[0]->domain->arch.mm_perdomain_l3),
                  __PAGE_HYPERVISOR));
}

void __init zap_low_mappings(void)
{
    BUG_ON(num_online_cpus() != 1);

    /* Remove aliased mapping of first 1:1 PML4 entry. */
    l4e_write(&idle_pg_table[0], l4e_empty());
    local_flush_tlb_pge();

    /* Replace with mapping of the boot trampoline only. */
    map_pages_to_xen(BOOT_TRAMPOLINE, BOOT_TRAMPOLINE >> PAGE_SHIFT,
                     0x10, __PAGE_HYPERVISOR);
}

void __init subarch_init_memory(void)
{
    unsigned long i, v, m2p_start_mfn;
    l3_pgentry_t l3e;
    l2_pgentry_t l2e;

    /*
     * We are rather picky about the layout of 'struct page_info'. The
     * count_info and domain fields must be adjacent, as we perform atomic
     * 64-bit operations on them.
     */
    BUILD_BUG_ON(offsetof(struct page_info, u.inuse._domain) !=
                 (offsetof(struct page_info, count_info) + sizeof(u32)));
    BUILD_BUG_ON((offsetof(struct page_info, count_info) & 7) != 0);
    BUILD_BUG_ON(sizeof(struct page_info) !=
                 (32 + BITS_TO_LONGS(NR_CPUS)*sizeof(long)));

    /* M2P table is mappable read-only by privileged domains. */
    for ( v = RDWR_MPT_VIRT_START;
          v != RDWR_MPT_VIRT_END;
          v += 1 << L2_PAGETABLE_SHIFT )
    {
        l3e = l4e_to_l3e(idle_pg_table[l4_table_offset(v)])[
            l3_table_offset(v)];
        if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
            continue;
        l2e = l3e_to_l2e(l3e)[l2_table_offset(v)];
        if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
            continue;
        m2p_start_mfn = l2e_get_pfn(l2e);

        for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
        {
            struct page_info *page = mfn_to_page(m2p_start_mfn + i);
            share_xen_page_with_privileged_guests(page, XENSHARE_readonly);
        }
    }
#ifdef CONFIG_COMPAT
    if ( !compat_disabled )
    {
        for ( v = RDWR_COMPAT_MPT_VIRT_START;
              v != RDWR_COMPAT_MPT_VIRT_END;
              v += 1 << L2_PAGETABLE_SHIFT )
        {
            l3e = l4e_to_l3e(idle_pg_table[l4_table_offset(v)])[
                l3_table_offset(v)];
            if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
                continue;
            l2e = l3e_to_l2e(l3e)[l2_table_offset(v)];
            if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
                continue;
            m2p_start_mfn = l2e_get_pfn(l2e);

            for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
            {
                struct page_info *page = mfn_to_page(m2p_start_mfn + i);
                share_xen_page_with_privileged_guests(page, XENSHARE_readonly);
            }
        }
    }
#endif
}

long subarch_memory_op(int op, XEN_GUEST_HANDLE(void) arg)
{
    struct xen_machphys_mfn_list xmml;
    l3_pgentry_t l3e;
    l2_pgentry_t l2e;
    unsigned long v;
    xen_pfn_t mfn;
    unsigned int i;
    long rc = 0;

    switch ( op )
    {
    case XENMEM_machphys_mfn_list:
        if ( copy_from_guest(&xmml, arg, 1) )
            return -EFAULT;

        for ( i = 0, v = RDWR_MPT_VIRT_START;
              (i != xmml.max_extents) && (v != RDWR_MPT_VIRT_END);
              i++, v += 1 << 21 )
        {
            l3e = l4e_to_l3e(idle_pg_table[l4_table_offset(v)])[
                l3_table_offset(v)];
            if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
                break;
            l2e = l3e_to_l2e(l3e)[l2_table_offset(v)];
            if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
                break;
            mfn = l2e_get_pfn(l2e) + l1_table_offset(v);
            if ( copy_to_guest_offset(xmml.extent_start, i, &mfn, 1) )
                return -EFAULT;
        }

        xmml.nr_extents = i;
        if ( copy_to_guest(arg, &xmml, 1) )
            return -EFAULT;

        break;

    default:
        rc = -ENOSYS;
        break;
    }

    return rc;
}

long do_stack_switch(unsigned long ss, unsigned long esp)
{
    fixup_guest_stack_selector(current->domain, ss);
    current->arch.guest_context.kernel_ss = ss;
    current->arch.guest_context.kernel_sp = esp;
    return 0;
}

long do_set_segment_base(unsigned int which, unsigned long base)
{
    struct vcpu *v = current;
    long ret = 0;

    switch ( which )
    {
    case SEGBASE_FS:
        if ( wrmsr_safe(MSR_FS_BASE, base, base>>32) )
            ret = -EFAULT;
        else
            v->arch.guest_context.fs_base = base;
        break;

    case SEGBASE_GS_USER:
        if ( wrmsr_safe(MSR_SHADOW_GS_BASE, base, base>>32) )
            ret = -EFAULT;
        else
            v->arch.guest_context.gs_base_user = base;
        break;

    case SEGBASE_GS_KERNEL:
        if ( wrmsr_safe(MSR_GS_BASE, base, base>>32) )
            ret = -EFAULT;
        else
            v->arch.guest_context.gs_base_kernel = base;
        break;

    case SEGBASE_GS_USER_SEL:
        __asm__ __volatile__ (
            "     swapgs              \n"
            "1:   movl %k0,%%gs       \n"
            "    "safe_swapgs"        \n"
            ".section .fixup,\"ax\"   \n"
            "2:   xorl %k0,%k0        \n"
            "     jmp  1b             \n"
            ".previous                \n"
            ".section __ex_table,\"a\"\n"
            "    .align 8             \n"
            "    .quad 1b,2b          \n"
            ".previous                "
            : : "r" (base&0xffff) );
        break;

    default:
        ret = -EINVAL;
        break;
    }

    return ret;
}

/* Returns TRUE if given descriptor is valid for GDT or LDT. */
int check_descriptor(const struct domain *dom, struct desc_struct *d)
{
    u32 a = d->a, b = d->b;
    u16 cs;

    /* A not-present descriptor will always fault, so is safe. */
    if ( !(b & _SEGMENT_P) )
        goto good;

    /* Check and fix up the DPL. */
    if ( (b & _SEGMENT_DPL) < (GUEST_KERNEL_RPL(dom) << 13) )
        d->b = b = (b & ~_SEGMENT_DPL) | (GUEST_KERNEL_RPL(dom) << 13);

    /* All code and data segments are okay. No base/limit checking. */
    if ( (b & _SEGMENT_S) )
    {
        if ( is_pv_32bit_domain(dom) && (b & _SEGMENT_L) )
            goto bad;
        goto good;
    }

    /* Invalid type 0 is harmless. It is used for 2nd half of a call gate. */
    if ( (b & _SEGMENT_TYPE) == 0x000 )
        goto good;

    /* Everything but a call gate is discarded here. */
    if ( (b & _SEGMENT_TYPE) != 0xc00 )
        goto bad;

    /* Validate and fix up the target code selector. */
    cs = a >> 16;
    fixup_guest_code_selector(dom, cs);
    if ( !guest_gate_selector_okay(dom, cs) )
        goto bad;
    a = d->a = (d->a & 0xffffU) | (cs << 16);

    /* Reserved bits must be zero. */
    if ( (b & 0xe0) != 0 )
        goto bad;

 good:
    return 1;
 bad:
    return 0;
}

unsigned int domain_clamp_alloc_bitsize(struct domain *d, unsigned int bits)
{
    if ( d == NULL )
        return bits;
    return min(d->arch.physaddr_bitsize, bits);
}

#include "compat/mm.c"

/*
 * Local variables:
 * mode: C
 * c-set-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */