xen/arch/x86/x86_64/mm.c @ 14107:1e5a83fb928b (ia64/xen-unstable)

xen memory allocator: Allow per-domain bitwidth restrictions.
Original patch by Jan Beulich <jbeulich@novell.com>
Signed-off-by: Keir Fraser <keir@xensource.com>

author    Keir Fraser <keir@xensource.com>
date      Sat Feb 24 13:57:34 2007 +0000
parents   da37c365b375
children  9e5e94942045

/******************************************************************************
 * arch/x86/x86_64/mm.c
 *
 * Modifications to Linux original are copyright (c) 2004, K A Fraser
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the Free
 * Software Foundation; either version 2 of the License, or (at your option)
 * any later version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc., 59
 * Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */
#include <xen/config.h>
#include <xen/lib.h>
#include <xen/init.h>
#include <xen/mm.h>
#include <xen/sched.h>
#include <xen/guest_access.h>
#include <asm/current.h>
#include <asm/asm_defns.h>
#include <asm/page.h>
#include <asm/flushtlb.h>
#include <asm/fixmap.h>
#include <asm/hypercall.h>
#include <asm/msr.h>
#include <public/memory.h>

#ifdef CONFIG_COMPAT
unsigned int m2p_compat_vstart = __HYPERVISOR_COMPAT_VIRT_START;
#endif
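
/*
 * Allocator for Xen's own page-table pages: during early boot these must
 * come from the boot allocator (below 1GB); thereafter they are ordinary
 * domain-heap pages.
 */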
struct page_info *alloc_xen_pagetable(void)
{
    extern int early_boot;
    unsigned long pfn;

    if ( !early_boot )
        return alloc_domheap_page(NULL);

    /* Early pagetables must come from low 1GB of memory. */
    pfn = alloc_boot_low_pages(1, 1); /* 0x0 - 0x40000000 */
    return ((pfn == 0) ? NULL : mfn_to_page(pfn));
}

void free_xen_pagetable(struct page_info *pg)
{
    free_domheap_page(pg);
}
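
/*
 * Return a pointer to the Xen (idle) L2 entry that maps virtual address 'v',
 * allocating and installing any missing intermediate L3/L2 page tables with
 * alloc_xen_pagetable() on the way down.
 */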
l2_pgentry_t *virt_to_xen_l2e(unsigned long v)
{
    l4_pgentry_t *pl4e;
    l3_pgentry_t *pl3e;
    l2_pgentry_t *pl2e;

    pl4e = &idle_pg_table[l4_table_offset(v)];
    if ( !(l4e_get_flags(*pl4e) & _PAGE_PRESENT) )
    {
        pl3e = page_to_virt(alloc_xen_pagetable());
        clear_page(pl3e);
        l4e_write(pl4e, l4e_from_paddr(__pa(pl3e), __PAGE_HYPERVISOR));
    }

    pl3e = l4e_to_l3e(*pl4e) + l3_table_offset(v);
    if ( !(l3e_get_flags(*pl3e) & _PAGE_PRESENT) )
    {
        pl2e = page_to_virt(alloc_xen_pagetable());
        clear_page(pl2e);
        l3e_write(pl3e, l3e_from_paddr(__pa(pl2e), __PAGE_HYPERVISOR));
    }

    pl2e = l3e_to_l2e(*pl3e) + l2_table_offset(v);
    return pl2e;
}
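
/*
 * Build the machine-to-phys (M2P) table mappings -- read/write for Xen,
 * read-only for guests -- and the linear page-table mapping in the idle
 * page tables.
 */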
void __init paging_init(void)
{
    unsigned long i, mpt_size, va;
    l3_pgentry_t *l3_ro_mpt;
    l2_pgentry_t *l2_ro_mpt = NULL;
    struct page_info *l1_pg, *l2_pg;

    /* Create user-accessible L2 directory to map the MPT for guests. */
    if ( (l2_pg = alloc_domheap_page(NULL)) == NULL )
        goto nomem;
    l3_ro_mpt = clear_page(page_to_virt(l2_pg));
    l4e_write(&idle_pg_table[l4_table_offset(RO_MPT_VIRT_START)],
              l4e_from_page(l2_pg, __PAGE_HYPERVISOR | _PAGE_USER));

    /*
     * Allocate and map the machine-to-phys table.
     * This also ensures L3 is present for fixmaps.
     */
    mpt_size = (max_page * BYTES_PER_LONG) + (1UL << L2_PAGETABLE_SHIFT) - 1;
    mpt_size &= ~((1UL << L2_PAGETABLE_SHIFT) - 1UL);
    for ( i = 0; i < (mpt_size >> L2_PAGETABLE_SHIFT); i++ )
    {
        if ( (l1_pg = alloc_domheap_pages(NULL, PAGETABLE_ORDER, 0)) == NULL )
            goto nomem;
        map_pages_to_xen(
            RDWR_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT),
            page_to_mfn(l1_pg),
            1UL << PAGETABLE_ORDER,
            PAGE_HYPERVISOR);
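        /* Fill with 0x55 so not-yet-written M2P entries read back as clearly bogus. */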
        memset((void *)(RDWR_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT)), 0x55,
               1UL << L2_PAGETABLE_SHIFT);
        if ( !((unsigned long)l2_ro_mpt & ~PAGE_MASK) )
        {
            if ( (l2_pg = alloc_domheap_page(NULL)) == NULL )
                goto nomem;
            va = RO_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT);
            l2_ro_mpt = clear_page(page_to_virt(l2_pg));
            l3e_write(&l3_ro_mpt[l3_table_offset(va)],
                      l3e_from_page(l2_pg, __PAGE_HYPERVISOR | _PAGE_USER));
            l2_ro_mpt += l2_table_offset(va);
        }
        /* NB. Cannot be GLOBAL as shadow_mode_translate reuses this area. */
        l2e_write(l2_ro_mpt, l2e_from_page(
            l1_pg, /*_PAGE_GLOBAL|*/_PAGE_PSE|_PAGE_USER|_PAGE_PRESENT));
        l2_ro_mpt++;
    }

#ifdef CONFIG_COMPAT
    if ( !compat_disabled )
    {
        /* Create user-accessible L2 directory to map the MPT for compatibility guests. */
        BUILD_BUG_ON(l4_table_offset(RDWR_MPT_VIRT_START) !=
                     l4_table_offset(HIRO_COMPAT_MPT_VIRT_START));
        l3_ro_mpt = l4e_to_l3e(idle_pg_table[l4_table_offset(HIRO_COMPAT_MPT_VIRT_START)]);
        if ( (l2_pg = alloc_domheap_page(NULL)) == NULL )
            goto nomem;
        compat_idle_pg_table_l2 = l2_ro_mpt = clear_page(page_to_virt(l2_pg));
        l3e_write(&l3_ro_mpt[l3_table_offset(HIRO_COMPAT_MPT_VIRT_START)],
                  l3e_from_page(l2_pg, __PAGE_HYPERVISOR));
        l2_ro_mpt += l2_table_offset(HIRO_COMPAT_MPT_VIRT_START);
        /*
         * Allocate and map the compatibility mode machine-to-phys table.
         */
        mpt_size = (mpt_size >> 1) + (1UL << (L2_PAGETABLE_SHIFT - 1));
        if ( mpt_size > RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START )
            mpt_size = RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START;
        mpt_size &= ~((1UL << L2_PAGETABLE_SHIFT) - 1UL);
        if ( m2p_compat_vstart + mpt_size < MACH2PHYS_COMPAT_VIRT_END )
            m2p_compat_vstart = MACH2PHYS_COMPAT_VIRT_END - mpt_size;
        for ( i = 0; i < (mpt_size >> L2_PAGETABLE_SHIFT); i++ )
        {
            if ( (l1_pg = alloc_domheap_pages(NULL, PAGETABLE_ORDER, 0)) == NULL )
                goto nomem;
            map_pages_to_xen(
                RDWR_COMPAT_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT),
                page_to_mfn(l1_pg),
                1UL << PAGETABLE_ORDER,
                PAGE_HYPERVISOR);
            memset((void *)(RDWR_COMPAT_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT)),
                   0x55,
                   1UL << L2_PAGETABLE_SHIFT);
            /* NB. Cannot be GLOBAL as the pt entries get copied into per-VM space. */
            l2e_write(l2_ro_mpt, l2e_from_page(l1_pg, _PAGE_PSE|_PAGE_PRESENT));
            l2_ro_mpt++;
        }
    }
#endif

    /* Set up linear page table mapping. */
    l4e_write(&idle_pg_table[l4_table_offset(LINEAR_PT_VIRT_START)],
              l4e_from_paddr(__pa(idle_pg_table), __PAGE_HYPERVISOR));
    return;

 nomem:
    panic("Not enough memory for m2p table\n");
}

void __init setup_idle_pagetable(void)
{
    /* Install per-domain mappings for idle domain. */
    l4e_write(&idle_pg_table[l4_table_offset(PERDOMAIN_VIRT_START)],
              l4e_from_page(
                  virt_to_page(idle_vcpu[0]->domain->arch.mm_perdomain_l3),
                  __PAGE_HYPERVISOR));
}
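
/*
 * Drop the boot-time low-memory mapping (idle L4 slot 0) once it is no
 * longer needed, and flush global TLB entries.
 */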
void __init zap_low_mappings(void)
{
    l4e_write(&idle_pg_table[0], l4e_empty());
    flush_tlb_all_pge();
}

void subarch_init_memory(void)
{
    unsigned long i, v, m2p_start_mfn;
    l3_pgentry_t l3e;
    l2_pgentry_t l2e;

    /*
     * We are rather picky about the layout of 'struct page_info'. The
     * count_info and domain fields must be adjacent, as we perform atomic
     * 64-bit operations on them.
     */
    BUILD_BUG_ON(offsetof(struct page_info, u.inuse._domain) !=
                 (offsetof(struct page_info, count_info) + sizeof(u32)));
    BUILD_BUG_ON((offsetof(struct page_info, count_info) & 7) != 0);
    BUILD_BUG_ON(sizeof(struct page_info) !=
                 (32 + BITS_TO_LONGS(NR_CPUS)*sizeof(long)));

    /* M2P table is mappable read-only by privileged domains. */
    for ( v = RDWR_MPT_VIRT_START;
          v != RDWR_MPT_VIRT_END;
          v += 1 << L2_PAGETABLE_SHIFT )
    {
        l3e = l4e_to_l3e(idle_pg_table[l4_table_offset(v)])[
            l3_table_offset(v)];
        if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
            continue;
        l2e = l3e_to_l2e(l3e)[l2_table_offset(v)];
        if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
            continue;
        m2p_start_mfn = l2e_get_pfn(l2e);

        for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
        {
            struct page_info *page = mfn_to_page(m2p_start_mfn + i);
            share_xen_page_with_privileged_guests(page, XENSHARE_readonly);
        }
    }
#ifdef CONFIG_COMPAT
    if ( !compat_disabled )
    {
        for ( v = RDWR_COMPAT_MPT_VIRT_START;
              v != RDWR_COMPAT_MPT_VIRT_END;
              v += 1 << L2_PAGETABLE_SHIFT )
        {
            l3e = l4e_to_l3e(idle_pg_table[l4_table_offset(v)])[
                l3_table_offset(v)];
            if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
                continue;
            l2e = l3e_to_l2e(l3e)[l2_table_offset(v)];
            if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
                continue;
            m2p_start_mfn = l2e_get_pfn(l2e);

            for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
            {
                struct page_info *page = mfn_to_page(m2p_start_mfn + i);
                share_xen_page_with_privileged_guests(page, XENSHARE_readonly);
            }
        }
    }
#endif
}
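
/*
 * x86_64-specific memory hypercall handling. XENMEM_machphys_mfn_list hands
 * the caller the list of MFNs that back the machine-to-phys table mapping.
 */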
long subarch_memory_op(int op, XEN_GUEST_HANDLE(void) arg)
{
    struct xen_machphys_mfn_list xmml;
    l3_pgentry_t l3e;
    l2_pgentry_t l2e;
    unsigned long v;
    xen_pfn_t mfn;
    unsigned int i;
    long rc = 0;

    switch ( op )
    {
    case XENMEM_machphys_mfn_list:
        if ( copy_from_guest(&xmml, arg, 1) )
            return -EFAULT;

        for ( i = 0, v = RDWR_MPT_VIRT_START;
              (i != xmml.max_extents) && (v != RDWR_MPT_VIRT_END);
              i++, v += 1 << 21 )
        {
            l3e = l4e_to_l3e(idle_pg_table[l4_table_offset(v)])[
                l3_table_offset(v)];
            if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
                break;
            l2e = l3e_to_l2e(l3e)[l2_table_offset(v)];
            if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
                break;
            mfn = l2e_get_pfn(l2e) + l1_table_offset(v);
            if ( copy_to_guest_offset(xmml.extent_start, i, &mfn, 1) )
                return -EFAULT;
        }

        xmml.nr_extents = i;
        if ( copy_to_guest(arg, &xmml, 1) )
            return -EFAULT;

        break;

    default:
        rc = -ENOSYS;
        break;
    }

    return rc;
}
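
/*
 * Record the guest kernel stack (SS:ESP) to be loaded on transitions from
 * guest user mode to guest kernel mode.
 */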
long do_stack_switch(unsigned long ss, unsigned long esp)
{
    fixup_guest_stack_selector(current->domain, ss);
    current->arch.guest_context.kernel_ss = ss;
    current->arch.guest_context.kernel_sp = esp;
    return 0;
}

long do_set_segment_base(unsigned int which, unsigned long base)
{
    struct vcpu *v = current;
    long ret = 0;

    switch ( which )
    {
    case SEGBASE_FS:
        if ( wrmsr_safe(MSR_FS_BASE, base, base>>32) )
            ret = -EFAULT;
        else
            v->arch.guest_context.fs_base = base;
        break;

    case SEGBASE_GS_USER:
        if ( wrmsr_safe(MSR_SHADOW_GS_BASE, base, base>>32) )
            ret = -EFAULT;
        else
            v->arch.guest_context.gs_base_user = base;
        break;

    case SEGBASE_GS_KERNEL:
        if ( wrmsr_safe(MSR_GS_BASE, base, base>>32) )
            ret = -EFAULT;
        else
            v->arch.guest_context.gs_base_kernel = base;
        break;

    case SEGBASE_GS_USER_SEL:
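        /*
         * Load the user %gs selector while the user GS base is active
         * (swapgs in, load the selector, swapgs back out); a faulting load
         * is fixed up to load the null selector instead.
         */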
        __asm__ __volatile__ (
            "    swapgs               \n"
            "1:  movl %k0,%%gs        \n"
            "    "safe_swapgs"        \n"
            ".section .fixup,\"ax\"   \n"
            "2:  xorl %k0,%k0         \n"
            "    jmp 1b               \n"
            ".previous                \n"
            ".section __ex_table,\"a\"\n"
            "    .align 8             \n"
            "    .quad 1b,2b          \n"
            ".previous                "
            : : "r" (base&0xffff) );
        break;

    default:
        ret = -EINVAL;
        break;
    }

    return ret;
}

/* Returns TRUE if given descriptor is valid for GDT or LDT. */
int check_descriptor(const struct domain *dom, struct desc_struct *d)
{
    u32 a = d->a, b = d->b;
    u16 cs;

    /* A not-present descriptor will always fault, so is safe. */
    if ( !(b & _SEGMENT_P) )
        goto good;

    /* Check and fix up the DPL. */
    if ( (b & _SEGMENT_DPL) < (GUEST_KERNEL_RPL(dom) << 13) )
        d->b = b = (b & ~_SEGMENT_DPL) | (GUEST_KERNEL_RPL(dom) << 13);

    /* All code and data segments are okay. No base/limit checking. */
    if ( (b & _SEGMENT_S) )
    {
        if ( !IS_COMPAT(dom) || !(b & _SEGMENT_L) )
            goto good;
        goto bad;
    }

    /* Invalid type 0 is harmless. It is used for 2nd half of a call gate. */
    if ( (b & _SEGMENT_TYPE) == 0x000 )
        goto good;

    /* Everything but a call gate is discarded here. */
    if ( (b & _SEGMENT_TYPE) != 0xc00 )
        goto bad;

    /* Validate and fix up the target code selector. */
    cs = a >> 16;
    fixup_guest_code_selector(dom, cs);
    if ( !guest_gate_selector_okay(dom, cs) )
        goto bad;
    a = d->a = (d->a & 0xffffU) | (cs << 16);

    /* Reserved bits must be zero. */
    if ( (b & 0xe0) != 0 )
        goto bad;

 good:
    return 1;
 bad:
    return 0;
}
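
/*
 * Clamp an address-width request for a memory allocation to what the given
 * domain can actually address: with no domain the request is unrestricted,
 * otherwise it never exceeds the domain's physaddr_bitsize.
 */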
unsigned int domain_clamp_alloc_bitsize(struct domain *d, unsigned int bits)
{
    if ( d == NULL )
        return bits;
    return min(d->arch.physaddr_bitsize, bits);
}
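
/*
 * Illustrative sketch only (not part of this file): per the changeset
 * description, the common memory allocator consults this hook so that a
 * domain with a restricted physical address width is only handed pages it
 * can reach, roughly:
 *
 *     bits = domain_clamp_alloc_bitsize(d, bits);
 *     ...allocate only pages whose machine address fits within 'bits' bits...
 */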

#include "compat/mm.c"

/*
 * Local variables:
 * mode: C
 * c-set-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */