ia64/xen-unstable: xen/arch/x86/x86_64/mm.c @ 11146:f43729552603

[HVM] Ensure the read-only M2P table is mapped without _PAGE_GLOBAL;
otherwise context switching to shadow-mode-translate guests does not
work properly (they reuse that area of virtual address space).

Signed-off-by: Keir Fraser <keir@xensource.com>

author    kfraser@localhost.localdomain
date      Tue Aug 15 16:21:12 2006 +0100
parents   37f206c7405a
children  88e6bd5e2b54

/******************************************************************************
 * arch/x86/x86_64/mm.c
 *
 * Modifications to Linux original are copyright (c) 2004, K A Fraser
 *
 * This program is free software; you can redistribute it and/or modify it under
 * the terms of the GNU General Public License as published by the Free
 * Software Foundation; either version 2 of the License, or (at your option)
 * any later version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc., 59
 * Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

#include <xen/config.h>
#include <xen/lib.h>
#include <xen/init.h>
#include <xen/mm.h>
#include <xen/sched.h>
#include <xen/guest_access.h>
#include <asm/current.h>
#include <asm/asm_defns.h>
#include <asm/page.h>
#include <asm/flushtlb.h>
#include <asm/fixmap.h>
#include <asm/msr.h>
#include <public/memory.h>

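/*
 * Allocate a page for use in Xen's own page tables. Until the domain heap
 * is available we take pages from the boot allocator; thereafter we use
 * anonymous domheap pages.
 */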
struct page_info *alloc_xen_pagetable(void)
{
    extern int early_boot;
    unsigned long pfn;

    if ( !early_boot )
        return alloc_domheap_page(NULL);

    pfn = alloc_boot_pages(1, 1);
    return ((pfn == 0) ? NULL : mfn_to_page(pfn));
}

void free_xen_pagetable(struct page_info *pg)
{
    free_domheap_page(pg);
}

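/*
 * Return a pointer to the L2 entry mapping virtual address 'v' in the idle
 * page tables, allocating and hooking up intermediate L3/L2 tables as
 * needed.
 */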
l2_pgentry_t *virt_to_xen_l2e(unsigned long v)
{
    l4_pgentry_t *pl4e;
    l3_pgentry_t *pl3e;
    l2_pgentry_t *pl2e;

    pl4e = &idle_pg_table[l4_table_offset(v)];
    if ( !(l4e_get_flags(*pl4e) & _PAGE_PRESENT) )
    {
        pl3e = page_to_virt(alloc_xen_pagetable());
        clear_page(pl3e);
        *pl4e = l4e_from_paddr(__pa(pl3e), __PAGE_HYPERVISOR);
    }

    pl3e = l4e_to_l3e(*pl4e) + l3_table_offset(v);
    if ( !(l3e_get_flags(*pl3e) & _PAGE_PRESENT) )
    {
        pl2e = page_to_virt(alloc_xen_pagetable());
        clear_page(pl2e);
        *pl3e = l3e_from_paddr(__pa(pl2e), __PAGE_HYPERVISOR);
    }

    pl2e = l3e_to_l2e(*pl3e) + l2_table_offset(v);
    return pl2e;
}

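/*
 * Build the 64-bit Xen mappings: the idle monitor table, a user-readable
 * (read-only) view of the machine-to-phys table for guests, the writable
 * M2P mapping used by Xen itself, the linear page-table slot, and the
 * per-domain mappings for the idle domain.
 */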
void __init paging_init(void)
{
    unsigned long i, mpt_size;
    l3_pgentry_t *l3_ro_mpt;
    l2_pgentry_t *l2_ro_mpt;
    struct page_info *pg;

    idle_vcpu[0]->arch.monitor_table =
        pagetable_from_paddr(__pa(idle_pg_table));

    /* Create user-accessible L2 directory to map the MPT for guests. */
    l3_ro_mpt = alloc_xenheap_page();
    clear_page(l3_ro_mpt);
    idle_pg_table[l4_table_offset(RO_MPT_VIRT_START)] =
        l4e_from_page(
            virt_to_page(l3_ro_mpt), __PAGE_HYPERVISOR | _PAGE_USER);
    l2_ro_mpt = alloc_xenheap_page();
    clear_page(l2_ro_mpt);
    l3_ro_mpt[l3_table_offset(RO_MPT_VIRT_START)] =
        l3e_from_page(
            virt_to_page(l2_ro_mpt), __PAGE_HYPERVISOR | _PAGE_USER);
    l2_ro_mpt += l2_table_offset(RO_MPT_VIRT_START);

    /*
     * Allocate and map the machine-to-phys table.
     * This also ensures L3 is present for fixmaps.
     */
    mpt_size  = (max_page * BYTES_PER_LONG) + (1UL << L2_PAGETABLE_SHIFT) - 1;
    mpt_size &= ~((1UL << L2_PAGETABLE_SHIFT) - 1UL);
    for ( i = 0; i < (mpt_size >> L2_PAGETABLE_SHIFT); i++ )
    {
        if ( (pg = alloc_domheap_pages(NULL, PAGETABLE_ORDER, 0)) == NULL )
            panic("Not enough memory for m2p table\n");
        map_pages_to_xen(
            RDWR_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT), page_to_mfn(pg),
            1UL << PAGETABLE_ORDER,
            PAGE_HYPERVISOR);
        memset((void *)(RDWR_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT)), 0x55,
               1UL << L2_PAGETABLE_SHIFT);
        /* NB. Cannot be GLOBAL as shadow_mode_translate reuses this area. */
        *l2_ro_mpt++ = l2e_from_page(
            pg, /*_PAGE_GLOBAL|*/_PAGE_PSE|_PAGE_USER|_PAGE_PRESENT);
        BUG_ON(((unsigned long)l2_ro_mpt & ~PAGE_MASK) == 0);
    }

    /* Set up linear page table mapping. */
    idle_pg_table[l4_table_offset(LINEAR_PT_VIRT_START)] =
        l4e_from_paddr(__pa(idle_pg_table), __PAGE_HYPERVISOR);

    /* Install per-domain mappings for idle domain. */
    idle_pg_table[l4_table_offset(PERDOMAIN_VIRT_START)] =
        l4e_from_page(
            virt_to_page(idle_vcpu[0]->domain->arch.mm_perdomain_l3),
            __PAGE_HYPERVISOR);
}

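/*
 * Discard the boot-time mappings in L4 slot 0 and flush the TLB,
 * including global entries.
 */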
void __init zap_low_mappings(void)
{
    idle_pg_table[0] = l4e_empty();
    flush_tlb_all_pge();
}

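/*
 * Subarch-specific memory setup: sanity-check the 'struct page_info'
 * layout and mark the pages backing the M2P table as sharable read-only
 * with privileged guests.
 */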
void subarch_init_memory(void)
{
    unsigned long i, v, m2p_start_mfn;
    l3_pgentry_t l3e;
    l2_pgentry_t l2e;

    /*
     * We are rather picky about the layout of 'struct page_info'. The
     * count_info and domain fields must be adjacent, as we perform atomic
     * 64-bit operations on them.
     */
    BUILD_BUG_ON(offsetof(struct page_info, u.inuse._domain) !=
                 (offsetof(struct page_info, count_info) + sizeof(u32)));
    BUILD_BUG_ON((offsetof(struct page_info, count_info) & 7) != 0);
    BUILD_BUG_ON(sizeof(struct page_info) !=
                 (32 + BITS_TO_LONGS(NR_CPUS)*sizeof(long)));

    /* M2P table is mappable read-only by privileged domains. */
    for ( v  = RDWR_MPT_VIRT_START;
          v != RDWR_MPT_VIRT_END;
          v += 1 << L2_PAGETABLE_SHIFT )
    {
        l3e = l4e_to_l3e(idle_pg_table[l4_table_offset(v)])[
            l3_table_offset(v)];
        if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
            continue;
        l2e = l3e_to_l2e(l3e)[l2_table_offset(v)];
        if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
            continue;
        m2p_start_mfn = l2e_get_pfn(l2e);

        for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
        {
            struct page_info *page = mfn_to_page(m2p_start_mfn + i);
            share_xen_page_with_privileged_guests(page, XENSHARE_readonly);
        }
    }
}

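/*
 * Handle 64-bit-specific memory hypercalls. XENMEM_machphys_mfn_list
 * returns the MFNs of the 2MB superpages backing the M2P table, so that a
 * privileged guest can map the table read-only.
 */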
long subarch_memory_op(int op, XEN_GUEST_HANDLE(void) arg)
{
    struct xen_machphys_mfn_list xmml;
    l3_pgentry_t l3e;
    l2_pgentry_t l2e;
    unsigned long mfn, v;
    unsigned int i;
    long rc = 0;

    switch ( op )
    {
    case XENMEM_machphys_mfn_list:
        if ( copy_from_guest(&xmml, arg, 1) )
            return -EFAULT;

        for ( i = 0, v = RDWR_MPT_VIRT_START;
              (i != xmml.max_extents) && (v != RDWR_MPT_VIRT_END);
              i++, v += 1 << 21 )
        {
            l3e = l4e_to_l3e(idle_pg_table[l4_table_offset(v)])[
                l3_table_offset(v)];
            if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
                break;
            l2e = l3e_to_l2e(l3e)[l2_table_offset(v)];
            if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
                break;
            mfn = l2e_get_pfn(l2e) + l1_table_offset(v);
            if ( copy_to_guest_offset(xmml.extent_start, i, &mfn, 1) )
                return -EFAULT;
        }

        xmml.nr_extents = i;
        if ( copy_to_guest(arg, &xmml, 1) )
            return -EFAULT;

        break;

    default:
        rc = -ENOSYS;
        break;
    }

    return rc;
}

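/*
 * Record the guest kernel stack segment/pointer to be loaded when the
 * guest transitions from user to kernel mode. The selector is first
 * sanitised by fixup_guest_stack_selector().
 */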
long do_stack_switch(unsigned long ss, unsigned long esp)
{
    fixup_guest_stack_selector(ss);
    current->arch.guest_context.kernel_ss = ss;
    current->arch.guest_context.kernel_sp = esp;
    return 0;
}

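/*
 * Set one of the FS/GS segment base registers (or the user %gs selector)
 * for the current vcpu, using wrmsr_safe() so that a bad base value is
 * reported as -EFAULT rather than faulting in Xen.
 */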
long do_set_segment_base(unsigned int which, unsigned long base)
{
    struct vcpu *v = current;
    long ret = 0;

    switch ( which )
    {
    case SEGBASE_FS:
        if ( wrmsr_safe(MSR_FS_BASE, base, base>>32) )
            ret = -EFAULT;
        else
            v->arch.guest_context.fs_base = base;
        break;

    case SEGBASE_GS_USER:
        if ( wrmsr_safe(MSR_SHADOW_GS_BASE, base, base>>32) )
            ret = -EFAULT;
        else
            v->arch.guest_context.gs_base_user = base;
        break;

    case SEGBASE_GS_KERNEL:
        if ( wrmsr_safe(MSR_GS_BASE, base, base>>32) )
            ret = -EFAULT;
        else
            v->arch.guest_context.gs_base_kernel = base;
        break;

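    /*
     * Load the user %gs selector. The swapgs pair temporarily switches to
     * the user GS base so that the selector load updates the user context
     * rather than Xen's; the fixup path retries with a null selector if
     * the load faults.
     */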
    case SEGBASE_GS_USER_SEL:
        __asm__ __volatile__ (
            "    swapgs               \n"
            "1:  movl %k0,%%gs        \n"
            "    "safe_swapgs"        \n"
            ".section .fixup,\"ax\"   \n"
            "2:  xorl %k0,%k0         \n"
            "    jmp 1b               \n"
            ".previous                \n"
            ".section __ex_table,\"a\"\n"
            "    .align 8             \n"
            "    .quad 1b,2b          \n"
            ".previous                "
            : : "r" (base&0xffff) );
        break;

    default:
        ret = -EINVAL;
        break;
    }

    return ret;
}

/* Returns TRUE if given descriptor is valid for GDT or LDT. */
int check_descriptor(struct desc_struct *d)
{
    u32 a = d->a, b = d->b;
    u16 cs;

    /* A not-present descriptor will always fault, so is safe. */
    if ( !(b & _SEGMENT_P) )
        goto good;

    /* Check and fix up the DPL. */
    if ( (b & _SEGMENT_DPL) < (GUEST_KERNEL_RPL << 13) )
        d->b = b = (b & ~_SEGMENT_DPL) | (GUEST_KERNEL_RPL << 13);

    /* All code and data segments are okay. No base/limit checking. */
    if ( (b & _SEGMENT_S) )
        goto good;

    /* Invalid type 0 is harmless. It is used for 2nd half of a call gate. */
    if ( (b & _SEGMENT_TYPE) == 0x000 )
        goto good;

    /* Everything but a call gate is discarded here. */
    if ( (b & _SEGMENT_TYPE) != 0xc00 )
        goto bad;

    /* Validate and fix up the target code selector. */
    cs = a >> 16;
    fixup_guest_code_selector(cs);
    if ( !guest_gate_selector_okay(cs) )
        goto bad;
    a = d->a = (d->a & 0xffffU) | (cs << 16);

    /* Reserved bits must be zero. */
    if ( (b & 0xe0) != 0 )
        goto bad;

 good:
    return 1;
 bad:
    return 0;
}

/*
 * Local variables:
 * mode: C
 * c-set-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */