
xen/arch/x86/x86_32/mm.c @ 11146:f43729552603

[HVM] Ensure the read-only M2P table is mapped without _PAGE_GLOBAL,
otherwise context switching to shadow-mode-translate guests does not
work properly (they reuse that area of virtual address space).
Signed-off-by: Keir Fraser <keir@xensource.com>
author kfraser@localhost.localdomain
date Tue Aug 15 16:21:12 2006 +0100 (2006-08-15)
parents 37f206c7405a
children 0f917d63e960
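
The change sits in the read-only M2P mapping in paging_init() below: the L2 entry is built from __PAGE_HYPERVISOR, the base flag set, rather than PAGE_HYPERVISOR, which gains _PAGE_GLOBAL once PGE is enabled earlier in the same function. Global TLB entries survive a CR3 reload, so a global mapping of a range that shadow-mode-translate guests remap per-domain would leave stale translations behind across a context switch. A minimal sketch of the distinction (illustrative only; the flag names are from asm/page.h):

    unsigned int xen_flags    = PAGE_HYPERVISOR | _PAGE_PSE;       /* may include _PAGE_GLOBAL */
    unsigned int ro_m2p_flags = (__PAGE_HYPERVISOR | _PAGE_PSE)
                                & ~_PAGE_RW;                       /* read-only, never global  */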
/******************************************************************************
 * arch/x86/x86_32/mm.c
 *
 * Modifications to Linux original are copyright (c) 2004, K A Fraser
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

#include <xen/config.h>
#include <xen/lib.h>
#include <xen/init.h>
#include <xen/mm.h>
#include <xen/sched.h>
#include <xen/guest_access.h>
#include <asm/current.h>
#include <asm/page.h>
#include <asm/flushtlb.h>
#include <asm/fixmap.h>
#include <public/memory.h>

unsigned int PAGE_HYPERVISOR         = __PAGE_HYPERVISOR;
unsigned int PAGE_HYPERVISOR_NOCACHE = __PAGE_HYPERVISOR_NOCACHE;

static unsigned long mpt_size;

struct page_info *alloc_xen_pagetable(void)
{
    extern int early_boot;
    extern unsigned long xenheap_phys_start;
    struct page_info *pg;

    if ( !early_boot )
    {
        void *v = alloc_xenheap_page();
        return ((v == NULL) ? NULL : virt_to_page(v));
    }

    /* Before the heap allocator is up, carve pages off the Xen heap start. */
    pg = maddr_to_page(xenheap_phys_start);
    xenheap_phys_start += PAGE_SIZE;
    return pg;
}

void free_xen_pagetable(struct page_info *pg)
{
    free_xenheap_page(page_to_virt(pg));
}

l2_pgentry_t *virt_to_xen_l2e(unsigned long v)
{
    return &idle_pg_table_l2[l2_linear_offset(v)];
}

void __init paging_init(void)
{
    void *ioremap_pt;
    unsigned long v;
    struct page_info *pg;
    int i;

#ifdef CONFIG_X86_PAE
    printk("PAE enabled, limit: %d GB\n", MACHPHYS_MBYTES);
#else
    printk("PAE disabled.\n");
#endif

    idle_vcpu[0]->arch.monitor_table =
        pagetable_from_paddr(__pa(idle_pg_table));

    if ( cpu_has_pge )
    {
        /* Suitable Xen mapping can be GLOBAL. */
        set_in_cr4(X86_CR4_PGE);
        PAGE_HYPERVISOR |= _PAGE_GLOBAL;
        PAGE_HYPERVISOR_NOCACHE |= _PAGE_GLOBAL;
        /* Transform early mappings (e.g., the frametable). */
        for ( v = HYPERVISOR_VIRT_START; v; v += (1 << L2_PAGETABLE_SHIFT) )
            if ( (l2e_get_flags(idle_pg_table_l2[l2_linear_offset(v)]) &
                  (_PAGE_PSE|_PAGE_PRESENT)) == (_PAGE_PSE|_PAGE_PRESENT) )
                l2e_add_flags(idle_pg_table_l2[l2_linear_offset(v)],
                              _PAGE_GLOBAL);
    }

    /*
     * Allocate and map the machine-to-phys table and create read-only
     * mapping of MPT for guest-OS use.
     */
    mpt_size  = (max_page * BYTES_PER_LONG) + (1UL << L2_PAGETABLE_SHIFT) - 1;
    mpt_size &= ~((1UL << L2_PAGETABLE_SHIFT) - 1UL);
    for ( i = 0; i < (mpt_size >> L2_PAGETABLE_SHIFT); i++ )
    {
        if ( (pg = alloc_domheap_pages(NULL, PAGETABLE_ORDER, 0)) == NULL )
            panic("Not enough memory to bootstrap Xen.\n");
        idle_pg_table_l2[l2_linear_offset(RDWR_MPT_VIRT_START) + i] =
            l2e_from_page(pg, PAGE_HYPERVISOR | _PAGE_PSE);
        /* NB. Cannot be GLOBAL as shadow_mode_translate reuses this area. */
        idle_pg_table_l2[l2_linear_offset(RO_MPT_VIRT_START) + i] =
            l2e_from_page(pg, (__PAGE_HYPERVISOR | _PAGE_PSE) & ~_PAGE_RW);
    }

    /* Fill with an obvious debug pattern. */
    for ( i = 0; i < (mpt_size / BYTES_PER_LONG); i++ )
        set_gpfn_from_mfn(i, 0x55555555);

    /* Create page tables for ioremap(). */
    for ( i = 0; i < (IOREMAP_MBYTES >> (L2_PAGETABLE_SHIFT - 20)); i++ )
    {
        ioremap_pt = alloc_xenheap_page();
        clear_page(ioremap_pt);
        idle_pg_table_l2[l2_linear_offset(IOREMAP_VIRT_START) + i] =
            l2e_from_page(virt_to_page(ioremap_pt), __PAGE_HYPERVISOR);
    }

    /* Install per-domain mappings for idle domain. */
    for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
        idle_pg_table_l2[l2_linear_offset(PERDOMAIN_VIRT_START) + i] =
            l2e_from_page(virt_to_page(idle_vcpu[0]->domain->
                                       arch.mm_perdomain_pt) + i,
                          __PAGE_HYPERVISOR);
}
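
/*
 * Annotation (not part of the original file): once paging_init() has built
 * the table, an MFN-to-GPFN lookup is a read from the array of unsigned
 * longs mapped at RDWR_MPT_VIRT_START. A minimal sketch, assuming the
 * get_gpfn_from_mfn() accessor that Xen defines for this purpose; the
 * wrapper name below is hypothetical:
 */
static inline unsigned long example_mfn_to_gpfn(unsigned long mfn)
{
    /* Entries not yet set via set_gpfn_from_mfn() read as 0x55555555. */
    return get_gpfn_from_mfn(mfn);
}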

/* Remove the low-memory identity mappings that were used during boot. */
void __init zap_low_mappings(l2_pgentry_t *base)
{
    int i;
    u32 addr;

    for (i = 0; ; i++) {
        addr = (i << L2_PAGETABLE_SHIFT);
        if (addr >= HYPERVISOR_VIRT_START)
            break;
        /* Only zap entries that still identity-map their address. */
        if (l2e_get_paddr(base[i]) != addr)
            continue;
        base[i] = l2e_empty();
    }
    flush_tlb_all_pge();
}

void subarch_init_memory(void)
{
    unsigned long m2p_start_mfn;
    unsigned int i, j;

    /*
     * We are rather picky about the layout of 'struct page_info'. The
     * count_info and domain fields must be adjacent, as we perform atomic
     * 64-bit operations on them. Also, just for sanity, we assert the size
     * of the structure here.
     */
    BUILD_BUG_ON(offsetof(struct page_info, u.inuse._domain) !=
                 (offsetof(struct page_info, count_info) + sizeof(u32)));
    BUILD_BUG_ON((offsetof(struct page_info, count_info) & 7) != 0);
    BUILD_BUG_ON(sizeof(struct page_info) != 24);

    /* M2P table is mappable read-only by privileged domains. */
    for ( i = 0; i < (mpt_size >> L2_PAGETABLE_SHIFT); i++ )
    {
        m2p_start_mfn = l2e_get_pfn(
            idle_pg_table_l2[l2_linear_offset(RDWR_MPT_VIRT_START) + i]);
        for ( j = 0; j < L2_PAGETABLE_ENTRIES; j++ )
        {
            struct page_info *page = mfn_to_page(m2p_start_mfn + j);
            share_xen_page_with_privileged_guests(page, XENSHARE_readonly);
        }
    }

    if ( supervisor_mode_kernel )
    {
        /* Guest kernel runs in ring 0, not ring 1. */
        struct desc_struct *d;
        d = &gdt_table[(FLAT_RING1_CS >> 3) - FIRST_RESERVED_GDT_ENTRY];
        d[0].b &= ~_SEGMENT_DPL;
        d[1].b &= ~_SEGMENT_DPL;
    }
}
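
/*
 * Annotation (not part of the original file): the BUILD_BUG_ONs above exist
 * so that count_info and u.inuse._domain form one aligned 64-bit quantity
 * that reference-counting code can snapshot and update atomically (via
 * cmpxchg8b on x86_32). A minimal sketch of that combined view; the helper
 * name is hypothetical:
 */
static inline u64 example_count_and_owner(struct page_info *page)
{
    /* Low 32 bits: count_info. High 32 bits: u.inuse._domain. */
    return *(volatile u64 *)&page->count_info;
}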

long subarch_memory_op(int op, XEN_GUEST_HANDLE(void) arg)
{
    struct xen_machphys_mfn_list xmml;
    unsigned long mfn;
    unsigned int i, max;
    long rc = 0;

    switch ( op )
    {
    case XENMEM_machphys_mfn_list:
        if ( copy_from_guest(&xmml, arg, 1) )
            return -EFAULT;

        max = min_t(unsigned int, xmml.max_extents, mpt_size >> 21);

        for ( i = 0; i < max; i++ )
        {
            mfn = l2e_get_pfn(idle_pg_table_l2[l2_linear_offset(
                RDWR_MPT_VIRT_START + (i << 21))]) + l1_table_offset(i << 21);
            if ( copy_to_guest_offset(xmml.extent_start, i, &mfn, 1) )
                return -EFAULT;
        }

        xmml.nr_extents = i;
        if ( copy_to_guest(arg, &xmml, 1) )
            return -EFAULT;

        break;

    default:
        rc = -ENOSYS;
        break;
    }

    return rc;
}
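
/*
 * Annotation (not part of the original file): a privileged guest reaches the
 * op above through HYPERVISOR_memory_op(). A minimal guest-side sketch using
 * the public interface from public/memory.h; the buffer size is an arbitrary
 * example and this code belongs in a guest kernel, not in this file:
 */
static void example_fetch_m2p_mfns(void)
{
    unsigned long mfns[16]; /* arbitrary capacity */
    struct xen_machphys_mfn_list xmml = { .max_extents = 16 };

    set_xen_guest_handle(xmml.extent_start, mfns);
    if ( HYPERVISOR_memory_op(XENMEM_machphys_mfn_list, &xmml) == 0 )
    {
        /* xmml.nr_extents entries of mfns[] are now valid; each one backs
         * 2MB (1 << 21 bytes) of the read-only M2P table. */
    }
}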

long do_stack_switch(unsigned long ss, unsigned long esp)
{
    int nr = smp_processor_id();
    struct tss_struct *t = &init_tss[nr];

    fixup_guest_stack_selector(ss);

    current->arch.guest_context.kernel_ss = ss;
    current->arch.guest_context.kernel_sp = esp;
    t->ss1  = ss;
    t->esp1 = esp;

    return 0;
}
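
/*
 * Annotation (not part of the original file): the guest-side counterpart is
 * the stack_switch hypercall, issued by a PV kernel whenever the stack Xen
 * should switch to on a ring transition changes (typically on every task
 * switch). A minimal guest-side sketch; FLAT_KERNEL_SS is the flat ring-1
 * data selector from the public x86_32 ABI:
 */
static void example_register_kernel_stack(unsigned long esp0)
{
    HYPERVISOR_stack_switch(FLAT_KERNEL_SS, esp0);
}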

/* Returns TRUE if given descriptor is valid for GDT or LDT. */
int check_descriptor(struct desc_struct *d)
{
    unsigned long base, limit;
    u32 a = d->a, b = d->b;
    u16 cs;

    /* Let a ring0 guest kernel set any descriptor it wants to. */
    if ( supervisor_mode_kernel )
        return 1;

    /* A not-present descriptor will always fault, so is safe. */
    if ( !(b & _SEGMENT_P) )
        goto good;

    /*
     * We don't allow a DPL of zero. There is no legitimate reason for
     * specifying DPL==0, and it gets rather dangerous if we also accept call
     * gates (consider a call gate pointing at another kernel descriptor with
     * DPL 0 -- this would get the OS ring-0 privileges).
     */
    if ( (b & _SEGMENT_DPL) < (GUEST_KERNEL_RPL << 13) )
        d->b = b = (b & ~_SEGMENT_DPL) | (GUEST_KERNEL_RPL << 13);

    if ( !(b & _SEGMENT_S) )
    {
        /*
         * System segment:
         *  1. Don't allow interrupt or trap gates as they belong in the IDT.
         *  2. Don't allow TSS descriptors or task gates as we don't
         *     virtualise x86 tasks.
         *  3. Don't allow LDT descriptors because they're unnecessary and
         *     I'm uneasy about allowing an LDT page to contain LDT
         *     descriptors. In any case, Xen automatically creates the
         *     required descriptor when reloading the LDT register.
         *  4. We allow call gates but they must not jump to a private
         *     segment.
         */

        /* Disallow everything but call gates. */
        if ( (b & _SEGMENT_TYPE) != 0xc00 )
            goto bad;

        /* Validate and fix up the target code selector. */
        cs = a >> 16;
        fixup_guest_code_selector(cs);
        if ( !guest_gate_selector_okay(cs) )
            goto bad;
        a = d->a = (d->a & 0xffffU) | (cs << 16);

        /* Reserved bits must be zero. */
        if ( (b & 0xe0) != 0 )
            goto bad;

        /* No base/limit check is needed for a call gate. */
        goto good;
    }

    /* Check that base is at least a page away from Xen-private area. */
    base = (b & (0xff << 24)) | ((b & 0xff) << 16) | (a >> 16);
    if ( base >= (GUEST_SEGMENT_MAX_ADDR - PAGE_SIZE) )
        goto bad;

    /* Check and truncate the limit if necessary. */
    limit = (b & 0xf0000) | (a & 0xffff);
    limit++; /* We add one because limit is inclusive. */
    if ( (b & _SEGMENT_G) )
        limit <<= 12;

    if ( (b & (_SEGMENT_CODE | _SEGMENT_EC)) == _SEGMENT_EC )
    {
        /*
         * DATA, GROWS-DOWN.
         * Grows-down limit check.
         * NB. limit == 0xFFFFF provides no access      (if G=1).
         *     limit == 0x00000 provides 4GB-4kB access (if G=1).
         */
        if ( (base + limit) > base )
        {
            limit = -(base & PAGE_MASK);
            goto truncate;
        }
    }
    else
    {
        /*
         * DATA, GROWS-UP.
         * CODE (CONFORMING AND NON-CONFORMING).
         * Grows-up limit check.
         * NB. limit == 0xFFFFF provides 4GB access (if G=1).
         *     limit == 0x00000 provides 4kB access (if G=1).
         */
        if ( ((base + limit) <= base) ||
             ((base + limit) > GUEST_SEGMENT_MAX_ADDR) )
        {
            limit = GUEST_SEGMENT_MAX_ADDR - base;
        truncate:
            if ( !(b & _SEGMENT_G) )
                goto bad; /* too dangerous; too hard to work out... */
            limit = (limit >> 12) - 1;
            d->a &= ~0x0ffff; d->a |= limit & 0x0ffff;
            d->b &= ~0xf0000; d->b |= limit & 0xf0000;
        }
    }

 good:
    return 1;
 bad:
    return 0;
}
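
/*
 * Annotation (not part of the original file): a worked example of the checks
 * above. The flat ring-1 data descriptor a PV guest typically installs
 * (base 0, limit 0xfffff, G=1, DPL=1, type = data read/write) is accepted,
 * but only after the grows-up path truncates its limit so the segment stops
 * short of the Xen-private area:
 */
static void example_check_flat_data_descriptor(void)
{
    struct desc_struct d = {
        .a = 0x0000ffff, /* limit[15:0] = 0xffff, base[15:0] = 0          */
        .b = 0x00cfb200, /* base 0, limit[19:16] = 0xf, G=1, D/B=1,
                          * P=1, DPL=1, S=1, type = 2 (data, read/write)  */
    };
    BUG_ON(!check_descriptor(&d)); /* good, with d.a/d.b limit rewritten */
}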

/*
 * Local variables:
 * mode: C
 * c-set-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */