
xen/arch/x86/mm/shadow/multi.c @ 14175:5943a8314d69

[XEN] Make the compat-mode l4 page table look more like a page table
and remove some special-case code in the shadows.
Signed-off-by: Tim Deegan <Tim.Deegan@xensource.com>
author Tim Deegan <Tim.Deegan@xensource.com>
date Wed Feb 28 13:17:27 2007 +0000 (2007-02-28)
parents 720afbf74001
children 7b35a9682d81
1 /******************************************************************************
2 * arch/x86/mm/shadow/multi.c
3 *
4 * Simple, mostly-synchronous shadow page tables.
5 * Parts of this code are Copyright (c) 2006 by XenSource Inc.
6 * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
7 * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 */
24 #include <xen/config.h>
25 #include <xen/types.h>
26 #include <xen/mm.h>
27 #include <xen/trace.h>
28 #include <xen/sched.h>
29 #include <xen/perfc.h>
30 #include <xen/domain_page.h>
31 #include <asm/page.h>
32 #include <asm/current.h>
33 #include <asm/shadow.h>
34 #include <asm/flushtlb.h>
35 #include <asm/hvm/hvm.h>
36 #include "private.h"
37 #include "types.h"
39 /* THINGS TO DO LATER:
40 *
41 * TEARDOWN HEURISTICS
42 * Also: have a heuristic for when to destroy a previous paging-mode's
43 * shadows. When a guest is done with its start-of-day 32-bit tables
44 * and reuses the memory we want to drop those shadows. Start with
45 * shadows in a page in two modes as a hint, but beware of clever tricks
46 * like reusing a pagetable for both PAE and 64-bit during boot...
47 *
48 * PAE LINEAR MAPS
49 * Rework shadow_get_l*e() to have the option of using map_domain_page()
50 * instead of linear maps. Add appropriate unmap_l*e calls in the users.
51 * Then we can test the speed difference made by linear maps. If the
52 * map_domain_page() version is OK on PAE, we could maybe allow a lightweight
53 * l3-and-l2h-only shadow mode for PAE PV guests that would allow them
54 * to share l2h pages again.
55 *
56 * GUEST_WALK_TABLES TLB FLUSH COALESCE
57 * guest_walk_tables can do up to three remote TLB flushes as it walks to
58 * the first l1 of a new pagetable. Should coalesce the flushes to the end,
59 * and if we do flush, re-do the walk. If anything has changed, then
60 * pause all the other vcpus and do the walk *again*.
61 *
62 * WP DISABLED
63 * Consider how to implement having the WP bit of CR0 set to 0.
64 * Since we need to be able to cause write faults to pagetables, this might
65 * end up looking like not having the (guest) pagetables present at all in
66 * HVM guests...
67 *
68 * PSE disabled / PSE36
69 * We don't support any modes other than PSE enabled, PSE36 disabled.
70 * Neither of those would be hard to change, but we'd need to be able to
71 * deal with shadows made in one mode and used in another.
72 */
74 #define FETCH_TYPE_PREFETCH 1
75 #define FETCH_TYPE_DEMAND 2
76 #define FETCH_TYPE_WRITE 4
77 typedef enum {
78 ft_prefetch = FETCH_TYPE_PREFETCH,
79 ft_demand_read = FETCH_TYPE_DEMAND,
80 ft_demand_write = FETCH_TYPE_DEMAND | FETCH_TYPE_WRITE,
81 } fetch_type_t;
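/* Illustrative sketch, not part of the original file: the enum values above
 * double as flag bits, so code can classify a fetch with plain bit tests.
 * A hypothetical helper (name invented here) would look like: */
static inline int fetch_is_demand_write(fetch_type_t ft)
{
    /* ft_demand_write == (FETCH_TYPE_DEMAND | FETCH_TYPE_WRITE) */
    return (ft & FETCH_TYPE_DEMAND) && (ft & FETCH_TYPE_WRITE);
}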
83 #ifdef DEBUG_TRACE_DUMP
84 static char *fetch_type_names[] = {
85 [ft_prefetch] "prefetch",
86 [ft_demand_read] "demand read",
87 [ft_demand_write] "demand write",
88 };
89 #endif
91 /**************************************************************************/
92 /* Hash table mapping from guest pagetables to shadows
93 *
94 * Normal case: maps the mfn of a guest page to the mfn of its shadow page.
95 * FL1's: maps the *gfn* of the start of a superpage to the mfn of a
96 * shadow L1 which maps its "splinters".
97 */
99 static inline mfn_t
100 get_fl1_shadow_status(struct vcpu *v, gfn_t gfn)
101 /* Look for FL1 shadows in the hash table */
102 {
103 mfn_t smfn = shadow_hash_lookup(v, gfn_x(gfn), SH_type_fl1_shadow);
104 return smfn;
105 }
107 static inline mfn_t
108 get_shadow_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type)
109 /* Look for shadows in the hash table */
110 {
111 mfn_t smfn = shadow_hash_lookup(v, mfn_x(gmfn), shadow_type);
112 perfc_incrc(shadow_get_shadow_status);
113 return smfn;
114 }
116 static inline void
117 set_fl1_shadow_status(struct vcpu *v, gfn_t gfn, mfn_t smfn)
118 /* Put an FL1 shadow into the hash table */
119 {
120 SHADOW_PRINTK("gfn=%"SH_PRI_gfn", type=%08x, smfn=%05lx\n",
121 gfn_x(gfn), SH_type_fl1_shadow, mfn_x(smfn));
123 shadow_hash_insert(v, gfn_x(gfn), SH_type_fl1_shadow, smfn);
124 }
126 static inline void
127 set_shadow_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type, mfn_t smfn)
128 /* Put a shadow into the hash table */
129 {
130 struct domain *d = v->domain;
131 int res;
133 SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx, type=%08x, smfn=%05lx\n",
134 d->domain_id, v->vcpu_id, mfn_x(gmfn),
135 shadow_type, mfn_x(smfn));
137 #ifdef CONFIG_COMPAT
138 if ( !IS_COMPAT(d) || shadow_type != SH_type_l4_64_shadow )
139 #endif
140 {
141 res = get_page(mfn_to_page(gmfn), d);
142 ASSERT(res == 1);
143 }
145 shadow_hash_insert(v, mfn_x(gmfn), shadow_type, smfn);
146 }
148 static inline void
149 delete_fl1_shadow_status(struct vcpu *v, gfn_t gfn, mfn_t smfn)
150 /* Remove a shadow from the hash table */
151 {
152 SHADOW_PRINTK("gfn=%"SH_PRI_gfn", type=%08x, smfn=%05lx\n",
153 gfn_x(gfn), SH_type_fl1_shadow, mfn_x(smfn));
154 shadow_hash_delete(v, gfn_x(gfn), SH_type_fl1_shadow, smfn);
155 }
157 static inline void
158 delete_shadow_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type, mfn_t smfn)
159 /* Remove a shadow from the hash table */
160 {
161 SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx, type=%08x, smfn=%05lx\n",
162 v->domain->domain_id, v->vcpu_id,
163 mfn_x(gmfn), shadow_type, mfn_x(smfn));
164 shadow_hash_delete(v, mfn_x(gmfn), shadow_type, smfn);
165 #ifdef CONFIG_COMPAT
166 if ( !IS_COMPAT(v->domain) || shadow_type != SH_type_l4_64_shadow )
167 #endif
168 put_page(mfn_to_page(gmfn));
169 }
171 /**************************************************************************/
172 /* CPU feature support querying */
174 static inline int
175 guest_supports_superpages(struct vcpu *v)
176 {
177 /* The _PAGE_PSE bit must be honoured in HVM guests, whenever
178 * CR4.PSE is set or the guest is in PAE or long mode */
179 return (is_hvm_vcpu(v) && (GUEST_PAGING_LEVELS != 2
180 || (hvm_get_guest_ctrl_reg(v, 4) & X86_CR4_PSE)));
181 }
183 static inline int
184 guest_supports_nx(struct vcpu *v)
185 {
186 if ( !is_hvm_vcpu(v) )
187 return cpu_has_nx;
189 // XXX - fix this!
190 return 1;
191 }
194 /**************************************************************************/
195 /* Functions for walking the guest page tables */
198 /* Walk the guest pagetables, filling the walk_t with what we see.
199 * Takes an uninitialised walk_t. The caller must call unmap_walk()
200 * on the walk_t before discarding it or calling guest_walk_tables again.
201 * If "guest_op" is non-zero, we are serving a genuine guest memory access,
202 * and must (a) be under the shadow lock, and (b) remove write access
203 * from any guest PT pages we see, as we will be using their contents to
204 * perform shadow updates.
205 * Returns 0 for success or non-zero if the guest pagetables are malformed.
206 * N.B. Finding a not-present entry does not cause a non-zero return code. */
207 static inline int
208 guest_walk_tables(struct vcpu *v, unsigned long va, walk_t *gw, int guest_op)
209 {
210 ASSERT(!guest_op || shadow_locked_by_me(v->domain));
212 perfc_incrc(shadow_guest_walk);
213 memset(gw, 0, sizeof(*gw));
214 gw->va = va;
216 #if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
217 #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
218 /* Get l4e from the top level table */
219 gw->l4mfn = pagetable_get_mfn(v->arch.guest_table);
220 gw->l4e = (guest_l4e_t *)v->arch.paging.shadow.guest_vtable
221 + guest_l4_table_offset(va);
222 /* Walk down to the l3e */
223 if ( !(guest_l4e_get_flags(*gw->l4e) & _PAGE_PRESENT) ) return 0;
224 gw->l3mfn = vcpu_gfn_to_mfn(v, guest_l4e_get_gfn(*gw->l4e));
225 if ( !mfn_valid(gw->l3mfn) ) return 1;
226 /* This mfn is a pagetable: make sure the guest can't write to it. */
227 if ( guest_op && sh_remove_write_access(v, gw->l3mfn, 3, va) != 0 )
228 flush_tlb_mask(v->domain->domain_dirty_cpumask);
229 gw->l3e = ((guest_l3e_t *)sh_map_domain_page(gw->l3mfn))
230 + guest_l3_table_offset(va);
231 #else /* PAE only... */
232 /* Get l3e from the cache of the guest's top level table */
233 gw->l3e = (guest_l3e_t *)&v->arch.paging.shadow.gl3e[guest_l3_table_offset(va)];
234 #endif /* PAE or 64... */
235 /* Walk down to the l2e */
236 if ( !(guest_l3e_get_flags(*gw->l3e) & _PAGE_PRESENT) ) return 0;
237 gw->l2mfn = vcpu_gfn_to_mfn(v, guest_l3e_get_gfn(*gw->l3e));
238 if ( !mfn_valid(gw->l2mfn) ) return 1;
239 /* This mfn is a pagetable: make sure the guest can't write to it. */
240 if ( guest_op && sh_remove_write_access(v, gw->l2mfn, 2, va) != 0 )
241 flush_tlb_mask(v->domain->domain_dirty_cpumask);
242 gw->l2e = ((guest_l2e_t *)sh_map_domain_page(gw->l2mfn))
243 + guest_l2_table_offset(va);
244 #else /* 32-bit only... */
245 /* Get l2e from the top level table */
246 gw->l2mfn = pagetable_get_mfn(v->arch.guest_table);
247 gw->l2e = (guest_l2e_t *)v->arch.paging.shadow.guest_vtable
248 + guest_l2_table_offset(va);
249 #endif /* All levels... */
251 if ( !(guest_l2e_get_flags(*gw->l2e) & _PAGE_PRESENT) ) return 0;
252 if ( guest_supports_superpages(v) &&
253 (guest_l2e_get_flags(*gw->l2e) & _PAGE_PSE) )
254 {
255 /* Special case: this guest VA is in a PSE superpage, so there's
256 * no guest l1e. We make one up so that the propagation code
257 * can generate a shadow l1 table. Start with the gfn of the
258 * first 4k-page of the superpage. */
259 gfn_t start = guest_l2e_get_gfn(*gw->l2e);
260 /* Grant full access in the l1e, since all the guest entry's
261 * access controls are enforced in the shadow l2e. This lets
262 * us reflect l2 changes later without touching the l1s. */
263 int flags = (_PAGE_PRESENT|_PAGE_USER|_PAGE_RW|
264 _PAGE_ACCESSED|_PAGE_DIRTY);
265 /* PSE level 2 entries use bit 12 for PAT; propagate it to bit 7
266 * of the level 1 */
267 if ( (guest_l2e_get_flags(*gw->l2e) & _PAGE_PSE_PAT) )
268 flags |= _PAGE_PAT;
269 /* Increment the pfn by the right number of 4k pages.
270 * The ~0x1 is to mask out the PAT bit mentioned above. */
271 start = _gfn((gfn_x(start) & ~0x1) + guest_l1_table_offset(va));
272 gw->eff_l1e = guest_l1e_from_gfn(start, flags);
273 gw->l1e = NULL;
274 gw->l1mfn = _mfn(INVALID_MFN);
275 }
276 else
277 {
278 /* Not a superpage: carry on and find the l1e. */
279 gw->l1mfn = vcpu_gfn_to_mfn(v, guest_l2e_get_gfn(*gw->l2e));
280 if ( !mfn_valid(gw->l1mfn) ) return 1;
281 /* This mfn is a pagetable: make sure the guest can't write to it. */
282 if ( guest_op
283 && sh_remove_write_access(v, gw->l1mfn, 1, va) != 0 )
284 flush_tlb_mask(v->domain->domain_dirty_cpumask);
285 gw->l1e = ((guest_l1e_t *)sh_map_domain_page(gw->l1mfn))
286 + guest_l1_table_offset(va);
287 gw->eff_l1e = *gw->l1e;
288 }
290 return 0;
291 }
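/* Illustrative sketch, not part of the original file: the PSE special case
 * above fabricates an effective l1e for a VA inside a guest superpage.
 * Shown standalone with assumed PAE/64-bit geometry (4k pages, 512 l1
 * entries); the PSE-PAT bit lives in bit 0 of the superpage's gfn and is
 * masked out before adding the l1 index of the VA. */
#include <stdint.h>

#define SKETCH_PAGE_SHIFT  12    /* 4k pages (assumed) */
#define SKETCH_L1_ENTRIES  512   /* PAE/64-bit l1 geometry (assumed) */

static uint64_t sketch_splinter_gfn(uint64_t superpage_start_gfn, uint64_t va)
{
    uint64_t l1_index = (va >> SKETCH_PAGE_SHIFT) & (SKETCH_L1_ENTRIES - 1);
    return (superpage_start_gfn & ~(uint64_t)0x1) + l1_index;
}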
293 /* Given a walk_t, translate the gw->va into the guest's notion of the
294 * corresponding frame number. */
295 static inline gfn_t
296 guest_walk_to_gfn(walk_t *gw)
297 {
298 if ( !(guest_l1e_get_flags(gw->eff_l1e) & _PAGE_PRESENT) )
299 return _gfn(INVALID_GFN);
300 return guest_l1e_get_gfn(gw->eff_l1e);
301 }
303 /* Given a walk_t, translate the gw->va into the guest's notion of the
304 * corresponding physical address. */
305 static inline paddr_t
306 guest_walk_to_gpa(walk_t *gw)
307 {
308 if ( !(guest_l1e_get_flags(gw->eff_l1e) & _PAGE_PRESENT) )
309 return 0;
310 return guest_l1e_get_paddr(gw->eff_l1e) + (gw->va & ~PAGE_MASK);
311 }
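/* Illustrative sketch, not part of the original file: guest_walk_to_gpa()
 * above is just "frame number from the effective l1e, plus the page offset
 * of the VA". Standalone version with assumed 4k pages: */
#include <stdint.h>

static uint64_t sketch_gfn_va_to_gpa(uint64_t gfn, uint64_t va)
{
    return (gfn << 12) + (va & 0xFFFULL);   /* 4k page size assumed */
}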
314 /* Unmap (and reinitialise) a guest walk.
315 * Call this to dispose of any walk filled in by guest_walk_tables() */
316 static void unmap_walk(struct vcpu *v, walk_t *gw)
317 {
318 #if GUEST_PAGING_LEVELS >= 3
319 #if GUEST_PAGING_LEVELS >= 4
320 if ( gw->l3e != NULL ) sh_unmap_domain_page(gw->l3e);
321 #endif
322 if ( gw->l2e != NULL ) sh_unmap_domain_page(gw->l2e);
323 #endif
324 if ( gw->l1e != NULL ) sh_unmap_domain_page(gw->l1e);
325 #ifdef DEBUG
326 memset(gw, 0, sizeof(*gw));
327 #endif
328 }
331 /* Pretty-print the contents of a guest-walk */
332 static inline void print_gw(walk_t *gw)
333 {
334 SHADOW_PRINTK("GUEST WALK TO %#lx:\n", gw->va);
335 #if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
336 #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
337 SHADOW_PRINTK(" l4mfn=%" PRI_mfn "\n", mfn_x(gw->l4mfn));
338 SHADOW_PRINTK(" l4e=%p\n", gw->l4e);
339 if ( gw->l4e )
340 SHADOW_PRINTK(" *l4e=%" SH_PRI_gpte "\n", gw->l4e->l4);
341 SHADOW_PRINTK(" l3mfn=%" PRI_mfn "\n", mfn_x(gw->l3mfn));
342 #endif /* PAE or 64... */
343 SHADOW_PRINTK(" l3e=%p\n", gw->l3e);
344 if ( gw->l3e )
345 SHADOW_PRINTK(" *l3e=%" SH_PRI_gpte "\n", gw->l3e->l3);
346 #endif /* All levels... */
347 SHADOW_PRINTK(" l2mfn=%" PRI_mfn "\n", mfn_x(gw->l2mfn));
348 SHADOW_PRINTK(" l2e=%p\n", gw->l2e);
349 if ( gw->l2e )
350 SHADOW_PRINTK(" *l2e=%" SH_PRI_gpte "\n", gw->l2e->l2);
351 SHADOW_PRINTK(" l1mfn=%" PRI_mfn "\n", mfn_x(gw->l1mfn));
352 SHADOW_PRINTK(" l1e=%p\n", gw->l1e);
353 if ( gw->l1e )
354 SHADOW_PRINTK(" *l1e=%" SH_PRI_gpte "\n", gw->l1e->l1);
355 SHADOW_PRINTK(" eff_l1e=%" SH_PRI_gpte "\n", gw->eff_l1e.l1);
356 }
359 #if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES
360 /* Lightweight audit: pass all the shadows associated with this guest walk
361 * through the audit mechanisms */
362 static void sh_audit_gw(struct vcpu *v, walk_t *gw)
363 {
364 mfn_t smfn;
366 if ( !(SHADOW_AUDIT_ENABLE) )
367 return;
369 #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
370 if ( mfn_valid(gw->l4mfn)
371 && mfn_valid((smfn = get_shadow_status(v, gw->l4mfn,
372 SH_type_l4_shadow))) )
373 (void) sh_audit_l4_table(v, smfn, _mfn(INVALID_MFN));
374 if ( mfn_valid(gw->l3mfn)
375 && mfn_valid((smfn = get_shadow_status(v, gw->l3mfn,
376 SH_type_l3_shadow))) )
377 (void) sh_audit_l3_table(v, smfn, _mfn(INVALID_MFN));
378 #endif /* PAE or 64... */
379 if ( mfn_valid(gw->l2mfn) )
380 {
381 if ( mfn_valid((smfn = get_shadow_status(v, gw->l2mfn,
382 SH_type_l2_shadow))) )
383 (void) sh_audit_l2_table(v, smfn, _mfn(INVALID_MFN));
384 #if GUEST_PAGING_LEVELS == 3
385 if ( mfn_valid((smfn = get_shadow_status(v, gw->l2mfn,
386 SH_type_l2h_shadow))) )
387 (void) sh_audit_l2_table(v, smfn, _mfn(INVALID_MFN));
388 #endif
389 }
390 if ( mfn_valid(gw->l1mfn)
391 && mfn_valid((smfn = get_shadow_status(v, gw->l1mfn,
392 SH_type_l1_shadow))) )
393 (void) sh_audit_l1_table(v, smfn, _mfn(INVALID_MFN));
394 else if ( gw->l2e
395 && (guest_l2e_get_flags(*gw->l2e) & _PAGE_PSE)
396 && mfn_valid(
397 (smfn = get_fl1_shadow_status(v, guest_l2e_get_gfn(*gw->l2e)))) )
398 (void) sh_audit_fl1_table(v, smfn, _mfn(INVALID_MFN));
399 }
401 #else
402 #define sh_audit_gw(_v, _gw) do {} while(0)
403 #endif /* audit code */
407 /**************************************************************************/
408 /* Function to write to the guest tables, for propagating accessed and
409 * dirty bits from the shadow to the guest.
410 * Takes a guest mfn, a pointer to the guest entry, the level of pagetable,
411 * and an operation type. The guest entry is always passed as an l1e:
412 * since we only ever write flags, that's OK.
413 * Returns the new flag bits of the guest entry. */
415 static u32 guest_set_ad_bits(struct vcpu *v,
416 mfn_t gmfn,
417 guest_l1e_t *ep,
418 unsigned int level,
419 fetch_type_t ft)
420 {
421 u32 flags;
422 int res = 0;
424 ASSERT(ep && !(((unsigned long)ep) & ((sizeof *ep) - 1)));
425 ASSERT(level <= GUEST_PAGING_LEVELS);
426 ASSERT(shadow_locked_by_me(v->domain));
428 flags = guest_l1e_get_flags(*ep);
430 /* Only set A and D bits for guest-initiated accesses */
431 if ( !(ft & FETCH_TYPE_DEMAND) )
432 return flags;
434 ASSERT(mfn_valid(gmfn)
435 && (sh_mfn_is_a_page_table(gmfn)
436 || ((mfn_to_page(gmfn)->u.inuse.type_info & PGT_count_mask)
437 == 0)));
439 /* PAE l3s do not have A and D bits */
440 ASSERT(GUEST_PAGING_LEVELS > 3 || level != 3);
442 /* Need the D bit as well for writes, in L1es and PSE L2es. */
443 if ( ft == ft_demand_write
444 && (level == 1 ||
445 (level == 2 && (flags & _PAGE_PSE) && guest_supports_superpages(v))) )
446 {
447 if ( (flags & (_PAGE_DIRTY | _PAGE_ACCESSED))
448 == (_PAGE_DIRTY | _PAGE_ACCESSED) )
449 return flags; /* Guest already has A and D bits set */
450 flags |= _PAGE_DIRTY | _PAGE_ACCESSED;
451 perfc_incrc(shadow_ad_update);
452 }
453 else
454 {
455 if ( flags & _PAGE_ACCESSED )
456 return flags; /* Guest already has A bit set */
457 flags |= _PAGE_ACCESSED;
458 perfc_incrc(shadow_a_update);
459 }
461 /* Set the bit(s) */
462 sh_mark_dirty(v->domain, gmfn);
463 SHADOW_DEBUG(A_AND_D, "gfn = %" SH_PRI_gfn ", "
464 "old flags = %#x, new flags = %#x\n",
465 gfn_x(guest_l1e_get_gfn(*ep)), guest_l1e_get_flags(*ep),
466 flags);
467 *ep = guest_l1e_from_gfn(guest_l1e_get_gfn(*ep), flags);
469 /* Propagate this change to any other shadows of the page
470 * (only necessary if there is more than one shadow) */
471 if ( mfn_to_page(gmfn)->count_info & PGC_page_table )
472 {
473 u32 shflags = mfn_to_page(gmfn)->shadow_flags & SHF_page_type_mask;
474 /* More than one type bit set in shadow-flags? */
475 if ( shflags & ~(1UL << find_first_set_bit(shflags)) )
476 res = sh_validate_guest_entry(v, gmfn, ep, sizeof (*ep));
477 }
479 /* We should never need to flush the TLB or recopy PAE entries */
480 ASSERT((res == 0) || (res == SHADOW_SET_CHANGED));
482 return flags;
483 }
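/* Illustrative sketch, not part of the original file: the decision made by
 * guest_set_ad_bits() above, reduced to standalone flag arithmetic. The x86
 * flag values are defined locally so the sketch compiles on its own; the
 * write-back to the guest entry and the revalidation of other shadows are
 * omitted. */
#include <stdint.h>

#define SK_PAGE_ACCESSED 0x020u
#define SK_PAGE_DIRTY    0x040u

static uint32_t sketch_ad_bits(uint32_t gflags, int demand, int write,
                               int level_has_dirty_bit)
{
    if ( !demand )
        return gflags;                  /* prefetches never touch A/D */
    if ( write && level_has_dirty_bit )
        return gflags | SK_PAGE_ACCESSED | SK_PAGE_DIRTY;
    return gflags | SK_PAGE_ACCESSED;   /* reads (and non-l1 writes) set A only */
}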
485 #if (CONFIG_PAGING_LEVELS == GUEST_PAGING_LEVELS) && (CONFIG_PAGING_LEVELS == SHADOW_PAGING_LEVELS)
486 void *
487 sh_guest_map_l1e(struct vcpu *v, unsigned long addr,
488 unsigned long *gl1mfn)
489 {
490 void *pl1e = NULL;
491 walk_t gw;
493 ASSERT(shadow_mode_translate(v->domain));
495 // XXX -- this is expensive, but it's easy to cobble together...
496 // FIXME!
498 shadow_lock(v->domain);
499 guest_walk_tables(v, addr, &gw, 1);
501 if ( gw.l2e &&
502 (guest_l2e_get_flags(*gw.l2e) & _PAGE_PRESENT) &&
503 !(guest_supports_superpages(v) && (guest_l2e_get_flags(*gw.l2e) & _PAGE_PSE)) )
504 {
505 if ( gl1mfn )
506 *gl1mfn = mfn_x(gw.l1mfn);
507 pl1e = map_domain_page(mfn_x(gw.l1mfn)) +
508 (guest_l1_table_offset(addr) * sizeof(guest_l1e_t));
509 }
511 unmap_walk(v, &gw);
512 shadow_unlock(v->domain);
514 return pl1e;
515 }
517 void
518 sh_guest_get_eff_l1e(struct vcpu *v, unsigned long addr, void *eff_l1e)
519 {
520 walk_t gw;
522 ASSERT(shadow_mode_translate(v->domain));
524 // XXX -- this is expensive, but it's easy to cobble together...
525 // FIXME!
527 shadow_lock(v->domain);
528 guest_walk_tables(v, addr, &gw, 1);
529 *(guest_l1e_t *)eff_l1e = gw.eff_l1e;
530 unmap_walk(v, &gw);
531 shadow_unlock(v->domain);
532 }
533 #endif /* CONFIG==SHADOW==GUEST */
535 /**************************************************************************/
536 /* Functions to compute the correct index into a shadow page, given an
537 * index into the guest page (as returned by guest_get_index()).
538 * This is trivial when the shadow and guest use the same sized PTEs, but
539 * gets more interesting when those sizes are mismatched (e.g. 32-bit guest,
540 * PAE- or 64-bit shadows).
541 *
542 * These functions also increment the shadow mfn, when necessary. When PTE
543 * sizes are mismatched, it takes 2 shadow L1 pages for a single guest L1
544 * page. In this case, we allocate 2 contiguous pages for the shadow L1, and
545 * use simple pointer arithmetic on a pointer to the guest L1e to figure out
546 * which shadow page we really want. Similarly, when PTE sizes are
547 * mismatched, we shadow a guest L2 page with 4 shadow L2 pages. (The easiest
548 * way to see this is: a 32-bit guest L2 page maps 4GB of virtual address
549 * space, while a PAE- or 64-bit shadow L2 page maps 1GB of virtual address
550 * space.)
551 *
552 * For PAE guests, for every 32-bytes of guest L3 page table, we use 64-bytes
553 * of shadow (to store both the shadow, and the info that would normally be
554 * stored in page_info fields). This arrangement allows the shadow and the
555 * "page_info" fields to always be stored in the same page (in fact, in
556 * the same cache line), avoiding an extra call to map_domain_page().
557 */
559 static inline u32
560 guest_index(void *ptr)
561 {
562 return (u32)((unsigned long)ptr & ~PAGE_MASK) / sizeof(guest_l1e_t);
563 }
565 static u32
566 shadow_l1_index(mfn_t *smfn, u32 guest_index)
567 {
568 #if (GUEST_PAGING_LEVELS == 2) && (SHADOW_PAGING_LEVELS != 2)
569 *smfn = _mfn(mfn_x(*smfn) +
570 (guest_index / SHADOW_L1_PAGETABLE_ENTRIES));
571 return (guest_index % SHADOW_L1_PAGETABLE_ENTRIES);
572 #else
573 return guest_index;
574 #endif
575 }
577 static u32
578 shadow_l2_index(mfn_t *smfn, u32 guest_index)
579 {
580 #if (GUEST_PAGING_LEVELS == 2) && (SHADOW_PAGING_LEVELS != 2)
581 // Because we use 2 shadow l2 entries for each guest entry, the number of
582 // guest entries per shadow page is SHADOW_L2_PAGETABLE_ENTRIES/2
583 //
584 *smfn = _mfn(mfn_x(*smfn) +
585 (guest_index / (SHADOW_L2_PAGETABLE_ENTRIES / 2)));
587 // We multiply by two to get the index of the first of the two entries
588 // used to shadow the specified guest entry.
589 return (guest_index % (SHADOW_L2_PAGETABLE_ENTRIES / 2)) * 2;
590 #else
591 return guest_index;
592 #endif
593 }
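/* Illustrative sketch, not part of the original file: the 2-on-3 / 2-on-4
 * index arithmetic above with concrete (assumed) geometry -- 1024 4-byte
 * entries per 32-bit guest page, 512 8-byte entries per PAE/64-bit shadow
 * page, so one guest l1 spans 2 shadow pages and one guest l2 spans 4. */
#include <stdio.h>

#define SK_SHADOW_L1_ENTRIES 512
#define SK_SHADOW_L2_ENTRIES 512

int main(void)
{
    unsigned int gi = 700;   /* an arbitrary 32-bit guest index (0..1023) */
    printf("l1: shadow page +%u, slot %u\n",
           gi / SK_SHADOW_L1_ENTRIES, gi % SK_SHADOW_L1_ENTRIES);
    /* Each guest l2 entry becomes a *pair* of shadow l2 entries: */
    printf("l2: shadow page +%u, first slot %u\n",
           gi / (SK_SHADOW_L2_ENTRIES / 2),
           (gi % (SK_SHADOW_L2_ENTRIES / 2)) * 2);
    return 0;
}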
595 #if GUEST_PAGING_LEVELS >= 4
597 static u32
598 shadow_l3_index(mfn_t *smfn, u32 guest_index)
599 {
600 return guest_index;
601 }
603 static u32
604 shadow_l4_index(mfn_t *smfn, u32 guest_index)
605 {
606 return guest_index;
607 }
609 #endif // GUEST_PAGING_LEVELS >= 4
612 /**************************************************************************/
613 /* Function which computes shadow entries from their corresponding guest
614 * entries. This is the "heart" of the shadow code. It operates using
615 * level-1 shadow types, but handles all levels of entry.
616 * Don't call it directly, but use the four wrappers below.
617 */
619 static always_inline void
620 _sh_propagate(struct vcpu *v,
621 void *guest_entry_ptr,
622 mfn_t guest_table_mfn,
623 mfn_t target_mfn,
624 void *shadow_entry_ptr,
625 int level,
626 fetch_type_t ft,
627 int mmio)
628 {
629 guest_l1e_t *gp = guest_entry_ptr;
630 shadow_l1e_t *sp = shadow_entry_ptr;
631 struct domain *d = v->domain;
632 u32 pass_thru_flags;
633 u32 gflags, sflags;
635 /* We don't shadow PAE l3s */
636 ASSERT(GUEST_PAGING_LEVELS > 3 || level != 3);
638 if ( mfn_valid(guest_table_mfn) )
639 /* Handle A and D bit propagation into the guest */
640 gflags = guest_set_ad_bits(v, guest_table_mfn, gp, level, ft);
641 else
642 {
643 /* Must be an fl1e or a prefetch */
644 ASSERT(level==1 || !(ft & FETCH_TYPE_DEMAND));
645 gflags = guest_l1e_get_flags(*gp);
646 }
648 if ( unlikely(!(gflags & _PAGE_PRESENT)) )
649 {
650 /* If a guest l1 entry is not present, shadow with the magic
651 * guest-not-present entry. */
652 if ( level == 1 )
653 *sp = sh_l1e_gnp();
654 else
655 *sp = shadow_l1e_empty();
656 goto done;
657 }
659 if ( level == 1 && mmio )
660 {
661 /* Guest l1e maps MMIO space */
662 *sp = sh_l1e_mmio(guest_l1e_get_gfn(*gp), gflags);
663 goto done;
664 }
666 // Must have a valid target_mfn, unless this is a prefetch. In the
667 // case of a prefetch, an invalid mfn means that we can not usefully
668 // shadow anything, and so we return early.
669 //
670 if ( !mfn_valid(target_mfn) )
671 {
672 ASSERT((ft == ft_prefetch));
673 *sp = shadow_l1e_empty();
674 goto done;
675 }
677 // Propagate bits from the guest to the shadow.
678 // Some of these may be overwritten, below.
679 // Since we know the guest's PRESENT bit is set, we also set the shadow's
680 // SHADOW_PRESENT bit.
681 //
682 pass_thru_flags = (_PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_USER |
683 _PAGE_RW | _PAGE_PRESENT);
684 if ( guest_supports_nx(v) )
685 pass_thru_flags |= _PAGE_NX_BIT;
686 sflags = gflags & pass_thru_flags;
688 // Set the A&D bits for higher level shadows.
689 // Higher level entries do not, strictly speaking, have dirty bits, but
690 // since we use shadow linear tables, each of these entries may, at some
691 // point in time, also serve as a shadow L1 entry.
692 // By setting both the A&D bits in each of these, we eliminate the burden
693 // on the hardware to update these bits on initial accesses.
694 //
695 if ( (level > 1) && !((SHADOW_PAGING_LEVELS == 3) && (level == 3)) )
696 sflags |= _PAGE_ACCESSED | _PAGE_DIRTY;
698 // If the A or D bit has not yet been set in the guest, then we must
699 // prevent the corresponding kind of access.
700 //
701 if ( unlikely(!(gflags & _PAGE_ACCESSED)) )
702 sflags &= ~_PAGE_PRESENT;
704 /* D bits exist in L1es and PSE L2es */
705 if ( unlikely(((level == 1) ||
706 ((level == 2) &&
707 (gflags & _PAGE_PSE) &&
708 guest_supports_superpages(v)))
709 && !(gflags & _PAGE_DIRTY)) )
710 sflags &= ~_PAGE_RW;
712 // shadow_mode_log_dirty support
713 //
714 // Only allow the guest write access to a page a) on a demand fault,
715 // or b) if the page is already marked as dirty.
716 //
717 if ( unlikely((level == 1) && shadow_mode_log_dirty(d)) )
718 {
719 if ( ft & FETCH_TYPE_WRITE )
720 sh_mark_dirty(d, target_mfn);
721 else if ( !sh_mfn_is_dirty(d, target_mfn) )
722 sflags &= ~_PAGE_RW;
723 }
725 // protect guest page tables
726 //
727 if ( unlikely((level == 1) && sh_mfn_is_a_page_table(target_mfn)) )
728 {
729 if ( shadow_mode_trap_reads(d) )
730 {
731 // if we are trapping both reads & writes, then mark this page
732 // as not present...
733 //
734 sflags &= ~_PAGE_PRESENT;
735 }
736 else
737 {
738 // otherwise, just prevent any writes...
739 //
740 sflags &= ~_PAGE_RW;
741 }
742 }
744 // PV guests in 64-bit mode use two different page tables for user vs
745 // supervisor permissions, making the guest's _PAGE_USER bit irrelevant.
746 // It is always shadowed as present...
747 if ( (GUEST_PAGING_LEVELS == 4) && !IS_COMPAT(d) && !is_hvm_domain(d) )
748 {
749 sflags |= _PAGE_USER;
750 }
752 *sp = shadow_l1e_from_mfn(target_mfn, sflags);
753 done:
754 SHADOW_DEBUG(PROPAGATE,
755 "%s level %u guest %" SH_PRI_gpte " shadow %" SH_PRI_pte "\n",
756 fetch_type_names[ft], level, gp->l1, sp->l1);
757 }
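/* Illustrative sketch, not part of the original file: the core of the flag
 * propagation above for an ordinary l1e, as standalone arithmetic. Flag
 * values are the usual x86 ones, defined locally; NX, log-dirty, MMIO and
 * pagetable-protection refinements are left out. */
#include <stdint.h>

#define SK_PRESENT  0x001u
#define SK_RW       0x002u
#define SK_USER     0x004u
#define SK_ACCESSED 0x020u
#define SK_DIRTY    0x040u

static uint32_t sketch_l1_sflags(uint32_t gflags)
{
    uint32_t sflags = gflags & (SK_PRESENT | SK_RW | SK_USER |
                                SK_ACCESSED | SK_DIRTY);
    if ( !(gflags & SK_ACCESSED) )
        sflags &= ~SK_PRESENT;   /* not-present until a fault lets us set A */
    if ( !(gflags & SK_DIRTY) )
        sflags &= ~SK_RW;        /* read-only until a write fault lets us set D */
    return sflags;
}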
760 /* These four wrappers give us a little bit of type-safety back around the
761 * use of void-* pointers in _sh_propagate(), and allow the compiler to
762 * optimize out some level checks. */
764 #if GUEST_PAGING_LEVELS >= 4
765 static void
766 l4e_propagate_from_guest(struct vcpu *v,
767 guest_l4e_t *gl4e,
768 mfn_t gl4mfn,
769 mfn_t sl3mfn,
770 shadow_l4e_t *sl4e,
771 fetch_type_t ft)
772 {
773 _sh_propagate(v, gl4e, gl4mfn, sl3mfn, sl4e, 4, ft, 0);
774 }
776 static void
777 l3e_propagate_from_guest(struct vcpu *v,
778 guest_l3e_t *gl3e,
779 mfn_t gl3mfn,
780 mfn_t sl2mfn,
781 shadow_l3e_t *sl3e,
782 fetch_type_t ft)
783 {
784 _sh_propagate(v, gl3e, gl3mfn, sl2mfn, sl3e, 3, ft, 0);
785 }
786 #endif // GUEST_PAGING_LEVELS >= 4
788 static void
789 l2e_propagate_from_guest(struct vcpu *v,
790 guest_l2e_t *gl2e,
791 mfn_t gl2mfn,
792 mfn_t sl1mfn,
793 shadow_l2e_t *sl2e,
794 fetch_type_t ft)
795 {
796 _sh_propagate(v, gl2e, gl2mfn, sl1mfn, sl2e, 2, ft, 0);
797 }
799 static void
800 l1e_propagate_from_guest(struct vcpu *v,
801 guest_l1e_t *gl1e,
802 mfn_t gl1mfn,
803 mfn_t gmfn,
804 shadow_l1e_t *sl1e,
805 fetch_type_t ft,
806 int mmio)
807 {
808 _sh_propagate(v, gl1e, gl1mfn, gmfn, sl1e, 1, ft, mmio);
809 }
812 /**************************************************************************/
813 /* These functions update shadow entries (and do bookkeeping on the shadow
814 * tables they are in). It is intended that they are the only
815 * functions which ever write (non-zero) data onto a shadow page.
816 */
818 static inline void safe_write_entry(void *dst, void *src)
819 /* Copy one PTE safely when processors might be running on the
820 * destination pagetable. This does *not* give safety against
821 * concurrent writes (that's what the shadow lock is for), just
822 * stops the hardware picking up partially written entries. */
823 {
824 volatile unsigned long *d = dst;
825 unsigned long *s = src;
826 ASSERT(!((unsigned long) d & (sizeof (shadow_l1e_t) - 1)));
827 #if CONFIG_PAGING_LEVELS == 3
828 /* In PAE mode, pagetable entries are larger
829 * than machine words, so won't get written atomically. We need to make
830 * sure any other cpu running on these shadows doesn't see a
831 * half-written entry. Do this by marking the entry not-present first,
832 * then writing the high word before the low word. */
833 BUILD_BUG_ON(sizeof (shadow_l1e_t) != 2 * sizeof (unsigned long));
834 d[0] = 0;
835 d[1] = s[1];
836 d[0] = s[0];
837 #else
838 /* In 32-bit and 64-bit, sizeof(pte) == sizeof(ulong) == 1 word,
839 * which will be an atomic write, since the entry is aligned. */
840 BUILD_BUG_ON(sizeof (shadow_l1e_t) != sizeof (unsigned long));
841 *d = *s;
842 #endif
843 }
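/* Illustrative sketch, not part of the original file: why the PAE branch
 * above clears the low word first. A 64-bit PTE written as two 32-bit
 * stores could otherwise be seen half-old/half-new by another CPU walking
 * the shadow; zeroing the word that holds the present bit first means the
 * only intermediate states a walker can observe are "not present" or the
 * complete new entry. */
#include <stdint.h>

static void sketch_safe_write_pae(volatile uint32_t d[2], const uint32_t s[2])
{
    d[0] = 0;        /* low word first: entry momentarily not-present */
    d[1] = s[1];     /* high word of the new entry */
    d[0] = s[0];     /* low word last: entry becomes valid in one store */
}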
846 static inline void
847 shadow_write_entries(void *d, void *s, int entries, mfn_t mfn)
848 /* This function does the actual writes to shadow pages.
849 * It must not be called directly, since it doesn't do the bookkeeping
850 * that shadow_set_l*e() functions do. */
851 {
852 shadow_l1e_t *dst = d;
853 shadow_l1e_t *src = s;
854 void *map = NULL;
855 int i;
857 /* Because we mirror access rights at all levels in the shadow, an
858 * l2 (or higher) entry with the RW bit cleared will leave us with
859 * no write access through the linear map.
860 * We detect that by writing to the shadow with copy_to_user() and
861 * using map_domain_page() to get a writeable mapping if we need to. */
862 if ( __copy_to_user(d, d, sizeof (unsigned long)) != 0 )
863 {
864 perfc_incrc(shadow_linear_map_failed);
865 map = sh_map_domain_page(mfn);
866 ASSERT(map != NULL);
867 dst = map + ((unsigned long)dst & (PAGE_SIZE - 1));
868 }
871 for ( i = 0; i < entries; i++ )
872 safe_write_entry(dst++, src++);
874 if ( map != NULL ) sh_unmap_domain_page(map);
875 }
877 static inline int
878 perms_strictly_increased(u32 old_flags, u32 new_flags)
879 /* Given the flags of two entries, are the new flags a strict
880 * increase in rights over the old ones? */
881 {
882 u32 of = old_flags & (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX);
883 u32 nf = new_flags & (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX);
884 /* Flip the NX bit, since it's the only one that decreases rights;
885 * we calculate as if it were an "X" bit. */
886 of ^= _PAGE_NX_BIT;
887 nf ^= _PAGE_NX_BIT;
888 /* If the changed bits are all set in the new flags, then rights strictly
889 * increased between old and new. */
890 return ((of | (of ^ nf)) == nf);
891 }
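/* Illustrative sketch, not part of the original file: the bit trick above,
 * exercised with concrete flag values (defined locally; the original also
 * flips NX first, omitted here since these flags carry no NX bit). A strict
 * increase in rights means every changed bit is set in the new flags. */
#include <assert.h>
#include <stdint.h>

#define SK_PRESENT 0x001u
#define SK_RW      0x002u
#define SK_USER    0x004u

static int sketch_strictly_increased(uint32_t of, uint32_t nf)
{
    return ((of | (of ^ nf)) == nf);
}

int main(void)
{
    /* Adding RW to a present entry: strictly more rights, no flush needed. */
    assert(sketch_strictly_increased(SK_PRESENT, SK_PRESENT | SK_RW));
    /* Trading RW for USER: not a strict increase, callers must flush. */
    assert(!sketch_strictly_increased(SK_PRESENT | SK_RW, SK_PRESENT | SK_USER));
    return 0;
}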
893 static int inline
894 shadow_get_page_from_l1e(shadow_l1e_t sl1e, struct domain *d)
895 {
896 int res;
897 mfn_t mfn;
898 struct domain *owner;
900 ASSERT(!sh_l1e_is_magic(sl1e));
902 if ( !shadow_mode_refcounts(d) )
903 return 1;
905 res = get_page_from_l1e(sl1e, d);
907 // If a privileged domain is attempting to install a map of a page it does
908 // not own, we let it succeed anyway.
909 //
910 if ( unlikely(!res) &&
911 IS_PRIV(d) &&
912 !shadow_mode_translate(d) &&
913 mfn_valid(mfn = shadow_l1e_get_mfn(sl1e)) &&
914 (owner = page_get_owner(mfn_to_page(mfn))) &&
915 (d != owner) )
916 {
917 res = get_page_from_l1e(sl1e, owner);
918 SHADOW_PRINTK("privileged domain %d installs map of mfn %05lx "
919 "which is owned by domain %d: %s\n",
920 d->domain_id, mfn_x(mfn), owner->domain_id,
921 res ? "success" : "failed");
922 }
924 if ( unlikely(!res) )
925 {
926 perfc_incrc(shadow_get_page_fail);
927 SHADOW_PRINTK("failed: l1e=" SH_PRI_pte "\n");
928 }
930 return res;
931 }
933 static void inline
934 shadow_put_page_from_l1e(shadow_l1e_t sl1e, struct domain *d)
935 {
936 if ( !shadow_mode_refcounts(d) )
937 return;
939 put_page_from_l1e(sl1e, d);
940 }
942 #if GUEST_PAGING_LEVELS >= 4
943 static int shadow_set_l4e(struct vcpu *v,
944 shadow_l4e_t *sl4e,
945 shadow_l4e_t new_sl4e,
946 mfn_t sl4mfn)
947 {
948 int flags = 0, ok;
949 shadow_l4e_t old_sl4e;
950 paddr_t paddr;
951 ASSERT(sl4e != NULL);
952 old_sl4e = *sl4e;
954 if ( old_sl4e.l4 == new_sl4e.l4 ) return 0; /* Nothing to do */
956 paddr = ((((paddr_t)mfn_x(sl4mfn)) << PAGE_SHIFT)
957 | (((unsigned long)sl4e) & ~PAGE_MASK));
959 if ( shadow_l4e_get_flags(new_sl4e) & _PAGE_PRESENT )
960 {
961 /* About to install a new reference */
962 mfn_t sl3mfn = shadow_l4e_get_mfn(new_sl4e);
963 ok = sh_get_ref(v, sl3mfn, paddr);
964 /* Are we pinning l3 shadows to handle weird linux behaviour? */
965 if ( sh_type_is_pinnable(v, SH_type_l3_64_shadow) )
966 ok |= sh_pin(v, sl3mfn);
967 if ( !ok )
968 {
969 domain_crash(v->domain);
970 return SHADOW_SET_ERROR;
971 }
972 }
974 /* Write the new entry */
975 shadow_write_entries(sl4e, &new_sl4e, 1, sl4mfn);
976 flags |= SHADOW_SET_CHANGED;
978 if ( shadow_l4e_get_flags(old_sl4e) & _PAGE_PRESENT )
979 {
980 /* We lost a reference to an old mfn. */
981 mfn_t osl3mfn = shadow_l4e_get_mfn(old_sl4e);
982 if ( (mfn_x(osl3mfn) != mfn_x(shadow_l4e_get_mfn(new_sl4e)))
983 || !perms_strictly_increased(shadow_l4e_get_flags(old_sl4e),
984 shadow_l4e_get_flags(new_sl4e)) )
985 {
986 flags |= SHADOW_SET_FLUSH;
987 }
988 sh_put_ref(v, osl3mfn, paddr);
989 }
990 return flags;
991 }
993 static int shadow_set_l3e(struct vcpu *v,
994 shadow_l3e_t *sl3e,
995 shadow_l3e_t new_sl3e,
996 mfn_t sl3mfn)
997 {
998 int flags = 0;
999 shadow_l3e_t old_sl3e;
1000 paddr_t paddr;
1001 ASSERT(sl3e != NULL);
1002 old_sl3e = *sl3e;
1004 if ( old_sl3e.l3 == new_sl3e.l3 ) return 0; /* Nothing to do */
1006 paddr = ((((paddr_t)mfn_x(sl3mfn)) << PAGE_SHIFT)
1007 | (((unsigned long)sl3e) & ~PAGE_MASK));
1009 if ( shadow_l3e_get_flags(new_sl3e) & _PAGE_PRESENT )
1010 /* About to install a new reference */
1011 if ( !sh_get_ref(v, shadow_l3e_get_mfn(new_sl3e), paddr) )
1012 {
1013 domain_crash(v->domain);
1014 return SHADOW_SET_ERROR;
1015 }
1017 /* Write the new entry */
1018 shadow_write_entries(sl3e, &new_sl3e, 1, sl3mfn);
1019 flags |= SHADOW_SET_CHANGED;
1021 if ( shadow_l3e_get_flags(old_sl3e) & _PAGE_PRESENT )
1022 {
1023 /* We lost a reference to an old mfn. */
1024 mfn_t osl2mfn = shadow_l3e_get_mfn(old_sl3e);
1025 if ( (mfn_x(osl2mfn) != mfn_x(shadow_l3e_get_mfn(new_sl3e))) ||
1026 !perms_strictly_increased(shadow_l3e_get_flags(old_sl3e),
1027 shadow_l3e_get_flags(new_sl3e)) )
1028 {
1029 flags |= SHADOW_SET_FLUSH;
1030 }
1031 sh_put_ref(v, osl2mfn, paddr);
1032 }
1033 return flags;
1034 }
1035 #endif /* GUEST_PAGING_LEVELS >= 4 */
1037 static int shadow_set_l2e(struct vcpu *v,
1038 shadow_l2e_t *sl2e,
1039 shadow_l2e_t new_sl2e,
1040 mfn_t sl2mfn)
1041 {
1042 int flags = 0;
1043 shadow_l2e_t old_sl2e;
1044 paddr_t paddr;
1046 #if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2
1047 /* In 2-on-3 we work with pairs of l2es pointing at two-page
1048 * shadows. Reference counting and up-pointers track from the first
1049 * page of the shadow to the first l2e, so make sure that we're
1050 * working with those:
1051 * Align the pointer down so it's pointing at the first of the pair */
1052 sl2e = (shadow_l2e_t *)((unsigned long)sl2e & ~(sizeof(shadow_l2e_t)));
1053 /* Align the mfn of the shadow entry too */
1054 new_sl2e.l2 &= ~(1<<PAGE_SHIFT);
1055 #endif
1057 ASSERT(sl2e != NULL);
1058 old_sl2e = *sl2e;
1060 if ( old_sl2e.l2 == new_sl2e.l2 ) return 0; /* Nothing to do */
1062 paddr = ((((paddr_t)mfn_x(sl2mfn)) << PAGE_SHIFT)
1063 | (((unsigned long)sl2e) & ~PAGE_MASK));
1065 if ( shadow_l2e_get_flags(new_sl2e) & _PAGE_PRESENT )
1066 /* About to install a new reference */
1067 if ( !sh_get_ref(v, shadow_l2e_get_mfn(new_sl2e), paddr) )
1069 domain_crash(v->domain);
1070 return SHADOW_SET_ERROR;
1073 /* Write the new entry */
1074 #if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2
1076 shadow_l2e_t pair[2] = { new_sl2e, new_sl2e };
1077 /* The l1 shadow is two pages long and needs to be pointed to by
1078 * two adjacent l2es. The pair have the same flags, but point
1079 * at odd and even MFNs */
1080 ASSERT(!(pair[0].l2 & (1<<PAGE_SHIFT)));
1081 pair[1].l2 |= (1<<PAGE_SHIFT);
1082 shadow_write_entries(sl2e, &pair, 2, sl2mfn);
1084 #else /* normal case */
1085 shadow_write_entries(sl2e, &new_sl2e, 1, sl2mfn);
1086 #endif
1087 flags |= SHADOW_SET_CHANGED;
1089 if ( shadow_l2e_get_flags(old_sl2e) & _PAGE_PRESENT )
1091 /* We lost a reference to an old mfn. */
1092 mfn_t osl1mfn = shadow_l2e_get_mfn(old_sl2e);
1093 if ( (mfn_x(osl1mfn) != mfn_x(shadow_l2e_get_mfn(new_sl2e))) ||
1094 !perms_strictly_increased(shadow_l2e_get_flags(old_sl2e),
1095 shadow_l2e_get_flags(new_sl2e)) )
1097 flags |= SHADOW_SET_FLUSH;
1099 sh_put_ref(v, osl1mfn, paddr);
1101 return flags;
1102 }
1104 static int shadow_set_l1e(struct vcpu *v,
1105 shadow_l1e_t *sl1e,
1106 shadow_l1e_t new_sl1e,
1107 mfn_t sl1mfn)
1108 {
1109 int flags = 0;
1110 struct domain *d = v->domain;
1111 shadow_l1e_t old_sl1e;
1112 ASSERT(sl1e != NULL);
1114 old_sl1e = *sl1e;
1116 if ( old_sl1e.l1 == new_sl1e.l1 ) return 0; /* Nothing to do */
1118 if ( (shadow_l1e_get_flags(new_sl1e) & _PAGE_PRESENT)
1119 && !sh_l1e_is_magic(new_sl1e) )
1121 /* About to install a new reference */
1122 if ( shadow_mode_refcounts(d) ) {
1123 if ( shadow_get_page_from_l1e(new_sl1e, d) == 0 )
1125 /* Doesn't look like a pagetable. */
1126 flags |= SHADOW_SET_ERROR;
1127 new_sl1e = shadow_l1e_empty();
1132 /* Write the new entry */
1133 shadow_write_entries(sl1e, &new_sl1e, 1, sl1mfn);
1134 flags |= SHADOW_SET_CHANGED;
1136 if ( (shadow_l1e_get_flags(old_sl1e) & _PAGE_PRESENT)
1137 && !sh_l1e_is_magic(old_sl1e) )
1139 /* We lost a reference to an old mfn. */
1140 /* N.B. Unlike higher-level sets, never need an extra flush
1141 * when writing an l1e. Because it points to the same guest frame
1142 * as the guest l1e did, it's the guest's responsibility to
1143 * trigger a flush later. */
1144 if ( shadow_mode_refcounts(d) )
1146 shadow_put_page_from_l1e(old_sl1e, d);
1149 return flags;
1150 }
1153 /**************************************************************************/
1154 /* Macros to walk pagetables. These take the shadow of a pagetable and
1155 * walk every "interesting" entry. That is, they don't touch Xen mappings,
1156 * and for 32-bit l2s shadowed onto PAE or 64-bit, they only touch every
1157 * second entry (since pairs of entries are managed together). For multi-page
1158 * shadows they walk all pages.
1160 * Arguments are an MFN, the variable to point to each entry, a variable
1161 * to indicate that we are done (we will shortcut to the end of the scan
1162 * when _done != 0), a variable to indicate that we should avoid Xen mappings,
1163 * and the code.
1165 * WARNING: These macros have side-effects. They change the values of both
1166 * the pointer and the MFN. */
1168 static inline void increment_ptr_to_guest_entry(void *ptr)
1169 {
1170 if ( ptr )
1171 {
1172 guest_l1e_t **entry = ptr;
1173 (*entry)++;
1174 }
1175 }
1177 /* All kinds of l1: touch all entries */
1178 #define _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code) \
1179 do { \
1180 int _i; \
1181 shadow_l1e_t *_sp = map_shadow_page((_sl1mfn)); \
1182 ASSERT(mfn_to_shadow_page(_sl1mfn)->type == SH_type_l1_shadow \
1183 || mfn_to_shadow_page(_sl1mfn)->type == SH_type_fl1_shadow); \
1184 for ( _i = 0; _i < SHADOW_L1_PAGETABLE_ENTRIES; _i++ ) \
1185 { \
1186 (_sl1e) = _sp + _i; \
1187 if ( shadow_l1e_get_flags(*(_sl1e)) & _PAGE_PRESENT ) \
1188 {_code} \
1189 if ( _done ) break; \
1190 increment_ptr_to_guest_entry(_gl1p); \
1191 } \
1192 unmap_shadow_page(_sp); \
1193 } while (0)
1195 /* 32-bit l1, on PAE or 64-bit shadows: need to walk both pages of shadow */
1196 #if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2
1197 #define SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code) \
1198 do { \
1199 int __done = 0; \
1200 _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, \
1201 ({ (__done = _done); }), _code); \
1202 _sl1mfn = _mfn(mfn_x(_sl1mfn) + 1); \
1203 if ( !__done ) \
1204 _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, \
1205 ({ (__done = _done); }), _code); \
1206 } while (0)
1207 #else /* Everything else; l1 shadows are only one page */
1208 #define SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code) \
1209 _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code)
1210 #endif
1213 #if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2
1215 /* 32-bit l2 on PAE/64: four pages, touch every second entry, and avoid Xen */
1216 #define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _dom, _code) \
1217 do { \
1218 int _i, _j, __done = 0; \
1219 int _xen = !shadow_mode_external(_dom); \
1220 ASSERT(mfn_to_shadow_page(_sl2mfn)->type == SH_type_l2_32_shadow); \
1221 for ( _j = 0; _j < 4 && !__done; _j++ ) \
1222 { \
1223 shadow_l2e_t *_sp = map_shadow_page(_sl2mfn); \
1224 for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i += 2 ) \
1225 if ( (!(_xen)) \
1226 || ((_j * SHADOW_L2_PAGETABLE_ENTRIES) + _i) \
1227 < (HYPERVISOR_VIRT_START >> SHADOW_L2_PAGETABLE_SHIFT) ) \
1228 { \
1229 (_sl2e) = _sp + _i; \
1230 if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \
1231 {_code} \
1232 if ( (__done = (_done)) ) break; \
1233 increment_ptr_to_guest_entry(_gl2p); \
1234 } \
1235 unmap_shadow_page(_sp); \
1236 _sl2mfn = _mfn(mfn_x(_sl2mfn) + 1); \
1237 } \
1238 } while (0)
1240 #elif GUEST_PAGING_LEVELS == 2
1242 /* 32-bit on 32-bit: avoid Xen entries */
1243 #define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _dom, _code) \
1244 do { \
1245 int _i; \
1246 int _xen = !shadow_mode_external(_dom); \
1247 shadow_l2e_t *_sp = map_shadow_page((_sl2mfn)); \
1248 ASSERT(mfn_to_shadow_page(_sl2mfn)->type == SH_type_l2_32_shadow); \
1249 for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i++ ) \
1250 if ( (!(_xen)) \
1251 || \
1252 (_i < (HYPERVISOR_VIRT_START >> SHADOW_L2_PAGETABLE_SHIFT)) ) \
1253 { \
1254 (_sl2e) = _sp + _i; \
1255 if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \
1256 {_code} \
1257 if ( _done ) break; \
1258 increment_ptr_to_guest_entry(_gl2p); \
1259 } \
1260 unmap_shadow_page(_sp); \
1261 } while (0)
1263 #elif GUEST_PAGING_LEVELS == 3
1265 /* PAE: if it's an l2h, don't touch Xen mappings */
1266 #define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _dom, _code) \
1267 do { \
1268 int _i; \
1269 int _xen = !shadow_mode_external(_dom); \
1270 shadow_l2e_t *_sp = map_shadow_page((_sl2mfn)); \
1271 ASSERT(mfn_to_shadow_page(_sl2mfn)->type == SH_type_l2_pae_shadow \
1272 || mfn_to_shadow_page(_sl2mfn)->type == SH_type_l2h_pae_shadow);\
1273 for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i++ ) \
1274 if ( (!(_xen)) \
1275 || mfn_to_shadow_page(_sl2mfn)->type != SH_type_l2h_pae_shadow\
1276 || ((_i + (3 * SHADOW_L2_PAGETABLE_ENTRIES)) \
1277 < (HYPERVISOR_VIRT_START >> SHADOW_L2_PAGETABLE_SHIFT)) ) \
1278 { \
1279 (_sl2e) = _sp + _i; \
1280 if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \
1281 {_code} \
1282 if ( _done ) break; \
1283 increment_ptr_to_guest_entry(_gl2p); \
1284 } \
1285 unmap_shadow_page(_sp); \
1286 } while (0)
1288 #else
1290 /* 64-bit l2: touch all entries except for PAE compat guests. */
1291 #define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _dom, _code) \
1292 do { \
1293 int _i; \
1294 int _xen = !shadow_mode_external(_dom); \
1295 shadow_l2e_t *_sp = map_shadow_page((_sl2mfn)); \
1296 ASSERT(mfn_to_shadow_page(_sl2mfn)->type == SH_type_l2_64_shadow || \
1297 mfn_to_shadow_page(_sl2mfn)->type == SH_type_l2h_64_shadow); \
1298 for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i++ ) \
1299 { \
1300 if ( (!(_xen)) \
1301 || !IS_COMPAT(_dom) \
1302 || mfn_to_shadow_page(_sl2mfn)->type != SH_type_l2h_64_shadow \
1303 || (_i < COMPAT_L2_PAGETABLE_FIRST_XEN_SLOT(_dom)) ) \
1304 { \
1305 (_sl2e) = _sp + _i; \
1306 if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \
1307 {_code} \
1308 if ( _done ) break; \
1309 increment_ptr_to_guest_entry(_gl2p); \
1310 } \
1311 } \
1312 unmap_shadow_page(_sp); \
1313 } while (0)
1315 #endif /* different kinds of l2 */
1317 #if GUEST_PAGING_LEVELS == 4
1319 /* 64-bit l3: touch all entries */
1320 #define SHADOW_FOREACH_L3E(_sl3mfn, _sl3e, _gl3p, _done, _code) \
1321 do { \
1322 int _i; \
1323 shadow_l3e_t *_sp = map_shadow_page((_sl3mfn)); \
1324 ASSERT(mfn_to_shadow_page(_sl3mfn)->type == SH_type_l3_64_shadow); \
1325 for ( _i = 0; _i < SHADOW_L3_PAGETABLE_ENTRIES; _i++ ) \
1326 { \
1327 (_sl3e) = _sp + _i; \
1328 if ( shadow_l3e_get_flags(*(_sl3e)) & _PAGE_PRESENT ) \
1329 {_code} \
1330 if ( _done ) break; \
1331 increment_ptr_to_guest_entry(_gl3p); \
1332 } \
1333 unmap_shadow_page(_sp); \
1334 } while (0)
1336 /* 64-bit l4: avoid Xen mappings */
1337 #define SHADOW_FOREACH_L4E(_sl4mfn, _sl4e, _gl4p, _done, _dom, _code) \
1338 do { \
1339 shadow_l4e_t *_sp = map_shadow_page((_sl4mfn)); \
1340 int _xen = !shadow_mode_external(_dom); \
1341 int _i; \
1342 ASSERT(mfn_to_shadow_page(_sl4mfn)->type == SH_type_l4_64_shadow); \
1343 for ( _i = 0; _i < SHADOW_L4_PAGETABLE_ENTRIES; _i++ ) \
1344 { \
1345 if ( (!(_xen)) || is_guest_l4_slot(_dom, _i) ) \
1346 { \
1347 (_sl4e) = _sp + _i; \
1348 if ( shadow_l4e_get_flags(*(_sl4e)) & _PAGE_PRESENT ) \
1349 {_code} \
1350 if ( _done ) break; \
1351 } \
1352 increment_ptr_to_guest_entry(_gl4p); \
1353 } \
1354 unmap_shadow_page(_sp); \
1355 } while (0)
1357 #endif
1361 /**************************************************************************/
1362 /* Functions to install Xen mappings and linear mappings in shadow pages */
1364 // XXX -- this function should probably be moved to shadow-common.c, but that
1365 // probably wants to wait until the shadow types have been moved from
1366 // shadow-types.h to shadow-private.h
1367 //
1368 #if CONFIG_PAGING_LEVELS == 4 && GUEST_PAGING_LEVELS == 4
1369 void sh_install_xen_entries_in_l4(struct vcpu *v, mfn_t gl4mfn, mfn_t sl4mfn)
1370 {
1371 struct domain *d = v->domain;
1372 shadow_l4e_t *sl4e;
1374 sl4e = sh_map_domain_page(sl4mfn);
1375 ASSERT(sl4e != NULL);
1376 ASSERT(sizeof (l4_pgentry_t) == sizeof (shadow_l4e_t));
1378 /* Copy the common Xen mappings from the idle domain */
1379 memcpy(&sl4e[ROOT_PAGETABLE_FIRST_XEN_SLOT],
1380 &idle_pg_table[ROOT_PAGETABLE_FIRST_XEN_SLOT],
1381 ROOT_PAGETABLE_XEN_SLOTS * sizeof(l4_pgentry_t));
1383 /* Install the per-domain mappings for this domain */
1384 sl4e[shadow_l4_table_offset(PERDOMAIN_VIRT_START)] =
1385 shadow_l4e_from_mfn(page_to_mfn(virt_to_page(d->arch.mm_perdomain_l3)),
1386 __PAGE_HYPERVISOR);
1388 /* Linear mapping */
1389 sl4e[shadow_l4_table_offset(SH_LINEAR_PT_VIRT_START)] =
1390 shadow_l4e_from_mfn(sl4mfn, __PAGE_HYPERVISOR);
1392 if ( shadow_mode_translate(v->domain) && !shadow_mode_external(v->domain) )
1394 // linear tables may not be used with translated PV guests
1395 sl4e[shadow_l4_table_offset(LINEAR_PT_VIRT_START)] =
1396 shadow_l4e_empty();
1398 else
1400 sl4e[shadow_l4_table_offset(LINEAR_PT_VIRT_START)] =
1401 shadow_l4e_from_mfn(gl4mfn, __PAGE_HYPERVISOR);
1404 if ( shadow_mode_translate(v->domain) )
1406 /* install domain-specific P2M table */
1407 sl4e[shadow_l4_table_offset(RO_MPT_VIRT_START)] =
1408 shadow_l4e_from_mfn(pagetable_get_mfn(d->arch.phys_table),
1409 __PAGE_HYPERVISOR);
1412 if ( IS_COMPAT(v->domain) )
1414 /* install compat arg xlat entry */
1415 sl4e[shadow_l4_table_offset(COMPAT_ARG_XLAT_VIRT_BASE)] =
1416 shadow_l4e_from_mfn(
1417 page_to_mfn(virt_to_page(d->arch.mm_arg_xlat_l3)),
1418 __PAGE_HYPERVISOR);
1421 sh_unmap_domain_page(sl4e);
1422 }
1423 #endif
1425 #if CONFIG_PAGING_LEVELS >= 3 && GUEST_PAGING_LEVELS >= 3
1426 // For 3-on-3 PV guests, we need to make sure the xen mappings are in
1427 // place, which means that we need to populate the l2h entry in the l3
1428 // table.
1430 static void sh_install_xen_entries_in_l2h(struct vcpu *v, mfn_t sl2hmfn)
1431 {
1432 struct domain *d = v->domain;
1433 shadow_l2e_t *sl2e;
1434 #if CONFIG_PAGING_LEVELS == 3
1435 int i;
1436 #else
1438 if ( !pv_32bit_guest(v) )
1439 return;
1440 #endif
1442 sl2e = sh_map_domain_page(sl2hmfn);
1443 ASSERT(sl2e != NULL);
1444 ASSERT(sizeof (l2_pgentry_t) == sizeof (shadow_l2e_t));
1446 #if CONFIG_PAGING_LEVELS == 3
1448 /* Copy the common Xen mappings from the idle domain */
1449 memcpy(&sl2e[L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1)],
1450 &idle_pg_table_l2[L2_PAGETABLE_FIRST_XEN_SLOT],
1451 L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
1453 /* Install the per-domain mappings for this domain */
1454 for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
1455 sl2e[shadow_l2_table_offset(PERDOMAIN_VIRT_START) + i] =
1456 shadow_l2e_from_mfn(
1457 page_to_mfn(virt_to_page(d->arch.mm_perdomain_pt) + i),
1458 __PAGE_HYPERVISOR);
1460 /* We don't set up a linear mapping here because we can't until this
1461 * l2h is installed in an l3e. sh_update_linear_entries() handles
1462 * the linear mappings when CR3 (and so the fourth l3e) is loaded.
1463 * We zero them here, just as a safety measure.
1464 */
1465 for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
1466 sl2e[shadow_l2_table_offset(LINEAR_PT_VIRT_START) + i] =
1467 shadow_l2e_empty();
1468 for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
1469 sl2e[shadow_l2_table_offset(SH_LINEAR_PT_VIRT_START) + i] =
1470 shadow_l2e_empty();
1472 if ( shadow_mode_translate(d) )
1474 /* Install the domain-specific p2m table */
1475 l3_pgentry_t *p2m;
1476 ASSERT(pagetable_get_pfn(d->arch.phys_table) != 0);
1477 p2m = sh_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
1478 for ( i = 0; i < MACHPHYS_MBYTES>>1; i++ )
1480 sl2e[shadow_l2_table_offset(RO_MPT_VIRT_START) + i] =
1481 (l3e_get_flags(p2m[i]) & _PAGE_PRESENT)
1482 ? shadow_l2e_from_mfn(_mfn(l3e_get_pfn(p2m[i])),
1483 __PAGE_HYPERVISOR)
1484 : shadow_l2e_empty();
1486 sh_unmap_domain_page(p2m);
1489 #else
1491 /* Copy the common Xen mappings from the idle domain */
1492 memcpy(
1493 &sl2e[COMPAT_L2_PAGETABLE_FIRST_XEN_SLOT(d)],
1494 &compat_idle_pg_table_l2[l2_table_offset(HIRO_COMPAT_MPT_VIRT_START)],
1495 COMPAT_L2_PAGETABLE_XEN_SLOTS(d) * sizeof(*sl2e));
1497 #endif
1499 sh_unmap_domain_page(sl2e);
1500 }
1501 #endif
1504 #if CONFIG_PAGING_LEVELS == 2 && GUEST_PAGING_LEVELS == 2
1505 void sh_install_xen_entries_in_l2(struct vcpu *v, mfn_t gl2mfn, mfn_t sl2mfn)
1506 {
1507 struct domain *d = v->domain;
1508 shadow_l2e_t *sl2e;
1509 int i;
1511 sl2e = sh_map_domain_page(sl2mfn);
1512 ASSERT(sl2e != NULL);
1513 ASSERT(sizeof (l2_pgentry_t) == sizeof (shadow_l2e_t));
1515 /* Copy the common Xen mappings from the idle domain */
1516 memcpy(&sl2e[L2_PAGETABLE_FIRST_XEN_SLOT],
1517 &idle_pg_table[L2_PAGETABLE_FIRST_XEN_SLOT],
1518 L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
1520 /* Install the per-domain mappings for this domain */
1521 for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
1522 sl2e[shadow_l2_table_offset(PERDOMAIN_VIRT_START) + i] =
1523 shadow_l2e_from_mfn(
1524 page_to_mfn(virt_to_page(d->arch.mm_perdomain_pt) + i),
1525 __PAGE_HYPERVISOR);
1527 /* Linear mapping */
1528 sl2e[shadow_l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
1529 shadow_l2e_from_mfn(sl2mfn, __PAGE_HYPERVISOR);
1531 if ( shadow_mode_translate(v->domain) && !shadow_mode_external(v->domain) )
1533 // linear tables may not be used with translated PV guests
1534 sl2e[shadow_l2_table_offset(LINEAR_PT_VIRT_START)] =
1535 shadow_l2e_empty();
1537 else
1539 sl2e[shadow_l2_table_offset(LINEAR_PT_VIRT_START)] =
1540 shadow_l2e_from_mfn(gl2mfn, __PAGE_HYPERVISOR);
1543 if ( shadow_mode_translate(d) )
1545 /* install domain-specific P2M table */
1546 sl2e[shadow_l2_table_offset(RO_MPT_VIRT_START)] =
1547 shadow_l2e_from_mfn(pagetable_get_mfn(d->arch.phys_table),
1548 __PAGE_HYPERVISOR);
1551 sh_unmap_domain_page(sl2e);
1552 }
1553 #endif
1557 /**************************************************************************/
1558 /* Create a shadow of a given guest page.
1559 */
1560 static mfn_t
1561 sh_make_shadow(struct vcpu *v, mfn_t gmfn, u32 shadow_type)
1562 {
1563 mfn_t smfn = shadow_alloc(v->domain, shadow_type, mfn_x(gmfn));
1564 SHADOW_DEBUG(MAKE_SHADOW, "(%05lx, %u)=>%05lx\n",
1565 mfn_x(gmfn), shadow_type, mfn_x(smfn));
1567 if ( shadow_type != SH_type_l2_32_shadow
1568 && shadow_type != SH_type_l2_pae_shadow
1569 && shadow_type != SH_type_l2h_pae_shadow
1570 && shadow_type != SH_type_l4_64_shadow )
1571 /* Lower-level shadow, not yet linked from a higher level */
1572 mfn_to_shadow_page(smfn)->up = 0;
1574 #if GUEST_PAGING_LEVELS == 4
1575 #if (SHADOW_OPTIMIZATIONS & SHOPT_LINUX_L3_TOPLEVEL)
1576 if ( shadow_type == SH_type_l4_64_shadow &&
1577 unlikely(v->domain->arch.paging.shadow.opt_flags & SHOPT_LINUX_L3_TOPLEVEL) )
1579 /* We're shadowing a new l4, but we've been assuming the guest uses
1580 * only one l4 per vcpu and context switches using an l4 entry.
1581 * Count the number of active l4 shadows. If there are enough
1582 * of them, decide that this isn't an old linux guest, and stop
1583 * pinning l3es. This is not very quick but it doesn't happen
1584 * very often. */
1585 struct list_head *l, *t;
1586 struct shadow_page_info *sp;
1587 struct vcpu *v2;
1588 int l4count = 0, vcpus = 0;
1589 list_for_each(l, &v->domain->arch.paging.shadow.pinned_shadows)
1591 sp = list_entry(l, struct shadow_page_info, list);
1592 if ( sp->type == SH_type_l4_64_shadow )
1593 l4count++;
1595 for_each_vcpu ( v->domain, v2 )
1596 vcpus++;
1597 if ( l4count > 2 * vcpus )
1599 /* Unpin all the pinned l3 tables, and don't pin any more. */
1600 list_for_each_safe(l, t, &v->domain->arch.paging.shadow.pinned_shadows)
1602 sp = list_entry(l, struct shadow_page_info, list);
1603 if ( sp->type == SH_type_l3_64_shadow )
1604 sh_unpin(v, shadow_page_to_mfn(sp));
1606 v->domain->arch.paging.shadow.opt_flags &= ~SHOPT_LINUX_L3_TOPLEVEL;
1609 #endif
1610 #endif
1612 // Create the Xen mappings...
1613 if ( !shadow_mode_external(v->domain) )
1615 switch (shadow_type)
1617 #if CONFIG_PAGING_LEVELS == 4 && GUEST_PAGING_LEVELS == 4
1618 case SH_type_l4_shadow:
1619 sh_install_xen_entries_in_l4(v, gmfn, smfn); break;
1620 #endif
1621 #if CONFIG_PAGING_LEVELS >= 3 && GUEST_PAGING_LEVELS >= 3
1622 case SH_type_l2h_shadow:
1623 #ifdef CONFIG_COMPAT
1624 ASSERT( IS_COMPAT(v->domain) );
1625 #endif
1626 sh_install_xen_entries_in_l2h(v, smfn); break;
1627 #endif
1628 #if CONFIG_PAGING_LEVELS == 2 && GUEST_PAGING_LEVELS == 2
1629 case SH_type_l2_shadow:
1630 sh_install_xen_entries_in_l2(v, gmfn, smfn); break;
1631 #endif
1632 default: /* Do nothing */ break;
1636 shadow_promote(v, gmfn, shadow_type);
1637 set_shadow_status(v, gmfn, shadow_type, smfn);
1639 return smfn;
1640 }
1642 /* Make a splintered superpage shadow */
1643 static mfn_t
1644 make_fl1_shadow(struct vcpu *v, gfn_t gfn)
1645 {
1646 mfn_t smfn = shadow_alloc(v->domain, SH_type_fl1_shadow,
1647 (unsigned long) gfn_x(gfn));
1649 SHADOW_DEBUG(MAKE_SHADOW, "(%" SH_PRI_gfn ")=>%" PRI_mfn "\n",
1650 gfn_x(gfn), mfn_x(smfn));
1652 set_fl1_shadow_status(v, gfn, smfn);
1653 return smfn;
1654 }
1657 #if SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS
1658 mfn_t
1659 sh_make_monitor_table(struct vcpu *v)
1660 {
1661 struct domain *d = v->domain;
1663 ASSERT(pagetable_get_pfn(v->arch.monitor_table) == 0);
1665 /* Guarantee we can get the memory we need */
1666 shadow_prealloc(d, SHADOW_MAX_ORDER);
1668 #if CONFIG_PAGING_LEVELS == 4
1670 mfn_t m4mfn;
1671 m4mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1672 sh_install_xen_entries_in_l4(v, m4mfn, m4mfn);
1673 /* Remember the level of this table */
1674 mfn_to_page(m4mfn)->shadow_flags = 4;
1675 #if SHADOW_PAGING_LEVELS < 4
1676 // Install a monitor l3 table in slot 0 of the l4 table.
1677 // This is used for shadow linear maps.
1679 mfn_t m3mfn;
1680 l4_pgentry_t *l4e;
1681 m3mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1682 mfn_to_page(m3mfn)->shadow_flags = 3;
1683 l4e = sh_map_domain_page(m4mfn);
1684 l4e[0] = l4e_from_pfn(mfn_x(m3mfn), __PAGE_HYPERVISOR);
1685 sh_unmap_domain_page(l4e);
1686 if ( pv_32bit_guest(v) )
1688 // Install a monitor l2 table in slot 3 of the l3 table.
1689 // This is used for all Xen entries.
1690 mfn_t m2mfn;
1691 l3_pgentry_t *l3e;
1692 m2mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1693 mfn_to_page(m2mfn)->shadow_flags = 2;
1694 l3e = sh_map_domain_page(m3mfn);
1695 l3e[3] = l3e_from_pfn(mfn_x(m2mfn), _PAGE_PRESENT);
1696 sh_install_xen_entries_in_l2h(v, m2mfn);
1697 sh_unmap_domain_page(l3e);
1700 #endif /* SHADOW_PAGING_LEVELS < 4 */
1701 return m4mfn;
1704 #elif CONFIG_PAGING_LEVELS == 3
1707 mfn_t m3mfn, m2mfn;
1708 l3_pgentry_t *l3e;
1709 l2_pgentry_t *l2e;
1710 int i;
1712 m3mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1713 /* Remember the level of this table */
1714 mfn_to_page(m3mfn)->shadow_flags = 3;
1716 // Install a monitor l2 table in slot 3 of the l3 table.
1717 // This is used for all Xen entries, including linear maps
1718 m2mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1719 mfn_to_page(m2mfn)->shadow_flags = 2;
1720 l3e = sh_map_domain_page(m3mfn);
1721 l3e[3] = l3e_from_pfn(mfn_x(m2mfn), _PAGE_PRESENT);
1722 sh_install_xen_entries_in_l2h(v, m2mfn);
1723 /* Install the monitor's own linear map */
1724 l2e = sh_map_domain_page(m2mfn);
1725 for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
1726 l2e[l2_table_offset(LINEAR_PT_VIRT_START) + i] =
1727 (l3e_get_flags(l3e[i]) & _PAGE_PRESENT)
1728 ? l2e_from_pfn(l3e_get_pfn(l3e[i]), __PAGE_HYPERVISOR)
1729 : l2e_empty();
1730 sh_unmap_domain_page(l2e);
1731 sh_unmap_domain_page(l3e);
1733 SHADOW_PRINTK("new monitor table: %#lx\n", mfn_x(m3mfn));
1734 return m3mfn;
1737 #elif CONFIG_PAGING_LEVELS == 2
1740 mfn_t m2mfn;
1741 m2mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1742 sh_install_xen_entries_in_l2(v, m2mfn, m2mfn);
1743 /* Remember the level of this table */
1744 mfn_to_page(m2mfn)->shadow_flags = 2;
1745 return m2mfn;
1748 #else
1749 #error this should not happen
1750 #endif /* CONFIG_PAGING_LEVELS */
1752 #endif /* SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS */
1754 /**************************************************************************/
1755 /* These functions also take a virtual address and return the level-N
1756 * shadow table mfn and entry, but they create the shadow pagetables if
1757 * they are needed. The "demand" argument is non-zero when handling
1758 * a demand fault (so we know what to do about accessed bits &c).
1759 * If the necessary tables are not present in the guest, they return NULL. */
1761 /* N.B. The use of GUEST_PAGING_LEVELS here is correct. If the shadow has
1762 * more levels than the guest, the upper levels are always fixed and do not
1763 * reflect any information from the guest, so we do not use these functions
1764 * to access them. */
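/* For example, a 64-bit demand fault that needs a whole new shadow chain
 * goes shadow_get_and_create_l1e() -> _l2e() -> _l3e() -> _l4e(); each
 * level either finds an existing shadow with get_shadow_status() or
 * builds one with sh_make_shadow(), installs it in the parent entry with
 * shadow_set_lNe(), and then steps down through the shadow linear map. */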
1766 #if GUEST_PAGING_LEVELS >= 4
1767 static shadow_l4e_t * shadow_get_and_create_l4e(struct vcpu *v,
1768 walk_t *gw,
1769 mfn_t *sl4mfn)
1771 /* There is always a shadow of the top level table. Get it. */
1772 *sl4mfn = pagetable_get_mfn(v->arch.shadow_table[0]);
1773 /* Reading the top level table is always valid. */
1774 return sh_linear_l4_table(v) + shadow_l4_linear_offset(gw->va);
1777 static shadow_l3e_t * shadow_get_and_create_l3e(struct vcpu *v,
1778 walk_t *gw,
1779 mfn_t *sl3mfn,
1780 fetch_type_t ft)
1782 mfn_t sl4mfn;
1783 shadow_l4e_t *sl4e;
1784 if ( !mfn_valid(gw->l3mfn) ) return NULL; /* No guest page. */
1785 /* Get the l4e */
1786 sl4e = shadow_get_and_create_l4e(v, gw, &sl4mfn);
1787 ASSERT(sl4e != NULL);
1788 if ( shadow_l4e_get_flags(*sl4e) & _PAGE_PRESENT )
1790 *sl3mfn = shadow_l4e_get_mfn(*sl4e);
1791 ASSERT(mfn_valid(*sl3mfn));
1793 else
1795 int r;
1796 shadow_l4e_t new_sl4e;
1797 /* No l3 shadow installed: find and install it. */
1798 *sl3mfn = get_shadow_status(v, gw->l3mfn, SH_type_l3_shadow);
1799 if ( !mfn_valid(*sl3mfn) )
1801 /* No l3 shadow of this page exists at all: make one. */
1802 *sl3mfn = sh_make_shadow(v, gw->l3mfn, SH_type_l3_shadow);
1804 /* Install the new sl3 table in the sl4e */
1805 l4e_propagate_from_guest(v, gw->l4e, gw->l4mfn,
1806 *sl3mfn, &new_sl4e, ft);
1807 r = shadow_set_l4e(v, sl4e, new_sl4e, sl4mfn);
1808 ASSERT((r & SHADOW_SET_FLUSH) == 0);
1809 if ( r & SHADOW_SET_ERROR )
1810 return NULL;
1812 /* Now follow it down a level. Guaranteed to succeed. */
1813 return sh_linear_l3_table(v) + shadow_l3_linear_offset(gw->va);
1815 #endif /* GUEST_PAGING_LEVELS >= 4 */
1818 static shadow_l2e_t * shadow_get_and_create_l2e(struct vcpu *v,
1819 walk_t *gw,
1820 mfn_t *sl2mfn,
1821 fetch_type_t ft)
1823 #if GUEST_PAGING_LEVELS >= 4 /* 64bit... */
1824 mfn_t sl3mfn = _mfn(INVALID_MFN);
1825 shadow_l3e_t *sl3e;
1826 if ( !mfn_valid(gw->l2mfn) ) return NULL; /* No guest page. */
1827 /* Get the l3e */
1828 sl3e = shadow_get_and_create_l3e(v, gw, &sl3mfn, ft);
1829 if ( sl3e == NULL ) return NULL;
1830 if ( shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT )
1832 *sl2mfn = shadow_l3e_get_mfn(*sl3e);
1833 ASSERT(mfn_valid(*sl2mfn));
1835 else
1837 int r;
1838 shadow_l3e_t new_sl3e;
1839 unsigned int t = SH_type_l2_shadow;
1841 #ifdef CONFIG_COMPAT
1842 /* Tag compat L2 containing hypervisor (m2p) mappings */
1843 if ( IS_COMPAT(v->domain) &&
1844 guest_l4_table_offset(gw->va) == 0 &&
1845 guest_l3_table_offset(gw->va) == 3 )
1846 t = SH_type_l2h_shadow;
1847 #endif
1848 /* No l2 shadow installed: find and install it. */
1849 *sl2mfn = get_shadow_status(v, gw->l2mfn, t);
1850 if ( !mfn_valid(*sl2mfn) )
1852 /* No l2 shadow of this page exists at all: make one. */
1853 *sl2mfn = sh_make_shadow(v, gw->l2mfn, t);
1855 /* Install the new sl2 table in the sl3e */
1856 l3e_propagate_from_guest(v, gw->l3e, gw->l3mfn,
1857 *sl2mfn, &new_sl3e, ft);
1858 r = shadow_set_l3e(v, sl3e, new_sl3e, sl3mfn);
1859 ASSERT((r & SHADOW_SET_FLUSH) == 0);
1860 if ( r & SHADOW_SET_ERROR )
1861 return NULL;
1863 /* Now follow it down a level. Guaranteed to succeed. */
1864 return sh_linear_l2_table(v) + shadow_l2_linear_offset(gw->va);
1865 #elif GUEST_PAGING_LEVELS == 3 /* PAE... */
1866 /* We never demand-shadow PAE l3es: they are only created in
1867 * sh_update_cr3(). Check if the relevant sl3e is present. */
1868 shadow_l3e_t *sl3e = ((shadow_l3e_t *)&v->arch.paging.shadow.l3table)
1869 + shadow_l3_linear_offset(gw->va);
1870 if ( !(shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT) )
1871 return NULL;
1872 *sl2mfn = shadow_l3e_get_mfn(*sl3e);
1873 ASSERT(mfn_valid(*sl2mfn));
1874 return sh_linear_l2_table(v) + shadow_l2_linear_offset(gw->va);
1875 #else /* 32bit... */
1876 /* There is always a shadow of the top level table. Get it. */
1877 *sl2mfn = pagetable_get_mfn(v->arch.shadow_table[0]);
1878 /* This next line is important: the guest l2 has a 16k
1879 * shadow; we need to return the right mfn of the four. This
1880 * call will set it for us as a side-effect. */
1881 (void) shadow_l2_index(sl2mfn, guest_index(gw->l2e));
1882 /* Reading the top level table is always valid. */
1883 return sh_linear_l2_table(v) + shadow_l2_linear_offset(gw->va);
1884 #endif
1888 static shadow_l1e_t * shadow_get_and_create_l1e(struct vcpu *v,
1889 walk_t *gw,
1890 mfn_t *sl1mfn,
1891 fetch_type_t ft)
1893 mfn_t sl2mfn;
1894 shadow_l2e_t *sl2e;
1896 /* Get the l2e */
1897 sl2e = shadow_get_and_create_l2e(v, gw, &sl2mfn, ft);
1898 if ( sl2e == NULL ) return NULL;
1899 /* Install the sl1 in the l2e if it wasn't there or if we need to
1900 * re-do it to fix a PSE dirty bit. */
1901 if ( shadow_l2e_get_flags(*sl2e) & _PAGE_PRESENT
1902 && likely(ft != ft_demand_write
1903 || (guest_l2e_get_flags(*gw->l2e) & _PAGE_DIRTY)
1904 || !(guest_l2e_get_flags(*gw->l2e) & _PAGE_PSE)) )
1906 *sl1mfn = shadow_l2e_get_mfn(*sl2e);
1907 ASSERT(mfn_valid(*sl1mfn));
1909 else
1911 shadow_l2e_t new_sl2e;
1912 int r, flags = guest_l2e_get_flags(*gw->l2e);
1913 /* No l1 shadow installed: find and install it. */
1914 if ( !(flags & _PAGE_PRESENT) )
1915 return NULL; /* No guest page. */
1916 if ( guest_supports_superpages(v) && (flags & _PAGE_PSE) )
1918 /* Splintering a superpage */
1919 gfn_t l2gfn = guest_l2e_get_gfn(*gw->l2e);
1920 *sl1mfn = get_fl1_shadow_status(v, l2gfn);
1921 if ( !mfn_valid(*sl1mfn) )
1923 /* No fl1 shadow of this superpage exists at all: make one. */
1924 *sl1mfn = make_fl1_shadow(v, l2gfn);
1927 else
1929 /* Shadowing an actual guest l1 table */
1930 if ( !mfn_valid(gw->l2mfn) ) return NULL; /* No guest page. */
1931 *sl1mfn = get_shadow_status(v, gw->l1mfn, SH_type_l1_shadow);
1932 if ( !mfn_valid(*sl1mfn) )
1934 /* No l1 shadow of this page exists at all: make one. */
1935 *sl1mfn = sh_make_shadow(v, gw->l1mfn, SH_type_l1_shadow);
1938 /* Install the new sl1 table in the sl2e */
1939 l2e_propagate_from_guest(v, gw->l2e, gw->l2mfn,
1940 *sl1mfn, &new_sl2e, ft);
1941 r = shadow_set_l2e(v, sl2e, new_sl2e, sl2mfn);
1942 ASSERT((r & SHADOW_SET_FLUSH) == 0);
1943 if ( r & SHADOW_SET_ERROR )
1944 return NULL;
1945 /* This next line is important: in 32-on-PAE and 32-on-64 modes,
1946 * the guest l1 table has an 8k shadow, and we need to return
1947 * the right mfn of the pair. This call will set it for us as a
1948 * side-effect. (In all other cases, it's a no-op and will be
1949 * compiled out.) */
1950 (void) shadow_l1_index(sl1mfn, guest_l1_table_offset(gw->va));
1952 /* Now follow it down a level. Guaranteed to succeed. */
1953 return sh_linear_l1_table(v) + shadow_l1_linear_offset(gw->va);
1958 /**************************************************************************/
1959 /* Destructors for shadow tables:
1960 * Unregister the shadow, decrement refcounts of any entries present in it,
1961 * and release the memory.
1963 * N.B. These destructors do not clear the contents of the shadows.
1964 * This allows us to delay TLB shootdowns until the page is being reused.
1965 * See shadow_alloc() and shadow_free() for how this is handled.
1966 */
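/* The destructors below all follow broadly the same pattern: look up the
 * shadow's type and backpointer, remove it from the hash
 * (delete_shadow_status(), or delete_fl1_shadow_status() for fl1s),
 * demote the guest page, drop one reference per present entry in the
 * shadow, and hand the page back to the pool with shadow_free(). */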
1968 #if GUEST_PAGING_LEVELS >= 4
1969 void sh_destroy_l4_shadow(struct vcpu *v, mfn_t smfn)
1971 shadow_l4e_t *sl4e;
1972 u32 t = mfn_to_shadow_page(smfn)->type;
1973 mfn_t gmfn, sl4mfn;
1975 SHADOW_DEBUG(DESTROY_SHADOW,
1976 "%s(%05lx)\n", __func__, mfn_x(smfn));
1977 ASSERT(t == SH_type_l4_shadow);
1979 /* Record that the guest page isn't shadowed any more (in this type) */
1980 gmfn = _mfn(mfn_to_shadow_page(smfn)->backpointer);
1981 delete_shadow_status(v, gmfn, t, smfn);
1982 shadow_demote(v, gmfn, t);
1983 /* Decrement refcounts of all the old entries */
1984 sl4mfn = smfn;
1985 SHADOW_FOREACH_L4E(sl4mfn, sl4e, 0, 0, v->domain, {
1986 if ( shadow_l4e_get_flags(*sl4e) & _PAGE_PRESENT )
1988 sh_put_ref(v, shadow_l4e_get_mfn(*sl4e),
1989 (((paddr_t)mfn_x(sl4mfn)) << PAGE_SHIFT)
1990 | ((unsigned long)sl4e & ~PAGE_MASK));
1992 });
1994 /* Put the memory back in the pool */
1995 shadow_free(v->domain, smfn);
1998 void sh_destroy_l3_shadow(struct vcpu *v, mfn_t smfn)
2000 shadow_l3e_t *sl3e;
2001 u32 t = mfn_to_shadow_page(smfn)->type;
2002 mfn_t gmfn, sl3mfn;
2004 SHADOW_DEBUG(DESTROY_SHADOW,
2005 "%s(%05lx)\n", __func__, mfn_x(smfn));
2006 ASSERT(t == SH_type_l3_shadow);
2008 /* Record that the guest page isn't shadowed any more (in this type) */
2009 gmfn = _mfn(mfn_to_shadow_page(smfn)->backpointer);
2010 delete_shadow_status(v, gmfn, t, smfn);
2011 shadow_demote(v, gmfn, t);
2013 /* Decrement refcounts of all the old entries */
2014 sl3mfn = smfn;
2015 SHADOW_FOREACH_L3E(sl3mfn, sl3e, 0, 0, {
2016 if ( shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT )
2017 sh_put_ref(v, shadow_l3e_get_mfn(*sl3e),
2018 (((paddr_t)mfn_x(sl3mfn)) << PAGE_SHIFT)
2019 | ((unsigned long)sl3e & ~PAGE_MASK));
2020 });
2022 /* Put the memory back in the pool */
2023 shadow_free(v->domain, smfn);
2025 #endif /* GUEST_PAGING_LEVELS >= 4 */
2028 void sh_destroy_l2_shadow(struct vcpu *v, mfn_t smfn)
2030 shadow_l2e_t *sl2e;
2031 u32 t = mfn_to_shadow_page(smfn)->type;
2032 mfn_t gmfn, sl2mfn;
2034 SHADOW_DEBUG(DESTROY_SHADOW,
2035 "%s(%05lx)\n", __func__, mfn_x(smfn));
2037 #if GUEST_PAGING_LEVELS >= 3
2038 ASSERT(t == SH_type_l2_shadow || t == SH_type_l2h_shadow);
2039 #else
2040 ASSERT(t == SH_type_l2_shadow);
2041 #endif
2043 /* Record that the guest page isn't shadowed any more (in this type) */
2044 gmfn = _mfn(mfn_to_shadow_page(smfn)->backpointer);
2045 delete_shadow_status(v, gmfn, t, smfn);
2046 shadow_demote(v, gmfn, t);
2048 /* Decrement refcounts of all the old entries */
2049 sl2mfn = smfn;
2050 SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, 0, v->domain, {
2051 if ( shadow_l2e_get_flags(*sl2e) & _PAGE_PRESENT )
2052 sh_put_ref(v, shadow_l2e_get_mfn(*sl2e),
2053 (((paddr_t)mfn_x(sl2mfn)) << PAGE_SHIFT)
2054 | ((unsigned long)sl2e & ~PAGE_MASK));
2055 });
2057 /* Put the memory back in the pool */
2058 shadow_free(v->domain, smfn);
2061 void sh_destroy_l1_shadow(struct vcpu *v, mfn_t smfn)
2063 struct domain *d = v->domain;
2064 shadow_l1e_t *sl1e;
2065 u32 t = mfn_to_shadow_page(smfn)->type;
2067 SHADOW_DEBUG(DESTROY_SHADOW,
2068 "%s(%05lx)\n", __func__, mfn_x(smfn));
2069 ASSERT(t == SH_type_l1_shadow || t == SH_type_fl1_shadow);
2071 /* Record that the guest page isn't shadowed any more (in this type) */
2072 if ( t == SH_type_fl1_shadow )
2074 gfn_t gfn = _gfn(mfn_to_shadow_page(smfn)->backpointer);
2075 delete_fl1_shadow_status(v, gfn, smfn);
2077 else
2079 mfn_t gmfn = _mfn(mfn_to_shadow_page(smfn)->backpointer);
2080 delete_shadow_status(v, gmfn, t, smfn);
2081 shadow_demote(v, gmfn, t);
2084 if ( shadow_mode_refcounts(d) )
2086 /* Decrement refcounts of all the old entries */
2087 mfn_t sl1mfn = smfn;
2088 SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, 0, {
2089 if ( (shadow_l1e_get_flags(*sl1e) & _PAGE_PRESENT)
2090 && !sh_l1e_is_magic(*sl1e) )
2091 shadow_put_page_from_l1e(*sl1e, d);
2092 });
2095 /* Put the memory back in the pool */
2096 shadow_free(v->domain, smfn);
2099 #if SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS
2100 void sh_destroy_monitor_table(struct vcpu *v, mfn_t mmfn)
2102 struct domain *d = v->domain;
2103 ASSERT(mfn_to_shadow_page(mmfn)->type == SH_type_monitor_table);
2105 #if (CONFIG_PAGING_LEVELS == 4) && (SHADOW_PAGING_LEVELS != 4)
2106 /* Need to destroy the l3 monitor page in slot 0 too */
2108 mfn_t m3mfn;
2109 l4_pgentry_t *l4e = sh_map_domain_page(mmfn);
2110 ASSERT(l4e_get_flags(l4e[0]) & _PAGE_PRESENT);
2111 m3mfn = _mfn(l4e_get_pfn(l4e[0]));
2112 if ( pv_32bit_guest(v) )
2114 /* Need to destroy the l2 monitor page in slot 3 too */
2115 l3_pgentry_t *l3e = sh_map_domain_page(m3mfn);
2116 ASSERT(l3e_get_flags(l3e[3]) & _PAGE_PRESENT);
2117 shadow_free(d, _mfn(l3e_get_pfn(l3e[3])));
2118 sh_unmap_domain_page(l3e);
2120 shadow_free(d, m3mfn);
2121 sh_unmap_domain_page(l4e);
2123 #elif CONFIG_PAGING_LEVELS == 3
2124 /* Need to destroy the l2 monitor page in slot 3 too */
2126 l3_pgentry_t *l3e = sh_map_domain_page(mmfn);
2127 ASSERT(l3e_get_flags(l3e[3]) & _PAGE_PRESENT);
2128 shadow_free(d, _mfn(l3e_get_pfn(l3e[3])));
2129 sh_unmap_domain_page(l3e);
2131 #endif
2133 /* Put the memory back in the pool */
2134 shadow_free(d, mmfn);
2136 #endif
2138 /**************************************************************************/
2139 /* Functions to destroy non-Xen mappings in a pagetable hierarchy.
2140 * These are called from common code when we are running out of shadow
2141 * memory, and unpinning all the top-level shadows hasn't worked.
2143 * This implementation is pretty crude and slow, but we hope that it won't
2144 * be called very often. */
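/* "Unhooking" here just means writing empty entries into the top-level
 * shadow with the shadow_set_lNe() functions; that drops the references
 * to the lower-level shadows so the normal refcounting can release their
 * memory back to the pool. */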
2146 #if GUEST_PAGING_LEVELS == 2
2148 void sh_unhook_32b_mappings(struct vcpu *v, mfn_t sl2mfn)
2150 shadow_l2e_t *sl2e;
2151 SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, 0, v->domain, {
2152 (void) shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn);
2153 });
2156 #elif GUEST_PAGING_LEVELS == 3
2158 void sh_unhook_pae_mappings(struct vcpu *v, mfn_t sl2mfn)
2159 /* Walk a PAE l2 shadow, unhooking entries from all the subshadows */
2161 shadow_l2e_t *sl2e;
2162 SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, 0, v->domain, {
2163 (void) shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn);
2164 });
2167 #elif GUEST_PAGING_LEVELS == 4
2169 void sh_unhook_64b_mappings(struct vcpu *v, mfn_t sl4mfn)
2171 shadow_l4e_t *sl4e;
2172 SHADOW_FOREACH_L4E(sl4mfn, sl4e, 0, 0, v->domain, {
2173 (void) shadow_set_l4e(v, sl4e, shadow_l4e_empty(), sl4mfn);
2174 });
2177 #endif
2179 /**************************************************************************/
2180 /* Internal translation functions.
2181 * These functions require a pointer to the shadow entry that will be updated.
2182 */
2184 /* These functions take a new guest entry, translate it to shadow and write
2185 * the shadow entry.
2187 * They return the same bitmaps as the shadow_set_lXe() functions.
2188 */
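/* In outline, each validate_glNe() below takes the guest entry that has
 * just been written, looks up (but does not create) the shadow of the
 * page it points at with get_shadow_status() -- or, for l1es, the mfn of
 * the target frame -- folds the flags together with
 * lNe_propagate_from_guest(), and writes the result with shadow_set_lNe().
 * Writes landing in Xen-reserved slots of PV pagetables are rejected
 * instead of being reflected into the shadow. */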
2190 #if GUEST_PAGING_LEVELS >= 4
2191 static int validate_gl4e(struct vcpu *v, void *new_ge, mfn_t sl4mfn, void *se)
2193 shadow_l4e_t new_sl4e;
2194 guest_l4e_t *new_gl4e = new_ge;
2195 shadow_l4e_t *sl4p = se;
2196 mfn_t sl3mfn = _mfn(INVALID_MFN);
2197 int result = 0;
2199 perfc_incrc(shadow_validate_gl4e_calls);
2201 if ( guest_l4e_get_flags(*new_gl4e) & _PAGE_PRESENT )
2203 gfn_t gl3gfn = guest_l4e_get_gfn(*new_gl4e);
2204 mfn_t gl3mfn = vcpu_gfn_to_mfn(v, gl3gfn);
2205 if ( mfn_valid(gl3mfn) )
2206 sl3mfn = get_shadow_status(v, gl3mfn, SH_type_l3_shadow);
2207 else
2208 result |= SHADOW_SET_ERROR;
2210 l4e_propagate_from_guest(v, new_gl4e, _mfn(INVALID_MFN),
2211 sl3mfn, &new_sl4e, ft_prefetch);
2213 // check for updates to xen reserved slots
2214 if ( !shadow_mode_external(v->domain) )
2216 int shadow_index = (((unsigned long)sl4p & ~PAGE_MASK) /
2217 sizeof(shadow_l4e_t));
2218 int reserved_xen_slot = !is_guest_l4_slot(v->domain, shadow_index);
2220 if ( unlikely(reserved_xen_slot) )
2222 // attempt by the guest to write to a xen reserved slot
2223 //
2224 SHADOW_PRINTK("%s out-of-range update "
2225 "sl4mfn=%05lx index=0x%x val=%" SH_PRI_pte "\n",
2226 __func__, mfn_x(sl4mfn), shadow_index, new_sl4e.l4);
2227 if ( shadow_l4e_get_flags(new_sl4e) & _PAGE_PRESENT )
2229 SHADOW_ERROR("out-of-range l4e update\n");
2230 result |= SHADOW_SET_ERROR;
2233 // do not call shadow_set_l4e...
2234 return result;
2238 result |= shadow_set_l4e(v, sl4p, new_sl4e, sl4mfn);
2239 return result;
2243 static int validate_gl3e(struct vcpu *v, void *new_ge, mfn_t sl3mfn, void *se)
2245 shadow_l3e_t new_sl3e;
2246 guest_l3e_t *new_gl3e = new_ge;
2247 shadow_l3e_t *sl3p = se;
2248 mfn_t sl2mfn = _mfn(INVALID_MFN);
2249 int result = 0;
2251 perfc_incrc(shadow_validate_gl3e_calls);
2253 if ( guest_l3e_get_flags(*new_gl3e) & _PAGE_PRESENT )
2255 gfn_t gl2gfn = guest_l3e_get_gfn(*new_gl3e);
2256 mfn_t gl2mfn = vcpu_gfn_to_mfn(v, gl2gfn);
2257 if ( mfn_valid(gl2mfn) )
2258 sl2mfn = get_shadow_status(v, gl2mfn, SH_type_l2_shadow);
2259 else
2260 result |= SHADOW_SET_ERROR;
2262 l3e_propagate_from_guest(v, new_gl3e, _mfn(INVALID_MFN),
2263 sl2mfn, &new_sl3e, ft_prefetch);
2264 result |= shadow_set_l3e(v, sl3p, new_sl3e, sl3mfn);
2266 return result;
2268 #endif // GUEST_PAGING_LEVELS >= 4
2270 static int validate_gl2e(struct vcpu *v, void *new_ge, mfn_t sl2mfn, void *se)
2272 shadow_l2e_t new_sl2e;
2273 guest_l2e_t *new_gl2e = new_ge;
2274 shadow_l2e_t *sl2p = se;
2275 mfn_t sl1mfn = _mfn(INVALID_MFN);
2276 int result = 0;
2278 perfc_incrc(shadow_validate_gl2e_calls);
2280 if ( guest_l2e_get_flags(*new_gl2e) & _PAGE_PRESENT )
2282 gfn_t gl1gfn = guest_l2e_get_gfn(*new_gl2e);
2283 if ( guest_supports_superpages(v) &&
2284 (guest_l2e_get_flags(*new_gl2e) & _PAGE_PSE) )
2286 // superpage -- need to look up the shadow L1 which holds the
2287 // splitters...
2288 sl1mfn = get_fl1_shadow_status(v, gl1gfn);
2289 #if 0
2290 // XXX - it's possible that we want to do some kind of prefetch
2291 // for superpage fl1's here, but this is *not* on the demand path,
2292 // so we'll hold off trying that for now...
2293 //
2294 if ( !mfn_valid(sl1mfn) )
2295 sl1mfn = make_fl1_shadow(v, gl1gfn);
2296 #endif
2298 else
2300 mfn_t gl1mfn = vcpu_gfn_to_mfn(v, gl1gfn);
2301 if ( mfn_valid(gl1mfn) )
2302 sl1mfn = get_shadow_status(v, gl1mfn, SH_type_l1_shadow);
2303 else
2304 result |= SHADOW_SET_ERROR;
2307 l2e_propagate_from_guest(v, new_gl2e, _mfn(INVALID_MFN),
2308 sl1mfn, &new_sl2e, ft_prefetch);
2310 // check for updates to xen reserved slots in PV guests...
2311 // XXX -- need to revisit this for PV 3-on-4 guests.
2312 //
2313 #if SHADOW_PAGING_LEVELS < 4
2314 #if CONFIG_PAGING_LEVELS == SHADOW_PAGING_LEVELS
2315 if ( !shadow_mode_external(v->domain) )
2317 int shadow_index = (((unsigned long)sl2p & ~PAGE_MASK) /
2318 sizeof(shadow_l2e_t));
2319 int reserved_xen_slot;
2321 #if SHADOW_PAGING_LEVELS == 3
2322 reserved_xen_slot =
2323 ((mfn_to_shadow_page(sl2mfn)->type == SH_type_l2h_pae_shadow) &&
2324 (shadow_index
2325 >= (L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1))));
2326 #else /* SHADOW_PAGING_LEVELS == 2 */
2327 reserved_xen_slot = (shadow_index >= L2_PAGETABLE_FIRST_XEN_SLOT);
2328 #endif
2330 if ( unlikely(reserved_xen_slot) )
2332 // attempt by the guest to write to a xen reserved slot
2333 //
2334 SHADOW_PRINTK("%s out-of-range update "
2335 "sl2mfn=%05lx index=0x%x val=%" SH_PRI_pte "\n",
2336 __func__, mfn_x(sl2mfn), shadow_index, new_sl2e.l2);
2337 if ( shadow_l2e_get_flags(new_sl2e) & _PAGE_PRESENT )
2339 SHADOW_ERROR("out-of-range l2e update\n");
2340 result |= SHADOW_SET_ERROR;
2343 // do not call shadow_set_l2e...
2344 return result;
2347 #endif /* CONFIG_PAGING_LEVELS == SHADOW_PAGING_LEVELS */
2348 #endif /* SHADOW_PAGING_LEVELS < 4 */
2350 result |= shadow_set_l2e(v, sl2p, new_sl2e, sl2mfn);
2352 return result;
2355 static int validate_gl1e(struct vcpu *v, void *new_ge, mfn_t sl1mfn, void *se)
2357 shadow_l1e_t new_sl1e;
2358 guest_l1e_t *new_gl1e = new_ge;
2359 shadow_l1e_t *sl1p = se;
2360 gfn_t gfn;
2361 mfn_t gmfn;
2362 int result = 0, mmio;
2364 perfc_incrc(shadow_validate_gl1e_calls);
2366 gfn = guest_l1e_get_gfn(*new_gl1e);
2367 gmfn = vcpu_gfn_to_mfn(v, gfn);
2369 mmio = (is_hvm_vcpu(v) && paging_vcpu_mode_translate(v) &&
2370 mmio_space(gfn_to_paddr(gfn)));
2371 l1e_propagate_from_guest(v, new_gl1e, _mfn(INVALID_MFN), gmfn, &new_sl1e,
2372 ft_prefetch, mmio);
2374 result |= shadow_set_l1e(v, sl1p, new_sl1e, sl1mfn);
2375 return result;
2379 /**************************************************************************/
2380 /* Functions which translate and install the shadows of arbitrary guest
2381 * entries that we have just seen the guest write. */
2384 static inline int
2385 sh_map_and_validate(struct vcpu *v, mfn_t gmfn,
2386 void *new_gp, u32 size, u32 sh_type,
2387 u32 (*shadow_index)(mfn_t *smfn, u32 idx),
2388 int (*validate_ge)(struct vcpu *v, void *ge,
2389 mfn_t smfn, void *se))
2390 /* Generic function for mapping and validating. */
2392 mfn_t smfn, smfn2, map_mfn;
2393 shadow_l1e_t *sl1p;
2394 u32 shadow_idx, guest_idx;
2395 int result = 0;
2397 /* Align address and size to guest entry boundaries */
2398 size += (unsigned long)new_gp & (sizeof (guest_l1e_t) - 1);
2399 new_gp = (void *)((unsigned long)new_gp & ~(sizeof (guest_l1e_t) - 1));
2400 size = (size + sizeof (guest_l1e_t) - 1) & ~(sizeof (guest_l1e_t) - 1);
2401 ASSERT(size + (((unsigned long)new_gp) & ~PAGE_MASK) <= PAGE_SIZE);
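/* e.g. with 8-byte guest entries, a 4-byte write at page offset 0x1234
 * becomes new_gp = 0x1230 and size = 8, so the single entry containing
 * the partial write is re-validated. */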
2403 /* Map the shadow page */
2404 smfn = get_shadow_status(v, gmfn, sh_type);
2405 ASSERT(mfn_valid(smfn)); /* Otherwise we would not have been called */
2406 guest_idx = guest_index(new_gp);
2407 map_mfn = smfn;
2408 shadow_idx = shadow_index(&map_mfn, guest_idx);
2409 sl1p = map_shadow_page(map_mfn);
2411 /* Validate one entry at a time */
2412 while ( size )
2414 smfn2 = smfn;
2415 guest_idx = guest_index(new_gp);
2416 shadow_idx = shadow_index(&smfn2, guest_idx);
2417 if ( mfn_x(smfn2) != mfn_x(map_mfn) )
2419 /* We have moved to another page of the shadow */
2420 map_mfn = smfn2;
2421 unmap_shadow_page(sl1p);
2422 sl1p = map_shadow_page(map_mfn);
2424 result |= validate_ge(v,
2425 new_gp,
2426 map_mfn,
2427 &sl1p[shadow_idx]);
2428 size -= sizeof(guest_l1e_t);
2429 new_gp += sizeof(guest_l1e_t);
2431 unmap_shadow_page(sl1p);
2432 return result;
2436 int
2437 sh_map_and_validate_gl4e(struct vcpu *v, mfn_t gl4mfn,
2438 void *new_gl4p, u32 size)
2440 #if GUEST_PAGING_LEVELS >= 4
2441 return sh_map_and_validate(v, gl4mfn, new_gl4p, size,
2442 SH_type_l4_shadow,
2443 shadow_l4_index,
2444 validate_gl4e);
2445 #else // ! GUEST_PAGING_LEVELS >= 4
2446 SHADOW_PRINTK("called in wrong paging mode!\n");
2447 BUG();
2448 return 0;
2449 #endif
2452 int
2453 sh_map_and_validate_gl3e(struct vcpu *v, mfn_t gl3mfn,
2454 void *new_gl3p, u32 size)
2456 #if GUEST_PAGING_LEVELS >= 4
2457 return sh_map_and_validate(v, gl3mfn, new_gl3p, size,
2458 SH_type_l3_shadow,
2459 shadow_l3_index,
2460 validate_gl3e);
2461 #else // ! GUEST_PAGING_LEVELS >= 4
2462 SHADOW_PRINTK("called in wrong paging mode!\n");
2463 BUG();
2464 return 0;
2465 #endif
2468 int
2469 sh_map_and_validate_gl2e(struct vcpu *v, mfn_t gl2mfn,
2470 void *new_gl2p, u32 size)
2472 return sh_map_and_validate(v, gl2mfn, new_gl2p, size,
2473 SH_type_l2_shadow,
2474 shadow_l2_index,
2475 validate_gl2e);
2478 int
2479 sh_map_and_validate_gl2he(struct vcpu *v, mfn_t gl2mfn,
2480 void *new_gl2p, u32 size)
2482 #if GUEST_PAGING_LEVELS >= 3
2483 return sh_map_and_validate(v, gl2mfn, new_gl2p, size,
2484 SH_type_l2h_shadow,
2485 shadow_l2_index,
2486 validate_gl2e);
2487 #else /* Non-PAE guests don't have different kinds of l2 table */
2488 SHADOW_PRINTK("called in wrong paging mode!\n");
2489 BUG();
2490 return 0;
2491 #endif
2494 int
2495 sh_map_and_validate_gl1e(struct vcpu *v, mfn_t gl1mfn,
2496 void *new_gl1p, u32 size)
2498 return sh_map_and_validate(v, gl1mfn, new_gl1p, size,
2499 SH_type_l1_shadow,
2500 shadow_l1_index,
2501 validate_gl1e);
2505 /**************************************************************************/
2506 /* Optimization: If we see two emulated writes of zeros to the same
2507 * page-table without another kind of page fault in between, we guess
2508 * that this is a batch of changes (for process destruction) and
2509 * unshadow the page so we don't take a pagefault on every entry. This
2510 * should also make finding writeable mappings of pagetables much
2511 * easier. */
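/* For example, a guest zeroing an l1 page it is about to free generates a
 * long run of emulated zero writes to the same mfn; the second write in a
 * row triggers the unshadow below, so the rest of the page can be cleared
 * without faulting.  (Pages carrying l2/l4-type shadows are skipped by the
 * flags check below.) */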
2513 /* Look to see if this is the second emulated write in a row to this
2514 * page, and unshadow/unhook if it is */
2515 static inline void check_for_early_unshadow(struct vcpu *v, mfn_t gmfn)
2517 #if SHADOW_OPTIMIZATIONS & SHOPT_EARLY_UNSHADOW
2518 if ( v->arch.paging.shadow.last_emulated_mfn == mfn_x(gmfn) &&
2519 sh_mfn_is_a_page_table(gmfn) )
2521 u32 flags = mfn_to_page(gmfn)->shadow_flags;
2522 if ( !(flags & (SHF_L2_32|SHF_L2_PAE|SHF_L2H_PAE|SHF_L4_64)) )
2524 perfc_incrc(shadow_early_unshadow);
2525 sh_remove_shadows(v, gmfn, 1, 0 /* Fast, can fail to unshadow */ );
2528 v->arch.paging.shadow.last_emulated_mfn = mfn_x(gmfn);
2529 #endif
2532 /* Stop counting towards early unshadows, as we've seen a real page fault */
2533 static inline void reset_early_unshadow(struct vcpu *v)
2535 #if SHADOW_OPTIMIZATIONS & SHOPT_EARLY_UNSHADOW
2536 v->arch.paging.shadow.last_emulated_mfn = INVALID_MFN;
2537 #endif
2542 /**************************************************************************/
2543 /* Optimization: Prefetch multiple L1 entries. This is called after we have
2544 * demand-faulted a shadow l1e in the fault handler, to see if it's
2545 * worth fetching some more.
2546 */
2548 #if SHADOW_OPTIMIZATIONS & SHOPT_PREFETCH
2550 /* XXX magic number */
2551 #define PREFETCH_DISTANCE 32
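/* Since the loop below starts at i = 1, this propagates at most 31 extra
 * entries beyond the one that actually faulted, and never walks past the
 * end of the shadow l1 page. */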
2553 static void sh_prefetch(struct vcpu *v, walk_t *gw,
2554 shadow_l1e_t *ptr_sl1e, mfn_t sl1mfn)
2556 int i, dist, mmio;
2557 gfn_t gfn;
2558 mfn_t gmfn;
2559 guest_l1e_t gl1e;
2560 shadow_l1e_t sl1e;
2561 u32 gflags;
2563 /* Prefetch no further than the end of the _shadow_ l1 MFN */
2564 dist = (PAGE_SIZE - ((unsigned long)ptr_sl1e & ~PAGE_MASK)) / sizeof sl1e;
2565 /* And no more than a maximum fetches-per-fault */
2566 if ( dist > PREFETCH_DISTANCE )
2567 dist = PREFETCH_DISTANCE;
2569 for ( i = 1; i < dist ; i++ )
2571 /* No point in prefetching if there's already a shadow */
2572 if ( ptr_sl1e[i].l1 != 0 )
2573 break;
2575 if ( gw->l1e )
2577 /* Normal guest page; grab the next guest entry */
2578 gl1e = gw->l1e[i];
2579 /* Not worth continuing if we hit an entry that will need another
2580 * fault for A/D-bit propagation anyway */
2581 gflags = guest_l1e_get_flags(gl1e);
2582 if ( (gflags & _PAGE_PRESENT)
2583 && (!(gflags & _PAGE_ACCESSED)
2584 || ((gflags & _PAGE_RW) && !(gflags & _PAGE_DIRTY))) )
2585 break;
2587 else
2589 /* Fragmented superpage, unless we've been called wrongly */
2590 ASSERT(guest_l2e_get_flags(*gw->l2e) & _PAGE_PSE);
2591 /* Increment the l1e's GFN by the right number of guest pages */
2592 gl1e = guest_l1e_from_gfn(
2593 _gfn(gfn_x(guest_l1e_get_gfn(gw->eff_l1e)) + i),
2594 guest_l1e_get_flags(gw->eff_l1e));
2597 /* Look at the gfn that the l1e is pointing at */
2598 gfn = guest_l1e_get_gfn(gl1e);
2599 gmfn = vcpu_gfn_to_mfn(v, gfn);
2600 mmio = ( is_hvm_vcpu(v)
2601 && paging_vcpu_mode_translate(v)
2602 && mmio_space(gfn_to_paddr(gfn)) );
2604 /* Propagate the entry. Safe to use a pointer to our local
2605 * gl1e, since this is not a demand-fetch so there will be no
2606 * write-back to the guest. */
2607 l1e_propagate_from_guest(v, &gl1e, _mfn(INVALID_MFN),
2608 gmfn, &sl1e, ft_prefetch, mmio);
2609 (void) shadow_set_l1e(v, ptr_sl1e + i, sl1e, sl1mfn);
2613 #endif /* SHADOW_OPTIMIZATIONS & SHOPT_PREFETCH */
2616 /**************************************************************************/
2617 /* Entry points into the shadow code */
2619 /* Called from pagefault handler in Xen, and from the HVM trap handlers
2620 * for pagefaults. Returns 1 if this fault was an artefact of the
2621 * shadow code (and the guest should retry) or 0 if it is not (and the
2622 * fault should be handled elsewhere or passed to the guest). */
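/* In outline: (1) fast path -- reserved-bit faults can only come from the
 * "magic" shadow l1es, so not-present and MMIO cases are handled without
 * walking the guest tables; (2) walk the guest tables and bail out for
 * faults the guest must handle itself (not-present, user access to
 * supervisor pages, writes to read-only mappings, NX violations);
 * (3) build or fetch the shadow l1e for the faulting address and write it,
 * optionally prefetching its neighbours; (4) emulate accesses to pages
 * that are themselves pagetables; (5) hand MMIO addresses to handle_mmio(). */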
2624 static int sh_page_fault(struct vcpu *v,
2625 unsigned long va,
2626 struct cpu_user_regs *regs)
2628 struct domain *d = v->domain;
2629 walk_t gw;
2630 u32 accumulated_gflags;
2631 gfn_t gfn;
2632 mfn_t gmfn, sl1mfn=_mfn(0);
2633 shadow_l1e_t sl1e, *ptr_sl1e;
2634 paddr_t gpa;
2635 struct sh_emulate_ctxt emul_ctxt;
2636 struct x86_emulate_ops *emul_ops;
2637 int r, mmio;
2638 fetch_type_t ft = 0;
2640 SHADOW_PRINTK("d:v=%u:%u va=%#lx err=%u\n",
2641 v->domain->domain_id, v->vcpu_id, va, regs->error_code);
2643 perfc_incrc(shadow_fault);
2644 //
2645 // XXX: Need to think about eventually mapping superpages directly in the
2646 // shadow (when possible), as opposed to splintering them into a
2647 // bunch of 4K maps.
2648 //
2650 #if (SHADOW_OPTIMIZATIONS & SHOPT_FAST_FAULT_PATH) && SHADOW_PAGING_LEVELS > 2
2651 if ( (regs->error_code & PFEC_reserved_bit) )
2653 /* The only reasons for reserved bits to be set in shadow entries
2654 * are the two "magic" shadow_l1e entries. */
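/* Of the two markers, sh_l1e_is_gnp() identifies "the guest entry was not
 * present" and sh_l1e_is_mmio() identifies an MMIO gfn (recovered below
 * with sh_l1e_mmio_get_gfn()).  Both are encoded using reserved PTE bits,
 * which is why the hardware reports them with PFEC_reserved_bit and lets
 * us skip the full guest walk. */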
2655 if ( likely((__copy_from_user(&sl1e,
2656 (sh_linear_l1_table(v)
2657 + shadow_l1_linear_offset(va)),
2658 sizeof(sl1e)) == 0)
2659 && sh_l1e_is_magic(sl1e)) )
2661 if ( sh_l1e_is_gnp(sl1e) )
2663 if ( likely(!is_hvm_domain(d) ||
2664 paging_vcpu_mode_translate(v)) )
2666 /* Not-present in a guest PT: pass to the guest as
2667 * a not-present fault (by flipping two bits). */
2668 ASSERT(regs->error_code & PFEC_page_present);
2669 regs->error_code ^= (PFEC_reserved_bit|PFEC_page_present);
2670 perfc_incrc(shadow_fault_fast_gnp);
2671 SHADOW_PRINTK("fast path not-present\n");
2672 return 0;
2674 else
2676 /* Not-present in the P2M: MMIO */
2677 gpa = va;
2680 else
2682 /* Magic MMIO marker: extract gfn for MMIO address */
2683 ASSERT(sh_l1e_is_mmio(sl1e));
2684 gpa = (((paddr_t)(gfn_x(sh_l1e_mmio_get_gfn(sl1e))))
2685 << PAGE_SHIFT)
2686 | (va & ~PAGE_MASK);
2688 perfc_incrc(shadow_fault_fast_mmio);
2689 SHADOW_PRINTK("fast path mmio %#"PRIpaddr"\n", gpa);
2690 reset_early_unshadow(v);
2691 handle_mmio(gpa);
2692 return EXCRET_fault_fixed;
2694 else
2696 /* This should be exceptionally rare: another vcpu has fixed
2697 * the tables between the fault and our reading the l1e.
2698 * Retry and let the hardware give us the right fault next time. */
2699 perfc_incrc(shadow_fault_fast_fail);
2700 SHADOW_PRINTK("fast path false alarm!\n");
2701 return EXCRET_fault_fixed;
2704 #endif /* SHOPT_FAST_FAULT_PATH */
2706 /* Detect if this page fault happened while we were already in Xen
2707 * doing a shadow operation. If that happens, the only thing we can
2708 * do is let Xen's normal fault handlers try to fix it. In any case,
2709 * a diagnostic trace of the fault will be more useful than
2710 * a BUG() when we try to take the lock again. */
2711 if ( unlikely(shadow_locked_by_me(d)) )
2713 SHADOW_ERROR("Recursive shadow fault: lock was taken by %s\n",
2714 d->arch.paging.shadow.locker_function);
2715 return 0;
2718 shadow_lock(d);
2720 shadow_audit_tables(v);
2722 if ( guest_walk_tables(v, va, &gw, 1) != 0 )
2724 SHADOW_PRINTK("malformed guest pagetable!");
2725 print_gw(&gw);
2728 sh_audit_gw(v, &gw);
2730 // We do not look at the gw->l1e, as that will not exist for superpages.
2731 // Instead, we use the gw->eff_l1e...
2732 //
2733 // We need not check all the levels of the guest page table entries for
2734 // present vs not-present, as the eff_l1e will always be not present if
2735 // one of the higher level entries is not present.
2736 //
2737 if ( unlikely(!(guest_l1e_get_flags(gw.eff_l1e) & _PAGE_PRESENT)) )
2739 if ( is_hvm_domain(d) && !paging_vcpu_mode_translate(v) )
2741 /* Not present in p2m map, means this is mmio */
2742 gpa = va;
2743 goto mmio;
2746 perfc_incrc(shadow_fault_bail_not_present);
2747 goto not_a_shadow_fault;
2750 // All levels of the guest page table are now known to be present.
2751 accumulated_gflags = accumulate_guest_flags(v, &gw);
2753 // Check for attempts to access supervisor-only pages from user mode,
2754 // i.e. ring 3. Such errors are not caused or dealt with by the shadow
2755 // code.
2756 //
2757 if ( (regs->error_code & PFEC_user_mode) &&
2758 !(accumulated_gflags & _PAGE_USER) )
2760 /* illegal user-mode access to supervisor-only page */
2761 perfc_incrc(shadow_fault_bail_user_supervisor);
2762 goto not_a_shadow_fault;
2765 // Was it a write fault?
2766 ft = ((regs->error_code & PFEC_write_access)
2767 ? ft_demand_write : ft_demand_read);
2768 if ( ft == ft_demand_write )
2770 if ( unlikely(!(accumulated_gflags & _PAGE_RW)) )
2772 perfc_incrc(shadow_fault_bail_ro_mapping);
2773 goto not_a_shadow_fault;
2776 else // must have been either an insn fetch or read fault
2778 // Check for NX bit violations: attempts to execute code that is
2779 // marked "do not execute". Such errors are not caused or dealt with
2780 // by the shadow code.
2781 //
2782 if ( regs->error_code & PFEC_insn_fetch )
2784 if ( accumulated_gflags & _PAGE_NX_BIT )
2786 /* NX prevented this code fetch */
2787 perfc_incrc(shadow_fault_bail_nx);
2788 goto not_a_shadow_fault;
2793 /* What mfn is the guest trying to access? */
2794 gfn = guest_l1e_get_gfn(gw.eff_l1e);
2795 gmfn = vcpu_gfn_to_mfn(v, gfn);
2796 mmio = (is_hvm_domain(d)
2797 && paging_vcpu_mode_translate(v)
2798 && mmio_space(gfn_to_paddr(gfn)));
2800 if ( !mmio && !mfn_valid(gmfn) )
2802 perfc_incrc(shadow_fault_bail_bad_gfn);
2803 SHADOW_PRINTK("BAD gfn=%"SH_PRI_gfn" gmfn=%"PRI_mfn"\n",
2804 gfn_x(gfn), mfn_x(gmfn));
2805 goto not_a_shadow_fault;
2808 /* Make sure there is enough free shadow memory to build a chain of
2809 * shadow tables: one SHADOW_MAX_ORDER chunk will always be enough
2810 * to allocate all we need. (We never allocate a top-level shadow
2811 * on this path, only a 32b l1, pae l2+1 or 64b l3+2+1) */
2812 shadow_prealloc(d, SHADOW_MAX_ORDER);
2814 /* Acquire the shadow. This must happen before we figure out the rights
2815 * for the shadow entry, since we might promote a page here. */
2816 ptr_sl1e = shadow_get_and_create_l1e(v, &gw, &sl1mfn, ft);
2817 if ( unlikely(ptr_sl1e == NULL) )
2819 /* Couldn't get the sl1e! Since we know the guest entries
2820 * are OK, this can only have been caused by a failed
2821 * shadow_set_l*e(), which will have crashed the guest.
2822 * Get out of the fault handler immediately. */
2823 ASSERT(test_bit(_DOMF_dying, &d->domain_flags));
2824 unmap_walk(v, &gw);
2825 shadow_unlock(d);
2826 return 0;
2829 /* Calculate the shadow entry and write it */
2830 l1e_propagate_from_guest(v, (gw.l1e) ? gw.l1e : &gw.eff_l1e, gw.l1mfn,
2831 gmfn, &sl1e, ft, mmio);
2832 r = shadow_set_l1e(v, ptr_sl1e, sl1e, sl1mfn);
2834 #if SHADOW_OPTIMIZATIONS & SHOPT_PREFETCH
2835 /* Prefetch some more shadow entries */
2836 sh_prefetch(v, &gw, ptr_sl1e, sl1mfn);
2837 #endif
2839 /* Need to emulate accesses to page tables */
2840 if ( sh_mfn_is_a_page_table(gmfn) )
2842 if ( ft == ft_demand_write )
2844 perfc_incrc(shadow_fault_emulate_write);
2845 goto emulate;
2847 else if ( shadow_mode_trap_reads(d) && ft == ft_demand_read )
2849 perfc_incrc(shadow_fault_emulate_read);
2850 goto emulate;
2854 if ( mmio )
2856 gpa = guest_walk_to_gpa(&gw);
2857 goto mmio;
2860 perfc_incrc(shadow_fault_fixed);
2861 d->arch.paging.shadow.fault_count++;
2862 reset_early_unshadow(v);
2864 done:
2865 sh_audit_gw(v, &gw);
2866 unmap_walk(v, &gw);
2867 SHADOW_PRINTK("fixed\n");
2868 shadow_audit_tables(v);
2869 shadow_unlock(d);
2870 return EXCRET_fault_fixed;
2872 emulate:
2873 if ( !shadow_mode_refcounts(d) || !guest_mode(regs) )
2874 goto not_a_shadow_fault;
2876 if ( is_hvm_domain(d) )
2878 /*
2879 * If we are in the middle of injecting an exception or interrupt then
2880 * we should not emulate: it is not the instruction at %eip that caused
2881 * the fault. Furthermore it is almost certainly the case that the handler
2882 * stack is currently considered to be a page table, so we should
2883 * unshadow the faulting page before exiting.
2884 */
2885 if ( unlikely(hvm_event_injection_faulted(v)) )
2887 gdprintk(XENLOG_DEBUG, "write to pagetable during event "
2888 "injection: cr2=%#lx, mfn=%#lx\n",
2889 va, mfn_x(gmfn));
2890 sh_remove_shadows(v, gmfn, 0 /* thorough */, 1 /* must succeed */);
2891 goto done;
2894 hvm_store_cpu_guest_regs(v, regs, NULL);
2897 SHADOW_PRINTK("emulate: eip=%#lx esp=%#lx\n",
2898 (unsigned long)regs->eip, (unsigned long)regs->esp);
2900 emul_ops = shadow_init_emulation(&emul_ctxt, regs);
2902 /*
2903 * We do not emulate user writes. Instead we use them as a hint that the
2904 * page is no longer a page table. This behaviour differs from native, but
2905 * it seems very unlikely that any OS grants user access to page tables.
2906 */
2907 r = X86EMUL_UNHANDLEABLE;
2908 if ( !(regs->error_code & PFEC_user_mode) )
2909 r = x86_emulate(&emul_ctxt.ctxt, emul_ops);
2911 /*
2912 * NB. We do not unshadow on X86EMUL_EXCEPTION. It's not clear that it
2913 * would be a good unshadow hint. If we *do* decide to unshadow-on-fault
2914 * then it must be 'failable': we cannot require the unshadow to succeed.
2915 */
2916 if ( r == X86EMUL_UNHANDLEABLE )
2918 SHADOW_PRINTK("emulator failure, unshadowing mfn %#lx\n",
2919 mfn_x(gmfn));
2920 perfc_incrc(shadow_fault_emulate_failed);
2921 /* If this is actually a page table, then we have a bug, and need
2922 * to support more operations in the emulator. More likely,
2923 * though, this is a hint that this page should not be shadowed. */
2924 sh_remove_shadows(v, gmfn, 0 /* thorough */, 1 /* must succeed */);
2927 /* Emulator has changed the user registers: write back */
2928 if ( is_hvm_domain(d) )
2929 hvm_load_cpu_guest_regs(v, regs);
2930 goto done;
2932 mmio:
2933 if ( !guest_mode(regs) )
2934 goto not_a_shadow_fault;
2935 perfc_incrc(shadow_fault_mmio);
2936 sh_audit_gw(v, &gw);
2937 unmap_walk(v, &gw);
2938 SHADOW_PRINTK("mmio %#"PRIpaddr"\n", gpa);
2939 shadow_audit_tables(v);
2940 reset_early_unshadow(v);
2941 shadow_unlock(d);
2942 handle_mmio(gpa);
2943 return EXCRET_fault_fixed;
2945 not_a_shadow_fault:
2946 sh_audit_gw(v, &gw);
2947 unmap_walk(v, &gw);
2948 SHADOW_PRINTK("not a shadow fault\n");
2949 shadow_audit_tables(v);
2950 reset_early_unshadow(v);
2951 shadow_unlock(d);
2952 return 0;
2956 static int
2957 sh_invlpg(struct vcpu *v, unsigned long va)
2958 /* Called when the guest requests an invlpg. Returns 1 if the invlpg
2959 * instruction should be issued on the hardware, or 0 if it's safe not
2960 * to do so. */
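/* Note that the reads below use __copy_from_user() rather than plain
 * dereferences: the shadow lock is not held here, so higher-level shadows
 * (and with them the linear-map pages) may disappear under our feet.
 * Splintered superpages (fl1 shadows) are handled with a full TLB flush,
 * since their 4k mappings can't be invalidated one by one. */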
2962 shadow_l2e_t sl2e;
2964 perfc_incrc(shadow_invlpg);
2966 /* First check that we can safely read the shadow l2e. SMP/PAE linux can
2967 * hit this on as many as 6% of invlpg calls when we haven't shadowed
2968 * the l2 yet. */
2969 #if SHADOW_PAGING_LEVELS == 4
2971 shadow_l3e_t sl3e;
2972 if ( !(shadow_l4e_get_flags(
2973 sh_linear_l4_table(v)[shadow_l4_linear_offset(va)])
2974 & _PAGE_PRESENT) )
2975 return 0;
2976 /* This must still be a copy-from-user because we don't have the
2977 * shadow lock, and the higher-level shadows might disappear
2978 * under our feet. */
2979 if ( __copy_from_user(&sl3e, (sh_linear_l3_table(v)
2980 + shadow_l3_linear_offset(va)),
2981 sizeof (sl3e)) != 0 )
2983 perfc_incrc(shadow_invlpg_fault);
2984 return 0;
2986 if ( !(shadow_l3e_get_flags(sl3e) & _PAGE_PRESENT) )
2987 return 0;
2989 #elif SHADOW_PAGING_LEVELS == 3
2990 if ( !(l3e_get_flags(v->arch.paging.shadow.l3table[shadow_l3_linear_offset(va)])
2991 & _PAGE_PRESENT) )
2992 // no need to flush anything if there's no SL2...
2993 return 0;
2994 #endif
2996 /* This must still be a copy-from-user because we don't have the shadow
2997 * lock, and the higher-level shadows might disappear under our feet. */
2998 if ( __copy_from_user(&sl2e,
2999 sh_linear_l2_table(v) + shadow_l2_linear_offset(va),
3000 sizeof (sl2e)) != 0 )
3002 perfc_incrc(shadow_invlpg_fault);
3003 return 0;
3006 // If there's nothing shadowed for this particular sl2e, then
3007 // there is no need to do an invlpg, either...
3008 //
3009 if ( !(shadow_l2e_get_flags(sl2e) & _PAGE_PRESENT) )
3010 return 0;
3012 // Check to see if the SL2 is a splintered superpage...
3013 // If so, then we'll need to flush the entire TLB (because that's
3014 // easier than invalidating all of the individual 4K pages).
3015 //
3016 if ( mfn_to_shadow_page(shadow_l2e_get_mfn(sl2e))->type
3017 == SH_type_fl1_shadow )
3019 local_flush_tlb();
3020 return 0;
3023 return 1;
3026 static unsigned long
3027 sh_gva_to_gfn(struct vcpu *v, unsigned long va)
3028 /* Called to translate a guest virtual address to what the *guest*
3029 * pagetables would map it to. */
3031 walk_t gw;
3032 gfn_t gfn;
3034 guest_walk_tables(v, va, &gw, 0);
3035 gfn = guest_walk_to_gfn(&gw);
3036 unmap_walk(v, &gw);
3038 return gfn_x(gfn);
3042 static inline void
3043 sh_update_linear_entries(struct vcpu *v)
3044 /* Sync up all the linear mappings for this vcpu's pagetables */
3046 struct domain *d = v->domain;
3048 /* Linear pagetables in PV guests
3049 * ------------------------------
3051 * Guest linear pagetables, which map the guest pages, are at
3052 * LINEAR_PT_VIRT_START. Shadow linear pagetables, which map the
3053 * shadows, are at SH_LINEAR_PT_VIRT_START. Most of the time these
3054 * are set up at shadow creation time, but (of course!) the PAE case
3055 * is subtler. Normal linear mappings are made by having an entry
3056 * in the top-level table that points to itself (shadow linear) or
3057 * to the guest top-level table (guest linear). For PAE, to set up
3058 * a linear map requires us to copy the four top-level entries into
3059 * level-2 entries. That means that every time we change a PAE l3e,
3060 * we need to reflect the change into the copy.
3062 * Linear pagetables in HVM guests
3063 * -------------------------------
3065 * For HVM guests, the linear pagetables are installed in the monitor
3066 * tables (since we can't put them in the shadow). Shadow linear
3067 * pagetables, which map the shadows, are at SH_LINEAR_PT_VIRT_START,
3068 * and we use the linear pagetable slot at LINEAR_PT_VIRT_START for
3069 * a linear pagetable of the monitor tables themselves. We have
3070 * the same issue of having to re-copy PAE l3 entries whenever we use
3071 * PAE shadows.
3073 * Because HVM guests run on the same monitor tables regardless of the
3074 * shadow tables in use, the linear mapping of the shadow tables has to
3075 * be updated every time v->arch.shadow_table changes.
3076 */
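/* Concretely, with PAE shadows the four entries of
 * v->arch.paging.shadow.l3table must be re-copied whenever they change:
 * on a 4-level hypervisor they are copied into the monitor l3 hanging off
 * slot 0 of the monitor l4, and on a 3-level hypervisor into the
 * shadow-linear slots of the high l2.  That is what the code below does. */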
3078 /* Don't try to update the monitor table if it doesn't exist */
3079 if ( shadow_mode_external(d)
3080 && pagetable_get_pfn(v->arch.monitor_table) == 0 )
3081 return;
3083 #if (CONFIG_PAGING_LEVELS == 4) && (SHADOW_PAGING_LEVELS == 4)
3085 /* For PV, one l4e points at the guest l4, one points at the shadow
3086 * l4. No maintenance required.
3087 * For HVM, just need to update the l4e that points to the shadow l4. */
3089 if ( shadow_mode_external(d) )
3091 /* Use the linear map if we can; otherwise make a new mapping */
3092 if ( v == current )
3094 __linear_l4_table[l4_linear_offset(SH_LINEAR_PT_VIRT_START)] =
3095 l4e_from_pfn(pagetable_get_pfn(v->arch.shadow_table[0]),
3096 __PAGE_HYPERVISOR);
3098 else
3100 l4_pgentry_t *ml4e;
3101 ml4e = sh_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
3102 ml4e[l4_table_offset(SH_LINEAR_PT_VIRT_START)] =
3103 l4e_from_pfn(pagetable_get_pfn(v->arch.shadow_table[0]),
3104 __PAGE_HYPERVISOR);
3105 sh_unmap_domain_page(ml4e);
3109 #elif (CONFIG_PAGING_LEVELS == 4) && (SHADOW_PAGING_LEVELS == 3)
3111 /* PV: XXX
3113 * HVM: To give ourselves a linear map of the shadows, we need to
3114 * extend a PAE shadow to 4 levels. We do this by having a monitor
3115 * l3 in slot 0 of the monitor l4 table, and copying the PAE l3
3116 * entries into it. Then, by having the monitor l4e for shadow
3117 * pagetables also point to the monitor l4, we can use it to access
3118 * the shadows.
3119 */
3121 if ( shadow_mode_external(d) )
3123 /* Install copies of the shadow l3es into the monitor l3 table.
3124 * The monitor l3 table is hooked into slot 0 of the monitor
3125 * l4 table, so we use l3 linear indices 0 to 3 */
3126 shadow_l3e_t *sl3e;
3127 l3_pgentry_t *ml3e;
3128 mfn_t l3mfn;
3129 int i;
3131 /* Use linear mappings if we can; otherwise make new mappings */
3132 if ( v == current )
3134 ml3e = __linear_l3_table;
3135 l3mfn = _mfn(l4e_get_pfn(__linear_l4_table[0]));
3137 else
3139 l4_pgentry_t *ml4e;
3140 ml4e = sh_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
3141 ASSERT(l4e_get_flags(ml4e[0]) & _PAGE_PRESENT);
3142 l3mfn = _mfn(l4e_get_pfn(ml4e[0]));
3143 ml3e = sh_map_domain_page(l3mfn);
3144 sh_unmap_domain_page(ml4e);
3147 /* Shadow l3 tables are made up by sh_update_cr3 */
3148 sl3e = v->arch.paging.shadow.l3table;
3150 for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
3152 ml3e[i] =
3153 (shadow_l3e_get_flags(sl3e[i]) & _PAGE_PRESENT)
3154 ? l3e_from_pfn(mfn_x(shadow_l3e_get_mfn(sl3e[i])),
3155 __PAGE_HYPERVISOR)
3156 : l3e_empty();
3159 if ( v != current )
3160 sh_unmap_domain_page(ml3e);
3162 else
3163 domain_crash(d); /* XXX */
3165 #elif CONFIG_PAGING_LEVELS == 3
3167 /* PV: need to copy the guest's l3 entries into the guest-linear-map l2
3168 * entries in the shadow, and the shadow's l3 entries into the
3169 * shadow-linear-map l2 entries in the shadow. This is safe to do
3170 * because Xen does not let guests share high-slot l2 tables between l3s,
3171 * so we know we're not treading on anyone's toes.
3173 * HVM: need to copy the shadow's l3 entries into the
3174 * shadow-linear-map l2 entries in the monitor table. This is safe
3175 * because we have one monitor table for each vcpu. The monitor's
3176 * own l3es don't need to be copied because they never change.
3177 * XXX That might change if we start stuffing things into the rest
3178 * of the monitor's virtual address space.
3179 */
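/* Hence the two copy loops below: untranslated PV guests also get a linear
 * map of their own (guest) l3 contents at LINEAR_PT_VIRT_START, and every
 * configuration gets the shadow l3 contents copied in at
 * SH_LINEAR_PT_VIRT_START. */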
3181 l2_pgentry_t *l2e, new_l2e;
3182 shadow_l3e_t *guest_l3e = NULL, *shadow_l3e;
3183 int i;
3184 int unmap_l2e = 0;
3186 #if GUEST_PAGING_LEVELS == 2
3188 /* Shadow l3 tables were built by sh_update_cr3 */
3189 BUG_ON(!shadow_mode_external(d)); /* PV 2-on-3 is unsupported */
3190 shadow_l3e = (shadow_l3e_t *)&v->arch.paging.shadow.l3table;
3192 #else /* GUEST_PAGING_LEVELS == 3 */
3194 shadow_l3e = (shadow_l3e_t *)&v->arch.paging.shadow.l3table;
3195 guest_l3e = (guest_l3e_t *)&v->arch.paging.shadow.gl3e;
3197 #endif /* GUEST_PAGING_LEVELS */
3199 /* Choose where to write the entries, using linear maps if possible */
3200 if ( shadow_mode_external(d) )
3202 if ( v == current )
3204 /* From the monitor tables, it's safe to use linear maps
3205 * to update monitor l2s */
3206 l2e = __linear_l2_table + (3 * L2_PAGETABLE_ENTRIES);
3208 else
3210 /* Map the monitor table's high l2 */
3211 l3_pgentry_t *l3e;
3212 l3e = sh_map_domain_page(
3213 pagetable_get_mfn(v->arch.monitor_table));
3214 ASSERT(l3e_get_flags(l3e[3]) & _PAGE_PRESENT);
3215 l2e = sh_map_domain_page(_mfn(l3e_get_pfn(l3e[3])));
3216 unmap_l2e = 1;
3217 sh_unmap_domain_page(l3e);
3220 else
3222 /* Map the shadow table's high l2 */
3223 ASSERT(shadow_l3e_get_flags(shadow_l3e[3]) & _PAGE_PRESENT);
3224 l2e = sh_map_domain_page(shadow_l3e_get_mfn(shadow_l3e[3]));
3225 unmap_l2e = 1;
3228 /* Write linear mapping of guest (only in PV, and only when
3229 * not translated). */
3230 if ( !shadow_mode_translate(d) )
3232 for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
3234 new_l2e =
3235 ((shadow_l3e_get_flags(guest_l3e[i]) & _PAGE_PRESENT)
3236 ? l2e_from_pfn(mfn_x(shadow_l3e_get_mfn(guest_l3e[i])),
3237 __PAGE_HYPERVISOR)
3238 : l2e_empty());
3239 safe_write_entry(
3240 &l2e[l2_table_offset(LINEAR_PT_VIRT_START) + i],
3241 &new_l2e);
3245 /* Write linear mapping of shadow. */
3246 for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
3248 new_l2e = (shadow_l3e_get_flags(shadow_l3e[i]) & _PAGE_PRESENT)
3249 ? l2e_from_pfn(mfn_x(shadow_l3e_get_mfn(shadow_l3e[i])),
3250 __PAGE_HYPERVISOR)
3251 : l2e_empty();
3252 safe_write_entry(
3253 &l2e[l2_table_offset(SH_LINEAR_PT_VIRT_START) + i],
3254 &new_l2e);
3257 if ( unmap_l2e )
3258 sh_unmap_domain_page(l2e);
3261 #elif CONFIG_PAGING_LEVELS == 2
3263 /* For PV, one l2e points at the guest l2, one points at the shadow
3264 * l2. No maintenance required.
3265 * For HVM, just need to update the l2e that points to the shadow l2. */
3267 if ( shadow_mode_external(d) )
3269 /* Use the linear map if we can; otherwise make a new mapping */
3270 if ( v == current )
3272 __linear_l2_table[l2_linear_offset(SH_LINEAR_PT_VIRT_START)] =
3273 l2e_from_pfn(pagetable_get_pfn(v->arch.shadow_table[0]),
3274 __PAGE_HYPERVISOR);
3276 else
3278 l2_pgentry_t *ml2e;
3279 ml2e = sh_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
3280 ml2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
3281 l2e_from_pfn(pagetable_get_pfn(v->arch.shadow_table[0]),
3282 __PAGE_HYPERVISOR);
3283 sh_unmap_domain_page(ml2e);
3287 #else
3288 #error this should not happen
3289 #endif
3293 /* Removes vcpu->arch.paging.shadow.guest_vtable and vcpu->arch.shadow_table[].
3294 * Does all appropriate management/bookkeeping/refcounting/etc...
3295 */
3296 static void
3297 sh_detach_old_tables(struct vcpu *v)
3299 mfn_t smfn;
3300 int i = 0;
3302 ////
3303 //// vcpu->arch.paging.shadow.guest_vtable
3304 ////
3306 #if GUEST_PAGING_LEVELS == 3
3307 /* PAE guests don't have a mapping of the guest top-level table */
3308 ASSERT(v->arch.paging.shadow.guest_vtable == NULL);
3309 #else
3310 if ( v->arch.paging.shadow.guest_vtable )
3312 struct domain *d = v->domain;
3313 if ( shadow_mode_external(d) || shadow_mode_translate(d) )
3314 sh_unmap_domain_page_global(v->arch.paging.shadow.guest_vtable);
3315 v->arch.paging.shadow.guest_vtable = NULL;
3317 #endif
3320 ////
3321 //// vcpu->arch.shadow_table[]
3322 ////
3324 #if GUEST_PAGING_LEVELS == 3
3325 /* PAE guests have four shadow_table entries */
3326 for ( i = 0 ; i < 4 ; i++ )
3327 #endif
3329 smfn = pagetable_get_mfn(v->arch.shadow_table[i]);
3330 if ( mfn_x(smfn) )
3331 sh_put_ref(v, smfn, 0);
3332 v->arch.shadow_table[i] = pagetable_null();
3336 /* Set up the top-level shadow and install it in slot 'slot' of shadow_table */
3337 static void
3338 sh_set_toplevel_shadow(struct vcpu *v,
3339 int slot,
3340 mfn_t gmfn,
3341 unsigned int root_type)
3343 mfn_t smfn;
3344 pagetable_t old_entry, new_entry;
3346 struct domain *d = v->domain;
3348 /* Remember the old contents of this slot */
3349 old_entry = v->arch.shadow_table[slot];
3351 /* Now figure out the new contents: is this a valid guest MFN? */
3352 if ( !mfn_valid(gmfn) )
3354 new_entry = pagetable_null();
3355 goto install_new_entry;
3358 /* Guest mfn is valid: shadow it and install the shadow */
3359 smfn = get_shadow_status(v, gmfn, root_type);
3360 if ( !mfn_valid(smfn) )
3362 /* Make sure there's enough free shadow memory. */
3363 shadow_prealloc(d, SHADOW_MAX_ORDER);
3364 /* Shadow the page. */
3365 smfn = sh_make_shadow(v, gmfn, root_type);
3367 ASSERT(mfn_valid(smfn));
3369 #if SHADOW_OPTIMIZATIONS & SHOPT_EARLY_UNSHADOW
3370 /* Once again OK to unhook entries from this table if we see fork/exit */
3371 ASSERT(sh_mfn_is_a_page_table(gmfn));
3372 mfn_to_page(gmfn)->shadow_flags &= ~SHF_unhooked_mappings;
3373 #endif
3375 /* Pin the shadow and put it (back) on the list of pinned shadows */
3376 if ( sh_pin(v, smfn) == 0 )
3378 SHADOW_ERROR("can't pin %#lx as toplevel shadow\n", mfn_x(smfn));
3379 domain_crash(v->domain);
3382 /* Take a ref to this page: it will be released in sh_detach_old_tables()
3383 * or the next call to sh_set_toplevel_shadow() */
3384 if ( !sh_get_ref(v, smfn, 0) )
3386 SHADOW_ERROR("can't install %#lx as toplevel shadow\n", mfn_x(smfn));
3387 domain_crash(v->domain);
3390 new_entry = pagetable_from_mfn(smfn);
3392 install_new_entry:
3393 /* Done. Install it */
3394 SHADOW_PRINTK("%u/%u [%u] gmfn %#"PRI_mfn" smfn %#"PRI_mfn"\n",
3395 GUEST_PAGING_LEVELS, SHADOW_PAGING_LEVELS, slot,
3396 mfn_x(gmfn), mfn_x(pagetable_get_mfn(new_entry)));
3397 v->arch.shadow_table[slot] = new_entry;
3399 /* Decrement the refcount of the old contents of this slot */
3400 if ( !pagetable_is_null(old_entry) )
3401 sh_put_ref(v, pagetable_get_mfn(old_entry), 0);
3405 static void
3406 sh_update_cr3(struct vcpu *v, int do_locking)
3407 /* Updates vcpu->arch.cr3 after the guest has changed CR3.
3408 * Paravirtual guests should set v->arch.guest_table (and guest_table_user,
3409 * if appropriate).
3410 * HVM guests should also make sure hvm_get_guest_ctrl_reg(v, 3) works;
3411 * this function will call hvm_update_guest_cr3() to tell them where the
3412 * shadow tables are.
3413 * If do_locking != 0, assume we are being called from outside the
3414 * shadow code, and must take and release the shadow lock; otherwise
3415 * that is the caller's responsibility.
3416 */
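/* In outline, the code below: picks the guest toplevel mfn (guest_table,
 * or guest_table_user for 64-bit PV guests in user mode); refreshes
 * guest_vtable, or the cached PAE gl3es; revokes write access to the new
 * toplevel page(s); installs the new toplevel shadow(s) with
 * sh_set_toplevel_shadow(); rebuilds the PAE l3table where the shadow is
 * 3-level; and finally points v->arch.cr3 at the monitor table (external
 * mode) or at the top of the shadow tables (PV). */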
3418 struct domain *d = v->domain;
3419 mfn_t gmfn;
3420 #if GUEST_PAGING_LEVELS == 3
3421 guest_l3e_t *gl3e;
3422 u32 guest_idx=0;
3423 int i;
3424 #endif
3426 /* Don't do anything on an uninitialised vcpu */
3427 if ( !is_hvm_domain(d) && !test_bit(_VCPUF_initialised, &v->vcpu_flags) )
3429 ASSERT(v->arch.cr3 == 0);
3430 return;
3433 if ( do_locking ) shadow_lock(v->domain);
3435 ASSERT(shadow_locked_by_me(v->domain));
3436 ASSERT(v->arch.paging.mode);
3438 ////
3439 //// vcpu->arch.guest_table is already set
3440 ////
3442 #ifndef NDEBUG
3443 /* Double-check that the HVM code has sent us a sane guest_table */
3444 if ( is_hvm_domain(d) )
3446 gfn_t gfn;
3448 ASSERT(shadow_mode_external(d));
3450 // Is paging enabled on this vcpu?
3451 if ( paging_vcpu_mode_translate(v) )
3453 gfn = _gfn(paddr_to_pfn(hvm_get_guest_ctrl_reg(v, 3)));
3454 gmfn = vcpu_gfn_to_mfn(v, gfn);
3455 ASSERT(mfn_valid(gmfn));
3456 ASSERT(pagetable_get_pfn(v->arch.guest_table) == mfn_x(gmfn));
3458 else
3460 /* Paging disabled: guest_table points at (part of) p2m */
3461 #if SHADOW_PAGING_LEVELS != 3 /* in 3-on-4, guest-table is in slot 0 of p2m */
3462 /* For everything else, they should be the same */
3463 ASSERT(v->arch.guest_table.pfn == d->arch.phys_table.pfn);
3464 #endif
3467 #endif
3469 SHADOW_PRINTK("d=%u v=%u guest_table=%05lx\n",
3470 d->domain_id, v->vcpu_id,
3471 (unsigned long)pagetable_get_pfn(v->arch.guest_table));
3473 #if GUEST_PAGING_LEVELS == 4
3474 if ( !(v->arch.flags & TF_kernel_mode) && !IS_COMPAT(v->domain) )
3475 gmfn = pagetable_get_mfn(v->arch.guest_table_user);
3476 else
3477 #endif
3478 gmfn = pagetable_get_mfn(v->arch.guest_table);
3481 ////
3482 //// vcpu->arch.paging.shadow.guest_vtable
3483 ////
3484 #if GUEST_PAGING_LEVELS == 4
3485 if ( shadow_mode_external(d) || shadow_mode_translate(d) )
3487 if ( v->arch.paging.shadow.guest_vtable )
3488 sh_unmap_domain_page_global(v->arch.paging.shadow.guest_vtable);
3489 v->arch.paging.shadow.guest_vtable = sh_map_domain_page_global(gmfn);
3491 else
3492 v->arch.paging.shadow.guest_vtable = __linear_l4_table;
3493 #elif GUEST_PAGING_LEVELS == 3
3494 /* On PAE guests we don't use a mapping of the guest's own top-level
3495 * table. We cache the current state of that table and shadow that,
3496 * until the next CR3 write makes us refresh our cache. */
3497 ASSERT(v->arch.paging.shadow.guest_vtable == NULL);
3499 if ( shadow_mode_external(d) && paging_vcpu_mode_translate(v) )
3500 /* Paging enabled: find where in the page the l3 table is */
3501 guest_idx = guest_index((void *)hvm_get_guest_ctrl_reg(v, 3));
3502 else
3503 /* Paging disabled or PV: l3 is at the start of a page */
3504 guest_idx = 0;
3506 // Ignore the low 2 bits of guest_idx -- they are really just
3507 // cache control.
3508 guest_idx &= ~3;
3510 gl3e = ((guest_l3e_t *)sh_map_domain_page(gmfn)) + guest_idx;
3511 for ( i = 0; i < 4 ; i++ )
3512 v->arch.paging.shadow.gl3e[i] = gl3e[i];
3513 sh_unmap_domain_page(gl3e);
3514 #elif GUEST_PAGING_LEVELS == 2
3515 if ( shadow_mode_external(d) || shadow_mode_translate(d) )
3517 if ( v->arch.paging.shadow.guest_vtable )
3518 sh_unmap_domain_page_global(v->arch.paging.shadow.guest_vtable);
3519 v->arch.paging.shadow.guest_vtable = sh_map_domain_page_global(gmfn);
3521 else
3522 v->arch.paging.shadow.guest_vtable = __linear_l2_table;
3523 #else
3524 #error this should never happen
3525 #endif
3527 #if 0
3528 printk("%s %s %d gmfn=%05lx shadow.guest_vtable=%p\n",
3529 __func__, __FILE__, __LINE__, gmfn, v->arch.paging.shadow.guest_vtable);
3530 #endif
3532 ////
3533 //// vcpu->arch.shadow_table[]
3534 ////
3536 /* We revoke write access to the new guest toplevel page(s) before we
3537 * replace the old shadow pagetable(s), so that we can safely use the
3538 * (old) shadow linear maps in the writeable mapping heuristics. */
3539 #if GUEST_PAGING_LEVELS == 2
3540 if ( sh_remove_write_access(v, gmfn, 2, 0) != 0 )
3541 flush_tlb_mask(v->domain->domain_dirty_cpumask);
3542 sh_set_toplevel_shadow(v, 0, gmfn, SH_type_l2_shadow);
3543 #elif GUEST_PAGING_LEVELS == 3
3544 /* PAE guests have four shadow_table entries, based on the
3545 * current values of the guest's four l3es. */
3547 int flush = 0;
3548 gfn_t gl2gfn;
3549 mfn_t gl2mfn;
3550 guest_l3e_t *gl3e = (guest_l3e_t*)&v->arch.paging.shadow.gl3e;
3551 /* First, make all four entries read-only. */
3552 for ( i = 0; i < 4; i++ )
3554 if ( guest_l3e_get_flags(gl3e[i]) & _PAGE_PRESENT )
3556 gl2gfn = guest_l3e_get_gfn(gl3e[i]);
3557 gl2mfn = vcpu_gfn_to_mfn(v, gl2gfn);
3558 flush |= sh_remove_write_access(v, gl2mfn, 2, 0);
3561 if ( flush )
3562 flush_tlb_mask(v->domain->domain_dirty_cpumask);
3563 /* Now install the new shadows. */
3564 for ( i = 0; i < 4; i++ )
3566 if ( guest_l3e_get_flags(gl3e[i]) & _PAGE_PRESENT )
3568 gl2gfn = guest_l3e_get_gfn(gl3e[i]);
3569 gl2mfn = vcpu_gfn_to_mfn(v, gl2gfn);
3570 sh_set_toplevel_shadow(v, i, gl2mfn, (i == 3)
3571 ? SH_type_l2h_shadow
3572 : SH_type_l2_shadow);
3574 else
3575 /* The guest is not present: clear out the shadow. */
3576 sh_set_toplevel_shadow(v, i, _mfn(INVALID_MFN), 0);
3579 #elif GUEST_PAGING_LEVELS == 4
3580 if ( sh_remove_write_access(v, gmfn, 4, 0) != 0 )
3581 flush_tlb_mask(v->domain->domain_dirty_cpumask);
3582 sh_set_toplevel_shadow(v, 0, gmfn, SH_type_l4_shadow);
3583 #else
3584 #error This should never happen
3585 #endif
3587 #if (CONFIG_PAGING_LEVELS == 3) && (GUEST_PAGING_LEVELS == 3)
3588 #endif
3590 ///
3591 /// v->arch.paging.shadow.l3table
3592 ///
3593 #if SHADOW_PAGING_LEVELS == 3
3595 mfn_t smfn;
3596 int i;
3597 for ( i = 0; i < 4; i++ )
3599 #if GUEST_PAGING_LEVELS == 2
3600 /* 2-on-3: make a PAE l3 that points at the four-page l2 */
3601 smfn = _mfn(pagetable_get_pfn(v->arch.shadow_table[0]) + i);
3602 #else
3603 /* 3-on-3: make a PAE l3 that points at the four l2 pages */
3604 smfn = pagetable_get_mfn(v->arch.shadow_table[i]);
3605 #endif
3606 v->arch.paging.shadow.l3table[i] =
3607 (mfn_x(smfn) == 0)
3608 ? shadow_l3e_empty()
3609 : shadow_l3e_from_mfn(smfn, _PAGE_PRESENT);
3612 #endif /* SHADOW_PAGING_LEVELS == 3 */
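/* Layout recap for the loop above (illustrative, no new behaviour):
 * 2-on-3: shadow_table[0] is a 4-page l2, so the four l3es point at the
 * consecutive mfns smfn, smfn+1, smfn+2, smfn+3;
 * 3-on-3: each l3e points at the separate l2 in shadow_table[i];
 * slots whose shadow mfn is 0 are written as empty l3es. */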
3615 ///
3616 /// v->arch.cr3
3617 ///
3618 if ( shadow_mode_external(d) )
3620 make_cr3(v, pagetable_get_pfn(v->arch.monitor_table));
3622 else // not shadow_mode_external...
3624 /* We don't support PV except guest == shadow == config levels */
3625 BUG_ON(GUEST_PAGING_LEVELS != SHADOW_PAGING_LEVELS);
3626 #if SHADOW_PAGING_LEVELS == 3
3627 /* 2-on-3 or 3-on-3: Use the PAE shadow l3 table we just fabricated.
3628 * Don't use make_cr3 because (a) we know it's below 4GB, and
3629 * (b) it's not necessarily page-aligned, and make_cr3 takes a pfn */
3630 ASSERT(virt_to_maddr(&v->arch.paging.shadow.l3table) <= 0xffffffe0ULL);
3631 v->arch.cr3 = virt_to_maddr(&v->arch.paging.shadow.l3table);
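/* Note (illustrative): 0xffffffe0 is 4GB minus 32 bytes, i.e. the highest
 * address at which this 4-entry, 32-byte l3 table can still lie entirely
 * below 4GB, as a PAE cr3 value must. */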
3632 #else
3633 /* 2-on-2 or 4-on-4: Just use the shadow top-level directly */
3634 make_cr3(v, pagetable_get_pfn(v->arch.shadow_table[0]));
3635 #endif
3639 ///
3640 /// v->arch.hvm_vcpu.hw_cr3
3641 ///
3642 if ( shadow_mode_external(d) )
3644 ASSERT(is_hvm_domain(d));
3645 #if SHADOW_PAGING_LEVELS == 3
3646 /* 2-on-3 or 3-on-3: Use the PAE shadow l3 table we just fabricated */
3647 hvm_update_guest_cr3(v, virt_to_maddr(&v->arch.paging.shadow.l3table));
3648 #else
3649 /* 2-on-2 or 4-on-4: Just use the shadow top-level directly */
3650 hvm_update_guest_cr3(v, pagetable_get_paddr(v->arch.shadow_table[0]));
3651 #endif
3654 /* Fix up the linear pagetable mappings */
3655 sh_update_linear_entries(v);
3657 /* Release the lock, if we took it (otherwise it's the caller's problem) */
3658 if ( do_locking ) shadow_unlock(v->domain);
3662 /**************************************************************************/
3663 /* Functions to revoke guest rights */
3665 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
3666 static int sh_guess_wrmap(struct vcpu *v, unsigned long vaddr, mfn_t gmfn)
3667 /* Look up this vaddr in the current shadow and see if it's a writeable
3668 * mapping of this gmfn. If so, remove it. Returns 1 if it worked. */
3670 shadow_l1e_t sl1e, *sl1p;
3671 shadow_l2e_t *sl2p;
3672 #if SHADOW_PAGING_LEVELS >= 3
3673 shadow_l3e_t *sl3p;
3674 #if SHADOW_PAGING_LEVELS >= 4
3675 shadow_l4e_t *sl4p;
3676 #endif
3677 #endif
3678 mfn_t sl1mfn;
3679 int r;
3681 /* Carefully look in the shadow linear map for the l1e we expect */
3682 #if SHADOW_PAGING_LEVELS >= 4
3683 sl4p = sh_linear_l4_table(v) + shadow_l4_linear_offset(vaddr);
3684 if ( !(shadow_l4e_get_flags(*sl4p) & _PAGE_PRESENT) )
3685 return 0;
3686 sl3p = sh_linear_l3_table(v) + shadow_l3_linear_offset(vaddr);
3687 if ( !(shadow_l3e_get_flags(*sl3p) & _PAGE_PRESENT) )
3688 return 0;
3689 #elif SHADOW_PAGING_LEVELS == 3
3690 sl3p = ((shadow_l3e_t *) v->arch.paging.shadow.l3table)
3691 + shadow_l3_linear_offset(vaddr);
3692 if ( !(shadow_l3e_get_flags(*sl3p) & _PAGE_PRESENT) )
3693 return 0;
3694 #endif
3695 sl2p = sh_linear_l2_table(v) + shadow_l2_linear_offset(vaddr);
3696 if ( !(shadow_l2e_get_flags(*sl2p) & _PAGE_PRESENT) )
3697 return 0;
3698 sl1p = sh_linear_l1_table(v) + shadow_l1_linear_offset(vaddr);
3699 sl1e = *sl1p;
3700 if ( ((shadow_l1e_get_flags(sl1e) & (_PAGE_PRESENT|_PAGE_RW))
3701 != (_PAGE_PRESENT|_PAGE_RW))
3702 || (mfn_x(shadow_l1e_get_mfn(sl1e)) != mfn_x(gmfn)) )
3703 return 0;
3705 /* Found it! Need to remove its write permissions. */
3706 sl1mfn = shadow_l2e_get_mfn(*sl2p);
3707 sl1e = shadow_l1e_remove_flags(sl1e, _PAGE_RW);
3708 r = shadow_set_l1e(v, sl1p, sl1e, sl1mfn);
3709 ASSERT( !(r & SHADOW_SET_ERROR) );
3710 return 1;
3712 #endif
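#if 0
/* Plain-integer analogue of the final l1e test above (illustrative only;
 * the real code uses shadow_l1e_get_flags()/shadow_l1e_get_mfn()): an
 * entry counts as a writeable mapping of target_frame iff it is present,
 * writable, and its frame number matches. */
static int eg_is_writeable_map(unsigned long long pte,
                               unsigned long long target_frame)
{
    const unsigned long long present = 0x1, rw = 0x2;  /* P and RW bits */
    unsigned long long frame = (pte >> 12) & ((1ULL << 40) - 1);
    return ((pte & (present | rw)) == (present | rw))
           && (frame == target_frame);
}
#endif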
3714 int sh_rm_write_access_from_l1(struct vcpu *v, mfn_t sl1mfn,
3715 mfn_t readonly_mfn)
3716 /* Excises all writeable mappings to readonly_mfn from this l1 shadow table */
3718 shadow_l1e_t *sl1e;
3719 int done = 0;
3720 int flags;
3721 mfn_t base_sl1mfn = sl1mfn; /* Because sl1mfn changes in the foreach */
3723 SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, done,
3725 flags = shadow_l1e_get_flags(*sl1e);
3726 if ( (flags & _PAGE_PRESENT)
3727 && (flags & _PAGE_RW)
3728 && (mfn_x(shadow_l1e_get_mfn(*sl1e)) == mfn_x(readonly_mfn)) )
3730 shadow_l1e_t ro_sl1e = shadow_l1e_remove_flags(*sl1e, _PAGE_RW);
3731 (void) shadow_set_l1e(v, sl1e, ro_sl1e, sl1mfn);
3732 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
3733 /* Remember the last shadow that we shot a writeable mapping in */
3734 v->arch.paging.shadow.last_writeable_pte_smfn = mfn_x(base_sl1mfn);
3735 #endif
3736 if ( (mfn_to_page(readonly_mfn)->u.inuse.type_info
3737 & PGT_count_mask) == 0 )
3738 /* This breaks us cleanly out of the FOREACH macro */
3739 done = 1;
3741 });
3742 return done;
3746 int sh_rm_mappings_from_l1(struct vcpu *v, mfn_t sl1mfn, mfn_t target_mfn)
3747 /* Excises all mappings to guest frame from this shadow l1 table */
3749 shadow_l1e_t *sl1e;
3750 int done = 0;
3751 int flags;
3753 SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, done,
3755 flags = shadow_l1e_get_flags(*sl1e);
3756 if ( (flags & _PAGE_PRESENT)
3757 && (mfn_x(shadow_l1e_get_mfn(*sl1e)) == mfn_x(target_mfn)) )
3759 (void) shadow_set_l1e(v, sl1e, shadow_l1e_empty(), sl1mfn);
3760 if ( (mfn_to_page(target_mfn)->count_info & PGC_count_mask) == 0 )
3761 /* This breaks us cleanly out of the FOREACH macro */
3762 done = 1;
3764 });
3765 return done;
3768 /**************************************************************************/
3769 /* Functions to excise all pointers to shadows from higher-level shadows. */
3771 void sh_clear_shadow_entry(struct vcpu *v, void *ep, mfn_t smfn)
3772 /* Blank out a single shadow entry */
3774 switch ( mfn_to_shadow_page(smfn)->type )
3776 case SH_type_l1_shadow:
3777 (void) shadow_set_l1e(v, ep, shadow_l1e_empty(), smfn); break;
3778 case SH_type_l2_shadow:
3779 #if GUEST_PAGING_LEVELS >= 3
3780 case SH_type_l2h_shadow:
3781 #endif
3782 (void) shadow_set_l2e(v, ep, shadow_l2e_empty(), smfn); break;
3783 #if GUEST_PAGING_LEVELS >= 4
3784 case SH_type_l3_shadow:
3785 (void) shadow_set_l3e(v, ep, shadow_l3e_empty(), smfn); break;
3786 case SH_type_l4_shadow:
3787 (void) shadow_set_l4e(v, ep, shadow_l4e_empty(), smfn); break;
3788 #endif
3789 default: BUG(); /* Called with the wrong kind of shadow. */
3793 int sh_remove_l1_shadow(struct vcpu *v, mfn_t sl2mfn, mfn_t sl1mfn)
3794 /* Remove all mappings of this l1 shadow from this l2 shadow */
3796 shadow_l2e_t *sl2e;
3797 int done = 0;
3798 int flags;
3800 SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, done, v->domain,
3802 flags = shadow_l2e_get_flags(*sl2e);
3803 if ( (flags & _PAGE_PRESENT)
3804 && (mfn_x(shadow_l2e_get_mfn(*sl2e)) == mfn_x(sl1mfn)) )
3806 (void) shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn);
3807 if ( mfn_to_shadow_page(sl1mfn)->type == 0 )
3808 /* This breaks us cleanly out of the FOREACH macro */
3809 done = 1;
3811 });
3812 return done;
3815 #if GUEST_PAGING_LEVELS >= 4
3816 int sh_remove_l2_shadow(struct vcpu *v, mfn_t sl3mfn, mfn_t sl2mfn)
3817 /* Remove all mappings of this l2 shadow from this l3 shadow */
3819 shadow_l3e_t *sl3e;
3820 int done = 0;
3821 int flags;
3823 SHADOW_FOREACH_L3E(sl3mfn, sl3e, 0, done,
3825 flags = shadow_l3e_get_flags(*sl3e);
3826 if ( (flags & _PAGE_PRESENT)
3827 && (mfn_x(shadow_l3e_get_mfn(*sl3e)) == mfn_x(sl2mfn)) )
3829 (void) shadow_set_l3e(v, sl3e, shadow_l3e_empty(), sl3mfn);
3830 if ( mfn_to_shadow_page(sl2mfn)->type == 0 )
3831 /* This breaks us cleanly out of the FOREACH macro */
3832 done = 1;
3834 });
3835 return done;
3838 int sh_remove_l3_shadow(struct vcpu *v, mfn_t sl4mfn, mfn_t sl3mfn)
3839 /* Remove all mappings of this l3 shadow from this l4 shadow */
3841 shadow_l4e_t *sl4e;
3842 int done = 0;
3843 int flags;
3845 SHADOW_FOREACH_L4E(sl4mfn, sl4e, 0, done, v->domain,
3847 flags = shadow_l4e_get_flags(*sl4e);
3848 if ( (flags & _PAGE_PRESENT)
3849 && (mfn_x(shadow_l4e_get_mfn(*sl4e)) == mfn_x(sl3mfn)) )
3851 (void) shadow_set_l4e(v, sl4e, shadow_l4e_empty(), sl4mfn);
3852 if ( mfn_to_shadow_page(sl3mfn)->type == 0 )
3853 /* This breaks us cleanly out of the FOREACH macro */
3854 done = 1;
3856 });
3857 return done;
3859 #endif /* 64bit guest */
3861 /**************************************************************************/
3862 /* Handling HVM guest writes to pagetables */
3864 /* Check that the user is allowed to perform this write.
3865 * Returns a mapped pointer to write to, and the mfn it's on,
3866 * or NULL for error. */
3867 static inline void * emulate_map_dest(struct vcpu *v,
3868 unsigned long vaddr,
3869 struct sh_emulate_ctxt *sh_ctxt,
3870 mfn_t *mfnp)
3872 walk_t gw;
3873 u32 flags, errcode;
3874 gfn_t gfn;
3875 mfn_t mfn;
3877 guest_walk_tables(v, vaddr, &gw, 1);
3878 flags = accumulate_guest_flags(v, &gw);
3879 gfn = guest_l1e_get_gfn(gw.eff_l1e);
3880 mfn = vcpu_gfn_to_mfn(v, gfn);
3881 sh_audit_gw(v, &gw);
3882 unmap_walk(v, &gw);
3884 if ( !(flags & _PAGE_PRESENT) )
3886 errcode = 0;
3887 goto page_fault;
3890 if ( !(flags & _PAGE_RW) ||
3891 (!(flags & _PAGE_USER) && ring_3(sh_ctxt->ctxt.regs)) )
3893 errcode = PFEC_page_present;
3894 goto page_fault;
3897 if ( !mfn_valid(mfn) )
3898 return NULL;
3900 *mfnp = mfn;
3901 return sh_map_domain_page(mfn) + (vaddr & ~PAGE_MASK);
3903 page_fault:
3904 errcode |= PFEC_write_access;
3905 if ( is_hvm_vcpu(v) )
3906 hvm_inject_exception(TRAP_page_fault, errcode, vaddr);
3907 else
3908 propagate_page_fault(vaddr, errcode);
3909 return NULL;
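/* Fault-code recap for the checks above (illustrative):
 *   l1e not present                      -> PFEC_write_access
 *   present but read-only, or
 *   supervisor-only accessed from ring 3 -> PFEC_page_present |
 *                                           PFEC_write_access
 * In either case the fault is injected (HVM) or propagated (PV) and the
 * caller sees NULL. */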
3912 static int safe_not_to_verify_write(mfn_t gmfn, void *dst, void *src,
3913 int bytes)
3915 #if (SHADOW_OPTIMIZATIONS & SHOPT_SKIP_VERIFY)
3916 struct page_info *pg = mfn_to_page(gmfn);
3917 if ( !(pg->shadow_flags & SHF_32)
3918 && ((unsigned long)dst & 7) == 0 )
3920 /* Not shadowed 32-bit: aligned 64-bit writes that leave the
3921 * present bit unset are safe to ignore. */
3922 if ( (*(u64*)src & _PAGE_PRESENT) == 0
3923 && (*(u64*)dst & _PAGE_PRESENT) == 0 )
3924 return 1;
3926 else if ( !(pg->shadow_flags & (SHF_PAE|SHF_64))
3927 && ((unsigned long)dst & 3) == 0 )
3929 /* Not shadowed PAE/64-bit: aligned 32-bit writes that leave the
3930 * present bit unset are safe to ignore. */
3931 if ( (*(u32*)src & _PAGE_PRESENT) == 0
3932 && (*(u32*)dst & _PAGE_PRESENT) == 0 )
3933 return 1;
3935 #endif
3936 return 0;
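#if 0
/* Standalone sketch of the skip-verify test above with plain integers
 * (illustrative only): an aligned, PTE-sized write that leaves the
 * present bit clear in both the old and the new value cannot create a
 * mapping, so shadow revalidation can safely be skipped. */
static int eg_skip_verify_64(unsigned long long old_pte,
                             unsigned long long new_pte,
                             unsigned long dst)
{
    const unsigned long long present = 0x1;     /* _PAGE_PRESENT */
    return ((dst & 7) == 0                      /* 8-byte aligned */
            && (old_pte & present) == 0
            && (new_pte & present) == 0);
}
#endif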
3940 int
3941 sh_x86_emulate_write(struct vcpu *v, unsigned long vaddr, void *src,
3942 u32 bytes, struct sh_emulate_ctxt *sh_ctxt)
3944 mfn_t mfn;
3945 void *addr;
3946 int skip;
3948 if ( vaddr & (bytes-1) )
3949 return X86EMUL_UNHANDLEABLE;
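/* Example (illustrative): for a 4-byte write, vaddr & (bytes-1) is
 * vaddr & 3, so an address ending in hex 002 is rejected as unaligned
 * while one ending in hex 004 is accepted. */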
3951 ASSERT(shadow_locked_by_me(v->domain));
3952 ASSERT(((vaddr & ~PAGE_MASK) + bytes) <= PAGE_SIZE);
3954 if ( (addr = emulate_map_dest(v, vaddr, sh_ctxt, &mfn)) == NULL )
3955 return X86EMUL_EXCEPTION;
3957 skip = safe_not_to_verify_write(mfn, addr, src, bytes);
3958 memcpy(addr, src, bytes);
3959 if ( !skip ) sh_validate_guest_pt_write(v, mfn, addr, bytes);
3961 /* If we are writing zeros to this page, might want to unshadow */
3962 if ( likely(bytes >= 4) && (*(u32 *)addr == 0) && is_lo_pte(vaddr) )
3963 check_for_early_unshadow(v, mfn);
3965 sh_mark_dirty(v->domain, mfn);
3967 sh_unmap_domain_page(addr);
3968 shadow_audit_tables(v);
3969 return X86EMUL_OKAY;
3972 int
3973 sh_x86_emulate_cmpxchg(struct vcpu *v, unsigned long vaddr,
3974 unsigned long old, unsigned long new,
3975 unsigned int bytes, struct sh_emulate_ctxt *sh_ctxt)
3977 mfn_t mfn;
3978 void *addr;
3979 unsigned long prev;
3980 int rv = X86EMUL_OKAY, skip;
3982 ASSERT(shadow_locked_by_me(v->domain));
3983 ASSERT(bytes <= sizeof(unsigned long));
3985 if ( vaddr & (bytes-1) )
3986 return X86EMUL_UNHANDLEABLE;
3988 if ( (addr = emulate_map_dest(v, vaddr, sh_ctxt, &mfn)) == NULL )
3989 return X86EMUL_EXCEPTION;
3991 skip = safe_not_to_verify_write(mfn, &new, &old, bytes);
3993 switch ( bytes )
3995 case 1: prev = cmpxchg(((u8 *)addr), old, new); break;
3996 case 2: prev = cmpxchg(((u16 *)addr), old, new); break;
3997 case 4: prev = cmpxchg(((u32 *)addr), old, new); break;
3998 case 8: prev = cmpxchg(((u64 *)addr), old, new); break;
3999 default:
4000 SHADOW_PRINTK("cmpxchg of size %i is not supported\n", bytes);
4001 prev = ~old;
4004 if ( prev == old )
4006 if ( !skip ) sh_validate_guest_pt_write(v, mfn, addr, bytes);
4008 else
4009 rv = X86EMUL_CMPXCHG_FAILED;
4011 SHADOW_DEBUG(EMULATE, "va %#lx was %#lx expected %#lx"
4012 " wanted %#lx now %#lx bytes %u\n",
4013 vaddr, prev, old, new, *(unsigned long *)addr, bytes);
4015 /* If we are writing zeros to this page, might want to unshadow */
4016 if ( likely(bytes >= 4) && (*(u32 *)addr == 0) && is_lo_pte(vaddr) )
4017 check_for_early_unshadow(v, mfn);
4019 sh_mark_dirty(v->domain, mfn);
4021 sh_unmap_domain_page(addr);
4022 shadow_audit_tables(v);
4023 return rv;
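#if 0
/* User-space analogue of the success test above (illustrative; uses the
 * GCC __sync builtin rather than Xen's cmpxchg()): the emulated CMPXCHG
 * only counts as successful if the value read back equals the expected
 * old value, otherwise the caller returns X86EMUL_CMPXCHG_FAILED. */
static int eg_cmpxchg_ok(unsigned long *addr, unsigned long old,
                         unsigned long new)
{
    unsigned long prev = __sync_val_compare_and_swap(addr, old, new);
    return (prev == old);
}
#endif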
4026 int
4027 sh_x86_emulate_cmpxchg8b(struct vcpu *v, unsigned long vaddr,
4028 unsigned long old_lo, unsigned long old_hi,
4029 unsigned long new_lo, unsigned long new_hi,
4030 struct sh_emulate_ctxt *sh_ctxt)
4032 mfn_t mfn;
4033 void *addr;
4034 u64 old, new, prev;
4035 int rv = X86EMUL_OKAY, skip;
4037 ASSERT(shadow_locked_by_me(v->domain));
4039 if ( vaddr & 7 )
4040 return X86EMUL_UNHANDLEABLE;
4042 if ( (addr = emulate_map_dest(v, vaddr, sh_ctxt, &mfn)) == NULL )
4043 return X86EMUL_EXCEPTION;
4045 old = (((u64) old_hi) << 32) | (u64) old_lo;
4046 new = (((u64) new_hi) << 32) | (u64) new_lo;
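/* Worked example for the recombination above (illustrative):
 * old_hi = 0x00000001, old_lo = 0xdeadbeef
 *   => old = (0x00000001ULL << 32) | 0xdeadbeef = 0x00000001deadbeef */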
4047 skip = safe_not_to_verify_write(mfn, &new, &old, 8);
4048 prev = cmpxchg(((u64 *)addr), old, new);
4050 if ( prev == old )
4052 if ( !skip ) sh_validate_guest_pt_write(v, mfn, addr, 8);
4054 else
4055 rv = X86EMUL_CMPXCHG_FAILED;
4057 /* If we are writing zeros to this page, might want to unshadow */
4058 if ( *(u32 *)addr == 0 )
4059 check_for_early_unshadow(v, mfn);
4061 sh_mark_dirty(v->domain, mfn);
4063 sh_unmap_domain_page(addr);
4064 shadow_audit_tables(v);
4065 return rv;
4069 /**************************************************************************/
4070 /* Audit tools */
4072 #if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES
4074 #define AUDIT_FAIL(_level, _fmt, _a...) do { \
4075 printk("Shadow %u-on-%u audit failed at level %i, index %i\n" \
4076 "gl" #_level "mfn = %" PRI_mfn \
4077 " sl" #_level "mfn = %" PRI_mfn \
4078 " &gl" #_level "e = %p &sl" #_level "e = %p" \
4079 " gl" #_level "e = %" SH_PRI_gpte \
4080 " sl" #_level "e = %" SH_PRI_pte "\nError: " _fmt "\n", \
4081 GUEST_PAGING_LEVELS, SHADOW_PAGING_LEVELS, \
4082 _level, guest_index(gl ## _level ## e), \
4083 mfn_x(gl ## _level ## mfn), mfn_x(sl ## _level ## mfn), \
4084 gl ## _level ## e, sl ## _level ## e, \
4085 gl ## _level ## e->l ## _level, sl ## _level ## e->l ## _level, \
4086 ##_a); \
4087 BUG(); \
4088 done = 1; \
4089 } while (0)
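/* Usage note (illustrative): the macro token-pastes _level into local
 * names, so e.g. AUDIT_FAIL(2, "%s", s) requires gl2e, sl2e, gl2mfn and
 * sl2mfn to be in scope at the call site, as they are in the audit
 * walkers below; it prints them, BUG()s, and sets done = 1. */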
4092 static char * sh_audit_flags(struct vcpu *v, int level,
4093 int gflags, int sflags)
4094 /* Common code for auditing flag bits */
4096 if ( (sflags & _PAGE_PRESENT) && !(gflags & _PAGE_PRESENT) )
4097 return "shadow is present but guest is not present";
4098 if ( (sflags & _PAGE_GLOBAL) && !is_hvm_vcpu(v) )
4099 return "global bit set in PV shadow";
4100 if ( level == 2 && (sflags & _PAGE_PSE) )
4101 return "PS bit set in shadow";
4102 #if SHADOW_PAGING_LEVELS == 3
4103 if ( level == 3 ) return NULL; /* All the other bits are blank in a PAE l3 */
4104 #endif
4105 if ( (sflags & _PAGE_PRESENT) && !(gflags & _PAGE_ACCESSED) )
4106 return "accessed bit not propagated";
4107 if ( (level == 1 || (level == 2 && (gflags & _PAGE_PSE)))
4108 && ((sflags & _PAGE_RW) && !(gflags & _PAGE_DIRTY)) )
4109 return "dirty bit not propagated";
4110 if ( (sflags & _PAGE_USER) != (gflags & _PAGE_USER) )
4111 return "user/supervisor bit does not match";
4112 if ( (sflags & _PAGE_NX_BIT) != (gflags & _PAGE_NX_BIT) )
4113 return "NX bit does not match";
4114 if ( (sflags & _PAGE_RW) && !(gflags & _PAGE_RW) )
4115 return "shadow grants write access but guest does not";
4116 return NULL;
4119 static inline mfn_t
4120 audit_gfn_to_mfn(struct vcpu *v, gfn_t gfn, mfn_t gmfn)
4121 /* Convert this gfn to an mfn in the manner appropriate for the
4122 * guest pagetable it's used in (gmfn) */
4124 if ( !shadow_mode_translate(v->domain) )
4125 return _mfn(gfn_x(gfn));
4127 if ( (mfn_to_page(gmfn)->u.inuse.type_info & PGT_type_mask)
4128 != PGT_writable_page )
4129 return _mfn(gfn_x(gfn)); /* This is a paging-disabled shadow */
4130 else
4131 return gfn_to_mfn(v->domain, gfn_x(gfn));
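/* Case summary for the translation above (illustrative): non-translated
 * domains use gfn == mfn directly; if the guest pagetable page is not
 * PGT_writable_page it is a paging-disabled shadow of the p2m, whose
 * "gfns" are already mfns; otherwise do a real p2m lookup. */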
4135 int sh_audit_l1_table(struct vcpu *v, mfn_t sl1mfn, mfn_t x)
4137 guest_l1e_t *gl1e, *gp;
4138 shadow_l1e_t *sl1e;
4139 mfn_t mfn, gmfn, gl1mfn;
4140 gfn_t gfn;
4141 char *s;
4142 int done = 0;
4144 /* Follow the backpointer */
4145 gl1mfn = _mfn(mfn_to_shadow_page(sl1mfn)->backpointer);
4146 gl1e = gp = sh_map_domain_page(gl1mfn);
4147 SHADOW_FOREACH_L1E(sl1mfn, sl1e, &gl1e, done, {
4149 if ( sh_l1e_is_magic(*sl1e) )
4151 #if (SHADOW_OPTIMIZATIONS & SHOPT_FAST_FAULT_PATH) && SHADOW_PAGING_LEVELS > 2
4152 if ( sh_l1e_is_gnp(*sl1e) )
4154 if ( guest_l1e_get_flags(*gl1e) & _PAGE_PRESENT )
4155 AUDIT_FAIL(1, "shadow is GNP magic but guest is present");
4157 else
4159 ASSERT(sh_l1e_is_mmio(*sl1e));
4160 gfn = sh_l1e_mmio_get_gfn(*sl1e);
4161 if ( gfn_x(gfn) != gfn_x(guest_l1e_get_gfn(*gl1e)) )
4162 AUDIT_FAIL(1, "shadow MMIO gfn is %" SH_PRI_gfn
4163 " but guest gfn is %" SH_PRI_gfn,
4164 gfn_x(gfn),
4165 gfn_x(guest_l1e_get_gfn(*gl1e)));
4167 #endif
4169 else
4171 s = sh_audit_flags(v, 1, guest_l1e_get_flags(*gl1e),
4172 shadow_l1e_get_flags(*sl1e));
4173 if ( s ) AUDIT_FAIL(1, "%s", s);
4175 if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS )
4177 gfn = guest_l1e_get_gfn(*gl1e);
4178 mfn = shadow_l1e_get_mfn(*sl1e);
4179 gmfn = audit_gfn_to_mfn(v, gfn, gl1mfn);
4180 if ( mfn_x(gmfn) != mfn_x(mfn) )
4181 AUDIT_FAIL(1, "bad translation: gfn %" SH_PRI_gfn
4182 " --> %" PRI_mfn " != mfn %" PRI_mfn,
4183 gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn));
4186 });
4187 sh_unmap_domain_page(gp);
4188 return done;
4191 int sh_audit_fl1_table(struct vcpu *v, mfn_t sl1mfn, mfn_t x)
4193 guest_l1e_t *gl1e, e;
4194 shadow_l1e_t *sl1e;
4195 mfn_t gl1mfn = _mfn(INVALID_MFN);
4196 int f;
4197 int done = 0;
4199 /* fl1 has no useful backpointer: all we can check are flags */
4200 e = guest_l1e_from_gfn(_gfn(0), 0); gl1e = &e; /* Needed for macro */
4201 SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, done, {
4202 f = shadow_l1e_get_flags(*sl1e);
4203 f &= ~(_PAGE_AVAIL0|_PAGE_AVAIL1|_PAGE_AVAIL2);
4204 if ( !(f == 0
4205 || f == (_PAGE_PRESENT|_PAGE_USER|_PAGE_RW|
4206 _PAGE_ACCESSED|_PAGE_DIRTY)
4207 || f == (_PAGE_PRESENT|_PAGE_USER|_PAGE_ACCESSED|_PAGE_DIRTY)
4208 || sh_l1e_is_magic(*sl1e)) )
4209 AUDIT_FAIL(1, "fl1e has bad flags");
4210 });
4211 return 0;
4214 int sh_audit_l2_table(struct vcpu *v, mfn_t sl2mfn, mfn_t x)
4216 guest_l2e_t *gl2e, *gp;
4217 shadow_l2e_t *sl2e;
4218 mfn_t mfn, gmfn, gl2mfn;
4219 gfn_t gfn;
4220 char *s;
4221 int done = 0;
4223 /* Follow the backpointer */
4224 gl2mfn = _mfn(mfn_to_shadow_page(sl2mfn)->backpointer);
4225 gl2e = gp = sh_map_domain_page(gl2mfn);
4226 SHADOW_FOREACH_L2E(sl2mfn, sl2e, &gl2e, done, v->domain, {
4228 s = sh_audit_flags(v, 2, guest_l2e_get_flags(*gl2e),
4229 shadow_l2e_get_flags(*sl2e));
4230 if ( s ) AUDIT_FAIL(2, "%s", s);
4232 if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS )
4234 gfn = guest_l2e_get_gfn(*gl2e);
4235 mfn = shadow_l2e_get_mfn(*sl2e);
4236 gmfn = (guest_l2e_get_flags(*gl2e) & _PAGE_PSE)
4237 ? get_fl1_shadow_status(v, gfn)
4238 : get_shadow_status(v, audit_gfn_to_mfn(v, gfn, gl2mfn),
4239 SH_type_l1_shadow);
4240 if ( mfn_x(gmfn) != mfn_x(mfn) )
4241 AUDIT_FAIL(2, "bad translation: gfn %" SH_PRI_gfn
4242 " (--> %" PRI_mfn ")"
4243 " --> %" PRI_mfn " != mfn %" PRI_mfn,
4244 gfn_x(gfn),
4245 (guest_l2e_get_flags(*gl2e) & _PAGE_PSE) ? 0
4246 : mfn_x(audit_gfn_to_mfn(v, gfn, gl2mfn)),
4247 mfn_x(gmfn), mfn_x(mfn));
4249 });
4250 sh_unmap_domain_page(gp);
4251 return 0;
4254 #if GUEST_PAGING_LEVELS >= 4
4255 int sh_audit_l3_table(struct vcpu *v, mfn_t sl3mfn, mfn_t x)
4257 guest_l3e_t *gl3e, *gp;
4258 shadow_l3e_t *sl3e;
4259 mfn_t mfn, gmfn, gl3mfn;
4260 gfn_t gfn;
4261 char *s;
4262 int done = 0;
4264 /* Follow the backpointer */
4265 gl3mfn = _mfn(mfn_to_shadow_page(sl3mfn)->backpointer);
4266 gl3e = gp = sh_map_domain_page(gl3mfn);
4267 SHADOW_FOREACH_L3E(sl3mfn, sl3e, &gl3e, done, {
4269 s = sh_audit_flags(v, 3, guest_l3e_get_flags(*gl3e),
4270 shadow_l3e_get_flags(*sl3e));
4271 if ( s ) AUDIT_FAIL(3, "%s", s);
4273 if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS )
4275 gfn = guest_l3e_get_gfn(*gl3e);
4276 mfn = shadow_l3e_get_mfn(*sl3e);
4277 gmfn = get_shadow_status(v, audit_gfn_to_mfn(v, gfn, gl3mfn),
4278 ((GUEST_PAGING_LEVELS == 3 ||
4279 IS_COMPAT(v->domain))
4280 && !shadow_mode_external(v->domain)
4281 && (guest_index(gl3e) % 4) == 3)
4282 ? SH_type_l2h_shadow
4283 : SH_type_l2_shadow);
4284 if ( mfn_x(gmfn) != mfn_x(mfn) )
4285 AUDIT_FAIL(3, "bad translation: gfn %" SH_PRI_gfn
4286 " --> %" PRI_mfn " != mfn %" PRI_mfn,
4287 gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn));
4289 });
4290 sh_unmap_domain_page(gp);
4291 return 0;
4294 int sh_audit_l4_table(struct vcpu *v, mfn_t sl4mfn, mfn_t x)
4296 guest_l4e_t *gl4e, *gp;
4297 shadow_l4e_t *sl4e;
4298 mfn_t mfn, gmfn, gl4mfn;
4299 gfn_t gfn;
4300 char *s;
4301 int done = 0;
4303 /* Follow the backpointer */
4304 gl4mfn = _mfn(mfn_to_shadow_page(sl4mfn)->backpointer);
4305 gl4e = gp = sh_map_domain_page(gl4mfn);
4306 SHADOW_FOREACH_L4E(sl4mfn, sl4e, &gl4e, done, v->domain,
4308 s = sh_audit_flags(v, 4, guest_l4e_get_flags(*gl4e),
4309 shadow_l4e_get_flags(*sl4e));
4310 if ( s ) AUDIT_FAIL(4, "%s", s);
4312 if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS )
4314 gfn = guest_l4e_get_gfn(*gl4e);
4315 mfn = shadow_l4e_get_mfn(*sl4e);
4316 gmfn = get_shadow_status(v, audit_gfn_to_mfn(v, gfn, gl4mfn),
4317 SH_type_l3_shadow);
4318 if ( mfn_x(gmfn) != mfn_x(mfn) )
4319 AUDIT_FAIL(4, "bad translation: gfn %" SH_PRI_gfn
4320 " --> %" PRI_mfn " != mfn %" PRI_mfn,
4321 gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn));
4323 });
4324 sh_unmap_domain_page(gp);
4325 return 0;
4327 #endif /* GUEST_PAGING_LEVELS >= 4 */
4330 #undef AUDIT_FAIL
4332 #endif /* Audit code */
4334 /**************************************************************************/
4335 /* Entry points into this mode of the shadow code.
4336 * This will all be mangled by the preprocessor to uniquify everything. */
4337 struct paging_mode sh_paging_mode = {
4338 .page_fault = sh_page_fault,
4339 .invlpg = sh_invlpg,
4340 .gva_to_gfn = sh_gva_to_gfn,
4341 .update_cr3 = sh_update_cr3,
4342 .update_paging_modes = shadow_update_paging_modes,
4343 .write_p2m_entry = shadow_write_p2m_entry,
4344 .write_guest_entry = shadow_write_guest_entry,
4345 .cmpxchg_guest_entry = shadow_cmpxchg_guest_entry,
4346 .guest_map_l1e = sh_guest_map_l1e,
4347 .guest_get_eff_l1e = sh_guest_get_eff_l1e,
4348 .guest_levels = GUEST_PAGING_LEVELS,
4349 .shadow.detach_old_tables = sh_detach_old_tables,
4350 .shadow.x86_emulate_write = sh_x86_emulate_write,
4351 .shadow.x86_emulate_cmpxchg = sh_x86_emulate_cmpxchg,
4352 .shadow.x86_emulate_cmpxchg8b = sh_x86_emulate_cmpxchg8b,
4353 .shadow.make_monitor_table = sh_make_monitor_table,
4354 .shadow.destroy_monitor_table = sh_destroy_monitor_table,
4355 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
4356 .shadow.guess_wrmap = sh_guess_wrmap,
4357 #endif
4358 .shadow.shadow_levels = SHADOW_PAGING_LEVELS,
4359 };
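#if 0
/* Hypothetical call site (illustrative only): the rest of the hypervisor
 * reaches this mode's code through the table above, e.g. */
static void eg_dispatch_cr3_update(struct vcpu *v)
{
    v->arch.paging.mode->update_cr3(v, 1 /* take the shadow lock */);
}
#endif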
4361 /*
4362 * Local variables:
4363 * mode: C
4364 * c-set-style: "BSD"
4365 * c-basic-offset: 4
4366 * indent-tabs-mode: nil
4367 * End:
4368 */