ia64/xen-unstable

view xen/arch/x86/mm/shadow/multi.c @ 19822:1b6616141e82

x86 shadow: Fix a few SHOPT_OUT_OF_SYNC ifdefs.

Signed-off-by: Keir Fraser <keir.fraser@citrix.com>
author Keir Fraser <keir.fraser@citrix.com>
date Wed Jun 24 10:47:07 2009 +0100 (2009-06-24)
parents cecc76506afc
1 /******************************************************************************
2 * arch/x86/mm/shadow/multi.c
3 *
4 * Simple, mostly-synchronous shadow page tables.
5 * Parts of this code are Copyright (c) 2006 by XenSource Inc.
6 * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
7 * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 */
24 #include <xen/config.h>
25 #include <xen/types.h>
26 #include <xen/mm.h>
27 #include <xen/trace.h>
28 #include <xen/sched.h>
29 #include <xen/perfc.h>
30 #include <xen/domain_page.h>
31 #include <asm/page.h>
32 #include <asm/current.h>
33 #include <asm/shadow.h>
34 #include <asm/flushtlb.h>
35 #include <asm/hvm/hvm.h>
36 #include <asm/hvm/cacheattr.h>
37 #include <asm/mtrr.h>
38 #include <asm/guest_pt.h>
39 #include "private.h"
40 #include "types.h"
42 /* THINGS TO DO LATER:
43 *
44 * TEARDOWN HEURISTICS
45 * Also: have a heuristic for when to destroy a previous paging-mode's
46 * shadows. When a guest is done with its start-of-day 32-bit tables
47 * and reuses the memory we want to drop those shadows. Start by using
48 * a page being shadowed in two modes as a hint, but beware of clever
49 * tricks like reusing a pagetable for both PAE and 64-bit during boot...
50 *
51 * PAE LINEAR MAPS
52 * Rework shadow_get_l*e() to have the option of using map_domain_page()
53 * instead of linear maps. Add appropriate unmap_l*e calls in the users.
54 * Then we can test the speed difference made by linear maps. If the
55 * map_domain_page() version is OK on PAE, we could maybe allow a lightweight
56 * l3-and-l2h-only shadow mode for PAE PV guests that would allow them
57 * to share l2h pages again.
58 *
59 * PSE disabled / PSE36
60 * We don't support any modes other than PSE enabled, PSE36 disabled.
61 * Neither of those would be hard to change, but we'd need to be able to
62 * deal with shadows made in one mode and used in another.
63 */
65 #define FETCH_TYPE_PREFETCH 1
66 #define FETCH_TYPE_DEMAND 2
67 #define FETCH_TYPE_WRITE 4
68 typedef enum {
69 ft_prefetch = FETCH_TYPE_PREFETCH,
70 ft_demand_read = FETCH_TYPE_DEMAND,
71 ft_demand_write = FETCH_TYPE_DEMAND | FETCH_TYPE_WRITE,
72 } fetch_type_t;
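/* This encoding lets later code test individual bits rather than compare
 * whole enum values: e.g. _sh_propagate() below checks
 *
 *     if ( ft & FETCH_TYPE_WRITE )
 *         paging_mark_dirty(d, mfn_x(target_mfn));
 *
 * which catches ft_demand_write (DEMAND|WRITE) without a separate
 * comparison against the enum value itself. */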
74 #ifdef DEBUG_TRACE_DUMP
75 static char *fetch_type_names[] = {
76 [ft_prefetch] = "prefetch",
77 [ft_demand_read] = "demand read",
78 [ft_demand_write] = "demand write",
79 };
80 #endif
82 /**************************************************************************/
83 /* Hash table mapping from guest pagetables to shadows
84 *
85 * Normal case: maps the mfn of a guest page to the mfn of its shadow page.
86 * FL1's: maps the *gfn* of the start of a superpage to the mfn of a
87 * shadow L1 which maps its "splinters".
88 */
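/* For example: an ordinary guest L1 page's shadow is keyed on the guest
 * page's own mfn with type SH_type_l1_shadow, whereas a guest superpage
 * mapping is "splintered" into an FL1 shadow keyed on the gfn of the
 * start of the superpage, with type SH_type_fl1_shadow (see
 * make_fl1_shadow() below). */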
90 static inline mfn_t
91 get_fl1_shadow_status(struct vcpu *v, gfn_t gfn)
92 /* Look for FL1 shadows in the hash table */
93 {
94 mfn_t smfn = shadow_hash_lookup(v, gfn_x(gfn), SH_type_fl1_shadow);
95 return smfn;
96 }
98 static inline mfn_t
99 get_shadow_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type)
100 /* Look for shadows in the hash table */
101 {
102 mfn_t smfn = shadow_hash_lookup(v, mfn_x(gmfn), shadow_type);
103 perfc_incr(shadow_get_shadow_status);
104 return smfn;
105 }
107 static inline void
108 set_fl1_shadow_status(struct vcpu *v, gfn_t gfn, mfn_t smfn)
109 /* Put an FL1 shadow into the hash table */
110 {
111 SHADOW_PRINTK("gfn=%"SH_PRI_gfn", type=%08x, smfn=%05lx\n",
112 gfn_x(gfn), SH_type_fl1_shadow, mfn_x(smfn));
114 shadow_hash_insert(v, gfn_x(gfn), SH_type_fl1_shadow, smfn);
115 }
117 static inline void
118 set_shadow_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type, mfn_t smfn)
119 /* Put a shadow into the hash table */
120 {
121 struct domain *d = v->domain;
122 int res;
124 SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx, type=%08x, smfn=%05lx\n",
125 d->domain_id, v->vcpu_id, mfn_x(gmfn),
126 shadow_type, mfn_x(smfn));
128 /* 32-on-64 PV guests don't own their l4 pages so can't get_page them */
129 if ( !is_pv_32on64_vcpu(v) || shadow_type != SH_type_l4_64_shadow )
130 {
131 res = get_page(mfn_to_page(gmfn), d);
132 ASSERT(res == 1);
133 }
135 shadow_hash_insert(v, mfn_x(gmfn), shadow_type, smfn);
136 }
138 static inline void
139 delete_fl1_shadow_status(struct vcpu *v, gfn_t gfn, mfn_t smfn)
140 /* Remove a shadow from the hash table */
141 {
142 SHADOW_PRINTK("gfn=%"SH_PRI_gfn", type=%08x, smfn=%05lx\n",
143 gfn_x(gfn), SH_type_fl1_shadow, mfn_x(smfn));
144 shadow_hash_delete(v, gfn_x(gfn), SH_type_fl1_shadow, smfn);
145 }
147 static inline void
148 delete_shadow_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type, mfn_t smfn)
149 /* Remove a shadow from the hash table */
150 {
151 SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx, type=%08x, smfn=%05lx\n",
152 v->domain->domain_id, v->vcpu_id,
153 mfn_x(gmfn), shadow_type, mfn_x(smfn));
154 shadow_hash_delete(v, mfn_x(gmfn), shadow_type, smfn);
155 /* 32-on-64 PV guests don't own their l4 pages; see set_shadow_status */
156 if ( !is_pv_32on64_vcpu(v) || shadow_type != SH_type_l4_64_shadow )
157 put_page(mfn_to_page(gmfn));
158 }
161 /**************************************************************************/
162 /* Functions for walking the guest page tables */
164 static inline uint32_t
165 sh_walk_guest_tables(struct vcpu *v, unsigned long va, walk_t *gw,
166 uint32_t pfec)
167 {
168 return guest_walk_tables(v, va, gw, pfec,
169 #if GUEST_PAGING_LEVELS == 3 /* PAE */
170 _mfn(INVALID_MFN),
171 v->arch.paging.shadow.gl3e
172 #else /* 32 or 64 */
173 pagetable_get_mfn(v->arch.guest_table),
174 v->arch.paging.shadow.guest_vtable
175 #endif
176 );
177 }
179 /* This validation is called with the shadow lock held and after write
180 * permission removal, so the check is atomic and no inconsistent
181 * content can be observed before the lock is released.
182 *
183 * Returns 1 to indicate success and 0 for inconsistency.
184 */
185 static inline uint32_t
186 shadow_check_gwalk(struct vcpu *v, unsigned long va, walk_t *gw, int version)
187 {
188 struct domain *d = v->domain;
189 guest_l1e_t *l1p;
190 guest_l2e_t *l2p;
191 #if GUEST_PAGING_LEVELS >= 4
192 guest_l3e_t *l3p;
193 guest_l4e_t *l4p;
194 #endif
195 int mismatch = 0;
197 ASSERT(shadow_locked_by_me(d));
199 if ( version == atomic_read(&d->arch.paging.shadow.gtable_dirty_version) )
200 return 1;
202 /* We could cache the guest page mappings from the last
203 * guest table walk. However, this check happens relatively
204 * infrequently, so the small cost of remapping the guest
205 * pages here is better than caching the mappings in every
206 * guest table walk.
207 *
208 * Also, when an inconsistency is found, simply return and
209 * trigger another fault instead of re-validating the new
210 * path, to keep the logic simple.
211 */
212 perfc_incr(shadow_check_gwalk);
213 #if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
214 #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
215 l4p = (guest_l4e_t *)v->arch.paging.shadow.guest_vtable;
216 mismatch |= (gw->l4e.l4 != l4p[guest_l4_table_offset(va)].l4);
217 l3p = sh_map_domain_page(gw->l3mfn);
218 mismatch |= (gw->l3e.l3 != l3p[guest_l3_table_offset(va)].l3);
219 sh_unmap_domain_page(l3p);
220 #else
221 mismatch |= (gw->l3e.l3 !=
222 v->arch.paging.shadow.gl3e[guest_l3_table_offset(va)].l3);
223 #endif
224 l2p = sh_map_domain_page(gw->l2mfn);
225 mismatch |= (gw->l2e.l2 != l2p[guest_l2_table_offset(va)].l2);
226 sh_unmap_domain_page(l2p);
227 #else
228 l2p = (guest_l2e_t *)v->arch.paging.shadow.guest_vtable;
229 mismatch |= (gw->l2e.l2 != l2p[guest_l2_table_offset(va)].l2);
230 #endif
231 if ( !(guest_supports_superpages(v) &&
232 (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE)) )
233 {
234 l1p = sh_map_domain_page(gw->l1mfn);
235 mismatch |= (gw->l1e.l1 != l1p[guest_l1_table_offset(va)].l1);
236 sh_unmap_domain_page(l1p);
237 }
239 return !mismatch;
240 }
242 /* Remove write access permissions from a gwalk_t in a batch, and
243 * return OR-ed flags indicating whether a TLB flush is needed and
244 * whether the guest pages must be re-walked.
245 *
246 * Syncing a page will remove write access to that page; but it may
247 * also give write access to other pages in the path. If we resync any
248 * pages, re-walk from the beginning.
249 */
250 #define GW_RMWR_FLUSHTLB 1
251 #define GW_RMWR_REWALK 2
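/* A sketch of how the caller (sh_page_fault(), later in this file)
 * combines these flags: a FLUSHTLB result only requires stale TLB
 * entries on other vcpus to be flushed, while a REWALK result forces
 * the guest walk to be redone from scratch.  Roughly:
 *
 *     rc = gw_remove_write_accesses(v, va, &gw);
 *     if ( rc & GW_RMWR_FLUSHTLB )
 *         flush_tlb_mask(...);
 *     if ( rc & GW_RMWR_REWALK )
 *         goto rewalk;
 */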
253 static inline uint32_t
254 gw_remove_write_accesses(struct vcpu *v, unsigned long va, walk_t *gw)
255 {
256 uint32_t rc = 0;
258 #if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
259 #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
260 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
261 if ( mfn_is_out_of_sync(gw->l3mfn) )
262 {
263 sh_resync(v, gw->l3mfn);
264 rc = GW_RMWR_REWALK;
265 }
266 else
267 #endif /* OOS */
268 if ( sh_remove_write_access(v, gw->l3mfn, 3, va) )
269 rc = GW_RMWR_FLUSHTLB;
270 #endif /* GUEST_PAGING_LEVELS >= 4 */
272 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
273 if ( mfn_is_out_of_sync(gw->l2mfn) )
274 {
275 sh_resync(v, gw->l2mfn);
276 rc |= GW_RMWR_REWALK;
277 }
278 else
279 #endif /* OOS */
280 if ( sh_remove_write_access(v, gw->l2mfn, 2, va) )
281 rc |= GW_RMWR_FLUSHTLB;
282 #endif /* GUEST_PAGING_LEVELS >= 3 */
284 if ( !(guest_supports_superpages(v) &&
285 (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE))
286 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
287 && !mfn_is_out_of_sync(gw->l1mfn)
288 #endif /* OOS */
289 && sh_remove_write_access(v, gw->l1mfn, 1, va) )
290 rc |= GW_RMWR_FLUSHTLB;
292 return rc;
293 }
295 #if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES
296 /* Lightweight audit: pass all the shadows associated with this guest walk
297 * through the audit mechanisms */
298 static void sh_audit_gw(struct vcpu *v, walk_t *gw)
299 {
300 mfn_t smfn;
302 if ( !(SHADOW_AUDIT_ENABLE) )
303 return;
305 #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
306 if ( mfn_valid(gw->l4mfn)
307 && mfn_valid((smfn = get_shadow_status(v, gw->l4mfn,
308 SH_type_l4_shadow))) )
309 (void) sh_audit_l4_table(v, smfn, _mfn(INVALID_MFN));
310 if ( mfn_valid(gw->l3mfn)
311 && mfn_valid((smfn = get_shadow_status(v, gw->l3mfn,
312 SH_type_l3_shadow))) )
313 (void) sh_audit_l3_table(v, smfn, _mfn(INVALID_MFN));
314 #endif /* PAE or 64... */
315 if ( mfn_valid(gw->l2mfn) )
316 {
317 if ( mfn_valid((smfn = get_shadow_status(v, gw->l2mfn,
318 SH_type_l2_shadow))) )
319 (void) sh_audit_l2_table(v, smfn, _mfn(INVALID_MFN));
320 #if GUEST_PAGING_LEVELS == 3
321 if ( mfn_valid((smfn = get_shadow_status(v, gw->l2mfn,
322 SH_type_l2h_shadow))) )
323 (void) sh_audit_l2_table(v, smfn, _mfn(INVALID_MFN));
324 #endif
325 }
326 if ( mfn_valid(gw->l1mfn)
327 && mfn_valid((smfn = get_shadow_status(v, gw->l1mfn,
328 SH_type_l1_shadow))) )
329 (void) sh_audit_l1_table(v, smfn, _mfn(INVALID_MFN));
330 else if ( (guest_l2e_get_flags(gw->l2e) & _PAGE_PRESENT)
331 && (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE)
332 && mfn_valid(
333 (smfn = get_fl1_shadow_status(v, guest_l2e_get_gfn(gw->l2e)))) )
334 (void) sh_audit_fl1_table(v, smfn, _mfn(INVALID_MFN));
335 }
337 #else
338 #define sh_audit_gw(_v, _gw) do {} while(0)
339 #endif /* audit code */
342 #if (CONFIG_PAGING_LEVELS == GUEST_PAGING_LEVELS)
343 void *
344 sh_guest_map_l1e(struct vcpu *v, unsigned long addr,
345 unsigned long *gl1mfn)
346 {
347 void *pl1e = NULL;
348 walk_t gw;
350 ASSERT(shadow_mode_translate(v->domain));
352 // XXX -- this is expensive, but it's easy to cobble together...
353 // FIXME!
355 if ( sh_walk_guest_tables(v, addr, &gw, PFEC_page_present) == 0
356 && mfn_valid(gw.l1mfn) )
357 {
358 if ( gl1mfn )
359 *gl1mfn = mfn_x(gw.l1mfn);
360 pl1e = map_domain_page(mfn_x(gw.l1mfn)) +
361 (guest_l1_table_offset(addr) * sizeof(guest_l1e_t));
362 }
364 return pl1e;
365 }
367 void
368 sh_guest_get_eff_l1e(struct vcpu *v, unsigned long addr, void *eff_l1e)
369 {
370 walk_t gw;
372 ASSERT(shadow_mode_translate(v->domain));
374 // XXX -- this is expensive, but it's easy to cobble together...
375 // FIXME!
377 (void) sh_walk_guest_tables(v, addr, &gw, PFEC_page_present);
378 *(guest_l1e_t *)eff_l1e = gw.l1e;
379 }
380 #endif /* CONFIG == GUEST (== SHADOW) */
382 /**************************************************************************/
383 /* Functions to compute the correct index into a shadow page, given an
384 * index into the guest page (as returned by guest_get_index()).
385 * This is trivial when the shadow and guest use the same sized PTEs, but
386 * gets more interesting when those sizes are mismatched (e.g. 32-bit guest,
387 * PAE- or 64-bit shadows).
388 *
389 * These functions also increment the shadow mfn, when necessary. When PTE
390 * sizes are mismatched, it takes 2 shadow L1 pages for a single guest L1
391 * page. In this case, we allocate 2 contiguous pages for the shadow L1, and
392 * use simple pointer arithmetic on a pointer to the guest L1e to figure out
393 * which shadow page we really want. Similarly, when PTE sizes are
394 * mismatched, we shadow a guest L2 page with 4 shadow L2 pages. (The easiest
395 * way to see this is: a 32-bit guest L2 page maps 4GB of virtual address
396 * space, while a PAE- or 64-bit shadow L2 page maps 1GB of virtual address
397 * space.)
398 *
399 * For PAE guests, for every 32-bytes of guest L3 page table, we use 64-bytes
400 * of shadow (to store both the shadow, and the info that would normally be
401 * stored in page_info fields). This arrangement allows the shadow and the
402 * "page_info" fields to always be stored in the same page (in fact, in
403 * the same cache line), avoiding an extra call to map_domain_page().
404 */
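/* Worked example for the mismatched case (32-bit guest, PAE/64-bit
 * shadows): a guest L1 holds 1024 4-byte entries but a shadow L1 page
 * holds only 512 8-byte entries, so the shadow L1 is a pair of pages.
 * Guest L1 index 700 therefore lands in the second shadow page
 * (700 / 512 == 1) at slot 188 (700 % 512); shadow_l1_index() below
 * performs exactly this calculation, incrementing *smfn and returning
 * the remainder. */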
406 static inline u32
407 guest_index(void *ptr)
408 {
409 return (u32)((unsigned long)ptr & ~PAGE_MASK) / sizeof(guest_l1e_t);
410 }
412 static u32
413 shadow_l1_index(mfn_t *smfn, u32 guest_index)
414 {
415 #if (GUEST_PAGING_LEVELS == 2)
416 *smfn = _mfn(mfn_x(*smfn) +
417 (guest_index / SHADOW_L1_PAGETABLE_ENTRIES));
418 return (guest_index % SHADOW_L1_PAGETABLE_ENTRIES);
419 #else
420 return guest_index;
421 #endif
422 }
424 static u32
425 shadow_l2_index(mfn_t *smfn, u32 guest_index)
426 {
427 #if (GUEST_PAGING_LEVELS == 2)
428 // Because we use 2 shadow l2 entries for each guest entry, the number of
429 // guest entries per shadow page is SHADOW_L2_PAGETABLE_ENTRIES/2
430 //
431 *smfn = _mfn(mfn_x(*smfn) +
432 (guest_index / (SHADOW_L2_PAGETABLE_ENTRIES / 2)));
434 // We multiply by two to get the index of the first of the two entries
435 // used to shadow the specified guest entry.
436 return (guest_index % (SHADOW_L2_PAGETABLE_ENTRIES / 2)) * 2;
437 #else
438 return guest_index;
439 #endif
440 }
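/* Continuing the example above for shadow_l2_index(): each of the four
 * shadow L2 pages covers SHADOW_L2_PAGETABLE_ENTRIES/2 == 256 guest
 * entries, so guest L2 index 300 selects the second shadow page
 * (300 / 256 == 1) and returns slot 88 ((300 % 256) * 2), the first of
 * the pair of shadow entries used for that guest entry. */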
442 #if GUEST_PAGING_LEVELS >= 4
444 static u32
445 shadow_l3_index(mfn_t *smfn, u32 guest_index)
446 {
447 return guest_index;
448 }
450 static u32
451 shadow_l4_index(mfn_t *smfn, u32 guest_index)
452 {
453 return guest_index;
454 }
456 #endif // GUEST_PAGING_LEVELS >= 4
459 /**************************************************************************/
460 /* Function which computes shadow entries from their corresponding guest
461 * entries. This is the "heart" of the shadow code. It operates using
462 * level-1 shadow types, but handles all levels of entry.
463 * Don't call it directly, but use the four wrappers below.
464 */
466 static always_inline void
467 _sh_propagate(struct vcpu *v,
468 guest_intpte_t guest_intpte,
469 mfn_t target_mfn,
470 void *shadow_entry_ptr,
471 int level,
472 fetch_type_t ft,
473 p2m_type_t p2mt)
474 {
475 guest_l1e_t guest_entry = { guest_intpte };
476 shadow_l1e_t *sp = shadow_entry_ptr;
477 struct domain *d = v->domain;
478 struct sh_dirty_vram *dirty_vram = d->arch.hvm_domain.dirty_vram;
479 gfn_t target_gfn = guest_l1e_get_gfn(guest_entry);
480 u32 pass_thru_flags;
481 u32 gflags, sflags;
483 /* We don't shadow PAE l3s */
484 ASSERT(GUEST_PAGING_LEVELS > 3 || level != 3);
486 /* Check there's something for the shadows to map to */
487 if ( !p2m_is_valid(p2mt) )
488 {
489 *sp = shadow_l1e_empty();
490 goto done;
491 }
493 gflags = guest_l1e_get_flags(guest_entry);
495 if ( unlikely(!(gflags & _PAGE_PRESENT)) )
496 {
497 /* If a guest l1 entry is not present, shadow with the magic
498 * guest-not-present entry. */
499 if ( level == 1 )
500 *sp = sh_l1e_gnp();
501 else
502 *sp = shadow_l1e_empty();
503 goto done;
504 }
506 if ( level == 1 && p2mt == p2m_mmio_dm )
507 {
508 /* Guest l1e maps emulated MMIO space */
509 *sp = sh_l1e_mmio(target_gfn, gflags);
510 if ( !d->arch.paging.shadow.has_fast_mmio_entries )
511 d->arch.paging.shadow.has_fast_mmio_entries = 1;
512 goto done;
513 }
515 // Must have a valid target_mfn unless this is a prefetch or an l1
516 // pointing at MMIO space. In the case of a prefetch, an invalid
517 // mfn means that we can not usefully shadow anything, and so we
518 // return early.
519 //
520 if ( !mfn_valid(target_mfn)
521 && !(level == 1 && (!shadow_mode_refcounts(d)
522 || p2mt == p2m_mmio_direct)) )
523 {
524 ASSERT((ft == ft_prefetch));
525 *sp = shadow_l1e_empty();
526 goto done;
527 }
529 // Propagate bits from the guest to the shadow.
530 // Some of these may be overwritten, below.
531 // Since we know the guest's PRESENT bit is set, we also set the shadow's
532 // SHADOW_PRESENT bit.
533 //
534 pass_thru_flags = (_PAGE_ACCESSED | _PAGE_USER |
535 _PAGE_RW | _PAGE_PRESENT);
536 if ( guest_supports_nx(v) )
537 pass_thru_flags |= _PAGE_NX_BIT;
538 if ( !shadow_mode_refcounts(d) && !mfn_valid(target_mfn) )
539 pass_thru_flags |= _PAGE_PAT | _PAGE_PCD | _PAGE_PWT;
540 sflags = gflags & pass_thru_flags;
542 /*
543 * For HVM domains with direct access to MMIO areas, set the correct
544 * caching attributes in the shadows to match what was asked for.
545 */
546 if ( (level == 1) && is_hvm_domain(d) && has_arch_pdevs(d) &&
547 !is_xen_heap_mfn(mfn_x(target_mfn)) )
548 {
549 unsigned int type;
551 /* Compute the PAT index for the shadow page entry when VT-d is
552 * enabled and a device is assigned:
553 * 1) direct MMIO: compute the PAT index with gMTRR=UC and gPAT.
554 * 2) if snoop control is enabled, compute the PAT index as WB.
555 * 3) if snoop control is disabled, compute the PAT index from
556 * gMTRR and gPAT.
557 */
558 if ( hvm_get_mem_pinned_cacheattr(d, gfn_x(target_gfn), &type) )
559 sflags |= pat_type_2_pte_flags(type);
560 else if ( d->arch.hvm_domain.is_in_uc_mode )
561 sflags |= pat_type_2_pte_flags(PAT_TYPE_UNCACHABLE);
562 else if ( p2mt == p2m_mmio_direct )
563 sflags |= get_pat_flags(v,
564 gflags,
565 gfn_to_paddr(target_gfn),
566 ((paddr_t)mfn_x(target_mfn)) << PAGE_SHIFT,
567 MTRR_TYPE_UNCACHABLE);
568 else if ( iommu_snoop )
569 sflags |= pat_type_2_pte_flags(PAT_TYPE_WRBACK);
570 else
571 sflags |= get_pat_flags(v,
572 gflags,
573 gfn_to_paddr(target_gfn),
574 ((paddr_t)mfn_x(target_mfn)) << PAGE_SHIFT,
575 NO_HARDCODE_MEM_TYPE);
576 }
578 // Set the A&D bits for higher level shadows.
579 // Higher level entries do not, strictly speaking, have dirty bits, but
580 // since we use shadow linear tables, each of these entries may, at some
581 // point in time, also serve as a shadow L1 entry.
582 // By setting both the A&D bits in each of these, we eliminate the burden
583 // on the hardware to update these bits on initial accesses.
584 //
585 if ( (level > 1) && !((SHADOW_PAGING_LEVELS == 3) && (level == 3)) )
586 sflags |= _PAGE_ACCESSED | _PAGE_DIRTY;
588 // If the A or D bit has not yet been set in the guest, then we must
589 // prevent the corresponding kind of access.
590 //
591 if ( unlikely(!(gflags & _PAGE_ACCESSED)) )
592 sflags &= ~_PAGE_PRESENT;
594 /* D bits exist in L1es and PSE L2es */
595 if ( unlikely(((level == 1) ||
596 ((level == 2) &&
597 (gflags & _PAGE_PSE) &&
598 guest_supports_superpages(v)))
599 && !(gflags & _PAGE_DIRTY)) )
600 sflags &= ~_PAGE_RW;
602 // shadow_mode_log_dirty support
603 //
604 // Only allow the guest write access to a page a) on a demand fault,
605 // or b) if the page is already marked as dirty.
606 //
607 // (We handle log-dirty entirely inside the shadow code, without using the
608 // p2m_ram_logdirty p2m type: only HAP uses that.)
609 if ( unlikely((level == 1) && shadow_mode_log_dirty(d)) )
610 {
611 if ( mfn_valid(target_mfn) ) {
612 if ( ft & FETCH_TYPE_WRITE )
613 paging_mark_dirty(d, mfn_x(target_mfn));
614 else if ( !sh_mfn_is_dirty(d, target_mfn) )
615 sflags &= ~_PAGE_RW;
616 }
617 }
619 if ( unlikely((level == 1) && dirty_vram
620 && dirty_vram->last_dirty == -1
621 && gfn_x(target_gfn) >= dirty_vram->begin_pfn
622 && gfn_x(target_gfn) < dirty_vram->end_pfn) )
623 {
624 if ( ft & FETCH_TYPE_WRITE )
625 dirty_vram->last_dirty = NOW();
626 else
627 sflags &= ~_PAGE_RW;
628 }
630 /* Read-only memory */
631 if ( p2mt == p2m_ram_ro )
632 sflags &= ~_PAGE_RW;
634 // protect guest page tables
635 //
636 if ( unlikely((level == 1)
637 && sh_mfn_is_a_page_table(target_mfn)
638 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC )
639 /* Unless the page is out of sync and the guest is
640 writing to it. */
641 && !(mfn_oos_may_write(target_mfn)
642 && (ft == ft_demand_write))
643 #endif /* OOS */
644 ) )
645 {
646 if ( shadow_mode_trap_reads(d) )
647 {
648 // if we are trapping both reads & writes, then mark this page
649 // as not present...
650 //
651 sflags &= ~_PAGE_PRESENT;
652 }
653 else
654 {
655 // otherwise, just prevent any writes...
656 //
657 sflags &= ~_PAGE_RW;
658 }
659 }
661 // PV guests in 64-bit mode use two different page tables for user vs
662 // supervisor permissions, making the guest's _PAGE_USER bit irrelevant.
663 // It is always shadowed as present...
664 if ( (GUEST_PAGING_LEVELS == 4) && !is_pv_32on64_domain(d)
665 && !is_hvm_domain(d) )
666 {
667 sflags |= _PAGE_USER;
668 }
670 *sp = shadow_l1e_from_mfn(target_mfn, sflags);
672 done:
673 SHADOW_DEBUG(PROPAGATE,
674 "%s level %u guest %" SH_PRI_gpte " shadow %" SH_PRI_pte "\n",
675 fetch_type_names[ft], level, guest_entry.l1, sp->l1);
676 }
679 /* These four wrappers give us a little bit of type-safety back around
680 * the use of void-* pointers and intpte types in _sh_propagate(), and
681 * allow the compiler to optimize out some level checks. */
683 #if GUEST_PAGING_LEVELS >= 4
684 static void
685 l4e_propagate_from_guest(struct vcpu *v,
686 guest_l4e_t gl4e,
687 mfn_t sl3mfn,
688 shadow_l4e_t *sl4e,
689 fetch_type_t ft)
690 {
691 _sh_propagate(v, gl4e.l4, sl3mfn, sl4e, 4, ft, p2m_ram_rw);
692 }
694 static void
695 l3e_propagate_from_guest(struct vcpu *v,
696 guest_l3e_t gl3e,
697 mfn_t sl2mfn,
698 shadow_l3e_t *sl3e,
699 fetch_type_t ft)
700 {
701 _sh_propagate(v, gl3e.l3, sl2mfn, sl3e, 3, ft, p2m_ram_rw);
702 }
703 #endif // GUEST_PAGING_LEVELS >= 4
705 static void
706 l2e_propagate_from_guest(struct vcpu *v,
707 guest_l2e_t gl2e,
708 mfn_t sl1mfn,
709 shadow_l2e_t *sl2e,
710 fetch_type_t ft)
711 {
712 _sh_propagate(v, gl2e.l2, sl1mfn, sl2e, 2, ft, p2m_ram_rw);
713 }
715 static void
716 l1e_propagate_from_guest(struct vcpu *v,
717 guest_l1e_t gl1e,
718 mfn_t gmfn,
719 shadow_l1e_t *sl1e,
720 fetch_type_t ft,
721 p2m_type_t p2mt)
722 {
723 _sh_propagate(v, gl1e.l1, gmfn, sl1e, 1, ft, p2mt);
724 }
727 /**************************************************************************/
728 /* These functions update shadow entries (and do bookkeeping on the shadow
729 * tables they are in). It is intended that they are the only
730 * functions which ever write (non-zero) data onto a shadow page.
731 */
733 static inline void safe_write_entry(void *dst, void *src)
734 /* Copy one PTE safely when processors might be running on the
735 * destination pagetable. This does *not* give safety against
736 * concurrent writes (that's what the shadow lock is for), just
737 * stops the hardware picking up partially written entries. */
738 {
739 volatile unsigned long *d = dst;
740 unsigned long *s = src;
741 ASSERT(!((unsigned long) d & (sizeof (shadow_l1e_t) - 1)));
742 #if CONFIG_PAGING_LEVELS == 3
743 /* In PAE mode, pagetable entries are larger
744 * than machine words, so won't get written atomically. We need to make
745 * sure any other cpu running on these shadows doesn't see a
746 * half-written entry. Do this by marking the entry not-present first,
747 * then writing the high word before the low word. */
748 BUILD_BUG_ON(sizeof (shadow_l1e_t) != 2 * sizeof (unsigned long));
749 d[0] = 0;
750 d[1] = s[1];
751 d[0] = s[0];
752 #else
753 /* In 64-bit, sizeof(pte) == sizeof(ulong) == 1 word,
754 * which will be an atomic write, since the entry is aligned. */
755 BUILD_BUG_ON(sizeof (shadow_l1e_t) != sizeof (unsigned long));
756 *d = *s;
757 #endif
758 }
761 static inline void
762 shadow_write_entries(void *d, void *s, int entries, mfn_t mfn)
763 /* This function does the actual writes to shadow pages.
764 * It must not be called directly, since it doesn't do the bookkeeping
765 * that shadow_set_l*e() functions do. */
766 {
767 shadow_l1e_t *dst = d;
768 shadow_l1e_t *src = s;
769 void *map = NULL;
770 int i;
772 /* Because we mirror access rights at all levels in the shadow, an
773 * l2 (or higher) entry with the RW bit cleared will leave us with
774 * no write access through the linear map.
775 * We detect that by writing to the shadow with copy_to_user() and
776 * using map_domain_page() to get a writeable mapping if we need to. */
777 if ( __copy_to_user(d, d, sizeof (unsigned long)) != 0 )
778 {
779 perfc_incr(shadow_linear_map_failed);
780 map = sh_map_domain_page(mfn);
781 ASSERT(map != NULL);
782 dst = map + ((unsigned long)dst & (PAGE_SIZE - 1));
783 }
786 for ( i = 0; i < entries; i++ )
787 safe_write_entry(dst++, src++);
789 if ( map != NULL ) sh_unmap_domain_page(map);
790 }
792 static inline int
793 perms_strictly_increased(u32 old_flags, u32 new_flags)
794 /* Given the flags of two entries, are the new flags a strict
795 * increase in rights over the old ones? */
796 {
797 u32 of = old_flags & (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX);
798 u32 nf = new_flags & (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX);
799 /* Flip the NX bit, since it's the only one that decreases rights;
800 * we calculate as if it were an "X" bit. */
801 of ^= _PAGE_NX_BIT;
802 nf ^= _PAGE_NX_BIT;
803 /* If the changed bits are all set in the new flags, then rights strictly
804 * increased between old and new. */
805 return ((of | (of ^ nf)) == nf);
806 }
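/* Two quick examples of the check above: going from PRESENT|NX to
 * PRESENT|RW|NX changes only RW, and RW is set in the new flags, so
 * rights strictly increased and (with the mfn unchanged) the caller
 * skips the TLB flush.  Going from PRESENT|RW to PRESENT|USER gains
 * USER but loses RW, so the changed bits are not all set in the new
 * flags, the function returns 0 and the caller must flush. */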
808 static int inline
809 shadow_get_page_from_l1e(shadow_l1e_t sl1e, struct domain *d)
810 {
811 int res;
812 mfn_t mfn;
813 struct domain *owner;
815 ASSERT(!sh_l1e_is_magic(sl1e));
817 if ( !shadow_mode_refcounts(d) )
818 return 1;
820 res = get_page_from_l1e(sl1e, d, d);
822 // If a privileged domain is attempting to install a map of a page it does
823 // not own, we let it succeed anyway.
824 //
825 if ( unlikely(!res) &&
826 !shadow_mode_translate(d) &&
827 mfn_valid(mfn = shadow_l1e_get_mfn(sl1e)) &&
828 (owner = page_get_owner(mfn_to_page(mfn))) &&
829 (d != owner) &&
830 IS_PRIV_FOR(d, owner))
831 {
832 res = get_page_from_l1e(sl1e, d, owner);
833 SHADOW_PRINTK("privileged domain %d installs map of mfn %05lx "
834 "which is owned by domain %d: %s\n",
835 d->domain_id, mfn_x(mfn), owner->domain_id,
836 res ? "success" : "failed");
837 }
839 if ( unlikely(!res) )
840 {
841 perfc_incr(shadow_get_page_fail);
842 SHADOW_PRINTK("failed: l1e=%" SH_PRI_pte "\n", sl1e.l1);
843 }
845 return res;
846 }
848 static void inline
849 shadow_put_page_from_l1e(shadow_l1e_t sl1e, struct domain *d)
850 {
851 if ( !shadow_mode_refcounts(d) )
852 return;
854 put_page_from_l1e(sl1e, d);
855 }
857 #if GUEST_PAGING_LEVELS >= 4
858 static int shadow_set_l4e(struct vcpu *v,
859 shadow_l4e_t *sl4e,
860 shadow_l4e_t new_sl4e,
861 mfn_t sl4mfn)
862 {
863 int flags = 0, ok;
864 shadow_l4e_t old_sl4e;
865 paddr_t paddr;
866 ASSERT(sl4e != NULL);
867 old_sl4e = *sl4e;
869 if ( old_sl4e.l4 == new_sl4e.l4 ) return 0; /* Nothing to do */
871 paddr = ((((paddr_t)mfn_x(sl4mfn)) << PAGE_SHIFT)
872 | (((unsigned long)sl4e) & ~PAGE_MASK));
874 if ( shadow_l4e_get_flags(new_sl4e) & _PAGE_PRESENT )
875 {
876 /* About to install a new reference */
877 mfn_t sl3mfn = shadow_l4e_get_mfn(new_sl4e);
878 ok = sh_get_ref(v, sl3mfn, paddr);
879 /* Are we pinning l3 shadows to handle weird linux behaviour? */
880 if ( sh_type_is_pinnable(v, SH_type_l3_64_shadow) )
881 ok |= sh_pin(v, sl3mfn);
882 if ( !ok )
883 {
884 domain_crash(v->domain);
885 return SHADOW_SET_ERROR;
886 }
887 }
889 /* Write the new entry */
890 shadow_write_entries(sl4e, &new_sl4e, 1, sl4mfn);
891 flags |= SHADOW_SET_CHANGED;
893 if ( shadow_l4e_get_flags(old_sl4e) & _PAGE_PRESENT )
894 {
895 /* We lost a reference to an old mfn. */
896 mfn_t osl3mfn = shadow_l4e_get_mfn(old_sl4e);
897 if ( (mfn_x(osl3mfn) != mfn_x(shadow_l4e_get_mfn(new_sl4e)))
898 || !perms_strictly_increased(shadow_l4e_get_flags(old_sl4e),
899 shadow_l4e_get_flags(new_sl4e)) )
900 {
901 flags |= SHADOW_SET_FLUSH;
902 }
903 sh_put_ref(v, osl3mfn, paddr);
904 }
905 return flags;
906 }
908 static int shadow_set_l3e(struct vcpu *v,
909 shadow_l3e_t *sl3e,
910 shadow_l3e_t new_sl3e,
911 mfn_t sl3mfn)
912 {
913 int flags = 0;
914 shadow_l3e_t old_sl3e;
915 paddr_t paddr;
916 ASSERT(sl3e != NULL);
917 old_sl3e = *sl3e;
919 if ( old_sl3e.l3 == new_sl3e.l3 ) return 0; /* Nothing to do */
921 paddr = ((((paddr_t)mfn_x(sl3mfn)) << PAGE_SHIFT)
922 | (((unsigned long)sl3e) & ~PAGE_MASK));
924 if ( shadow_l3e_get_flags(new_sl3e) & _PAGE_PRESENT )
925 {
926 /* About to install a new reference */
927 if ( !sh_get_ref(v, shadow_l3e_get_mfn(new_sl3e), paddr) )
928 {
929 domain_crash(v->domain);
930 return SHADOW_SET_ERROR;
931 }
932 }
934 /* Write the new entry */
935 shadow_write_entries(sl3e, &new_sl3e, 1, sl3mfn);
936 flags |= SHADOW_SET_CHANGED;
938 if ( shadow_l3e_get_flags(old_sl3e) & _PAGE_PRESENT )
939 {
940 /* We lost a reference to an old mfn. */
941 mfn_t osl2mfn = shadow_l3e_get_mfn(old_sl3e);
942 if ( (mfn_x(osl2mfn) != mfn_x(shadow_l3e_get_mfn(new_sl3e))) ||
943 !perms_strictly_increased(shadow_l3e_get_flags(old_sl3e),
944 shadow_l3e_get_flags(new_sl3e)) )
945 {
946 flags |= SHADOW_SET_FLUSH;
947 }
948 sh_put_ref(v, osl2mfn, paddr);
949 }
950 return flags;
951 }
952 #endif /* GUEST_PAGING_LEVELS >= 4 */
954 static int shadow_set_l2e(struct vcpu *v,
955 shadow_l2e_t *sl2e,
956 shadow_l2e_t new_sl2e,
957 mfn_t sl2mfn)
958 {
959 int flags = 0;
960 shadow_l2e_t old_sl2e;
961 paddr_t paddr;
963 #if GUEST_PAGING_LEVELS == 2
964 /* In 2-on-3 we work with pairs of l2es pointing at two-page
965 * shadows. Reference counting and up-pointers track from the first
966 * page of the shadow to the first l2e, so make sure that we're
967 * working with those:
968 * Align the pointer down so it's pointing at the first of the pair */
969 sl2e = (shadow_l2e_t *)((unsigned long)sl2e & ~(sizeof(shadow_l2e_t)));
970 /* Align the mfn of the shadow entry too */
971 new_sl2e.l2 &= ~(1<<PAGE_SHIFT);
972 #endif
974 ASSERT(sl2e != NULL);
975 old_sl2e = *sl2e;
977 if ( old_sl2e.l2 == new_sl2e.l2 ) return 0; /* Nothing to do */
979 paddr = ((((paddr_t)mfn_x(sl2mfn)) << PAGE_SHIFT)
980 | (((unsigned long)sl2e) & ~PAGE_MASK));
982 if ( shadow_l2e_get_flags(new_sl2e) & _PAGE_PRESENT )
983 {
984 mfn_t sl1mfn = shadow_l2e_get_mfn(new_sl2e);
986 /* About to install a new reference */
987 if ( !sh_get_ref(v, sl1mfn, paddr) )
988 {
989 domain_crash(v->domain);
990 return SHADOW_SET_ERROR;
991 }
992 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
993 {
994 struct page_info *sp = mfn_to_page(sl1mfn);
995 mfn_t gl1mfn = _mfn(sp->v.sh.back);
997 /* If the shadow is a fl1 then the backpointer contains
998 the GFN instead of the GMFN, and it's definitely not
999 OOS. */
1000 if ( (sp->u.sh.type != SH_type_fl1_shadow) && mfn_valid(gl1mfn)
1001 && mfn_is_out_of_sync(gl1mfn) )
1002 sh_resync(v, gl1mfn);
1004 #endif
1007 /* Write the new entry */
1008 #if GUEST_PAGING_LEVELS == 2
1010 shadow_l2e_t pair[2] = { new_sl2e, new_sl2e };
1011 /* The l1 shadow is two pages long and needs to be pointed to by
1012 * two adjacent l2es. The pair have the same flags, but point
1013 * at odd and even MFNs */
1014 ASSERT(!(pair[0].l2 & (1<<PAGE_SHIFT)));
1015 pair[1].l2 |= (1<<PAGE_SHIFT);
1016 shadow_write_entries(sl2e, &pair, 2, sl2mfn);
1018 #else /* normal case */
1019 shadow_write_entries(sl2e, &new_sl2e, 1, sl2mfn);
1020 #endif
1021 flags |= SHADOW_SET_CHANGED;
1023 if ( shadow_l2e_get_flags(old_sl2e) & _PAGE_PRESENT )
1025 /* We lost a reference to an old mfn. */
1026 mfn_t osl1mfn = shadow_l2e_get_mfn(old_sl2e);
1027 if ( (mfn_x(osl1mfn) != mfn_x(shadow_l2e_get_mfn(new_sl2e))) ||
1028 !perms_strictly_increased(shadow_l2e_get_flags(old_sl2e),
1029 shadow_l2e_get_flags(new_sl2e)) )
1031 flags |= SHADOW_SET_FLUSH;
1033 sh_put_ref(v, osl1mfn, paddr);
1035 return flags;
1038 static inline void shadow_vram_get_l1e(shadow_l1e_t new_sl1e,
1039 shadow_l1e_t *sl1e,
1040 mfn_t sl1mfn,
1041 struct domain *d)
1043 mfn_t mfn = shadow_l1e_get_mfn(new_sl1e);
1044 int flags = shadow_l1e_get_flags(new_sl1e);
1045 unsigned long gfn;
1046 struct sh_dirty_vram *dirty_vram = d->arch.hvm_domain.dirty_vram;
1048 if ( !dirty_vram /* tracking disabled? */
1049 || !(flags & _PAGE_RW) /* read-only mapping? */
1050 || !mfn_valid(mfn) ) /* mfn can be invalid in mmio_direct */
1051 return;
1053 gfn = mfn_to_gfn(d, mfn);
1055 if ( (gfn >= dirty_vram->begin_pfn) && (gfn < dirty_vram->end_pfn) )
1057 unsigned long i = gfn - dirty_vram->begin_pfn;
1058 struct page_info *page = mfn_to_page(mfn);
1060 if ( (page->u.inuse.type_info & PGT_count_mask) == 1 )
1061 /* Initial guest reference, record it */
1062 dirty_vram->sl1ma[i] = pfn_to_paddr(mfn_x(sl1mfn))
1063 | ((unsigned long)sl1e & ~PAGE_MASK);
1067 static inline void shadow_vram_put_l1e(shadow_l1e_t old_sl1e,
1068 shadow_l1e_t *sl1e,
1069 mfn_t sl1mfn,
1070 struct domain *d)
1072 mfn_t mfn = shadow_l1e_get_mfn(old_sl1e);
1073 int flags = shadow_l1e_get_flags(old_sl1e);
1074 unsigned long gfn;
1075 struct sh_dirty_vram *dirty_vram = d->arch.hvm_domain.dirty_vram;
1077 if ( !dirty_vram /* tracking disabled? */
1078 || !(flags & _PAGE_RW) /* read-only mapping? */
1079 || !mfn_valid(mfn) ) /* mfn can be invalid in mmio_direct */
1080 return;
1082 gfn = mfn_to_gfn(d, mfn);
1084 if ( (gfn >= dirty_vram->begin_pfn) && (gfn < dirty_vram->end_pfn) )
1086 unsigned long i = gfn - dirty_vram->begin_pfn;
1087 struct page_info *page = mfn_to_page(mfn);
1088 int dirty = 0;
1089 paddr_t sl1ma = pfn_to_paddr(mfn_x(sl1mfn))
1090 | ((unsigned long)sl1e & ~PAGE_MASK);
1092 if ( (page->u.inuse.type_info & PGT_count_mask) == 1 )
1094 /* Last reference */
1095 if ( dirty_vram->sl1ma[i] == INVALID_PADDR ) {
1096 /* We didn't know it was that one, let's say it is dirty */
1097 dirty = 1;
1099 else
1101 ASSERT(dirty_vram->sl1ma[i] == sl1ma);
1102 dirty_vram->sl1ma[i] = INVALID_PADDR;
1103 if ( flags & _PAGE_DIRTY )
1104 dirty = 1;
1107 else
1109 /* We had more than one reference, just consider the page dirty. */
1110 dirty = 1;
1111 /* Check that it's not the one we recorded. */
1112 if ( dirty_vram->sl1ma[i] == sl1ma )
1114 /* Too bad, we remembered the wrong one... */
1115 dirty_vram->sl1ma[i] = INVALID_PADDR;
1117 else
1119 /* Ok, our recorded sl1e is still pointing to this page, let's
1120 * just hope it will remain. */
1123 if ( dirty )
1125 dirty_vram->dirty_bitmap[i / 8] |= 1 << (i % 8);
1126 dirty_vram->last_dirty = NOW();
1131 static int shadow_set_l1e(struct vcpu *v,
1132 shadow_l1e_t *sl1e,
1133 shadow_l1e_t new_sl1e,
1134 mfn_t sl1mfn)
1136 int flags = 0;
1137 struct domain *d = v->domain;
1138 shadow_l1e_t old_sl1e;
1139 #if SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC
1140 mfn_t new_gmfn = shadow_l1e_get_mfn(new_sl1e);
1141 #endif
1142 ASSERT(sl1e != NULL);
1144 #if SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC
1145 if ( mfn_valid(new_gmfn) && mfn_oos_may_write(new_gmfn)
1146 && ((shadow_l1e_get_flags(new_sl1e) & (_PAGE_RW|_PAGE_PRESENT))
1147 == (_PAGE_RW|_PAGE_PRESENT)) )
1148 oos_fixup_add(v, new_gmfn, sl1mfn, pgentry_ptr_to_slot(sl1e));
1149 #endif
1151 old_sl1e = *sl1e;
1153 if ( old_sl1e.l1 == new_sl1e.l1 ) return 0; /* Nothing to do */
1155 if ( (shadow_l1e_get_flags(new_sl1e) & _PAGE_PRESENT)
1156 && !sh_l1e_is_magic(new_sl1e) )
1158 /* About to install a new reference */
1159 if ( shadow_mode_refcounts(d) ) {
1160 TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_SHADOW_L1_GET_REF);
1161 if ( shadow_get_page_from_l1e(new_sl1e, d) == 0 )
1163 /* Doesn't look like a pagetable. */
1164 flags |= SHADOW_SET_ERROR;
1165 new_sl1e = shadow_l1e_empty();
1167 else
1169 shadow_vram_get_l1e(new_sl1e, sl1e, sl1mfn, d);
1174 /* Write the new entry */
1175 shadow_write_entries(sl1e, &new_sl1e, 1, sl1mfn);
1176 flags |= SHADOW_SET_CHANGED;
1178 if ( (shadow_l1e_get_flags(old_sl1e) & _PAGE_PRESENT)
1179 && !sh_l1e_is_magic(old_sl1e) )
1181 /* We lost a reference to an old mfn. */
1182 /* N.B. Unlike higher-level sets, never need an extra flush
1183 * when writing an l1e. Because it points to the same guest frame
1184 * as the guest l1e did, it's the guest's responsibility to
1185 * trigger a flush later. */
1186 if ( shadow_mode_refcounts(d) )
1188 shadow_vram_put_l1e(old_sl1e, sl1e, sl1mfn, d);
1189 shadow_put_page_from_l1e(old_sl1e, d);
1190 TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_SHADOW_L1_PUT_REF);
1193 return flags;
1197 /**************************************************************************/
1198 /* Macros to walk pagetables. These take the shadow of a pagetable and
1199 * walk every "interesting" entry. That is, they don't touch Xen mappings,
1200 * and for 32-bit l2s shadowed onto PAE or 64-bit, they only touch every
1201 * second entry (since pairs of entries are managed together). For multi-page
1202 * shadows they walk all pages.
1204 * Arguments are an MFN, the variable to point to each entry, a variable
1205 * to indicate that we are done (we will shortcut to the end of the scan
1206 * when _done != 0), a variable to indicate that we should avoid Xen mappings,
1207 * and the code.
1209 * WARNING: These macros have side-effects. They change the values of both
1210 * the pointer and the MFN. */
1212 static inline void increment_ptr_to_guest_entry(void *ptr)
1214 if ( ptr )
1216 guest_l1e_t **entry = ptr;
1217 (*entry)++;
1221 /* All kinds of l1: touch all entries */
1222 #define _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code) \
1223 do { \
1224 int _i; \
1225 shadow_l1e_t *_sp = sh_map_domain_page((_sl1mfn)); \
1226 ASSERT(mfn_to_page(_sl1mfn)->u.sh.type == SH_type_l1_shadow \
1227 || mfn_to_page(_sl1mfn)->u.sh.type == SH_type_fl1_shadow);\
1228 for ( _i = 0; _i < SHADOW_L1_PAGETABLE_ENTRIES; _i++ ) \
1229 { \
1230 (_sl1e) = _sp + _i; \
1231 if ( shadow_l1e_get_flags(*(_sl1e)) & _PAGE_PRESENT ) \
1232 {_code} \
1233 if ( _done ) break; \
1234 increment_ptr_to_guest_entry(_gl1p); \
1235 } \
1236 sh_unmap_domain_page(_sp); \
1237 } while (0)
1239 /* 32-bit l1, on PAE or 64-bit shadows: need to walk both pages of shadow */
1240 #if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2
1241 #define SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code) \
1242 do { \
1243 int __done = 0; \
1244 _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, \
1245 ({ (__done = _done); }), _code); \
1246 _sl1mfn = _mfn(mfn_x(_sl1mfn) + 1); \
1247 if ( !__done ) \
1248 _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, \
1249 ({ (__done = _done); }), _code); \
1250 } while (0)
1251 #else /* Everything else; l1 shadows are only one page */
1252 #define SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code) \
1253 _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code)
1254 #endif
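/* Typical use (a sketch only, with a hypothetical condition_on() test;
 * the real callers, such as the sh_rm_write_access_from_l1() family,
 * appear later in this file): iterate over every present entry of a
 * shadow l1, zap the entries that match, and stop early once done:
 *
 *     int done = 0;
 *     SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, done,
 *     {
 *         if ( condition_on(*sl1e) )
 *         {
 *             (void) shadow_set_l1e(v, sl1e, shadow_l1e_empty(), sl1mfn);
 *             done = 1;
 *         }
 *     });
 */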
1257 #if GUEST_PAGING_LEVELS == 2
1259 /* 32-bit l2 on PAE/64: four pages, touch every second entry, and avoid Xen */
1260 #define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _dom, _code) \
1261 do { \
1262 int _i, _j, __done = 0; \
1263 int _xen = !shadow_mode_external(_dom); \
1264 ASSERT(mfn_to_page(_sl2mfn)->u.sh.type == SH_type_l2_32_shadow);\
1265 for ( _j = 0; _j < 4 && !__done; _j++ ) \
1266 { \
1267 shadow_l2e_t *_sp = sh_map_domain_page(_sl2mfn); \
1268 for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i += 2 ) \
1269 if ( (!(_xen)) \
1270 || ((_j * SHADOW_L2_PAGETABLE_ENTRIES) + _i) \
1271 < (HYPERVISOR_VIRT_START >> SHADOW_L2_PAGETABLE_SHIFT) ) \
1272 { \
1273 (_sl2e) = _sp + _i; \
1274 if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \
1275 {_code} \
1276 if ( (__done = (_done)) ) break; \
1277 increment_ptr_to_guest_entry(_gl2p); \
1278 } \
1279 sh_unmap_domain_page(_sp); \
1280 _sl2mfn = _mfn(mfn_x(_sl2mfn) + 1); \
1281 } \
1282 } while (0)
1284 #elif GUEST_PAGING_LEVELS == 3
1286 /* PAE: if it's an l2h, don't touch Xen mappings */
1287 #define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _dom, _code) \
1288 do { \
1289 int _i; \
1290 int _xen = !shadow_mode_external(_dom); \
1291 shadow_l2e_t *_sp = sh_map_domain_page((_sl2mfn)); \
1292 ASSERT(mfn_to_page(_sl2mfn)->u.sh.type == SH_type_l2_pae_shadow \
1293 || mfn_to_page(_sl2mfn)->u.sh.type == SH_type_l2h_pae_shadow);\
1294 for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i++ ) \
1295 if ( (!(_xen)) \
1296 || mfn_to_page(_sl2mfn)->u.sh.type != SH_type_l2h_pae_shadow\
1297 || ((_i + (3 * SHADOW_L2_PAGETABLE_ENTRIES)) \
1298 < (HYPERVISOR_VIRT_START >> SHADOW_L2_PAGETABLE_SHIFT)) ) \
1299 { \
1300 (_sl2e) = _sp + _i; \
1301 if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \
1302 {_code} \
1303 if ( _done ) break; \
1304 increment_ptr_to_guest_entry(_gl2p); \
1305 } \
1306 sh_unmap_domain_page(_sp); \
1307 } while (0)
1309 #else
1311 /* 64-bit l2: touch all entries except for PAE compat guests. */
1312 #define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _dom, _code) \
1313 do { \
1314 int _i; \
1315 int _xen = !shadow_mode_external(_dom); \
1316 shadow_l2e_t *_sp = sh_map_domain_page((_sl2mfn)); \
1317 ASSERT(mfn_to_page(_sl2mfn)->u.sh.type == SH_type_l2_64_shadow ||\
1318 mfn_to_page(_sl2mfn)->u.sh.type == SH_type_l2h_64_shadow);\
1319 for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i++ ) \
1320 { \
1321 if ( (!(_xen)) \
1322 || !is_pv_32on64_domain(_dom) \
1323 || mfn_to_page(_sl2mfn)->u.sh.type != SH_type_l2h_64_shadow\
1324 || (_i < COMPAT_L2_PAGETABLE_FIRST_XEN_SLOT(_dom)) ) \
1325 { \
1326 (_sl2e) = _sp + _i; \
1327 if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \
1328 {_code} \
1329 if ( _done ) break; \
1330 increment_ptr_to_guest_entry(_gl2p); \
1331 } \
1332 } \
1333 sh_unmap_domain_page(_sp); \
1334 } while (0)
1336 #endif /* different kinds of l2 */
1338 #if GUEST_PAGING_LEVELS == 4
1340 /* 64-bit l3: touch all entries */
1341 #define SHADOW_FOREACH_L3E(_sl3mfn, _sl3e, _gl3p, _done, _code) \
1342 do { \
1343 int _i; \
1344 shadow_l3e_t *_sp = sh_map_domain_page((_sl3mfn)); \
1345 ASSERT(mfn_to_page(_sl3mfn)->u.sh.type == SH_type_l3_64_shadow);\
1346 for ( _i = 0; _i < SHADOW_L3_PAGETABLE_ENTRIES; _i++ ) \
1347 { \
1348 (_sl3e) = _sp + _i; \
1349 if ( shadow_l3e_get_flags(*(_sl3e)) & _PAGE_PRESENT ) \
1350 {_code} \
1351 if ( _done ) break; \
1352 increment_ptr_to_guest_entry(_gl3p); \
1353 } \
1354 sh_unmap_domain_page(_sp); \
1355 } while (0)
1357 /* 64-bit l4: avoid Xen mappings */
1358 #define SHADOW_FOREACH_L4E(_sl4mfn, _sl4e, _gl4p, _done, _dom, _code) \
1359 do { \
1360 shadow_l4e_t *_sp = sh_map_domain_page((_sl4mfn)); \
1361 int _xen = !shadow_mode_external(_dom); \
1362 int _i; \
1363 ASSERT(mfn_to_page(_sl4mfn)->u.sh.type == SH_type_l4_64_shadow);\
1364 for ( _i = 0; _i < SHADOW_L4_PAGETABLE_ENTRIES; _i++ ) \
1365 { \
1366 if ( (!(_xen)) || is_guest_l4_slot(_dom, _i) ) \
1367 { \
1368 (_sl4e) = _sp + _i; \
1369 if ( shadow_l4e_get_flags(*(_sl4e)) & _PAGE_PRESENT ) \
1370 {_code} \
1371 if ( _done ) break; \
1372 } \
1373 increment_ptr_to_guest_entry(_gl4p); \
1374 } \
1375 sh_unmap_domain_page(_sp); \
1376 } while (0)
1378 #endif
1382 /**************************************************************************/
1383 /* Functions to install Xen mappings and linear mappings in shadow pages */
1385 // XXX -- this function should probably be moved to shadow-common.c, but that
1386 // probably wants to wait until the shadow types have been moved from
1387 // shadow-types.h to shadow-private.h
1388 //
1389 #if CONFIG_PAGING_LEVELS == 4 && GUEST_PAGING_LEVELS == 4
1390 void sh_install_xen_entries_in_l4(struct vcpu *v, mfn_t gl4mfn, mfn_t sl4mfn)
1392 struct domain *d = v->domain;
1393 shadow_l4e_t *sl4e;
1395 sl4e = sh_map_domain_page(sl4mfn);
1396 ASSERT(sl4e != NULL);
1397 ASSERT(sizeof (l4_pgentry_t) == sizeof (shadow_l4e_t));
1399 /* Copy the common Xen mappings from the idle domain */
1400 memcpy(&sl4e[ROOT_PAGETABLE_FIRST_XEN_SLOT],
1401 &idle_pg_table[ROOT_PAGETABLE_FIRST_XEN_SLOT],
1402 ROOT_PAGETABLE_XEN_SLOTS * sizeof(l4_pgentry_t));
1404 /* Install the per-domain mappings for this domain */
1405 sl4e[shadow_l4_table_offset(PERDOMAIN_VIRT_START)] =
1406 shadow_l4e_from_mfn(page_to_mfn(virt_to_page(d->arch.mm_perdomain_l3)),
1407 __PAGE_HYPERVISOR);
1409 /* Shadow linear mapping for 4-level shadows. N.B. for 3-level
1410 * shadows on 64-bit xen, this linear mapping is later replaced by the
1411 * monitor pagetable structure, which is built in make_monitor_table
1412 * and maintained by sh_update_linear_entries. */
1413 sl4e[shadow_l4_table_offset(SH_LINEAR_PT_VIRT_START)] =
1414 shadow_l4e_from_mfn(sl4mfn, __PAGE_HYPERVISOR);
1416 /* Self linear mapping. */
1417 if ( shadow_mode_translate(v->domain) && !shadow_mode_external(v->domain) )
1419 // linear tables may not be used with translated PV guests
1420 sl4e[shadow_l4_table_offset(LINEAR_PT_VIRT_START)] =
1421 shadow_l4e_empty();
1423 else
1425 sl4e[shadow_l4_table_offset(LINEAR_PT_VIRT_START)] =
1426 shadow_l4e_from_mfn(gl4mfn, __PAGE_HYPERVISOR);
1429 if ( shadow_mode_translate(v->domain) )
1431 /* install domain-specific P2M table */
1432 sl4e[shadow_l4_table_offset(RO_MPT_VIRT_START)] =
1433 shadow_l4e_from_mfn(pagetable_get_mfn(d->arch.phys_table),
1434 __PAGE_HYPERVISOR);
1437 sh_unmap_domain_page(sl4e);
1439 #endif
1441 #if CONFIG_PAGING_LEVELS >= 3 && GUEST_PAGING_LEVELS >= 3
1442 // For 3-on-3 PV guests, we need to make sure the xen mappings are in
1443 // place, which means that we need to populate the l2h entry in the l3
1444 // table.
1446 static void sh_install_xen_entries_in_l2h(struct vcpu *v, mfn_t sl2hmfn)
1448 struct domain *d = v->domain;
1449 shadow_l2e_t *sl2e;
1450 #if CONFIG_PAGING_LEVELS == 3
1451 int i;
1452 #else
1454 if ( !is_pv_32on64_vcpu(v) )
1455 return;
1456 #endif
1458 sl2e = sh_map_domain_page(sl2hmfn);
1459 ASSERT(sl2e != NULL);
1460 ASSERT(sizeof (l2_pgentry_t) == sizeof (shadow_l2e_t));
1462 #if CONFIG_PAGING_LEVELS == 3
1464 /* Copy the common Xen mappings from the idle domain */
1465 memcpy(&sl2e[L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1)],
1466 &idle_pg_table_l2[L2_PAGETABLE_FIRST_XEN_SLOT],
1467 L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
1469 /* Install the per-domain mappings for this domain */
1470 for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
1471 sl2e[shadow_l2_table_offset(PERDOMAIN_VIRT_START) + i] =
1472 shadow_l2e_from_mfn(
1473 page_to_mfn(perdomain_pt_page(d, i)),
1474 __PAGE_HYPERVISOR);
1476 /* We don't set up a linear mapping here because we can't until this
1477 * l2h is installed in an l3e. sh_update_linear_entries() handles
1478 * the linear mappings when CR3 (and so the fourth l3e) is loaded.
1479 * We zero them here, just as a safety measure.
1480 */
1481 for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
1482 sl2e[shadow_l2_table_offset(LINEAR_PT_VIRT_START) + i] =
1483 shadow_l2e_empty();
1484 for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
1485 sl2e[shadow_l2_table_offset(SH_LINEAR_PT_VIRT_START) + i] =
1486 shadow_l2e_empty();
1488 if ( shadow_mode_translate(d) )
1490 /* Install the domain-specific p2m table */
1491 l3_pgentry_t *p2m;
1492 ASSERT(pagetable_get_pfn(d->arch.phys_table) != 0);
1493 p2m = sh_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
1494 for ( i = 0; i < MACHPHYS_MBYTES>>1; i++ )
1496 sl2e[shadow_l2_table_offset(RO_MPT_VIRT_START) + i] =
1497 (l3e_get_flags(p2m[i]) & _PAGE_PRESENT)
1498 ? shadow_l2e_from_mfn(_mfn(l3e_get_pfn(p2m[i])),
1499 __PAGE_HYPERVISOR)
1500 : shadow_l2e_empty();
1502 sh_unmap_domain_page(p2m);
1505 #else
1507 /* Copy the common Xen mappings from the idle domain */
1508 memcpy(
1509 &sl2e[COMPAT_L2_PAGETABLE_FIRST_XEN_SLOT(d)],
1510 &compat_idle_pg_table_l2[l2_table_offset(HIRO_COMPAT_MPT_VIRT_START)],
1511 COMPAT_L2_PAGETABLE_XEN_SLOTS(d) * sizeof(*sl2e));
1513 #endif
1515 sh_unmap_domain_page(sl2e);
1517 #endif
1523 /**************************************************************************/
1524 /* Create a shadow of a given guest page.
1525 */
1526 static mfn_t
1527 sh_make_shadow(struct vcpu *v, mfn_t gmfn, u32 shadow_type)
1529 mfn_t smfn = shadow_alloc(v->domain, shadow_type, mfn_x(gmfn));
1530 SHADOW_DEBUG(MAKE_SHADOW, "(%05lx, %u)=>%05lx\n",
1531 mfn_x(gmfn), shadow_type, mfn_x(smfn));
1533 if ( shadow_type != SH_type_l2_32_shadow
1534 && shadow_type != SH_type_l2_pae_shadow
1535 && shadow_type != SH_type_l2h_pae_shadow
1536 && shadow_type != SH_type_l4_64_shadow )
1537 /* Lower-level shadow, not yet linked from a higher level */
1538 mfn_to_page(smfn)->up = 0;
1540 #if GUEST_PAGING_LEVELS == 4
1541 #if (SHADOW_OPTIMIZATIONS & SHOPT_LINUX_L3_TOPLEVEL)
1542 if ( shadow_type == SH_type_l4_64_shadow &&
1543 unlikely(v->domain->arch.paging.shadow.opt_flags & SHOPT_LINUX_L3_TOPLEVEL) )
1545 /* We're shadowing a new l4, but we've been assuming the guest uses
1546 * only one l4 per vcpu and context switches using an l4 entry.
1547 * Count the number of active l4 shadows. If there are enough
1548 * of them, decide that this isn't an old linux guest, and stop
1549 * pinning l3es. This is not very quick but it doesn't happen
1550 * very often. */
1551 struct page_info *sp, *t;
1552 struct vcpu *v2;
1553 int l4count = 0, vcpus = 0;
1554 page_list_for_each(sp, &v->domain->arch.paging.shadow.pinned_shadows)
1556 if ( sp->u.sh.type == SH_type_l4_64_shadow )
1557 l4count++;
1559 for_each_vcpu ( v->domain, v2 )
1560 vcpus++;
1561 if ( l4count > 2 * vcpus )
1563 /* Unpin all the pinned l3 tables, and don't pin any more. */
1564 page_list_for_each_safe(sp, t, &v->domain->arch.paging.shadow.pinned_shadows)
1566 if ( sp->u.sh.type == SH_type_l3_64_shadow )
1567 sh_unpin(v, page_to_mfn(sp));
1569 v->domain->arch.paging.shadow.opt_flags &= ~SHOPT_LINUX_L3_TOPLEVEL;
1572 #endif
1573 #endif
1575 // Create the Xen mappings...
1576 if ( !shadow_mode_external(v->domain) )
1578 switch (shadow_type)
1580 #if CONFIG_PAGING_LEVELS == 4 && GUEST_PAGING_LEVELS == 4
1581 case SH_type_l4_shadow:
1582 sh_install_xen_entries_in_l4(v, gmfn, smfn); break;
1583 #endif
1584 #if CONFIG_PAGING_LEVELS >= 3 && GUEST_PAGING_LEVELS >= 3
1585 case SH_type_l2h_shadow:
1586 sh_install_xen_entries_in_l2h(v, smfn); break;
1587 #endif
1588 default: /* Do nothing */ break;
1592 shadow_promote(v, gmfn, shadow_type);
1593 set_shadow_status(v, gmfn, shadow_type, smfn);
1595 return smfn;
1598 /* Make a splintered superpage shadow */
1599 static mfn_t
1600 make_fl1_shadow(struct vcpu *v, gfn_t gfn)
1602 mfn_t smfn = shadow_alloc(v->domain, SH_type_fl1_shadow,
1603 (unsigned long) gfn_x(gfn));
1605 SHADOW_DEBUG(MAKE_SHADOW, "(%" SH_PRI_gfn ")=>%" PRI_mfn "\n",
1606 gfn_x(gfn), mfn_x(smfn));
1608 set_fl1_shadow_status(v, gfn, smfn);
1609 return smfn;
1613 #if SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS
1614 mfn_t
1615 sh_make_monitor_table(struct vcpu *v)
1617 struct domain *d = v->domain;
1619 ASSERT(pagetable_get_pfn(v->arch.monitor_table) == 0);
1621 /* Guarantee we can get the memory we need */
1622 shadow_prealloc(d, SH_type_monitor_table, CONFIG_PAGING_LEVELS);
1624 #if CONFIG_PAGING_LEVELS == 4
1626 mfn_t m4mfn;
1627 m4mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1628 sh_install_xen_entries_in_l4(v, m4mfn, m4mfn);
1629 /* Remember the level of this table */
1630 mfn_to_page(m4mfn)->shadow_flags = 4;
1631 #if SHADOW_PAGING_LEVELS < 4
1633 mfn_t m3mfn, m2mfn;
1634 l4_pgentry_t *l4e;
1635 l3_pgentry_t *l3e;
1636 /* Install an l3 table and an l2 table that will hold the shadow
1637 * linear map entries. This overrides the linear map entry that
1638 * was installed by sh_install_xen_entries_in_l4. */
1639 l4e = sh_map_domain_page(m4mfn);
1641 m3mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1642 mfn_to_page(m3mfn)->shadow_flags = 3;
1643 l4e[shadow_l4_table_offset(SH_LINEAR_PT_VIRT_START)]
1644 = l4e_from_pfn(mfn_x(m3mfn), __PAGE_HYPERVISOR);
1646 m2mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1647 mfn_to_page(m2mfn)->shadow_flags = 2;
1648 l3e = sh_map_domain_page(m3mfn);
1649 l3e[0] = l3e_from_pfn(mfn_x(m2mfn), __PAGE_HYPERVISOR);
1650 sh_unmap_domain_page(l3e);
1652 if ( is_pv_32on64_vcpu(v) )
1654 /* For 32-on-64 PV guests, we need to map the 32-bit Xen
1655 * area into its usual VAs in the monitor tables */
1656 m3mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1657 mfn_to_page(m3mfn)->shadow_flags = 3;
1658 l4e[0] = l4e_from_pfn(mfn_x(m3mfn), __PAGE_HYPERVISOR);
1660 m2mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1661 mfn_to_page(m2mfn)->shadow_flags = 2;
1662 l3e = sh_map_domain_page(m3mfn);
1663 l3e[3] = l3e_from_pfn(mfn_x(m2mfn), _PAGE_PRESENT);
1664 sh_install_xen_entries_in_l2h(v, m2mfn);
1665 sh_unmap_domain_page(l3e);
1668 sh_unmap_domain_page(l4e);
1670 #endif /* SHADOW_PAGING_LEVELS < 4 */
1671 return m4mfn;
1674 #elif CONFIG_PAGING_LEVELS == 3
1677 mfn_t m3mfn, m2mfn;
1678 l3_pgentry_t *l3e;
1679 l2_pgentry_t *l2e;
1680 int i;
1682 m3mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1683 /* Remember the level of this table */
1684 mfn_to_page(m3mfn)->shadow_flags = 3;
1686 // Install a monitor l2 table in slot 3 of the l3 table.
1687 // This is used for all Xen entries, including linear maps
1688 m2mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1689 mfn_to_page(m2mfn)->shadow_flags = 2;
1690 l3e = sh_map_domain_page(m3mfn);
1691 l3e[3] = l3e_from_pfn(mfn_x(m2mfn), _PAGE_PRESENT);
1692 sh_install_xen_entries_in_l2h(v, m2mfn);
1693 /* Install the monitor's own linear map */
1694 l2e = sh_map_domain_page(m2mfn);
1695 for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
1696 l2e[l2_table_offset(LINEAR_PT_VIRT_START) + i] =
1697 (l3e_get_flags(l3e[i]) & _PAGE_PRESENT)
1698 ? l2e_from_pfn(l3e_get_pfn(l3e[i]), __PAGE_HYPERVISOR)
1699 : l2e_empty();
1700 sh_unmap_domain_page(l2e);
1701 sh_unmap_domain_page(l3e);
1703 SHADOW_PRINTK("new monitor table: %#lx\n", mfn_x(m3mfn));
1704 return m3mfn;
1707 #else
1708 #error this should not happen
1709 #endif /* CONFIG_PAGING_LEVELS */
1711 #endif /* SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS */
1713 /**************************************************************************/
1714 /* These functions also take a virtual address and return the level-N
1715 * shadow table mfn and entry, but they create the shadow pagetables if
1716 * they are needed. The "demand" argument is non-zero when handling
1717 * a demand fault (so we know what to do about accessed bits &c).
1718 * If the necessary tables are not present in the guest, they return NULL. */
1720 /* N.B. The use of GUEST_PAGING_LEVELS here is correct. If the shadow has
1721 * more levels than the guest, the upper levels are always fixed and do not
1722 * reflect any information from the guest, so we do not use these functions
1723 * to access them. */
1725 #if GUEST_PAGING_LEVELS >= 4
1726 static shadow_l4e_t * shadow_get_and_create_l4e(struct vcpu *v,
1727 walk_t *gw,
1728 mfn_t *sl4mfn)
1730 /* There is always a shadow of the top level table. Get it. */
1731 *sl4mfn = pagetable_get_mfn(v->arch.shadow_table[0]);
1732 /* Reading the top level table is always valid. */
1733 return sh_linear_l4_table(v) + shadow_l4_linear_offset(gw->va);
1736 static shadow_l3e_t * shadow_get_and_create_l3e(struct vcpu *v,
1737 walk_t *gw,
1738 mfn_t *sl3mfn,
1739 fetch_type_t ft,
1740 int *resync)
1742 mfn_t sl4mfn;
1743 shadow_l4e_t *sl4e;
1744 if ( !mfn_valid(gw->l3mfn) ) return NULL; /* No guest page. */
1745 /* Get the l4e */
1746 sl4e = shadow_get_and_create_l4e(v, gw, &sl4mfn);
1747 ASSERT(sl4e != NULL);
1748 if ( shadow_l4e_get_flags(*sl4e) & _PAGE_PRESENT )
1750 *sl3mfn = shadow_l4e_get_mfn(*sl4e);
1751 ASSERT(mfn_valid(*sl3mfn));
1753 else
1755 int r;
1756 shadow_l4e_t new_sl4e;
1757 /* No l3 shadow installed: find and install it. */
1758 *sl3mfn = get_shadow_status(v, gw->l3mfn, SH_type_l3_shadow);
1759 if ( !mfn_valid(*sl3mfn) )
1761 /* No l3 shadow of this page exists at all: make one. */
1762 *sl3mfn = sh_make_shadow(v, gw->l3mfn, SH_type_l3_shadow);
1764 /* Install the new sl3 table in the sl4e */
1765 l4e_propagate_from_guest(v, gw->l4e, *sl3mfn, &new_sl4e, ft);
1766 r = shadow_set_l4e(v, sl4e, new_sl4e, sl4mfn);
1767 ASSERT((r & SHADOW_SET_FLUSH) == 0);
1768 if ( r & SHADOW_SET_ERROR )
1769 return NULL;
1771 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC )
1772 *resync |= 1;
1773 #endif
1776 /* Now follow it down a level. Guaranteed to succeed. */
1777 return sh_linear_l3_table(v) + shadow_l3_linear_offset(gw->va);
1779 #endif /* GUEST_PAGING_LEVELS >= 4 */
1782 static shadow_l2e_t * shadow_get_and_create_l2e(struct vcpu *v,
1783 walk_t *gw,
1784 mfn_t *sl2mfn,
1785 fetch_type_t ft,
1786 int *resync)
1788 #if GUEST_PAGING_LEVELS >= 4 /* 64bit... */
1789 mfn_t sl3mfn = _mfn(INVALID_MFN);
1790 shadow_l3e_t *sl3e;
1791 if ( !mfn_valid(gw->l2mfn) ) return NULL; /* No guest page. */
1792 /* Get the l3e */
1793 sl3e = shadow_get_and_create_l3e(v, gw, &sl3mfn, ft, resync);
1794 if ( sl3e == NULL ) return NULL;
1795 if ( shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT )
1797 *sl2mfn = shadow_l3e_get_mfn(*sl3e);
1798 ASSERT(mfn_valid(*sl2mfn));
1800 else
1802 int r;
1803 shadow_l3e_t new_sl3e;
1804 unsigned int t = SH_type_l2_shadow;
1806 /* Tag compat L2 containing hypervisor (m2p) mappings */
1807 if ( is_pv_32on64_domain(v->domain) &&
1808 guest_l4_table_offset(gw->va) == 0 &&
1809 guest_l3_table_offset(gw->va) == 3 )
1810 t = SH_type_l2h_shadow;
1812 /* No l2 shadow installed: find and install it. */
1813 *sl2mfn = get_shadow_status(v, gw->l2mfn, t);
1814 if ( !mfn_valid(*sl2mfn) )
1816 /* No l2 shadow of this page exists at all: make one. */
1817 *sl2mfn = sh_make_shadow(v, gw->l2mfn, t);
1819 /* Install the new sl2 table in the sl3e */
1820 l3e_propagate_from_guest(v, gw->l3e, *sl2mfn, &new_sl3e, ft);
1821 r = shadow_set_l3e(v, sl3e, new_sl3e, sl3mfn);
1822 ASSERT((r & SHADOW_SET_FLUSH) == 0);
1823 if ( r & SHADOW_SET_ERROR )
1824 return NULL;
1826 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC )
1827 *resync |= 1;
1828 #endif
1831 /* Now follow it down a level. Guaranteed to succeed. */
1832 return sh_linear_l2_table(v) + shadow_l2_linear_offset(gw->va);
1833 #elif GUEST_PAGING_LEVELS == 3 /* PAE... */
1834 /* We never demand-shadow PAE l3es: they are only created in
1835 * sh_update_cr3(). Check if the relevant sl3e is present. */
1836 shadow_l3e_t *sl3e = ((shadow_l3e_t *)&v->arch.paging.shadow.l3table)
1837 + shadow_l3_linear_offset(gw->va);
1838 if ( !(shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT) )
1839 return NULL;
1840 *sl2mfn = shadow_l3e_get_mfn(*sl3e);
1841 ASSERT(mfn_valid(*sl2mfn));
1842 return sh_linear_l2_table(v) + shadow_l2_linear_offset(gw->va);
1843 #else /* 32bit... */
1844 /* There is always a shadow of the top level table. Get it. */
1845 *sl2mfn = pagetable_get_mfn(v->arch.shadow_table[0]);
1846 /* This next line is important: the guest l2 has a 16k
1847 * shadow, so we need to return the right mfn of the four. This
1848 * call will set it for us as a side-effect. */
1849 (void) shadow_l2_index(sl2mfn, guest_l2_table_offset(gw->va));
1850 /* Reading the top level table is always valid. */
1851 return sh_linear_l2_table(v) + shadow_l2_linear_offset(gw->va);
1852 #endif
1856 static shadow_l1e_t * shadow_get_and_create_l1e(struct vcpu *v,
1857 walk_t *gw,
1858 mfn_t *sl1mfn,
1859 fetch_type_t ft)
1861 mfn_t sl2mfn;
1862 int resync = 0;
1863 shadow_l2e_t *sl2e;
1865 /* Get the l2e */
1866 sl2e = shadow_get_and_create_l2e(v, gw, &sl2mfn, ft, &resync);
1867 if ( sl2e == NULL ) return NULL;
1869 /* Install the sl1 in the l2e if it wasn't there or if we need to
1870 * re-do it to fix a PSE dirty bit. */
1871 if ( shadow_l2e_get_flags(*sl2e) & _PAGE_PRESENT
1872 && likely(ft != ft_demand_write
1873 || (shadow_l2e_get_flags(*sl2e) & _PAGE_RW)
1874 || !(guest_l2e_get_flags(gw->l2e) & _PAGE_PSE)) )
1876 *sl1mfn = shadow_l2e_get_mfn(*sl2e);
1877 ASSERT(mfn_valid(*sl1mfn));
1879 else
1881 shadow_l2e_t new_sl2e;
1882 int r, flags = guest_l2e_get_flags(gw->l2e);
1883 /* No l1 shadow installed: find and install it. */
1884 if ( !(flags & _PAGE_PRESENT) )
1885 return NULL; /* No guest page. */
1886 if ( guest_supports_superpages(v) && (flags & _PAGE_PSE) )
1888 /* Splintering a superpage */
1889 gfn_t l2gfn = guest_l2e_get_gfn(gw->l2e);
1890 *sl1mfn = get_fl1_shadow_status(v, l2gfn);
1891 if ( !mfn_valid(*sl1mfn) )
1893 /* No fl1 shadow of this superpage exists at all: make one. */
1894 *sl1mfn = make_fl1_shadow(v, l2gfn);
1897 else
1899 /* Shadowing an actual guest l1 table */
1900 if ( !mfn_valid(gw->l1mfn) ) return NULL; /* No guest page. */
1901 *sl1mfn = get_shadow_status(v, gw->l1mfn, SH_type_l1_shadow);
1902 if ( !mfn_valid(*sl1mfn) )
1904 /* No l1 shadow of this page exists at all: make one. */
1905 *sl1mfn = sh_make_shadow(v, gw->l1mfn, SH_type_l1_shadow);
1908 /* Install the new sl1 table in the sl2e */
1909 l2e_propagate_from_guest(v, gw->l2e, *sl1mfn, &new_sl2e, ft);
1910 r = shadow_set_l2e(v, sl2e, new_sl2e, sl2mfn);
1911 ASSERT((r & SHADOW_SET_FLUSH) == 0);
1912 if ( r & SHADOW_SET_ERROR )
1913 return NULL;
1915 /* This next line is important: in 32-on-PAE and 32-on-64 modes,
1916 * the guest l1 table has an 8k shadow, and we need to return
1917 * the right mfn of the pair. This call will set it for us as a
1918 * side-effect. (In all other cases, it's a no-op and will be
1919 * compiled out.) */
1920 (void) shadow_l1_index(sl1mfn, guest_l1_table_offset(gw->va));
1923 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC )
1924 /* All pages walked are now pagetables. Safe to resync pages
1925 in case level 4 or 3 shadows were set. */
1926 if ( resync )
1927 shadow_resync_all(v, 0);
1928 #endif
1930 /* Now follow it down a level. Guaranteed to succeed. */
1931 return sh_linear_l1_table(v) + shadow_l1_linear_offset(gw->va);
1936 /**************************************************************************/
1937 /* Destructors for shadow tables:
1938 * Unregister the shadow, decrement refcounts of any entries present in it,
1939 * and release the memory.
1941 * N.B. These destructors do not clear the contents of the shadows.
1942 * This allows us to delay TLB shootdowns until the page is being reused.
1943 * See shadow_alloc() and shadow_free() for how this is handled.
1944 */
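/* A sketch of the common skeleton the destructors below follow (see
 * sh_destroy_l2_shadow() for a concrete instance; the fl1 case and level
 * differences are omitted here):
 *
 *     gmfn = _mfn(mfn_to_page(smfn)->v.sh.back);  // back-pointer to guest page
 *     delete_shadow_status(v, gmfn, t, smfn);     // unhook from the hash table
 *     shadow_demote(v, gmfn, t);                  // guest page no longer shadowed
 *     SHADOW_FOREACH_L2E(smfn, sl2e, 0, 0, v->domain, {
 *         if ( shadow_l2e_get_flags(*sl2e) & _PAGE_PRESENT )
 *             sh_put_ref(v, shadow_l2e_get_mfn(*sl2e), ...);  // drop entry refs
 *     });
 *     shadow_free(v->domain, smfn);               // return the page to the pool
 */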
1946 #if GUEST_PAGING_LEVELS >= 4
1947 void sh_destroy_l4_shadow(struct vcpu *v, mfn_t smfn)
1949 shadow_l4e_t *sl4e;
1950 u32 t = mfn_to_page(smfn)->u.sh.type;
1951 mfn_t gmfn, sl4mfn;
1953 SHADOW_DEBUG(DESTROY_SHADOW,
1954 "%s(%05lx)\n", __func__, mfn_x(smfn));
1955 ASSERT(t == SH_type_l4_shadow);
1957 /* Record that the guest page isn't shadowed any more (in this type) */
1958 gmfn = _mfn(mfn_to_page(smfn)->v.sh.back);
1959 delete_shadow_status(v, gmfn, t, smfn);
1960 shadow_demote(v, gmfn, t);
1961 /* Decrement refcounts of all the old entries */
1962 sl4mfn = smfn;
1963 SHADOW_FOREACH_L4E(sl4mfn, sl4e, 0, 0, v->domain, {
1964 if ( shadow_l4e_get_flags(*sl4e) & _PAGE_PRESENT )
1966 sh_put_ref(v, shadow_l4e_get_mfn(*sl4e),
1967 (((paddr_t)mfn_x(sl4mfn)) << PAGE_SHIFT)
1968 | ((unsigned long)sl4e & ~PAGE_MASK));
1970 });
1972 /* Put the memory back in the pool */
1973 shadow_free(v->domain, smfn);
1976 void sh_destroy_l3_shadow(struct vcpu *v, mfn_t smfn)
1978 shadow_l3e_t *sl3e;
1979 u32 t = mfn_to_page(smfn)->u.sh.type;
1980 mfn_t gmfn, sl3mfn;
1982 SHADOW_DEBUG(DESTROY_SHADOW,
1983 "%s(%05lx)\n", __func__, mfn_x(smfn));
1984 ASSERT(t == SH_type_l3_shadow);
1986 /* Record that the guest page isn't shadowed any more (in this type) */
1987 gmfn = _mfn(mfn_to_page(smfn)->v.sh.back);
1988 delete_shadow_status(v, gmfn, t, smfn);
1989 shadow_demote(v, gmfn, t);
1991 /* Decrement refcounts of all the old entries */
1992 sl3mfn = smfn;
1993 SHADOW_FOREACH_L3E(sl3mfn, sl3e, 0, 0, {
1994 if ( shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT )
1995 sh_put_ref(v, shadow_l3e_get_mfn(*sl3e),
1996 (((paddr_t)mfn_x(sl3mfn)) << PAGE_SHIFT)
1997 | ((unsigned long)sl3e & ~PAGE_MASK));
1998 });
2000 /* Put the memory back in the pool */
2001 shadow_free(v->domain, smfn);
2003 #endif /* GUEST_PAGING_LEVELS >= 4 */
2006 void sh_destroy_l2_shadow(struct vcpu *v, mfn_t smfn)
2008 shadow_l2e_t *sl2e;
2009 u32 t = mfn_to_page(smfn)->u.sh.type;
2010 mfn_t gmfn, sl2mfn;
2012 SHADOW_DEBUG(DESTROY_SHADOW,
2013 "%s(%05lx)\n", __func__, mfn_x(smfn));
2015 #if GUEST_PAGING_LEVELS >= 3
2016 ASSERT(t == SH_type_l2_shadow || t == SH_type_l2h_shadow);
2017 #else
2018 ASSERT(t == SH_type_l2_shadow);
2019 #endif
2021 /* Record that the guest page isn't shadowed any more (in this type) */
2022 gmfn = _mfn(mfn_to_page(smfn)->v.sh.back);
2023 delete_shadow_status(v, gmfn, t, smfn);
2024 shadow_demote(v, gmfn, t);
2026 /* Decrement refcounts of all the old entries */
2027 sl2mfn = smfn;
2028 SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, 0, v->domain, {
2029 if ( shadow_l2e_get_flags(*sl2e) & _PAGE_PRESENT )
2030 sh_put_ref(v, shadow_l2e_get_mfn(*sl2e),
2031 (((paddr_t)mfn_x(sl2mfn)) << PAGE_SHIFT)
2032 | ((unsigned long)sl2e & ~PAGE_MASK));
2033 });
2035 /* Put the memory back in the pool */
2036 shadow_free(v->domain, smfn);
2039 void sh_destroy_l1_shadow(struct vcpu *v, mfn_t smfn)
2041 struct domain *d = v->domain;
2042 shadow_l1e_t *sl1e;
2043 u32 t = mfn_to_page(smfn)->u.sh.type;
2045 SHADOW_DEBUG(DESTROY_SHADOW,
2046 "%s(%05lx)\n", __func__, mfn_x(smfn));
2047 ASSERT(t == SH_type_l1_shadow || t == SH_type_fl1_shadow);
2049 /* Record that the guest page isn't shadowed any more (in this type) */
2050 if ( t == SH_type_fl1_shadow )
2052 gfn_t gfn = _gfn(mfn_to_page(smfn)->v.sh.back);
2053 delete_fl1_shadow_status(v, gfn, smfn);
2055 else
2057 mfn_t gmfn = _mfn(mfn_to_page(smfn)->v.sh.back);
2058 delete_shadow_status(v, gmfn, t, smfn);
2059 shadow_demote(v, gmfn, t);
2062 if ( shadow_mode_refcounts(d) )
2064 /* Decrement refcounts of all the old entries */
2065 mfn_t sl1mfn = smfn;
2066 SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, 0, {
2067 if ( (shadow_l1e_get_flags(*sl1e) & _PAGE_PRESENT)
2068 && !sh_l1e_is_magic(*sl1e) ) {
2069 shadow_vram_put_l1e(*sl1e, sl1e, sl1mfn, d);
2070 shadow_put_page_from_l1e(*sl1e, d);
2072 });
2075 /* Put the memory back in the pool */
2076 shadow_free(v->domain, smfn);
2079 #if SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS
2080 void sh_destroy_monitor_table(struct vcpu *v, mfn_t mmfn)
2082 struct domain *d = v->domain;
2083 ASSERT(mfn_to_page(mmfn)->u.sh.type == SH_type_monitor_table);
2085 #if (CONFIG_PAGING_LEVELS == 4) && (SHADOW_PAGING_LEVELS != 4)
2087 mfn_t m3mfn;
2088 l4_pgentry_t *l4e = sh_map_domain_page(mmfn);
2089 l3_pgentry_t *l3e;
2090 int linear_slot = shadow_l4_table_offset(SH_LINEAR_PT_VIRT_START);
2092 /* Need to destroy the l3 and l2 monitor pages used
2093 * for the linear map */
2094 ASSERT(l4e_get_flags(l4e[linear_slot]) & _PAGE_PRESENT);
2095 m3mfn = _mfn(l4e_get_pfn(l4e[linear_slot]));
2096 l3e = sh_map_domain_page(m3mfn);
2097 ASSERT(l3e_get_flags(l3e[0]) & _PAGE_PRESENT);
2098 shadow_free(d, _mfn(l3e_get_pfn(l3e[0])));
2099 sh_unmap_domain_page(l3e);
2100 shadow_free(d, m3mfn);
2102 if ( is_pv_32on64_vcpu(v) )
2104 /* Need to destroy the l3 and l2 monitor pages that map the
2105 * Xen VAs at 3GB-4GB */
2106 ASSERT(l4e_get_flags(l4e[0]) & _PAGE_PRESENT);
2107 m3mfn = _mfn(l4e_get_pfn(l4e[0]));
2108 l3e = sh_map_domain_page(m3mfn);
2109 ASSERT(l3e_get_flags(l3e[3]) & _PAGE_PRESENT);
2110 shadow_free(d, _mfn(l3e_get_pfn(l3e[3])));
2111 sh_unmap_domain_page(l3e);
2112 shadow_free(d, m3mfn);
2114 sh_unmap_domain_page(l4e);
2116 #elif CONFIG_PAGING_LEVELS == 3
2117 /* Need to destroy the l2 monitor page in slot 3 too */
2119 l3_pgentry_t *l3e = sh_map_domain_page(mmfn);
2120 ASSERT(l3e_get_flags(l3e[3]) & _PAGE_PRESENT);
2121 shadow_free(d, _mfn(l3e_get_pfn(l3e[3])));
2122 sh_unmap_domain_page(l3e);
2124 #endif
2126 /* Put the memory back in the pool */
2127 shadow_free(d, mmfn);
2129 #endif
2131 /**************************************************************************/
2132 /* Functions to destroy non-Xen mappings in a pagetable hierarchy.
2133 * These are called from common code when we are running out of shadow
2134 * memory, and unpinning all the top-level shadows hasn't worked.
2136 * This implementation is pretty crude and slow, but we hope that it won't
2137 * be called very often. */
2139 #if GUEST_PAGING_LEVELS == 2
2141 void sh_unhook_32b_mappings(struct vcpu *v, mfn_t sl2mfn)
2143 shadow_l2e_t *sl2e;
2144 SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, 0, v->domain, {
2145 (void) shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn);
2146 });
2149 #elif GUEST_PAGING_LEVELS == 3
2151 void sh_unhook_pae_mappings(struct vcpu *v, mfn_t sl2mfn)
2152 /* Walk a PAE l2 shadow, unhooking entries from all the subshadows */
2154 shadow_l2e_t *sl2e;
2155 SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, 0, v->domain, {
2156 (void) shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn);
2157 });
2160 #elif GUEST_PAGING_LEVELS == 4
2162 void sh_unhook_64b_mappings(struct vcpu *v, mfn_t sl4mfn)
2164 shadow_l4e_t *sl4e;
2165 SHADOW_FOREACH_L4E(sl4mfn, sl4e, 0, 0, v->domain, {
2166 (void) shadow_set_l4e(v, sl4e, shadow_l4e_empty(), sl4mfn);
2167 });
2170 #endif
2172 /**************************************************************************/
2173 /* Internal translation functions.
2174 * These functions require a pointer to the shadow entry that will be updated.
2175 */
2177 /* These functions take a new guest entry, translate it to shadow and write
2178 * the shadow entry.
2180 * They return the same bitmaps as the shadow_set_lXe() functions.
2181 */
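/* A sketch of how a caller consumes the returned bitmap (the flags are the
 * SHADOW_SET_* values used by the shadow_set_lXe() functions elsewhere in
 * this file):
 *
 *     int rc = validate_gl2e(v, &new_gl2e, sl2mfn, sl2p);
 *     if ( rc & SHADOW_SET_ERROR )
 *         // the shadow entry could not be installed
 *     if ( rc & SHADOW_SET_FLUSH )
 *         // a TLB flush is needed before re-entering the guest
 */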
2183 #if GUEST_PAGING_LEVELS >= 4
2184 static int validate_gl4e(struct vcpu *v, void *new_ge, mfn_t sl4mfn, void *se)
2186 shadow_l4e_t new_sl4e;
2187 guest_l4e_t new_gl4e = *(guest_l4e_t *)new_ge;
2188 shadow_l4e_t *sl4p = se;
2189 mfn_t sl3mfn = _mfn(INVALID_MFN);
2190 struct domain *d = v->domain;
2191 p2m_type_t p2mt;
2192 int result = 0;
2194 perfc_incr(shadow_validate_gl4e_calls);
2196 if ( guest_l4e_get_flags(new_gl4e) & _PAGE_PRESENT )
2198 gfn_t gl3gfn = guest_l4e_get_gfn(new_gl4e);
2199 mfn_t gl3mfn = gfn_to_mfn_query(d, gl3gfn, &p2mt);
2200 if ( p2m_is_ram(p2mt) )
2201 sl3mfn = get_shadow_status(v, gl3mfn, SH_type_l3_shadow);
2202 else if ( p2mt != p2m_populate_on_demand )
2203 result |= SHADOW_SET_ERROR;
2205 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC )
2206 if ( mfn_valid(sl3mfn) )
2207 shadow_resync_all(v, 0);
2208 #endif
2210 l4e_propagate_from_guest(v, new_gl4e, sl3mfn, &new_sl4e, ft_prefetch);
2212 // check for updates to xen reserved slots
2213 if ( !shadow_mode_external(d) )
2215 int shadow_index = (((unsigned long)sl4p & ~PAGE_MASK) /
2216 sizeof(shadow_l4e_t));
2217 int reserved_xen_slot = !is_guest_l4_slot(d, shadow_index);
2219 if ( unlikely(reserved_xen_slot) )
2221 // attempt by the guest to write to a xen reserved slot
2222 //
2223 SHADOW_PRINTK("%s out-of-range update "
2224 "sl4mfn=%05lx index=0x%x val=%" SH_PRI_pte "\n",
2225 __func__, mfn_x(sl4mfn), shadow_index, new_sl4e.l4);
2226 if ( shadow_l4e_get_flags(new_sl4e) & _PAGE_PRESENT )
2228 SHADOW_ERROR("out-of-range l4e update\n");
2229 result |= SHADOW_SET_ERROR;
2232 // do not call shadow_set_l4e...
2233 return result;
2237 result |= shadow_set_l4e(v, sl4p, new_sl4e, sl4mfn);
2238 return result;
2242 static int validate_gl3e(struct vcpu *v, void *new_ge, mfn_t sl3mfn, void *se)
2244 shadow_l3e_t new_sl3e;
2245 guest_l3e_t new_gl3e = *(guest_l3e_t *)new_ge;
2246 shadow_l3e_t *sl3p = se;
2247 mfn_t sl2mfn = _mfn(INVALID_MFN);
2248 p2m_type_t p2mt;
2249 int result = 0;
2251 perfc_incr(shadow_validate_gl3e_calls);
2253 if ( guest_l3e_get_flags(new_gl3e) & _PAGE_PRESENT )
2255 gfn_t gl2gfn = guest_l3e_get_gfn(new_gl3e);
2256 mfn_t gl2mfn = gfn_to_mfn_query(v->domain, gl2gfn, &p2mt);
2257 if ( p2m_is_ram(p2mt) )
2258 sl2mfn = get_shadow_status(v, gl2mfn, SH_type_l2_shadow);
2259 else if ( p2mt != p2m_populate_on_demand )
2260 result |= SHADOW_SET_ERROR;
2262 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC )
2263 if ( mfn_valid(sl2mfn) )
2264 shadow_resync_all(v, 0);
2265 #endif
2267 l3e_propagate_from_guest(v, new_gl3e, sl2mfn, &new_sl3e, ft_prefetch);
2268 result |= shadow_set_l3e(v, sl3p, new_sl3e, sl3mfn);
2270 return result;
2272 #endif // GUEST_PAGING_LEVELS >= 4
2274 static int validate_gl2e(struct vcpu *v, void *new_ge, mfn_t sl2mfn, void *se)
2276 shadow_l2e_t new_sl2e;
2277 guest_l2e_t new_gl2e = *(guest_l2e_t *)new_ge;
2278 shadow_l2e_t *sl2p = se;
2279 mfn_t sl1mfn = _mfn(INVALID_MFN);
2280 p2m_type_t p2mt;
2281 int result = 0;
2283 perfc_incr(shadow_validate_gl2e_calls);
2285 if ( guest_l2e_get_flags(new_gl2e) & _PAGE_PRESENT )
2287 gfn_t gl1gfn = guest_l2e_get_gfn(new_gl2e);
2288 if ( guest_supports_superpages(v) &&
2289 (guest_l2e_get_flags(new_gl2e) & _PAGE_PSE) )
2291 // superpage -- need to look up the shadow L1 which holds the
2292 // splitters...
2293 sl1mfn = get_fl1_shadow_status(v, gl1gfn);
2294 #if 0
2295 // XXX - it's possible that we want to do some kind of prefetch
2296 // for superpage fl1's here, but this is *not* on the demand path,
2297 // so we'll hold off trying that for now...
2298 //
2299 if ( !mfn_valid(sl1mfn) )
2300 sl1mfn = make_fl1_shadow(v, gl1gfn);
2301 #endif
2303 else
2305 mfn_t gl1mfn = gfn_to_mfn_query(v->domain, gl1gfn, &p2mt);
2306 if ( p2m_is_ram(p2mt) )
2307 sl1mfn = get_shadow_status(v, gl1mfn, SH_type_l1_shadow);
2308 else if ( p2mt != p2m_populate_on_demand )
2309 result |= SHADOW_SET_ERROR;
2312 l2e_propagate_from_guest(v, new_gl2e, sl1mfn, &new_sl2e, ft_prefetch);
2314 // check for updates to xen reserved slots in PV guests...
2315 // XXX -- need to revisit this for PV 3-on-4 guests.
2316 //
2317 #if SHADOW_PAGING_LEVELS < 4
2318 #if CONFIG_PAGING_LEVELS == SHADOW_PAGING_LEVELS
2319 if ( !shadow_mode_external(v->domain) )
2321 int shadow_index = (((unsigned long)sl2p & ~PAGE_MASK) /
2322 sizeof(shadow_l2e_t));
2323 int reserved_xen_slot;
2325 #if SHADOW_PAGING_LEVELS == 3
2326 reserved_xen_slot =
2327 ((mfn_to_page(sl2mfn)->u.sh.type == SH_type_l2h_pae_shadow) &&
2328 (shadow_index
2329 >= (L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1))));
2330 #else /* SHADOW_PAGING_LEVELS == 2 */
2331 reserved_xen_slot = (shadow_index >= L2_PAGETABLE_FIRST_XEN_SLOT);
2332 #endif
2334 if ( unlikely(reserved_xen_slot) )
2336 // attempt by the guest to write to a xen reserved slot
2337 //
2338 SHADOW_PRINTK("%s out-of-range update "
2339 "sl2mfn=%05lx index=0x%x val=%" SH_PRI_pte "\n",
2340 __func__, mfn_x(sl2mfn), shadow_index, new_sl2e.l2);
2341 if ( shadow_l2e_get_flags(new_sl2e) & _PAGE_PRESENT )
2343 SHADOW_ERROR("out-of-range l2e update\n");
2344 result |= SHADOW_SET_ERROR;
2347 // do not call shadow_set_l2e...
2348 return result;
2351 #endif /* CONFIG_PAGING_LEVELS == SHADOW_PAGING_LEVELS */
2352 #endif /* SHADOW_PAGING_LEVELS < 4 */
2354 result |= shadow_set_l2e(v, sl2p, new_sl2e, sl2mfn);
2356 return result;
2359 static int validate_gl1e(struct vcpu *v, void *new_ge, mfn_t sl1mfn, void *se)
2361 shadow_l1e_t new_sl1e;
2362 guest_l1e_t new_gl1e = *(guest_l1e_t *)new_ge;
2363 shadow_l1e_t *sl1p = se;
2364 gfn_t gfn;
2365 mfn_t gmfn;
2366 p2m_type_t p2mt;
2367 int result = 0;
2368 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
2369 mfn_t gl1mfn;
2370 #endif /* OOS */
2372 perfc_incr(shadow_validate_gl1e_calls);
2374 gfn = guest_l1e_get_gfn(new_gl1e);
2375 gmfn = gfn_to_mfn_query(v->domain, gfn, &p2mt);
2377 l1e_propagate_from_guest(v, new_gl1e, gmfn, &new_sl1e, ft_prefetch, p2mt);
2378 result |= shadow_set_l1e(v, sl1p, new_sl1e, sl1mfn);
2380 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
2381 gl1mfn = _mfn(mfn_to_page(sl1mfn)->v.sh.back);
2382 if ( mfn_valid(gl1mfn)
2383 && mfn_is_out_of_sync(gl1mfn) )
2385 /* Update the OOS snapshot. */
2386 mfn_t snpmfn = oos_snapshot_lookup(v, gl1mfn);
2387 guest_l1e_t *snp;
2389 ASSERT(mfn_valid(snpmfn));
2391 snp = sh_map_domain_page(snpmfn);
2392 snp[guest_index(new_ge)] = new_gl1e;
2393 sh_unmap_domain_page(snp);
2395 #endif /* OOS */
2397 return result;
2400 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
2401 /**************************************************************************/
2402 /* Special validation function for re-syncing out-of-sync shadows.
2403 * Walks the *shadow* page, and for every entry that it finds,
2404 * revalidates the guest entry that corresponds to it.
2405 * N.B. This function is called with the vcpu that unsynced the page,
2406 * *not* the one that is causing it to be resynced. */
2407 void sh_resync_l1(struct vcpu *v, mfn_t gl1mfn, mfn_t snpmfn)
2409 mfn_t sl1mfn;
2410 shadow_l1e_t *sl1p;
2411 guest_l1e_t *gl1p, *gp, *snp;
2412 int rc = 0;
2414 ASSERT(mfn_valid(snpmfn));
2416 sl1mfn = get_shadow_status(v, gl1mfn, SH_type_l1_shadow);
2417 ASSERT(mfn_valid(sl1mfn)); /* Otherwise we would not have been called */
2419 snp = sh_map_domain_page(snpmfn);
2420 gp = sh_map_domain_page(gl1mfn);
2421 gl1p = gp;
2423 SHADOW_FOREACH_L1E(sl1mfn, sl1p, &gl1p, 0, {
2424 guest_l1e_t gl1e = *gl1p;
2425 guest_l1e_t *snpl1p = (guest_l1e_t *)snp + guest_index(gl1p);
2427 if ( memcmp(snpl1p, &gl1e, sizeof(gl1e)) )
2429 gfn_t gfn;
2430 mfn_t gmfn;
2431 p2m_type_t p2mt;
2432 shadow_l1e_t nsl1e;
2434 gfn = guest_l1e_get_gfn(gl1e);
2435 gmfn = gfn_to_mfn_query(v->domain, gfn, &p2mt);
2436 l1e_propagate_from_guest(v, gl1e, gmfn, &nsl1e, ft_prefetch, p2mt);
2437 rc |= shadow_set_l1e(v, sl1p, nsl1e, sl1mfn);
2439 *snpl1p = gl1e;
2441 });
2443 sh_unmap_domain_page(gp);
2444 sh_unmap_domain_page(snp);
2446 /* Setting shadow L1 entries should never need us to flush the TLB */
2447 ASSERT(!(rc & SHADOW_SET_FLUSH));
2450 /* Figure out whether it's definitely safe not to sync this l1 table.
2451 * That is: if we can tell that it's only used once, and that the
2452 * toplevel shadow responsible is not one of ours.
2453 * N.B. This function is called with the vcpu that required the resync,
2454 * *not* the one that originally unsynced the page, but it is
2455 * called in the *mode* of the vcpu that unsynced it. Clear? Good. */
2456 int sh_safe_not_to_sync(struct vcpu *v, mfn_t gl1mfn)
2458 struct page_info *sp;
2459 mfn_t smfn;
2461 smfn = get_shadow_status(v, gl1mfn, SH_type_l1_shadow);
2462 ASSERT(mfn_valid(smfn)); /* Otherwise we would not have been called */
2464 /* Up to l2 */
2465 sp = mfn_to_page(smfn);
2466 if ( sp->u.sh.count != 1 || !sp->up )
2467 return 0;
2468 smfn = _mfn(sp->up >> PAGE_SHIFT);
2469 ASSERT(mfn_valid(smfn));
2471 #if (SHADOW_PAGING_LEVELS == 4)
2472 /* up to l3 */
2473 sp = mfn_to_page(smfn);
2474 if ( sp->u.sh.count != 1 || !sp->up )
2475 return 0;
2476 smfn = _mfn(sp->up >> PAGE_SHIFT);
2477 ASSERT(mfn_valid(smfn));
2479 /* up to l4 */
2480 sp = mfn_to_page(smfn);
2481 if ( sp->u.sh.count != 1
2482 || sh_type_is_pinnable(v, SH_type_l3_64_shadow) || !sp->up )
2483 return 0;
2484 smfn = _mfn(sp->up >> PAGE_SHIFT);
2485 ASSERT(mfn_valid(smfn));
2486 #endif
2488 #if (GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS == 3)
2489 /* In 2-on-3 shadow mode the up pointer contains the link to the
2490 * shadow page, but the shadow_table contains only the first of the
2491 * four pages that make up the PAE top shadow tables. */
2492 smfn = _mfn(mfn_x(smfn) & ~0x3UL);
2493 #endif
2495 if ( pagetable_get_pfn(v->arch.shadow_table[0]) == mfn_x(smfn)
2496 #if (SHADOW_PAGING_LEVELS == 3)
2497 || pagetable_get_pfn(v->arch.shadow_table[1]) == mfn_x(smfn)
2498 || pagetable_get_pfn(v->arch.shadow_table[2]) == mfn_x(smfn)
2499 || pagetable_get_pfn(v->arch.shadow_table[3]) == mfn_x(smfn)
2500 #endif
2502 return 0;
2504 /* Only in use in one toplevel shadow, and it's not the one we're
2505 * running on */
2506 return 1;
2508 #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) */
2511 /**************************************************************************/
2512 /* Functions which translate and install the shadows of arbitrary guest
2513 * entries that we have just seen the guest write. */
2516 static inline int
2517 sh_map_and_validate(struct vcpu *v, mfn_t gmfn,
2518 void *new_gp, u32 size, u32 sh_type,
2519 u32 (*shadow_index)(mfn_t *smfn, u32 idx),
2520 int (*validate_ge)(struct vcpu *v, void *ge,
2521 mfn_t smfn, void *se))
2522 /* Generic function for mapping and validating. */
2524 mfn_t smfn, smfn2, map_mfn;
2525 shadow_l1e_t *sl1p;
2526 u32 shadow_idx, guest_idx;
2527 int result = 0;
2529 /* Align address and size to guest entry boundaries */
2530 size += (unsigned long)new_gp & (sizeof (guest_l1e_t) - 1);
2531 new_gp = (void *)((unsigned long)new_gp & ~(sizeof (guest_l1e_t) - 1));
2532 size = (size + sizeof (guest_l1e_t) - 1) & ~(sizeof (guest_l1e_t) - 1);
2533 ASSERT(size + (((unsigned long)new_gp) & ~PAGE_MASK) <= PAGE_SIZE);
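/* Worked example of the alignment above, assuming 8-byte guest entries
 * (PAE or 64-bit guests): a 4-byte write at an address ending in ...0x1c
 * gives size = 4 + (0x1c & 7) = 8, new_gp rounded down to ...0x18, and
 * size rounded up to 8 -- i.e. the partially written entry is revalidated
 * in full. */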
2535 /* Map the shadow page */
2536 smfn = get_shadow_status(v, gmfn, sh_type);
2537 ASSERT(mfn_valid(smfn)); /* Otherwise we would not have been called */
2538 guest_idx = guest_index(new_gp);
2539 map_mfn = smfn;
2540 shadow_idx = shadow_index(&map_mfn, guest_idx);
2541 sl1p = sh_map_domain_page(map_mfn);
2543 /* Validate one entry at a time */
2544 while ( size )
2546 smfn2 = smfn;
2547 guest_idx = guest_index(new_gp);
2548 shadow_idx = shadow_index(&smfn2, guest_idx);
2549 if ( mfn_x(smfn2) != mfn_x(map_mfn) )
2551 /* We have moved to another page of the shadow */
2552 map_mfn = smfn2;
2553 sh_unmap_domain_page(sl1p);
2554 sl1p = sh_map_domain_page(map_mfn);
2556 result |= validate_ge(v,
2557 new_gp,
2558 map_mfn,
2559 &sl1p[shadow_idx]);
2560 size -= sizeof(guest_l1e_t);
2561 new_gp += sizeof(guest_l1e_t);
2563 sh_unmap_domain_page(sl1p);
2564 return result;
2568 int
2569 sh_map_and_validate_gl4e(struct vcpu *v, mfn_t gl4mfn,
2570 void *new_gl4p, u32 size)
2572 #if GUEST_PAGING_LEVELS >= 4
2573 return sh_map_and_validate(v, gl4mfn, new_gl4p, size,
2574 SH_type_l4_shadow,
2575 shadow_l4_index,
2576 validate_gl4e);
2577 #else // ! GUEST_PAGING_LEVELS >= 4
2578 SHADOW_ERROR("called in wrong paging mode!\n");
2579 BUG();
2580 return 0;
2581 #endif
2584 int
2585 sh_map_and_validate_gl3e(struct vcpu *v, mfn_t gl3mfn,
2586 void *new_gl3p, u32 size)
2588 #if GUEST_PAGING_LEVELS >= 4
2589 return sh_map_and_validate(v, gl3mfn, new_gl3p, size,
2590 SH_type_l3_shadow,
2591 shadow_l3_index,
2592 validate_gl3e);
2593 #else // ! GUEST_PAGING_LEVELS >= 4
2594 SHADOW_ERROR("called in wrong paging mode!\n");
2595 BUG();
2596 return 0;
2597 #endif
2600 int
2601 sh_map_and_validate_gl2e(struct vcpu *v, mfn_t gl2mfn,
2602 void *new_gl2p, u32 size)
2604 return sh_map_and_validate(v, gl2mfn, new_gl2p, size,
2605 SH_type_l2_shadow,
2606 shadow_l2_index,
2607 validate_gl2e);
2610 int
2611 sh_map_and_validate_gl2he(struct vcpu *v, mfn_t gl2mfn,
2612 void *new_gl2p, u32 size)
2614 #if GUEST_PAGING_LEVELS >= 3
2615 return sh_map_and_validate(v, gl2mfn, new_gl2p, size,
2616 SH_type_l2h_shadow,
2617 shadow_l2_index,
2618 validate_gl2e);
2619 #else /* Non-PAE guests don't have different kinds of l2 table */
2620 SHADOW_ERROR("called in wrong paging mode!\n");
2621 BUG();
2622 return 0;
2623 #endif
2626 int
2627 sh_map_and_validate_gl1e(struct vcpu *v, mfn_t gl1mfn,
2628 void *new_gl1p, u32 size)
2630 return sh_map_and_validate(v, gl1mfn, new_gl1p, size,
2631 SH_type_l1_shadow,
2632 shadow_l1_index,
2633 validate_gl1e);
2637 /**************************************************************************/
2638 /* Optimization: If we see two emulated writes of zeros to the same
2639 * page-table without another kind of page fault in between, we guess
2640 * that this is a batch of changes (for process destruction) and
2641 * unshadow the page so we don't take a pagefault on every entry. This
2642 * should also make finding writeable mappings of pagetables much
2643 * easier. */
2645 /* Look to see if this is the second emulated write in a row to this
2646 * page, and unshadow if it is */
2647 static inline void check_for_early_unshadow(struct vcpu *v, mfn_t gmfn)
2649 #if SHADOW_OPTIMIZATIONS & SHOPT_EARLY_UNSHADOW
2650 if ( v->arch.paging.shadow.last_emulated_mfn_for_unshadow == mfn_x(gmfn)
2651 && sh_mfn_is_a_page_table(gmfn) )
2653 perfc_incr(shadow_early_unshadow);
2654 sh_remove_shadows(v, gmfn, 1, 0 /* Fast, can fail to unshadow */ );
2655 TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_EARLY_UNSHADOW);
2657 v->arch.paging.shadow.last_emulated_mfn_for_unshadow = mfn_x(gmfn);
2658 #endif
2661 /* Stop counting towards early unshadows, as we've seen a real page fault */
2662 static inline void reset_early_unshadow(struct vcpu *v)
2664 #if SHADOW_OPTIMIZATIONS & SHOPT_EARLY_UNSHADOW
2665 v->arch.paging.shadow.last_emulated_mfn_for_unshadow = INVALID_MFN;
2666 #endif
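/* Intended usage, sketched (the real call sites are in the emulated-write
 * and fault paths elsewhere in this file):
 *
 *     // after emulating a write of zeros to what is currently a pagetable:
 *     check_for_early_unshadow(v, gmfn);
 *     // on any other kind of page fault:
 *     reset_early_unshadow(v);
 */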
2671 /**************************************************************************/
2672 /* Optimization: Prefetch multiple L1 entries. This is called after we have
2673 * demand-faulted a shadow l1e in the fault handler, to see if it's
2674 * worth fetching some more.
2675 */
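/* A worked note on the prefetch distance used below: the loop fetches at
 * most (PAGE_SIZE - offset-of-faulting-sl1e) / sizeof(sl1e) entries, so it
 * never crosses the end of the shadow l1 page, and PREFETCH_DISTANCE caps
 * that at 32; since the loop starts at i = 1, at most 31 extra entries are
 * propagated beyond the one that faulted. */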
2677 #if SHADOW_OPTIMIZATIONS & SHOPT_PREFETCH
2679 /* XXX magic number */
2680 #define PREFETCH_DISTANCE 32
2682 static void sh_prefetch(struct vcpu *v, walk_t *gw,
2683 shadow_l1e_t *ptr_sl1e, mfn_t sl1mfn)
2685 int i, dist;
2686 gfn_t gfn;
2687 mfn_t gmfn;
2688 guest_l1e_t *gl1p = NULL, gl1e;
2689 shadow_l1e_t sl1e;
2690 u32 gflags;
2691 p2m_type_t p2mt;
2692 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
2693 guest_l1e_t *snpl1p = NULL;
2694 #endif /* OOS */
2697 /* Prefetch no further than the end of the _shadow_ l1 MFN */
2698 dist = (PAGE_SIZE - ((unsigned long)ptr_sl1e & ~PAGE_MASK)) / sizeof sl1e;
2699 /* And no more than a maximum fetches-per-fault */
2700 if ( dist > PREFETCH_DISTANCE )
2701 dist = PREFETCH_DISTANCE;
2703 if ( mfn_valid(gw->l1mfn) )
2705 /* Normal guest page; grab the next guest entry */
2706 gl1p = sh_map_domain_page(gw->l1mfn);
2707 gl1p += guest_l1_table_offset(gw->va);
2709 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
2710 if ( mfn_is_out_of_sync(gw->l1mfn) )
2712 mfn_t snpmfn = oos_snapshot_lookup(v, gw->l1mfn);
2714 ASSERT(mfn_valid(snpmfn));
2715 snpl1p = sh_map_domain_page(snpmfn);
2716 snpl1p += guest_l1_table_offset(gw->va);
2718 #endif /* OOS */
2721 for ( i = 1; i < dist ; i++ )
2723 /* No point in prefetching if there's already a shadow */
2724 if ( ptr_sl1e[i].l1 != 0 )
2725 break;
2727 if ( mfn_valid(gw->l1mfn) )
2729 /* Normal guest page; grab the next guest entry */
2730 gl1e = gl1p[i];
2731 /* Not worth continuing if we hit an entry that will need another
2732 * fault for A/D-bit propagation anyway */
2733 gflags = guest_l1e_get_flags(gl1e);
2734 if ( (gflags & _PAGE_PRESENT)
2735 && (!(gflags & _PAGE_ACCESSED)
2736 || ((gflags & _PAGE_RW) && !(gflags & _PAGE_DIRTY))) )
2737 break;
2739 else
2741 /* Fragmented superpage, unless we've been called wrongly */
2742 ASSERT(guest_l2e_get_flags(gw->l2e) & _PAGE_PSE);
2743 /* Increment the l1e's GFN by the right number of guest pages */
2744 gl1e = guest_l1e_from_gfn(
2745 _gfn(gfn_x(guest_l1e_get_gfn(gw->l1e)) + i),
2746 guest_l1e_get_flags(gw->l1e));
2749 /* Look at the gfn that the l1e is pointing at */
2750 gfn = guest_l1e_get_gfn(gl1e);
2751 gmfn = gfn_to_mfn_query(v->domain, gfn, &p2mt);
2753 /* Propagate the entry. */
2754 l1e_propagate_from_guest(v, gl1e, gmfn, &sl1e, ft_prefetch, p2mt);
2755 (void) shadow_set_l1e(v, ptr_sl1e + i, sl1e, sl1mfn);
2757 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
2758 if ( snpl1p != NULL )
2759 snpl1p[i] = gl1e;
2760 #endif /* OOS */
2762 if ( gl1p != NULL )
2763 sh_unmap_domain_page(gl1p);
2764 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
2765 if ( snpl1p != NULL )
2766 sh_unmap_domain_page(snpl1p);
2767 #endif /* OOS */
2770 #endif /* SHADOW_OPTIMIZATIONS & SHOPT_PREFETCH */
2772 #if GUEST_PAGING_LEVELS == 4
2773 typedef u64 guest_va_t;
2774 typedef u64 guest_pa_t;
2775 #elif GUEST_PAGING_LEVELS == 3
2776 typedef u32 guest_va_t;
2777 typedef u64 guest_pa_t;
2778 #else
2779 typedef u32 guest_va_t;
2780 typedef u32 guest_pa_t;
2781 #endif
2783 static inline void trace_shadow_gen(u32 event, guest_va_t va)
2785 if ( tb_init_done )
2787 event |= (GUEST_PAGING_LEVELS-2)<<8;
2788 __trace_var(event, 0/*!tsc*/, sizeof(va), (unsigned char*)&va);
2792 static inline void trace_shadow_fixup(guest_l1e_t gl1e,
2793 guest_va_t va)
2795 if ( tb_init_done )
2797 struct {
2798 /* for PAE, guest_l1e may be 64 while guest_va may be 32;
2799 so put it first for alignment's sake. */
2800 guest_l1e_t gl1e;
2801 guest_va_t va;
2802 u32 flags;
2803 } __attribute__((packed)) d;
2804 u32 event;
2806 event = TRC_SHADOW_FIXUP | ((GUEST_PAGING_LEVELS-2)<<8);
2808 d.gl1e = gl1e;
2809 d.va = va;
2810 d.flags = this_cpu(trace_shadow_path_flags);
2812 __trace_var(event, 0/*!tsc*/, sizeof(d), (unsigned char*)&d);
2816 static inline void trace_not_shadow_fault(guest_l1e_t gl1e,
2817 guest_va_t va)
2819 if ( tb_init_done )
2821 struct {
2822 /* for PAE, guest_l1e may be 64 while guest_va may be 32;
2823 so put it first for alignment's sake. */
2824 guest_l1e_t gl1e;
2825 guest_va_t va;
2826 u32 flags;
2827 } __attribute__((packed)) d;
2828 u32 event;
2830 event = TRC_SHADOW_NOT_SHADOW | ((GUEST_PAGING_LEVELS-2)<<8);
2832 d.gl1e = gl1e;
2833 d.va = va;
2834 d.flags = this_cpu(trace_shadow_path_flags);
2836 __trace_var(event, 0/*!tsc*/, sizeof(d), (unsigned char*)&d);
2840 static inline void trace_shadow_emulate_other(u32 event,
2841 guest_va_t va,
2842 gfn_t gfn)
2844 if ( tb_init_done )
2846 struct {
2847 /* for PAE, guest_l1e may be 64 while guest_va may be 32;
2848 so put it first for alignment's sake. */
2849 #if GUEST_PAGING_LEVELS == 2
2850 u32 gfn;
2851 #else
2852 u64 gfn;
2853 #endif
2854 guest_va_t va;
2855 } __attribute__((packed)) d;
2857 event |= ((GUEST_PAGING_LEVELS-2)<<8);
2859 d.gfn=gfn_x(gfn);
2860 d.va = va;
2862 __trace_var(event, 0/*!tsc*/, sizeof(d), (unsigned char*)&d);
2866 #if GUEST_PAGING_LEVELS == 3
2867 static DEFINE_PER_CPU(guest_va_t,trace_emulate_initial_va);
2868 static DEFINE_PER_CPU(int,trace_extra_emulation_count);
2869 #endif
2870 static DEFINE_PER_CPU(guest_pa_t,trace_emulate_write_val);
2872 static inline void trace_shadow_emulate(guest_l1e_t gl1e, unsigned long va)
2874 if ( tb_init_done )
2876 struct {
2877 /* for PAE, guest_l1e may be 64 while guest_va may be 32;
2878 so put it first for alignment's sake. */
2879 guest_l1e_t gl1e, write_val;
2880 guest_va_t va;
2881 unsigned flags:29, emulation_count:3;
2882 } __attribute__((packed)) d;
2883 u32 event;
2885 event = TRC_SHADOW_EMULATE | ((GUEST_PAGING_LEVELS-2)<<8);
2887 d.gl1e = gl1e;
2888 d.write_val.l1 = this_cpu(trace_emulate_write_val);
2889 d.va = va;
2890 #if GUEST_PAGING_LEVELS == 3
2891 d.emulation_count = this_cpu(trace_extra_emulation_count);
2892 #endif
2893 d.flags = this_cpu(trace_shadow_path_flags);
2895 __trace_var(event, 0/*!tsc*/, sizeof(d), (unsigned char*)&d);
2899 /**************************************************************************/
2900 /* Entry points into the shadow code */
2902 /* Called from pagefault handler in Xen, and from the HVM trap handlers
2903 * for pagefaults. Returns 1 if this fault was an artefact of the
2904 * shadow code (and the guest should retry) or 0 if it is not (and the
2905 * fault should be handled elsewhere or passed to the guest). */
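/* A sketch of the caller's view of the return value (non-zero returns are
 * EXCRET_* codes, e.g. EXCRET_fault_fixed):
 *
 *     if ( sh_page_fault(v, va, regs) )
 *         // the fault was an artefact of the shadow code: let the guest
 *         // retry the instruction
 *     else
 *         // handle the fault elsewhere or deliver it to the guest
 */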
2907 static int sh_page_fault(struct vcpu *v,
2908 unsigned long va,
2909 struct cpu_user_regs *regs)
2911 struct domain *d = v->domain;
2912 walk_t gw;
2913 gfn_t gfn = _gfn(0);
2914 mfn_t gmfn, sl1mfn = _mfn(0);
2915 shadow_l1e_t sl1e, *ptr_sl1e;
2916 paddr_t gpa;
2917 struct sh_emulate_ctxt emul_ctxt;
2918 struct x86_emulate_ops *emul_ops;
2919 int r;
2920 fetch_type_t ft = 0;
2921 p2m_type_t p2mt;
2922 uint32_t rc;
2923 int version;
2924 #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
2925 int fast_emul = 0;
2926 #endif
2928 SHADOW_PRINTK("d:v=%u:%u va=%#lx err=%u, rip=%lx\n",
2929 v->domain->domain_id, v->vcpu_id, va, regs->error_code,
2930 regs->eip);
2932 perfc_incr(shadow_fault);
2934 #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
2935 /* If the faulting frame was successfully emulated on the last shadow
2936 * fault, it is highly likely that the same emulation action applies
2937 * to this frame. Try to emulate early to avoid lock acquisition.
2938 */
2939 if ( v->arch.paging.last_write_emul_ok
2940 && v->arch.paging.shadow.last_emulated_frame == (va >> PAGE_SHIFT) )
2942 /* Check whether the error code is 3 (a write to a present page);
2943 * otherwise fall back to the normal path in case some validation is required.
2944 */
2945 if ( regs->error_code == (PFEC_write_access | PFEC_page_present) )
2947 fast_emul = 1;
2948 gmfn = _mfn(v->arch.paging.shadow.last_emulated_mfn);
2950 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
2951 /* Fall back to the slow path if we're trying to emulate
2952 writes to an out of sync page. */
2953 if ( mfn_valid(gmfn) && mfn_is_out_of_sync(gmfn) )
2955 fast_emul = 0;
2956 v->arch.paging.last_write_emul_ok = 0;
2957 goto page_fault_slow_path;
2959 #endif /* OOS */
2961 perfc_incr(shadow_fault_fast_emulate);
2962 goto early_emulation;
2964 else
2965 v->arch.paging.last_write_emul_ok = 0;
2967 #endif
2969 //
2970 // XXX: Need to think about eventually mapping superpages directly in the
2971 // shadow (when possible), as opposed to splintering them into a
2972 // bunch of 4K maps.
2973 //
2975 #if (SHADOW_OPTIMIZATIONS & SHOPT_FAST_FAULT_PATH)
2976 if ( (regs->error_code & PFEC_reserved_bit) )
2978 /* The only reasons for reserved bits to be set in shadow entries
2979 * are the two "magic" shadow_l1e entries. */
2980 if ( likely((__copy_from_user(&sl1e,
2981 (sh_linear_l1_table(v)
2982 + shadow_l1_linear_offset(va)),
2983 sizeof(sl1e)) == 0)
2984 && sh_l1e_is_magic(sl1e)) )
2986 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
2987 /* First, need to check that this isn't an out-of-sync
2988 * shadow l1e. If it is, we fall back to the slow path, which
2989 * will sync it up again. */
2991 shadow_l2e_t sl2e;
2992 mfn_t gl1mfn;
2993 if ( (__copy_from_user(&sl2e,
2994 (sh_linear_l2_table(v)
2995 + shadow_l2_linear_offset(va)),
2996 sizeof(sl2e)) != 0)
2997 || !(shadow_l2e_get_flags(sl2e) & _PAGE_PRESENT)
2998 || !mfn_valid(gl1mfn = _mfn(mfn_to_page(
2999 shadow_l2e_get_mfn(sl2e))->v.sh.back))
3000 || unlikely(mfn_is_out_of_sync(gl1mfn)) )
3002 /* Hit the slow path as if there had been no
3003 * shadow entry at all, and let it tidy up */
3004 ASSERT(regs->error_code & PFEC_page_present);
3005 regs->error_code ^= (PFEC_reserved_bit|PFEC_page_present);
3006 goto page_fault_slow_path;
3009 #endif /* SHOPT_OUT_OF_SYNC */
3011 if ( sh_l1e_is_gnp(sl1e) )
3013 /* Not-present in a guest PT: pass to the guest as
3014 * a not-present fault (by flipping two bits). */
3015 ASSERT(regs->error_code & PFEC_page_present);
3016 regs->error_code ^= (PFEC_reserved_bit|PFEC_page_present);
3017 reset_early_unshadow(v);
3018 perfc_incr(shadow_fault_fast_gnp);
3019 SHADOW_PRINTK("fast path not-present\n");
3020 trace_shadow_gen(TRC_SHADOW_FAST_PROPAGATE, va);
3021 return 0;
3023 else
3025 /* Magic MMIO marker: extract gfn for MMIO address */
3026 ASSERT(sh_l1e_is_mmio(sl1e));
3027 gpa = (((paddr_t)(gfn_x(sh_l1e_mmio_get_gfn(sl1e))))
3028 << PAGE_SHIFT)
3029 | (va & ~PAGE_MASK);
3031 perfc_incr(shadow_fault_fast_mmio);
3032 SHADOW_PRINTK("fast path mmio %#"PRIpaddr"\n", gpa);
3033 reset_early_unshadow(v);
3034 trace_shadow_gen(TRC_SHADOW_FAST_MMIO, va);
3035 return (handle_mmio_with_translation(va, gpa >> PAGE_SHIFT)
3036 ? EXCRET_fault_fixed : 0);
3038 else
3040 /* This should be exceptionally rare: another vcpu has fixed
3041 * the tables between the fault and our reading the l1e.
3042 * Retry and let the hardware give us the right fault next time. */
3043 perfc_incr(shadow_fault_fast_fail);
3044 SHADOW_PRINTK("fast path false alarm!\n");
3045 trace_shadow_gen(TRC_SHADOW_FALSE_FAST_PATH, va);
3046 return EXCRET_fault_fixed;
3050 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
3051 page_fault_slow_path:
3052 #endif
3053 #endif /* SHOPT_FAST_FAULT_PATH */
3055 /* Detect if this page fault happened while we were already in Xen
3056 * doing a shadow operation. If that happens, the only thing we can
3057 * do is let Xen's normal fault handlers try to fix it. In any case,
3058 * a diagnostic trace of the fault will be more useful than
3059 * a BUG() when we try to take the lock again. */
3060 if ( unlikely(shadow_locked_by_me(d)) )
3062 SHADOW_ERROR("Recursive shadow fault: lock was taken by %s\n",
3063 d->arch.paging.shadow.locker_function);
3064 return 0;
3067 rewalk:
3069 /* The walk is done in a lock-free style, with some sanity checks
3070 * postponed until after the shadow lock is taken. Those delayed
3071 * checks make sure that no inconsistent mapping is translated into
3072 * the shadow page tables. */
3073 version = atomic_read(&d->arch.paging.shadow.gtable_dirty_version);
3074 rmb();
3075 rc = sh_walk_guest_tables(v, va, &gw, regs->error_code);
3077 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
3078 regs->error_code &= ~PFEC_page_present;
3079 if ( !(rc & _PAGE_PRESENT) )
3080 regs->error_code |= PFEC_page_present;
3081 #endif
3083 if ( rc != 0 )
3085 perfc_incr(shadow_fault_bail_real_fault);
3086 SHADOW_PRINTK("not a shadow fault\n");
3087 reset_early_unshadow(v);
3088 goto propagate;
3091 /* It's possible that the guest has put pagetables in memory that it has
3092 * already used for some special purpose (ioreq pages, or granted pages).
3093 * If that happens we'll have killed the guest already but it's still not
3094 * safe to propagate entries out of the guest PT so get out now. */
3095 if ( unlikely(d->is_shutting_down) )
3097 SHADOW_PRINTK("guest is shutting down\n");
3098 goto propagate;
3101 /* What kind of access are we dealing with? */
3102 ft = ((regs->error_code & PFEC_write_access)
3103 ? ft_demand_write : ft_demand_read);
3105 /* What mfn is the guest trying to access? */
3106 gfn = guest_l1e_get_gfn(gw.l1e);
3107 gmfn = gfn_to_mfn_guest(d, gfn, &p2mt);
3109 if ( shadow_mode_refcounts(d) &&
3110 (!p2m_is_valid(p2mt) || (!p2m_is_mmio(p2mt) && !mfn_valid(gmfn))) )
3112 perfc_incr(shadow_fault_bail_bad_gfn);
3113 SHADOW_PRINTK("BAD gfn=%"SH_PRI_gfn" gmfn=%"PRI_mfn"\n",
3114 gfn_x(gfn), mfn_x(gmfn));
3115 reset_early_unshadow(v);
3116 goto propagate;
3119 #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
3120 /* Remember this successful VA->GFN translation for later. */
3121 vtlb_insert(v, va >> PAGE_SHIFT, gfn_x(gfn),
3122 regs->error_code | PFEC_page_present);
3123 #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */
3125 shadow_lock(d);
3127 TRACE_CLEAR_PATH_FLAGS;
3129 /* Make sure there is enough free shadow memory to build a chain of
3130 * shadow tables. (We never allocate a top-level shadow on this path,
3131 * only a 32b l1, pae l1, or 64b l3+2+1. Note that while
3132 * SH_type_l1_shadow isn't correct in the latter case, all page
3133 * tables are the same size there.)
3135 * Preallocate shadow pages *before* removing writable accesses
3136 * otherwise an OOS L1 might be demoted and promoted again with
3137 * writable mappings. */
3138 shadow_prealloc(d,
3139 SH_type_l1_shadow,
3140 GUEST_PAGING_LEVELS < 4 ? 1 : GUEST_PAGING_LEVELS - 1);
3142 rc = gw_remove_write_accesses(v, va, &gw);
3144 /* First bit set: Removed write access to a page. */
3145 if ( rc & GW_RMWR_FLUSHTLB )
3147 /* Write permission removal is also a hint that other gwalks
3148 * overlapping with this one may be inconsistent
3149 */
3150 perfc_incr(shadow_rm_write_flush_tlb);
3151 atomic_inc(&d->arch.paging.shadow.gtable_dirty_version);
3152 flush_tlb_mask(&d->domain_dirty_cpumask);
3155 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
3156 /* Second bit set: Resynced a page. Re-walk needed. */
3157 if ( rc & GW_RMWR_REWALK )
3159 shadow_unlock(d);
3160 goto rewalk;
3162 #endif /* OOS */
3164 if ( !shadow_check_gwalk(v, va, &gw, version) )
3166 perfc_incr(shadow_inconsistent_gwalk);
3167 shadow_unlock(d);
3168 goto rewalk;
3171 shadow_audit_tables(v);
3172 sh_audit_gw(v, &gw);
3174 /* Acquire the shadow. This must happen before we figure out the rights
3175 * for the shadow entry, since we might promote a page here. */
3176 ptr_sl1e = shadow_get_and_create_l1e(v, &gw, &sl1mfn, ft);
3177 if ( unlikely(ptr_sl1e == NULL) )
3179 /* Couldn't get the sl1e! Since we know the guest entries
3180 * are OK, this can only have been caused by a failed
3181 * shadow_set_l*e(), which will have crashed the guest.
3182 * Get out of the fault handler immediately. */
3183 ASSERT(d->is_shutting_down);
3184 shadow_unlock(d);
3185 trace_shadow_gen(TRC_SHADOW_DOMF_DYING, va);
3186 return 0;
3189 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
3190 /* Always unsync when writing to L1 page tables. */
3191 if ( sh_mfn_is_a_page_table(gmfn)
3192 && ft == ft_demand_write )
3193 sh_unsync(v, gmfn);
3195 if ( unlikely(d->is_shutting_down) )
3197 /* We might end up with a crashed domain here if
3198 * sh_remove_shadows() in a previous sh_resync() call has
3199 * failed. We cannot safely continue since some page is still
3200 * OOS but not in the hash table anymore. */
3201 shadow_unlock(d);
3202 return 0;
3204 #endif /* OOS */
3206 /* Calculate the shadow entry and write it */
3207 l1e_propagate_from_guest(v, gw.l1e, gmfn, &sl1e, ft, p2mt);
3208 r = shadow_set_l1e(v, ptr_sl1e, sl1e, sl1mfn);
3210 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
3211 if ( mfn_valid(gw.l1mfn)
3212 && mfn_is_out_of_sync(gw.l1mfn) )
3214 /* Update the OOS snapshot. */
3215 mfn_t snpmfn = oos_snapshot_lookup(v, gw.l1mfn);
3216 guest_l1e_t *snp;
3218 ASSERT(mfn_valid(snpmfn));
3220 snp = sh_map_domain_page(snpmfn);
3221 snp[guest_l1_table_offset(va)] = gw.l1e;
3222 sh_unmap_domain_page(snp);
3224 #endif /* OOS */
3226 #if SHADOW_OPTIMIZATIONS & SHOPT_PREFETCH
3227 /* Prefetch some more shadow entries */
3228 sh_prefetch(v, &gw, ptr_sl1e, sl1mfn);
3229 #endif
3231 /* Need to emulate accesses to page tables */
3232 if ( sh_mfn_is_a_page_table(gmfn)
3233 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
3234 /* Unless they've been allowed to go out of sync with their
3235 shadows and we don't need to unshadow it. */
3236 && !(mfn_is_out_of_sync(gmfn)
3237 && !(regs->error_code & PFEC_user_mode))
3238 #endif
3241 if ( ft == ft_demand_write )
3243 perfc_incr(shadow_fault_emulate_write);
3244 goto emulate;
3246 else if ( shadow_mode_trap_reads(d) && ft == ft_demand_read )
3248 perfc_incr(shadow_fault_emulate_read);
3249 goto emulate;
3253 /* Need to hand off device-model MMIO to the device model */
3254 if ( p2mt == p2m_mmio_dm )
3256 gpa = guest_walk_to_gpa(&gw);
3257 goto mmio;
3260 /* Ignore attempts to write to read-only memory. */
3261 if ( (p2mt == p2m_ram_ro) && (ft == ft_demand_write) )
3263 static unsigned long lastpage;
3264 if ( xchg(&lastpage, va & PAGE_MASK) != (va & PAGE_MASK) )
3265 gdprintk(XENLOG_DEBUG, "guest attempted write to read-only memory"
3266 " page. va page=%#lx, mfn=%#lx\n",
3267 va & PAGE_MASK, mfn_x(gmfn));
3268 goto emulate_readonly; /* skip over the instruction */
3271 /* In HVM guests, we force CR0.WP always to be set, so that the
3272 * pagetables are always write-protected. If the guest thinks
3273 * CR0.WP is clear, we must emulate faulting supervisor writes to
3274 * allow the guest to write through read-only PTEs. Emulate if the
3275 * fault was a non-user write to a present page. */
3276 if ( is_hvm_domain(d)
3277 && unlikely(!hvm_wp_enabled(v))
3278 && regs->error_code == (PFEC_write_access|PFEC_page_present) )
3280 perfc_incr(shadow_fault_emulate_wp);
3281 goto emulate;
3284 perfc_incr(shadow_fault_fixed);
3285 d->arch.paging.log_dirty.fault_count++;
3286 reset_early_unshadow(v);
3288 trace_shadow_fixup(gw.l1e, va);
3289 done:
3290 sh_audit_gw(v, &gw);
3291 SHADOW_PRINTK("fixed\n");
3292 shadow_audit_tables(v);
3293 shadow_unlock(d);
3294 return EXCRET_fault_fixed;
3296 emulate:
3297 if ( !shadow_mode_refcounts(d) || !guest_mode(regs) )
3298 goto not_a_shadow_fault;
3300 /*
3301 * We do not emulate user writes. Instead we use them as a hint that the
3302 * page is no longer a page table. This behaviour differs from native, but
3303 * it seems very unlikely that any OS grants user access to page tables.
3304 */
3305 if ( (regs->error_code & PFEC_user_mode) )
3307 SHADOW_PRINTK("user-mode fault to PT, unshadowing mfn %#lx\n",
3308 mfn_x(gmfn));
3309 perfc_incr(shadow_fault_emulate_failed);
3310 sh_remove_shadows(v, gmfn, 0 /* thorough */, 1 /* must succeed */);
3311 trace_shadow_emulate_other(TRC_SHADOW_EMULATE_UNSHADOW_USER,
3312 va, gfn);
3313 goto done;
3316 /*
3317 * Write from userspace to ro-mem needs to jump here to avoid getting
3318 * caught by user-mode page-table check above.
3319 */
3320 emulate_readonly:
3321 /*
3322 * We don't need to hold the lock for the whole emulation; we will
3323 * take it again when we write to the pagetables.
3324 */
3325 sh_audit_gw(v, &gw);
3326 shadow_audit_tables(v);
3327 shadow_unlock(d);
3329 this_cpu(trace_emulate_write_val) = 0;
3331 #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
3332 early_emulation:
3333 #endif
3334 if ( is_hvm_domain(d) )
3336 /*
3337 * If we are in the middle of injecting an exception or interrupt then
3338 * we should not emulate: it is not the instruction at %eip that caused
3339 * the fault. Furthermore it is almost certainly the case that the
3340 * handler stack is currently considered to be a page table, so we should
3341 * unshadow the faulting page before exiting.
3342 */
3343 if ( unlikely(hvm_event_pending(v)) )
3345 #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
3346 if ( fast_emul )
3348 perfc_incr(shadow_fault_fast_emulate_fail);
3349 v->arch.paging.last_write_emul_ok = 0;
3351 #endif
3352 gdprintk(XENLOG_DEBUG, "write to pagetable during event "
3353 "injection: cr2=%#lx, mfn=%#lx\n",
3354 va, mfn_x(gmfn));
3355 sh_remove_shadows(v, gmfn, 0 /* thorough */, 1 /* must succeed */);
3356 trace_shadow_emulate_other(TRC_SHADOW_EMULATE_UNSHADOW_EVTINJ,
3357 va, gfn);
3358 return EXCRET_fault_fixed;
3362 SHADOW_PRINTK("emulate: eip=%#lx esp=%#lx\n",
3363 (unsigned long)regs->eip, (unsigned long)regs->esp);
3365 emul_ops = shadow_init_emulation(&emul_ctxt, regs);
3367 r = x86_emulate(&emul_ctxt.ctxt, emul_ops);
3369 /*
3370 * NB. We do not unshadow on X86EMUL_EXCEPTION. It's not clear that it
3371 * would be a good unshadow hint. If we *do* decide to unshadow-on-fault
3372 * then it must be 'failable': we cannot require the unshadow to succeed.
3373 */
3374 if ( r == X86EMUL_UNHANDLEABLE )
3376 perfc_incr(shadow_fault_emulate_failed);
3377 #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
3378 if ( fast_emul )
3380 perfc_incr(shadow_fault_fast_emulate_fail);
3381 v->arch.paging.last_write_emul_ok = 0;
3383 #endif
3384 SHADOW_PRINTK("emulator failure, unshadowing mfn %#lx\n",
3385 mfn_x(gmfn));
3386 /* If this is actually a page table, then we have a bug, and need
3387 * to support more operations in the emulator. More likely,
3388 * though, this is a hint that this page should not be shadowed. */
3389 shadow_remove_all_shadows(v, gmfn);
3391 trace_shadow_emulate_other(TRC_SHADOW_EMULATE_UNSHADOW_UNHANDLED,
3392 va, gfn);
3393 goto emulate_done;
3396 #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
3397 /* Record information about a successful emulation as a heuristic for
3398 * accelerating the next fault on the same frame. But be careful to
3399 * verify that the page is still a page table: an unshadow triggered
3400 * during write emulation normally requires a re-sync with the guest
3401 * page table to recover r/w permission, and recording the heuristic
3402 * in that case would cause unexpected extra shadow faults, because
3403 * propagation would be skipped.
3404 */
3405 if ( (r == X86EMUL_OKAY) && sh_mfn_is_a_page_table(gmfn) )
3407 if ( !fast_emul )
3409 v->arch.paging.shadow.last_emulated_frame = va >> PAGE_SHIFT;
3410 v->arch.paging.shadow.last_emulated_mfn = mfn_x(gmfn);
3411 v->arch.paging.last_write_emul_ok = 1;
3414 else if ( fast_emul )
3415 v->arch.paging.last_write_emul_ok = 0;
3416 #endif
3418 #if GUEST_PAGING_LEVELS == 3 /* PAE guest */
3419 if ( r == X86EMUL_OKAY ) {
3420 int i, emulation_count=0;
3421 this_cpu(trace_emulate_initial_va) = va;
3422 /* Emulate up to four extra instructions in the hope of catching
3423 * the "second half" of a 64-bit pagetable write. */
3424 for ( i = 0 ; i < 4 ; i++ )
3426 shadow_continue_emulation(&emul_ctxt, regs);
3427 v->arch.paging.last_write_was_pt = 0;
3428 r = x86_emulate(&emul_ctxt.ctxt, emul_ops);
3429 if ( r == X86EMUL_OKAY )
3431 emulation_count++;
3432 if ( v->arch.paging.last_write_was_pt )
3434 perfc_incr(shadow_em_ex_pt);
3435 TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_EMULATION_2ND_PT_WRITTEN);
3436 break; /* Don't emulate past the other half of the write */
3438 else
3439 perfc_incr(shadow_em_ex_non_pt);
3441 else
3443 perfc_incr(shadow_em_ex_fail);
3444 TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_EMULATION_LAST_FAILED);
3445 break; /* Don't emulate again if we failed! */
3448 this_cpu(trace_extra_emulation_count)=emulation_count;
3450 #endif /* PAE guest */
3452 trace_shadow_emulate(gw.l1e, va);
3453 emulate_done:
3454 SHADOW_PRINTK("emulated\n");
3455 return EXCRET_fault_fixed;
3457 mmio:
3458 if ( !guest_mode(regs) )
3459 goto not_a_shadow_fault;
3460 perfc_incr(shadow_fault_mmio);
3461 sh_audit_gw(v, &gw);
3462 SHADOW_PRINTK("mmio %#"PRIpaddr"\n", gpa);
3463 shadow_audit_tables(v);
3464 reset_early_unshadow(v);
3465 shadow_unlock(d);
3466 trace_shadow_gen(TRC_SHADOW_MMIO, va);
3467 return (handle_mmio_with_translation(va, gpa >> PAGE_SHIFT)
3468 ? EXCRET_fault_fixed : 0);
3470 not_a_shadow_fault:
3471 sh_audit_gw(v, &gw);
3472 SHADOW_PRINTK("not a shadow fault\n");
3473 shadow_audit_tables(v);
3474 reset_early_unshadow(v);
3475 shadow_unlock(d);
3477 propagate:
3478 trace_not_shadow_fault(gw.l1e, va);
3480 return 0;
3484 static int
3485 sh_invlpg(struct vcpu *v, unsigned long va)
3486 /* Called when the guest requests an invlpg. Returns 1 if the invlpg
3487 * instruction should be issued on the hardware, or 0 if it's safe not
3488 * to do so. */
3490 mfn_t sl1mfn;
3491 shadow_l2e_t sl2e;
3493 perfc_incr(shadow_invlpg);
3495 #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
3496 /* No longer safe to use cached gva->gfn translations */
3497 vtlb_flush(v);
3498 #endif
3500 #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
3501 v->arch.paging.last_write_emul_ok = 0;
3502 #endif
3504 /* First check that we can safely read the shadow l2e. On SMP/PAE Linux,
3505 * as many as 6% of invlpg calls can arrive when we have not yet shadowed
3506 * the l2. */
3507 #if SHADOW_PAGING_LEVELS == 4
3509 shadow_l3e_t sl3e;
3510 if ( !(shadow_l4e_get_flags(
3511 sh_linear_l4_table(v)[shadow_l4_linear_offset(va)])
3512 & _PAGE_PRESENT) )
3513 return 0;
3514 /* This must still be a copy-from-user because we don't have the
3515 * shadow lock, and the higher-level shadows might disappear
3516 * under our feet. */
3517 if ( __copy_from_user(&sl3e, (sh_linear_l3_table(v)
3518 + shadow_l3_linear_offset(va)),
3519 sizeof (sl3e)) != 0 )
3521 perfc_incr(shadow_invlpg_fault);
3522 return 0;
3524 if ( !(shadow_l3e_get_flags(sl3e) & _PAGE_PRESENT) )
3525 return 0;
3527 #else /* SHADOW_PAGING_LEVELS == 3 */
3528 if ( !(l3e_get_flags(v->arch.paging.shadow.l3table[shadow_l3_linear_offset(va)])
3529 & _PAGE_PRESENT) )
3530 // no need to flush anything if there's no SL2...
3531 return 0;
3532 #endif
3534 /* This must still be a copy-from-user because we don't have the shadow
3535 * lock, and the higher-level shadows might disappear under our feet. */
3536 if ( __copy_from_user(&sl2e,
3537 sh_linear_l2_table(v) + shadow_l2_linear_offset(va),
3538 sizeof (sl2e)) != 0 )
3540 perfc_incr(shadow_invlpg_fault);
3541 return 0;
3544 // If there's nothing shadowed for this particular sl2e, then
3545 // there is no need to do an invlpg, either...
3546 //
3547 if ( !(shadow_l2e_get_flags(sl2e) & _PAGE_PRESENT) )
3548 return 0;
3550 // Check to see if the SL2 is a splintered superpage...
3551 // If so, then we'll need to flush the entire TLB (because that's
3552 // easier than invalidating all of the individual 4K pages).
3553 //
3554 sl1mfn = shadow_l2e_get_mfn(sl2e);
3555 if ( mfn_to_page(sl1mfn)->u.sh.type
3556 == SH_type_fl1_shadow )
3558 flush_tlb_local();
3559 return 0;
3562 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
3563 /* Check to see if the SL1 is out of sync. */
3565 mfn_t gl1mfn = _mfn(mfn_to_page(sl1mfn)->v.sh.back);
3566 struct page_info *pg = mfn_to_page(gl1mfn);
3567 if ( mfn_valid(gl1mfn)
3568 && page_is_out_of_sync(pg) )
3570 /* The test above may give false positives, since we don't
3571 * hold the shadow lock yet. Check again with the lock held. */
3572 shadow_lock(v->domain);
3574 /* This must still be a copy-from-user because we didn't
3575 * have the shadow lock last time we checked, and the
3576 * higher-level shadows might have disappeared under our
3577 * feet. */
3578 if ( __copy_from_user(&sl2e,
3579 sh_linear_l2_table(v)
3580 + shadow_l2_linear_offset(va),
3581 sizeof (sl2e)) != 0 )
3583 perfc_incr(shadow_invlpg_fault);
3584 shadow_unlock(v->domain);
3585 return 0;
3588 if ( !(shadow_l2e_get_flags(sl2e) & _PAGE_PRESENT) )
3590 shadow_unlock(v->domain);
3591 return 0;
3594 sl1mfn = shadow_l2e_get_mfn(sl2e);
3595 gl1mfn = _mfn(mfn_to_page(sl1mfn)->v.sh.back);
3596 pg = mfn_to_page(gl1mfn);
3598 if ( likely(sh_mfn_is_a_page_table(gl1mfn)
3599 && page_is_out_of_sync(pg) ) )
3601 shadow_l1e_t *sl1;
3602 sl1 = sh_linear_l1_table(v) + shadow_l1_linear_offset(va);
3603 /* Remove the shadow entry that maps this VA */
3604 (void) shadow_set_l1e(v, sl1, shadow_l1e_empty(), sl1mfn);
3606 shadow_unlock(v->domain);
3607 /* Need the invlpg, to pick up the disappearance of the sl1e */
3608 return 1;
3611 #endif
3613 return 1;
3617 static unsigned long
3618 sh_gva_to_gfn(struct vcpu *v, unsigned long va, uint32_t *pfec)
3619 /* Called to translate a guest virtual address to what the *guest*
3620 * pagetables would map it to. */
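/* A rough sketch of how the pfec argument is used: the caller passes in
 * the access type being checked and gets back the #PF error code to
 * report if the walk fails (compare emulate_gva_to_mfn() further down).
 * The call site below is hypothetical: */
#if 0 /* illustrative sketch only, never compiled */
    uint32_t pfec = PFEC_page_present | PFEC_write_access;
    unsigned long gfn = sh_gva_to_gfn(v, va, &pfec);
    if ( gfn == INVALID_GFN )
        propagate_page_fault(va, pfec);  /* pfec now describes the fault */
#endif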
3622 walk_t gw;
3623 gfn_t gfn;
3625 #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
3626 /* Check the vTLB cache first */
3627 unsigned long vtlb_gfn = vtlb_lookup(v, va, pfec[0]);
3628 if ( VALID_GFN(vtlb_gfn) )
3629 return vtlb_gfn;
3630 #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */
3632 if ( sh_walk_guest_tables(v, va, &gw, pfec[0]) != 0 )
3634 if ( !(guest_l1e_get_flags(gw.l1e) & _PAGE_PRESENT) )
3635 pfec[0] &= ~PFEC_page_present;
3636 return INVALID_GFN;
3638 gfn = guest_walk_to_gfn(&gw);
3640 #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
3641 /* Remember this successful VA->GFN translation for later. */
3642 vtlb_insert(v, va >> PAGE_SHIFT, gfn_x(gfn), pfec[0]);
3643 #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */
3645 return gfn_x(gfn);
3649 static inline void
3650 sh_update_linear_entries(struct vcpu *v)
3651 /* Sync up all the linear mappings for this vcpu's pagetables */
3653 struct domain *d = v->domain;
3655 /* Linear pagetables in PV guests
3656 * ------------------------------
3658 * Guest linear pagetables, which map the guest pages, are at
3659 * LINEAR_PT_VIRT_START. Shadow linear pagetables, which map the
3660 * shadows, are at SH_LINEAR_PT_VIRT_START. Most of the time these
3661 * are set up at shadow creation time, but (of course!) the PAE case
3662 * is subtler. Normal linear mappings are made by having an entry
3663 * in the top-level table that points to itself (shadow linear) or
3664 * to the guest top-level table (guest linear). For PAE, to set up
3665 * a linear map requires us to copy the four top-level entries into
3666 * level-2 entries. That means that every time we change a PAE l3e,
3667 * we need to reflect the change into the copy.
3669 * Linear pagetables in HVM guests
3670 * -------------------------------
3672 * For HVM guests, the linear pagetables are installed in the monitor
3673 * tables (since we can't put them in the shadow). Shadow linear
3674 * pagetables, which map the shadows, are at SH_LINEAR_PT_VIRT_START,
3675 * and we use the linear pagetable slot at LINEAR_PT_VIRT_START for
3676 * a linear pagetable of the monitor tables themselves. We have
3677 * the same issue of having to re-copy PAE l3 entries whenever we use
3678 * PAE shadows.
3680 * Because HVM guests run on the same monitor tables regardless of the
3681 * shadow tables in use, the linear mapping of the shadow tables has to
3682 * be updated every time v->arch.shadow_table changes.
3683 */
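/* The "linear map" trick in the 4-level case, in one line: slot S of a
 * top-level table points at a top-level table (itself for the shadow
 * linear map, the guest's for the guest linear map), so addresses built
 * from S resolve pagetable pages rather than data pages. A minimal
 * sketch, with the entry and mfn names purely illustrative: */
#if 0 /* illustrative sketch only, never compiled */
    sl4e[shadow_l4_table_offset(SH_LINEAR_PT_VIRT_START)] =
        shadow_l4e_from_mfn(sl4mfn, __PAGE_HYPERVISOR);   /* self-reference */
#endif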
3685 /* Don't try to update the monitor table if it doesn't exist */
3686 if ( shadow_mode_external(d)
3687 && pagetable_get_pfn(v->arch.monitor_table) == 0 )
3688 return;
3690 #if (CONFIG_PAGING_LEVELS == 4) && (SHADOW_PAGING_LEVELS == 4)
3692 /* For PV, one l4e points at the guest l4, one points at the shadow
3693 * l4. No maintenance required.
3694 * For HVM, just need to update the l4e that points to the shadow l4. */
3696 if ( shadow_mode_external(d) )
3698 /* Use the linear map if we can; otherwise make a new mapping */
3699 if ( v == current )
3701 __linear_l4_table[l4_linear_offset(SH_LINEAR_PT_VIRT_START)] =
3702 l4e_from_pfn(pagetable_get_pfn(v->arch.shadow_table[0]),
3703 __PAGE_HYPERVISOR);
3705 else
3707 l4_pgentry_t *ml4e;
3708 ml4e = sh_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
3709 ml4e[l4_table_offset(SH_LINEAR_PT_VIRT_START)] =
3710 l4e_from_pfn(pagetable_get_pfn(v->arch.shadow_table[0]),
3711 __PAGE_HYPERVISOR);
3712 sh_unmap_domain_page(ml4e);
3716 #elif (CONFIG_PAGING_LEVELS == 4) && (SHADOW_PAGING_LEVELS == 3)
3718 /* PV: XXX
3720 * HVM: To give ourselves a linear map of the shadows, we need to
3721 * extend a PAE shadow to 4 levels. We do this by having a monitor
3722 * l3 in slot 0 of the monitor l4 table, and copying the PAE l3
3723 * entries into it. Then, by having the monitor l4e for shadow
3724 * pagetables also point to the monitor l4, we can use it to access
3725 * the shadows.
3726 */
3728 if ( shadow_mode_external(d) )
3730 /* Install copies of the shadow l3es into the monitor l2 table
3731 * that maps SH_LINEAR_PT_VIRT_START. */
3732 shadow_l3e_t *sl3e;
3733 l2_pgentry_t *ml2e;
3734 int i;
3736 /* Use linear mappings if we can; otherwise make new mappings */
3737 if ( v == current )
3738 ml2e = __linear_l2_table
3739 + l2_linear_offset(SH_LINEAR_PT_VIRT_START);
3740 else
3742 mfn_t l3mfn, l2mfn;
3743 l4_pgentry_t *ml4e;
3744 l3_pgentry_t *ml3e;
3745 int linear_slot = shadow_l4_table_offset(SH_LINEAR_PT_VIRT_START);
3746 ml4e = sh_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
3748 ASSERT(l4e_get_flags(ml4e[linear_slot]) & _PAGE_PRESENT);
3749 l3mfn = _mfn(l4e_get_pfn(ml4e[linear_slot]));
3750 ml3e = sh_map_domain_page(l3mfn);
3751 sh_unmap_domain_page(ml4e);
3753 ASSERT(l3e_get_flags(ml3e[0]) & _PAGE_PRESENT);
3754 l2mfn = _mfn(l3e_get_pfn(ml3e[0]));
3755 ml2e = sh_map_domain_page(l2mfn);
3756 sh_unmap_domain_page(ml3e);
3759 /* Shadow l3 tables are made up by sh_update_cr3 */
3760 sl3e = v->arch.paging.shadow.l3table;
3762 for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
3764 ml2e[i] =
3765 (shadow_l3e_get_flags(sl3e[i]) & _PAGE_PRESENT)
3766 ? l2e_from_pfn(mfn_x(shadow_l3e_get_mfn(sl3e[i])),
3767 __PAGE_HYPERVISOR)
3768 : l2e_empty();
3771 if ( v != current )
3772 sh_unmap_domain_page(ml2e);
3774 else
3775 domain_crash(d); /* XXX */
3777 #elif CONFIG_PAGING_LEVELS == 3
3779 /* PV: need to copy the guest's l3 entries into the guest-linear-map l2
3780 * entries in the shadow, and the shadow's l3 entries into the
3781 * shadow-linear-map l2 entries in the shadow. This is safe to do
3782 * because Xen does not let guests share high-slot l2 tables between l3s,
3783 * so we know we're not treading on anyone's toes.
3785 * HVM: need to copy the shadow's l3 entries into the
3786 * shadow-linear-map l2 entries in the monitor table. This is safe
3787 * because we have one monitor table for each vcpu. The monitor's
3788 * own l3es don't need to be copied because they never change.
3789 * XXX That might change if we start stuffing things into the rest
3790 * of the monitor's virtual address space.
3791 */
3793 l2_pgentry_t *l2e, new_l2e;
3794 shadow_l3e_t *guest_l3e = NULL, *shadow_l3e;
3795 int i;
3796 int unmap_l2e = 0;
3798 #if GUEST_PAGING_LEVELS == 2
3800 /* Shadow l3 tables were built by sh_update_cr3 */
3801 BUG_ON(!shadow_mode_external(d)); /* PV 2-on-3 is unsupported */
3802 shadow_l3e = (shadow_l3e_t *)&v->arch.paging.shadow.l3table;
3804 #else /* GUEST_PAGING_LEVELS == 3 */
3806 shadow_l3e = (shadow_l3e_t *)&v->arch.paging.shadow.l3table;
3807 guest_l3e = (guest_l3e_t *)&v->arch.paging.shadow.gl3e;
3809 #endif /* GUEST_PAGING_LEVELS */
3811 /* Choose where to write the entries, using linear maps if possible */
3812 if ( shadow_mode_external(d) )
3814 if ( v == current )
3816 /* From the monitor tables, it's safe to use linear maps
3817 * to update monitor l2s */
3818 l2e = __linear_l2_table + (3 * L2_PAGETABLE_ENTRIES);
3820 else
3822 /* Map the monitor table's high l2 */
3823 l3_pgentry_t *l3e;
3824 l3e = sh_map_domain_page(
3825 pagetable_get_mfn(v->arch.monitor_table));
3826 ASSERT(l3e_get_flags(l3e[3]) & _PAGE_PRESENT);
3827 l2e = sh_map_domain_page(_mfn(l3e_get_pfn(l3e[3])));
3828 unmap_l2e = 1;
3829 sh_unmap_domain_page(l3e);
3832 else
3834 /* Map the shadow table's high l2 */
3835 ASSERT(shadow_l3e_get_flags(shadow_l3e[3]) & _PAGE_PRESENT);
3836 l2e = sh_map_domain_page(shadow_l3e_get_mfn(shadow_l3e[3]));
3837 unmap_l2e = 1;
3840 /* Write linear mapping of guest (only in PV, and only when
3841 * not translated). */
3842 if ( !shadow_mode_translate(d) )
3844 for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
3846 new_l2e =
3847 ((shadow_l3e_get_flags(guest_l3e[i]) & _PAGE_PRESENT)
3848 ? l2e_from_pfn(mfn_x(shadow_l3e_get_mfn(guest_l3e[i])),
3849 __PAGE_HYPERVISOR)
3850 : l2e_empty());
3851 safe_write_entry(
3852 &l2e[l2_table_offset(LINEAR_PT_VIRT_START) + i],
3853 &new_l2e);
3857 /* Write linear mapping of shadow. */
3858 for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
3860 new_l2e = (shadow_l3e_get_flags(shadow_l3e[i]) & _PAGE_PRESENT)
3861 ? l2e_from_pfn(mfn_x(shadow_l3e_get_mfn(shadow_l3e[i])),
3862 __PAGE_HYPERVISOR)
3863 : l2e_empty();
3864 safe_write_entry(
3865 &l2e[l2_table_offset(SH_LINEAR_PT_VIRT_START) + i],
3866 &new_l2e);
3869 if ( unmap_l2e )
3870 sh_unmap_domain_page(l2e);
3873 #else
3874 #error this should not happen
3875 #endif
3877 if ( shadow_mode_external(d) )
3879 /*
3880 * Having modified the linear pagetable mapping, flush local host TLBs.
3881 * This was not needed when vmenter/vmexit always had the side effect
3882 * of flushing host TLBs but, with ASIDs, it is possible to finish
3883 * this CR3 update, vmenter the guest, vmexit due to a page fault,
3884 * without an intervening host TLB flush. Then the page fault code
3885 * could use the linear pagetable to read a top-level shadow page
3886 * table entry. But, without this change, it would fetch the wrong
3887 * value due to a stale TLB.
3888 */
3889 flush_tlb_local();
3894 /* Removes vcpu->arch.paging.shadow.guest_vtable and vcpu->arch.shadow_table[].
3895 * Does all appropriate management/bookkeeping/refcounting/etc...
3896 */
3897 static void
3898 sh_detach_old_tables(struct vcpu *v)
3900 mfn_t smfn;
3901 int i = 0;
3903 ////
3904 //// vcpu->arch.paging.shadow.guest_vtable
3905 ////
3907 #if GUEST_PAGING_LEVELS == 3
3908 /* PAE guests don't have a mapping of the guest top-level table */
3909 ASSERT(v->arch.paging.shadow.guest_vtable == NULL);
3910 #else
3911 if ( v->arch.paging.shadow.guest_vtable )
3913 struct domain *d = v->domain;
3914 if ( shadow_mode_external(d) || shadow_mode_translate(d) )
3915 sh_unmap_domain_page_global(v->arch.paging.shadow.guest_vtable);
3916 v->arch.paging.shadow.guest_vtable = NULL;
3918 #endif /* GUEST_PAGING_LEVELS == 3 */
3921 ////
3922 //// vcpu->arch.shadow_table[]
3923 ////
3925 #if GUEST_PAGING_LEVELS == 3
3926 /* PAE guests have four shadow_table entries */
3927 for ( i = 0 ; i < 4 ; i++ )
3928 #endif
3930 smfn = pagetable_get_mfn(v->arch.shadow_table[i]);
3931 if ( mfn_x(smfn) )
3932 sh_put_ref(v, smfn, 0);
3933 v->arch.shadow_table[i] = pagetable_null();
3937 /* Set up the top-level shadow and install it in slot 'slot' of shadow_table */
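/* (For reference, sh_update_cr3() below installs one toplevel shadow in
 * slot 0 for 2- and 4-level guests, and one per guest l3e -- slots 0-3 --
 * for PAE guests.) A condensed sketch of those call patterns: */
#if 0 /* illustrative sketch only, never compiled */
    sh_set_toplevel_shadow(v, 0, gmfn, SH_type_l4_shadow);       /* 4-on-4 */
    for ( i = 0; i < 4; i++ )                                    /* PAE    */
        sh_set_toplevel_shadow(v, i, gl2mfn, (i == 3) ? SH_type_l2h_shadow
                                                      : SH_type_l2_shadow);
#endif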
3938 static void
3939 sh_set_toplevel_shadow(struct vcpu *v,
3940 int slot,
3941 mfn_t gmfn,
3942 unsigned int root_type)
3944 mfn_t smfn;
3945 pagetable_t old_entry, new_entry;
3947 struct domain *d = v->domain;
3949 /* Remember the old contents of this slot */
3950 old_entry = v->arch.shadow_table[slot];
3952 /* Now figure out the new contents: is this a valid guest MFN? */
3953 if ( !mfn_valid(gmfn) )
3955 new_entry = pagetable_null();
3956 goto install_new_entry;
3959 /* Guest mfn is valid: shadow it and install the shadow */
3960 smfn = get_shadow_status(v, gmfn, root_type);
3961 if ( !mfn_valid(smfn) )
3963 /* Make sure there's enough free shadow memory. */
3964 shadow_prealloc(d, root_type, 1);
3965 /* Shadow the page. */
3966 smfn = sh_make_shadow(v, gmfn, root_type);
3968 ASSERT(mfn_valid(smfn));
3970 /* Pin the shadow and put it (back) on the list of pinned shadows */
3971 if ( sh_pin(v, smfn) == 0 )
3973 SHADOW_ERROR("can't pin %#lx as toplevel shadow\n", mfn_x(smfn));
3974 domain_crash(v->domain);
3977 /* Take a ref to this page: it will be released in sh_detach_old_tables()
3978 * or the next call to set_toplevel_shadow() */
3979 if ( !sh_get_ref(v, smfn, 0) )
3981 SHADOW_ERROR("can't install %#lx as toplevel shadow\n", mfn_x(smfn));
3982 domain_crash(v->domain);
3985 new_entry = pagetable_from_mfn(smfn);
3987 install_new_entry:
3988 /* Done. Install it */
3989 SHADOW_PRINTK("%u/%u [%u] gmfn %#"PRI_mfn" smfn %#"PRI_mfn"\n",
3990 GUEST_PAGING_LEVELS, SHADOW_PAGING_LEVELS, slot,
3991 mfn_x(gmfn), mfn_x(pagetable_get_mfn(new_entry)));
3992 v->arch.shadow_table[slot] = new_entry;
3994 /* Decrement the refcount of the old contents of this slot */
3995 if ( !pagetable_is_null(old_entry) ) {
3996 mfn_t old_smfn = pagetable_get_mfn(old_entry);
3997 /* Need to repin the old toplevel shadow if it's been unpinned
3998 * by shadow_prealloc(): in PV mode we're still running on this
3999 * shadow and it's not safe to free it yet. */
4000 if ( !mfn_to_page(old_smfn)->u.sh.pinned && !sh_pin(v, old_smfn) )
4002 SHADOW_ERROR("can't re-pin %#lx\n", mfn_x(old_smfn));
4003 domain_crash(v->domain);
4005 sh_put_ref(v, old_smfn, 0);
4010 static void
4011 sh_update_cr3(struct vcpu *v, int do_locking)
4012 /* Updates vcpu->arch.cr3 after the guest has changed CR3.
4013 * Paravirtual guests should set v->arch.guest_table (and guest_table_user,
4014 * if appropriate).
4015 * HVM guests should also make sure hvm_get_guest_cntl_reg(v, 3) works;
4016 * this function will call hvm_update_guest_cr(v, 3) to tell them where the
4017 * shadow tables are.
4018 * If do_locking != 0, assume we are being called from outside the
4019 * shadow code, and must take and release the shadow lock; otherwise
4020 * that is the caller's responsibility.
4021 */
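/* The do_locking contract above, in sketch form (hypothetical call sites): */
#if 0 /* illustrative sketch only, never compiled */
    /* From outside the shadow code: let this function take the lock. */
    sh_update_cr3(v, 1);

    /* From inside the shadow code, with the shadow lock already held: */
    ASSERT(shadow_locked_by_me(v->domain));
    sh_update_cr3(v, 0);
#endif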
4023 struct domain *d = v->domain;
4024 mfn_t gmfn;
4025 #if GUEST_PAGING_LEVELS == 3
4026 guest_l3e_t *gl3e;
4027 u32 guest_idx=0;
4028 int i;
4029 #endif
4031 /* Don't do anything on an uninitialised vcpu */
4032 if ( !is_hvm_domain(d) && !v->is_initialised )
4034 ASSERT(v->arch.cr3 == 0);
4035 return;
4038 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
4039 /* Need to resync all the shadow entries on a TLB flush. Resync
4040 * current vcpu's OOS pages before switching to the new shadow
4041 * tables so that the VA hint is still valid. */
4042 shadow_resync_current_vcpu(v, do_locking);
4043 #endif
4045 if ( do_locking ) shadow_lock(v->domain);
4047 ASSERT(shadow_locked_by_me(v->domain));
4048 ASSERT(v->arch.paging.mode);
4050 ////
4051 //// vcpu->arch.guest_table is already set
4052 ////
4054 #ifndef NDEBUG
4055 /* Double-check that the HVM code has sent us a sane guest_table */
4056 if ( is_hvm_domain(d) )
4058 ASSERT(shadow_mode_external(d));
4059 if ( hvm_paging_enabled(v) )
4060 ASSERT(pagetable_get_pfn(v->arch.guest_table));
4061 else
4062 ASSERT(v->arch.guest_table.pfn
4063 == d->arch.paging.shadow.unpaged_pagetable.pfn);
4065 #endif
4067 SHADOW_PRINTK("d=%u v=%u guest_table=%05lx\n",
4068 d->domain_id, v->vcpu_id,
4069 (unsigned long)pagetable_get_pfn(v->arch.guest_table));
4071 #if GUEST_PAGING_LEVELS == 4
4072 if ( !(v->arch.flags & TF_kernel_mode) && !is_pv_32on64_vcpu(v) )
4073 gmfn = pagetable_get_mfn(v->arch.guest_table_user);
4074 else
4075 #endif
4076 gmfn = pagetable_get_mfn(v->arch.guest_table);
4079 ////
4080 //// vcpu->arch.paging.shadow.guest_vtable
4081 ////
4082 #if GUEST_PAGING_LEVELS == 4
4083 if ( shadow_mode_external(d) || shadow_mode_translate(d) )
4085 if ( v->arch.paging.shadow.guest_vtable )
4086 sh_unmap_domain_page_global(v->arch.paging.shadow.guest_vtable);
4087 v->arch.paging.shadow.guest_vtable = sh_map_domain_page_global(gmfn);
4088 /* PAGING_LEVELS==4 implies 64-bit, which means that
4089 * map_domain_page_global can't fail */
4090 BUG_ON(v->arch.paging.shadow.guest_vtable == NULL);
4092 else
4093 v->arch.paging.shadow.guest_vtable = __linear_l4_table;
4094 #elif GUEST_PAGING_LEVELS == 3
4095 /* On PAE guests we don't use a mapping of the guest's own top-level
4096 * table. We cache the current state of that table and shadow that,
4097 * until the next CR3 write makes us refresh our cache. */
4098 ASSERT(v->arch.paging.shadow.guest_vtable == NULL);
4100 if ( shadow_mode_external(d) )
4101 /* Find where in the page the l3 table is */
4102 guest_idx = guest_index((void *)v->arch.hvm_vcpu.guest_cr[3]);
4103 else
4104 /* PV guest: l3 is at the start of a page */
4105 guest_idx = 0;
4107 // Ignore the low 2 bits of guest_idx -- they are really just
4108 // cache control.
4109 guest_idx &= ~3;
4111 gl3e = ((guest_l3e_t *)sh_map_domain_page(gmfn)) + guest_idx;
4112 for ( i = 0; i < 4 ; i++ )
4113 v->arch.paging.shadow.gl3e[i] = gl3e[i];
4114 sh_unmap_domain_page(gl3e);
4115 #elif GUEST_PAGING_LEVELS == 2
4116 if ( shadow_mode_external(d) || shadow_mode_translate(d) )
4118 if ( v->arch.paging.shadow.guest_vtable )
4119 sh_unmap_domain_page_global(v->arch.paging.shadow.guest_vtable);
4120 v->arch.paging.shadow.guest_vtable = sh_map_domain_page_global(gmfn);
4121 /* Does this really need map_domain_page_global? Handle the
4122 * error properly if so. */
4123 BUG_ON(v->arch.paging.shadow.guest_vtable == NULL); /* XXX */
4125 else
4126 v->arch.paging.shadow.guest_vtable = __linear_l2_table;
4127 #else
4128 #error this should never happen
4129 #endif
4132 ////
4133 //// vcpu->arch.shadow_table[]
4134 ////
4136 /* We revoke write access to the new guest toplevel page(s) before we
4137 * replace the old shadow pagetable(s), so that we can safely use the
4138 * (old) shadow linear maps in the writeable mapping heuristics. */
4139 #if GUEST_PAGING_LEVELS == 2
4140 if ( sh_remove_write_access(v, gmfn, 2, 0) != 0 )
4141 flush_tlb_mask(&v->domain->domain_dirty_cpumask);
4142 sh_set_toplevel_shadow(v, 0, gmfn, SH_type_l2_shadow);
4143 #elif GUEST_PAGING_LEVELS == 3
4144 /* PAE guests have four shadow_table entries, based on the
4145 * current values of the guest's four l3es. */
4147 int flush = 0;
4148 gfn_t gl2gfn;
4149 mfn_t gl2mfn;
4150 p2m_type_t p2mt;
4151 guest_l3e_t *gl3e = (guest_l3e_t*)&v->arch.paging.shadow.gl3e;
4152 /* First, make all four entries read-only. */
4153 for ( i = 0; i < 4; i++ )
4155 if ( guest_l3e_get_flags(gl3e[i]) & _PAGE_PRESENT )
4157 gl2gfn = guest_l3e_get_gfn(gl3e[i]);
4158 gl2mfn = gfn_to_mfn_query(d, gl2gfn, &p2mt);
4159 if ( p2m_is_ram(p2mt) )
4160 flush |= sh_remove_write_access(v, gl2mfn, 2, 0);
4163 if ( flush )
4164 flush_tlb_mask(&v->domain->domain_dirty_cpumask);
4165 /* Now install the new shadows. */
4166 for ( i = 0; i < 4; i++ )
4168 if ( guest_l3e_get_flags(gl3e[i]) & _PAGE_PRESENT )
4170 gl2gfn = guest_l3e_get_gfn(gl3e[i]);
4171 gl2mfn = gfn_to_mfn_query(d, gl2gfn, &p2mt);
4172 if ( p2m_is_ram(p2mt) )
4173 sh_set_toplevel_shadow(v, i, gl2mfn, (i == 3)
4174 ? SH_type_l2h_shadow
4175 : SH_type_l2_shadow);
4176 else
4177 sh_set_toplevel_shadow(v, i, _mfn(INVALID_MFN), 0);
4179 else
4180 sh_set_toplevel_shadow(v, i, _mfn(INVALID_MFN), 0);
4183 #elif GUEST_PAGING_LEVELS == 4
4184 if ( sh_remove_write_access(v, gmfn, 4, 0) != 0 )
4185 flush_tlb_mask(&v->domain->domain_dirty_cpumask);
4186 sh_set_toplevel_shadow(v, 0, gmfn, SH_type_l4_shadow);
4187 #else
4188 #error This should never happen
4189 #endif
4192 ///
4193 /// v->arch.paging.shadow.l3table
4194 ///
4195 #if SHADOW_PAGING_LEVELS == 3
4197 mfn_t smfn;
4198 int i;
4199 for ( i = 0; i < 4; i++ )
4201 #if GUEST_PAGING_LEVELS == 2
4202 /* 2-on-3: make a PAE l3 that points at the four-page l2 */
4203 smfn = _mfn(pagetable_get_pfn(v->arch.shadow_table[0]) + i);
4204 #else
4205 /* 3-on-3: make a PAE l3 that points at the four l2 pages */
4206 smfn = pagetable_get_mfn(v->arch.shadow_table[i]);
4207 #endif
4208 v->arch.paging.shadow.l3table[i] =
4209 (mfn_x(smfn) == 0)
4210 ? shadow_l3e_empty()
4211 : shadow_l3e_from_mfn(smfn, _PAGE_PRESENT);
4214 #endif /* SHADOW_PAGING_LEVELS == 3 */
4217 ///
4218 /// v->arch.cr3
4219 ///
4220 if ( shadow_mode_external(d) )
4222 make_cr3(v, pagetable_get_pfn(v->arch.monitor_table));
4224 else // not shadow_mode_external...
4226 /* We don't support PV except guest == shadow == config levels */
4227 BUG_ON(GUEST_PAGING_LEVELS != SHADOW_PAGING_LEVELS);
4228 #if SHADOW_PAGING_LEVELS == 3
4229 /* 2-on-3 or 3-on-3: Use the PAE shadow l3 table we just fabricated.
4230 * Don't use make_cr3 because (a) we know it's below 4GB, and
4231 * (b) it's not necessarily page-aligned, and make_cr3 takes a pfn */
4232 ASSERT(virt_to_maddr(&v->arch.paging.shadow.l3table) <= 0xffffffe0ULL);
4233 v->arch.cr3 = virt_to_maddr(&v->arch.paging.shadow.l3table);
4234 #else
4235 /* 4-on-4: Just use the shadow top-level directly */
4236 make_cr3(v, pagetable_get_pfn(v->arch.shadow_table[0]));
4237 #endif
4241 ///
4242 /// v->arch.hvm_vcpu.hw_cr[3]
4243 ///
4244 if ( shadow_mode_external(d) )
4246 ASSERT(is_hvm_domain(d));
4247 #if SHADOW_PAGING_LEVELS == 3
4248 /* 2-on-3 or 3-on-3: Use the PAE shadow l3 table we just fabricated */
4249 v->arch.hvm_vcpu.hw_cr[3] =
4250 virt_to_maddr(&v->arch.paging.shadow.l3table);
4251 #else
4252 /* 4-on-4: Just use the shadow top-level directly */
4253 v->arch.hvm_vcpu.hw_cr[3] =
4254 pagetable_get_paddr(v->arch.shadow_table[0]);
4255 #endif
4256 hvm_update_guest_cr(v, 3);
4259 /* Fix up the linear pagetable mappings */
4260 sh_update_linear_entries(v);
4262 #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
4263 /* No longer safe to use cached gva->gfn translations */
4264 vtlb_flush(v);
4265 #endif
4267 #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
4268 v->arch.paging.last_write_emul_ok = 0;
4269 #endif
4271 /* Release the lock, if we took it (otherwise it's the caller's problem) */
4272 if ( do_locking ) shadow_unlock(v->domain);
4274 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
4275 /* Need to resync all the shadow entries on a TLB flush. We only
4276 * update the shadows, leaving the pages out of sync. Also, we try
4277 * to skip synchronization of shadows not mapped in the new
4278 * tables. */
4279 shadow_sync_other_vcpus(v, do_locking);
4280 #endif
4285 /**************************************************************************/
4286 /* Functions to revoke guest rights */
4288 #if SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC
4289 int sh_rm_write_access_from_sl1p(struct vcpu *v, mfn_t gmfn,
4290 mfn_t smfn, unsigned long off)
4292 int r;
4293 shadow_l1e_t *sl1p, sl1e;
4294 struct page_info *sp;
4296 ASSERT(mfn_valid(gmfn));
4297 ASSERT(mfn_valid(smfn));
4299 sp = mfn_to_page(smfn);
4301 if ( ((sp->count_info & PGC_count_mask) != 0)
4302 || (sp->u.sh.type != SH_type_l1_shadow
4303 && sp->u.sh.type != SH_type_fl1_shadow) )
4304 goto fail;
4306 sl1p = sh_map_domain_page(smfn);
4307 sl1p += off;
4308 sl1e = *sl1p;
4309 if ( ((shadow_l1e_get_flags(sl1e) & (_PAGE_PRESENT|_PAGE_RW))
4310 != (_PAGE_PRESENT|_PAGE_RW))
4311 || (mfn_x(shadow_l1e_get_mfn(sl1e)) != mfn_x(gmfn)) )
4313 sh_unmap_domain_page(sl1p);
4314 goto fail;
4317 /* Found it! Need to remove its write permissions. */
4318 sl1e = shadow_l1e_remove_flags(sl1e, _PAGE_RW);
4319 r = shadow_set_l1e(v, sl1p, sl1e, smfn);
4320 ASSERT( !(r & SHADOW_SET_ERROR) );
4322 sh_unmap_domain_page(sl1p);
4323 perfc_incr(shadow_writeable_h_7);
4324 return 1;
4326 fail:
4327 perfc_incr(shadow_writeable_h_8);
4328 return 0;
4330 #endif /* OOS */
4332 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
4333 static int sh_guess_wrmap(struct vcpu *v, unsigned long vaddr, mfn_t gmfn)
4334 /* Look up this vaddr in the current shadow and see if it's a writeable
4335 * mapping of this gmfn. If so, remove it. Returns 1 if it worked. */
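/* This is the SHOPT_WRITABLE_HEURISTIC fast path: the common shadow code
 * calls it through the paging-mode table (see sh_paging_mode at the end
 * of this file) with a guessed vaddr before falling back to a brute-force
 * search of all shadows. A rough sketch of that dispatch, with guess_va
 * hypothetical: */
#if 0 /* illustrative sketch only, never compiled */
    if ( v->arch.paging.mode->shadow.guess_wrmap(v, guess_va, gmfn) )
        return;   /* guessed right: no brute-force search needed */
#endif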
4337 shadow_l1e_t sl1e, *sl1p;
4338 shadow_l2e_t *sl2p;
4339 shadow_l3e_t *sl3p;
4340 #if SHADOW_PAGING_LEVELS >= 4
4341 shadow_l4e_t *sl4p;
4342 #endif
4343 mfn_t sl1mfn;
4344 int r;
4346 /* Carefully look in the shadow linear map for the l1e we expect */
4347 #if SHADOW_PAGING_LEVELS >= 4
4348 sl4p = sh_linear_l4_table(v) + shadow_l4_linear_offset(vaddr);
4349 if ( !(shadow_l4e_get_flags(*sl4p) & _PAGE_PRESENT) )
4350 return 0;
4351 sl3p = sh_linear_l3_table(v) + shadow_l3_linear_offset(vaddr);
4352 if ( !(shadow_l3e_get_flags(*sl3p) & _PAGE_PRESENT) )
4353 return 0;
4354 #else /* SHADOW_PAGING_LEVELS == 3 */
4355 sl3p = ((shadow_l3e_t *) v->arch.paging.shadow.l3table)
4356 + shadow_l3_linear_offset(vaddr);
4357 if ( !(shadow_l3e_get_flags(*sl3p) & _PAGE_PRESENT) )
4358 return 0;
4359 #endif
4360 sl2p = sh_linear_l2_table(v) + shadow_l2_linear_offset(vaddr);
4361 if ( !(shadow_l2e_get_flags(*sl2p) & _PAGE_PRESENT) )
4362 return 0;
4363 sl1p = sh_linear_l1_table(v) + shadow_l1_linear_offset(vaddr);
4364 sl1e = *sl1p;
4365 if ( ((shadow_l1e_get_flags(sl1e) & (_PAGE_PRESENT|_PAGE_RW))
4366 != (_PAGE_PRESENT|_PAGE_RW))
4367 || (mfn_x(shadow_l1e_get_mfn(sl1e)) != mfn_x(gmfn)) )
4368 return 0;
4370 /* Found it! Need to remove its write permissions. */
4371 sl1mfn = shadow_l2e_get_mfn(*sl2p);
4372 sl1e = shadow_l1e_remove_flags(sl1e, _PAGE_RW);
4373 r = shadow_set_l1e(v, sl1p, sl1e, sl1mfn);
4374 ASSERT( !(r & SHADOW_SET_ERROR) );
4375 TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_WRMAP_GUESS_FOUND);
4376 return 1;
4378 #endif
4380 int sh_rm_write_access_from_l1(struct vcpu *v, mfn_t sl1mfn,
4381 mfn_t readonly_mfn)
4382 /* Excises all writeable mappings to readonly_mfn from this l1 shadow table */
4384 shadow_l1e_t *sl1e;
4385 int done = 0;
4386 int flags;
4387 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
4388 mfn_t base_sl1mfn = sl1mfn; /* Because sl1mfn changes in the foreach */
4389 #endif
4391 SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, done,
4393 flags = shadow_l1e_get_flags(*sl1e);
4394 if ( (flags & _PAGE_PRESENT)
4395 && (flags & _PAGE_RW)
4396 && (mfn_x(shadow_l1e_get_mfn(*sl1e)) == mfn_x(readonly_mfn)) )
4398 shadow_l1e_t ro_sl1e = shadow_l1e_remove_flags(*sl1e, _PAGE_RW);
4399 (void) shadow_set_l1e(v, sl1e, ro_sl1e, sl1mfn);
4400 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
4401 /* Remember the last shadow that we shot a writeable mapping in */
4402 v->arch.paging.shadow.last_writeable_pte_smfn = mfn_x(base_sl1mfn);
4403 #endif
4404 if ( (mfn_to_page(readonly_mfn)->u.inuse.type_info
4405 & PGT_count_mask) == 0 )
4406 /* This breaks us cleanly out of the FOREACH macro */
4407 done = 1;
4409 });
4410 return done;
4414 int sh_rm_mappings_from_l1(struct vcpu *v, mfn_t sl1mfn, mfn_t target_mfn)
4415 /* Excises all mappings to guest frame from this shadow l1 table */
4417 shadow_l1e_t *sl1e;
4418 int done = 0;
4419 int flags;
4421 SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, done,
4423 flags = shadow_l1e_get_flags(*sl1e);
4424 if ( (flags & _PAGE_PRESENT)
4425 && (mfn_x(shadow_l1e_get_mfn(*sl1e)) == mfn_x(target_mfn)) )
4427 (void) shadow_set_l1e(v, sl1e, shadow_l1e_empty(), sl1mfn);
4428 if ( (mfn_to_page(target_mfn)->count_info & PGC_count_mask) == 0 )
4429 /* This breaks us cleanly out of the FOREACH macro */
4430 done = 1;
4432 });
4433 return done;
4436 /**************************************************************************/
4437 /* Functions to excise all pointers to shadows from higher-level shadows. */
4439 void sh_clear_shadow_entry(struct vcpu *v, void *ep, mfn_t smfn)
4440 /* Blank out a single shadow entry */
4442 switch ( mfn_to_page(smfn)->u.sh.type )
4444 case SH_type_l1_shadow:
4445 (void) shadow_set_l1e(v, ep, shadow_l1e_empty(), smfn); break;
4446 case SH_type_l2_shadow:
4447 #if GUEST_PAGING_LEVELS >= 3
4448 case SH_type_l2h_shadow:
4449 #endif
4450 (void) shadow_set_l2e(v, ep, shadow_l2e_empty(), smfn); break;
4451 #if GUEST_PAGING_LEVELS >= 4
4452 case SH_type_l3_shadow:
4453 (void) shadow_set_l3e(v, ep, shadow_l3e_empty(), smfn); break;
4454 case SH_type_l4_shadow:
4455 (void) shadow_set_l4e(v, ep, shadow_l4e_empty(), smfn); break;
4456 #endif
4457 default: BUG(); /* Called with the wrong kind of shadow. */
4461 int sh_remove_l1_shadow(struct vcpu *v, mfn_t sl2mfn, mfn_t sl1mfn)
4462 /* Remove all mappings of this l1 shadow from this l2 shadow */
4464 shadow_l2e_t *sl2e;
4465 int done = 0;
4466 int flags;
4468 SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, done, v->domain,
4470 flags = shadow_l2e_get_flags(*sl2e);
4471 if ( (flags & _PAGE_PRESENT)
4472 && (mfn_x(shadow_l2e_get_mfn(*sl2e)) == mfn_x(sl1mfn)) )
4474 (void) shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn);
4475 if ( mfn_to_page(sl1mfn)->u.sh.type == 0 )
4476 /* This breaks us cleanly out of the FOREACH macro */
4477 done = 1;
4479 });
4480 return done;
4483 #if GUEST_PAGING_LEVELS >= 4
4484 int sh_remove_l2_shadow(struct vcpu *v, mfn_t sl3mfn, mfn_t sl2mfn)
4485 /* Remove all mappings of this l2 shadow from this l3 shadow */
4487 shadow_l3e_t *sl3e;
4488 int done = 0;
4489 int flags;
4491 SHADOW_FOREACH_L3E(sl3mfn, sl3e, 0, done,
4493 flags = shadow_l3e_get_flags(*sl3e);
4494 if ( (flags & _PAGE_PRESENT)
4495 && (mfn_x(shadow_l3e_get_mfn(*sl3e)) == mfn_x(sl2mfn)) )
4497 (void) shadow_set_l3e(v, sl3e, shadow_l3e_empty(), sl3mfn);
4498 if ( mfn_to_page(sl2mfn)->u.sh.type == 0 )
4499 /* This breaks us cleanly out of the FOREACH macro */
4500 done = 1;
4502 });
4503 return done;
4506 int sh_remove_l3_shadow(struct vcpu *v, mfn_t sl4mfn, mfn_t sl3mfn)
4507 /* Remove all mappings of this l3 shadow from this l4 shadow */
4509 shadow_l4e_t *sl4e;
4510 int done = 0;
4511 int flags;
4513 SHADOW_FOREACH_L4E(sl4mfn, sl4e, 0, done, v->domain,
4515 flags = shadow_l4e_get_flags(*sl4e);
4516 if ( (flags & _PAGE_PRESENT)
4517 && (mfn_x(shadow_l4e_get_mfn(*sl4e)) == mfn_x(sl3mfn)) )
4519 (void) shadow_set_l4e(v, sl4e, shadow_l4e_empty(), sl4mfn);
4520 if ( mfn_to_page(sl3mfn)->u.sh.type == 0 )
4521 /* This breaks us cleanly out of the FOREACH macro */
4522 done = 1;
4524 });
4525 return done;
4527 #endif /* 64bit guest */
4529 /**************************************************************************/
4530 /* Handling HVM guest writes to pagetables */
4532 /* Translate a VA to an MFN, injecting a page-fault if we fail */
4533 #define BAD_GVA_TO_GFN (~0UL)
4534 #define BAD_GFN_TO_MFN (~1UL)
4535 #define READONLY_GFN (~2UL)
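/* These sentinels are deliberately invalid MFN values, so callers can tell
 * the three failure modes apart with plain comparisons. Roughly how
 * emulate_map_dest() below decodes them: */
#if 0 /* illustrative sketch only, never compiled */
    mfn_t mfn = emulate_gva_to_mfn(v, vaddr, sh_ctxt);
    if ( !mfn_valid(mfn) )
        return (mfn_x(mfn) == BAD_GVA_TO_GFN) ? MAPPING_EXCEPTION
             : (mfn_x(mfn) == READONLY_GFN)   ? MAPPING_SILENT_FAIL
             :                                  MAPPING_UNHANDLEABLE;
#endif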
4536 static mfn_t emulate_gva_to_mfn(struct vcpu *v,
4537 unsigned long vaddr,
4538 struct sh_emulate_ctxt *sh_ctxt)
4540 unsigned long gfn;
4541 mfn_t mfn;
4542 p2m_type_t p2mt;
4543 uint32_t pfec = PFEC_page_present | PFEC_write_access;
4545 /* Translate the VA to a GFN */
4546 gfn = sh_gva_to_gfn(v, vaddr, &pfec);
4547 if ( gfn == INVALID_GFN )
4549 if ( is_hvm_vcpu(v) )
4550 hvm_inject_exception(TRAP_page_fault, pfec, vaddr);
4551 else
4552 propagate_page_fault(vaddr, pfec);
4553 return _mfn(BAD_GVA_TO_GFN);
4556 /* Translate the GFN to an MFN */
4557 /* PoD: query only if shadow lock is held (to avoid deadlock) */
4558 if ( shadow_locked_by_me(v->domain) )
4559 mfn = gfn_to_mfn_query(v->domain, _gfn(gfn), &p2mt);
4560 else
4561 mfn = gfn_to_mfn(v->domain, _gfn(gfn), &p2mt);
4563 if ( p2mt == p2m_ram_ro )
4564 return _mfn(READONLY_GFN);
4565 if ( !p2m_is_ram(p2mt) )
4566 return _mfn(BAD_GFN_TO_MFN);
4568 ASSERT(mfn_valid(mfn));
4569 v->arch.paging.last_write_was_pt = !!sh_mfn_is_a_page_table(mfn);
4570 return mfn;
4573 /* Check that the user is allowed to perform this write.
4574 * Returns a mapped pointer to write to, or NULL for error. */
4575 #define MAPPING_UNHANDLEABLE ((void *)(unsigned long)X86EMUL_UNHANDLEABLE)
4576 #define MAPPING_EXCEPTION ((void *)(unsigned long)X86EMUL_EXCEPTION)
4577 #define MAPPING_SILENT_FAIL ((void *)(unsigned long)X86EMUL_OKAY)
4578 #define emulate_map_dest_failed(rc) ((unsigned long)(rc) <= 3)
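/* The MAPPING_* values above are just X86EMUL_* return codes cast to
 * pointers (all small integers, hence the "<= 3" test), while a successful
 * map returns a real virtual address. That lets the emulation hooks below
 * hand a failure straight back to the emulator: */
#if 0 /* illustrative sketch only, never compiled */
    void *addr = emulate_map_dest(v, vaddr, bytes, sh_ctxt);
    if ( emulate_map_dest_failed(addr) )
        return (long)addr;        /* pass the X86EMUL_* code back */
#endif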
4579 static void *emulate_map_dest(struct vcpu *v,
4580 unsigned long vaddr,
4581 u32 bytes,
4582 struct sh_emulate_ctxt *sh_ctxt)
4584 unsigned long offset;
4585 void *map = NULL;
4587 sh_ctxt->mfn1 = emulate_gva_to_mfn(v, vaddr, sh_ctxt);
4588 if ( !mfn_valid(sh_ctxt->mfn1) )
4589 return ((mfn_x(sh_ctxt->mfn1) == BAD_GVA_TO_GFN) ?
4590 MAPPING_EXCEPTION :
4591 (mfn_x(sh_ctxt->mfn1) == READONLY_GFN) ?
4592 MAPPING_SILENT_FAIL : MAPPING_UNHANDLEABLE);
4594 #ifndef NDEBUG
4595 /* We don't emulate user-mode writes to page tables */
4596 if ( hvm_get_seg_reg(x86_seg_ss, sh_ctxt)->attr.fields.dpl == 3 )
4598 gdprintk(XENLOG_DEBUG, "User-mode write to pagetable reached "
4599 "emulate_map_dest(). This should never happen!\n");
4600 return MAPPING_UNHANDLEABLE;
4602 #endif
4604 /* An unaligned write probably means this isn't a pagetable */
4605 if ( vaddr & (bytes - 1) )
4606 sh_remove_shadows(v, sh_ctxt->mfn1, 0, 0 /* Slow, can fail */ );
4608 if ( likely(((vaddr + bytes - 1) & PAGE_MASK) == (vaddr & PAGE_MASK)) )
4610 /* Whole write fits on a single page */
4611 sh_ctxt->mfn2 = _mfn(INVALID_MFN);
4612 map = sh_map_domain_page(sh_ctxt->mfn1) + (vaddr & ~PAGE_MASK);
4614 else
4616 /* Cross-page emulated writes are only supported for HVM guests;
4617 * PV guests ought to know better */
4618 if ( !is_hvm_vcpu(v) )
4619 return MAPPING_UNHANDLEABLE;
4621 /* This write crosses a page boundary. Translate the second page */
4622 sh_ctxt->mfn2 = emulate_gva_to_mfn(v, (vaddr + bytes - 1) & PAGE_MASK,
4623 sh_ctxt);
4624 if ( !mfn_valid(sh_ctxt->mfn2) )
4625 return ((mfn_x(sh_ctxt->mfn2) == BAD_GVA_TO_GFN) ?
4626 MAPPING_EXCEPTION :
4627 (mfn_x(sh_ctxt->mfn2) == READONLY_GFN) ?
4628 MAPPING_SILENT_FAIL : MAPPING_UNHANDLEABLE);
4630 /* A cross-page write probably means this isn't a pagetable */
4631 sh_remove_shadows(v, sh_ctxt->mfn2, 0, 0 /* Slow, can fail */ );
4633 /* Hack: we map the pages into the vcpu's LDT space, since we
4634 * know that we're not going to need the LDT for HVM guests,
4635 * and only HVM guests are allowed unaligned writes. */
4636 ASSERT(is_hvm_vcpu(v));
4637 map = (void *)LDT_VIRT_START(v);
4638 offset = l1_linear_offset((unsigned long) map);
4639 l1e_write(&__linear_l1_table[offset],
4640 l1e_from_pfn(mfn_x(sh_ctxt->mfn1), __PAGE_HYPERVISOR));
4641 l1e_write(&__linear_l1_table[offset + 1],
4642 l1e_from_pfn(mfn_x(sh_ctxt->mfn2), __PAGE_HYPERVISOR));
4643 flush_tlb_local();
4644 map += (vaddr & ~PAGE_MASK);
4647 #if (SHADOW_OPTIMIZATIONS & SHOPT_SKIP_VERIFY)
4648 /* Remember if the bottom bit was clear, so we can choose not to run
4649 * the change through the verify code if it's still clear afterwards */
4650 sh_ctxt->low_bit_was_clear = map != NULL && !(*(u8 *)map & _PAGE_PRESENT);
4651 #endif
4653 return map;
4656 /* Tidy up after the emulated write: mark pages dirty, verify the new
4657 * contents, and undo the mapping */
4658 static void emulate_unmap_dest(struct vcpu *v,
4659 void *addr,
4660 u32 bytes,
4661 struct sh_emulate_ctxt *sh_ctxt)
4663 u32 b1 = bytes, b2 = 0, shflags;
4665 ASSERT(mfn_valid(sh_ctxt->mfn1));
4667 /* If we are writing lots of PTE-aligned zeros, might want to unshadow */
4668 if ( likely(bytes >= 4)
4669 && (*(u32 *)addr == 0)
4670 && ((unsigned long) addr & ((sizeof (guest_intpte_t)) - 1)) == 0 )
4671 check_for_early_unshadow(v, sh_ctxt->mfn1);
4672 else
4673 reset_early_unshadow(v);
4675 /* We can avoid re-verifying the page contents after the write if:
4676 * - it was no larger than the PTE type of this pagetable;
4677 * - it was aligned to the PTE boundaries; and
4678 * - _PAGE_PRESENT was clear before and after the write. */
4679 shflags = mfn_to_page(sh_ctxt->mfn1)->shadow_flags;
4680 #if (SHADOW_OPTIMIZATIONS & SHOPT_SKIP_VERIFY)
4681 if ( sh_ctxt->low_bit_was_clear
4682 && !(*(u8 *)addr & _PAGE_PRESENT)
4683 && ((!(shflags & SHF_32)
4684 /* Not shadowed 32-bit: aligned 64-bit writes that leave
4685 * the present bit unset are safe to ignore. */
4686 && ((unsigned long)addr & 7) == 0
4687 && bytes <= 8)
4688 ||
4689 (!(shflags & (SHF_PAE|SHF_64))
4690 /* Not shadowed PAE/64-bit: aligned 32-bit writes that
4691 * leave the present bit unset are safe to ignore. */
4692 && ((unsigned long)addr & 3) == 0
4693 && bytes <= 4)) )
4695 /* Writes with this alignment constraint can't possibly cross pages */
4696 ASSERT(!mfn_valid(sh_ctxt->mfn2));
4698 else
4699 #endif /* SHADOW_OPTIMIZATIONS & SHOPT_SKIP_VERIFY */
4701 if ( unlikely(mfn_valid(sh_ctxt->mfn2)) )
4703 /* Validate as two writes, one to each page */
4704 b1 = PAGE_SIZE - (((unsigned long)addr) & ~PAGE_MASK);
4705 b2 = bytes - b1;
4706 ASSERT(b2 < bytes);
4708 if ( likely(b1 > 0) )
4709 sh_validate_guest_pt_write(v, sh_ctxt->mfn1, addr, b1);
4710 if ( unlikely(b2 > 0) )
4711 sh_validate_guest_pt_write(v, sh_ctxt->mfn2, addr + b1, b2);
4714 paging_mark_dirty(v->domain, mfn_x(sh_ctxt->mfn1));
4716 if ( unlikely(mfn_valid(sh_ctxt->mfn2)) )
4718 unsigned long offset;
4719 paging_mark_dirty(v->domain, mfn_x(sh_ctxt->mfn2));
4720 /* Undo the hacky two-frame contiguous map. */
4721 ASSERT(((unsigned long) addr & PAGE_MASK) == LDT_VIRT_START(v));
4722 offset = l1_linear_offset((unsigned long) addr);
4723 l1e_write(&__linear_l1_table[offset], l1e_empty());
4724 l1e_write(&__linear_l1_table[offset + 1], l1e_empty());
4725 flush_tlb_all();
4727 else
4728 sh_unmap_domain_page(addr);
4730 atomic_inc(&v->domain->arch.paging.shadow.gtable_dirty_version);
4733 static int
4734 sh_x86_emulate_write(struct vcpu *v, unsigned long vaddr, void *src,
4735 u32 bytes, struct sh_emulate_ctxt *sh_ctxt)
4737 void *addr;
4739 /* Unaligned writes are only acceptable on HVM */
4740 if ( (vaddr & (bytes - 1)) && !is_hvm_vcpu(v) )
4741 return X86EMUL_UNHANDLEABLE;
4743 addr = emulate_map_dest(v, vaddr, bytes, sh_ctxt);
4744 if ( emulate_map_dest_failed(addr) )
4745 return (long)addr;
4747 shadow_lock(v->domain);
4748 memcpy(addr, src, bytes);
4750 if ( tb_init_done )
4752 #if GUEST_PAGING_LEVELS == 3
4753 if ( vaddr == this_cpu(trace_emulate_initial_va) )
4754 memcpy(&this_cpu(trace_emulate_write_val), src, bytes);
4755 else if ( (vaddr & ~(0x7UL)) == this_cpu(trace_emulate_initial_va) )
4757 TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_EMULATE_FULL_PT);
4758 memcpy(&this_cpu(trace_emulate_write_val),
4759 (void *)(((unsigned long) addr) & ~(0x7UL)), GUEST_PTE_SIZE);
4761 #else
4762 memcpy(&this_cpu(trace_emulate_write_val), src, bytes);
4763 #endif
4766 emulate_unmap_dest(v, addr, bytes, sh_ctxt);
4767 shadow_audit_tables(v);
4768 shadow_unlock(v->domain);
4769 return X86EMUL_OKAY;
4772 static int
4773 sh_x86_emulate_cmpxchg(struct vcpu *v, unsigned long vaddr,
4774 unsigned long old, unsigned long new,
4775 unsigned int bytes, struct sh_emulate_ctxt *sh_ctxt)
4777 void *addr;
4778 unsigned long prev;
4779 int rv = X86EMUL_OKAY;
4781 /* Unaligned writes are only acceptable on HVM */
4782 if ( (vaddr & (bytes - 1)) && !is_hvm_vcpu(v) )
4783 return X86EMUL_UNHANDLEABLE;
4785 addr = emulate_map_dest(v, vaddr, bytes, sh_ctxt);
4786 if ( emulate_map_dest_failed(addr) )
4787 return (long)addr;
4789 shadow_lock(v->domain);
4790 switch ( bytes )
4792 case 1: prev = cmpxchg(((u8 *)addr), old, new); break;
4793 case 2: prev = cmpxchg(((u16 *)addr), old, new); break;
4794 case 4: prev = cmpxchg(((u32 *)addr), old, new); break;
4795 case 8: prev = cmpxchg(((u64 *)addr), old, new); break;
4796 default:
4797 SHADOW_PRINTK("cmpxchg of size %i is not supported\n", bytes);
4798 prev = ~old;
4801 if ( prev != old )
4802 rv = X86EMUL_CMPXCHG_FAILED;
4804 SHADOW_DEBUG(EMULATE, "va %#lx was %#lx expected %#lx"
4805 " wanted %#lx now %#lx bytes %u\n",
4806 vaddr, prev, old, new, *(unsigned long *)addr, bytes);
4808 emulate_unmap_dest(v, addr, bytes, sh_ctxt);
4809 shadow_audit_tables(v);
4810 shadow_unlock(v->domain);
4811 return rv;
4814 #ifdef __i386__
4815 static int
4816 sh_x86_emulate_cmpxchg8b(struct vcpu *v, unsigned long vaddr,
4817 unsigned long old_lo, unsigned long old_hi,
4818 unsigned long new_lo, unsigned long new_hi,
4819 struct sh_emulate_ctxt *sh_ctxt)
4821 void *addr;
4822 u64 old, new, prev;
4823 int rv = X86EMUL_OKAY;
4825 /* Unaligned writes are only acceptable on HVM */
4826 if ( (vaddr & 7) && !is_hvm_vcpu(v) )
4827 return X86EMUL_UNHANDLEABLE;
4829 addr = emulate_map_dest(v, vaddr, 8, sh_ctxt);
4830 if ( emulate_map_dest_failed(addr) )
4831 return (long)addr;
4833 old = (((u64) old_hi) << 32) | (u64) old_lo;
4834 new = (((u64) new_hi) << 32) | (u64) new_lo;
4836 shadow_lock(v->domain);
4837 prev = cmpxchg(((u64 *)addr), old, new);
4839 if ( prev != old )
4840 rv = X86EMUL_CMPXCHG_FAILED;
4842 emulate_unmap_dest(v, addr, 8, sh_ctxt);
4843 shadow_audit_tables(v);
4844 shadow_unlock(v->domain);
4845 return rv;
4847 #endif
4849 /**************************************************************************/
4850 /* Audit tools */
4852 #if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES
4854 #define AUDIT_FAIL(_level, _fmt, _a...) do { \
4855 printk("Shadow %u-on-%u audit failed at level %i, index %i\n" \
4856 "gl" #_level "mfn = %" PRI_mfn \
4857 " sl" #_level "mfn = %" PRI_mfn \
4858 " &gl" #_level "e = %p &sl" #_level "e = %p" \
4859 " gl" #_level "e = %" SH_PRI_gpte \
4860 " sl" #_level "e = %" SH_PRI_pte "\nError: " _fmt "\n", \
4861 GUEST_PAGING_LEVELS, SHADOW_PAGING_LEVELS, \
4862 _level, guest_index(gl ## _level ## e), \
4863 mfn_x(gl ## _level ## mfn), mfn_x(sl ## _level ## mfn), \
4864 gl ## _level ## e, sl ## _level ## e, \
4865 gl ## _level ## e->l ## _level, sl ## _level ## e->l ## _level, \
4866 ##_a); \
4867 BUG(); \
4868 done = 1; \
4869 } while (0)
4871 #define AUDIT_FAIL_MIN(_level, _fmt, _a...) do { \
4872 printk("Shadow %u-on-%u audit failed at level %i\n" \
4873 "gl" #_level "mfn = %" PRI_mfn \
4874 " sl" #_level "mfn = %" PRI_mfn \
4875 " Error: " _fmt "\n", \
4876 GUEST_PAGING_LEVELS, SHADOW_PAGING_LEVELS, \
4877 _level, \
4878 mfn_x(gl ## _level ## mfn), mfn_x(sl ## _level ## mfn), \
4879 ##_a); \
4880 BUG(); \
4881 done = 1; \
4882 } while (0)
4884 static char * sh_audit_flags(struct vcpu *v, int level,
4885 int gflags, int sflags)
4886 /* Common code for auditing flag bits */
4888 if ( (sflags & _PAGE_PRESENT) && !(gflags & _PAGE_PRESENT) )
4889 return "shadow is present but guest is not present";
4890 if ( (sflags & _PAGE_GLOBAL) && !is_hvm_vcpu(v) )
4891 return "global bit set in PV shadow";
4892 if ( level == 2 && (sflags & _PAGE_PSE) )
4893 return "PS bit set in shadow";
4894 #if SHADOW_PAGING_LEVELS == 3
4895 if ( level == 3 ) return NULL; /* All the other bits are blank in PAEl3 */
4896 #endif
4897 if ( (sflags & _PAGE_PRESENT) && !(gflags & _PAGE_ACCESSED) )
4898 return "accessed bit not propagated";
4899 if ( (level == 1 || (level == 2 && (gflags & _PAGE_PSE)))
4900 && ((sflags & _PAGE_RW) && !(gflags & _PAGE_DIRTY)) )
4901 return "dirty bit not propagated";
4902 if ( (sflags & _PAGE_USER) != (gflags & _PAGE_USER) )
4903 return "user/supervisor bit does not match";
4904 if ( (sflags & _PAGE_NX_BIT) != (gflags & _PAGE_NX_BIT) )
4905 return "NX bit does not match";
4906 if ( (sflags & _PAGE_RW) && !(gflags & _PAGE_RW) )
4907 return "shadow grants write access but guest does not";
4908 return NULL;
4911 int sh_audit_l1_table(struct vcpu *v, mfn_t sl1mfn, mfn_t x)
4913 guest_l1e_t *gl1e, *gp;
4914 shadow_l1e_t *sl1e;
4915 mfn_t mfn, gmfn, gl1mfn;
4916 gfn_t gfn;
4917 p2m_type_t p2mt;
4918 char *s;
4919 int done = 0;
4921 /* Follow the backpointer */
4922 gl1mfn = _mfn(mfn_to_page(sl1mfn)->v.sh.back);
4924 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
4925 /* Out-of-sync l1 shadows can contain anything: just check the OOS hash */
4926 if ( page_is_out_of_sync(mfn_to_page(gl1mfn)) )
4928 oos_audit_hash_is_present(v->domain, gl1mfn);
4929 return 0;
4931 #endif
4933 gl1e = gp = sh_map_domain_page(gl1mfn);
4934 SHADOW_FOREACH_L1E(sl1mfn, sl1e, &gl1e, done, {
4936 if ( sh_l1e_is_magic(*sl1e) )
4938 #if (SHADOW_OPTIMIZATIONS & SHOPT_FAST_FAULT_PATH)
4939 if ( sh_l1e_is_gnp(*sl1e) )
4941 if ( guest_l1e_get_flags(*gl1e) & _PAGE_PRESENT )
4942 AUDIT_FAIL(1, "shadow is GNP magic but guest is present");
4944 else
4946 ASSERT(sh_l1e_is_mmio(*sl1e));
4947 gfn = sh_l1e_mmio_get_gfn(*sl1e);
4948 if ( gfn_x(gfn) != gfn_x(guest_l1e_get_gfn(*gl1e)) )
4949 AUDIT_FAIL(1, "shadow MMIO gfn is %" SH_PRI_gfn
4950 " but guest gfn is %" SH_PRI_gfn,
4951 gfn_x(gfn),
4952 gfn_x(guest_l1e_get_gfn(*gl1e)));
4954 #endif
4956 else
4958 s = sh_audit_flags(v, 1, guest_l1e_get_flags(*gl1e),
4959 shadow_l1e_get_flags(*sl1e));
4960 if ( s ) AUDIT_FAIL(1, "%s", s);
4962 if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS )
4964 gfn = guest_l1e_get_gfn(*gl1e);
4965 mfn = shadow_l1e_get_mfn(*sl1e);
4966 gmfn = gfn_to_mfn_query(v->domain, gfn, &p2mt);
4967 if ( mfn_x(gmfn) != mfn_x(mfn) )
4968 AUDIT_FAIL(1, "bad translation: gfn %" SH_PRI_gfn
4969 " --> %" PRI_mfn " != mfn %" PRI_mfn,
4970 gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn));
4973 });
4974 sh_unmap_domain_page(gp);
4975 return done;
4978 int sh_audit_fl1_table(struct vcpu *v, mfn_t sl1mfn, mfn_t x)
4980 guest_l1e_t *gl1e, e;
4981 shadow_l1e_t *sl1e;
4982 mfn_t gl1mfn = _mfn(INVALID_MFN);
4983 int f;
4984 int done = 0;
4986 /* fl1 has no useful backpointer: all we can check are flags */
4987 e = guest_l1e_from_gfn(_gfn(0), 0); gl1e = &e; /* Needed for macro */
4988 SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, done, {
4989 f = shadow_l1e_get_flags(*sl1e);
4990 f &= ~(_PAGE_AVAIL0|_PAGE_AVAIL1|_PAGE_AVAIL2);
4991 if ( !(f == 0
4992 || f == (_PAGE_PRESENT|_PAGE_USER|_PAGE_RW|
4993 _PAGE_ACCESSED)
4994 || f == (_PAGE_PRESENT|_PAGE_USER|_PAGE_ACCESSED)
4995 || f == (_PAGE_PRESENT|_PAGE_USER|_PAGE_RW|
4996 _PAGE_ACCESSED|_PAGE_DIRTY)
4997 || f == (_PAGE_PRESENT|_PAGE_USER|_PAGE_ACCESSED|_PAGE_DIRTY)
4998 || sh_l1e_is_magic(*sl1e)) )
4999 AUDIT_FAIL(1, "fl1e has bad flags");
5000 });
5001 return 0;
5004 int sh_audit_l2_table(struct vcpu *v, mfn_t sl2mfn, mfn_t x)
5006 guest_l2e_t *gl2e, *gp;
5007 shadow_l2e_t *sl2e;
5008 mfn_t mfn, gmfn, gl2mfn;
5009 gfn_t gfn;
5010 p2m_type_t p2mt;
5011 char *s;
5012 int done = 0;
5014 /* Follow the backpointer */
5015 gl2mfn = _mfn(mfn_to_page(sl2mfn)->v.sh.back);
5017 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
5018 /* Only L1's may be out of sync. */
5019 if ( page_is_out_of_sync(mfn_to_page(gl2mfn)) )
5020 AUDIT_FAIL_MIN(2, "gmfn %lx is out of sync", mfn_x(gl2mfn));
5021 #endif
5023 gl2e = gp = sh_map_domain_page(gl2mfn);
5024 SHADOW_FOREACH_L2E(sl2mfn, sl2e, &gl2e, done, v->domain, {
5026 s = sh_audit_flags(v, 2, guest_l2e_get_flags(*gl2e),
5027 shadow_l2e_get_flags(*sl2e));
5028 if ( s ) AUDIT_FAIL(2, "%s", s);
5030 if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS )
5032 gfn = guest_l2e_get_gfn(*gl2e);
5033 mfn = shadow_l2e_get_mfn(*sl2e);
5034 gmfn = (guest_l2e_get_flags(*gl2e) & _PAGE_PSE)
5035 ? get_fl1_shadow_status(v, gfn)
5036 : get_shadow_status(v, gfn_to_mfn_query(v->domain, gfn, &p2mt),
5037 SH_type_l1_shadow);
5038 if ( mfn_x(gmfn) != mfn_x(mfn) )
5039 AUDIT_FAIL(2, "bad translation: gfn %" SH_PRI_gfn
5040 " (--> %" PRI_mfn ")"
5041 " --> %" PRI_mfn " != mfn %" PRI_mfn,
5042 gfn_x(gfn),
5043 (guest_l2e_get_flags(*gl2e) & _PAGE_PSE) ? 0
5044 : mfn_x(gfn_to_mfn_query(v->domain, gfn, &p2mt)),
5045 mfn_x(gmfn), mfn_x(mfn));
5047 });
5048 sh_unmap_domain_page(gp);
5049 return 0;
5052 #if GUEST_PAGING_LEVELS >= 4
5053 int sh_audit_l3_table(struct vcpu *v, mfn_t sl3mfn, mfn_t x)
5055 guest_l3e_t *gl3e, *gp;
5056 shadow_l3e_t *sl3e;
5057 mfn_t mfn, gmfn, gl3mfn;
5058 gfn_t gfn;
5059 p2m_type_t p2mt;
5060 char *s;
5061 int done = 0;
5063 /* Follow the backpointer */
5064 gl3mfn = _mfn(mfn_to_page(sl3mfn)->v.sh.back);
5066 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
5067 /* Only L1's may be out of sync. */
5068 if ( page_is_out_of_sync(mfn_to_page(gl3mfn)) )
5069 AUDIT_FAIL_MIN(3, "gmfn %lx is out of sync", mfn_x(gl3mfn));
5070 #endif
5072 gl3e = gp = sh_map_domain_page(gl3mfn);
5073 SHADOW_FOREACH_L3E(sl3mfn, sl3e, &gl3e, done, {
5075 s = sh_audit_flags(v, 3, guest_l3e_get_flags(*gl3e),
5076 shadow_l3e_get_flags(*sl3e));
5077 if ( s ) AUDIT_FAIL(3, "%s", s);
5079 if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS )
5081 gfn = guest_l3e_get_gfn(*gl3e);
5082 mfn = shadow_l3e_get_mfn(*sl3e);
5083 gmfn = get_shadow_status(v, gfn_to_mfn_query(v->domain, gfn, &p2mt),
5084 ((GUEST_PAGING_LEVELS == 3 ||
5085 is_pv_32on64_vcpu(v))
5086 && !shadow_mode_external(v->domain)
5087 && (guest_index(gl3e) % 4) == 3)
5088 ? SH_type_l2h_shadow
5089 : SH_type_l2_shadow);
5090 if ( mfn_x(gmfn) != mfn_x(mfn) )
5091 AUDIT_FAIL(3, "bad translation: gfn %" SH_PRI_gfn
5092 " --> %" PRI_mfn " != mfn %" PRI_mfn,
5093 gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn));
5095 });
5096 sh_unmap_domain_page(gp);
5097 return 0;
5100 int sh_audit_l4_table(struct vcpu *v, mfn_t sl4mfn, mfn_t x)
5102 guest_l4e_t *gl4e, *gp;
5103 shadow_l4e_t *sl4e;
5104 mfn_t mfn, gmfn, gl4mfn;
5105 gfn_t gfn;
5106 p2m_type_t p2mt;
5107 char *s;
5108 int done = 0;
5110 /* Follow the backpointer */
5111 gl4mfn = _mfn(mfn_to_page(sl4mfn)->v.sh.back);
5113 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
5114 /* Only L1's may be out of sync. */
5115 if ( page_is_out_of_sync(mfn_to_page(gl4mfn)) )
5116 AUDIT_FAIL_MIN(4, "gmfn %lx is out of sync", mfn_x(gl4mfn));
5117 #endif
5119 gl4e = gp = sh_map_domain_page(gl4mfn);
5120 SHADOW_FOREACH_L4E(sl4mfn, sl4e, &gl4e, done, v->domain,
5122 s = sh_audit_flags(v, 4, guest_l4e_get_flags(*gl4e),
5123 shadow_l4e_get_flags(*sl4e));
5124 if ( s ) AUDIT_FAIL(4, "%s", s);
5126 if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS )
5128 gfn = guest_l4e_get_gfn(*gl4e);
5129 mfn = shadow_l4e_get_mfn(*sl4e);
5130 gmfn = get_shadow_status(v, gfn_to_mfn_query(v->domain, gfn, &p2mt),
5131 SH_type_l3_shadow);
5132 if ( mfn_x(gmfn) != mfn_x(mfn) )
5133 AUDIT_FAIL(4, "bad translation: gfn %" SH_PRI_gfn
5134 " --> %" PRI_mfn " != mfn %" PRI_mfn,
5135 gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn));
5137 });
5138 sh_unmap_domain_page(gp);
5139 return 0;
5141 #endif /* GUEST_PAGING_LEVELS >= 4 */
5144 #undef AUDIT_FAIL
5146 #endif /* Audit code */
5148 /**************************************************************************/
5149 /* Entry points into this mode of the shadow code.
5150 * This will all be mangled by the preprocessor to uniquify everything. */
5151 struct paging_mode sh_paging_mode = {
5152 .page_fault = sh_page_fault,
5153 .invlpg = sh_invlpg,
5154 .gva_to_gfn = sh_gva_to_gfn,
5155 .update_cr3 = sh_update_cr3,
5156 .update_paging_modes = shadow_update_paging_modes,
5157 .write_p2m_entry = shadow_write_p2m_entry,
5158 .write_guest_entry = shadow_write_guest_entry,
5159 .cmpxchg_guest_entry = shadow_cmpxchg_guest_entry,
5160 .guest_map_l1e = sh_guest_map_l1e,
5161 .guest_get_eff_l1e = sh_guest_get_eff_l1e,
5162 .guest_levels = GUEST_PAGING_LEVELS,
5163 .shadow.detach_old_tables = sh_detach_old_tables,
5164 .shadow.x86_emulate_write = sh_x86_emulate_write,
5165 .shadow.x86_emulate_cmpxchg = sh_x86_emulate_cmpxchg,
5166 #ifdef __i386__
5167 .shadow.x86_emulate_cmpxchg8b = sh_x86_emulate_cmpxchg8b,
5168 #endif
5169 .shadow.make_monitor_table = sh_make_monitor_table,
5170 .shadow.destroy_monitor_table = sh_destroy_monitor_table,
5171 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
5172 .shadow.guess_wrmap = sh_guess_wrmap,
5173 #endif
5174 .shadow.shadow_levels = SHADOW_PAGING_LEVELS,
5175 };
5177 /*
5178 * Local variables:
5179 * mode: C
5180 * c-set-style: "BSD"
5181 * c-basic-offset: 4
5182 * indent-tabs-mode: nil
5183 * End:
5184 */