ia64/xen-unstable

xen/arch/x86/mm/shadow/multi.c @ 19237:07e65892fc8e

[VTD] Utilise the snoop control capability in shadow with VT-d code

We now compute the shadow PAT index in leaf page entries as follows:
1) No VT-d device assigned: leave the shadow PAT index as WB (already
handled by the existing shadow code).
2) Direct-assigned MMIO area: have the shadow code compute the shadow
PAT with gMTRR=UC and the gPAT value.
3) Snoop control enabled: set the shadow PAT index to WB.
4) Snoop control disabled: have the shadow code compute the shadow
PAT from gMTRR and gPAT (already handled by the existing shadow code).

Signed-off-by: Xin, Xiaohui <xiaohui.xin@intel.com>
author Keir Fraser <keir.fraser@citrix.com>
date Fri Feb 20 11:11:40 2009 +0000 (2009-02-20)
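The resulting selection order matches the level-1 branch of _sh_propagate()
later in this file; roughly (a condensed sketch, with gpaddr/spaddr standing
in for the guest and shadow physical addresses passed to get_pat_flags()):

    if ( hvm_get_mem_pinned_cacheattr(d, gfn, &type) )
        sflags |= pat_type_2_pte_flags(type);               /* pinned attribute wins */
    else if ( d->arch.hvm_domain.is_in_uc_mode )
        sflags |= pat_type_2_pte_flags(PAT_TYPE_UNCACHABLE);
    else if ( p2mt == p2m_mmio_direct )
        sflags |= get_pat_flags(v, gflags, gpaddr, spaddr,
                                MTRR_TYPE_UNCACHABLE);      /* 2) direct MMIO: gMTRR=UC + gPAT */
    else if ( iommu_snoop )
        sflags |= pat_type_2_pte_flags(PAT_TYPE_WRBACK);    /* 3) snoop control on: WB */
    else
        sflags |= get_pat_flags(v, gflags, gpaddr, spaddr,
                                NO_HARDCODE_MEM_TYPE);      /* 4) snoop control off: gMTRR + gPAT */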
parents 2262fddac319
children f1080b20cd15
1 /******************************************************************************
2 * arch/x86/mm/shadow/multi.c
3 *
4 * Simple, mostly-synchronous shadow page tables.
5 * Parts of this code are Copyright (c) 2006 by XenSource Inc.
6 * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
7 * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 */
24 #include <xen/config.h>
25 #include <xen/types.h>
26 #include <xen/mm.h>
27 #include <xen/trace.h>
28 #include <xen/sched.h>
29 #include <xen/perfc.h>
30 #include <xen/domain_page.h>
31 #include <asm/page.h>
32 #include <asm/current.h>
33 #include <asm/shadow.h>
34 #include <asm/flushtlb.h>
35 #include <asm/hvm/hvm.h>
36 #include <asm/hvm/cacheattr.h>
37 #include <asm/mtrr.h>
38 #include <asm/guest_pt.h>
39 #include "private.h"
40 #include "types.h"
42 /* THINGS TO DO LATER:
43 *
44 * TEARDOWN HEURISTICS
45 * Also: have a heuristic for when to destroy a previous paging-mode's
46 * shadows. When a guest is done with its start-of-day 32-bit tables
47 * and reuses the memory we want to drop those shadows. Start with
48 * shadows in a page in two modes as a hint, but beware of clever tricks
49 * like reusing a pagetable for both PAE and 64-bit during boot...
50 *
51 * PAE LINEAR MAPS
52 * Rework shadow_get_l*e() to have the option of using map_domain_page()
53 * instead of linear maps. Add appropriate unmap_l*e calls in the users.
54 * Then we can test the speed difference made by linear maps. If the
55 * map_domain_page() version is OK on PAE, we could maybe allow a lightweight
56 * l3-and-l2h-only shadow mode for PAE PV guests that would allow them
57 * to share l2h pages again.
58 *
59 * PSE disabled / PSE36
60 * We don't support any modes other than PSE enabled, PSE36 disabled.
61 * Neither of those would be hard to change, but we'd need to be able to
62 * deal with shadows made in one mode and used in another.
63 */
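/* Fetch types describe why a shadow entry is being built:
 * - prefetch: speculative, not in response to a fault;
 * - demand read: the guest faulted on a read through this entry;
 * - demand write: the guest faulted on a write (a demand fetch with the
 *   write bit set). */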
65 #define FETCH_TYPE_PREFETCH 1
66 #define FETCH_TYPE_DEMAND 2
67 #define FETCH_TYPE_WRITE 4
68 typedef enum {
69 ft_prefetch = FETCH_TYPE_PREFETCH,
70 ft_demand_read = FETCH_TYPE_DEMAND,
71 ft_demand_write = FETCH_TYPE_DEMAND | FETCH_TYPE_WRITE,
72 } fetch_type_t;
74 #ifdef DEBUG_TRACE_DUMP
75 static char *fetch_type_names[] = {
76 [ft_prefetch] "prefetch",
77 [ft_demand_read] "demand read",
78 [ft_demand_write] "demand write",
79 };
80 #endif
82 /**************************************************************************/
83 /* Hash table mapping from guest pagetables to shadows
84 *
85 * Normal case: maps the mfn of a guest page to the mfn of its shadow page.
86 * FL1's: maps the *gfn* of the start of a superpage to the mfn of a
87 * shadow L1 which maps its "splinters".
88 */
90 static inline mfn_t
91 get_fl1_shadow_status(struct vcpu *v, gfn_t gfn)
92 /* Look for FL1 shadows in the hash table */
93 {
94 mfn_t smfn = shadow_hash_lookup(v, gfn_x(gfn), SH_type_fl1_shadow);
95 return smfn;
96 }
98 static inline mfn_t
99 get_shadow_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type)
100 /* Look for shadows in the hash table */
101 {
102 mfn_t smfn = shadow_hash_lookup(v, mfn_x(gmfn), shadow_type);
103 perfc_incr(shadow_get_shadow_status);
104 return smfn;
105 }
107 static inline void
108 set_fl1_shadow_status(struct vcpu *v, gfn_t gfn, mfn_t smfn)
109 /* Put an FL1 shadow into the hash table */
110 {
111 SHADOW_PRINTK("gfn=%"SH_PRI_gfn", type=%08x, smfn=%05lx\n",
112 gfn_x(gfn), SH_type_fl1_shadow, mfn_x(smfn));
114 shadow_hash_insert(v, gfn_x(gfn), SH_type_fl1_shadow, smfn);
115 }
117 static inline void
118 set_shadow_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type, mfn_t smfn)
119 /* Put a shadow into the hash table */
120 {
121 struct domain *d = v->domain;
122 int res;
124 SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx, type=%08x, smfn=%05lx\n",
125 d->domain_id, v->vcpu_id, mfn_x(gmfn),
126 shadow_type, mfn_x(smfn));
128 /* 32-on-64 PV guests don't own their l4 pages so can't get_page them */
129 if ( !is_pv_32on64_vcpu(v) || shadow_type != SH_type_l4_64_shadow )
130 {
131 res = get_page(mfn_to_page(gmfn), d);
132 ASSERT(res == 1);
133 }
135 shadow_hash_insert(v, mfn_x(gmfn), shadow_type, smfn);
136 }
138 static inline void
139 delete_fl1_shadow_status(struct vcpu *v, gfn_t gfn, mfn_t smfn)
140 /* Remove a shadow from the hash table */
141 {
142 SHADOW_PRINTK("gfn=%"SH_PRI_gfn", type=%08x, smfn=%05lx\n",
143 gfn_x(gfn), SH_type_fl1_shadow, mfn_x(smfn));
144 shadow_hash_delete(v, gfn_x(gfn), SH_type_fl1_shadow, smfn);
145 }
147 static inline void
148 delete_shadow_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type, mfn_t smfn)
149 /* Remove a shadow from the hash table */
150 {
151 SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx, type=%08x, smfn=%05lx\n",
152 v->domain->domain_id, v->vcpu_id,
153 mfn_x(gmfn), shadow_type, mfn_x(smfn));
154 shadow_hash_delete(v, mfn_x(gmfn), shadow_type, smfn);
155 /* 32-on-64 PV guests don't own their l4 pages; see set_shadow_status */
156 if ( !is_pv_32on64_vcpu(v) || shadow_type != SH_type_l4_64_shadow )
157 put_page(mfn_to_page(gmfn));
158 }
161 /**************************************************************************/
162 /* Functions for walking the guest page tables */
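/* Walk the guest pagetables for va, filling in *gw. This wraps
 * guest_walk_tables() with the right top-level table for the current
 * paging mode. Returns 0 on success; callers such as sh_guest_map_l1e()
 * below treat any non-zero value as a failed walk. */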
164 static inline uint32_t
165 sh_walk_guest_tables(struct vcpu *v, unsigned long va, walk_t *gw,
166 uint32_t pfec)
167 {
168 return guest_walk_tables(v, va, gw, pfec,
169 #if GUEST_PAGING_LEVELS == 3 /* PAE */
170 _mfn(INVALID_MFN),
171 v->arch.paging.shadow.gl3e
172 #else /* 32 or 64 */
173 pagetable_get_mfn(v->arch.guest_table),
174 v->arch.paging.shadow.guest_vtable
175 #endif
176 );
177 }
179 /* This validation is called with the shadow lock held and after write
180 * permission has been removed. The check is therefore atomic, and no
181 * further inconsistent content can be observed before the lock is released.
182 *
183 * Returns 1 to indicate success and 0 for inconsistency.
184 */
185 static inline uint32_t
186 shadow_check_gwalk(struct vcpu *v, unsigned long va, walk_t *gw, int version)
187 {
188 struct domain *d = v->domain;
189 guest_l1e_t *l1p;
190 guest_l2e_t *l2p;
191 #if GUEST_PAGING_LEVELS >= 4
192 guest_l3e_t *l3p;
193 guest_l4e_t *l4p;
194 #endif
195 int mismatch = 0;
197 ASSERT(shadow_locked_by_me(d));
199 if ( version == atomic_read(&d->arch.paging.shadow.gtable_dirty_version) )
200 return 1;
202 /* We could cache the guest page mappings from the last
203 * guest table walk. However, this check happens relatively
204 * infrequently, so paying the small cost of remapping the guest
205 * pages here is better than caching the mappings on every
206 * guest table walk.
207 *
208 * Also, when an inconsistency is found, simply return and let
209 * another fault be triggered, rather than re-validating the new
210 * path; this keeps the logic simple.
211 */
212 perfc_incr(shadow_check_gwalk);
213 #if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
214 #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
215 l4p = (guest_l4e_t *)v->arch.paging.shadow.guest_vtable;
216 mismatch |= (gw->l4e.l4 != l4p[guest_l4_table_offset(va)].l4);
217 l3p = sh_map_domain_page(gw->l3mfn);
218 mismatch |= (gw->l3e.l3 != l3p[guest_l3_table_offset(va)].l3);
219 sh_unmap_domain_page(l3p);
220 #else
221 mismatch |= (gw->l3e.l3 !=
222 v->arch.paging.shadow.gl3e[guest_l3_table_offset(va)].l3);
223 #endif
224 l2p = sh_map_domain_page(gw->l2mfn);
225 mismatch |= (gw->l2e.l2 != l2p[guest_l2_table_offset(va)].l2);
226 sh_unmap_domain_page(l2p);
227 #else
228 l2p = (guest_l2e_t *)v->arch.paging.shadow.guest_vtable;
229 mismatch |= (gw->l2e.l2 != l2p[guest_l2_table_offset(va)].l2);
230 #endif
231 if ( !(guest_supports_superpages(v) &&
232 (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE)) )
233 {
234 l1p = sh_map_domain_page(gw->l1mfn);
235 mismatch |= (gw->l1e.l1 != l1p[guest_l1_table_offset(va)].l1);
236 sh_unmap_domain_page(l1p);
237 }
239 return !mismatch;
240 }
242 /* Remove write access permissions from a gwalk_t in a batch, and
243 * return an OR-ed set of flags indicating whether a TLB flush is needed
244 * and whether the guest pages need to be re-walked.
245 *
246 * Syncing pages will remove write access to that page; but it may
247 * also give write access to other pages in the path. If we resync any
248 * pages, re-walk from the beginning.
249 */
250 #define GW_RMWR_FLUSHTLB 1
251 #define GW_RMWR_REWALK 2
253 static inline uint32_t
254 gw_remove_write_accesses(struct vcpu *v, unsigned long va, walk_t *gw)
255 {
256 uint32_t rc = 0;
258 #if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
259 #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
260 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
261 if ( mfn_is_out_of_sync(gw->l3mfn) )
262 {
263 sh_resync(v, gw->l3mfn);
264 rc = GW_RMWR_REWALK;
265 }
266 else
267 #endif /* OOS */
268 if ( sh_remove_write_access(v, gw->l3mfn, 3, va) )
269 rc = GW_RMWR_FLUSHTLB;
270 #endif /* GUEST_PAGING_LEVELS >= 4 */
272 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
273 if ( mfn_is_out_of_sync(gw->l2mfn) )
274 {
275 sh_resync(v, gw->l2mfn);
276 rc |= GW_RMWR_REWALK;
277 }
278 else
279 #endif /* OOS */
280 if ( sh_remove_write_access(v, gw->l2mfn, 2, va) )
281 rc |= GW_RMWR_FLUSHTLB;
282 #endif /* GUEST_PAGING_LEVELS >= 3 */
284 if ( !(guest_supports_superpages(v) &&
285 (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE))
286 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
287 && !mfn_is_out_of_sync(gw->l1mfn)
288 #endif /* OOS */
289 && sh_remove_write_access(v, gw->l1mfn, 1, va) )
290 rc |= GW_RMWR_FLUSHTLB;
292 return rc;
293 }
295 #if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES
296 /* Lightweight audit: pass all the shadows associated with this guest walk
297 * through the audit mechanisms */
298 static void sh_audit_gw(struct vcpu *v, walk_t *gw)
299 {
300 mfn_t smfn;
302 if ( !(SHADOW_AUDIT_ENABLE) )
303 return;
305 #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
306 if ( mfn_valid(gw->l4mfn)
307 && mfn_valid((smfn = get_shadow_status(v, gw->l4mfn,
308 SH_type_l4_shadow))) )
309 (void) sh_audit_l4_table(v, smfn, _mfn(INVALID_MFN));
310 if ( mfn_valid(gw->l3mfn)
311 && mfn_valid((smfn = get_shadow_status(v, gw->l3mfn,
312 SH_type_l3_shadow))) )
313 (void) sh_audit_l3_table(v, smfn, _mfn(INVALID_MFN));
314 #endif /* 64-bit only... */
315 if ( mfn_valid(gw->l2mfn) )
316 {
317 if ( mfn_valid((smfn = get_shadow_status(v, gw->l2mfn,
318 SH_type_l2_shadow))) )
319 (void) sh_audit_l2_table(v, smfn, _mfn(INVALID_MFN));
320 #if GUEST_PAGING_LEVELS == 3
321 if ( mfn_valid((smfn = get_shadow_status(v, gw->l2mfn,
322 SH_type_l2h_shadow))) )
323 (void) sh_audit_l2_table(v, smfn, _mfn(INVALID_MFN));
324 #endif
325 }
326 if ( mfn_valid(gw->l1mfn)
327 && mfn_valid((smfn = get_shadow_status(v, gw->l1mfn,
328 SH_type_l1_shadow))) )
329 (void) sh_audit_l1_table(v, smfn, _mfn(INVALID_MFN));
330 else if ( (guest_l2e_get_flags(gw->l2e) & _PAGE_PRESENT)
331 && (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE)
332 && mfn_valid(
333 (smfn = get_fl1_shadow_status(v, guest_l2e_get_gfn(gw->l2e)))) )
334 (void) sh_audit_fl1_table(v, smfn, _mfn(INVALID_MFN));
335 }
337 #else
338 #define sh_audit_gw(_v, _gw) do {} while(0)
339 #endif /* audit code */
342 #if (CONFIG_PAGING_LEVELS == GUEST_PAGING_LEVELS)
343 void *
344 sh_guest_map_l1e(struct vcpu *v, unsigned long addr,
345 unsigned long *gl1mfn)
346 {
347 void *pl1e = NULL;
348 walk_t gw;
350 ASSERT(shadow_mode_translate(v->domain));
352 // XXX -- this is expensive, but it's easy to cobble together...
353 // FIXME!
355 if ( sh_walk_guest_tables(v, addr, &gw, PFEC_page_present) == 0
356 && mfn_valid(gw.l1mfn) )
357 {
358 if ( gl1mfn )
359 *gl1mfn = mfn_x(gw.l1mfn);
360 pl1e = map_domain_page(mfn_x(gw.l1mfn)) +
361 (guest_l1_table_offset(addr) * sizeof(guest_l1e_t));
362 }
364 return pl1e;
365 }
367 void
368 sh_guest_get_eff_l1e(struct vcpu *v, unsigned long addr, void *eff_l1e)
369 {
370 walk_t gw;
372 ASSERT(shadow_mode_translate(v->domain));
374 // XXX -- this is expensive, but it's easy to cobble together...
375 // FIXME!
377 (void) sh_walk_guest_tables(v, addr, &gw, PFEC_page_present);
378 *(guest_l1e_t *)eff_l1e = gw.l1e;
379 }
380 #endif /* CONFIG == GUEST (== SHADOW) */
382 /**************************************************************************/
383 /* Functions to compute the correct index into a shadow page, given an
384 * index into the guest page (as returned by guest_get_index()).
385 * This is trivial when the shadow and guest use the same sized PTEs, but
386 * gets more interesting when those sizes are mismatched (e.g. 32-bit guest,
387 * PAE- or 64-bit shadows).
388 *
389 * These functions also increment the shadow mfn, when necessary. When PTE
390 * sizes are mismatched, it takes 2 shadow L1 pages for a single guest L1
391 * page. In this case, we allocate 2 contiguous pages for the shadow L1, and
392 * use simple pointer arithmetic on a pointer to the guest L1e to figure out
393 * which shadow page we really want. Similarly, when PTE sizes are
394 * mismatched, we shadow a guest L2 page with 4 shadow L2 pages. (The easiest
395 * way to see this is: a 32-bit guest L2 page maps 4GB of virtual address
396 * space, while a PAE- or 64-bit shadow L2 page maps 1GB of virtual address
397 * space.)
398 *
399 * For PAE guests, for every 32-bytes of guest L3 page table, we use 64-bytes
400 * of shadow (to store both the shadow, and the info that would normally be
401 * stored in page_info fields). This arrangement allows the shadow and the
402 * "page_info" fields to always be stored in the same page (in fact, in
403 * the same cache line), avoiding an extra call to map_domain_page().
404 */
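/* Worked example for the mismatched case: a 32-bit guest l1 holds 1024
 * 4-byte entries, but its PAE/64-bit shadow needs 8-byte entries, so the
 * same 1024 entries occupy two 4k shadow pages. shadow_l1_index() below
 * therefore advances the shadow mfn by guest_index / SHADOW_L1_PAGETABLE_ENTRIES
 * and returns the remainder as the index within that page. */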
406 static inline u32
407 guest_index(void *ptr)
408 {
409 return (u32)((unsigned long)ptr & ~PAGE_MASK) / sizeof(guest_l1e_t);
410 }
412 static u32
413 shadow_l1_index(mfn_t *smfn, u32 guest_index)
414 {
415 #if (GUEST_PAGING_LEVELS == 2)
416 *smfn = _mfn(mfn_x(*smfn) +
417 (guest_index / SHADOW_L1_PAGETABLE_ENTRIES));
418 return (guest_index % SHADOW_L1_PAGETABLE_ENTRIES);
419 #else
420 return guest_index;
421 #endif
422 }
424 static u32
425 shadow_l2_index(mfn_t *smfn, u32 guest_index)
426 {
427 #if (GUEST_PAGING_LEVELS == 2)
428 // Because we use 2 shadow l2 entries for each guest entry, the number of
429 // guest entries per shadow page is SHADOW_L2_PAGETABLE_ENTRIES/2
430 //
431 *smfn = _mfn(mfn_x(*smfn) +
432 (guest_index / (SHADOW_L2_PAGETABLE_ENTRIES / 2)));
434 // We multiply by two to get the index of the first of the two entries
435 // used to shadow the specified guest entry.
436 return (guest_index % (SHADOW_L2_PAGETABLE_ENTRIES / 2)) * 2;
437 #else
438 return guest_index;
439 #endif
440 }
442 #if GUEST_PAGING_LEVELS >= 4
444 static u32
445 shadow_l3_index(mfn_t *smfn, u32 guest_index)
446 {
447 return guest_index;
448 }
450 static u32
451 shadow_l4_index(mfn_t *smfn, u32 guest_index)
452 {
453 return guest_index;
454 }
456 #endif // GUEST_PAGING_LEVELS >= 4
459 /**************************************************************************/
460 /* Function which computes shadow entries from their corresponding guest
461 * entries. This is the "heart" of the shadow code. It operates using
462 * level-1 shadow types, but handles all levels of entry.
463 * Don't call it directly, but use the four wrappers below.
464 */
466 static always_inline void
467 _sh_propagate(struct vcpu *v,
468 guest_intpte_t guest_intpte,
469 mfn_t target_mfn,
470 void *shadow_entry_ptr,
471 int level,
472 fetch_type_t ft,
473 p2m_type_t p2mt)
474 {
475 guest_l1e_t guest_entry = { guest_intpte };
476 shadow_l1e_t *sp = shadow_entry_ptr;
477 struct domain *d = v->domain;
478 gfn_t target_gfn = guest_l1e_get_gfn(guest_entry);
479 u32 pass_thru_flags;
480 u32 gflags, sflags;
482 /* We don't shadow PAE l3s */
483 ASSERT(GUEST_PAGING_LEVELS > 3 || level != 3);
485 /* Check there's something for the shadows to map to */
486 if ( !p2m_is_valid(p2mt) )
487 {
488 *sp = shadow_l1e_empty();
489 goto done;
490 }
492 gflags = guest_l1e_get_flags(guest_entry);
494 if ( unlikely(!(gflags & _PAGE_PRESENT)) )
495 {
496 /* If a guest l1 entry is not present, shadow with the magic
497 * guest-not-present entry. */
498 if ( level == 1 )
499 *sp = sh_l1e_gnp();
500 else
501 *sp = shadow_l1e_empty();
502 goto done;
503 }
505 if ( level == 1 && p2mt == p2m_mmio_dm )
506 {
507 /* Guest l1e maps emulated MMIO space */
508 *sp = sh_l1e_mmio(target_gfn, gflags);
509 if ( !d->arch.paging.shadow.has_fast_mmio_entries )
510 d->arch.paging.shadow.has_fast_mmio_entries = 1;
511 goto done;
512 }
514 // Must have a valid target_mfn unless this is a prefetch or an l1
515 // pointing at MMIO space. In the case of a prefetch, an invalid
516 // mfn means that we can not usefully shadow anything, and so we
517 // return early.
518 //
519 if ( !mfn_valid(target_mfn)
520 && !(level == 1 && (!shadow_mode_refcounts(d)
521 || p2mt == p2m_mmio_direct)) )
522 {
523 ASSERT((ft == ft_prefetch));
524 *sp = shadow_l1e_empty();
525 goto done;
526 }
528 // Propagate bits from the guest to the shadow.
529 // Some of these may be overwritten, below.
530 // Since we know the guest's PRESENT bit is set, we also set the shadow's
531 // SHADOW_PRESENT bit.
532 //
533 pass_thru_flags = (_PAGE_ACCESSED | _PAGE_USER |
534 _PAGE_RW | _PAGE_PRESENT);
535 if ( guest_supports_nx(v) )
536 pass_thru_flags |= _PAGE_NX_BIT;
537 if ( !shadow_mode_refcounts(d) && !mfn_valid(target_mfn) )
538 pass_thru_flags |= _PAGE_PAT | _PAGE_PCD | _PAGE_PWT;
539 sflags = gflags & pass_thru_flags;
541 /*
542 * For HVM domains with direct access to MMIO areas, set the correct
543 * caching attributes in the shadows to match what was asked for.
544 */
545 if ( (level == 1) && is_hvm_domain(d) && has_arch_pdevs(d) &&
546 !is_xen_heap_mfn(mfn_x(target_mfn)) )
547 {
548 unsigned int type;
550 /* Compute the PAT index for the shadow page entry when VT-d is enabled
551 * and a device is assigned:
552 * 1) direct MMIO: compute the PAT index with gMTRR=UC and gPAT.
553 * 2) if snoop control is enabled, use WB as the PAT index.
554 * 3) if snoop control is disabled, compute the PAT index from
555 * gMTRR and gPAT.
556 */
557 if ( hvm_get_mem_pinned_cacheattr(d, gfn_x(target_gfn), &type) )
558 sflags |= pat_type_2_pte_flags(type);
559 else if ( d->arch.hvm_domain.is_in_uc_mode )
560 sflags |= pat_type_2_pte_flags(PAT_TYPE_UNCACHABLE);
561 else if ( p2mt == p2m_mmio_direct )
562 sflags |= get_pat_flags(v,
563 gflags,
564 gfn_to_paddr(target_gfn),
565 ((paddr_t)mfn_x(target_mfn)) << PAGE_SHIFT,
566 MTRR_TYPE_UNCACHABLE);
567 else if ( iommu_snoop )
568 sflags |= pat_type_2_pte_flags(PAT_TYPE_WRBACK);
569 else
570 sflags |= get_pat_flags(v,
571 gflags,
572 gfn_to_paddr(target_gfn),
573 ((paddr_t)mfn_x(target_mfn)) << PAGE_SHIFT,
574 NO_HARDCODE_MEM_TYPE);
575 }
577 // Set the A&D bits for higher level shadows.
578 // Higher level entries do not, strictly speaking, have dirty bits, but
579 // since we use shadow linear tables, each of these entries may, at some
580 // point in time, also serve as a shadow L1 entry.
581 // By setting both the A&D bits in each of these, we eliminate the burden
582 // on the hardware to update these bits on initial accesses.
583 //
584 if ( (level > 1) && !((SHADOW_PAGING_LEVELS == 3) && (level == 3)) )
585 sflags |= _PAGE_ACCESSED | _PAGE_DIRTY;
587 // If the A or D bit has not yet been set in the guest, then we must
588 // prevent the corresponding kind of access.
589 //
590 if ( unlikely(!(gflags & _PAGE_ACCESSED)) )
591 sflags &= ~_PAGE_PRESENT;
593 /* D bits exist in L1es and PSE L2es */
594 if ( unlikely(((level == 1) ||
595 ((level == 2) &&
596 (gflags & _PAGE_PSE) &&
597 guest_supports_superpages(v)))
598 && !(gflags & _PAGE_DIRTY)) )
599 sflags &= ~_PAGE_RW;
601 // shadow_mode_log_dirty support
602 //
603 // Only allow the guest write access to a page a) on a demand fault,
604 // or b) if the page is already marked as dirty.
605 //
606 // (We handle log-dirty entirely inside the shadow code, without using the
607 // p2m_ram_logdirty p2m type: only HAP uses that.)
608 if ( unlikely((level == 1) && shadow_mode_log_dirty(d)) )
609 {
610 if ( mfn_valid(target_mfn) ) {
611 if ( ft & FETCH_TYPE_WRITE )
612 paging_mark_dirty(d, mfn_x(target_mfn));
613 else if ( !sh_mfn_is_dirty(d, target_mfn) )
614 sflags &= ~_PAGE_RW;
615 }
616 }
618 if ( unlikely((level == 1) && d->dirty_vram
619 && d->dirty_vram->last_dirty == -1
620 && gfn_x(target_gfn) >= d->dirty_vram->begin_pfn
621 && gfn_x(target_gfn) < d->dirty_vram->end_pfn) )
622 {
623 if ( ft & FETCH_TYPE_WRITE )
624 d->dirty_vram->last_dirty = NOW();
625 else
626 sflags &= ~_PAGE_RW;
627 }
629 /* Read-only memory */
630 if ( p2mt == p2m_ram_ro )
631 sflags &= ~_PAGE_RW;
633 // protect guest page tables
634 //
635 if ( unlikely((level == 1)
636 && sh_mfn_is_a_page_table(target_mfn)
637 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC )
638 /* Unless the page is out of sync and the guest is
639 writing to it. */
640 && !(mfn_oos_may_write(target_mfn)
641 && (ft == ft_demand_write))
642 #endif /* OOS */
643 ) )
644 {
645 if ( shadow_mode_trap_reads(d) )
646 {
647 // if we are trapping both reads & writes, then mark this page
648 // as not present...
649 //
650 sflags &= ~_PAGE_PRESENT;
651 }
652 else
653 {
654 // otherwise, just prevent any writes...
655 //
656 sflags &= ~_PAGE_RW;
657 }
658 }
660 // PV guests in 64-bit mode use two different page tables for user vs
661 // supervisor permissions, making the guest's _PAGE_USER bit irrelevant.
662 // It is always shadowed as present...
663 if ( (GUEST_PAGING_LEVELS == 4) && !is_pv_32on64_domain(d)
664 && !is_hvm_domain(d) )
665 {
666 sflags |= _PAGE_USER;
667 }
669 *sp = shadow_l1e_from_mfn(target_mfn, sflags);
671 done:
672 SHADOW_DEBUG(PROPAGATE,
673 "%s level %u guest %" SH_PRI_gpte " shadow %" SH_PRI_pte "\n",
674 fetch_type_names[ft], level, guest_entry.l1, sp->l1);
675 }
678 /* These four wrappers give us a little bit of type-safety back around
679 * the use of void-* pointers and intpte types in _sh_propagate(), and
680 * allow the compiler to optimize out some level checks. */
682 #if GUEST_PAGING_LEVELS >= 4
683 static void
684 l4e_propagate_from_guest(struct vcpu *v,
685 guest_l4e_t gl4e,
686 mfn_t sl3mfn,
687 shadow_l4e_t *sl4e,
688 fetch_type_t ft)
689 {
690 _sh_propagate(v, gl4e.l4, sl3mfn, sl4e, 4, ft, p2m_ram_rw);
691 }
693 static void
694 l3e_propagate_from_guest(struct vcpu *v,
695 guest_l3e_t gl3e,
696 mfn_t sl2mfn,
697 shadow_l3e_t *sl3e,
698 fetch_type_t ft)
699 {
700 _sh_propagate(v, gl3e.l3, sl2mfn, sl3e, 3, ft, p2m_ram_rw);
701 }
702 #endif // GUEST_PAGING_LEVELS >= 4
704 static void
705 l2e_propagate_from_guest(struct vcpu *v,
706 guest_l2e_t gl2e,
707 mfn_t sl1mfn,
708 shadow_l2e_t *sl2e,
709 fetch_type_t ft)
710 {
711 _sh_propagate(v, gl2e.l2, sl1mfn, sl2e, 2, ft, p2m_ram_rw);
712 }
714 static void
715 l1e_propagate_from_guest(struct vcpu *v,
716 guest_l1e_t gl1e,
717 mfn_t gmfn,
718 shadow_l1e_t *sl1e,
719 fetch_type_t ft,
720 p2m_type_t p2mt)
721 {
722 _sh_propagate(v, gl1e.l1, gmfn, sl1e, 1, ft, p2mt);
723 }
726 /**************************************************************************/
727 /* These functions update shadow entries (and do bookkeeping on the shadow
728 * tables they are in). It is intended that they are the only
729 * functions which ever write (non-zero) data onto a shadow page.
730 */
732 static inline void safe_write_entry(void *dst, void *src)
733 /* Copy one PTE safely when processors might be running on the
734 * destination pagetable. This does *not* give safety against
735 * concurrent writes (that's what the shadow lock is for), just
736 * stops the hardware picking up partially written entries. */
737 {
738 volatile unsigned long *d = dst;
739 unsigned long *s = src;
740 ASSERT(!((unsigned long) d & (sizeof (shadow_l1e_t) - 1)));
741 #if CONFIG_PAGING_LEVELS == 3
742 /* In PAE mode, pagetable entries are larger
743 * than machine words, so won't get written atomically. We need to make
744 * sure any other cpu running on these shadows doesn't see a
745 * half-written entry. Do this by marking the entry not-present first,
746 * then writing the high word before the low word. */
747 BUILD_BUG_ON(sizeof (shadow_l1e_t) != 2 * sizeof (unsigned long));
748 d[0] = 0;
749 d[1] = s[1];
750 d[0] = s[0];
751 #else
752 /* In 64-bit, sizeof(pte) == sizeof(ulong) == 1 word,
753 * which will be an atomic write, since the entry is aligned. */
754 BUILD_BUG_ON(sizeof (shadow_l1e_t) != sizeof (unsigned long));
755 *d = *s;
756 #endif
757 }
760 static inline void
761 shadow_write_entries(void *d, void *s, int entries, mfn_t mfn)
762 /* This function does the actual writes to shadow pages.
763 * It must not be called directly, since it doesn't do the bookkeeping
764 * that shadow_set_l*e() functions do. */
765 {
766 shadow_l1e_t *dst = d;
767 shadow_l1e_t *src = s;
768 void *map = NULL;
769 int i;
771 /* Because we mirror access rights at all levels in the shadow, an
772 * l2 (or higher) entry with the RW bit cleared will leave us with
773 * no write access through the linear map.
774 * We detect that by writing to the shadow with copy_to_user() and
775 * using map_domain_page() to get a writeable mapping if we need to. */
776 if ( __copy_to_user(d, d, sizeof (unsigned long)) != 0 )
777 {
778 perfc_incr(shadow_linear_map_failed);
779 map = sh_map_domain_page(mfn);
780 ASSERT(map != NULL);
781 dst = map + ((unsigned long)dst & (PAGE_SIZE - 1));
782 }
785 for ( i = 0; i < entries; i++ )
786 safe_write_entry(dst++, src++);
788 if ( map != NULL ) sh_unmap_domain_page(map);
789 }
791 static inline int
792 perms_strictly_increased(u32 old_flags, u32 new_flags)
793 /* Given the flags of two entries, are the new flags a strict
794 * increase in rights over the old ones? */
795 {
796 u32 of = old_flags & (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX);
797 u32 nf = new_flags & (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX);
798 /* Flip the NX bit, since it's the only one that decreases rights;
799 * we calculate as if it were an "X" bit. */
800 of ^= _PAGE_NX_BIT;
801 nf ^= _PAGE_NX_BIT;
802 /* If the changed bits are all set in the new flags, then rights strictly
803 * increased between old and new. */
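/* Example (ignoring the NX flip): of = _PAGE_PRESENT and
 * nf = _PAGE_PRESENT|_PAGE_RW gives of|(of^nf) == nf, so rights
 * increased (returns 1); swapping of and nf gives of|(of^nf) != nf,
 * so rights did not increase (returns 0). */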
804 return ((of | (of ^ nf)) == nf);
805 }
807 static int inline
808 shadow_get_page_from_l1e(shadow_l1e_t sl1e, struct domain *d)
809 {
810 int res;
811 mfn_t mfn;
812 struct domain *owner;
814 ASSERT(!sh_l1e_is_magic(sl1e));
816 if ( !shadow_mode_refcounts(d) )
817 return 1;
819 res = get_page_from_l1e(sl1e, d);
821 // If a privileged domain is attempting to install a map of a page it does
822 // not own, we let it succeed anyway.
823 //
824 if ( unlikely(!res) &&
825 !shadow_mode_translate(d) &&
826 mfn_valid(mfn = shadow_l1e_get_mfn(sl1e)) &&
827 (owner = page_get_owner(mfn_to_page(mfn))) &&
828 (d != owner) &&
829 IS_PRIV_FOR(d, owner))
830 {
831 res = get_page_from_l1e(sl1e, owner);
832 SHADOW_PRINTK("privileged domain %d installs map of mfn %05lx "
833 "which is owned by domain %d: %s\n",
834 d->domain_id, mfn_x(mfn), owner->domain_id,
835 res ? "success" : "failed");
836 }
838 if ( unlikely(!res) )
839 {
840 perfc_incr(shadow_get_page_fail);
841 SHADOW_PRINTK("failed: l1e=" SH_PRI_pte "\n");
842 }
844 return res;
845 }
847 static void inline
848 shadow_put_page_from_l1e(shadow_l1e_t sl1e, struct domain *d)
849 {
850 if ( !shadow_mode_refcounts(d) )
851 return;
853 put_page_from_l1e(sl1e, d);
854 }
856 #if GUEST_PAGING_LEVELS >= 4
857 static int shadow_set_l4e(struct vcpu *v,
858 shadow_l4e_t *sl4e,
859 shadow_l4e_t new_sl4e,
860 mfn_t sl4mfn)
861 {
862 int flags = 0, ok;
863 shadow_l4e_t old_sl4e;
864 paddr_t paddr;
865 ASSERT(sl4e != NULL);
866 old_sl4e = *sl4e;
868 if ( old_sl4e.l4 == new_sl4e.l4 ) return 0; /* Nothing to do */
870 paddr = ((((paddr_t)mfn_x(sl4mfn)) << PAGE_SHIFT)
871 | (((unsigned long)sl4e) & ~PAGE_MASK));
873 if ( shadow_l4e_get_flags(new_sl4e) & _PAGE_PRESENT )
874 {
875 /* About to install a new reference */
876 mfn_t sl3mfn = shadow_l4e_get_mfn(new_sl4e);
877 ok = sh_get_ref(v, sl3mfn, paddr);
878 /* Are we pinning l3 shadows to handle weird Linux behaviour? */
879 if ( sh_type_is_pinnable(v, SH_type_l3_64_shadow) )
880 ok |= sh_pin(v, sl3mfn);
881 if ( !ok )
882 {
883 domain_crash(v->domain);
884 return SHADOW_SET_ERROR;
885 }
886 }
888 /* Write the new entry */
889 shadow_write_entries(sl4e, &new_sl4e, 1, sl4mfn);
890 flags |= SHADOW_SET_CHANGED;
892 if ( shadow_l4e_get_flags(old_sl4e) & _PAGE_PRESENT )
893 {
894 /* We lost a reference to an old mfn. */
895 mfn_t osl3mfn = shadow_l4e_get_mfn(old_sl4e);
896 if ( (mfn_x(osl3mfn) != mfn_x(shadow_l4e_get_mfn(new_sl4e)))
897 || !perms_strictly_increased(shadow_l4e_get_flags(old_sl4e),
898 shadow_l4e_get_flags(new_sl4e)) )
899 {
900 flags |= SHADOW_SET_FLUSH;
901 }
902 sh_put_ref(v, osl3mfn, paddr);
903 }
904 return flags;
905 }
907 static int shadow_set_l3e(struct vcpu *v,
908 shadow_l3e_t *sl3e,
909 shadow_l3e_t new_sl3e,
910 mfn_t sl3mfn)
911 {
912 int flags = 0;
913 shadow_l3e_t old_sl3e;
914 paddr_t paddr;
915 ASSERT(sl3e != NULL);
916 old_sl3e = *sl3e;
918 if ( old_sl3e.l3 == new_sl3e.l3 ) return 0; /* Nothing to do */
920 paddr = ((((paddr_t)mfn_x(sl3mfn)) << PAGE_SHIFT)
921 | (((unsigned long)sl3e) & ~PAGE_MASK));
923 if ( shadow_l3e_get_flags(new_sl3e) & _PAGE_PRESENT )
924 {
925 /* About to install a new reference */
926 if ( !sh_get_ref(v, shadow_l3e_get_mfn(new_sl3e), paddr) )
927 {
928 domain_crash(v->domain);
929 return SHADOW_SET_ERROR;
930 }
931 }
933 /* Write the new entry */
934 shadow_write_entries(sl3e, &new_sl3e, 1, sl3mfn);
935 flags |= SHADOW_SET_CHANGED;
937 if ( shadow_l3e_get_flags(old_sl3e) & _PAGE_PRESENT )
938 {
939 /* We lost a reference to an old mfn. */
940 mfn_t osl2mfn = shadow_l3e_get_mfn(old_sl3e);
941 if ( (mfn_x(osl2mfn) != mfn_x(shadow_l3e_get_mfn(new_sl3e))) ||
942 !perms_strictly_increased(shadow_l3e_get_flags(old_sl3e),
943 shadow_l3e_get_flags(new_sl3e)) )
944 {
945 flags |= SHADOW_SET_FLUSH;
946 }
947 sh_put_ref(v, osl2mfn, paddr);
948 }
949 return flags;
950 }
951 #endif /* GUEST_PAGING_LEVELS >= 4 */
953 static int shadow_set_l2e(struct vcpu *v,
954 shadow_l2e_t *sl2e,
955 shadow_l2e_t new_sl2e,
956 mfn_t sl2mfn)
957 {
958 int flags = 0;
959 shadow_l2e_t old_sl2e;
960 paddr_t paddr;
962 #if GUEST_PAGING_LEVELS == 2
963 /* In 2-on-3 we work with pairs of l2es pointing at two-page
964 * shadows. Reference counting and up-pointers track from the first
965 * page of the shadow to the first l2e, so make sure that we're
966 * working with those:
967 * Align the pointer down so it's pointing at the first of the pair */
968 sl2e = (shadow_l2e_t *)((unsigned long)sl2e & ~(sizeof(shadow_l2e_t)));
969 /* Align the mfn of the shadow entry too */
970 new_sl2e.l2 &= ~(1<<PAGE_SHIFT);
971 #endif
973 ASSERT(sl2e != NULL);
974 old_sl2e = *sl2e;
976 if ( old_sl2e.l2 == new_sl2e.l2 ) return 0; /* Nothing to do */
978 paddr = ((((paddr_t)mfn_x(sl2mfn)) << PAGE_SHIFT)
979 | (((unsigned long)sl2e) & ~PAGE_MASK));
981 if ( shadow_l2e_get_flags(new_sl2e) & _PAGE_PRESENT )
982 {
983 mfn_t sl1mfn = shadow_l2e_get_mfn(new_sl2e);
985 /* About to install a new reference */
986 if ( !sh_get_ref(v, sl1mfn, paddr) )
987 {
988 domain_crash(v->domain);
989 return SHADOW_SET_ERROR;
990 }
991 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
992 {
993 struct page_info *sp = mfn_to_page(sl1mfn);
994 mfn_t gl1mfn = _mfn(sp->v.sh.back);
996 /* If the shadow is a fl1 then the backpointer contains
997 the GFN instead of the GMFN, and it's definitely not
998 OOS. */
999 if ( (sp->u.sh.type != SH_type_fl1_shadow) && mfn_valid(gl1mfn)
1000 && mfn_is_out_of_sync(gl1mfn) )
1001 sh_resync(v, gl1mfn);
1002 }
1003 #endif
1004 }
1006 /* Write the new entry */
1007 #if GUEST_PAGING_LEVELS == 2
1008 {
1009 shadow_l2e_t pair[2] = { new_sl2e, new_sl2e };
1010 /* The l1 shadow is two pages long and needs to be pointed to by
1011 * two adjacent l2es. The pair have the same flags, but point
1012 * at odd and even MFNs */
1013 ASSERT(!(pair[0].l2 & (1<<PAGE_SHIFT)));
1014 pair[1].l2 |= (1<<PAGE_SHIFT);
1015 shadow_write_entries(sl2e, &pair, 2, sl2mfn);
1016 }
1017 #else /* normal case */
1018 shadow_write_entries(sl2e, &new_sl2e, 1, sl2mfn);
1019 #endif
1020 flags |= SHADOW_SET_CHANGED;
1022 if ( shadow_l2e_get_flags(old_sl2e) & _PAGE_PRESENT )
1023 {
1024 /* We lost a reference to an old mfn. */
1025 mfn_t osl1mfn = shadow_l2e_get_mfn(old_sl2e);
1026 if ( (mfn_x(osl1mfn) != mfn_x(shadow_l2e_get_mfn(new_sl2e))) ||
1027 !perms_strictly_increased(shadow_l2e_get_flags(old_sl2e),
1028 shadow_l2e_get_flags(new_sl2e)) )
1029 {
1030 flags |= SHADOW_SET_FLUSH;
1031 }
1032 sh_put_ref(v, osl1mfn, paddr);
1033 }
1034 return flags;
1035 }
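/* A new shadow l1e is being installed: if it maps a frame in the tracked
 * dirty-VRAM range, record the machine address of this sl1e so the
 * dirty-VRAM code can find it again later. */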
1037 static inline void shadow_vram_get_l1e(shadow_l1e_t new_sl1e,
1038 shadow_l1e_t *sl1e,
1039 mfn_t sl1mfn,
1040 struct domain *d)
1041 {
1042 mfn_t mfn;
1043 unsigned long gfn;
1045 if ( !d->dirty_vram ) return;
1047 mfn = shadow_l1e_get_mfn(new_sl1e);
1049 if ( !mfn_valid(mfn) ) return; /* m2p for mmio_direct may not exist */
1051 gfn = mfn_to_gfn(d, mfn);
1053 if ( (gfn >= d->dirty_vram->begin_pfn) && (gfn < d->dirty_vram->end_pfn) ) {
1054 unsigned long i = gfn - d->dirty_vram->begin_pfn;
1055 struct page_info *page = mfn_to_page(mfn);
1057 if ( (page->u.inuse.type_info & PGT_count_mask) == 1 )
1058 /* Initial guest reference, record it */
1059 d->dirty_vram->sl1ma[i] = pfn_to_paddr(mfn_x(sl1mfn))
1060 | ((unsigned long)sl1e & ~PAGE_MASK);
1061 }
1062 }
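/* A shadow l1e mapping a tracked VRAM frame is being replaced or removed:
 * drop the recorded sl1e address and, if the frame may have been written
 * through this mapping, mark it in the VRAM dirty bitmap. */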
1064 static inline void shadow_vram_put_l1e(shadow_l1e_t old_sl1e,
1065 shadow_l1e_t *sl1e,
1066 mfn_t sl1mfn,
1067 struct domain *d)
1068 {
1069 mfn_t mfn;
1070 unsigned long gfn;
1072 if ( !d->dirty_vram ) return;
1074 mfn = shadow_l1e_get_mfn(old_sl1e);
1076 if ( !mfn_valid(mfn) ) return;
1078 gfn = mfn_to_gfn(d, mfn);
1080 if ( (gfn >= d->dirty_vram->begin_pfn) && (gfn < d->dirty_vram->end_pfn) ) {
1081 unsigned long i = gfn - d->dirty_vram->begin_pfn;
1082 struct page_info *page = mfn_to_page(mfn);
1083 int dirty = 0;
1084 paddr_t sl1ma = pfn_to_paddr(mfn_x(sl1mfn))
1085 | ((unsigned long)sl1e & ~PAGE_MASK);
1087 if ( (page->u.inuse.type_info & PGT_count_mask) == 1 ) {
1088 /* Last reference */
1089 if ( d->dirty_vram->sl1ma[i] == INVALID_PADDR ) {
1090 /* We didn't know it was that one, let's say it is dirty */
1091 dirty = 1;
1092 } else {
1093 ASSERT(d->dirty_vram->sl1ma[i] == sl1ma);
1094 d->dirty_vram->sl1ma[i] = INVALID_PADDR;
1095 if ( shadow_l1e_get_flags(old_sl1e) & _PAGE_DIRTY )
1096 dirty = 1;
1097 }
1098 } else {
1099 /* We had more than one reference, just consider the page dirty. */
1100 dirty = 1;
1101 /* Check that it's not the one we recorded. */
1102 if ( d->dirty_vram->sl1ma[i] == sl1ma ) {
1103 /* Too bad, we remembered the wrong one... */
1104 d->dirty_vram->sl1ma[i] = INVALID_PADDR;
1105 } else {
1106 /* Ok, our recorded sl1e is still pointing to this page, let's
1107 * just hope it will remain. */
1108 }
1109 }
1110 if ( dirty ) {
1111 d->dirty_vram->dirty_bitmap[i / 8] |= 1 << (i % 8);
1112 d->dirty_vram->last_dirty = NOW();
1113 }
1114 }
1115 }
1117 static int shadow_set_l1e(struct vcpu *v,
1118 shadow_l1e_t *sl1e,
1119 shadow_l1e_t new_sl1e,
1120 mfn_t sl1mfn)
1121 {
1122 int flags = 0;
1123 struct domain *d = v->domain;
1124 shadow_l1e_t old_sl1e;
1125 #if SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC
1126 mfn_t new_gmfn = shadow_l1e_get_mfn(new_sl1e);
1127 #endif
1128 ASSERT(sl1e != NULL);
1130 #if SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC
1131 if ( mfn_valid(new_gmfn) && mfn_oos_may_write(new_gmfn)
1132 && ((shadow_l1e_get_flags(new_sl1e) & (_PAGE_RW|_PAGE_PRESENT))
1133 == (_PAGE_RW|_PAGE_PRESENT)) )
1134 oos_fixup_add(v, new_gmfn, sl1mfn, pgentry_ptr_to_slot(sl1e));
1135 #endif
1137 old_sl1e = *sl1e;
1139 if ( old_sl1e.l1 == new_sl1e.l1 ) return 0; /* Nothing to do */
1141 if ( (shadow_l1e_get_flags(new_sl1e) & _PAGE_PRESENT)
1142 && !sh_l1e_is_magic(new_sl1e) )
1143 {
1144 /* About to install a new reference */
1145 if ( shadow_mode_refcounts(d) ) {
1146 TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_SHADOW_L1_GET_REF);
1147 if ( shadow_get_page_from_l1e(new_sl1e, d) == 0 )
1148 {
1149 /* Doesn't look like a pagetable. */
1150 flags |= SHADOW_SET_ERROR;
1151 new_sl1e = shadow_l1e_empty();
1152 }
1153 else
1154 {
1155 shadow_vram_get_l1e(new_sl1e, sl1e, sl1mfn, d);
1156 }
1157 }
1158 }
1160 /* Write the new entry */
1161 shadow_write_entries(sl1e, &new_sl1e, 1, sl1mfn);
1162 flags |= SHADOW_SET_CHANGED;
1164 if ( (shadow_l1e_get_flags(old_sl1e) & _PAGE_PRESENT)
1165 && !sh_l1e_is_magic(old_sl1e) )
1166 {
1167 /* We lost a reference to an old mfn. */
1168 /* N.B. Unlike higher-level sets, never need an extra flush
1169 * when writing an l1e. Because it points to the same guest frame
1170 * as the guest l1e did, it's the guest's responsibility to
1171 * trigger a flush later. */
1172 if ( shadow_mode_refcounts(d) )
1173 {
1174 shadow_vram_put_l1e(old_sl1e, sl1e, sl1mfn, d);
1175 shadow_put_page_from_l1e(old_sl1e, d);
1176 TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_SHADOW_L1_PUT_REF);
1177 }
1178 }
1179 return flags;
1180 }
1183 /**************************************************************************/
1184 /* Macros to walk pagetables. These take the shadow of a pagetable and
1185 * walk every "interesting" entry. That is, they don't touch Xen mappings,
1186 * and for 32-bit l2s shadowed onto PAE or 64-bit, they only touch every
1187 * second entry (since pairs of entries are managed together). For multi-page
1188 * shadows they walk all pages.
1190 * Arguments are an MFN, the variable to point to each entry, a variable
1191 * to indicate that we are done (we will shortcut to the end of the scan
1192 * when _done != 0), a variable to indicate that we should avoid Xen mappings,
1193 * and the code.
1195 * WARNING: These macros have side-effects. They change the values of both
1196 * the pointer and the MFN. */
1198 static inline void increment_ptr_to_guest_entry(void *ptr)
1199 {
1200 if ( ptr )
1201 {
1202 guest_l1e_t **entry = ptr;
1203 (*entry)++;
1204 }
1205 }
1207 /* All kinds of l1: touch all entries */
1208 #define _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code) \
1209 do { \
1210 int _i; \
1211 shadow_l1e_t *_sp = sh_map_domain_page((_sl1mfn)); \
1212 ASSERT(mfn_to_page(_sl1mfn)->u.sh.type == SH_type_l1_shadow \
1213 || mfn_to_page(_sl1mfn)->u.sh.type == SH_type_fl1_shadow);\
1214 for ( _i = 0; _i < SHADOW_L1_PAGETABLE_ENTRIES; _i++ ) \
1215 { \
1216 (_sl1e) = _sp + _i; \
1217 if ( shadow_l1e_get_flags(*(_sl1e)) & _PAGE_PRESENT ) \
1218 {_code} \
1219 if ( _done ) break; \
1220 increment_ptr_to_guest_entry(_gl1p); \
1221 } \
1222 sh_unmap_domain_page(_sp); \
1223 } while (0)
1225 /* 32-bit l1, on PAE or 64-bit shadows: need to walk both pages of shadow */
1226 #if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2
1227 #define SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code) \
1228 do { \
1229 int __done = 0; \
1230 _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, \
1231 ({ (__done = _done); }), _code); \
1232 _sl1mfn = _mfn(mfn_x(_sl1mfn) + 1); \
1233 if ( !__done ) \
1234 _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, \
1235 ({ (__done = _done); }), _code); \
1236 } while (0)
1237 #else /* Everything else; l1 shadows are only one page */
1238 #define SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code) \
1239 _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code)
1240 #endif
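/* Illustrative use (not taken from this file): count the present entries
 * in an l1 shadow; the _code argument runs only for present entries.
 *   int done = 0, present = 0;
 *   shadow_l1e_t *sl1e;
 *   SHADOW_FOREACH_L1E(sl1mfn, sl1e, NULL, done, { present++; });
 */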
1243 #if GUEST_PAGING_LEVELS == 2
1245 /* 32-bit l2 on PAE/64: four pages, touch every second entry, and avoid Xen */
1246 #define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _dom, _code) \
1247 do { \
1248 int _i, _j, __done = 0; \
1249 int _xen = !shadow_mode_external(_dom); \
1250 ASSERT(mfn_to_page(_sl2mfn)->u.sh.type == SH_type_l2_32_shadow);\
1251 for ( _j = 0; _j < 4 && !__done; _j++ ) \
1252 { \
1253 shadow_l2e_t *_sp = sh_map_domain_page(_sl2mfn); \
1254 for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i += 2 ) \
1255 if ( (!(_xen)) \
1256 || ((_j * SHADOW_L2_PAGETABLE_ENTRIES) + _i) \
1257 < (HYPERVISOR_VIRT_START >> SHADOW_L2_PAGETABLE_SHIFT) ) \
1258 { \
1259 (_sl2e) = _sp + _i; \
1260 if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \
1261 {_code} \
1262 if ( (__done = (_done)) ) break; \
1263 increment_ptr_to_guest_entry(_gl2p); \
1264 } \
1265 sh_unmap_domain_page(_sp); \
1266 _sl2mfn = _mfn(mfn_x(_sl2mfn) + 1); \
1267 } \
1268 } while (0)
1270 #elif GUEST_PAGING_LEVELS == 3
1272 /* PAE: if it's an l2h, don't touch Xen mappings */
1273 #define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _dom, _code) \
1274 do { \
1275 int _i; \
1276 int _xen = !shadow_mode_external(_dom); \
1277 shadow_l2e_t *_sp = sh_map_domain_page((_sl2mfn)); \
1278 ASSERT(mfn_to_page(_sl2mfn)->u.sh.type == SH_type_l2_pae_shadow \
1279 || mfn_to_page(_sl2mfn)->u.sh.type == SH_type_l2h_pae_shadow);\
1280 for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i++ ) \
1281 if ( (!(_xen)) \
1282 || mfn_to_page(_sl2mfn)->u.sh.type != SH_type_l2h_pae_shadow\
1283 || ((_i + (3 * SHADOW_L2_PAGETABLE_ENTRIES)) \
1284 < (HYPERVISOR_VIRT_START >> SHADOW_L2_PAGETABLE_SHIFT)) ) \
1285 { \
1286 (_sl2e) = _sp + _i; \
1287 if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \
1288 {_code} \
1289 if ( _done ) break; \
1290 increment_ptr_to_guest_entry(_gl2p); \
1291 } \
1292 sh_unmap_domain_page(_sp); \
1293 } while (0)
1295 #else
1297 /* 64-bit l2: touch all entries except for PAE compat guests. */
1298 #define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _dom, _code) \
1299 do { \
1300 int _i; \
1301 int _xen = !shadow_mode_external(_dom); \
1302 shadow_l2e_t *_sp = sh_map_domain_page((_sl2mfn)); \
1303 ASSERT(mfn_to_page(_sl2mfn)->u.sh.type == SH_type_l2_64_shadow ||\
1304 mfn_to_page(_sl2mfn)->u.sh.type == SH_type_l2h_64_shadow);\
1305 for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i++ ) \
1306 { \
1307 if ( (!(_xen)) \
1308 || !is_pv_32on64_domain(_dom) \
1309 || mfn_to_page(_sl2mfn)->u.sh.type != SH_type_l2h_64_shadow\
1310 || (_i < COMPAT_L2_PAGETABLE_FIRST_XEN_SLOT(_dom)) ) \
1311 { \
1312 (_sl2e) = _sp + _i; \
1313 if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \
1314 {_code} \
1315 if ( _done ) break; \
1316 increment_ptr_to_guest_entry(_gl2p); \
1317 } \
1318 } \
1319 sh_unmap_domain_page(_sp); \
1320 } while (0)
1322 #endif /* different kinds of l2 */
1324 #if GUEST_PAGING_LEVELS == 4
1326 /* 64-bit l3: touch all entries */
1327 #define SHADOW_FOREACH_L3E(_sl3mfn, _sl3e, _gl3p, _done, _code) \
1328 do { \
1329 int _i; \
1330 shadow_l3e_t *_sp = sh_map_domain_page((_sl3mfn)); \
1331 ASSERT(mfn_to_page(_sl3mfn)->u.sh.type == SH_type_l3_64_shadow);\
1332 for ( _i = 0; _i < SHADOW_L3_PAGETABLE_ENTRIES; _i++ ) \
1333 { \
1334 (_sl3e) = _sp + _i; \
1335 if ( shadow_l3e_get_flags(*(_sl3e)) & _PAGE_PRESENT ) \
1336 {_code} \
1337 if ( _done ) break; \
1338 increment_ptr_to_guest_entry(_gl3p); \
1339 } \
1340 sh_unmap_domain_page(_sp); \
1341 } while (0)
1343 /* 64-bit l4: avoid Xen mappings */
1344 #define SHADOW_FOREACH_L4E(_sl4mfn, _sl4e, _gl4p, _done, _dom, _code) \
1345 do { \
1346 shadow_l4e_t *_sp = sh_map_domain_page((_sl4mfn)); \
1347 int _xen = !shadow_mode_external(_dom); \
1348 int _i; \
1349 ASSERT(mfn_to_page(_sl4mfn)->u.sh.type == SH_type_l4_64_shadow);\
1350 for ( _i = 0; _i < SHADOW_L4_PAGETABLE_ENTRIES; _i++ ) \
1351 { \
1352 if ( (!(_xen)) || is_guest_l4_slot(_dom, _i) ) \
1353 { \
1354 (_sl4e) = _sp + _i; \
1355 if ( shadow_l4e_get_flags(*(_sl4e)) & _PAGE_PRESENT ) \
1356 {_code} \
1357 if ( _done ) break; \
1358 } \
1359 increment_ptr_to_guest_entry(_gl4p); \
1360 } \
1361 sh_unmap_domain_page(_sp); \
1362 } while (0)
1364 #endif
1368 /**************************************************************************/
1369 /* Functions to install Xen mappings and linear mappings in shadow pages */
1371 // XXX -- this function should probably be moved to shadow-common.c, but that
1372 // probably wants to wait until the shadow types have been moved from
1373 // shadow-types.h to shadow-private.h
1374 //
1375 #if CONFIG_PAGING_LEVELS == 4 && GUEST_PAGING_LEVELS == 4
1376 void sh_install_xen_entries_in_l4(struct vcpu *v, mfn_t gl4mfn, mfn_t sl4mfn)
1378 struct domain *d = v->domain;
1379 shadow_l4e_t *sl4e;
1381 sl4e = sh_map_domain_page(sl4mfn);
1382 ASSERT(sl4e != NULL);
1383 ASSERT(sizeof (l4_pgentry_t) == sizeof (shadow_l4e_t));
1385 /* Copy the common Xen mappings from the idle domain */
1386 memcpy(&sl4e[ROOT_PAGETABLE_FIRST_XEN_SLOT],
1387 &idle_pg_table[ROOT_PAGETABLE_FIRST_XEN_SLOT],
1388 ROOT_PAGETABLE_XEN_SLOTS * sizeof(l4_pgentry_t));
1390 /* Install the per-domain mappings for this domain */
1391 sl4e[shadow_l4_table_offset(PERDOMAIN_VIRT_START)] =
1392 shadow_l4e_from_mfn(page_to_mfn(virt_to_page(d->arch.mm_perdomain_l3)),
1393 __PAGE_HYPERVISOR);
1395 /* Shadow linear mapping for 4-level shadows. N.B. for 3-level
1396 * shadows on 64-bit xen, this linear mapping is later replaced by the
1397 * monitor pagetable structure, which is built in make_monitor_table
1398 * and maintained by sh_update_linear_entries. */
1399 sl4e[shadow_l4_table_offset(SH_LINEAR_PT_VIRT_START)] =
1400 shadow_l4e_from_mfn(sl4mfn, __PAGE_HYPERVISOR);
1402 /* Self linear mapping. */
1403 if ( shadow_mode_translate(v->domain) && !shadow_mode_external(v->domain) )
1405 // linear tables may not be used with translated PV guests
1406 sl4e[shadow_l4_table_offset(LINEAR_PT_VIRT_START)] =
1407 shadow_l4e_empty();
1409 else
1411 sl4e[shadow_l4_table_offset(LINEAR_PT_VIRT_START)] =
1412 shadow_l4e_from_mfn(gl4mfn, __PAGE_HYPERVISOR);
1415 if ( shadow_mode_translate(v->domain) )
1417 /* install domain-specific P2M table */
1418 sl4e[shadow_l4_table_offset(RO_MPT_VIRT_START)] =
1419 shadow_l4e_from_mfn(pagetable_get_mfn(d->arch.phys_table),
1420 __PAGE_HYPERVISOR);
1423 sh_unmap_domain_page(sl4e);
1425 #endif
1427 #if CONFIG_PAGING_LEVELS >= 3 && GUEST_PAGING_LEVELS >= 3
1428 // For 3-on-3 PV guests, we need to make sure the xen mappings are in
1429 // place, which means that we need to populate the l2h entry in the l3
1430 // table.
1432 static void sh_install_xen_entries_in_l2h(struct vcpu *v, mfn_t sl2hmfn)
1434 struct domain *d = v->domain;
1435 shadow_l2e_t *sl2e;
1436 #if CONFIG_PAGING_LEVELS == 3
1437 int i;
1438 #else
1440 if ( !is_pv_32on64_vcpu(v) )
1441 return;
1442 #endif
1444 sl2e = sh_map_domain_page(sl2hmfn);
1445 ASSERT(sl2e != NULL);
1446 ASSERT(sizeof (l2_pgentry_t) == sizeof (shadow_l2e_t));
1448 #if CONFIG_PAGING_LEVELS == 3
1450 /* Copy the common Xen mappings from the idle domain */
1451 memcpy(&sl2e[L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1)],
1452 &idle_pg_table_l2[L2_PAGETABLE_FIRST_XEN_SLOT],
1453 L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
1455 /* Install the per-domain mappings for this domain */
1456 for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
1457 sl2e[shadow_l2_table_offset(PERDOMAIN_VIRT_START) + i] =
1458 shadow_l2e_from_mfn(
1459 page_to_mfn(virt_to_page(d->arch.mm_perdomain_pt) + i),
1460 __PAGE_HYPERVISOR);
1462 /* We don't set up a linear mapping here because we can't until this
1463 * l2h is installed in an l3e. sh_update_linear_entries() handles
1464 * the linear mappings when CR3 (and so the fourth l3e) is loaded.
1465 * We zero them here, just as a safety measure.
1466 */
1467 for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
1468 sl2e[shadow_l2_table_offset(LINEAR_PT_VIRT_START) + i] =
1469 shadow_l2e_empty();
1470 for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
1471 sl2e[shadow_l2_table_offset(SH_LINEAR_PT_VIRT_START) + i] =
1472 shadow_l2e_empty();
1474 if ( shadow_mode_translate(d) )
1476 /* Install the domain-specific p2m table */
1477 l3_pgentry_t *p2m;
1478 ASSERT(pagetable_get_pfn(d->arch.phys_table) != 0);
1479 p2m = sh_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
1480 for ( i = 0; i < MACHPHYS_MBYTES>>1; i++ )
1482 sl2e[shadow_l2_table_offset(RO_MPT_VIRT_START) + i] =
1483 (l3e_get_flags(p2m[i]) & _PAGE_PRESENT)
1484 ? shadow_l2e_from_mfn(_mfn(l3e_get_pfn(p2m[i])),
1485 __PAGE_HYPERVISOR)
1486 : shadow_l2e_empty();
1488 sh_unmap_domain_page(p2m);
1491 #else
1493 /* Copy the common Xen mappings from the idle domain */
1494 memcpy(
1495 &sl2e[COMPAT_L2_PAGETABLE_FIRST_XEN_SLOT(d)],
1496 &compat_idle_pg_table_l2[l2_table_offset(HIRO_COMPAT_MPT_VIRT_START)],
1497 COMPAT_L2_PAGETABLE_XEN_SLOTS(d) * sizeof(*sl2e));
1499 #endif
1501 sh_unmap_domain_page(sl2e);
1503 #endif
1509 /**************************************************************************/
1510 /* Create a shadow of a given guest page.
1511 */
1512 static mfn_t
1513 sh_make_shadow(struct vcpu *v, mfn_t gmfn, u32 shadow_type)
1515 mfn_t smfn = shadow_alloc(v->domain, shadow_type, mfn_x(gmfn));
1516 SHADOW_DEBUG(MAKE_SHADOW, "(%05lx, %u)=>%05lx\n",
1517 mfn_x(gmfn), shadow_type, mfn_x(smfn));
1519 if ( shadow_type != SH_type_l2_32_shadow
1520 && shadow_type != SH_type_l2_pae_shadow
1521 && shadow_type != SH_type_l2h_pae_shadow
1522 && shadow_type != SH_type_l4_64_shadow )
1523 /* Lower-level shadow, not yet linked from a higher level */
1524 mfn_to_page(smfn)->up = 0;
1526 #if GUEST_PAGING_LEVELS == 4
1527 #if (SHADOW_OPTIMIZATIONS & SHOPT_LINUX_L3_TOPLEVEL)
1528 if ( shadow_type == SH_type_l4_64_shadow &&
1529 unlikely(v->domain->arch.paging.shadow.opt_flags & SHOPT_LINUX_L3_TOPLEVEL) )
1531 /* We're shadowing a new l4, but we've been assuming the guest uses
1532 * only one l4 per vcpu and context switches using an l4 entry.
1533 * Count the number of active l4 shadows. If there are enough
1534 * of them, decide that this isn't an old linux guest, and stop
1535 * pinning l3es. This is not very quick but it doesn't happen
1536 * very often. */
1537 struct page_info *sp, *t;
1538 struct vcpu *v2;
1539 int l4count = 0, vcpus = 0;
1540 page_list_for_each(sp, &v->domain->arch.paging.shadow.pinned_shadows)
1542 if ( sp->u.sh.type == SH_type_l4_64_shadow )
1543 l4count++;
1545 for_each_vcpu ( v->domain, v2 )
1546 vcpus++;
1547 if ( l4count > 2 * vcpus )
1549 /* Unpin all the pinned l3 tables, and don't pin any more. */
1550 page_list_for_each_safe(sp, t, &v->domain->arch.paging.shadow.pinned_shadows)
1552 if ( sp->u.sh.type == SH_type_l3_64_shadow )
1553 sh_unpin(v, page_to_mfn(sp));
1555 v->domain->arch.paging.shadow.opt_flags &= ~SHOPT_LINUX_L3_TOPLEVEL;
1558 #endif
1559 #endif
1561 // Create the Xen mappings...
1562 if ( !shadow_mode_external(v->domain) )
1564 switch (shadow_type)
1566 #if CONFIG_PAGING_LEVELS == 4 && GUEST_PAGING_LEVELS == 4
1567 case SH_type_l4_shadow:
1568 sh_install_xen_entries_in_l4(v, gmfn, smfn); break;
1569 #endif
1570 #if CONFIG_PAGING_LEVELS >= 3 && GUEST_PAGING_LEVELS >= 3
1571 case SH_type_l2h_shadow:
1572 sh_install_xen_entries_in_l2h(v, smfn); break;
1573 #endif
1574 default: /* Do nothing */ break;
1578 shadow_promote(v, gmfn, shadow_type);
1579 set_shadow_status(v, gmfn, shadow_type, smfn);
1581 return smfn;
1584 /* Make a splintered superpage shadow */
1585 static mfn_t
1586 make_fl1_shadow(struct vcpu *v, gfn_t gfn)
1587 {
1588 mfn_t smfn = shadow_alloc(v->domain, SH_type_fl1_shadow,
1589 (unsigned long) gfn_x(gfn));
1591 SHADOW_DEBUG(MAKE_SHADOW, "(%" SH_PRI_gfn ")=>%" PRI_mfn "\n",
1592 gfn_x(gfn), mfn_x(smfn));
1594 set_fl1_shadow_status(v, gfn, smfn);
1595 return smfn;
1596 }
1599 #if SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS
1600 mfn_t
1601 sh_make_monitor_table(struct vcpu *v)
1603 struct domain *d = v->domain;
1605 ASSERT(pagetable_get_pfn(v->arch.monitor_table) == 0);
1607 /* Guarantee we can get the memory we need */
1608 shadow_prealloc(d, SH_type_monitor_table, CONFIG_PAGING_LEVELS);
1610 #if CONFIG_PAGING_LEVELS == 4
1612 mfn_t m4mfn;
1613 m4mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1614 sh_install_xen_entries_in_l4(v, m4mfn, m4mfn);
1615 /* Remember the level of this table */
1616 mfn_to_page(m4mfn)->shadow_flags = 4;
1617 #if SHADOW_PAGING_LEVELS < 4
1619 mfn_t m3mfn, m2mfn;
1620 l4_pgentry_t *l4e;
1621 l3_pgentry_t *l3e;
1622 /* Install an l3 table and an l2 table that will hold the shadow
1623 * linear map entries. This overrides the linear map entry that
1624 * was installed by sh_install_xen_entries_in_l4. */
1625 l4e = sh_map_domain_page(m4mfn);
1627 m3mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1628 mfn_to_page(m3mfn)->shadow_flags = 3;
1629 l4e[shadow_l4_table_offset(SH_LINEAR_PT_VIRT_START)]
1630 = l4e_from_pfn(mfn_x(m3mfn), __PAGE_HYPERVISOR);
1632 m2mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1633 mfn_to_page(m2mfn)->shadow_flags = 2;
1634 l3e = sh_map_domain_page(m3mfn);
1635 l3e[0] = l3e_from_pfn(mfn_x(m2mfn), __PAGE_HYPERVISOR);
1636 sh_unmap_domain_page(l3e);
1638 if ( is_pv_32on64_vcpu(v) )
1640 /* For 32-on-64 PV guests, we need to map the 32-bit Xen
1641 * area into its usual VAs in the monitor tables */
1642 m3mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1643 mfn_to_page(m3mfn)->shadow_flags = 3;
1644 l4e[0] = l4e_from_pfn(mfn_x(m3mfn), __PAGE_HYPERVISOR);
1646 m2mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1647 mfn_to_page(m2mfn)->shadow_flags = 2;
1648 l3e = sh_map_domain_page(m3mfn);
1649 l3e[3] = l3e_from_pfn(mfn_x(m2mfn), _PAGE_PRESENT);
1650 sh_install_xen_entries_in_l2h(v, m2mfn);
1651 sh_unmap_domain_page(l3e);
1654 sh_unmap_domain_page(l4e);
1656 #endif /* SHADOW_PAGING_LEVELS < 4 */
1657 return m4mfn;
1660 #elif CONFIG_PAGING_LEVELS == 3
1663 mfn_t m3mfn, m2mfn;
1664 l3_pgentry_t *l3e;
1665 l2_pgentry_t *l2e;
1666 int i;
1668 m3mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1669 /* Remember the level of this table */
1670 mfn_to_page(m3mfn)->shadow_flags = 3;
1672 // Install a monitor l2 table in slot 3 of the l3 table.
1673 // This is used for all Xen entries, including linear maps
1674 m2mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1675 mfn_to_page(m2mfn)->shadow_flags = 2;
1676 l3e = sh_map_domain_page(m3mfn);
1677 l3e[3] = l3e_from_pfn(mfn_x(m2mfn), _PAGE_PRESENT);
1678 sh_install_xen_entries_in_l2h(v, m2mfn);
1679 /* Install the monitor's own linear map */
1680 l2e = sh_map_domain_page(m2mfn);
1681 for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
1682 l2e[l2_table_offset(LINEAR_PT_VIRT_START) + i] =
1683 (l3e_get_flags(l3e[i]) & _PAGE_PRESENT)
1684 ? l2e_from_pfn(l3e_get_pfn(l3e[i]), __PAGE_HYPERVISOR)
1685 : l2e_empty();
1686 sh_unmap_domain_page(l2e);
1687 sh_unmap_domain_page(l3e);
1689 SHADOW_PRINTK("new monitor table: %#lx\n", mfn_x(m3mfn));
1690 return m3mfn;
1693 #else
1694 #error this should not happen
1695 #endif /* CONFIG_PAGING_LEVELS */
1697 #endif /* SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS */
1699 /**************************************************************************/
1700 /* These functions also take a virtual address and return the level-N
1701 * shadow table mfn and entry, but they create the shadow pagetables if
1702 * they are needed. The "demand" argument is non-zero when handling
1703 * a demand fault (so we know what to do about accessed bits &c).
1704 * If the necessary tables are not present in the guest, they return NULL. */
1706 /* N.B. The use of GUEST_PAGING_LEVELS here is correct. If the shadow has
1707 * more levels than the guest, the upper levels are always fixed and do not
1708 * reflect any information from the guest, so we do not use these functions
1709 * to access them. */
1711 #if GUEST_PAGING_LEVELS >= 4
1712 static shadow_l4e_t * shadow_get_and_create_l4e(struct vcpu *v,
1713 walk_t *gw,
1714 mfn_t *sl4mfn)
1715 {
1716 /* There is always a shadow of the top level table. Get it. */
1717 *sl4mfn = pagetable_get_mfn(v->arch.shadow_table[0]);
1718 /* Reading the top level table is always valid. */
1719 return sh_linear_l4_table(v) + shadow_l4_linear_offset(gw->va);
1720 }
1722 static shadow_l3e_t * shadow_get_and_create_l3e(struct vcpu *v,
1723 walk_t *gw,
1724 mfn_t *sl3mfn,
1725 fetch_type_t ft,
1726 int *resync)
1728 mfn_t sl4mfn;
1729 shadow_l4e_t *sl4e;
1730 if ( !mfn_valid(gw->l3mfn) ) return NULL; /* No guest page. */
1731 /* Get the l4e */
1732 sl4e = shadow_get_and_create_l4e(v, gw, &sl4mfn);
1733 ASSERT(sl4e != NULL);
1734 if ( shadow_l4e_get_flags(*sl4e) & _PAGE_PRESENT )
1736 *sl3mfn = shadow_l4e_get_mfn(*sl4e);
1737 ASSERT(mfn_valid(*sl3mfn));
1739 else
1741 int r;
1742 shadow_l4e_t new_sl4e;
1743 /* No l3 shadow installed: find and install it. */
1744 *sl3mfn = get_shadow_status(v, gw->l3mfn, SH_type_l3_shadow);
1745 if ( !mfn_valid(*sl3mfn) )
1747 /* No l3 shadow of this page exists at all: make one. */
1748 *sl3mfn = sh_make_shadow(v, gw->l3mfn, SH_type_l3_shadow);
1750 /* Install the new sl3 table in the sl4e */
1751 l4e_propagate_from_guest(v, gw->l4e, *sl3mfn, &new_sl4e, ft);
1752 r = shadow_set_l4e(v, sl4e, new_sl4e, sl4mfn);
1753 ASSERT((r & SHADOW_SET_FLUSH) == 0);
1754 if ( r & SHADOW_SET_ERROR )
1755 return NULL;
1757 #if (SHADOW_OPTIMIZATIONS && SHOPT_OUT_OF_SYNC )
1758 *resync |= 1;
1759 #endif
1762 /* Now follow it down a level. Guaranteed to succeed. */
1763 return sh_linear_l3_table(v) + shadow_l3_linear_offset(gw->va);
1765 #endif /* GUEST_PAGING_LEVELS >= 4 */
1768 static shadow_l2e_t * shadow_get_and_create_l2e(struct vcpu *v,
1769 walk_t *gw,
1770 mfn_t *sl2mfn,
1771 fetch_type_t ft,
1772 int *resync)
1774 #if GUEST_PAGING_LEVELS >= 4 /* 64bit... */
1775 mfn_t sl3mfn = _mfn(INVALID_MFN);
1776 shadow_l3e_t *sl3e;
1777 if ( !mfn_valid(gw->l2mfn) ) return NULL; /* No guest page. */
1778 /* Get the l3e */
1779 sl3e = shadow_get_and_create_l3e(v, gw, &sl3mfn, ft, resync);
1780 if ( sl3e == NULL ) return NULL;
1781 if ( shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT )
1783 *sl2mfn = shadow_l3e_get_mfn(*sl3e);
1784 ASSERT(mfn_valid(*sl2mfn));
1786 else
1788 int r;
1789 shadow_l3e_t new_sl3e;
1790 unsigned int t = SH_type_l2_shadow;
1792 /* Tag compat L2 containing hypervisor (m2p) mappings */
1793 if ( is_pv_32on64_domain(v->domain) &&
1794 guest_l4_table_offset(gw->va) == 0 &&
1795 guest_l3_table_offset(gw->va) == 3 )
1796 t = SH_type_l2h_shadow;
1798 /* No l2 shadow installed: find and install it. */
1799 *sl2mfn = get_shadow_status(v, gw->l2mfn, t);
1800 if ( !mfn_valid(*sl2mfn) )
1802 /* No l2 shadow of this page exists at all: make one. */
1803 *sl2mfn = sh_make_shadow(v, gw->l2mfn, t);
1805 /* Install the new sl2 table in the sl3e */
1806 l3e_propagate_from_guest(v, gw->l3e, *sl2mfn, &new_sl3e, ft);
1807 r = shadow_set_l3e(v, sl3e, new_sl3e, sl3mfn);
1808 ASSERT((r & SHADOW_SET_FLUSH) == 0);
1809 if ( r & SHADOW_SET_ERROR )
1810 return NULL;
1812 #if (SHADOW_OPTIMIZATIONS && SHOPT_OUT_OF_SYNC )
1813 *resync |= 1;
1814 #endif
1817 /* Now follow it down a level. Guaranteed to succeed. */
1818 return sh_linear_l2_table(v) + shadow_l2_linear_offset(gw->va);
1819 #elif GUEST_PAGING_LEVELS == 3 /* PAE... */
1820 /* We never demand-shadow PAE l3es: they are only created in
1821 * sh_update_cr3(). Check if the relevant sl3e is present. */
1822 shadow_l3e_t *sl3e = ((shadow_l3e_t *)&v->arch.paging.shadow.l3table)
1823 + shadow_l3_linear_offset(gw->va);
1824 if ( !(shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT) )
1825 return NULL;
1826 *sl2mfn = shadow_l3e_get_mfn(*sl3e);
1827 ASSERT(mfn_valid(*sl2mfn));
1828 return sh_linear_l2_table(v) + shadow_l2_linear_offset(gw->va);
1829 #else /* 32bit... */
1830 /* There is always a shadow of the top level table. Get it. */
1831 *sl2mfn = pagetable_get_mfn(v->arch.shadow_table[0]);
1832 /* This next line is important: the guest l2 has a 16k
1833 * shadow, so we need to return the right mfn of the four. This
1834 * call will set it for us as a side-effect. */
1835 (void) shadow_l2_index(sl2mfn, guest_l2_table_offset(gw->va));
1836 /* Reading the top level table is always valid. */
1837 return sh_linear_l2_table(v) + shadow_l2_linear_offset(gw->va);
1838 #endif
1842 static shadow_l1e_t * shadow_get_and_create_l1e(struct vcpu *v,
1843 walk_t *gw,
1844 mfn_t *sl1mfn,
1845 fetch_type_t ft)
1847 mfn_t sl2mfn;
1848 int resync = 0;
1849 shadow_l2e_t *sl2e;
1851 /* Get the l2e */
1852 sl2e = shadow_get_and_create_l2e(v, gw, &sl2mfn, ft, &resync);
1853 if ( sl2e == NULL ) return NULL;
1855 /* Install the sl1 in the l2e if it wasn't there or if we need to
1856 * re-do it to fix a PSE dirty bit. */
1857 if ( shadow_l2e_get_flags(*sl2e) & _PAGE_PRESENT
1858 && likely(ft != ft_demand_write
1859 || (shadow_l2e_get_flags(*sl2e) & _PAGE_RW)
1860 || !(guest_l2e_get_flags(gw->l2e) & _PAGE_PSE)) )
1862 *sl1mfn = shadow_l2e_get_mfn(*sl2e);
1863 ASSERT(mfn_valid(*sl1mfn));
1865 else
1867 shadow_l2e_t new_sl2e;
1868 int r, flags = guest_l2e_get_flags(gw->l2e);
1869 /* No l1 shadow installed: find and install it. */
1870 if ( !(flags & _PAGE_PRESENT) )
1871 return NULL; /* No guest page. */
1872 if ( guest_supports_superpages(v) && (flags & _PAGE_PSE) )
1874 /* Splintering a superpage */
1875 gfn_t l2gfn = guest_l2e_get_gfn(gw->l2e);
1876 *sl1mfn = get_fl1_shadow_status(v, l2gfn);
1877 if ( !mfn_valid(*sl1mfn) )
1879 /* No fl1 shadow of this superpage exists at all: make one. */
1880 *sl1mfn = make_fl1_shadow(v, l2gfn);
1883 else
1885 /* Shadowing an actual guest l1 table */
1886 if ( !mfn_valid(gw->l1mfn) ) return NULL; /* No guest page. */
1887 *sl1mfn = get_shadow_status(v, gw->l1mfn, SH_type_l1_shadow);
1888 if ( !mfn_valid(*sl1mfn) )
1890 /* No l1 shadow of this page exists at all: make one. */
1891 *sl1mfn = sh_make_shadow(v, gw->l1mfn, SH_type_l1_shadow);
1894 /* Install the new sl1 table in the sl2e */
1895 l2e_propagate_from_guest(v, gw->l2e, *sl1mfn, &new_sl2e, ft);
1896 r = shadow_set_l2e(v, sl2e, new_sl2e, sl2mfn);
1897 ASSERT((r & SHADOW_SET_FLUSH) == 0);
1898 if ( r & SHADOW_SET_ERROR )
1899 return NULL;
1901 /* This next line is important: in 32-on-PAE and 32-on-64 modes,
1902 * the guest l1 table has an 8k shadow, and we need to return
1903 * the right mfn of the pair. This call will set it for us as a
1904 * side-effect. (In all other cases, it's a no-op and will be
1905 * compiled out.) */
1906 (void) shadow_l1_index(sl1mfn, guest_l1_table_offset(gw->va));
1909 #if (SHADOW_OPTIMIZATIONS && SHOPT_OUT_OF_SYNC )
1910 /* All pages walked are now pagetables. Safe to resync pages
1911 in case level 4 or 3 shadows were set. */
1912 if ( resync )
1913 shadow_resync_all(v, 0);
1914 #endif
1916 /* Now follow it down a level. Guaranteed to succeed. */
1917 return sh_linear_l1_table(v) + shadow_l1_linear_offset(gw->va);
1922 /**************************************************************************/
1923 /* Destructors for shadow tables:
1924 * Unregister the shadow, decrement refcounts of any entries present in it,
1925 * and release the memory.
1927 * N.B. These destructors do not clear the contents of the shadows.
1928 * This allows us to delay TLB shootdowns until the page is being reused.
1929 * See shadow_alloc() and shadow_free() for how this is handled.
1930 */
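/* A small sketch of the back-pointer passed to sh_put_ref() in the loops
 * below: each reference is identified by the machine address of the
 * shadow entry that held it,
 *
 *     paddr_t entry_pa = ((paddr_t)mfn_x(slNmfn) << PAGE_SHIFT)
 *                        | ((unsigned long)slNe & ~PAGE_MASK);
 *
 * i.e. the page address of the shadow table plus the entry's offset
 * within that page.  ('slNmfn' and 'slNe' stand for the per-level
 * variables used in each destructor below.) */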
1932 #if GUEST_PAGING_LEVELS >= 4
1933 void sh_destroy_l4_shadow(struct vcpu *v, mfn_t smfn)
1935 shadow_l4e_t *sl4e;
1936 u32 t = mfn_to_page(smfn)->u.sh.type;
1937 mfn_t gmfn, sl4mfn;
1939 SHADOW_DEBUG(DESTROY_SHADOW,
1940 "%s(%05lx)\n", __func__, mfn_x(smfn));
1941 ASSERT(t == SH_type_l4_shadow);
1943 /* Record that the guest page isn't shadowed any more (in this type) */
1944 gmfn = _mfn(mfn_to_page(smfn)->v.sh.back);
1945 delete_shadow_status(v, gmfn, t, smfn);
1946 shadow_demote(v, gmfn, t);
1947 /* Decrement refcounts of all the old entries */
1948 sl4mfn = smfn;
1949 SHADOW_FOREACH_L4E(sl4mfn, sl4e, 0, 0, v->domain, {
1950 if ( shadow_l4e_get_flags(*sl4e) & _PAGE_PRESENT )
1952 sh_put_ref(v, shadow_l4e_get_mfn(*sl4e),
1953 (((paddr_t)mfn_x(sl4mfn)) << PAGE_SHIFT)
1954 | ((unsigned long)sl4e & ~PAGE_MASK));
1956 });
1958 /* Put the memory back in the pool */
1959 shadow_free(v->domain, smfn);
1962 void sh_destroy_l3_shadow(struct vcpu *v, mfn_t smfn)
1964 shadow_l3e_t *sl3e;
1965 u32 t = mfn_to_page(smfn)->u.sh.type;
1966 mfn_t gmfn, sl3mfn;
1968 SHADOW_DEBUG(DESTROY_SHADOW,
1969 "%s(%05lx)\n", __func__, mfn_x(smfn));
1970 ASSERT(t == SH_type_l3_shadow);
1972 /* Record that the guest page isn't shadowed any more (in this type) */
1973 gmfn = _mfn(mfn_to_page(smfn)->v.sh.back);
1974 delete_shadow_status(v, gmfn, t, smfn);
1975 shadow_demote(v, gmfn, t);
1977 /* Decrement refcounts of all the old entries */
1978 sl3mfn = smfn;
1979 SHADOW_FOREACH_L3E(sl3mfn, sl3e, 0, 0, {
1980 if ( shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT )
1981 sh_put_ref(v, shadow_l3e_get_mfn(*sl3e),
1982 (((paddr_t)mfn_x(sl3mfn)) << PAGE_SHIFT)
1983 | ((unsigned long)sl3e & ~PAGE_MASK));
1984 });
1986 /* Put the memory back in the pool */
1987 shadow_free(v->domain, smfn);
1989 #endif /* GUEST_PAGING_LEVELS >= 4 */
1992 void sh_destroy_l2_shadow(struct vcpu *v, mfn_t smfn)
1994 shadow_l2e_t *sl2e;
1995 u32 t = mfn_to_page(smfn)->u.sh.type;
1996 mfn_t gmfn, sl2mfn;
1998 SHADOW_DEBUG(DESTROY_SHADOW,
1999 "%s(%05lx)\n", __func__, mfn_x(smfn));
2001 #if GUEST_PAGING_LEVELS >= 3
2002 ASSERT(t == SH_type_l2_shadow || t == SH_type_l2h_shadow);
2003 #else
2004 ASSERT(t == SH_type_l2_shadow);
2005 #endif
2007 /* Record that the guest page isn't shadowed any more (in this type) */
2008 gmfn = _mfn(mfn_to_page(smfn)->v.sh.back);
2009 delete_shadow_status(v, gmfn, t, smfn);
2010 shadow_demote(v, gmfn, t);
2012 /* Decrement refcounts of all the old entries */
2013 sl2mfn = smfn;
2014 SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, 0, v->domain, {
2015 if ( shadow_l2e_get_flags(*sl2e) & _PAGE_PRESENT )
2016 sh_put_ref(v, shadow_l2e_get_mfn(*sl2e),
2017 (((paddr_t)mfn_x(sl2mfn)) << PAGE_SHIFT)
2018 | ((unsigned long)sl2e & ~PAGE_MASK));
2019 });
2021 /* Put the memory back in the pool */
2022 shadow_free(v->domain, smfn);
2025 void sh_destroy_l1_shadow(struct vcpu *v, mfn_t smfn)
2027 struct domain *d = v->domain;
2028 shadow_l1e_t *sl1e;
2029 u32 t = mfn_to_page(smfn)->u.sh.type;
2031 SHADOW_DEBUG(DESTROY_SHADOW,
2032 "%s(%05lx)\n", __func__, mfn_x(smfn));
2033 ASSERT(t == SH_type_l1_shadow || t == SH_type_fl1_shadow);
2035 /* Record that the guest page isn't shadowed any more (in this type) */
2036 if ( t == SH_type_fl1_shadow )
2038 gfn_t gfn = _gfn(mfn_to_page(smfn)->v.sh.back);
2039 delete_fl1_shadow_status(v, gfn, smfn);
2041 else
2043 mfn_t gmfn = _mfn(mfn_to_page(smfn)->v.sh.back);
2044 delete_shadow_status(v, gmfn, t, smfn);
2045 shadow_demote(v, gmfn, t);
2048 if ( shadow_mode_refcounts(d) )
2050 /* Decrement refcounts of all the old entries */
2051 mfn_t sl1mfn = smfn;
2052 SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, 0, {
2053 if ( (shadow_l1e_get_flags(*sl1e) & _PAGE_PRESENT)
2054 && !sh_l1e_is_magic(*sl1e) ) {
2055 shadow_vram_put_l1e(*sl1e, sl1e, sl1mfn, d);
2056 shadow_put_page_from_l1e(*sl1e, d);
2058 });
2061 /* Put the memory back in the pool */
2062 shadow_free(v->domain, smfn);
2065 #if SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS
2066 void sh_destroy_monitor_table(struct vcpu *v, mfn_t mmfn)
2068 struct domain *d = v->domain;
2069 ASSERT(mfn_to_page(mmfn)->u.sh.type == SH_type_monitor_table);
2071 #if (CONFIG_PAGING_LEVELS == 4) && (SHADOW_PAGING_LEVELS != 4)
2073 mfn_t m3mfn;
2074 l4_pgentry_t *l4e = sh_map_domain_page(mmfn);
2075 l3_pgentry_t *l3e;
2076 int linear_slot = shadow_l4_table_offset(SH_LINEAR_PT_VIRT_START);
2078 /* Need to destroy the l3 and l2 monitor pages used
2079 * for the linear map */
2080 ASSERT(l4e_get_flags(l4e[linear_slot]) & _PAGE_PRESENT);
2081 m3mfn = _mfn(l4e_get_pfn(l4e[linear_slot]));
2082 l3e = sh_map_domain_page(m3mfn);
2083 ASSERT(l3e_get_flags(l3e[0]) & _PAGE_PRESENT);
2084 shadow_free(d, _mfn(l3e_get_pfn(l3e[0])));
2085 sh_unmap_domain_page(l3e);
2086 shadow_free(d, m3mfn);
2088 if ( is_pv_32on64_vcpu(v) )
2090 /* Need to destroy the l3 and l2 monitor pages that map the
2091 * Xen VAs at 3GB-4GB */
2092 ASSERT(l4e_get_flags(l4e[0]) & _PAGE_PRESENT);
2093 m3mfn = _mfn(l4e_get_pfn(l4e[0]));
2094 l3e = sh_map_domain_page(m3mfn);
2095 ASSERT(l3e_get_flags(l3e[3]) & _PAGE_PRESENT);
2096 shadow_free(d, _mfn(l3e_get_pfn(l3e[3])));
2097 sh_unmap_domain_page(l3e);
2098 shadow_free(d, m3mfn);
2100 sh_unmap_domain_page(l4e);
2102 #elif CONFIG_PAGING_LEVELS == 3
2103 /* Need to destroy the l2 monitor page in slot 4 too */
2105 l3_pgentry_t *l3e = sh_map_domain_page(mmfn);
2106 ASSERT(l3e_get_flags(l3e[3]) & _PAGE_PRESENT);
2107 shadow_free(d, _mfn(l3e_get_pfn(l3e[3])));
2108 sh_unmap_domain_page(l3e);
2110 #endif
2112 /* Put the memory back in the pool */
2113 shadow_free(d, mmfn);
2115 #endif
2117 /**************************************************************************/
2118 /* Functions to destroy non-Xen mappings in a pagetable hierarchy.
2119 * These are called from common code when we are running out of shadow
2120 * memory, and unpinning all the top-level shadows hasn't worked.
2122 * This implementation is pretty crude and slow, but we hope that it won't
2123 * be called very often. */
2125 #if GUEST_PAGING_LEVELS == 2
2127 void sh_unhook_32b_mappings(struct vcpu *v, mfn_t sl2mfn)
2129 shadow_l2e_t *sl2e;
2130 SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, 0, v->domain, {
2131 (void) shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn);
2132 });
2135 #elif GUEST_PAGING_LEVELS == 3
2137 void sh_unhook_pae_mappings(struct vcpu *v, mfn_t sl2mfn)
2138 /* Walk a PAE l2 shadow, unhooking entries from all the subshadows */
2140 shadow_l2e_t *sl2e;
2141 SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, 0, v->domain, {
2142 (void) shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn);
2143 });
2146 #elif GUEST_PAGING_LEVELS == 4
2148 void sh_unhook_64b_mappings(struct vcpu *v, mfn_t sl4mfn)
2150 shadow_l4e_t *sl4e;
2151 SHADOW_FOREACH_L4E(sl4mfn, sl4e, 0, 0, v->domain, {
2152 (void) shadow_set_l4e(v, sl4e, shadow_l4e_empty(), sl4mfn);
2153 });
2156 #endif
2158 /**************************************************************************/
2159 /* Internal translation functions.
2160 * These functions require a pointer to the shadow entry that will be updated.
2161 */
2163 /* These functions take a new guest entry, translate it to shadow and write
2164 * the shadow entry.
2166 * They return the same bitmaps as the shadow_set_lXe() functions.
2167 */
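/* A minimal sketch of how a caller checks the returned bitmap (the flags
 * are the SHADOW_SET_* values used throughout this file):
 *
 *     int rc = validate_gl2e(v, new_ge, sl2mfn, se);
 *     if ( rc & SHADOW_SET_ERROR )
 *         (propagation failed; the caller must bail out)
 *     if ( rc & SHADOW_SET_FLUSH )
 *         (a TLB flush is required before the change is safe)
 */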
2169 #if GUEST_PAGING_LEVELS >= 4
2170 static int validate_gl4e(struct vcpu *v, void *new_ge, mfn_t sl4mfn, void *se)
2172 shadow_l4e_t new_sl4e;
2173 guest_l4e_t new_gl4e = *(guest_l4e_t *)new_ge;
2174 shadow_l4e_t *sl4p = se;
2175 mfn_t sl3mfn = _mfn(INVALID_MFN);
2176 struct domain *d = v->domain;
2177 p2m_type_t p2mt;
2178 int result = 0;
2180 perfc_incr(shadow_validate_gl4e_calls);
2182 if ( guest_l4e_get_flags(new_gl4e) & _PAGE_PRESENT )
2184 gfn_t gl3gfn = guest_l4e_get_gfn(new_gl4e);
2185 mfn_t gl3mfn = gfn_to_mfn_query(d, gl3gfn, &p2mt);
2186 if ( p2m_is_ram(p2mt) )
2187 sl3mfn = get_shadow_status(v, gl3mfn, SH_type_l3_shadow);
2188 else if ( p2mt != p2m_populate_on_demand )
2189 result |= SHADOW_SET_ERROR;
2191 #if (SHADOW_OPTIMIZATIONS && SHOPT_OUT_OF_SYNC )
2192 if ( mfn_valid(sl3mfn) )
2193 shadow_resync_all(v, 0);
2194 #endif
2196 l4e_propagate_from_guest(v, new_gl4e, sl3mfn, &new_sl4e, ft_prefetch);
2198 // check for updates to xen reserved slots
2199 if ( !shadow_mode_external(d) )
2201 int shadow_index = (((unsigned long)sl4p & ~PAGE_MASK) /
2202 sizeof(shadow_l4e_t));
2203 int reserved_xen_slot = !is_guest_l4_slot(d, shadow_index);
2205 if ( unlikely(reserved_xen_slot) )
2207 // attempt by the guest to write to a xen reserved slot
2208 //
2209 SHADOW_PRINTK("%s out-of-range update "
2210 "sl4mfn=%05lx index=0x%x val=%" SH_PRI_pte "\n",
2211 __func__, mfn_x(sl4mfn), shadow_index, new_sl4e.l4);
2212 if ( shadow_l4e_get_flags(new_sl4e) & _PAGE_PRESENT )
2214 SHADOW_ERROR("out-of-range l4e update\n");
2215 result |= SHADOW_SET_ERROR;
2218 // do not call shadow_set_l4e...
2219 return result;
2223 result |= shadow_set_l4e(v, sl4p, new_sl4e, sl4mfn);
2224 return result;
2228 static int validate_gl3e(struct vcpu *v, void *new_ge, mfn_t sl3mfn, void *se)
2230 shadow_l3e_t new_sl3e;
2231 guest_l3e_t new_gl3e = *(guest_l3e_t *)new_ge;
2232 shadow_l3e_t *sl3p = se;
2233 mfn_t sl2mfn = _mfn(INVALID_MFN);
2234 p2m_type_t p2mt;
2235 int result = 0;
2237 perfc_incr(shadow_validate_gl3e_calls);
2239 if ( guest_l3e_get_flags(new_gl3e) & _PAGE_PRESENT )
2241 gfn_t gl2gfn = guest_l3e_get_gfn(new_gl3e);
2242 mfn_t gl2mfn = gfn_to_mfn_query(v->domain, gl2gfn, &p2mt);
2243 if ( p2m_is_ram(p2mt) )
2244 sl2mfn = get_shadow_status(v, gl2mfn, SH_type_l2_shadow);
2245 else if ( p2mt != p2m_populate_on_demand )
2246 result |= SHADOW_SET_ERROR;
2248 #if (SHADOW_OPTIMIZATIONS && SHOPT_OUT_OF_SYNC )
2249 if ( mfn_valid(sl2mfn) )
2250 shadow_resync_all(v, 0);
2251 #endif
2253 l3e_propagate_from_guest(v, new_gl3e, sl2mfn, &new_sl3e, ft_prefetch);
2254 result |= shadow_set_l3e(v, sl3p, new_sl3e, sl3mfn);
2256 return result;
2258 #endif // GUEST_PAGING_LEVELS >= 4
2260 static int validate_gl2e(struct vcpu *v, void *new_ge, mfn_t sl2mfn, void *se)
2262 shadow_l2e_t new_sl2e;
2263 guest_l2e_t new_gl2e = *(guest_l2e_t *)new_ge;
2264 shadow_l2e_t *sl2p = se;
2265 mfn_t sl1mfn = _mfn(INVALID_MFN);
2266 p2m_type_t p2mt;
2267 int result = 0;
2269 perfc_incr(shadow_validate_gl2e_calls);
2271 if ( guest_l2e_get_flags(new_gl2e) & _PAGE_PRESENT )
2273 gfn_t gl1gfn = guest_l2e_get_gfn(new_gl2e);
2274 if ( guest_supports_superpages(v) &&
2275 (guest_l2e_get_flags(new_gl2e) & _PAGE_PSE) )
2277 // superpage -- need to look up the shadow L1 which holds the
2278 // splitters...
2279 sl1mfn = get_fl1_shadow_status(v, gl1gfn);
2280 #if 0
2281 // XXX - it's possible that we want to do some kind of prefetch
2282 // for superpage fl1's here, but this is *not* on the demand path,
2283 // so we'll hold off trying that for now...
2284 //
2285 if ( !mfn_valid(sl1mfn) )
2286 sl1mfn = make_fl1_shadow(v, gl1gfn);
2287 #endif
2289 else
2291 mfn_t gl1mfn = gfn_to_mfn_query(v->domain, gl1gfn, &p2mt);
2292 if ( p2m_is_ram(p2mt) )
2293 sl1mfn = get_shadow_status(v, gl1mfn, SH_type_l1_shadow);
2294 else if ( p2mt != p2m_populate_on_demand )
2295 result |= SHADOW_SET_ERROR;
2298 l2e_propagate_from_guest(v, new_gl2e, sl1mfn, &new_sl2e, ft_prefetch);
2300 // check for updates to xen reserved slots in PV guests...
2301 // XXX -- need to revisit this for PV 3-on-4 guests.
2302 //
2303 #if SHADOW_PAGING_LEVELS < 4
2304 #if CONFIG_PAGING_LEVELS == SHADOW_PAGING_LEVELS
2305 if ( !shadow_mode_external(v->domain) )
2307 int shadow_index = (((unsigned long)sl2p & ~PAGE_MASK) /
2308 sizeof(shadow_l2e_t));
2309 int reserved_xen_slot;
2311 #if SHADOW_PAGING_LEVELS == 3
2312 reserved_xen_slot =
2313 ((mfn_to_page(sl2mfn)->u.sh.type == SH_type_l2h_pae_shadow) &&
2314 (shadow_index
2315 >= (L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1))));
2316 #else /* SHADOW_PAGING_LEVELS == 2 */
2317 reserved_xen_slot = (shadow_index >= L2_PAGETABLE_FIRST_XEN_SLOT);
2318 #endif
2320 if ( unlikely(reserved_xen_slot) )
2322 // attempt by the guest to write to a xen reserved slot
2323 //
2324 SHADOW_PRINTK("%s out-of-range update "
2325 "sl2mfn=%05lx index=0x%x val=%" SH_PRI_pte "\n",
2326 __func__, mfn_x(sl2mfn), shadow_index, new_sl2e.l2);
2327 if ( shadow_l2e_get_flags(new_sl2e) & _PAGE_PRESENT )
2329 SHADOW_ERROR("out-of-range l2e update\n");
2330 result |= SHADOW_SET_ERROR;
2333 // do not call shadow_set_l2e...
2334 return result;
2337 #endif /* CONFIG_PAGING_LEVELS == SHADOW_PAGING_LEVELS */
2338 #endif /* SHADOW_PAGING_LEVELS < 4 */
2340 result |= shadow_set_l2e(v, sl2p, new_sl2e, sl2mfn);
2342 return result;
2345 static int validate_gl1e(struct vcpu *v, void *new_ge, mfn_t sl1mfn, void *se)
2347 shadow_l1e_t new_sl1e;
2348 guest_l1e_t new_gl1e = *(guest_l1e_t *)new_ge;
2349 shadow_l1e_t *sl1p = se;
2350 gfn_t gfn;
2351 mfn_t gmfn;
2352 p2m_type_t p2mt;
2353 int result = 0;
2354 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
2355 mfn_t gl1mfn;
2356 #endif /* OOS */
2358 perfc_incr(shadow_validate_gl1e_calls);
2360 gfn = guest_l1e_get_gfn(new_gl1e);
2361 gmfn = gfn_to_mfn_query(v->domain, gfn, &p2mt);
2363 l1e_propagate_from_guest(v, new_gl1e, gmfn, &new_sl1e, ft_prefetch, p2mt);
2364 result |= shadow_set_l1e(v, sl1p, new_sl1e, sl1mfn);
2366 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
2367 gl1mfn = _mfn(mfn_to_page(sl1mfn)->v.sh.back);
2368 if ( mfn_valid(gl1mfn)
2369 && mfn_is_out_of_sync(gl1mfn) )
2371 /* Update the OOS snapshot. */
2372 mfn_t snpmfn = oos_snapshot_lookup(v, gl1mfn);
2373 guest_l1e_t *snp;
2375 ASSERT(mfn_valid(snpmfn));
2377 snp = sh_map_domain_page(snpmfn);
2378 snp[guest_index(new_ge)] = new_gl1e;
2379 sh_unmap_domain_page(snp);
2381 #endif /* OOS */
2383 return result;
2386 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
2387 /**************************************************************************/
2388 /* Special validation function for re-syncing out-of-sync shadows.
2389 * Walks the *shadow* page, and for every entry that it finds,
2390 * revalidates the guest entry that corresponds to it.
2391 * N.B. This function is called with the vcpu that unsynced the page,
2392 * *not* the one that is causing it to be resynced. */
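/* In outline, the loop below does (sketch):
 *
 *     if ( memcmp(snpl1p, &gl1e, sizeof(gl1e)) )
 *     {
 *         (re-propagate gl1e into the shadow l1)
 *         *snpl1p = gl1e;
 *     }
 *
 * i.e. only guest entries that have changed since the out-of-sync
 * snapshot was taken are propagated, and the snapshot is brought back
 * up to date as we go. */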
2393 void sh_resync_l1(struct vcpu *v, mfn_t gl1mfn, mfn_t snpmfn)
2395 mfn_t sl1mfn;
2396 shadow_l1e_t *sl1p;
2397 guest_l1e_t *gl1p, *gp, *snp;
2398 int rc = 0;
2400 ASSERT(mfn_valid(snpmfn));
2402 sl1mfn = get_shadow_status(v, gl1mfn, SH_type_l1_shadow);
2403 ASSERT(mfn_valid(sl1mfn)); /* Otherwise we would not have been called */
2405 snp = sh_map_domain_page(snpmfn);
2406 gp = sh_map_domain_page(gl1mfn);
2407 gl1p = gp;
2409 SHADOW_FOREACH_L1E(sl1mfn, sl1p, &gl1p, 0, {
2410 guest_l1e_t gl1e = *gl1p;
2411 guest_l1e_t *snpl1p = (guest_l1e_t *)snp + guest_index(gl1p);
2413 if ( memcmp(snpl1p, &gl1e, sizeof(gl1e)) )
2415 gfn_t gfn;
2416 mfn_t gmfn;
2417 p2m_type_t p2mt;
2418 shadow_l1e_t nsl1e;
2420 gfn = guest_l1e_get_gfn(gl1e);
2421 gmfn = gfn_to_mfn_query(v->domain, gfn, &p2mt);
2422 l1e_propagate_from_guest(v, gl1e, gmfn, &nsl1e, ft_prefetch, p2mt);
2423 rc |= shadow_set_l1e(v, sl1p, nsl1e, sl1mfn);
2425 *snpl1p = gl1e;
2427 });
2429 sh_unmap_domain_page(gp);
2430 sh_unmap_domain_page(snp);
2432 /* Setting shadow L1 entries should never need us to flush the TLB */
2433 ASSERT(!(rc & SHADOW_SET_FLUSH));
2436 /* Figure out whether it's definitely safe not to sync this l1 table.
2437 * That is: if we can tell that it's only used once, and that the
2438 * toplevel shadow responsible is not one of ours.
2439 * N.B. This function is called with the vcpu that required the resync,
2440 * *not* the one that originally unsynced the page, but it is
2441 * called in the *mode* of the vcpu that unsynced it. Clear? Good. */
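/* A sketch of the walk below: a shadow page's 'up' field encodes the
 * single shadow entry currently referencing it, so
 *
 *     parent_smfn = _mfn(sp->up >> PAGE_SHIFT);
 *
 * is the mfn of the parent shadow.  We follow that chain from the l1 up
 * to the toplevel (l2, or l3 and l4 with 4-level shadows), giving up if
 * any page has more than one reference, and finally compare the result
 * against v->arch.shadow_table[] to decide whether the toplevel we are
 * running on could reach this l1. */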
2442 int sh_safe_not_to_sync(struct vcpu *v, mfn_t gl1mfn)
2444 struct page_info *sp;
2445 mfn_t smfn;
2447 smfn = get_shadow_status(v, gl1mfn, SH_type_l1_shadow);
2448 ASSERT(mfn_valid(smfn)); /* Otherwise we would not have been called */
2450 /* Up to l2 */
2451 sp = mfn_to_page(smfn);
2452 if ( sp->u.sh.count != 1 || !sp->up )
2453 return 0;
2454 smfn = _mfn(sp->up >> PAGE_SHIFT);
2455 ASSERT(mfn_valid(smfn));
2457 #if (SHADOW_PAGING_LEVELS == 4)
2458 /* up to l3 */
2459 sp = mfn_to_page(smfn);
2460 if ( sp->u.sh.count != 1 || !sp->up )
2461 return 0;
2462 smfn = _mfn(sp->up >> PAGE_SHIFT);
2463 ASSERT(mfn_valid(smfn));
2465 /* up to l4 */
2466 sp = mfn_to_page(smfn);
2467 if ( sp->u.sh.count != 1
2468 || sh_type_is_pinnable(v, SH_type_l3_64_shadow) || !sp->up )
2469 return 0;
2470 smfn = _mfn(sp->up >> PAGE_SHIFT);
2471 ASSERT(mfn_valid(smfn));
2473 #if (GUEST_PAGING_LEVELS == 2)
2474 /* In 2-on-3 shadow mode the up pointer contains the link to the
2475 * shadow page, but the shadow_table contains only the first of the
2476 * four pages that make up the PAE top shadow tables. */
2477 smfn = _mfn(mfn_x(smfn) & ~0x3UL);
2478 #endif
2480 #endif
2482 if ( pagetable_get_pfn(v->arch.shadow_table[0]) == mfn_x(smfn)
2483 #if (SHADOW_PAGING_LEVELS == 3)
2484 || pagetable_get_pfn(v->arch.shadow_table[1]) == mfn_x(smfn)
2485 || pagetable_get_pfn(v->arch.shadow_table[2]) == mfn_x(smfn)
2486 || pagetable_get_pfn(v->arch.shadow_table[3]) == mfn_x(smfn)
2487 #endif
2489 return 0;
2491 /* Only in use in one toplevel shadow, and it's not the one we're
2492 * running on */
2493 return 1;
2495 #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) */
2498 /**************************************************************************/
2499 /* Functions which translate and install the shadows of arbitrary guest
2500 * entries that we have just seen the guest write. */
2503 static inline int
2504 sh_map_and_validate(struct vcpu *v, mfn_t gmfn,
2505 void *new_gp, u32 size, u32 sh_type,
2506 u32 (*shadow_index)(mfn_t *smfn, u32 idx),
2507 int (*validate_ge)(struct vcpu *v, void *ge,
2508 mfn_t smfn, void *se))
2509 /* Generic function for mapping and validating. */
2511 mfn_t smfn, smfn2, map_mfn;
2512 shadow_l1e_t *sl1p;
2513 u32 shadow_idx, guest_idx;
2514 int result = 0;
2516 /* Align address and size to guest entry boundaries */
2517 size += (unsigned long)new_gp & (sizeof (guest_l1e_t) - 1);
2518 new_gp = (void *)((unsigned long)new_gp & ~(sizeof (guest_l1e_t) - 1));
2519 size = (size + sizeof (guest_l1e_t) - 1) & ~(sizeof (guest_l1e_t) - 1);
2520 ASSERT(size + (((unsigned long)new_gp) & ~PAGE_MASK) <= PAGE_SIZE);
2522 /* Map the shadow page */
2523 smfn = get_shadow_status(v, gmfn, sh_type);
2524 ASSERT(mfn_valid(smfn)); /* Otherwise we would not have been called */
2525 guest_idx = guest_index(new_gp);
2526 map_mfn = smfn;
2527 shadow_idx = shadow_index(&map_mfn, guest_idx);
2528 sl1p = sh_map_domain_page(map_mfn);
2530 /* Validate one entry at a time */
2531 while ( size )
2533 smfn2 = smfn;
2534 guest_idx = guest_index(new_gp);
2535 shadow_idx = shadow_index(&smfn2, guest_idx);
2536 if ( mfn_x(smfn2) != mfn_x(map_mfn) )
2538 /* We have moved to another page of the shadow */
2539 map_mfn = smfn2;
2540 sh_unmap_domain_page(sl1p);
2541 sl1p = sh_map_domain_page(map_mfn);
2543 result |= validate_ge(v,
2544 new_gp,
2545 map_mfn,
2546 &sl1p[shadow_idx]);
2547 size -= sizeof(guest_l1e_t);
2548 new_gp += sizeof(guest_l1e_t);
2550 sh_unmap_domain_page(sl1p);
2551 return result;
2555 int
2556 sh_map_and_validate_gl4e(struct vcpu *v, mfn_t gl4mfn,
2557 void *new_gl4p, u32 size)
2559 #if GUEST_PAGING_LEVELS >= 4
2560 return sh_map_and_validate(v, gl4mfn, new_gl4p, size,
2561 SH_type_l4_shadow,
2562 shadow_l4_index,
2563 validate_gl4e);
2564 #else // ! GUEST_PAGING_LEVELS >= 4
2565 SHADOW_ERROR("called in wrong paging mode!\n");
2566 BUG();
2567 return 0;
2568 #endif
2571 int
2572 sh_map_and_validate_gl3e(struct vcpu *v, mfn_t gl3mfn,
2573 void *new_gl3p, u32 size)
2575 #if GUEST_PAGING_LEVELS >= 4
2576 return sh_map_and_validate(v, gl3mfn, new_gl3p, size,
2577 SH_type_l3_shadow,
2578 shadow_l3_index,
2579 validate_gl3e);
2580 #else // ! GUEST_PAGING_LEVELS >= 4
2581 SHADOW_ERROR("called in wrong paging mode!\n");
2582 BUG();
2583 return 0;
2584 #endif
2587 int
2588 sh_map_and_validate_gl2e(struct vcpu *v, mfn_t gl2mfn,
2589 void *new_gl2p, u32 size)
2591 return sh_map_and_validate(v, gl2mfn, new_gl2p, size,
2592 SH_type_l2_shadow,
2593 shadow_l2_index,
2594 validate_gl2e);
2597 int
2598 sh_map_and_validate_gl2he(struct vcpu *v, mfn_t gl2mfn,
2599 void *new_gl2p, u32 size)
2601 #if GUEST_PAGING_LEVELS >= 3
2602 return sh_map_and_validate(v, gl2mfn, new_gl2p, size,
2603 SH_type_l2h_shadow,
2604 shadow_l2_index,
2605 validate_gl2e);
2606 #else /* Non-PAE guests don't have different kinds of l2 table */
2607 SHADOW_ERROR("called in wrong paging mode!\n");
2608 BUG();
2609 return 0;
2610 #endif
2613 int
2614 sh_map_and_validate_gl1e(struct vcpu *v, mfn_t gl1mfn,
2615 void *new_gl1p, u32 size)
2617 return sh_map_and_validate(v, gl1mfn, new_gl1p, size,
2618 SH_type_l1_shadow,
2619 shadow_l1_index,
2620 validate_gl1e);
2624 /**************************************************************************/
2625 /* Optimization: If we see two emulated writes of zeros to the same
2626 * page-table without another kind of page fault in between, we guess
2627 * that this is a batch of changes (for process destruction) and
2628 * unshadow the page so we don't take a pagefault on every entry. This
2629 * should also make finding writeable mappings of pagetables much
2630 * easier. */
2632 /* Look to see if this is the second emulated write in a row to this
2633 * page, and unshadow if it is */
2634 static inline void check_for_early_unshadow(struct vcpu *v, mfn_t gmfn)
2636 #if SHADOW_OPTIMIZATIONS & SHOPT_EARLY_UNSHADOW
2637 if ( v->arch.paging.shadow.last_emulated_mfn_for_unshadow == mfn_x(gmfn)
2638 && sh_mfn_is_a_page_table(gmfn) )
2640 perfc_incr(shadow_early_unshadow);
2641 sh_remove_shadows(v, gmfn, 1, 0 /* Fast, can fail to unshadow */ );
2642 TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_EARLY_UNSHADOW);
2644 v->arch.paging.shadow.last_emulated_mfn_for_unshadow = mfn_x(gmfn);
2645 #endif
2648 /* Stop counting towards early unshadows, as we've seen a real page fault */
2649 static inline void reset_early_unshadow(struct vcpu *v)
2651 #if SHADOW_OPTIMIZATIONS & SHOPT_EARLY_UNSHADOW
2652 v->arch.paging.shadow.last_emulated_mfn_for_unshadow = INVALID_MFN;
2653 #endif
2658 /**************************************************************************/
2659 /* Optimization: Prefetch multiple L1 entries. This is called after we have
2660 * demand-faulted a shadow l1e in the fault handler, to see if it's
2661 * worth fetching some more.
2662 */
2664 #if SHADOW_OPTIMIZATIONS & SHOPT_PREFETCH
2666 /* XXX magic number */
2667 #define PREFETCH_DISTANCE 32
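/* Worked example of the clamp below (illustrative numbers, assuming 4k
 * pages and 8-byte shadow l1 entries): a fault on the entry at page
 * offset 0xf80 leaves (0x1000 - 0xf80) / 8 = 16 entries before the end
 * of the shadow l1 page, so only 16 are considered even though
 * PREFETCH_DISTANCE is 32. */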
2669 static void sh_prefetch(struct vcpu *v, walk_t *gw,
2670 shadow_l1e_t *ptr_sl1e, mfn_t sl1mfn)
2672 int i, dist;
2673 gfn_t gfn;
2674 mfn_t gmfn;
2675 guest_l1e_t *gl1p = NULL, gl1e;
2676 shadow_l1e_t sl1e;
2677 u32 gflags;
2678 p2m_type_t p2mt;
2679 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
2680 guest_l1e_t *snpl1p = NULL;
2681 #endif /* OOS */
2684 /* Prefetch no further than the end of the _shadow_ l1 MFN */
2685 dist = (PAGE_SIZE - ((unsigned long)ptr_sl1e & ~PAGE_MASK)) / sizeof sl1e;
2686 /* And no more than a maximum fetches-per-fault */
2687 if ( dist > PREFETCH_DISTANCE )
2688 dist = PREFETCH_DISTANCE;
2690 if ( mfn_valid(gw->l1mfn) )
2692 /* Normal guest page; grab the next guest entry */
2693 gl1p = sh_map_domain_page(gw->l1mfn);
2694 gl1p += guest_l1_table_offset(gw->va);
2696 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
2697 if ( mfn_is_out_of_sync(gw->l1mfn) )
2699 mfn_t snpmfn = oos_snapshot_lookup(v, gw->l1mfn);
2701 ASSERT(mfn_valid(snpmfn));
2702 snpl1p = sh_map_domain_page(snpmfn);
2703 snpl1p += guest_l1_table_offset(gw->va);
2705 #endif /* OOS */
2708 for ( i = 1; i < dist ; i++ )
2710 /* No point in prefetching if there's already a shadow */
2711 if ( ptr_sl1e[i].l1 != 0 )
2712 break;
2714 if ( mfn_valid(gw->l1mfn) )
2716 /* Normal guest page; grab the next guest entry */
2717 gl1e = gl1p[i];
2718 /* Not worth continuing if we hit an entry that will need another
2719 * fault for A/D-bit propagation anyway */
2720 gflags = guest_l1e_get_flags(gl1e);
2721 if ( (gflags & _PAGE_PRESENT)
2722 && (!(gflags & _PAGE_ACCESSED)
2723 || ((gflags & _PAGE_RW) && !(gflags & _PAGE_DIRTY))) )
2724 break;
2726 else
2728 /* Fragmented superpage, unless we've been called wrongly */
2729 ASSERT(guest_l2e_get_flags(gw->l2e) & _PAGE_PSE);
2730 /* Increment the l1e's GFN by the right number of guest pages */
2731 gl1e = guest_l1e_from_gfn(
2732 _gfn(gfn_x(guest_l1e_get_gfn(gw->l1e)) + i),
2733 guest_l1e_get_flags(gw->l1e));
2736 /* Look at the gfn that the l1e is pointing at */
2737 gfn = guest_l1e_get_gfn(gl1e);
2738 gmfn = gfn_to_mfn_query(v->domain, gfn, &p2mt);
2740 /* Propagate the entry. */
2741 l1e_propagate_from_guest(v, gl1e, gmfn, &sl1e, ft_prefetch, p2mt);
2742 (void) shadow_set_l1e(v, ptr_sl1e + i, sl1e, sl1mfn);
2744 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
2745 if ( snpl1p != NULL )
2746 snpl1p[i] = gl1e;
2747 #endif /* OOS */
2749 if ( gl1p != NULL )
2750 sh_unmap_domain_page(gl1p);
2751 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
2752 if ( snpl1p != NULL )
2753 sh_unmap_domain_page(snpl1p);
2754 #endif /* OOS */
2757 #endif /* SHADOW_OPTIMIZATIONS & SHOPT_PREFETCH */
2759 #if GUEST_PAGING_LEVELS == 4
2760 typedef u64 guest_va_t;
2761 typedef u64 guest_pa_t;
2762 #elif GUEST_PAGING_LEVELS == 3
2763 typedef u32 guest_va_t;
2764 typedef u64 guest_pa_t;
2765 #else
2766 typedef u32 guest_va_t;
2767 typedef u32 guest_pa_t;
2768 #endif
2770 static inline void trace_shadow_gen(u32 event, guest_va_t va)
2772 if ( tb_init_done )
2774 event |= (GUEST_PAGING_LEVELS-2)<<8;
2775 __trace_var(event, 0/*!tsc*/, sizeof(va), (unsigned char*)&va);
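/* A note on the encoding above (sketch): the guest paging level is
 * folded into bits 8+ of the trace event number, so a trace consumer
 * can recover the mode from the event alone:
 *
 *     2-level guest:  event | (0 << 8)
 *     3-level (PAE):  event | (1 << 8)
 *     4-level:        event | (2 << 8)
 *
 * The same (GUEST_PAGING_LEVELS-2)<<8 tag is applied by the other
 * trace_shadow_* helpers below. */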
2779 static inline void trace_shadow_fixup(guest_l1e_t gl1e,
2780 guest_va_t va)
2782 if ( tb_init_done )
2784 struct {
2785 /* for PAE, guest_l1e may be 64 while guest_va may be 32;
2786 so put it first for alignment sake. */
2787 guest_l1e_t gl1e;
2788 guest_va_t va;
2789 u32 flags;
2790 } __attribute__((packed)) d;
2791 u32 event;
2793 event = TRC_SHADOW_FIXUP | ((GUEST_PAGING_LEVELS-2)<<8);
2795 d.gl1e = gl1e;
2796 d.va = va;
2797 d.flags = this_cpu(trace_shadow_path_flags);
2799 __trace_var(event, 0/*!tsc*/, sizeof(d), (unsigned char*)&d);
2803 static inline void trace_not_shadow_fault(guest_l1e_t gl1e,
2804 guest_va_t va)
2806 if ( tb_init_done )
2808 struct {
2809 /* for PAE, guest_l1e may be 64 while guest_va may be 32;
2810 so put it first for alignment sake. */
2811 guest_l1e_t gl1e;
2812 guest_va_t va;
2813 u32 flags;
2814 } __attribute__((packed)) d;
2815 u32 event;
2817 event = TRC_SHADOW_NOT_SHADOW | ((GUEST_PAGING_LEVELS-2)<<8);
2819 d.gl1e = gl1e;
2820 d.va = va;
2821 d.flags = this_cpu(trace_shadow_path_flags);
2823 __trace_var(event, 0/*!tsc*/, sizeof(d), (unsigned char*)&d);
2827 static inline void trace_shadow_emulate_other(u32 event,
2828 guest_va_t va,
2829 gfn_t gfn)
2831 if ( tb_init_done )
2833 struct {
2834 /* for PAE, guest_l1e may be 64 while guest_va may be 32;
2835 so put it first for alignment sake. */
2836 #if GUEST_PAGING_LEVELS == 2
2837 u32 gfn;
2838 #else
2839 u64 gfn;
2840 #endif
2841 guest_va_t va;
2842 } __attribute__((packed)) d;
2844 event |= ((GUEST_PAGING_LEVELS-2)<<8);
2846 d.gfn=gfn_x(gfn);
2847 d.va = va;
2849 __trace_var(event, 0/*!tsc*/, sizeof(d), (unsigned char*)&d);
2853 #if GUEST_PAGING_LEVELS == 3
2854 static DEFINE_PER_CPU(guest_va_t,trace_emulate_initial_va);
2855 static DEFINE_PER_CPU(int,trace_extra_emulation_count);
2856 #endif
2857 static DEFINE_PER_CPU(guest_pa_t,trace_emulate_write_val);
2859 static inline void trace_shadow_emulate(guest_l1e_t gl1e, unsigned long va)
2861 if ( tb_init_done )
2863 struct {
2864 /* for PAE, guest_l1e may be 64 while guest_va may be 32;
2865 so put it first for alignment sake. */
2866 guest_l1e_t gl1e, write_val;
2867 guest_va_t va;
2868 unsigned flags:29, emulation_count:3;
2869 } __attribute__((packed)) d;
2870 u32 event;
2872 event = TRC_SHADOW_EMULATE | ((GUEST_PAGING_LEVELS-2)<<8);
2874 d.gl1e = gl1e;
2875 d.write_val.l1 = this_cpu(trace_emulate_write_val);
2876 d.va = va;
2877 #if GUEST_PAGING_LEVELS == 3
2878 d.emulation_count = this_cpu(trace_extra_emulation_count);
2879 #endif
2880 d.flags = this_cpu(trace_shadow_path_flags);
2882 __trace_var(event, 0/*!tsc*/, sizeof(d), (unsigned char*)&d);
2886 /**************************************************************************/
2887 /* Entry points into the shadow code */
2889 /* Called from pagefault handler in Xen, and from the HVM trap handlers
2890 * for pagefaults. Returns 1 if this fault was an artefact of the
2891 * shadow code (and the guest should retry) or 0 if it is not (and the
2892 * fault should be handled elsewhere or passed to the guest). */
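/* From the caller's point of view, the contract is roughly (sketch; the
 * real dispatch goes through the paging-mode hooks):
 *
 *     if ( sh_page_fault(v, va, regs) )
 *         (fixed by the shadow code: just retry the faulting instruction)
 *     else
 *         (not ours: hand the fault to the guest or to Xen's own handlers)
 */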
2894 static int sh_page_fault(struct vcpu *v,
2895 unsigned long va,
2896 struct cpu_user_regs *regs)
2898 struct domain *d = v->domain;
2899 walk_t gw;
2900 gfn_t gfn = _gfn(0);
2901 mfn_t gmfn, sl1mfn = _mfn(0);
2902 shadow_l1e_t sl1e, *ptr_sl1e;
2903 paddr_t gpa;
2904 struct sh_emulate_ctxt emul_ctxt;
2905 struct x86_emulate_ops *emul_ops;
2906 int r;
2907 fetch_type_t ft = 0;
2908 p2m_type_t p2mt;
2909 uint32_t rc;
2910 int version;
2911 #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
2912 int fast_emul = 0;
2913 #endif
2915 SHADOW_PRINTK("d:v=%u:%u va=%#lx err=%u, rip=%lx\n",
2916 v->domain->domain_id, v->vcpu_id, va, regs->error_code,
2917 regs->eip);
2919 perfc_incr(shadow_fault);
2921 #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
2922 /* If the faulting frame was successfully emulated on the last shadow
2923 * fault, it's highly likely to need the same emulation action for this
2924 * frame, so try to emulate early to avoid lock acquisition.
2925 */
2926 if ( v->arch.paging.last_write_emul_ok
2927 && v->arch.paging.shadow.last_emulated_frame == (va >> PAGE_SHIFT) )
2929 /* check whether the error code is 3 (present, write); otherwise fall
2930 * back to the normal path in case some validation is required
2931 */
2932 if ( regs->error_code == (PFEC_write_access | PFEC_page_present) )
2934 fast_emul = 1;
2935 gmfn = _mfn(v->arch.paging.shadow.last_emulated_mfn);
2937 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
2938 /* Fall back to the slow path if we're trying to emulate
2939 writes to an out of sync page. */
2940 if ( mfn_valid(gmfn) && mfn_is_out_of_sync(gmfn) )
2942 fast_emul = 0;
2943 v->arch.paging.last_write_emul_ok = 0;
2944 goto page_fault_slow_path;
2946 #endif /* OOS */
2948 perfc_incr(shadow_fault_fast_emulate);
2949 goto early_emulation;
2951 else
2952 v->arch.paging.last_write_emul_ok = 0;
2954 #endif
2956 //
2957 // XXX: Need to think about eventually mapping superpages directly in the
2958 // shadow (when possible), as opposed to splintering them into a
2959 // bunch of 4K maps.
2960 //
2962 #if (SHADOW_OPTIMIZATIONS & SHOPT_FAST_FAULT_PATH)
2963 if ( (regs->error_code & PFEC_reserved_bit) )
2965 /* The only reasons for reserved bits to be set in shadow entries
2966 * are the two "magic" shadow_l1e entries. */
2967 if ( likely((__copy_from_user(&sl1e,
2968 (sh_linear_l1_table(v)
2969 + shadow_l1_linear_offset(va)),
2970 sizeof(sl1e)) == 0)
2971 && sh_l1e_is_magic(sl1e)) )
2973 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
2974 /* First, need to check that this isn't an out-of-sync
2975 * shadow l1e. If it is, we fall back to the slow path, which
2976 * will sync it up again. */
2978 shadow_l2e_t sl2e;
2979 mfn_t gl1mfn;
2980 if ( (__copy_from_user(&sl2e,
2981 (sh_linear_l2_table(v)
2982 + shadow_l2_linear_offset(va)),
2983 sizeof(sl2e)) != 0)
2984 || !(shadow_l2e_get_flags(sl2e) & _PAGE_PRESENT)
2985 || !mfn_valid(gl1mfn = _mfn(mfn_to_page(
2986 shadow_l2e_get_mfn(sl2e))->v.sh.back))
2987 || unlikely(mfn_is_out_of_sync(gl1mfn)) )
2989 /* Hit the slow path as if there had been no
2990 * shadow entry at all, and let it tidy up */
2991 ASSERT(regs->error_code & PFEC_page_present);
2992 regs->error_code ^= (PFEC_reserved_bit|PFEC_page_present);
2993 goto page_fault_slow_path;
2996 #endif /* SHOPT_OUT_OF_SYNC */
2998 if ( sh_l1e_is_gnp(sl1e) )
3000 /* Not-present in a guest PT: pass to the guest as
3001 * a not-present fault (by flipping two bits). */
3002 ASSERT(regs->error_code & PFEC_page_present);
3003 regs->error_code ^= (PFEC_reserved_bit|PFEC_page_present);
3004 reset_early_unshadow(v);
3005 perfc_incr(shadow_fault_fast_gnp);
3006 SHADOW_PRINTK("fast path not-present\n");
3007 trace_shadow_gen(TRC_SHADOW_FAST_PROPAGATE, va);
3008 return 0;
3010 else
3012 /* Magic MMIO marker: extract gfn for MMIO address */
3013 ASSERT(sh_l1e_is_mmio(sl1e));
3014 gpa = (((paddr_t)(gfn_x(sh_l1e_mmio_get_gfn(sl1e))))
3015 << PAGE_SHIFT)
3016 | (va & ~PAGE_MASK);
3018 perfc_incr(shadow_fault_fast_mmio);
3019 SHADOW_PRINTK("fast path mmio %#"PRIpaddr"\n", gpa);
3020 reset_early_unshadow(v);
3021 trace_shadow_gen(TRC_SHADOW_FAST_MMIO, va);
3022 return (handle_mmio_with_translation(va, gpa >> PAGE_SHIFT)
3023 ? EXCRET_fault_fixed : 0);
3025 else
3027 /* This should be exceptionally rare: another vcpu has fixed
3028 * the tables between the fault and our reading the l1e.
3029 * Retry and let the hardware give us the right fault next time. */
3030 perfc_incr(shadow_fault_fast_fail);
3031 SHADOW_PRINTK("fast path false alarm!\n");
3032 trace_shadow_gen(TRC_SHADOW_FALSE_FAST_PATH, va);
3033 return EXCRET_fault_fixed;
3037 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
3038 page_fault_slow_path:
3039 #endif
3040 #endif /* SHOPT_FAST_FAULT_PATH */
3042 /* Detect if this page fault happened while we were already in Xen
3043 * doing a shadow operation. If that happens, the only thing we can
3044 * do is let Xen's normal fault handlers try to fix it. In any case,
3045 * a diagnostic trace of the fault will be more useful than
3046 * a BUG() when we try to take the lock again. */
3047 if ( unlikely(shadow_locked_by_me(d)) )
3049 SHADOW_ERROR("Recursive shadow fault: lock was taken by %s\n",
3050 d->arch.paging.shadow.locker_function);
3051 return 0;
3054 rewalk:
3056 /* The walk is done in a lock-free style, with some sanity checks
3057 * postponed until after the shadow lock is taken later. Those delayed
3058 * checks ensure that no inconsistent mapping is translated into the
3059 * shadow page table. */
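/* The overall pattern is seqlock-like (sketch; the pieces appear further
 * down this function):
 *
 *     version = atomic_read(&d->arch.paging.shadow.gtable_dirty_version);
 *     rmb();
 *     rc = sh_walk_guest_tables(v, va, &gw, regs->error_code);
 *     ...
 *     shadow_lock(d);
 *     if ( !shadow_check_gwalk(v, va, &gw, version) )
 *     {
 *         shadow_unlock(d);
 *         goto rewalk;
 *     }
 */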
3060 version = atomic_read(&d->arch.paging.shadow.gtable_dirty_version);
3061 rmb();
3062 rc = sh_walk_guest_tables(v, va, &gw, regs->error_code);
3064 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
3065 regs->error_code &= ~PFEC_page_present;
3066 if ( !(rc & _PAGE_PRESENT) )
3067 regs->error_code |= PFEC_page_present;
3068 #endif
3070 if ( rc != 0 )
3072 perfc_incr(shadow_fault_bail_real_fault);
3073 SHADOW_PRINTK("not a shadow fault\n");
3074 reset_early_unshadow(v);
3075 goto propagate;
3078 /* It's possible that the guest has put pagetables in memory that it has
3079 * already used for some special purpose (ioreq pages, or granted pages).
3080 * If that happens we'll have killed the guest already but it's still not
3081 * safe to propagate entries out of the guest PT so get out now. */
3082 if ( unlikely(d->is_shutting_down) )
3084 SHADOW_PRINTK("guest is shutting down\n");
3085 goto propagate;
3088 /* What kind of access are we dealing with? */
3089 ft = ((regs->error_code & PFEC_write_access)
3090 ? ft_demand_write : ft_demand_read);
3092 /* What mfn is the guest trying to access? */
3093 gfn = guest_l1e_get_gfn(gw.l1e);
3094 gmfn = gfn_to_mfn_guest(d, gfn, &p2mt);
3096 if ( shadow_mode_refcounts(d) &&
3097 (!p2m_is_valid(p2mt) || (!p2m_is_mmio(p2mt) && !mfn_valid(gmfn))) )
3099 perfc_incr(shadow_fault_bail_bad_gfn);
3100 SHADOW_PRINTK("BAD gfn=%"SH_PRI_gfn" gmfn=%"PRI_mfn"\n",
3101 gfn_x(gfn), mfn_x(gmfn));
3102 reset_early_unshadow(v);
3103 goto propagate;
3106 #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
3107 /* Remember this successful VA->GFN translation for later. */
3108 vtlb_insert(v, va >> PAGE_SHIFT, gfn_x(gfn),
3109 regs->error_code | PFEC_page_present);
3110 #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */
3112 shadow_lock(d);
3114 TRACE_CLEAR_PATH_FLAGS;
3116 rc = gw_remove_write_accesses(v, va, &gw);
3118 /* First bit set: Removed write access to a page. */
3119 if ( rc & GW_RMWR_FLUSHTLB )
3121 /* Write permission removal is also a hint that other gwalks
3122 * overlapping with this one may be inconsistent
3123 */
3124 perfc_incr(shadow_rm_write_flush_tlb);
3125 atomic_inc(&d->arch.paging.shadow.gtable_dirty_version);
3126 flush_tlb_mask(d->domain_dirty_cpumask);
3129 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
3130 /* Second bit set: Resynced a page. Re-walk needed. */
3131 if ( rc & GW_RMWR_REWALK )
3133 shadow_unlock(d);
3134 goto rewalk;
3136 #endif /* OOS */
3138 if ( !shadow_check_gwalk(v, va, &gw, version) )
3140 perfc_incr(shadow_inconsistent_gwalk);
3141 shadow_unlock(d);
3142 goto rewalk;
3145 shadow_audit_tables(v);
3146 sh_audit_gw(v, &gw);
3148 /* Make sure there is enough free shadow memory to build a chain of
3149 * shadow tables. (We never allocate a top-level shadow on this path,
3150 * only a 32b l1, pae l1, or 64b l3+2+1. Note that while
3151 * SH_type_l1_shadow isn't correct in the latter case, all page
3152 * tables are the same size there.) */
3153 shadow_prealloc(d,
3154 SH_type_l1_shadow,
3155 GUEST_PAGING_LEVELS < 4 ? 1 : GUEST_PAGING_LEVELS - 1);
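/* Worked out (sketch): for 2- and 3-level guests the chain needs at most
 * one new page (the l1); for a 4-level guest it may need
 * GUEST_PAGING_LEVELS - 1 = 3 pages (an l3, an l2 and an l1), since the
 * top-level shadow is never allocated on this path. */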
3157 /* Acquire the shadow. This must happen before we figure out the rights
3158 * for the shadow entry, since we might promote a page here. */
3159 ptr_sl1e = shadow_get_and_create_l1e(v, &gw, &sl1mfn, ft);
3160 if ( unlikely(ptr_sl1e == NULL) )
3162 /* Couldn't get the sl1e! Since we know the guest entries
3163 * are OK, this can only have been caused by a failed
3164 * shadow_set_l*e(), which will have crashed the guest.
3165 * Get out of the fault handler immediately. */
3166 ASSERT(d->is_shutting_down);
3167 shadow_unlock(d);
3168 trace_shadow_gen(TRC_SHADOW_DOMF_DYING, va);
3169 return 0;
3172 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
3173 /* Always unsync when writing to L1 page tables. */
3174 if ( sh_mfn_is_a_page_table(gmfn)
3175 && ft == ft_demand_write )
3176 sh_unsync(v, gmfn);
3178 if ( unlikely(d->is_shutting_down) )
3180 /* We might end up with a crashed domain here if
3181 * sh_remove_shadows() in a previous sh_resync() call has
3182 * failed. We cannot safely continue since some page is still
3183 * OOS but not in the hash table anymore. */
3184 shadow_unlock(d);
3185 return 0;
3187 #endif /* OOS */
3189 /* Calculate the shadow entry and write it */
3190 l1e_propagate_from_guest(v, gw.l1e, gmfn, &sl1e, ft, p2mt);
3191 r = shadow_set_l1e(v, ptr_sl1e, sl1e, sl1mfn);
3193 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
3194 if ( mfn_valid(gw.l1mfn)
3195 && mfn_is_out_of_sync(gw.l1mfn) )
3197 /* Update the OOS snapshot. */
3198 mfn_t snpmfn = oos_snapshot_lookup(v, gw.l1mfn);
3199 guest_l1e_t *snp;
3201 ASSERT(mfn_valid(snpmfn));
3203 snp = sh_map_domain_page(snpmfn);
3204 snp[guest_l1_table_offset(va)] = gw.l1e;
3205 sh_unmap_domain_page(snp);
3207 #endif /* OOS */
3209 #if SHADOW_OPTIMIZATIONS & SHOPT_PREFETCH
3210 /* Prefetch some more shadow entries */
3211 sh_prefetch(v, &gw, ptr_sl1e, sl1mfn);
3212 #endif
3214 /* Need to emulate accesses to page tables */
3215 if ( sh_mfn_is_a_page_table(gmfn)
3216 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
3217 /* Unless they've been allowed to go out of sync with their
3218 shadows and we don't need to unshadow it. */
3219 && !(mfn_is_out_of_sync(gmfn)
3220 && !(regs->error_code & PFEC_user_mode))
3221 #endif
3224 if ( ft == ft_demand_write )
3226 perfc_incr(shadow_fault_emulate_write);
3227 goto emulate;
3229 else if ( shadow_mode_trap_reads(d) && ft == ft_demand_read )
3231 perfc_incr(shadow_fault_emulate_read);
3232 goto emulate;
3236 /* Need to hand off device-model MMIO to the device model */
3237 if ( p2mt == p2m_mmio_dm )
3239 gpa = guest_walk_to_gpa(&gw);
3240 goto mmio;
3243 /* Ignore attempts to write to read-only memory. */
3244 if ( (p2mt == p2m_ram_ro) && (ft == ft_demand_write) )
3246 static unsigned long lastpage;
3247 if ( xchg(&lastpage, va & PAGE_MASK) != (va & PAGE_MASK) )
3248 gdprintk(XENLOG_DEBUG, "guest attempted write to read-only memory"
3249 " page. va page=%#lx, mfn=%#lx\n",
3250 va & PAGE_MASK, mfn_x(gmfn));
3251 goto emulate_readonly; /* skip over the instruction */
3254 /* In HVM guests, we force CR0.WP always to be set, so that the
3255 * pagetables are always write-protected. If the guest thinks
3256 * CR0.WP is clear, we must emulate faulting supervisor writes to
3257 * allow the guest to write through read-only PTEs. Emulate if the
3258 * fault was a non-user write to a present page. */
3259 if ( is_hvm_domain(d)
3260 && unlikely(!hvm_wp_enabled(v))
3261 && regs->error_code == (PFEC_write_access|PFEC_page_present) )
3263 perfc_incr(shadow_fault_emulate_wp);
3264 goto emulate;
3267 perfc_incr(shadow_fault_fixed);
3268 d->arch.paging.log_dirty.fault_count++;
3269 reset_early_unshadow(v);
3271 trace_shadow_fixup(gw.l1e, va);
3272 done:
3273 sh_audit_gw(v, &gw);
3274 SHADOW_PRINTK("fixed\n");
3275 shadow_audit_tables(v);
3276 shadow_unlock(d);
3277 return EXCRET_fault_fixed;
3279 emulate:
3280 if ( !shadow_mode_refcounts(d) || !guest_mode(regs) )
3281 goto not_a_shadow_fault;
3283 /*
3284 * We do not emulate user writes. Instead we use them as a hint that the
3285 * page is no longer a page table. This behaviour differs from native, but
3286 * it seems very unlikely that any OS grants user access to page tables.
3287 */
3288 if ( (regs->error_code & PFEC_user_mode) )
3290 SHADOW_PRINTK("user-mode fault to PT, unshadowing mfn %#lx\n",
3291 mfn_x(gmfn));
3292 perfc_incr(shadow_fault_emulate_failed);
3293 sh_remove_shadows(v, gmfn, 0 /* thorough */, 1 /* must succeed */);
3294 trace_shadow_emulate_other(TRC_SHADOW_EMULATE_UNSHADOW_USER,
3295 va, gfn);
3296 goto done;
3299 /*
3300 * Write from userspace to ro-mem needs to jump here to avoid getting
3301 * caught by user-mode page-table check above.
3302 */
3303 emulate_readonly:
3304 /*
3305 * We don't need to hold the lock for the whole emulation; we will
3306 * take it again when we write to the pagetables.
3307 */
3308 sh_audit_gw(v, &gw);
3309 shadow_audit_tables(v);
3310 shadow_unlock(d);
3312 this_cpu(trace_emulate_write_val) = 0;
3314 #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
3315 early_emulation:
3316 #endif
3317 if ( is_hvm_domain(d) )
3319 /*
3320 * If we are in the middle of injecting an exception or interrupt then
3321 * we should not emulate: it is not the instruction at %eip that caused
3322 * the fault. Furthermore it is almost certainly the case the handler
3323 * stack is currently considered to be a page table, so we should
3324 * unshadow the faulting page before exiting.
3325 */
3326 if ( unlikely(hvm_event_pending(v)) )
3328 #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
3329 if ( fast_emul )
3331 perfc_incr(shadow_fault_fast_emulate_fail);
3332 v->arch.paging.last_write_emul_ok = 0;
3334 #endif
3335 gdprintk(XENLOG_DEBUG, "write to pagetable during event "
3336 "injection: cr2=%#lx, mfn=%#lx\n",
3337 va, mfn_x(gmfn));
3338 sh_remove_shadows(v, gmfn, 0 /* thorough */, 1 /* must succeed */);
3339 trace_shadow_emulate_other(TRC_SHADOW_EMULATE_UNSHADOW_EVTINJ,
3340 va, gfn);
3341 return EXCRET_fault_fixed;
3345 SHADOW_PRINTK("emulate: eip=%#lx esp=%#lx\n",
3346 (unsigned long)regs->eip, (unsigned long)regs->esp);
3348 emul_ops = shadow_init_emulation(&emul_ctxt, regs);
3350 r = x86_emulate(&emul_ctxt.ctxt, emul_ops);
3352 /*
3353 * NB. We do not unshadow on X86EMUL_EXCEPTION. It's not clear that it
3354 * would be a good unshadow hint. If we *do* decide to unshadow-on-fault
3355 * then it must be 'failable': we cannot require the unshadow to succeed.
3356 */
3357 if ( r == X86EMUL_UNHANDLEABLE )
3359 perfc_incr(shadow_fault_emulate_failed);
3360 #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
3361 if ( fast_emul )
3363 perfc_incr(shadow_fault_fast_emulate_fail);
3364 v->arch.paging.last_write_emul_ok = 0;
3366 #endif
3367 SHADOW_PRINTK("emulator failure, unshadowing mfn %#lx\n",
3368 mfn_x(gmfn));
3369 /* If this is actually a page table, then we have a bug, and need
3370 * to support more operations in the emulator. More likely,
3371 * though, this is a hint that this page should not be shadowed. */
3372 shadow_remove_all_shadows(v, gmfn);
3374 trace_shadow_emulate_other(TRC_SHADOW_EMULATE_UNSHADOW_UNHANDLED,
3375 va, gfn);
3376 goto emulate_done;
3379 #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
3380 /* Record successful emulation as a heuristic to accelerate the next
3381 * fault on the same frame. But be careful to verify that the page is
3382 * still a page table: otherwise an unshadow triggered during write
3383 * emulation would normally require a re-sync with the guest page
3384 * table to recover r/w permission, and an incorrect record in that
3385 * case would cause unexpected extra shadow faults because propagation
3386 * is skipped.
3387 */
3388 if ( (r == X86EMUL_OKAY) && sh_mfn_is_a_page_table(gmfn) )
3390 if ( !fast_emul )
3392 v->arch.paging.shadow.last_emulated_frame = va >> PAGE_SHIFT;
3393 v->arch.paging.shadow.last_emulated_mfn = mfn_x(gmfn);
3394 v->arch.paging.last_write_emul_ok = 1;
3397 else if ( fast_emul )
3398 v->arch.paging.last_write_emul_ok = 0;
3399 #endif
3401 #if GUEST_PAGING_LEVELS == 3 /* PAE guest */
3402 if ( r == X86EMUL_OKAY ) {
3403 int i, emulation_count=0;
3404 this_cpu(trace_emulate_initial_va) = va;
3405 /* Emulate up to four extra instructions in the hope of catching
3406 * the "second half" of a 64-bit pagetable write (which a PAE guest typically performs as two 32-bit writes). */
3407 for ( i = 0 ; i < 4 ; i++ )
3409 shadow_continue_emulation(&emul_ctxt, regs);
3410 v->arch.paging.last_write_was_pt = 0;
3411 r = x86_emulate(&emul_ctxt.ctxt, emul_ops);
3412 if ( r == X86EMUL_OKAY )
3414 emulation_count++;
3415 if ( v->arch.paging.last_write_was_pt )
3417 perfc_incr(shadow_em_ex_pt);
3418 TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_EMULATION_2ND_PT_WRITTEN);
3419 break; /* Don't emulate past the other half of the write */
3421 else
3422 perfc_incr(shadow_em_ex_non_pt);
3424 else
3426 perfc_incr(shadow_em_ex_fail);
3427 TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_EMULATION_LAST_FAILED);
3428 break; /* Don't emulate again if we failed! */
3431 this_cpu(trace_extra_emulation_count)=emulation_count;
3433 #endif /* PAE guest */
3435 trace_shadow_emulate(gw.l1e, va);
3436 emulate_done:
3437 SHADOW_PRINTK("emulated\n");
3438 return EXCRET_fault_fixed;
3440 mmio:
3441 if ( !guest_mode(regs) )
3442 goto not_a_shadow_fault;
3443 perfc_incr(shadow_fault_mmio);
3444 sh_audit_gw(v, &gw);
3445 SHADOW_PRINTK("mmio %#"PRIpaddr"\n", gpa);
3446 shadow_audit_tables(v);
3447 reset_early_unshadow(v);
3448 shadow_unlock(d);
3449 trace_shadow_gen(TRC_SHADOW_MMIO, va);
3450 return (handle_mmio_with_translation(va, gpa >> PAGE_SHIFT)
3451 ? EXCRET_fault_fixed : 0);
3453 not_a_shadow_fault:
3454 sh_audit_gw(v, &gw);
3455 SHADOW_PRINTK("not a shadow fault\n");
3456 shadow_audit_tables(v);
3457 reset_early_unshadow(v);
3458 shadow_unlock(d);
3460 propagate:
3461 trace_not_shadow_fault(gw.l1e, va);
3463 return 0;
3467 static int
3468 sh_invlpg(struct vcpu *v, unsigned long va)
3469 /* Called when the guest requests an invlpg. Returns 1 if the invlpg
3470 * instruction should be issued on the hardware, or 0 if it's safe not
3471 * to do so. */
3473 mfn_t sl1mfn;
3474 shadow_l2e_t sl2e;
3476 perfc_incr(shadow_invlpg);
3478 #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
3479 /* No longer safe to use cached gva->gfn translations */
3480 vtlb_flush(v);
3481 #endif
3483 #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
3484 v->arch.paging.last_write_emul_ok = 0;
3485 #endif
3487 /* First check that we can safely read the shadow l2e. On SMP/PAE
3488 * Linux, as many as 6% of invlpg calls hit an l2 that we haven't
3489 * shadowed yet. */
3490 #if SHADOW_PAGING_LEVELS == 4
3492 shadow_l3e_t sl3e;
3493 if ( !(shadow_l4e_get_flags(
3494 sh_linear_l4_table(v)[shadow_l4_linear_offset(va)])
3495 & _PAGE_PRESENT) )
3496 return 0;
3497 /* This must still be a copy-from-user because we don't have the
3498 * shadow lock, and the higher-level shadows might disappear
3499 * under our feet. */
3500 if ( __copy_from_user(&sl3e, (sh_linear_l3_table(v)
3501 + shadow_l3_linear_offset(va)),
3502 sizeof (sl3e)) != 0 )
3504 perfc_incr(shadow_invlpg_fault);
3505 return 0;
3507 if ( !(shadow_l3e_get_flags(sl3e) & _PAGE_PRESENT) )
3508 return 0;
3510 #else /* SHADOW_PAGING_LEVELS == 3 */
3511 if ( !(l3e_get_flags(v->arch.paging.shadow.l3table[shadow_l3_linear_offset(va)])
3512 & _PAGE_PRESENT) )
3513 // no need to flush anything if there's no SL2...
3514 return 0;
3515 #endif
3517 /* This must still be a copy-from-user because we don't have the shadow
3518 * lock, and the higher-level shadows might disappear under our feet. */
3519 if ( __copy_from_user(&sl2e,
3520 sh_linear_l2_table(v) + shadow_l2_linear_offset(va),
3521 sizeof (sl2e)) != 0 )
3523 perfc_incr(shadow_invlpg_fault);
3524 return 0;
3527 // If there's nothing shadowed for this particular sl2e, then
3528 // there is no need to do an invlpg, either...
3529 //
3530 if ( !(shadow_l2e_get_flags(sl2e) & _PAGE_PRESENT) )
3531 return 0;
3533 // Check to see if the SL2 is a splintered superpage...
3534 // If so, then we'll need to flush the entire TLB (because that's
3535 // easier than invalidating all of the individual 4K pages).
3536 //
3537 sl1mfn = shadow_l2e_get_mfn(sl2e);
3538 if ( mfn_to_page(sl1mfn)->u.sh.type
3539 == SH_type_fl1_shadow )
3541 flush_tlb_local();
3542 return 0;
3545 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
3546 /* Check to see if the SL1 is out of sync. */
3548 mfn_t gl1mfn = _mfn(mfn_to_page(sl1mfn)->v.sh.back);
3549 struct page_info *pg = mfn_to_page(gl1mfn);
3550 if ( mfn_valid(gl1mfn)
3551 && page_is_out_of_sync(pg) )
3553 /* The test above may give false positives, since we don't
3554 * hold the shadow lock yet. Check again with the lock held. */
3555 shadow_lock(v->domain);
3557 /* This must still be a copy-from-user because we didn't
3558 * have the shadow lock last time we checked, and the
3559 * higher-level shadows might have disappeared under our
3560 * feet. */
3561 if ( __copy_from_user(&sl2e,
3562 sh_linear_l2_table(v)
3563 + shadow_l2_linear_offset(va),
3564 sizeof (sl2e)) != 0 )
3566 perfc_incr(shadow_invlpg_fault);
3567 shadow_unlock(v->domain);
3568 return 0;
3571 if ( !(shadow_l2e_get_flags(sl2e) & _PAGE_PRESENT) )
3573 shadow_unlock(v->domain);
3574 return 0;
3577 sl1mfn = shadow_l2e_get_mfn(sl2e);
3578 gl1mfn = _mfn(mfn_to_page(sl1mfn)->v.sh.back);
3579 pg = mfn_to_page(gl1mfn);
3581 if ( likely(sh_mfn_is_a_page_table(gl1mfn)
3582 && page_is_out_of_sync(pg) ) )
3584 shadow_l1e_t *sl1;
3585 sl1 = sh_linear_l1_table(v) + shadow_l1_linear_offset(va);
3586 /* Remove the shadow entry that maps this VA */
3587 (void) shadow_set_l1e(v, sl1, shadow_l1e_empty(), sl1mfn);
3589 shadow_unlock(v->domain);
3590 /* Need the invlpg to pick up the disappearance of the sl1e */
3591 return 1;
3594 #endif
3596 return 1;
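/* A minimal, self-contained sketch (plain C with pthreads, not from multi.c)
 * of the pattern sh_invlpg() relies on above: peek at a shared entry without
 * the shadow lock, and if it looks interesting, take the lock and re-read
 * before acting, because the unlocked read may race with teardown.  Every
 * demo_* name is made up for illustration only. */
#include <pthread.h>
#include <stdint.h>

static pthread_mutex_t demo_lock = PTHREAD_MUTEX_INITIALIZER;
static volatile uint64_t demo_entry;        /* stands in for an sl2e */
#define DEMO_PRESENT 0x1ULL

static int demo_invalidate_if_present(void)
{
    uint64_t e = demo_entry;                /* unlocked, possibly stale read */
    if ( !(e & DEMO_PRESENT) )
        return 0;                           /* nothing to do, skip the lock */
    pthread_mutex_lock(&demo_lock);
    e = demo_entry;                         /* re-read now that we hold the lock */
    if ( e & DEMO_PRESENT )
        demo_entry = 0;                     /* now it is safe to act on it */
    pthread_mutex_unlock(&demo_lock);
    return 1;
}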
3600 static unsigned long
3601 sh_gva_to_gfn(struct vcpu *v, unsigned long va, uint32_t *pfec)
3602 /* Called to translate a guest virtual address to what the *guest*
3603 * pagetables would map it to. */
3605 walk_t gw;
3606 gfn_t gfn;
3608 #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
3609 /* Check the vTLB cache first */
3610 unsigned long vtlb_gfn = vtlb_lookup(v, va, pfec[0]);
3611 if ( VALID_GFN(vtlb_gfn) )
3612 return vtlb_gfn;
3613 #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */
3615 if ( sh_walk_guest_tables(v, va, &gw, pfec[0]) != 0 )
3617 if ( !(guest_l1e_get_flags(gw.l1e) & _PAGE_PRESENT) )
3618 pfec[0] &= ~PFEC_page_present;
3619 return INVALID_GFN;
3621 gfn = guest_walk_to_gfn(&gw);
3623 #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
3624 /* Remember this successful VA->GFN translation for later. */
3625 vtlb_insert(v, va >> PAGE_SHIFT, gfn_x(gfn), pfec[0]);
3626 #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */
3628 return gfn_x(gfn);
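/* A rough, self-contained sketch of a tiny va->gfn cache in the spirit of
 * the vTLB consulted above.  The demo_* structures and the rights check are
 * illustrative assumptions only; the real vtlb_lookup()/vtlb_insert() live
 * elsewhere in the shadow code and are not reproduced here. */
#include <stdint.h>

#define DEMO_VTLB_ENTRIES 64
#define DEMO_INVALID_GFN  (~0UL)

struct demo_vtlb_entry {
    unsigned long va_page;     /* va >> PAGE_SHIFT */
    unsigned long gfn;         /* cached translation */
    uint32_t pfec;             /* rights the translation was made with */
    int valid;
};

static struct demo_vtlb_entry demo_vtlb[DEMO_VTLB_ENTRIES];

static unsigned long demo_vtlb_lookup(unsigned long va_page, uint32_t pfec)
{
    struct demo_vtlb_entry *e = &demo_vtlb[va_page % DEMO_VTLB_ENTRIES];
    /* Hit only if the cached entry covers at least the rights asked for. */
    if ( e->valid && e->va_page == va_page && (e->pfec & pfec) == pfec )
        return e->gfn;
    return DEMO_INVALID_GFN;
}

static void demo_vtlb_insert(unsigned long va_page, unsigned long gfn,
                             uint32_t pfec)
{
    struct demo_vtlb_entry *e = &demo_vtlb[va_page % DEMO_VTLB_ENTRIES];
    e->va_page = va_page;
    e->gfn = gfn;
    e->pfec = pfec;
    e->valid = 1;
}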
3632 static inline void
3633 sh_update_linear_entries(struct vcpu *v)
3634 /* Sync up all the linear mappings for this vcpu's pagetables */
3636 struct domain *d = v->domain;
3638 /* Linear pagetables in PV guests
3639 * ------------------------------
3641 * Guest linear pagetables, which map the guest pages, are at
3642 * LINEAR_PT_VIRT_START. Shadow linear pagetables, which map the
3643 * shadows, are at SH_LINEAR_PT_VIRT_START. Most of the time these
3644 * are set up at shadow creation time, but (of course!) the PAE case
3645 * is subtler. Normal linear mappings are made by having an entry
3646 * in the top-level table that points to itself (shadow linear) or
3647 * to the guest top-level table (guest linear). For PAE, to set up
3648 * a linear map requires us to copy the four top-level entries into
3649 * level-2 entries. That means that every time we change a PAE l3e,
3650 * we need to reflect the change into the copy.
3652 * Linear pagetables in HVM guests
3653 * -------------------------------
3655 * For HVM guests, the linear pagetables are installed in the monitor
3656 * tables (since we can't put them in the shadow). Shadow linear
3657 * pagetables, which map the shadows, are at SH_LINEAR_PT_VIRT_START,
3658 * and we use the linear pagetable slot at LINEAR_PT_VIRT_START for
3659 * a linear pagetable of the monitor tables themselves. We have
3660 * the same issue of having to re-copy PAE l3 entries whenever we use
3661 * PAE shadows.
3663 * Because HVM guests run on the same monitor tables regardless of the
3664 * shadow tables in use, the linear mapping of the shadow tables has to
3665 * be updated every time v->arch.shadow_table changes.
3666 */
3668 /* Don't try to update the monitor table if it doesn't exist */
3669 if ( shadow_mode_external(d)
3670 && pagetable_get_pfn(v->arch.monitor_table) == 0 )
3671 return;
3673 #if (CONFIG_PAGING_LEVELS == 4) && (SHADOW_PAGING_LEVELS == 4)
3675 /* For PV, one l4e points at the guest l4, one points at the shadow
3676 * l4. No maintenance required.
3677 * For HVM, just need to update the l4e that points to the shadow l4. */
3679 if ( shadow_mode_external(d) )
3681 /* Use the linear map if we can; otherwise make a new mapping */
3682 if ( v == current )
3684 __linear_l4_table[l4_linear_offset(SH_LINEAR_PT_VIRT_START)] =
3685 l4e_from_pfn(pagetable_get_pfn(v->arch.shadow_table[0]),
3686 __PAGE_HYPERVISOR);
3688 else
3690 l4_pgentry_t *ml4e;
3691 ml4e = sh_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
3692 ml4e[l4_table_offset(SH_LINEAR_PT_VIRT_START)] =
3693 l4e_from_pfn(pagetable_get_pfn(v->arch.shadow_table[0]),
3694 __PAGE_HYPERVISOR);
3695 sh_unmap_domain_page(ml4e);
3699 #elif (CONFIG_PAGING_LEVELS == 4) && (SHADOW_PAGING_LEVELS == 3)
3701 /* PV: XXX
3703 * HVM: To give ourselves a linear map of the shadows, we need to
3704 * extend a PAE shadow to 4 levels. We do this by having a monitor
3705 * l3 in slot 0 of the monitor l4 table, and copying the PAE l3
3706 * entries into it. Then, by having the monitor l4e for shadow
3707 * pagetables also point to the monitor l4, we can use it to access
3708 * the shadows.
3709 */
3711 if ( shadow_mode_external(d) )
3713 /* Install copies of the shadow l3es into the monitor l2 table
3714 * that maps SH_LINEAR_PT_VIRT_START. */
3715 shadow_l3e_t *sl3e;
3716 l2_pgentry_t *ml2e;
3717 int i;
3719 /* Use linear mappings if we can; otherwise make new mappings */
3720 if ( v == current )
3721 ml2e = __linear_l2_table
3722 + l2_linear_offset(SH_LINEAR_PT_VIRT_START);
3723 else
3725 mfn_t l3mfn, l2mfn;
3726 l4_pgentry_t *ml4e;
3727 l3_pgentry_t *ml3e;
3728 int linear_slot = shadow_l4_table_offset(SH_LINEAR_PT_VIRT_START);
3729 ml4e = sh_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
3731 ASSERT(l4e_get_flags(ml4e[linear_slot]) & _PAGE_PRESENT);
3732 l3mfn = _mfn(l4e_get_pfn(ml4e[linear_slot]));
3733 ml3e = sh_map_domain_page(l3mfn);
3734 sh_unmap_domain_page(ml4e);
3736 ASSERT(l3e_get_flags(ml3e[0]) & _PAGE_PRESENT);
3737 l2mfn = _mfn(l3e_get_pfn(ml3e[0]));
3738 ml2e = sh_map_domain_page(l2mfn);
3739 sh_unmap_domain_page(ml3e);
3742 /* Shadow l3 tables are made up by sh_update_cr3 */
3743 sl3e = v->arch.paging.shadow.l3table;
3745 for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
3747 ml2e[i] =
3748 (shadow_l3e_get_flags(sl3e[i]) & _PAGE_PRESENT)
3749 ? l2e_from_pfn(mfn_x(shadow_l3e_get_mfn(sl3e[i])),
3750 __PAGE_HYPERVISOR)
3751 : l2e_empty();
3754 if ( v != current )
3755 sh_unmap_domain_page(ml2e);
3757 else
3758 domain_crash(d); /* XXX */
3760 #elif CONFIG_PAGING_LEVELS == 3
3762 /* PV: need to copy the guest's l3 entries into the guest-linear-map l2
3763 * entries in the shadow, and the shadow's l3 entries into the
3764 * shadow-linear-map l2 entries in the shadow. This is safe to do
3765 * because Xen does not let guests share high-slot l2 tables between l3s,
3766 * so we know we're not treading on anyone's toes.
3768 * HVM: need to copy the shadow's l3 entries into the
3769 * shadow-linear-map l2 entries in the monitor table. This is safe
3770 * because we have one monitor table for each vcpu. The monitor's
3771 * own l3es don't need to be copied because they never change.
3772 * XXX That might change if we start stuffing things into the rest
3773 * of the monitor's virtual address space.
3774 */
3776 l2_pgentry_t *l2e, new_l2e;
3777 shadow_l3e_t *guest_l3e = NULL, *shadow_l3e;
3778 int i;
3779 int unmap_l2e = 0;
3781 #if GUEST_PAGING_LEVELS == 2
3783 /* Shadow l3 tables were built by sh_update_cr3 */
3784 BUG_ON(!shadow_mode_external(d)); /* PV 2-on-3 is unsupported */
3785 shadow_l3e = (shadow_l3e_t *)&v->arch.paging.shadow.l3table;
3787 #else /* GUEST_PAGING_LEVELS == 3 */
3789 shadow_l3e = (shadow_l3e_t *)&v->arch.paging.shadow.l3table;
3790 guest_l3e = (guest_l3e_t *)&v->arch.paging.shadow.gl3e;
3792 #endif /* GUEST_PAGING_LEVELS */
3794 /* Choose where to write the entries, using linear maps if possible */
3795 if ( shadow_mode_external(d) )
3797 if ( v == current )
3799 /* From the monitor tables, it's safe to use linear maps
3800 * to update monitor l2s */
3801 l2e = __linear_l2_table + (3 * L2_PAGETABLE_ENTRIES);
3803 else
3805 /* Map the monitor table's high l2 */
3806 l3_pgentry_t *l3e;
3807 l3e = sh_map_domain_page(
3808 pagetable_get_mfn(v->arch.monitor_table));
3809 ASSERT(l3e_get_flags(l3e[3]) & _PAGE_PRESENT);
3810 l2e = sh_map_domain_page(_mfn(l3e_get_pfn(l3e[3])));
3811 unmap_l2e = 1;
3812 sh_unmap_domain_page(l3e);
3815 else
3817 /* Map the shadow table's high l2 */
3818 ASSERT(shadow_l3e_get_flags(shadow_l3e[3]) & _PAGE_PRESENT);
3819 l2e = sh_map_domain_page(shadow_l3e_get_mfn(shadow_l3e[3]));
3820 unmap_l2e = 1;
3823 /* Write linear mapping of guest (only in PV, and only when
3824 * not translated). */
3825 if ( !shadow_mode_translate(d) )
3827 for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
3829 new_l2e =
3830 ((shadow_l3e_get_flags(guest_l3e[i]) & _PAGE_PRESENT)
3831 ? l2e_from_pfn(mfn_x(shadow_l3e_get_mfn(guest_l3e[i])),
3832 __PAGE_HYPERVISOR)
3833 : l2e_empty());
3834 safe_write_entry(
3835 &l2e[l2_table_offset(LINEAR_PT_VIRT_START) + i],
3836 &new_l2e);
3840 /* Write linear mapping of shadow. */
3841 for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
3843 new_l2e = (shadow_l3e_get_flags(shadow_l3e[i]) & _PAGE_PRESENT)
3844 ? l2e_from_pfn(mfn_x(shadow_l3e_get_mfn(shadow_l3e[i])),
3845 __PAGE_HYPERVISOR)
3846 : l2e_empty();
3847 safe_write_entry(
3848 &l2e[l2_table_offset(SH_LINEAR_PT_VIRT_START) + i],
3849 &new_l2e);
3852 if ( unmap_l2e )
3853 sh_unmap_domain_page(l2e);
3856 #else
3857 #error this should not happen
3858 #endif
3860 if ( shadow_mode_external(d) )
3862 /*
3863 * Having modified the linear pagetable mapping, flush local host TLBs.
3864 * This was not needed when vmenter/vmexit always had the side effect
3865 * of flushing host TLBs but, with ASIDs, it is possible to finish
3866 * this CR3 update, vmenter the guest, vmexit due to a page fault,
3867 * without an intervening host TLB flush. Then the page fault code
3868 * could use the linear pagetable to read a top-level shadow page
3869 * table entry. But, without this change, it would fetch the wrong
3870 * value due to a stale TLB.
3871 */
3872 flush_tlb_local();
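/* A minimal sketch, with made-up types, of the PAE bookkeeping described in
 * the comment at the top of sh_update_linear_entries(): the four top-level
 * (l3) entries are copied into four consecutive l2 slots to build the linear
 * map, so whenever one of the l3 entries changes the copies must be
 * rewritten to match. */
#define DEMO_L3_PRESENT 0x1ULL

static void demo_sync_linear_copies(const unsigned long long l3e[4],
                                    unsigned long long l2_copy[4])
{
    int i;
    for ( i = 0; i < 4; i++ )
        l2_copy[i] = (l3e[i] & DEMO_L3_PRESENT) ? l3e[i] : 0;
}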
3877 /* Removes vcpu->arch.paging.shadow.guest_vtable and vcpu->arch.shadow_table[].
3878 * Does all appropriate management/bookkeeping/refcounting/etc...
3879 */
3880 static void
3881 sh_detach_old_tables(struct vcpu *v)
3883 mfn_t smfn;
3884 int i = 0;
3886 ////
3887 //// vcpu->arch.paging.shadow.guest_vtable
3888 ////
3890 #if GUEST_PAGING_LEVELS == 3
3891 /* PAE guests don't have a mapping of the guest top-level table */
3892 ASSERT(v->arch.paging.shadow.guest_vtable == NULL);
3893 #else
3894 if ( v->arch.paging.shadow.guest_vtable )
3896 struct domain *d = v->domain;
3897 if ( shadow_mode_external(d) || shadow_mode_translate(d) )
3898 sh_unmap_domain_page_global(v->arch.paging.shadow.guest_vtable);
3899 v->arch.paging.shadow.guest_vtable = NULL;
3901 #endif /* GUEST_PAGING_LEVELS == 3 */
3904 ////
3905 //// vcpu->arch.shadow_table[]
3906 ////
3908 #if GUEST_PAGING_LEVELS == 3
3909 /* PAE guests have four shadow_table entries */
3910 for ( i = 0 ; i < 4 ; i++ )
3911 #endif
3913 smfn = pagetable_get_mfn(v->arch.shadow_table[i]);
3914 if ( mfn_x(smfn) )
3915 sh_put_ref(v, smfn, 0);
3916 v->arch.shadow_table[i] = pagetable_null();
3920 /* Set up the top-level shadow and install it in slot 'slot' of shadow_table */
3921 static void
3922 sh_set_toplevel_shadow(struct vcpu *v,
3923 int slot,
3924 mfn_t gmfn,
3925 unsigned int root_type)
3927 mfn_t smfn;
3928 pagetable_t old_entry, new_entry;
3930 struct domain *d = v->domain;
3932 /* Remember the old contents of this slot */
3933 old_entry = v->arch.shadow_table[slot];
3935 /* Now figure out the new contents: is this a valid guest MFN? */
3936 if ( !mfn_valid(gmfn) )
3938 new_entry = pagetable_null();
3939 goto install_new_entry;
3942 /* Guest mfn is valid: shadow it and install the shadow */
3943 smfn = get_shadow_status(v, gmfn, root_type);
3944 if ( !mfn_valid(smfn) )
3946 /* Make sure there's enough free shadow memory. */
3947 shadow_prealloc(d, root_type, 1);
3948 /* Shadow the page. */
3949 smfn = sh_make_shadow(v, gmfn, root_type);
3951 ASSERT(mfn_valid(smfn));
3953 /* Pin the shadow and put it (back) on the list of pinned shadows */
3954 if ( sh_pin(v, smfn) == 0 )
3956 SHADOW_ERROR("can't pin %#lx as toplevel shadow\n", mfn_x(smfn));
3957 domain_crash(v->domain);
3960 /* Take a ref to this page: it will be released in sh_detach_old_tables()
3961 * or the next call to sh_set_toplevel_shadow(). */
3962 if ( !sh_get_ref(v, smfn, 0) )
3964 SHADOW_ERROR("can't install %#lx as toplevel shadow\n", mfn_x(smfn));
3965 domain_crash(v->domain);
3968 new_entry = pagetable_from_mfn(smfn);
3970 install_new_entry:
3971 /* Done. Install it */
3972 SHADOW_PRINTK("%u/%u [%u] gmfn %#"PRI_mfn" smfn %#"PRI_mfn"\n",
3973 GUEST_PAGING_LEVELS, SHADOW_PAGING_LEVELS, slot,
3974 mfn_x(gmfn), mfn_x(pagetable_get_mfn(new_entry)));
3975 v->arch.shadow_table[slot] = new_entry;
3977 /* Decrement the refcount of the old contents of this slot */
3978 if ( !pagetable_is_null(old_entry) ) {
3979 mfn_t old_smfn = pagetable_get_mfn(old_entry);
3980 /* Need to repin the old toplevel shadow if it's been unpinned
3981 * by shadow_prealloc(): in PV mode we're still running on this
3982 * shadow and it's not safe to free it yet. */
3983 if ( !mfn_to_page(old_smfn)->u.sh.pinned && !sh_pin(v, old_smfn) )
3985 SHADOW_ERROR("can't re-pin %#lx\n", mfn_x(old_smfn));
3986 domain_crash(v->domain);
3988 sh_put_ref(v, old_smfn, 0);
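/* A small sketch (illustrative types only) of the reference-counting
 * discipline sh_set_toplevel_shadow() follows above: take a reference on the
 * new contents and install them before the reference on the old contents is
 * dropped, so the slot never points at an object that has already been
 * freed. */
struct demo_obj { int refcount; };

static void demo_get(struct demo_obj *o) { if ( o ) o->refcount++; }
static void demo_put(struct demo_obj *o) { if ( o ) o->refcount--; }

static void demo_replace_slot(struct demo_obj **slot, struct demo_obj *new_obj)
{
    struct demo_obj *old = *slot;   /* remember the old contents */
    demo_get(new_obj);              /* reference the new contents first */
    *slot = new_obj;                /* install */
    demo_put(old);                  /* only now let go of the old contents */
}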
3993 static void
3994 sh_update_cr3(struct vcpu *v, int do_locking)
3995 /* Updates vcpu->arch.cr3 after the guest has changed CR3.
3996 * Paravirtual guests should set v->arch.guest_table (and guest_table_user,
3997 * if appropriate).
3998 * HVM guests should also make sure hvm_get_guest_cntl_reg(v, 3) works;
3999 * this function will call hvm_update_guest_cr(v, 3) to tell them where the
4000 * shadow tables are.
4001 * If do_locking != 0, assume we are being called from outside the
4002 * shadow code, and must take and release the shadow lock; otherwise
4003 * that is the caller's responsibility.
4004 */
4006 struct domain *d = v->domain;
4007 mfn_t gmfn;
4008 #if GUEST_PAGING_LEVELS == 3
4009 guest_l3e_t *gl3e;
4010 u32 guest_idx=0;
4011 int i;
4012 #endif
4014 /* Don't do anything on an uninitialised vcpu */
4015 if ( !is_hvm_domain(d) && !v->is_initialised )
4017 ASSERT(v->arch.cr3 == 0);
4018 return;
4021 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
4022 /* Need to resync all the shadow entries on a TLB flush. Resync
4023 * the current vcpu's OOS pages before switching to the new shadow
4024 * tables so that the VA hint is still valid. */
4025 shadow_resync_current_vcpu(v, do_locking);
4026 #endif
4028 if ( do_locking ) shadow_lock(v->domain);
4030 ASSERT(shadow_locked_by_me(v->domain));
4031 ASSERT(v->arch.paging.mode);
4033 ////
4034 //// vcpu->arch.guest_table is already set
4035 ////
4037 #ifndef NDEBUG
4038 /* Double-check that the HVM code has sent us a sane guest_table */
4039 if ( is_hvm_domain(d) )
4041 ASSERT(shadow_mode_external(d));
4042 if ( hvm_paging_enabled(v) )
4043 ASSERT(pagetable_get_pfn(v->arch.guest_table));
4044 else
4045 ASSERT(v->arch.guest_table.pfn
4046 == d->arch.paging.shadow.unpaged_pagetable.pfn);
4048 #endif
4050 SHADOW_PRINTK("d=%u v=%u guest_table=%05lx\n",
4051 d->domain_id, v->vcpu_id,
4052 (unsigned long)pagetable_get_pfn(v->arch.guest_table));
4054 #if GUEST_PAGING_LEVELS == 4
4055 if ( !(v->arch.flags & TF_kernel_mode) && !is_pv_32on64_vcpu(v) )
4056 gmfn = pagetable_get_mfn(v->arch.guest_table_user);
4057 else
4058 #endif
4059 gmfn = pagetable_get_mfn(v->arch.guest_table);
4062 ////
4063 //// vcpu->arch.paging.shadow.guest_vtable
4064 ////
4065 #if GUEST_PAGING_LEVELS == 4
4066 if ( shadow_mode_external(d) || shadow_mode_translate(d) )
4068 if ( v->arch.paging.shadow.guest_vtable )
4069 sh_unmap_domain_page_global(v->arch.paging.shadow.guest_vtable);
4070 v->arch.paging.shadow.guest_vtable = sh_map_domain_page_global(gmfn);
4071 /* PAGING_LEVELS==4 implies 64-bit, which means that
4072 * map_domain_page_global can't fail */
4073 BUG_ON(v->arch.paging.shadow.guest_vtable == NULL);
4075 else
4076 v->arch.paging.shadow.guest_vtable = __linear_l4_table;
4077 #elif GUEST_PAGING_LEVELS == 3
4078 /* On PAE guests we don't use a mapping of the guest's own top-level
4079 * table. We cache the current state of that table and shadow that,
4080 * until the next CR3 write makes us refresh our cache. */
4081 ASSERT(v->arch.paging.shadow.guest_vtable == NULL);
4083 if ( shadow_mode_external(d) )
4084 /* Find where in the page the l3 table is */
4085 guest_idx = guest_index((void *)v->arch.hvm_vcpu.guest_cr[3]);
4086 else
4087 /* PV guest: l3 is at the start of a page */
4088 guest_idx = 0;
4090 // Ignore the low 2 bits of guest_idx -- they are really just
4091 // cache control.
4092 guest_idx &= ~3;
4094 gl3e = ((guest_l3e_t *)sh_map_domain_page(gmfn)) + guest_idx;
4095 for ( i = 0; i < 4 ; i++ )
4096 v->arch.paging.shadow.gl3e[i] = gl3e[i];
4097 sh_unmap_domain_page(gl3e);
4098 #elif GUEST_PAGING_LEVELS == 2
4099 if ( shadow_mode_external(d) || shadow_mode_translate(d) )
4101 if ( v->arch.paging.shadow.guest_vtable )
4102 sh_unmap_domain_page_global(v->arch.paging.shadow.guest_vtable);
4103 v->arch.paging.shadow.guest_vtable = sh_map_domain_page_global(gmfn);
4104 /* Does this really need map_domain_page_global? Handle the
4105 * error properly if so. */
4106 BUG_ON(v->arch.paging.shadow.guest_vtable == NULL); /* XXX */
4108 else
4109 v->arch.paging.shadow.guest_vtable = __linear_l2_table;
4110 #else
4111 #error this should never happen
4112 #endif
4115 ////
4116 //// vcpu->arch.shadow_table[]
4117 ////
4119 /* We revoke write access to the new guest toplevel page(s) before we
4120 * replace the old shadow pagetable(s), so that we can safely use the
4121 * (old) shadow linear maps in the writeable mapping heuristics. */
4122 #if GUEST_PAGING_LEVELS == 2
4123 if ( sh_remove_write_access(v, gmfn, 2, 0) != 0 )
4124 flush_tlb_mask(v->domain->domain_dirty_cpumask);
4125 sh_set_toplevel_shadow(v, 0, gmfn, SH_type_l2_shadow);
4126 #elif GUEST_PAGING_LEVELS == 3
4127 /* PAE guests have four shadow_table entries, based on the
4128 * current values of the guest's four l3es. */
4130 int flush = 0;
4131 gfn_t gl2gfn;
4132 mfn_t gl2mfn;
4133 p2m_type_t p2mt;
4134 guest_l3e_t *gl3e = (guest_l3e_t*)&v->arch.paging.shadow.gl3e;
4135 /* First, make all four entries read-only. */
4136 for ( i = 0; i < 4; i++ )
4138 if ( guest_l3e_get_flags(gl3e[i]) & _PAGE_PRESENT )
4140 gl2gfn = guest_l3e_get_gfn(gl3e[i]);
4141 gl2mfn = gfn_to_mfn_query(d, gl2gfn, &p2mt);
4142 if ( p2m_is_ram(p2mt) )
4143 flush |= sh_remove_write_access(v, gl2mfn, 2, 0);
4146 if ( flush )
4147 flush_tlb_mask(v->domain->domain_dirty_cpumask);
4148 /* Now install the new shadows. */
4149 for ( i = 0; i < 4; i++ )
4151 if ( guest_l3e_get_flags(gl3e[i]) & _PAGE_PRESENT )
4153 gl2gfn = guest_l3e_get_gfn(gl3e[i]);
4154 gl2mfn = gfn_to_mfn_query(d, gl2gfn, &p2mt);
4155 if ( p2m_is_ram(p2mt) )
4156 sh_set_toplevel_shadow(v, i, gl2mfn, (i == 3)
4157 ? SH_type_l2h_shadow
4158 : SH_type_l2_shadow);
4159 else
4160 sh_set_toplevel_shadow(v, i, _mfn(INVALID_MFN), 0);
4162 else
4163 sh_set_toplevel_shadow(v, i, _mfn(INVALID_MFN), 0);
4166 #elif GUEST_PAGING_LEVELS == 4
4167 if ( sh_remove_write_access(v, gmfn, 4, 0) != 0 )
4168 flush_tlb_mask(v->domain->domain_dirty_cpumask);
4169 sh_set_toplevel_shadow(v, 0, gmfn, SH_type_l4_shadow);
4170 #else
4171 #error This should never happen
4172 #endif
4175 ///
4176 /// v->arch.paging.shadow.l3table
4177 ///
4178 #if SHADOW_PAGING_LEVELS == 3
4180 mfn_t smfn;
4181 int i;
4182 for ( i = 0; i < 4; i++ )
4184 #if GUEST_PAGING_LEVELS == 2
4185 /* 2-on-3: make a PAE l3 that points at the four-page l2 */
4186 smfn = _mfn(pagetable_get_pfn(v->arch.shadow_table[0]) + i);
4187 #else
4188 /* 3-on-3: make a PAE l3 that points at the four l2 pages */
4189 smfn = pagetable_get_mfn(v->arch.shadow_table[i]);
4190 #endif
4191 v->arch.paging.shadow.l3table[i] =
4192 (mfn_x(smfn) == 0)
4193 ? shadow_l3e_empty()
4194 : shadow_l3e_from_mfn(smfn, _PAGE_PRESENT);
4197 #endif /* SHADOW_PAGING_LEVELS == 3 */
4200 ///
4201 /// v->arch.cr3
4202 ///
4203 if ( shadow_mode_external(d) )
4205 make_cr3(v, pagetable_get_pfn(v->arch.monitor_table));
4207 else // not shadow_mode_external...
4209 /* We don't support PV except guest == shadow == config levels */
4210 BUG_ON(GUEST_PAGING_LEVELS != SHADOW_PAGING_LEVELS);
4211 #if SHADOW_PAGING_LEVELS == 3
4212 /* 2-on-3 or 3-on-3: Use the PAE shadow l3 table we just fabricated.
4213 * Don't use make_cr3 because (a) we know it's below 4GB, and
4214 * (b) it's not necessarily page-aligned, and make_cr3 takes a pfn */
4215 ASSERT(virt_to_maddr(&v->arch.paging.shadow.l3table) <= 0xffffffe0ULL);
4216 v->arch.cr3 = virt_to_maddr(&v->arch.paging.shadow.l3table);
4217 #else
4218 /* 4-on-4: Just use the shadow top-level directly */
4219 make_cr3(v, pagetable_get_pfn(v->arch.shadow_table[0]));
4220 #endif
4224 ///
4225 /// v->arch.hvm_vcpu.hw_cr[3]
4226 ///
4227 if ( shadow_mode_external(d) )
4229 ASSERT(is_hvm_domain(d));
4230 #if SHADOW_PAGING_LEVELS == 3
4231 /* 2-on-3 or 3-on-3: Use the PAE shadow l3 table we just fabricated */
4232 v->arch.hvm_vcpu.hw_cr[3] =
4233 virt_to_maddr(&v->arch.paging.shadow.l3table);
4234 #else
4235 /* 4-on-4: Just use the shadow top-level directly */
4236 v->arch.hvm_vcpu.hw_cr[3] =
4237 pagetable_get_paddr(v->arch.shadow_table[0]);
4238 #endif
4239 hvm_update_guest_cr(v, 3);
4242 /* Fix up the linear pagetable mappings */
4243 sh_update_linear_entries(v);
4245 #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
4246 /* No longer safe to use cached gva->gfn translations */
4247 vtlb_flush(v);
4248 #endif
4250 #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
4251 v->arch.paging.last_write_emul_ok = 0;
4252 #endif
4254 /* Release the lock, if we took it (otherwise it's the caller's problem) */
4255 if ( do_locking ) shadow_unlock(v->domain);
4257 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
4258 /* Need to resync all the shadow entries on a TLB flush. We only
4259 * update the shadows, leaving the pages out of sync. Also, we try
4260 * to skip synchronization of shadows not mapped in the new
4261 * tables. */
4262 shadow_sync_other_vcpus(v, do_locking);
4263 #endif
4268 /**************************************************************************/
4269 /* Functions to revoke guest rights */
4271 #if SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC
4272 int sh_rm_write_access_from_sl1p(struct vcpu *v, mfn_t gmfn,
4273 mfn_t smfn, unsigned long off)
4275 int r;
4276 shadow_l1e_t *sl1p, sl1e;
4277 struct page_info *sp;
4279 ASSERT(mfn_valid(gmfn));
4280 ASSERT(mfn_valid(smfn));
4282 sp = mfn_to_page(smfn);
4284 if ( sp->count_info != 0
4285 || (sp->u.sh.type != SH_type_l1_shadow
4286 && sp->u.sh.type != SH_type_fl1_shadow) )
4287 goto fail;
4289 sl1p = sh_map_domain_page(smfn);
4290 sl1p += off;
4291 sl1e = *sl1p;
4292 if ( ((shadow_l1e_get_flags(sl1e) & (_PAGE_PRESENT|_PAGE_RW))
4293 != (_PAGE_PRESENT|_PAGE_RW))
4294 || (mfn_x(shadow_l1e_get_mfn(sl1e)) != mfn_x(gmfn)) )
4296 sh_unmap_domain_page(sl1p);
4297 goto fail;
4300 /* Found it! Need to remove its write permissions. */
4301 sl1e = shadow_l1e_remove_flags(sl1e, _PAGE_RW);
4302 r = shadow_set_l1e(v, sl1p, sl1e, smfn);
4303 ASSERT( !(r & SHADOW_SET_ERROR) );
4305 sh_unmap_domain_page(sl1p);
4306 perfc_incr(shadow_writeable_h_7);
4307 return 1;
4309 fail:
4310 perfc_incr(shadow_writeable_h_8);
4311 return 0;
4313 #endif /* OOS */
4315 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
4316 static int sh_guess_wrmap(struct vcpu *v, unsigned long vaddr, mfn_t gmfn)
4317 /* Look up this vaddr in the current shadow and see if it's a writeable
4318 * mapping of this gmfn. If so, remove it. Returns 1 if it worked. */
4320 shadow_l1e_t sl1e, *sl1p;
4321 shadow_l2e_t *sl2p;
4322 shadow_l3e_t *sl3p;
4323 #if SHADOW_PAGING_LEVELS >= 4
4324 shadow_l4e_t *sl4p;
4325 #endif
4326 mfn_t sl1mfn;
4327 int r;
4329 /* Carefully look in the shadow linear map for the l1e we expect */
4330 #if SHADOW_PAGING_LEVELS >= 4
4331 sl4p = sh_linear_l4_table(v) + shadow_l4_linear_offset(vaddr);
4332 if ( !(shadow_l4e_get_flags(*sl4p) & _PAGE_PRESENT) )
4333 return 0;
4334 sl3p = sh_linear_l3_table(v) + shadow_l3_linear_offset(vaddr);
4335 if ( !(shadow_l3e_get_flags(*sl3p) & _PAGE_PRESENT) )
4336 return 0;
4337 #else /* SHADOW_PAGING_LEVELS == 3 */
4338 sl3p = ((shadow_l3e_t *) v->arch.paging.shadow.l3table)
4339 + shadow_l3_linear_offset(vaddr);
4340 if ( !(shadow_l3e_get_flags(*sl3p) & _PAGE_PRESENT) )
4341 return 0;
4342 #endif
4343 sl2p = sh_linear_l2_table(v) + shadow_l2_linear_offset(vaddr);
4344 if ( !(shadow_l2e_get_flags(*sl2p) & _PAGE_PRESENT) )
4345 return 0;
4346 sl1p = sh_linear_l1_table(v) + shadow_l1_linear_offset(vaddr);
4347 sl1e = *sl1p;
4348 if ( ((shadow_l1e_get_flags(sl1e) & (_PAGE_PRESENT|_PAGE_RW))
4349 != (_PAGE_PRESENT|_PAGE_RW))
4350 || (mfn_x(shadow_l1e_get_mfn(sl1e)) != mfn_x(gmfn)) )
4351 return 0;
4353 /* Found it! Need to remove its write permissions. */
4354 sl1mfn = shadow_l2e_get_mfn(*sl2p);
4355 sl1e = shadow_l1e_remove_flags(sl1e, _PAGE_RW);
4356 r = shadow_set_l1e(v, sl1p, sl1e, sl1mfn);
4357 ASSERT( !(r & SHADOW_SET_ERROR) );
4358 TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_WRMAP_GUESS_FOUND);
4359 return 1;
4361 #endif
4363 int sh_rm_write_access_from_l1(struct vcpu *v, mfn_t sl1mfn,
4364 mfn_t readonly_mfn)
4365 /* Excises all writeable mappings to readonly_mfn from this l1 shadow table */
4367 shadow_l1e_t *sl1e;
4368 int done = 0;
4369 int flags;
4370 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
4371 mfn_t base_sl1mfn = sl1mfn; /* Because sl1mfn changes in the foreach */
4372 #endif
4374 SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, done,
4376 flags = shadow_l1e_get_flags(*sl1e);
4377 if ( (flags & _PAGE_PRESENT)
4378 && (flags & _PAGE_RW)
4379 && (mfn_x(shadow_l1e_get_mfn(*sl1e)) == mfn_x(readonly_mfn)) )
4381 shadow_l1e_t ro_sl1e = shadow_l1e_remove_flags(*sl1e, _PAGE_RW);
4382 (void) shadow_set_l1e(v, sl1e, ro_sl1e, sl1mfn);
4383 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
4384 /* Remember the last shadow in which we shot down a writeable mapping */
4385 v->arch.paging.shadow.last_writeable_pte_smfn = mfn_x(base_sl1mfn);
4386 #endif
4387 if ( (mfn_to_page(readonly_mfn)->u.inuse.type_info
4388 & PGT_count_mask) == 0 )
4389 /* This breaks us cleanly out of the FOREACH macro */
4390 done = 1;
4392 });
4393 return done;
4397 int sh_rm_mappings_from_l1(struct vcpu *v, mfn_t sl1mfn, mfn_t target_mfn)
4398 /* Excises all mappings to guest frame from this shadow l1 table */
4400 shadow_l1e_t *sl1e;
4401 int done = 0;
4402 int flags;
4404 SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, done,
4406 flags = shadow_l1e_get_flags(*sl1e);
4407 if ( (flags & _PAGE_PRESENT)
4408 && (mfn_x(shadow_l1e_get_mfn(*sl1e)) == mfn_x(target_mfn)) )
4410 (void) shadow_set_l1e(v, sl1e, shadow_l1e_empty(), sl1mfn);
4411 if ( (mfn_to_page(target_mfn)->count_info & PGC_count_mask) == 0 )
4412 /* This breaks us cleanly out of the FOREACH macro */
4413 done = 1;
4415 });
4416 return done;
4419 /**************************************************************************/
4420 /* Functions to excise all pointers to shadows from higher-level shadows. */
4422 void sh_clear_shadow_entry(struct vcpu *v, void *ep, mfn_t smfn)
4423 /* Blank out a single shadow entry */
4425 switch ( mfn_to_page(smfn)->u.sh.type )
4427 case SH_type_l1_shadow:
4428 (void) shadow_set_l1e(v, ep, shadow_l1e_empty(), smfn); break;
4429 case SH_type_l2_shadow:
4430 #if GUEST_PAGING_LEVELS >= 3
4431 case SH_type_l2h_shadow:
4432 #endif
4433 (void) shadow_set_l2e(v, ep, shadow_l2e_empty(), smfn); break;
4434 #if GUEST_PAGING_LEVELS >= 4
4435 case SH_type_l3_shadow:
4436 (void) shadow_set_l3e(v, ep, shadow_l3e_empty(), smfn); break;
4437 case SH_type_l4_shadow:
4438 (void) shadow_set_l4e(v, ep, shadow_l4e_empty(), smfn); break;
4439 #endif
4440 default: BUG(); /* Called with the wrong kind of shadow. */
4444 int sh_remove_l1_shadow(struct vcpu *v, mfn_t sl2mfn, mfn_t sl1mfn)
4445 /* Remove all mappings of this l1 shadow from this l2 shadow */
4447 shadow_l2e_t *sl2e;
4448 int done = 0;
4449 int flags;
4451 SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, done, v->domain,
4453 flags = shadow_l2e_get_flags(*sl2e);
4454 if ( (flags & _PAGE_PRESENT)
4455 && (mfn_x(shadow_l2e_get_mfn(*sl2e)) == mfn_x(sl1mfn)) )
4457 (void) shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn);
4458 if ( mfn_to_page(sl1mfn)->u.sh.type == 0 )
4459 /* This breaks us cleanly out of the FOREACH macro */
4460 done = 1;
4462 });
4463 return done;
4466 #if GUEST_PAGING_LEVELS >= 4
4467 int sh_remove_l2_shadow(struct vcpu *v, mfn_t sl3mfn, mfn_t sl2mfn)
4468 /* Remove all mappings of this l2 shadow from this l3 shadow */
4470 shadow_l3e_t *sl3e;
4471 int done = 0;
4472 int flags;
4474 SHADOW_FOREACH_L3E(sl3mfn, sl3e, 0, done,
4476 flags = shadow_l3e_get_flags(*sl3e);
4477 if ( (flags & _PAGE_PRESENT)
4478 && (mfn_x(shadow_l3e_get_mfn(*sl3e)) == mfn_x(sl2mfn)) )
4480 (void) shadow_set_l3e(v, sl3e, shadow_l3e_empty(), sl3mfn);
4481 if ( mfn_to_page(sl2mfn)->u.sh.type == 0 )
4482 /* This breaks us cleanly out of the FOREACH macro */
4483 done = 1;
4485 });
4486 return done;
4489 int sh_remove_l3_shadow(struct vcpu *v, mfn_t sl4mfn, mfn_t sl3mfn)
4490 /* Remove all mappings of this l3 shadow from this l4 shadow */
4492 shadow_l4e_t *sl4e;
4493 int done = 0;
4494 int flags;
4496 SHADOW_FOREACH_L4E(sl4mfn, sl4e, 0, done, v->domain,
4498 flags = shadow_l4e_get_flags(*sl4e);
4499 if ( (flags & _PAGE_PRESENT)
4500 && (mfn_x(shadow_l4e_get_mfn(*sl4e)) == mfn_x(sl3mfn)) )
4502 (void) shadow_set_l4e(v, sl4e, shadow_l4e_empty(), sl4mfn);
4503 if ( mfn_to_page(sl3mfn)->u.sh.type == 0 )
4504 /* This breaks us cleanly out of the FOREACH macro */
4505 done = 1;
4507 });
4508 return done;
4510 #endif /* 64bit guest */
4512 /**************************************************************************/
4513 /* Handling HVM guest writes to pagetables */
4515 /* Translate a VA to an MFN, injecting a page-fault if we fail */
4516 #define BAD_GVA_TO_GFN (~0UL)
4517 #define BAD_GFN_TO_MFN (~1UL)
4518 #define READONLY_GFN (~2UL)
4519 static mfn_t emulate_gva_to_mfn(struct vcpu *v,
4520 unsigned long vaddr,
4521 struct sh_emulate_ctxt *sh_ctxt)
4523 unsigned long gfn;
4524 mfn_t mfn;
4525 p2m_type_t p2mt;
4526 uint32_t pfec = PFEC_page_present | PFEC_write_access;
4528 /* Translate the VA to a GFN */
4529 gfn = sh_gva_to_gfn(v, vaddr, &pfec);
4530 if ( gfn == INVALID_GFN )
4532 if ( is_hvm_vcpu(v) )
4533 hvm_inject_exception(TRAP_page_fault, pfec, vaddr);
4534 else
4535 propagate_page_fault(vaddr, pfec);
4536 return _mfn(BAD_GVA_TO_GFN);
4539 /* Translate the GFN to an MFN */
4540 /* PoD: query only if shadow lock is held (to avoid deadlock) */
4541 if ( shadow_locked_by_me(v->domain) )
4542 mfn = gfn_to_mfn_query(v->domain, _gfn(gfn), &p2mt);
4543 else
4544 mfn = gfn_to_mfn(v->domain, _gfn(gfn), &p2mt);
4546 if ( p2mt == p2m_ram_ro )
4547 return _mfn(READONLY_GFN);
4548 if ( !p2m_is_ram(p2mt) )
4549 return _mfn(BAD_GFN_TO_MFN);
4551 ASSERT(mfn_valid(mfn));
4552 v->arch.paging.last_write_was_pt = !!sh_mfn_is_a_page_table(mfn);
4553 return mfn;
4556 /* Check that the user is allowed to perform this write.
4557 * Returns a mapped pointer to write to, or NULL for error. */
4558 #define MAPPING_UNHANDLEABLE ((void *)(unsigned long)X86EMUL_UNHANDLEABLE)
4559 #define MAPPING_EXCEPTION ((void *)(unsigned long)X86EMUL_EXCEPTION)
4560 #define MAPPING_SILENT_FAIL ((void *)(unsigned long)X86EMUL_OKAY)
4561 #define emulate_map_dest_failed(rc) ((unsigned long)(rc) <= 3)
4562 static void *emulate_map_dest(struct vcpu *v,
4563 unsigned long vaddr,
4564 u32 bytes,
4565 struct sh_emulate_ctxt *sh_ctxt)
4567 unsigned long offset;
4568 void *map = NULL;
4570 sh_ctxt->mfn1 = emulate_gva_to_mfn(v, vaddr, sh_ctxt);
4571 if ( !mfn_valid(sh_ctxt->mfn1) )
4572 return ((mfn_x(sh_ctxt->mfn1) == BAD_GVA_TO_GFN) ?
4573 MAPPING_EXCEPTION :
4574 (mfn_x(sh_ctxt->mfn1) == READONLY_GFN) ?
4575 MAPPING_SILENT_FAIL : MAPPING_UNHANDLEABLE);
4577 #ifndef NDEBUG
4578 /* We don't emulate user-mode writes to page tables */
4579 if ( hvm_get_seg_reg(x86_seg_ss, sh_ctxt)->attr.fields.dpl == 3 )
4581 gdprintk(XENLOG_DEBUG, "User-mode write to pagetable reached "
4582 "emulate_map_dest(). This should never happen!\n");
4583 return MAPPING_UNHANDLEABLE;
4585 #endif
4587 /* Unaligned writes probably mean this isn't a pagetable */
4588 if ( vaddr & (bytes - 1) )
4589 sh_remove_shadows(v, sh_ctxt->mfn1, 0, 0 /* Slow, can fail */ );
4591 if ( likely(((vaddr + bytes - 1) & PAGE_MASK) == (vaddr & PAGE_MASK)) )
4593 /* Whole write fits on a single page */
4594 sh_ctxt->mfn2 = _mfn(INVALID_MFN);
4595 map = sh_map_domain_page(sh_ctxt->mfn1) + (vaddr & ~PAGE_MASK);
4597 else
4599 /* Cross-page emulated writes are only supported for HVM guests;
4600 * PV guests ought to know better */
4601 if ( !is_hvm_vcpu(v) )
4602 return MAPPING_UNHANDLEABLE;
4604 /* This write crosses a page boundary. Translate the second page */
4605 sh_ctxt->mfn2 = emulate_gva_to_mfn(v, (vaddr + bytes - 1) & PAGE_MASK,
4606 sh_ctxt);
4607 if ( !mfn_valid(sh_ctxt->mfn2) )
4608 return ((mfn_x(sh_ctxt->mfn2) == BAD_GVA_TO_GFN) ?
4609 MAPPING_EXCEPTION :
4610 (mfn_x(sh_ctxt->mfn2) == READONLY_GFN) ?
4611 MAPPING_SILENT_FAIL : MAPPING_UNHANDLEABLE);
4613 /* Cross-page writes probably mean this isn't a pagetable */
4614 sh_remove_shadows(v, sh_ctxt->mfn2, 0, 0 /* Slow, can fail */ );
4616 /* Hack: we map the pages into the vcpu's LDT space, since we
4617 * know that we're not going to need the LDT for HVM guests,
4618 * and only HVM guests are allowed unaligned writes. */
4619 ASSERT(is_hvm_vcpu(v));
4620 map = (void *)LDT_VIRT_START(v);
4621 offset = l1_linear_offset((unsigned long) map);
4622 l1e_write(&__linear_l1_table[offset],
4623 l1e_from_pfn(mfn_x(sh_ctxt->mfn1), __PAGE_HYPERVISOR));
4624 l1e_write(&__linear_l1_table[offset + 1],
4625 l1e_from_pfn(mfn_x(sh_ctxt->mfn2), __PAGE_HYPERVISOR));
4626 flush_tlb_local();
4627 map += (vaddr & ~PAGE_MASK);
4630 #if (SHADOW_OPTIMIZATIONS & SHOPT_SKIP_VERIFY)
4631 /* Remember if the bottom bit was clear, so we can choose not to run
4632 * the change through the verify code if it's still clear afterwards */
4633 sh_ctxt->low_bit_was_clear = map != NULL && !(*(u8 *)map & _PAGE_PRESENT);
4634 #endif
4636 return map;
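/* Sketch (made-up names) of the error-return convention emulate_map_dest()
 * uses above: failures are encoded as tiny integers cast to pointers, so one
 * small-value test, like emulate_map_dest_failed(), tells them apart from
 * any real mapping address before the caller dereferences anything. */
#define DEMO_MAP_UNHANDLEABLE ((void *)1UL)
#define DEMO_MAP_EXCEPTION    ((void *)2UL)
#define demo_map_failed(p)    ((unsigned long)(p) <= 3)

static unsigned char demo_page[4096];

static void *demo_map(int fail)
{
    return fail ? DEMO_MAP_EXCEPTION : (void *)demo_page;
}

static int demo_write_byte(int fail)
{
    void *map = demo_map(fail);
    if ( demo_map_failed(map) )
        return (int)(unsigned long)map;   /* propagate the error code */
    *(unsigned char *)map = 0xff;         /* real pointer: safe to write */
    return 0;
}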
4639 /* Tidy up after the emulated write: mark pages dirty, verify the new
4640 * contents, and undo the mapping */
4641 static void emulate_unmap_dest(struct vcpu *v,
4642 void *addr,
4643 u32 bytes,
4644 struct sh_emulate_ctxt *sh_ctxt)
4646 u32 b1 = bytes, b2 = 0, shflags;
4648 ASSERT(mfn_valid(sh_ctxt->mfn1));
4650 /* If we are writing lots of PTE-aligned zeros, we might want to unshadow */
4651 if ( likely(bytes >= 4)
4652 && (*(u32 *)addr == 0)
4653 && ((unsigned long) addr & ((sizeof (guest_intpte_t)) - 1)) == 0 )
4654 check_for_early_unshadow(v, sh_ctxt->mfn1);
4655 else
4656 reset_early_unshadow(v);
4658 /* We can avoid re-verifying the page contents after the write if:
4659 * - it was no larger than one PTE of this pagetable's type;
4660 * - it was aligned to the PTE boundaries; and
4661 * - _PAGE_PRESENT was clear before and after the write. */
4662 shflags = mfn_to_page(sh_ctxt->mfn1)->shadow_flags;
4663 #if (SHADOW_OPTIMIZATIONS & SHOPT_SKIP_VERIFY)
4664 if ( sh_ctxt->low_bit_was_clear
4665 && !(*(u8 *)addr & _PAGE_PRESENT)
4666 && ((!(shflags & SHF_32)
4667 /* Not shadowed 32-bit: aligned 64-bit writes that leave
4668 * the present bit unset are safe to ignore. */
4669 && ((unsigned long)addr & 7) == 0
4670 && bytes <= 8)
4671 ||
4672 (!(shflags & (SHF_PAE|SHF_64))
4673 /* Not shadowed PAE/64-bit: aligned 32-bit writes that
4674 * leave the present bit unset are safe to ignore. */
4675 && ((unsigned long)addr & 3) == 0
4676 && bytes <= 4)) )
4678 /* Writes with this alignment constraint can't possibly cross pages */
4679 ASSERT(!mfn_valid(sh_ctxt->mfn2));
4681 else
4682 #endif /* SHADOW_OPTIMIZATIONS & SHOPT_SKIP_VERIFY */
4684 if ( unlikely(mfn_valid(sh_ctxt->mfn2)) )
4686 /* Validate as two writes, one to each page */
4687 b1 = PAGE_SIZE - (((unsigned long)addr) & ~PAGE_MASK);
4688 b2 = bytes - b1;
4689 ASSERT(b2 < bytes);
4691 if ( likely(b1 > 0) )
4692 sh_validate_guest_pt_write(v, sh_ctxt->mfn1, addr, b1);
4693 if ( unlikely(b2 > 0) )
4694 sh_validate_guest_pt_write(v, sh_ctxt->mfn2, addr + b1, b2);
4697 paging_mark_dirty(v->domain, mfn_x(sh_ctxt->mfn1));
4699 if ( unlikely(mfn_valid(sh_ctxt->mfn2)) )
4701 unsigned long offset;
4702 paging_mark_dirty(v->domain, mfn_x(sh_ctxt->mfn2));
4703 /* Undo the hacky two-frame contiguous map. */
4704 ASSERT(((unsigned long) addr & PAGE_MASK) == LDT_VIRT_START(v));
4705 offset = l1_linear_offset((unsigned long) addr);
4706 l1e_write(&__linear_l1_table[offset], l1e_empty());
4707 l1e_write(&__linear_l1_table[offset + 1], l1e_empty());
4708 flush_tlb_all();
4710 else
4711 sh_unmap_domain_page(addr);
4713 atomic_inc(&v->domain->arch.paging.shadow.gtable_dirty_version);
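/* Sketch of the page-boundary arithmetic used by emulate_map_dest() and
 * emulate_unmap_dest() above: a write of 'bytes' starting at 'vaddr' crosses
 * a page iff its first and last bytes land in different pages, in which case
 * it splits into b1 bytes on the first page and b2 bytes on the second.
 * The DEMO_* constants stand in for Xen's PAGE_SIZE/PAGE_MASK. */
#define DEMO_PAGE_SIZE 4096UL
#define DEMO_PAGE_MASK (~(DEMO_PAGE_SIZE - 1))

static int demo_split_write(unsigned long vaddr, unsigned long bytes,
                            unsigned long *b1, unsigned long *b2)
{
    if ( ((vaddr + bytes - 1) & DEMO_PAGE_MASK) == (vaddr & DEMO_PAGE_MASK) )
    {
        *b1 = bytes;
        *b2 = 0;                       /* fits entirely within one page */
        return 0;
    }
    *b1 = DEMO_PAGE_SIZE - (vaddr & ~DEMO_PAGE_MASK);  /* bytes on page one */
    *b2 = bytes - *b1;                                 /* remainder on page two */
    return 1;                          /* the write crosses a page boundary */
}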
4716 static int
4717 sh_x86_emulate_write(struct vcpu *v, unsigned long vaddr, void *src,
4718 u32 bytes, struct sh_emulate_ctxt *sh_ctxt)
4720 void *addr;
4722 /* Unaligned writes are only acceptable on HVM */
4723 if ( (vaddr & (bytes - 1)) && !is_hvm_vcpu(v) )
4724 return X86EMUL_UNHANDLEABLE;
4726 addr = emulate_map_dest(v, vaddr, bytes, sh_ctxt);
4727 if ( emulate_map_dest_failed(addr) )
4728 return (long)addr;
4730 shadow_lock(v->domain);
4731 memcpy(addr, src, bytes);
4733 if ( tb_init_done )
4735 #if GUEST_PAGING_LEVELS == 3
4736 if ( vaddr == this_cpu(trace_emulate_initial_va) )
4737 memcpy(&this_cpu(trace_emulate_write_val), src, bytes);
4738 else if ( (vaddr & ~(0x7UL)) == this_cpu(trace_emulate_initial_va) )
4740 TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_EMULATE_FULL_PT);
4741 memcpy(&this_cpu(trace_emulate_write_val),
4742 (void *)(((unsigned long) addr) & ~(0x7UL)), GUEST_PTE_SIZE);
4744 #else
4745 memcpy(&this_cpu(trace_emulate_write_val), src, bytes);
4746 #endif
4749 emulate_unmap_dest(v, addr, bytes, sh_ctxt);
4750 shadow_audit_tables(v);
4751 shadow_unlock(v->domain);
4752 return X86EMUL_OKAY;
4755 static int
4756 sh_x86_emulate_cmpxchg(struct vcpu *v, unsigned long vaddr,
4757 unsigned long old, unsigned long new,
4758 unsigned int bytes, struct sh_emulate_ctxt *sh_ctxt)
4760 void *addr;
4761 unsigned long prev;
4762 int rv = X86EMUL_OKAY;
4764 /* Unaligned writes are only acceptable on HVM */
4765 if ( (vaddr & (bytes - 1)) && !is_hvm_vcpu(v) )
4766 return X86EMUL_UNHANDLEABLE;
4768 addr = emulate_map_dest(v, vaddr, bytes, sh_ctxt);
4769 if ( emulate_map_dest_failed(addr) )
4770 return (long)addr;
4772 shadow_lock(v->domain);
4773 switch ( bytes )
4775 case 1: prev = cmpxchg(((u8 *)addr), old, new); break;
4776 case 2: prev = cmpxchg(((u16 *)addr), old, new); break;
4777 case 4: prev = cmpxchg(((u32 *)addr), old, new); break;
4778 case 8: prev = cmpxchg(((u64 *)addr), old, new); break;
4779 default:
4780 SHADOW_PRINTK("cmpxchg of size %i is not supported\n", bytes);
4781 prev = ~old;
4784 if ( prev != old )
4785 rv = X86EMUL_CMPXCHG_FAILED;
4787 SHADOW_DEBUG(EMULATE, "va %#lx was %#lx expected %#lx"
4788 " wanted %#lx now %#lx bytes %u\n",
4789 vaddr, prev, old, new, *(unsigned long *)addr, bytes);
4791 emulate_unmap_dest(v, addr, bytes, sh_ctxt);
4792 shadow_audit_tables(v);
4793 shadow_unlock(v->domain);
4794 return rv;
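/* A hedged sketch of the compare-and-swap step performed above for one
 * operand size, using the GCC/Clang __atomic builtin instead of Xen's
 * cmpxchg() macro.  As in sh_x86_emulate_cmpxchg(), the emulated operation
 * fails when the value found in memory differs from the value the guest
 * expected. */
#include <stdint.h>

static int demo_emulated_cmpxchg32(uint32_t *addr, uint32_t old, uint32_t new_val)
{
    uint32_t expected = old;
    /* On success *addr now holds new_val; on failure 'expected' is updated
     * to the value that was actually found (the role of 'prev' above). */
    if ( __atomic_compare_exchange_n(addr, &expected, new_val, 0,
                                     __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST) )
        return 0;    /* analogous to X86EMUL_OKAY */
    return 1;        /* analogous to X86EMUL_CMPXCHG_FAILED */
}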
4797 #ifdef __i386__
4798 static int
4799 sh_x86_emulate_cmpxchg8b(struct vcpu *v, unsigned long vaddr,
4800 unsigned long old_lo, unsigned long old_hi,
4801 unsigned long new_lo, unsigned long new_hi,
4802 struct sh_emulate_ctxt *sh_ctxt)
4804 void *addr;
4805 u64 old, new, prev;
4806 int rv = X86EMUL_OKAY;
4808 /* Unaligned writes are only acceptable on HVM */
4809 if ( (vaddr & 7) && !is_hvm_vcpu(v) )
4810 return X86EMUL_UNHANDLEABLE;
4812 addr = emulate_map_dest(v, vaddr, 8, sh_ctxt);
4813 if ( emulate_map_dest_failed(addr) )
4814 return (long)addr;
4816 old = (((u64) old_hi) << 32) | (u64) old_lo;
4817 new = (((u64) new_hi) << 32) | (u64) new_lo;
4819 shadow_lock(v->domain);
4820 prev = cmpxchg(((u64 *)addr), old, new);
4822 if ( prev != old )
4823 rv = X86EMUL_CMPXCHG_FAILED;
4825 emulate_unmap_dest(v, addr, 8, sh_ctxt);
4826 shadow_audit_tables(v);
4827 shadow_unlock(v->domain);
4828 return rv;
4830 #endif
4832 /**************************************************************************/
4833 /* Audit tools */
4835 #if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES
4837 #define AUDIT_FAIL(_level, _fmt, _a...) do { \
4838 printk("Shadow %u-on-%u audit failed at level %i, index %i\n" \
4839 "gl" #_level "mfn = %" PRI_mfn \
4840 " sl" #_level "mfn = %" PRI_mfn \
4841 " &gl" #_level "e = %p &sl" #_level "e = %p" \
4842 " gl" #_level "e = %" SH_PRI_gpte \
4843 " sl" #_level "e = %" SH_PRI_pte "\nError: " _fmt "\n", \
4844 GUEST_PAGING_LEVELS, SHADOW_PAGING_LEVELS, \
4845 _level, guest_index(gl ## _level ## e), \
4846 mfn_x(gl ## _level ## mfn), mfn_x(sl ## _level ## mfn), \
4847 gl ## _level ## e, sl ## _level ## e, \
4848 gl ## _level ## e->l ## _level, sl ## _level ## e->l ## _level, \
4849 ##_a); \
4850 BUG(); \
4851 done = 1; \
4852 } while (0)
4854 #define AUDIT_FAIL_MIN(_level, _fmt, _a...) do { \
4855 printk("Shadow %u-on-%u audit failed at level %i\n" \
4856 "gl" #_level "mfn = %" PRI_mfn \
4857 " sl" #_level "mfn = %" PRI_mfn \
4858 " Error: " _fmt "\n", \
4859 GUEST_PAGING_LEVELS, SHADOW_PAGING_LEVELS, \
4860 _level, \
4861 mfn_x(gl ## _level ## mfn), mfn_x(sl ## _level ## mfn), \
4862 ##_a); \
4863 BUG(); \
4864 done = 1; \
4865 } while (0)
4867 static char * sh_audit_flags(struct vcpu *v, int level,
4868 int gflags, int sflags)
4869 /* Common code for auditing flag bits */
4871 if ( (sflags & _PAGE_PRESENT) && !(gflags & _PAGE_PRESENT) )
4872 return "shadow is present but guest is not present";
4873 if ( (sflags & _PAGE_GLOBAL) && !is_hvm_vcpu(v) )
4874 return "global bit set in PV shadow";
4875 if ( level == 2 && (sflags & _PAGE_PSE) )
4876 return "PS bit set in shadow";
4877 #if SHADOW_PAGING_LEVELS == 3
4878 if ( level == 3 ) return NULL; /* All the other bits are blank in a PAE l3 */
4879 #endif
4880 if ( (sflags & _PAGE_PRESENT) && !(gflags & _PAGE_ACCESSED) )
4881 return "accessed bit not propagated";
4882 if ( (level == 1 || (level == 2 && (gflags & _PAGE_PSE)))
4883 && ((sflags & _PAGE_RW) && !(gflags & _PAGE_DIRTY)) )
4884 return "dirty bit not propagated";
4885 if ( (sflags & _PAGE_USER) != (gflags & _PAGE_USER) )
4886 return "user/supervisor bit does not match";
4887 if ( (sflags & _PAGE_NX_BIT) != (gflags & _PAGE_NX_BIT) )
4888 return "NX bit does not match";
4889 if ( (sflags & _PAGE_RW) && !(gflags & _PAGE_RW) )
4890 return "shadow grants write access but guest does not";
4891 return NULL;
4894 int sh_audit_l1_table(struct vcpu *v, mfn_t sl1mfn, mfn_t x)
4896 guest_l1e_t *gl1e, *gp;
4897 shadow_l1e_t *sl1e;
4898 mfn_t mfn, gmfn, gl1mfn;
4899 gfn_t gfn;
4900 p2m_type_t p2mt;
4901 char *s;
4902 int done = 0;
4904 /* Follow the backpointer */
4905 gl1mfn = _mfn(mfn_to_page(sl1mfn)->v.sh.back);
4907 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
4908 /* Out-of-sync l1 shadows can contain anything: just check the OOS hash */
4909 if ( page_is_out_of_sync(mfn_to_page(gl1mfn)) )
4911 oos_audit_hash_is_present(v->domain, gl1mfn);
4912 return 0;
4914 #endif
4916 gl1e = gp = sh_map_domain_page(gl1mfn);
4917 SHADOW_FOREACH_L1E(sl1mfn, sl1e, &gl1e, done, {
4919 if ( sh_l1e_is_magic(*sl1e) )
4921 #if (SHADOW_OPTIMIZATIONS & SHOPT_FAST_FAULT_PATH)
4922 if ( sh_l1e_is_gnp(*sl1e) )
4924 if ( guest_l1e_get_flags(*gl1e) & _PAGE_PRESENT )
4925 AUDIT_FAIL(1, "shadow is GNP magic but guest is present");
4927 else
4929 ASSERT(sh_l1e_is_mmio(*sl1e));
4930 gfn = sh_l1e_mmio_get_gfn(*sl1e);
4931 if ( gfn_x(gfn) != gfn_x(guest_l1e_get_gfn(*gl1e)) )
4932 AUDIT_FAIL(1, "shadow MMIO gfn is %" SH_PRI_gfn
4933 " but guest gfn is %" SH_PRI_gfn,
4934 gfn_x(gfn),
4935 gfn_x(guest_l1e_get_gfn(*gl1e)));
4937 #endif
4939 else
4941 s = sh_audit_flags(v, 1, guest_l1e_get_flags(*gl1e),
4942 shadow_l1e_get_flags(*sl1e));
4943 if ( s ) AUDIT_FAIL(1, "%s", s);
4945 if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS )
4947 gfn = guest_l1e_get_gfn(*gl1e);
4948 mfn = shadow_l1e_get_mfn(*sl1e);
4949 gmfn = gfn_to_mfn_query(v->domain, gfn, &p2mt);
4950 if ( mfn_x(gmfn) != mfn_x(mfn) )
4951 AUDIT_FAIL(1, "bad translation: gfn %" SH_PRI_gfn
4952 " --> %" PRI_mfn " != mfn %" PRI_mfn,
4953 gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn));
4956 });
4957 sh_unmap_domain_page(gp);
4958 return done;
4961 int sh_audit_fl1_table(struct vcpu *v, mfn_t sl1mfn, mfn_t x)
4963 guest_l1e_t *gl1e, e;
4964 shadow_l1e_t *sl1e;
4965 mfn_t gl1mfn = _mfn(INVALID_MFN);
4966 int f;
4967 int done = 0;
4969 /* fl1 has no useful backpointer: all we can check are flags */
4970 e = guest_l1e_from_gfn(_gfn(0), 0); gl1e = &e; /* Needed for macro */
4971 SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, done, {
4972 f = shadow_l1e_get_flags(*sl1e);
4973 f &= ~(_PAGE_AVAIL0|_PAGE_AVAIL1|_PAGE_AVAIL2);
4974 if ( !(f == 0
4975 || f == (_PAGE_PRESENT|_PAGE_USER|_PAGE_RW|
4976 _PAGE_ACCESSED|_PAGE_DIRTY)
4977 || f == (_PAGE_PRESENT|_PAGE_USER|_PAGE_ACCESSED|_PAGE_DIRTY)
4978 || sh_l1e_is_magic(*sl1e)) )
4979 AUDIT_FAIL(1, "fl1e has bad flags");
4980 });
4981 return 0;
4984 int sh_audit_l2_table(struct vcpu *v, mfn_t sl2mfn, mfn_t x)
4986 guest_l2e_t *gl2e, *gp;
4987 shadow_l2e_t *sl2e;
4988 mfn_t mfn, gmfn, gl2mfn;
4989 gfn_t gfn;
4990 p2m_type_t p2mt;
4991 char *s;
4992 int done = 0;
4994 /* Follow the backpointer */
4995 gl2mfn = _mfn(mfn_to_page(sl2mfn)->v.sh.back);
4997 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
4998 /* Only L1's may be out of sync. */
4999 if ( page_is_out_of_sync(mfn_to_page(gl2mfn)) )
5000 AUDIT_FAIL_MIN(2, "gmfn %lx is out of sync", mfn_x(gl2mfn));
5001 #endif
5003 gl2e = gp = sh_map_domain_page(gl2mfn);
5004 SHADOW_FOREACH_L2E(sl2mfn, sl2e, &gl2e, done, v->domain, {
5006 s = sh_audit_flags(v, 2, guest_l2e_get_flags(*gl2e),
5007 shadow_l2e_get_flags(*sl2e));
5008 if ( s ) AUDIT_FAIL(2, "%s", s);
5010 if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS )
5012 gfn = guest_l2e_get_gfn(*gl2e);
5013 mfn = shadow_l2e_get_mfn(*sl2e);
5014 gmfn = (guest_l2e_get_flags(*gl2e) & _PAGE_PSE)
5015 ? get_fl1_shadow_status(v, gfn)
5016 : get_shadow_status(v, gfn_to_mfn_query(v->domain, gfn, &p2mt),
5017 SH_type_l1_shadow);
5018 if ( mfn_x(gmfn) != mfn_x(mfn) )
5019 AUDIT_FAIL(2, "bad translation: gfn %" SH_PRI_gfn
5020 " (--> %" PRI_mfn ")"
5021 " --> %" PRI_mfn " != mfn %" PRI_mfn,
5022 gfn_x(gfn),
5023 (guest_l2e_get_flags(*gl2e) & _PAGE_PSE) ? 0
5024 : mfn_x(gfn_to_mfn_query(v->domain, gfn, &p2mt)),
5025 mfn_x(gmfn), mfn_x(mfn));
5027 });
5028 sh_unmap_domain_page(gp);
5029 return 0;
5032 #if GUEST_PAGING_LEVELS >= 4
5033 int sh_audit_l3_table(struct vcpu *v, mfn_t sl3mfn, mfn_t x)
5035 guest_l3e_t *gl3e, *gp;
5036 shadow_l3e_t *sl3e;
5037 mfn_t mfn, gmfn, gl3mfn;
5038 gfn_t gfn;
5039 p2m_type_t p2mt;
5040 char *s;
5041 int done = 0;
5043 /* Follow the backpointer */
5044 gl3mfn = _mfn(mfn_to_page(sl3mfn)->v.sh.back);
5046 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
5047 /* Only L1's may be out of sync. */
5048 if ( page_is_out_of_sync(mfn_to_page(gl3mfn)) )
5049 AUDIT_FAIL_MIN(3, "gmfn %lx is out of sync", mfn_x(gl3mfn));
5050 #endif
5052 gl3e = gp = sh_map_domain_page(gl3mfn);
5053 SHADOW_FOREACH_L3E(sl3mfn, sl3e, &gl3e, done, {
5055 s = sh_audit_flags(v, 3, guest_l3e_get_flags(*gl3e),
5056 shadow_l3e_get_flags(*sl3e));
5057 if ( s ) AUDIT_FAIL(3, "%s", s);
5059 if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS )
5061 gfn = guest_l3e_get_gfn(*gl3e);
5062 mfn = shadow_l3e_get_mfn(*sl3e);
5063 gmfn = get_shadow_status(v, gfn_to_mfn_query(v->domain, gfn, &p2mt),
5064 ((GUEST_PAGING_LEVELS == 3 ||
5065 is_pv_32on64_vcpu(v))
5066 && !shadow_mode_external(v->domain)
5067 && (guest_index(gl3e) % 4) == 3)
5068 ? SH_type_l2h_shadow
5069 : SH_type_l2_shadow);
5070 if ( mfn_x(gmfn) != mfn_x(mfn) )
5071 AUDIT_FAIL(3, "bad translation: gfn %" SH_PRI_gfn
5072 " --> %" PRI_mfn " != mfn %" PRI_mfn,
5073 gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn));
5075 });
5076 sh_unmap_domain_page(gp);
5077 return 0;
5080 int sh_audit_l4_table(struct vcpu *v, mfn_t sl4mfn, mfn_t x)
5082 guest_l4e_t *gl4e, *gp;
5083 shadow_l4e_t *sl4e;
5084 mfn_t mfn, gmfn, gl4mfn;
5085 gfn_t gfn;
5086 p2m_type_t p2mt;
5087 char *s;
5088 int done = 0;
5090 /* Follow the backpointer */
5091 gl4mfn = _mfn(mfn_to_page(sl4mfn)->v.sh.back);
5093 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
5094 /* Only L1's may be out of sync. */
5095 if ( page_is_out_of_sync(mfn_to_page(gl4mfn)) )
5096 AUDIT_FAIL_MIN(4, "gmfn %lx is out of sync", mfn_x(gl4mfn));
5097 #endif
5099 gl4e = gp = sh_map_domain_page(gl4mfn);
5100 SHADOW_FOREACH_L4E(sl4mfn, sl4e, &gl4e, done, v->domain,
5102 s = sh_audit_flags(v, 4, guest_l4e_get_flags(*gl4e),
5103 shadow_l4e_get_flags(*sl4e));
5104 if ( s ) AUDIT_FAIL(4, "%s", s);
5106 if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS )
5108 gfn = guest_l4e_get_gfn(*gl4e);
5109 mfn = shadow_l4e_get_mfn(*sl4e);
5110 gmfn = get_shadow_status(v, gfn_to_mfn_query(v->domain, gfn, &p2mt),
5111 SH_type_l3_shadow);
5112 if ( mfn_x(gmfn) != mfn_x(mfn) )
5113 AUDIT_FAIL(4, "bad translation: gfn %" SH_PRI_gfn
5114 " --> %" PRI_mfn " != mfn %" PRI_mfn,
5115 gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn));
5117 });
5118 sh_unmap_domain_page(gp);
5119 return 0;
5121 #endif /* GUEST_PAGING_LEVELS >= 4 */
5124 #undef AUDIT_FAIL
5126 #endif /* Audit code */
5128 /**************************************************************************/
5129 /* Entry points into this mode of the shadow code.
5130 * This will all be mangled by the preprocessor to uniquify everything. */
5131 struct paging_mode sh_paging_mode = {
5132 .page_fault = sh_page_fault,
5133 .invlpg = sh_invlpg,
5134 .gva_to_gfn = sh_gva_to_gfn,
5135 .update_cr3 = sh_update_cr3,
5136 .update_paging_modes = shadow_update_paging_modes,
5137 .write_p2m_entry = shadow_write_p2m_entry,
5138 .write_guest_entry = shadow_write_guest_entry,
5139 .cmpxchg_guest_entry = shadow_cmpxchg_guest_entry,
5140 .guest_map_l1e = sh_guest_map_l1e,
5141 .guest_get_eff_l1e = sh_guest_get_eff_l1e,
5142 .guest_levels = GUEST_PAGING_LEVELS,
5143 .shadow.detach_old_tables = sh_detach_old_tables,
5144 .shadow.x86_emulate_write = sh_x86_emulate_write,
5145 .shadow.x86_emulate_cmpxchg = sh_x86_emulate_cmpxchg,
5146 #ifdef __i386__
5147 .shadow.x86_emulate_cmpxchg8b = sh_x86_emulate_cmpxchg8b,
5148 #endif
5149 .shadow.make_monitor_table = sh_make_monitor_table,
5150 .shadow.destroy_monitor_table = sh_destroy_monitor_table,
5151 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
5152 .shadow.guess_wrmap = sh_guess_wrmap,
5153 #endif
5154 .shadow.shadow_levels = SHADOW_PAGING_LEVELS,
5155 };
5157 /*
5158 * Local variables:
5159 * mode: C
5160 * c-set-style: "BSD"
5161 * c-basic-offset: 4
5162 * indent-tabs-mode: nil
5163 * End:
5164 */