ia64/xen-unstable

view xen/arch/x86/mm/shadow/multi.c @ 18837:a558165cfead

shadow: set fast_emul to zero when emulating to an out-of-sync page.

A small missing thing on the original out of sync patch.
No real bug, but it's better to correctly specify the path.

Signed-off-by: Gianluca Guida <gianluca.guida@eu.citrix.com>
author Keir Fraser <keir.fraser@citrix.com>
date Thu Nov 27 11:21:19 2008 +0000 (2008-11-27)
parents 285f8635f573
children 9be2fe3de567
line source
1 /******************************************************************************
2 * arch/x86/mm/shadow/multi.c
3 *
4 * Simple, mostly-synchronous shadow page tables.
5 * Parts of this code are Copyright (c) 2006 by XenSource Inc.
6 * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
7 * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 */
24 #include <xen/config.h>
25 #include <xen/types.h>
26 #include <xen/mm.h>
27 #include <xen/trace.h>
28 #include <xen/sched.h>
29 #include <xen/perfc.h>
30 #include <xen/domain_page.h>
31 #include <asm/page.h>
32 #include <asm/current.h>
33 #include <asm/shadow.h>
34 #include <asm/flushtlb.h>
35 #include <asm/hvm/hvm.h>
36 #include <asm/hvm/cacheattr.h>
37 #include <asm/mtrr.h>
38 #include <asm/guest_pt.h>
39 #include "private.h"
40 #include "types.h"
42 /* THINGS TO DO LATER:
43 *
44 * TEARDOWN HEURISTICS
45 * Also: have a heuristic for when to destroy a previous paging-mode's
46 * shadows. When a guest is done with its start-of-day 32-bit tables
47 * and reuses the memory we want to drop those shadows. Start with
48 * shadows in a page in two modes as a hint, but beware of clever tricks
49 * like reusing a pagetable for both PAE and 64-bit during boot...
50 *
51 * PAE LINEAR MAPS
52 * Rework shadow_get_l*e() to have the option of using map_domain_page()
53 * instead of linear maps. Add appropriate unmap_l*e calls in the users.
54 * Then we can test the speed difference made by linear maps. If the
55 * map_domain_page() version is OK on PAE, we could maybe allow a lightweight
56 * l3-and-l2h-only shadow mode for PAE PV guests that would allow them
57 * to share l2h pages again.
58 *
59 * PSE disabled / PSE36
60 * We don't support any modes other than PSE enabled, PSE36 disabled.
61 * Neither of those would be hard to change, but we'd need to be able to
62 * deal with shadows made in one mode and used in another.
63 */
65 #define FETCH_TYPE_PREFETCH 1
66 #define FETCH_TYPE_DEMAND 2
67 #define FETCH_TYPE_WRITE 4
68 typedef enum {
69 ft_prefetch = FETCH_TYPE_PREFETCH,
70 ft_demand_read = FETCH_TYPE_DEMAND,
71 ft_demand_write = FETCH_TYPE_DEMAND | FETCH_TYPE_WRITE,
72 } fetch_type_t;
74 #ifdef DEBUG_TRACE_DUMP
75 static char *fetch_type_names[] = {
76 [ft_prefetch] "prefetch",
77 [ft_demand_read] "demand read",
78 [ft_demand_write] "demand write",
79 };
80 #endif
82 /**************************************************************************/
83 /* Hash table mapping from guest pagetables to shadows
84 *
85 * Normal case: maps the mfn of a guest page to the mfn of its shadow page.
86 * FL1's: maps the *gfn* of the start of a superpage to the mfn of a
87 * shadow L1 which maps its "splinters".
88 */
90 static inline mfn_t
91 get_fl1_shadow_status(struct vcpu *v, gfn_t gfn)
92 /* Look for FL1 shadows in the hash table */
93 {
94 mfn_t smfn = shadow_hash_lookup(v, gfn_x(gfn), SH_type_fl1_shadow);
95 return smfn;
96 }
98 static inline mfn_t
99 get_shadow_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type)
100 /* Look for shadows in the hash table */
101 {
102 mfn_t smfn = shadow_hash_lookup(v, mfn_x(gmfn), shadow_type);
103 perfc_incr(shadow_get_shadow_status);
104 return smfn;
105 }
107 static inline void
108 set_fl1_shadow_status(struct vcpu *v, gfn_t gfn, mfn_t smfn)
109 /* Put an FL1 shadow into the hash table */
110 {
111 SHADOW_PRINTK("gfn=%"SH_PRI_gfn", type=%08x, smfn=%05lx\n",
112 gfn_x(gfn), SH_type_fl1_shadow, mfn_x(smfn));
114 shadow_hash_insert(v, gfn_x(gfn), SH_type_fl1_shadow, smfn);
115 }
117 static inline void
118 set_shadow_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type, mfn_t smfn)
119 /* Put a shadow into the hash table */
120 {
121 struct domain *d = v->domain;
122 int res;
124 SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx, type=%08x, smfn=%05lx\n",
125 d->domain_id, v->vcpu_id, mfn_x(gmfn),
126 shadow_type, mfn_x(smfn));
128 /* 32-on-64 PV guests don't own their l4 pages so can't get_page them */
129 if ( !is_pv_32on64_vcpu(v) || shadow_type != SH_type_l4_64_shadow )
130 {
131 res = get_page(mfn_to_page(gmfn), d);
132 ASSERT(res == 1);
133 }
135 shadow_hash_insert(v, mfn_x(gmfn), shadow_type, smfn);
136 }
138 static inline void
139 delete_fl1_shadow_status(struct vcpu *v, gfn_t gfn, mfn_t smfn)
140 /* Remove a shadow from the hash table */
141 {
142 SHADOW_PRINTK("gfn=%"SH_PRI_gfn", type=%08x, smfn=%05lx\n",
143 gfn_x(gfn), SH_type_fl1_shadow, mfn_x(smfn));
144 shadow_hash_delete(v, gfn_x(gfn), SH_type_fl1_shadow, smfn);
145 }
147 static inline void
148 delete_shadow_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type, mfn_t smfn)
149 /* Remove a shadow from the hash table */
150 {
151 SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx, type=%08x, smfn=%05lx\n",
152 v->domain->domain_id, v->vcpu_id,
153 mfn_x(gmfn), shadow_type, mfn_x(smfn));
154 shadow_hash_delete(v, mfn_x(gmfn), shadow_type, smfn);
155 /* 32-on-64 PV guests don't own their l4 pages; see set_shadow_status */
156 if ( !is_pv_32on64_vcpu(v) || shadow_type != SH_type_l4_64_shadow )
157 put_page(mfn_to_page(gmfn));
158 }
161 /**************************************************************************/
162 /* Functions for walking the guest page tables */
164 static inline uint32_t
165 sh_walk_guest_tables(struct vcpu *v, unsigned long va, walk_t *gw,
166 uint32_t pfec)
167 {
168 return guest_walk_tables(v, va, gw, pfec,
169 #if GUEST_PAGING_LEVELS == 3 /* PAE */
170 _mfn(INVALID_MFN),
171 v->arch.paging.shadow.gl3e
172 #else /* 32 or 64 */
173 pagetable_get_mfn(v->arch.guest_table),
174 v->arch.paging.shadow.guest_vtable
175 #endif
176 );
177 }
179 /* This validation is called with lock held, and after write permission
180 * removal. Then check is atomic and no more inconsistent content can
181 * be observed before lock is released
182 *
183 * Return 1 to indicate success and 0 for inconsistency
184 */
185 static inline uint32_t
186 shadow_check_gwalk(struct vcpu *v, unsigned long va, walk_t *gw, int version)
187 {
188 struct domain *d = v->domain;
189 guest_l1e_t *l1p;
190 guest_l2e_t *l2p;
191 #if GUEST_PAGING_LEVELS >= 4
192 guest_l3e_t *l3p;
193 guest_l4e_t *l4p;
194 #endif
195 int mismatch = 0;
197 ASSERT(shadow_locked_by_me(d));
199 if ( version == atomic_read(&d->arch.paging.shadow.gtable_dirty_version) )
200 return 1;
202 /* We may consider caching guest page mapping from last
203 * guest table walk. However considering this check happens
204 * relatively less-frequent, and a bit burden here to
205 * remap guest page is better than caching mapping in each
206 * guest table walk.
207 *
208 * Also when inconsistency occurs, simply return to trigger
209 * another fault instead of re-validate new path to make
210 * logic simple.
211 */
212 perfc_incr(shadow_check_gwalk);
213 #if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
214 #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
215 l4p = (guest_l4e_t *)v->arch.paging.shadow.guest_vtable;
216 mismatch |= (gw->l4e.l4 != l4p[guest_l4_table_offset(va)].l4);
217 l3p = sh_map_domain_page(gw->l3mfn);
218 mismatch |= (gw->l3e.l3 != l3p[guest_l3_table_offset(va)].l3);
219 sh_unmap_domain_page(l3p);
220 #else
221 mismatch |= (gw->l3e.l3 !=
222 v->arch.paging.shadow.gl3e[guest_l3_table_offset(va)].l3);
223 #endif
224 l2p = sh_map_domain_page(gw->l2mfn);
225 mismatch |= (gw->l2e.l2 != l2p[guest_l2_table_offset(va)].l2);
226 sh_unmap_domain_page(l2p);
227 #else
228 l2p = (guest_l2e_t *)v->arch.paging.shadow.guest_vtable;
229 mismatch |= (gw->l2e.l2 != l2p[guest_l2_table_offset(va)].l2);
230 #endif
231 if ( !(guest_supports_superpages(v) &&
232 (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE)) )
233 {
234 l1p = sh_map_domain_page(gw->l1mfn);
235 mismatch |= (gw->l1e.l1 != l1p[guest_l1_table_offset(va)].l1);
236 sh_unmap_domain_page(l1p);
237 }
239 return !mismatch;
240 }
242 /* Remove write access permissions from a gwalk_t in a batch, and
243 * return OR-ed result for TLB flush hint and need to rewalk the guest
244 * pages.
245 *
246 * Syncing pages will remove write access to that page; but it may
247 * also give write access to other pages in the path. If we resync any
248 * pages, re-walk from the beginning.
249 */
250 #define GW_RMWR_FLUSHTLB 1
251 #define GW_RMWR_REWALK 2
253 static inline uint32_t
254 gw_remove_write_accesses(struct vcpu *v, unsigned long va, walk_t *gw)
255 {
256 uint32_t rc = 0;
258 #if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
259 #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
260 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
261 if ( mfn_is_out_of_sync(gw->l3mfn) )
262 {
263 sh_resync(v, gw->l3mfn);
264 rc = GW_RMWR_REWALK;
265 }
266 else
267 #endif /* OOS */
268 if ( sh_remove_write_access(v, gw->l3mfn, 3, va) )
269 rc = GW_RMWR_FLUSHTLB;
270 #endif /* GUEST_PAGING_LEVELS >= 4 */
272 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
273 if ( mfn_is_out_of_sync(gw->l2mfn) )
274 {
275 sh_resync(v, gw->l2mfn);
276 rc |= GW_RMWR_REWALK;
277 }
278 else
279 #endif /* OOS */
280 if ( sh_remove_write_access(v, gw->l2mfn, 2, va) )
281 rc |= GW_RMWR_FLUSHTLB;
282 #endif /* GUEST_PAGING_LEVELS >= 3 */
284 if ( !(guest_supports_superpages(v) &&
285 (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE))
286 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
287 && !mfn_is_out_of_sync(gw->l1mfn)
288 #endif /* OOS */
289 && sh_remove_write_access(v, gw->l1mfn, 1, va) )
290 rc |= GW_RMWR_FLUSHTLB;
292 return rc;
293 }
295 #if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES
296 /* Lightweight audit: pass all the shadows associated with this guest walk
297 * through the audit mechanisms */
298 static void sh_audit_gw(struct vcpu *v, walk_t *gw)
299 {
300 mfn_t smfn;
302 if ( !(SHADOW_AUDIT_ENABLE) )
303 return;
305 #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
306 if ( mfn_valid(gw->l4mfn)
307 && mfn_valid((smfn = get_shadow_status(v, gw->l4mfn,
308 SH_type_l4_shadow))) )
309 (void) sh_audit_l4_table(v, smfn, _mfn(INVALID_MFN));
310 if ( mfn_valid(gw->l3mfn)
311 && mfn_valid((smfn = get_shadow_status(v, gw->l3mfn,
312 SH_type_l3_shadow))) )
313 (void) sh_audit_l3_table(v, smfn, _mfn(INVALID_MFN));
314 #endif /* PAE or 64... */
315 if ( mfn_valid(gw->l2mfn) )
316 {
317 if ( mfn_valid((smfn = get_shadow_status(v, gw->l2mfn,
318 SH_type_l2_shadow))) )
319 (void) sh_audit_l2_table(v, smfn, _mfn(INVALID_MFN));
320 #if GUEST_PAGING_LEVELS == 3
321 if ( mfn_valid((smfn = get_shadow_status(v, gw->l2mfn,
322 SH_type_l2h_shadow))) )
323 (void) sh_audit_l2_table(v, smfn, _mfn(INVALID_MFN));
324 #endif
325 }
326 if ( mfn_valid(gw->l1mfn)
327 && mfn_valid((smfn = get_shadow_status(v, gw->l1mfn,
328 SH_type_l1_shadow))) )
329 (void) sh_audit_l1_table(v, smfn, _mfn(INVALID_MFN));
330 else if ( (guest_l2e_get_flags(gw->l2e) & _PAGE_PRESENT)
331 && (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE)
332 && mfn_valid(
333 (smfn = get_fl1_shadow_status(v, guest_l2e_get_gfn(gw->l2e)))) )
334 (void) sh_audit_fl1_table(v, smfn, _mfn(INVALID_MFN));
335 }
337 #else
338 #define sh_audit_gw(_v, _gw) do {} while(0)
339 #endif /* audit code */
342 #if (CONFIG_PAGING_LEVELS == GUEST_PAGING_LEVELS)
343 void *
344 sh_guest_map_l1e(struct vcpu *v, unsigned long addr,
345 unsigned long *gl1mfn)
346 {
347 void *pl1e = NULL;
348 walk_t gw;
350 ASSERT(shadow_mode_translate(v->domain));
352 // XXX -- this is expensive, but it's easy to cobble together...
353 // FIXME!
355 if ( sh_walk_guest_tables(v, addr, &gw, PFEC_page_present) == 0
356 && mfn_valid(gw.l1mfn) )
357 {
358 if ( gl1mfn )
359 *gl1mfn = mfn_x(gw.l1mfn);
360 pl1e = map_domain_page(mfn_x(gw.l1mfn)) +
361 (guest_l1_table_offset(addr) * sizeof(guest_l1e_t));
362 }
364 return pl1e;
365 }
367 void
368 sh_guest_get_eff_l1e(struct vcpu *v, unsigned long addr, void *eff_l1e)
369 {
370 walk_t gw;
372 ASSERT(shadow_mode_translate(v->domain));
374 // XXX -- this is expensive, but it's easy to cobble together...
375 // FIXME!
377 (void) sh_walk_guest_tables(v, addr, &gw, PFEC_page_present);
378 *(guest_l1e_t *)eff_l1e = gw.l1e;
379 }
380 #endif /* CONFIG == GUEST (== SHADOW) */
382 /**************************************************************************/
383 /* Functions to compute the correct index into a shadow page, given an
384 * index into the guest page (as returned by guest_get_index()).
385 * This is trivial when the shadow and guest use the same sized PTEs, but
386 * gets more interesting when those sizes are mismatched (e.g. 32-bit guest,
387 * PAE- or 64-bit shadows).
388 *
389 * These functions also increment the shadow mfn, when necessary. When PTE
390 * sizes are mismatched, it takes 2 shadow L1 pages for a single guest L1
391 * page. In this case, we allocate 2 contiguous pages for the shadow L1, and
392 * use simple pointer arithmetic on a pointer to the guest L1e to figure out
393 * which shadow page we really want. Similarly, when PTE sizes are
394 * mismatched, we shadow a guest L2 page with 4 shadow L2 pages. (The easiest
395 * way to see this is: a 32-bit guest L2 page maps 4GB of virtual address
396 * space, while a PAE- or 64-bit shadow L2 page maps 1GB of virtual address
397 * space.)
398 *
399 * For PAE guests, for every 32-bytes of guest L3 page table, we use 64-bytes
400 * of shadow (to store both the shadow, and the info that would normally be
401 * stored in page_info fields). This arrangement allows the shadow and the
402 * "page_info" fields to always be stored in the same page (in fact, in
403 * the same cache line), avoiding an extra call to map_domain_page().
404 */
406 static inline u32
407 guest_index(void *ptr)
408 {
409 return (u32)((unsigned long)ptr & ~PAGE_MASK) / sizeof(guest_l1e_t);
410 }
412 static u32
413 shadow_l1_index(mfn_t *smfn, u32 guest_index)
414 {
415 #if (GUEST_PAGING_LEVELS == 2)
416 *smfn = _mfn(mfn_x(*smfn) +
417 (guest_index / SHADOW_L1_PAGETABLE_ENTRIES));
418 return (guest_index % SHADOW_L1_PAGETABLE_ENTRIES);
419 #else
420 return guest_index;
421 #endif
422 }
424 static u32
425 shadow_l2_index(mfn_t *smfn, u32 guest_index)
426 {
427 #if (GUEST_PAGING_LEVELS == 2)
428 // Because we use 2 shadow l2 entries for each guest entry, the number of
429 // guest entries per shadow page is SHADOW_L2_PAGETABLE_ENTRIES/2
430 //
431 *smfn = _mfn(mfn_x(*smfn) +
432 (guest_index / (SHADOW_L2_PAGETABLE_ENTRIES / 2)));
434 // We multiply by two to get the index of the first of the two entries
435 // used to shadow the specified guest entry.
436 return (guest_index % (SHADOW_L2_PAGETABLE_ENTRIES / 2)) * 2;
437 #else
438 return guest_index;
439 #endif
440 }
442 #if GUEST_PAGING_LEVELS >= 4
444 static u32
445 shadow_l3_index(mfn_t *smfn, u32 guest_index)
446 {
447 return guest_index;
448 }
450 static u32
451 shadow_l4_index(mfn_t *smfn, u32 guest_index)
452 {
453 return guest_index;
454 }
456 #endif // GUEST_PAGING_LEVELS >= 4
459 /**************************************************************************/
460 /* Function which computes shadow entries from their corresponding guest
461 * entries. This is the "heart" of the shadow code. It operates using
462 * level-1 shadow types, but handles all levels of entry.
463 * Don't call it directly, but use the four wrappers below.
464 */
466 static always_inline void
467 _sh_propagate(struct vcpu *v,
468 guest_intpte_t guest_intpte,
469 mfn_t target_mfn,
470 void *shadow_entry_ptr,
471 int level,
472 fetch_type_t ft,
473 p2m_type_t p2mt)
474 {
475 guest_l1e_t guest_entry = { guest_intpte };
476 shadow_l1e_t *sp = shadow_entry_ptr;
477 struct domain *d = v->domain;
478 gfn_t target_gfn = guest_l1e_get_gfn(guest_entry);
479 u32 pass_thru_flags;
480 u32 gflags, sflags;
482 /* We don't shadow PAE l3s */
483 ASSERT(GUEST_PAGING_LEVELS > 3 || level != 3);
485 /* Check there's something for the shadows to map to */
486 if ( !p2m_is_valid(p2mt) )
487 {
488 *sp = shadow_l1e_empty();
489 goto done;
490 }
492 gflags = guest_l1e_get_flags(guest_entry);
494 if ( unlikely(!(gflags & _PAGE_PRESENT)) )
495 {
496 /* If a guest l1 entry is not present, shadow with the magic
497 * guest-not-present entry. */
498 if ( level == 1 )
499 *sp = sh_l1e_gnp();
500 else
501 *sp = shadow_l1e_empty();
502 goto done;
503 }
505 if ( level == 1 && p2mt == p2m_mmio_dm )
506 {
507 /* Guest l1e maps emulated MMIO space */
508 *sp = sh_l1e_mmio(target_gfn, gflags);
509 if ( !d->arch.paging.shadow.has_fast_mmio_entries )
510 d->arch.paging.shadow.has_fast_mmio_entries = 1;
511 goto done;
512 }
514 // Must have a valid target_mfn unless this is a prefetch or an l1
515 // pointing at MMIO space. In the case of a prefetch, an invalid
516 // mfn means that we can not usefully shadow anything, and so we
517 // return early.
518 //
519 if ( !mfn_valid(target_mfn)
520 && !(level == 1 && (!shadow_mode_refcounts(d)
521 || p2mt == p2m_mmio_direct)) )
522 {
523 ASSERT((ft == ft_prefetch));
524 *sp = shadow_l1e_empty();
525 goto done;
526 }
528 // Propagate bits from the guest to the shadow.
529 // Some of these may be overwritten, below.
530 // Since we know the guest's PRESENT bit is set, we also set the shadow's
531 // SHADOW_PRESENT bit.
532 //
533 pass_thru_flags = (_PAGE_ACCESSED | _PAGE_USER |
534 _PAGE_RW | _PAGE_PRESENT);
535 if ( guest_supports_nx(v) )
536 pass_thru_flags |= _PAGE_NX_BIT;
537 if ( !shadow_mode_refcounts(d) && !mfn_valid(target_mfn) )
538 pass_thru_flags |= _PAGE_PAT | _PAGE_PCD | _PAGE_PWT;
539 sflags = gflags & pass_thru_flags;
541 /*
542 * For HVM domains with direct access to MMIO areas, set the correct
543 * caching attributes in the shadows to match what was asked for.
544 */
545 if ( (level == 1) && is_hvm_domain(d) && has_arch_pdevs(d) &&
546 !is_xen_heap_mfn(mfn_x(target_mfn)) )
547 {
548 unsigned int type;
549 if ( hvm_get_mem_pinned_cacheattr(d, gfn_x(target_gfn), &type) )
550 sflags |= pat_type_2_pte_flags(type);
551 else if ( d->arch.hvm_domain.is_in_uc_mode )
552 sflags |= pat_type_2_pte_flags(PAT_TYPE_UNCACHABLE);
553 else
554 sflags |= get_pat_flags(v,
555 gflags,
556 gfn_to_paddr(target_gfn),
557 ((paddr_t)mfn_x(target_mfn)) << PAGE_SHIFT);
558 }
560 // Set the A&D bits for higher level shadows.
561 // Higher level entries do not, strictly speaking, have dirty bits, but
562 // since we use shadow linear tables, each of these entries may, at some
563 // point in time, also serve as a shadow L1 entry.
564 // By setting both the A&D bits in each of these, we eliminate the burden
565 // on the hardware to update these bits on initial accesses.
566 //
567 if ( (level > 1) && !((SHADOW_PAGING_LEVELS == 3) && (level == 3)) )
568 sflags |= _PAGE_ACCESSED | _PAGE_DIRTY;
570 // If the A or D bit has not yet been set in the guest, then we must
571 // prevent the corresponding kind of access.
572 //
573 if ( unlikely(!(gflags & _PAGE_ACCESSED)) )
574 sflags &= ~_PAGE_PRESENT;
576 /* D bits exist in L1es and PSE L2es */
577 if ( unlikely(((level == 1) ||
578 ((level == 2) &&
579 (gflags & _PAGE_PSE) &&
580 guest_supports_superpages(v)))
581 && !(gflags & _PAGE_DIRTY)) )
582 sflags &= ~_PAGE_RW;
584 // shadow_mode_log_dirty support
585 //
586 // Only allow the guest write access to a page a) on a demand fault,
587 // or b) if the page is already marked as dirty.
588 //
589 // (We handle log-dirty entirely inside the shadow code, without using the
590 // p2m_ram_logdirty p2m type: only HAP uses that.)
591 if ( unlikely((level == 1) && shadow_mode_log_dirty(d)) )
592 {
593 if ( mfn_valid(target_mfn) ) {
594 if ( ft & FETCH_TYPE_WRITE )
595 paging_mark_dirty(d, mfn_x(target_mfn));
596 else if ( !sh_mfn_is_dirty(d, target_mfn) )
597 sflags &= ~_PAGE_RW;
598 }
599 }
601 if ( unlikely((level == 1) && d->dirty_vram
602 && d->dirty_vram->last_dirty == -1
603 && gfn_x(target_gfn) >= d->dirty_vram->begin_pfn
604 && gfn_x(target_gfn) < d->dirty_vram->end_pfn) )
605 {
606 if ( ft & FETCH_TYPE_WRITE )
607 d->dirty_vram->last_dirty = NOW();
608 else
609 sflags &= ~_PAGE_RW;
610 }
612 /* Read-only memory */
613 if ( p2mt == p2m_ram_ro )
614 sflags &= ~_PAGE_RW;
616 // protect guest page tables
617 //
618 if ( unlikely((level == 1)
619 && sh_mfn_is_a_page_table(target_mfn)
620 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC )
621 /* Unless the page is out of sync and the guest is
622 writing to it. */
623 && !(mfn_oos_may_write(target_mfn)
624 && (ft == ft_demand_write))
625 #endif /* OOS */
626 ) )
627 {
628 if ( shadow_mode_trap_reads(d) )
629 {
630 // if we are trapping both reads & writes, then mark this page
631 // as not present...
632 //
633 sflags &= ~_PAGE_PRESENT;
634 }
635 else
636 {
637 // otherwise, just prevent any writes...
638 //
639 sflags &= ~_PAGE_RW;
640 }
641 }
643 // PV guests in 64-bit mode use two different page tables for user vs
644 // supervisor permissions, making the guest's _PAGE_USER bit irrelevant.
645 // It is always shadowed as present...
646 if ( (GUEST_PAGING_LEVELS == 4) && !is_pv_32on64_domain(d)
647 && !is_hvm_domain(d) )
648 {
649 sflags |= _PAGE_USER;
650 }
652 *sp = shadow_l1e_from_mfn(target_mfn, sflags);
654 done:
655 SHADOW_DEBUG(PROPAGATE,
656 "%s level %u guest %" SH_PRI_gpte " shadow %" SH_PRI_pte "\n",
657 fetch_type_names[ft], level, guest_entry.l1, sp->l1);
658 }
661 /* These four wrappers give us a little bit of type-safety back around
662 * the use of void-* pointers and intpte types in _sh_propagate(), and
663 * allow the compiler to optimize out some level checks. */
665 #if GUEST_PAGING_LEVELS >= 4
666 static void
667 l4e_propagate_from_guest(struct vcpu *v,
668 guest_l4e_t gl4e,
669 mfn_t sl3mfn,
670 shadow_l4e_t *sl4e,
671 fetch_type_t ft)
672 {
673 _sh_propagate(v, gl4e.l4, sl3mfn, sl4e, 4, ft, p2m_ram_rw);
674 }
676 static void
677 l3e_propagate_from_guest(struct vcpu *v,
678 guest_l3e_t gl3e,
679 mfn_t sl2mfn,
680 shadow_l3e_t *sl3e,
681 fetch_type_t ft)
682 {
683 _sh_propagate(v, gl3e.l3, sl2mfn, sl3e, 3, ft, p2m_ram_rw);
684 }
685 #endif // GUEST_PAGING_LEVELS >= 4
687 static void
688 l2e_propagate_from_guest(struct vcpu *v,
689 guest_l2e_t gl2e,
690 mfn_t sl1mfn,
691 shadow_l2e_t *sl2e,
692 fetch_type_t ft)
693 {
694 _sh_propagate(v, gl2e.l2, sl1mfn, sl2e, 2, ft, p2m_ram_rw);
695 }
697 static void
698 l1e_propagate_from_guest(struct vcpu *v,
699 guest_l1e_t gl1e,
700 mfn_t gmfn,
701 shadow_l1e_t *sl1e,
702 fetch_type_t ft,
703 p2m_type_t p2mt)
704 {
705 _sh_propagate(v, gl1e.l1, gmfn, sl1e, 1, ft, p2mt);
706 }
709 /**************************************************************************/
710 /* These functions update shadow entries (and do bookkeeping on the shadow
711 * tables they are in). It is intended that they are the only
712 * functions which ever write (non-zero) data onto a shadow page.
713 */
715 static inline void safe_write_entry(void *dst, void *src)
716 /* Copy one PTE safely when processors might be running on the
717 * destination pagetable. This does *not* give safety against
718 * concurrent writes (that's what the shadow lock is for), just
719 * stops the hardware picking up partially written entries. */
720 {
721 volatile unsigned long *d = dst;
722 unsigned long *s = src;
723 ASSERT(!((unsigned long) d & (sizeof (shadow_l1e_t) - 1)));
724 #if CONFIG_PAGING_LEVELS == 3
725 /* In PAE mode, pagetable entries are larger
726 * than machine words, so won't get written atomically. We need to make
727 * sure any other cpu running on these shadows doesn't see a
728 * half-written entry. Do this by marking the entry not-present first,
729 * then writing the high word before the low word. */
730 BUILD_BUG_ON(sizeof (shadow_l1e_t) != 2 * sizeof (unsigned long));
731 d[0] = 0;
732 d[1] = s[1];
733 d[0] = s[0];
734 #else
735 /* In 64-bit, sizeof(pte) == sizeof(ulong) == 1 word,
736 * which will be an atomic write, since the entry is aligned. */
737 BUILD_BUG_ON(sizeof (shadow_l1e_t) != sizeof (unsigned long));
738 *d = *s;
739 #endif
740 }
743 static inline void
744 shadow_write_entries(void *d, void *s, int entries, mfn_t mfn)
745 /* This function does the actual writes to shadow pages.
746 * It must not be called directly, since it doesn't do the bookkeeping
747 * that shadow_set_l*e() functions do. */
748 {
749 shadow_l1e_t *dst = d;
750 shadow_l1e_t *src = s;
751 void *map = NULL;
752 int i;
754 /* Because we mirror access rights at all levels in the shadow, an
755 * l2 (or higher) entry with the RW bit cleared will leave us with
756 * no write access through the linear map.
757 * We detect that by writing to the shadow with copy_to_user() and
758 * using map_domain_page() to get a writeable mapping if we need to. */
759 if ( __copy_to_user(d, d, sizeof (unsigned long)) != 0 )
760 {
761 perfc_incr(shadow_linear_map_failed);
762 map = sh_map_domain_page(mfn);
763 ASSERT(map != NULL);
764 dst = map + ((unsigned long)dst & (PAGE_SIZE - 1));
765 }
768 for ( i = 0; i < entries; i++ )
769 safe_write_entry(dst++, src++);
771 if ( map != NULL ) sh_unmap_domain_page(map);
772 }
774 static inline int
775 perms_strictly_increased(u32 old_flags, u32 new_flags)
776 /* Given the flags of two entries, are the new flags a strict
777 * increase in rights over the old ones? */
778 {
779 u32 of = old_flags & (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX);
780 u32 nf = new_flags & (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX);
781 /* Flip the NX bit, since it's the only one that decreases rights;
782 * we calculate as if it were an "X" bit. */
783 of ^= _PAGE_NX_BIT;
784 nf ^= _PAGE_NX_BIT;
785 /* If the changed bits are all set in the new flags, then rights strictly
786 * increased between old and new. */
787 return ((of | (of ^ nf)) == nf);
788 }
790 static int inline
791 shadow_get_page_from_l1e(shadow_l1e_t sl1e, struct domain *d)
792 {
793 int res;
794 mfn_t mfn;
795 struct domain *owner;
797 ASSERT(!sh_l1e_is_magic(sl1e));
799 if ( !shadow_mode_refcounts(d) )
800 return 1;
802 res = get_page_from_l1e(sl1e, d);
804 // If a privileged domain is attempting to install a map of a page it does
805 // not own, we let it succeed anyway.
806 //
807 if ( unlikely(!res) &&
808 !shadow_mode_translate(d) &&
809 mfn_valid(mfn = shadow_l1e_get_mfn(sl1e)) &&
810 (owner = page_get_owner(mfn_to_page(mfn))) &&
811 (d != owner) &&
812 IS_PRIV_FOR(d, owner))
813 {
814 res = get_page_from_l1e(sl1e, owner);
815 SHADOW_PRINTK("privileged domain %d installs map of mfn %05lx "
816 "which is owned by domain %d: %s\n",
817 d->domain_id, mfn_x(mfn), owner->domain_id,
818 res ? "success" : "failed");
819 }
821 if ( unlikely(!res) )
822 {
823 perfc_incr(shadow_get_page_fail);
824 SHADOW_PRINTK("failed: l1e=" SH_PRI_pte "\n");
825 }
827 return res;
828 }
830 static void inline
831 shadow_put_page_from_l1e(shadow_l1e_t sl1e, struct domain *d)
832 {
833 if ( !shadow_mode_refcounts(d) )
834 return;
836 put_page_from_l1e(sl1e, d);
837 }
839 #if GUEST_PAGING_LEVELS >= 4
840 static int shadow_set_l4e(struct vcpu *v,
841 shadow_l4e_t *sl4e,
842 shadow_l4e_t new_sl4e,
843 mfn_t sl4mfn)
844 {
845 int flags = 0, ok;
846 shadow_l4e_t old_sl4e;
847 paddr_t paddr;
848 ASSERT(sl4e != NULL);
849 old_sl4e = *sl4e;
851 if ( old_sl4e.l4 == new_sl4e.l4 ) return 0; /* Nothing to do */
853 paddr = ((((paddr_t)mfn_x(sl4mfn)) << PAGE_SHIFT)
854 | (((unsigned long)sl4e) & ~PAGE_MASK));
856 if ( shadow_l4e_get_flags(new_sl4e) & _PAGE_PRESENT )
857 {
858 /* About to install a new reference */
859 mfn_t sl3mfn = shadow_l4e_get_mfn(new_sl4e);
860 ok = sh_get_ref(v, sl3mfn, paddr);
861 /* Are we pinning l3 shadows to handle wierd linux behaviour? */
862 if ( sh_type_is_pinnable(v, SH_type_l3_64_shadow) )
863 ok |= sh_pin(v, sl3mfn);
864 if ( !ok )
865 {
866 domain_crash(v->domain);
867 return SHADOW_SET_ERROR;
868 }
869 }
871 /* Write the new entry */
872 shadow_write_entries(sl4e, &new_sl4e, 1, sl4mfn);
873 flags |= SHADOW_SET_CHANGED;
875 if ( shadow_l4e_get_flags(old_sl4e) & _PAGE_PRESENT )
876 {
877 /* We lost a reference to an old mfn. */
878 mfn_t osl3mfn = shadow_l4e_get_mfn(old_sl4e);
879 if ( (mfn_x(osl3mfn) != mfn_x(shadow_l4e_get_mfn(new_sl4e)))
880 || !perms_strictly_increased(shadow_l4e_get_flags(old_sl4e),
881 shadow_l4e_get_flags(new_sl4e)) )
882 {
883 flags |= SHADOW_SET_FLUSH;
884 }
885 sh_put_ref(v, osl3mfn, paddr);
886 }
887 return flags;
888 }
890 static int shadow_set_l3e(struct vcpu *v,
891 shadow_l3e_t *sl3e,
892 shadow_l3e_t new_sl3e,
893 mfn_t sl3mfn)
894 {
895 int flags = 0;
896 shadow_l3e_t old_sl3e;
897 paddr_t paddr;
898 ASSERT(sl3e != NULL);
899 old_sl3e = *sl3e;
901 if ( old_sl3e.l3 == new_sl3e.l3 ) return 0; /* Nothing to do */
903 paddr = ((((paddr_t)mfn_x(sl3mfn)) << PAGE_SHIFT)
904 | (((unsigned long)sl3e) & ~PAGE_MASK));
906 if ( shadow_l3e_get_flags(new_sl3e) & _PAGE_PRESENT )
907 {
908 /* About to install a new reference */
909 if ( !sh_get_ref(v, shadow_l3e_get_mfn(new_sl3e), paddr) )
910 {
911 domain_crash(v->domain);
912 return SHADOW_SET_ERROR;
913 }
914 }
916 /* Write the new entry */
917 shadow_write_entries(sl3e, &new_sl3e, 1, sl3mfn);
918 flags |= SHADOW_SET_CHANGED;
920 if ( shadow_l3e_get_flags(old_sl3e) & _PAGE_PRESENT )
921 {
922 /* We lost a reference to an old mfn. */
923 mfn_t osl2mfn = shadow_l3e_get_mfn(old_sl3e);
924 if ( (mfn_x(osl2mfn) != mfn_x(shadow_l3e_get_mfn(new_sl3e))) ||
925 !perms_strictly_increased(shadow_l3e_get_flags(old_sl3e),
926 shadow_l3e_get_flags(new_sl3e)) )
927 {
928 flags |= SHADOW_SET_FLUSH;
929 }
930 sh_put_ref(v, osl2mfn, paddr);
931 }
932 return flags;
933 }
934 #endif /* GUEST_PAGING_LEVELS >= 4 */
936 static int shadow_set_l2e(struct vcpu *v,
937 shadow_l2e_t *sl2e,
938 shadow_l2e_t new_sl2e,
939 mfn_t sl2mfn)
940 {
941 int flags = 0;
942 shadow_l2e_t old_sl2e;
943 paddr_t paddr;
945 #if GUEST_PAGING_LEVELS == 2
946 /* In 2-on-3 we work with pairs of l2es pointing at two-page
947 * shadows. Reference counting and up-pointers track from the first
948 * page of the shadow to the first l2e, so make sure that we're
949 * working with those:
950 * Align the pointer down so it's pointing at the first of the pair */
951 sl2e = (shadow_l2e_t *)((unsigned long)sl2e & ~(sizeof(shadow_l2e_t)));
952 /* Align the mfn of the shadow entry too */
953 new_sl2e.l2 &= ~(1<<PAGE_SHIFT);
954 #endif
956 ASSERT(sl2e != NULL);
957 old_sl2e = *sl2e;
959 if ( old_sl2e.l2 == new_sl2e.l2 ) return 0; /* Nothing to do */
961 paddr = ((((paddr_t)mfn_x(sl2mfn)) << PAGE_SHIFT)
962 | (((unsigned long)sl2e) & ~PAGE_MASK));
964 if ( shadow_l2e_get_flags(new_sl2e) & _PAGE_PRESENT )
965 {
966 mfn_t sl1mfn = shadow_l2e_get_mfn(new_sl2e);
968 /* About to install a new reference */
969 if ( !sh_get_ref(v, sl1mfn, paddr) )
970 {
971 domain_crash(v->domain);
972 return SHADOW_SET_ERROR;
973 }
974 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
975 {
976 struct shadow_page_info *sp = mfn_to_shadow_page(sl1mfn);
977 mfn_t gl1mfn = _mfn(sp->backpointer);
979 /* If the shadow is a fl1 then the backpointer contains
980 the GFN instead of the GMFN, and it's definitely not
981 OOS. */
982 if ( (sp->type != SH_type_fl1_shadow) && mfn_valid(gl1mfn)
983 && mfn_is_out_of_sync(gl1mfn) )
984 sh_resync(v, gl1mfn);
985 }
986 #endif
987 }
989 /* Write the new entry */
990 #if GUEST_PAGING_LEVELS == 2
991 {
992 shadow_l2e_t pair[2] = { new_sl2e, new_sl2e };
993 /* The l1 shadow is two pages long and need to be pointed to by
994 * two adjacent l1es. The pair have the same flags, but point
995 * at odd and even MFNs */
996 ASSERT(!(pair[0].l2 & (1<<PAGE_SHIFT)));
997 pair[1].l2 |= (1<<PAGE_SHIFT);
998 shadow_write_entries(sl2e, &pair, 2, sl2mfn);
999 }
1000 #else /* normal case */
1001 shadow_write_entries(sl2e, &new_sl2e, 1, sl2mfn);
1002 #endif
1003 flags |= SHADOW_SET_CHANGED;
1005 if ( shadow_l2e_get_flags(old_sl2e) & _PAGE_PRESENT )
1007 /* We lost a reference to an old mfn. */
1008 mfn_t osl1mfn = shadow_l2e_get_mfn(old_sl2e);
1009 if ( (mfn_x(osl1mfn) != mfn_x(shadow_l2e_get_mfn(new_sl2e))) ||
1010 !perms_strictly_increased(shadow_l2e_get_flags(old_sl2e),
1011 shadow_l2e_get_flags(new_sl2e)) )
1013 flags |= SHADOW_SET_FLUSH;
1015 sh_put_ref(v, osl1mfn, paddr);
1017 return flags;
1020 static inline void shadow_vram_get_l1e(shadow_l1e_t new_sl1e,
1021 shadow_l1e_t *sl1e,
1022 mfn_t sl1mfn,
1023 struct domain *d)
1025 mfn_t mfn;
1026 unsigned long gfn;
1028 if ( !d->dirty_vram ) return;
1030 mfn = shadow_l1e_get_mfn(new_sl1e);
1032 if ( !mfn_valid(mfn) ) return; /* m2p for mmio_direct may not exist */
1034 gfn = mfn_to_gfn(d, mfn);
1036 if ( (gfn >= d->dirty_vram->begin_pfn) && (gfn < d->dirty_vram->end_pfn) ) {
1037 unsigned long i = gfn - d->dirty_vram->begin_pfn;
1038 struct page_info *page = mfn_to_page(mfn);
1039 u32 count_info = page->u.inuse.type_info & PGT_count_mask;
1041 if ( count_info == 1 )
1042 /* Initial guest reference, record it */
1043 d->dirty_vram->sl1ma[i] = pfn_to_paddr(mfn_x(sl1mfn))
1044 | ((unsigned long)sl1e & ~PAGE_MASK);
1048 static inline void shadow_vram_put_l1e(shadow_l1e_t old_sl1e,
1049 shadow_l1e_t *sl1e,
1050 mfn_t sl1mfn,
1051 struct domain *d)
1053 mfn_t mfn;
1054 unsigned long gfn;
1056 if ( !d->dirty_vram ) return;
1058 mfn = shadow_l1e_get_mfn(old_sl1e);
1060 if ( !mfn_valid(mfn) ) return;
1062 gfn = mfn_to_gfn(d, mfn);
1064 if ( (gfn >= d->dirty_vram->begin_pfn) && (gfn < d->dirty_vram->end_pfn) ) {
1065 unsigned long i = gfn - d->dirty_vram->begin_pfn;
1066 struct page_info *page = mfn_to_page(mfn);
1067 u32 count_info = page->u.inuse.type_info & PGT_count_mask;
1068 int dirty = 0;
1069 paddr_t sl1ma = pfn_to_paddr(mfn_x(sl1mfn))
1070 | ((unsigned long)sl1e & ~PAGE_MASK);
1072 if ( count_info == 1 ) {
1073 /* Last reference */
1074 if ( d->dirty_vram->sl1ma[i] == INVALID_PADDR ) {
1075 /* We didn't know it was that one, let's say it is dirty */
1076 dirty = 1;
1077 } else {
1078 ASSERT(d->dirty_vram->sl1ma[i] == sl1ma);
1079 d->dirty_vram->sl1ma[i] = INVALID_PADDR;
1080 if ( shadow_l1e_get_flags(old_sl1e) & _PAGE_DIRTY )
1081 dirty = 1;
1083 } else {
1084 /* We had more than one reference, just consider the page dirty. */
1085 dirty = 1;
1086 /* Check that it's not the one we recorded. */
1087 if ( d->dirty_vram->sl1ma[i] == sl1ma ) {
1088 /* Too bad, we remembered the wrong one... */
1089 d->dirty_vram->sl1ma[i] = INVALID_PADDR;
1090 } else {
1091 /* Ok, our recorded sl1e is still pointing to this page, let's
1092 * just hope it will remain. */
1095 if ( dirty ) {
1096 d->dirty_vram->dirty_bitmap[i / 8] |= 1 << (i % 8);
1097 d->dirty_vram->last_dirty = NOW();
1102 static int shadow_set_l1e(struct vcpu *v,
1103 shadow_l1e_t *sl1e,
1104 shadow_l1e_t new_sl1e,
1105 mfn_t sl1mfn)
1107 int flags = 0;
1108 struct domain *d = v->domain;
1109 shadow_l1e_t old_sl1e;
1110 #if SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC
1111 mfn_t new_gmfn = shadow_l1e_get_mfn(new_sl1e);
1112 #endif
1113 ASSERT(sl1e != NULL);
1115 #if SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC
1116 if ( mfn_valid(new_gmfn) && mfn_oos_may_write(new_gmfn)
1117 && ((shadow_l1e_get_flags(new_sl1e) & (_PAGE_RW|_PAGE_PRESENT))
1118 == (_PAGE_RW|_PAGE_PRESENT)) )
1119 oos_fixup_add(v, new_gmfn, sl1mfn, pgentry_ptr_to_slot(sl1e));
1120 #endif
1122 old_sl1e = *sl1e;
1124 if ( old_sl1e.l1 == new_sl1e.l1 ) return 0; /* Nothing to do */
1126 if ( (shadow_l1e_get_flags(new_sl1e) & _PAGE_PRESENT)
1127 && !sh_l1e_is_magic(new_sl1e) )
1129 /* About to install a new reference */
1130 if ( shadow_mode_refcounts(d) ) {
1131 TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_SHADOW_L1_GET_REF);
1132 if ( shadow_get_page_from_l1e(new_sl1e, d) == 0 )
1134 /* Doesn't look like a pagetable. */
1135 flags |= SHADOW_SET_ERROR;
1136 new_sl1e = shadow_l1e_empty();
1138 else
1140 shadow_vram_get_l1e(new_sl1e, sl1e, sl1mfn, d);
1145 /* Write the new entry */
1146 shadow_write_entries(sl1e, &new_sl1e, 1, sl1mfn);
1147 flags |= SHADOW_SET_CHANGED;
1149 if ( (shadow_l1e_get_flags(old_sl1e) & _PAGE_PRESENT)
1150 && !sh_l1e_is_magic(old_sl1e) )
1152 /* We lost a reference to an old mfn. */
1153 /* N.B. Unlike higher-level sets, never need an extra flush
1154 * when writing an l1e. Because it points to the same guest frame
1155 * as the guest l1e did, it's the guest's responsibility to
1156 * trigger a flush later. */
1157 if ( shadow_mode_refcounts(d) )
1159 shadow_vram_put_l1e(old_sl1e, sl1e, sl1mfn, d);
1160 shadow_put_page_from_l1e(old_sl1e, d);
1161 TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_SHADOW_L1_PUT_REF);
1164 return flags;
1168 /**************************************************************************/
1169 /* Macros to walk pagetables. These take the shadow of a pagetable and
1170 * walk every "interesting" entry. That is, they don't touch Xen mappings,
1171 * and for 32-bit l2s shadowed onto PAE or 64-bit, they only touch every
1172 * second entry (since pairs of entries are managed together). For multi-page
1173 * shadows they walk all pages.
1175 * Arguments are an MFN, the variable to point to each entry, a variable
1176 * to indicate that we are done (we will shortcut to the end of the scan
1177 * when _done != 0), a variable to indicate that we should avoid Xen mappings,
1178 * and the code.
1180 * WARNING: These macros have side-effects. They change the values of both
1181 * the pointer and the MFN. */
1183 static inline void increment_ptr_to_guest_entry(void *ptr)
1185 if ( ptr )
1187 guest_l1e_t **entry = ptr;
1188 (*entry)++;
1192 /* All kinds of l1: touch all entries */
1193 #define _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code) \
1194 do { \
1195 int _i; \
1196 shadow_l1e_t *_sp = sh_map_domain_page((_sl1mfn)); \
1197 ASSERT(mfn_to_shadow_page(_sl1mfn)->type == SH_type_l1_shadow \
1198 || mfn_to_shadow_page(_sl1mfn)->type == SH_type_fl1_shadow); \
1199 for ( _i = 0; _i < SHADOW_L1_PAGETABLE_ENTRIES; _i++ ) \
1200 { \
1201 (_sl1e) = _sp + _i; \
1202 if ( shadow_l1e_get_flags(*(_sl1e)) & _PAGE_PRESENT ) \
1203 {_code} \
1204 if ( _done ) break; \
1205 increment_ptr_to_guest_entry(_gl1p); \
1206 } \
1207 sh_unmap_domain_page(_sp); \
1208 } while (0)
1210 /* 32-bit l1, on PAE or 64-bit shadows: need to walk both pages of shadow */
1211 #if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2
1212 #define SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code) \
1213 do { \
1214 int __done = 0; \
1215 _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, \
1216 ({ (__done = _done); }), _code); \
1217 _sl1mfn = _mfn(mfn_x(_sl1mfn) + 1); \
1218 if ( !__done ) \
1219 _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, \
1220 ({ (__done = _done); }), _code); \
1221 } while (0)
1222 #else /* Everything else; l1 shadows are only one page */
1223 #define SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code) \
1224 _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code)
1225 #endif
1228 #if GUEST_PAGING_LEVELS == 2
1230 /* 32-bit l2 on PAE/64: four pages, touch every second entry, and avoid Xen */
1231 #define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _dom, _code) \
1232 do { \
1233 int _i, _j, __done = 0; \
1234 int _xen = !shadow_mode_external(_dom); \
1235 ASSERT(mfn_to_shadow_page(_sl2mfn)->type == SH_type_l2_32_shadow); \
1236 for ( _j = 0; _j < 4 && !__done; _j++ ) \
1237 { \
1238 shadow_l2e_t *_sp = sh_map_domain_page(_sl2mfn); \
1239 for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i += 2 ) \
1240 if ( (!(_xen)) \
1241 || ((_j * SHADOW_L2_PAGETABLE_ENTRIES) + _i) \
1242 < (HYPERVISOR_VIRT_START >> SHADOW_L2_PAGETABLE_SHIFT) ) \
1243 { \
1244 (_sl2e) = _sp + _i; \
1245 if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \
1246 {_code} \
1247 if ( (__done = (_done)) ) break; \
1248 increment_ptr_to_guest_entry(_gl2p); \
1249 } \
1250 sh_unmap_domain_page(_sp); \
1251 _sl2mfn = _mfn(mfn_x(_sl2mfn) + 1); \
1252 } \
1253 } while (0)
1255 #elif GUEST_PAGING_LEVELS == 3
1257 /* PAE: if it's an l2h, don't touch Xen mappings */
1258 #define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _dom, _code) \
1259 do { \
1260 int _i; \
1261 int _xen = !shadow_mode_external(_dom); \
1262 shadow_l2e_t *_sp = sh_map_domain_page((_sl2mfn)); \
1263 ASSERT(mfn_to_shadow_page(_sl2mfn)->type == SH_type_l2_pae_shadow \
1264 || mfn_to_shadow_page(_sl2mfn)->type == SH_type_l2h_pae_shadow);\
1265 for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i++ ) \
1266 if ( (!(_xen)) \
1267 || mfn_to_shadow_page(_sl2mfn)->type != SH_type_l2h_pae_shadow\
1268 || ((_i + (3 * SHADOW_L2_PAGETABLE_ENTRIES)) \
1269 < (HYPERVISOR_VIRT_START >> SHADOW_L2_PAGETABLE_SHIFT)) ) \
1270 { \
1271 (_sl2e) = _sp + _i; \
1272 if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \
1273 {_code} \
1274 if ( _done ) break; \
1275 increment_ptr_to_guest_entry(_gl2p); \
1276 } \
1277 sh_unmap_domain_page(_sp); \
1278 } while (0)
1280 #else
1282 /* 64-bit l2: touch all entries except for PAE compat guests. */
1283 #define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _dom, _code) \
1284 do { \
1285 int _i; \
1286 int _xen = !shadow_mode_external(_dom); \
1287 shadow_l2e_t *_sp = sh_map_domain_page((_sl2mfn)); \
1288 ASSERT(mfn_to_shadow_page(_sl2mfn)->type == SH_type_l2_64_shadow || \
1289 mfn_to_shadow_page(_sl2mfn)->type == SH_type_l2h_64_shadow); \
1290 for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i++ ) \
1291 { \
1292 if ( (!(_xen)) \
1293 || !is_pv_32on64_domain(_dom) \
1294 || mfn_to_shadow_page(_sl2mfn)->type != SH_type_l2h_64_shadow \
1295 || (_i < COMPAT_L2_PAGETABLE_FIRST_XEN_SLOT(_dom)) ) \
1296 { \
1297 (_sl2e) = _sp + _i; \
1298 if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \
1299 {_code} \
1300 if ( _done ) break; \
1301 increment_ptr_to_guest_entry(_gl2p); \
1302 } \
1303 } \
1304 sh_unmap_domain_page(_sp); \
1305 } while (0)
1307 #endif /* different kinds of l2 */
1309 #if GUEST_PAGING_LEVELS == 4
1311 /* 64-bit l3: touch all entries */
1312 #define SHADOW_FOREACH_L3E(_sl3mfn, _sl3e, _gl3p, _done, _code) \
1313 do { \
1314 int _i; \
1315 shadow_l3e_t *_sp = sh_map_domain_page((_sl3mfn)); \
1316 ASSERT(mfn_to_shadow_page(_sl3mfn)->type == SH_type_l3_64_shadow); \
1317 for ( _i = 0; _i < SHADOW_L3_PAGETABLE_ENTRIES; _i++ ) \
1318 { \
1319 (_sl3e) = _sp + _i; \
1320 if ( shadow_l3e_get_flags(*(_sl3e)) & _PAGE_PRESENT ) \
1321 {_code} \
1322 if ( _done ) break; \
1323 increment_ptr_to_guest_entry(_gl3p); \
1324 } \
1325 sh_unmap_domain_page(_sp); \
1326 } while (0)
1328 /* 64-bit l4: avoid Xen mappings */
1329 #define SHADOW_FOREACH_L4E(_sl4mfn, _sl4e, _gl4p, _done, _dom, _code) \
1330 do { \
1331 shadow_l4e_t *_sp = sh_map_domain_page((_sl4mfn)); \
1332 int _xen = !shadow_mode_external(_dom); \
1333 int _i; \
1334 ASSERT(mfn_to_shadow_page(_sl4mfn)->type == SH_type_l4_64_shadow); \
1335 for ( _i = 0; _i < SHADOW_L4_PAGETABLE_ENTRIES; _i++ ) \
1336 { \
1337 if ( (!(_xen)) || is_guest_l4_slot(_dom, _i) ) \
1338 { \
1339 (_sl4e) = _sp + _i; \
1340 if ( shadow_l4e_get_flags(*(_sl4e)) & _PAGE_PRESENT ) \
1341 {_code} \
1342 if ( _done ) break; \
1343 } \
1344 increment_ptr_to_guest_entry(_gl4p); \
1345 } \
1346 sh_unmap_domain_page(_sp); \
1347 } while (0)
1349 #endif
1353 /**************************************************************************/
1354 /* Functions to install Xen mappings and linear mappings in shadow pages */
1356 // XXX -- this function should probably be moved to shadow-common.c, but that
1357 // probably wants to wait until the shadow types have been moved from
1358 // shadow-types.h to shadow-private.h
1359 //
1360 #if CONFIG_PAGING_LEVELS == 4 && GUEST_PAGING_LEVELS == 4
1361 void sh_install_xen_entries_in_l4(struct vcpu *v, mfn_t gl4mfn, mfn_t sl4mfn)
1363 struct domain *d = v->domain;
1364 shadow_l4e_t *sl4e;
1366 sl4e = sh_map_domain_page(sl4mfn);
1367 ASSERT(sl4e != NULL);
1368 ASSERT(sizeof (l4_pgentry_t) == sizeof (shadow_l4e_t));
1370 /* Copy the common Xen mappings from the idle domain */
1371 memcpy(&sl4e[ROOT_PAGETABLE_FIRST_XEN_SLOT],
1372 &idle_pg_table[ROOT_PAGETABLE_FIRST_XEN_SLOT],
1373 ROOT_PAGETABLE_XEN_SLOTS * sizeof(l4_pgentry_t));
1375 /* Install the per-domain mappings for this domain */
1376 sl4e[shadow_l4_table_offset(PERDOMAIN_VIRT_START)] =
1377 shadow_l4e_from_mfn(page_to_mfn(virt_to_page(d->arch.mm_perdomain_l3)),
1378 __PAGE_HYPERVISOR);
1380 /* Shadow linear mapping for 4-level shadows. N.B. for 3-level
1381 * shadows on 64-bit xen, this linear mapping is later replaced by the
1382 * monitor pagetable structure, which is built in make_monitor_table
1383 * and maintained by sh_update_linear_entries. */
1384 sl4e[shadow_l4_table_offset(SH_LINEAR_PT_VIRT_START)] =
1385 shadow_l4e_from_mfn(sl4mfn, __PAGE_HYPERVISOR);
1387 /* Self linear mapping. */
1388 if ( shadow_mode_translate(v->domain) && !shadow_mode_external(v->domain) )
1390 // linear tables may not be used with translated PV guests
1391 sl4e[shadow_l4_table_offset(LINEAR_PT_VIRT_START)] =
1392 shadow_l4e_empty();
1394 else
1396 sl4e[shadow_l4_table_offset(LINEAR_PT_VIRT_START)] =
1397 shadow_l4e_from_mfn(gl4mfn, __PAGE_HYPERVISOR);
1400 if ( shadow_mode_translate(v->domain) )
1402 /* install domain-specific P2M table */
1403 sl4e[shadow_l4_table_offset(RO_MPT_VIRT_START)] =
1404 shadow_l4e_from_mfn(pagetable_get_mfn(d->arch.phys_table),
1405 __PAGE_HYPERVISOR);
1408 sh_unmap_domain_page(sl4e);
1410 #endif
1412 #if CONFIG_PAGING_LEVELS >= 3 && GUEST_PAGING_LEVELS >= 3
1413 // For 3-on-3 PV guests, we need to make sure the xen mappings are in
1414 // place, which means that we need to populate the l2h entry in the l3
1415 // table.
1417 static void sh_install_xen_entries_in_l2h(struct vcpu *v, mfn_t sl2hmfn)
1419 struct domain *d = v->domain;
1420 shadow_l2e_t *sl2e;
1421 #if CONFIG_PAGING_LEVELS == 3
1422 int i;
1423 #else
1425 if ( !is_pv_32on64_vcpu(v) )
1426 return;
1427 #endif
1429 sl2e = sh_map_domain_page(sl2hmfn);
1430 ASSERT(sl2e != NULL);
1431 ASSERT(sizeof (l2_pgentry_t) == sizeof (shadow_l2e_t));
1433 #if CONFIG_PAGING_LEVELS == 3
1435 /* Copy the common Xen mappings from the idle domain */
1436 memcpy(&sl2e[L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1)],
1437 &idle_pg_table_l2[L2_PAGETABLE_FIRST_XEN_SLOT],
1438 L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
1440 /* Install the per-domain mappings for this domain */
1441 for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
1442 sl2e[shadow_l2_table_offset(PERDOMAIN_VIRT_START) + i] =
1443 shadow_l2e_from_mfn(
1444 page_to_mfn(virt_to_page(d->arch.mm_perdomain_pt) + i),
1445 __PAGE_HYPERVISOR);
1447 /* We don't set up a linear mapping here because we can't until this
1448 * l2h is installed in an l3e. sh_update_linear_entries() handles
1449 * the linear mappings when CR3 (and so the fourth l3e) is loaded.
1450 * We zero them here, just as a safety measure.
1451 */
1452 for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
1453 sl2e[shadow_l2_table_offset(LINEAR_PT_VIRT_START) + i] =
1454 shadow_l2e_empty();
1455 for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
1456 sl2e[shadow_l2_table_offset(SH_LINEAR_PT_VIRT_START) + i] =
1457 shadow_l2e_empty();
1459 if ( shadow_mode_translate(d) )
1461 /* Install the domain-specific p2m table */
1462 l3_pgentry_t *p2m;
1463 ASSERT(pagetable_get_pfn(d->arch.phys_table) != 0);
1464 p2m = sh_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
1465 for ( i = 0; i < MACHPHYS_MBYTES>>1; i++ )
1467 sl2e[shadow_l2_table_offset(RO_MPT_VIRT_START) + i] =
1468 (l3e_get_flags(p2m[i]) & _PAGE_PRESENT)
1469 ? shadow_l2e_from_mfn(_mfn(l3e_get_pfn(p2m[i])),
1470 __PAGE_HYPERVISOR)
1471 : shadow_l2e_empty();
1473 sh_unmap_domain_page(p2m);
1476 #else
1478 /* Copy the common Xen mappings from the idle domain */
1479 memcpy(
1480 &sl2e[COMPAT_L2_PAGETABLE_FIRST_XEN_SLOT(d)],
1481 &compat_idle_pg_table_l2[l2_table_offset(HIRO_COMPAT_MPT_VIRT_START)],
1482 COMPAT_L2_PAGETABLE_XEN_SLOTS(d) * sizeof(*sl2e));
1484 #endif
1486 sh_unmap_domain_page(sl2e);
1488 #endif
1494 /**************************************************************************/
1495 /* Create a shadow of a given guest page.
1496 */
1497 static mfn_t
1498 sh_make_shadow(struct vcpu *v, mfn_t gmfn, u32 shadow_type)
1500 mfn_t smfn = shadow_alloc(v->domain, shadow_type, mfn_x(gmfn));
1501 SHADOW_DEBUG(MAKE_SHADOW, "(%05lx, %u)=>%05lx\n",
1502 mfn_x(gmfn), shadow_type, mfn_x(smfn));
1504 if ( shadow_type != SH_type_l2_32_shadow
1505 && shadow_type != SH_type_l2_pae_shadow
1506 && shadow_type != SH_type_l2h_pae_shadow
1507 && shadow_type != SH_type_l4_64_shadow )
1508 /* Lower-level shadow, not yet linked form a higher level */
1509 mfn_to_shadow_page(smfn)->up = 0;
1511 #if GUEST_PAGING_LEVELS == 4
1512 #if (SHADOW_OPTIMIZATIONS & SHOPT_LINUX_L3_TOPLEVEL)
1513 if ( shadow_type == SH_type_l4_64_shadow &&
1514 unlikely(v->domain->arch.paging.shadow.opt_flags & SHOPT_LINUX_L3_TOPLEVEL) )
1516 /* We're shadowing a new l4, but we've been assuming the guest uses
1517 * only one l4 per vcpu and context switches using an l4 entry.
1518 * Count the number of active l4 shadows. If there are enough
1519 * of them, decide that this isn't an old linux guest, and stop
1520 * pinning l3es. This is not very quick but it doesn't happen
1521 * very often. */
1522 struct list_head *l, *t;
1523 struct shadow_page_info *sp;
1524 struct vcpu *v2;
1525 int l4count = 0, vcpus = 0;
1526 list_for_each(l, &v->domain->arch.paging.shadow.pinned_shadows)
1528 sp = list_entry(l, struct shadow_page_info, list);
1529 if ( sp->type == SH_type_l4_64_shadow )
1530 l4count++;
1532 for_each_vcpu ( v->domain, v2 )
1533 vcpus++;
1534 if ( l4count > 2 * vcpus )
1536 /* Unpin all the pinned l3 tables, and don't pin any more. */
1537 list_for_each_safe(l, t, &v->domain->arch.paging.shadow.pinned_shadows)
1539 sp = list_entry(l, struct shadow_page_info, list);
1540 if ( sp->type == SH_type_l3_64_shadow )
1541 sh_unpin(v, shadow_page_to_mfn(sp));
1543 v->domain->arch.paging.shadow.opt_flags &= ~SHOPT_LINUX_L3_TOPLEVEL;
1546 #endif
1547 #endif
1549 // Create the Xen mappings...
1550 if ( !shadow_mode_external(v->domain) )
1552 switch (shadow_type)
1554 #if CONFIG_PAGING_LEVELS == 4 && GUEST_PAGING_LEVELS == 4
1555 case SH_type_l4_shadow:
1556 sh_install_xen_entries_in_l4(v, gmfn, smfn); break;
1557 #endif
1558 #if CONFIG_PAGING_LEVELS >= 3 && GUEST_PAGING_LEVELS >= 3
1559 case SH_type_l2h_shadow:
1560 sh_install_xen_entries_in_l2h(v, smfn); break;
1561 #endif
1562 default: /* Do nothing */ break;
1566 shadow_promote(v, gmfn, shadow_type);
1567 set_shadow_status(v, gmfn, shadow_type, smfn);
1569 return smfn;
1572 /* Make a splintered superpage shadow */
1573 static mfn_t
1574 make_fl1_shadow(struct vcpu *v, gfn_t gfn)
1576 mfn_t smfn = shadow_alloc(v->domain, SH_type_fl1_shadow,
1577 (unsigned long) gfn_x(gfn));
1579 SHADOW_DEBUG(MAKE_SHADOW, "(%" SH_PRI_gfn ")=>%" PRI_mfn "\n",
1580 gfn_x(gfn), mfn_x(smfn));
1582 set_fl1_shadow_status(v, gfn, smfn);
1583 return smfn;
1587 #if SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS
1588 mfn_t
1589 sh_make_monitor_table(struct vcpu *v)
1591 struct domain *d = v->domain;
1593 ASSERT(pagetable_get_pfn(v->arch.monitor_table) == 0);
1595 /* Guarantee we can get the memory we need */
1596 shadow_prealloc(d, SH_type_monitor_table, CONFIG_PAGING_LEVELS);
1598 #if CONFIG_PAGING_LEVELS == 4
1600 mfn_t m4mfn;
1601 m4mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1602 sh_install_xen_entries_in_l4(v, m4mfn, m4mfn);
1603 /* Remember the level of this table */
1604 mfn_to_page(m4mfn)->shadow_flags = 4;
1605 #if SHADOW_PAGING_LEVELS < 4
1607 mfn_t m3mfn, m2mfn;
1608 l4_pgentry_t *l4e;
1609 l3_pgentry_t *l3e;
1610 /* Install an l3 table and an l2 table that will hold the shadow
1611 * linear map entries. This overrides the linear map entry that
1612 * was installed by sh_install_xen_entries_in_l4. */
1613 l4e = sh_map_domain_page(m4mfn);
1615 m3mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1616 mfn_to_page(m3mfn)->shadow_flags = 3;
1617 l4e[shadow_l4_table_offset(SH_LINEAR_PT_VIRT_START)]
1618 = l4e_from_pfn(mfn_x(m3mfn), __PAGE_HYPERVISOR);
1620 m2mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1621 mfn_to_page(m2mfn)->shadow_flags = 2;
1622 l3e = sh_map_domain_page(m3mfn);
1623 l3e[0] = l3e_from_pfn(mfn_x(m2mfn), __PAGE_HYPERVISOR);
1624 sh_unmap_domain_page(l3e);
1626 if ( is_pv_32on64_vcpu(v) )
1628 /* For 32-on-64 PV guests, we need to map the 32-bit Xen
1629 * area into its usual VAs in the monitor tables */
1630 m3mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1631 mfn_to_page(m3mfn)->shadow_flags = 3;
1632 l4e[0] = l4e_from_pfn(mfn_x(m3mfn), __PAGE_HYPERVISOR);
1634 m2mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1635 mfn_to_page(m2mfn)->shadow_flags = 2;
1636 l3e = sh_map_domain_page(m3mfn);
1637 l3e[3] = l3e_from_pfn(mfn_x(m2mfn), _PAGE_PRESENT);
1638 sh_install_xen_entries_in_l2h(v, m2mfn);
1639 sh_unmap_domain_page(l3e);
1642 sh_unmap_domain_page(l4e);
1644 #endif /* SHADOW_PAGING_LEVELS < 4 */
1645 return m4mfn;
1648 #elif CONFIG_PAGING_LEVELS == 3
1651 mfn_t m3mfn, m2mfn;
1652 l3_pgentry_t *l3e;
1653 l2_pgentry_t *l2e;
1654 int i;
1656 m3mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1657 /* Remember the level of this table */
1658 mfn_to_page(m3mfn)->shadow_flags = 3;
1660 // Install a monitor l2 table in slot 3 of the l3 table.
1661 // This is used for all Xen entries, including linear maps
1662 m2mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1663 mfn_to_page(m2mfn)->shadow_flags = 2;
1664 l3e = sh_map_domain_page(m3mfn);
1665 l3e[3] = l3e_from_pfn(mfn_x(m2mfn), _PAGE_PRESENT);
1666 sh_install_xen_entries_in_l2h(v, m2mfn);
1667 /* Install the monitor's own linear map */
1668 l2e = sh_map_domain_page(m2mfn);
1669 for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
1670 l2e[l2_table_offset(LINEAR_PT_VIRT_START) + i] =
1671 (l3e_get_flags(l3e[i]) & _PAGE_PRESENT)
1672 ? l2e_from_pfn(l3e_get_pfn(l3e[i]), __PAGE_HYPERVISOR)
1673 : l2e_empty();
1674 sh_unmap_domain_page(l2e);
1675 sh_unmap_domain_page(l3e);
1677 SHADOW_PRINTK("new monitor table: %#lx\n", mfn_x(m3mfn));
1678 return m3mfn;
1681 #else
1682 #error this should not happen
1683 #endif /* CONFIG_PAGING_LEVELS */
1685 #endif /* SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS */
1687 /**************************************************************************/
1688 /* These functions also take a virtual address and return the level-N
1689 * shadow table mfn and entry, but they create the shadow pagetables if
1690 * they are needed. The "demand" argument is non-zero when handling
1691 * a demand fault (so we know what to do about accessed bits &c).
1692 * If the necessary tables are not present in the guest, they return NULL. */
1694 /* N.B. The use of GUEST_PAGING_LEVELS here is correct. If the shadow has
1695 * more levels than the guest, the upper levels are always fixed and do not
1696 * reflect any information from the guest, so we do not use these functions
1697 * to access them. */
1699 #if GUEST_PAGING_LEVELS >= 4
1700 static shadow_l4e_t * shadow_get_and_create_l4e(struct vcpu *v,
1701 walk_t *gw,
1702 mfn_t *sl4mfn)
1704 /* There is always a shadow of the top level table. Get it. */
1705 *sl4mfn = pagetable_get_mfn(v->arch.shadow_table[0]);
1706 /* Reading the top level table is always valid. */
1707 return sh_linear_l4_table(v) + shadow_l4_linear_offset(gw->va);
1710 static shadow_l3e_t * shadow_get_and_create_l3e(struct vcpu *v,
1711 walk_t *gw,
1712 mfn_t *sl3mfn,
1713 fetch_type_t ft,
1714 int *resync)
1716 mfn_t sl4mfn;
1717 shadow_l4e_t *sl4e;
1718 if ( !mfn_valid(gw->l3mfn) ) return NULL; /* No guest page. */
1719 /* Get the l4e */
1720 sl4e = shadow_get_and_create_l4e(v, gw, &sl4mfn);
1721 ASSERT(sl4e != NULL);
1722 if ( shadow_l4e_get_flags(*sl4e) & _PAGE_PRESENT )
1724 *sl3mfn = shadow_l4e_get_mfn(*sl4e);
1725 ASSERT(mfn_valid(*sl3mfn));
1727 else
1729 int r;
1730 shadow_l4e_t new_sl4e;
1731 /* No l3 shadow installed: find and install it. */
1732 *sl3mfn = get_shadow_status(v, gw->l3mfn, SH_type_l3_shadow);
1733 if ( !mfn_valid(*sl3mfn) )
1735 /* No l3 shadow of this page exists at all: make one. */
1736 *sl3mfn = sh_make_shadow(v, gw->l3mfn, SH_type_l3_shadow);
1738 /* Install the new sl3 table in the sl4e */
1739 l4e_propagate_from_guest(v, gw->l4e, *sl3mfn, &new_sl4e, ft);
1740 r = shadow_set_l4e(v, sl4e, new_sl4e, sl4mfn);
1741 ASSERT((r & SHADOW_SET_FLUSH) == 0);
1742 if ( r & SHADOW_SET_ERROR )
1743 return NULL;
1745 #if (SHADOW_OPTIMIZATIONS && SHOPT_OUT_OF_SYNC )
1746 *resync |= 1;
1747 #endif
1750 /* Now follow it down a level. Guaranteed to succeed. */
1751 return sh_linear_l3_table(v) + shadow_l3_linear_offset(gw->va);
1753 #endif /* GUEST_PAGING_LEVELS >= 4 */
1756 static shadow_l2e_t * shadow_get_and_create_l2e(struct vcpu *v,
1757 walk_t *gw,
1758 mfn_t *sl2mfn,
1759 fetch_type_t ft,
1760 int *resync)
1762 #if GUEST_PAGING_LEVELS >= 4 /* 64bit... */
1763 mfn_t sl3mfn = _mfn(INVALID_MFN);
1764 shadow_l3e_t *sl3e;
1765 if ( !mfn_valid(gw->l2mfn) ) return NULL; /* No guest page. */
1766 /* Get the l3e */
1767 sl3e = shadow_get_and_create_l3e(v, gw, &sl3mfn, ft, resync);
1768 if ( sl3e == NULL ) return NULL;
1769 if ( shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT )
1771 *sl2mfn = shadow_l3e_get_mfn(*sl3e);
1772 ASSERT(mfn_valid(*sl2mfn));
1774 else
1776 int r;
1777 shadow_l3e_t new_sl3e;
1778 unsigned int t = SH_type_l2_shadow;
1780 /* Tag compat L2 containing hypervisor (m2p) mappings */
1781 if ( is_pv_32on64_domain(v->domain) &&
1782 guest_l4_table_offset(gw->va) == 0 &&
1783 guest_l3_table_offset(gw->va) == 3 )
1784 t = SH_type_l2h_shadow;
1786 /* No l2 shadow installed: find and install it. */
1787 *sl2mfn = get_shadow_status(v, gw->l2mfn, t);
1788 if ( !mfn_valid(*sl2mfn) )
1790 /* No l2 shadow of this page exists at all: make one. */
1791 *sl2mfn = sh_make_shadow(v, gw->l2mfn, t);
1793 /* Install the new sl2 table in the sl3e */
1794 l3e_propagate_from_guest(v, gw->l3e, *sl2mfn, &new_sl3e, ft);
1795 r = shadow_set_l3e(v, sl3e, new_sl3e, sl3mfn);
1796 ASSERT((r & SHADOW_SET_FLUSH) == 0);
1797 if ( r & SHADOW_SET_ERROR )
1798 return NULL;
1800 #if (SHADOW_OPTIMIZATIONS && SHOPT_OUT_OF_SYNC )
1801 *resync |= 1;
1802 #endif
1805 /* Now follow it down a level. Guaranteed to succeed. */
1806 return sh_linear_l2_table(v) + shadow_l2_linear_offset(gw->va);
1807 #elif GUEST_PAGING_LEVELS == 3 /* PAE... */
1808 /* We never demand-shadow PAE l3es: they are only created in
1809 * sh_update_cr3(). Check if the relevant sl3e is present. */
1810 shadow_l3e_t *sl3e = ((shadow_l3e_t *)&v->arch.paging.shadow.l3table)
1811 + shadow_l3_linear_offset(gw->va);
1812 if ( !(shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT) )
1813 return NULL;
1814 *sl2mfn = shadow_l3e_get_mfn(*sl3e);
1815 ASSERT(mfn_valid(*sl2mfn));
1816 return sh_linear_l2_table(v) + shadow_l2_linear_offset(gw->va);
1817 #else /* 32bit... */
1818 /* There is always a shadow of the top level table. Get it. */
1819 *sl2mfn = pagetable_get_mfn(v->arch.shadow_table[0]);
1820 /* This next line is important: the guest l2 has a 16k
1821 * shadow, we need to return the right mfn of the four. This
1822 * call will set it for us as a side-effect. */
1823 (void) shadow_l2_index(sl2mfn, guest_l2_table_offset(gw->va));
1824 /* Reading the top level table is always valid. */
1825 return sh_linear_l2_table(v) + shadow_l2_linear_offset(gw->va);
1826 #endif
1830 static shadow_l1e_t * shadow_get_and_create_l1e(struct vcpu *v,
1831 walk_t *gw,
1832 mfn_t *sl1mfn,
1833 fetch_type_t ft)
1835 mfn_t sl2mfn;
1836 int resync = 0;
1837 shadow_l2e_t *sl2e;
1839 /* Get the l2e */
1840 sl2e = shadow_get_and_create_l2e(v, gw, &sl2mfn, ft, &resync);
1841 if ( sl2e == NULL ) return NULL;
1843 /* Install the sl1 in the l2e if it wasn't there or if we need to
1844 * re-do it to fix a PSE dirty bit. */
1845 if ( shadow_l2e_get_flags(*sl2e) & _PAGE_PRESENT
1846 && likely(ft != ft_demand_write
1847 || (shadow_l2e_get_flags(*sl2e) & _PAGE_RW)
1848 || !(guest_l2e_get_flags(gw->l2e) & _PAGE_PSE)) )
1850 *sl1mfn = shadow_l2e_get_mfn(*sl2e);
1851 ASSERT(mfn_valid(*sl1mfn));
1853 else
1855 shadow_l2e_t new_sl2e;
1856 int r, flags = guest_l2e_get_flags(gw->l2e);
1857 /* No l1 shadow installed: find and install it. */
1858 if ( !(flags & _PAGE_PRESENT) )
1859 return NULL; /* No guest page. */
1860 if ( guest_supports_superpages(v) && (flags & _PAGE_PSE) )
1862 /* Splintering a superpage */
1863 gfn_t l2gfn = guest_l2e_get_gfn(gw->l2e);
1864 *sl1mfn = get_fl1_shadow_status(v, l2gfn);
1865 if ( !mfn_valid(*sl1mfn) )
1867 /* No fl1 shadow of this superpage exists at all: make one. */
1868 *sl1mfn = make_fl1_shadow(v, l2gfn);
1871 else
1873 /* Shadowing an actual guest l1 table */
1874 if ( !mfn_valid(gw->l1mfn) ) return NULL; /* No guest page. */
1875 *sl1mfn = get_shadow_status(v, gw->l1mfn, SH_type_l1_shadow);
1876 if ( !mfn_valid(*sl1mfn) )
1878 /* No l1 shadow of this page exists at all: make one. */
1879 *sl1mfn = sh_make_shadow(v, gw->l1mfn, SH_type_l1_shadow);
1882 /* Install the new sl1 table in the sl2e */
1883 l2e_propagate_from_guest(v, gw->l2e, *sl1mfn, &new_sl2e, ft);
1884 r = shadow_set_l2e(v, sl2e, new_sl2e, sl2mfn);
1885 ASSERT((r & SHADOW_SET_FLUSH) == 0);
1886 if ( r & SHADOW_SET_ERROR )
1887 return NULL;
1889 #if (SHADOW_OPTIMIZATIONS && SHOPT_OUT_OF_SYNC )
1890 /* All pages walked are now pagetables. Safe to resync pages
1891 in case level 4 or 3 shadows were set. */
1892 if ( resync )
1893 shadow_resync_all(v, 0);
1894 #endif
1896 /* This next line is important: in 32-on-PAE and 32-on-64 modes,
1897 * the guest l1 table has an 8k shadow, and we need to return
1898 * the right mfn of the pair. This call will set it for us as a
1899 * side-effect. (In all other cases, it's a no-op and will be
1900 * compiled out.) */
1901 (void) shadow_l1_index(sl1mfn, guest_l1_table_offset(gw->va));
1903 /* Now follow it down a level. Guaranteed to succeed. */
1904 return sh_linear_l1_table(v) + shadow_l1_linear_offset(gw->va);
1909 /**************************************************************************/
1910 /* Destructors for shadow tables:
1911 * Unregister the shadow, decrement refcounts of any entries present in it,
1912 * and release the memory.
1914 * N.B. These destructors do not clear the contents of the shadows.
1915 * This allows us to delay TLB shootdowns until the page is being reused.
1916 * See shadow_alloc() and shadow_free() for how this is handled.
1917 */
1919 #if GUEST_PAGING_LEVELS >= 4
1920 void sh_destroy_l4_shadow(struct vcpu *v, mfn_t smfn)
1922 shadow_l4e_t *sl4e;
1923 u32 t = mfn_to_shadow_page(smfn)->type;
1924 mfn_t gmfn, sl4mfn;
1926 SHADOW_DEBUG(DESTROY_SHADOW,
1927 "%s(%05lx)\n", __func__, mfn_x(smfn));
1928 ASSERT(t == SH_type_l4_shadow);
1930 /* Record that the guest page isn't shadowed any more (in this type) */
1931 gmfn = _mfn(mfn_to_shadow_page(smfn)->backpointer);
1932 delete_shadow_status(v, gmfn, t, smfn);
1933 shadow_demote(v, gmfn, t);
1934 /* Decrement refcounts of all the old entries */
1935 sl4mfn = smfn;
1936 SHADOW_FOREACH_L4E(sl4mfn, sl4e, 0, 0, v->domain, {
1937 if ( shadow_l4e_get_flags(*sl4e) & _PAGE_PRESENT )
1939 sh_put_ref(v, shadow_l4e_get_mfn(*sl4e),
1940 (((paddr_t)mfn_x(sl4mfn)) << PAGE_SHIFT)
1941 | ((unsigned long)sl4e & ~PAGE_MASK));
1943 });
1945 /* Put the memory back in the pool */
1946 shadow_free(v->domain, smfn);
1949 void sh_destroy_l3_shadow(struct vcpu *v, mfn_t smfn)
1951 shadow_l3e_t *sl3e;
1952 u32 t = mfn_to_shadow_page(smfn)->type;
1953 mfn_t gmfn, sl3mfn;
1955 SHADOW_DEBUG(DESTROY_SHADOW,
1956 "%s(%05lx)\n", __func__, mfn_x(smfn));
1957 ASSERT(t == SH_type_l3_shadow);
1959 /* Record that the guest page isn't shadowed any more (in this type) */
1960 gmfn = _mfn(mfn_to_shadow_page(smfn)->backpointer);
1961 delete_shadow_status(v, gmfn, t, smfn);
1962 shadow_demote(v, gmfn, t);
1964 /* Decrement refcounts of all the old entries */
1965 sl3mfn = smfn;
1966 SHADOW_FOREACH_L3E(sl3mfn, sl3e, 0, 0, {
1967 if ( shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT )
1968 sh_put_ref(v, shadow_l3e_get_mfn(*sl3e),
1969 (((paddr_t)mfn_x(sl3mfn)) << PAGE_SHIFT)
1970 | ((unsigned long)sl3e & ~PAGE_MASK));
1971 });
1973 /* Put the memory back in the pool */
1974 shadow_free(v->domain, smfn);
1976 #endif /* GUEST_PAGING_LEVELS >= 4 */
1979 void sh_destroy_l2_shadow(struct vcpu *v, mfn_t smfn)
1981 shadow_l2e_t *sl2e;
1982 u32 t = mfn_to_shadow_page(smfn)->type;
1983 mfn_t gmfn, sl2mfn;
1985 SHADOW_DEBUG(DESTROY_SHADOW,
1986 "%s(%05lx)\n", __func__, mfn_x(smfn));
1988 #if GUEST_PAGING_LEVELS >= 3
1989 ASSERT(t == SH_type_l2_shadow || t == SH_type_l2h_shadow);
1990 #else
1991 ASSERT(t == SH_type_l2_shadow);
1992 #endif
1994 /* Record that the guest page isn't shadowed any more (in this type) */
1995 gmfn = _mfn(mfn_to_shadow_page(smfn)->backpointer);
1996 delete_shadow_status(v, gmfn, t, smfn);
1997 shadow_demote(v, gmfn, t);
1999 /* Decrement refcounts of all the old entries */
2000 sl2mfn = smfn;
2001 SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, 0, v->domain, {
2002 if ( shadow_l2e_get_flags(*sl2e) & _PAGE_PRESENT )
2003 sh_put_ref(v, shadow_l2e_get_mfn(*sl2e),
2004 (((paddr_t)mfn_x(sl2mfn)) << PAGE_SHIFT)
2005 | ((unsigned long)sl2e & ~PAGE_MASK));
2006 });
2008 /* Put the memory back in the pool */
2009 shadow_free(v->domain, smfn);
2012 void sh_destroy_l1_shadow(struct vcpu *v, mfn_t smfn)
2014 struct domain *d = v->domain;
2015 shadow_l1e_t *sl1e;
2016 u32 t = mfn_to_shadow_page(smfn)->type;
2018 SHADOW_DEBUG(DESTROY_SHADOW,
2019 "%s(%05lx)\n", __func__, mfn_x(smfn));
2020 ASSERT(t == SH_type_l1_shadow || t == SH_type_fl1_shadow);
2022 /* Record that the guest page isn't shadowed any more (in this type) */
2023 if ( t == SH_type_fl1_shadow )
2025 gfn_t gfn = _gfn(mfn_to_shadow_page(smfn)->backpointer);
2026 delete_fl1_shadow_status(v, gfn, smfn);
2028 else
2030 mfn_t gmfn = _mfn(mfn_to_shadow_page(smfn)->backpointer);
2031 delete_shadow_status(v, gmfn, t, smfn);
2032 shadow_demote(v, gmfn, t);
2035 if ( shadow_mode_refcounts(d) )
2037 /* Decrement refcounts of all the old entries */
2038 mfn_t sl1mfn = smfn;
2039 SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, 0, {
2040 if ( (shadow_l1e_get_flags(*sl1e) & _PAGE_PRESENT)
2041 && !sh_l1e_is_magic(*sl1e) ) {
2042 shadow_vram_put_l1e(*sl1e, sl1e, sl1mfn, d);
2043 shadow_put_page_from_l1e(*sl1e, d);
2045 });
2048 /* Put the memory back in the pool */
2049 shadow_free(v->domain, smfn);
2052 #if SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS
2053 void sh_destroy_monitor_table(struct vcpu *v, mfn_t mmfn)
2055 struct domain *d = v->domain;
2056 ASSERT(mfn_to_shadow_page(mmfn)->type == SH_type_monitor_table);
2058 #if (CONFIG_PAGING_LEVELS == 4) && (SHADOW_PAGING_LEVELS != 4)
2060 mfn_t m3mfn;
2061 l4_pgentry_t *l4e = sh_map_domain_page(mmfn);
2062 l3_pgentry_t *l3e;
2063 int linear_slot = shadow_l4_table_offset(SH_LINEAR_PT_VIRT_START);
2065 /* Need to destroy the l3 and l2 monitor pages used
2066 * for the linear map */
2067 ASSERT(l4e_get_flags(l4e[linear_slot]) & _PAGE_PRESENT);
2068 m3mfn = _mfn(l4e_get_pfn(l4e[linear_slot]));
2069 l3e = sh_map_domain_page(m3mfn);
2070 ASSERT(l3e_get_flags(l3e[0]) & _PAGE_PRESENT);
2071 shadow_free(d, _mfn(l3e_get_pfn(l3e[0])));
2072 sh_unmap_domain_page(l3e);
2073 shadow_free(d, m3mfn);
2075 if ( is_pv_32on64_vcpu(v) )
2077 /* Need to destroy the l3 and l2 monitor pages that map the
2078 * Xen VAs at 3GB-4GB */
2079 ASSERT(l4e_get_flags(l4e[0]) & _PAGE_PRESENT);
2080 m3mfn = _mfn(l4e_get_pfn(l4e[0]));
2081 l3e = sh_map_domain_page(m3mfn);
2082 ASSERT(l3e_get_flags(l3e[3]) & _PAGE_PRESENT);
2083 shadow_free(d, _mfn(l3e_get_pfn(l3e[3])));
2084 sh_unmap_domain_page(l3e);
2085 shadow_free(d, m3mfn);
2087 sh_unmap_domain_page(l4e);
2089 #elif CONFIG_PAGING_LEVELS == 3
2090 /* Need to destroy the l2 monitor page in slot 4 too */
2092 l3_pgentry_t *l3e = sh_map_domain_page(mmfn);
2093 ASSERT(l3e_get_flags(l3e[3]) & _PAGE_PRESENT);
2094 shadow_free(d, _mfn(l3e_get_pfn(l3e[3])));
2095 sh_unmap_domain_page(l3e);
2097 #endif
2099 /* Put the memory back in the pool */
2100 shadow_free(d, mmfn);
2102 #endif
2104 /**************************************************************************/
2105 /* Functions to destroy non-Xen mappings in a pagetable hierarchy.
2106 * These are called from common code when we are running out of shadow
2107 * memory, and unpinning all the top-level shadows hasn't worked.
2109 * This implementation is pretty crude and slow, but we hope that it won't
2110 * be called very often. */
2112 #if GUEST_PAGING_LEVELS == 2
2114 void sh_unhook_32b_mappings(struct vcpu *v, mfn_t sl2mfn)
2116 shadow_l2e_t *sl2e;
2117 SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, 0, v->domain, {
2118 (void) shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn);
2119 });
2122 #elif GUEST_PAGING_LEVELS == 3
2124 void sh_unhook_pae_mappings(struct vcpu *v, mfn_t sl2mfn)
2125 /* Walk a PAE l2 shadow, unhooking entries from all the subshadows */
2127 shadow_l2e_t *sl2e;
2128 SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, 0, v->domain, {
2129 (void) shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn);
2130 });
2133 #elif GUEST_PAGING_LEVELS == 4
2135 void sh_unhook_64b_mappings(struct vcpu *v, mfn_t sl4mfn)
2137 shadow_l4e_t *sl4e;
2138 SHADOW_FOREACH_L4E(sl4mfn, sl4e, 0, 0, v->domain, {
2139 (void) shadow_set_l4e(v, sl4e, shadow_l4e_empty(), sl4mfn);
2140 });
2143 #endif
2145 /**************************************************************************/
2146 /* Internal translation functions.
2147 * These functions require a pointer to the shadow entry that will be updated.
2148 */
2150 /* These functions take a new guest entry, translate it to shadow and write
2151 * the shadow entry.
2153 * They return the same bitmaps as the shadow_set_lXe() functions.
2154 */
2156 #if GUEST_PAGING_LEVELS >= 4
2157 static int validate_gl4e(struct vcpu *v, void *new_ge, mfn_t sl4mfn, void *se)
2159 shadow_l4e_t new_sl4e;
2160 guest_l4e_t new_gl4e = *(guest_l4e_t *)new_ge;
2161 shadow_l4e_t *sl4p = se;
2162 mfn_t sl3mfn = _mfn(INVALID_MFN);
2163 struct domain *d = v->domain;
2164 p2m_type_t p2mt;
2165 int result = 0;
2167 perfc_incr(shadow_validate_gl4e_calls);
2169 if ( guest_l4e_get_flags(new_gl4e) & _PAGE_PRESENT )
2171 gfn_t gl3gfn = guest_l4e_get_gfn(new_gl4e);
2172 mfn_t gl3mfn = gfn_to_mfn(d, gl3gfn, &p2mt);
2173 if ( p2m_is_ram(p2mt) )
2174 sl3mfn = get_shadow_status(v, gl3mfn, SH_type_l3_shadow);
2175 else
2176 result |= SHADOW_SET_ERROR;
2178 #if (SHADOW_OPTIMIZATIONS && SHOPT_OUT_OF_SYNC )
2179 shadow_resync_all(v, 0);
2180 #endif
2182 l4e_propagate_from_guest(v, new_gl4e, sl3mfn, &new_sl4e, ft_prefetch);
2184 // check for updates to xen reserved slots
2185 if ( !shadow_mode_external(d) )
2187 int shadow_index = (((unsigned long)sl4p & ~PAGE_MASK) /
2188 sizeof(shadow_l4e_t));
2189 int reserved_xen_slot = !is_guest_l4_slot(d, shadow_index);
2191 if ( unlikely(reserved_xen_slot) )
2193 // attempt by the guest to write to a xen reserved slot
2194 //
2195 SHADOW_PRINTK("%s out-of-range update "
2196 "sl4mfn=%05lx index=0x%x val=%" SH_PRI_pte "\n",
2197 __func__, mfn_x(sl4mfn), shadow_index, new_sl4e.l4);
2198 if ( shadow_l4e_get_flags(new_sl4e) & _PAGE_PRESENT )
2200 SHADOW_ERROR("out-of-range l4e update\n");
2201 result |= SHADOW_SET_ERROR;
2204 // do not call shadow_set_l4e...
2205 return result;
2209 result |= shadow_set_l4e(v, sl4p, new_sl4e, sl4mfn);
2210 return result;
2214 static int validate_gl3e(struct vcpu *v, void *new_ge, mfn_t sl3mfn, void *se)
2216 shadow_l3e_t new_sl3e;
2217 guest_l3e_t new_gl3e = *(guest_l3e_t *)new_ge;
2218 shadow_l3e_t *sl3p = se;
2219 mfn_t sl2mfn = _mfn(INVALID_MFN);
2220 p2m_type_t p2mt;
2221 int result = 0;
2223 perfc_incr(shadow_validate_gl3e_calls);
2225 if ( guest_l3e_get_flags(new_gl3e) & _PAGE_PRESENT )
2227 gfn_t gl2gfn = guest_l3e_get_gfn(new_gl3e);
2228 mfn_t gl2mfn = gfn_to_mfn(v->domain, gl2gfn, &p2mt);
2229 if ( p2m_is_ram(p2mt) )
2230 sl2mfn = get_shadow_status(v, gl2mfn, SH_type_l2_shadow);
2231 else
2232 result |= SHADOW_SET_ERROR;
2234 #if (SHADOW_OPTIMIZATIONS && SHOPT_OUT_OF_SYNC )
2235 shadow_resync_all(v, 0);
2236 #endif
2238 l3e_propagate_from_guest(v, new_gl3e, sl2mfn, &new_sl3e, ft_prefetch);
2239 result |= shadow_set_l3e(v, sl3p, new_sl3e, sl3mfn);
2241 return result;
2243 #endif // GUEST_PAGING_LEVELS >= 4
2245 static int validate_gl2e(struct vcpu *v, void *new_ge, mfn_t sl2mfn, void *se)
2247 shadow_l2e_t new_sl2e;
2248 guest_l2e_t new_gl2e = *(guest_l2e_t *)new_ge;
2249 shadow_l2e_t *sl2p = se;
2250 mfn_t sl1mfn = _mfn(INVALID_MFN);
2251 p2m_type_t p2mt;
2252 int result = 0;
2254 perfc_incr(shadow_validate_gl2e_calls);
2256 if ( guest_l2e_get_flags(new_gl2e) & _PAGE_PRESENT )
2258 gfn_t gl1gfn = guest_l2e_get_gfn(new_gl2e);
2259 if ( guest_supports_superpages(v) &&
2260 (guest_l2e_get_flags(new_gl2e) & _PAGE_PSE) )
2262 // superpage -- need to look up the shadow L1 which holds the
2263 // splitters...
2264 sl1mfn = get_fl1_shadow_status(v, gl1gfn);
2265 #if 0
2266 // XXX - it's possible that we want to do some kind of prefetch
2267 // for superpage fl1's here, but this is *not* on the demand path,
2268 // so we'll hold off trying that for now...
2269 //
2270 if ( !mfn_valid(sl1mfn) )
2271 sl1mfn = make_fl1_shadow(v, gl1gfn);
2272 #endif
2274 else
2276 mfn_t gl1mfn = gfn_to_mfn(v->domain, gl1gfn, &p2mt);
2277 if ( p2m_is_ram(p2mt) )
2278 sl1mfn = get_shadow_status(v, gl1mfn, SH_type_l1_shadow);
2279 else
2280 result |= SHADOW_SET_ERROR;
2283 l2e_propagate_from_guest(v, new_gl2e, sl1mfn, &new_sl2e, ft_prefetch);
2285 // check for updates to xen reserved slots in PV guests...
2286 // XXX -- need to revisit this for PV 3-on-4 guests.
2287 //
2288 #if SHADOW_PAGING_LEVELS < 4
2289 #if CONFIG_PAGING_LEVELS == SHADOW_PAGING_LEVELS
2290 if ( !shadow_mode_external(v->domain) )
2292 int shadow_index = (((unsigned long)sl2p & ~PAGE_MASK) /
2293 sizeof(shadow_l2e_t));
2294 int reserved_xen_slot;
2296 #if SHADOW_PAGING_LEVELS == 3
2297 reserved_xen_slot =
2298 ((mfn_to_shadow_page(sl2mfn)->type == SH_type_l2h_pae_shadow) &&
2299 (shadow_index
2300 >= (L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1))));
2301 #else /* SHADOW_PAGING_LEVELS == 2 */
2302 reserved_xen_slot = (shadow_index >= L2_PAGETABLE_FIRST_XEN_SLOT);
2303 #endif
2305 if ( unlikely(reserved_xen_slot) )
2307 // attempt by the guest to write to a xen reserved slot
2308 //
2309 SHADOW_PRINTK("%s out-of-range update "
2310 "sl2mfn=%05lx index=0x%x val=%" SH_PRI_pte "\n",
2311 __func__, mfn_x(sl2mfn), shadow_index, new_sl2e.l2);
2312 if ( shadow_l2e_get_flags(new_sl2e) & _PAGE_PRESENT )
2314 SHADOW_ERROR("out-of-range l2e update\n");
2315 result |= SHADOW_SET_ERROR;
2318 // do not call shadow_set_l2e...
2319 return result;
2322 #endif /* CONFIG_PAGING_LEVELS == SHADOW_PAGING_LEVELS */
2323 #endif /* SHADOW_PAGING_LEVELS < 4 */
2325 result |= shadow_set_l2e(v, sl2p, new_sl2e, sl2mfn);
2327 return result;
2330 static int validate_gl1e(struct vcpu *v, void *new_ge, mfn_t sl1mfn, void *se)
2332 shadow_l1e_t new_sl1e;
2333 guest_l1e_t new_gl1e = *(guest_l1e_t *)new_ge;
2334 shadow_l1e_t *sl1p = se;
2335 gfn_t gfn;
2336 mfn_t gmfn;
2337 p2m_type_t p2mt;
2338 int result = 0;
2339 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
2340 mfn_t gl1mfn;
2341 #endif /* OOS */
2343 perfc_incr(shadow_validate_gl1e_calls);
2345 gfn = guest_l1e_get_gfn(new_gl1e);
2346 gmfn = gfn_to_mfn(v->domain, gfn, &p2mt);
2348 l1e_propagate_from_guest(v, new_gl1e, gmfn, &new_sl1e, ft_prefetch, p2mt);
2349 result |= shadow_set_l1e(v, sl1p, new_sl1e, sl1mfn);
2351 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
2352 gl1mfn = _mfn(mfn_to_shadow_page(sl1mfn)->backpointer);
2353 if ( mfn_valid(gl1mfn)
2354 && mfn_is_out_of_sync(gl1mfn) )
2356 /* Update the OOS snapshot. */
2357 mfn_t snpmfn = oos_snapshot_lookup(v, gl1mfn);
2358 guest_l1e_t *snp;
2360 ASSERT(mfn_valid(snpmfn));
2362 snp = sh_map_domain_page(snpmfn);
2363 snp[guest_index(new_ge)] = new_gl1e;
2364 sh_unmap_domain_page(snp);
2366 #endif /* OOS */
2368 return result;
2371 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
2372 /**************************************************************************/
2373 /* Special validation function for re-syncing out-of-sync shadows.
2374 * Walks the *shadow* page, and for every entry that it finds,
2375 * revalidates the guest entry that corresponds to it.
2376 * N.B. This function is called with the vcpu that unsynced the page,
2377 * *not* the one that is causing it to be resynced. */
2378 void sh_resync_l1(struct vcpu *v, mfn_t gl1mfn, mfn_t snpmfn)
2380 mfn_t sl1mfn;
2381 shadow_l1e_t *sl1p;
2382 guest_l1e_t *gl1p, *gp, *snp;
2383 int rc = 0;
2385 ASSERT(mfn_valid(snpmfn));
2387 sl1mfn = get_shadow_status(v, gl1mfn, SH_type_l1_shadow);
2388 ASSERT(mfn_valid(sl1mfn)); /* Otherwise we would not have been called */
2390 snp = sh_map_domain_page(snpmfn);
2391 gp = sh_map_domain_page(gl1mfn);
2392 gl1p = gp;
2394 SHADOW_FOREACH_L1E(sl1mfn, sl1p, &gl1p, 0, {
2395 guest_l1e_t gl1e = *gl1p;
2396 guest_l1e_t *snpl1p = (guest_l1e_t *)snp + guest_index(gl1p);
2398 if ( memcmp(snpl1p, &gl1e, sizeof(gl1e)) )
2400 gfn_t gfn;
2401 mfn_t gmfn;
2402 p2m_type_t p2mt;
2403 shadow_l1e_t nsl1e;
2405 gfn = guest_l1e_get_gfn(gl1e);
2406 gmfn = gfn_to_mfn(v->domain, gfn, &p2mt);
2407 l1e_propagate_from_guest(v, gl1e, gmfn, &nsl1e, ft_prefetch, p2mt);
2408 rc |= shadow_set_l1e(v, sl1p, nsl1e, sl1mfn);
2410 *snpl1p = gl1e;
2412 });
2414 sh_unmap_domain_page(gp);
2415 sh_unmap_domain_page(snp);
2417 /* Setting shadow L1 entries should never need us to flush the TLB */
2418 ASSERT(!(rc & SHADOW_SET_FLUSH));
2421 /* Figure out whether it's definitely safe not to sync this l1 table.
2422 * That is: if we can tell that it's only used once, and that the
2423 * toplevel shadow responsible is not one of ours.
2424 * N.B. This function is called with the vcpu that required the resync,
2425 * *not* the one that originally unsynced the page, but it is
2426 * called in the *mode* of the vcpu that unsynced it. Clear? Good. */
2427 int sh_safe_not_to_sync(struct vcpu *v, mfn_t gl1mfn)
2429 struct shadow_page_info *sp;
2430 mfn_t smfn;
2432 smfn = get_shadow_status(v, gl1mfn, SH_type_l1_shadow);
2433 ASSERT(mfn_valid(smfn)); /* Otherwise we would not have been called */
2435 /* Up to l2 */
2436 sp = mfn_to_shadow_page(smfn);
2437 if ( sp->count != 1 || !sp->up )
2438 return 0;
2439 smfn = _mfn(sp->up >> PAGE_SHIFT);
2440 ASSERT(mfn_valid(smfn));
2442 #if (SHADOW_PAGING_LEVELS == 4)
2443 /* up to l3 */
2444 sp = mfn_to_shadow_page(smfn);
2445 if ( sp->count != 1 || !sp->up )
2446 return 0;
2447 smfn = _mfn(sp->up >> PAGE_SHIFT);
2448 ASSERT(mfn_valid(smfn));
2450 /* up to l4 */
2451 sp = mfn_to_shadow_page(smfn);
2452 if ( sp->count != 1
2453 || sh_type_is_pinnable(v, SH_type_l3_64_shadow) || !sp->up )
2454 return 0;
2455 smfn = _mfn(sp->up >> PAGE_SHIFT);
2456 ASSERT(mfn_valid(smfn));
2458 #if (GUEST_PAGING_LEVELS == 2)
2459 /* In 2-on-3 shadow mode the up pointer contains the link to the
2460 * shadow page, but the shadow_table contains only the first of the
2461 * four pages that makes the PAE top shadow tables. */
2462 smfn = _mfn(mfn_x(smfn) & ~0x3UL);
2463 #endif
2465 #endif
2467 if ( pagetable_get_pfn(v->arch.shadow_table[0]) == mfn_x(smfn)
2468 #if (SHADOW_PAGING_LEVELS == 3)
2469 || pagetable_get_pfn(v->arch.shadow_table[1]) == mfn_x(smfn)
2470 || pagetable_get_pfn(v->arch.shadow_table[2]) == mfn_x(smfn)
2471 || pagetable_get_pfn(v->arch.shadow_table[3]) == mfn_x(smfn)
2472 #endif
2474 return 0;
2476 /* Only in use in one toplevel shadow, and it's not the one we're
2477 * running on */
2478 return 1;
2480 #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) */
2483 /**************************************************************************/
2484 /* Functions which translate and install the shadows of arbitrary guest
2485 * entries that we have just seen the guest write. */
2488 static inline int
2489 sh_map_and_validate(struct vcpu *v, mfn_t gmfn,
2490 void *new_gp, u32 size, u32 sh_type,
2491 u32 (*shadow_index)(mfn_t *smfn, u32 idx),
2492 int (*validate_ge)(struct vcpu *v, void *ge,
2493 mfn_t smfn, void *se))
2494 /* Generic function for mapping and validating. */
2496 mfn_t smfn, smfn2, map_mfn;
2497 shadow_l1e_t *sl1p;
2498 u32 shadow_idx, guest_idx;
2499 int result = 0;
2501 /* Align address and size to guest entry boundaries */
2502 size += (unsigned long)new_gp & (sizeof (guest_l1e_t) - 1);
2503 new_gp = (void *)((unsigned long)new_gp & ~(sizeof (guest_l1e_t) - 1));
2504 size = (size + sizeof (guest_l1e_t) - 1) & ~(sizeof (guest_l1e_t) - 1);
2505 ASSERT(size + (((unsigned long)new_gp) & ~PAGE_MASK) <= PAGE_SIZE);
2507 /* Map the shadow page */
2508 smfn = get_shadow_status(v, gmfn, sh_type);
2509 ASSERT(mfn_valid(smfn)); /* Otherwise we would not have been called */
2510 guest_idx = guest_index(new_gp);
2511 map_mfn = smfn;
2512 shadow_idx = shadow_index(&map_mfn, guest_idx);
2513 sl1p = sh_map_domain_page(map_mfn);
2515 /* Validate one entry at a time */
2516 while ( size )
2518 smfn2 = smfn;
2519 guest_idx = guest_index(new_gp);
2520 shadow_idx = shadow_index(&smfn2, guest_idx);
2521 if ( mfn_x(smfn2) != mfn_x(map_mfn) )
2523 /* We have moved to another page of the shadow */
2524 map_mfn = smfn2;
2525 sh_unmap_domain_page(sl1p);
2526 sl1p = sh_map_domain_page(map_mfn);
2528 result |= validate_ge(v,
2529 new_gp,
2530 map_mfn,
2531 &sl1p[shadow_idx]);
2532 size -= sizeof(guest_l1e_t);
2533 new_gp += sizeof(guest_l1e_t);
2535 sh_unmap_domain_page(sl1p);
2536 return result;
2540 int
2541 sh_map_and_validate_gl4e(struct vcpu *v, mfn_t gl4mfn,
2542 void *new_gl4p, u32 size)
2544 #if GUEST_PAGING_LEVELS >= 4
2545 return sh_map_and_validate(v, gl4mfn, new_gl4p, size,
2546 SH_type_l4_shadow,
2547 shadow_l4_index,
2548 validate_gl4e);
2549 #else // ! GUEST_PAGING_LEVELS >= 4
2550 SHADOW_ERROR("called in wrong paging mode!\n");
2551 BUG();
2552 return 0;
2553 #endif
2556 int
2557 sh_map_and_validate_gl3e(struct vcpu *v, mfn_t gl3mfn,
2558 void *new_gl3p, u32 size)
2560 #if GUEST_PAGING_LEVELS >= 4
2561 return sh_map_and_validate(v, gl3mfn, new_gl3p, size,
2562 SH_type_l3_shadow,
2563 shadow_l3_index,
2564 validate_gl3e);
2565 #else // ! GUEST_PAGING_LEVELS >= 4
2566 SHADOW_ERROR("called in wrong paging mode!\n");
2567 BUG();
2568 return 0;
2569 #endif
2572 int
2573 sh_map_and_validate_gl2e(struct vcpu *v, mfn_t gl2mfn,
2574 void *new_gl2p, u32 size)
2576 return sh_map_and_validate(v, gl2mfn, new_gl2p, size,
2577 SH_type_l2_shadow,
2578 shadow_l2_index,
2579 validate_gl2e);
2582 int
2583 sh_map_and_validate_gl2he(struct vcpu *v, mfn_t gl2mfn,
2584 void *new_gl2p, u32 size)
2586 #if GUEST_PAGING_LEVELS >= 3
2587 return sh_map_and_validate(v, gl2mfn, new_gl2p, size,
2588 SH_type_l2h_shadow,
2589 shadow_l2_index,
2590 validate_gl2e);
2591 #else /* Non-PAE guests don't have different kinds of l2 table */
2592 SHADOW_ERROR("called in wrong paging mode!\n");
2593 BUG();
2594 return 0;
2595 #endif
2598 int
2599 sh_map_and_validate_gl1e(struct vcpu *v, mfn_t gl1mfn,
2600 void *new_gl1p, u32 size)
2602 return sh_map_and_validate(v, gl1mfn, new_gl1p, size,
2603 SH_type_l1_shadow,
2604 shadow_l1_index,
2605 validate_gl1e);
2609 /**************************************************************************/
2610 /* Optimization: If we see two emulated writes of zeros to the same
2611 * page-table without another kind of page fault in between, we guess
2612 * that this is a batch of changes (for process destruction) and
2613 * unshadow the page so we don't take a pagefault on every entry. This
2614 * should also make finding writeable mappings of pagetables much
2615 * easier. */
2617 /* Look to see if this is the second emulated write in a row to this
2618 * page, and unshadow if it is */
2619 static inline void check_for_early_unshadow(struct vcpu *v, mfn_t gmfn)
2621 #if SHADOW_OPTIMIZATIONS & SHOPT_EARLY_UNSHADOW
2622 if ( v->arch.paging.shadow.last_emulated_mfn_for_unshadow == mfn_x(gmfn)
2623 && sh_mfn_is_a_page_table(gmfn) )
2625 perfc_incr(shadow_early_unshadow);
2626 sh_remove_shadows(v, gmfn, 1, 0 /* Fast, can fail to unshadow */ );
2627 TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_EARLY_UNSHADOW);
2629 v->arch.paging.shadow.last_emulated_mfn_for_unshadow = mfn_x(gmfn);
2630 #endif
2633 /* Stop counting towards early unshadows, as we've seen a real page fault */
2634 static inline void reset_early_unshadow(struct vcpu *v)
2636 #if SHADOW_OPTIMIZATIONS & SHOPT_EARLY_UNSHADOW
2637 v->arch.paging.shadow.last_emulated_mfn_for_unshadow = INVALID_MFN;
2638 #endif
2643 /**************************************************************************/
2644 /* Optimization: Prefetch multiple L1 entries. This is called after we have
2645 * demand-faulted a shadow l1e in the fault handler, to see if it's
2646 * worth fetching some more.
2647 */
2649 #if SHADOW_OPTIMIZATIONS & SHOPT_PREFETCH
2651 /* XXX magic number */
2652 #define PREFETCH_DISTANCE 32
2654 static void sh_prefetch(struct vcpu *v, walk_t *gw,
2655 shadow_l1e_t *ptr_sl1e, mfn_t sl1mfn)
2657 int i, dist;
2658 gfn_t gfn;
2659 mfn_t gmfn;
2660 guest_l1e_t *gl1p = NULL, gl1e;
2661 shadow_l1e_t sl1e;
2662 u32 gflags;
2663 p2m_type_t p2mt;
2664 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
2665 guest_l1e_t *snpl1p = NULL;
2666 #endif /* OOS */
2669 /* Prefetch no further than the end of the _shadow_ l1 MFN */
2670 dist = (PAGE_SIZE - ((unsigned long)ptr_sl1e & ~PAGE_MASK)) / sizeof sl1e;
2671 /* And no more than a maximum fetches-per-fault */
2672 if ( dist > PREFETCH_DISTANCE )
2673 dist = PREFETCH_DISTANCE;
2675 if ( mfn_valid(gw->l1mfn) )
2677 /* Normal guest page; grab the next guest entry */
2678 gl1p = sh_map_domain_page(gw->l1mfn);
2679 gl1p += guest_l1_table_offset(gw->va);
2681 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
2682 if ( mfn_is_out_of_sync(gw->l1mfn) )
2684 mfn_t snpmfn = oos_snapshot_lookup(v, gw->l1mfn);
2686 ASSERT(mfn_valid(snpmfn));
2687 snpl1p = sh_map_domain_page(snpmfn);
2688 snpl1p += guest_l1_table_offset(gw->va);
2690 #endif /* OOS */
2693 for ( i = 1; i < dist ; i++ )
2695 /* No point in prefetching if there's already a shadow */
2696 if ( ptr_sl1e[i].l1 != 0 )
2697 break;
2699 if ( mfn_valid(gw->l1mfn) )
2701 /* Normal guest page; grab the next guest entry */
2702 gl1e = gl1p[i];
2703 /* Not worth continuing if we hit an entry that will need another
2704 * fault for A/D-bit propagation anyway */
2705 gflags = guest_l1e_get_flags(gl1e);
2706 if ( (gflags & _PAGE_PRESENT)
2707 && (!(gflags & _PAGE_ACCESSED)
2708 || ((gflags & _PAGE_RW) && !(gflags & _PAGE_DIRTY))) )
2709 break;
2711 else
2713 /* Fragmented superpage, unless we've been called wrongly */
2714 ASSERT(guest_l2e_get_flags(gw->l2e) & _PAGE_PSE);
2715 /* Increment the l1e's GFN by the right number of guest pages */
2716 gl1e = guest_l1e_from_gfn(
2717 _gfn(gfn_x(guest_l1e_get_gfn(gw->l1e)) + i),
2718 guest_l1e_get_flags(gw->l1e));
2721 /* Look at the gfn that the l1e is pointing at */
2722 gfn = guest_l1e_get_gfn(gl1e);
2723 gmfn = gfn_to_mfn(v->domain, gfn, &p2mt);
2725 /* Propagate the entry. */
2726 l1e_propagate_from_guest(v, gl1e, gmfn, &sl1e, ft_prefetch, p2mt);
2727 (void) shadow_set_l1e(v, ptr_sl1e + i, sl1e, sl1mfn);
2729 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
2730 if ( snpl1p != NULL )
2731 snpl1p[i] = gl1e;
2732 #endif /* OOS */
2734 if ( gl1p != NULL )
2735 sh_unmap_domain_page(gl1p);
2736 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
2737 if ( snpl1p != NULL )
2738 sh_unmap_domain_page(snpl1p);
2739 #endif /* OOS */
2742 #endif /* SHADOW_OPTIMIZATIONS & SHOPT_PREFETCH */
2744 #if GUEST_PAGING_LEVELS == 4
2745 typedef u64 guest_va_t;
2746 typedef u64 guest_pa_t;
2747 #elif GUEST_PAGING_LEVELS == 3
2748 typedef u32 guest_va_t;
2749 typedef u64 guest_pa_t;
2750 #else
2751 typedef u32 guest_va_t;
2752 typedef u32 guest_pa_t;
2753 #endif
2755 static inline void trace_shadow_gen(u32 event, guest_va_t va)
2757 if ( tb_init_done )
2759 event |= (GUEST_PAGING_LEVELS-2)<<8;
2760 __trace_var(event, 0/*!tsc*/, sizeof(va), (unsigned char*)&va);
2764 static inline void trace_shadow_fixup(guest_l1e_t gl1e,
2765 guest_va_t va)
2767 if ( tb_init_done )
2769 struct {
2770 /* for PAE, guest_l1e may be 64 while guest_va may be 32;
2771 so put it first for alignment sake. */
2772 guest_l1e_t gl1e;
2773 guest_va_t va;
2774 u32 flags;
2775 } __attribute__((packed)) d;
2776 u32 event;
2778 event = TRC_SHADOW_FIXUP | ((GUEST_PAGING_LEVELS-2)<<8);
2780 d.gl1e = gl1e;
2781 d.va = va;
2782 d.flags = this_cpu(trace_shadow_path_flags);
2784 __trace_var(event, 0/*!tsc*/, sizeof(d), (unsigned char*)&d);
2788 static inline void trace_not_shadow_fault(guest_l1e_t gl1e,
2789 guest_va_t va)
2791 if ( tb_init_done )
2793 struct {
2794 /* for PAE, guest_l1e may be 64 while guest_va may be 32;
2795 so put it first for alignment sake. */
2796 guest_l1e_t gl1e;
2797 guest_va_t va;
2798 u32 flags;
2799 } __attribute__((packed)) d;
2800 u32 event;
2802 event = TRC_SHADOW_NOT_SHADOW | ((GUEST_PAGING_LEVELS-2)<<8);
2804 d.gl1e = gl1e;
2805 d.va = va;
2806 d.flags = this_cpu(trace_shadow_path_flags);
2808 __trace_var(event, 0/*!tsc*/, sizeof(d), (unsigned char*)&d);
2812 static inline void trace_shadow_emulate_other(u32 event,
2813 guest_va_t va,
2814 gfn_t gfn)
2816 if ( tb_init_done )
2818 struct {
2819 /* for PAE, guest_l1e may be 64 while guest_va may be 32;
2820 so put it first for alignment sake. */
2821 #if GUEST_PAGING_LEVELS == 2
2822 u32 gfn;
2823 #else
2824 u64 gfn;
2825 #endif
2826 guest_va_t va;
2827 } __attribute__((packed)) d;
2829 event |= ((GUEST_PAGING_LEVELS-2)<<8);
2831 d.gfn=gfn_x(gfn);
2832 d.va = va;
2834 __trace_var(event, 0/*!tsc*/, sizeof(d), (unsigned char*)&d);
2838 #if GUEST_PAGING_LEVELS == 3
2839 static DEFINE_PER_CPU(guest_va_t,trace_emulate_initial_va);
2840 static DEFINE_PER_CPU(int,trace_extra_emulation_count);
2841 #endif
2842 static DEFINE_PER_CPU(guest_pa_t,trace_emulate_write_val);
2844 static inline void trace_shadow_emulate(guest_l1e_t gl1e, unsigned long va)
2846 if ( tb_init_done )
2848 struct {
2849 /* for PAE, guest_l1e may be 64 while guest_va may be 32;
2850 so put it first for alignment sake. */
2851 guest_l1e_t gl1e, write_val;
2852 guest_va_t va;
2853 unsigned flags:29, emulation_count:3;
2854 } __attribute__((packed)) d;
2855 u32 event;
2857 event = TRC_SHADOW_EMULATE | ((GUEST_PAGING_LEVELS-2)<<8);
2859 d.gl1e = gl1e;
2860 d.write_val.l1 = this_cpu(trace_emulate_write_val);
2861 d.va = va;
2862 #if GUEST_PAGING_LEVELS == 3
2863 d.emulation_count = this_cpu(trace_extra_emulation_count);
2864 #endif
2865 d.flags = this_cpu(trace_shadow_path_flags);
2867 __trace_var(event, 0/*!tsc*/, sizeof(d), (unsigned char*)&d);
2871 /**************************************************************************/
2872 /* Entry points into the shadow code */
2874 /* Called from pagefault handler in Xen, and from the HVM trap handlers
2875 * for pagefaults. Returns 1 if this fault was an artefact of the
2876 * shadow code (and the guest should retry) or 0 if it is not (and the
2877 * fault should be handled elsewhere or passed to the guest). */
2879 static int sh_page_fault(struct vcpu *v,
2880 unsigned long va,
2881 struct cpu_user_regs *regs)
2883 struct domain *d = v->domain;
2884 walk_t gw;
2885 gfn_t gfn = _gfn(0);
2886 mfn_t gmfn, sl1mfn = _mfn(0);
2887 shadow_l1e_t sl1e, *ptr_sl1e;
2888 paddr_t gpa;
2889 struct sh_emulate_ctxt emul_ctxt;
2890 struct x86_emulate_ops *emul_ops;
2891 int r;
2892 fetch_type_t ft = 0;
2893 p2m_type_t p2mt;
2894 uint32_t rc;
2895 int version;
2896 #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
2897 int fast_emul = 0;
2898 #endif
2900 SHADOW_PRINTK("d:v=%u:%u va=%#lx err=%u, rip=%lx\n",
2901 v->domain->domain_id, v->vcpu_id, va, regs->error_code,
2902 regs->eip);
2904 perfc_incr(shadow_fault);
2906 #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
2907 /* If faulting frame is successfully emulated in last shadow fault
2908 * it's highly likely to reach same emulation action for this frame.
2909 * Then try to emulate early to avoid lock aquisition.
2910 */
2911 if ( v->arch.paging.last_write_emul_ok
2912 && v->arch.paging.shadow.last_emulated_frame == (va >> PAGE_SHIFT) )
2914 /* check whether error code is 3, or else fall back to normal path
2915 * in case of some validation is required
2916 */
2917 if ( regs->error_code == (PFEC_write_access | PFEC_page_present) )
2919 fast_emul = 1;
2920 gmfn = _mfn(v->arch.paging.shadow.last_emulated_mfn);
2922 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
2923 /* Fall back to the slow path if we're trying to emulate
2924 writes to an out of sync page. */
2925 if ( mfn_valid(gmfn) && mfn_is_out_of_sync(gmfn) )
2927 fast_emul = 0;
2928 v->arch.paging.last_write_emul_ok = 0;
2929 goto page_fault_slow_path;
2931 #endif /* OOS */
2933 perfc_incr(shadow_fault_fast_emulate);
2934 goto early_emulation;
2936 else
2937 v->arch.paging.last_write_emul_ok = 0;
2939 #endif
2941 //
2942 // XXX: Need to think about eventually mapping superpages directly in the
2943 // shadow (when possible), as opposed to splintering them into a
2944 // bunch of 4K maps.
2945 //
2947 #if (SHADOW_OPTIMIZATIONS & SHOPT_FAST_FAULT_PATH)
2948 if ( (regs->error_code & PFEC_reserved_bit) )
2950 /* The only reasons for reserved bits to be set in shadow entries
2951 * are the two "magic" shadow_l1e entries. */
2952 if ( likely((__copy_from_user(&sl1e,
2953 (sh_linear_l1_table(v)
2954 + shadow_l1_linear_offset(va)),
2955 sizeof(sl1e)) == 0)
2956 && sh_l1e_is_magic(sl1e)) )
2958 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
2959 /* First, need to check that this isn't an out-of-sync
2960 * shadow l1e. If it is, we fall back to the slow path, which
2961 * will sync it up again. */
2963 shadow_l2e_t sl2e;
2964 mfn_t gl1mfn;
2965 if ( (__copy_from_user(&sl2e,
2966 (sh_linear_l2_table(v)
2967 + shadow_l2_linear_offset(va)),
2968 sizeof(sl2e)) != 0)
2969 || !(shadow_l2e_get_flags(sl2e) & _PAGE_PRESENT)
2970 || !mfn_valid(gl1mfn = _mfn(mfn_to_shadow_page(
2971 shadow_l2e_get_mfn(sl2e))->backpointer))
2972 || unlikely(mfn_is_out_of_sync(gl1mfn)) )
2974 /* Hit the slow path as if there had been no
2975 * shadow entry at all, and let it tidy up */
2976 ASSERT(regs->error_code & PFEC_page_present);
2977 regs->error_code ^= (PFEC_reserved_bit|PFEC_page_present);
2978 goto page_fault_slow_path;
2981 #endif /* SHOPT_OUT_OF_SYNC */
2983 if ( sh_l1e_is_gnp(sl1e) )
2985 /* Not-present in a guest PT: pass to the guest as
2986 * a not-present fault (by flipping two bits). */
2987 ASSERT(regs->error_code & PFEC_page_present);
2988 regs->error_code ^= (PFEC_reserved_bit|PFEC_page_present);
2989 reset_early_unshadow(v);
2990 perfc_incr(shadow_fault_fast_gnp);
2991 SHADOW_PRINTK("fast path not-present\n");
2992 trace_shadow_gen(TRC_SHADOW_FAST_PROPAGATE, va);
2993 return 0;
2995 else
2997 /* Magic MMIO marker: extract gfn for MMIO address */
2998 ASSERT(sh_l1e_is_mmio(sl1e));
2999 gpa = (((paddr_t)(gfn_x(sh_l1e_mmio_get_gfn(sl1e))))
3000 << PAGE_SHIFT)
3001 | (va & ~PAGE_MASK);
3003 perfc_incr(shadow_fault_fast_mmio);
3004 SHADOW_PRINTK("fast path mmio %#"PRIpaddr"\n", gpa);
3005 reset_early_unshadow(v);
3006 trace_shadow_gen(TRC_SHADOW_FAST_MMIO, va);
3007 return (handle_mmio_with_translation(va, gpa >> PAGE_SHIFT)
3008 ? EXCRET_fault_fixed : 0);
3010 else
3012 /* This should be exceptionally rare: another vcpu has fixed
3013 * the tables between the fault and our reading the l1e.
3014 * Retry and let the hardware give us the right fault next time. */
3015 perfc_incr(shadow_fault_fast_fail);
3016 SHADOW_PRINTK("fast path false alarm!\n");
3017 trace_shadow_gen(TRC_SHADOW_FALSE_FAST_PATH, va);
3018 return EXCRET_fault_fixed;
3022 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
3023 page_fault_slow_path:
3024 #endif
3025 #endif /* SHOPT_FAST_FAULT_PATH */
3027 /* Detect if this page fault happened while we were already in Xen
3028 * doing a shadow operation. If that happens, the only thing we can
3029 * do is let Xen's normal fault handlers try to fix it. In any case,
3030 * a diagnostic trace of the fault will be more useful than
3031 * a BUG() when we try to take the lock again. */
3032 if ( unlikely(shadow_locked_by_me(d)) )
3034 SHADOW_ERROR("Recursive shadow fault: lock was taken by %s\n",
3035 d->arch.paging.shadow.locker_function);
3036 return 0;
3039 rewalk:
3041 /* The walk is done in a lock-free style, with some sanity check
3042 * postponed after grabbing shadow lock later. Those delayed checks
3043 * will make sure no inconsistent mapping being translated into
3044 * shadow page table. */
3045 version = atomic_read(&d->arch.paging.shadow.gtable_dirty_version);
3046 rmb();
3047 rc = sh_walk_guest_tables(v, va, &gw, regs->error_code);
3049 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
3050 regs->error_code &= ~PFEC_page_present;
3051 if ( !(rc & _PAGE_PRESENT) )
3052 regs->error_code |= PFEC_page_present;
3053 #endif
3055 if ( rc != 0 )
3057 perfc_incr(shadow_fault_bail_real_fault);
3058 SHADOW_PRINTK("not a shadow fault\n");
3059 reset_early_unshadow(v);
3060 goto propagate;
3063 /* It's possible that the guest has put pagetables in memory that it has
3064 * already used for some special purpose (ioreq pages, or granted pages).
3065 * If that happens we'll have killed the guest already but it's still not
3066 * safe to propagate entries out of the guest PT so get out now. */
3067 if ( unlikely(d->is_shutting_down) )
3069 SHADOW_PRINTK("guest is shutting down\n");
3070 goto propagate;
3073 /* What kind of access are we dealing with? */
3074 ft = ((regs->error_code & PFEC_write_access)
3075 ? ft_demand_write : ft_demand_read);
3077 /* What mfn is the guest trying to access? */
3078 gfn = guest_l1e_get_gfn(gw.l1e);
3079 gmfn = gfn_to_mfn(d, gfn, &p2mt);
3081 if ( shadow_mode_refcounts(d) &&
3082 (!p2m_is_valid(p2mt) || (!p2m_is_mmio(p2mt) && !mfn_valid(gmfn))) )
3084 perfc_incr(shadow_fault_bail_bad_gfn);
3085 SHADOW_PRINTK("BAD gfn=%"SH_PRI_gfn" gmfn=%"PRI_mfn"\n",
3086 gfn_x(gfn), mfn_x(gmfn));
3087 reset_early_unshadow(v);
3088 goto propagate;
3091 #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
3092 /* Remember this successful VA->GFN translation for later. */
3093 vtlb_insert(v, va >> PAGE_SHIFT, gfn_x(gfn),
3094 regs->error_code | PFEC_page_present);
3095 #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */
3097 shadow_lock(d);
3099 TRACE_CLEAR_PATH_FLAGS;
3101 rc = gw_remove_write_accesses(v, va, &gw);
3103 /* First bit set: Removed write access to a page. */
3104 if ( rc & GW_RMWR_FLUSHTLB )
3106 /* Write permission removal is also a hint that other gwalks
3107 * overlapping with this one may be inconsistent
3108 */
3109 perfc_incr(shadow_rm_write_flush_tlb);
3110 atomic_inc(&d->arch.paging.shadow.gtable_dirty_version);
3111 flush_tlb_mask(d->domain_dirty_cpumask);
3114 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
3115 /* Second bit set: Resynced a page. Re-walk needed. */
3116 if ( rc & GW_RMWR_REWALK )
3118 shadow_unlock(d);
3119 goto rewalk;
3121 #endif /* OOS */
3123 if ( !shadow_check_gwalk(v, va, &gw, version) )
3125 perfc_incr(shadow_inconsistent_gwalk);
3126 shadow_unlock(d);
3127 goto rewalk;
3130 shadow_audit_tables(v);
3131 sh_audit_gw(v, &gw);
3133 /* Make sure there is enough free shadow memory to build a chain of
3134 * shadow tables. (We never allocate a top-level shadow on this path,
3135 * only a 32b l1, pae l1, or 64b l3+2+1. Note that while
3136 * SH_type_l1_shadow isn't correct in the latter case, all page
3137 * tables are the same size there.) */
3138 shadow_prealloc(d,
3139 SH_type_l1_shadow,
3140 GUEST_PAGING_LEVELS < 4 ? 1 : GUEST_PAGING_LEVELS - 1);
3142 /* Acquire the shadow. This must happen before we figure out the rights
3143 * for the shadow entry, since we might promote a page here. */
3144 ptr_sl1e = shadow_get_and_create_l1e(v, &gw, &sl1mfn, ft);
3145 if ( unlikely(ptr_sl1e == NULL) )
3147 /* Couldn't get the sl1e! Since we know the guest entries
3148 * are OK, this can only have been caused by a failed
3149 * shadow_set_l*e(), which will have crashed the guest.
3150 * Get out of the fault handler immediately. */
3151 ASSERT(d->is_shutting_down);
3152 shadow_unlock(d);
3153 trace_shadow_gen(TRC_SHADOW_DOMF_DYING, va);
3154 return 0;
3157 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
3158 /* Always unsync when writing to L1 page tables. */
3159 if ( sh_mfn_is_a_page_table(gmfn)
3160 && ft == ft_demand_write )
3161 sh_unsync(v, gmfn);
3163 if ( unlikely(d->is_shutting_down) )
3165 /* We might end up with a crashed domain here if
3166 * sh_remove_shadows() in a previous sh_resync() call has
3167 * failed. We cannot safely continue since some page is still
3168 * OOS but not in the hash table anymore. */
3169 shadow_unlock(d);
3170 return 0;
3172 #endif /* OOS */
3174 /* Calculate the shadow entry and write it */
3175 l1e_propagate_from_guest(v, gw.l1e, gmfn, &sl1e, ft, p2mt);
3176 r = shadow_set_l1e(v, ptr_sl1e, sl1e, sl1mfn);
3178 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
3179 if ( mfn_valid(gw.l1mfn)
3180 && mfn_is_out_of_sync(gw.l1mfn) )
3182 /* Update the OOS snapshot. */
3183 mfn_t snpmfn = oos_snapshot_lookup(v, gw.l1mfn);
3184 guest_l1e_t *snp;
3186 ASSERT(mfn_valid(snpmfn));
3188 snp = sh_map_domain_page(snpmfn);
3189 snp[guest_l1_table_offset(va)] = gw.l1e;
3190 sh_unmap_domain_page(snp);
3192 #endif /* OOS */
3194 #if SHADOW_OPTIMIZATIONS & SHOPT_PREFETCH
3195 /* Prefetch some more shadow entries */
3196 sh_prefetch(v, &gw, ptr_sl1e, sl1mfn);
3197 #endif
3199 /* Need to emulate accesses to page tables */
3200 if ( sh_mfn_is_a_page_table(gmfn)
3201 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
3202 /* Unless they've been allowed to go out of sync with their
3203 shadows and we don't need to unshadow it. */
3204 && !(mfn_is_out_of_sync(gmfn)
3205 && !(regs->error_code & PFEC_user_mode))
3206 #endif
3209 if ( ft == ft_demand_write )
3211 perfc_incr(shadow_fault_emulate_write);
3212 goto emulate;
3214 else if ( shadow_mode_trap_reads(d) && ft == ft_demand_read )
3216 perfc_incr(shadow_fault_emulate_read);
3217 goto emulate;
3221 /* Need to hand off device-model MMIO to the device model */
3222 if ( p2mt == p2m_mmio_dm )
3224 gpa = guest_walk_to_gpa(&gw);
3225 goto mmio;
3228 /* Log attempts to write to read-only memory */
3229 if ( (p2mt == p2m_ram_ro) && (ft == ft_demand_write) )
3231 static unsigned long lastpage = 0;
3232 if ( xchg(&lastpage, va & PAGE_MASK) != (va & PAGE_MASK) )
3233 gdprintk(XENLOG_DEBUG, "guest attempted write to read-only memory"
3234 " page. va page=%#lx, mfn=%#lx\n",
3235 va & PAGE_MASK, mfn_x(gmfn));
3236 goto emulate_readonly; /* skip over the instruction */
3239 /* In HVM guests, we force CR0.WP always to be set, so that the
3240 * pagetables are always write-protected. If the guest thinks
3241 * CR0.WP is clear, we must emulate faulting supervisor writes to
3242 * allow the guest to write through read-only PTEs. Emulate if the
3243 * fault was a non-user write to a present page. */
3244 if ( is_hvm_domain(d)
3245 && unlikely(!hvm_wp_enabled(v))
3246 && regs->error_code == (PFEC_write_access|PFEC_page_present) )
3248 perfc_incr(shadow_fault_emulate_wp);
3249 goto emulate;
3252 perfc_incr(shadow_fault_fixed);
3253 d->arch.paging.log_dirty.fault_count++;
3254 reset_early_unshadow(v);
3256 trace_shadow_fixup(gw.l1e, va);
3257 done:
3258 sh_audit_gw(v, &gw);
3259 SHADOW_PRINTK("fixed\n");
3260 shadow_audit_tables(v);
3261 shadow_unlock(d);
3262 return EXCRET_fault_fixed;
3264 emulate:
3265 if ( !shadow_mode_refcounts(d) || !guest_mode(regs) )
3266 goto not_a_shadow_fault;
3268 /*
3269 * We do not emulate user writes. Instead we use them as a hint that the
3270 * page is no longer a page table. This behaviour differs from native, but
3271 * it seems very unlikely that any OS grants user access to page tables.
3272 */
3273 if ( (regs->error_code & PFEC_user_mode) )
3275 SHADOW_PRINTK("user-mode fault to PT, unshadowing mfn %#lx\n",
3276 mfn_x(gmfn));
3277 perfc_incr(shadow_fault_emulate_failed);
3278 sh_remove_shadows(v, gmfn, 0 /* thorough */, 1 /* must succeed */);
3279 trace_shadow_emulate_other(TRC_SHADOW_EMULATE_UNSHADOW_USER,
3280 va, gfn);
3281 goto done;
3284 /*
3285 * Write from userspace to ro-mem needs to jump here to avoid getting
3286 * caught by user-mode page-table check above.
3287 */
3288 emulate_readonly:
3289 /*
3290 * We don't need to hold the lock for the whole emulation; we will
3291 * take it again when we write to the pagetables.
3292 */
3293 sh_audit_gw(v, &gw);
3294 shadow_audit_tables(v);
3295 shadow_unlock(d);
3297 this_cpu(trace_emulate_write_val) = 0;
3299 #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
3300 early_emulation:
3301 #endif
3302 if ( is_hvm_domain(d) )
3304 /*
3305 * If we are in the middle of injecting an exception or interrupt then
3306 * we should not emulate: it is not the instruction at %eip that caused
3307 * the fault. Furthermore it is almost certainly the case the handler
3308 * stack is currently considered to be a page table, so we should
3309 * unshadow the faulting page before exiting.
3310 */
3311 if ( unlikely(hvm_event_pending(v)) )
3313 #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
3314 if ( fast_emul )
3316 perfc_incr(shadow_fault_fast_emulate_fail);
3317 v->arch.paging.last_write_emul_ok = 0;
3319 #endif
3320 gdprintk(XENLOG_DEBUG, "write to pagetable during event "
3321 "injection: cr2=%#lx, mfn=%#lx\n",
3322 va, mfn_x(gmfn));
3323 sh_remove_shadows(v, gmfn, 0 /* thorough */, 1 /* must succeed */);
3324 trace_shadow_emulate_other(TRC_SHADOW_EMULATE_UNSHADOW_EVTINJ,
3325 va, gfn);
3326 return EXCRET_fault_fixed;
3330 SHADOW_PRINTK("emulate: eip=%#lx esp=%#lx\n",
3331 (unsigned long)regs->eip, (unsigned long)regs->esp);
3333 emul_ops = shadow_init_emulation(&emul_ctxt, regs);
3335 r = x86_emulate(&emul_ctxt.ctxt, emul_ops);
3337 /*
3338 * NB. We do not unshadow on X86EMUL_EXCEPTION. It's not clear that it
3339 * would be a good unshadow hint. If we *do* decide to unshadow-on-fault
3340 * then it must be 'failable': we cannot require the unshadow to succeed.
3341 */
3342 if ( r == X86EMUL_UNHANDLEABLE )
3344 perfc_incr(shadow_fault_emulate_failed);
3345 #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
3346 if ( fast_emul )
3348 perfc_incr(shadow_fault_fast_emulate_fail);
3349 v->arch.paging.last_write_emul_ok = 0;
3351 #endif
3352 SHADOW_PRINTK("emulator failure, unshadowing mfn %#lx\n",
3353 mfn_x(gmfn));
3354 /* If this is actually a page table, then we have a bug, and need
3355 * to support more operations in the emulator. More likely,
3356 * though, this is a hint that this page should not be shadowed. */
3357 shadow_remove_all_shadows(v, gmfn);
3359 trace_shadow_emulate_other(TRC_SHADOW_EMULATE_UNSHADOW_UNHANDLED,
3360 va, gfn);
3361 goto emulate_done;
3364 #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
3365 /* Record successfully emulated information as heuristics to next
3366 * fault on same frame for acceleration. But be careful to verify
3367 * its attribute still as page table, or else unshadow triggered
3368 * in write emulation normally requires a re-sync with guest page
3369 * table to recover r/w permission. Incorrect record for such case
3370 * will cause unexpected more shadow faults due to propagation is
3371 * skipped.
3372 */
3373 if ( (r == X86EMUL_OKAY) && sh_mfn_is_a_page_table(gmfn) )
3375 if ( !fast_emul )
3377 v->arch.paging.shadow.last_emulated_frame = va >> PAGE_SHIFT;
3378 v->arch.paging.shadow.last_emulated_mfn = mfn_x(gmfn);
3379 v->arch.paging.last_write_emul_ok = 1;
3382 else if ( fast_emul )
3383 v->arch.paging.last_write_emul_ok = 0;
3384 #endif
3386 #if GUEST_PAGING_LEVELS == 3 /* PAE guest */
3387 if ( r == X86EMUL_OKAY ) {
3388 int i, emulation_count=0;
3389 this_cpu(trace_emulate_initial_va) = va;
3390 /* Emulate up to four extra instructions in the hope of catching
3391 * the "second half" of a 64-bit pagetable write. */
3392 for ( i = 0 ; i < 4 ; i++ )
3394 shadow_continue_emulation(&emul_ctxt, regs);
3395 v->arch.paging.last_write_was_pt = 0;
3396 r = x86_emulate(&emul_ctxt.ctxt, emul_ops);
3397 if ( r == X86EMUL_OKAY )
3399 emulation_count++;
3400 if ( v->arch.paging.last_write_was_pt )
3402 perfc_incr(shadow_em_ex_pt);
3403 TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_EMULATION_2ND_PT_WRITTEN);
3404 break; /* Don't emulate past the other half of the write */
3406 else
3407 perfc_incr(shadow_em_ex_non_pt);
3409 else
3411 perfc_incr(shadow_em_ex_fail);
3412 TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_EMULATION_LAST_FAILED);
3413 break; /* Don't emulate again if we failed! */
3416 this_cpu(trace_extra_emulation_count)=emulation_count;
3418 #endif /* PAE guest */
3420 trace_shadow_emulate(gw.l1e, va);
3421 emulate_done:
3422 SHADOW_PRINTK("emulated\n");
3423 return EXCRET_fault_fixed;
3425 mmio:
3426 if ( !guest_mode(regs) )
3427 goto not_a_shadow_fault;
3428 perfc_incr(shadow_fault_mmio);
3429 sh_audit_gw(v, &gw);
3430 SHADOW_PRINTK("mmio %#"PRIpaddr"\n", gpa);
3431 shadow_audit_tables(v);
3432 reset_early_unshadow(v);
3433 shadow_unlock(d);
3434 trace_shadow_gen(TRC_SHADOW_MMIO, va);
3435 return (handle_mmio_with_translation(va, gpa >> PAGE_SHIFT)
3436 ? EXCRET_fault_fixed : 0);
3438 not_a_shadow_fault:
3439 sh_audit_gw(v, &gw);
3440 SHADOW_PRINTK("not a shadow fault\n");
3441 shadow_audit_tables(v);
3442 reset_early_unshadow(v);
3443 shadow_unlock(d);
3445 propagate:
3446 trace_not_shadow_fault(gw.l1e, va);
3448 return 0;
3452 static int
3453 sh_invlpg(struct vcpu *v, unsigned long va)
3454 /* Called when the guest requests an invlpg. Returns 1 if the invlpg
3455 * instruction should be issued on the hardware, or 0 if it's safe not
3456 * to do so. */
3458 mfn_t sl1mfn;
3459 shadow_l2e_t sl2e;
3461 perfc_incr(shadow_invlpg);
3463 #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
3464 /* No longer safe to use cached gva->gfn translations */
3465 vtlb_flush(v);
3466 #endif
3468 #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
3469 v->arch.paging.last_write_emul_ok = 0;
3470 #endif
3472 /* First check that we can safely read the shadow l2e. SMP/PAE linux can
3473 * run as high as 6% of invlpg calls where we haven't shadowed the l2
3474 * yet. */
3475 #if SHADOW_PAGING_LEVELS == 4
3477 shadow_l3e_t sl3e;
3478 if ( !(shadow_l4e_get_flags(
3479 sh_linear_l4_table(v)[shadow_l4_linear_offset(va)])
3480 & _PAGE_PRESENT) )
3481 return 0;
3482 /* This must still be a copy-from-user because we don't have the
3483 * shadow lock, and the higher-level shadows might disappear
3484 * under our feet. */
3485 if ( __copy_from_user(&sl3e, (sh_linear_l3_table(v)
3486 + shadow_l3_linear_offset(va)),
3487 sizeof (sl3e)) != 0 )
3489 perfc_incr(shadow_invlpg_fault);
3490 return 0;
3492 if ( (!shadow_l3e_get_flags(sl3e) & _PAGE_PRESENT) )
3493 return 0;
3495 #else /* SHADOW_PAGING_LEVELS == 3 */
3496 if ( !(l3e_get_flags(v->arch.paging.shadow.l3table[shadow_l3_linear_offset(va)])
3497 & _PAGE_PRESENT) )
3498 // no need to flush anything if there's no SL2...
3499 return 0;
3500 #endif
3502 /* This must still be a copy-from-user because we don't have the shadow
3503 * lock, and the higher-level shadows might disappear under our feet. */
3504 if ( __copy_from_user(&sl2e,
3505 sh_linear_l2_table(v) + shadow_l2_linear_offset(va),
3506 sizeof (sl2e)) != 0 )
3508 perfc_incr(shadow_invlpg_fault);
3509 return 0;
3512 // If there's nothing shadowed for this particular sl2e, then
3513 // there is no need to do an invlpg, either...
3514 //
3515 if ( !(shadow_l2e_get_flags(sl2e) & _PAGE_PRESENT) )
3516 return 0;
3518 // Check to see if the SL2 is a splintered superpage...
3519 // If so, then we'll need to flush the entire TLB (because that's
3520 // easier than invalidating all of the individual 4K pages).
3521 //
3522 sl1mfn = shadow_l2e_get_mfn(sl2e);
3523 if ( mfn_to_shadow_page(sl1mfn)->type
3524 == SH_type_fl1_shadow )
3526 flush_tlb_local();
3527 return 0;
3530 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
3531 /* Check to see if the SL1 is out of sync. */
3533 mfn_t gl1mfn = _mfn(mfn_to_shadow_page(sl1mfn)->backpointer);
3534 struct page_info *pg = mfn_to_page(gl1mfn);
3535 if ( mfn_valid(gl1mfn)
3536 && page_is_out_of_sync(pg) )
3538 /* The test above may give false positives, since we don't
3539 * hold the shadow lock yet. Check again with the lock held. */
3540 shadow_lock(v->domain);
3542 /* This must still be a copy-from-user because we didn't
3543 * have the shadow lock last time we checked, and the
3544 * higher-level shadows might have disappeared under our
3545 * feet. */
3546 if ( __copy_from_user(&sl2e,
3547 sh_linear_l2_table(v)
3548 + shadow_l2_linear_offset(va),
3549 sizeof (sl2e)) != 0 )
3551 perfc_incr(shadow_invlpg_fault);
3552 shadow_unlock(v->domain);
3553 return 0;
3556 if ( !(shadow_l2e_get_flags(sl2e) & _PAGE_PRESENT) )
3558 shadow_unlock(v->domain);
3559 return 0;
3562 sl1mfn = shadow_l2e_get_mfn(sl2e);
3563 gl1mfn = _mfn(mfn_to_shadow_page(sl1mfn)->backpointer);
3564 pg = mfn_to_page(gl1mfn);
3566 if ( likely(sh_mfn_is_a_page_table(gl1mfn)
3567 && page_is_out_of_sync(pg) ) )
3569 shadow_l1e_t *sl1;
3570 sl1 = sh_linear_l1_table(v) + shadow_l1_linear_offset(va);
3571 /* Remove the shadow entry that maps this VA */
3572 (void) shadow_set_l1e(v, sl1, shadow_l1e_empty(), sl1mfn);
3574 shadow_unlock(v->domain);
3575 /* Need the invlpg, to pick up the disappeareance of the sl1e */
3576 return 1;
3579 #endif
3581 return 1;
3585 static unsigned long
3586 sh_gva_to_gfn(struct vcpu *v, unsigned long va, uint32_t *pfec)
3587 /* Called to translate a guest virtual address to what the *guest*
3588 * pagetables would map it to. */
3590 walk_t gw;
3591 gfn_t gfn;
3593 #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
3594 /* Check the vTLB cache first */
3595 unsigned long vtlb_gfn = vtlb_lookup(v, va, pfec[0]);
3596 if ( VALID_GFN(vtlb_gfn) )
3597 return vtlb_gfn;
3598 #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */
3600 if ( sh_walk_guest_tables(v, va, &gw, pfec[0]) != 0 )
3602 if ( !(guest_l1e_get_flags(gw.l1e) & _PAGE_PRESENT) )
3603 pfec[0] &= ~PFEC_page_present;
3604 return INVALID_GFN;
3606 gfn = guest_walk_to_gfn(&gw);
3608 #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
3609 /* Remember this successful VA->GFN translation for later. */
3610 vtlb_insert(v, va >> PAGE_SHIFT, gfn_x(gfn), pfec[0]);
3611 #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */
3613 return gfn_x(gfn);
3617 static inline void
3618 sh_update_linear_entries(struct vcpu *v)
3619 /* Sync up all the linear mappings for this vcpu's pagetables */
3621 struct domain *d = v->domain;
3623 /* Linear pagetables in PV guests
3624 * ------------------------------
3626 * Guest linear pagetables, which map the guest pages, are at
3627 * LINEAR_PT_VIRT_START. Shadow linear pagetables, which map the
3628 * shadows, are at SH_LINEAR_PT_VIRT_START. Most of the time these
3629 * are set up at shadow creation time, but (of course!) the PAE case
3630 * is subtler. Normal linear mappings are made by having an entry
3631 * in the top-level table that points to itself (shadow linear) or
3632 * to the guest top-level table (guest linear). For PAE, to set up
3633 * a linear map requires us to copy the four top-level entries into
3634 * level-2 entries. That means that every time we change a PAE l3e,
3635 * we need to reflect the change into the copy.
3637 * Linear pagetables in HVM guests
3638 * -------------------------------
3640 * For HVM guests, the linear pagetables are installed in the monitor
3641 * tables (since we can't put them in the shadow). Shadow linear
3642 * pagetables, which map the shadows, are at SH_LINEAR_PT_VIRT_START,
3643 * and we use the linear pagetable slot at LINEAR_PT_VIRT_START for
3644 * a linear pagetable of the monitor tables themselves. We have
3645 * the same issue of having to re-copy PAE l3 entries whevever we use
3646 * PAE shadows.
3648 * Because HVM guests run on the same monitor tables regardless of the
3649 * shadow tables in use, the linear mapping of the shadow tables has to
3650 * be updated every time v->arch.shadow_table changes.
3651 */
3653 /* Don't try to update the monitor table if it doesn't exist */
3654 if ( shadow_mode_external(d)
3655 && pagetable_get_pfn(v->arch.monitor_table) == 0 )
3656 return;
3658 #if (CONFIG_PAGING_LEVELS == 4) && (SHADOW_PAGING_LEVELS == 4)
3660 /* For PV, one l4e points at the guest l4, one points at the shadow
3661 * l4. No maintenance required.
3662 * For HVM, just need to update the l4e that points to the shadow l4. */
3664 if ( shadow_mode_external(d) )
3666 /* Use the linear map if we can; otherwise make a new mapping */
3667 if ( v == current )
3669 __linear_l4_table[l4_linear_offset(SH_LINEAR_PT_VIRT_START)] =
3670 l4e_from_pfn(pagetable_get_pfn(v->arch.shadow_table[0]),
3671 __PAGE_HYPERVISOR);
3673 else
3675 l4_pgentry_t *ml4e;
3676 ml4e = sh_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
3677 ml4e[l4_table_offset(SH_LINEAR_PT_VIRT_START)] =
3678 l4e_from_pfn(pagetable_get_pfn(v->arch.shadow_table[0]),
3679 __PAGE_HYPERVISOR);
3680 sh_unmap_domain_page(ml4e);
3684 #elif (CONFIG_PAGING_LEVELS == 4) && (SHADOW_PAGING_LEVELS == 3)
3686 /* PV: XXX
3688 * HVM: To give ourselves a linear map of the shadows, we need to
3689 * extend a PAE shadow to 4 levels. We do this by having a monitor
3690 * l3 in slot 0 of the monitor l4 table, and copying the PAE l3
3691 * entries into it. Then, by having the monitor l4e for shadow
3692 * pagetables also point to the monitor l4, we can use it to access
3693 * the shadows.
3694 */
3696 if ( shadow_mode_external(d) )
3698 /* Install copies of the shadow l3es into the monitor l2 table
3699 * that maps SH_LINEAR_PT_VIRT_START. */
3700 shadow_l3e_t *sl3e;
3701 l2_pgentry_t *ml2e;
3702 int i;
3704 /* Use linear mappings if we can; otherwise make new mappings */
3705 if ( v == current )
3706 ml2e = __linear_l2_table
3707 + l2_linear_offset(SH_LINEAR_PT_VIRT_START);
3708 else
3710 mfn_t l3mfn, l2mfn;
3711 l4_pgentry_t *ml4e;
3712 l3_pgentry_t *ml3e;
3713 int linear_slot = shadow_l4_table_offset(SH_LINEAR_PT_VIRT_START);
3714 ml4e = sh_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
3716 ASSERT(l4e_get_flags(ml4e[linear_slot]) & _PAGE_PRESENT);
3717 l3mfn = _mfn(l4e_get_pfn(ml4e[linear_slot]));
3718 ml3e = sh_map_domain_page(l3mfn);
3719 sh_unmap_domain_page(ml4e);
3721 ASSERT(l3e_get_flags(ml3e[0]) & _PAGE_PRESENT);
3722 l2mfn = _mfn(l3e_get_pfn(ml3e[0]));
3723 ml2e = sh_map_domain_page(l2mfn);
3724 sh_unmap_domain_page(ml3e);
3727 /* Shadow l3 tables are made up by sh_update_cr3 */
3728 sl3e = v->arch.paging.shadow.l3table;
3730 for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
3732 ml2e[i] =
3733 (shadow_l3e_get_flags(sl3e[i]) & _PAGE_PRESENT)
3734 ? l2e_from_pfn(mfn_x(shadow_l3e_get_mfn(sl3e[i])),
3735 __PAGE_HYPERVISOR)
3736 : l2e_empty();
3739 if ( v != current )
3740 sh_unmap_domain_page(ml2e);
3742 else
3743 domain_crash(d); /* XXX */
3745 #elif CONFIG_PAGING_LEVELS == 3
3747 /* PV: need to copy the guest's l3 entries into the guest-linear-map l2
3748 * entries in the shadow, and the shadow's l3 entries into the
3749 * shadow-linear-map l2 entries in the shadow. This is safe to do
3750 * because Xen does not let guests share high-slot l2 tables between l3s,
3751 * so we know we're not treading on anyone's toes.
3753 * HVM: need to copy the shadow's l3 entries into the
3754 * shadow-linear-map l2 entries in the monitor table. This is safe
3755 * because we have one monitor table for each vcpu. The monitor's
3756 * own l3es don't need to be copied because they never change.
3757 * XXX That might change if we start stuffing things into the rest
3758 * of the monitor's virtual address space.
3759 */
3761 l2_pgentry_t *l2e, new_l2e;
3762 shadow_l3e_t *guest_l3e = NULL, *shadow_l3e;
3763 int i;
3764 int unmap_l2e = 0;
3766 #if GUEST_PAGING_LEVELS == 2
3768 /* Shadow l3 tables were built by sh_update_cr3 */
3769 BUG_ON(!shadow_mode_external(d)); /* PV 2-on-3 is unsupported */
3770 shadow_l3e = (shadow_l3e_t *)&v->arch.paging.shadow.l3table;
3772 #else /* GUEST_PAGING_LEVELS == 3 */
3774 shadow_l3e = (shadow_l3e_t *)&v->arch.paging.shadow.l3table;
3775 guest_l3e = (guest_l3e_t *)&v->arch.paging.shadow.gl3e;
3777 #endif /* GUEST_PAGING_LEVELS */
3779 /* Choose where to write the entries, using linear maps if possible */
3780 if ( shadow_mode_external(d) )
3782 if ( v == current )
3784 /* From the monitor tables, it's safe to use linear maps
3785 * to update monitor l2s */
3786 l2e = __linear_l2_table + (3 * L2_PAGETABLE_ENTRIES);
3788 else
3790 /* Map the monitor table's high l2 */
3791 l3_pgentry_t *l3e;
3792 l3e = sh_map_domain_page(
3793 pagetable_get_mfn(v->arch.monitor_table));
3794 ASSERT(l3e_get_flags(l3e[3]) & _PAGE_PRESENT);
3795 l2e = sh_map_domain_page(_mfn(l3e_get_pfn(l3e[3])));
3796 unmap_l2e = 1;
3797 sh_unmap_domain_page(l3e);
3800 else
3802 /* Map the shadow table's high l2 */
3803 ASSERT(shadow_l3e_get_flags(shadow_l3e[3]) & _PAGE_PRESENT);
3804 l2e = sh_map_domain_page(shadow_l3e_get_mfn(shadow_l3e[3]));
3805 unmap_l2e = 1;
3808 /* Write linear mapping of guest (only in PV, and only when
3809 * not translated). */
3810 if ( !shadow_mode_translate(d) )
3812 for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
3814 new_l2e =
3815 ((shadow_l3e_get_flags(guest_l3e[i]) & _PAGE_PRESENT)
3816 ? l2e_from_pfn(mfn_x(shadow_l3e_get_mfn(guest_l3e[i])),
3817 __PAGE_HYPERVISOR)
3818 : l2e_empty());
3819 safe_write_entry(
3820 &l2e[l2_table_offset(LINEAR_PT_VIRT_START) + i],
3821 &new_l2e);
3825 /* Write linear mapping of shadow. */
3826 for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
3828 new_l2e = (shadow_l3e_get_flags(shadow_l3e[i]) & _PAGE_PRESENT)
3829 ? l2e_from_pfn(mfn_x(shadow_l3e_get_mfn(shadow_l3e[i])),
3830 __PAGE_HYPERVISOR)
3831 : l2e_empty();
3832 safe_write_entry(
3833 &l2e[l2_table_offset(SH_LINEAR_PT_VIRT_START) + i],
3834 &new_l2e);
3837 if ( unmap_l2e )
3838 sh_unmap_domain_page(l2e);
3841 #else
3842 #error this should not happen
3843 #endif
3845 if ( shadow_mode_external(d) )
3847 /*
3848 * Having modified the linear pagetable mapping, flush local host TLBs.
3849 * This was not needed when vmenter/vmexit always had the side effect
3850 * of flushing host TLBs but, with ASIDs, it is possible to finish
3851 * this CR3 update, vmenter the guest, vmexit due to a page fault,
3852 * without an intervening host TLB flush. Then the page fault code
3853 * could use the linear pagetable to read a top-level shadow page
3854 * table entry. But, without this change, it would fetch the wrong
3855 * value due to a stale TLB.
3856 */
3857 flush_tlb_local();
3862 /* Removes vcpu->arch.paging.shadow.guest_vtable and vcpu->arch.shadow_table[].
3863 * Does all appropriate management/bookkeeping/refcounting/etc...
3864 */
3865 static void
3866 sh_detach_old_tables(struct vcpu *v)
3868 mfn_t smfn;
3869 int i = 0;
3871 ////
3872 //// vcpu->arch.paging.shadow.guest_vtable
3873 ////
3875 #if GUEST_PAGING_LEVELS == 3
3876 /* PAE guests don't have a mapping of the guest top-level table */
3877 ASSERT(v->arch.paging.shadow.guest_vtable == NULL);
3878 #else
3879 if ( v->arch.paging.shadow.guest_vtable )
3881 struct domain *d = v->domain;
3882 if ( shadow_mode_external(d) || shadow_mode_translate(d) )
3883 sh_unmap_domain_page_global(v->arch.paging.shadow.guest_vtable);
3884 v->arch.paging.shadow.guest_vtable = NULL;
3886 #endif // !NDEBUG
3889 ////
3890 //// vcpu->arch.shadow_table[]
3891 ////
3893 #if GUEST_PAGING_LEVELS == 3
3894 /* PAE guests have four shadow_table entries */
3895 for ( i = 0 ; i < 4 ; i++ )
3896 #endif
3898 smfn = pagetable_get_mfn(v->arch.shadow_table[i]);
3899 if ( mfn_x(smfn) )
3900 sh_put_ref(v, smfn, 0);
3901 v->arch.shadow_table[i] = pagetable_null();
3905 /* Set up the top-level shadow and install it in slot 'slot' of shadow_table */
3906 static void
3907 sh_set_toplevel_shadow(struct vcpu *v,
3908 int slot,
3909 mfn_t gmfn,
3910 unsigned int root_type)
3912 mfn_t smfn;
3913 pagetable_t old_entry, new_entry;
3915 struct domain *d = v->domain;
3917 /* Remember the old contents of this slot */
3918 old_entry = v->arch.shadow_table[slot];
3920 /* Now figure out the new contents: is this a valid guest MFN? */
3921 if ( !mfn_valid(gmfn) )
3923 new_entry = pagetable_null();
3924 goto install_new_entry;
3927 /* Guest mfn is valid: shadow it and install the shadow */
3928 smfn = get_shadow_status(v, gmfn, root_type);
3929 if ( !mfn_valid(smfn) )
3931 /* Make sure there's enough free shadow memory. */
3932 shadow_prealloc(d, root_type, 1);
3933 /* Shadow the page. */
3934 smfn = sh_make_shadow(v, gmfn, root_type);
3936 ASSERT(mfn_valid(smfn));
3938 /* Pin the shadow and put it (back) on the list of pinned shadows */
3939 if ( sh_pin(v, smfn) == 0 )
3941 SHADOW_ERROR("can't pin %#lx as toplevel shadow\n", mfn_x(smfn));
3942 domain_crash(v->domain);
3945 /* Take a ref to this page: it will be released in sh_detach_old_tables()
3946 * or the next call to set_toplevel_shadow() */
3947 if ( !sh_get_ref(v, smfn, 0) )
3949 SHADOW_ERROR("can't install %#lx as toplevel shadow\n", mfn_x(smfn));
3950 domain_crash(v->domain);
3953 new_entry = pagetable_from_mfn(smfn);
3955 install_new_entry:
3956 /* Done. Install it */
3957 SHADOW_PRINTK("%u/%u [%u] gmfn %#"PRI_mfn" smfn %#"PRI_mfn"\n",
3958 GUEST_PAGING_LEVELS, SHADOW_PAGING_LEVELS, slot,
3959 mfn_x(gmfn), mfn_x(pagetable_get_mfn(new_entry)));
3960 v->arch.shadow_table[slot] = new_entry;
3962 /* Decrement the refcount of the old contents of this slot */
3963 if ( !pagetable_is_null(old_entry) ) {
3964 mfn_t old_smfn = pagetable_get_mfn(old_entry);
3965 /* Need to repin the old toplevel shadow if it's been unpinned
3966 * by shadow_prealloc(): in PV mode we're still running on this
3967 * shadow and it's not safe to free it yet. */
3968 if ( !mfn_to_shadow_page(old_smfn)->pinned && !sh_pin(v, old_smfn) )
3970 SHADOW_ERROR("can't re-pin %#lx\n", mfn_x(old_smfn));
3971 domain_crash(v->domain);
3973 sh_put_ref(v, old_smfn, 0);
3978 static void
3979 sh_update_cr3(struct vcpu *v, int do_locking)
3980 /* Updates vcpu->arch.cr3 after the guest has changed CR3.
3981 * Paravirtual guests should set v->arch.guest_table (and guest_table_user,
3982 * if appropriate).
3983 * HVM guests should also make sure hvm_get_guest_cntl_reg(v, 3) works;
3984 * this function will call hvm_update_guest_cr(v, 3) to tell them where the
3985 * shadow tables are.
3986 * If do_locking != 0, assume we are being called from outside the
3987 * shadow code, and must take and release the shadow lock; otherwise
3988 * that is the caller's responsibility.
3989 */
3991 struct domain *d = v->domain;
3992 mfn_t gmfn;
3993 #if GUEST_PAGING_LEVELS == 3
3994 guest_l3e_t *gl3e;
3995 u32 guest_idx=0;
3996 int i;
3997 #endif
3999 /* Don't do anything on an uninitialised vcpu */
4000 if ( !is_hvm_domain(d) && !v->is_initialised )
4002 ASSERT(v->arch.cr3 == 0);
4003 return;
4006 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
4007 /* Need to resync all the shadow entries on a TLB flush. Resync
4008 * current vcpus OOS pages before switching to the new shadow
4009 * tables so that the VA hint is still valid. */
4010 shadow_resync_current_vcpu(v, do_locking);
4011 #endif
4013 if ( do_locking ) shadow_lock(v->domain);
4015 ASSERT(shadow_locked_by_me(v->domain));
4016 ASSERT(v->arch.paging.mode);
4018 ////
4019 //// vcpu->arch.guest_table is already set
4020 ////
4022 #ifndef NDEBUG
4023 /* Double-check that the HVM code has sent us a sane guest_table */
4024 if ( is_hvm_domain(d) )
4026 ASSERT(shadow_mode_external(d));
4027 if ( hvm_paging_enabled(v) )
4028 ASSERT(pagetable_get_pfn(v->arch.guest_table));
4029 else
4030 ASSERT(v->arch.guest_table.pfn
4031 == d->arch.paging.shadow.unpaged_pagetable.pfn);
4033 #endif
4035 SHADOW_PRINTK("d=%u v=%u guest_table=%05lx\n",
4036 d->domain_id, v->vcpu_id,
4037 (unsigned long)pagetable_get_pfn(v->arch.guest_table));
4039 #if GUEST_PAGING_LEVELS == 4
4040 if ( !(v->arch.flags & TF_kernel_mode) && !is_pv_32on64_vcpu(v) )
4041 gmfn = pagetable_get_mfn(v->arch.guest_table_user);
4042 else
4043 #endif
4044 gmfn = pagetable_get_mfn(v->arch.guest_table);
4047 ////
4048 //// vcpu->arch.paging.shadow.guest_vtable
4049 ////
4050 #if GUEST_PAGING_LEVELS == 4
4051 if ( shadow_mode_external(d) || shadow_mode_translate(d) )
4053 if ( v->arch.paging.shadow.guest_vtable )
4054 sh_unmap_domain_page_global(v->arch.paging.shadow.guest_vtable);
4055 v->arch.paging.shadow.guest_vtable = sh_map_domain_page_global(gmfn);
4056 /* PAGING_LEVELS==4 implies 64-bit, which means that
4057 * map_domain_page_global can't fail */
4058 BUG_ON(v->arch.paging.shadow.guest_vtable == NULL);
4060 else
4061 v->arch.paging.shadow.guest_vtable = __linear_l4_table;
4062 #elif GUEST_PAGING_LEVELS == 3
4063 /* On PAE guests we don't use a mapping of the guest's own top-level
4064 * table. We cache the current state of that table and shadow that,
4065 * until the next CR3 write makes us refresh our cache. */
4066 ASSERT(v->arch.paging.shadow.guest_vtable == NULL);
4068 if ( shadow_mode_external(d) )
4069 /* Find where in the page the l3 table is */
4070 guest_idx = guest_index((void *)v->arch.hvm_vcpu.guest_cr[3]);
4071 else
4072 /* PV guest: l3 is at the start of a page */
4073 guest_idx = 0;
4075 // Ignore the low 2 bits of guest_idx -- they are really just
4076 // cache control.
4077 guest_idx &= ~3;
4079 gl3e = ((guest_l3e_t *)sh_map_domain_page(gmfn)) + guest_idx;
4080 for ( i = 0; i < 4 ; i++ )
4081 v->arch.paging.shadow.gl3e[i] = gl3e[i];
4082 sh_unmap_domain_page(gl3e);
4083 #elif GUEST_PAGING_LEVELS == 2
4084 if ( shadow_mode_external(d) || shadow_mode_translate(d) )
4086 if ( v->arch.paging.shadow.guest_vtable )
4087 sh_unmap_domain_page_global(v->arch.paging.shadow.guest_vtable);
4088 v->arch.paging.shadow.guest_vtable = sh_map_domain_page_global(gmfn);
4089 /* Does this really need map_domain_page_global? Handle the
4090 * error properly if so. */
4091 BUG_ON(v->arch.paging.shadow.guest_vtable == NULL); /* XXX */
4093 else
4094 v->arch.paging.shadow.guest_vtable = __linear_l2_table;
4095 #else
4096 #error this should never happen
4097 #endif
4100 ////
4101 //// vcpu->arch.shadow_table[]
4102 ////
4104 /* We revoke write access to the new guest toplevel page(s) before we
4105 * replace the old shadow pagetable(s), so that we can safely use the
4106 * (old) shadow linear maps in the writeable mapping heuristics. */
4107 #if GUEST_PAGING_LEVELS == 2
4108 if ( sh_remove_write_access(v, gmfn, 2, 0) != 0 )
4109 flush_tlb_mask(v->domain->domain_dirty_cpumask);
4110 sh_set_toplevel_shadow(v, 0, gmfn, SH_type_l2_shadow);
4111 #elif GUEST_PAGING_LEVELS == 3
4112 /* PAE guests have four shadow_table entries, based on the
4113 * current values of the guest's four l3es. */
4115 int flush = 0;
4116 gfn_t gl2gfn;
4117 mfn_t gl2mfn;
4118 p2m_type_t p2mt;
4119 guest_l3e_t *gl3e = (guest_l3e_t*)&v->arch.paging.shadow.gl3e;
4120 /* First, make all four entries read-only. */
4121 for ( i = 0; i < 4; i++ )
4123 if ( guest_l3e_get_flags(gl3e[i]) & _PAGE_PRESENT )
4125 gl2gfn = guest_l3e_get_gfn(gl3e[i]);
4126 gl2mfn = gfn_to_mfn(d, gl2gfn, &p2mt);
4127 if ( p2m_is_ram(p2mt) )
4128 flush |= sh_remove_write_access(v, gl2mfn, 2, 0);
4131 if ( flush )
4132 flush_tlb_mask(v->domain->domain_dirty_cpumask);
4133 /* Now install the new shadows. */
4134 for ( i = 0; i < 4; i++ )
4136 if ( guest_l3e_get_flags(gl3e[i]) & _PAGE_PRESENT )
4138 gl2gfn = guest_l3e_get_gfn(gl3e[i]);
4139 gl2mfn = gfn_to_mfn(d, gl2gfn, &p2mt);
4140 if ( p2m_is_ram(p2mt) )
4141 sh_set_toplevel_shadow(v, i, gl2mfn, (i == 3)
4142 ? SH_type_l2h_shadow
4143 : SH_type_l2_shadow);
4144 else
4145 sh_set_toplevel_shadow(v, i, _mfn(INVALID_MFN), 0);
4147 else
4148 sh_set_toplevel_shadow(v, i, _mfn(INVALID_MFN), 0);
4151 #elif GUEST_PAGING_LEVELS == 4
4152 if ( sh_remove_write_access(v, gmfn, 4, 0) != 0 )
4153 flush_tlb_mask(v->domain->domain_dirty_cpumask);
4154 sh_set_toplevel_shadow(v, 0, gmfn, SH_type_l4_shadow);
4155 #else
4156 #error This should never happen
4157 #endif
4160 ///
4161 /// v->arch.paging.shadow.l3table
4162 ///
4163 #if SHADOW_PAGING_LEVELS == 3
4165 mfn_t smfn;
4166 int i;
4167 for ( i = 0; i < 4; i++ )
4169 #if GUEST_PAGING_LEVELS == 2
4170 /* 2-on-3: make a PAE l3 that points at the four-page l2 */
4171 smfn = _mfn(pagetable_get_pfn(v->arch.shadow_table[0]) + i);
4172 #else
4173 /* 3-on-3: make a PAE l3 that points at the four l2 pages */
4174 smfn = pagetable_get_mfn(v->arch.shadow_table[i]);
4175 #endif
4176 v->arch.paging.shadow.l3table[i] =
4177 (mfn_x(smfn) == 0)
4178 ? shadow_l3e_empty()
4179 : shadow_l3e_from_mfn(smfn, _PAGE_PRESENT);
4182 #endif /* SHADOW_PAGING_LEVELS == 3 */
4185 ///
4186 /// v->arch.cr3
4187 ///
4188 if ( shadow_mode_external(d) )
4190 make_cr3(v, pagetable_get_pfn(v->arch.monitor_table));
4192 else // not shadow_mode_external...
4194 /* We don't support PV except guest == shadow == config levels */
4195 BUG_ON(GUEST_PAGING_LEVELS != SHADOW_PAGING_LEVELS);
4196 #if SHADOW_PAGING_LEVELS == 3
4197 /* 2-on-3 or 3-on-3: Use the PAE shadow l3 table we just fabricated.
4198 * Don't use make_cr3 because (a) we know it's below 4GB, and
4199 * (b) it's not necessarily page-aligned, and make_cr3 takes a pfn */
4200 ASSERT(virt_to_maddr(&v->arch.paging.shadow.l3table) <= 0xffffffe0ULL);
4201 v->arch.cr3 = virt_to_maddr(&v->arch.paging.shadow.l3table);
4202 #else
4203 /* 4-on-4: Just use the shadow top-level directly */
4204 make_cr3(v, pagetable_get_pfn(v->arch.shadow_table[0]));
4205 #endif
4209 ///
4210 /// v->arch.hvm_vcpu.hw_cr[3]
4211 ///
4212 if ( shadow_mode_external(d) )
4214 ASSERT(is_hvm_domain(d));
4215 #if SHADOW_PAGING_LEVELS == 3
4216 /* 2-on-3 or 3-on-3: Use the PAE shadow l3 table we just fabricated */
4217 v->arch.hvm_vcpu.hw_cr[3] =
4218 virt_to_maddr(&v->arch.paging.shadow.l3table);
4219 #else
4220 /* 4-on-4: Just use the shadow top-level directly */
4221 v->arch.hvm_vcpu.hw_cr[3] =
4222 pagetable_get_paddr(v->arch.shadow_table[0]);
4223 #endif
4224 hvm_update_guest_cr(v, 3);
4227 /* Fix up the linear pagetable mappings */
4228 sh_update_linear_entries(v);
4230 #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
4231 /* No longer safe to use cached gva->gfn translations */
4232 vtlb_flush(v);
4233 #endif
4235 #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
4236 v->arch.paging.last_write_emul_ok = 0;
4237 #endif
4239 /* Release the lock, if we took it (otherwise it's the caller's problem) */
4240 if ( do_locking ) shadow_unlock(v->domain);
4242 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
4243 /* Need to resync all the shadow entries on a TLB flush. We only
4244 * update the shadows, leaving the pages out of sync. Also, we try
4245 * to skip synchronization of shadows not mapped in the new
4246 * tables. */
4247 shadow_sync_other_vcpus(v, do_locking);
4248 #endif
4253 /**************************************************************************/
4254 /* Functions to revoke guest rights */
4256 #if SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC
4257 int sh_rm_write_access_from_sl1p(struct vcpu *v, mfn_t gmfn,
4258 mfn_t smfn, unsigned long off)
4260 int r;
4261 shadow_l1e_t *sl1p, sl1e;
4262 struct shadow_page_info *sp;
4264 ASSERT(mfn_valid(gmfn));
4265 ASSERT(mfn_valid(smfn));
4267 sp = mfn_to_shadow_page(smfn);
4269 if ( sp->mbz != 0
4270 || (sp->type != SH_type_l1_shadow
4271 && sp->type != SH_type_fl1_shadow) )
4272 goto fail;
4274 sl1p = sh_map_domain_page(smfn);
4275 sl1p += off;
4276 sl1e = *sl1p;
4277 if ( ((shadow_l1e_get_flags(sl1e) & (_PAGE_PRESENT|_PAGE_RW))
4278 != (_PAGE_PRESENT|_PAGE_RW))
4279 || (mfn_x(shadow_l1e_get_mfn(sl1e)) != mfn_x(gmfn)) )
4281 sh_unmap_domain_page(sl1p);
4282 goto fail;
4285 /* Found it! Need to remove its write permissions. */
4286 sl1e = shadow_l1e_remove_flags(sl1e, _PAGE_RW);
4287 r = shadow_set_l1e(v, sl1p, sl1e, smfn);
4288 ASSERT( !(r & SHADOW_SET_ERROR) );
4290 sh_unmap_domain_page(sl1p);
4291 perfc_incr(shadow_writeable_h_7);
4292 return 1;
4294 fail:
4295 perfc_incr(shadow_writeable_h_8);
4296 return 0;
4298 #endif /* OOS */
4300 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
4301 static int sh_guess_wrmap(struct vcpu *v, unsigned long vaddr, mfn_t gmfn)
4302 /* Look up this vaddr in the current shadow and see if it's a writeable
4303 * mapping of this gmfn. If so, remove it. Returns 1 if it worked. */
4305 shadow_l1e_t sl1e, *sl1p;
4306 shadow_l2e_t *sl2p;
4307 shadow_l3e_t *sl3p;
4308 #if SHADOW_PAGING_LEVELS >= 4
4309 shadow_l4e_t *sl4p;
4310 #endif
4311 mfn_t sl1mfn;
4312 int r;
4314 /* Carefully look in the shadow linear map for the l1e we expect */
4315 #if SHADOW_PAGING_LEVELS >= 4
4316 sl4p = sh_linear_l4_table(v) + shadow_l4_linear_offset(vaddr);
4317 if ( !(shadow_l4e_get_flags(*sl4p) & _PAGE_PRESENT) )
4318 return 0;
4319 sl3p = sh_linear_l3_table(v) + shadow_l3_linear_offset(vaddr);
4320 if ( !(shadow_l3e_get_flags(*sl3p) & _PAGE_PRESENT) )
4321 return 0;
4322 #else /* SHADOW_PAGING_LEVELS == 3 */
4323 sl3p = ((shadow_l3e_t *) v->arch.paging.shadow.l3table)
4324 + shadow_l3_linear_offset(vaddr);
4325 if ( !(shadow_l3e_get_flags(*sl3p) & _PAGE_PRESENT) )
4326 return 0;
4327 #endif
4328 sl2p = sh_linear_l2_table(v) + shadow_l2_linear_offset(vaddr);
4329 if ( !(shadow_l2e_get_flags(*sl2p) & _PAGE_PRESENT) )
4330 return 0;
4331 sl1p = sh_linear_l1_table(v) + shadow_l1_linear_offset(vaddr);
4332 sl1e = *sl1p;
4333 if ( ((shadow_l1e_get_flags(sl1e) & (_PAGE_PRESENT|_PAGE_RW))
4334 != (_PAGE_PRESENT|_PAGE_RW))
4335 || (mfn_x(shadow_l1e_get_mfn(sl1e)) != mfn_x(gmfn)) )
4336 return 0;
4338 /* Found it! Need to remove its write permissions. */
4339 sl1mfn = shadow_l2e_get_mfn(*sl2p);
4340 sl1e = shadow_l1e_remove_flags(sl1e, _PAGE_RW);
4341 r = shadow_set_l1e(v, sl1p, sl1e, sl1mfn);
4342 ASSERT( !(r & SHADOW_SET_ERROR) );
4343 TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_WRMAP_GUESS_FOUND);
4344 return 1;
4346 #endif
4348 int sh_rm_write_access_from_l1(struct vcpu *v, mfn_t sl1mfn,
4349 mfn_t readonly_mfn)
4350 /* Excises all writeable mappings to readonly_mfn from this l1 shadow table */
4352 shadow_l1e_t *sl1e;
4353 int done = 0;
4354 int flags;
4355 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
4356 mfn_t base_sl1mfn = sl1mfn; /* Because sl1mfn changes in the foreach */
4357 #endif
4359 SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, done,
4361 flags = shadow_l1e_get_flags(*sl1e);
4362 if ( (flags & _PAGE_PRESENT)
4363 && (flags & _PAGE_RW)
4364 && (mfn_x(shadow_l1e_get_mfn(*sl1e)) == mfn_x(readonly_mfn)) )
4366 shadow_l1e_t ro_sl1e = shadow_l1e_remove_flags(*sl1e, _PAGE_RW);
4367 (void) shadow_set_l1e(v, sl1e, ro_sl1e, sl1mfn);
4368 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
4369 /* Remember the last shadow that we shot a writeable mapping in */
4370 v->arch.paging.shadow.last_writeable_pte_smfn = mfn_x(base_sl1mfn);
4371 #endif
4372 if ( (mfn_to_page(readonly_mfn)->u.inuse.type_info
4373 & PGT_count_mask) == 0 )
4374 /* This breaks us cleanly out of the FOREACH macro */
4375 done = 1;
4377 });
4378 return done;
4382 int sh_rm_mappings_from_l1(struct vcpu *v, mfn_t sl1mfn, mfn_t target_mfn)
4383 /* Excises all mappings to guest frame from this shadow l1 table */
4385 shadow_l1e_t *sl1e;
4386 int done = 0;
4387 int flags;
4389 SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, done,
4391 flags = shadow_l1e_get_flags(*sl1e);
4392 if ( (flags & _PAGE_PRESENT)
4393 && (mfn_x(shadow_l1e_get_mfn(*sl1e)) == mfn_x(target_mfn)) )
4395 (void) shadow_set_l1e(v, sl1e, shadow_l1e_empty(), sl1mfn);
4396 if ( (mfn_to_page(target_mfn)->count_info & PGC_count_mask) == 0 )
4397 /* This breaks us cleanly out of the FOREACH macro */
4398 done = 1;
4400 });
4401 return done;
4404 /**************************************************************************/
4405 /* Functions to excise all pointers to shadows from higher-level shadows. */
4407 void sh_clear_shadow_entry(struct vcpu *v, void *ep, mfn_t smfn)
4408 /* Blank out a single shadow entry */
4410 switch ( mfn_to_shadow_page(smfn)->type )
4412 case SH_type_l1_shadow:
4413 (void) shadow_set_l1e(v, ep, shadow_l1e_empty(), smfn); break;
4414 case SH_type_l2_shadow:
4415 #if GUEST_PAGING_LEVELS >= 3
4416 case SH_type_l2h_shadow:
4417 #endif
4418 (void) shadow_set_l2e(v, ep, shadow_l2e_empty(), smfn); break;
4419 #if GUEST_PAGING_LEVELS >= 4
4420 case SH_type_l3_shadow:
4421 (void) shadow_set_l3e(v, ep, shadow_l3e_empty(), smfn); break;
4422 case SH_type_l4_shadow:
4423 (void) shadow_set_l4e(v, ep, shadow_l4e_empty(), smfn); break;
4424 #endif
4425 default: BUG(); /* Called with the wrong kind of shadow. */
4429 int sh_remove_l1_shadow(struct vcpu *v, mfn_t sl2mfn, mfn_t sl1mfn)
4430 /* Remove all mappings of this l1 shadow from this l2 shadow */
4432 shadow_l2e_t *sl2e;
4433 int done = 0;
4434 int flags;
4436 SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, done, v->domain,
4438 flags = shadow_l2e_get_flags(*sl2e);
4439 if ( (flags & _PAGE_PRESENT)
4440 && (mfn_x(shadow_l2e_get_mfn(*sl2e)) == mfn_x(sl1mfn)) )
4442 (void) shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn);
4443 if ( mfn_to_shadow_page(sl1mfn)->type == 0 )
4444 /* This breaks us cleanly out of the FOREACH macro */
4445 done = 1;
4447 });
4448 return done;
4451 #if GUEST_PAGING_LEVELS >= 4
4452 int sh_remove_l2_shadow(struct vcpu *v, mfn_t sl3mfn, mfn_t sl2mfn)
4453 /* Remove all mappings of this l2 shadow from this l3 shadow */
4455 shadow_l3e_t *sl3e;
4456 int done = 0;
4457 int flags;
4459 SHADOW_FOREACH_L3E(sl3mfn, sl3e, 0, done,
4461 flags = shadow_l3e_get_flags(*sl3e);
4462 if ( (flags & _PAGE_PRESENT)
4463 && (mfn_x(shadow_l3e_get_mfn(*sl3e)) == mfn_x(sl2mfn)) )
4465 (void) shadow_set_l3e(v, sl3e, shadow_l3e_empty(), sl3mfn);
4466 if ( mfn_to_shadow_page(sl2mfn)->type == 0 )
4467 /* This breaks us cleanly out of the FOREACH macro */
4468 done = 1;
4470 });
4471 return done;
4474 int sh_remove_l3_shadow(struct vcpu *v, mfn_t sl4mfn, mfn_t sl3mfn)
4475 /* Remove all mappings of this l3 shadow from this l4 shadow */
4477 shadow_l4e_t *sl4e;
4478 int done = 0;
4479 int flags;
4481 SHADOW_FOREACH_L4E(sl4mfn, sl4e, 0, done, v->domain,
4483 flags = shadow_l4e_get_flags(*sl4e);
4484 if ( (flags & _PAGE_PRESENT)
4485 && (mfn_x(shadow_l4e_get_mfn(*sl4e)) == mfn_x(sl3mfn)) )
4487 (void) shadow_set_l4e(v, sl4e, shadow_l4e_empty(), sl4mfn);
4488 if ( mfn_to_shadow_page(sl3mfn)->type == 0 )
4489 /* This breaks us cleanly out of the FOREACH macro */
4490 done = 1;
4492 });
4493 return done;
4495 #endif /* 64bit guest */
4497 /**************************************************************************/
4498 /* Handling HVM guest writes to pagetables */
4500 /* Translate a VA to an MFN, injecting a page-fault if we fail */
4501 #define BAD_GVA_TO_GFN (~0UL)
4502 #define BAD_GFN_TO_MFN (~1UL)
4503 #define READONLY_GFN (~2UL)
4504 static mfn_t emulate_gva_to_mfn(struct vcpu *v,
4505 unsigned long vaddr,
4506 struct sh_emulate_ctxt *sh_ctxt)
4508 unsigned long gfn;
4509 mfn_t mfn;
4510 p2m_type_t p2mt;
4511 uint32_t pfec = PFEC_page_present | PFEC_write_access;
4513 /* Translate the VA to a GFN */
4514 gfn = sh_gva_to_gfn(v, vaddr, &pfec);
4515 if ( gfn == INVALID_GFN )
4517 if ( is_hvm_vcpu(v) )
4518 hvm_inject_exception(TRAP_page_fault, pfec, vaddr);
4519 else
4520 propagate_page_fault(vaddr, pfec);
4521 return _mfn(BAD_GVA_TO_GFN);
4524 /* Translate the GFN to an MFN */
4525 mfn = gfn_to_mfn(v->domain, _gfn(gfn), &p2mt);
4526 if ( p2mt == p2m_ram_ro )
4527 return _mfn(READONLY_GFN);
4528 if ( !p2m_is_ram(p2mt) )
4529 return _mfn(BAD_GFN_TO_MFN);
4531 ASSERT(mfn_valid(mfn));
4532 v->arch.paging.last_write_was_pt = !!sh_mfn_is_a_page_table(mfn);
4533 return mfn;
4536 /* Check that the user is allowed to perform this write.
4537 * Returns a mapped pointer to write to, or NULL for error. */
4538 #define MAPPING_UNHANDLEABLE ((void *)(unsigned long)X86EMUL_UNHANDLEABLE)
4539 #define MAPPING_EXCEPTION ((void *)(unsigned long)X86EMUL_EXCEPTION)
4540 #define MAPPING_SILENT_FAIL ((void *)(unsigned long)X86EMUL_OKAY)
4541 #define emulate_map_dest_failed(rc) ((unsigned long)(rc) <= 3)
4542 static void *emulate_map_dest(struct vcpu *v,
4543 unsigned long vaddr,
4544 u32 bytes,
4545 struct sh_emulate_ctxt *sh_ctxt)
4547 unsigned long offset;
4548 void *map = NULL;
4550 sh_ctxt->mfn1 = emulate_gva_to_mfn(v, vaddr, sh_ctxt);
4551 if ( !mfn_valid(sh_ctxt->mfn1) )
4552 return ((mfn_x(sh_ctxt->mfn1) == BAD_GVA_TO_GFN) ?
4553 MAPPING_EXCEPTION :
4554 (mfn_x(sh_ctxt->mfn1) == READONLY_GFN) ?
4555 MAPPING_SILENT_FAIL : MAPPING_UNHANDLEABLE);
4557 #ifndef NDEBUG
4558 /* We don't emulate user-mode writes to page tables */
4559 if ( hvm_get_seg_reg(x86_seg_ss, sh_ctxt)->attr.fields.dpl == 3 )
4561 gdprintk(XENLOG_DEBUG, "User-mode write to pagetable reached "
4562 "emulate_map_dest(). This should never happen!\n");
4563 return MAPPING_UNHANDLEABLE;
4565 #endif
4567 /* Unaligned writes mean probably this isn't a pagetable */
4568 if ( vaddr & (bytes - 1) )
4569 sh_remove_shadows(v, sh_ctxt->mfn1, 0, 0 /* Slow, can fail */ );
4571 if ( likely(((vaddr + bytes - 1) & PAGE_MASK) == (vaddr & PAGE_MASK)) )
4573 /* Whole write fits on a single page */
4574 sh_ctxt->mfn2 = _mfn(INVALID_MFN);
4575 map = sh_map_domain_page(sh_ctxt->mfn1) + (vaddr & ~PAGE_MASK);
4577 else
4579 /* Cross-page emulated writes are only supported for HVM guests;
4580 * PV guests ought to know better */
4581 if ( !is_hvm_vcpu(v) )
4582 return MAPPING_UNHANDLEABLE;
4584 /* This write crosses a page boundary. Translate the second page */
4585 sh_ctxt->mfn2 = emulate_gva_to_mfn(v, (vaddr + bytes - 1) & PAGE_MASK,
4586 sh_ctxt);
4587 if ( !mfn_valid(sh_ctxt->mfn2) )
4588 return ((mfn_x(sh_ctxt->mfn2) == BAD_GVA_TO_GFN) ?
4589 MAPPING_EXCEPTION :
4590 (mfn_x(sh_ctxt->mfn2) == READONLY_GFN) ?
4591 MAPPING_SILENT_FAIL : MAPPING_UNHANDLEABLE);
4593 /* Cross-page writes mean probably not a pagetable */
4594 sh_remove_shadows(v, sh_ctxt->mfn2, 0, 0 /* Slow, can fail */ );
4596 /* Hack: we map the pages into the vcpu's LDT space, since we
4597 * know that we're not going to need the LDT for HVM guests,
4598 * and only HVM guests are allowed unaligned writes. */
4599 ASSERT(is_hvm_vcpu(v));
4600 map = (void *)LDT_VIRT_START(v);
4601 offset = l1_linear_offset((unsigned long) map);
4602 l1e_write(&__linear_l1_table[offset],
4603 l1e_from_pfn(mfn_x(sh_ctxt->mfn1), __PAGE_HYPERVISOR));
4604 l1e_write(&__linear_l1_table[offset + 1],
4605 l1e_from_pfn(mfn_x(sh_ctxt->mfn2), __PAGE_HYPERVISOR));
4606 flush_tlb_local();
4607 map += (vaddr & ~PAGE_MASK);
4610 #if (SHADOW_OPTIMIZATIONS & SHOPT_SKIP_VERIFY)
4611 /* Remember if the bottom bit was clear, so we can choose not to run
4612 * the change through the verify code if it's still clear afterwards */
4613 sh_ctxt->low_bit_was_clear = map != NULL && !(*(u8 *)map & _PAGE_PRESENT);
4614 #endif
4616 return map;
4619 /* Tidy up after the emulated write: mark pages dirty, verify the new
4620 * contents, and undo the mapping */
4621 static void emulate_unmap_dest(struct vcpu *v,
4622 void *addr,
4623 u32 bytes,
4624 struct sh_emulate_ctxt *sh_ctxt)
4626 u32 b1 = bytes, b2 = 0, shflags;
4628 ASSERT(mfn_valid(sh_ctxt->mfn1));
4630 /* If we are writing lots of PTE-aligned zeros, might want to unshadow */
4631 if ( likely(bytes >= 4)
4632 && (*(u32 *)addr == 0)
4633 && ((unsigned long) addr & ((sizeof (guest_intpte_t)) - 1)) == 0 )
4634 check_for_early_unshadow(v, sh_ctxt->mfn1);
4635 else
4636 reset_early_unshadow(v);
4638 /* We can avoid re-verifying the page contents after the write if:
4639 * - it was no larger than the PTE type of this pagetable;
4640 * - it was aligned to the PTE boundaries; and
4641 * - _PAGE_PRESENT was clear before and after the write. */
4642 shflags = mfn_to_page(sh_ctxt->mfn1)->shadow_flags;
4643 #if (SHADOW_OPTIMIZATIONS & SHOPT_SKIP_VERIFY)
4644 if ( sh_ctxt->low_bit_was_clear
4645 && !(*(u8 *)addr & _PAGE_PRESENT)
4646 && ((!(shflags & SHF_32)
4647 /* Not shadowed 32-bit: aligned 64-bit writes that leave
4648 * the present bit unset are safe to ignore. */
4649 && ((unsigned long)addr & 7) == 0
4650 && bytes <= 8)
4651 ||
4652 (!(shflags & (SHF_PAE|SHF_64))
4653 /* Not shadowed PAE/64-bit: aligned 32-bit writes that
4654 * leave the present bit unset are safe to ignore. */
4655 && ((unsigned long)addr & 3) == 0
4656 && bytes <= 4)) )
4658 /* Writes with this alignment constraint can't possibly cross pages */
4659 ASSERT(!mfn_valid(sh_ctxt->mfn2));
4661 else
4662 #endif /* SHADOW_OPTIMIZATIONS & SHOPT_SKIP_VERIFY */
4664 if ( unlikely(mfn_valid(sh_ctxt->mfn2)) )
4666 /* Validate as two writes, one to each page */
4667 b1 = PAGE_SIZE - (((unsigned long)addr) & ~PAGE_MASK);
4668 b2 = bytes - b1;
4669 ASSERT(b2 < bytes);
4671 if ( likely(b1 > 0) )
4672 sh_validate_guest_pt_write(v, sh_ctxt->mfn1, addr, b1);
4673 if ( unlikely(b2 > 0) )
4674 sh_validate_guest_pt_write(v, sh_ctxt->mfn2, addr + b1, b2);
4677 paging_mark_dirty(v->domain, mfn_x(sh_ctxt->mfn1));
4679 if ( unlikely(mfn_valid(sh_ctxt->mfn2)) )
4681 unsigned long offset;
4682 paging_mark_dirty(v->domain, mfn_x(sh_ctxt->mfn2));
4683 /* Undo the hacky two-frame contiguous map. */
4684 ASSERT(((unsigned long) addr & PAGE_MASK) == LDT_VIRT_START(v));
4685 offset = l1_linear_offset((unsigned long) addr);
4686 l1e_write(&__linear_l1_table[offset], l1e_empty());
4687 l1e_write(&__linear_l1_table[offset + 1], l1e_empty());
4688 flush_tlb_all();
4690 else
4691 sh_unmap_domain_page(addr);
4693 atomic_inc(&v->domain->arch.paging.shadow.gtable_dirty_version);
4696 static int
4697 sh_x86_emulate_write(struct vcpu *v, unsigned long vaddr, void *src,
4698 u32 bytes, struct sh_emulate_ctxt *sh_ctxt)
4700 void *addr;
4702 /* Unaligned writes are only acceptable on HVM */
4703 if ( (vaddr & (bytes - 1)) && !is_hvm_vcpu(v) )
4704 return X86EMUL_UNHANDLEABLE;
4706 addr = emulate_map_dest(v, vaddr, bytes, sh_ctxt);
4707 if ( emulate_map_dest_failed(addr) )
4708 return (long)addr;
4710 shadow_lock(v->domain);
4711 memcpy(addr, src, bytes);
4713 if ( tb_init_done )
4715 #if GUEST_PAGING_LEVELS == 3
4716 if ( vaddr == this_cpu(trace_emulate_initial_va) )
4717 memcpy(&this_cpu(trace_emulate_write_val), src, bytes);
4718 else if ( (vaddr & ~(0x7UL)) == this_cpu(trace_emulate_initial_va) )
4720 TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_EMULATE_FULL_PT);
4721 memcpy(&this_cpu(trace_emulate_write_val),
4722 (void *)(((unsigned long) addr) & ~(0x7UL)), GUEST_PTE_SIZE);
4724 #else
4725 memcpy(&this_cpu(trace_emulate_write_val), src, bytes);
4726 #endif
4729 emulate_unmap_dest(v, addr, bytes, sh_ctxt);
4730 shadow_audit_tables(v);
4731 shadow_unlock(v->domain);
4732 return X86EMUL_OKAY;
4735 static int
4736 sh_x86_emulate_cmpxchg(struct vcpu *v, unsigned long vaddr,
4737 unsigned long old, unsigned long new,
4738 unsigned int bytes, struct sh_emulate_ctxt *sh_ctxt)
4740 void *addr;
4741 unsigned long prev;
4742 int rv = X86EMUL_OKAY;
4744 /* Unaligned writes are only acceptable on HVM */
4745 if ( (vaddr & (bytes - 1)) && !is_hvm_vcpu(v) )
4746 return X86EMUL_UNHANDLEABLE;
4748 addr = emulate_map_dest(v, vaddr, bytes, sh_ctxt);
4749 if ( emulate_map_dest_failed(addr) )
4750 return (long)addr;
4752 shadow_lock(v->domain);
4753 switch ( bytes )
4755 case 1: prev = cmpxchg(((u8 *)addr), old, new); break;
4756 case 2: prev = cmpxchg(((u16 *)addr), old, new); break;
4757 case 4: prev = cmpxchg(((u32 *)addr), old, new); break;
4758 case 8: prev = cmpxchg(((u64 *)addr), old, new); break;
4759 default:
4760 SHADOW_PRINTK("cmpxchg of size %i is not supported\n", bytes);
4761 prev = ~old;
4764 if ( prev != old )
4765 rv = X86EMUL_CMPXCHG_FAILED;
4767 SHADOW_DEBUG(EMULATE, "va %#lx was %#lx expected %#lx"
4768 " wanted %#lx now %#lx bytes %u\n",
4769 vaddr, prev, old, new, *(unsigned long *)addr, bytes);
4771 emulate_unmap_dest(v, addr, bytes, sh_ctxt);
4772 shadow_audit_tables(v);
4773 shadow_unlock(v->domain);
4774 return rv;
4777 #ifdef __i386__
4778 static int
4779 sh_x86_emulate_cmpxchg8b(struct vcpu *v, unsigned long vaddr,
4780 unsigned long old_lo, unsigned long old_hi,
4781 unsigned long new_lo, unsigned long new_hi,
4782 struct sh_emulate_ctxt *sh_ctxt)
4784 void *addr;
4785 u64 old, new, prev;
4786 int rv = X86EMUL_OKAY;
4788 /* Unaligned writes are only acceptable on HVM */
4789 if ( (vaddr & 7) && !is_hvm_vcpu(v) )
4790 return X86EMUL_UNHANDLEABLE;
4792 addr = emulate_map_dest(v, vaddr, 8, sh_ctxt);
4793 if ( emulate_map_dest_failed(addr) )
4794 return (long)addr;
4796 old = (((u64) old_hi) << 32) | (u64) old_lo;
4797 new = (((u64) new_hi) << 32) | (u64) new_lo;
4799 shadow_lock(v->domain);
4800 prev = cmpxchg(((u64 *)addr), old, new);
4802 if ( prev != old )
4803 rv = X86EMUL_CMPXCHG_FAILED;
4805 emulate_unmap_dest(v, addr, 8, sh_ctxt);
4806 shadow_audit_tables(v);
4807 shadow_unlock(v->domain);
4808 return rv;
4810 #endif
4812 /**************************************************************************/
4813 /* Audit tools */
4815 #if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES
4817 #define AUDIT_FAIL(_level, _fmt, _a...) do { \
4818 printk("Shadow %u-on-%u audit failed at level %i, index %i\n" \
4819 "gl" #_level "mfn = %" PRI_mfn \
4820 " sl" #_level "mfn = %" PRI_mfn \
4821 " &gl" #_level "e = %p &sl" #_level "e = %p" \
4822 " gl" #_level "e = %" SH_PRI_gpte \
4823 " sl" #_level "e = %" SH_PRI_pte "\nError: " _fmt "\n", \
4824 GUEST_PAGING_LEVELS, SHADOW_PAGING_LEVELS, \
4825 _level, guest_index(gl ## _level ## e), \
4826 mfn_x(gl ## _level ## mfn), mfn_x(sl ## _level ## mfn), \
4827 gl ## _level ## e, sl ## _level ## e, \
4828 gl ## _level ## e->l ## _level, sl ## _level ## e->l ## _level, \
4829 ##_a); \
4830 BUG(); \
4831 done = 1; \
4832 } while (0)
4834 #define AUDIT_FAIL_MIN(_level, _fmt, _a...) do { \
4835 printk("Shadow %u-on-%u audit failed at level %i\n" \
4836 "gl" #_level "mfn = %" PRI_mfn \
4837 " sl" #_level "mfn = %" PRI_mfn \
4838 " Error: " _fmt "\n", \
4839 GUEST_PAGING_LEVELS, SHADOW_PAGING_LEVELS, \
4840 _level, \
4841 mfn_x(gl ## _level ## mfn), mfn_x(sl ## _level ## mfn), \
4842 ##_a); \
4843 BUG(); \
4844 done = 1; \
4845 } while (0)
4847 static char * sh_audit_flags(struct vcpu *v, int level,
4848 int gflags, int sflags)
4849 /* Common code for auditing flag bits */
4851 if ( (sflags & _PAGE_PRESENT) && !(gflags & _PAGE_PRESENT) )
4852 return "shadow is present but guest is not present";
4853 if ( (sflags & _PAGE_GLOBAL) && !is_hvm_vcpu(v) )
4854 return "global bit set in PV shadow";
4855 if ( level == 2 && (sflags & _PAGE_PSE) )
4856 return "PS bit set in shadow";
4857 #if SHADOW_PAGING_LEVELS == 3
4858 if ( level == 3 ) return NULL; /* All the other bits are blank in PAEl3 */
4859 #endif
4860 if ( (sflags & _PAGE_PRESENT) && !(gflags & _PAGE_ACCESSED) )
4861 return "accessed bit not propagated";
4862 if ( (level == 1 || (level == 2 && (gflags & _PAGE_PSE)))
4863 && ((sflags & _PAGE_RW) && !(gflags & _PAGE_DIRTY)) )
4864 return "dirty bit not propagated";
4865 if ( (sflags & _PAGE_USER) != (gflags & _PAGE_USER) )
4866 return "user/supervisor bit does not match";
4867 if ( (sflags & _PAGE_NX_BIT) != (gflags & _PAGE_NX_BIT) )
4868 return "NX bit does not match";
4869 if ( (sflags & _PAGE_RW) && !(gflags & _PAGE_RW) )
4870 return "shadow grants write access but guest does not";
4871 return NULL;
4874 int sh_audit_l1_table(struct vcpu *v, mfn_t sl1mfn, mfn_t x)
4876 guest_l1e_t *gl1e, *gp;
4877 shadow_l1e_t *sl1e;
4878 mfn_t mfn, gmfn, gl1mfn;
4879 gfn_t gfn;
4880 p2m_type_t p2mt;
4881 char *s;
4882 int done = 0;
4884 /* Follow the backpointer */
4885 gl1mfn = _mfn(mfn_to_shadow_page(sl1mfn)->backpointer);
4887 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
4888 /* Out-of-sync l1 shadows can contain anything: just check the OOS hash */
4889 if ( page_is_out_of_sync(mfn_to_page(gl1mfn)) )
4891 oos_audit_hash_is_present(v->domain, gl1mfn);
4892 return 0;
4894 #endif
4896 gl1e = gp = sh_map_domain_page(gl1mfn);
4897 SHADOW_FOREACH_L1E(sl1mfn, sl1e, &gl1e, done, {
4899 if ( sh_l1e_is_magic(*sl1e) )
4901 #if (SHADOW_OPTIMIZATIONS & SHOPT_FAST_FAULT_PATH)
4902 if ( sh_l1e_is_gnp(*sl1e) )
4904 if ( guest_l1e_get_flags(*gl1e) & _PAGE_PRESENT )
4905 AUDIT_FAIL(1, "shadow is GNP magic but guest is present");
4907 else
4909 ASSERT(sh_l1e_is_mmio(*sl1e));
4910 gfn = sh_l1e_mmio_get_gfn(*sl1e);
4911 if ( gfn_x(gfn) != gfn_x(guest_l1e_get_gfn(*gl1e)) )
4912 AUDIT_FAIL(1, "shadow MMIO gfn is %" SH_PRI_gfn
4913 " but guest gfn is %" SH_PRI_gfn,
4914 gfn_x(gfn),
4915 gfn_x(guest_l1e_get_gfn(*gl1e)));
4917 #endif
4919 else
4921 s = sh_audit_flags(v, 1, guest_l1e_get_flags(*gl1e),
4922 shadow_l1e_get_flags(*sl1e));
4923 if ( s ) AUDIT_FAIL(1, "%s", s);
4925 if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS )
4927 gfn = guest_l1e_get_gfn(*gl1e);
4928 mfn = shadow_l1e_get_mfn(*sl1e);
4929 gmfn = gfn_to_mfn(v->domain, gfn, &p2mt);
4930 if ( mfn_x(gmfn) != mfn_x(mfn) )
4931 AUDIT_FAIL(1, "bad translation: gfn %" SH_PRI_gfn
4932 " --> %" PRI_mfn " != mfn %" PRI_mfn,
4933 gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn));
4936 });
4937 sh_unmap_domain_page(gp);
4938 return done;
4941 int sh_audit_fl1_table(struct vcpu *v, mfn_t sl1mfn, mfn_t x)
4943 guest_l1e_t *gl1e, e;
4944 shadow_l1e_t *sl1e;
4945 mfn_t gl1mfn = _mfn(INVALID_MFN);
4946 int f;
4947 int done = 0;
4949 /* fl1 has no useful backpointer: all we can check are flags */
4950 e = guest_l1e_from_gfn(_gfn(0), 0); gl1e = &e; /* Needed for macro */
4951 SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, done, {
4952 f = shadow_l1e_get_flags(*sl1e);
4953 f &= ~(_PAGE_AVAIL0|_PAGE_AVAIL1|_PAGE_AVAIL2);
4954 if ( !(f == 0
4955 || f == (_PAGE_PRESENT|_PAGE_USER|_PAGE_RW|
4956 _PAGE_ACCESSED|_PAGE_DIRTY)
4957 || f == (_PAGE_PRESENT|_PAGE_USER|_PAGE_ACCESSED|_PAGE_DIRTY)
4958 || sh_l1e_is_magic(*sl1e)) )
4959 AUDIT_FAIL(1, "fl1e has bad flags");
4960 });
4961 return 0;
4964 int sh_audit_l2_table(struct vcpu *v, mfn_t sl2mfn, mfn_t x)
4966 guest_l2e_t *gl2e, *gp;
4967 shadow_l2e_t *sl2e;
4968 mfn_t mfn, gmfn, gl2mfn;
4969 gfn_t gfn;
4970 p2m_type_t p2mt;
4971 char *s;
4972 int done = 0;
4974 /* Follow the backpointer */
4975 gl2mfn = _mfn(mfn_to_shadow_page(sl2mfn)->backpointer);
4977 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
4978 /* Only L1's may be out of sync. */
4979 if ( page_is_out_of_sync(mfn_to_page(gl2mfn)) )
4980 AUDIT_FAIL_MIN(2, "gmfn %lx is out of sync", mfn_x(gl2mfn));
4981 #endif
4983 gl2e = gp = sh_map_domain_page(gl2mfn);
4984 SHADOW_FOREACH_L2E(sl2mfn, sl2e, &gl2e, done, v->domain, {
4986 s = sh_audit_flags(v, 2, guest_l2e_get_flags(*gl2e),
4987 shadow_l2e_get_flags(*sl2e));
4988 if ( s ) AUDIT_FAIL(2, "%s", s);
4990 if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS )
4992 gfn = guest_l2e_get_gfn(*gl2e);
4993 mfn = shadow_l2e_get_mfn(*sl2e);
4994 gmfn = (guest_l2e_get_flags(*gl2e) & _PAGE_PSE)
4995 ? get_fl1_shadow_status(v, gfn)
4996 : get_shadow_status(v, gfn_to_mfn(v->domain, gfn, &p2mt),
4997 SH_type_l1_shadow);
4998 if ( mfn_x(gmfn) != mfn_x(mfn) )
4999 AUDIT_FAIL(2, "bad translation: gfn %" SH_PRI_gfn
5000 " (--> %" PRI_mfn ")"
5001 " --> %" PRI_mfn " != mfn %" PRI_mfn,
5002 gfn_x(gfn),
5003 (guest_l2e_get_flags(*gl2e) & _PAGE_PSE) ? 0
5004 : mfn_x(gfn_to_mfn(v->domain, gfn, &p2mt)),
5005 mfn_x(gmfn), mfn_x(mfn));
5007 });
5008 sh_unmap_domain_page(gp);
5009 return 0;
5012 #if GUEST_PAGING_LEVELS >= 4
5013 int sh_audit_l3_table(struct vcpu *v, mfn_t sl3mfn, mfn_t x)
5015 guest_l3e_t *gl3e, *gp;
5016 shadow_l3e_t *sl3e;
5017 mfn_t mfn, gmfn, gl3mfn;
5018 gfn_t gfn;
5019 p2m_type_t p2mt;
5020 char *s;
5021 int done = 0;
5023 /* Follow the backpointer */
5024 gl3mfn = _mfn(mfn_to_shadow_page(sl3mfn)->backpointer);
5026 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
5027 /* Only L1's may be out of sync. */
5028 if ( page_is_out_of_sync(mfn_to_page(gl3mfn)) )
5029 AUDIT_FAIL_MIN(3, "gmfn %lx is out of sync", mfn_x(gl3mfn));
5030 #endif
5032 gl3e = gp = sh_map_domain_page(gl3mfn);
5033 SHADOW_FOREACH_L3E(sl3mfn, sl3e, &gl3e, done, {
5035 s = sh_audit_flags(v, 3, guest_l3e_get_flags(*gl3e),
5036 shadow_l3e_get_flags(*sl3e));
5037 if ( s ) AUDIT_FAIL(3, "%s", s);
5039 if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS )
5041 gfn = guest_l3e_get_gfn(*gl3e);
5042 mfn = shadow_l3e_get_mfn(*sl3e);
5043 gmfn = get_shadow_status(v, gfn_to_mfn(v->domain, gfn, &p2mt),
5044 ((GUEST_PAGING_LEVELS == 3 ||
5045 is_pv_32on64_vcpu(v))
5046 && !shadow_mode_external(v->domain)
5047 && (guest_index(gl3e) % 4) == 3)
5048 ? SH_type_l2h_shadow
5049 : SH_type_l2_shadow);
5050 if ( mfn_x(gmfn) != mfn_x(mfn) )
5051 AUDIT_FAIL(3, "bad translation: gfn %" SH_PRI_gfn
5052 " --> %" PRI_mfn " != mfn %" PRI_mfn,
5053 gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn));
5055 });
5056 sh_unmap_domain_page(gp);
5057 return 0;
5060 int sh_audit_l4_table(struct vcpu *v, mfn_t sl4mfn, mfn_t x)
5062 guest_l4e_t *gl4e, *gp;
5063 shadow_l4e_t *sl4e;
5064 mfn_t mfn, gmfn, gl4mfn;
5065 gfn_t gfn;
5066 p2m_type_t p2mt;
5067 char *s;
5068 int done = 0;
5070 /* Follow the backpointer */
5071 gl4mfn = _mfn(mfn_to_shadow_page(sl4mfn)->backpointer);
5073 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
5074 /* Only L1's may be out of sync. */
5075 if ( page_is_out_of_sync(mfn_to_page(gl4mfn)) )
5076 AUDIT_FAIL_MIN(4, "gmfn %lx is out of sync", mfn_x(gl4mfn));
5077 #endif
5079 gl4e = gp = sh_map_domain_page(gl4mfn);
5080 SHADOW_FOREACH_L4E(sl4mfn, sl4e, &gl4e, done, v->domain,
5082 s = sh_audit_flags(v, 4, guest_l4e_get_flags(*gl4e),
5083 shadow_l4e_get_flags(*sl4e));
5084 if ( s ) AUDIT_FAIL(4, "%s", s);
5086 if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS )
5088 gfn = guest_l4e_get_gfn(*gl4e);
5089 mfn = shadow_l4e_get_mfn(*sl4e);
5090 gmfn = get_shadow_status(v, gfn_to_mfn(v->domain, gfn, &p2mt),
5091 SH_type_l3_shadow);
5092 if ( mfn_x(gmfn) != mfn_x(mfn) )
5093 AUDIT_FAIL(4, "bad translation: gfn %" SH_PRI_gfn
5094 " --> %" PRI_mfn " != mfn %" PRI_mfn,
5095 gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn));
5097 });
5098 sh_unmap_domain_page(gp);
5099 return 0;
5101 #endif /* GUEST_PAGING_LEVELS >= 4 */
5104 #undef AUDIT_FAIL
5106 #endif /* Audit code */
5108 /**************************************************************************/
5109 /* Entry points into this mode of the shadow code.
5110 * This will all be mangled by the preprocessor to uniquify everything. */
5111 struct paging_mode sh_paging_mode = {
5112 .page_fault = sh_page_fault,
5113 .invlpg = sh_invlpg,
5114 .gva_to_gfn = sh_gva_to_gfn,
5115 .update_cr3 = sh_update_cr3,
5116 .update_paging_modes = shadow_update_paging_modes,
5117 .write_p2m_entry = shadow_write_p2m_entry,
5118 .write_guest_entry = shadow_write_guest_entry,
5119 .cmpxchg_guest_entry = shadow_cmpxchg_guest_entry,
5120 .guest_map_l1e = sh_guest_map_l1e,
5121 .guest_get_eff_l1e = sh_guest_get_eff_l1e,
5122 .guest_levels = GUEST_PAGING_LEVELS,
5123 .shadow.detach_old_tables = sh_detach_old_tables,
5124 .shadow.x86_emulate_write = sh_x86_emulate_write,
5125 .shadow.x86_emulate_cmpxchg = sh_x86_emulate_cmpxchg,
5126 #ifdef __i386__
5127 .shadow.x86_emulate_cmpxchg8b = sh_x86_emulate_cmpxchg8b,
5128 #endif
5129 .shadow.make_monitor_table = sh_make_monitor_table,
5130 .shadow.destroy_monitor_table = sh_destroy_monitor_table,
5131 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
5132 .shadow.guess_wrmap = sh_guess_wrmap,
5133 #endif
5134 .shadow.shadow_levels = SHADOW_PAGING_LEVELS,
5135 };
5137 /*
5138 * Local variables:
5139 * mode: C
5140 * c-set-style: "BSD"
5141 * c-basic-offset: 4
5142 * indent-tabs-mode: nil
5143 * End:
5144 */