ia64/xen-unstable: xen/arch/x86/mm/shadow/multi.c @ 18842:9be2fe3de567

shadow: fix the fix for promotion/resync race.

Signed-off-by: Gianluca Guida <gianluca.guida@eu.citrix.com>

Author:   Keir Fraser <keir.fraser@citrix.com>
Date:     Fri Nov 28 12:02:43 2008 +0000
Parent:   a558165cfead
Child:    b59db1f95d19
1 /******************************************************************************
2 * arch/x86/mm/shadow/multi.c
3 *
4 * Simple, mostly-synchronous shadow page tables.
5 * Parts of this code are Copyright (c) 2006 by XenSource Inc.
6 * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
7 * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 */
24 #include <xen/config.h>
25 #include <xen/types.h>
26 #include <xen/mm.h>
27 #include <xen/trace.h>
28 #include <xen/sched.h>
29 #include <xen/perfc.h>
30 #include <xen/domain_page.h>
31 #include <asm/page.h>
32 #include <asm/current.h>
33 #include <asm/shadow.h>
34 #include <asm/flushtlb.h>
35 #include <asm/hvm/hvm.h>
36 #include <asm/hvm/cacheattr.h>
37 #include <asm/mtrr.h>
38 #include <asm/guest_pt.h>
39 #include "private.h"
40 #include "types.h"
42 /* THINGS TO DO LATER:
43 *
44 * TEARDOWN HEURISTICS
45 * Also: have a heuristic for when to destroy a previous paging-mode's
46 * shadows. When a guest is done with its start-of-day 32-bit tables
47 * and reuses the memory we want to drop those shadows. Start with
48 * shadows in a page in two modes as a hint, but beware of clever tricks
49 * like reusing a pagetable for both PAE and 64-bit during boot...
50 *
51 * PAE LINEAR MAPS
52 * Rework shadow_get_l*e() to have the option of using map_domain_page()
53 * instead of linear maps. Add appropriate unmap_l*e calls in the users.
54 * Then we can test the speed difference made by linear maps. If the
55 * map_domain_page() version is OK on PAE, we could maybe allow a lightweight
56 * l3-and-l2h-only shadow mode for PAE PV guests that would allow them
57 * to share l2h pages again.
58 *
59 * PSE disabled / PSE36
60 * We don't support any modes other than PSE enabled, PSE36 disabled.
61 * Neither of those would be hard to change, but we'd need to be able to
62 * deal with shadows made in one mode and used in another.
63 */
65 #define FETCH_TYPE_PREFETCH 1
66 #define FETCH_TYPE_DEMAND 2
67 #define FETCH_TYPE_WRITE 4
68 typedef enum {
69 ft_prefetch = FETCH_TYPE_PREFETCH,
70 ft_demand_read = FETCH_TYPE_DEMAND,
71 ft_demand_write = FETCH_TYPE_DEMAND | FETCH_TYPE_WRITE,
72 } fetch_type_t;
74 #ifdef DEBUG_TRACE_DUMP
75 static char *fetch_type_names[] = {
76 [ft_prefetch] "prefetch",
77 [ft_demand_read] "demand read",
78 [ft_demand_write] "demand write",
79 };
80 #endif
82 /**************************************************************************/
83 /* Hash table mapping from guest pagetables to shadows
84 *
85 * Normal case: maps the mfn of a guest page to the mfn of its shadow page.
86 * FL1's: maps the *gfn* of the start of a superpage to the mfn of a
87 * shadow L1 which maps its "splinters".
88 */
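/* For illustration (a sketch of the pairing, not a complete call chain):
 * a shadow created by sh_make_shadow() below is entered into this table
 * with
 *     set_shadow_status(v, gmfn, shadow_type, smfn);
 * and later callers such as shadow_get_and_create_l3e() retrieve it with
 *     smfn = get_shadow_status(v, gmfn, shadow_type);
 * where a lookup that fails mfn_valid() means "no such shadow yet".
 * FL1 shadows use the gfn-keyed set/get/delete_fl1_shadow_status()
 * variants instead, since they shadow a superpage rather than a
 * pagetable page. */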
90 static inline mfn_t
91 get_fl1_shadow_status(struct vcpu *v, gfn_t gfn)
92 /* Look for FL1 shadows in the hash table */
93 {
94 mfn_t smfn = shadow_hash_lookup(v, gfn_x(gfn), SH_type_fl1_shadow);
95 return smfn;
96 }
98 static inline mfn_t
99 get_shadow_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type)
100 /* Look for shadows in the hash table */
101 {
102 mfn_t smfn = shadow_hash_lookup(v, mfn_x(gmfn), shadow_type);
103 perfc_incr(shadow_get_shadow_status);
104 return smfn;
105 }
107 static inline void
108 set_fl1_shadow_status(struct vcpu *v, gfn_t gfn, mfn_t smfn)
109 /* Put an FL1 shadow into the hash table */
110 {
111 SHADOW_PRINTK("gfn=%"SH_PRI_gfn", type=%08x, smfn=%05lx\n",
112 gfn_x(gfn), SH_type_fl1_shadow, mfn_x(smfn));
114 shadow_hash_insert(v, gfn_x(gfn), SH_type_fl1_shadow, smfn);
115 }
117 static inline void
118 set_shadow_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type, mfn_t smfn)
119 /* Put a shadow into the hash table */
120 {
121 struct domain *d = v->domain;
122 int res;
124 SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx, type=%08x, smfn=%05lx\n",
125 d->domain_id, v->vcpu_id, mfn_x(gmfn),
126 shadow_type, mfn_x(smfn));
128 /* 32-on-64 PV guests don't own their l4 pages so can't get_page them */
129 if ( !is_pv_32on64_vcpu(v) || shadow_type != SH_type_l4_64_shadow )
130 {
131 res = get_page(mfn_to_page(gmfn), d);
132 ASSERT(res == 1);
133 }
135 shadow_hash_insert(v, mfn_x(gmfn), shadow_type, smfn);
136 }
138 static inline void
139 delete_fl1_shadow_status(struct vcpu *v, gfn_t gfn, mfn_t smfn)
140 /* Remove a shadow from the hash table */
141 {
142 SHADOW_PRINTK("gfn=%"SH_PRI_gfn", type=%08x, smfn=%05lx\n",
143 gfn_x(gfn), SH_type_fl1_shadow, mfn_x(smfn));
144 shadow_hash_delete(v, gfn_x(gfn), SH_type_fl1_shadow, smfn);
145 }
147 static inline void
148 delete_shadow_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type, mfn_t smfn)
149 /* Remove a shadow from the hash table */
150 {
151 SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx, type=%08x, smfn=%05lx\n",
152 v->domain->domain_id, v->vcpu_id,
153 mfn_x(gmfn), shadow_type, mfn_x(smfn));
154 shadow_hash_delete(v, mfn_x(gmfn), shadow_type, smfn);
155 /* 32-on-64 PV guests don't own their l4 pages; see set_shadow_status */
156 if ( !is_pv_32on64_vcpu(v) || shadow_type != SH_type_l4_64_shadow )
157 put_page(mfn_to_page(gmfn));
158 }
161 /**************************************************************************/
162 /* Functions for walking the guest page tables */
164 static inline uint32_t
165 sh_walk_guest_tables(struct vcpu *v, unsigned long va, walk_t *gw,
166 uint32_t pfec)
167 {
168 return guest_walk_tables(v, va, gw, pfec,
169 #if GUEST_PAGING_LEVELS == 3 /* PAE */
170 _mfn(INVALID_MFN),
171 v->arch.paging.shadow.gl3e
172 #else /* 32 or 64 */
173 pagetable_get_mfn(v->arch.guest_table),
174 v->arch.paging.shadow.guest_vtable
175 #endif
176 );
177 }
179 /* This validation is called with the shadow lock held, and after write
180 * permission has been removed. The check is therefore atomic: no further
181 * inconsistent content can be observed before the lock is released.
182 *
183 * Returns 1 to indicate success and 0 for inconsistency.
184 */
185 static inline uint32_t
186 shadow_check_gwalk(struct vcpu *v, unsigned long va, walk_t *gw, int version)
187 {
188 struct domain *d = v->domain;
189 guest_l1e_t *l1p;
190 guest_l2e_t *l2p;
191 #if GUEST_PAGING_LEVELS >= 4
192 guest_l3e_t *l3p;
193 guest_l4e_t *l4p;
194 #endif
195 int mismatch = 0;
197 ASSERT(shadow_locked_by_me(d));
199 if ( version == atomic_read(&d->arch.paging.shadow.gtable_dirty_version) )
200 return 1;
202 /* We could cache the guest page mappings from the last
203 * guest table walk. However, this check happens relatively
204 * infrequently, so the small cost of remapping the guest
205 * pages here is preferable to caching the mappings on every
206 * guest table walk.
207 *
208 * Also, when an inconsistency is found, simply return and
209 * trigger another fault rather than re-validating the new
210 * path; this keeps the logic simple.
211 */
212 perfc_incr(shadow_check_gwalk);
213 #if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
214 #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
215 l4p = (guest_l4e_t *)v->arch.paging.shadow.guest_vtable;
216 mismatch |= (gw->l4e.l4 != l4p[guest_l4_table_offset(va)].l4);
217 l3p = sh_map_domain_page(gw->l3mfn);
218 mismatch |= (gw->l3e.l3 != l3p[guest_l3_table_offset(va)].l3);
219 sh_unmap_domain_page(l3p);
220 #else
221 mismatch |= (gw->l3e.l3 !=
222 v->arch.paging.shadow.gl3e[guest_l3_table_offset(va)].l3);
223 #endif
224 l2p = sh_map_domain_page(gw->l2mfn);
225 mismatch |= (gw->l2e.l2 != l2p[guest_l2_table_offset(va)].l2);
226 sh_unmap_domain_page(l2p);
227 #else
228 l2p = (guest_l2e_t *)v->arch.paging.shadow.guest_vtable;
229 mismatch |= (gw->l2e.l2 != l2p[guest_l2_table_offset(va)].l2);
230 #endif
231 if ( !(guest_supports_superpages(v) &&
232 (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE)) )
233 {
234 l1p = sh_map_domain_page(gw->l1mfn);
235 mismatch |= (gw->l1e.l1 != l1p[guest_l1_table_offset(va)].l1);
236 sh_unmap_domain_page(l1p);
237 }
239 return !mismatch;
240 }
242 /* Remove write access permissions from a gwalk_t in a batch, and
243 * return OR-ed result for TLB flush hint and need to rewalk the guest
244 * pages.
245 *
246 * Syncing pages will remove write access to that page; but it may
247 * also give write access to other pages in the path. If we resync any
248 * pages, re-walk from the beginning.
249 */
250 #define GW_RMWR_FLUSHTLB 1
251 #define GW_RMWR_REWALK 2
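/* A minimal sketch of how a caller consumes these flags (details of the
 * real fault handler, such as perf counters, are omitted, and the
 * "rewalk" label is purely illustrative):
 *
 *     uint32_t rc = gw_remove_write_accesses(v, va, &gw);
 *     if ( rc & GW_RMWR_FLUSHTLB )
 *         flush_tlb_mask(v->domain->domain_dirty_cpumask);
 *     if ( rc & GW_RMWR_REWALK )
 *         // a resync may have changed the guest path: redo the guest
 *         // walk before propagating anything into the shadows
 *         goto rewalk;
 */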
253 static inline uint32_t
254 gw_remove_write_accesses(struct vcpu *v, unsigned long va, walk_t *gw)
255 {
256 uint32_t rc = 0;
258 #if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
259 #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
260 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
261 if ( mfn_is_out_of_sync(gw->l3mfn) )
262 {
263 sh_resync(v, gw->l3mfn);
264 rc = GW_RMWR_REWALK;
265 }
266 else
267 #endif /* OOS */
268 if ( sh_remove_write_access(v, gw->l3mfn, 3, va) )
269 rc = GW_RMWR_FLUSHTLB;
270 #endif /* GUEST_PAGING_LEVELS >= 4 */
272 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
273 if ( mfn_is_out_of_sync(gw->l2mfn) )
274 {
275 sh_resync(v, gw->l2mfn);
276 rc |= GW_RMWR_REWALK;
277 }
278 else
279 #endif /* OOS */
280 if ( sh_remove_write_access(v, gw->l2mfn, 2, va) )
281 rc |= GW_RMWR_FLUSHTLB;
282 #endif /* GUEST_PAGING_LEVELS >= 3 */
284 if ( !(guest_supports_superpages(v) &&
285 (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE))
286 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
287 && !mfn_is_out_of_sync(gw->l1mfn)
288 #endif /* OOS */
289 && sh_remove_write_access(v, gw->l1mfn, 1, va) )
290 rc |= GW_RMWR_FLUSHTLB;
292 return rc;
293 }
295 #if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES
296 /* Lightweight audit: pass all the shadows associated with this guest walk
297 * through the audit mechanisms */
298 static void sh_audit_gw(struct vcpu *v, walk_t *gw)
299 {
300 mfn_t smfn;
302 if ( !(SHADOW_AUDIT_ENABLE) )
303 return;
305 #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
306 if ( mfn_valid(gw->l4mfn)
307 && mfn_valid((smfn = get_shadow_status(v, gw->l4mfn,
308 SH_type_l4_shadow))) )
309 (void) sh_audit_l4_table(v, smfn, _mfn(INVALID_MFN));
310 if ( mfn_valid(gw->l3mfn)
311 && mfn_valid((smfn = get_shadow_status(v, gw->l3mfn,
312 SH_type_l3_shadow))) )
313 (void) sh_audit_l3_table(v, smfn, _mfn(INVALID_MFN));
314 #endif /* PAE or 64... */
315 if ( mfn_valid(gw->l2mfn) )
316 {
317 if ( mfn_valid((smfn = get_shadow_status(v, gw->l2mfn,
318 SH_type_l2_shadow))) )
319 (void) sh_audit_l2_table(v, smfn, _mfn(INVALID_MFN));
320 #if GUEST_PAGING_LEVELS == 3
321 if ( mfn_valid((smfn = get_shadow_status(v, gw->l2mfn,
322 SH_type_l2h_shadow))) )
323 (void) sh_audit_l2_table(v, smfn, _mfn(INVALID_MFN));
324 #endif
325 }
326 if ( mfn_valid(gw->l1mfn)
327 && mfn_valid((smfn = get_shadow_status(v, gw->l1mfn,
328 SH_type_l1_shadow))) )
329 (void) sh_audit_l1_table(v, smfn, _mfn(INVALID_MFN));
330 else if ( (guest_l2e_get_flags(gw->l2e) & _PAGE_PRESENT)
331 && (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE)
332 && mfn_valid(
333 (smfn = get_fl1_shadow_status(v, guest_l2e_get_gfn(gw->l2e)))) )
334 (void) sh_audit_fl1_table(v, smfn, _mfn(INVALID_MFN));
335 }
337 #else
338 #define sh_audit_gw(_v, _gw) do {} while(0)
339 #endif /* audit code */
342 #if (CONFIG_PAGING_LEVELS == GUEST_PAGING_LEVELS)
343 void *
344 sh_guest_map_l1e(struct vcpu *v, unsigned long addr,
345 unsigned long *gl1mfn)
346 {
347 void *pl1e = NULL;
348 walk_t gw;
350 ASSERT(shadow_mode_translate(v->domain));
352 // XXX -- this is expensive, but it's easy to cobble together...
353 // FIXME!
355 if ( sh_walk_guest_tables(v, addr, &gw, PFEC_page_present) == 0
356 && mfn_valid(gw.l1mfn) )
357 {
358 if ( gl1mfn )
359 *gl1mfn = mfn_x(gw.l1mfn);
360 pl1e = map_domain_page(mfn_x(gw.l1mfn)) +
361 (guest_l1_table_offset(addr) * sizeof(guest_l1e_t));
362 }
364 return pl1e;
365 }
367 void
368 sh_guest_get_eff_l1e(struct vcpu *v, unsigned long addr, void *eff_l1e)
369 {
370 walk_t gw;
372 ASSERT(shadow_mode_translate(v->domain));
374 // XXX -- this is expensive, but it's easy to cobble together...
375 // FIXME!
377 (void) sh_walk_guest_tables(v, addr, &gw, PFEC_page_present);
378 *(guest_l1e_t *)eff_l1e = gw.l1e;
379 }
380 #endif /* CONFIG == GUEST (== SHADOW) */
382 /**************************************************************************/
383 /* Functions to compute the correct index into a shadow page, given an
384 * index into the guest page (as returned by guest_get_index()).
385 * This is trivial when the shadow and guest use the same sized PTEs, but
386 * gets more interesting when those sizes are mismatched (e.g. 32-bit guest,
387 * PAE- or 64-bit shadows).
388 *
389 * These functions also increment the shadow mfn, when necessary. When PTE
390 * sizes are mismatched, it takes 2 shadow L1 pages for a single guest L1
391 * page. In this case, we allocate 2 contiguous pages for the shadow L1, and
392 * use simple pointer arithmetic on a pointer to the guest L1e to figure out
393 * which shadow page we really want. Similarly, when PTE sizes are
394 * mismatched, we shadow a guest L2 page with 4 shadow L2 pages. (The easiest
395 * way to see this is: a 32-bit guest L2 page maps 4GB of virtual address
396 * space, while a PAE- or 64-bit shadow L2 page maps 1GB of virtual address
397 * space.)
398 *
399 * For PAE guests, for every 32-bytes of guest L3 page table, we use 64-bytes
400 * of shadow (to store both the shadow, and the info that would normally be
401 * stored in page_info fields). This arrangement allows the shadow and the
402 * "page_info" fields to always be stored in the same page (in fact, in
403 * the same cache line), avoiding an extra call to map_domain_page().
404 */
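/* Worked example of the mismatched-size case (32-bit guest, so PAE or
 * 64-bit shadows with 512-entry shadow pagetable pages; numbers chosen
 * purely for illustration):
 *
 *     shadow_l1_index(&smfn, 1023) steps smfn forward by 1023/512 = 1
 *         page and returns 1023 % 512 = 511;
 *     shadow_l2_index(&smfn, 600)  steps smfn forward by 600/256 = 2
 *         pages and returns (600 % 256) * 2 = 176, the first of the
 *         pair of shadow l2es that back guest l2e number 600.
 */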
406 static inline u32
407 guest_index(void *ptr)
408 {
409 return (u32)((unsigned long)ptr & ~PAGE_MASK) / sizeof(guest_l1e_t);
410 }
412 static u32
413 shadow_l1_index(mfn_t *smfn, u32 guest_index)
414 {
415 #if (GUEST_PAGING_LEVELS == 2)
416 *smfn = _mfn(mfn_x(*smfn) +
417 (guest_index / SHADOW_L1_PAGETABLE_ENTRIES));
418 return (guest_index % SHADOW_L1_PAGETABLE_ENTRIES);
419 #else
420 return guest_index;
421 #endif
422 }
424 static u32
425 shadow_l2_index(mfn_t *smfn, u32 guest_index)
426 {
427 #if (GUEST_PAGING_LEVELS == 2)
428 // Because we use 2 shadow l2 entries for each guest entry, the number of
429 // guest entries per shadow page is SHADOW_L2_PAGETABLE_ENTRIES/2
430 //
431 *smfn = _mfn(mfn_x(*smfn) +
432 (guest_index / (SHADOW_L2_PAGETABLE_ENTRIES / 2)));
434 // We multiply by two to get the index of the first of the two entries
435 // used to shadow the specified guest entry.
436 return (guest_index % (SHADOW_L2_PAGETABLE_ENTRIES / 2)) * 2;
437 #else
438 return guest_index;
439 #endif
440 }
442 #if GUEST_PAGING_LEVELS >= 4
444 static u32
445 shadow_l3_index(mfn_t *smfn, u32 guest_index)
446 {
447 return guest_index;
448 }
450 static u32
451 shadow_l4_index(mfn_t *smfn, u32 guest_index)
452 {
453 return guest_index;
454 }
456 #endif // GUEST_PAGING_LEVELS >= 4
459 /**************************************************************************/
460 /* Function which computes shadow entries from their corresponding guest
461 * entries. This is the "heart" of the shadow code. It operates using
462 * level-1 shadow types, but handles all levels of entry.
463 * Don't call it directly, but use the four wrappers below.
464 */
466 static always_inline void
467 _sh_propagate(struct vcpu *v,
468 guest_intpte_t guest_intpte,
469 mfn_t target_mfn,
470 void *shadow_entry_ptr,
471 int level,
472 fetch_type_t ft,
473 p2m_type_t p2mt)
474 {
475 guest_l1e_t guest_entry = { guest_intpte };
476 shadow_l1e_t *sp = shadow_entry_ptr;
477 struct domain *d = v->domain;
478 gfn_t target_gfn = guest_l1e_get_gfn(guest_entry);
479 u32 pass_thru_flags;
480 u32 gflags, sflags;
482 /* We don't shadow PAE l3s */
483 ASSERT(GUEST_PAGING_LEVELS > 3 || level != 3);
485 /* Check there's something for the shadows to map to */
486 if ( !p2m_is_valid(p2mt) )
487 {
488 *sp = shadow_l1e_empty();
489 goto done;
490 }
492 gflags = guest_l1e_get_flags(guest_entry);
494 if ( unlikely(!(gflags & _PAGE_PRESENT)) )
495 {
496 /* If a guest l1 entry is not present, shadow with the magic
497 * guest-not-present entry. */
498 if ( level == 1 )
499 *sp = sh_l1e_gnp();
500 else
501 *sp = shadow_l1e_empty();
502 goto done;
503 }
505 if ( level == 1 && p2mt == p2m_mmio_dm )
506 {
507 /* Guest l1e maps emulated MMIO space */
508 *sp = sh_l1e_mmio(target_gfn, gflags);
509 if ( !d->arch.paging.shadow.has_fast_mmio_entries )
510 d->arch.paging.shadow.has_fast_mmio_entries = 1;
511 goto done;
512 }
514 // Must have a valid target_mfn unless this is a prefetch or an l1
515 // pointing at MMIO space. In the case of a prefetch, an invalid
516 // mfn means that we can not usefully shadow anything, and so we
517 // return early.
518 //
519 if ( !mfn_valid(target_mfn)
520 && !(level == 1 && (!shadow_mode_refcounts(d)
521 || p2mt == p2m_mmio_direct)) )
522 {
523 ASSERT((ft == ft_prefetch));
524 *sp = shadow_l1e_empty();
525 goto done;
526 }
528 // Propagate bits from the guest to the shadow.
529 // Some of these may be overwritten, below.
530 // Since we know the guest's PRESENT bit is set, we also set the shadow's
531 // SHADOW_PRESENT bit.
532 //
533 pass_thru_flags = (_PAGE_ACCESSED | _PAGE_USER |
534 _PAGE_RW | _PAGE_PRESENT);
535 if ( guest_supports_nx(v) )
536 pass_thru_flags |= _PAGE_NX_BIT;
537 if ( !shadow_mode_refcounts(d) && !mfn_valid(target_mfn) )
538 pass_thru_flags |= _PAGE_PAT | _PAGE_PCD | _PAGE_PWT;
539 sflags = gflags & pass_thru_flags;
541 /*
542 * For HVM domains with direct access to MMIO areas, set the correct
543 * caching attributes in the shadows to match what was asked for.
544 */
545 if ( (level == 1) && is_hvm_domain(d) && has_arch_pdevs(d) &&
546 !is_xen_heap_mfn(mfn_x(target_mfn)) )
547 {
548 unsigned int type;
549 if ( hvm_get_mem_pinned_cacheattr(d, gfn_x(target_gfn), &type) )
550 sflags |= pat_type_2_pte_flags(type);
551 else if ( d->arch.hvm_domain.is_in_uc_mode )
552 sflags |= pat_type_2_pte_flags(PAT_TYPE_UNCACHABLE);
553 else
554 sflags |= get_pat_flags(v,
555 gflags,
556 gfn_to_paddr(target_gfn),
557 ((paddr_t)mfn_x(target_mfn)) << PAGE_SHIFT);
558 }
560 // Set the A&D bits for higher level shadows.
561 // Higher level entries do not, strictly speaking, have dirty bits, but
562 // since we use shadow linear tables, each of these entries may, at some
563 // point in time, also serve as a shadow L1 entry.
564 // By setting both the A&D bits in each of these, we eliminate the burden
565 // on the hardware to update these bits on initial accesses.
566 //
567 if ( (level > 1) && !((SHADOW_PAGING_LEVELS == 3) && (level == 3)) )
568 sflags |= _PAGE_ACCESSED | _PAGE_DIRTY;
570 // If the A or D bit has not yet been set in the guest, then we must
571 // prevent the corresponding kind of access.
572 //
573 if ( unlikely(!(gflags & _PAGE_ACCESSED)) )
574 sflags &= ~_PAGE_PRESENT;
576 /* D bits exist in L1es and PSE L2es */
577 if ( unlikely(((level == 1) ||
578 ((level == 2) &&
579 (gflags & _PAGE_PSE) &&
580 guest_supports_superpages(v)))
581 && !(gflags & _PAGE_DIRTY)) )
582 sflags &= ~_PAGE_RW;
584 // shadow_mode_log_dirty support
585 //
586 // Only allow the guest write access to a page a) on a demand fault,
587 // or b) if the page is already marked as dirty.
588 //
589 // (We handle log-dirty entirely inside the shadow code, without using the
590 // p2m_ram_logdirty p2m type: only HAP uses that.)
591 if ( unlikely((level == 1) && shadow_mode_log_dirty(d)) )
592 {
593 if ( mfn_valid(target_mfn) ) {
594 if ( ft & FETCH_TYPE_WRITE )
595 paging_mark_dirty(d, mfn_x(target_mfn));
596 else if ( !sh_mfn_is_dirty(d, target_mfn) )
597 sflags &= ~_PAGE_RW;
598 }
599 }
601 if ( unlikely((level == 1) && d->dirty_vram
602 && d->dirty_vram->last_dirty == -1
603 && gfn_x(target_gfn) >= d->dirty_vram->begin_pfn
604 && gfn_x(target_gfn) < d->dirty_vram->end_pfn) )
605 {
606 if ( ft & FETCH_TYPE_WRITE )
607 d->dirty_vram->last_dirty = NOW();
608 else
609 sflags &= ~_PAGE_RW;
610 }
612 /* Read-only memory */
613 if ( p2mt == p2m_ram_ro )
614 sflags &= ~_PAGE_RW;
616 // protect guest page tables
617 //
618 if ( unlikely((level == 1)
619 && sh_mfn_is_a_page_table(target_mfn)
620 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC )
621 /* Unless the page is out of sync and the guest is
622 writing to it. */
623 && !(mfn_oos_may_write(target_mfn)
624 && (ft == ft_demand_write))
625 #endif /* OOS */
626 ) )
627 {
628 if ( shadow_mode_trap_reads(d) )
629 {
630 // if we are trapping both reads & writes, then mark this page
631 // as not present...
632 //
633 sflags &= ~_PAGE_PRESENT;
634 }
635 else
636 {
637 // otherwise, just prevent any writes...
638 //
639 sflags &= ~_PAGE_RW;
640 }
641 }
643 // PV guests in 64-bit mode use two different page tables for user vs
644 // supervisor permissions, making the guest's _PAGE_USER bit irrelevant.
645 // It is always shadowed as present...
646 if ( (GUEST_PAGING_LEVELS == 4) && !is_pv_32on64_domain(d)
647 && !is_hvm_domain(d) )
648 {
649 sflags |= _PAGE_USER;
650 }
652 *sp = shadow_l1e_from_mfn(target_mfn, sflags);
654 done:
655 SHADOW_DEBUG(PROPAGATE,
656 "%s level %u guest %" SH_PRI_gpte " shadow %" SH_PRI_pte "\n",
657 fetch_type_names[ft], level, guest_entry.l1, sp->l1);
658 }
661 /* These four wrappers give us a little bit of type-safety back around
662 * the use of void-* pointers and intpte types in _sh_propagate(), and
663 * allow the compiler to optimize out some level checks. */
665 #if GUEST_PAGING_LEVELS >= 4
666 static void
667 l4e_propagate_from_guest(struct vcpu *v,
668 guest_l4e_t gl4e,
669 mfn_t sl3mfn,
670 shadow_l4e_t *sl4e,
671 fetch_type_t ft)
672 {
673 _sh_propagate(v, gl4e.l4, sl3mfn, sl4e, 4, ft, p2m_ram_rw);
674 }
676 static void
677 l3e_propagate_from_guest(struct vcpu *v,
678 guest_l3e_t gl3e,
679 mfn_t sl2mfn,
680 shadow_l3e_t *sl3e,
681 fetch_type_t ft)
682 {
683 _sh_propagate(v, gl3e.l3, sl2mfn, sl3e, 3, ft, p2m_ram_rw);
684 }
685 #endif // GUEST_PAGING_LEVELS >= 4
687 static void
688 l2e_propagate_from_guest(struct vcpu *v,
689 guest_l2e_t gl2e,
690 mfn_t sl1mfn,
691 shadow_l2e_t *sl2e,
692 fetch_type_t ft)
693 {
694 _sh_propagate(v, gl2e.l2, sl1mfn, sl2e, 2, ft, p2m_ram_rw);
695 }
697 static void
698 l1e_propagate_from_guest(struct vcpu *v,
699 guest_l1e_t gl1e,
700 mfn_t gmfn,
701 shadow_l1e_t *sl1e,
702 fetch_type_t ft,
703 p2m_type_t p2mt)
704 {
705 _sh_propagate(v, gl1e.l1, gmfn, sl1e, 1, ft, p2mt);
706 }
709 /**************************************************************************/
710 /* These functions update shadow entries (and do bookkeeping on the shadow
711 * tables they are in). It is intended that they are the only
712 * functions which ever write (non-zero) data onto a shadow page.
713 */
715 static inline void safe_write_entry(void *dst, void *src)
716 /* Copy one PTE safely when processors might be running on the
717 * destination pagetable. This does *not* give safety against
718 * concurrent writes (that's what the shadow lock is for), just
719 * stops the hardware picking up partially written entries. */
720 {
721 volatile unsigned long *d = dst;
722 unsigned long *s = src;
723 ASSERT(!((unsigned long) d & (sizeof (shadow_l1e_t) - 1)));
724 #if CONFIG_PAGING_LEVELS == 3
725 /* In PAE mode, pagetable entries are larger
726 * than machine words, so won't get written atomically. We need to make
727 * sure any other cpu running on these shadows doesn't see a
728 * half-written entry. Do this by marking the entry not-present first,
729 * then writing the high word before the low word. */
730 BUILD_BUG_ON(sizeof (shadow_l1e_t) != 2 * sizeof (unsigned long));
731 d[0] = 0;
732 d[1] = s[1];
733 d[0] = s[0];
734 #else
735 /* In 64-bit, sizeof(pte) == sizeof(ulong) == 1 word,
736 * which will be an atomic write, since the entry is aligned. */
737 BUILD_BUG_ON(sizeof (shadow_l1e_t) != sizeof (unsigned long));
738 *d = *s;
739 #endif
740 }
743 static inline void
744 shadow_write_entries(void *d, void *s, int entries, mfn_t mfn)
745 /* This function does the actual writes to shadow pages.
746 * It must not be called directly, since it doesn't do the bookkeeping
747 * that shadow_set_l*e() functions do. */
748 {
749 shadow_l1e_t *dst = d;
750 shadow_l1e_t *src = s;
751 void *map = NULL;
752 int i;
754 /* Because we mirror access rights at all levels in the shadow, an
755 * l2 (or higher) entry with the RW bit cleared will leave us with
756 * no write access through the linear map.
757 * We detect that by writing to the shadow with copy_to_user() and
758 * using map_domain_page() to get a writeable mapping if we need to. */
759 if ( __copy_to_user(d, d, sizeof (unsigned long)) != 0 )
760 {
761 perfc_incr(shadow_linear_map_failed);
762 map = sh_map_domain_page(mfn);
763 ASSERT(map != NULL);
764 dst = map + ((unsigned long)dst & (PAGE_SIZE - 1));
765 }
768 for ( i = 0; i < entries; i++ )
769 safe_write_entry(dst++, src++);
771 if ( map != NULL ) sh_unmap_domain_page(map);
772 }
774 static inline int
775 perms_strictly_increased(u32 old_flags, u32 new_flags)
776 /* Given the flags of two entries, are the new flags a strict
777 * increase in rights over the old ones? */
778 {
779 u32 of = old_flags & (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX);
780 u32 nf = new_flags & (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX);
781 /* Flip the NX bit, since it's the only one that decreases rights;
782 * we calculate as if it were an "X" bit. */
783 of ^= _PAGE_NX_BIT;
784 nf ^= _PAGE_NX_BIT;
785 /* If the changed bits are all set in the new flags, then rights strictly
786 * increased between old and new. */
787 return ((of | (of ^ nf)) == nf);
788 }
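/* Worked example (flag combinations chosen for illustration): going from
 * (_PAGE_PRESENT|_PAGE_RW) to (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER) only
 * adds rights: the single changed bit (_PAGE_USER) is set in the new
 * flags, so (of | (of ^ nf)) == nf and the function returns 1.
 * Going from (_PAGE_PRESENT|_PAGE_RW) to (_PAGE_PRESENT|_PAGE_USER)
 * drops _PAGE_RW: that bit is set on the left-hand side but clear in nf,
 * so the test fails and the shadow_set_l*e() callers below will ask for
 * a TLB flush. */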
790 static int inline
791 shadow_get_page_from_l1e(shadow_l1e_t sl1e, struct domain *d)
792 {
793 int res;
794 mfn_t mfn;
795 struct domain *owner;
797 ASSERT(!sh_l1e_is_magic(sl1e));
799 if ( !shadow_mode_refcounts(d) )
800 return 1;
802 res = get_page_from_l1e(sl1e, d);
804 // If a privileged domain is attempting to install a map of a page it does
805 // not own, we let it succeed anyway.
806 //
807 if ( unlikely(!res) &&
808 !shadow_mode_translate(d) &&
809 mfn_valid(mfn = shadow_l1e_get_mfn(sl1e)) &&
810 (owner = page_get_owner(mfn_to_page(mfn))) &&
811 (d != owner) &&
812 IS_PRIV_FOR(d, owner))
813 {
814 res = get_page_from_l1e(sl1e, owner);
815 SHADOW_PRINTK("privileged domain %d installs map of mfn %05lx "
816 "which is owned by domain %d: %s\n",
817 d->domain_id, mfn_x(mfn), owner->domain_id,
818 res ? "success" : "failed");
819 }
821 if ( unlikely(!res) )
822 {
823 perfc_incr(shadow_get_page_fail);
824 SHADOW_PRINTK("failed: l1e=" SH_PRI_pte "\n");
825 }
827 return res;
828 }
830 static void inline
831 shadow_put_page_from_l1e(shadow_l1e_t sl1e, struct domain *d)
832 {
833 if ( !shadow_mode_refcounts(d) )
834 return;
836 put_page_from_l1e(sl1e, d);
837 }
839 #if GUEST_PAGING_LEVELS >= 4
840 static int shadow_set_l4e(struct vcpu *v,
841 shadow_l4e_t *sl4e,
842 shadow_l4e_t new_sl4e,
843 mfn_t sl4mfn)
844 {
845 int flags = 0, ok;
846 shadow_l4e_t old_sl4e;
847 paddr_t paddr;
848 ASSERT(sl4e != NULL);
849 old_sl4e = *sl4e;
851 if ( old_sl4e.l4 == new_sl4e.l4 ) return 0; /* Nothing to do */
853 paddr = ((((paddr_t)mfn_x(sl4mfn)) << PAGE_SHIFT)
854 | (((unsigned long)sl4e) & ~PAGE_MASK));
856 if ( shadow_l4e_get_flags(new_sl4e) & _PAGE_PRESENT )
857 {
858 /* About to install a new reference */
859 mfn_t sl3mfn = shadow_l4e_get_mfn(new_sl4e);
860 ok = sh_get_ref(v, sl3mfn, paddr);
861 /* Are we pinning l3 shadows to handle wierd linux behaviour? */
862 if ( sh_type_is_pinnable(v, SH_type_l3_64_shadow) )
863 ok |= sh_pin(v, sl3mfn);
864 if ( !ok )
865 {
866 domain_crash(v->domain);
867 return SHADOW_SET_ERROR;
868 }
869 }
871 /* Write the new entry */
872 shadow_write_entries(sl4e, &new_sl4e, 1, sl4mfn);
873 flags |= SHADOW_SET_CHANGED;
875 if ( shadow_l4e_get_flags(old_sl4e) & _PAGE_PRESENT )
876 {
877 /* We lost a reference to an old mfn. */
878 mfn_t osl3mfn = shadow_l4e_get_mfn(old_sl4e);
879 if ( (mfn_x(osl3mfn) != mfn_x(shadow_l4e_get_mfn(new_sl4e)))
880 || !perms_strictly_increased(shadow_l4e_get_flags(old_sl4e),
881 shadow_l4e_get_flags(new_sl4e)) )
882 {
883 flags |= SHADOW_SET_FLUSH;
884 }
885 sh_put_ref(v, osl3mfn, paddr);
886 }
887 return flags;
888 }
890 static int shadow_set_l3e(struct vcpu *v,
891 shadow_l3e_t *sl3e,
892 shadow_l3e_t new_sl3e,
893 mfn_t sl3mfn)
894 {
895 int flags = 0;
896 shadow_l3e_t old_sl3e;
897 paddr_t paddr;
898 ASSERT(sl3e != NULL);
899 old_sl3e = *sl3e;
901 if ( old_sl3e.l3 == new_sl3e.l3 ) return 0; /* Nothing to do */
903 paddr = ((((paddr_t)mfn_x(sl3mfn)) << PAGE_SHIFT)
904 | (((unsigned long)sl3e) & ~PAGE_MASK));
906 if ( shadow_l3e_get_flags(new_sl3e) & _PAGE_PRESENT )
907 {
908 /* About to install a new reference */
909 if ( !sh_get_ref(v, shadow_l3e_get_mfn(new_sl3e), paddr) )
910 {
911 domain_crash(v->domain);
912 return SHADOW_SET_ERROR;
913 }
914 }
916 /* Write the new entry */
917 shadow_write_entries(sl3e, &new_sl3e, 1, sl3mfn);
918 flags |= SHADOW_SET_CHANGED;
920 if ( shadow_l3e_get_flags(old_sl3e) & _PAGE_PRESENT )
921 {
922 /* We lost a reference to an old mfn. */
923 mfn_t osl2mfn = shadow_l3e_get_mfn(old_sl3e);
924 if ( (mfn_x(osl2mfn) != mfn_x(shadow_l3e_get_mfn(new_sl3e))) ||
925 !perms_strictly_increased(shadow_l3e_get_flags(old_sl3e),
926 shadow_l3e_get_flags(new_sl3e)) )
927 {
928 flags |= SHADOW_SET_FLUSH;
929 }
930 sh_put_ref(v, osl2mfn, paddr);
931 }
932 return flags;
933 }
934 #endif /* GUEST_PAGING_LEVELS >= 4 */
936 static int shadow_set_l2e(struct vcpu *v,
937 shadow_l2e_t *sl2e,
938 shadow_l2e_t new_sl2e,
939 mfn_t sl2mfn)
940 {
941 int flags = 0;
942 shadow_l2e_t old_sl2e;
943 paddr_t paddr;
945 #if GUEST_PAGING_LEVELS == 2
946 /* In 2-on-3 we work with pairs of l2es pointing at two-page
947 * shadows. Reference counting and up-pointers track from the first
948 * page of the shadow to the first l2e, so make sure that we're
949 * working with those:
950 * Align the pointer down so it's pointing at the first of the pair */
951 sl2e = (shadow_l2e_t *)((unsigned long)sl2e & ~(sizeof(shadow_l2e_t)));
952 /* Align the mfn of the shadow entry too */
953 new_sl2e.l2 &= ~(1<<PAGE_SHIFT);
954 #endif
956 ASSERT(sl2e != NULL);
957 old_sl2e = *sl2e;
959 if ( old_sl2e.l2 == new_sl2e.l2 ) return 0; /* Nothing to do */
961 paddr = ((((paddr_t)mfn_x(sl2mfn)) << PAGE_SHIFT)
962 | (((unsigned long)sl2e) & ~PAGE_MASK));
964 if ( shadow_l2e_get_flags(new_sl2e) & _PAGE_PRESENT )
965 {
966 mfn_t sl1mfn = shadow_l2e_get_mfn(new_sl2e);
968 /* About to install a new reference */
969 if ( !sh_get_ref(v, sl1mfn, paddr) )
970 {
971 domain_crash(v->domain);
972 return SHADOW_SET_ERROR;
973 }
974 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
975 {
976 struct shadow_page_info *sp = mfn_to_shadow_page(sl1mfn);
977 mfn_t gl1mfn = _mfn(sp->backpointer);
979 /* If the shadow is a fl1 then the backpointer contains
980 the GFN instead of the GMFN, and it's definitely not
981 OOS. */
982 if ( (sp->type != SH_type_fl1_shadow) && mfn_valid(gl1mfn)
983 && mfn_is_out_of_sync(gl1mfn) )
984 sh_resync(v, gl1mfn);
985 }
986 #endif
987 }
989 /* Write the new entry */
990 #if GUEST_PAGING_LEVELS == 2
991 {
992 shadow_l2e_t pair[2] = { new_sl2e, new_sl2e };
993 /* The l1 shadow is two pages long and need to be pointed to by
994 * two adjacent l1es. The pair have the same flags, but point
995 * at odd and even MFNs */
996 ASSERT(!(pair[0].l2 & (1<<PAGE_SHIFT)));
997 pair[1].l2 |= (1<<PAGE_SHIFT);
998 shadow_write_entries(sl2e, &pair, 2, sl2mfn);
999 }
1000 #else /* normal case */
1001 shadow_write_entries(sl2e, &new_sl2e, 1, sl2mfn);
1002 #endif
1003 flags |= SHADOW_SET_CHANGED;
1005 if ( shadow_l2e_get_flags(old_sl2e) & _PAGE_PRESENT )
1006 {
1007 /* We lost a reference to an old mfn. */
1008 mfn_t osl1mfn = shadow_l2e_get_mfn(old_sl2e);
1009 if ( (mfn_x(osl1mfn) != mfn_x(shadow_l2e_get_mfn(new_sl2e))) ||
1010 !perms_strictly_increased(shadow_l2e_get_flags(old_sl2e),
1011 shadow_l2e_get_flags(new_sl2e)) )
1012 {
1013 flags |= SHADOW_SET_FLUSH;
1014 }
1015 sh_put_ref(v, osl1mfn, paddr);
1016 }
1017 return flags;
1018 }
1020 static inline void shadow_vram_get_l1e(shadow_l1e_t new_sl1e,
1021 shadow_l1e_t *sl1e,
1022 mfn_t sl1mfn,
1023 struct domain *d)
1024 {
1025 mfn_t mfn;
1026 unsigned long gfn;
1028 if ( !d->dirty_vram ) return;
1030 mfn = shadow_l1e_get_mfn(new_sl1e);
1032 if ( !mfn_valid(mfn) ) return; /* m2p for mmio_direct may not exist */
1034 gfn = mfn_to_gfn(d, mfn);
1036 if ( (gfn >= d->dirty_vram->begin_pfn) && (gfn < d->dirty_vram->end_pfn) ) {
1037 unsigned long i = gfn - d->dirty_vram->begin_pfn;
1038 struct page_info *page = mfn_to_page(mfn);
1039 u32 count_info = page->u.inuse.type_info & PGT_count_mask;
1041 if ( count_info == 1 )
1042 /* Initial guest reference, record it */
1043 d->dirty_vram->sl1ma[i] = pfn_to_paddr(mfn_x(sl1mfn))
1044 | ((unsigned long)sl1e & ~PAGE_MASK);
1045 }
1046 }
1048 static inline void shadow_vram_put_l1e(shadow_l1e_t old_sl1e,
1049 shadow_l1e_t *sl1e,
1050 mfn_t sl1mfn,
1051 struct domain *d)
1052 {
1053 mfn_t mfn;
1054 unsigned long gfn;
1056 if ( !d->dirty_vram ) return;
1058 mfn = shadow_l1e_get_mfn(old_sl1e);
1060 if ( !mfn_valid(mfn) ) return;
1062 gfn = mfn_to_gfn(d, mfn);
1064 if ( (gfn >= d->dirty_vram->begin_pfn) && (gfn < d->dirty_vram->end_pfn) ) {
1065 unsigned long i = gfn - d->dirty_vram->begin_pfn;
1066 struct page_info *page = mfn_to_page(mfn);
1067 u32 count_info = page->u.inuse.type_info & PGT_count_mask;
1068 int dirty = 0;
1069 paddr_t sl1ma = pfn_to_paddr(mfn_x(sl1mfn))
1070 | ((unsigned long)sl1e & ~PAGE_MASK);
1072 if ( count_info == 1 ) {
1073 /* Last reference */
1074 if ( d->dirty_vram->sl1ma[i] == INVALID_PADDR ) {
1075 /* We didn't know it was that one, let's say it is dirty */
1076 dirty = 1;
1077 } else {
1078 ASSERT(d->dirty_vram->sl1ma[i] == sl1ma);
1079 d->dirty_vram->sl1ma[i] = INVALID_PADDR;
1080 if ( shadow_l1e_get_flags(old_sl1e) & _PAGE_DIRTY )
1081 dirty = 1;
1082 }
1083 } else {
1084 /* We had more than one reference, just consider the page dirty. */
1085 dirty = 1;
1086 /* Check that it's not the one we recorded. */
1087 if ( d->dirty_vram->sl1ma[i] == sl1ma ) {
1088 /* Too bad, we remembered the wrong one... */
1089 d->dirty_vram->sl1ma[i] = INVALID_PADDR;
1090 } else {
1091 /* Ok, our recorded sl1e is still pointing to this page, let's
1092 * just hope it will remain. */
1093 }
1094 }
1095 if ( dirty ) {
1096 d->dirty_vram->dirty_bitmap[i / 8] |= 1 << (i % 8);
1097 d->dirty_vram->last_dirty = NOW();
1098 }
1099 }
1100 }
1102 static int shadow_set_l1e(struct vcpu *v,
1103 shadow_l1e_t *sl1e,
1104 shadow_l1e_t new_sl1e,
1105 mfn_t sl1mfn)
1106 {
1107 int flags = 0;
1108 struct domain *d = v->domain;
1109 shadow_l1e_t old_sl1e;
1110 #if SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC
1111 mfn_t new_gmfn = shadow_l1e_get_mfn(new_sl1e);
1112 #endif
1113 ASSERT(sl1e != NULL);
1115 #if SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC
1116 if ( mfn_valid(new_gmfn) && mfn_oos_may_write(new_gmfn)
1117 && ((shadow_l1e_get_flags(new_sl1e) & (_PAGE_RW|_PAGE_PRESENT))
1118 == (_PAGE_RW|_PAGE_PRESENT)) )
1119 oos_fixup_add(v, new_gmfn, sl1mfn, pgentry_ptr_to_slot(sl1e));
1120 #endif
1122 old_sl1e = *sl1e;
1124 if ( old_sl1e.l1 == new_sl1e.l1 ) return 0; /* Nothing to do */
1126 if ( (shadow_l1e_get_flags(new_sl1e) & _PAGE_PRESENT)
1127 && !sh_l1e_is_magic(new_sl1e) )
1128 {
1129 /* About to install a new reference */
1130 if ( shadow_mode_refcounts(d) ) {
1131 TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_SHADOW_L1_GET_REF);
1132 if ( shadow_get_page_from_l1e(new_sl1e, d) == 0 )
1133 {
1134 /* Doesn't look like a pagetable. */
1135 flags |= SHADOW_SET_ERROR;
1136 new_sl1e = shadow_l1e_empty();
1137 }
1138 else
1139 {
1140 shadow_vram_get_l1e(new_sl1e, sl1e, sl1mfn, d);
1141 }
1142 }
1143 }
1145 /* Write the new entry */
1146 shadow_write_entries(sl1e, &new_sl1e, 1, sl1mfn);
1147 flags |= SHADOW_SET_CHANGED;
1149 if ( (shadow_l1e_get_flags(old_sl1e) & _PAGE_PRESENT)
1150 && !sh_l1e_is_magic(old_sl1e) )
1151 {
1152 /* We lost a reference to an old mfn. */
1153 /* N.B. Unlike higher-level sets, never need an extra flush
1154 * when writing an l1e. Because it points to the same guest frame
1155 * as the guest l1e did, it's the guest's responsibility to
1156 * trigger a flush later. */
1157 if ( shadow_mode_refcounts(d) )
1158 {
1159 shadow_vram_put_l1e(old_sl1e, sl1e, sl1mfn, d);
1160 shadow_put_page_from_l1e(old_sl1e, d);
1161 TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_SHADOW_L1_PUT_REF);
1162 }
1163 }
1164 return flags;
1165 }
1168 /**************************************************************************/
1169 /* Macros to walk pagetables. These take the shadow of a pagetable and
1170 * walk every "interesting" entry. That is, they don't touch Xen mappings,
1171 * and for 32-bit l2s shadowed onto PAE or 64-bit, they only touch every
1172 * second entry (since pairs of entries are managed together). For multi-page
1173 * shadows they walk all pages.
1175 * Arguments are an MFN, the variable to point to each entry, a variable
1176 * to indicate that we are done (we will shortcut to the end of the scan
1177 * when _done != 0), a variable to indicate that we should avoid Xen mappings,
1178 * and the code.
1180 * WARNING: These macros have side-effects. They change the values of both
1181 * the pointer and the MFN. */
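/* A minimal usage sketch (v and sl1mfn are assumed to be in scope, as in
 * the real callers later in this file; passing 0 for _gl1p skips the
 * guest-entry pointer):
 *
 *     shadow_l1e_t *sl1e;
 *     int done = 0;
 *     SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, done,
 *     {
 *         (void) shadow_set_l1e(v, sl1e, shadow_l1e_empty(), sl1mfn);
 *     });
 *
 * Because of the side-effects noted above, keep a copy of sl1mfn if the
 * original value is still needed afterwards. */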
1183 static inline void increment_ptr_to_guest_entry(void *ptr)
1185 if ( ptr )
1187 guest_l1e_t **entry = ptr;
1188 (*entry)++;
1192 /* All kinds of l1: touch all entries */
1193 #define _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code) \
1194 do { \
1195 int _i; \
1196 shadow_l1e_t *_sp = sh_map_domain_page((_sl1mfn)); \
1197 ASSERT(mfn_to_shadow_page(_sl1mfn)->type == SH_type_l1_shadow \
1198 || mfn_to_shadow_page(_sl1mfn)->type == SH_type_fl1_shadow); \
1199 for ( _i = 0; _i < SHADOW_L1_PAGETABLE_ENTRIES; _i++ ) \
1200 { \
1201 (_sl1e) = _sp + _i; \
1202 if ( shadow_l1e_get_flags(*(_sl1e)) & _PAGE_PRESENT ) \
1203 {_code} \
1204 if ( _done ) break; \
1205 increment_ptr_to_guest_entry(_gl1p); \
1206 } \
1207 sh_unmap_domain_page(_sp); \
1208 } while (0)
1210 /* 32-bit l1, on PAE or 64-bit shadows: need to walk both pages of shadow */
1211 #if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2
1212 #define SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code) \
1213 do { \
1214 int __done = 0; \
1215 _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, \
1216 ({ (__done = _done); }), _code); \
1217 _sl1mfn = _mfn(mfn_x(_sl1mfn) + 1); \
1218 if ( !__done ) \
1219 _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, \
1220 ({ (__done = _done); }), _code); \
1221 } while (0)
1222 #else /* Everything else; l1 shadows are only one page */
1223 #define SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code) \
1224 _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code)
1225 #endif
1228 #if GUEST_PAGING_LEVELS == 2
1230 /* 32-bit l2 on PAE/64: four pages, touch every second entry, and avoid Xen */
1231 #define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _dom, _code) \
1232 do { \
1233 int _i, _j, __done = 0; \
1234 int _xen = !shadow_mode_external(_dom); \
1235 ASSERT(mfn_to_shadow_page(_sl2mfn)->type == SH_type_l2_32_shadow); \
1236 for ( _j = 0; _j < 4 && !__done; _j++ ) \
1237 { \
1238 shadow_l2e_t *_sp = sh_map_domain_page(_sl2mfn); \
1239 for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i += 2 ) \
1240 if ( (!(_xen)) \
1241 || ((_j * SHADOW_L2_PAGETABLE_ENTRIES) + _i) \
1242 < (HYPERVISOR_VIRT_START >> SHADOW_L2_PAGETABLE_SHIFT) ) \
1243 { \
1244 (_sl2e) = _sp + _i; \
1245 if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \
1246 {_code} \
1247 if ( (__done = (_done)) ) break; \
1248 increment_ptr_to_guest_entry(_gl2p); \
1249 } \
1250 sh_unmap_domain_page(_sp); \
1251 _sl2mfn = _mfn(mfn_x(_sl2mfn) + 1); \
1252 } \
1253 } while (0)
1255 #elif GUEST_PAGING_LEVELS == 3
1257 /* PAE: if it's an l2h, don't touch Xen mappings */
1258 #define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _dom, _code) \
1259 do { \
1260 int _i; \
1261 int _xen = !shadow_mode_external(_dom); \
1262 shadow_l2e_t *_sp = sh_map_domain_page((_sl2mfn)); \
1263 ASSERT(mfn_to_shadow_page(_sl2mfn)->type == SH_type_l2_pae_shadow \
1264 || mfn_to_shadow_page(_sl2mfn)->type == SH_type_l2h_pae_shadow);\
1265 for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i++ ) \
1266 if ( (!(_xen)) \
1267 || mfn_to_shadow_page(_sl2mfn)->type != SH_type_l2h_pae_shadow\
1268 || ((_i + (3 * SHADOW_L2_PAGETABLE_ENTRIES)) \
1269 < (HYPERVISOR_VIRT_START >> SHADOW_L2_PAGETABLE_SHIFT)) ) \
1270 { \
1271 (_sl2e) = _sp + _i; \
1272 if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \
1273 {_code} \
1274 if ( _done ) break; \
1275 increment_ptr_to_guest_entry(_gl2p); \
1276 } \
1277 sh_unmap_domain_page(_sp); \
1278 } while (0)
1280 #else
1282 /* 64-bit l2: touch all entries except for PAE compat guests. */
1283 #define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _dom, _code) \
1284 do { \
1285 int _i; \
1286 int _xen = !shadow_mode_external(_dom); \
1287 shadow_l2e_t *_sp = sh_map_domain_page((_sl2mfn)); \
1288 ASSERT(mfn_to_shadow_page(_sl2mfn)->type == SH_type_l2_64_shadow || \
1289 mfn_to_shadow_page(_sl2mfn)->type == SH_type_l2h_64_shadow); \
1290 for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i++ ) \
1291 { \
1292 if ( (!(_xen)) \
1293 || !is_pv_32on64_domain(_dom) \
1294 || mfn_to_shadow_page(_sl2mfn)->type != SH_type_l2h_64_shadow \
1295 || (_i < COMPAT_L2_PAGETABLE_FIRST_XEN_SLOT(_dom)) ) \
1296 { \
1297 (_sl2e) = _sp + _i; \
1298 if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \
1299 {_code} \
1300 if ( _done ) break; \
1301 increment_ptr_to_guest_entry(_gl2p); \
1302 } \
1303 } \
1304 sh_unmap_domain_page(_sp); \
1305 } while (0)
1307 #endif /* different kinds of l2 */
1309 #if GUEST_PAGING_LEVELS == 4
1311 /* 64-bit l3: touch all entries */
1312 #define SHADOW_FOREACH_L3E(_sl3mfn, _sl3e, _gl3p, _done, _code) \
1313 do { \
1314 int _i; \
1315 shadow_l3e_t *_sp = sh_map_domain_page((_sl3mfn)); \
1316 ASSERT(mfn_to_shadow_page(_sl3mfn)->type == SH_type_l3_64_shadow); \
1317 for ( _i = 0; _i < SHADOW_L3_PAGETABLE_ENTRIES; _i++ ) \
1318 { \
1319 (_sl3e) = _sp + _i; \
1320 if ( shadow_l3e_get_flags(*(_sl3e)) & _PAGE_PRESENT ) \
1321 {_code} \
1322 if ( _done ) break; \
1323 increment_ptr_to_guest_entry(_gl3p); \
1324 } \
1325 sh_unmap_domain_page(_sp); \
1326 } while (0)
1328 /* 64-bit l4: avoid Xen mappings */
1329 #define SHADOW_FOREACH_L4E(_sl4mfn, _sl4e, _gl4p, _done, _dom, _code) \
1330 do { \
1331 shadow_l4e_t *_sp = sh_map_domain_page((_sl4mfn)); \
1332 int _xen = !shadow_mode_external(_dom); \
1333 int _i; \
1334 ASSERT(mfn_to_shadow_page(_sl4mfn)->type == SH_type_l4_64_shadow); \
1335 for ( _i = 0; _i < SHADOW_L4_PAGETABLE_ENTRIES; _i++ ) \
1336 { \
1337 if ( (!(_xen)) || is_guest_l4_slot(_dom, _i) ) \
1338 { \
1339 (_sl4e) = _sp + _i; \
1340 if ( shadow_l4e_get_flags(*(_sl4e)) & _PAGE_PRESENT ) \
1341 {_code} \
1342 if ( _done ) break; \
1343 } \
1344 increment_ptr_to_guest_entry(_gl4p); \
1345 } \
1346 sh_unmap_domain_page(_sp); \
1347 } while (0)
1349 #endif
1353 /**************************************************************************/
1354 /* Functions to install Xen mappings and linear mappings in shadow pages */
1356 // XXX -- this function should probably be moved to shadow-common.c, but that
1357 // probably wants to wait until the shadow types have been moved from
1358 // shadow-types.h to shadow-private.h
1359 //
1360 #if CONFIG_PAGING_LEVELS == 4 && GUEST_PAGING_LEVELS == 4
1361 void sh_install_xen_entries_in_l4(struct vcpu *v, mfn_t gl4mfn, mfn_t sl4mfn)
1363 struct domain *d = v->domain;
1364 shadow_l4e_t *sl4e;
1366 sl4e = sh_map_domain_page(sl4mfn);
1367 ASSERT(sl4e != NULL);
1368 ASSERT(sizeof (l4_pgentry_t) == sizeof (shadow_l4e_t));
1370 /* Copy the common Xen mappings from the idle domain */
1371 memcpy(&sl4e[ROOT_PAGETABLE_FIRST_XEN_SLOT],
1372 &idle_pg_table[ROOT_PAGETABLE_FIRST_XEN_SLOT],
1373 ROOT_PAGETABLE_XEN_SLOTS * sizeof(l4_pgentry_t));
1375 /* Install the per-domain mappings for this domain */
1376 sl4e[shadow_l4_table_offset(PERDOMAIN_VIRT_START)] =
1377 shadow_l4e_from_mfn(page_to_mfn(virt_to_page(d->arch.mm_perdomain_l3)),
1378 __PAGE_HYPERVISOR);
1380 /* Shadow linear mapping for 4-level shadows. N.B. for 3-level
1381 * shadows on 64-bit xen, this linear mapping is later replaced by the
1382 * monitor pagetable structure, which is built in make_monitor_table
1383 * and maintained by sh_update_linear_entries. */
1384 sl4e[shadow_l4_table_offset(SH_LINEAR_PT_VIRT_START)] =
1385 shadow_l4e_from_mfn(sl4mfn, __PAGE_HYPERVISOR);
1387 /* Self linear mapping. */
1388 if ( shadow_mode_translate(v->domain) && !shadow_mode_external(v->domain) )
1390 // linear tables may not be used with translated PV guests
1391 sl4e[shadow_l4_table_offset(LINEAR_PT_VIRT_START)] =
1392 shadow_l4e_empty();
1394 else
1396 sl4e[shadow_l4_table_offset(LINEAR_PT_VIRT_START)] =
1397 shadow_l4e_from_mfn(gl4mfn, __PAGE_HYPERVISOR);
1400 if ( shadow_mode_translate(v->domain) )
1402 /* install domain-specific P2M table */
1403 sl4e[shadow_l4_table_offset(RO_MPT_VIRT_START)] =
1404 shadow_l4e_from_mfn(pagetable_get_mfn(d->arch.phys_table),
1405 __PAGE_HYPERVISOR);
1408 sh_unmap_domain_page(sl4e);
1410 #endif
1412 #if CONFIG_PAGING_LEVELS >= 3 && GUEST_PAGING_LEVELS >= 3
1413 // For 3-on-3 PV guests, we need to make sure the xen mappings are in
1414 // place, which means that we need to populate the l2h entry in the l3
1415 // table.
1417 static void sh_install_xen_entries_in_l2h(struct vcpu *v, mfn_t sl2hmfn)
1419 struct domain *d = v->domain;
1420 shadow_l2e_t *sl2e;
1421 #if CONFIG_PAGING_LEVELS == 3
1422 int i;
1423 #else
1425 if ( !is_pv_32on64_vcpu(v) )
1426 return;
1427 #endif
1429 sl2e = sh_map_domain_page(sl2hmfn);
1430 ASSERT(sl2e != NULL);
1431 ASSERT(sizeof (l2_pgentry_t) == sizeof (shadow_l2e_t));
1433 #if CONFIG_PAGING_LEVELS == 3
1435 /* Copy the common Xen mappings from the idle domain */
1436 memcpy(&sl2e[L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1)],
1437 &idle_pg_table_l2[L2_PAGETABLE_FIRST_XEN_SLOT],
1438 L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
1440 /* Install the per-domain mappings for this domain */
1441 for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
1442 sl2e[shadow_l2_table_offset(PERDOMAIN_VIRT_START) + i] =
1443 shadow_l2e_from_mfn(
1444 page_to_mfn(virt_to_page(d->arch.mm_perdomain_pt) + i),
1445 __PAGE_HYPERVISOR);
1447 /* We don't set up a linear mapping here because we can't until this
1448 * l2h is installed in an l3e. sh_update_linear_entries() handles
1449 * the linear mappings when CR3 (and so the fourth l3e) is loaded.
1450 * We zero them here, just as a safety measure.
1451 */
1452 for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
1453 sl2e[shadow_l2_table_offset(LINEAR_PT_VIRT_START) + i] =
1454 shadow_l2e_empty();
1455 for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
1456 sl2e[shadow_l2_table_offset(SH_LINEAR_PT_VIRT_START) + i] =
1457 shadow_l2e_empty();
1459 if ( shadow_mode_translate(d) )
1461 /* Install the domain-specific p2m table */
1462 l3_pgentry_t *p2m;
1463 ASSERT(pagetable_get_pfn(d->arch.phys_table) != 0);
1464 p2m = sh_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
1465 for ( i = 0; i < MACHPHYS_MBYTES>>1; i++ )
1467 sl2e[shadow_l2_table_offset(RO_MPT_VIRT_START) + i] =
1468 (l3e_get_flags(p2m[i]) & _PAGE_PRESENT)
1469 ? shadow_l2e_from_mfn(_mfn(l3e_get_pfn(p2m[i])),
1470 __PAGE_HYPERVISOR)
1471 : shadow_l2e_empty();
1473 sh_unmap_domain_page(p2m);
1476 #else
1478 /* Copy the common Xen mappings from the idle domain */
1479 memcpy(
1480 &sl2e[COMPAT_L2_PAGETABLE_FIRST_XEN_SLOT(d)],
1481 &compat_idle_pg_table_l2[l2_table_offset(HIRO_COMPAT_MPT_VIRT_START)],
1482 COMPAT_L2_PAGETABLE_XEN_SLOTS(d) * sizeof(*sl2e));
1484 #endif
1486 sh_unmap_domain_page(sl2e);
1488 #endif
1494 /**************************************************************************/
1495 /* Create a shadow of a given guest page.
1496 */
1497 static mfn_t
1498 sh_make_shadow(struct vcpu *v, mfn_t gmfn, u32 shadow_type)
1500 mfn_t smfn = shadow_alloc(v->domain, shadow_type, mfn_x(gmfn));
1501 SHADOW_DEBUG(MAKE_SHADOW, "(%05lx, %u)=>%05lx\n",
1502 mfn_x(gmfn), shadow_type, mfn_x(smfn));
1504 if ( shadow_type != SH_type_l2_32_shadow
1505 && shadow_type != SH_type_l2_pae_shadow
1506 && shadow_type != SH_type_l2h_pae_shadow
1507 && shadow_type != SH_type_l4_64_shadow )
1508 /* Lower-level shadow, not yet linked from a higher level */
1509 mfn_to_shadow_page(smfn)->up = 0;
1511 #if GUEST_PAGING_LEVELS == 4
1512 #if (SHADOW_OPTIMIZATIONS & SHOPT_LINUX_L3_TOPLEVEL)
1513 if ( shadow_type == SH_type_l4_64_shadow &&
1514 unlikely(v->domain->arch.paging.shadow.opt_flags & SHOPT_LINUX_L3_TOPLEVEL) )
1516 /* We're shadowing a new l4, but we've been assuming the guest uses
1517 * only one l4 per vcpu and context switches using an l4 entry.
1518 * Count the number of active l4 shadows. If there are enough
1519 * of them, decide that this isn't an old linux guest, and stop
1520 * pinning l3es. This is not very quick but it doesn't happen
1521 * very often. */
1522 struct list_head *l, *t;
1523 struct shadow_page_info *sp;
1524 struct vcpu *v2;
1525 int l4count = 0, vcpus = 0;
1526 list_for_each(l, &v->domain->arch.paging.shadow.pinned_shadows)
1528 sp = list_entry(l, struct shadow_page_info, list);
1529 if ( sp->type == SH_type_l4_64_shadow )
1530 l4count++;
1532 for_each_vcpu ( v->domain, v2 )
1533 vcpus++;
1534 if ( l4count > 2 * vcpus )
1536 /* Unpin all the pinned l3 tables, and don't pin any more. */
1537 list_for_each_safe(l, t, &v->domain->arch.paging.shadow.pinned_shadows)
1539 sp = list_entry(l, struct shadow_page_info, list);
1540 if ( sp->type == SH_type_l3_64_shadow )
1541 sh_unpin(v, shadow_page_to_mfn(sp));
1543 v->domain->arch.paging.shadow.opt_flags &= ~SHOPT_LINUX_L3_TOPLEVEL;
1546 #endif
1547 #endif
1549 // Create the Xen mappings...
1550 if ( !shadow_mode_external(v->domain) )
1552 switch (shadow_type)
1554 #if CONFIG_PAGING_LEVELS == 4 && GUEST_PAGING_LEVELS == 4
1555 case SH_type_l4_shadow:
1556 sh_install_xen_entries_in_l4(v, gmfn, smfn); break;
1557 #endif
1558 #if CONFIG_PAGING_LEVELS >= 3 && GUEST_PAGING_LEVELS >= 3
1559 case SH_type_l2h_shadow:
1560 sh_install_xen_entries_in_l2h(v, smfn); break;
1561 #endif
1562 default: /* Do nothing */ break;
1566 shadow_promote(v, gmfn, shadow_type);
1567 set_shadow_status(v, gmfn, shadow_type, smfn);
1569 return smfn;
1572 /* Make a splintered superpage shadow */
1573 static mfn_t
1574 make_fl1_shadow(struct vcpu *v, gfn_t gfn)
1576 mfn_t smfn = shadow_alloc(v->domain, SH_type_fl1_shadow,
1577 (unsigned long) gfn_x(gfn));
1579 SHADOW_DEBUG(MAKE_SHADOW, "(%" SH_PRI_gfn ")=>%" PRI_mfn "\n",
1580 gfn_x(gfn), mfn_x(smfn));
1582 set_fl1_shadow_status(v, gfn, smfn);
1583 return smfn;
1587 #if SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS
1588 mfn_t
1589 sh_make_monitor_table(struct vcpu *v)
1591 struct domain *d = v->domain;
1593 ASSERT(pagetable_get_pfn(v->arch.monitor_table) == 0);
1595 /* Guarantee we can get the memory we need */
1596 shadow_prealloc(d, SH_type_monitor_table, CONFIG_PAGING_LEVELS);
1598 #if CONFIG_PAGING_LEVELS == 4
1600 mfn_t m4mfn;
1601 m4mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1602 sh_install_xen_entries_in_l4(v, m4mfn, m4mfn);
1603 /* Remember the level of this table */
1604 mfn_to_page(m4mfn)->shadow_flags = 4;
1605 #if SHADOW_PAGING_LEVELS < 4
1607 mfn_t m3mfn, m2mfn;
1608 l4_pgentry_t *l4e;
1609 l3_pgentry_t *l3e;
1610 /* Install an l3 table and an l2 table that will hold the shadow
1611 * linear map entries. This overrides the linear map entry that
1612 * was installed by sh_install_xen_entries_in_l4. */
1613 l4e = sh_map_domain_page(m4mfn);
1615 m3mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1616 mfn_to_page(m3mfn)->shadow_flags = 3;
1617 l4e[shadow_l4_table_offset(SH_LINEAR_PT_VIRT_START)]
1618 = l4e_from_pfn(mfn_x(m3mfn), __PAGE_HYPERVISOR);
1620 m2mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1621 mfn_to_page(m2mfn)->shadow_flags = 2;
1622 l3e = sh_map_domain_page(m3mfn);
1623 l3e[0] = l3e_from_pfn(mfn_x(m2mfn), __PAGE_HYPERVISOR);
1624 sh_unmap_domain_page(l3e);
1626 if ( is_pv_32on64_vcpu(v) )
1628 /* For 32-on-64 PV guests, we need to map the 32-bit Xen
1629 * area into its usual VAs in the monitor tables */
1630 m3mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1631 mfn_to_page(m3mfn)->shadow_flags = 3;
1632 l4e[0] = l4e_from_pfn(mfn_x(m3mfn), __PAGE_HYPERVISOR);
1634 m2mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1635 mfn_to_page(m2mfn)->shadow_flags = 2;
1636 l3e = sh_map_domain_page(m3mfn);
1637 l3e[3] = l3e_from_pfn(mfn_x(m2mfn), _PAGE_PRESENT);
1638 sh_install_xen_entries_in_l2h(v, m2mfn);
1639 sh_unmap_domain_page(l3e);
1642 sh_unmap_domain_page(l4e);
1644 #endif /* SHADOW_PAGING_LEVELS < 4 */
1645 return m4mfn;
1648 #elif CONFIG_PAGING_LEVELS == 3
1651 mfn_t m3mfn, m2mfn;
1652 l3_pgentry_t *l3e;
1653 l2_pgentry_t *l2e;
1654 int i;
1656 m3mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1657 /* Remember the level of this table */
1658 mfn_to_page(m3mfn)->shadow_flags = 3;
1660 // Install a monitor l2 table in slot 3 of the l3 table.
1661 // This is used for all Xen entries, including linear maps
1662 m2mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1663 mfn_to_page(m2mfn)->shadow_flags = 2;
1664 l3e = sh_map_domain_page(m3mfn);
1665 l3e[3] = l3e_from_pfn(mfn_x(m2mfn), _PAGE_PRESENT);
1666 sh_install_xen_entries_in_l2h(v, m2mfn);
1667 /* Install the monitor's own linear map */
1668 l2e = sh_map_domain_page(m2mfn);
1669 for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
1670 l2e[l2_table_offset(LINEAR_PT_VIRT_START) + i] =
1671 (l3e_get_flags(l3e[i]) & _PAGE_PRESENT)
1672 ? l2e_from_pfn(l3e_get_pfn(l3e[i]), __PAGE_HYPERVISOR)
1673 : l2e_empty();
1674 sh_unmap_domain_page(l2e);
1675 sh_unmap_domain_page(l3e);
1677 SHADOW_PRINTK("new monitor table: %#lx\n", mfn_x(m3mfn));
1678 return m3mfn;
1681 #else
1682 #error this should not happen
1683 #endif /* CONFIG_PAGING_LEVELS */
1685 #endif /* SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS */
1687 /**************************************************************************/
1688 /* These functions also take a virtual address and return the level-N
1689 * shadow table mfn and entry, but they create the shadow pagetables if
1690 * they are needed. The "demand" argument is non-zero when handling
1691 * a demand fault (so we know what to do about accessed bits &c).
1692 * If the necessary tables are not present in the guest, they return NULL. */
1694 /* N.B. The use of GUEST_PAGING_LEVELS here is correct. If the shadow has
1695 * more levels than the guest, the upper levels are always fixed and do not
1696 * reflect any information from the guest, so we do not use these functions
1697 * to access them. */
1699 #if GUEST_PAGING_LEVELS >= 4
1700 static shadow_l4e_t * shadow_get_and_create_l4e(struct vcpu *v,
1701 walk_t *gw,
1702 mfn_t *sl4mfn)
1704 /* There is always a shadow of the top level table. Get it. */
1705 *sl4mfn = pagetable_get_mfn(v->arch.shadow_table[0]);
1706 /* Reading the top level table is always valid. */
1707 return sh_linear_l4_table(v) + shadow_l4_linear_offset(gw->va);
1710 static shadow_l3e_t * shadow_get_and_create_l3e(struct vcpu *v,
1711 walk_t *gw,
1712 mfn_t *sl3mfn,
1713 fetch_type_t ft,
1714 int *resync)
1716 mfn_t sl4mfn;
1717 shadow_l4e_t *sl4e;
1718 if ( !mfn_valid(gw->l3mfn) ) return NULL; /* No guest page. */
1719 /* Get the l4e */
1720 sl4e = shadow_get_and_create_l4e(v, gw, &sl4mfn);
1721 ASSERT(sl4e != NULL);
1722 if ( shadow_l4e_get_flags(*sl4e) & _PAGE_PRESENT )
1724 *sl3mfn = shadow_l4e_get_mfn(*sl4e);
1725 ASSERT(mfn_valid(*sl3mfn));
1727 else
1729 int r;
1730 shadow_l4e_t new_sl4e;
1731 /* No l3 shadow installed: find and install it. */
1732 *sl3mfn = get_shadow_status(v, gw->l3mfn, SH_type_l3_shadow);
1733 if ( !mfn_valid(*sl3mfn) )
1735 /* No l3 shadow of this page exists at all: make one. */
1736 *sl3mfn = sh_make_shadow(v, gw->l3mfn, SH_type_l3_shadow);
1738 /* Install the new sl3 table in the sl4e */
1739 l4e_propagate_from_guest(v, gw->l4e, *sl3mfn, &new_sl4e, ft);
1740 r = shadow_set_l4e(v, sl4e, new_sl4e, sl4mfn);
1741 ASSERT((r & SHADOW_SET_FLUSH) == 0);
1742 if ( r & SHADOW_SET_ERROR )
1743 return NULL;
1745 #if (SHADOW_OPTIMIZATIONS && SHOPT_OUT_OF_SYNC )
1746 *resync |= 1;
1747 #endif
1750 /* Now follow it down a level. Guaranteed to succeed. */
1751 return sh_linear_l3_table(v) + shadow_l3_linear_offset(gw->va);
1753 #endif /* GUEST_PAGING_LEVELS >= 4 */
1756 static shadow_l2e_t * shadow_get_and_create_l2e(struct vcpu *v,
1757 walk_t *gw,
1758 mfn_t *sl2mfn,
1759 fetch_type_t ft,
1760 int *resync)
1762 #if GUEST_PAGING_LEVELS >= 4 /* 64bit... */
1763 mfn_t sl3mfn = _mfn(INVALID_MFN);
1764 shadow_l3e_t *sl3e;
1765 if ( !mfn_valid(gw->l2mfn) ) return NULL; /* No guest page. */
1766 /* Get the l3e */
1767 sl3e = shadow_get_and_create_l3e(v, gw, &sl3mfn, ft, resync);
1768 if ( sl3e == NULL ) return NULL;
1769 if ( shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT )
1771 *sl2mfn = shadow_l3e_get_mfn(*sl3e);
1772 ASSERT(mfn_valid(*sl2mfn));
1774 else
1776 int r;
1777 shadow_l3e_t new_sl3e;
1778 unsigned int t = SH_type_l2_shadow;
1780 /* Tag compat L2 containing hypervisor (m2p) mappings */
1781 if ( is_pv_32on64_domain(v->domain) &&
1782 guest_l4_table_offset(gw->va) == 0 &&
1783 guest_l3_table_offset(gw->va) == 3 )
1784 t = SH_type_l2h_shadow;
1786 /* No l2 shadow installed: find and install it. */
1787 *sl2mfn = get_shadow_status(v, gw->l2mfn, t);
1788 if ( !mfn_valid(*sl2mfn) )
1790 /* No l2 shadow of this page exists at all: make one. */
1791 *sl2mfn = sh_make_shadow(v, gw->l2mfn, t);
1793 /* Install the new sl2 table in the sl3e */
1794 l3e_propagate_from_guest(v, gw->l3e, *sl2mfn, &new_sl3e, ft);
1795 r = shadow_set_l3e(v, sl3e, new_sl3e, sl3mfn);
1796 ASSERT((r & SHADOW_SET_FLUSH) == 0);
1797 if ( r & SHADOW_SET_ERROR )
1798 return NULL;
1800 #if (SHADOW_OPTIMIZATIONS && SHOPT_OUT_OF_SYNC )
1801 *resync |= 1;
1802 #endif
1805 /* Now follow it down a level. Guaranteed to succeed. */
1806 return sh_linear_l2_table(v) + shadow_l2_linear_offset(gw->va);
1807 #elif GUEST_PAGING_LEVELS == 3 /* PAE... */
1808 /* We never demand-shadow PAE l3es: they are only created in
1809 * sh_update_cr3(). Check if the relevant sl3e is present. */
1810 shadow_l3e_t *sl3e = ((shadow_l3e_t *)&v->arch.paging.shadow.l3table)
1811 + shadow_l3_linear_offset(gw->va);
1812 if ( !(shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT) )
1813 return NULL;
1814 *sl2mfn = shadow_l3e_get_mfn(*sl3e);
1815 ASSERT(mfn_valid(*sl2mfn));
1816 return sh_linear_l2_table(v) + shadow_l2_linear_offset(gw->va);
1817 #else /* 32bit... */
1818 /* There is always a shadow of the top level table. Get it. */
1819 *sl2mfn = pagetable_get_mfn(v->arch.shadow_table[0]);
1820 /* This next line is important: the guest l2 has a 16k
1821 * shadow, and we need to return the right mfn of the four. This
1822 * call will set it for us as a side-effect. */
1823 (void) shadow_l2_index(sl2mfn, guest_l2_table_offset(gw->va));
1824 /* Reading the top level table is always valid. */
1825 return sh_linear_l2_table(v) + shadow_l2_linear_offset(gw->va);
1826 #endif
1830 static shadow_l1e_t * shadow_get_and_create_l1e(struct vcpu *v,
1831 walk_t *gw,
1832 mfn_t *sl1mfn,
1833 fetch_type_t ft)
1835 mfn_t sl2mfn;
1836 int resync = 0;
1837 shadow_l2e_t *sl2e;
1839 /* Get the l2e */
1840 sl2e = shadow_get_and_create_l2e(v, gw, &sl2mfn, ft, &resync);
1841 if ( sl2e == NULL ) return NULL;
1843 /* Install the sl1 in the l2e if it wasn't there or if we need to
1844 * re-do it to fix a PSE dirty bit. */
1845 if ( shadow_l2e_get_flags(*sl2e) & _PAGE_PRESENT
1846 && likely(ft != ft_demand_write
1847 || (shadow_l2e_get_flags(*sl2e) & _PAGE_RW)
1848 || !(guest_l2e_get_flags(gw->l2e) & _PAGE_PSE)) )
1850 *sl1mfn = shadow_l2e_get_mfn(*sl2e);
1851 ASSERT(mfn_valid(*sl1mfn));
1853 else
1855 shadow_l2e_t new_sl2e;
1856 int r, flags = guest_l2e_get_flags(gw->l2e);
1857 /* No l1 shadow installed: find and install it. */
1858 if ( !(flags & _PAGE_PRESENT) )
1859 return NULL; /* No guest page. */
1860 if ( guest_supports_superpages(v) && (flags & _PAGE_PSE) )
1862 /* Splintering a superpage */
1863 gfn_t l2gfn = guest_l2e_get_gfn(gw->l2e);
1864 *sl1mfn = get_fl1_shadow_status(v, l2gfn);
1865 if ( !mfn_valid(*sl1mfn) )
1867 /* No fl1 shadow of this superpage exists at all: make one. */
1868 *sl1mfn = make_fl1_shadow(v, l2gfn);
1871 else
1873 /* Shadowing an actual guest l1 table */
1874 if ( !mfn_valid(gw->l1mfn) ) return NULL; /* No guest page. */
1875 *sl1mfn = get_shadow_status(v, gw->l1mfn, SH_type_l1_shadow);
1876 if ( !mfn_valid(*sl1mfn) )
1878 /* No l1 shadow of this page exists at all: make one. */
1879 *sl1mfn = sh_make_shadow(v, gw->l1mfn, SH_type_l1_shadow);
1882 /* Install the new sl1 table in the sl2e */
1883 l2e_propagate_from_guest(v, gw->l2e, *sl1mfn, &new_sl2e, ft);
1884 r = shadow_set_l2e(v, sl2e, new_sl2e, sl2mfn);
1885 ASSERT((r & SHADOW_SET_FLUSH) == 0);
1886 if ( r & SHADOW_SET_ERROR )
1887 return NULL;
1889 /* This next line is important: in 32-on-PAE and 32-on-64 modes,
1890 * the guest l1 table has an 8k shadow, and we need to return
1891 * the right mfn of the pair. This call will set it for us as a
1892 * side-effect. (In all other cases, it's a no-op and will be
1893 * compiled out.) */
1894 (void) shadow_l1_index(sl1mfn, guest_l1_table_offset(gw->va));
1897 #if (SHADOW_OPTIMIZATIONS && SHOPT_OUT_OF_SYNC )
1898 /* All pages walked are now pagetables. Safe to resync pages
1899 in case level 4 or 3 shadows were set. */
1900 if ( resync )
1901 shadow_resync_all(v, 0);
1902 #endif
1904 /* Now follow it down a level. Guaranteed to succeed. */
1905 return sh_linear_l1_table(v) + shadow_l1_linear_offset(gw->va);
1910 /**************************************************************************/
1911 /* Destructors for shadow tables:
1912 * Unregister the shadow, decrement refcounts of any entries present in it,
1913 * and release the memory.
1915 * N.B. These destructors do not clear the contents of the shadows.
1916 * This allows us to delay TLB shootdowns until the page is being reused.
1917 * See shadow_alloc() and shadow_free() for how this is handled.
1918 */
1920 #if GUEST_PAGING_LEVELS >= 4
1921 void sh_destroy_l4_shadow(struct vcpu *v, mfn_t smfn)
1923 shadow_l4e_t *sl4e;
1924 u32 t = mfn_to_shadow_page(smfn)->type;
1925 mfn_t gmfn, sl4mfn;
1927 SHADOW_DEBUG(DESTROY_SHADOW,
1928 "%s(%05lx)\n", __func__, mfn_x(smfn));
1929 ASSERT(t == SH_type_l4_shadow);
1931 /* Record that the guest page isn't shadowed any more (in this type) */
1932 gmfn = _mfn(mfn_to_shadow_page(smfn)->backpointer);
1933 delete_shadow_status(v, gmfn, t, smfn);
1934 shadow_demote(v, gmfn, t);
1935 /* Decrement refcounts of all the old entries */
1936 sl4mfn = smfn;
1937 SHADOW_FOREACH_L4E(sl4mfn, sl4e, 0, 0, v->domain, {
1938 if ( shadow_l4e_get_flags(*sl4e) & _PAGE_PRESENT )
1940 sh_put_ref(v, shadow_l4e_get_mfn(*sl4e),
1941 (((paddr_t)mfn_x(sl4mfn)) << PAGE_SHIFT)
1942 | ((unsigned long)sl4e & ~PAGE_MASK));
1944 });
1946 /* Put the memory back in the pool */
1947 shadow_free(v->domain, smfn);
1950 void sh_destroy_l3_shadow(struct vcpu *v, mfn_t smfn)
1952 shadow_l3e_t *sl3e;
1953 u32 t = mfn_to_shadow_page(smfn)->type;
1954 mfn_t gmfn, sl3mfn;
1956 SHADOW_DEBUG(DESTROY_SHADOW,
1957 "%s(%05lx)\n", __func__, mfn_x(smfn));
1958 ASSERT(t == SH_type_l3_shadow);
1960 /* Record that the guest page isn't shadowed any more (in this type) */
1961 gmfn = _mfn(mfn_to_shadow_page(smfn)->backpointer);
1962 delete_shadow_status(v, gmfn, t, smfn);
1963 shadow_demote(v, gmfn, t);
1965 /* Decrement refcounts of all the old entries */
1966 sl3mfn = smfn;
1967 SHADOW_FOREACH_L3E(sl3mfn, sl3e, 0, 0, {
1968 if ( shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT )
1969 sh_put_ref(v, shadow_l3e_get_mfn(*sl3e),
1970 (((paddr_t)mfn_x(sl3mfn)) << PAGE_SHIFT)
1971 | ((unsigned long)sl3e & ~PAGE_MASK));
1972 });
1974 /* Put the memory back in the pool */
1975 shadow_free(v->domain, smfn);
1977 #endif /* GUEST_PAGING_LEVELS >= 4 */
1980 void sh_destroy_l2_shadow(struct vcpu *v, mfn_t smfn)
1982 shadow_l2e_t *sl2e;
1983 u32 t = mfn_to_shadow_page(smfn)->type;
1984 mfn_t gmfn, sl2mfn;
1986 SHADOW_DEBUG(DESTROY_SHADOW,
1987 "%s(%05lx)\n", __func__, mfn_x(smfn));
1989 #if GUEST_PAGING_LEVELS >= 3
1990 ASSERT(t == SH_type_l2_shadow || t == SH_type_l2h_shadow);
1991 #else
1992 ASSERT(t == SH_type_l2_shadow);
1993 #endif
1995 /* Record that the guest page isn't shadowed any more (in this type) */
1996 gmfn = _mfn(mfn_to_shadow_page(smfn)->backpointer);
1997 delete_shadow_status(v, gmfn, t, smfn);
1998 shadow_demote(v, gmfn, t);
2000 /* Decrement refcounts of all the old entries */
2001 sl2mfn = smfn;
2002 SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, 0, v->domain, {
2003 if ( shadow_l2e_get_flags(*sl2e) & _PAGE_PRESENT )
2004 sh_put_ref(v, shadow_l2e_get_mfn(*sl2e),
2005 (((paddr_t)mfn_x(sl2mfn)) << PAGE_SHIFT)
2006 | ((unsigned long)sl2e & ~PAGE_MASK));
2007 });
2009 /* Put the memory back in the pool */
2010 shadow_free(v->domain, smfn);
2013 void sh_destroy_l1_shadow(struct vcpu *v, mfn_t smfn)
2015 struct domain *d = v->domain;
2016 shadow_l1e_t *sl1e;
2017 u32 t = mfn_to_shadow_page(smfn)->type;
2019 SHADOW_DEBUG(DESTROY_SHADOW,
2020 "%s(%05lx)\n", __func__, mfn_x(smfn));
2021 ASSERT(t == SH_type_l1_shadow || t == SH_type_fl1_shadow);
2023 /* Record that the guest page isn't shadowed any more (in this type) */
2024 if ( t == SH_type_fl1_shadow )
2026 gfn_t gfn = _gfn(mfn_to_shadow_page(smfn)->backpointer);
2027 delete_fl1_shadow_status(v, gfn, smfn);
2029 else
2031 mfn_t gmfn = _mfn(mfn_to_shadow_page(smfn)->backpointer);
2032 delete_shadow_status(v, gmfn, t, smfn);
2033 shadow_demote(v, gmfn, t);
2036 if ( shadow_mode_refcounts(d) )
2038 /* Decrement refcounts of all the old entries */
2039 mfn_t sl1mfn = smfn;
2040 SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, 0, {
2041 if ( (shadow_l1e_get_flags(*sl1e) & _PAGE_PRESENT)
2042 && !sh_l1e_is_magic(*sl1e) ) {
2043 shadow_vram_put_l1e(*sl1e, sl1e, sl1mfn, d);
2044 shadow_put_page_from_l1e(*sl1e, d);
2046 });
2049 /* Put the memory back in the pool */
2050 shadow_free(v->domain, smfn);
2053 #if SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS
2054 void sh_destroy_monitor_table(struct vcpu *v, mfn_t mmfn)
2056 struct domain *d = v->domain;
2057 ASSERT(mfn_to_shadow_page(mmfn)->type == SH_type_monitor_table);
2059 #if (CONFIG_PAGING_LEVELS == 4) && (SHADOW_PAGING_LEVELS != 4)
2061 mfn_t m3mfn;
2062 l4_pgentry_t *l4e = sh_map_domain_page(mmfn);
2063 l3_pgentry_t *l3e;
2064 int linear_slot = shadow_l4_table_offset(SH_LINEAR_PT_VIRT_START);
2066 /* Need to destroy the l3 and l2 monitor pages used
2067 * for the linear map */
2068 ASSERT(l4e_get_flags(l4e[linear_slot]) & _PAGE_PRESENT);
2069 m3mfn = _mfn(l4e_get_pfn(l4e[linear_slot]));
2070 l3e = sh_map_domain_page(m3mfn);
2071 ASSERT(l3e_get_flags(l3e[0]) & _PAGE_PRESENT);
2072 shadow_free(d, _mfn(l3e_get_pfn(l3e[0])));
2073 sh_unmap_domain_page(l3e);
2074 shadow_free(d, m3mfn);
2076 if ( is_pv_32on64_vcpu(v) )
2078 /* Need to destroy the l3 and l2 monitor pages that map the
2079 * Xen VAs at 3GB-4GB */
2080 ASSERT(l4e_get_flags(l4e[0]) & _PAGE_PRESENT);
2081 m3mfn = _mfn(l4e_get_pfn(l4e[0]));
2082 l3e = sh_map_domain_page(m3mfn);
2083 ASSERT(l3e_get_flags(l3e[3]) & _PAGE_PRESENT);
2084 shadow_free(d, _mfn(l3e_get_pfn(l3e[3])));
2085 sh_unmap_domain_page(l3e);
2086 shadow_free(d, m3mfn);
2088 sh_unmap_domain_page(l4e);
2090 #elif CONFIG_PAGING_LEVELS == 3
2091 /* Need to destroy the l2 monitor page in slot 3 too */
2093 l3_pgentry_t *l3e = sh_map_domain_page(mmfn);
2094 ASSERT(l3e_get_flags(l3e[3]) & _PAGE_PRESENT);
2095 shadow_free(d, _mfn(l3e_get_pfn(l3e[3])));
2096 sh_unmap_domain_page(l3e);
2098 #endif
2100 /* Put the memory back in the pool */
2101 shadow_free(d, mmfn);
2103 #endif
2105 /**************************************************************************/
2106 /* Functions to destroy non-Xen mappings in a pagetable hierarchy.
2107 * These are called from common code when we are running out of shadow
2108 * memory, and unpinning all the top-level shadows hasn't worked.
2110 * This implementation is pretty crude and slow, but we hope that it won't
2111 * be called very often. */
2113 #if GUEST_PAGING_LEVELS == 2
2115 void sh_unhook_32b_mappings(struct vcpu *v, mfn_t sl2mfn)
2117 shadow_l2e_t *sl2e;
2118 SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, 0, v->domain, {
2119 (void) shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn);
2120 });
2123 #elif GUEST_PAGING_LEVELS == 3
2125 void sh_unhook_pae_mappings(struct vcpu *v, mfn_t sl2mfn)
2126 /* Walk a PAE l2 shadow, unhooking entries from all the subshadows */
2128 shadow_l2e_t *sl2e;
2129 SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, 0, v->domain, {
2130 (void) shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn);
2131 });
2134 #elif GUEST_PAGING_LEVELS == 4
2136 void sh_unhook_64b_mappings(struct vcpu *v, mfn_t sl4mfn)
2138 shadow_l4e_t *sl4e;
2139 SHADOW_FOREACH_L4E(sl4mfn, sl4e, 0, 0, v->domain, {
2140 (void) shadow_set_l4e(v, sl4e, shadow_l4e_empty(), sl4mfn);
2141 });
2144 #endif
2146 /**************************************************************************/
2147 /* Internal translation functions.
2148 * These functions require a pointer to the shadow entry that will be updated.
2149 */
2151 /* These functions take a new guest entry, translate it to shadow and write
2152 * the shadow entry.
2154 * They return the same bitmaps as the shadow_set_lXe() functions.
2155 */
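/* (That is, a combination of the SHADOW_SET_* flags, e.g. SHADOW_SET_ERROR
 * and SHADOW_SET_FLUSH as tested elsewhere in this file.) */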
2157 #if GUEST_PAGING_LEVELS >= 4
2158 static int validate_gl4e(struct vcpu *v, void *new_ge, mfn_t sl4mfn, void *se)
2160 shadow_l4e_t new_sl4e;
2161 guest_l4e_t new_gl4e = *(guest_l4e_t *)new_ge;
2162 shadow_l4e_t *sl4p = se;
2163 mfn_t sl3mfn = _mfn(INVALID_MFN);
2164 struct domain *d = v->domain;
2165 p2m_type_t p2mt;
2166 int result = 0;
2168 perfc_incr(shadow_validate_gl4e_calls);
2170 if ( guest_l4e_get_flags(new_gl4e) & _PAGE_PRESENT )
2172 gfn_t gl3gfn = guest_l4e_get_gfn(new_gl4e);
2173 mfn_t gl3mfn = gfn_to_mfn(d, gl3gfn, &p2mt);
2174 if ( p2m_is_ram(p2mt) )
2175 sl3mfn = get_shadow_status(v, gl3mfn, SH_type_l3_shadow);
2176 else
2177 result |= SHADOW_SET_ERROR;
2179 #if (SHADOW_OPTIMIZATIONS && SHOPT_OUT_OF_SYNC )
2180 if ( mfn_valid(sl3mfn) )
2181 shadow_resync_all(v, 0);
2182 #endif
2184 l4e_propagate_from_guest(v, new_gl4e, sl3mfn, &new_sl4e, ft_prefetch);
2186 // check for updates to xen reserved slots
2187 if ( !shadow_mode_external(d) )
2189 int shadow_index = (((unsigned long)sl4p & ~PAGE_MASK) /
2190 sizeof(shadow_l4e_t));
2191 int reserved_xen_slot = !is_guest_l4_slot(d, shadow_index);
2193 if ( unlikely(reserved_xen_slot) )
2195 // attempt by the guest to write to a xen reserved slot
2196 //
2197 SHADOW_PRINTK("%s out-of-range update "
2198 "sl4mfn=%05lx index=0x%x val=%" SH_PRI_pte "\n",
2199 __func__, mfn_x(sl4mfn), shadow_index, new_sl4e.l4);
2200 if ( shadow_l4e_get_flags(new_sl4e) & _PAGE_PRESENT )
2202 SHADOW_ERROR("out-of-range l4e update\n");
2203 result |= SHADOW_SET_ERROR;
2206 // do not call shadow_set_l4e...
2207 return result;
2211 result |= shadow_set_l4e(v, sl4p, new_sl4e, sl4mfn);
2212 return result;
2216 static int validate_gl3e(struct vcpu *v, void *new_ge, mfn_t sl3mfn, void *se)
2218 shadow_l3e_t new_sl3e;
2219 guest_l3e_t new_gl3e = *(guest_l3e_t *)new_ge;
2220 shadow_l3e_t *sl3p = se;
2221 mfn_t sl2mfn = _mfn(INVALID_MFN);
2222 p2m_type_t p2mt;
2223 int result = 0;
2225 perfc_incr(shadow_validate_gl3e_calls);
2227 if ( guest_l3e_get_flags(new_gl3e) & _PAGE_PRESENT )
2229 gfn_t gl2gfn = guest_l3e_get_gfn(new_gl3e);
2230 mfn_t gl2mfn = gfn_to_mfn(v->domain, gl2gfn, &p2mt);
2231 if ( p2m_is_ram(p2mt) )
2232 sl2mfn = get_shadow_status(v, gl2mfn, SH_type_l2_shadow);
2233 else
2234 result |= SHADOW_SET_ERROR;
2236 #if (SHADOW_OPTIMIZATIONS && SHOPT_OUT_OF_SYNC )
2237 if ( mfn_valid(sl2mfn) )
2238 shadow_resync_all(v, 0);
2239 #endif
2241 l3e_propagate_from_guest(v, new_gl3e, sl2mfn, &new_sl3e, ft_prefetch);
2242 result |= shadow_set_l3e(v, sl3p, new_sl3e, sl3mfn);
2244 return result;
2246 #endif // GUEST_PAGING_LEVELS >= 4
2248 static int validate_gl2e(struct vcpu *v, void *new_ge, mfn_t sl2mfn, void *se)
2250 shadow_l2e_t new_sl2e;
2251 guest_l2e_t new_gl2e = *(guest_l2e_t *)new_ge;
2252 shadow_l2e_t *sl2p = se;
2253 mfn_t sl1mfn = _mfn(INVALID_MFN);
2254 p2m_type_t p2mt;
2255 int result = 0;
2257 perfc_incr(shadow_validate_gl2e_calls);
2259 if ( guest_l2e_get_flags(new_gl2e) & _PAGE_PRESENT )
2261 gfn_t gl1gfn = guest_l2e_get_gfn(new_gl2e);
2262 if ( guest_supports_superpages(v) &&
2263 (guest_l2e_get_flags(new_gl2e) & _PAGE_PSE) )
2265 // superpage -- need to look up the shadow L1 which holds the
2266 // splitters...
2267 sl1mfn = get_fl1_shadow_status(v, gl1gfn);
2268 #if 0
2269 // XXX - it's possible that we want to do some kind of prefetch
2270 // for superpage fl1's here, but this is *not* on the demand path,
2271 // so we'll hold off trying that for now...
2272 //
2273 if ( !mfn_valid(sl1mfn) )
2274 sl1mfn = make_fl1_shadow(v, gl1gfn);
2275 #endif
2277 else
2279 mfn_t gl1mfn = gfn_to_mfn(v->domain, gl1gfn, &p2mt);
2280 if ( p2m_is_ram(p2mt) )
2281 sl1mfn = get_shadow_status(v, gl1mfn, SH_type_l1_shadow);
2282 else
2283 result |= SHADOW_SET_ERROR;
2286 l2e_propagate_from_guest(v, new_gl2e, sl1mfn, &new_sl2e, ft_prefetch);
2288 // check for updates to xen reserved slots in PV guests...
2289 // XXX -- need to revisit this for PV 3-on-4 guests.
2290 //
2291 #if SHADOW_PAGING_LEVELS < 4
2292 #if CONFIG_PAGING_LEVELS == SHADOW_PAGING_LEVELS
2293 if ( !shadow_mode_external(v->domain) )
2295 int shadow_index = (((unsigned long)sl2p & ~PAGE_MASK) /
2296 sizeof(shadow_l2e_t));
2297 int reserved_xen_slot;
2299 #if SHADOW_PAGING_LEVELS == 3
2300 reserved_xen_slot =
2301 ((mfn_to_shadow_page(sl2mfn)->type == SH_type_l2h_pae_shadow) &&
2302 (shadow_index
2303 >= (L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1))));
2304 #else /* SHADOW_PAGING_LEVELS == 2 */
2305 reserved_xen_slot = (shadow_index >= L2_PAGETABLE_FIRST_XEN_SLOT);
2306 #endif
2308 if ( unlikely(reserved_xen_slot) )
2310 // attempt by the guest to write to a xen reserved slot
2311 //
2312 SHADOW_PRINTK("%s out-of-range update "
2313 "sl2mfn=%05lx index=0x%x val=%" SH_PRI_pte "\n",
2314 __func__, mfn_x(sl2mfn), shadow_index, new_sl2e.l2);
2315 if ( shadow_l2e_get_flags(new_sl2e) & _PAGE_PRESENT )
2317 SHADOW_ERROR("out-of-range l2e update\n");
2318 result |= SHADOW_SET_ERROR;
2321 // do not call shadow_set_l2e...
2322 return result;
2325 #endif /* CONFIG_PAGING_LEVELS == SHADOW_PAGING_LEVELS */
2326 #endif /* SHADOW_PAGING_LEVELS < 4 */
2328 result |= shadow_set_l2e(v, sl2p, new_sl2e, sl2mfn);
2330 return result;
2333 static int validate_gl1e(struct vcpu *v, void *new_ge, mfn_t sl1mfn, void *se)
2335 shadow_l1e_t new_sl1e;
2336 guest_l1e_t new_gl1e = *(guest_l1e_t *)new_ge;
2337 shadow_l1e_t *sl1p = se;
2338 gfn_t gfn;
2339 mfn_t gmfn;
2340 p2m_type_t p2mt;
2341 int result = 0;
2342 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
2343 mfn_t gl1mfn;
2344 #endif /* OOS */
2346 perfc_incr(shadow_validate_gl1e_calls);
2348 gfn = guest_l1e_get_gfn(new_gl1e);
2349 gmfn = gfn_to_mfn(v->domain, gfn, &p2mt);
2351 l1e_propagate_from_guest(v, new_gl1e, gmfn, &new_sl1e, ft_prefetch, p2mt);
2352 result |= shadow_set_l1e(v, sl1p, new_sl1e, sl1mfn);
2354 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
2355 gl1mfn = _mfn(mfn_to_shadow_page(sl1mfn)->backpointer);
2356 if ( mfn_valid(gl1mfn)
2357 && mfn_is_out_of_sync(gl1mfn) )
2359 /* Update the OOS snapshot. */
2360 mfn_t snpmfn = oos_snapshot_lookup(v, gl1mfn);
2361 guest_l1e_t *snp;
2363 ASSERT(mfn_valid(snpmfn));
2365 snp = sh_map_domain_page(snpmfn);
2366 snp[guest_index(new_ge)] = new_gl1e;
2367 sh_unmap_domain_page(snp);
2369 #endif /* OOS */
2371 return result;
2374 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
2375 /**************************************************************************/
2376 /* Special validation function for re-syncing out-of-sync shadows.
2377 * Walks the *shadow* page, and for every entry that it finds,
2378 * revalidates the guest entry that corresponds to it.
2379 * N.B. This function is called with the vcpu that unsynced the page,
2380 * *not* the one that is causing it to be resynced. */
2381 void sh_resync_l1(struct vcpu *v, mfn_t gl1mfn, mfn_t snpmfn)
2383 mfn_t sl1mfn;
2384 shadow_l1e_t *sl1p;
2385 guest_l1e_t *gl1p, *gp, *snp;
2386 int rc = 0;
2388 ASSERT(mfn_valid(snpmfn));
2390 sl1mfn = get_shadow_status(v, gl1mfn, SH_type_l1_shadow);
2391 ASSERT(mfn_valid(sl1mfn)); /* Otherwise we would not have been called */
2393 snp = sh_map_domain_page(snpmfn);
2394 gp = sh_map_domain_page(gl1mfn);
2395 gl1p = gp;
2397 SHADOW_FOREACH_L1E(sl1mfn, sl1p, &gl1p, 0, {
2398 guest_l1e_t gl1e = *gl1p;
2399 guest_l1e_t *snpl1p = (guest_l1e_t *)snp + guest_index(gl1p);
2401 if ( memcmp(snpl1p, &gl1e, sizeof(gl1e)) )
2403 gfn_t gfn;
2404 mfn_t gmfn;
2405 p2m_type_t p2mt;
2406 shadow_l1e_t nsl1e;
2408 gfn = guest_l1e_get_gfn(gl1e);
2409 gmfn = gfn_to_mfn(v->domain, gfn, &p2mt);
2410 l1e_propagate_from_guest(v, gl1e, gmfn, &nsl1e, ft_prefetch, p2mt);
2411 rc |= shadow_set_l1e(v, sl1p, nsl1e, sl1mfn);
2413 *snpl1p = gl1e;
2415 });
2417 sh_unmap_domain_page(gp);
2418 sh_unmap_domain_page(snp);
2420 /* Setting shadow L1 entries should never need us to flush the TLB */
2421 ASSERT(!(rc & SHADOW_SET_FLUSH));
2424 /* Figure out whether it's definitely safe not to sync this l1 table.
2425 * That is: if we can tell that it's only used once, and that the
2426 * toplevel shadow responsible is not one of ours.
2427 * N.B. This function is called with the vcpu that required the resync,
2428 * *not* the one that originally unsynced the page, but it is
2429 * called in the *mode* of the vcpu that unsynced it. Clear? Good. */
2430 int sh_safe_not_to_sync(struct vcpu *v, mfn_t gl1mfn)
2432 struct shadow_page_info *sp;
2433 mfn_t smfn;
2435 smfn = get_shadow_status(v, gl1mfn, SH_type_l1_shadow);
2436 ASSERT(mfn_valid(smfn)); /* Otherwise we would not have been called */
2438 /* Up to l2 */
2439 sp = mfn_to_shadow_page(smfn);
2440 if ( sp->count != 1 || !sp->up )
2441 return 0;
2442 smfn = _mfn(sp->up >> PAGE_SHIFT);
2443 ASSERT(mfn_valid(smfn));
2445 #if (SHADOW_PAGING_LEVELS == 4)
2446 /* up to l3 */
2447 sp = mfn_to_shadow_page(smfn);
2448 if ( sp->count != 1 || !sp->up )
2449 return 0;
2450 smfn = _mfn(sp->up >> PAGE_SHIFT);
2451 ASSERT(mfn_valid(smfn));
2453 /* up to l4 */
2454 sp = mfn_to_shadow_page(smfn);
2455 if ( sp->count != 1
2456 || sh_type_is_pinnable(v, SH_type_l3_64_shadow) || !sp->up )
2457 return 0;
2458 smfn = _mfn(sp->up >> PAGE_SHIFT);
2459 ASSERT(mfn_valid(smfn));
2461 #if (GUEST_PAGING_LEVELS == 2)
2462 /* In 2-on-3 shadow mode the up pointer contains the link to the
2463 * shadow page, but the shadow_table contains only the first of the
2464 * four pages that make up the PAE top shadow tables. */
2465 smfn = _mfn(mfn_x(smfn) & ~0x3UL);
2466 #endif
2468 #endif
2470 if ( pagetable_get_pfn(v->arch.shadow_table[0]) == mfn_x(smfn)
2471 #if (SHADOW_PAGING_LEVELS == 3)
2472 || pagetable_get_pfn(v->arch.shadow_table[1]) == mfn_x(smfn)
2473 || pagetable_get_pfn(v->arch.shadow_table[2]) == mfn_x(smfn)
2474 || pagetable_get_pfn(v->arch.shadow_table[3]) == mfn_x(smfn)
2475 #endif
2477 return 0;
2479 /* Only in use in one toplevel shadow, and it's not the one we're
2480 * running on */
2481 return 1;
2483 #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) */
2486 /**************************************************************************/
2487 /* Functions which translate and install the shadows of arbitrary guest
2488 * entries that we have just seen the guest write. */
2491 static inline int
2492 sh_map_and_validate(struct vcpu *v, mfn_t gmfn,
2493 void *new_gp, u32 size, u32 sh_type,
2494 u32 (*shadow_index)(mfn_t *smfn, u32 idx),
2495 int (*validate_ge)(struct vcpu *v, void *ge,
2496 mfn_t smfn, void *se))
2497 /* Generic function for mapping and validating. */
2499 mfn_t smfn, smfn2, map_mfn;
2500 shadow_l1e_t *sl1p;
2501 u32 shadow_idx, guest_idx;
2502 int result = 0;
2504 /* Align address and size to guest entry boundaries */
2505 size += (unsigned long)new_gp & (sizeof (guest_l1e_t) - 1);
2506 new_gp = (void *)((unsigned long)new_gp & ~(sizeof (guest_l1e_t) - 1));
2507 size = (size + sizeof (guest_l1e_t) - 1) & ~(sizeof (guest_l1e_t) - 1);
2508 ASSERT(size + (((unsigned long)new_gp) & ~PAGE_MASK) <= PAGE_SIZE);
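/* Worked example, assuming 8-byte guest entries: a 4-byte write starting
 * 5 bytes into an entry becomes size = 4 + 5 = 9, new_gp rounded down to
 * the start of that entry, and size = (9 + 7) & ~7 = 16, i.e. exactly the
 * two entries touched by the original write. */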
2510 /* Map the shadow page */
2511 smfn = get_shadow_status(v, gmfn, sh_type);
2512 ASSERT(mfn_valid(smfn)); /* Otherwise we would not have been called */
2513 guest_idx = guest_index(new_gp);
2514 map_mfn = smfn;
2515 shadow_idx = shadow_index(&map_mfn, guest_idx);
2516 sl1p = sh_map_domain_page(map_mfn);
2518 /* Validate one entry at a time */
2519 while ( size )
2521 smfn2 = smfn;
2522 guest_idx = guest_index(new_gp);
2523 shadow_idx = shadow_index(&smfn2, guest_idx);
2524 if ( mfn_x(smfn2) != mfn_x(map_mfn) )
2526 /* We have moved to another page of the shadow */
2527 map_mfn = smfn2;
2528 sh_unmap_domain_page(sl1p);
2529 sl1p = sh_map_domain_page(map_mfn);
2531 result |= validate_ge(v,
2532 new_gp,
2533 map_mfn,
2534 &sl1p[shadow_idx]);
2535 size -= sizeof(guest_l1e_t);
2536 new_gp += sizeof(guest_l1e_t);
2538 sh_unmap_domain_page(sl1p);
2539 return result;
2543 int
2544 sh_map_and_validate_gl4e(struct vcpu *v, mfn_t gl4mfn,
2545 void *new_gl4p, u32 size)
2547 #if GUEST_PAGING_LEVELS >= 4
2548 return sh_map_and_validate(v, gl4mfn, new_gl4p, size,
2549 SH_type_l4_shadow,
2550 shadow_l4_index,
2551 validate_gl4e);
2552 #else // ! GUEST_PAGING_LEVELS >= 4
2553 SHADOW_ERROR("called in wrong paging mode!\n");
2554 BUG();
2555 return 0;
2556 #endif
2559 int
2560 sh_map_and_validate_gl3e(struct vcpu *v, mfn_t gl3mfn,
2561 void *new_gl3p, u32 size)
2563 #if GUEST_PAGING_LEVELS >= 4
2564 return sh_map_and_validate(v, gl3mfn, new_gl3p, size,
2565 SH_type_l3_shadow,
2566 shadow_l3_index,
2567 validate_gl3e);
2568 #else // ! GUEST_PAGING_LEVELS >= 4
2569 SHADOW_ERROR("called in wrong paging mode!\n");
2570 BUG();
2571 return 0;
2572 #endif
2575 int
2576 sh_map_and_validate_gl2e(struct vcpu *v, mfn_t gl2mfn,
2577 void *new_gl2p, u32 size)
2579 return sh_map_and_validate(v, gl2mfn, new_gl2p, size,
2580 SH_type_l2_shadow,
2581 shadow_l2_index,
2582 validate_gl2e);
2585 int
2586 sh_map_and_validate_gl2he(struct vcpu *v, mfn_t gl2mfn,
2587 void *new_gl2p, u32 size)
2589 #if GUEST_PAGING_LEVELS >= 3
2590 return sh_map_and_validate(v, gl2mfn, new_gl2p, size,
2591 SH_type_l2h_shadow,
2592 shadow_l2_index,
2593 validate_gl2e);
2594 #else /* Non-PAE guests don't have different kinds of l2 table */
2595 SHADOW_ERROR("called in wrong paging mode!\n");
2596 BUG();
2597 return 0;
2598 #endif
2601 int
2602 sh_map_and_validate_gl1e(struct vcpu *v, mfn_t gl1mfn,
2603 void *new_gl1p, u32 size)
2605 return sh_map_and_validate(v, gl1mfn, new_gl1p, size,
2606 SH_type_l1_shadow,
2607 shadow_l1_index,
2608 validate_gl1e);
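/* A minimal, hypothetical caller sketch (the real dispatch from common code
 * is not shown in this file): once a guest write to a shadowed l1 page has
 * been intercepted and its new value captured, the shadow is revalidated
 * with something like
 *
 *     guest_l1e_t new_gl1e = captured_value;
 *     sh_map_and_validate_gl1e(v, gl1mfn, &new_gl1e, sizeof(new_gl1e));
 */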
2612 /**************************************************************************/
2613 /* Optimization: If we see two emulated writes of zeros to the same
2614 * page-table without another kind of page fault in between, we guess
2615 * that this is a batch of changes (for process destruction) and
2616 * unshadow the page so we don't take a pagefault on every entry. This
2617 * should also make finding writeable mappings of pagetables much
2618 * easier. */
2620 /* Look to see if this is the second emulated write in a row to this
2621 * page, and unshadow if it is */
2622 static inline void check_for_early_unshadow(struct vcpu *v, mfn_t gmfn)
2624 #if SHADOW_OPTIMIZATIONS & SHOPT_EARLY_UNSHADOW
2625 if ( v->arch.paging.shadow.last_emulated_mfn_for_unshadow == mfn_x(gmfn)
2626 && sh_mfn_is_a_page_table(gmfn) )
2628 perfc_incr(shadow_early_unshadow);
2629 sh_remove_shadows(v, gmfn, 1, 0 /* Fast, can fail to unshadow */ );
2630 TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_EARLY_UNSHADOW);
2632 v->arch.paging.shadow.last_emulated_mfn_for_unshadow = mfn_x(gmfn);
2633 #endif
2636 /* Stop counting towards early unshadows, as we've seen a real page fault */
2637 static inline void reset_early_unshadow(struct vcpu *v)
2639 #if SHADOW_OPTIMIZATIONS & SHOPT_EARLY_UNSHADOW
2640 v->arch.paging.shadow.last_emulated_mfn_for_unshadow = INVALID_MFN;
2641 #endif
2646 /**************************************************************************/
2647 /* Optimization: Prefetch multiple L1 entries. This is called after we have
2648 * demand-faulted a shadow l1e in the fault handler, to see if it's
2649 * worth fetching some more.
2650 */
2652 #if SHADOW_OPTIMIZATIONS & SHOPT_PREFETCH
2654 /* XXX magic number */
2655 #define PREFETCH_DISTANCE 32
2657 static void sh_prefetch(struct vcpu *v, walk_t *gw,
2658 shadow_l1e_t *ptr_sl1e, mfn_t sl1mfn)
2660 int i, dist;
2661 gfn_t gfn;
2662 mfn_t gmfn;
2663 guest_l1e_t *gl1p = NULL, gl1e;
2664 shadow_l1e_t sl1e;
2665 u32 gflags;
2666 p2m_type_t p2mt;
2667 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
2668 guest_l1e_t *snpl1p = NULL;
2669 #endif /* OOS */
2672 /* Prefetch no further than the end of the _shadow_ l1 MFN */
2673 dist = (PAGE_SIZE - ((unsigned long)ptr_sl1e & ~PAGE_MASK)) / sizeof sl1e;
2674 /* And no more than a maximum fetches-per-fault */
2675 if ( dist > PREFETCH_DISTANCE )
2676 dist = PREFETCH_DISTANCE;
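/* E.g. with 8-byte shadow entries (PAE or 64-bit shadows) and ptr_sl1e at
 * offset 0xf80 within its page, dist = (0x1000 - 0xf80) / 8 = 16, already
 * below the cap of 32. */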
2678 if ( mfn_valid(gw->l1mfn) )
2680 /* Normal guest page; grab the next guest entry */
2681 gl1p = sh_map_domain_page(gw->l1mfn);
2682 gl1p += guest_l1_table_offset(gw->va);
2684 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
2685 if ( mfn_is_out_of_sync(gw->l1mfn) )
2687 mfn_t snpmfn = oos_snapshot_lookup(v, gw->l1mfn);
2689 ASSERT(mfn_valid(snpmfn));
2690 snpl1p = sh_map_domain_page(snpmfn);
2691 snpl1p += guest_l1_table_offset(gw->va);
2693 #endif /* OOS */
2696 for ( i = 1; i < dist ; i++ )
2698 /* No point in prefetching if there's already a shadow */
2699 if ( ptr_sl1e[i].l1 != 0 )
2700 break;
2702 if ( mfn_valid(gw->l1mfn) )
2704 /* Normal guest page; grab the next guest entry */
2705 gl1e = gl1p[i];
2706 /* Not worth continuing if we hit an entry that will need another
2707 * fault for A/D-bit propagation anyway */
2708 gflags = guest_l1e_get_flags(gl1e);
2709 if ( (gflags & _PAGE_PRESENT)
2710 && (!(gflags & _PAGE_ACCESSED)
2711 || ((gflags & _PAGE_RW) && !(gflags & _PAGE_DIRTY))) )
2712 break;
2714 else
2716 /* Fragmented superpage, unless we've been called wrongly */
2717 ASSERT(guest_l2e_get_flags(gw->l2e) & _PAGE_PSE);
2718 /* Increment the l1e's GFN by the right number of guest pages */
2719 gl1e = guest_l1e_from_gfn(
2720 _gfn(gfn_x(guest_l1e_get_gfn(gw->l1e)) + i),
2721 guest_l1e_get_flags(gw->l1e));
2724 /* Look at the gfn that the l1e is pointing at */
2725 gfn = guest_l1e_get_gfn(gl1e);
2726 gmfn = gfn_to_mfn(v->domain, gfn, &p2mt);
2728 /* Propagate the entry. */
2729 l1e_propagate_from_guest(v, gl1e, gmfn, &sl1e, ft_prefetch, p2mt);
2730 (void) shadow_set_l1e(v, ptr_sl1e + i, sl1e, sl1mfn);
2732 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
2733 if ( snpl1p != NULL )
2734 snpl1p[i] = gl1e;
2735 #endif /* OOS */
2737 if ( gl1p != NULL )
2738 sh_unmap_domain_page(gl1p);
2739 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
2740 if ( snpl1p != NULL )
2741 sh_unmap_domain_page(snpl1p);
2742 #endif /* OOS */
2745 #endif /* SHADOW_OPTIMIZATIONS & SHOPT_PREFETCH */
2747 #if GUEST_PAGING_LEVELS == 4
2748 typedef u64 guest_va_t;
2749 typedef u64 guest_pa_t;
2750 #elif GUEST_PAGING_LEVELS == 3
2751 typedef u32 guest_va_t;
2752 typedef u64 guest_pa_t;
2753 #else
2754 typedef u32 guest_va_t;
2755 typedef u32 guest_pa_t;
2756 #endif
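/* (PAE guests have 32-bit virtual addresses but more than 32 bits of
 * physical address space, hence the mixed widths in the middle case.) */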
2758 static inline void trace_shadow_gen(u32 event, guest_va_t va)
2760 if ( tb_init_done )
2762 event |= (GUEST_PAGING_LEVELS-2)<<8;
2763 __trace_var(event, 0/*!tsc*/, sizeof(va), (unsigned char*)&va);
2767 static inline void trace_shadow_fixup(guest_l1e_t gl1e,
2768 guest_va_t va)
2770 if ( tb_init_done )
2772 struct {
2773 /* for PAE, guest_l1e may be 64 while guest_va may be 32;
2774 so put it first for alignment's sake. */
2775 guest_l1e_t gl1e;
2776 guest_va_t va;
2777 u32 flags;
2778 } __attribute__((packed)) d;
2779 u32 event;
2781 event = TRC_SHADOW_FIXUP | ((GUEST_PAGING_LEVELS-2)<<8);
2783 d.gl1e = gl1e;
2784 d.va = va;
2785 d.flags = this_cpu(trace_shadow_path_flags);
2787 __trace_var(event, 0/*!tsc*/, sizeof(d), (unsigned char*)&d);
2791 static inline void trace_not_shadow_fault(guest_l1e_t gl1e,
2792 guest_va_t va)
2794 if ( tb_init_done )
2796 struct {
2797 /* for PAE, guest_l1e may be 64 while guest_va may be 32;
2798 so put it first for alignment's sake. */
2799 guest_l1e_t gl1e;
2800 guest_va_t va;
2801 u32 flags;
2802 } __attribute__((packed)) d;
2803 u32 event;
2805 event = TRC_SHADOW_NOT_SHADOW | ((GUEST_PAGING_LEVELS-2)<<8);
2807 d.gl1e = gl1e;
2808 d.va = va;
2809 d.flags = this_cpu(trace_shadow_path_flags);
2811 __trace_var(event, 0/*!tsc*/, sizeof(d), (unsigned char*)&d);
2815 static inline void trace_shadow_emulate_other(u32 event,
2816 guest_va_t va,
2817 gfn_t gfn)
2819 if ( tb_init_done )
2821 struct {
2822 /* for PAE, guest_l1e may be 64 while guest_va may be 32;
2823 so put it first for alignment's sake. */
2824 #if GUEST_PAGING_LEVELS == 2
2825 u32 gfn;
2826 #else
2827 u64 gfn;
2828 #endif
2829 guest_va_t va;
2830 } __attribute__((packed)) d;
2832 event |= ((GUEST_PAGING_LEVELS-2)<<8);
2834 d.gfn=gfn_x(gfn);
2835 d.va = va;
2837 __trace_var(event, 0/*!tsc*/, sizeof(d), (unsigned char*)&d);
2841 #if GUEST_PAGING_LEVELS == 3
2842 static DEFINE_PER_CPU(guest_va_t,trace_emulate_initial_va);
2843 static DEFINE_PER_CPU(int,trace_extra_emulation_count);
2844 #endif
2845 static DEFINE_PER_CPU(guest_pa_t,trace_emulate_write_val);
2847 static inline void trace_shadow_emulate(guest_l1e_t gl1e, unsigned long va)
2849 if ( tb_init_done )
2851 struct {
2852 /* for PAE, guest_l1e may be 64 while guest_va may be 32;
2853 so put it first for alignment's sake. */
2854 guest_l1e_t gl1e, write_val;
2855 guest_va_t va;
2856 unsigned flags:29, emulation_count:3;
2857 } __attribute__((packed)) d;
2858 u32 event;
2860 event = TRC_SHADOW_EMULATE | ((GUEST_PAGING_LEVELS-2)<<8);
2862 d.gl1e = gl1e;
2863 d.write_val.l1 = this_cpu(trace_emulate_write_val);
2864 d.va = va;
2865 #if GUEST_PAGING_LEVELS == 3
2866 d.emulation_count = this_cpu(trace_extra_emulation_count);
2867 #endif
2868 d.flags = this_cpu(trace_shadow_path_flags);
2870 __trace_var(event, 0/*!tsc*/, sizeof(d), (unsigned char*)&d);
2874 /**************************************************************************/
2875 /* Entry points into the shadow code */
2877 /* Called from pagefault handler in Xen, and from the HVM trap handlers
2878 * for pagefaults. Returns 1 if this fault was an artefact of the
2879 * shadow code (and the guest should retry) or 0 if it is not (and the
2880 * fault should be handled elsewhere or passed to the guest). */
2882 static int sh_page_fault(struct vcpu *v,
2883 unsigned long va,
2884 struct cpu_user_regs *regs)
2886 struct domain *d = v->domain;
2887 walk_t gw;
2888 gfn_t gfn = _gfn(0);
2889 mfn_t gmfn, sl1mfn = _mfn(0);
2890 shadow_l1e_t sl1e, *ptr_sl1e;
2891 paddr_t gpa;
2892 struct sh_emulate_ctxt emul_ctxt;
2893 struct x86_emulate_ops *emul_ops;
2894 int r;
2895 fetch_type_t ft = 0;
2896 p2m_type_t p2mt;
2897 uint32_t rc;
2898 int version;
2899 #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
2900 int fast_emul = 0;
2901 #endif
2903 SHADOW_PRINTK("d:v=%u:%u va=%#lx err=%u, rip=%lx\n",
2904 v->domain->domain_id, v->vcpu_id, va, regs->error_code,
2905 regs->eip);
2907 perfc_incr(shadow_fault);
2909 #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
2910 /* If the faulting frame was successfully emulated on the last shadow fault,
2911 * it is very likely that this fault needs the same emulation action.
2912 * In that case, try to emulate early and avoid taking the shadow lock.
2913 */
2914 if ( v->arch.paging.last_write_emul_ok
2915 && v->arch.paging.shadow.last_emulated_frame == (va >> PAGE_SHIFT) )
2917 /* Check whether the error code is exactly write-access|page-present (3);
2918 * otherwise fall back to the normal path, in case some validation is required.
2919 */
2920 if ( regs->error_code == (PFEC_write_access | PFEC_page_present) )
2922 fast_emul = 1;
2923 gmfn = _mfn(v->arch.paging.shadow.last_emulated_mfn);
2925 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
2926 /* Fall back to the slow path if we're trying to emulate
2927 writes to an out of sync page. */
2928 if ( mfn_valid(gmfn) && mfn_is_out_of_sync(gmfn) )
2930 fast_emul = 0;
2931 v->arch.paging.last_write_emul_ok = 0;
2932 goto page_fault_slow_path;
2934 #endif /* OOS */
2936 perfc_incr(shadow_fault_fast_emulate);
2937 goto early_emulation;
2939 else
2940 v->arch.paging.last_write_emul_ok = 0;
2942 #endif
2944 //
2945 // XXX: Need to think about eventually mapping superpages directly in the
2946 // shadow (when possible), as opposed to splintering them into a
2947 // bunch of 4K maps.
2948 //
2950 #if (SHADOW_OPTIMIZATIONS & SHOPT_FAST_FAULT_PATH)
2951 if ( (regs->error_code & PFEC_reserved_bit) )
2953 /* The only reasons for reserved bits to be set in shadow entries
2954 * are the two "magic" shadow_l1e entries. */
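/* (Namely the guest-not-present and MMIO markers, distinguished below by
 * sh_l1e_is_gnp() and sh_l1e_is_mmio().) */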
2955 if ( likely((__copy_from_user(&sl1e,
2956 (sh_linear_l1_table(v)
2957 + shadow_l1_linear_offset(va)),
2958 sizeof(sl1e)) == 0)
2959 && sh_l1e_is_magic(sl1e)) )
2961 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
2962 /* First, need to check that this isn't an out-of-sync
2963 * shadow l1e. If it is, we fall back to the slow path, which
2964 * will sync it up again. */
2966 shadow_l2e_t sl2e;
2967 mfn_t gl1mfn;
2968 if ( (__copy_from_user(&sl2e,
2969 (sh_linear_l2_table(v)
2970 + shadow_l2_linear_offset(va)),
2971 sizeof(sl2e)) != 0)
2972 || !(shadow_l2e_get_flags(sl2e) & _PAGE_PRESENT)
2973 || !mfn_valid(gl1mfn = _mfn(mfn_to_shadow_page(
2974 shadow_l2e_get_mfn(sl2e))->backpointer))
2975 || unlikely(mfn_is_out_of_sync(gl1mfn)) )
2977 /* Hit the slow path as if there had been no
2978 * shadow entry at all, and let it tidy up */
2979 ASSERT(regs->error_code & PFEC_page_present);
2980 regs->error_code ^= (PFEC_reserved_bit|PFEC_page_present);
2981 goto page_fault_slow_path;
2984 #endif /* SHOPT_OUT_OF_SYNC */
2986 if ( sh_l1e_is_gnp(sl1e) )
2988 /* Not-present in a guest PT: pass to the guest as
2989 * a not-present fault (by flipping two bits). */
2990 ASSERT(regs->error_code & PFEC_page_present);
2991 regs->error_code ^= (PFEC_reserved_bit|PFEC_page_present);
2992 reset_early_unshadow(v);
2993 perfc_incr(shadow_fault_fast_gnp);
2994 SHADOW_PRINTK("fast path not-present\n");
2995 trace_shadow_gen(TRC_SHADOW_FAST_PROPAGATE, va);
2996 return 0;
2998 else
3000 /* Magic MMIO marker: extract gfn for MMIO address */
3001 ASSERT(sh_l1e_is_mmio(sl1e));
3002 gpa = (((paddr_t)(gfn_x(sh_l1e_mmio_get_gfn(sl1e))))
3003 << PAGE_SHIFT)
3004 | (va & ~PAGE_MASK);
3006 perfc_incr(shadow_fault_fast_mmio);
3007 SHADOW_PRINTK("fast path mmio %#"PRIpaddr"\n", gpa);
3008 reset_early_unshadow(v);
3009 trace_shadow_gen(TRC_SHADOW_FAST_MMIO, va);
3010 return (handle_mmio_with_translation(va, gpa >> PAGE_SHIFT)
3011 ? EXCRET_fault_fixed : 0);
3013 else
3015 /* This should be exceptionally rare: another vcpu has fixed
3016 * the tables between the fault and our reading the l1e.
3017 * Retry and let the hardware give us the right fault next time. */
3018 perfc_incr(shadow_fault_fast_fail);
3019 SHADOW_PRINTK("fast path false alarm!\n");
3020 trace_shadow_gen(TRC_SHADOW_FALSE_FAST_PATH, va);
3021 return EXCRET_fault_fixed;
3025 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
3026 page_fault_slow_path:
3027 #endif
3028 #endif /* SHOPT_FAST_FAULT_PATH */
3030 /* Detect if this page fault happened while we were already in Xen
3031 * doing a shadow operation. If that happens, the only thing we can
3032 * do is let Xen's normal fault handlers try to fix it. In any case,
3033 * a diagnostic trace of the fault will be more useful than
3034 * a BUG() when we try to take the lock again. */
3035 if ( unlikely(shadow_locked_by_me(d)) )
3037 SHADOW_ERROR("Recursive shadow fault: lock was taken by %s\n",
3038 d->arch.paging.shadow.locker_function);
3039 return 0;
3042 rewalk:
3044 /* The walk is done in a lock-free style, with some sanity checks
3045 * postponed until after the shadow lock is taken. Those delayed checks
3046 * make sure that no inconsistent mapping is translated into the
3047 * shadow page tables. */
3048 version = atomic_read(&d->arch.paging.shadow.gtable_dirty_version);
3049 rmb();
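/* The version must be read before walking the guest tables, so that
 * shadow_check_gwalk() below can detect any change made while we walked. */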
3050 rc = sh_walk_guest_tables(v, va, &gw, regs->error_code);
3052 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
3053 regs->error_code &= ~PFEC_page_present;
3054 if ( !(rc & _PAGE_PRESENT) )
3055 regs->error_code |= PFEC_page_present;
3056 #endif
3058 if ( rc != 0 )
3060 perfc_incr(shadow_fault_bail_real_fault);
3061 SHADOW_PRINTK("not a shadow fault\n");
3062 reset_early_unshadow(v);
3063 goto propagate;
3066 /* It's possible that the guest has put pagetables in memory that it has
3067 * already used for some special purpose (ioreq pages, or granted pages).
3068 * If that happens we'll have killed the guest already but it's still not
3069 * safe to propagate entries out of the guest PT so get out now. */
3070 if ( unlikely(d->is_shutting_down) )
3072 SHADOW_PRINTK("guest is shutting down\n");
3073 goto propagate;
3076 /* What kind of access are we dealing with? */
3077 ft = ((regs->error_code & PFEC_write_access)
3078 ? ft_demand_write : ft_demand_read);
3080 /* What mfn is the guest trying to access? */
3081 gfn = guest_l1e_get_gfn(gw.l1e);
3082 gmfn = gfn_to_mfn(d, gfn, &p2mt);
3084 if ( shadow_mode_refcounts(d) &&
3085 (!p2m_is_valid(p2mt) || (!p2m_is_mmio(p2mt) && !mfn_valid(gmfn))) )
3087 perfc_incr(shadow_fault_bail_bad_gfn);
3088 SHADOW_PRINTK("BAD gfn=%"SH_PRI_gfn" gmfn=%"PRI_mfn"\n",
3089 gfn_x(gfn), mfn_x(gmfn));
3090 reset_early_unshadow(v);
3091 goto propagate;
3094 #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
3095 /* Remember this successful VA->GFN translation for later. */
3096 vtlb_insert(v, va >> PAGE_SHIFT, gfn_x(gfn),
3097 regs->error_code | PFEC_page_present);
3098 #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */
3100 shadow_lock(d);
3102 TRACE_CLEAR_PATH_FLAGS;
3104 rc = gw_remove_write_accesses(v, va, &gw);
3106 /* First bit set: Removed write access to a page. */
3107 if ( rc & GW_RMWR_FLUSHTLB )
3109 /* Write permission removal is also a hint that other gwalks
3110 * overlapping with this one may be inconsistent
3111 */
3112 perfc_incr(shadow_rm_write_flush_tlb);
3113 atomic_inc(&d->arch.paging.shadow.gtable_dirty_version);
3114 flush_tlb_mask(d->domain_dirty_cpumask);
3117 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
3118 /* Second bit set: Resynced a page. Re-walk needed. */
3119 if ( rc & GW_RMWR_REWALK )
3121 shadow_unlock(d);
3122 goto rewalk;
3124 #endif /* OOS */
3126 if ( !shadow_check_gwalk(v, va, &gw, version) )
3128 perfc_incr(shadow_inconsistent_gwalk);
3129 shadow_unlock(d);
3130 goto rewalk;
3133 shadow_audit_tables(v);
3134 sh_audit_gw(v, &gw);
3136 /* Make sure there is enough free shadow memory to build a chain of
3137 * shadow tables. (We never allocate a top-level shadow on this path,
3138 * only a 32b l1, pae l1, or 64b l3+2+1. Note that while
3139 * SH_type_l1_shadow isn't correct in the latter case, all page
3140 * tables are the same size there.) */
3141 shadow_prealloc(d,
3142 SH_type_l1_shadow,
3143 GUEST_PAGING_LEVELS < 4 ? 1 : GUEST_PAGING_LEVELS - 1);
3145 /* Acquire the shadow. This must happen before we figure out the rights
3146 * for the shadow entry, since we might promote a page here. */
3147 ptr_sl1e = shadow_get_and_create_l1e(v, &gw, &sl1mfn, ft);
3148 if ( unlikely(ptr_sl1e == NULL) )
3150 /* Couldn't get the sl1e! Since we know the guest entries
3151 * are OK, this can only have been caused by a failed
3152 * shadow_set_l*e(), which will have crashed the guest.
3153 * Get out of the fault handler immediately. */
3154 ASSERT(d->is_shutting_down);
3155 shadow_unlock(d);
3156 trace_shadow_gen(TRC_SHADOW_DOMF_DYING, va);
3157 return 0;
3160 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
3161 /* Always unsync when writing to L1 page tables. */
3162 if ( sh_mfn_is_a_page_table(gmfn)
3163 && ft == ft_demand_write )
3164 sh_unsync(v, gmfn);
3166 if ( unlikely(d->is_shutting_down) )
3168 /* We might end up with a crashed domain here if
3169 * sh_remove_shadows() in a previous sh_resync() call has
3170 * failed. We cannot safely continue since some page is still
3171 * OOS but not in the hash table anymore. */
3172 shadow_unlock(d);
3173 return 0;
3175 #endif /* OOS */
3177 /* Calculate the shadow entry and write it */
3178 l1e_propagate_from_guest(v, gw.l1e, gmfn, &sl1e, ft, p2mt);
3179 r = shadow_set_l1e(v, ptr_sl1e, sl1e, sl1mfn);
3181 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
3182 if ( mfn_valid(gw.l1mfn)
3183 && mfn_is_out_of_sync(gw.l1mfn) )
3185 /* Update the OOS snapshot. */
3186 mfn_t snpmfn = oos_snapshot_lookup(v, gw.l1mfn);
3187 guest_l1e_t *snp;
3189 ASSERT(mfn_valid(snpmfn));
3191 snp = sh_map_domain_page(snpmfn);
3192 snp[guest_l1_table_offset(va)] = gw.l1e;
3193 sh_unmap_domain_page(snp);
3195 #endif /* OOS */
3197 #if SHADOW_OPTIMIZATIONS & SHOPT_PREFETCH
3198 /* Prefetch some more shadow entries */
3199 sh_prefetch(v, &gw, ptr_sl1e, sl1mfn);
3200 #endif
3202 /* Need to emulate accesses to page tables */
3203 if ( sh_mfn_is_a_page_table(gmfn)
3204 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
3205 /* Unless they've been allowed to go out of sync with their
3206 shadows and we don't need to unshadow it. */
3207 && !(mfn_is_out_of_sync(gmfn)
3208 && !(regs->error_code & PFEC_user_mode))
3209 #endif
3212 if ( ft == ft_demand_write )
3214 perfc_incr(shadow_fault_emulate_write);
3215 goto emulate;
3217 else if ( shadow_mode_trap_reads(d) && ft == ft_demand_read )
3219 perfc_incr(shadow_fault_emulate_read);
3220 goto emulate;
3224 /* Need to hand off device-model MMIO to the device model */
3225 if ( p2mt == p2m_mmio_dm )
3227 gpa = guest_walk_to_gpa(&gw);
3228 goto mmio;
3231 /* Log attempts to write to read-only memory */
3232 if ( (p2mt == p2m_ram_ro) && (ft == ft_demand_write) )
3234 static unsigned long lastpage = 0;
3235 if ( xchg(&lastpage, va & PAGE_MASK) != (va & PAGE_MASK) )
3236 gdprintk(XENLOG_DEBUG, "guest attempted write to read-only memory"
3237 " page. va page=%#lx, mfn=%#lx\n",
3238 va & PAGE_MASK, mfn_x(gmfn));
3239 goto emulate_readonly; /* skip over the instruction */
3242 /* In HVM guests, we force CR0.WP always to be set, so that the
3243 * pagetables are always write-protected. If the guest thinks
3244 * CR0.WP is clear, we must emulate faulting supervisor writes to
3245 * allow the guest to write through read-only PTEs. Emulate if the
3246 * fault was a non-user write to a present page. */
3247 if ( is_hvm_domain(d)
3248 && unlikely(!hvm_wp_enabled(v))
3249 && regs->error_code == (PFEC_write_access|PFEC_page_present) )
3251 perfc_incr(shadow_fault_emulate_wp);
3252 goto emulate;
3255 perfc_incr(shadow_fault_fixed);
3256 d->arch.paging.log_dirty.fault_count++;
3257 reset_early_unshadow(v);
3259 trace_shadow_fixup(gw.l1e, va);
3260 done:
3261 sh_audit_gw(v, &gw);
3262 SHADOW_PRINTK("fixed\n");
3263 shadow_audit_tables(v);
3264 shadow_unlock(d);
3265 return EXCRET_fault_fixed;
3267 emulate:
3268 if ( !shadow_mode_refcounts(d) || !guest_mode(regs) )
3269 goto not_a_shadow_fault;
3271 /*
3272 * We do not emulate user writes. Instead we use them as a hint that the
3273 * page is no longer a page table. This behaviour differs from native, but
3274 * it seems very unlikely that any OS grants user access to page tables.
3275 */
3276 if ( (regs->error_code & PFEC_user_mode) )
3278 SHADOW_PRINTK("user-mode fault to PT, unshadowing mfn %#lx\n",
3279 mfn_x(gmfn));
3280 perfc_incr(shadow_fault_emulate_failed);
3281 sh_remove_shadows(v, gmfn, 0 /* thorough */, 1 /* must succeed */);
3282 trace_shadow_emulate_other(TRC_SHADOW_EMULATE_UNSHADOW_USER,
3283 va, gfn);
3284 goto done;
3287 /*
3288 * Write from userspace to ro-mem needs to jump here to avoid getting
3289 * caught by user-mode page-table check above.
3290 */
3291 emulate_readonly:
3292 /*
3293 * We don't need to hold the lock for the whole emulation; we will
3294 * take it again when we write to the pagetables.
3295 */
3296 sh_audit_gw(v, &gw);
3297 shadow_audit_tables(v);
3298 shadow_unlock(d);
3300 this_cpu(trace_emulate_write_val) = 0;
3302 #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
3303 early_emulation:
3304 #endif
3305 if ( is_hvm_domain(d) )
3307 /*
3308 * If we are in the middle of injecting an exception or interrupt then
3309 * we should not emulate: it is not the instruction at %eip that caused
3310 * the fault. Furthermore it is almost certainly the case the handler
3311 * stack is currently considered to be a page table, so we should
3312 * unshadow the faulting page before exiting.
3313 */
3314 if ( unlikely(hvm_event_pending(v)) )
3316 #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
3317 if ( fast_emul )
3319 perfc_incr(shadow_fault_fast_emulate_fail);
3320 v->arch.paging.last_write_emul_ok = 0;
3322 #endif
3323 gdprintk(XENLOG_DEBUG, "write to pagetable during event "
3324 "injection: cr2=%#lx, mfn=%#lx\n",
3325 va, mfn_x(gmfn));
3326 sh_remove_shadows(v, gmfn, 0 /* thorough */, 1 /* must succeed */);
3327 trace_shadow_emulate_other(TRC_SHADOW_EMULATE_UNSHADOW_EVTINJ,
3328 va, gfn);
3329 return EXCRET_fault_fixed;
3333 SHADOW_PRINTK("emulate: eip=%#lx esp=%#lx\n",
3334 (unsigned long)regs->eip, (unsigned long)regs->esp);
3336 emul_ops = shadow_init_emulation(&emul_ctxt, regs);
3338 r = x86_emulate(&emul_ctxt.ctxt, emul_ops);
3340 /*
3341 * NB. We do not unshadow on X86EMUL_EXCEPTION. It's not clear that it
3342 * would be a good unshadow hint. If we *do* decide to unshadow-on-fault
3343 * then it must be 'failable': we cannot require the unshadow to succeed.
3344 */
3345 if ( r == X86EMUL_UNHANDLEABLE )
3347 perfc_incr(shadow_fault_emulate_failed);
3348 #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
3349 if ( fast_emul )
3351 perfc_incr(shadow_fault_fast_emulate_fail);
3352 v->arch.paging.last_write_emul_ok = 0;
3354 #endif
3355 SHADOW_PRINTK("emulator failure, unshadowing mfn %#lx\n",
3356 mfn_x(gmfn));
3357 /* If this is actually a page table, then we have a bug, and need
3358 * to support more operations in the emulator. More likely,
3359 * though, this is a hint that this page should not be shadowed. */
3360 shadow_remove_all_shadows(v, gmfn);
3362 trace_shadow_emulate_other(TRC_SHADOW_EMULATE_UNSHADOW_UNHANDLED,
3363 va, gfn);
3364 goto emulate_done;
3367 #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
3368 /* Record this successful emulation as a heuristic to accelerate the
3369 * next fault on the same frame. But be careful to check that the frame
3370 * is still a page table: if the write emulation triggered an unshadow,
3371 * a re-sync with the guest page table is normally needed to recover
3372 * r/w permission, and recording the fast-emulation hint in that case
3373 * would skip the propagation and cause unexpected extra shadow
3374 * faults.
3375 */
3376 if ( (r == X86EMUL_OKAY) && sh_mfn_is_a_page_table(gmfn) )
3378 if ( !fast_emul )
3380 v->arch.paging.shadow.last_emulated_frame = va >> PAGE_SHIFT;
3381 v->arch.paging.shadow.last_emulated_mfn = mfn_x(gmfn);
3382 v->arch.paging.last_write_emul_ok = 1;
3385 else if ( fast_emul )
3386 v->arch.paging.last_write_emul_ok = 0;
3387 #endif
3389 #if GUEST_PAGING_LEVELS == 3 /* PAE guest */
3390 if ( r == X86EMUL_OKAY ) {
3391 int i, emulation_count=0;
3392 this_cpu(trace_emulate_initial_va) = va;
3393 /* Emulate up to four extra instructions in the hope of catching
3394 * the "second half" of a 64-bit pagetable write. */
3395 for ( i = 0 ; i < 4 ; i++ )
3397 shadow_continue_emulation(&emul_ctxt, regs);
3398 v->arch.paging.last_write_was_pt = 0;
3399 r = x86_emulate(&emul_ctxt.ctxt, emul_ops);
3400 if ( r == X86EMUL_OKAY )
3402 emulation_count++;
3403 if ( v->arch.paging.last_write_was_pt )
3405 perfc_incr(shadow_em_ex_pt);
3406 TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_EMULATION_2ND_PT_WRITTEN);
3407 break; /* Don't emulate past the other half of the write */
3409 else
3410 perfc_incr(shadow_em_ex_non_pt);
3412 else
3414 perfc_incr(shadow_em_ex_fail);
3415 TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_EMULATION_LAST_FAILED);
3416 break; /* Don't emulate again if we failed! */
3419 this_cpu(trace_extra_emulation_count)=emulation_count;
3421 #endif /* PAE guest */
3423 trace_shadow_emulate(gw.l1e, va);
3424 emulate_done:
3425 SHADOW_PRINTK("emulated\n");
3426 return EXCRET_fault_fixed;
3428 mmio:
3429 if ( !guest_mode(regs) )
3430 goto not_a_shadow_fault;
3431 perfc_incr(shadow_fault_mmio);
3432 sh_audit_gw(v, &gw);
3433 SHADOW_PRINTK("mmio %#"PRIpaddr"\n", gpa);
3434 shadow_audit_tables(v);
3435 reset_early_unshadow(v);
3436 shadow_unlock(d);
3437 trace_shadow_gen(TRC_SHADOW_MMIO, va);
3438 return (handle_mmio_with_translation(va, gpa >> PAGE_SHIFT)
3439 ? EXCRET_fault_fixed : 0);
3441 not_a_shadow_fault:
3442 sh_audit_gw(v, &gw);
3443 SHADOW_PRINTK("not a shadow fault\n");
3444 shadow_audit_tables(v);
3445 reset_early_unshadow(v);
3446 shadow_unlock(d);
3448 propagate:
3449 trace_not_shadow_fault(gw.l1e, va);
3451 return 0;
3455 static int
3456 sh_invlpg(struct vcpu *v, unsigned long va)
3457 /* Called when the guest requests an invlpg. Returns 1 if the invlpg
3458 * instruction should be issued on the hardware, or 0 if it's safe not
3459 * to do so. */
3461 mfn_t sl1mfn;
3462 shadow_l2e_t sl2e;
3464 perfc_incr(shadow_invlpg);
3466 #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
3467 /* No longer safe to use cached gva->gfn translations */
3468 vtlb_flush(v);
3469 #endif
3471 #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
3472 v->arch.paging.last_write_emul_ok = 0;
3473 #endif
3475 /* First check that we can safely read the shadow l2e. On SMP/PAE Linux,
3476 * as many as 6% of invlpg calls can hit an address for which we have not
3477 * yet shadowed the l2. */
3478 #if SHADOW_PAGING_LEVELS == 4
3480 shadow_l3e_t sl3e;
3481 if ( !(shadow_l4e_get_flags(
3482 sh_linear_l4_table(v)[shadow_l4_linear_offset(va)])
3483 & _PAGE_PRESENT) )
3484 return 0;
3485 /* This must still be a copy-from-user because we don't have the
3486 * shadow lock, and the higher-level shadows might disappear
3487 * under our feet. */
3488 if ( __copy_from_user(&sl3e, (sh_linear_l3_table(v)
3489 + shadow_l3_linear_offset(va)),
3490 sizeof (sl3e)) != 0 )
3492 perfc_incr(shadow_invlpg_fault);
3493 return 0;
3495 if ( !(shadow_l3e_get_flags(sl3e) & _PAGE_PRESENT) )
3496 return 0;
3498 #else /* SHADOW_PAGING_LEVELS == 3 */
3499 if ( !(l3e_get_flags(v->arch.paging.shadow.l3table[shadow_l3_linear_offset(va)])
3500 & _PAGE_PRESENT) )
3501 // no need to flush anything if there's no SL2...
3502 return 0;
3503 #endif
3505 /* This must still be a copy-from-user because we don't have the shadow
3506 * lock, and the higher-level shadows might disappear under our feet. */
3507 if ( __copy_from_user(&sl2e,
3508 sh_linear_l2_table(v) + shadow_l2_linear_offset(va),
3509 sizeof (sl2e)) != 0 )
3511 perfc_incr(shadow_invlpg_fault);
3512 return 0;
3515 // If there's nothing shadowed for this particular sl2e, then
3516 // there is no need to do an invlpg, either...
3517 //
3518 if ( !(shadow_l2e_get_flags(sl2e) & _PAGE_PRESENT) )
3519 return 0;
3521 // Check to see if the SL2 is a splintered superpage...
3522 // If so, then we'll need to flush the entire TLB (because that's
3523 // easier than invalidating all of the individual 4K pages).
3524 //
3525 sl1mfn = shadow_l2e_get_mfn(sl2e);
3526 if ( mfn_to_shadow_page(sl1mfn)->type
3527 == SH_type_fl1_shadow )
3529 flush_tlb_local();
3530 return 0;
3533 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
3534 /* Check to see if the SL1 is out of sync. */
3536 mfn_t gl1mfn = _mfn(mfn_to_shadow_page(sl1mfn)->backpointer);
3537 struct page_info *pg = mfn_to_page(gl1mfn);
3538 if ( mfn_valid(gl1mfn)
3539 && page_is_out_of_sync(pg) )
3541 /* The test above may give false positives, since we don't
3542 * hold the shadow lock yet. Check again with the lock held. */
3543 shadow_lock(v->domain);
3545 /* This must still be a copy-from-user because we didn't
3546 * have the shadow lock last time we checked, and the
3547 * higher-level shadows might have disappeared under our
3548 * feet. */
3549 if ( __copy_from_user(&sl2e,
3550 sh_linear_l2_table(v)
3551 + shadow_l2_linear_offset(va),
3552 sizeof (sl2e)) != 0 )
3554 perfc_incr(shadow_invlpg_fault);
3555 shadow_unlock(v->domain);
3556 return 0;
3559 if ( !(shadow_l2e_get_flags(sl2e) & _PAGE_PRESENT) )
3561 shadow_unlock(v->domain);
3562 return 0;
3565 sl1mfn = shadow_l2e_get_mfn(sl2e);
3566 gl1mfn = _mfn(mfn_to_shadow_page(sl1mfn)->backpointer);
3567 pg = mfn_to_page(gl1mfn);
3569 if ( likely(sh_mfn_is_a_page_table(gl1mfn)
3570 && page_is_out_of_sync(pg) ) )
3572 shadow_l1e_t *sl1;
3573 sl1 = sh_linear_l1_table(v) + shadow_l1_linear_offset(va);
3574 /* Remove the shadow entry that maps this VA */
3575 (void) shadow_set_l1e(v, sl1, shadow_l1e_empty(), sl1mfn);
3577 shadow_unlock(v->domain);
3578 /* Need the invlpg, to pick up the disappearance of the sl1e */
3579 return 1;
3582 #endif
3584 return 1;
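/* Note on the function below: pfec[0] is used as both input and output --
 * on entry it carries the access type being checked (callers in this file
 * pass e.g. PFEC_page_present|PFEC_write_access), and if the walk fails
 * because the guest l1e is not present, PFEC_page_present is cleared so
 * that the caller can inject an accurate page fault. */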
3588 static unsigned long
3589 sh_gva_to_gfn(struct vcpu *v, unsigned long va, uint32_t *pfec)
3590 /* Called to translate a guest virtual address to what the *guest*
3591 * pagetables would map it to. */
3593 walk_t gw;
3594 gfn_t gfn;
3596 #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
3597 /* Check the vTLB cache first */
3598 unsigned long vtlb_gfn = vtlb_lookup(v, va, pfec[0]);
3599 if ( VALID_GFN(vtlb_gfn) )
3600 return vtlb_gfn;
3601 #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */
3603 if ( sh_walk_guest_tables(v, va, &gw, pfec[0]) != 0 )
3605 if ( !(guest_l1e_get_flags(gw.l1e) & _PAGE_PRESENT) )
3606 pfec[0] &= ~PFEC_page_present;
3607 return INVALID_GFN;
3609 gfn = guest_walk_to_gfn(&gw);
3611 #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
3612 /* Remember this successful VA->GFN translation for later. */
3613 vtlb_insert(v, va >> PAGE_SHIFT, gfn_x(gfn), pfec[0]);
3614 #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */
3616 return gfn_x(gfn);
3620 static inline void
3621 sh_update_linear_entries(struct vcpu *v)
3622 /* Sync up all the linear mappings for this vcpu's pagetables */
3624 struct domain *d = v->domain;
3626 /* Linear pagetables in PV guests
3627 * ------------------------------
3629 * Guest linear pagetables, which map the guest pages, are at
3630 * LINEAR_PT_VIRT_START. Shadow linear pagetables, which map the
3631 * shadows, are at SH_LINEAR_PT_VIRT_START. Most of the time these
3632 * are set up at shadow creation time, but (of course!) the PAE case
3633 * is subtler. Normal linear mappings are made by having an entry
3634 * in the top-level table that points to itself (shadow linear) or
3635 * to the guest top-level table (guest linear). For PAE, to set up
3636 * a linear map requires us to copy the four top-level entries into
3637 * level-2 entries. That means that every time we change a PAE l3e,
3638 * we need to reflect the change into the copy.
3640 * Linear pagetables in HVM guests
3641 * -------------------------------
3643 * For HVM guests, the linear pagetables are installed in the monitor
3644 * tables (since we can't put them in the shadow). Shadow linear
3645 * pagetables, which map the shadows, are at SH_LINEAR_PT_VIRT_START,
3646 * and we use the linear pagetable slot at LINEAR_PT_VIRT_START for
3647 * a linear pagetable of the monitor tables themselves. We have
3648 * the same issue of having to re-copy PAE l3 entries whenever we use
3649 * PAE shadows.
3651 * Because HVM guests run on the same monitor tables regardless of the
3652 * shadow tables in use, the linear mapping of the shadow tables has to
3653 * be updated every time v->arch.shadow_table changes.
3654 */
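/* As a small illustration of the linear maps described above (using
 * names that already appear in this file): once the self-referencing
 * top-level slot is in place, the l1e covering a virtual address va can
 * be read directly, e.g.
 *
 *     l1_pgentry_t l1e = __linear_l1_table[l1_linear_offset(va)];
 *
 * and sh_linear_l2_table(v) + shadow_l2_linear_offset(va) is how
 * sh_invlpg above reads shadow l2es through the shadow-linear map. */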
3656 /* Don't try to update the monitor table if it doesn't exist */
3657 if ( shadow_mode_external(d)
3658 && pagetable_get_pfn(v->arch.monitor_table) == 0 )
3659 return;
3661 #if (CONFIG_PAGING_LEVELS == 4) && (SHADOW_PAGING_LEVELS == 4)
3663 /* For PV, one l4e points at the guest l4, one points at the shadow
3664 * l4. No maintenance required.
3665 * For HVM, just need to update the l4e that points to the shadow l4. */
3667 if ( shadow_mode_external(d) )
3669 /* Use the linear map if we can; otherwise make a new mapping */
3670 if ( v == current )
3672 __linear_l4_table[l4_linear_offset(SH_LINEAR_PT_VIRT_START)] =
3673 l4e_from_pfn(pagetable_get_pfn(v->arch.shadow_table[0]),
3674 __PAGE_HYPERVISOR);
3676 else
3678 l4_pgentry_t *ml4e;
3679 ml4e = sh_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
3680 ml4e[l4_table_offset(SH_LINEAR_PT_VIRT_START)] =
3681 l4e_from_pfn(pagetable_get_pfn(v->arch.shadow_table[0]),
3682 __PAGE_HYPERVISOR);
3683 sh_unmap_domain_page(ml4e);
3687 #elif (CONFIG_PAGING_LEVELS == 4) && (SHADOW_PAGING_LEVELS == 3)
3689 /* PV: XXX
3691 * HVM: To give ourselves a linear map of the shadows, we need to
3692 * extend a PAE shadow to 4 levels. We do this by having a monitor
3693 * l3 in slot 0 of the monitor l4 table, and copying the PAE l3
3694 * entries into it. Then, by having the monitor l4e for shadow
3695 * pagetables also point to the monitor l4, we can use it to access
3696 * the shadows.
3697 */
3699 if ( shadow_mode_external(d) )
3701 /* Install copies of the shadow l3es into the monitor l2 table
3702 * that maps SH_LINEAR_PT_VIRT_START. */
3703 shadow_l3e_t *sl3e;
3704 l2_pgentry_t *ml2e;
3705 int i;
3707 /* Use linear mappings if we can; otherwise make new mappings */
3708 if ( v == current )
3709 ml2e = __linear_l2_table
3710 + l2_linear_offset(SH_LINEAR_PT_VIRT_START);
3711 else
3713 mfn_t l3mfn, l2mfn;
3714 l4_pgentry_t *ml4e;
3715 l3_pgentry_t *ml3e;
3716 int linear_slot = shadow_l4_table_offset(SH_LINEAR_PT_VIRT_START);
3717 ml4e = sh_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
3719 ASSERT(l4e_get_flags(ml4e[linear_slot]) & _PAGE_PRESENT);
3720 l3mfn = _mfn(l4e_get_pfn(ml4e[linear_slot]));
3721 ml3e = sh_map_domain_page(l3mfn);
3722 sh_unmap_domain_page(ml4e);
3724 ASSERT(l3e_get_flags(ml3e[0]) & _PAGE_PRESENT);
3725 l2mfn = _mfn(l3e_get_pfn(ml3e[0]));
3726 ml2e = sh_map_domain_page(l2mfn);
3727 sh_unmap_domain_page(ml3e);
3730 /* Shadow l3 tables are made up by sh_update_cr3 */
3731 sl3e = v->arch.paging.shadow.l3table;
3733 for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
3735 ml2e[i] =
3736 (shadow_l3e_get_flags(sl3e[i]) & _PAGE_PRESENT)
3737 ? l2e_from_pfn(mfn_x(shadow_l3e_get_mfn(sl3e[i])),
3738 __PAGE_HYPERVISOR)
3739 : l2e_empty();
3742 if ( v != current )
3743 sh_unmap_domain_page(ml2e);
3745 else
3746 domain_crash(d); /* XXX */
3748 #elif CONFIG_PAGING_LEVELS == 3
3750 /* PV: need to copy the guest's l3 entries into the guest-linear-map l2
3751 * entries in the shadow, and the shadow's l3 entries into the
3752 * shadow-linear-map l2 entries in the shadow. This is safe to do
3753 * because Xen does not let guests share high-slot l2 tables between l3s,
3754 * so we know we're not treading on anyone's toes.
3756 * HVM: need to copy the shadow's l3 entries into the
3757 * shadow-linear-map l2 entries in the monitor table. This is safe
3758 * because we have one monitor table for each vcpu. The monitor's
3759 * own l3es don't need to be copied because they never change.
3760 * XXX That might change if we start stuffing things into the rest
3761 * of the monitor's virtual address space.
3762 */
3764 l2_pgentry_t *l2e, new_l2e;
3765 shadow_l3e_t *guest_l3e = NULL, *shadow_l3e;
3766 int i;
3767 int unmap_l2e = 0;
3769 #if GUEST_PAGING_LEVELS == 2
3771 /* Shadow l3 tables were built by sh_update_cr3 */
3772 BUG_ON(!shadow_mode_external(d)); /* PV 2-on-3 is unsupported */
3773 shadow_l3e = (shadow_l3e_t *)&v->arch.paging.shadow.l3table;
3775 #else /* GUEST_PAGING_LEVELS == 3 */
3777 shadow_l3e = (shadow_l3e_t *)&v->arch.paging.shadow.l3table;
3778 guest_l3e = (guest_l3e_t *)&v->arch.paging.shadow.gl3e;
3780 #endif /* GUEST_PAGING_LEVELS */
3782 /* Choose where to write the entries, using linear maps if possible */
3783 if ( shadow_mode_external(d) )
3785 if ( v == current )
3787 /* From the monitor tables, it's safe to use linear maps
3788 * to update monitor l2s */
3789 l2e = __linear_l2_table + (3 * L2_PAGETABLE_ENTRIES);
3791 else
3793 /* Map the monitor table's high l2 */
3794 l3_pgentry_t *l3e;
3795 l3e = sh_map_domain_page(
3796 pagetable_get_mfn(v->arch.monitor_table));
3797 ASSERT(l3e_get_flags(l3e[3]) & _PAGE_PRESENT);
3798 l2e = sh_map_domain_page(_mfn(l3e_get_pfn(l3e[3])));
3799 unmap_l2e = 1;
3800 sh_unmap_domain_page(l3e);
3803 else
3805 /* Map the shadow table's high l2 */
3806 ASSERT(shadow_l3e_get_flags(shadow_l3e[3]) & _PAGE_PRESENT);
3807 l2e = sh_map_domain_page(shadow_l3e_get_mfn(shadow_l3e[3]));
3808 unmap_l2e = 1;
3811 /* Write linear mapping of guest (only in PV, and only when
3812 * not translated). */
3813 if ( !shadow_mode_translate(d) )
3815 for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
3817 new_l2e =
3818 ((shadow_l3e_get_flags(guest_l3e[i]) & _PAGE_PRESENT)
3819 ? l2e_from_pfn(mfn_x(shadow_l3e_get_mfn(guest_l3e[i])),
3820 __PAGE_HYPERVISOR)
3821 : l2e_empty());
3822 safe_write_entry(
3823 &l2e[l2_table_offset(LINEAR_PT_VIRT_START) + i],
3824 &new_l2e);
3828 /* Write linear mapping of shadow. */
3829 for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
3831 new_l2e = (shadow_l3e_get_flags(shadow_l3e[i]) & _PAGE_PRESENT)
3832 ? l2e_from_pfn(mfn_x(shadow_l3e_get_mfn(shadow_l3e[i])),
3833 __PAGE_HYPERVISOR)
3834 : l2e_empty();
3835 safe_write_entry(
3836 &l2e[l2_table_offset(SH_LINEAR_PT_VIRT_START) + i],
3837 &new_l2e);
3840 if ( unmap_l2e )
3841 sh_unmap_domain_page(l2e);
3844 #else
3845 #error this should not happen
3846 #endif
3848 if ( shadow_mode_external(d) )
3850 /*
3851 * Having modified the linear pagetable mapping, flush local host TLBs.
3852 * This was not needed when vmenter/vmexit always had the side effect
3853 * of flushing host TLBs but, with ASIDs, it is possible to finish
3854 * this CR3 update, vmenter the guest, vmexit due to a page fault,
3855 * without an intervening host TLB flush. Then the page fault code
3856 * could use the linear pagetable to read a top-level shadow page
3857 * table entry. But, without this change, it would fetch the wrong
3858 * value due to a stale TLB.
3859 */
3860 flush_tlb_local();
3865 /* Removes vcpu->arch.paging.shadow.guest_vtable and vcpu->arch.shadow_table[].
3866 * Does all appropriate management/bookkeeping/refcounting/etc...
3867 */
3868 static void
3869 sh_detach_old_tables(struct vcpu *v)
3871 mfn_t smfn;
3872 int i = 0;
3874 ////
3875 //// vcpu->arch.paging.shadow.guest_vtable
3876 ////
3878 #if GUEST_PAGING_LEVELS == 3
3879 /* PAE guests don't have a mapping of the guest top-level table */
3880 ASSERT(v->arch.paging.shadow.guest_vtable == NULL);
3881 #else
3882 if ( v->arch.paging.shadow.guest_vtable )
3884 struct domain *d = v->domain;
3885 if ( shadow_mode_external(d) || shadow_mode_translate(d) )
3886 sh_unmap_domain_page_global(v->arch.paging.shadow.guest_vtable);
3887 v->arch.paging.shadow.guest_vtable = NULL;
3889 #endif // GUEST_PAGING_LEVELS == 3
3892 ////
3893 //// vcpu->arch.shadow_table[]
3894 ////
3896 #if GUEST_PAGING_LEVELS == 3
3897 /* PAE guests have four shadow_table entries */
3898 for ( i = 0 ; i < 4 ; i++ )
3899 #endif
3901 smfn = pagetable_get_mfn(v->arch.shadow_table[i]);
3902 if ( mfn_x(smfn) )
3903 sh_put_ref(v, smfn, 0);
3904 v->arch.shadow_table[i] = pagetable_null();
3908 /* Set up the top-level shadow and install it in slot 'slot' of shadow_table */
3909 static void
3910 sh_set_toplevel_shadow(struct vcpu *v,
3911 int slot,
3912 mfn_t gmfn,
3913 unsigned int root_type)
3915 mfn_t smfn;
3916 pagetable_t old_entry, new_entry;
3918 struct domain *d = v->domain;
3920 /* Remember the old contents of this slot */
3921 old_entry = v->arch.shadow_table[slot];
3923 /* Now figure out the new contents: is this a valid guest MFN? */
3924 if ( !mfn_valid(gmfn) )
3926 new_entry = pagetable_null();
3927 goto install_new_entry;
3930 /* Guest mfn is valid: shadow it and install the shadow */
3931 smfn = get_shadow_status(v, gmfn, root_type);
3932 if ( !mfn_valid(smfn) )
3934 /* Make sure there's enough free shadow memory. */
3935 shadow_prealloc(d, root_type, 1);
3936 /* Shadow the page. */
3937 smfn = sh_make_shadow(v, gmfn, root_type);
3939 ASSERT(mfn_valid(smfn));
3941 /* Pin the shadow and put it (back) on the list of pinned shadows */
3942 if ( sh_pin(v, smfn) == 0 )
3944 SHADOW_ERROR("can't pin %#lx as toplevel shadow\n", mfn_x(smfn));
3945 domain_crash(v->domain);
3948 /* Take a ref to this page: it will be released in sh_detach_old_tables()
3949 * or the next call to set_toplevel_shadow() */
3950 if ( !sh_get_ref(v, smfn, 0) )
3952 SHADOW_ERROR("can't install %#lx as toplevel shadow\n", mfn_x(smfn));
3953 domain_crash(v->domain);
3956 new_entry = pagetable_from_mfn(smfn);
3958 install_new_entry:
3959 /* Done. Install it */
3960 SHADOW_PRINTK("%u/%u [%u] gmfn %#"PRI_mfn" smfn %#"PRI_mfn"\n",
3961 GUEST_PAGING_LEVELS, SHADOW_PAGING_LEVELS, slot,
3962 mfn_x(gmfn), mfn_x(pagetable_get_mfn(new_entry)));
3963 v->arch.shadow_table[slot] = new_entry;
3965 /* Decrement the refcount of the old contents of this slot */
3966 if ( !pagetable_is_null(old_entry) ) {
3967 mfn_t old_smfn = pagetable_get_mfn(old_entry);
3968 /* Need to repin the old toplevel shadow if it's been unpinned
3969 * by shadow_prealloc(): in PV mode we're still running on this
3970 * shadow and it's not safe to free it yet. */
3971 if ( !mfn_to_shadow_page(old_smfn)->pinned && !sh_pin(v, old_smfn) )
3973 SHADOW_ERROR("can't re-pin %#lx\n", mfn_x(old_smfn));
3974 domain_crash(v->domain);
3976 sh_put_ref(v, old_smfn, 0);
3981 static void
3982 sh_update_cr3(struct vcpu *v, int do_locking)
3983 /* Updates vcpu->arch.cr3 after the guest has changed CR3.
3984 * Paravirtual guests should set v->arch.guest_table (and guest_table_user,
3985 * if appropriate).
3986 * HVM guests should also make sure hvm_get_guest_cntl_reg(v, 3) works;
3987 * this function will call hvm_update_guest_cr(v, 3) to tell them where the
3988 * shadow tables are.
3989 * If do_locking != 0, assume we are being called from outside the
3990 * shadow code, and must take and release the shadow lock; otherwise
3991 * that is the caller's responsibility.
3992 */
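/* Summary of the steps below: resync this vcpu's out-of-sync pages, take
 * the shadow lock if asked to, re-read the guest top level (guest_vtable
 * or the cached gl3e array), revoke write access to the new guest
 * top-level page(s), install the new top-level shadow(s) via
 * sh_set_toplevel_shadow(), rebuild the PAE l3table where needed, set
 * v->arch.cr3 (and hw_cr[3] for HVM), refresh the linear mappings, flush
 * the vTLB, and finally resync other vcpus' OOS pages. */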
3994 struct domain *d = v->domain;
3995 mfn_t gmfn;
3996 #if GUEST_PAGING_LEVELS == 3
3997 guest_l3e_t *gl3e;
3998 u32 guest_idx=0;
3999 int i;
4000 #endif
4002 /* Don't do anything on an uninitialised vcpu */
4003 if ( !is_hvm_domain(d) && !v->is_initialised )
4005 ASSERT(v->arch.cr3 == 0);
4006 return;
4009 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
4010 /* Need to resync all the shadow entries on a TLB flush. Resync
4011 * current vcpus OOS pages before switching to the new shadow
4012 * tables so that the VA hint is still valid. */
4013 shadow_resync_current_vcpu(v, do_locking);
4014 #endif
4016 if ( do_locking ) shadow_lock(v->domain);
4018 ASSERT(shadow_locked_by_me(v->domain));
4019 ASSERT(v->arch.paging.mode);
4021 ////
4022 //// vcpu->arch.guest_table is already set
4023 ////
4025 #ifndef NDEBUG
4026 /* Double-check that the HVM code has sent us a sane guest_table */
4027 if ( is_hvm_domain(d) )
4029 ASSERT(shadow_mode_external(d));
4030 if ( hvm_paging_enabled(v) )
4031 ASSERT(pagetable_get_pfn(v->arch.guest_table));
4032 else
4033 ASSERT(v->arch.guest_table.pfn
4034 == d->arch.paging.shadow.unpaged_pagetable.pfn);
4036 #endif
4038 SHADOW_PRINTK("d=%u v=%u guest_table=%05lx\n",
4039 d->domain_id, v->vcpu_id,
4040 (unsigned long)pagetable_get_pfn(v->arch.guest_table));
4042 #if GUEST_PAGING_LEVELS == 4
4043 if ( !(v->arch.flags & TF_kernel_mode) && !is_pv_32on64_vcpu(v) )
4044 gmfn = pagetable_get_mfn(v->arch.guest_table_user);
4045 else
4046 #endif
4047 gmfn = pagetable_get_mfn(v->arch.guest_table);
4050 ////
4051 //// vcpu->arch.paging.shadow.guest_vtable
4052 ////
4053 #if GUEST_PAGING_LEVELS == 4
4054 if ( shadow_mode_external(d) || shadow_mode_translate(d) )
4056 if ( v->arch.paging.shadow.guest_vtable )
4057 sh_unmap_domain_page_global(v->arch.paging.shadow.guest_vtable);
4058 v->arch.paging.shadow.guest_vtable = sh_map_domain_page_global(gmfn);
4059 /* PAGING_LEVELS==4 implies 64-bit, which means that
4060 * map_domain_page_global can't fail */
4061 BUG_ON(v->arch.paging.shadow.guest_vtable == NULL);
4063 else
4064 v->arch.paging.shadow.guest_vtable = __linear_l4_table;
4065 #elif GUEST_PAGING_LEVELS == 3
4066 /* On PAE guests we don't use a mapping of the guest's own top-level
4067 * table. We cache the current state of that table and shadow that,
4068 * until the next CR3 write makes us refresh our cache. */
4069 ASSERT(v->arch.paging.shadow.guest_vtable == NULL);
4071 if ( shadow_mode_external(d) )
4072 /* Find where in the page the l3 table is */
4073 guest_idx = guest_index((void *)v->arch.hvm_vcpu.guest_cr[3]);
4074 else
4075 /* PV guest: l3 is at the start of a page */
4076 guest_idx = 0;
4078 // Ignore the low 2 bits of guest_idx -- they are really just
4079 // cache control.
4080 guest_idx &= ~3;
4082 gl3e = ((guest_l3e_t *)sh_map_domain_page(gmfn)) + guest_idx;
4083 for ( i = 0; i < 4 ; i++ )
4084 v->arch.paging.shadow.gl3e[i] = gl3e[i];
4085 sh_unmap_domain_page(gl3e);
4086 #elif GUEST_PAGING_LEVELS == 2
4087 if ( shadow_mode_external(d) || shadow_mode_translate(d) )
4089 if ( v->arch.paging.shadow.guest_vtable )
4090 sh_unmap_domain_page_global(v->arch.paging.shadow.guest_vtable);
4091 v->arch.paging.shadow.guest_vtable = sh_map_domain_page_global(gmfn);
4092 /* Does this really need map_domain_page_global? Handle the
4093 * error properly if so. */
4094 BUG_ON(v->arch.paging.shadow.guest_vtable == NULL); /* XXX */
4096 else
4097 v->arch.paging.shadow.guest_vtable = __linear_l2_table;
4098 #else
4099 #error this should never happen
4100 #endif
4103 ////
4104 //// vcpu->arch.shadow_table[]
4105 ////
4107 /* We revoke write access to the new guest toplevel page(s) before we
4108 * replace the old shadow pagetable(s), so that we can safely use the
4109 * (old) shadow linear maps in the writeable mapping heuristics. */
4110 #if GUEST_PAGING_LEVELS == 2
4111 if ( sh_remove_write_access(v, gmfn, 2, 0) != 0 )
4112 flush_tlb_mask(v->domain->domain_dirty_cpumask);
4113 sh_set_toplevel_shadow(v, 0, gmfn, SH_type_l2_shadow);
4114 #elif GUEST_PAGING_LEVELS == 3
4115 /* PAE guests have four shadow_table entries, based on the
4116 * current values of the guest's four l3es. */
4118 int flush = 0;
4119 gfn_t gl2gfn;
4120 mfn_t gl2mfn;
4121 p2m_type_t p2mt;
4122 guest_l3e_t *gl3e = (guest_l3e_t*)&v->arch.paging.shadow.gl3e;
4123 /* First, make all four entries read-only. */
4124 for ( i = 0; i < 4; i++ )
4126 if ( guest_l3e_get_flags(gl3e[i]) & _PAGE_PRESENT )
4128 gl2gfn = guest_l3e_get_gfn(gl3e[i]);
4129 gl2mfn = gfn_to_mfn(d, gl2gfn, &p2mt);
4130 if ( p2m_is_ram(p2mt) )
4131 flush |= sh_remove_write_access(v, gl2mfn, 2, 0);
4134 if ( flush )
4135 flush_tlb_mask(v->domain->domain_dirty_cpumask);
4136 /* Now install the new shadows. */
4137 for ( i = 0; i < 4; i++ )
4139 if ( guest_l3e_get_flags(gl3e[i]) & _PAGE_PRESENT )
4141 gl2gfn = guest_l3e_get_gfn(gl3e[i]);
4142 gl2mfn = gfn_to_mfn(d, gl2gfn, &p2mt);
4143 if ( p2m_is_ram(p2mt) )
4144 sh_set_toplevel_shadow(v, i, gl2mfn, (i == 3)
4145 ? SH_type_l2h_shadow
4146 : SH_type_l2_shadow);
4147 else
4148 sh_set_toplevel_shadow(v, i, _mfn(INVALID_MFN), 0);
4150 else
4151 sh_set_toplevel_shadow(v, i, _mfn(INVALID_MFN), 0);
4154 #elif GUEST_PAGING_LEVELS == 4
4155 if ( sh_remove_write_access(v, gmfn, 4, 0) != 0 )
4156 flush_tlb_mask(v->domain->domain_dirty_cpumask);
4157 sh_set_toplevel_shadow(v, 0, gmfn, SH_type_l4_shadow);
4158 #else
4159 #error This should never happen
4160 #endif
4163 ///
4164 /// v->arch.paging.shadow.l3table
4165 ///
4166 #if SHADOW_PAGING_LEVELS == 3
4168 mfn_t smfn;
4169 int i;
4170 for ( i = 0; i < 4; i++ )
4172 #if GUEST_PAGING_LEVELS == 2
4173 /* 2-on-3: make a PAE l3 that points at the four-page l2 */
4174 smfn = _mfn(pagetable_get_pfn(v->arch.shadow_table[0]) + i);
4175 #else
4176 /* 3-on-3: make a PAE l3 that points at the four l2 pages */
4177 smfn = pagetable_get_mfn(v->arch.shadow_table[i]);
4178 #endif
4179 v->arch.paging.shadow.l3table[i] =
4180 (mfn_x(smfn) == 0)
4181 ? shadow_l3e_empty()
4182 : shadow_l3e_from_mfn(smfn, _PAGE_PRESENT);
4185 #endif /* SHADOW_PAGING_LEVELS == 3 */
4188 ///
4189 /// v->arch.cr3
4190 ///
4191 if ( shadow_mode_external(d) )
4193 make_cr3(v, pagetable_get_pfn(v->arch.monitor_table));
4195 else // not shadow_mode_external...
4197 /* We don't support PV except guest == shadow == config levels */
4198 BUG_ON(GUEST_PAGING_LEVELS != SHADOW_PAGING_LEVELS);
4199 #if SHADOW_PAGING_LEVELS == 3
4200 /* 2-on-3 or 3-on-3: Use the PAE shadow l3 table we just fabricated.
4201 * Don't use make_cr3 because (a) we know it's below 4GB, and
4202 * (b) it's not necessarily page-aligned, and make_cr3 takes a pfn */
4203 ASSERT(virt_to_maddr(&v->arch.paging.shadow.l3table) <= 0xffffffe0ULL);
4204 v->arch.cr3 = virt_to_maddr(&v->arch.paging.shadow.l3table);
4205 #else
4206 /* 4-on-4: Just use the shadow top-level directly */
4207 make_cr3(v, pagetable_get_pfn(v->arch.shadow_table[0]));
4208 #endif
4212 ///
4213 /// v->arch.hvm_vcpu.hw_cr[3]
4214 ///
4215 if ( shadow_mode_external(d) )
4217 ASSERT(is_hvm_domain(d));
4218 #if SHADOW_PAGING_LEVELS == 3
4219 /* 2-on-3 or 3-on-3: Use the PAE shadow l3 table we just fabricated */
4220 v->arch.hvm_vcpu.hw_cr[3] =
4221 virt_to_maddr(&v->arch.paging.shadow.l3table);
4222 #else
4223 /* 4-on-4: Just use the shadow top-level directly */
4224 v->arch.hvm_vcpu.hw_cr[3] =
4225 pagetable_get_paddr(v->arch.shadow_table[0]);
4226 #endif
4227 hvm_update_guest_cr(v, 3);
4230 /* Fix up the linear pagetable mappings */
4231 sh_update_linear_entries(v);
4233 #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
4234 /* No longer safe to use cached gva->gfn translations */
4235 vtlb_flush(v);
4236 #endif
4238 #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
4239 v->arch.paging.last_write_emul_ok = 0;
4240 #endif
4242 /* Release the lock, if we took it (otherwise it's the caller's problem) */
4243 if ( do_locking ) shadow_unlock(v->domain);
4245 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
4246 /* Need to resync all the shadow entries on a TLB flush. We only
4247 * update the shadows, leaving the pages out of sync. Also, we try
4248 * to skip synchronization of shadows not mapped in the new
4249 * tables. */
4250 shadow_sync_other_vcpus(v, do_locking);
4251 #endif
4256 /**************************************************************************/
4257 /* Functions to revoke guest rights */
4259 #if SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC
4260 int sh_rm_write_access_from_sl1p(struct vcpu *v, mfn_t gmfn,
4261 mfn_t smfn, unsigned long off)
4263 int r;
4264 shadow_l1e_t *sl1p, sl1e;
4265 struct shadow_page_info *sp;
4267 ASSERT(mfn_valid(gmfn));
4268 ASSERT(mfn_valid(smfn));
4270 sp = mfn_to_shadow_page(smfn);
4272 if ( sp->mbz != 0
4273 || (sp->type != SH_type_l1_shadow
4274 && sp->type != SH_type_fl1_shadow) )
4275 goto fail;
4277 sl1p = sh_map_domain_page(smfn);
4278 sl1p += off;
4279 sl1e = *sl1p;
4280 if ( ((shadow_l1e_get_flags(sl1e) & (_PAGE_PRESENT|_PAGE_RW))
4281 != (_PAGE_PRESENT|_PAGE_RW))
4282 || (mfn_x(shadow_l1e_get_mfn(sl1e)) != mfn_x(gmfn)) )
4284 sh_unmap_domain_page(sl1p);
4285 goto fail;
4288 /* Found it! Need to remove its write permissions. */
4289 sl1e = shadow_l1e_remove_flags(sl1e, _PAGE_RW);
4290 r = shadow_set_l1e(v, sl1p, sl1e, smfn);
4291 ASSERT( !(r & SHADOW_SET_ERROR) );
4293 sh_unmap_domain_page(sl1p);
4294 perfc_incr(shadow_writeable_h_7);
4295 return 1;
4297 fail:
4298 perfc_incr(shadow_writeable_h_8);
4299 return 0;
4301 #endif /* OOS */
4303 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
4304 static int sh_guess_wrmap(struct vcpu *v, unsigned long vaddr, mfn_t gmfn)
4305 /* Look up this vaddr in the current shadow and see if it's a writeable
4306 * mapping of this gmfn. If so, remove it. Returns 1 if it worked. */
4308 shadow_l1e_t sl1e, *sl1p;
4309 shadow_l2e_t *sl2p;
4310 shadow_l3e_t *sl3p;
4311 #if SHADOW_PAGING_LEVELS >= 4
4312 shadow_l4e_t *sl4p;
4313 #endif
4314 mfn_t sl1mfn;
4315 int r;
4317 /* Carefully look in the shadow linear map for the l1e we expect */
4318 #if SHADOW_PAGING_LEVELS >= 4
4319 sl4p = sh_linear_l4_table(v) + shadow_l4_linear_offset(vaddr);
4320 if ( !(shadow_l4e_get_flags(*sl4p) & _PAGE_PRESENT) )
4321 return 0;
4322 sl3p = sh_linear_l3_table(v) + shadow_l3_linear_offset(vaddr);
4323 if ( !(shadow_l3e_get_flags(*sl3p) & _PAGE_PRESENT) )
4324 return 0;
4325 #else /* SHADOW_PAGING_LEVELS == 3 */
4326 sl3p = ((shadow_l3e_t *) v->arch.paging.shadow.l3table)
4327 + shadow_l3_linear_offset(vaddr);
4328 if ( !(shadow_l3e_get_flags(*sl3p) & _PAGE_PRESENT) )
4329 return 0;
4330 #endif
4331 sl2p = sh_linear_l2_table(v) + shadow_l2_linear_offset(vaddr);
4332 if ( !(shadow_l2e_get_flags(*sl2p) & _PAGE_PRESENT) )
4333 return 0;
4334 sl1p = sh_linear_l1_table(v) + shadow_l1_linear_offset(vaddr);
4335 sl1e = *sl1p;
4336 if ( ((shadow_l1e_get_flags(sl1e) & (_PAGE_PRESENT|_PAGE_RW))
4337 != (_PAGE_PRESENT|_PAGE_RW))
4338 || (mfn_x(shadow_l1e_get_mfn(sl1e)) != mfn_x(gmfn)) )
4339 return 0;
4341 /* Found it! Need to remove its write permissions. */
4342 sl1mfn = shadow_l2e_get_mfn(*sl2p);
4343 sl1e = shadow_l1e_remove_flags(sl1e, _PAGE_RW);
4344 r = shadow_set_l1e(v, sl1p, sl1e, sl1mfn);
4345 ASSERT( !(r & SHADOW_SET_ERROR) );
4346 TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_WRMAP_GUESS_FOUND);
4347 return 1;
4349 #endif
4351 int sh_rm_write_access_from_l1(struct vcpu *v, mfn_t sl1mfn,
4352 mfn_t readonly_mfn)
4353 /* Excises all writeable mappings to readonly_mfn from this l1 shadow table */
4355 shadow_l1e_t *sl1e;
4356 int done = 0;
4357 int flags;
4358 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
4359 mfn_t base_sl1mfn = sl1mfn; /* Because sl1mfn changes in the foreach */
4360 #endif
4362 SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, done,
4364 flags = shadow_l1e_get_flags(*sl1e);
4365 if ( (flags & _PAGE_PRESENT)
4366 && (flags & _PAGE_RW)
4367 && (mfn_x(shadow_l1e_get_mfn(*sl1e)) == mfn_x(readonly_mfn)) )
4369 shadow_l1e_t ro_sl1e = shadow_l1e_remove_flags(*sl1e, _PAGE_RW);
4370 (void) shadow_set_l1e(v, sl1e, ro_sl1e, sl1mfn);
4371 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
4372 /* Remember the last shadow that we shot a writeable mapping in */
4373 v->arch.paging.shadow.last_writeable_pte_smfn = mfn_x(base_sl1mfn);
4374 #endif
4375 if ( (mfn_to_page(readonly_mfn)->u.inuse.type_info
4376 & PGT_count_mask) == 0 )
4377 /* This breaks us cleanly out of the FOREACH macro */
4378 done = 1;
4380 });
4381 return done;
4385 int sh_rm_mappings_from_l1(struct vcpu *v, mfn_t sl1mfn, mfn_t target_mfn)
4386 /* Excises all mappings to guest frame from this shadow l1 table */
4388 shadow_l1e_t *sl1e;
4389 int done = 0;
4390 int flags;
4392 SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, done,
4394 flags = shadow_l1e_get_flags(*sl1e);
4395 if ( (flags & _PAGE_PRESENT)
4396 && (mfn_x(shadow_l1e_get_mfn(*sl1e)) == mfn_x(target_mfn)) )
4398 (void) shadow_set_l1e(v, sl1e, shadow_l1e_empty(), sl1mfn);
4399 if ( (mfn_to_page(target_mfn)->count_info & PGC_count_mask) == 0 )
4400 /* This breaks us cleanly out of the FOREACH macro */
4401 done = 1;
4403 });
4404 return done;
4407 /**************************************************************************/
4408 /* Functions to excise all pointers to shadows from higher-level shadows. */
4410 void sh_clear_shadow_entry(struct vcpu *v, void *ep, mfn_t smfn)
4411 /* Blank out a single shadow entry */
4413 switch ( mfn_to_shadow_page(smfn)->type )
4415 case SH_type_l1_shadow:
4416 (void) shadow_set_l1e(v, ep, shadow_l1e_empty(), smfn); break;
4417 case SH_type_l2_shadow:
4418 #if GUEST_PAGING_LEVELS >= 3
4419 case SH_type_l2h_shadow:
4420 #endif
4421 (void) shadow_set_l2e(v, ep, shadow_l2e_empty(), smfn); break;
4422 #if GUEST_PAGING_LEVELS >= 4
4423 case SH_type_l3_shadow:
4424 (void) shadow_set_l3e(v, ep, shadow_l3e_empty(), smfn); break;
4425 case SH_type_l4_shadow:
4426 (void) shadow_set_l4e(v, ep, shadow_l4e_empty(), smfn); break;
4427 #endif
4428 default: BUG(); /* Called with the wrong kind of shadow. */
4432 int sh_remove_l1_shadow(struct vcpu *v, mfn_t sl2mfn, mfn_t sl1mfn)
4433 /* Remove all mappings of this l1 shadow from this l2 shadow */
4435 shadow_l2e_t *sl2e;
4436 int done = 0;
4437 int flags;
4439 SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, done, v->domain,
4441 flags = shadow_l2e_get_flags(*sl2e);
4442 if ( (flags & _PAGE_PRESENT)
4443 && (mfn_x(shadow_l2e_get_mfn(*sl2e)) == mfn_x(sl1mfn)) )
4445 (void) shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn);
4446 if ( mfn_to_shadow_page(sl1mfn)->type == 0 )
4447 /* This breaks us cleanly out of the FOREACH macro */
4448 done = 1;
4450 });
4451 return done;
4454 #if GUEST_PAGING_LEVELS >= 4
4455 int sh_remove_l2_shadow(struct vcpu *v, mfn_t sl3mfn, mfn_t sl2mfn)
4456 /* Remove all mappings of this l2 shadow from this l3 shadow */
4458 shadow_l3e_t *sl3e;
4459 int done = 0;
4460 int flags;
4462 SHADOW_FOREACH_L3E(sl3mfn, sl3e, 0, done,
4464 flags = shadow_l3e_get_flags(*sl3e);
4465 if ( (flags & _PAGE_PRESENT)
4466 && (mfn_x(shadow_l3e_get_mfn(*sl3e)) == mfn_x(sl2mfn)) )
4468 (void) shadow_set_l3e(v, sl3e, shadow_l3e_empty(), sl3mfn);
4469 if ( mfn_to_shadow_page(sl2mfn)->type == 0 )
4470 /* This breaks us cleanly out of the FOREACH macro */
4471 done = 1;
4473 });
4474 return done;
4477 int sh_remove_l3_shadow(struct vcpu *v, mfn_t sl4mfn, mfn_t sl3mfn)
4478 /* Remove all mappings of this l3 shadow from this l4 shadow */
4480 shadow_l4e_t *sl4e;
4481 int done = 0;
4482 int flags;
4484 SHADOW_FOREACH_L4E(sl4mfn, sl4e, 0, done, v->domain,
4486 flags = shadow_l4e_get_flags(*sl4e);
4487 if ( (flags & _PAGE_PRESENT)
4488 && (mfn_x(shadow_l4e_get_mfn(*sl4e)) == mfn_x(sl3mfn)) )
4490 (void) shadow_set_l4e(v, sl4e, shadow_l4e_empty(), sl4mfn);
4491 if ( mfn_to_shadow_page(sl3mfn)->type == 0 )
4492 /* This breaks us cleanly out of the FOREACH macro */
4493 done = 1;
4495 });
4496 return done;
4498 #endif /* 64bit guest */
4500 /**************************************************************************/
4501 /* Handling HVM guest writes to pagetables */
4503 /* Translate a VA to an MFN, injecting a page-fault if we fail */
4504 #define BAD_GVA_TO_GFN (~0UL)
4505 #define BAD_GFN_TO_MFN (~1UL)
4506 #define READONLY_GFN (~2UL)
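/* These sentinels are far above any real frame number, so mfn_valid()
 * fails on them; emulate_gva_to_mfn() returns them wrapped in _mfn(), and
 * emulate_map_dest() below distinguishes the failure cases by comparing
 * mfn_x() of the result against them. */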
4507 static mfn_t emulate_gva_to_mfn(struct vcpu *v,
4508 unsigned long vaddr,
4509 struct sh_emulate_ctxt *sh_ctxt)
4511 unsigned long gfn;
4512 mfn_t mfn;
4513 p2m_type_t p2mt;
4514 uint32_t pfec = PFEC_page_present | PFEC_write_access;
4516 /* Translate the VA to a GFN */
4517 gfn = sh_gva_to_gfn(v, vaddr, &pfec);
4518 if ( gfn == INVALID_GFN )
4520 if ( is_hvm_vcpu(v) )
4521 hvm_inject_exception(TRAP_page_fault, pfec, vaddr);
4522 else
4523 propagate_page_fault(vaddr, pfec);
4524 return _mfn(BAD_GVA_TO_GFN);
4527 /* Translate the GFN to an MFN */
4528 mfn = gfn_to_mfn(v->domain, _gfn(gfn), &p2mt);
4529 if ( p2mt == p2m_ram_ro )
4530 return _mfn(READONLY_GFN);
4531 if ( !p2m_is_ram(p2mt) )
4532 return _mfn(BAD_GFN_TO_MFN);
4534 ASSERT(mfn_valid(mfn));
4535 v->arch.paging.last_write_was_pt = !!sh_mfn_is_a_page_table(mfn);
4536 return mfn;
4539 /* Check that the user is allowed to perform this write.
4540 * Returns a mapped pointer to write to, or NULL for error. */
4541 #define MAPPING_UNHANDLEABLE ((void *)(unsigned long)X86EMUL_UNHANDLEABLE)
4542 #define MAPPING_EXCEPTION ((void *)(unsigned long)X86EMUL_EXCEPTION)
4543 #define MAPPING_SILENT_FAIL ((void *)(unsigned long)X86EMUL_OKAY)
4544 #define emulate_map_dest_failed(rc) ((unsigned long)(rc) <= 3)
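/* The MAPPING_* values encode X86EMUL_* return codes as small pointer
 * values, which is exactly what emulate_map_dest_failed() tests for; a
 * caller propagates a failure by casting the pointer straight back to a
 * return code, as the emulation entry points below do:
 *
 *     addr = emulate_map_dest(v, vaddr, bytes, sh_ctxt);
 *     if ( emulate_map_dest_failed(addr) )
 *         return (long)addr;
 */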
4545 static void *emulate_map_dest(struct vcpu *v,
4546 unsigned long vaddr,
4547 u32 bytes,
4548 struct sh_emulate_ctxt *sh_ctxt)
4550 unsigned long offset;
4551 void *map = NULL;
4553 sh_ctxt->mfn1 = emulate_gva_to_mfn(v, vaddr, sh_ctxt);
4554 if ( !mfn_valid(sh_ctxt->mfn1) )
4555 return ((mfn_x(sh_ctxt->mfn1) == BAD_GVA_TO_GFN) ?
4556 MAPPING_EXCEPTION :
4557 (mfn_x(sh_ctxt->mfn1) == READONLY_GFN) ?
4558 MAPPING_SILENT_FAIL : MAPPING_UNHANDLEABLE);
4560 #ifndef NDEBUG
4561 /* We don't emulate user-mode writes to page tables */
4562 if ( hvm_get_seg_reg(x86_seg_ss, sh_ctxt)->attr.fields.dpl == 3 )
4564 gdprintk(XENLOG_DEBUG, "User-mode write to pagetable reached "
4565 "emulate_map_dest(). This should never happen!\n");
4566 return MAPPING_UNHANDLEABLE;
4568 #endif
4570 /* Unaligned writes mean probably this isn't a pagetable */
4571 if ( vaddr & (bytes - 1) )
4572 sh_remove_shadows(v, sh_ctxt->mfn1, 0, 0 /* Slow, can fail */ );
4574 if ( likely(((vaddr + bytes - 1) & PAGE_MASK) == (vaddr & PAGE_MASK)) )
4576 /* Whole write fits on a single page */
4577 sh_ctxt->mfn2 = _mfn(INVALID_MFN);
4578 map = sh_map_domain_page(sh_ctxt->mfn1) + (vaddr & ~PAGE_MASK);
4580 else
4582 /* Cross-page emulated writes are only supported for HVM guests;
4583 * PV guests ought to know better */
4584 if ( !is_hvm_vcpu(v) )
4585 return MAPPING_UNHANDLEABLE;
4587 /* This write crosses a page boundary. Translate the second page */
4588 sh_ctxt->mfn2 = emulate_gva_to_mfn(v, (vaddr + bytes - 1) & PAGE_MASK,
4589 sh_ctxt);
4590 if ( !mfn_valid(sh_ctxt->mfn2) )
4591 return ((mfn_x(sh_ctxt->mfn2) == BAD_GVA_TO_GFN) ?
4592 MAPPING_EXCEPTION :
4593 (mfn_x(sh_ctxt->mfn2) == READONLY_GFN) ?
4594 MAPPING_SILENT_FAIL : MAPPING_UNHANDLEABLE);
4596 /* Cross-page writes mean probably not a pagetable */
4597 sh_remove_shadows(v, sh_ctxt->mfn2, 0, 0 /* Slow, can fail */ );
4599 /* Hack: we map the pages into the vcpu's LDT space, since we
4600 * know that we're not going to need the LDT for HVM guests,
4601 * and only HVM guests are allowed unaligned writes. */
4602 ASSERT(is_hvm_vcpu(v));
4603 map = (void *)LDT_VIRT_START(v);
4604 offset = l1_linear_offset((unsigned long) map);
4605 l1e_write(&__linear_l1_table[offset],
4606 l1e_from_pfn(mfn_x(sh_ctxt->mfn1), __PAGE_HYPERVISOR));
4607 l1e_write(&__linear_l1_table[offset + 1],
4608 l1e_from_pfn(mfn_x(sh_ctxt->mfn2), __PAGE_HYPERVISOR));
4609 flush_tlb_local();
4610 map += (vaddr & ~PAGE_MASK);
4613 #if (SHADOW_OPTIMIZATIONS & SHOPT_SKIP_VERIFY)
4614 /* Remember if the bottom bit was clear, so we can choose not to run
4615 * the change through the verify code if it's still clear afterwards */
4616 sh_ctxt->low_bit_was_clear = map != NULL && !(*(u8 *)map & _PAGE_PRESENT);
4617 #endif
4619 return map;
4622 /* Tidy up after the emulated write: mark pages dirty, verify the new
4623 * contents, and undo the mapping */
4624 static void emulate_unmap_dest(struct vcpu *v,
4625 void *addr,
4626 u32 bytes,
4627 struct sh_emulate_ctxt *sh_ctxt)
4629 u32 b1 = bytes, b2 = 0, shflags;
4631 ASSERT(mfn_valid(sh_ctxt->mfn1));
4633 /* If we are writing lots of PTE-aligned zeros, might want to unshadow */
4634 if ( likely(bytes >= 4)
4635 && (*(u32 *)addr == 0)
4636 && ((unsigned long) addr & ((sizeof (guest_intpte_t)) - 1)) == 0 )
4637 check_for_early_unshadow(v, sh_ctxt->mfn1);
4638 else
4639 reset_early_unshadow(v);
4641 /* We can avoid re-verifying the page contents after the write if:
4642 * - it was no larger than the PTE type of this pagetable;
4643 * - it was aligned to the PTE boundaries; and
4644 * - _PAGE_PRESENT was clear before and after the write. */
4645 shflags = mfn_to_page(sh_ctxt->mfn1)->shadow_flags;
4646 #if (SHADOW_OPTIMIZATIONS & SHOPT_SKIP_VERIFY)
4647 if ( sh_ctxt->low_bit_was_clear
4648 && !(*(u8 *)addr & _PAGE_PRESENT)
4649 && ((!(shflags & SHF_32)
4650 /* Not shadowed 32-bit: aligned 64-bit writes that leave
4651 * the present bit unset are safe to ignore. */
4652 && ((unsigned long)addr & 7) == 0
4653 && bytes <= 8)
4654 ||
4655 (!(shflags & (SHF_PAE|SHF_64))
4656 /* Not shadowed PAE/64-bit: aligned 32-bit writes that
4657 * leave the present bit unset are safe to ignore. */
4658 && ((unsigned long)addr & 3) == 0
4659 && bytes <= 4)) )
4661 /* Writes with this alignment constraint can't possibly cross pages */
4662 ASSERT(!mfn_valid(sh_ctxt->mfn2));
4664 else
4665 #endif /* SHADOW_OPTIMIZATIONS & SHOPT_SKIP_VERIFY */
4667 if ( unlikely(mfn_valid(sh_ctxt->mfn2)) )
4669 /* Validate as two writes, one to each page */
4670 b1 = PAGE_SIZE - (((unsigned long)addr) & ~PAGE_MASK);
4671 b2 = bytes - b1;
4672 ASSERT(b2 < bytes);
4674 if ( likely(b1 > 0) )
4675 sh_validate_guest_pt_write(v, sh_ctxt->mfn1, addr, b1);
4676 if ( unlikely(b2 > 0) )
4677 sh_validate_guest_pt_write(v, sh_ctxt->mfn2, addr + b1, b2);
4680 paging_mark_dirty(v->domain, mfn_x(sh_ctxt->mfn1));
4682 if ( unlikely(mfn_valid(sh_ctxt->mfn2)) )
4684 unsigned long offset;
4685 paging_mark_dirty(v->domain, mfn_x(sh_ctxt->mfn2));
4686 /* Undo the hacky two-frame contiguous map. */
4687 ASSERT(((unsigned long) addr & PAGE_MASK) == LDT_VIRT_START(v));
4688 offset = l1_linear_offset((unsigned long) addr);
4689 l1e_write(&__linear_l1_table[offset], l1e_empty());
4690 l1e_write(&__linear_l1_table[offset + 1], l1e_empty());
4691 flush_tlb_all();
4693 else
4694 sh_unmap_domain_page(addr);
4696 atomic_inc(&v->domain->arch.paging.shadow.gtable_dirty_version);
4699 static int
4700 sh_x86_emulate_write(struct vcpu *v, unsigned long vaddr, void *src,
4701 u32 bytes, struct sh_emulate_ctxt *sh_ctxt)
4703 void *addr;
4705 /* Unaligned writes are only acceptable on HVM */
4706 if ( (vaddr & (bytes - 1)) && !is_hvm_vcpu(v) )
4707 return X86EMUL_UNHANDLEABLE;
4709 addr = emulate_map_dest(v, vaddr, bytes, sh_ctxt);
4710 if ( emulate_map_dest_failed(addr) )
4711 return (long)addr;
4713 shadow_lock(v->domain);
4714 memcpy(addr, src, bytes);
4716 if ( tb_init_done )
4718 #if GUEST_PAGING_LEVELS == 3
4719 if ( vaddr == this_cpu(trace_emulate_initial_va) )
4720 memcpy(&this_cpu(trace_emulate_write_val), src, bytes);
4721 else if ( (vaddr & ~(0x7UL)) == this_cpu(trace_emulate_initial_va) )
4723 TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_EMULATE_FULL_PT);
4724 memcpy(&this_cpu(trace_emulate_write_val),
4725 (void *)(((unsigned long) addr) & ~(0x7UL)), GUEST_PTE_SIZE);
4727 #else
4728 memcpy(&this_cpu(trace_emulate_write_val), src, bytes);
4729 #endif
4732 emulate_unmap_dest(v, addr, bytes, sh_ctxt);
4733 shadow_audit_tables(v);
4734 shadow_unlock(v->domain);
4735 return X86EMUL_OKAY;
4738 static int
4739 sh_x86_emulate_cmpxchg(struct vcpu *v, unsigned long vaddr,
4740 unsigned long old, unsigned long new,
4741 unsigned int bytes, struct sh_emulate_ctxt *sh_ctxt)
4743 void *addr;
4744 unsigned long prev;
4745 int rv = X86EMUL_OKAY;
4747 /* Unaligned writes are only acceptable on HVM */
4748 if ( (vaddr & (bytes - 1)) && !is_hvm_vcpu(v) )
4749 return X86EMUL_UNHANDLEABLE;
4751 addr = emulate_map_dest(v, vaddr, bytes, sh_ctxt);
4752 if ( emulate_map_dest_failed(addr) )
4753 return (long)addr;
4755 shadow_lock(v->domain);
4756 switch ( bytes )
4758 case 1: prev = cmpxchg(((u8 *)addr), old, new); break;
4759 case 2: prev = cmpxchg(((u16 *)addr), old, new); break;
4760 case 4: prev = cmpxchg(((u32 *)addr), old, new); break;
4761 case 8: prev = cmpxchg(((u64 *)addr), old, new); break;
4762 default:
4763 SHADOW_PRINTK("cmpxchg of size %i is not supported\n", bytes);
4764 prev = ~old;
4767 if ( prev != old )
4768 rv = X86EMUL_CMPXCHG_FAILED;
4770 SHADOW_DEBUG(EMULATE, "va %#lx was %#lx expected %#lx"
4771 " wanted %#lx now %#lx bytes %u\n",
4772 vaddr, prev, old, new, *(unsigned long *)addr, bytes);
4774 emulate_unmap_dest(v, addr, bytes, sh_ctxt);
4775 shadow_audit_tables(v);
4776 shadow_unlock(v->domain);
4777 return rv;
4780 #ifdef __i386__
4781 static int
4782 sh_x86_emulate_cmpxchg8b(struct vcpu *v, unsigned long vaddr,
4783 unsigned long old_lo, unsigned long old_hi,
4784 unsigned long new_lo, unsigned long new_hi,
4785 struct sh_emulate_ctxt *sh_ctxt)
4787 void *addr;
4788 u64 old, new, prev;
4789 int rv = X86EMUL_OKAY;
4791 /* Unaligned writes are only acceptable on HVM */
4792 if ( (vaddr & 7) && !is_hvm_vcpu(v) )
4793 return X86EMUL_UNHANDLEABLE;
4795 addr = emulate_map_dest(v, vaddr, 8, sh_ctxt);
4796 if ( emulate_map_dest_failed(addr) )
4797 return (long)addr;
4799 old = (((u64) old_hi) << 32) | (u64) old_lo;
4800 new = (((u64) new_hi) << 32) | (u64) new_lo;
4802 shadow_lock(v->domain);
4803 prev = cmpxchg(((u64 *)addr), old, new);
4805 if ( prev != old )
4806 rv = X86EMUL_CMPXCHG_FAILED;
4808 emulate_unmap_dest(v, addr, 8, sh_ctxt);
4809 shadow_audit_tables(v);
4810 shadow_unlock(v->domain);
4811 return rv;
4813 #endif
4815 /**************************************************************************/
4816 /* Audit tools */
4818 #if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES
4820 #define AUDIT_FAIL(_level, _fmt, _a...) do { \
4821 printk("Shadow %u-on-%u audit failed at level %i, index %i\n" \
4822 "gl" #_level "mfn = %" PRI_mfn \
4823 " sl" #_level "mfn = %" PRI_mfn \
4824 " &gl" #_level "e = %p &sl" #_level "e = %p" \
4825 " gl" #_level "e = %" SH_PRI_gpte \
4826 " sl" #_level "e = %" SH_PRI_pte "\nError: " _fmt "\n", \
4827 GUEST_PAGING_LEVELS, SHADOW_PAGING_LEVELS, \
4828 _level, guest_index(gl ## _level ## e), \
4829 mfn_x(gl ## _level ## mfn), mfn_x(sl ## _level ## mfn), \
4830 gl ## _level ## e, sl ## _level ## e, \
4831 gl ## _level ## e->l ## _level, sl ## _level ## e->l ## _level, \
4832 ##_a); \
4833 BUG(); \
4834 done = 1; \
4835 } while (0)
4837 #define AUDIT_FAIL_MIN(_level, _fmt, _a...) do { \
4838 printk("Shadow %u-on-%u audit failed at level %i\n" \
4839 "gl" #_level "mfn = %" PRI_mfn \
4840 " sl" #_level "mfn = %" PRI_mfn \
4841 " Error: " _fmt "\n", \
4842 GUEST_PAGING_LEVELS, SHADOW_PAGING_LEVELS, \
4843 _level, \
4844 mfn_x(gl ## _level ## mfn), mfn_x(sl ## _level ## mfn), \
4845 ##_a); \
4846 BUG(); \
4847 done = 1; \
4848 } while (0)
4850 static char * sh_audit_flags(struct vcpu *v, int level,
4851 int gflags, int sflags)
4852 /* Common code for auditing flag bits */
4854 if ( (sflags & _PAGE_PRESENT) && !(gflags & _PAGE_PRESENT) )
4855 return "shadow is present but guest is not present";
4856 if ( (sflags & _PAGE_GLOBAL) && !is_hvm_vcpu(v) )
4857 return "global bit set in PV shadow";
4858 if ( level == 2 && (sflags & _PAGE_PSE) )
4859 return "PS bit set in shadow";
4860 #if SHADOW_PAGING_LEVELS == 3
4861 if ( level == 3 ) return NULL; /* All the other bits are blank in PAE l3es */
4862 #endif
4863 if ( (sflags & _PAGE_PRESENT) && !(gflags & _PAGE_ACCESSED) )
4864 return "accessed bit not propagated";
4865 if ( (level == 1 || (level == 2 && (gflags & _PAGE_PSE)))
4866 && ((sflags & _PAGE_RW) && !(gflags & _PAGE_DIRTY)) )
4867 return "dirty bit not propagated";
4868 if ( (sflags & _PAGE_USER) != (gflags & _PAGE_USER) )
4869 return "user/supervisor bit does not match";
4870 if ( (sflags & _PAGE_NX_BIT) != (gflags & _PAGE_NX_BIT) )
4871 return "NX bit does not match";
4872 if ( (sflags & _PAGE_RW) && !(gflags & _PAGE_RW) )
4873 return "shadow grants write access but guest does not";
4874 return NULL;
4877 int sh_audit_l1_table(struct vcpu *v, mfn_t sl1mfn, mfn_t x)
4879 guest_l1e_t *gl1e, *gp;
4880 shadow_l1e_t *sl1e;
4881 mfn_t mfn, gmfn, gl1mfn;
4882 gfn_t gfn;
4883 p2m_type_t p2mt;
4884 char *s;
4885 int done = 0;
4887 /* Follow the backpointer */
4888 gl1mfn = _mfn(mfn_to_shadow_page(sl1mfn)->backpointer);
4890 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
4891 /* Out-of-sync l1 shadows can contain anything: just check the OOS hash */
4892 if ( page_is_out_of_sync(mfn_to_page(gl1mfn)) )
4894 oos_audit_hash_is_present(v->domain, gl1mfn);
4895 return 0;
4897 #endif
4899 gl1e = gp = sh_map_domain_page(gl1mfn);
4900 SHADOW_FOREACH_L1E(sl1mfn, sl1e, &gl1e, done, {
4902 if ( sh_l1e_is_magic(*sl1e) )
4904 #if (SHADOW_OPTIMIZATIONS & SHOPT_FAST_FAULT_PATH)
4905 if ( sh_l1e_is_gnp(*sl1e) )
4907 if ( guest_l1e_get_flags(*gl1e) & _PAGE_PRESENT )
4908 AUDIT_FAIL(1, "shadow is GNP magic but guest is present");
4910 else
4912 ASSERT(sh_l1e_is_mmio(*sl1e));
4913 gfn = sh_l1e_mmio_get_gfn(*sl1e);
4914 if ( gfn_x(gfn) != gfn_x(guest_l1e_get_gfn(*gl1e)) )
4915 AUDIT_FAIL(1, "shadow MMIO gfn is %" SH_PRI_gfn
4916 " but guest gfn is %" SH_PRI_gfn,
4917 gfn_x(gfn),
4918 gfn_x(guest_l1e_get_gfn(*gl1e)));
4920 #endif
4922 else
4924 s = sh_audit_flags(v, 1, guest_l1e_get_flags(*gl1e),
4925 shadow_l1e_get_flags(*sl1e));
4926 if ( s ) AUDIT_FAIL(1, "%s", s);
4928 if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS )
4930 gfn = guest_l1e_get_gfn(*gl1e);
4931 mfn = shadow_l1e_get_mfn(*sl1e);
4932 gmfn = gfn_to_mfn(v->domain, gfn, &p2mt);
4933 if ( mfn_x(gmfn) != mfn_x(mfn) )
4934 AUDIT_FAIL(1, "bad translation: gfn %" SH_PRI_gfn
4935 " --> %" PRI_mfn " != mfn %" PRI_mfn,
4936 gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn));
4939 });
4940 sh_unmap_domain_page(gp);
4941 return done;
4944 int sh_audit_fl1_table(struct vcpu *v, mfn_t sl1mfn, mfn_t x)
4946 guest_l1e_t *gl1e, e;
4947 shadow_l1e_t *sl1e;
4948 mfn_t gl1mfn = _mfn(INVALID_MFN);
4949 int f;
4950 int done = 0;
4952 /* fl1 has no useful backpointer: all we can check are flags */
4953 e = guest_l1e_from_gfn(_gfn(0), 0); gl1e = &e; /* Needed for macro */
4954 SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, done, {
4955 f = shadow_l1e_get_flags(*sl1e);
4956 f &= ~(_PAGE_AVAIL0|_PAGE_AVAIL1|_PAGE_AVAIL2);
4957 if ( !(f == 0
4958 || f == (_PAGE_PRESENT|_PAGE_USER|_PAGE_RW|
4959 _PAGE_ACCESSED|_PAGE_DIRTY)
4960 || f == (_PAGE_PRESENT|_PAGE_USER|_PAGE_ACCESSED|_PAGE_DIRTY)
4961 || sh_l1e_is_magic(*sl1e)) )
4962 AUDIT_FAIL(1, "fl1e has bad flags");
4963 });
4964 return 0;
4967 int sh_audit_l2_table(struct vcpu *v, mfn_t sl2mfn, mfn_t x)
4969 guest_l2e_t *gl2e, *gp;
4970 shadow_l2e_t *sl2e;
4971 mfn_t mfn, gmfn, gl2mfn;
4972 gfn_t gfn;
4973 p2m_type_t p2mt;
4974 char *s;
4975 int done = 0;
4977 /* Follow the backpointer */
4978 gl2mfn = _mfn(mfn_to_shadow_page(sl2mfn)->backpointer);
4980 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
4981 /* Only L1's may be out of sync. */
4982 if ( page_is_out_of_sync(mfn_to_page(gl2mfn)) )
4983 AUDIT_FAIL_MIN(2, "gmfn %lx is out of sync", mfn_x(gl2mfn));
4984 #endif
4986 gl2e = gp = sh_map_domain_page(gl2mfn);
4987 SHADOW_FOREACH_L2E(sl2mfn, sl2e, &gl2e, done, v->domain, {
4989 s = sh_audit_flags(v, 2, guest_l2e_get_flags(*gl2e),
4990 shadow_l2e_get_flags(*sl2e));
4991 if ( s ) AUDIT_FAIL(2, "%s", s);
4993 if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS )
4995 gfn = guest_l2e_get_gfn(*gl2e);
4996 mfn = shadow_l2e_get_mfn(*sl2e);
4997 gmfn = (guest_l2e_get_flags(*gl2e) & _PAGE_PSE)
4998 ? get_fl1_shadow_status(v, gfn)
4999 : get_shadow_status(v, gfn_to_mfn(v->domain, gfn, &p2mt),
5000 SH_type_l1_shadow);
5001 if ( mfn_x(gmfn) != mfn_x(mfn) )
5002 AUDIT_FAIL(2, "bad translation: gfn %" SH_PRI_gfn
5003 " (--> %" PRI_mfn ")"
5004 " --> %" PRI_mfn " != mfn %" PRI_mfn,
5005 gfn_x(gfn),
5006 (guest_l2e_get_flags(*gl2e) & _PAGE_PSE) ? 0
5007 : mfn_x(gfn_to_mfn(v->domain, gfn, &p2mt)),
5008 mfn_x(gmfn), mfn_x(mfn));
5010 });
5011 sh_unmap_domain_page(gp);
5012 return 0;
5015 #if GUEST_PAGING_LEVELS >= 4
5016 int sh_audit_l3_table(struct vcpu *v, mfn_t sl3mfn, mfn_t x)
5018 guest_l3e_t *gl3e, *gp;
5019 shadow_l3e_t *sl3e;
5020 mfn_t mfn, gmfn, gl3mfn;
5021 gfn_t gfn;
5022 p2m_type_t p2mt;
5023 char *s;
5024 int done = 0;
5026 /* Follow the backpointer */
5027 gl3mfn = _mfn(mfn_to_shadow_page(sl3mfn)->backpointer);
5029 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
5030 /* Only L1's may be out of sync. */
5031 if ( page_is_out_of_sync(mfn_to_page(gl3mfn)) )
5032 AUDIT_FAIL_MIN(3, "gmfn %lx is out of sync", mfn_x(gl3mfn));
5033 #endif
5035 gl3e = gp = sh_map_domain_page(gl3mfn);
5036 SHADOW_FOREACH_L3E(sl3mfn, sl3e, &gl3e, done, {
5038 s = sh_audit_flags(v, 3, guest_l3e_get_flags(*gl3e),
5039 shadow_l3e_get_flags(*sl3e));
5040 if ( s ) AUDIT_FAIL(3, "%s", s);
5042 if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS )
5044 gfn = guest_l3e_get_gfn(*gl3e);
5045 mfn = shadow_l3e_get_mfn(*sl3e);
5046 gmfn = get_shadow_status(v, gfn_to_mfn(v->domain, gfn, &p2mt),
5047 ((GUEST_PAGING_LEVELS == 3 ||
5048 is_pv_32on64_vcpu(v))
5049 && !shadow_mode_external(v->domain)
5050 && (guest_index(gl3e) % 4) == 3)
5051 ? SH_type_l2h_shadow
5052 : SH_type_l2_shadow);
5053 if ( mfn_x(gmfn) != mfn_x(mfn) )
5054 AUDIT_FAIL(3, "bad translation: gfn %" SH_PRI_gfn
5055 " --> %" PRI_mfn " != mfn %" PRI_mfn,
5056 gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn));
5058 });
5059 sh_unmap_domain_page(gp);
5060 return 0;
5063 int sh_audit_l4_table(struct vcpu *v, mfn_t sl4mfn, mfn_t x)
5065 guest_l4e_t *gl4e, *gp;
5066 shadow_l4e_t *sl4e;
5067 mfn_t mfn, gmfn, gl4mfn;
5068 gfn_t gfn;
5069 p2m_type_t p2mt;
5070 char *s;
5071 int done = 0;
5073 /* Follow the backpointer */
5074 gl4mfn = _mfn(mfn_to_shadow_page(sl4mfn)->backpointer);
5076 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
5077 /* Only L1's may be out of sync. */
5078 if ( page_is_out_of_sync(mfn_to_page(gl4mfn)) )
5079 AUDIT_FAIL_MIN(4, "gmfn %lx is out of sync", mfn_x(gl4mfn));
5080 #endif
5082 gl4e = gp = sh_map_domain_page(gl4mfn);
5083 SHADOW_FOREACH_L4E(sl4mfn, sl4e, &gl4e, done, v->domain,
5085 s = sh_audit_flags(v, 4, guest_l4e_get_flags(*gl4e),
5086 shadow_l4e_get_flags(*sl4e));
5087 if ( s ) AUDIT_FAIL(4, "%s", s);
5089 if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS )
5091 gfn = guest_l4e_get_gfn(*gl4e);
5092 mfn = shadow_l4e_get_mfn(*sl4e);
5093 gmfn = get_shadow_status(v, gfn_to_mfn(v->domain, gfn, &p2mt),
5094 SH_type_l3_shadow);
5095 if ( mfn_x(gmfn) != mfn_x(mfn) )
5096 AUDIT_FAIL(4, "bad translation: gfn %" SH_PRI_gfn
5097 " --> %" PRI_mfn " != mfn %" PRI_mfn,
5098 gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn));
5100 });
5101 sh_unmap_domain_page(gp);
5102 return 0;
5104 #endif /* GUEST_PAGING_LEVELS >= 4 */
5107 #undef AUDIT_FAIL
5109 #endif /* Audit code */
5111 /**************************************************************************/
5112 /* Entry points into this mode of the shadow code.
5113 * This will all be mangled by the preprocessor to uniquify everything. */
5114 struct paging_mode sh_paging_mode = {
5115 .page_fault = sh_page_fault,
5116 .invlpg = sh_invlpg,
5117 .gva_to_gfn = sh_gva_to_gfn,
5118 .update_cr3 = sh_update_cr3,
5119 .update_paging_modes = shadow_update_paging_modes,
5120 .write_p2m_entry = shadow_write_p2m_entry,
5121 .write_guest_entry = shadow_write_guest_entry,
5122 .cmpxchg_guest_entry = shadow_cmpxchg_guest_entry,
5123 .guest_map_l1e = sh_guest_map_l1e,
5124 .guest_get_eff_l1e = sh_guest_get_eff_l1e,
5125 .guest_levels = GUEST_PAGING_LEVELS,
5126 .shadow.detach_old_tables = sh_detach_old_tables,
5127 .shadow.x86_emulate_write = sh_x86_emulate_write,
5128 .shadow.x86_emulate_cmpxchg = sh_x86_emulate_cmpxchg,
5129 #ifdef __i386__
5130 .shadow.x86_emulate_cmpxchg8b = sh_x86_emulate_cmpxchg8b,
5131 #endif
5132 .shadow.make_monitor_table = sh_make_monitor_table,
5133 .shadow.destroy_monitor_table = sh_destroy_monitor_table,
5134 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
5135 .shadow.guess_wrmap = sh_guess_wrmap,
5136 #endif
5137 .shadow.shadow_levels = SHADOW_PAGING_LEVELS,
5138 };
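/* Callers reach these hooks through v->arch.paging.mode rather than by
 * name; a rough sketch of how an invlpg arrives here (the exact wrapper
 * lives in the common paging code, so treat this as illustrative) is:
 *
 *     if ( v->arch.paging.mode->invlpg(v, va) )
 *         flush_tlb_one_local(va);
 *
 * with the preprocessor renaming sh_invlpg and friends once per
 * GUEST/SHADOW paging-level combination, as noted above. */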
5140 /*
5141 * Local variables:
5142 * mode: C
5143 * c-set-style: "BSD"
5144 * c-basic-offset: 4
5145 * indent-tabs-mode: nil
5146 * End:
5147 */