ia64/xen-unstable

view xen/arch/x86/mm/shadow/multi.c @ 18328:b1e5a0def648

x86, shadow: Fix OOS on domain crash.

I couldn't reproduce the Nevada crash on my testbox, but this should
fix the first Xen crash that was seen in the Nevada HVM (bugzilla
#1322).

What I think most probably happened there is that the set_l2e call in
shadow_get_and_create_l1e() tried to resync a page, but somehow we
weren't able to remove the shadow (the real bug we should actually
look into). sh_resync() then removes the page from the OOS hash, and
later in the page fault path we find gw.l1mfn still OOS, so we try
to update the snapshot and the bug happens.

This should fix that case and other unlikely ones (such as sh_unsync()
failing to remove the current gw.l1mfn because of a hash collision).

Signed-off-by: Gianluca Guida <gianluca.guida@eu.citrix.com>
author Keir Fraser <keir.fraser@citrix.com>
date Thu Aug 14 10:14:32 2008 +0100 (2008-08-14)
parents b75f0b3e2a7e
children d09404ad5730
line source
1 /******************************************************************************
2 * arch/x86/mm/shadow/multi.c
3 *
4 * Simple, mostly-synchronous shadow page tables.
5 * Parts of this code are Copyright (c) 2006 by XenSource Inc.
6 * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
7 * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 */
24 #include <xen/config.h>
25 #include <xen/types.h>
26 #include <xen/mm.h>
27 #include <xen/trace.h>
28 #include <xen/sched.h>
29 #include <xen/perfc.h>
30 #include <xen/domain_page.h>
31 #include <asm/page.h>
32 #include <asm/current.h>
33 #include <asm/shadow.h>
34 #include <asm/flushtlb.h>
35 #include <asm/hvm/hvm.h>
36 #include <asm/hvm/cacheattr.h>
37 #include <asm/mtrr.h>
38 #include "private.h"
39 #include "types.h"
41 /* THINGS TO DO LATER:
42 *
43 * TEARDOWN HEURISTICS
44 * Also: have a heuristic for when to destroy a previous paging-mode's
45 * shadows. When a guest is done with its start-of-day 32-bit tables
46 * and reuses the memory we want to drop those shadows. Start with
47 * shadows in a page in two modes as a hint, but beware of clever tricks
48 * like reusing a pagetable for both PAE and 64-bit during boot...
49 *
50 * PAE LINEAR MAPS
51 * Rework shadow_get_l*e() to have the option of using map_domain_page()
52 * instead of linear maps. Add appropriate unmap_l*e calls in the users.
53 * Then we can test the speed difference made by linear maps. If the
54 * map_domain_page() version is OK on PAE, we could maybe allow a lightweight
55 * l3-and-l2h-only shadow mode for PAE PV guests that would allow them
56 * to share l2h pages again.
57 *
58 * PSE disabled / PSE36
59 * We don't support any modes other than PSE enabled, PSE36 disabled.
60 * Neither of those would be hard to change, but we'd need to be able to
61 * deal with shadows made in one mode and used in another.
62 */
64 #define FETCH_TYPE_PREFETCH 1
65 #define FETCH_TYPE_DEMAND 2
66 #define FETCH_TYPE_WRITE 4
67 typedef enum {
68 ft_prefetch = FETCH_TYPE_PREFETCH,
69 ft_demand_read = FETCH_TYPE_DEMAND,
70 ft_demand_write = FETCH_TYPE_DEMAND | FETCH_TYPE_WRITE,
71 } fetch_type_t;
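/* A standalone sketch of how this encoding is meant to be used: because
 * ft_demand_write sets both the DEMAND and WRITE bits, a single mask test
 * (as done further down in this file) distinguishes demand writes. The
 * constants below simply mirror the ones above; nothing else here is
 * taken from Xen.
 *
 *   #include <assert.h>
 *   enum { FT_PREFETCH = 1, FT_DEMAND = 2, FT_WRITE = 4 };
 *   enum { prefetch     = FT_PREFETCH,
 *          demand_read  = FT_DEMAND,
 *          demand_write = FT_DEMAND | FT_WRITE };
 *   int main(void)
 *   {
 *       assert(demand_write & FT_WRITE);     // a demand write is a write
 *       assert(!(demand_read & FT_WRITE));   // a demand read is not
 *       assert(demand_write & FT_DEMAND);    // ...and still a demand fetch
 *       return 0;
 *   }
 */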
73 #ifdef DEBUG_TRACE_DUMP
74 static char *fetch_type_names[] = {
75 [ft_prefetch] "prefetch",
76 [ft_demand_read] "demand read",
77 [ft_demand_write] "demand write",
78 };
79 #endif
81 /**************************************************************************/
82 /* Hash table mapping from guest pagetables to shadows
83 *
84 * Normal case: maps the mfn of a guest page to the mfn of its shadow page.
85 * FL1's: maps the *gfn* of the start of a superpage to the mfn of a
86 * shadow L1 which maps its "splinters".
87 */
89 static inline mfn_t
90 get_fl1_shadow_status(struct vcpu *v, gfn_t gfn)
91 /* Look for FL1 shadows in the hash table */
92 {
93 mfn_t smfn = shadow_hash_lookup(v, gfn_x(gfn), SH_type_fl1_shadow);
94 return smfn;
95 }
97 static inline mfn_t
98 get_shadow_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type)
99 /* Look for shadows in the hash table */
100 {
101 mfn_t smfn = shadow_hash_lookup(v, mfn_x(gmfn), shadow_type);
102 perfc_incr(shadow_get_shadow_status);
103 return smfn;
104 }
106 static inline void
107 set_fl1_shadow_status(struct vcpu *v, gfn_t gfn, mfn_t smfn)
108 /* Put an FL1 shadow into the hash table */
109 {
110 SHADOW_PRINTK("gfn=%"SH_PRI_gfn", type=%08x, smfn=%05lx\n",
111 gfn_x(gfn), SH_type_fl1_shadow, mfn_x(smfn));
113 shadow_hash_insert(v, gfn_x(gfn), SH_type_fl1_shadow, smfn);
114 }
116 static inline void
117 set_shadow_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type, mfn_t smfn)
118 /* Put a shadow into the hash table */
119 {
120 struct domain *d = v->domain;
121 int res;
123 SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx, type=%08x, smfn=%05lx\n",
124 d->domain_id, v->vcpu_id, mfn_x(gmfn),
125 shadow_type, mfn_x(smfn));
127 /* 32-on-64 PV guests don't own their l4 pages so can't get_page them */
128 if ( !is_pv_32on64_vcpu(v) || shadow_type != SH_type_l4_64_shadow )
129 {
130 res = get_page(mfn_to_page(gmfn), d);
131 ASSERT(res == 1);
132 }
134 shadow_hash_insert(v, mfn_x(gmfn), shadow_type, smfn);
135 }
137 static inline void
138 delete_fl1_shadow_status(struct vcpu *v, gfn_t gfn, mfn_t smfn)
139 /* Remove a shadow from the hash table */
140 {
141 SHADOW_PRINTK("gfn=%"SH_PRI_gfn", type=%08x, smfn=%05lx\n",
142 gfn_x(gfn), SH_type_fl1_shadow, mfn_x(smfn));
143 shadow_hash_delete(v, gfn_x(gfn), SH_type_fl1_shadow, smfn);
144 }
146 static inline void
147 delete_shadow_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type, mfn_t smfn)
148 /* Remove a shadow from the hash table */
149 {
150 SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx, type=%08x, smfn=%05lx\n",
151 v->domain->domain_id, v->vcpu_id,
152 mfn_x(gmfn), shadow_type, mfn_x(smfn));
153 shadow_hash_delete(v, mfn_x(gmfn), shadow_type, smfn);
154 /* 32-on-64 PV guests don't own their l4 pages; see set_shadow_status */
155 if ( !is_pv_32on64_vcpu(v) || shadow_type != SH_type_l4_64_shadow )
156 put_page(mfn_to_page(gmfn));
157 }
159 /**************************************************************************/
160 /* CPU feature support querying */
162 static inline int
163 guest_supports_superpages(struct vcpu *v)
164 {
165 /* The _PAGE_PSE bit must be honoured in HVM guests, whenever
166 * CR4.PSE is set or the guest is in PAE or long mode.
167 * It's also used in the dummy PT for vcpus with CR4.PG cleared. */
168 return (is_hvm_vcpu(v) &&
169 (GUEST_PAGING_LEVELS != 2
170 || !hvm_paging_enabled(v)
171 || (v->arch.hvm_vcpu.guest_cr[4] & X86_CR4_PSE)));
172 }
174 static inline int
175 guest_supports_nx(struct vcpu *v)
176 {
177 if ( GUEST_PAGING_LEVELS == 2 || !cpu_has_nx )
178 return 0;
179 if ( !is_hvm_vcpu(v) )
180 return cpu_has_nx;
181 return hvm_nx_enabled(v);
182 }
185 /**************************************************************************/
186 /* Functions for walking the guest page tables */
188 /* Flags that are needed in a pagetable entry, with the sense of NX inverted */
189 static uint32_t mandatory_flags(struct vcpu *v, uint32_t pfec)
190 {
191 static uint32_t flags[] = {
192 /* I/F - Usr Wr */
193 /* 0 0 0 0 */ _PAGE_PRESENT,
194 /* 0 0 0 1 */ _PAGE_PRESENT|_PAGE_RW,
195 /* 0 0 1 0 */ _PAGE_PRESENT|_PAGE_USER,
196 /* 0 0 1 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER,
197 /* 0 1 0 0 */ _PAGE_PRESENT,
198 /* 0 1 0 1 */ _PAGE_PRESENT|_PAGE_RW,
199 /* 0 1 1 0 */ _PAGE_PRESENT|_PAGE_USER,
200 /* 0 1 1 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER,
201 /* 1 0 0 0 */ _PAGE_PRESENT|_PAGE_NX_BIT,
202 /* 1 0 0 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_NX_BIT,
203 /* 1 0 1 0 */ _PAGE_PRESENT|_PAGE_USER|_PAGE_NX_BIT,
204 /* 1 0 1 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX_BIT,
205 /* 1 1 0 0 */ _PAGE_PRESENT|_PAGE_NX_BIT,
206 /* 1 1 0 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_NX_BIT,
207 /* 1 1 1 0 */ _PAGE_PRESENT|_PAGE_USER|_PAGE_NX_BIT,
208 /* 1 1 1 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX_BIT,
209 };
211 /* Don't demand not-NX if the CPU wouldn't enforce it. */
212 if ( !guest_supports_nx(v) )
213 pfec &= ~PFEC_insn_fetch;
215 /* Don't demand R/W if the CPU wouldn't enforce it. */
216 if ( is_hvm_vcpu(v) && unlikely(!hvm_wp_enabled(v))
217 && !(pfec & PFEC_user_mode) )
218 pfec &= ~PFEC_write_access;
220 return flags[(pfec & 0x1f) >> 1];
221 }
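/* A standalone sketch of the lookup above. The index uses the standard
 * x86 page-fault error-code bit positions (P = bit 0, W/R = bit 1,
 * U/S = bit 2, RSVD = bit 3, I/D = bit 4): the shift drops the P bit,
 * leaving a 4-bit index whose RSVD column never changes the required
 * rights (hence the duplicated rows in the table).
 *
 *   #include <stdio.h>
 *   #include <stdint.h>
 *   #define PFEC_write_access (1u << 1)
 *   #define PFEC_user_mode    (1u << 2)
 *   #define PFEC_insn_fetch   (1u << 4)
 *   int main(void)
 *   {
 *       uint32_t pfec = PFEC_user_mode | PFEC_write_access;
 *       unsigned int idx = (pfec & 0x1f) >> 1;
 *       // idx == 3, i.e. _PAGE_PRESENT|_PAGE_RW|_PAGE_USER are mandatory
 *       printf("index = %u\n", idx);
 *       return 0;
 *   }
 */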
223 /* Modify a guest pagetable entry to set the Accessed and Dirty bits.
224 * Returns non-zero if it actually writes to guest memory. */
225 static uint32_t set_ad_bits(void *guest_p, void *walk_p, int set_dirty)
226 {
227 guest_intpte_t old, new;
229 old = *(guest_intpte_t *)walk_p;
230 new = old | _PAGE_ACCESSED | (set_dirty ? _PAGE_DIRTY : 0);
231 if ( old != new )
232 {
233 /* Write the new entry into the walk, and try to write it back
234 * into the guest table as well. If the guest table has changed
235 * under our feet then leave it alone. */
236 *(guest_intpte_t *)walk_p = new;
237 if ( cmpxchg(((guest_intpte_t *)guest_p), old, new) == old )
238 return 1;
239 }
240 return 0;
241 }
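/* The same update pattern in a standalone form, using the GCC
 * __sync_bool_compare_and_swap builtin in place of Xen's cmpxchg()
 * (the A/D flag values are the standard x86 bits, shown here only
 * for illustration):
 *
 *   #include <stdint.h>
 *   #define F_ACCESSED 0x20u
 *   #define F_DIRTY    0x40u
 *   static int set_ad(uint64_t *guest_pte, uint64_t *walk_pte, int dirty)
 *   {
 *       uint64_t old = *walk_pte;
 *       uint64_t new = old | F_ACCESSED | (dirty ? F_DIRTY : 0);
 *       if ( old == new )
 *           return 0;
 *       *walk_pte = new;   // always keep our local copy of the walk current
 *       // ...but only write the guest table if it still holds what we read
 *       return __sync_bool_compare_and_swap(guest_pte, old, new);
 *   }
 */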
243 /* This validation is called with the shadow lock held and after write
244 * permission removal, so the check is atomic and no inconsistent
245 * content can be observed before the lock is released.
246 *
247 * Returns 1 for success and 0 for inconsistency.
248 */
249 static inline uint32_t
250 shadow_check_gwalk(struct vcpu *v, unsigned long va, walk_t *gw)
251 {
252 struct domain *d = v->domain;
253 guest_l1e_t *l1p;
254 guest_l2e_t *l2p;
255 #if GUEST_PAGING_LEVELS >= 4
256 guest_l3e_t *l3p;
257 guest_l4e_t *l4p;
258 #endif
259 int mismatch = 0;
261 ASSERT(shadow_locked_by_me(d));
263 if ( gw->version ==
264 atomic_read(&d->arch.paging.shadow.gtable_dirty_version) )
265 return 1;
267 /* We could cache the guest page mappings from the last
268 * guest table walk. However, this check happens relatively
269 * infrequently, so taking the small cost of remapping the
270 * guest pages here is better than caching the mappings on
271 * every guest table walk.
272 *
273 * Also, when an inconsistency is found, simply return and let
274 * another fault be triggered instead of re-validating the new
275 * path, to keep the logic simple.
276 */
277 perfc_incr(shadow_check_gwalk);
278 #if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
279 #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
280 l4p = (guest_l4e_t *)v->arch.paging.shadow.guest_vtable;
281 mismatch |= (gw->l4e.l4 != l4p[guest_l4_table_offset(va)].l4);
282 l3p = sh_map_domain_page(gw->l3mfn);
283 mismatch |= (gw->l3e.l3 != l3p[guest_l3_table_offset(va)].l3);
284 sh_unmap_domain_page(l3p);
285 #else
286 mismatch |= (gw->l3e.l3 !=
287 v->arch.paging.shadow.gl3e[guest_l3_table_offset(va)].l3);
288 #endif
289 l2p = sh_map_domain_page(gw->l2mfn);
290 mismatch |= (gw->l2e.l2 != l2p[guest_l2_table_offset(va)].l2);
291 sh_unmap_domain_page(l2p);
292 #else
293 l2p = (guest_l2e_t *)v->arch.paging.shadow.guest_vtable;
294 mismatch |= (gw->l2e.l2 != l2p[guest_l2_table_offset(va)].l2);
295 #endif
296 if ( !(guest_supports_superpages(v) &&
297 (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE)) )
298 {
299 l1p = sh_map_domain_page(gw->l1mfn);
300 mismatch |= (gw->l1e.l1 != l1p[guest_l1_table_offset(va)].l1);
301 sh_unmap_domain_page(l1p);
302 }
304 return !mismatch;
305 }
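/* The "version" test at the top of this function is a generation
 * counter: any write to a guest pagetable bumps the counter, so a walk
 * whose recorded version still matches needs no re-checking at all.
 * A standalone sketch of the idea (all names below are made up):
 *
 *   #include <stdatomic.h>
 *   static atomic_uint gtable_dirty_version;
 *   struct walk_cache { unsigned int version; };
 *
 *   static void record_walk(struct walk_cache *w)
 *   { w->version = atomic_load(&gtable_dirty_version); }
 *
 *   static void guest_pagetable_written(void)
 *   { atomic_fetch_add(&gtable_dirty_version, 1); }
 *
 *   static int walk_still_valid(const struct walk_cache *w)
 *   {
 *       // fast path: nothing has been written since the walk was made
 *       if ( w->version == atomic_load(&gtable_dirty_version) )
 *           return 1;
 *       // slow path: remap each level and compare the cached entries
 *       return 0;
 *   }
 */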
307 /* Remove write access permissions from a gwalk_t in a batch, and
308 * return an OR-ed result hinting whether a TLB flush is needed and
309 * whether the guest pages must be re-walked.
310 *
311 * Syncing a page removes write access to that page; but it may
312 * also give write access to other pages in the path. If we resync any
313 * pages, re-walk from the beginning.
314 */
315 #define GW_RMWR_FLUSHTLB 1
316 #define GW_RMWR_REWALK 2
318 static inline uint32_t
319 gw_remove_write_accesses(struct vcpu *v, unsigned long va, walk_t *gw)
320 {
321 uint32_t rc = 0;
323 #if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
324 #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
325 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
326 if ( mfn_is_out_of_sync(gw->l3mfn) )
327 {
328 sh_resync(v, gw->l3mfn);
329 rc = GW_RMWR_REWALK;
330 }
331 else
332 #endif /* OOS */
333 if ( sh_remove_write_access(v, gw->l3mfn, 3, va) )
334 rc = GW_RMWR_FLUSHTLB;
335 #endif /* GUEST_PAGING_LEVELS >= 4 */
337 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
338 if ( mfn_is_out_of_sync(gw->l2mfn) )
339 {
340 sh_resync(v, gw->l2mfn);
341 rc |= GW_RMWR_REWALK;
342 }
343 else
344 #endif /* OOS */
345 if ( sh_remove_write_access(v, gw->l2mfn, 2, va) )
346 rc |= GW_RMWR_FLUSHTLB;
347 #endif /* GUEST_PAGING_LEVELS >= 3 */
349 if ( !(guest_supports_superpages(v) &&
350 (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE))
351 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
352 && !mfn_is_out_of_sync(gw->l1mfn)
353 #endif /* OOS */
354 && sh_remove_write_access(v, gw->l1mfn, 1, va) )
355 rc |= GW_RMWR_FLUSHTLB;
357 return rc;
358 }
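/* How a caller is expected to consume the two hint bits above (this is
 * the pattern only, not the actual page-fault handler code):
 *
 *   rc = gw_remove_write_accesses(v, va, &gw);
 *   if ( rc & GW_RMWR_REWALK )
 *       goto rewalk;             // a resync may have changed the walk path
 *   if ( rc & GW_RMWR_FLUSHTLB )
 *       // stale writeable mappings may still be in other CPUs' TLBs
 *       flush_tlb_mask(...);
 */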
360 /* Walk the guest pagetables, after the manner of a hardware walker.
361 *
362 * Inputs: a vcpu, a virtual address, a walk_t to fill, a
363 * pointer to a pagefault code
364 *
365 * We walk the vcpu's guest pagetables, filling the walk_t with what we
366 * see and adding any Accessed and Dirty bits that are needed in the
367 * guest entries. Using the pagefault code, we check the permissions as
368 * we go. For the purposes of reading pagetables we treat all non-RAM
369 * memory as containing zeroes.
370 *
371 * The walk is done in a lock-free style, with some sanity checks postponed
372 * until after the shadow lock is taken. Those delayed checks make sure
373 * no inconsistent mapping is translated into the shadow page tables.
374 *
375 * Returns 0 for success, or the set of permission bits that we failed on
376 * if the walk did not complete.
377 * N.B. This is different from the old return code but almost no callers
378 * checked the old return code anyway.
379 */
380 static uint32_t
381 guest_walk_tables(struct vcpu *v, unsigned long va, walk_t *gw, uint32_t pfec)
382 {
383 struct domain *d = v->domain;
384 p2m_type_t p2mt;
385 guest_l1e_t *l1p = NULL;
386 guest_l2e_t *l2p = NULL;
387 #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
388 guest_l3e_t *l3p = NULL;
389 guest_l4e_t *l4p;
390 #endif
391 uint32_t gflags, mflags, rc = 0;
392 int pse;
394 perfc_incr(shadow_guest_walk);
395 memset(gw, 0, sizeof(*gw));
396 gw->va = va;
398 gw->version = atomic_read(&d->arch.paging.shadow.gtable_dirty_version);
399 rmb();
401 /* Mandatory bits that must be set in every entry. We invert NX, to
402 * calculate as if there were an "X" bit that allowed access.
403 * We will accumulate, in rc, the set of flags that are missing. */
404 mflags = mandatory_flags(v, pfec);
406 #if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
407 #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
409 /* Get the l4e from the top level table and check its flags*/
410 gw->l4mfn = pagetable_get_mfn(v->arch.guest_table);
411 l4p = ((guest_l4e_t *)v->arch.paging.shadow.guest_vtable);
412 gw->l4e = l4p[guest_l4_table_offset(va)];
413 gflags = guest_l4e_get_flags(gw->l4e) ^ _PAGE_NX_BIT;
414 rc |= ((gflags & mflags) ^ mflags);
415 if ( rc & _PAGE_PRESENT ) goto out;
417 /* Map the l3 table */
418 gw->l3mfn = gfn_to_mfn(d, guest_l4e_get_gfn(gw->l4e), &p2mt);
419 if ( !p2m_is_ram(p2mt) )
420 {
421 rc |= _PAGE_PRESENT;
422 goto out;
423 }
424 ASSERT(mfn_valid(gw->l3mfn));
426 /* Get the l3e and check its flags*/
427 l3p = sh_map_domain_page(gw->l3mfn);
428 gw->l3e = l3p[guest_l3_table_offset(va)];
429 gflags = guest_l3e_get_flags(gw->l3e) ^ _PAGE_NX_BIT;
430 rc |= ((gflags & mflags) ^ mflags);
431 if ( rc & _PAGE_PRESENT )
432 goto out;
434 #else /* PAE only... */
436 /* Get the l3e from the cache of the top level table and check its flags */
437 gw->l3e = v->arch.paging.shadow.gl3e[guest_l3_table_offset(va)];
438 if ( !(guest_l3e_get_flags(gw->l3e) & _PAGE_PRESENT) )
439 {
440 rc |= _PAGE_PRESENT;
441 goto out;
442 }
444 #endif /* PAE or 64... */
446 /* Map the l2 table */
447 gw->l2mfn = gfn_to_mfn(d, guest_l3e_get_gfn(gw->l3e), &p2mt);
448 if ( !p2m_is_ram(p2mt) )
449 {
450 rc |= _PAGE_PRESENT;
451 goto out;
452 }
453 ASSERT(mfn_valid(gw->l2mfn));
455 /* Get the l2e */
456 l2p = sh_map_domain_page(gw->l2mfn);
457 gw->l2e = l2p[guest_l2_table_offset(va)];
459 #else /* 32-bit only... */
461 /* Get l2e from the top level table */
462 gw->l2mfn = pagetable_get_mfn(v->arch.guest_table);
463 l2p = ((guest_l2e_t *)v->arch.paging.shadow.guest_vtable);
464 gw->l2e = l2p[guest_l2_table_offset(va)];
466 #endif /* All levels... */
468 gflags = guest_l2e_get_flags(gw->l2e) ^ _PAGE_NX_BIT;
469 rc |= ((gflags & mflags) ^ mflags);
470 if ( rc & _PAGE_PRESENT )
471 goto out;
473 pse = (guest_supports_superpages(v) &&
474 (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE));
476 if ( pse )
477 {
478 /* Special case: this guest VA is in a PSE superpage, so there's
479 * no guest l1e. We make one up so that the propagation code
480 * can generate a shadow l1 table. Start with the gfn of the
481 * first 4k-page of the superpage. */
482 gfn_t start = guest_l2e_get_gfn(gw->l2e);
483 /* Grant full access in the l1e, since all the guest entry's
484 * access controls are enforced in the shadow l2e. */
485 int flags = (_PAGE_PRESENT|_PAGE_USER|_PAGE_RW|
486 _PAGE_ACCESSED|_PAGE_DIRTY);
487 /* PSE level 2 entries use bit 12 for PAT; propagate it to bit 7
488 * of the level 1. */
489 if ( (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE_PAT) )
490 flags |= _PAGE_PAT;
491 /* Copy the cache-control bits to the l1 as well, because we
492 * can't represent PAT in the (non-PSE) shadow l2e. :(
493 * This could cause problems if a guest ever maps an area of
494 * memory with superpages using more than one caching mode. */
495 flags |= guest_l2e_get_flags(gw->l2e) & (_PAGE_PWT|_PAGE_PCD);
496 /* Increment the pfn by the right number of 4k pages.
497 * The ~0x1 is to mask out the PAT bit mentioned above. */
498 start = _gfn((gfn_x(start) & ~0x1) + guest_l1_table_offset(va));
499 gw->l1e = guest_l1e_from_gfn(start, flags);
500 gw->l1mfn = _mfn(INVALID_MFN);
501 }
502 else
503 {
504 /* Not a superpage: carry on and find the l1e. */
505 gw->l1mfn = gfn_to_mfn(d, guest_l2e_get_gfn(gw->l2e), &p2mt);
506 if ( !p2m_is_ram(p2mt) )
507 {
508 rc |= _PAGE_PRESENT;
509 goto out;
510 }
511 ASSERT(mfn_valid(gw->l1mfn));
512 l1p = sh_map_domain_page(gw->l1mfn);
513 gw->l1e = l1p[guest_l1_table_offset(va)];
514 gflags = guest_l1e_get_flags(gw->l1e) ^ _PAGE_NX_BIT;
515 rc |= ((gflags & mflags) ^ mflags);
516 }
518 /* Go back and set accessed and dirty bits only if the walk was a
519 * success. Although the PRMs say higher-level _PAGE_ACCESSED bits
520 * get set whenever a lower-level PT is used, at least some hardware
521 * walkers behave this way. */
522 if ( rc == 0 )
523 {
524 #if GUEST_PAGING_LEVELS == 4 /* 64-bit only... */
525 if ( set_ad_bits(l4p + guest_l4_table_offset(va), &gw->l4e, 0) )
526 paging_mark_dirty(d, mfn_x(gw->l4mfn));
527 if ( set_ad_bits(l3p + guest_l3_table_offset(va), &gw->l3e, 0) )
528 paging_mark_dirty(d, mfn_x(gw->l3mfn));
529 #endif
530 if ( set_ad_bits(l2p + guest_l2_table_offset(va), &gw->l2e,
531 (pse && (pfec & PFEC_write_access))) )
532 paging_mark_dirty(d, mfn_x(gw->l2mfn));
533 if ( !pse )
534 {
535 if ( set_ad_bits(l1p + guest_l1_table_offset(va), &gw->l1e,
536 (pfec & PFEC_write_access)) )
537 paging_mark_dirty(d, mfn_x(gw->l1mfn));
538 }
539 }
541 out:
542 #if GUEST_PAGING_LEVELS == 4
543 if ( l3p ) sh_unmap_domain_page(l3p);
544 #endif
545 #if GUEST_PAGING_LEVELS >= 3
546 if ( l2p ) sh_unmap_domain_page(l2p);
547 #endif
548 if ( l1p ) sh_unmap_domain_page(l1p);
550 return rc;
551 }
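/* Sketch of how a caller interprets the return value (the pattern only,
 * not the actual fault-handler code): a non-zero result is the set of
 * required rights that the walk found missing, which maps naturally
 * onto the fault to be reflected back to the guest.
 *
 *   walk_t gw;
 *   uint32_t missing = guest_walk_tables(v, va, &gw, pfec);
 *   if ( missing != 0 )
 *   {
 *       // missing & _PAGE_PRESENT  -> a not-present fault for the guest
 *       // missing & _PAGE_RW       -> a write-permission fault
 *       // missing & _PAGE_USER     -> a supervisor-only mapping
 *       // ... inject the fault into the guest ...
 *   }
 */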
553 /* Given a walk_t, translate the gw->va into the guest's notion of the
554 * corresponding frame number. */
555 static inline gfn_t
556 guest_walk_to_gfn(walk_t *gw)
557 {
558 if ( !(guest_l1e_get_flags(gw->l1e) & _PAGE_PRESENT) )
559 return _gfn(INVALID_GFN);
560 return guest_l1e_get_gfn(gw->l1e);
561 }
563 /* Given a walk_t, translate the gw->va into the guest's notion of the
564 * corresponding physical address. */
565 static inline paddr_t
566 guest_walk_to_gpa(walk_t *gw)
567 {
568 if ( !(guest_l1e_get_flags(gw->l1e) & _PAGE_PRESENT) )
569 return 0;
570 return guest_l1e_get_paddr(gw->l1e) + (gw->va & ~PAGE_MASK);
571 }
573 #if 0 /* Keep for debugging */
574 /* Pretty-print the contents of a guest-walk */
575 static inline void print_gw(walk_t *gw)
576 {
577 SHADOW_PRINTK("GUEST WALK TO %#lx:\n", gw->va);
578 #if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
579 #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
580 SHADOW_PRINTK(" l4mfn=%" PRI_mfn "\n", mfn_x(gw->l4mfn));
581 SHADOW_PRINTK(" l4e=%" SH_PRI_gpte "\n", gw->l4e.l4);
582 SHADOW_PRINTK(" l3mfn=%" PRI_mfn "\n", mfn_x(gw->l3mfn));
583 #endif /* PAE or 64... */
584 SHADOW_PRINTK(" l3e=%" SH_PRI_gpte "\n", gw->l3e.l3);
585 #endif /* All levels... */
586 SHADOW_PRINTK(" l2mfn=%" PRI_mfn "\n", mfn_x(gw->l2mfn));
587 SHADOW_PRINTK(" l2e=%" SH_PRI_gpte "\n", gw->l2e.l2);
588 SHADOW_PRINTK(" l1mfn=%" PRI_mfn "\n", mfn_x(gw->l1mfn));
589 SHADOW_PRINTK(" l1e=%" SH_PRI_gpte "\n", gw->l1e.l1);
590 }
591 #endif /* 0 */
593 #if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES
594 /* Lightweight audit: pass all the shadows associated with this guest walk
595 * through the audit mechanisms */
596 static void sh_audit_gw(struct vcpu *v, walk_t *gw)
597 {
598 mfn_t smfn;
600 if ( !(SHADOW_AUDIT_ENABLE) )
601 return;
603 #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
604 if ( mfn_valid(gw->l4mfn)
605 && mfn_valid((smfn = get_shadow_status(v, gw->l4mfn,
606 SH_type_l4_shadow))) )
607 (void) sh_audit_l4_table(v, smfn, _mfn(INVALID_MFN));
608 if ( mfn_valid(gw->l3mfn)
609 && mfn_valid((smfn = get_shadow_status(v, gw->l3mfn,
610 SH_type_l3_shadow))) )
611 (void) sh_audit_l3_table(v, smfn, _mfn(INVALID_MFN));
612 #endif /* PAE or 64... */
613 if ( mfn_valid(gw->l2mfn) )
614 {
615 if ( mfn_valid((smfn = get_shadow_status(v, gw->l2mfn,
616 SH_type_l2_shadow))) )
617 (void) sh_audit_l2_table(v, smfn, _mfn(INVALID_MFN));
618 #if GUEST_PAGING_LEVELS == 3
619 if ( mfn_valid((smfn = get_shadow_status(v, gw->l2mfn,
620 SH_type_l2h_shadow))) )
621 (void) sh_audit_l2_table(v, smfn, _mfn(INVALID_MFN));
622 #endif
623 }
624 if ( mfn_valid(gw->l1mfn)
625 && mfn_valid((smfn = get_shadow_status(v, gw->l1mfn,
626 SH_type_l1_shadow))) )
627 (void) sh_audit_l1_table(v, smfn, _mfn(INVALID_MFN));
628 else if ( (guest_l2e_get_flags(gw->l2e) & _PAGE_PRESENT)
629 && (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE)
630 && mfn_valid(
631 (smfn = get_fl1_shadow_status(v, guest_l2e_get_gfn(gw->l2e)))) )
632 (void) sh_audit_fl1_table(v, smfn, _mfn(INVALID_MFN));
633 }
635 #else
636 #define sh_audit_gw(_v, _gw) do {} while(0)
637 #endif /* audit code */
640 #if (CONFIG_PAGING_LEVELS == GUEST_PAGING_LEVELS)
641 void *
642 sh_guest_map_l1e(struct vcpu *v, unsigned long addr,
643 unsigned long *gl1mfn)
644 {
645 void *pl1e = NULL;
646 walk_t gw;
648 ASSERT(shadow_mode_translate(v->domain));
650 // XXX -- this is expensive, but it's easy to cobble together...
651 // FIXME!
653 if ( guest_walk_tables(v, addr, &gw, PFEC_page_present) == 0
654 && mfn_valid(gw.l1mfn) )
655 {
656 if ( gl1mfn )
657 *gl1mfn = mfn_x(gw.l1mfn);
658 pl1e = map_domain_page(mfn_x(gw.l1mfn)) +
659 (guest_l1_table_offset(addr) * sizeof(guest_l1e_t));
660 }
662 return pl1e;
663 }
665 void
666 sh_guest_get_eff_l1e(struct vcpu *v, unsigned long addr, void *eff_l1e)
667 {
668 walk_t gw;
670 ASSERT(shadow_mode_translate(v->domain));
672 // XXX -- this is expensive, but it's easy to cobble together...
673 // FIXME!
675 (void) guest_walk_tables(v, addr, &gw, PFEC_page_present);
676 *(guest_l1e_t *)eff_l1e = gw.l1e;
677 }
678 #endif /* CONFIG == GUEST (== SHADOW) */
680 /**************************************************************************/
681 /* Functions to compute the correct index into a shadow page, given an
682 * index into the guest page (as returned by guest_get_index()).
683 * This is trivial when the shadow and guest use the same sized PTEs, but
684 * gets more interesting when those sizes are mismatched (e.g. 32-bit guest,
685 * PAE- or 64-bit shadows).
686 *
687 * These functions also increment the shadow mfn, when necessary. When PTE
688 * sizes are mismatched, it takes 2 shadow L1 pages for a single guest L1
689 * page. In this case, we allocate 2 contiguous pages for the shadow L1, and
690 * use simple pointer arithmetic on a pointer to the guest L1e to figure out
691 * which shadow page we really want. Similarly, when PTE sizes are
692 * mismatched, we shadow a guest L2 page with 4 shadow L2 pages. (The easiest
693 * way to see this is: a 32-bit guest L2 page maps 4GB of virtual address
694 * space, while a PAE- or 64-bit shadow L2 page maps 1GB of virtual address
695 * space.)
696 *
697 * For PAE guests, for every 32-bytes of guest L3 page table, we use 64-bytes
698 * of shadow (to store both the shadow, and the info that would normally be
699 * stored in page_info fields). This arrangement allows the shadow and the
700 * "page_info" fields to always be stored in the same page (in fact, in
701 * the same cache line), avoiding an extra call to map_domain_page().
702 */
704 static inline u32
705 guest_index(void *ptr)
706 {
707 return (u32)((unsigned long)ptr & ~PAGE_MASK) / sizeof(guest_l1e_t);
708 }
710 static u32
711 shadow_l1_index(mfn_t *smfn, u32 guest_index)
712 {
713 #if (GUEST_PAGING_LEVELS == 2)
714 *smfn = _mfn(mfn_x(*smfn) +
715 (guest_index / SHADOW_L1_PAGETABLE_ENTRIES));
716 return (guest_index % SHADOW_L1_PAGETABLE_ENTRIES);
717 #else
718 return guest_index;
719 #endif
720 }
722 static u32
723 shadow_l2_index(mfn_t *smfn, u32 guest_index)
724 {
725 #if (GUEST_PAGING_LEVELS == 2)
726 // Because we use 2 shadow l2 entries for each guest entry, the number of
727 // guest entries per shadow page is SHADOW_L2_PAGETABLE_ENTRIES/2
728 //
729 *smfn = _mfn(mfn_x(*smfn) +
730 (guest_index / (SHADOW_L2_PAGETABLE_ENTRIES / 2)));
732 // We multiply by two to get the index of the first of the two entries
733 // used to shadow the specified guest entry.
734 return (guest_index % (SHADOW_L2_PAGETABLE_ENTRIES / 2)) * 2;
735 #else
736 return guest_index;
737 #endif
738 }
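/* Worked example of the mismatched-size arithmetic above, for a 32-bit
 * guest on PAE or 64-bit shadows: a guest page holds 1024 4-byte
 * entries, a shadow page holds 512 8-byte entries. Standalone sketch
 * (entry counts as above, everything else illustrative):
 *
 *   #include <stdio.h>
 *   #define SHADOW_L1_ENTRIES 512
 *   #define SHADOW_L2_ENTRIES 512
 *   int main(void)
 *   {
 *       unsigned int g = 700;   // index into a 1024-entry guest table
 *       // l1: two shadow pages per guest page -> page +1, slot 188
 *       printf("l1: page +%u, slot %u\n",
 *              g / SHADOW_L1_ENTRIES, g % SHADOW_L1_ENTRIES);
 *       // l2: each guest l2e becomes a pair of shadow l2es, so only
 *       // 256 guest entries fit per shadow page -> page +2, slot 376
 *       printf("l2: page +%u, slot %u\n",
 *              g / (SHADOW_L2_ENTRIES / 2),
 *              (g % (SHADOW_L2_ENTRIES / 2)) * 2);
 *       return 0;
 *   }
 */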
740 #if GUEST_PAGING_LEVELS >= 4
742 static u32
743 shadow_l3_index(mfn_t *smfn, u32 guest_index)
744 {
745 return guest_index;
746 }
748 static u32
749 shadow_l4_index(mfn_t *smfn, u32 guest_index)
750 {
751 return guest_index;
752 }
754 #endif // GUEST_PAGING_LEVELS >= 4
757 /**************************************************************************/
758 /* Function which computes shadow entries from their corresponding guest
759 * entries. This is the "heart" of the shadow code. It operates using
760 * level-1 shadow types, but handles all levels of entry.
761 * Don't call it directly, but use the four wrappers below.
762 */
764 static always_inline void
765 _sh_propagate(struct vcpu *v,
766 guest_intpte_t guest_intpte,
767 mfn_t target_mfn,
768 void *shadow_entry_ptr,
769 int level,
770 fetch_type_t ft,
771 p2m_type_t p2mt)
772 {
773 guest_l1e_t guest_entry = { guest_intpte };
774 shadow_l1e_t *sp = shadow_entry_ptr;
775 struct domain *d = v->domain;
776 gfn_t target_gfn = guest_l1e_get_gfn(guest_entry);
777 u32 pass_thru_flags;
778 u32 gflags, sflags;
780 /* We don't shadow PAE l3s */
781 ASSERT(GUEST_PAGING_LEVELS > 3 || level != 3);
783 /* Check there's something for the shadows to map to */
784 if ( !p2m_is_valid(p2mt) )
785 {
786 *sp = shadow_l1e_empty();
787 goto done;
788 }
790 gflags = guest_l1e_get_flags(guest_entry);
792 if ( unlikely(!(gflags & _PAGE_PRESENT)) )
793 {
794 /* If a guest l1 entry is not present, shadow with the magic
795 * guest-not-present entry. */
796 if ( level == 1 )
797 *sp = sh_l1e_gnp();
798 else
799 *sp = shadow_l1e_empty();
800 goto done;
801 }
803 if ( level == 1 && p2mt == p2m_mmio_dm )
804 {
805 /* Guest l1e maps emulated MMIO space */
806 *sp = sh_l1e_mmio(target_gfn, gflags);
807 if ( !d->arch.paging.shadow.has_fast_mmio_entries )
808 d->arch.paging.shadow.has_fast_mmio_entries = 1;
809 goto done;
810 }
812 // Must have a valid target_mfn unless this is a prefetch or an l1
813 // pointing at MMIO space. In the case of a prefetch, an invalid
814 // mfn means that we can not usefully shadow anything, and so we
815 // return early.
816 //
817 if ( !mfn_valid(target_mfn)
818 && !(level == 1 && (!shadow_mode_refcounts(d)
819 || p2mt == p2m_mmio_direct)) )
820 {
821 ASSERT((ft == ft_prefetch));
822 *sp = shadow_l1e_empty();
823 goto done;
824 }
826 // Propagate bits from the guest to the shadow.
827 // Some of these may be overwritten, below.
828 // Since we know the guest's PRESENT bit is set, we also set the shadow's
829 // SHADOW_PRESENT bit.
830 //
831 pass_thru_flags = (_PAGE_ACCESSED | _PAGE_USER |
832 _PAGE_RW | _PAGE_PRESENT);
833 if ( guest_supports_nx(v) )
834 pass_thru_flags |= _PAGE_NX_BIT;
835 if ( !shadow_mode_refcounts(d) && !mfn_valid(target_mfn) )
836 pass_thru_flags |= _PAGE_PAT | _PAGE_PCD | _PAGE_PWT;
837 sflags = gflags & pass_thru_flags;
839 /*
840 * For HVM domains with direct access to MMIO areas, set the correct
841 * caching attributes in the shadows to match what was asked for.
842 */
843 if ( (level == 1) && is_hvm_domain(d) && has_arch_pdevs(d) &&
844 !is_xen_heap_mfn(mfn_x(target_mfn)) )
845 {
846 unsigned int type;
847 if ( hvm_get_mem_pinned_cacheattr(d, gfn_x(target_gfn), &type) )
848 sflags |= pat_type_2_pte_flags(type);
849 else if ( d->arch.hvm_domain.is_in_uc_mode )
850 sflags |= pat_type_2_pte_flags(PAT_TYPE_UNCACHABLE);
851 else
852 sflags |= get_pat_flags(v,
853 gflags,
854 gfn_to_paddr(target_gfn),
855 ((paddr_t)mfn_x(target_mfn)) << PAGE_SHIFT);
856 }
858 // Set the A&D bits for higher level shadows.
859 // Higher level entries do not, strictly speaking, have dirty bits, but
860 // since we use shadow linear tables, each of these entries may, at some
861 // point in time, also serve as a shadow L1 entry.
862 // By setting both the A&D bits in each of these, we eliminate the burden
863 // on the hardware to update these bits on initial accesses.
864 //
865 if ( (level > 1) && !((SHADOW_PAGING_LEVELS == 3) && (level == 3)) )
866 sflags |= _PAGE_ACCESSED | _PAGE_DIRTY;
868 // If the A or D bit has not yet been set in the guest, then we must
869 // prevent the corresponding kind of access.
870 //
871 if ( unlikely(!(gflags & _PAGE_ACCESSED)) )
872 sflags &= ~_PAGE_PRESENT;
874 /* D bits exist in L1es and PSE L2es */
875 if ( unlikely(((level == 1) ||
876 ((level == 2) &&
877 (gflags & _PAGE_PSE) &&
878 guest_supports_superpages(v)))
879 && !(gflags & _PAGE_DIRTY)) )
880 sflags &= ~_PAGE_RW;
882 // shadow_mode_log_dirty support
883 //
884 // Only allow the guest write access to a page a) on a demand fault,
885 // or b) if the page is already marked as dirty.
886 //
887 // (We handle log-dirty entirely inside the shadow code, without using the
888 // p2m_ram_logdirty p2m type: only HAP uses that.)
889 if ( unlikely((level == 1) && shadow_mode_log_dirty(d)) )
890 {
891 if ( mfn_valid(target_mfn) ) {
892 if ( ft & FETCH_TYPE_WRITE )
893 paging_mark_dirty(d, mfn_x(target_mfn));
894 else if ( !sh_mfn_is_dirty(d, target_mfn) )
895 sflags &= ~_PAGE_RW;
896 }
897 }
899 if ( unlikely((level == 1) && d->dirty_vram
900 && d->dirty_vram->last_dirty == -1
901 && gfn_x(target_gfn) >= d->dirty_vram->begin_pfn
902 && gfn_x(target_gfn) < d->dirty_vram->end_pfn) )
903 {
904 if ( ft & FETCH_TYPE_WRITE )
905 d->dirty_vram->last_dirty = NOW();
906 else
907 sflags &= ~_PAGE_RW;
908 }
910 /* Read-only memory */
911 if ( p2mt == p2m_ram_ro )
912 sflags &= ~_PAGE_RW;
914 // protect guest page tables
915 //
916 if ( unlikely((level == 1)
917 && sh_mfn_is_a_page_table(target_mfn)
918 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC )
919 /* Unless the page is out of sync and the guest is
920 writing to it. */
921 && !(mfn_oos_may_write(target_mfn)
922 && (ft == ft_demand_write))
923 #endif /* OOS */
924 ) )
925 {
926 if ( shadow_mode_trap_reads(d) )
927 {
928 // if we are trapping both reads & writes, then mark this page
929 // as not present...
930 //
931 sflags &= ~_PAGE_PRESENT;
932 }
933 else
934 {
935 // otherwise, just prevent any writes...
936 //
937 sflags &= ~_PAGE_RW;
938 }
939 }
941 // PV guests in 64-bit mode use two different page tables for user vs
942 // supervisor permissions, making the guest's _PAGE_USER bit irrelevant.
943 // It is always shadowed as present...
944 if ( (GUEST_PAGING_LEVELS == 4) && !is_pv_32on64_domain(d)
945 && !is_hvm_domain(d) )
946 {
947 sflags |= _PAGE_USER;
948 }
950 *sp = shadow_l1e_from_mfn(target_mfn, sflags);
952 done:
953 SHADOW_DEBUG(PROPAGATE,
954 "%s level %u guest %" SH_PRI_gpte " shadow %" SH_PRI_pte "\n",
955 fetch_type_names[ft], level, guest_entry.l1, sp->l1);
956 }
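/* The A/D-bit handling above, reduced to a standalone flags-only sketch
 * (the flag values are the standard x86 bits; refcounting, cacheability,
 * log-dirty and the PSE-l2 dirty case are all omitted):
 *
 *   #include <stdint.h>
 *   #define F_PRESENT  0x01u
 *   #define F_RW       0x02u
 *   #define F_ACCESSED 0x20u
 *   #define F_DIRTY    0x40u
 *   static uint32_t propagate_ad(uint32_t gflags, int level)
 *   {
 *       uint32_t sflags = gflags;
 *       if ( !(gflags & F_ACCESSED) )
 *           sflags &= ~F_PRESENT;   // force a fault so the A bit gets set
 *       if ( level == 1 && !(gflags & F_DIRTY) )
 *           sflags &= ~F_RW;        // force a write fault to set the D bit
 *       return sflags;
 *   }
 */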
959 /* These four wrappers give us a little bit of type-safety back around
960 * the use of void-* pointers and intpte types in _sh_propagate(), and
961 * allow the compiler to optimize out some level checks. */
963 #if GUEST_PAGING_LEVELS >= 4
964 static void
965 l4e_propagate_from_guest(struct vcpu *v,
966 guest_l4e_t gl4e,
967 mfn_t sl3mfn,
968 shadow_l4e_t *sl4e,
969 fetch_type_t ft)
970 {
971 _sh_propagate(v, gl4e.l4, sl3mfn, sl4e, 4, ft, p2m_ram_rw);
972 }
974 static void
975 l3e_propagate_from_guest(struct vcpu *v,
976 guest_l3e_t gl3e,
977 mfn_t sl2mfn,
978 shadow_l3e_t *sl3e,
979 fetch_type_t ft)
980 {
981 _sh_propagate(v, gl3e.l3, sl2mfn, sl3e, 3, ft, p2m_ram_rw);
982 }
983 #endif // GUEST_PAGING_LEVELS >= 4
985 static void
986 l2e_propagate_from_guest(struct vcpu *v,
987 guest_l2e_t gl2e,
988 mfn_t sl1mfn,
989 shadow_l2e_t *sl2e,
990 fetch_type_t ft)
991 {
992 _sh_propagate(v, gl2e.l2, sl1mfn, sl2e, 2, ft, p2m_ram_rw);
993 }
995 static void
996 l1e_propagate_from_guest(struct vcpu *v,
997 guest_l1e_t gl1e,
998 mfn_t gmfn,
999 shadow_l1e_t *sl1e,
1000 fetch_type_t ft,
1001 p2m_type_t p2mt)
1003 _sh_propagate(v, gl1e.l1, gmfn, sl1e, 1, ft, p2mt);
1007 /**************************************************************************/
1008 /* These functions update shadow entries (and do bookkeeping on the shadow
1009 * tables they are in). It is intended that they are the only
1010 * functions which ever write (non-zero) data onto a shadow page.
1011 */
1013 static inline void safe_write_entry(void *dst, void *src)
1014 /* Copy one PTE safely when processors might be running on the
1015 * destination pagetable. This does *not* give safety against
1016 * concurrent writes (that's what the shadow lock is for), just
1017 * stops the hardware picking up partially written entries. */
1019 volatile unsigned long *d = dst;
1020 unsigned long *s = src;
1021 ASSERT(!((unsigned long) d & (sizeof (shadow_l1e_t) - 1)));
1022 #if CONFIG_PAGING_LEVELS == 3
1023 /* In PAE mode, pagetable entries are larger
1024 * than machine words, so won't get written atomically. We need to make
1025 * sure any other cpu running on these shadows doesn't see a
1026 * half-written entry. Do this by marking the entry not-present first,
1027 * then writing the high word before the low word. */
1028 BUILD_BUG_ON(sizeof (shadow_l1e_t) != 2 * sizeof (unsigned long));
1029 d[0] = 0;
1030 d[1] = s[1];
1031 d[0] = s[0];
1032 #else
1033 /* In 64-bit, sizeof(pte) == sizeof(ulong) == 1 word,
1034 * which will be an atomic write, since the entry is aligned. */
1035 BUILD_BUG_ON(sizeof (shadow_l1e_t) != sizeof (unsigned long));
1036 *d = *s;
1037 #endif
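/* A standalone sketch of the PAE ordering above: clearing the low
 * (present) word first and writing it last means another CPU can never
 * see a present entry whose two halves come from different PTEs.
 * Types and names here are illustrative only.
 *
 *   #include <stdint.h>
 *   typedef union { uint64_t pte; uint32_t word[2]; } pae_pte_t;
 *   static void safe_write_pae(volatile uint32_t *dst, const pae_pte_t *src)
 *   {
 *       dst[0] = 0;              // entry now not-present: safe to change
 *       dst[1] = src->word[1];   // high word (upper address bits, NX)
 *       dst[0] = src->word[0];   // low word last: entry becomes visible
 *   }
 */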
1041 static inline void
1042 shadow_write_entries(void *d, void *s, int entries, mfn_t mfn)
1043 /* This function does the actual writes to shadow pages.
1044 * It must not be called directly, since it doesn't do the bookkeeping
1045 * that shadow_set_l*e() functions do. */
1047 shadow_l1e_t *dst = d;
1048 shadow_l1e_t *src = s;
1049 void *map = NULL;
1050 int i;
1052 /* Because we mirror access rights at all levels in the shadow, an
1053 * l2 (or higher) entry with the RW bit cleared will leave us with
1054 * no write access through the linear map.
1055 * We detect that by writing to the shadow with copy_to_user() and
1056 * using map_domain_page() to get a writeable mapping if we need to. */
1057 if ( __copy_to_user(d, d, sizeof (unsigned long)) != 0 )
1059 perfc_incr(shadow_linear_map_failed);
1060 map = sh_map_domain_page(mfn);
1061 ASSERT(map != NULL);
1062 dst = map + ((unsigned long)dst & (PAGE_SIZE - 1));
1066 for ( i = 0; i < entries; i++ )
1067 safe_write_entry(dst++, src++);
1069 if ( map != NULL ) sh_unmap_domain_page(map);
1072 static inline int
1073 perms_strictly_increased(u32 old_flags, u32 new_flags)
1074 /* Given the flags of two entries, are the new flags a strict
1075 * increase in rights over the old ones? */
1077 u32 of = old_flags & (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX);
1078 u32 nf = new_flags & (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX);
1079 /* Flip the NX bit, since it's the only one that decreases rights;
1080 * we calculate as if it were an "X" bit. */
1081 of ^= _PAGE_NX_BIT;
1082 nf ^= _PAGE_NX_BIT;
1083 /* If the changed bits are all set in the new flags, then rights strictly
1084 * increased between old and new. */
1085 return ((of | (of ^ nf)) == nf);
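/* Worked example of the bit test above (flag values are the standard
 * x86 low PTE bits, used here purely for illustration):
 *
 *   #include <assert.h>
 *   #define F_P  0x01u   // PRESENT
 *   #define F_RW 0x02u
 *   #define F_US 0x04u
 *   static int strictly_increased(unsigned int of, unsigned int nf)
 *   { return ((of | (of ^ nf)) == nf); }
 *   int main(void)
 *   {
 *       assert(strictly_increased(F_P, F_P | F_RW));         // gained RW
 *       assert(!strictly_increased(F_P | F_US, F_P | F_RW)); // lost USER
 *       assert(!strictly_increased(F_P | F_RW, F_P));        // lost RW
 *       return 0;
 *   }
 */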
1088 static int inline
1089 shadow_get_page_from_l1e(shadow_l1e_t sl1e, struct domain *d)
1091 int res;
1092 mfn_t mfn;
1093 struct domain *owner;
1095 ASSERT(!sh_l1e_is_magic(sl1e));
1097 if ( !shadow_mode_refcounts(d) )
1098 return 1;
1100 res = get_page_from_l1e(sl1e, d);
1102 // If a privileged domain is attempting to install a map of a page it does
1103 // not own, we let it succeed anyway.
1104 //
1105 if ( unlikely(!res) &&
1106 !shadow_mode_translate(d) &&
1107 mfn_valid(mfn = shadow_l1e_get_mfn(sl1e)) &&
1108 (owner = page_get_owner(mfn_to_page(mfn))) &&
1109 (d != owner) &&
1110 IS_PRIV_FOR(d, owner))
1112 res = get_page_from_l1e(sl1e, owner);
1113 SHADOW_PRINTK("privileged domain %d installs map of mfn %05lx "
1114 "which is owned by domain %d: %s\n",
1115 d->domain_id, mfn_x(mfn), owner->domain_id,
1116 res ? "success" : "failed");
1119 if ( unlikely(!res) )
1121 perfc_incr(shadow_get_page_fail);
1122 SHADOW_PRINTK("failed: l1e=" SH_PRI_pte "\n");
1125 return res;
1128 static void inline
1129 shadow_put_page_from_l1e(shadow_l1e_t sl1e, struct domain *d)
1131 if ( !shadow_mode_refcounts(d) )
1132 return;
1134 put_page_from_l1e(sl1e, d);
1137 #if GUEST_PAGING_LEVELS >= 4
1138 static int shadow_set_l4e(struct vcpu *v,
1139 shadow_l4e_t *sl4e,
1140 shadow_l4e_t new_sl4e,
1141 mfn_t sl4mfn)
1143 int flags = 0, ok;
1144 shadow_l4e_t old_sl4e;
1145 paddr_t paddr;
1146 ASSERT(sl4e != NULL);
1147 old_sl4e = *sl4e;
1149 if ( old_sl4e.l4 == new_sl4e.l4 ) return 0; /* Nothing to do */
1151 paddr = ((((paddr_t)mfn_x(sl4mfn)) << PAGE_SHIFT)
1152 | (((unsigned long)sl4e) & ~PAGE_MASK));
1154 if ( shadow_l4e_get_flags(new_sl4e) & _PAGE_PRESENT )
1156 /* About to install a new reference */
1157 mfn_t sl3mfn = shadow_l4e_get_mfn(new_sl4e);
1158 ok = sh_get_ref(v, sl3mfn, paddr);
1159 /* Are we pinning l3 shadows to handle weird linux behaviour? */
1160 if ( sh_type_is_pinnable(v, SH_type_l3_64_shadow) )
1161 ok |= sh_pin(v, sl3mfn);
1162 if ( !ok )
1164 domain_crash(v->domain);
1165 return SHADOW_SET_ERROR;
1167 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC )
1168 shadow_resync_all(v, 0);
1169 #endif
1172 /* Write the new entry */
1173 shadow_write_entries(sl4e, &new_sl4e, 1, sl4mfn);
1174 flags |= SHADOW_SET_CHANGED;
1176 if ( shadow_l4e_get_flags(old_sl4e) & _PAGE_PRESENT )
1178 /* We lost a reference to an old mfn. */
1179 mfn_t osl3mfn = shadow_l4e_get_mfn(old_sl4e);
1180 if ( (mfn_x(osl3mfn) != mfn_x(shadow_l4e_get_mfn(new_sl4e)))
1181 || !perms_strictly_increased(shadow_l4e_get_flags(old_sl4e),
1182 shadow_l4e_get_flags(new_sl4e)) )
1184 flags |= SHADOW_SET_FLUSH;
1186 sh_put_ref(v, osl3mfn, paddr);
1188 return flags;
1191 static int shadow_set_l3e(struct vcpu *v,
1192 shadow_l3e_t *sl3e,
1193 shadow_l3e_t new_sl3e,
1194 mfn_t sl3mfn)
1196 int flags = 0;
1197 shadow_l3e_t old_sl3e;
1198 paddr_t paddr;
1199 ASSERT(sl3e != NULL);
1200 old_sl3e = *sl3e;
1202 if ( old_sl3e.l3 == new_sl3e.l3 ) return 0; /* Nothing to do */
1204 paddr = ((((paddr_t)mfn_x(sl3mfn)) << PAGE_SHIFT)
1205 | (((unsigned long)sl3e) & ~PAGE_MASK));
1207 if ( shadow_l3e_get_flags(new_sl3e) & _PAGE_PRESENT )
1209 /* About to install a new reference */
1210 if ( !sh_get_ref(v, shadow_l3e_get_mfn(new_sl3e), paddr) )
1212 domain_crash(v->domain);
1213 return SHADOW_SET_ERROR;
1215 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC )
1216 shadow_resync_all(v, 0);
1217 #endif
1220 /* Write the new entry */
1221 shadow_write_entries(sl3e, &new_sl3e, 1, sl3mfn);
1222 flags |= SHADOW_SET_CHANGED;
1224 if ( shadow_l3e_get_flags(old_sl3e) & _PAGE_PRESENT )
1226 /* We lost a reference to an old mfn. */
1227 mfn_t osl2mfn = shadow_l3e_get_mfn(old_sl3e);
1228 if ( (mfn_x(osl2mfn) != mfn_x(shadow_l3e_get_mfn(new_sl3e))) ||
1229 !perms_strictly_increased(shadow_l3e_get_flags(old_sl3e),
1230 shadow_l3e_get_flags(new_sl3e)) )
1232 flags |= SHADOW_SET_FLUSH;
1234 sh_put_ref(v, osl2mfn, paddr);
1236 return flags;
1238 #endif /* GUEST_PAGING_LEVELS >= 4 */
1240 static int shadow_set_l2e(struct vcpu *v,
1241 shadow_l2e_t *sl2e,
1242 shadow_l2e_t new_sl2e,
1243 mfn_t sl2mfn)
1245 int flags = 0;
1246 shadow_l2e_t old_sl2e;
1247 paddr_t paddr;
1249 #if GUEST_PAGING_LEVELS == 2
1250 /* In 2-on-3 we work with pairs of l2es pointing at two-page
1251 * shadows. Reference counting and up-pointers track from the first
1252 * page of the shadow to the first l2e, so make sure that we're
1253 * working with those:
1254 * Align the pointer down so it's pointing at the first of the pair */
1255 sl2e = (shadow_l2e_t *)((unsigned long)sl2e & ~(sizeof(shadow_l2e_t)));
1256 /* Align the mfn of the shadow entry too */
1257 new_sl2e.l2 &= ~(1<<PAGE_SHIFT);
1258 #endif
1260 ASSERT(sl2e != NULL);
1261 old_sl2e = *sl2e;
1263 if ( old_sl2e.l2 == new_sl2e.l2 ) return 0; /* Nothing to do */
1265 paddr = ((((paddr_t)mfn_x(sl2mfn)) << PAGE_SHIFT)
1266 | (((unsigned long)sl2e) & ~PAGE_MASK));
1268 if ( shadow_l2e_get_flags(new_sl2e) & _PAGE_PRESENT )
1270 mfn_t sl1mfn = shadow_l2e_get_mfn(new_sl2e);
1272 /* About to install a new reference */
1273 if ( !sh_get_ref(v, sl1mfn, paddr) )
1275 domain_crash(v->domain);
1276 return SHADOW_SET_ERROR;
1278 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
1280 struct shadow_page_info *sp = mfn_to_shadow_page(sl1mfn);
1281 mfn_t gl1mfn = _mfn(sp->backpointer);
1283 /* If the shadow is a fl1 then the backpointer contains
1284 the GFN instead of the GMFN, and it's definitely not
1285 OOS. */
1286 if ( (sp->type != SH_type_fl1_shadow) && mfn_valid(gl1mfn)
1287 && mfn_is_out_of_sync(gl1mfn) )
1288 sh_resync(v, gl1mfn);
1290 #endif
1293 /* Write the new entry */
1294 #if GUEST_PAGING_LEVELS == 2
1296 shadow_l2e_t pair[2] = { new_sl2e, new_sl2e };
1297 /* The l1 shadow is two pages long and needs to be pointed to by
1298 * two adjacent l2es. The pair have the same flags, but point
1299 * at odd and even MFNs */
1300 ASSERT(!(pair[0].l2 & (1<<PAGE_SHIFT)));
1301 pair[1].l2 |= (1<<PAGE_SHIFT);
1302 shadow_write_entries(sl2e, &pair, 2, sl2mfn);
1304 #else /* normal case */
1305 shadow_write_entries(sl2e, &new_sl2e, 1, sl2mfn);
1306 #endif
1307 flags |= SHADOW_SET_CHANGED;
1309 if ( shadow_l2e_get_flags(old_sl2e) & _PAGE_PRESENT )
1311 /* We lost a reference to an old mfn. */
1312 mfn_t osl1mfn = shadow_l2e_get_mfn(old_sl2e);
1313 if ( (mfn_x(osl1mfn) != mfn_x(shadow_l2e_get_mfn(new_sl2e))) ||
1314 !perms_strictly_increased(shadow_l2e_get_flags(old_sl2e),
1315 shadow_l2e_get_flags(new_sl2e)) )
1317 flags |= SHADOW_SET_FLUSH;
1319 sh_put_ref(v, osl1mfn, paddr);
1321 return flags;
1324 static inline void shadow_vram_get_l1e(shadow_l1e_t new_sl1e,
1325 shadow_l1e_t *sl1e,
1326 mfn_t sl1mfn,
1327 struct domain *d)
1329 mfn_t mfn;
1330 unsigned long gfn;
1332 if ( !d->dirty_vram ) return;
1334 mfn = shadow_l1e_get_mfn(new_sl1e);
1336 if ( !mfn_valid(mfn) ) return; /* m2p for mmio_direct may not exist */
1338 gfn = mfn_to_gfn(d, mfn);
1340 if ( (gfn >= d->dirty_vram->begin_pfn) && (gfn < d->dirty_vram->end_pfn) ) {
1341 unsigned long i = gfn - d->dirty_vram->begin_pfn;
1342 struct page_info *page = mfn_to_page(mfn);
1343 u32 count_info = page->u.inuse.type_info & PGT_count_mask;
1345 if ( count_info == 1 )
1346 /* Initial guest reference, record it */
1347 d->dirty_vram->sl1ma[i] = pfn_to_paddr(mfn_x(sl1mfn))
1348 | ((unsigned long)sl1e & ~PAGE_MASK);
1352 static inline void shadow_vram_put_l1e(shadow_l1e_t old_sl1e,
1353 shadow_l1e_t *sl1e,
1354 mfn_t sl1mfn,
1355 struct domain *d)
1357 mfn_t mfn;
1358 unsigned long gfn;
1360 if ( !d->dirty_vram ) return;
1362 mfn = shadow_l1e_get_mfn(old_sl1e);
1364 if ( !mfn_valid(mfn) ) return;
1366 gfn = mfn_to_gfn(d, mfn);
1368 if ( (gfn >= d->dirty_vram->begin_pfn) && (gfn < d->dirty_vram->end_pfn) ) {
1369 unsigned long i = gfn - d->dirty_vram->begin_pfn;
1370 struct page_info *page = mfn_to_page(mfn);
1371 u32 count_info = page->u.inuse.type_info & PGT_count_mask;
1372 int dirty = 0;
1373 paddr_t sl1ma = pfn_to_paddr(mfn_x(sl1mfn))
1374 | ((unsigned long)sl1e & ~PAGE_MASK);
1376 if ( count_info == 1 ) {
1377 /* Last reference */
1378 if ( d->dirty_vram->sl1ma[i] == INVALID_PADDR ) {
1379 /* We didn't know it was that one, let's say it is dirty */
1380 dirty = 1;
1381 } else {
1382 ASSERT(d->dirty_vram->sl1ma[i] == sl1ma);
1383 d->dirty_vram->sl1ma[i] = INVALID_PADDR;
1384 if ( shadow_l1e_get_flags(old_sl1e) & _PAGE_DIRTY )
1385 dirty = 1;
1387 } else {
1388 /* We had more than one reference, just consider the page dirty. */
1389 dirty = 1;
1390 /* Check that it's not the one we recorded. */
1391 if ( d->dirty_vram->sl1ma[i] == sl1ma ) {
1392 /* Too bad, we remembered the wrong one... */
1393 d->dirty_vram->sl1ma[i] = INVALID_PADDR;
1394 } else {
1395 /* Ok, our recorded sl1e is still pointing to this page, let's
1396 * just hope it will remain. */
1399 if ( dirty ) {
1400 d->dirty_vram->dirty_bitmap[i / 8] |= 1 << (i % 8);
1401 d->dirty_vram->last_dirty = NOW();
1406 static int shadow_set_l1e(struct vcpu *v,
1407 shadow_l1e_t *sl1e,
1408 shadow_l1e_t new_sl1e,
1409 mfn_t sl1mfn)
1411 int flags = 0;
1412 struct domain *d = v->domain;
1413 shadow_l1e_t old_sl1e;
1414 #if SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC
1415 mfn_t new_gmfn = shadow_l1e_get_mfn(new_sl1e);
1416 #endif
1417 ASSERT(sl1e != NULL);
1419 old_sl1e = *sl1e;
1421 if ( old_sl1e.l1 == new_sl1e.l1 ) return 0; /* Nothing to do */
1423 if ( (shadow_l1e_get_flags(new_sl1e) & _PAGE_PRESENT)
1424 && !sh_l1e_is_magic(new_sl1e) )
1426 /* About to install a new reference */
1427 if ( shadow_mode_refcounts(d) ) {
1428 if ( shadow_get_page_from_l1e(new_sl1e, d) == 0 )
1430 /* Doesn't look like a pagetable. */
1431 flags |= SHADOW_SET_ERROR;
1432 new_sl1e = shadow_l1e_empty();
1434 else
1436 shadow_vram_get_l1e(new_sl1e, sl1e, sl1mfn, d);
1437 #if SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC
1438 if ( mfn_valid(new_gmfn) && mfn_oos_may_write(new_gmfn)
1439 && (shadow_l1e_get_flags(new_sl1e) & _PAGE_RW) )
1441 oos_fixup_add(v, new_gmfn, sl1mfn, pgentry_ptr_to_slot(sl1e));
1443 #endif
1449 /* Write the new entry */
1450 shadow_write_entries(sl1e, &new_sl1e, 1, sl1mfn);
1451 flags |= SHADOW_SET_CHANGED;
1453 if ( (shadow_l1e_get_flags(old_sl1e) & _PAGE_PRESENT)
1454 && !sh_l1e_is_magic(old_sl1e) )
1456 /* We lost a reference to an old mfn. */
1457 /* N.B. Unlike higher-level sets, never need an extra flush
1458 * when writing an l1e. Because it points to the same guest frame
1459 * as the guest l1e did, it's the guest's responsibility to
1460 * trigger a flush later. */
1461 if ( shadow_mode_refcounts(d) )
1463 shadow_vram_put_l1e(old_sl1e, sl1e, sl1mfn, d);
1464 shadow_put_page_from_l1e(old_sl1e, d);
1467 return flags;
1471 /**************************************************************************/
1472 /* Macros to walk pagetables. These take the shadow of a pagetable and
1473 * walk every "interesting" entry. That is, they don't touch Xen mappings,
1474 * and for 32-bit l2s shadowed onto PAE or 64-bit, they only touch every
1475 * second entry (since pairs of entries are managed together). For multi-page
1476 * shadows they walk all pages.
1478 * Arguments are an MFN, the variable to point to each entry, a variable
1479 * to indicate that we are done (we will shortcut to the end of the scan
1480 * when _done != 0), a variable to indicate that we should avoid Xen mappings,
1481 * and the code.
1483 * WARNING: These macros have side-effects. They change the values of both
1484 * the pointer and the MFN. */
1486 static inline void increment_ptr_to_guest_entry(void *ptr)
1488 if ( ptr )
1490 guest_l1e_t **entry = ptr;
1491 (*entry)++;
1495 /* All kinds of l1: touch all entries */
1496 #define _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code) \
1497 do { \
1498 int _i; \
1499 shadow_l1e_t *_sp = sh_map_domain_page((_sl1mfn)); \
1500 ASSERT(mfn_to_shadow_page(_sl1mfn)->type == SH_type_l1_shadow \
1501 || mfn_to_shadow_page(_sl1mfn)->type == SH_type_fl1_shadow); \
1502 for ( _i = 0; _i < SHADOW_L1_PAGETABLE_ENTRIES; _i++ ) \
1503 { \
1504 (_sl1e) = _sp + _i; \
1505 if ( shadow_l1e_get_flags(*(_sl1e)) & _PAGE_PRESENT ) \
1506 {_code} \
1507 if ( _done ) break; \
1508 increment_ptr_to_guest_entry(_gl1p); \
1509 } \
1510 sh_unmap_domain_page(_sp); \
1511 } while (0)
1513 /* 32-bit l1, on PAE or 64-bit shadows: need to walk both pages of shadow */
1514 #if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2
1515 #define SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code) \
1516 do { \
1517 int __done = 0; \
1518 _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, \
1519 ({ (__done = _done); }), _code); \
1520 _sl1mfn = _mfn(mfn_x(_sl1mfn) + 1); \
1521 if ( !__done ) \
1522 _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, \
1523 ({ (__done = _done); }), _code); \
1524 } while (0)
1525 #else /* Everything else; l1 shadows are only one page */
1526 #define SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code) \
1527 _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code)
1528 #endif
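/* Sketch of how these FOREACH macros are used (the pattern only; the
 * body shown is illustrative, not a real caller in this file): _sl1e is
 * left pointing at each present entry in turn while _code runs, and
 * setting the "done" expression non-zero stops the scan early.
 *
 *   shadow_l1e_t *sl1e;
 *   int done = 0;
 *   SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, done,
 *   {
 *       // inspect or rewrite *sl1e here; e.g. stop at the first RW entry
 *       if ( shadow_l1e_get_flags(*sl1e) & _PAGE_RW )
 *           done = 1;
 *   });
 */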
1531 #if GUEST_PAGING_LEVELS == 2
1533 /* 32-bit l2 on PAE/64: four pages, touch every second entry, and avoid Xen */
1534 #define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _dom, _code) \
1535 do { \
1536 int _i, _j, __done = 0; \
1537 int _xen = !shadow_mode_external(_dom); \
1538 ASSERT(mfn_to_shadow_page(_sl2mfn)->type == SH_type_l2_32_shadow); \
1539 for ( _j = 0; _j < 4 && !__done; _j++ ) \
1540 { \
1541 shadow_l2e_t *_sp = sh_map_domain_page(_sl2mfn); \
1542 for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i += 2 ) \
1543 if ( (!(_xen)) \
1544 || ((_j * SHADOW_L2_PAGETABLE_ENTRIES) + _i) \
1545 < (HYPERVISOR_VIRT_START >> SHADOW_L2_PAGETABLE_SHIFT) ) \
1546 { \
1547 (_sl2e) = _sp + _i; \
1548 if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \
1549 {_code} \
1550 if ( (__done = (_done)) ) break; \
1551 increment_ptr_to_guest_entry(_gl2p); \
1552 } \
1553 sh_unmap_domain_page(_sp); \
1554 _sl2mfn = _mfn(mfn_x(_sl2mfn) + 1); \
1555 } \
1556 } while (0)
1558 #elif GUEST_PAGING_LEVELS == 3
1560 /* PAE: if it's an l2h, don't touch Xen mappings */
1561 #define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _dom, _code) \
1562 do { \
1563 int _i; \
1564 int _xen = !shadow_mode_external(_dom); \
1565 shadow_l2e_t *_sp = sh_map_domain_page((_sl2mfn)); \
1566 ASSERT(mfn_to_shadow_page(_sl2mfn)->type == SH_type_l2_pae_shadow \
1567 || mfn_to_shadow_page(_sl2mfn)->type == SH_type_l2h_pae_shadow);\
1568 for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i++ ) \
1569 if ( (!(_xen)) \
1570 || mfn_to_shadow_page(_sl2mfn)->type != SH_type_l2h_pae_shadow\
1571 || ((_i + (3 * SHADOW_L2_PAGETABLE_ENTRIES)) \
1572 < (HYPERVISOR_VIRT_START >> SHADOW_L2_PAGETABLE_SHIFT)) ) \
1573 { \
1574 (_sl2e) = _sp + _i; \
1575 if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \
1576 {_code} \
1577 if ( _done ) break; \
1578 increment_ptr_to_guest_entry(_gl2p); \
1579 } \
1580 sh_unmap_domain_page(_sp); \
1581 } while (0)
1583 #else
1585 /* 64-bit l2: touch all entries except for PAE compat guests. */
1586 #define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _dom, _code) \
1587 do { \
1588 int _i; \
1589 int _xen = !shadow_mode_external(_dom); \
1590 shadow_l2e_t *_sp = sh_map_domain_page((_sl2mfn)); \
1591 ASSERT(mfn_to_shadow_page(_sl2mfn)->type == SH_type_l2_64_shadow || \
1592 mfn_to_shadow_page(_sl2mfn)->type == SH_type_l2h_64_shadow); \
1593 for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i++ ) \
1594 { \
1595 if ( (!(_xen)) \
1596 || !is_pv_32on64_domain(_dom) \
1597 || mfn_to_shadow_page(_sl2mfn)->type != SH_type_l2h_64_shadow \
1598 || (_i < COMPAT_L2_PAGETABLE_FIRST_XEN_SLOT(_dom)) ) \
1599 { \
1600 (_sl2e) = _sp + _i; \
1601 if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \
1602 {_code} \
1603 if ( _done ) break; \
1604 increment_ptr_to_guest_entry(_gl2p); \
1605 } \
1606 } \
1607 sh_unmap_domain_page(_sp); \
1608 } while (0)
1610 #endif /* different kinds of l2 */
1612 #if GUEST_PAGING_LEVELS == 4
1614 /* 64-bit l3: touch all entries */
1615 #define SHADOW_FOREACH_L3E(_sl3mfn, _sl3e, _gl3p, _done, _code) \
1616 do { \
1617 int _i; \
1618 shadow_l3e_t *_sp = sh_map_domain_page((_sl3mfn)); \
1619 ASSERT(mfn_to_shadow_page(_sl3mfn)->type == SH_type_l3_64_shadow); \
1620 for ( _i = 0; _i < SHADOW_L3_PAGETABLE_ENTRIES; _i++ ) \
1621 { \
1622 (_sl3e) = _sp + _i; \
1623 if ( shadow_l3e_get_flags(*(_sl3e)) & _PAGE_PRESENT ) \
1624 {_code} \
1625 if ( _done ) break; \
1626 increment_ptr_to_guest_entry(_gl3p); \
1627 } \
1628 sh_unmap_domain_page(_sp); \
1629 } while (0)
1631 /* 64-bit l4: avoid Xen mappings */
1632 #define SHADOW_FOREACH_L4E(_sl4mfn, _sl4e, _gl4p, _done, _dom, _code) \
1633 do { \
1634 shadow_l4e_t *_sp = sh_map_domain_page((_sl4mfn)); \
1635 int _xen = !shadow_mode_external(_dom); \
1636 int _i; \
1637 ASSERT(mfn_to_shadow_page(_sl4mfn)->type == SH_type_l4_64_shadow); \
1638 for ( _i = 0; _i < SHADOW_L4_PAGETABLE_ENTRIES; _i++ ) \
1639 { \
1640 if ( (!(_xen)) || is_guest_l4_slot(_dom, _i) ) \
1641 { \
1642 (_sl4e) = _sp + _i; \
1643 if ( shadow_l4e_get_flags(*(_sl4e)) & _PAGE_PRESENT ) \
1644 {_code} \
1645 if ( _done ) break; \
1646 } \
1647 increment_ptr_to_guest_entry(_gl4p); \
1648 } \
1649 sh_unmap_domain_page(_sp); \
1650 } while (0)
1652 #endif
1656 /**************************************************************************/
1657 /* Functions to install Xen mappings and linear mappings in shadow pages */
1659 // XXX -- this function should probably be moved to shadow-common.c, but that
1660 // probably wants to wait until the shadow types have been moved from
1661 // shadow-types.h to shadow-private.h
1662 //
1663 #if CONFIG_PAGING_LEVELS == 4 && GUEST_PAGING_LEVELS == 4
1664 void sh_install_xen_entries_in_l4(struct vcpu *v, mfn_t gl4mfn, mfn_t sl4mfn)
1666 struct domain *d = v->domain;
1667 shadow_l4e_t *sl4e;
1669 sl4e = sh_map_domain_page(sl4mfn);
1670 ASSERT(sl4e != NULL);
1671 ASSERT(sizeof (l4_pgentry_t) == sizeof (shadow_l4e_t));
1673 /* Copy the common Xen mappings from the idle domain */
1674 memcpy(&sl4e[ROOT_PAGETABLE_FIRST_XEN_SLOT],
1675 &idle_pg_table[ROOT_PAGETABLE_FIRST_XEN_SLOT],
1676 ROOT_PAGETABLE_XEN_SLOTS * sizeof(l4_pgentry_t));
1678 /* Install the per-domain mappings for this domain */
1679 sl4e[shadow_l4_table_offset(PERDOMAIN_VIRT_START)] =
1680 shadow_l4e_from_mfn(page_to_mfn(virt_to_page(d->arch.mm_perdomain_l3)),
1681 __PAGE_HYPERVISOR);
1683 /* Shadow linear mapping for 4-level shadows. N.B. for 3-level
1684 * shadows on 64-bit xen, this linear mapping is later replaced by the
1685 * monitor pagetable structure, which is built in make_monitor_table
1686 * and maintained by sh_update_linear_entries. */
1687 sl4e[shadow_l4_table_offset(SH_LINEAR_PT_VIRT_START)] =
1688 shadow_l4e_from_mfn(sl4mfn, __PAGE_HYPERVISOR);
1690 /* Self linear mapping. */
1691 if ( shadow_mode_translate(v->domain) && !shadow_mode_external(v->domain) )
1693 // linear tables may not be used with translated PV guests
1694 sl4e[shadow_l4_table_offset(LINEAR_PT_VIRT_START)] =
1695 shadow_l4e_empty();
1697 else
1699 sl4e[shadow_l4_table_offset(LINEAR_PT_VIRT_START)] =
1700 shadow_l4e_from_mfn(gl4mfn, __PAGE_HYPERVISOR);
1703 if ( shadow_mode_translate(v->domain) )
1705 /* install domain-specific P2M table */
1706 sl4e[shadow_l4_table_offset(RO_MPT_VIRT_START)] =
1707 shadow_l4e_from_mfn(pagetable_get_mfn(d->arch.phys_table),
1708 __PAGE_HYPERVISOR);
1711 sh_unmap_domain_page(sl4e);
1713 #endif
1715 #if CONFIG_PAGING_LEVELS >= 3 && GUEST_PAGING_LEVELS >= 3
1716 // For 3-on-3 PV guests, we need to make sure the xen mappings are in
1717 // place, which means that we need to populate the l2h entry in the l3
1718 // table.
1720 static void sh_install_xen_entries_in_l2h(struct vcpu *v, mfn_t sl2hmfn)
1722 struct domain *d = v->domain;
1723 shadow_l2e_t *sl2e;
1724 #if CONFIG_PAGING_LEVELS == 3
1725 int i;
1726 #else
1728 if ( !is_pv_32on64_vcpu(v) )
1729 return;
1730 #endif
1732 sl2e = sh_map_domain_page(sl2hmfn);
1733 ASSERT(sl2e != NULL);
1734 ASSERT(sizeof (l2_pgentry_t) == sizeof (shadow_l2e_t));
1736 #if CONFIG_PAGING_LEVELS == 3
1738 /* Copy the common Xen mappings from the idle domain */
1739 memcpy(&sl2e[L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1)],
1740 &idle_pg_table_l2[L2_PAGETABLE_FIRST_XEN_SLOT],
1741 L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
1743 /* Install the per-domain mappings for this domain */
1744 for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
1745 sl2e[shadow_l2_table_offset(PERDOMAIN_VIRT_START) + i] =
1746 shadow_l2e_from_mfn(
1747 page_to_mfn(virt_to_page(d->arch.mm_perdomain_pt) + i),
1748 __PAGE_HYPERVISOR);
1750 /* We don't set up a linear mapping here because we can't until this
1751 * l2h is installed in an l3e. sh_update_linear_entries() handles
1752 * the linear mappings when CR3 (and so the fourth l3e) is loaded.
1753 * We zero them here, just as a safety measure.
1754 */
1755 for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
1756 sl2e[shadow_l2_table_offset(LINEAR_PT_VIRT_START) + i] =
1757 shadow_l2e_empty();
1758 for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
1759 sl2e[shadow_l2_table_offset(SH_LINEAR_PT_VIRT_START) + i] =
1760 shadow_l2e_empty();
1762 if ( shadow_mode_translate(d) )
1764 /* Install the domain-specific p2m table */
1765 l3_pgentry_t *p2m;
1766 ASSERT(pagetable_get_pfn(d->arch.phys_table) != 0);
1767 p2m = sh_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
1768 for ( i = 0; i < MACHPHYS_MBYTES>>1; i++ )
1770 sl2e[shadow_l2_table_offset(RO_MPT_VIRT_START) + i] =
1771 (l3e_get_flags(p2m[i]) & _PAGE_PRESENT)
1772 ? shadow_l2e_from_mfn(_mfn(l3e_get_pfn(p2m[i])),
1773 __PAGE_HYPERVISOR)
1774 : shadow_l2e_empty();
1776 sh_unmap_domain_page(p2m);
1779 #else
1781 /* Copy the common Xen mappings from the idle domain */
1782 memcpy(
1783 &sl2e[COMPAT_L2_PAGETABLE_FIRST_XEN_SLOT(d)],
1784 &compat_idle_pg_table_l2[l2_table_offset(HIRO_COMPAT_MPT_VIRT_START)],
1785 COMPAT_L2_PAGETABLE_XEN_SLOTS(d) * sizeof(*sl2e));
1787 #endif
1789 sh_unmap_domain_page(sl2e);
1791 #endif
1797 /**************************************************************************/
1798 /* Create a shadow of a given guest page.
1799 */
1800 static mfn_t
1801 sh_make_shadow(struct vcpu *v, mfn_t gmfn, u32 shadow_type)
1803 mfn_t smfn = shadow_alloc(v->domain, shadow_type, mfn_x(gmfn));
1804 SHADOW_DEBUG(MAKE_SHADOW, "(%05lx, %u)=>%05lx\n",
1805 mfn_x(gmfn), shadow_type, mfn_x(smfn));
1807 if ( shadow_type != SH_type_l2_32_shadow
1808 && shadow_type != SH_type_l2_pae_shadow
1809 && shadow_type != SH_type_l2h_pae_shadow
1810 && shadow_type != SH_type_l4_64_shadow )
1811 /* Lower-level shadow, not yet linked from a higher level */
1812 mfn_to_shadow_page(smfn)->up = 0;
1814 #if GUEST_PAGING_LEVELS == 4
1815 #if (SHADOW_OPTIMIZATIONS & SHOPT_LINUX_L3_TOPLEVEL)
1816 if ( shadow_type == SH_type_l4_64_shadow &&
1817 unlikely(v->domain->arch.paging.shadow.opt_flags & SHOPT_LINUX_L3_TOPLEVEL) )
1819 /* We're shadowing a new l4, but we've been assuming the guest uses
1820 * only one l4 per vcpu and context switches using an l4 entry.
1821 * Count the number of active l4 shadows. If there are enough
1822 * of them, decide that this isn't an old linux guest, and stop
1823 * pinning l3es. This is not very quick but it doesn't happen
1824 * very often. */
1825 struct list_head *l, *t;
1826 struct shadow_page_info *sp;
1827 struct vcpu *v2;
1828 int l4count = 0, vcpus = 0;
1829 list_for_each(l, &v->domain->arch.paging.shadow.pinned_shadows)
1831 sp = list_entry(l, struct shadow_page_info, list);
1832 if ( sp->type == SH_type_l4_64_shadow )
1833 l4count++;
1835 for_each_vcpu ( v->domain, v2 )
1836 vcpus++;
1837 if ( l4count > 2 * vcpus )
1839 /* Unpin all the pinned l3 tables, and don't pin any more. */
1840 list_for_each_safe(l, t, &v->domain->arch.paging.shadow.pinned_shadows)
1842 sp = list_entry(l, struct shadow_page_info, list);
1843 if ( sp->type == SH_type_l3_64_shadow )
1844 sh_unpin(v, shadow_page_to_mfn(sp));
1846 v->domain->arch.paging.shadow.opt_flags &= ~SHOPT_LINUX_L3_TOPLEVEL;
1849 #endif
1850 #endif
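/* [Editorial sketch -- not part of multi.c] The threshold applied by the
 * SHOPT_LINUX_L3_TOPLEVEL code above, in isolation: once more than two
 * active l4 shadows per vcpu exist, the guest is assumed not to be an old
 * Linux that reuses a single l4 per vcpu, so l3 pinning is switched off.
 * The demo_ names are assumptions for this sketch only. */
#include <stdio.h>

static int demo_keep_pinning_l3s(int l4_shadows, int vcpus)
{
    /* Mirrors the "l4count > 2 * vcpus" test in sh_make_shadow(). */
    return l4_shadows <= 2 * vcpus;
}

int main(void)
{
    printf("%d\n", demo_keep_pinning_l3s(3, 2)); /* 1: still looks like old Linux */
    printf("%d\n", demo_keep_pinning_l3s(9, 2)); /* 0: stop pinning l3 shadows */
    return 0;
}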
1852 // Create the Xen mappings...
1853 if ( !shadow_mode_external(v->domain) )
1855 switch (shadow_type)
1857 #if CONFIG_PAGING_LEVELS == 4 && GUEST_PAGING_LEVELS == 4
1858 case SH_type_l4_shadow:
1859 sh_install_xen_entries_in_l4(v, gmfn, smfn); break;
1860 #endif
1861 #if CONFIG_PAGING_LEVELS >= 3 && GUEST_PAGING_LEVELS >= 3
1862 case SH_type_l2h_shadow:
1863 sh_install_xen_entries_in_l2h(v, smfn); break;
1864 #endif
1865 default: /* Do nothing */ break;
1869 shadow_promote(v, gmfn, shadow_type);
1870 set_shadow_status(v, gmfn, shadow_type, smfn);
1872 return smfn;
1875 /* Make a splintered superpage shadow */
1876 static mfn_t
1877 make_fl1_shadow(struct vcpu *v, gfn_t gfn)
1879 mfn_t smfn = shadow_alloc(v->domain, SH_type_fl1_shadow,
1880 (unsigned long) gfn_x(gfn));
1882 SHADOW_DEBUG(MAKE_SHADOW, "(%" SH_PRI_gfn ")=>%" PRI_mfn "\n",
1883 gfn_x(gfn), mfn_x(smfn));
1885 set_fl1_shadow_status(v, gfn, smfn);
1886 return smfn;
1890 #if SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS
1891 mfn_t
1892 sh_make_monitor_table(struct vcpu *v)
1894 struct domain *d = v->domain;
1896 ASSERT(pagetable_get_pfn(v->arch.monitor_table) == 0);
1898 /* Guarantee we can get the memory we need */
1899 shadow_prealloc(d, SH_type_monitor_table, CONFIG_PAGING_LEVELS);
1901 #if CONFIG_PAGING_LEVELS == 4
1903 mfn_t m4mfn;
1904 m4mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1905 sh_install_xen_entries_in_l4(v, m4mfn, m4mfn);
1906 /* Remember the level of this table */
1907 mfn_to_page(m4mfn)->shadow_flags = 4;
1908 #if SHADOW_PAGING_LEVELS < 4
1910 mfn_t m3mfn, m2mfn;
1911 l4_pgentry_t *l4e;
1912 l3_pgentry_t *l3e;
1913 /* Install an l3 table and an l2 table that will hold the shadow
1914 * linear map entries. This overrides the linear map entry that
1915 * was installed by sh_install_xen_entries_in_l4. */
1916 l4e = sh_map_domain_page(m4mfn);
1918 m3mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1919 mfn_to_page(m3mfn)->shadow_flags = 3;
1920 l4e[shadow_l4_table_offset(SH_LINEAR_PT_VIRT_START)]
1921 = l4e_from_pfn(mfn_x(m3mfn), __PAGE_HYPERVISOR);
1923 m2mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1924 mfn_to_page(m2mfn)->shadow_flags = 2;
1925 l3e = sh_map_domain_page(m3mfn);
1926 l3e[0] = l3e_from_pfn(mfn_x(m2mfn), __PAGE_HYPERVISOR);
1927 sh_unmap_domain_page(l3e);
1929 if ( is_pv_32on64_vcpu(v) )
1931 /* For 32-on-64 PV guests, we need to map the 32-bit Xen
1932 * area into its usual VAs in the monitor tables */
1933 m3mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1934 mfn_to_page(m3mfn)->shadow_flags = 3;
1935 l4e[0] = l4e_from_pfn(mfn_x(m3mfn), __PAGE_HYPERVISOR);
1937 m2mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1938 mfn_to_page(m2mfn)->shadow_flags = 2;
1939 l3e = sh_map_domain_page(m3mfn);
1940 l3e[3] = l3e_from_pfn(mfn_x(m2mfn), _PAGE_PRESENT);
1941 sh_install_xen_entries_in_l2h(v, m2mfn);
1942 sh_unmap_domain_page(l3e);
1945 sh_unmap_domain_page(l4e);
1947 #endif /* SHADOW_PAGING_LEVELS < 4 */
1948 return m4mfn;
1951 #elif CONFIG_PAGING_LEVELS == 3
1954 mfn_t m3mfn, m2mfn;
1955 l3_pgentry_t *l3e;
1956 l2_pgentry_t *l2e;
1957 int i;
1959 m3mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1960 /* Remember the level of this table */
1961 mfn_to_page(m3mfn)->shadow_flags = 3;
1963 // Install a monitor l2 table in slot 3 of the l3 table.
1964 // This is used for all Xen entries, including linear maps
1965 m2mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1966 mfn_to_page(m2mfn)->shadow_flags = 2;
1967 l3e = sh_map_domain_page(m3mfn);
1968 l3e[3] = l3e_from_pfn(mfn_x(m2mfn), _PAGE_PRESENT);
1969 sh_install_xen_entries_in_l2h(v, m2mfn);
1970 /* Install the monitor's own linear map */
1971 l2e = sh_map_domain_page(m2mfn);
1972 for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
1973 l2e[l2_table_offset(LINEAR_PT_VIRT_START) + i] =
1974 (l3e_get_flags(l3e[i]) & _PAGE_PRESENT)
1975 ? l2e_from_pfn(l3e_get_pfn(l3e[i]), __PAGE_HYPERVISOR)
1976 : l2e_empty();
1977 sh_unmap_domain_page(l2e);
1978 sh_unmap_domain_page(l3e);
1980 SHADOW_PRINTK("new monitor table: %#lx\n", mfn_x(m3mfn));
1981 return m3mfn;
1984 #else
1985 #error this should not happen
1986 #endif /* CONFIG_PAGING_LEVELS */
1988 #endif /* SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS */
1990 /**************************************************************************/
1991 /* These functions also take a virtual address and return the level-N
1992 * shadow table mfn and entry, but they create the shadow pagetables if
1993 * they are needed. The "demand" argument is non-zero when handling
1994 * a demand fault (so we know what to do about accessed bits &c).
1995 * If the necessary tables are not present in the guest, they return NULL. */
1997 /* N.B. The use of GUEST_PAGING_LEVELS here is correct. If the shadow has
1998 * more levels than the guest, the upper levels are always fixed and do not
1999 * reflect any information from the guest, so we do not use these functions
2000 * to access them. */
2002 #if GUEST_PAGING_LEVELS >= 4
2003 static shadow_l4e_t * shadow_get_and_create_l4e(struct vcpu *v,
2004 walk_t *gw,
2005 mfn_t *sl4mfn)
2007 /* There is always a shadow of the top level table. Get it. */
2008 *sl4mfn = pagetable_get_mfn(v->arch.shadow_table[0]);
2009 /* Reading the top level table is always valid. */
2010 return sh_linear_l4_table(v) + shadow_l4_linear_offset(gw->va);
2013 static shadow_l3e_t * shadow_get_and_create_l3e(struct vcpu *v,
2014 walk_t *gw,
2015 mfn_t *sl3mfn,
2016 fetch_type_t ft)
2018 mfn_t sl4mfn;
2019 shadow_l4e_t *sl4e;
2020 if ( !mfn_valid(gw->l3mfn) ) return NULL; /* No guest page. */
2021 /* Get the l4e */
2022 sl4e = shadow_get_and_create_l4e(v, gw, &sl4mfn);
2023 ASSERT(sl4e != NULL);
2024 if ( shadow_l4e_get_flags(*sl4e) & _PAGE_PRESENT )
2026 *sl3mfn = shadow_l4e_get_mfn(*sl4e);
2027 ASSERT(mfn_valid(*sl3mfn));
2029 else
2031 int r;
2032 shadow_l4e_t new_sl4e;
2033 /* No l3 shadow installed: find and install it. */
2034 *sl3mfn = get_shadow_status(v, gw->l3mfn, SH_type_l3_shadow);
2035 if ( !mfn_valid(*sl3mfn) )
2037 /* No l3 shadow of this page exists at all: make one. */
2038 *sl3mfn = sh_make_shadow(v, gw->l3mfn, SH_type_l3_shadow);
2040 /* Install the new sl3 table in the sl4e */
2041 l4e_propagate_from_guest(v, gw->l4e, *sl3mfn, &new_sl4e, ft);
2042 r = shadow_set_l4e(v, sl4e, new_sl4e, sl4mfn);
2043 ASSERT((r & SHADOW_SET_FLUSH) == 0);
2044 if ( r & SHADOW_SET_ERROR )
2045 return NULL;
2047 /* Now follow it down a level. Guaranteed to succeed. */
2048 return sh_linear_l3_table(v) + shadow_l3_linear_offset(gw->va);
2050 #endif /* GUEST_PAGING_LEVELS >= 4 */
2053 static shadow_l2e_t * shadow_get_and_create_l2e(struct vcpu *v,
2054 walk_t *gw,
2055 mfn_t *sl2mfn,
2056 fetch_type_t ft)
2058 #if GUEST_PAGING_LEVELS >= 4 /* 64bit... */
2059 mfn_t sl3mfn = _mfn(INVALID_MFN);
2060 shadow_l3e_t *sl3e;
2061 if ( !mfn_valid(gw->l2mfn) ) return NULL; /* No guest page. */
2062 /* Get the l3e */
2063 sl3e = shadow_get_and_create_l3e(v, gw, &sl3mfn, ft);
2064 if ( sl3e == NULL ) return NULL;
2065 if ( shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT )
2067 *sl2mfn = shadow_l3e_get_mfn(*sl3e);
2068 ASSERT(mfn_valid(*sl2mfn));
2070 else
2072 int r;
2073 shadow_l3e_t new_sl3e;
2074 unsigned int t = SH_type_l2_shadow;
2076 /* Tag compat L2 containing hypervisor (m2p) mappings */
2077 if ( is_pv_32on64_domain(v->domain) &&
2078 guest_l4_table_offset(gw->va) == 0 &&
2079 guest_l3_table_offset(gw->va) == 3 )
2080 t = SH_type_l2h_shadow;
2082 /* No l2 shadow installed: find and install it. */
2083 *sl2mfn = get_shadow_status(v, gw->l2mfn, t);
2084 if ( !mfn_valid(*sl2mfn) )
2086 /* No l2 shadow of this page exists at all: make one. */
2087 *sl2mfn = sh_make_shadow(v, gw->l2mfn, t);
2089 /* Install the new sl2 table in the sl3e */
2090 l3e_propagate_from_guest(v, gw->l3e, *sl2mfn, &new_sl3e, ft);
2091 r = shadow_set_l3e(v, sl3e, new_sl3e, sl3mfn);
2092 ASSERT((r & SHADOW_SET_FLUSH) == 0);
2093 if ( r & SHADOW_SET_ERROR )
2094 return NULL;
2096 /* Now follow it down a level. Guaranteed to succeed. */
2097 return sh_linear_l2_table(v) + shadow_l2_linear_offset(gw->va);
2098 #elif GUEST_PAGING_LEVELS == 3 /* PAE... */
2099 /* We never demand-shadow PAE l3es: they are only created in
2100 * sh_update_cr3(). Check if the relevant sl3e is present. */
2101 shadow_l3e_t *sl3e = ((shadow_l3e_t *)&v->arch.paging.shadow.l3table)
2102 + shadow_l3_linear_offset(gw->va);
2103 if ( !(shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT) )
2104 return NULL;
2105 *sl2mfn = shadow_l3e_get_mfn(*sl3e);
2106 ASSERT(mfn_valid(*sl2mfn));
2107 return sh_linear_l2_table(v) + shadow_l2_linear_offset(gw->va);
2108 #else /* 32bit... */
2109 /* There is always a shadow of the top level table. Get it. */
2110 *sl2mfn = pagetable_get_mfn(v->arch.shadow_table[0]);
2111 /* This next line is important: the guest l2 has a 16k
2112 * shadow, so we need to return the right mfn of the four. This
2113 * call will set it for us as a side-effect. */
2114 (void) shadow_l2_index(sl2mfn, guest_l2_table_offset(gw->va));
2115 /* Reading the top level table is always valid. */
2116 return sh_linear_l2_table(v) + shadow_l2_linear_offset(gw->va);
2117 #endif
2121 static shadow_l1e_t * shadow_get_and_create_l1e(struct vcpu *v,
2122 walk_t *gw,
2123 mfn_t *sl1mfn,
2124 fetch_type_t ft)
2126 mfn_t sl2mfn;
2127 shadow_l2e_t *sl2e;
2129 /* Get the l2e */
2130 sl2e = shadow_get_and_create_l2e(v, gw, &sl2mfn, ft);
2131 if ( sl2e == NULL ) return NULL;
2132 /* Install the sl1 in the l2e if it wasn't there or if we need to
2133 * re-do it to fix a PSE dirty bit. */
2134 if ( shadow_l2e_get_flags(*sl2e) & _PAGE_PRESENT
2135 && likely(ft != ft_demand_write
2136 || (shadow_l2e_get_flags(*sl2e) & _PAGE_RW)
2137 || !(guest_l2e_get_flags(gw->l2e) & _PAGE_PSE)) )
2139 *sl1mfn = shadow_l2e_get_mfn(*sl2e);
2140 ASSERT(mfn_valid(*sl1mfn));
2142 else
2144 shadow_l2e_t new_sl2e;
2145 int r, flags = guest_l2e_get_flags(gw->l2e);
2146 /* No l1 shadow installed: find and install it. */
2147 if ( !(flags & _PAGE_PRESENT) )
2148 return NULL; /* No guest page. */
2149 if ( guest_supports_superpages(v) && (flags & _PAGE_PSE) )
2151 /* Splintering a superpage */
2152 gfn_t l2gfn = guest_l2e_get_gfn(gw->l2e);
2153 *sl1mfn = get_fl1_shadow_status(v, l2gfn);
2154 if ( !mfn_valid(*sl1mfn) )
2156 /* No fl1 shadow of this superpage exists at all: make one. */
2157 *sl1mfn = make_fl1_shadow(v, l2gfn);
2160 else
2162 /* Shadowing an actual guest l1 table */
2163 if ( !mfn_valid(gw->l1mfn) ) return NULL; /* No guest page. */
2164 *sl1mfn = get_shadow_status(v, gw->l1mfn, SH_type_l1_shadow);
2165 if ( !mfn_valid(*sl1mfn) )
2167 /* No l1 shadow of this page exists at all: make one. */
2168 *sl1mfn = sh_make_shadow(v, gw->l1mfn, SH_type_l1_shadow);
2171 /* Install the new sl1 table in the sl2e */
2172 l2e_propagate_from_guest(v, gw->l2e, *sl1mfn, &new_sl2e, ft);
2173 r = shadow_set_l2e(v, sl2e, new_sl2e, sl2mfn);
2174 ASSERT((r & SHADOW_SET_FLUSH) == 0);
2175 if ( r & SHADOW_SET_ERROR )
2176 return NULL;
2177 /* This next line is important: in 32-on-PAE and 32-on-64 modes,
2178 * the guest l1 table has an 8k shadow, and we need to return
2179 * the right mfn of the pair. This call will set it for us as a
2180 * side-effect. (In all other cases, it's a no-op and will be
2181 * compiled out.) */
2182 (void) shadow_l1_index(sl1mfn, guest_l1_table_offset(gw->va));
2184 /* Now follow it down a level. Guaranteed to succeed. */
2185 return sh_linear_l1_table(v) + shadow_l1_linear_offset(gw->va);
2190 /**************************************************************************/
2191 /* Destructors for shadow tables:
2192 * Unregister the shadow, decrement refcounts of any entries present in it,
2193 * and release the memory.
2195 * N.B. These destructors do not clear the contents of the shadows.
2196 * This allows us to delay TLB shootdowns until the page is being reused.
2197 * See shadow_alloc() and shadow_free() for how this is handled.
2198 */
2200 #if GUEST_PAGING_LEVELS >= 4
2201 void sh_destroy_l4_shadow(struct vcpu *v, mfn_t smfn)
2203 shadow_l4e_t *sl4e;
2204 u32 t = mfn_to_shadow_page(smfn)->type;
2205 mfn_t gmfn, sl4mfn;
2207 SHADOW_DEBUG(DESTROY_SHADOW,
2208 "%s(%05lx)\n", __func__, mfn_x(smfn));
2209 ASSERT(t == SH_type_l4_shadow);
2211 /* Record that the guest page isn't shadowed any more (in this type) */
2212 gmfn = _mfn(mfn_to_shadow_page(smfn)->backpointer);
2213 delete_shadow_status(v, gmfn, t, smfn);
2214 shadow_demote(v, gmfn, t);
2215 /* Decrement refcounts of all the old entries */
2216 sl4mfn = smfn;
2217 SHADOW_FOREACH_L4E(sl4mfn, sl4e, 0, 0, v->domain, {
2218 if ( shadow_l4e_get_flags(*sl4e) & _PAGE_PRESENT )
2220 sh_put_ref(v, shadow_l4e_get_mfn(*sl4e),
2221 (((paddr_t)mfn_x(sl4mfn)) << PAGE_SHIFT)
2222 | ((unsigned long)sl4e & ~PAGE_MASK));
2224 });
2226 /* Put the memory back in the pool */
2227 shadow_free(v->domain, smfn);
2230 void sh_destroy_l3_shadow(struct vcpu *v, mfn_t smfn)
2232 shadow_l3e_t *sl3e;
2233 u32 t = mfn_to_shadow_page(smfn)->type;
2234 mfn_t gmfn, sl3mfn;
2236 SHADOW_DEBUG(DESTROY_SHADOW,
2237 "%s(%05lx)\n", __func__, mfn_x(smfn));
2238 ASSERT(t == SH_type_l3_shadow);
2240 /* Record that the guest page isn't shadowed any more (in this type) */
2241 gmfn = _mfn(mfn_to_shadow_page(smfn)->backpointer);
2242 delete_shadow_status(v, gmfn, t, smfn);
2243 shadow_demote(v, gmfn, t);
2245 /* Decrement refcounts of all the old entries */
2246 sl3mfn = smfn;
2247 SHADOW_FOREACH_L3E(sl3mfn, sl3e, 0, 0, {
2248 if ( shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT )
2249 sh_put_ref(v, shadow_l3e_get_mfn(*sl3e),
2250 (((paddr_t)mfn_x(sl3mfn)) << PAGE_SHIFT)
2251 | ((unsigned long)sl3e & ~PAGE_MASK));
2252 });
2254 /* Put the memory back in the pool */
2255 shadow_free(v->domain, smfn);
2257 #endif /* GUEST_PAGING_LEVELS >= 4 */
2260 void sh_destroy_l2_shadow(struct vcpu *v, mfn_t smfn)
2262 shadow_l2e_t *sl2e;
2263 u32 t = mfn_to_shadow_page(smfn)->type;
2264 mfn_t gmfn, sl2mfn;
2266 SHADOW_DEBUG(DESTROY_SHADOW,
2267 "%s(%05lx)\n", __func__, mfn_x(smfn));
2269 #if GUEST_PAGING_LEVELS >= 3
2270 ASSERT(t == SH_type_l2_shadow || t == SH_type_l2h_shadow);
2271 #else
2272 ASSERT(t == SH_type_l2_shadow);
2273 #endif
2275 /* Record that the guest page isn't shadowed any more (in this type) */
2276 gmfn = _mfn(mfn_to_shadow_page(smfn)->backpointer);
2277 delete_shadow_status(v, gmfn, t, smfn);
2278 shadow_demote(v, gmfn, t);
2280 /* Decrement refcounts of all the old entries */
2281 sl2mfn = smfn;
2282 SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, 0, v->domain, {
2283 if ( shadow_l2e_get_flags(*sl2e) & _PAGE_PRESENT )
2284 sh_put_ref(v, shadow_l2e_get_mfn(*sl2e),
2285 (((paddr_t)mfn_x(sl2mfn)) << PAGE_SHIFT)
2286 | ((unsigned long)sl2e & ~PAGE_MASK));
2287 });
2289 /* Put the memory back in the pool */
2290 shadow_free(v->domain, smfn);
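/* [Editorial sketch -- not part of multi.c] The second argument passed to
 * sh_put_ref() by the destructors above: the physical address of a shadow
 * entry, rebuilt from the shadow page's MFN and the in-page offset of the
 * mapped entry pointer. The DEMO_ and demo_ names, and the example
 * addresses, are assumptions for this sketch only. */
#include <stdio.h>
#include <stdint.h>

#define DEMO_PAGE_SHIFT 12
#define DEMO_PAGE_SIZE  (1UL << DEMO_PAGE_SHIFT)
#define DEMO_PAGE_MASK  (~(DEMO_PAGE_SIZE - 1))

static uint64_t demo_entry_paddr(unsigned long mfn, const void *mapped_entry)
{
    /* (mfn << PAGE_SHIFT) | (offset of the entry within its page) */
    return ((uint64_t)mfn << DEMO_PAGE_SHIFT)
           | ((uintptr_t)mapped_entry & ~DEMO_PAGE_MASK);
}

int main(void)
{
    /* An entry mapped at in-page offset 0x18 of the shadow page at MFN 0x1234. */
    const void *entry = (const void *)(uintptr_t)0xabc018u;
    printf("%#llx\n", (unsigned long long)demo_entry_paddr(0x1234, entry));
    /* prints 0x1234018 */
    return 0;
}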
2293 void sh_destroy_l1_shadow(struct vcpu *v, mfn_t smfn)
2295 struct domain *d = v->domain;
2296 shadow_l1e_t *sl1e;
2297 u32 t = mfn_to_shadow_page(smfn)->type;
2299 SHADOW_DEBUG(DESTROY_SHADOW,
2300 "%s(%05lx)\n", __func__, mfn_x(smfn));
2301 ASSERT(t == SH_type_l1_shadow || t == SH_type_fl1_shadow);
2303 /* Record that the guest page isn't shadowed any more (in this type) */
2304 if ( t == SH_type_fl1_shadow )
2306 gfn_t gfn = _gfn(mfn_to_shadow_page(smfn)->backpointer);
2307 delete_fl1_shadow_status(v, gfn, smfn);
2309 else
2311 mfn_t gmfn = _mfn(mfn_to_shadow_page(smfn)->backpointer);
2312 delete_shadow_status(v, gmfn, t, smfn);
2313 shadow_demote(v, gmfn, t);
2316 if ( shadow_mode_refcounts(d) )
2318 /* Decrement refcounts of all the old entries */
2319 mfn_t sl1mfn = smfn;
2320 SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, 0, {
2321 if ( (shadow_l1e_get_flags(*sl1e) & _PAGE_PRESENT)
2322 && !sh_l1e_is_magic(*sl1e) ) {
2323 shadow_vram_put_l1e(*sl1e, sl1e, sl1mfn, d);
2324 shadow_put_page_from_l1e(*sl1e, d);
2326 });
2329 /* Put the memory back in the pool */
2330 shadow_free(v->domain, smfn);
2333 #if SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS
2334 void sh_destroy_monitor_table(struct vcpu *v, mfn_t mmfn)
2336 struct domain *d = v->domain;
2337 ASSERT(mfn_to_shadow_page(mmfn)->type == SH_type_monitor_table);
2339 #if (CONFIG_PAGING_LEVELS == 4) && (SHADOW_PAGING_LEVELS != 4)
2341 mfn_t m3mfn;
2342 l4_pgentry_t *l4e = sh_map_domain_page(mmfn);
2343 l3_pgentry_t *l3e;
2344 int linear_slot = shadow_l4_table_offset(SH_LINEAR_PT_VIRT_START);
2346 /* Need to destroy the l3 and l2 monitor pages used
2347 * for the linear map */
2348 ASSERT(l4e_get_flags(l4e[linear_slot]) & _PAGE_PRESENT);
2349 m3mfn = _mfn(l4e_get_pfn(l4e[linear_slot]));
2350 l3e = sh_map_domain_page(m3mfn);
2351 ASSERT(l3e_get_flags(l3e[0]) & _PAGE_PRESENT);
2352 shadow_free(d, _mfn(l3e_get_pfn(l3e[0])));
2353 sh_unmap_domain_page(l3e);
2354 shadow_free(d, m3mfn);
2356 if ( is_pv_32on64_vcpu(v) )
2358 /* Need to destroy the l3 and l2 monitor pages that map the
2359 * Xen VAs at 3GB-4GB */
2360 ASSERT(l4e_get_flags(l4e[0]) & _PAGE_PRESENT);
2361 m3mfn = _mfn(l4e_get_pfn(l4e[0]));
2362 l3e = sh_map_domain_page(m3mfn);
2363 ASSERT(l3e_get_flags(l3e[3]) & _PAGE_PRESENT);
2364 shadow_free(d, _mfn(l3e_get_pfn(l3e[3])));
2365 sh_unmap_domain_page(l3e);
2366 shadow_free(d, m3mfn);
2368 sh_unmap_domain_page(l4e);
2370 #elif CONFIG_PAGING_LEVELS == 3
2371 /* Need to destroy the l2 monitor page in slot 3 too */
2373 l3_pgentry_t *l3e = sh_map_domain_page(mmfn);
2374 ASSERT(l3e_get_flags(l3e[3]) & _PAGE_PRESENT);
2375 shadow_free(d, _mfn(l3e_get_pfn(l3e[3])));
2376 sh_unmap_domain_page(l3e);
2378 #endif
2380 /* Put the memory back in the pool */
2381 shadow_free(d, mmfn);
2383 #endif
2385 /**************************************************************************/
2386 /* Functions to destroy non-Xen mappings in a pagetable hierarchy.
2387 * These are called from common code when we are running out of shadow
2388 * memory, and unpinning all the top-level shadows hasn't worked.
2390 * This implementation is pretty crude and slow, but we hope that it won't
2391 * be called very often. */
2393 #if GUEST_PAGING_LEVELS == 2
2395 void sh_unhook_32b_mappings(struct vcpu *v, mfn_t sl2mfn)
2397 shadow_l2e_t *sl2e;
2398 SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, 0, v->domain, {
2399 (void) shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn);
2400 });
2403 #elif GUEST_PAGING_LEVELS == 3
2405 void sh_unhook_pae_mappings(struct vcpu *v, mfn_t sl2mfn)
2406 /* Walk a PAE l2 shadow, unhooking entries from all the subshadows */
2408 shadow_l2e_t *sl2e;
2409 SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, 0, v->domain, {
2410 (void) shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn);
2411 });
2414 #elif GUEST_PAGING_LEVELS == 4
2416 void sh_unhook_64b_mappings(struct vcpu *v, mfn_t sl4mfn)
2418 shadow_l4e_t *sl4e;
2419 SHADOW_FOREACH_L4E(sl4mfn, sl4e, 0, 0, v->domain, {
2420 (void) shadow_set_l4e(v, sl4e, shadow_l4e_empty(), sl4mfn);
2421 });
2424 #endif
2426 /**************************************************************************/
2427 /* Internal translation functions.
2428 * These functions require a pointer to the shadow entry that will be updated.
2429 */
2431 /* These functions take a new guest entry, translate it to shadow and write
2432 * the shadow entry.
2434 * They return the same bitmaps as the shadow_set_lXe() functions.
2435 */
2437 #if GUEST_PAGING_LEVELS >= 4
2438 static int validate_gl4e(struct vcpu *v, void *new_ge, mfn_t sl4mfn, void *se)
2440 shadow_l4e_t new_sl4e;
2441 guest_l4e_t new_gl4e = *(guest_l4e_t *)new_ge;
2442 shadow_l4e_t *sl4p = se;
2443 mfn_t sl3mfn = _mfn(INVALID_MFN);
2444 struct domain *d = v->domain;
2445 p2m_type_t p2mt;
2446 int result = 0;
2448 perfc_incr(shadow_validate_gl4e_calls);
2450 if ( guest_l4e_get_flags(new_gl4e) & _PAGE_PRESENT )
2452 gfn_t gl3gfn = guest_l4e_get_gfn(new_gl4e);
2453 mfn_t gl3mfn = gfn_to_mfn(d, gl3gfn, &p2mt);
2454 if ( p2m_is_ram(p2mt) )
2455 sl3mfn = get_shadow_status(v, gl3mfn, SH_type_l3_shadow);
2456 else
2457 result |= SHADOW_SET_ERROR;
2459 l4e_propagate_from_guest(v, new_gl4e, sl3mfn, &new_sl4e, ft_prefetch);
2461 // check for updates to xen reserved slots
2462 if ( !shadow_mode_external(d) )
2464 int shadow_index = (((unsigned long)sl4p & ~PAGE_MASK) /
2465 sizeof(shadow_l4e_t));
2466 int reserved_xen_slot = !is_guest_l4_slot(d, shadow_index);
2468 if ( unlikely(reserved_xen_slot) )
2470 // attempt by the guest to write to a xen reserved slot
2471 //
2472 SHADOW_PRINTK("%s out-of-range update "
2473 "sl4mfn=%05lx index=0x%x val=%" SH_PRI_pte "\n",
2474 __func__, mfn_x(sl4mfn), shadow_index, new_sl4e.l4);
2475 if ( shadow_l4e_get_flags(new_sl4e) & _PAGE_PRESENT )
2477 SHADOW_ERROR("out-of-range l4e update\n");
2478 result |= SHADOW_SET_ERROR;
2481 // do not call shadow_set_l4e...
2482 return result;
2486 result |= shadow_set_l4e(v, sl4p, new_sl4e, sl4mfn);
2487 return result;
2491 static int validate_gl3e(struct vcpu *v, void *new_ge, mfn_t sl3mfn, void *se)
2493 shadow_l3e_t new_sl3e;
2494 guest_l3e_t new_gl3e = *(guest_l3e_t *)new_ge;
2495 shadow_l3e_t *sl3p = se;
2496 mfn_t sl2mfn = _mfn(INVALID_MFN);
2497 p2m_type_t p2mt;
2498 int result = 0;
2500 perfc_incr(shadow_validate_gl3e_calls);
2502 if ( guest_l3e_get_flags(new_gl3e) & _PAGE_PRESENT )
2504 gfn_t gl2gfn = guest_l3e_get_gfn(new_gl3e);
2505 mfn_t gl2mfn = gfn_to_mfn(v->domain, gl2gfn, &p2mt);
2506 if ( p2m_is_ram(p2mt) )
2507 sl2mfn = get_shadow_status(v, gl2mfn, SH_type_l2_shadow);
2508 else
2509 result |= SHADOW_SET_ERROR;
2511 l3e_propagate_from_guest(v, new_gl3e, sl2mfn, &new_sl3e, ft_prefetch);
2512 result |= shadow_set_l3e(v, sl3p, new_sl3e, sl3mfn);
2514 return result;
2516 #endif // GUEST_PAGING_LEVELS >= 4
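/* [Editorial sketch -- not part of multi.c] How validate_gl4e() above (and
 * validate_gl2e() below) recover a slot number from nothing but the mapped
 * entry pointer: take the entry's byte offset within its page and divide by
 * the entry size, then compare the index against the Xen-reserved range.
 * The 8-byte entry and the DEMO_ and demo_ names are assumptions for this
 * sketch only. */
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

#define DEMO_PAGE_SIZE 4096UL
#define DEMO_PAGE_MASK (~(DEMO_PAGE_SIZE - 1))

typedef uint64_t demo_pte_t;

static unsigned int demo_slot_of(const demo_pte_t *entry)
{
    return (unsigned int)(((uintptr_t)entry & ~DEMO_PAGE_MASK)
                          / sizeof(demo_pte_t));
}

int main(void)
{
    demo_pte_t *table = aligned_alloc(DEMO_PAGE_SIZE, DEMO_PAGE_SIZE);
    printf("slot of &table[5]   = %u\n", demo_slot_of(&table[5]));   /* 5 */
    printf("slot of &table[261] = %u\n", demo_slot_of(&table[261])); /* 261 */
    free(table);
    return 0;
}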
2518 static int validate_gl2e(struct vcpu *v, void *new_ge, mfn_t sl2mfn, void *se)
2520 shadow_l2e_t new_sl2e;
2521 guest_l2e_t new_gl2e = *(guest_l2e_t *)new_ge;
2522 shadow_l2e_t *sl2p = se;
2523 mfn_t sl1mfn = _mfn(INVALID_MFN);
2524 p2m_type_t p2mt;
2525 int result = 0;
2527 perfc_incr(shadow_validate_gl2e_calls);
2529 if ( guest_l2e_get_flags(new_gl2e) & _PAGE_PRESENT )
2531 gfn_t gl1gfn = guest_l2e_get_gfn(new_gl2e);
2532 if ( guest_supports_superpages(v) &&
2533 (guest_l2e_get_flags(new_gl2e) & _PAGE_PSE) )
2535 // superpage -- need to look up the shadow L1 which holds the
2536 // splitters...
2537 sl1mfn = get_fl1_shadow_status(v, gl1gfn);
2538 #if 0
2539 // XXX - it's possible that we want to do some kind of prefetch
2540 // for superpage fl1's here, but this is *not* on the demand path,
2541 // so we'll hold off trying that for now...
2542 //
2543 if ( !mfn_valid(sl1mfn) )
2544 sl1mfn = make_fl1_shadow(v, gl1gfn);
2545 #endif
2547 else
2549 mfn_t gl1mfn = gfn_to_mfn(v->domain, gl1gfn, &p2mt);
2550 if ( p2m_is_ram(p2mt) )
2551 sl1mfn = get_shadow_status(v, gl1mfn, SH_type_l1_shadow);
2552 else
2553 result |= SHADOW_SET_ERROR;
2556 l2e_propagate_from_guest(v, new_gl2e, sl1mfn, &new_sl2e, ft_prefetch);
2558 // check for updates to xen reserved slots in PV guests...
2559 // XXX -- need to revisit this for PV 3-on-4 guests.
2560 //
2561 #if SHADOW_PAGING_LEVELS < 4
2562 #if CONFIG_PAGING_LEVELS == SHADOW_PAGING_LEVELS
2563 if ( !shadow_mode_external(v->domain) )
2565 int shadow_index = (((unsigned long)sl2p & ~PAGE_MASK) /
2566 sizeof(shadow_l2e_t));
2567 int reserved_xen_slot;
2569 #if SHADOW_PAGING_LEVELS == 3
2570 reserved_xen_slot =
2571 ((mfn_to_shadow_page(sl2mfn)->type == SH_type_l2h_pae_shadow) &&
2572 (shadow_index
2573 >= (L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1))));
2574 #else /* SHADOW_PAGING_LEVELS == 2 */
2575 reserved_xen_slot = (shadow_index >= L2_PAGETABLE_FIRST_XEN_SLOT);
2576 #endif
2578 if ( unlikely(reserved_xen_slot) )
2580 // attempt by the guest to write to a xen reserved slot
2581 //
2582 SHADOW_PRINTK("%s out-of-range update "
2583 "sl2mfn=%05lx index=0x%x val=%" SH_PRI_pte "\n",
2584 __func__, mfn_x(sl2mfn), shadow_index, new_sl2e.l2);
2585 if ( shadow_l2e_get_flags(new_sl2e) & _PAGE_PRESENT )
2587 SHADOW_ERROR("out-of-range l2e update\n");
2588 result |= SHADOW_SET_ERROR;
2591 // do not call shadow_set_l2e...
2592 return result;
2595 #endif /* CONFIG_PAGING_LEVELS == SHADOW_PAGING_LEVELS */
2596 #endif /* SHADOW_PAGING_LEVELS < 4 */
2598 result |= shadow_set_l2e(v, sl2p, new_sl2e, sl2mfn);
2600 return result;
2603 static int validate_gl1e(struct vcpu *v, void *new_ge, mfn_t sl1mfn, void *se)
2605 shadow_l1e_t new_sl1e;
2606 guest_l1e_t new_gl1e = *(guest_l1e_t *)new_ge;
2607 shadow_l1e_t *sl1p = se;
2608 gfn_t gfn;
2609 mfn_t gmfn;
2610 p2m_type_t p2mt;
2611 int result = 0;
2612 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
2613 mfn_t gl1mfn;
2614 #endif /* OOS */
2616 perfc_incr(shadow_validate_gl1e_calls);
2618 gfn = guest_l1e_get_gfn(new_gl1e);
2619 gmfn = gfn_to_mfn(v->domain, gfn, &p2mt);
2621 l1e_propagate_from_guest(v, new_gl1e, gmfn, &new_sl1e, ft_prefetch, p2mt);
2622 result |= shadow_set_l1e(v, sl1p, new_sl1e, sl1mfn);
2624 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
2625 gl1mfn = _mfn(mfn_to_shadow_page(sl1mfn)->backpointer);
2626 if ( mfn_valid(gl1mfn)
2627 && mfn_is_out_of_sync(gl1mfn) )
2629 /* Update the OOS snapshot. */
2630 mfn_t snpmfn = oos_snapshot_lookup(v, gl1mfn);
2631 guest_l1e_t *snp;
2633 ASSERT(mfn_valid(snpmfn));
2635 snp = sh_map_domain_page(snpmfn);
2636 snp[guest_index(new_ge)] = new_gl1e;
2637 sh_unmap_domain_page(snp);
2639 #endif /* OOS */
2641 return result;
2644 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
2645 /**************************************************************************/
2646 /* Special validation function for re-syncing out-of-sync shadows.
2647 * Walks the *shadow* page, and for every entry that it finds,
2648 * revalidates the guest entry that corresponds to it.
2649 * N.B. This function is called with the vcpu that unsynced the page,
2650 * *not* the one that is causing it to be resynced. */
2651 void sh_resync_l1(struct vcpu *v, mfn_t gl1mfn, mfn_t snpmfn)
2653 mfn_t sl1mfn;
2654 shadow_l1e_t *sl1p;
2655 guest_l1e_t *gl1p, *gp, *snp;
2656 int rc = 0;
2658 ASSERT(mfn_valid(snpmfn));
2660 sl1mfn = get_shadow_status(v, gl1mfn, SH_type_l1_shadow);
2661 ASSERT(mfn_valid(sl1mfn)); /* Otherwise we would not have been called */
2663 snp = sh_map_domain_page(snpmfn);
2664 gp = sh_map_domain_page(gl1mfn);
2665 gl1p = gp;
2667 SHADOW_FOREACH_L1E(sl1mfn, sl1p, &gl1p, 0, {
2668 guest_l1e_t gl1e = *gl1p;
2669 guest_l1e_t *snpl1p = (guest_l1e_t *)snp + guest_index(gl1p);
2671 if ( memcmp(snpl1p, &gl1e, sizeof(gl1e)) )
2673 gfn_t gfn;
2674 mfn_t gmfn;
2675 p2m_type_t p2mt;
2676 shadow_l1e_t nsl1e;
2678 gfn = guest_l1e_get_gfn(gl1e);
2679 gmfn = gfn_to_mfn(v->domain, gfn, &p2mt);
2680 l1e_propagate_from_guest(v, gl1e, gmfn, &nsl1e, ft_prefetch, p2mt);
2681 rc |= shadow_set_l1e(v, sl1p, nsl1e, sl1mfn);
2683 *snpl1p = gl1e;
2685 });
2687 sh_unmap_domain_page(gp);
2688 sh_unmap_domain_page(snp);
2690 /* Setting shadow L1 entries should never need us to flush the TLB */
2691 ASSERT(!(rc & SHADOW_SET_FLUSH));
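/* [Editorial sketch -- not part of multi.c] The core pattern of
 * sh_resync_l1() above: walk the guest l1 alongside its snapshot,
 * re-propagate only the entries that have changed, and refresh the
 * snapshot as we go. Plain 64-bit values stand in for guest entries and
 * printf() stands in for shadow_set_l1e(); the demo_ names are
 * assumptions for this sketch only. */
#include <stdio.h>
#include <stdint.h>
#include <string.h>

#define DEMO_ENTRIES 4

static void demo_resync(uint64_t *guest, uint64_t *snapshot, int n)
{
    for ( int i = 0; i < n; i++ )
    {
        if ( memcmp(&snapshot[i], &guest[i], sizeof(guest[i])) )
        {
            printf("entry %d changed: re-propagate %#llx\n",
                   i, (unsigned long long)guest[i]);
            snapshot[i] = guest[i];     /* keep the snapshot up to date */
        }
    }
}

int main(void)
{
    uint64_t guest[DEMO_ENTRIES]    = { 0x1007, 0x2007, 0x3007, 0x4007 };
    uint64_t snapshot[DEMO_ENTRIES] = { 0x1007, 0x2007, 0x3005, 0x4007 };
    demo_resync(guest, snapshot, DEMO_ENTRIES);  /* only entry 2 differs */
    return 0;
}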
2694 /* Figure out whether it's definitely safe not to sync this l1 table.
2695 * That is: if we can tell that it's only used once, and that the
2696 * toplevel shadow responsible is not one of ours.
2697 * N.B. This function is called with the vcpu that required the resync,
2698 * *not* the one that originally unsynced the page, but it is
2699 * called in the *mode* of the vcpu that unsynced it. Clear? Good. */
2700 int sh_safe_not_to_sync(struct vcpu *v, mfn_t gl1mfn)
2702 struct shadow_page_info *sp;
2703 mfn_t smfn;
2705 smfn = get_shadow_status(v, gl1mfn, SH_type_l1_shadow);
2706 ASSERT(mfn_valid(smfn)); /* Otherwise we would not have been called */
2708 /* Up to l2 */
2709 sp = mfn_to_shadow_page(smfn);
2710 if ( sp->count != 1 || !sp->up )
2711 return 0;
2712 smfn = _mfn(sp->up >> PAGE_SHIFT);
2713 ASSERT(mfn_valid(smfn));
2715 #if (SHADOW_PAGING_LEVELS == 4)
2716 /* up to l3 */
2717 sp = mfn_to_shadow_page(smfn);
2718 if ( sp->count != 1 || !sp->up )
2719 return 0;
2720 smfn = _mfn(sp->up >> PAGE_SHIFT);
2721 ASSERT(mfn_valid(smfn));
2723 /* up to l4 */
2724 sp = mfn_to_shadow_page(smfn);
2725 if ( sp->count != 1
2726 || sh_type_is_pinnable(v, SH_type_l3_64_shadow) || !sp->up )
2727 return 0;
2728 smfn = _mfn(sp->up >> PAGE_SHIFT);
2729 ASSERT(mfn_valid(smfn));
2731 #if (GUEST_PAGING_LEVELS == 2)
2732 /* In 2-on-3 shadow mode the up pointer contains the link to the
2733 * shadow page, but the shadow_table contains only the first of the
2734 * four pages that make up the PAE top shadow tables. */
2735 smfn = _mfn(mfn_x(smfn) & ~0x3UL);
2736 #endif
2738 #endif
2740 if ( pagetable_get_pfn(v->arch.shadow_table[0]) == mfn_x(smfn)
2741 #if (SHADOW_PAGING_LEVELS == 3)
2742 || pagetable_get_pfn(v->arch.shadow_table[1]) == mfn_x(smfn)
2743 || pagetable_get_pfn(v->arch.shadow_table[2]) == mfn_x(smfn)
2744 || pagetable_get_pfn(v->arch.shadow_table[3]) == mfn_x(smfn)
2745 #endif
2747 return 0;
2749 /* Only in use in one toplevel shadow, and it's not the one we're
2750 * running on */
2751 return 1;
2753 #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) */
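/* [Editorial sketch -- not part of multi.c] The shape of the walk in
 * sh_safe_not_to_sync() above: climb the ->up back-pointers from the l1
 * shadow towards the top level, bailing out as soon as a shadow is
 * referenced more than once or its parent is unknown; the l1 is safe to
 * leave unsynced only if the top-level shadow it hangs off is not the one
 * we are running on. The demo_shadow struct and demo_ names are
 * assumptions, not Xen's shadow_page_info. */
#include <stdio.h>
#include <stddef.h>

struct demo_shadow {
    int count;                      /* number of parent references     */
    struct demo_shadow *up;         /* the single known parent, if any */
};

static int demo_safe_not_to_sync(struct demo_shadow *l1, int levels,
                                 struct demo_shadow *running_top)
{
    struct demo_shadow *s = l1;
    for ( int i = 0; i < levels; i++ )      /* climb l1 -> l2 -> l3 -> l4 */
    {
        if ( s->count != 1 || s->up == NULL )
            return 0;                       /* shared, or parent unknown  */
        s = s->up;
    }
    return s != running_top;    /* safe only if we're not running on it */
}

int main(void)
{
    struct demo_shadow l4a = { 1, NULL }, l4b = { 1, NULL };
    struct demo_shadow l3  = { 1, &l4a };
    struct demo_shadow l2  = { 1, &l3 };
    struct demo_shadow l1  = { 1, &l2 };
    printf("%d\n", demo_safe_not_to_sync(&l1, 3, &l4a)); /* 0: we run on l4a */
    printf("%d\n", demo_safe_not_to_sync(&l1, 3, &l4b)); /* 1: safe to skip  */
    return 0;
}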
2756 /**************************************************************************/
2757 /* Functions which translate and install the shadows of arbitrary guest
2758 * entries that we have just seen the guest write. */
2761 static inline int
2762 sh_map_and_validate(struct vcpu *v, mfn_t gmfn,
2763 void *new_gp, u32 size, u32 sh_type,
2764 u32 (*shadow_index)(mfn_t *smfn, u32 idx),
2765 int (*validate_ge)(struct vcpu *v, void *ge,
2766 mfn_t smfn, void *se))
2767 /* Generic function for mapping and validating. */
2769 mfn_t smfn, smfn2, map_mfn;
2770 shadow_l1e_t *sl1p;
2771 u32 shadow_idx, guest_idx;
2772 int result = 0;
2774 /* Align address and size to guest entry boundaries */
2775 size += (unsigned long)new_gp & (sizeof (guest_l1e_t) - 1);
2776 new_gp = (void *)((unsigned long)new_gp & ~(sizeof (guest_l1e_t) - 1));
2777 size = (size + sizeof (guest_l1e_t) - 1) & ~(sizeof (guest_l1e_t) - 1);
2778 ASSERT(size + (((unsigned long)new_gp) & ~PAGE_MASK) <= PAGE_SIZE);
2780 /* Map the shadow page */
2781 smfn = get_shadow_status(v, gmfn, sh_type);
2782 ASSERT(mfn_valid(smfn)); /* Otherwise we would not have been called */
2783 guest_idx = guest_index(new_gp);
2784 map_mfn = smfn;
2785 shadow_idx = shadow_index(&map_mfn, guest_idx);
2786 sl1p = sh_map_domain_page(map_mfn);
2788 /* Validate one entry at a time */
2789 while ( size )
2791 smfn2 = smfn;
2792 guest_idx = guest_index(new_gp);
2793 shadow_idx = shadow_index(&smfn2, guest_idx);
2794 if ( mfn_x(smfn2) != mfn_x(map_mfn) )
2796 /* We have moved to another page of the shadow */
2797 map_mfn = smfn2;
2798 sh_unmap_domain_page(sl1p);
2799 sl1p = sh_map_domain_page(map_mfn);
2801 result |= validate_ge(v,
2802 new_gp,
2803 map_mfn,
2804 &sl1p[shadow_idx]);
2805 size -= sizeof(guest_l1e_t);
2806 new_gp += sizeof(guest_l1e_t);
2808 sh_unmap_domain_page(sl1p);
2809 return result;
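/* [Editorial sketch -- not part of multi.c] The entry-alignment step at the
 * top of sh_map_and_validate() above: grow the written region so that it
 * starts and ends on guest-entry boundaries before validating one entry at
 * a time. The 8-byte entry size and the demo_align() name are assumptions
 * for this sketch only. */
#include <stdio.h>
#include <stdint.h>

#define DEMO_ENTRY_SIZE 8UL     /* e.g. a 64-bit guest_l1e_t */

static void demo_align(uintptr_t *addr, unsigned long *size)
{
    *size += *addr & (DEMO_ENTRY_SIZE - 1);          /* widen to the left  */
    *addr &= ~(DEMO_ENTRY_SIZE - 1);                 /* round address down */
    *size = (*size + DEMO_ENTRY_SIZE - 1) & ~(DEMO_ENTRY_SIZE - 1); /* round size up */
}

int main(void)
{
    uintptr_t addr = 0x1003;      /* a 2-byte write at offset 0x1003 */
    unsigned long size = 2;
    demo_align(&addr, &size);
    printf("addr=%#lx size=%lu\n", (unsigned long)addr, size);
    /* prints addr=0x1000 size=8: one whole entry covers the write */
    return 0;
}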
2813 int
2814 sh_map_and_validate_gl4e(struct vcpu *v, mfn_t gl4mfn,
2815 void *new_gl4p, u32 size)
2817 #if GUEST_PAGING_LEVELS >= 4
2818 return sh_map_and_validate(v, gl4mfn, new_gl4p, size,
2819 SH_type_l4_shadow,
2820 shadow_l4_index,
2821 validate_gl4e);
2822 #else // ! GUEST_PAGING_LEVELS >= 4
2823 SHADOW_ERROR("called in wrong paging mode!\n");
2824 BUG();
2825 return 0;
2826 #endif
2829 int
2830 sh_map_and_validate_gl3e(struct vcpu *v, mfn_t gl3mfn,
2831 void *new_gl3p, u32 size)
2833 #if GUEST_PAGING_LEVELS >= 4
2834 return sh_map_and_validate(v, gl3mfn, new_gl3p, size,
2835 SH_type_l3_shadow,
2836 shadow_l3_index,
2837 validate_gl3e);
2838 #else // ! GUEST_PAGING_LEVELS >= 4
2839 SHADOW_ERROR("called in wrong paging mode!\n");
2840 BUG();
2841 return 0;
2842 #endif
2845 int
2846 sh_map_and_validate_gl2e(struct vcpu *v, mfn_t gl2mfn,
2847 void *new_gl2p, u32 size)
2849 return sh_map_and_validate(v, gl2mfn, new_gl2p, size,
2850 SH_type_l2_shadow,
2851 shadow_l2_index,
2852 validate_gl2e);
2855 int
2856 sh_map_and_validate_gl2he(struct vcpu *v, mfn_t gl2mfn,
2857 void *new_gl2p, u32 size)
2859 #if GUEST_PAGING_LEVELS >= 3
2860 return sh_map_and_validate(v, gl2mfn, new_gl2p, size,
2861 SH_type_l2h_shadow,
2862 shadow_l2_index,
2863 validate_gl2e);
2864 #else /* Non-PAE guests don't have different kinds of l2 table */
2865 SHADOW_ERROR("called in wrong paging mode!\n");
2866 BUG();
2867 return 0;
2868 #endif
2871 int
2872 sh_map_and_validate_gl1e(struct vcpu *v, mfn_t gl1mfn,
2873 void *new_gl1p, u32 size)
2875 return sh_map_and_validate(v, gl1mfn, new_gl1p, size,
2876 SH_type_l1_shadow,
2877 shadow_l1_index,
2878 validate_gl1e);
2882 /**************************************************************************/
2883 /* Optimization: If we see two emulated writes of zeros to the same
2884 * page-table without another kind of page fault in between, we guess
2885 * that this is a batch of changes (for process destruction) and
2886 * unshadow the page so we don't take a pagefault on every entry. This
2887 * should also make finding writeable mappings of pagetables much
2888 * easier. */
2890 /* Look to see if this is the second emulated write in a row to this
2891 * page, and unshadow if it is */
2892 static inline void check_for_early_unshadow(struct vcpu *v, mfn_t gmfn)
2894 #if SHADOW_OPTIMIZATIONS & SHOPT_EARLY_UNSHADOW
2895 if ( v->arch.paging.shadow.last_emulated_mfn_for_unshadow == mfn_x(gmfn)
2896 && sh_mfn_is_a_page_table(gmfn) )
2898 perfc_incr(shadow_early_unshadow);
2899 sh_remove_shadows(v, gmfn, 1, 0 /* Fast, can fail to unshadow */ );
2901 v->arch.paging.shadow.last_emulated_mfn_for_unshadow = mfn_x(gmfn);
2902 #endif
2905 /* Stop counting towards early unshadows, as we've seen a real page fault */
2906 static inline void reset_early_unshadow(struct vcpu *v)
2908 #if SHADOW_OPTIMIZATIONS & SHOPT_EARLY_UNSHADOW
2909 v->arch.paging.shadow.last_emulated_mfn_for_unshadow = INVALID_MFN;
2910 #endif
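/* [Editorial sketch -- not part of multi.c] The early-unshadow heuristic
 * above in miniature: remember the MFN of the last emulated pagetable
 * write, and if the very next emulated write hits the same MFN (with no
 * real page fault in between to reset the record), give up shadowing that
 * page. The demo_ names and stdio output are assumptions for this sketch
 * only. */
#include <stdio.h>

#define DEMO_INVALID_MFN (~0UL)

static unsigned long demo_last_mfn = DEMO_INVALID_MFN;

static void demo_emulated_write(unsigned long mfn)
{
    if ( mfn == demo_last_mfn )
        printf("second emulated write in a row to %#lx: unshadow it\n", mfn);
    demo_last_mfn = mfn;              /* candidate for next time */
}

static void demo_real_fault(void)
{
    demo_last_mfn = DEMO_INVALID_MFN; /* reset_early_unshadow() analogue */
}

int main(void)
{
    demo_emulated_write(0x1234);  /* first write: just remembered    */
    demo_emulated_write(0x1234);  /* second in a row: would unshadow */
    demo_real_fault();
    demo_emulated_write(0x1234);  /* record was reset: remembered    */
    return 0;
}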
2915 /**************************************************************************/
2916 /* Optimization: Prefetch multiple L1 entries. This is called after we have
2917 * demand-faulted a shadow l1e in the fault handler, to see if it's
2918 * worth fetching some more.
2919 */
2921 #if SHADOW_OPTIMIZATIONS & SHOPT_PREFETCH
2923 /* XXX magic number */
2924 #define PREFETCH_DISTANCE 32
2926 static void sh_prefetch(struct vcpu *v, walk_t *gw,
2927 shadow_l1e_t *ptr_sl1e, mfn_t sl1mfn)
2929 int i, dist;
2930 gfn_t gfn;
2931 mfn_t gmfn;
2932 guest_l1e_t *gl1p = NULL, gl1e;
2933 shadow_l1e_t sl1e;
2934 u32 gflags;
2935 p2m_type_t p2mt;
2936 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
2937 guest_l1e_t *snpl1p = NULL;
2938 #endif /* OOS */
2941 /* Prefetch no further than the end of the _shadow_ l1 MFN */
2942 dist = (PAGE_SIZE - ((unsigned long)ptr_sl1e & ~PAGE_MASK)) / sizeof sl1e;
2943 /* And no more than a maximum fetches-per-fault */
2944 if ( dist > PREFETCH_DISTANCE )
2945 dist = PREFETCH_DISTANCE;
2947 if ( mfn_valid(gw->l1mfn) )
2949 /* Normal guest page; grab the next guest entry */
2950 gl1p = sh_map_domain_page(gw->l1mfn);
2951 gl1p += guest_l1_table_offset(gw->va);
2953 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
2954 if ( mfn_is_out_of_sync(gw->l1mfn) )
2956 mfn_t snpmfn = oos_snapshot_lookup(v, gw->l1mfn);
2958 ASSERT(mfn_valid(snpmfn));
2959 snpl1p = sh_map_domain_page(snpmfn);
2960 snpl1p += guest_l1_table_offset(gw->va);
2962 #endif /* OOS */
2965 for ( i = 1; i < dist ; i++ )
2967 /* No point in prefetching if there's already a shadow */
2968 if ( ptr_sl1e[i].l1 != 0 )
2969 break;
2971 if ( mfn_valid(gw->l1mfn) )
2973 /* Normal guest page; grab the next guest entry */
2974 gl1e = gl1p[i];
2975 /* Not worth continuing if we hit an entry that will need another
2976 * fault for A/D-bit propagation anyway */
2977 gflags = guest_l1e_get_flags(gl1e);
2978 if ( (gflags & _PAGE_PRESENT)
2979 && (!(gflags & _PAGE_ACCESSED)
2980 || ((gflags & _PAGE_RW) && !(gflags & _PAGE_DIRTY))) )
2981 break;
2983 else
2985 /* Fragmented superpage, unless we've been called wrongly */
2986 ASSERT(guest_l2e_get_flags(gw->l2e) & _PAGE_PSE);
2987 /* Increment the l1e's GFN by the right number of guest pages */
2988 gl1e = guest_l1e_from_gfn(
2989 _gfn(gfn_x(guest_l1e_get_gfn(gw->l1e)) + i),
2990 guest_l1e_get_flags(gw->l1e));
2993 /* Look at the gfn that the l1e is pointing at */
2994 gfn = guest_l1e_get_gfn(gl1e);
2995 gmfn = gfn_to_mfn(v->domain, gfn, &p2mt);
2997 /* Propagate the entry. */
2998 l1e_propagate_from_guest(v, gl1e, gmfn, &sl1e, ft_prefetch, p2mt);
2999 (void) shadow_set_l1e(v, ptr_sl1e + i, sl1e, sl1mfn);
3001 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
3002 if ( snpl1p != NULL )
3003 snpl1p[i] = gl1e;
3004 #endif /* OOS */
3006 if ( gl1p != NULL )
3007 sh_unmap_domain_page(gl1p);
3008 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
3009 if ( snpl1p != NULL )
3010 sh_unmap_domain_page(snpl1p);
3011 #endif /* OOS */
3014 #endif /* SHADOW_OPTIMIZATIONS & SHOPT_PREFETCH */
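/* [Editorial sketch -- not part of multi.c] The prefetch-distance
 * computation used in sh_prefetch() above, in isolation: starting from the
 * faulting sl1e pointer, we can prefetch at most as many entries as remain
 * in the same shadow page, capped at PREFETCH_DISTANCE. The DEMO_ and
 * demo_ names are assumptions for this sketch only. */
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

#define DEMO_PAGE_SIZE      4096UL
#define DEMO_PAGE_MASK      (~(DEMO_PAGE_SIZE - 1))
#define PREFETCH_DISTANCE   32

typedef uint64_t demo_sl1e_t;           /* stands in for shadow_l1e_t */

static int demo_prefetch_distance(const demo_sl1e_t *ptr_sl1e)
{
    /* Entries left between ptr_sl1e and the end of its shadow page... */
    int dist = (DEMO_PAGE_SIZE - ((uintptr_t)ptr_sl1e & ~DEMO_PAGE_MASK))
               / sizeof(demo_sl1e_t);
    /* ...capped at the per-fault maximum. */
    return dist > PREFETCH_DISTANCE ? PREFETCH_DISTANCE : dist;
}

int main(void)
{
    demo_sl1e_t *sl1 = aligned_alloc(DEMO_PAGE_SIZE, DEMO_PAGE_SIZE);
    printf("%d\n", demo_prefetch_distance(&sl1[0]));   /* 512 left, capped to 32 */
    printf("%d\n", demo_prefetch_distance(&sl1[508])); /* only 4 entries left    */
    free(sl1);
    return 0;
}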
3017 /**************************************************************************/
3018 /* Entry points into the shadow code */
3020 /* Called from the pagefault handler in Xen, and from the HVM trap handlers
3021 * for pagefaults. Returns 1 if this fault was an artefact of the
3022 * shadow code (and the guest should retry) or 0 if it is not (and the
3023 * fault should be handled elsewhere or passed to the guest). */
3025 static int sh_page_fault(struct vcpu *v,
3026 unsigned long va,
3027 struct cpu_user_regs *regs)
3029 struct domain *d = v->domain;
3030 walk_t gw;
3031 gfn_t gfn;
3032 mfn_t gmfn, sl1mfn=_mfn(0);
3033 shadow_l1e_t sl1e, *ptr_sl1e;
3034 paddr_t gpa;
3035 struct sh_emulate_ctxt emul_ctxt;
3036 struct x86_emulate_ops *emul_ops;
3037 int r;
3038 fetch_type_t ft = 0;
3039 p2m_type_t p2mt;
3040 uint32_t rc;
3041 #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
3042 int fast_emul = 0;
3043 #endif
3045 SHADOW_PRINTK("d:v=%u:%u va=%#lx err=%u, rip=%lx\n",
3046 v->domain->domain_id, v->vcpu_id, va, regs->error_code,
3047 regs->rip);
3049 perfc_incr(shadow_fault);
3051 #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
3052 /* If the faulting frame was successfully emulated on the last shadow
3053 * fault, it is very likely that the same emulation action applies to
3054 * this frame as well. So try to emulate early to avoid lock acquisition.
3055 */
3056 if ( v->arch.paging.last_write_emul_ok
3057 && v->arch.paging.shadow.last_emulated_frame == (va >> PAGE_SHIFT) )
3059 /* Check whether the error code is 3 (present, write); otherwise fall
3060 * back to the normal path in case some validation is required.
3061 */
3062 if ( regs->error_code == (PFEC_write_access | PFEC_page_present) )
3064 fast_emul = 1;
3065 gmfn = _mfn(v->arch.paging.shadow.last_emulated_mfn);
3067 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
3068 /* Fall back to the slow path if we're trying to emulate
3069 writes to an out of sync page. */
3070 if ( mfn_valid(gmfn) && mfn_is_out_of_sync(gmfn) )
3072 v->arch.paging.last_write_emul_ok = 0;
3073 goto page_fault_slow_path;
3075 #endif /* OOS */
3077 perfc_incr(shadow_fault_fast_emulate);
3078 goto early_emulation;
3080 else
3081 v->arch.paging.last_write_emul_ok = 0;
3083 #endif
3085 //
3086 // XXX: Need to think about eventually mapping superpages directly in the
3087 // shadow (when possible), as opposed to splintering them into a
3088 // bunch of 4K maps.
3089 //
3091 #if (SHADOW_OPTIMIZATIONS & SHOPT_FAST_FAULT_PATH)
3092 if ( (regs->error_code & PFEC_reserved_bit) )
3094 /* The only reasons for reserved bits to be set in shadow entries
3095 * are the two "magic" shadow_l1e entries. */
3096 if ( likely((__copy_from_user(&sl1e,
3097 (sh_linear_l1_table(v)
3098 + shadow_l1_linear_offset(va)),
3099 sizeof(sl1e)) == 0)
3100 && sh_l1e_is_magic(sl1e)) )
3102 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
3103 /* First, need to check that this isn't an out-of-sync
3104 * shadow l1e. If it is, we fall back to the slow path, which
3105 * will sync it up again. */
3107 shadow_l2e_t sl2e;
3108 mfn_t gl1mfn;
3109 if ( (__copy_from_user(&sl2e,
3110 (sh_linear_l2_table(v)
3111 + shadow_l2_linear_offset(va)),
3112 sizeof(sl2e)) != 0)
3113 || !(shadow_l2e_get_flags(sl2e) & _PAGE_PRESENT)
3114 || !mfn_valid(gl1mfn = _mfn(mfn_to_shadow_page(
3115 shadow_l2e_get_mfn(sl2e))->backpointer))
3116 || unlikely(mfn_is_out_of_sync(gl1mfn)) )
3118 /* Hit the slow path as if there had been no
3119 * shadow entry at all, and let it tidy up */
3120 ASSERT(regs->error_code & PFEC_page_present);
3121 regs->error_code ^= (PFEC_reserved_bit|PFEC_page_present);
3122 goto page_fault_slow_path;
3125 #endif /* SHOPT_OUT_OF_SYNC */
3127 if ( sh_l1e_is_gnp(sl1e) )
3129 /* Not-present in a guest PT: pass to the guest as
3130 * a not-present fault (by flipping two bits). */
3131 ASSERT(regs->error_code & PFEC_page_present);
3132 regs->error_code ^= (PFEC_reserved_bit|PFEC_page_present);
3133 reset_early_unshadow(v);
3134 perfc_incr(shadow_fault_fast_gnp);
3135 SHADOW_PRINTK("fast path not-present\n");
3136 return 0;
3138 else
3140 /* Magic MMIO marker: extract gfn for MMIO address */
3141 ASSERT(sh_l1e_is_mmio(sl1e));
3142 gpa = (((paddr_t)(gfn_x(sh_l1e_mmio_get_gfn(sl1e))))
3143 << PAGE_SHIFT)
3144 | (va & ~PAGE_MASK);
3146 perfc_incr(shadow_fault_fast_mmio);
3147 SHADOW_PRINTK("fast path mmio %#"PRIpaddr"\n", gpa);
3148 reset_early_unshadow(v);
3149 return (handle_mmio_with_translation(va, gpa >> PAGE_SHIFT)
3150 ? EXCRET_fault_fixed : 0);
3152 else
3154 /* This should be exceptionally rare: another vcpu has fixed
3155 * the tables between the fault and our reading the l1e.
3156 * Retry and let the hardware give us the right fault next time. */
3157 perfc_incr(shadow_fault_fast_fail);
3158 SHADOW_PRINTK("fast path false alarm!\n");
3159 return EXCRET_fault_fixed;
3163 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
3164 page_fault_slow_path:
3165 #endif
3166 #endif /* SHOPT_FAST_FAULT_PATH */
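/* [Editorial sketch -- not part of multi.c] The error-code trick used in
 * the fast path above: a fault on a "magic" shadow entry arrives with both
 * the reserved-bit and present bits set, and XORing with
 * (PFEC_reserved_bit | PFEC_page_present) clears both at once, turning it
 * into the ordinary not-present fault the guest expects. The bit values
 * match the x86 page-fault error code; the DEMO_ names are assumptions for
 * this sketch only. */
#include <stdio.h>

#define DEMO_PFEC_page_present  (1u << 0)
#define DEMO_PFEC_write_access  (1u << 1)
#define DEMO_PFEC_reserved_bit  (1u << 3)

int main(void)
{
    unsigned int ec = DEMO_PFEC_page_present | DEMO_PFEC_write_access |
                      DEMO_PFEC_reserved_bit;       /* as delivered to Xen */
    ec ^= (DEMO_PFEC_reserved_bit | DEMO_PFEC_page_present);
    printf("error code now %#x\n", ec);  /* just the write bit remains: a
                                            clean not-present write fault */
    return 0;
}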
3168 /* Detect if this page fault happened while we were already in Xen
3169 * doing a shadow operation. If that happens, the only thing we can
3170 * do is let Xen's normal fault handlers try to fix it. In any case,
3171 * a diagnostic trace of the fault will be more useful than
3172 * a BUG() when we try to take the lock again. */
3173 if ( unlikely(shadow_locked_by_me(d)) )
3175 SHADOW_ERROR("Recursive shadow fault: lock was taken by %s\n",
3176 d->arch.paging.shadow.locker_function);
3177 return 0;
3180 rewalk:
3181 rc = guest_walk_tables(v, va, &gw, regs->error_code);
3183 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
3184 regs->error_code &= ~PFEC_page_present;
3185 if ( !(rc & _PAGE_PRESENT) )
3186 regs->error_code |= PFEC_page_present;
3187 #endif
3189 if ( rc != 0 )
3191 perfc_incr(shadow_fault_bail_real_fault);
3192 SHADOW_PRINTK("not a shadow fault\n");
3193 reset_early_unshadow(v);
3194 return 0;
3197 /* It's possible that the guest has put pagetables in memory that it has
3198 * already used for some special purpose (ioreq pages, or granted pages).
3199 * If that happens we'll have killed the guest already but it's still not
3200 * safe to propagate entries out of the guest PT so get out now. */
3201 if ( unlikely(d->is_shutting_down) )
3203 SHADOW_PRINTK("guest is shutting down\n");
3204 return 0;
3207 /* What kind of access are we dealing with? */
3208 ft = ((regs->error_code & PFEC_write_access)
3209 ? ft_demand_write : ft_demand_read);
3211 /* What mfn is the guest trying to access? */
3212 gfn = guest_l1e_get_gfn(gw.l1e);
3213 gmfn = gfn_to_mfn(d, gfn, &p2mt);
3215 if ( shadow_mode_refcounts(d) &&
3216 (!p2m_is_valid(p2mt) || (!p2m_is_mmio(p2mt) && !mfn_valid(gmfn))) )
3218 perfc_incr(shadow_fault_bail_bad_gfn);
3219 SHADOW_PRINTK("BAD gfn=%"SH_PRI_gfn" gmfn=%"PRI_mfn"\n",
3220 gfn_x(gfn), mfn_x(gmfn));
3221 reset_early_unshadow(v);
3222 return 0;
3225 #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
3226 /* Remember this successful VA->GFN translation for later. */
3227 vtlb_insert(v, va >> PAGE_SHIFT, gfn_x(gfn),
3228 regs->error_code | PFEC_page_present);
3229 #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */
3231 shadow_lock(d);
3233 rc = gw_remove_write_accesses(v, va, &gw);
3235 /* First bit set: Removed write access to a page. */
3236 if ( rc & GW_RMWR_FLUSHTLB )
3238 /* Write permission removal is also a hint that other gwalks
3239 * overlapping with this one may be inconsistent
3240 */
3241 perfc_incr(shadow_rm_write_flush_tlb);
3242 atomic_inc(&d->arch.paging.shadow.gtable_dirty_version);
3243 flush_tlb_mask(d->domain_dirty_cpumask);
3246 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
3247 /* Second bit set: Resynced a page. Re-walk needed. */
3248 if ( rc & GW_RMWR_REWALK )
3250 shadow_unlock(d);
3251 goto rewalk;
3253 #endif /* OOS */
3255 if ( !shadow_check_gwalk(v, va, &gw) )
3257 perfc_incr(shadow_inconsistent_gwalk);
3258 shadow_unlock(d);
3259 goto rewalk;
3262 shadow_audit_tables(v);
3263 sh_audit_gw(v, &gw);
3265 /* Make sure there is enough free shadow memory to build a chain of
3266 * shadow tables. (We never allocate a top-level shadow on this path,
3267 * only a 32b l1, pae l1, or 64b l3+2+1. Note that while
3268 * SH_type_l1_shadow isn't correct in the latter case, all page
3269 * tables are the same size there.) */
3270 shadow_prealloc(d,
3271 SH_type_l1_shadow,
3272 GUEST_PAGING_LEVELS < 4 ? 1 : GUEST_PAGING_LEVELS - 1);
3274 /* Acquire the shadow. This must happen before we figure out the rights
3275 * for the shadow entry, since we might promote a page here. */
3276 ptr_sl1e = shadow_get_and_create_l1e(v, &gw, &sl1mfn, ft);
3277 if ( unlikely(ptr_sl1e == NULL) )
3279 /* Couldn't get the sl1e! Since we know the guest entries
3280 * are OK, this can only have been caused by a failed
3281 * shadow_set_l*e(), which will have crashed the guest.
3282 * Get out of the fault handler immediately. */
3283 ASSERT(d->is_shutting_down);
3284 shadow_unlock(d);
3285 return 0;
3288 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
3289 /* Always unsync when writing to L1 page tables. */
3290 if ( sh_mfn_is_a_page_table(gmfn)
3291 && ft == ft_demand_write )
3292 sh_unsync(v, gmfn);
3294 if ( unlikely(d->is_shutting_down) )
3296 /* We might end up with a crashed domain here if
3297 * sh_remove_shadows() in a previous sh_resync() call has
3298 * failed. We cannot safely continue since some page is still
3299 * OOS but not in the hash table anymore. */
3300 shadow_unlock(d);
3301 return 0;
3303 #endif /* OOS */
3305 /* Calculate the shadow entry and write it */
3306 l1e_propagate_from_guest(v, gw.l1e, gmfn, &sl1e, ft, p2mt);
3307 r = shadow_set_l1e(v, ptr_sl1e, sl1e, sl1mfn);
3309 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
3310 if ( mfn_valid(gw.l1mfn)
3311 && mfn_is_out_of_sync(gw.l1mfn) )
3313 /* Update the OOS snapshot. */
3314 mfn_t snpmfn = oos_snapshot_lookup(v, gw.l1mfn);
3315 guest_l1e_t *snp;
3317 ASSERT(mfn_valid(snpmfn));
3319 snp = sh_map_domain_page(snpmfn);
3320 snp[guest_l1_table_offset(va)] = gw.l1e;
3321 sh_unmap_domain_page(snp);
3323 #endif /* OOS */
3325 #if SHADOW_OPTIMIZATIONS & SHOPT_PREFETCH
3326 /* Prefetch some more shadow entries */
3327 sh_prefetch(v, &gw, ptr_sl1e, sl1mfn);
3328 #endif
3330 /* Need to emulate accesses to page tables */
3331 if ( sh_mfn_is_a_page_table(gmfn)
3332 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
3333 /* Unless they've been allowed to go out of sync with their
3334 shadows and we don't need to unshadow it. */
3335 && !(mfn_is_out_of_sync(gmfn)
3336 && !(regs->error_code & PFEC_user_mode))
3337 #endif
3340 if ( ft == ft_demand_write )
3342 perfc_incr(shadow_fault_emulate_write);
3343 goto emulate;
3345 else if ( shadow_mode_trap_reads(d) && ft == ft_demand_read )
3347 perfc_incr(shadow_fault_emulate_read);
3348 goto emulate;
3352 /* Need to hand off device-model MMIO to the device model */
3353 if ( p2mt == p2m_mmio_dm )
3355 gpa = guest_walk_to_gpa(&gw);
3356 goto mmio;
3359 /* Log attempts to write to read-only memory */
3360 if ( (p2mt == p2m_ram_ro) && (ft == ft_demand_write) )
3362 static unsigned long lastpage = 0;
3363 if ( xchg(&lastpage, va & PAGE_MASK) != (va & PAGE_MASK) )
3364 gdprintk(XENLOG_DEBUG, "guest attempted write to read-only memory"
3365 " page. va page=%#lx, mfn=%#lx\n",
3366 va & PAGE_MASK, mfn_x(gmfn));
3367 goto emulate_readonly; /* skip over the instruction */
3370 /* In HVM guests, we force CR0.WP always to be set, so that the
3371 * pagetables are always write-protected. If the guest thinks
3372 * CR0.WP is clear, we must emulate faulting supervisor writes to
3373 * allow the guest to write through read-only PTEs. Emulate if the
3374 * fault was a non-user write to a present page. */
3375 if ( is_hvm_domain(d)
3376 && unlikely(!hvm_wp_enabled(v))
3377 && regs->error_code == (PFEC_write_access|PFEC_page_present) )
3379 perfc_incr(shadow_fault_emulate_wp);
3380 goto emulate;
3383 perfc_incr(shadow_fault_fixed);
3384 d->arch.paging.log_dirty.fault_count++;
3385 reset_early_unshadow(v);
3387 done:
3388 sh_audit_gw(v, &gw);
3389 SHADOW_PRINTK("fixed\n");
3390 shadow_audit_tables(v);
3391 shadow_unlock(d);
3392 return EXCRET_fault_fixed;
3394 emulate:
3395 if ( !shadow_mode_refcounts(d) || !guest_mode(regs) )
3396 goto not_a_shadow_fault;
3398 /*
3399 * We do not emulate user writes. Instead we use them as a hint that the
3400 * page is no longer a page table. This behaviour differs from native, but
3401 * it seems very unlikely that any OS grants user access to page tables.
3402 */
3403 if ( (regs->error_code & PFEC_user_mode) )
3405 SHADOW_PRINTK("user-mode fault to PT, unshadowing mfn %#lx\n",
3406 mfn_x(gmfn));
3407 perfc_incr(shadow_fault_emulate_failed);
3408 sh_remove_shadows(v, gmfn, 0 /* thorough */, 1 /* must succeed */);
3409 goto done;
3412 /*
3413 * A write from userspace to read-only memory needs to jump here to avoid
3414 * getting caught by the user-mode page-table check above.
3415 */
3416 emulate_readonly:
3417 /*
3418 * We don't need to hold the lock for the whole emulation; we will
3419 * take it again when we write to the pagetables.
3420 */
3421 sh_audit_gw(v, &gw);
3422 shadow_audit_tables(v);
3423 shadow_unlock(d);
3425 #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
3426 early_emulation:
3427 #endif
3428 if ( is_hvm_domain(d) )
3430 /*
3431 * If we are in the middle of injecting an exception or interrupt then
3432 * we should not emulate: it is not the instruction at %eip that caused
3433 * the fault. Furthermore it is almost certainly the case that the
3434 * handler stack is currently considered to be a page table, so we should
3435 * unshadow the faulting page before exiting.
3436 */
3437 if ( unlikely(hvm_event_pending(v)) )
3439 #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
3440 if ( fast_emul )
3442 perfc_incr(shadow_fault_fast_emulate_fail);
3443 v->arch.paging.last_write_emul_ok = 0;
3445 #endif
3446 gdprintk(XENLOG_DEBUG, "write to pagetable during event "
3447 "injection: cr2=%#lx, mfn=%#lx\n",
3448 va, mfn_x(gmfn));
3449 sh_remove_shadows(v, gmfn, 0 /* thorough */, 1 /* must succeed */);
3450 return EXCRET_fault_fixed;
3454 SHADOW_PRINTK("emulate: eip=%#lx esp=%#lx\n",
3455 (unsigned long)regs->eip, (unsigned long)regs->esp);
3457 emul_ops = shadow_init_emulation(&emul_ctxt, regs);
3459 r = x86_emulate(&emul_ctxt.ctxt, emul_ops);
3461 /*
3462 * NB. We do not unshadow on X86EMUL_EXCEPTION. It's not clear that it
3463 * would be a good unshadow hint. If we *do* decide to unshadow-on-fault
3464 * then it must be 'failable': we cannot require the unshadow to succeed.
3465 */
3466 if ( r == X86EMUL_UNHANDLEABLE )
3468 perfc_incr(shadow_fault_emulate_failed);
3469 #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
3470 if ( fast_emul )
3472 perfc_incr(shadow_fault_fast_emulate_fail);
3473 v->arch.paging.last_write_emul_ok = 0;
3475 #endif
3476 SHADOW_PRINTK("emulator failure, unshadowing mfn %#lx\n",
3477 mfn_x(gmfn));
3478 /* If this is actually a page table, then we have a bug, and need
3479 * to support more operations in the emulator. More likely,
3480 * though, this is a hint that this page should not be shadowed. */
3481 shadow_remove_all_shadows(v, gmfn);
3484 #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
3485 /* Record the successful emulation as a heuristic to accelerate the
3486 * next fault on the same frame. But be careful to check that the page
3487 * is still a page table: the unshadow triggered by write emulation
3488 * normally requires a re-sync with the guest page table to recover
3489 * r/w permission. Recording the wrong state in that case would cause
3490 * unexpected extra shadow faults, because propagation would be
3491 * skipped.
3492 */
3493 if ( (r == X86EMUL_OKAY) && sh_mfn_is_a_page_table(gmfn) )
3495 if ( !fast_emul )
3497 v->arch.paging.shadow.last_emulated_frame = va >> PAGE_SHIFT;
3498 v->arch.paging.shadow.last_emulated_mfn = mfn_x(gmfn);
3499 v->arch.paging.last_write_emul_ok = 1;
3502 else if ( fast_emul )
3503 v->arch.paging.last_write_emul_ok = 0;
3504 #endif
3506 #if GUEST_PAGING_LEVELS == 3 /* PAE guest */
3507 if ( r == X86EMUL_OKAY ) {
3508 int i;
3509 /* Emulate up to four extra instructions in the hope of catching
3510 * the "second half" of a 64-bit pagetable write. */
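/* (Illustrative only, not taken from any particular guest: a 32-bit PAE
 * kernel commonly updates a 64-bit pte with two adjacent 32-bit stores,
 * something like
 *     mov %eax, (%edi)       ; one half of the new pte
 *     mov %edx, 4(%edi)      ; the other half
 * The first store is the write we have just emulated; continuing
 * emulation here lets us pick up the second store without taking a
 * second shadow fault.) */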
3511 for ( i = 0 ; i < 4 ; i++ )
3513 shadow_continue_emulation(&emul_ctxt, regs);
3514 v->arch.paging.last_write_was_pt = 0;
3515 r = x86_emulate(&emul_ctxt.ctxt, emul_ops);
3516 if ( r == X86EMUL_OKAY )
3518 if ( v->arch.paging.last_write_was_pt )
3520 perfc_incr(shadow_em_ex_pt);
3521 break; /* Don't emulate past the other half of the write */
3523 else
3524 perfc_incr(shadow_em_ex_non_pt);
3526 else
3528 perfc_incr(shadow_em_ex_fail);
3529 break; /* Don't emulate again if we failed! */
3533 #endif /* PAE guest */
3535 SHADOW_PRINTK("emulated\n");
3536 return EXCRET_fault_fixed;
3538 mmio:
3539 if ( !guest_mode(regs) )
3540 goto not_a_shadow_fault;
3541 perfc_incr(shadow_fault_mmio);
3542 sh_audit_gw(v, &gw);
3543 SHADOW_PRINTK("mmio %#"PRIpaddr"\n", gpa);
3544 shadow_audit_tables(v);
3545 reset_early_unshadow(v);
3546 shadow_unlock(d);
3547 return (handle_mmio_with_translation(va, gpa >> PAGE_SHIFT)
3548 ? EXCRET_fault_fixed : 0);
3550 not_a_shadow_fault:
3551 sh_audit_gw(v, &gw);
3552 SHADOW_PRINTK("not a shadow fault\n");
3553 shadow_audit_tables(v);
3554 reset_early_unshadow(v);
3555 shadow_unlock(d);
3556 return 0;
3560 static int
3561 sh_invlpg(struct vcpu *v, unsigned long va)
3562 /* Called when the guest requests an invlpg. Returns 1 if the invlpg
3563 * instruction should be issued on the hardware, or 0 if it's safe not
3564 * to do so. */
3566 mfn_t sl1mfn;
3567 shadow_l2e_t sl2e;
3569 perfc_incr(shadow_invlpg);
3571 #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
3572 /* No longer safe to use cached gva->gfn translations */
3573 vtlb_flush(v);
3574 #endif
3576 #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
3577 v->arch.paging.last_write_emul_ok = 0;
3578 #endif
3580 /* First check that we can safely read the shadow l2e. On SMP/PAE Linux,
3581 * as many as 6% of invlpg calls can arrive before we have shadowed
3582 * the l2. */
3583 #if SHADOW_PAGING_LEVELS == 4
3585 shadow_l3e_t sl3e;
3586 if ( !(shadow_l4e_get_flags(
3587 sh_linear_l4_table(v)[shadow_l4_linear_offset(va)])
3588 & _PAGE_PRESENT) )
3589 return 0;
3590 /* This must still be a copy-from-user because we don't have the
3591 * shadow lock, and the higher-level shadows might disappear
3592 * under our feet. */
3593 if ( __copy_from_user(&sl3e, (sh_linear_l3_table(v)
3594 + shadow_l3_linear_offset(va)),
3595 sizeof (sl3e)) != 0 )
3597 perfc_incr(shadow_invlpg_fault);
3598 return 0;
3600 if ( !(shadow_l3e_get_flags(sl3e) & _PAGE_PRESENT) )
3601 return 0;
3603 #else /* SHADOW_PAGING_LEVELS == 3 */
3604 if ( !(l3e_get_flags(v->arch.paging.shadow.l3table[shadow_l3_linear_offset(va)])
3605 & _PAGE_PRESENT) )
3606 // no need to flush anything if there's no SL2...
3607 return 0;
3608 #endif
3610 /* This must still be a copy-from-user because we don't have the shadow
3611 * lock, and the higher-level shadows might disappear under our feet. */
3612 if ( __copy_from_user(&sl2e,
3613 sh_linear_l2_table(v) + shadow_l2_linear_offset(va),
3614 sizeof (sl2e)) != 0 )
3616 perfc_incr(shadow_invlpg_fault);
3617 return 0;
3620 // If there's nothing shadowed for this particular sl2e, then
3621 // there is no need to do an invlpg, either...
3622 //
3623 if ( !(shadow_l2e_get_flags(sl2e) & _PAGE_PRESENT) )
3624 return 0;
3626 // Check to see if the SL2 is a splintered superpage...
3627 // If so, then we'll need to flush the entire TLB (because that's
3628 // easier than invalidating all of the individual 4K pages).
3629 //
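// (As far as this code is concerned, a fl1 shadow backs a guest superpage
// mapping: there is no real guest l1 behind it, just the 2MB/4MB mapping
// splintered into individual 4k shadow l1es, so there is no single l1e
// we could cheaply invalidate here.)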
3630 sl1mfn = shadow_l2e_get_mfn(sl2e);
3631 if ( mfn_to_shadow_page(sl1mfn)->type
3632 == SH_type_fl1_shadow )
3634 flush_tlb_local();
3635 return 0;
3638 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
3639 /* Check to see if the SL1 is out of sync. */
3641 mfn_t gl1mfn = _mfn(mfn_to_shadow_page(sl1mfn)->backpointer);
3642 struct page_info *pg = mfn_to_page(gl1mfn);
3643 if ( mfn_valid(gl1mfn)
3644 && page_is_out_of_sync(pg) )
3646 /* The test above may give false positives, since we don't
3647 * hold the shadow lock yet. Check again with the lock held. */
3648 shadow_lock(v->domain);
3650 /* This must still be a copy-from-user because we didn't
3651 * have the shadow lock last time we checked, and the
3652 * higher-level shadows might have disappeared under our
3653 * feet. */
3654 if ( __copy_from_user(&sl2e,
3655 sh_linear_l2_table(v)
3656 + shadow_l2_linear_offset(va),
3657 sizeof (sl2e)) != 0 )
3659 perfc_incr(shadow_invlpg_fault);
3660 shadow_unlock(v->domain);
3661 return 0;
3664 if ( !(shadow_l2e_get_flags(sl2e) & _PAGE_PRESENT) )
3666 shadow_unlock(v->domain);
3667 return 0;
3670 sl1mfn = shadow_l2e_get_mfn(sl2e);
3671 gl1mfn = _mfn(mfn_to_shadow_page(sl1mfn)->backpointer);
3672 pg = mfn_to_page(gl1mfn);
3674 if ( likely(sh_mfn_is_a_page_table(gl1mfn)
3675 && page_is_out_of_sync(pg) ) )
3677 shadow_l1e_t *sl1;
3678 sl1 = sh_linear_l1_table(v) + shadow_l1_linear_offset(va);
3679 /* Remove the shadow entry that maps this VA */
3680 (void) shadow_set_l1e(v, sl1, shadow_l1e_empty(), sl1mfn);
3682 shadow_unlock(v->domain);
3684 /* Need the invlpg, to pick up the disappearance of the sl1e */
3684 return 1;
3687 #endif
3689 return 1;
3693 static unsigned long
3694 sh_gva_to_gfn(struct vcpu *v, unsigned long va, uint32_t *pfec)
3695 /* Called to translate a guest virtual address to what the *guest*
3696 * pagetables would map it to. */
3698 walk_t gw;
3699 gfn_t gfn;
3701 #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
3702 /* Check the vTLB cache first */
3703 unsigned long vtlb_gfn = vtlb_lookup(v, va, pfec[0]);
3704 if ( VALID_GFN(vtlb_gfn) )
3705 return vtlb_gfn;
3706 #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */
3708 if ( guest_walk_tables(v, va, &gw, pfec[0]) != 0 )
3710 if ( !(guest_l1e_get_flags(gw.l1e) & _PAGE_PRESENT) )
3711 pfec[0] &= ~PFEC_page_present;
3712 return INVALID_GFN;
3714 gfn = guest_walk_to_gfn(&gw);
3716 #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
3717 /* Remember this successful VA->GFN translation for later. */
3718 vtlb_insert(v, va >> PAGE_SHIFT, gfn_x(gfn), pfec[0]);
3719 #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */
3721 return gfn_x(gfn);
3725 static inline void
3726 sh_update_linear_entries(struct vcpu *v)
3727 /* Sync up all the linear mappings for this vcpu's pagetables */
3729 struct domain *d = v->domain;
3731 /* Linear pagetables in PV guests
3732 * ------------------------------
3734 * Guest linear pagetables, which map the guest pages, are at
3735 * LINEAR_PT_VIRT_START. Shadow linear pagetables, which map the
3736 * shadows, are at SH_LINEAR_PT_VIRT_START. Most of the time these
3737 * are set up at shadow creation time, but (of course!) the PAE case
3738 * is subtler. Normal linear mappings are made by having an entry
3739 * in the top-level table that points to itself (shadow linear) or
3740 * to the guest top-level table (guest linear). For PAE, to set up
3741 * a linear map requires us to copy the four top-level entries into
3742 * level-2 entries. That means that every time we change a PAE l3e,
3743 * we need to reflect the change into the copy.
3745 * Linear pagetables in HVM guests
3746 * -------------------------------
3748 * For HVM guests, the linear pagetables are installed in the monitor
3749 * tables (since we can't put them in the shadow). Shadow linear
3750 * pagetables, which map the shadows, are at SH_LINEAR_PT_VIRT_START,
3751 * and we use the linear pagetable slot at LINEAR_PT_VIRT_START for
3752 * a linear pagetable of the monitor tables themselves. We have
3753 * the same issue of having to re-copy PAE l3 entries whenever we use
3754 * PAE shadows.
3756 * Because HVM guests run on the same monitor tables regardless of the
3757 * shadow tables in use, the linear mapping of the shadow tables has to
3758 * be updated every time v->arch.shadow_table changes.
3759 */
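/* Illustrative sketch only (hypothetical 'va'; not code in this function):
 * once the self-referencing slot is installed, the shadow l1e covering a
 * guest virtual address is reachable by plain pointer arithmetic, e.g.
 *
 *     shadow_l1e_t *sl1p =
 *         sh_linear_l1_table(v) + shadow_l1_linear_offset(va);
 *
 * which is how the fault handler and sh_guess_wrmap() below read and
 * write shadow entries without mapping each level explicitly. */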
3761 /* Don't try to update the monitor table if it doesn't exist */
3762 if ( shadow_mode_external(d)
3763 && pagetable_get_pfn(v->arch.monitor_table) == 0 )
3764 return;
3766 #if (CONFIG_PAGING_LEVELS == 4) && (SHADOW_PAGING_LEVELS == 4)
3768 /* For PV, one l4e points at the guest l4, one points at the shadow
3769 * l4. No maintenance required.
3770 * For HVM, just need to update the l4e that points to the shadow l4. */
3772 if ( shadow_mode_external(d) )
3774 /* Use the linear map if we can; otherwise make a new mapping */
3775 if ( v == current )
3777 __linear_l4_table[l4_linear_offset(SH_LINEAR_PT_VIRT_START)] =
3778 l4e_from_pfn(pagetable_get_pfn(v->arch.shadow_table[0]),
3779 __PAGE_HYPERVISOR);
3781 else
3783 l4_pgentry_t *ml4e;
3784 ml4e = sh_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
3785 ml4e[l4_table_offset(SH_LINEAR_PT_VIRT_START)] =
3786 l4e_from_pfn(pagetable_get_pfn(v->arch.shadow_table[0]),
3787 __PAGE_HYPERVISOR);
3788 sh_unmap_domain_page(ml4e);
3792 #elif (CONFIG_PAGING_LEVELS == 4) && (SHADOW_PAGING_LEVELS == 3)
3794 /* PV: XXX
3796 * HVM: To give ourselves a linear map of the shadows, we need to
3797 * extend a PAE shadow to 4 levels. We do this by having a monitor
3798 * l3 in slot 0 of the monitor l4 table, and copying the PAE l3
3799 * entries into it. Then, by having the monitor l4e for shadow
3800 * pagetables also point to the monitor l4, we can use it to access
3801 * the shadows.
3802 */
3804 if ( shadow_mode_external(d) )
3806 /* Install copies of the shadow l3es into the monitor l2 table
3807 * that maps SH_LINEAR_PT_VIRT_START. */
3808 shadow_l3e_t *sl3e;
3809 l2_pgentry_t *ml2e;
3810 int i;
3812 /* Use linear mappings if we can; otherwise make new mappings */
3813 if ( v == current )
3814 ml2e = __linear_l2_table
3815 + l2_linear_offset(SH_LINEAR_PT_VIRT_START);
3816 else
3818 mfn_t l3mfn, l2mfn;
3819 l4_pgentry_t *ml4e;
3820 l3_pgentry_t *ml3e;
3821 int linear_slot = shadow_l4_table_offset(SH_LINEAR_PT_VIRT_START);
3822 ml4e = sh_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
3824 ASSERT(l4e_get_flags(ml4e[linear_slot]) & _PAGE_PRESENT);
3825 l3mfn = _mfn(l4e_get_pfn(ml4e[linear_slot]));
3826 ml3e = sh_map_domain_page(l3mfn);
3827 sh_unmap_domain_page(ml4e);
3829 ASSERT(l3e_get_flags(ml3e[0]) & _PAGE_PRESENT);
3830 l2mfn = _mfn(l3e_get_pfn(ml3e[0]));
3831 ml2e = sh_map_domain_page(l2mfn);
3832 sh_unmap_domain_page(ml3e);
3835 /* Shadow l3 tables are built by sh_update_cr3 */
3836 sl3e = v->arch.paging.shadow.l3table;
3838 for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
3840 ml2e[i] =
3841 (shadow_l3e_get_flags(sl3e[i]) & _PAGE_PRESENT)
3842 ? l2e_from_pfn(mfn_x(shadow_l3e_get_mfn(sl3e[i])),
3843 __PAGE_HYPERVISOR)
3844 : l2e_empty();
3847 if ( v != current )
3848 sh_unmap_domain_page(ml2e);
3850 else
3851 domain_crash(d); /* XXX */
3853 #elif CONFIG_PAGING_LEVELS == 3
3855 /* PV: need to copy the guest's l3 entries into the guest-linear-map l2
3856 * entries in the shadow, and the shadow's l3 entries into the
3857 * shadow-linear-map l2 entries in the shadow. This is safe to do
3858 * because Xen does not let guests share high-slot l2 tables between l3s,
3859 * so we know we're not treading on anyone's toes.
3861 * HVM: need to copy the shadow's l3 entries into the
3862 * shadow-linear-map l2 entries in the monitor table. This is safe
3863 * because we have one monitor table for each vcpu. The monitor's
3864 * own l3es don't need to be copied because they never change.
3865 * XXX That might change if we start stuffing things into the rest
3866 * of the monitor's virtual address space.
3867 */
3869 l2_pgentry_t *l2e, new_l2e;
3870 shadow_l3e_t *guest_l3e = NULL, *shadow_l3e;
3871 int i;
3872 int unmap_l2e = 0;
3874 #if GUEST_PAGING_LEVELS == 2
3876 /* Shadow l3 tables were built by sh_update_cr3 */
3877 BUG_ON(!shadow_mode_external(d)); /* PV 2-on-3 is unsupported */
3878 shadow_l3e = (shadow_l3e_t *)&v->arch.paging.shadow.l3table;
3880 #else /* GUEST_PAGING_LEVELS == 3 */
3882 shadow_l3e = (shadow_l3e_t *)&v->arch.paging.shadow.l3table;
3883 guest_l3e = (guest_l3e_t *)&v->arch.paging.shadow.gl3e;
3885 #endif /* GUEST_PAGING_LEVELS */
3887 /* Choose where to write the entries, using linear maps if possible */
3888 if ( shadow_mode_external(d) )
3890 if ( v == current )
3892 /* From the monitor tables, it's safe to use linear maps
3893 * to update monitor l2s */
3894 l2e = __linear_l2_table + (3 * L2_PAGETABLE_ENTRIES);
3896 else
3898 /* Map the monitor table's high l2 */
3899 l3_pgentry_t *l3e;
3900 l3e = sh_map_domain_page(
3901 pagetable_get_mfn(v->arch.monitor_table));
3902 ASSERT(l3e_get_flags(l3e[3]) & _PAGE_PRESENT);
3903 l2e = sh_map_domain_page(_mfn(l3e_get_pfn(l3e[3])));
3904 unmap_l2e = 1;
3905 sh_unmap_domain_page(l3e);
3908 else
3910 /* Map the shadow table's high l2 */
3911 ASSERT(shadow_l3e_get_flags(shadow_l3e[3]) & _PAGE_PRESENT);
3912 l2e = sh_map_domain_page(shadow_l3e_get_mfn(shadow_l3e[3]));
3913 unmap_l2e = 1;
3916 /* Write linear mapping of guest (only in PV, and only when
3917 * not translated). */
3918 if ( !shadow_mode_translate(d) )
3920 for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
3922 new_l2e =
3923 ((shadow_l3e_get_flags(guest_l3e[i]) & _PAGE_PRESENT)
3924 ? l2e_from_pfn(mfn_x(shadow_l3e_get_mfn(guest_l3e[i])),
3925 __PAGE_HYPERVISOR)
3926 : l2e_empty());
3927 safe_write_entry(
3928 &l2e[l2_table_offset(LINEAR_PT_VIRT_START) + i],
3929 &new_l2e);
3933 /* Write linear mapping of shadow. */
3934 for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
3936 new_l2e = (shadow_l3e_get_flags(shadow_l3e[i]) & _PAGE_PRESENT)
3937 ? l2e_from_pfn(mfn_x(shadow_l3e_get_mfn(shadow_l3e[i])),
3938 __PAGE_HYPERVISOR)
3939 : l2e_empty();
3940 safe_write_entry(
3941 &l2e[l2_table_offset(SH_LINEAR_PT_VIRT_START) + i],
3942 &new_l2e);
3945 if ( unmap_l2e )
3946 sh_unmap_domain_page(l2e);
3949 #else
3950 #error this should not happen
3951 #endif
3953 if ( shadow_mode_external(d) )
3955 /*
3956 * Having modified the linear pagetable mapping, flush local host TLBs.
3957 * This was not needed when vmenter/vmexit always had the side effect
3958 * of flushing host TLBs but, with ASIDs, it is possible to finish
3959 * this CR3 update, vmenter the guest, vmexit due to a page fault,
3960 * without an intervening host TLB flush. Then the page fault code
3961 * could use the linear pagetable to read a top-level shadow page
3962 * table entry. But, without this change, it would fetch the wrong
3963 * value due to a stale TLB.
3964 */
3965 flush_tlb_local();
3970 /* Removes vcpu->arch.paging.shadow.guest_vtable and vcpu->arch.shadow_table[].
3971 * Does all appropriate management/bookkeeping/refcounting/etc...
3972 */
3973 static void
3974 sh_detach_old_tables(struct vcpu *v)
3976 mfn_t smfn;
3977 int i = 0;
3979 ////
3980 //// vcpu->arch.paging.shadow.guest_vtable
3981 ////
3983 #if GUEST_PAGING_LEVELS == 3
3984 /* PAE guests don't have a mapping of the guest top-level table */
3985 ASSERT(v->arch.paging.shadow.guest_vtable == NULL);
3986 #else
3987 if ( v->arch.paging.shadow.guest_vtable )
3989 struct domain *d = v->domain;
3990 if ( shadow_mode_external(d) || shadow_mode_translate(d) )
3991 sh_unmap_domain_page_global(v->arch.paging.shadow.guest_vtable);
3992 v->arch.paging.shadow.guest_vtable = NULL;
3994 #endif
3997 ////
3998 //// vcpu->arch.shadow_table[]
3999 ////
4001 #if GUEST_PAGING_LEVELS == 3
4002 /* PAE guests have four shadow_table entries */
4003 for ( i = 0 ; i < 4 ; i++ )
4004 #endif
4006 smfn = pagetable_get_mfn(v->arch.shadow_table[i]);
4007 if ( mfn_x(smfn) )
4008 sh_put_ref(v, smfn, 0);
4009 v->arch.shadow_table[i] = pagetable_null();
4013 /* Set up the top-level shadow and install it in slot 'slot' of shadow_table */
4014 static void
4015 sh_set_toplevel_shadow(struct vcpu *v,
4016 int slot,
4017 mfn_t gmfn,
4018 unsigned int root_type)
4020 mfn_t smfn;
4021 pagetable_t old_entry, new_entry;
4023 struct domain *d = v->domain;
4025 /* Remember the old contents of this slot */
4026 old_entry = v->arch.shadow_table[slot];
4028 /* Now figure out the new contents: is this a valid guest MFN? */
4029 if ( !mfn_valid(gmfn) )
4031 new_entry = pagetable_null();
4032 goto install_new_entry;
4035 /* Guest mfn is valid: shadow it and install the shadow */
4036 smfn = get_shadow_status(v, gmfn, root_type);
4037 if ( !mfn_valid(smfn) )
4039 /* Make sure there's enough free shadow memory. */
4040 shadow_prealloc(d, root_type, 1);
4041 /* Shadow the page. */
4042 smfn = sh_make_shadow(v, gmfn, root_type);
4044 ASSERT(mfn_valid(smfn));
4046 /* Pin the shadow and put it (back) on the list of pinned shadows */
4047 if ( sh_pin(v, smfn) == 0 )
4049 SHADOW_ERROR("can't pin %#lx as toplevel shadow\n", mfn_x(smfn));
4050 domain_crash(v->domain);
4053 /* Take a ref to this page: it will be released in sh_detach_old_tables()
4054 * or the next call to sh_set_toplevel_shadow() */
4055 if ( !sh_get_ref(v, smfn, 0) )
4057 SHADOW_ERROR("can't install %#lx as toplevel shadow\n", mfn_x(smfn));
4058 domain_crash(v->domain);
4061 new_entry = pagetable_from_mfn(smfn);
4063 install_new_entry:
4064 /* Done. Install it */
4065 SHADOW_PRINTK("%u/%u [%u] gmfn %#"PRI_mfn" smfn %#"PRI_mfn"\n",
4066 GUEST_PAGING_LEVELS, SHADOW_PAGING_LEVELS, slot,
4067 mfn_x(gmfn), mfn_x(pagetable_get_mfn(new_entry)));
4068 v->arch.shadow_table[slot] = new_entry;
4070 /* Decrement the refcount of the old contents of this slot */
4071 if ( !pagetable_is_null(old_entry) ) {
4072 mfn_t old_smfn = pagetable_get_mfn(old_entry);
4073 /* Need to repin the old toplevel shadow if it's been unpinned
4074 * by shadow_prealloc(): in PV mode we're still running on this
4075 * shadow and it's not safe to free it yet. */
4076 if ( !mfn_to_shadow_page(old_smfn)->pinned && !sh_pin(v, old_smfn) )
4078 SHADOW_ERROR("can't re-pin %#lx\n", mfn_x(old_smfn));
4079 domain_crash(v->domain);
4081 sh_put_ref(v, old_smfn, 0);
4086 static void
4087 sh_update_cr3(struct vcpu *v, int do_locking)
4088 /* Updates vcpu->arch.cr3 after the guest has changed CR3.
4089 * Paravirtual guests should set v->arch.guest_table (and guest_table_user,
4090 * if appropriate).
4091 * HVM guests should also make sure hvm_get_guest_cntl_reg(v, 3) works;
4092 * this function will call hvm_update_guest_cr(v, 3) to tell them where the
4093 * shadow tables are.
4094 * If do_locking != 0, assume we are being called from outside the
4095 * shadow code, and must take and release the shadow lock; otherwise
4096 * that is the caller's responsibility.
4097 */
4099 struct domain *d = v->domain;
4100 mfn_t gmfn;
4101 #if GUEST_PAGING_LEVELS == 3
4102 guest_l3e_t *gl3e;
4103 u32 guest_idx=0;
4104 int i;
4105 #endif
4107 /* Don't do anything on an uninitialised vcpu */
4108 if ( !is_hvm_domain(d) && !v->is_initialised )
4110 ASSERT(v->arch.cr3 == 0);
4111 return;
4114 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
4115 /* Need to resync all the shadow entries on a TLB flush. Resync
4116 * the current vcpu's OOS pages before switching to the new shadow
4117 * tables so that the VA hint is still valid. */
4118 shadow_resync_current_vcpu(v, do_locking);
4119 #endif
4121 if ( do_locking ) shadow_lock(v->domain);
4123 ASSERT(shadow_locked_by_me(v->domain));
4124 ASSERT(v->arch.paging.mode);
4126 ////
4127 //// vcpu->arch.guest_table is already set
4128 ////
4130 #ifndef NDEBUG
4131 /* Double-check that the HVM code has sent us a sane guest_table */
4132 if ( is_hvm_domain(d) )
4134 ASSERT(shadow_mode_external(d));
4135 if ( hvm_paging_enabled(v) )
4136 ASSERT(pagetable_get_pfn(v->arch.guest_table));
4137 else
4138 ASSERT(v->arch.guest_table.pfn
4139 == d->arch.paging.shadow.unpaged_pagetable.pfn);
4141 #endif
4143 SHADOW_PRINTK("d=%u v=%u guest_table=%05lx\n",
4144 d->domain_id, v->vcpu_id,
4145 (unsigned long)pagetable_get_pfn(v->arch.guest_table));
4147 #if GUEST_PAGING_LEVELS == 4
4148 if ( !(v->arch.flags & TF_kernel_mode) && !is_pv_32on64_vcpu(v) )
4149 gmfn = pagetable_get_mfn(v->arch.guest_table_user);
4150 else
4151 #endif
4152 gmfn = pagetable_get_mfn(v->arch.guest_table);
4155 ////
4156 //// vcpu->arch.paging.shadow.guest_vtable
4157 ////
4158 #if GUEST_PAGING_LEVELS == 4
4159 if ( shadow_mode_external(d) || shadow_mode_translate(d) )
4161 if ( v->arch.paging.shadow.guest_vtable )
4162 sh_unmap_domain_page_global(v->arch.paging.shadow.guest_vtable);
4163 v->arch.paging.shadow.guest_vtable = sh_map_domain_page_global(gmfn);
4164 /* PAGING_LEVELS==4 implies 64-bit, which means that
4165 * map_domain_page_global can't fail */
4166 BUG_ON(v->arch.paging.shadow.guest_vtable == NULL);
4168 else
4169 v->arch.paging.shadow.guest_vtable = __linear_l4_table;
4170 #elif GUEST_PAGING_LEVELS == 3
4171 /* On PAE guests we don't use a mapping of the guest's own top-level
4172 * table. We cache the current state of that table and shadow that,
4173 * until the next CR3 write makes us refresh our cache. */
4174 ASSERT(v->arch.paging.shadow.guest_vtable == NULL);
4176 if ( shadow_mode_external(d) )
4177 /* Find where in the page the l3 table is */
4178 guest_idx = guest_index((void *)v->arch.hvm_vcpu.guest_cr[3]);
4179 else
4180 /* PV guest: l3 is at the start of a page */
4181 guest_idx = 0;
4183 // Ignore the low 2 bits of guest_idx -- they are really just
4184 // cache control.
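// (Specifically, PAE CR3 bits 3 and 4 are PWT and PCD; they end up in
// the low two bits of guest_idx, which is why we mask them off below.)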
4185 guest_idx &= ~3;
4187 gl3e = ((guest_l3e_t *)sh_map_domain_page(gmfn)) + guest_idx;
4188 for ( i = 0; i < 4 ; i++ )
4189 v->arch.paging.shadow.gl3e[i] = gl3e[i];
4190 sh_unmap_domain_page(gl3e);
4191 #elif GUEST_PAGING_LEVELS == 2
4192 if ( shadow_mode_external(d) || shadow_mode_translate(d) )
4194 if ( v->arch.paging.shadow.guest_vtable )
4195 sh_unmap_domain_page_global(v->arch.paging.shadow.guest_vtable);
4196 v->arch.paging.shadow.guest_vtable = sh_map_domain_page_global(gmfn);
4197 /* Does this really need map_domain_page_global? Handle the
4198 * error properly if so. */
4199 BUG_ON(v->arch.paging.shadow.guest_vtable == NULL); /* XXX */
4201 else
4202 v->arch.paging.shadow.guest_vtable = __linear_l2_table;
4203 #else
4204 #error this should never happen
4205 #endif
4208 ////
4209 //// vcpu->arch.shadow_table[]
4210 ////
4212 /* We revoke write access to the new guest toplevel page(s) before we
4213 * replace the old shadow pagetable(s), so that we can safely use the
4214 * (old) shadow linear maps in the writeable mapping heuristics. */
4215 #if GUEST_PAGING_LEVELS == 2
4216 if ( sh_remove_write_access(v, gmfn, 2, 0) != 0 )
4217 flush_tlb_mask(v->domain->domain_dirty_cpumask);
4218 sh_set_toplevel_shadow(v, 0, gmfn, SH_type_l2_shadow);
4219 #elif GUEST_PAGING_LEVELS == 3
4220 /* PAE guests have four shadow_table entries, based on the
4221 * current values of the guest's four l3es. */
4223 int flush = 0;
4224 gfn_t gl2gfn;
4225 mfn_t gl2mfn;
4226 p2m_type_t p2mt;
4227 guest_l3e_t *gl3e = (guest_l3e_t*)&v->arch.paging.shadow.gl3e;
4228 /* First, make all four entries read-only. */
4229 for ( i = 0; i < 4; i++ )
4231 if ( guest_l3e_get_flags(gl3e[i]) & _PAGE_PRESENT )
4233 gl2gfn = guest_l3e_get_gfn(gl3e[i]);
4234 gl2mfn = gfn_to_mfn(d, gl2gfn, &p2mt);
4235 if ( p2m_is_ram(p2mt) )
4236 flush |= sh_remove_write_access(v, gl2mfn, 2, 0);
4239 if ( flush )
4240 flush_tlb_mask(v->domain->domain_dirty_cpumask);
4241 /* Now install the new shadows. */
4242 for ( i = 0; i < 4; i++ )
4244 if ( guest_l3e_get_flags(gl3e[i]) & _PAGE_PRESENT )
4246 gl2gfn = guest_l3e_get_gfn(gl3e[i]);
4247 gl2mfn = gfn_to_mfn(d, gl2gfn, &p2mt);
4248 if ( p2m_is_ram(p2mt) )
4249 sh_set_toplevel_shadow(v, i, gl2mfn, (i == 3)
4250 ? SH_type_l2h_shadow
4251 : SH_type_l2_shadow);
4252 else
4253 sh_set_toplevel_shadow(v, i, _mfn(INVALID_MFN), 0);
4255 else
4256 sh_set_toplevel_shadow(v, i, _mfn(INVALID_MFN), 0);
4259 #elif GUEST_PAGING_LEVELS == 4
4260 if ( sh_remove_write_access(v, gmfn, 4, 0) != 0 )
4261 flush_tlb_mask(v->domain->domain_dirty_cpumask);
4262 sh_set_toplevel_shadow(v, 0, gmfn, SH_type_l4_shadow);
4263 #else
4264 #error This should never happen
4265 #endif
4268 ///
4269 /// v->arch.paging.shadow.l3table
4270 ///
4271 #if SHADOW_PAGING_LEVELS == 3
4273 mfn_t smfn;
4274 int i;
4275 for ( i = 0; i < 4; i++ )
4277 #if GUEST_PAGING_LEVELS == 2
4278 /* 2-on-3: make a PAE l3 that points at the four-page l2 */
4279 smfn = _mfn(pagetable_get_pfn(v->arch.shadow_table[0]) + i);
4280 #else
4281 /* 3-on-3: make a PAE l3 that points at the four l2 pages */
4282 smfn = pagetable_get_mfn(v->arch.shadow_table[i]);
4283 #endif
4284 v->arch.paging.shadow.l3table[i] =
4285 (mfn_x(smfn) == 0)
4286 ? shadow_l3e_empty()
4287 : shadow_l3e_from_mfn(smfn, _PAGE_PRESENT);
4290 #endif /* SHADOW_PAGING_LEVELS == 3 */
4293 ///
4294 /// v->arch.cr3
4295 ///
4296 if ( shadow_mode_external(d) )
4298 make_cr3(v, pagetable_get_pfn(v->arch.monitor_table));
4300 else // not shadow_mode_external...
4302 /* We don't support PV except guest == shadow == config levels */
4303 BUG_ON(GUEST_PAGING_LEVELS != SHADOW_PAGING_LEVELS);
4304 #if SHADOW_PAGING_LEVELS == 3
4305 /* 2-on-3 or 3-on-3: Use the PAE shadow l3 table we just fabricated.
4306 * Don't use make_cr3 because (a) we know it's below 4GB, and
4307 * (b) it's not necessarily page-aligned, and make_cr3 takes a pfn */
4308 ASSERT(virt_to_maddr(&v->arch.paging.shadow.l3table) <= 0xffffffe0ULL);
4309 v->arch.cr3 = virt_to_maddr(&v->arch.paging.shadow.l3table);
4310 #else
4311 /* 4-on-4: Just use the shadow top-level directly */
4312 make_cr3(v, pagetable_get_pfn(v->arch.shadow_table[0]));
4313 #endif
4317 ///
4318 /// v->arch.hvm_vcpu.hw_cr[3]
4319 ///
4320 if ( shadow_mode_external(d) )
4322 ASSERT(is_hvm_domain(d));
4323 #if SHADOW_PAGING_LEVELS == 3
4324 /* 2-on-3 or 3-on-3: Use the PAE shadow l3 table we just fabricated */
4325 v->arch.hvm_vcpu.hw_cr[3] =
4326 virt_to_maddr(&v->arch.paging.shadow.l3table);
4327 #else
4328 /* 4-on-4: Just use the shadow top-level directly */
4329 v->arch.hvm_vcpu.hw_cr[3] =
4330 pagetable_get_paddr(v->arch.shadow_table[0]);
4331 #endif
4332 hvm_update_guest_cr(v, 3);
4335 /* Fix up the linear pagetable mappings */
4336 sh_update_linear_entries(v);
4338 #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
4339 /* No longer safe to use cached gva->gfn translations */
4340 vtlb_flush(v);
4341 #endif
4343 #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
4344 v->arch.paging.last_write_emul_ok = 0;
4345 #endif
4347 /* Release the lock, if we took it (otherwise it's the caller's problem) */
4348 if ( do_locking ) shadow_unlock(v->domain);
4350 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
4351 /* Need to resync all the shadow entries on a TLB flush. We only
4352 * update the shadows, leaving the pages out of sync. Also, we try
4353 * to skip synchronization of shadows not mapped in the new
4354 * tables. */
4355 shadow_sync_other_vcpus(v, do_locking);
4356 #endif
4361 /**************************************************************************/
4362 /* Functions to revoke guest rights */
4364 #if SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC
4365 int sh_rm_write_access_from_sl1p(struct vcpu *v, mfn_t gmfn,
4366 mfn_t smfn, unsigned long off)
4368 int r;
4369 shadow_l1e_t *sl1p, sl1e;
4370 struct shadow_page_info *sp;
4372 ASSERT(mfn_valid(gmfn));
4373 ASSERT(mfn_valid(smfn));
4375 sp = mfn_to_shadow_page(smfn);
4377 if ( sp->mbz != 0
4378 || (sp->type != SH_type_l1_shadow) )
4379 goto fail;
4381 sl1p = sh_map_domain_page(smfn);
4382 sl1p += off;
4383 sl1e = *sl1p;
4384 if ( ((shadow_l1e_get_flags(sl1e) & (_PAGE_PRESENT|_PAGE_RW))
4385 != (_PAGE_PRESENT|_PAGE_RW))
4386 || (mfn_x(shadow_l1e_get_mfn(sl1e)) != mfn_x(gmfn)) )
4388 sh_unmap_domain_page(sl1p);
4389 goto fail;
4392 /* Found it! Need to remove its write permissions. */
4393 sl1e = shadow_l1e_remove_flags(sl1e, _PAGE_RW);
4394 r = shadow_set_l1e(v, sl1p, sl1e, smfn);
4395 ASSERT( !(r & SHADOW_SET_ERROR) );
4397 sh_unmap_domain_page(sl1p);
4398 perfc_incr(shadow_writeable_h_7);
4399 return 1;
4401 fail:
4402 perfc_incr(shadow_writeable_h_8);
4403 return 0;
4405 #endif /* OOS */
4407 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
4408 static int sh_guess_wrmap(struct vcpu *v, unsigned long vaddr, mfn_t gmfn)
4409 /* Look up this vaddr in the current shadow and see if it's a writeable
4410 * mapping of this gmfn. If so, remove it. Returns 1 if it worked. */
4412 shadow_l1e_t sl1e, *sl1p;
4413 shadow_l2e_t *sl2p;
4414 shadow_l3e_t *sl3p;
4415 #if SHADOW_PAGING_LEVELS >= 4
4416 shadow_l4e_t *sl4p;
4417 #endif
4418 mfn_t sl1mfn;
4419 int r;
4421 /* Carefully look in the shadow linear map for the l1e we expect */
4422 #if SHADOW_PAGING_LEVELS >= 4
4423 sl4p = sh_linear_l4_table(v) + shadow_l4_linear_offset(vaddr);
4424 if ( !(shadow_l4e_get_flags(*sl4p) & _PAGE_PRESENT) )
4425 return 0;
4426 sl3p = sh_linear_l3_table(v) + shadow_l3_linear_offset(vaddr);
4427 if ( !(shadow_l3e_get_flags(*sl3p) & _PAGE_PRESENT) )
4428 return 0;
4429 #else /* SHADOW_PAGING_LEVELS == 3 */
4430 sl3p = ((shadow_l3e_t *) v->arch.paging.shadow.l3table)
4431 + shadow_l3_linear_offset(vaddr);
4432 if ( !(shadow_l3e_get_flags(*sl3p) & _PAGE_PRESENT) )
4433 return 0;
4434 #endif
4435 sl2p = sh_linear_l2_table(v) + shadow_l2_linear_offset(vaddr);
4436 if ( !(shadow_l2e_get_flags(*sl2p) & _PAGE_PRESENT) )
4437 return 0;
4438 sl1p = sh_linear_l1_table(v) + shadow_l1_linear_offset(vaddr);
4439 sl1e = *sl1p;
4440 if ( ((shadow_l1e_get_flags(sl1e) & (_PAGE_PRESENT|_PAGE_RW))
4441 != (_PAGE_PRESENT|_PAGE_RW))
4442 || (mfn_x(shadow_l1e_get_mfn(sl1e)) != mfn_x(gmfn)) )
4443 return 0;
4445 /* Found it! Need to remove its write permissions. */
4446 sl1mfn = shadow_l2e_get_mfn(*sl2p);
4447 sl1e = shadow_l1e_remove_flags(sl1e, _PAGE_RW);
4448 r = shadow_set_l1e(v, sl1p, sl1e, sl1mfn);
4449 ASSERT( !(r & SHADOW_SET_ERROR) );
4450 return 1;
4452 #endif
4454 int sh_rm_write_access_from_l1(struct vcpu *v, mfn_t sl1mfn,
4455 mfn_t readonly_mfn)
4456 /* Excises all writeable mappings to readonly_mfn from this l1 shadow table */
4458 shadow_l1e_t *sl1e;
4459 int done = 0;
4460 int flags;
4461 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
4462 mfn_t base_sl1mfn = sl1mfn; /* Because sl1mfn changes in the foreach */
4463 #endif
4465 SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, done,
4467 flags = shadow_l1e_get_flags(*sl1e);
4468 if ( (flags & _PAGE_PRESENT)
4469 && (flags & _PAGE_RW)
4470 && (mfn_x(shadow_l1e_get_mfn(*sl1e)) == mfn_x(readonly_mfn)) )
4472 shadow_l1e_t ro_sl1e = shadow_l1e_remove_flags(*sl1e, _PAGE_RW);
4473 (void) shadow_set_l1e(v, sl1e, ro_sl1e, sl1mfn);
4474 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
4475 /* Remember the last shadow that we shot a writeable mapping in */
4476 v->arch.paging.shadow.last_writeable_pte_smfn = mfn_x(base_sl1mfn);
4477 #endif
4478 if ( (mfn_to_page(readonly_mfn)->u.inuse.type_info
4479 & PGT_count_mask) == 0 )
4480 /* This breaks us cleanly out of the FOREACH macro */
4481 done = 1;
4483 });
4484 return done;
4488 int sh_rm_mappings_from_l1(struct vcpu *v, mfn_t sl1mfn, mfn_t target_mfn)
4489 /* Excises all mappings to guest frame from this shadow l1 table */
4491 shadow_l1e_t *sl1e;
4492 int done = 0;
4493 int flags;
4495 SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, done,
4497 flags = shadow_l1e_get_flags(*sl1e);
4498 if ( (flags & _PAGE_PRESENT)
4499 && (mfn_x(shadow_l1e_get_mfn(*sl1e)) == mfn_x(target_mfn)) )
4501 (void) shadow_set_l1e(v, sl1e, shadow_l1e_empty(), sl1mfn);
4502 if ( (mfn_to_page(target_mfn)->count_info & PGC_count_mask) == 0 )
4503 /* This breaks us cleanly out of the FOREACH macro */
4504 done = 1;
4506 });
4507 return done;
4510 /**************************************************************************/
4511 /* Functions to excise all pointers to shadows from higher-level shadows. */
4513 void sh_clear_shadow_entry(struct vcpu *v, void *ep, mfn_t smfn)
4514 /* Blank out a single shadow entry */
4516 switch ( mfn_to_shadow_page(smfn)->type )
4518 case SH_type_l1_shadow:
4519 (void) shadow_set_l1e(v, ep, shadow_l1e_empty(), smfn); break;
4520 case SH_type_l2_shadow:
4521 #if GUEST_PAGING_LEVELS >= 3
4522 case SH_type_l2h_shadow:
4523 #endif
4524 (void) shadow_set_l2e(v, ep, shadow_l2e_empty(), smfn); break;
4525 #if GUEST_PAGING_LEVELS >= 4
4526 case SH_type_l3_shadow:
4527 (void) shadow_set_l3e(v, ep, shadow_l3e_empty(), smfn); break;
4528 case SH_type_l4_shadow:
4529 (void) shadow_set_l4e(v, ep, shadow_l4e_empty(), smfn); break;
4530 #endif
4531 default: BUG(); /* Called with the wrong kind of shadow. */
4535 int sh_remove_l1_shadow(struct vcpu *v, mfn_t sl2mfn, mfn_t sl1mfn)
4536 /* Remove all mappings of this l1 shadow from this l2 shadow */
4538 shadow_l2e_t *sl2e;
4539 int done = 0;
4540 int flags;
4542 SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, done, v->domain,
4544 flags = shadow_l2e_get_flags(*sl2e);
4545 if ( (flags & _PAGE_PRESENT)
4546 && (mfn_x(shadow_l2e_get_mfn(*sl2e)) == mfn_x(sl1mfn)) )
4548 (void) shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn);
4549 if ( mfn_to_shadow_page(sl1mfn)->type == 0 )
4550 /* This breaks us cleanly out of the FOREACH macro */
4551 done = 1;
4553 });
4554 return done;
4557 #if GUEST_PAGING_LEVELS >= 4
4558 int sh_remove_l2_shadow(struct vcpu *v, mfn_t sl3mfn, mfn_t sl2mfn)
4559 /* Remove all mappings of this l2 shadow from this l3 shadow */
4561 shadow_l3e_t *sl3e;
4562 int done = 0;
4563 int flags;
4565 SHADOW_FOREACH_L3E(sl3mfn, sl3e, 0, done,
4567 flags = shadow_l3e_get_flags(*sl3e);
4568 if ( (flags & _PAGE_PRESENT)
4569 && (mfn_x(shadow_l3e_get_mfn(*sl3e)) == mfn_x(sl2mfn)) )
4571 (void) shadow_set_l3e(v, sl3e, shadow_l3e_empty(), sl3mfn);
4572 if ( mfn_to_shadow_page(sl2mfn)->type == 0 )
4573 /* This breaks us cleanly out of the FOREACH macro */
4574 done = 1;
4576 });
4577 return done;
4580 int sh_remove_l3_shadow(struct vcpu *v, mfn_t sl4mfn, mfn_t sl3mfn)
4581 /* Remove all mappings of this l3 shadow from this l4 shadow */
4583 shadow_l4e_t *sl4e;
4584 int done = 0;
4585 int flags;
4587 SHADOW_FOREACH_L4E(sl4mfn, sl4e, 0, done, v->domain,
4589 flags = shadow_l4e_get_flags(*sl4e);
4590 if ( (flags & _PAGE_PRESENT)
4591 && (mfn_x(shadow_l4e_get_mfn(*sl4e)) == mfn_x(sl3mfn)) )
4593 (void) shadow_set_l4e(v, sl4e, shadow_l4e_empty(), sl4mfn);
4594 if ( mfn_to_shadow_page(sl3mfn)->type == 0 )
4595 /* This breaks us cleanly out of the FOREACH macro */
4596 done = 1;
4598 });
4599 return done;
4601 #endif /* 64bit guest */
4603 /**************************************************************************/
4604 /* Handling HVM guest writes to pagetables */
4606 /* Translate a VA to an MFN, injecting a page-fault if we fail */
4607 #define BAD_GVA_TO_GFN (~0UL)
4608 #define BAD_GFN_TO_MFN (~1UL)
4609 #define READONLY_GFN (~2UL)
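/* These sentinels are returned through an mfn_t; they are safe to use
 * because mfn_valid() fails for all of them, so emulate_map_dest() below
 * can tell the three failure cases apart from a genuine translation. */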
4610 static mfn_t emulate_gva_to_mfn(struct vcpu *v,
4611 unsigned long vaddr,
4612 struct sh_emulate_ctxt *sh_ctxt)
4614 unsigned long gfn;
4615 mfn_t mfn;
4616 p2m_type_t p2mt;
4617 uint32_t pfec = PFEC_page_present | PFEC_write_access;
4619 /* Translate the VA to a GFN */
4620 gfn = sh_gva_to_gfn(v, vaddr, &pfec);
4621 if ( gfn == INVALID_GFN )
4623 if ( is_hvm_vcpu(v) )
4624 hvm_inject_exception(TRAP_page_fault, pfec, vaddr);
4625 else
4626 propagate_page_fault(vaddr, pfec);
4627 return _mfn(BAD_GVA_TO_GFN);
4630 /* Translate the GFN to an MFN */
4631 mfn = gfn_to_mfn(v->domain, _gfn(gfn), &p2mt);
4632 if ( p2mt == p2m_ram_ro )
4633 return _mfn(READONLY_GFN);
4634 if ( !p2m_is_ram(p2mt) )
4635 return _mfn(BAD_GFN_TO_MFN);
4637 ASSERT(mfn_valid(mfn));
4638 v->arch.paging.last_write_was_pt = !!sh_mfn_is_a_page_table(mfn);
4639 return mfn;
4642 /* Check that the user is allowed to perform this write.
4643 * Returns a mapped pointer to write to, or one of the MAPPING_* error values. */
4644 #define MAPPING_UNHANDLEABLE ((void *)(unsigned long)X86EMUL_UNHANDLEABLE)
4645 #define MAPPING_EXCEPTION ((void *)(unsigned long)X86EMUL_EXCEPTION)
4646 #define MAPPING_SILENT_FAIL ((void *)(unsigned long)X86EMUL_OKAY)
4647 #define emulate_map_dest_failed(rc) ((unsigned long)(rc) <= 3)
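/* The MAPPING_* values smuggle small X86EMUL_* return codes through a
 * void pointer: no real mapping can be an address <= 3, so callers only
 * need emulate_map_dest_failed() and, on failure, cast the "pointer"
 * back to a long to get the emulator return code (as the
 * sh_x86_emulate_*() functions below do). */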
4648 static void *emulate_map_dest(struct vcpu *v,
4649 unsigned long vaddr,
4650 u32 bytes,
4651 struct sh_emulate_ctxt *sh_ctxt)
4653 unsigned long offset;
4654 void *map = NULL;
4656 sh_ctxt->mfn1 = emulate_gva_to_mfn(v, vaddr, sh_ctxt);
4657 if ( !mfn_valid(sh_ctxt->mfn1) )
4658 return ((mfn_x(sh_ctxt->mfn1) == BAD_GVA_TO_GFN) ?
4659 MAPPING_EXCEPTION :
4660 (mfn_x(sh_ctxt->mfn1) == READONLY_GFN) ?
4661 MAPPING_SILENT_FAIL : MAPPING_UNHANDLEABLE);
4663 #ifndef NDEBUG
4664 /* We don't emulate user-mode writes to page tables */
4665 if ( hvm_get_seg_reg(x86_seg_ss, sh_ctxt)->attr.fields.dpl == 3 )
4667 gdprintk(XENLOG_DEBUG, "User-mode write to pagetable reached "
4668 "emulate_map_dest(). This should never happen!\n");
4669 return MAPPING_UNHANDLEABLE;
4671 #endif
4673 /* An unaligned write probably means this isn't a pagetable */
4674 if ( vaddr & (bytes - 1) )
4675 sh_remove_shadows(v, sh_ctxt->mfn1, 0, 0 /* Slow, can fail */ );
4677 if ( likely(((vaddr + bytes - 1) & PAGE_MASK) == (vaddr & PAGE_MASK)) )
4679 /* Whole write fits on a single page */
4680 sh_ctxt->mfn2 = _mfn(INVALID_MFN);
4681 map = sh_map_domain_page(sh_ctxt->mfn1) + (vaddr & ~PAGE_MASK);
4683 else
4685 /* Cross-page emulated writes are only supported for HVM guests;
4686 * PV guests ought to know better */
4687 if ( !is_hvm_vcpu(v) )
4688 return MAPPING_UNHANDLEABLE;
4690 /* This write crosses a page boundary. Translate the second page */
4691 sh_ctxt->mfn2 = emulate_gva_to_mfn(v, (vaddr + bytes - 1) & PAGE_MASK,
4692 sh_ctxt);
4693 if ( !mfn_valid(sh_ctxt->mfn2) )
4694 return ((mfn_x(sh_ctxt->mfn2) == BAD_GVA_TO_GFN) ?
4695 MAPPING_EXCEPTION :
4696 (mfn_x(sh_ctxt->mfn2) == READONLY_GFN) ?
4697 MAPPING_SILENT_FAIL : MAPPING_UNHANDLEABLE);
4699 /* A cross-page write probably means this isn't a pagetable */
4700 sh_remove_shadows(v, sh_ctxt->mfn2, 0, 0 /* Slow, can fail */ );
4702 /* Hack: we map the pages into the vcpu's LDT space, since we
4703 * know that we're not going to need the LDT for HVM guests,
4704 * and only HVM guests are allowed unaligned writes. */
4705 ASSERT(is_hvm_vcpu(v));
4706 map = (void *)LDT_VIRT_START(v);
4707 offset = l1_linear_offset((unsigned long) map);
4708 l1e_write(&__linear_l1_table[offset],
4709 l1e_from_pfn(mfn_x(sh_ctxt->mfn1), __PAGE_HYPERVISOR));
4710 l1e_write(&__linear_l1_table[offset + 1],
4711 l1e_from_pfn(mfn_x(sh_ctxt->mfn2), __PAGE_HYPERVISOR));
4712 flush_tlb_local();
4713 map += (vaddr & ~PAGE_MASK);
4716 #if (SHADOW_OPTIMIZATIONS & SHOPT_SKIP_VERIFY)
4717 /* Remember if the bottom bit was clear, so we can choose not to run
4718 * the change through the verify code if it's still clear afterwards */
4719 sh_ctxt->low_bit_was_clear = map != NULL && !(*(u8 *)map & _PAGE_PRESENT);
4720 #endif
4722 return map;
4725 /* Tidy up after the emulated write: mark pages dirty, verify the new
4726 * contents, and undo the mapping */
4727 static void emulate_unmap_dest(struct vcpu *v,
4728 void *addr,
4729 u32 bytes,
4730 struct sh_emulate_ctxt *sh_ctxt)
4732 u32 b1 = bytes, b2 = 0, shflags;
4734 ASSERT(mfn_valid(sh_ctxt->mfn1));
4736 /* If we are writing lots of PTE-aligned zeros, might want to unshadow */
4737 if ( likely(bytes >= 4)
4738 && (*(u32 *)addr == 0)
4739 && ((unsigned long) addr & ((sizeof (guest_intpte_t)) - 1)) == 0 )
4740 check_for_early_unshadow(v, sh_ctxt->mfn1);
4741 else
4742 reset_early_unshadow(v);
4744 /* We can avoid re-verifying the page contents after the write if:
4745 * - it was no larger than the PTE type of this pagetable;
4746 * - it was aligned to the PTE boundaries; and
4747 * - _PAGE_PRESENT was clear before and after the write. */
4748 shflags = mfn_to_page(sh_ctxt->mfn1)->shadow_flags;
4749 #if (SHADOW_OPTIMIZATIONS & SHOPT_SKIP_VERIFY)
4750 if ( sh_ctxt->low_bit_was_clear
4751 && !(*(u8 *)addr & _PAGE_PRESENT)
4752 && ((!(shflags & SHF_32)
4753 /* Not shadowed 32-bit: aligned 64-bit writes that leave
4754 * the present bit unset are safe to ignore. */
4755 && ((unsigned long)addr & 7) == 0
4756 && bytes <= 8)
4757 ||
4758 (!(shflags & (SHF_PAE|SHF_64))
4759 /* Not shadowed PAE/64-bit: aligned 32-bit writes that
4760 * leave the present bit unset are safe to ignore. */
4761 && ((unsigned long)addr & 3) == 0
4762 && bytes <= 4)) )
4764 /* Writes with this alignment constraint can't possibly cross pages */
4765 ASSERT(!mfn_valid(sh_ctxt->mfn2));
4767 else
4768 #endif /* SHADOW_OPTIMIZATIONS & SHOPT_SKIP_VERIFY */
4770 if ( unlikely(mfn_valid(sh_ctxt->mfn2)) )
4772 /* Validate as two writes, one to each page */
4773 b1 = PAGE_SIZE - (((unsigned long)addr) & ~PAGE_MASK);
4774 b2 = bytes - b1;
4775 ASSERT(b2 < bytes);
4777 if ( likely(b1 > 0) )
4778 sh_validate_guest_pt_write(v, sh_ctxt->mfn1, addr, b1);
4779 if ( unlikely(b2 > 0) )
4780 sh_validate_guest_pt_write(v, sh_ctxt->mfn2, addr + b1, b2);
4783 paging_mark_dirty(v->domain, mfn_x(sh_ctxt->mfn1));
4785 if ( unlikely(mfn_valid(sh_ctxt->mfn2)) )
4787 unsigned long offset;
4788 paging_mark_dirty(v->domain, mfn_x(sh_ctxt->mfn2));
4789 /* Undo the hacky two-frame contiguous map. */
4790 ASSERT(((unsigned long) addr & PAGE_MASK) == LDT_VIRT_START(v));
4791 offset = l1_linear_offset((unsigned long) addr);
4792 l1e_write(&__linear_l1_table[offset], l1e_empty());
4793 l1e_write(&__linear_l1_table[offset + 1], l1e_empty());
4794 flush_tlb_all();
4796 else
4797 sh_unmap_domain_page(addr);
4799 atomic_inc(&v->domain->arch.paging.shadow.gtable_dirty_version);
4802 static int
4803 sh_x86_emulate_write(struct vcpu *v, unsigned long vaddr, void *src,
4804 u32 bytes, struct sh_emulate_ctxt *sh_ctxt)
4806 void *addr;
4808 /* Unaligned writes are only acceptable on HVM */
4809 if ( (vaddr & (bytes - 1)) && !is_hvm_vcpu(v) )
4810 return X86EMUL_UNHANDLEABLE;
4812 addr = emulate_map_dest(v, vaddr, bytes, sh_ctxt);
4813 if ( emulate_map_dest_failed(addr) )
4814 return (long)addr;
4816 shadow_lock(v->domain);
4817 memcpy(addr, src, bytes);
4819 emulate_unmap_dest(v, addr, bytes, sh_ctxt);
4820 shadow_audit_tables(v);
4821 shadow_unlock(v->domain);
4822 return X86EMUL_OKAY;
4825 static int
4826 sh_x86_emulate_cmpxchg(struct vcpu *v, unsigned long vaddr,
4827 unsigned long old, unsigned long new,
4828 unsigned int bytes, struct sh_emulate_ctxt *sh_ctxt)
4830 void *addr;
4831 unsigned long prev;
4832 int rv = X86EMUL_OKAY;
4834 /* Unaligned writes are only acceptable on HVM */
4835 if ( (vaddr & (bytes - 1)) && !is_hvm_vcpu(v) )
4836 return X86EMUL_UNHANDLEABLE;
4838 addr = emulate_map_dest(v, vaddr, bytes, sh_ctxt);
4839 if ( emulate_map_dest_failed(addr) )
4840 return (long)addr;
4842 shadow_lock(v->domain);
4843 switch ( bytes )
4845 case 1: prev = cmpxchg(((u8 *)addr), old, new); break;
4846 case 2: prev = cmpxchg(((u16 *)addr), old, new); break;
4847 case 4: prev = cmpxchg(((u32 *)addr), old, new); break;
4848 case 8: prev = cmpxchg(((u64 *)addr), old, new); break;
4849 default:
4850 SHADOW_PRINTK("cmpxchg of size %i is not supported\n", bytes);
4851 prev = ~old;
4854 if ( prev != old )
4855 rv = X86EMUL_CMPXCHG_FAILED;
4857 SHADOW_DEBUG(EMULATE, "va %#lx was %#lx expected %#lx"
4858 " wanted %#lx now %#lx bytes %u\n",
4859 vaddr, prev, old, new, *(unsigned long *)addr, bytes);
4861 emulate_unmap_dest(v, addr, bytes, sh_ctxt);
4862 shadow_audit_tables(v);
4863 shadow_unlock(v->domain);
4864 return rv;
4867 #ifdef __i386__
4868 static int
4869 sh_x86_emulate_cmpxchg8b(struct vcpu *v, unsigned long vaddr,
4870 unsigned long old_lo, unsigned long old_hi,
4871 unsigned long new_lo, unsigned long new_hi,
4872 struct sh_emulate_ctxt *sh_ctxt)
4874 void *addr;
4875 u64 old, new, prev;
4876 int rv = X86EMUL_OKAY;
4878 /* Unaligned writes are only acceptable on HVM */
4879 if ( (vaddr & 7) && !is_hvm_vcpu(v) )
4880 return X86EMUL_UNHANDLEABLE;
4882 addr = emulate_map_dest(v, vaddr, 8, sh_ctxt);
4883 if ( emulate_map_dest_failed(addr) )
4884 return (long)addr;
4886 old = (((u64) old_hi) << 32) | (u64) old_lo;
4887 new = (((u64) new_hi) << 32) | (u64) new_lo;
4889 shadow_lock(v->domain);
4890 prev = cmpxchg(((u64 *)addr), old, new);
4892 if ( prev != old )
4893 rv = X86EMUL_CMPXCHG_FAILED;
4895 emulate_unmap_dest(v, addr, 8, sh_ctxt);
4896 shadow_audit_tables(v);
4897 shadow_unlock(v->domain);
4898 return rv;
4900 #endif
4902 /**************************************************************************/
4903 /* Audit tools */
4905 #if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES
4907 #define AUDIT_FAIL(_level, _fmt, _a...) do { \
4908 printk("Shadow %u-on-%u audit failed at level %i, index %i\n" \
4909 "gl" #_level "mfn = %" PRI_mfn \
4910 " sl" #_level "mfn = %" PRI_mfn \
4911 " &gl" #_level "e = %p &sl" #_level "e = %p" \
4912 " gl" #_level "e = %" SH_PRI_gpte \
4913 " sl" #_level "e = %" SH_PRI_pte "\nError: " _fmt "\n", \
4914 GUEST_PAGING_LEVELS, SHADOW_PAGING_LEVELS, \
4915 _level, guest_index(gl ## _level ## e), \
4916 mfn_x(gl ## _level ## mfn), mfn_x(sl ## _level ## mfn), \
4917 gl ## _level ## e, sl ## _level ## e, \
4918 gl ## _level ## e->l ## _level, sl ## _level ## e->l ## _level, \
4919 ##_a); \
4920 BUG(); \
4921 done = 1; \
4922 } while (0)
4924 #define AUDIT_FAIL_MIN(_level, _fmt, _a...) do { \
4925 printk("Shadow %u-on-%u audit failed at level %i\n" \
4926 "gl" #_level "mfn = %" PRI_mfn \
4927 " sl" #_level "mfn = %" PRI_mfn \
4928 " Error: " _fmt "\n", \
4929 GUEST_PAGING_LEVELS, SHADOW_PAGING_LEVELS, \
4930 _level, \
4931 mfn_x(gl ## _level ## mfn), mfn_x(sl ## _level ## mfn), \
4932 ##_a); \
4933 BUG(); \
4934 done = 1; \
4935 } while (0)
4937 static char * sh_audit_flags(struct vcpu *v, int level,
4938 int gflags, int sflags)
4939 /* Common code for auditing flag bits */
4941 if ( (sflags & _PAGE_PRESENT) && !(gflags & _PAGE_PRESENT) )
4942 return "shadow is present but guest is not present";
4943 if ( (sflags & _PAGE_GLOBAL) && !is_hvm_vcpu(v) )
4944 return "global bit set in PV shadow";
4945 if ( level == 2 && (sflags & _PAGE_PSE) )
4946 return "PS bit set in shadow";
4947 #if SHADOW_PAGING_LEVELS == 3
4948 if ( level == 3 ) return NULL; /* All the other bits are blank in PAEl3 */
4949 #endif
4950 if ( (sflags & _PAGE_PRESENT) && !(gflags & _PAGE_ACCESSED) )
4951 return "accessed bit not propagated";
4952 if ( (level == 1 || (level == 2 && (gflags & _PAGE_PSE)))
4953 && ((sflags & _PAGE_RW) && !(gflags & _PAGE_DIRTY)) )
4954 return "dirty bit not propagated";
4955 if ( (sflags & _PAGE_USER) != (gflags & _PAGE_USER) )
4956 return "user/supervisor bit does not match";
4957 if ( (sflags & _PAGE_NX_BIT) != (gflags & _PAGE_NX_BIT) )
4958 return "NX bit does not match";
4959 if ( (sflags & _PAGE_RW) && !(gflags & _PAGE_RW) )
4960 return "shadow grants write access but guest does not";
4961 return NULL;
4964 int sh_audit_l1_table(struct vcpu *v, mfn_t sl1mfn, mfn_t x)
4966 guest_l1e_t *gl1e, *gp;
4967 shadow_l1e_t *sl1e;
4968 mfn_t mfn, gmfn, gl1mfn;
4969 gfn_t gfn;
4970 p2m_type_t p2mt;
4971 char *s;
4972 int done = 0;
4974 /* Follow the backpointer */
4975 gl1mfn = _mfn(mfn_to_shadow_page(sl1mfn)->backpointer);
4977 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
4978 /* Out-of-sync l1 shadows can contain anything: just check the OOS hash */
4979 if ( page_is_out_of_sync(mfn_to_page(gl1mfn)) )
4981 oos_audit_hash_is_present(v->domain, gl1mfn);
4982 return 0;
4984 #endif
4986 gl1e = gp = sh_map_domain_page(gl1mfn);
4987 SHADOW_FOREACH_L1E(sl1mfn, sl1e, &gl1e, done, {
4989 if ( sh_l1e_is_magic(*sl1e) )
4991 #if (SHADOW_OPTIMIZATIONS & SHOPT_FAST_FAULT_PATH)
4992 if ( sh_l1e_is_gnp(*sl1e) )
4994 if ( guest_l1e_get_flags(*gl1e) & _PAGE_PRESENT )
4995 AUDIT_FAIL(1, "shadow is GNP magic but guest is present");
4997 else
4999 ASSERT(sh_l1e_is_mmio(*sl1e));
5000 gfn = sh_l1e_mmio_get_gfn(*sl1e);
5001 if ( gfn_x(gfn) != gfn_x(guest_l1e_get_gfn(*gl1e)) )
5002 AUDIT_FAIL(1, "shadow MMIO gfn is %" SH_PRI_gfn
5003 " but guest gfn is %" SH_PRI_gfn,
5004 gfn_x(gfn),
5005 gfn_x(guest_l1e_get_gfn(*gl1e)));
5007 #endif
5009 else
5011 s = sh_audit_flags(v, 1, guest_l1e_get_flags(*gl1e),
5012 shadow_l1e_get_flags(*sl1e));
5013 if ( s ) AUDIT_FAIL(1, "%s", s);
5015 if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS )
5017 gfn = guest_l1e_get_gfn(*gl1e);
5018 mfn = shadow_l1e_get_mfn(*sl1e);
5019 gmfn = gfn_to_mfn(v->domain, gfn, &p2mt);
5020 if ( mfn_x(gmfn) != mfn_x(mfn) )
5021 AUDIT_FAIL(1, "bad translation: gfn %" SH_PRI_gfn
5022 " --> %" PRI_mfn " != mfn %" PRI_mfn,
5023 gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn));
5026 });
5027 sh_unmap_domain_page(gp);
5028 return done;
5031 int sh_audit_fl1_table(struct vcpu *v, mfn_t sl1mfn, mfn_t x)
5033 guest_l1e_t *gl1e, e;
5034 shadow_l1e_t *sl1e;
5035 mfn_t gl1mfn = _mfn(INVALID_MFN);
5036 int f;
5037 int done = 0;
5039 /* fl1 has no useful backpointer: all we can check are flags */
5040 e = guest_l1e_from_gfn(_gfn(0), 0); gl1e = &e; /* Needed for macro */
5041 SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, done, {
5042 f = shadow_l1e_get_flags(*sl1e);
5043 f &= ~(_PAGE_AVAIL0|_PAGE_AVAIL1|_PAGE_AVAIL2);
5044 if ( !(f == 0
5045 || f == (_PAGE_PRESENT|_PAGE_USER|_PAGE_RW|
5046 _PAGE_ACCESSED|_PAGE_DIRTY)
5047 || f == (_PAGE_PRESENT|_PAGE_USER|_PAGE_ACCESSED|_PAGE_DIRTY)
5048 || sh_l1e_is_magic(*sl1e)) )
5049 AUDIT_FAIL(1, "fl1e has bad flags");
5050 });
5051 return 0;
5054 int sh_audit_l2_table(struct vcpu *v, mfn_t sl2mfn, mfn_t x)
5056 guest_l2e_t *gl2e, *gp;
5057 shadow_l2e_t *sl2e;
5058 mfn_t mfn, gmfn, gl2mfn;
5059 gfn_t gfn;
5060 p2m_type_t p2mt;
5061 char *s;
5062 int done = 0;
5064 /* Follow the backpointer */
5065 gl2mfn = _mfn(mfn_to_shadow_page(sl2mfn)->backpointer);
5067 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
5068 /* Only L1's may be out of sync. */
5069 if ( page_is_out_of_sync(mfn_to_page(gl2mfn)) )
5070 AUDIT_FAIL_MIN(2, "gmfn %lx is out of sync", mfn_x(gl2mfn));
5071 #endif
5073 gl2e = gp = sh_map_domain_page(gl2mfn);
5074 SHADOW_FOREACH_L2E(sl2mfn, sl2e, &gl2e, done, v->domain, {
5076 s = sh_audit_flags(v, 2, guest_l2e_get_flags(*gl2e),
5077 shadow_l2e_get_flags(*sl2e));
5078 if ( s ) AUDIT_FAIL(2, "%s", s);
5080 if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS )
5082 gfn = guest_l2e_get_gfn(*gl2e);
5083 mfn = shadow_l2e_get_mfn(*sl2e);
5084 gmfn = (guest_l2e_get_flags(*gl2e) & _PAGE_PSE)
5085 ? get_fl1_shadow_status(v, gfn)
5086 : get_shadow_status(v, gfn_to_mfn(v->domain, gfn, &p2mt),
5087 SH_type_l1_shadow);
5088 if ( mfn_x(gmfn) != mfn_x(mfn) )
5089 AUDIT_FAIL(2, "bad translation: gfn %" SH_PRI_gfn
5090 " (--> %" PRI_mfn ")"
5091 " --> %" PRI_mfn " != mfn %" PRI_mfn,
5092 gfn_x(gfn),
5093 (guest_l2e_get_flags(*gl2e) & _PAGE_PSE) ? 0
5094 : mfn_x(gfn_to_mfn(v->domain, gfn, &p2mt)),
5095 mfn_x(gmfn), mfn_x(mfn));
5097 });
5098 sh_unmap_domain_page(gp);
5099 return 0;
5102 #if GUEST_PAGING_LEVELS >= 4
5103 int sh_audit_l3_table(struct vcpu *v, mfn_t sl3mfn, mfn_t x)
5105 guest_l3e_t *gl3e, *gp;
5106 shadow_l3e_t *sl3e;
5107 mfn_t mfn, gmfn, gl3mfn;
5108 gfn_t gfn;
5109 p2m_type_t p2mt;
5110 char *s;
5111 int done = 0;
5113 /* Follow the backpointer */
5114 gl3mfn = _mfn(mfn_to_shadow_page(sl3mfn)->backpointer);
5116 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
5117 /* Only L1's may be out of sync. */
5118 if ( page_is_out_of_sync(mfn_to_page(gl3mfn)) )
5119 AUDIT_FAIL_MIN(3, "gmfn %lx is out of sync", mfn_x(gl3mfn));
5120 #endif
5122 gl3e = gp = sh_map_domain_page(gl3mfn);
5123 SHADOW_FOREACH_L3E(sl3mfn, sl3e, &gl3e, done, {
5125 s = sh_audit_flags(v, 3, guest_l3e_get_flags(*gl3e),
5126 shadow_l3e_get_flags(*sl3e));
5127 if ( s ) AUDIT_FAIL(3, "%s", s);
5129 if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS )
5131 gfn = guest_l3e_get_gfn(*gl3e);
5132 mfn = shadow_l3e_get_mfn(*sl3e);
5133 gmfn = get_shadow_status(v, gfn_to_mfn(v->domain, gfn, &p2mt),
5134 ((GUEST_PAGING_LEVELS == 3 ||
5135 is_pv_32on64_vcpu(v))
5136 && !shadow_mode_external(v->domain)
5137 && (guest_index(gl3e) % 4) == 3)
5138 ? SH_type_l2h_shadow
5139 : SH_type_l2_shadow);
5140 if ( mfn_x(gmfn) != mfn_x(mfn) )
5141 AUDIT_FAIL(3, "bad translation: gfn %" SH_PRI_gfn
5142 " --> %" PRI_mfn " != mfn %" PRI_mfn,
5143 gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn));
5145 });
5146 sh_unmap_domain_page(gp);
5147 return 0;
5150 int sh_audit_l4_table(struct vcpu *v, mfn_t sl4mfn, mfn_t x)
5152 guest_l4e_t *gl4e, *gp;
5153 shadow_l4e_t *sl4e;
5154 mfn_t mfn, gmfn, gl4mfn;
5155 gfn_t gfn;
5156 p2m_type_t p2mt;
5157 char *s;
5158 int done = 0;
5160 /* Follow the backpointer */
5161 gl4mfn = _mfn(mfn_to_shadow_page(sl4mfn)->backpointer);
5163 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
5164 /* Only L1's may be out of sync. */
5165 if ( page_is_out_of_sync(mfn_to_page(gl4mfn)) )
5166 AUDIT_FAIL_MIN(4, "gmfn %lx is out of sync", mfn_x(gl4mfn));
5167 #endif
5169 gl4e = gp = sh_map_domain_page(gl4mfn);
5170 SHADOW_FOREACH_L4E(sl4mfn, sl4e, &gl4e, done, v->domain,
5172 s = sh_audit_flags(v, 4, guest_l4e_get_flags(*gl4e),
5173 shadow_l4e_get_flags(*sl4e));
5174 if ( s ) AUDIT_FAIL(4, "%s", s);
5176 if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS )
5178 gfn = guest_l4e_get_gfn(*gl4e);
5179 mfn = shadow_l4e_get_mfn(*sl4e);
5180 gmfn = get_shadow_status(v, gfn_to_mfn(v->domain, gfn, &p2mt),
5181 SH_type_l3_shadow);
5182 if ( mfn_x(gmfn) != mfn_x(mfn) )
5183 AUDIT_FAIL(4, "bad translation: gfn %" SH_PRI_gfn
5184 " --> %" PRI_mfn " != mfn %" PRI_mfn,
5185 gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn));
5187 });
5188 sh_unmap_domain_page(gp);
5189 return 0;
5191 #endif /* GUEST_PAGING_LEVELS >= 4 */
5194 #undef AUDIT_FAIL
5196 #endif /* Audit code */
5198 /**************************************************************************/
5199 /* Entry points into this mode of the shadow code.
5200 * This will all be mangled by the preprocessor to uniquify everything. */
5201 struct paging_mode sh_paging_mode = {
5202 .page_fault = sh_page_fault,
5203 .invlpg = sh_invlpg,
5204 .gva_to_gfn = sh_gva_to_gfn,
5205 .update_cr3 = sh_update_cr3,
5206 .update_paging_modes = shadow_update_paging_modes,
5207 .write_p2m_entry = shadow_write_p2m_entry,
5208 .write_guest_entry = shadow_write_guest_entry,
5209 .cmpxchg_guest_entry = shadow_cmpxchg_guest_entry,
5210 .guest_map_l1e = sh_guest_map_l1e,
5211 .guest_get_eff_l1e = sh_guest_get_eff_l1e,
5212 .guest_levels = GUEST_PAGING_LEVELS,
5213 .shadow.detach_old_tables = sh_detach_old_tables,
5214 .shadow.x86_emulate_write = sh_x86_emulate_write,
5215 .shadow.x86_emulate_cmpxchg = sh_x86_emulate_cmpxchg,
5216 #ifdef __i386__
5217 .shadow.x86_emulate_cmpxchg8b = sh_x86_emulate_cmpxchg8b,
5218 #endif
5219 .shadow.make_monitor_table = sh_make_monitor_table,
5220 .shadow.destroy_monitor_table = sh_destroy_monitor_table,
5221 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
5222 .shadow.guess_wrmap = sh_guess_wrmap,
5223 #endif
5224 .shadow.shadow_levels = SHADOW_PAGING_LEVELS,
5225 };
5227 /*
5228 * Local variables:
5229 * mode: C
5230 * c-set-style: "BSD"
5231 * c-basic-offset: 4
5232 * indent-tabs-mode: nil
5233 * End:
5234 */