
view xen/arch/x86/mm/shadow/multi.c @ 18492:c353f07bae84

x86, shadow: Allow removing writable mappings from splintered page tables.

The relocation of the pagetable mapping in the Linux kernel exposed
the fact that, for Linux guests, sh_rm_write_access_from_sl1p() was
always failing.

Linux seems to use big pages to access its page tables, so instruct
the shadow code to remove writable mappings from splintered pagetables
as well, rather than relying on the OS heuristics (which were failing
on 2.6.27 before George's patch, leading to a brute-force search at
each resync).

Signed-off-by: Gianluca Guida <gianluca.guida@eu.citrix.com>
author Keir Fraser <keir.fraser@citrix.com>
date Mon Sep 15 11:34:42 2008 +0100 (2008-09-15)
parents fa2adc7fb996
children b87cc4de3ca6
line source
1 /******************************************************************************
2 * arch/x86/mm/shadow/multi.c
3 *
4 * Simple, mostly-synchronous shadow page tables.
5 * Parts of this code are Copyright (c) 2006 by XenSource Inc.
6 * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
7 * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 */
24 #include <xen/config.h>
25 #include <xen/types.h>
26 #include <xen/mm.h>
27 #include <xen/trace.h>
28 #include <xen/sched.h>
29 #include <xen/perfc.h>
30 #include <xen/domain_page.h>
31 #include <asm/page.h>
32 #include <asm/current.h>
33 #include <asm/shadow.h>
34 #include <asm/flushtlb.h>
35 #include <asm/hvm/hvm.h>
36 #include <asm/hvm/cacheattr.h>
37 #include <asm/mtrr.h>
38 #include "private.h"
39 #include "types.h"
41 /* THINGS TO DO LATER:
42 *
43 * TEARDOWN HEURISTICS
44 * Also: have a heuristic for when to destroy a previous paging-mode's
45 * shadows. When a guest is done with its start-of-day 32-bit tables
46 * and reuses the memory we want to drop those shadows. Start with
47 * shadows in a page in two modes as a hint, but beware of clever tricks
48 * like reusing a pagetable for both PAE and 64-bit during boot...
49 *
50 * PAE LINEAR MAPS
51 * Rework shadow_get_l*e() to have the option of using map_domain_page()
52 * instead of linear maps. Add appropriate unmap_l*e calls in the users.
53 * Then we can test the speed difference made by linear maps. If the
54 * map_domain_page() version is OK on PAE, we could maybe allow a lightweight
55 * l3-and-l2h-only shadow mode for PAE PV guests that would allow them
56 * to share l2h pages again.
57 *
58 * PSE disabled / PSE36
59 * We don't support any modes other than PSE enabled, PSE36 disabled.
60 * Neither of those would be hard to change, but we'd need to be able to
61 * deal with shadows made in one mode and used in another.
62 */
64 #define FETCH_TYPE_PREFETCH 1
65 #define FETCH_TYPE_DEMAND 2
66 #define FETCH_TYPE_WRITE 4
67 typedef enum {
68 ft_prefetch = FETCH_TYPE_PREFETCH,
69 ft_demand_read = FETCH_TYPE_DEMAND,
70 ft_demand_write = FETCH_TYPE_DEMAND | FETCH_TYPE_WRITE,
71 } fetch_type_t;
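/* Note: ft_demand_write has both the DEMAND and WRITE bits set, so the
 * code below can test (ft & FETCH_TYPE_WRITE) to tell write faults
 * apart from read faults and prefetches. */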
73 #ifdef DEBUG_TRACE_DUMP
74 static char *fetch_type_names[] = {
75 [ft_prefetch] "prefetch",
76 [ft_demand_read] "demand read",
77 [ft_demand_write] "demand write",
78 };
79 #endif
81 /**************************************************************************/
82 /* Hash table mapping from guest pagetables to shadows
83 *
84 * Normal case: maps the mfn of a guest page to the mfn of its shadow page.
85 * FL1's: maps the *gfn* of the start of a superpage to the mfn of a
86 * shadow L1 which maps its "splinters".
87 */
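/* (An FL1 shadow covers a guest superpage: the guest has no l1 for it,
 * so the shadow l1 holding the 4k "splinter" entries is keyed by the
 * gfn at which the superpage starts rather than by a guest l1 mfn.) */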
89 static inline mfn_t
90 get_fl1_shadow_status(struct vcpu *v, gfn_t gfn)
91 /* Look for FL1 shadows in the hash table */
92 {
93 mfn_t smfn = shadow_hash_lookup(v, gfn_x(gfn), SH_type_fl1_shadow);
94 return smfn;
95 }
97 static inline mfn_t
98 get_shadow_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type)
99 /* Look for shadows in the hash table */
100 {
101 mfn_t smfn = shadow_hash_lookup(v, mfn_x(gmfn), shadow_type);
102 perfc_incr(shadow_get_shadow_status);
103 return smfn;
104 }
106 static inline void
107 set_fl1_shadow_status(struct vcpu *v, gfn_t gfn, mfn_t smfn)
108 /* Put an FL1 shadow into the hash table */
109 {
110 SHADOW_PRINTK("gfn=%"SH_PRI_gfn", type=%08x, smfn=%05lx\n",
111 gfn_x(gfn), SH_type_fl1_shadow, mfn_x(smfn));
113 shadow_hash_insert(v, gfn_x(gfn), SH_type_fl1_shadow, smfn);
114 }
116 static inline void
117 set_shadow_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type, mfn_t smfn)
118 /* Put a shadow into the hash table */
119 {
120 struct domain *d = v->domain;
121 int res;
123 SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx, type=%08x, smfn=%05lx\n",
124 d->domain_id, v->vcpu_id, mfn_x(gmfn),
125 shadow_type, mfn_x(smfn));
127 /* 32-on-64 PV guests don't own their l4 pages so can't get_page them */
128 if ( !is_pv_32on64_vcpu(v) || shadow_type != SH_type_l4_64_shadow )
129 {
130 res = get_page(mfn_to_page(gmfn), d);
131 ASSERT(res == 1);
132 }
134 shadow_hash_insert(v, mfn_x(gmfn), shadow_type, smfn);
135 }
137 static inline void
138 delete_fl1_shadow_status(struct vcpu *v, gfn_t gfn, mfn_t smfn)
139 /* Remove a shadow from the hash table */
140 {
141 SHADOW_PRINTK("gfn=%"SH_PRI_gfn", type=%08x, smfn=%05lx\n",
142 gfn_x(gfn), SH_type_fl1_shadow, mfn_x(smfn));
143 shadow_hash_delete(v, gfn_x(gfn), SH_type_fl1_shadow, smfn);
144 }
146 static inline void
147 delete_shadow_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type, mfn_t smfn)
148 /* Remove a shadow from the hash table */
149 {
150 SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx, type=%08x, smfn=%05lx\n",
151 v->domain->domain_id, v->vcpu_id,
152 mfn_x(gmfn), shadow_type, mfn_x(smfn));
153 shadow_hash_delete(v, mfn_x(gmfn), shadow_type, smfn);
154 /* 32-on-64 PV guests don't own their l4 pages; see set_shadow_status */
155 if ( !is_pv_32on64_vcpu(v) || shadow_type != SH_type_l4_64_shadow )
156 put_page(mfn_to_page(gmfn));
157 }
159 /**************************************************************************/
160 /* CPU feature support querying */
162 static inline int
163 guest_supports_superpages(struct vcpu *v)
164 {
165 /* The _PAGE_PSE bit must be honoured in HVM guests, whenever
166 * CR4.PSE is set or the guest is in PAE or long mode.
167 * It's also used in the dummy PT for vcpus with CR0.PG cleared. */
168 return (is_hvm_vcpu(v) &&
169 (GUEST_PAGING_LEVELS != 2
170 || !hvm_paging_enabled(v)
171 || (v->arch.hvm_vcpu.guest_cr[4] & X86_CR4_PSE)));
172 }
174 static inline int
175 guest_supports_nx(struct vcpu *v)
176 {
177 if ( GUEST_PAGING_LEVELS == 2 || !cpu_has_nx )
178 return 0;
179 if ( !is_hvm_vcpu(v) )
180 return cpu_has_nx;
181 return hvm_nx_enabled(v);
182 }
185 /**************************************************************************/
186 /* Functions for walking the guest page tables */
188 /* Flags that are needed in a pagetable entry, with the sense of NX inverted */
189 static uint32_t mandatory_flags(struct vcpu *v, uint32_t pfec)
190 {
191 static uint32_t flags[] = {
192 /* I/F - Usr Wr */
193 /* 0 0 0 0 */ _PAGE_PRESENT,
194 /* 0 0 0 1 */ _PAGE_PRESENT|_PAGE_RW,
195 /* 0 0 1 0 */ _PAGE_PRESENT|_PAGE_USER,
196 /* 0 0 1 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER,
197 /* 0 1 0 0 */ _PAGE_PRESENT,
198 /* 0 1 0 1 */ _PAGE_PRESENT|_PAGE_RW,
199 /* 0 1 1 0 */ _PAGE_PRESENT|_PAGE_USER,
200 /* 0 1 1 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER,
201 /* 1 0 0 0 */ _PAGE_PRESENT|_PAGE_NX_BIT,
202 /* 1 0 0 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_NX_BIT,
203 /* 1 0 1 0 */ _PAGE_PRESENT|_PAGE_USER|_PAGE_NX_BIT,
204 /* 1 0 1 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX_BIT,
205 /* 1 1 0 0 */ _PAGE_PRESENT|_PAGE_NX_BIT,
206 /* 1 1 0 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_NX_BIT,
207 /* 1 1 1 0 */ _PAGE_PRESENT|_PAGE_USER|_PAGE_NX_BIT,
208 /* 1 1 1 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX_BIT,
209 };
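/* The page-fault error code supplies Present (bit 0), Write (bit 1),
 * User (bit 2), Reserved-bit (bit 3) and Insn-fetch (bit 4). Masking
 * with 0x1f and shifting right by one below drops the Present bit,
 * leaving a 4-bit index (I/F, Rsvd, Usr, Wr) into the table above. */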
211 /* Don't demand not-NX if the CPU wouldn't enforce it. */
212 if ( !guest_supports_nx(v) )
213 pfec &= ~PFEC_insn_fetch;
215 /* Don't demand R/W if the CPU wouldn't enforce it. */
216 if ( is_hvm_vcpu(v) && unlikely(!hvm_wp_enabled(v))
217 && !(pfec & PFEC_user_mode) )
218 pfec &= ~PFEC_write_access;
220 return flags[(pfec & 0x1f) >> 1];
221 }
223 /* Modify a guest pagetable entry to set the Accessed and Dirty bits.
224 * Returns non-zero if it actually writes to guest memory. */
225 static uint32_t set_ad_bits(void *guest_p, void *walk_p, int set_dirty)
226 {
227 guest_intpte_t old, new;
228 int ret = 0;
230 old = *(guest_intpte_t *)walk_p;
231 new = old | _PAGE_ACCESSED | (set_dirty ? _PAGE_DIRTY : 0);
232 if ( old != new )
233 {
234 /* Write the new entry into the walk, and try to write it back
235 * into the guest table as well. If the guest table has changed
236 * under our feet then leave it alone. */
237 *(guest_intpte_t *)walk_p = new;
238 if( cmpxchg(((guest_intpte_t *)guest_p), old, new) == old )
239 ret = 1;
241 /* FIXME -- this code is longer than necessary */
242 if(set_dirty)
243 TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_SET_AD);
244 else
245 TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_SET_A);
246 }
247 return ret;
248 }
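/* set_ad_bits() is handed a pointer into the mapped guest table and a
 * pointer into the walk_t's copy of the same entry; see the calls at
 * the end of guest_walk_tables() below, e.g.
 *     set_ad_bits(l1p + guest_l1_table_offset(va), &gw->l1e,
 *                 (pfec & PFEC_write_access));
 */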
250 /* This validation is called with the shadow lock held, and after write
251 * permission has been removed. The check is therefore atomic: no further
252 * inconsistent content can be observed before the lock is released
253 *
254 * Return 1 to indicate success and 0 for inconsistency
255 */
256 static inline uint32_t
257 shadow_check_gwalk(struct vcpu *v, unsigned long va, walk_t *gw)
258 {
259 struct domain *d = v->domain;
260 guest_l1e_t *l1p;
261 guest_l2e_t *l2p;
262 #if GUEST_PAGING_LEVELS >= 4
263 guest_l3e_t *l3p;
264 guest_l4e_t *l4p;
265 #endif
266 int mismatch = 0;
268 ASSERT(shadow_locked_by_me(d));
270 if ( gw->version ==
271 atomic_read(&d->arch.paging.shadow.gtable_dirty_version) )
272 return 1;
274 /* We could cache the guest page mappings from the last
275 * guest table walk. However, since this check happens
276 * relatively infrequently, the small cost of remapping the
277 * guest pages here is preferable to caching the mappings
278 * on every guest table walk.
279 *
280 * Also, when an inconsistency is found, simply return to
281 * trigger another fault instead of re-validating the new
282 * path, to keep the logic simple.
283 */
284 perfc_incr(shadow_check_gwalk);
285 #if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
286 #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
287 l4p = (guest_l4e_t *)v->arch.paging.shadow.guest_vtable;
288 mismatch |= (gw->l4e.l4 != l4p[guest_l4_table_offset(va)].l4);
289 l3p = sh_map_domain_page(gw->l3mfn);
290 mismatch |= (gw->l3e.l3 != l3p[guest_l3_table_offset(va)].l3);
291 sh_unmap_domain_page(l3p);
292 #else
293 mismatch |= (gw->l3e.l3 !=
294 v->arch.paging.shadow.gl3e[guest_l3_table_offset(va)].l3);
295 #endif
296 l2p = sh_map_domain_page(gw->l2mfn);
297 mismatch |= (gw->l2e.l2 != l2p[guest_l2_table_offset(va)].l2);
298 sh_unmap_domain_page(l2p);
299 #else
300 l2p = (guest_l2e_t *)v->arch.paging.shadow.guest_vtable;
301 mismatch |= (gw->l2e.l2 != l2p[guest_l2_table_offset(va)].l2);
302 #endif
303 if ( !(guest_supports_superpages(v) &&
304 (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE)) )
305 {
306 l1p = sh_map_domain_page(gw->l1mfn);
307 mismatch |= (gw->l1e.l1 != l1p[guest_l1_table_offset(va)].l1);
308 sh_unmap_domain_page(l1p);
309 }
311 return !mismatch;
312 }
314 /* Remove write access permissions from a gwalk_t in a batch, and
315 * return OR-ed result for TLB flush hint and need to rewalk the guest
316 * pages.
317 *
318 * Syncing pages will remove write access to that page; but it may
319 * also give write access to other pages in the path. If we resync any
320 * pages, re-walk from the beginning.
321 */
322 #define GW_RMWR_FLUSHTLB 1
323 #define GW_RMWR_REWALK 2
325 static inline uint32_t
326 gw_remove_write_accesses(struct vcpu *v, unsigned long va, walk_t *gw)
327 {
328 uint32_t rc = 0;
330 #if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
331 #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
332 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
333 if ( mfn_is_out_of_sync(gw->l3mfn) )
334 {
335 sh_resync(v, gw->l3mfn);
336 rc = GW_RMWR_REWALK;
337 }
338 else
339 #endif /* OOS */
340 if ( sh_remove_write_access(v, gw->l3mfn, 3, va) )
341 rc = GW_RMWR_FLUSHTLB;
342 #endif /* GUEST_PAGING_LEVELS >= 4 */
344 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
345 if ( mfn_is_out_of_sync(gw->l2mfn) )
346 {
347 sh_resync(v, gw->l2mfn);
348 rc |= GW_RMWR_REWALK;
349 }
350 else
351 #endif /* OOS */
352 if ( sh_remove_write_access(v, gw->l2mfn, 2, va) )
353 rc |= GW_RMWR_FLUSHTLB;
354 #endif /* GUEST_PAGING_LEVELS >= 3 */
356 if ( !(guest_supports_superpages(v) &&
357 (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE))
358 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
359 && !mfn_is_out_of_sync(gw->l1mfn)
360 #endif /* OOS */
361 && sh_remove_write_access(v, gw->l1mfn, 1, va) )
362 rc |= GW_RMWR_FLUSHTLB;
364 return rc;
365 }
367 /* Walk the guest pagetables, after the manner of a hardware walker.
368 *
369 * Inputs: a vcpu, a virtual address, a walk_t to fill, a
370 * pointer to a pagefault code
371 *
372 * We walk the vcpu's guest pagetables, filling the walk_t with what we
373 * see and adding any Accessed and Dirty bits that are needed in the
374 * guest entries. Using the pagefault code, we check the permissions as
375 * we go. For the purposes of reading pagetables we treat all non-RAM
376 * memory as containing zeroes.
377 *
378 * The walk is done in a lock-free style, with some sanity checks
379 * postponed until after the shadow lock is taken. Those delayed checks
380 * ensure that no inconsistent mapping is translated into the shadow tables.
381 *
382 * Returns 0 for success, or the set of permission bits that we failed on
383 * if the walk did not complete.
384 * N.B. This is different from the old return code but almost no callers
385 * checked the old return code anyway.
386 */
387 static uint32_t
388 guest_walk_tables(struct vcpu *v, unsigned long va, walk_t *gw, uint32_t pfec)
389 {
390 struct domain *d = v->domain;
391 p2m_type_t p2mt;
392 guest_l1e_t *l1p = NULL;
393 guest_l2e_t *l2p = NULL;
394 #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
395 guest_l3e_t *l3p = NULL;
396 guest_l4e_t *l4p;
397 #endif
398 uint32_t gflags, mflags, rc = 0;
399 int pse;
401 perfc_incr(shadow_guest_walk);
402 memset(gw, 0, sizeof(*gw));
403 gw->va = va;
405 gw->version = atomic_read(&d->arch.paging.shadow.gtable_dirty_version);
406 rmb();
408 /* Mandatory bits that must be set in every entry. We invert NX, to
409 * calculate as if there were an "X" bit that allowed access.
410 * We will accumulate, in rc, the set of flags that are missing. */
411 mflags = mandatory_flags(v, pfec);
413 #if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
414 #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
416 /* Get the l4e from the top level table and check its flags*/
417 gw->l4mfn = pagetable_get_mfn(v->arch.guest_table);
418 l4p = ((guest_l4e_t *)v->arch.paging.shadow.guest_vtable);
419 gw->l4e = l4p[guest_l4_table_offset(va)];
420 gflags = guest_l4e_get_flags(gw->l4e) ^ _PAGE_NX_BIT;
421 rc |= ((gflags & mflags) ^ mflags);
422 if ( rc & _PAGE_PRESENT ) goto out;
424 /* Map the l3 table */
425 gw->l3mfn = gfn_to_mfn(d, guest_l4e_get_gfn(gw->l4e), &p2mt);
426 if ( !p2m_is_ram(p2mt) )
427 {
428 rc |= _PAGE_PRESENT;
429 goto out;
430 }
431 ASSERT(mfn_valid(gw->l3mfn));
433 /* Get the l3e and check its flags*/
434 l3p = sh_map_domain_page(gw->l3mfn);
435 gw->l3e = l3p[guest_l3_table_offset(va)];
436 gflags = guest_l3e_get_flags(gw->l3e) ^ _PAGE_NX_BIT;
437 rc |= ((gflags & mflags) ^ mflags);
438 if ( rc & _PAGE_PRESENT )
439 goto out;
441 #else /* PAE only... */
443 /* Get l3e from the cache of the top level table and check its flag */
444 gw->l3e = v->arch.paging.shadow.gl3e[guest_l3_table_offset(va)];
445 if ( !(guest_l3e_get_flags(gw->l3e) & _PAGE_PRESENT) )
446 {
447 rc |= _PAGE_PRESENT;
448 goto out;
449 }
451 #endif /* PAE or 64... */
453 /* Map the l2 table */
454 gw->l2mfn = gfn_to_mfn(d, guest_l3e_get_gfn(gw->l3e), &p2mt);
455 if ( !p2m_is_ram(p2mt) )
456 {
457 rc |= _PAGE_PRESENT;
458 goto out;
459 }
460 ASSERT(mfn_valid(gw->l2mfn));
462 /* Get the l2e */
463 l2p = sh_map_domain_page(gw->l2mfn);
464 gw->l2e = l2p[guest_l2_table_offset(va)];
466 #else /* 32-bit only... */
468 /* Get l2e from the top level table */
469 gw->l2mfn = pagetable_get_mfn(v->arch.guest_table);
470 l2p = ((guest_l2e_t *)v->arch.paging.shadow.guest_vtable);
471 gw->l2e = l2p[guest_l2_table_offset(va)];
473 #endif /* All levels... */
475 gflags = guest_l2e_get_flags(gw->l2e) ^ _PAGE_NX_BIT;
476 rc |= ((gflags & mflags) ^ mflags);
477 if ( rc & _PAGE_PRESENT )
478 goto out;
480 pse = (guest_supports_superpages(v) &&
481 (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE));
483 if ( pse )
484 {
485 /* Special case: this guest VA is in a PSE superpage, so there's
486 * no guest l1e. We make one up so that the propagation code
487 * can generate a shadow l1 table. Start with the gfn of the
488 * first 4k-page of the superpage. */
489 gfn_t start = guest_l2e_get_gfn(gw->l2e);
490 /* Grant full access in the l1e, since all the guest entry's
491 * access controls are enforced in the shadow l2e. */
492 int flags = (_PAGE_PRESENT|_PAGE_USER|_PAGE_RW|
493 _PAGE_ACCESSED|_PAGE_DIRTY);
494 /* PSE level 2 entries use bit 12 for PAT; propagate it to bit 7
495 * of the level 1. */
496 if ( (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE_PAT) )
497 flags |= _PAGE_PAT;
498 /* Copy the cache-control bits to the l1 as well, because we
499 * can't represent PAT in the (non-PSE) shadow l2e. :(
500 * This could cause problems if a guest ever maps an area of
501 * memory with superpages using more than one caching mode. */
502 flags |= guest_l2e_get_flags(gw->l2e) & (_PAGE_PWT|_PAGE_PCD);
503 /* Increment the pfn by the right number of 4k pages.
504 * The ~0x1 is to mask out the PAT bit mentioned above. */
505 start = _gfn((gfn_x(start) & ~0x1) + guest_l1_table_offset(va));
506 gw->l1e = guest_l1e_from_gfn(start, flags);
507 gw->l1mfn = _mfn(INVALID_MFN);
508 }
509 else
510 {
511 /* Not a superpage: carry on and find the l1e. */
512 gw->l1mfn = gfn_to_mfn(d, guest_l2e_get_gfn(gw->l2e), &p2mt);
513 if ( !p2m_is_ram(p2mt) )
514 {
515 rc |= _PAGE_PRESENT;
516 goto out;
517 }
518 ASSERT(mfn_valid(gw->l1mfn));
519 l1p = sh_map_domain_page(gw->l1mfn);
520 gw->l1e = l1p[guest_l1_table_offset(va)];
521 gflags = guest_l1e_get_flags(gw->l1e) ^ _PAGE_NX_BIT;
522 rc |= ((gflags & mflags) ^ mflags);
523 }
525 /* Go back and set accessed and dirty bits only if the walk was a
526 * success. Although the PRMs say higher-level _PAGE_ACCESSED bits
527 * get set whenever a lower-level PT is used, at least some hardware
528 * walkers behave this way. */
529 if ( rc == 0 )
530 {
531 #if GUEST_PAGING_LEVELS == 4 /* 64-bit only... */
532 if ( set_ad_bits(l4p + guest_l4_table_offset(va), &gw->l4e, 0) )
533 paging_mark_dirty(d, mfn_x(gw->l4mfn));
534 if ( set_ad_bits(l3p + guest_l3_table_offset(va), &gw->l3e, 0) )
535 paging_mark_dirty(d, mfn_x(gw->l3mfn));
536 #endif
537 if ( set_ad_bits(l2p + guest_l2_table_offset(va), &gw->l2e,
538 (pse && (pfec & PFEC_write_access))) )
539 paging_mark_dirty(d, mfn_x(gw->l2mfn));
540 if ( !pse )
541 {
542 if ( set_ad_bits(l1p + guest_l1_table_offset(va), &gw->l1e,
543 (pfec & PFEC_write_access)) )
544 paging_mark_dirty(d, mfn_x(gw->l1mfn));
545 }
546 }
548 out:
549 #if GUEST_PAGING_LEVELS == 4
550 if ( l3p ) sh_unmap_domain_page(l3p);
551 #endif
552 #if GUEST_PAGING_LEVELS >= 3
553 if ( l2p ) sh_unmap_domain_page(l2p);
554 #endif
555 if ( l1p ) sh_unmap_domain_page(l1p);
557 return rc;
558 }
560 /* Given a walk_t, translate the gw->va into the guest's notion of the
561 * corresponding frame number. */
562 static inline gfn_t
563 guest_walk_to_gfn(walk_t *gw)
564 {
565 if ( !(guest_l1e_get_flags(gw->l1e) & _PAGE_PRESENT) )
566 return _gfn(INVALID_GFN);
567 return guest_l1e_get_gfn(gw->l1e);
568 }
570 /* Given a walk_t, translate the gw->va into the guest's notion of the
571 * corresponding physical address. */
572 static inline paddr_t
573 guest_walk_to_gpa(walk_t *gw)
574 {
575 if ( !(guest_l1e_get_flags(gw->l1e) & _PAGE_PRESENT) )
576 return 0;
577 return guest_l1e_get_paddr(gw->l1e) + (gw->va & ~PAGE_MASK);
578 }
580 #if 0 /* Keep for debugging */
581 /* Pretty-print the contents of a guest-walk */
582 static inline void print_gw(walk_t *gw)
583 {
584 SHADOW_PRINTK("GUEST WALK TO %#lx:\n", gw->va);
585 #if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
586 #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
587 SHADOW_PRINTK(" l4mfn=%" PRI_mfn "\n", mfn_x(gw->l4mfn));
588 SHADOW_PRINTK(" l4e=%" SH_PRI_gpte "\n", gw->l4e.l4);
589 SHADOW_PRINTK(" l3mfn=%" PRI_mfn "\n", mfn_x(gw->l3mfn));
590 #endif /* PAE or 64... */
591 SHADOW_PRINTK(" l3e=%" SH_PRI_gpte "\n", gw->l3e.l3);
592 #endif /* All levels... */
593 SHADOW_PRINTK(" l2mfn=%" PRI_mfn "\n", mfn_x(gw->l2mfn));
594 SHADOW_PRINTK(" l2e=%" SH_PRI_gpte "\n", gw->l2e.l2);
595 SHADOW_PRINTK(" l1mfn=%" PRI_mfn "\n", mfn_x(gw->l1mfn));
596 SHADOW_PRINTK(" l1e=%" SH_PRI_gpte "\n", gw->l1e.l1);
597 }
598 #endif /* 0 */
600 #if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES
601 /* Lightweight audit: pass all the shadows associated with this guest walk
602 * through the audit mechanisms */
603 static void sh_audit_gw(struct vcpu *v, walk_t *gw)
604 {
605 mfn_t smfn;
607 if ( !(SHADOW_AUDIT_ENABLE) )
608 return;
610 #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
611 if ( mfn_valid(gw->l4mfn)
612 && mfn_valid((smfn = get_shadow_status(v, gw->l4mfn,
613 SH_type_l4_shadow))) )
614 (void) sh_audit_l4_table(v, smfn, _mfn(INVALID_MFN));
615 if ( mfn_valid(gw->l3mfn)
616 && mfn_valid((smfn = get_shadow_status(v, gw->l3mfn,
617 SH_type_l3_shadow))) )
618 (void) sh_audit_l3_table(v, smfn, _mfn(INVALID_MFN));
619 #endif /* PAE or 64... */
620 if ( mfn_valid(gw->l2mfn) )
621 {
622 if ( mfn_valid((smfn = get_shadow_status(v, gw->l2mfn,
623 SH_type_l2_shadow))) )
624 (void) sh_audit_l2_table(v, smfn, _mfn(INVALID_MFN));
625 #if GUEST_PAGING_LEVELS == 3
626 if ( mfn_valid((smfn = get_shadow_status(v, gw->l2mfn,
627 SH_type_l2h_shadow))) )
628 (void) sh_audit_l2_table(v, smfn, _mfn(INVALID_MFN));
629 #endif
630 }
631 if ( mfn_valid(gw->l1mfn)
632 && mfn_valid((smfn = get_shadow_status(v, gw->l1mfn,
633 SH_type_l1_shadow))) )
634 (void) sh_audit_l1_table(v, smfn, _mfn(INVALID_MFN));
635 else if ( (guest_l2e_get_flags(gw->l2e) & _PAGE_PRESENT)
636 && (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE)
637 && mfn_valid(
638 (smfn = get_fl1_shadow_status(v, guest_l2e_get_gfn(gw->l2e)))) )
639 (void) sh_audit_fl1_table(v, smfn, _mfn(INVALID_MFN));
640 }
642 #else
643 #define sh_audit_gw(_v, _gw) do {} while(0)
644 #endif /* audit code */
647 #if (CONFIG_PAGING_LEVELS == GUEST_PAGING_LEVELS)
648 void *
649 sh_guest_map_l1e(struct vcpu *v, unsigned long addr,
650 unsigned long *gl1mfn)
651 {
652 void *pl1e = NULL;
653 walk_t gw;
655 ASSERT(shadow_mode_translate(v->domain));
657 // XXX -- this is expensive, but it's easy to cobble together...
658 // FIXME!
660 if ( guest_walk_tables(v, addr, &gw, PFEC_page_present) == 0
661 && mfn_valid(gw.l1mfn) )
662 {
663 if ( gl1mfn )
664 *gl1mfn = mfn_x(gw.l1mfn);
665 pl1e = map_domain_page(mfn_x(gw.l1mfn)) +
666 (guest_l1_table_offset(addr) * sizeof(guest_l1e_t));
667 }
669 return pl1e;
670 }
672 void
673 sh_guest_get_eff_l1e(struct vcpu *v, unsigned long addr, void *eff_l1e)
674 {
675 walk_t gw;
677 ASSERT(shadow_mode_translate(v->domain));
679 // XXX -- this is expensive, but it's easy to cobble together...
680 // FIXME!
682 (void) guest_walk_tables(v, addr, &gw, PFEC_page_present);
683 *(guest_l1e_t *)eff_l1e = gw.l1e;
684 }
685 #endif /* CONFIG == GUEST (== SHADOW) */
687 /**************************************************************************/
688 /* Functions to compute the correct index into a shadow page, given an
689 * index into the guest page (as returned by guest_get_index()).
690 * This is trivial when the shadow and guest use the same sized PTEs, but
691 * gets more interesting when those sizes are mismatched (e.g. 32-bit guest,
692 * PAE- or 64-bit shadows).
693 *
694 * These functions also increment the shadow mfn, when necessary. When PTE
695 * sizes are mismatched, it takes 2 shadow L1 pages for a single guest L1
696 * page. In this case, we allocate 2 contiguous pages for the shadow L1, and
697 * use simple pointer arithmetic on a pointer to the guest L1e to figure out
698 * which shadow page we really want. Similarly, when PTE sizes are
699 * mismatched, we shadow a guest L2 page with 4 shadow L2 pages. (The easiest
700 * way to see this is: a 32-bit guest L2 page maps 4GB of virtual address
701 * space, while a PAE- or 64-bit shadow L2 page maps 1GB of virtual address
702 * space.)
703 *
704 * For PAE guests, for every 32-bytes of guest L3 page table, we use 64-bytes
705 * of shadow (to store both the shadow, and the info that would normally be
706 * stored in page_info fields). This arrangement allows the shadow and the
707 * "page_info" fields to always be stored in the same page (in fact, in
708 * the same cache line), avoiding an extra call to map_domain_page().
709 */
711 static inline u32
712 guest_index(void *ptr)
713 {
714 return (u32)((unsigned long)ptr & ~PAGE_MASK) / sizeof(guest_l1e_t);
715 }
717 static u32
718 shadow_l1_index(mfn_t *smfn, u32 guest_index)
719 {
720 #if (GUEST_PAGING_LEVELS == 2)
721 *smfn = _mfn(mfn_x(*smfn) +
722 (guest_index / SHADOW_L1_PAGETABLE_ENTRIES));
723 return (guest_index % SHADOW_L1_PAGETABLE_ENTRIES);
724 #else
725 return guest_index;
726 #endif
727 }
729 static u32
730 shadow_l2_index(mfn_t *smfn, u32 guest_index)
731 {
732 #if (GUEST_PAGING_LEVELS == 2)
733 // Because we use 2 shadow l2 entries for each guest entry, the number of
734 // guest entries per shadow page is SHADOW_L2_PAGETABLE_ENTRIES/2
735 //
736 *smfn = _mfn(mfn_x(*smfn) +
737 (guest_index / (SHADOW_L2_PAGETABLE_ENTRIES / 2)));
739 // We multiply by two to get the index of the first of the two entries
740 // used to shadow the specified guest entry.
741 return (guest_index % (SHADOW_L2_PAGETABLE_ENTRIES / 2)) * 2;
742 #else
743 return guest_index;
744 #endif
745 }
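/* Worked example for the mismatched case (32-bit guest, PAE/64-bit
 * shadows, SHADOW_L2_PAGETABLE_ENTRIES == 512): guest l2 index 300
 * selects shadow page 300 / 256 = 1 of the four contiguous shadow l2
 * pages, and entry (300 % 256) * 2 = 88, the first of the pair of
 * shadow entries used for that guest entry. */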
747 #if GUEST_PAGING_LEVELS >= 4
749 static u32
750 shadow_l3_index(mfn_t *smfn, u32 guest_index)
751 {
752 return guest_index;
753 }
755 static u32
756 shadow_l4_index(mfn_t *smfn, u32 guest_index)
757 {
758 return guest_index;
759 }
761 #endif // GUEST_PAGING_LEVELS >= 4
764 /**************************************************************************/
765 /* Function which computes shadow entries from their corresponding guest
766 * entries. This is the "heart" of the shadow code. It operates using
767 * level-1 shadow types, but handles all levels of entry.
768 * Don't call it directly, but use the four wrappers below.
769 */
771 static always_inline void
772 _sh_propagate(struct vcpu *v,
773 guest_intpte_t guest_intpte,
774 mfn_t target_mfn,
775 void *shadow_entry_ptr,
776 int level,
777 fetch_type_t ft,
778 p2m_type_t p2mt)
779 {
780 guest_l1e_t guest_entry = { guest_intpte };
781 shadow_l1e_t *sp = shadow_entry_ptr;
782 struct domain *d = v->domain;
783 gfn_t target_gfn = guest_l1e_get_gfn(guest_entry);
784 u32 pass_thru_flags;
785 u32 gflags, sflags;
787 /* We don't shadow PAE l3s */
788 ASSERT(GUEST_PAGING_LEVELS > 3 || level != 3);
790 /* Check there's something for the shadows to map to */
791 if ( !p2m_is_valid(p2mt) )
792 {
793 *sp = shadow_l1e_empty();
794 goto done;
795 }
797 gflags = guest_l1e_get_flags(guest_entry);
799 if ( unlikely(!(gflags & _PAGE_PRESENT)) )
800 {
801 /* If a guest l1 entry is not present, shadow with the magic
802 * guest-not-present entry. */
803 if ( level == 1 )
804 *sp = sh_l1e_gnp();
805 else
806 *sp = shadow_l1e_empty();
807 goto done;
808 }
810 if ( level == 1 && p2mt == p2m_mmio_dm )
811 {
812 /* Guest l1e maps emulated MMIO space */
813 *sp = sh_l1e_mmio(target_gfn, gflags);
814 if ( !d->arch.paging.shadow.has_fast_mmio_entries )
815 d->arch.paging.shadow.has_fast_mmio_entries = 1;
816 goto done;
817 }
819 // Must have a valid target_mfn unless this is a prefetch or an l1
820 // pointing at MMIO space. In the case of a prefetch, an invalid
821 // mfn means that we can not usefully shadow anything, and so we
822 // return early.
823 //
824 if ( !mfn_valid(target_mfn)
825 && !(level == 1 && (!shadow_mode_refcounts(d)
826 || p2mt == p2m_mmio_direct)) )
827 {
828 ASSERT((ft == ft_prefetch));
829 *sp = shadow_l1e_empty();
830 goto done;
831 }
833 // Propagate bits from the guest to the shadow.
834 // Some of these may be overwritten, below.
835 // Since we know the guest's PRESENT bit is set, we also set the shadow's
836 // SHADOW_PRESENT bit.
837 //
838 pass_thru_flags = (_PAGE_ACCESSED | _PAGE_USER |
839 _PAGE_RW | _PAGE_PRESENT);
840 if ( guest_supports_nx(v) )
841 pass_thru_flags |= _PAGE_NX_BIT;
842 if ( !shadow_mode_refcounts(d) && !mfn_valid(target_mfn) )
843 pass_thru_flags |= _PAGE_PAT | _PAGE_PCD | _PAGE_PWT;
844 sflags = gflags & pass_thru_flags;
846 /*
847 * For HVM domains with direct access to MMIO areas, set the correct
848 * caching attributes in the shadows to match what was asked for.
849 */
850 if ( (level == 1) && is_hvm_domain(d) && has_arch_pdevs(d) &&
851 !is_xen_heap_mfn(mfn_x(target_mfn)) )
852 {
853 unsigned int type;
854 if ( hvm_get_mem_pinned_cacheattr(d, gfn_x(target_gfn), &type) )
855 sflags |= pat_type_2_pte_flags(type);
856 else if ( d->arch.hvm_domain.is_in_uc_mode )
857 sflags |= pat_type_2_pte_flags(PAT_TYPE_UNCACHABLE);
858 else
859 sflags |= get_pat_flags(v,
860 gflags,
861 gfn_to_paddr(target_gfn),
862 ((paddr_t)mfn_x(target_mfn)) << PAGE_SHIFT);
863 }
865 // Set the A&D bits for higher level shadows.
866 // Higher level entries do not, strictly speaking, have dirty bits, but
867 // since we use shadow linear tables, each of these entries may, at some
868 // point in time, also serve as a shadow L1 entry.
869 // By setting both the A&D bits in each of these, we eliminate the burden
870 // on the hardware to update these bits on initial accesses.
871 //
872 if ( (level > 1) && !((SHADOW_PAGING_LEVELS == 3) && (level == 3)) )
873 sflags |= _PAGE_ACCESSED | _PAGE_DIRTY;
875 // If the A or D bit has not yet been set in the guest, then we must
876 // prevent the corresponding kind of access.
877 //
878 if ( unlikely(!(gflags & _PAGE_ACCESSED)) )
879 sflags &= ~_PAGE_PRESENT;
881 /* D bits exist in L1es and PSE L2es */
882 if ( unlikely(((level == 1) ||
883 ((level == 2) &&
884 (gflags & _PAGE_PSE) &&
885 guest_supports_superpages(v)))
886 && !(gflags & _PAGE_DIRTY)) )
887 sflags &= ~_PAGE_RW;
889 // shadow_mode_log_dirty support
890 //
891 // Only allow the guest write access to a page a) on a demand fault,
892 // or b) if the page is already marked as dirty.
893 //
894 // (We handle log-dirty entirely inside the shadow code, without using the
895 // p2m_ram_logdirty p2m type: only HAP uses that.)
896 if ( unlikely((level == 1) && shadow_mode_log_dirty(d)) )
897 {
898 if ( mfn_valid(target_mfn) ) {
899 if ( ft & FETCH_TYPE_WRITE )
900 paging_mark_dirty(d, mfn_x(target_mfn));
901 else if ( !sh_mfn_is_dirty(d, target_mfn) )
902 sflags &= ~_PAGE_RW;
903 }
904 }
906 if ( unlikely((level == 1) && d->dirty_vram
907 && d->dirty_vram->last_dirty == -1
908 && gfn_x(target_gfn) >= d->dirty_vram->begin_pfn
909 && gfn_x(target_gfn) < d->dirty_vram->end_pfn) )
910 {
911 if ( ft & FETCH_TYPE_WRITE )
912 d->dirty_vram->last_dirty = NOW();
913 else
914 sflags &= ~_PAGE_RW;
915 }
917 /* Read-only memory */
918 if ( p2mt == p2m_ram_ro )
919 sflags &= ~_PAGE_RW;
921 // protect guest page tables
922 //
923 if ( unlikely((level == 1)
924 && sh_mfn_is_a_page_table(target_mfn)
925 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC )
926 /* Unless the page is out of sync and the guest is
927 writing to it. */
928 && !(mfn_oos_may_write(target_mfn)
929 && (ft == ft_demand_write))
930 #endif /* OOS */
931 ) )
932 {
933 if ( shadow_mode_trap_reads(d) )
934 {
935 // if we are trapping both reads & writes, then mark this page
936 // as not present...
937 //
938 sflags &= ~_PAGE_PRESENT;
939 }
940 else
941 {
942 // otherwise, just prevent any writes...
943 //
944 sflags &= ~_PAGE_RW;
945 }
946 }
948 // PV guests in 64-bit mode use two different page tables for user vs
949 // supervisor permissions, making the guest's _PAGE_USER bit irrelevant.
950 // It is always shadowed as present...
951 if ( (GUEST_PAGING_LEVELS == 4) && !is_pv_32on64_domain(d)
952 && !is_hvm_domain(d) )
953 {
954 sflags |= _PAGE_USER;
955 }
957 *sp = shadow_l1e_from_mfn(target_mfn, sflags);
959 done:
960 SHADOW_DEBUG(PROPAGATE,
961 "%s level %u guest %" SH_PRI_gpte " shadow %" SH_PRI_pte "\n",
962 fetch_type_names[ft], level, guest_entry.l1, sp->l1);
963 }
966 /* These four wrappers give us a little bit of type-safety back around
967 * the use of void-* pointers and intpte types in _sh_propagate(), and
968 * allow the compiler to optimize out some level checks. */
970 #if GUEST_PAGING_LEVELS >= 4
971 static void
972 l4e_propagate_from_guest(struct vcpu *v,
973 guest_l4e_t gl4e,
974 mfn_t sl3mfn,
975 shadow_l4e_t *sl4e,
976 fetch_type_t ft)
977 {
978 _sh_propagate(v, gl4e.l4, sl3mfn, sl4e, 4, ft, p2m_ram_rw);
979 }
981 static void
982 l3e_propagate_from_guest(struct vcpu *v,
983 guest_l3e_t gl3e,
984 mfn_t sl2mfn,
985 shadow_l3e_t *sl3e,
986 fetch_type_t ft)
987 {
988 _sh_propagate(v, gl3e.l3, sl2mfn, sl3e, 3, ft, p2m_ram_rw);
989 }
990 #endif // GUEST_PAGING_LEVELS >= 4
992 static void
993 l2e_propagate_from_guest(struct vcpu *v,
994 guest_l2e_t gl2e,
995 mfn_t sl1mfn,
996 shadow_l2e_t *sl2e,
997 fetch_type_t ft)
998 {
999 _sh_propagate(v, gl2e.l2, sl1mfn, sl2e, 2, ft, p2m_ram_rw);
1000 }
1002 static void
1003 l1e_propagate_from_guest(struct vcpu *v,
1004 guest_l1e_t gl1e,
1005 mfn_t gmfn,
1006 shadow_l1e_t *sl1e,
1007 fetch_type_t ft,
1008 p2m_type_t p2mt)
1009 {
1010 _sh_propagate(v, gl1e.l1, gmfn, sl1e, 1, ft, p2mt);
1011 }
1014 /**************************************************************************/
1015 /* These functions update shadow entries (and do bookkeeping on the shadow
1016 * tables they are in). It is intended that they are the only
1017 * functions which ever write (non-zero) data onto a shadow page.
1018 */
1020 static inline void safe_write_entry(void *dst, void *src)
1021 /* Copy one PTE safely when processors might be running on the
1022 * destination pagetable. This does *not* give safety against
1023 * concurrent writes (that's what the shadow lock is for), just
1024 * stops the hardware picking up partially written entries. */
1025 {
1026 volatile unsigned long *d = dst;
1027 unsigned long *s = src;
1028 ASSERT(!((unsigned long) d & (sizeof (shadow_l1e_t) - 1)));
1029 #if CONFIG_PAGING_LEVELS == 3
1030 /* In PAE mode, pagetable entries are larger
1031 * than machine words, so won't get written atomically. We need to make
1032 * sure any other cpu running on these shadows doesn't see a
1033 * half-written entry. Do this by marking the entry not-present first,
1034 * then writing the high word before the low word. */
1035 BUILD_BUG_ON(sizeof (shadow_l1e_t) != 2 * sizeof (unsigned long));
1036 d[0] = 0;
1037 d[1] = s[1];
1038 d[0] = s[0];
1039 #else
1040 /* In 64-bit, sizeof(pte) == sizeof(ulong) == 1 word,
1041 * which will be an atomic write, since the entry is aligned. */
1042 BUILD_BUG_ON(sizeof (shadow_l1e_t) != sizeof (unsigned long));
1043 *d = *s;
1044 #endif
1045 }
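/* The guarantee here is only about torn writes: a CPU walking these
 * shadows concurrently sees the old entry, a temporarily not-present
 * entry (at worst taking a spurious fault), or the complete new entry,
 * but never the old low word paired with the new high word. */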
1048 static inline void
1049 shadow_write_entries(void *d, void *s, int entries, mfn_t mfn)
1050 /* This function does the actual writes to shadow pages.
1051 * It must not be called directly, since it doesn't do the bookkeeping
1052 * that shadow_set_l*e() functions do. */
1053 {
1054 shadow_l1e_t *dst = d;
1055 shadow_l1e_t *src = s;
1056 void *map = NULL;
1057 int i;
1059 /* Because we mirror access rights at all levels in the shadow, an
1060 * l2 (or higher) entry with the RW bit cleared will leave us with
1061 * no write access through the linear map.
1062 * We detect that by writing to the shadow with copy_to_user() and
1063 * using map_domain_page() to get a writeable mapping if we need to. */
1064 if ( __copy_to_user(d, d, sizeof (unsigned long)) != 0 )
1065 {
1066 perfc_incr(shadow_linear_map_failed);
1067 map = sh_map_domain_page(mfn);
1068 ASSERT(map != NULL);
1069 dst = map + ((unsigned long)dst & (PAGE_SIZE - 1));
1070 }
1073 for ( i = 0; i < entries; i++ )
1074 safe_write_entry(dst++, src++);
1076 if ( map != NULL ) sh_unmap_domain_page(map);
1077 }
1079 static inline int
1080 perms_strictly_increased(u32 old_flags, u32 new_flags)
1081 /* Given the flags of two entries, are the new flags a strict
1082 * increase in rights over the old ones? */
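/* For example: old = P|RW and new = P|RW|USER is a strict increase
 * (the only changed bit, USER, is set in the new flags); old = P|USER
 * and new = P|RW is not (the changed bits include USER, which the new
 * flags lack). */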
1083 {
1084 u32 of = old_flags & (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX);
1085 u32 nf = new_flags & (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX);
1086 /* Flip the NX bit, since it's the only one that decreases rights;
1087 * we calculate as if it were an "X" bit. */
1088 of ^= _PAGE_NX_BIT;
1089 nf ^= _PAGE_NX_BIT;
1090 /* If the changed bits are all set in the new flags, then rights strictly
1091 * increased between old and new. */
1092 return ((of | (of ^ nf)) == nf);
1093 }
1095 static int inline
1096 shadow_get_page_from_l1e(shadow_l1e_t sl1e, struct domain *d)
1097 {
1098 int res;
1099 mfn_t mfn;
1100 struct domain *owner;
1102 ASSERT(!sh_l1e_is_magic(sl1e));
1104 if ( !shadow_mode_refcounts(d) )
1105 return 1;
1107 res = get_page_from_l1e(sl1e, d);
1109 // If a privileged domain is attempting to install a map of a page it does
1110 // not own, we let it succeed anyway.
1111 //
1112 if ( unlikely(!res) &&
1113 !shadow_mode_translate(d) &&
1114 mfn_valid(mfn = shadow_l1e_get_mfn(sl1e)) &&
1115 (owner = page_get_owner(mfn_to_page(mfn))) &&
1116 (d != owner) &&
1117 IS_PRIV_FOR(d, owner))
1118 {
1119 res = get_page_from_l1e(sl1e, owner);
1120 SHADOW_PRINTK("privileged domain %d installs map of mfn %05lx "
1121 "which is owned by domain %d: %s\n",
1122 d->domain_id, mfn_x(mfn), owner->domain_id,
1123 res ? "success" : "failed");
1124 }
1126 if ( unlikely(!res) )
1127 {
1128 perfc_incr(shadow_get_page_fail);
1129 SHADOW_PRINTK("failed: l1e=" SH_PRI_pte "\n");
1130 }
1132 return res;
1133 }
1135 static void inline
1136 shadow_put_page_from_l1e(shadow_l1e_t sl1e, struct domain *d)
1137 {
1138 if ( !shadow_mode_refcounts(d) )
1139 return;
1141 put_page_from_l1e(sl1e, d);
1142 }
1144 #if GUEST_PAGING_LEVELS >= 4
1145 static int shadow_set_l4e(struct vcpu *v,
1146 shadow_l4e_t *sl4e,
1147 shadow_l4e_t new_sl4e,
1148 mfn_t sl4mfn)
1150 int flags = 0, ok;
1151 shadow_l4e_t old_sl4e;
1152 paddr_t paddr;
1153 ASSERT(sl4e != NULL);
1154 old_sl4e = *sl4e;
1156 if ( old_sl4e.l4 == new_sl4e.l4 ) return 0; /* Nothing to do */
1158 paddr = ((((paddr_t)mfn_x(sl4mfn)) << PAGE_SHIFT)
1159 | (((unsigned long)sl4e) & ~PAGE_MASK));
1161 if ( shadow_l4e_get_flags(new_sl4e) & _PAGE_PRESENT )
1163 /* About to install a new reference */
1164 mfn_t sl3mfn = shadow_l4e_get_mfn(new_sl4e);
1165 ok = sh_get_ref(v, sl3mfn, paddr);
1166 /* Are we pinning l3 shadows to handle weird Linux behaviour? */
1167 if ( sh_type_is_pinnable(v, SH_type_l3_64_shadow) )
1168 ok |= sh_pin(v, sl3mfn);
1169 if ( !ok )
1171 domain_crash(v->domain);
1172 return SHADOW_SET_ERROR;
1174 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC )
1175 shadow_resync_all(v, 0);
1176 #endif
1179 /* Write the new entry */
1180 shadow_write_entries(sl4e, &new_sl4e, 1, sl4mfn);
1181 flags |= SHADOW_SET_CHANGED;
1183 if ( shadow_l4e_get_flags(old_sl4e) & _PAGE_PRESENT )
1185 /* We lost a reference to an old mfn. */
1186 mfn_t osl3mfn = shadow_l4e_get_mfn(old_sl4e);
1187 if ( (mfn_x(osl3mfn) != mfn_x(shadow_l4e_get_mfn(new_sl4e)))
1188 || !perms_strictly_increased(shadow_l4e_get_flags(old_sl4e),
1189 shadow_l4e_get_flags(new_sl4e)) )
1191 flags |= SHADOW_SET_FLUSH;
1193 sh_put_ref(v, osl3mfn, paddr);
1195 return flags;
1198 static int shadow_set_l3e(struct vcpu *v,
1199 shadow_l3e_t *sl3e,
1200 shadow_l3e_t new_sl3e,
1201 mfn_t sl3mfn)
1203 int flags = 0;
1204 shadow_l3e_t old_sl3e;
1205 paddr_t paddr;
1206 ASSERT(sl3e != NULL);
1207 old_sl3e = *sl3e;
1209 if ( old_sl3e.l3 == new_sl3e.l3 ) return 0; /* Nothing to do */
1211 paddr = ((((paddr_t)mfn_x(sl3mfn)) << PAGE_SHIFT)
1212 | (((unsigned long)sl3e) & ~PAGE_MASK));
1214 if ( shadow_l3e_get_flags(new_sl3e) & _PAGE_PRESENT )
1216 /* About to install a new reference */
1217 if ( !sh_get_ref(v, shadow_l3e_get_mfn(new_sl3e), paddr) )
1219 domain_crash(v->domain);
1220 return SHADOW_SET_ERROR;
1222 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC )
1223 shadow_resync_all(v, 0);
1224 #endif
1227 /* Write the new entry */
1228 shadow_write_entries(sl3e, &new_sl3e, 1, sl3mfn);
1229 flags |= SHADOW_SET_CHANGED;
1231 if ( shadow_l3e_get_flags(old_sl3e) & _PAGE_PRESENT )
1233 /* We lost a reference to an old mfn. */
1234 mfn_t osl2mfn = shadow_l3e_get_mfn(old_sl3e);
1235 if ( (mfn_x(osl2mfn) != mfn_x(shadow_l3e_get_mfn(new_sl3e))) ||
1236 !perms_strictly_increased(shadow_l3e_get_flags(old_sl3e),
1237 shadow_l3e_get_flags(new_sl3e)) )
1239 flags |= SHADOW_SET_FLUSH;
1241 sh_put_ref(v, osl2mfn, paddr);
1243 return flags;
1245 #endif /* GUEST_PAGING_LEVELS >= 4 */
1247 static int shadow_set_l2e(struct vcpu *v,
1248 shadow_l2e_t *sl2e,
1249 shadow_l2e_t new_sl2e,
1250 mfn_t sl2mfn)
1252 int flags = 0;
1253 shadow_l2e_t old_sl2e;
1254 paddr_t paddr;
1256 #if GUEST_PAGING_LEVELS == 2
1257 /* In 2-on-3 we work with pairs of l2es pointing at two-page
1258 * shadows. Reference counting and up-pointers track from the first
1259 * page of the shadow to the first l2e, so make sure that we're
1260 * working with those:
1261 * Align the pointer down so it's pointing at the first of the pair */
1262 sl2e = (shadow_l2e_t *)((unsigned long)sl2e & ~(sizeof(shadow_l2e_t)));
1263 /* Align the mfn of the shadow entry too */
1264 new_sl2e.l2 &= ~(1<<PAGE_SHIFT);
1265 #endif
1267 ASSERT(sl2e != NULL);
1268 old_sl2e = *sl2e;
1270 if ( old_sl2e.l2 == new_sl2e.l2 ) return 0; /* Nothing to do */
1272 paddr = ((((paddr_t)mfn_x(sl2mfn)) << PAGE_SHIFT)
1273 | (((unsigned long)sl2e) & ~PAGE_MASK));
1275 if ( shadow_l2e_get_flags(new_sl2e) & _PAGE_PRESENT )
1277 mfn_t sl1mfn = shadow_l2e_get_mfn(new_sl2e);
1279 /* About to install a new reference */
1280 if ( !sh_get_ref(v, sl1mfn, paddr) )
1282 domain_crash(v->domain);
1283 return SHADOW_SET_ERROR;
1285 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
1287 struct shadow_page_info *sp = mfn_to_shadow_page(sl1mfn);
1288 mfn_t gl1mfn = _mfn(sp->backpointer);
1290 /* If the shadow is a fl1 then the backpointer contains
1291 the GFN instead of the GMFN, and it's definitely not
1292 OOS. */
1293 if ( (sp->type != SH_type_fl1_shadow) && mfn_valid(gl1mfn)
1294 && mfn_is_out_of_sync(gl1mfn) )
1295 sh_resync(v, gl1mfn);
1297 #endif
1300 /* Write the new entry */
1301 #if GUEST_PAGING_LEVELS == 2
1303 shadow_l2e_t pair[2] = { new_sl2e, new_sl2e };
1304 /* The l1 shadow is two pages long and needs to be pointed to by
1305 * two adjacent l2es. The pair have the same flags, but point
1306 * at the even and odd MFNs of the two-page shadow. */
1307 ASSERT(!(pair[0].l2 & (1<<PAGE_SHIFT)));
1308 pair[1].l2 |= (1<<PAGE_SHIFT);
1309 shadow_write_entries(sl2e, &pair, 2, sl2mfn);
1311 #else /* normal case */
1312 shadow_write_entries(sl2e, &new_sl2e, 1, sl2mfn);
1313 #endif
1314 flags |= SHADOW_SET_CHANGED;
1316 if ( shadow_l2e_get_flags(old_sl2e) & _PAGE_PRESENT )
1318 /* We lost a reference to an old mfn. */
1319 mfn_t osl1mfn = shadow_l2e_get_mfn(old_sl2e);
1320 if ( (mfn_x(osl1mfn) != mfn_x(shadow_l2e_get_mfn(new_sl2e))) ||
1321 !perms_strictly_increased(shadow_l2e_get_flags(old_sl2e),
1322 shadow_l2e_get_flags(new_sl2e)) )
1324 flags |= SHADOW_SET_FLUSH;
1326 sh_put_ref(v, osl1mfn, paddr);
1328 return flags;
1331 static inline void shadow_vram_get_l1e(shadow_l1e_t new_sl1e,
1332 shadow_l1e_t *sl1e,
1333 mfn_t sl1mfn,
1334 struct domain *d)
1336 mfn_t mfn;
1337 unsigned long gfn;
1339 if ( !d->dirty_vram ) return;
1341 mfn = shadow_l1e_get_mfn(new_sl1e);
1343 if ( !mfn_valid(mfn) ) return; /* m2p for mmio_direct may not exist */
1345 gfn = mfn_to_gfn(d, mfn);
1347 if ( (gfn >= d->dirty_vram->begin_pfn) && (gfn < d->dirty_vram->end_pfn) ) {
1348 unsigned long i = gfn - d->dirty_vram->begin_pfn;
1349 struct page_info *page = mfn_to_page(mfn);
1350 u32 count_info = page->u.inuse.type_info & PGT_count_mask;
1352 if ( count_info == 1 )
1353 /* Initial guest reference, record it */
1354 d->dirty_vram->sl1ma[i] = pfn_to_paddr(mfn_x(sl1mfn))
1355 | ((unsigned long)sl1e & ~PAGE_MASK);
1359 static inline void shadow_vram_put_l1e(shadow_l1e_t old_sl1e,
1360 shadow_l1e_t *sl1e,
1361 mfn_t sl1mfn,
1362 struct domain *d)
1364 mfn_t mfn;
1365 unsigned long gfn;
1367 if ( !d->dirty_vram ) return;
1369 mfn = shadow_l1e_get_mfn(old_sl1e);
1371 if ( !mfn_valid(mfn) ) return;
1373 gfn = mfn_to_gfn(d, mfn);
1375 if ( (gfn >= d->dirty_vram->begin_pfn) && (gfn < d->dirty_vram->end_pfn) ) {
1376 unsigned long i = gfn - d->dirty_vram->begin_pfn;
1377 struct page_info *page = mfn_to_page(mfn);
1378 u32 count_info = page->u.inuse.type_info & PGT_count_mask;
1379 int dirty = 0;
1380 paddr_t sl1ma = pfn_to_paddr(mfn_x(sl1mfn))
1381 | ((unsigned long)sl1e & ~PAGE_MASK);
1383 if ( count_info == 1 ) {
1384 /* Last reference */
1385 if ( d->dirty_vram->sl1ma[i] == INVALID_PADDR ) {
1386 /* We didn't know it was that one, let's say it is dirty */
1387 dirty = 1;
1388 } else {
1389 ASSERT(d->dirty_vram->sl1ma[i] == sl1ma);
1390 d->dirty_vram->sl1ma[i] = INVALID_PADDR;
1391 if ( shadow_l1e_get_flags(old_sl1e) & _PAGE_DIRTY )
1392 dirty = 1;
1394 } else {
1395 /* We had more than one reference, just consider the page dirty. */
1396 dirty = 1;
1397 /* Check that it's not the one we recorded. */
1398 if ( d->dirty_vram->sl1ma[i] == sl1ma ) {
1399 /* Too bad, we remembered the wrong one... */
1400 d->dirty_vram->sl1ma[i] = INVALID_PADDR;
1401 } else {
1402 /* Ok, our recorded sl1e is still pointing to this page, let's
1403 * just hope it will remain. */
1406 if ( dirty ) {
1407 d->dirty_vram->dirty_bitmap[i / 8] |= 1 << (i % 8);
1408 d->dirty_vram->last_dirty = NOW();
1413 static int shadow_set_l1e(struct vcpu *v,
1414 shadow_l1e_t *sl1e,
1415 shadow_l1e_t new_sl1e,
1416 mfn_t sl1mfn)
1418 int flags = 0;
1419 struct domain *d = v->domain;
1420 shadow_l1e_t old_sl1e;
1421 #if SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC
1422 mfn_t new_gmfn = shadow_l1e_get_mfn(new_sl1e);
1423 #endif
1424 ASSERT(sl1e != NULL);
1426 #if SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC
1427 if ( mfn_valid(new_gmfn) && mfn_oos_may_write(new_gmfn)
1428 && ((shadow_l1e_get_flags(new_sl1e) & (_PAGE_RW|_PAGE_PRESENT))
1429 == (_PAGE_RW|_PAGE_PRESENT)) )
1430 oos_fixup_add(v, new_gmfn, sl1mfn, pgentry_ptr_to_slot(sl1e));
1431 #endif
1433 old_sl1e = *sl1e;
1435 if ( old_sl1e.l1 == new_sl1e.l1 ) return 0; /* Nothing to do */
1437 if ( (shadow_l1e_get_flags(new_sl1e) & _PAGE_PRESENT)
1438 && !sh_l1e_is_magic(new_sl1e) )
1440 /* About to install a new reference */
1441 if ( shadow_mode_refcounts(d) ) {
1442 TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_SHADOW_L1_GET_REF);
1443 if ( shadow_get_page_from_l1e(new_sl1e, d) == 0 )
1445 /* Doesn't look like a pagetable. */
1446 flags |= SHADOW_SET_ERROR;
1447 new_sl1e = shadow_l1e_empty();
1449 else
1451 shadow_vram_get_l1e(new_sl1e, sl1e, sl1mfn, d);
1456 /* Write the new entry */
1457 shadow_write_entries(sl1e, &new_sl1e, 1, sl1mfn);
1458 flags |= SHADOW_SET_CHANGED;
1460 if ( (shadow_l1e_get_flags(old_sl1e) & _PAGE_PRESENT)
1461 && !sh_l1e_is_magic(old_sl1e) )
1463 /* We lost a reference to an old mfn. */
1464 /* N.B. Unlike higher-level sets, never need an extra flush
1465 * when writing an l1e. Because it points to the same guest frame
1466 * as the guest l1e did, it's the guest's responsibility to
1467 * trigger a flush later. */
1468 if ( shadow_mode_refcounts(d) )
1470 shadow_vram_put_l1e(old_sl1e, sl1e, sl1mfn, d);
1471 shadow_put_page_from_l1e(old_sl1e, d);
1472 TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_SHADOW_L1_PUT_REF);
1475 return flags;
1479 /**************************************************************************/
1480 /* Macros to walk pagetables. These take the shadow of a pagetable and
1481 * walk every "interesting" entry. That is, they don't touch Xen mappings,
1482 * and for 32-bit l2s shadowed onto PAE or 64-bit, they only touch every
1483 * second entry (since pairs of entries are managed together). For multi-page
1484 * shadows they walk all pages.
1486 * Arguments are an MFN, the variable to point to each entry, a variable
1487 * to indicate that we are done (we will shortcut to the end of the scan
1488 * when _done != 0), a variable to indicate that we should avoid Xen mappings,
1489 * and the code.
1491 * WARNING: These macros have side-effects. They change the values of both
1492 * the pointer and the MFN. */
1494 static inline void increment_ptr_to_guest_entry(void *ptr)
1496 if ( ptr )
1498 guest_l1e_t **entry = ptr;
1499 (*entry)++;
1503 /* All kinds of l1: touch all entries */
1504 #define _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code) \
1505 do { \
1506 int _i; \
1507 shadow_l1e_t *_sp = sh_map_domain_page((_sl1mfn)); \
1508 ASSERT(mfn_to_shadow_page(_sl1mfn)->type == SH_type_l1_shadow \
1509 || mfn_to_shadow_page(_sl1mfn)->type == SH_type_fl1_shadow); \
1510 for ( _i = 0; _i < SHADOW_L1_PAGETABLE_ENTRIES; _i++ ) \
1511 { \
1512 (_sl1e) = _sp + _i; \
1513 if ( shadow_l1e_get_flags(*(_sl1e)) & _PAGE_PRESENT ) \
1514 {_code} \
1515 if ( _done ) break; \
1516 increment_ptr_to_guest_entry(_gl1p); \
1517 } \
1518 sh_unmap_domain_page(_sp); \
1519 } while (0)
1521 /* 32-bit l1, on PAE or 64-bit shadows: need to walk both pages of shadow */
1522 #if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2
1523 #define SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code) \
1524 do { \
1525 int __done = 0; \
1526 _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, \
1527 ({ (__done = _done); }), _code); \
1528 _sl1mfn = _mfn(mfn_x(_sl1mfn) + 1); \
1529 if ( !__done ) \
1530 _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, \
1531 ({ (__done = _done); }), _code); \
1532 } while (0)
1533 #else /* Everything else; l1 shadows are only one page */
1534 #define SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code) \
1535 _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code)
1536 #endif
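/* Illustrative sketch (added annotation, not in the original source)
 * of how these walkers are used elsewhere in this file: blank every
 * present entry of an l1 shadow, assuming sl1mfn is a valid l1/fl1
 * shadow mfn and v is the current vcpu:
 *
 *     shadow_l1e_t *sl1e;
 *     int done = 0;
 *     SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, done,
 *     {
 *         (void) shadow_set_l1e(v, sl1e, shadow_l1e_empty(), sl1mfn);
 *     });
 */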
1539 #if GUEST_PAGING_LEVELS == 2
1541 /* 32-bit l2 on PAE/64: four pages, touch every second entry, and avoid Xen */
1542 #define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _dom, _code) \
1543 do { \
1544 int _i, _j, __done = 0; \
1545 int _xen = !shadow_mode_external(_dom); \
1546 ASSERT(mfn_to_shadow_page(_sl2mfn)->type == SH_type_l2_32_shadow); \
1547 for ( _j = 0; _j < 4 && !__done; _j++ ) \
1548 { \
1549 shadow_l2e_t *_sp = sh_map_domain_page(_sl2mfn); \
1550 for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i += 2 ) \
1551 if ( (!(_xen)) \
1552 || ((_j * SHADOW_L2_PAGETABLE_ENTRIES) + _i) \
1553 < (HYPERVISOR_VIRT_START >> SHADOW_L2_PAGETABLE_SHIFT) ) \
1554 { \
1555 (_sl2e) = _sp + _i; \
1556 if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \
1557 {_code} \
1558 if ( (__done = (_done)) ) break; \
1559 increment_ptr_to_guest_entry(_gl2p); \
1560 } \
1561 sh_unmap_domain_page(_sp); \
1562 _sl2mfn = _mfn(mfn_x(_sl2mfn) + 1); \
1563 } \
1564 } while (0)
1566 #elif GUEST_PAGING_LEVELS == 3
1568 /* PAE: if it's an l2h, don't touch Xen mappings */
1569 #define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _dom, _code) \
1570 do { \
1571 int _i; \
1572 int _xen = !shadow_mode_external(_dom); \
1573 shadow_l2e_t *_sp = sh_map_domain_page((_sl2mfn)); \
1574 ASSERT(mfn_to_shadow_page(_sl2mfn)->type == SH_type_l2_pae_shadow \
1575 || mfn_to_shadow_page(_sl2mfn)->type == SH_type_l2h_pae_shadow);\
1576 for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i++ ) \
1577 if ( (!(_xen)) \
1578 || mfn_to_shadow_page(_sl2mfn)->type != SH_type_l2h_pae_shadow\
1579 || ((_i + (3 * SHADOW_L2_PAGETABLE_ENTRIES)) \
1580 < (HYPERVISOR_VIRT_START >> SHADOW_L2_PAGETABLE_SHIFT)) ) \
1581 { \
1582 (_sl2e) = _sp + _i; \
1583 if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \
1584 {_code} \
1585 if ( _done ) break; \
1586 increment_ptr_to_guest_entry(_gl2p); \
1587 } \
1588 sh_unmap_domain_page(_sp); \
1589 } while (0)
1591 #else
1593 /* 64-bit l2: touch all entries except for PAE compat guests. */
1594 #define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _dom, _code) \
1595 do { \
1596 int _i; \
1597 int _xen = !shadow_mode_external(_dom); \
1598 shadow_l2e_t *_sp = sh_map_domain_page((_sl2mfn)); \
1599 ASSERT(mfn_to_shadow_page(_sl2mfn)->type == SH_type_l2_64_shadow || \
1600 mfn_to_shadow_page(_sl2mfn)->type == SH_type_l2h_64_shadow); \
1601 for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i++ ) \
1602 { \
1603 if ( (!(_xen)) \
1604 || !is_pv_32on64_domain(_dom) \
1605 || mfn_to_shadow_page(_sl2mfn)->type != SH_type_l2h_64_shadow \
1606 || (_i < COMPAT_L2_PAGETABLE_FIRST_XEN_SLOT(_dom)) ) \
1607 { \
1608 (_sl2e) = _sp + _i; \
1609 if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \
1610 {_code} \
1611 if ( _done ) break; \
1612 increment_ptr_to_guest_entry(_gl2p); \
1613 } \
1614 } \
1615 sh_unmap_domain_page(_sp); \
1616 } while (0)
1618 #endif /* different kinds of l2 */
1620 #if GUEST_PAGING_LEVELS == 4
1622 /* 64-bit l3: touch all entries */
1623 #define SHADOW_FOREACH_L3E(_sl3mfn, _sl3e, _gl3p, _done, _code) \
1624 do { \
1625 int _i; \
1626 shadow_l3e_t *_sp = sh_map_domain_page((_sl3mfn)); \
1627 ASSERT(mfn_to_shadow_page(_sl3mfn)->type == SH_type_l3_64_shadow); \
1628 for ( _i = 0; _i < SHADOW_L3_PAGETABLE_ENTRIES; _i++ ) \
1629 { \
1630 (_sl3e) = _sp + _i; \
1631 if ( shadow_l3e_get_flags(*(_sl3e)) & _PAGE_PRESENT ) \
1632 {_code} \
1633 if ( _done ) break; \
1634 increment_ptr_to_guest_entry(_gl3p); \
1635 } \
1636 sh_unmap_domain_page(_sp); \
1637 } while (0)
1639 /* 64-bit l4: avoid Xen mappings */
1640 #define SHADOW_FOREACH_L4E(_sl4mfn, _sl4e, _gl4p, _done, _dom, _code) \
1641 do { \
1642 shadow_l4e_t *_sp = sh_map_domain_page((_sl4mfn)); \
1643 int _xen = !shadow_mode_external(_dom); \
1644 int _i; \
1645 ASSERT(mfn_to_shadow_page(_sl4mfn)->type == SH_type_l4_64_shadow); \
1646 for ( _i = 0; _i < SHADOW_L4_PAGETABLE_ENTRIES; _i++ ) \
1647 { \
1648 if ( (!(_xen)) || is_guest_l4_slot(_dom, _i) ) \
1649 { \
1650 (_sl4e) = _sp + _i; \
1651 if ( shadow_l4e_get_flags(*(_sl4e)) & _PAGE_PRESENT ) \
1652 {_code} \
1653 if ( _done ) break; \
1654 } \
1655 increment_ptr_to_guest_entry(_gl4p); \
1656 } \
1657 sh_unmap_domain_page(_sp); \
1658 } while (0)
1660 #endif
1664 /**************************************************************************/
1665 /* Functions to install Xen mappings and linear mappings in shadow pages */
1667 // XXX -- this function should probably be moved to shadow-common.c, but that
1668 // probably wants to wait until the shadow types have been moved from
1669 // shadow-types.h to shadow-private.h
1670 //
1671 #if CONFIG_PAGING_LEVELS == 4 && GUEST_PAGING_LEVELS == 4
1672 void sh_install_xen_entries_in_l4(struct vcpu *v, mfn_t gl4mfn, mfn_t sl4mfn)
1674 struct domain *d = v->domain;
1675 shadow_l4e_t *sl4e;
1677 sl4e = sh_map_domain_page(sl4mfn);
1678 ASSERT(sl4e != NULL);
1679 ASSERT(sizeof (l4_pgentry_t) == sizeof (shadow_l4e_t));
1681 /* Copy the common Xen mappings from the idle domain */
1682 memcpy(&sl4e[ROOT_PAGETABLE_FIRST_XEN_SLOT],
1683 &idle_pg_table[ROOT_PAGETABLE_FIRST_XEN_SLOT],
1684 ROOT_PAGETABLE_XEN_SLOTS * sizeof(l4_pgentry_t));
1686 /* Install the per-domain mappings for this domain */
1687 sl4e[shadow_l4_table_offset(PERDOMAIN_VIRT_START)] =
1688 shadow_l4e_from_mfn(page_to_mfn(virt_to_page(d->arch.mm_perdomain_l3)),
1689 __PAGE_HYPERVISOR);
1691 /* Shadow linear mapping for 4-level shadows. N.B. for 3-level
1692 * shadows on 64-bit xen, this linear mapping is later replaced by the
1693 * monitor pagetable structure, which is built in make_monitor_table
1694 * and maintained by sh_update_linear_entries. */
1695 sl4e[shadow_l4_table_offset(SH_LINEAR_PT_VIRT_START)] =
1696 shadow_l4e_from_mfn(sl4mfn, __PAGE_HYPERVISOR);
1698 /* Self linear mapping. */
1699 if ( shadow_mode_translate(v->domain) && !shadow_mode_external(v->domain) )
1701 // linear tables may not be used with translated PV guests
1702 sl4e[shadow_l4_table_offset(LINEAR_PT_VIRT_START)] =
1703 shadow_l4e_empty();
1705 else
1707 sl4e[shadow_l4_table_offset(LINEAR_PT_VIRT_START)] =
1708 shadow_l4e_from_mfn(gl4mfn, __PAGE_HYPERVISOR);
1711 if ( shadow_mode_translate(v->domain) )
1713 /* install domain-specific P2M table */
1714 sl4e[shadow_l4_table_offset(RO_MPT_VIRT_START)] =
1715 shadow_l4e_from_mfn(pagetable_get_mfn(d->arch.phys_table),
1716 __PAGE_HYPERVISOR);
1719 sh_unmap_domain_page(sl4e);
1721 #endif
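/* To summarise the 4-level case above: a fresh top-level shadow gets the
 * Xen slots copied from idle_pg_table, the per-domain mapping
 * (mm_perdomain_l3), a self-referencing entry for the shadow linear map,
 * either a linear map of the guest's own l4 (non-translated guests) or an
 * empty slot (translated PV guests), and, for translated guests, the
 * domain's p2m table in the RO_MPT slot.  The guest-visible parts of the
 * table are filled in later by the demand-fault and validate paths. */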
1723 #if CONFIG_PAGING_LEVELS >= 3 && GUEST_PAGING_LEVELS >= 3
1724 // For 3-on-3 PV guests, we need to make sure the xen mappings are in
1725 // place, which means that we need to populate the l2h entry in the l3
1726 // table.
1728 static void sh_install_xen_entries_in_l2h(struct vcpu *v, mfn_t sl2hmfn)
1730 struct domain *d = v->domain;
1731 shadow_l2e_t *sl2e;
1732 #if CONFIG_PAGING_LEVELS == 3
1733 int i;
1734 #else
1736 if ( !is_pv_32on64_vcpu(v) )
1737 return;
1738 #endif
1740 sl2e = sh_map_domain_page(sl2hmfn);
1741 ASSERT(sl2e != NULL);
1742 ASSERT(sizeof (l2_pgentry_t) == sizeof (shadow_l2e_t));
1744 #if CONFIG_PAGING_LEVELS == 3
1746 /* Copy the common Xen mappings from the idle domain */
1747 memcpy(&sl2e[L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1)],
1748 &idle_pg_table_l2[L2_PAGETABLE_FIRST_XEN_SLOT],
1749 L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
1751 /* Install the per-domain mappings for this domain */
1752 for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
1753 sl2e[shadow_l2_table_offset(PERDOMAIN_VIRT_START) + i] =
1754 shadow_l2e_from_mfn(
1755 page_to_mfn(virt_to_page(d->arch.mm_perdomain_pt) + i),
1756 __PAGE_HYPERVISOR);
1758 /* We don't set up a linear mapping here because we can't until this
1759 * l2h is installed in an l3e. sh_update_linear_entries() handles
1760 * the linear mappings when CR3 (and so the fourth l3e) is loaded.
1761 * We zero them here, just as a safety measure.
1762 */
1763 for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
1764 sl2e[shadow_l2_table_offset(LINEAR_PT_VIRT_START) + i] =
1765 shadow_l2e_empty();
1766 for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
1767 sl2e[shadow_l2_table_offset(SH_LINEAR_PT_VIRT_START) + i] =
1768 shadow_l2e_empty();
1770 if ( shadow_mode_translate(d) )
1772 /* Install the domain-specific p2m table */
1773 l3_pgentry_t *p2m;
1774 ASSERT(pagetable_get_pfn(d->arch.phys_table) != 0);
1775 p2m = sh_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
1776 for ( i = 0; i < MACHPHYS_MBYTES>>1; i++ )
1778 sl2e[shadow_l2_table_offset(RO_MPT_VIRT_START) + i] =
1779 (l3e_get_flags(p2m[i]) & _PAGE_PRESENT)
1780 ? shadow_l2e_from_mfn(_mfn(l3e_get_pfn(p2m[i])),
1781 __PAGE_HYPERVISOR)
1782 : shadow_l2e_empty();
1784 sh_unmap_domain_page(p2m);
1787 #else
1789 /* Copy the common Xen mappings from the idle domain */
1790 memcpy(
1791 &sl2e[COMPAT_L2_PAGETABLE_FIRST_XEN_SLOT(d)],
1792 &compat_idle_pg_table_l2[l2_table_offset(HIRO_COMPAT_MPT_VIRT_START)],
1793 COMPAT_L2_PAGETABLE_XEN_SLOTS(d) * sizeof(*sl2e));
1795 #endif
1797 sh_unmap_domain_page(sl2e);
1799 #endif
1805 /**************************************************************************/
1806 /* Create a shadow of a given guest page.
1807 */
1808 static mfn_t
1809 sh_make_shadow(struct vcpu *v, mfn_t gmfn, u32 shadow_type)
1811 mfn_t smfn = shadow_alloc(v->domain, shadow_type, mfn_x(gmfn));
1812 SHADOW_DEBUG(MAKE_SHADOW, "(%05lx, %u)=>%05lx\n",
1813 mfn_x(gmfn), shadow_type, mfn_x(smfn));
1815 if ( shadow_type != SH_type_l2_32_shadow
1816 && shadow_type != SH_type_l2_pae_shadow
1817 && shadow_type != SH_type_l2h_pae_shadow
1818 && shadow_type != SH_type_l4_64_shadow )
1819 /* Lower-level shadow, not yet linked from a higher level */
1820 mfn_to_shadow_page(smfn)->up = 0;
1822 #if GUEST_PAGING_LEVELS == 4
1823 #if (SHADOW_OPTIMIZATIONS & SHOPT_LINUX_L3_TOPLEVEL)
1824 if ( shadow_type == SH_type_l4_64_shadow &&
1825 unlikely(v->domain->arch.paging.shadow.opt_flags & SHOPT_LINUX_L3_TOPLEVEL) )
1827 /* We're shadowing a new l4, but we've been assuming the guest uses
1828 * only one l4 per vcpu and context switches using an l4 entry.
1829 * Count the number of active l4 shadows. If there are enough
1830 * of them, decide that this isn't an old linux guest, and stop
1831 * pinning l3es. This is not very quick but it doesn't happen
1832 * very often. */
1833 struct list_head *l, *t;
1834 struct shadow_page_info *sp;
1835 struct vcpu *v2;
1836 int l4count = 0, vcpus = 0;
1837 list_for_each(l, &v->domain->arch.paging.shadow.pinned_shadows)
1839 sp = list_entry(l, struct shadow_page_info, list);
1840 if ( sp->type == SH_type_l4_64_shadow )
1841 l4count++;
1843 for_each_vcpu ( v->domain, v2 )
1844 vcpus++;
1845 if ( l4count > 2 * vcpus )
1847 /* Unpin all the pinned l3 tables, and don't pin any more. */
1848 list_for_each_safe(l, t, &v->domain->arch.paging.shadow.pinned_shadows)
1850 sp = list_entry(l, struct shadow_page_info, list);
1851 if ( sp->type == SH_type_l3_64_shadow )
1852 sh_unpin(v, shadow_page_to_mfn(sp));
1854 v->domain->arch.paging.shadow.opt_flags &= ~SHOPT_LINUX_L3_TOPLEVEL;
1857 #endif
1858 #endif
1860 // Create the Xen mappings...
1861 if ( !shadow_mode_external(v->domain) )
1863 switch (shadow_type)
1865 #if CONFIG_PAGING_LEVELS == 4 && GUEST_PAGING_LEVELS == 4
1866 case SH_type_l4_shadow:
1867 sh_install_xen_entries_in_l4(v, gmfn, smfn); break;
1868 #endif
1869 #if CONFIG_PAGING_LEVELS >= 3 && GUEST_PAGING_LEVELS >= 3
1870 case SH_type_l2h_shadow:
1871 sh_install_xen_entries_in_l2h(v, smfn); break;
1872 #endif
1873 default: /* Do nothing */ break;
1877 shadow_promote(v, gmfn, shadow_type);
1878 set_shadow_status(v, gmfn, shadow_type, smfn);
1880 return smfn;
1883 /* Make a splintered superpage shadow */
1884 static mfn_t
1885 make_fl1_shadow(struct vcpu *v, gfn_t gfn)
1887 mfn_t smfn = shadow_alloc(v->domain, SH_type_fl1_shadow,
1888 (unsigned long) gfn_x(gfn));
1890 SHADOW_DEBUG(MAKE_SHADOW, "(%" SH_PRI_gfn ")=>%" PRI_mfn "\n",
1891 gfn_x(gfn), mfn_x(smfn));
1893 set_fl1_shadow_status(v, gfn, smfn);
1894 return smfn;
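/* Note that fl1 shadows are keyed by the *gfn* of the superpage
 * (set_fl1_shadow_status() takes a gfn, not an mfn), since there is no
 * guest l1 page backing them: the 4k entries are synthesised from the
 * superpage l2e.  A rough sketch of the consumer side, as used by
 * shadow_get_and_create_l1e() below:
 *
 *     gfn_t l2gfn = guest_l2e_get_gfn(gw->l2e);
 *     *sl1mfn = get_fl1_shadow_status(v, l2gfn);
 *     if ( !mfn_valid(*sl1mfn) )
 *         *sl1mfn = make_fl1_shadow(v, l2gfn);
 */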
1898 #if SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS
1899 mfn_t
1900 sh_make_monitor_table(struct vcpu *v)
1902 struct domain *d = v->domain;
1904 ASSERT(pagetable_get_pfn(v->arch.monitor_table) == 0);
1906 /* Guarantee we can get the memory we need */
1907 shadow_prealloc(d, SH_type_monitor_table, CONFIG_PAGING_LEVELS);
1909 #if CONFIG_PAGING_LEVELS == 4
1911 mfn_t m4mfn;
1912 m4mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1913 sh_install_xen_entries_in_l4(v, m4mfn, m4mfn);
1914 /* Remember the level of this table */
1915 mfn_to_page(m4mfn)->shadow_flags = 4;
1916 #if SHADOW_PAGING_LEVELS < 4
1918 mfn_t m3mfn, m2mfn;
1919 l4_pgentry_t *l4e;
1920 l3_pgentry_t *l3e;
1921 /* Install an l3 table and an l2 table that will hold the shadow
1922 * linear map entries. This overrides the linear map entry that
1923 * was installed by sh_install_xen_entries_in_l4. */
1924 l4e = sh_map_domain_page(m4mfn);
1926 m3mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1927 mfn_to_page(m3mfn)->shadow_flags = 3;
1928 l4e[shadow_l4_table_offset(SH_LINEAR_PT_VIRT_START)]
1929 = l4e_from_pfn(mfn_x(m3mfn), __PAGE_HYPERVISOR);
1931 m2mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1932 mfn_to_page(m2mfn)->shadow_flags = 2;
1933 l3e = sh_map_domain_page(m3mfn);
1934 l3e[0] = l3e_from_pfn(mfn_x(m2mfn), __PAGE_HYPERVISOR);
1935 sh_unmap_domain_page(l3e);
1937 if ( is_pv_32on64_vcpu(v) )
1939 /* For 32-on-64 PV guests, we need to map the 32-bit Xen
1940 * area into its usual VAs in the monitor tables */
1941 m3mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1942 mfn_to_page(m3mfn)->shadow_flags = 3;
1943 l4e[0] = l4e_from_pfn(mfn_x(m3mfn), __PAGE_HYPERVISOR);
1945 m2mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1946 mfn_to_page(m2mfn)->shadow_flags = 2;
1947 l3e = sh_map_domain_page(m3mfn);
1948 l3e[3] = l3e_from_pfn(mfn_x(m2mfn), _PAGE_PRESENT);
1949 sh_install_xen_entries_in_l2h(v, m2mfn);
1950 sh_unmap_domain_page(l3e);
1953 sh_unmap_domain_page(l4e);
1955 #endif /* SHADOW_PAGING_LEVELS < 4 */
1956 return m4mfn;
1959 #elif CONFIG_PAGING_LEVELS == 3
1962 mfn_t m3mfn, m2mfn;
1963 l3_pgentry_t *l3e;
1964 l2_pgentry_t *l2e;
1965 int i;
1967 m3mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1968 /* Remember the level of this table */
1969 mfn_to_page(m3mfn)->shadow_flags = 3;
1971 // Install a monitor l2 table in slot 3 of the l3 table.
1972 // This is used for all Xen entries, including linear maps
1973 m2mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1974 mfn_to_page(m2mfn)->shadow_flags = 2;
1975 l3e = sh_map_domain_page(m3mfn);
1976 l3e[3] = l3e_from_pfn(mfn_x(m2mfn), _PAGE_PRESENT);
1977 sh_install_xen_entries_in_l2h(v, m2mfn);
1978 /* Install the monitor's own linear map */
1979 l2e = sh_map_domain_page(m2mfn);
1980 for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
1981 l2e[l2_table_offset(LINEAR_PT_VIRT_START) + i] =
1982 (l3e_get_flags(l3e[i]) & _PAGE_PRESENT)
1983 ? l2e_from_pfn(l3e_get_pfn(l3e[i]), __PAGE_HYPERVISOR)
1984 : l2e_empty();
1985 sh_unmap_domain_page(l2e);
1986 sh_unmap_domain_page(l3e);
1988 SHADOW_PRINTK("new monitor table: %#lx\n", mfn_x(m3mfn));
1989 return m3mfn;
1992 #else
1993 #error this should not happen
1994 #endif /* CONFIG_PAGING_LEVELS */
1996 #endif /* SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS */
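/* Roughly, the three build cases above produce:
 *  - CONFIG_PAGING_LEVELS == 4, SHADOW_PAGING_LEVELS == 4: a single l4
 *    populated by sh_install_xen_entries_in_l4();
 *  - CONFIG_PAGING_LEVELS == 4, SHADOW_PAGING_LEVELS < 4: that l4 plus an
 *    l3/l2 pair for the shadow linear map, and for 32-on-64 PV guests a
 *    second l3/l2 pair mapping the compat Xen area at its usual VAs;
 *  - CONFIG_PAGING_LEVELS == 3: an l3 whose slot 3 holds an l2 with all
 *    the Xen entries and the monitor's own linear map.
 * Each monitor page records its level in shadow_flags, as noted above. */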
1998 /**************************************************************************/
1999 /* These functions also take a virtual address and return the level-N
2000 * shadow table mfn and entry, but they create the shadow pagetables if
2001 * they are needed. The fetch_type argument tells us whether we are handling
2002 * a demand fault (so we know what to do about accessed bits &c).
2003 * If the necessary tables are not present in the guest, they return NULL. */
2005 /* N.B. The use of GUEST_PAGING_LEVELS here is correct. If the shadow has
2006 * more levels than the guest, the upper levels are always fixed and do not
2007 * reflect any information from the guest, so we do not use these functions
2008 * to access them. */
2010 #if GUEST_PAGING_LEVELS >= 4
2011 static shadow_l4e_t * shadow_get_and_create_l4e(struct vcpu *v,
2012 walk_t *gw,
2013 mfn_t *sl4mfn)
2015 /* There is always a shadow of the top level table. Get it. */
2016 *sl4mfn = pagetable_get_mfn(v->arch.shadow_table[0]);
2017 /* Reading the top level table is always valid. */
2018 return sh_linear_l4_table(v) + shadow_l4_linear_offset(gw->va);
2021 static shadow_l3e_t * shadow_get_and_create_l3e(struct vcpu *v,
2022 walk_t *gw,
2023 mfn_t *sl3mfn,
2024 fetch_type_t ft)
2026 mfn_t sl4mfn;
2027 shadow_l4e_t *sl4e;
2028 if ( !mfn_valid(gw->l3mfn) ) return NULL; /* No guest page. */
2029 /* Get the l4e */
2030 sl4e = shadow_get_and_create_l4e(v, gw, &sl4mfn);
2031 ASSERT(sl4e != NULL);
2032 if ( shadow_l4e_get_flags(*sl4e) & _PAGE_PRESENT )
2034 *sl3mfn = shadow_l4e_get_mfn(*sl4e);
2035 ASSERT(mfn_valid(*sl3mfn));
2037 else
2039 int r;
2040 shadow_l4e_t new_sl4e;
2041 /* No l3 shadow installed: find and install it. */
2042 *sl3mfn = get_shadow_status(v, gw->l3mfn, SH_type_l3_shadow);
2043 if ( !mfn_valid(*sl3mfn) )
2045 /* No l3 shadow of this page exists at all: make one. */
2046 *sl3mfn = sh_make_shadow(v, gw->l3mfn, SH_type_l3_shadow);
2048 /* Install the new sl3 table in the sl4e */
2049 l4e_propagate_from_guest(v, gw->l4e, *sl3mfn, &new_sl4e, ft);
2050 r = shadow_set_l4e(v, sl4e, new_sl4e, sl4mfn);
2051 ASSERT((r & SHADOW_SET_FLUSH) == 0);
2052 if ( r & SHADOW_SET_ERROR )
2053 return NULL;
2055 /* Now follow it down a level. Guaranteed to succeed. */
2056 return sh_linear_l3_table(v) + shadow_l3_linear_offset(gw->va);
2058 #endif /* GUEST_PAGING_LEVELS >= 4 */
2061 static shadow_l2e_t * shadow_get_and_create_l2e(struct vcpu *v,
2062 walk_t *gw,
2063 mfn_t *sl2mfn,
2064 fetch_type_t ft)
2066 #if GUEST_PAGING_LEVELS >= 4 /* 64bit... */
2067 mfn_t sl3mfn = _mfn(INVALID_MFN);
2068 shadow_l3e_t *sl3e;
2069 if ( !mfn_valid(gw->l2mfn) ) return NULL; /* No guest page. */
2070 /* Get the l3e */
2071 sl3e = shadow_get_and_create_l3e(v, gw, &sl3mfn, ft);
2072 if ( sl3e == NULL ) return NULL;
2073 if ( shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT )
2075 *sl2mfn = shadow_l3e_get_mfn(*sl3e);
2076 ASSERT(mfn_valid(*sl2mfn));
2078 else
2080 int r;
2081 shadow_l3e_t new_sl3e;
2082 unsigned int t = SH_type_l2_shadow;
2084 /* Tag compat L2 containing hypervisor (m2p) mappings */
2085 if ( is_pv_32on64_domain(v->domain) &&
2086 guest_l4_table_offset(gw->va) == 0 &&
2087 guest_l3_table_offset(gw->va) == 3 )
2088 t = SH_type_l2h_shadow;
2090 /* No l2 shadow installed: find and install it. */
2091 *sl2mfn = get_shadow_status(v, gw->l2mfn, t);
2092 if ( !mfn_valid(*sl2mfn) )
2094 /* No l2 shadow of this page exists at all: make one. */
2095 *sl2mfn = sh_make_shadow(v, gw->l2mfn, t);
2097 /* Install the new sl2 table in the sl3e */
2098 l3e_propagate_from_guest(v, gw->l3e, *sl2mfn, &new_sl3e, ft);
2099 r = shadow_set_l3e(v, sl3e, new_sl3e, sl3mfn);
2100 ASSERT((r & SHADOW_SET_FLUSH) == 0);
2101 if ( r & SHADOW_SET_ERROR )
2102 return NULL;
2104 /* Now follow it down a level. Guaranteed to succeed. */
2105 return sh_linear_l2_table(v) + shadow_l2_linear_offset(gw->va);
2106 #elif GUEST_PAGING_LEVELS == 3 /* PAE... */
2107 /* We never demand-shadow PAE l3es: they are only created in
2108 * sh_update_cr3(). Check if the relevant sl3e is present. */
2109 shadow_l3e_t *sl3e = ((shadow_l3e_t *)&v->arch.paging.shadow.l3table)
2110 + shadow_l3_linear_offset(gw->va);
2111 if ( !(shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT) )
2112 return NULL;
2113 *sl2mfn = shadow_l3e_get_mfn(*sl3e);
2114 ASSERT(mfn_valid(*sl2mfn));
2115 return sh_linear_l2_table(v) + shadow_l2_linear_offset(gw->va);
2116 #else /* 32bit... */
2117 /* There is always a shadow of the top level table. Get it. */
2118 *sl2mfn = pagetable_get_mfn(v->arch.shadow_table[0]);
2119 /* This next line is important: the guest l2 has a 16k
2120 * shadow, so we need to return the right mfn of the four. This
2121 * call will set it for us as a side-effect. */
2122 (void) shadow_l2_index(sl2mfn, guest_l2_table_offset(gw->va));
2123 /* Reading the top level table is always valid. */
2124 return sh_linear_l2_table(v) + shadow_l2_linear_offset(gw->va);
2125 #endif
2129 static shadow_l1e_t * shadow_get_and_create_l1e(struct vcpu *v,
2130 walk_t *gw,
2131 mfn_t *sl1mfn,
2132 fetch_type_t ft)
2134 mfn_t sl2mfn;
2135 shadow_l2e_t *sl2e;
2137 /* Get the l2e */
2138 sl2e = shadow_get_and_create_l2e(v, gw, &sl2mfn, ft);
2139 if ( sl2e == NULL ) return NULL;
2140 /* Install the sl1 in the l2e if it wasn't there or if we need to
2141 * re-do it to fix a PSE dirty bit. */
2142 if ( shadow_l2e_get_flags(*sl2e) & _PAGE_PRESENT
2143 && likely(ft != ft_demand_write
2144 || (shadow_l2e_get_flags(*sl2e) & _PAGE_RW)
2145 || !(guest_l2e_get_flags(gw->l2e) & _PAGE_PSE)) )
2147 *sl1mfn = shadow_l2e_get_mfn(*sl2e);
2148 ASSERT(mfn_valid(*sl1mfn));
2150 else
2152 shadow_l2e_t new_sl2e;
2153 int r, flags = guest_l2e_get_flags(gw->l2e);
2154 /* No l1 shadow installed: find and install it. */
2155 if ( !(flags & _PAGE_PRESENT) )
2156 return NULL; /* No guest page. */
2157 if ( guest_supports_superpages(v) && (flags & _PAGE_PSE) )
2159 /* Splintering a superpage */
2160 gfn_t l2gfn = guest_l2e_get_gfn(gw->l2e);
2161 *sl1mfn = get_fl1_shadow_status(v, l2gfn);
2162 if ( !mfn_valid(*sl1mfn) )
2164 /* No fl1 shadow of this superpage exists at all: make one. */
2165 *sl1mfn = make_fl1_shadow(v, l2gfn);
2168 else
2170 /* Shadowing an actual guest l1 table */
2171 if ( !mfn_valid(gw->l1mfn) ) return NULL; /* No guest page. */
2172 *sl1mfn = get_shadow_status(v, gw->l1mfn, SH_type_l1_shadow);
2173 if ( !mfn_valid(*sl1mfn) )
2175 /* No l1 shadow of this page exists at all: make one. */
2176 *sl1mfn = sh_make_shadow(v, gw->l1mfn, SH_type_l1_shadow);
2179 /* Install the new sl1 table in the sl2e */
2180 l2e_propagate_from_guest(v, gw->l2e, *sl1mfn, &new_sl2e, ft);
2181 r = shadow_set_l2e(v, sl2e, new_sl2e, sl2mfn);
2182 ASSERT((r & SHADOW_SET_FLUSH) == 0);
2183 if ( r & SHADOW_SET_ERROR )
2184 return NULL;
2185 /* This next line is important: in 32-on-PAE and 32-on-64 modes,
2186 * the guest l1 table has an 8k shadow, and we need to return
2187 * the right mfn of the pair. This call will set it for us as a
2188 * side-effect. (In all other cases, it's a no-op and will be
2189 * compiled out.) */
2190 (void) shadow_l1_index(sl1mfn, guest_l1_table_offset(gw->va));
2192 /* Now follow it down a level. Guaranteed to succeed. */
2193 return sh_linear_l1_table(v) + shadow_l1_linear_offset(gw->va);
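/* An illustrative caller, following the pattern sh_page_fault() uses later
 * in this file (error handling elided):
 *
 *     ptr_sl1e = shadow_get_and_create_l1e(v, &gw, &sl1mfn, ft);
 *     l1e_propagate_from_guest(v, gw.l1e, gmfn, &sl1e, ft, p2mt);
 *     (void) shadow_set_l1e(v, ptr_sl1e, sl1e, sl1mfn);
 *
 * The returned pointer is a linear-map address and is always written
 * through shadow_set_l1e() rather than directly, so that reference counts
 * and TLB-flush decisions are handled in one place. */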
2198 /**************************************************************************/
2199 /* Destructors for shadow tables:
2200 * Unregister the shadow, decrement refcounts of any entries present in it,
2201 * and release the memory.
2203 * N.B. These destructors do not clear the contents of the shadows.
2204 * This allows us to delay TLB shootdowns until the page is being reused.
2205 * See shadow_alloc() and shadow_free() for how this is handled.
2206 */
2208 #if GUEST_PAGING_LEVELS >= 4
2209 void sh_destroy_l4_shadow(struct vcpu *v, mfn_t smfn)
2211 shadow_l4e_t *sl4e;
2212 u32 t = mfn_to_shadow_page(smfn)->type;
2213 mfn_t gmfn, sl4mfn;
2215 SHADOW_DEBUG(DESTROY_SHADOW,
2216 "%s(%05lx)\n", __func__, mfn_x(smfn));
2217 ASSERT(t == SH_type_l4_shadow);
2219 /* Record that the guest page isn't shadowed any more (in this type) */
2220 gmfn = _mfn(mfn_to_shadow_page(smfn)->backpointer);
2221 delete_shadow_status(v, gmfn, t, smfn);
2222 shadow_demote(v, gmfn, t);
2223 /* Decrement refcounts of all the old entries */
2224 sl4mfn = smfn;
2225 SHADOW_FOREACH_L4E(sl4mfn, sl4e, 0, 0, v->domain, {
2226 if ( shadow_l4e_get_flags(*sl4e) & _PAGE_PRESENT )
2228 sh_put_ref(v, shadow_l4e_get_mfn(*sl4e),
2229 (((paddr_t)mfn_x(sl4mfn)) << PAGE_SHIFT)
2230 | ((unsigned long)sl4e & ~PAGE_MASK));
2232 });
2234 /* Put the memory back in the pool */
2235 shadow_free(v->domain, smfn);
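/* The third argument to sh_put_ref() above is the physical address of the
 * entry that held the reference, i.e. (mfn of the shadow page << PAGE_SHIFT)
 * | (offset of the entry within the page); the l3 and l2 destructors below
 * use the same encoding. */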
2238 void sh_destroy_l3_shadow(struct vcpu *v, mfn_t smfn)
2240 shadow_l3e_t *sl3e;
2241 u32 t = mfn_to_shadow_page(smfn)->type;
2242 mfn_t gmfn, sl3mfn;
2244 SHADOW_DEBUG(DESTROY_SHADOW,
2245 "%s(%05lx)\n", __func__, mfn_x(smfn));
2246 ASSERT(t == SH_type_l3_shadow);
2248 /* Record that the guest page isn't shadowed any more (in this type) */
2249 gmfn = _mfn(mfn_to_shadow_page(smfn)->backpointer);
2250 delete_shadow_status(v, gmfn, t, smfn);
2251 shadow_demote(v, gmfn, t);
2253 /* Decrement refcounts of all the old entries */
2254 sl3mfn = smfn;
2255 SHADOW_FOREACH_L3E(sl3mfn, sl3e, 0, 0, {
2256 if ( shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT )
2257 sh_put_ref(v, shadow_l3e_get_mfn(*sl3e),
2258 (((paddr_t)mfn_x(sl3mfn)) << PAGE_SHIFT)
2259 | ((unsigned long)sl3e & ~PAGE_MASK));
2260 });
2262 /* Put the memory back in the pool */
2263 shadow_free(v->domain, smfn);
2265 #endif /* GUEST_PAGING_LEVELS >= 4 */
2268 void sh_destroy_l2_shadow(struct vcpu *v, mfn_t smfn)
2270 shadow_l2e_t *sl2e;
2271 u32 t = mfn_to_shadow_page(smfn)->type;
2272 mfn_t gmfn, sl2mfn;
2274 SHADOW_DEBUG(DESTROY_SHADOW,
2275 "%s(%05lx)\n", __func__, mfn_x(smfn));
2277 #if GUEST_PAGING_LEVELS >= 3
2278 ASSERT(t == SH_type_l2_shadow || t == SH_type_l2h_shadow);
2279 #else
2280 ASSERT(t == SH_type_l2_shadow);
2281 #endif
2283 /* Record that the guest page isn't shadowed any more (in this type) */
2284 gmfn = _mfn(mfn_to_shadow_page(smfn)->backpointer);
2285 delete_shadow_status(v, gmfn, t, smfn);
2286 shadow_demote(v, gmfn, t);
2288 /* Decrement refcounts of all the old entries */
2289 sl2mfn = smfn;
2290 SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, 0, v->domain, {
2291 if ( shadow_l2e_get_flags(*sl2e) & _PAGE_PRESENT )
2292 sh_put_ref(v, shadow_l2e_get_mfn(*sl2e),
2293 (((paddr_t)mfn_x(sl2mfn)) << PAGE_SHIFT)
2294 | ((unsigned long)sl2e & ~PAGE_MASK));
2295 });
2297 /* Put the memory back in the pool */
2298 shadow_free(v->domain, smfn);
2301 void sh_destroy_l1_shadow(struct vcpu *v, mfn_t smfn)
2303 struct domain *d = v->domain;
2304 shadow_l1e_t *sl1e;
2305 u32 t = mfn_to_shadow_page(smfn)->type;
2307 SHADOW_DEBUG(DESTROY_SHADOW,
2308 "%s(%05lx)\n", __func__, mfn_x(smfn));
2309 ASSERT(t == SH_type_l1_shadow || t == SH_type_fl1_shadow);
2311 /* Record that the guest page isn't shadowed any more (in this type) */
2312 if ( t == SH_type_fl1_shadow )
2314 gfn_t gfn = _gfn(mfn_to_shadow_page(smfn)->backpointer);
2315 delete_fl1_shadow_status(v, gfn, smfn);
2317 else
2319 mfn_t gmfn = _mfn(mfn_to_shadow_page(smfn)->backpointer);
2320 delete_shadow_status(v, gmfn, t, smfn);
2321 shadow_demote(v, gmfn, t);
2324 if ( shadow_mode_refcounts(d) )
2326 /* Decrement refcounts of all the old entries */
2327 mfn_t sl1mfn = smfn;
2328 SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, 0, {
2329 if ( (shadow_l1e_get_flags(*sl1e) & _PAGE_PRESENT)
2330 && !sh_l1e_is_magic(*sl1e) ) {
2331 shadow_vram_put_l1e(*sl1e, sl1e, sl1mfn, d);
2332 shadow_put_page_from_l1e(*sl1e, d);
2334 });
2337 /* Put the memory back in the pool */
2338 shadow_free(v->domain, smfn);
2341 #if SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS
2342 void sh_destroy_monitor_table(struct vcpu *v, mfn_t mmfn)
2344 struct domain *d = v->domain;
2345 ASSERT(mfn_to_shadow_page(mmfn)->type == SH_type_monitor_table);
2347 #if (CONFIG_PAGING_LEVELS == 4) && (SHADOW_PAGING_LEVELS != 4)
2349 mfn_t m3mfn;
2350 l4_pgentry_t *l4e = sh_map_domain_page(mmfn);
2351 l3_pgentry_t *l3e;
2352 int linear_slot = shadow_l4_table_offset(SH_LINEAR_PT_VIRT_START);
2354 /* Need to destroy the l3 and l2 monitor pages used
2355 * for the linear map */
2356 ASSERT(l4e_get_flags(l4e[linear_slot]) & _PAGE_PRESENT);
2357 m3mfn = _mfn(l4e_get_pfn(l4e[linear_slot]));
2358 l3e = sh_map_domain_page(m3mfn);
2359 ASSERT(l3e_get_flags(l3e[0]) & _PAGE_PRESENT);
2360 shadow_free(d, _mfn(l3e_get_pfn(l3e[0])));
2361 sh_unmap_domain_page(l3e);
2362 shadow_free(d, m3mfn);
2364 if ( is_pv_32on64_vcpu(v) )
2366 /* Need to destroy the l3 and l2 monitor pages that map the
2367 * Xen VAs at 3GB-4GB */
2368 ASSERT(l4e_get_flags(l4e[0]) & _PAGE_PRESENT);
2369 m3mfn = _mfn(l4e_get_pfn(l4e[0]));
2370 l3e = sh_map_domain_page(m3mfn);
2371 ASSERT(l3e_get_flags(l3e[3]) & _PAGE_PRESENT);
2372 shadow_free(d, _mfn(l3e_get_pfn(l3e[3])));
2373 sh_unmap_domain_page(l3e);
2374 shadow_free(d, m3mfn);
2376 sh_unmap_domain_page(l4e);
2378 #elif CONFIG_PAGING_LEVELS == 3
2379 /* Need to destroy the l2 monitor page in slot 4 too */
2381 l3_pgentry_t *l3e = sh_map_domain_page(mmfn);
2382 ASSERT(l3e_get_flags(l3e[3]) & _PAGE_PRESENT);
2383 shadow_free(d, _mfn(l3e_get_pfn(l3e[3])));
2384 sh_unmap_domain_page(l3e);
2386 #endif
2388 /* Put the memory back in the pool */
2389 shadow_free(d, mmfn);
2391 #endif
2393 /**************************************************************************/
2394 /* Functions to destroy non-Xen mappings in a pagetable hierarchy.
2395 * These are called from common code when we are running out of shadow
2396 * memory, and unpinning all the top-level shadows hasn't worked.
2398 * This implementation is pretty crude and slow, but we hope that it won't
2399 * be called very often. */
2401 #if GUEST_PAGING_LEVELS == 2
2403 void sh_unhook_32b_mappings(struct vcpu *v, mfn_t sl2mfn)
2405 shadow_l2e_t *sl2e;
2406 SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, 0, v->domain, {
2407 (void) shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn);
2408 });
2411 #elif GUEST_PAGING_LEVELS == 3
2413 void sh_unhook_pae_mappings(struct vcpu *v, mfn_t sl2mfn)
2414 /* Walk a PAE l2 shadow, unhooking entries from all the subshadows */
2416 shadow_l2e_t *sl2e;
2417 SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, 0, v->domain, {
2418 (void) shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn);
2419 });
2422 #elif GUEST_PAGING_LEVELS == 4
2424 void sh_unhook_64b_mappings(struct vcpu *v, mfn_t sl4mfn)
2426 shadow_l4e_t *sl4e;
2427 SHADOW_FOREACH_L4E(sl4mfn, sl4e, 0, 0, v->domain, {
2428 (void) shadow_set_l4e(v, sl4e, shadow_l4e_empty(), sl4mfn);
2429 });
2432 #endif
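/* All three unhook variants work the same way: every entry of the
 * top-level (or PAE l2) shadow is overwritten with an empty entry via
 * shadow_set_lNe(), so that the references those entries held are dropped
 * and the lower-level shadows can eventually be reclaimed -- which is the
 * point of calling this when shadow memory is exhausted. */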
2434 /**************************************************************************/
2435 /* Internal translation functions.
2436 * These functions require a pointer to the shadow entry that will be updated.
2437 */
2439 /* These functions take a new guest entry, translate it to shadow and write
2440 * the shadow entry.
2442 * They return the same bitmaps as the shadow_set_lXe() functions.
2443 */
2445 #if GUEST_PAGING_LEVELS >= 4
2446 static int validate_gl4e(struct vcpu *v, void *new_ge, mfn_t sl4mfn, void *se)
2448 shadow_l4e_t new_sl4e;
2449 guest_l4e_t new_gl4e = *(guest_l4e_t *)new_ge;
2450 shadow_l4e_t *sl4p = se;
2451 mfn_t sl3mfn = _mfn(INVALID_MFN);
2452 struct domain *d = v->domain;
2453 p2m_type_t p2mt;
2454 int result = 0;
2456 perfc_incr(shadow_validate_gl4e_calls);
2458 if ( guest_l4e_get_flags(new_gl4e) & _PAGE_PRESENT )
2460 gfn_t gl3gfn = guest_l4e_get_gfn(new_gl4e);
2461 mfn_t gl3mfn = gfn_to_mfn(d, gl3gfn, &p2mt);
2462 if ( p2m_is_ram(p2mt) )
2463 sl3mfn = get_shadow_status(v, gl3mfn, SH_type_l3_shadow);
2464 else
2465 result |= SHADOW_SET_ERROR;
2467 l4e_propagate_from_guest(v, new_gl4e, sl3mfn, &new_sl4e, ft_prefetch);
2469 // check for updates to xen reserved slots
2470 if ( !shadow_mode_external(d) )
2472 int shadow_index = (((unsigned long)sl4p & ~PAGE_MASK) /
2473 sizeof(shadow_l4e_t));
2474 int reserved_xen_slot = !is_guest_l4_slot(d, shadow_index);
2476 if ( unlikely(reserved_xen_slot) )
2478 // attempt by the guest to write to a xen reserved slot
2479 //
2480 SHADOW_PRINTK("%s out-of-range update "
2481 "sl4mfn=%05lx index=0x%x val=%" SH_PRI_pte "\n",
2482 __func__, mfn_x(sl4mfn), shadow_index, new_sl4e.l4);
2483 if ( shadow_l4e_get_flags(new_sl4e) & _PAGE_PRESENT )
2485 SHADOW_ERROR("out-of-range l4e update\n");
2486 result |= SHADOW_SET_ERROR;
2489 // do not call shadow_set_l4e...
2490 return result;
2494 result |= shadow_set_l4e(v, sl4p, new_sl4e, sl4mfn);
2495 return result;
2499 static int validate_gl3e(struct vcpu *v, void *new_ge, mfn_t sl3mfn, void *se)
2501 shadow_l3e_t new_sl3e;
2502 guest_l3e_t new_gl3e = *(guest_l3e_t *)new_ge;
2503 shadow_l3e_t *sl3p = se;
2504 mfn_t sl2mfn = _mfn(INVALID_MFN);
2505 p2m_type_t p2mt;
2506 int result = 0;
2508 perfc_incr(shadow_validate_gl3e_calls);
2510 if ( guest_l3e_get_flags(new_gl3e) & _PAGE_PRESENT )
2512 gfn_t gl2gfn = guest_l3e_get_gfn(new_gl3e);
2513 mfn_t gl2mfn = gfn_to_mfn(v->domain, gl2gfn, &p2mt);
2514 if ( p2m_is_ram(p2mt) )
2515 sl2mfn = get_shadow_status(v, gl2mfn, SH_type_l2_shadow);
2516 else
2517 result |= SHADOW_SET_ERROR;
2519 l3e_propagate_from_guest(v, new_gl3e, sl2mfn, &new_sl3e, ft_prefetch);
2520 result |= shadow_set_l3e(v, sl3p, new_sl3e, sl3mfn);
2522 return result;
2524 #endif // GUEST_PAGING_LEVELS >= 4
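/* validate_gl4e() and validate_gl3e() above, and validate_gl2e() and
 * validate_gl1e() below, all have the same shape: read the new guest
 * entry, look up the shadow of whatever it points at with
 * get_shadow_status(), build the matching shadow entry with
 * lNe_propagate_from_guest(), and write it with shadow_set_lNe() -- except
 * that writes which would land in a PV guest's Xen-reserved slots are
 * never written to the shadow (and set SHADOW_SET_ERROR in the result if
 * the new entry was present). */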
2526 static int validate_gl2e(struct vcpu *v, void *new_ge, mfn_t sl2mfn, void *se)
2528 shadow_l2e_t new_sl2e;
2529 guest_l2e_t new_gl2e = *(guest_l2e_t *)new_ge;
2530 shadow_l2e_t *sl2p = se;
2531 mfn_t sl1mfn = _mfn(INVALID_MFN);
2532 p2m_type_t p2mt;
2533 int result = 0;
2535 perfc_incr(shadow_validate_gl2e_calls);
2537 if ( guest_l2e_get_flags(new_gl2e) & _PAGE_PRESENT )
2539 gfn_t gl1gfn = guest_l2e_get_gfn(new_gl2e);
2540 if ( guest_supports_superpages(v) &&
2541 (guest_l2e_get_flags(new_gl2e) & _PAGE_PSE) )
2543 // superpage -- need to look up the shadow L1 which holds the
2544 // splitters...
2545 sl1mfn = get_fl1_shadow_status(v, gl1gfn);
2546 #if 0
2547 // XXX - it's possible that we want to do some kind of prefetch
2548 // for superpage fl1's here, but this is *not* on the demand path,
2549 // so we'll hold off trying that for now...
2550 //
2551 if ( !mfn_valid(sl1mfn) )
2552 sl1mfn = make_fl1_shadow(v, gl1gfn);
2553 #endif
2555 else
2557 mfn_t gl1mfn = gfn_to_mfn(v->domain, gl1gfn, &p2mt);
2558 if ( p2m_is_ram(p2mt) )
2559 sl1mfn = get_shadow_status(v, gl1mfn, SH_type_l1_shadow);
2560 else
2561 result |= SHADOW_SET_ERROR;
2564 l2e_propagate_from_guest(v, new_gl2e, sl1mfn, &new_sl2e, ft_prefetch);
2566 // check for updates to xen reserved slots in PV guests...
2567 // XXX -- need to revisit this for PV 3-on-4 guests.
2568 //
2569 #if SHADOW_PAGING_LEVELS < 4
2570 #if CONFIG_PAGING_LEVELS == SHADOW_PAGING_LEVELS
2571 if ( !shadow_mode_external(v->domain) )
2573 int shadow_index = (((unsigned long)sl2p & ~PAGE_MASK) /
2574 sizeof(shadow_l2e_t));
2575 int reserved_xen_slot;
2577 #if SHADOW_PAGING_LEVELS == 3
2578 reserved_xen_slot =
2579 ((mfn_to_shadow_page(sl2mfn)->type == SH_type_l2h_pae_shadow) &&
2580 (shadow_index
2581 >= (L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1))));
2582 #else /* SHADOW_PAGING_LEVELS == 2 */
2583 reserved_xen_slot = (shadow_index >= L2_PAGETABLE_FIRST_XEN_SLOT);
2584 #endif
2586 if ( unlikely(reserved_xen_slot) )
2588 // attempt by the guest to write to a xen reserved slot
2589 //
2590 SHADOW_PRINTK("%s out-of-range update "
2591 "sl2mfn=%05lx index=0x%x val=%" SH_PRI_pte "\n",
2592 __func__, mfn_x(sl2mfn), shadow_index, new_sl2e.l2);
2593 if ( shadow_l2e_get_flags(new_sl2e) & _PAGE_PRESENT )
2595 SHADOW_ERROR("out-of-range l2e update\n");
2596 result |= SHADOW_SET_ERROR;
2599 // do not call shadow_set_l2e...
2600 return result;
2603 #endif /* CONFIG_PAGING_LEVELS == SHADOW_PAGING_LEVELS */
2604 #endif /* SHADOW_PAGING_LEVELS < 4 */
2606 result |= shadow_set_l2e(v, sl2p, new_sl2e, sl2mfn);
2608 return result;
2611 static int validate_gl1e(struct vcpu *v, void *new_ge, mfn_t sl1mfn, void *se)
2613 shadow_l1e_t new_sl1e;
2614 guest_l1e_t new_gl1e = *(guest_l1e_t *)new_ge;
2615 shadow_l1e_t *sl1p = se;
2616 gfn_t gfn;
2617 mfn_t gmfn;
2618 p2m_type_t p2mt;
2619 int result = 0;
2620 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
2621 mfn_t gl1mfn;
2622 #endif /* OOS */
2624 perfc_incr(shadow_validate_gl1e_calls);
2626 gfn = guest_l1e_get_gfn(new_gl1e);
2627 gmfn = gfn_to_mfn(v->domain, gfn, &p2mt);
2629 l1e_propagate_from_guest(v, new_gl1e, gmfn, &new_sl1e, ft_prefetch, p2mt);
2630 result |= shadow_set_l1e(v, sl1p, new_sl1e, sl1mfn);
2632 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
2633 gl1mfn = _mfn(mfn_to_shadow_page(sl1mfn)->backpointer);
2634 if ( mfn_valid(gl1mfn)
2635 && mfn_is_out_of_sync(gl1mfn) )
2637 /* Update the OOS snapshot. */
2638 mfn_t snpmfn = oos_snapshot_lookup(v, gl1mfn);
2639 guest_l1e_t *snp;
2641 ASSERT(mfn_valid(snpmfn));
2643 snp = sh_map_domain_page(snpmfn);
2644 snp[guest_index(new_ge)] = new_gl1e;
2645 sh_unmap_domain_page(snp);
2647 #endif /* OOS */
2649 return result;
2652 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
2653 /**************************************************************************/
2654 /* Special validation function for re-syncing out-of-sync shadows.
2655 * Walks the *shadow* page, and for every entry that it finds,
2656 * revalidates the guest entry that corresponds to it.
2657 * N.B. This function is called with the vcpu that unsynced the page,
2658 * *not* the one that is causing it to be resynced. */
2659 void sh_resync_l1(struct vcpu *v, mfn_t gl1mfn, mfn_t snpmfn)
2661 mfn_t sl1mfn;
2662 shadow_l1e_t *sl1p;
2663 guest_l1e_t *gl1p, *gp, *snp;
2664 int rc = 0;
2666 ASSERT(mfn_valid(snpmfn));
2668 sl1mfn = get_shadow_status(v, gl1mfn, SH_type_l1_shadow);
2669 ASSERT(mfn_valid(sl1mfn)); /* Otherwise we would not have been called */
2671 snp = sh_map_domain_page(snpmfn);
2672 gp = sh_map_domain_page(gl1mfn);
2673 gl1p = gp;
2675 SHADOW_FOREACH_L1E(sl1mfn, sl1p, &gl1p, 0, {
2676 guest_l1e_t gl1e = *gl1p;
2677 guest_l1e_t *snpl1p = (guest_l1e_t *)snp + guest_index(gl1p);
2679 if ( memcmp(snpl1p, &gl1e, sizeof(gl1e)) )
2681 gfn_t gfn;
2682 mfn_t gmfn;
2683 p2m_type_t p2mt;
2684 shadow_l1e_t nsl1e;
2686 gfn = guest_l1e_get_gfn(gl1e);
2687 gmfn = gfn_to_mfn(v->domain, gfn, &p2mt);
2688 l1e_propagate_from_guest(v, gl1e, gmfn, &nsl1e, ft_prefetch, p2mt);
2689 rc |= shadow_set_l1e(v, sl1p, nsl1e, sl1mfn);
2691 *snpl1p = gl1e;
2693 });
2695 sh_unmap_domain_page(gp);
2696 sh_unmap_domain_page(snp);
2698 /* Setting shadow L1 entries should never need us to flush the TLB */
2699 ASSERT(!(rc & SHADOW_SET_FLUSH));
2702 /* Figure out whether it's definitely safe not to sync this l1 table.
2703 * That is: if we can tell that it's only used once, and that the
2704 * toplevel shadow responsible is not one of ours.
2705 * N.B. This function is called with the vcpu that required the resync,
2706 * *not* the one that originally unsynced the page, but it is
2707 * called in the *mode* of the vcpu that unsynced it. Clear? Good. */
2708 int sh_safe_not_to_sync(struct vcpu *v, mfn_t gl1mfn)
2710 struct shadow_page_info *sp;
2711 mfn_t smfn;
2713 smfn = get_shadow_status(v, gl1mfn, SH_type_l1_shadow);
2714 ASSERT(mfn_valid(smfn)); /* Otherwise we would not have been called */
2716 /* Up to l2 */
2717 sp = mfn_to_shadow_page(smfn);
2718 if ( sp->count != 1 || !sp->up )
2719 return 0;
2720 smfn = _mfn(sp->up >> PAGE_SHIFT);
2721 ASSERT(mfn_valid(smfn));
2723 #if (SHADOW_PAGING_LEVELS == 4)
2724 /* up to l3 */
2725 sp = mfn_to_shadow_page(smfn);
2726 if ( sp->count != 1 || !sp->up )
2727 return 0;
2728 smfn = _mfn(sp->up >> PAGE_SHIFT);
2729 ASSERT(mfn_valid(smfn));
2731 /* up to l4 */
2732 sp = mfn_to_shadow_page(smfn);
2733 if ( sp->count != 1
2734 || sh_type_is_pinnable(v, SH_type_l3_64_shadow) || !sp->up )
2735 return 0;
2736 smfn = _mfn(sp->up >> PAGE_SHIFT);
2737 ASSERT(mfn_valid(smfn));
2739 #if (GUEST_PAGING_LEVELS == 2)
2740 /* In 2-on-3 shadow mode the up pointer contains the link to the
2741 * shadow page, but the shadow_table contains only the first of the
2742 * four pages that make up the PAE top shadow tables. */
2743 smfn = _mfn(mfn_x(smfn) & ~0x3UL);
2744 #endif
2746 #endif
2748 if ( pagetable_get_pfn(v->arch.shadow_table[0]) == mfn_x(smfn)
2749 #if (SHADOW_PAGING_LEVELS == 3)
2750 || pagetable_get_pfn(v->arch.shadow_table[1]) == mfn_x(smfn)
2751 || pagetable_get_pfn(v->arch.shadow_table[2]) == mfn_x(smfn)
2752 || pagetable_get_pfn(v->arch.shadow_table[3]) == mfn_x(smfn)
2753 #endif
2755 return 0;
2757 /* Only in use in one toplevel shadow, and it's not the one we're
2758 * running on */
2759 return 1;
2761 #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) */
2764 /**************************************************************************/
2765 /* Functions which translate and install the shadows of arbitrary guest
2766 * entries that we have just seen the guest write. */
2769 static inline int
2770 sh_map_and_validate(struct vcpu *v, mfn_t gmfn,
2771 void *new_gp, u32 size, u32 sh_type,
2772 u32 (*shadow_index)(mfn_t *smfn, u32 idx),
2773 int (*validate_ge)(struct vcpu *v, void *ge,
2774 mfn_t smfn, void *se))
2775 /* Generic function for mapping and validating. */
2777 mfn_t smfn, smfn2, map_mfn;
2778 shadow_l1e_t *sl1p;
2779 u32 shadow_idx, guest_idx;
2780 int result = 0;
2782 /* Align address and size to guest entry boundaries */
2783 size += (unsigned long)new_gp & (sizeof (guest_l1e_t) - 1);
2784 new_gp = (void *)((unsigned long)new_gp & ~(sizeof (guest_l1e_t) - 1));
2785 size = (size + sizeof (guest_l1e_t) - 1) & ~(sizeof (guest_l1e_t) - 1);
2786 ASSERT(size + (((unsigned long)new_gp) & ~PAGE_MASK) <= PAGE_SIZE);
2788 /* Map the shadow page */
2789 smfn = get_shadow_status(v, gmfn, sh_type);
2790 ASSERT(mfn_valid(smfn)); /* Otherwise we would not have been called */
2791 guest_idx = guest_index(new_gp);
2792 map_mfn = smfn;
2793 shadow_idx = shadow_index(&map_mfn, guest_idx);
2794 sl1p = sh_map_domain_page(map_mfn);
2796 /* Validate one entry at a time */
2797 while ( size )
2799 smfn2 = smfn;
2800 guest_idx = guest_index(new_gp);
2801 shadow_idx = shadow_index(&smfn2, guest_idx);
2802 if ( mfn_x(smfn2) != mfn_x(map_mfn) )
2804 /* We have moved to another page of the shadow */
2805 map_mfn = smfn2;
2806 sh_unmap_domain_page(sl1p);
2807 sl1p = sh_map_domain_page(map_mfn);
2809 result |= validate_ge(v,
2810 new_gp,
2811 map_mfn,
2812 &sl1p[shadow_idx]);
2813 size -= sizeof(guest_l1e_t);
2814 new_gp += sizeof(guest_l1e_t);
2816 sh_unmap_domain_page(sl1p);
2817 return result;
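/* The wrappers below just pick the shadow type, index function and
 * per-entry validator for each level.  A minimal sketch of a caller,
 * assuming it already has the guest pagetable mfn and the bytes that were
 * written (the variable names here are illustrative only):
 *
 *     rc = sh_map_and_validate_gl1e(v, gl1mfn, written_ptr, written_bytes);
 *     if ( rc & SHADOW_SET_FLUSH )
 *         flush_tlb_mask(d->domain_dirty_cpumask);
 *
 * The return value is the same bitmap that the shadow_set_lXe() functions
 * produce, as noted earlier. */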
2821 int
2822 sh_map_and_validate_gl4e(struct vcpu *v, mfn_t gl4mfn,
2823 void *new_gl4p, u32 size)
2825 #if GUEST_PAGING_LEVELS >= 4
2826 return sh_map_and_validate(v, gl4mfn, new_gl4p, size,
2827 SH_type_l4_shadow,
2828 shadow_l4_index,
2829 validate_gl4e);
2830 #else // ! GUEST_PAGING_LEVELS >= 4
2831 SHADOW_ERROR("called in wrong paging mode!\n");
2832 BUG();
2833 return 0;
2834 #endif
2837 int
2838 sh_map_and_validate_gl3e(struct vcpu *v, mfn_t gl3mfn,
2839 void *new_gl3p, u32 size)
2841 #if GUEST_PAGING_LEVELS >= 4
2842 return sh_map_and_validate(v, gl3mfn, new_gl3p, size,
2843 SH_type_l3_shadow,
2844 shadow_l3_index,
2845 validate_gl3e);
2846 #else // ! GUEST_PAGING_LEVELS >= 4
2847 SHADOW_ERROR("called in wrong paging mode!\n");
2848 BUG();
2849 return 0;
2850 #endif
2853 int
2854 sh_map_and_validate_gl2e(struct vcpu *v, mfn_t gl2mfn,
2855 void *new_gl2p, u32 size)
2857 return sh_map_and_validate(v, gl2mfn, new_gl2p, size,
2858 SH_type_l2_shadow,
2859 shadow_l2_index,
2860 validate_gl2e);
2863 int
2864 sh_map_and_validate_gl2he(struct vcpu *v, mfn_t gl2mfn,
2865 void *new_gl2p, u32 size)
2867 #if GUEST_PAGING_LEVELS >= 3
2868 return sh_map_and_validate(v, gl2mfn, new_gl2p, size,
2869 SH_type_l2h_shadow,
2870 shadow_l2_index,
2871 validate_gl2e);
2872 #else /* Non-PAE guests don't have different kinds of l2 table */
2873 SHADOW_ERROR("called in wrong paging mode!\n");
2874 BUG();
2875 return 0;
2876 #endif
2879 int
2880 sh_map_and_validate_gl1e(struct vcpu *v, mfn_t gl1mfn,
2881 void *new_gl1p, u32 size)
2883 return sh_map_and_validate(v, gl1mfn, new_gl1p, size,
2884 SH_type_l1_shadow,
2885 shadow_l1_index,
2886 validate_gl1e);
2890 /**************************************************************************/
2891 /* Optimization: If we see two emulated writes of zeros to the same
2892 * page-table without another kind of page fault in between, we guess
2893 * that this is a batch of changes (for process destruction) and
2894 * unshadow the page so we don't take a pagefault on every entry. This
2895 * should also make finding writeable mappings of pagetables much
2896 * easier. */
2898 /* Look to see if this is the second emulated write in a row to this
2899 * page, and unshadow if it is */
2900 static inline void check_for_early_unshadow(struct vcpu *v, mfn_t gmfn)
2902 #if SHADOW_OPTIMIZATIONS & SHOPT_EARLY_UNSHADOW
2903 if ( v->arch.paging.shadow.last_emulated_mfn_for_unshadow == mfn_x(gmfn)
2904 && sh_mfn_is_a_page_table(gmfn) )
2906 perfc_incr(shadow_early_unshadow);
2907 sh_remove_shadows(v, gmfn, 1, 0 /* Fast, can fail to unshadow */ );
2908 TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_EARLY_UNSHADOW);
2910 v->arch.paging.shadow.last_emulated_mfn_for_unshadow = mfn_x(gmfn);
2911 #endif
2914 /* Stop counting towards early unshadows, as we've seen a real page fault */
2915 static inline void reset_early_unshadow(struct vcpu *v)
2917 #if SHADOW_OPTIMIZATIONS & SHOPT_EARLY_UNSHADOW
2918 v->arch.paging.shadow.last_emulated_mfn_for_unshadow = INVALID_MFN;
2919 #endif
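/* Worked example of the heuristic above: a process teardown that zeroes a
 * pagetable page triggers one emulated write (which records the mfn), then
 * a second emulated write to the same mfn with no other page fault in
 * between, at which point check_for_early_unshadow() calls
 * sh_remove_shadows() and the rest of the zeroing proceeds without
 * faulting.  Any ordinary page fault calls reset_early_unshadow() and the
 * count starts again. */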
2924 /**************************************************************************/
2925 /* Optimization: Prefetch multiple L1 entries. This is called after we have
2926 * demand-faulted a shadow l1e in the fault handler, to see if it's
2927 * worth fetching some more.
2928 */
2930 #if SHADOW_OPTIMIZATIONS & SHOPT_PREFETCH
2932 /* XXX magic number */
2933 #define PREFETCH_DISTANCE 32
2935 static void sh_prefetch(struct vcpu *v, walk_t *gw,
2936 shadow_l1e_t *ptr_sl1e, mfn_t sl1mfn)
2938 int i, dist;
2939 gfn_t gfn;
2940 mfn_t gmfn;
2941 guest_l1e_t *gl1p = NULL, gl1e;
2942 shadow_l1e_t sl1e;
2943 u32 gflags;
2944 p2m_type_t p2mt;
2945 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
2946 guest_l1e_t *snpl1p = NULL;
2947 #endif /* OOS */
2950 /* Prefetch no further than the end of the _shadow_ l1 MFN */
2951 dist = (PAGE_SIZE - ((unsigned long)ptr_sl1e & ~PAGE_MASK)) / sizeof sl1e;
2952 /* And no more than a maximum fetches-per-fault */
2953 if ( dist > PREFETCH_DISTANCE )
2954 dist = PREFETCH_DISTANCE;
2956 if ( mfn_valid(gw->l1mfn) )
2958 /* Normal guest page; grab the next guest entry */
2959 gl1p = sh_map_domain_page(gw->l1mfn);
2960 gl1p += guest_l1_table_offset(gw->va);
2962 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
2963 if ( mfn_is_out_of_sync(gw->l1mfn) )
2965 mfn_t snpmfn = oos_snapshot_lookup(v, gw->l1mfn);
2967 ASSERT(mfn_valid(snpmfn));
2968 snpl1p = sh_map_domain_page(snpmfn);
2969 snpl1p += guest_l1_table_offset(gw->va);
2971 #endif /* OOS */
2974 for ( i = 1; i < dist ; i++ )
2976 /* No point in prefetching if there's already a shadow */
2977 if ( ptr_sl1e[i].l1 != 0 )
2978 break;
2980 if ( mfn_valid(gw->l1mfn) )
2982 /* Normal guest page; grab the next guest entry */
2983 gl1e = gl1p[i];
2984 /* Not worth continuing if we hit an entry that will need another
2985 * fault for A/D-bit propagation anyway */
2986 gflags = guest_l1e_get_flags(gl1e);
2987 if ( (gflags & _PAGE_PRESENT)
2988 && (!(gflags & _PAGE_ACCESSED)
2989 || ((gflags & _PAGE_RW) && !(gflags & _PAGE_DIRTY))) )
2990 break;
2992 else
2994 /* Fragmented superpage, unless we've been called wrongly */
2995 ASSERT(guest_l2e_get_flags(gw->l2e) & _PAGE_PSE);
2996 /* Increment the l1e's GFN by the right number of guest pages */
2997 gl1e = guest_l1e_from_gfn(
2998 _gfn(gfn_x(guest_l1e_get_gfn(gw->l1e)) + i),
2999 guest_l1e_get_flags(gw->l1e));
3002 /* Look at the gfn that the l1e is pointing at */
3003 gfn = guest_l1e_get_gfn(gl1e);
3004 gmfn = gfn_to_mfn(v->domain, gfn, &p2mt);
3006 /* Propagate the entry. */
3007 l1e_propagate_from_guest(v, gl1e, gmfn, &sl1e, ft_prefetch, p2mt);
3008 (void) shadow_set_l1e(v, ptr_sl1e + i, sl1e, sl1mfn);
3010 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
3011 if ( snpl1p != NULL )
3012 snpl1p[i] = gl1e;
3013 #endif /* OOS */
3015 if ( gl1p != NULL )
3016 sh_unmap_domain_page(gl1p);
3017 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
3018 if ( snpl1p != NULL )
3019 sh_unmap_domain_page(snpl1p);
3020 #endif /* OOS */
3023 #endif /* SHADOW_OPTIMIZATIONS & SHOPT_PREFETCH */
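/* Concrete arithmetic for the prefetch distance above, assuming 4k pages
 * and 8-byte shadow l1es: a fault whose sl1e sits at offset 0xf80 in its
 * page gives dist = (0x1000 - 0xf80) / 8 = 16, so at most 15 further
 * entries are examined; a fault near the start of the page would give a
 * larger value and is clamped to PREFETCH_DISTANCE (32). */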
3025 #if GUEST_PAGING_LEVELS == 4
3026 typedef u64 guest_va_t;
3027 typedef u64 guest_pa_t;
3028 #elif GUEST_PAGING_LEVELS == 3
3029 typedef u32 guest_va_t;
3030 typedef u64 guest_pa_t;
3031 #else
3032 typedef u32 guest_va_t;
3033 typedef u32 guest_pa_t;
3034 #endif
3036 static inline void trace_shadow_gen(u32 event, guest_va_t va)
3038 if ( tb_init_done )
3040 event |= (GUEST_PAGING_LEVELS-2)<<8;
3041 __trace_var(event, 0/*!tsc*/, sizeof(va), (unsigned char*)&va);
3045 static inline void trace_shadow_fixup(guest_l1e_t gl1e,
3046 guest_va_t va)
3048 if ( tb_init_done )
3050 struct {
3051 /* for PAE, guest_l1e may be 64 while guest_va may be 32;
3052 so put it first for alignment sake. */
3053 guest_l1e_t gl1e;
3054 guest_va_t va;
3055 u32 flags;
3056 } __attribute__((packed)) d;
3057 u32 event;
3059 event = TRC_SHADOW_FIXUP | ((GUEST_PAGING_LEVELS-2)<<8);
3061 d.gl1e = gl1e;
3062 d.va = va;
3063 d.flags = this_cpu(trace_shadow_path_flags);
3065 __trace_var(event, 0/*!tsc*/, sizeof(d), (unsigned char*)&d);
3069 static inline void trace_not_shadow_fault(guest_l1e_t gl1e,
3070 guest_va_t va)
3072 if ( tb_init_done )
3074 struct {
3075 /* for PAE, guest_l1e may be 64 while guest_va may be 32;
3076 so put it first for alignment sake. */
3077 guest_l1e_t gl1e;
3078 guest_va_t va;
3079 u32 flags;
3080 } __attribute__((packed)) d;
3081 u32 event;
3083 event = TRC_SHADOW_NOT_SHADOW | ((GUEST_PAGING_LEVELS-2)<<8);
3085 d.gl1e = gl1e;
3086 d.va = va;
3087 d.flags = this_cpu(trace_shadow_path_flags);
3089 __trace_var(event, 0/*!tsc*/, sizeof(d), (unsigned char*)&d);
3093 static inline void trace_shadow_emulate_other(u32 event,
3094 guest_va_t va,
3095 gfn_t gfn)
3097 if ( tb_init_done )
3099 struct {
3100 /* for PAE, guest_l1e may be 64 while guest_va may be 32;
3101 so put it first for alignment sake. */
3102 #if GUEST_PAGING_LEVELS == 2
3103 u32 gfn;
3104 #else
3105 u64 gfn;
3106 #endif
3107 guest_va_t va;
3108 } __attribute__((packed)) d;
3110 event |= ((GUEST_PAGING_LEVELS-2)<<8);
3112 d.gfn=gfn_x(gfn);
3113 d.va = va;
3115 __trace_var(event, 0/*!tsc*/, sizeof(d), (unsigned char*)&d);
3119 #if GUEST_PAGING_LEVELS == 3
3120 static DEFINE_PER_CPU(guest_va_t,trace_emulate_initial_va);
3121 static DEFINE_PER_CPU(int,trace_extra_emulation_count);
3122 #endif
3123 static DEFINE_PER_CPU(guest_pa_t,trace_emulate_write_val);
3125 static inline void trace_shadow_emulate(guest_l1e_t gl1e, unsigned long va)
3127 if ( tb_init_done )
3129 struct {
3130 /* for PAE, guest_l1e may be 64 while guest_va may be 32;
3131 so put it first for alignment sake. */
3132 guest_l1e_t gl1e, write_val;
3133 guest_va_t va;
3134 unsigned flags:29, emulation_count:3;
3135 } __attribute__((packed)) d;
3136 u32 event;
3138 event = TRC_SHADOW_EMULATE | ((GUEST_PAGING_LEVELS-2)<<8);
3140 d.gl1e = gl1e;
3141 d.write_val.l1 = this_cpu(trace_emulate_write_val);
3142 d.va = va;
3143 #if GUEST_PAGING_LEVELS == 3
3144 d.emulation_count = this_cpu(trace_extra_emulation_count);
3145 #endif
3146 d.flags = this_cpu(trace_shadow_path_flags);
3148 __trace_var(event, 0/*!tsc*/, sizeof(d), (unsigned char*)&d);
3152 /**************************************************************************/
3153 /* Entry points into the shadow code */
3155 /* Called from the pagefault handler in Xen, and from the HVM trap handlers
3156 * for pagefaults. Returns 1 if this fault was an artefact of the
3157 * shadow code (and the guest should retry) or 0 if it is not (and the
3158 * fault should be handled elsewhere or passed to the guest). */
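/* In outline, the handler below (1) tries two fast paths -- repeating a
 * recently successful emulation, and decoding the "magic" shadow l1es left
 * by the fast-fault-path optimisation; (2) walks the guest tables and
 * bails out if the fault is genuinely the guest's problem; (3) takes the
 * shadow lock, builds any missing shadow tables and writes the new sl1e
 * (plus optional prefetch); and (4) decides whether to return to the
 * guest, emulate the faulting instruction (writes to pagetables, CR0.WP
 * quirks, read-only RAM), or hand the access to the device model as MMIO. */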
3160 static int sh_page_fault(struct vcpu *v,
3161 unsigned long va,
3162 struct cpu_user_regs *regs)
3164 struct domain *d = v->domain;
3165 walk_t gw;
3166 gfn_t gfn = _gfn(0);
3167 mfn_t gmfn, sl1mfn = _mfn(0);
3168 shadow_l1e_t sl1e, *ptr_sl1e;
3169 paddr_t gpa;
3170 struct sh_emulate_ctxt emul_ctxt;
3171 struct x86_emulate_ops *emul_ops;
3172 int r;
3173 fetch_type_t ft = 0;
3174 p2m_type_t p2mt;
3175 uint32_t rc;
3176 #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
3177 int fast_emul = 0;
3178 #endif
3180 SHADOW_PRINTK("d:v=%u:%u va=%#lx err=%u, rip=%lx\n",
3181 v->domain->domain_id, v->vcpu_id, va, regs->error_code,
3182 regs->eip);
3184 perfc_incr(shadow_fault);
3186 #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
3187 /* If the faulting frame was successfully emulated on the last shadow fault,
3188 * it is highly likely that the same emulation action applies to this frame,
3189 * so try to emulate early to avoid lock acquisition.
3190 */
3191 if ( v->arch.paging.last_write_emul_ok
3192 && v->arch.paging.shadow.last_emulated_frame == (va >> PAGE_SHIFT) )
3194 /* Check whether the error code is 3 (write fault on a present page);
3195 * otherwise fall back to the normal path in case some validation is required.
3196 */
3197 if ( regs->error_code == (PFEC_write_access | PFEC_page_present) )
3199 fast_emul = 1;
3200 gmfn = _mfn(v->arch.paging.shadow.last_emulated_mfn);
3202 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
3203 /* Fall back to the slow path if we're trying to emulate
3204 writes to an out of sync page. */
3205 if ( mfn_valid(gmfn) && mfn_is_out_of_sync(gmfn) )
3207 v->arch.paging.last_write_emul_ok = 0;
3208 goto page_fault_slow_path;
3210 #endif /* OOS */
3212 perfc_incr(shadow_fault_fast_emulate);
3213 goto early_emulation;
3215 else
3216 v->arch.paging.last_write_emul_ok = 0;
3218 #endif
3220 //
3221 // XXX: Need to think about eventually mapping superpages directly in the
3222 // shadow (when possible), as opposed to splintering them into a
3223 // bunch of 4K maps.
3224 //
3226 #if (SHADOW_OPTIMIZATIONS & SHOPT_FAST_FAULT_PATH)
3227 if ( (regs->error_code & PFEC_reserved_bit) )
3229 /* The only reasons for reserved bits to be set in shadow entries
3230 * are the two "magic" shadow_l1e entries. */
3231 if ( likely((__copy_from_user(&sl1e,
3232 (sh_linear_l1_table(v)
3233 + shadow_l1_linear_offset(va)),
3234 sizeof(sl1e)) == 0)
3235 && sh_l1e_is_magic(sl1e)) )
3237 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
3238 /* First, need to check that this isn't an out-of-sync
3239 * shadow l1e. If it is, we fall back to the slow path, which
3240 * will sync it up again. */
3242 shadow_l2e_t sl2e;
3243 mfn_t gl1mfn;
3244 if ( (__copy_from_user(&sl2e,
3245 (sh_linear_l2_table(v)
3246 + shadow_l2_linear_offset(va)),
3247 sizeof(sl2e)) != 0)
3248 || !(shadow_l2e_get_flags(sl2e) & _PAGE_PRESENT)
3249 || !mfn_valid(gl1mfn = _mfn(mfn_to_shadow_page(
3250 shadow_l2e_get_mfn(sl2e))->backpointer))
3251 || unlikely(mfn_is_out_of_sync(gl1mfn)) )
3253 /* Hit the slow path as if there had been no
3254 * shadow entry at all, and let it tidy up */
3255 ASSERT(regs->error_code & PFEC_page_present);
3256 regs->error_code ^= (PFEC_reserved_bit|PFEC_page_present);
3257 goto page_fault_slow_path;
3260 #endif /* SHOPT_OUT_OF_SYNC */
3262 if ( sh_l1e_is_gnp(sl1e) )
3264 /* Not-present in a guest PT: pass to the guest as
3265 * a not-present fault (by flipping two bits). */
3266 ASSERT(regs->error_code & PFEC_page_present);
3267 regs->error_code ^= (PFEC_reserved_bit|PFEC_page_present);
3268 reset_early_unshadow(v);
3269 perfc_incr(shadow_fault_fast_gnp);
3270 SHADOW_PRINTK("fast path not-present\n");
3271 trace_shadow_gen(TRC_SHADOW_FAST_PROPAGATE, va);
3272 return 0;
3274 else
3276 /* Magic MMIO marker: extract gfn for MMIO address */
3277 ASSERT(sh_l1e_is_mmio(sl1e));
3278 gpa = (((paddr_t)(gfn_x(sh_l1e_mmio_get_gfn(sl1e))))
3279 << PAGE_SHIFT)
3280 | (va & ~PAGE_MASK);
3282 perfc_incr(shadow_fault_fast_mmio);
3283 SHADOW_PRINTK("fast path mmio %#"PRIpaddr"\n", gpa);
3284 reset_early_unshadow(v);
3285 trace_shadow_gen(TRC_SHADOW_FAST_MMIO, va);
3286 return (handle_mmio_with_translation(va, gpa >> PAGE_SHIFT)
3287 ? EXCRET_fault_fixed : 0);
3289 else
3291 /* This should be exceptionally rare: another vcpu has fixed
3292 * the tables between the fault and our reading the l1e.
3293 * Retry and let the hardware give us the right fault next time. */
3294 perfc_incr(shadow_fault_fast_fail);
3295 SHADOW_PRINTK("fast path false alarm!\n");
3296 trace_shadow_gen(TRC_SHADOW_FALSE_FAST_PATH, va);
3297 return EXCRET_fault_fixed;
3301 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
3302 page_fault_slow_path:
3303 #endif
3304 #endif /* SHOPT_FAST_FAULT_PATH */
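/* To recap the fast path above: a fault with PFEC_reserved_bit set can
 * only have come from one of the two "magic" sl1e encodings written by
 * this code.  A guest-not-present entry is bounced back to the guest by
 * flipping the reserved-bit and present bits in the error code; an MMIO
 * entry carries the target gfn, so the gpa is rebuilt as
 * (gfn << PAGE_SHIFT) | (va & ~PAGE_MASK) and passed to
 * handle_mmio_with_translation().  An out-of-sync l1 (when OOS is enabled)
 * or an entry fixed up by another vcpu in the meantime falls through to
 * the slow path or a simple retry. */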
3306 /* Detect if this page fault happened while we were already in Xen
3307 * doing a shadow operation. If that happens, the only thing we can
3308 * do is let Xen's normal fault handlers try to fix it. In any case,
3309 * a diagnostic trace of the fault will be more useful than
3310 * a BUG() when we try to take the lock again. */
3311 if ( unlikely(shadow_locked_by_me(d)) )
3313 SHADOW_ERROR("Recursive shadow fault: lock was taken by %s\n",
3314 d->arch.paging.shadow.locker_function);
3315 return 0;
3318 rewalk:
3319 rc = guest_walk_tables(v, va, &gw, regs->error_code);
3321 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
3322 regs->error_code &= ~PFEC_page_present;
3323 if ( !(rc & _PAGE_PRESENT) )
3324 regs->error_code |= PFEC_page_present;
3325 #endif
3327 if ( rc != 0 )
3329 perfc_incr(shadow_fault_bail_real_fault);
3330 SHADOW_PRINTK("not a shadow fault\n");
3331 reset_early_unshadow(v);
3332 goto propagate;
3335 /* It's possible that the guest has put pagetables in memory that it has
3336 * already used for some special purpose (ioreq pages, or granted pages).
3337 * If that happens we'll have killed the guest already but it's still not
3338 * safe to propagate entries out of the guest PT so get out now. */
3339 if ( unlikely(d->is_shutting_down) )
3341 SHADOW_PRINTK("guest is shutting down\n");
3342 goto propagate;
3345 /* What kind of access are we dealing with? */
3346 ft = ((regs->error_code & PFEC_write_access)
3347 ? ft_demand_write : ft_demand_read);
3349 /* What mfn is the guest trying to access? */
3350 gfn = guest_l1e_get_gfn(gw.l1e);
3351 gmfn = gfn_to_mfn(d, gfn, &p2mt);
3353 if ( shadow_mode_refcounts(d) &&
3354 (!p2m_is_valid(p2mt) || (!p2m_is_mmio(p2mt) && !mfn_valid(gmfn))) )
3356 perfc_incr(shadow_fault_bail_bad_gfn);
3357 SHADOW_PRINTK("BAD gfn=%"SH_PRI_gfn" gmfn=%"PRI_mfn"\n",
3358 gfn_x(gfn), mfn_x(gmfn));
3359 reset_early_unshadow(v);
3360 goto propagate;
3363 #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
3364 /* Remember this successful VA->GFN translation for later. */
3365 vtlb_insert(v, va >> PAGE_SHIFT, gfn_x(gfn),
3366 regs->error_code | PFEC_page_present);
3367 #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */
3369 shadow_lock(d);
3371 TRACE_CLEAR_PATH_FLAGS;
3373 rc = gw_remove_write_accesses(v, va, &gw);
3375 /* First bit set: Removed write access to a page. */
3376 if ( rc & GW_RMWR_FLUSHTLB )
3378 /* Write permission removal is also a hint that other gwalks
3379 * overlapping with this one may be inconsistent
3380 */
3381 perfc_incr(shadow_rm_write_flush_tlb);
3382 atomic_inc(&d->arch.paging.shadow.gtable_dirty_version);
3383 flush_tlb_mask(d->domain_dirty_cpumask);
3386 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
3387 /* Second bit set: Resynced a page. Re-walk needed. */
3388 if ( rc & GW_RMWR_REWALK )
3390 shadow_unlock(d);
3391 goto rewalk;
3393 #endif /* OOS */
3395 if ( !shadow_check_gwalk(v, va, &gw) )
3397 perfc_incr(shadow_inconsistent_gwalk);
3398 shadow_unlock(d);
3399 goto rewalk;
3402 shadow_audit_tables(v);
3403 sh_audit_gw(v, &gw);
3405 /* Make sure there is enough free shadow memory to build a chain of
3406 * shadow tables. (We never allocate a top-level shadow on this path,
3407 * only a 32b l1, pae l1, or 64b l3+2+1. Note that while
3408 * SH_type_l1_shadow isn't correct in the latter case, all page
3409 * tables are the same size there.) */
3410 shadow_prealloc(d,
3411 SH_type_l1_shadow,
3412 GUEST_PAGING_LEVELS < 4 ? 1 : GUEST_PAGING_LEVELS - 1);
3414 /* Acquire the shadow. This must happen before we figure out the rights
3415 * for the shadow entry, since we might promote a page here. */
3416 ptr_sl1e = shadow_get_and_create_l1e(v, &gw, &sl1mfn, ft);
3417 if ( unlikely(ptr_sl1e == NULL) )
3419 /* Couldn't get the sl1e! Since we know the guest entries
3420 * are OK, this can only have been caused by a failed
3421 * shadow_set_l*e(), which will have crashed the guest.
3422 * Get out of the fault handler immediately. */
3423 ASSERT(d->is_shutting_down);
3424 shadow_unlock(d);
3425 trace_shadow_gen(TRC_SHADOW_DOMF_DYING, va);
3426 return 0;
3429 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
3430 /* Always unsync when writing to L1 page tables. */
3431 if ( sh_mfn_is_a_page_table(gmfn)
3432 && ft == ft_demand_write )
3433 sh_unsync(v, gmfn);
3435 if ( unlikely(d->is_shutting_down) )
3437 /* We might end up with a crashed domain here if
3438 * sh_remove_shadows() in a previous sh_resync() call has
3439 * failed. We cannot safely continue since some page is still
3440 * OOS but not in the hash table anymore. */
3441 shadow_unlock(d);
3442 return 0;
3444 #endif /* OOS */
3446 /* Calculate the shadow entry and write it */
3447 l1e_propagate_from_guest(v, gw.l1e, gmfn, &sl1e, ft, p2mt);
3448 r = shadow_set_l1e(v, ptr_sl1e, sl1e, sl1mfn);
3450 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
3451 if ( mfn_valid(gw.l1mfn)
3452 && mfn_is_out_of_sync(gw.l1mfn) )
3454 /* Update the OOS snapshot. */
3455 mfn_t snpmfn = oos_snapshot_lookup(v, gw.l1mfn);
3456 guest_l1e_t *snp;
3458 ASSERT(mfn_valid(snpmfn));
3460 snp = sh_map_domain_page(snpmfn);
3461 snp[guest_l1_table_offset(va)] = gw.l1e;
3462 sh_unmap_domain_page(snp);
3464 #endif /* OOS */
3466 #if SHADOW_OPTIMIZATIONS & SHOPT_PREFETCH
3467 /* Prefetch some more shadow entries */
3468 sh_prefetch(v, &gw, ptr_sl1e, sl1mfn);
3469 #endif
3471 /* Need to emulate accesses to page tables */
3472 if ( sh_mfn_is_a_page_table(gmfn)
3473 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
3474 /* Unless they've been allowed to go out of sync with their
3475 shadows and we don't need to unshadow it. */
3476 && !(mfn_is_out_of_sync(gmfn)
3477 && !(regs->error_code & PFEC_user_mode))
3478 #endif
3481 if ( ft == ft_demand_write )
3483 perfc_incr(shadow_fault_emulate_write);
3484 goto emulate;
3486 else if ( shadow_mode_trap_reads(d) && ft == ft_demand_read )
3488 perfc_incr(shadow_fault_emulate_read);
3489 goto emulate;
3493 /* Need to hand off device-model MMIO to the device model */
3494 if ( p2mt == p2m_mmio_dm )
3496 gpa = guest_walk_to_gpa(&gw);
3497 goto mmio;
3500 /* Log attempts to write to read-only memory */
3501 if ( (p2mt == p2m_ram_ro) && (ft == ft_demand_write) )
3503 static unsigned long lastpage = 0;
3504 if ( xchg(&lastpage, va & PAGE_MASK) != (va & PAGE_MASK) )
3505 gdprintk(XENLOG_DEBUG, "guest attempted write to read-only memory"
3506 " page. va page=%#lx, mfn=%#lx\n",
3507 va & PAGE_MASK, mfn_x(gmfn));
3508 goto emulate_readonly; /* skip over the instruction */
3511 /* In HVM guests, we force CR0.WP always to be set, so that the
3512 * pagetables are always write-protected. If the guest thinks
3513 * CR0.WP is clear, we must emulate faulting supervisor writes to
3514 * allow the guest to write through read-only PTEs. Emulate if the
3515 * fault was a non-user write to a present page. */
3516 if ( is_hvm_domain(d)
3517 && unlikely(!hvm_wp_enabled(v))
3518 && regs->error_code == (PFEC_write_access|PFEC_page_present) )
3520 perfc_incr(shadow_fault_emulate_wp);
3521 goto emulate;
3524 perfc_incr(shadow_fault_fixed);
3525 d->arch.paging.log_dirty.fault_count++;
3526 reset_early_unshadow(v);
3528 trace_shadow_fixup(gw.l1e, va);
3529 done:
3530 sh_audit_gw(v, &gw);
3531 SHADOW_PRINTK("fixed\n");
3532 shadow_audit_tables(v);
3533 shadow_unlock(d);
3534 return EXCRET_fault_fixed;
3536 emulate:
3537 if ( !shadow_mode_refcounts(d) || !guest_mode(regs) )
3538 goto not_a_shadow_fault;
3540 /*
3541 * We do not emulate user writes. Instead we use them as a hint that the
3542 * page is no longer a page table. This behaviour differs from native, but
3543 * it seems very unlikely that any OS grants user access to page tables.
3544 */
3545 if ( (regs->error_code & PFEC_user_mode) )
3547 SHADOW_PRINTK("user-mode fault to PT, unshadowing mfn %#lx\n",
3548 mfn_x(gmfn));
3549 perfc_incr(shadow_fault_emulate_failed);
3550 sh_remove_shadows(v, gmfn, 0 /* thorough */, 1 /* must succeed */);
3551 trace_shadow_emulate_other(TRC_SHADOW_EMULATE_UNSHADOW_USER,
3552 va, gfn);
3553 goto done;
3556 /*
3557 * Writes from userspace to read-only memory jump here to avoid being
3558 * caught by the user-mode page-table check above.
3559 */
3560 emulate_readonly:
3561 /*
3562 * We don't need to hold the lock for the whole emulation; we will
3563 * take it again when we write to the pagetables.
3564 */
3565 sh_audit_gw(v, &gw);
3566 shadow_audit_tables(v);
3567 shadow_unlock(d);
3569 this_cpu(trace_emulate_write_val) = 0;
3571 #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
3572 early_emulation:
3573 #endif
3574 if ( is_hvm_domain(d) )
3576 /*
3577 * If we are in the middle of injecting an exception or interrupt then
3578 * we should not emulate: it is not the instruction at %eip that caused
3579 * the fault. Furthermore it is almost certainly the case that the handler
3580 * stack is currently considered to be a page table, so we should
3581 * unshadow the faulting page before exiting.
3582 */
3583 if ( unlikely(hvm_event_pending(v)) )
3585 #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
3586 if ( fast_emul )
3588 perfc_incr(shadow_fault_fast_emulate_fail);
3589 v->arch.paging.last_write_emul_ok = 0;
3591 #endif
3592 gdprintk(XENLOG_DEBUG, "write to pagetable during event "
3593 "injection: cr2=%#lx, mfn=%#lx\n",
3594 va, mfn_x(gmfn));
3595 sh_remove_shadows(v, gmfn, 0 /* thorough */, 1 /* must succeed */);
3596 trace_shadow_emulate_other(TRC_SHADOW_EMULATE_UNSHADOW_EVTINJ,
3597 va, gfn);
3598 return EXCRET_fault_fixed;
3602 SHADOW_PRINTK("emulate: eip=%#lx esp=%#lx\n",
3603 (unsigned long)regs->eip, (unsigned long)regs->esp);
3605 emul_ops = shadow_init_emulation(&emul_ctxt, regs);
3607 r = x86_emulate(&emul_ctxt.ctxt, emul_ops);
3609 /*
3610 * NB. We do not unshadow on X86EMUL_EXCEPTION. It's not clear that it
3611 * would be a good unshadow hint. If we *do* decide to unshadow-on-fault
3612 * then it must be 'failable': we cannot require the unshadow to succeed.
3613 */
3614 if ( r == X86EMUL_UNHANDLEABLE )
3616 perfc_incr(shadow_fault_emulate_failed);
3617 #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
3618 if ( fast_emul )
3620 perfc_incr(shadow_fault_fast_emulate_fail);
3621 v->arch.paging.last_write_emul_ok = 0;
3623 #endif
3624 SHADOW_PRINTK("emulator failure, unshadowing mfn %#lx\n",
3625 mfn_x(gmfn));
3626 /* If this is actually a page table, then we have a bug, and need
3627 * to support more operations in the emulator. More likely,
3628 * though, this is a hint that this page should not be shadowed. */
3629 shadow_remove_all_shadows(v, gmfn);
3631 trace_shadow_emulate_other(TRC_SHADOW_EMULATE_UNSHADOW_UNHANDLED,
3632 va, gfn);
3633 goto emulate_done;
3636 #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
3637 /* Record the successful emulation as a heuristic, so that the next
3638 * fault on the same frame can take the fast-emulation path. Be
3639 * careful to check that the frame is still a page table first: if
3640 * the write emulation triggered an unshadow, the frame normally
3641 * needs a resync with the guest page table to recover r/w
3642 * permission, and recording it here would skip that propagation
3643 * and cause unexpected extra shadow faults.
3644 */
3645 if ( (r == X86EMUL_OKAY) && sh_mfn_is_a_page_table(gmfn) )
3647 if ( !fast_emul )
3649 v->arch.paging.shadow.last_emulated_frame = va >> PAGE_SHIFT;
3650 v->arch.paging.shadow.last_emulated_mfn = mfn_x(gmfn);
3651 v->arch.paging.last_write_emul_ok = 1;
3654 else if ( fast_emul )
3655 v->arch.paging.last_write_emul_ok = 0;
3656 #endif
3658 #if GUEST_PAGING_LEVELS == 3 /* PAE guest */
3659 if ( r == X86EMUL_OKAY ) {
3660 int i, emulation_count=0;
3661 this_cpu(trace_emulate_initial_va) = va;
3662 /* Emulate up to four extra instructions in the hope of catching
3663 * the "second half" of a 64-bit pagetable write. */
3664 for ( i = 0 ; i < 4 ; i++ )
3666 shadow_continue_emulation(&emul_ctxt, regs);
3667 v->arch.paging.last_write_was_pt = 0;
3668 r = x86_emulate(&emul_ctxt.ctxt, emul_ops);
3669 if ( r == X86EMUL_OKAY )
3671 emulation_count++;
3672 if ( v->arch.paging.last_write_was_pt )
3674 perfc_incr(shadow_em_ex_pt);
3675 TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_EMULATION_2ND_PT_WRITTEN);
3676 break; /* Don't emulate past the other half of the write */
3678 else
3679 perfc_incr(shadow_em_ex_non_pt);
3681 else
3683 perfc_incr(shadow_em_ex_fail);
3684 TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_EMULATION_LAST_FAILED);
3685 break; /* Don't emulate again if we failed! */
3688 this_cpu(trace_extra_emulation_count)=emulation_count;
3690 #endif /* PAE guest */
3692 trace_shadow_emulate(gw.l1e, va);
3693 emulate_done:
3694 SHADOW_PRINTK("emulated\n");
3695 return EXCRET_fault_fixed;
3697 mmio:
3698 if ( !guest_mode(regs) )
3699 goto not_a_shadow_fault;
3700 perfc_incr(shadow_fault_mmio);
3701 sh_audit_gw(v, &gw);
3702 SHADOW_PRINTK("mmio %#"PRIpaddr"\n", gpa);
3703 shadow_audit_tables(v);
3704 reset_early_unshadow(v);
3705 shadow_unlock(d);
3706 trace_shadow_gen(TRC_SHADOW_MMIO, va);
3707 return (handle_mmio_with_translation(va, gpa >> PAGE_SHIFT)
3708 ? EXCRET_fault_fixed : 0);
3710 not_a_shadow_fault:
3711 sh_audit_gw(v, &gw);
3712 SHADOW_PRINTK("not a shadow fault\n");
3713 shadow_audit_tables(v);
3714 reset_early_unshadow(v);
3715 shadow_unlock(d);
3717 propagate:
3718 trace_not_shadow_fault(gw.l1e, va);
3720 return 0;
3724 static int
3725 sh_invlpg(struct vcpu *v, unsigned long va)
3726 /* Called when the guest requests an invlpg. Returns 1 if the invlpg
3727 * instruction should be issued on the hardware, or 0 if it's safe not
3728 * to do so. */
3730 mfn_t sl1mfn;
3731 shadow_l2e_t sl2e;
3733 perfc_incr(shadow_invlpg);
3735 #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
3736 /* No longer safe to use cached gva->gfn translations */
3737 vtlb_flush(v);
3738 #endif
3740 #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
3741 v->arch.paging.last_write_emul_ok = 0;
3742 #endif
3744 /* First check that we can safely read the shadow l2e. On SMP/PAE linux,
3745 * as many as 6% of invlpg calls can arrive before we have shadowed the
3746 * l2. */
3747 #if SHADOW_PAGING_LEVELS == 4
3749 shadow_l3e_t sl3e;
3750 if ( !(shadow_l4e_get_flags(
3751 sh_linear_l4_table(v)[shadow_l4_linear_offset(va)])
3752 & _PAGE_PRESENT) )
3753 return 0;
3754 /* This must still be a copy-from-user because we don't have the
3755 * shadow lock, and the higher-level shadows might disappear
3756 * under our feet. */
3757 if ( __copy_from_user(&sl3e, (sh_linear_l3_table(v)
3758 + shadow_l3_linear_offset(va)),
3759 sizeof (sl3e)) != 0 )
3761 perfc_incr(shadow_invlpg_fault);
3762 return 0;
3764 if ( !(shadow_l3e_get_flags(sl3e) & _PAGE_PRESENT) )
3765 return 0;
3767 #else /* SHADOW_PAGING_LEVELS == 3 */
3768 if ( !(l3e_get_flags(v->arch.paging.shadow.l3table[shadow_l3_linear_offset(va)])
3769 & _PAGE_PRESENT) )
3770 // no need to flush anything if there's no SL2...
3771 return 0;
3772 #endif
3774 /* This must still be a copy-from-user because we don't have the shadow
3775 * lock, and the higher-level shadows might disappear under our feet. */
3776 if ( __copy_from_user(&sl2e,
3777 sh_linear_l2_table(v) + shadow_l2_linear_offset(va),
3778 sizeof (sl2e)) != 0 )
3780 perfc_incr(shadow_invlpg_fault);
3781 return 0;
3784 // If there's nothing shadowed for this particular sl2e, then
3785 // there is no need to do an invlpg, either...
3786 //
3787 if ( !(shadow_l2e_get_flags(sl2e) & _PAGE_PRESENT) )
3788 return 0;
3790 // Check to see if the SL2 is a splintered superpage...
3791 // If so, then we'll need to flush the entire TLB (because that's
3792 // easier than invalidating all of the individual 4K pages).
3793 //
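// (An fl1 shadow is the l1 we fabricate to splinter a guest superpage
// mapping into 4k entries; a guest invlpg of a superpage invalidates the
// whole large mapping, which here corresponds to up to a full l1's worth
// of shadow entries, hence the blanket flush.)
//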
3794 sl1mfn = shadow_l2e_get_mfn(sl2e);
3795 if ( mfn_to_shadow_page(sl1mfn)->type
3796 == SH_type_fl1_shadow )
3798 flush_tlb_local();
3799 return 0;
3802 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
3803 /* Check to see if the SL1 is out of sync. */
3805 mfn_t gl1mfn = _mfn(mfn_to_shadow_page(sl1mfn)->backpointer);
3806 struct page_info *pg = mfn_to_page(gl1mfn);
3807 if ( mfn_valid(gl1mfn)
3808 && page_is_out_of_sync(pg) )
3810 /* The test above may give false positives, since we don't
3811 * hold the shadow lock yet. Check again with the lock held. */
3812 shadow_lock(v->domain);
3814 /* This must still be a copy-from-user because we didn't
3815 * have the shadow lock last time we checked, and the
3816 * higher-level shadows might have disappeared under our
3817 * feet. */
3818 if ( __copy_from_user(&sl2e,
3819 sh_linear_l2_table(v)
3820 + shadow_l2_linear_offset(va),
3821 sizeof (sl2e)) != 0 )
3823 perfc_incr(shadow_invlpg_fault);
3824 shadow_unlock(v->domain);
3825 return 0;
3828 if ( !(shadow_l2e_get_flags(sl2e) & _PAGE_PRESENT) )
3830 shadow_unlock(v->domain);
3831 return 0;
3834 sl1mfn = shadow_l2e_get_mfn(sl2e);
3835 gl1mfn = _mfn(mfn_to_shadow_page(sl1mfn)->backpointer);
3836 pg = mfn_to_page(gl1mfn);
3838 if ( likely(sh_mfn_is_a_page_table(gl1mfn)
3839 && page_is_out_of_sync(pg) ) )
3841 shadow_l1e_t *sl1;
3842 sl1 = sh_linear_l1_table(v) + shadow_l1_linear_offset(va);
3843 /* Remove the shadow entry that maps this VA */
3844 (void) shadow_set_l1e(v, sl1, shadow_l1e_empty(), sl1mfn);
3846 shadow_unlock(v->domain);
3847 /* Need the invlpg, to pick up the disappearance of the sl1e */
3848 return 1;
3851 #endif
3853 return 1;
3857 static unsigned long
3858 sh_gva_to_gfn(struct vcpu *v, unsigned long va, uint32_t *pfec)
3859 /* Called to translate a guest virtual address to what the *guest*
3860 * pagetables would map it to. */
3862 walk_t gw;
3863 gfn_t gfn;
3865 #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
3866 /* Check the vTLB cache first */
3867 unsigned long vtlb_gfn = vtlb_lookup(v, va, pfec[0]);
3868 if ( VALID_GFN(vtlb_gfn) )
3869 return vtlb_gfn;
3870 #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */
3872 if ( guest_walk_tables(v, va, &gw, pfec[0]) != 0 )
3874 if ( !(guest_l1e_get_flags(gw.l1e) & _PAGE_PRESENT) )
3875 pfec[0] &= ~PFEC_page_present;
3876 return INVALID_GFN;
3878 gfn = guest_walk_to_gfn(&gw);
3880 #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
3881 /* Remember this successful VA->GFN translation for later. */
3882 vtlb_insert(v, va >> PAGE_SHIFT, gfn_x(gfn), pfec[0]);
3883 #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */
3885 return gfn_x(gfn);
3889 static inline void
3890 sh_update_linear_entries(struct vcpu *v)
3891 /* Sync up all the linear mappings for this vcpu's pagetables */
3893 struct domain *d = v->domain;
3895 /* Linear pagetables in PV guests
3896 * ------------------------------
3898 * Guest linear pagetables, which map the guest pages, are at
3899 * LINEAR_PT_VIRT_START. Shadow linear pagetables, which map the
3900 * shadows, are at SH_LINEAR_PT_VIRT_START. Most of the time these
3901 * are set up at shadow creation time, but (of course!) the PAE case
3902 * is subtler. Normal linear mappings are made by having an entry
3903 * in the top-level table that points to itself (shadow linear) or
3904 * to the guest top-level table (guest linear). For PAE, to set up
3905 * a linear map requires us to copy the four top-level entries into
3906 * level-2 entries. That means that every time we change a PAE l3e,
3907 * we need to reflect the change into the copy.
3909 * Linear pagetables in HVM guests
3910 * -------------------------------
3912 * For HVM guests, the linear pagetables are installed in the monitor
3913 * tables (since we can't put them in the shadow). Shadow linear
3914 * pagetables, which map the shadows, are at SH_LINEAR_PT_VIRT_START,
3915 * and we use the linear pagetable slot at LINEAR_PT_VIRT_START for
3916 * a linear pagetable of the monitor tables themselves. We have
3917 * the same issue of having to re-copy PAE l3 entries whenever we use
3918 * PAE shadows.
3920 * Because HVM guests run on the same monitor tables regardless of the
3921 * shadow tables in use, the linear mapping of the shadow tables has to
3922 * be updated every time v->arch.shadow_table changes.
3923 */
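/* (A sketch of the simplest case, 4-on-4: the shadow linear map is just
 * one l4 slot pointing at the top-level shadow, roughly
 *     l4e[l4_table_offset(SH_LINEAR_PT_VIRT_START)] =
 *         l4e_from_pfn(pagetable_get_pfn(v->arch.shadow_table[0]),
 *                      __PAGE_HYPERVISOR);
 * so a walk starting at SH_LINEAR_PT_VIRT_START re-enters the shadow
 * tables themselves.  The PAE cases below have to fake this up by
 * copying l3 entries into l2 slots by hand.) */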
3925 /* Don't try to update the monitor table if it doesn't exist */
3926 if ( shadow_mode_external(d)
3927 && pagetable_get_pfn(v->arch.monitor_table) == 0 )
3928 return;
3930 #if (CONFIG_PAGING_LEVELS == 4) && (SHADOW_PAGING_LEVELS == 4)
3932 /* For PV, one l4e points at the guest l4, one points at the shadow
3933 * l4. No maintenance required.
3934 * For HVM, just need to update the l4e that points to the shadow l4. */
3936 if ( shadow_mode_external(d) )
3938 /* Use the linear map if we can; otherwise make a new mapping */
3939 if ( v == current )
3941 __linear_l4_table[l4_linear_offset(SH_LINEAR_PT_VIRT_START)] =
3942 l4e_from_pfn(pagetable_get_pfn(v->arch.shadow_table[0]),
3943 __PAGE_HYPERVISOR);
3945 else
3947 l4_pgentry_t *ml4e;
3948 ml4e = sh_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
3949 ml4e[l4_table_offset(SH_LINEAR_PT_VIRT_START)] =
3950 l4e_from_pfn(pagetable_get_pfn(v->arch.shadow_table[0]),
3951 __PAGE_HYPERVISOR);
3952 sh_unmap_domain_page(ml4e);
3956 #elif (CONFIG_PAGING_LEVELS == 4) && (SHADOW_PAGING_LEVELS == 3)
3958 /* PV: XXX
3960 * HVM: To give ourselves a linear map of the shadows, we need to
3961 * extend a PAE shadow to 4 levels. We do this by having a monitor
3962 * l3 in slot 0 of the monitor l4 table, and copying the PAE l3
3963 * entries into it. Then, by having the monitor l4e for shadow
3964 * pagetables also point to the monitor l4, we can use it to access
3965 * the shadows.
3966 */
3968 if ( shadow_mode_external(d) )
3970 /* Install copies of the shadow l3es into the monitor l2 table
3971 * that maps SH_LINEAR_PT_VIRT_START. */
3972 shadow_l3e_t *sl3e;
3973 l2_pgentry_t *ml2e;
3974 int i;
3976 /* Use linear mappings if we can; otherwise make new mappings */
3977 if ( v == current )
3978 ml2e = __linear_l2_table
3979 + l2_linear_offset(SH_LINEAR_PT_VIRT_START);
3980 else
3982 mfn_t l3mfn, l2mfn;
3983 l4_pgentry_t *ml4e;
3984 l3_pgentry_t *ml3e;
3985 int linear_slot = shadow_l4_table_offset(SH_LINEAR_PT_VIRT_START);
3986 ml4e = sh_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
3988 ASSERT(l4e_get_flags(ml4e[linear_slot]) & _PAGE_PRESENT);
3989 l3mfn = _mfn(l4e_get_pfn(ml4e[linear_slot]));
3990 ml3e = sh_map_domain_page(l3mfn);
3991 sh_unmap_domain_page(ml4e);
3993 ASSERT(l3e_get_flags(ml3e[0]) & _PAGE_PRESENT);
3994 l2mfn = _mfn(l3e_get_pfn(ml3e[0]));
3995 ml2e = sh_map_domain_page(l2mfn);
3996 sh_unmap_domain_page(ml3e);
3999 /* Shadow l3 tables are built by sh_update_cr3 */
4000 sl3e = v->arch.paging.shadow.l3table;
4002 for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
4004 ml2e[i] =
4005 (shadow_l3e_get_flags(sl3e[i]) & _PAGE_PRESENT)
4006 ? l2e_from_pfn(mfn_x(shadow_l3e_get_mfn(sl3e[i])),
4007 __PAGE_HYPERVISOR)
4008 : l2e_empty();
4011 if ( v != current )
4012 sh_unmap_domain_page(ml2e);
4014 else
4015 domain_crash(d); /* XXX */
4017 #elif CONFIG_PAGING_LEVELS == 3
4019 /* PV: need to copy the guest's l3 entries into the guest-linear-map l2
4020 * entries in the shadow, and the shadow's l3 entries into the
4021 * shadow-linear-map l2 entries in the shadow. This is safe to do
4022 * because Xen does not let guests share high-slot l2 tables between l3s,
4023 * so we know we're not treading on anyone's toes.
4025 * HVM: need to copy the shadow's l3 entries into the
4026 * shadow-linear-map l2 entries in the monitor table. This is safe
4027 * because we have one monitor table for each vcpu. The monitor's
4028 * own l3es don't need to be copied because they never change.
4029 * XXX That might change if we start stuffing things into the rest
4030 * of the monitor's virtual address space.
4031 */
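/* (Note: when v == current the l2es written below may be part of the
 * tables the processor is walking right now, which is presumably why the
 * updates go through safe_write_entry() rather than plain assignment.) */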
4033 l2_pgentry_t *l2e, new_l2e;
4034 shadow_l3e_t *guest_l3e = NULL, *shadow_l3e;
4035 int i;
4036 int unmap_l2e = 0;
4038 #if GUEST_PAGING_LEVELS == 2
4040 /* Shadow l3 tables were built by sh_update_cr3 */
4041 BUG_ON(!shadow_mode_external(d)); /* PV 2-on-3 is unsupported */
4042 shadow_l3e = (shadow_l3e_t *)&v->arch.paging.shadow.l3table;
4044 #else /* GUEST_PAGING_LEVELS == 3 */
4046 shadow_l3e = (shadow_l3e_t *)&v->arch.paging.shadow.l3table;
4047 guest_l3e = (guest_l3e_t *)&v->arch.paging.shadow.gl3e;
4049 #endif /* GUEST_PAGING_LEVELS */
4051 /* Choose where to write the entries, using linear maps if possible */
4052 if ( shadow_mode_external(d) )
4054 if ( v == current )
4056 /* From the monitor tables, it's safe to use linear maps
4057 * to update monitor l2s */
4058 l2e = __linear_l2_table + (3 * L2_PAGETABLE_ENTRIES);
4060 else
4062 /* Map the monitor table's high l2 */
4063 l3_pgentry_t *l3e;
4064 l3e = sh_map_domain_page(
4065 pagetable_get_mfn(v->arch.monitor_table));
4066 ASSERT(l3e_get_flags(l3e[3]) & _PAGE_PRESENT);
4067 l2e = sh_map_domain_page(_mfn(l3e_get_pfn(l3e[3])));
4068 unmap_l2e = 1;
4069 sh_unmap_domain_page(l3e);
4072 else
4074 /* Map the shadow table's high l2 */
4075 ASSERT(shadow_l3e_get_flags(shadow_l3e[3]) & _PAGE_PRESENT);
4076 l2e = sh_map_domain_page(shadow_l3e_get_mfn(shadow_l3e[3]));
4077 unmap_l2e = 1;
4080 /* Write linear mapping of guest (only in PV, and only when
4081 * not translated). */
4082 if ( !shadow_mode_translate(d) )
4084 for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
4086 new_l2e =
4087 ((shadow_l3e_get_flags(guest_l3e[i]) & _PAGE_PRESENT)
4088 ? l2e_from_pfn(mfn_x(shadow_l3e_get_mfn(guest_l3e[i])),
4089 __PAGE_HYPERVISOR)
4090 : l2e_empty());
4091 safe_write_entry(
4092 &l2e[l2_table_offset(LINEAR_PT_VIRT_START) + i],
4093 &new_l2e);
4097 /* Write linear mapping of shadow. */
4098 for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
4100 new_l2e = (shadow_l3e_get_flags(shadow_l3e[i]) & _PAGE_PRESENT)
4101 ? l2e_from_pfn(mfn_x(shadow_l3e_get_mfn(shadow_l3e[i])),
4102 __PAGE_HYPERVISOR)
4103 : l2e_empty();
4104 safe_write_entry(
4105 &l2e[l2_table_offset(SH_LINEAR_PT_VIRT_START) + i],
4106 &new_l2e);
4109 if ( unmap_l2e )
4110 sh_unmap_domain_page(l2e);
4113 #else
4114 #error this should not happen
4115 #endif
4117 if ( shadow_mode_external(d) )
4119 /*
4120 * Having modified the linear pagetable mapping, flush local host TLBs.
4121 * This was not needed when vmenter/vmexit always had the side effect
4122 * of flushing host TLBs but, with ASIDs, it is possible to finish
4123 * this CR3 update, vmenter the guest, vmexit due to a page fault,
4124 * without an intervening host TLB flush. Then the page fault code
4125 * could use the linear pagetable to read a top-level shadow page
4126 * table entry. But, without this change, it would fetch the wrong
4127 * value due to a stale TLB.
4128 */
4129 flush_tlb_local();
4134 /* Removes vcpu->arch.paging.shadow.guest_vtable and vcpu->arch.shadow_table[].
4135 * Does all appropriate management/bookkeeping/refcounting/etc...
4136 */
4137 static void
4138 sh_detach_old_tables(struct vcpu *v)
4140 mfn_t smfn;
4141 int i = 0;
4143 ////
4144 //// vcpu->arch.paging.shadow.guest_vtable
4145 ////
4147 #if GUEST_PAGING_LEVELS == 3
4148 /* PAE guests don't have a mapping of the guest top-level table */
4149 ASSERT(v->arch.paging.shadow.guest_vtable == NULL);
4150 #else
4151 if ( v->arch.paging.shadow.guest_vtable )
4153 struct domain *d = v->domain;
4154 if ( shadow_mode_external(d) || shadow_mode_translate(d) )
4155 sh_unmap_domain_page_global(v->arch.paging.shadow.guest_vtable);
4156 v->arch.paging.shadow.guest_vtable = NULL;
4158 #endif // GUEST_PAGING_LEVELS == 3
4161 ////
4162 //// vcpu->arch.shadow_table[]
4163 ////
4165 #if GUEST_PAGING_LEVELS == 3
4166 /* PAE guests have four shadow_table entries */
4167 for ( i = 0 ; i < 4 ; i++ )
4168 #endif
4170 smfn = pagetable_get_mfn(v->arch.shadow_table[i]);
4171 if ( mfn_x(smfn) )
4172 sh_put_ref(v, smfn, 0);
4173 v->arch.shadow_table[i] = pagetable_null();
4177 /* Set up the top-level shadow and install it in slot 'slot' of shadow_table */
4178 static void
4179 sh_set_toplevel_shadow(struct vcpu *v,
4180 int slot,
4181 mfn_t gmfn,
4182 unsigned int root_type)
4184 mfn_t smfn;
4185 pagetable_t old_entry, new_entry;
4187 struct domain *d = v->domain;
4189 /* Remember the old contents of this slot */
4190 old_entry = v->arch.shadow_table[slot];
4192 /* Now figure out the new contents: is this a valid guest MFN? */
4193 if ( !mfn_valid(gmfn) )
4195 new_entry = pagetable_null();
4196 goto install_new_entry;
4199 /* Guest mfn is valid: shadow it and install the shadow */
4200 smfn = get_shadow_status(v, gmfn, root_type);
4201 if ( !mfn_valid(smfn) )
4203 /* Make sure there's enough free shadow memory. */
4204 shadow_prealloc(d, root_type, 1);
4205 /* Shadow the page. */
4206 smfn = sh_make_shadow(v, gmfn, root_type);
4208 ASSERT(mfn_valid(smfn));
4210 /* Pin the shadow and put it (back) on the list of pinned shadows */
4211 if ( sh_pin(v, smfn) == 0 )
4213 SHADOW_ERROR("can't pin %#lx as toplevel shadow\n", mfn_x(smfn));
4214 domain_crash(v->domain);
4217 /* Take a ref to this page: it will be released in sh_detach_old_tables()
4218 * or the next call to set_toplevel_shadow() */
4219 if ( !sh_get_ref(v, smfn, 0) )
4221 SHADOW_ERROR("can't install %#lx as toplevel shadow\n", mfn_x(smfn));
4222 domain_crash(v->domain);
4225 new_entry = pagetable_from_mfn(smfn);
4227 install_new_entry:
4228 /* Done. Install it */
4229 SHADOW_PRINTK("%u/%u [%u] gmfn %#"PRI_mfn" smfn %#"PRI_mfn"\n",
4230 GUEST_PAGING_LEVELS, SHADOW_PAGING_LEVELS, slot,
4231 mfn_x(gmfn), mfn_x(pagetable_get_mfn(new_entry)));
4232 v->arch.shadow_table[slot] = new_entry;
4234 /* Decrement the refcount of the old contents of this slot */
4235 if ( !pagetable_is_null(old_entry) ) {
4236 mfn_t old_smfn = pagetable_get_mfn(old_entry);
4237 /* Need to repin the old toplevel shadow if it's been unpinned
4238 * by shadow_prealloc(): in PV mode we're still running on this
4239 * shadow and it's not safe to free it yet. */
4240 if ( !mfn_to_shadow_page(old_smfn)->pinned && !sh_pin(v, old_smfn) )
4242 SHADOW_ERROR("can't re-pin %#lx\n", mfn_x(old_smfn));
4243 domain_crash(v->domain);
4245 sh_put_ref(v, old_smfn, 0);
4250 static void
4251 sh_update_cr3(struct vcpu *v, int do_locking)
4252 /* Updates vcpu->arch.cr3 after the guest has changed CR3.
4253 * Paravirtual guests should set v->arch.guest_table (and guest_table_user,
4254 * if appropriate).
4255 * HVM guests should also make sure hvm_get_guest_cntl_reg(v, 3) works;
4256 * this function will call hvm_update_guest_cr(v, 3) to tell them where the
4257 * shadow tables are.
4258 * If do_locking != 0, assume we are being called from outside the
4259 * shadow code, and must take and release the shadow lock; otherwise
4260 * that is the caller's responsibility.
4261 */
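/* Roughly, the steps below are: resync any out-of-sync pages for this
 * vcpu, refresh our cached view of the guest's top-level table
 * (guest_vtable, or the four cached PAE l3es), revoke write access to
 * the new top level(s) and install fresh top-level shadows in
 * v->arch.shadow_table[], rebuild the PAE l3table if we shadow with 3
 * levels, point v->arch.cr3 (and hw_cr[3] for HVM) at the result, and
 * finally fix up the linear mappings and per-vcpu caches. */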
4263 struct domain *d = v->domain;
4264 mfn_t gmfn;
4265 #if GUEST_PAGING_LEVELS == 3
4266 guest_l3e_t *gl3e;
4267 u32 guest_idx=0;
4268 int i;
4269 #endif
4271 /* Don't do anything on an uninitialised vcpu */
4272 if ( !is_hvm_domain(d) && !v->is_initialised )
4274 ASSERT(v->arch.cr3 == 0);
4275 return;
4278 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
4279 /* Need to resync all the shadow entries on a TLB flush. Resync
4280 * current vcpus OOS pages before switching to the new shadow
4281 * tables so that the VA hint is still valid. */
4282 shadow_resync_current_vcpu(v, do_locking);
4283 #endif
4285 if ( do_locking ) shadow_lock(v->domain);
4287 ASSERT(shadow_locked_by_me(v->domain));
4288 ASSERT(v->arch.paging.mode);
4290 ////
4291 //// vcpu->arch.guest_table is already set
4292 ////
4294 #ifndef NDEBUG
4295 /* Double-check that the HVM code has sent us a sane guest_table */
4296 if ( is_hvm_domain(d) )
4298 ASSERT(shadow_mode_external(d));
4299 if ( hvm_paging_enabled(v) )
4300 ASSERT(pagetable_get_pfn(v->arch.guest_table));
4301 else
4302 ASSERT(v->arch.guest_table.pfn
4303 == d->arch.paging.shadow.unpaged_pagetable.pfn);
4305 #endif
4307 SHADOW_PRINTK("d=%u v=%u guest_table=%05lx\n",
4308 d->domain_id, v->vcpu_id,
4309 (unsigned long)pagetable_get_pfn(v->arch.guest_table));
4311 #if GUEST_PAGING_LEVELS == 4
4312 if ( !(v->arch.flags & TF_kernel_mode) && !is_pv_32on64_vcpu(v) )
4313 gmfn = pagetable_get_mfn(v->arch.guest_table_user);
4314 else
4315 #endif
4316 gmfn = pagetable_get_mfn(v->arch.guest_table);
4319 ////
4320 //// vcpu->arch.paging.shadow.guest_vtable
4321 ////
4322 #if GUEST_PAGING_LEVELS == 4
4323 if ( shadow_mode_external(d) || shadow_mode_translate(d) )
4325 if ( v->arch.paging.shadow.guest_vtable )
4326 sh_unmap_domain_page_global(v->arch.paging.shadow.guest_vtable);
4327 v->arch.paging.shadow.guest_vtable = sh_map_domain_page_global(gmfn);
4328 /* PAGING_LEVELS==4 implies 64-bit, which means that
4329 * map_domain_page_global can't fail */
4330 BUG_ON(v->arch.paging.shadow.guest_vtable == NULL);
4332 else
4333 v->arch.paging.shadow.guest_vtable = __linear_l4_table;
4334 #elif GUEST_PAGING_LEVELS == 3
4335 /* On PAE guests we don't use a mapping of the guest's own top-level
4336 * table. We cache the current state of that table and shadow that,
4337 * until the next CR3 write makes us refresh our cache. */
4338 ASSERT(v->arch.paging.shadow.guest_vtable == NULL);
4340 if ( shadow_mode_external(d) )
4341 /* Find where in the page the l3 table is */
4342 guest_idx = guest_index((void *)v->arch.hvm_vcpu.guest_cr[3]);
4343 else
4344 /* PV guest: l3 is at the start of a page */
4345 guest_idx = 0;
4347 // Ignore the low 2 bits of guest_idx -- they are really just
4348 // cache control.
4349 guest_idx &= ~3;
4351 gl3e = ((guest_l3e_t *)sh_map_domain_page(gmfn)) + guest_idx;
4352 for ( i = 0; i < 4 ; i++ )
4353 v->arch.paging.shadow.gl3e[i] = gl3e[i];
4354 sh_unmap_domain_page(gl3e);
4355 #elif GUEST_PAGING_LEVELS == 2
4356 if ( shadow_mode_external(d) || shadow_mode_translate(d) )
4358 if ( v->arch.paging.shadow.guest_vtable )
4359 sh_unmap_domain_page_global(v->arch.paging.shadow.guest_vtable);
4360 v->arch.paging.shadow.guest_vtable = sh_map_domain_page_global(gmfn);
4361 /* Does this really need map_domain_page_global? Handle the
4362 * error properly if so. */
4363 BUG_ON(v->arch.paging.shadow.guest_vtable == NULL); /* XXX */
4365 else
4366 v->arch.paging.shadow.guest_vtable = __linear_l2_table;
4367 #else
4368 #error this should never happen
4369 #endif
4372 ////
4373 //// vcpu->arch.shadow_table[]
4374 ////
4376 /* We revoke write access to the new guest toplevel page(s) before we
4377 * replace the old shadow pagetable(s), so that we can safely use the
4378 * (old) shadow linear maps in the writeable mapping heuristics. */
4379 #if GUEST_PAGING_LEVELS == 2
4380 if ( sh_remove_write_access(v, gmfn, 2, 0) != 0 )
4381 flush_tlb_mask(v->domain->domain_dirty_cpumask);
4382 sh_set_toplevel_shadow(v, 0, gmfn, SH_type_l2_shadow);
4383 #elif GUEST_PAGING_LEVELS == 3
4384 /* PAE guests have four shadow_table entries, based on the
4385 * current values of the guest's four l3es. */
4387 int flush = 0;
4388 gfn_t gl2gfn;
4389 mfn_t gl2mfn;
4390 p2m_type_t p2mt;
4391 guest_l3e_t *gl3e = (guest_l3e_t*)&v->arch.paging.shadow.gl3e;
4392 /* First, make all four entries read-only. */
4393 for ( i = 0; i < 4; i++ )
4395 if ( guest_l3e_get_flags(gl3e[i]) & _PAGE_PRESENT )
4397 gl2gfn = guest_l3e_get_gfn(gl3e[i]);
4398 gl2mfn = gfn_to_mfn(d, gl2gfn, &p2mt);
4399 if ( p2m_is_ram(p2mt) )
4400 flush |= sh_remove_write_access(v, gl2mfn, 2, 0);
4403 if ( flush )
4404 flush_tlb_mask(v->domain->domain_dirty_cpumask);
4405 /* Now install the new shadows. */
4406 for ( i = 0; i < 4; i++ )
4408 if ( guest_l3e_get_flags(gl3e[i]) & _PAGE_PRESENT )
4410 gl2gfn = guest_l3e_get_gfn(gl3e[i]);
4411 gl2mfn = gfn_to_mfn(d, gl2gfn, &p2mt);
4412 if ( p2m_is_ram(p2mt) )
4413 sh_set_toplevel_shadow(v, i, gl2mfn, (i == 3)
4414 ? SH_type_l2h_shadow
4415 : SH_type_l2_shadow);
4416 else
4417 sh_set_toplevel_shadow(v, i, _mfn(INVALID_MFN), 0);
4419 else
4420 sh_set_toplevel_shadow(v, i, _mfn(INVALID_MFN), 0);
4423 #elif GUEST_PAGING_LEVELS == 4
4424 if ( sh_remove_write_access(v, gmfn, 4, 0) != 0 )
4425 flush_tlb_mask(v->domain->domain_dirty_cpumask);
4426 sh_set_toplevel_shadow(v, 0, gmfn, SH_type_l4_shadow);
4427 #else
4428 #error This should never happen
4429 #endif
4432 ///
4433 /// v->arch.paging.shadow.l3table
4434 ///
4435 #if SHADOW_PAGING_LEVELS == 3
4437 mfn_t smfn;
4438 int i;
4439 for ( i = 0; i < 4; i++ )
4441 #if GUEST_PAGING_LEVELS == 2
4442 /* 2-on-3: make a PAE l3 that points at the four-page l2 */
4443 smfn = _mfn(pagetable_get_pfn(v->arch.shadow_table[0]) + i);
4444 #else
4445 /* 3-on-3: make a PAE l3 that points at the four l2 pages */
4446 smfn = pagetable_get_mfn(v->arch.shadow_table[i]);
4447 #endif
4448 v->arch.paging.shadow.l3table[i] =
4449 (mfn_x(smfn) == 0)
4450 ? shadow_l3e_empty()
4451 : shadow_l3e_from_mfn(smfn, _PAGE_PRESENT);
4454 #endif /* SHADOW_PAGING_LEVELS == 3 */
4457 ///
4458 /// v->arch.cr3
4459 ///
4460 if ( shadow_mode_external(d) )
4462 make_cr3(v, pagetable_get_pfn(v->arch.monitor_table));
4464 else // not shadow_mode_external...
4466 /* We don't support PV except guest == shadow == config levels */
4467 BUG_ON(GUEST_PAGING_LEVELS != SHADOW_PAGING_LEVELS);
4468 #if SHADOW_PAGING_LEVELS == 3
4469 /* 2-on-3 or 3-on-3: Use the PAE shadow l3 table we just fabricated.
4470 * Don't use make_cr3 because (a) we know it's below 4GB, and
4471 * (b) it's not necessarily page-aligned, and make_cr3 takes a pfn */
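/* (A PAE CR3 only carries bits 31:5 of the l3 table's address, which is
 * why the table must live below 4GB and why the ASSERT below checks
 * against 0xffffffe0.) */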
4472 ASSERT(virt_to_maddr(&v->arch.paging.shadow.l3table) <= 0xffffffe0ULL);
4473 v->arch.cr3 = virt_to_maddr(&v->arch.paging.shadow.l3table);
4474 #else
4475 /* 4-on-4: Just use the shadow top-level directly */
4476 make_cr3(v, pagetable_get_pfn(v->arch.shadow_table[0]));
4477 #endif
4481 ///
4482 /// v->arch.hvm_vcpu.hw_cr[3]
4483 ///
4484 if ( shadow_mode_external(d) )
4486 ASSERT(is_hvm_domain(d));
4487 #if SHADOW_PAGING_LEVELS == 3
4488 /* 2-on-3 or 3-on-3: Use the PAE shadow l3 table we just fabricated */
4489 v->arch.hvm_vcpu.hw_cr[3] =
4490 virt_to_maddr(&v->arch.paging.shadow.l3table);
4491 #else
4492 /* 4-on-4: Just use the shadow top-level directly */
4493 v->arch.hvm_vcpu.hw_cr[3] =
4494 pagetable_get_paddr(v->arch.shadow_table[0]);
4495 #endif
4496 hvm_update_guest_cr(v, 3);
4499 /* Fix up the linear pagetable mappings */
4500 sh_update_linear_entries(v);
4502 #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
4503 /* No longer safe to use cached gva->gfn translations */
4504 vtlb_flush(v);
4505 #endif
4507 #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
4508 v->arch.paging.last_write_emul_ok = 0;
4509 #endif
4511 /* Release the lock, if we took it (otherwise it's the caller's problem) */
4512 if ( do_locking ) shadow_unlock(v->domain);
4514 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
4515 /* Need to resync all the shadow entries on a TLB flush. We only
4516 * update the shadows, leaving the pages out of sync. Also, we try
4517 * to skip synchronization of shadows not mapped in the new
4518 * tables. */
4519 shadow_sync_other_vcpus(v, do_locking);
4520 #endif
4525 /**************************************************************************/
4526 /* Functions to revoke guest rights */
4528 #if SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC
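/* Remove write access to gmfn via a single known shadow l1 entry: smfn is
 * an l1 (or fl1) shadow and 'off' is the index of the entry within it.
 * Returns 1 if that entry really was a writeable mapping of gmfn and has
 * now been made read-only; 0 otherwise. */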
4529 int sh_rm_write_access_from_sl1p(struct vcpu *v, mfn_t gmfn,
4530 mfn_t smfn, unsigned long off)
4532 int r;
4533 shadow_l1e_t *sl1p, sl1e;
4534 struct shadow_page_info *sp;
4536 ASSERT(mfn_valid(gmfn));
4537 ASSERT(mfn_valid(smfn));
4539 sp = mfn_to_shadow_page(smfn);
4541 if ( sp->mbz != 0
4542 || (sp->type != SH_type_l1_shadow
4543 && sp->type != SH_type_fl1_shadow) )
4544 goto fail;
4546 sl1p = sh_map_domain_page(smfn);
4547 sl1p += off;
4548 sl1e = *sl1p;
4549 if ( ((shadow_l1e_get_flags(sl1e) & (_PAGE_PRESENT|_PAGE_RW))
4550 != (_PAGE_PRESENT|_PAGE_RW))
4551 || (mfn_x(shadow_l1e_get_mfn(sl1e)) != mfn_x(gmfn)) )
4553 sh_unmap_domain_page(sl1p);
4554 goto fail;
4557 /* Found it! Need to remove its write permissions. */
4558 sl1e = shadow_l1e_remove_flags(sl1e, _PAGE_RW);
4559 r = shadow_set_l1e(v, sl1p, sl1e, smfn);
4560 ASSERT( !(r & SHADOW_SET_ERROR) );
4562 sh_unmap_domain_page(sl1p);
4563 perfc_incr(shadow_writeable_h_7);
4564 return 1;
4566 fail:
4567 perfc_incr(shadow_writeable_h_8);
4568 return 0;
4570 #endif /* OOS */
4572 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
4573 static int sh_guess_wrmap(struct vcpu *v, unsigned long vaddr, mfn_t gmfn)
4574 /* Look up this vaddr in the current shadow and see if it's a writeable
4575 * mapping of this gmfn. If so, remove it. Returns 1 if it worked. */
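/* (This is the cheap path of the writeable-mapping heuristic: the caller
 * passes a vaddr where the guest OS is likely to have mapped the page, and
 * a hit here lets it avoid a brute-force search of all the shadows.) */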
4577 shadow_l1e_t sl1e, *sl1p;
4578 shadow_l2e_t *sl2p;
4579 shadow_l3e_t *sl3p;
4580 #if SHADOW_PAGING_LEVELS >= 4
4581 shadow_l4e_t *sl4p;
4582 #endif
4583 mfn_t sl1mfn;
4584 int r;
4586 /* Carefully look in the shadow linear map for the l1e we expect */
4587 #if SHADOW_PAGING_LEVELS >= 4
4588 sl4p = sh_linear_l4_table(v) + shadow_l4_linear_offset(vaddr);
4589 if ( !(shadow_l4e_get_flags(*sl4p) & _PAGE_PRESENT) )
4590 return 0;
4591 sl3p = sh_linear_l3_table(v) + shadow_l3_linear_offset(vaddr);
4592 if ( !(shadow_l3e_get_flags(*sl3p) & _PAGE_PRESENT) )
4593 return 0;
4594 #else /* SHADOW_PAGING_LEVELS == 3 */
4595 sl3p = ((shadow_l3e_t *) v->arch.paging.shadow.l3table)
4596 + shadow_l3_linear_offset(vaddr);
4597 if ( !(shadow_l3e_get_flags(*sl3p) & _PAGE_PRESENT) )
4598 return 0;
4599 #endif
4600 sl2p = sh_linear_l2_table(v) + shadow_l2_linear_offset(vaddr);
4601 if ( !(shadow_l2e_get_flags(*sl2p) & _PAGE_PRESENT) )
4602 return 0;
4603 sl1p = sh_linear_l1_table(v) + shadow_l1_linear_offset(vaddr);
4604 sl1e = *sl1p;
4605 if ( ((shadow_l1e_get_flags(sl1e) & (_PAGE_PRESENT|_PAGE_RW))
4606 != (_PAGE_PRESENT|_PAGE_RW))
4607 || (mfn_x(shadow_l1e_get_mfn(sl1e)) != mfn_x(gmfn)) )
4608 return 0;
4610 /* Found it! Need to remove its write permissions. */
4611 sl1mfn = shadow_l2e_get_mfn(*sl2p);
4612 sl1e = shadow_l1e_remove_flags(sl1e, _PAGE_RW);
4613 r = shadow_set_l1e(v, sl1p, sl1e, sl1mfn);
4614 ASSERT( !(r & SHADOW_SET_ERROR) );
4615 TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_WRMAP_GUESS_FOUND);
4616 return 1;
4618 #endif
4620 int sh_rm_write_access_from_l1(struct vcpu *v, mfn_t sl1mfn,
4621 mfn_t readonly_mfn)
4622 /* Excises all writeable mappings to readonly_mfn from this l1 shadow table */
4624 shadow_l1e_t *sl1e;
4625 int done = 0;
4626 int flags;
4627 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
4628 mfn_t base_sl1mfn = sl1mfn; /* Because sl1mfn changes in the foreach */
4629 #endif
4631 SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, done,
4633 flags = shadow_l1e_get_flags(*sl1e);
4634 if ( (flags & _PAGE_PRESENT)
4635 && (flags & _PAGE_RW)
4636 && (mfn_x(shadow_l1e_get_mfn(*sl1e)) == mfn_x(readonly_mfn)) )
4638 shadow_l1e_t ro_sl1e = shadow_l1e_remove_flags(*sl1e, _PAGE_RW);
4639 (void) shadow_set_l1e(v, sl1e, ro_sl1e, sl1mfn);
4640 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
4641 /* Remember the last shadow that we shot a writeable mapping in */
4642 v->arch.paging.shadow.last_writeable_pte_smfn = mfn_x(base_sl1mfn);
4643 #endif
4644 if ( (mfn_to_page(readonly_mfn)->u.inuse.type_info
4645 & PGT_count_mask) == 0 )
4646 /* This breaks us cleanly out of the FOREACH macro */
4647 done = 1;
4649 });
4650 return done;
4654 int sh_rm_mappings_from_l1(struct vcpu *v, mfn_t sl1mfn, mfn_t target_mfn)
4655 /* Excises all mappings to guest frame from this shadow l1 table */
4657 shadow_l1e_t *sl1e;
4658 int done = 0;
4659 int flags;
4661 SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, done,
4663 flags = shadow_l1e_get_flags(*sl1e);
4664 if ( (flags & _PAGE_PRESENT)
4665 && (mfn_x(shadow_l1e_get_mfn(*sl1e)) == mfn_x(target_mfn)) )
4667 (void) shadow_set_l1e(v, sl1e, shadow_l1e_empty(), sl1mfn);
4668 if ( (mfn_to_page(target_mfn)->count_info & PGC_count_mask) == 0 )
4669 /* This breaks us cleanly out of the FOREACH macro */
4670 done = 1;
4672 });
4673 return done;
4676 /**************************************************************************/
4677 /* Functions to excise all pointers to shadows from higher-level shadows. */
4679 void sh_clear_shadow_entry(struct vcpu *v, void *ep, mfn_t smfn)
4680 /* Blank out a single shadow entry */
4682 switch ( mfn_to_shadow_page(smfn)->type )
4684 case SH_type_l1_shadow:
4685 (void) shadow_set_l1e(v, ep, shadow_l1e_empty(), smfn); break;
4686 case SH_type_l2_shadow:
4687 #if GUEST_PAGING_LEVELS >= 3
4688 case SH_type_l2h_shadow:
4689 #endif
4690 (void) shadow_set_l2e(v, ep, shadow_l2e_empty(), smfn); break;
4691 #if GUEST_PAGING_LEVELS >= 4
4692 case SH_type_l3_shadow:
4693 (void) shadow_set_l3e(v, ep, shadow_l3e_empty(), smfn); break;
4694 case SH_type_l4_shadow:
4695 (void) shadow_set_l4e(v, ep, shadow_l4e_empty(), smfn); break;
4696 #endif
4697 default: BUG(); /* Called with the wrong kind of shadow. */
4701 int sh_remove_l1_shadow(struct vcpu *v, mfn_t sl2mfn, mfn_t sl1mfn)
4702 /* Remove all mappings of this l1 shadow from this l2 shadow */
4704 shadow_l2e_t *sl2e;
4705 int done = 0;
4706 int flags;
4708 SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, done, v->domain,
4710 flags = shadow_l2e_get_flags(*sl2e);
4711 if ( (flags & _PAGE_PRESENT)
4712 && (mfn_x(shadow_l2e_get_mfn(*sl2e)) == mfn_x(sl1mfn)) )
4714 (void) shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn);
4715 if ( mfn_to_shadow_page(sl1mfn)->type == 0 )
4716 /* This breaks us cleanly out of the FOREACH macro */
4717 done = 1;
4719 });
4720 return done;
4723 #if GUEST_PAGING_LEVELS >= 4
4724 int sh_remove_l2_shadow(struct vcpu *v, mfn_t sl3mfn, mfn_t sl2mfn)
4725 /* Remove all mappings of this l2 shadow from this l3 shadow */
4727 shadow_l3e_t *sl3e;
4728 int done = 0;
4729 int flags;
4731 SHADOW_FOREACH_L3E(sl3mfn, sl3e, 0, done,
4733 flags = shadow_l3e_get_flags(*sl3e);
4734 if ( (flags & _PAGE_PRESENT)
4735 && (mfn_x(shadow_l3e_get_mfn(*sl3e)) == mfn_x(sl2mfn)) )
4737 (void) shadow_set_l3e(v, sl3e, shadow_l3e_empty(), sl3mfn);
4738 if ( mfn_to_shadow_page(sl2mfn)->type == 0 )
4739 /* This breaks us cleanly out of the FOREACH macro */
4740 done = 1;
4742 });
4743 return done;
4746 int sh_remove_l3_shadow(struct vcpu *v, mfn_t sl4mfn, mfn_t sl3mfn)
4747 /* Remove all mappings of this l3 shadow from this l4 shadow */
4749 shadow_l4e_t *sl4e;
4750 int done = 0;
4751 int flags;
4753 SHADOW_FOREACH_L4E(sl4mfn, sl4e, 0, done, v->domain,
4755 flags = shadow_l4e_get_flags(*sl4e);
4756 if ( (flags & _PAGE_PRESENT)
4757 && (mfn_x(shadow_l4e_get_mfn(*sl4e)) == mfn_x(sl3mfn)) )
4759 (void) shadow_set_l4e(v, sl4e, shadow_l4e_empty(), sl4mfn);
4760 if ( mfn_to_shadow_page(sl3mfn)->type == 0 )
4761 /* This breaks us cleanly out of the FOREACH macro */
4762 done = 1;
4764 });
4765 return done;
4767 #endif /* 64bit guest */
4769 /**************************************************************************/
4770 /* Handling HVM guest writes to pagetables */
4772 /* Translate a VA to an MFN, injecting a page-fault if we fail */
4773 #define BAD_GVA_TO_GFN (~0UL)
4774 #define BAD_GFN_TO_MFN (~1UL)
4775 #define READONLY_GFN (~2UL)
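/* These sentinel values are deliberately outside the range of valid MFNs,
 * so mfn_valid() fails on them and callers can tell the three failure
 * cases apart by comparing mfn_x() of the result, as emulate_map_dest()
 * does below. */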
4776 static mfn_t emulate_gva_to_mfn(struct vcpu *v,
4777 unsigned long vaddr,
4778 struct sh_emulate_ctxt *sh_ctxt)
4780 unsigned long gfn;
4781 mfn_t mfn;
4782 p2m_type_t p2mt;
4783 uint32_t pfec = PFEC_page_present | PFEC_write_access;
4785 /* Translate the VA to a GFN */
4786 gfn = sh_gva_to_gfn(v, vaddr, &pfec);
4787 if ( gfn == INVALID_GFN )
4789 if ( is_hvm_vcpu(v) )
4790 hvm_inject_exception(TRAP_page_fault, pfec, vaddr);
4791 else
4792 propagate_page_fault(vaddr, pfec);
4793 return _mfn(BAD_GVA_TO_GFN);
4796 /* Translate the GFN to an MFN */
4797 mfn = gfn_to_mfn(v->domain, _gfn(gfn), &p2mt);
4798 if ( p2mt == p2m_ram_ro )
4799 return _mfn(READONLY_GFN);
4800 if ( !p2m_is_ram(p2mt) )
4801 return _mfn(BAD_GFN_TO_MFN);
4803 ASSERT(mfn_valid(mfn));
4804 v->arch.paging.last_write_was_pt = !!sh_mfn_is_a_page_table(mfn);
4805 return mfn;
4808 /* Check that the user is allowed to perform this write.
4809 * Returns a mapped pointer to write to, or NULL for error. */
4810 #define MAPPING_UNHANDLEABLE ((void *)(unsigned long)X86EMUL_UNHANDLEABLE)
4811 #define MAPPING_EXCEPTION ((void *)(unsigned long)X86EMUL_EXCEPTION)
4812 #define MAPPING_SILENT_FAIL ((void *)(unsigned long)X86EMUL_OKAY)
4813 #define emulate_map_dest_failed(rc) ((unsigned long)(rc) <= 3)
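/* These encode the x86_emulate return codes as (invalid) pointer values;
 * emulate_map_dest_failed() relies on the codes all being small integers
 * that can never be the address of a real mapping.  A caller therefore
 * does, roughly:
 *     void *p = emulate_map_dest(v, vaddr, bytes, sh_ctxt);
 *     if ( emulate_map_dest_failed(p) )
 *         return (long)p;    (i.e. hand back the X86EMUL_* code)
 */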
4814 static void *emulate_map_dest(struct vcpu *v,
4815 unsigned long vaddr,
4816 u32 bytes,
4817 struct sh_emulate_ctxt *sh_ctxt)
4819 unsigned long offset;
4820 void *map = NULL;
4822 sh_ctxt->mfn1 = emulate_gva_to_mfn(v, vaddr, sh_ctxt);
4823 if ( !mfn_valid(sh_ctxt->mfn1) )
4824 return ((mfn_x(sh_ctxt->mfn1) == BAD_GVA_TO_GFN) ?
4825 MAPPING_EXCEPTION :
4826 (mfn_x(sh_ctxt->mfn1) == READONLY_GFN) ?
4827 MAPPING_SILENT_FAIL : MAPPING_UNHANDLEABLE);
4829 #ifndef NDEBUG
4830 /* We don't emulate user-mode writes to page tables */
4831 if ( hvm_get_seg_reg(x86_seg_ss, sh_ctxt)->attr.fields.dpl == 3 )
4833 gdprintk(XENLOG_DEBUG, "User-mode write to pagetable reached "
4834 "emulate_map_dest(). This should never happen!\n");
4835 return MAPPING_UNHANDLEABLE;
4837 #endif
4839 /* Unaligned writes probably mean this isn't a pagetable */
4840 if ( vaddr & (bytes - 1) )
4841 sh_remove_shadows(v, sh_ctxt->mfn1, 0, 0 /* Slow, can fail */ );
4843 if ( likely(((vaddr + bytes - 1) & PAGE_MASK) == (vaddr & PAGE_MASK)) )
4845 /* Whole write fits on a single page */
4846 sh_ctxt->mfn2 = _mfn(INVALID_MFN);
4847 map = sh_map_domain_page(sh_ctxt->mfn1) + (vaddr & ~PAGE_MASK);
4849 else
4851 /* Cross-page emulated writes are only supported for HVM guests;
4852 * PV guests ought to know better */
4853 if ( !is_hvm_vcpu(v) )
4854 return MAPPING_UNHANDLEABLE;
4856 /* This write crosses a page boundary. Translate the second page */
4857 sh_ctxt->mfn2 = emulate_gva_to_mfn(v, (vaddr + bytes - 1) & PAGE_MASK,
4858 sh_ctxt);
4859 if ( !mfn_valid(sh_ctxt->mfn2) )
4860 return ((mfn_x(sh_ctxt->mfn2) == BAD_GVA_TO_GFN) ?
4861 MAPPING_EXCEPTION :
4862 (mfn_x(sh_ctxt->mfn2) == READONLY_GFN) ?
4863 MAPPING_SILENT_FAIL : MAPPING_UNHANDLEABLE);
4865 /* Cross-page writes probably mean this isn't a pagetable */
4866 sh_remove_shadows(v, sh_ctxt->mfn2, 0, 0 /* Slow, can fail */ );
4868 /* Hack: we map the pages into the vcpu's LDT space, since we
4869 * know that we're not going to need the LDT for HVM guests,
4870 * and only HVM guests are allowed unaligned writes. */
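/* The two l1e writes below give us a contiguous two-page virtual window
 * (mfn1 then mfn2) at LDT_VIRT_START(v), so the straddling write can be
 * treated as one linear buffer; emulate_unmap_dest() undoes the mapping. */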
4871 ASSERT(is_hvm_vcpu(v));
4872 map = (void *)LDT_VIRT_START(v);
4873 offset = l1_linear_offset((unsigned long) map);
4874 l1e_write(&__linear_l1_table[offset],
4875 l1e_from_pfn(mfn_x(sh_ctxt->mfn1), __PAGE_HYPERVISOR));
4876 l1e_write(&__linear_l1_table[offset + 1],
4877 l1e_from_pfn(mfn_x(sh_ctxt->mfn2), __PAGE_HYPERVISOR));
4878 flush_tlb_local();
4879 map += (vaddr & ~PAGE_MASK);
4882 #if (SHADOW_OPTIMIZATIONS & SHOPT_SKIP_VERIFY)
4883 /* Remember if the bottom bit was clear, so we can choose not to run
4884 * the change through the verify code if it's still clear afterwards */
4885 sh_ctxt->low_bit_was_clear = map != NULL && !(*(u8 *)map & _PAGE_PRESENT);
4886 #endif
4888 return map;
4891 /* Tidy up after the emulated write: mark pages dirty, verify the new
4892 * contents, and undo the mapping */
4893 static void emulate_unmap_dest(struct vcpu *v,
4894 void *addr,
4895 u32 bytes,
4896 struct sh_emulate_ctxt *sh_ctxt)
4898 u32 b1 = bytes, b2 = 0, shflags;
4900 ASSERT(mfn_valid(sh_ctxt->mfn1));
4902 /* If we are writing lots of PTE-aligned zeros, might want to unshadow */
4903 if ( likely(bytes >= 4)
4904 && (*(u32 *)addr == 0)
4905 && ((unsigned long) addr & ((sizeof (guest_intpte_t)) - 1)) == 0 )
4906 check_for_early_unshadow(v, sh_ctxt->mfn1);
4907 else
4908 reset_early_unshadow(v);
4910 /* We can avoid re-verifying the page contents after the write if:
4911 * - it was no larger than the PTE type of this pagetable;
4912 * - it was aligned to the PTE boundaries; and
4913 * - _PAGE_PRESENT was clear before and after the write. */
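/* (The reasoning, roughly: a write that leaves _PAGE_PRESENT clear cannot
 * have created a usable new mapping, so there is nothing for the shadow
 * to re-validate until a later write sets the present bit; that write
 * will not be skipped, since low_bit_was_clear only covers the
 * clear-before-and-after case.) */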
4914 shflags = mfn_to_page(sh_ctxt->mfn1)->shadow_flags;
4915 #if (SHADOW_OPTIMIZATIONS & SHOPT_SKIP_VERIFY)
4916 if ( sh_ctxt->low_bit_was_clear
4917 && !(*(u8 *)addr & _PAGE_PRESENT)
4918 && ((!(shflags & SHF_32)
4919 /* Not shadowed 32-bit: aligned 64-bit writes that leave
4920 * the present bit unset are safe to ignore. */
4921 && ((unsigned long)addr & 7) == 0
4922 && bytes <= 8)
4923 ||
4924 (!(shflags & (SHF_PAE|SHF_64))
4925 /* Not shadowed PAE/64-bit: aligned 32-bit writes that
4926 * leave the present bit unset are safe to ignore. */
4927 && ((unsigned long)addr & 3) == 0
4928 && bytes <= 4)) )
4930 /* Writes with this alignment constraint can't possibly cross pages */
4931 ASSERT(!mfn_valid(sh_ctxt->mfn2));
4933 else
4934 #endif /* SHADOW_OPTIMIZATIONS & SHOPT_SKIP_VERIFY */
4936 if ( unlikely(mfn_valid(sh_ctxt->mfn2)) )
4938 /* Validate as two writes, one to each page */
4939 b1 = PAGE_SIZE - (((unsigned long)addr) & ~PAGE_MASK);
4940 b2 = bytes - b1;
4941 ASSERT(b2 < bytes);
4943 if ( likely(b1 > 0) )
4944 sh_validate_guest_pt_write(v, sh_ctxt->mfn1, addr, b1);
4945 if ( unlikely(b2 > 0) )
4946 sh_validate_guest_pt_write(v, sh_ctxt->mfn2, addr + b1, b2);
4949 paging_mark_dirty(v->domain, mfn_x(sh_ctxt->mfn1));
4951 if ( unlikely(mfn_valid(sh_ctxt->mfn2)) )
4953 unsigned long offset;
4954 paging_mark_dirty(v->domain, mfn_x(sh_ctxt->mfn2));
4955 /* Undo the hacky two-frame contiguous map. */
4956 ASSERT(((unsigned long) addr & PAGE_MASK) == LDT_VIRT_START(v));
4957 offset = l1_linear_offset((unsigned long) addr);
4958 l1e_write(&__linear_l1_table[offset], l1e_empty());
4959 l1e_write(&__linear_l1_table[offset + 1], l1e_empty());
4960 flush_tlb_all();
4962 else
4963 sh_unmap_domain_page(addr);
4965 atomic_inc(&v->domain->arch.paging.shadow.gtable_dirty_version);
4968 static int
4969 sh_x86_emulate_write(struct vcpu *v, unsigned long vaddr, void *src,
4970 u32 bytes, struct sh_emulate_ctxt *sh_ctxt)
4972 void *addr;
4974 /* Unaligned writes are only acceptable on HVM */
4975 if ( (vaddr & (bytes - 1)) && !is_hvm_vcpu(v) )
4976 return X86EMUL_UNHANDLEABLE;
4978 addr = emulate_map_dest(v, vaddr, bytes, sh_ctxt);
4979 if ( emulate_map_dest_failed(addr) )
4980 return (long)addr;
4982 shadow_lock(v->domain);
4983 memcpy(addr, src, bytes);
4985 if ( tb_init_done )
4987 #if GUEST_PAGING_LEVELS == 3
4988 if ( vaddr == this_cpu(trace_emulate_initial_va) )
4989 memcpy(&this_cpu(trace_emulate_write_val), src, bytes);
4990 else if ( (vaddr & ~(0x7UL)) == this_cpu(trace_emulate_initial_va) )
4992 TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_EMULATE_FULL_PT);
4993 memcpy(&this_cpu(trace_emulate_write_val),
4994 (void *)(((unsigned long) addr) & ~(0x7UL)), GUEST_PTE_SIZE);
4996 #else
4997 memcpy(&this_cpu(trace_emulate_write_val), src, bytes);
4998 #endif
5001 emulate_unmap_dest(v, addr, bytes, sh_ctxt);
5002 shadow_audit_tables(v);
5003 shadow_unlock(v->domain);
5004 return X86EMUL_OKAY;
5007 static int
5008 sh_x86_emulate_cmpxchg(struct vcpu *v, unsigned long vaddr,
5009 unsigned long old, unsigned long new,
5010 unsigned int bytes, struct sh_emulate_ctxt *sh_ctxt)
5012 void *addr;
5013 unsigned long prev;
5014 int rv = X86EMUL_OKAY;
5016 /* Unaligned writes are only acceptable on HVM */
5017 if ( (vaddr & (bytes - 1)) && !is_hvm_vcpu(v) )
5018 return X86EMUL_UNHANDLEABLE;
5020 addr = emulate_map_dest(v, vaddr, bytes, sh_ctxt);
5021 if ( emulate_map_dest_failed(addr) )
5022 return (long)addr;
5024 shadow_lock(v->domain);
5025 switch ( bytes )
5027 case 1: prev = cmpxchg(((u8 *)addr), old, new); break;
5028 case 2: prev = cmpxchg(((u16 *)addr), old, new); break;
5029 case 4: prev = cmpxchg(((u32 *)addr), old, new); break;
5030 case 8: prev = cmpxchg(((u64 *)addr), old, new); break;
5031 default:
5032 SHADOW_PRINTK("cmpxchg of size %i is not supported\n", bytes);
5033 prev = ~old;
5036 if ( prev != old )
5037 rv = X86EMUL_CMPXCHG_FAILED;
5039 SHADOW_DEBUG(EMULATE, "va %#lx was %#lx expected %#lx"
5040 " wanted %#lx now %#lx bytes %u\n",
5041 vaddr, prev, old, new, *(unsigned long *)addr, bytes);
5043 emulate_unmap_dest(v, addr, bytes, sh_ctxt);
5044 shadow_audit_tables(v);
5045 shadow_unlock(v->domain);
5046 return rv;
5049 #ifdef __i386__
5050 static int
5051 sh_x86_emulate_cmpxchg8b(struct vcpu *v, unsigned long vaddr,
5052 unsigned long old_lo, unsigned long old_hi,
5053 unsigned long new_lo, unsigned long new_hi,
5054 struct sh_emulate_ctxt *sh_ctxt)
5056 void *addr;
5057 u64 old, new, prev;
5058 int rv = X86EMUL_OKAY;
5060 /* Unaligned writes are only acceptable on HVM */
5061 if ( (vaddr & 7) && !is_hvm_vcpu(v) )
5062 return X86EMUL_UNHANDLEABLE;
5064 addr = emulate_map_dest(v, vaddr, 8, sh_ctxt);
5065 if ( emulate_map_dest_failed(addr) )
5066 return (long)addr;
5068 old = (((u64) old_hi) << 32) | (u64) old_lo;
5069 new = (((u64) new_hi) << 32) | (u64) new_lo;
5071 shadow_lock(v->domain);
5072 prev = cmpxchg(((u64 *)addr), old, new);
5074 if ( prev != old )
5075 rv = X86EMUL_CMPXCHG_FAILED;
5077 emulate_unmap_dest(v, addr, 8, sh_ctxt);
5078 shadow_audit_tables(v);
5079 shadow_unlock(v->domain);
5080 return rv;
5082 #endif
5084 /**************************************************************************/
5085 /* Audit tools */
5087 #if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES
5089 #define AUDIT_FAIL(_level, _fmt, _a...) do { \
5090 printk("Shadow %u-on-%u audit failed at level %i, index %i\n" \
5091 "gl" #_level "mfn = %" PRI_mfn \
5092 " sl" #_level "mfn = %" PRI_mfn \
5093 " &gl" #_level "e = %p &sl" #_level "e = %p" \
5094 " gl" #_level "e = %" SH_PRI_gpte \
5095 " sl" #_level "e = %" SH_PRI_pte "\nError: " _fmt "\n", \
5096 GUEST_PAGING_LEVELS, SHADOW_PAGING_LEVELS, \
5097 _level, guest_index(gl ## _level ## e), \
5098 mfn_x(gl ## _level ## mfn), mfn_x(sl ## _level ## mfn), \
5099 gl ## _level ## e, sl ## _level ## e, \
5100 gl ## _level ## e->l ## _level, sl ## _level ## e->l ## _level, \
5101 ##_a); \
5102 BUG(); \
5103 done = 1; \
5104 } while (0)
5106 #define AUDIT_FAIL_MIN(_level, _fmt, _a...) do { \
5107 printk("Shadow %u-on-%u audit failed at level %i\n" \
5108 "gl" #_level "mfn = %" PRI_mfn \
5109 " sl" #_level "mfn = %" PRI_mfn \
5110 " Error: " _fmt "\n", \
5111 GUEST_PAGING_LEVELS, SHADOW_PAGING_LEVELS, \
5112 _level, \
5113 mfn_x(gl ## _level ## mfn), mfn_x(sl ## _level ## mfn), \
5114 ##_a); \
5115 BUG(); \
5116 done = 1; \
5117 } while (0)
5119 static char * sh_audit_flags(struct vcpu *v, int level,
5120 int gflags, int sflags)
5121 /* Common code for auditing flag bits */
5123 if ( (sflags & _PAGE_PRESENT) && !(gflags & _PAGE_PRESENT) )
5124 return "shadow is present but guest is not present";
5125 if ( (sflags & _PAGE_GLOBAL) && !is_hvm_vcpu(v) )
5126 return "global bit set in PV shadow";
5127 if ( level == 2 && (sflags & _PAGE_PSE) )
5128 return "PS bit set in shadow";
5129 #if SHADOW_PAGING_LEVELS == 3
5130 if ( level == 3 ) return NULL; /* All the other bits are blank in PAEl3 */
5131 #endif
5132 if ( (sflags & _PAGE_PRESENT) && !(gflags & _PAGE_ACCESSED) )
5133 return "accessed bit not propagated";
5134 if ( (level == 1 || (level == 2 && (gflags & _PAGE_PSE)))
5135 && ((sflags & _PAGE_RW) && !(gflags & _PAGE_DIRTY)) )
5136 return "dirty bit not propagated";
5137 if ( (sflags & _PAGE_USER) != (gflags & _PAGE_USER) )
5138 return "user/supervisor bit does not match";
5139 if ( (sflags & _PAGE_NX_BIT) != (gflags & _PAGE_NX_BIT) )
5140 return "NX bit does not match";
5141 if ( (sflags & _PAGE_RW) && !(gflags & _PAGE_RW) )
5142 return "shadow grants write access but guest does not";
5143 return NULL;
5146 int sh_audit_l1_table(struct vcpu *v, mfn_t sl1mfn, mfn_t x)
5148 guest_l1e_t *gl1e, *gp;
5149 shadow_l1e_t *sl1e;
5150 mfn_t mfn, gmfn, gl1mfn;
5151 gfn_t gfn;
5152 p2m_type_t p2mt;
5153 char *s;
5154 int done = 0;
5156 /* Follow the backpointer */
5157 gl1mfn = _mfn(mfn_to_shadow_page(sl1mfn)->backpointer);
5159 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
5160 /* Out-of-sync l1 shadows can contain anything: just check the OOS hash */
5161 if ( page_is_out_of_sync(mfn_to_page(gl1mfn)) )
5163 oos_audit_hash_is_present(v->domain, gl1mfn);
5164 return 0;
5166 #endif
5168 gl1e = gp = sh_map_domain_page(gl1mfn);
5169 SHADOW_FOREACH_L1E(sl1mfn, sl1e, &gl1e, done, {
5171 if ( sh_l1e_is_magic(*sl1e) )
5173 #if (SHADOW_OPTIMIZATIONS & SHOPT_FAST_FAULT_PATH)
5174 if ( sh_l1e_is_gnp(*sl1e) )
5176 if ( guest_l1e_get_flags(*gl1e) & _PAGE_PRESENT )
5177 AUDIT_FAIL(1, "shadow is GNP magic but guest is present");
5179 else
5181 ASSERT(sh_l1e_is_mmio(*sl1e));
5182 gfn = sh_l1e_mmio_get_gfn(*sl1e);
5183 if ( gfn_x(gfn) != gfn_x(guest_l1e_get_gfn(*gl1e)) )
5184 AUDIT_FAIL(1, "shadow MMIO gfn is %" SH_PRI_gfn
5185 " but guest gfn is %" SH_PRI_gfn,
5186 gfn_x(gfn),
5187 gfn_x(guest_l1e_get_gfn(*gl1e)));
5189 #endif
5191 else
5193 s = sh_audit_flags(v, 1, guest_l1e_get_flags(*gl1e),
5194 shadow_l1e_get_flags(*sl1e));
5195 if ( s ) AUDIT_FAIL(1, "%s", s);
5197 if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS )
5199 gfn = guest_l1e_get_gfn(*gl1e);
5200 mfn = shadow_l1e_get_mfn(*sl1e);
5201 gmfn = gfn_to_mfn(v->domain, gfn, &p2mt);
5202 if ( mfn_x(gmfn) != mfn_x(mfn) )
5203 AUDIT_FAIL(1, "bad translation: gfn %" SH_PRI_gfn
5204 " --> %" PRI_mfn " != mfn %" PRI_mfn,
5205 gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn));
5208 });
5209 sh_unmap_domain_page(gp);
5210 return done;
5213 int sh_audit_fl1_table(struct vcpu *v, mfn_t sl1mfn, mfn_t x)
5215 guest_l1e_t *gl1e, e;
5216 shadow_l1e_t *sl1e;
5217 mfn_t gl1mfn = _mfn(INVALID_MFN);
5218 int f;
5219 int done = 0;
5221 /* fl1 has no useful backpointer: all we can check are flags */
5222 e = guest_l1e_from_gfn(_gfn(0), 0); gl1e = &e; /* Needed for macro */
5223 SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, done, {
5224 f = shadow_l1e_get_flags(*sl1e);
5225 f &= ~(_PAGE_AVAIL0|_PAGE_AVAIL1|_PAGE_AVAIL2);
5226 if ( !(f == 0
5227 || f == (_PAGE_PRESENT|_PAGE_USER|_PAGE_RW|
5228 _PAGE_ACCESSED|_PAGE_DIRTY)
5229 || f == (_PAGE_PRESENT|_PAGE_USER|_PAGE_ACCESSED|_PAGE_DIRTY)
5230 || sh_l1e_is_magic(*sl1e)) )
5231 AUDIT_FAIL(1, "fl1e has bad flags");
5232 });
5233 return 0;
5236 int sh_audit_l2_table(struct vcpu *v, mfn_t sl2mfn, mfn_t x)
5238 guest_l2e_t *gl2e, *gp;
5239 shadow_l2e_t *sl2e;
5240 mfn_t mfn, gmfn, gl2mfn;
5241 gfn_t gfn;
5242 p2m_type_t p2mt;
5243 char *s;
5244 int done = 0;
5246 /* Follow the backpointer */
5247 gl2mfn = _mfn(mfn_to_shadow_page(sl2mfn)->backpointer);
5249 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
5250 /* Only L1's may be out of sync. */
5251 if ( page_is_out_of_sync(mfn_to_page(gl2mfn)) )
5252 AUDIT_FAIL_MIN(2, "gmfn %lx is out of sync", mfn_x(gl2mfn));
5253 #endif
5255 gl2e = gp = sh_map_domain_page(gl2mfn);
5256 SHADOW_FOREACH_L2E(sl2mfn, sl2e, &gl2e, done, v->domain, {
5258 s = sh_audit_flags(v, 2, guest_l2e_get_flags(*gl2e),
5259 shadow_l2e_get_flags(*sl2e));
5260 if ( s ) AUDIT_FAIL(2, "%s", s);
5262 if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS )
5264 gfn = guest_l2e_get_gfn(*gl2e);
5265 mfn = shadow_l2e_get_mfn(*sl2e);
5266 gmfn = (guest_l2e_get_flags(*gl2e) & _PAGE_PSE)
5267 ? get_fl1_shadow_status(v, gfn)
5268 : get_shadow_status(v, gfn_to_mfn(v->domain, gfn, &p2mt),
5269 SH_type_l1_shadow);
5270 if ( mfn_x(gmfn) != mfn_x(mfn) )
5271 AUDIT_FAIL(2, "bad translation: gfn %" SH_PRI_gfn
5272 " (--> %" PRI_mfn ")"
5273 " --> %" PRI_mfn " != mfn %" PRI_mfn,
5274 gfn_x(gfn),
5275 (guest_l2e_get_flags(*gl2e) & _PAGE_PSE) ? 0
5276 : mfn_x(gfn_to_mfn(v->domain, gfn, &p2mt)),
5277 mfn_x(gmfn), mfn_x(mfn));
5279 });
5280 sh_unmap_domain_page(gp);
5281 return 0;
5284 #if GUEST_PAGING_LEVELS >= 4
5285 int sh_audit_l3_table(struct vcpu *v, mfn_t sl3mfn, mfn_t x)
5287 guest_l3e_t *gl3e, *gp;
5288 shadow_l3e_t *sl3e;
5289 mfn_t mfn, gmfn, gl3mfn;
5290 gfn_t gfn;
5291 p2m_type_t p2mt;
5292 char *s;
5293 int done = 0;
5295 /* Follow the backpointer */
5296 gl3mfn = _mfn(mfn_to_shadow_page(sl3mfn)->backpointer);
5298 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
5299 /* Only L1's may be out of sync. */
5300 if ( page_is_out_of_sync(mfn_to_page(gl3mfn)) )
5301 AUDIT_FAIL_MIN(3, "gmfn %lx is out of sync", mfn_x(gl3mfn));
5302 #endif
5304 gl3e = gp = sh_map_domain_page(gl3mfn);
5305 SHADOW_FOREACH_L3E(sl3mfn, sl3e, &gl3e, done, {
5307 s = sh_audit_flags(v, 3, guest_l3e_get_flags(*gl3e),
5308 shadow_l3e_get_flags(*sl3e));
5309 if ( s ) AUDIT_FAIL(3, "%s", s);
5311 if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS )
5313 gfn = guest_l3e_get_gfn(*gl3e);
5314 mfn = shadow_l3e_get_mfn(*sl3e);
5315 gmfn = get_shadow_status(v, gfn_to_mfn(v->domain, gfn, &p2mt),
5316 ((GUEST_PAGING_LEVELS == 3 ||
5317 is_pv_32on64_vcpu(v))
5318 && !shadow_mode_external(v->domain)
5319 && (guest_index(gl3e) % 4) == 3)
5320 ? SH_type_l2h_shadow
5321 : SH_type_l2_shadow);
5322 if ( mfn_x(gmfn) != mfn_x(mfn) )
5323 AUDIT_FAIL(3, "bad translation: gfn %" SH_PRI_gfn
5324 " --> %" PRI_mfn " != mfn %" PRI_mfn,
5325 gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn));
5327 });
5328 sh_unmap_domain_page(gp);
5329 return 0;
5332 int sh_audit_l4_table(struct vcpu *v, mfn_t sl4mfn, mfn_t x)
5334 guest_l4e_t *gl4e, *gp;
5335 shadow_l4e_t *sl4e;
5336 mfn_t mfn, gmfn, gl4mfn;
5337 gfn_t gfn;
5338 p2m_type_t p2mt;
5339 char *s;
5340 int done = 0;
5342 /* Follow the backpointer */
5343 gl4mfn = _mfn(mfn_to_shadow_page(sl4mfn)->backpointer);
5345 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
5346 /* Only L1's may be out of sync. */
5347 if ( page_is_out_of_sync(mfn_to_page(gl4mfn)) )
5348 AUDIT_FAIL_MIN(4, "gmfn %lx is out of sync", mfn_x(gl4mfn));
5349 #endif
5351 gl4e = gp = sh_map_domain_page(gl4mfn);
5352 SHADOW_FOREACH_L4E(sl4mfn, sl4e, &gl4e, done, v->domain,
5354 s = sh_audit_flags(v, 4, guest_l4e_get_flags(*gl4e),
5355 shadow_l4e_get_flags(*sl4e));
5356 if ( s ) AUDIT_FAIL(4, "%s", s);
5358 if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS )
5360 gfn = guest_l4e_get_gfn(*gl4e);
5361 mfn = shadow_l4e_get_mfn(*sl4e);
5362 gmfn = get_shadow_status(v, gfn_to_mfn(v->domain, gfn, &p2mt),
5363 SH_type_l3_shadow);
5364 if ( mfn_x(gmfn) != mfn_x(mfn) )
5365 AUDIT_FAIL(4, "bad translation: gfn %" SH_PRI_gfn
5366 " --> %" PRI_mfn " != mfn %" PRI_mfn,
5367 gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn));
5369 });
5370 sh_unmap_domain_page(gp);
5371 return 0;
5373 #endif /* GUEST_PAGING_LEVELS >= 4 */
5376 #undef AUDIT_FAIL
5378 #endif /* Audit code */
5380 /**************************************************************************/
5381 /* Entry points into this mode of the shadow code.
5382 * This will all be mangled by the preprocessor to uniquify everything. */
5383 struct paging_mode sh_paging_mode = {
5384 .page_fault = sh_page_fault,
5385 .invlpg = sh_invlpg,
5386 .gva_to_gfn = sh_gva_to_gfn,
5387 .update_cr3 = sh_update_cr3,
5388 .update_paging_modes = shadow_update_paging_modes,
5389 .write_p2m_entry = shadow_write_p2m_entry,
5390 .write_guest_entry = shadow_write_guest_entry,
5391 .cmpxchg_guest_entry = shadow_cmpxchg_guest_entry,
5392 .guest_map_l1e = sh_guest_map_l1e,
5393 .guest_get_eff_l1e = sh_guest_get_eff_l1e,
5394 .guest_levels = GUEST_PAGING_LEVELS,
5395 .shadow.detach_old_tables = sh_detach_old_tables,
5396 .shadow.x86_emulate_write = sh_x86_emulate_write,
5397 .shadow.x86_emulate_cmpxchg = sh_x86_emulate_cmpxchg,
5398 #ifdef __i386__
5399 .shadow.x86_emulate_cmpxchg8b = sh_x86_emulate_cmpxchg8b,
5400 #endif
5401 .shadow.make_monitor_table = sh_make_monitor_table,
5402 .shadow.destroy_monitor_table = sh_destroy_monitor_table,
5403 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
5404 .shadow.guess_wrmap = sh_guess_wrmap,
5405 #endif
5406 .shadow.shadow_levels = SHADOW_PAGING_LEVELS,
5407 };
5409 /*
5410 * Local variables:
5411 * mode: C
5412 * c-set-style: "BSD"
5413 * c-basic-offset: 4
5414 * indent-tabs-mode: nil
5415 * End:
5416 */