ia64/xen-unstable: xen/arch/x86/mm/shadow/multi.c @ 18479:fa2adc7fb996

x86, shadow: Fix some SHADOW_PRINTK() callers.
Signed-off-by: Yoshi Tamura <tamura.yoshiaki@lab.ntt.co.jp>

author   Keir Fraser <keir.fraser@citrix.com>
date     Thu Sep 11 15:17:31 2008 +0100 (2008-09-11)
parents  74621a2add54
children c353f07bae84
1 /******************************************************************************
2 * arch/x86/mm/shadow/multi.c
3 *
4 * Simple, mostly-synchronous shadow page tables.
5 * Parts of this code are Copyright (c) 2006 by XenSource Inc.
6 * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
7 * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 */
24 #include <xen/config.h>
25 #include <xen/types.h>
26 #include <xen/mm.h>
27 #include <xen/trace.h>
28 #include <xen/sched.h>
29 #include <xen/perfc.h>
30 #include <xen/domain_page.h>
31 #include <asm/page.h>
32 #include <asm/current.h>
33 #include <asm/shadow.h>
34 #include <asm/flushtlb.h>
35 #include <asm/hvm/hvm.h>
36 #include <asm/hvm/cacheattr.h>
37 #include <asm/mtrr.h>
38 #include "private.h"
39 #include "types.h"
41 /* THINGS TO DO LATER:
42 *
43 * TEARDOWN HEURISTICS
44 * Also: have a heuristic for when to destroy a previous paging-mode's
45 * shadows. When a guest is done with its start-of-day 32-bit tables
46 * and reuses the memory we want to drop those shadows. Start with
47 * shadows in a page in two modes as a hint, but beware of clever tricks
48 * like reusing a pagetable for both PAE and 64-bit during boot...
49 *
50 * PAE LINEAR MAPS
51 * Rework shadow_get_l*e() to have the option of using map_domain_page()
52 * instead of linear maps. Add appropriate unmap_l*e calls in the users.
53 * Then we can test the speed difference made by linear maps. If the
54 * map_domain_page() version is OK on PAE, we could maybe allow a lightweight
55 * l3-and-l2h-only shadow mode for PAE PV guests that would allow them
56 * to share l2h pages again.
57 *
58 * PSE disabled / PSE36
59 * We don't support any modes other than PSE enabled, PSE36 disabled.
60 * Neither of those would be hard to change, but we'd need to be able to
61 * deal with shadows made in one mode and used in another.
62 */
64 #define FETCH_TYPE_PREFETCH 1
65 #define FETCH_TYPE_DEMAND 2
66 #define FETCH_TYPE_WRITE 4
67 typedef enum {
68 ft_prefetch = FETCH_TYPE_PREFETCH,
69 ft_demand_read = FETCH_TYPE_DEMAND,
70 ft_demand_write = FETCH_TYPE_DEMAND | FETCH_TYPE_WRITE,
71 } fetch_type_t;
73 #ifdef DEBUG_TRACE_DUMP
74 static char *fetch_type_names[] = {
75 [ft_prefetch] "prefetch",
76 [ft_demand_read] "demand read",
77 [ft_demand_write] "demand write",
78 };
79 #endif
81 /**************************************************************************/
82 /* Hash table mapping from guest pagetables to shadows
83 *
84 * Normal case: maps the mfn of a guest page to the mfn of its shadow page.
85 * FL1's: maps the *gfn* of the start of a superpage to the mfn of a
86 * shadow L1 which maps its "splinters".
87 */
89 static inline mfn_t
90 get_fl1_shadow_status(struct vcpu *v, gfn_t gfn)
91 /* Look for FL1 shadows in the hash table */
92 {
93 mfn_t smfn = shadow_hash_lookup(v, gfn_x(gfn), SH_type_fl1_shadow);
94 return smfn;
95 }
97 static inline mfn_t
98 get_shadow_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type)
99 /* Look for shadows in the hash table */
100 {
101 mfn_t smfn = shadow_hash_lookup(v, mfn_x(gmfn), shadow_type);
102 perfc_incr(shadow_get_shadow_status);
103 return smfn;
104 }
106 static inline void
107 set_fl1_shadow_status(struct vcpu *v, gfn_t gfn, mfn_t smfn)
108 /* Put an FL1 shadow into the hash table */
109 {
110 SHADOW_PRINTK("gfn=%"SH_PRI_gfn", type=%08x, smfn=%05lx\n",
111 gfn_x(gfn), SH_type_fl1_shadow, mfn_x(smfn));
113 shadow_hash_insert(v, gfn_x(gfn), SH_type_fl1_shadow, smfn);
114 }
116 static inline void
117 set_shadow_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type, mfn_t smfn)
118 /* Put a shadow into the hash table */
119 {
120 struct domain *d = v->domain;
121 int res;
123 SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx, type=%08x, smfn=%05lx\n",
124 d->domain_id, v->vcpu_id, mfn_x(gmfn),
125 shadow_type, mfn_x(smfn));
127 /* 32-on-64 PV guests don't own their l4 pages so can't get_page them */
128 if ( !is_pv_32on64_vcpu(v) || shadow_type != SH_type_l4_64_shadow )
129 {
130 res = get_page(mfn_to_page(gmfn), d);
131 ASSERT(res == 1);
132 }
134 shadow_hash_insert(v, mfn_x(gmfn), shadow_type, smfn);
135 }
137 static inline void
138 delete_fl1_shadow_status(struct vcpu *v, gfn_t gfn, mfn_t smfn)
139 /* Remove a shadow from the hash table */
140 {
141 SHADOW_PRINTK("gfn=%"SH_PRI_gfn", type=%08x, smfn=%05lx\n",
142 gfn_x(gfn), SH_type_fl1_shadow, mfn_x(smfn));
143 shadow_hash_delete(v, gfn_x(gfn), SH_type_fl1_shadow, smfn);
144 }
146 static inline void
147 delete_shadow_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type, mfn_t smfn)
148 /* Remove a shadow from the hash table */
149 {
150 SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx, type=%08x, smfn=%05lx\n",
151 v->domain->domain_id, v->vcpu_id,
152 mfn_x(gmfn), shadow_type, mfn_x(smfn));
153 shadow_hash_delete(v, mfn_x(gmfn), shadow_type, smfn);
154 /* 32-on-64 PV guests don't own their l4 pages; see set_shadow_status */
155 if ( !is_pv_32on64_vcpu(v) || shadow_type != SH_type_l4_64_shadow )
156 put_page(mfn_to_page(gmfn));
157 }
159 /**************************************************************************/
160 /* CPU feature support querying */
162 static inline int
163 guest_supports_superpages(struct vcpu *v)
164 {
165 /* The _PAGE_PSE bit must be honoured in HVM guests, whenever
166 * CR4.PSE is set or the guest is in PAE or long mode.
167 * It's also used in the dummy PT for vcpus with CR4.PG cleared. */
168 return (is_hvm_vcpu(v) &&
169 (GUEST_PAGING_LEVELS != 2
170 || !hvm_paging_enabled(v)
171 || (v->arch.hvm_vcpu.guest_cr[4] & X86_CR4_PSE)));
172 }
174 static inline int
175 guest_supports_nx(struct vcpu *v)
176 {
177 if ( GUEST_PAGING_LEVELS == 2 || !cpu_has_nx )
178 return 0;
179 if ( !is_hvm_vcpu(v) )
180 return cpu_has_nx;
181 return hvm_nx_enabled(v);
182 }
185 /**************************************************************************/
186 /* Functions for walking the guest page tables */
188 /* Flags that are needed in a pagetable entry, with the sense of NX inverted */
189 static uint32_t mandatory_flags(struct vcpu *v, uint32_t pfec)
190 {
191 static uint32_t flags[] = {
192 /* I/F - Usr Wr */
193 /* 0 0 0 0 */ _PAGE_PRESENT,
194 /* 0 0 0 1 */ _PAGE_PRESENT|_PAGE_RW,
195 /* 0 0 1 0 */ _PAGE_PRESENT|_PAGE_USER,
196 /* 0 0 1 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER,
197 /* 0 1 0 0 */ _PAGE_PRESENT,
198 /* 0 1 0 1 */ _PAGE_PRESENT|_PAGE_RW,
199 /* 0 1 1 0 */ _PAGE_PRESENT|_PAGE_USER,
200 /* 0 1 1 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER,
201 /* 1 0 0 0 */ _PAGE_PRESENT|_PAGE_NX_BIT,
202 /* 1 0 0 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_NX_BIT,
203 /* 1 0 1 0 */ _PAGE_PRESENT|_PAGE_USER|_PAGE_NX_BIT,
204 /* 1 0 1 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX_BIT,
205 /* 1 1 0 0 */ _PAGE_PRESENT|_PAGE_NX_BIT,
206 /* 1 1 0 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_NX_BIT,
207 /* 1 1 1 0 */ _PAGE_PRESENT|_PAGE_USER|_PAGE_NX_BIT,
208 /* 1 1 1 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX_BIT,
209 };
211 /* Don't demand not-NX if the CPU wouldn't enforce it. */
212 if ( !guest_supports_nx(v) )
213 pfec &= ~PFEC_insn_fetch;
215 /* Don't demand R/W if the CPU wouldn't enforce it. */
216 if ( is_hvm_vcpu(v) && unlikely(!hvm_wp_enabled(v))
217 && !(pfec & PFEC_user_mode) )
218 pfec &= ~PFEC_write_access;
220 return flags[(pfec & 0x1f) >> 1];
221 }
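A worked example of the table lookup above (not part of the Xen source; the PFEC_* values are written out locally so the snippet stands alone, and they mirror the x86 page-fault error-code bits): a user-mode write fault selects row 3 of flags[], i.e. _PAGE_PRESENT|_PAGE_RW|_PAGE_USER.

#include <stdio.h>

#define PFEC_write_access  0x02   /* bit 1: the fault was a write        */
#define PFEC_user_mode     0x04   /* bit 2: the fault came from user mode */

int main(void)
{
    unsigned int pfec = PFEC_user_mode | PFEC_write_access;   /* 0x6 */
    /* Bit 0 (present) never selects a row, so it is shifted away; the
     * remaining bits index the 16-entry flags[] table above. */
    printf("index = %u\n", (pfec & 0x1f) >> 1);   /* prints "index = 3" */
    return 0;
}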
223 /* Modify a guest pagetable entry to set the Accessed and Dirty bits.
224 * Returns non-zero if it actually writes to guest memory. */
225 static uint32_t set_ad_bits(void *guest_p, void *walk_p, int set_dirty)
226 {
227 guest_intpte_t old, new;
228 int ret = 0;
230 old = *(guest_intpte_t *)walk_p;
231 new = old | _PAGE_ACCESSED | (set_dirty ? _PAGE_DIRTY : 0);
232 if ( old != new )
233 {
234 /* Write the new entry into the walk, and try to write it back
235 * into the guest table as well. If the guest table has changed
236 * under our feet then leave it alone. */
237 *(guest_intpte_t *)walk_p = new;
238 if( cmpxchg(((guest_intpte_t *)guest_p), old, new) == old )
239 ret = 1;
241 /* FIXME -- this code is longer than necessary */
242 if(set_dirty)
243 TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_SET_AD);
244 else
245 TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_SET_A);
246 }
247 return ret;
248 }
250 /* This validation is called with the shadow lock held, after write
251 * permission has been removed, so the check is atomic and no further
252 * inconsistent content can be observed before the lock is released.
253 *
254 * Returns 1 to indicate success and 0 for inconsistency.
255 */
256 static inline uint32_t
257 shadow_check_gwalk(struct vcpu *v, unsigned long va, walk_t *gw)
258 {
259 struct domain *d = v->domain;
260 guest_l1e_t *l1p;
261 guest_l2e_t *l2p;
262 #if GUEST_PAGING_LEVELS >= 4
263 guest_l3e_t *l3p;
264 guest_l4e_t *l4p;
265 #endif
266 int mismatch = 0;
268 ASSERT(shadow_locked_by_me(d));
270 if ( gw->version ==
271 atomic_read(&d->arch.paging.shadow.gtable_dirty_version) )
272 return 1;
274 /* We could cache the guest page mappings from the last
275 * guest table walk. However, this check happens relatively
276 * rarely, so the small cost of remapping the guest pages
277 * here is better than caching the mappings on every
278 * guest table walk.
279 *
280 * Also, when an inconsistency is found, simply return and let
281 * another fault be taken, rather than re-validating the new
282 * path, to keep the logic simple.
283 */
284 perfc_incr(shadow_check_gwalk);
285 #if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
286 #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
287 l4p = (guest_l4e_t *)v->arch.paging.shadow.guest_vtable;
288 mismatch |= (gw->l4e.l4 != l4p[guest_l4_table_offset(va)].l4);
289 l3p = sh_map_domain_page(gw->l3mfn);
290 mismatch |= (gw->l3e.l3 != l3p[guest_l3_table_offset(va)].l3);
291 sh_unmap_domain_page(l3p);
292 #else
293 mismatch |= (gw->l3e.l3 !=
294 v->arch.paging.shadow.gl3e[guest_l3_table_offset(va)].l3);
295 #endif
296 l2p = sh_map_domain_page(gw->l2mfn);
297 mismatch |= (gw->l2e.l2 != l2p[guest_l2_table_offset(va)].l2);
298 sh_unmap_domain_page(l2p);
299 #else
300 l2p = (guest_l2e_t *)v->arch.paging.shadow.guest_vtable;
301 mismatch |= (gw->l2e.l2 != l2p[guest_l2_table_offset(va)].l2);
302 #endif
303 if ( !(guest_supports_superpages(v) &&
304 (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE)) )
305 {
306 l1p = sh_map_domain_page(gw->l1mfn);
307 mismatch |= (gw->l1e.l1 != l1p[guest_l1_table_offset(va)].l1);
308 sh_unmap_domain_page(l1p);
309 }
311 return !mismatch;
312 }
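The version check at the top of this function is easier to see in isolation. Below is a minimal model of the idea, not Xen code, with illustrative names: a single generation counter that every guest-pagetable write bumps; if it is unchanged since the walk was taken, the cached walk cannot be stale and the per-entry comparison can be skipped.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int gtable_dirty_version;    /* bumped on guest PT writes */

struct walk_cache { int version; /* ... cached guest entries ... */ };

static void take_walk(struct walk_cache *w)
{
    w->version = atomic_load(&gtable_dirty_version);
    /* ... read and cache the guest entries here ... */
}

static bool walk_still_valid(const struct walk_cache *w)
{
    /* Unchanged counter => no guest pagetable write since the walk,
     * so the cached entries cannot be stale. */
    return w->version == atomic_load(&gtable_dirty_version);
}

int main(void)
{
    struct walk_cache w;
    take_walk(&w);
    printf("valid before write: %d\n", walk_still_valid(&w));  /* 1 */
    atomic_fetch_add(&gtable_dirty_version, 1);   /* a guest PT write */
    printf("valid after write:  %d\n", walk_still_valid(&w));  /* 0 */
    return 0;
}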
314 /* Remove write access permissions from a gwalk_t in a batch, and
315 * return an OR-ed result indicating whether a TLB flush is needed
316 * and whether the guest pages must be re-walked.
317 *
318 * Syncing a page removes write access to that page, but it may
319 * also give write access to other pages in the path. If we resync any
320 * pages, re-walk from the beginning.
321 */
322 #define GW_RMWR_FLUSHTLB 1
323 #define GW_RMWR_REWALK 2
325 static inline uint32_t
326 gw_remove_write_accesses(struct vcpu *v, unsigned long va, walk_t *gw)
327 {
328 uint32_t rc = 0;
330 #if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
331 #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
332 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
333 if ( mfn_is_out_of_sync(gw->l3mfn) )
334 {
335 sh_resync(v, gw->l3mfn);
336 rc = GW_RMWR_REWALK;
337 }
338 else
339 #endif /* OOS */
340 if ( sh_remove_write_access(v, gw->l3mfn, 3, va) )
341 rc = GW_RMWR_FLUSHTLB;
342 #endif /* GUEST_PAGING_LEVELS >= 4 */
344 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
345 if ( mfn_is_out_of_sync(gw->l2mfn) )
346 {
347 sh_resync(v, gw->l2mfn);
348 rc |= GW_RMWR_REWALK;
349 }
350 else
351 #endif /* OOS */
352 if ( sh_remove_write_access(v, gw->l2mfn, 2, va) )
353 rc |= GW_RMWR_FLUSHTLB;
354 #endif /* GUEST_PAGING_LEVELS >= 3 */
356 if ( !(guest_supports_superpages(v) &&
357 (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE))
358 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
359 && !mfn_is_out_of_sync(gw->l1mfn)
360 #endif /* OOS */
361 && sh_remove_write_access(v, gw->l1mfn, 1, va) )
362 rc |= GW_RMWR_FLUSHTLB;
364 return rc;
365 }
367 /* Walk the guest pagetables, after the manner of a hardware walker.
368 *
369 * Inputs: a vcpu, a virtual address, a walk_t to fill, a
370 * pointer to a pagefault code
371 *
372 * We walk the vcpu's guest pagetables, filling the walk_t with what we
373 * see and adding any Accessed and Dirty bits that are needed in the
374 * guest entries. Using the pagefault code, we check the permissions as
375 * we go. For the purposes of reading pagetables we treat all non-RAM
376 * memory as containing zeroes.
377 *
378 * The walk is done in a lock-free style, with some sanity checks postponed
379 * until after the shadow lock is taken. Those delayed checks make sure that
380 * no inconsistent mapping is translated into the shadow page tables.
381 *
382 * Returns 0 for success, or the set of permission bits that we failed on
383 * if the walk did not complete.
384 * N.B. This is different from the old return code but almost no callers
385 * checked the old return code anyway.
386 */
387 static uint32_t
388 guest_walk_tables(struct vcpu *v, unsigned long va, walk_t *gw, uint32_t pfec)
389 {
390 struct domain *d = v->domain;
391 p2m_type_t p2mt;
392 guest_l1e_t *l1p = NULL;
393 guest_l2e_t *l2p = NULL;
394 #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
395 guest_l3e_t *l3p = NULL;
396 guest_l4e_t *l4p;
397 #endif
398 uint32_t gflags, mflags, rc = 0;
399 int pse;
401 perfc_incr(shadow_guest_walk);
402 memset(gw, 0, sizeof(*gw));
403 gw->va = va;
405 gw->version = atomic_read(&d->arch.paging.shadow.gtable_dirty_version);
406 rmb();
408 /* Mandatory bits that must be set in every entry. We invert NX, to
409 * calculate as if there were an "X" bit that allowed access.
410 * We will accumulate, in rc, the set of flags that are missing. */
411 mflags = mandatory_flags(v, pfec);
413 #if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
414 #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
416 /* Get the l4e from the top level table and check its flags*/
417 gw->l4mfn = pagetable_get_mfn(v->arch.guest_table);
418 l4p = ((guest_l4e_t *)v->arch.paging.shadow.guest_vtable);
419 gw->l4e = l4p[guest_l4_table_offset(va)];
420 gflags = guest_l4e_get_flags(gw->l4e) ^ _PAGE_NX_BIT;
421 rc |= ((gflags & mflags) ^ mflags);
422 if ( rc & _PAGE_PRESENT ) goto out;
424 /* Map the l3 table */
425 gw->l3mfn = gfn_to_mfn(d, guest_l4e_get_gfn(gw->l4e), &p2mt);
426 if ( !p2m_is_ram(p2mt) )
427 {
428 rc |= _PAGE_PRESENT;
429 goto out;
430 }
431 ASSERT(mfn_valid(gw->l3mfn));
433 /* Get the l3e and check its flags*/
434 l3p = sh_map_domain_page(gw->l3mfn);
435 gw->l3e = l3p[guest_l3_table_offset(va)];
436 gflags = guest_l3e_get_flags(gw->l3e) ^ _PAGE_NX_BIT;
437 rc |= ((gflags & mflags) ^ mflags);
438 if ( rc & _PAGE_PRESENT )
439 goto out;
441 #else /* PAE only... */
443 /* Get the l3e from the cache of the top level table and check its flags */
444 gw->l3e = v->arch.paging.shadow.gl3e[guest_l3_table_offset(va)];
445 if ( !(guest_l3e_get_flags(gw->l3e) & _PAGE_PRESENT) )
446 {
447 rc |= _PAGE_PRESENT;
448 goto out;
449 }
451 #endif /* PAE or 64... */
453 /* Map the l2 table */
454 gw->l2mfn = gfn_to_mfn(d, guest_l3e_get_gfn(gw->l3e), &p2mt);
455 if ( !p2m_is_ram(p2mt) )
456 {
457 rc |= _PAGE_PRESENT;
458 goto out;
459 }
460 ASSERT(mfn_valid(gw->l2mfn));
462 /* Get the l2e */
463 l2p = sh_map_domain_page(gw->l2mfn);
464 gw->l2e = l2p[guest_l2_table_offset(va)];
466 #else /* 32-bit only... */
468 /* Get l2e from the top level table */
469 gw->l2mfn = pagetable_get_mfn(v->arch.guest_table);
470 l2p = ((guest_l2e_t *)v->arch.paging.shadow.guest_vtable);
471 gw->l2e = l2p[guest_l2_table_offset(va)];
473 #endif /* All levels... */
475 gflags = guest_l2e_get_flags(gw->l2e) ^ _PAGE_NX_BIT;
476 rc |= ((gflags & mflags) ^ mflags);
477 if ( rc & _PAGE_PRESENT )
478 goto out;
480 pse = (guest_supports_superpages(v) &&
481 (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE));
483 if ( pse )
484 {
485 /* Special case: this guest VA is in a PSE superpage, so there's
486 * no guest l1e. We make one up so that the propagation code
487 * can generate a shadow l1 table. Start with the gfn of the
488 * first 4k-page of the superpage. */
489 gfn_t start = guest_l2e_get_gfn(gw->l2e);
490 /* Grant full access in the l1e, since all the guest entry's
491 * access controls are enforced in the shadow l2e. */
492 int flags = (_PAGE_PRESENT|_PAGE_USER|_PAGE_RW|
493 _PAGE_ACCESSED|_PAGE_DIRTY);
494 /* PSE level 2 entries use bit 12 for PAT; propagate it to bit 7
495 * of the level 1. */
496 if ( (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE_PAT) )
497 flags |= _PAGE_PAT;
498 /* Copy the cache-control bits to the l1 as well, because we
499 * can't represent PAT in the (non-PSE) shadow l2e. :(
500 * This could cause problems if a guest ever maps an area of
501 * memory with superpages using more than one caching mode. */
502 flags |= guest_l2e_get_flags(gw->l2e) & (_PAGE_PWT|_PAGE_PCD);
503 /* Increment the pfn by the right number of 4k pages.
504 * The ~0x1 is to mask out the PAT bit mentioned above. */
505 start = _gfn((gfn_x(start) & ~0x1) + guest_l1_table_offset(va));
506 gw->l1e = guest_l1e_from_gfn(start, flags);
507 gw->l1mfn = _mfn(INVALID_MFN);
508 }
509 else
510 {
511 /* Not a superpage: carry on and find the l1e. */
512 gw->l1mfn = gfn_to_mfn(d, guest_l2e_get_gfn(gw->l2e), &p2mt);
513 if ( !p2m_is_ram(p2mt) )
514 {
515 rc |= _PAGE_PRESENT;
516 goto out;
517 }
518 ASSERT(mfn_valid(gw->l1mfn));
519 l1p = sh_map_domain_page(gw->l1mfn);
520 gw->l1e = l1p[guest_l1_table_offset(va)];
521 gflags = guest_l1e_get_flags(gw->l1e) ^ _PAGE_NX_BIT;
522 rc |= ((gflags & mflags) ^ mflags);
523 }
525 /* Go back and set accessed and dirty bits only if the walk was a
526 * success. Although the PRMs say higher-level _PAGE_ACCESSED bits
527 * get set whenever a lower-level PT is used, at least some hardware
528 * walkers behave this way. */
529 if ( rc == 0 )
530 {
531 #if GUEST_PAGING_LEVELS == 4 /* 64-bit only... */
532 if ( set_ad_bits(l4p + guest_l4_table_offset(va), &gw->l4e, 0) )
533 paging_mark_dirty(d, mfn_x(gw->l4mfn));
534 if ( set_ad_bits(l3p + guest_l3_table_offset(va), &gw->l3e, 0) )
535 paging_mark_dirty(d, mfn_x(gw->l3mfn));
536 #endif
537 if ( set_ad_bits(l2p + guest_l2_table_offset(va), &gw->l2e,
538 (pse && (pfec & PFEC_write_access))) )
539 paging_mark_dirty(d, mfn_x(gw->l2mfn));
540 if ( !pse )
541 {
542 if ( set_ad_bits(l1p + guest_l1_table_offset(va), &gw->l1e,
543 (pfec & PFEC_write_access)) )
544 paging_mark_dirty(d, mfn_x(gw->l1mfn));
545 }
546 }
548 out:
549 #if GUEST_PAGING_LEVELS == 4
550 if ( l3p ) sh_unmap_domain_page(l3p);
551 #endif
552 #if GUEST_PAGING_LEVELS >= 3
553 if ( l2p ) sh_unmap_domain_page(l2p);
554 #endif
555 if ( l1p ) sh_unmap_domain_page(l1p);
557 return rc;
558 }
560 /* Given a walk_t, translate the gw->va into the guest's notion of the
561 * corresponding frame number. */
562 static inline gfn_t
563 guest_walk_to_gfn(walk_t *gw)
564 {
565 if ( !(guest_l1e_get_flags(gw->l1e) & _PAGE_PRESENT) )
566 return _gfn(INVALID_GFN);
567 return guest_l1e_get_gfn(gw->l1e);
568 }
570 /* Given a walk_t, translate the gw->va into the guest's notion of the
571 * corresponding physical address. */
572 static inline paddr_t
573 guest_walk_to_gpa(walk_t *gw)
574 {
575 if ( !(guest_l1e_get_flags(gw->l1e) & _PAGE_PRESENT) )
576 return 0;
577 return guest_l1e_get_paddr(gw->l1e) + (gw->va & ~PAGE_MASK);
578 }
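A tiny arithmetic check of the composition above, with hypothetical values (not from the source): an l1e for gfn 0x1234 plus a page offset of 0x567 yields gpa 0x1234567, assuming 4 KiB pages.

#include <assert.h>
#include <stdint.h>

int main(void)
{
    uint64_t gfn    = 0x1234;               /* frame number from the l1e      */
    uint64_t offset = 0xdeadb567 & 0xfff;   /* va & ~PAGE_MASK, i.e. 0x567    */
    assert((gfn << 12) + offset == 0x1234567);
    return 0;
}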
580 #if 0 /* Keep for debugging */
581 /* Pretty-print the contents of a guest-walk */
582 static inline void print_gw(walk_t *gw)
583 {
584 SHADOW_PRINTK("GUEST WALK TO %#lx:\n", gw->va);
585 #if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
586 #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
587 SHADOW_PRINTK(" l4mfn=%" PRI_mfn "\n", mfn_x(gw->l4mfn));
588 SHADOW_PRINTK(" l4e=%" SH_PRI_gpte "\n", gw->l4e.l4);
589 SHADOW_PRINTK(" l3mfn=%" PRI_mfn "\n", mfn_x(gw->l3mfn));
590 #endif /* PAE or 64... */
591 SHADOW_PRINTK(" l3e=%" SH_PRI_gpte "\n", gw->l3e.l3);
592 #endif /* All levels... */
593 SHADOW_PRINTK(" l2mfn=%" PRI_mfn "\n", mfn_x(gw->l2mfn));
594 SHADOW_PRINTK(" l2e=%" SH_PRI_gpte "\n", gw->l2e.l2);
595 SHADOW_PRINTK(" l1mfn=%" PRI_mfn "\n", mfn_x(gw->l1mfn));
596 SHADOW_PRINTK(" l1e=%" SH_PRI_gpte "\n", gw->l1e.l1);
597 }
598 #endif /* 0 */
600 #if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES
601 /* Lightweight audit: pass all the shadows associated with this guest walk
602 * through the audit mechanisms */
603 static void sh_audit_gw(struct vcpu *v, walk_t *gw)
604 {
605 mfn_t smfn;
607 if ( !(SHADOW_AUDIT_ENABLE) )
608 return;
610 #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
611 if ( mfn_valid(gw->l4mfn)
612 && mfn_valid((smfn = get_shadow_status(v, gw->l4mfn,
613 SH_type_l4_shadow))) )
614 (void) sh_audit_l4_table(v, smfn, _mfn(INVALID_MFN));
615 if ( mfn_valid(gw->l3mfn)
616 && mfn_valid((smfn = get_shadow_status(v, gw->l3mfn,
617 SH_type_l3_shadow))) )
618 (void) sh_audit_l3_table(v, smfn, _mfn(INVALID_MFN));
619 #endif /* PAE or 64... */
620 if ( mfn_valid(gw->l2mfn) )
621 {
622 if ( mfn_valid((smfn = get_shadow_status(v, gw->l2mfn,
623 SH_type_l2_shadow))) )
624 (void) sh_audit_l2_table(v, smfn, _mfn(INVALID_MFN));
625 #if GUEST_PAGING_LEVELS == 3
626 if ( mfn_valid((smfn = get_shadow_status(v, gw->l2mfn,
627 SH_type_l2h_shadow))) )
628 (void) sh_audit_l2_table(v, smfn, _mfn(INVALID_MFN));
629 #endif
630 }
631 if ( mfn_valid(gw->l1mfn)
632 && mfn_valid((smfn = get_shadow_status(v, gw->l1mfn,
633 SH_type_l1_shadow))) )
634 (void) sh_audit_l1_table(v, smfn, _mfn(INVALID_MFN));
635 else if ( (guest_l2e_get_flags(gw->l2e) & _PAGE_PRESENT)
636 && (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE)
637 && mfn_valid(
638 (smfn = get_fl1_shadow_status(v, guest_l2e_get_gfn(gw->l2e)))) )
639 (void) sh_audit_fl1_table(v, smfn, _mfn(INVALID_MFN));
640 }
642 #else
643 #define sh_audit_gw(_v, _gw) do {} while(0)
644 #endif /* audit code */
647 #if (CONFIG_PAGING_LEVELS == GUEST_PAGING_LEVELS)
648 void *
649 sh_guest_map_l1e(struct vcpu *v, unsigned long addr,
650 unsigned long *gl1mfn)
651 {
652 void *pl1e = NULL;
653 walk_t gw;
655 ASSERT(shadow_mode_translate(v->domain));
657 // XXX -- this is expensive, but it's easy to cobble together...
658 // FIXME!
660 if ( guest_walk_tables(v, addr, &gw, PFEC_page_present) == 0
661 && mfn_valid(gw.l1mfn) )
662 {
663 if ( gl1mfn )
664 *gl1mfn = mfn_x(gw.l1mfn);
665 pl1e = map_domain_page(mfn_x(gw.l1mfn)) +
666 (guest_l1_table_offset(addr) * sizeof(guest_l1e_t));
667 }
669 return pl1e;
670 }
672 void
673 sh_guest_get_eff_l1e(struct vcpu *v, unsigned long addr, void *eff_l1e)
674 {
675 walk_t gw;
677 ASSERT(shadow_mode_translate(v->domain));
679 // XXX -- this is expensive, but it's easy to cobble together...
680 // FIXME!
682 (void) guest_walk_tables(v, addr, &gw, PFEC_page_present);
683 *(guest_l1e_t *)eff_l1e = gw.l1e;
684 }
685 #endif /* CONFIG == GUEST (== SHADOW) */
687 /**************************************************************************/
688 /* Functions to compute the correct index into a shadow page, given an
689 * index into the guest page (as returned by guest_index(), below).
690 * This is trivial when the shadow and guest use the same sized PTEs, but
691 * gets more interesting when those sizes are mismatched (e.g. 32-bit guest,
692 * PAE- or 64-bit shadows).
693 *
694 * These functions also increment the shadow mfn, when necessary. When PTE
695 * sizes are mismatched, it takes 2 shadow L1 pages for a single guest L1
696 * page. In this case, we allocate 2 contiguous pages for the shadow L1, and
697 * use simple pointer arithmetic on a pointer to the guest L1e to figure out
698 * which shadow page we really want. Similarly, when PTE sizes are
699 * mismatched, we shadow a guest L2 page with 4 shadow L2 pages. (The easiest
700 * way to see this is: a 32-bit guest L2 page maps 4GB of virtual address
701 * space, while a PAE- or 64-bit shadow L2 page maps 1GB of virtual address
702 * space.)
703 *
704 * For PAE guests, for every 32-bytes of guest L3 page table, we use 64-bytes
705 * of shadow (to store both the shadow, and the info that would normally be
706 * stored in page_info fields). This arrangement allows the shadow and the
707 * "page_info" fields to always be stored in the same page (in fact, in
708 * the same cache line), avoiding an extra call to map_domain_page().
709 */
711 static inline u32
712 guest_index(void *ptr)
713 {
714 return (u32)((unsigned long)ptr & ~PAGE_MASK) / sizeof(guest_l1e_t);
715 }
717 static u32
718 shadow_l1_index(mfn_t *smfn, u32 guest_index)
719 {
720 #if (GUEST_PAGING_LEVELS == 2)
721 *smfn = _mfn(mfn_x(*smfn) +
722 (guest_index / SHADOW_L1_PAGETABLE_ENTRIES));
723 return (guest_index % SHADOW_L1_PAGETABLE_ENTRIES);
724 #else
725 return guest_index;
726 #endif
727 }
729 static u32
730 shadow_l2_index(mfn_t *smfn, u32 guest_index)
731 {
732 #if (GUEST_PAGING_LEVELS == 2)
733 // Because we use 2 shadow l2 entries for each guest entry, the number of
734 // guest entries per shadow page is SHADOW_L2_PAGETABLE_ENTRIES/2
735 //
736 *smfn = _mfn(mfn_x(*smfn) +
737 (guest_index / (SHADOW_L2_PAGETABLE_ENTRIES / 2)));
739 // We multiply by two to get the index of the first of the two entries
740 // used to shadow the specified guest entry.
741 return (guest_index % (SHADOW_L2_PAGETABLE_ENTRIES / 2)) * 2;
742 #else
743 return guest_index;
744 #endif
745 }
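A worked example of the 2-on-3 mapping above (standalone sketch, not Xen code; the shadow page size is written out as 512 entries, the PAE/64-bit value): a 32-bit guest l2 has 1024 entries, and entry 513 lands two shadow pages in, at slot 2, the first of its pair.

#include <stdio.h>

#define SHADOW_L2_PAGETABLE_ENTRIES 512   /* PAE/64-bit shadow l2 size */

int main(void)
{
    unsigned int guest_index = 513;   /* one of 1024 32-bit guest l2 slots */
    unsigned int page = guest_index / (SHADOW_L2_PAGETABLE_ENTRIES / 2);
    unsigned int slot = (guest_index % (SHADOW_L2_PAGETABLE_ENTRIES / 2)) * 2;
    printf("shadow page +%u, slot %u\n", page, slot);   /* "+2, slot 2" */
    return 0;
}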
747 #if GUEST_PAGING_LEVELS >= 4
749 static u32
750 shadow_l3_index(mfn_t *smfn, u32 guest_index)
751 {
752 return guest_index;
753 }
755 static u32
756 shadow_l4_index(mfn_t *smfn, u32 guest_index)
757 {
758 return guest_index;
759 }
761 #endif // GUEST_PAGING_LEVELS >= 4
764 /**************************************************************************/
765 /* Function which computes shadow entries from their corresponding guest
766 * entries. This is the "heart" of the shadow code. It operates using
767 * level-1 shadow types, but handles all levels of entry.
768 * Don't call it directly, but use the four wrappers below.
769 */
771 static always_inline void
772 _sh_propagate(struct vcpu *v,
773 guest_intpte_t guest_intpte,
774 mfn_t target_mfn,
775 void *shadow_entry_ptr,
776 int level,
777 fetch_type_t ft,
778 p2m_type_t p2mt)
779 {
780 guest_l1e_t guest_entry = { guest_intpte };
781 shadow_l1e_t *sp = shadow_entry_ptr;
782 struct domain *d = v->domain;
783 gfn_t target_gfn = guest_l1e_get_gfn(guest_entry);
784 u32 pass_thru_flags;
785 u32 gflags, sflags;
787 /* We don't shadow PAE l3s */
788 ASSERT(GUEST_PAGING_LEVELS > 3 || level != 3);
790 /* Check there's something for the shadows to map to */
791 if ( !p2m_is_valid(p2mt) )
792 {
793 *sp = shadow_l1e_empty();
794 goto done;
795 }
797 gflags = guest_l1e_get_flags(guest_entry);
799 if ( unlikely(!(gflags & _PAGE_PRESENT)) )
800 {
801 /* If a guest l1 entry is not present, shadow with the magic
802 * guest-not-present entry. */
803 if ( level == 1 )
804 *sp = sh_l1e_gnp();
805 else
806 *sp = shadow_l1e_empty();
807 goto done;
808 }
810 if ( level == 1 && p2mt == p2m_mmio_dm )
811 {
812 /* Guest l1e maps emulated MMIO space */
813 *sp = sh_l1e_mmio(target_gfn, gflags);
814 if ( !d->arch.paging.shadow.has_fast_mmio_entries )
815 d->arch.paging.shadow.has_fast_mmio_entries = 1;
816 goto done;
817 }
819 // Must have a valid target_mfn unless this is a prefetch or an l1
820 // pointing at MMIO space. In the case of a prefetch, an invalid
821 // mfn means that we can not usefully shadow anything, and so we
822 // return early.
823 //
824 if ( !mfn_valid(target_mfn)
825 && !(level == 1 && (!shadow_mode_refcounts(d)
826 || p2mt == p2m_mmio_direct)) )
827 {
828 ASSERT((ft == ft_prefetch));
829 *sp = shadow_l1e_empty();
830 goto done;
831 }
833 // Propagate bits from the guest to the shadow.
834 // Some of these may be overwritten, below.
835 // Since we know the guest's PRESENT bit is set, we also set the shadow's
836 // SHADOW_PRESENT bit.
837 //
838 pass_thru_flags = (_PAGE_ACCESSED | _PAGE_USER |
839 _PAGE_RW | _PAGE_PRESENT);
840 if ( guest_supports_nx(v) )
841 pass_thru_flags |= _PAGE_NX_BIT;
842 if ( !shadow_mode_refcounts(d) && !mfn_valid(target_mfn) )
843 pass_thru_flags |= _PAGE_PAT | _PAGE_PCD | _PAGE_PWT;
844 sflags = gflags & pass_thru_flags;
846 /*
847 * For HVM domains with direct access to MMIO areas, set the correct
848 * caching attributes in the shadows to match what was asked for.
849 */
850 if ( (level == 1) && is_hvm_domain(d) && has_arch_pdevs(d) &&
851 !is_xen_heap_mfn(mfn_x(target_mfn)) )
852 {
853 unsigned int type;
854 if ( hvm_get_mem_pinned_cacheattr(d, gfn_x(target_gfn), &type) )
855 sflags |= pat_type_2_pte_flags(type);
856 else if ( d->arch.hvm_domain.is_in_uc_mode )
857 sflags |= pat_type_2_pte_flags(PAT_TYPE_UNCACHABLE);
858 else
859 sflags |= get_pat_flags(v,
860 gflags,
861 gfn_to_paddr(target_gfn),
862 ((paddr_t)mfn_x(target_mfn)) << PAGE_SHIFT);
863 }
865 // Set the A&D bits for higher level shadows.
866 // Higher level entries do not, strictly speaking, have dirty bits, but
867 // since we use shadow linear tables, each of these entries may, at some
868 // point in time, also serve as a shadow L1 entry.
869 // By setting both the A&D bits in each of these, we eliminate the burden
870 // on the hardware to update these bits on initial accesses.
871 //
872 if ( (level > 1) && !((SHADOW_PAGING_LEVELS == 3) && (level == 3)) )
873 sflags |= _PAGE_ACCESSED | _PAGE_DIRTY;
875 // If the A or D bit has not yet been set in the guest, then we must
876 // prevent the corresponding kind of access.
877 //
878 if ( unlikely(!(gflags & _PAGE_ACCESSED)) )
879 sflags &= ~_PAGE_PRESENT;
881 /* D bits exist in L1es and PSE L2es */
882 if ( unlikely(((level == 1) ||
883 ((level == 2) &&
884 (gflags & _PAGE_PSE) &&
885 guest_supports_superpages(v)))
886 && !(gflags & _PAGE_DIRTY)) )
887 sflags &= ~_PAGE_RW;
889 // shadow_mode_log_dirty support
890 //
891 // Only allow the guest write access to a page a) on a demand fault,
892 // or b) if the page is already marked as dirty.
893 //
894 // (We handle log-dirty entirely inside the shadow code, without using the
895 // p2m_ram_logdirty p2m type: only HAP uses that.)
896 if ( unlikely((level == 1) && shadow_mode_log_dirty(d)) )
897 {
898 if ( mfn_valid(target_mfn) ) {
899 if ( ft & FETCH_TYPE_WRITE )
900 paging_mark_dirty(d, mfn_x(target_mfn));
901 else if ( !sh_mfn_is_dirty(d, target_mfn) )
902 sflags &= ~_PAGE_RW;
903 }
904 }
906 if ( unlikely((level == 1) && d->dirty_vram
907 && d->dirty_vram->last_dirty == -1
908 && gfn_x(target_gfn) >= d->dirty_vram->begin_pfn
909 && gfn_x(target_gfn) < d->dirty_vram->end_pfn) )
910 {
911 if ( ft & FETCH_TYPE_WRITE )
912 d->dirty_vram->last_dirty = NOW();
913 else
914 sflags &= ~_PAGE_RW;
915 }
917 /* Read-only memory */
918 if ( p2mt == p2m_ram_ro )
919 sflags &= ~_PAGE_RW;
921 // protect guest page tables
922 //
923 if ( unlikely((level == 1)
924 && sh_mfn_is_a_page_table(target_mfn)
925 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC )
926 /* Unless the page is out of sync and the guest is
927 writing to it. */
928 && !(mfn_oos_may_write(target_mfn)
929 && (ft == ft_demand_write))
930 #endif /* OOS */
931 ) )
932 {
933 if ( shadow_mode_trap_reads(d) )
934 {
935 // if we are trapping both reads & writes, then mark this page
936 // as not present...
937 //
938 sflags &= ~_PAGE_PRESENT;
939 }
940 else
941 {
942 // otherwise, just prevent any writes...
943 //
944 sflags &= ~_PAGE_RW;
945 }
946 }
948 // PV guests in 64-bit mode use two different page tables for user vs
949 // supervisor permissions, making the guest's _PAGE_USER bit irrelevant.
950 // It is always shadowed as present...
951 if ( (GUEST_PAGING_LEVELS == 4) && !is_pv_32on64_domain(d)
952 && !is_hvm_domain(d) )
953 {
954 sflags |= _PAGE_USER;
955 }
957 *sp = shadow_l1e_from_mfn(target_mfn, sflags);
959 done:
960 SHADOW_DEBUG(PROPAGATE,
961 "%s level %u guest %" SH_PRI_gpte " shadow %" SH_PRI_pte "\n",
962 fetch_type_names[ft], level, guest_entry.l1, sp->l1);
963 }
966 /* These four wrappers give us a little bit of type-safety back around
967 * the use of void-* pointers and intpte types in _sh_propagate(), and
968 * allow the compiler to optimize out some level checks. */
970 #if GUEST_PAGING_LEVELS >= 4
971 static void
972 l4e_propagate_from_guest(struct vcpu *v,
973 guest_l4e_t gl4e,
974 mfn_t sl3mfn,
975 shadow_l4e_t *sl4e,
976 fetch_type_t ft)
977 {
978 _sh_propagate(v, gl4e.l4, sl3mfn, sl4e, 4, ft, p2m_ram_rw);
979 }
981 static void
982 l3e_propagate_from_guest(struct vcpu *v,
983 guest_l3e_t gl3e,
984 mfn_t sl2mfn,
985 shadow_l3e_t *sl3e,
986 fetch_type_t ft)
987 {
988 _sh_propagate(v, gl3e.l3, sl2mfn, sl3e, 3, ft, p2m_ram_rw);
989 }
990 #endif // GUEST_PAGING_LEVELS >= 4
992 static void
993 l2e_propagate_from_guest(struct vcpu *v,
994 guest_l2e_t gl2e,
995 mfn_t sl1mfn,
996 shadow_l2e_t *sl2e,
997 fetch_type_t ft)
998 {
999 _sh_propagate(v, gl2e.l2, sl1mfn, sl2e, 2, ft, p2m_ram_rw);
1002 static void
1003 l1e_propagate_from_guest(struct vcpu *v,
1004 guest_l1e_t gl1e,
1005 mfn_t gmfn,
1006 shadow_l1e_t *sl1e,
1007 fetch_type_t ft,
1008 p2m_type_t p2mt)
1010 _sh_propagate(v, gl1e.l1, gmfn, sl1e, 1, ft, p2mt);
1014 /**************************************************************************/
1015 /* These functions update shadow entries (and do bookkeeping on the shadow
1016 * tables they are in). It is intended that they are the only
1017 * functions which ever write (non-zero) data onto a shadow page.
1018 */
1020 static inline void safe_write_entry(void *dst, void *src)
1021 /* Copy one PTE safely when processors might be running on the
1022 * destination pagetable. This does *not* give safety against
1023 * concurrent writes (that's what the shadow lock is for), just
1024 * stops the hardware picking up partially written entries. */
1026 volatile unsigned long *d = dst;
1027 unsigned long *s = src;
1028 ASSERT(!((unsigned long) d & (sizeof (shadow_l1e_t) - 1)));
1029 #if CONFIG_PAGING_LEVELS == 3
1030 /* In PAE mode, pagetable entries are larger
1031 * than machine words, so won't get written atomically. We need to make
1032 * sure any other cpu running on these shadows doesn't see a
1033 * half-written entry. Do this by marking the entry not-present first,
1034 * then writing the high word before the low word. */
1035 BUILD_BUG_ON(sizeof (shadow_l1e_t) != 2 * sizeof (unsigned long));
1036 d[0] = 0;
1037 d[1] = s[1];
1038 d[0] = s[0];
1039 #else
1040 /* In 64-bit, sizeof(pte) == sizeof(ulong) == 1 word,
1041 * which will be an atomic write, since the entry is aligned. */
1042 BUILD_BUG_ON(sizeof (shadow_l1e_t) != sizeof (unsigned long));
1043 *d = *s;
1044 #endif
1045 }
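The ordering trick is easier to see on a reduced model. The sketch below is not Xen code: it pretends a PAE entry is two 32-bit words with the present bit in the low word, and shows why clearing the low word first means a concurrent reader sees the old entry, a not-present entry, or the new entry, never a mix of halves. Write-write exclusion is still the shadow lock's job, as the comment above says.

#include <stdint.h>
#include <stdio.h>

struct fake_pae_pte { uint32_t lo, hi; };   /* lo carries the present bit */

static void fake_safe_write(volatile struct fake_pae_pte *d,
                            const struct fake_pae_pte *s)
{
    d->lo = 0;        /* 1: entry becomes not-present          */
    d->hi = s->hi;    /* 2: install the new high word           */
    d->lo = s->lo;    /* 3: new low word re-enables the entry   */
}

int main(void)
{
    volatile struct fake_pae_pte slot = { 0, 0 };
    struct fake_pae_pte new_pte = { 0x00000067, 0x00001234 };
    fake_safe_write(&slot, &new_pte);
    printf("%08x:%08x\n", (unsigned)slot.hi, (unsigned)slot.lo);
    return 0;
}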
1048 static inline void
1049 shadow_write_entries(void *d, void *s, int entries, mfn_t mfn)
1050 /* This function does the actual writes to shadow pages.
1051 * It must not be called directly, since it doesn't do the bookkeeping
1052 * that shadow_set_l*e() functions do. */
1054 shadow_l1e_t *dst = d;
1055 shadow_l1e_t *src = s;
1056 void *map = NULL;
1057 int i;
1059 /* Because we mirror access rights at all levels in the shadow, an
1060 * l2 (or higher) entry with the RW bit cleared will leave us with
1061 * no write access through the linear map.
1062 * We detect that by writing to the shadow with copy_to_user() and
1063 * using map_domain_page() to get a writeable mapping if we need to. */
1064 if ( __copy_to_user(d, d, sizeof (unsigned long)) != 0 )
1066 perfc_incr(shadow_linear_map_failed);
1067 map = sh_map_domain_page(mfn);
1068 ASSERT(map != NULL);
1069 dst = map + ((unsigned long)dst & (PAGE_SIZE - 1));
1073 for ( i = 0; i < entries; i++ )
1074 safe_write_entry(dst++, src++);
1076 if ( map != NULL ) sh_unmap_domain_page(map);
1079 static inline int
1080 perms_strictly_increased(u32 old_flags, u32 new_flags)
1081 /* Given the flags of two entries, are the new flags a strict
1082 * increase in rights over the old ones? */
1084 u32 of = old_flags & (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX);
1085 u32 nf = new_flags & (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX);
1086 /* Flip the NX bit, since it's the only one that decreases rights;
1087 * we calculate as if it were an "X" bit. */
1088 of ^= _PAGE_NX_BIT;
1089 nf ^= _PAGE_NX_BIT;
1090 /* If the changed bits are all set in the new flags, then rights strictly
1091 * increased between old and new. */
1092 return ((of | (of ^ nf)) == nf);
1093 }
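Note that of | (of ^ nf) equals of | nf, so the expression reads simply as "every right present in the old flags is also present in the new flags". A standalone check of that reading (ignoring the NX flip the real function applies first, and using hypothetical flag values):

#include <assert.h>

#define P 0x1   /* stands in for _PAGE_PRESENT */
#define W 0x2   /* stands in for _PAGE_RW      */
#define U 0x4   /* stands in for _PAGE_USER    */

static int rights_did_not_shrink(unsigned int of, unsigned int nf)
{
    return (of | (of ^ nf)) == nf;   /* true iff of is a subset of nf */
}

int main(void)
{
    assert(rights_did_not_shrink(P, P | W) == 1);      /* rights grew       */
    assert(rights_did_not_shrink(P | W, P | U) == 0);  /* RW was taken away */
    return 0;
}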
1095 static int inline
1096 shadow_get_page_from_l1e(shadow_l1e_t sl1e, struct domain *d)
1098 int res;
1099 mfn_t mfn;
1100 struct domain *owner;
1102 ASSERT(!sh_l1e_is_magic(sl1e));
1104 if ( !shadow_mode_refcounts(d) )
1105 return 1;
1107 res = get_page_from_l1e(sl1e, d);
1109 // If a privileged domain is attempting to install a map of a page it does
1110 // not own, we let it succeed anyway.
1111 //
1112 if ( unlikely(!res) &&
1113 !shadow_mode_translate(d) &&
1114 mfn_valid(mfn = shadow_l1e_get_mfn(sl1e)) &&
1115 (owner = page_get_owner(mfn_to_page(mfn))) &&
1116 (d != owner) &&
1117 IS_PRIV_FOR(d, owner))
1119 res = get_page_from_l1e(sl1e, owner);
1120 SHADOW_PRINTK("privileged domain %d installs map of mfn %05lx "
1121 "which is owned by domain %d: %s\n",
1122 d->domain_id, mfn_x(mfn), owner->domain_id,
1123 res ? "success" : "failed");
1126 if ( unlikely(!res) )
1128 perfc_incr(shadow_get_page_fail);
1129 SHADOW_PRINTK("failed: l1e=" SH_PRI_pte "\n");
1132 return res;
1135 static void inline
1136 shadow_put_page_from_l1e(shadow_l1e_t sl1e, struct domain *d)
1138 if ( !shadow_mode_refcounts(d) )
1139 return;
1141 put_page_from_l1e(sl1e, d);
1144 #if GUEST_PAGING_LEVELS >= 4
1145 static int shadow_set_l4e(struct vcpu *v,
1146 shadow_l4e_t *sl4e,
1147 shadow_l4e_t new_sl4e,
1148 mfn_t sl4mfn)
1150 int flags = 0, ok;
1151 shadow_l4e_t old_sl4e;
1152 paddr_t paddr;
1153 ASSERT(sl4e != NULL);
1154 old_sl4e = *sl4e;
1156 if ( old_sl4e.l4 == new_sl4e.l4 ) return 0; /* Nothing to do */
1158 paddr = ((((paddr_t)mfn_x(sl4mfn)) << PAGE_SHIFT)
1159 | (((unsigned long)sl4e) & ~PAGE_MASK));
1161 if ( shadow_l4e_get_flags(new_sl4e) & _PAGE_PRESENT )
1163 /* About to install a new reference */
1164 mfn_t sl3mfn = shadow_l4e_get_mfn(new_sl4e);
1165 ok = sh_get_ref(v, sl3mfn, paddr);
1166 /* Are we pinning l3 shadows to handle weird linux behaviour? */
1167 if ( sh_type_is_pinnable(v, SH_type_l3_64_shadow) )
1168 ok |= sh_pin(v, sl3mfn);
1169 if ( !ok )
1171 domain_crash(v->domain);
1172 return SHADOW_SET_ERROR;
1174 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC )
1175 shadow_resync_all(v, 0);
1176 #endif
1179 /* Write the new entry */
1180 shadow_write_entries(sl4e, &new_sl4e, 1, sl4mfn);
1181 flags |= SHADOW_SET_CHANGED;
1183 if ( shadow_l4e_get_flags(old_sl4e) & _PAGE_PRESENT )
1185 /* We lost a reference to an old mfn. */
1186 mfn_t osl3mfn = shadow_l4e_get_mfn(old_sl4e);
1187 if ( (mfn_x(osl3mfn) != mfn_x(shadow_l4e_get_mfn(new_sl4e)))
1188 || !perms_strictly_increased(shadow_l4e_get_flags(old_sl4e),
1189 shadow_l4e_get_flags(new_sl4e)) )
1191 flags |= SHADOW_SET_FLUSH;
1193 sh_put_ref(v, osl3mfn, paddr);
1195 return flags;
1198 static int shadow_set_l3e(struct vcpu *v,
1199 shadow_l3e_t *sl3e,
1200 shadow_l3e_t new_sl3e,
1201 mfn_t sl3mfn)
1203 int flags = 0;
1204 shadow_l3e_t old_sl3e;
1205 paddr_t paddr;
1206 ASSERT(sl3e != NULL);
1207 old_sl3e = *sl3e;
1209 if ( old_sl3e.l3 == new_sl3e.l3 ) return 0; /* Nothing to do */
1211 paddr = ((((paddr_t)mfn_x(sl3mfn)) << PAGE_SHIFT)
1212 | (((unsigned long)sl3e) & ~PAGE_MASK));
1214 if ( shadow_l3e_get_flags(new_sl3e) & _PAGE_PRESENT )
1216 /* About to install a new reference */
1217 if ( !sh_get_ref(v, shadow_l3e_get_mfn(new_sl3e), paddr) )
1219 domain_crash(v->domain);
1220 return SHADOW_SET_ERROR;
1222 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC )
1223 shadow_resync_all(v, 0);
1224 #endif
1227 /* Write the new entry */
1228 shadow_write_entries(sl3e, &new_sl3e, 1, sl3mfn);
1229 flags |= SHADOW_SET_CHANGED;
1231 if ( shadow_l3e_get_flags(old_sl3e) & _PAGE_PRESENT )
1233 /* We lost a reference to an old mfn. */
1234 mfn_t osl2mfn = shadow_l3e_get_mfn(old_sl3e);
1235 if ( (mfn_x(osl2mfn) != mfn_x(shadow_l3e_get_mfn(new_sl3e))) ||
1236 !perms_strictly_increased(shadow_l3e_get_flags(old_sl3e),
1237 shadow_l3e_get_flags(new_sl3e)) )
1239 flags |= SHADOW_SET_FLUSH;
1241 sh_put_ref(v, osl2mfn, paddr);
1243 return flags;
1245 #endif /* GUEST_PAGING_LEVELS >= 4 */
1247 static int shadow_set_l2e(struct vcpu *v,
1248 shadow_l2e_t *sl2e,
1249 shadow_l2e_t new_sl2e,
1250 mfn_t sl2mfn)
1252 int flags = 0;
1253 shadow_l2e_t old_sl2e;
1254 paddr_t paddr;
1256 #if GUEST_PAGING_LEVELS == 2
1257 /* In 2-on-3 we work with pairs of l2es pointing at two-page
1258 * shadows. Reference counting and up-pointers track from the first
1259 * page of the shadow to the first l2e, so make sure that we're
1260 * working with those:
1261 * Align the pointer down so it's pointing at the first of the pair */
1262 sl2e = (shadow_l2e_t *)((unsigned long)sl2e & ~(sizeof(shadow_l2e_t)));
1263 /* Align the mfn of the shadow entry too */
1264 new_sl2e.l2 &= ~(1<<PAGE_SHIFT);
1265 #endif
1267 ASSERT(sl2e != NULL);
1268 old_sl2e = *sl2e;
1270 if ( old_sl2e.l2 == new_sl2e.l2 ) return 0; /* Nothing to do */
1272 paddr = ((((paddr_t)mfn_x(sl2mfn)) << PAGE_SHIFT)
1273 | (((unsigned long)sl2e) & ~PAGE_MASK));
1275 if ( shadow_l2e_get_flags(new_sl2e) & _PAGE_PRESENT )
1277 mfn_t sl1mfn = shadow_l2e_get_mfn(new_sl2e);
1279 /* About to install a new reference */
1280 if ( !sh_get_ref(v, sl1mfn, paddr) )
1282 domain_crash(v->domain);
1283 return SHADOW_SET_ERROR;
1285 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
1287 struct shadow_page_info *sp = mfn_to_shadow_page(sl1mfn);
1288 mfn_t gl1mfn = _mfn(sp->backpointer);
1290 /* If the shadow is a fl1 then the backpointer contains
1291 the GFN instead of the GMFN, and it's definitely not
1292 OOS. */
1293 if ( (sp->type != SH_type_fl1_shadow) && mfn_valid(gl1mfn)
1294 && mfn_is_out_of_sync(gl1mfn) )
1295 sh_resync(v, gl1mfn);
1297 #endif
1300 /* Write the new entry */
1301 #if GUEST_PAGING_LEVELS == 2
1303 shadow_l2e_t pair[2] = { new_sl2e, new_sl2e };
1304 /* The l1 shadow is two pages long and needs to be pointed to by
1305 * two adjacent sl2es. The pair have the same flags, but point
1306 * at odd and even MFNs */
1307 ASSERT(!(pair[0].l2 & (1<<PAGE_SHIFT)));
1308 pair[1].l2 |= (1<<PAGE_SHIFT);
1309 shadow_write_entries(sl2e, &pair, 2, sl2mfn);
1311 #else /* normal case */
1312 shadow_write_entries(sl2e, &new_sl2e, 1, sl2mfn);
1313 #endif
1314 flags |= SHADOW_SET_CHANGED;
1316 if ( shadow_l2e_get_flags(old_sl2e) & _PAGE_PRESENT )
1318 /* We lost a reference to an old mfn. */
1319 mfn_t osl1mfn = shadow_l2e_get_mfn(old_sl2e);
1320 if ( (mfn_x(osl1mfn) != mfn_x(shadow_l2e_get_mfn(new_sl2e))) ||
1321 !perms_strictly_increased(shadow_l2e_get_flags(old_sl2e),
1322 shadow_l2e_get_flags(new_sl2e)) )
1324 flags |= SHADOW_SET_FLUSH;
1326 sh_put_ref(v, osl1mfn, paddr);
1328 return flags;
1331 static inline void shadow_vram_get_l1e(shadow_l1e_t new_sl1e,
1332 shadow_l1e_t *sl1e,
1333 mfn_t sl1mfn,
1334 struct domain *d)
1336 mfn_t mfn;
1337 unsigned long gfn;
1339 if ( !d->dirty_vram ) return;
1341 mfn = shadow_l1e_get_mfn(new_sl1e);
1343 if ( !mfn_valid(mfn) ) return; /* m2p for mmio_direct may not exist */
1345 gfn = mfn_to_gfn(d, mfn);
1347 if ( (gfn >= d->dirty_vram->begin_pfn) && (gfn < d->dirty_vram->end_pfn) ) {
1348 unsigned long i = gfn - d->dirty_vram->begin_pfn;
1349 struct page_info *page = mfn_to_page(mfn);
1350 u32 count_info = page->u.inuse.type_info & PGT_count_mask;
1352 if ( count_info == 1 )
1353 /* Initial guest reference, record it */
1354 d->dirty_vram->sl1ma[i] = pfn_to_paddr(mfn_x(sl1mfn))
1355 | ((unsigned long)sl1e & ~PAGE_MASK);
1359 static inline void shadow_vram_put_l1e(shadow_l1e_t old_sl1e,
1360 shadow_l1e_t *sl1e,
1361 mfn_t sl1mfn,
1362 struct domain *d)
1364 mfn_t mfn;
1365 unsigned long gfn;
1367 if ( !d->dirty_vram ) return;
1369 mfn = shadow_l1e_get_mfn(old_sl1e);
1371 if ( !mfn_valid(mfn) ) return;
1373 gfn = mfn_to_gfn(d, mfn);
1375 if ( (gfn >= d->dirty_vram->begin_pfn) && (gfn < d->dirty_vram->end_pfn) ) {
1376 unsigned long i = gfn - d->dirty_vram->begin_pfn;
1377 struct page_info *page = mfn_to_page(mfn);
1378 u32 count_info = page->u.inuse.type_info & PGT_count_mask;
1379 int dirty = 0;
1380 paddr_t sl1ma = pfn_to_paddr(mfn_x(sl1mfn))
1381 | ((unsigned long)sl1e & ~PAGE_MASK);
1383 if ( count_info == 1 ) {
1384 /* Last reference */
1385 if ( d->dirty_vram->sl1ma[i] == INVALID_PADDR ) {
1386 /* We didn't know it was that one, let's say it is dirty */
1387 dirty = 1;
1388 } else {
1389 ASSERT(d->dirty_vram->sl1ma[i] == sl1ma);
1390 d->dirty_vram->sl1ma[i] = INVALID_PADDR;
1391 if ( shadow_l1e_get_flags(old_sl1e) & _PAGE_DIRTY )
1392 dirty = 1;
1394 } else {
1395 /* We had more than one reference, just consider the page dirty. */
1396 dirty = 1;
1397 /* Check that it's not the one we recorded. */
1398 if ( d->dirty_vram->sl1ma[i] == sl1ma ) {
1399 /* Too bad, we remembered the wrong one... */
1400 d->dirty_vram->sl1ma[i] = INVALID_PADDR;
1401 } else {
1402 /* Ok, our recorded sl1e is still pointing to this page, let's
1403 * just hope it will remain. */
1406 if ( dirty ) {
1407 d->dirty_vram->dirty_bitmap[i / 8] |= 1 << (i % 8);
1408 d->dirty_vram->last_dirty = NOW();
1413 static int shadow_set_l1e(struct vcpu *v,
1414 shadow_l1e_t *sl1e,
1415 shadow_l1e_t new_sl1e,
1416 mfn_t sl1mfn)
1418 int flags = 0;
1419 struct domain *d = v->domain;
1420 shadow_l1e_t old_sl1e;
1421 #if SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC
1422 mfn_t new_gmfn = shadow_l1e_get_mfn(new_sl1e);
1423 #endif
1424 ASSERT(sl1e != NULL);
1426 #if SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC
1427 if ( mfn_valid(new_gmfn) && mfn_oos_may_write(new_gmfn)
1428 && ((shadow_l1e_get_flags(new_sl1e) & (_PAGE_RW|_PAGE_PRESENT))
1429 == (_PAGE_RW|_PAGE_PRESENT)) )
1430 oos_fixup_add(v, new_gmfn, sl1mfn, pgentry_ptr_to_slot(sl1e));
1431 #endif
1433 old_sl1e = *sl1e;
1435 if ( old_sl1e.l1 == new_sl1e.l1 ) return 0; /* Nothing to do */
1437 if ( (shadow_l1e_get_flags(new_sl1e) & _PAGE_PRESENT)
1438 && !sh_l1e_is_magic(new_sl1e) )
1440 /* About to install a new reference */
1441 if ( shadow_mode_refcounts(d) ) {
1442 TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_SHADOW_L1_GET_REF);
1443 if ( shadow_get_page_from_l1e(new_sl1e, d) == 0 )
1445 /* Doesn't look like a pagetable. */
1446 flags |= SHADOW_SET_ERROR;
1447 new_sl1e = shadow_l1e_empty();
1449 else
1451 shadow_vram_get_l1e(new_sl1e, sl1e, sl1mfn, d);
1456 /* Write the new entry */
1457 shadow_write_entries(sl1e, &new_sl1e, 1, sl1mfn);
1458 flags |= SHADOW_SET_CHANGED;
1460 if ( (shadow_l1e_get_flags(old_sl1e) & _PAGE_PRESENT)
1461 && !sh_l1e_is_magic(old_sl1e) )
1463 /* We lost a reference to an old mfn. */
1464 /* N.B. Unlike higher-level sets, never need an extra flush
1465 * when writing an l1e. Because it points to the same guest frame
1466 * as the guest l1e did, it's the guest's responsibility to
1467 * trigger a flush later. */
1468 if ( shadow_mode_refcounts(d) )
1470 shadow_vram_put_l1e(old_sl1e, sl1e, sl1mfn, d);
1471 shadow_put_page_from_l1e(old_sl1e, d);
1472 TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_SHADOW_L1_PUT_REF);
1475 return flags;
1479 /**************************************************************************/
1480 /* Macros to walk pagetables. These take the shadow of a pagetable and
1481 * walk every "interesting" entry. That is, they don't touch Xen mappings,
1482 * and for 32-bit l2s shadowed onto PAE or 64-bit, they only touch every
1483 * second entry (since pairs of entries are managed together). For multi-page
1484 * shadows they walk all pages.
1486 * Arguments are an MFN, the variable to point to each entry, a variable
1487 * to indicate that we are done (we will shortcut to the end of the scan
1488 * when _done != 0), a variable to indicate that we should avoid Xen mappings,
1489 * and the code.
1491 * WARNING: These macros have side-effects. They change the values of both
1492 * the pointer and the MFN. */
1494 static inline void increment_ptr_to_guest_entry(void *ptr)
1496 if ( ptr )
1498 guest_l1e_t **entry = ptr;
1499 (*entry)++;
1503 /* All kinds of l1: touch all entries */
1504 #define _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code) \
1505 do { \
1506 int _i; \
1507 shadow_l1e_t *_sp = sh_map_domain_page((_sl1mfn)); \
1508 ASSERT(mfn_to_shadow_page(_sl1mfn)->type == SH_type_l1_shadow \
1509 || mfn_to_shadow_page(_sl1mfn)->type == SH_type_fl1_shadow); \
1510 for ( _i = 0; _i < SHADOW_L1_PAGETABLE_ENTRIES; _i++ ) \
1511 { \
1512 (_sl1e) = _sp + _i; \
1513 if ( shadow_l1e_get_flags(*(_sl1e)) & _PAGE_PRESENT ) \
1514 {_code} \
1515 if ( _done ) break; \
1516 increment_ptr_to_guest_entry(_gl1p); \
1517 } \
1518 sh_unmap_domain_page(_sp); \
1519 } while (0)
1521 /* 32-bit l1, on PAE or 64-bit shadows: need to walk both pages of shadow */
1522 #if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2
1523 #define SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code) \
1524 do { \
1525 int __done = 0; \
1526 _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, \
1527 ({ (__done = _done); }), _code); \
1528 _sl1mfn = _mfn(mfn_x(_sl1mfn) + 1); \
1529 if ( !__done ) \
1530 _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, \
1531 ({ (__done = _done); }), _code); \
1532 } while (0)
1533 #else /* Everything else; l1 shadows are only one page */
1534 #define SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code) \
1535 _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code)
1536 #endif
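As a usage sketch (a hypothetical helper, not part of this file), counting the present entries in one l1 shadow looks like the following; the iterator only runs the body for present entries and transparently covers both pages of a 2-on-3 shadow.

static int count_present_sl1es(mfn_t sl1mfn)
{
    shadow_l1e_t *sl1e;
    int done = 0, n = 0;

    /* No guest pointer to advance, so pass NULL for _gl1p. */
    SHADOW_FOREACH_L1E(sl1mfn, sl1e, NULL, done, { n++; });
    return n;
}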
1539 #if GUEST_PAGING_LEVELS == 2
1541 /* 32-bit l2 on PAE/64: four pages, touch every second entry, and avoid Xen */
1542 #define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _dom, _code) \
1543 do { \
1544 int _i, _j, __done = 0; \
1545 int _xen = !shadow_mode_external(_dom); \
1546 ASSERT(mfn_to_shadow_page(_sl2mfn)->type == SH_type_l2_32_shadow); \
1547 for ( _j = 0; _j < 4 && !__done; _j++ ) \
1548 { \
1549 shadow_l2e_t *_sp = sh_map_domain_page(_sl2mfn); \
1550 for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i += 2 ) \
1551 if ( (!(_xen)) \
1552 || ((_j * SHADOW_L2_PAGETABLE_ENTRIES) + _i) \
1553 < (HYPERVISOR_VIRT_START >> SHADOW_L2_PAGETABLE_SHIFT) ) \
1554 { \
1555 (_sl2e) = _sp + _i; \
1556 if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \
1557 {_code} \
1558 if ( (__done = (_done)) ) break; \
1559 increment_ptr_to_guest_entry(_gl2p); \
1560 } \
1561 sh_unmap_domain_page(_sp); \
1562 _sl2mfn = _mfn(mfn_x(_sl2mfn) + 1); \
1563 } \
1564 } while (0)
1566 #elif GUEST_PAGING_LEVELS == 3
1568 /* PAE: if it's an l2h, don't touch Xen mappings */
1569 #define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _dom, _code) \
1570 do { \
1571 int _i; \
1572 int _xen = !shadow_mode_external(_dom); \
1573 shadow_l2e_t *_sp = sh_map_domain_page((_sl2mfn)); \
1574 ASSERT(mfn_to_shadow_page(_sl2mfn)->type == SH_type_l2_pae_shadow \
1575 || mfn_to_shadow_page(_sl2mfn)->type == SH_type_l2h_pae_shadow);\
1576 for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i++ ) \
1577 if ( (!(_xen)) \
1578 || mfn_to_shadow_page(_sl2mfn)->type != SH_type_l2h_pae_shadow\
1579 || ((_i + (3 * SHADOW_L2_PAGETABLE_ENTRIES)) \
1580 < (HYPERVISOR_VIRT_START >> SHADOW_L2_PAGETABLE_SHIFT)) ) \
1581 { \
1582 (_sl2e) = _sp + _i; \
1583 if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \
1584 {_code} \
1585 if ( _done ) break; \
1586 increment_ptr_to_guest_entry(_gl2p); \
1587 } \
1588 sh_unmap_domain_page(_sp); \
1589 } while (0)
1591 #else
1593 /* 64-bit l2: touch all entries except for PAE compat guests. */
1594 #define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _dom, _code) \
1595 do { \
1596 int _i; \
1597 int _xen = !shadow_mode_external(_dom); \
1598 shadow_l2e_t *_sp = sh_map_domain_page((_sl2mfn)); \
1599 ASSERT(mfn_to_shadow_page(_sl2mfn)->type == SH_type_l2_64_shadow || \
1600 mfn_to_shadow_page(_sl2mfn)->type == SH_type_l2h_64_shadow); \
1601 for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i++ ) \
1602 { \
1603 if ( (!(_xen)) \
1604 || !is_pv_32on64_domain(_dom) \
1605 || mfn_to_shadow_page(_sl2mfn)->type != SH_type_l2h_64_shadow \
1606 || (_i < COMPAT_L2_PAGETABLE_FIRST_XEN_SLOT(_dom)) ) \
1607 { \
1608 (_sl2e) = _sp + _i; \
1609 if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \
1610 {_code} \
1611 if ( _done ) break; \
1612 increment_ptr_to_guest_entry(_gl2p); \
1613 } \
1614 } \
1615 sh_unmap_domain_page(_sp); \
1616 } while (0)
1618 #endif /* different kinds of l2 */
1620 #if GUEST_PAGING_LEVELS == 4
1622 /* 64-bit l3: touch all entries */
1623 #define SHADOW_FOREACH_L3E(_sl3mfn, _sl3e, _gl3p, _done, _code) \
1624 do { \
1625 int _i; \
1626 shadow_l3e_t *_sp = sh_map_domain_page((_sl3mfn)); \
1627 ASSERT(mfn_to_shadow_page(_sl3mfn)->type == SH_type_l3_64_shadow); \
1628 for ( _i = 0; _i < SHADOW_L3_PAGETABLE_ENTRIES; _i++ ) \
1629 { \
1630 (_sl3e) = _sp + _i; \
1631 if ( shadow_l3e_get_flags(*(_sl3e)) & _PAGE_PRESENT ) \
1632 {_code} \
1633 if ( _done ) break; \
1634 increment_ptr_to_guest_entry(_gl3p); \
1635 } \
1636 sh_unmap_domain_page(_sp); \
1637 } while (0)
1639 /* 64-bit l4: avoid Xen mappings */
1640 #define SHADOW_FOREACH_L4E(_sl4mfn, _sl4e, _gl4p, _done, _dom, _code) \
1641 do { \
1642 shadow_l4e_t *_sp = sh_map_domain_page((_sl4mfn)); \
1643 int _xen = !shadow_mode_external(_dom); \
1644 int _i; \
1645 ASSERT(mfn_to_shadow_page(_sl4mfn)->type == SH_type_l4_64_shadow); \
1646 for ( _i = 0; _i < SHADOW_L4_PAGETABLE_ENTRIES; _i++ ) \
1647 { \
1648 if ( (!(_xen)) || is_guest_l4_slot(_dom, _i) ) \
1649 { \
1650 (_sl4e) = _sp + _i; \
1651 if ( shadow_l4e_get_flags(*(_sl4e)) & _PAGE_PRESENT ) \
1652 {_code} \
1653 if ( _done ) break; \
1654 } \
1655 increment_ptr_to_guest_entry(_gl4p); \
1656 } \
1657 sh_unmap_domain_page(_sp); \
1658 } while (0)
1660 #endif
1664 /**************************************************************************/
1665 /* Functions to install Xen mappings and linear mappings in shadow pages */
1667 // XXX -- this function should probably be moved to shadow-common.c, but that
1668 // probably wants to wait until the shadow types have been moved from
1669 // shadow-types.h to shadow-private.h
1670 //
1671 #if CONFIG_PAGING_LEVELS == 4 && GUEST_PAGING_LEVELS == 4
1672 void sh_install_xen_entries_in_l4(struct vcpu *v, mfn_t gl4mfn, mfn_t sl4mfn)
1674 struct domain *d = v->domain;
1675 shadow_l4e_t *sl4e;
1677 sl4e = sh_map_domain_page(sl4mfn);
1678 ASSERT(sl4e != NULL);
1679 ASSERT(sizeof (l4_pgentry_t) == sizeof (shadow_l4e_t));
1681 /* Copy the common Xen mappings from the idle domain */
1682 memcpy(&sl4e[ROOT_PAGETABLE_FIRST_XEN_SLOT],
1683 &idle_pg_table[ROOT_PAGETABLE_FIRST_XEN_SLOT],
1684 ROOT_PAGETABLE_XEN_SLOTS * sizeof(l4_pgentry_t));
1686 /* Install the per-domain mappings for this domain */
1687 sl4e[shadow_l4_table_offset(PERDOMAIN_VIRT_START)] =
1688 shadow_l4e_from_mfn(page_to_mfn(virt_to_page(d->arch.mm_perdomain_l3)),
1689 __PAGE_HYPERVISOR);
1691 /* Shadow linear mapping for 4-level shadows. N.B. for 3-level
1692 * shadows on 64-bit xen, this linear mapping is later replaced by the
1693 * monitor pagetable structure, which is built in make_monitor_table
1694 * and maintained by sh_update_linear_entries. */
1695 sl4e[shadow_l4_table_offset(SH_LINEAR_PT_VIRT_START)] =
1696 shadow_l4e_from_mfn(sl4mfn, __PAGE_HYPERVISOR);
1698 /* Self linear mapping. */
1699 if ( shadow_mode_translate(v->domain) && !shadow_mode_external(v->domain) )
1701 // linear tables may not be used with translated PV guests
1702 sl4e[shadow_l4_table_offset(LINEAR_PT_VIRT_START)] =
1703 shadow_l4e_empty();
1705 else
1707 sl4e[shadow_l4_table_offset(LINEAR_PT_VIRT_START)] =
1708 shadow_l4e_from_mfn(gl4mfn, __PAGE_HYPERVISOR);
1711 if ( shadow_mode_translate(v->domain) )
1713 /* install domain-specific P2M table */
1714 sl4e[shadow_l4_table_offset(RO_MPT_VIRT_START)] =
1715 shadow_l4e_from_mfn(pagetable_get_mfn(d->arch.phys_table),
1716 __PAGE_HYPERVISOR);
1719 sh_unmap_domain_page(sl4e);
1721 #endif
1723 #if CONFIG_PAGING_LEVELS >= 3 && GUEST_PAGING_LEVELS >= 3
1724 // For 3-on-3 PV guests, we need to make sure the xen mappings are in
1725 // place, which means that we need to populate the l2h entry in the l3
1726 // table.
1728 static void sh_install_xen_entries_in_l2h(struct vcpu *v, mfn_t sl2hmfn)
1730 struct domain *d = v->domain;
1731 shadow_l2e_t *sl2e;
1732 #if CONFIG_PAGING_LEVELS == 3
1733 int i;
1734 #else
1736 if ( !is_pv_32on64_vcpu(v) )
1737 return;
1738 #endif
1740 sl2e = sh_map_domain_page(sl2hmfn);
1741 ASSERT(sl2e != NULL);
1742 ASSERT(sizeof (l2_pgentry_t) == sizeof (shadow_l2e_t));
1744 #if CONFIG_PAGING_LEVELS == 3
1746 /* Copy the common Xen mappings from the idle domain */
1747 memcpy(&sl2e[L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1)],
1748 &idle_pg_table_l2[L2_PAGETABLE_FIRST_XEN_SLOT],
1749 L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
1751 /* Install the per-domain mappings for this domain */
1752 for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
1753 sl2e[shadow_l2_table_offset(PERDOMAIN_VIRT_START) + i] =
1754 shadow_l2e_from_mfn(
1755 page_to_mfn(virt_to_page(d->arch.mm_perdomain_pt) + i),
1756 __PAGE_HYPERVISOR);
1758 /* We don't set up a linear mapping here because we can't until this
1759 * l2h is installed in an l3e. sh_update_linear_entries() handles
1760 * the linear mappings when CR3 (and so the fourth l3e) is loaded.
1761 * We zero them here, just as a safety measure.
1762 */
1763 for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
1764 sl2e[shadow_l2_table_offset(LINEAR_PT_VIRT_START) + i] =
1765 shadow_l2e_empty();
1766 for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
1767 sl2e[shadow_l2_table_offset(SH_LINEAR_PT_VIRT_START) + i] =
1768 shadow_l2e_empty();
1770 if ( shadow_mode_translate(d) )
1772 /* Install the domain-specific p2m table */
1773 l3_pgentry_t *p2m;
1774 ASSERT(pagetable_get_pfn(d->arch.phys_table) != 0);
1775 p2m = sh_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
1776 for ( i = 0; i < MACHPHYS_MBYTES>>1; i++ )
1778 sl2e[shadow_l2_table_offset(RO_MPT_VIRT_START) + i] =
1779 (l3e_get_flags(p2m[i]) & _PAGE_PRESENT)
1780 ? shadow_l2e_from_mfn(_mfn(l3e_get_pfn(p2m[i])),
1781 __PAGE_HYPERVISOR)
1782 : shadow_l2e_empty();
1784 sh_unmap_domain_page(p2m);
1787 #else
1789 /* Copy the common Xen mappings from the idle domain */
1790 memcpy(
1791 &sl2e[COMPAT_L2_PAGETABLE_FIRST_XEN_SLOT(d)],
1792 &compat_idle_pg_table_l2[l2_table_offset(HIRO_COMPAT_MPT_VIRT_START)],
1793 COMPAT_L2_PAGETABLE_XEN_SLOTS(d) * sizeof(*sl2e));
1795 #endif
1797 sh_unmap_domain_page(sl2e);
1799 #endif
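/* Note (not in the original file): on a PAE (3-level) build each l2 slot
 * maps 2MB, so the RO_MPT loop above fills MACHPHYS_MBYTES>>1 slots to
 * cover the read-only machine-to-phys area -- e.g. 8 slots if
 * MACHPHYS_MBYTES were 16.  On a 4-level build the same function only
 * copies the compat Xen slots for 32-on-64 PV guests and returns early
 * for everything else. */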
1805 /**************************************************************************/
1806 /* Create a shadow of a given guest page.
1807 */
1808 static mfn_t
1809 sh_make_shadow(struct vcpu *v, mfn_t gmfn, u32 shadow_type)
1811 mfn_t smfn = shadow_alloc(v->domain, shadow_type, mfn_x(gmfn));
1812 SHADOW_DEBUG(MAKE_SHADOW, "(%05lx, %u)=>%05lx\n",
1813 mfn_x(gmfn), shadow_type, mfn_x(smfn));
1815 if ( shadow_type != SH_type_l2_32_shadow
1816 && shadow_type != SH_type_l2_pae_shadow
1817 && shadow_type != SH_type_l2h_pae_shadow
1818 && shadow_type != SH_type_l4_64_shadow )
1819 /* Lower-level shadow, not yet linked from a higher level */
1820 mfn_to_shadow_page(smfn)->up = 0;
1822 #if GUEST_PAGING_LEVELS == 4
1823 #if (SHADOW_OPTIMIZATIONS & SHOPT_LINUX_L3_TOPLEVEL)
1824 if ( shadow_type == SH_type_l4_64_shadow &&
1825 unlikely(v->domain->arch.paging.shadow.opt_flags & SHOPT_LINUX_L3_TOPLEVEL) )
1827 /* We're shadowing a new l4, but we've been assuming the guest uses
1828 * only one l4 per vcpu and context switches using an l4 entry.
1829 * Count the number of active l4 shadows. If there are enough
1830 * of them, decide that this isn't an old linux guest, and stop
1831 * pinning l3es. This is not very quick but it doesn't happen
1832 * very often. */
1833 struct list_head *l, *t;
1834 struct shadow_page_info *sp;
1835 struct vcpu *v2;
1836 int l4count = 0, vcpus = 0;
1837 list_for_each(l, &v->domain->arch.paging.shadow.pinned_shadows)
1839 sp = list_entry(l, struct shadow_page_info, list);
1840 if ( sp->type == SH_type_l4_64_shadow )
1841 l4count++;
1843 for_each_vcpu ( v->domain, v2 )
1844 vcpus++;
1845 if ( l4count > 2 * vcpus )
1847 /* Unpin all the pinned l3 tables, and don't pin any more. */
1848 list_for_each_safe(l, t, &v->domain->arch.paging.shadow.pinned_shadows)
1850 sp = list_entry(l, struct shadow_page_info, list);
1851 if ( sp->type == SH_type_l3_64_shadow )
1852 sh_unpin(v, shadow_page_to_mfn(sp));
1854 v->domain->arch.paging.shadow.opt_flags &= ~SHOPT_LINUX_L3_TOPLEVEL;
1857 #endif
1858 #endif
1860 // Create the Xen mappings...
1861 if ( !shadow_mode_external(v->domain) )
1863 switch (shadow_type)
1865 #if CONFIG_PAGING_LEVELS == 4 && GUEST_PAGING_LEVELS == 4
1866 case SH_type_l4_shadow:
1867 sh_install_xen_entries_in_l4(v, gmfn, smfn); break;
1868 #endif
1869 #if CONFIG_PAGING_LEVELS >= 3 && GUEST_PAGING_LEVELS >= 3
1870 case SH_type_l2h_shadow:
1871 sh_install_xen_entries_in_l2h(v, smfn); break;
1872 #endif
1873 default: /* Do nothing */ break;
1877 shadow_promote(v, gmfn, shadow_type);
1878 set_shadow_status(v, gmfn, shadow_type, smfn);
1880 return smfn;
1883 /* Make a splintered superpage shadow */
1884 static mfn_t
1885 make_fl1_shadow(struct vcpu *v, gfn_t gfn)
1887 mfn_t smfn = shadow_alloc(v->domain, SH_type_fl1_shadow,
1888 (unsigned long) gfn_x(gfn));
1890 SHADOW_DEBUG(MAKE_SHADOW, "(%" SH_PRI_gfn ")=>%" PRI_mfn "\n",
1891 gfn_x(gfn), mfn_x(smfn));
1893 set_fl1_shadow_status(v, gfn, smfn);
1894 return smfn;
1898 #if SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS
1899 mfn_t
1900 sh_make_monitor_table(struct vcpu *v)
1902 struct domain *d = v->domain;
1904 ASSERT(pagetable_get_pfn(v->arch.monitor_table) == 0);
1906 /* Guarantee we can get the memory we need */
1907 shadow_prealloc(d, SH_type_monitor_table, CONFIG_PAGING_LEVELS);
1909 #if CONFIG_PAGING_LEVELS == 4
1911 mfn_t m4mfn;
1912 m4mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1913 sh_install_xen_entries_in_l4(v, m4mfn, m4mfn);
1914 /* Remember the level of this table */
1915 mfn_to_page(m4mfn)->shadow_flags = 4;
1916 #if SHADOW_PAGING_LEVELS < 4
1918 mfn_t m3mfn, m2mfn;
1919 l4_pgentry_t *l4e;
1920 l3_pgentry_t *l3e;
1921 /* Install an l3 table and an l2 table that will hold the shadow
1922 * linear map entries. This overrides the linear map entry that
1923 * was installed by sh_install_xen_entries_in_l4. */
1924 l4e = sh_map_domain_page(m4mfn);
1926 m3mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1927 mfn_to_page(m3mfn)->shadow_flags = 3;
1928 l4e[shadow_l4_table_offset(SH_LINEAR_PT_VIRT_START)]
1929 = l4e_from_pfn(mfn_x(m3mfn), __PAGE_HYPERVISOR);
1931 m2mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1932 mfn_to_page(m2mfn)->shadow_flags = 2;
1933 l3e = sh_map_domain_page(m3mfn);
1934 l3e[0] = l3e_from_pfn(mfn_x(m2mfn), __PAGE_HYPERVISOR);
1935 sh_unmap_domain_page(l3e);
1937 if ( is_pv_32on64_vcpu(v) )
1939 /* For 32-on-64 PV guests, we need to map the 32-bit Xen
1940 * area into its usual VAs in the monitor tables */
1941 m3mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1942 mfn_to_page(m3mfn)->shadow_flags = 3;
1943 l4e[0] = l4e_from_pfn(mfn_x(m3mfn), __PAGE_HYPERVISOR);
1945 m2mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1946 mfn_to_page(m2mfn)->shadow_flags = 2;
1947 l3e = sh_map_domain_page(m3mfn);
1948 l3e[3] = l3e_from_pfn(mfn_x(m2mfn), _PAGE_PRESENT);
1949 sh_install_xen_entries_in_l2h(v, m2mfn);
1950 sh_unmap_domain_page(l3e);
1953 sh_unmap_domain_page(l4e);
1955 #endif /* SHADOW_PAGING_LEVELS < 4 */
1956 return m4mfn;
1959 #elif CONFIG_PAGING_LEVELS == 3
1962 mfn_t m3mfn, m2mfn;
1963 l3_pgentry_t *l3e;
1964 l2_pgentry_t *l2e;
1965 int i;
1967 m3mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1968 /* Remember the level of this table */
1969 mfn_to_page(m3mfn)->shadow_flags = 3;
1971 // Install a monitor l2 table in slot 3 of the l3 table.
1972 // This is used for all Xen entries, including linear maps
1973 m2mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1974 mfn_to_page(m2mfn)->shadow_flags = 2;
1975 l3e = sh_map_domain_page(m3mfn);
1976 l3e[3] = l3e_from_pfn(mfn_x(m2mfn), _PAGE_PRESENT);
1977 sh_install_xen_entries_in_l2h(v, m2mfn);
1978 /* Install the monitor's own linear map */
1979 l2e = sh_map_domain_page(m2mfn);
1980 for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
1981 l2e[l2_table_offset(LINEAR_PT_VIRT_START) + i] =
1982 (l3e_get_flags(l3e[i]) & _PAGE_PRESENT)
1983 ? l2e_from_pfn(l3e_get_pfn(l3e[i]), __PAGE_HYPERVISOR)
1984 : l2e_empty();
1985 sh_unmap_domain_page(l2e);
1986 sh_unmap_domain_page(l3e);
1988 SHADOW_PRINTK("new monitor table: %#lx\n", mfn_x(m3mfn));
1989 return m3mfn;
1992 #else
1993 #error this should not happen
1994 #endif /* CONFIG_PAGING_LEVELS */
1996 #endif /* SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS */
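/* Note (not in the original file): each monitor page allocated above has
 * its level recorded in mfn_to_page(...)->shadow_flags (4, 3 or 2), and
 * the layout is fixed: the SH_LINEAR slot of the monitor l4 (plus slot 0
 * for 32-on-64 guests) leads to the extra l3/l2 pages.
 * sh_destroy_monitor_table() further down walks exactly this layout to
 * free them before releasing the top-level page. */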
1998 /**************************************************************************/
1999 /* These functions also take a virtual address and return the level-N
2000 * shadow table mfn and entry, but they create the shadow pagetables if
2001 * they are needed. The "demand" argument is non-zero when handling
2002 * a demand fault (so we know what to do about accessed bits &c).
2003 * If the necessary tables are not present in the guest, they return NULL. */
2005 /* N.B. The use of GUEST_PAGING_LEVELS here is correct. If the shadow has
2006 * more levels than the guest, the upper levels are always fixed and do not
2007 * reflect any information from the guest, so we do not use these functions
2008 * to access them. */
2010 #if GUEST_PAGING_LEVELS >= 4
2011 static shadow_l4e_t * shadow_get_and_create_l4e(struct vcpu *v,
2012 walk_t *gw,
2013 mfn_t *sl4mfn)
2015 /* There is always a shadow of the top level table. Get it. */
2016 *sl4mfn = pagetable_get_mfn(v->arch.shadow_table[0]);
2017 /* Reading the top level table is always valid. */
2018 return sh_linear_l4_table(v) + shadow_l4_linear_offset(gw->va);
2021 static shadow_l3e_t * shadow_get_and_create_l3e(struct vcpu *v,
2022 walk_t *gw,
2023 mfn_t *sl3mfn,
2024 fetch_type_t ft)
2026 mfn_t sl4mfn;
2027 shadow_l4e_t *sl4e;
2028 if ( !mfn_valid(gw->l3mfn) ) return NULL; /* No guest page. */
2029 /* Get the l4e */
2030 sl4e = shadow_get_and_create_l4e(v, gw, &sl4mfn);
2031 ASSERT(sl4e != NULL);
2032 if ( shadow_l4e_get_flags(*sl4e) & _PAGE_PRESENT )
2034 *sl3mfn = shadow_l4e_get_mfn(*sl4e);
2035 ASSERT(mfn_valid(*sl3mfn));
2037 else
2039 int r;
2040 shadow_l4e_t new_sl4e;
2041 /* No l3 shadow installed: find and install it. */
2042 *sl3mfn = get_shadow_status(v, gw->l3mfn, SH_type_l3_shadow);
2043 if ( !mfn_valid(*sl3mfn) )
2045 /* No l3 shadow of this page exists at all: make one. */
2046 *sl3mfn = sh_make_shadow(v, gw->l3mfn, SH_type_l3_shadow);
2048 /* Install the new sl3 table in the sl4e */
2049 l4e_propagate_from_guest(v, gw->l4e, *sl3mfn, &new_sl4e, ft);
2050 r = shadow_set_l4e(v, sl4e, new_sl4e, sl4mfn);
2051 ASSERT((r & SHADOW_SET_FLUSH) == 0);
2052 if ( r & SHADOW_SET_ERROR )
2053 return NULL;
2055 /* Now follow it down a level. Guaranteed to succeed. */
2056 return sh_linear_l3_table(v) + shadow_l3_linear_offset(gw->va);
2058 #endif /* GUEST_PAGING_LEVELS >= 4 */
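/* Note (not in the original file): shadow_get_and_create_l3e() above and
 * the l2e/l1e variants below share one pattern: read the entry one level
 * up; if it is not _PAGE_PRESENT, look the lower shadow up with
 * get_shadow_status(), create it with sh_make_shadow() (or
 * make_fl1_shadow() for splintered superpages) if none exists, then
 * install it with lNe_propagate_from_guest() + shadow_set_lNe().
 * SHADOW_SET_ERROR from the set means the guest has been crashed and the
 * caller must return NULL. */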
2061 static shadow_l2e_t * shadow_get_and_create_l2e(struct vcpu *v,
2062 walk_t *gw,
2063 mfn_t *sl2mfn,
2064 fetch_type_t ft)
2066 #if GUEST_PAGING_LEVELS >= 4 /* 64bit... */
2067 mfn_t sl3mfn = _mfn(INVALID_MFN);
2068 shadow_l3e_t *sl3e;
2069 if ( !mfn_valid(gw->l2mfn) ) return NULL; /* No guest page. */
2070 /* Get the l3e */
2071 sl3e = shadow_get_and_create_l3e(v, gw, &sl3mfn, ft);
2072 if ( sl3e == NULL ) return NULL;
2073 if ( shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT )
2075 *sl2mfn = shadow_l3e_get_mfn(*sl3e);
2076 ASSERT(mfn_valid(*sl2mfn));
2078 else
2080 int r;
2081 shadow_l3e_t new_sl3e;
2082 unsigned int t = SH_type_l2_shadow;
2084 /* Tag compat L2 containing hypervisor (m2p) mappings */
2085 if ( is_pv_32on64_domain(v->domain) &&
2086 guest_l4_table_offset(gw->va) == 0 &&
2087 guest_l3_table_offset(gw->va) == 3 )
2088 t = SH_type_l2h_shadow;
2090 /* No l2 shadow installed: find and install it. */
2091 *sl2mfn = get_shadow_status(v, gw->l2mfn, t);
2092 if ( !mfn_valid(*sl2mfn) )
2094 /* No l2 shadow of this page exists at all: make one. */
2095 *sl2mfn = sh_make_shadow(v, gw->l2mfn, t);
2097 /* Install the new sl2 table in the sl3e */
2098 l3e_propagate_from_guest(v, gw->l3e, *sl2mfn, &new_sl3e, ft);
2099 r = shadow_set_l3e(v, sl3e, new_sl3e, sl3mfn);
2100 ASSERT((r & SHADOW_SET_FLUSH) == 0);
2101 if ( r & SHADOW_SET_ERROR )
2102 return NULL;
2104 /* Now follow it down a level. Guaranteed to succeed. */
2105 return sh_linear_l2_table(v) + shadow_l2_linear_offset(gw->va);
2106 #elif GUEST_PAGING_LEVELS == 3 /* PAE... */
2107 /* We never demand-shadow PAE l3es: they are only created in
2108 * sh_update_cr3(). Check if the relevant sl3e is present. */
2109 shadow_l3e_t *sl3e = ((shadow_l3e_t *)&v->arch.paging.shadow.l3table)
2110 + shadow_l3_linear_offset(gw->va);
2111 if ( !(shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT) )
2112 return NULL;
2113 *sl2mfn = shadow_l3e_get_mfn(*sl3e);
2114 ASSERT(mfn_valid(*sl2mfn));
2115 return sh_linear_l2_table(v) + shadow_l2_linear_offset(gw->va);
2116 #else /* 32bit... */
2117 /* There is always a shadow of the top level table. Get it. */
2118 *sl2mfn = pagetable_get_mfn(v->arch.shadow_table[0]);
2119 /* This next line is important: the guest l2 has a 16k
2120 * shadow, and we need to return the right mfn of the four. This
2121 * call will set it for us as a side-effect. */
2122 (void) shadow_l2_index(sl2mfn, guest_l2_table_offset(gw->va));
2123 /* Reading the top level table is always valid. */
2124 return sh_linear_l2_table(v) + shadow_l2_linear_offset(gw->va);
2125 #endif
2129 static shadow_l1e_t * shadow_get_and_create_l1e(struct vcpu *v,
2130 walk_t *gw,
2131 mfn_t *sl1mfn,
2132 fetch_type_t ft)
2134 mfn_t sl2mfn;
2135 shadow_l2e_t *sl2e;
2137 /* Get the l2e */
2138 sl2e = shadow_get_and_create_l2e(v, gw, &sl2mfn, ft);
2139 if ( sl2e == NULL ) return NULL;
2140 /* Install the sl1 in the l2e if it wasn't there or if we need to
2141 * re-do it to fix a PSE dirty bit. */
2142 if ( shadow_l2e_get_flags(*sl2e) & _PAGE_PRESENT
2143 && likely(ft != ft_demand_write
2144 || (shadow_l2e_get_flags(*sl2e) & _PAGE_RW)
2145 || !(guest_l2e_get_flags(gw->l2e) & _PAGE_PSE)) )
2147 *sl1mfn = shadow_l2e_get_mfn(*sl2e);
2148 ASSERT(mfn_valid(*sl1mfn));
2150 else
2152 shadow_l2e_t new_sl2e;
2153 int r, flags = guest_l2e_get_flags(gw->l2e);
2154 /* No l1 shadow installed: find and install it. */
2155 if ( !(flags & _PAGE_PRESENT) )
2156 return NULL; /* No guest page. */
2157 if ( guest_supports_superpages(v) && (flags & _PAGE_PSE) )
2159 /* Splintering a superpage */
2160 gfn_t l2gfn = guest_l2e_get_gfn(gw->l2e);
2161 *sl1mfn = get_fl1_shadow_status(v, l2gfn);
2162 if ( !mfn_valid(*sl1mfn) )
2164 /* No fl1 shadow of this superpage exists at all: make one. */
2165 *sl1mfn = make_fl1_shadow(v, l2gfn);
2168 else
2170 /* Shadowing an actual guest l1 table */
2171 if ( !mfn_valid(gw->l1mfn) ) return NULL; /* No guest page. */
2172 *sl1mfn = get_shadow_status(v, gw->l1mfn, SH_type_l1_shadow);
2173 if ( !mfn_valid(*sl1mfn) )
2175 /* No l1 shadow of this page exists at all: make one. */
2176 *sl1mfn = sh_make_shadow(v, gw->l1mfn, SH_type_l1_shadow);
2179 /* Install the new sl1 table in the sl2e */
2180 l2e_propagate_from_guest(v, gw->l2e, *sl1mfn, &new_sl2e, ft);
2181 r = shadow_set_l2e(v, sl2e, new_sl2e, sl2mfn);
2182 ASSERT((r & SHADOW_SET_FLUSH) == 0);
2183 if ( r & SHADOW_SET_ERROR )
2184 return NULL;
2185 /* This next line is important: in 32-on-PAE and 32-on-64 modes,
2186 * the guest l1 table has an 8k shadow, and we need to return
2187 * the right mfn of the pair. This call will set it for us as a
2188 * side-effect. (In all other cases, it's a no-op and will be
2189 * compiled out.) */
2190 (void) shadow_l1_index(sl1mfn, guest_l1_table_offset(gw->va));
2192 /* Now follow it down a level. Guaranteed to succeed. */
2193 return sh_linear_l1_table(v) + shadow_l1_linear_offset(gw->va);
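/* Illustration (not part of the original file): this chain is what the
 * demand-fault path below uses.  A minimal sketch, assuming 'gw' already
 * holds a successful guest walk of the faulting address: */
#if 0
    mfn_t sl1mfn;
    shadow_l1e_t *ptr_sl1e =
        shadow_get_and_create_l1e(v, &gw, &sl1mfn, ft_demand_write);
    if ( ptr_sl1e == NULL )
        return 0;   /* shadow_set_l*e() failed and crashed the guest */
    /* ... then propagate gw.l1e into *ptr_sl1e with shadow_set_l1e() ... */
#endif /* illustration only */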
2198 /**************************************************************************/
2199 /* Destructors for shadow tables:
2200 * Unregister the shadow, decrement refcounts of any entries present in it,
2201 * and release the memory.
2203 * N.B. These destructors do not clear the contents of the shadows.
2204 * This allows us to delay TLB shootdowns until the page is being reused.
2205 * See shadow_alloc() and shadow_free() for how this is handled.
2206 */
2208 #if GUEST_PAGING_LEVELS >= 4
2209 void sh_destroy_l4_shadow(struct vcpu *v, mfn_t smfn)
2211 shadow_l4e_t *sl4e;
2212 u32 t = mfn_to_shadow_page(smfn)->type;
2213 mfn_t gmfn, sl4mfn;
2215 SHADOW_DEBUG(DESTROY_SHADOW,
2216 "%s(%05lx)\n", __func__, mfn_x(smfn));
2217 ASSERT(t == SH_type_l4_shadow);
2219 /* Record that the guest page isn't shadowed any more (in this type) */
2220 gmfn = _mfn(mfn_to_shadow_page(smfn)->backpointer);
2221 delete_shadow_status(v, gmfn, t, smfn);
2222 shadow_demote(v, gmfn, t);
2223 /* Decrement refcounts of all the old entries */
2224 sl4mfn = smfn;
2225 SHADOW_FOREACH_L4E(sl4mfn, sl4e, 0, 0, v->domain, {
2226 if ( shadow_l4e_get_flags(*sl4e) & _PAGE_PRESENT )
2228 sh_put_ref(v, shadow_l4e_get_mfn(*sl4e),
2229 (((paddr_t)mfn_x(sl4mfn)) << PAGE_SHIFT)
2230 | ((unsigned long)sl4e & ~PAGE_MASK));
2232 });
2234 /* Put the memory back in the pool */
2235 shadow_free(v->domain, smfn);
2238 void sh_destroy_l3_shadow(struct vcpu *v, mfn_t smfn)
2240 shadow_l3e_t *sl3e;
2241 u32 t = mfn_to_shadow_page(smfn)->type;
2242 mfn_t gmfn, sl3mfn;
2244 SHADOW_DEBUG(DESTROY_SHADOW,
2245 "%s(%05lx)\n", __func__, mfn_x(smfn));
2246 ASSERT(t == SH_type_l3_shadow);
2248 /* Record that the guest page isn't shadowed any more (in this type) */
2249 gmfn = _mfn(mfn_to_shadow_page(smfn)->backpointer);
2250 delete_shadow_status(v, gmfn, t, smfn);
2251 shadow_demote(v, gmfn, t);
2253 /* Decrement refcounts of all the old entries */
2254 sl3mfn = smfn;
2255 SHADOW_FOREACH_L3E(sl3mfn, sl3e, 0, 0, {
2256 if ( shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT )
2257 sh_put_ref(v, shadow_l3e_get_mfn(*sl3e),
2258 (((paddr_t)mfn_x(sl3mfn)) << PAGE_SHIFT)
2259 | ((unsigned long)sl3e & ~PAGE_MASK));
2260 });
2262 /* Put the memory back in the pool */
2263 shadow_free(v->domain, smfn);
2265 #endif /* GUEST_PAGING_LEVELS >= 4 */
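/* Note (not in the original file): the second argument to sh_put_ref()
 * above encodes *where* the reference lived -- the machine address of the
 * shadow entry, i.e. (mfn of the containing shadow << PAGE_SHIFT) | the
 * entry's byte offset -- so the refcounting code can keep the referenced
 * shadow's 'up' back-pointer consistent.  sh_safe_not_to_sync() later in
 * this file decodes the same value with '>> PAGE_SHIFT'. */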
2268 void sh_destroy_l2_shadow(struct vcpu *v, mfn_t smfn)
2270 shadow_l2e_t *sl2e;
2271 u32 t = mfn_to_shadow_page(smfn)->type;
2272 mfn_t gmfn, sl2mfn;
2274 SHADOW_DEBUG(DESTROY_SHADOW,
2275 "%s(%05lx)\n", __func__, mfn_x(smfn));
2277 #if GUEST_PAGING_LEVELS >= 3
2278 ASSERT(t == SH_type_l2_shadow || t == SH_type_l2h_shadow);
2279 #else
2280 ASSERT(t == SH_type_l2_shadow);
2281 #endif
2283 /* Record that the guest page isn't shadowed any more (in this type) */
2284 gmfn = _mfn(mfn_to_shadow_page(smfn)->backpointer);
2285 delete_shadow_status(v, gmfn, t, smfn);
2286 shadow_demote(v, gmfn, t);
2288 /* Decrement refcounts of all the old entries */
2289 sl2mfn = smfn;
2290 SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, 0, v->domain, {
2291 if ( shadow_l2e_get_flags(*sl2e) & _PAGE_PRESENT )
2292 sh_put_ref(v, shadow_l2e_get_mfn(*sl2e),
2293 (((paddr_t)mfn_x(sl2mfn)) << PAGE_SHIFT)
2294 | ((unsigned long)sl2e & ~PAGE_MASK));
2295 });
2297 /* Put the memory back in the pool */
2298 shadow_free(v->domain, smfn);
2301 void sh_destroy_l1_shadow(struct vcpu *v, mfn_t smfn)
2303 struct domain *d = v->domain;
2304 shadow_l1e_t *sl1e;
2305 u32 t = mfn_to_shadow_page(smfn)->type;
2307 SHADOW_DEBUG(DESTROY_SHADOW,
2308 "%s(%05lx)\n", __func__, mfn_x(smfn));
2309 ASSERT(t == SH_type_l1_shadow || t == SH_type_fl1_shadow);
2311 /* Record that the guest page isn't shadowed any more (in this type) */
2312 if ( t == SH_type_fl1_shadow )
2314 gfn_t gfn = _gfn(mfn_to_shadow_page(smfn)->backpointer);
2315 delete_fl1_shadow_status(v, gfn, smfn);
2317 else
2319 mfn_t gmfn = _mfn(mfn_to_shadow_page(smfn)->backpointer);
2320 delete_shadow_status(v, gmfn, t, smfn);
2321 shadow_demote(v, gmfn, t);
2324 if ( shadow_mode_refcounts(d) )
2326 /* Decrement refcounts of all the old entries */
2327 mfn_t sl1mfn = smfn;
2328 SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, 0, {
2329 if ( (shadow_l1e_get_flags(*sl1e) & _PAGE_PRESENT)
2330 && !sh_l1e_is_magic(*sl1e) ) {
2331 shadow_vram_put_l1e(*sl1e, sl1e, sl1mfn, d);
2332 shadow_put_page_from_l1e(*sl1e, d);
2334 });
2337 /* Put the memory back in the pool */
2338 shadow_free(v->domain, smfn);
2341 #if SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS
2342 void sh_destroy_monitor_table(struct vcpu *v, mfn_t mmfn)
2344 struct domain *d = v->domain;
2345 ASSERT(mfn_to_shadow_page(mmfn)->type == SH_type_monitor_table);
2347 #if (CONFIG_PAGING_LEVELS == 4) && (SHADOW_PAGING_LEVELS != 4)
2349 mfn_t m3mfn;
2350 l4_pgentry_t *l4e = sh_map_domain_page(mmfn);
2351 l3_pgentry_t *l3e;
2352 int linear_slot = shadow_l4_table_offset(SH_LINEAR_PT_VIRT_START);
2354 /* Need to destroy the l3 and l2 monitor pages used
2355 * for the linear map */
2356 ASSERT(l4e_get_flags(l4e[linear_slot]) & _PAGE_PRESENT);
2357 m3mfn = _mfn(l4e_get_pfn(l4e[linear_slot]));
2358 l3e = sh_map_domain_page(m3mfn);
2359 ASSERT(l3e_get_flags(l3e[0]) & _PAGE_PRESENT);
2360 shadow_free(d, _mfn(l3e_get_pfn(l3e[0])));
2361 sh_unmap_domain_page(l3e);
2362 shadow_free(d, m3mfn);
2364 if ( is_pv_32on64_vcpu(v) )
2366 /* Need to destroy the l3 and l2 monitor pages that map the
2367 * Xen VAs at 3GB-4GB */
2368 ASSERT(l4e_get_flags(l4e[0]) & _PAGE_PRESENT);
2369 m3mfn = _mfn(l4e_get_pfn(l4e[0]));
2370 l3e = sh_map_domain_page(m3mfn);
2371 ASSERT(l3e_get_flags(l3e[3]) & _PAGE_PRESENT);
2372 shadow_free(d, _mfn(l3e_get_pfn(l3e[3])));
2373 sh_unmap_domain_page(l3e);
2374 shadow_free(d, m3mfn);
2376 sh_unmap_domain_page(l4e);
2378 #elif CONFIG_PAGING_LEVELS == 3
2379 /* Need to destroy the l2 monitor page in slot 3 too */
2381 l3_pgentry_t *l3e = sh_map_domain_page(mmfn);
2382 ASSERT(l3e_get_flags(l3e[3]) & _PAGE_PRESENT);
2383 shadow_free(d, _mfn(l3e_get_pfn(l3e[3])));
2384 sh_unmap_domain_page(l3e);
2386 #endif
2388 /* Put the memory back in the pool */
2389 shadow_free(d, mmfn);
2391 #endif
2393 /**************************************************************************/
2394 /* Functions to destroy non-Xen mappings in a pagetable hierarchy.
2395 * These are called from common code when we are running out of shadow
2396 * memory, and unpinning all the top-level shadows hasn't worked.
2398 * This implementation is pretty crude and slow, but we hope that it won't
2399 * be called very often. */
2401 #if GUEST_PAGING_LEVELS == 2
2403 void sh_unhook_32b_mappings(struct vcpu *v, mfn_t sl2mfn)
2405 shadow_l2e_t *sl2e;
2406 SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, 0, v->domain, {
2407 (void) shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn);
2408 });
2411 #elif GUEST_PAGING_LEVELS == 3
2413 void sh_unhook_pae_mappings(struct vcpu *v, mfn_t sl2mfn)
2414 /* Walk a PAE l2 shadow, unhooking entries from all the subshadows */
2416 shadow_l2e_t *sl2e;
2417 SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, 0, v->domain, {
2418 (void) shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn);
2419 });
2422 #elif GUEST_PAGING_LEVELS == 4
2424 void sh_unhook_64b_mappings(struct vcpu *v, mfn_t sl4mfn)
2426 shadow_l4e_t *sl4e;
2427 SHADOW_FOREACH_L4E(sl4mfn, sl4e, 0, 0, v->domain, {
2428 (void) shadow_set_l4e(v, sl4e, shadow_l4e_empty(), sl4mfn);
2429 });
2432 #endif
2434 /**************************************************************************/
2435 /* Internal translation functions.
2436 * These functions require a pointer to the shadow entry that will be updated.
2437 */
2439 /* These functions take a new guest entry, translate it to shadow and write
2440 * the shadow entry.
2442 * They return the same bitmaps as the shadow_set_lXe() functions.
2443 */
2445 #if GUEST_PAGING_LEVELS >= 4
2446 static int validate_gl4e(struct vcpu *v, void *new_ge, mfn_t sl4mfn, void *se)
2448 shadow_l4e_t new_sl4e;
2449 guest_l4e_t new_gl4e = *(guest_l4e_t *)new_ge;
2450 shadow_l4e_t *sl4p = se;
2451 mfn_t sl3mfn = _mfn(INVALID_MFN);
2452 struct domain *d = v->domain;
2453 p2m_type_t p2mt;
2454 int result = 0;
2456 perfc_incr(shadow_validate_gl4e_calls);
2458 if ( guest_l4e_get_flags(new_gl4e) & _PAGE_PRESENT )
2460 gfn_t gl3gfn = guest_l4e_get_gfn(new_gl4e);
2461 mfn_t gl3mfn = gfn_to_mfn(d, gl3gfn, &p2mt);
2462 if ( p2m_is_ram(p2mt) )
2463 sl3mfn = get_shadow_status(v, gl3mfn, SH_type_l3_shadow);
2464 else
2465 result |= SHADOW_SET_ERROR;
2467 l4e_propagate_from_guest(v, new_gl4e, sl3mfn, &new_sl4e, ft_prefetch);
2469 // check for updates to xen reserved slots
2470 if ( !shadow_mode_external(d) )
2472 int shadow_index = (((unsigned long)sl4p & ~PAGE_MASK) /
2473 sizeof(shadow_l4e_t));
2474 int reserved_xen_slot = !is_guest_l4_slot(d, shadow_index);
2476 if ( unlikely(reserved_xen_slot) )
2478 // attempt by the guest to write to a xen reserved slot
2479 //
2480 SHADOW_PRINTK("%s out-of-range update "
2481 "sl4mfn=%05lx index=0x%x val=%" SH_PRI_pte "\n",
2482 __func__, mfn_x(sl4mfn), shadow_index, new_sl4e.l4);
2483 if ( shadow_l4e_get_flags(new_sl4e) & _PAGE_PRESENT )
2485 SHADOW_ERROR("out-of-range l4e update\n");
2486 result |= SHADOW_SET_ERROR;
2489 // do not call shadow_set_l4e...
2490 return result;
2494 result |= shadow_set_l4e(v, sl4p, new_sl4e, sl4mfn);
2495 return result;
2499 static int validate_gl3e(struct vcpu *v, void *new_ge, mfn_t sl3mfn, void *se)
2501 shadow_l3e_t new_sl3e;
2502 guest_l3e_t new_gl3e = *(guest_l3e_t *)new_ge;
2503 shadow_l3e_t *sl3p = se;
2504 mfn_t sl2mfn = _mfn(INVALID_MFN);
2505 p2m_type_t p2mt;
2506 int result = 0;
2508 perfc_incr(shadow_validate_gl3e_calls);
2510 if ( guest_l3e_get_flags(new_gl3e) & _PAGE_PRESENT )
2512 gfn_t gl2gfn = guest_l3e_get_gfn(new_gl3e);
2513 mfn_t gl2mfn = gfn_to_mfn(v->domain, gl2gfn, &p2mt);
2514 if ( p2m_is_ram(p2mt) )
2515 sl2mfn = get_shadow_status(v, gl2mfn, SH_type_l2_shadow);
2516 else
2517 result |= SHADOW_SET_ERROR;
2519 l3e_propagate_from_guest(v, new_gl3e, sl2mfn, &new_sl3e, ft_prefetch);
2520 result |= shadow_set_l3e(v, sl3p, new_sl3e, sl3mfn);
2522 return result;
2524 #endif // GUEST_PAGING_LEVELS >= 4
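/* Note (not in the original file): validate_gl4e()/validate_gl3e() return
 * the same SHADOW_SET_* bitmap as the underlying shadow_set_lNe() call, so
 * callers such as sh_map_and_validate() below can OR the results of many
 * entries together and act on SHADOW_SET_FLUSH / SHADOW_SET_ERROR once at
 * the end. */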
2526 static int validate_gl2e(struct vcpu *v, void *new_ge, mfn_t sl2mfn, void *se)
2528 shadow_l2e_t new_sl2e;
2529 guest_l2e_t new_gl2e = *(guest_l2e_t *)new_ge;
2530 shadow_l2e_t *sl2p = se;
2531 mfn_t sl1mfn = _mfn(INVALID_MFN);
2532 p2m_type_t p2mt;
2533 int result = 0;
2535 perfc_incr(shadow_validate_gl2e_calls);
2537 if ( guest_l2e_get_flags(new_gl2e) & _PAGE_PRESENT )
2539 gfn_t gl1gfn = guest_l2e_get_gfn(new_gl2e);
2540 if ( guest_supports_superpages(v) &&
2541 (guest_l2e_get_flags(new_gl2e) & _PAGE_PSE) )
2543 // superpage -- need to look up the shadow L1 which holds the
2544 // splitters...
2545 sl1mfn = get_fl1_shadow_status(v, gl1gfn);
2546 #if 0
2547 // XXX - it's possible that we want to do some kind of prefetch
2548 // for superpage fl1's here, but this is *not* on the demand path,
2549 // so we'll hold off trying that for now...
2550 //
2551 if ( !mfn_valid(sl1mfn) )
2552 sl1mfn = make_fl1_shadow(v, gl1gfn);
2553 #endif
2555 else
2557 mfn_t gl1mfn = gfn_to_mfn(v->domain, gl1gfn, &p2mt);
2558 if ( p2m_is_ram(p2mt) )
2559 sl1mfn = get_shadow_status(v, gl1mfn, SH_type_l1_shadow);
2560 else
2561 result |= SHADOW_SET_ERROR;
2564 l2e_propagate_from_guest(v, new_gl2e, sl1mfn, &new_sl2e, ft_prefetch);
2566 // check for updates to xen reserved slots in PV guests...
2567 // XXX -- need to revisit this for PV 3-on-4 guests.
2568 //
2569 #if SHADOW_PAGING_LEVELS < 4
2570 #if CONFIG_PAGING_LEVELS == SHADOW_PAGING_LEVELS
2571 if ( !shadow_mode_external(v->domain) )
2573 int shadow_index = (((unsigned long)sl2p & ~PAGE_MASK) /
2574 sizeof(shadow_l2e_t));
2575 int reserved_xen_slot;
2577 #if SHADOW_PAGING_LEVELS == 3
2578 reserved_xen_slot =
2579 ((mfn_to_shadow_page(sl2mfn)->type == SH_type_l2h_pae_shadow) &&
2580 (shadow_index
2581 >= (L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1))));
2582 #else /* SHADOW_PAGING_LEVELS == 2 */
2583 reserved_xen_slot = (shadow_index >= L2_PAGETABLE_FIRST_XEN_SLOT);
2584 #endif
2586 if ( unlikely(reserved_xen_slot) )
2588 // attempt by the guest to write to a xen reserved slot
2589 //
2590 SHADOW_PRINTK("%s out-of-range update "
2591 "sl2mfn=%05lx index=0x%x val=%" SH_PRI_pte "\n",
2592 __func__, mfn_x(sl2mfn), shadow_index, new_sl2e.l2);
2593 if ( shadow_l2e_get_flags(new_sl2e) & _PAGE_PRESENT )
2595 SHADOW_ERROR("out-of-range l2e update\n");
2596 result |= SHADOW_SET_ERROR;
2599 // do not call shadow_set_l2e...
2600 return result;
2603 #endif /* CONFIG_PAGING_LEVELS == SHADOW_PAGING_LEVELS */
2604 #endif /* SHADOW_PAGING_LEVELS < 4 */
2606 result |= shadow_set_l2e(v, sl2p, new_sl2e, sl2mfn);
2608 return result;
2611 static int validate_gl1e(struct vcpu *v, void *new_ge, mfn_t sl1mfn, void *se)
2613 shadow_l1e_t new_sl1e;
2614 guest_l1e_t new_gl1e = *(guest_l1e_t *)new_ge;
2615 shadow_l1e_t *sl1p = se;
2616 gfn_t gfn;
2617 mfn_t gmfn;
2618 p2m_type_t p2mt;
2619 int result = 0;
2620 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
2621 mfn_t gl1mfn;
2622 #endif /* OOS */
2624 perfc_incr(shadow_validate_gl1e_calls);
2626 gfn = guest_l1e_get_gfn(new_gl1e);
2627 gmfn = gfn_to_mfn(v->domain, gfn, &p2mt);
2629 l1e_propagate_from_guest(v, new_gl1e, gmfn, &new_sl1e, ft_prefetch, p2mt);
2630 result |= shadow_set_l1e(v, sl1p, new_sl1e, sl1mfn);
2632 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
2633 gl1mfn = _mfn(mfn_to_shadow_page(sl1mfn)->backpointer);
2634 if ( mfn_valid(gl1mfn)
2635 && mfn_is_out_of_sync(gl1mfn) )
2637 /* Update the OOS snapshot. */
2638 mfn_t snpmfn = oos_snapshot_lookup(v, gl1mfn);
2639 guest_l1e_t *snp;
2641 ASSERT(mfn_valid(snpmfn));
2643 snp = sh_map_domain_page(snpmfn);
2644 snp[guest_index(new_ge)] = new_gl1e;
2645 sh_unmap_domain_page(snp);
2647 #endif /* OOS */
2649 return result;
2652 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
2653 /**************************************************************************/
2654 /* Special validation function for re-syncing out-of-sync shadows.
2655 * Walks the *shadow* page, and for every entry that it finds,
2656 * revalidates the guest entry that corresponds to it.
2657 * N.B. This function is called with the vcpu that unsynced the page,
2658 * *not* the one that is causing it to be resynced. */
2659 void sh_resync_l1(struct vcpu *v, mfn_t gl1mfn, mfn_t snpmfn)
2661 mfn_t sl1mfn;
2662 shadow_l1e_t *sl1p;
2663 guest_l1e_t *gl1p, *gp, *snp;
2664 int rc = 0;
2666 ASSERT(mfn_valid(snpmfn));
2668 sl1mfn = get_shadow_status(v, gl1mfn, SH_type_l1_shadow);
2669 ASSERT(mfn_valid(sl1mfn)); /* Otherwise we would not have been called */
2671 snp = sh_map_domain_page(snpmfn);
2672 gp = sh_map_domain_page(gl1mfn);
2673 gl1p = gp;
2675 SHADOW_FOREACH_L1E(sl1mfn, sl1p, &gl1p, 0, {
2676 guest_l1e_t gl1e = *gl1p;
2677 guest_l1e_t *snpl1p = (guest_l1e_t *)snp + guest_index(gl1p);
2679 if ( memcmp(snpl1p, &gl1e, sizeof(gl1e)) )
2681 gfn_t gfn;
2682 mfn_t gmfn;
2683 p2m_type_t p2mt;
2684 shadow_l1e_t nsl1e;
2686 gfn = guest_l1e_get_gfn(gl1e);
2687 gmfn = gfn_to_mfn(v->domain, gfn, &p2mt);
2688 l1e_propagate_from_guest(v, gl1e, gmfn, &nsl1e, ft_prefetch, p2mt);
2689 rc |= shadow_set_l1e(v, sl1p, nsl1e, sl1mfn);
2691 *snpl1p = gl1e;
2693 });
2695 sh_unmap_domain_page(gp);
2696 sh_unmap_domain_page(snp);
2698 /* Setting shadow L1 entries should never need us to flush the TLB */
2699 ASSERT(!(rc & SHADOW_SET_FLUSH));
2702 /* Figure out whether it's definitely safe not to sync this l1 table.
2703 * That is: if we can tell that it's only used once, and that the
2704 * toplevel shadow responsible is not one of ours.
2705 * N.B. This function is called with the vcpu that required the resync,
2706 * *not* the one that originally unsynced the page, but it is
2707 * called in the *mode* of the vcpu that unsynced it. Clear? Good. */
2708 int sh_safe_not_to_sync(struct vcpu *v, mfn_t gl1mfn)
2710 struct shadow_page_info *sp;
2711 mfn_t smfn;
2713 smfn = get_shadow_status(v, gl1mfn, SH_type_l1_shadow);
2714 ASSERT(mfn_valid(smfn)); /* Otherwise we would not have been called */
2716 /* Up to l2 */
2717 sp = mfn_to_shadow_page(smfn);
2718 if ( sp->count != 1 || !sp->up )
2719 return 0;
2720 smfn = _mfn(sp->up >> PAGE_SHIFT);
2721 ASSERT(mfn_valid(smfn));
2723 #if (SHADOW_PAGING_LEVELS == 4)
2724 /* up to l3 */
2725 sp = mfn_to_shadow_page(smfn);
2726 if ( sp->count != 1 || !sp->up )
2727 return 0;
2728 smfn = _mfn(sp->up >> PAGE_SHIFT);
2729 ASSERT(mfn_valid(smfn));
2731 /* up to l4 */
2732 sp = mfn_to_shadow_page(smfn);
2733 if ( sp->count != 1
2734 || sh_type_is_pinnable(v, SH_type_l3_64_shadow) || !sp->up )
2735 return 0;
2736 smfn = _mfn(sp->up >> PAGE_SHIFT);
2737 ASSERT(mfn_valid(smfn));
2739 #if (GUEST_PAGING_LEVELS == 2)
2740 /* In 2-on-3 shadow mode the up pointer contains the link to the
2741 * shadow page, but the shadow_table contains only the first of the
2742 * four pages that make up the PAE top shadow tables. */
2743 smfn = _mfn(mfn_x(smfn) & ~0x3UL);
2744 #endif
2746 #endif
2748 if ( pagetable_get_pfn(v->arch.shadow_table[0]) == mfn_x(smfn)
2749 #if (SHADOW_PAGING_LEVELS == 3)
2750 || pagetable_get_pfn(v->arch.shadow_table[1]) == mfn_x(smfn)
2751 || pagetable_get_pfn(v->arch.shadow_table[2]) == mfn_x(smfn)
2752 || pagetable_get_pfn(v->arch.shadow_table[3]) == mfn_x(smfn)
2753 #endif
2755 return 0;
2757 /* Only in use in one toplevel shadow, and it's not the one we're
2758 * running on */
2759 return 1;
2761 #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) */
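/* Note (not in the original file): sp->up holds the machine address of the
 * single shadow entry known to reference this shadow, so 'sp->up >>
 * PAGE_SHIFT' in sh_safe_not_to_sync() recovers the mfn of the referencing
 * shadow.  The walk above just follows that chain l1 -> l2 (-> l3 -> l4)
 * and bails out as soon as a shadow has more than one user, no recorded
 * parent, or turns out to be one of the running vcpu's own top-level
 * shadows. */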
2764 /**************************************************************************/
2765 /* Functions which translate and install the shadows of arbitrary guest
2766 * entries that we have just seen the guest write. */
2769 static inline int
2770 sh_map_and_validate(struct vcpu *v, mfn_t gmfn,
2771 void *new_gp, u32 size, u32 sh_type,
2772 u32 (*shadow_index)(mfn_t *smfn, u32 idx),
2773 int (*validate_ge)(struct vcpu *v, void *ge,
2774 mfn_t smfn, void *se))
2775 /* Generic function for mapping and validating. */
2777 mfn_t smfn, smfn2, map_mfn;
2778 shadow_l1e_t *sl1p;
2779 u32 shadow_idx, guest_idx;
2780 int result = 0;
2782 /* Align address and size to guest entry boundaries */
2783 size += (unsigned long)new_gp & (sizeof (guest_l1e_t) - 1);
2784 new_gp = (void *)((unsigned long)new_gp & ~(sizeof (guest_l1e_t) - 1));
2785 size = (size + sizeof (guest_l1e_t) - 1) & ~(sizeof (guest_l1e_t) - 1);
2786 ASSERT(size + (((unsigned long)new_gp) & ~PAGE_MASK) <= PAGE_SIZE);
2788 /* Map the shadow page */
2789 smfn = get_shadow_status(v, gmfn, sh_type);
2790 ASSERT(mfn_valid(smfn)); /* Otherwise we would not have been called */
2791 guest_idx = guest_index(new_gp);
2792 map_mfn = smfn;
2793 shadow_idx = shadow_index(&map_mfn, guest_idx);
2794 sl1p = sh_map_domain_page(map_mfn);
2796 /* Validate one entry at a time */
2797 while ( size )
2799 smfn2 = smfn;
2800 guest_idx = guest_index(new_gp);
2801 shadow_idx = shadow_index(&smfn2, guest_idx);
2802 if ( mfn_x(smfn2) != mfn_x(map_mfn) )
2804 /* We have moved to another page of the shadow */
2805 map_mfn = smfn2;
2806 sh_unmap_domain_page(sl1p);
2807 sl1p = sh_map_domain_page(map_mfn);
2809 result |= validate_ge(v,
2810 new_gp,
2811 map_mfn,
2812 &sl1p[shadow_idx]);
2813 size -= sizeof(guest_l1e_t);
2814 new_gp += sizeof(guest_l1e_t);
2816 sh_unmap_domain_page(sl1p);
2817 return result;
2821 int
2822 sh_map_and_validate_gl4e(struct vcpu *v, mfn_t gl4mfn,
2823 void *new_gl4p, u32 size)
2825 #if GUEST_PAGING_LEVELS >= 4
2826 return sh_map_and_validate(v, gl4mfn, new_gl4p, size,
2827 SH_type_l4_shadow,
2828 shadow_l4_index,
2829 validate_gl4e);
2830 #else // ! GUEST_PAGING_LEVELS >= 4
2831 SHADOW_ERROR("called in wrong paging mode!\n");
2832 BUG();
2833 return 0;
2834 #endif
2837 int
2838 sh_map_and_validate_gl3e(struct vcpu *v, mfn_t gl3mfn,
2839 void *new_gl3p, u32 size)
2841 #if GUEST_PAGING_LEVELS >= 4
2842 return sh_map_and_validate(v, gl3mfn, new_gl3p, size,
2843 SH_type_l3_shadow,
2844 shadow_l3_index,
2845 validate_gl3e);
2846 #else // ! GUEST_PAGING_LEVELS >= 4
2847 SHADOW_ERROR("called in wrong paging mode!\n");
2848 BUG();
2849 return 0;
2850 #endif
2853 int
2854 sh_map_and_validate_gl2e(struct vcpu *v, mfn_t gl2mfn,
2855 void *new_gl2p, u32 size)
2857 return sh_map_and_validate(v, gl2mfn, new_gl2p, size,
2858 SH_type_l2_shadow,
2859 shadow_l2_index,
2860 validate_gl2e);
2863 int
2864 sh_map_and_validate_gl2he(struct vcpu *v, mfn_t gl2mfn,
2865 void *new_gl2p, u32 size)
2867 #if GUEST_PAGING_LEVELS >= 3
2868 return sh_map_and_validate(v, gl2mfn, new_gl2p, size,
2869 SH_type_l2h_shadow,
2870 shadow_l2_index,
2871 validate_gl2e);
2872 #else /* Non-PAE guests don't have different kinds of l2 table */
2873 SHADOW_ERROR("called in wrong paging mode!\n");
2874 BUG();
2875 return 0;
2876 #endif
2879 int
2880 sh_map_and_validate_gl1e(struct vcpu *v, mfn_t gl1mfn,
2881 void *new_gl1p, u32 size)
2883 return sh_map_and_validate(v, gl1mfn, new_gl1p, size,
2884 SH_type_l1_shadow,
2885 shadow_l1_index,
2886 validate_gl1e);
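/* Illustration (not part of the original file): these wrappers are what
 * the common pagetable-write emulation calls when the guest modifies one
 * of its own pagetables.  A minimal sketch of a caller -- 'gmfn', 'entry'
 * and 'bytes' are hypothetical names for the written frame, the new data
 * and its length: */
#if 0
    int rc = sh_map_and_validate_gl1e(v, gmfn, entry, bytes);
    if ( rc & SHADOW_SET_ERROR )
        /* the new entry could not be shadowed */;
    if ( rc & SHADOW_SET_FLUSH )
        flush_tlb_mask(v->domain->domain_dirty_cpumask);
#endif /* illustration only */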
2890 /**************************************************************************/
2891 /* Optimization: If we see two emulated writes of zeros to the same
2892 * page-table without another kind of page fault in between, we guess
2893 * that this is a batch of changes (for process destruction) and
2894 * unshadow the page so we don't take a pagefault on every entry. This
2895 * should also make finding writeable mappings of pagetables much
2896 * easier. */
2898 /* Look to see if this is the second emulated write in a row to this
2899 * page, and unshadow if it is */
2900 static inline void check_for_early_unshadow(struct vcpu *v, mfn_t gmfn)
2902 #if SHADOW_OPTIMIZATIONS & SHOPT_EARLY_UNSHADOW
2903 if ( v->arch.paging.shadow.last_emulated_mfn_for_unshadow == mfn_x(gmfn)
2904 && sh_mfn_is_a_page_table(gmfn) )
2906 perfc_incr(shadow_early_unshadow);
2907 sh_remove_shadows(v, gmfn, 1, 0 /* Fast, can fail to unshadow */ );
2908 TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_EARLY_UNSHADOW);
2910 v->arch.paging.shadow.last_emulated_mfn_for_unshadow = mfn_x(gmfn);
2911 #endif
2914 /* Stop counting towards early unshadows, as we've seen a real page fault */
2915 static inline void reset_early_unshadow(struct vcpu *v)
2917 #if SHADOW_OPTIMIZATIONS & SHOPT_EARLY_UNSHADOW
2918 v->arch.paging.shadow.last_emulated_mfn_for_unshadow = INVALID_MFN;
2919 #endif
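/* Worked example (not in the original file): when a guest tears down an
 * address space it typically zeroes whole pagetable pages.  The first
 * emulated write records the mfn; a second emulated write to the same mfn
 * with no intervening real fault (which would call reset_early_unshadow())
 * hits the check above and speculatively unshadows the page, so the rest
 * of the zeroing runs without faulting on every entry. */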
2924 /**************************************************************************/
2925 /* Optimization: Prefetch multiple L1 entries. This is called after we have
2926 * demand-faulted a shadow l1e in the fault handler, to see if it's
2927 * worth fetching some more.
2928 */
2930 #if SHADOW_OPTIMIZATIONS & SHOPT_PREFETCH
2932 /* XXX magic number */
2933 #define PREFETCH_DISTANCE 32
2935 static void sh_prefetch(struct vcpu *v, walk_t *gw,
2936 shadow_l1e_t *ptr_sl1e, mfn_t sl1mfn)
2938 int i, dist;
2939 gfn_t gfn;
2940 mfn_t gmfn;
2941 guest_l1e_t *gl1p = NULL, gl1e;
2942 shadow_l1e_t sl1e;
2943 u32 gflags;
2944 p2m_type_t p2mt;
2945 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
2946 guest_l1e_t *snpl1p = NULL;
2947 #endif /* OOS */
2950 /* Prefetch no further than the end of the _shadow_ l1 MFN */
2951 dist = (PAGE_SIZE - ((unsigned long)ptr_sl1e & ~PAGE_MASK)) / sizeof sl1e;
2952 /* And no more than a maximum fetches-per-fault */
2953 if ( dist > PREFETCH_DISTANCE )
2954 dist = PREFETCH_DISTANCE;
2956 if ( mfn_valid(gw->l1mfn) )
2958 /* Normal guest page; grab the next guest entry */
2959 gl1p = sh_map_domain_page(gw->l1mfn);
2960 gl1p += guest_l1_table_offset(gw->va);
2962 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
2963 if ( mfn_is_out_of_sync(gw->l1mfn) )
2965 mfn_t snpmfn = oos_snapshot_lookup(v, gw->l1mfn);
2967 ASSERT(mfn_valid(snpmfn));
2968 snpl1p = sh_map_domain_page(snpmfn);
2969 snpl1p += guest_l1_table_offset(gw->va);
2971 #endif /* OOS */
2974 for ( i = 1; i < dist ; i++ )
2976 /* No point in prefetching if there's already a shadow */
2977 if ( ptr_sl1e[i].l1 != 0 )
2978 break;
2980 if ( mfn_valid(gw->l1mfn) )
2982 /* Normal guest page; grab the next guest entry */
2983 gl1e = gl1p[i];
2984 /* Not worth continuing if we hit an entry that will need another
2985 * fault for A/D-bit propagation anyway */
2986 gflags = guest_l1e_get_flags(gl1e);
2987 if ( (gflags & _PAGE_PRESENT)
2988 && (!(gflags & _PAGE_ACCESSED)
2989 || ((gflags & _PAGE_RW) && !(gflags & _PAGE_DIRTY))) )
2990 break;
2992 else
2994 /* Fragmented superpage, unless we've been called wrongly */
2995 ASSERT(guest_l2e_get_flags(gw->l2e) & _PAGE_PSE);
2996 /* Increment the l1e's GFN by the right number of guest pages */
2997 gl1e = guest_l1e_from_gfn(
2998 _gfn(gfn_x(guest_l1e_get_gfn(gw->l1e)) + i),
2999 guest_l1e_get_flags(gw->l1e));
3002 /* Look at the gfn that the l1e is pointing at */
3003 gfn = guest_l1e_get_gfn(gl1e);
3004 gmfn = gfn_to_mfn(v->domain, gfn, &p2mt);
3006 /* Propagate the entry. */
3007 l1e_propagate_from_guest(v, gl1e, gmfn, &sl1e, ft_prefetch, p2mt);
3008 (void) shadow_set_l1e(v, ptr_sl1e + i, sl1e, sl1mfn);
3010 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
3011 if ( snpl1p != NULL )
3012 snpl1p[i] = gl1e;
3013 #endif /* OOS */
3015 if ( gl1p != NULL )
3016 sh_unmap_domain_page(gl1p);
3017 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
3018 if ( snpl1p != NULL )
3019 sh_unmap_domain_page(snpl1p);
3020 #endif /* OOS */
3023 #endif /* SHADOW_OPTIMIZATIONS & SHOPT_PREFETCH */
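/* Worked example (not in the original file): with 8-byte shadow l1es, a
 * fault on the entry at byte offset 0xf80 of its shadow page gives
 * dist = (0x1000 - 0xf80) / 8 = 16, so at most 15 further entries are
 * prefetched; for faults near the start of the page the PREFETCH_DISTANCE
 * cap of 32 applies instead. */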
3025 #if GUEST_PAGING_LEVELS == 4
3026 typedef u64 guest_va_t;
3027 typedef u64 guest_pa_t;
3028 #elif GUEST_PAGING_LEVELS == 3
3029 typedef u32 guest_va_t;
3030 typedef u64 guest_pa_t;
3031 #else
3032 typedef u32 guest_va_t;
3033 typedef u32 guest_pa_t;
3034 #endif
3036 static inline void trace_shadow_gen(u32 event, guest_va_t va)
3038 if ( tb_init_done )
3040 event |= (GUEST_PAGING_LEVELS-2)<<8;
3041 __trace_var(event, 0/*!tsc*/, sizeof(va), (unsigned char*)&va);
3045 static inline void trace_shadow_fixup(guest_l1e_t gl1e,
3046 guest_va_t va)
3048 if ( tb_init_done )
3050 struct {
3051 /* for PAE, guest_l1e may be 64 while guest_va may be 32;
3052 so put it first for alignment's sake. */
3053 guest_l1e_t gl1e;
3054 guest_va_t va;
3055 u32 flags;
3056 } __attribute__((packed)) d;
3057 u32 event;
3059 event = TRC_SHADOW_FIXUP | ((GUEST_PAGING_LEVELS-2)<<8);
3061 d.gl1e = gl1e;
3062 d.va = va;
3063 d.flags = this_cpu(trace_shadow_path_flags);
3065 __trace_var(event, 0/*!tsc*/, sizeof(d), (unsigned char*)&d);
3069 static inline void trace_not_shadow_fault(guest_l1e_t gl1e,
3070 guest_va_t va)
3072 if ( tb_init_done )
3074 struct {
3075 /* for PAE, guest_l1e may be 64 while guest_va may be 32;
3076 so put it first for alignment's sake. */
3077 guest_l1e_t gl1e;
3078 guest_va_t va;
3079 u32 flags;
3080 } __attribute__((packed)) d;
3081 u32 event;
3083 event = TRC_SHADOW_NOT_SHADOW | ((GUEST_PAGING_LEVELS-2)<<8);
3085 d.gl1e = gl1e;
3086 d.va = va;
3087 d.flags = this_cpu(trace_shadow_path_flags);
3089 __trace_var(event, 0/*!tsc*/, sizeof(d), (unsigned char*)&d);
3093 static inline void trace_shadow_emulate_other(u32 event,
3094 guest_va_t va,
3095 gfn_t gfn)
3097 if ( tb_init_done )
3099 struct {
3100 /* for PAE, guest_l1e may be 64 while guest_va may be 32;
3101 so put it first for alignment's sake. */
3102 #if GUEST_PAGING_LEVELS == 2
3103 u32 gfn;
3104 #else
3105 u64 gfn;
3106 #endif
3107 guest_va_t va;
3108 } __attribute__((packed)) d;
3110 event |= ((GUEST_PAGING_LEVELS-2)<<8);
3112 d.gfn=gfn_x(gfn);
3113 d.va = va;
3115 __trace_var(event, 0/*!tsc*/, sizeof(d), (unsigned char*)&d);
3119 #if GUEST_PAGING_LEVELS == 3
3120 static DEFINE_PER_CPU(guest_va_t,trace_emulate_initial_va);
3121 static DEFINE_PER_CPU(int,trace_extra_emulation_count);
3122 #endif
3123 static DEFINE_PER_CPU(guest_pa_t,trace_emulate_write_val);
3125 static inline void trace_shadow_emulate(guest_l1e_t gl1e, unsigned long va)
3127 if ( tb_init_done )
3129 struct {
3130 /* for PAE, guest_l1e may be 64 while guest_va may be 32;
3131 so put it first for alignment's sake. */
3132 guest_l1e_t gl1e, write_val;
3133 guest_va_t va;
3134 unsigned flags:29, emulation_count:3;
3135 } __attribute__((packed)) d;
3136 u32 event;
3138 event = TRC_SHADOW_EMULATE | ((GUEST_PAGING_LEVELS-2)<<8);
3140 d.gl1e = gl1e;
3141 d.write_val.l1 = this_cpu(trace_emulate_write_val);
3142 d.va = va;
3143 #if GUEST_PAGING_LEVELS == 3
3144 d.emulation_count = this_cpu(trace_extra_emulation_count);
3145 #endif
3146 d.flags = this_cpu(trace_shadow_path_flags);
3148 __trace_var(event, 0/*!tsc*/, sizeof(d), (unsigned char*)&d);
3152 /**************************************************************************/
3153 /* Entry points into the shadow code */
3155 /* Called from pagefault handler in Xen, and from the HVM trap handlers
3156 * for pagefaults. Returns 1 if this fault was an artefact of the
3157 * shadow code (and the guest should retry) or 0 if it is not (and the
3158 * fault should be handled elsewhere or passed to the guest). */
3160 static int sh_page_fault(struct vcpu *v,
3161 unsigned long va,
3162 struct cpu_user_regs *regs)
3164 struct domain *d = v->domain;
3165 walk_t gw;
3166 gfn_t gfn = _gfn(0);
3167 mfn_t gmfn, sl1mfn = _mfn(0);
3168 shadow_l1e_t sl1e, *ptr_sl1e;
3169 paddr_t gpa;
3170 struct sh_emulate_ctxt emul_ctxt;
3171 struct x86_emulate_ops *emul_ops;
3172 int r;
3173 fetch_type_t ft = 0;
3174 p2m_type_t p2mt;
3175 uint32_t rc;
3176 #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
3177 int fast_emul = 0;
3178 #endif
3180 SHADOW_PRINTK("d:v=%u:%u va=%#lx err=%u, rip=%lx\n",
3181 v->domain->domain_id, v->vcpu_id, va, regs->error_code,
3182 regs->eip);
3184 perfc_incr(shadow_fault);
3186 #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
3187 /* If the faulting frame was successfully emulated on the last shadow
3188 * fault, it is very likely that the same emulation action applies to
3189 * this frame as well, so try to emulate early and avoid lock acquisition.
3190 */
3191 if ( v->arch.paging.last_write_emul_ok
3192 && v->arch.paging.shadow.last_emulated_frame == (va >> PAGE_SHIFT) )
3194 /* Check whether the error code is 3 (a write fault on a present page);
3195 * otherwise fall back to the normal path in case some validation is
3196 * required. */
3197 if ( regs->error_code == (PFEC_write_access | PFEC_page_present) )
3199 fast_emul = 1;
3200 gmfn = _mfn(v->arch.paging.shadow.last_emulated_mfn);
3202 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
3203 /* Fall back to the slow path if we're trying to emulate
3204 writes to an out of sync page. */
3205 if ( mfn_valid(gmfn) && mfn_is_out_of_sync(gmfn) )
3207 v->arch.paging.last_write_emul_ok = 0;
3208 goto page_fault_slow_path;
3210 #endif /* OOS */
3212 perfc_incr(shadow_fault_fast_emulate);
3213 goto early_emulation;
3215 else
3216 v->arch.paging.last_write_emul_ok = 0;
3218 #endif
3220 //
3221 // XXX: Need to think about eventually mapping superpages directly in the
3222 // shadow (when possible), as opposed to splintering them into a
3223 // bunch of 4K maps.
3224 //
3226 #if (SHADOW_OPTIMIZATIONS & SHOPT_FAST_FAULT_PATH)
3227 if ( (regs->error_code & PFEC_reserved_bit) )
3229 /* The only reasons for reserved bits to be set in shadow entries
3230 * are the two "magic" shadow_l1e entries. */
3231 if ( likely((__copy_from_user(&sl1e,
3232 (sh_linear_l1_table(v)
3233 + shadow_l1_linear_offset(va)),
3234 sizeof(sl1e)) == 0)
3235 && sh_l1e_is_magic(sl1e)) )
3237 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
3238 /* First, need to check that this isn't an out-of-sync
3239 * shadow l1e. If it is, we fall back to the slow path, which
3240 * will sync it up again. */
3242 shadow_l2e_t sl2e;
3243 mfn_t gl1mfn;
3244 if ( (__copy_from_user(&sl2e,
3245 (sh_linear_l2_table(v)
3246 + shadow_l2_linear_offset(va)),
3247 sizeof(sl2e)) != 0)
3248 || !(shadow_l2e_get_flags(sl2e) & _PAGE_PRESENT)
3249 || !mfn_valid(gl1mfn = _mfn(mfn_to_shadow_page(
3250 shadow_l2e_get_mfn(sl2e))->backpointer))
3251 || unlikely(mfn_is_out_of_sync(gl1mfn)) )
3253 /* Hit the slow path as if there had been no
3254 * shadow entry at all, and let it tidy up */
3255 ASSERT(regs->error_code & PFEC_page_present);
3256 regs->error_code ^= (PFEC_reserved_bit|PFEC_page_present);
3257 goto page_fault_slow_path;
3260 #endif /* SHOPT_OUT_OF_SYNC */
3262 if ( sh_l1e_is_gnp(sl1e) )
3264 /* Not-present in a guest PT: pass to the guest as
3265 * a not-present fault (by flipping two bits). */
3266 ASSERT(regs->error_code & PFEC_page_present);
3267 regs->error_code ^= (PFEC_reserved_bit|PFEC_page_present);
3268 reset_early_unshadow(v);
3269 perfc_incr(shadow_fault_fast_gnp);
3270 SHADOW_PRINTK("fast path not-present\n");
3271 trace_shadow_gen(TRC_SHADOW_FAST_PROPAGATE, va);
3272 return 0;
3274 else
3276 /* Magic MMIO marker: extract gfn for MMIO address */
3277 ASSERT(sh_l1e_is_mmio(sl1e));
3278 gpa = (((paddr_t)(gfn_x(sh_l1e_mmio_get_gfn(sl1e))))
3279 << PAGE_SHIFT)
3280 | (va & ~PAGE_MASK);
3282 perfc_incr(shadow_fault_fast_mmio);
3283 SHADOW_PRINTK("fast path mmio %#"PRIpaddr"\n", gpa);
3284 reset_early_unshadow(v);
3285 trace_shadow_gen(TRC_SHADOW_FAST_MMIO, va);
3286 return (handle_mmio_with_translation(va, gpa >> PAGE_SHIFT)
3287 ? EXCRET_fault_fixed : 0);
3289 else
3291 /* This should be exceptionally rare: another vcpu has fixed
3292 * the tables between the fault and our reading the l1e.
3293 * Retry and let the hardware give us the right fault next time. */
3294 perfc_incr(shadow_fault_fast_fail);
3295 SHADOW_PRINTK("fast path false alarm!\n");
3296 trace_shadow_gen(TRC_SHADOW_FALSE_FAST_PATH, va);
3297 return EXCRET_fault_fixed;
3301 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
3302 page_fault_slow_path:
3303 #endif
3304 #endif /* SHOPT_FAST_FAULT_PATH */
3306 /* Detect if this page fault happened while we were already in Xen
3307 * doing a shadow operation. If that happens, the only thing we can
3308 * do is let Xen's normal fault handlers try to fix it. In any case,
3309 * a diagnostic trace of the fault will be more useful than
3310 * a BUG() when we try to take the lock again. */
3311 if ( unlikely(shadow_locked_by_me(d)) )
3313 SHADOW_ERROR("Recursive shadow fault: lock was taken by %s\n",
3314 d->arch.paging.shadow.locker_function);
3315 return 0;
3318 rewalk:
3319 rc = guest_walk_tables(v, va, &gw, regs->error_code);
3321 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
3322 regs->error_code &= ~PFEC_page_present;
3323 if ( !(rc & _PAGE_PRESENT) )
3324 regs->error_code |= PFEC_page_present;
3325 #endif
3327 if ( rc != 0 )
3329 perfc_incr(shadow_fault_bail_real_fault);
3330 SHADOW_PRINTK("not a shadow fault\n");
3331 reset_early_unshadow(v);
3332 goto propagate;
3335 /* It's possible that the guest has put pagetables in memory that it has
3336 * already used for some special purpose (ioreq pages, or granted pages).
3337 * If that happens we'll have killed the guest already but it's still not
3338 * safe to propagate entries out of the guest PT so get out now. */
3339 if ( unlikely(d->is_shutting_down) )
3341 SHADOW_PRINTK("guest is shutting down\n");
3342 goto propagate;
3345 /* What kind of access are we dealing with? */
3346 ft = ((regs->error_code & PFEC_write_access)
3347 ? ft_demand_write : ft_demand_read);
3349 /* What mfn is the guest trying to access? */
3350 gfn = guest_l1e_get_gfn(gw.l1e);
3351 gmfn = gfn_to_mfn(d, gfn, &p2mt);
3353 if ( shadow_mode_refcounts(d) &&
3354 (!p2m_is_valid(p2mt) || (!p2m_is_mmio(p2mt) && !mfn_valid(gmfn))) )
3356 perfc_incr(shadow_fault_bail_bad_gfn);
3357 SHADOW_PRINTK("BAD gfn=%"SH_PRI_gfn" gmfn=%"PRI_mfn"\n",
3358 gfn_x(gfn), mfn_x(gmfn));
3359 reset_early_unshadow(v);
3360 goto propagate;
3363 #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
3364 /* Remember this successful VA->GFN translation for later. */
3365 vtlb_insert(v, va >> PAGE_SHIFT, gfn_x(gfn),
3366 regs->error_code | PFEC_page_present);
3367 #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */
3369 shadow_lock(d);
3371 TRACE_CLEAR_PATH_FLAGS;
3373 rc = gw_remove_write_accesses(v, va, &gw);
3375 /* First bit set: Removed write access to a page. */
3376 if ( rc & GW_RMWR_FLUSHTLB )
3378 /* Write permission removal is also a hint that other gwalks
3379 * overlapping with this one may be inconsistent
3380 */
3381 perfc_incr(shadow_rm_write_flush_tlb);
3382 atomic_inc(&d->arch.paging.shadow.gtable_dirty_version);
3383 flush_tlb_mask(d->domain_dirty_cpumask);
3386 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
3387 /* Second bit set: Resynced a page. Re-walk needed. */
3388 if ( rc & GW_RMWR_REWALK )
3390 shadow_unlock(d);
3391 goto rewalk;
3393 #endif /* OOS */
3395 if ( !shadow_check_gwalk(v, va, &gw) )
3397 perfc_incr(shadow_inconsistent_gwalk);
3398 shadow_unlock(d);
3399 goto rewalk;
3402 shadow_audit_tables(v);
3403 sh_audit_gw(v, &gw);
3405 /* Make sure there is enough free shadow memory to build a chain of
3406 * shadow tables. (We never allocate a top-level shadow on this path,
3407 * only a 32b l1, pae l1, or 64b l3+2+1. Note that while
3408 * SH_type_l1_shadow isn't correct in the latter case, all page
3409 * tables are the same size there.) */
3410 shadow_prealloc(d,
3411 SH_type_l1_shadow,
3412 GUEST_PAGING_LEVELS < 4 ? 1 : GUEST_PAGING_LEVELS - 1);
3414 /* Acquire the shadow. This must happen before we figure out the rights
3415 * for the shadow entry, since we might promote a page here. */
3416 ptr_sl1e = shadow_get_and_create_l1e(v, &gw, &sl1mfn, ft);
3417 if ( unlikely(ptr_sl1e == NULL) )
3419 /* Couldn't get the sl1e! Since we know the guest entries
3420 * are OK, this can only have been caused by a failed
3421 * shadow_set_l*e(), which will have crashed the guest.
3422 * Get out of the fault handler immediately. */
3423 ASSERT(d->is_shutting_down);
3424 shadow_unlock(d);
3425 trace_shadow_gen(TRC_SHADOW_DOMF_DYING, va);
3426 return 0;
3429 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
3430 /* Always unsync when writing to L1 page tables. */
3431 if ( sh_mfn_is_a_page_table(gmfn)
3432 && ft == ft_demand_write )
3433 sh_unsync(v, gmfn);
3435 if ( unlikely(d->is_shutting_down) )
3437 /* We might end up with a crashed domain here if
3438 * sh_remove_shadows() in a previous sh_resync() call has
3439 * failed. We cannot safely continue since some page is still
3440 * OOS but not in the hash table anymore. */
3441 shadow_unlock(d);
3442 return 0;
3444 #endif /* OOS */
3446 /* Calculate the shadow entry and write it */
3447 l1e_propagate_from_guest(v, gw.l1e, gmfn, &sl1e, ft, p2mt);
3448 r = shadow_set_l1e(v, ptr_sl1e, sl1e, sl1mfn);
3450 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
3451 if ( mfn_valid(gw.l1mfn)
3452 && mfn_is_out_of_sync(gw.l1mfn) )
3454 /* Update the OOS snapshot. */
3455 mfn_t snpmfn = oos_snapshot_lookup(v, gw.l1mfn);
3456 guest_l1e_t *snp;
3458 ASSERT(mfn_valid(snpmfn));
3460 snp = sh_map_domain_page(snpmfn);
3461 snp[guest_l1_table_offset(va)] = gw.l1e;
3462 sh_unmap_domain_page(snp);
3464 #endif /* OOS */
3466 #if SHADOW_OPTIMIZATIONS & SHOPT_PREFETCH
3467 /* Prefetch some more shadow entries */
3468 sh_prefetch(v, &gw, ptr_sl1e, sl1mfn);
3469 #endif
3471 /* Need to emulate accesses to page tables */
3472 if ( sh_mfn_is_a_page_table(gmfn)
3473 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
3474 /* Unless they've been allowed to go out of sync with their
3475 shadows and we don't need to unshadow it. */
3476 && !(mfn_is_out_of_sync(gmfn)
3477 && !(regs->error_code & PFEC_user_mode))
3478 #endif
3481 if ( ft == ft_demand_write )
3483 perfc_incr(shadow_fault_emulate_write);
3484 goto emulate;
3486 else if ( shadow_mode_trap_reads(d) && ft == ft_demand_read )
3488 perfc_incr(shadow_fault_emulate_read);
3489 goto emulate;
3493 /* Need to hand off device-model MMIO to the device model */
3494 if ( p2mt == p2m_mmio_dm )
3496 gpa = guest_walk_to_gpa(&gw);
3497 goto mmio;
3500 /* Log attempts to write to read-only memory */
3501 if ( (p2mt == p2m_ram_ro) && (ft == ft_demand_write) )
3503 static unsigned long lastpage = 0;
3504 if ( xchg(&lastpage, va & PAGE_MASK) != (va & PAGE_MASK) )
3505 gdprintk(XENLOG_DEBUG, "guest attempted write to read-only memory"
3506 " page. va page=%#lx, mfn=%#lx\n",
3507 va & PAGE_MASK, mfn_x(gmfn));
3508 goto emulate_readonly; /* skip over the instruction */
3511 /* In HVM guests, we force CR0.WP always to be set, so that the
3512 * pagetables are always write-protected. If the guest thinks
3513 * CR0.WP is clear, we must emulate faulting supervisor writes to
3514 * allow the guest to write through read-only PTEs. Emulate if the
3515 * fault was a non-user write to a present page. */
3516 if ( is_hvm_domain(d)
3517 && unlikely(!hvm_wp_enabled(v))
3518 && regs->error_code == (PFEC_write_access|PFEC_page_present) )
3520 perfc_incr(shadow_fault_emulate_wp);
3521 goto emulate;
3524 perfc_incr(shadow_fault_fixed);
3525 d->arch.paging.log_dirty.fault_count++;
3526 reset_early_unshadow(v);
3528 trace_shadow_fixup(gw.l1e, va);
3529 done:
3530 sh_audit_gw(v, &gw);
3531 SHADOW_PRINTK("fixed\n");
3532 shadow_audit_tables(v);
3533 shadow_unlock(d);
3534 return EXCRET_fault_fixed;
3536 emulate:
3537 if ( !shadow_mode_refcounts(d) || !guest_mode(regs) )
3538 goto not_a_shadow_fault;
3540 /*
3541 * We do not emulate user writes. Instead we use them as a hint that the
3542 * page is no longer a page table. This behaviour differs from native, but
3543 * it seems very unlikely that any OS grants user access to page tables.
3544 */
3545 if ( (regs->error_code & PFEC_user_mode) )
3547 SHADOW_PRINTK("user-mode fault to PT, unshadowing mfn %#lx\n",
3548 mfn_x(gmfn));
3549 perfc_incr(shadow_fault_emulate_failed);
3550 sh_remove_shadows(v, gmfn, 0 /* thorough */, 1 /* must succeed */);
3551 trace_shadow_emulate_other(TRC_SHADOW_EMULATE_UNSHADOW_USER,
3552 va, gfn);
3553 goto done;
3556 /*
3557 * Writes from userspace to read-only memory jump here, to avoid being
3558 * caught by the user-mode page-table check above.
3559 */
3560 emulate_readonly:
3561 /*
3562 * We don't need to hold the lock for the whole emulation; we will
3563 * take it again when we write to the pagetables.
3564 */
3565 sh_audit_gw(v, &gw);
3566 shadow_audit_tables(v);
3567 shadow_unlock(d);
3569 this_cpu(trace_emulate_write_val) = 0;
3571 #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
3572 early_emulation:
3573 #endif
3574 if ( is_hvm_domain(d) )
3576 /*
3577 * If we are in the middle of injecting an exception or interrupt then
3578 * we should not emulate: it is not the instruction at %eip that caused
3579 * the fault. Furthermore it is almost certainly the case that the handler
3580 * stack is currently considered to be a page table, so we should
3581 * unshadow the faulting page before exiting.
3582 */
3583 if ( unlikely(hvm_event_pending(v)) )
3585 #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
3586 if ( fast_emul )
3588 perfc_incr(shadow_fault_fast_emulate_fail);
3589 v->arch.paging.last_write_emul_ok = 0;
3591 #endif
3592 gdprintk(XENLOG_DEBUG, "write to pagetable during event "
3593 "injection: cr2=%#lx, mfn=%#lx\n",
3594 va, mfn_x(gmfn));
3595 sh_remove_shadows(v, gmfn, 0 /* thorough */, 1 /* must succeed */);
3596 trace_shadow_emulate_other(TRC_SHADOW_EMULATE_UNSHADOW_EVTINJ,
3597 va, gfn);
3598 return EXCRET_fault_fixed;
3602 SHADOW_PRINTK("emulate: eip=%#lx esp=%#lx\n",
3603 (unsigned long)regs->eip, (unsigned long)regs->esp);
3605 emul_ops = shadow_init_emulation(&emul_ctxt, regs);
3607 r = x86_emulate(&emul_ctxt.ctxt, emul_ops);
3609 /*
3610 * NB. We do not unshadow on X86EMUL_EXCEPTION. It's not clear that it
3611 * would be a good unshadow hint. If we *do* decide to unshadow-on-fault
3612 * then it must be 'failable': we cannot require the unshadow to succeed.
3613 */
3614 if ( r == X86EMUL_UNHANDLEABLE )
3616 perfc_incr(shadow_fault_emulate_failed);
3617 #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
3618 if ( fast_emul )
3620 perfc_incr(shadow_fault_fast_emulate_fail);
3621 v->arch.paging.last_write_emul_ok = 0;
3623 #endif
3624 SHADOW_PRINTK("emulator failure, unshadowing mfn %#lx\n",
3625 mfn_x(gmfn));
3626 /* If this is actually a page table, then we have a bug, and need
3627 * to support more operations in the emulator. More likely,
3628 * though, this is a hint that this page should not be shadowed. */
3629 shadow_remove_all_shadows(v, gmfn);
3631 trace_shadow_emulate_other(TRC_SHADOW_EMULATE_UNSHADOW_UNHANDLED,
3632 va, gfn);
3633 goto emulate_done;
3636 #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
3637 /* Record a successful emulation as a heuristic to accelerate the next
3638 * fault on the same frame. But be careful to check that the frame is
3639 * still a page table: an unshadow triggered by write emulation
3640 * normally requires a re-sync with the guest page table to recover
3641 * its r/w permissions, and recording a stale hit in that case would
3642 * skip propagation and cause unexpected extra shadow faults on the
3643 * same frame.
3644 */
3645 if ( (r == X86EMUL_OKAY) && sh_mfn_is_a_page_table(gmfn) )
3647 if ( !fast_emul )
3649 v->arch.paging.shadow.last_emulated_frame = va >> PAGE_SHIFT;
3650 v->arch.paging.shadow.last_emulated_mfn = mfn_x(gmfn);
3651 v->arch.paging.last_write_emul_ok = 1;
3654 else if ( fast_emul )
3655 v->arch.paging.last_write_emul_ok = 0;
3656 #endif
3658 #if GUEST_PAGING_LEVELS == 3 /* PAE guest */
3659 if ( r == X86EMUL_OKAY ) {
3660 int i, emulation_count=0;
3661 this_cpu(trace_emulate_initial_va) = va;
3662 /* Emulate up to four extra instructions in the hope of catching
3663 * the "second half" of a 64-bit pagetable write. */
3664 for ( i = 0 ; i < 4 ; i++ )
3666 shadow_continue_emulation(&emul_ctxt, regs);
3667 v->arch.paging.last_write_was_pt = 0;
3668 r = x86_emulate(&emul_ctxt.ctxt, emul_ops);
3669 if ( r == X86EMUL_OKAY )
3671 emulation_count++;
3672 if ( v->arch.paging.last_write_was_pt )
3674 perfc_incr(shadow_em_ex_pt);
3675 TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_EMULATION_2ND_PT_WRITTEN);
3676 break; /* Don't emulate past the other half of the write */
3678 else
3679 perfc_incr(shadow_em_ex_non_pt);
3681 else
3683 perfc_incr(shadow_em_ex_fail);
3684 TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_EMULATION_LAST_FAILED);
3685 break; /* Don't emulate again if we failed! */
3688 this_cpu(trace_extra_emulation_count)=emulation_count;
3690 #endif /* PAE guest */
3692 trace_shadow_emulate(gw.l1e, va);
3693 emulate_done:
3694 SHADOW_PRINTK("emulated\n");
3695 return EXCRET_fault_fixed;
3697 mmio:
3698 if ( !guest_mode(regs) )
3699 goto not_a_shadow_fault;
3700 perfc_incr(shadow_fault_mmio);
3701 sh_audit_gw(v, &gw);
3702 SHADOW_PRINTK("mmio %#"PRIpaddr"\n", gpa);
3703 shadow_audit_tables(v);
3704 reset_early_unshadow(v);
3705 shadow_unlock(d);
3706 trace_shadow_gen(TRC_SHADOW_MMIO, va);
3707 return (handle_mmio_with_translation(va, gpa >> PAGE_SHIFT)
3708 ? EXCRET_fault_fixed : 0);
3710 not_a_shadow_fault:
3711 sh_audit_gw(v, &gw);
3712 SHADOW_PRINTK("not a shadow fault\n");
3713 shadow_audit_tables(v);
3714 reset_early_unshadow(v);
3715 shadow_unlock(d);
3717 propagate:
3718 trace_not_shadow_fault(gw.l1e, va);
3720 return 0;
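/* A minimal sketch (not compiled as part of this file) of the log-throttling
 * idiom used in the read-only-memory path of the fault handler above: an
 * atomic xchg() of the faulting page into a static slot means we only print
 * when the page changes, so a guest spinning on one read-only page cannot
 * flood the console.  The helper name and message are illustrative only. */
#if 0
static void example_log_ro_write_once(unsigned long va)
{
    static unsigned long lastpage;
    if ( xchg(&lastpage, va & PAGE_MASK) != (va & PAGE_MASK) )
        gdprintk(XENLOG_DEBUG, "guest wrote to read-only page %#lx\n",
                 va & PAGE_MASK);
}
#endif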
3724 static int
3725 sh_invlpg(struct vcpu *v, unsigned long va)
3726 /* Called when the guest requests an invlpg. Returns 1 if the invlpg
3727 * instruction should be issued on the hardware, or 0 if it's safe not
3728 * to do so. */
3730 mfn_t sl1mfn;
3731 shadow_l2e_t sl2e;
3733 perfc_incr(shadow_invlpg);
3735 #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
3736 /* No longer safe to use cached gva->gfn translations */
3737 vtlb_flush(v);
3738 #endif
3740 #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
3741 v->arch.paging.last_write_emul_ok = 0;
3742 #endif
3744 /* First check that we can safely read the shadow l2e. On SMP/PAE
3745 * Linux, as many as 6% of invlpg calls arrive before we have shadowed
3746 * the l2 in question. */
3747 #if SHADOW_PAGING_LEVELS == 4
3749 shadow_l3e_t sl3e;
3750 if ( !(shadow_l4e_get_flags(
3751 sh_linear_l4_table(v)[shadow_l4_linear_offset(va)])
3752 & _PAGE_PRESENT) )
3753 return 0;
3754 /* This must still be a copy-from-user because we don't have the
3755 * shadow lock, and the higher-level shadows might disappear
3756 * under our feet. */
3757 if ( __copy_from_user(&sl3e, (sh_linear_l3_table(v)
3758 + shadow_l3_linear_offset(va)),
3759 sizeof (sl3e)) != 0 )
3761 perfc_incr(shadow_invlpg_fault);
3762 return 0;
3764 if ( !(shadow_l3e_get_flags(sl3e) & _PAGE_PRESENT) )
3765 return 0;
3767 #else /* SHADOW_PAGING_LEVELS == 3 */
3768 if ( !(l3e_get_flags(v->arch.paging.shadow.l3table[shadow_l3_linear_offset(va)])
3769 & _PAGE_PRESENT) )
3770 // no need to flush anything if there's no SL2...
3771 return 0;
3772 #endif
3774 /* This must still be a copy-from-user because we don't have the shadow
3775 * lock, and the higher-level shadows might disappear under our feet. */
3776 if ( __copy_from_user(&sl2e,
3777 sh_linear_l2_table(v) + shadow_l2_linear_offset(va),
3778 sizeof (sl2e)) != 0 )
3780 perfc_incr(shadow_invlpg_fault);
3781 return 0;
3784 // If there's nothing shadowed for this particular sl2e, then
3785 // there is no need to do an invlpg, either...
3786 //
3787 if ( !(shadow_l2e_get_flags(sl2e) & _PAGE_PRESENT) )
3788 return 0;
3790 // Check to see if the SL2 is a splintered superpage...
3791 // If so, then we'll need to flush the entire TLB (because that's
3792 // easier than invalidating all of the individual 4K pages).
3793 //
3794 sl1mfn = shadow_l2e_get_mfn(sl2e);
3795 if ( mfn_to_shadow_page(sl1mfn)->type
3796 == SH_type_fl1_shadow )
3798 flush_tlb_local();
3799 return 0;
3802 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
3803 /* Check to see if the SL1 is out of sync. */
3805 mfn_t gl1mfn = _mfn(mfn_to_shadow_page(sl1mfn)->backpointer);
3806 struct page_info *pg = mfn_to_page(gl1mfn);
3807 if ( mfn_valid(gl1mfn)
3808 && page_is_out_of_sync(pg) )
3810 /* The test above may give false positives, since we don't
3811 * hold the shadow lock yet. Check again with the lock held. */
3812 shadow_lock(v->domain);
3814 /* This must still be a copy-from-user because we didn't
3815 * have the shadow lock last time we checked, and the
3816 * higher-level shadows might have disappeared under our
3817 * feet. */
3818 if ( __copy_from_user(&sl2e,
3819 sh_linear_l2_table(v)
3820 + shadow_l2_linear_offset(va),
3821 sizeof (sl2e)) != 0 )
3823 perfc_incr(shadow_invlpg_fault);
3824 shadow_unlock(v->domain);
3825 return 0;
3828 if ( !(shadow_l2e_get_flags(sl2e) & _PAGE_PRESENT) )
3830 shadow_unlock(v->domain);
3831 return 0;
3834 sl1mfn = shadow_l2e_get_mfn(sl2e);
3835 gl1mfn = _mfn(mfn_to_shadow_page(sl1mfn)->backpointer);
3836 pg = mfn_to_page(gl1mfn);
3838 if ( likely(sh_mfn_is_a_page_table(gl1mfn)
3839 && page_is_out_of_sync(pg) ) )
3841 shadow_l1e_t *sl1;
3842 sl1 = sh_linear_l1_table(v) + shadow_l1_linear_offset(va);
3843 /* Remove the shadow entry that maps this VA */
3844 (void) shadow_set_l1e(v, sl1, shadow_l1e_empty(), sl1mfn);
3846 shadow_unlock(v->domain);
3847 /* Need the invlpg, to pick up the disappearance of the sl1e */
3848 return 1;
3851 #endif
3853 return 1;
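/* A minimal sketch of how the return value above is intended to be used: a
 * (hypothetical) caller in the generic paging layer only issues the real
 * invlpg when sh_invlpg() reports that the hardware TLB may still hold a
 * stale translation for va. */
#if 0
static void example_guest_invlpg(struct vcpu *v, unsigned long va)
{
    if ( v->arch.paging.mode->invlpg(v, va) )
        flush_tlb_one_local(va);
}
#endif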
3857 static unsigned long
3858 sh_gva_to_gfn(struct vcpu *v, unsigned long va, uint32_t *pfec)
3859 /* Called to translate a guest virtual address to what the *guest*
3860 * pagetables would map it to. */
3862 walk_t gw;
3863 gfn_t gfn;
3865 #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
3866 /* Check the vTLB cache first */
3867 unsigned long vtlb_gfn = vtlb_lookup(v, va, pfec[0]);
3868 if ( VALID_GFN(vtlb_gfn) )
3869 return vtlb_gfn;
3870 #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */
3872 if ( guest_walk_tables(v, va, &gw, pfec[0]) != 0 )
3874 if ( !(guest_l1e_get_flags(gw.l1e) & _PAGE_PRESENT) )
3875 pfec[0] &= ~PFEC_page_present;
3876 return INVALID_GFN;
3878 gfn = guest_walk_to_gfn(&gw);
3880 #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
3881 /* Remember this successful VA->GFN translation for later. */
3882 vtlb_insert(v, va >> PAGE_SHIFT, gfn_x(gfn), pfec[0]);
3883 #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */
3885 return gfn_x(gfn);
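/* A minimal sketch of the pfec calling convention: the caller passes in the
 * access type it is probing for and, on failure, gets back an error code it
 * can inject directly.  The helper is illustrative; the real users are the
 * emulation paths below (see emulate_gva_to_mfn()). */
#if 0
static unsigned long example_translate_for_write(struct vcpu *v,
                                                 unsigned long va)
{
    uint32_t pfec = PFEC_page_present | PFEC_write_access;
    unsigned long gfn = v->arch.paging.mode->gva_to_gfn(v, va, &pfec);
    if ( gfn == INVALID_GFN && is_hvm_vcpu(v) )
        hvm_inject_exception(TRAP_page_fault, pfec, va);
    return gfn;
}
#endif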
3889 static inline void
3890 sh_update_linear_entries(struct vcpu *v)
3891 /* Sync up all the linear mappings for this vcpu's pagetables */
3893 struct domain *d = v->domain;
3895 /* Linear pagetables in PV guests
3896 * ------------------------------
3898 * Guest linear pagetables, which map the guest pages, are at
3899 * LINEAR_PT_VIRT_START. Shadow linear pagetables, which map the
3900 * shadows, are at SH_LINEAR_PT_VIRT_START. Most of the time these
3901 * are set up at shadow creation time, but (of course!) the PAE case
3902 * is subtler. Normal linear mappings are made by having an entry
3903 * in the top-level table that points to itself (shadow linear) or
3904 * to the guest top-level table (guest linear). For PAE, to set up
3905 * a linear map requires us to copy the four top-level entries into
3906 * level-2 entries. That means that every time we change a PAE l3e,
3907 * we need to reflect the change into the copy.
3909 * Linear pagetables in HVM guests
3910 * -------------------------------
3912 * For HVM guests, the linear pagetables are installed in the monitor
3913 * tables (since we can't put them in the shadow). Shadow linear
3914 * pagetables, which map the shadows, are at SH_LINEAR_PT_VIRT_START,
3915 * and we use the linear pagetable slot at LINEAR_PT_VIRT_START for
3916 * a linear pagetable of the monitor tables themselves. We have
3917 * the same issue of having to re-copy PAE l3 entries whenever we use
3918 * PAE shadows.
3920 * Because HVM guests run on the same monitor tables regardless of the
3921 * shadow tables in use, the linear mapping of the shadow tables has to
3922 * be updated every time v->arch.shadow_table changes.
3923 */
3925 /* Don't try to update the monitor table if it doesn't exist */
3926 if ( shadow_mode_external(d)
3927 && pagetable_get_pfn(v->arch.monitor_table) == 0 )
3928 return;
3930 #if (CONFIG_PAGING_LEVELS == 4) && (SHADOW_PAGING_LEVELS == 4)
3932 /* For PV, one l4e points at the guest l4, one points at the shadow
3933 * l4. No maintenance required.
3934 * For HVM, just need to update the l4e that points to the shadow l4. */
3936 if ( shadow_mode_external(d) )
3938 /* Use the linear map if we can; otherwise make a new mapping */
3939 if ( v == current )
3941 __linear_l4_table[l4_linear_offset(SH_LINEAR_PT_VIRT_START)] =
3942 l4e_from_pfn(pagetable_get_pfn(v->arch.shadow_table[0]),
3943 __PAGE_HYPERVISOR);
3945 else
3947 l4_pgentry_t *ml4e;
3948 ml4e = sh_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
3949 ml4e[l4_table_offset(SH_LINEAR_PT_VIRT_START)] =
3950 l4e_from_pfn(pagetable_get_pfn(v->arch.shadow_table[0]),
3951 __PAGE_HYPERVISOR);
3952 sh_unmap_domain_page(ml4e);
3956 #elif (CONFIG_PAGING_LEVELS == 4) && (SHADOW_PAGING_LEVELS == 3)
3958 /* PV: XXX
3960 * HVM: To give ourselves a linear map of the shadows, we need to
3961 * extend a PAE shadow to 4 levels. We do this by having a monitor
3962 * l3 in slot 0 of the monitor l4 table, and copying the PAE l3
3963 * entries into it. Then, by having the monitor l4e for shadow
3964 * pagetables also point to the monitor l4, we can use it to access
3965 * the shadows.
3966 */
3968 if ( shadow_mode_external(d) )
3970 /* Install copies of the shadow l3es into the monitor l2 table
3971 * that maps SH_LINEAR_PT_VIRT_START. */
3972 shadow_l3e_t *sl3e;
3973 l2_pgentry_t *ml2e;
3974 int i;
3976 /* Use linear mappings if we can; otherwise make new mappings */
3977 if ( v == current )
3978 ml2e = __linear_l2_table
3979 + l2_linear_offset(SH_LINEAR_PT_VIRT_START);
3980 else
3982 mfn_t l3mfn, l2mfn;
3983 l4_pgentry_t *ml4e;
3984 l3_pgentry_t *ml3e;
3985 int linear_slot = shadow_l4_table_offset(SH_LINEAR_PT_VIRT_START);
3986 ml4e = sh_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
3988 ASSERT(l4e_get_flags(ml4e[linear_slot]) & _PAGE_PRESENT);
3989 l3mfn = _mfn(l4e_get_pfn(ml4e[linear_slot]));
3990 ml3e = sh_map_domain_page(l3mfn);
3991 sh_unmap_domain_page(ml4e);
3993 ASSERT(l3e_get_flags(ml3e[0]) & _PAGE_PRESENT);
3994 l2mfn = _mfn(l3e_get_pfn(ml3e[0]));
3995 ml2e = sh_map_domain_page(l2mfn);
3996 sh_unmap_domain_page(ml3e);
3999 /* Shadow l3 tables are made up by sh_update_cr3 */
4000 sl3e = v->arch.paging.shadow.l3table;
4002 for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
4004 ml2e[i] =
4005 (shadow_l3e_get_flags(sl3e[i]) & _PAGE_PRESENT)
4006 ? l2e_from_pfn(mfn_x(shadow_l3e_get_mfn(sl3e[i])),
4007 __PAGE_HYPERVISOR)
4008 : l2e_empty();
4011 if ( v != current )
4012 sh_unmap_domain_page(ml2e);
4014 else
4015 domain_crash(d); /* XXX */
4017 #elif CONFIG_PAGING_LEVELS == 3
4019 /* PV: need to copy the guest's l3 entries into the guest-linear-map l2
4020 * entries in the shadow, and the shadow's l3 entries into the
4021 * shadow-linear-map l2 entries in the shadow. This is safe to do
4022 * because Xen does not let guests share high-slot l2 tables between l3s,
4023 * so we know we're not treading on anyone's toes.
4025 * HVM: need to copy the shadow's l3 entries into the
4026 * shadow-linear-map l2 entries in the monitor table. This is safe
4027 * because we have one monitor table for each vcpu. The monitor's
4028 * own l3es don't need to be copied because they never change.
4029 * XXX That might change if we start stuffing things into the rest
4030 * of the monitor's virtual address space.
4031 */
4033 l2_pgentry_t *l2e, new_l2e;
4034 shadow_l3e_t *guest_l3e = NULL, *shadow_l3e;
4035 int i;
4036 int unmap_l2e = 0;
4038 #if GUEST_PAGING_LEVELS == 2
4040 /* Shadow l3 tables were built by sh_update_cr3 */
4041 BUG_ON(!shadow_mode_external(d)); /* PV 2-on-3 is unsupported */
4042 shadow_l3e = (shadow_l3e_t *)&v->arch.paging.shadow.l3table;
4044 #else /* GUEST_PAGING_LEVELS == 3 */
4046 shadow_l3e = (shadow_l3e_t *)&v->arch.paging.shadow.l3table;
4047 guest_l3e = (guest_l3e_t *)&v->arch.paging.shadow.gl3e;
4049 #endif /* GUEST_PAGING_LEVELS */
4051 /* Choose where to write the entries, using linear maps if possible */
4052 if ( shadow_mode_external(d) )
4054 if ( v == current )
4056 /* From the monitor tables, it's safe to use linear maps
4057 * to update monitor l2s */
4058 l2e = __linear_l2_table + (3 * L2_PAGETABLE_ENTRIES);
4060 else
4062 /* Map the monitor table's high l2 */
4063 l3_pgentry_t *l3e;
4064 l3e = sh_map_domain_page(
4065 pagetable_get_mfn(v->arch.monitor_table));
4066 ASSERT(l3e_get_flags(l3e[3]) & _PAGE_PRESENT);
4067 l2e = sh_map_domain_page(_mfn(l3e_get_pfn(l3e[3])));
4068 unmap_l2e = 1;
4069 sh_unmap_domain_page(l3e);
4072 else
4074 /* Map the shadow table's high l2 */
4075 ASSERT(shadow_l3e_get_flags(shadow_l3e[3]) & _PAGE_PRESENT);
4076 l2e = sh_map_domain_page(shadow_l3e_get_mfn(shadow_l3e[3]));
4077 unmap_l2e = 1;
4080 /* Write linear mapping of guest (only in PV, and only when
4081 * not translated). */
4082 if ( !shadow_mode_translate(d) )
4084 for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
4086 new_l2e =
4087 ((shadow_l3e_get_flags(guest_l3e[i]) & _PAGE_PRESENT)
4088 ? l2e_from_pfn(mfn_x(shadow_l3e_get_mfn(guest_l3e[i])),
4089 __PAGE_HYPERVISOR)
4090 : l2e_empty());
4091 safe_write_entry(
4092 &l2e[l2_table_offset(LINEAR_PT_VIRT_START) + i],
4093 &new_l2e);
4097 /* Write linear mapping of shadow. */
4098 for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
4100 new_l2e = (shadow_l3e_get_flags(shadow_l3e[i]) & _PAGE_PRESENT)
4101 ? l2e_from_pfn(mfn_x(shadow_l3e_get_mfn(shadow_l3e[i])),
4102 __PAGE_HYPERVISOR)
4103 : l2e_empty();
4104 safe_write_entry(
4105 &l2e[l2_table_offset(SH_LINEAR_PT_VIRT_START) + i],
4106 &new_l2e);
4109 if ( unmap_l2e )
4110 sh_unmap_domain_page(l2e);
4113 #else
4114 #error this should not happen
4115 #endif
4117 if ( shadow_mode_external(d) )
4119 /*
4120 * Having modified the linear pagetable mapping, flush local host TLBs.
4121 * This was not needed when vmenter/vmexit always had the side effect
4122 * of flushing host TLBs but, with ASIDs, it is possible to finish
4123 * this CR3 update, vmenter the guest, vmexit due to a page fault,
4124 * without an intervening host TLB flush. Then the page fault code
4125 * could use the linear pagetable to read a top-level shadow page
4126 * table entry. But, without this change, it would fetch the wrong
4127 * value due to a stale TLB.
4128 */
4129 flush_tlb_local();
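/* A minimal sketch (not compiled) of what the linear maps maintained above
 * buy us: once the top-level slot covering SH_LINEAR_PT_VIRT_START points
 * back at the current shadow top level, a hardware walk of an address in
 * that window resolves one level "early", so the shadow l1e that maps a
 * given va is itself visible at a fixed virtual address and can be read
 * without any map_domain_page() call. */
#if 0
static shadow_l1e_t example_read_sl1e_via_linear_map(struct vcpu *v,
                                                     unsigned long va)
{
    return sh_linear_l1_table(v)[shadow_l1_linear_offset(va)];
}
#endif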
4134 /* Removes vcpu->arch.paging.shadow.guest_vtable and vcpu->arch.shadow_table[].
4135 * Does all appropriate management/bookkeeping/refcounting/etc...
4136 */
4137 static void
4138 sh_detach_old_tables(struct vcpu *v)
4140 mfn_t smfn;
4141 int i = 0;
4143 ////
4144 //// vcpu->arch.paging.shadow.guest_vtable
4145 ////
4147 #if GUEST_PAGING_LEVELS == 3
4148 /* PAE guests don't have a mapping of the guest top-level table */
4149 ASSERT(v->arch.paging.shadow.guest_vtable == NULL);
4150 #else
4151 if ( v->arch.paging.shadow.guest_vtable )
4153 struct domain *d = v->domain;
4154 if ( shadow_mode_external(d) || shadow_mode_translate(d) )
4155 sh_unmap_domain_page_global(v->arch.paging.shadow.guest_vtable);
4156 v->arch.paging.shadow.guest_vtable = NULL;
4158 #endif // GUEST_PAGING_LEVELS == 3
4161 ////
4162 //// vcpu->arch.shadow_table[]
4163 ////
4165 #if GUEST_PAGING_LEVELS == 3
4166 /* PAE guests have four shadow_table entries */
4167 for ( i = 0 ; i < 4 ; i++ )
4168 #endif
4170 smfn = pagetable_get_mfn(v->arch.shadow_table[i]);
4171 if ( mfn_x(smfn) )
4172 sh_put_ref(v, smfn, 0);
4173 v->arch.shadow_table[i] = pagetable_null();
4177 /* Set up the top-level shadow and install it in slot 'slot' of shadow_table */
4178 static void
4179 sh_set_toplevel_shadow(struct vcpu *v,
4180 int slot,
4181 mfn_t gmfn,
4182 unsigned int root_type)
4184 mfn_t smfn;
4185 pagetable_t old_entry, new_entry;
4187 struct domain *d = v->domain;
4189 /* Remember the old contents of this slot */
4190 old_entry = v->arch.shadow_table[slot];
4192 /* Now figure out the new contents: is this a valid guest MFN? */
4193 if ( !mfn_valid(gmfn) )
4195 new_entry = pagetable_null();
4196 goto install_new_entry;
4199 /* Guest mfn is valid: shadow it and install the shadow */
4200 smfn = get_shadow_status(v, gmfn, root_type);
4201 if ( !mfn_valid(smfn) )
4203 /* Make sure there's enough free shadow memory. */
4204 shadow_prealloc(d, root_type, 1);
4205 /* Shadow the page. */
4206 smfn = sh_make_shadow(v, gmfn, root_type);
4208 ASSERT(mfn_valid(smfn));
4210 /* Pin the shadow and put it (back) on the list of pinned shadows */
4211 if ( sh_pin(v, smfn) == 0 )
4213 SHADOW_ERROR("can't pin %#lx as toplevel shadow\n", mfn_x(smfn));
4214 domain_crash(v->domain);
4217 /* Take a ref to this page: it will be released in sh_detach_old_tables()
4218 * or the next call to set_toplevel_shadow() */
4219 if ( !sh_get_ref(v, smfn, 0) )
4221 SHADOW_ERROR("can't install %#lx as toplevel shadow\n", mfn_x(smfn));
4222 domain_crash(v->domain);
4225 new_entry = pagetable_from_mfn(smfn);
4227 install_new_entry:
4228 /* Done. Install it */
4229 SHADOW_PRINTK("%u/%u [%u] gmfn %#"PRI_mfn" smfn %#"PRI_mfn"\n",
4230 GUEST_PAGING_LEVELS, SHADOW_PAGING_LEVELS, slot,
4231 mfn_x(gmfn), mfn_x(pagetable_get_mfn(new_entry)));
4232 v->arch.shadow_table[slot] = new_entry;
4234 /* Decrement the refcount of the old contents of this slot */
4235 if ( !pagetable_is_null(old_entry) ) {
4236 mfn_t old_smfn = pagetable_get_mfn(old_entry);
4237 /* Need to repin the old toplevel shadow if it's been unpinned
4238 * by shadow_prealloc(): in PV mode we're still running on this
4239 * shadow and it's not safe to free it yet. */
4240 if ( !mfn_to_shadow_page(old_smfn)->pinned && !sh_pin(v, old_smfn) )
4242 SHADOW_ERROR("can't re-pin %#lx\n", mfn_x(old_smfn));
4243 domain_crash(v->domain);
4245 sh_put_ref(v, old_smfn, 0);
4250 static void
4251 sh_update_cr3(struct vcpu *v, int do_locking)
4252 /* Updates vcpu->arch.cr3 after the guest has changed CR3.
4253 * Paravirtual guests should set v->arch.guest_table (and guest_table_user,
4254 * if appropriate).
4255 * HVM guests should also make sure hvm_get_guest_cntl_reg(v, 3) works;
4256 * this function will call hvm_update_guest_cr(v, 3) to tell them where the
4257 * shadow tables are.
4258 * If do_locking != 0, assume we are being called from outside the
4259 * shadow code, and must take and release the shadow lock; otherwise
4260 * that is the caller's responsibility.
4261 */
4263 struct domain *d = v->domain;
4264 mfn_t gmfn;
4265 #if GUEST_PAGING_LEVELS == 3
4266 guest_l3e_t *gl3e;
4267 u32 guest_idx=0;
4268 int i;
4269 #endif
4271 /* Don't do anything on an uninitialised vcpu */
4272 if ( !is_hvm_domain(d) && !v->is_initialised )
4274 ASSERT(v->arch.cr3 == 0);
4275 return;
4278 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
4279 /* Need to resync all the shadow entries on a TLB flush. Resync
4280 * the current vcpu's OOS pages before switching to the new shadow
4281 * tables so that the VA hint is still valid. */
4282 shadow_resync_current_vcpu(v, do_locking);
4283 #endif
4285 if ( do_locking ) shadow_lock(v->domain);
4287 ASSERT(shadow_locked_by_me(v->domain));
4288 ASSERT(v->arch.paging.mode);
4290 ////
4291 //// vcpu->arch.guest_table is already set
4292 ////
4294 #ifndef NDEBUG
4295 /* Double-check that the HVM code has sent us a sane guest_table */
4296 if ( is_hvm_domain(d) )
4298 ASSERT(shadow_mode_external(d));
4299 if ( hvm_paging_enabled(v) )
4300 ASSERT(pagetable_get_pfn(v->arch.guest_table));
4301 else
4302 ASSERT(v->arch.guest_table.pfn
4303 == d->arch.paging.shadow.unpaged_pagetable.pfn);
4305 #endif
4307 SHADOW_PRINTK("d=%u v=%u guest_table=%05lx\n",
4308 d->domain_id, v->vcpu_id,
4309 (unsigned long)pagetable_get_pfn(v->arch.guest_table));
4311 #if GUEST_PAGING_LEVELS == 4
4312 if ( !(v->arch.flags & TF_kernel_mode) && !is_pv_32on64_vcpu(v) )
4313 gmfn = pagetable_get_mfn(v->arch.guest_table_user);
4314 else
4315 #endif
4316 gmfn = pagetable_get_mfn(v->arch.guest_table);
4319 ////
4320 //// vcpu->arch.paging.shadow.guest_vtable
4321 ////
4322 #if GUEST_PAGING_LEVELS == 4
4323 if ( shadow_mode_external(d) || shadow_mode_translate(d) )
4325 if ( v->arch.paging.shadow.guest_vtable )
4326 sh_unmap_domain_page_global(v->arch.paging.shadow.guest_vtable);
4327 v->arch.paging.shadow.guest_vtable = sh_map_domain_page_global(gmfn);
4328 /* PAGING_LEVELS==4 implies 64-bit, which means that
4329 * map_domain_page_global can't fail */
4330 BUG_ON(v->arch.paging.shadow.guest_vtable == NULL);
4332 else
4333 v->arch.paging.shadow.guest_vtable = __linear_l4_table;
4334 #elif GUEST_PAGING_LEVELS == 3
4335 /* On PAE guests we don't use a mapping of the guest's own top-level
4336 * table. We cache the current state of that table and shadow that,
4337 * until the next CR3 write makes us refresh our cache. */
4338 ASSERT(v->arch.paging.shadow.guest_vtable == NULL);
4340 if ( shadow_mode_external(d) )
4341 /* Find where in the page the l3 table is */
4342 guest_idx = guest_index((void *)v->arch.hvm_vcpu.guest_cr[3]);
4343 else
4344 /* PV guest: l3 is at the start of a page */
4345 guest_idx = 0;
4347 // Ignore the low 2 bits of guest_idx -- they are really just
4348 // cache control.
4349 guest_idx &= ~3;
4351 gl3e = ((guest_l3e_t *)sh_map_domain_page(gmfn)) + guest_idx;
4352 for ( i = 0; i < 4 ; i++ )
4353 v->arch.paging.shadow.gl3e[i] = gl3e[i];
4354 sh_unmap_domain_page(gl3e);
4355 #elif GUEST_PAGING_LEVELS == 2
4356 if ( shadow_mode_external(d) || shadow_mode_translate(d) )
4358 if ( v->arch.paging.shadow.guest_vtable )
4359 sh_unmap_domain_page_global(v->arch.paging.shadow.guest_vtable);
4360 v->arch.paging.shadow.guest_vtable = sh_map_domain_page_global(gmfn);
4361 /* Does this really need map_domain_page_global? Handle the
4362 * error properly if so. */
4363 BUG_ON(v->arch.paging.shadow.guest_vtable == NULL); /* XXX */
4365 else
4366 v->arch.paging.shadow.guest_vtable = __linear_l2_table;
4367 #else
4368 #error this should never happen
4369 #endif
4372 ////
4373 //// vcpu->arch.shadow_table[]
4374 ////
4376 /* We revoke write access to the new guest toplevel page(s) before we
4377 * replace the old shadow pagetable(s), so that we can safely use the
4378 * (old) shadow linear maps in the writeable mapping heuristics. */
4379 #if GUEST_PAGING_LEVELS == 2
4380 if ( sh_remove_write_access(v, gmfn, 2, 0) != 0 )
4381 flush_tlb_mask(v->domain->domain_dirty_cpumask);
4382 sh_set_toplevel_shadow(v, 0, gmfn, SH_type_l2_shadow);
4383 #elif GUEST_PAGING_LEVELS == 3
4384 /* PAE guests have four shadow_table entries, based on the
4385 * current values of the guest's four l3es. */
4387 int flush = 0;
4388 gfn_t gl2gfn;
4389 mfn_t gl2mfn;
4390 p2m_type_t p2mt;
4391 guest_l3e_t *gl3e = (guest_l3e_t*)&v->arch.paging.shadow.gl3e;
4392 /* First, make all four entries read-only. */
4393 for ( i = 0; i < 4; i++ )
4395 if ( guest_l3e_get_flags(gl3e[i]) & _PAGE_PRESENT )
4397 gl2gfn = guest_l3e_get_gfn(gl3e[i]);
4398 gl2mfn = gfn_to_mfn(d, gl2gfn, &p2mt);
4399 if ( p2m_is_ram(p2mt) )
4400 flush |= sh_remove_write_access(v, gl2mfn, 2, 0);
4403 if ( flush )
4404 flush_tlb_mask(v->domain->domain_dirty_cpumask);
4405 /* Now install the new shadows. */
4406 for ( i = 0; i < 4; i++ )
4408 if ( guest_l3e_get_flags(gl3e[i]) & _PAGE_PRESENT )
4410 gl2gfn = guest_l3e_get_gfn(gl3e[i]);
4411 gl2mfn = gfn_to_mfn(d, gl2gfn, &p2mt);
4412 if ( p2m_is_ram(p2mt) )
4413 sh_set_toplevel_shadow(v, i, gl2mfn, (i == 3)
4414 ? SH_type_l2h_shadow
4415 : SH_type_l2_shadow);
4416 else
4417 sh_set_toplevel_shadow(v, i, _mfn(INVALID_MFN), 0);
4419 else
4420 sh_set_toplevel_shadow(v, i, _mfn(INVALID_MFN), 0);
4423 #elif GUEST_PAGING_LEVELS == 4
4424 if ( sh_remove_write_access(v, gmfn, 4, 0) != 0 )
4425 flush_tlb_mask(v->domain->domain_dirty_cpumask);
4426 sh_set_toplevel_shadow(v, 0, gmfn, SH_type_l4_shadow);
4427 #else
4428 #error This should never happen
4429 #endif
4432 ///
4433 /// v->arch.paging.shadow.l3table
4434 ///
4435 #if SHADOW_PAGING_LEVELS == 3
4437 mfn_t smfn;
4438 int i;
4439 for ( i = 0; i < 4; i++ )
4441 #if GUEST_PAGING_LEVELS == 2
4442 /* 2-on-3: make a PAE l3 that points at the four-page l2 */
4443 smfn = _mfn(pagetable_get_pfn(v->arch.shadow_table[0]) + i);
4444 #else
4445 /* 3-on-3: make a PAE l3 that points at the four l2 pages */
4446 smfn = pagetable_get_mfn(v->arch.shadow_table[i]);
4447 #endif
4448 v->arch.paging.shadow.l3table[i] =
4449 (mfn_x(smfn) == 0)
4450 ? shadow_l3e_empty()
4451 : shadow_l3e_from_mfn(smfn, _PAGE_PRESENT);
4454 #endif /* SHADOW_PAGING_LEVELS == 3 */
4457 ///
4458 /// v->arch.cr3
4459 ///
4460 if ( shadow_mode_external(d) )
4462 make_cr3(v, pagetable_get_pfn(v->arch.monitor_table));
4464 else // not shadow_mode_external...
4466 /* We don't support PV except guest == shadow == config levels */
4467 BUG_ON(GUEST_PAGING_LEVELS != SHADOW_PAGING_LEVELS);
4468 #if SHADOW_PAGING_LEVELS == 3
4469 /* 2-on-3 or 3-on-3: Use the PAE shadow l3 table we just fabricated.
4470 * Don't use make_cr3 because (a) we know it's below 4GB, and
4471 * (b) it's not necessarily page-aligned, and make_cr3 takes a pfn */
4472 ASSERT(virt_to_maddr(&v->arch.paging.shadow.l3table) <= 0xffffffe0ULL);
4473 v->arch.cr3 = virt_to_maddr(&v->arch.paging.shadow.l3table);
4474 #else
4475 /* 4-on-4: Just use the shadow top-level directly */
4476 make_cr3(v, pagetable_get_pfn(v->arch.shadow_table[0]));
4477 #endif
4481 ///
4482 /// v->arch.hvm_vcpu.hw_cr[3]
4483 ///
4484 if ( shadow_mode_external(d) )
4486 ASSERT(is_hvm_domain(d));
4487 #if SHADOW_PAGING_LEVELS == 3
4488 /* 2-on-3 or 3-on-3: Use the PAE shadow l3 table we just fabricated */
4489 v->arch.hvm_vcpu.hw_cr[3] =
4490 virt_to_maddr(&v->arch.paging.shadow.l3table);
4491 #else
4492 /* 4-on-4: Just use the shadow top-level directly */
4493 v->arch.hvm_vcpu.hw_cr[3] =
4494 pagetable_get_paddr(v->arch.shadow_table[0]);
4495 #endif
4496 hvm_update_guest_cr(v, 3);
4499 /* Fix up the linear pagetable mappings */
4500 sh_update_linear_entries(v);
4502 #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
4503 /* No longer safe to use cached gva->gfn translations */
4504 vtlb_flush(v);
4505 #endif
4507 #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
4508 v->arch.paging.last_write_emul_ok = 0;
4509 #endif
4511 /* Release the lock, if we took it (otherwise it's the caller's problem) */
4512 if ( do_locking ) shadow_unlock(v->domain);
4514 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
4515 /* Need to resync all the shadow entries on a TLB flush. We only
4516 * update the shadows, leaving the pages out of sync. Also, we try
4517 * to skip synchronization of shadows not mapped in the new
4518 * tables. */
4519 shadow_sync_other_vcpus(v, do_locking);
4520 #endif
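/* A minimal sketch (hypothetical PV-side caller, not part of this file) of
 * the contract documented above: the new guest top level is installed in
 * v->arch.guest_table first, and only then is update_cr3 invoked, with
 * do_locking=1 because we are calling in from outside the shadow code. */
#if 0
static void example_pv_switch_cr3(struct vcpu *v, mfn_t new_top_gmfn)
{
    v->arch.guest_table = pagetable_from_mfn(new_top_gmfn);
    v->arch.paging.mode->update_cr3(v, 1 /* take the shadow lock */);
}
#endif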
4525 /**************************************************************************/
4526 /* Functions to revoke guest rights */
4528 #if SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC
4529 int sh_rm_write_access_from_sl1p(struct vcpu *v, mfn_t gmfn,
4530 mfn_t smfn, unsigned long off)
4532 int r;
4533 shadow_l1e_t *sl1p, sl1e;
4534 struct shadow_page_info *sp;
4536 ASSERT(mfn_valid(gmfn));
4537 ASSERT(mfn_valid(smfn));
4539 sp = mfn_to_shadow_page(smfn);
4541 if ( sp->mbz != 0
4542 || (sp->type != SH_type_l1_shadow) )
4543 goto fail;
4545 sl1p = sh_map_domain_page(smfn);
4546 sl1p += off;
4547 sl1e = *sl1p;
4548 if ( ((shadow_l1e_get_flags(sl1e) & (_PAGE_PRESENT|_PAGE_RW))
4549 != (_PAGE_PRESENT|_PAGE_RW))
4550 || (mfn_x(shadow_l1e_get_mfn(sl1e)) != mfn_x(gmfn)) )
4552 sh_unmap_domain_page(sl1p);
4553 goto fail;
4556 /* Found it! Need to remove its write permissions. */
4557 sl1e = shadow_l1e_remove_flags(sl1e, _PAGE_RW);
4558 r = shadow_set_l1e(v, sl1p, sl1e, smfn);
4559 ASSERT( !(r & SHADOW_SET_ERROR) );
4561 sh_unmap_domain_page(sl1p);
4562 perfc_incr(shadow_writeable_h_7);
4563 return 1;
4565 fail:
4566 perfc_incr(shadow_writeable_h_8);
4567 return 0;
4569 #endif /* OOS */
4571 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
4572 static int sh_guess_wrmap(struct vcpu *v, unsigned long vaddr, mfn_t gmfn)
4573 /* Look up this vaddr in the current shadow and see if it's a writeable
4574 * mapping of this gmfn. If so, remove it. Returns 1 if it worked. */
4576 shadow_l1e_t sl1e, *sl1p;
4577 shadow_l2e_t *sl2p;
4578 shadow_l3e_t *sl3p;
4579 #if SHADOW_PAGING_LEVELS >= 4
4580 shadow_l4e_t *sl4p;
4581 #endif
4582 mfn_t sl1mfn;
4583 int r;
4585 /* Carefully look in the shadow linear map for the l1e we expect */
4586 #if SHADOW_PAGING_LEVELS >= 4
4587 sl4p = sh_linear_l4_table(v) + shadow_l4_linear_offset(vaddr);
4588 if ( !(shadow_l4e_get_flags(*sl4p) & _PAGE_PRESENT) )
4589 return 0;
4590 sl3p = sh_linear_l3_table(v) + shadow_l3_linear_offset(vaddr);
4591 if ( !(shadow_l3e_get_flags(*sl3p) & _PAGE_PRESENT) )
4592 return 0;
4593 #else /* SHADOW_PAGING_LEVELS == 3 */
4594 sl3p = ((shadow_l3e_t *) v->arch.paging.shadow.l3table)
4595 + shadow_l3_linear_offset(vaddr);
4596 if ( !(shadow_l3e_get_flags(*sl3p) & _PAGE_PRESENT) )
4597 return 0;
4598 #endif
4599 sl2p = sh_linear_l2_table(v) + shadow_l2_linear_offset(vaddr);
4600 if ( !(shadow_l2e_get_flags(*sl2p) & _PAGE_PRESENT) )
4601 return 0;
4602 sl1p = sh_linear_l1_table(v) + shadow_l1_linear_offset(vaddr);
4603 sl1e = *sl1p;
4604 if ( ((shadow_l1e_get_flags(sl1e) & (_PAGE_PRESENT|_PAGE_RW))
4605 != (_PAGE_PRESENT|_PAGE_RW))
4606 || (mfn_x(shadow_l1e_get_mfn(sl1e)) != mfn_x(gmfn)) )
4607 return 0;
4609 /* Found it! Need to remove its write permissions. */
4610 sl1mfn = shadow_l2e_get_mfn(*sl2p);
4611 sl1e = shadow_l1e_remove_flags(sl1e, _PAGE_RW);
4612 r = shadow_set_l1e(v, sl1p, sl1e, sl1mfn);
4613 ASSERT( !(r & SHADOW_SET_ERROR) );
4614 TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_WRMAP_GUESS_FOUND);
4615 return 1;
4617 #endif
4619 int sh_rm_write_access_from_l1(struct vcpu *v, mfn_t sl1mfn,
4620 mfn_t readonly_mfn)
4621 /* Excises all writeable mappings to readonly_mfn from this l1 shadow table */
4623 shadow_l1e_t *sl1e;
4624 int done = 0;
4625 int flags;
4626 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
4627 mfn_t base_sl1mfn = sl1mfn; /* Because sl1mfn changes in the foreach */
4628 #endif
4630 SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, done,
4632 flags = shadow_l1e_get_flags(*sl1e);
4633 if ( (flags & _PAGE_PRESENT)
4634 && (flags & _PAGE_RW)
4635 && (mfn_x(shadow_l1e_get_mfn(*sl1e)) == mfn_x(readonly_mfn)) )
4637 shadow_l1e_t ro_sl1e = shadow_l1e_remove_flags(*sl1e, _PAGE_RW);
4638 (void) shadow_set_l1e(v, sl1e, ro_sl1e, sl1mfn);
4639 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
4640 /* Remember the last shadow that we shot a writeable mapping in */
4641 v->arch.paging.shadow.last_writeable_pte_smfn = mfn_x(base_sl1mfn);
4642 #endif
4643 if ( (mfn_to_page(readonly_mfn)->u.inuse.type_info
4644 & PGT_count_mask) == 0 )
4645 /* This breaks us cleanly out of the FOREACH macro */
4646 done = 1;
4648 });
4649 return done;
4653 int sh_rm_mappings_from_l1(struct vcpu *v, mfn_t sl1mfn, mfn_t target_mfn)
4654 /* Excises all mappings to guest frame from this shadow l1 table */
4656 shadow_l1e_t *sl1e;
4657 int done = 0;
4658 int flags;
4660 SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, done,
4662 flags = shadow_l1e_get_flags(*sl1e);
4663 if ( (flags & _PAGE_PRESENT)
4664 && (mfn_x(shadow_l1e_get_mfn(*sl1e)) == mfn_x(target_mfn)) )
4666 (void) shadow_set_l1e(v, sl1e, shadow_l1e_empty(), sl1mfn);
4667 if ( (mfn_to_page(target_mfn)->count_info & PGC_count_mask) == 0 )
4668 /* This breaks us cleanly out of the FOREACH macro */
4669 done = 1;
4671 });
4672 return done;
4675 /**************************************************************************/
4676 /* Functions to excise all pointers to shadows from higher-level shadows. */
4678 void sh_clear_shadow_entry(struct vcpu *v, void *ep, mfn_t smfn)
4679 /* Blank out a single shadow entry */
4681 switch ( mfn_to_shadow_page(smfn)->type )
4683 case SH_type_l1_shadow:
4684 (void) shadow_set_l1e(v, ep, shadow_l1e_empty(), smfn); break;
4685 case SH_type_l2_shadow:
4686 #if GUEST_PAGING_LEVELS >= 3
4687 case SH_type_l2h_shadow:
4688 #endif
4689 (void) shadow_set_l2e(v, ep, shadow_l2e_empty(), smfn); break;
4690 #if GUEST_PAGING_LEVELS >= 4
4691 case SH_type_l3_shadow:
4692 (void) shadow_set_l3e(v, ep, shadow_l3e_empty(), smfn); break;
4693 case SH_type_l4_shadow:
4694 (void) shadow_set_l4e(v, ep, shadow_l4e_empty(), smfn); break;
4695 #endif
4696 default: BUG(); /* Called with the wrong kind of shadow. */
4700 int sh_remove_l1_shadow(struct vcpu *v, mfn_t sl2mfn, mfn_t sl1mfn)
4701 /* Remove all mappings of this l1 shadow from this l2 shadow */
4703 shadow_l2e_t *sl2e;
4704 int done = 0;
4705 int flags;
4707 SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, done, v->domain,
4709 flags = shadow_l2e_get_flags(*sl2e);
4710 if ( (flags & _PAGE_PRESENT)
4711 && (mfn_x(shadow_l2e_get_mfn(*sl2e)) == mfn_x(sl1mfn)) )
4713 (void) shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn);
4714 if ( mfn_to_shadow_page(sl1mfn)->type == 0 )
4715 /* This breaks us cleanly out of the FOREACH macro */
4716 done = 1;
4718 });
4719 return done;
4722 #if GUEST_PAGING_LEVELS >= 4
4723 int sh_remove_l2_shadow(struct vcpu *v, mfn_t sl3mfn, mfn_t sl2mfn)
4724 /* Remove all mappings of this l2 shadow from this l3 shadow */
4726 shadow_l3e_t *sl3e;
4727 int done = 0;
4728 int flags;
4730 SHADOW_FOREACH_L3E(sl3mfn, sl3e, 0, done,
4732 flags = shadow_l3e_get_flags(*sl3e);
4733 if ( (flags & _PAGE_PRESENT)
4734 && (mfn_x(shadow_l3e_get_mfn(*sl3e)) == mfn_x(sl2mfn)) )
4736 (void) shadow_set_l3e(v, sl3e, shadow_l3e_empty(), sl3mfn);
4737 if ( mfn_to_shadow_page(sl2mfn)->type == 0 )
4738 /* This breaks us cleanly out of the FOREACH macro */
4739 done = 1;
4741 });
4742 return done;
4745 int sh_remove_l3_shadow(struct vcpu *v, mfn_t sl4mfn, mfn_t sl3mfn)
4746 /* Remove all mappings of this l3 shadow from this l4 shadow */
4748 shadow_l4e_t *sl4e;
4749 int done = 0;
4750 int flags;
4752 SHADOW_FOREACH_L4E(sl4mfn, sl4e, 0, done, v->domain,
4754 flags = shadow_l4e_get_flags(*sl4e);
4755 if ( (flags & _PAGE_PRESENT)
4756 && (mfn_x(shadow_l4e_get_mfn(*sl4e)) == mfn_x(sl3mfn)) )
4758 (void) shadow_set_l4e(v, sl4e, shadow_l4e_empty(), sl4mfn);
4759 if ( mfn_to_shadow_page(sl3mfn)->type == 0 )
4760 /* This breaks us cleanly out of the FOREACH macro */
4761 done = 1;
4763 });
4764 return done;
4766 #endif /* 64bit guest */
4768 /**************************************************************************/
4769 /* Handling HVM guest writes to pagetables */
4771 /* Translate a VA to an MFN, injecting a page-fault if we fail */
4772 #define BAD_GVA_TO_GFN (~0UL)
4773 #define BAD_GFN_TO_MFN (~1UL)
4774 #define READONLY_GFN (~2UL)
4775 static mfn_t emulate_gva_to_mfn(struct vcpu *v,
4776 unsigned long vaddr,
4777 struct sh_emulate_ctxt *sh_ctxt)
4779 unsigned long gfn;
4780 mfn_t mfn;
4781 p2m_type_t p2mt;
4782 uint32_t pfec = PFEC_page_present | PFEC_write_access;
4784 /* Translate the VA to a GFN */
4785 gfn = sh_gva_to_gfn(v, vaddr, &pfec);
4786 if ( gfn == INVALID_GFN )
4788 if ( is_hvm_vcpu(v) )
4789 hvm_inject_exception(TRAP_page_fault, pfec, vaddr);
4790 else
4791 propagate_page_fault(vaddr, pfec);
4792 return _mfn(BAD_GVA_TO_GFN);
4795 /* Translate the GFN to an MFN */
4796 mfn = gfn_to_mfn(v->domain, _gfn(gfn), &p2mt);
4797 if ( p2mt == p2m_ram_ro )
4798 return _mfn(READONLY_GFN);
4799 if ( !p2m_is_ram(p2mt) )
4800 return _mfn(BAD_GFN_TO_MFN);
4802 ASSERT(mfn_valid(mfn));
4803 v->arch.paging.last_write_was_pt = !!sh_mfn_is_a_page_table(mfn);
4804 return mfn;
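/* A minimal sketch of the error convention above: BAD_GVA_TO_GFN,
 * BAD_GFN_TO_MFN and READONLY_GFN are values that can never be valid MFNs,
 * so a caller filters with mfn_valid() first and only then distinguishes
 * the failure kinds -- exactly the pattern emulate_map_dest() uses below.
 * The mapping of sentinels to X86EMUL_* results here is illustrative. */
#if 0
static int example_classify_translation(mfn_t mfn)
{
    if ( mfn_valid(mfn) )
        return X86EMUL_OKAY;
    if ( mfn_x(mfn) == BAD_GVA_TO_GFN )
        return X86EMUL_EXCEPTION;    /* a page fault has been injected */
    if ( mfn_x(mfn) == READONLY_GFN )
        return X86EMUL_OKAY;         /* silently drop the write */
    return X86EMUL_UNHANDLEABLE;
}
#endif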
4807 /* Check that the guest is allowed to perform this write. Returns a mapped
4808 * pointer to write to, or a MAPPING_* error code (see emulate_map_dest_failed). */
4809 #define MAPPING_UNHANDLEABLE ((void *)(unsigned long)X86EMUL_UNHANDLEABLE)
4810 #define MAPPING_EXCEPTION ((void *)(unsigned long)X86EMUL_EXCEPTION)
4811 #define MAPPING_SILENT_FAIL ((void *)(unsigned long)X86EMUL_OKAY)
4812 #define emulate_map_dest_failed(rc) ((unsigned long)(rc) <= 3)
4813 static void *emulate_map_dest(struct vcpu *v,
4814 unsigned long vaddr,
4815 u32 bytes,
4816 struct sh_emulate_ctxt *sh_ctxt)
4818 unsigned long offset;
4819 void *map = NULL;
4821 sh_ctxt->mfn1 = emulate_gva_to_mfn(v, vaddr, sh_ctxt);
4822 if ( !mfn_valid(sh_ctxt->mfn1) )
4823 return ((mfn_x(sh_ctxt->mfn1) == BAD_GVA_TO_GFN) ?
4824 MAPPING_EXCEPTION :
4825 (mfn_x(sh_ctxt->mfn1) == READONLY_GFN) ?
4826 MAPPING_SILENT_FAIL : MAPPING_UNHANDLEABLE);
4828 #ifndef NDEBUG
4829 /* We don't emulate user-mode writes to page tables */
4830 if ( hvm_get_seg_reg(x86_seg_ss, sh_ctxt)->attr.fields.dpl == 3 )
4832 gdprintk(XENLOG_DEBUG, "User-mode write to pagetable reached "
4833 "emulate_map_dest(). This should never happen!\n");
4834 return MAPPING_UNHANDLEABLE;
4836 #endif
4838 /* An unaligned write probably means this isn't a pagetable */
4839 if ( vaddr & (bytes - 1) )
4840 sh_remove_shadows(v, sh_ctxt->mfn1, 0, 0 /* Slow, can fail */ );
4842 if ( likely(((vaddr + bytes - 1) & PAGE_MASK) == (vaddr & PAGE_MASK)) )
4844 /* Whole write fits on a single page */
4845 sh_ctxt->mfn2 = _mfn(INVALID_MFN);
4846 map = sh_map_domain_page(sh_ctxt->mfn1) + (vaddr & ~PAGE_MASK);
4848 else
4850 /* Cross-page emulated writes are only supported for HVM guests;
4851 * PV guests ought to know better */
4852 if ( !is_hvm_vcpu(v) )
4853 return MAPPING_UNHANDLEABLE;
4855 /* This write crosses a page boundary. Translate the second page */
4856 sh_ctxt->mfn2 = emulate_gva_to_mfn(v, (vaddr + bytes - 1) & PAGE_MASK,
4857 sh_ctxt);
4858 if ( !mfn_valid(sh_ctxt->mfn2) )
4859 return ((mfn_x(sh_ctxt->mfn2) == BAD_GVA_TO_GFN) ?
4860 MAPPING_EXCEPTION :
4861 (mfn_x(sh_ctxt->mfn2) == READONLY_GFN) ?
4862 MAPPING_SILENT_FAIL : MAPPING_UNHANDLEABLE);
4864 /* A cross-page write probably means this isn't a pagetable */
4865 sh_remove_shadows(v, sh_ctxt->mfn2, 0, 0 /* Slow, can fail */ );
4867 /* Hack: we map the pages into the vcpu's LDT space, since we
4868 * know that we're not going to need the LDT for HVM guests,
4869 * and only HVM guests are allowed unaligned writes. */
4870 ASSERT(is_hvm_vcpu(v));
4871 map = (void *)LDT_VIRT_START(v);
4872 offset = l1_linear_offset((unsigned long) map);
4873 l1e_write(&__linear_l1_table[offset],
4874 l1e_from_pfn(mfn_x(sh_ctxt->mfn1), __PAGE_HYPERVISOR));
4875 l1e_write(&__linear_l1_table[offset + 1],
4876 l1e_from_pfn(mfn_x(sh_ctxt->mfn2), __PAGE_HYPERVISOR));
4877 flush_tlb_local();
4878 map += (vaddr & ~PAGE_MASK);
4881 #if (SHADOW_OPTIMIZATIONS & SHOPT_SKIP_VERIFY)
4882 /* Remember if the bottom bit was clear, so we can choose not to run
4883 * the change through the verify code if it's still clear afterwards */
4884 sh_ctxt->low_bit_was_clear = map != NULL && !(*(u8 *)map & _PAGE_PRESENT);
4885 #endif
4887 return map;
4890 /* Tidy up after the emulated write: mark pages dirty, verify the new
4891 * contents, and undo the mapping */
4892 static void emulate_unmap_dest(struct vcpu *v,
4893 void *addr,
4894 u32 bytes,
4895 struct sh_emulate_ctxt *sh_ctxt)
4897 u32 b1 = bytes, b2 = 0, shflags;
4899 ASSERT(mfn_valid(sh_ctxt->mfn1));
4901 /* If we are writing lots of PTE-aligned zeros, we might want to unshadow */
4902 if ( likely(bytes >= 4)
4903 && (*(u32 *)addr == 0)
4904 && ((unsigned long) addr & ((sizeof (guest_intpte_t)) - 1)) == 0 )
4905 check_for_early_unshadow(v, sh_ctxt->mfn1);
4906 else
4907 reset_early_unshadow(v);
4909 /* We can avoid re-verifying the page contents after the write if:
4910 * - the write was no larger than a PTE of this pagetable type;
4911 * - it was aligned to the PTE boundaries; and
4912 * - _PAGE_PRESENT was clear before and after the write. */
4913 shflags = mfn_to_page(sh_ctxt->mfn1)->shadow_flags;
4914 #if (SHADOW_OPTIMIZATIONS & SHOPT_SKIP_VERIFY)
4915 if ( sh_ctxt->low_bit_was_clear
4916 && !(*(u8 *)addr & _PAGE_PRESENT)
4917 && ((!(shflags & SHF_32)
4918 /* Not shadowed 32-bit: aligned 64-bit writes that leave
4919 * the present bit unset are safe to ignore. */
4920 && ((unsigned long)addr & 7) == 0
4921 && bytes <= 8)
4922 ||
4923 (!(shflags & (SHF_PAE|SHF_64))
4924 /* Not shadowed PAE/64-bit: aligned 32-bit writes that
4925 * leave the present bit unset are safe to ignore. */
4926 && ((unsigned long)addr & 3) == 0
4927 && bytes <= 4)) )
4929 /* Writes with this alignment constraint can't possibly cross pages */
4930 ASSERT(!mfn_valid(sh_ctxt->mfn2));
4932 else
4933 #endif /* SHADOW_OPTIMIZATIONS & SHOPT_SKIP_VERIFY */
4935 if ( unlikely(mfn_valid(sh_ctxt->mfn2)) )
4937 /* Validate as two writes, one to each page */
4938 b1 = PAGE_SIZE - (((unsigned long)addr) & ~PAGE_MASK);
4939 b2 = bytes - b1;
4940 ASSERT(b2 < bytes);
4942 if ( likely(b1 > 0) )
4943 sh_validate_guest_pt_write(v, sh_ctxt->mfn1, addr, b1);
4944 if ( unlikely(b2 > 0) )
4945 sh_validate_guest_pt_write(v, sh_ctxt->mfn2, addr + b1, b2);
4948 paging_mark_dirty(v->domain, mfn_x(sh_ctxt->mfn1));
4950 if ( unlikely(mfn_valid(sh_ctxt->mfn2)) )
4952 unsigned long offset;
4953 paging_mark_dirty(v->domain, mfn_x(sh_ctxt->mfn2));
4954 /* Undo the hacky two-frame contiguous map. */
4955 ASSERT(((unsigned long) addr & PAGE_MASK) == LDT_VIRT_START(v));
4956 offset = l1_linear_offset((unsigned long) addr);
4957 l1e_write(&__linear_l1_table[offset], l1e_empty());
4958 l1e_write(&__linear_l1_table[offset + 1], l1e_empty());
4959 flush_tlb_all();
4961 else
4962 sh_unmap_domain_page(addr);
4964 atomic_inc(&v->domain->arch.paging.shadow.gtable_dirty_version);
4967 static int
4968 sh_x86_emulate_write(struct vcpu *v, unsigned long vaddr, void *src,
4969 u32 bytes, struct sh_emulate_ctxt *sh_ctxt)
4971 void *addr;
4973 /* Unaligned writes are only acceptable on HVM */
4974 if ( (vaddr & (bytes - 1)) && !is_hvm_vcpu(v) )
4975 return X86EMUL_UNHANDLEABLE;
4977 addr = emulate_map_dest(v, vaddr, bytes, sh_ctxt);
4978 if ( emulate_map_dest_failed(addr) )
4979 return (long)addr;
4981 shadow_lock(v->domain);
4982 memcpy(addr, src, bytes);
4984 if ( tb_init_done )
4986 #if GUEST_PAGING_LEVELS == 3
4987 if ( vaddr == this_cpu(trace_emulate_initial_va) )
4988 memcpy(&this_cpu(trace_emulate_write_val), src, bytes);
4989 else if ( (vaddr & ~(0x7UL)) == this_cpu(trace_emulate_initial_va) )
4991 TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_EMULATE_FULL_PT);
4992 memcpy(&this_cpu(trace_emulate_write_val),
4993 (void *)(((unsigned long) addr) & ~(0x7UL)), GUEST_PTE_SIZE);
4995 #else
4996 memcpy(&this_cpu(trace_emulate_write_val), src, bytes);
4997 #endif
5000 emulate_unmap_dest(v, addr, bytes, sh_ctxt);
5001 shadow_audit_tables(v);
5002 shadow_unlock(v->domain);
5003 return X86EMUL_OKAY;
5006 static int
5007 sh_x86_emulate_cmpxchg(struct vcpu *v, unsigned long vaddr,
5008 unsigned long old, unsigned long new,
5009 unsigned int bytes, struct sh_emulate_ctxt *sh_ctxt)
5011 void *addr;
5012 unsigned long prev;
5013 int rv = X86EMUL_OKAY;
5015 /* Unaligned writes are only acceptable on HVM */
5016 if ( (vaddr & (bytes - 1)) && !is_hvm_vcpu(v) )
5017 return X86EMUL_UNHANDLEABLE;
5019 addr = emulate_map_dest(v, vaddr, bytes, sh_ctxt);
5020 if ( emulate_map_dest_failed(addr) )
5021 return (long)addr;
5023 shadow_lock(v->domain);
5024 switch ( bytes )
5026 case 1: prev = cmpxchg(((u8 *)addr), old, new); break;
5027 case 2: prev = cmpxchg(((u16 *)addr), old, new); break;
5028 case 4: prev = cmpxchg(((u32 *)addr), old, new); break;
5029 case 8: prev = cmpxchg(((u64 *)addr), old, new); break;
5030 default:
5031 SHADOW_PRINTK("cmpxchg of size %i is not supported\n", bytes);
5032 prev = ~old;
5035 if ( prev != old )
5036 rv = X86EMUL_CMPXCHG_FAILED;
5038 SHADOW_DEBUG(EMULATE, "va %#lx was %#lx expected %#lx"
5039 " wanted %#lx now %#lx bytes %u\n",
5040 vaddr, prev, old, new, *(unsigned long *)addr, bytes);
5042 emulate_unmap_dest(v, addr, bytes, sh_ctxt);
5043 shadow_audit_tables(v);
5044 shadow_unlock(v->domain);
5045 return rv;
5048 #ifdef __i386__
5049 static int
5050 sh_x86_emulate_cmpxchg8b(struct vcpu *v, unsigned long vaddr,
5051 unsigned long old_lo, unsigned long old_hi,
5052 unsigned long new_lo, unsigned long new_hi,
5053 struct sh_emulate_ctxt *sh_ctxt)
5055 void *addr;
5056 u64 old, new, prev;
5057 int rv = X86EMUL_OKAY;
5059 /* Unaligned writes are only acceptable on HVM */
5060 if ( (vaddr & 7) && !is_hvm_vcpu(v) )
5061 return X86EMUL_UNHANDLEABLE;
5063 addr = emulate_map_dest(v, vaddr, 8, sh_ctxt);
5064 if ( emulate_map_dest_failed(addr) )
5065 return (long)addr;
5067 old = (((u64) old_hi) << 32) | (u64) old_lo;
5068 new = (((u64) new_hi) << 32) | (u64) new_lo;
5070 shadow_lock(v->domain);
5071 prev = cmpxchg(((u64 *)addr), old, new);
5073 if ( prev != old )
5074 rv = X86EMUL_CMPXCHG_FAILED;
5076 emulate_unmap_dest(v, addr, 8, sh_ctxt);
5077 shadow_audit_tables(v);
5078 shadow_unlock(v->domain);
5079 return rv;
5081 #endif
5083 /**************************************************************************/
5084 /* Audit tools */
5086 #if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES
5088 #define AUDIT_FAIL(_level, _fmt, _a...) do { \
5089 printk("Shadow %u-on-%u audit failed at level %i, index %i\n" \
5090 "gl" #_level "mfn = %" PRI_mfn \
5091 " sl" #_level "mfn = %" PRI_mfn \
5092 " &gl" #_level "e = %p &sl" #_level "e = %p" \
5093 " gl" #_level "e = %" SH_PRI_gpte \
5094 " sl" #_level "e = %" SH_PRI_pte "\nError: " _fmt "\n", \
5095 GUEST_PAGING_LEVELS, SHADOW_PAGING_LEVELS, \
5096 _level, guest_index(gl ## _level ## e), \
5097 mfn_x(gl ## _level ## mfn), mfn_x(sl ## _level ## mfn), \
5098 gl ## _level ## e, sl ## _level ## e, \
5099 gl ## _level ## e->l ## _level, sl ## _level ## e->l ## _level, \
5100 ##_a); \
5101 BUG(); \
5102 done = 1; \
5103 } while (0)
5105 #define AUDIT_FAIL_MIN(_level, _fmt, _a...) do { \
5106 printk("Shadow %u-on-%u audit failed at level %i\n" \
5107 "gl" #_level "mfn = %" PRI_mfn \
5108 " sl" #_level "mfn = %" PRI_mfn \
5109 " Error: " _fmt "\n", \
5110 GUEST_PAGING_LEVELS, SHADOW_PAGING_LEVELS, \
5111 _level, \
5112 mfn_x(gl ## _level ## mfn), mfn_x(sl ## _level ## mfn), \
5113 ##_a); \
5114 BUG(); \
5115 done = 1; \
5116 } while (0)
5118 static char * sh_audit_flags(struct vcpu *v, int level,
5119 int gflags, int sflags)
5120 /* Common code for auditing flag bits */
5122 if ( (sflags & _PAGE_PRESENT) && !(gflags & _PAGE_PRESENT) )
5123 return "shadow is present but guest is not present";
5124 if ( (sflags & _PAGE_GLOBAL) && !is_hvm_vcpu(v) )
5125 return "global bit set in PV shadow";
5126 if ( level == 2 && (sflags & _PAGE_PSE) )
5127 return "PS bit set in shadow";
5128 #if SHADOW_PAGING_LEVELS == 3
5129 if ( level == 3 ) return NULL; /* All the other bits are blank in a PAE l3 */
5130 #endif
5131 if ( (sflags & _PAGE_PRESENT) && !(gflags & _PAGE_ACCESSED) )
5132 return "accessed bit not propagated";
5133 if ( (level == 1 || (level == 2 && (gflags & _PAGE_PSE)))
5134 && ((sflags & _PAGE_RW) && !(gflags & _PAGE_DIRTY)) )
5135 return "dirty bit not propagated";
5136 if ( (sflags & _PAGE_USER) != (gflags & _PAGE_USER) )
5137 return "user/supervisor bit does not match";
5138 if ( (sflags & _PAGE_NX_BIT) != (gflags & _PAGE_NX_BIT) )
5139 return "NX bit does not match";
5140 if ( (sflags & _PAGE_RW) && !(gflags & _PAGE_RW) )
5141 return "shadow grants write access but guest does not";
5142 return NULL;
5145 int sh_audit_l1_table(struct vcpu *v, mfn_t sl1mfn, mfn_t x)
5147 guest_l1e_t *gl1e, *gp;
5148 shadow_l1e_t *sl1e;
5149 mfn_t mfn, gmfn, gl1mfn;
5150 gfn_t gfn;
5151 p2m_type_t p2mt;
5152 char *s;
5153 int done = 0;
5155 /* Follow the backpointer */
5156 gl1mfn = _mfn(mfn_to_shadow_page(sl1mfn)->backpointer);
5158 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
5159 /* Out-of-sync l1 shadows can contain anything: just check the OOS hash */
5160 if ( page_is_out_of_sync(mfn_to_page(gl1mfn)) )
5162 oos_audit_hash_is_present(v->domain, gl1mfn);
5163 return 0;
5165 #endif
5167 gl1e = gp = sh_map_domain_page(gl1mfn);
5168 SHADOW_FOREACH_L1E(sl1mfn, sl1e, &gl1e, done, {
5170 if ( sh_l1e_is_magic(*sl1e) )
5172 #if (SHADOW_OPTIMIZATIONS & SHOPT_FAST_FAULT_PATH)
5173 if ( sh_l1e_is_gnp(*sl1e) )
5175 if ( guest_l1e_get_flags(*gl1e) & _PAGE_PRESENT )
5176 AUDIT_FAIL(1, "shadow is GNP magic but guest is present");
5178 else
5180 ASSERT(sh_l1e_is_mmio(*sl1e));
5181 gfn = sh_l1e_mmio_get_gfn(*sl1e);
5182 if ( gfn_x(gfn) != gfn_x(guest_l1e_get_gfn(*gl1e)) )
5183 AUDIT_FAIL(1, "shadow MMIO gfn is %" SH_PRI_gfn
5184 " but guest gfn is %" SH_PRI_gfn,
5185 gfn_x(gfn),
5186 gfn_x(guest_l1e_get_gfn(*gl1e)));
5188 #endif
5190 else
5192 s = sh_audit_flags(v, 1, guest_l1e_get_flags(*gl1e),
5193 shadow_l1e_get_flags(*sl1e));
5194 if ( s ) AUDIT_FAIL(1, "%s", s);
5196 if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS )
5198 gfn = guest_l1e_get_gfn(*gl1e);
5199 mfn = shadow_l1e_get_mfn(*sl1e);
5200 gmfn = gfn_to_mfn(v->domain, gfn, &p2mt);
5201 if ( mfn_x(gmfn) != mfn_x(mfn) )
5202 AUDIT_FAIL(1, "bad translation: gfn %" SH_PRI_gfn
5203 " --> %" PRI_mfn " != mfn %" PRI_mfn,
5204 gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn));
5207 });
5208 sh_unmap_domain_page(gp);
5209 return done;
5212 int sh_audit_fl1_table(struct vcpu *v, mfn_t sl1mfn, mfn_t x)
5214 guest_l1e_t *gl1e, e;
5215 shadow_l1e_t *sl1e;
5216 mfn_t gl1mfn = _mfn(INVALID_MFN);
5217 int f;
5218 int done = 0;
5220 /* fl1 has no useful backpointer: all we can check are flags */
5221 e = guest_l1e_from_gfn(_gfn(0), 0); gl1e = &e; /* Needed for macro */
5222 SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, done, {
5223 f = shadow_l1e_get_flags(*sl1e);
5224 f &= ~(_PAGE_AVAIL0|_PAGE_AVAIL1|_PAGE_AVAIL2);
5225 if ( !(f == 0
5226 || f == (_PAGE_PRESENT|_PAGE_USER|_PAGE_RW|
5227 _PAGE_ACCESSED|_PAGE_DIRTY)
5228 || f == (_PAGE_PRESENT|_PAGE_USER|_PAGE_ACCESSED|_PAGE_DIRTY)
5229 || sh_l1e_is_magic(*sl1e)) )
5230 AUDIT_FAIL(1, "fl1e has bad flags");
5231 });
5232 return 0;
5235 int sh_audit_l2_table(struct vcpu *v, mfn_t sl2mfn, mfn_t x)
5237 guest_l2e_t *gl2e, *gp;
5238 shadow_l2e_t *sl2e;
5239 mfn_t mfn, gmfn, gl2mfn;
5240 gfn_t gfn;
5241 p2m_type_t p2mt;
5242 char *s;
5243 int done = 0;
5245 /* Follow the backpointer */
5246 gl2mfn = _mfn(mfn_to_shadow_page(sl2mfn)->backpointer);
5248 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
5249 /* Only L1's may be out of sync. */
5250 if ( page_is_out_of_sync(mfn_to_page(gl2mfn)) )
5251 AUDIT_FAIL_MIN(2, "gmfn %lx is out of sync", mfn_x(gl2mfn));
5252 #endif
5254 gl2e = gp = sh_map_domain_page(gl2mfn);
5255 SHADOW_FOREACH_L2E(sl2mfn, sl2e, &gl2e, done, v->domain, {
5257 s = sh_audit_flags(v, 2, guest_l2e_get_flags(*gl2e),
5258 shadow_l2e_get_flags(*sl2e));
5259 if ( s ) AUDIT_FAIL(2, "%s", s);
5261 if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS )
5263 gfn = guest_l2e_get_gfn(*gl2e);
5264 mfn = shadow_l2e_get_mfn(*sl2e);
5265 gmfn = (guest_l2e_get_flags(*gl2e) & _PAGE_PSE)
5266 ? get_fl1_shadow_status(v, gfn)
5267 : get_shadow_status(v, gfn_to_mfn(v->domain, gfn, &p2mt),
5268 SH_type_l1_shadow);
5269 if ( mfn_x(gmfn) != mfn_x(mfn) )
5270 AUDIT_FAIL(2, "bad translation: gfn %" SH_PRI_gfn
5271 " (--> %" PRI_mfn ")"
5272 " --> %" PRI_mfn " != mfn %" PRI_mfn,
5273 gfn_x(gfn),
5274 (guest_l2e_get_flags(*gl2e) & _PAGE_PSE) ? 0
5275 : mfn_x(gfn_to_mfn(v->domain, gfn, &p2mt)),
5276 mfn_x(gmfn), mfn_x(mfn));
5278 });
5279 sh_unmap_domain_page(gp);
5280 return 0;
5283 #if GUEST_PAGING_LEVELS >= 4
5284 int sh_audit_l3_table(struct vcpu *v, mfn_t sl3mfn, mfn_t x)
5286 guest_l3e_t *gl3e, *gp;
5287 shadow_l3e_t *sl3e;
5288 mfn_t mfn, gmfn, gl3mfn;
5289 gfn_t gfn;
5290 p2m_type_t p2mt;
5291 char *s;
5292 int done = 0;
5294 /* Follow the backpointer */
5295 gl3mfn = _mfn(mfn_to_shadow_page(sl3mfn)->backpointer);
5297 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
5298 /* Only L1's may be out of sync. */
5299 if ( page_is_out_of_sync(mfn_to_page(gl3mfn)) )
5300 AUDIT_FAIL_MIN(3, "gmfn %lx is out of sync", mfn_x(gl3mfn));
5301 #endif
5303 gl3e = gp = sh_map_domain_page(gl3mfn);
5304 SHADOW_FOREACH_L3E(sl3mfn, sl3e, &gl3e, done, {
5306 s = sh_audit_flags(v, 3, guest_l3e_get_flags(*gl3e),
5307 shadow_l3e_get_flags(*sl3e));
5308 if ( s ) AUDIT_FAIL(3, "%s", s);
5310 if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS )
5312 gfn = guest_l3e_get_gfn(*gl3e);
5313 mfn = shadow_l3e_get_mfn(*sl3e);
5314 gmfn = get_shadow_status(v, gfn_to_mfn(v->domain, gfn, &p2mt),
5315 ((GUEST_PAGING_LEVELS == 3 ||
5316 is_pv_32on64_vcpu(v))
5317 && !shadow_mode_external(v->domain)
5318 && (guest_index(gl3e) % 4) == 3)
5319 ? SH_type_l2h_shadow
5320 : SH_type_l2_shadow);
5321 if ( mfn_x(gmfn) != mfn_x(mfn) )
5322 AUDIT_FAIL(3, "bad translation: gfn %" SH_PRI_gfn
5323 " --> %" PRI_mfn " != mfn %" PRI_mfn,
5324 gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn));
5326 });
5327 sh_unmap_domain_page(gp);
5328 return 0;
5331 int sh_audit_l4_table(struct vcpu *v, mfn_t sl4mfn, mfn_t x)
5333 guest_l4e_t *gl4e, *gp;
5334 shadow_l4e_t *sl4e;
5335 mfn_t mfn, gmfn, gl4mfn;
5336 gfn_t gfn;
5337 p2m_type_t p2mt;
5338 char *s;
5339 int done = 0;
5341 /* Follow the backpointer */
5342 gl4mfn = _mfn(mfn_to_shadow_page(sl4mfn)->backpointer);
5344 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
5345 /* Only L1's may be out of sync. */
5346 if ( page_is_out_of_sync(mfn_to_page(gl4mfn)) )
5347 AUDIT_FAIL_MIN(4, "gmfn %lx is out of sync", mfn_x(gl4mfn));
5348 #endif
5350 gl4e = gp = sh_map_domain_page(gl4mfn);
5351 SHADOW_FOREACH_L4E(sl4mfn, sl4e, &gl4e, done, v->domain,
5353 s = sh_audit_flags(v, 4, guest_l4e_get_flags(*gl4e),
5354 shadow_l4e_get_flags(*sl4e));
5355 if ( s ) AUDIT_FAIL(4, "%s", s);
5357 if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS )
5359 gfn = guest_l4e_get_gfn(*gl4e);
5360 mfn = shadow_l4e_get_mfn(*sl4e);
5361 gmfn = get_shadow_status(v, gfn_to_mfn(v->domain, gfn, &p2mt),
5362 SH_type_l3_shadow);
5363 if ( mfn_x(gmfn) != mfn_x(mfn) )
5364 AUDIT_FAIL(4, "bad translation: gfn %" SH_PRI_gfn
5365 " --> %" PRI_mfn " != mfn %" PRI_mfn,
5366 gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn));
5368 });
5369 sh_unmap_domain_page(gp);
5370 return 0;
5372 #endif /* GUEST_PAGING_LEVELS >= 4 */
5375 #undef AUDIT_FAIL
5377 #endif /* Audit code */
5379 /**************************************************************************/
5380 /* Entry points into this mode of the shadow code.
5381 * This will all be mangled by the preprocessor to uniquify everything. */
5382 struct paging_mode sh_paging_mode = {
5383 .page_fault = sh_page_fault,
5384 .invlpg = sh_invlpg,
5385 .gva_to_gfn = sh_gva_to_gfn,
5386 .update_cr3 = sh_update_cr3,
5387 .update_paging_modes = shadow_update_paging_modes,
5388 .write_p2m_entry = shadow_write_p2m_entry,
5389 .write_guest_entry = shadow_write_guest_entry,
5390 .cmpxchg_guest_entry = shadow_cmpxchg_guest_entry,
5391 .guest_map_l1e = sh_guest_map_l1e,
5392 .guest_get_eff_l1e = sh_guest_get_eff_l1e,
5393 .guest_levels = GUEST_PAGING_LEVELS,
5394 .shadow.detach_old_tables = sh_detach_old_tables,
5395 .shadow.x86_emulate_write = sh_x86_emulate_write,
5396 .shadow.x86_emulate_cmpxchg = sh_x86_emulate_cmpxchg,
5397 #ifdef __i386__
5398 .shadow.x86_emulate_cmpxchg8b = sh_x86_emulate_cmpxchg8b,
5399 #endif
5400 .shadow.make_monitor_table = sh_make_monitor_table,
5401 .shadow.destroy_monitor_table = sh_destroy_monitor_table,
5402 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
5403 .shadow.guess_wrmap = sh_guess_wrmap,
5404 #endif
5405 .shadow.shadow_levels = SHADOW_PAGING_LEVELS,
5406 };
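/* A minimal sketch (hypothetical, mirroring the generic paging wrappers) of
 * how these entry points are reached: common code never calls the sh_*
 * functions in this file directly, it dispatches through the per-vcpu mode
 * pointer, which the preprocessor has bound to one GUEST/SHADOW level pair. */
#if 0
static int example_dispatch_page_fault(struct vcpu *v, unsigned long va,
                                       struct cpu_user_regs *regs)
{
    return v->arch.paging.mode->page_fault(v, va, regs);
}
#endif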
5408 /*
5409 * Local variables:
5410 * mode: C
5411 * c-set-style: "BSD"
5412 * c-basic-offset: 4
5413 * indent-tabs-mode: nil
5414 * End:
5415 */