ia64/xen-unstable

view xen/arch/x86/mm/shadow/multi.c @ 18318:b75f0b3e2a7e

x86, shadow, oos: Remove overzealous warning and simplify code.
Signed-off-by: Keir Fraser <keir.fraser@citrix.com>
author Keir Fraser <keir.fraser@citrix.com>
date Wed Aug 13 11:09:46 2008 +0100 (2008-08-13)
parents b613bf4c4289
children b1e5a0def648
line source
1 /******************************************************************************
2 * arch/x86/mm/shadow/multi.c
3 *
4 * Simple, mostly-synchronous shadow page tables.
5 * Parts of this code are Copyright (c) 2006 by XenSource Inc.
6 * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
7 * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 */
24 #include <xen/config.h>
25 #include <xen/types.h>
26 #include <xen/mm.h>
27 #include <xen/trace.h>
28 #include <xen/sched.h>
29 #include <xen/perfc.h>
30 #include <xen/domain_page.h>
31 #include <asm/page.h>
32 #include <asm/current.h>
33 #include <asm/shadow.h>
34 #include <asm/flushtlb.h>
35 #include <asm/hvm/hvm.h>
36 #include <asm/hvm/cacheattr.h>
37 #include <asm/mtrr.h>
38 #include "private.h"
39 #include "types.h"
41 /* THINGS TO DO LATER:
42 *
43 * TEARDOWN HEURISTICS
44 * Also: have a heuristic for when to destroy a previous paging-mode's
45 * shadows. When a guest is done with its start-of-day 32-bit tables
46 * and reuses the memory we want to drop those shadows. Start with
47 * shadows in a page in two modes as a hint, but beware of clever tricks
48 * like reusing a pagetable for both PAE and 64-bit during boot...
49 *
50 * PAE LINEAR MAPS
51 * Rework shadow_get_l*e() to have the option of using map_domain_page()
52 * instead of linear maps. Add appropriate unmap_l*e calls in the users.
53 * Then we can test the speed difference made by linear maps. If the
54 * map_domain_page() version is OK on PAE, we could maybe allow a lightweight
55 * l3-and-l2h-only shadow mode for PAE PV guests that would allow them
56 * to share l2h pages again.
57 *
58 * PSE disabled / PSE36
59 * We don't support any modes other than PSE enabled, PSE36 disabled.
60 * Neither of those would be hard to change, but we'd need to be able to
61 * deal with shadows made in one mode and used in another.
62 */
64 #define FETCH_TYPE_PREFETCH 1
65 #define FETCH_TYPE_DEMAND 2
66 #define FETCH_TYPE_WRITE 4
67 typedef enum {
68 ft_prefetch = FETCH_TYPE_PREFETCH,
69 ft_demand_read = FETCH_TYPE_DEMAND,
70 ft_demand_write = FETCH_TYPE_DEMAND | FETCH_TYPE_WRITE,
71 } fetch_type_t;
73 #ifdef DEBUG_TRACE_DUMP
74 static char *fetch_type_names[] = {
75 [ft_prefetch] "prefetch",
76 [ft_demand_read] "demand read",
77 [ft_demand_write] "demand write",
78 };
79 #endif
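/* Note (illustrative, not part of the original file): ft_demand_write sets
 * FETCH_TYPE_WRITE on top of FETCH_TYPE_DEMAND, so later code can detect
 * demand-write fetches with a simple mask, as in the log-dirty path below:
 *
 *     if ( ft & FETCH_TYPE_WRITE )
 *         paging_mark_dirty(d, mfn_x(target_mfn));
 */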
81 /**************************************************************************/
82 /* Hash table mapping from guest pagetables to shadows
83 *
84 * Normal case: maps the mfn of a guest page to the mfn of its shadow page.
85 * FL1's: maps the *gfn* of the start of a superpage to the mfn of a
86 * shadow L1 which maps its "splinters".
87 */
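/* Illustration (not part of the original file): the helpers below are keyed
 * differently for the two cases described above.  An ordinary guest
 * pagetable page is looked up by its machine frame and shadow type, e.g.
 * (gl1mfn here is just an example argument):
 *
 *     mfn_t smfn = get_shadow_status(v, gl1mfn, SH_type_l1_shadow);
 *
 * while an FL1 shadow of a PSE superpage is keyed by the guest frame number
 * of the start of the superpage:
 *
 *     mfn_t smfn = get_fl1_shadow_status(v, guest_l2e_get_gfn(gl2e));
 */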
89 static inline mfn_t
90 get_fl1_shadow_status(struct vcpu *v, gfn_t gfn)
91 /* Look for FL1 shadows in the hash table */
92 {
93 mfn_t smfn = shadow_hash_lookup(v, gfn_x(gfn), SH_type_fl1_shadow);
94 return smfn;
95 }
97 static inline mfn_t
98 get_shadow_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type)
99 /* Look for shadows in the hash table */
100 {
101 mfn_t smfn = shadow_hash_lookup(v, mfn_x(gmfn), shadow_type);
102 perfc_incr(shadow_get_shadow_status);
103 return smfn;
104 }
106 static inline void
107 set_fl1_shadow_status(struct vcpu *v, gfn_t gfn, mfn_t smfn)
108 /* Put an FL1 shadow into the hash table */
109 {
110 SHADOW_PRINTK("gfn=%"SH_PRI_gfn", type=%08x, smfn=%05lx\n",
111 gfn_x(gfn), SH_type_fl1_shadow, mfn_x(smfn));
113 shadow_hash_insert(v, gfn_x(gfn), SH_type_fl1_shadow, smfn);
114 }
116 static inline void
117 set_shadow_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type, mfn_t smfn)
118 /* Put a shadow into the hash table */
119 {
120 struct domain *d = v->domain;
121 int res;
123 SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx, type=%08x, smfn=%05lx\n",
124 d->domain_id, v->vcpu_id, mfn_x(gmfn),
125 shadow_type, mfn_x(smfn));
127 /* 32-on-64 PV guests don't own their l4 pages so can't get_page them */
128 if ( !is_pv_32on64_vcpu(v) || shadow_type != SH_type_l4_64_shadow )
129 {
130 res = get_page(mfn_to_page(gmfn), d);
131 ASSERT(res == 1);
132 }
134 shadow_hash_insert(v, mfn_x(gmfn), shadow_type, smfn);
135 }
137 static inline void
138 delete_fl1_shadow_status(struct vcpu *v, gfn_t gfn, mfn_t smfn)
139 /* Remove a shadow from the hash table */
140 {
141 SHADOW_PRINTK("gfn=%"SH_PRI_gfn", type=%08x, smfn=%05lx\n",
142 gfn_x(gfn), SH_type_fl1_shadow, mfn_x(smfn));
143 shadow_hash_delete(v, gfn_x(gfn), SH_type_fl1_shadow, smfn);
144 }
146 static inline void
147 delete_shadow_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type, mfn_t smfn)
148 /* Remove a shadow from the hash table */
149 {
150 SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx, type=%08x, smfn=%05lx\n",
151 v->domain->domain_id, v->vcpu_id,
152 mfn_x(gmfn), shadow_type, mfn_x(smfn));
153 shadow_hash_delete(v, mfn_x(gmfn), shadow_type, smfn);
154 /* 32-on-64 PV guests don't own their l4 pages; see set_shadow_status */
155 if ( !is_pv_32on64_vcpu(v) || shadow_type != SH_type_l4_64_shadow )
156 put_page(mfn_to_page(gmfn));
157 }
159 /**************************************************************************/
160 /* CPU feature support querying */
162 static inline int
163 guest_supports_superpages(struct vcpu *v)
164 {
165 /* The _PAGE_PSE bit must be honoured in HVM guests, whenever
166 * CR4.PSE is set or the guest is in PAE or long mode.
167 * It's also used in the dummy PT for vcpus with CR0.PG cleared. */
168 return (is_hvm_vcpu(v) &&
169 (GUEST_PAGING_LEVELS != 2
170 || !hvm_paging_enabled(v)
171 || (v->arch.hvm_vcpu.guest_cr[4] & X86_CR4_PSE)));
172 }
174 static inline int
175 guest_supports_nx(struct vcpu *v)
176 {
177 if ( GUEST_PAGING_LEVELS == 2 || !cpu_has_nx )
178 return 0;
179 if ( !is_hvm_vcpu(v) )
180 return cpu_has_nx;
181 return hvm_nx_enabled(v);
182 }
185 /**************************************************************************/
186 /* Functions for walking the guest page tables */
188 /* Flags that are needed in a pagetable entry, with the sense of NX inverted */
189 static uint32_t mandatory_flags(struct vcpu *v, uint32_t pfec)
190 {
191 static uint32_t flags[] = {
192 /* I/F - Usr Wr */
193 /* 0 0 0 0 */ _PAGE_PRESENT,
194 /* 0 0 0 1 */ _PAGE_PRESENT|_PAGE_RW,
195 /* 0 0 1 0 */ _PAGE_PRESENT|_PAGE_USER,
196 /* 0 0 1 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER,
197 /* 0 1 0 0 */ _PAGE_PRESENT,
198 /* 0 1 0 1 */ _PAGE_PRESENT|_PAGE_RW,
199 /* 0 1 1 0 */ _PAGE_PRESENT|_PAGE_USER,
200 /* 0 1 1 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER,
201 /* 1 0 0 0 */ _PAGE_PRESENT|_PAGE_NX_BIT,
202 /* 1 0 0 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_NX_BIT,
203 /* 1 0 1 0 */ _PAGE_PRESENT|_PAGE_USER|_PAGE_NX_BIT,
204 /* 1 0 1 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX_BIT,
205 /* 1 1 0 0 */ _PAGE_PRESENT|_PAGE_NX_BIT,
206 /* 1 1 0 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_NX_BIT,
207 /* 1 1 1 0 */ _PAGE_PRESENT|_PAGE_USER|_PAGE_NX_BIT,
208 /* 1 1 1 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX_BIT,
209 };
211 /* Don't demand not-NX if the CPU wouldn't enforce it. */
212 if ( !guest_supports_nx(v) )
213 pfec &= ~PFEC_insn_fetch;
215 /* Don't demand R/W if the CPU wouldn't enforce it. */
216 if ( is_hvm_vcpu(v) && unlikely(!hvm_wp_enabled(v))
217 && !(pfec & PFEC_user_mode) )
218 pfec &= ~PFEC_write_access;
220 return flags[(pfec & 0x1f) >> 1];
221 }
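/* Worked example (illustrative, not part of the original file): a write
 * fault from user mode has pfec = PFEC_user_mode|PFEC_write_access = 0x6,
 * so the table index is (0x6 & 0x1f) >> 1 = 3 and the walk must find
 * _PAGE_PRESENT|_PAGE_RW|_PAGE_USER in every entry.  An instruction fetch
 * (PFEC_insn_fetch = 0x10) selects a row demanding _PAGE_NX_BIT, which
 * really means "executable" since the NX sense is inverted here. */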
223 /* Modify a guest pagetable entry to set the Accessed and Dirty bits.
224 * Returns non-zero if it actually writes to guest memory. */
225 static uint32_t set_ad_bits(void *guest_p, void *walk_p, int set_dirty)
226 {
227 guest_intpte_t old, new;
229 old = *(guest_intpte_t *)walk_p;
230 new = old | _PAGE_ACCESSED | (set_dirty ? _PAGE_DIRTY : 0);
231 if ( old != new )
232 {
233 /* Write the new entry into the walk, and try to write it back
234 * into the guest table as well. If the guest table has changed
235 * under our feet then leave it alone. */
236 *(guest_intpte_t *)walk_p = new;
237 if ( cmpxchg(((guest_intpte_t *)guest_p), old, new) == old )
238 return 1;
239 }
240 return 0;
241 }
243 /* This validation is called with the shadow lock held and after write
244 * permission has been removed, so the check is atomic: no further
245 * inconsistent content can be observed before the lock is released.
246 *
247 * Returns 1 to indicate success and 0 for inconsistency.
248 */
249 static inline uint32_t
250 shadow_check_gwalk(struct vcpu *v, unsigned long va, walk_t *gw)
251 {
252 struct domain *d = v->domain;
253 guest_l1e_t *l1p;
254 guest_l2e_t *l2p;
255 #if GUEST_PAGING_LEVELS >= 4
256 guest_l3e_t *l3p;
257 guest_l4e_t *l4p;
258 #endif
259 int mismatch = 0;
261 ASSERT(shadow_locked_by_me(d));
263 if ( gw->version ==
264 atomic_read(&d->arch.paging.shadow.gtable_dirty_version) )
265 return 1;
267 /* We could cache the guest page mappings from the last
268 * guest table walk. However, this check happens relatively
269 * infrequently, so paying the small cost of remapping the guest
270 * pages here is better than caching a mapping on every
271 * guest table walk.
272 *
273 * Also, when an inconsistency is found, we simply return and let
274 * another fault be triggered rather than re-validating the new
275 * path, which keeps the logic simple.
276 */
277 perfc_incr(shadow_check_gwalk);
278 #if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
279 #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
280 l4p = (guest_l4e_t *)v->arch.paging.shadow.guest_vtable;
281 mismatch |= (gw->l4e.l4 != l4p[guest_l4_table_offset(va)].l4);
282 l3p = sh_map_domain_page(gw->l3mfn);
283 mismatch |= (gw->l3e.l3 != l3p[guest_l3_table_offset(va)].l3);
284 sh_unmap_domain_page(l3p);
285 #else
286 mismatch |= (gw->l3e.l3 !=
287 v->arch.paging.shadow.gl3e[guest_l3_table_offset(va)].l3);
288 #endif
289 l2p = sh_map_domain_page(gw->l2mfn);
290 mismatch |= (gw->l2e.l2 != l2p[guest_l2_table_offset(va)].l2);
291 sh_unmap_domain_page(l2p);
292 #else
293 l2p = (guest_l2e_t *)v->arch.paging.shadow.guest_vtable;
294 mismatch |= (gw->l2e.l2 != l2p[guest_l2_table_offset(va)].l2);
295 #endif
296 if ( !(guest_supports_superpages(v) &&
297 (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE)) )
298 {
299 l1p = sh_map_domain_page(gw->l1mfn);
300 mismatch |= (gw->l1e.l1 != l1p[guest_l1_table_offset(va)].l1);
301 sh_unmap_domain_page(l1p);
302 }
304 return !mismatch;
305 }
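/* Sketch of the intended use (illustrative, not part of the original file):
 * the page-fault handler re-checks its lock-free walk once it holds the
 * shadow lock, and backs out so the fault is simply re-taken if the guest
 * tables changed underneath it:
 *
 *     shadow_lock(d);
 *     if ( !shadow_check_gwalk(v, va, &gw) )
 *     {
 *         shadow_unlock(d);
 *         goto rewalk;    // hypothetical label: redo the guest walk
 *     }
 */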
307 /* Remove write access permissions from a gwalk_t in a batch, and
308 * return an OR-ed set of flags indicating whether a TLB flush is
309 * needed and whether the guest pages must be re-walked.
310 *
311 * Syncing pages will remove write access to that page; but it may
312 * also give write access to other pages in the path. If we resync any
313 * pages, re-walk from the beginning.
314 */
315 #define GW_RMWR_FLUSHTLB 1
316 #define GW_RMWR_REWALK 2
318 static inline uint32_t
319 gw_remove_write_accesses(struct vcpu *v, unsigned long va, walk_t *gw)
320 {
321 uint32_t rc = 0;
323 #if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
324 #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
325 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
326 if ( mfn_is_out_of_sync(gw->l3mfn) )
327 {
328 sh_resync(v, gw->l3mfn);
329 rc = GW_RMWR_REWALK;
330 }
331 else
332 #endif /* OOS */
333 if ( sh_remove_write_access(v, gw->l3mfn, 3, va) )
334 rc = GW_RMWR_FLUSHTLB;
335 #endif /* GUEST_PAGING_LEVELS >= 4 */
337 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
338 if ( mfn_is_out_of_sync(gw->l2mfn) )
339 {
340 sh_resync(v, gw->l2mfn);
341 rc |= GW_RMWR_REWALK;
342 }
343 else
344 #endif /* OOS */
345 if ( sh_remove_write_access(v, gw->l2mfn, 2, va) )
346 rc |= GW_RMWR_FLUSHTLB;
347 #endif /* GUEST_PAGING_LEVELS >= 3 */
349 if ( !(guest_supports_superpages(v) &&
350 (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE))
351 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
352 && !mfn_is_out_of_sync(gw->l1mfn)
353 #endif /* OOS */
354 && sh_remove_write_access(v, gw->l1mfn, 1, va) )
355 rc |= GW_RMWR_FLUSHTLB;
357 return rc;
358 }
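/* Sketch of how the result is consumed (illustrative, not part of the
 * original file): a caller tests each returned bit separately, e.g.
 *
 *     uint32_t rc = gw_remove_write_accesses(v, va, &gw);
 *     if ( rc & GW_RMWR_REWALK )
 *         goto rewalk;    // hypothetical label: a resync may have given
 *                         // write access back to pages in the path
 *     if ( rc & GW_RMWR_FLUSHTLB )
 *         flush_tlb_mask(d->domain_dirty_cpumask);
 */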
360 /* Walk the guest pagetables, after the manner of a hardware walker.
361 *
362 * Inputs: a vcpu, a virtual address, a walk_t to fill, a
363 * pointer to a pagefault code
364 *
365 * We walk the vcpu's guest pagetables, filling the walk_t with what we
366 * see and adding any Accessed and Dirty bits that are needed in the
367 * guest entries. Using the pagefault code, we check the permissions as
368 * we go. For the purposes of reading pagetables we treat all non-RAM
369 * memory as containing zeroes.
370 *
371 * The walk is done in a lock-free style, with some sanity checks postponed
372 * until after the shadow lock is taken. Those delayed checks make sure that
373 * no inconsistent mapping is translated into the shadow page table.
374 *
375 * Returns 0 for success, or the set of permission bits that we failed on
376 * if the walk did not complete.
377 * N.B. This is different from the old return code but almost no callers
378 * checked the old return code anyway.
379 */
380 static uint32_t
381 guest_walk_tables(struct vcpu *v, unsigned long va, walk_t *gw, uint32_t pfec)
382 {
383 struct domain *d = v->domain;
384 p2m_type_t p2mt;
385 guest_l1e_t *l1p = NULL;
386 guest_l2e_t *l2p = NULL;
387 #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
388 guest_l3e_t *l3p = NULL;
389 guest_l4e_t *l4p;
390 #endif
391 uint32_t gflags, mflags, rc = 0;
392 int pse;
394 perfc_incr(shadow_guest_walk);
395 memset(gw, 0, sizeof(*gw));
396 gw->va = va;
398 gw->version = atomic_read(&d->arch.paging.shadow.gtable_dirty_version);
399 rmb();
401 /* Mandatory bits that must be set in every entry. We invert NX, to
402 * calculate as if there were an "X" bit that allowed access.
403 * We will accumulate, in rc, the set of flags that are missing. */
404 mflags = mandatory_flags(v, pfec);
406 #if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
407 #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
409 /* Get the l4e from the top level table and check its flags */
410 gw->l4mfn = pagetable_get_mfn(v->arch.guest_table);
411 l4p = ((guest_l4e_t *)v->arch.paging.shadow.guest_vtable);
412 gw->l4e = l4p[guest_l4_table_offset(va)];
413 gflags = guest_l4e_get_flags(gw->l4e) ^ _PAGE_NX_BIT;
414 rc |= ((gflags & mflags) ^ mflags);
415 if ( rc & _PAGE_PRESENT ) goto out;
417 /* Map the l3 table */
418 gw->l3mfn = gfn_to_mfn(d, guest_l4e_get_gfn(gw->l4e), &p2mt);
419 if ( !p2m_is_ram(p2mt) )
420 {
421 rc |= _PAGE_PRESENT;
422 goto out;
423 }
424 ASSERT(mfn_valid(gw->l3mfn));
426 /* Get the l3e and check its flags */
427 l3p = sh_map_domain_page(gw->l3mfn);
428 gw->l3e = l3p[guest_l3_table_offset(va)];
429 gflags = guest_l3e_get_flags(gw->l3e) ^ _PAGE_NX_BIT;
430 rc |= ((gflags & mflags) ^ mflags);
431 if ( rc & _PAGE_PRESENT )
432 goto out;
434 #else /* PAE only... */
436 /* Get the l3e from the cache of the top-level table and check its flags */
437 gw->l3e = v->arch.paging.shadow.gl3e[guest_l3_table_offset(va)];
438 if ( !(guest_l3e_get_flags(gw->l3e) & _PAGE_PRESENT) )
439 {
440 rc |= _PAGE_PRESENT;
441 goto out;
442 }
444 #endif /* PAE or 64... */
446 /* Map the l2 table */
447 gw->l2mfn = gfn_to_mfn(d, guest_l3e_get_gfn(gw->l3e), &p2mt);
448 if ( !p2m_is_ram(p2mt) )
449 {
450 rc |= _PAGE_PRESENT;
451 goto out;
452 }
453 ASSERT(mfn_valid(gw->l2mfn));
455 /* Get the l2e */
456 l2p = sh_map_domain_page(gw->l2mfn);
457 gw->l2e = l2p[guest_l2_table_offset(va)];
459 #else /* 32-bit only... */
461 /* Get l2e from the top level table */
462 gw->l2mfn = pagetable_get_mfn(v->arch.guest_table);
463 l2p = ((guest_l2e_t *)v->arch.paging.shadow.guest_vtable);
464 gw->l2e = l2p[guest_l2_table_offset(va)];
466 #endif /* All levels... */
468 gflags = guest_l2e_get_flags(gw->l2e) ^ _PAGE_NX_BIT;
469 rc |= ((gflags & mflags) ^ mflags);
470 if ( rc & _PAGE_PRESENT )
471 goto out;
473 pse = (guest_supports_superpages(v) &&
474 (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE));
476 if ( pse )
477 {
478 /* Special case: this guest VA is in a PSE superpage, so there's
479 * no guest l1e. We make one up so that the propagation code
480 * can generate a shadow l1 table. Start with the gfn of the
481 * first 4k-page of the superpage. */
482 gfn_t start = guest_l2e_get_gfn(gw->l2e);
483 /* Grant full access in the l1e, since all the guest entry's
484 * access controls are enforced in the shadow l2e. */
485 int flags = (_PAGE_PRESENT|_PAGE_USER|_PAGE_RW|
486 _PAGE_ACCESSED|_PAGE_DIRTY);
487 /* PSE level 2 entries use bit 12 for PAT; propagate it to bit 7
488 * of the level 1. */
489 if ( (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE_PAT) )
490 flags |= _PAGE_PAT;
491 /* Copy the cache-control bits to the l1 as well, because we
492 * can't represent PAT in the (non-PSE) shadow l2e. :(
493 * This could cause problems if a guest ever maps an area of
494 * memory with superpages using more than one caching mode. */
495 flags |= guest_l2e_get_flags(gw->l2e) & (_PAGE_PWT|_PAGE_PCD);
496 /* Increment the pfn by the right number of 4k pages.
497 * The ~0x1 is to mask out the PAT bit mentioned above. */
498 start = _gfn((gfn_x(start) & ~0x1) + guest_l1_table_offset(va));
499 gw->l1e = guest_l1e_from_gfn(start, flags);
500 gw->l1mfn = _mfn(INVALID_MFN);
501 }
502 else
503 {
504 /* Not a superpage: carry on and find the l1e. */
505 gw->l1mfn = gfn_to_mfn(d, guest_l2e_get_gfn(gw->l2e), &p2mt);
506 if ( !p2m_is_ram(p2mt) )
507 {
508 rc |= _PAGE_PRESENT;
509 goto out;
510 }
511 ASSERT(mfn_valid(gw->l1mfn));
512 l1p = sh_map_domain_page(gw->l1mfn);
513 gw->l1e = l1p[guest_l1_table_offset(va)];
514 gflags = guest_l1e_get_flags(gw->l1e) ^ _PAGE_NX_BIT;
515 rc |= ((gflags & mflags) ^ mflags);
516 }
518 /* Go back and set accessed and dirty bits only if the walk was a
519 * success. Although the PRMs say higher-level _PAGE_ACCESSED bits
520 * get set whenever a lower-level PT is used, at least some hardware
521 * walkers behave this way. */
522 if ( rc == 0 )
523 {
524 #if GUEST_PAGING_LEVELS == 4 /* 64-bit only... */
525 if ( set_ad_bits(l4p + guest_l4_table_offset(va), &gw->l4e, 0) )
526 paging_mark_dirty(d, mfn_x(gw->l4mfn));
527 if ( set_ad_bits(l3p + guest_l3_table_offset(va), &gw->l3e, 0) )
528 paging_mark_dirty(d, mfn_x(gw->l3mfn));
529 #endif
530 if ( set_ad_bits(l2p + guest_l2_table_offset(va), &gw->l2e,
531 (pse && (pfec & PFEC_write_access))) )
532 paging_mark_dirty(d, mfn_x(gw->l2mfn));
533 if ( !pse )
534 {
535 if ( set_ad_bits(l1p + guest_l1_table_offset(va), &gw->l1e,
536 (pfec & PFEC_write_access)) )
537 paging_mark_dirty(d, mfn_x(gw->l1mfn));
538 }
539 }
541 out:
542 #if GUEST_PAGING_LEVELS == 4
543 if ( l3p ) sh_unmap_domain_page(l3p);
544 #endif
545 #if GUEST_PAGING_LEVELS >= 3
546 if ( l2p ) sh_unmap_domain_page(l2p);
547 #endif
548 if ( l1p ) sh_unmap_domain_page(l1p);
550 return rc;
551 }
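/* Usage sketch (illustrative, not part of the original file): a caller
 * passes the faulting address and error code, treats a non-zero return as
 * the set of permissions the guest walk failed on, and otherwise translates
 * the result with the helpers defined just below:
 *
 *     walk_t gw;
 *     if ( guest_walk_tables(v, va, &gw, regs->error_code) != 0 )
 *         return 0;       // sketch: hand the fault back to the guest
 *     gfn_t gfn = guest_walk_to_gfn(&gw);
 *     paddr_t gpa = guest_walk_to_gpa(&gw);
 */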
553 /* Given a walk_t, translate the gw->va into the guest's notion of the
554 * corresponding frame number. */
555 static inline gfn_t
556 guest_walk_to_gfn(walk_t *gw)
557 {
558 if ( !(guest_l1e_get_flags(gw->l1e) & _PAGE_PRESENT) )
559 return _gfn(INVALID_GFN);
560 return guest_l1e_get_gfn(gw->l1e);
561 }
563 /* Given a walk_t, translate the gw->va into the guest's notion of the
564 * corresponding physical address. */
565 static inline paddr_t
566 guest_walk_to_gpa(walk_t *gw)
567 {
568 if ( !(guest_l1e_get_flags(gw->l1e) & _PAGE_PRESENT) )
569 return 0;
570 return guest_l1e_get_paddr(gw->l1e) + (gw->va & ~PAGE_MASK);
571 }
573 #if 0 /* Keep for debugging */
574 /* Pretty-print the contents of a guest-walk */
575 static inline void print_gw(walk_t *gw)
576 {
577 SHADOW_PRINTK("GUEST WALK TO %#lx:\n", gw->va);
578 #if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
579 #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
580 SHADOW_PRINTK(" l4mfn=%" PRI_mfn "\n", mfn_x(gw->l4mfn));
581 SHADOW_PRINTK(" l4e=%" SH_PRI_gpte "\n", gw->l4e.l4);
582 SHADOW_PRINTK(" l3mfn=%" PRI_mfn "\n", mfn_x(gw->l3mfn));
583 #endif /* PAE or 64... */
584 SHADOW_PRINTK(" l3e=%" SH_PRI_gpte "\n", gw->l3e.l3);
585 #endif /* All levels... */
586 SHADOW_PRINTK(" l2mfn=%" PRI_mfn "\n", mfn_x(gw->l2mfn));
587 SHADOW_PRINTK(" l2e=%" SH_PRI_gpte "\n", gw->l2e.l2);
588 SHADOW_PRINTK(" l1mfn=%" PRI_mfn "\n", mfn_x(gw->l1mfn));
589 SHADOW_PRINTK(" l1e=%" SH_PRI_gpte "\n", gw->l1e.l1);
590 }
591 #endif /* 0 */
593 #if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES
594 /* Lightweight audit: pass all the shadows associated with this guest walk
595 * through the audit mechanisms */
596 static void sh_audit_gw(struct vcpu *v, walk_t *gw)
597 {
598 mfn_t smfn;
600 if ( !(SHADOW_AUDIT_ENABLE) )
601 return;
603 #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
604 if ( mfn_valid(gw->l4mfn)
605 && mfn_valid((smfn = get_shadow_status(v, gw->l4mfn,
606 SH_type_l4_shadow))) )
607 (void) sh_audit_l4_table(v, smfn, _mfn(INVALID_MFN));
608 if ( mfn_valid(gw->l3mfn)
609 && mfn_valid((smfn = get_shadow_status(v, gw->l3mfn,
610 SH_type_l3_shadow))) )
611 (void) sh_audit_l3_table(v, smfn, _mfn(INVALID_MFN));
612 #endif /* PAE or 64... */
613 if ( mfn_valid(gw->l2mfn) )
614 {
615 if ( mfn_valid((smfn = get_shadow_status(v, gw->l2mfn,
616 SH_type_l2_shadow))) )
617 (void) sh_audit_l2_table(v, smfn, _mfn(INVALID_MFN));
618 #if GUEST_PAGING_LEVELS == 3
619 if ( mfn_valid((smfn = get_shadow_status(v, gw->l2mfn,
620 SH_type_l2h_shadow))) )
621 (void) sh_audit_l2_table(v, smfn, _mfn(INVALID_MFN));
622 #endif
623 }
624 if ( mfn_valid(gw->l1mfn)
625 && mfn_valid((smfn = get_shadow_status(v, gw->l1mfn,
626 SH_type_l1_shadow))) )
627 (void) sh_audit_l1_table(v, smfn, _mfn(INVALID_MFN));
628 else if ( (guest_l2e_get_flags(gw->l2e) & _PAGE_PRESENT)
629 && (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE)
630 && mfn_valid(
631 (smfn = get_fl1_shadow_status(v, guest_l2e_get_gfn(gw->l2e)))) )
632 (void) sh_audit_fl1_table(v, smfn, _mfn(INVALID_MFN));
633 }
635 #else
636 #define sh_audit_gw(_v, _gw) do {} while(0)
637 #endif /* audit code */
640 #if (CONFIG_PAGING_LEVELS == GUEST_PAGING_LEVELS)
641 void *
642 sh_guest_map_l1e(struct vcpu *v, unsigned long addr,
643 unsigned long *gl1mfn)
644 {
645 void *pl1e = NULL;
646 walk_t gw;
648 ASSERT(shadow_mode_translate(v->domain));
650 // XXX -- this is expensive, but it's easy to cobble together...
651 // FIXME!
653 if ( guest_walk_tables(v, addr, &gw, PFEC_page_present) == 0
654 && mfn_valid(gw.l1mfn) )
655 {
656 if ( gl1mfn )
657 *gl1mfn = mfn_x(gw.l1mfn);
658 pl1e = map_domain_page(mfn_x(gw.l1mfn)) +
659 (guest_l1_table_offset(addr) * sizeof(guest_l1e_t));
660 }
662 return pl1e;
663 }
665 void
666 sh_guest_get_eff_l1e(struct vcpu *v, unsigned long addr, void *eff_l1e)
667 {
668 walk_t gw;
670 ASSERT(shadow_mode_translate(v->domain));
672 // XXX -- this is expensive, but it's easy to cobble together...
673 // FIXME!
675 (void) guest_walk_tables(v, addr, &gw, PFEC_page_present);
676 *(guest_l1e_t *)eff_l1e = gw.l1e;
677 }
678 #endif /* CONFIG == GUEST (== SHADOW) */
680 /**************************************************************************/
681 /* Functions to compute the correct index into a shadow page, given an
682 * index into the guest page (as returned by guest_get_index()).
683 * This is trivial when the shadow and guest use the same sized PTEs, but
684 * gets more interesting when those sizes are mismatched (e.g. 32-bit guest,
685 * PAE- or 64-bit shadows).
686 *
687 * These functions also increment the shadow mfn, when necessary. When PTE
688 * sizes are mismatched, it takes 2 shadow L1 pages for a single guest L1
689 * page. In this case, we allocate 2 contiguous pages for the shadow L1, and
690 * use simple pointer arithmetic on a pointer to the guest L1e to figure out
691 * which shadow page we really want. Similarly, when PTE sizes are
692 * mismatched, we shadow a guest L2 page with 4 shadow L2 pages. (The easiest
693 * way to see this is: a 32-bit guest L2 page maps 4GB of virtual address
694 * space, while a PAE- or 64-bit shadow L2 page maps 1GB of virtual address
695 * space.)
696 *
697 * For PAE guests, for every 32-bytes of guest L3 page table, we use 64-bytes
698 * of shadow (to store both the shadow, and the info that would normally be
699 * stored in page_info fields). This arrangement allows the shadow and the
700 * "page_info" fields to always be stored in the same page (in fact, in
701 * the same cache line), avoiding an extra call to map_domain_page().
702 */
704 static inline u32
705 guest_index(void *ptr)
706 {
707 return (u32)((unsigned long)ptr & ~PAGE_MASK) / sizeof(guest_l1e_t);
708 }
710 static u32
711 shadow_l1_index(mfn_t *smfn, u32 guest_index)
712 {
713 #if (GUEST_PAGING_LEVELS == 2)
714 *smfn = _mfn(mfn_x(*smfn) +
715 (guest_index / SHADOW_L1_PAGETABLE_ENTRIES));
716 return (guest_index % SHADOW_L1_PAGETABLE_ENTRIES);
717 #else
718 return guest_index;
719 #endif
720 }
722 static u32
723 shadow_l2_index(mfn_t *smfn, u32 guest_index)
724 {
725 #if (GUEST_PAGING_LEVELS == 2)
726 // Because we use 2 shadow l2 entries for each guest entry, the number of
727 // guest entries per shadow page is SHADOW_L2_PAGETABLE_ENTRIES/2
728 //
729 *smfn = _mfn(mfn_x(*smfn) +
730 (guest_index / (SHADOW_L2_PAGETABLE_ENTRIES / 2)));
732 // We multiply by two to get the index of the first of the two entries
733 // used to shadow the specified guest entry.
734 return (guest_index % (SHADOW_L2_PAGETABLE_ENTRIES / 2)) * 2;
735 #else
736 return guest_index;
737 #endif
738 }
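/* Worked example (illustrative, not part of the original file): with a
 * 32-bit guest on PAE/64-bit shadows, a guest L1 has 1024 entries but a
 * shadow L1 page holds SHADOW_L1_PAGETABLE_ENTRIES == 512, so guest index
 * 700 lands in the second shadow page at slot 700 % 512 == 188.  For L2s,
 * each shadow page covers 512/2 == 256 guest entries (two shadow entries
 * per guest entry), so guest index 700 lands in the third of the four
 * shadow pages, at slot (700 % 256) * 2 == 376. */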
740 #if GUEST_PAGING_LEVELS >= 4
742 static u32
743 shadow_l3_index(mfn_t *smfn, u32 guest_index)
744 {
745 return guest_index;
746 }
748 static u32
749 shadow_l4_index(mfn_t *smfn, u32 guest_index)
750 {
751 return guest_index;
752 }
754 #endif // GUEST_PAGING_LEVELS >= 4
757 /**************************************************************************/
758 /* Function which computes shadow entries from their corresponding guest
759 * entries. This is the "heart" of the shadow code. It operates using
760 * level-1 shadow types, but handles all levels of entry.
761 * Don't call it directly, but use the four wrappers below.
762 */
764 static always_inline void
765 _sh_propagate(struct vcpu *v,
766 guest_intpte_t guest_intpte,
767 mfn_t target_mfn,
768 void *shadow_entry_ptr,
769 int level,
770 fetch_type_t ft,
771 p2m_type_t p2mt)
772 {
773 guest_l1e_t guest_entry = { guest_intpte };
774 shadow_l1e_t *sp = shadow_entry_ptr;
775 struct domain *d = v->domain;
776 gfn_t target_gfn = guest_l1e_get_gfn(guest_entry);
777 u32 pass_thru_flags;
778 u32 gflags, sflags;
780 /* We don't shadow PAE l3s */
781 ASSERT(GUEST_PAGING_LEVELS > 3 || level != 3);
783 /* Check there's something for the shadows to map to */
784 if ( !p2m_is_valid(p2mt) )
785 {
786 *sp = shadow_l1e_empty();
787 goto done;
788 }
790 gflags = guest_l1e_get_flags(guest_entry);
792 if ( unlikely(!(gflags & _PAGE_PRESENT)) )
793 {
794 /* If a guest l1 entry is not present, shadow with the magic
795 * guest-not-present entry. */
796 if ( level == 1 )
797 *sp = sh_l1e_gnp();
798 else
799 *sp = shadow_l1e_empty();
800 goto done;
801 }
803 if ( level == 1 && p2mt == p2m_mmio_dm )
804 {
805 /* Guest l1e maps emulated MMIO space */
806 *sp = sh_l1e_mmio(target_gfn, gflags);
807 if ( !d->arch.paging.shadow.has_fast_mmio_entries )
808 d->arch.paging.shadow.has_fast_mmio_entries = 1;
809 goto done;
810 }
812 // Must have a valid target_mfn unless this is a prefetch or an l1
813 // pointing at MMIO space. In the case of a prefetch, an invalid
814 // mfn means that we can not usefully shadow anything, and so we
815 // return early.
816 //
817 if ( !mfn_valid(target_mfn)
818 && !(level == 1 && (!shadow_mode_refcounts(d)
819 || p2mt == p2m_mmio_direct)) )
820 {
821 ASSERT((ft == ft_prefetch));
822 *sp = shadow_l1e_empty();
823 goto done;
824 }
826 // Propagate bits from the guest to the shadow.
827 // Some of these may be overwritten, below.
828 // Since we know the guest's PRESENT bit is set, we also set the shadow's
829 // SHADOW_PRESENT bit.
830 //
831 pass_thru_flags = (_PAGE_ACCESSED | _PAGE_USER |
832 _PAGE_RW | _PAGE_PRESENT);
833 if ( guest_supports_nx(v) )
834 pass_thru_flags |= _PAGE_NX_BIT;
835 if ( !shadow_mode_refcounts(d) && !mfn_valid(target_mfn) )
836 pass_thru_flags |= _PAGE_PAT | _PAGE_PCD | _PAGE_PWT;
837 sflags = gflags & pass_thru_flags;
839 /*
840 * For HVM domains with direct access to MMIO areas, set the correct
841 * caching attributes in the shadows to match what was asked for.
842 */
843 if ( (level == 1) && is_hvm_domain(d) && has_arch_pdevs(d) &&
844 !is_xen_heap_mfn(mfn_x(target_mfn)) )
845 {
846 unsigned int type;
847 if ( hvm_get_mem_pinned_cacheattr(d, gfn_x(target_gfn), &type) )
848 sflags |= pat_type_2_pte_flags(type);
849 else if ( d->arch.hvm_domain.is_in_uc_mode )
850 sflags |= pat_type_2_pte_flags(PAT_TYPE_UNCACHABLE);
851 else
852 sflags |= get_pat_flags(v,
853 gflags,
854 gfn_to_paddr(target_gfn),
855 ((paddr_t)mfn_x(target_mfn)) << PAGE_SHIFT);
856 }
858 // Set the A&D bits for higher level shadows.
859 // Higher level entries do not, strictly speaking, have dirty bits, but
860 // since we use shadow linear tables, each of these entries may, at some
861 // point in time, also serve as a shadow L1 entry.
862 // By setting both the A&D bits in each of these, we eliminate the burden
863 // on the hardware to update these bits on initial accesses.
864 //
865 if ( (level > 1) && !((SHADOW_PAGING_LEVELS == 3) && (level == 3)) )
866 sflags |= _PAGE_ACCESSED | _PAGE_DIRTY;
868 // If the A or D bit has not yet been set in the guest, then we must
869 // prevent the corresponding kind of access.
870 //
871 if ( unlikely(!(gflags & _PAGE_ACCESSED)) )
872 sflags &= ~_PAGE_PRESENT;
874 /* D bits exist in L1es and PSE L2es */
875 if ( unlikely(((level == 1) ||
876 ((level == 2) &&
877 (gflags & _PAGE_PSE) &&
878 guest_supports_superpages(v)))
879 && !(gflags & _PAGE_DIRTY)) )
880 sflags &= ~_PAGE_RW;
882 // shadow_mode_log_dirty support
883 //
884 // Only allow the guest write access to a page a) on a demand fault,
885 // or b) if the page is already marked as dirty.
886 //
887 // (We handle log-dirty entirely inside the shadow code, without using the
888 // p2m_ram_logdirty p2m type: only HAP uses that.)
889 if ( unlikely((level == 1) && shadow_mode_log_dirty(d)) )
890 {
891 if ( mfn_valid(target_mfn) ) {
892 if ( ft & FETCH_TYPE_WRITE )
893 paging_mark_dirty(d, mfn_x(target_mfn));
894 else if ( !sh_mfn_is_dirty(d, target_mfn) )
895 sflags &= ~_PAGE_RW;
896 }
897 }
899 if ( unlikely((level == 1) && d->dirty_vram
900 && d->dirty_vram->last_dirty == -1
901 && gfn_x(target_gfn) >= d->dirty_vram->begin_pfn
902 && gfn_x(target_gfn) < d->dirty_vram->end_pfn) )
903 {
904 if ( ft & FETCH_TYPE_WRITE )
905 d->dirty_vram->last_dirty = NOW();
906 else
907 sflags &= ~_PAGE_RW;
908 }
910 /* Read-only memory */
911 if ( p2mt == p2m_ram_ro )
912 sflags &= ~_PAGE_RW;
914 // protect guest page tables
915 //
916 if ( unlikely((level == 1)
917 && sh_mfn_is_a_page_table(target_mfn)
918 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC )
919 /* Unless the page is out of sync and the guest is
920 writing to it. */
921 && !(mfn_oos_may_write(target_mfn)
922 && (ft == ft_demand_write))
923 #endif /* OOS */
924 ) )
925 {
926 if ( shadow_mode_trap_reads(d) )
927 {
928 // if we are trapping both reads & writes, then mark this page
929 // as not present...
930 //
931 sflags &= ~_PAGE_PRESENT;
932 }
933 else
934 {
935 // otherwise, just prevent any writes...
936 //
937 sflags &= ~_PAGE_RW;
938 }
939 }
941 // PV guests in 64-bit mode use two different page tables for user vs
942 // supervisor permissions, making the guest's _PAGE_USER bit irrelevant.
943 // It is always shadowed as present...
944 if ( (GUEST_PAGING_LEVELS == 4) && !is_pv_32on64_domain(d)
945 && !is_hvm_domain(d) )
946 {
947 sflags |= _PAGE_USER;
948 }
950 *sp = shadow_l1e_from_mfn(target_mfn, sflags);
952 done:
953 SHADOW_DEBUG(PROPAGATE,
954 "%s level %u guest %" SH_PRI_gpte " shadow %" SH_PRI_pte "\n",
955 fetch_type_names[ft], level, guest_entry.l1, sp->l1);
956 }
959 /* These four wrappers give us a little bit of type-safety back around
960 * the use of void-* pointers and intpte types in _sh_propagate(), and
961 * allow the compiler to optimize out some level checks. */
963 #if GUEST_PAGING_LEVELS >= 4
964 static void
965 l4e_propagate_from_guest(struct vcpu *v,
966 guest_l4e_t gl4e,
967 mfn_t sl3mfn,
968 shadow_l4e_t *sl4e,
969 fetch_type_t ft)
970 {
971 _sh_propagate(v, gl4e.l4, sl3mfn, sl4e, 4, ft, p2m_ram_rw);
972 }
974 static void
975 l3e_propagate_from_guest(struct vcpu *v,
976 guest_l3e_t gl3e,
977 mfn_t sl2mfn,
978 shadow_l3e_t *sl3e,
979 fetch_type_t ft)
980 {
981 _sh_propagate(v, gl3e.l3, sl2mfn, sl3e, 3, ft, p2m_ram_rw);
982 }
983 #endif // GUEST_PAGING_LEVELS >= 4
985 static void
986 l2e_propagate_from_guest(struct vcpu *v,
987 guest_l2e_t gl2e,
988 mfn_t sl1mfn,
989 shadow_l2e_t *sl2e,
990 fetch_type_t ft)
991 {
992 _sh_propagate(v, gl2e.l2, sl1mfn, sl2e, 2, ft, p2m_ram_rw);
993 }
995 static void
996 l1e_propagate_from_guest(struct vcpu *v,
997 guest_l1e_t gl1e,
998 mfn_t gmfn,
999 shadow_l1e_t *sl1e,
1000 fetch_type_t ft,
1001 p2m_type_t p2mt)
1002 {
1003 _sh_propagate(v, gl1e.l1, gmfn, sl1e, 1, ft, p2mt);
1004 }
1007 /**************************************************************************/
1008 /* These functions update shadow entries (and do bookkeeping on the shadow
1009 * tables they are in). It is intended that they are the only
1010 * functions which ever write (non-zero) data onto a shadow page.
1011 */
1013 static inline void safe_write_entry(void *dst, void *src)
1014 /* Copy one PTE safely when processors might be running on the
1015 * destination pagetable. This does *not* give safety against
1016 * concurrent writes (that's what the shadow lock is for), just
1017 * stops the hardware picking up partially written entries. */
1018 {
1019 volatile unsigned long *d = dst;
1020 unsigned long *s = src;
1021 ASSERT(!((unsigned long) d & (sizeof (shadow_l1e_t) - 1)));
1022 #if CONFIG_PAGING_LEVELS == 3
1023 /* In PAE mode, pagetable entries are larger
1024 * than machine words, so won't get written atomically. We need to make
1025 * sure any other cpu running on these shadows doesn't see a
1026 * half-written entry. Do this by marking the entry not-present first,
1027 * then writing the high word before the low word. */
1028 BUILD_BUG_ON(sizeof (shadow_l1e_t) != 2 * sizeof (unsigned long));
1029 d[0] = 0;
1030 d[1] = s[1];
1031 d[0] = s[0];
1032 #else
1033 /* In 64-bit, sizeof(pte) == sizeof(ulong) == 1 word,
1034 * which will be an atomic write, since the entry is aligned. */
1035 BUILD_BUG_ON(sizeof (shadow_l1e_t) != sizeof (unsigned long));
1036 *d = *s;
1037 #endif
1038 }
1041 static inline void
1042 shadow_write_entries(void *d, void *s, int entries, mfn_t mfn)
1043 /* This function does the actual writes to shadow pages.
1044 * It must not be called directly, since it doesn't do the bookkeeping
1045 * that shadow_set_l*e() functions do. */
1046 {
1047 shadow_l1e_t *dst = d;
1048 shadow_l1e_t *src = s;
1049 void *map = NULL;
1050 int i;
1052 /* Because we mirror access rights at all levels in the shadow, an
1053 * l2 (or higher) entry with the RW bit cleared will leave us with
1054 * no write access through the linear map.
1055 * We detect that by writing to the shadow with copy_to_user() and
1056 * using map_domain_page() to get a writeable mapping if we need to. */
1057 if ( __copy_to_user(d, d, sizeof (unsigned long)) != 0 )
1058 {
1059 perfc_incr(shadow_linear_map_failed);
1060 map = sh_map_domain_page(mfn);
1061 ASSERT(map != NULL);
1062 dst = map + ((unsigned long)dst & (PAGE_SIZE - 1));
1063 }
1066 for ( i = 0; i < entries; i++ )
1067 safe_write_entry(dst++, src++);
1069 if ( map != NULL ) sh_unmap_domain_page(map);
1070 }
1072 static inline int
1073 perms_strictly_increased(u32 old_flags, u32 new_flags)
1074 /* Given the flags of two entries, are the new flags a strict
1075 * increase in rights over the old ones? */
1076 {
1077 u32 of = old_flags & (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX);
1078 u32 nf = new_flags & (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX);
1079 /* Flip the NX bit, since it's the only one that decreases rights;
1080 * we calculate as if it were an "X" bit. */
1081 of ^= _PAGE_NX_BIT;
1082 nf ^= _PAGE_NX_BIT;
1083 /* If the changed bits are all set in the new flags, then rights strictly
1084 * increased between old and new. */
1085 return ((of | (of ^ nf)) == nf);
1086 }
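/* Worked example (illustrative, not part of the original file): going from
 * _PAGE_PRESENT|_PAGE_USER to _PAGE_PRESENT|_PAGE_USER|_PAGE_RW only adds
 * bits, so of | (of ^ nf) == nf and the function returns true; dropping
 * _PAGE_USER instead would clear a bit of "of" and the test fails.  The NX
 * flip means that clearing NX (making the mapping executable) also counts
 * as an increase in rights. */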
1088 static int inline
1089 shadow_get_page_from_l1e(shadow_l1e_t sl1e, struct domain *d)
1091 int res;
1092 mfn_t mfn;
1093 struct domain *owner;
1095 ASSERT(!sh_l1e_is_magic(sl1e));
1097 if ( !shadow_mode_refcounts(d) )
1098 return 1;
1100 res = get_page_from_l1e(sl1e, d);
1102 // If a privileged domain is attempting to install a map of a page it does
1103 // not own, we let it succeed anyway.
1104 //
1105 if ( unlikely(!res) &&
1106 !shadow_mode_translate(d) &&
1107 mfn_valid(mfn = shadow_l1e_get_mfn(sl1e)) &&
1108 (owner = page_get_owner(mfn_to_page(mfn))) &&
1109 (d != owner) &&
1110 IS_PRIV_FOR(d, owner))
1112 res = get_page_from_l1e(sl1e, owner);
1113 SHADOW_PRINTK("privileged domain %d installs map of mfn %05lx "
1114 "which is owned by domain %d: %s\n",
1115 d->domain_id, mfn_x(mfn), owner->domain_id,
1116 res ? "success" : "failed");
1119 if ( unlikely(!res) )
1121 perfc_incr(shadow_get_page_fail);
1122 SHADOW_PRINTK("failed: l1e=" SH_PRI_pte "\n");
1125 return res;
1128 static void inline
1129 shadow_put_page_from_l1e(shadow_l1e_t sl1e, struct domain *d)
1131 if ( !shadow_mode_refcounts(d) )
1132 return;
1134 put_page_from_l1e(sl1e, d);
1137 #if GUEST_PAGING_LEVELS >= 4
1138 static int shadow_set_l4e(struct vcpu *v,
1139 shadow_l4e_t *sl4e,
1140 shadow_l4e_t new_sl4e,
1141 mfn_t sl4mfn)
1143 int flags = 0, ok;
1144 shadow_l4e_t old_sl4e;
1145 paddr_t paddr;
1146 ASSERT(sl4e != NULL);
1147 old_sl4e = *sl4e;
1149 if ( old_sl4e.l4 == new_sl4e.l4 ) return 0; /* Nothing to do */
1151 paddr = ((((paddr_t)mfn_x(sl4mfn)) << PAGE_SHIFT)
1152 | (((unsigned long)sl4e) & ~PAGE_MASK));
1154 if ( shadow_l4e_get_flags(new_sl4e) & _PAGE_PRESENT )
1156 /* About to install a new reference */
1157 mfn_t sl3mfn = shadow_l4e_get_mfn(new_sl4e);
1158 ok = sh_get_ref(v, sl3mfn, paddr);
1159 /* Are we pinning l3 shadows to handle weird linux behaviour? */
1160 if ( sh_type_is_pinnable(v, SH_type_l3_64_shadow) )
1161 ok |= sh_pin(v, sl3mfn);
1162 if ( !ok )
1164 domain_crash(v->domain);
1165 return SHADOW_SET_ERROR;
1167 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC )
1168 shadow_resync_all(v, 0);
1169 #endif
1172 /* Write the new entry */
1173 shadow_write_entries(sl4e, &new_sl4e, 1, sl4mfn);
1174 flags |= SHADOW_SET_CHANGED;
1176 if ( shadow_l4e_get_flags(old_sl4e) & _PAGE_PRESENT )
1178 /* We lost a reference to an old mfn. */
1179 mfn_t osl3mfn = shadow_l4e_get_mfn(old_sl4e);
1180 if ( (mfn_x(osl3mfn) != mfn_x(shadow_l4e_get_mfn(new_sl4e)))
1181 || !perms_strictly_increased(shadow_l4e_get_flags(old_sl4e),
1182 shadow_l4e_get_flags(new_sl4e)) )
1184 flags |= SHADOW_SET_FLUSH;
1186 sh_put_ref(v, osl3mfn, paddr);
1188 return flags;
1191 static int shadow_set_l3e(struct vcpu *v,
1192 shadow_l3e_t *sl3e,
1193 shadow_l3e_t new_sl3e,
1194 mfn_t sl3mfn)
1196 int flags = 0;
1197 shadow_l3e_t old_sl3e;
1198 paddr_t paddr;
1199 ASSERT(sl3e != NULL);
1200 old_sl3e = *sl3e;
1202 if ( old_sl3e.l3 == new_sl3e.l3 ) return 0; /* Nothing to do */
1204 paddr = ((((paddr_t)mfn_x(sl3mfn)) << PAGE_SHIFT)
1205 | (((unsigned long)sl3e) & ~PAGE_MASK));
1207 if ( shadow_l3e_get_flags(new_sl3e) & _PAGE_PRESENT )
1209 /* About to install a new reference */
1210 if ( !sh_get_ref(v, shadow_l3e_get_mfn(new_sl3e), paddr) )
1212 domain_crash(v->domain);
1213 return SHADOW_SET_ERROR;
1215 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC )
1216 shadow_resync_all(v, 0);
1217 #endif
1220 /* Write the new entry */
1221 shadow_write_entries(sl3e, &new_sl3e, 1, sl3mfn);
1222 flags |= SHADOW_SET_CHANGED;
1224 if ( shadow_l3e_get_flags(old_sl3e) & _PAGE_PRESENT )
1226 /* We lost a reference to an old mfn. */
1227 mfn_t osl2mfn = shadow_l3e_get_mfn(old_sl3e);
1228 if ( (mfn_x(osl2mfn) != mfn_x(shadow_l3e_get_mfn(new_sl3e))) ||
1229 !perms_strictly_increased(shadow_l3e_get_flags(old_sl3e),
1230 shadow_l3e_get_flags(new_sl3e)) )
1232 flags |= SHADOW_SET_FLUSH;
1234 sh_put_ref(v, osl2mfn, paddr);
1236 return flags;
1238 #endif /* GUEST_PAGING_LEVELS >= 4 */
1240 static int shadow_set_l2e(struct vcpu *v,
1241 shadow_l2e_t *sl2e,
1242 shadow_l2e_t new_sl2e,
1243 mfn_t sl2mfn)
1245 int flags = 0;
1246 shadow_l2e_t old_sl2e;
1247 paddr_t paddr;
1249 #if GUEST_PAGING_LEVELS == 2
1250 /* In 2-on-3 we work with pairs of l2es pointing at two-page
1251 * shadows. Reference counting and up-pointers track from the first
1252 * page of the shadow to the first l2e, so make sure that we're
1253 * working with those:
1254 * Align the pointer down so it's pointing at the first of the pair */
1255 sl2e = (shadow_l2e_t *)((unsigned long)sl2e & ~(sizeof(shadow_l2e_t)));
1256 /* Align the mfn of the shadow entry too */
1257 new_sl2e.l2 &= ~(1<<PAGE_SHIFT);
1258 #endif
1260 ASSERT(sl2e != NULL);
1261 old_sl2e = *sl2e;
1263 if ( old_sl2e.l2 == new_sl2e.l2 ) return 0; /* Nothing to do */
1265 paddr = ((((paddr_t)mfn_x(sl2mfn)) << PAGE_SHIFT)
1266 | (((unsigned long)sl2e) & ~PAGE_MASK));
1268 if ( shadow_l2e_get_flags(new_sl2e) & _PAGE_PRESENT )
1270 mfn_t sl1mfn = shadow_l2e_get_mfn(new_sl2e);
1272 /* About to install a new reference */
1273 if ( !sh_get_ref(v, sl1mfn, paddr) )
1275 domain_crash(v->domain);
1276 return SHADOW_SET_ERROR;
1278 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
1280 struct shadow_page_info *sp = mfn_to_shadow_page(sl1mfn);
1281 mfn_t gl1mfn = _mfn(sp->backpointer);
1283 /* If the shadow is a fl1 then the backpointer contains
1284 the GFN instead of the GMFN, and it's definitely not
1285 OOS. */
1286 if ( (sp->type != SH_type_fl1_shadow) && mfn_valid(gl1mfn)
1287 && mfn_is_out_of_sync(gl1mfn) )
1288 sh_resync(v, gl1mfn);
1290 #endif
1293 /* Write the new entry */
1294 #if GUEST_PAGING_LEVELS == 2
1296 shadow_l2e_t pair[2] = { new_sl2e, new_sl2e };
1297 /* The l1 shadow is two pages long and needs to be pointed to by
1298 * two adjacent l2es. The pair have the same flags, but point
1299 * at odd and even MFNs */
1300 ASSERT(!(pair[0].l2 & (1<<PAGE_SHIFT)));
1301 pair[1].l2 |= (1<<PAGE_SHIFT);
1302 shadow_write_entries(sl2e, &pair, 2, sl2mfn);
1304 #else /* normal case */
1305 shadow_write_entries(sl2e, &new_sl2e, 1, sl2mfn);
1306 #endif
1307 flags |= SHADOW_SET_CHANGED;
1309 if ( shadow_l2e_get_flags(old_sl2e) & _PAGE_PRESENT )
1311 /* We lost a reference to an old mfn. */
1312 mfn_t osl1mfn = shadow_l2e_get_mfn(old_sl2e);
1313 if ( (mfn_x(osl1mfn) != mfn_x(shadow_l2e_get_mfn(new_sl2e))) ||
1314 !perms_strictly_increased(shadow_l2e_get_flags(old_sl2e),
1315 shadow_l2e_get_flags(new_sl2e)) )
1317 flags |= SHADOW_SET_FLUSH;
1319 sh_put_ref(v, osl1mfn, paddr);
1321 return flags;
1324 static inline void shadow_vram_get_l1e(shadow_l1e_t new_sl1e,
1325 shadow_l1e_t *sl1e,
1326 mfn_t sl1mfn,
1327 struct domain *d)
1329 mfn_t mfn;
1330 unsigned long gfn;
1332 if ( !d->dirty_vram ) return;
1334 mfn = shadow_l1e_get_mfn(new_sl1e);
1336 if ( !mfn_valid(mfn) ) return; /* m2p for mmio_direct may not exist */
1338 gfn = mfn_to_gfn(d, mfn);
1340 if ( (gfn >= d->dirty_vram->begin_pfn) && (gfn < d->dirty_vram->end_pfn) ) {
1341 unsigned long i = gfn - d->dirty_vram->begin_pfn;
1342 struct page_info *page = mfn_to_page(mfn);
1343 u32 count_info = page->u.inuse.type_info & PGT_count_mask;
1345 if ( count_info == 1 )
1346 /* Initial guest reference, record it */
1347 d->dirty_vram->sl1ma[i] = pfn_to_paddr(mfn_x(sl1mfn))
1348 | ((unsigned long)sl1e & ~PAGE_MASK);
1352 static inline void shadow_vram_put_l1e(shadow_l1e_t old_sl1e,
1353 shadow_l1e_t *sl1e,
1354 mfn_t sl1mfn,
1355 struct domain *d)
1357 mfn_t mfn;
1358 unsigned long gfn;
1360 if ( !d->dirty_vram ) return;
1362 mfn = shadow_l1e_get_mfn(old_sl1e);
1364 if ( !mfn_valid(mfn) ) return;
1366 gfn = mfn_to_gfn(d, mfn);
1368 if ( (gfn >= d->dirty_vram->begin_pfn) && (gfn < d->dirty_vram->end_pfn) ) {
1369 unsigned long i = gfn - d->dirty_vram->begin_pfn;
1370 struct page_info *page = mfn_to_page(mfn);
1371 u32 count_info = page->u.inuse.type_info & PGT_count_mask;
1372 int dirty = 0;
1373 paddr_t sl1ma = pfn_to_paddr(mfn_x(sl1mfn))
1374 | ((unsigned long)sl1e & ~PAGE_MASK);
1376 if ( count_info == 1 ) {
1377 /* Last reference */
1378 if ( d->dirty_vram->sl1ma[i] == INVALID_PADDR ) {
1379 /* We didn't know it was that one, let's say it is dirty */
1380 dirty = 1;
1381 } else {
1382 ASSERT(d->dirty_vram->sl1ma[i] == sl1ma);
1383 d->dirty_vram->sl1ma[i] = INVALID_PADDR;
1384 if ( shadow_l1e_get_flags(old_sl1e) & _PAGE_DIRTY )
1385 dirty = 1;
1387 } else {
1388 /* We had more than one reference, just consider the page dirty. */
1389 dirty = 1;
1390 /* Check that it's not the one we recorded. */
1391 if ( d->dirty_vram->sl1ma[i] == sl1ma ) {
1392 /* Too bad, we remembered the wrong one... */
1393 d->dirty_vram->sl1ma[i] = INVALID_PADDR;
1394 } else {
1395 /* Ok, our recorded sl1e is still pointing to this page, let's
1396 * just hope it will remain. */
1399 if ( dirty ) {
1400 d->dirty_vram->dirty_bitmap[i / 8] |= 1 << (i % 8);
1401 d->dirty_vram->last_dirty = NOW();
1406 static int shadow_set_l1e(struct vcpu *v,
1407 shadow_l1e_t *sl1e,
1408 shadow_l1e_t new_sl1e,
1409 mfn_t sl1mfn)
1411 int flags = 0;
1412 struct domain *d = v->domain;
1413 shadow_l1e_t old_sl1e;
1414 #if SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC
1415 mfn_t new_gmfn = shadow_l1e_get_mfn(new_sl1e);
1416 #endif
1417 ASSERT(sl1e != NULL);
1419 old_sl1e = *sl1e;
1421 if ( old_sl1e.l1 == new_sl1e.l1 ) return 0; /* Nothing to do */
1423 if ( (shadow_l1e_get_flags(new_sl1e) & _PAGE_PRESENT)
1424 && !sh_l1e_is_magic(new_sl1e) )
1426 /* About to install a new reference */
1427 if ( shadow_mode_refcounts(d) ) {
1428 if ( shadow_get_page_from_l1e(new_sl1e, d) == 0 )
1430 /* Doesn't look like a pagetable. */
1431 flags |= SHADOW_SET_ERROR;
1432 new_sl1e = shadow_l1e_empty();
1434 else
1436 shadow_vram_get_l1e(new_sl1e, sl1e, sl1mfn, d);
1437 #if SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC
1438 if ( mfn_valid(new_gmfn) && mfn_oos_may_write(new_gmfn)
1439 && (shadow_l1e_get_flags(new_sl1e) & _PAGE_RW) )
1441 oos_fixup_add(v, new_gmfn, sl1mfn, pgentry_ptr_to_slot(sl1e));
1443 #endif
1449 /* Write the new entry */
1450 shadow_write_entries(sl1e, &new_sl1e, 1, sl1mfn);
1451 flags |= SHADOW_SET_CHANGED;
1453 if ( (shadow_l1e_get_flags(old_sl1e) & _PAGE_PRESENT)
1454 && !sh_l1e_is_magic(old_sl1e) )
1456 /* We lost a reference to an old mfn. */
1457 /* N.B. Unlike higher-level sets, never need an extra flush
1458 * when writing an l1e. Because it points to the same guest frame
1459 * as the guest l1e did, it's the guest's responsibility to
1460 * trigger a flush later. */
1461 if ( shadow_mode_refcounts(d) )
1463 shadow_vram_put_l1e(old_sl1e, sl1e, sl1mfn, d);
1464 shadow_put_page_from_l1e(old_sl1e, d);
1467 return flags;
1471 /**************************************************************************/
1472 /* Macros to walk pagetables. These take the shadow of a pagetable and
1473 * walk every "interesting" entry. That is, they don't touch Xen mappings,
1474 * and for 32-bit l2s shadowed onto PAE or 64-bit, they only touch every
1475 * second entry (since pairs of entries are managed together). For multi-page
1476 * shadows they walk all pages.
1478 * Arguments are an MFN, the variable to point to each entry, a variable
1479 * to indicate that we are done (we will shortcut to the end of the scan
1480 * when _done != 0), a variable to indicate that we should avoid Xen mappings,
1481 * and the code.
1483 * WARNING: These macros have side-effects. They change the values of both
1484 * the pointer and the MFN. */
1486 static inline void increment_ptr_to_guest_entry(void *ptr)
1488 if ( ptr )
1490 guest_l1e_t **entry = ptr;
1491 (*entry)++;
1495 /* All kinds of l1: touch all entries */
1496 #define _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code) \
1497 do { \
1498 int _i; \
1499 shadow_l1e_t *_sp = sh_map_domain_page((_sl1mfn)); \
1500 ASSERT(mfn_to_shadow_page(_sl1mfn)->type == SH_type_l1_shadow \
1501 || mfn_to_shadow_page(_sl1mfn)->type == SH_type_fl1_shadow); \
1502 for ( _i = 0; _i < SHADOW_L1_PAGETABLE_ENTRIES; _i++ ) \
1503 { \
1504 (_sl1e) = _sp + _i; \
1505 if ( shadow_l1e_get_flags(*(_sl1e)) & _PAGE_PRESENT ) \
1506 {_code} \
1507 if ( _done ) break; \
1508 increment_ptr_to_guest_entry(_gl1p); \
1509 } \
1510 sh_unmap_domain_page(_sp); \
1511 } while (0)
1513 /* 32-bit l1, on PAE or 64-bit shadows: need to walk both pages of shadow */
1514 #if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2
1515 #define SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code) \
1516 do { \
1517 int __done = 0; \
1518 _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, \
1519 ({ (__done = _done); }), _code); \
1520 _sl1mfn = _mfn(mfn_x(_sl1mfn) + 1); \
1521 if ( !__done ) \
1522 _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, \
1523 ({ (__done = _done); }), _code); \
1524 } while (0)
1525 #else /* Everything else; l1 shadows are only one page */
1526 #define SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code) \
1527 _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code)
1528 #endif
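/* Usage sketch (illustrative, not part of the original file): the walkers
 * take a code block to run on every present entry, e.g. blanking out a
 * whole l1 shadow:
 *
 *     shadow_l1e_t *sl1e;
 *     int done = 0;
 *     SHADOW_FOREACH_L1E(sl1mfn, sl1e, NULL, done,
 *     {
 *         (void) shadow_set_l1e(v, sl1e, shadow_l1e_empty(), sl1mfn);
 *     });
 */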
1531 #if GUEST_PAGING_LEVELS == 2
1533 /* 32-bit l2 on PAE/64: four pages, touch every second entry, and avoid Xen */
1534 #define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _dom, _code) \
1535 do { \
1536 int _i, _j, __done = 0; \
1537 int _xen = !shadow_mode_external(_dom); \
1538 ASSERT(mfn_to_shadow_page(_sl2mfn)->type == SH_type_l2_32_shadow); \
1539 for ( _j = 0; _j < 4 && !__done; _j++ ) \
1540 { \
1541 shadow_l2e_t *_sp = sh_map_domain_page(_sl2mfn); \
1542 for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i += 2 ) \
1543 if ( (!(_xen)) \
1544 || ((_j * SHADOW_L2_PAGETABLE_ENTRIES) + _i) \
1545 < (HYPERVISOR_VIRT_START >> SHADOW_L2_PAGETABLE_SHIFT) ) \
1546 { \
1547 (_sl2e) = _sp + _i; \
1548 if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \
1549 {_code} \
1550 if ( (__done = (_done)) ) break; \
1551 increment_ptr_to_guest_entry(_gl2p); \
1552 } \
1553 sh_unmap_domain_page(_sp); \
1554 _sl2mfn = _mfn(mfn_x(_sl2mfn) + 1); \
1555 } \
1556 } while (0)
1558 #elif GUEST_PAGING_LEVELS == 3
1560 /* PAE: if it's an l2h, don't touch Xen mappings */
1561 #define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _dom, _code) \
1562 do { \
1563 int _i; \
1564 int _xen = !shadow_mode_external(_dom); \
1565 shadow_l2e_t *_sp = sh_map_domain_page((_sl2mfn)); \
1566 ASSERT(mfn_to_shadow_page(_sl2mfn)->type == SH_type_l2_pae_shadow \
1567 || mfn_to_shadow_page(_sl2mfn)->type == SH_type_l2h_pae_shadow);\
1568 for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i++ ) \
1569 if ( (!(_xen)) \
1570 || mfn_to_shadow_page(_sl2mfn)->type != SH_type_l2h_pae_shadow\
1571 || ((_i + (3 * SHADOW_L2_PAGETABLE_ENTRIES)) \
1572 < (HYPERVISOR_VIRT_START >> SHADOW_L2_PAGETABLE_SHIFT)) ) \
1573 { \
1574 (_sl2e) = _sp + _i; \
1575 if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \
1576 {_code} \
1577 if ( _done ) break; \
1578 increment_ptr_to_guest_entry(_gl2p); \
1579 } \
1580 sh_unmap_domain_page(_sp); \
1581 } while (0)
1583 #else
1585 /* 64-bit l2: touch all entries except for PAE compat guests. */
1586 #define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _dom, _code) \
1587 do { \
1588 int _i; \
1589 int _xen = !shadow_mode_external(_dom); \
1590 shadow_l2e_t *_sp = sh_map_domain_page((_sl2mfn)); \
1591 ASSERT(mfn_to_shadow_page(_sl2mfn)->type == SH_type_l2_64_shadow || \
1592 mfn_to_shadow_page(_sl2mfn)->type == SH_type_l2h_64_shadow); \
1593 for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i++ ) \
1594 { \
1595 if ( (!(_xen)) \
1596 || !is_pv_32on64_domain(_dom) \
1597 || mfn_to_shadow_page(_sl2mfn)->type != SH_type_l2h_64_shadow \
1598 || (_i < COMPAT_L2_PAGETABLE_FIRST_XEN_SLOT(_dom)) ) \
1599 { \
1600 (_sl2e) = _sp + _i; \
1601 if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \
1602 {_code} \
1603 if ( _done ) break; \
1604 increment_ptr_to_guest_entry(_gl2p); \
1605 } \
1606 } \
1607 sh_unmap_domain_page(_sp); \
1608 } while (0)
1610 #endif /* different kinds of l2 */
1612 #if GUEST_PAGING_LEVELS == 4
1614 /* 64-bit l3: touch all entries */
1615 #define SHADOW_FOREACH_L3E(_sl3mfn, _sl3e, _gl3p, _done, _code) \
1616 do { \
1617 int _i; \
1618 shadow_l3e_t *_sp = sh_map_domain_page((_sl3mfn)); \
1619 ASSERT(mfn_to_shadow_page(_sl3mfn)->type == SH_type_l3_64_shadow); \
1620 for ( _i = 0; _i < SHADOW_L3_PAGETABLE_ENTRIES; _i++ ) \
1621 { \
1622 (_sl3e) = _sp + _i; \
1623 if ( shadow_l3e_get_flags(*(_sl3e)) & _PAGE_PRESENT ) \
1624 {_code} \
1625 if ( _done ) break; \
1626 increment_ptr_to_guest_entry(_gl3p); \
1627 } \
1628 sh_unmap_domain_page(_sp); \
1629 } while (0)
1631 /* 64-bit l4: avoid Xen mappings */
1632 #define SHADOW_FOREACH_L4E(_sl4mfn, _sl4e, _gl4p, _done, _dom, _code) \
1633 do { \
1634 shadow_l4e_t *_sp = sh_map_domain_page((_sl4mfn)); \
1635 int _xen = !shadow_mode_external(_dom); \
1636 int _i; \
1637 ASSERT(mfn_to_shadow_page(_sl4mfn)->type == SH_type_l4_64_shadow); \
1638 for ( _i = 0; _i < SHADOW_L4_PAGETABLE_ENTRIES; _i++ ) \
1639 { \
1640 if ( (!(_xen)) || is_guest_l4_slot(_dom, _i) ) \
1641 { \
1642 (_sl4e) = _sp + _i; \
1643 if ( shadow_l4e_get_flags(*(_sl4e)) & _PAGE_PRESENT ) \
1644 {_code} \
1645 if ( _done ) break; \
1646 } \
1647 increment_ptr_to_guest_entry(_gl4p); \
1648 } \
1649 sh_unmap_domain_page(_sp); \
1650 } while (0)
1652 #endif
1656 /**************************************************************************/
1657 /* Functions to install Xen mappings and linear mappings in shadow pages */
1659 // XXX -- this function should probably be moved to shadow-common.c, but that
1660 // probably wants to wait until the shadow types have been moved from
1661 // shadow-types.h to shadow-private.h
1662 //
1663 #if CONFIG_PAGING_LEVELS == 4 && GUEST_PAGING_LEVELS == 4
1664 void sh_install_xen_entries_in_l4(struct vcpu *v, mfn_t gl4mfn, mfn_t sl4mfn)
1666 struct domain *d = v->domain;
1667 shadow_l4e_t *sl4e;
1669 sl4e = sh_map_domain_page(sl4mfn);
1670 ASSERT(sl4e != NULL);
1671 ASSERT(sizeof (l4_pgentry_t) == sizeof (shadow_l4e_t));
1673 /* Copy the common Xen mappings from the idle domain */
1674 memcpy(&sl4e[ROOT_PAGETABLE_FIRST_XEN_SLOT],
1675 &idle_pg_table[ROOT_PAGETABLE_FIRST_XEN_SLOT],
1676 ROOT_PAGETABLE_XEN_SLOTS * sizeof(l4_pgentry_t));
1678 /* Install the per-domain mappings for this domain */
1679 sl4e[shadow_l4_table_offset(PERDOMAIN_VIRT_START)] =
1680 shadow_l4e_from_mfn(page_to_mfn(virt_to_page(d->arch.mm_perdomain_l3)),
1681 __PAGE_HYPERVISOR);
1683 /* Shadow linear mapping for 4-level shadows. N.B. for 3-level
1684 * shadows on 64-bit xen, this linear mapping is later replaced by the
1685 * monitor pagetable structure, which is built in make_monitor_table
1686 * and maintained by sh_update_linear_entries. */
1687 sl4e[shadow_l4_table_offset(SH_LINEAR_PT_VIRT_START)] =
1688 shadow_l4e_from_mfn(sl4mfn, __PAGE_HYPERVISOR);
1690 /* Self linear mapping. */
1691 if ( shadow_mode_translate(v->domain) && !shadow_mode_external(v->domain) )
1693 // linear tables may not be used with translated PV guests
1694 sl4e[shadow_l4_table_offset(LINEAR_PT_VIRT_START)] =
1695 shadow_l4e_empty();
1697 else
1699 sl4e[shadow_l4_table_offset(LINEAR_PT_VIRT_START)] =
1700 shadow_l4e_from_mfn(gl4mfn, __PAGE_HYPERVISOR);
1703 if ( shadow_mode_translate(v->domain) )
1705 /* install domain-specific P2M table */
1706 sl4e[shadow_l4_table_offset(RO_MPT_VIRT_START)] =
1707 shadow_l4e_from_mfn(pagetable_get_mfn(d->arch.phys_table),
1708 __PAGE_HYPERVISOR);
1711 sh_unmap_domain_page(sl4e);
1713 #endif
1715 #if CONFIG_PAGING_LEVELS >= 3 && GUEST_PAGING_LEVELS >= 3
1716 // For 3-on-3 PV guests, we need to make sure the xen mappings are in
1717 // place, which means that we need to populate the l2h entry in the l3
1718 // table.
1720 static void sh_install_xen_entries_in_l2h(struct vcpu *v, mfn_t sl2hmfn)
1722 struct domain *d = v->domain;
1723 shadow_l2e_t *sl2e;
1724 #if CONFIG_PAGING_LEVELS == 3
1725 int i;
1726 #else
1728 if ( !is_pv_32on64_vcpu(v) )
1729 return;
1730 #endif
1732 sl2e = sh_map_domain_page(sl2hmfn);
1733 ASSERT(sl2e != NULL);
1734 ASSERT(sizeof (l2_pgentry_t) == sizeof (shadow_l2e_t));
1736 #if CONFIG_PAGING_LEVELS == 3
1738 /* Copy the common Xen mappings from the idle domain */
1739 memcpy(&sl2e[L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1)],
1740 &idle_pg_table_l2[L2_PAGETABLE_FIRST_XEN_SLOT],
1741 L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
1743 /* Install the per-domain mappings for this domain */
1744 for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
1745 sl2e[shadow_l2_table_offset(PERDOMAIN_VIRT_START) + i] =
1746 shadow_l2e_from_mfn(
1747 page_to_mfn(virt_to_page(d->arch.mm_perdomain_pt) + i),
1748 __PAGE_HYPERVISOR);
1750 /* We don't set up a linear mapping here because we can't until this
1751 * l2h is installed in an l3e. sh_update_linear_entries() handles
1752 * the linear mappings when CR3 (and so the fourth l3e) is loaded.
1753 * We zero them here, just as a safety measure.
1754 */
1755 for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
1756 sl2e[shadow_l2_table_offset(LINEAR_PT_VIRT_START) + i] =
1757 shadow_l2e_empty();
1758 for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
1759 sl2e[shadow_l2_table_offset(SH_LINEAR_PT_VIRT_START) + i] =
1760 shadow_l2e_empty();
1762 if ( shadow_mode_translate(d) )
1764 /* Install the domain-specific p2m table */
1765 l3_pgentry_t *p2m;
1766 ASSERT(pagetable_get_pfn(d->arch.phys_table) != 0);
1767 p2m = sh_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
1768 for ( i = 0; i < MACHPHYS_MBYTES>>1; i++ )
1770 sl2e[shadow_l2_table_offset(RO_MPT_VIRT_START) + i] =
1771 (l3e_get_flags(p2m[i]) & _PAGE_PRESENT)
1772 ? shadow_l2e_from_mfn(_mfn(l3e_get_pfn(p2m[i])),
1773 __PAGE_HYPERVISOR)
1774 : shadow_l2e_empty();
1776 sh_unmap_domain_page(p2m);
1779 #else
1781 /* Copy the common Xen mappings from the idle domain */
1782 memcpy(
1783 &sl2e[COMPAT_L2_PAGETABLE_FIRST_XEN_SLOT(d)],
1784 &compat_idle_pg_table_l2[l2_table_offset(HIRO_COMPAT_MPT_VIRT_START)],
1785 COMPAT_L2_PAGETABLE_XEN_SLOTS(d) * sizeof(*sl2e));
1787 #endif
1789 sh_unmap_domain_page(sl2e);
1791 #endif
1797 /**************************************************************************/
1798 /* Create a shadow of a given guest page.
1799 */
1800 static mfn_t
1801 sh_make_shadow(struct vcpu *v, mfn_t gmfn, u32 shadow_type)
1803 mfn_t smfn = shadow_alloc(v->domain, shadow_type, mfn_x(gmfn));
1804 SHADOW_DEBUG(MAKE_SHADOW, "(%05lx, %u)=>%05lx\n",
1805 mfn_x(gmfn), shadow_type, mfn_x(smfn));
1807 if ( shadow_type != SH_type_l2_32_shadow
1808 && shadow_type != SH_type_l2_pae_shadow
1809 && shadow_type != SH_type_l2h_pae_shadow
1810 && shadow_type != SH_type_l4_64_shadow )
1811 /* Lower-level shadow, not yet linked from a higher level */
1812 mfn_to_shadow_page(smfn)->up = 0;
1814 #if GUEST_PAGING_LEVELS == 4
1815 #if (SHADOW_OPTIMIZATIONS & SHOPT_LINUX_L3_TOPLEVEL)
1816 if ( shadow_type == SH_type_l4_64_shadow &&
1817 unlikely(v->domain->arch.paging.shadow.opt_flags & SHOPT_LINUX_L3_TOPLEVEL) )
1819 /* We're shadowing a new l4, but we've been assuming the guest uses
1820 * only one l4 per vcpu and context switches using an l4 entry.
1821 * Count the number of active l4 shadows. If there are enough
1822 * of them, decide that this isn't an old linux guest, and stop
1823 * pinning l3es. This is not very quick but it doesn't happen
1824 * very often. */
1825 struct list_head *l, *t;
1826 struct shadow_page_info *sp;
1827 struct vcpu *v2;
1828 int l4count = 0, vcpus = 0;
1829 list_for_each(l, &v->domain->arch.paging.shadow.pinned_shadows)
1831 sp = list_entry(l, struct shadow_page_info, list);
1832 if ( sp->type == SH_type_l4_64_shadow )
1833 l4count++;
1835 for_each_vcpu ( v->domain, v2 )
1836 vcpus++;
1837 if ( l4count > 2 * vcpus )
1839 /* Unpin all the pinned l3 tables, and don't pin any more. */
1840 list_for_each_safe(l, t, &v->domain->arch.paging.shadow.pinned_shadows)
1842 sp = list_entry(l, struct shadow_page_info, list);
1843 if ( sp->type == SH_type_l3_64_shadow )
1844 sh_unpin(v, shadow_page_to_mfn(sp));
1846 v->domain->arch.paging.shadow.opt_flags &= ~SHOPT_LINUX_L3_TOPLEVEL;
1849 #endif
1850 #endif
1852 // Create the Xen mappings...
1853 if ( !shadow_mode_external(v->domain) )
1855 switch (shadow_type)
1857 #if CONFIG_PAGING_LEVELS == 4 && GUEST_PAGING_LEVELS == 4
1858 case SH_type_l4_shadow:
1859 sh_install_xen_entries_in_l4(v, gmfn, smfn); break;
1860 #endif
1861 #if CONFIG_PAGING_LEVELS >= 3 && GUEST_PAGING_LEVELS >= 3
1862 case SH_type_l2h_shadow:
1863 sh_install_xen_entries_in_l2h(v, smfn); break;
1864 #endif
1865 default: /* Do nothing */ break;
1869 shadow_promote(v, gmfn, shadow_type);
1870 set_shadow_status(v, gmfn, shadow_type, smfn);
1872 return smfn;
1875 /* Make a splintered superpage shadow */
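/* (An "fl1" shadow has no backing guest l1 table: it holds the 4k shadow
 * entries that a guest superpage mapping is splintered into, and so it is
 * keyed by the superpage's gfn rather than by a guest mfn -- see the
 * set_fl1_shadow_status() call below.) */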
1876 static mfn_t
1877 make_fl1_shadow(struct vcpu *v, gfn_t gfn)
1879 mfn_t smfn = shadow_alloc(v->domain, SH_type_fl1_shadow,
1880 (unsigned long) gfn_x(gfn));
1882 SHADOW_DEBUG(MAKE_SHADOW, "(%" SH_PRI_gfn ")=>%" PRI_mfn "\n",
1883 gfn_x(gfn), mfn_x(smfn));
1885 set_fl1_shadow_status(v, gfn, smfn);
1886 return smfn;
1890 #if SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS
1891 mfn_t
1892 sh_make_monitor_table(struct vcpu *v)
1894 struct domain *d = v->domain;
1896 ASSERT(pagetable_get_pfn(v->arch.monitor_table) == 0);
1898 /* Guarantee we can get the memory we need */
1899 shadow_prealloc(d, SH_type_monitor_table, CONFIG_PAGING_LEVELS);
1901 #if CONFIG_PAGING_LEVELS == 4
1903 mfn_t m4mfn;
1904 m4mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1905 sh_install_xen_entries_in_l4(v, m4mfn, m4mfn);
1906 /* Remember the level of this table */
1907 mfn_to_page(m4mfn)->shadow_flags = 4;
1908 #if SHADOW_PAGING_LEVELS < 4
1910 mfn_t m3mfn, m2mfn;
1911 l4_pgentry_t *l4e;
1912 l3_pgentry_t *l3e;
1913 /* Install an l3 table and an l2 table that will hold the shadow
1914 * linear map entries. This overrides the linear map entry that
1915 * was installed by sh_install_xen_entries_in_l4. */
1916 l4e = sh_map_domain_page(m4mfn);
1918 m3mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1919 mfn_to_page(m3mfn)->shadow_flags = 3;
1920 l4e[shadow_l4_table_offset(SH_LINEAR_PT_VIRT_START)]
1921 = l4e_from_pfn(mfn_x(m3mfn), __PAGE_HYPERVISOR);
1923 m2mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1924 mfn_to_page(m2mfn)->shadow_flags = 2;
1925 l3e = sh_map_domain_page(m3mfn);
1926 l3e[0] = l3e_from_pfn(mfn_x(m2mfn), __PAGE_HYPERVISOR);
1927 sh_unmap_domain_page(l3e);
1929 if ( is_pv_32on64_vcpu(v) )
1931 /* For 32-on-64 PV guests, we need to map the 32-bit Xen
1932 * area into its usual VAs in the monitor tables */
1933 m3mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1934 mfn_to_page(m3mfn)->shadow_flags = 3;
1935 l4e[0] = l4e_from_pfn(mfn_x(m3mfn), __PAGE_HYPERVISOR);
1937 m2mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1938 mfn_to_page(m2mfn)->shadow_flags = 2;
1939 l3e = sh_map_domain_page(m3mfn);
1940 l3e[3] = l3e_from_pfn(mfn_x(m2mfn), _PAGE_PRESENT);
1941 sh_install_xen_entries_in_l2h(v, m2mfn);
1942 sh_unmap_domain_page(l3e);
1945 sh_unmap_domain_page(l4e);
1947 #endif /* SHADOW_PAGING_LEVELS < 4 */
1948 return m4mfn;
1951 #elif CONFIG_PAGING_LEVELS == 3
1954 mfn_t m3mfn, m2mfn;
1955 l3_pgentry_t *l3e;
1956 l2_pgentry_t *l2e;
1957 int i;
1959 m3mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1960 /* Remember the level of this table */
1961 mfn_to_page(m3mfn)->shadow_flags = 3;
1963 // Install a monitor l2 table in slot 3 of the l3 table.
1964 // This is used for all Xen entries, including linear maps
1965 m2mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1966 mfn_to_page(m2mfn)->shadow_flags = 2;
1967 l3e = sh_map_domain_page(m3mfn);
1968 l3e[3] = l3e_from_pfn(mfn_x(m2mfn), _PAGE_PRESENT);
1969 sh_install_xen_entries_in_l2h(v, m2mfn);
1970 /* Install the monitor's own linear map */
1971 l2e = sh_map_domain_page(m2mfn);
1972 for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
1973 l2e[l2_table_offset(LINEAR_PT_VIRT_START) + i] =
1974 (l3e_get_flags(l3e[i]) & _PAGE_PRESENT)
1975 ? l2e_from_pfn(l3e_get_pfn(l3e[i]), __PAGE_HYPERVISOR)
1976 : l2e_empty();
1977 sh_unmap_domain_page(l2e);
1978 sh_unmap_domain_page(l3e);
1980 SHADOW_PRINTK("new monitor table: %#lx\n", mfn_x(m3mfn));
1981 return m3mfn;
1984 #else
1985 #error this should not happen
1986 #endif /* CONFIG_PAGING_LEVELS */
1988 #endif /* SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS */
1990 /**************************************************************************/
1991 /* These functions also take a virtual address and return the level-N
1992 * shadow table mfn and entry, but they create the shadow pagetables if
1993 * they are needed. The "demand" argument is non-zero when handling
1994 * a demand fault (so we know what to do about accessed bits &c).
1995 * If the necessary tables are not present in the guest, they return NULL. */
1997 /* N.B. The use of GUEST_PAGING_LEVELS here is correct. If the shadow has
1998 * more levels than the guest, the upper levels are always fixed and do not
1999 * reflect any information from the guest, so we do not use these functions
2000 * to access them. */
2002 #if GUEST_PAGING_LEVELS >= 4
2003 static shadow_l4e_t * shadow_get_and_create_l4e(struct vcpu *v,
2004 walk_t *gw,
2005 mfn_t *sl4mfn)
2007 /* There is always a shadow of the top level table. Get it. */
2008 *sl4mfn = pagetable_get_mfn(v->arch.shadow_table[0]);
2009 /* Reading the top level table is always valid. */
2010 return sh_linear_l4_table(v) + shadow_l4_linear_offset(gw->va);
2013 static shadow_l3e_t * shadow_get_and_create_l3e(struct vcpu *v,
2014 walk_t *gw,
2015 mfn_t *sl3mfn,
2016 fetch_type_t ft)
2018 mfn_t sl4mfn;
2019 shadow_l4e_t *sl4e;
2020 if ( !mfn_valid(gw->l3mfn) ) return NULL; /* No guest page. */
2021 /* Get the l4e */
2022 sl4e = shadow_get_and_create_l4e(v, gw, &sl4mfn);
2023 ASSERT(sl4e != NULL);
2024 if ( shadow_l4e_get_flags(*sl4e) & _PAGE_PRESENT )
2026 *sl3mfn = shadow_l4e_get_mfn(*sl4e);
2027 ASSERT(mfn_valid(*sl3mfn));
2029 else
2031 int r;
2032 shadow_l4e_t new_sl4e;
2033 /* No l3 shadow installed: find and install it. */
2034 *sl3mfn = get_shadow_status(v, gw->l3mfn, SH_type_l3_shadow);
2035 if ( !mfn_valid(*sl3mfn) )
2037 /* No l3 shadow of this page exists at all: make one. */
2038 *sl3mfn = sh_make_shadow(v, gw->l3mfn, SH_type_l3_shadow);
2040 /* Install the new sl3 table in the sl4e */
2041 l4e_propagate_from_guest(v, gw->l4e, *sl3mfn, &new_sl4e, ft);
2042 r = shadow_set_l4e(v, sl4e, new_sl4e, sl4mfn);
2043 ASSERT((r & SHADOW_SET_FLUSH) == 0);
2044 if ( r & SHADOW_SET_ERROR )
2045 return NULL;
2047 /* Now follow it down a level. Guaranteed to succeed. */
2048 return sh_linear_l3_table(v) + shadow_l3_linear_offset(gw->va);
2050 #endif /* GUEST_PAGING_LEVELS >= 4 */
2053 static shadow_l2e_t * shadow_get_and_create_l2e(struct vcpu *v,
2054 walk_t *gw,
2055 mfn_t *sl2mfn,
2056 fetch_type_t ft)
2058 #if GUEST_PAGING_LEVELS >= 4 /* 64bit... */
2059 mfn_t sl3mfn = _mfn(INVALID_MFN);
2060 shadow_l3e_t *sl3e;
2061 if ( !mfn_valid(gw->l2mfn) ) return NULL; /* No guest page. */
2062 /* Get the l3e */
2063 sl3e = shadow_get_and_create_l3e(v, gw, &sl3mfn, ft);
2064 if ( sl3e == NULL ) return NULL;
2065 if ( shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT )
2067 *sl2mfn = shadow_l3e_get_mfn(*sl3e);
2068 ASSERT(mfn_valid(*sl2mfn));
2070 else
2072 int r;
2073 shadow_l3e_t new_sl3e;
2074 unsigned int t = SH_type_l2_shadow;
2076 /* Tag compat L2 containing hypervisor (m2p) mappings */
2077 if ( is_pv_32on64_domain(v->domain) &&
2078 guest_l4_table_offset(gw->va) == 0 &&
2079 guest_l3_table_offset(gw->va) == 3 )
2080 t = SH_type_l2h_shadow;
2082 /* No l2 shadow installed: find and install it. */
2083 *sl2mfn = get_shadow_status(v, gw->l2mfn, t);
2084 if ( !mfn_valid(*sl2mfn) )
2086 /* No l2 shadow of this page exists at all: make one. */
2087 *sl2mfn = sh_make_shadow(v, gw->l2mfn, t);
2089 /* Install the new sl2 table in the sl3e */
2090 l3e_propagate_from_guest(v, gw->l3e, *sl2mfn, &new_sl3e, ft);
2091 r = shadow_set_l3e(v, sl3e, new_sl3e, sl3mfn);
2092 ASSERT((r & SHADOW_SET_FLUSH) == 0);
2093 if ( r & SHADOW_SET_ERROR )
2094 return NULL;
2096 /* Now follow it down a level. Guaranteed to succeed. */
2097 return sh_linear_l2_table(v) + shadow_l2_linear_offset(gw->va);
2098 #elif GUEST_PAGING_LEVELS == 3 /* PAE... */
2099 /* We never demand-shadow PAE l3es: they are only created in
2100 * sh_update_cr3(). Check if the relevant sl3e is present. */
2101 shadow_l3e_t *sl3e = ((shadow_l3e_t *)&v->arch.paging.shadow.l3table)
2102 + shadow_l3_linear_offset(gw->va);
2103 if ( !(shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT) )
2104 return NULL;
2105 *sl2mfn = shadow_l3e_get_mfn(*sl3e);
2106 ASSERT(mfn_valid(*sl2mfn));
2107 return sh_linear_l2_table(v) + shadow_l2_linear_offset(gw->va);
2108 #else /* 32bit... */
2109 /* There is always a shadow of the top level table. Get it. */
2110 *sl2mfn = pagetable_get_mfn(v->arch.shadow_table[0]);
2111 /* This next line is important: the guest l2 has a 16k
2112 * shadow, so we need to return the right mfn of the four. This
2113 * call will set it for us as a side-effect. */
2114 (void) shadow_l2_index(sl2mfn, guest_l2_table_offset(gw->va));
2115 /* Reading the top level table is always valid. */
2116 return sh_linear_l2_table(v) + shadow_l2_linear_offset(gw->va);
2117 #endif
2121 static shadow_l1e_t * shadow_get_and_create_l1e(struct vcpu *v,
2122 walk_t *gw,
2123 mfn_t *sl1mfn,
2124 fetch_type_t ft)
2126 mfn_t sl2mfn;
2127 shadow_l2e_t *sl2e;
2129 /* Get the l2e */
2130 sl2e = shadow_get_and_create_l2e(v, gw, &sl2mfn, ft);
2131 if ( sl2e == NULL ) return NULL;
2132 /* Install the sl1 in the l2e if it wasn't there or if we need to
2133 * re-do it to fix a PSE dirty bit. */
2134 if ( shadow_l2e_get_flags(*sl2e) & _PAGE_PRESENT
2135 && likely(ft != ft_demand_write
2136 || (shadow_l2e_get_flags(*sl2e) & _PAGE_RW)
2137 || !(guest_l2e_get_flags(gw->l2e) & _PAGE_PSE)) )
2139 *sl1mfn = shadow_l2e_get_mfn(*sl2e);
2140 ASSERT(mfn_valid(*sl1mfn));
2142 else
2144 shadow_l2e_t new_sl2e;
2145 int r, flags = guest_l2e_get_flags(gw->l2e);
2146 /* No l1 shadow installed: find and install it. */
2147 if ( !(flags & _PAGE_PRESENT) )
2148 return NULL; /* No guest page. */
2149 if ( guest_supports_superpages(v) && (flags & _PAGE_PSE) )
2151 /* Splintering a superpage */
2152 gfn_t l2gfn = guest_l2e_get_gfn(gw->l2e);
2153 *sl1mfn = get_fl1_shadow_status(v, l2gfn);
2154 if ( !mfn_valid(*sl1mfn) )
2156 /* No fl1 shadow of this superpage exists at all: make one. */
2157 *sl1mfn = make_fl1_shadow(v, l2gfn);
2160 else
2162 /* Shadowing an actual guest l1 table */
2163 if ( !mfn_valid(gw->l1mfn) ) return NULL; /* No guest page. */
2164 *sl1mfn = get_shadow_status(v, gw->l1mfn, SH_type_l1_shadow);
2165 if ( !mfn_valid(*sl1mfn) )
2167 /* No l1 shadow of this page exists at all: make one. */
2168 *sl1mfn = sh_make_shadow(v, gw->l1mfn, SH_type_l1_shadow);
2171 /* Install the new sl1 table in the sl2e */
2172 l2e_propagate_from_guest(v, gw->l2e, *sl1mfn, &new_sl2e, ft);
2173 r = shadow_set_l2e(v, sl2e, new_sl2e, sl2mfn);
2174 ASSERT((r & SHADOW_SET_FLUSH) == 0);
2175 if ( r & SHADOW_SET_ERROR )
2176 return NULL;
2177 /* This next line is important: in 32-on-PAE and 32-on-64 modes,
2178 * the guest l1 table has an 8k shadow, and we need to return
2179 * the right mfn of the pair. This call will set it for us as a
2180 * side-effect. (In all other cases, it's a no-op and will be
2181 * compiled out.) */
2182 (void) shadow_l1_index(sl1mfn, guest_l1_table_offset(gw->va));
2184 /* Now follow it down a level. Guaranteed to succeed. */
2185 return sh_linear_l1_table(v) + shadow_l1_linear_offset(gw->va);
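/* (Summary sketch of the chain above: shadow_get_and_create_l1e() calls
 * down through _l2e(), _l3e() and _l4e(); each level either finds an
 * existing shadow with get_shadow_status(), or builds one with
 * sh_make_shadow() and links it into the level above with the relevant
 * shadow_set_lNe(), so a single demand fault can materialise the whole
 * shadow chain for a virtual address.) */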
2190 /**************************************************************************/
2191 /* Destructors for shadow tables:
2192 * Unregister the shadow, decrement refcounts of any entries present in it,
2193 * and release the memory.
2195 * N.B. These destructors do not clear the contents of the shadows.
2196 * This allows us to delay TLB shootdowns until the page is being reused.
2197 * See shadow_alloc() and shadow_free() for how this is handled.
2198 */
2200 #if GUEST_PAGING_LEVELS >= 4
2201 void sh_destroy_l4_shadow(struct vcpu *v, mfn_t smfn)
2203 shadow_l4e_t *sl4e;
2204 u32 t = mfn_to_shadow_page(smfn)->type;
2205 mfn_t gmfn, sl4mfn;
2207 SHADOW_DEBUG(DESTROY_SHADOW,
2208 "%s(%05lx)\n", __func__, mfn_x(smfn));
2209 ASSERT(t == SH_type_l4_shadow);
2211 /* Record that the guest page isn't shadowed any more (in this type) */
2212 gmfn = _mfn(mfn_to_shadow_page(smfn)->backpointer);
2213 delete_shadow_status(v, gmfn, t, smfn);
2214 shadow_demote(v, gmfn, t);
2215 /* Decrement refcounts of all the old entries */
2216 sl4mfn = smfn;
2217 SHADOW_FOREACH_L4E(sl4mfn, sl4e, 0, 0, v->domain, {
2218 if ( shadow_l4e_get_flags(*sl4e) & _PAGE_PRESENT )
2220 sh_put_ref(v, shadow_l4e_get_mfn(*sl4e),
2221 (((paddr_t)mfn_x(sl4mfn)) << PAGE_SHIFT)
2222 | ((unsigned long)sl4e & ~PAGE_MASK));
2224 });
2226 /* Put the memory back in the pool */
2227 shadow_free(v->domain, smfn);
2230 void sh_destroy_l3_shadow(struct vcpu *v, mfn_t smfn)
2232 shadow_l3e_t *sl3e;
2233 u32 t = mfn_to_shadow_page(smfn)->type;
2234 mfn_t gmfn, sl3mfn;
2236 SHADOW_DEBUG(DESTROY_SHADOW,
2237 "%s(%05lx)\n", __func__, mfn_x(smfn));
2238 ASSERT(t == SH_type_l3_shadow);
2240 /* Record that the guest page isn't shadowed any more (in this type) */
2241 gmfn = _mfn(mfn_to_shadow_page(smfn)->backpointer);
2242 delete_shadow_status(v, gmfn, t, smfn);
2243 shadow_demote(v, gmfn, t);
2245 /* Decrement refcounts of all the old entries */
2246 sl3mfn = smfn;
2247 SHADOW_FOREACH_L3E(sl3mfn, sl3e, 0, 0, {
2248 if ( shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT )
2249 sh_put_ref(v, shadow_l3e_get_mfn(*sl3e),
2250 (((paddr_t)mfn_x(sl3mfn)) << PAGE_SHIFT)
2251 | ((unsigned long)sl3e & ~PAGE_MASK));
2252 });
2254 /* Put the memory back in the pool */
2255 shadow_free(v->domain, smfn);
2257 #endif /* GUEST_PAGING_LEVELS >= 4 */
2260 void sh_destroy_l2_shadow(struct vcpu *v, mfn_t smfn)
2262 shadow_l2e_t *sl2e;
2263 u32 t = mfn_to_shadow_page(smfn)->type;
2264 mfn_t gmfn, sl2mfn;
2266 SHADOW_DEBUG(DESTROY_SHADOW,
2267 "%s(%05lx)\n", __func__, mfn_x(smfn));
2269 #if GUEST_PAGING_LEVELS >= 3
2270 ASSERT(t == SH_type_l2_shadow || t == SH_type_l2h_shadow);
2271 #else
2272 ASSERT(t == SH_type_l2_shadow);
2273 #endif
2275 /* Record that the guest page isn't shadowed any more (in this type) */
2276 gmfn = _mfn(mfn_to_shadow_page(smfn)->backpointer);
2277 delete_shadow_status(v, gmfn, t, smfn);
2278 shadow_demote(v, gmfn, t);
2280 /* Decrement refcounts of all the old entries */
2281 sl2mfn = smfn;
2282 SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, 0, v->domain, {
2283 if ( shadow_l2e_get_flags(*sl2e) & _PAGE_PRESENT )
2284 sh_put_ref(v, shadow_l2e_get_mfn(*sl2e),
2285 (((paddr_t)mfn_x(sl2mfn)) << PAGE_SHIFT)
2286 | ((unsigned long)sl2e & ~PAGE_MASK));
2287 });
2289 /* Put the memory back in the pool */
2290 shadow_free(v->domain, smfn);
2293 void sh_destroy_l1_shadow(struct vcpu *v, mfn_t smfn)
2295 struct domain *d = v->domain;
2296 shadow_l1e_t *sl1e;
2297 u32 t = mfn_to_shadow_page(smfn)->type;
2299 SHADOW_DEBUG(DESTROY_SHADOW,
2300 "%s(%05lx)\n", __func__, mfn_x(smfn));
2301 ASSERT(t == SH_type_l1_shadow || t == SH_type_fl1_shadow);
2303 /* Record that the guest page isn't shadowed any more (in this type) */
2304 if ( t == SH_type_fl1_shadow )
2306 gfn_t gfn = _gfn(mfn_to_shadow_page(smfn)->backpointer);
2307 delete_fl1_shadow_status(v, gfn, smfn);
2309 else
2311 mfn_t gmfn = _mfn(mfn_to_shadow_page(smfn)->backpointer);
2312 delete_shadow_status(v, gmfn, t, smfn);
2313 shadow_demote(v, gmfn, t);
2316 if ( shadow_mode_refcounts(d) )
2318 /* Decrement refcounts of all the old entries */
2319 mfn_t sl1mfn = smfn;
2320 SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, 0, {
2321 if ( (shadow_l1e_get_flags(*sl1e) & _PAGE_PRESENT)
2322 && !sh_l1e_is_magic(*sl1e) ) {
2323 shadow_vram_put_l1e(*sl1e, sl1e, sl1mfn, d);
2324 shadow_put_page_from_l1e(*sl1e, d);
2326 });
2329 /* Put the memory back in the pool */
2330 shadow_free(v->domain, smfn);
2333 #if SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS
2334 void sh_destroy_monitor_table(struct vcpu *v, mfn_t mmfn)
2336 struct domain *d = v->domain;
2337 ASSERT(mfn_to_shadow_page(mmfn)->type == SH_type_monitor_table);
2339 #if (CONFIG_PAGING_LEVELS == 4) && (SHADOW_PAGING_LEVELS != 4)
2341 mfn_t m3mfn;
2342 l4_pgentry_t *l4e = sh_map_domain_page(mmfn);
2343 l3_pgentry_t *l3e;
2344 int linear_slot = shadow_l4_table_offset(SH_LINEAR_PT_VIRT_START);
2346 /* Need to destroy the l3 and l2 monitor pages used
2347 * for the linear map */
2348 ASSERT(l4e_get_flags(l4e[linear_slot]) & _PAGE_PRESENT);
2349 m3mfn = _mfn(l4e_get_pfn(l4e[linear_slot]));
2350 l3e = sh_map_domain_page(m3mfn);
2351 ASSERT(l3e_get_flags(l3e[0]) & _PAGE_PRESENT);
2352 shadow_free(d, _mfn(l3e_get_pfn(l3e[0])));
2353 sh_unmap_domain_page(l3e);
2354 shadow_free(d, m3mfn);
2356 if ( is_pv_32on64_vcpu(v) )
2358 /* Need to destroy the l3 and l2 monitor pages that map the
2359 * Xen VAs at 3GB-4GB */
2360 ASSERT(l4e_get_flags(l4e[0]) & _PAGE_PRESENT);
2361 m3mfn = _mfn(l4e_get_pfn(l4e[0]));
2362 l3e = sh_map_domain_page(m3mfn);
2363 ASSERT(l3e_get_flags(l3e[3]) & _PAGE_PRESENT);
2364 shadow_free(d, _mfn(l3e_get_pfn(l3e[3])));
2365 sh_unmap_domain_page(l3e);
2366 shadow_free(d, m3mfn);
2368 sh_unmap_domain_page(l4e);
2370 #elif CONFIG_PAGING_LEVELS == 3
2371 /* Need to destroy the l2 monitor page in the fourth l3 slot (l3e[3]) too */
2373 l3_pgentry_t *l3e = sh_map_domain_page(mmfn);
2374 ASSERT(l3e_get_flags(l3e[3]) & _PAGE_PRESENT);
2375 shadow_free(d, _mfn(l3e_get_pfn(l3e[3])));
2376 sh_unmap_domain_page(l3e);
2378 #endif
2380 /* Put the memory back in the pool */
2381 shadow_free(d, mmfn);
2383 #endif
2385 /**************************************************************************/
2386 /* Functions to destroy non-Xen mappings in a pagetable hierarchy.
2387 * These are called from common code when we are running out of shadow
2388 * memory, and unpinning all the top-level shadows hasn't worked.
2390 * This implementation is pretty crude and slow, but we hope that it won't
2391 * be called very often. */
2393 #if GUEST_PAGING_LEVELS == 2
2395 void sh_unhook_32b_mappings(struct vcpu *v, mfn_t sl2mfn)
2397 shadow_l2e_t *sl2e;
2398 SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, 0, v->domain, {
2399 (void) shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn);
2400 });
2403 #elif GUEST_PAGING_LEVELS == 3
2405 void sh_unhook_pae_mappings(struct vcpu *v, mfn_t sl2mfn)
2406 /* Walk a PAE l2 shadow, unhooking entries from all the subshadows */
2408 shadow_l2e_t *sl2e;
2409 SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, 0, v->domain, {
2410 (void) shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn);
2411 });
2414 #elif GUEST_PAGING_LEVELS == 4
2416 void sh_unhook_64b_mappings(struct vcpu *v, mfn_t sl4mfn)
2418 shadow_l4e_t *sl4e;
2419 SHADOW_FOREACH_L4E(sl4mfn, sl4e, 0, 0, v->domain, {
2420 (void) shadow_set_l4e(v, sl4e, shadow_l4e_empty(), sl4mfn);
2421 });
2424 #endif
2426 /**************************************************************************/
2427 /* Internal translation functions.
2428 * These functions require a pointer to the shadow entry that will be updated.
2429 */
2431 /* These functions take a new guest entry, translate it to shadow and write
2432 * the shadow entry.
2434 * They return the same bitmaps as the shadow_set_lXe() functions.
2435 */
2437 #if GUEST_PAGING_LEVELS >= 4
2438 static int validate_gl4e(struct vcpu *v, void *new_ge, mfn_t sl4mfn, void *se)
2440 shadow_l4e_t new_sl4e;
2441 guest_l4e_t new_gl4e = *(guest_l4e_t *)new_ge;
2442 shadow_l4e_t *sl4p = se;
2443 mfn_t sl3mfn = _mfn(INVALID_MFN);
2444 struct domain *d = v->domain;
2445 p2m_type_t p2mt;
2446 int result = 0;
2448 perfc_incr(shadow_validate_gl4e_calls);
2450 if ( guest_l4e_get_flags(new_gl4e) & _PAGE_PRESENT )
2452 gfn_t gl3gfn = guest_l4e_get_gfn(new_gl4e);
2453 mfn_t gl3mfn = gfn_to_mfn(d, gl3gfn, &p2mt);
2454 if ( p2m_is_ram(p2mt) )
2455 sl3mfn = get_shadow_status(v, gl3mfn, SH_type_l3_shadow);
2456 else
2457 result |= SHADOW_SET_ERROR;
2459 l4e_propagate_from_guest(v, new_gl4e, sl3mfn, &new_sl4e, ft_prefetch);
2461 // check for updates to xen reserved slots
2462 if ( !shadow_mode_external(d) )
2464 int shadow_index = (((unsigned long)sl4p & ~PAGE_MASK) /
2465 sizeof(shadow_l4e_t));
2466 int reserved_xen_slot = !is_guest_l4_slot(d, shadow_index);
2468 if ( unlikely(reserved_xen_slot) )
2470 // attempt by the guest to write to a xen reserved slot
2471 //
2472 SHADOW_PRINTK("%s out-of-range update "
2473 "sl4mfn=%05lx index=0x%x val=%" SH_PRI_pte "\n",
2474 __func__, mfn_x(sl4mfn), shadow_index, new_sl4e.l4);
2475 if ( shadow_l4e_get_flags(new_sl4e) & _PAGE_PRESENT )
2477 SHADOW_ERROR("out-of-range l4e update\n");
2478 result |= SHADOW_SET_ERROR;
2481 // do not call shadow_set_l4e...
2482 return result;
2486 result |= shadow_set_l4e(v, sl4p, new_sl4e, sl4mfn);
2487 return result;
2491 static int validate_gl3e(struct vcpu *v, void *new_ge, mfn_t sl3mfn, void *se)
2493 shadow_l3e_t new_sl3e;
2494 guest_l3e_t new_gl3e = *(guest_l3e_t *)new_ge;
2495 shadow_l3e_t *sl3p = se;
2496 mfn_t sl2mfn = _mfn(INVALID_MFN);
2497 p2m_type_t p2mt;
2498 int result = 0;
2500 perfc_incr(shadow_validate_gl3e_calls);
2502 if ( guest_l3e_get_flags(new_gl3e) & _PAGE_PRESENT )
2504 gfn_t gl2gfn = guest_l3e_get_gfn(new_gl3e);
2505 mfn_t gl2mfn = gfn_to_mfn(v->domain, gl2gfn, &p2mt);
2506 if ( p2m_is_ram(p2mt) )
2507 sl2mfn = get_shadow_status(v, gl2mfn, SH_type_l2_shadow);
2508 else
2509 result |= SHADOW_SET_ERROR;
2511 l3e_propagate_from_guest(v, new_gl3e, sl2mfn, &new_sl3e, ft_prefetch);
2512 result |= shadow_set_l3e(v, sl3p, new_sl3e, sl3mfn);
2514 return result;
2516 #endif // GUEST_PAGING_LEVELS >= 4
2518 static int validate_gl2e(struct vcpu *v, void *new_ge, mfn_t sl2mfn, void *se)
2520 shadow_l2e_t new_sl2e;
2521 guest_l2e_t new_gl2e = *(guest_l2e_t *)new_ge;
2522 shadow_l2e_t *sl2p = se;
2523 mfn_t sl1mfn = _mfn(INVALID_MFN);
2524 p2m_type_t p2mt;
2525 int result = 0;
2527 perfc_incr(shadow_validate_gl2e_calls);
2529 if ( guest_l2e_get_flags(new_gl2e) & _PAGE_PRESENT )
2531 gfn_t gl1gfn = guest_l2e_get_gfn(new_gl2e);
2532 if ( guest_supports_superpages(v) &&
2533 (guest_l2e_get_flags(new_gl2e) & _PAGE_PSE) )
2535 // superpage -- need to look up the shadow L1 which holds the
2536 // splitters...
2537 sl1mfn = get_fl1_shadow_status(v, gl1gfn);
2538 #if 0
2539 // XXX - it's possible that we want to do some kind of prefetch
2540 // for superpage fl1's here, but this is *not* on the demand path,
2541 // so we'll hold off trying that for now...
2542 //
2543 if ( !mfn_valid(sl1mfn) )
2544 sl1mfn = make_fl1_shadow(v, gl1gfn);
2545 #endif
2547 else
2549 mfn_t gl1mfn = gfn_to_mfn(v->domain, gl1gfn, &p2mt);
2550 if ( p2m_is_ram(p2mt) )
2551 sl1mfn = get_shadow_status(v, gl1mfn, SH_type_l1_shadow);
2552 else
2553 result |= SHADOW_SET_ERROR;
2556 l2e_propagate_from_guest(v, new_gl2e, sl1mfn, &new_sl2e, ft_prefetch);
2558 // check for updates to xen reserved slots in PV guests...
2559 // XXX -- need to revisit this for PV 3-on-4 guests.
2560 //
2561 #if SHADOW_PAGING_LEVELS < 4
2562 #if CONFIG_PAGING_LEVELS == SHADOW_PAGING_LEVELS
2563 if ( !shadow_mode_external(v->domain) )
2565 int shadow_index = (((unsigned long)sl2p & ~PAGE_MASK) /
2566 sizeof(shadow_l2e_t));
2567 int reserved_xen_slot;
2569 #if SHADOW_PAGING_LEVELS == 3
2570 reserved_xen_slot =
2571 ((mfn_to_shadow_page(sl2mfn)->type == SH_type_l2h_pae_shadow) &&
2572 (shadow_index
2573 >= (L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1))));
2574 #else /* SHADOW_PAGING_LEVELS == 2 */
2575 reserved_xen_slot = (shadow_index >= L2_PAGETABLE_FIRST_XEN_SLOT);
2576 #endif
2578 if ( unlikely(reserved_xen_slot) )
2580 // attempt by the guest to write to a xen reserved slot
2581 //
2582 SHADOW_PRINTK("%s out-of-range update "
2583 "sl2mfn=%05lx index=0x%x val=%" SH_PRI_pte "\n",
2584 __func__, mfn_x(sl2mfn), shadow_index, new_sl2e.l2);
2585 if ( shadow_l2e_get_flags(new_sl2e) & _PAGE_PRESENT )
2587 SHADOW_ERROR("out-of-range l2e update\n");
2588 result |= SHADOW_SET_ERROR;
2591 // do not call shadow_set_l2e...
2592 return result;
2595 #endif /* CONFIG_PAGING_LEVELS == SHADOW_PAGING_LEVELS */
2596 #endif /* SHADOW_PAGING_LEVELS < 4 */
2598 result |= shadow_set_l2e(v, sl2p, new_sl2e, sl2mfn);
2600 return result;
2603 static int validate_gl1e(struct vcpu *v, void *new_ge, mfn_t sl1mfn, void *se)
2605 shadow_l1e_t new_sl1e;
2606 guest_l1e_t new_gl1e = *(guest_l1e_t *)new_ge;
2607 shadow_l1e_t *sl1p = se;
2608 gfn_t gfn;
2609 mfn_t gmfn;
2610 p2m_type_t p2mt;
2611 int result = 0;
2612 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
2613 mfn_t gl1mfn;
2614 #endif /* OOS */
2616 perfc_incr(shadow_validate_gl1e_calls);
2618 gfn = guest_l1e_get_gfn(new_gl1e);
2619 gmfn = gfn_to_mfn(v->domain, gfn, &p2mt);
2621 l1e_propagate_from_guest(v, new_gl1e, gmfn, &new_sl1e, ft_prefetch, p2mt);
2622 result |= shadow_set_l1e(v, sl1p, new_sl1e, sl1mfn);
2624 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
2625 gl1mfn = _mfn(mfn_to_shadow_page(sl1mfn)->backpointer);
2626 if ( mfn_valid(gl1mfn)
2627 && mfn_is_out_of_sync(gl1mfn) )
2629 /* Update the OOS snapshot. */
2630 mfn_t snpmfn = oos_snapshot_lookup(v, gl1mfn);
2631 guest_l1e_t *snp;
2633 ASSERT(mfn_valid(snpmfn));
2635 snp = sh_map_domain_page(snpmfn);
2636 snp[guest_index(new_ge)] = new_gl1e;
2637 sh_unmap_domain_page(snp);
2639 #endif /* OOS */
2641 return result;
2644 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
2645 /**************************************************************************/
2646 /* Special validation function for re-syncing out-of-sync shadows.
2647 * Walks the *shadow* page, and for every entry that it finds,
2648 * revalidates the guest entry that corresponds to it.
2649 * N.B. This function is called with the vcpu that unsynced the page,
2650 * *not* the one that is causing it to be resynced. */
2651 void sh_resync_l1(struct vcpu *v, mfn_t gl1mfn, mfn_t snpmfn)
2653 mfn_t sl1mfn;
2654 shadow_l1e_t *sl1p;
2655 guest_l1e_t *gl1p, *gp, *snp;
2656 int rc = 0;
2658 ASSERT(mfn_valid(snpmfn));
2660 sl1mfn = get_shadow_status(v, gl1mfn, SH_type_l1_shadow);
2661 ASSERT(mfn_valid(sl1mfn)); /* Otherwise we would not have been called */
2663 snp = sh_map_domain_page(snpmfn);
2664 gp = sh_map_domain_page(gl1mfn);
2665 gl1p = gp;
2667 SHADOW_FOREACH_L1E(sl1mfn, sl1p, &gl1p, 0, {
2668 guest_l1e_t gl1e = *gl1p;
2669 guest_l1e_t *snpl1p = (guest_l1e_t *)snp + guest_index(gl1p);
2671 if ( memcmp(snpl1p, &gl1e, sizeof(gl1e)) )
2673 gfn_t gfn;
2674 mfn_t gmfn;
2675 p2m_type_t p2mt;
2676 shadow_l1e_t nsl1e;
2678 gfn = guest_l1e_get_gfn(gl1e);
2679 gmfn = gfn_to_mfn(v->domain, gfn, &p2mt);
2680 l1e_propagate_from_guest(v, gl1e, gmfn, &nsl1e, ft_prefetch, p2mt);
2681 rc |= shadow_set_l1e(v, sl1p, nsl1e, sl1mfn);
2683 *snpl1p = gl1e;
2685 });
2687 sh_unmap_domain_page(gp);
2688 sh_unmap_domain_page(snp);
2690 /* Setting shadow L1 entries should never need us to flush the TLB */
2691 ASSERT(!(rc & SHADOW_SET_FLUSH));
2694 /* Figure out whether it's definitely safe not to sync this l1 table.
2695 * That is: if we can tell that it's only used once, and that the
2696 * toplevel shadow responsible is not one of ours.
2697 * N.B. This function is called with the vcpu that required the resync,
2698 * *not* the one that originally unsynced the page, but it is
2699 * called in the *mode* of the vcpu that unsynced it. Clear? Good. */
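/* (The walk below relies on the shadow_page_info "up" field, which records
 * the physical address of the shadow entry that references this shadow; it
 * is only trusted here when the refcount is exactly 1, and shifting it right
 * by PAGE_SHIFT gives the mfn of the next shadow up the chain without any
 * searching.) */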
2700 int sh_safe_not_to_sync(struct vcpu *v, mfn_t gl1mfn)
2702 struct shadow_page_info *sp;
2703 mfn_t smfn;
2705 smfn = get_shadow_status(v, gl1mfn, SH_type_l1_shadow);
2706 ASSERT(mfn_valid(smfn)); /* Otherwise we would not have been called */
2708 /* Up to l2 */
2709 sp = mfn_to_shadow_page(smfn);
2710 if ( sp->count != 1 || !sp->up )
2711 return 0;
2712 smfn = _mfn(sp->up >> PAGE_SHIFT);
2713 ASSERT(mfn_valid(smfn));
2715 #if (SHADOW_PAGING_LEVELS == 4)
2716 /* up to l3 */
2717 sp = mfn_to_shadow_page(smfn);
2718 if ( sp->count != 1 || !sp->up )
2719 return 0;
2720 smfn = _mfn(sp->up >> PAGE_SHIFT);
2721 ASSERT(mfn_valid(smfn));
2723 /* up to l4 */
2724 sp = mfn_to_shadow_page(smfn);
2725 if ( sp->count != 1
2726 || sh_type_is_pinnable(v, SH_type_l3_64_shadow) || !sp->up )
2727 return 0;
2728 smfn = _mfn(sp->up >> PAGE_SHIFT);
2729 ASSERT(mfn_valid(smfn));
2731 #if (GUEST_PAGING_LEVELS == 2)
2732 /* In 2-on-3 shadow mode the up pointer contains the link to the
2733 * shadow page, but the shadow_table contains only the first of the
2734 * four pages that make up the PAE top shadow tables. */
2735 smfn = _mfn(mfn_x(smfn) & ~0x3UL);
2736 #endif
2738 #endif
2740 if ( pagetable_get_pfn(v->arch.shadow_table[0]) == mfn_x(smfn)
2741 #if (SHADOW_PAGING_LEVELS == 3)
2742 || pagetable_get_pfn(v->arch.shadow_table[1]) == mfn_x(smfn)
2743 || pagetable_get_pfn(v->arch.shadow_table[2]) == mfn_x(smfn)
2744 || pagetable_get_pfn(v->arch.shadow_table[3]) == mfn_x(smfn)
2745 #endif
2747 return 0;
2749 /* Only in use in one toplevel shadow, and it's not the one we're
2750 * running on */
2751 return 1;
2753 #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) */
2756 /**************************************************************************/
2757 /* Functions which translate and install the shadows of arbitrary guest
2758 * entries that we have just seen the guest write. */
2761 static inline int
2762 sh_map_and_validate(struct vcpu *v, mfn_t gmfn,
2763 void *new_gp, u32 size, u32 sh_type,
2764 u32 (*shadow_index)(mfn_t *smfn, u32 idx),
2765 int (*validate_ge)(struct vcpu *v, void *ge,
2766 mfn_t smfn, void *se))
2767 /* Generic function for mapping and validating. */
2769 mfn_t smfn, smfn2, map_mfn;
2770 shadow_l1e_t *sl1p;
2771 u32 shadow_idx, guest_idx;
2772 int result = 0;
2774 /* Align address and size to guest entry boundaries */
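/* (Worked example, assuming 8-byte guest entries: a 4-byte write at page
 * offset 0x14 becomes new_gp = 0x10, size = 8, so the whole entry at
 * 0x10..0x17 is revalidated as a unit.) */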
2775 size += (unsigned long)new_gp & (sizeof (guest_l1e_t) - 1);
2776 new_gp = (void *)((unsigned long)new_gp & ~(sizeof (guest_l1e_t) - 1));
2777 size = (size + sizeof (guest_l1e_t) - 1) & ~(sizeof (guest_l1e_t) - 1);
2778 ASSERT(size + (((unsigned long)new_gp) & ~PAGE_MASK) <= PAGE_SIZE);
2780 /* Map the shadow page */
2781 smfn = get_shadow_status(v, gmfn, sh_type);
2782 ASSERT(mfn_valid(smfn)); /* Otherwise we would not have been called */
2783 guest_idx = guest_index(new_gp);
2784 map_mfn = smfn;
2785 shadow_idx = shadow_index(&map_mfn, guest_idx);
2786 sl1p = sh_map_domain_page(map_mfn);
2788 /* Validate one entry at a time */
2789 while ( size )
2791 smfn2 = smfn;
2792 guest_idx = guest_index(new_gp);
2793 shadow_idx = shadow_index(&smfn2, guest_idx);
2794 if ( mfn_x(smfn2) != mfn_x(map_mfn) )
2796 /* We have moved to another page of the shadow */
2797 map_mfn = smfn2;
2798 sh_unmap_domain_page(sl1p);
2799 sl1p = sh_map_domain_page(map_mfn);
2801 result |= validate_ge(v,
2802 new_gp,
2803 map_mfn,
2804 &sl1p[shadow_idx]);
2805 size -= sizeof(guest_l1e_t);
2806 new_gp += sizeof(guest_l1e_t);
2808 sh_unmap_domain_page(sl1p);
2809 return result;
2813 int
2814 sh_map_and_validate_gl4e(struct vcpu *v, mfn_t gl4mfn,
2815 void *new_gl4p, u32 size)
2817 #if GUEST_PAGING_LEVELS >= 4
2818 return sh_map_and_validate(v, gl4mfn, new_gl4p, size,
2819 SH_type_l4_shadow,
2820 shadow_l4_index,
2821 validate_gl4e);
2822 #else // ! GUEST_PAGING_LEVELS >= 4
2823 SHADOW_ERROR("called in wrong paging mode!\n");
2824 BUG();
2825 return 0;
2826 #endif
2829 int
2830 sh_map_and_validate_gl3e(struct vcpu *v, mfn_t gl3mfn,
2831 void *new_gl3p, u32 size)
2833 #if GUEST_PAGING_LEVELS >= 4
2834 return sh_map_and_validate(v, gl3mfn, new_gl3p, size,
2835 SH_type_l3_shadow,
2836 shadow_l3_index,
2837 validate_gl3e);
2838 #else // ! GUEST_PAGING_LEVELS >= 4
2839 SHADOW_ERROR("called in wrong paging mode!\n");
2840 BUG();
2841 return 0;
2842 #endif
2845 int
2846 sh_map_and_validate_gl2e(struct vcpu *v, mfn_t gl2mfn,
2847 void *new_gl2p, u32 size)
2849 return sh_map_and_validate(v, gl2mfn, new_gl2p, size,
2850 SH_type_l2_shadow,
2851 shadow_l2_index,
2852 validate_gl2e);
2855 int
2856 sh_map_and_validate_gl2he(struct vcpu *v, mfn_t gl2mfn,
2857 void *new_gl2p, u32 size)
2859 #if GUEST_PAGING_LEVELS >= 3
2860 return sh_map_and_validate(v, gl2mfn, new_gl2p, size,
2861 SH_type_l2h_shadow,
2862 shadow_l2_index,
2863 validate_gl2e);
2864 #else /* Non-PAE guests don't have different kinds of l2 table */
2865 SHADOW_ERROR("called in wrong paging mode!\n");
2866 BUG();
2867 return 0;
2868 #endif
2871 int
2872 sh_map_and_validate_gl1e(struct vcpu *v, mfn_t gl1mfn,
2873 void *new_gl1p, u32 size)
2875 return sh_map_and_validate(v, gl1mfn, new_gl1p, size,
2876 SH_type_l1_shadow,
2877 shadow_l1_index,
2878 validate_gl1e);
2882 /**************************************************************************/
2883 /* Optimization: If we see two emulated writes of zeros to the same
2884 * page-table without another kind of page fault in between, we guess
2885 * that this is a batch of changes (for process destruction) and
2886 * unshadow the page so we don't take a pagefault on every entry. This
2887 * should also make finding writeable mappings of pagetables much
2888 * easier. */
2890 /* Look to see if this is the second emulated write in a row to this
2891 * page, and unshadow if it is */
2892 static inline void check_for_early_unshadow(struct vcpu *v, mfn_t gmfn)
2894 #if SHADOW_OPTIMIZATIONS & SHOPT_EARLY_UNSHADOW
2895 if ( v->arch.paging.shadow.last_emulated_mfn_for_unshadow == mfn_x(gmfn)
2896 && sh_mfn_is_a_page_table(gmfn) )
2898 perfc_incr(shadow_early_unshadow);
2899 sh_remove_shadows(v, gmfn, 1, 0 /* Fast, can fail to unshadow */ );
2901 v->arch.paging.shadow.last_emulated_mfn_for_unshadow = mfn_x(gmfn);
2902 #endif
2905 /* Stop counting towards early unshadows, as we've seen a real page fault */
2906 static inline void reset_early_unshadow(struct vcpu *v)
2908 #if SHADOW_OPTIMIZATIONS & SHOPT_EARLY_UNSHADOW
2909 v->arch.paging.shadow.last_emulated_mfn_for_unshadow = INVALID_MFN;
2910 #endif
2915 /**************************************************************************/
2916 /* Optimization: Prefetch multiple L1 entries. This is called after we have
2917 * demand-faulted a shadow l1e in the fault handler, to see if it's
2918 * worth fetching some more.
2919 */
2921 #if SHADOW_OPTIMIZATIONS & SHOPT_PREFETCH
2923 /* XXX magic number */
2924 #define PREFETCH_DISTANCE 32
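/* (i.e. the demand-faulted entry plus up to 31 following l1 entries -- a
 * 128kB window of 4k mappings -- are considered per fault, and the loop
 * below also never prefetches past the end of the current shadow l1 page.) */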
2926 static void sh_prefetch(struct vcpu *v, walk_t *gw,
2927 shadow_l1e_t *ptr_sl1e, mfn_t sl1mfn)
2929 int i, dist;
2930 gfn_t gfn;
2931 mfn_t gmfn;
2932 guest_l1e_t *gl1p = NULL, gl1e;
2933 shadow_l1e_t sl1e;
2934 u32 gflags;
2935 p2m_type_t p2mt;
2936 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
2937 guest_l1e_t *snpl1p = NULL;
2938 #endif /* OOS */
2941 /* Prefetch no further than the end of the _shadow_ l1 MFN */
2942 dist = (PAGE_SIZE - ((unsigned long)ptr_sl1e & ~PAGE_MASK)) / sizeof sl1e;
2943 /* And no more than a maximum fetches-per-fault */
2944 if ( dist > PREFETCH_DISTANCE )
2945 dist = PREFETCH_DISTANCE;
2947 if ( mfn_valid(gw->l1mfn) )
2949 /* Normal guest page; grab the next guest entry */
2950 gl1p = sh_map_domain_page(gw->l1mfn);
2951 gl1p += guest_l1_table_offset(gw->va);
2953 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
2954 if ( mfn_is_out_of_sync(gw->l1mfn) )
2956 mfn_t snpmfn = oos_snapshot_lookup(v, gw->l1mfn);
2958 ASSERT(mfn_valid(snpmfn));
2959 snpl1p = sh_map_domain_page(snpmfn);
2960 snpl1p += guest_l1_table_offset(gw->va);
2962 #endif /* OOS */
2965 for ( i = 1; i < dist ; i++ )
2967 /* No point in prefetching if there's already a shadow */
2968 if ( ptr_sl1e[i].l1 != 0 )
2969 break;
2971 if ( mfn_valid(gw->l1mfn) )
2973 /* Normal guest page; grab the next guest entry */
2974 gl1e = gl1p[i];
2975 /* Not worth continuing if we hit an entry that will need another
2976 * fault for A/D-bit propagation anyway */
2977 gflags = guest_l1e_get_flags(gl1e);
2978 if ( (gflags & _PAGE_PRESENT)
2979 && (!(gflags & _PAGE_ACCESSED)
2980 || ((gflags & _PAGE_RW) && !(gflags & _PAGE_DIRTY))) )
2981 break;
2983 else
2985 /* Fragmented superpage, unless we've been called wrongly */
2986 ASSERT(guest_l2e_get_flags(gw->l2e) & _PAGE_PSE);
2987 /* Increment the l1e's GFN by the right number of guest pages */
2988 gl1e = guest_l1e_from_gfn(
2989 _gfn(gfn_x(guest_l1e_get_gfn(gw->l1e)) + i),
2990 guest_l1e_get_flags(gw->l1e));
2993 /* Look at the gfn that the l1e is pointing at */
2994 gfn = guest_l1e_get_gfn(gl1e);
2995 gmfn = gfn_to_mfn(v->domain, gfn, &p2mt);
2997 /* Propagate the entry. */
2998 l1e_propagate_from_guest(v, gl1e, gmfn, &sl1e, ft_prefetch, p2mt);
2999 (void) shadow_set_l1e(v, ptr_sl1e + i, sl1e, sl1mfn);
3001 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
3002 if ( snpl1p != NULL )
3003 snpl1p[i] = gl1e;
3004 #endif /* OOS */
3006 if ( gl1p != NULL )
3007 sh_unmap_domain_page(gl1p);
3008 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
3009 if ( snpl1p != NULL )
3010 sh_unmap_domain_page(snpl1p);
3011 #endif /* OOS */
3014 #endif /* SHADOW_OPTIMIZATIONS & SHOPT_PREFETCH */
3017 /**************************************************************************/
3018 /* Entry points into the shadow code */
3020 /* Called from pagefault handler in Xen, and from the HVM trap handlers
3021 * for pagefaults. Returns 1 if this fault was an artefact of the
3022 * shadow code (and the guest should retry) or 0 if it is not (and the
3023 * fault should be handled elsewhere or passed to the guest). */
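/* (Rough flow of sh_page_fault(), as a reading aid: try the reserved-bit
 * fast path for "magic" sl1es; otherwise walk the guest tables, look up or
 * build the shadow chain with shadow_get_and_create_l1e(), propagate the
 * guest l1e into the shadow, and finally decide whether the access must be
 * emulated (writes to shadowed pagetables, CR0.WP-clear supervisor writes,
 * writes to read-only RAM) or handed to the device model as MMIO.) */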
3025 static int sh_page_fault(struct vcpu *v,
3026 unsigned long va,
3027 struct cpu_user_regs *regs)
3029 struct domain *d = v->domain;
3030 walk_t gw;
3031 gfn_t gfn;
3032 mfn_t gmfn, sl1mfn=_mfn(0);
3033 shadow_l1e_t sl1e, *ptr_sl1e;
3034 paddr_t gpa;
3035 struct sh_emulate_ctxt emul_ctxt;
3036 struct x86_emulate_ops *emul_ops;
3037 int r;
3038 fetch_type_t ft = 0;
3039 p2m_type_t p2mt;
3040 uint32_t rc;
3041 #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
3042 int fast_emul = 0;
3043 #endif
3045 SHADOW_PRINTK("d:v=%u:%u va=%#lx err=%u, rip=%lx\n",
3046 v->domain->domain_id, v->vcpu_id, va, regs->error_code,
3047 regs->rip);
3049 perfc_incr(shadow_fault);
3051 #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
3052 /* If the faulting frame was successfully emulated on the last shadow
3053 * fault, it is very likely that the same emulation action will apply
3054 * to this frame as well, so try to emulate early and avoid taking
3055 * the shadow lock. */
3056 if ( v->arch.paging.last_write_emul_ok
3057 && v->arch.paging.shadow.last_emulated_frame == (va >> PAGE_SHIFT) )
3059 /* Check whether the error code is 3 (a write fault on a present page),
3060 * or else fall back to the normal path in case some validation is required
3061 */
3062 if ( regs->error_code == (PFEC_write_access | PFEC_page_present) )
3064 fast_emul = 1;
3065 gmfn = _mfn(v->arch.paging.shadow.last_emulated_mfn);
3067 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
3068 /* Fall back to the slow path if we're trying to emulate
3069 writes to an out of sync page. */
3070 if ( mfn_valid(gmfn) && mfn_is_out_of_sync(gmfn) )
3072 v->arch.paging.last_write_emul_ok = 0;
3073 goto page_fault_slow_path;
3075 #endif /* OOS */
3077 perfc_incr(shadow_fault_fast_emulate);
3078 goto early_emulation;
3080 else
3081 v->arch.paging.last_write_emul_ok = 0;
3083 #endif
3085 //
3086 // XXX: Need to think about eventually mapping superpages directly in the
3087 // shadow (when possible), as opposed to splintering them into a
3088 // bunch of 4K maps.
3089 //
3091 #if (SHADOW_OPTIMIZATIONS & SHOPT_FAST_FAULT_PATH)
3092 if ( (regs->error_code & PFEC_reserved_bit) )
3094 /* The only reasons for reserved bits to be set in shadow entries
3095 * are the two "magic" shadow_l1e entries. */
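/* (sh_l1e_is_gnp() below recognises the guest-not-present encoding, which
 * lets us reflect the fault straight back to the guest, and sh_l1e_is_mmio()
 * recognises the MMIO marker, which caches the gfn of an MMIO address so the
 * fault can go to the device model without a guest pagetable walk.) */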
3096 if ( likely((__copy_from_user(&sl1e,
3097 (sh_linear_l1_table(v)
3098 + shadow_l1_linear_offset(va)),
3099 sizeof(sl1e)) == 0)
3100 && sh_l1e_is_magic(sl1e)) )
3102 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
3103 /* First, need to check that this isn't an out-of-sync
3104 * shadow l1e. If it is, we fall back to the slow path, which
3105 * will sync it up again. */
3107 shadow_l2e_t sl2e;
3108 mfn_t gl1mfn;
3109 if ( (__copy_from_user(&sl2e,
3110 (sh_linear_l2_table(v)
3111 + shadow_l2_linear_offset(va)),
3112 sizeof(sl2e)) != 0)
3113 || !(shadow_l2e_get_flags(sl2e) & _PAGE_PRESENT)
3114 || !mfn_valid(gl1mfn = _mfn(mfn_to_shadow_page(
3115 shadow_l2e_get_mfn(sl2e))->backpointer))
3116 || unlikely(mfn_is_out_of_sync(gl1mfn)) )
3118 /* Hit the slow path as if there had been no
3119 * shadow entry at all, and let it tidy up */
3120 ASSERT(regs->error_code & PFEC_page_present);
3121 regs->error_code ^= (PFEC_reserved_bit|PFEC_page_present);
3122 goto page_fault_slow_path;
3125 #endif /* SHOPT_OUT_OF_SYNC */
3127 if ( sh_l1e_is_gnp(sl1e) )
3129 /* Not-present in a guest PT: pass to the guest as
3130 * a not-present fault (by flipping two bits). */
3131 ASSERT(regs->error_code & PFEC_page_present);
3132 regs->error_code ^= (PFEC_reserved_bit|PFEC_page_present);
3133 reset_early_unshadow(v);
3134 perfc_incr(shadow_fault_fast_gnp);
3135 SHADOW_PRINTK("fast path not-present\n");
3136 return 0;
3138 else
3140 /* Magic MMIO marker: extract gfn for MMIO address */
3141 ASSERT(sh_l1e_is_mmio(sl1e));
3142 gpa = (((paddr_t)(gfn_x(sh_l1e_mmio_get_gfn(sl1e))))
3143 << PAGE_SHIFT)
3144 | (va & ~PAGE_MASK);
3146 perfc_incr(shadow_fault_fast_mmio);
3147 SHADOW_PRINTK("fast path mmio %#"PRIpaddr"\n", gpa);
3148 reset_early_unshadow(v);
3149 return (handle_mmio_with_translation(va, gpa >> PAGE_SHIFT)
3150 ? EXCRET_fault_fixed : 0);
3152 else
3154 /* This should be exceptionally rare: another vcpu has fixed
3155 * the tables between the fault and our reading the l1e.
3156 * Retry and let the hardware give us the right fault next time. */
3157 perfc_incr(shadow_fault_fast_fail);
3158 SHADOW_PRINTK("fast path false alarm!\n");
3159 return EXCRET_fault_fixed;
3163 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
3164 page_fault_slow_path:
3165 #endif
3166 #endif /* SHOPT_FAST_FAULT_PATH */
3168 /* Detect if this page fault happened while we were already in Xen
3169 * doing a shadow operation. If that happens, the only thing we can
3170 * do is let Xen's normal fault handlers try to fix it. In any case,
3171 * a diagnostic trace of the fault will be more useful than
3172 * a BUG() when we try to take the lock again. */
3173 if ( unlikely(shadow_locked_by_me(d)) )
3175 SHADOW_ERROR("Recursive shadow fault: lock was taken by %s\n",
3176 d->arch.paging.shadow.locker_function);
3177 return 0;
3180 rewalk:
3181 rc = guest_walk_tables(v, va, &gw, regs->error_code);
3183 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
3184 regs->error_code &= ~PFEC_page_present;
3185 if ( !(rc & _PAGE_PRESENT) )
3186 regs->error_code |= PFEC_page_present;
3187 #endif
3189 if ( rc != 0 )
3191 perfc_incr(shadow_fault_bail_real_fault);
3192 SHADOW_PRINTK("not a shadow fault\n");
3193 reset_early_unshadow(v);
3194 return 0;
3197 /* It's possible that the guest has put pagetables in memory that it has
3198 * already used for some special purpose (ioreq pages, or granted pages).
3199 * If that happens we'll have killed the guest already but it's still not
3200 * safe to propagate entries out of the guest PT so get out now. */
3201 if ( unlikely(d->is_shutting_down) )
3203 SHADOW_PRINTK("guest is shutting down\n");
3204 return 0;
3207 /* What kind of access are we dealing with? */
3208 ft = ((regs->error_code & PFEC_write_access)
3209 ? ft_demand_write : ft_demand_read);
3211 /* What mfn is the guest trying to access? */
3212 gfn = guest_l1e_get_gfn(gw.l1e);
3213 gmfn = gfn_to_mfn(d, gfn, &p2mt);
3215 if ( shadow_mode_refcounts(d) &&
3216 (!p2m_is_valid(p2mt) || (!p2m_is_mmio(p2mt) && !mfn_valid(gmfn))) )
3218 perfc_incr(shadow_fault_bail_bad_gfn);
3219 SHADOW_PRINTK("BAD gfn=%"SH_PRI_gfn" gmfn=%"PRI_mfn"\n",
3220 gfn_x(gfn), mfn_x(gmfn));
3221 reset_early_unshadow(v);
3222 return 0;
3225 #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
3226 /* Remember this successful VA->GFN translation for later. */
3227 vtlb_insert(v, va >> PAGE_SHIFT, gfn_x(gfn),
3228 regs->error_code | PFEC_page_present);
3229 #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */
3231 shadow_lock(d);
3233 rc = gw_remove_write_accesses(v, va, &gw);
3235 /* First bit set: Removed write access to a page. */
3236 if ( rc & GW_RMWR_FLUSHTLB )
3238 /* Write permission removal is also a hint that other gwalks
3239 * overlapping with this one may be inconsistent
3240 */
3241 perfc_incr(shadow_rm_write_flush_tlb);
3242 atomic_inc(&d->arch.paging.shadow.gtable_dirty_version);
3243 flush_tlb_mask(d->domain_dirty_cpumask);
3246 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
3247 /* Second bit set: Resynced a page. Re-walk needed. */
3248 if ( rc & GW_RMWR_REWALK )
3250 shadow_unlock(d);
3251 goto rewalk;
3253 #endif /* OOS */
3255 if ( !shadow_check_gwalk(v, va, &gw) )
3257 perfc_incr(shadow_inconsistent_gwalk);
3258 shadow_unlock(d);
3259 goto rewalk;
3262 shadow_audit_tables(v);
3263 sh_audit_gw(v, &gw);
3265 /* Make sure there is enough free shadow memory to build a chain of
3266 * shadow tables. (We never allocate a top-level shadow on this path,
3267 * only a 32b l1, pae l1, or 64b l3+2+1. Note that while
3268 * SH_type_l1_shadow isn't correct in the latter case, all page
3269 * tables are the same size there.) */
3270 shadow_prealloc(d,
3271 SH_type_l1_shadow,
3272 GUEST_PAGING_LEVELS < 4 ? 1 : GUEST_PAGING_LEVELS - 1);
3274 /* Acquire the shadow. This must happen before we figure out the rights
3275 * for the shadow entry, since we might promote a page here. */
3276 ptr_sl1e = shadow_get_and_create_l1e(v, &gw, &sl1mfn, ft);
3277 if ( unlikely(ptr_sl1e == NULL) )
3279 /* Couldn't get the sl1e! Since we know the guest entries
3280 * are OK, this can only have been caused by a failed
3281 * shadow_set_l*e(), which will have crashed the guest.
3282 * Get out of the fault handler immediately. */
3283 ASSERT(d->is_shutting_down);
3284 shadow_unlock(d);
3285 return 0;
3288 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
3289 /* Always unsync when writing to L1 page tables. */
3290 if ( sh_mfn_is_a_page_table(gmfn)
3291 && ft == ft_demand_write )
3292 sh_unsync(v, gmfn);
3293 #endif /* OOS */
3295 /* Calculate the shadow entry and write it */
3296 l1e_propagate_from_guest(v, gw.l1e, gmfn, &sl1e, ft, p2mt);
3297 r = shadow_set_l1e(v, ptr_sl1e, sl1e, sl1mfn);
3299 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
3300 if ( mfn_valid(gw.l1mfn)
3301 && mfn_is_out_of_sync(gw.l1mfn) )
3303 /* Update the OOS snapshot. */
3304 mfn_t snpmfn = oos_snapshot_lookup(v, gw.l1mfn);
3305 guest_l1e_t *snp;
3307 ASSERT(mfn_valid(snpmfn));
3309 snp = sh_map_domain_page(snpmfn);
3310 snp[guest_l1_table_offset(va)] = gw.l1e;
3311 sh_unmap_domain_page(snp);
3313 #endif /* OOS */
3315 #if SHADOW_OPTIMIZATIONS & SHOPT_PREFETCH
3316 /* Prefetch some more shadow entries */
3317 sh_prefetch(v, &gw, ptr_sl1e, sl1mfn);
3318 #endif
3320 /* Need to emulate accesses to page tables */
3321 if ( sh_mfn_is_a_page_table(gmfn)
3322 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
3323 /* Unless they've been allowed to go out of sync with their
3324 shadows and we don't need to unshadow it. */
3325 && !(mfn_is_out_of_sync(gmfn)
3326 && !(regs->error_code & PFEC_user_mode))
3327 #endif
3330 if ( ft == ft_demand_write )
3332 perfc_incr(shadow_fault_emulate_write);
3333 goto emulate;
3335 else if ( shadow_mode_trap_reads(d) && ft == ft_demand_read )
3337 perfc_incr(shadow_fault_emulate_read);
3338 goto emulate;
3342 /* Need to hand off device-model MMIO to the device model */
3343 if ( p2mt == p2m_mmio_dm )
3345 gpa = guest_walk_to_gpa(&gw);
3346 goto mmio;
3349 /* Log attempts to write to read-only memory */
3350 if ( (p2mt == p2m_ram_ro) && (ft == ft_demand_write) )
3352 static unsigned long lastpage = 0;
3353 if ( xchg(&lastpage, va & PAGE_MASK) != (va & PAGE_MASK) )
3354 gdprintk(XENLOG_DEBUG, "guest attempted write to read-only memory"
3355 " page. va page=%#lx, mfn=%#lx\n",
3356 va & PAGE_MASK, mfn_x(gmfn));
3357 goto emulate_readonly; /* skip over the instruction */
3360 /* In HVM guests, we force CR0.WP always to be set, so that the
3361 * pagetables are always write-protected. If the guest thinks
3362 * CR0.WP is clear, we must emulate faulting supervisor writes to
3363 * allow the guest to write through read-only PTEs. Emulate if the
3364 * fault was a non-user write to a present page. */
3365 if ( is_hvm_domain(d)
3366 && unlikely(!hvm_wp_enabled(v))
3367 && regs->error_code == (PFEC_write_access|PFEC_page_present) )
3369 perfc_incr(shadow_fault_emulate_wp);
3370 goto emulate;
3373 perfc_incr(shadow_fault_fixed);
3374 d->arch.paging.log_dirty.fault_count++;
3375 reset_early_unshadow(v);
3377 done:
3378 sh_audit_gw(v, &gw);
3379 SHADOW_PRINTK("fixed\n");
3380 shadow_audit_tables(v);
3381 shadow_unlock(d);
3382 return EXCRET_fault_fixed;
3384 emulate:
3385 if ( !shadow_mode_refcounts(d) || !guest_mode(regs) )
3386 goto not_a_shadow_fault;
3388 /*
3389 * We do not emulate user writes. Instead we use them as a hint that the
3390 * page is no longer a page table. This behaviour differs from native, but
3391 * it seems very unlikely that any OS grants user access to page tables.
3392 */
3393 if ( (regs->error_code & PFEC_user_mode) )
3395 SHADOW_PRINTK("user-mode fault to PT, unshadowing mfn %#lx\n",
3396 mfn_x(gmfn));
3397 perfc_incr(shadow_fault_emulate_failed);
3398 sh_remove_shadows(v, gmfn, 0 /* thorough */, 1 /* must succeed */);
3399 goto done;
3402 /*
3403 * Write from userspace to ro-mem needs to jump here to avoid getting
3404 * caught by user-mode page-table check above.
3405 */
3406 emulate_readonly:
3407 /*
3408 * We don't need to hold the lock for the whole emulation; we will
3409 * take it again when we write to the pagetables.
3410 */
3411 sh_audit_gw(v, &gw);
3412 shadow_audit_tables(v);
3413 shadow_unlock(d);
3415 #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
3416 early_emulation:
3417 #endif
3418 if ( is_hvm_domain(d) )
3420 /*
3421 * If we are in the middle of injecting an exception or interrupt then
3422 * we should not emulate: it is not the instruction at %eip that caused
3423 * the fault. Furthermore it is almost certainly the case that the handler
3424 * stack is currently considered to be a page table, so we should
3425 * unshadow the faulting page before exiting.
3426 */
3427 if ( unlikely(hvm_event_pending(v)) )
3429 #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
3430 if ( fast_emul )
3432 perfc_incr(shadow_fault_fast_emulate_fail);
3433 v->arch.paging.last_write_emul_ok = 0;
3435 #endif
3436 gdprintk(XENLOG_DEBUG, "write to pagetable during event "
3437 "injection: cr2=%#lx, mfn=%#lx\n",
3438 va, mfn_x(gmfn));
3439 sh_remove_shadows(v, gmfn, 0 /* thorough */, 1 /* must succeed */);
3440 return EXCRET_fault_fixed;
3444 SHADOW_PRINTK("emulate: eip=%#lx esp=%#lx\n",
3445 (unsigned long)regs->eip, (unsigned long)regs->esp);
3447 emul_ops = shadow_init_emulation(&emul_ctxt, regs);
3449 r = x86_emulate(&emul_ctxt.ctxt, emul_ops);
3451 /*
3452 * NB. We do not unshadow on X86EMUL_EXCEPTION. It's not clear that it
3453 * would be a good unshadow hint. If we *do* decide to unshadow-on-fault
3454 * then it must be 'failable': we cannot require the unshadow to succeed.
3455 */
3456 if ( r == X86EMUL_UNHANDLEABLE )
3458 perfc_incr(shadow_fault_emulate_failed);
3459 #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
3460 if ( fast_emul )
3462 perfc_incr(shadow_fault_fast_emulate_fail);
3463 v->arch.paging.last_write_emul_ok = 0;
3465 #endif
3466 SHADOW_PRINTK("emulator failure, unshadowing mfn %#lx\n",
3467 mfn_x(gmfn));
3468 /* If this is actually a page table, then we have a bug, and need
3469 * to support more operations in the emulator. More likely,
3470 * though, this is a hint that this page should not be shadowed. */
3471 shadow_remove_all_shadows(v, gmfn);
3474 #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
3475 /* Record the details of this successful emulation as a heuristic for
3476 * accelerating the next fault on the same frame. But be careful to
3477 * verify that the frame is still a page table: an unshadow triggered
3478 * by write emulation normally requires a re-sync with the guest page
3479 * table to recover r/w permission, and a stale record for such a case
3480 * would cause unexpected extra shadow faults, because propagation
3481 * would be skipped.
3482 */
3483 if ( (r == X86EMUL_OKAY) && sh_mfn_is_a_page_table(gmfn) )
3485 if ( !fast_emul )
3487 v->arch.paging.shadow.last_emulated_frame = va >> PAGE_SHIFT;
3488 v->arch.paging.shadow.last_emulated_mfn = mfn_x(gmfn);
3489 v->arch.paging.last_write_emul_ok = 1;
3492 else if ( fast_emul )
3493 v->arch.paging.last_write_emul_ok = 0;
3494 #endif
3496 #if GUEST_PAGING_LEVELS == 3 /* PAE guest */
3497 if ( r == X86EMUL_OKAY ) {
3498 int i;
3499 /* Emulate up to four extra instructions in the hope of catching
3500 * the "second half" of a 64-bit pagetable write. */
3501 for ( i = 0 ; i < 4 ; i++ )
3503 shadow_continue_emulation(&emul_ctxt, regs);
3504 v->arch.paging.last_write_was_pt = 0;
3505 r = x86_emulate(&emul_ctxt.ctxt, emul_ops);
3506 if ( r == X86EMUL_OKAY )
3508 if ( v->arch.paging.last_write_was_pt )
3510 perfc_incr(shadow_em_ex_pt);
3511 break; /* Don't emulate past the other half of the write */
3513 else
3514 perfc_incr(shadow_em_ex_non_pt);
3516 else
3518 perfc_incr(shadow_em_ex_fail);
3519 break; /* Don't emulate again if we failed! */
3523 #endif /* PAE guest */
3525 SHADOW_PRINTK("emulated\n");
3526 return EXCRET_fault_fixed;
3528 mmio:
3529 if ( !guest_mode(regs) )
3530 goto not_a_shadow_fault;
3531 perfc_incr(shadow_fault_mmio);
3532 sh_audit_gw(v, &gw);
3533 SHADOW_PRINTK("mmio %#"PRIpaddr"\n", gpa);
3534 shadow_audit_tables(v);
3535 reset_early_unshadow(v);
3536 shadow_unlock(d);
3537 return (handle_mmio_with_translation(va, gpa >> PAGE_SHIFT)
3538 ? EXCRET_fault_fixed : 0);
3540 not_a_shadow_fault:
3541 sh_audit_gw(v, &gw);
3542 SHADOW_PRINTK("not a shadow fault\n");
3543 shadow_audit_tables(v);
3544 reset_early_unshadow(v);
3545 shadow_unlock(d);
3546 return 0;
3550 static int
3551 sh_invlpg(struct vcpu *v, unsigned long va)
3552 /* Called when the guest requests an invlpg. Returns 1 if the invlpg
3553 * instruction should be issued on the hardware, or 0 if it's safe not
3554 * to do so. */
3556 mfn_t sl1mfn;
3557 shadow_l2e_t sl2e;
3559 perfc_incr(shadow_invlpg);
3561 #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
3562 /* No longer safe to use cached gva->gfn translations */
3563 vtlb_flush(v);
3564 #endif
3566 #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
3567 v->arch.paging.last_write_emul_ok = 0;
3568 #endif
3570 /* First check that we can safely read the shadow l2e. On SMP/PAE
3571 * linux, as many as 6% of invlpg calls can arrive before we have
3572 * shadowed the l2. */
3573 #if SHADOW_PAGING_LEVELS == 4
3575 shadow_l3e_t sl3e;
3576 if ( !(shadow_l4e_get_flags(
3577 sh_linear_l4_table(v)[shadow_l4_linear_offset(va)])
3578 & _PAGE_PRESENT) )
3579 return 0;
3580 /* This must still be a copy-from-user because we don't have the
3581 * shadow lock, and the higher-level shadows might disappear
3582 * under our feet. */
3583 if ( __copy_from_user(&sl3e, (sh_linear_l3_table(v)
3584 + shadow_l3_linear_offset(va)),
3585 sizeof (sl3e)) != 0 )
3587 perfc_incr(shadow_invlpg_fault);
3588 return 0;
3590 if ( !(shadow_l3e_get_flags(sl3e) & _PAGE_PRESENT) )
3591 return 0;
3593 #else /* SHADOW_PAGING_LEVELS == 3 */
3594 if ( !(l3e_get_flags(v->arch.paging.shadow.l3table[shadow_l3_linear_offset(va)])
3595 & _PAGE_PRESENT) )
3596 // no need to flush anything if there's no SL2...
3597 return 0;
3598 #endif
3600 /* This must still be a copy-from-user because we don't have the shadow
3601 * lock, and the higher-level shadows might disappear under our feet. */
3602 if ( __copy_from_user(&sl2e,
3603 sh_linear_l2_table(v) + shadow_l2_linear_offset(va),
3604 sizeof (sl2e)) != 0 )
3606 perfc_incr(shadow_invlpg_fault);
3607 return 0;
3610 // If there's nothing shadowed for this particular sl2e, then
3611 // there is no need to do an invlpg, either...
3612 //
3613 if ( !(shadow_l2e_get_flags(sl2e) & _PAGE_PRESENT) )
3614 return 0;
3616 // Check to see if the SL2 is a splintered superpage...
3617 // If so, then we'll need to flush the entire TLB (because that's
3618 // easier than invalidating all of the individual 4K pages).
3619 //
3620 sl1mfn = shadow_l2e_get_mfn(sl2e);
3621 if ( mfn_to_shadow_page(sl1mfn)->type
3622 == SH_type_fl1_shadow )
3624 flush_tlb_local();
3625 return 0;
3628 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
3629 /* Check to see if the SL1 is out of sync. */
3631 mfn_t gl1mfn = _mfn(mfn_to_shadow_page(sl1mfn)->backpointer);
3632 struct page_info *pg = mfn_to_page(gl1mfn);
3633 if ( mfn_valid(gl1mfn)
3634 && page_is_out_of_sync(pg) )
3636 /* The test above may give false positives, since we don't
3637 * hold the shadow lock yet. Check again with the lock held. */
3638 shadow_lock(v->domain);
3640 /* This must still be a copy-from-user because we didn't
3641 * have the shadow lock last time we checked, and the
3642 * higher-level shadows might have disappeared under our
3643 * feet. */
3644 if ( __copy_from_user(&sl2e,
3645 sh_linear_l2_table(v)
3646 + shadow_l2_linear_offset(va),
3647 sizeof (sl2e)) != 0 )
3649 perfc_incr(shadow_invlpg_fault);
3650 shadow_unlock(v->domain);
3651 return 0;
3654 if ( !(shadow_l2e_get_flags(sl2e) & _PAGE_PRESENT) )
3656 shadow_unlock(v->domain);
3657 return 0;
3660 sl1mfn = shadow_l2e_get_mfn(sl2e);
3661 gl1mfn = _mfn(mfn_to_shadow_page(sl1mfn)->backpointer);
3662 pg = mfn_to_page(gl1mfn);
3664 if ( likely(sh_mfn_is_a_page_table(gl1mfn)
3665 && page_is_out_of_sync(pg) ) )
3667 shadow_l1e_t *sl1;
3668 sl1 = sh_linear_l1_table(v) + shadow_l1_linear_offset(va);
3669 /* Remove the shadow entry that maps this VA */
3670 (void) shadow_set_l1e(v, sl1, shadow_l1e_empty(), sl1mfn);
3672 shadow_unlock(v->domain);
3673 /* Need the invlpg, to pick up the disappearance of the sl1e */
3674 return 1;
3677 #endif
3679 return 1;
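/* [Editor's sketch, not part of multi.c] The out-of-sync path above peeks at
 * page_is_out_of_sync() without the shadow lock and then re-checks once the
 * lock is held, because the unlocked test can be stale. A minimal standalone
 * rendering of that "optimistic check, confirm under the lock" pattern, using
 * pthreads and C11 atomics; every name below is invented for the example. */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>

static pthread_mutex_t sketch_lock = PTHREAD_MUTEX_INITIALIZER;
static atomic_bool sketch_out_of_sync;  /* stand-in for page_is_out_of_sync() */

static bool sketch_fix_if_out_of_sync(void)
{
    bool did_work = false;

    if ( !atomic_load(&sketch_out_of_sync) )  /* unlocked peek: may be stale */
        return false;

    pthread_mutex_lock(&sketch_lock);
    if ( atomic_load(&sketch_out_of_sync) )   /* re-check under the lock */
    {
        /* ... do the real repair work here ... */
        atomic_store(&sketch_out_of_sync, false);
        did_work = true;
    }
    pthread_mutex_unlock(&sketch_lock);
    return did_work;
}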
3683 static unsigned long
3684 sh_gva_to_gfn(struct vcpu *v, unsigned long va, uint32_t *pfec)
3685 /* Called to translate a guest virtual address to what the *guest*
3686 * pagetables would map it to. */
3688 walk_t gw;
3689 gfn_t gfn;
3691 #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
3692 /* Check the vTLB cache first */
3693 unsigned long vtlb_gfn = vtlb_lookup(v, va, pfec[0]);
3694 if ( VALID_GFN(vtlb_gfn) )
3695 return vtlb_gfn;
3696 #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */
3698 if ( guest_walk_tables(v, va, &gw, pfec[0]) != 0 )
3700 if ( !(guest_l1e_get_flags(gw.l1e) & _PAGE_PRESENT) )
3701 pfec[0] &= ~PFEC_page_present;
3702 return INVALID_GFN;
3704 gfn = guest_walk_to_gfn(&gw);
3706 #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
3707 /* Remember this successful VA->GFN translation for later. */
3708 vtlb_insert(v, va >> PAGE_SHIFT, gfn_x(gfn), pfec[0]);
3709 #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */
3711 return gfn_x(gfn);
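/* [Editor's sketch, not part of multi.c] sh_gva_to_gfn() above consults the
 * per-vcpu virtual TLB (SHOPT_VIRTUAL_TLB) before doing a full guest walk and
 * records successful translations afterwards. A standalone, direct-mapped
 * sketch of such a va->gfn cache; the names and the hit policy below are
 * invented for the example (the real vtlb_lookup()/vtlb_insert() live
 * elsewhere in the shadow code). */
#include <stdint.h>

#define SKETCH_VTLB_ENTRIES 64U          /* any power of two */
#define SKETCH_INVALID_GFN  (~0UL)

struct sketch_vtlb_entry {
    unsigned long vpn;                   /* virtual page number */
    unsigned long gfn;                   /* cached translation */
    uint32_t      pfec;                  /* access rights it was made with */
    int           valid;
};

static struct sketch_vtlb_entry sketch_vtlb[SKETCH_VTLB_ENTRIES];

static void sketch_vtlb_insert(unsigned long vpn, unsigned long gfn,
                               uint32_t pfec)
{
    struct sketch_vtlb_entry *e = &sketch_vtlb[vpn % SKETCH_VTLB_ENTRIES];
    e->vpn = vpn; e->gfn = gfn; e->pfec = pfec; e->valid = 1;
}

static unsigned long sketch_vtlb_lookup(unsigned long vpn, uint32_t pfec)
{
    struct sketch_vtlb_entry *e = &sketch_vtlb[vpn % SKETCH_VTLB_ENTRIES];
    /* One plausible policy: only hit if the cached entry was made with at
     * least the access rights now being asked for. */
    if ( e->valid && e->vpn == vpn && (e->pfec & pfec) == pfec )
        return e->gfn;
    return SKETCH_INVALID_GFN;
}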
3715 static inline void
3716 sh_update_linear_entries(struct vcpu *v)
3717 /* Sync up all the linear mappings for this vcpu's pagetables */
3719 struct domain *d = v->domain;
3721 /* Linear pagetables in PV guests
3722 * ------------------------------
3724 * Guest linear pagetables, which map the guest pages, are at
3725 * LINEAR_PT_VIRT_START. Shadow linear pagetables, which map the
3726 * shadows, are at SH_LINEAR_PT_VIRT_START. Most of the time these
3727 * are set up at shadow creation time, but (of course!) the PAE case
3728 * is subtler. Normal linear mappings are made by having an entry
3729 * in the top-level table that points to itself (shadow linear) or
3730 * to the guest top-level table (guest linear). For PAE, to set up
3731 * a linear map requires us to copy the four top-level entries into
3732 * level-2 entries. That means that every time we change a PAE l3e,
3733 * we need to reflect the change into the copy.
3735 * Linear pagetables in HVM guests
3736 * -------------------------------
3738 * For HVM guests, the linear pagetables are installed in the monitor
3739 * tables (since we can't put them in the shadow). Shadow linear
3740 * pagetables, which map the shadows, are at SH_LINEAR_PT_VIRT_START,
3741 * and we use the linear pagetable slot at LINEAR_PT_VIRT_START for
3742 * a linear pagetable of the monitor tables themselves. We have
3743 * the same issue of having to re-copy PAE l3 entries whenever we use
3744 * PAE shadows.
3746 * Because HVM guests run on the same monitor tables regardless of the
3747 * shadow tables in use, the linear mapping of the shadow tables has to
3748 * be updated every time v->arch.shadow_table changes.
3749 */
3751 /* Don't try to update the monitor table if it doesn't exist */
3752 if ( shadow_mode_external(d)
3753 && pagetable_get_pfn(v->arch.monitor_table) == 0 )
3754 return;
3756 #if (CONFIG_PAGING_LEVELS == 4) && (SHADOW_PAGING_LEVELS == 4)
3758 /* For PV, one l4e points at the guest l4, one points at the shadow
3759 * l4. No maintenance required.
3760 * For HVM, just need to update the l4e that points to the shadow l4. */
3762 if ( shadow_mode_external(d) )
3764 /* Use the linear map if we can; otherwise make a new mapping */
3765 if ( v == current )
3767 __linear_l4_table[l4_linear_offset(SH_LINEAR_PT_VIRT_START)] =
3768 l4e_from_pfn(pagetable_get_pfn(v->arch.shadow_table[0]),
3769 __PAGE_HYPERVISOR);
3771 else
3773 l4_pgentry_t *ml4e;
3774 ml4e = sh_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
3775 ml4e[l4_table_offset(SH_LINEAR_PT_VIRT_START)] =
3776 l4e_from_pfn(pagetable_get_pfn(v->arch.shadow_table[0]),
3777 __PAGE_HYPERVISOR);
3778 sh_unmap_domain_page(ml4e);
3782 #elif (CONFIG_PAGING_LEVELS == 4) && (SHADOW_PAGING_LEVELS == 3)
3784 /* PV: XXX
3786 * HVM: To give ourselves a linear map of the shadows, we need to
3787 * extend a PAE shadow to 4 levels. We do this by having a monitor
3788 * l3 in slot 0 of the monitor l4 table, and copying the PAE l3
3789 * entries into it. Then, by having the monitor l4e for shadow
3790 * pagetables also point to the monitor l4, we can use it to access
3791 * the shadows.
3792 */
3794 if ( shadow_mode_external(d) )
3796 /* Install copies of the shadow l3es into the monitor l2 table
3797 * that maps SH_LINEAR_PT_VIRT_START. */
3798 shadow_l3e_t *sl3e;
3799 l2_pgentry_t *ml2e;
3800 int i;
3802 /* Use linear mappings if we can; otherwise make new mappings */
3803 if ( v == current )
3804 ml2e = __linear_l2_table
3805 + l2_linear_offset(SH_LINEAR_PT_VIRT_START);
3806 else
3808 mfn_t l3mfn, l2mfn;
3809 l4_pgentry_t *ml4e;
3810 l3_pgentry_t *ml3e;
3811 int linear_slot = shadow_l4_table_offset(SH_LINEAR_PT_VIRT_START);
3812 ml4e = sh_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
3814 ASSERT(l4e_get_flags(ml4e[linear_slot]) & _PAGE_PRESENT);
3815 l3mfn = _mfn(l4e_get_pfn(ml4e[linear_slot]));
3816 ml3e = sh_map_domain_page(l3mfn);
3817 sh_unmap_domain_page(ml4e);
3819 ASSERT(l3e_get_flags(ml3e[0]) & _PAGE_PRESENT);
3820 l2mfn = _mfn(l3e_get_pfn(ml3e[0]));
3821 ml2e = sh_map_domain_page(l2mfn);
3822 sh_unmap_domain_page(ml3e);
3825 /* Shadow l3 tables are made up by sh_update_cr3 */
3826 sl3e = v->arch.paging.shadow.l3table;
3828 for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
3830 ml2e[i] =
3831 (shadow_l3e_get_flags(sl3e[i]) & _PAGE_PRESENT)
3832 ? l2e_from_pfn(mfn_x(shadow_l3e_get_mfn(sl3e[i])),
3833 __PAGE_HYPERVISOR)
3834 : l2e_empty();
3837 if ( v != current )
3838 sh_unmap_domain_page(ml2e);
3840 else
3841 domain_crash(d); /* XXX */
3843 #elif CONFIG_PAGING_LEVELS == 3
3845 /* PV: need to copy the guest's l3 entries into the guest-linear-map l2
3846 * entries in the shadow, and the shadow's l3 entries into the
3847 * shadow-linear-map l2 entries in the shadow. This is safe to do
3848 * because Xen does not let guests share high-slot l2 tables between l3s,
3849 * so we know we're not treading on anyone's toes.
3851 * HVM: need to copy the shadow's l3 entries into the
3852 * shadow-linear-map l2 entries in the monitor table. This is safe
3853 * because we have one monitor table for each vcpu. The monitor's
3854 * own l3es don't need to be copied because they never change.
3855 * XXX That might change if we start stuffing things into the rest
3856 * of the monitor's virtual address space.
3857 */
3859 l2_pgentry_t *l2e, new_l2e;
3860 shadow_l3e_t *guest_l3e = NULL, *shadow_l3e;
3861 int i;
3862 int unmap_l2e = 0;
3864 #if GUEST_PAGING_LEVELS == 2
3866 /* Shadow l3 tables were built by sh_update_cr3 */
3867 BUG_ON(!shadow_mode_external(d)); /* PV 2-on-3 is unsupported */
3868 shadow_l3e = (shadow_l3e_t *)&v->arch.paging.shadow.l3table;
3870 #else /* GUEST_PAGING_LEVELS == 3 */
3872 shadow_l3e = (shadow_l3e_t *)&v->arch.paging.shadow.l3table;
3873 guest_l3e = (guest_l3e_t *)&v->arch.paging.shadow.gl3e;
3875 #endif /* GUEST_PAGING_LEVELS */
3877 /* Choose where to write the entries, using linear maps if possible */
3878 if ( shadow_mode_external(d) )
3880 if ( v == current )
3882 /* From the monitor tables, it's safe to use linear maps
3883 * to update monitor l2s */
3884 l2e = __linear_l2_table + (3 * L2_PAGETABLE_ENTRIES);
3886 else
3888 /* Map the monitor table's high l2 */
3889 l3_pgentry_t *l3e;
3890 l3e = sh_map_domain_page(
3891 pagetable_get_mfn(v->arch.monitor_table));
3892 ASSERT(l3e_get_flags(l3e[3]) & _PAGE_PRESENT);
3893 l2e = sh_map_domain_page(_mfn(l3e_get_pfn(l3e[3])));
3894 unmap_l2e = 1;
3895 sh_unmap_domain_page(l3e);
3898 else
3900 /* Map the shadow table's high l2 */
3901 ASSERT(shadow_l3e_get_flags(shadow_l3e[3]) & _PAGE_PRESENT);
3902 l2e = sh_map_domain_page(shadow_l3e_get_mfn(shadow_l3e[3]));
3903 unmap_l2e = 1;
3906 /* Write linear mapping of guest (only in PV, and only when
3907 * not translated). */
3908 if ( !shadow_mode_translate(d) )
3910 for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
3912 new_l2e =
3913 ((shadow_l3e_get_flags(guest_l3e[i]) & _PAGE_PRESENT)
3914 ? l2e_from_pfn(mfn_x(shadow_l3e_get_mfn(guest_l3e[i])),
3915 __PAGE_HYPERVISOR)
3916 : l2e_empty());
3917 safe_write_entry(
3918 &l2e[l2_table_offset(LINEAR_PT_VIRT_START) + i],
3919 &new_l2e);
3923 /* Write linear mapping of shadow. */
3924 for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
3926 new_l2e = (shadow_l3e_get_flags(shadow_l3e[i]) & _PAGE_PRESENT)
3927 ? l2e_from_pfn(mfn_x(shadow_l3e_get_mfn(shadow_l3e[i])),
3928 __PAGE_HYPERVISOR)
3929 : l2e_empty();
3930 safe_write_entry(
3931 &l2e[l2_table_offset(SH_LINEAR_PT_VIRT_START) + i],
3932 &new_l2e);
3935 if ( unmap_l2e )
3936 sh_unmap_domain_page(l2e);
3939 #else
3940 #error this should not happen
3941 #endif
3943 if ( shadow_mode_external(d) )
3945 /*
3946 * Having modified the linear pagetable mapping, flush local host TLBs.
3947 * This was not needed when vmenter/vmexit always had the side effect
3948 * of flushing host TLBs but, with ASIDs, it is possible to finish
3949 * this CR3 update, vmenter the guest, vmexit due to a page fault,
3950 * without an intervening host TLB flush. Then the page fault code
3951 * could use the linear pagetable to read a top-level shadow page
3952 * table entry. But, without this change, it would fetch the wrong
3953 * value due to a stale TLB.
3954 */
3955 flush_tlb_local();
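/* [Editor's sketch, not part of multi.c] The comment at the top of
 * sh_update_linear_entries() describes linear pagetables: point one top-level
 * slot at a top-level table (itself, the guest's, or the monitor's) and every
 * pagetable entry of that hierarchy becomes readable at a fixed virtual
 * address. A standalone version of the 4-level index arithmetic, assuming
 * x86-64 4KB paging with 9 index bits per level and ignoring canonical
 * sign-extension; SKETCH_REC_SLOT is an invented slot number, standing in for
 * constants such as SH_LINEAR_PT_VIRT_START. */
#include <stdint.h>

#define SKETCH_REC_SLOT    256ULL   /* which L4 slot points at the L4 itself */
#define SKETCH_LINEAR_BASE (SKETCH_REC_SLOT << 39)

/* Address, inside the linear map, of the 8-byte L1 entry that maps 'va': one
 * trip through the recursive slot turns the remaining VPN bits into a byte
 * index over the array of L1 entries. */
static uint64_t sketch_linear_l1e_addr(uint64_t va)
{
    return SKETCH_LINEAR_BASE + ((va >> 12) & ((1ULL << 36) - 1)) * 8;
}

/* Two trips through the recursive slot expose the L2 entries instead. */
static uint64_t sketch_linear_l2e_addr(uint64_t va)
{
    return (SKETCH_REC_SLOT << 39) + (SKETCH_REC_SLOT << 30)
           + ((va >> 21) & ((1ULL << 27) - 1)) * 8;
}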
3960 /* Removes vcpu->arch.paging.shadow.guest_vtable and vcpu->arch.shadow_table[].
3961 * Does all appropriate management/bookkeeping/refcounting/etc...
3962 */
3963 static void
3964 sh_detach_old_tables(struct vcpu *v)
3966 mfn_t smfn;
3967 int i = 0;
3969 ////
3970 //// vcpu->arch.paging.shadow.guest_vtable
3971 ////
3973 #if GUEST_PAGING_LEVELS == 3
3974 /* PAE guests don't have a mapping of the guest top-level table */
3975 ASSERT(v->arch.paging.shadow.guest_vtable == NULL);
3976 #else
3977 if ( v->arch.paging.shadow.guest_vtable )
3979 struct domain *d = v->domain;
3980 if ( shadow_mode_external(d) || shadow_mode_translate(d) )
3981 sh_unmap_domain_page_global(v->arch.paging.shadow.guest_vtable);
3982 v->arch.paging.shadow.guest_vtable = NULL;
3984 #endif
3987 ////
3988 //// vcpu->arch.shadow_table[]
3989 ////
3991 #if GUEST_PAGING_LEVELS == 3
3992 /* PAE guests have four shadow_table entries */
3993 for ( i = 0 ; i < 4 ; i++ )
3994 #endif
3996 smfn = pagetable_get_mfn(v->arch.shadow_table[i]);
3997 if ( mfn_x(smfn) )
3998 sh_put_ref(v, smfn, 0);
3999 v->arch.shadow_table[i] = pagetable_null();
4003 /* Set up the top-level shadow and install it in slot 'slot' of shadow_table */
4004 static void
4005 sh_set_toplevel_shadow(struct vcpu *v,
4006 int slot,
4007 mfn_t gmfn,
4008 unsigned int root_type)
4010 mfn_t smfn;
4011 pagetable_t old_entry, new_entry;
4013 struct domain *d = v->domain;
4015 /* Remember the old contents of this slot */
4016 old_entry = v->arch.shadow_table[slot];
4018 /* Now figure out the new contents: is this a valid guest MFN? */
4019 if ( !mfn_valid(gmfn) )
4021 new_entry = pagetable_null();
4022 goto install_new_entry;
4025 /* Guest mfn is valid: shadow it and install the shadow */
4026 smfn = get_shadow_status(v, gmfn, root_type);
4027 if ( !mfn_valid(smfn) )
4029 /* Make sure there's enough free shadow memory. */
4030 shadow_prealloc(d, root_type, 1);
4031 /* Shadow the page. */
4032 smfn = sh_make_shadow(v, gmfn, root_type);
4034 ASSERT(mfn_valid(smfn));
4036 /* Pin the shadow and put it (back) on the list of pinned shadows */
4037 if ( sh_pin(v, smfn) == 0 )
4039 SHADOW_ERROR("can't pin %#lx as toplevel shadow\n", mfn_x(smfn));
4040 domain_crash(v->domain);
4043 /* Take a ref to this page: it will be released in sh_detach_old_tables()
4044 * or the next call to set_toplevel_shadow() */
4045 if ( !sh_get_ref(v, smfn, 0) )
4047 SHADOW_ERROR("can't install %#lx as toplevel shadow\n", mfn_x(smfn));
4048 domain_crash(v->domain);
4051 new_entry = pagetable_from_mfn(smfn);
4053 install_new_entry:
4054 /* Done. Install it */
4055 SHADOW_PRINTK("%u/%u [%u] gmfn %#"PRI_mfn" smfn %#"PRI_mfn"\n",
4056 GUEST_PAGING_LEVELS, SHADOW_PAGING_LEVELS, slot,
4057 mfn_x(gmfn), mfn_x(pagetable_get_mfn(new_entry)));
4058 v->arch.shadow_table[slot] = new_entry;
4060 /* Decrement the refcount of the old contents of this slot */
4061 if ( !pagetable_is_null(old_entry) ) {
4062 mfn_t old_smfn = pagetable_get_mfn(old_entry);
4063 /* Need to repin the old toplevel shadow if it's been unpinned
4064 * by shadow_prealloc(): in PV mode we're still running on this
4065 * shadow and it's not safe to free it yet. */
4066 if ( !mfn_to_shadow_page(old_smfn)->pinned && !sh_pin(v, old_smfn) )
4068 SHADOW_ERROR("can't re-pin %#lx\n", mfn_x(old_smfn));
4069 domain_crash(v->domain);
4071 sh_put_ref(v, old_smfn, 0);
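/* [Editor's sketch, not part of multi.c] sh_set_toplevel_shadow() above takes
 * its reference on the new shadow before installing it in shadow_table[] and
 * only drops the reference on the old contents afterwards (re-pinning the old
 * shadow first if necessary). A standalone, much-simplified rendering of that
 * "acquire new, publish, then release old" ordering for a refcounted object;
 * the names are invented and pinning/error handling are omitted. */
#include <stdlib.h>

struct sketch_obj { unsigned int refcount; };

static void sketch_get(struct sketch_obj *o) { if ( o ) o->refcount++; }
static void sketch_put(struct sketch_obj *o)
{
    if ( o && --o->refcount == 0 )
        free(o);
}

static void sketch_set_slot(struct sketch_obj **slot, struct sketch_obj *new)
{
    struct sketch_obj *old = *slot;
    sketch_get(new);    /* reference held by the slot's new contents */
    *slot = new;        /* publish */
    sketch_put(old);    /* only now is it safe to let the old contents go */
}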
4076 static void
4077 sh_update_cr3(struct vcpu *v, int do_locking)
4078 /* Updates vcpu->arch.cr3 after the guest has changed CR3.
4079 * Paravirtual guests should set v->arch.guest_table (and guest_table_user,
4080 * if appropriate).
4081 * HVM guests should also make sure hvm_get_guest_cntl_reg(v, 3) works;
4082 * this function will call hvm_update_guest_cr(v, 3) to tell them where the
4083 * shadow tables are.
4084 * If do_locking != 0, assume we are being called from outside the
4085 * shadow code, and must take and release the shadow lock; otherwise
4086 * that is the caller's responsibility.
4087 */
4089 struct domain *d = v->domain;
4090 mfn_t gmfn;
4091 #if GUEST_PAGING_LEVELS == 3
4092 guest_l3e_t *gl3e;
4093 u32 guest_idx=0;
4094 int i;
4095 #endif
4097 /* Don't do anything on an uninitialised vcpu */
4098 if ( !is_hvm_domain(d) && !v->is_initialised )
4100 ASSERT(v->arch.cr3 == 0);
4101 return;
4104 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
4105 /* Need to resync all the shadow entries on a TLB flush. Resync
4106 * the current vcpu's OOS pages before switching to the new shadow
4107 * tables so that the VA hint is still valid. */
4108 shadow_resync_current_vcpu(v, do_locking);
4109 #endif
4111 if ( do_locking ) shadow_lock(v->domain);
4113 ASSERT(shadow_locked_by_me(v->domain));
4114 ASSERT(v->arch.paging.mode);
4116 ////
4117 //// vcpu->arch.guest_table is already set
4118 ////
4120 #ifndef NDEBUG
4121 /* Double-check that the HVM code has sent us a sane guest_table */
4122 if ( is_hvm_domain(d) )
4124 ASSERT(shadow_mode_external(d));
4125 if ( hvm_paging_enabled(v) )
4126 ASSERT(pagetable_get_pfn(v->arch.guest_table));
4127 else
4128 ASSERT(v->arch.guest_table.pfn
4129 == d->arch.paging.shadow.unpaged_pagetable.pfn);
4131 #endif
4133 SHADOW_PRINTK("d=%u v=%u guest_table=%05lx\n",
4134 d->domain_id, v->vcpu_id,
4135 (unsigned long)pagetable_get_pfn(v->arch.guest_table));
4137 #if GUEST_PAGING_LEVELS == 4
4138 if ( !(v->arch.flags & TF_kernel_mode) && !is_pv_32on64_vcpu(v) )
4139 gmfn = pagetable_get_mfn(v->arch.guest_table_user);
4140 else
4141 #endif
4142 gmfn = pagetable_get_mfn(v->arch.guest_table);
4145 ////
4146 //// vcpu->arch.paging.shadow.guest_vtable
4147 ////
4148 #if GUEST_PAGING_LEVELS == 4
4149 if ( shadow_mode_external(d) || shadow_mode_translate(d) )
4151 if ( v->arch.paging.shadow.guest_vtable )
4152 sh_unmap_domain_page_global(v->arch.paging.shadow.guest_vtable);
4153 v->arch.paging.shadow.guest_vtable = sh_map_domain_page_global(gmfn);
4154 /* PAGING_LEVELS==4 implies 64-bit, which means that
4155 * map_domain_page_global can't fail */
4156 BUG_ON(v->arch.paging.shadow.guest_vtable == NULL);
4158 else
4159 v->arch.paging.shadow.guest_vtable = __linear_l4_table;
4160 #elif GUEST_PAGING_LEVELS == 3
4161 /* On PAE guests we don't use a mapping of the guest's own top-level
4162 * table. We cache the current state of that table and shadow that,
4163 * until the next CR3 write makes us refresh our cache. */
4164 ASSERT(v->arch.paging.shadow.guest_vtable == NULL);
4166 if ( shadow_mode_external(d) )
4167 /* Find where in the page the l3 table is */
4168 guest_idx = guest_index((void *)v->arch.hvm_vcpu.guest_cr[3]);
4169 else
4170 /* PV guest: l3 is at the start of a page */
4171 guest_idx = 0;
4173 // Ignore the low 2 bits of guest_idx -- they are really just
4174 // cache control.
4175 guest_idx &= ~3;
4177 gl3e = ((guest_l3e_t *)sh_map_domain_page(gmfn)) + guest_idx;
4178 for ( i = 0; i < 4 ; i++ )
4179 v->arch.paging.shadow.gl3e[i] = gl3e[i];
4180 sh_unmap_domain_page(gl3e);
4181 #elif GUEST_PAGING_LEVELS == 2
4182 if ( shadow_mode_external(d) || shadow_mode_translate(d) )
4184 if ( v->arch.paging.shadow.guest_vtable )
4185 sh_unmap_domain_page_global(v->arch.paging.shadow.guest_vtable);
4186 v->arch.paging.shadow.guest_vtable = sh_map_domain_page_global(gmfn);
4187 /* Does this really need map_domain_page_global? Handle the
4188 * error properly if so. */
4189 BUG_ON(v->arch.paging.shadow.guest_vtable == NULL); /* XXX */
4191 else
4192 v->arch.paging.shadow.guest_vtable = __linear_l2_table;
4193 #else
4194 #error this should never happen
4195 #endif
4198 ////
4199 //// vcpu->arch.shadow_table[]
4200 ////
4202 /* We revoke write access to the new guest toplevel page(s) before we
4203 * replace the old shadow pagetable(s), so that we can safely use the
4204 * (old) shadow linear maps in the writeable mapping heuristics. */
4205 #if GUEST_PAGING_LEVELS == 2
4206 if ( sh_remove_write_access(v, gmfn, 2, 0) != 0 )
4207 flush_tlb_mask(v->domain->domain_dirty_cpumask);
4208 sh_set_toplevel_shadow(v, 0, gmfn, SH_type_l2_shadow);
4209 #elif GUEST_PAGING_LEVELS == 3
4210 /* PAE guests have four shadow_table entries, based on the
4211 * current values of the guest's four l3es. */
4213 int flush = 0;
4214 gfn_t gl2gfn;
4215 mfn_t gl2mfn;
4216 p2m_type_t p2mt;
4217 guest_l3e_t *gl3e = (guest_l3e_t*)&v->arch.paging.shadow.gl3e;
4218 /* First, make all four entries read-only. */
4219 for ( i = 0; i < 4; i++ )
4221 if ( guest_l3e_get_flags(gl3e[i]) & _PAGE_PRESENT )
4223 gl2gfn = guest_l3e_get_gfn(gl3e[i]);
4224 gl2mfn = gfn_to_mfn(d, gl2gfn, &p2mt);
4225 if ( p2m_is_ram(p2mt) )
4226 flush |= sh_remove_write_access(v, gl2mfn, 2, 0);
4229 if ( flush )
4230 flush_tlb_mask(v->domain->domain_dirty_cpumask);
4231 /* Now install the new shadows. */
4232 for ( i = 0; i < 4; i++ )
4234 if ( guest_l3e_get_flags(gl3e[i]) & _PAGE_PRESENT )
4236 gl2gfn = guest_l3e_get_gfn(gl3e[i]);
4237 gl2mfn = gfn_to_mfn(d, gl2gfn, &p2mt);
4238 if ( p2m_is_ram(p2mt) )
4239 sh_set_toplevel_shadow(v, i, gl2mfn, (i == 3)
4240 ? SH_type_l2h_shadow
4241 : SH_type_l2_shadow);
4242 else
4243 sh_set_toplevel_shadow(v, i, _mfn(INVALID_MFN), 0);
4245 else
4246 sh_set_toplevel_shadow(v, i, _mfn(INVALID_MFN), 0);
4249 #elif GUEST_PAGING_LEVELS == 4
4250 if ( sh_remove_write_access(v, gmfn, 4, 0) != 0 )
4251 flush_tlb_mask(v->domain->domain_dirty_cpumask);
4252 sh_set_toplevel_shadow(v, 0, gmfn, SH_type_l4_shadow);
4253 #else
4254 #error This should never happen
4255 #endif
4258 ///
4259 /// v->arch.paging.shadow.l3table
4260 ///
4261 #if SHADOW_PAGING_LEVELS == 3
4263 mfn_t smfn;
4264 int i;
4265 for ( i = 0; i < 4; i++ )
4267 #if GUEST_PAGING_LEVELS == 2
4268 /* 2-on-3: make a PAE l3 that points at the four-page l2 */
4269 smfn = _mfn(pagetable_get_pfn(v->arch.shadow_table[0]) + i);
4270 #else
4271 /* 3-on-3: make a PAE l3 that points at the four l2 pages */
4272 smfn = pagetable_get_mfn(v->arch.shadow_table[i]);
4273 #endif
4274 v->arch.paging.shadow.l3table[i] =
4275 (mfn_x(smfn) == 0)
4276 ? shadow_l3e_empty()
4277 : shadow_l3e_from_mfn(smfn, _PAGE_PRESENT);
4280 #endif /* SHADOW_PAGING_LEVELS == 3 */
4283 ///
4284 /// v->arch.cr3
4285 ///
4286 if ( shadow_mode_external(d) )
4288 make_cr3(v, pagetable_get_pfn(v->arch.monitor_table));
4290 else // not shadow_mode_external...
4292 /* We don't support PV except guest == shadow == config levels */
4293 BUG_ON(GUEST_PAGING_LEVELS != SHADOW_PAGING_LEVELS);
4294 #if SHADOW_PAGING_LEVELS == 3
4295 /* 2-on-3 or 3-on-3: Use the PAE shadow l3 table we just fabricated.
4296 * Don't use make_cr3 because (a) we know it's below 4GB, and
4297 * (b) it's not necessarily page-aligned, and make_cr3 takes a pfn */
4298 ASSERT(virt_to_maddr(&v->arch.paging.shadow.l3table) <= 0xffffffe0ULL);
4299 v->arch.cr3 = virt_to_maddr(&v->arch.paging.shadow.l3table);
4300 #else
4301 /* 4-on-4: Just use the shadow top-level directly */
4302 make_cr3(v, pagetable_get_pfn(v->arch.shadow_table[0]));
4303 #endif
4307 ///
4308 /// v->arch.hvm_vcpu.hw_cr[3]
4309 ///
4310 if ( shadow_mode_external(d) )
4312 ASSERT(is_hvm_domain(d));
4313 #if SHADOW_PAGING_LEVELS == 3
4314 /* 2-on-3 or 3-on-3: Use the PAE shadow l3 table we just fabricated */
4315 v->arch.hvm_vcpu.hw_cr[3] =
4316 virt_to_maddr(&v->arch.paging.shadow.l3table);
4317 #else
4318 /* 4-on-4: Just use the shadow top-level directly */
4319 v->arch.hvm_vcpu.hw_cr[3] =
4320 pagetable_get_paddr(v->arch.shadow_table[0]);
4321 #endif
4322 hvm_update_guest_cr(v, 3);
4325 /* Fix up the linear pagetable mappings */
4326 sh_update_linear_entries(v);
4328 #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
4329 /* No longer safe to use cached gva->gfn translations */
4330 vtlb_flush(v);
4331 #endif
4333 #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
4334 v->arch.paging.last_write_emul_ok = 0;
4335 #endif
4337 /* Release the lock, if we took it (otherwise it's the caller's problem) */
4338 if ( do_locking ) shadow_unlock(v->domain);
4340 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
4341 /* Need to resync all the shadow entries on a TLB flush. We only
4342 * update the shadows, leaving the pages out of sync. Also, we try
4343 * to skip synchronization of shadows not mapped in the new
4344 * tables. */
4345 shadow_sync_other_vcpus(v, do_locking);
4346 #endif
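/* [Editor's sketch, not part of multi.c] For SHADOW_PAGING_LEVELS == 3,
 * sh_update_cr3() above fabricates a private four-entry PAE top level whose
 * entries carry only a frame address and the present bit. A standalone sketch
 * of that fabrication step; the names and constants are invented, and the
 * caller is assumed to have already chosen the four frame numbers (which, for
 * 2-on-3, would be consecutive pages of one four-page l2 shadow). */
#include <stdint.h>

#define SKETCH_PAGE_SHIFT 12
#define SKETCH_PRESENT    0x1ULL

static void sketch_build_pae_l3(uint64_t l3[4], const unsigned long mfn[4])
{
    for ( int i = 0; i < 4; i++ )
        l3[i] = mfn[i]
            ? (((uint64_t)mfn[i] << SKETCH_PAGE_SHIFT) | SKETCH_PRESENT)
            : 0;   /* empty entry when there is no shadow for this slot */
}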
4351 /**************************************************************************/
4352 /* Functions to revoke guest rights */
4354 #if SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC
4355 int sh_rm_write_access_from_sl1p(struct vcpu *v, mfn_t gmfn,
4356 mfn_t smfn, unsigned long off)
4358 int r;
4359 shadow_l1e_t *sl1p, sl1e;
4360 struct shadow_page_info *sp;
4362 ASSERT(mfn_valid(gmfn));
4363 ASSERT(mfn_valid(smfn));
4365 sp = mfn_to_shadow_page(smfn);
4367 if ( sp->mbz != 0
4368 || (sp->type != SH_type_l1_shadow) )
4369 goto fail;
4371 sl1p = sh_map_domain_page(smfn);
4372 sl1p += off;
4373 sl1e = *sl1p;
4374 if ( ((shadow_l1e_get_flags(sl1e) & (_PAGE_PRESENT|_PAGE_RW))
4375 != (_PAGE_PRESENT|_PAGE_RW))
4376 || (mfn_x(shadow_l1e_get_mfn(sl1e)) != mfn_x(gmfn)) )
4378 sh_unmap_domain_page(sl1p);
4379 goto fail;
4382 /* Found it! Need to remove its write permissions. */
4383 sl1e = shadow_l1e_remove_flags(sl1e, _PAGE_RW);
4384 r = shadow_set_l1e(v, sl1p, sl1e, smfn);
4385 ASSERT( !(r & SHADOW_SET_ERROR) );
4387 sh_unmap_domain_page(sl1p);
4388 perfc_incr(shadow_writeable_h_7);
4389 return 1;
4391 fail:
4392 perfc_incr(shadow_writeable_h_8);
4393 return 0;
4395 #endif /* OOS */
4397 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
4398 static int sh_guess_wrmap(struct vcpu *v, unsigned long vaddr, mfn_t gmfn)
4399 /* Look up this vaddr in the current shadow and see if it's a writeable
4400 * mapping of this gmfn. If so, remove it. Returns 1 if it worked. */
4402 shadow_l1e_t sl1e, *sl1p;
4403 shadow_l2e_t *sl2p;
4404 shadow_l3e_t *sl3p;
4405 #if SHADOW_PAGING_LEVELS >= 4
4406 shadow_l4e_t *sl4p;
4407 #endif
4408 mfn_t sl1mfn;
4409 int r;
4411 /* Carefully look in the shadow linear map for the l1e we expect */
4412 #if SHADOW_PAGING_LEVELS >= 4
4413 sl4p = sh_linear_l4_table(v) + shadow_l4_linear_offset(vaddr);
4414 if ( !(shadow_l4e_get_flags(*sl4p) & _PAGE_PRESENT) )
4415 return 0;
4416 sl3p = sh_linear_l3_table(v) + shadow_l3_linear_offset(vaddr);
4417 if ( !(shadow_l3e_get_flags(*sl3p) & _PAGE_PRESENT) )
4418 return 0;
4419 #else /* SHADOW_PAGING_LEVELS == 3 */
4420 sl3p = ((shadow_l3e_t *) v->arch.paging.shadow.l3table)
4421 + shadow_l3_linear_offset(vaddr);
4422 if ( !(shadow_l3e_get_flags(*sl3p) & _PAGE_PRESENT) )
4423 return 0;
4424 #endif
4425 sl2p = sh_linear_l2_table(v) + shadow_l2_linear_offset(vaddr);
4426 if ( !(shadow_l2e_get_flags(*sl2p) & _PAGE_PRESENT) )
4427 return 0;
4428 sl1p = sh_linear_l1_table(v) + shadow_l1_linear_offset(vaddr);
4429 sl1e = *sl1p;
4430 if ( ((shadow_l1e_get_flags(sl1e) & (_PAGE_PRESENT|_PAGE_RW))
4431 != (_PAGE_PRESENT|_PAGE_RW))
4432 || (mfn_x(shadow_l1e_get_mfn(sl1e)) != mfn_x(gmfn)) )
4433 return 0;
4435 /* Found it! Need to remove its write permissions. */
4436 sl1mfn = shadow_l2e_get_mfn(*sl2p);
4437 sl1e = shadow_l1e_remove_flags(sl1e, _PAGE_RW);
4438 r = shadow_set_l1e(v, sl1p, sl1e, sl1mfn);
4439 ASSERT( !(r & SHADOW_SET_ERROR) );
4440 return 1;
4442 #endif
4444 int sh_rm_write_access_from_l1(struct vcpu *v, mfn_t sl1mfn,
4445 mfn_t readonly_mfn)
4446 /* Excises all writeable mappings to readonly_mfn from this l1 shadow table */
4448 shadow_l1e_t *sl1e;
4449 int done = 0;
4450 int flags;
4451 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
4452 mfn_t base_sl1mfn = sl1mfn; /* Because sl1mfn changes in the foreach */
4453 #endif
4455 SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, done,
4457 flags = shadow_l1e_get_flags(*sl1e);
4458 if ( (flags & _PAGE_PRESENT)
4459 && (flags & _PAGE_RW)
4460 && (mfn_x(shadow_l1e_get_mfn(*sl1e)) == mfn_x(readonly_mfn)) )
4462 shadow_l1e_t ro_sl1e = shadow_l1e_remove_flags(*sl1e, _PAGE_RW);
4463 (void) shadow_set_l1e(v, sl1e, ro_sl1e, sl1mfn);
4464 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
4465 /* Remember the last shadow that we shot a writeable mapping in */
4466 v->arch.paging.shadow.last_writeable_pte_smfn = mfn_x(base_sl1mfn);
4467 #endif
4468 if ( (mfn_to_page(readonly_mfn)->u.inuse.type_info
4469 & PGT_count_mask) == 0 )
4470 /* This breaks us cleanly out of the FOREACH macro */
4471 done = 1;
4473 });
4474 return done;
4478 int sh_rm_mappings_from_l1(struct vcpu *v, mfn_t sl1mfn, mfn_t target_mfn)
4479 /* Excises all mappings to guest frame from this shadow l1 table */
4481 shadow_l1e_t *sl1e;
4482 int done = 0;
4483 int flags;
4485 SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, done,
4487 flags = shadow_l1e_get_flags(*sl1e);
4488 if ( (flags & _PAGE_PRESENT)
4489 && (mfn_x(shadow_l1e_get_mfn(*sl1e)) == mfn_x(target_mfn)) )
4491 (void) shadow_set_l1e(v, sl1e, shadow_l1e_empty(), sl1mfn);
4492 if ( (mfn_to_page(target_mfn)->count_info & PGC_count_mask) == 0 )
4493 /* This breaks us cleanly out of the FOREACH macro */
4494 done = 1;
4496 });
4497 return done;
4500 /**************************************************************************/
4501 /* Functions to excise all pointers to shadows from higher-level shadows. */
4503 void sh_clear_shadow_entry(struct vcpu *v, void *ep, mfn_t smfn)
4504 /* Blank out a single shadow entry */
4506 switch ( mfn_to_shadow_page(smfn)->type )
4508 case SH_type_l1_shadow:
4509 (void) shadow_set_l1e(v, ep, shadow_l1e_empty(), smfn); break;
4510 case SH_type_l2_shadow:
4511 #if GUEST_PAGING_LEVELS >= 3
4512 case SH_type_l2h_shadow:
4513 #endif
4514 (void) shadow_set_l2e(v, ep, shadow_l2e_empty(), smfn); break;
4515 #if GUEST_PAGING_LEVELS >= 4
4516 case SH_type_l3_shadow:
4517 (void) shadow_set_l3e(v, ep, shadow_l3e_empty(), smfn); break;
4518 case SH_type_l4_shadow:
4519 (void) shadow_set_l4e(v, ep, shadow_l4e_empty(), smfn); break;
4520 #endif
4521 default: BUG(); /* Called with the wrong kind of shadow. */
4525 int sh_remove_l1_shadow(struct vcpu *v, mfn_t sl2mfn, mfn_t sl1mfn)
4526 /* Remove all mappings of this l1 shadow from this l2 shadow */
4528 shadow_l2e_t *sl2e;
4529 int done = 0;
4530 int flags;
4532 SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, done, v->domain,
4534 flags = shadow_l2e_get_flags(*sl2e);
4535 if ( (flags & _PAGE_PRESENT)
4536 && (mfn_x(shadow_l2e_get_mfn(*sl2e)) == mfn_x(sl1mfn)) )
4538 (void) shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn);
4539 if ( mfn_to_shadow_page(sl1mfn)->type == 0 )
4540 /* This breaks us cleanly out of the FOREACH macro */
4541 done = 1;
4543 });
4544 return done;
4547 #if GUEST_PAGING_LEVELS >= 4
4548 int sh_remove_l2_shadow(struct vcpu *v, mfn_t sl3mfn, mfn_t sl2mfn)
4549 /* Remove all mappings of this l2 shadow from this l3 shadow */
4551 shadow_l3e_t *sl3e;
4552 int done = 0;
4553 int flags;
4555 SHADOW_FOREACH_L3E(sl3mfn, sl3e, 0, done,
4557 flags = shadow_l3e_get_flags(*sl3e);
4558 if ( (flags & _PAGE_PRESENT)
4559 && (mfn_x(shadow_l3e_get_mfn(*sl3e)) == mfn_x(sl2mfn)) )
4561 (void) shadow_set_l3e(v, sl3e, shadow_l3e_empty(), sl3mfn);
4562 if ( mfn_to_shadow_page(sl2mfn)->type == 0 )
4563 /* This breaks us cleanly out of the FOREACH macro */
4564 done = 1;
4566 });
4567 return done;
4570 int sh_remove_l3_shadow(struct vcpu *v, mfn_t sl4mfn, mfn_t sl3mfn)
4571 /* Remove all mappings of this l3 shadow from this l4 shadow */
4573 shadow_l4e_t *sl4e;
4574 int done = 0;
4575 int flags;
4577 SHADOW_FOREACH_L4E(sl4mfn, sl4e, 0, done, v->domain,
4579 flags = shadow_l4e_get_flags(*sl4e);
4580 if ( (flags & _PAGE_PRESENT)
4581 && (mfn_x(shadow_l4e_get_mfn(*sl4e)) == mfn_x(sl3mfn)) )
4583 (void) shadow_set_l4e(v, sl4e, shadow_l4e_empty(), sl4mfn);
4584 if ( mfn_to_shadow_page(sl3mfn)->type == 0 )
4585 /* This breaks us cleanly out of the FOREACH macro */
4586 done = 1;
4588 });
4589 return done;
4591 #endif /* 64bit guest */
4593 /**************************************************************************/
4594 /* Handling HVM guest writes to pagetables */
4596 /* Translate a VA to an MFN, injecting a page-fault if we fail */
4597 #define BAD_GVA_TO_GFN (~0UL)
4598 #define BAD_GFN_TO_MFN (~1UL)
4599 #define READONLY_GFN (~2UL)
4600 static mfn_t emulate_gva_to_mfn(struct vcpu *v,
4601 unsigned long vaddr,
4602 struct sh_emulate_ctxt *sh_ctxt)
4604 unsigned long gfn;
4605 mfn_t mfn;
4606 p2m_type_t p2mt;
4607 uint32_t pfec = PFEC_page_present | PFEC_write_access;
4609 /* Translate the VA to a GFN */
4610 gfn = sh_gva_to_gfn(v, vaddr, &pfec);
4611 if ( gfn == INVALID_GFN )
4613 if ( is_hvm_vcpu(v) )
4614 hvm_inject_exception(TRAP_page_fault, pfec, vaddr);
4615 else
4616 propagate_page_fault(vaddr, pfec);
4617 return _mfn(BAD_GVA_TO_GFN);
4620 /* Translate the GFN to an MFN */
4621 mfn = gfn_to_mfn(v->domain, _gfn(gfn), &p2mt);
4622 if ( p2mt == p2m_ram_ro )
4623 return _mfn(READONLY_GFN);
4624 if ( !p2m_is_ram(p2mt) )
4625 return _mfn(BAD_GFN_TO_MFN);
4627 ASSERT(mfn_valid(mfn));
4628 v->arch.paging.last_write_was_pt = !!sh_mfn_is_a_page_table(mfn);
4629 return mfn;
4632 /* Check that the user is allowed to perform this write.
4633 * Returns a mapped pointer to write to, or NULL for error. */
4634 #define MAPPING_UNHANDLEABLE ((void *)(unsigned long)X86EMUL_UNHANDLEABLE)
4635 #define MAPPING_EXCEPTION ((void *)(unsigned long)X86EMUL_EXCEPTION)
4636 #define MAPPING_SILENT_FAIL ((void *)(unsigned long)X86EMUL_OKAY)
4637 #define emulate_map_dest_failed(rc) ((unsigned long)(rc) <= 3)
4638 static void *emulate_map_dest(struct vcpu *v,
4639 unsigned long vaddr,
4640 u32 bytes,
4641 struct sh_emulate_ctxt *sh_ctxt)
4643 unsigned long offset;
4644 void *map = NULL;
4646 sh_ctxt->mfn1 = emulate_gva_to_mfn(v, vaddr, sh_ctxt);
4647 if ( !mfn_valid(sh_ctxt->mfn1) )
4648 return ((mfn_x(sh_ctxt->mfn1) == BAD_GVA_TO_GFN) ?
4649 MAPPING_EXCEPTION :
4650 (mfn_x(sh_ctxt->mfn1) == READONLY_GFN) ?
4651 MAPPING_SILENT_FAIL : MAPPING_UNHANDLEABLE);
4653 #ifndef NDEBUG
4654 /* We don't emulate user-mode writes to page tables */
4655 if ( hvm_get_seg_reg(x86_seg_ss, sh_ctxt)->attr.fields.dpl == 3 )
4657 gdprintk(XENLOG_DEBUG, "User-mode write to pagetable reached "
4658 "emulate_map_dest(). This should never happen!\n");
4659 return MAPPING_UNHANDLEABLE;
4661 #endif
4663 /* Unaligned writes probably mean this isn't a pagetable */
4664 if ( vaddr & (bytes - 1) )
4665 sh_remove_shadows(v, sh_ctxt->mfn1, 0, 0 /* Slow, can fail */ );
4667 if ( likely(((vaddr + bytes - 1) & PAGE_MASK) == (vaddr & PAGE_MASK)) )
4669 /* Whole write fits on a single page */
4670 sh_ctxt->mfn2 = _mfn(INVALID_MFN);
4671 map = sh_map_domain_page(sh_ctxt->mfn1) + (vaddr & ~PAGE_MASK);
4673 else
4675 /* Cross-page emulated writes are only supported for HVM guests;
4676 * PV guests ought to know better */
4677 if ( !is_hvm_vcpu(v) )
4678 return MAPPING_UNHANDLEABLE;
4680 /* This write crosses a page boundary. Translate the second page */
4681 sh_ctxt->mfn2 = emulate_gva_to_mfn(v, (vaddr + bytes - 1) & PAGE_MASK,
4682 sh_ctxt);
4683 if ( !mfn_valid(sh_ctxt->mfn2) )
4684 return ((mfn_x(sh_ctxt->mfn2) == BAD_GVA_TO_GFN) ?
4685 MAPPING_EXCEPTION :
4686 (mfn_x(sh_ctxt->mfn2) == READONLY_GFN) ?
4687 MAPPING_SILENT_FAIL : MAPPING_UNHANDLEABLE);
4689 /* A cross-page write probably means this isn't a pagetable */
4690 sh_remove_shadows(v, sh_ctxt->mfn2, 0, 0 /* Slow, can fail */ );
4692 /* Hack: we map the pages into the vcpu's LDT space, since we
4693 * know that we're not going to need the LDT for HVM guests,
4694 * and only HVM guests are allowed unaligned writes. */
4695 ASSERT(is_hvm_vcpu(v));
4696 map = (void *)LDT_VIRT_START(v);
4697 offset = l1_linear_offset((unsigned long) map);
4698 l1e_write(&__linear_l1_table[offset],
4699 l1e_from_pfn(mfn_x(sh_ctxt->mfn1), __PAGE_HYPERVISOR));
4700 l1e_write(&__linear_l1_table[offset + 1],
4701 l1e_from_pfn(mfn_x(sh_ctxt->mfn2), __PAGE_HYPERVISOR));
4702 flush_tlb_local();
4703 map += (vaddr & ~PAGE_MASK);
4706 #if (SHADOW_OPTIMIZATIONS & SHOPT_SKIP_VERIFY)
4707 /* Remember if the bottom bit was clear, so we can choose not to run
4708 * the change through the verify code if it's still clear afterwards */
4709 sh_ctxt->low_bit_was_clear = map != NULL && !(*(u8 *)map & _PAGE_PRESENT);
4710 #endif
4712 return map;
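/* [Editor's sketch, not part of multi.c] emulate_map_dest() above returns
 * either a usable mapping or one of the MAPPING_* sentinels, which encode a
 * small emulator return code in the pointer value itself; the caller tests
 * for failure with emulate_map_dest_failed(), i.e. "pointer <= largest code".
 * A standalone rendering of that pattern with invented names and codes (the
 * real X86EMUL_* values are not reproduced here). */
#include <stdio.h>

#define SKETCH_ERR_UNHANDLEABLE ((void *)1UL)
#define SKETCH_ERR_EXCEPTION    ((void *)2UL)
#define SKETCH_ERR_SILENT       ((void *)3UL)
#define sketch_map_failed(p)    ((unsigned long)(p) <= 3)

static char sketch_buffer[16];

static void *sketch_map(int ok)
{
    /* A real buffer can never sit at addresses 0..3, so small integers are
     * safe to use as in-band error codes. */
    return ok ? (void *)sketch_buffer : SKETCH_ERR_EXCEPTION;
}

static void sketch_use(void)
{
    void *m = sketch_map(0);
    if ( sketch_map_failed(m) )
        printf("map failed, code %lu\n", (unsigned long)m);
}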
4715 /* Tidy up after the emulated write: mark pages dirty, verify the new
4716 * contents, and undo the mapping */
4717 static void emulate_unmap_dest(struct vcpu *v,
4718 void *addr,
4719 u32 bytes,
4720 struct sh_emulate_ctxt *sh_ctxt)
4722 u32 b1 = bytes, b2 = 0, shflags;
4724 ASSERT(mfn_valid(sh_ctxt->mfn1));
4726 /* If we are writing lots of PTE-aligned zeros, might want to unshadow */
4727 if ( likely(bytes >= 4)
4728 && (*(u32 *)addr == 0)
4729 && ((unsigned long) addr & ((sizeof (guest_intpte_t)) - 1)) == 0 )
4730 check_for_early_unshadow(v, sh_ctxt->mfn1);
4731 else
4732 reset_early_unshadow(v);
4734 /* We can avoid re-verifying the page contents after the write if:
4735 * - it was no larger than the PTE type of this pagetable;
4736 * - it was aligned to the PTE boundaries; and
4737 * - _PAGE_PRESENT was clear before and after the write. */
4738 shflags = mfn_to_page(sh_ctxt->mfn1)->shadow_flags;
4739 #if (SHADOW_OPTIMIZATIONS & SHOPT_SKIP_VERIFY)
4740 if ( sh_ctxt->low_bit_was_clear
4741 && !(*(u8 *)addr & _PAGE_PRESENT)
4742 && ((!(shflags & SHF_32)
4743 /* Not shadowed 32-bit: aligned 64-bit writes that leave
4744 * the present bit unset are safe to ignore. */
4745 && ((unsigned long)addr & 7) == 0
4746 && bytes <= 8)
4747 ||
4748 (!(shflags & (SHF_PAE|SHF_64))
4749 /* Not shadowed PAE/64-bit: aligned 32-bit writes that
4750 * leave the present bit unset are safe to ignore. */
4751 && ((unsigned long)addr & 3) == 0
4752 && bytes <= 4)) )
4754 /* Writes with this alignment constraint can't possibly cross pages */
4755 ASSERT(!mfn_valid(sh_ctxt->mfn2));
4757 else
4758 #endif /* SHADOW_OPTIMIZATIONS & SHOPT_SKIP_VERIFY */
4760 if ( unlikely(mfn_valid(sh_ctxt->mfn2)) )
4762 /* Validate as two writes, one to each page */
4763 b1 = PAGE_SIZE - (((unsigned long)addr) & ~PAGE_MASK);
4764 b2 = bytes - b1;
4765 ASSERT(b2 < bytes);
4767 if ( likely(b1 > 0) )
4768 sh_validate_guest_pt_write(v, sh_ctxt->mfn1, addr, b1);
4769 if ( unlikely(b2 > 0) )
4770 sh_validate_guest_pt_write(v, sh_ctxt->mfn2, addr + b1, b2);
4773 paging_mark_dirty(v->domain, mfn_x(sh_ctxt->mfn1));
4775 if ( unlikely(mfn_valid(sh_ctxt->mfn2)) )
4777 unsigned long offset;
4778 paging_mark_dirty(v->domain, mfn_x(sh_ctxt->mfn2));
4779 /* Undo the hacky two-frame contiguous map. */
4780 ASSERT(((unsigned long) addr & PAGE_MASK) == LDT_VIRT_START(v));
4781 offset = l1_linear_offset((unsigned long) addr);
4782 l1e_write(&__linear_l1_table[offset], l1e_empty());
4783 l1e_write(&__linear_l1_table[offset + 1], l1e_empty());
4784 flush_tlb_all();
4786 else
4787 sh_unmap_domain_page(addr);
4789 atomic_inc(&v->domain->arch.paging.shadow.gtable_dirty_version);
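/* [Editor's sketch, not part of multi.c] When a write crosses a page
 * boundary, emulate_unmap_dest() above validates it as two pieces: b1 bytes
 * on the first page and b2 on the second. A standalone version of just that
 * split (constants and names invented for the example). */
#define SKETCH_PAGE_SIZE 4096UL
#define SKETCH_PAGE_MASK (~(SKETCH_PAGE_SIZE - 1))

static void sketch_split_write(unsigned long addr, unsigned long bytes,
                               unsigned long *b1, unsigned long *b2)
{
    /* Bytes remaining on the page that 'addr' starts on */
    *b1 = SKETCH_PAGE_SIZE - (addr & ~SKETCH_PAGE_MASK);
    if ( *b1 >= bytes )
    {
        *b1 = bytes;   /* the write fits on a single page */
        *b2 = 0;
    }
    else
        *b2 = bytes - *b1;
}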
4792 static int
4793 sh_x86_emulate_write(struct vcpu *v, unsigned long vaddr, void *src,
4794 u32 bytes, struct sh_emulate_ctxt *sh_ctxt)
4796 void *addr;
4798 /* Unaligned writes are only acceptable on HVM */
4799 if ( (vaddr & (bytes - 1)) && !is_hvm_vcpu(v) )
4800 return X86EMUL_UNHANDLEABLE;
4802 addr = emulate_map_dest(v, vaddr, bytes, sh_ctxt);
4803 if ( emulate_map_dest_failed(addr) )
4804 return (long)addr;
4806 shadow_lock(v->domain);
4807 memcpy(addr, src, bytes);
4809 emulate_unmap_dest(v, addr, bytes, sh_ctxt);
4810 shadow_audit_tables(v);
4811 shadow_unlock(v->domain);
4812 return X86EMUL_OKAY;
4815 static int
4816 sh_x86_emulate_cmpxchg(struct vcpu *v, unsigned long vaddr,
4817 unsigned long old, unsigned long new,
4818 unsigned int bytes, struct sh_emulate_ctxt *sh_ctxt)
4820 void *addr;
4821 unsigned long prev;
4822 int rv = X86EMUL_OKAY;
4824 /* Unaligned writes are only acceptable on HVM */
4825 if ( (vaddr & (bytes - 1)) && !is_hvm_vcpu(v) )
4826 return X86EMUL_UNHANDLEABLE;
4828 addr = emulate_map_dest(v, vaddr, bytes, sh_ctxt);
4829 if ( emulate_map_dest_failed(addr) )
4830 return (long)addr;
4832 shadow_lock(v->domain);
4833 switch ( bytes )
4835 case 1: prev = cmpxchg(((u8 *)addr), old, new); break;
4836 case 2: prev = cmpxchg(((u16 *)addr), old, new); break;
4837 case 4: prev = cmpxchg(((u32 *)addr), old, new); break;
4838 case 8: prev = cmpxchg(((u64 *)addr), old, new); break;
4839 default:
4840 SHADOW_PRINTK("cmpxchg of size %i is not supported\n", bytes);
4841 prev = ~old;
4844 if ( prev != old )
4845 rv = X86EMUL_CMPXCHG_FAILED;
4847 SHADOW_DEBUG(EMULATE, "va %#lx was %#lx expected %#lx"
4848 " wanted %#lx now %#lx bytes %u\n",
4849 vaddr, prev, old, new, *(unsigned long *)addr, bytes);
4851 emulate_unmap_dest(v, addr, bytes, sh_ctxt);
4852 shadow_audit_tables(v);
4853 shadow_unlock(v->domain);
4854 return rv;
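/* [Editor's sketch, not part of multi.c] sh_x86_emulate_cmpxchg() above
 * dispatches on the operand size and reports a failure when the value it
 * observed differs from the expected one. A standalone rendering of that
 * shape using the GCC/Clang __sync builtins (an assumption of this sketch;
 * the hypervisor uses its own cmpxchg() macro). Return codes 0/1/2 are
 * invented stand-ins for the X86EMUL_* values, and a 64-bit build is assumed
 * for the 8-byte case. */
#include <stdint.h>

static int sketch_emulate_cmpxchg(void *addr, unsigned long old,
                                  unsigned long new, unsigned int bytes)
{
    unsigned long prev;

    switch ( bytes )
    {
    case 1: prev = __sync_val_compare_and_swap((uint8_t  *)addr, old, new); break;
    case 2: prev = __sync_val_compare_and_swap((uint16_t *)addr, old, new); break;
    case 4: prev = __sync_val_compare_and_swap((uint32_t *)addr, old, new); break;
    case 8: prev = __sync_val_compare_and_swap((uint64_t *)addr, old, new); break;
    default: return 2;                 /* unsupported operand size */
    }

    return (prev != old) ? 1 : 0;      /* 1 == compare failed, 0 == success */
}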
4857 #ifdef __i386__
4858 static int
4859 sh_x86_emulate_cmpxchg8b(struct vcpu *v, unsigned long vaddr,
4860 unsigned long old_lo, unsigned long old_hi,
4861 unsigned long new_lo, unsigned long new_hi,
4862 struct sh_emulate_ctxt *sh_ctxt)
4864 void *addr;
4865 u64 old, new, prev;
4866 int rv = X86EMUL_OKAY;
4868 /* Unaligned writes are only acceptable on HVM */
4869 if ( (vaddr & 7) && !is_hvm_vcpu(v) )
4870 return X86EMUL_UNHANDLEABLE;
4872 addr = emulate_map_dest(v, vaddr, 8, sh_ctxt);
4873 if ( emulate_map_dest_failed(addr) )
4874 return (long)addr;
4876 old = (((u64) old_hi) << 32) | (u64) old_lo;
4877 new = (((u64) new_hi) << 32) | (u64) new_lo;
4879 shadow_lock(v->domain);
4880 prev = cmpxchg(((u64 *)addr), old, new);
4882 if ( prev != old )
4883 rv = X86EMUL_CMPXCHG_FAILED;
4885 emulate_unmap_dest(v, addr, 8, sh_ctxt);
4886 shadow_audit_tables(v);
4887 shadow_unlock(v->domain);
4888 return rv;
4890 #endif
4892 /**************************************************************************/
4893 /* Audit tools */
4895 #if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES
4897 #define AUDIT_FAIL(_level, _fmt, _a...) do { \
4898 printk("Shadow %u-on-%u audit failed at level %i, index %i\n" \
4899 "gl" #_level "mfn = %" PRI_mfn \
4900 " sl" #_level "mfn = %" PRI_mfn \
4901 " &gl" #_level "e = %p &sl" #_level "e = %p" \
4902 " gl" #_level "e = %" SH_PRI_gpte \
4903 " sl" #_level "e = %" SH_PRI_pte "\nError: " _fmt "\n", \
4904 GUEST_PAGING_LEVELS, SHADOW_PAGING_LEVELS, \
4905 _level, guest_index(gl ## _level ## e), \
4906 mfn_x(gl ## _level ## mfn), mfn_x(sl ## _level ## mfn), \
4907 gl ## _level ## e, sl ## _level ## e, \
4908 gl ## _level ## e->l ## _level, sl ## _level ## e->l ## _level, \
4909 ##_a); \
4910 BUG(); \
4911 done = 1; \
4912 } while (0)
4914 #define AUDIT_FAIL_MIN(_level, _fmt, _a...) do { \
4915 printk("Shadow %u-on-%u audit failed at level %i\n" \
4916 "gl" #_level "mfn = %" PRI_mfn \
4917 " sl" #_level "mfn = %" PRI_mfn \
4918 " Error: " _fmt "\n", \
4919 GUEST_PAGING_LEVELS, SHADOW_PAGING_LEVELS, \
4920 _level, \
4921 mfn_x(gl ## _level ## mfn), mfn_x(sl ## _level ## mfn), \
4922 ##_a); \
4923 BUG(); \
4924 done = 1; \
4925 } while (0)
4927 static char * sh_audit_flags(struct vcpu *v, int level,
4928 int gflags, int sflags)
4929 /* Common code for auditing flag bits */
4931 if ( (sflags & _PAGE_PRESENT) && !(gflags & _PAGE_PRESENT) )
4932 return "shadow is present but guest is not present";
4933 if ( (sflags & _PAGE_GLOBAL) && !is_hvm_vcpu(v) )
4934 return "global bit set in PV shadow";
4935 if ( level == 2 && (sflags & _PAGE_PSE) )
4936 return "PS bit set in shadow";
4937 #if SHADOW_PAGING_LEVELS == 3
4938 if ( level == 3 ) return NULL; /* All the other bits are blank in PAEl3 */
4939 #endif
4940 if ( (sflags & _PAGE_PRESENT) && !(gflags & _PAGE_ACCESSED) )
4941 return "accessed bit not propagated";
4942 if ( (level == 1 || (level == 2 && (gflags & _PAGE_PSE)))
4943 && ((sflags & _PAGE_RW) && !(gflags & _PAGE_DIRTY)) )
4944 return "dirty bit not propagated";
4945 if ( (sflags & _PAGE_USER) != (gflags & _PAGE_USER) )
4946 return "user/supervisor bit does not match";
4947 if ( (sflags & _PAGE_NX_BIT) != (gflags & _PAGE_NX_BIT) )
4948 return "NX bit does not match";
4949 if ( (sflags & _PAGE_RW) && !(gflags & _PAGE_RW) )
4950 return "shadow grants write access but guest does not";
4951 return NULL;
4954 int sh_audit_l1_table(struct vcpu *v, mfn_t sl1mfn, mfn_t x)
4956 guest_l1e_t *gl1e, *gp;
4957 shadow_l1e_t *sl1e;
4958 mfn_t mfn, gmfn, gl1mfn;
4959 gfn_t gfn;
4960 p2m_type_t p2mt;
4961 char *s;
4962 int done = 0;
4964 /* Follow the backpointer */
4965 gl1mfn = _mfn(mfn_to_shadow_page(sl1mfn)->backpointer);
4967 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
4968 /* Out-of-sync l1 shadows can contain anything: just check the OOS hash */
4969 if ( page_is_out_of_sync(mfn_to_page(gl1mfn)) )
4971 oos_audit_hash_is_present(v->domain, gl1mfn);
4972 return 0;
4974 #endif
4976 gl1e = gp = sh_map_domain_page(gl1mfn);
4977 SHADOW_FOREACH_L1E(sl1mfn, sl1e, &gl1e, done, {
4979 if ( sh_l1e_is_magic(*sl1e) )
4981 #if (SHADOW_OPTIMIZATIONS & SHOPT_FAST_FAULT_PATH)
4982 if ( sh_l1e_is_gnp(*sl1e) )
4984 if ( guest_l1e_get_flags(*gl1e) & _PAGE_PRESENT )
4985 AUDIT_FAIL(1, "shadow is GNP magic but guest is present");
4987 else
4989 ASSERT(sh_l1e_is_mmio(*sl1e));
4990 gfn = sh_l1e_mmio_get_gfn(*sl1e);
4991 if ( gfn_x(gfn) != gfn_x(guest_l1e_get_gfn(*gl1e)) )
4992 AUDIT_FAIL(1, "shadow MMIO gfn is %" SH_PRI_gfn
4993 " but guest gfn is %" SH_PRI_gfn,
4994 gfn_x(gfn),
4995 gfn_x(guest_l1e_get_gfn(*gl1e)));
4997 #endif
4999 else
5001 s = sh_audit_flags(v, 1, guest_l1e_get_flags(*gl1e),
5002 shadow_l1e_get_flags(*sl1e));
5003 if ( s ) AUDIT_FAIL(1, "%s", s);
5005 if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS )
5007 gfn = guest_l1e_get_gfn(*gl1e);
5008 mfn = shadow_l1e_get_mfn(*sl1e);
5009 gmfn = gfn_to_mfn(v->domain, gfn, &p2mt);
5010 if ( mfn_x(gmfn) != mfn_x(mfn) )
5011 AUDIT_FAIL(1, "bad translation: gfn %" SH_PRI_gfn
5012 " --> %" PRI_mfn " != mfn %" PRI_mfn,
5013 gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn));
5016 });
5017 sh_unmap_domain_page(gp);
5018 return done;
5021 int sh_audit_fl1_table(struct vcpu *v, mfn_t sl1mfn, mfn_t x)
5023 guest_l1e_t *gl1e, e;
5024 shadow_l1e_t *sl1e;
5025 mfn_t gl1mfn = _mfn(INVALID_MFN);
5026 int f;
5027 int done = 0;
5029 /* fl1 has no useful backpointer: all we can check are flags */
5030 e = guest_l1e_from_gfn(_gfn(0), 0); gl1e = &e; /* Needed for macro */
5031 SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, done, {
5032 f = shadow_l1e_get_flags(*sl1e);
5033 f &= ~(_PAGE_AVAIL0|_PAGE_AVAIL1|_PAGE_AVAIL2);
5034 if ( !(f == 0
5035 || f == (_PAGE_PRESENT|_PAGE_USER|_PAGE_RW|
5036 _PAGE_ACCESSED|_PAGE_DIRTY)
5037 || f == (_PAGE_PRESENT|_PAGE_USER|_PAGE_ACCESSED|_PAGE_DIRTY)
5038 || sh_l1e_is_magic(*sl1e)) )
5039 AUDIT_FAIL(1, "fl1e has bad flags");
5040 });
5041 return 0;
5044 int sh_audit_l2_table(struct vcpu *v, mfn_t sl2mfn, mfn_t x)
5046 guest_l2e_t *gl2e, *gp;
5047 shadow_l2e_t *sl2e;
5048 mfn_t mfn, gmfn, gl2mfn;
5049 gfn_t gfn;
5050 p2m_type_t p2mt;
5051 char *s;
5052 int done = 0;
5054 /* Follow the backpointer */
5055 gl2mfn = _mfn(mfn_to_shadow_page(sl2mfn)->backpointer);
5057 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
5058 /* Only L1's may be out of sync. */
5059 if ( page_is_out_of_sync(mfn_to_page(gl2mfn)) )
5060 AUDIT_FAIL_MIN(2, "gmfn %lx is out of sync", mfn_x(gl2mfn));
5061 #endif
5063 gl2e = gp = sh_map_domain_page(gl2mfn);
5064 SHADOW_FOREACH_L2E(sl2mfn, sl2e, &gl2e, done, v->domain, {
5066 s = sh_audit_flags(v, 2, guest_l2e_get_flags(*gl2e),
5067 shadow_l2e_get_flags(*sl2e));
5068 if ( s ) AUDIT_FAIL(2, "%s", s);
5070 if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS )
5072 gfn = guest_l2e_get_gfn(*gl2e);
5073 mfn = shadow_l2e_get_mfn(*sl2e);
5074 gmfn = (guest_l2e_get_flags(*gl2e) & _PAGE_PSE)
5075 ? get_fl1_shadow_status(v, gfn)
5076 : get_shadow_status(v, gfn_to_mfn(v->domain, gfn, &p2mt),
5077 SH_type_l1_shadow);
5078 if ( mfn_x(gmfn) != mfn_x(mfn) )
5079 AUDIT_FAIL(2, "bad translation: gfn %" SH_PRI_gfn
5080 " (--> %" PRI_mfn ")"
5081 " --> %" PRI_mfn " != mfn %" PRI_mfn,
5082 gfn_x(gfn),
5083 (guest_l2e_get_flags(*gl2e) & _PAGE_PSE) ? 0
5084 : mfn_x(gfn_to_mfn(v->domain, gfn, &p2mt)),
5085 mfn_x(gmfn), mfn_x(mfn));
5087 });
5088 sh_unmap_domain_page(gp);
5089 return 0;
5092 #if GUEST_PAGING_LEVELS >= 4
5093 int sh_audit_l3_table(struct vcpu *v, mfn_t sl3mfn, mfn_t x)
5095 guest_l3e_t *gl3e, *gp;
5096 shadow_l3e_t *sl3e;
5097 mfn_t mfn, gmfn, gl3mfn;
5098 gfn_t gfn;
5099 p2m_type_t p2mt;
5100 char *s;
5101 int done = 0;
5103 /* Follow the backpointer */
5104 gl3mfn = _mfn(mfn_to_shadow_page(sl3mfn)->backpointer);
5106 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
5107 /* Only L1's may be out of sync. */
5108 if ( page_is_out_of_sync(mfn_to_page(gl3mfn)) )
5109 AUDIT_FAIL_MIN(3, "gmfn %lx is out of sync", mfn_x(gl3mfn));
5110 #endif
5112 gl3e = gp = sh_map_domain_page(gl3mfn);
5113 SHADOW_FOREACH_L3E(sl3mfn, sl3e, &gl3e, done, {
5115 s = sh_audit_flags(v, 3, guest_l3e_get_flags(*gl3e),
5116 shadow_l3e_get_flags(*sl3e));
5117 if ( s ) AUDIT_FAIL(3, "%s", s);
5119 if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS )
5121 gfn = guest_l3e_get_gfn(*gl3e);
5122 mfn = shadow_l3e_get_mfn(*sl3e);
5123 gmfn = get_shadow_status(v, gfn_to_mfn(v->domain, gfn, &p2mt),
5124 ((GUEST_PAGING_LEVELS == 3 ||
5125 is_pv_32on64_vcpu(v))
5126 && !shadow_mode_external(v->domain)
5127 && (guest_index(gl3e) % 4) == 3)
5128 ? SH_type_l2h_shadow
5129 : SH_type_l2_shadow);
5130 if ( mfn_x(gmfn) != mfn_x(mfn) )
5131 AUDIT_FAIL(3, "bad translation: gfn %" SH_PRI_gfn
5132 " --> %" PRI_mfn " != mfn %" PRI_mfn,
5133 gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn));
5135 });
5136 sh_unmap_domain_page(gp);
5137 return 0;
5140 int sh_audit_l4_table(struct vcpu *v, mfn_t sl4mfn, mfn_t x)
5142 guest_l4e_t *gl4e, *gp;
5143 shadow_l4e_t *sl4e;
5144 mfn_t mfn, gmfn, gl4mfn;
5145 gfn_t gfn;
5146 p2m_type_t p2mt;
5147 char *s;
5148 int done = 0;
5150 /* Follow the backpointer */
5151 gl4mfn = _mfn(mfn_to_shadow_page(sl4mfn)->backpointer);
5153 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
5154 /* Only L1's may be out of sync. */
5155 if ( page_is_out_of_sync(mfn_to_page(gl4mfn)) )
5156 AUDIT_FAIL_MIN(4, "gmfn %lx is out of sync", mfn_x(gl4mfn));
5157 #endif
5159 gl4e = gp = sh_map_domain_page(gl4mfn);
5160 SHADOW_FOREACH_L4E(sl4mfn, sl4e, &gl4e, done, v->domain,
5162 s = sh_audit_flags(v, 4, guest_l4e_get_flags(*gl4e),
5163 shadow_l4e_get_flags(*sl4e));
5164 if ( s ) AUDIT_FAIL(4, "%s", s);
5166 if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS )
5168 gfn = guest_l4e_get_gfn(*gl4e);
5169 mfn = shadow_l4e_get_mfn(*sl4e);
5170 gmfn = get_shadow_status(v, gfn_to_mfn(v->domain, gfn, &p2mt),
5171 SH_type_l3_shadow);
5172 if ( mfn_x(gmfn) != mfn_x(mfn) )
5173 AUDIT_FAIL(4, "bad translation: gfn %" SH_PRI_gfn
5174 " --> %" PRI_mfn " != mfn %" PRI_mfn,
5175 gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn));
5177 });
5178 sh_unmap_domain_page(gp);
5179 return 0;
5181 #endif /* GUEST_PAGING_LEVELS >= 4 */
5184 #undef AUDIT_FAIL
5186 #endif /* Audit code */
5188 /**************************************************************************/
5189 /* Entry points into this mode of the shadow code.
5190 * This will all be mangled by the preprocessor to uniquify everything. */
5191 struct paging_mode sh_paging_mode = {
5192 .page_fault = sh_page_fault,
5193 .invlpg = sh_invlpg,
5194 .gva_to_gfn = sh_gva_to_gfn,
5195 .update_cr3 = sh_update_cr3,
5196 .update_paging_modes = shadow_update_paging_modes,
5197 .write_p2m_entry = shadow_write_p2m_entry,
5198 .write_guest_entry = shadow_write_guest_entry,
5199 .cmpxchg_guest_entry = shadow_cmpxchg_guest_entry,
5200 .guest_map_l1e = sh_guest_map_l1e,
5201 .guest_get_eff_l1e = sh_guest_get_eff_l1e,
5202 .guest_levels = GUEST_PAGING_LEVELS,
5203 .shadow.detach_old_tables = sh_detach_old_tables,
5204 .shadow.x86_emulate_write = sh_x86_emulate_write,
5205 .shadow.x86_emulate_cmpxchg = sh_x86_emulate_cmpxchg,
5206 #ifdef __i386__
5207 .shadow.x86_emulate_cmpxchg8b = sh_x86_emulate_cmpxchg8b,
5208 #endif
5209 .shadow.make_monitor_table = sh_make_monitor_table,
5210 .shadow.destroy_monitor_table = sh_destroy_monitor_table,
5211 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
5212 .shadow.guess_wrmap = sh_guess_wrmap,
5213 #endif
5214 .shadow.shadow_levels = SHADOW_PAGING_LEVELS,
5215 };
5217 /*
5218 * Local variables:
5219 * mode: C
5220 * c-set-style: "BSD"
5221 * c-basic-offset: 4
5222 * indent-tabs-mode: nil
5223 * End:
5224 */