ia64/xen-unstable

view xen/arch/x86/mm/shadow/multi.c @ 17571:b6aa55ca599e

shadow: track video RAM dirty bits

This adds a new HVM op that enables tracking dirty bits of a range of
video RAM. The idea is to optimize just for the most common case
(only one guest mapping, with occasional temporary extra mappings),
which keeps the overhead on the shadow code as low as possible.
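
As a rough sketch of how a device model might use the op (hedged: the op
name HVMOP_track_dirty_vram and the libxc wrapper xc_hvm_track_dirty_vram
below are assumptions about the public interface added alongside this
change, not something defined in this file):

    /* Sketch only: once per frame, fetch the bitmap of VRAM pages that
     * were dirtied since the previous call, and redraw just those pages.
     * The wrapper name and argument order are assumed. */
    #include <string.h>
    #include <xenctrl.h>

    static void refresh_vram(int xc_handle, domid_t dom,
                             uint64_t first_pfn, uint64_t nr)
    {
        unsigned char bitmap[(nr + 7) / 8];

        memset(bitmap, 0, sizeof bitmap);
        if ( xc_hvm_track_dirty_vram(xc_handle, dom, first_pfn, nr, bitmap) )
            return;                  /* tracking unavailable: full redraw */

        for ( uint64_t i = 0; i < nr; i++ )
            if ( bitmap[i / 8] & (1u << (i % 8)) )
                /* redraw the 4KiB page at guest pfn first_pfn + i */;
    }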

Signed-off-by: Samuel Thibault <samuel.thibault@eu.citrix.com>
author Keir Fraser <keir.fraser@citrix.com>
date Fri May 02 15:08:27 2008 +0100 (2008-05-02)
parents bc7ee2f93852
children bb49aeae1ff2
line source
1 /******************************************************************************
2 * arch/x86/mm/shadow/multi.c
3 *
4 * Simple, mostly-synchronous shadow page tables.
5 * Parts of this code are Copyright (c) 2006 by XenSource Inc.
6 * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
7 * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 */
24 #include <xen/config.h>
25 #include <xen/types.h>
26 #include <xen/mm.h>
27 #include <xen/trace.h>
28 #include <xen/sched.h>
29 #include <xen/perfc.h>
30 #include <xen/domain_page.h>
31 #include <asm/page.h>
32 #include <asm/current.h>
33 #include <asm/shadow.h>
34 #include <asm/flushtlb.h>
35 #include <asm/hvm/hvm.h>
36 #include <asm/hvm/cacheattr.h>
37 #include <asm/mtrr.h>
38 #include "private.h"
39 #include "types.h"
41 /* THINGS TO DO LATER:
42 *
43 * TEARDOWN HEURISTICS
44 * Also: have a heuristic for when to destroy a previous paging-mode's
45 * shadows. When a guest is done with its start-of-day 32-bit tables
46 * and reuses the memory we want to drop those shadows. Start with
47 * shadows in a page in two modes as a hint, but beware of clever tricks
48 * like reusing a pagetable for both PAE and 64-bit during boot...
49 *
50 * PAE LINEAR MAPS
51 * Rework shadow_get_l*e() to have the option of using map_domain_page()
52 * instead of linear maps. Add appropriate unmap_l*e calls in the users.
53 * Then we can test the speed difference made by linear maps. If the
54 * map_domain_page() version is OK on PAE, we could maybe allow a lightweight
55 * l3-and-l2h-only shadow mode for PAE PV guests that would allow them
56 * to share l2h pages again.
57 *
58 * PSE disabled / PSE36
59 * We don't support any modes other than PSE enabled, PSE36 disabled.
60 * Neither of those would be hard to change, but we'd need to be able to
61 * deal with shadows made in one mode and used in another.
62 */
64 #define FETCH_TYPE_PREFETCH 1
65 #define FETCH_TYPE_DEMAND 2
66 #define FETCH_TYPE_WRITE 4
67 typedef enum {
68 ft_prefetch = FETCH_TYPE_PREFETCH,
69 ft_demand_read = FETCH_TYPE_DEMAND,
70 ft_demand_write = FETCH_TYPE_DEMAND | FETCH_TYPE_WRITE,
71 } fetch_type_t;
73 #ifdef DEBUG_TRACE_DUMP
74 static char *fetch_type_names[] = {
75 [ft_prefetch] "prefetch",
76 [ft_demand_read] "demand read",
77 [ft_demand_write] "demand write",
78 };
79 #endif
81 /**************************************************************************/
82 /* Hash table mapping from guest pagetables to shadows
83 *
84 * Normal case: maps the mfn of a guest page to the mfn of its shadow page.
85 * FL1's: maps the *gfn* of the start of a superpage to the mfn of a
86 * shadow L1 which maps its "splinters".
87 */
89 static inline mfn_t
90 get_fl1_shadow_status(struct vcpu *v, gfn_t gfn)
91 /* Look for FL1 shadows in the hash table */
92 {
93 mfn_t smfn = shadow_hash_lookup(v, gfn_x(gfn), SH_type_fl1_shadow);
94 return smfn;
95 }
97 static inline mfn_t
98 get_shadow_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type)
99 /* Look for shadows in the hash table */
100 {
101 mfn_t smfn = shadow_hash_lookup(v, mfn_x(gmfn), shadow_type);
102 perfc_incr(shadow_get_shadow_status);
103 return smfn;
104 }
106 static inline void
107 set_fl1_shadow_status(struct vcpu *v, gfn_t gfn, mfn_t smfn)
108 /* Put an FL1 shadow into the hash table */
109 {
110 SHADOW_PRINTK("gfn=%"SH_PRI_gfn", type=%08x, smfn=%05lx\n",
111 gfn_x(gfn), SH_type_fl1_shadow, mfn_x(smfn));
113 shadow_hash_insert(v, gfn_x(gfn), SH_type_fl1_shadow, smfn);
114 }
116 static inline void
117 set_shadow_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type, mfn_t smfn)
118 /* Put a shadow into the hash table */
119 {
120 struct domain *d = v->domain;
121 int res;
123 SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx, type=%08x, smfn=%05lx\n",
124 d->domain_id, v->vcpu_id, mfn_x(gmfn),
125 shadow_type, mfn_x(smfn));
127 /* 32-on-64 PV guests don't own their l4 pages so can't get_page them */
128 if ( !is_pv_32on64_vcpu(v) || shadow_type != SH_type_l4_64_shadow )
129 {
130 res = get_page(mfn_to_page(gmfn), d);
131 ASSERT(res == 1);
132 }
134 shadow_hash_insert(v, mfn_x(gmfn), shadow_type, smfn);
135 }
137 static inline void
138 delete_fl1_shadow_status(struct vcpu *v, gfn_t gfn, mfn_t smfn)
139 /* Remove a shadow from the hash table */
140 {
141 SHADOW_PRINTK("gfn=%"SH_PRI_gfn", type=%08x, smfn=%05lx\n",
142 gfn_x(gfn), SH_type_fl1_shadow, mfn_x(smfn));
143 shadow_hash_delete(v, gfn_x(gfn), SH_type_fl1_shadow, smfn);
144 }
146 static inline void
147 delete_shadow_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type, mfn_t smfn)
148 /* Remove a shadow from the hash table */
149 {
150 SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx, type=%08x, smfn=%05lx\n",
151 v->domain->domain_id, v->vcpu_id,
152 mfn_x(gmfn), shadow_type, mfn_x(smfn));
153 shadow_hash_delete(v, mfn_x(gmfn), shadow_type, smfn);
154 /* 32-on-64 PV guests don't own their l4 pages; see set_shadow_status */
155 if ( !is_pv_32on64_vcpu(v) || shadow_type != SH_type_l4_64_shadow )
156 put_page(mfn_to_page(gmfn));
157 }
159 /**************************************************************************/
160 /* CPU feature support querying */
162 static inline int
163 guest_supports_superpages(struct vcpu *v)
164 {
165 /* The _PAGE_PSE bit must be honoured in HVM guests, whenever
166 * CR4.PSE is set or the guest is in PAE or long mode.
167 * It's also used in the dummy PT for vcpus with CR0.PG cleared. */
168 return (is_hvm_vcpu(v) &&
169 (GUEST_PAGING_LEVELS != 2
170 || !hvm_paging_enabled(v)
171 || (v->arch.hvm_vcpu.guest_cr[4] & X86_CR4_PSE)));
172 }
174 static inline int
175 guest_supports_nx(struct vcpu *v)
176 {
177 if ( GUEST_PAGING_LEVELS == 2 || !cpu_has_nx )
178 return 0;
179 if ( !is_hvm_vcpu(v) )
180 return cpu_has_nx;
181 return hvm_nx_enabled(v);
182 }
185 /**************************************************************************/
186 /* Functions for walking the guest page tables */
188 /* Flags that are needed in a pagetable entry, with the sense of NX inverted */
189 static uint32_t mandatory_flags(struct vcpu *v, uint32_t pfec)
190 {
191 static uint32_t flags[] = {
192 /* I/F - Usr Wr */
193 /* 0 0 0 0 */ _PAGE_PRESENT,
194 /* 0 0 0 1 */ _PAGE_PRESENT|_PAGE_RW,
195 /* 0 0 1 0 */ _PAGE_PRESENT|_PAGE_USER,
196 /* 0 0 1 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER,
197 /* 0 1 0 0 */ _PAGE_PRESENT,
198 /* 0 1 0 1 */ _PAGE_PRESENT|_PAGE_RW,
199 /* 0 1 1 0 */ _PAGE_PRESENT|_PAGE_USER,
200 /* 0 1 1 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER,
201 /* 1 0 0 0 */ _PAGE_PRESENT|_PAGE_NX_BIT,
202 /* 1 0 0 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_NX_BIT,
203 /* 1 0 1 0 */ _PAGE_PRESENT|_PAGE_USER|_PAGE_NX_BIT,
204 /* 1 0 1 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX_BIT,
205 /* 1 1 0 0 */ _PAGE_PRESENT|_PAGE_NX_BIT,
206 /* 1 1 0 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_NX_BIT,
207 /* 1 1 1 0 */ _PAGE_PRESENT|_PAGE_USER|_PAGE_NX_BIT,
208 /* 1 1 1 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX_BIT,
209 };
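/* The table above is indexed by (pfec & 0x1f) >> 1, i.e. by the Write,
 * User, Reserved-bit and Instruction-fetch bits of the fault code. For
 * example, a user-mode write fault (PFEC_write_access|PFEC_user_mode)
 * selects row 0011 and so demands _PAGE_PRESENT|_PAGE_RW|_PAGE_USER at
 * every level of the walk. */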
211 /* Don't demand not-NX if the CPU wouldn't enforce it. */
212 if ( !guest_supports_nx(v) )
213 pfec &= ~PFEC_insn_fetch;
215 /* Don't demand R/W if the CPU wouldn't enforce it. */
216 if ( is_hvm_vcpu(v) && unlikely(!hvm_wp_enabled(v))
217 && !(pfec & PFEC_user_mode) )
218 pfec &= ~PFEC_write_access;
220 return flags[(pfec & 0x1f) >> 1];
221 }
223 /* Modify a guest pagetable entry to set the Accessed and Dirty bits.
224 * Returns non-zero if it actually writes to guest memory. */
225 static uint32_t set_ad_bits(void *guest_p, void *walk_p, int set_dirty)
226 {
227 guest_intpte_t old, new;
229 old = *(guest_intpte_t *)walk_p;
230 new = old | _PAGE_ACCESSED | (set_dirty ? _PAGE_DIRTY : 0);
231 if ( old != new )
232 {
233 /* Write the new entry into the walk, and try to write it back
234 * into the guest table as well. If the guest table has changed
235 * under our feet then leave it alone. */
236 *(guest_intpte_t *)walk_p = new;
237 if ( cmpxchg(((guest_intpte_t *)guest_p), old, new) == old )
238 return 1;
239 }
240 return 0;
241 }
243 /* This validation is called with the shadow lock held and after write
244 * permission has been removed. The check is therefore atomic, and no
245 * inconsistent content can be observed before the lock is released.
246 *
247 * Return 1 to indicate success and 0 for inconsistency
248 */
249 static inline uint32_t
250 shadow_check_gwalk(struct vcpu *v, unsigned long va, walk_t *gw)
251 {
252 struct domain *d = v->domain;
253 guest_l1e_t *l1p;
254 guest_l2e_t *l2p;
255 #if GUEST_PAGING_LEVELS >= 4
256 guest_l3e_t *l3p;
257 guest_l4e_t *l4p;
258 #endif
259 int mismatch = 0;
261 ASSERT(shadow_locked_by_me(d));
263 if ( gw->version ==
264 atomic_read(&d->arch.paging.shadow.gtable_dirty_version) )
265 return 1;
267 /* We could consider caching the guest page mappings from the last
268 * guest table walk. However, since this check happens relatively
269 * infrequently, the small cost of remapping the guest pages here
270 * is better than caching the mappings on every guest table
271 * walk.
272 *
273 * Also, when an inconsistency is found, we simply return to trigger
274 * another fault rather than re-validating the new path, which
275 * keeps the logic simple.
276 */
277 perfc_incr(shadow_check_gwalk);
278 #if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
279 #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
280 l4p = (guest_l4e_t *)v->arch.paging.shadow.guest_vtable;
281 mismatch |= (gw->l4e.l4 != l4p[guest_l4_table_offset(va)].l4);
282 l3p = sh_map_domain_page(gw->l3mfn);
283 mismatch |= (gw->l3e.l3 != l3p[guest_l3_table_offset(va)].l3);
284 sh_unmap_domain_page(l3p);
285 #else
286 mismatch |= (gw->l3e.l3 !=
287 v->arch.paging.shadow.gl3e[guest_l3_table_offset(va)].l3);
288 #endif
289 l2p = sh_map_domain_page(gw->l2mfn);
290 mismatch |= (gw->l2e.l2 != l2p[guest_l2_table_offset(va)].l2);
291 sh_unmap_domain_page(l2p);
292 #else
293 l2p = (guest_l2e_t *)v->arch.paging.shadow.guest_vtable;
294 mismatch |= (gw->l2e.l2 != l2p[guest_l2_table_offset(va)].l2);
295 #endif
296 if ( !(guest_supports_superpages(v) &&
297 (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE)) )
298 {
299 l1p = sh_map_domain_page(gw->l1mfn);
300 mismatch |= (gw->l1e.l1 != l1p[guest_l1_table_offset(va)].l1);
301 sh_unmap_domain_page(l1p);
302 }
304 return !mismatch;
305 }
307 /* Remove write access permissions from a gwalk_t in a batch, and
308 * return OR-ed result for TLB flush hint
309 */
310 static inline uint32_t
311 gw_remove_write_accesses(struct vcpu *v, unsigned long va, walk_t *gw)
312 {
313 int rc = 0;
315 #if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
316 #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
317 rc = sh_remove_write_access(v, gw->l3mfn, 3, va);
318 #endif
319 rc |= sh_remove_write_access(v, gw->l2mfn, 2, va);
320 #endif
321 if ( !(guest_supports_superpages(v) &&
322 (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE)) )
323 rc |= sh_remove_write_access(v, gw->l1mfn, 1, va);
325 return rc;
326 }
328 /* Walk the guest pagetables, after the manner of a hardware walker.
329 *
330 * Inputs: a vcpu, a virtual address, a walk_t to fill, a
331 * pointer to a pagefault code
332 *
333 * We walk the vcpu's guest pagetables, filling the walk_t with what we
334 * see and adding any Accessed and Dirty bits that are needed in the
335 * guest entries. Using the pagefault code, we check the permissions as
336 * we go. For the purposes of reading pagetables we treat all non-RAM
337 * memory as containing zeroes.
338 *
339 * The walk is done in a lock-free style, with some sanity checks postponed
340 * until after the shadow lock has been taken. Those delayed checks ensure
341 * that no inconsistent mapping is translated into the shadow page tables.
342 *
343 * Returns 0 for success, or the set of permission bits that we failed on
344 * if the walk did not complete.
345 * N.B. This is different from the old return code but almost no callers
346 * checked the old return code anyway.
347 */
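/* (For example, a guest write fault on a present but read-only mapping
 * returns with _PAGE_RW set, since that is the permission bit the walk
 * failed on.) */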
348 static uint32_t
349 guest_walk_tables(struct vcpu *v, unsigned long va, walk_t *gw, uint32_t pfec)
350 {
351 struct domain *d = v->domain;
352 p2m_type_t p2mt;
353 guest_l1e_t *l1p = NULL;
354 guest_l2e_t *l2p = NULL;
355 #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
356 guest_l3e_t *l3p = NULL;
357 guest_l4e_t *l4p;
358 #endif
359 uint32_t gflags, mflags, rc = 0;
360 int pse;
362 perfc_incr(shadow_guest_walk);
363 memset(gw, 0, sizeof(*gw));
364 gw->va = va;
366 gw->version = atomic_read(&d->arch.paging.shadow.gtable_dirty_version);
367 rmb();
369 /* Mandatory bits that must be set in every entry. We invert NX, to
370 * calculate as if there were an "X" bit that allowed access.
371 * We will accumulate, in rc, the set of flags that are missing. */
372 mflags = mandatory_flags(v, pfec);
374 #if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
375 #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
377 /* Get the l4e from the top level table and check its flags */
378 gw->l4mfn = pagetable_get_mfn(v->arch.guest_table);
379 l4p = ((guest_l4e_t *)v->arch.paging.shadow.guest_vtable);
380 gw->l4e = l4p[guest_l4_table_offset(va)];
381 gflags = guest_l4e_get_flags(gw->l4e) ^ _PAGE_NX_BIT;
382 rc |= ((gflags & mflags) ^ mflags);
383 if ( rc & _PAGE_PRESENT ) goto out;
385 /* Map the l3 table */
386 gw->l3mfn = gfn_to_mfn(d, guest_l4e_get_gfn(gw->l4e), &p2mt);
387 if ( !p2m_is_ram(p2mt) )
388 {
389 rc |= _PAGE_PRESENT;
390 goto out;
391 }
392 ASSERT(mfn_valid(gw->l3mfn));
394 /* Get the l3e and check its flags */
395 l3p = sh_map_domain_page(gw->l3mfn);
396 gw->l3e = l3p[guest_l3_table_offset(va)];
397 gflags = guest_l3e_get_flags(gw->l3e) ^ _PAGE_NX_BIT;
398 rc |= ((gflags & mflags) ^ mflags);
399 if ( rc & _PAGE_PRESENT )
400 goto out;
402 #else /* PAE only... */
404 /* Get l3e from the cache of the top level table and check its flags */
405 gw->l3e = v->arch.paging.shadow.gl3e[guest_l3_table_offset(va)];
406 if ( !(guest_l3e_get_flags(gw->l3e) & _PAGE_PRESENT) )
407 {
408 rc |= _PAGE_PRESENT;
409 goto out;
410 }
412 #endif /* PAE or 64... */
414 /* Map the l2 table */
415 gw->l2mfn = gfn_to_mfn(d, guest_l3e_get_gfn(gw->l3e), &p2mt);
416 if ( !p2m_is_ram(p2mt) )
417 {
418 rc |= _PAGE_PRESENT;
419 goto out;
420 }
421 ASSERT(mfn_valid(gw->l2mfn));
423 /* Get the l2e */
424 l2p = sh_map_domain_page(gw->l2mfn);
425 gw->l2e = l2p[guest_l2_table_offset(va)];
427 #else /* 32-bit only... */
429 /* Get l2e from the top level table */
430 gw->l2mfn = pagetable_get_mfn(v->arch.guest_table);
431 l2p = ((guest_l2e_t *)v->arch.paging.shadow.guest_vtable);
432 gw->l2e = l2p[guest_l2_table_offset(va)];
434 #endif /* All levels... */
436 gflags = guest_l2e_get_flags(gw->l2e) ^ _PAGE_NX_BIT;
437 rc |= ((gflags & mflags) ^ mflags);
438 if ( rc & _PAGE_PRESENT )
439 goto out;
441 pse = (guest_supports_superpages(v) &&
442 (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE));
444 if ( pse )
445 {
446 /* Special case: this guest VA is in a PSE superpage, so there's
447 * no guest l1e. We make one up so that the propagation code
448 * can generate a shadow l1 table. Start with the gfn of the
449 * first 4k-page of the superpage. */
450 gfn_t start = guest_l2e_get_gfn(gw->l2e);
451 /* Grant full access in the l1e, since all the guest entry's
452 * access controls are enforced in the shadow l2e. */
453 int flags = (_PAGE_PRESENT|_PAGE_USER|_PAGE_RW|
454 _PAGE_ACCESSED|_PAGE_DIRTY);
455 /* PSE level 2 entries use bit 12 for PAT; propagate it to bit 7
456 * of the level 1. */
457 if ( (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE_PAT) )
458 flags |= _PAGE_PAT;
459 /* Copy the cache-control bits to the l1 as well, because we
460 * can't represent PAT in the (non-PSE) shadow l2e. :(
461 * This could cause problems if a guest ever maps an area of
462 * memory with superpages using more than one caching mode. */
463 flags |= guest_l2e_get_flags(gw->l2e) & (_PAGE_PWT|_PAGE_PCD);
464 /* Increment the pfn by the right number of 4k pages.
465 * The ~0x1 is to mask out the PAT bit mentioned above. */
466 start = _gfn((gfn_x(start) & ~0x1) + guest_l1_table_offset(va));
467 gw->l1e = guest_l1e_from_gfn(start, flags);
468 gw->l1mfn = _mfn(INVALID_MFN);
469 }
470 else
471 {
472 /* Not a superpage: carry on and find the l1e. */
473 gw->l1mfn = gfn_to_mfn(d, guest_l2e_get_gfn(gw->l2e), &p2mt);
474 if ( !p2m_is_ram(p2mt) )
475 {
476 rc |= _PAGE_PRESENT;
477 goto out;
478 }
479 ASSERT(mfn_valid(gw->l1mfn));
480 l1p = sh_map_domain_page(gw->l1mfn);
481 gw->l1e = l1p[guest_l1_table_offset(va)];
482 gflags = guest_l1e_get_flags(gw->l1e) ^ _PAGE_NX_BIT;
483 rc |= ((gflags & mflags) ^ mflags);
484 }
486 /* Go back and set accessed and dirty bits only if the walk was a
487 * success. Although the PRMs say higher-level _PAGE_ACCESSED bits
488 * get set whenever a lower-level PT is used, at least some hardware
489 * walkers set them only on a successful walk, as we do here. */
490 if ( rc == 0 )
491 {
492 #if GUEST_PAGING_LEVELS == 4 /* 64-bit only... */
493 if ( set_ad_bits(l4p + guest_l4_table_offset(va), &gw->l4e, 0) )
494 paging_mark_dirty(d, mfn_x(gw->l4mfn));
495 if ( set_ad_bits(l3p + guest_l3_table_offset(va), &gw->l3e, 0) )
496 paging_mark_dirty(d, mfn_x(gw->l3mfn));
497 #endif
498 if ( set_ad_bits(l2p + guest_l2_table_offset(va), &gw->l2e,
499 (pse && (pfec & PFEC_write_access))) )
500 paging_mark_dirty(d, mfn_x(gw->l2mfn));
501 if ( !pse )
502 {
503 if ( set_ad_bits(l1p + guest_l1_table_offset(va), &gw->l1e,
504 (pfec & PFEC_write_access)) )
505 paging_mark_dirty(d, mfn_x(gw->l1mfn));
506 }
507 }
509 out:
510 #if GUEST_PAGING_LEVELS == 4
511 if ( l3p ) sh_unmap_domain_page(l3p);
512 #endif
513 #if GUEST_PAGING_LEVELS >= 3
514 if ( l2p ) sh_unmap_domain_page(l2p);
515 #endif
516 if ( l1p ) sh_unmap_domain_page(l1p);
518 return rc;
519 }
521 /* Given a walk_t, translate the gw->va into the guest's notion of the
522 * corresponding frame number. */
523 static inline gfn_t
524 guest_walk_to_gfn(walk_t *gw)
525 {
526 if ( !(guest_l1e_get_flags(gw->l1e) & _PAGE_PRESENT) )
527 return _gfn(INVALID_GFN);
528 return guest_l1e_get_gfn(gw->l1e);
529 }
531 /* Given a walk_t, translate the gw->va into the guest's notion of the
532 * corresponding physical address. */
533 static inline paddr_t
534 guest_walk_to_gpa(walk_t *gw)
535 {
536 if ( !(guest_l1e_get_flags(gw->l1e) & _PAGE_PRESENT) )
537 return 0;
538 return guest_l1e_get_paddr(gw->l1e) + (gw->va & ~PAGE_MASK);
539 }
541 #if 0 /* Keep for debugging */
542 /* Pretty-print the contents of a guest-walk */
543 static inline void print_gw(walk_t *gw)
544 {
545 SHADOW_PRINTK("GUEST WALK TO %#lx:\n", gw->va);
546 #if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
547 #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
548 SHADOW_PRINTK(" l4mfn=%" PRI_mfn "\n", mfn_x(gw->l4mfn));
549 SHADOW_PRINTK(" l4e=%" SH_PRI_gpte "\n", gw->l4e.l4);
550 SHADOW_PRINTK(" l3mfn=%" PRI_mfn "\n", mfn_x(gw->l3mfn));
551 #endif /* PAE or 64... */
552 SHADOW_PRINTK(" l3e=%" SH_PRI_gpte "\n", gw->l3e.l3);
553 #endif /* All levels... */
554 SHADOW_PRINTK(" l2mfn=%" PRI_mfn "\n", mfn_x(gw->l2mfn));
555 SHADOW_PRINTK(" l2e=%" SH_PRI_gpte "\n", gw->l2e.l2);
556 SHADOW_PRINTK(" l1mfn=%" PRI_mfn "\n", mfn_x(gw->l1mfn));
557 SHADOW_PRINTK(" l1e=%" SH_PRI_gpte "\n", gw->l1e.l1);
558 }
559 #endif /* 0 */
561 #if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES
562 /* Lightweight audit: pass all the shadows associated with this guest walk
563 * through the audit mechanisms */
564 static void sh_audit_gw(struct vcpu *v, walk_t *gw)
565 {
566 mfn_t smfn;
568 if ( !(SHADOW_AUDIT_ENABLE) )
569 return;
571 #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
572 if ( mfn_valid(gw->l4mfn)
573 && mfn_valid((smfn = get_shadow_status(v, gw->l4mfn,
574 SH_type_l4_shadow))) )
575 (void) sh_audit_l4_table(v, smfn, _mfn(INVALID_MFN));
576 if ( mfn_valid(gw->l3mfn)
577 && mfn_valid((smfn = get_shadow_status(v, gw->l3mfn,
578 SH_type_l3_shadow))) )
579 (void) sh_audit_l3_table(v, smfn, _mfn(INVALID_MFN));
580 #endif /* PAE or 64... */
581 if ( mfn_valid(gw->l2mfn) )
582 {
583 if ( mfn_valid((smfn = get_shadow_status(v, gw->l2mfn,
584 SH_type_l2_shadow))) )
585 (void) sh_audit_l2_table(v, smfn, _mfn(INVALID_MFN));
586 #if GUEST_PAGING_LEVELS == 3
587 if ( mfn_valid((smfn = get_shadow_status(v, gw->l2mfn,
588 SH_type_l2h_shadow))) )
589 (void) sh_audit_l2_table(v, smfn, _mfn(INVALID_MFN));
590 #endif
591 }
592 if ( mfn_valid(gw->l1mfn)
593 && mfn_valid((smfn = get_shadow_status(v, gw->l1mfn,
594 SH_type_l1_shadow))) )
595 (void) sh_audit_l1_table(v, smfn, _mfn(INVALID_MFN));
596 else if ( (guest_l2e_get_flags(gw->l2e) & _PAGE_PRESENT)
597 && (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE)
598 && mfn_valid(
599 (smfn = get_fl1_shadow_status(v, guest_l2e_get_gfn(gw->l2e)))) )
600 (void) sh_audit_fl1_table(v, smfn, _mfn(INVALID_MFN));
601 }
603 #else
604 #define sh_audit_gw(_v, _gw) do {} while(0)
605 #endif /* audit code */
608 #if (CONFIG_PAGING_LEVELS == GUEST_PAGING_LEVELS) && (CONFIG_PAGING_LEVELS == SHADOW_PAGING_LEVELS)
609 void *
610 sh_guest_map_l1e(struct vcpu *v, unsigned long addr,
611 unsigned long *gl1mfn)
612 {
613 void *pl1e = NULL;
614 walk_t gw;
616 ASSERT(shadow_mode_translate(v->domain));
618 // XXX -- this is expensive, but it's easy to cobble together...
619 // FIXME!
621 if ( guest_walk_tables(v, addr, &gw, PFEC_page_present) == 0
622 && mfn_valid(gw.l1mfn) )
623 {
624 if ( gl1mfn )
625 *gl1mfn = mfn_x(gw.l1mfn);
626 pl1e = map_domain_page(mfn_x(gw.l1mfn)) +
627 (guest_l1_table_offset(addr) * sizeof(guest_l1e_t));
628 }
630 return pl1e;
631 }
633 void
634 sh_guest_get_eff_l1e(struct vcpu *v, unsigned long addr, void *eff_l1e)
635 {
636 walk_t gw;
638 ASSERT(shadow_mode_translate(v->domain));
640 // XXX -- this is expensive, but it's easy to cobble together...
641 // FIXME!
643 (void) guest_walk_tables(v, addr, &gw, PFEC_page_present);
644 *(guest_l1e_t *)eff_l1e = gw.l1e;
645 }
646 #endif /* CONFIG==SHADOW==GUEST */
648 /**************************************************************************/
649 /* Functions to compute the correct index into a shadow page, given an
650 * index into the guest page (as returned by guest_get_index()).
651 * This is trivial when the shadow and guest use the same sized PTEs, but
652 * gets more interesting when those sizes are mismatched (e.g. 32-bit guest,
653 * PAE- or 64-bit shadows).
654 *
655 * These functions also increment the shadow mfn, when necessary. When PTE
656 * sizes are mismatched, it takes 2 shadow L1 pages for a single guest L1
657 * page. In this case, we allocate 2 contiguous pages for the shadow L1, and
658 * use simple pointer arithmetic on a pointer to the guest L1e to figure out
659 * which shadow page we really want. Similarly, when PTE sizes are
660 * mismatched, we shadow a guest L2 page with 4 shadow L2 pages. (The easiest
661 * way to see this is: a 32-bit guest L2 page maps 4GB of virtual address
662 * space, while a PAE- or 64-bit shadow L2 page maps 1GB of virtual address
663 * space.)
664 *
665 * For PAE guests, for every 32-bytes of guest L3 page table, we use 64-bytes
666 * of shadow (to store both the shadow, and the info that would normally be
667 * stored in page_info fields). This arrangement allows the shadow and the
668 * "page_info" fields to always be stored in the same page (in fact, in
669 * the same cache line), avoiding an extra call to map_domain_page().
670 */
672 static inline u32
673 guest_index(void *ptr)
674 {
675 return (u32)((unsigned long)ptr & ~PAGE_MASK) / sizeof(guest_l1e_t);
676 }
678 static u32
679 shadow_l1_index(mfn_t *smfn, u32 guest_index)
680 {
681 #if (GUEST_PAGING_LEVELS == 2) && (SHADOW_PAGING_LEVELS != 2)
682 *smfn = _mfn(mfn_x(*smfn) +
683 (guest_index / SHADOW_L1_PAGETABLE_ENTRIES));
684 return (guest_index % SHADOW_L1_PAGETABLE_ENTRIES);
685 #else
686 return guest_index;
687 #endif
688 }
690 static u32
691 shadow_l2_index(mfn_t *smfn, u32 guest_index)
692 {
693 #if (GUEST_PAGING_LEVELS == 2) && (SHADOW_PAGING_LEVELS != 2)
694 // Because we use 2 shadow l2 entries for each guest entry, the number of
695 // guest entries per shadow page is SHADOW_L2_PAGETABLE_ENTRIES/2
696 //
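// For example, a 32-bit guest l2 has 1024 entries and is shadowed by
// four pages of 512 entries each: guest index 700 advances smfn by
// 700 / 256 == 2 pages and yields shadow index (700 % 256) * 2 == 376.
//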
697 *smfn = _mfn(mfn_x(*smfn) +
698 (guest_index / (SHADOW_L2_PAGETABLE_ENTRIES / 2)));
700 // We multiply by two to get the index of the first of the two entries
701 // used to shadow the specified guest entry.
702 return (guest_index % (SHADOW_L2_PAGETABLE_ENTRIES / 2)) * 2;
703 #else
704 return guest_index;
705 #endif
706 }
708 #if GUEST_PAGING_LEVELS >= 4
710 static u32
711 shadow_l3_index(mfn_t *smfn, u32 guest_index)
712 {
713 return guest_index;
714 }
716 static u32
717 shadow_l4_index(mfn_t *smfn, u32 guest_index)
718 {
719 return guest_index;
720 }
722 #endif // GUEST_PAGING_LEVELS >= 4
724 extern u32 get_pat_flags(struct vcpu *v,
725 u32 gl1e_flags,
726 paddr_t gpaddr,
727 paddr_t spaddr);
729 unsigned char pat_type_2_pte_flags(unsigned char pat_type);
730 /**************************************************************************/
731 /* Function which computes shadow entries from their corresponding guest
732 * entries. This is the "heart" of the shadow code. It operates using
733 * level-1 shadow types, but handles all levels of entry.
734 * Don't call it directly, but use the four wrappers below.
735 */
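/* (Illustrative example: a guest l1e that is PRESENT|RW|ACCESSED but not
 * yet DIRTY is propagated with _PAGE_RW cleared, so the guest's first
 * write takes a fault into the shadow code, which can then set the
 * guest's dirty bit and restore write access.) */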
737 static always_inline void
738 _sh_propagate(struct vcpu *v,
739 guest_intpte_t guest_intpte,
740 mfn_t target_mfn,
741 void *shadow_entry_ptr,
742 int level,
743 fetch_type_t ft,
744 p2m_type_t p2mt)
745 {
746 guest_l1e_t guest_entry = { guest_intpte };
747 shadow_l1e_t *sp = shadow_entry_ptr;
748 struct domain *d = v->domain;
749 gfn_t target_gfn = guest_l1e_get_gfn(guest_entry);
750 u32 pass_thru_flags;
751 u32 gflags, sflags;
753 /* We don't shadow PAE l3s */
754 ASSERT(GUEST_PAGING_LEVELS > 3 || level != 3);
756 /* Check there's something for the shadows to map to */
757 if ( !p2m_is_valid(p2mt) )
758 {
759 *sp = shadow_l1e_empty();
760 goto done;
761 }
763 gflags = guest_l1e_get_flags(guest_entry);
765 if ( unlikely(!(gflags & _PAGE_PRESENT)) )
766 {
767 /* If a guest l1 entry is not present, shadow with the magic
768 * guest-not-present entry. */
769 if ( level == 1 )
770 *sp = sh_l1e_gnp();
771 else
772 *sp = shadow_l1e_empty();
773 goto done;
774 }
776 if ( level == 1 && p2mt == p2m_mmio_dm )
777 {
778 /* Guest l1e maps emulated MMIO space */
779 *sp = sh_l1e_mmio(target_gfn, gflags);
780 if ( !d->arch.paging.shadow.has_fast_mmio_entries )
781 d->arch.paging.shadow.has_fast_mmio_entries = 1;
782 goto done;
783 }
785 // Must have a valid target_mfn unless this is a prefetch or an l1
786 // pointing at MMIO space. In the case of a prefetch, an invalid
787 // mfn means that we can not usefully shadow anything, and so we
788 // return early.
789 //
790 if ( !mfn_valid(target_mfn)
791 && !(level == 1 && (!shadow_mode_refcounts(d)
792 || p2mt == p2m_mmio_direct)) )
793 {
794 ASSERT((ft == ft_prefetch));
795 *sp = shadow_l1e_empty();
796 goto done;
797 }
799 // Propagate bits from the guest to the shadow.
800 // Some of these may be overwritten, below.
801 // Since we know the guest's PRESENT bit is set, we also set the shadow's
802 // SHADOW_PRESENT bit.
803 //
804 pass_thru_flags = (_PAGE_ACCESSED | _PAGE_USER |
805 _PAGE_RW | _PAGE_PRESENT);
806 if ( guest_supports_nx(v) )
807 pass_thru_flags |= _PAGE_NX_BIT;
808 if ( !shadow_mode_refcounts(d) && !mfn_valid(target_mfn) )
809 pass_thru_flags |= _PAGE_PAT | _PAGE_PCD | _PAGE_PWT;
810 sflags = gflags & pass_thru_flags;
812 /*
813 * For HVM domains with direct access to MMIO areas, set the correct
814 * caching attributes in the shadows to match what was asked for.
815 */
816 if ( (level == 1) && is_hvm_domain(d) &&
817 !list_empty(&(domain_hvm_iommu(d)->pdev_list)) &&
818 !is_xen_heap_mfn(mfn_x(target_mfn)) )
819 {
820 unsigned int type;
821 if ( hvm_get_mem_pinned_cacheattr(d, gfn_x(target_gfn), &type) )
822 sflags |= pat_type_2_pte_flags(type);
823 else if ( d->arch.hvm_domain.is_in_uc_mode )
824 sflags |= pat_type_2_pte_flags(PAT_TYPE_UNCACHABLE);
825 else
826 sflags |= get_pat_flags(v,
827 gflags,
828 gfn_to_paddr(target_gfn),
829 ((paddr_t)mfn_x(target_mfn)) << PAGE_SHIFT);
830 }
832 // Set the A&D bits for higher level shadows.
833 // Higher level entries do not, strictly speaking, have dirty bits, but
834 // since we use shadow linear tables, each of these entries may, at some
835 // point in time, also serve as a shadow L1 entry.
836 // By setting both the A&D bits in each of these, we eliminate the burden
837 // on the hardware to update these bits on initial accesses.
838 //
839 if ( (level > 1) && !((SHADOW_PAGING_LEVELS == 3) && (level == 3)) )
840 sflags |= _PAGE_ACCESSED | _PAGE_DIRTY;
842 // If the A or D bit has not yet been set in the guest, then we must
843 // prevent the corresponding kind of access.
844 //
845 if ( unlikely(!(gflags & _PAGE_ACCESSED)) )
846 sflags &= ~_PAGE_PRESENT;
848 /* D bits exist in L1es and PSE L2es */
849 if ( unlikely(((level == 1) ||
850 ((level == 2) &&
851 (gflags & _PAGE_PSE) &&
852 guest_supports_superpages(v)))
853 && !(gflags & _PAGE_DIRTY)) )
854 sflags &= ~_PAGE_RW;
856 // shadow_mode_log_dirty support
857 //
858 // Only allow the guest write access to a page a) on a demand fault,
859 // or b) if the page is already marked as dirty.
860 //
861 // (We handle log-dirty entirely inside the shadow code, without using the
862 // p2m_ram_logdirty p2m type: only HAP uses that.)
863 if ( unlikely((level == 1) && shadow_mode_log_dirty(d)) )
864 {
865 if ( mfn_valid(target_mfn) ) {
866 if ( ft & FETCH_TYPE_WRITE )
867 paging_mark_dirty(d, mfn_x(target_mfn));
868 else if ( !sh_mfn_is_dirty(d, target_mfn) )
869 sflags &= ~_PAGE_RW;
870 }
871 }
873 /* Read-only memory */
874 if ( p2mt == p2m_ram_ro )
875 sflags &= ~_PAGE_RW;
877 // protect guest page tables
878 //
879 if ( unlikely((level == 1) && sh_mfn_is_a_page_table(target_mfn)) )
880 {
881 if ( shadow_mode_trap_reads(d) )
882 {
883 // if we are trapping both reads & writes, then mark this page
884 // as not present...
885 //
886 sflags &= ~_PAGE_PRESENT;
887 }
888 else
889 {
890 // otherwise, just prevent any writes...
891 //
892 sflags &= ~_PAGE_RW;
893 }
894 }
896 // PV guests in 64-bit mode use two different page tables for user vs
897 // supervisor permissions, making the guest's _PAGE_USER bit irrelevant.
898 // It is always shadowed as present...
899 if ( (GUEST_PAGING_LEVELS == 4) && !is_pv_32on64_domain(d)
900 && !is_hvm_domain(d) )
901 {
902 sflags |= _PAGE_USER;
903 }
905 *sp = shadow_l1e_from_mfn(target_mfn, sflags);
907 done:
908 SHADOW_DEBUG(PROPAGATE,
909 "%s level %u guest %" SH_PRI_gpte " shadow %" SH_PRI_pte "\n",
910 fetch_type_names[ft], level, guest_entry.l1, sp->l1);
911 }
914 /* These four wrappers give us a little bit of type-safety back around
915 * the use of void-* pointers and intpte types in _sh_propagate(), and
916 * allow the compiler to optimize out some level checks. */
918 #if GUEST_PAGING_LEVELS >= 4
919 static void
920 l4e_propagate_from_guest(struct vcpu *v,
921 guest_l4e_t gl4e,
922 mfn_t sl3mfn,
923 shadow_l4e_t *sl4e,
924 fetch_type_t ft)
925 {
926 _sh_propagate(v, gl4e.l4, sl3mfn, sl4e, 4, ft, p2m_ram_rw);
927 }
929 static void
930 l3e_propagate_from_guest(struct vcpu *v,
931 guest_l3e_t gl3e,
932 mfn_t sl2mfn,
933 shadow_l3e_t *sl3e,
934 fetch_type_t ft)
935 {
936 _sh_propagate(v, gl3e.l3, sl2mfn, sl3e, 3, ft, p2m_ram_rw);
937 }
938 #endif // GUEST_PAGING_LEVELS >= 4
940 static void
941 l2e_propagate_from_guest(struct vcpu *v,
942 guest_l2e_t gl2e,
943 mfn_t sl1mfn,
944 shadow_l2e_t *sl2e,
945 fetch_type_t ft)
946 {
947 _sh_propagate(v, gl2e.l2, sl1mfn, sl2e, 2, ft, p2m_ram_rw);
948 }
950 static void
951 l1e_propagate_from_guest(struct vcpu *v,
952 guest_l1e_t gl1e,
953 mfn_t gmfn,
954 shadow_l1e_t *sl1e,
955 fetch_type_t ft,
956 p2m_type_t p2mt)
957 {
958 _sh_propagate(v, gl1e.l1, gmfn, sl1e, 1, ft, p2mt);
959 }
962 /**************************************************************************/
963 /* These functions update shadow entries (and do bookkeeping on the shadow
964 * tables they are in). It is intended that they are the only
965 * functions which ever write (non-zero) data onto a shadow page.
966 */
968 static inline void safe_write_entry(void *dst, void *src)
969 /* Copy one PTE safely when processors might be running on the
970 * destination pagetable. This does *not* give safety against
971 * concurrent writes (that's what the shadow lock is for), just
972 * stops the hardware picking up partially written entries. */
973 {
974 volatile unsigned long *d = dst;
975 unsigned long *s = src;
976 ASSERT(!((unsigned long) d & (sizeof (shadow_l1e_t) - 1)));
977 #if CONFIG_PAGING_LEVELS == 3
978 /* In PAE mode, pagetable entries are larger
979 * than machine words, so won't get written atomically. We need to make
980 * sure any other cpu running on these shadows doesn't see a
981 * half-written entry. Do this by marking the entry not-present first,
982 * then writing the high word before the low word. */
983 BUILD_BUG_ON(sizeof (shadow_l1e_t) != 2 * sizeof (unsigned long));
984 d[0] = 0;
985 d[1] = s[1];
986 d[0] = s[0];
987 #else
988 /* In 32-bit and 64-bit, sizeof(pte) == sizeof(ulong) == 1 word,
989 * which will be an atomic write, since the entry is aligned. */
990 BUILD_BUG_ON(sizeof (shadow_l1e_t) != sizeof (unsigned long));
991 *d = *s;
992 #endif
993 }
996 static inline void
997 shadow_write_entries(void *d, void *s, int entries, mfn_t mfn)
998 /* This function does the actual writes to shadow pages.
999 * It must not be called directly, since it doesn't do the bookkeeping
1000 * that shadow_set_l*e() functions do. */
1002 shadow_l1e_t *dst = d;
1003 shadow_l1e_t *src = s;
1004 void *map = NULL;
1005 int i;
1007 /* Because we mirror access rights at all levels in the shadow, an
1008 * l2 (or higher) entry with the RW bit cleared will leave us with
1009 * no write access through the linear map.
1010 * We detect that by writing to the shadow with copy_to_user() and
1011 * using map_domain_page() to get a writeable mapping if we need to. */
1012 if ( __copy_to_user(d, d, sizeof (unsigned long)) != 0 )
1014 perfc_incr(shadow_linear_map_failed);
1015 map = sh_map_domain_page(mfn);
1016 ASSERT(map != NULL);
1017 dst = map + ((unsigned long)dst & (PAGE_SIZE - 1));
1021 for ( i = 0; i < entries; i++ )
1022 safe_write_entry(dst++, src++);
1024 if ( map != NULL ) sh_unmap_domain_page(map);
1027 static inline int
1028 perms_strictly_increased(u32 old_flags, u32 new_flags)
1029 /* Given the flags of two entries, are the new flags a strict
1030 * increase in rights over the old ones? */
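/* (For example, PRESENT|RW -> PRESENT|RW|USER is a strict increase,
 * while PRESENT|USER -> PRESENT|RW is not, because the USER right is
 * dropped.) */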
1032 u32 of = old_flags & (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX);
1033 u32 nf = new_flags & (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX);
1034 /* Flip the NX bit, since it's the only one that decreases rights;
1035 * we calculate as if it were an "X" bit. */
1036 of ^= _PAGE_NX_BIT;
1037 nf ^= _PAGE_NX_BIT;
1038 /* If the changed bits are all set in the new flags, then rights strictly
1039 * increased between old and new. */
1040 return ((of | (of ^ nf)) == nf);
1043 static int inline
1044 shadow_get_page_from_l1e(shadow_l1e_t sl1e, struct domain *d)
1046 int res;
1047 mfn_t mfn;
1048 struct domain *owner;
1050 ASSERT(!sh_l1e_is_magic(sl1e));
1052 if ( !shadow_mode_refcounts(d) )
1053 return 1;
1055 res = get_page_from_l1e(sl1e, d);
1057 // If a privileged domain is attempting to install a map of a page it does
1058 // not own, we let it succeed anyway.
1059 //
1060 if ( unlikely(!res) &&
1061 !shadow_mode_translate(d) &&
1062 mfn_valid(mfn = shadow_l1e_get_mfn(sl1e)) &&
1063 (owner = page_get_owner(mfn_to_page(mfn))) &&
1064 (d != owner) &&
1065 IS_PRIV_FOR(d, owner))
1067 res = get_page_from_l1e(sl1e, owner);
1068 SHADOW_PRINTK("privileged domain %d installs map of mfn %05lx "
1069 "which is owned by domain %d: %s\n",
1070 d->domain_id, mfn_x(mfn), owner->domain_id,
1071 res ? "success" : "failed");
1074 if ( unlikely(!res) )
1076 perfc_incr(shadow_get_page_fail);
1077 SHADOW_PRINTK("failed: l1e=" SH_PRI_pte "\n");
1080 return res;
1083 static void inline
1084 shadow_put_page_from_l1e(shadow_l1e_t sl1e, struct domain *d)
1086 if ( !shadow_mode_refcounts(d) )
1087 return;
1089 put_page_from_l1e(sl1e, d);
1092 #if GUEST_PAGING_LEVELS >= 4
1093 static int shadow_set_l4e(struct vcpu *v,
1094 shadow_l4e_t *sl4e,
1095 shadow_l4e_t new_sl4e,
1096 mfn_t sl4mfn)
1098 int flags = 0, ok;
1099 shadow_l4e_t old_sl4e;
1100 paddr_t paddr;
1101 ASSERT(sl4e != NULL);
1102 old_sl4e = *sl4e;
1104 if ( old_sl4e.l4 == new_sl4e.l4 ) return 0; /* Nothing to do */
1106 paddr = ((((paddr_t)mfn_x(sl4mfn)) << PAGE_SHIFT)
1107 | (((unsigned long)sl4e) & ~PAGE_MASK));
1109 if ( shadow_l4e_get_flags(new_sl4e) & _PAGE_PRESENT )
1111 /* About to install a new reference */
1112 mfn_t sl3mfn = shadow_l4e_get_mfn(new_sl4e);
1113 ok = sh_get_ref(v, sl3mfn, paddr);
1114 /* Are we pinning l3 shadows to handle weird linux behaviour? */
1115 if ( sh_type_is_pinnable(v, SH_type_l3_64_shadow) )
1116 ok |= sh_pin(v, sl3mfn);
1117 if ( !ok )
1119 domain_crash(v->domain);
1120 return SHADOW_SET_ERROR;
1124 /* Write the new entry */
1125 shadow_write_entries(sl4e, &new_sl4e, 1, sl4mfn);
1126 flags |= SHADOW_SET_CHANGED;
1128 if ( shadow_l4e_get_flags(old_sl4e) & _PAGE_PRESENT )
1130 /* We lost a reference to an old mfn. */
1131 mfn_t osl3mfn = shadow_l4e_get_mfn(old_sl4e);
1132 if ( (mfn_x(osl3mfn) != mfn_x(shadow_l4e_get_mfn(new_sl4e)))
1133 || !perms_strictly_increased(shadow_l4e_get_flags(old_sl4e),
1134 shadow_l4e_get_flags(new_sl4e)) )
1136 flags |= SHADOW_SET_FLUSH;
1138 sh_put_ref(v, osl3mfn, paddr);
1140 return flags;
1143 static int shadow_set_l3e(struct vcpu *v,
1144 shadow_l3e_t *sl3e,
1145 shadow_l3e_t new_sl3e,
1146 mfn_t sl3mfn)
1148 int flags = 0;
1149 shadow_l3e_t old_sl3e;
1150 paddr_t paddr;
1151 ASSERT(sl3e != NULL);
1152 old_sl3e = *sl3e;
1154 if ( old_sl3e.l3 == new_sl3e.l3 ) return 0; /* Nothing to do */
1156 paddr = ((((paddr_t)mfn_x(sl3mfn)) << PAGE_SHIFT)
1157 | (((unsigned long)sl3e) & ~PAGE_MASK));
1159 if ( shadow_l3e_get_flags(new_sl3e) & _PAGE_PRESENT )
1160 /* About to install a new reference */
1161 if ( !sh_get_ref(v, shadow_l3e_get_mfn(new_sl3e), paddr) )
1163 domain_crash(v->domain);
1164 return SHADOW_SET_ERROR;
1167 /* Write the new entry */
1168 shadow_write_entries(sl3e, &new_sl3e, 1, sl3mfn);
1169 flags |= SHADOW_SET_CHANGED;
1171 if ( shadow_l3e_get_flags(old_sl3e) & _PAGE_PRESENT )
1173 /* We lost a reference to an old mfn. */
1174 mfn_t osl2mfn = shadow_l3e_get_mfn(old_sl3e);
1175 if ( (mfn_x(osl2mfn) != mfn_x(shadow_l3e_get_mfn(new_sl3e))) ||
1176 !perms_strictly_increased(shadow_l3e_get_flags(old_sl3e),
1177 shadow_l3e_get_flags(new_sl3e)) )
1179 flags |= SHADOW_SET_FLUSH;
1181 sh_put_ref(v, osl2mfn, paddr);
1183 return flags;
1185 #endif /* GUEST_PAGING_LEVELS >= 4 */
1187 static int shadow_set_l2e(struct vcpu *v,
1188 shadow_l2e_t *sl2e,
1189 shadow_l2e_t new_sl2e,
1190 mfn_t sl2mfn)
1192 int flags = 0;
1193 shadow_l2e_t old_sl2e;
1194 paddr_t paddr;
1196 #if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2
1197 /* In 2-on-3 we work with pairs of l2es pointing at two-page
1198 * shadows. Reference counting and up-pointers track from the first
1199 * page of the shadow to the first l2e, so make sure that we're
1200 * working with those:
1201 * Align the pointer down so it's pointing at the first of the pair */
1202 sl2e = (shadow_l2e_t *)((unsigned long)sl2e & ~(sizeof(shadow_l2e_t)));
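/* (sl2e is always 8-byte aligned, so clearing bit 3 above rounds the
 * pointer down to the even entry of the pair.) */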
1203 /* Align the mfn of the shadow entry too */
1204 new_sl2e.l2 &= ~(1<<PAGE_SHIFT);
1205 #endif
1207 ASSERT(sl2e != NULL);
1208 old_sl2e = *sl2e;
1210 if ( old_sl2e.l2 == new_sl2e.l2 ) return 0; /* Nothing to do */
1212 paddr = ((((paddr_t)mfn_x(sl2mfn)) << PAGE_SHIFT)
1213 | (((unsigned long)sl2e) & ~PAGE_MASK));
1215 if ( shadow_l2e_get_flags(new_sl2e) & _PAGE_PRESENT )
1216 /* About to install a new reference */
1217 if ( !sh_get_ref(v, shadow_l2e_get_mfn(new_sl2e), paddr) )
1219 domain_crash(v->domain);
1220 return SHADOW_SET_ERROR;
1223 /* Write the new entry */
1224 #if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2
1226 shadow_l2e_t pair[2] = { new_sl2e, new_sl2e };
1227 /* The l1 shadow is two pages long and needs to be pointed to by
1228 * two adjacent l2es. The pair have the same flags, but point
1229 * at odd and even MFNs. */
1230 ASSERT(!(pair[0].l2 & (1<<PAGE_SHIFT)));
1231 pair[1].l2 |= (1<<PAGE_SHIFT);
1232 shadow_write_entries(sl2e, &pair, 2, sl2mfn);
1234 #else /* normal case */
1235 shadow_write_entries(sl2e, &new_sl2e, 1, sl2mfn);
1236 #endif
1237 flags |= SHADOW_SET_CHANGED;
1239 if ( shadow_l2e_get_flags(old_sl2e) & _PAGE_PRESENT )
1241 /* We lost a reference to an old mfn. */
1242 mfn_t osl1mfn = shadow_l2e_get_mfn(old_sl2e);
1243 if ( (mfn_x(osl1mfn) != mfn_x(shadow_l2e_get_mfn(new_sl2e))) ||
1244 !perms_strictly_increased(shadow_l2e_get_flags(old_sl2e),
1245 shadow_l2e_get_flags(new_sl2e)) )
1247 flags |= SHADOW_SET_FLUSH;
1249 sh_put_ref(v, osl1mfn, paddr);
1251 return flags;
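/* The next two functions implement the dirty-VRAM tracking that this
 * changeset adds. d->dirty_vram describes the pfn range being watched;
 * for each frame in it, sl1ma[] records the machine address of the one
 * shadow l1e we expect to map it (the common case of a single guest
 * mapping). When that mapping is removed, the recorded l1e's dirty bit
 * tells us whether to set the frame's bit in dirty_bitmap; if a frame
 * ever has more than one mapping we give up and simply report it dirty. */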
1254 static inline void shadow_vram_get_l1e(shadow_l1e_t new_sl1e,
1255 shadow_l1e_t *sl1e,
1256 mfn_t sl1mfn,
1257 struct domain *d)
1259 mfn_t mfn;
1260 unsigned long gfn;
1262 if ( !d->dirty_vram ) return;
1264 mfn = shadow_l1e_get_mfn(new_sl1e);
1265 gfn = mfn_to_gfn(d, mfn);
1267 if ( (gfn >= d->dirty_vram->begin_pfn) && (gfn < d->dirty_vram->end_pfn) ) {
1268 unsigned long i = gfn - d->dirty_vram->begin_pfn;
1269 struct page_info *page = mfn_to_page(mfn);
1270 u32 count_info = page->u.inuse.type_info & PGT_count_mask;
1272 if ( count_info == 1 )
1273 /* Initial guest reference, record it */
1274 d->dirty_vram->sl1ma[i] = pfn_to_paddr(mfn_x(sl1mfn))
1275 | ((paddr_t) sl1e & ~PAGE_MASK);
1279 static inline void shadow_vram_put_l1e(shadow_l1e_t old_sl1e,
1280 shadow_l1e_t *sl1e,
1281 mfn_t sl1mfn,
1282 struct domain *d)
1284 mfn_t mfn;
1285 unsigned long gfn;
1287 if ( !d->dirty_vram ) return;
1289 mfn = shadow_l1e_get_mfn(old_sl1e);
1290 gfn = mfn_to_gfn(d, mfn);
1292 if ( (gfn >= d->dirty_vram->begin_pfn) && (gfn < d->dirty_vram->end_pfn) ) {
1293 unsigned long i = gfn - d->dirty_vram->begin_pfn;
1294 struct page_info *page = mfn_to_page(mfn);
1295 u32 count_info = page->u.inuse.type_info & PGT_count_mask;
1296 int dirty = 0;
1297 paddr_t sl1ma = pfn_to_paddr(mfn_x(sl1mfn))
1298 | ((paddr_t) sl1e & ~PAGE_MASK);
1300 if ( count_info == 1 ) {
1301 /* Last reference */
1302 if ( d->dirty_vram->sl1ma[i] == INVALID_PADDR ) {
1303 /* We didn't know it was that one, let's say it is dirty */
1304 dirty = 1;
1305 } else {
1306 ASSERT(d->dirty_vram->sl1ma[i] == sl1ma);
1307 d->dirty_vram->sl1ma[i] = INVALID_PADDR;
1308 if ( shadow_l1e_get_flags(old_sl1e) & _PAGE_DIRTY )
1309 dirty = 1;
1311 } else {
1312 /* We had more than one reference, just consider the page dirty. */
1313 dirty = 1;
1314 /* Check that it's not the one we recorded. */
1315 if ( d->dirty_vram->sl1ma[i] == sl1ma ) {
1316 /* Too bad, we remembered the wrong one... */
1317 d->dirty_vram->sl1ma[i] = INVALID_PADDR;
1318 } else {
1319 /* Ok, our recorded sl1e is still pointing to this page, let's
1320 * just hope it will remain. */
1323 if ( dirty )
1324 d->dirty_vram->dirty_bitmap[i / 8] |= 1 << (i % 8);
1328 static int shadow_set_l1e(struct vcpu *v,
1329 shadow_l1e_t *sl1e,
1330 shadow_l1e_t new_sl1e,
1331 mfn_t sl1mfn)
1333 int flags = 0;
1334 struct domain *d = v->domain;
1335 shadow_l1e_t old_sl1e;
1336 ASSERT(sl1e != NULL);
1338 old_sl1e = *sl1e;
1340 if ( old_sl1e.l1 == new_sl1e.l1 ) return 0; /* Nothing to do */
1342 if ( (shadow_l1e_get_flags(new_sl1e) & _PAGE_PRESENT)
1343 && !sh_l1e_is_magic(new_sl1e) )
1345 /* About to install a new reference */
1346 if ( shadow_mode_refcounts(d) ) {
1347 if ( shadow_get_page_from_l1e(new_sl1e, d) == 0 )
1349 /* Doesn't look like a pagetable. */
1350 flags |= SHADOW_SET_ERROR;
1351 new_sl1e = shadow_l1e_empty();
1352 } else {
1353 shadow_vram_get_l1e(new_sl1e, sl1e, sl1mfn, d);
1358 /* Write the new entry */
1359 shadow_write_entries(sl1e, &new_sl1e, 1, sl1mfn);
1360 flags |= SHADOW_SET_CHANGED;
1362 if ( (shadow_l1e_get_flags(old_sl1e) & _PAGE_PRESENT)
1363 && !sh_l1e_is_magic(old_sl1e) )
1365 /* We lost a reference to an old mfn. */
1366 /* N.B. Unlike higher-level sets, never need an extra flush
1367 * when writing an l1e. Because it points to the same guest frame
1368 * as the guest l1e did, it's the guest's responsibility to
1369 * trigger a flush later. */
1370 if ( shadow_mode_refcounts(d) )
1372 shadow_vram_put_l1e(old_sl1e, sl1e, sl1mfn, d);
1373 shadow_put_page_from_l1e(old_sl1e, d);
1376 return flags;
1380 /**************************************************************************/
1381 /* Macros to walk pagetables. These take the shadow of a pagetable and
1382 * walk every "interesting" entry. That is, they don't touch Xen mappings,
1383 * and for 32-bit l2s shadowed onto PAE or 64-bit, they only touch every
1384 * second entry (since pairs of entries are managed together). For multi-page
1385 * shadows they walk all pages.
1387 * Arguments are an MFN, the variable to point to each entry, a variable
1388 * to indicate that we are done (we will shortcut to the end of the scan
1389 * when _done != 0), a variable to indicate that we should avoid Xen mappings,
1390 * and the code.
1392 * WARNING: These macros have side-effects. They change the values of both
1393 * the pointer and the MFN. */
1395 static inline void increment_ptr_to_guest_entry(void *ptr)
1397 if ( ptr )
1399 guest_l1e_t **entry = ptr;
1400 (*entry)++;
1404 /* All kinds of l1: touch all entries */
1405 #define _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code) \
1406 do { \
1407 int _i; \
1408 shadow_l1e_t *_sp = map_shadow_page((_sl1mfn)); \
1409 ASSERT(mfn_to_shadow_page(_sl1mfn)->type == SH_type_l1_shadow \
1410 || mfn_to_shadow_page(_sl1mfn)->type == SH_type_fl1_shadow); \
1411 for ( _i = 0; _i < SHADOW_L1_PAGETABLE_ENTRIES; _i++ ) \
1412 { \
1413 (_sl1e) = _sp + _i; \
1414 if ( shadow_l1e_get_flags(*(_sl1e)) & _PAGE_PRESENT ) \
1415 {_code} \
1416 if ( _done ) break; \
1417 increment_ptr_to_guest_entry(_gl1p); \
1418 } \
1419 unmap_shadow_page(_sp); \
1420 } while (0)
1422 /* 32-bit l1, on PAE or 64-bit shadows: need to walk both pages of shadow */
1423 #if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2
1424 #define SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code) \
1425 do { \
1426 int __done = 0; \
1427 _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, \
1428 ({ (__done = _done); }), _code); \
1429 _sl1mfn = _mfn(mfn_x(_sl1mfn) + 1); \
1430 if ( !__done ) \
1431 _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, \
1432 ({ (__done = _done); }), _code); \
1433 } while (0)
1434 #else /* Everything else; l1 shadows are only one page */
1435 #define SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code) \
1436 _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code)
1437 #endif
1440 #if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2
1442 /* 32-bit l2 on PAE/64: four pages, touch every second entry, and avoid Xen */
1443 #define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _dom, _code) \
1444 do { \
1445 int _i, _j, __done = 0; \
1446 int _xen = !shadow_mode_external(_dom); \
1447 ASSERT(mfn_to_shadow_page(_sl2mfn)->type == SH_type_l2_32_shadow); \
1448 for ( _j = 0; _j < 4 && !__done; _j++ ) \
1449 { \
1450 shadow_l2e_t *_sp = map_shadow_page(_sl2mfn); \
1451 for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i += 2 ) \
1452 if ( (!(_xen)) \
1453 || ((_j * SHADOW_L2_PAGETABLE_ENTRIES) + _i) \
1454 < (HYPERVISOR_VIRT_START >> SHADOW_L2_PAGETABLE_SHIFT) ) \
1455 { \
1456 (_sl2e) = _sp + _i; \
1457 if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \
1458 {_code} \
1459 if ( (__done = (_done)) ) break; \
1460 increment_ptr_to_guest_entry(_gl2p); \
1461 } \
1462 unmap_shadow_page(_sp); \
1463 _sl2mfn = _mfn(mfn_x(_sl2mfn) + 1); \
1464 } \
1465 } while (0)
1467 #elif GUEST_PAGING_LEVELS == 2
1469 /* 32-bit on 32-bit: avoid Xen entries */
1470 #define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _dom, _code) \
1471 do { \
1472 int _i; \
1473 int _xen = !shadow_mode_external(_dom); \
1474 shadow_l2e_t *_sp = map_shadow_page((_sl2mfn)); \
1475 ASSERT(mfn_to_shadow_page(_sl2mfn)->type == SH_type_l2_32_shadow); \
1476 for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i++ ) \
1477 if ( (!(_xen)) \
1478 || \
1479 (_i < (HYPERVISOR_VIRT_START >> SHADOW_L2_PAGETABLE_SHIFT)) ) \
1480 { \
1481 (_sl2e) = _sp + _i; \
1482 if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \
1483 {_code} \
1484 if ( _done ) break; \
1485 increment_ptr_to_guest_entry(_gl2p); \
1486 } \
1487 unmap_shadow_page(_sp); \
1488 } while (0)
1490 #elif GUEST_PAGING_LEVELS == 3
1492 /* PAE: if it's an l2h, don't touch Xen mappings */
1493 #define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _dom, _code) \
1494 do { \
1495 int _i; \
1496 int _xen = !shadow_mode_external(_dom); \
1497 shadow_l2e_t *_sp = map_shadow_page((_sl2mfn)); \
1498 ASSERT(mfn_to_shadow_page(_sl2mfn)->type == SH_type_l2_pae_shadow \
1499 || mfn_to_shadow_page(_sl2mfn)->type == SH_type_l2h_pae_shadow);\
1500 for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i++ ) \
1501 if ( (!(_xen)) \
1502 || mfn_to_shadow_page(_sl2mfn)->type != SH_type_l2h_pae_shadow\
1503 || ((_i + (3 * SHADOW_L2_PAGETABLE_ENTRIES)) \
1504 < (HYPERVISOR_VIRT_START >> SHADOW_L2_PAGETABLE_SHIFT)) ) \
1505 { \
1506 (_sl2e) = _sp + _i; \
1507 if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \
1508 {_code} \
1509 if ( _done ) break; \
1510 increment_ptr_to_guest_entry(_gl2p); \
1511 } \
1512 unmap_shadow_page(_sp); \
1513 } while (0)
1515 #else
1517 /* 64-bit l2: touch all entries except for PAE compat guests. */
1518 #define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _dom, _code) \
1519 do { \
1520 int _i; \
1521 int _xen = !shadow_mode_external(_dom); \
1522 shadow_l2e_t *_sp = map_shadow_page((_sl2mfn)); \
1523 ASSERT(mfn_to_shadow_page(_sl2mfn)->type == SH_type_l2_64_shadow || \
1524 mfn_to_shadow_page(_sl2mfn)->type == SH_type_l2h_64_shadow); \
1525 for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i++ ) \
1526 { \
1527 if ( (!(_xen)) \
1528 || !is_pv_32on64_domain(_dom) \
1529 || mfn_to_shadow_page(_sl2mfn)->type != SH_type_l2h_64_shadow \
1530 || (_i < COMPAT_L2_PAGETABLE_FIRST_XEN_SLOT(_dom)) ) \
1531 { \
1532 (_sl2e) = _sp + _i; \
1533 if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \
1534 {_code} \
1535 if ( _done ) break; \
1536 increment_ptr_to_guest_entry(_gl2p); \
1537 } \
1538 } \
1539 unmap_shadow_page(_sp); \
1540 } while (0)
1542 #endif /* different kinds of l2 */
1544 #if GUEST_PAGING_LEVELS == 4
1546 /* 64-bit l3: touch all entries */
1547 #define SHADOW_FOREACH_L3E(_sl3mfn, _sl3e, _gl3p, _done, _code) \
1548 do { \
1549 int _i; \
1550 shadow_l3e_t *_sp = map_shadow_page((_sl3mfn)); \
1551 ASSERT(mfn_to_shadow_page(_sl3mfn)->type == SH_type_l3_64_shadow); \
1552 for ( _i = 0; _i < SHADOW_L3_PAGETABLE_ENTRIES; _i++ ) \
1553 { \
1554 (_sl3e) = _sp + _i; \
1555 if ( shadow_l3e_get_flags(*(_sl3e)) & _PAGE_PRESENT ) \
1556 {_code} \
1557 if ( _done ) break; \
1558 increment_ptr_to_guest_entry(_gl3p); \
1559 } \
1560 unmap_shadow_page(_sp); \
1561 } while (0)
1563 /* 64-bit l4: avoid Xen mappings */
1564 #define SHADOW_FOREACH_L4E(_sl4mfn, _sl4e, _gl4p, _done, _dom, _code) \
1565 do { \
1566 shadow_l4e_t *_sp = map_shadow_page((_sl4mfn)); \
1567 int _xen = !shadow_mode_external(_dom); \
1568 int _i; \
1569 ASSERT(mfn_to_shadow_page(_sl4mfn)->type == SH_type_l4_64_shadow); \
1570 for ( _i = 0; _i < SHADOW_L4_PAGETABLE_ENTRIES; _i++ ) \
1571 { \
1572 if ( (!(_xen)) || is_guest_l4_slot(_dom, _i) ) \
1573 { \
1574 (_sl4e) = _sp + _i; \
1575 if ( shadow_l4e_get_flags(*(_sl4e)) & _PAGE_PRESENT ) \
1576 {_code} \
1577 if ( _done ) break; \
1578 } \
1579 increment_ptr_to_guest_entry(_gl4p); \
1580 } \
1581 unmap_shadow_page(_sp); \
1582 } while (0)
1584 #endif
1588 /**************************************************************************/
1589 /* Functions to install Xen mappings and linear mappings in shadow pages */
1591 // XXX -- this function should probably be moved to shadow-common.c, but that
1592 // probably wants to wait until the shadow types have been moved from
1593 // shadow-types.h to shadow-private.h
1594 //
1595 #if CONFIG_PAGING_LEVELS == 4 && GUEST_PAGING_LEVELS == 4
1596 void sh_install_xen_entries_in_l4(struct vcpu *v, mfn_t gl4mfn, mfn_t sl4mfn)
1598 struct domain *d = v->domain;
1599 shadow_l4e_t *sl4e;
1601 sl4e = sh_map_domain_page(sl4mfn);
1602 ASSERT(sl4e != NULL);
1603 ASSERT(sizeof (l4_pgentry_t) == sizeof (shadow_l4e_t));
1605 /* Copy the common Xen mappings from the idle domain */
1606 memcpy(&sl4e[ROOT_PAGETABLE_FIRST_XEN_SLOT],
1607 &idle_pg_table[ROOT_PAGETABLE_FIRST_XEN_SLOT],
1608 ROOT_PAGETABLE_XEN_SLOTS * sizeof(l4_pgentry_t));
1610 /* Install the per-domain mappings for this domain */
1611 sl4e[shadow_l4_table_offset(PERDOMAIN_VIRT_START)] =
1612 shadow_l4e_from_mfn(page_to_mfn(virt_to_page(d->arch.mm_perdomain_l3)),
1613 __PAGE_HYPERVISOR);
1615 /* Shadow linear mapping for 4-level shadows. N.B. for 3-level
1616 * shadows on 64-bit xen, this linear mapping is later replaced by the
1617 * monitor pagetable structure, which is built in make_monitor_table
1618 * and maintained by sh_update_linear_entries. */
1619 sl4e[shadow_l4_table_offset(SH_LINEAR_PT_VIRT_START)] =
1620 shadow_l4e_from_mfn(sl4mfn, __PAGE_HYPERVISOR);
1622 /* Self linear mapping. */
1623 if ( shadow_mode_translate(v->domain) && !shadow_mode_external(v->domain) )
1625 // linear tables may not be used with translated PV guests
1626 sl4e[shadow_l4_table_offset(LINEAR_PT_VIRT_START)] =
1627 shadow_l4e_empty();
1629 else
1631 sl4e[shadow_l4_table_offset(LINEAR_PT_VIRT_START)] =
1632 shadow_l4e_from_mfn(gl4mfn, __PAGE_HYPERVISOR);
1635 if ( shadow_mode_translate(v->domain) )
1637 /* install domain-specific P2M table */
1638 sl4e[shadow_l4_table_offset(RO_MPT_VIRT_START)] =
1639 shadow_l4e_from_mfn(pagetable_get_mfn(d->arch.phys_table),
1640 __PAGE_HYPERVISOR);
1643 if ( is_pv_32on64_domain(v->domain) )
1645 /* install compat arg xlat entry */
1646 sl4e[shadow_l4_table_offset(COMPAT_ARG_XLAT_VIRT_BASE)] =
1647 shadow_l4e_from_mfn(
1648 page_to_mfn(virt_to_page(d->arch.mm_arg_xlat_l3)),
1649 __PAGE_HYPERVISOR);
1652 sh_unmap_domain_page(sl4e);
1654 #endif
1656 #if CONFIG_PAGING_LEVELS >= 3 && GUEST_PAGING_LEVELS >= 3
1657 // For 3-on-3 PV guests, we need to make sure the xen mappings are in
1658 // place, which means that we need to populate the l2h entry in the l3
1659 // table.
1661 static void sh_install_xen_entries_in_l2h(struct vcpu *v, mfn_t sl2hmfn)
1663 struct domain *d = v->domain;
1664 shadow_l2e_t *sl2e;
1665 #if CONFIG_PAGING_LEVELS == 3
1666 int i;
1667 #else
1669 if ( !is_pv_32on64_vcpu(v) )
1670 return;
1671 #endif
1673 sl2e = sh_map_domain_page(sl2hmfn);
1674 ASSERT(sl2e != NULL);
1675 ASSERT(sizeof (l2_pgentry_t) == sizeof (shadow_l2e_t));
1677 #if CONFIG_PAGING_LEVELS == 3
1679 /* Copy the common Xen mappings from the idle domain */
1680 memcpy(&sl2e[L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1)],
1681 &idle_pg_table_l2[L2_PAGETABLE_FIRST_XEN_SLOT],
1682 L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
1684 /* Install the per-domain mappings for this domain */
1685 for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
1686 sl2e[shadow_l2_table_offset(PERDOMAIN_VIRT_START) + i] =
1687 shadow_l2e_from_mfn(
1688 page_to_mfn(virt_to_page(d->arch.mm_perdomain_pt) + i),
1689 __PAGE_HYPERVISOR);
1691 /* We don't set up a linear mapping here because we can't until this
1692 * l2h is installed in an l3e. sh_update_linear_entries() handles
1693 * the linear mappings when CR3 (and so the fourth l3e) is loaded.
1694 * We zero them here, just as a safety measure.
1695 */
1696 for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
1697 sl2e[shadow_l2_table_offset(LINEAR_PT_VIRT_START) + i] =
1698 shadow_l2e_empty();
1699 for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
1700 sl2e[shadow_l2_table_offset(SH_LINEAR_PT_VIRT_START) + i] =
1701 shadow_l2e_empty();
1703 if ( shadow_mode_translate(d) )
1705 /* Install the domain-specific p2m table */
1706 l3_pgentry_t *p2m;
1707 ASSERT(pagetable_get_pfn(d->arch.phys_table) != 0);
1708 p2m = sh_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
1709 for ( i = 0; i < MACHPHYS_MBYTES>>1; i++ )
1711 sl2e[shadow_l2_table_offset(RO_MPT_VIRT_START) + i] =
1712 (l3e_get_flags(p2m[i]) & _PAGE_PRESENT)
1713 ? shadow_l2e_from_mfn(_mfn(l3e_get_pfn(p2m[i])),
1714 __PAGE_HYPERVISOR)
1715 : shadow_l2e_empty();
1717 sh_unmap_domain_page(p2m);
1720 #else
1722 /* Copy the common Xen mappings from the idle domain */
1723 memcpy(
1724 &sl2e[COMPAT_L2_PAGETABLE_FIRST_XEN_SLOT(d)],
1725 &compat_idle_pg_table_l2[l2_table_offset(HIRO_COMPAT_MPT_VIRT_START)],
1726 COMPAT_L2_PAGETABLE_XEN_SLOTS(d) * sizeof(*sl2e));
1728 #endif
1730 sh_unmap_domain_page(sl2e);
1732 #endif
1735 #if CONFIG_PAGING_LEVELS == 2 && GUEST_PAGING_LEVELS == 2
1736 void sh_install_xen_entries_in_l2(struct vcpu *v, mfn_t gl2mfn, mfn_t sl2mfn)
1738 struct domain *d = v->domain;
1739 shadow_l2e_t *sl2e;
1740 int i;
1742 sl2e = sh_map_domain_page(sl2mfn);
1743 ASSERT(sl2e != NULL);
1744 ASSERT(sizeof (l2_pgentry_t) == sizeof (shadow_l2e_t));
1746 /* Copy the common Xen mappings from the idle domain */
1747 memcpy(&sl2e[L2_PAGETABLE_FIRST_XEN_SLOT],
1748 &idle_pg_table[L2_PAGETABLE_FIRST_XEN_SLOT],
1749 L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
1751 /* Install the per-domain mappings for this domain */
1752 for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
1753 sl2e[shadow_l2_table_offset(PERDOMAIN_VIRT_START) + i] =
1754 shadow_l2e_from_mfn(
1755 page_to_mfn(virt_to_page(d->arch.mm_perdomain_pt) + i),
1756 __PAGE_HYPERVISOR);
1758 /* Linear mapping */
1759 sl2e[shadow_l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
1760 shadow_l2e_from_mfn(sl2mfn, __PAGE_HYPERVISOR);
1762 if ( shadow_mode_translate(v->domain) && !shadow_mode_external(v->domain) )
1764 // linear tables may not be used with translated PV guests
1765 sl2e[shadow_l2_table_offset(LINEAR_PT_VIRT_START)] =
1766 shadow_l2e_empty();
1768 else
1770 sl2e[shadow_l2_table_offset(LINEAR_PT_VIRT_START)] =
1771 shadow_l2e_from_mfn(gl2mfn, __PAGE_HYPERVISOR);
1774 if ( shadow_mode_translate(d) )
1776 /* install domain-specific P2M table */
1777 sl2e[shadow_l2_table_offset(RO_MPT_VIRT_START)] =
1778 shadow_l2e_from_mfn(pagetable_get_mfn(d->arch.phys_table),
1779 __PAGE_HYPERVISOR);
1782 sh_unmap_domain_page(sl2e);
1784 #endif
1788 /**************************************************************************/
1789 /* Create a shadow of a given guest page.
1790 */
1791 static mfn_t
1792 sh_make_shadow(struct vcpu *v, mfn_t gmfn, u32 shadow_type)
1794 mfn_t smfn = shadow_alloc(v->domain, shadow_type, mfn_x(gmfn));
1795 SHADOW_DEBUG(MAKE_SHADOW, "(%05lx, %u)=>%05lx\n",
1796 mfn_x(gmfn), shadow_type, mfn_x(smfn));
1798 if ( shadow_type != SH_type_l2_32_shadow
1799 && shadow_type != SH_type_l2_pae_shadow
1800 && shadow_type != SH_type_l2h_pae_shadow
1801 && shadow_type != SH_type_l4_64_shadow )
1802 /* Lower-level shadow, not yet linked from a higher level */
1803 mfn_to_shadow_page(smfn)->up = 0;
1805 #if GUEST_PAGING_LEVELS == 4
1806 #if (SHADOW_OPTIMIZATIONS & SHOPT_LINUX_L3_TOPLEVEL)
1807 if ( shadow_type == SH_type_l4_64_shadow &&
1808 unlikely(v->domain->arch.paging.shadow.opt_flags & SHOPT_LINUX_L3_TOPLEVEL) )
1810 /* We're shadowing a new l4, but we've been assuming the guest uses
1811 * only one l4 per vcpu and context switches using an l4 entry.
1812 * Count the number of active l4 shadows. If there are enough
1813 * of them, decide that this isn't an old linux guest, and stop
1814 * pinning l3es. This is not very quick but it doesn't happen
1815 * very often. */
1816 struct list_head *l, *t;
1817 struct shadow_page_info *sp;
1818 struct vcpu *v2;
1819 int l4count = 0, vcpus = 0;
1820 list_for_each(l, &v->domain->arch.paging.shadow.pinned_shadows)
1822 sp = list_entry(l, struct shadow_page_info, list);
1823 if ( sp->type == SH_type_l4_64_shadow )
1824 l4count++;
1826 for_each_vcpu ( v->domain, v2 )
1827 vcpus++;
1828 if ( l4count > 2 * vcpus )
1830 /* Unpin all the pinned l3 tables, and don't pin any more. */
1831 list_for_each_safe(l, t, &v->domain->arch.paging.shadow.pinned_shadows)
1833 sp = list_entry(l, struct shadow_page_info, list);
1834 if ( sp->type == SH_type_l3_64_shadow )
1835 sh_unpin(v, shadow_page_to_mfn(sp));
1837 v->domain->arch.paging.shadow.opt_flags &= ~SHOPT_LINUX_L3_TOPLEVEL;
1840 #endif
1841 #endif
1843 // Create the Xen mappings...
1844 if ( !shadow_mode_external(v->domain) )
1846 switch (shadow_type)
1848 #if CONFIG_PAGING_LEVELS == 4 && GUEST_PAGING_LEVELS == 4
1849 case SH_type_l4_shadow:
1850 sh_install_xen_entries_in_l4(v, gmfn, smfn); break;
1851 #endif
1852 #if CONFIG_PAGING_LEVELS >= 3 && GUEST_PAGING_LEVELS >= 3
1853 case SH_type_l2h_shadow:
1854 sh_install_xen_entries_in_l2h(v, smfn); break;
1855 #endif
1856 #if CONFIG_PAGING_LEVELS == 2 && GUEST_PAGING_LEVELS == 2
1857 case SH_type_l2_shadow:
1858 sh_install_xen_entries_in_l2(v, gmfn, smfn); break;
1859 #endif
1860 default: /* Do nothing */ break;
1864 shadow_promote(v, gmfn, shadow_type);
1865 set_shadow_status(v, gmfn, shadow_type, smfn);
1867 return smfn;
1870 /* Make a splintered superpage shadow */
1871 static mfn_t
1872 make_fl1_shadow(struct vcpu *v, gfn_t gfn)
1874 mfn_t smfn = shadow_alloc(v->domain, SH_type_fl1_shadow,
1875 (unsigned long) gfn_x(gfn));
1877 SHADOW_DEBUG(MAKE_SHADOW, "(%" SH_PRI_gfn ")=>%" PRI_mfn "\n",
1878 gfn_x(gfn), mfn_x(smfn));
1880 set_fl1_shadow_status(v, gfn, smfn);
1881 return smfn;
1885 #if SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS
1886 mfn_t
1887 sh_make_monitor_table(struct vcpu *v)
1889 struct domain *d = v->domain;
1891 ASSERT(pagetable_get_pfn(v->arch.monitor_table) == 0);
1893 /* Guarantee we can get the memory we need */
1894 shadow_prealloc(d, SH_type_monitor_table, CONFIG_PAGING_LEVELS);
1896 #if CONFIG_PAGING_LEVELS == 4
1898 mfn_t m4mfn;
1899 m4mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1900 sh_install_xen_entries_in_l4(v, m4mfn, m4mfn);
1901 /* Remember the level of this table */
1902 mfn_to_page(m4mfn)->shadow_flags = 4;
1903 #if SHADOW_PAGING_LEVELS < 4
1905 mfn_t m3mfn, m2mfn;
1906 l4_pgentry_t *l4e;
1907 l3_pgentry_t *l3e;
1908 /* Install an l3 table and an l2 table that will hold the shadow
1909 * linear map entries. This overrides the linear map entry that
1910 * was installed by sh_install_xen_entries_in_l4. */
1911 l4e = sh_map_domain_page(m4mfn);
1913 m3mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1914 mfn_to_page(m3mfn)->shadow_flags = 3;
1915 l4e[shadow_l4_table_offset(SH_LINEAR_PT_VIRT_START)]
1916 = l4e_from_pfn(mfn_x(m3mfn), __PAGE_HYPERVISOR);
1918 m2mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1919 mfn_to_page(m2mfn)->shadow_flags = 2;
1920 l3e = sh_map_domain_page(m3mfn);
1921 l3e[0] = l3e_from_pfn(mfn_x(m2mfn), __PAGE_HYPERVISOR);
1922 sh_unmap_domain_page(l3e);
1924 if ( is_pv_32on64_vcpu(v) )
1926 /* For 32-on-64 PV guests, we need to map the 32-bit Xen
1927 * area into its usual VAs in the monitor tables */
1928 m3mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1929 mfn_to_page(m3mfn)->shadow_flags = 3;
1930 l4e[0] = l4e_from_pfn(mfn_x(m3mfn), __PAGE_HYPERVISOR);
1932 m2mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1933 mfn_to_page(m2mfn)->shadow_flags = 2;
1934 l3e = sh_map_domain_page(m3mfn);
1935 l3e[3] = l3e_from_pfn(mfn_x(m2mfn), _PAGE_PRESENT);
1936 sh_install_xen_entries_in_l2h(v, m2mfn);
1937 sh_unmap_domain_page(l3e);
1940 sh_unmap_domain_page(l4e);
1942 #endif /* SHADOW_PAGING_LEVELS < 4 */
1943 return m4mfn;
1946 #elif CONFIG_PAGING_LEVELS == 3
1949 mfn_t m3mfn, m2mfn;
1950 l3_pgentry_t *l3e;
1951 l2_pgentry_t *l2e;
1952 int i;
1954 m3mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1955 /* Remember the level of this table */
1956 mfn_to_page(m3mfn)->shadow_flags = 3;
1958 // Install a monitor l2 table in slot 3 of the l3 table.
1959 // This is used for all Xen entries, including linear maps
1960 m2mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1961 mfn_to_page(m2mfn)->shadow_flags = 2;
1962 l3e = sh_map_domain_page(m3mfn);
1963 l3e[3] = l3e_from_pfn(mfn_x(m2mfn), _PAGE_PRESENT);
1964 sh_install_xen_entries_in_l2h(v, m2mfn);
1965 /* Install the monitor's own linear map */
1966 l2e = sh_map_domain_page(m2mfn);
1967 for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
1968 l2e[l2_table_offset(LINEAR_PT_VIRT_START) + i] =
1969 (l3e_get_flags(l3e[i]) & _PAGE_PRESENT)
1970 ? l2e_from_pfn(l3e_get_pfn(l3e[i]), __PAGE_HYPERVISOR)
1971 : l2e_empty();
1972 sh_unmap_domain_page(l2e);
1973 sh_unmap_domain_page(l3e);
1975 SHADOW_PRINTK("new monitor table: %#lx\n", mfn_x(m3mfn));
1976 return m3mfn;
1979 #elif CONFIG_PAGING_LEVELS == 2
1982 mfn_t m2mfn;
1983 m2mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1984 sh_install_xen_entries_in_l2(v, m2mfn, m2mfn);
1985 /* Remember the level of this table */
1986 mfn_to_page(m2mfn)->shadow_flags = 2;
1987 return m2mfn;
1990 #else
1991 #error this should not happen
1992 #endif /* CONFIG_PAGING_LEVELS */
1994 #endif /* SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS */
1996 /**************************************************************************/
1997 /* These functions also take a virtual address and return the level-N
1998 * shadow table mfn and entry, but they create the shadow pagetables if
1999 * they are needed. The "demand" argument is non-zero when handling
2000 * a demand fault (so we know what to do about accessed bits &c).
2001 * If the necessary tables are not present in the guest, they return NULL. */
2003 /* N.B. The use of GUEST_PAGING_LEVELS here is correct. If the shadow has
2004 * more levels than the guest, the upper levels are always fixed and do not
2005 * reflect any information from the guest, so we do not use these functions
2006 * to access them. */
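/* A rough sketch of how the functions below chain together on a demand
 * fault (assuming a 4-level guest; lower-level guests skip the upper calls):
 *
 *   shadow_get_and_create_l1e()
 *     -> shadow_get_and_create_l2e()
 *          -> shadow_get_and_create_l3e()
 *               -> shadow_get_and_create_l4e()
 *
 * Each level either finds an existing shadow with get_shadow_status(),
 * or builds one with sh_make_shadow() and links it into the level above
 * via lNe_propagate_from_guest() + shadow_set_lNe(). */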
2008 #if GUEST_PAGING_LEVELS >= 4
2009 static shadow_l4e_t * shadow_get_and_create_l4e(struct vcpu *v,
2010 walk_t *gw,
2011 mfn_t *sl4mfn)
2013 /* There is always a shadow of the top level table. Get it. */
2014 *sl4mfn = pagetable_get_mfn(v->arch.shadow_table[0]);
2015 /* Reading the top level table is always valid. */
2016 return sh_linear_l4_table(v) + shadow_l4_linear_offset(gw->va);
2019 static shadow_l3e_t * shadow_get_and_create_l3e(struct vcpu *v,
2020 walk_t *gw,
2021 mfn_t *sl3mfn,
2022 fetch_type_t ft)
2024 mfn_t sl4mfn;
2025 shadow_l4e_t *sl4e;
2026 if ( !mfn_valid(gw->l3mfn) ) return NULL; /* No guest page. */
2027 /* Get the l4e */
2028 sl4e = shadow_get_and_create_l4e(v, gw, &sl4mfn);
2029 ASSERT(sl4e != NULL);
2030 if ( shadow_l4e_get_flags(*sl4e) & _PAGE_PRESENT )
2032 *sl3mfn = shadow_l4e_get_mfn(*sl4e);
2033 ASSERT(mfn_valid(*sl3mfn));
2035 else
2037 int r;
2038 shadow_l4e_t new_sl4e;
2039 /* No l3 shadow installed: find and install it. */
2040 *sl3mfn = get_shadow_status(v, gw->l3mfn, SH_type_l3_shadow);
2041 if ( !mfn_valid(*sl3mfn) )
2043 /* No l3 shadow of this page exists at all: make one. */
2044 *sl3mfn = sh_make_shadow(v, gw->l3mfn, SH_type_l3_shadow);
2046 /* Install the new sl3 table in the sl4e */
2047 l4e_propagate_from_guest(v, gw->l4e, *sl3mfn, &new_sl4e, ft);
2048 r = shadow_set_l4e(v, sl4e, new_sl4e, sl4mfn);
2049 ASSERT((r & SHADOW_SET_FLUSH) == 0);
2050 if ( r & SHADOW_SET_ERROR )
2051 return NULL;
2053 /* Now follow it down a level. Guaranteed to succeed. */
2054 return sh_linear_l3_table(v) + shadow_l3_linear_offset(gw->va);
2056 #endif /* GUEST_PAGING_LEVELS >= 4 */
2059 static shadow_l2e_t * shadow_get_and_create_l2e(struct vcpu *v,
2060 walk_t *gw,
2061 mfn_t *sl2mfn,
2062 fetch_type_t ft)
2064 #if GUEST_PAGING_LEVELS >= 4 /* 64bit... */
2065 mfn_t sl3mfn = _mfn(INVALID_MFN);
2066 shadow_l3e_t *sl3e;
2067 if ( !mfn_valid(gw->l2mfn) ) return NULL; /* No guest page. */
2068 /* Get the l3e */
2069 sl3e = shadow_get_and_create_l3e(v, gw, &sl3mfn, ft);
2070 if ( sl3e == NULL ) return NULL;
2071 if ( shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT )
2073 *sl2mfn = shadow_l3e_get_mfn(*sl3e);
2074 ASSERT(mfn_valid(*sl2mfn));
2076 else
2078 int r;
2079 shadow_l3e_t new_sl3e;
2080 unsigned int t = SH_type_l2_shadow;
2082 /* Tag compat L2 containing hypervisor (m2p) mappings */
2083 if ( is_pv_32on64_domain(v->domain) &&
2084 guest_l4_table_offset(gw->va) == 0 &&
2085 guest_l3_table_offset(gw->va) == 3 )
2086 t = SH_type_l2h_shadow;
2088 /* No l2 shadow installed: find and install it. */
2089 *sl2mfn = get_shadow_status(v, gw->l2mfn, t);
2090 if ( !mfn_valid(*sl2mfn) )
2092 /* No l2 shadow of this page exists at all: make one. */
2093 *sl2mfn = sh_make_shadow(v, gw->l2mfn, t);
2095 /* Install the new sl2 table in the sl3e */
2096 l3e_propagate_from_guest(v, gw->l3e, *sl2mfn, &new_sl3e, ft);
2097 r = shadow_set_l3e(v, sl3e, new_sl3e, sl3mfn);
2098 ASSERT((r & SHADOW_SET_FLUSH) == 0);
2099 if ( r & SHADOW_SET_ERROR )
2100 return NULL;
2102 /* Now follow it down a level. Guaranteed to succeed. */
2103 return sh_linear_l2_table(v) + shadow_l2_linear_offset(gw->va);
2104 #elif GUEST_PAGING_LEVELS == 3 /* PAE... */
2105 /* We never demand-shadow PAE l3es: they are only created in
2106 * sh_update_cr3(). Check if the relevant sl3e is present. */
2107 shadow_l3e_t *sl3e = ((shadow_l3e_t *)&v->arch.paging.shadow.l3table)
2108 + shadow_l3_linear_offset(gw->va);
2109 if ( !(shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT) )
2110 return NULL;
2111 *sl2mfn = shadow_l3e_get_mfn(*sl3e);
2112 ASSERT(mfn_valid(*sl2mfn));
2113 return sh_linear_l2_table(v) + shadow_l2_linear_offset(gw->va);
2114 #else /* 32bit... */
2115 /* There is always a shadow of the top level table. Get it. */
2116 *sl2mfn = pagetable_get_mfn(v->arch.shadow_table[0]);
2117 /* This next line is important: the guest l2 has a 16k
2118 * shadow, so we need to return the right mfn of the four. This
2119 * call will set it for us as a side-effect. */
2120 (void) shadow_l2_index(sl2mfn, guest_l2_table_offset(gw->va));
2121 /* Reading the top level table is always valid. */
2122 return sh_linear_l2_table(v) + shadow_l2_linear_offset(gw->va);
2123 #endif
2127 static shadow_l1e_t * shadow_get_and_create_l1e(struct vcpu *v,
2128 walk_t *gw,
2129 mfn_t *sl1mfn,
2130 fetch_type_t ft)
2132 mfn_t sl2mfn;
2133 shadow_l2e_t *sl2e;
2135 /* Get the l2e */
2136 sl2e = shadow_get_and_create_l2e(v, gw, &sl2mfn, ft);
2137 if ( sl2e == NULL ) return NULL;
2138 /* Install the sl1 in the l2e if it wasn't there or if we need to
2139 * re-do it to fix a PSE dirty bit. */
2140 if ( shadow_l2e_get_flags(*sl2e) & _PAGE_PRESENT
2141 && likely(ft != ft_demand_write
2142 || (shadow_l2e_get_flags(*sl2e) & _PAGE_RW)
2143 || !(guest_l2e_get_flags(gw->l2e) & _PAGE_PSE)) )
2145 *sl1mfn = shadow_l2e_get_mfn(*sl2e);
2146 ASSERT(mfn_valid(*sl1mfn));
2148 else
2150 shadow_l2e_t new_sl2e;
2151 int r, flags = guest_l2e_get_flags(gw->l2e);
2152 /* No l1 shadow installed: find and install it. */
2153 if ( !(flags & _PAGE_PRESENT) )
2154 return NULL; /* No guest page. */
2155 if ( guest_supports_superpages(v) && (flags & _PAGE_PSE) )
2157 /* Splintering a superpage */
2158 gfn_t l2gfn = guest_l2e_get_gfn(gw->l2e);
2159 *sl1mfn = get_fl1_shadow_status(v, l2gfn);
2160 if ( !mfn_valid(*sl1mfn) )
2162 /* No fl1 shadow of this superpage exists at all: make one. */
2163 *sl1mfn = make_fl1_shadow(v, l2gfn);
2166 else
2168 /* Shadowing an actual guest l1 table */
2169 if ( !mfn_valid(gw->l1mfn) ) return NULL; /* No guest page. */
2170 *sl1mfn = get_shadow_status(v, gw->l1mfn, SH_type_l1_shadow);
2171 if ( !mfn_valid(*sl1mfn) )
2173 /* No l1 shadow of this page exists at all: make one. */
2174 *sl1mfn = sh_make_shadow(v, gw->l1mfn, SH_type_l1_shadow);
2177 /* Install the new sl1 table in the sl2e */
2178 l2e_propagate_from_guest(v, gw->l2e, *sl1mfn, &new_sl2e, ft);
2179 r = shadow_set_l2e(v, sl2e, new_sl2e, sl2mfn);
2180 ASSERT((r & SHADOW_SET_FLUSH) == 0);
2181 if ( r & SHADOW_SET_ERROR )
2182 return NULL;
2183 /* This next line is important: in 32-on-PAE and 32-on-64 modes,
2184 * the guest l1 table has an 8k shadow, and we need to return
2185 * the right mfn of the pair. This call will set it for us as a
2186 * side-effect. (In all other cases, it's a no-op and will be
2187 * compiled out.) */
2188 (void) shadow_l1_index(sl1mfn, guest_l1_table_offset(gw->va));
2190 /* Now follow it down a level. Guaranteed to succeed. */
2191 return sh_linear_l1_table(v) + shadow_l1_linear_offset(gw->va);
2196 /**************************************************************************/
2197 /* Destructors for shadow tables:
2198 * Unregister the shadow, decrement refcounts of any entries present in it,
2199 * and release the memory.
2201 * N.B. These destructors do not clear the contents of the shadows.
2202 * This allows us to delay TLB shootdowns until the page is being reused.
2203 * See shadow_alloc() and shadow_free() for how this is handled.
2204 */
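/* All of the destructors below follow roughly the same shape; a sketch,
 * using the names that appear in the functions themselves:
 *
 *   gmfn = _mfn(mfn_to_shadow_page(smfn)->backpointer);
 *   delete_shadow_status(v, gmfn, t, smfn);   // unregister the shadow
 *   shadow_demote(v, gmfn, t);                // guest page no longer shadowed
 *   SHADOW_FOREACH_LNE(...)                   // drop refs of present entries
 *       sh_put_ref(v, ...);
 *   shadow_free(v->domain, smfn);             // return the page to the pool
 */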
2206 #if GUEST_PAGING_LEVELS >= 4
2207 void sh_destroy_l4_shadow(struct vcpu *v, mfn_t smfn)
2209 shadow_l4e_t *sl4e;
2210 u32 t = mfn_to_shadow_page(smfn)->type;
2211 mfn_t gmfn, sl4mfn;
2213 SHADOW_DEBUG(DESTROY_SHADOW,
2214 "%s(%05lx)\n", __func__, mfn_x(smfn));
2215 ASSERT(t == SH_type_l4_shadow);
2217 /* Record that the guest page isn't shadowed any more (in this type) */
2218 gmfn = _mfn(mfn_to_shadow_page(smfn)->backpointer);
2219 delete_shadow_status(v, gmfn, t, smfn);
2220 shadow_demote(v, gmfn, t);
2221 /* Decrement refcounts of all the old entries */
2222 sl4mfn = smfn;
2223 SHADOW_FOREACH_L4E(sl4mfn, sl4e, 0, 0, v->domain, {
2224 if ( shadow_l4e_get_flags(*sl4e) & _PAGE_PRESENT )
2226 sh_put_ref(v, shadow_l4e_get_mfn(*sl4e),
2227 (((paddr_t)mfn_x(sl4mfn)) << PAGE_SHIFT)
2228 | ((unsigned long)sl4e & ~PAGE_MASK));
2230 });
2232 /* Put the memory back in the pool */
2233 shadow_free(v->domain, smfn);
2236 void sh_destroy_l3_shadow(struct vcpu *v, mfn_t smfn)
2238 shadow_l3e_t *sl3e;
2239 u32 t = mfn_to_shadow_page(smfn)->type;
2240 mfn_t gmfn, sl3mfn;
2242 SHADOW_DEBUG(DESTROY_SHADOW,
2243 "%s(%05lx)\n", __func__, mfn_x(smfn));
2244 ASSERT(t == SH_type_l3_shadow);
2246 /* Record that the guest page isn't shadowed any more (in this type) */
2247 gmfn = _mfn(mfn_to_shadow_page(smfn)->backpointer);
2248 delete_shadow_status(v, gmfn, t, smfn);
2249 shadow_demote(v, gmfn, t);
2251 /* Decrement refcounts of all the old entries */
2252 sl3mfn = smfn;
2253 SHADOW_FOREACH_L3E(sl3mfn, sl3e, 0, 0, {
2254 if ( shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT )
2255 sh_put_ref(v, shadow_l3e_get_mfn(*sl3e),
2256 (((paddr_t)mfn_x(sl3mfn)) << PAGE_SHIFT)
2257 | ((unsigned long)sl3e & ~PAGE_MASK));
2258 });
2260 /* Put the memory back in the pool */
2261 shadow_free(v->domain, smfn);
2263 #endif /* GUEST_PAGING_LEVELS >= 4 */
2266 void sh_destroy_l2_shadow(struct vcpu *v, mfn_t smfn)
2268 shadow_l2e_t *sl2e;
2269 u32 t = mfn_to_shadow_page(smfn)->type;
2270 mfn_t gmfn, sl2mfn;
2272 SHADOW_DEBUG(DESTROY_SHADOW,
2273 "%s(%05lx)\n", __func__, mfn_x(smfn));
2275 #if GUEST_PAGING_LEVELS >= 3
2276 ASSERT(t == SH_type_l2_shadow || t == SH_type_l2h_shadow);
2277 #else
2278 ASSERT(t == SH_type_l2_shadow);
2279 #endif
2281 /* Record that the guest page isn't shadowed any more (in this type) */
2282 gmfn = _mfn(mfn_to_shadow_page(smfn)->backpointer);
2283 delete_shadow_status(v, gmfn, t, smfn);
2284 shadow_demote(v, gmfn, t);
2286 /* Decrement refcounts of all the old entries */
2287 sl2mfn = smfn;
2288 SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, 0, v->domain, {
2289 if ( shadow_l2e_get_flags(*sl2e) & _PAGE_PRESENT )
2290 sh_put_ref(v, shadow_l2e_get_mfn(*sl2e),
2291 (((paddr_t)mfn_x(sl2mfn)) << PAGE_SHIFT)
2292 | ((unsigned long)sl2e & ~PAGE_MASK));
2293 });
2295 /* Put the memory back in the pool */
2296 shadow_free(v->domain, smfn);
2299 void sh_destroy_l1_shadow(struct vcpu *v, mfn_t smfn)
2301 struct domain *d = v->domain;
2302 shadow_l1e_t *sl1e;
2303 u32 t = mfn_to_shadow_page(smfn)->type;
2305 SHADOW_DEBUG(DESTROY_SHADOW,
2306 "%s(%05lx)\n", __func__, mfn_x(smfn));
2307 ASSERT(t == SH_type_l1_shadow || t == SH_type_fl1_shadow);
2309 /* Record that the guest page isn't shadowed any more (in this type) */
2310 if ( t == SH_type_fl1_shadow )
2312 gfn_t gfn = _gfn(mfn_to_shadow_page(smfn)->backpointer);
2313 delete_fl1_shadow_status(v, gfn, smfn);
2315 else
2317 mfn_t gmfn = _mfn(mfn_to_shadow_page(smfn)->backpointer);
2318 delete_shadow_status(v, gmfn, t, smfn);
2319 shadow_demote(v, gmfn, t);
2322 if ( shadow_mode_refcounts(d) )
2324 /* Decrement refcounts of all the old entries */
2325 mfn_t sl1mfn = smfn;
2326 SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, 0, {
2327 if ( (shadow_l1e_get_flags(*sl1e) & _PAGE_PRESENT)
2328 && !sh_l1e_is_magic(*sl1e) ) {
2329 shadow_vram_put_l1e(*sl1e, sl1e, sl1mfn, d);
2330 shadow_put_page_from_l1e(*sl1e, d);
2332 });
2335 /* Put the memory back in the pool */
2336 shadow_free(v->domain, smfn);
2339 #if SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS
2340 void sh_destroy_monitor_table(struct vcpu *v, mfn_t mmfn)
2342 struct domain *d = v->domain;
2343 ASSERT(mfn_to_shadow_page(mmfn)->type == SH_type_monitor_table);
2345 #if (CONFIG_PAGING_LEVELS == 4) && (SHADOW_PAGING_LEVELS != 4)
2347 mfn_t m3mfn;
2348 l4_pgentry_t *l4e = sh_map_domain_page(mmfn);
2349 l3_pgentry_t *l3e;
2350 int linear_slot = shadow_l4_table_offset(SH_LINEAR_PT_VIRT_START);
2352 /* Need to destroy the l3 and l2 monitor pages used
2353 * for the linear map */
2354 ASSERT(l4e_get_flags(l4e[linear_slot]) & _PAGE_PRESENT);
2355 m3mfn = _mfn(l4e_get_pfn(l4e[linear_slot]));
2356 l3e = sh_map_domain_page(m3mfn);
2357 ASSERT(l3e_get_flags(l3e[0]) & _PAGE_PRESENT);
2358 shadow_free(d, _mfn(l3e_get_pfn(l3e[0])));
2359 sh_unmap_domain_page(l3e);
2360 shadow_free(d, m3mfn);
2362 if ( is_pv_32on64_vcpu(v) )
2364 /* Need to destroy the l3 and l2 monitor pages that map the
2365 * Xen VAs at 3GB-4GB */
2366 ASSERT(l4e_get_flags(l4e[0]) & _PAGE_PRESENT);
2367 m3mfn = _mfn(l4e_get_pfn(l4e[0]));
2368 l3e = sh_map_domain_page(m3mfn);
2369 ASSERT(l3e_get_flags(l3e[3]) & _PAGE_PRESENT);
2370 shadow_free(d, _mfn(l3e_get_pfn(l3e[3])));
2371 sh_unmap_domain_page(l3e);
2372 shadow_free(d, m3mfn);
2374 sh_unmap_domain_page(l4e);
2376 #elif CONFIG_PAGING_LEVELS == 3
2377 /* Need to destroy the l2 monitor page in slot 3 of the l3 too */
2379 l3_pgentry_t *l3e = sh_map_domain_page(mmfn);
2380 ASSERT(l3e_get_flags(l3e[3]) & _PAGE_PRESENT);
2381 shadow_free(d, _mfn(l3e_get_pfn(l3e[3])));
2382 sh_unmap_domain_page(l3e);
2384 #endif
2386 /* Put the memory back in the pool */
2387 shadow_free(d, mmfn);
2389 #endif
2391 /**************************************************************************/
2392 /* Functions to destroy non-Xen mappings in a pagetable hierarchy.
2393 * These are called from common code when we are running out of shadow
2394 * memory, and unpinning all the top-level shadows hasn't worked.
2396 * This implementation is pretty crude and slow, but we hope that it won't
2397 * be called very often. */
2399 #if GUEST_PAGING_LEVELS == 2
2401 void sh_unhook_32b_mappings(struct vcpu *v, mfn_t sl2mfn)
2403 shadow_l2e_t *sl2e;
2404 SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, 0, v->domain, {
2405 (void) shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn);
2406 });
2409 #elif GUEST_PAGING_LEVELS == 3
2411 void sh_unhook_pae_mappings(struct vcpu *v, mfn_t sl2mfn)
2412 /* Walk a PAE l2 shadow, unhooking entries from all the subshadows */
2414 shadow_l2e_t *sl2e;
2415 SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, 0, v->domain, {
2416 (void) shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn);
2417 });
2420 #elif GUEST_PAGING_LEVELS == 4
2422 void sh_unhook_64b_mappings(struct vcpu *v, mfn_t sl4mfn)
2424 shadow_l4e_t *sl4e;
2425 SHADOW_FOREACH_L4E(sl4mfn, sl4e, 0, 0, v->domain, {
2426 (void) shadow_set_l4e(v, sl4e, shadow_l4e_empty(), sl4mfn);
2427 });
2430 #endif
2432 /**************************************************************************/
2433 /* Internal translation functions.
2434 * These functions require a pointer to the shadow entry that will be updated.
2435 */
2437 /* These functions take a new guest entry, translate it to shadow and write
2438 * the shadow entry.
2440 * They return the same bitmaps as the shadow_set_lXe() functions.
2441 */
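/* The validate_glNe() functions below all follow the same pattern; a
 * sketch of the common steps:
 *   1. Read the guest entry that was just written.
 *   2. If it is present, look up the shadow of the page it points to
 *      with get_shadow_status(), flagging SHADOW_SET_ERROR if the gfn
 *      is not RAM.
 *   3. Build the matching shadow entry with lNe_propagate_from_guest().
 *   4. For PV guests (where applicable), refuse updates that land in
 *      Xen-reserved slots.
 *   5. Write it with shadow_set_lNe() and return the SHADOW_SET_* bits. */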
2443 #if GUEST_PAGING_LEVELS >= 4
2444 static int validate_gl4e(struct vcpu *v, void *new_ge, mfn_t sl4mfn, void *se)
2446 shadow_l4e_t new_sl4e;
2447 guest_l4e_t new_gl4e = *(guest_l4e_t *)new_ge;
2448 shadow_l4e_t *sl4p = se;
2449 mfn_t sl3mfn = _mfn(INVALID_MFN);
2450 struct domain *d = v->domain;
2451 p2m_type_t p2mt;
2452 int result = 0;
2454 perfc_incr(shadow_validate_gl4e_calls);
2456 if ( guest_l4e_get_flags(new_gl4e) & _PAGE_PRESENT )
2458 gfn_t gl3gfn = guest_l4e_get_gfn(new_gl4e);
2459 mfn_t gl3mfn = gfn_to_mfn(d, gl3gfn, &p2mt);
2460 if ( p2m_is_ram(p2mt) )
2461 sl3mfn = get_shadow_status(v, gl3mfn, SH_type_l3_shadow);
2462 else
2463 result |= SHADOW_SET_ERROR;
2465 l4e_propagate_from_guest(v, new_gl4e, sl3mfn, &new_sl4e, ft_prefetch);
2467 // check for updates to xen reserved slots
2468 if ( !shadow_mode_external(d) )
2470 int shadow_index = (((unsigned long)sl4p & ~PAGE_MASK) /
2471 sizeof(shadow_l4e_t));
2472 int reserved_xen_slot = !is_guest_l4_slot(d, shadow_index);
2474 if ( unlikely(reserved_xen_slot) )
2476 // attempt by the guest to write to a xen reserved slot
2477 //
2478 SHADOW_PRINTK("%s out-of-range update "
2479 "sl4mfn=%05lx index=0x%x val=%" SH_PRI_pte "\n",
2480 __func__, mfn_x(sl4mfn), shadow_index, new_sl4e.l4);
2481 if ( shadow_l4e_get_flags(new_sl4e) & _PAGE_PRESENT )
2483 SHADOW_ERROR("out-of-range l4e update\n");
2484 result |= SHADOW_SET_ERROR;
2487 // do not call shadow_set_l4e...
2488 return result;
2492 result |= shadow_set_l4e(v, sl4p, new_sl4e, sl4mfn);
2493 return result;
2497 static int validate_gl3e(struct vcpu *v, void *new_ge, mfn_t sl3mfn, void *se)
2499 shadow_l3e_t new_sl3e;
2500 guest_l3e_t new_gl3e = *(guest_l3e_t *)new_ge;
2501 shadow_l3e_t *sl3p = se;
2502 mfn_t sl2mfn = _mfn(INVALID_MFN);
2503 p2m_type_t p2mt;
2504 int result = 0;
2506 perfc_incr(shadow_validate_gl3e_calls);
2508 if ( guest_l3e_get_flags(new_gl3e) & _PAGE_PRESENT )
2510 gfn_t gl2gfn = guest_l3e_get_gfn(new_gl3e);
2511 mfn_t gl2mfn = gfn_to_mfn(v->domain, gl2gfn, &p2mt);
2512 if ( p2m_is_ram(p2mt) )
2513 sl2mfn = get_shadow_status(v, gl2mfn, SH_type_l2_shadow);
2514 else
2515 result |= SHADOW_SET_ERROR;
2517 l3e_propagate_from_guest(v, new_gl3e, sl2mfn, &new_sl3e, ft_prefetch);
2518 result |= shadow_set_l3e(v, sl3p, new_sl3e, sl3mfn);
2520 return result;
2522 #endif // GUEST_PAGING_LEVELS >= 4
2524 static int validate_gl2e(struct vcpu *v, void *new_ge, mfn_t sl2mfn, void *se)
2526 shadow_l2e_t new_sl2e;
2527 guest_l2e_t new_gl2e = *(guest_l2e_t *)new_ge;
2528 shadow_l2e_t *sl2p = se;
2529 mfn_t sl1mfn = _mfn(INVALID_MFN);
2530 p2m_type_t p2mt;
2531 int result = 0;
2533 perfc_incr(shadow_validate_gl2e_calls);
2535 if ( guest_l2e_get_flags(new_gl2e) & _PAGE_PRESENT )
2537 gfn_t gl1gfn = guest_l2e_get_gfn(new_gl2e);
2538 if ( guest_supports_superpages(v) &&
2539 (guest_l2e_get_flags(new_gl2e) & _PAGE_PSE) )
2541 // superpage -- need to look up the shadow L1 which holds the
2542 // splitters...
2543 sl1mfn = get_fl1_shadow_status(v, gl1gfn);
2544 #if 0
2545 // XXX - it's possible that we want to do some kind of prefetch
2546 // for superpage fl1's here, but this is *not* on the demand path,
2547 // so we'll hold off trying that for now...
2548 //
2549 if ( !mfn_valid(sl1mfn) )
2550 sl1mfn = make_fl1_shadow(v, gl1gfn);
2551 #endif
2553 else
2555 mfn_t gl1mfn = gfn_to_mfn(v->domain, gl1gfn, &p2mt);
2556 if ( p2m_is_ram(p2mt) )
2557 sl1mfn = get_shadow_status(v, gl1mfn, SH_type_l1_shadow);
2558 else
2559 result |= SHADOW_SET_ERROR;
2562 l2e_propagate_from_guest(v, new_gl2e, sl1mfn, &new_sl2e, ft_prefetch);
2564 // check for updates to xen reserved slots in PV guests...
2565 // XXX -- need to revisit this for PV 3-on-4 guests.
2566 //
2567 #if SHADOW_PAGING_LEVELS < 4
2568 #if CONFIG_PAGING_LEVELS == SHADOW_PAGING_LEVELS
2569 if ( !shadow_mode_external(v->domain) )
2571 int shadow_index = (((unsigned long)sl2p & ~PAGE_MASK) /
2572 sizeof(shadow_l2e_t));
2573 int reserved_xen_slot;
2575 #if SHADOW_PAGING_LEVELS == 3
2576 reserved_xen_slot =
2577 ((mfn_to_shadow_page(sl2mfn)->type == SH_type_l2h_pae_shadow) &&
2578 (shadow_index
2579 >= (L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1))));
2580 #else /* SHADOW_PAGING_LEVELS == 2 */
2581 reserved_xen_slot = (shadow_index >= L2_PAGETABLE_FIRST_XEN_SLOT);
2582 #endif
2584 if ( unlikely(reserved_xen_slot) )
2586 // attempt by the guest to write to a xen reserved slot
2587 //
2588 SHADOW_PRINTK("%s out-of-range update "
2589 "sl2mfn=%05lx index=0x%x val=%" SH_PRI_pte "\n",
2590 __func__, mfn_x(sl2mfn), shadow_index, new_sl2e.l2);
2591 if ( shadow_l2e_get_flags(new_sl2e) & _PAGE_PRESENT )
2593 SHADOW_ERROR("out-of-range l2e update\n");
2594 result |= SHADOW_SET_ERROR;
2597 // do not call shadow_set_l2e...
2598 return result;
2601 #endif /* CONFIG_PAGING_LEVELS == SHADOW_PAGING_LEVELS */
2602 #endif /* SHADOW_PAGING_LEVELS < 4 */
2604 result |= shadow_set_l2e(v, sl2p, new_sl2e, sl2mfn);
2606 return result;
2609 static int validate_gl1e(struct vcpu *v, void *new_ge, mfn_t sl1mfn, void *se)
2611 shadow_l1e_t new_sl1e;
2612 guest_l1e_t new_gl1e = *(guest_l1e_t *)new_ge;
2613 shadow_l1e_t *sl1p = se;
2614 gfn_t gfn;
2615 mfn_t gmfn;
2616 p2m_type_t p2mt;
2617 int result = 0;
2619 perfc_incr(shadow_validate_gl1e_calls);
2621 gfn = guest_l1e_get_gfn(new_gl1e);
2622 gmfn = gfn_to_mfn(v->domain, gfn, &p2mt);
2624 l1e_propagate_from_guest(v, new_gl1e, gmfn, &new_sl1e, ft_prefetch, p2mt);
2626 result |= shadow_set_l1e(v, sl1p, new_sl1e, sl1mfn);
2627 return result;
2631 /**************************************************************************/
2632 /* Functions which translate and install the shadows of arbitrary guest
2633 * entries that we have just seen the guest write. */
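/* sh_map_and_validate() below is the generic core: it rounds the written
 * byte range to whole guest entries, maps the relevant page of the shadow
 * (some shadows span more than one page, so shadow_index() may move us to
 * a different mfn as we go), and calls the per-level validate_ge()
 * callback once per touched entry, OR-ing the SHADOW_SET_* results. */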
2636 static inline int
2637 sh_map_and_validate(struct vcpu *v, mfn_t gmfn,
2638 void *new_gp, u32 size, u32 sh_type,
2639 u32 (*shadow_index)(mfn_t *smfn, u32 idx),
2640 int (*validate_ge)(struct vcpu *v, void *ge,
2641 mfn_t smfn, void *se))
2642 /* Generic function for mapping and validating. */
2644 mfn_t smfn, smfn2, map_mfn;
2645 shadow_l1e_t *sl1p;
2646 u32 shadow_idx, guest_idx;
2647 int result = 0;
2649 /* Align address and size to guest entry boundaries */
2650 size += (unsigned long)new_gp & (sizeof (guest_l1e_t) - 1);
2651 new_gp = (void *)((unsigned long)new_gp & ~(sizeof (guest_l1e_t) - 1));
2652 size = (size + sizeof (guest_l1e_t) - 1) & ~(sizeof (guest_l1e_t) - 1);
2653 ASSERT(size + (((unsigned long)new_gp) & ~PAGE_MASK) <= PAGE_SIZE);
2655 /* Map the shadow page */
2656 smfn = get_shadow_status(v, gmfn, sh_type);
2657 ASSERT(mfn_valid(smfn)); /* Otherwise we would not have been called */
2658 guest_idx = guest_index(new_gp);
2659 map_mfn = smfn;
2660 shadow_idx = shadow_index(&map_mfn, guest_idx);
2661 sl1p = map_shadow_page(map_mfn);
2663 /* Validate one entry at a time */
2664 while ( size )
2666 smfn2 = smfn;
2667 guest_idx = guest_index(new_gp);
2668 shadow_idx = shadow_index(&smfn2, guest_idx);
2669 if ( mfn_x(smfn2) != mfn_x(map_mfn) )
2671 /* We have moved to another page of the shadow */
2672 map_mfn = smfn2;
2673 unmap_shadow_page(sl1p);
2674 sl1p = map_shadow_page(map_mfn);
2676 result |= validate_ge(v,
2677 new_gp,
2678 map_mfn,
2679 &sl1p[shadow_idx]);
2680 size -= sizeof(guest_l1e_t);
2681 new_gp += sizeof(guest_l1e_t);
2683 unmap_shadow_page(sl1p);
2684 return result;
2688 int
2689 sh_map_and_validate_gl4e(struct vcpu *v, mfn_t gl4mfn,
2690 void *new_gl4p, u32 size)
2692 #if GUEST_PAGING_LEVELS >= 4
2693 return sh_map_and_validate(v, gl4mfn, new_gl4p, size,
2694 SH_type_l4_shadow,
2695 shadow_l4_index,
2696 validate_gl4e);
2697 #else // ! GUEST_PAGING_LEVELS >= 4
2698 SHADOW_ERROR("called in wrong paging mode!\n");
2699 BUG();
2700 return 0;
2701 #endif
2704 int
2705 sh_map_and_validate_gl3e(struct vcpu *v, mfn_t gl3mfn,
2706 void *new_gl3p, u32 size)
2708 #if GUEST_PAGING_LEVELS >= 4
2709 return sh_map_and_validate(v, gl3mfn, new_gl3p, size,
2710 SH_type_l3_shadow,
2711 shadow_l3_index,
2712 validate_gl3e);
2713 #else // ! GUEST_PAGING_LEVELS >= 4
2714 SHADOW_ERROR("called in wrong paging mode!\n");
2715 BUG();
2716 return 0;
2717 #endif
2720 int
2721 sh_map_and_validate_gl2e(struct vcpu *v, mfn_t gl2mfn,
2722 void *new_gl2p, u32 size)
2724 return sh_map_and_validate(v, gl2mfn, new_gl2p, size,
2725 SH_type_l2_shadow,
2726 shadow_l2_index,
2727 validate_gl2e);
2730 int
2731 sh_map_and_validate_gl2he(struct vcpu *v, mfn_t gl2mfn,
2732 void *new_gl2p, u32 size)
2734 #if GUEST_PAGING_LEVELS >= 3
2735 return sh_map_and_validate(v, gl2mfn, new_gl2p, size,
2736 SH_type_l2h_shadow,
2737 shadow_l2_index,
2738 validate_gl2e);
2739 #else /* Non-PAE guests don't have different kinds of l2 table */
2740 SHADOW_ERROR("called in wrong paging mode!\n");
2741 BUG();
2742 return 0;
2743 #endif
2746 int
2747 sh_map_and_validate_gl1e(struct vcpu *v, mfn_t gl1mfn,
2748 void *new_gl1p, u32 size)
2750 return sh_map_and_validate(v, gl1mfn, new_gl1p, size,
2751 SH_type_l1_shadow,
2752 shadow_l1_index,
2753 validate_gl1e);
2757 /**************************************************************************/
2758 /* Optimization: If we see two emulated writes of zeros to the same
2759 * page-table without another kind of page fault in between, we guess
2760 * that this is a batch of changes (for process destruction) and
2761 * unshadow the page so we don't take a pagefault on every entry. This
2762 * should also make finding writeable mappings of pagetables much
2763 * easier. */
2765 /* Look to see if this is the second emulated write in a row to this
2766 * page, and unshadow if it is */
2767 static inline void check_for_early_unshadow(struct vcpu *v, mfn_t gmfn)
2769 #if SHADOW_OPTIMIZATIONS & SHOPT_EARLY_UNSHADOW
2770 if ( v->arch.paging.shadow.last_emulated_mfn_for_unshadow == mfn_x(gmfn)
2771 && sh_mfn_is_a_page_table(gmfn) )
2773 perfc_incr(shadow_early_unshadow);
2774 sh_remove_shadows(v, gmfn, 1, 0 /* Fast, can fail to unshadow */ );
2776 v->arch.paging.shadow.last_emulated_mfn_for_unshadow = mfn_x(gmfn);
2777 #endif
2780 /* Stop counting towards early unshadows, as we've seen a real page fault */
2781 static inline void reset_early_unshadow(struct vcpu *v)
2783 #if SHADOW_OPTIMIZATIONS & SHOPT_EARLY_UNSHADOW
2784 v->arch.paging.shadow.last_emulated_mfn_for_unshadow = INVALID_MFN;
2785 #endif
2790 /**************************************************************************/
2791 /* Optimization: Prefetch multiple L1 entries. This is called after we have
2792 * demand-faulted a shadow l1e in the fault handler, to see if it's
2793 * worth fetching some more.
2794 */
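/* In outline, sh_prefetch() below walks forward from the faulting entry,
 * staying within the same shadow l1 page and stopping after at most
 * PREFETCH_DISTANCE entries; it bails out as soon as it meets a slot that
 * is already shadowed or a guest entry that would need another fault for
 * A/D-bit propagation anyway, and propagates the rest with ft_prefetch. */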
2796 #if SHADOW_OPTIMIZATIONS & SHOPT_PREFETCH
2798 /* XXX magic number */
2799 #define PREFETCH_DISTANCE 32
2801 static void sh_prefetch(struct vcpu *v, walk_t *gw,
2802 shadow_l1e_t *ptr_sl1e, mfn_t sl1mfn)
2804 int i, dist;
2805 gfn_t gfn;
2806 mfn_t gmfn;
2807 guest_l1e_t *gl1p = NULL, gl1e;
2808 shadow_l1e_t sl1e;
2809 u32 gflags;
2810 p2m_type_t p2mt;
2812 /* Prefetch no further than the end of the _shadow_ l1 MFN */
2813 dist = (PAGE_SIZE - ((unsigned long)ptr_sl1e & ~PAGE_MASK)) / sizeof sl1e;
2814 /* And no more than a maximum fetches-per-fault */
2815 if ( dist > PREFETCH_DISTANCE )
2816 dist = PREFETCH_DISTANCE;
2818 if ( mfn_valid(gw->l1mfn) )
2820 /* Normal guest page; grab the next guest entry */
2821 gl1p = sh_map_domain_page(gw->l1mfn);
2822 gl1p += guest_l1_table_offset(gw->va);
2825 for ( i = 1; i < dist ; i++ )
2827 /* No point in prefetching if there's already a shadow */
2828 if ( ptr_sl1e[i].l1 != 0 )
2829 break;
2831 if ( mfn_valid(gw->l1mfn) )
2833 /* Normal guest page; grab the next guest entry */
2834 gl1e = gl1p[i];
2835 /* Not worth continuing if we hit an entry that will need another
2836 * fault for A/D-bit propagation anyway */
2837 gflags = guest_l1e_get_flags(gl1e);
2838 if ( (gflags & _PAGE_PRESENT)
2839 && (!(gflags & _PAGE_ACCESSED)
2840 || ((gflags & _PAGE_RW) && !(gflags & _PAGE_DIRTY))) )
2841 break;
2843 else
2845 /* Fragmented superpage, unless we've been called wrongly */
2846 ASSERT(guest_l2e_get_flags(gw->l2e) & _PAGE_PSE);
2847 /* Increment the l1e's GFN by the right number of guest pages */
2848 gl1e = guest_l1e_from_gfn(
2849 _gfn(gfn_x(guest_l1e_get_gfn(gw->l1e)) + i),
2850 guest_l1e_get_flags(gw->l1e));
2853 /* Look at the gfn that the l1e is pointing at */
2854 gfn = guest_l1e_get_gfn(gl1e);
2855 gmfn = gfn_to_mfn(v->domain, gfn, &p2mt);
2857 /* Propagate the entry. */
2858 l1e_propagate_from_guest(v, gl1e, gmfn, &sl1e, ft_prefetch, p2mt);
2859 (void) shadow_set_l1e(v, ptr_sl1e + i, sl1e, sl1mfn);
2861 if ( gl1p != NULL )
2862 sh_unmap_domain_page(gl1p);
2865 #endif /* SHADOW_OPTIMIZATIONS & SHOPT_PREFETCH */
2868 /**************************************************************************/
2869 /* Entry points into the shadow code */
2871 /* Called from pagefault handler in Xen, and from the HVM trap handlers
2872 * for pagefaults. Returns 1 if this fault was an artefact of the
2873 * shadow code (and the guest should retry) or 0 if it is not (and the
2874 * fault should be handled elsewhere or passed to the guest). */
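/* A sketch of the possible outcomes of sh_page_fault() below, keyed by
 * the labels in its body:
 *   - fast paths: "magic" not-present / MMIO shadow entries are handled
 *     without taking the shadow lock;
 *   - done:       a shadow entry was fixed up; return EXCRET_fault_fixed
 *                 so the guest retries the access;
 *   - emulate:    the guest wrote to one of its own pagetables; emulate
 *                 the instruction and fix the shadows;
 *   - mmio:       hand the access off to the device model;
 *   - not_a_shadow_fault: return 0 and let the fault be handled elsewhere
 *     or passed to the guest. */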
2876 static int sh_page_fault(struct vcpu *v,
2877 unsigned long va,
2878 struct cpu_user_regs *regs)
2880 struct domain *d = v->domain;
2881 walk_t gw;
2882 gfn_t gfn;
2883 mfn_t gmfn, sl1mfn=_mfn(0);
2884 shadow_l1e_t sl1e, *ptr_sl1e;
2885 paddr_t gpa;
2886 struct sh_emulate_ctxt emul_ctxt;
2887 struct x86_emulate_ops *emul_ops;
2888 int r;
2889 fetch_type_t ft = 0;
2890 p2m_type_t p2mt;
2891 #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
2892 int fast_emul = 0;
2893 #endif
2895 SHADOW_PRINTK("d:v=%u:%u va=%#lx err=%u, rip=%lx\n",
2896 v->domain->domain_id, v->vcpu_id, va, regs->error_code,
2897 regs->rip);
2899 perfc_incr(shadow_fault);
2901 #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
2902 /* If the faulting frame was successfully emulated on the last shadow fault,
2903 * it is highly likely to need the same emulation action for this frame,
2904 * so try to emulate early to avoid lock acquisition.
2905 */
2906 if ( v->arch.paging.last_write_emul_ok
2907 && v->arch.paging.shadow.last_emulated_frame == (va >> PAGE_SHIFT) )
2909 /* Check whether the error code is 3 (write|present); otherwise fall back
2910 * to the normal path in case some validation is required
2911 */
2912 if ( regs->error_code == (PFEC_write_access | PFEC_page_present) )
2914 fast_emul = 1;
2915 gmfn = _mfn(v->arch.paging.shadow.last_emulated_mfn);
2916 perfc_incr(shadow_fault_fast_emulate);
2917 goto early_emulation;
2919 else
2920 v->arch.paging.last_write_emul_ok = 0;
2922 #endif
2924 //
2925 // XXX: Need to think about eventually mapping superpages directly in the
2926 // shadow (when possible), as opposed to splintering them into a
2927 // bunch of 4K maps.
2928 //
2930 #if (SHADOW_OPTIMIZATIONS & SHOPT_FAST_FAULT_PATH) && SHADOW_PAGING_LEVELS > 2
2931 if ( (regs->error_code & PFEC_reserved_bit) )
2933 /* The only reasons for reserved bits to be set in shadow entries
2934 * are the two "magic" shadow_l1e entries. */
2935 if ( likely((__copy_from_user(&sl1e,
2936 (sh_linear_l1_table(v)
2937 + shadow_l1_linear_offset(va)),
2938 sizeof(sl1e)) == 0)
2939 && sh_l1e_is_magic(sl1e)) )
2941 if ( sh_l1e_is_gnp(sl1e) )
2943 /* Not-present in a guest PT: pass to the guest as
2944 * a not-present fault (by flipping two bits). */
2945 ASSERT(regs->error_code & PFEC_page_present);
2946 regs->error_code ^= (PFEC_reserved_bit|PFEC_page_present);
2947 reset_early_unshadow(v);
2948 perfc_incr(shadow_fault_fast_gnp);
2949 SHADOW_PRINTK("fast path not-present\n");
2950 return 0;
2952 else
2954 /* Magic MMIO marker: extract gfn for MMIO address */
2955 ASSERT(sh_l1e_is_mmio(sl1e));
2956 gpa = (((paddr_t)(gfn_x(sh_l1e_mmio_get_gfn(sl1e))))
2957 << PAGE_SHIFT)
2958 | (va & ~PAGE_MASK);
2960 perfc_incr(shadow_fault_fast_mmio);
2961 SHADOW_PRINTK("fast path mmio %#"PRIpaddr"\n", gpa);
2962 reset_early_unshadow(v);
2963 return (handle_mmio_with_translation(va, gpa >> PAGE_SHIFT)
2964 ? EXCRET_fault_fixed : 0);
2966 else
2968 /* This should be exceptionally rare: another vcpu has fixed
2969 * the tables between the fault and our reading the l1e.
2970 * Retry and let the hardware give us the right fault next time. */
2971 perfc_incr(shadow_fault_fast_fail);
2972 SHADOW_PRINTK("fast path false alarm!\n");
2973 return EXCRET_fault_fixed;
2976 #endif /* SHOPT_FAST_FAULT_PATH */
2978 /* Detect if this page fault happened while we were already in Xen
2979 * doing a shadow operation. If that happens, the only thing we can
2980 * do is let Xen's normal fault handlers try to fix it. In any case,
2981 * a diagnostic trace of the fault will be more useful than
2982 * a BUG() when we try to take the lock again. */
2983 if ( unlikely(shadow_locked_by_me(d)) )
2985 SHADOW_ERROR("Recursive shadow fault: lock was taken by %s\n",
2986 d->arch.paging.shadow.locker_function);
2987 return 0;
2990 if ( guest_walk_tables(v, va, &gw, regs->error_code) != 0 )
2992 perfc_incr(shadow_fault_bail_real_fault);
2993 SHADOW_PRINTK("not a shadow fault\n");
2994 reset_early_unshadow(v);
2995 return 0;
2998 /* It's possible that the guest has put pagetables in memory that it has
2999 * already used for some special purpose (ioreq pages, or granted pages).
3000 * If that happens we'll have killed the guest already but it's still not
3001 * safe to propagate entries out of the guest PT so get out now. */
3002 if ( unlikely(d->is_shutting_down) )
3004 SHADOW_PRINTK("guest is shutting down\n");
3005 return 0;
3008 /* What kind of access are we dealing with? */
3009 ft = ((regs->error_code & PFEC_write_access)
3010 ? ft_demand_write : ft_demand_read);
3012 /* What mfn is the guest trying to access? */
3013 gfn = guest_l1e_get_gfn(gw.l1e);
3014 gmfn = gfn_to_mfn(d, gfn, &p2mt);
3016 if ( shadow_mode_refcounts(d) &&
3017 (!p2m_is_valid(p2mt) || (!p2m_is_mmio(p2mt) && !mfn_valid(gmfn))) )
3019 perfc_incr(shadow_fault_bail_bad_gfn);
3020 SHADOW_PRINTK("BAD gfn=%"SH_PRI_gfn" gmfn=%"PRI_mfn"\n",
3021 gfn_x(gfn), mfn_x(gmfn));
3022 reset_early_unshadow(v);
3023 return 0;
3026 #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
3027 /* Remember this successful VA->GFN translation for later. */
3028 vtlb_insert(v, va >> PAGE_SHIFT, gfn_x(gfn),
3029 regs->error_code | PFEC_page_present);
3030 #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */
3032 shadow_lock(d);
3034 if ( gw_remove_write_accesses(v, va, &gw) )
3036 /* Write permission removal is also a hint that other gwalks
3037 * overlapping with this one may be inconsistent
3038 */
3039 perfc_incr(shadow_rm_write_flush_tlb);
3040 atomic_inc(&d->arch.paging.shadow.gtable_dirty_version);
3041 flush_tlb_mask(d->domain_dirty_cpumask);
3044 if ( !shadow_check_gwalk(v, va, &gw) )
3046 perfc_incr(shadow_inconsistent_gwalk);
3047 shadow_unlock(d);
3048 return EXCRET_fault_fixed;
3051 shadow_audit_tables(v);
3052 sh_audit_gw(v, &gw);
3054 /* Make sure there is enough free shadow memory to build a chain of
3055 * shadow tables. (We never allocate a top-level shadow on this path,
3056 * only a 32b l1, pae l1, or 64b l3+2+1. Note that while
3057 * SH_type_l1_shadow isn't correct in the latter case, all page
3058 * tables are the same size there.) */
3059 shadow_prealloc(d,
3060 SH_type_l1_shadow,
3061 GUEST_PAGING_LEVELS < 4 ? 1 : GUEST_PAGING_LEVELS - 1);
3063 /* Acquire the shadow. This must happen before we figure out the rights
3064 * for the shadow entry, since we might promote a page here. */
3065 ptr_sl1e = shadow_get_and_create_l1e(v, &gw, &sl1mfn, ft);
3066 if ( unlikely(ptr_sl1e == NULL) )
3068 /* Couldn't get the sl1e! Since we know the guest entries
3069 * are OK, this can only have been caused by a failed
3070 * shadow_set_l*e(), which will have crashed the guest.
3071 * Get out of the fault handler immediately. */
3072 ASSERT(d->is_shutting_down);
3073 shadow_unlock(d);
3074 return 0;
3077 /* Calculate the shadow entry and write it */
3078 l1e_propagate_from_guest(v, gw.l1e, gmfn, &sl1e, ft, p2mt);
3079 r = shadow_set_l1e(v, ptr_sl1e, sl1e, sl1mfn);
3081 #if SHADOW_OPTIMIZATIONS & SHOPT_PREFETCH
3082 /* Prefetch some more shadow entries */
3083 sh_prefetch(v, &gw, ptr_sl1e, sl1mfn);
3084 #endif
3086 /* Need to emulate accesses to page tables */
3087 if ( sh_mfn_is_a_page_table(gmfn) )
3089 if ( ft == ft_demand_write )
3091 perfc_incr(shadow_fault_emulate_write);
3092 goto emulate;
3094 else if ( shadow_mode_trap_reads(d) && ft == ft_demand_read )
3096 perfc_incr(shadow_fault_emulate_read);
3097 goto emulate;
3101 /* Need to hand off device-model MMIO and writes to read-only
3102 * memory to the device model */
3103 if ( p2mt == p2m_mmio_dm
3104 || (p2mt == p2m_ram_ro && ft == ft_demand_write) )
3106 gpa = guest_walk_to_gpa(&gw);
3107 goto mmio;
3110 /* In HVM guests, we force CR0.WP always to be set, so that the
3111 * pagetables are always write-protected. If the guest thinks
3112 * CR0.WP is clear, we must emulate faulting supervisor writes to
3113 * allow the guest to write through read-only PTEs. Emulate if the
3114 * fault was a non-user write to a present page. */
3115 if ( is_hvm_domain(d)
3116 && unlikely(!hvm_wp_enabled(v))
3117 && regs->error_code == (PFEC_write_access|PFEC_page_present) )
3119 perfc_incr(shadow_fault_emulate_wp);
3120 goto emulate;
3123 perfc_incr(shadow_fault_fixed);
3124 d->arch.paging.log_dirty.fault_count++;
3125 reset_early_unshadow(v);
3127 done:
3128 sh_audit_gw(v, &gw);
3129 SHADOW_PRINTK("fixed\n");
3130 shadow_audit_tables(v);
3131 shadow_unlock(d);
3132 return EXCRET_fault_fixed;
3134 emulate:
3135 if ( !shadow_mode_refcounts(d) || !guest_mode(regs) )
3136 goto not_a_shadow_fault;
3138 /*
3139 * We do not emulate user writes. Instead we use them as a hint that the
3140 * page is no longer a page table. This behaviour differs from native, but
3141 * it seems very unlikely that any OS grants user access to page tables.
3142 */
3143 if ( (regs->error_code & PFEC_user_mode) )
3145 SHADOW_PRINTK("user-mode fault to PT, unshadowing mfn %#lx\n",
3146 mfn_x(gmfn));
3147 perfc_incr(shadow_fault_emulate_failed);
3148 sh_remove_shadows(v, gmfn, 0 /* thorough */, 1 /* must succeed */);
3149 goto done;
3152 /*
3153 * We don't need to hold the lock for the whole emulation; we will
3154 * take it again when we write to the pagetables.
3155 */
3156 sh_audit_gw(v, &gw);
3157 shadow_audit_tables(v);
3158 shadow_unlock(d);
3160 #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
3161 early_emulation:
3162 #endif
3163 if ( is_hvm_domain(d) )
3165 /*
3166 * If we are in the middle of injecting an exception or interrupt then
3167 * we should not emulate: it is not the instruction at %eip that caused
3168 * the fault. Furthermore it is almost certainly the case that the handler
3169 * stack is currently considered to be a page table, so we should
3170 * unshadow the faulting page before exiting.
3171 */
3172 if ( unlikely(hvm_event_pending(v)) )
3174 #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
3175 if ( fast_emul )
3177 perfc_incr(shadow_fault_fast_emulate_fail);
3178 v->arch.paging.last_write_emul_ok = 0;
3180 #endif
3181 gdprintk(XENLOG_DEBUG, "write to pagetable during event "
3182 "injection: cr2=%#lx, mfn=%#lx\n",
3183 va, mfn_x(gmfn));
3184 sh_remove_shadows(v, gmfn, 0 /* thorough */, 1 /* must succeed */);
3185 return EXCRET_fault_fixed;
3189 SHADOW_PRINTK("emulate: eip=%#lx esp=%#lx\n",
3190 (unsigned long)regs->eip, (unsigned long)regs->esp);
3192 emul_ops = shadow_init_emulation(&emul_ctxt, regs);
3194 r = x86_emulate(&emul_ctxt.ctxt, emul_ops);
3196 /*
3197 * NB. We do not unshadow on X86EMUL_EXCEPTION. It's not clear that it
3198 * would be a good unshadow hint. If we *do* decide to unshadow-on-fault
3199 * then it must be 'failable': we cannot require the unshadow to succeed.
3200 */
3201 if ( r == X86EMUL_UNHANDLEABLE )
3203 perfc_incr(shadow_fault_emulate_failed);
3204 #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
3205 if ( fast_emul )
3207 perfc_incr(shadow_fault_fast_emulate_fail);
3208 v->arch.paging.last_write_emul_ok = 0;
3210 #endif
3211 SHADOW_PRINTK("emulator failure, unshadowing mfn %#lx\n",
3212 mfn_x(gmfn));
3213 /* If this is actually a page table, then we have a bug, and need
3214 * to support more operations in the emulator. More likely,
3215 * though, this is a hint that this page should not be shadowed. */
3216 shadow_remove_all_shadows(v, gmfn);
3219 #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
3220 /* Record the successful emulation as a heuristic to accelerate the
3221 * next fault on the same frame. But be careful to verify that the
3222 * frame is still a page table: the unshadow triggered by write
3223 * emulation normally requires a re-sync with the guest page table
3224 * to recover r/w permission. Recording it incorrectly in that case
3225 * would cause unexpected extra shadow faults, because propagation
3226 * would be skipped.
3227 */
3228 if ( (r == X86EMUL_OKAY) && sh_mfn_is_a_page_table(gmfn) )
3230 if ( !fast_emul )
3232 v->arch.paging.shadow.last_emulated_frame = va >> PAGE_SHIFT;
3233 v->arch.paging.shadow.last_emulated_mfn = mfn_x(gmfn);
3234 v->arch.paging.last_write_emul_ok = 1;
3237 else if ( fast_emul )
3238 v->arch.paging.last_write_emul_ok = 0;
3239 #endif
3241 #if GUEST_PAGING_LEVELS == 3 /* PAE guest */
3242 if ( r == X86EMUL_OKAY ) {
3243 int i;
3244 /* Emulate up to four extra instructions in the hope of catching
3245 * the "second half" of a 64-bit pagetable write. */
3246 for ( i = 0 ; i < 4 ; i++ )
3248 shadow_continue_emulation(&emul_ctxt, regs);
3249 v->arch.paging.last_write_was_pt = 0;
3250 r = x86_emulate(&emul_ctxt.ctxt, emul_ops);
3251 if ( r == X86EMUL_OKAY )
3253 if ( v->arch.paging.last_write_was_pt )
3255 perfc_incr(shadow_em_ex_pt);
3256 break; /* Don't emulate past the other half of the write */
3258 else
3259 perfc_incr(shadow_em_ex_non_pt);
3261 else
3263 perfc_incr(shadow_em_ex_fail);
3264 break; /* Don't emulate again if we failed! */
3268 #endif /* PAE guest */
3270 SHADOW_PRINTK("emulated\n");
3271 return EXCRET_fault_fixed;
3273 mmio:
3274 if ( !guest_mode(regs) )
3275 goto not_a_shadow_fault;
3276 perfc_incr(shadow_fault_mmio);
3277 sh_audit_gw(v, &gw);
3278 SHADOW_PRINTK("mmio %#"PRIpaddr"\n", gpa);
3279 shadow_audit_tables(v);
3280 reset_early_unshadow(v);
3281 shadow_unlock(d);
3282 return (handle_mmio_with_translation(va, gpa >> PAGE_SHIFT)
3283 ? EXCRET_fault_fixed : 0);
3285 not_a_shadow_fault:
3286 sh_audit_gw(v, &gw);
3287 SHADOW_PRINTK("not a shadow fault\n");
3288 shadow_audit_tables(v);
3289 reset_early_unshadow(v);
3290 shadow_unlock(d);
3291 return 0;
3295 static int
3296 sh_invlpg(struct vcpu *v, unsigned long va)
3297 /* Called when the guest requests an invlpg. Returns 1 if the invlpg
3298 * instruction should be issued on the hardware, or 0 if it's safe not
3299 * to do so. */
3301 shadow_l2e_t sl2e;
3303 perfc_incr(shadow_invlpg);
3305 #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
3306 /* No longer safe to use cached gva->gfn translations */
3307 vtlb_flush(v);
3308 #endif
3310 #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
3311 v->arch.paging.last_write_emul_ok = 0;
3312 #endif
3314 /* First check that we can safely read the shadow l2e. On SMP/PAE linux,
3315 * as many as 6% of invlpg calls can hit here when we haven't shadowed
3316 * the l2 yet. */
3317 #if SHADOW_PAGING_LEVELS == 4
3319 shadow_l3e_t sl3e;
3320 if ( !(shadow_l4e_get_flags(
3321 sh_linear_l4_table(v)[shadow_l4_linear_offset(va)])
3322 & _PAGE_PRESENT) )
3323 return 0;
3324 /* This must still be a copy-from-user because we don't have the
3325 * shadow lock, and the higher-level shadows might disappear
3326 * under our feet. */
3327 if ( __copy_from_user(&sl3e, (sh_linear_l3_table(v)
3328 + shadow_l3_linear_offset(va)),
3329 sizeof (sl3e)) != 0 )
3331 perfc_incr(shadow_invlpg_fault);
3332 return 0;
3334 if ( !(shadow_l3e_get_flags(sl3e) & _PAGE_PRESENT) )
3335 return 0;
3337 #elif SHADOW_PAGING_LEVELS == 3
3338 if ( !(l3e_get_flags(v->arch.paging.shadow.l3table[shadow_l3_linear_offset(va)])
3339 & _PAGE_PRESENT) )
3340 // no need to flush anything if there's no SL2...
3341 return 0;
3342 #endif
3344 /* This must still be a copy-from-user because we don't have the shadow
3345 * lock, and the higher-level shadows might disappear under our feet. */
3346 if ( __copy_from_user(&sl2e,
3347 sh_linear_l2_table(v) + shadow_l2_linear_offset(va),
3348 sizeof (sl2e)) != 0 )
3350 perfc_incr(shadow_invlpg_fault);
3351 return 0;
3354 // If there's nothing shadowed for this particular sl2e, then
3355 // there is no need to do an invlpg, either...
3356 //
3357 if ( !(shadow_l2e_get_flags(sl2e) & _PAGE_PRESENT) )
3358 return 0;
3360 // Check to see if the SL2 is a splintered superpage...
3361 // If so, then we'll need to flush the entire TLB (because that's
3362 // easier than invalidating all of the individual 4K pages).
3363 //
3364 if ( mfn_to_shadow_page(shadow_l2e_get_mfn(sl2e))->type
3365 == SH_type_fl1_shadow )
3367 flush_tlb_local();
3368 return 0;
3371 return 1;
3375 static unsigned long
3376 sh_gva_to_gfn(struct vcpu *v, unsigned long va, uint32_t *pfec)
3377 /* Called to translate a guest virtual address to what the *guest*
3378 * pagetables would map it to. */
3380 walk_t gw;
3381 gfn_t gfn;
3383 #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
3384 /* Check the vTLB cache first */
3385 unsigned long vtlb_gfn = vtlb_lookup(v, va, pfec[0]);
3386 if ( VALID_GFN(vtlb_gfn) )
3387 return vtlb_gfn;
3388 #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */
3390 if ( guest_walk_tables(v, va, &gw, pfec[0]) != 0 )
3392 if ( !(guest_l1e_get_flags(gw.l1e) & _PAGE_PRESENT) )
3393 pfec[0] &= ~PFEC_page_present;
3394 return INVALID_GFN;
3396 gfn = guest_walk_to_gfn(&gw);
3398 #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
3399 /* Remember this successful VA->GFN translation for later. */
3400 vtlb_insert(v, va >> PAGE_SHIFT, gfn_x(gfn), pfec[0]);
3401 #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */
3403 return gfn_x(gfn);
3407 static inline void
3408 sh_update_linear_entries(struct vcpu *v)
3409 /* Sync up all the linear mappings for this vcpu's pagetables */
3411 struct domain *d = v->domain;
3413 /* Linear pagetables in PV guests
3414 * ------------------------------
3416 * Guest linear pagetables, which map the guest pages, are at
3417 * LINEAR_PT_VIRT_START. Shadow linear pagetables, which map the
3418 * shadows, are at SH_LINEAR_PT_VIRT_START. Most of the time these
3419 * are set up at shadow creation time, but (of course!) the PAE case
3420 * is subtler. Normal linear mappings are made by having an entry
3421 * in the top-level table that points to itself (shadow linear) or
3422 * to the guest top-level table (guest linear). For PAE, to set up
3423 * a linear map requires us to copy the four top-level entries into
3424 * level-2 entries. That means that every time we change a PAE l3e,
3425 * we need to reflect the change into the copy.
3427 * Linear pagetables in HVM guests
3428 * -------------------------------
3430 * For HVM guests, the linear pagetables are installed in the monitor
3431 * tables (since we can't put them in the shadow). Shadow linear
3432 * pagetables, which map the shadows, are at SH_LINEAR_PT_VIRT_START,
3433 * and we use the linear pagetable slot at LINEAR_PT_VIRT_START for
3434 * a linear pagetable of the monitor tables themselves. We have
3435 * the same issue of having to re-copy PAE l3 entries whenever we use
3436 * PAE shadows.
3438 * Because HVM guests run on the same monitor tables regardless of the
3439 * shadow tables in use, the linear mapping of the shadow tables has to
3440 * be updated every time v->arch.shadow_table changes.
3441 */
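    /* As a concrete example of the "points to itself" case described above,
     * sh_install_xen_entries_in_l4() earlier in this file sets up the
     * 4-level shadow linear map with a single self-referencing entry:
     *
     *   sl4e[shadow_l4_table_offset(SH_LINEAR_PT_VIRT_START)] =
     *       shadow_l4e_from_mfn(sl4mfn, __PAGE_HYPERVISOR);
     *
     * PAE has no single top-level page to point back at, which is why the
     * cases below have to copy l3 entries into l2 slots by hand. */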
3443 /* Don't try to update the monitor table if it doesn't exist */
3444 if ( shadow_mode_external(d)
3445 && pagetable_get_pfn(v->arch.monitor_table) == 0 )
3446 return;
3448 #if (CONFIG_PAGING_LEVELS == 4) && (SHADOW_PAGING_LEVELS == 4)
3450 /* For PV, one l4e points at the guest l4, one points at the shadow
3451 * l4. No maintenance required.
3452 * For HVM, just need to update the l4e that points to the shadow l4. */
3454 if ( shadow_mode_external(d) )
3456 /* Use the linear map if we can; otherwise make a new mapping */
3457 if ( v == current )
3459 __linear_l4_table[l4_linear_offset(SH_LINEAR_PT_VIRT_START)] =
3460 l4e_from_pfn(pagetable_get_pfn(v->arch.shadow_table[0]),
3461 __PAGE_HYPERVISOR);
3463 else
3465 l4_pgentry_t *ml4e;
3466 ml4e = sh_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
3467 ml4e[l4_table_offset(SH_LINEAR_PT_VIRT_START)] =
3468 l4e_from_pfn(pagetable_get_pfn(v->arch.shadow_table[0]),
3469 __PAGE_HYPERVISOR);
3470 sh_unmap_domain_page(ml4e);
3474 #elif (CONFIG_PAGING_LEVELS == 4) && (SHADOW_PAGING_LEVELS == 3)
3476 /* PV: XXX
3478 * HVM: To give ourselves a linear map of the shadows, we need to
3479 * extend a PAE shadow to 4 levels. We do this by having a monitor
3480 * l3 in slot 0 of the monitor l4 table, and copying the PAE l3
3481 * entries into it. Then, by having the monitor l4e for shadow
3482 * pagetables also point to the monitor l4, we can use it to access
3483 * the shadows.
3484 */
3486 if ( shadow_mode_external(d) )
3488 /* Install copies of the shadow l3es into the monitor l2 table
3489 * that maps SH_LINEAR_PT_VIRT_START. */
3490 shadow_l3e_t *sl3e;
3491 l2_pgentry_t *ml2e;
3492 int i;
3494 /* Use linear mappings if we can; otherwise make new mappings */
3495 if ( v == current )
3496 ml2e = __linear_l2_table
3497 + l2_linear_offset(SH_LINEAR_PT_VIRT_START);
3498 else
3500 mfn_t l3mfn, l2mfn;
3501 l4_pgentry_t *ml4e;
3502 l3_pgentry_t *ml3e;
3503 int linear_slot = shadow_l4_table_offset(SH_LINEAR_PT_VIRT_START);
3504 ml4e = sh_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
3506 ASSERT(l4e_get_flags(ml4e[linear_slot]) & _PAGE_PRESENT);
3507 l3mfn = _mfn(l4e_get_pfn(ml4e[linear_slot]));
3508 ml3e = sh_map_domain_page(l3mfn);
3509 sh_unmap_domain_page(ml4e);
3511 ASSERT(l3e_get_flags(ml3e[0]) & _PAGE_PRESENT);
3512 l2mfn = _mfn(l3e_get_pfn(ml3e[0]));
3513 ml2e = sh_map_domain_page(l2mfn);
3514 sh_unmap_domain_page(ml3e);
3517 /* Shadow l3 tables are made up by sh_update_cr3 */
3518 sl3e = v->arch.paging.shadow.l3table;
3520 for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
3522 ml2e[i] =
3523 (shadow_l3e_get_flags(sl3e[i]) & _PAGE_PRESENT)
3524 ? l2e_from_pfn(mfn_x(shadow_l3e_get_mfn(sl3e[i])),
3525 __PAGE_HYPERVISOR)
3526 : l2e_empty();
3529 if ( v != current )
3530 sh_unmap_domain_page(ml2e);
3532 else
3533 domain_crash(d); /* XXX */
3535 #elif CONFIG_PAGING_LEVELS == 3
3537 /* PV: need to copy the guest's l3 entries into the guest-linear-map l2
3538 * entries in the shadow, and the shadow's l3 entries into the
3539 * shadow-linear-map l2 entries in the shadow. This is safe to do
3540 * because Xen does not let guests share high-slot l2 tables between l3s,
3541 * so we know we're not treading on anyone's toes.
3543 * HVM: need to copy the shadow's l3 entries into the
3544 * shadow-linear-map l2 entries in the monitor table. This is safe
3545 * because we have one monitor table for each vcpu. The monitor's
3546 * own l3es don't need to be copied because they never change.
3547 * XXX That might change if we start stuffing things into the rest
3548 * of the monitor's virtual address space.
3549 */
3551 l2_pgentry_t *l2e, new_l2e;
3552 shadow_l3e_t *guest_l3e = NULL, *shadow_l3e;
3553 int i;
3554 int unmap_l2e = 0;
3556 #if GUEST_PAGING_LEVELS == 2
3558 /* Shadow l3 tables were built by sh_update_cr3 */
3559 BUG_ON(!shadow_mode_external(d)); /* PV 2-on-3 is unsupported */
3560 shadow_l3e = (shadow_l3e_t *)&v->arch.paging.shadow.l3table;
3562 #else /* GUEST_PAGING_LEVELS == 3 */
3564 shadow_l3e = (shadow_l3e_t *)&v->arch.paging.shadow.l3table;
3565 guest_l3e = (guest_l3e_t *)&v->arch.paging.shadow.gl3e;
3567 #endif /* GUEST_PAGING_LEVELS */
3569 /* Choose where to write the entries, using linear maps if possible */
3570 if ( shadow_mode_external(d) )
3572 if ( v == current )
3574 /* From the monitor tables, it's safe to use linear maps
3575 * to update monitor l2s */
3576 l2e = __linear_l2_table + (3 * L2_PAGETABLE_ENTRIES);
3578 else
3580 /* Map the monitor table's high l2 */
3581 l3_pgentry_t *l3e;
3582 l3e = sh_map_domain_page(
3583 pagetable_get_mfn(v->arch.monitor_table));
3584 ASSERT(l3e_get_flags(l3e[3]) & _PAGE_PRESENT);
3585 l2e = sh_map_domain_page(_mfn(l3e_get_pfn(l3e[3])));
3586 unmap_l2e = 1;
3587 sh_unmap_domain_page(l3e);
3590 else
3592 /* Map the shadow table's high l2 */
3593 ASSERT(shadow_l3e_get_flags(shadow_l3e[3]) & _PAGE_PRESENT);
3594 l2e = sh_map_domain_page(shadow_l3e_get_mfn(shadow_l3e[3]));
3595 unmap_l2e = 1;
3598 /* Write linear mapping of guest (only in PV, and only when
3599 * not translated). */
3600 if ( !shadow_mode_translate(d) )
3602 for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
3604 new_l2e =
3605 ((shadow_l3e_get_flags(guest_l3e[i]) & _PAGE_PRESENT)
3606 ? l2e_from_pfn(mfn_x(shadow_l3e_get_mfn(guest_l3e[i])),
3607 __PAGE_HYPERVISOR)
3608 : l2e_empty());
3609 safe_write_entry(
3610 &l2e[l2_table_offset(LINEAR_PT_VIRT_START) + i],
3611 &new_l2e);
3615 /* Write linear mapping of shadow. */
3616 for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
3618 new_l2e = (shadow_l3e_get_flags(shadow_l3e[i]) & _PAGE_PRESENT)
3619 ? l2e_from_pfn(mfn_x(shadow_l3e_get_mfn(shadow_l3e[i])),
3620 __PAGE_HYPERVISOR)
3621 : l2e_empty();
3622 safe_write_entry(
3623 &l2e[l2_table_offset(SH_LINEAR_PT_VIRT_START) + i],
3624 &new_l2e);
3627 if ( unmap_l2e )
3628 sh_unmap_domain_page(l2e);
3631 #elif CONFIG_PAGING_LEVELS == 2
3633 /* For PV, one l2e points at the guest l2, one points at the shadow
3634 * l2. No maintenance required.
3635 * For HVM, just need to update the l2e that points to the shadow l2. */
3637 if ( shadow_mode_external(d) )
3639 /* Use the linear map if we can; otherwise make a new mapping */
3640 if ( v == current )
3642 __linear_l2_table[l2_linear_offset(SH_LINEAR_PT_VIRT_START)] =
3643 l2e_from_pfn(pagetable_get_pfn(v->arch.shadow_table[0]),
3644 __PAGE_HYPERVISOR);
3646 else
3648 l2_pgentry_t *ml2e;
3649 ml2e = sh_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
3650 ml2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
3651 l2e_from_pfn(pagetable_get_pfn(v->arch.shadow_table[0]),
3652 __PAGE_HYPERVISOR);
3653 sh_unmap_domain_page(ml2e);
3657 #else
3658 #error this should not happen
3659 #endif
3661 if ( shadow_mode_external(d) )
3663 /*
3664 * Having modified the linear pagetable mapping, flush local host TLBs.
3665 * This was not needed when vmenter/vmexit always had the side effect
3666 * of flushing host TLBs but, with ASIDs, it is possible to finish
3667 * this CR3 update, vmenter the guest, vmexit due to a page fault,
3668 * without an intervening host TLB flush. Then the page fault code
3669 * could use the linear pagetable to read a top-level shadow page
3670 * table entry. But, without this change, it would fetch the wrong
3671 * value due to a stale TLB.
3672 */
3673 flush_tlb_local();
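
[Editor's note] The comment at the top of sh_update_linear_entries explains that PAE cannot express a linear map with a single self-pointing top-level entry, so the four top-level (l3) entries have to be copied into four level-2 slots and re-copied whenever an l3e changes. The toy model below illustrates just that copy step under simplified assumptions (entries are bare 64-bit words, only a present bit is modelled); the names are hypothetical.

/* Illustrative sketch, not part of multi.c: refresh four linear-map slots
 * of a level-2 table from a PAE level-3 table.  Would be re-run whenever
 * one of the four l3 entries changes. */
#include <stdint.h>

#define TOY_PRESENT 0x1ULL

static void toy_sync_pae_linear_slots(uint64_t l2_slots[4],
                                      const uint64_t l3_entries[4])
{
    for ( int i = 0; i < 4; i++ )
        l2_slots[i] = (l3_entries[i] & TOY_PRESENT) ? l3_entries[i] : 0;
}
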
3678 /* Removes vcpu->arch.paging.shadow.guest_vtable and vcpu->arch.shadow_table[].
3679 * Does all appropriate management/bookkeeping/refcounting/etc...
3680 */
3681 static void
3682 sh_detach_old_tables(struct vcpu *v)
3684 mfn_t smfn;
3685 int i = 0;
3687 ////
3688 //// vcpu->arch.paging.shadow.guest_vtable
3689 ////
3691 #if GUEST_PAGING_LEVELS == 3
3692 /* PAE guests don't have a mapping of the guest top-level table */
3693 ASSERT(v->arch.paging.shadow.guest_vtable == NULL);
3694 #else
3695 if ( v->arch.paging.shadow.guest_vtable )
3697 struct domain *d = v->domain;
3698 if ( shadow_mode_external(d) || shadow_mode_translate(d) )
3699 sh_unmap_domain_page_global(v->arch.paging.shadow.guest_vtable);
3700 v->arch.paging.shadow.guest_vtable = NULL;
3702 #endif
3705 ////
3706 //// vcpu->arch.shadow_table[]
3707 ////
3709 #if GUEST_PAGING_LEVELS == 3
3710 /* PAE guests have four shadow_table entries */
3711 for ( i = 0 ; i < 4 ; i++ )
3712 #endif
3714 smfn = pagetable_get_mfn(v->arch.shadow_table[i]);
3715 if ( mfn_x(smfn) )
3716 sh_put_ref(v, smfn, 0);
3717 v->arch.shadow_table[i] = pagetable_null();
3721 /* Set up the top-level shadow and install it in slot 'slot' of shadow_table */
3722 static void
3723 sh_set_toplevel_shadow(struct vcpu *v,
3724 int slot,
3725 mfn_t gmfn,
3726 unsigned int root_type)
3728 mfn_t smfn;
3729 pagetable_t old_entry, new_entry;
3731 struct domain *d = v->domain;
3733 /* Remember the old contents of this slot */
3734 old_entry = v->arch.shadow_table[slot];
3736 /* Now figure out the new contents: is this a valid guest MFN? */
3737 if ( !mfn_valid(gmfn) )
3739 new_entry = pagetable_null();
3740 goto install_new_entry;
3743 /* Guest mfn is valid: shadow it and install the shadow */
3744 smfn = get_shadow_status(v, gmfn, root_type);
3745 if ( !mfn_valid(smfn) )
3747 /* Make sure there's enough free shadow memory. */
3748 shadow_prealloc(d, root_type, 1);
3749 /* Shadow the page. */
3750 smfn = sh_make_shadow(v, gmfn, root_type);
3752 ASSERT(mfn_valid(smfn));
3754 /* Pin the shadow and put it (back) on the list of pinned shadows */
3755 if ( sh_pin(v, smfn) == 0 )
3757 SHADOW_ERROR("can't pin %#lx as toplevel shadow\n", mfn_x(smfn));
3758 domain_crash(v->domain);
3761 /* Take a ref to this page: it will be released in sh_detach_old_tables()
3762 * or the next call to set_toplevel_shadow() */
3763 if ( !sh_get_ref(v, smfn, 0) )
3765 SHADOW_ERROR("can't install %#lx as toplevel shadow\n", mfn_x(smfn));
3766 domain_crash(v->domain);
3769 new_entry = pagetable_from_mfn(smfn);
3771 install_new_entry:
3772 /* Done. Install it */
3773 SHADOW_PRINTK("%u/%u [%u] gmfn %#"PRI_mfn" smfn %#"PRI_mfn"\n",
3774 GUEST_PAGING_LEVELS, SHADOW_PAGING_LEVELS, slot,
3775 mfn_x(gmfn), mfn_x(pagetable_get_mfn(new_entry)));
3776 v->arch.shadow_table[slot] = new_entry;
3778 /* Decrement the refcount of the old contents of this slot */
3779 if ( !pagetable_is_null(old_entry) ) {
3780 mfn_t old_smfn = pagetable_get_mfn(old_entry);
3781 /* Need to repin the old toplevel shadow if it's been unpinned
3782 * by shadow_prealloc(): in PV mode we're still running on this
3783 * shadow and it's not safe to free it yet. */
3784 if ( !mfn_to_shadow_page(old_smfn)->pinned && !sh_pin(v, old_smfn) )
3786 SHADOW_ERROR("can't re-pin %#lx\n", mfn_x(old_smfn));
3787 domain_crash(v->domain);
3789 sh_put_ref(v, old_smfn, 0);
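
[Editor's note] sh_set_toplevel_shadow takes a reference on the new shadow before dropping the reference held by the old contents of the slot, so the old table cannot be freed while it might still be in use. The sketch below shows only that ordering, with hypothetical types and helpers.

/* Illustrative sketch, not part of multi.c: replace the contents of a
 * refcounted slot.  Ref the new object first, install it, then release
 * the old one. */
#include <stddef.h>

struct toy_obj { int refcount; };

static void toy_get_ref(struct toy_obj *o) { if ( o ) o->refcount++; }
static void toy_put_ref(struct toy_obj *o) { if ( o ) o->refcount--; /* free at 0 */ }

static void toy_set_slot(struct toy_obj **slot, struct toy_obj *new_obj)
{
    struct toy_obj *old = *slot;
    toy_get_ref(new_obj);   /* ref the new contents first... */
    *slot = new_obj;        /* ...install them... */
    toy_put_ref(old);       /* ...then drop the ref on the old contents. */
}
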
3794 static void
3795 sh_update_cr3(struct vcpu *v, int do_locking)
3796 /* Updates vcpu->arch.cr3 after the guest has changed CR3.
3797 * Paravirtual guests should set v->arch.guest_table (and guest_table_user,
3798 * if appropriate).
3799 * HVM guests should also make sure hvm_get_guest_cntl_reg(v, 3) works;
3800 * this function will call hvm_update_guest_cr(v, 3) to tell them where the
3801 * shadow tables are.
3802 * If do_locking != 0, assume we are being called from outside the
3803 * shadow code, and must take and release the shadow lock; otherwise
3804 * that is the caller's responsibility.
3805 */
3807 struct domain *d = v->domain;
3808 mfn_t gmfn;
3809 #if GUEST_PAGING_LEVELS == 3
3810 guest_l3e_t *gl3e;
3811 u32 guest_idx=0;
3812 int i;
3813 #endif
3815 /* Don't do anything on an uninitialised vcpu */
3816 if ( !is_hvm_domain(d) && !v->is_initialised )
3818 ASSERT(v->arch.cr3 == 0);
3819 return;
3822 if ( do_locking ) shadow_lock(v->domain);
3824 ASSERT(shadow_locked_by_me(v->domain));
3825 ASSERT(v->arch.paging.mode);
3827 ////
3828 //// vcpu->arch.guest_table is already set
3829 ////
3831 #ifndef NDEBUG
3832 /* Double-check that the HVM code has sent us a sane guest_table */
3833 if ( is_hvm_domain(d) )
3835 ASSERT(shadow_mode_external(d));
3836 if ( hvm_paging_enabled(v) )
3837 ASSERT(pagetable_get_pfn(v->arch.guest_table));
3838 else
3839 ASSERT(v->arch.guest_table.pfn
3840 == d->arch.paging.shadow.unpaged_pagetable.pfn);
3842 #endif
3844 SHADOW_PRINTK("d=%u v=%u guest_table=%05lx\n",
3845 d->domain_id, v->vcpu_id,
3846 (unsigned long)pagetable_get_pfn(v->arch.guest_table));
3848 #if GUEST_PAGING_LEVELS == 4
3849 if ( !(v->arch.flags & TF_kernel_mode) && !is_pv_32on64_vcpu(v) )
3850 gmfn = pagetable_get_mfn(v->arch.guest_table_user);
3851 else
3852 #endif
3853 gmfn = pagetable_get_mfn(v->arch.guest_table);
3856 ////
3857 //// vcpu->arch.paging.shadow.guest_vtable
3858 ////
3859 #if GUEST_PAGING_LEVELS == 4
3860 if ( shadow_mode_external(d) || shadow_mode_translate(d) )
3862 if ( v->arch.paging.shadow.guest_vtable )
3863 sh_unmap_domain_page_global(v->arch.paging.shadow.guest_vtable);
3864 v->arch.paging.shadow.guest_vtable = sh_map_domain_page_global(gmfn);
3865 /* PAGING_LEVELS==4 implies 64-bit, which means that
3866 * map_domain_page_global can't fail */
3867 BUG_ON(v->arch.paging.shadow.guest_vtable == NULL);
3869 else
3870 v->arch.paging.shadow.guest_vtable = __linear_l4_table;
3871 #elif GUEST_PAGING_LEVELS == 3
3872 /* On PAE guests we don't use a mapping of the guest's own top-level
3873 * table. We cache the current state of that table and shadow that,
3874 * until the next CR3 write makes us refresh our cache. */
3875 ASSERT(v->arch.paging.shadow.guest_vtable == NULL);
3877 if ( shadow_mode_external(d) )
3878 /* Find where in the page the l3 table is */
3879 guest_idx = guest_index((void *)v->arch.hvm_vcpu.guest_cr[3]);
3880 else
3881 /* PV guest: l3 is at the start of a page */
3882 guest_idx = 0;
3884 // Ignore the low 2 bits of guest_idx -- they are really just
3885 // cache control.
3886 guest_idx &= ~3;
3888 gl3e = ((guest_l3e_t *)sh_map_domain_page(gmfn)) + guest_idx;
3889 for ( i = 0; i < 4 ; i++ )
3890 v->arch.paging.shadow.gl3e[i] = gl3e[i];
3891 sh_unmap_domain_page(gl3e);
3892 #elif GUEST_PAGING_LEVELS == 2
3893 if ( shadow_mode_external(d) || shadow_mode_translate(d) )
3895 if ( v->arch.paging.shadow.guest_vtable )
3896 sh_unmap_domain_page_global(v->arch.paging.shadow.guest_vtable);
3897 v->arch.paging.shadow.guest_vtable = sh_map_domain_page_global(gmfn);
3898 /* Does this really need map_domain_page_global? Handle the
3899 * error properly if so. */
3900 BUG_ON(v->arch.paging.shadow.guest_vtable == NULL); /* XXX */
3902 else
3903 v->arch.paging.shadow.guest_vtable = __linear_l2_table;
3904 #else
3905 #error this should never happen
3906 #endif
3908 #if 0
3909 printk("%s %s %d gmfn=%05lx shadow.guest_vtable=%p\n",
3910 __func__, __FILE__, __LINE__, gmfn, v->arch.paging.shadow.guest_vtable);
3911 #endif
3913 ////
3914 //// vcpu->arch.shadow_table[]
3915 ////
3917 /* We revoke write access to the new guest toplevel page(s) before we
3918 * replace the old shadow pagetable(s), so that we can safely use the
3919 * (old) shadow linear maps in the writeable mapping heuristics. */
3920 #if GUEST_PAGING_LEVELS == 2
3921 if ( sh_remove_write_access(v, gmfn, 2, 0) != 0 )
3922 flush_tlb_mask(v->domain->domain_dirty_cpumask);
3923 sh_set_toplevel_shadow(v, 0, gmfn, SH_type_l2_shadow);
3924 #elif GUEST_PAGING_LEVELS == 3
3925 /* PAE guests have four shadow_table entries, based on the
3926 * current values of the guest's four l3es. */
3928 int flush = 0;
3929 gfn_t gl2gfn;
3930 mfn_t gl2mfn;
3931 p2m_type_t p2mt;
3932 guest_l3e_t *gl3e = (guest_l3e_t*)&v->arch.paging.shadow.gl3e;
3933 /* First, make all four entries read-only. */
3934 for ( i = 0; i < 4; i++ )
3936 if ( guest_l3e_get_flags(gl3e[i]) & _PAGE_PRESENT )
3938 gl2gfn = guest_l3e_get_gfn(gl3e[i]);
3939 gl2mfn = gfn_to_mfn(d, gl2gfn, &p2mt);
3940 if ( p2m_is_ram(p2mt) )
3941 flush |= sh_remove_write_access(v, gl2mfn, 2, 0);
3944 if ( flush )
3945 flush_tlb_mask(v->domain->domain_dirty_cpumask);
3946 /* Now install the new shadows. */
3947 for ( i = 0; i < 4; i++ )
3949 if ( guest_l3e_get_flags(gl3e[i]) & _PAGE_PRESENT )
3951 gl2gfn = guest_l3e_get_gfn(gl3e[i]);
3952 gl2mfn = gfn_to_mfn(d, gl2gfn, &p2mt);
3953 if ( p2m_is_ram(p2mt) )
3954 sh_set_toplevel_shadow(v, i, gl2mfn, (i == 3)
3955 ? SH_type_l2h_shadow
3956 : SH_type_l2_shadow);
3957 else
3958 sh_set_toplevel_shadow(v, i, _mfn(INVALID_MFN), 0);
3960 else
3961 sh_set_toplevel_shadow(v, i, _mfn(INVALID_MFN), 0);
3964 #elif GUEST_PAGING_LEVELS == 4
3965 if ( sh_remove_write_access(v, gmfn, 4, 0) != 0 )
3966 flush_tlb_mask(v->domain->domain_dirty_cpumask);
3967 sh_set_toplevel_shadow(v, 0, gmfn, SH_type_l4_shadow);
3968 #else
3969 #error This should never happen
3970 #endif
3972 #if (CONFIG_PAGING_LEVELS == 3) && (GUEST_PAGING_LEVELS == 3)
3973 #endif
3975 ///
3976 /// v->arch.paging.shadow.l3table
3977 ///
3978 #if SHADOW_PAGING_LEVELS == 3
3980 mfn_t smfn;
3981 int i;
3982 for ( i = 0; i < 4; i++ )
3984 #if GUEST_PAGING_LEVELS == 2
3985 /* 2-on-3: make a PAE l3 that points at the four-page l2 */
3986 smfn = _mfn(pagetable_get_pfn(v->arch.shadow_table[0]) + i);
3987 #else
3988 /* 3-on-3: make a PAE l3 that points at the four l2 pages */
3989 smfn = pagetable_get_mfn(v->arch.shadow_table[i]);
3990 #endif
3991 v->arch.paging.shadow.l3table[i] =
3992 (mfn_x(smfn) == 0)
3993 ? shadow_l3e_empty()
3994 : shadow_l3e_from_mfn(smfn, _PAGE_PRESENT);
3997 #endif /* SHADOW_PAGING_LEVELS == 3 */
4000 ///
4001 /// v->arch.cr3
4002 ///
4003 if ( shadow_mode_external(d) )
4005 make_cr3(v, pagetable_get_pfn(v->arch.monitor_table));
4007 else // not shadow_mode_external...
4009 /* We don't support PV except guest == shadow == config levels */
4010 BUG_ON(GUEST_PAGING_LEVELS != SHADOW_PAGING_LEVELS);
4011 #if SHADOW_PAGING_LEVELS == 3
4012 /* 2-on-3 or 3-on-3: Use the PAE shadow l3 table we just fabricated.
4013 * Don't use make_cr3 because (a) we know it's below 4GB, and
4014 * (b) it's not necessarily page-aligned, and make_cr3 takes a pfn */
4015 ASSERT(virt_to_maddr(&v->arch.paging.shadow.l3table) <= 0xffffffe0ULL);
4016 v->arch.cr3 = virt_to_maddr(&v->arch.paging.shadow.l3table);
4017 #else
4018 /* 2-on-2 or 4-on-4: Just use the shadow top-level directly */
4019 make_cr3(v, pagetable_get_pfn(v->arch.shadow_table[0]));
4020 #endif
4024 ///
4025 /// v->arch.hvm_vcpu.hw_cr[3]
4026 ///
4027 if ( shadow_mode_external(d) )
4029 ASSERT(is_hvm_domain(d));
4030 #if SHADOW_PAGING_LEVELS == 3
4031 /* 2-on-3 or 3-on-3: Use the PAE shadow l3 table we just fabricated */
4032 v->arch.hvm_vcpu.hw_cr[3] =
4033 virt_to_maddr(&v->arch.paging.shadow.l3table);
4034 #else
4035 /* 2-on-2 or 4-on-4: Just use the shadow top-level directly */
4036 v->arch.hvm_vcpu.hw_cr[3] =
4037 pagetable_get_paddr(v->arch.shadow_table[0]);
4038 #endif
4039 hvm_update_guest_cr(v, 3);
4042 /* Fix up the linear pagetable mappings */
4043 sh_update_linear_entries(v);
4045 #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
4046 /* No longer safe to use cached gva->gfn translations */
4047 vtlb_flush(v);
4048 #endif
4050 #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
4051 v->arch.paging.last_write_emul_ok = 0;
4052 #endif
4054 /* Release the lock, if we took it (otherwise it's the caller's problem) */
4055 if ( do_locking ) shadow_unlock(v->domain);
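
[Editor's note] One step of sh_update_cr3 worth pulling out is the fabrication of the PAE shadow l3 table: each of its four entries points at a shadow l2 page if one exists for that slot and is empty otherwise. Below is a minimal model of that step under simplified assumptions (frame numbers as plain integers, a single present bit); the names are hypothetical.

/* Illustrative sketch, not part of multi.c: build a 4-entry PAE-style l3
 * table from four level-2 frame numbers, 0 meaning "no shadow here". */
#include <stdint.h>

#define TOY_PRESENT    0x1ULL
#define TOY_PAGE_SHIFT 12

static void toy_build_pae_l3(uint64_t l3[4], const unsigned long l2_frames[4])
{
    for ( int i = 0; i < 4; i++ )
        l3[i] = (l2_frames[i] != 0)
            ? (((uint64_t)l2_frames[i] << TOY_PAGE_SHIFT) | TOY_PRESENT)
            : 0;
}
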
4059 /**************************************************************************/
4060 /* Functions to revoke guest rights */
4062 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
4063 static int sh_guess_wrmap(struct vcpu *v, unsigned long vaddr, mfn_t gmfn)
4064 /* Look up this vaddr in the current shadow and see if it's a writeable
4065 * mapping of this gmfn. If so, remove it. Returns 1 if it worked. */
4067 shadow_l1e_t sl1e, *sl1p;
4068 shadow_l2e_t *sl2p;
4069 #if SHADOW_PAGING_LEVELS >= 3
4070 shadow_l3e_t *sl3p;
4071 #if SHADOW_PAGING_LEVELS >= 4
4072 shadow_l4e_t *sl4p;
4073 #endif
4074 #endif
4075 mfn_t sl1mfn;
4076 int r;
4078 /* Carefully look in the shadow linear map for the l1e we expect */
4079 #if SHADOW_PAGING_LEVELS >= 4
4080 sl4p = sh_linear_l4_table(v) + shadow_l4_linear_offset(vaddr);
4081 if ( !(shadow_l4e_get_flags(*sl4p) & _PAGE_PRESENT) )
4082 return 0;
4083 sl3p = sh_linear_l3_table(v) + shadow_l3_linear_offset(vaddr);
4084 if ( !(shadow_l3e_get_flags(*sl3p) & _PAGE_PRESENT) )
4085 return 0;
4086 #elif SHADOW_PAGING_LEVELS == 3
4087 sl3p = ((shadow_l3e_t *) v->arch.paging.shadow.l3table)
4088 + shadow_l3_linear_offset(vaddr);
4089 if ( !(shadow_l3e_get_flags(*sl3p) & _PAGE_PRESENT) )
4090 return 0;
4091 #endif
4092 sl2p = sh_linear_l2_table(v) + shadow_l2_linear_offset(vaddr);
4093 if ( !(shadow_l2e_get_flags(*sl2p) & _PAGE_PRESENT) )
4094 return 0;
4095 sl1p = sh_linear_l1_table(v) + shadow_l1_linear_offset(vaddr);
4096 sl1e = *sl1p;
4097 if ( ((shadow_l1e_get_flags(sl1e) & (_PAGE_PRESENT|_PAGE_RW))
4098 != (_PAGE_PRESENT|_PAGE_RW))
4099 || (mfn_x(shadow_l1e_get_mfn(sl1e)) != mfn_x(gmfn)) )
4100 return 0;
4102 /* Found it! Need to remove its write permissions. */
4103 sl1mfn = shadow_l2e_get_mfn(*sl2p);
4104 sl1e = shadow_l1e_remove_flags(sl1e, _PAGE_RW);
4105 r = shadow_set_l1e(v, sl1p, sl1e, sl1mfn);
4106 ASSERT( !(r & SHADOW_SET_ERROR) );
4107 return 1;
4109 #endif
4111 int sh_rm_write_access_from_l1(struct vcpu *v, mfn_t sl1mfn,
4112 mfn_t readonly_mfn)
4113 /* Excises all writeable mappings to readonly_mfn from this l1 shadow table */
4115 shadow_l1e_t *sl1e;
4116 int done = 0;
4117 int flags;
4118 mfn_t base_sl1mfn = sl1mfn; /* Because sl1mfn changes in the foreach */
4120 SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, done,
4122 flags = shadow_l1e_get_flags(*sl1e);
4123 if ( (flags & _PAGE_PRESENT)
4124 && (flags & _PAGE_RW)
4125 && (mfn_x(shadow_l1e_get_mfn(*sl1e)) == mfn_x(readonly_mfn)) )
4127 shadow_l1e_t ro_sl1e = shadow_l1e_remove_flags(*sl1e, _PAGE_RW);
4128 (void) shadow_set_l1e(v, sl1e, ro_sl1e, sl1mfn);
4129 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
4130 /* Remember the last shadow that we shot a writeable mapping in */
4131 v->arch.paging.shadow.last_writeable_pte_smfn = mfn_x(base_sl1mfn);
4132 #endif
4133 if ( (mfn_to_page(readonly_mfn)->u.inuse.type_info
4134 & PGT_count_mask) == 0 )
4135 /* This breaks us cleanly out of the FOREACH macro */
4136 done = 1;
4138 });
4139 return done;
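
[Editor's note] sh_rm_write_access_from_l1 above sweeps a page of l1 shadow entries, downgrades any present+writable entry that maps the target frame to read-only, and stops early once no writable mappings remain. The standalone sketch below mirrors that pattern with a hypothetical entry layout (present and RW as low bits, frame number above a 12-bit shift) and a caller-maintained count standing in for the type count check.

/* Illustrative sketch, not part of multi.c: remove write access to one
 * frame from a page of l1-style entries, with early exit. */
#include <stdint.h>

#define TOY_PRESENT    0x1ULL
#define TOY_RW         0x2ULL
#define TOY_PAGE_SHIFT 12
#define TOY_L1_ENTRIES 512

static int toy_rm_write_access(uint64_t l1[TOY_L1_ENTRIES],
                               unsigned long target_frame,
                               int *writable_count)
{
    for ( int i = 0; i < TOY_L1_ENTRIES; i++ )
    {
        uint64_t e = l1[i];
        if ( (e & TOY_PRESENT) && (e & TOY_RW)
             && ((e >> TOY_PAGE_SHIFT) == target_frame) )
        {
            l1[i] = e & ~TOY_RW;          /* downgrade to read-only */
            if ( --(*writable_count) == 0 )
                return 1;                 /* nothing writable left: done */
        }
    }
    return 0;
}
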
4143 int sh_rm_mappings_from_l1(struct vcpu *v, mfn_t sl1mfn, mfn_t target_mfn)
4144 /* Excises all mappings to guest frame from this shadow l1 table */
4146 shadow_l1e_t *sl1e;
4147 int done = 0;
4148 int flags;
4150 SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, done,
4152 flags = shadow_l1e_get_flags(*sl1e);
4153 if ( (flags & _PAGE_PRESENT)
4154 && (mfn_x(shadow_l1e_get_mfn(*sl1e)) == mfn_x(target_mfn)) )
4156 (void) shadow_set_l1e(v, sl1e, shadow_l1e_empty(), sl1mfn);
4157 if ( (mfn_to_page(target_mfn)->count_info & PGC_count_mask) == 0 )
4158 /* This breaks us cleanly out of the FOREACH macro */
4159 done = 1;
4161 });
4162 return done;
4165 /**************************************************************************/
4166 /* Functions to excise all pointers to shadows from higher-level shadows. */
4168 void sh_clear_shadow_entry(struct vcpu *v, void *ep, mfn_t smfn)
4169 /* Blank out a single shadow entry */
4171 switch ( mfn_to_shadow_page(smfn)->type )
4173 case SH_type_l1_shadow:
4174 (void) shadow_set_l1e(v, ep, shadow_l1e_empty(), smfn); break;
4175 case SH_type_l2_shadow:
4176 #if GUEST_PAGING_LEVELS >= 3
4177 case SH_type_l2h_shadow:
4178 #endif
4179 (void) shadow_set_l2e(v, ep, shadow_l2e_empty(), smfn); break;
4180 #if GUEST_PAGING_LEVELS >= 4
4181 case SH_type_l3_shadow:
4182 (void) shadow_set_l3e(v, ep, shadow_l3e_empty(), smfn); break;
4183 case SH_type_l4_shadow:
4184 (void) shadow_set_l4e(v, ep, shadow_l4e_empty(), smfn); break;
4185 #endif
4186 default: BUG(); /* Called with the wrong kind of shadow. */
4190 int sh_remove_l1_shadow(struct vcpu *v, mfn_t sl2mfn, mfn_t sl1mfn)
4191 /* Remove all mappings of this l1 shadow from this l2 shadow */
4193 shadow_l2e_t *sl2e;
4194 int done = 0;
4195 int flags;
4197 SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, done, v->domain,
4199 flags = shadow_l2e_get_flags(*sl2e);
4200 if ( (flags & _PAGE_PRESENT)
4201 && (mfn_x(shadow_l2e_get_mfn(*sl2e)) == mfn_x(sl1mfn)) )
4203 (void) shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn);
4204 if ( mfn_to_shadow_page(sl1mfn)->type == 0 )
4205 /* This breaks us cleanly out of the FOREACH macro */
4206 done = 1;
4208 });
4209 return done;
4212 #if GUEST_PAGING_LEVELS >= 4
4213 int sh_remove_l2_shadow(struct vcpu *v, mfn_t sl3mfn, mfn_t sl2mfn)
4214 /* Remove all mappings of this l2 shadow from this l3 shadow */
4216 shadow_l3e_t *sl3e;
4217 int done = 0;
4218 int flags;
4220 SHADOW_FOREACH_L3E(sl3mfn, sl3e, 0, done,
4222 flags = shadow_l3e_get_flags(*sl3e);
4223 if ( (flags & _PAGE_PRESENT)
4224 && (mfn_x(shadow_l3e_get_mfn(*sl3e)) == mfn_x(sl2mfn)) )
4226 (void) shadow_set_l3e(v, sl3e, shadow_l3e_empty(), sl3mfn);
4227 if ( mfn_to_shadow_page(sl2mfn)->type == 0 )
4228 /* This breaks us cleanly out of the FOREACH macro */
4229 done = 1;
4231 });
4232 return done;
4235 int sh_remove_l3_shadow(struct vcpu *v, mfn_t sl4mfn, mfn_t sl3mfn)
4236 /* Remove all mappings of this l3 shadow from this l4 shadow */
4238 shadow_l4e_t *sl4e;
4239 int done = 0;
4240 int flags;
4242 SHADOW_FOREACH_L4E(sl4mfn, sl4e, 0, done, v->domain,
4244 flags = shadow_l4e_get_flags(*sl4e);
4245 if ( (flags & _PAGE_PRESENT)
4246 && (mfn_x(shadow_l4e_get_mfn(*sl4e)) == mfn_x(sl3mfn)) )
4248 (void) shadow_set_l4e(v, sl4e, shadow_l4e_empty(), sl4mfn);
4249 if ( mfn_to_shadow_page(sl3mfn)->type == 0 )
4250 /* This breaks us cleanly out of the FOREACH macro */
4251 done = 1;
4253 });
4254 return done;
4256 #endif /* 64bit guest */
4258 /**************************************************************************/
4259 /* Handling HVM guest writes to pagetables */
4261 /* Translate a VA to an MFN, injecting a page-fault if we fail */
4262 #define BAD_GVA_TO_GFN (~0UL)
4263 #define BAD_GFN_TO_MFN (~1UL)
4264 static mfn_t emulate_gva_to_mfn(struct vcpu *v,
4265 unsigned long vaddr,
4266 struct sh_emulate_ctxt *sh_ctxt)
4268 unsigned long gfn;
4269 mfn_t mfn;
4270 p2m_type_t p2mt;
4271 uint32_t pfec = PFEC_page_present | PFEC_write_access;
4273 /* Translate the VA to a GFN */
4274 gfn = sh_gva_to_gfn(v, vaddr, &pfec);
4275 if ( gfn == INVALID_GFN )
4277 if ( is_hvm_vcpu(v) )
4278 hvm_inject_exception(TRAP_page_fault, pfec, vaddr);
4279 else
4280 propagate_page_fault(vaddr, pfec);
4281 return _mfn(BAD_GVA_TO_GFN);
4284 /* Translate the GFN to an MFN */
4285 mfn = gfn_to_mfn(v->domain, _gfn(gfn), &p2mt);
4286 if ( p2m_is_ram(p2mt) )
4288 ASSERT(mfn_valid(mfn));
4289 v->arch.paging.last_write_was_pt = !!sh_mfn_is_a_page_table(mfn);
4290 return mfn;
4293 return _mfn(BAD_GFN_TO_MFN);
4296 /* Check that the user is allowed to perform this write.
4297 * Returns a mapped pointer to write to, or NULL for error. */
4298 #define MAPPING_UNHANDLEABLE ((void *)0)
4299 #define MAPPING_EXCEPTION ((void *)1)
4300 #define emulate_map_dest_failed(rc) ((unsigned long)(rc) <= 1)
4301 static void *emulate_map_dest(struct vcpu *v,
4302 unsigned long vaddr,
4303 u32 bytes,
4304 struct sh_emulate_ctxt *sh_ctxt)
4306 struct segment_register *sreg;
4307 unsigned long offset;
4308 void *map = NULL;
4310 /* We don't emulate user-mode writes to page tables */
4311 sreg = hvm_get_seg_reg(x86_seg_ss, sh_ctxt);
4312 if ( sreg->attr.fields.dpl == 3 )
4313 return MAPPING_UNHANDLEABLE;
4315 sh_ctxt->mfn1 = emulate_gva_to_mfn(v, vaddr, sh_ctxt);
4316 if ( !mfn_valid(sh_ctxt->mfn1) )
4317 return ((mfn_x(sh_ctxt->mfn1) == BAD_GVA_TO_GFN) ?
4318 MAPPING_EXCEPTION : MAPPING_UNHANDLEABLE);
4320 /* Unaligned writes mean probably this isn't a pagetable */
4321 if ( vaddr & (bytes - 1) )
4322 sh_remove_shadows(v, sh_ctxt->mfn1, 0, 0 /* Slow, can fail */ );
4324 if ( likely(((vaddr + bytes - 1) & PAGE_MASK) == (vaddr & PAGE_MASK)) )
4326 /* Whole write fits on a single page */
4327 sh_ctxt->mfn2 = _mfn(INVALID_MFN);
4328 map = sh_map_domain_page(sh_ctxt->mfn1) + (vaddr & ~PAGE_MASK);
4330 else
4332 /* Cross-page emulated writes are only supported for HVM guests;
4333 * PV guests ought to know better */
4334 if ( !is_hvm_vcpu(v) )
4335 return MAPPING_UNHANDLEABLE;
4337 /* This write crosses a page boundary. Translate the second page */
4338 sh_ctxt->mfn2 = emulate_gva_to_mfn(v, (vaddr + bytes - 1) & PAGE_MASK,
4339 sh_ctxt);
4340 if ( !mfn_valid(sh_ctxt->mfn2) )
4341 return ((mfn_x(sh_ctxt->mfn2) == BAD_GVA_TO_GFN) ?
4342 MAPPING_EXCEPTION : MAPPING_UNHANDLEABLE);
4344 /* Cross-page writes mean probably not a pagetable */
4345 sh_remove_shadows(v, sh_ctxt->mfn2, 0, 0 /* Slow, can fail */ );
4347 /* Hack: we map the pages into the vcpu's LDT space, since we
4348 * know that we're not going to need the LDT for HVM guests,
4349 * and only HVM guests are allowed unaligned writes. */
4350 ASSERT(is_hvm_vcpu(v));
4351 map = (void *)LDT_VIRT_START(v);
4352 offset = l1_linear_offset((unsigned long) map);
4353 l1e_write(&__linear_l1_table[offset],
4354 l1e_from_pfn(mfn_x(sh_ctxt->mfn1), __PAGE_HYPERVISOR));
4355 l1e_write(&__linear_l1_table[offset + 1],
4356 l1e_from_pfn(mfn_x(sh_ctxt->mfn2), __PAGE_HYPERVISOR));
4357 flush_tlb_local();
4358 map += (vaddr & ~PAGE_MASK);
4361 #if (SHADOW_OPTIMIZATIONS & SHOPT_SKIP_VERIFY)
4362 /* Remember if the bottom bit was clear, so we can choose not to run
4363 * the change through the verify code if it's still clear afterwards */
4364 sh_ctxt->low_bit_was_clear = map != NULL && !(*(u8 *)map & _PAGE_PRESENT);
4365 #endif
4367 return map;
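
[Editor's note] emulate_map_dest distinguishes its two failure modes with sentinel pointer values: real results are mapped addresses well above 1, so (void *)0 and (void *)1 can be reserved as "unhandleable" and "exception injected", and emulate_map_dest_failed can tell success from failure with a single comparison. The self-contained sketch below shows just that encoding; the names are hypothetical.

/* Illustrative sketch, not part of multi.c: sentinel-pointer error codes. */
#include <stdint.h>
#include <stdlib.h>

#define TOY_UNHANDLEABLE ((void *)0)
#define TOY_EXCEPTION    ((void *)1)
#define toy_failed(p)    ((uintptr_t)(p) <= 1)

static void *toy_map(int translatable, int faulted)
{
    if ( faulted )
        return TOY_EXCEPTION;        /* caller should report a fault */
    if ( !translatable )
        return TOY_UNHANDLEABLE;     /* caller should give up on emulation */
    return malloc(64);               /* any real mapping is a "large" pointer */
}

/* Usage: void *p = toy_map(1, 0); if ( toy_failed(p) ) ...handle error... */
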
4370 /* Tidy up after the emulated write: mark pages dirty, verify the new
4371 * contents, and undo the mapping */
4372 static void emulate_unmap_dest(struct vcpu *v,
4373 void *addr,
4374 u32 bytes,
4375 struct sh_emulate_ctxt *sh_ctxt)
4377 u32 b1 = bytes, b2 = 0, shflags;
4379 ASSERT(mfn_valid(sh_ctxt->mfn1));
4381 /* If we are writing lots of PTE-aligned zeros, might want to unshadow */
4382 if ( likely(bytes >= 4)
4383 && (*(u32 *)addr == 0)
4384 && ((unsigned long) addr & ((sizeof (guest_intpte_t)) - 1)) == 0 )
4385 check_for_early_unshadow(v, sh_ctxt->mfn1);
4386 else
4387 reset_early_unshadow(v);
4389 /* We can avoid re-verifying the page contents after the write if:
4390 * - it was no larger than the PTE type of this pagetable;
4391 * - it was aligned to the PTE boundaries; and
4392 * - _PAGE_PRESENT was clear before and after the write. */
4393 shflags = mfn_to_page(sh_ctxt->mfn1)->shadow_flags;
4394 #if (SHADOW_OPTIMIZATIONS & SHOPT_SKIP_VERIFY)
4395 if ( sh_ctxt->low_bit_was_clear
4396 && !(*(u8 *)addr & _PAGE_PRESENT)
4397 && ((!(shflags & SHF_32)
4398 /* Not shadowed 32-bit: aligned 64-bit writes that leave
4399 * the present bit unset are safe to ignore. */
4400 && ((unsigned long)addr & 7) == 0
4401 && bytes <= 8)
4402 ||
4403 (!(shflags & (SHF_PAE|SHF_64))
4404 /* Not shadowed PAE/64-bit: aligned 32-bit writes that
4405 * leave the present bit unset are safe to ignore. */
4406 && ((unsigned long)addr & 3) == 0
4407 && bytes <= 4)) )
4409 /* Writes with this alignment constraint can't possibly cross pages */
4410 ASSERT(!mfn_valid(sh_ctxt->mfn2));
4412 else
4413 #endif /* SHADOW_OPTIMIZATIONS & SHOPT_SKIP_VERIFY */
4415 if ( unlikely(mfn_valid(sh_ctxt->mfn2)) )
4417 /* Validate as two writes, one to each page */
4418 b1 = PAGE_SIZE - (((unsigned long)addr) & ~PAGE_MASK);
4419 b2 = bytes - b1;
4420 ASSERT(b2 < bytes);
4422 if ( likely(b1 > 0) )
4423 sh_validate_guest_pt_write(v, sh_ctxt->mfn1, addr, b1);
4424 if ( unlikely(b2 > 0) )
4425 sh_validate_guest_pt_write(v, sh_ctxt->mfn2, addr + b1, b2);
4428 paging_mark_dirty(v->domain, mfn_x(sh_ctxt->mfn1));
4430 if ( unlikely(mfn_valid(sh_ctxt->mfn2)) )
4432 unsigned long offset;
4433 paging_mark_dirty(v->domain, mfn_x(sh_ctxt->mfn2));
4434 /* Undo the hacky two-frame contiguous map. */
4435 ASSERT(((unsigned long) addr & PAGE_MASK) == LDT_VIRT_START(v));
4436 offset = l1_linear_offset((unsigned long) addr);
4437 l1e_write(&__linear_l1_table[offset], l1e_empty());
4438 l1e_write(&__linear_l1_table[offset + 1], l1e_empty());
4439 flush_tlb_all();
4441 else
4442 sh_unmap_domain_page(addr);
4444 atomic_inc(&v->domain->arch.paging.shadow.gtable_dirty_version);
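
[Editor's note] When a write crosses a page boundary, emulate_unmap_dest validates it as two writes: b1 bytes on the first page (the remainder of that page) and b2 on the second. The sketch below isolates that arithmetic with hypothetical names and a fixed 4 KiB page size.

/* Illustrative sketch, not part of multi.c: split a possibly page-crossing
 * write of `bytes` at `vaddr` into per-page chunks b1 and b2. */
#include <stdint.h>

#define TOY_PAGE_SIZE 4096UL
#define TOY_PAGE_MASK (~(TOY_PAGE_SIZE - 1))

static void toy_split_write(unsigned long vaddr, unsigned long bytes,
                            unsigned long *b1, unsigned long *b2)
{
    if ( ((vaddr + bytes - 1) & TOY_PAGE_MASK) == (vaddr & TOY_PAGE_MASK) )
    {
        *b1 = bytes;                                     /* fits on one page */
        *b2 = 0;
    }
    else
    {
        *b1 = TOY_PAGE_SIZE - (vaddr & ~TOY_PAGE_MASK);  /* rest of first page */
        *b2 = bytes - *b1;                               /* spill onto second */
    }
}
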
4447 static int
4448 sh_x86_emulate_write(struct vcpu *v, unsigned long vaddr, void *src,
4449 u32 bytes, struct sh_emulate_ctxt *sh_ctxt)
4451 void *addr;
4453 /* Unaligned writes are only acceptable on HVM */
4454 if ( (vaddr & (bytes - 1)) && !is_hvm_vcpu(v) )
4455 return X86EMUL_UNHANDLEABLE;
4457 addr = emulate_map_dest(v, vaddr, bytes, sh_ctxt);
4458 if ( emulate_map_dest_failed(addr) )
4459 return ((addr == MAPPING_EXCEPTION) ?
4460 X86EMUL_EXCEPTION : X86EMUL_UNHANDLEABLE);
4462 shadow_lock(v->domain);
4463 memcpy(addr, src, bytes);
4465 emulate_unmap_dest(v, addr, bytes, sh_ctxt);
4466 shadow_audit_tables(v);
4467 shadow_unlock(v->domain);
4468 return X86EMUL_OKAY;
4471 static int
4472 sh_x86_emulate_cmpxchg(struct vcpu *v, unsigned long vaddr,
4473 unsigned long old, unsigned long new,
4474 unsigned int bytes, struct sh_emulate_ctxt *sh_ctxt)
4476 void *addr;
4477 unsigned long prev;
4478 int rv = X86EMUL_OKAY;
4480 /* Unaligned writes are only acceptable on HVM */
4481 if ( (vaddr & (bytes - 1)) && !is_hvm_vcpu(v) )
4482 return X86EMUL_UNHANDLEABLE;
4484 addr = emulate_map_dest(v, vaddr, bytes, sh_ctxt);
4485 if ( emulate_map_dest_failed(addr) )
4486 return ((addr == MAPPING_EXCEPTION) ?
4487 X86EMUL_EXCEPTION : X86EMUL_UNHANDLEABLE);
4489 shadow_lock(v->domain);
4490 switch ( bytes )
4492 case 1: prev = cmpxchg(((u8 *)addr), old, new); break;
4493 case 2: prev = cmpxchg(((u16 *)addr), old, new); break;
4494 case 4: prev = cmpxchg(((u32 *)addr), old, new); break;
4495 case 8: prev = cmpxchg(((u64 *)addr), old, new); break;
4496 default:
4497 SHADOW_PRINTK("cmpxchg of size %i is not supported\n", bytes);
4498 prev = ~old;
4501 if ( prev != old )
4502 rv = X86EMUL_CMPXCHG_FAILED;
4504 SHADOW_DEBUG(EMULATE, "va %#lx was %#lx expected %#lx"
4505 " wanted %#lx now %#lx bytes %u\n",
4506 vaddr, prev, old, new, *(unsigned long *)addr, bytes);
4508 emulate_unmap_dest(v, addr, bytes, sh_ctxt);
4509 shadow_audit_tables(v);
4510 shadow_unlock(v->domain);
4511 return rv;
4514 #ifdef __i386__
4515 static int
4516 sh_x86_emulate_cmpxchg8b(struct vcpu *v, unsigned long vaddr,
4517 unsigned long old_lo, unsigned long old_hi,
4518 unsigned long new_lo, unsigned long new_hi,
4519 struct sh_emulate_ctxt *sh_ctxt)
4521 void *addr;
4522 u64 old, new, prev;
4523 int rv = X86EMUL_OKAY;
4525 /* Unaligned writes are only acceptable on HVM */
4526 if ( (vaddr & 7) && !is_hvm_vcpu(v) )
4527 return X86EMUL_UNHANDLEABLE;
4529 addr = emulate_map_dest(v, vaddr, 8, sh_ctxt);
4530 if ( emulate_map_dest_failed(addr) )
4531 return ((addr == MAPPING_EXCEPTION) ?
4532 X86EMUL_EXCEPTION : X86EMUL_UNHANDLEABLE);
4534 old = (((u64) old_hi) << 32) | (u64) old_lo;
4535 new = (((u64) new_hi) << 32) | (u64) new_lo;
4537 shadow_lock(v->domain);
4538 prev = cmpxchg(((u64 *)addr), old, new);
4540 if ( prev != old )
4541 rv = X86EMUL_CMPXCHG_FAILED;
4543 emulate_unmap_dest(v, addr, 8, sh_ctxt);
4544 shadow_audit_tables(v);
4545 shadow_unlock(v->domain);
4546 return rv;
4548 #endif
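
[Editor's note] The emulated cmpxchg handlers above share one shape: dispatch on operand size, do an atomic compare-and-swap, and report failure when the value found was not the expected one. A minimal standalone sketch of that shape follows, using the GCC/Clang __sync builtin and assuming a 64-bit build where unsigned long is 64 bits; the names and return codes are hypothetical, and callers are expected to pass old/new values that fit in `bytes`.

/* Illustrative sketch, not part of multi.c: size-dispatched compare-and-swap. */
#include <stdint.h>

enum toy_rc { TOY_OKAY, TOY_CMPXCHG_FAILED, TOY_UNHANDLEABLE };

static enum toy_rc toy_emulate_cmpxchg(void *addr, unsigned long old,
                                       unsigned long new, unsigned int bytes)
{
    unsigned long prev;

    switch ( bytes )
    {
    case 1: prev = __sync_val_compare_and_swap((uint8_t  *)addr, (uint8_t)old,  (uint8_t)new);  break;
    case 2: prev = __sync_val_compare_and_swap((uint16_t *)addr, (uint16_t)old, (uint16_t)new); break;
    case 4: prev = __sync_val_compare_and_swap((uint32_t *)addr, (uint32_t)old, (uint32_t)new); break;
    case 8: prev = __sync_val_compare_and_swap((uint64_t *)addr, (uint64_t)old, (uint64_t)new); break;
    default: return TOY_UNHANDLEABLE;
    }

    return (prev == old) ? TOY_OKAY : TOY_CMPXCHG_FAILED;
}
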
4550 /**************************************************************************/
4551 /* Audit tools */
4553 #if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES
4555 #define AUDIT_FAIL(_level, _fmt, _a...) do { \
4556 printk("Shadow %u-on-%u audit failed at level %i, index %i\n" \
4557 "gl" #_level "mfn = %" PRI_mfn \
4558 " sl" #_level "mfn = %" PRI_mfn \
4559 " &gl" #_level "e = %p &sl" #_level "e = %p" \
4560 " gl" #_level "e = %" SH_PRI_gpte \
4561 " sl" #_level "e = %" SH_PRI_pte "\nError: " _fmt "\n", \
4562 GUEST_PAGING_LEVELS, SHADOW_PAGING_LEVELS, \
4563 _level, guest_index(gl ## _level ## e), \
4564 mfn_x(gl ## _level ## mfn), mfn_x(sl ## _level ## mfn), \
4565 gl ## _level ## e, sl ## _level ## e, \
4566 gl ## _level ## e->l ## _level, sl ## _level ## e->l ## _level, \
4567 ##_a); \
4568 BUG(); \
4569 done = 1; \
4570 } while (0)
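
[Editor's note] AUDIT_FAIL is wrapped in do { ... } while (0) so that a multi-statement macro expands to exactly one statement and composes correctly with un-braced if/else. A small illustration of that idiom, with hypothetical names:

/* Illustrative sketch, not part of multi.c: the do { } while (0) idiom. */
#include <stdio.h>

static int failures;

#define TOY_FAIL(msg) do {                    \
        printf("audit failed: %s\n", (msg));  \
        failures++;                           \
    } while (0)

static void toy_check(int ok)
{
    if ( !ok )
        TOY_FAIL("bad entry");   /* expands to exactly one statement... */
    else                         /* ...so this else still binds to the if */
        printf("entry ok\n");
}
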
4573 static char * sh_audit_flags(struct vcpu *v, int level,
4574 int gflags, int sflags)
4575 /* Common code for auditing flag bits */
4577 if ( (sflags & _PAGE_PRESENT) && !(gflags & _PAGE_PRESENT) )
4578 return "shadow is present but guest is not present";
4579 if ( (sflags & _PAGE_GLOBAL) && !is_hvm_vcpu(v) )
4580 return "global bit set in PV shadow";
4581 if ( level == 2 && (sflags & _PAGE_PSE) )
4582 return "PS bit set in shadow";
4583 #if SHADOW_PAGING_LEVELS == 3
4584 if ( level == 3 ) return NULL; /* All the other bits are blank in PAEl3 */
4585 #endif
4586 if ( (sflags & _PAGE_PRESENT) && !(gflags & _PAGE_ACCESSED) )
4587 return "accessed bit not propagated";
4588 if ( (level == 1 || (level == 2 && (gflags & _PAGE_PSE)))
4589 && ((sflags & _PAGE_RW) && !(gflags & _PAGE_DIRTY)) )
4590 return "dirty bit not propagated";
4591 if ( (sflags & _PAGE_USER) != (gflags & _PAGE_USER) )
4592 return "user/supervisor bit does not match";
4593 if ( (sflags & _PAGE_NX_BIT) != (gflags & _PAGE_NX_BIT) )
4594 return "NX bit does not match";
4595 if ( (sflags & _PAGE_RW) && !(gflags & _PAGE_RW) )
4596 return "shadow grants write access but guest does not";
4597 return NULL;
4600 int sh_audit_l1_table(struct vcpu *v, mfn_t sl1mfn, mfn_t x)
4602 guest_l1e_t *gl1e, *gp;
4603 shadow_l1e_t *sl1e;
4604 mfn_t mfn, gmfn, gl1mfn;
4605 gfn_t gfn;
4606 p2m_type_t p2mt;
4607 char *s;
4608 int done = 0;
4610 /* Follow the backpointer */
4611 gl1mfn = _mfn(mfn_to_shadow_page(sl1mfn)->backpointer);
4612 gl1e = gp = sh_map_domain_page(gl1mfn);
4613 SHADOW_FOREACH_L1E(sl1mfn, sl1e, &gl1e, done, {
4615 if ( sh_l1e_is_magic(*sl1e) )
4617 #if (SHADOW_OPTIMIZATIONS & SHOPT_FAST_FAULT_PATH) && SHADOW_PAGING_LEVELS > 2
4618 if ( sh_l1e_is_gnp(*sl1e) )
4620 if ( guest_l1e_get_flags(*gl1e) & _PAGE_PRESENT )
4621 AUDIT_FAIL(1, "shadow is GNP magic but guest is present");
4623 else
4625 ASSERT(sh_l1e_is_mmio(*sl1e));
4626 gfn = sh_l1e_mmio_get_gfn(*sl1e);
4627 if ( gfn_x(gfn) != gfn_x(guest_l1e_get_gfn(*gl1e)) )
4628 AUDIT_FAIL(1, "shadow MMIO gfn is %" SH_PRI_gfn
4629 " but guest gfn is %" SH_PRI_gfn,
4630 gfn_x(gfn),
4631 gfn_x(guest_l1e_get_gfn(*gl1e)));
4633 #endif
4635 else
4637 s = sh_audit_flags(v, 1, guest_l1e_get_flags(*gl1e),
4638 shadow_l1e_get_flags(*sl1e));
4639 if ( s ) AUDIT_FAIL(1, "%s", s);
4641 if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS )
4643 gfn = guest_l1e_get_gfn(*gl1e);
4644 mfn = shadow_l1e_get_mfn(*sl1e);
4645 gmfn = gfn_to_mfn(v->domain, gfn, &p2mt);
4646 if ( mfn_x(gmfn) != mfn_x(mfn) )
4647 AUDIT_FAIL(1, "bad translation: gfn %" SH_PRI_gfn
4648 " --> %" PRI_mfn " != mfn %" PRI_mfn,
4649 gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn));
4652 });
4653 sh_unmap_domain_page(gp);
4654 return done;
4657 int sh_audit_fl1_table(struct vcpu *v, mfn_t sl1mfn, mfn_t x)
4659 guest_l1e_t *gl1e, e;
4660 shadow_l1e_t *sl1e;
4661 mfn_t gl1mfn = _mfn(INVALID_MFN);
4662 int f;
4663 int done = 0;
4665 /* fl1 has no useful backpointer: all we can check are flags */
4666 e = guest_l1e_from_gfn(_gfn(0), 0); gl1e = &e; /* Needed for macro */
4667 SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, done, {
4668 f = shadow_l1e_get_flags(*sl1e);
4669 f &= ~(_PAGE_AVAIL0|_PAGE_AVAIL1|_PAGE_AVAIL2);
4670 if ( !(f == 0
4671 || f == (_PAGE_PRESENT|_PAGE_USER|_PAGE_RW|
4672 _PAGE_ACCESSED|_PAGE_DIRTY)
4673 || f == (_PAGE_PRESENT|_PAGE_USER|_PAGE_ACCESSED|_PAGE_DIRTY)
4674 || sh_l1e_is_magic(*sl1e)) )
4675 AUDIT_FAIL(1, "fl1e has bad flags");
4676 });
4677 return 0;
4680 int sh_audit_l2_table(struct vcpu *v, mfn_t sl2mfn, mfn_t x)
4682 guest_l2e_t *gl2e, *gp;
4683 shadow_l2e_t *sl2e;
4684 mfn_t mfn, gmfn, gl2mfn;
4685 gfn_t gfn;
4686 p2m_type_t p2mt;
4687 char *s;
4688 int done = 0;
4690 /* Follow the backpointer */
4691 gl2mfn = _mfn(mfn_to_shadow_page(sl2mfn)->backpointer);
4692 gl2e = gp = sh_map_domain_page(gl2mfn);
4693 SHADOW_FOREACH_L2E(sl2mfn, sl2e, &gl2e, done, v->domain, {
4695 s = sh_audit_flags(v, 2, guest_l2e_get_flags(*gl2e),
4696 shadow_l2e_get_flags(*sl2e));
4697 if ( s ) AUDIT_FAIL(2, "%s", s);
4699 if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS )
4701 gfn = guest_l2e_get_gfn(*gl2e);
4702 mfn = shadow_l2e_get_mfn(*sl2e);
4703 gmfn = (guest_l2e_get_flags(*gl2e) & _PAGE_PSE)
4704 ? get_fl1_shadow_status(v, gfn)
4705 : get_shadow_status(v, gfn_to_mfn(v->domain, gfn, &p2mt),
4706 SH_type_l1_shadow);
4707 if ( mfn_x(gmfn) != mfn_x(mfn) )
4708 AUDIT_FAIL(2, "bad translation: gfn %" SH_PRI_gfn
4709 " (--> %" PRI_mfn ")"
4710 " --> %" PRI_mfn " != mfn %" PRI_mfn,
4711 gfn_x(gfn),
4712 (guest_l2e_get_flags(*gl2e) & _PAGE_PSE) ? 0
4713 : mfn_x(gfn_to_mfn(v->domain, gfn, &p2mt)),
4714 mfn_x(gmfn), mfn_x(mfn));
4716 });
4717 sh_unmap_domain_page(gp);
4718 return 0;
4721 #if GUEST_PAGING_LEVELS >= 4
4722 int sh_audit_l3_table(struct vcpu *v, mfn_t sl3mfn, mfn_t x)
4724 guest_l3e_t *gl3e, *gp;
4725 shadow_l3e_t *sl3e;
4726 mfn_t mfn, gmfn, gl3mfn;
4727 gfn_t gfn;
4728 p2m_type_t p2mt;
4729 char *s;
4730 int done = 0;
4732 /* Follow the backpointer */
4733 gl3mfn = _mfn(mfn_to_shadow_page(sl3mfn)->backpointer);
4734 gl3e = gp = sh_map_domain_page(gl3mfn);
4735 SHADOW_FOREACH_L3E(sl3mfn, sl3e, &gl3e, done, {
4737 s = sh_audit_flags(v, 3, guest_l3e_get_flags(*gl3e),
4738 shadow_l3e_get_flags(*sl3e));
4739 if ( s ) AUDIT_FAIL(3, "%s", s);
4741 if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS )
4743 gfn = guest_l3e_get_gfn(*gl3e);
4744 mfn = shadow_l3e_get_mfn(*sl3e);
4745 gmfn = get_shadow_status(v, gfn_to_mfn(v->domain, gfn, &p2mt),
4746 ((GUEST_PAGING_LEVELS == 3 ||
4747 is_pv_32on64_vcpu(v))
4748 && !shadow_mode_external(v->domain)
4749 && (guest_index(gl3e) % 4) == 3)
4750 ? SH_type_l2h_shadow
4751 : SH_type_l2_shadow);
4752 if ( mfn_x(gmfn) != mfn_x(mfn) )
4753 AUDIT_FAIL(3, "bad translation: gfn %" SH_PRI_gfn
4754 " --> %" PRI_mfn " != mfn %" PRI_mfn,
4755 gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn));
4757 });
4758 sh_unmap_domain_page(gp);
4759 return 0;
4762 int sh_audit_l4_table(struct vcpu *v, mfn_t sl4mfn, mfn_t x)
4764 guest_l4e_t *gl4e, *gp;
4765 shadow_l4e_t *sl4e;
4766 mfn_t mfn, gmfn, gl4mfn;
4767 gfn_t gfn;
4768 p2m_type_t p2mt;
4769 char *s;
4770 int done = 0;
4772 /* Follow the backpointer */
4773 gl4mfn = _mfn(mfn_to_shadow_page(sl4mfn)->backpointer);
4774 gl4e = gp = sh_map_domain_page(gl4mfn);
4775 SHADOW_FOREACH_L4E(sl4mfn, sl4e, &gl4e, done, v->domain,
4777 s = sh_audit_flags(v, 4, guest_l4e_get_flags(*gl4e),
4778 shadow_l4e_get_flags(*sl4e));
4779 if ( s ) AUDIT_FAIL(4, "%s", s);
4781 if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS )
4783 gfn = guest_l4e_get_gfn(*gl4e);
4784 mfn = shadow_l4e_get_mfn(*sl4e);
4785 gmfn = get_shadow_status(v, gfn_to_mfn(v->domain, gfn, &p2mt),
4786 SH_type_l3_shadow);
4787 if ( mfn_x(gmfn) != mfn_x(mfn) )
4788 AUDIT_FAIL(4, "bad translation: gfn %" SH_PRI_gfn
4789 " --> %" PRI_mfn " != mfn %" PRI_mfn,
4790 gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn));
4792 });
4793 sh_unmap_domain_page(gp);
4794 return 0;
4796 #endif /* GUEST_PAGING_LEVELS >= 4 */
4799 #undef AUDIT_FAIL
4801 #endif /* Audit code */
4803 /**************************************************************************/
4804 /* Entry points into this mode of the shadow code.
4805 * This will all be mangled by the preprocessor to uniquify everything. */
4806 struct paging_mode sh_paging_mode = {
4807 .page_fault = sh_page_fault,
4808 .invlpg = sh_invlpg,
4809 .gva_to_gfn = sh_gva_to_gfn,
4810 .update_cr3 = sh_update_cr3,
4811 .update_paging_modes = shadow_update_paging_modes,
4812 .write_p2m_entry = shadow_write_p2m_entry,
4813 .write_guest_entry = shadow_write_guest_entry,
4814 .cmpxchg_guest_entry = shadow_cmpxchg_guest_entry,
4815 .guest_map_l1e = sh_guest_map_l1e,
4816 .guest_get_eff_l1e = sh_guest_get_eff_l1e,
4817 .guest_levels = GUEST_PAGING_LEVELS,
4818 .shadow.detach_old_tables = sh_detach_old_tables,
4819 .shadow.x86_emulate_write = sh_x86_emulate_write,
4820 .shadow.x86_emulate_cmpxchg = sh_x86_emulate_cmpxchg,
4821 #ifdef __i386__
4822 .shadow.x86_emulate_cmpxchg8b = sh_x86_emulate_cmpxchg8b,
4823 #endif
4824 .shadow.make_monitor_table = sh_make_monitor_table,
4825 .shadow.destroy_monitor_table = sh_destroy_monitor_table,
4826 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
4827 .shadow.guess_wrmap = sh_guess_wrmap,
4828 #endif
4829 .shadow.shadow_levels = SHADOW_PAGING_LEVELS,
4830 };
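
[Editor's note] sh_paging_mode is a table of function pointers: this file is compiled once per guest/shadow level combination, and callers dispatch through whichever table is currently installed for the vcpu without knowing which mode is active. The sketch below shows only that general ops-table pattern with hypothetical names and dummy translations chosen purely for illustration.

/* Illustrative sketch, not part of multi.c: per-mode ops tables. */
#include <stdio.h>

struct toy_paging_ops {
    unsigned long (*gva_to_gfn)(unsigned long va);  /* per-mode callback */
    unsigned int   guest_levels;
};

/* Dummy translations, for illustration only. */
static unsigned long toy_gva_to_gfn_2(unsigned long va) { return va >> 22; }
static unsigned long toy_gva_to_gfn_4(unsigned long va) { return va >> 12; }

static const struct toy_paging_ops toy_mode_2 = { toy_gva_to_gfn_2, 2 };
static const struct toy_paging_ops toy_mode_4 = { toy_gva_to_gfn_4, 4 };

int main(void)
{
    const struct toy_paging_ops *mode = &toy_mode_4;  /* e.g. picked when CR3/CR4 change */
    printf("%u-level mode: gfn %lu\n", mode->guest_levels,
           mode->gva_to_gfn(0x12345678UL));
    (void)toy_mode_2;
    return 0;
}
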
4832 /*
4833 * Local variables:
4834 * mode: C
4835 * c-set-style: "BSD"
4836 * c-basic-offset: 4
4837 * indent-tabs-mode: nil
4838 * End:
4839 */