ia64/xen-unstable

view xen/arch/x86/mm/shadow/multi.c @ 17062:0769835cf50f

x86 shadow: Reduce scope of shadow lock.

emulate_map_dest doesn't require holding lock, since
only shadow related operation possibly involved is to
remove shadow which is less frequent and can acquire
lock inside. Rest are either guest table walk or
per-vcpu monitor table manipulation

Signed-off-by Kevin Tian <kevin.tian@intel.com>
author Keir Fraser <keir.fraser@citrix.com>
date Thu Feb 14 10:33:12 2008 +0000 (2008-02-14)
parents 9541494c0945
children 03d13b696027
line source
1 /******************************************************************************
2 * arch/x86/mm/shadow/multi.c
3 *
4 * Simple, mostly-synchronous shadow page tables.
5 * Parts of this code are Copyright (c) 2006 by XenSource Inc.
6 * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
7 * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 */
24 #include <xen/config.h>
25 #include <xen/types.h>
26 #include <xen/mm.h>
27 #include <xen/trace.h>
28 #include <xen/sched.h>
29 #include <xen/perfc.h>
30 #include <xen/domain_page.h>
31 #include <asm/page.h>
32 #include <asm/current.h>
33 #include <asm/shadow.h>
34 #include <asm/flushtlb.h>
35 #include <asm/hvm/hvm.h>
36 #include <asm/hvm/cacheattr.h>
37 #include <asm/mtrr.h>
38 #include "private.h"
39 #include "types.h"
41 /* THINGS TO DO LATER:
42 *
43 * TEARDOWN HEURISTICS
44 * Also: have a heuristic for when to destroy a previous paging-mode's
45 * shadows. When a guest is done with its start-of-day 32-bit tables
46 * and reuses the memory we want to drop those shadows. Start with
47 * shadows in a page in two modes as a hint, but beware of clever tricks
48 * like reusing a pagetable for both PAE and 64-bit during boot...
49 *
50 * PAE LINEAR MAPS
51 * Rework shadow_get_l*e() to have the option of using map_domain_page()
52 * instead of linear maps. Add appropriate unmap_l*e calls in the users.
53 * Then we can test the speed difference made by linear maps. If the
54 * map_domain_page() version is OK on PAE, we could maybe allow a lightweight
55 * l3-and-l2h-only shadow mode for PAE PV guests that would allow them
56 * to share l2h pages again.
57 *
58 * GUEST_WALK_TABLES TLB FLUSH COALESCE
59 * guest_walk_tables can do up to three remote TLB flushes as it walks to
60 * the first l1 of a new pagetable. Should coalesce the flushes to the end,
61 * and if we do flush, re-do the walk. If anything has changed, then
62 * pause all the other vcpus and do the walk *again*.
63 *
64 * PSE disabled / PSE36
65 * We don't support any modes other than PSE enabled, PSE36 disabled.
66 * Neither of those would be hard to change, but we'd need to be able to
67 * deal with shadows made in one mode and used in another.
68 */
70 #define FETCH_TYPE_PREFETCH 1
71 #define FETCH_TYPE_DEMAND 2
72 #define FETCH_TYPE_WRITE 4
73 typedef enum {
74 ft_prefetch = FETCH_TYPE_PREFETCH,
75 ft_demand_read = FETCH_TYPE_DEMAND,
76 ft_demand_write = FETCH_TYPE_DEMAND | FETCH_TYPE_WRITE,
77 } fetch_type_t;
79 #ifdef DEBUG_TRACE_DUMP
80 static char *fetch_type_names[] = {
81 [ft_prefetch] "prefetch",
82 [ft_demand_read] "demand read",
83 [ft_demand_write] "demand write",
84 };
85 #endif
87 /**************************************************************************/
88 /* Hash table mapping from guest pagetables to shadows
89 *
90 * Normal case: maps the mfn of a guest page to the mfn of its shadow page.
91 * FL1's: maps the *gfn* of the start of a superpage to the mfn of a
92 * shadow L1 which maps its "splinters".
93 */
95 static inline mfn_t
96 get_fl1_shadow_status(struct vcpu *v, gfn_t gfn)
97 /* Look for FL1 shadows in the hash table */
98 {
99 mfn_t smfn = shadow_hash_lookup(v, gfn_x(gfn), SH_type_fl1_shadow);
100 return smfn;
101 }
103 static inline mfn_t
104 get_shadow_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type)
105 /* Look for shadows in the hash table */
106 {
107 mfn_t smfn = shadow_hash_lookup(v, mfn_x(gmfn), shadow_type);
108 perfc_incr(shadow_get_shadow_status);
109 return smfn;
110 }
112 static inline void
113 set_fl1_shadow_status(struct vcpu *v, gfn_t gfn, mfn_t smfn)
114 /* Put an FL1 shadow into the hash table */
115 {
116 SHADOW_PRINTK("gfn=%"SH_PRI_gfn", type=%08x, smfn=%05lx\n",
117 gfn_x(gfn), SH_type_fl1_shadow, mfn_x(smfn));
119 shadow_hash_insert(v, gfn_x(gfn), SH_type_fl1_shadow, smfn);
120 }
122 static inline void
123 set_shadow_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type, mfn_t smfn)
124 /* Put a shadow into the hash table */
125 {
126 struct domain *d = v->domain;
127 int res;
129 SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx, type=%08x, smfn=%05lx\n",
130 d->domain_id, v->vcpu_id, mfn_x(gmfn),
131 shadow_type, mfn_x(smfn));
133 /* 32-on-64 PV guests don't own their l4 pages so can't get_page them */
134 if ( !is_pv_32on64_vcpu(v) || shadow_type != SH_type_l4_64_shadow )
135 {
136 res = get_page(mfn_to_page(gmfn), d);
137 ASSERT(res == 1);
138 }
140 shadow_hash_insert(v, mfn_x(gmfn), shadow_type, smfn);
141 }
143 static inline void
144 delete_fl1_shadow_status(struct vcpu *v, gfn_t gfn, mfn_t smfn)
145 /* Remove a shadow from the hash table */
146 {
147 SHADOW_PRINTK("gfn=%"SH_PRI_gfn", type=%08x, smfn=%05lx\n",
148 gfn_x(gfn), SH_type_fl1_shadow, mfn_x(smfn));
149 shadow_hash_delete(v, gfn_x(gfn), SH_type_fl1_shadow, smfn);
150 }
152 static inline void
153 delete_shadow_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type, mfn_t smfn)
154 /* Remove a shadow from the hash table */
155 {
156 SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx, type=%08x, smfn=%05lx\n",
157 v->domain->domain_id, v->vcpu_id,
158 mfn_x(gmfn), shadow_type, mfn_x(smfn));
159 shadow_hash_delete(v, mfn_x(gmfn), shadow_type, smfn);
160 /* 32-on-64 PV guests don't own their l4 pages; see set_shadow_status */
161 if ( !is_pv_32on64_vcpu(v) || shadow_type != SH_type_l4_64_shadow )
162 put_page(mfn_to_page(gmfn));
163 }
165 /**************************************************************************/
166 /* CPU feature support querying */
168 static inline int
169 guest_supports_superpages(struct vcpu *v)
170 {
171 /* The _PAGE_PSE bit must be honoured in HVM guests, whenever
172 * CR4.PSE is set or the guest is in PAE or long mode.
173 * It's also used in the dummy PT for vcpus with CR4.PG cleared. */
174 return (is_hvm_vcpu(v) &&
175 (GUEST_PAGING_LEVELS != 2
176 || !hvm_paging_enabled(v)
177 || (v->arch.hvm_vcpu.guest_cr[4] & X86_CR4_PSE)));
178 }
180 static inline int
181 guest_supports_nx(struct vcpu *v)
182 {
183 if ( GUEST_PAGING_LEVELS == 2 || !cpu_has_nx )
184 return 0;
185 if ( !is_hvm_vcpu(v) )
186 return cpu_has_nx;
187 return hvm_nx_enabled(v);
188 }
191 /**************************************************************************/
192 /* Functions for walking the guest page tables */
194 /* Flags that are needed in a pagetable entry, with the sense of NX inverted */
195 static uint32_t mandatory_flags(struct vcpu *v, uint32_t pfec)
196 {
197 static uint32_t flags[] = {
198 /* I/F - Usr Wr */
199 /* 0 0 0 0 */ _PAGE_PRESENT,
200 /* 0 0 0 1 */ _PAGE_PRESENT|_PAGE_RW,
201 /* 0 0 1 0 */ _PAGE_PRESENT|_PAGE_USER,
202 /* 0 0 1 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER,
203 /* 0 1 0 0 */ _PAGE_PRESENT,
204 /* 0 1 0 1 */ _PAGE_PRESENT|_PAGE_RW,
205 /* 0 1 1 0 */ _PAGE_PRESENT|_PAGE_USER,
206 /* 0 1 1 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER,
207 /* 1 0 0 0 */ _PAGE_PRESENT|_PAGE_NX_BIT,
208 /* 1 0 0 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_NX_BIT,
209 /* 1 0 1 0 */ _PAGE_PRESENT|_PAGE_USER|_PAGE_NX_BIT,
210 /* 1 0 1 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX_BIT,
211 /* 1 1 0 0 */ _PAGE_PRESENT|_PAGE_NX_BIT,
212 /* 1 1 0 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_NX_BIT,
213 /* 1 1 1 0 */ _PAGE_PRESENT|_PAGE_USER|_PAGE_NX_BIT,
214 /* 1 1 1 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX_BIT,
215 };
217 /* Don't demand not-NX if the CPU wouldn't enforce it. */
218 if ( !guest_supports_nx(v) )
219 pfec &= ~PFEC_insn_fetch;
221 /* Don't demand R/W if the CPU wouldn't enforce it. */
222 if ( is_hvm_vcpu(v) && unlikely(!hvm_wp_enabled(v))
223 && !(pfec & PFEC_user_mode) )
224 pfec &= ~PFEC_write_access;
226 return flags[(pfec & 0x1f) >> 1];
227 }
229 /* Modify a guest pagetable entry to set the Accessed and Dirty bits.
230 * Returns non-zero if it actually writes to guest memory. */
231 static uint32_t set_ad_bits(void *guest_p, void *walk_p, int set_dirty)
232 {
233 guest_intpte_t old, new;
235 old = *(guest_intpte_t *)walk_p;
236 new = old | _PAGE_ACCESSED | (set_dirty ? _PAGE_DIRTY : 0);
237 if ( old != new )
238 {
239 /* Write the new entry into the walk, and try to write it back
240 * into the guest table as well. If the guest table has changed
241 * under out feet then leave it alone. */
242 *(guest_intpte_t *)walk_p = new;
243 if ( cmpxchg(((guest_intpte_t *)guest_p), old, new) == old )
244 return 1;
245 }
246 return 0;
247 }
249 /* Walk the guest pagetables, after the manner of a hardware walker.
250 *
251 * Inputs: a vcpu, a virtual address, a walk_t to fill, a
252 * pointer to a pagefault code, and a flag "shadow_op".
253 *
254 * We walk the vcpu's guest pagetables, filling the walk_t with what we
255 * see and adding any Accessed and Dirty bits that are needed in the
256 * guest entries. Using the pagefault code, we check the permissions as
257 * we go. For the purposes of reading pagetables we treat all non-RAM
258 * memory as contining zeroes.
259 *
260 * If "shadow_op" is non-zero, we are serving a genuine guest memory access,
261 * and must (a) be under the shadow lock, and (b) remove write access
262 * from any guest PT pages we see, as we will be shadowing them soon
263 * and will rely on the contents' not having changed.
264 *
265 * Returns 0 for success, or the set of permission bits that we failed on
266 * if the walk did not complete.
267 * N.B. This is different from the old return code but almost no callers
268 * checked the old return code anyway.
269 */
270 static uint32_t
271 guest_walk_tables(struct vcpu *v, unsigned long va, walk_t *gw,
272 uint32_t pfec, int shadow_op)
273 {
274 struct domain *d = v->domain;
275 p2m_type_t p2mt;
276 guest_l1e_t *l1p = NULL;
277 guest_l2e_t *l2p = NULL;
278 #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
279 guest_l3e_t *l3p = NULL;
280 guest_l4e_t *l4p;
281 #endif
282 uint32_t gflags, mflags, rc = 0;
283 int pse;
285 ASSERT(!shadow_op || shadow_locked_by_me(d));
287 perfc_incr(shadow_guest_walk);
288 memset(gw, 0, sizeof(*gw));
289 gw->va = va;
291 /* Mandatory bits that must be set in every entry. We invert NX, to
292 * calculate as if there were an "X" bit that allowed access.
293 * We will accumulate, in rc, the set of flags that are missing. */
294 mflags = mandatory_flags(v, pfec);
296 #if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
297 #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
299 /* Get the l4e from the top level table and check its flags*/
300 gw->l4mfn = pagetable_get_mfn(v->arch.guest_table);
301 l4p = ((guest_l4e_t *)v->arch.paging.shadow.guest_vtable);
302 gw->l4e = l4p[guest_l4_table_offset(va)];
303 gflags = guest_l4e_get_flags(gw->l4e) ^ _PAGE_NX_BIT;
304 rc |= ((gflags & mflags) ^ mflags);
305 if ( rc & _PAGE_PRESENT ) goto out;
307 /* Map the l3 table */
308 gw->l3mfn = gfn_to_mfn(d, guest_l4e_get_gfn(gw->l4e), &p2mt);
309 if ( !p2m_is_ram(p2mt) )
310 {
311 rc |= _PAGE_PRESENT;
312 goto out;
313 }
314 ASSERT(mfn_valid(gw->l3mfn));
315 /* This mfn is a pagetable: make sure the guest can't write to it. */
316 if ( shadow_op && sh_remove_write_access(v, gw->l3mfn, 3, va) != 0 )
317 flush_tlb_mask(d->domain_dirty_cpumask);
318 /* Get the l3e and check its flags*/
319 l3p = sh_map_domain_page(gw->l3mfn);
320 gw->l3e = l3p[guest_l3_table_offset(va)];
321 gflags = guest_l3e_get_flags(gw->l3e) ^ _PAGE_NX_BIT;
322 rc |= ((gflags & mflags) ^ mflags);
323 if ( rc & _PAGE_PRESENT )
324 goto out;
326 #else /* PAE only... */
328 /* Get l3e from the cache of the top level table and check its flag */
329 gw->l3e = v->arch.paging.shadow.gl3e[guest_l3_table_offset(va)];
330 if ( !(guest_l3e_get_flags(gw->l3e) & _PAGE_PRESENT) )
331 {
332 rc |= _PAGE_PRESENT;
333 goto out;
334 }
336 #endif /* PAE or 64... */
338 /* Map the l2 table */
339 gw->l2mfn = gfn_to_mfn(d, guest_l3e_get_gfn(gw->l3e), &p2mt);
340 if ( !p2m_is_ram(p2mt) )
341 {
342 rc |= _PAGE_PRESENT;
343 goto out;
344 }
345 ASSERT(mfn_valid(gw->l2mfn));
346 /* This mfn is a pagetable: make sure the guest can't write to it. */
347 if ( shadow_op && sh_remove_write_access(v, gw->l2mfn, 2, va) != 0 )
348 flush_tlb_mask(d->domain_dirty_cpumask);
349 /* Get the l2e */
350 l2p = sh_map_domain_page(gw->l2mfn);
351 gw->l2e = l2p[guest_l2_table_offset(va)];
353 #else /* 32-bit only... */
355 /* Get l2e from the top level table */
356 gw->l2mfn = pagetable_get_mfn(v->arch.guest_table);
357 l2p = ((guest_l2e_t *)v->arch.paging.shadow.guest_vtable);
358 gw->l2e = l2p[guest_l2_table_offset(va)];
360 #endif /* All levels... */
362 gflags = guest_l2e_get_flags(gw->l2e) ^ _PAGE_NX_BIT;
363 rc |= ((gflags & mflags) ^ mflags);
364 if ( rc & _PAGE_PRESENT )
365 goto out;
367 pse = (guest_supports_superpages(v) &&
368 (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE));
370 if ( pse )
371 {
372 /* Special case: this guest VA is in a PSE superpage, so there's
373 * no guest l1e. We make one up so that the propagation code
374 * can generate a shadow l1 table. Start with the gfn of the
375 * first 4k-page of the superpage. */
376 gfn_t start = guest_l2e_get_gfn(gw->l2e);
377 /* Grant full access in the l1e, since all the guest entry's
378 * access controls are enforced in the shadow l2e. */
379 int flags = (_PAGE_PRESENT|_PAGE_USER|_PAGE_RW|
380 _PAGE_ACCESSED|_PAGE_DIRTY);
381 /* PSE level 2 entries use bit 12 for PAT; propagate it to bit 7
382 * of the level 1. */
383 if ( (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE_PAT) )
384 flags |= _PAGE_PAT;
385 /* Copy the cache-control bits to the l1 as well, because we
386 * can't represent PAT in the (non-PSE) shadow l2e. :(
387 * This could cause problems if a guest ever maps an area of
388 * memory with superpages using more than one caching mode. */
389 flags |= guest_l2e_get_flags(gw->l2e) & (_PAGE_PWT|_PAGE_PCD);
390 /* Increment the pfn by the right number of 4k pages.
391 * The ~0x1 is to mask out the PAT bit mentioned above. */
392 start = _gfn((gfn_x(start) & ~0x1) + guest_l1_table_offset(va));
393 gw->l1e = guest_l1e_from_gfn(start, flags);
394 gw->l1mfn = _mfn(INVALID_MFN);
395 }
396 else
397 {
398 /* Not a superpage: carry on and find the l1e. */
399 gw->l1mfn = gfn_to_mfn(d, guest_l2e_get_gfn(gw->l2e), &p2mt);
400 if ( !p2m_is_ram(p2mt) )
401 {
402 rc |= _PAGE_PRESENT;
403 goto out;
404 }
405 ASSERT(mfn_valid(gw->l1mfn));
406 /* This mfn is a pagetable: make sure the guest can't write to it. */
407 if ( shadow_op
408 && sh_remove_write_access(v, gw->l1mfn, 1, va) != 0 )
409 flush_tlb_mask(d->domain_dirty_cpumask);
410 l1p = sh_map_domain_page(gw->l1mfn);
411 gw->l1e = l1p[guest_l1_table_offset(va)];
412 gflags = guest_l1e_get_flags(gw->l1e) ^ _PAGE_NX_BIT;
413 rc |= ((gflags & mflags) ^ mflags);
414 }
416 /* Go back and set accessed and dirty bits only if the walk was a
417 * success. Although the PRMs say higher-level _PAGE_ACCESSED bits
418 * get set whenever a lower-level PT is used, at least some hardware
419 * walkers behave this way. */
420 if ( rc == 0 )
421 {
422 #if GUEST_PAGING_LEVELS == 4 /* 64-bit only... */
423 if ( set_ad_bits(l4p + guest_l4_table_offset(va), &gw->l4e, 0) )
424 paging_mark_dirty(d, mfn_x(gw->l4mfn));
425 if ( set_ad_bits(l3p + guest_l3_table_offset(va), &gw->l3e, 0) )
426 paging_mark_dirty(d, mfn_x(gw->l3mfn));
427 #endif
428 if ( set_ad_bits(l2p + guest_l2_table_offset(va), &gw->l2e,
429 (pse && (pfec & PFEC_write_access))) )
430 paging_mark_dirty(d, mfn_x(gw->l2mfn));
431 if ( !pse )
432 {
433 if ( set_ad_bits(l1p + guest_l1_table_offset(va), &gw->l1e,
434 (pfec & PFEC_write_access)) )
435 paging_mark_dirty(d, mfn_x(gw->l1mfn));
436 }
437 }
439 out:
440 #if GUEST_PAGING_LEVELS == 4
441 if ( l3p ) sh_unmap_domain_page(l3p);
442 #endif
443 #if GUEST_PAGING_LEVELS >= 3
444 if ( l2p ) sh_unmap_domain_page(l2p);
445 #endif
446 if ( l1p ) sh_unmap_domain_page(l1p);
448 return rc;
449 }
451 /* Given a walk_t, translate the gw->va into the guest's notion of the
452 * corresponding frame number. */
453 static inline gfn_t
454 guest_walk_to_gfn(walk_t *gw)
455 {
456 if ( !(guest_l1e_get_flags(gw->l1e) & _PAGE_PRESENT) )
457 return _gfn(INVALID_GFN);
458 return guest_l1e_get_gfn(gw->l1e);
459 }
461 /* Given a walk_t, translate the gw->va into the guest's notion of the
462 * corresponding physical address. */
463 static inline paddr_t
464 guest_walk_to_gpa(walk_t *gw)
465 {
466 if ( !(guest_l1e_get_flags(gw->l1e) & _PAGE_PRESENT) )
467 return 0;
468 return guest_l1e_get_paddr(gw->l1e) + (gw->va & ~PAGE_MASK);
469 }
471 #if 0 /* Keep for debugging */
472 /* Pretty-print the contents of a guest-walk */
473 static inline void print_gw(walk_t *gw)
474 {
475 SHADOW_PRINTK("GUEST WALK TO %#lx:\n", gw->va);
476 #if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
477 #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
478 SHADOW_PRINTK(" l4mfn=%" PRI_mfn "\n", mfn_x(gw->l4mfn));
479 SHADOW_PRINTK(" l4e=%" SH_PRI_gpte "\n", gw->l4e.l4);
480 SHADOW_PRINTK(" l3mfn=%" PRI_mfn "\n", mfn_x(gw->l3mfn));
481 #endif /* PAE or 64... */
482 SHADOW_PRINTK(" l3e=%" SH_PRI_gpte "\n", gw->l3e.l3);
483 #endif /* All levels... */
484 SHADOW_PRINTK(" l2mfn=%" PRI_mfn "\n", mfn_x(gw->l2mfn));
485 SHADOW_PRINTK(" l2e=%" SH_PRI_gpte "\n", gw->l2e.l2);
486 SHADOW_PRINTK(" l1mfn=%" PRI_mfn "\n", mfn_x(gw->l1mfn));
487 SHADOW_PRINTK(" l1e=%" SH_PRI_gpte "\n", gw->l1e.l1);
488 }
489 #endif /* 0 */
491 #if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES
492 /* Lightweight audit: pass all the shadows associated with this guest walk
493 * through the audit mechanisms */
494 static void sh_audit_gw(struct vcpu *v, walk_t *gw)
495 {
496 mfn_t smfn;
498 if ( !(SHADOW_AUDIT_ENABLE) )
499 return;
501 #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
502 if ( mfn_valid(gw->l4mfn)
503 && mfn_valid((smfn = get_shadow_status(v, gw->l4mfn,
504 SH_type_l4_shadow))) )
505 (void) sh_audit_l4_table(v, smfn, _mfn(INVALID_MFN));
506 if ( mfn_valid(gw->l3mfn)
507 && mfn_valid((smfn = get_shadow_status(v, gw->l3mfn,
508 SH_type_l3_shadow))) )
509 (void) sh_audit_l3_table(v, smfn, _mfn(INVALID_MFN));
510 #endif /* PAE or 64... */
511 if ( mfn_valid(gw->l2mfn) )
512 {
513 if ( mfn_valid((smfn = get_shadow_status(v, gw->l2mfn,
514 SH_type_l2_shadow))) )
515 (void) sh_audit_l2_table(v, smfn, _mfn(INVALID_MFN));
516 #if GUEST_PAGING_LEVELS == 3
517 if ( mfn_valid((smfn = get_shadow_status(v, gw->l2mfn,
518 SH_type_l2h_shadow))) )
519 (void) sh_audit_l2_table(v, smfn, _mfn(INVALID_MFN));
520 #endif
521 }
522 if ( mfn_valid(gw->l1mfn)
523 && mfn_valid((smfn = get_shadow_status(v, gw->l1mfn,
524 SH_type_l1_shadow))) )
525 (void) sh_audit_l1_table(v, smfn, _mfn(INVALID_MFN));
526 else if ( (guest_l2e_get_flags(gw->l2e) & _PAGE_PRESENT)
527 && (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE)
528 && mfn_valid(
529 (smfn = get_fl1_shadow_status(v, guest_l2e_get_gfn(gw->l2e)))) )
530 (void) sh_audit_fl1_table(v, smfn, _mfn(INVALID_MFN));
531 }
533 #else
534 #define sh_audit_gw(_v, _gw) do {} while(0)
535 #endif /* audit code */
538 #if (CONFIG_PAGING_LEVELS == GUEST_PAGING_LEVELS) && (CONFIG_PAGING_LEVELS == SHADOW_PAGING_LEVELS)
539 void *
540 sh_guest_map_l1e(struct vcpu *v, unsigned long addr,
541 unsigned long *gl1mfn)
542 {
543 void *pl1e = NULL;
544 walk_t gw;
546 ASSERT(shadow_mode_translate(v->domain));
548 // XXX -- this is expensive, but it's easy to cobble together...
549 // FIXME!
551 shadow_lock(v->domain);
552 if ( guest_walk_tables(v, addr, &gw, PFEC_page_present, 1) == 0
553 && mfn_valid(gw.l1mfn) )
554 {
555 if ( gl1mfn )
556 *gl1mfn = mfn_x(gw.l1mfn);
557 pl1e = map_domain_page(mfn_x(gw.l1mfn)) +
558 (guest_l1_table_offset(addr) * sizeof(guest_l1e_t));
559 }
561 shadow_unlock(v->domain);
563 return pl1e;
564 }
566 void
567 sh_guest_get_eff_l1e(struct vcpu *v, unsigned long addr, void *eff_l1e)
568 {
569 walk_t gw;
571 ASSERT(shadow_mode_translate(v->domain));
573 // XXX -- this is expensive, but it's easy to cobble together...
574 // FIXME!
576 shadow_lock(v->domain);
577 (void) guest_walk_tables(v, addr, &gw, PFEC_page_present, 1);
578 *(guest_l1e_t *)eff_l1e = gw.l1e;
579 shadow_unlock(v->domain);
580 }
581 #endif /* CONFIG==SHADOW==GUEST */
583 /**************************************************************************/
584 /* Functions to compute the correct index into a shadow page, given an
585 * index into the guest page (as returned by guest_get_index()).
586 * This is trivial when the shadow and guest use the same sized PTEs, but
587 * gets more interesting when those sizes are mismatched (e.g. 32-bit guest,
588 * PAE- or 64-bit shadows).
589 *
590 * These functions also increment the shadow mfn, when necessary. When PTE
591 * sizes are mismatched, it takes 2 shadow L1 pages for a single guest L1
592 * page. In this case, we allocate 2 contiguous pages for the shadow L1, and
593 * use simple pointer arithmetic on a pointer to the guest L1e to figure out
594 * which shadow page we really want. Similarly, when PTE sizes are
595 * mismatched, we shadow a guest L2 page with 4 shadow L2 pages. (The easiest
596 * way to see this is: a 32-bit guest L2 page maps 4GB of virtual address
597 * space, while a PAE- or 64-bit shadow L2 page maps 1GB of virtual address
598 * space.)
599 *
600 * For PAE guests, for every 32-bytes of guest L3 page table, we use 64-bytes
601 * of shadow (to store both the shadow, and the info that would normally be
602 * stored in page_info fields). This arrangement allows the shadow and the
603 * "page_info" fields to always be stored in the same page (in fact, in
604 * the same cache line), avoiding an extra call to map_domain_page().
605 */
607 static inline u32
608 guest_index(void *ptr)
609 {
610 return (u32)((unsigned long)ptr & ~PAGE_MASK) / sizeof(guest_l1e_t);
611 }
613 static u32
614 shadow_l1_index(mfn_t *smfn, u32 guest_index)
615 {
616 #if (GUEST_PAGING_LEVELS == 2) && (SHADOW_PAGING_LEVELS != 2)
617 *smfn = _mfn(mfn_x(*smfn) +
618 (guest_index / SHADOW_L1_PAGETABLE_ENTRIES));
619 return (guest_index % SHADOW_L1_PAGETABLE_ENTRIES);
620 #else
621 return guest_index;
622 #endif
623 }
625 static u32
626 shadow_l2_index(mfn_t *smfn, u32 guest_index)
627 {
628 #if (GUEST_PAGING_LEVELS == 2) && (SHADOW_PAGING_LEVELS != 2)
629 // Because we use 2 shadow l2 entries for each guest entry, the number of
630 // guest entries per shadow page is SHADOW_L2_PAGETABLE_ENTRIES/2
631 //
632 *smfn = _mfn(mfn_x(*smfn) +
633 (guest_index / (SHADOW_L2_PAGETABLE_ENTRIES / 2)));
635 // We multiple by two to get the index of the first of the two entries
636 // used to shadow the specified guest entry.
637 return (guest_index % (SHADOW_L2_PAGETABLE_ENTRIES / 2)) * 2;
638 #else
639 return guest_index;
640 #endif
641 }
643 #if GUEST_PAGING_LEVELS >= 4
645 static u32
646 shadow_l3_index(mfn_t *smfn, u32 guest_index)
647 {
648 return guest_index;
649 }
651 static u32
652 shadow_l4_index(mfn_t *smfn, u32 guest_index)
653 {
654 return guest_index;
655 }
657 #endif // GUEST_PAGING_LEVELS >= 4
659 extern u32 get_pat_flags(struct vcpu *v,
660 u32 gl1e_flags,
661 paddr_t gpaddr,
662 paddr_t spaddr);
664 unsigned char pat_type_2_pte_flags(unsigned char pat_type);
665 /**************************************************************************/
666 /* Function which computes shadow entries from their corresponding guest
667 * entries. This is the "heart" of the shadow code. It operates using
668 * level-1 shadow types, but handles all levels of entry.
669 * Don't call it directly, but use the four wrappers below.
670 */
672 static always_inline void
673 _sh_propagate(struct vcpu *v,
674 guest_intpte_t guest_intpte,
675 mfn_t target_mfn,
676 void *shadow_entry_ptr,
677 int level,
678 fetch_type_t ft,
679 p2m_type_t p2mt)
680 {
681 guest_l1e_t guest_entry = { guest_intpte };
682 shadow_l1e_t *sp = shadow_entry_ptr;
683 struct domain *d = v->domain;
684 gfn_t target_gfn = guest_l1e_get_gfn(guest_entry);
685 u32 pass_thru_flags;
686 u32 gflags, sflags;
688 /* We don't shadow PAE l3s */
689 ASSERT(GUEST_PAGING_LEVELS > 3 || level != 3);
691 /* Check there's something for the shadows to map to */
692 if ( !p2m_is_valid(p2mt) )
693 {
694 *sp = shadow_l1e_empty();
695 goto done;
696 }
698 gflags = guest_l1e_get_flags(guest_entry);
700 if ( unlikely(!(gflags & _PAGE_PRESENT)) )
701 {
702 /* If a guest l1 entry is not present, shadow with the magic
703 * guest-not-present entry. */
704 if ( level == 1 )
705 *sp = sh_l1e_gnp();
706 else
707 *sp = shadow_l1e_empty();
708 goto done;
709 }
711 if ( level == 1 && p2mt == p2m_mmio_dm )
712 {
713 /* Guest l1e maps emulated MMIO space */
714 *sp = sh_l1e_mmio(target_gfn, gflags);
715 if ( !d->arch.paging.shadow.has_fast_mmio_entries )
716 d->arch.paging.shadow.has_fast_mmio_entries = 1;
717 goto done;
718 }
720 // Must have a valid target_mfn unless this is a prefetch or an l1
721 // pointing at MMIO space. In the case of a prefetch, an invalid
722 // mfn means that we can not usefully shadow anything, and so we
723 // return early.
724 //
725 if ( !mfn_valid(target_mfn)
726 && !(level == 1 && (!shadow_mode_refcounts(d)
727 || p2mt == p2m_mmio_direct)) )
728 {
729 ASSERT((ft == ft_prefetch));
730 *sp = shadow_l1e_empty();
731 goto done;
732 }
734 // Propagate bits from the guest to the shadow.
735 // Some of these may be overwritten, below.
736 // Since we know the guest's PRESENT bit is set, we also set the shadow's
737 // SHADOW_PRESENT bit.
738 //
739 pass_thru_flags = (_PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_USER |
740 _PAGE_RW | _PAGE_PRESENT);
741 if ( guest_supports_nx(v) )
742 pass_thru_flags |= _PAGE_NX_BIT;
743 if ( !shadow_mode_refcounts(d) && !mfn_valid(target_mfn) )
744 pass_thru_flags |= _PAGE_PAT | _PAGE_PCD | _PAGE_PWT;
745 sflags = gflags & pass_thru_flags;
747 /*
748 * For HVM domains with direct access to MMIO areas, set the correct
749 * caching attributes in the shadows to match what was asked for.
750 */
751 if ( (level == 1) && is_hvm_domain(d) &&
752 !list_empty(&(domain_hvm_iommu(d)->pdev_list)) &&
753 !is_xen_heap_mfn(mfn_x(target_mfn)) )
754 {
755 unsigned int type;
756 if ( hvm_get_mem_pinned_cacheattr(d, gfn_x(target_gfn), &type) )
757 sflags |= pat_type_2_pte_flags(type);
758 else if ( d->arch.hvm_domain.is_in_uc_mode )
759 sflags |= pat_type_2_pte_flags(PAT_TYPE_UNCACHABLE);
760 else
761 sflags |= get_pat_flags(v,
762 gflags,
763 gfn_to_paddr(target_gfn),
764 mfn_x(target_mfn) << PAGE_SHIFT);
765 }
767 // Set the A&D bits for higher level shadows.
768 // Higher level entries do not, strictly speaking, have dirty bits, but
769 // since we use shadow linear tables, each of these entries may, at some
770 // point in time, also serve as a shadow L1 entry.
771 // By setting both the A&D bits in each of these, we eliminate the burden
772 // on the hardware to update these bits on initial accesses.
773 //
774 if ( (level > 1) && !((SHADOW_PAGING_LEVELS == 3) && (level == 3)) )
775 sflags |= _PAGE_ACCESSED | _PAGE_DIRTY;
777 // If the A or D bit has not yet been set in the guest, then we must
778 // prevent the corresponding kind of access.
779 //
780 if ( unlikely(!(gflags & _PAGE_ACCESSED)) )
781 sflags &= ~_PAGE_PRESENT;
783 /* D bits exist in L1es and PSE L2es */
784 if ( unlikely(((level == 1) ||
785 ((level == 2) &&
786 (gflags & _PAGE_PSE) &&
787 guest_supports_superpages(v)))
788 && !(gflags & _PAGE_DIRTY)) )
789 sflags &= ~_PAGE_RW;
791 // shadow_mode_log_dirty support
792 //
793 // Only allow the guest write access to a page a) on a demand fault,
794 // or b) if the page is already marked as dirty.
795 //
796 // (We handle log-dirty entirely inside the shadow code, without using the
797 // p2m_ram_logdirty p2m type: only HAP uses that.)
798 if ( unlikely((level == 1) && shadow_mode_log_dirty(d)) )
799 {
800 if ( mfn_valid(target_mfn) ) {
801 if ( ft & FETCH_TYPE_WRITE )
802 paging_mark_dirty(d, mfn_x(target_mfn));
803 else if ( !sh_mfn_is_dirty(d, target_mfn) )
804 sflags &= ~_PAGE_RW;
805 }
806 }
808 /* Read-only memory */
809 if ( p2mt == p2m_ram_ro )
810 sflags &= ~_PAGE_RW;
812 // protect guest page tables
813 //
814 if ( unlikely((level == 1) && sh_mfn_is_a_page_table(target_mfn)) )
815 {
816 if ( shadow_mode_trap_reads(d) )
817 {
818 // if we are trapping both reads & writes, then mark this page
819 // as not present...
820 //
821 sflags &= ~_PAGE_PRESENT;
822 }
823 else
824 {
825 // otherwise, just prevent any writes...
826 //
827 sflags &= ~_PAGE_RW;
828 }
829 }
831 // PV guests in 64-bit mode use two different page tables for user vs
832 // supervisor permissions, making the guest's _PAGE_USER bit irrelevant.
833 // It is always shadowed as present...
834 if ( (GUEST_PAGING_LEVELS == 4) && !is_pv_32on64_domain(d)
835 && !is_hvm_domain(d) )
836 {
837 sflags |= _PAGE_USER;
838 }
840 *sp = shadow_l1e_from_mfn(target_mfn, sflags);
842 done:
843 SHADOW_DEBUG(PROPAGATE,
844 "%s level %u guest %" SH_PRI_gpte " shadow %" SH_PRI_pte "\n",
845 fetch_type_names[ft], level, guest_entry.l1, sp->l1);
846 }
849 /* These four wrappers give us a little bit of type-safety back around
850 * the use of void-* pointers and intpte types in _sh_propagate(), and
851 * allow the compiler to optimize out some level checks. */
853 #if GUEST_PAGING_LEVELS >= 4
854 static void
855 l4e_propagate_from_guest(struct vcpu *v,
856 guest_l4e_t gl4e,
857 mfn_t sl3mfn,
858 shadow_l4e_t *sl4e,
859 fetch_type_t ft)
860 {
861 _sh_propagate(v, gl4e.l4, sl3mfn, sl4e, 4, ft, p2m_ram_rw);
862 }
864 static void
865 l3e_propagate_from_guest(struct vcpu *v,
866 guest_l3e_t gl3e,
867 mfn_t sl2mfn,
868 shadow_l3e_t *sl3e,
869 fetch_type_t ft)
870 {
871 _sh_propagate(v, gl3e.l3, sl2mfn, sl3e, 3, ft, p2m_ram_rw);
872 }
873 #endif // GUEST_PAGING_LEVELS >= 4
875 static void
876 l2e_propagate_from_guest(struct vcpu *v,
877 guest_l2e_t gl2e,
878 mfn_t sl1mfn,
879 shadow_l2e_t *sl2e,
880 fetch_type_t ft)
881 {
882 _sh_propagate(v, gl2e.l2, sl1mfn, sl2e, 2, ft, p2m_ram_rw);
883 }
885 static void
886 l1e_propagate_from_guest(struct vcpu *v,
887 guest_l1e_t gl1e,
888 mfn_t gmfn,
889 shadow_l1e_t *sl1e,
890 fetch_type_t ft,
891 p2m_type_t p2mt)
892 {
893 _sh_propagate(v, gl1e.l1, gmfn, sl1e, 1, ft, p2mt);
894 }
897 /**************************************************************************/
898 /* These functions update shadow entries (and do bookkeeping on the shadow
899 * tables they are in). It is intended that they are the only
900 * functions which ever write (non-zero) data onto a shadow page.
901 */
903 static inline void safe_write_entry(void *dst, void *src)
904 /* Copy one PTE safely when processors might be running on the
905 * destination pagetable. This does *not* give safety against
906 * concurrent writes (that's what the shadow lock is for), just
907 * stops the hardware picking up partially written entries. */
908 {
909 volatile unsigned long *d = dst;
910 unsigned long *s = src;
911 ASSERT(!((unsigned long) d & (sizeof (shadow_l1e_t) - 1)));
912 #if CONFIG_PAGING_LEVELS == 3
913 /* In PAE mode, pagetable entries are larger
914 * than machine words, so won't get written atomically. We need to make
915 * sure any other cpu running on these shadows doesn't see a
916 * half-written entry. Do this by marking the entry not-present first,
917 * then writing the high word before the low word. */
918 BUILD_BUG_ON(sizeof (shadow_l1e_t) != 2 * sizeof (unsigned long));
919 d[0] = 0;
920 d[1] = s[1];
921 d[0] = s[0];
922 #else
923 /* In 32-bit and 64-bit, sizeof(pte) == sizeof(ulong) == 1 word,
924 * which will be an atomic write, since the entry is aligned. */
925 BUILD_BUG_ON(sizeof (shadow_l1e_t) != sizeof (unsigned long));
926 *d = *s;
927 #endif
928 }
931 static inline void
932 shadow_write_entries(void *d, void *s, int entries, mfn_t mfn)
933 /* This function does the actual writes to shadow pages.
934 * It must not be called directly, since it doesn't do the bookkeeping
935 * that shadow_set_l*e() functions do. */
936 {
937 shadow_l1e_t *dst = d;
938 shadow_l1e_t *src = s;
939 void *map = NULL;
940 int i;
942 /* Because we mirror access rights at all levels in the shadow, an
943 * l2 (or higher) entry with the RW bit cleared will leave us with
944 * no write access through the linear map.
945 * We detect that by writing to the shadow with copy_to_user() and
946 * using map_domain_page() to get a writeable mapping if we need to. */
947 if ( __copy_to_user(d, d, sizeof (unsigned long)) != 0 )
948 {
949 perfc_incr(shadow_linear_map_failed);
950 map = sh_map_domain_page(mfn);
951 ASSERT(map != NULL);
952 dst = map + ((unsigned long)dst & (PAGE_SIZE - 1));
953 }
956 for ( i = 0; i < entries; i++ )
957 safe_write_entry(dst++, src++);
959 if ( map != NULL ) sh_unmap_domain_page(map);
960 }
962 static inline int
963 perms_strictly_increased(u32 old_flags, u32 new_flags)
964 /* Given the flags of two entries, are the new flags a strict
965 * increase in rights over the old ones? */
966 {
967 u32 of = old_flags & (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX);
968 u32 nf = new_flags & (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX);
969 /* Flip the NX bit, since it's the only one that decreases rights;
970 * we calculate as if it were an "X" bit. */
971 of ^= _PAGE_NX_BIT;
972 nf ^= _PAGE_NX_BIT;
973 /* If the changed bits are all set in the new flags, then rights strictly
974 * increased between old and new. */
975 return ((of | (of ^ nf)) == nf);
976 }
978 static int inline
979 shadow_get_page_from_l1e(shadow_l1e_t sl1e, struct domain *d)
980 {
981 int res;
982 mfn_t mfn;
983 struct domain *owner;
985 ASSERT(!sh_l1e_is_magic(sl1e));
987 if ( !shadow_mode_refcounts(d) )
988 return 1;
990 res = get_page_from_l1e(sl1e, d);
992 // If a privileged domain is attempting to install a map of a page it does
993 // not own, we let it succeed anyway.
994 //
995 if ( unlikely(!res) &&
996 !shadow_mode_translate(d) &&
997 mfn_valid(mfn = shadow_l1e_get_mfn(sl1e)) &&
998 (owner = page_get_owner(mfn_to_page(mfn))) &&
999 (d != owner) &&
1000 IS_PRIV_FOR(d, owner))
1002 res = get_page_from_l1e(sl1e, owner);
1003 SHADOW_PRINTK("privileged domain %d installs map of mfn %05lx "
1004 "which is owned by domain %d: %s\n",
1005 d->domain_id, mfn_x(mfn), owner->domain_id,
1006 res ? "success" : "failed");
1009 if ( unlikely(!res) )
1011 perfc_incr(shadow_get_page_fail);
1012 SHADOW_PRINTK("failed: l1e=" SH_PRI_pte "\n");
1015 return res;
1018 static void inline
1019 shadow_put_page_from_l1e(shadow_l1e_t sl1e, struct domain *d)
1021 if ( !shadow_mode_refcounts(d) )
1022 return;
1024 put_page_from_l1e(sl1e, d);
1027 #if GUEST_PAGING_LEVELS >= 4
1028 static int shadow_set_l4e(struct vcpu *v,
1029 shadow_l4e_t *sl4e,
1030 shadow_l4e_t new_sl4e,
1031 mfn_t sl4mfn)
1033 int flags = 0, ok;
1034 shadow_l4e_t old_sl4e;
1035 paddr_t paddr;
1036 ASSERT(sl4e != NULL);
1037 old_sl4e = *sl4e;
1039 if ( old_sl4e.l4 == new_sl4e.l4 ) return 0; /* Nothing to do */
1041 paddr = ((((paddr_t)mfn_x(sl4mfn)) << PAGE_SHIFT)
1042 | (((unsigned long)sl4e) & ~PAGE_MASK));
1044 if ( shadow_l4e_get_flags(new_sl4e) & _PAGE_PRESENT )
1046 /* About to install a new reference */
1047 mfn_t sl3mfn = shadow_l4e_get_mfn(new_sl4e);
1048 ok = sh_get_ref(v, sl3mfn, paddr);
1049 /* Are we pinning l3 shadows to handle wierd linux behaviour? */
1050 if ( sh_type_is_pinnable(v, SH_type_l3_64_shadow) )
1051 ok |= sh_pin(v, sl3mfn);
1052 if ( !ok )
1054 domain_crash(v->domain);
1055 return SHADOW_SET_ERROR;
1059 /* Write the new entry */
1060 shadow_write_entries(sl4e, &new_sl4e, 1, sl4mfn);
1061 flags |= SHADOW_SET_CHANGED;
1063 if ( shadow_l4e_get_flags(old_sl4e) & _PAGE_PRESENT )
1065 /* We lost a reference to an old mfn. */
1066 mfn_t osl3mfn = shadow_l4e_get_mfn(old_sl4e);
1067 if ( (mfn_x(osl3mfn) != mfn_x(shadow_l4e_get_mfn(new_sl4e)))
1068 || !perms_strictly_increased(shadow_l4e_get_flags(old_sl4e),
1069 shadow_l4e_get_flags(new_sl4e)) )
1071 flags |= SHADOW_SET_FLUSH;
1073 sh_put_ref(v, osl3mfn, paddr);
1075 return flags;
1078 static int shadow_set_l3e(struct vcpu *v,
1079 shadow_l3e_t *sl3e,
1080 shadow_l3e_t new_sl3e,
1081 mfn_t sl3mfn)
1083 int flags = 0;
1084 shadow_l3e_t old_sl3e;
1085 paddr_t paddr;
1086 ASSERT(sl3e != NULL);
1087 old_sl3e = *sl3e;
1089 if ( old_sl3e.l3 == new_sl3e.l3 ) return 0; /* Nothing to do */
1091 paddr = ((((paddr_t)mfn_x(sl3mfn)) << PAGE_SHIFT)
1092 | (((unsigned long)sl3e) & ~PAGE_MASK));
1094 if ( shadow_l3e_get_flags(new_sl3e) & _PAGE_PRESENT )
1095 /* About to install a new reference */
1096 if ( !sh_get_ref(v, shadow_l3e_get_mfn(new_sl3e), paddr) )
1098 domain_crash(v->domain);
1099 return SHADOW_SET_ERROR;
1102 /* Write the new entry */
1103 shadow_write_entries(sl3e, &new_sl3e, 1, sl3mfn);
1104 flags |= SHADOW_SET_CHANGED;
1106 if ( shadow_l3e_get_flags(old_sl3e) & _PAGE_PRESENT )
1108 /* We lost a reference to an old mfn. */
1109 mfn_t osl2mfn = shadow_l3e_get_mfn(old_sl3e);
1110 if ( (mfn_x(osl2mfn) != mfn_x(shadow_l3e_get_mfn(new_sl3e))) ||
1111 !perms_strictly_increased(shadow_l3e_get_flags(old_sl3e),
1112 shadow_l3e_get_flags(new_sl3e)) )
1114 flags |= SHADOW_SET_FLUSH;
1116 sh_put_ref(v, osl2mfn, paddr);
1118 return flags;
1120 #endif /* GUEST_PAGING_LEVELS >= 4 */
1122 static int shadow_set_l2e(struct vcpu *v,
1123 shadow_l2e_t *sl2e,
1124 shadow_l2e_t new_sl2e,
1125 mfn_t sl2mfn)
1127 int flags = 0;
1128 shadow_l2e_t old_sl2e;
1129 paddr_t paddr;
1131 #if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2
1132 /* In 2-on-3 we work with pairs of l2es pointing at two-page
1133 * shadows. Reference counting and up-pointers track from the first
1134 * page of the shadow to the first l2e, so make sure that we're
1135 * working with those:
1136 * Align the pointer down so it's pointing at the first of the pair */
1137 sl2e = (shadow_l2e_t *)((unsigned long)sl2e & ~(sizeof(shadow_l2e_t)));
1138 /* Align the mfn of the shadow entry too */
1139 new_sl2e.l2 &= ~(1<<PAGE_SHIFT);
1140 #endif
1142 ASSERT(sl2e != NULL);
1143 old_sl2e = *sl2e;
1145 if ( old_sl2e.l2 == new_sl2e.l2 ) return 0; /* Nothing to do */
1147 paddr = ((((paddr_t)mfn_x(sl2mfn)) << PAGE_SHIFT)
1148 | (((unsigned long)sl2e) & ~PAGE_MASK));
1150 if ( shadow_l2e_get_flags(new_sl2e) & _PAGE_PRESENT )
1151 /* About to install a new reference */
1152 if ( !sh_get_ref(v, shadow_l2e_get_mfn(new_sl2e), paddr) )
1154 domain_crash(v->domain);
1155 return SHADOW_SET_ERROR;
1158 /* Write the new entry */
1159 #if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2
1161 shadow_l2e_t pair[2] = { new_sl2e, new_sl2e };
1162 /* The l1 shadow is two pages long and need to be pointed to by
1163 * two adjacent l1es. The pair have the same flags, but point
1164 * at odd and even MFNs */
1165 ASSERT(!(pair[0].l2 & (1<<PAGE_SHIFT)));
1166 pair[1].l2 |= (1<<PAGE_SHIFT);
1167 shadow_write_entries(sl2e, &pair, 2, sl2mfn);
1169 #else /* normal case */
1170 shadow_write_entries(sl2e, &new_sl2e, 1, sl2mfn);
1171 #endif
1172 flags |= SHADOW_SET_CHANGED;
1174 if ( shadow_l2e_get_flags(old_sl2e) & _PAGE_PRESENT )
1176 /* We lost a reference to an old mfn. */
1177 mfn_t osl1mfn = shadow_l2e_get_mfn(old_sl2e);
1178 if ( (mfn_x(osl1mfn) != mfn_x(shadow_l2e_get_mfn(new_sl2e))) ||
1179 !perms_strictly_increased(shadow_l2e_get_flags(old_sl2e),
1180 shadow_l2e_get_flags(new_sl2e)) )
1182 flags |= SHADOW_SET_FLUSH;
1184 sh_put_ref(v, osl1mfn, paddr);
1186 return flags;
1189 static int shadow_set_l1e(struct vcpu *v,
1190 shadow_l1e_t *sl1e,
1191 shadow_l1e_t new_sl1e,
1192 mfn_t sl1mfn)
1194 int flags = 0;
1195 struct domain *d = v->domain;
1196 shadow_l1e_t old_sl1e;
1197 ASSERT(sl1e != NULL);
1199 old_sl1e = *sl1e;
1201 if ( old_sl1e.l1 == new_sl1e.l1 ) return 0; /* Nothing to do */
1203 if ( (shadow_l1e_get_flags(new_sl1e) & _PAGE_PRESENT)
1204 && !sh_l1e_is_magic(new_sl1e) )
1206 /* About to install a new reference */
1207 if ( shadow_mode_refcounts(d) ) {
1208 if ( shadow_get_page_from_l1e(new_sl1e, d) == 0 )
1210 /* Doesn't look like a pagetable. */
1211 flags |= SHADOW_SET_ERROR;
1212 new_sl1e = shadow_l1e_empty();
1217 /* Write the new entry */
1218 shadow_write_entries(sl1e, &new_sl1e, 1, sl1mfn);
1219 flags |= SHADOW_SET_CHANGED;
1221 if ( (shadow_l1e_get_flags(old_sl1e) & _PAGE_PRESENT)
1222 && !sh_l1e_is_magic(old_sl1e) )
1224 /* We lost a reference to an old mfn. */
1225 /* N.B. Unlike higher-level sets, never need an extra flush
1226 * when writing an l1e. Because it points to the same guest frame
1227 * as the guest l1e did, it's the guest's responsibility to
1228 * trigger a flush later. */
1229 if ( shadow_mode_refcounts(d) )
1231 shadow_put_page_from_l1e(old_sl1e, d);
1234 return flags;
1238 /**************************************************************************/
1239 /* Macros to walk pagetables. These take the shadow of a pagetable and
1240 * walk every "interesting" entry. That is, they don't touch Xen mappings,
1241 * and for 32-bit l2s shadowed onto PAE or 64-bit, they only touch every
1242 * second entry (since pairs of entries are managed together). For multi-page
1243 * shadows they walk all pages.
1245 * Arguments are an MFN, the variable to point to each entry, a variable
1246 * to indicate that we are done (we will shortcut to the end of the scan
1247 * when _done != 0), a variable to indicate that we should avoid Xen mappings,
1248 * and the code.
1250 * WARNING: These macros have side-effects. They change the values of both
1251 * the pointer and the MFN. */
1253 static inline void increment_ptr_to_guest_entry(void *ptr)
1255 if ( ptr )
1257 guest_l1e_t **entry = ptr;
1258 (*entry)++;
1262 /* All kinds of l1: touch all entries */
1263 #define _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code) \
1264 do { \
1265 int _i; \
1266 shadow_l1e_t *_sp = map_shadow_page((_sl1mfn)); \
1267 ASSERT(mfn_to_shadow_page(_sl1mfn)->type == SH_type_l1_shadow \
1268 || mfn_to_shadow_page(_sl1mfn)->type == SH_type_fl1_shadow); \
1269 for ( _i = 0; _i < SHADOW_L1_PAGETABLE_ENTRIES; _i++ ) \
1270 { \
1271 (_sl1e) = _sp + _i; \
1272 if ( shadow_l1e_get_flags(*(_sl1e)) & _PAGE_PRESENT ) \
1273 {_code} \
1274 if ( _done ) break; \
1275 increment_ptr_to_guest_entry(_gl1p); \
1276 } \
1277 unmap_shadow_page(_sp); \
1278 } while (0)
1280 /* 32-bit l1, on PAE or 64-bit shadows: need to walk both pages of shadow */
1281 #if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2
1282 #define SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code) \
1283 do { \
1284 int __done = 0; \
1285 _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, \
1286 ({ (__done = _done); }), _code); \
1287 _sl1mfn = _mfn(mfn_x(_sl1mfn) + 1); \
1288 if ( !__done ) \
1289 _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, \
1290 ({ (__done = _done); }), _code); \
1291 } while (0)
1292 #else /* Everything else; l1 shadows are only one page */
1293 #define SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code) \
1294 _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code)
1295 #endif
1298 #if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2
1300 /* 32-bit l2 on PAE/64: four pages, touch every second entry, and avoid Xen */
1301 #define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _dom, _code) \
1302 do { \
1303 int _i, _j, __done = 0; \
1304 int _xen = !shadow_mode_external(_dom); \
1305 ASSERT(mfn_to_shadow_page(_sl2mfn)->type == SH_type_l2_32_shadow); \
1306 for ( _j = 0; _j < 4 && !__done; _j++ ) \
1307 { \
1308 shadow_l2e_t *_sp = map_shadow_page(_sl2mfn); \
1309 for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i += 2 ) \
1310 if ( (!(_xen)) \
1311 || ((_j * SHADOW_L2_PAGETABLE_ENTRIES) + _i) \
1312 < (HYPERVISOR_VIRT_START >> SHADOW_L2_PAGETABLE_SHIFT) ) \
1313 { \
1314 (_sl2e) = _sp + _i; \
1315 if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \
1316 {_code} \
1317 if ( (__done = (_done)) ) break; \
1318 increment_ptr_to_guest_entry(_gl2p); \
1319 } \
1320 unmap_shadow_page(_sp); \
1321 _sl2mfn = _mfn(mfn_x(_sl2mfn) + 1); \
1322 } \
1323 } while (0)
1325 #elif GUEST_PAGING_LEVELS == 2
1327 /* 32-bit on 32-bit: avoid Xen entries */
1328 #define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _dom, _code) \
1329 do { \
1330 int _i; \
1331 int _xen = !shadow_mode_external(_dom); \
1332 shadow_l2e_t *_sp = map_shadow_page((_sl2mfn)); \
1333 ASSERT(mfn_to_shadow_page(_sl2mfn)->type == SH_type_l2_32_shadow); \
1334 for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i++ ) \
1335 if ( (!(_xen)) \
1336 || \
1337 (_i < (HYPERVISOR_VIRT_START >> SHADOW_L2_PAGETABLE_SHIFT)) ) \
1338 { \
1339 (_sl2e) = _sp + _i; \
1340 if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \
1341 {_code} \
1342 if ( _done ) break; \
1343 increment_ptr_to_guest_entry(_gl2p); \
1344 } \
1345 unmap_shadow_page(_sp); \
1346 } while (0)
1348 #elif GUEST_PAGING_LEVELS == 3
1350 /* PAE: if it's an l2h, don't touch Xen mappings */
1351 #define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _dom, _code) \
1352 do { \
1353 int _i; \
1354 int _xen = !shadow_mode_external(_dom); \
1355 shadow_l2e_t *_sp = map_shadow_page((_sl2mfn)); \
1356 ASSERT(mfn_to_shadow_page(_sl2mfn)->type == SH_type_l2_pae_shadow \
1357 || mfn_to_shadow_page(_sl2mfn)->type == SH_type_l2h_pae_shadow);\
1358 for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i++ ) \
1359 if ( (!(_xen)) \
1360 || mfn_to_shadow_page(_sl2mfn)->type != SH_type_l2h_pae_shadow\
1361 || ((_i + (3 * SHADOW_L2_PAGETABLE_ENTRIES)) \
1362 < (HYPERVISOR_VIRT_START >> SHADOW_L2_PAGETABLE_SHIFT)) ) \
1363 { \
1364 (_sl2e) = _sp + _i; \
1365 if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \
1366 {_code} \
1367 if ( _done ) break; \
1368 increment_ptr_to_guest_entry(_gl2p); \
1369 } \
1370 unmap_shadow_page(_sp); \
1371 } while (0)
1373 #else
1375 /* 64-bit l2: touch all entries except for PAE compat guests. */
1376 #define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _dom, _code) \
1377 do { \
1378 int _i; \
1379 int _xen = !shadow_mode_external(_dom); \
1380 shadow_l2e_t *_sp = map_shadow_page((_sl2mfn)); \
1381 ASSERT(mfn_to_shadow_page(_sl2mfn)->type == SH_type_l2_64_shadow || \
1382 mfn_to_shadow_page(_sl2mfn)->type == SH_type_l2h_64_shadow); \
1383 for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i++ ) \
1384 { \
1385 if ( (!(_xen)) \
1386 || !is_pv_32on64_domain(_dom) \
1387 || mfn_to_shadow_page(_sl2mfn)->type != SH_type_l2h_64_shadow \
1388 || (_i < COMPAT_L2_PAGETABLE_FIRST_XEN_SLOT(_dom)) ) \
1389 { \
1390 (_sl2e) = _sp + _i; \
1391 if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \
1392 {_code} \
1393 if ( _done ) break; \
1394 increment_ptr_to_guest_entry(_gl2p); \
1395 } \
1396 } \
1397 unmap_shadow_page(_sp); \
1398 } while (0)
1400 #endif /* different kinds of l2 */
1402 #if GUEST_PAGING_LEVELS == 4
1404 /* 64-bit l3: touch all entries */
1405 #define SHADOW_FOREACH_L3E(_sl3mfn, _sl3e, _gl3p, _done, _code) \
1406 do { \
1407 int _i; \
1408 shadow_l3e_t *_sp = map_shadow_page((_sl3mfn)); \
1409 ASSERT(mfn_to_shadow_page(_sl3mfn)->type == SH_type_l3_64_shadow); \
1410 for ( _i = 0; _i < SHADOW_L3_PAGETABLE_ENTRIES; _i++ ) \
1411 { \
1412 (_sl3e) = _sp + _i; \
1413 if ( shadow_l3e_get_flags(*(_sl3e)) & _PAGE_PRESENT ) \
1414 {_code} \
1415 if ( _done ) break; \
1416 increment_ptr_to_guest_entry(_gl3p); \
1417 } \
1418 unmap_shadow_page(_sp); \
1419 } while (0)
1421 /* 64-bit l4: avoid Xen mappings */
1422 #define SHADOW_FOREACH_L4E(_sl4mfn, _sl4e, _gl4p, _done, _dom, _code) \
1423 do { \
1424 shadow_l4e_t *_sp = map_shadow_page((_sl4mfn)); \
1425 int _xen = !shadow_mode_external(_dom); \
1426 int _i; \
1427 ASSERT(mfn_to_shadow_page(_sl4mfn)->type == SH_type_l4_64_shadow); \
1428 for ( _i = 0; _i < SHADOW_L4_PAGETABLE_ENTRIES; _i++ ) \
1429 { \
1430 if ( (!(_xen)) || is_guest_l4_slot(_dom, _i) ) \
1431 { \
1432 (_sl4e) = _sp + _i; \
1433 if ( shadow_l4e_get_flags(*(_sl4e)) & _PAGE_PRESENT ) \
1434 {_code} \
1435 if ( _done ) break; \
1436 } \
1437 increment_ptr_to_guest_entry(_gl4p); \
1438 } \
1439 unmap_shadow_page(_sp); \
1440 } while (0)
1442 #endif
1446 /**************************************************************************/
1447 /* Functions to install Xen mappings and linear mappings in shadow pages */
1449 // XXX -- this function should probably be moved to shadow-common.c, but that
1450 // probably wants to wait until the shadow types have been moved from
1451 // shadow-types.h to shadow-private.h
1452 //
1453 #if CONFIG_PAGING_LEVELS == 4 && GUEST_PAGING_LEVELS == 4
1454 void sh_install_xen_entries_in_l4(struct vcpu *v, mfn_t gl4mfn, mfn_t sl4mfn)
1456 struct domain *d = v->domain;
1457 shadow_l4e_t *sl4e;
1459 sl4e = sh_map_domain_page(sl4mfn);
1460 ASSERT(sl4e != NULL);
1461 ASSERT(sizeof (l4_pgentry_t) == sizeof (shadow_l4e_t));
1463 /* Copy the common Xen mappings from the idle domain */
1464 memcpy(&sl4e[ROOT_PAGETABLE_FIRST_XEN_SLOT],
1465 &idle_pg_table[ROOT_PAGETABLE_FIRST_XEN_SLOT],
1466 ROOT_PAGETABLE_XEN_SLOTS * sizeof(l4_pgentry_t));
1468 /* Install the per-domain mappings for this domain */
1469 sl4e[shadow_l4_table_offset(PERDOMAIN_VIRT_START)] =
1470 shadow_l4e_from_mfn(page_to_mfn(virt_to_page(d->arch.mm_perdomain_l3)),
1471 __PAGE_HYPERVISOR);
1473 /* Shadow linear mapping for 4-level shadows. N.B. for 3-level
1474 * shadows on 64-bit xen, this linear mapping is later replaced by the
1475 * monitor pagetable structure, which is built in make_monitor_table
1476 * and maintained by sh_update_linear_entries. */
1477 sl4e[shadow_l4_table_offset(SH_LINEAR_PT_VIRT_START)] =
1478 shadow_l4e_from_mfn(sl4mfn, __PAGE_HYPERVISOR);
1480 /* Self linear mapping. */
1481 if ( shadow_mode_translate(v->domain) && !shadow_mode_external(v->domain) )
1483 // linear tables may not be used with translated PV guests
1484 sl4e[shadow_l4_table_offset(LINEAR_PT_VIRT_START)] =
1485 shadow_l4e_empty();
1487 else
1489 sl4e[shadow_l4_table_offset(LINEAR_PT_VIRT_START)] =
1490 shadow_l4e_from_mfn(gl4mfn, __PAGE_HYPERVISOR);
1493 if ( shadow_mode_translate(v->domain) )
1495 /* install domain-specific P2M table */
1496 sl4e[shadow_l4_table_offset(RO_MPT_VIRT_START)] =
1497 shadow_l4e_from_mfn(pagetable_get_mfn(d->arch.phys_table),
1498 __PAGE_HYPERVISOR);
1501 if ( is_pv_32on64_domain(v->domain) )
1503 /* install compat arg xlat entry */
1504 sl4e[shadow_l4_table_offset(COMPAT_ARG_XLAT_VIRT_BASE)] =
1505 shadow_l4e_from_mfn(
1506 page_to_mfn(virt_to_page(d->arch.mm_arg_xlat_l3)),
1507 __PAGE_HYPERVISOR);
1510 sh_unmap_domain_page(sl4e);
1512 #endif
1514 #if CONFIG_PAGING_LEVELS >= 3 && GUEST_PAGING_LEVELS >= 3
1515 // For 3-on-3 PV guests, we need to make sure the xen mappings are in
1516 // place, which means that we need to populate the l2h entry in the l3
1517 // table.
1519 static void sh_install_xen_entries_in_l2h(struct vcpu *v, mfn_t sl2hmfn)
1521 struct domain *d = v->domain;
1522 shadow_l2e_t *sl2e;
1523 #if CONFIG_PAGING_LEVELS == 3
1524 int i;
1525 #else
1527 if ( !is_pv_32on64_vcpu(v) )
1528 return;
1529 #endif
1531 sl2e = sh_map_domain_page(sl2hmfn);
1532 ASSERT(sl2e != NULL);
1533 ASSERT(sizeof (l2_pgentry_t) == sizeof (shadow_l2e_t));
1535 #if CONFIG_PAGING_LEVELS == 3
1537 /* Copy the common Xen mappings from the idle domain */
1538 memcpy(&sl2e[L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1)],
1539 &idle_pg_table_l2[L2_PAGETABLE_FIRST_XEN_SLOT],
1540 L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
1542 /* Install the per-domain mappings for this domain */
1543 for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
1544 sl2e[shadow_l2_table_offset(PERDOMAIN_VIRT_START) + i] =
1545 shadow_l2e_from_mfn(
1546 page_to_mfn(virt_to_page(d->arch.mm_perdomain_pt) + i),
1547 __PAGE_HYPERVISOR);
1549 /* We don't set up a linear mapping here because we can't until this
1550 * l2h is installed in an l3e. sh_update_linear_entries() handles
1551 * the linear mappings when CR3 (and so the fourth l3e) is loaded.
1552 * We zero them here, just as a safety measure.
1553 */
1554 for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
1555 sl2e[shadow_l2_table_offset(LINEAR_PT_VIRT_START) + i] =
1556 shadow_l2e_empty();
1557 for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
1558 sl2e[shadow_l2_table_offset(SH_LINEAR_PT_VIRT_START) + i] =
1559 shadow_l2e_empty();
1561 if ( shadow_mode_translate(d) )
1563 /* Install the domain-specific p2m table */
1564 l3_pgentry_t *p2m;
1565 ASSERT(pagetable_get_pfn(d->arch.phys_table) != 0);
1566 p2m = sh_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
1567 for ( i = 0; i < MACHPHYS_MBYTES>>1; i++ )
1569 sl2e[shadow_l2_table_offset(RO_MPT_VIRT_START) + i] =
1570 (l3e_get_flags(p2m[i]) & _PAGE_PRESENT)
1571 ? shadow_l2e_from_mfn(_mfn(l3e_get_pfn(p2m[i])),
1572 __PAGE_HYPERVISOR)
1573 : shadow_l2e_empty();
1575 sh_unmap_domain_page(p2m);
1578 #else
1580 /* Copy the common Xen mappings from the idle domain */
1581 memcpy(
1582 &sl2e[COMPAT_L2_PAGETABLE_FIRST_XEN_SLOT(d)],
1583 &compat_idle_pg_table_l2[l2_table_offset(HIRO_COMPAT_MPT_VIRT_START)],
1584 COMPAT_L2_PAGETABLE_XEN_SLOTS(d) * sizeof(*sl2e));
1586 #endif
1588 sh_unmap_domain_page(sl2e);
1590 #endif
1593 #if CONFIG_PAGING_LEVELS == 2 && GUEST_PAGING_LEVELS == 2
1594 void sh_install_xen_entries_in_l2(struct vcpu *v, mfn_t gl2mfn, mfn_t sl2mfn)
1596 struct domain *d = v->domain;
1597 shadow_l2e_t *sl2e;
1598 int i;
1600 sl2e = sh_map_domain_page(sl2mfn);
1601 ASSERT(sl2e != NULL);
1602 ASSERT(sizeof (l2_pgentry_t) == sizeof (shadow_l2e_t));
1604 /* Copy the common Xen mappings from the idle domain */
1605 memcpy(&sl2e[L2_PAGETABLE_FIRST_XEN_SLOT],
1606 &idle_pg_table[L2_PAGETABLE_FIRST_XEN_SLOT],
1607 L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
1609 /* Install the per-domain mappings for this domain */
1610 for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
1611 sl2e[shadow_l2_table_offset(PERDOMAIN_VIRT_START) + i] =
1612 shadow_l2e_from_mfn(
1613 page_to_mfn(virt_to_page(d->arch.mm_perdomain_pt) + i),
1614 __PAGE_HYPERVISOR);
1616 /* Linear mapping */
1617 sl2e[shadow_l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
1618 shadow_l2e_from_mfn(sl2mfn, __PAGE_HYPERVISOR);
1620 if ( shadow_mode_translate(v->domain) && !shadow_mode_external(v->domain) )
1622 // linear tables may not be used with translated PV guests
1623 sl2e[shadow_l2_table_offset(LINEAR_PT_VIRT_START)] =
1624 shadow_l2e_empty();
1626 else
1628 sl2e[shadow_l2_table_offset(LINEAR_PT_VIRT_START)] =
1629 shadow_l2e_from_mfn(gl2mfn, __PAGE_HYPERVISOR);
1632 if ( shadow_mode_translate(d) )
1634 /* install domain-specific P2M table */
1635 sl2e[shadow_l2_table_offset(RO_MPT_VIRT_START)] =
1636 shadow_l2e_from_mfn(pagetable_get_mfn(d->arch.phys_table),
1637 __PAGE_HYPERVISOR);
1640 sh_unmap_domain_page(sl2e);
1642 #endif
1646 /**************************************************************************/
1647 /* Create a shadow of a given guest page.
1648 */
1649 static mfn_t
1650 sh_make_shadow(struct vcpu *v, mfn_t gmfn, u32 shadow_type)
1652 mfn_t smfn = shadow_alloc(v->domain, shadow_type, mfn_x(gmfn));
1653 SHADOW_DEBUG(MAKE_SHADOW, "(%05lx, %u)=>%05lx\n",
1654 mfn_x(gmfn), shadow_type, mfn_x(smfn));
1656 if ( shadow_type != SH_type_l2_32_shadow
1657 && shadow_type != SH_type_l2_pae_shadow
1658 && shadow_type != SH_type_l2h_pae_shadow
1659 && shadow_type != SH_type_l4_64_shadow )
1660 /* Lower-level shadow, not yet linked form a higher level */
1661 mfn_to_shadow_page(smfn)->up = 0;
1663 #if GUEST_PAGING_LEVELS == 4
1664 #if (SHADOW_OPTIMIZATIONS & SHOPT_LINUX_L3_TOPLEVEL)
1665 if ( shadow_type == SH_type_l4_64_shadow &&
1666 unlikely(v->domain->arch.paging.shadow.opt_flags & SHOPT_LINUX_L3_TOPLEVEL) )
1668 /* We're shadowing a new l4, but we've been assuming the guest uses
1669 * only one l4 per vcpu and context switches using an l4 entry.
1670 * Count the number of active l4 shadows. If there are enough
1671 * of them, decide that this isn't an old linux guest, and stop
1672 * pinning l3es. This is not very quick but it doesn't happen
1673 * very often. */
1674 struct list_head *l, *t;
1675 struct shadow_page_info *sp;
1676 struct vcpu *v2;
1677 int l4count = 0, vcpus = 0;
1678 list_for_each(l, &v->domain->arch.paging.shadow.pinned_shadows)
1680 sp = list_entry(l, struct shadow_page_info, list);
1681 if ( sp->type == SH_type_l4_64_shadow )
1682 l4count++;
1684 for_each_vcpu ( v->domain, v2 )
1685 vcpus++;
1686 if ( l4count > 2 * vcpus )
1688 /* Unpin all the pinned l3 tables, and don't pin any more. */
1689 list_for_each_safe(l, t, &v->domain->arch.paging.shadow.pinned_shadows)
1691 sp = list_entry(l, struct shadow_page_info, list);
1692 if ( sp->type == SH_type_l3_64_shadow )
1693 sh_unpin(v, shadow_page_to_mfn(sp));
1695 v->domain->arch.paging.shadow.opt_flags &= ~SHOPT_LINUX_L3_TOPLEVEL;
1698 #endif
1699 #endif
1701 // Create the Xen mappings...
1702 if ( !shadow_mode_external(v->domain) )
1704 switch (shadow_type)
1706 #if CONFIG_PAGING_LEVELS == 4 && GUEST_PAGING_LEVELS == 4
1707 case SH_type_l4_shadow:
1708 sh_install_xen_entries_in_l4(v, gmfn, smfn); break;
1709 #endif
1710 #if CONFIG_PAGING_LEVELS >= 3 && GUEST_PAGING_LEVELS >= 3
1711 case SH_type_l2h_shadow:
1712 sh_install_xen_entries_in_l2h(v, smfn); break;
1713 #endif
1714 #if CONFIG_PAGING_LEVELS == 2 && GUEST_PAGING_LEVELS == 2
1715 case SH_type_l2_shadow:
1716 sh_install_xen_entries_in_l2(v, gmfn, smfn); break;
1717 #endif
1718 default: /* Do nothing */ break;
1722 shadow_promote(v, gmfn, shadow_type);
1723 set_shadow_status(v, gmfn, shadow_type, smfn);
1725 return smfn;
1728 /* Make a splintered superpage shadow */
1729 static mfn_t
1730 make_fl1_shadow(struct vcpu *v, gfn_t gfn)
1732 mfn_t smfn = shadow_alloc(v->domain, SH_type_fl1_shadow,
1733 (unsigned long) gfn_x(gfn));
1735 SHADOW_DEBUG(MAKE_SHADOW, "(%" SH_PRI_gfn ")=>%" PRI_mfn "\n",
1736 gfn_x(gfn), mfn_x(smfn));
1738 set_fl1_shadow_status(v, gfn, smfn);
1739 return smfn;
1743 #if SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS
1744 mfn_t
1745 sh_make_monitor_table(struct vcpu *v)
1747 struct domain *d = v->domain;
1749 ASSERT(pagetable_get_pfn(v->arch.monitor_table) == 0);
1751 /* Guarantee we can get the memory we need */
1752 shadow_prealloc(d, SH_type_monitor_table, CONFIG_PAGING_LEVELS);
1754 #if CONFIG_PAGING_LEVELS == 4
1756 mfn_t m4mfn;
1757 m4mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1758 sh_install_xen_entries_in_l4(v, m4mfn, m4mfn);
1759 /* Remember the level of this table */
1760 mfn_to_page(m4mfn)->shadow_flags = 4;
1761 #if SHADOW_PAGING_LEVELS < 4
1763 mfn_t m3mfn, m2mfn;
1764 l4_pgentry_t *l4e;
1765 l3_pgentry_t *l3e;
1766 /* Install an l3 table and an l2 table that will hold the shadow
1767 * linear map entries. This overrides the linear map entry that
1768 * was installed by sh_install_xen_entries_in_l4. */
1769 l4e = sh_map_domain_page(m4mfn);
1771 m3mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1772 mfn_to_page(m3mfn)->shadow_flags = 3;
1773 l4e[shadow_l4_table_offset(SH_LINEAR_PT_VIRT_START)]
1774 = l4e_from_pfn(mfn_x(m3mfn), __PAGE_HYPERVISOR);
1776 m2mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1777 mfn_to_page(m2mfn)->shadow_flags = 2;
1778 l3e = sh_map_domain_page(m3mfn);
1779 l3e[0] = l3e_from_pfn(mfn_x(m2mfn), __PAGE_HYPERVISOR);
1780 sh_unmap_domain_page(l3e);
1782 if ( is_pv_32on64_vcpu(v) )
1784 /* For 32-on-64 PV guests, we need to map the 32-bit Xen
1785 * area into its usual VAs in the monitor tables */
1786 m3mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1787 mfn_to_page(m3mfn)->shadow_flags = 3;
1788 l4e[0] = l4e_from_pfn(mfn_x(m3mfn), __PAGE_HYPERVISOR);
1790 m2mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1791 mfn_to_page(m2mfn)->shadow_flags = 2;
1792 l3e = sh_map_domain_page(m3mfn);
1793 l3e[3] = l3e_from_pfn(mfn_x(m2mfn), _PAGE_PRESENT);
1794 sh_install_xen_entries_in_l2h(v, m2mfn);
1795 sh_unmap_domain_page(l3e);
1798 sh_unmap_domain_page(l4e);
1800 #endif /* SHADOW_PAGING_LEVELS < 4 */
1801 return m4mfn;
1804 #elif CONFIG_PAGING_LEVELS == 3
1807 mfn_t m3mfn, m2mfn;
1808 l3_pgentry_t *l3e;
1809 l2_pgentry_t *l2e;
1810 int i;
1812 m3mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1813 /* Remember the level of this table */
1814 mfn_to_page(m3mfn)->shadow_flags = 3;
1816 // Install a monitor l2 table in slot 3 of the l3 table.
1817 // This is used for all Xen entries, including linear maps
1818 m2mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1819 mfn_to_page(m2mfn)->shadow_flags = 2;
1820 l3e = sh_map_domain_page(m3mfn);
1821 l3e[3] = l3e_from_pfn(mfn_x(m2mfn), _PAGE_PRESENT);
1822 sh_install_xen_entries_in_l2h(v, m2mfn);
1823 /* Install the monitor's own linear map */
1824 l2e = sh_map_domain_page(m2mfn);
1825 for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
1826 l2e[l2_table_offset(LINEAR_PT_VIRT_START) + i] =
1827 (l3e_get_flags(l3e[i]) & _PAGE_PRESENT)
1828 ? l2e_from_pfn(l3e_get_pfn(l3e[i]), __PAGE_HYPERVISOR)
1829 : l2e_empty();
1830 sh_unmap_domain_page(l2e);
1831 sh_unmap_domain_page(l3e);
1833 SHADOW_PRINTK("new monitor table: %#lx\n", mfn_x(m3mfn));
1834 return m3mfn;
1837 #elif CONFIG_PAGING_LEVELS == 2
1840 mfn_t m2mfn;
1841 m2mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1842 sh_install_xen_entries_in_l2(v, m2mfn, m2mfn);
1843 /* Remember the level of this table */
1844 mfn_to_page(m2mfn)->shadow_flags = 2;
1845 return m2mfn;
1848 #else
1849 #error this should not happen
1850 #endif /* CONFIG_PAGING_LEVELS */
1852 #endif /* SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS */
1854 /**************************************************************************/
1855 /* These functions also take a virtual address and return the level-N
1856 * shadow table mfn and entry, but they create the shadow pagetables if
1857 * they are needed. The "demand" argument is non-zero when handling
1858 * a demand fault (so we know what to do about accessed bits &c).
1859 * If the necessary tables are not present in the guest, they return NULL. */
1861 /* N.B. The use of GUEST_PAGING_LEVELS here is correct. If the shadow has
1862 * more levels than the guest, the upper levels are always fixed and do not
1863 * reflect any information from the guest, so we do not use these functions
1864 * to access them. */
1866 #if GUEST_PAGING_LEVELS >= 4
1867 static shadow_l4e_t * shadow_get_and_create_l4e(struct vcpu *v,
1868 walk_t *gw,
1869 mfn_t *sl4mfn)
1871 /* There is always a shadow of the top level table. Get it. */
1872 *sl4mfn = pagetable_get_mfn(v->arch.shadow_table[0]);
1873 /* Reading the top level table is always valid. */
1874 return sh_linear_l4_table(v) + shadow_l4_linear_offset(gw->va);
1877 static shadow_l3e_t * shadow_get_and_create_l3e(struct vcpu *v,
1878 walk_t *gw,
1879 mfn_t *sl3mfn,
1880 fetch_type_t ft)
1882 mfn_t sl4mfn;
1883 shadow_l4e_t *sl4e;
1884 if ( !mfn_valid(gw->l3mfn) ) return NULL; /* No guest page. */
1885 /* Get the l4e */
1886 sl4e = shadow_get_and_create_l4e(v, gw, &sl4mfn);
1887 ASSERT(sl4e != NULL);
1888 if ( shadow_l4e_get_flags(*sl4e) & _PAGE_PRESENT )
1890 *sl3mfn = shadow_l4e_get_mfn(*sl4e);
1891 ASSERT(mfn_valid(*sl3mfn));
1893 else
1895 int r;
1896 shadow_l4e_t new_sl4e;
1897 /* No l3 shadow installed: find and install it. */
1898 *sl3mfn = get_shadow_status(v, gw->l3mfn, SH_type_l3_shadow);
1899 if ( !mfn_valid(*sl3mfn) )
1901 /* No l3 shadow of this page exists at all: make one. */
1902 *sl3mfn = sh_make_shadow(v, gw->l3mfn, SH_type_l3_shadow);
1904 /* Install the new sl3 table in the sl4e */
1905 l4e_propagate_from_guest(v, gw->l4e, *sl3mfn, &new_sl4e, ft);
1906 r = shadow_set_l4e(v, sl4e, new_sl4e, sl4mfn);
1907 ASSERT((r & SHADOW_SET_FLUSH) == 0);
1908 if ( r & SHADOW_SET_ERROR )
1909 return NULL;
1911 /* Now follow it down a level. Guaranteed to succeed. */
1912 return sh_linear_l3_table(v) + shadow_l3_linear_offset(gw->va);
1914 #endif /* GUEST_PAGING_LEVELS >= 4 */
1917 static shadow_l2e_t * shadow_get_and_create_l2e(struct vcpu *v,
1918 walk_t *gw,
1919 mfn_t *sl2mfn,
1920 fetch_type_t ft)
1922 #if GUEST_PAGING_LEVELS >= 4 /* 64bit... */
1923 mfn_t sl3mfn = _mfn(INVALID_MFN);
1924 shadow_l3e_t *sl3e;
1925 if ( !mfn_valid(gw->l2mfn) ) return NULL; /* No guest page. */
1926 /* Get the l3e */
1927 sl3e = shadow_get_and_create_l3e(v, gw, &sl3mfn, ft);
1928 if ( sl3e == NULL ) return NULL;
1929 if ( shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT )
1931 *sl2mfn = shadow_l3e_get_mfn(*sl3e);
1932 ASSERT(mfn_valid(*sl2mfn));
1934 else
1936 int r;
1937 shadow_l3e_t new_sl3e;
1938 unsigned int t = SH_type_l2_shadow;
1940 /* Tag compat L2 containing hypervisor (m2p) mappings */
1941 if ( is_pv_32on64_domain(v->domain) &&
1942 guest_l4_table_offset(gw->va) == 0 &&
1943 guest_l3_table_offset(gw->va) == 3 )
1944 t = SH_type_l2h_shadow;
1946 /* No l2 shadow installed: find and install it. */
1947 *sl2mfn = get_shadow_status(v, gw->l2mfn, t);
1948 if ( !mfn_valid(*sl2mfn) )
1950 /* No l2 shadow of this page exists at all: make one. */
1951 *sl2mfn = sh_make_shadow(v, gw->l2mfn, t);
1953 /* Install the new sl2 table in the sl3e */
1954 l3e_propagate_from_guest(v, gw->l3e, *sl2mfn, &new_sl3e, ft);
1955 r = shadow_set_l3e(v, sl3e, new_sl3e, sl3mfn);
1956 ASSERT((r & SHADOW_SET_FLUSH) == 0);
1957 if ( r & SHADOW_SET_ERROR )
1958 return NULL;
1960 /* Now follow it down a level. Guaranteed to succeed. */
1961 return sh_linear_l2_table(v) + shadow_l2_linear_offset(gw->va);
1962 #elif GUEST_PAGING_LEVELS == 3 /* PAE... */
1963 /* We never demand-shadow PAE l3es: they are only created in
1964 * sh_update_cr3(). Check if the relevant sl3e is present. */
1965 shadow_l3e_t *sl3e = ((shadow_l3e_t *)&v->arch.paging.shadow.l3table)
1966 + shadow_l3_linear_offset(gw->va);
1967 if ( !(shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT) )
1968 return NULL;
1969 *sl2mfn = shadow_l3e_get_mfn(*sl3e);
1970 ASSERT(mfn_valid(*sl2mfn));
1971 return sh_linear_l2_table(v) + shadow_l2_linear_offset(gw->va);
1972 #else /* 32bit... */
1973 /* There is always a shadow of the top level table. Get it. */
1974 *sl2mfn = pagetable_get_mfn(v->arch.shadow_table[0]);
1975 /* This next line is important: the guest l2 has a 16k
1976 * shadow, we need to return the right mfn of the four. This
1977 * call will set it for us as a side-effect. */
1978 (void) shadow_l2_index(sl2mfn, guest_l2_table_offset(gw->va));
1979 /* Reading the top level table is always valid. */
1980 return sh_linear_l2_table(v) + shadow_l2_linear_offset(gw->va);
1981 #endif
1985 static shadow_l1e_t * shadow_get_and_create_l1e(struct vcpu *v,
1986 walk_t *gw,
1987 mfn_t *sl1mfn,
1988 fetch_type_t ft)
1990 mfn_t sl2mfn;
1991 shadow_l2e_t *sl2e;
1993 /* Get the l2e */
1994 sl2e = shadow_get_and_create_l2e(v, gw, &sl2mfn, ft);
1995 if ( sl2e == NULL ) return NULL;
1996 /* Install the sl1 in the l2e if it wasn't there or if we need to
1997 * re-do it to fix a PSE dirty bit. */
1998 if ( shadow_l2e_get_flags(*sl2e) & _PAGE_PRESENT
1999 && likely(ft != ft_demand_write
2000 || (shadow_l2e_get_flags(*sl2e) & _PAGE_RW)
2001 || !(guest_l2e_get_flags(gw->l2e) & _PAGE_PSE)) )
2003 *sl1mfn = shadow_l2e_get_mfn(*sl2e);
2004 ASSERT(mfn_valid(*sl1mfn));
2006 else
2008 shadow_l2e_t new_sl2e;
2009 int r, flags = guest_l2e_get_flags(gw->l2e);
2010 /* No l1 shadow installed: find and install it. */
2011 if ( !(flags & _PAGE_PRESENT) )
2012 return NULL; /* No guest page. */
2013 if ( guest_supports_superpages(v) && (flags & _PAGE_PSE) )
2015 /* Splintering a superpage */
2016 gfn_t l2gfn = guest_l2e_get_gfn(gw->l2e);
2017 *sl1mfn = get_fl1_shadow_status(v, l2gfn);
2018 if ( !mfn_valid(*sl1mfn) )
2020 /* No fl1 shadow of this superpage exists at all: make one. */
2021 *sl1mfn = make_fl1_shadow(v, l2gfn);
2024 else
2026 /* Shadowing an actual guest l1 table */
2027 if ( !mfn_valid(gw->l2mfn) ) return NULL; /* No guest page. */
2028 *sl1mfn = get_shadow_status(v, gw->l1mfn, SH_type_l1_shadow);
2029 if ( !mfn_valid(*sl1mfn) )
2031 /* No l1 shadow of this page exists at all: make one. */
2032 *sl1mfn = sh_make_shadow(v, gw->l1mfn, SH_type_l1_shadow);
2035 /* Install the new sl1 table in the sl2e */
2036 l2e_propagate_from_guest(v, gw->l2e, *sl1mfn, &new_sl2e, ft);
2037 r = shadow_set_l2e(v, sl2e, new_sl2e, sl2mfn);
2038 ASSERT((r & SHADOW_SET_FLUSH) == 0);
2039 if ( r & SHADOW_SET_ERROR )
2040 return NULL;
2041 /* This next line is important: in 32-on-PAE and 32-on-64 modes,
2042 * the guest l1 table has an 8k shadow, and we need to return
2043 * the right mfn of the pair. This call will set it for us as a
2044 * side-effect. (In all other cases, it's a no-op and will be
2045 * compiled out.) */
2046 (void) shadow_l1_index(sl1mfn, guest_l1_table_offset(gw->va));
2048 /* Now follow it down a level. Guaranteed to succeed. */
2049 return sh_linear_l1_table(v) + shadow_l1_linear_offset(gw->va);
2054 /**************************************************************************/
2055 /* Destructors for shadow tables:
2056 * Unregister the shadow, decrement refcounts of any entries present in it,
2057 * and release the memory.
2059 * N.B. These destructors do not clear the contents of the shadows.
2060 * This allows us to delay TLB shootdowns until the page is being reused.
2061 * See shadow_alloc() and shadow_free() for how this is handled.
2062 */
2064 #if GUEST_PAGING_LEVELS >= 4
2065 void sh_destroy_l4_shadow(struct vcpu *v, mfn_t smfn)
2067 shadow_l4e_t *sl4e;
2068 u32 t = mfn_to_shadow_page(smfn)->type;
2069 mfn_t gmfn, sl4mfn;
2071 SHADOW_DEBUG(DESTROY_SHADOW,
2072 "%s(%05lx)\n", __func__, mfn_x(smfn));
2073 ASSERT(t == SH_type_l4_shadow);
2075 /* Record that the guest page isn't shadowed any more (in this type) */
2076 gmfn = _mfn(mfn_to_shadow_page(smfn)->backpointer);
2077 delete_shadow_status(v, gmfn, t, smfn);
2078 shadow_demote(v, gmfn, t);
2079 /* Decrement refcounts of all the old entries */
2080 sl4mfn = smfn;
2081 SHADOW_FOREACH_L4E(sl4mfn, sl4e, 0, 0, v->domain, {
2082 if ( shadow_l4e_get_flags(*sl4e) & _PAGE_PRESENT )
2084 sh_put_ref(v, shadow_l4e_get_mfn(*sl4e),
2085 (((paddr_t)mfn_x(sl4mfn)) << PAGE_SHIFT)
2086 | ((unsigned long)sl4e & ~PAGE_MASK));
2088 });
2090 /* Put the memory back in the pool */
2091 shadow_free(v->domain, smfn);
2094 void sh_destroy_l3_shadow(struct vcpu *v, mfn_t smfn)
2096 shadow_l3e_t *sl3e;
2097 u32 t = mfn_to_shadow_page(smfn)->type;
2098 mfn_t gmfn, sl3mfn;
2100 SHADOW_DEBUG(DESTROY_SHADOW,
2101 "%s(%05lx)\n", __func__, mfn_x(smfn));
2102 ASSERT(t == SH_type_l3_shadow);
2104 /* Record that the guest page isn't shadowed any more (in this type) */
2105 gmfn = _mfn(mfn_to_shadow_page(smfn)->backpointer);
2106 delete_shadow_status(v, gmfn, t, smfn);
2107 shadow_demote(v, gmfn, t);
2109 /* Decrement refcounts of all the old entries */
2110 sl3mfn = smfn;
2111 SHADOW_FOREACH_L3E(sl3mfn, sl3e, 0, 0, {
2112 if ( shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT )
2113 sh_put_ref(v, shadow_l3e_get_mfn(*sl3e),
2114 (((paddr_t)mfn_x(sl3mfn)) << PAGE_SHIFT)
2115 | ((unsigned long)sl3e & ~PAGE_MASK));
2116 });
2118 /* Put the memory back in the pool */
2119 shadow_free(v->domain, smfn);
2121 #endif /* GUEST_PAGING_LEVELS >= 4 */
2124 void sh_destroy_l2_shadow(struct vcpu *v, mfn_t smfn)
2126 shadow_l2e_t *sl2e;
2127 u32 t = mfn_to_shadow_page(smfn)->type;
2128 mfn_t gmfn, sl2mfn;
2130 SHADOW_DEBUG(DESTROY_SHADOW,
2131 "%s(%05lx)\n", __func__, mfn_x(smfn));
2133 #if GUEST_PAGING_LEVELS >= 3
2134 ASSERT(t == SH_type_l2_shadow || t == SH_type_l2h_shadow);
2135 #else
2136 ASSERT(t == SH_type_l2_shadow);
2137 #endif
2139 /* Record that the guest page isn't shadowed any more (in this type) */
2140 gmfn = _mfn(mfn_to_shadow_page(smfn)->backpointer);
2141 delete_shadow_status(v, gmfn, t, smfn);
2142 shadow_demote(v, gmfn, t);
2144 /* Decrement refcounts of all the old entries */
2145 sl2mfn = smfn;
2146 SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, 0, v->domain, {
2147 if ( shadow_l2e_get_flags(*sl2e) & _PAGE_PRESENT )
2148 sh_put_ref(v, shadow_l2e_get_mfn(*sl2e),
2149 (((paddr_t)mfn_x(sl2mfn)) << PAGE_SHIFT)
2150 | ((unsigned long)sl2e & ~PAGE_MASK));
2151 });
2153 /* Put the memory back in the pool */
2154 shadow_free(v->domain, smfn);
2157 void sh_destroy_l1_shadow(struct vcpu *v, mfn_t smfn)
2159 struct domain *d = v->domain;
2160 shadow_l1e_t *sl1e;
2161 u32 t = mfn_to_shadow_page(smfn)->type;
2163 SHADOW_DEBUG(DESTROY_SHADOW,
2164 "%s(%05lx)\n", __func__, mfn_x(smfn));
2165 ASSERT(t == SH_type_l1_shadow || t == SH_type_fl1_shadow);
2167 /* Record that the guest page isn't shadowed any more (in this type) */
2168 if ( t == SH_type_fl1_shadow )
2170 gfn_t gfn = _gfn(mfn_to_shadow_page(smfn)->backpointer);
2171 delete_fl1_shadow_status(v, gfn, smfn);
2173 else
2175 mfn_t gmfn = _mfn(mfn_to_shadow_page(smfn)->backpointer);
2176 delete_shadow_status(v, gmfn, t, smfn);
2177 shadow_demote(v, gmfn, t);
2180 if ( shadow_mode_refcounts(d) )
2182 /* Decrement refcounts of all the old entries */
2183 mfn_t sl1mfn = smfn;
2184 SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, 0, {
2185 if ( (shadow_l1e_get_flags(*sl1e) & _PAGE_PRESENT)
2186 && !sh_l1e_is_magic(*sl1e) )
2187 shadow_put_page_from_l1e(*sl1e, d);
2188 });
2191 /* Put the memory back in the pool */
2192 shadow_free(v->domain, smfn);
2195 #if SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS
2196 void sh_destroy_monitor_table(struct vcpu *v, mfn_t mmfn)
2198 struct domain *d = v->domain;
2199 ASSERT(mfn_to_shadow_page(mmfn)->type == SH_type_monitor_table);
2201 #if (CONFIG_PAGING_LEVELS == 4) && (SHADOW_PAGING_LEVELS != 4)
2203 mfn_t m3mfn;
2204 l4_pgentry_t *l4e = sh_map_domain_page(mmfn);
2205 l3_pgentry_t *l3e;
2206 int linear_slot = shadow_l4_table_offset(SH_LINEAR_PT_VIRT_START);
2208 /* Need to destroy the l3 and l2 monitor pages used
2209 * for the linear map */
2210 ASSERT(l4e_get_flags(l4e[linear_slot]) & _PAGE_PRESENT);
2211 m3mfn = _mfn(l4e_get_pfn(l4e[linear_slot]));
2212 l3e = sh_map_domain_page(m3mfn);
2213 ASSERT(l3e_get_flags(l3e[0]) & _PAGE_PRESENT);
2214 shadow_free(d, _mfn(l3e_get_pfn(l3e[0])));
2215 sh_unmap_domain_page(l3e);
2216 shadow_free(d, m3mfn);
2218 if ( is_pv_32on64_vcpu(v) )
2220 /* Need to destroy the l3 and l2 monitor pages that map the
2221 * Xen VAs at 3GB-4GB */
2222 ASSERT(l4e_get_flags(l4e[0]) & _PAGE_PRESENT);
2223 m3mfn = _mfn(l4e_get_pfn(l4e[0]));
2224 l3e = sh_map_domain_page(m3mfn);
2225 ASSERT(l3e_get_flags(l3e[3]) & _PAGE_PRESENT);
2226 shadow_free(d, _mfn(l3e_get_pfn(l3e[3])));
2227 sh_unmap_domain_page(l3e);
2228 shadow_free(d, m3mfn);
2230 sh_unmap_domain_page(l4e);
2232 #elif CONFIG_PAGING_LEVELS == 3
2233 /* Need to destroy the l2 monitor page in slot 4 too */
2235 l3_pgentry_t *l3e = sh_map_domain_page(mmfn);
2236 ASSERT(l3e_get_flags(l3e[3]) & _PAGE_PRESENT);
2237 shadow_free(d, _mfn(l3e_get_pfn(l3e[3])));
2238 sh_unmap_domain_page(l3e);
2240 #endif
2242 /* Put the memory back in the pool */
2243 shadow_free(d, mmfn);
2245 #endif
2247 /**************************************************************************/
2248 /* Functions to destroy non-Xen mappings in a pagetable hierarchy.
2249 * These are called from common code when we are running out of shadow
2250 * memory, and unpinning all the top-level shadows hasn't worked.
2252 * This implementation is pretty crude and slow, but we hope that it won't
2253 * be called very often. */
2255 #if GUEST_PAGING_LEVELS == 2
2257 void sh_unhook_32b_mappings(struct vcpu *v, mfn_t sl2mfn)
2259 shadow_l2e_t *sl2e;
2260 SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, 0, v->domain, {
2261 (void) shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn);
2262 });
2265 #elif GUEST_PAGING_LEVELS == 3
2267 void sh_unhook_pae_mappings(struct vcpu *v, mfn_t sl2mfn)
2268 /* Walk a PAE l2 shadow, unhooking entries from all the subshadows */
2270 shadow_l2e_t *sl2e;
2271 SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, 0, v->domain, {
2272 (void) shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn);
2273 });
2276 #elif GUEST_PAGING_LEVELS == 4
2278 void sh_unhook_64b_mappings(struct vcpu *v, mfn_t sl4mfn)
2280 shadow_l4e_t *sl4e;
2281 SHADOW_FOREACH_L4E(sl4mfn, sl4e, 0, 0, v->domain, {
2282 (void) shadow_set_l4e(v, sl4e, shadow_l4e_empty(), sl4mfn);
2283 });
2286 #endif
2288 /**************************************************************************/
2289 /* Internal translation functions.
2290 * These functions require a pointer to the shadow entry that will be updated.
2291 */
2293 /* These functions take a new guest entry, translate it to shadow and write
2294 * the shadow entry.
2296 * They return the same bitmaps as the shadow_set_lXe() functions.
2297 */
2299 #if GUEST_PAGING_LEVELS >= 4
2300 static int validate_gl4e(struct vcpu *v, void *new_ge, mfn_t sl4mfn, void *se)
2302 shadow_l4e_t new_sl4e;
2303 guest_l4e_t new_gl4e = *(guest_l4e_t *)new_ge;
2304 shadow_l4e_t *sl4p = se;
2305 mfn_t sl3mfn = _mfn(INVALID_MFN);
2306 struct domain *d = v->domain;
2307 p2m_type_t p2mt;
2308 int result = 0;
2310 perfc_incr(shadow_validate_gl4e_calls);
2312 if ( guest_l4e_get_flags(new_gl4e) & _PAGE_PRESENT )
2314 gfn_t gl3gfn = guest_l4e_get_gfn(new_gl4e);
2315 mfn_t gl3mfn = gfn_to_mfn(d, gl3gfn, &p2mt);
2316 if ( p2m_is_ram(p2mt) )
2317 sl3mfn = get_shadow_status(v, gl3mfn, SH_type_l3_shadow);
2318 else
2319 result |= SHADOW_SET_ERROR;
2321 l4e_propagate_from_guest(v, new_gl4e, sl3mfn, &new_sl4e, ft_prefetch);
2323 // check for updates to xen reserved slots
2324 if ( !shadow_mode_external(d) )
2326 int shadow_index = (((unsigned long)sl4p & ~PAGE_MASK) /
2327 sizeof(shadow_l4e_t));
2328 int reserved_xen_slot = !is_guest_l4_slot(d, shadow_index);
2330 if ( unlikely(reserved_xen_slot) )
2332 // attempt by the guest to write to a xen reserved slot
2333 //
2334 SHADOW_PRINTK("%s out-of-range update "
2335 "sl4mfn=%05lx index=0x%x val=%" SH_PRI_pte "\n",
2336 __func__, mfn_x(sl4mfn), shadow_index, new_sl4e.l4);
2337 if ( shadow_l4e_get_flags(new_sl4e) & _PAGE_PRESENT )
2339 SHADOW_ERROR("out-of-range l4e update\n");
2340 result |= SHADOW_SET_ERROR;
2343 // do not call shadow_set_l4e...
2344 return result;
2348 result |= shadow_set_l4e(v, sl4p, new_sl4e, sl4mfn);
2349 return result;
2353 static int validate_gl3e(struct vcpu *v, void *new_ge, mfn_t sl3mfn, void *se)
2355 shadow_l3e_t new_sl3e;
2356 guest_l3e_t new_gl3e = *(guest_l3e_t *)new_ge;
2357 shadow_l3e_t *sl3p = se;
2358 mfn_t sl2mfn = _mfn(INVALID_MFN);
2359 p2m_type_t p2mt;
2360 int result = 0;
2362 perfc_incr(shadow_validate_gl3e_calls);
2364 if ( guest_l3e_get_flags(new_gl3e) & _PAGE_PRESENT )
2366 gfn_t gl2gfn = guest_l3e_get_gfn(new_gl3e);
2367 mfn_t gl2mfn = gfn_to_mfn(v->domain, gl2gfn, &p2mt);
2368 if ( p2m_is_ram(p2mt) )
2369 sl2mfn = get_shadow_status(v, gl2mfn, SH_type_l2_shadow);
2370 else
2371 result |= SHADOW_SET_ERROR;
2373 l3e_propagate_from_guest(v, new_gl3e, sl2mfn, &new_sl3e, ft_prefetch);
2374 result |= shadow_set_l3e(v, sl3p, new_sl3e, sl3mfn);
2376 return result;
2378 #endif // GUEST_PAGING_LEVELS >= 4
2380 static int validate_gl2e(struct vcpu *v, void *new_ge, mfn_t sl2mfn, void *se)
2382 shadow_l2e_t new_sl2e;
2383 guest_l2e_t new_gl2e = *(guest_l2e_t *)new_ge;
2384 shadow_l2e_t *sl2p = se;
2385 mfn_t sl1mfn = _mfn(INVALID_MFN);
2386 p2m_type_t p2mt;
2387 int result = 0;
2389 perfc_incr(shadow_validate_gl2e_calls);
2391 if ( guest_l2e_get_flags(new_gl2e) & _PAGE_PRESENT )
2393 gfn_t gl1gfn = guest_l2e_get_gfn(new_gl2e);
2394 if ( guest_supports_superpages(v) &&
2395 (guest_l2e_get_flags(new_gl2e) & _PAGE_PSE) )
2397 // superpage -- need to look up the shadow L1 which holds the
2398 // splitters...
2399 sl1mfn = get_fl1_shadow_status(v, gl1gfn);
2400 #if 0
2401 // XXX - it's possible that we want to do some kind of prefetch
2402 // for superpage fl1's here, but this is *not* on the demand path,
2403 // so we'll hold off trying that for now...
2404 //
2405 if ( !mfn_valid(sl1mfn) )
2406 sl1mfn = make_fl1_shadow(v, gl1gfn);
2407 #endif
2409 else
2411 mfn_t gl1mfn = gfn_to_mfn(v->domain, gl1gfn, &p2mt);
2412 if ( p2m_is_ram(p2mt) )
2413 sl1mfn = get_shadow_status(v, gl1mfn, SH_type_l1_shadow);
2414 else
2415 result |= SHADOW_SET_ERROR;
2418 l2e_propagate_from_guest(v, new_gl2e, sl1mfn, &new_sl2e, ft_prefetch);
2420 // check for updates to xen reserved slots in PV guests...
2421 // XXX -- need to revisit this for PV 3-on-4 guests.
2422 //
2423 #if SHADOW_PAGING_LEVELS < 4
2424 #if CONFIG_PAGING_LEVELS == SHADOW_PAGING_LEVELS
2425 if ( !shadow_mode_external(v->domain) )
2427 int shadow_index = (((unsigned long)sl2p & ~PAGE_MASK) /
2428 sizeof(shadow_l2e_t));
2429 int reserved_xen_slot;
2431 #if SHADOW_PAGING_LEVELS == 3
2432 reserved_xen_slot =
2433 ((mfn_to_shadow_page(sl2mfn)->type == SH_type_l2h_pae_shadow) &&
2434 (shadow_index
2435 >= (L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1))));
2436 #else /* SHADOW_PAGING_LEVELS == 2 */
2437 reserved_xen_slot = (shadow_index >= L2_PAGETABLE_FIRST_XEN_SLOT);
2438 #endif
2440 if ( unlikely(reserved_xen_slot) )
2442 // attempt by the guest to write to a xen reserved slot
2443 //
2444 SHADOW_PRINTK("%s out-of-range update "
2445 "sl2mfn=%05lx index=0x%x val=%" SH_PRI_pte "\n",
2446 __func__, mfn_x(sl2mfn), shadow_index, new_sl2e.l2);
2447 if ( shadow_l2e_get_flags(new_sl2e) & _PAGE_PRESENT )
2449 SHADOW_ERROR("out-of-range l2e update\n");
2450 result |= SHADOW_SET_ERROR;
2453 // do not call shadow_set_l2e...
2454 return result;
2457 #endif /* CONFIG_PAGING_LEVELS == SHADOW_PAGING_LEVELS */
2458 #endif /* SHADOW_PAGING_LEVELS < 4 */
2460 result |= shadow_set_l2e(v, sl2p, new_sl2e, sl2mfn);
2462 return result;
2465 static int validate_gl1e(struct vcpu *v, void *new_ge, mfn_t sl1mfn, void *se)
2467 shadow_l1e_t new_sl1e;
2468 guest_l1e_t new_gl1e = *(guest_l1e_t *)new_ge;
2469 shadow_l1e_t *sl1p = se;
2470 gfn_t gfn;
2471 mfn_t gmfn;
2472 p2m_type_t p2mt;
2473 int result = 0;
2475 perfc_incr(shadow_validate_gl1e_calls);
2477 gfn = guest_l1e_get_gfn(new_gl1e);
2478 gmfn = gfn_to_mfn(v->domain, gfn, &p2mt);
2480 l1e_propagate_from_guest(v, new_gl1e, gmfn, &new_sl1e, ft_prefetch, p2mt);
2482 result |= shadow_set_l1e(v, sl1p, new_sl1e, sl1mfn);
2483 return result;
2487 /**************************************************************************/
2488 /* Functions which translate and install the shadows of arbitrary guest
2489 * entries that we have just seen the guest write. */
2492 static inline int
2493 sh_map_and_validate(struct vcpu *v, mfn_t gmfn,
2494 void *new_gp, u32 size, u32 sh_type,
2495 u32 (*shadow_index)(mfn_t *smfn, u32 idx),
2496 int (*validate_ge)(struct vcpu *v, void *ge,
2497 mfn_t smfn, void *se))
2498 /* Generic function for mapping and validating. */
2500 mfn_t smfn, smfn2, map_mfn;
2501 shadow_l1e_t *sl1p;
2502 u32 shadow_idx, guest_idx;
2503 int result = 0;
2505 /* Align address and size to guest entry boundaries */
2506 size += (unsigned long)new_gp & (sizeof (guest_l1e_t) - 1);
2507 new_gp = (void *)((unsigned long)new_gp & ~(sizeof (guest_l1e_t) - 1));
2508 size = (size + sizeof (guest_l1e_t) - 1) & ~(sizeof (guest_l1e_t) - 1);
2509 ASSERT(size + (((unsigned long)new_gp) & ~PAGE_MASK) <= PAGE_SIZE);
2511 /* Map the shadow page */
2512 smfn = get_shadow_status(v, gmfn, sh_type);
2513 ASSERT(mfn_valid(smfn)); /* Otherwise we would not have been called */
2514 guest_idx = guest_index(new_gp);
2515 map_mfn = smfn;
2516 shadow_idx = shadow_index(&map_mfn, guest_idx);
2517 sl1p = map_shadow_page(map_mfn);
2519 /* Validate one entry at a time */
2520 while ( size )
2522 smfn2 = smfn;
2523 guest_idx = guest_index(new_gp);
2524 shadow_idx = shadow_index(&smfn2, guest_idx);
2525 if ( mfn_x(smfn2) != mfn_x(map_mfn) )
2527 /* We have moved to another page of the shadow */
2528 map_mfn = smfn2;
2529 unmap_shadow_page(sl1p);
2530 sl1p = map_shadow_page(map_mfn);
2532 result |= validate_ge(v,
2533 new_gp,
2534 map_mfn,
2535 &sl1p[shadow_idx]);
2536 size -= sizeof(guest_l1e_t);
2537 new_gp += sizeof(guest_l1e_t);
2539 unmap_shadow_page(sl1p);
2540 return result;
2544 int
2545 sh_map_and_validate_gl4e(struct vcpu *v, mfn_t gl4mfn,
2546 void *new_gl4p, u32 size)
2548 #if GUEST_PAGING_LEVELS >= 4
2549 return sh_map_and_validate(v, gl4mfn, new_gl4p, size,
2550 SH_type_l4_shadow,
2551 shadow_l4_index,
2552 validate_gl4e);
2553 #else // ! GUEST_PAGING_LEVELS >= 4
2554 SHADOW_ERROR("called in wrong paging mode!\n");
2555 BUG();
2556 return 0;
2557 #endif
2560 int
2561 sh_map_and_validate_gl3e(struct vcpu *v, mfn_t gl3mfn,
2562 void *new_gl3p, u32 size)
2564 #if GUEST_PAGING_LEVELS >= 4
2565 return sh_map_and_validate(v, gl3mfn, new_gl3p, size,
2566 SH_type_l3_shadow,
2567 shadow_l3_index,
2568 validate_gl3e);
2569 #else // ! GUEST_PAGING_LEVELS >= 4
2570 SHADOW_ERROR("called in wrong paging mode!\n");
2571 BUG();
2572 return 0;
2573 #endif
2576 int
2577 sh_map_and_validate_gl2e(struct vcpu *v, mfn_t gl2mfn,
2578 void *new_gl2p, u32 size)
2580 return sh_map_and_validate(v, gl2mfn, new_gl2p, size,
2581 SH_type_l2_shadow,
2582 shadow_l2_index,
2583 validate_gl2e);
2586 int
2587 sh_map_and_validate_gl2he(struct vcpu *v, mfn_t gl2mfn,
2588 void *new_gl2p, u32 size)
2590 #if GUEST_PAGING_LEVELS >= 3
2591 return sh_map_and_validate(v, gl2mfn, new_gl2p, size,
2592 SH_type_l2h_shadow,
2593 shadow_l2_index,
2594 validate_gl2e);
2595 #else /* Non-PAE guests don't have different kinds of l2 table */
2596 SHADOW_ERROR("called in wrong paging mode!\n");
2597 BUG();
2598 return 0;
2599 #endif
2602 int
2603 sh_map_and_validate_gl1e(struct vcpu *v, mfn_t gl1mfn,
2604 void *new_gl1p, u32 size)
2606 return sh_map_and_validate(v, gl1mfn, new_gl1p, size,
2607 SH_type_l1_shadow,
2608 shadow_l1_index,
2609 validate_gl1e);
2613 /**************************************************************************/
2614 /* Optimization: If we see two emulated writes of zeros to the same
2615 * page-table without another kind of page fault in between, we guess
2616 * that this is a batch of changes (for process destruction) and
2617 * unshadow the page so we don't take a pagefault on every entry. This
2618 * should also make finding writeable mappings of pagetables much
2619 * easier. */
2621 /* Look to see if this is the second emulated write in a row to this
2622 * page, and unshadow if it is */
2623 static inline void check_for_early_unshadow(struct vcpu *v, mfn_t gmfn)
2625 #if SHADOW_OPTIMIZATIONS & SHOPT_EARLY_UNSHADOW
2626 if ( v->arch.paging.shadow.last_emulated_mfn == mfn_x(gmfn) &&
2627 sh_mfn_is_a_page_table(gmfn) )
2629 perfc_incr(shadow_early_unshadow);
2630 sh_remove_shadows(v, gmfn, 1, 0 /* Fast, can fail to unshadow */ );
2632 v->arch.paging.shadow.last_emulated_mfn = mfn_x(gmfn);
2633 #endif
2636 /* Stop counting towards early unshadows, as we've seen a real page fault */
2637 static inline void reset_early_unshadow(struct vcpu *v)
2639 #if SHADOW_OPTIMIZATIONS & SHOPT_EARLY_UNSHADOW
2640 v->arch.paging.shadow.last_emulated_mfn = INVALID_MFN;
2641 #endif
2646 /**************************************************************************/
2647 /* Optimization: Prefetch multiple L1 entries. This is called after we have
2648 * demand-faulted a shadow l1e in the fault handler, to see if it's
2649 * worth fetching some more.
2650 */
2652 #if SHADOW_OPTIMIZATIONS & SHOPT_PREFETCH
2654 /* XXX magic number */
2655 #define PREFETCH_DISTANCE 32
2657 static void sh_prefetch(struct vcpu *v, walk_t *gw,
2658 shadow_l1e_t *ptr_sl1e, mfn_t sl1mfn)
2660 int i, dist;
2661 gfn_t gfn;
2662 mfn_t gmfn;
2663 guest_l1e_t *gl1p = NULL, gl1e;
2664 shadow_l1e_t sl1e;
2665 u32 gflags;
2666 p2m_type_t p2mt;
2668 /* Prefetch no further than the end of the _shadow_ l1 MFN */
2669 dist = (PAGE_SIZE - ((unsigned long)ptr_sl1e & ~PAGE_MASK)) / sizeof sl1e;
2670 /* And no more than a maximum fetches-per-fault */
2671 if ( dist > PREFETCH_DISTANCE )
2672 dist = PREFETCH_DISTANCE;
2674 if ( mfn_valid(gw->l1mfn) )
2676 /* Normal guest page; grab the next guest entry */
2677 gl1p = sh_map_domain_page(gw->l1mfn);
2678 gl1p += guest_l1_table_offset(gw->va);
2681 for ( i = 1; i < dist ; i++ )
2683 /* No point in prefetching if there's already a shadow */
2684 if ( ptr_sl1e[i].l1 != 0 )
2685 break;
2687 if ( mfn_valid(gw->l1mfn) )
2689 /* Normal guest page; grab the next guest entry */
2690 gl1e = gl1p[i];
2691 /* Not worth continuing if we hit an entry that will need another
2692 * fault for A/D-bit propagation anyway */
2693 gflags = guest_l1e_get_flags(gl1e);
2694 if ( (gflags & _PAGE_PRESENT)
2695 && (!(gflags & _PAGE_ACCESSED)
2696 || ((gflags & _PAGE_RW) && !(gflags & _PAGE_DIRTY))) )
2697 break;
2699 else
2701 /* Fragmented superpage, unless we've been called wrongly */
2702 ASSERT(guest_l2e_get_flags(gw->l2e) & _PAGE_PSE);
2703 /* Increment the l1e's GFN by the right number of guest pages */
2704 gl1e = guest_l1e_from_gfn(
2705 _gfn(gfn_x(guest_l1e_get_gfn(gw->l1e)) + i),
2706 guest_l1e_get_flags(gw->l1e));
2709 /* Look at the gfn that the l1e is pointing at */
2710 gfn = guest_l1e_get_gfn(gl1e);
2711 gmfn = gfn_to_mfn(v->domain, gfn, &p2mt);
2713 /* Propagate the entry. */
2714 l1e_propagate_from_guest(v, gl1e, gmfn, &sl1e, ft_prefetch, p2mt);
2715 (void) shadow_set_l1e(v, ptr_sl1e + i, sl1e, sl1mfn);
2717 if ( gl1p != NULL )
2718 sh_unmap_domain_page(gl1p);
2721 #endif /* SHADOW_OPTIMIZATIONS & SHOPT_PREFETCH */
2724 /**************************************************************************/
2725 /* Entry points into the shadow code */
2727 /* Called from pagefault handler in Xen, and from the HVM trap handlers
2728 * for pagefaults. Returns 1 if this fault was an artefact of the
2729 * shadow code (and the guest should retry) or 0 if it is not (and the
2730 * fault should be handled elsewhere or passed to the guest). */
2732 static int sh_page_fault(struct vcpu *v,
2733 unsigned long va,
2734 struct cpu_user_regs *regs)
2736 struct domain *d = v->domain;
2737 walk_t gw;
2738 gfn_t gfn;
2739 mfn_t gmfn, sl1mfn=_mfn(0);
2740 shadow_l1e_t sl1e, *ptr_sl1e;
2741 paddr_t gpa;
2742 struct sh_emulate_ctxt emul_ctxt;
2743 struct x86_emulate_ops *emul_ops;
2744 int r;
2745 fetch_type_t ft = 0;
2746 p2m_type_t p2mt;
2748 SHADOW_PRINTK("d:v=%u:%u va=%#lx err=%u, rip=%lx\n",
2749 v->domain->domain_id, v->vcpu_id, va, regs->error_code,
2750 regs->rip);
2752 perfc_incr(shadow_fault);
2753 //
2754 // XXX: Need to think about eventually mapping superpages directly in the
2755 // shadow (when possible), as opposed to splintering them into a
2756 // bunch of 4K maps.
2757 //
2759 #if (SHADOW_OPTIMIZATIONS & SHOPT_FAST_FAULT_PATH) && SHADOW_PAGING_LEVELS > 2
2760 if ( (regs->error_code & PFEC_reserved_bit) )
2762 /* The only reasons for reserved bits to be set in shadow entries
2763 * are the two "magic" shadow_l1e entries. */
2764 if ( likely((__copy_from_user(&sl1e,
2765 (sh_linear_l1_table(v)
2766 + shadow_l1_linear_offset(va)),
2767 sizeof(sl1e)) == 0)
2768 && sh_l1e_is_magic(sl1e)) )
2770 if ( sh_l1e_is_gnp(sl1e) )
2772 /* Not-present in a guest PT: pass to the guest as
2773 * a not-present fault (by flipping two bits). */
2774 ASSERT(regs->error_code & PFEC_page_present);
2775 regs->error_code ^= (PFEC_reserved_bit|PFEC_page_present);
2776 reset_early_unshadow(v);
2777 perfc_incr(shadow_fault_fast_gnp);
2778 SHADOW_PRINTK("fast path not-present\n");
2779 return 0;
2781 else
2783 /* Magic MMIO marker: extract gfn for MMIO address */
2784 ASSERT(sh_l1e_is_mmio(sl1e));
2785 gpa = (((paddr_t)(gfn_x(sh_l1e_mmio_get_gfn(sl1e))))
2786 << PAGE_SHIFT)
2787 | (va & ~PAGE_MASK);
2789 perfc_incr(shadow_fault_fast_mmio);
2790 SHADOW_PRINTK("fast path mmio %#"PRIpaddr"\n", gpa);
2791 reset_early_unshadow(v);
2792 handle_mmio(gpa);
2793 return EXCRET_fault_fixed;
2795 else
2797 /* This should be exceptionally rare: another vcpu has fixed
2798 * the tables between the fault and our reading the l1e.
2799 * Retry and let the hardware give us the right fault next time. */
2800 perfc_incr(shadow_fault_fast_fail);
2801 SHADOW_PRINTK("fast path false alarm!\n");
2802 return EXCRET_fault_fixed;
2805 #endif /* SHOPT_FAST_FAULT_PATH */
2807 /* Detect if this page fault happened while we were already in Xen
2808 * doing a shadow operation. If that happens, the only thing we can
2809 * do is let Xen's normal fault handlers try to fix it. In any case,
2810 * a diagnostic trace of the fault will be more useful than
2811 * a BUG() when we try to take the lock again. */
2812 if ( unlikely(shadow_locked_by_me(d)) )
2814 SHADOW_ERROR("Recursive shadow fault: lock was taken by %s\n",
2815 d->arch.paging.shadow.locker_function);
2816 return 0;
2819 shadow_lock(d);
2821 shadow_audit_tables(v);
2823 if ( guest_walk_tables(v, va, &gw, regs->error_code, 1) != 0 )
2825 perfc_incr(shadow_fault_bail_real_fault);
2826 goto not_a_shadow_fault;
2829 /* It's possible that the guest has put pagetables in memory that it has
2830 * already used for some special purpose (ioreq pages, or granted pages).
2831 * If that happens we'll have killed the guest already but it's still not
2832 * safe to propagate entries out of the guest PT so get out now. */
2833 if ( unlikely(d->is_shutting_down) )
2835 SHADOW_PRINTK("guest is shutting down\n");
2836 shadow_unlock(d);
2837 return 0;
2840 sh_audit_gw(v, &gw);
2842 /* What kind of access are we dealing with? */
2843 ft = ((regs->error_code & PFEC_write_access)
2844 ? ft_demand_write : ft_demand_read);
2846 /* What mfn is the guest trying to access? */
2847 gfn = guest_l1e_get_gfn(gw.l1e);
2848 gmfn = gfn_to_mfn(d, gfn, &p2mt);
2850 if ( shadow_mode_refcounts(d) &&
2851 (!p2m_is_valid(p2mt) || (!p2m_is_mmio(p2mt) && !mfn_valid(gmfn))) )
2853 perfc_incr(shadow_fault_bail_bad_gfn);
2854 SHADOW_PRINTK("BAD gfn=%"SH_PRI_gfn" gmfn=%"PRI_mfn"\n",
2855 gfn_x(gfn), mfn_x(gmfn));
2856 goto not_a_shadow_fault;
2859 #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
2860 /* Remember this successful VA->GFN translation for later. */
2861 vtlb_insert(v, va >> PAGE_SHIFT, gfn_x(gfn),
2862 regs->error_code | PFEC_page_present);
2863 #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */
2865 /* Make sure there is enough free shadow memory to build a chain of
2866 * shadow tables. (We never allocate a top-level shadow on this path,
2867 * only a 32b l1, pae l1, or 64b l3+2+1. Note that while
2868 * SH_type_l1_shadow isn't correct in the latter case, all page
2869 * tables are the same size there.) */
2870 shadow_prealloc(d,
2871 SH_type_l1_shadow,
2872 GUEST_PAGING_LEVELS < 4 ? 1 : GUEST_PAGING_LEVELS - 1);
2874 /* Acquire the shadow. This must happen before we figure out the rights
2875 * for the shadow entry, since we might promote a page here. */
2876 ptr_sl1e = shadow_get_and_create_l1e(v, &gw, &sl1mfn, ft);
2877 if ( unlikely(ptr_sl1e == NULL) )
2879 /* Couldn't get the sl1e! Since we know the guest entries
2880 * are OK, this can only have been caused by a failed
2881 * shadow_set_l*e(), which will have crashed the guest.
2882 * Get out of the fault handler immediately. */
2883 ASSERT(d->is_shutting_down);
2884 shadow_unlock(d);
2885 return 0;
2888 /* Calculate the shadow entry and write it */
2889 l1e_propagate_from_guest(v, gw.l1e, gmfn, &sl1e, ft, p2mt);
2890 r = shadow_set_l1e(v, ptr_sl1e, sl1e, sl1mfn);
2892 #if SHADOW_OPTIMIZATIONS & SHOPT_PREFETCH
2893 /* Prefetch some more shadow entries */
2894 sh_prefetch(v, &gw, ptr_sl1e, sl1mfn);
2895 #endif
2897 /* Need to emulate accesses to page tables */
2898 if ( sh_mfn_is_a_page_table(gmfn) )
2900 if ( ft == ft_demand_write )
2902 perfc_incr(shadow_fault_emulate_write);
2903 goto emulate;
2905 else if ( shadow_mode_trap_reads(d) && ft == ft_demand_read )
2907 perfc_incr(shadow_fault_emulate_read);
2908 goto emulate;
2912 /* Need to hand off device-model MMIO and writes to read-only
2913 * memory to the device model */
2914 if ( p2mt == p2m_mmio_dm
2915 || (p2mt == p2m_ram_ro && ft == ft_demand_write) )
2917 gpa = guest_walk_to_gpa(&gw);
2918 goto mmio;
2921 /* In HVM guests, we force CR0.WP always to be set, so that the
2922 * pagetables are always write-protected. If the guest thinks
2923 * CR0.WP is clear, we must emulate faulting supervisor writes to
2924 * allow the guest to write through read-only PTEs. Emulate if the
2925 * fault was a non-user write to a present page. */
2926 if ( is_hvm_domain(d)
2927 && unlikely(!hvm_wp_enabled(v))
2928 && regs->error_code == (PFEC_write_access|PFEC_page_present) )
2930 perfc_incr(shadow_fault_emulate_wp);
2931 goto emulate;
2934 perfc_incr(shadow_fault_fixed);
2935 d->arch.paging.log_dirty.fault_count++;
2936 reset_early_unshadow(v);
2938 done:
2939 sh_audit_gw(v, &gw);
2940 SHADOW_PRINTK("fixed\n");
2941 shadow_audit_tables(v);
2942 shadow_unlock(d);
2943 return EXCRET_fault_fixed;
2945 emulate:
2946 if ( !shadow_mode_refcounts(d) || !guest_mode(regs) )
2947 goto not_a_shadow_fault;
2949 /*
2950 * We do not emulate user writes. Instead we use them as a hint that the
2951 * page is no longer a page table. This behaviour differs from native, but
2952 * it seems very unlikely that any OS grants user access to page tables.
2953 */
2954 if ( (regs->error_code & PFEC_user_mode) )
2956 SHADOW_PRINTK("user-mode fault to PT, unshadowing mfn %#lx\n",
2957 mfn_x(gmfn));
2958 perfc_incr(shadow_fault_emulate_failed);
2959 sh_remove_shadows(v, gmfn, 0 /* thorough */, 1 /* must succeed */);
2960 goto done;
2963 if ( is_hvm_domain(d) )
2965 /*
2966 * If we are in the middle of injecting an exception or interrupt then
2967 * we should not emulate: it is not the instruction at %eip that caused
2968 * the fault. Furthermore it is almost certainly the case the handler
2969 * stack is currently considered to be a page table, so we should
2970 * unshadow the faulting page before exiting.
2971 */
2972 if ( unlikely(hvm_event_pending(v)) )
2974 gdprintk(XENLOG_DEBUG, "write to pagetable during event "
2975 "injection: cr2=%#lx, mfn=%#lx\n",
2976 va, mfn_x(gmfn));
2977 sh_remove_shadows(v, gmfn, 0 /* thorough */, 1 /* must succeed */);
2978 goto done;
2982 SHADOW_PRINTK("emulate: eip=%#lx esp=%#lx\n",
2983 (unsigned long)regs->eip, (unsigned long)regs->esp);
2985 /*
2986 * We don't need to hold the lock for the whole emulation; we will
2987 * take it again when we write to the pagetables.
2988 */
2989 sh_audit_gw(v, &gw);
2990 shadow_audit_tables(v);
2991 shadow_unlock(d);
2993 emul_ops = shadow_init_emulation(&emul_ctxt, regs);
2995 r = x86_emulate(&emul_ctxt.ctxt, emul_ops);
2997 /*
2998 * NB. We do not unshadow on X86EMUL_EXCEPTION. It's not clear that it
2999 * would be a good unshadow hint. If we *do* decide to unshadow-on-fault
3000 * then it must be 'failable': we cannot require the unshadow to succeed.
3001 */
3002 if ( r == X86EMUL_UNHANDLEABLE )
3004 SHADOW_PRINTK("emulator failure, unshadowing mfn %#lx\n",
3005 mfn_x(gmfn));
3006 perfc_incr(shadow_fault_emulate_failed);
3007 /* If this is actually a page table, then we have a bug, and need
3008 * to support more operations in the emulator. More likely,
3009 * though, this is a hint that this page should not be shadowed. */
3010 shadow_remove_all_shadows(v, gmfn);
3013 #if GUEST_PAGING_LEVELS == 3 /* PAE guest */
3014 if ( r == X86EMUL_OKAY ) {
3015 int i;
3016 /* Emulate up to four extra instructions in the hope of catching
3017 * the "second half" of a 64-bit pagetable write. */
3018 for ( i = 0 ; i < 4 ; i++ )
3020 shadow_continue_emulation(&emul_ctxt, regs);
3021 v->arch.paging.last_write_was_pt = 0;
3022 r = x86_emulate(&emul_ctxt.ctxt, emul_ops);
3023 if ( r == X86EMUL_OKAY )
3025 if ( v->arch.paging.last_write_was_pt )
3027 perfc_incr(shadow_em_ex_pt);
3028 break; /* Don't emulate past the other half of the write */
3030 else
3031 perfc_incr(shadow_em_ex_non_pt);
3033 else
3035 perfc_incr(shadow_em_ex_fail);
3036 break; /* Don't emulate again if we failed! */
3040 #endif /* PAE guest */
3042 SHADOW_PRINTK("emulated\n");
3043 return EXCRET_fault_fixed;
3045 mmio:
3046 if ( !guest_mode(regs) )
3047 goto not_a_shadow_fault;
3048 perfc_incr(shadow_fault_mmio);
3049 sh_audit_gw(v, &gw);
3050 SHADOW_PRINTK("mmio %#"PRIpaddr"\n", gpa);
3051 shadow_audit_tables(v);
3052 reset_early_unshadow(v);
3053 shadow_unlock(d);
3054 handle_mmio(gpa);
3055 return EXCRET_fault_fixed;
3057 not_a_shadow_fault:
3058 sh_audit_gw(v, &gw);
3059 SHADOW_PRINTK("not a shadow fault\n");
3060 shadow_audit_tables(v);
3061 reset_early_unshadow(v);
3062 shadow_unlock(d);
3063 return 0;
3067 static int
3068 sh_invlpg(struct vcpu *v, unsigned long va)
3069 /* Called when the guest requests an invlpg. Returns 1 if the invlpg
3070 * instruction should be issued on the hardware, or 0 if it's safe not
3071 * to do so. */
3073 shadow_l2e_t sl2e;
3075 perfc_incr(shadow_invlpg);
3077 #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
3078 /* No longer safe to use cached gva->gfn translations */
3079 vtlb_flush(v);
3080 #endif
3082 /* First check that we can safely read the shadow l2e. SMP/PAE linux can
3083 * run as high as 6% of invlpg calls where we haven't shadowed the l2
3084 * yet. */
3085 #if SHADOW_PAGING_LEVELS == 4
3087 shadow_l3e_t sl3e;
3088 if ( !(shadow_l4e_get_flags(
3089 sh_linear_l4_table(v)[shadow_l4_linear_offset(va)])
3090 & _PAGE_PRESENT) )
3091 return 0;
3092 /* This must still be a copy-from-user because we don't have the
3093 * shadow lock, and the higher-level shadows might disappear
3094 * under our feet. */
3095 if ( __copy_from_user(&sl3e, (sh_linear_l3_table(v)
3096 + shadow_l3_linear_offset(va)),
3097 sizeof (sl3e)) != 0 )
3099 perfc_incr(shadow_invlpg_fault);
3100 return 0;
3102 if ( (!shadow_l3e_get_flags(sl3e) & _PAGE_PRESENT) )
3103 return 0;
3105 #elif SHADOW_PAGING_LEVELS == 3
3106 if ( !(l3e_get_flags(v->arch.paging.shadow.l3table[shadow_l3_linear_offset(va)])
3107 & _PAGE_PRESENT) )
3108 // no need to flush anything if there's no SL2...
3109 return 0;
3110 #endif
3112 /* This must still be a copy-from-user because we don't have the shadow
3113 * lock, and the higher-level shadows might disappear under our feet. */
3114 if ( __copy_from_user(&sl2e,
3115 sh_linear_l2_table(v) + shadow_l2_linear_offset(va),
3116 sizeof (sl2e)) != 0 )
3118 perfc_incr(shadow_invlpg_fault);
3119 return 0;
3122 // If there's nothing shadowed for this particular sl2e, then
3123 // there is no need to do an invlpg, either...
3124 //
3125 if ( !(shadow_l2e_get_flags(sl2e) & _PAGE_PRESENT) )
3126 return 0;
3128 // Check to see if the SL2 is a splintered superpage...
3129 // If so, then we'll need to flush the entire TLB (because that's
3130 // easier than invalidating all of the individual 4K pages).
3131 //
3132 if ( mfn_to_shadow_page(shadow_l2e_get_mfn(sl2e))->type
3133 == SH_type_fl1_shadow )
3135 flush_tlb_local();
3136 return 0;
3139 return 1;
3143 static unsigned long
3144 sh_gva_to_gfn(struct vcpu *v, unsigned long va, uint32_t *pfec)
3145 /* Called to translate a guest virtual address to what the *guest*
3146 * pagetables would map it to. */
3148 walk_t gw;
3149 gfn_t gfn;
3151 #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
3152 /* Check the vTLB cache first */
3153 unsigned long vtlb_gfn = vtlb_lookup(v, va, pfec[0]);
3154 if ( VALID_GFN(vtlb_gfn) )
3155 return vtlb_gfn;
3156 #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */
3158 if ( guest_walk_tables(v, va, &gw, pfec[0], 0) != 0 )
3160 if ( !(guest_l1e_get_flags(gw.l1e) & _PAGE_PRESENT) )
3161 pfec[0] &= ~PFEC_page_present;
3162 return INVALID_GFN;
3164 gfn = guest_walk_to_gfn(&gw);
3166 #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
3167 /* Remember this successful VA->GFN translation for later. */
3168 vtlb_insert(v, va >> PAGE_SHIFT, gfn_x(gfn), pfec[0]);
3169 #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */
3171 return gfn_x(gfn);
3175 static inline void
3176 sh_update_linear_entries(struct vcpu *v)
3177 /* Sync up all the linear mappings for this vcpu's pagetables */
3179 struct domain *d = v->domain;
3181 /* Linear pagetables in PV guests
3182 * ------------------------------
3184 * Guest linear pagetables, which map the guest pages, are at
3185 * LINEAR_PT_VIRT_START. Shadow linear pagetables, which map the
3186 * shadows, are at SH_LINEAR_PT_VIRT_START. Most of the time these
3187 * are set up at shadow creation time, but (of course!) the PAE case
3188 * is subtler. Normal linear mappings are made by having an entry
3189 * in the top-level table that points to itself (shadow linear) or
3190 * to the guest top-level table (guest linear). For PAE, to set up
3191 * a linear map requires us to copy the four top-level entries into
3192 * level-2 entries. That means that every time we change a PAE l3e,
3193 * we need to reflect the change into the copy.
3195 * Linear pagetables in HVM guests
3196 * -------------------------------
3198 * For HVM guests, the linear pagetables are installed in the monitor
3199 * tables (since we can't put them in the shadow). Shadow linear
3200 * pagetables, which map the shadows, are at SH_LINEAR_PT_VIRT_START,
3201 * and we use the linear pagetable slot at LINEAR_PT_VIRT_START for
3202 * a linear pagetable of the monitor tables themselves. We have
3203 * the same issue of having to re-copy PAE l3 entries whevever we use
3204 * PAE shadows.
3206 * Because HVM guests run on the same monitor tables regardless of the
3207 * shadow tables in use, the linear mapping of the shadow tables has to
3208 * be updated every time v->arch.shadow_table changes.
3209 */
3211 /* Don't try to update the monitor table if it doesn't exist */
3212 if ( shadow_mode_external(d)
3213 && pagetable_get_pfn(v->arch.monitor_table) == 0 )
3214 return;
3216 #if (CONFIG_PAGING_LEVELS == 4) && (SHADOW_PAGING_LEVELS == 4)
3218 /* For PV, one l4e points at the guest l4, one points at the shadow
3219 * l4. No maintenance required.
3220 * For HVM, just need to update the l4e that points to the shadow l4. */
3222 if ( shadow_mode_external(d) )
3224 /* Use the linear map if we can; otherwise make a new mapping */
3225 if ( v == current )
3227 __linear_l4_table[l4_linear_offset(SH_LINEAR_PT_VIRT_START)] =
3228 l4e_from_pfn(pagetable_get_pfn(v->arch.shadow_table[0]),
3229 __PAGE_HYPERVISOR);
3231 else
3233 l4_pgentry_t *ml4e;
3234 ml4e = sh_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
3235 ml4e[l4_table_offset(SH_LINEAR_PT_VIRT_START)] =
3236 l4e_from_pfn(pagetable_get_pfn(v->arch.shadow_table[0]),
3237 __PAGE_HYPERVISOR);
3238 sh_unmap_domain_page(ml4e);
3242 #elif (CONFIG_PAGING_LEVELS == 4) && (SHADOW_PAGING_LEVELS == 3)
3244 /* PV: XXX
3246 * HVM: To give ourselves a linear map of the shadows, we need to
3247 * extend a PAE shadow to 4 levels. We do this by having a monitor
3248 * l3 in slot 0 of the monitor l4 table, and copying the PAE l3
3249 * entries into it. Then, by having the monitor l4e for shadow
3250 * pagetables also point to the monitor l4, we can use it to access
3251 * the shadows.
3252 */
3254 if ( shadow_mode_external(d) )
3256 /* Install copies of the shadow l3es into the monitor l2 table
3257 * that maps SH_LINEAR_PT_VIRT_START. */
3258 shadow_l3e_t *sl3e;
3259 l2_pgentry_t *ml2e;
3260 int i;
3262 /* Use linear mappings if we can; otherwise make new mappings */
3263 if ( v == current )
3264 ml2e = __linear_l2_table
3265 + l2_linear_offset(SH_LINEAR_PT_VIRT_START);
3266 else
3268 mfn_t l3mfn, l2mfn;
3269 l4_pgentry_t *ml4e;
3270 l3_pgentry_t *ml3e;
3271 int linear_slot = shadow_l4_table_offset(SH_LINEAR_PT_VIRT_START);
3272 ml4e = sh_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
3274 ASSERT(l4e_get_flags(ml4e[linear_slot]) & _PAGE_PRESENT);
3275 l3mfn = _mfn(l4e_get_pfn(ml4e[linear_slot]));
3276 ml3e = sh_map_domain_page(l3mfn);
3277 sh_unmap_domain_page(ml4e);
3279 ASSERT(l3e_get_flags(ml3e[0]) & _PAGE_PRESENT);
3280 l2mfn = _mfn(l3e_get_pfn(ml3e[0]));
3281 ml2e = sh_map_domain_page(l2mfn);
3282 sh_unmap_domain_page(ml3e);
3285 /* Shadow l3 tables are made up by sh_update_cr3 */
3286 sl3e = v->arch.paging.shadow.l3table;
3288 for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
3290 ml2e[i] =
3291 (shadow_l3e_get_flags(sl3e[i]) & _PAGE_PRESENT)
3292 ? l2e_from_pfn(mfn_x(shadow_l3e_get_mfn(sl3e[i])),
3293 __PAGE_HYPERVISOR)
3294 : l2e_empty();
3297 if ( v != current )
3298 sh_unmap_domain_page(ml2e);
3300 else
3301 domain_crash(d); /* XXX */
3303 #elif CONFIG_PAGING_LEVELS == 3
3305 /* PV: need to copy the guest's l3 entries into the guest-linear-map l2
3306 * entries in the shadow, and the shadow's l3 entries into the
3307 * shadow-linear-map l2 entries in the shadow. This is safe to do
3308 * because Xen does not let guests share high-slot l2 tables between l3s,
3309 * so we know we're not treading on anyone's toes.
3311 * HVM: need to copy the shadow's l3 entries into the
3312 * shadow-linear-map l2 entries in the monitor table. This is safe
3313 * because we have one monitor table for each vcpu. The monitor's
3314 * own l3es don't need to be copied because they never change.
3315 * XXX That might change if we start stuffing things into the rest
3316 * of the monitor's virtual address space.
3317 */
3319 l2_pgentry_t *l2e, new_l2e;
3320 shadow_l3e_t *guest_l3e = NULL, *shadow_l3e;
3321 int i;
3322 int unmap_l2e = 0;
3324 #if GUEST_PAGING_LEVELS == 2
3326 /* Shadow l3 tables were built by sh_update_cr3 */
3327 BUG_ON(!shadow_mode_external(d)); /* PV 2-on-3 is unsupported */
3328 shadow_l3e = (shadow_l3e_t *)&v->arch.paging.shadow.l3table;
3330 #else /* GUEST_PAGING_LEVELS == 3 */
3332 shadow_l3e = (shadow_l3e_t *)&v->arch.paging.shadow.l3table;
3333 guest_l3e = (guest_l3e_t *)&v->arch.paging.shadow.gl3e;
3335 #endif /* GUEST_PAGING_LEVELS */
3337 /* Choose where to write the entries, using linear maps if possible */
3338 if ( shadow_mode_external(d) )
3340 if ( v == current )
3342 /* From the monitor tables, it's safe to use linear maps
3343 * to update monitor l2s */
3344 l2e = __linear_l2_table + (3 * L2_PAGETABLE_ENTRIES);
3346 else
3348 /* Map the monitor table's high l2 */
3349 l3_pgentry_t *l3e;
3350 l3e = sh_map_domain_page(
3351 pagetable_get_mfn(v->arch.monitor_table));
3352 ASSERT(l3e_get_flags(l3e[3]) & _PAGE_PRESENT);
3353 l2e = sh_map_domain_page(_mfn(l3e_get_pfn(l3e[3])));
3354 unmap_l2e = 1;
3355 sh_unmap_domain_page(l3e);
3358 else
3360 /* Map the shadow table's high l2 */
3361 ASSERT(shadow_l3e_get_flags(shadow_l3e[3]) & _PAGE_PRESENT);
3362 l2e = sh_map_domain_page(shadow_l3e_get_mfn(shadow_l3e[3]));
3363 unmap_l2e = 1;
3366 /* Write linear mapping of guest (only in PV, and only when
3367 * not translated). */
3368 if ( !shadow_mode_translate(d) )
3370 for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
3372 new_l2e =
3373 ((shadow_l3e_get_flags(guest_l3e[i]) & _PAGE_PRESENT)
3374 ? l2e_from_pfn(mfn_x(shadow_l3e_get_mfn(guest_l3e[i])),
3375 __PAGE_HYPERVISOR)
3376 : l2e_empty());
3377 safe_write_entry(
3378 &l2e[l2_table_offset(LINEAR_PT_VIRT_START) + i],
3379 &new_l2e);
3383 /* Write linear mapping of shadow. */
3384 for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
3386 new_l2e = (shadow_l3e_get_flags(shadow_l3e[i]) & _PAGE_PRESENT)
3387 ? l2e_from_pfn(mfn_x(shadow_l3e_get_mfn(shadow_l3e[i])),
3388 __PAGE_HYPERVISOR)
3389 : l2e_empty();
3390 safe_write_entry(
3391 &l2e[l2_table_offset(SH_LINEAR_PT_VIRT_START) + i],
3392 &new_l2e);
3395 if ( unmap_l2e )
3396 sh_unmap_domain_page(l2e);
3399 #elif CONFIG_PAGING_LEVELS == 2
3401 /* For PV, one l2e points at the guest l2, one points at the shadow
3402 * l2. No maintenance required.
3403 * For HVM, just need to update the l2e that points to the shadow l2. */
3405 if ( shadow_mode_external(d) )
3407 /* Use the linear map if we can; otherwise make a new mapping */
3408 if ( v == current )
3410 __linear_l2_table[l2_linear_offset(SH_LINEAR_PT_VIRT_START)] =
3411 l2e_from_pfn(pagetable_get_pfn(v->arch.shadow_table[0]),
3412 __PAGE_HYPERVISOR);
3414 else
3416 l2_pgentry_t *ml2e;
3417 ml2e = sh_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
3418 ml2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
3419 l2e_from_pfn(pagetable_get_pfn(v->arch.shadow_table[0]),
3420 __PAGE_HYPERVISOR);
3421 sh_unmap_domain_page(ml2e);
3425 #else
3426 #error this should not happen
3427 #endif
3429 if ( shadow_mode_external(d) )
3431 /*
3432 * Having modified the linear pagetable mapping, flush local host TLBs.
3433 * This was not needed when vmenter/vmexit always had the side effect
3434 * of flushing host TLBs but, with ASIDs, it is possible to finish
3435 * this CR3 update, vmenter the guest, vmexit due to a page fault,
3436 * without an intervening host TLB flush. Then the page fault code
3437 * could use the linear pagetable to read a top-level shadow page
3438 * table entry. But, without this change, it would fetch the wrong
3439 * value due to a stale TLB.
3440 */
3441 flush_tlb_local();
3446 /* Removes vcpu->arch.paging.shadow.guest_vtable and vcpu->arch.shadow_table[].
3447 * Does all appropriate management/bookkeeping/refcounting/etc...
3448 */
3449 static void
3450 sh_detach_old_tables(struct vcpu *v)
3452 mfn_t smfn;
3453 int i = 0;
3455 ////
3456 //// vcpu->arch.paging.shadow.guest_vtable
3457 ////
3459 #if GUEST_PAGING_LEVELS == 3
3460 /* PAE guests don't have a mapping of the guest top-level table */
3461 ASSERT(v->arch.paging.shadow.guest_vtable == NULL);
3462 #else
3463 if ( v->arch.paging.shadow.guest_vtable )
3465 struct domain *d = v->domain;
3466 if ( shadow_mode_external(d) || shadow_mode_translate(d) )
3467 sh_unmap_domain_page_global(v->arch.paging.shadow.guest_vtable);
3468 v->arch.paging.shadow.guest_vtable = NULL;
3470 #endif
3473 ////
3474 //// vcpu->arch.shadow_table[]
3475 ////
3477 #if GUEST_PAGING_LEVELS == 3
3478 /* PAE guests have four shadow_table entries */
3479 for ( i = 0 ; i < 4 ; i++ )
3480 #endif
3482 smfn = pagetable_get_mfn(v->arch.shadow_table[i]);
3483 if ( mfn_x(smfn) )
3484 sh_put_ref(v, smfn, 0);
3485 v->arch.shadow_table[i] = pagetable_null();
3489 /* Set up the top-level shadow and install it in slot 'slot' of shadow_table */
3490 static void
3491 sh_set_toplevel_shadow(struct vcpu *v,
3492 int slot,
3493 mfn_t gmfn,
3494 unsigned int root_type)
3496 mfn_t smfn;
3497 pagetable_t old_entry, new_entry;
3499 struct domain *d = v->domain;
3501 /* Remember the old contents of this slot */
3502 old_entry = v->arch.shadow_table[slot];
3504 /* Now figure out the new contents: is this a valid guest MFN? */
3505 if ( !mfn_valid(gmfn) )
3507 new_entry = pagetable_null();
3508 goto install_new_entry;
3511 /* Guest mfn is valid: shadow it and install the shadow */
3512 smfn = get_shadow_status(v, gmfn, root_type);
3513 if ( !mfn_valid(smfn) )
3515 /* Make sure there's enough free shadow memory. */
3516 shadow_prealloc(d, root_type, 1);
3517 /* Shadow the page. */
3518 smfn = sh_make_shadow(v, gmfn, root_type);
3520 ASSERT(mfn_valid(smfn));
3522 /* Pin the shadow and put it (back) on the list of pinned shadows */
3523 if ( sh_pin(v, smfn) == 0 )
3525 SHADOW_ERROR("can't pin %#lx as toplevel shadow\n", mfn_x(smfn));
3526 domain_crash(v->domain);
3529 /* Take a ref to this page: it will be released in sh_detach_old_tables()
3530 * or the next call to set_toplevel_shadow() */
3531 if ( !sh_get_ref(v, smfn, 0) )
3533 SHADOW_ERROR("can't install %#lx as toplevel shadow\n", mfn_x(smfn));
3534 domain_crash(v->domain);
3537 new_entry = pagetable_from_mfn(smfn);
3539 install_new_entry:
3540 /* Done. Install it */
3541 SHADOW_PRINTK("%u/%u [%u] gmfn %#"PRI_mfn" smfn %#"PRI_mfn"\n",
3542 GUEST_PAGING_LEVELS, SHADOW_PAGING_LEVELS, slot,
3543 mfn_x(gmfn), mfn_x(pagetable_get_mfn(new_entry)));
3544 v->arch.shadow_table[slot] = new_entry;
3546 /* Decrement the refcount of the old contents of this slot */
3547 if ( !pagetable_is_null(old_entry) ) {
3548 mfn_t old_smfn = pagetable_get_mfn(old_entry);
3549 /* Need to repin the old toplevel shadow if it's been unpinned
3550 * by shadow_prealloc(): in PV mode we're still running on this
3551 * shadow and it's not safe to free it yet. */
3552 if ( !mfn_to_shadow_page(old_smfn)->pinned && !sh_pin(v, old_smfn) )
3554 SHADOW_ERROR("can't re-pin %#lx\n", mfn_x(old_smfn));
3555 domain_crash(v->domain);
3557 sh_put_ref(v, old_smfn, 0);
3562 static void
3563 sh_update_cr3(struct vcpu *v, int do_locking)
3564 /* Updates vcpu->arch.cr3 after the guest has changed CR3.
3565 * Paravirtual guests should set v->arch.guest_table (and guest_table_user,
3566 * if appropriate).
3567 * HVM guests should also make sure hvm_get_guest_cntl_reg(v, 3) works;
3568 * this function will call hvm_update_guest_cr(v, 3) to tell them where the
3569 * shadow tables are.
3570 * If do_locking != 0, assume we are being called from outside the
3571 * shadow code, and must take and release the shadow lock; otherwise
3572 * that is the caller's responsibility.
3573 */
3575 struct domain *d = v->domain;
3576 mfn_t gmfn;
3577 #if GUEST_PAGING_LEVELS == 3
3578 guest_l3e_t *gl3e;
3579 u32 guest_idx=0;
3580 int i;
3581 #endif
3583 /* Don't do anything on an uninitialised vcpu */
3584 if ( !is_hvm_domain(d) && !v->is_initialised )
3586 ASSERT(v->arch.cr3 == 0);
3587 return;
3590 if ( do_locking ) shadow_lock(v->domain);
3592 ASSERT(shadow_locked_by_me(v->domain));
3593 ASSERT(v->arch.paging.mode);
3595 ////
3596 //// vcpu->arch.guest_table is already set
3597 ////
3599 #ifndef NDEBUG
3600 /* Double-check that the HVM code has sent us a sane guest_table */
3601 if ( is_hvm_domain(d) )
3603 ASSERT(shadow_mode_external(d));
3604 if ( hvm_paging_enabled(v) )
3605 ASSERT(pagetable_get_pfn(v->arch.guest_table));
3606 else
3607 ASSERT(v->arch.guest_table.pfn
3608 == d->arch.paging.shadow.unpaged_pagetable.pfn);
3610 #endif
3612 SHADOW_PRINTK("d=%u v=%u guest_table=%05lx\n",
3613 d->domain_id, v->vcpu_id,
3614 (unsigned long)pagetable_get_pfn(v->arch.guest_table));
3616 #if GUEST_PAGING_LEVELS == 4
3617 if ( !(v->arch.flags & TF_kernel_mode) && !is_pv_32on64_vcpu(v) )
3618 gmfn = pagetable_get_mfn(v->arch.guest_table_user);
3619 else
3620 #endif
3621 gmfn = pagetable_get_mfn(v->arch.guest_table);
3624 ////
3625 //// vcpu->arch.paging.shadow.guest_vtable
3626 ////
3627 #if GUEST_PAGING_LEVELS == 4
3628 if ( shadow_mode_external(d) || shadow_mode_translate(d) )
3630 if ( v->arch.paging.shadow.guest_vtable )
3631 sh_unmap_domain_page_global(v->arch.paging.shadow.guest_vtable);
3632 v->arch.paging.shadow.guest_vtable = sh_map_domain_page_global(gmfn);
3633 /* PAGING_LEVELS==4 implies 64-bit, which means that
3634 * map_domain_page_global can't fail */
3635 BUG_ON(v->arch.paging.shadow.guest_vtable == NULL);
3637 else
3638 v->arch.paging.shadow.guest_vtable = __linear_l4_table;
3639 #elif GUEST_PAGING_LEVELS == 3
3640 /* On PAE guests we don't use a mapping of the guest's own top-level
3641 * table. We cache the current state of that table and shadow that,
3642 * until the next CR3 write makes us refresh our cache. */
3643 ASSERT(v->arch.paging.shadow.guest_vtable == NULL);
3645 if ( shadow_mode_external(d) )
3646 /* Find where in the page the l3 table is */
3647 guest_idx = guest_index((void *)v->arch.hvm_vcpu.guest_cr[3]);
3648 else
3649 /* PV guest: l3 is at the start of a page */
3650 guest_idx = 0;
3652 // Ignore the low 2 bits of guest_idx -- they are really just
3653 // cache control.
3654 guest_idx &= ~3;
3656 gl3e = ((guest_l3e_t *)sh_map_domain_page(gmfn)) + guest_idx;
3657 for ( i = 0; i < 4 ; i++ )
3658 v->arch.paging.shadow.gl3e[i] = gl3e[i];
3659 sh_unmap_domain_page(gl3e);
3660 #elif GUEST_PAGING_LEVELS == 2
3661 if ( shadow_mode_external(d) || shadow_mode_translate(d) )
3663 if ( v->arch.paging.shadow.guest_vtable )
3664 sh_unmap_domain_page_global(v->arch.paging.shadow.guest_vtable);
3665 v->arch.paging.shadow.guest_vtable = sh_map_domain_page_global(gmfn);
3666 /* Does this really need map_domain_page_global? Handle the
3667 * error properly if so. */
3668 BUG_ON(v->arch.paging.shadow.guest_vtable == NULL); /* XXX */
3670 else
3671 v->arch.paging.shadow.guest_vtable = __linear_l2_table;
3672 #else
3673 #error this should never happen
3674 #endif
3676 #if 0
3677 printk("%s %s %d gmfn=%05lx shadow.guest_vtable=%p\n",
3678 __func__, __FILE__, __LINE__, gmfn, v->arch.paging.shadow.guest_vtable);
3679 #endif
3681 ////
3682 //// vcpu->arch.shadow_table[]
3683 ////
3685 /* We revoke write access to the new guest toplevel page(s) before we
3686 * replace the old shadow pagetable(s), so that we can safely use the
3687 * (old) shadow linear maps in the writeable mapping heuristics. */
3688 #if GUEST_PAGING_LEVELS == 2
3689 if ( sh_remove_write_access(v, gmfn, 2, 0) != 0 )
3690 flush_tlb_mask(v->domain->domain_dirty_cpumask);
3691 sh_set_toplevel_shadow(v, 0, gmfn, SH_type_l2_shadow);
3692 #elif GUEST_PAGING_LEVELS == 3
3693 /* PAE guests have four shadow_table entries, based on the
3694 * current values of the guest's four l3es. */
3696 int flush = 0;
3697 gfn_t gl2gfn;
3698 mfn_t gl2mfn;
3699 p2m_type_t p2mt;
3700 guest_l3e_t *gl3e = (guest_l3e_t*)&v->arch.paging.shadow.gl3e;
3701 /* First, make all four entries read-only. */
3702 for ( i = 0; i < 4; i++ )
3704 if ( guest_l3e_get_flags(gl3e[i]) & _PAGE_PRESENT )
3706 gl2gfn = guest_l3e_get_gfn(gl3e[i]);
3707 gl2mfn = gfn_to_mfn(d, gl2gfn, &p2mt);
3708 if ( p2m_is_ram(p2mt) )
3709 flush |= sh_remove_write_access(v, gl2mfn, 2, 0);
3712 if ( flush )
3713 flush_tlb_mask(v->domain->domain_dirty_cpumask);
3714 /* Now install the new shadows. */
3715 for ( i = 0; i < 4; i++ )
3717 if ( guest_l3e_get_flags(gl3e[i]) & _PAGE_PRESENT )
3719 gl2gfn = guest_l3e_get_gfn(gl3e[i]);
3720 gl2mfn = gfn_to_mfn(d, gl2gfn, &p2mt);
3721 if ( p2m_is_ram(p2mt) )
3722 sh_set_toplevel_shadow(v, i, gl2mfn, (i == 3)
3723 ? SH_type_l2h_shadow
3724 : SH_type_l2_shadow);
3725 else
3726 sh_set_toplevel_shadow(v, i, _mfn(INVALID_MFN), 0);
3728 else
3729 sh_set_toplevel_shadow(v, i, _mfn(INVALID_MFN), 0);
3732 #elif GUEST_PAGING_LEVELS == 4
3733 if ( sh_remove_write_access(v, gmfn, 4, 0) != 0 )
3734 flush_tlb_mask(v->domain->domain_dirty_cpumask);
3735 sh_set_toplevel_shadow(v, 0, gmfn, SH_type_l4_shadow);
3736 #else
3737 #error This should never happen
3738 #endif
3740 #if (CONFIG_PAGING_LEVELS == 3) && (GUEST_PAGING_LEVELS == 3)
3741 #endif
3743 ///
3744 /// v->arch.paging.shadow.l3table
3745 ///
3746 #if SHADOW_PAGING_LEVELS == 3
3748 mfn_t smfn;
3749 int i;
3750 for ( i = 0; i < 4; i++ )
3752 #if GUEST_PAGING_LEVELS == 2
3753 /* 2-on-3: make a PAE l3 that points at the four-page l2 */
3754 smfn = _mfn(pagetable_get_pfn(v->arch.shadow_table[0]) + i);
3755 #else
3756 /* 3-on-3: make a PAE l3 that points at the four l2 pages */
3757 smfn = pagetable_get_mfn(v->arch.shadow_table[i]);
3758 #endif
3759 v->arch.paging.shadow.l3table[i] =
3760 (mfn_x(smfn) == 0)
3761 ? shadow_l3e_empty()
3762 : shadow_l3e_from_mfn(smfn, _PAGE_PRESENT);
3765 #endif /* SHADOW_PAGING_LEVELS == 3 */
3768 ///
3769 /// v->arch.cr3
3770 ///
3771 if ( shadow_mode_external(d) )
3773 make_cr3(v, pagetable_get_pfn(v->arch.monitor_table));
3775 else // not shadow_mode_external...
3777 /* We don't support PV except guest == shadow == config levels */
3778 BUG_ON(GUEST_PAGING_LEVELS != SHADOW_PAGING_LEVELS);
3779 #if SHADOW_PAGING_LEVELS == 3
3780 /* 2-on-3 or 3-on-3: Use the PAE shadow l3 table we just fabricated.
3781 * Don't use make_cr3 because (a) we know it's below 4GB, and
3782 * (b) it's not necessarily page-aligned, and make_cr3 takes a pfn */
3783 ASSERT(virt_to_maddr(&v->arch.paging.shadow.l3table) <= 0xffffffe0ULL);
3784 v->arch.cr3 = virt_to_maddr(&v->arch.paging.shadow.l3table);
3785 #else
3786 /* 2-on-2 or 4-on-4: Just use the shadow top-level directly */
3787 make_cr3(v, pagetable_get_pfn(v->arch.shadow_table[0]));
3788 #endif
3792 ///
3793 /// v->arch.hvm_vcpu.hw_cr[3]
3794 ///
3795 if ( shadow_mode_external(d) )
3797 ASSERT(is_hvm_domain(d));
3798 #if SHADOW_PAGING_LEVELS == 3
3799 /* 2-on-3 or 3-on-3: Use the PAE shadow l3 table we just fabricated */
3800 v->arch.hvm_vcpu.hw_cr[3] =
3801 virt_to_maddr(&v->arch.paging.shadow.l3table);
3802 #else
3803 /* 2-on-2 or 4-on-4: Just use the shadow top-level directly */
3804 v->arch.hvm_vcpu.hw_cr[3] =
3805 pagetable_get_paddr(v->arch.shadow_table[0]);
3806 #endif
3807 hvm_update_guest_cr(v, 3);
3810 /* Fix up the linear pagetable mappings */
3811 sh_update_linear_entries(v);
3813 #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
3814 /* No longer safe to use cached gva->gfn translations */
3815 vtlb_flush(v);
3816 #endif
3818 /* Release the lock, if we took it (otherwise it's the caller's problem) */
3819 if ( do_locking ) shadow_unlock(v->domain);
3823 /**************************************************************************/
3824 /* Functions to revoke guest rights */
3826 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
3827 static int sh_guess_wrmap(struct vcpu *v, unsigned long vaddr, mfn_t gmfn)
3828 /* Look up this vaddr in the current shadow and see if it's a writeable
3829 * mapping of this gmfn. If so, remove it. Returns 1 if it worked. */
3831 shadow_l1e_t sl1e, *sl1p;
3832 shadow_l2e_t *sl2p;
3833 #if SHADOW_PAGING_LEVELS >= 3
3834 shadow_l3e_t *sl3p;
3835 #if SHADOW_PAGING_LEVELS >= 4
3836 shadow_l4e_t *sl4p;
3837 #endif
3838 #endif
3839 mfn_t sl1mfn;
3840 int r;
3842 /* Carefully look in the shadow linear map for the l1e we expect */
3843 #if SHADOW_PAGING_LEVELS >= 4
3844 sl4p = sh_linear_l4_table(v) + shadow_l4_linear_offset(vaddr);
3845 if ( !(shadow_l4e_get_flags(*sl4p) & _PAGE_PRESENT) )
3846 return 0;
3847 sl3p = sh_linear_l3_table(v) + shadow_l3_linear_offset(vaddr);
3848 if ( !(shadow_l3e_get_flags(*sl3p) & _PAGE_PRESENT) )
3849 return 0;
3850 #elif SHADOW_PAGING_LEVELS == 3
3851 sl3p = ((shadow_l3e_t *) v->arch.paging.shadow.l3table)
3852 + shadow_l3_linear_offset(vaddr);
3853 if ( !(shadow_l3e_get_flags(*sl3p) & _PAGE_PRESENT) )
3854 return 0;
3855 #endif
3856 sl2p = sh_linear_l2_table(v) + shadow_l2_linear_offset(vaddr);
3857 if ( !(shadow_l2e_get_flags(*sl2p) & _PAGE_PRESENT) )
3858 return 0;
3859 sl1p = sh_linear_l1_table(v) + shadow_l1_linear_offset(vaddr);
3860 sl1e = *sl1p;
3861 if ( ((shadow_l1e_get_flags(sl1e) & (_PAGE_PRESENT|_PAGE_RW))
3862 != (_PAGE_PRESENT|_PAGE_RW))
3863 || (mfn_x(shadow_l1e_get_mfn(sl1e)) != mfn_x(gmfn)) )
3864 return 0;
3866 /* Found it! Need to remove its write permissions. */
3867 sl1mfn = shadow_l2e_get_mfn(*sl2p);
3868 sl1e = shadow_l1e_remove_flags(sl1e, _PAGE_RW);
3869 r = shadow_set_l1e(v, sl1p, sl1e, sl1mfn);
3870 ASSERT( !(r & SHADOW_SET_ERROR) );
3871 return 1;
3873 #endif
3875 int sh_rm_write_access_from_l1(struct vcpu *v, mfn_t sl1mfn,
3876 mfn_t readonly_mfn)
3877 /* Excises all writeable mappings to readonly_mfn from this l1 shadow table */
3879 shadow_l1e_t *sl1e;
3880 int done = 0;
3881 int flags;
3882 mfn_t base_sl1mfn = sl1mfn; /* Because sl1mfn changes in the foreach */
3884 SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, done,
3886 flags = shadow_l1e_get_flags(*sl1e);
3887 if ( (flags & _PAGE_PRESENT)
3888 && (flags & _PAGE_RW)
3889 && (mfn_x(shadow_l1e_get_mfn(*sl1e)) == mfn_x(readonly_mfn)) )
3891 shadow_l1e_t ro_sl1e = shadow_l1e_remove_flags(*sl1e, _PAGE_RW);
3892 (void) shadow_set_l1e(v, sl1e, ro_sl1e, sl1mfn);
3893 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
3894 /* Remember the last shadow that we shot a writeable mapping in */
3895 v->arch.paging.shadow.last_writeable_pte_smfn = mfn_x(base_sl1mfn);
3896 #endif
3897 if ( (mfn_to_page(readonly_mfn)->u.inuse.type_info
3898 & PGT_count_mask) == 0 )
3899 /* This breaks us cleanly out of the FOREACH macro */
3900 done = 1;
3902 });
3903 return done;
3907 int sh_rm_mappings_from_l1(struct vcpu *v, mfn_t sl1mfn, mfn_t target_mfn)
3908 /* Excises all mappings to guest frame from this shadow l1 table */
3910 shadow_l1e_t *sl1e;
3911 int done = 0;
3912 int flags;
3914 SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, done,
3916 flags = shadow_l1e_get_flags(*sl1e);
3917 if ( (flags & _PAGE_PRESENT)
3918 && (mfn_x(shadow_l1e_get_mfn(*sl1e)) == mfn_x(target_mfn)) )
3920 (void) shadow_set_l1e(v, sl1e, shadow_l1e_empty(), sl1mfn);
3921 if ( (mfn_to_page(target_mfn)->count_info & PGC_count_mask) == 0 )
3922 /* This breaks us cleanly out of the FOREACH macro */
3923 done = 1;
3925 });
3926 return done;
3929 /**************************************************************************/
3930 /* Functions to excise all pointers to shadows from higher-level shadows. */
3932 void sh_clear_shadow_entry(struct vcpu *v, void *ep, mfn_t smfn)
3933 /* Blank out a single shadow entry */
3935 switch ( mfn_to_shadow_page(smfn)->type )
3937 case SH_type_l1_shadow:
3938 (void) shadow_set_l1e(v, ep, shadow_l1e_empty(), smfn); break;
3939 case SH_type_l2_shadow:
3940 #if GUEST_PAGING_LEVELS >= 3
3941 case SH_type_l2h_shadow:
3942 #endif
3943 (void) shadow_set_l2e(v, ep, shadow_l2e_empty(), smfn); break;
3944 #if GUEST_PAGING_LEVELS >= 4
3945 case SH_type_l3_shadow:
3946 (void) shadow_set_l3e(v, ep, shadow_l3e_empty(), smfn); break;
3947 case SH_type_l4_shadow:
3948 (void) shadow_set_l4e(v, ep, shadow_l4e_empty(), smfn); break;
3949 #endif
3950 default: BUG(); /* Called with the wrong kind of shadow. */
3954 int sh_remove_l1_shadow(struct vcpu *v, mfn_t sl2mfn, mfn_t sl1mfn)
3955 /* Remove all mappings of this l1 shadow from this l2 shadow */
3957 shadow_l2e_t *sl2e;
3958 int done = 0;
3959 int flags;
3961 SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, done, v->domain,
3963 flags = shadow_l2e_get_flags(*sl2e);
3964 if ( (flags & _PAGE_PRESENT)
3965 && (mfn_x(shadow_l2e_get_mfn(*sl2e)) == mfn_x(sl1mfn)) )
3967 (void) shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn);
3968 if ( mfn_to_shadow_page(sl1mfn)->type == 0 )
3969 /* This breaks us cleanly out of the FOREACH macro */
3970 done = 1;
3972 });
3973 return done;
3976 #if GUEST_PAGING_LEVELS >= 4
3977 int sh_remove_l2_shadow(struct vcpu *v, mfn_t sl3mfn, mfn_t sl2mfn)
3978 /* Remove all mappings of this l2 shadow from this l3 shadow */
3980 shadow_l3e_t *sl3e;
3981 int done = 0;
3982 int flags;
3984 SHADOW_FOREACH_L3E(sl3mfn, sl3e, 0, done,
3986 flags = shadow_l3e_get_flags(*sl3e);
3987 if ( (flags & _PAGE_PRESENT)
3988 && (mfn_x(shadow_l3e_get_mfn(*sl3e)) == mfn_x(sl2mfn)) )
3990 (void) shadow_set_l3e(v, sl3e, shadow_l3e_empty(), sl3mfn);
3991 if ( mfn_to_shadow_page(sl2mfn)->type == 0 )
3992 /* This breaks us cleanly out of the FOREACH macro */
3993 done = 1;
3995 });
3996 return done;
3999 int sh_remove_l3_shadow(struct vcpu *v, mfn_t sl4mfn, mfn_t sl3mfn)
4000 /* Remove all mappings of this l3 shadow from this l4 shadow */
4002 shadow_l4e_t *sl4e;
4003 int done = 0;
4004 int flags;
4006 SHADOW_FOREACH_L4E(sl4mfn, sl4e, 0, done, v->domain,
4008 flags = shadow_l4e_get_flags(*sl4e);
4009 if ( (flags & _PAGE_PRESENT)
4010 && (mfn_x(shadow_l4e_get_mfn(*sl4e)) == mfn_x(sl3mfn)) )
4012 (void) shadow_set_l4e(v, sl4e, shadow_l4e_empty(), sl4mfn);
4013 if ( mfn_to_shadow_page(sl3mfn)->type == 0 )
4014 /* This breaks us cleanly out of the FOREACH macro */
4015 done = 1;
4017 });
4018 return done;
4020 #endif /* 64bit guest */
4022 /**************************************************************************/
4023 /* Handling HVM guest writes to pagetables */
4025 /* Translate a VA to an MFN, injecting a page-fault if we fail */
4026 #define BAD_GVA_TO_GFN (~0UL)
4027 #define BAD_GFN_TO_MFN (~1UL)
4028 static mfn_t emulate_gva_to_mfn(struct vcpu *v,
4029 unsigned long vaddr,
4030 struct sh_emulate_ctxt *sh_ctxt)
4032 unsigned long gfn;
4033 mfn_t mfn;
4034 p2m_type_t p2mt;
4035 uint32_t pfec = PFEC_page_present | PFEC_write_access;
4037 /* Translate the VA to a GFN */
4038 gfn = sh_gva_to_gfn(v, vaddr, &pfec);
4039 if ( gfn == INVALID_GFN )
4041 if ( is_hvm_vcpu(v) )
4042 hvm_inject_exception(TRAP_page_fault, pfec, vaddr);
4043 else
4044 propagate_page_fault(vaddr, pfec);
4045 return _mfn(BAD_GVA_TO_GFN);
4048 /* Translate the GFN to an MFN */
4049 mfn = gfn_to_mfn(v->domain, _gfn(gfn), &p2mt);
4050 if ( p2m_is_ram(p2mt) )
4052 ASSERT(mfn_valid(mfn));
4053 v->arch.paging.last_write_was_pt = !!sh_mfn_is_a_page_table(mfn);
4054 return mfn;
4057 return _mfn(BAD_GFN_TO_MFN);
4060 /* Check that the user is allowed to perform this write.
4061 * Returns a mapped pointer to write to, or NULL for error. */
4062 #define MAPPING_UNHANDLEABLE ((void *)0)
4063 #define MAPPING_EXCEPTION ((void *)1)
4064 #define emulate_map_dest_failed(rc) ((unsigned long)(rc) <= 1)
4065 static void *emulate_map_dest(struct vcpu *v,
4066 unsigned long vaddr,
4067 u32 bytes,
4068 struct sh_emulate_ctxt *sh_ctxt)
4070 struct segment_register *sreg;
4071 unsigned long offset;
4072 void *map = NULL;
4074 /* We don't emulate user-mode writes to page tables */
4075 sreg = hvm_get_seg_reg(x86_seg_ss, sh_ctxt);
4076 if ( sreg->attr.fields.dpl == 3 )
4077 return MAPPING_UNHANDLEABLE;
4079 sh_ctxt->mfn1 = emulate_gva_to_mfn(v, vaddr, sh_ctxt);
4080 if ( !mfn_valid(sh_ctxt->mfn1) )
4081 return ((mfn_x(sh_ctxt->mfn1) == BAD_GVA_TO_GFN) ?
4082 MAPPING_EXCEPTION : MAPPING_UNHANDLEABLE);
4084 /* Unaligned writes mean probably this isn't a pagetable */
4085 if ( vaddr & (bytes - 1) )
4086 sh_remove_shadows(v, sh_ctxt->mfn1, 0, 0 /* Slow, can fail */ );
4088 if ( likely(((vaddr + bytes - 1) & PAGE_MASK) == (vaddr & PAGE_MASK)) )
4090 /* Whole write fits on a single page */
4091 sh_ctxt->mfn2 = _mfn(INVALID_MFN);
4092 map = sh_map_domain_page(sh_ctxt->mfn1) + (vaddr & ~PAGE_MASK);
4094 else
4096 /* Cross-page emulated writes are only supported for HVM guests;
4097 * PV guests ought to know better */
4098 if ( !is_hvm_vcpu(v) )
4099 return MAPPING_UNHANDLEABLE;
4101 /* This write crosses a page boundary. Translate the second page */
4102 sh_ctxt->mfn2 = emulate_gva_to_mfn(v, (vaddr + bytes - 1) & PAGE_MASK,
4103 sh_ctxt);
4104 if ( !mfn_valid(sh_ctxt->mfn2) )
4105 return ((mfn_x(sh_ctxt->mfn2) == BAD_GVA_TO_GFN) ?
4106 MAPPING_EXCEPTION : MAPPING_UNHANDLEABLE);
4108 /* Cross-page writes mean probably not a pagetable */
4109 sh_remove_shadows(v, sh_ctxt->mfn2, 0, 0 /* Slow, can fail */ );
4111 /* Hack: we map the pages into the vcpu's LDT space, since we
4112 * know that we're not going to need the LDT for HVM guests,
4113 * and only HVM guests are allowed unaligned writes. */
4114 ASSERT(is_hvm_vcpu(v));
4115 map = (void *)LDT_VIRT_START(v);
4116 offset = l1_linear_offset((unsigned long) map);
4117 l1e_write(&__linear_l1_table[offset],
4118 l1e_from_pfn(mfn_x(sh_ctxt->mfn1), __PAGE_HYPERVISOR));
4119 l1e_write(&__linear_l1_table[offset + 1],
4120 l1e_from_pfn(mfn_x(sh_ctxt->mfn2), __PAGE_HYPERVISOR));
4121 flush_tlb_local();
4122 map += (vaddr & ~PAGE_MASK);
4125 #if (SHADOW_OPTIMIZATIONS & SHOPT_SKIP_VERIFY)
4126 /* Remember if the bottom bit was clear, so we can choose not to run
4127 * the change through the verify code if it's still clear afterwards */
4128 sh_ctxt->low_bit_was_clear = map != NULL && !(*(u8 *)map & _PAGE_PRESENT);
4129 #endif
4131 return map;
4134 /* Tidy up after the emulated write: mark pages dirty, verify the new
4135 * contents, and undo the mapping */
4136 static void emulate_unmap_dest(struct vcpu *v,
4137 void *addr,
4138 u32 bytes,
4139 struct sh_emulate_ctxt *sh_ctxt)
4141 u32 b1 = bytes, b2 = 0, shflags;
4143 ASSERT(mfn_valid(sh_ctxt->mfn1));
4145 /* If we are writing lots of PTE-aligned zeros, might want to unshadow */
4146 if ( likely(bytes >= 4)
4147 && (*(u32 *)addr == 0)
4148 && ((unsigned long) addr & ((sizeof (guest_intpte_t)) - 1)) == 0 )
4149 check_for_early_unshadow(v, sh_ctxt->mfn1);
4150 else
4151 reset_early_unshadow(v);
4153 /* We can avoid re-verifying the page contents after the write if:
4154 * - it was no larger than the PTE type of this pagetable;
4155 * - it was aligned to the PTE boundaries; and
4156 * - _PAGE_PRESENT was clear before and after the write. */
4157 shflags = mfn_to_page(sh_ctxt->mfn1)->shadow_flags;
4158 #if (SHADOW_OPTIMIZATIONS & SHOPT_SKIP_VERIFY)
4159 if ( sh_ctxt->low_bit_was_clear
4160 && !(*(u8 *)addr & _PAGE_PRESENT)
4161 && ((!(shflags & SHF_32)
4162 /* Not shadowed 32-bit: aligned 64-bit writes that leave
4163 * the present bit unset are safe to ignore. */
4164 && ((unsigned long)addr & 7) == 0
4165 && bytes <= 8)
4166 ||
4167 (!(shflags & (SHF_PAE|SHF_64))
4168 /* Not shadowed PAE/64-bit: aligned 32-bit writes that
4169 * leave the present bit unset are safe to ignore. */
4170 && ((unsigned long)addr & 3) == 0
4171 && bytes <= 4)) )
4173 /* Writes with this alignment constraint can't possibly cross pages */
4174 ASSERT(!mfn_valid(sh_ctxt->mfn2));
4176 else
4177 #endif /* SHADOW_OPTIMIZATIONS & SHOPT_SKIP_VERIFY */
4179 if ( unlikely(mfn_valid(sh_ctxt->mfn2)) )
4181 /* Validate as two writes, one to each page */
4182 b1 = PAGE_SIZE - (((unsigned long)addr) & ~PAGE_MASK);
4183 b2 = bytes - b1;
4184 ASSERT(b2 < bytes);
4186 if ( likely(b1 > 0) )
4187 sh_validate_guest_pt_write(v, sh_ctxt->mfn1, addr, b1);
4188 if ( unlikely(b2 > 0) )
4189 sh_validate_guest_pt_write(v, sh_ctxt->mfn2, addr + b1, b2);
4192 paging_mark_dirty(v->domain, mfn_x(sh_ctxt->mfn1));
4194 if ( unlikely(mfn_valid(sh_ctxt->mfn2)) )
4196 unsigned long offset;
4197 paging_mark_dirty(v->domain, mfn_x(sh_ctxt->mfn2));
4198 /* Undo the hacky two-frame contiguous map. */
4199 ASSERT(((unsigned long) addr & PAGE_MASK) == LDT_VIRT_START(v));
4200 offset = l1_linear_offset((unsigned long) addr);
4201 l1e_write(&__linear_l1_table[offset], l1e_empty());
4202 l1e_write(&__linear_l1_table[offset + 1], l1e_empty());
4203 flush_tlb_all();
4205 else
4206 sh_unmap_domain_page(addr);
4209 int
4210 sh_x86_emulate_write(struct vcpu *v, unsigned long vaddr, void *src,
4211 u32 bytes, struct sh_emulate_ctxt *sh_ctxt)
4213 void *addr;
4215 /* Unaligned writes are only acceptable on HVM */
4216 if ( (vaddr & (bytes - 1)) && !is_hvm_vcpu(v) )
4217 return X86EMUL_UNHANDLEABLE;
4219 addr = emulate_map_dest(v, vaddr, bytes, sh_ctxt);
4220 if ( emulate_map_dest_failed(addr) )
4221 return ((addr == MAPPING_EXCEPTION) ?
4222 X86EMUL_EXCEPTION : X86EMUL_UNHANDLEABLE);
4224 shadow_lock(v->domain);
4225 memcpy(addr, src, bytes);
4227 emulate_unmap_dest(v, addr, bytes, sh_ctxt);
4228 shadow_audit_tables(v);
4229 shadow_unlock(v->domain);
4230 return X86EMUL_OKAY;
4233 int
4234 sh_x86_emulate_cmpxchg(struct vcpu *v, unsigned long vaddr,
4235 unsigned long old, unsigned long new,
4236 unsigned int bytes, struct sh_emulate_ctxt *sh_ctxt)
4238 void *addr;
4239 unsigned long prev;
4240 int rv = X86EMUL_OKAY;
4242 /* Unaligned writes are only acceptable on HVM */
4243 if ( (vaddr & (bytes - 1)) && !is_hvm_vcpu(v) )
4244 return X86EMUL_UNHANDLEABLE;
4246 addr = emulate_map_dest(v, vaddr, bytes, sh_ctxt);
4247 if ( emulate_map_dest_failed(addr) )
4248 return ((addr == MAPPING_EXCEPTION) ?
4249 X86EMUL_EXCEPTION : X86EMUL_UNHANDLEABLE);
4251 shadow_lock(v->domain);
4252 switch ( bytes )
4254 case 1: prev = cmpxchg(((u8 *)addr), old, new); break;
4255 case 2: prev = cmpxchg(((u16 *)addr), old, new); break;
4256 case 4: prev = cmpxchg(((u32 *)addr), old, new); break;
4257 case 8: prev = cmpxchg(((u64 *)addr), old, new); break;
4258 default:
4259 SHADOW_PRINTK("cmpxchg of size %i is not supported\n", bytes);
4260 prev = ~old;
4263 if ( prev != old )
4264 rv = X86EMUL_CMPXCHG_FAILED;
4266 SHADOW_DEBUG(EMULATE, "va %#lx was %#lx expected %#lx"
4267 " wanted %#lx now %#lx bytes %u\n",
4268 vaddr, prev, old, new, *(unsigned long *)addr, bytes);
4270 emulate_unmap_dest(v, addr, bytes, sh_ctxt);
4271 shadow_audit_tables(v);
4272 shadow_unlock(v->domain);
4273 return rv;
4276 int
4277 sh_x86_emulate_cmpxchg8b(struct vcpu *v, unsigned long vaddr,
4278 unsigned long old_lo, unsigned long old_hi,
4279 unsigned long new_lo, unsigned long new_hi,
4280 struct sh_emulate_ctxt *sh_ctxt)
4282 void *addr;
4283 u64 old, new, prev;
4284 int rv = X86EMUL_OKAY;
4286 /* Unaligned writes are only acceptable on HVM */
4287 if ( (vaddr & 7) && !is_hvm_vcpu(v) )
4288 return X86EMUL_UNHANDLEABLE;
4290 addr = emulate_map_dest(v, vaddr, 8, sh_ctxt);
4291 if ( emulate_map_dest_failed(addr) )
4292 return ((addr == MAPPING_EXCEPTION) ?
4293 X86EMUL_EXCEPTION : X86EMUL_UNHANDLEABLE);
4295 old = (((u64) old_hi) << 32) | (u64) old_lo;
4296 new = (((u64) new_hi) << 32) | (u64) new_lo;
4298 shadow_lock(v->domain);
4299 prev = cmpxchg(((u64 *)addr), old, new);
4301 if ( prev != old )
4302 rv = X86EMUL_CMPXCHG_FAILED;
4304 emulate_unmap_dest(v, addr, 8, sh_ctxt);
4305 shadow_audit_tables(v);
4306 shadow_unlock(v->domain);
4307 return rv;
4311 /**************************************************************************/
4312 /* Audit tools */
4314 #if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES
4316 #define AUDIT_FAIL(_level, _fmt, _a...) do { \
4317 printk("Shadow %u-on-%u audit failed at level %i, index %i\n" \
4318 "gl" #_level "mfn = %" PRI_mfn \
4319 " sl" #_level "mfn = %" PRI_mfn \
4320 " &gl" #_level "e = %p &sl" #_level "e = %p" \
4321 " gl" #_level "e = %" SH_PRI_gpte \
4322 " sl" #_level "e = %" SH_PRI_pte "\nError: " _fmt "\n", \
4323 GUEST_PAGING_LEVELS, SHADOW_PAGING_LEVELS, \
4324 _level, guest_index(gl ## _level ## e), \
4325 mfn_x(gl ## _level ## mfn), mfn_x(sl ## _level ## mfn), \
4326 gl ## _level ## e, sl ## _level ## e, \
4327 gl ## _level ## e->l ## _level, sl ## _level ## e->l ## _level, \
4328 ##_a); \
4329 BUG(); \
4330 done = 1; \
4331 } while (0)
4334 static char * sh_audit_flags(struct vcpu *v, int level,
4335 int gflags, int sflags)
4336 /* Common code for auditing flag bits */
4338 if ( (sflags & _PAGE_PRESENT) && !(gflags & _PAGE_PRESENT) )
4339 return "shadow is present but guest is not present";
4340 if ( (sflags & _PAGE_GLOBAL) && !is_hvm_vcpu(v) )
4341 return "global bit set in PV shadow";
4342 if ( level == 2 && (sflags & _PAGE_PSE) )
4343 return "PS bit set in shadow";
4344 #if SHADOW_PAGING_LEVELS == 3
4345 if ( level == 3 ) return NULL; /* All the other bits are blank in PAEl3 */
4346 #endif
4347 if ( (sflags & _PAGE_PRESENT) && !(gflags & _PAGE_ACCESSED) )
4348 return "accessed bit not propagated";
4349 if ( (level == 1 || (level == 2 && (gflags & _PAGE_PSE)))
4350 && ((sflags & _PAGE_RW) && !(gflags & _PAGE_DIRTY)) )
4351 return "dirty bit not propagated";
4352 if ( (sflags & _PAGE_USER) != (gflags & _PAGE_USER) )
4353 return "user/supervisor bit does not match";
4354 if ( (sflags & _PAGE_NX_BIT) != (gflags & _PAGE_NX_BIT) )
4355 return "NX bit does not match";
4356 if ( (sflags & _PAGE_RW) && !(gflags & _PAGE_RW) )
4357 return "shadow grants write access but guest does not";
4358 return NULL;
4361 static inline mfn_t
4362 audit_gfn_to_mfn(struct vcpu *v, gfn_t gfn, mfn_t gmfn)
4363 /* Convert this gfn to an mfn in the manner appropriate for the
4364 * guest pagetable it's used in (gmfn) */
4366 p2m_type_t p2mt;
4367 if ( !shadow_mode_translate(v->domain) )
4368 return _mfn(gfn_x(gfn));
4370 if ( (mfn_to_page(gmfn)->u.inuse.type_info & PGT_type_mask)
4371 != PGT_writable_page )
4372 return _mfn(gfn_x(gfn)); /* This is a paging-disabled shadow */
4373 else
4374 return gfn_to_mfn(v->domain, gfn, &p2mt);
4378 int sh_audit_l1_table(struct vcpu *v, mfn_t sl1mfn, mfn_t x)
4380 guest_l1e_t *gl1e, *gp;
4381 shadow_l1e_t *sl1e;
4382 mfn_t mfn, gmfn, gl1mfn;
4383 gfn_t gfn;
4384 char *s;
4385 int done = 0;
4387 /* Follow the backpointer */
4388 gl1mfn = _mfn(mfn_to_shadow_page(sl1mfn)->backpointer);
4389 gl1e = gp = sh_map_domain_page(gl1mfn);
4390 SHADOW_FOREACH_L1E(sl1mfn, sl1e, &gl1e, done, {
4392 if ( sh_l1e_is_magic(*sl1e) )
4394 #if (SHADOW_OPTIMIZATIONS & SHOPT_FAST_FAULT_PATH) && SHADOW_PAGING_LEVELS > 2
4395 if ( sh_l1e_is_gnp(*sl1e) )
4397 if ( guest_l1e_get_flags(*gl1e) & _PAGE_PRESENT )
4398 AUDIT_FAIL(1, "shadow is GNP magic but guest is present");
4400 else
4402 ASSERT(sh_l1e_is_mmio(*sl1e));
4403 gfn = sh_l1e_mmio_get_gfn(*sl1e);
4404 if ( gfn_x(gfn) != gfn_x(guest_l1e_get_gfn(*gl1e)) )
4405 AUDIT_FAIL(1, "shadow MMIO gfn is %" SH_PRI_gfn
4406 " but guest gfn is %" SH_PRI_gfn,
4407 gfn_x(gfn),
4408 gfn_x(guest_l1e_get_gfn(*gl1e)));
4410 #endif
4412 else
4414 s = sh_audit_flags(v, 1, guest_l1e_get_flags(*gl1e),
4415 shadow_l1e_get_flags(*sl1e));
4416 if ( s ) AUDIT_FAIL(1, "%s", s);
4418 if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS )
4420 gfn = guest_l1e_get_gfn(*gl1e);
4421 mfn = shadow_l1e_get_mfn(*sl1e);
4422 gmfn = audit_gfn_to_mfn(v, gfn, gl1mfn);
4423 if ( mfn_x(gmfn) != mfn_x(mfn) )
4424 AUDIT_FAIL(1, "bad translation: gfn %" SH_PRI_gfn
4425 " --> %" PRI_mfn " != mfn %" PRI_mfn,
4426 gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn));
4429 });
4430 sh_unmap_domain_page(gp);
4431 return done;
4434 int sh_audit_fl1_table(struct vcpu *v, mfn_t sl1mfn, mfn_t x)
4436 guest_l1e_t *gl1e, e;
4437 shadow_l1e_t *sl1e;
4438 mfn_t gl1mfn = _mfn(INVALID_MFN);
4439 int f;
4440 int done = 0;
4442 /* fl1 has no useful backpointer: all we can check are flags */
4443 e = guest_l1e_from_gfn(_gfn(0), 0); gl1e = &e; /* Needed for macro */
4444 SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, done, {
4445 f = shadow_l1e_get_flags(*sl1e);
4446 f &= ~(_PAGE_AVAIL0|_PAGE_AVAIL1|_PAGE_AVAIL2);
4447 if ( !(f == 0
4448 || f == (_PAGE_PRESENT|_PAGE_USER|_PAGE_RW|
4449 _PAGE_ACCESSED|_PAGE_DIRTY)
4450 || f == (_PAGE_PRESENT|_PAGE_USER|_PAGE_ACCESSED|_PAGE_DIRTY)
4451 || sh_l1e_is_magic(*sl1e)) )
4452 AUDIT_FAIL(1, "fl1e has bad flags");
4453 });
4454 return 0;
4457 int sh_audit_l2_table(struct vcpu *v, mfn_t sl2mfn, mfn_t x)
4459 guest_l2e_t *gl2e, *gp;
4460 shadow_l2e_t *sl2e;
4461 mfn_t mfn, gmfn, gl2mfn;
4462 gfn_t gfn;
4463 char *s;
4464 int done = 0;
4466 /* Follow the backpointer */
4467 gl2mfn = _mfn(mfn_to_shadow_page(sl2mfn)->backpointer);
4468 gl2e = gp = sh_map_domain_page(gl2mfn);
4469 SHADOW_FOREACH_L2E(sl2mfn, sl2e, &gl2e, done, v->domain, {
4471 s = sh_audit_flags(v, 2, guest_l2e_get_flags(*gl2e),
4472 shadow_l2e_get_flags(*sl2e));
4473 if ( s ) AUDIT_FAIL(2, "%s", s);
4475 if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS )
4477 gfn = guest_l2e_get_gfn(*gl2e);
4478 mfn = shadow_l2e_get_mfn(*sl2e);
4479 gmfn = (guest_l2e_get_flags(*gl2e) & _PAGE_PSE)
4480 ? get_fl1_shadow_status(v, gfn)
4481 : get_shadow_status(v, audit_gfn_to_mfn(v, gfn, gl2mfn),
4482 SH_type_l1_shadow);
4483 if ( mfn_x(gmfn) != mfn_x(mfn) )
4484 AUDIT_FAIL(2, "bad translation: gfn %" SH_PRI_gfn
4485 " (--> %" PRI_mfn ")"
4486 " --> %" PRI_mfn " != mfn %" PRI_mfn,
4487 gfn_x(gfn),
4488 (guest_l2e_get_flags(*gl2e) & _PAGE_PSE) ? 0
4489 : mfn_x(audit_gfn_to_mfn(v, gfn, gl2mfn)),
4490 mfn_x(gmfn), mfn_x(mfn));
4492 });
4493 sh_unmap_domain_page(gp);
4494 return 0;
4497 #if GUEST_PAGING_LEVELS >= 4
4498 int sh_audit_l3_table(struct vcpu *v, mfn_t sl3mfn, mfn_t x)
4500 guest_l3e_t *gl3e, *gp;
4501 shadow_l3e_t *sl3e;
4502 mfn_t mfn, gmfn, gl3mfn;
4503 gfn_t gfn;
4504 char *s;
4505 int done = 0;
4507 /* Follow the backpointer */
4508 gl3mfn = _mfn(mfn_to_shadow_page(sl3mfn)->backpointer);
4509 gl3e = gp = sh_map_domain_page(gl3mfn);
4510 SHADOW_FOREACH_L3E(sl3mfn, sl3e, &gl3e, done, {
4512 s = sh_audit_flags(v, 3, guest_l3e_get_flags(*gl3e),
4513 shadow_l3e_get_flags(*sl3e));
4514 if ( s ) AUDIT_FAIL(3, "%s", s);
4516 if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS )
4518 gfn = guest_l3e_get_gfn(*gl3e);
4519 mfn = shadow_l3e_get_mfn(*sl3e);
4520 gmfn = get_shadow_status(v, audit_gfn_to_mfn(v, gfn, gl3mfn),
4521 ((GUEST_PAGING_LEVELS == 3 ||
4522 is_pv_32on64_vcpu(v))
4523 && !shadow_mode_external(v->domain)
4524 && (guest_index(gl3e) % 4) == 3)
4525 ? SH_type_l2h_shadow
4526 : SH_type_l2_shadow);
4527 if ( mfn_x(gmfn) != mfn_x(mfn) )
4528 AUDIT_FAIL(3, "bad translation: gfn %" SH_PRI_gfn
4529 " --> %" PRI_mfn " != mfn %" PRI_mfn,
4530 gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn));
4532 });
4533 sh_unmap_domain_page(gp);
4534 return 0;
4537 int sh_audit_l4_table(struct vcpu *v, mfn_t sl4mfn, mfn_t x)
4539 guest_l4e_t *gl4e, *gp;
4540 shadow_l4e_t *sl4e;
4541 mfn_t mfn, gmfn, gl4mfn;
4542 gfn_t gfn;
4543 char *s;
4544 int done = 0;
4546 /* Follow the backpointer */
4547 gl4mfn = _mfn(mfn_to_shadow_page(sl4mfn)->backpointer);
4548 gl4e = gp = sh_map_domain_page(gl4mfn);
4549 SHADOW_FOREACH_L4E(sl4mfn, sl4e, &gl4e, done, v->domain,
4551 s = sh_audit_flags(v, 4, guest_l4e_get_flags(*gl4e),
4552 shadow_l4e_get_flags(*sl4e));
4553 if ( s ) AUDIT_FAIL(4, "%s", s);
4555 if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS )
4557 gfn = guest_l4e_get_gfn(*gl4e);
4558 mfn = shadow_l4e_get_mfn(*sl4e);
4559 gmfn = get_shadow_status(v, audit_gfn_to_mfn(v, gfn, gl4mfn),
4560 SH_type_l3_shadow);
4561 if ( mfn_x(gmfn) != mfn_x(mfn) )
4562 AUDIT_FAIL(4, "bad translation: gfn %" SH_PRI_gfn
4563 " --> %" PRI_mfn " != mfn %" PRI_mfn,
4564 gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn));
4566 });
4567 sh_unmap_domain_page(gp);
4568 return 0;
4570 #endif /* GUEST_PAGING_LEVELS >= 4 */
4573 #undef AUDIT_FAIL
4575 #endif /* Audit code */
4577 /**************************************************************************/
4578 /* Entry points into this mode of the shadow code.
4579 * This will all be mangled by the preprocessor to uniquify everything. */
4580 struct paging_mode sh_paging_mode = {
4581 .page_fault = sh_page_fault,
4582 .invlpg = sh_invlpg,
4583 .gva_to_gfn = sh_gva_to_gfn,
4584 .update_cr3 = sh_update_cr3,
4585 .update_paging_modes = shadow_update_paging_modes,
4586 .write_p2m_entry = shadow_write_p2m_entry,
4587 .write_guest_entry = shadow_write_guest_entry,
4588 .cmpxchg_guest_entry = shadow_cmpxchg_guest_entry,
4589 .guest_map_l1e = sh_guest_map_l1e,
4590 .guest_get_eff_l1e = sh_guest_get_eff_l1e,
4591 .guest_levels = GUEST_PAGING_LEVELS,
4592 .shadow.detach_old_tables = sh_detach_old_tables,
4593 .shadow.x86_emulate_write = sh_x86_emulate_write,
4594 .shadow.x86_emulate_cmpxchg = sh_x86_emulate_cmpxchg,
4595 .shadow.x86_emulate_cmpxchg8b = sh_x86_emulate_cmpxchg8b,
4596 .shadow.make_monitor_table = sh_make_monitor_table,
4597 .shadow.destroy_monitor_table = sh_destroy_monitor_table,
4598 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
4599 .shadow.guess_wrmap = sh_guess_wrmap,
4600 #endif
4601 .shadow.shadow_levels = SHADOW_PAGING_LEVELS,
4602 };
4604 /*
4605 * Local variables:
4606 * mode: C
4607 * c-set-style: "BSD"
4608 * c-basic-offset: 4
4609 * indent-tabs-mode: nil
4610 * End:
4611 */