ia64/xen-unstable

view xen/arch/x86/mm/shadow/multi.c @ 15812:86a154e1ef5d

[HVM] Shadow: don't shadow the p2m table.
For HVM vcpus with paging disabled, we used to shadow the p2m table,
and skip the p2m lookup to go from gfn to mfn. Instead, we now
provide a simple pagetable that gives a one-to-one mapping of 4GB, and
shadow that, making the translations from gfn to mfn via the p2m.
This removes the paging-disabled special-case code from the shadow
fault handler, and allows us to expand the p2m interface, since all HVM
translations now go through the same p2m lookups.
Signed-off-by: Tim Deegan <Tim.Deegan@xensource.com>
author Tim Deegan <Tim.Deegan@xensource.com>
date Fri Aug 31 11:06:22 2007 +0100 (2007-08-31)
parents da2c7dab1a3a
children a53aaea4c698
line source
1 /******************************************************************************
2 * arch/x86/mm/shadow/multi.c
3 *
4 * Simple, mostly-synchronous shadow page tables.
5 * Parts of this code are Copyright (c) 2006 by XenSource Inc.
6 * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
7 * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 */
24 #include <xen/config.h>
25 #include <xen/types.h>
26 #include <xen/mm.h>
27 #include <xen/trace.h>
28 #include <xen/sched.h>
29 #include <xen/perfc.h>
30 #include <xen/domain_page.h>
31 #include <asm/page.h>
32 #include <asm/current.h>
33 #include <asm/shadow.h>
34 #include <asm/flushtlb.h>
35 #include <asm/hvm/hvm.h>
36 #include "private.h"
37 #include "types.h"
39 /* THINGS TO DO LATER:
40 *
41 * TEARDOWN HEURISTICS
42 * Also: have a heuristic for when to destroy a previous paging-mode's
43 * shadows. When a guest is done with its start-of-day 32-bit tables
44 * and reuses the memory we want to drop those shadows. Start with
45 * shadows in a page in two modes as a hint, but beware of clever tricks
46 * like reusing a pagetable for both PAE and 64-bit during boot...
47 *
48 * PAE LINEAR MAPS
49 * Rework shadow_get_l*e() to have the option of using map_domain_page()
50 * instead of linear maps. Add appropriate unmap_l*e calls in the users.
51 * Then we can test the speed difference made by linear maps. If the
52 * map_domain_page() version is OK on PAE, we could maybe allow a lightweight
53 * l3-and-l2h-only shadow mode for PAE PV guests that would allow them
54 * to share l2h pages again.
55 *
56 * GUEST_WALK_TABLES TLB FLUSH COALESCE
57 * guest_walk_tables can do up to three remote TLB flushes as it walks to
58 * the first l1 of a new pagetable. Should coalesce the flushes to the end,
59 * and if we do flush, re-do the walk. If anything has changed, then
60 * pause all the other vcpus and do the walk *again*.
61 *
62 * WP DISABLED
63 * Consider how to implement having the WP bit of CR0 set to 0.
64 * Since we need to be able to cause write faults to pagetables, this might
65 * end up looking like not having the (guest) pagetables present at all in
66 * HVM guests...
67 *
68 * PSE disabled / PSE36
69 * We don't support any modes other than PSE enabled, PSE36 disabled.
70 * Neither of those would be hard to change, but we'd need to be able to
71 * deal with shadows made in one mode and used in another.
72 */
74 #define FETCH_TYPE_PREFETCH 1
75 #define FETCH_TYPE_DEMAND 2
76 #define FETCH_TYPE_WRITE 4
77 typedef enum {
78 ft_prefetch = FETCH_TYPE_PREFETCH,
79 ft_demand_read = FETCH_TYPE_DEMAND,
80 ft_demand_write = FETCH_TYPE_DEMAND | FETCH_TYPE_WRITE,
81 } fetch_type_t;
83 #ifdef DEBUG_TRACE_DUMP
84 static char *fetch_type_names[] = {
85 [ft_prefetch] "prefetch",
86 [ft_demand_read] "demand read",
87 [ft_demand_write] "demand write",
88 };
89 #endif
91 /**************************************************************************/
92 /* Hash table mapping from guest pagetables to shadows
93 *
94 * Normal case: maps the mfn of a guest page to the mfn of its shadow page.
95 * FL1's: maps the *gfn* of the start of a superpage to the mfn of a
96 * shadow L1 which maps its "splinters".
97 */
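/* Editorial sketch, not part of the original source: a minimal
 * lookup-or-create pattern using these helpers, assuming 'gl1mfn' is the
 * mfn of a guest l1 page:
 *
 *     mfn_t smfn = get_shadow_status(v, gl1mfn, SH_type_l1_shadow);
 *     if ( !mfn_valid(smfn) )
 *         smfn = sh_make_shadow(v, gl1mfn, SH_type_l1_shadow);
 *
 * sh_make_shadow(), defined later in this file, registers new shadows via
 * set_shadow_status(); delete_shadow_status() is the matching call when a
 * shadow is destroyed. */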
99 static inline mfn_t
100 get_fl1_shadow_status(struct vcpu *v, gfn_t gfn)
101 /* Look for FL1 shadows in the hash table */
102 {
103 mfn_t smfn = shadow_hash_lookup(v, gfn_x(gfn), SH_type_fl1_shadow);
104 return smfn;
105 }
107 static inline mfn_t
108 get_shadow_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type)
109 /* Look for shadows in the hash table */
110 {
111 mfn_t smfn = shadow_hash_lookup(v, mfn_x(gmfn), shadow_type);
112 perfc_incr(shadow_get_shadow_status);
113 return smfn;
114 }
116 static inline void
117 set_fl1_shadow_status(struct vcpu *v, gfn_t gfn, mfn_t smfn)
118 /* Put an FL1 shadow into the hash table */
119 {
120 SHADOW_PRINTK("gfn=%"SH_PRI_gfn", type=%08x, smfn=%05lx\n",
121 gfn_x(gfn), SH_type_fl1_shadow, mfn_x(smfn));
123 shadow_hash_insert(v, gfn_x(gfn), SH_type_fl1_shadow, smfn);
124 }
126 static inline void
127 set_shadow_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type, mfn_t smfn)
128 /* Put a shadow into the hash table */
129 {
130 struct domain *d = v->domain;
131 int res;
133 SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx, type=%08x, smfn=%05lx\n",
134 d->domain_id, v->vcpu_id, mfn_x(gmfn),
135 shadow_type, mfn_x(smfn));
137 /* 32-on-64 PV guests don't own their l4 pages so can't get_page them */
138 if ( !is_pv_32on64_vcpu(v) || shadow_type != SH_type_l4_64_shadow )
139 {
140 res = get_page(mfn_to_page(gmfn), d);
141 ASSERT(res == 1);
142 }
144 shadow_hash_insert(v, mfn_x(gmfn), shadow_type, smfn);
145 }
147 static inline void
148 delete_fl1_shadow_status(struct vcpu *v, gfn_t gfn, mfn_t smfn)
149 /* Remove a shadow from the hash table */
150 {
151 SHADOW_PRINTK("gfn=%"SH_PRI_gfn", type=%08x, smfn=%05lx\n",
152 gfn_x(gfn), SH_type_fl1_shadow, mfn_x(smfn));
153 shadow_hash_delete(v, gfn_x(gfn), SH_type_fl1_shadow, smfn);
154 }
156 static inline void
157 delete_shadow_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type, mfn_t smfn)
158 /* Remove a shadow from the hash table */
159 {
160 SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx, type=%08x, smfn=%05lx\n",
161 v->domain->domain_id, v->vcpu_id,
162 mfn_x(gmfn), shadow_type, mfn_x(smfn));
163 shadow_hash_delete(v, mfn_x(gmfn), shadow_type, smfn);
164 /* 32-on-64 PV guests don't own their l4 pages; see set_shadow_status */
165 if ( !is_pv_32on64_vcpu(v) || shadow_type != SH_type_l4_64_shadow )
166 put_page(mfn_to_page(gmfn));
167 }
169 /**************************************************************************/
170 /* CPU feature support querying */
172 static inline int
173 guest_supports_superpages(struct vcpu *v)
174 {
175 /* The _PAGE_PSE bit must be honoured in HVM guests, whenever
176 * CR4.PSE is set or the guest is in PAE or long mode.
177 * It's also used in the dummy PT for vcpus with CR4.PG cleared. */
178 return (is_hvm_vcpu(v) &&
179 (GUEST_PAGING_LEVELS != 2
180 || !hvm_paging_enabled(v)
181 || (v->arch.hvm_vcpu.guest_cr[4] & X86_CR4_PSE)));
182 }
184 static inline int
185 guest_supports_nx(struct vcpu *v)
186 {
187 if ( GUEST_PAGING_LEVELS == 2 || !cpu_has_nx )
188 return 0;
189 if ( !is_hvm_vcpu(v) )
190 return 1;
191 return hvm_nx_enabled(v);
192 }
195 /**************************************************************************/
196 /* Functions for walking the guest page tables */
199 /* Walk the guest pagetables, filling the walk_t with what we see.
200 * Takes an uninitialised walk_t. The caller must call unmap_walk()
201 * on the walk_t before discarding it or calling guest_walk_tables again.
202 * If "guest_op" is non-zero, we are serving a genuine guest memory access,
203 * and must (a) be under the shadow lock, and (b) remove write access
204 * from any guest PT pages we see, as we will be using their contents to
205 * perform shadow updates.
206 * Returns 0 for success or non-zero if the guest pagetables are malformed.
207 * N.B. Finding a not-present entry does not cause a non-zero return code. */
208 static inline int
209 guest_walk_tables(struct vcpu *v, unsigned long va, walk_t *gw, int guest_op)
210 {
211 struct domain *d = v->domain;
212 ASSERT(!guest_op || shadow_locked_by_me(d));
214 perfc_incr(shadow_guest_walk);
215 memset(gw, 0, sizeof(*gw));
216 gw->va = va;
218 #if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
219 #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
220 /* Get l4e from the top level table */
221 gw->l4mfn = pagetable_get_mfn(v->arch.guest_table);
222 gw->l4e = (guest_l4e_t *)v->arch.paging.shadow.guest_vtable
223 + guest_l4_table_offset(va);
224 /* Walk down to the l3e */
225 if ( !(guest_l4e_get_flags(*gw->l4e) & _PAGE_PRESENT) ) return 0;
226 gw->l3mfn = gfn_to_mfn(d, guest_l4e_get_gfn(*gw->l4e));
227 if ( !mfn_valid(gw->l3mfn) ) return 1;
228 /* This mfn is a pagetable: make sure the guest can't write to it. */
229 if ( guest_op && sh_remove_write_access(v, gw->l3mfn, 3, va) != 0 )
230 flush_tlb_mask(d->domain_dirty_cpumask);
231 gw->l3e = ((guest_l3e_t *)sh_map_domain_page(gw->l3mfn))
232 + guest_l3_table_offset(va);
233 #else /* PAE only... */
234 /* Get l3e from the cache of the guest's top level table */
235 gw->l3e = (guest_l3e_t *)&v->arch.paging.shadow.gl3e[guest_l3_table_offset(va)];
236 #endif /* PAE or 64... */
237 /* Walk down to the l2e */
238 if ( !(guest_l3e_get_flags(*gw->l3e) & _PAGE_PRESENT) ) return 0;
239 gw->l2mfn = gfn_to_mfn(d, guest_l3e_get_gfn(*gw->l3e));
240 if ( !mfn_valid(gw->l2mfn) ) return 1;
241 /* This mfn is a pagetable: make sure the guest can't write to it. */
242 if ( guest_op && sh_remove_write_access(v, gw->l2mfn, 2, va) != 0 )
243 flush_tlb_mask(d->domain_dirty_cpumask);
244 gw->l2e = ((guest_l2e_t *)sh_map_domain_page(gw->l2mfn))
245 + guest_l2_table_offset(va);
246 #else /* 32-bit only... */
247 /* Get l2e from the top level table */
248 gw->l2mfn = pagetable_get_mfn(v->arch.guest_table);
249 gw->l2e = (guest_l2e_t *)v->arch.paging.shadow.guest_vtable
250 + guest_l2_table_offset(va);
251 #endif /* All levels... */
253 if ( !(guest_l2e_get_flags(*gw->l2e) & _PAGE_PRESENT) ) return 0;
254 if ( guest_supports_superpages(v) &&
255 (guest_l2e_get_flags(*gw->l2e) & _PAGE_PSE) )
256 {
257 /* Special case: this guest VA is in a PSE superpage, so there's
258 * no guest l1e. We make one up so that the propagation code
259 * can generate a shadow l1 table. Start with the gfn of the
260 * first 4k-page of the superpage. */
261 gfn_t start = guest_l2e_get_gfn(*gw->l2e);
262 /* Grant full access in the l1e, since all the guest entry's
263 * access controls are enforced in the shadow l2e. This lets
264 * us reflect l2 changes later without touching the l1s. */
265 int flags = (_PAGE_PRESENT|_PAGE_USER|_PAGE_RW|
266 _PAGE_ACCESSED|_PAGE_DIRTY);
267 /* PSE level 2 entries use bit 12 for PAT; propagate it to bit 7
268 * of the level 1 */
269 if ( (guest_l2e_get_flags(*gw->l2e) & _PAGE_PSE_PAT) )
270 flags |= _PAGE_PAT;
271 /* Increment the pfn by the right number of 4k pages.
272 * The ~0x1 is to mask out the PAT bit mentioned above. */
273 start = _gfn((gfn_x(start) & ~0x1) + guest_l1_table_offset(va));
274 gw->eff_l1e = guest_l1e_from_gfn(start, flags);
275 gw->l1e = NULL;
276 gw->l1mfn = _mfn(INVALID_MFN);
277 }
278 else
279 {
280 /* Not a superpage: carry on and find the l1e. */
281 gw->l1mfn = gfn_to_mfn(d, guest_l2e_get_gfn(*gw->l2e));
282 if ( !mfn_valid(gw->l1mfn) ) return 1;
283 /* This mfn is a pagetable: make sure the guest can't write to it. */
284 if ( guest_op
285 && sh_remove_write_access(v, gw->l1mfn, 1, va) != 0 )
286 flush_tlb_mask(d->domain_dirty_cpumask);
287 gw->l1e = ((guest_l1e_t *)sh_map_domain_page(gw->l1mfn))
288 + guest_l1_table_offset(va);
289 gw->eff_l1e = *gw->l1e;
290 }
292 return 0;
293 }
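/* Editorial sketch, not in the original source: the intended calling
 * pattern, roughly as sh_guest_get_eff_l1e() below uses it.  The walk must
 * always be disposed of with unmap_walk():
 *
 *     walk_t gw;
 *     shadow_lock(v->domain);
 *     if ( guest_walk_tables(v, va, &gw, 1) == 0 )
 *     {
 *         gfn_t gfn = guest_walk_to_gfn(&gw); // INVALID_GFN if not present
 *         // ... use the walk ...
 *     }
 *     unmap_walk(v, &gw);
 *     shadow_unlock(v->domain);
 */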
295 /* Given a walk_t, translate the gw->va into the guest's notion of the
296 * corresponding frame number. */
297 static inline gfn_t
298 guest_walk_to_gfn(walk_t *gw)
299 {
300 if ( !(guest_l1e_get_flags(gw->eff_l1e) & _PAGE_PRESENT) )
301 return _gfn(INVALID_GFN);
302 return guest_l1e_get_gfn(gw->eff_l1e);
303 }
305 /* Given a walk_t, translate the gw->va into the guest's notion of the
306 * corresponding physical address. */
307 static inline paddr_t
308 guest_walk_to_gpa(walk_t *gw)
309 {
310 if ( !(guest_l1e_get_flags(gw->eff_l1e) & _PAGE_PRESENT) )
311 return 0;
312 return guest_l1e_get_paddr(gw->eff_l1e) + (gw->va & ~PAGE_MASK);
313 }
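/* Editorial example: with a present eff_l1e mapping gfn 0x70000 and
 * gw->va == 0xb8001234, guest_walk_to_gpa() returns
 * (0x70000 << PAGE_SHIFT) + (0xb8001234 & ~PAGE_MASK) = 0x70000234. */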
316 /* Unmap (and reinitialise) a guest walk.
317 * Call this to dispose of any walk filled in by guest_walk_tables() */
318 static void unmap_walk(struct vcpu *v, walk_t *gw)
319 {
320 #if GUEST_PAGING_LEVELS >= 3
321 #if GUEST_PAGING_LEVELS >= 4
322 if ( gw->l3e != NULL ) sh_unmap_domain_page(gw->l3e);
323 #endif
324 if ( gw->l2e != NULL ) sh_unmap_domain_page(gw->l2e);
325 #endif
326 if ( gw->l1e != NULL ) sh_unmap_domain_page(gw->l1e);
327 #ifdef DEBUG
328 memset(gw, 0, sizeof(*gw));
329 #endif
330 }
333 /* Pretty-print the contents of a guest-walk */
334 static inline void print_gw(walk_t *gw)
335 {
336 SHADOW_PRINTK("GUEST WALK TO %#lx:\n", gw->va);
337 #if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
338 #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
339 SHADOW_PRINTK(" l4mfn=%" PRI_mfn "\n", mfn_x(gw->l4mfn));
340 SHADOW_PRINTK(" l4e=%p\n", gw->l4e);
341 if ( gw->l4e )
342 SHADOW_PRINTK(" *l4e=%" SH_PRI_gpte "\n", gw->l4e->l4);
343 SHADOW_PRINTK(" l3mfn=%" PRI_mfn "\n", mfn_x(gw->l3mfn));
344 #endif /* PAE or 64... */
345 SHADOW_PRINTK(" l3e=%p\n", gw->l3e);
346 if ( gw->l3e )
347 SHADOW_PRINTK(" *l3e=%" SH_PRI_gpte "\n", gw->l3e->l3);
348 #endif /* All levels... */
349 SHADOW_PRINTK(" l2mfn=%" PRI_mfn "\n", mfn_x(gw->l2mfn));
350 SHADOW_PRINTK(" l2e=%p\n", gw->l2e);
351 if ( gw->l2e )
352 SHADOW_PRINTK(" *l2e=%" SH_PRI_gpte "\n", gw->l2e->l2);
353 SHADOW_PRINTK(" l1mfn=%" PRI_mfn "\n", mfn_x(gw->l1mfn));
354 SHADOW_PRINTK(" l1e=%p\n", gw->l1e);
355 if ( gw->l1e )
356 SHADOW_PRINTK(" *l1e=%" SH_PRI_gpte "\n", gw->l1e->l1);
357 SHADOW_PRINTK(" eff_l1e=%" SH_PRI_gpte "\n", gw->eff_l1e.l1);
358 }
361 #if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES
362 /* Lightweight audit: pass all the shadows associated with this guest walk
363 * through the audit mechanisms */
364 static void sh_audit_gw(struct vcpu *v, walk_t *gw)
365 {
366 mfn_t smfn;
368 if ( !(SHADOW_AUDIT_ENABLE) )
369 return;
371 #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
372 if ( mfn_valid(gw->l4mfn)
373 && mfn_valid((smfn = get_shadow_status(v, gw->l4mfn,
374 SH_type_l4_shadow))) )
375 (void) sh_audit_l4_table(v, smfn, _mfn(INVALID_MFN));
376 if ( mfn_valid(gw->l3mfn)
377 && mfn_valid((smfn = get_shadow_status(v, gw->l3mfn,
378 SH_type_l3_shadow))) )
379 (void) sh_audit_l3_table(v, smfn, _mfn(INVALID_MFN));
380 #endif /* PAE or 64... */
381 if ( mfn_valid(gw->l2mfn) )
382 {
383 if ( mfn_valid((smfn = get_shadow_status(v, gw->l2mfn,
384 SH_type_l2_shadow))) )
385 (void) sh_audit_l2_table(v, smfn, _mfn(INVALID_MFN));
386 #if GUEST_PAGING_LEVELS == 3
387 if ( mfn_valid((smfn = get_shadow_status(v, gw->l2mfn,
388 SH_type_l2h_shadow))) )
389 (void) sh_audit_l2_table(v, smfn, _mfn(INVALID_MFN));
390 #endif
391 }
392 if ( mfn_valid(gw->l1mfn)
393 && mfn_valid((smfn = get_shadow_status(v, gw->l1mfn,
394 SH_type_l1_shadow))) )
395 (void) sh_audit_l1_table(v, smfn, _mfn(INVALID_MFN));
396 else if ( gw->l2e
397 && (guest_l2e_get_flags(*gw->l2e) & _PAGE_PSE)
398 && mfn_valid(
399 (smfn = get_fl1_shadow_status(v, guest_l2e_get_gfn(*gw->l2e)))) )
400 (void) sh_audit_fl1_table(v, smfn, _mfn(INVALID_MFN));
401 }
403 #else
404 #define sh_audit_gw(_v, _gw) do {} while(0)
405 #endif /* audit code */
409 /**************************************************************************/
410 /* Function to write to the guest tables, for propagating accessed and
411 * dirty bits from the shadow to the guest.
412 * Takes a guest mfn, a pointer to the guest entry, the level of pagetable,
413 * and an operation type. The guest entry is always passed as an l1e:
414 * since we only ever write flags, that's OK.
415 * Returns the new flag bits of the guest entry. */
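/* Editorial note: callers at higher levels simply cast the entry pointer,
 * e.g. (hypothetically) guest_set_ad_bits(v, gl2mfn, (guest_l1e_t *)gl2e,
 * 2, ft).  This is safe because the accessed/dirty flags sit in the same
 * low bits at every level handled here, and only flags are ever written
 * back. */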
417 static u32 guest_set_ad_bits(struct vcpu *v,
418 mfn_t gmfn,
419 guest_l1e_t *ep,
420 unsigned int level,
421 fetch_type_t ft)
422 {
423 u32 flags;
424 int res = 0;
426 ASSERT(ep && !(((unsigned long)ep) & ((sizeof *ep) - 1)));
427 ASSERT(level <= GUEST_PAGING_LEVELS);
428 ASSERT(shadow_locked_by_me(v->domain));
430 flags = guest_l1e_get_flags(*ep);
432 /* Only set A and D bits for guest-initiated accesses */
433 if ( !(ft & FETCH_TYPE_DEMAND) )
434 return flags;
436 ASSERT(mfn_valid(gmfn)
437 && (sh_mfn_is_a_page_table(gmfn)
438 || ((mfn_to_page(gmfn)->u.inuse.type_info & PGT_count_mask)
439 == 0)));
441 /* PAE l3s do not have A and D bits */
442 ASSERT(GUEST_PAGING_LEVELS > 3 || level != 3);
444 /* Need the D bit as well for writes, in L1es and PSE L2es. */
445 if ( ft == ft_demand_write
446 && (level == 1 ||
447 (level == 2 && (flags & _PAGE_PSE) && guest_supports_superpages(v))) )
448 {
449 if ( (flags & (_PAGE_DIRTY | _PAGE_ACCESSED))
450 == (_PAGE_DIRTY | _PAGE_ACCESSED) )
451 return flags; /* Guest already has A and D bits set */
452 flags |= _PAGE_DIRTY | _PAGE_ACCESSED;
453 perfc_incr(shadow_ad_update);
454 }
455 else
456 {
457 if ( flags & _PAGE_ACCESSED )
458 return flags; /* Guest already has A bit set */
459 flags |= _PAGE_ACCESSED;
460 perfc_incr(shadow_a_update);
461 }
463 /* Set the bit(s) */
464 paging_mark_dirty(v->domain, mfn_x(gmfn));
465 SHADOW_DEBUG(A_AND_D, "gfn = %" SH_PRI_gfn ", "
466 "old flags = %#x, new flags = %#x\n",
467 gfn_x(guest_l1e_get_gfn(*ep)), guest_l1e_get_flags(*ep),
468 flags);
469 *ep = guest_l1e_from_gfn(guest_l1e_get_gfn(*ep), flags);
471 /* Propagate this change to any other shadows of the page
472 * (only necessary if there is more than one shadow) */
473 if ( mfn_to_page(gmfn)->count_info & PGC_page_table )
474 {
475 u32 shflags = mfn_to_page(gmfn)->shadow_flags & SHF_page_type_mask;
476 /* More than one type bit set in shadow-flags? */
477 if ( shflags & ~(1UL << find_first_set_bit(shflags)) )
478 res = sh_validate_guest_entry(v, gmfn, ep, sizeof (*ep));
479 }
481 /* We should never need to flush the TLB or recopy PAE entries */
482 ASSERT((res == 0) || (res == SHADOW_SET_CHANGED));
484 return flags;
485 }
487 #if (CONFIG_PAGING_LEVELS == GUEST_PAGING_LEVELS) && (CONFIG_PAGING_LEVELS == SHADOW_PAGING_LEVELS)
488 void *
489 sh_guest_map_l1e(struct vcpu *v, unsigned long addr,
490 unsigned long *gl1mfn)
491 {
492 void *pl1e = NULL;
493 walk_t gw;
495 ASSERT(shadow_mode_translate(v->domain));
497 // XXX -- this is expensive, but it's easy to cobble together...
498 // FIXME!
500 shadow_lock(v->domain);
501 guest_walk_tables(v, addr, &gw, 1);
503 if ( gw.l2e &&
504 (guest_l2e_get_flags(*gw.l2e) & _PAGE_PRESENT) &&
505 !(guest_supports_superpages(v) && (guest_l2e_get_flags(*gw.l2e) & _PAGE_PSE)) )
506 {
507 if ( gl1mfn )
508 *gl1mfn = mfn_x(gw.l1mfn);
509 pl1e = map_domain_page(mfn_x(gw.l1mfn)) +
510 (guest_l1_table_offset(addr) * sizeof(guest_l1e_t));
511 }
513 unmap_walk(v, &gw);
514 shadow_unlock(v->domain);
516 return pl1e;
517 }
519 void
520 sh_guest_get_eff_l1e(struct vcpu *v, unsigned long addr, void *eff_l1e)
521 {
522 walk_t gw;
524 ASSERT(shadow_mode_translate(v->domain));
526 // XXX -- this is expensive, but it's easy to cobble together...
527 // FIXME!
529 shadow_lock(v->domain);
530 guest_walk_tables(v, addr, &gw, 1);
531 *(guest_l1e_t *)eff_l1e = gw.eff_l1e;
532 unmap_walk(v, &gw);
533 shadow_unlock(v->domain);
534 }
535 #endif /* CONFIG==SHADOW==GUEST */
537 /**************************************************************************/
538 /* Functions to compute the correct index into a shadow page, given an
539 * index into the guest page (as returned by guest_get_index()).
540 * This is trivial when the shadow and guest use the same sized PTEs, but
541 * gets more interesting when those sizes are mismatched (e.g. 32-bit guest,
542 * PAE- or 64-bit shadows).
543 *
544 * These functions also increment the shadow mfn, when necessary. When PTE
545 * sizes are mismatched, it takes 2 shadow L1 pages for a single guest L1
546 * page. In this case, we allocate 2 contiguous pages for the shadow L1, and
547 * use simple pointer arithmetic on a pointer to the guest L1e to figure out
548 * which shadow page we really want. Similarly, when PTE sizes are
549 * mismatched, we shadow a guest L2 page with 4 shadow L2 pages. (The easiest
550 * way to see this is: a 32-bit guest L2 page maps 4GB of virtual address
551 * space, while a PAE- or 64-bit shadow L2 page maps 1GB of virtual address
552 * space.)
553 *
554 * For PAE guests, for every 32-bytes of guest L3 page table, we use 64-bytes
555 * of shadow (to store both the shadow, and the info that would normally be
556 * stored in page_info fields). This arrangement allows the shadow and the
557 * "page_info" fields to always be stored in the same page (in fact, in
558 * the same cache line), avoiding an extra call to map_domain_page().
559 */
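/* Editorial worked example, not in the original source: in the 2-on-3 case
 * a 32-bit guest L1 (1024 4-byte entries) is shadowed by two PAE pages of
 * 512 8-byte entries each.  For guest index 700:
 *     shadow_l1_index: smfn += 700 / 512 = 1;  returns 700 % 512 = 188
 *     shadow_l2_index: smfn += 700 / 256 = 2;  returns (700 % 256) * 2 = 376
 * i.e. the entry lands in the second shadow L1 page, and in the third
 * shadow L2 page as the first of its pair of entries. */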
561 static inline u32
562 guest_index(void *ptr)
563 {
564 return (u32)((unsigned long)ptr & ~PAGE_MASK) / sizeof(guest_l1e_t);
565 }
567 static u32
568 shadow_l1_index(mfn_t *smfn, u32 guest_index)
569 {
570 #if (GUEST_PAGING_LEVELS == 2) && (SHADOW_PAGING_LEVELS != 2)
571 *smfn = _mfn(mfn_x(*smfn) +
572 (guest_index / SHADOW_L1_PAGETABLE_ENTRIES));
573 return (guest_index % SHADOW_L1_PAGETABLE_ENTRIES);
574 #else
575 return guest_index;
576 #endif
577 }
579 static u32
580 shadow_l2_index(mfn_t *smfn, u32 guest_index)
581 {
582 #if (GUEST_PAGING_LEVELS == 2) && (SHADOW_PAGING_LEVELS != 2)
583 // Because we use 2 shadow l2 entries for each guest entry, the number of
584 // guest entries per shadow page is SHADOW_L2_PAGETABLE_ENTRIES/2
585 //
586 *smfn = _mfn(mfn_x(*smfn) +
587 (guest_index / (SHADOW_L2_PAGETABLE_ENTRIES / 2)));
589 // We multiply by two to get the index of the first of the two entries
590 // used to shadow the specified guest entry.
591 return (guest_index % (SHADOW_L2_PAGETABLE_ENTRIES / 2)) * 2;
592 #else
593 return guest_index;
594 #endif
595 }
597 #if GUEST_PAGING_LEVELS >= 4
599 static u32
600 shadow_l3_index(mfn_t *smfn, u32 guest_index)
601 {
602 return guest_index;
603 }
605 static u32
606 shadow_l4_index(mfn_t *smfn, u32 guest_index)
607 {
608 return guest_index;
609 }
611 #endif // GUEST_PAGING_LEVELS >= 4
614 /**************************************************************************/
615 /* Function which computes shadow entries from their corresponding guest
616 * entries. This is the "heart" of the shadow code. It operates using
617 * level-1 shadow types, but handles all levels of entry.
618 * Don't call it directly, but use the four wrappers below.
619 */
621 static always_inline void
622 _sh_propagate(struct vcpu *v,
623 void *guest_entry_ptr,
624 mfn_t guest_table_mfn,
625 mfn_t target_mfn,
626 void *shadow_entry_ptr,
627 int level,
628 fetch_type_t ft,
629 int mmio)
630 {
631 guest_l1e_t *gp = guest_entry_ptr;
632 shadow_l1e_t *sp = shadow_entry_ptr;
633 struct domain *d = v->domain;
634 u32 pass_thru_flags;
635 u32 gflags, sflags;
637 /* We don't shadow PAE l3s */
638 ASSERT(GUEST_PAGING_LEVELS > 3 || level != 3);
640 if ( mfn_valid(guest_table_mfn) )
641 /* Handle A and D bit propagation into the guest */
642 gflags = guest_set_ad_bits(v, guest_table_mfn, gp, level, ft);
643 else
644 {
645 /* Must be an fl1e or a prefetch */
646 ASSERT(level==1 || !(ft & FETCH_TYPE_DEMAND));
647 gflags = guest_l1e_get_flags(*gp);
648 }
650 if ( unlikely(!(gflags & _PAGE_PRESENT)) )
651 {
652 /* If a guest l1 entry is not present, shadow with the magic
653 * guest-not-present entry. */
654 if ( level == 1 )
655 *sp = sh_l1e_gnp();
656 else
657 *sp = shadow_l1e_empty();
658 goto done;
659 }
661 if ( level == 1 && mmio )
662 {
663 /* Guest l1e maps MMIO space */
664 *sp = sh_l1e_mmio(guest_l1e_get_gfn(*gp), gflags);
665 if ( !d->arch.paging.shadow.has_fast_mmio_entries )
666 d->arch.paging.shadow.has_fast_mmio_entries = 1;
667 goto done;
668 }
670 // Must have a valid target_mfn, unless this is a prefetch. In the
671 // case of a prefetch, an invalid mfn means that we can not usefully
672 // shadow anything, and so we return early.
673 //
674 if ( !mfn_valid(target_mfn) )
675 {
676 ASSERT((ft == ft_prefetch));
677 *sp = shadow_l1e_empty();
678 goto done;
679 }
681 // Propagate bits from the guest to the shadow.
682 // Some of these may be overwritten, below.
683 // Since we know the guest's PRESENT bit is set, we also set the shadow's
684 // SHADOW_PRESENT bit.
685 //
686 pass_thru_flags = (_PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_USER |
687 _PAGE_RW | _PAGE_PRESENT);
688 if ( guest_supports_nx(v) )
689 pass_thru_flags |= _PAGE_NX_BIT;
690 sflags = gflags & pass_thru_flags;
692 // Set the A&D bits for higher level shadows.
693 // Higher level entries do not, strictly speaking, have dirty bits, but
694 // since we use shadow linear tables, each of these entries may, at some
695 // point in time, also serve as a shadow L1 entry.
696 // By setting both the A&D bits in each of these, we eliminate the burden
697 // on the hardware to update these bits on initial accesses.
698 //
699 if ( (level > 1) && !((SHADOW_PAGING_LEVELS == 3) && (level == 3)) )
700 sflags |= _PAGE_ACCESSED | _PAGE_DIRTY;
702 // If the A or D bit has not yet been set in the guest, then we must
703 // prevent the corresponding kind of access.
704 //
705 if ( unlikely(!(gflags & _PAGE_ACCESSED)) )
706 sflags &= ~_PAGE_PRESENT;
708 /* D bits exist in L1es and PSE L2es */
709 if ( unlikely(((level == 1) ||
710 ((level == 2) &&
711 (gflags & _PAGE_PSE) &&
712 guest_supports_superpages(v)))
713 && !(gflags & _PAGE_DIRTY)) )
714 sflags &= ~_PAGE_RW;
716 // shadow_mode_log_dirty support
717 //
718 // Only allow the guest write access to a page a) on a demand fault,
719 // or b) if the page is already marked as dirty.
720 //
721 if ( unlikely((level == 1) && shadow_mode_log_dirty(d)) )
722 {
723 if ( ft & FETCH_TYPE_WRITE )
724 paging_mark_dirty(d, mfn_x(target_mfn));
725 else if ( !sh_mfn_is_dirty(d, target_mfn) )
726 sflags &= ~_PAGE_RW;
727 }
729 // protect guest page tables
730 //
731 if ( unlikely((level == 1) && sh_mfn_is_a_page_table(target_mfn)) )
732 {
733 if ( shadow_mode_trap_reads(d) )
734 {
735 // if we are trapping both reads & writes, then mark this page
736 // as not present...
737 //
738 sflags &= ~_PAGE_PRESENT;
739 }
740 else
741 {
742 // otherwise, just prevent any writes...
743 //
744 sflags &= ~_PAGE_RW;
745 }
746 }
748 // PV guests in 64-bit mode use two different page tables for user vs
749 // supervisor permissions, making the guest's _PAGE_USER bit irrelevant.
750 // It is always shadowed as present...
751 if ( (GUEST_PAGING_LEVELS == 4) && !is_pv_32on64_domain(d)
752 && !is_hvm_domain(d) )
753 {
754 sflags |= _PAGE_USER;
755 }
757 *sp = shadow_l1e_from_mfn(target_mfn, sflags);
758 done:
759 SHADOW_DEBUG(PROPAGATE,
760 "%s level %u guest %" SH_PRI_gpte " shadow %" SH_PRI_pte "\n",
761 fetch_type_names[ft], level, gp->l1, sp->l1);
762 }
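/* Editorial trace of the logic above, not in the original source: on a
 * demand read of a guest l1e whose flags are PRESENT|RW|USER|ACCESSED but
 * not DIRTY, the pass-through step copies those flags into sflags, then the
 * "D bits exist in L1es and PSE L2es" check clears _PAGE_RW, so the shadow
 * entry is installed read-only.  The guest's first write faults; that
 * ft_demand_write propagation sets the guest D bit and re-grants write
 * access. */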
765 /* These four wrappers give us a little bit of type-safety back around the
766 * use of void-* pointers in _sh_propagate(), and allow the compiler to
767 * optimize out some level checks. */
769 #if GUEST_PAGING_LEVELS >= 4
770 static void
771 l4e_propagate_from_guest(struct vcpu *v,
772 guest_l4e_t *gl4e,
773 mfn_t gl4mfn,
774 mfn_t sl3mfn,
775 shadow_l4e_t *sl4e,
776 fetch_type_t ft)
777 {
778 _sh_propagate(v, gl4e, gl4mfn, sl3mfn, sl4e, 4, ft, 0);
779 }
781 static void
782 l3e_propagate_from_guest(struct vcpu *v,
783 guest_l3e_t *gl3e,
784 mfn_t gl3mfn,
785 mfn_t sl2mfn,
786 shadow_l3e_t *sl3e,
787 fetch_type_t ft)
788 {
789 _sh_propagate(v, gl3e, gl3mfn, sl2mfn, sl3e, 3, ft, 0);
790 }
791 #endif // GUEST_PAGING_LEVELS >= 4
793 static void
794 l2e_propagate_from_guest(struct vcpu *v,
795 guest_l2e_t *gl2e,
796 mfn_t gl2mfn,
797 mfn_t sl1mfn,
798 shadow_l2e_t *sl2e,
799 fetch_type_t ft)
800 {
801 _sh_propagate(v, gl2e, gl2mfn, sl1mfn, sl2e, 2, ft, 0);
802 }
804 static void
805 l1e_propagate_from_guest(struct vcpu *v,
806 guest_l1e_t *gl1e,
807 mfn_t gl1mfn,
808 mfn_t gmfn,
809 shadow_l1e_t *sl1e,
810 fetch_type_t ft,
811 int mmio)
812 {
813 _sh_propagate(v, gl1e, gl1mfn, gmfn, sl1e, 1, ft, mmio);
814 }
817 /**************************************************************************/
818 /* These functions update shadow entries (and do bookkeeping on the shadow
819 * tables they are in). It is intended that they are the only
820 * functions which ever write (non-zero) data onto a shadow page.
821 */
823 static inline void safe_write_entry(void *dst, void *src)
824 /* Copy one PTE safely when processors might be running on the
825 * destination pagetable. This does *not* give safety against
826 * concurrent writes (that's what the shadow lock is for), just
827 * stops the hardware picking up partially written entries. */
828 {
829 volatile unsigned long *d = dst;
830 unsigned long *s = src;
831 ASSERT(!((unsigned long) d & (sizeof (shadow_l1e_t) - 1)));
832 #if CONFIG_PAGING_LEVELS == 3
833 /* In PAE mode, pagetable entries are larger
834 * than machine words, so won't get written atomically. We need to make
835 * sure any other cpu running on these shadows doesn't see a
836 * half-written entry. Do this by marking the entry not-present first,
837 * then writing the high word before the low word. */
838 BUILD_BUG_ON(sizeof (shadow_l1e_t) != 2 * sizeof (unsigned long));
839 d[0] = 0;
840 d[1] = s[1];
841 d[0] = s[0];
842 #else
843 /* In 32-bit and 64-bit, sizeof(pte) == sizeof(ulong) == 1 word,
844 * which will be an atomic write, since the entry is aligned. */
845 BUILD_BUG_ON(sizeof (shadow_l1e_t) != sizeof (unsigned long));
846 *d = *s;
847 #endif
848 }
851 static inline void
852 shadow_write_entries(void *d, void *s, int entries, mfn_t mfn)
853 /* This function does the actual writes to shadow pages.
854 * It must not be called directly, since it doesn't do the bookkeeping
855 * that shadow_set_l*e() functions do. */
856 {
857 shadow_l1e_t *dst = d;
858 shadow_l1e_t *src = s;
859 void *map = NULL;
860 int i;
862 /* Because we mirror access rights at all levels in the shadow, an
863 * l2 (or higher) entry with the RW bit cleared will leave us with
864 * no write access through the linear map.
865 * We detect that by writing to the shadow with copy_to_user() and
866 * using map_domain_page() to get a writeable mapping if we need to. */
867 if ( __copy_to_user(d, d, sizeof (unsigned long)) != 0 )
868 {
869 perfc_incr(shadow_linear_map_failed);
870 map = sh_map_domain_page(mfn);
871 ASSERT(map != NULL);
872 dst = map + ((unsigned long)dst & (PAGE_SIZE - 1));
873 }
876 for ( i = 0; i < entries; i++ )
877 safe_write_entry(dst++, src++);
879 if ( map != NULL ) sh_unmap_domain_page(map);
880 }
882 static inline int
883 perms_strictly_increased(u32 old_flags, u32 new_flags)
884 /* Given the flags of two entries, are the new flags a strict
885 * increase in rights over the old ones? */
886 {
887 u32 of = old_flags & (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX);
888 u32 nf = new_flags & (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX);
889 /* Flip the NX bit, since it's the only one that decreases rights;
890 * we calculate as if it were an "X" bit. */
891 of ^= _PAGE_NX_BIT;
892 nf ^= _PAGE_NX_BIT;
893 /* If the changed bits are all set in the new flags, then rights strictly
894 * increased between old and new. */
895 return ((of | (of ^ nf)) == nf);
896 }
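/* Editorial worked example: old = PRESENT|RW, new = PRESENT|RW|USER, NX
 * clear in both.  After the NX flip, of = P|RW|X and nf = P|RW|USER|X, so
 * of ^ nf = USER and (of | USER) == nf: adding USER is a strict increase.
 * By contrast, old = PRESENT|RW, new = PRESENT|USER gives of | (of ^ nf)
 * containing RW while nf does not, so the function returns false. */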
898 static int inline
899 shadow_get_page_from_l1e(shadow_l1e_t sl1e, struct domain *d)
900 {
901 int res;
902 mfn_t mfn;
903 struct domain *owner;
905 ASSERT(!sh_l1e_is_magic(sl1e));
907 if ( !shadow_mode_refcounts(d) )
908 return 1;
910 res = get_page_from_l1e(sl1e, d);
912 // If a privileged domain is attempting to install a map of a page it does
913 // not own, we let it succeed anyway.
914 //
915 if ( unlikely(!res) &&
916 IS_PRIV(d) &&
917 !shadow_mode_translate(d) &&
918 mfn_valid(mfn = shadow_l1e_get_mfn(sl1e)) &&
919 (owner = page_get_owner(mfn_to_page(mfn))) &&
920 (d != owner) )
921 {
922 res = get_page_from_l1e(sl1e, owner);
923 SHADOW_PRINTK("privileged domain %d installs map of mfn %05lx "
924 "which is owned by domain %d: %s\n",
925 d->domain_id, mfn_x(mfn), owner->domain_id,
926 res ? "success" : "failed");
927 }
929 if ( unlikely(!res) )
930 {
931 perfc_incr(shadow_get_page_fail);
932 SHADOW_PRINTK("failed: l1e=" SH_PRI_pte "\n", sl1e.l1);
933 }
935 return res;
936 }
938 static void inline
939 shadow_put_page_from_l1e(shadow_l1e_t sl1e, struct domain *d)
940 {
941 if ( !shadow_mode_refcounts(d) )
942 return;
944 put_page_from_l1e(sl1e, d);
945 }
947 #if GUEST_PAGING_LEVELS >= 4
948 static int shadow_set_l4e(struct vcpu *v,
949 shadow_l4e_t *sl4e,
950 shadow_l4e_t new_sl4e,
951 mfn_t sl4mfn)
952 {
953 int flags = 0, ok;
954 shadow_l4e_t old_sl4e;
955 paddr_t paddr;
956 ASSERT(sl4e != NULL);
957 old_sl4e = *sl4e;
959 if ( old_sl4e.l4 == new_sl4e.l4 ) return 0; /* Nothing to do */
961 paddr = ((((paddr_t)mfn_x(sl4mfn)) << PAGE_SHIFT)
962 | (((unsigned long)sl4e) & ~PAGE_MASK));
964 if ( shadow_l4e_get_flags(new_sl4e) & _PAGE_PRESENT )
965 {
966 /* About to install a new reference */
967 mfn_t sl3mfn = shadow_l4e_get_mfn(new_sl4e);
968 ok = sh_get_ref(v, sl3mfn, paddr);
969 /* Are we pinning l3 shadows to handle weird Linux behaviour? */
970 if ( sh_type_is_pinnable(v, SH_type_l3_64_shadow) )
971 ok |= sh_pin(v, sl3mfn);
972 if ( !ok )
973 {
974 domain_crash(v->domain);
975 return SHADOW_SET_ERROR;
976 }
977 }
979 /* Write the new entry */
980 shadow_write_entries(sl4e, &new_sl4e, 1, sl4mfn);
981 flags |= SHADOW_SET_CHANGED;
983 if ( shadow_l4e_get_flags(old_sl4e) & _PAGE_PRESENT )
984 {
985 /* We lost a reference to an old mfn. */
986 mfn_t osl3mfn = shadow_l4e_get_mfn(old_sl4e);
987 if ( (mfn_x(osl3mfn) != mfn_x(shadow_l4e_get_mfn(new_sl4e)))
988 || !perms_strictly_increased(shadow_l4e_get_flags(old_sl4e),
989 shadow_l4e_get_flags(new_sl4e)) )
990 {
991 flags |= SHADOW_SET_FLUSH;
992 }
993 sh_put_ref(v, osl3mfn, paddr);
994 }
995 return flags;
996 }
998 static int shadow_set_l3e(struct vcpu *v,
999 shadow_l3e_t *sl3e,
1000 shadow_l3e_t new_sl3e,
1001 mfn_t sl3mfn)
1003 int flags = 0;
1004 shadow_l3e_t old_sl3e;
1005 paddr_t paddr;
1006 ASSERT(sl3e != NULL);
1007 old_sl3e = *sl3e;
1009 if ( old_sl3e.l3 == new_sl3e.l3 ) return 0; /* Nothing to do */
1011 paddr = ((((paddr_t)mfn_x(sl3mfn)) << PAGE_SHIFT)
1012 | (((unsigned long)sl3e) & ~PAGE_MASK));
1014 if ( shadow_l3e_get_flags(new_sl3e) & _PAGE_PRESENT )
1015 /* About to install a new reference */
1016 if ( !sh_get_ref(v, shadow_l3e_get_mfn(new_sl3e), paddr) )
1018 domain_crash(v->domain);
1019 return SHADOW_SET_ERROR;
1022 /* Write the new entry */
1023 shadow_write_entries(sl3e, &new_sl3e, 1, sl3mfn);
1024 flags |= SHADOW_SET_CHANGED;
1026 if ( shadow_l3e_get_flags(old_sl3e) & _PAGE_PRESENT )
1028 /* We lost a reference to an old mfn. */
1029 mfn_t osl2mfn = shadow_l3e_get_mfn(old_sl3e);
1030 if ( (mfn_x(osl2mfn) != mfn_x(shadow_l3e_get_mfn(new_sl3e))) ||
1031 !perms_strictly_increased(shadow_l3e_get_flags(old_sl3e),
1032 shadow_l3e_get_flags(new_sl3e)) )
1034 flags |= SHADOW_SET_FLUSH;
1036 sh_put_ref(v, osl2mfn, paddr);
1038 return flags;
1040 #endif /* GUEST_PAGING_LEVELS >= 4 */
1042 static int shadow_set_l2e(struct vcpu *v,
1043 shadow_l2e_t *sl2e,
1044 shadow_l2e_t new_sl2e,
1045 mfn_t sl2mfn)
1047 int flags = 0;
1048 shadow_l2e_t old_sl2e;
1049 paddr_t paddr;
1051 #if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2
1052 /* In 2-on-3 we work with pairs of l2es pointing at two-page
1053 * shadows. Reference counting and up-pointers track from the first
1054 * page of the shadow to the first l2e, so make sure that we're
1055 * working with those:
1056 * Align the pointer down so it's pointing at the first of the pair */
1057 sl2e = (shadow_l2e_t *)((unsigned long)sl2e & ~(sizeof(shadow_l2e_t)));
1058 /* Align the mfn of the shadow entry too */
1059 new_sl2e.l2 &= ~(1<<PAGE_SHIFT);
1060 #endif
1062 ASSERT(sl2e != NULL);
1063 old_sl2e = *sl2e;
1065 if ( old_sl2e.l2 == new_sl2e.l2 ) return 0; /* Nothing to do */
1067 paddr = ((((paddr_t)mfn_x(sl2mfn)) << PAGE_SHIFT)
1068 | (((unsigned long)sl2e) & ~PAGE_MASK));
1070 if ( shadow_l2e_get_flags(new_sl2e) & _PAGE_PRESENT )
1071 /* About to install a new reference */
1072 if ( !sh_get_ref(v, shadow_l2e_get_mfn(new_sl2e), paddr) )
1074 domain_crash(v->domain);
1075 return SHADOW_SET_ERROR;
1078 /* Write the new entry */
1079 #if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2
1081 shadow_l2e_t pair[2] = { new_sl2e, new_sl2e };
1082 /* The l1 shadow is two pages long and needs to be pointed to by
1083 * two adjacent l2es. The pair have the same flags, but point
1084 * at odd and even MFNs */
1085 ASSERT(!(pair[0].l2 & (1<<PAGE_SHIFT)));
1086 pair[1].l2 |= (1<<PAGE_SHIFT);
1087 shadow_write_entries(sl2e, &pair, 2, sl2mfn);
1089 #else /* normal case */
1090 shadow_write_entries(sl2e, &new_sl2e, 1, sl2mfn);
1091 #endif
1092 flags |= SHADOW_SET_CHANGED;
1094 if ( shadow_l2e_get_flags(old_sl2e) & _PAGE_PRESENT )
1096 /* We lost a reference to an old mfn. */
1097 mfn_t osl1mfn = shadow_l2e_get_mfn(old_sl2e);
1098 if ( (mfn_x(osl1mfn) != mfn_x(shadow_l2e_get_mfn(new_sl2e))) ||
1099 !perms_strictly_increased(shadow_l2e_get_flags(old_sl2e),
1100 shadow_l2e_get_flags(new_sl2e)) )
1102 flags |= SHADOW_SET_FLUSH;
1104 sh_put_ref(v, osl1mfn, paddr);
1106 return flags;
1109 static int shadow_set_l1e(struct vcpu *v,
1110 shadow_l1e_t *sl1e,
1111 shadow_l1e_t new_sl1e,
1112 mfn_t sl1mfn)
1114 int flags = 0;
1115 struct domain *d = v->domain;
1116 shadow_l1e_t old_sl1e;
1117 ASSERT(sl1e != NULL);
1119 old_sl1e = *sl1e;
1121 if ( old_sl1e.l1 == new_sl1e.l1 ) return 0; /* Nothing to do */
1123 if ( (shadow_l1e_get_flags(new_sl1e) & _PAGE_PRESENT)
1124 && !sh_l1e_is_magic(new_sl1e) )
1126 /* About to install a new reference */
1127 if ( shadow_mode_refcounts(d) ) {
1128 if ( shadow_get_page_from_l1e(new_sl1e, d) == 0 )
1130 /* Doesn't look like a pagetable. */
1131 flags |= SHADOW_SET_ERROR;
1132 new_sl1e = shadow_l1e_empty();
1137 /* Write the new entry */
1138 shadow_write_entries(sl1e, &new_sl1e, 1, sl1mfn);
1139 flags |= SHADOW_SET_CHANGED;
1141 if ( (shadow_l1e_get_flags(old_sl1e) & _PAGE_PRESENT)
1142 && !sh_l1e_is_magic(old_sl1e) )
1144 /* We lost a reference to an old mfn. */
1145 /* N.B. Unlike higher-level sets, never need an extra flush
1146 * when writing an l1e. Because it points to the same guest frame
1147 * as the guest l1e did, it's the guest's responsibility to
1148 * trigger a flush later. */
1149 if ( shadow_mode_refcounts(d) )
1151 shadow_put_page_from_l1e(old_sl1e, d);
1154 return flags;
1158 /**************************************************************************/
1159 /* Macros to walk pagetables. These take the shadow of a pagetable and
1160 * walk every "interesting" entry. That is, they don't touch Xen mappings,
1161 * and for 32-bit l2s shadowed onto PAE or 64-bit, they only touch every
1162 * second entry (since pairs of entries are managed together). For multi-page
1163 * shadows they walk all pages.
1165 * Arguments are an MFN, the variable to point to each entry, a variable
1166 * to indicate that we are done (we will shortcut to the end of the scan
1167 * when _done != 0), a variable to indicate that we should avoid Xen mappings,
1168 * and the code.
1170 * WARNING: These macros have side-effects. They change the values of both
1171 * the pointer and the MFN. */
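/* Editorial sketch of a hypothetical caller, not in the original source;
 * note that the macro may advance both the shadow mfn and the guest
 * pointer as side-effects:
 *
 *     shadow_l1e_t *sl1e;
 *     int done = 0;
 *     SHADOW_FOREACH_L1E(sl1mfn, sl1e, NULL, done,
 *     {
 *         (void) shadow_set_l1e(v, sl1e, shadow_l1e_empty(), sl1mfn);
 *     });
 */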
1173 static inline void increment_ptr_to_guest_entry(void *ptr)
1175 if ( ptr )
1177 guest_l1e_t **entry = ptr;
1178 (*entry)++;
1182 /* All kinds of l1: touch all entries */
1183 #define _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code) \
1184 do { \
1185 int _i; \
1186 shadow_l1e_t *_sp = map_shadow_page((_sl1mfn)); \
1187 ASSERT(mfn_to_shadow_page(_sl1mfn)->type == SH_type_l1_shadow \
1188 || mfn_to_shadow_page(_sl1mfn)->type == SH_type_fl1_shadow); \
1189 for ( _i = 0; _i < SHADOW_L1_PAGETABLE_ENTRIES; _i++ ) \
1190 { \
1191 (_sl1e) = _sp + _i; \
1192 if ( shadow_l1e_get_flags(*(_sl1e)) & _PAGE_PRESENT ) \
1193 {_code} \
1194 if ( _done ) break; \
1195 increment_ptr_to_guest_entry(_gl1p); \
1196 } \
1197 unmap_shadow_page(_sp); \
1198 } while (0)
1200 /* 32-bit l1, on PAE or 64-bit shadows: need to walk both pages of shadow */
1201 #if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2
1202 #define SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code) \
1203 do { \
1204 int __done = 0; \
1205 _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, \
1206 ({ (__done = _done); }), _code); \
1207 _sl1mfn = _mfn(mfn_x(_sl1mfn) + 1); \
1208 if ( !__done ) \
1209 _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, \
1210 ({ (__done = _done); }), _code); \
1211 } while (0)
1212 #else /* Everything else; l1 shadows are only one page */
1213 #define SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code) \
1214 _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code)
1215 #endif
1218 #if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2
1220 /* 32-bit l2 on PAE/64: four pages, touch every second entry, and avoid Xen */
1221 #define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _dom, _code) \
1222 do { \
1223 int _i, _j, __done = 0; \
1224 int _xen = !shadow_mode_external(_dom); \
1225 ASSERT(mfn_to_shadow_page(_sl2mfn)->type == SH_type_l2_32_shadow); \
1226 for ( _j = 0; _j < 4 && !__done; _j++ ) \
1227 { \
1228 shadow_l2e_t *_sp = map_shadow_page(_sl2mfn); \
1229 for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i += 2 ) \
1230 if ( (!(_xen)) \
1231 || ((_j * SHADOW_L2_PAGETABLE_ENTRIES) + _i) \
1232 < (HYPERVISOR_VIRT_START >> SHADOW_L2_PAGETABLE_SHIFT) ) \
1233 { \
1234 (_sl2e) = _sp + _i; \
1235 if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \
1236 {_code} \
1237 if ( (__done = (_done)) ) break; \
1238 increment_ptr_to_guest_entry(_gl2p); \
1239 } \
1240 unmap_shadow_page(_sp); \
1241 _sl2mfn = _mfn(mfn_x(_sl2mfn) + 1); \
1242 } \
1243 } while (0)
1245 #elif GUEST_PAGING_LEVELS == 2
1247 /* 32-bit on 32-bit: avoid Xen entries */
1248 #define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _dom, _code) \
1249 do { \
1250 int _i; \
1251 int _xen = !shadow_mode_external(_dom); \
1252 shadow_l2e_t *_sp = map_shadow_page((_sl2mfn)); \
1253 ASSERT(mfn_to_shadow_page(_sl2mfn)->type == SH_type_l2_32_shadow); \
1254 for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i++ ) \
1255 if ( (!(_xen)) \
1256 || \
1257 (_i < (HYPERVISOR_VIRT_START >> SHADOW_L2_PAGETABLE_SHIFT)) ) \
1258 { \
1259 (_sl2e) = _sp + _i; \
1260 if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \
1261 {_code} \
1262 if ( _done ) break; \
1263 increment_ptr_to_guest_entry(_gl2p); \
1264 } \
1265 unmap_shadow_page(_sp); \
1266 } while (0)
1268 #elif GUEST_PAGING_LEVELS == 3
1270 /* PAE: if it's an l2h, don't touch Xen mappings */
1271 #define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _dom, _code) \
1272 do { \
1273 int _i; \
1274 int _xen = !shadow_mode_external(_dom); \
1275 shadow_l2e_t *_sp = map_shadow_page((_sl2mfn)); \
1276 ASSERT(mfn_to_shadow_page(_sl2mfn)->type == SH_type_l2_pae_shadow \
1277 || mfn_to_shadow_page(_sl2mfn)->type == SH_type_l2h_pae_shadow);\
1278 for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i++ ) \
1279 if ( (!(_xen)) \
1280 || mfn_to_shadow_page(_sl2mfn)->type != SH_type_l2h_pae_shadow\
1281 || ((_i + (3 * SHADOW_L2_PAGETABLE_ENTRIES)) \
1282 < (HYPERVISOR_VIRT_START >> SHADOW_L2_PAGETABLE_SHIFT)) ) \
1283 { \
1284 (_sl2e) = _sp + _i; \
1285 if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \
1286 {_code} \
1287 if ( _done ) break; \
1288 increment_ptr_to_guest_entry(_gl2p); \
1289 } \
1290 unmap_shadow_page(_sp); \
1291 } while (0)
1293 #else
1295 /* 64-bit l2: touch all entries except for PAE compat guests. */
1296 #define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _dom, _code) \
1297 do { \
1298 int _i; \
1299 int _xen = !shadow_mode_external(_dom); \
1300 shadow_l2e_t *_sp = map_shadow_page((_sl2mfn)); \
1301 ASSERT(mfn_to_shadow_page(_sl2mfn)->type == SH_type_l2_64_shadow || \
1302 mfn_to_shadow_page(_sl2mfn)->type == SH_type_l2h_64_shadow); \
1303 for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i++ ) \
1304 { \
1305 if ( (!(_xen)) \
1306 || !is_pv_32on64_domain(_dom) \
1307 || mfn_to_shadow_page(_sl2mfn)->type != SH_type_l2h_64_shadow \
1308 || (_i < COMPAT_L2_PAGETABLE_FIRST_XEN_SLOT(_dom)) ) \
1309 { \
1310 (_sl2e) = _sp + _i; \
1311 if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \
1312 {_code} \
1313 if ( _done ) break; \
1314 increment_ptr_to_guest_entry(_gl2p); \
1315 } \
1316 } \
1317 unmap_shadow_page(_sp); \
1318 } while (0)
1320 #endif /* different kinds of l2 */
1322 #if GUEST_PAGING_LEVELS == 4
1324 /* 64-bit l3: touch all entries */
1325 #define SHADOW_FOREACH_L3E(_sl3mfn, _sl3e, _gl3p, _done, _code) \
1326 do { \
1327 int _i; \
1328 shadow_l3e_t *_sp = map_shadow_page((_sl3mfn)); \
1329 ASSERT(mfn_to_shadow_page(_sl3mfn)->type == SH_type_l3_64_shadow); \
1330 for ( _i = 0; _i < SHADOW_L3_PAGETABLE_ENTRIES; _i++ ) \
1331 { \
1332 (_sl3e) = _sp + _i; \
1333 if ( shadow_l3e_get_flags(*(_sl3e)) & _PAGE_PRESENT ) \
1334 {_code} \
1335 if ( _done ) break; \
1336 increment_ptr_to_guest_entry(_gl3p); \
1337 } \
1338 unmap_shadow_page(_sp); \
1339 } while (0)
1341 /* 64-bit l4: avoid Xen mappings */
1342 #define SHADOW_FOREACH_L4E(_sl4mfn, _sl4e, _gl4p, _done, _dom, _code) \
1343 do { \
1344 shadow_l4e_t *_sp = map_shadow_page((_sl4mfn)); \
1345 int _xen = !shadow_mode_external(_dom); \
1346 int _i; \
1347 ASSERT(mfn_to_shadow_page(_sl4mfn)->type == SH_type_l4_64_shadow); \
1348 for ( _i = 0; _i < SHADOW_L4_PAGETABLE_ENTRIES; _i++ ) \
1349 { \
1350 if ( (!(_xen)) || is_guest_l4_slot(_dom, _i) ) \
1351 { \
1352 (_sl4e) = _sp + _i; \
1353 if ( shadow_l4e_get_flags(*(_sl4e)) & _PAGE_PRESENT ) \
1354 {_code} \
1355 if ( _done ) break; \
1356 } \
1357 increment_ptr_to_guest_entry(_gl4p); \
1358 } \
1359 unmap_shadow_page(_sp); \
1360 } while (0)
1362 #endif
1366 /**************************************************************************/
1367 /* Functions to install Xen mappings and linear mappings in shadow pages */
1369 // XXX -- this function should probably be moved to shadow-common.c, but that
1370 // probably wants to wait until the shadow types have been moved from
1371 // shadow-types.h to shadow-private.h
1372 //
1373 #if CONFIG_PAGING_LEVELS == 4 && GUEST_PAGING_LEVELS == 4
1374 void sh_install_xen_entries_in_l4(struct vcpu *v, mfn_t gl4mfn, mfn_t sl4mfn)
1376 struct domain *d = v->domain;
1377 shadow_l4e_t *sl4e;
1379 sl4e = sh_map_domain_page(sl4mfn);
1380 ASSERT(sl4e != NULL);
1381 ASSERT(sizeof (l4_pgentry_t) == sizeof (shadow_l4e_t));
1383 /* Copy the common Xen mappings from the idle domain */
1384 memcpy(&sl4e[ROOT_PAGETABLE_FIRST_XEN_SLOT],
1385 &idle_pg_table[ROOT_PAGETABLE_FIRST_XEN_SLOT],
1386 ROOT_PAGETABLE_XEN_SLOTS * sizeof(l4_pgentry_t));
1388 /* Install the per-domain mappings for this domain */
1389 sl4e[shadow_l4_table_offset(PERDOMAIN_VIRT_START)] =
1390 shadow_l4e_from_mfn(page_to_mfn(virt_to_page(d->arch.mm_perdomain_l3)),
1391 __PAGE_HYPERVISOR);
1393 /* Linear mapping */
1394 sl4e[shadow_l4_table_offset(SH_LINEAR_PT_VIRT_START)] =
1395 shadow_l4e_from_mfn(sl4mfn, __PAGE_HYPERVISOR);
1397 if ( shadow_mode_translate(v->domain) && !shadow_mode_external(v->domain) )
1399 // linear tables may not be used with translated PV guests
1400 sl4e[shadow_l4_table_offset(LINEAR_PT_VIRT_START)] =
1401 shadow_l4e_empty();
1403 else
1405 sl4e[shadow_l4_table_offset(LINEAR_PT_VIRT_START)] =
1406 shadow_l4e_from_mfn(gl4mfn, __PAGE_HYPERVISOR);
1409 if ( shadow_mode_translate(v->domain) )
1411 /* install domain-specific P2M table */
1412 sl4e[shadow_l4_table_offset(RO_MPT_VIRT_START)] =
1413 shadow_l4e_from_mfn(pagetable_get_mfn(d->arch.phys_table),
1414 __PAGE_HYPERVISOR);
1417 if ( is_pv_32on64_domain(v->domain) )
1419 /* install compat arg xlat entry */
1420 sl4e[shadow_l4_table_offset(COMPAT_ARG_XLAT_VIRT_BASE)] =
1421 shadow_l4e_from_mfn(
1422 page_to_mfn(virt_to_page(d->arch.mm_arg_xlat_l3)),
1423 __PAGE_HYPERVISOR);
1426 sh_unmap_domain_page(sl4e);
1428 #endif
1430 #if CONFIG_PAGING_LEVELS >= 3 && GUEST_PAGING_LEVELS >= 3
1431 // For 3-on-3 PV guests, we need to make sure the xen mappings are in
1432 // place, which means that we need to populate the l2h entry in the l3
1433 // table.
1435 static void sh_install_xen_entries_in_l2h(struct vcpu *v, mfn_t sl2hmfn)
1437 struct domain *d = v->domain;
1438 shadow_l2e_t *sl2e;
1439 #if CONFIG_PAGING_LEVELS == 3
1440 int i;
1441 #else
1443 if ( !is_pv_32on64_vcpu(v) )
1444 return;
1445 #endif
1447 sl2e = sh_map_domain_page(sl2hmfn);
1448 ASSERT(sl2e != NULL);
1449 ASSERT(sizeof (l2_pgentry_t) == sizeof (shadow_l2e_t));
1451 #if CONFIG_PAGING_LEVELS == 3
1453 /* Copy the common Xen mappings from the idle domain */
1454 memcpy(&sl2e[L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1)],
1455 &idle_pg_table_l2[L2_PAGETABLE_FIRST_XEN_SLOT],
1456 L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
1458 /* Install the per-domain mappings for this domain */
1459 for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
1460 sl2e[shadow_l2_table_offset(PERDOMAIN_VIRT_START) + i] =
1461 shadow_l2e_from_mfn(
1462 page_to_mfn(virt_to_page(d->arch.mm_perdomain_pt) + i),
1463 __PAGE_HYPERVISOR);
1465 /* We don't set up a linear mapping here because we can't until this
1466 * l2h is installed in an l3e. sh_update_linear_entries() handles
1467 * the linear mappings when CR3 (and so the fourth l3e) is loaded.
1468 * We zero them here, just as a safety measure.
1469 */
1470 for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
1471 sl2e[shadow_l2_table_offset(LINEAR_PT_VIRT_START) + i] =
1472 shadow_l2e_empty();
1473 for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
1474 sl2e[shadow_l2_table_offset(SH_LINEAR_PT_VIRT_START) + i] =
1475 shadow_l2e_empty();
1477 if ( shadow_mode_translate(d) )
1479 /* Install the domain-specific p2m table */
1480 l3_pgentry_t *p2m;
1481 ASSERT(pagetable_get_pfn(d->arch.phys_table) != 0);
1482 p2m = sh_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
1483 for ( i = 0; i < MACHPHYS_MBYTES>>1; i++ )
1485 sl2e[shadow_l2_table_offset(RO_MPT_VIRT_START) + i] =
1486 (l3e_get_flags(p2m[i]) & _PAGE_PRESENT)
1487 ? shadow_l2e_from_mfn(_mfn(l3e_get_pfn(p2m[i])),
1488 __PAGE_HYPERVISOR)
1489 : shadow_l2e_empty();
1491 sh_unmap_domain_page(p2m);
1494 #else
1496 /* Copy the common Xen mappings from the idle domain */
1497 memcpy(
1498 &sl2e[COMPAT_L2_PAGETABLE_FIRST_XEN_SLOT(d)],
1499 &compat_idle_pg_table_l2[l2_table_offset(HIRO_COMPAT_MPT_VIRT_START)],
1500 COMPAT_L2_PAGETABLE_XEN_SLOTS(d) * sizeof(*sl2e));
1502 #endif
1504 sh_unmap_domain_page(sl2e);
1506 #endif
1509 #if CONFIG_PAGING_LEVELS == 2 && GUEST_PAGING_LEVELS == 2
1510 void sh_install_xen_entries_in_l2(struct vcpu *v, mfn_t gl2mfn, mfn_t sl2mfn)
1512 struct domain *d = v->domain;
1513 shadow_l2e_t *sl2e;
1514 int i;
1516 sl2e = sh_map_domain_page(sl2mfn);
1517 ASSERT(sl2e != NULL);
1518 ASSERT(sizeof (l2_pgentry_t) == sizeof (shadow_l2e_t));
1520 /* Copy the common Xen mappings from the idle domain */
1521 memcpy(&sl2e[L2_PAGETABLE_FIRST_XEN_SLOT],
1522 &idle_pg_table[L2_PAGETABLE_FIRST_XEN_SLOT],
1523 L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
1525 /* Install the per-domain mappings for this domain */
1526 for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
1527 sl2e[shadow_l2_table_offset(PERDOMAIN_VIRT_START) + i] =
1528 shadow_l2e_from_mfn(
1529 page_to_mfn(virt_to_page(d->arch.mm_perdomain_pt) + i),
1530 __PAGE_HYPERVISOR);
1532 /* Linear mapping */
1533 sl2e[shadow_l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
1534 shadow_l2e_from_mfn(sl2mfn, __PAGE_HYPERVISOR);
1536 if ( shadow_mode_translate(v->domain) && !shadow_mode_external(v->domain) )
1538 // linear tables may not be used with translated PV guests
1539 sl2e[shadow_l2_table_offset(LINEAR_PT_VIRT_START)] =
1540 shadow_l2e_empty();
1542 else
1544 sl2e[shadow_l2_table_offset(LINEAR_PT_VIRT_START)] =
1545 shadow_l2e_from_mfn(gl2mfn, __PAGE_HYPERVISOR);
1548 if ( shadow_mode_translate(d) )
1550 /* install domain-specific P2M table */
1551 sl2e[shadow_l2_table_offset(RO_MPT_VIRT_START)] =
1552 shadow_l2e_from_mfn(pagetable_get_mfn(d->arch.phys_table),
1553 __PAGE_HYPERVISOR);
1556 sh_unmap_domain_page(sl2e);
1558 #endif
1562 /**************************************************************************/
1563 /* Create a shadow of a given guest page.
1564 */
1565 static mfn_t
1566 sh_make_shadow(struct vcpu *v, mfn_t gmfn, u32 shadow_type)
1568 mfn_t smfn = shadow_alloc(v->domain, shadow_type, mfn_x(gmfn));
1569 SHADOW_DEBUG(MAKE_SHADOW, "(%05lx, %u)=>%05lx\n",
1570 mfn_x(gmfn), shadow_type, mfn_x(smfn));
1572 if ( shadow_type != SH_type_l2_32_shadow
1573 && shadow_type != SH_type_l2_pae_shadow
1574 && shadow_type != SH_type_l2h_pae_shadow
1575 && shadow_type != SH_type_l4_64_shadow )
1576 /* Lower-level shadow, not yet linked from a higher level */
1577 mfn_to_shadow_page(smfn)->up = 0;
1579 #if GUEST_PAGING_LEVELS == 4
1580 #if (SHADOW_OPTIMIZATIONS & SHOPT_LINUX_L3_TOPLEVEL)
1581 if ( shadow_type == SH_type_l4_64_shadow &&
1582 unlikely(v->domain->arch.paging.shadow.opt_flags & SHOPT_LINUX_L3_TOPLEVEL) )
1584 /* We're shadowing a new l4, but we've been assuming the guest uses
1585 * only one l4 per vcpu and context switches using an l4 entry.
1586 * Count the number of active l4 shadows. If there are enough
1587 * of them, decide that this isn't an old linux guest, and stop
1588 * pinning l3es. This is not very quick but it doesn't happen
1589 * very often. */
1590 struct list_head *l, *t;
1591 struct shadow_page_info *sp;
1592 struct vcpu *v2;
1593 int l4count = 0, vcpus = 0;
1594 list_for_each(l, &v->domain->arch.paging.shadow.pinned_shadows)
1596 sp = list_entry(l, struct shadow_page_info, list);
1597 if ( sp->type == SH_type_l4_64_shadow )
1598 l4count++;
1600 for_each_vcpu ( v->domain, v2 )
1601 vcpus++;
1602 if ( l4count > 2 * vcpus )
1604 /* Unpin all the pinned l3 tables, and don't pin any more. */
1605 list_for_each_safe(l, t, &v->domain->arch.paging.shadow.pinned_shadows)
1607 sp = list_entry(l, struct shadow_page_info, list);
1608 if ( sp->type == SH_type_l3_64_shadow )
1609 sh_unpin(v, shadow_page_to_mfn(sp));
1611 v->domain->arch.paging.shadow.opt_flags &= ~SHOPT_LINUX_L3_TOPLEVEL;
1614 #endif
1615 #endif
1617 // Create the Xen mappings...
1618 if ( !shadow_mode_external(v->domain) )
1620 switch (shadow_type)
1622 #if CONFIG_PAGING_LEVELS == 4 && GUEST_PAGING_LEVELS == 4
1623 case SH_type_l4_shadow:
1624 sh_install_xen_entries_in_l4(v, gmfn, smfn); break;
1625 #endif
1626 #if CONFIG_PAGING_LEVELS >= 3 && GUEST_PAGING_LEVELS >= 3
1627 case SH_type_l2h_shadow:
1628 sh_install_xen_entries_in_l2h(v, smfn); break;
1629 #endif
1630 #if CONFIG_PAGING_LEVELS == 2 && GUEST_PAGING_LEVELS == 2
1631 case SH_type_l2_shadow:
1632 sh_install_xen_entries_in_l2(v, gmfn, smfn); break;
1633 #endif
1634 default: /* Do nothing */ break;
1638 shadow_promote(v, gmfn, shadow_type);
1639 set_shadow_status(v, gmfn, shadow_type, smfn);
1641 return smfn;
1644 /* Make a splintered superpage shadow */
1645 static mfn_t
1646 make_fl1_shadow(struct vcpu *v, gfn_t gfn)
1648 mfn_t smfn = shadow_alloc(v->domain, SH_type_fl1_shadow,
1649 (unsigned long) gfn_x(gfn));
1651 SHADOW_DEBUG(MAKE_SHADOW, "(%" SH_PRI_gfn ")=>%" PRI_mfn "\n",
1652 gfn_x(gfn), mfn_x(smfn));
1654 set_fl1_shadow_status(v, gfn, smfn);
1655 return smfn;
1659 #if SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS
1660 mfn_t
1661 sh_make_monitor_table(struct vcpu *v)
1663 struct domain *d = v->domain;
1665 ASSERT(pagetable_get_pfn(v->arch.monitor_table) == 0);
1667 /* Guarantee we can get the memory we need */
1668 shadow_prealloc(d, SHADOW_MAX_ORDER);
1670 #if CONFIG_PAGING_LEVELS == 4
1672 mfn_t m4mfn;
1673 m4mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1674 sh_install_xen_entries_in_l4(v, m4mfn, m4mfn);
1675 /* Remember the level of this table */
1676 mfn_to_page(m4mfn)->shadow_flags = 4;
1677 #if SHADOW_PAGING_LEVELS < 4
1678 // Install a monitor l3 table in slot 0 of the l4 table.
1679 // This is used for shadow linear maps.
1681 mfn_t m3mfn;
1682 l4_pgentry_t *l4e;
1683 m3mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1684 mfn_to_page(m3mfn)->shadow_flags = 3;
1685 l4e = sh_map_domain_page(m4mfn);
1686 l4e[0] = l4e_from_pfn(mfn_x(m3mfn), __PAGE_HYPERVISOR);
1687 sh_unmap_domain_page(l4e);
1688 if ( is_pv_32on64_vcpu(v) )
1690 // Install a monitor l2 table in slot 3 of the l3 table.
1691 // This is used for all Xen entries.
1692 mfn_t m2mfn;
1693 l3_pgentry_t *l3e;
1694 m2mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1695 mfn_to_page(m2mfn)->shadow_flags = 2;
1696 l3e = sh_map_domain_page(m3mfn);
1697 l3e[3] = l3e_from_pfn(mfn_x(m2mfn), _PAGE_PRESENT);
1698 sh_install_xen_entries_in_l2h(v, m2mfn);
1699 sh_unmap_domain_page(l3e);
1702 #endif /* SHADOW_PAGING_LEVELS < 4 */
1703 return m4mfn;
1706 #elif CONFIG_PAGING_LEVELS == 3
1709 mfn_t m3mfn, m2mfn;
1710 l3_pgentry_t *l3e;
1711 l2_pgentry_t *l2e;
1712 int i;
1714 m3mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1715 /* Remember the level of this table */
1716 mfn_to_page(m3mfn)->shadow_flags = 3;
1718 // Install a monitor l2 table in slot 3 of the l3 table.
1719 // This is used for all Xen entries, including linear maps
1720 m2mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1721 mfn_to_page(m2mfn)->shadow_flags = 2;
1722 l3e = sh_map_domain_page(m3mfn);
1723 l3e[3] = l3e_from_pfn(mfn_x(m2mfn), _PAGE_PRESENT);
1724 sh_install_xen_entries_in_l2h(v, m2mfn);
1725 /* Install the monitor's own linear map */
1726 l2e = sh_map_domain_page(m2mfn);
1727 for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
1728 l2e[l2_table_offset(LINEAR_PT_VIRT_START) + i] =
1729 (l3e_get_flags(l3e[i]) & _PAGE_PRESENT)
1730 ? l2e_from_pfn(l3e_get_pfn(l3e[i]), __PAGE_HYPERVISOR)
1731 : l2e_empty();
1732 sh_unmap_domain_page(l2e);
1733 sh_unmap_domain_page(l3e);
1735 SHADOW_PRINTK("new monitor table: %#lx\n", mfn_x(m3mfn));
1736 return m3mfn;
1739 #elif CONFIG_PAGING_LEVELS == 2
1742 mfn_t m2mfn;
1743 m2mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1744 sh_install_xen_entries_in_l2(v, m2mfn, m2mfn);
1745 /* Remember the level of this table */
1746 mfn_to_page(m2mfn)->shadow_flags = 2;
1747 return m2mfn;
1750 #else
1751 #error this should not happen
1752 #endif /* CONFIG_PAGING_LEVELS */
1754 #endif /* SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS */
1756 /**************************************************************************/
1757 /* These functions also take a virtual address and return the level-N
1758 * shadow table mfn and entry, but they create the shadow pagetables if
1759 * they are needed. The fetch-type argument tells us whether we are handling
1760 * a demand fault (so we know what to do about accessed bits etc.).
1761 * If the necessary tables are not present in the guest, they return NULL. */
1763 /* N.B. The use of GUEST_PAGING_LEVELS here is correct. If the shadow has
1764 * more levels than the guest, the upper levels are always fixed and do not
1765 * reflect any information from the guest, so we do not use these functions
1766 * to access them. */
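/* In outline, each shadow_get_and_create_lNe() below first gets the
 * entry one level up (creating that level's shadow if necessary), then
 * either reuses an existing shadow of the guest table
 * (get_shadow_status) or builds a new one (sh_make_shadow), hooks it
 * in with shadow_set_lNe(), and returns a pointer into the shadow
 * linear map of the table it has just guaranteed to exist. */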
1768 #if GUEST_PAGING_LEVELS >= 4
1769 static shadow_l4e_t * shadow_get_and_create_l4e(struct vcpu *v,
1770 walk_t *gw,
1771 mfn_t *sl4mfn)
1773 /* There is always a shadow of the top level table. Get it. */
1774 *sl4mfn = pagetable_get_mfn(v->arch.shadow_table[0]);
1775 /* Reading the top level table is always valid. */
1776 return sh_linear_l4_table(v) + shadow_l4_linear_offset(gw->va);
1779 static shadow_l3e_t * shadow_get_and_create_l3e(struct vcpu *v,
1780 walk_t *gw,
1781 mfn_t *sl3mfn,
1782 fetch_type_t ft)
1784 mfn_t sl4mfn;
1785 shadow_l4e_t *sl4e;
1786 if ( !mfn_valid(gw->l3mfn) ) return NULL; /* No guest page. */
1787 /* Get the l4e */
1788 sl4e = shadow_get_and_create_l4e(v, gw, &sl4mfn);
1789 ASSERT(sl4e != NULL);
1790 if ( shadow_l4e_get_flags(*sl4e) & _PAGE_PRESENT )
1792 *sl3mfn = shadow_l4e_get_mfn(*sl4e);
1793 ASSERT(mfn_valid(*sl3mfn));
1795 else
1797 int r;
1798 shadow_l4e_t new_sl4e;
1799 /* No l3 shadow installed: find and install it. */
1800 *sl3mfn = get_shadow_status(v, gw->l3mfn, SH_type_l3_shadow);
1801 if ( !mfn_valid(*sl3mfn) )
1803 /* No l3 shadow of this page exists at all: make one. */
1804 *sl3mfn = sh_make_shadow(v, gw->l3mfn, SH_type_l3_shadow);
1806 /* Install the new sl3 table in the sl4e */
1807 l4e_propagate_from_guest(v, gw->l4e, gw->l4mfn,
1808 *sl3mfn, &new_sl4e, ft);
1809 r = shadow_set_l4e(v, sl4e, new_sl4e, sl4mfn);
1810 ASSERT((r & SHADOW_SET_FLUSH) == 0);
1811 if ( r & SHADOW_SET_ERROR )
1812 return NULL;
1814 /* Now follow it down a level. Guaranteed to succeed. */
1815 return sh_linear_l3_table(v) + shadow_l3_linear_offset(gw->va);
1817 #endif /* GUEST_PAGING_LEVELS >= 4 */
1820 static shadow_l2e_t * shadow_get_and_create_l2e(struct vcpu *v,
1821 walk_t *gw,
1822 mfn_t *sl2mfn,
1823 fetch_type_t ft)
1825 #if GUEST_PAGING_LEVELS >= 4 /* 64bit... */
1826 mfn_t sl3mfn = _mfn(INVALID_MFN);
1827 shadow_l3e_t *sl3e;
1828 if ( !mfn_valid(gw->l2mfn) ) return NULL; /* No guest page. */
1829 /* Get the l3e */
1830 sl3e = shadow_get_and_create_l3e(v, gw, &sl3mfn, ft);
1831 if ( sl3e == NULL ) return NULL;
1832 if ( shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT )
1834 *sl2mfn = shadow_l3e_get_mfn(*sl3e);
1835 ASSERT(mfn_valid(*sl2mfn));
1837 else
1839 int r;
1840 shadow_l3e_t new_sl3e;
1841 unsigned int t = SH_type_l2_shadow;
1843 /* Tag compat L2 containing hypervisor (m2p) mappings */
1844 if ( is_pv_32on64_domain(v->domain) &&
1845 guest_l4_table_offset(gw->va) == 0 &&
1846 guest_l3_table_offset(gw->va) == 3 )
1847 t = SH_type_l2h_shadow;
1849 /* No l2 shadow installed: find and install it. */
1850 *sl2mfn = get_shadow_status(v, gw->l2mfn, t);
1851 if ( !mfn_valid(*sl2mfn) )
1853 /* No l2 shadow of this page exists at all: make one. */
1854 *sl2mfn = sh_make_shadow(v, gw->l2mfn, t);
1856 /* Install the new sl2 table in the sl3e */
1857 l3e_propagate_from_guest(v, gw->l3e, gw->l3mfn,
1858 *sl2mfn, &new_sl3e, ft);
1859 r = shadow_set_l3e(v, sl3e, new_sl3e, sl3mfn);
1860 ASSERT((r & SHADOW_SET_FLUSH) == 0);
1861 if ( r & SHADOW_SET_ERROR )
1862 return NULL;
1864 /* Now follow it down a level. Guaranteed to succeed. */
1865 return sh_linear_l2_table(v) + shadow_l2_linear_offset(gw->va);
1866 #elif GUEST_PAGING_LEVELS == 3 /* PAE... */
1867 /* We never demand-shadow PAE l3es: they are only created in
1868 * sh_update_cr3(). Check if the relevant sl3e is present. */
1869 shadow_l3e_t *sl3e = ((shadow_l3e_t *)&v->arch.paging.shadow.l3table)
1870 + shadow_l3_linear_offset(gw->va);
1871 if ( !(shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT) )
1872 return NULL;
1873 *sl2mfn = shadow_l3e_get_mfn(*sl3e);
1874 ASSERT(mfn_valid(*sl2mfn));
1875 return sh_linear_l2_table(v) + shadow_l2_linear_offset(gw->va);
1876 #else /* 32bit... */
1877 /* There is always a shadow of the top level table. Get it. */
1878 *sl2mfn = pagetable_get_mfn(v->arch.shadow_table[0]);
1879 /* This next line is important: the guest l2 has a 16k
1880 * shadow, so we need to return the right mfn of the four. This
1881 * call will set it for us as a side-effect. */
1882 (void) shadow_l2_index(sl2mfn, guest_index(gw->l2e));
1883 /* Reading the top level table is always valid. */
1884 return sh_linear_l2_table(v) + shadow_l2_linear_offset(gw->va);
1885 #endif
1889 static shadow_l1e_t * shadow_get_and_create_l1e(struct vcpu *v,
1890 walk_t *gw,
1891 mfn_t *sl1mfn,
1892 fetch_type_t ft)
1894 mfn_t sl2mfn;
1895 shadow_l2e_t *sl2e;
1897 /* Get the l2e */
1898 sl2e = shadow_get_and_create_l2e(v, gw, &sl2mfn, ft);
1899 if ( sl2e == NULL ) return NULL;
1900 /* Install the sl1 in the l2e if it wasn't there or if we need to
1901 * re-do it to fix a PSE dirty bit. */
1902 if ( shadow_l2e_get_flags(*sl2e) & _PAGE_PRESENT
1903 && likely(ft != ft_demand_write
1904 || (guest_l2e_get_flags(*gw->l2e) & _PAGE_DIRTY)
1905 || !(guest_l2e_get_flags(*gw->l2e) & _PAGE_PSE)) )
1907 *sl1mfn = shadow_l2e_get_mfn(*sl2e);
1908 ASSERT(mfn_valid(*sl1mfn));
1910 else
1912 shadow_l2e_t new_sl2e;
1913 int r, flags = guest_l2e_get_flags(*gw->l2e);
1914 /* No l1 shadow installed: find and install it. */
1915 if ( !(flags & _PAGE_PRESENT) )
1916 return NULL; /* No guest page. */
1917 if ( guest_supports_superpages(v) && (flags & _PAGE_PSE) )
1919 /* Splintering a superpage */
1920 gfn_t l2gfn = guest_l2e_get_gfn(*gw->l2e);
1921 *sl1mfn = get_fl1_shadow_status(v, l2gfn);
1922 if ( !mfn_valid(*sl1mfn) )
1924 /* No fl1 shadow of this superpage exists at all: make one. */
1925 *sl1mfn = make_fl1_shadow(v, l2gfn);
1928 else
1930 /* Shadowing an actual guest l1 table */
1931 if ( !mfn_valid(gw->l2mfn) ) return NULL; /* No guest page. */
1932 *sl1mfn = get_shadow_status(v, gw->l1mfn, SH_type_l1_shadow);
1933 if ( !mfn_valid(*sl1mfn) )
1935 /* No l1 shadow of this page exists at all: make one. */
1936 *sl1mfn = sh_make_shadow(v, gw->l1mfn, SH_type_l1_shadow);
1939 /* Install the new sl1 table in the sl2e */
1940 l2e_propagate_from_guest(v, gw->l2e, gw->l2mfn,
1941 *sl1mfn, &new_sl2e, ft);
1942 r = shadow_set_l2e(v, sl2e, new_sl2e, sl2mfn);
1943 ASSERT((r & SHADOW_SET_FLUSH) == 0);
1944 if ( r & SHADOW_SET_ERROR )
1945 return NULL;
1946 /* This next line is important: in 32-on-PAE and 32-on-64 modes,
1947 * the guest l1 table has an 8k shadow, and we need to return
1948 * the right mfn of the pair. This call will set it for us as a
1949 * side-effect. (In all other cases, it's a no-op and will be
1950 * compiled out.) */
1951 (void) shadow_l1_index(sl1mfn, guest_l1_table_offset(gw->va));
1953 /* Now follow it down a level. Guaranteed to succeed. */
1954 return sh_linear_l1_table(v) + shadow_l1_linear_offset(gw->va);
1959 /**************************************************************************/
1960 /* Destructors for shadow tables:
1961 * Unregister the shadow, decrement refcounts of any entries present in it,
1962 * and release the memory.
1964 * N.B. These destructors do not clear the contents of the shadows.
1965 * This allows us to delay TLB shootdowns until the page is being reused.
1966 * See shadow_alloc() and shadow_free() for how this is handled.
1967 */
1969 #if GUEST_PAGING_LEVELS >= 4
1970 void sh_destroy_l4_shadow(struct vcpu *v, mfn_t smfn)
1972 shadow_l4e_t *sl4e;
1973 u32 t = mfn_to_shadow_page(smfn)->type;
1974 mfn_t gmfn, sl4mfn;
1976 SHADOW_DEBUG(DESTROY_SHADOW,
1977 "%s(%05lx)\n", __func__, mfn_x(smfn));
1978 ASSERT(t == SH_type_l4_shadow);
1980 /* Record that the guest page isn't shadowed any more (in this type) */
1981 gmfn = _mfn(mfn_to_shadow_page(smfn)->backpointer);
1982 delete_shadow_status(v, gmfn, t, smfn);
1983 shadow_demote(v, gmfn, t);
1984 /* Decrement refcounts of all the old entries */
1985 sl4mfn = smfn;
1986 SHADOW_FOREACH_L4E(sl4mfn, sl4e, 0, 0, v->domain, {
1987 if ( shadow_l4e_get_flags(*sl4e) & _PAGE_PRESENT )
1989 sh_put_ref(v, shadow_l4e_get_mfn(*sl4e),
1990 (((paddr_t)mfn_x(sl4mfn)) << PAGE_SHIFT)
1991 | ((unsigned long)sl4e & ~PAGE_MASK));
1993 });
1995 /* Put the memory back in the pool */
1996 shadow_free(v->domain, smfn);
1999 void sh_destroy_l3_shadow(struct vcpu *v, mfn_t smfn)
2001 shadow_l3e_t *sl3e;
2002 u32 t = mfn_to_shadow_page(smfn)->type;
2003 mfn_t gmfn, sl3mfn;
2005 SHADOW_DEBUG(DESTROY_SHADOW,
2006 "%s(%05lx)\n", __func__, mfn_x(smfn));
2007 ASSERT(t == SH_type_l3_shadow);
2009 /* Record that the guest page isn't shadowed any more (in this type) */
2010 gmfn = _mfn(mfn_to_shadow_page(smfn)->backpointer);
2011 delete_shadow_status(v, gmfn, t, smfn);
2012 shadow_demote(v, gmfn, t);
2014 /* Decrement refcounts of all the old entries */
2015 sl3mfn = smfn;
2016 SHADOW_FOREACH_L3E(sl3mfn, sl3e, 0, 0, {
2017 if ( shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT )
2018 sh_put_ref(v, shadow_l3e_get_mfn(*sl3e),
2019 (((paddr_t)mfn_x(sl3mfn)) << PAGE_SHIFT)
2020 | ((unsigned long)sl3e & ~PAGE_MASK));
2021 });
2023 /* Put the memory back in the pool */
2024 shadow_free(v->domain, smfn);
2026 #endif /* GUEST_PAGING_LEVELS >= 4 */
2029 void sh_destroy_l2_shadow(struct vcpu *v, mfn_t smfn)
2031 shadow_l2e_t *sl2e;
2032 u32 t = mfn_to_shadow_page(smfn)->type;
2033 mfn_t gmfn, sl2mfn;
2035 SHADOW_DEBUG(DESTROY_SHADOW,
2036 "%s(%05lx)\n", __func__, mfn_x(smfn));
2038 #if GUEST_PAGING_LEVELS >= 3
2039 ASSERT(t == SH_type_l2_shadow || t == SH_type_l2h_shadow);
2040 #else
2041 ASSERT(t == SH_type_l2_shadow);
2042 #endif
2044 /* Record that the guest page isn't shadowed any more (in this type) */
2045 gmfn = _mfn(mfn_to_shadow_page(smfn)->backpointer);
2046 delete_shadow_status(v, gmfn, t, smfn);
2047 shadow_demote(v, gmfn, t);
2049 /* Decrement refcounts of all the old entries */
2050 sl2mfn = smfn;
2051 SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, 0, v->domain, {
2052 if ( shadow_l2e_get_flags(*sl2e) & _PAGE_PRESENT )
2053 sh_put_ref(v, shadow_l2e_get_mfn(*sl2e),
2054 (((paddr_t)mfn_x(sl2mfn)) << PAGE_SHIFT)
2055 | ((unsigned long)sl2e & ~PAGE_MASK));
2056 });
2058 /* Put the memory back in the pool */
2059 shadow_free(v->domain, smfn);
2062 void sh_destroy_l1_shadow(struct vcpu *v, mfn_t smfn)
2064 struct domain *d = v->domain;
2065 shadow_l1e_t *sl1e;
2066 u32 t = mfn_to_shadow_page(smfn)->type;
2068 SHADOW_DEBUG(DESTROY_SHADOW,
2069 "%s(%05lx)\n", __func__, mfn_x(smfn));
2070 ASSERT(t == SH_type_l1_shadow || t == SH_type_fl1_shadow);
2072 /* Record that the guest page isn't shadowed any more (in this type) */
2073 if ( t == SH_type_fl1_shadow )
2075 gfn_t gfn = _gfn(mfn_to_shadow_page(smfn)->backpointer);
2076 delete_fl1_shadow_status(v, gfn, smfn);
2078 else
2080 mfn_t gmfn = _mfn(mfn_to_shadow_page(smfn)->backpointer);
2081 delete_shadow_status(v, gmfn, t, smfn);
2082 shadow_demote(v, gmfn, t);
2085 if ( shadow_mode_refcounts(d) )
2087 /* Decrement refcounts of all the old entries */
2088 mfn_t sl1mfn = smfn;
2089 SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, 0, {
2090 if ( (shadow_l1e_get_flags(*sl1e) & _PAGE_PRESENT)
2091 && !sh_l1e_is_magic(*sl1e) )
2092 shadow_put_page_from_l1e(*sl1e, d);
2093 });
2096 /* Put the memory back in the pool */
2097 shadow_free(v->domain, smfn);
2100 #if SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS
2101 void sh_destroy_monitor_table(struct vcpu *v, mfn_t mmfn)
2103 struct domain *d = v->domain;
2104 ASSERT(mfn_to_shadow_page(mmfn)->type == SH_type_monitor_table);
2106 #if (CONFIG_PAGING_LEVELS == 4) && (SHADOW_PAGING_LEVELS != 4)
2107 /* Need to destroy the l3 monitor page in slot 0 too */
2109 mfn_t m3mfn;
2110 l4_pgentry_t *l4e = sh_map_domain_page(mmfn);
2111 ASSERT(l4e_get_flags(l4e[0]) & _PAGE_PRESENT);
2112 m3mfn = _mfn(l4e_get_pfn(l4e[0]));
2113 if ( is_pv_32on64_vcpu(v) )
2115 /* Need to destroy the l2 monitor page in slot 3 too */
2116 l3_pgentry_t *l3e = sh_map_domain_page(m3mfn);
2117 ASSERT(l3e_get_flags(l3e[3]) & _PAGE_PRESENT);
2118 shadow_free(d, _mfn(l3e_get_pfn(l3e[3])));
2119 sh_unmap_domain_page(l3e);
2121 shadow_free(d, m3mfn);
2122 sh_unmap_domain_page(l4e);
2124 #elif CONFIG_PAGING_LEVELS == 3
2125 /* Need to destroy the l2 monitor page in slot 4 too */
2127 l3_pgentry_t *l3e = sh_map_domain_page(mmfn);
2128 ASSERT(l3e_get_flags(l3e[3]) & _PAGE_PRESENT);
2129 shadow_free(d, _mfn(l3e_get_pfn(l3e[3])));
2130 sh_unmap_domain_page(l3e);
2132 #endif
2134 /* Put the memory back in the pool */
2135 shadow_free(d, mmfn);
2137 #endif
2139 /**************************************************************************/
2140 /* Functions to destroy non-Xen mappings in a pagetable hierarchy.
2141 * These are called from common code when we are running out of shadow
2142 * memory, and unpinning all the top-level shadows hasn't worked.
2144 * This implementation is pretty crude and slow, but we hope that it won't
2145 * be called very often. */
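/* "Unhooking" here just means writing empty entries over the whole
 * top-level shadow, so every lower-level shadow loses its reference
 * and can be reclaimed; the guest's mappings are re-shadowed on
 * demand by later page faults. */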
2147 #if GUEST_PAGING_LEVELS == 2
2149 void sh_unhook_32b_mappings(struct vcpu *v, mfn_t sl2mfn)
2151 shadow_l2e_t *sl2e;
2152 SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, 0, v->domain, {
2153 (void) shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn);
2154 });
2157 #elif GUEST_PAGING_LEVELS == 3
2159 void sh_unhook_pae_mappings(struct vcpu *v, mfn_t sl2mfn)
2160 /* Walk a PAE l2 shadow, unhooking entries from all the subshadows */
2162 shadow_l2e_t *sl2e;
2163 SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, 0, v->domain, {
2164 (void) shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn);
2165 });
2168 #elif GUEST_PAGING_LEVELS == 4
2170 void sh_unhook_64b_mappings(struct vcpu *v, mfn_t sl4mfn)
2172 shadow_l4e_t *sl4e;
2173 SHADOW_FOREACH_L4E(sl4mfn, sl4e, 0, 0, v->domain, {
2174 (void) shadow_set_l4e(v, sl4e, shadow_l4e_empty(), sl4mfn);
2175 });
2178 #endif
2180 /**************************************************************************/
2181 /* Internal translation functions.
2182 * These functions require a pointer to the shadow entry that will be updated.
2183 */
2185 /* These functions take a new guest entry, translate it to shadow and write
2186 * the shadow entry.
2188 * They return the same bitmaps as the shadow_set_lXe() functions.
2189 */
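/* Each validate_glNe() below takes the new guest entry, finds the mfn
 * or lower-level shadow it should point at (never creating a new
 * shadow on this path), propagates the guest flags into a new shadow
 * entry and writes it with shadow_set_lNe(). The SHADOW_SET_* bits in
 * the result tell the caller whether a TLB flush or error handling is
 * needed. */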
2191 #if GUEST_PAGING_LEVELS >= 4
2192 static int validate_gl4e(struct vcpu *v, void *new_ge, mfn_t sl4mfn, void *se)
2194 shadow_l4e_t new_sl4e;
2195 guest_l4e_t *new_gl4e = new_ge;
2196 shadow_l4e_t *sl4p = se;
2197 mfn_t sl3mfn = _mfn(INVALID_MFN);
2198 struct domain *d = v->domain;
2199 int result = 0;
2201 perfc_incr(shadow_validate_gl4e_calls);
2203 if ( guest_l4e_get_flags(*new_gl4e) & _PAGE_PRESENT )
2205 gfn_t gl3gfn = guest_l4e_get_gfn(*new_gl4e);
2206 mfn_t gl3mfn = gfn_to_mfn(d, gl3gfn);
2207 if ( mfn_valid(gl3mfn) )
2208 sl3mfn = get_shadow_status(v, gl3mfn, SH_type_l3_shadow);
2209 else
2210 result |= SHADOW_SET_ERROR;
2212 l4e_propagate_from_guest(v, new_gl4e, _mfn(INVALID_MFN),
2213 sl3mfn, &new_sl4e, ft_prefetch);
2215 // check for updates to xen reserved slots
2216 if ( !shadow_mode_external(d) )
2218 int shadow_index = (((unsigned long)sl4p & ~PAGE_MASK) /
2219 sizeof(shadow_l4e_t));
2220 int reserved_xen_slot = !is_guest_l4_slot(d, shadow_index);
2222 if ( unlikely(reserved_xen_slot) )
2224 // attempt by the guest to write to a xen reserved slot
2225 //
2226 SHADOW_PRINTK("%s out-of-range update "
2227 "sl4mfn=%05lx index=0x%x val=%" SH_PRI_pte "\n",
2228 __func__, mfn_x(sl4mfn), shadow_index, new_sl4e.l4);
2229 if ( shadow_l4e_get_flags(new_sl4e) & _PAGE_PRESENT )
2231 SHADOW_ERROR("out-of-range l4e update\n");
2232 result |= SHADOW_SET_ERROR;
2235 // do not call shadow_set_l4e...
2236 return result;
2240 result |= shadow_set_l4e(v, sl4p, new_sl4e, sl4mfn);
2241 return result;
2245 static int validate_gl3e(struct vcpu *v, void *new_ge, mfn_t sl3mfn, void *se)
2247 shadow_l3e_t new_sl3e;
2248 guest_l3e_t *new_gl3e = new_ge;
2249 shadow_l3e_t *sl3p = se;
2250 mfn_t sl2mfn = _mfn(INVALID_MFN);
2251 int result = 0;
2253 perfc_incr(shadow_validate_gl3e_calls);
2255 if ( guest_l3e_get_flags(*new_gl3e) & _PAGE_PRESENT )
2257 gfn_t gl2gfn = guest_l3e_get_gfn(*new_gl3e);
2258 mfn_t gl2mfn = gfn_to_mfn(v->domain, gl2gfn);
2259 if ( mfn_valid(gl2mfn) )
2260 sl2mfn = get_shadow_status(v, gl2mfn, SH_type_l2_shadow);
2261 else
2262 result |= SHADOW_SET_ERROR;
2264 l3e_propagate_from_guest(v, new_gl3e, _mfn(INVALID_MFN),
2265 sl2mfn, &new_sl3e, ft_prefetch);
2266 result |= shadow_set_l3e(v, sl3p, new_sl3e, sl3mfn);
2268 return result;
2270 #endif // GUEST_PAGING_LEVELS >= 4
2272 static int validate_gl2e(struct vcpu *v, void *new_ge, mfn_t sl2mfn, void *se)
2274 shadow_l2e_t new_sl2e;
2275 guest_l2e_t *new_gl2e = new_ge;
2276 shadow_l2e_t *sl2p = se;
2277 mfn_t sl1mfn = _mfn(INVALID_MFN);
2278 int result = 0;
2280 perfc_incr(shadow_validate_gl2e_calls);
2282 if ( guest_l2e_get_flags(*new_gl2e) & _PAGE_PRESENT )
2284 gfn_t gl1gfn = guest_l2e_get_gfn(*new_gl2e);
2285 if ( guest_supports_superpages(v) &&
2286 (guest_l2e_get_flags(*new_gl2e) & _PAGE_PSE) )
2288 // superpage -- need to look up the shadow L1 which holds the
2289 // splinters...
2290 sl1mfn = get_fl1_shadow_status(v, gl1gfn);
2291 #if 0
2292 // XXX - it's possible that we want to do some kind of prefetch
2293 // for superpage fl1's here, but this is *not* on the demand path,
2294 // so we'll hold off trying that for now...
2295 //
2296 if ( !mfn_valid(sl1mfn) )
2297 sl1mfn = make_fl1_shadow(v, gl1gfn);
2298 #endif
2300 else
2302 mfn_t gl1mfn = gfn_to_mfn(v->domain, gl1gfn);
2303 if ( mfn_valid(gl1mfn) )
2304 sl1mfn = get_shadow_status(v, gl1mfn, SH_type_l1_shadow);
2305 else
2306 result |= SHADOW_SET_ERROR;
2309 l2e_propagate_from_guest(v, new_gl2e, _mfn(INVALID_MFN),
2310 sl1mfn, &new_sl2e, ft_prefetch);
2312 // check for updates to xen reserved slots in PV guests...
2313 // XXX -- need to revisit this for PV 3-on-4 guests.
2314 //
2315 #if SHADOW_PAGING_LEVELS < 4
2316 #if CONFIG_PAGING_LEVELS == SHADOW_PAGING_LEVELS
2317 if ( !shadow_mode_external(v->domain) )
2319 int shadow_index = (((unsigned long)sl2p & ~PAGE_MASK) /
2320 sizeof(shadow_l2e_t));
2321 int reserved_xen_slot;
2323 #if SHADOW_PAGING_LEVELS == 3
2324 reserved_xen_slot =
2325 ((mfn_to_shadow_page(sl2mfn)->type == SH_type_l2h_pae_shadow) &&
2326 (shadow_index
2327 >= (L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1))));
2328 #else /* SHADOW_PAGING_LEVELS == 2 */
2329 reserved_xen_slot = (shadow_index >= L2_PAGETABLE_FIRST_XEN_SLOT);
2330 #endif
2332 if ( unlikely(reserved_xen_slot) )
2334 // attempt by the guest to write to a xen reserved slot
2335 //
2336 SHADOW_PRINTK("%s out-of-range update "
2337 "sl2mfn=%05lx index=0x%x val=%" SH_PRI_pte "\n",
2338 __func__, mfn_x(sl2mfn), shadow_index, new_sl2e.l2);
2339 if ( shadow_l2e_get_flags(new_sl2e) & _PAGE_PRESENT )
2341 SHADOW_ERROR("out-of-range l2e update\n");
2342 result |= SHADOW_SET_ERROR;
2345 // do not call shadow_set_l2e...
2346 return result;
2349 #endif /* CONFIG_PAGING_LEVELS == SHADOW_PAGING_LEVELS */
2350 #endif /* SHADOW_PAGING_LEVELS < 4 */
2352 result |= shadow_set_l2e(v, sl2p, new_sl2e, sl2mfn);
2354 return result;
2357 static int validate_gl1e(struct vcpu *v, void *new_ge, mfn_t sl1mfn, void *se)
2359 shadow_l1e_t new_sl1e;
2360 guest_l1e_t *new_gl1e = new_ge;
2361 shadow_l1e_t *sl1p = se;
2362 gfn_t gfn;
2363 mfn_t gmfn;
2364 int result = 0, mmio;
2366 perfc_incr(shadow_validate_gl1e_calls);
2368 gfn = guest_l1e_get_gfn(*new_gl1e);
2369 gmfn = gfn_to_mfn(v->domain, gfn);
2371 mmio = (is_hvm_vcpu(v) && mmio_space(gfn_to_paddr(gfn)));
2372 l1e_propagate_from_guest(v, new_gl1e, _mfn(INVALID_MFN), gmfn, &new_sl1e,
2373 ft_prefetch, mmio);
2375 result |= shadow_set_l1e(v, sl1p, new_sl1e, sl1mfn);
2376 return result;
2380 /**************************************************************************/
2381 /* Functions which translate and install the shadows of arbitrary guest
2382 * entries that we have just seen the guest write. */
2385 static inline int
2386 sh_map_and_validate(struct vcpu *v, mfn_t gmfn,
2387 void *new_gp, u32 size, u32 sh_type,
2388 u32 (*shadow_index)(mfn_t *smfn, u32 idx),
2389 int (*validate_ge)(struct vcpu *v, void *ge,
2390 mfn_t smfn, void *se))
2391 /* Generic function for mapping and validating. */
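/* In outline: map the shadow of gmfn, then walk the range of guest
 * entries [new_gp, new_gp + size) calling validate_ge() on the
 * corresponding shadow entries, remapping whenever the walk crosses
 * into another page of a multi-page shadow. */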
2393 mfn_t smfn, smfn2, map_mfn;
2394 shadow_l1e_t *sl1p;
2395 u32 shadow_idx, guest_idx;
2396 int result = 0;
2398 /* Align address and size to guest entry boundaries */
2399 size += (unsigned long)new_gp & (sizeof (guest_l1e_t) - 1);
2400 new_gp = (void *)((unsigned long)new_gp & ~(sizeof (guest_l1e_t) - 1));
2401 size = (size + sizeof (guest_l1e_t) - 1) & ~(sizeof (guest_l1e_t) - 1);
2402 ASSERT(size + (((unsigned long)new_gp) & ~PAGE_MASK) <= PAGE_SIZE);
2404 /* Map the shadow page */
2405 smfn = get_shadow_status(v, gmfn, sh_type);
2406 ASSERT(mfn_valid(smfn)); /* Otherwise we would not have been called */
2407 guest_idx = guest_index(new_gp);
2408 map_mfn = smfn;
2409 shadow_idx = shadow_index(&map_mfn, guest_idx);
2410 sl1p = map_shadow_page(map_mfn);
2412 /* Validate one entry at a time */
2413 while ( size )
2415 smfn2 = smfn;
2416 guest_idx = guest_index(new_gp);
2417 shadow_idx = shadow_index(&smfn2, guest_idx);
2418 if ( mfn_x(smfn2) != mfn_x(map_mfn) )
2420 /* We have moved to another page of the shadow */
2421 map_mfn = smfn2;
2422 unmap_shadow_page(sl1p);
2423 sl1p = map_shadow_page(map_mfn);
2425 result |= validate_ge(v,
2426 new_gp,
2427 map_mfn,
2428 &sl1p[shadow_idx]);
2429 size -= sizeof(guest_l1e_t);
2430 new_gp += sizeof(guest_l1e_t);
2432 unmap_shadow_page(sl1p);
2433 return result;
2437 int
2438 sh_map_and_validate_gl4e(struct vcpu *v, mfn_t gl4mfn,
2439 void *new_gl4p, u32 size)
2441 #if GUEST_PAGING_LEVELS >= 4
2442 return sh_map_and_validate(v, gl4mfn, new_gl4p, size,
2443 SH_type_l4_shadow,
2444 shadow_l4_index,
2445 validate_gl4e);
2446 #else // ! GUEST_PAGING_LEVELS >= 4
2447 SHADOW_PRINTK("called in wrong paging mode!\n");
2448 BUG();
2449 return 0;
2450 #endif
2453 int
2454 sh_map_and_validate_gl3e(struct vcpu *v, mfn_t gl3mfn,
2455 void *new_gl3p, u32 size)
2457 #if GUEST_PAGING_LEVELS >= 4
2458 return sh_map_and_validate(v, gl3mfn, new_gl3p, size,
2459 SH_type_l3_shadow,
2460 shadow_l3_index,
2461 validate_gl3e);
2462 #else // ! GUEST_PAGING_LEVELS >= 4
2463 SHADOW_PRINTK("called in wrong paging mode!\n");
2464 BUG();
2465 return 0;
2466 #endif
2469 int
2470 sh_map_and_validate_gl2e(struct vcpu *v, mfn_t gl2mfn,
2471 void *new_gl2p, u32 size)
2473 return sh_map_and_validate(v, gl2mfn, new_gl2p, size,
2474 SH_type_l2_shadow,
2475 shadow_l2_index,
2476 validate_gl2e);
2479 int
2480 sh_map_and_validate_gl2he(struct vcpu *v, mfn_t gl2mfn,
2481 void *new_gl2p, u32 size)
2483 #if GUEST_PAGING_LEVELS >= 3
2484 return sh_map_and_validate(v, gl2mfn, new_gl2p, size,
2485 SH_type_l2h_shadow,
2486 shadow_l2_index,
2487 validate_gl2e);
2488 #else /* Non-PAE guests don't have different kinds of l2 table */
2489 SHADOW_PRINTK("called in wrong paging mode!\n");
2490 BUG();
2491 return 0;
2492 #endif
2495 int
2496 sh_map_and_validate_gl1e(struct vcpu *v, mfn_t gl1mfn,
2497 void *new_gl1p, u32 size)
2499 return sh_map_and_validate(v, gl1mfn, new_gl1p, size,
2500 SH_type_l1_shadow,
2501 shadow_l1_index,
2502 validate_gl1e);
2506 /**************************************************************************/
2507 /* Optimization: If we see two emulated writes of zeros to the same
2508 * page-table without another kind of page fault in between, we guess
2509 * that this is a batch of changes (for process destruction) and
2510 * unshadow the page so we don't take a pagefault on every entry. This
2511 * should also make finding writeable mappings of pagetables much
2512 * easier. */
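/* For example, a guest tearing down a process zeroes whole pagetable
 * pages; the first zeroing write is emulated and remembered in
 * last_emulated_mfn, and a second consecutive one triggers
 * sh_remove_shadows() so the rest of the writes hit an unshadowed
 * page directly. */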
2514 /* Look to see if this is the second emulated write in a row to this
2515 * page, and unshadow/unhook if it is */
2516 static inline void check_for_early_unshadow(struct vcpu *v, mfn_t gmfn)
2518 #if SHADOW_OPTIMIZATIONS & SHOPT_EARLY_UNSHADOW
2519 if ( v->arch.paging.shadow.last_emulated_mfn == mfn_x(gmfn) &&
2520 sh_mfn_is_a_page_table(gmfn) )
2522 u32 flags = mfn_to_page(gmfn)->shadow_flags;
2523 if ( !(flags & (SHF_L2_32|SHF_L2_PAE|SHF_L2H_PAE|SHF_L4_64)) )
2525 perfc_incr(shadow_early_unshadow);
2526 sh_remove_shadows(v, gmfn, 0, 0 /* Slow, can fail to unshadow */ );
2529 v->arch.paging.shadow.last_emulated_mfn = mfn_x(gmfn);
2530 #endif
2533 /* Stop counting towards early unshadows, as we've seen a real page fault */
2534 static inline void reset_early_unshadow(struct vcpu *v)
2536 #if SHADOW_OPTIMIZATIONS & SHOPT_EARLY_UNSHADOW
2537 v->arch.paging.shadow.last_emulated_mfn = INVALID_MFN;
2538 #endif
2543 /**************************************************************************/
2544 /* Optimization: Prefetch multiple L1 entries. This is called after we have
2545 * demand-faulted a shadow l1e in the fault handler, to see if it's
2546 * worth fetching some more.
2547 */
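/* The prefetcher below walks forward from the faulting entry, up to
 * PREFETCH_DISTANCE entries but never past the end of the shadow l1
 * page, and stops early at slots that are already shadowed or whose
 * guest entries would need another fault for A/D-bit updates anyway. */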
2549 #if SHADOW_OPTIMIZATIONS & SHOPT_PREFETCH
2551 /* XXX magic number */
2552 #define PREFETCH_DISTANCE 32
2554 static void sh_prefetch(struct vcpu *v, walk_t *gw,
2555 shadow_l1e_t *ptr_sl1e, mfn_t sl1mfn)
2557 int i, dist, mmio;
2558 gfn_t gfn;
2559 mfn_t gmfn;
2560 guest_l1e_t gl1e;
2561 shadow_l1e_t sl1e;
2562 u32 gflags;
2564 /* Prefetch no further than the end of the _shadow_ l1 MFN */
2565 dist = (PAGE_SIZE - ((unsigned long)ptr_sl1e & ~PAGE_MASK)) / sizeof sl1e;
2566 /* And no more than a maximum fetches-per-fault */
2567 if ( dist > PREFETCH_DISTANCE )
2568 dist = PREFETCH_DISTANCE;
2570 for ( i = 1; i < dist ; i++ )
2572 /* No point in prefetching if there's already a shadow */
2573 if ( ptr_sl1e[i].l1 != 0 )
2574 break;
2576 if ( gw->l1e )
2578 /* Normal guest page; grab the next guest entry */
2579 gl1e = gw->l1e[i];
2580 /* Not worth continuing if we hit an entry that will need another
2581 * fault for A/D-bit propagation anyway */
2582 gflags = guest_l1e_get_flags(gl1e);
2583 if ( (gflags & _PAGE_PRESENT)
2584 && (!(gflags & _PAGE_ACCESSED)
2585 || ((gflags & _PAGE_RW) && !(gflags & _PAGE_DIRTY))) )
2586 break;
2588 else
2590 /* Fragmented superpage, unless we've been called wrongly */
2591 ASSERT(guest_l2e_get_flags(*gw->l2e) & _PAGE_PSE);
2592 /* Increment the l1e's GFN by the right number of guest pages */
2593 gl1e = guest_l1e_from_gfn(
2594 _gfn(gfn_x(guest_l1e_get_gfn(gw->eff_l1e)) + i),
2595 guest_l1e_get_flags(gw->eff_l1e));
2598 /* Look at the gfn that the l1e is pointing at */
2599 gfn = guest_l1e_get_gfn(gl1e);
2600 gmfn = gfn_to_mfn(v->domain, gfn);
2601 mmio = ( is_hvm_vcpu(v) && mmio_space(gfn_to_paddr(gfn)) );
2603 /* Propagate the entry. Safe to use a pointer to our local
2604 * gl1e, since this is not a demand-fetch so there will be no
2605 * write-back to the guest. */
2606 l1e_propagate_from_guest(v, &gl1e, _mfn(INVALID_MFN),
2607 gmfn, &sl1e, ft_prefetch, mmio);
2608 (void) shadow_set_l1e(v, ptr_sl1e + i, sl1e, sl1mfn);
2612 #endif /* SHADOW_OPTIMIZATIONS & SHOPT_PREFETCH */
2615 /**************************************************************************/
2616 /* Entry points into the shadow code */
2618 /* Called from pagefault handler in Xen, and from the HVM trap handlers
2619 * for pagefaults. Returns 1 if this fault was an artefact of the
2620 * shadow code (and the guest should retry) or 0 if it is not (and the
2621 * fault should be handled elsewhere or passed to the guest). */
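/* Rough structure of the handler: try the reserved-bit fast path for
 * "magic" not-present/MMIO shadow entries; otherwise walk the guest
 * tables, reject faults the guest must handle itself (not-present,
 * permission, NX), build the shadow chain with
 * shadow_get_and_create_l1e(), write the new sl1e, and then divert to
 * emulation for pagetable writes or to handle_mmio() for MMIO. */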
2623 static int sh_page_fault(struct vcpu *v,
2624 unsigned long va,
2625 struct cpu_user_regs *regs)
2627 struct domain *d = v->domain;
2628 walk_t gw;
2629 u32 accumulated_gflags;
2630 gfn_t gfn;
2631 mfn_t gmfn, sl1mfn=_mfn(0);
2632 shadow_l1e_t sl1e, *ptr_sl1e;
2633 paddr_t gpa;
2634 struct sh_emulate_ctxt emul_ctxt;
2635 struct x86_emulate_ops *emul_ops;
2636 int r, mmio;
2637 fetch_type_t ft = 0;
2639 SHADOW_PRINTK("d:v=%u:%u va=%#lx err=%u\n",
2640 v->domain->domain_id, v->vcpu_id, va, regs->error_code);
2642 perfc_incr(shadow_fault);
2643 //
2644 // XXX: Need to think about eventually mapping superpages directly in the
2645 // shadow (when possible), as opposed to splintering them into a
2646 // bunch of 4K maps.
2647 //
2649 #if (SHADOW_OPTIMIZATIONS & SHOPT_FAST_FAULT_PATH) && SHADOW_PAGING_LEVELS > 2
2650 if ( (regs->error_code & PFEC_reserved_bit) )
2652 /* The only reasons for reserved bits to be set in shadow entries
2653 * are the two "magic" shadow_l1e entries. */
2654 if ( likely((__copy_from_user(&sl1e,
2655 (sh_linear_l1_table(v)
2656 + shadow_l1_linear_offset(va)),
2657 sizeof(sl1e)) == 0)
2658 && sh_l1e_is_magic(sl1e)) )
2660 if ( sh_l1e_is_gnp(sl1e) )
2662 /* Not-present in a guest PT: pass to the guest as
2663 * a not-present fault (by flipping two bits). */
2664 ASSERT(regs->error_code & PFEC_page_present);
2665 regs->error_code ^= (PFEC_reserved_bit|PFEC_page_present);
2666 reset_early_unshadow(v);
2667 perfc_incr(shadow_fault_fast_gnp);
2668 SHADOW_PRINTK("fast path not-present\n");
2669 return 0;
2671 else
2673 /* Magic MMIO marker: extract gfn for MMIO address */
2674 ASSERT(sh_l1e_is_mmio(sl1e));
2675 gpa = (((paddr_t)(gfn_x(sh_l1e_mmio_get_gfn(sl1e))))
2676 << PAGE_SHIFT)
2677 | (va & ~PAGE_MASK);
2679 perfc_incr(shadow_fault_fast_mmio);
2680 SHADOW_PRINTK("fast path mmio %#"PRIpaddr"\n", gpa);
2681 reset_early_unshadow(v);
2682 handle_mmio(gpa);
2683 return EXCRET_fault_fixed;
2685 else
2687 /* This should be exceptionally rare: another vcpu has fixed
2688 * the tables between the fault and our reading the l1e.
2689 * Retry and let the hardware give us the right fault next time. */
2690 perfc_incr(shadow_fault_fast_fail);
2691 SHADOW_PRINTK("fast path false alarm!\n");
2692 return EXCRET_fault_fixed;
2695 #endif /* SHOPT_FAST_FAULT_PATH */
2697 /* Detect if this page fault happened while we were already in Xen
2698 * doing a shadow operation. If that happens, the only thing we can
2699 * do is let Xen's normal fault handlers try to fix it. In any case,
2700 * a diagnostic trace of the fault will be more useful than
2701 * a BUG() when we try to take the lock again. */
2702 if ( unlikely(shadow_locked_by_me(d)) )
2704 SHADOW_ERROR("Recursive shadow fault: lock was taken by %s\n",
2705 d->arch.paging.shadow.locker_function);
2706 return 0;
2709 shadow_lock(d);
2711 shadow_audit_tables(v);
2713 if ( guest_walk_tables(v, va, &gw, 1) != 0 )
2715 SHADOW_PRINTK("malformed guest pagetable\n");
2716 print_gw(&gw);
2719 /* It's possible that the guest has put pagetables in memory that it has
2720 * already used for some special purpose (ioreq pages, or granted pages).
2721 * If that happens we'll have killed the guest already but it's still not
2722 * safe to propagate entries out of the guest PT so get out now. */
2723 if ( unlikely(d->is_shutting_down) )
2725 SHADOW_PRINTK("guest is shutting down\n");
2726 shadow_unlock(d);
2727 return 0;
2730 sh_audit_gw(v, &gw);
2732 // We do not look at the gw->l1e, as that will not exist for superpages.
2733 // Instead, we use the gw->eff_l1e...
2734 //
2735 // We need not check all the levels of the guest page table entries for
2736 // present vs not-present, as the eff_l1e will always be not present if
2737 // one of the higher level entries is not present.
2738 //
2739 if ( unlikely(!(guest_l1e_get_flags(gw.eff_l1e) & _PAGE_PRESENT)) )
2741 perfc_incr(shadow_fault_bail_not_present);
2742 goto not_a_shadow_fault;
2745 // All levels of the guest page table are now known to be present.
2746 accumulated_gflags = accumulate_guest_flags(v, &gw);
2748 // Check for attempts to access supervisor-only pages from user mode,
2749 // i.e. ring 3. Such errors are not caused or dealt with by the shadow
2750 // code.
2751 //
2752 if ( (regs->error_code & PFEC_user_mode) &&
2753 !(accumulated_gflags & _PAGE_USER) )
2755 /* illegal user-mode access to supervisor-only page */
2756 perfc_incr(shadow_fault_bail_user_supervisor);
2757 goto not_a_shadow_fault;
2760 // Was it a write fault?
2761 ft = ((regs->error_code & PFEC_write_access)
2762 ? ft_demand_write : ft_demand_read);
2763 if ( ft == ft_demand_write )
2765 if ( unlikely(!(accumulated_gflags & _PAGE_RW)) )
2767 perfc_incr(shadow_fault_bail_ro_mapping);
2768 goto not_a_shadow_fault;
2771 else // must have been either an insn fetch or read fault
2773 // Check for NX bit violations: attempts to execute code that is
2774 // marked "do not execute". Such errors are not caused or dealt with
2775 // by the shadow code.
2776 //
2777 if ( regs->error_code & PFEC_insn_fetch )
2779 if ( accumulated_gflags & _PAGE_NX_BIT )
2781 /* NX prevented this code fetch */
2782 perfc_incr(shadow_fault_bail_nx);
2783 goto not_a_shadow_fault;
2788 /* What mfn is the guest trying to access? */
2789 gfn = guest_l1e_get_gfn(gw.eff_l1e);
2790 gmfn = gfn_to_mfn(d, gfn);
2791 mmio = (is_hvm_domain(d) && mmio_space(gfn_to_paddr(gfn)));
2793 if ( !mmio && !mfn_valid(gmfn) )
2795 perfc_incr(shadow_fault_bail_bad_gfn);
2796 SHADOW_PRINTK("BAD gfn=%"SH_PRI_gfn" gmfn=%"PRI_mfn"\n",
2797 gfn_x(gfn), mfn_x(gmfn));
2798 goto not_a_shadow_fault;
2801 /* Make sure there is enough free shadow memory to build a chain of
2802 * shadow tables: one SHADOW_MAX_ORDER chunk will always be enough
2803 * to allocate all we need. (We never allocate a top-level shadow
2804 * on this path, only a 32b l1, pae l2+1 or 64b l3+2+1) */
2805 shadow_prealloc(d, SHADOW_MAX_ORDER);
2807 /* Acquire the shadow. This must happen before we figure out the rights
2808 * for the shadow entry, since we might promote a page here. */
2809 ptr_sl1e = shadow_get_and_create_l1e(v, &gw, &sl1mfn, ft);
2810 if ( unlikely(ptr_sl1e == NULL) )
2812 /* Couldn't get the sl1e! Since we know the guest entries
2813 * are OK, this can only have been caused by a failed
2814 * shadow_set_l*e(), which will have crashed the guest.
2815 * Get out of the fault handler immediately. */
2816 ASSERT(d->is_shutting_down);
2817 unmap_walk(v, &gw);
2818 shadow_unlock(d);
2819 return 0;
2822 /* Calculate the shadow entry and write it */
2823 l1e_propagate_from_guest(v, (gw.l1e) ? gw.l1e : &gw.eff_l1e, gw.l1mfn,
2824 gmfn, &sl1e, ft, mmio);
2825 r = shadow_set_l1e(v, ptr_sl1e, sl1e, sl1mfn);
2827 #if SHADOW_OPTIMIZATIONS & SHOPT_PREFETCH
2828 /* Prefetch some more shadow entries */
2829 sh_prefetch(v, &gw, ptr_sl1e, sl1mfn);
2830 #endif
2832 /* Need to emulate accesses to page tables */
2833 if ( sh_mfn_is_a_page_table(gmfn) )
2835 if ( ft == ft_demand_write )
2837 perfc_incr(shadow_fault_emulate_write);
2838 goto emulate;
2840 else if ( shadow_mode_trap_reads(d) && ft == ft_demand_read )
2842 perfc_incr(shadow_fault_emulate_read);
2843 goto emulate;
2847 if ( mmio )
2849 gpa = guest_walk_to_gpa(&gw);
2850 goto mmio;
2853 perfc_incr(shadow_fault_fixed);
2854 d->arch.paging.log_dirty.fault_count++;
2855 reset_early_unshadow(v);
2857 done:
2858 sh_audit_gw(v, &gw);
2859 unmap_walk(v, &gw);
2860 SHADOW_PRINTK("fixed\n");
2861 shadow_audit_tables(v);
2862 shadow_unlock(d);
2863 return EXCRET_fault_fixed;
2865 emulate:
2866 if ( !shadow_mode_refcounts(d) || !guest_mode(regs) )
2867 goto not_a_shadow_fault;
2869 /*
2870 * We do not emulate user writes. Instead we use them as a hint that the
2871 * page is no longer a page table. This behaviour differs from native, but
2872 * it seems very unlikely that any OS grants user access to page tables.
2873 */
2874 if ( (regs->error_code & PFEC_user_mode) )
2876 SHADOW_PRINTK("user-mode fault to PT, unshadowing mfn %#lx\n",
2877 mfn_x(gmfn));
2878 perfc_incr(shadow_fault_emulate_failed);
2879 sh_remove_shadows(v, gmfn, 0 /* thorough */, 1 /* must succeed */);
2880 goto done;
2883 if ( is_hvm_domain(d) )
2885 /*
2886 * If we are in the middle of injecting an exception or interrupt then
2887 * we should not emulate: it is not the instruction at %eip that caused
2888 * the fault. Furthermore it is almost certainly the case that the handler
2889 * stack is currently considered to be a page table, so we should
2890 * unshadow the faulting page before exiting.
2891 */
2892 if ( unlikely(hvm_event_pending(v)) )
2894 gdprintk(XENLOG_DEBUG, "write to pagetable during event "
2895 "injection: cr2=%#lx, mfn=%#lx\n",
2896 va, mfn_x(gmfn));
2897 sh_remove_shadows(v, gmfn, 0 /* thorough */, 1 /* must succeed */);
2898 goto done;
2901 hvm_store_cpu_guest_regs(v, regs, NULL);
2904 SHADOW_PRINTK("emulate: eip=%#lx esp=%#lx\n",
2905 (unsigned long)regs->eip, (unsigned long)regs->esp);
2907 /*
2908 * We don't need to hold the lock for the whole emulation; we will
2909 * take it again when we write to the pagetables.
2910 */
2911 sh_audit_gw(v, &gw);
2912 unmap_walk(v, &gw);
2913 shadow_audit_tables(v);
2914 shadow_unlock(d);
2916 emul_ops = shadow_init_emulation(&emul_ctxt, regs);
2918 r = x86_emulate(&emul_ctxt.ctxt, emul_ops);
2920 /*
2921 * NB. We do not unshadow on X86EMUL_EXCEPTION. It's not clear that it
2922 * would be a good unshadow hint. If we *do* decide to unshadow-on-fault
2923 * then it must be 'failable': we cannot require the unshadow to succeed.
2924 */
2925 if ( r == X86EMUL_UNHANDLEABLE )
2927 SHADOW_PRINTK("emulator failure, unshadowing mfn %#lx\n",
2928 mfn_x(gmfn));
2929 perfc_incr(shadow_fault_emulate_failed);
2930 /* If this is actually a page table, then we have a bug, and need
2931 * to support more operations in the emulator. More likely,
2932 * though, this is a hint that this page should not be shadowed. */
2933 shadow_remove_all_shadows(v, gmfn);
2936 #if GUEST_PAGING_LEVELS == 3 /* PAE guest */
2937 if ( r == X86EMUL_OKAY ) {
2938 int i;
2939 /* Emulate up to four extra instructions in the hope of catching
2940 * the "second half" of a 64-bit pagetable write. */
2941 for ( i = 0 ; i < 4 ; i++ )
2943 shadow_continue_emulation(&emul_ctxt, regs);
2944 v->arch.paging.last_write_was_pt = 0;
2945 r = x86_emulate(&emul_ctxt.ctxt, emul_ops);
2946 if ( r == X86EMUL_OKAY )
2948 if ( v->arch.paging.last_write_was_pt )
2950 perfc_incr(shadow_em_ex_pt);
2951 break; /* Don't emulate past the other half of the write */
2953 else
2954 perfc_incr(shadow_em_ex_non_pt);
2956 else
2958 perfc_incr(shadow_em_ex_fail);
2959 break; /* Don't emulate again if we failed! */
2963 #endif /* PAE guest */
2965 /* Emulator has changed the user registers: write back */
2966 if ( is_hvm_domain(d) )
2967 hvm_load_cpu_guest_regs(v, regs);
2969 SHADOW_PRINTK("emulated\n");
2970 return EXCRET_fault_fixed;
2972 mmio:
2973 if ( !guest_mode(regs) )
2974 goto not_a_shadow_fault;
2975 perfc_incr(shadow_fault_mmio);
2976 sh_audit_gw(v, &gw);
2977 unmap_walk(v, &gw);
2978 SHADOW_PRINTK("mmio %#"PRIpaddr"\n", gpa);
2979 shadow_audit_tables(v);
2980 reset_early_unshadow(v);
2981 shadow_unlock(d);
2982 handle_mmio(gpa);
2983 return EXCRET_fault_fixed;
2985 not_a_shadow_fault:
2986 sh_audit_gw(v, &gw);
2987 unmap_walk(v, &gw);
2988 SHADOW_PRINTK("not a shadow fault\n");
2989 shadow_audit_tables(v);
2990 reset_early_unshadow(v);
2991 shadow_unlock(d);
2992 return 0;
2996 static int
2997 sh_invlpg(struct vcpu *v, unsigned long va)
2998 /* Called when the guest requests an invlpg. Returns 1 if the invlpg
2999 * instruction should be issued on the hardware, or 0 if it's safe not
3000 * to do so. */
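/* Briefly: we return 0 whenever nothing is shadowed at this address
 * (so there is nothing stale to invalidate), and for splintered
 * superpages we flush the whole TLB ourselves and still return 0. */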
3002 shadow_l2e_t sl2e;
3004 perfc_incr(shadow_invlpg);
3006 #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
3007 /* No longer safe to use cached gva->gfn translations */
3008 vtlb_flush(v);
3009 #endif
3011 /* First check that we can safely read the shadow l2e. On SMP/PAE linux,
3012 * as many as 6% of invlpg calls can arrive before we have shadowed the
3013 * relevant l2. */
3014 #if SHADOW_PAGING_LEVELS == 4
3016 shadow_l3e_t sl3e;
3017 if ( !(shadow_l4e_get_flags(
3018 sh_linear_l4_table(v)[shadow_l4_linear_offset(va)])
3019 & _PAGE_PRESENT) )
3020 return 0;
3021 /* This must still be a copy-from-user because we don't have the
3022 * shadow lock, and the higher-level shadows might disappear
3023 * under our feet. */
3024 if ( __copy_from_user(&sl3e, (sh_linear_l3_table(v)
3025 + shadow_l3_linear_offset(va)),
3026 sizeof (sl3e)) != 0 )
3028 perfc_incr(shadow_invlpg_fault);
3029 return 0;
3031 if ( !(shadow_l3e_get_flags(sl3e) & _PAGE_PRESENT) )
3032 return 0;
3034 #elif SHADOW_PAGING_LEVELS == 3
3035 if ( !(l3e_get_flags(v->arch.paging.shadow.l3table[shadow_l3_linear_offset(va)])
3036 & _PAGE_PRESENT) )
3037 // no need to flush anything if there's no SL2...
3038 return 0;
3039 #endif
3041 /* This must still be a copy-from-user because we don't have the shadow
3042 * lock, and the higher-level shadows might disappear under our feet. */
3043 if ( __copy_from_user(&sl2e,
3044 sh_linear_l2_table(v) + shadow_l2_linear_offset(va),
3045 sizeof (sl2e)) != 0 )
3047 perfc_incr(shadow_invlpg_fault);
3048 return 0;
3051 // If there's nothing shadowed for this particular sl2e, then
3052 // there is no need to do an invlpg, either...
3053 //
3054 if ( !(shadow_l2e_get_flags(sl2e) & _PAGE_PRESENT) )
3055 return 0;
3057 // Check to see if the SL2 is a splintered superpage...
3058 // If so, then we'll need to flush the entire TLB (because that's
3059 // easier than invalidating all of the individual 4K pages).
3060 //
3061 if ( mfn_to_shadow_page(shadow_l2e_get_mfn(sl2e))->type
3062 == SH_type_fl1_shadow )
3064 local_flush_tlb();
3065 return 0;
3068 return 1;
3072 static unsigned long
3073 sh_gva_to_gfn(struct vcpu *v, unsigned long va)
3074 /* Called to translate a guest virtual address to what the *guest*
3075 * pagetables would map it to. */
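/* With SHOPT_VIRTUAL_TLB enabled this consults the per-vcpu software
 * TLB first and fills it on a miss, so repeated translations of the
 * same page avoid a full guest pagetable walk. */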
3077 walk_t gw;
3078 gfn_t gfn;
3080 #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
3081 struct shadow_vtlb t = {0};
3082 if ( vtlb_lookup(v, va, &t) )
3083 return t.frame_number;
3084 #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */
3086 guest_walk_tables(v, va, &gw, 0);
3087 gfn = guest_walk_to_gfn(&gw);
3089 #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
3090 t.page_number = va >> PAGE_SHIFT;
3091 t.frame_number = gfn_x(gfn);
3092 t.flags = accumulate_guest_flags(v, &gw);
3093 vtlb_insert(v, t);
3094 #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */
3096 unmap_walk(v, &gw);
3097 return gfn_x(gfn);
3101 static inline void
3102 sh_update_linear_entries(struct vcpu *v)
3103 /* Sync up all the linear mappings for this vcpu's pagetables */
3105 struct domain *d = v->domain;
3107 /* Linear pagetables in PV guests
3108 * ------------------------------
3110 * Guest linear pagetables, which map the guest pages, are at
3111 * LINEAR_PT_VIRT_START. Shadow linear pagetables, which map the
3112 * shadows, are at SH_LINEAR_PT_VIRT_START. Most of the time these
3113 * are set up at shadow creation time, but (of course!) the PAE case
3114 * is subtler. Normal linear mappings are made by having an entry
3115 * in the top-level table that points to itself (shadow linear) or
3116 * to the guest top-level table (guest linear). For PAE, to set up
3117 * a linear map requires us to copy the four top-level entries into
3118 * level-2 entries. That means that every time we change a PAE l3e,
3119 * we need to reflect the change into the copy.
3121 * Linear pagetables in HVM guests
3122 * -------------------------------
3124 * For HVM guests, the linear pagetables are installed in the monitor
3125 * tables (since we can't put them in the shadow). Shadow linear
3126 * pagetables, which map the shadows, are at SH_LINEAR_PT_VIRT_START,
3127 * and we use the linear pagetable slot at LINEAR_PT_VIRT_START for
3128 * a linear pagetable of the monitor tables themselves. We have
3129 * the same issue of having to re-copy PAE l3 entries whenever we use
3130 * PAE shadows.
3132 * Because HVM guests run on the same monitor tables regardless of the
3133 * shadow tables in use, the linear mapping of the shadow tables has to
3134 * be updated every time v->arch.shadow_table changes.
3135 */
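/* The cases below: 4-level shadows need only the single l4e that
 * points at the shadow refreshed; 3-on-4 (HVM) copies the four shadow
 * l3es into the monitor l3 hooked at slot 0 of the monitor l4;
 * 3-level copies the guest (PV only) and shadow l3es into the high l2
 * slots; 2-level mirrors the 4-level case with a single l2e. */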
3137 /* Don't try to update the monitor table if it doesn't exist */
3138 if ( shadow_mode_external(d)
3139 && pagetable_get_pfn(v->arch.monitor_table) == 0 )
3140 return;
3142 #if (CONFIG_PAGING_LEVELS == 4) && (SHADOW_PAGING_LEVELS == 4)
3144 /* For PV, one l4e points at the guest l4, one points at the shadow
3145 * l4. No maintenance required.
3146 * For HVM, just need to update the l4e that points to the shadow l4. */
3148 if ( shadow_mode_external(d) )
3150 /* Use the linear map if we can; otherwise make a new mapping */
3151 if ( v == current )
3153 __linear_l4_table[l4_linear_offset(SH_LINEAR_PT_VIRT_START)] =
3154 l4e_from_pfn(pagetable_get_pfn(v->arch.shadow_table[0]),
3155 __PAGE_HYPERVISOR);
3157 else
3159 l4_pgentry_t *ml4e;
3160 ml4e = sh_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
3161 ml4e[l4_table_offset(SH_LINEAR_PT_VIRT_START)] =
3162 l4e_from_pfn(pagetable_get_pfn(v->arch.shadow_table[0]),
3163 __PAGE_HYPERVISOR);
3164 sh_unmap_domain_page(ml4e);
3168 #elif (CONFIG_PAGING_LEVELS == 4) && (SHADOW_PAGING_LEVELS == 3)
3170 /* PV: XXX
3172 * HVM: To give ourselves a linear map of the shadows, we need to
3173 * extend a PAE shadow to 4 levels. We do this by having a monitor
3174 * l3 in slot 0 of the monitor l4 table, and copying the PAE l3
3175 * entries into it. Then, by having the monitor l4e for shadow
3176 * pagetables also point to the monitor l4, we can use it to access
3177 * the shadows.
3178 */
3180 if ( shadow_mode_external(d) )
3182 /* Install copies of the shadow l3es into the monitor l3 table.
3183 * The monitor l3 table is hooked into slot 0 of the monitor
3184 * l4 table, so we use l3 linear indices 0 to 3 */
3185 shadow_l3e_t *sl3e;
3186 l3_pgentry_t *ml3e;
3187 mfn_t l3mfn;
3188 int i;
3190 /* Use linear mappings if we can; otherwise make new mappings */
3191 if ( v == current )
3193 ml3e = __linear_l3_table;
3194 l3mfn = _mfn(l4e_get_pfn(__linear_l4_table[0]));
3196 else
3198 l4_pgentry_t *ml4e;
3199 ml4e = sh_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
3200 ASSERT(l4e_get_flags(ml4e[0]) & _PAGE_PRESENT);
3201 l3mfn = _mfn(l4e_get_pfn(ml4e[0]));
3202 ml3e = sh_map_domain_page(l3mfn);
3203 sh_unmap_domain_page(ml4e);
3206 /* Shadow l3 tables are built by sh_update_cr3 */
3207 sl3e = v->arch.paging.shadow.l3table;
3209 for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
3211 ml3e[i] =
3212 (shadow_l3e_get_flags(sl3e[i]) & _PAGE_PRESENT)
3213 ? l3e_from_pfn(mfn_x(shadow_l3e_get_mfn(sl3e[i])),
3214 __PAGE_HYPERVISOR)
3215 : l3e_empty();
3218 if ( v != current )
3219 sh_unmap_domain_page(ml3e);
3221 else
3222 domain_crash(d); /* XXX */
3224 #elif CONFIG_PAGING_LEVELS == 3
3226 /* PV: need to copy the guest's l3 entries into the guest-linear-map l2
3227 * entries in the shadow, and the shadow's l3 entries into the
3228 * shadow-linear-map l2 entries in the shadow. This is safe to do
3229 * because Xen does not let guests share high-slot l2 tables between l3s,
3230 * so we know we're not treading on anyone's toes.
3232 * HVM: need to copy the shadow's l3 entries into the
3233 * shadow-linear-map l2 entries in the monitor table. This is safe
3234 * because we have one monitor table for each vcpu. The monitor's
3235 * own l3es don't need to be copied because they never change.
3236 * XXX That might change if we start stuffing things into the rest
3237 * of the monitor's virtual address space.
3238 */
3240 l2_pgentry_t *l2e, new_l2e;
3241 shadow_l3e_t *guest_l3e = NULL, *shadow_l3e;
3242 int i;
3243 int unmap_l2e = 0;
3245 #if GUEST_PAGING_LEVELS == 2
3247 /* Shadow l3 tables were built by sh_update_cr3 */
3248 BUG_ON(!shadow_mode_external(d)); /* PV 2-on-3 is unsupported */
3249 shadow_l3e = (shadow_l3e_t *)&v->arch.paging.shadow.l3table;
3251 #else /* GUEST_PAGING_LEVELS == 3 */
3253 shadow_l3e = (shadow_l3e_t *)&v->arch.paging.shadow.l3table;
3254 guest_l3e = (guest_l3e_t *)&v->arch.paging.shadow.gl3e;
3256 #endif /* GUEST_PAGING_LEVELS */
3258 /* Choose where to write the entries, using linear maps if possible */
3259 if ( shadow_mode_external(d) )
3261 if ( v == current )
3263 /* From the monitor tables, it's safe to use linear maps
3264 * to update monitor l2s */
3265 l2e = __linear_l2_table + (3 * L2_PAGETABLE_ENTRIES);
3267 else
3269 /* Map the monitor table's high l2 */
3270 l3_pgentry_t *l3e;
3271 l3e = sh_map_domain_page(
3272 pagetable_get_mfn(v->arch.monitor_table));
3273 ASSERT(l3e_get_flags(l3e[3]) & _PAGE_PRESENT);
3274 l2e = sh_map_domain_page(_mfn(l3e_get_pfn(l3e[3])));
3275 unmap_l2e = 1;
3276 sh_unmap_domain_page(l3e);
3279 else
3281 /* Map the shadow table's high l2 */
3282 ASSERT(shadow_l3e_get_flags(shadow_l3e[3]) & _PAGE_PRESENT);
3283 l2e = sh_map_domain_page(shadow_l3e_get_mfn(shadow_l3e[3]));
3284 unmap_l2e = 1;
3287 /* Write linear mapping of guest (only in PV, and only when
3288 * not translated). */
3289 if ( !shadow_mode_translate(d) )
3291 for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
3293 new_l2e =
3294 ((shadow_l3e_get_flags(guest_l3e[i]) & _PAGE_PRESENT)
3295 ? l2e_from_pfn(mfn_x(shadow_l3e_get_mfn(guest_l3e[i])),
3296 __PAGE_HYPERVISOR)
3297 : l2e_empty());
3298 safe_write_entry(
3299 &l2e[l2_table_offset(LINEAR_PT_VIRT_START) + i],
3300 &new_l2e);
3304 /* Write linear mapping of shadow. */
3305 for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
3307 new_l2e = (shadow_l3e_get_flags(shadow_l3e[i]) & _PAGE_PRESENT)
3308 ? l2e_from_pfn(mfn_x(shadow_l3e_get_mfn(shadow_l3e[i])),
3309 __PAGE_HYPERVISOR)
3310 : l2e_empty();
3311 safe_write_entry(
3312 &l2e[l2_table_offset(SH_LINEAR_PT_VIRT_START) + i],
3313 &new_l2e);
3316 if ( unmap_l2e )
3317 sh_unmap_domain_page(l2e);
3320 #elif CONFIG_PAGING_LEVELS == 2
3322 /* For PV, one l2e points at the guest l2, one points at the shadow
3323 * l2. No maintenance required.
3324 * For HVM, just need to update the l2e that points to the shadow l2. */
3326 if ( shadow_mode_external(d) )
3328 /* Use the linear map if we can; otherwise make a new mapping */
3329 if ( v == current )
3331 __linear_l2_table[l2_linear_offset(SH_LINEAR_PT_VIRT_START)] =
3332 l2e_from_pfn(pagetable_get_pfn(v->arch.shadow_table[0]),
3333 __PAGE_HYPERVISOR);
3335 else
3337 l2_pgentry_t *ml2e;
3338 ml2e = sh_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
3339 ml2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
3340 l2e_from_pfn(pagetable_get_pfn(v->arch.shadow_table[0]),
3341 __PAGE_HYPERVISOR);
3342 sh_unmap_domain_page(ml2e);
3346 #else
3347 #error this should not happen
3348 #endif
3352 /* Removes vcpu->arch.paging.shadow.guest_vtable and vcpu->arch.shadow_table[].
3353 * Does all appropriate management/bookkeeping/refcounting/etc...
3354 */
3355 static void
3356 sh_detach_old_tables(struct vcpu *v)
3358 mfn_t smfn;
3359 int i = 0;
3361 ////
3362 //// vcpu->arch.paging.shadow.guest_vtable
3363 ////
3365 #if GUEST_PAGING_LEVELS == 3
3366 /* PAE guests don't have a mapping of the guest top-level table */
3367 ASSERT(v->arch.paging.shadow.guest_vtable == NULL);
3368 #else
3369 if ( v->arch.paging.shadow.guest_vtable )
3371 struct domain *d = v->domain;
3372 if ( shadow_mode_external(d) || shadow_mode_translate(d) )
3373 sh_unmap_domain_page_global(v->arch.paging.shadow.guest_vtable);
3374 v->arch.paging.shadow.guest_vtable = NULL;
3376 #endif
3379 ////
3380 //// vcpu->arch.shadow_table[]
3381 ////
3383 #if GUEST_PAGING_LEVELS == 3
3384 /* PAE guests have four shadow_table entries */
3385 for ( i = 0 ; i < 4 ; i++ )
3386 #endif
3388 smfn = pagetable_get_mfn(v->arch.shadow_table[i]);
3389 if ( mfn_x(smfn) )
3390 sh_put_ref(v, smfn, 0);
3391 v->arch.shadow_table[i] = pagetable_null();
3395 /* Set up the top-level shadow and install it in slot 'slot' of shadow_table */
3396 static void
3397 sh_set_toplevel_shadow(struct vcpu *v,
3398 int slot,
3399 mfn_t gmfn,
3400 unsigned int root_type)
3402 mfn_t smfn;
3403 pagetable_t old_entry, new_entry;
3405 struct domain *d = v->domain;
3407 /* Remember the old contents of this slot */
3408 old_entry = v->arch.shadow_table[slot];
3410 /* Now figure out the new contents: is this a valid guest MFN? */
3411 if ( !mfn_valid(gmfn) )
3413 new_entry = pagetable_null();
3414 goto install_new_entry;
3417 /* Guest mfn is valid: shadow it and install the shadow */
3418 smfn = get_shadow_status(v, gmfn, root_type);
3419 if ( !mfn_valid(smfn) )
3421 /* Make sure there's enough free shadow memory. */
3422 shadow_prealloc(d, SHADOW_MAX_ORDER);
3423 /* Shadow the page. */
3424 smfn = sh_make_shadow(v, gmfn, root_type);
3426 ASSERT(mfn_valid(smfn));
3428 #if SHADOW_OPTIMIZATIONS & SHOPT_EARLY_UNSHADOW
3429 /* Once again OK to unhook entries from this table if we see fork/exit */
3430 ASSERT(sh_mfn_is_a_page_table(gmfn));
3431 mfn_to_page(gmfn)->shadow_flags &= ~SHF_unhooked_mappings;
3432 #endif
3434 /* Pin the shadow and put it (back) on the list of pinned shadows */
3435 if ( sh_pin(v, smfn) == 0 )
3437 SHADOW_ERROR("can't pin %#lx as toplevel shadow\n", mfn_x(smfn));
3438 domain_crash(v->domain);
3441 /* Take a ref to this page: it will be released in sh_detach_old_tables()
3442 * or the next call to sh_set_toplevel_shadow() */
3443 if ( !sh_get_ref(v, smfn, 0) )
3445 SHADOW_ERROR("can't install %#lx as toplevel shadow\n", mfn_x(smfn));
3446 domain_crash(v->domain);
3449 new_entry = pagetable_from_mfn(smfn);
3451 install_new_entry:
3452 /* Done. Install it */
3453 SHADOW_PRINTK("%u/%u [%u] gmfn %#"PRI_mfn" smfn %#"PRI_mfn"\n",
3454 GUEST_PAGING_LEVELS, SHADOW_PAGING_LEVELS, slot,
3455 mfn_x(gmfn), mfn_x(pagetable_get_mfn(new_entry)));
3456 v->arch.shadow_table[slot] = new_entry;
3458 /* Decrement the refcount of the old contents of this slot */
3459 if ( !pagetable_is_null(old_entry) )
3460 sh_put_ref(v, pagetable_get_mfn(old_entry), 0);
3464 static void
3465 sh_update_cr3(struct vcpu *v, int do_locking)
3466 /* Updates vcpu->arch.cr3 after the guest has changed CR3.
3467 * Paravirtual guests should set v->arch.guest_table (and guest_table_user,
3468 * if appropriate).
3469 * HVM guests should also make sure hvm_get_guest_cntl_reg(v, 3) works;
3470 * this function will call hvm_update_guest_cr(v, 3) to tell them where the
3471 * shadow tables are.
3472 * If do_locking != 0, assume we are being called from outside the
3473 * shadow code, and must take and release the shadow lock; otherwise
3474 * that is the caller's responsibility.
3475 */
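/* In outline: pick up the guest's (possibly new) top-level table,
 * cache the four PAE l3es on 3-level guests, then install fresh
 * top-level shadows in v->arch.shadow_table[] and keep the linear
 * mappings of the shadows in sync (see sh_update_linear_entries()
 * above). */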
3477 struct domain *d = v->domain;
3478 mfn_t gmfn;
3479 #if GUEST_PAGING_LEVELS == 3
3480 guest_l3e_t *gl3e;
3481 u32 guest_idx=0;
3482 int i;
3483 #endif
3485 /* Don't do anything on an uninitialised vcpu */
3486 if ( !is_hvm_domain(d) && !v->is_initialised )
3488 ASSERT(v->arch.cr3 == 0);
3489 return;
3492 if ( do_locking ) shadow_lock(v->domain);
3494 ASSERT(shadow_locked_by_me(v->domain));
3495 ASSERT(v->arch.paging.mode);
3497 ////
3498 //// vcpu->arch.guest_table is already set
3499 ////
3501 #ifndef NDEBUG
3502 /* Double-check that the HVM code has sent us a sane guest_table */
3503 if ( is_hvm_domain(d) )
3505 gfn_t gfn;
3507 ASSERT(shadow_mode_external(d));
3509 // Is paging enabled on this vcpu?
3510 if ( hvm_paging_enabled(v) )
3512 gfn = _gfn(paddr_to_pfn(v->arch.hvm_vcpu.guest_cr[3]));
3513 gmfn = gfn_to_mfn(d, gfn);
3514 ASSERT(mfn_valid(gmfn));
3515 ASSERT(pagetable_get_pfn(v->arch.guest_table) == mfn_x(gmfn));
3517 else
3519 /* Paging disabled: guest_table points at a 32-bit 1-to-1 map */
3520 ASSERT(v->arch.guest_table.pfn
3521 == d->arch.paging.shadow.unpaged_pagetable.pfn);
3524 #endif
3526 SHADOW_PRINTK("d=%u v=%u guest_table=%05lx\n",
3527 d->domain_id, v->vcpu_id,
3528 (unsigned long)pagetable_get_pfn(v->arch.guest_table));
3530 #if GUEST_PAGING_LEVELS == 4
3531 if ( !(v->arch.flags & TF_kernel_mode) && !is_pv_32on64_vcpu(v) )
3532 gmfn = pagetable_get_mfn(v->arch.guest_table_user);
3533 else
3534 #endif
3535 gmfn = pagetable_get_mfn(v->arch.guest_table);
3538 ////
3539 //// vcpu->arch.paging.shadow.guest_vtable
3540 ////
3541 #if GUEST_PAGING_LEVELS == 4
3542 if ( shadow_mode_external(d) || shadow_mode_translate(d) )
3544 if ( v->arch.paging.shadow.guest_vtable )
3545 sh_unmap_domain_page_global(v->arch.paging.shadow.guest_vtable);
3546 v->arch.paging.shadow.guest_vtable = sh_map_domain_page_global(gmfn);
3547 /* PAGING_LEVELS==4 implies 64-bit, which means that
3548 * map_domain_page_global can't fail */
3549 BUG_ON(v->arch.paging.shadow.guest_vtable == NULL);
3551 else
3552 v->arch.paging.shadow.guest_vtable = __linear_l4_table;
3553 #elif GUEST_PAGING_LEVELS == 3
3554 /* On PAE guests we don't use a mapping of the guest's own top-level
3555 * table. We cache the current state of that table and shadow that,
3556 * until the next CR3 write makes us refresh our cache. */
3557 ASSERT(v->arch.paging.shadow.guest_vtable == NULL);
3559 if ( shadow_mode_external(d) )
3560 /* Find where in the page the l3 table is */
3561 guest_idx = guest_index((void *)v->arch.hvm_vcpu.guest_cr[3]);
3562 else
3563 /* PV guest: l3 is at the start of a page */
3564 guest_idx = 0;
3566 // Ignore the low 2 bits of guest_idx -- they are really just
3567 // cache control.
3568 guest_idx &= ~3;
3570 gl3e = ((guest_l3e_t *)sh_map_domain_page(gmfn)) + guest_idx;
3571 for ( i = 0; i < 4 ; i++ )
3572 v->arch.paging.shadow.gl3e[i] = gl3e[i];
3573 sh_unmap_domain_page(gl3e);
3574 #elif GUEST_PAGING_LEVELS == 2
3575 if ( shadow_mode_external(d) || shadow_mode_translate(d) )
3577 if ( v->arch.paging.shadow.guest_vtable )
3578 sh_unmap_domain_page_global(v->arch.paging.shadow.guest_vtable);
3579 v->arch.paging.shadow.guest_vtable = sh_map_domain_page_global(gmfn);
3580 /* Does this really need map_domain_page_global? Handle the
3581 * error properly if so. */
3582 BUG_ON(v->arch.paging.shadow.guest_vtable == NULL); /* XXX */
3584 else
3585 v->arch.paging.shadow.guest_vtable = __linear_l2_table;
3586 #else
3587 #error this should never happen
3588 #endif
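/* For PAE guests the four guest l3es are copied into
 * v->arch.paging.shadow.gl3e rather than kept mapped; that cached copy is
 * what gets shadowed, and it is refreshed only here, on a CR3 write.  The
 * low two bits of guest_idx are masked off above because they come from
 * CR3's cache-control bits (PWT/PCD), not from the table address. */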
3590 #if 0
3591 printk("%s %s %d gmfn=%05lx shadow.guest_vtable=%p\n",
3592 __func__, __FILE__, __LINE__, gmfn, v->arch.paging.shadow.guest_vtable);
3593 #endif
3595 ////
3596 //// vcpu->arch.shadow_table[]
3597 ////
3599 /* We revoke write access to the new guest toplevel page(s) before we
3600 * replace the old shadow pagetable(s), so that we can safely use the
3601 * (old) shadow linear maps in the writeable mapping heuristics. */
3602 #if GUEST_PAGING_LEVELS == 2
3603 if ( sh_remove_write_access(v, gmfn, 2, 0) != 0 )
3604 flush_tlb_mask(v->domain->domain_dirty_cpumask);
3605 sh_set_toplevel_shadow(v, 0, gmfn, SH_type_l2_shadow);
3606 #elif GUEST_PAGING_LEVELS == 3
3607 /* PAE guests have four shadow_table entries, based on the
3608 * current values of the guest's four l3es. */
3610 int flush = 0;
3611 gfn_t gl2gfn;
3612 mfn_t gl2mfn;
3613 guest_l3e_t *gl3e = (guest_l3e_t*)&v->arch.paging.shadow.gl3e;
3614 /* First, make all four entries read-only. */
3615 for ( i = 0; i < 4; i++ )
3617 if ( guest_l3e_get_flags(gl3e[i]) & _PAGE_PRESENT )
3619 gl2gfn = guest_l3e_get_gfn(gl3e[i]);
3620 gl2mfn = gfn_to_mfn(d, gl2gfn);
3621 flush |= sh_remove_write_access(v, gl2mfn, 2, 0);
3624 if ( flush )
3625 flush_tlb_mask(v->domain->domain_dirty_cpumask);
3626 /* Now install the new shadows. */
3627 for ( i = 0; i < 4; i++ )
3629 if ( guest_l3e_get_flags(gl3e[i]) & _PAGE_PRESENT )
3631 gl2gfn = guest_l3e_get_gfn(gl3e[i]);
3632 gl2mfn = gfn_to_mfn(d, gl2gfn);
3633 sh_set_toplevel_shadow(v, i, gl2mfn, (i == 3)
3634 ? SH_type_l2h_shadow
3635 : SH_type_l2_shadow);
3637 else
3638 /* This guest l3e is not present: clear out the shadow slot. */
3639 sh_set_toplevel_shadow(v, i, _mfn(INVALID_MFN), 0);
3642 #elif GUEST_PAGING_LEVELS == 4
3643 if ( sh_remove_write_access(v, gmfn, 4, 0) != 0 )
3644 flush_tlb_mask(v->domain->domain_dirty_cpumask);
3645 sh_set_toplevel_shadow(v, 0, gmfn, SH_type_l4_shadow);
3646 #else
3647 #error This should never happen
3648 #endif
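/* Note the ordering above: write access to the new guest top-level page(s)
 * is revoked (with at most one TLB flush) before any shadow_table[] slot is
 * replaced, and for PAE guests all four l2 pages are made read-only in a
 * first pass so that one flush covers them all. */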
3650 #if (CONFIG_PAGING_LEVELS == 3) && (GUEST_PAGING_LEVELS == 3)
3651 #endif
3653 ///
3654 /// v->arch.paging.shadow.l3table
3655 ///
3656 #if SHADOW_PAGING_LEVELS == 3
3658 mfn_t smfn;
3659 int i;
3660 for ( i = 0; i < 4; i++ )
3662 #if GUEST_PAGING_LEVELS == 2
3663 /* 2-on-3: make a PAE l3 that points at the four-page l2 */
3664 smfn = _mfn(pagetable_get_pfn(v->arch.shadow_table[0]) + i);
3665 #else
3666 /* 3-on-3: make a PAE l3 that points at the four l2 pages */
3667 smfn = pagetable_get_mfn(v->arch.shadow_table[i]);
3668 #endif
3669 v->arch.paging.shadow.l3table[i] =
3670 (mfn_x(smfn) == 0)
3671 ? shadow_l3e_empty()
3672 : shadow_l3e_from_mfn(smfn, _PAGE_PRESENT);
3675 #endif /* SHADOW_PAGING_LEVELS == 3 */
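/* The fabricated l3table lives in the vcpu structure rather than in a
 * shadow page: for 2-on-3 its four entries point at the four consecutive
 * pages of the single (4-page) l2 shadow, while for 3-on-3 they point at
 * the four separate l2 shadows held in shadow_table[]. */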
3678 ///
3679 /// v->arch.cr3
3680 ///
3681 if ( shadow_mode_external(d) )
3683 make_cr3(v, pagetable_get_pfn(v->arch.monitor_table));
3685 else // not shadow_mode_external...
3687 /* We don't support PV except guest == shadow == config levels */
3688 BUG_ON(GUEST_PAGING_LEVELS != SHADOW_PAGING_LEVELS);
3689 #if SHADOW_PAGING_LEVELS == 3
3690 /* 2-on-3 or 3-on-3: Use the PAE shadow l3 table we just fabricated.
3691 * Don't use make_cr3 because (a) we know it's below 4GB, and
3692 * (b) it's not necessarily page-aligned, and make_cr3 takes a pfn */
3693 ASSERT(virt_to_maddr(&v->arch.paging.shadow.l3table) <= 0xffffffe0ULL);
3694 v->arch.cr3 = virt_to_maddr(&v->arch.paging.shadow.l3table);
3695 #else
3696 /* 2-on-2 or 4-on-4: Just use the shadow top-level directly */
3697 make_cr3(v, pagetable_get_pfn(v->arch.shadow_table[0]));
3698 #endif
3702 ///
3703 /// v->arch.hvm_vcpu.hw_cr[3]
3704 ///
3705 if ( shadow_mode_external(d) )
3707 ASSERT(is_hvm_domain(d));
3708 #if SHADOW_PAGING_LEVELS == 3
3709 /* 2-on-3 or 3-on-3: Use the PAE shadow l3 table we just fabricated */
3710 v->arch.hvm_vcpu.hw_cr[3] =
3711 virt_to_maddr(&v->arch.paging.shadow.l3table);
3712 #else
3713 /* 2-on-2 or 4-on-4: Just use the shadow top-level directly */
3714 v->arch.hvm_vcpu.hw_cr[3] =
3715 pagetable_get_paddr(v->arch.shadow_table[0]);
3716 #endif
3717 hvm_update_guest_cr(v, 3);
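/* In external (HVM) mode v->arch.cr3 holds the monitor table, while
 * hw_cr[3] holds what the guest actually runs on -- the fabricated PAE l3
 * table or the top-level shadow -- and hvm_update_guest_cr() pushes that
 * value down to the hardware-specific HVM code. */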
3720 /* Fix up the linear pagetable mappings */
3721 sh_update_linear_entries(v);
3723 #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
3724 /* No longer safe to use cached gva->gfn translations */
3725 vtlb_flush(v);
3726 #endif
3728 /* Release the lock, if we took it (otherwise it's the caller's problem) */
3729 if ( do_locking ) shadow_unlock(v->domain);
3733 /**************************************************************************/
3734 /* Functions to revoke guest rights */
3736 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
3737 static int sh_guess_wrmap(struct vcpu *v, unsigned long vaddr, mfn_t gmfn)
3738 /* Look up this vaddr in the current shadow and see if it's a writeable
3739 * mapping of this gmfn. If so, remove it. Returns 1 if it worked. */
3741 shadow_l1e_t sl1e, *sl1p;
3742 shadow_l2e_t *sl2p;
3743 #if SHADOW_PAGING_LEVELS >= 3
3744 shadow_l3e_t *sl3p;
3745 #if SHADOW_PAGING_LEVELS >= 4
3746 shadow_l4e_t *sl4p;
3747 #endif
3748 #endif
3749 mfn_t sl1mfn;
3750 int r;
3752 /* Carefully look in the shadow linear map for the l1e we expect */
3753 #if SHADOW_PAGING_LEVELS >= 4
3754 sl4p = sh_linear_l4_table(v) + shadow_l4_linear_offset(vaddr);
3755 if ( !(shadow_l4e_get_flags(*sl4p) & _PAGE_PRESENT) )
3756 return 0;
3757 sl3p = sh_linear_l3_table(v) + shadow_l3_linear_offset(vaddr);
3758 if ( !(shadow_l3e_get_flags(*sl3p) & _PAGE_PRESENT) )
3759 return 0;
3760 #elif SHADOW_PAGING_LEVELS == 3
3761 sl3p = ((shadow_l3e_t *) v->arch.paging.shadow.l3table)
3762 + shadow_l3_linear_offset(vaddr);
3763 if ( !(shadow_l3e_get_flags(*sl3p) & _PAGE_PRESENT) )
3764 return 0;
3765 #endif
3766 sl2p = sh_linear_l2_table(v) + shadow_l2_linear_offset(vaddr);
3767 if ( !(shadow_l2e_get_flags(*sl2p) & _PAGE_PRESENT) )
3768 return 0;
3769 sl1p = sh_linear_l1_table(v) + shadow_l1_linear_offset(vaddr);
3770 sl1e = *sl1p;
3771 if ( ((shadow_l1e_get_flags(sl1e) & (_PAGE_PRESENT|_PAGE_RW))
3772 != (_PAGE_PRESENT|_PAGE_RW))
3773 || (mfn_x(shadow_l1e_get_mfn(sl1e)) != mfn_x(gmfn)) )
3774 return 0;
3776 /* Found it! Need to remove its write permissions. */
3777 sl1mfn = shadow_l2e_get_mfn(*sl2p);
3778 sl1e = shadow_l1e_remove_flags(sl1e, _PAGE_RW);
3779 r = shadow_set_l1e(v, sl1p, sl1e, sl1mfn);
3780 ASSERT( !(r & SHADOW_SET_ERROR) );
3781 return 1;
3783 #endif
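/* sh_guess_wrmap() is the fast path of the writeable-mapping heuristic:
 * given a guessed virtual address, it walks the shadow linear map (l4/l3/l2
 * as appropriate, then the l1) and, if it finds a writeable l1e for the
 * target frame, downgrades just that entry instead of scanning every l1
 * shadow.  It is exported via .shadow.guess_wrmap in sh_paging_mode below. */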
3785 int sh_rm_write_access_from_l1(struct vcpu *v, mfn_t sl1mfn,
3786 mfn_t readonly_mfn)
3787 /* Excises all writeable mappings to readonly_mfn from this l1 shadow table */
3789 shadow_l1e_t *sl1e;
3790 int done = 0;
3791 int flags;
3792 mfn_t base_sl1mfn = sl1mfn; /* Because sl1mfn changes in the foreach */
3794 SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, done,
3796 flags = shadow_l1e_get_flags(*sl1e);
3797 if ( (flags & _PAGE_PRESENT)
3798 && (flags & _PAGE_RW)
3799 && (mfn_x(shadow_l1e_get_mfn(*sl1e)) == mfn_x(readonly_mfn)) )
3801 shadow_l1e_t ro_sl1e = shadow_l1e_remove_flags(*sl1e, _PAGE_RW);
3802 (void) shadow_set_l1e(v, sl1e, ro_sl1e, sl1mfn);
3803 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
3804 /* Remember the last shadow that we shot a writeable mapping in */
3805 v->arch.paging.shadow.last_writeable_pte_smfn = mfn_x(base_sl1mfn);
3806 #endif
3807 if ( (mfn_to_page(readonly_mfn)->u.inuse.type_info
3808 & PGT_count_mask) == 0 )
3809 /* This breaks us cleanly out of the FOREACH macro */
3810 done = 1;
3812 });
3813 return done;
3817 int sh_rm_mappings_from_l1(struct vcpu *v, mfn_t sl1mfn, mfn_t target_mfn)
3818 /* Excises all mappings to guest frame from this shadow l1 table */
3820 shadow_l1e_t *sl1e;
3821 int done = 0;
3822 int flags;
3824 SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, done,
3826 flags = shadow_l1e_get_flags(*sl1e);
3827 if ( (flags & _PAGE_PRESENT)
3828 && (mfn_x(shadow_l1e_get_mfn(*sl1e)) == mfn_x(target_mfn)) )
3830 (void) shadow_set_l1e(v, sl1e, shadow_l1e_empty(), sl1mfn);
3831 if ( (mfn_to_page(target_mfn)->count_info & PGC_count_mask) == 0 )
3832 /* This breaks us cleanly out of the FOREACH macro */
3833 done = 1;
3835 });
3836 return done;
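/* Both helpers above set "done" as soon as the target frame's relevant
 * reference count (the writable type count, or the general mapping count)
 * reaches zero; the non-zero return value lets the caller stop scanning
 * further l1 shadows. */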
3839 /**************************************************************************/
3840 /* Functions to excise all pointers to shadows from higher-level shadows. */
3842 void sh_clear_shadow_entry(struct vcpu *v, void *ep, mfn_t smfn)
3843 /* Blank out a single shadow entry */
3845 switch ( mfn_to_shadow_page(smfn)->type )
3847 case SH_type_l1_shadow:
3848 (void) shadow_set_l1e(v, ep, shadow_l1e_empty(), smfn); break;
3849 case SH_type_l2_shadow:
3850 #if GUEST_PAGING_LEVELS >= 3
3851 case SH_type_l2h_shadow:
3852 #endif
3853 (void) shadow_set_l2e(v, ep, shadow_l2e_empty(), smfn); break;
3854 #if GUEST_PAGING_LEVELS >= 4
3855 case SH_type_l3_shadow:
3856 (void) shadow_set_l3e(v, ep, shadow_l3e_empty(), smfn); break;
3857 case SH_type_l4_shadow:
3858 (void) shadow_set_l4e(v, ep, shadow_l4e_empty(), smfn); break;
3859 #endif
3860 default: BUG(); /* Called with the wrong kind of shadow. */
3864 int sh_remove_l1_shadow(struct vcpu *v, mfn_t sl2mfn, mfn_t sl1mfn)
3865 /* Remove all mappings of this l1 shadow from this l2 shadow */
3867 shadow_l2e_t *sl2e;
3868 int done = 0;
3869 int flags;
3871 SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, done, v->domain,
3873 flags = shadow_l2e_get_flags(*sl2e);
3874 if ( (flags & _PAGE_PRESENT)
3875 && (mfn_x(shadow_l2e_get_mfn(*sl2e)) == mfn_x(sl1mfn)) )
3877 (void) shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn);
3878 if ( mfn_to_shadow_page(sl1mfn)->type == 0 )
3879 /* This breaks us cleanly out of the FOREACH macro */
3880 done = 1;
3882 });
3883 return done;
3886 #if GUEST_PAGING_LEVELS >= 4
3887 int sh_remove_l2_shadow(struct vcpu *v, mfn_t sl3mfn, mfn_t sl2mfn)
3888 /* Remove all mappings of this l2 shadow from this l3 shadow */
3890 shadow_l3e_t *sl3e;
3891 int done = 0;
3892 int flags;
3894 SHADOW_FOREACH_L3E(sl3mfn, sl3e, 0, done,
3896 flags = shadow_l3e_get_flags(*sl3e);
3897 if ( (flags & _PAGE_PRESENT)
3898 && (mfn_x(shadow_l3e_get_mfn(*sl3e)) == mfn_x(sl2mfn)) )
3900 (void) shadow_set_l3e(v, sl3e, shadow_l3e_empty(), sl3mfn);
3901 if ( mfn_to_shadow_page(sl2mfn)->type == 0 )
3902 /* This breaks us cleanly out of the FOREACH macro */
3903 done = 1;
3905 });
3906 return done;
3909 int sh_remove_l3_shadow(struct vcpu *v, mfn_t sl4mfn, mfn_t sl3mfn)
3910 /* Remove all mappings of this l3 shadow from this l4 shadow */
3912 shadow_l4e_t *sl4e;
3913 int done = 0;
3914 int flags;
3916 SHADOW_FOREACH_L4E(sl4mfn, sl4e, 0, done, v->domain,
3918 flags = shadow_l4e_get_flags(*sl4e);
3919 if ( (flags & _PAGE_PRESENT)
3920 && (mfn_x(shadow_l4e_get_mfn(*sl4e)) == mfn_x(sl3mfn)) )
3922 (void) shadow_set_l4e(v, sl4e, shadow_l4e_empty(), sl4mfn);
3923 if ( mfn_to_shadow_page(sl3mfn)->type == 0 )
3924 /* This breaks us cleanly out of the FOREACH macro */
3925 done = 1;
3927 });
3928 return done;
3930 #endif /* 64bit guest */
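/* The sh_remove_*_shadow() helpers above are used by the common shadow
 * teardown code when a lower-level shadow is being destroyed: they clear any
 * entries in higher-level shadows that still point at it, stopping early
 * once the victim shadow page's type field has been cleared. */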
3932 /**************************************************************************/
3933 /* Handling HVM guest writes to pagetables */
3935 /* Check that the user is allowed to perform this write.
3936 * Returns a mapped pointer to write to, and the mfn it's on,
3937 * or NULL for error. */
3938 static inline void * emulate_map_dest(struct vcpu *v,
3939 unsigned long vaddr,
3940 struct sh_emulate_ctxt *sh_ctxt,
3941 mfn_t *mfnp)
3943 walk_t gw;
3944 u32 flags, errcode;
3945 gfn_t gfn;
3946 mfn_t mfn;
3948 /* We don't emulate user-mode writes to page tables */
3949 if ( ring_3(sh_ctxt->ctxt.regs) )
3950 return NULL;
3952 #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
3953 /* Try the virtual TLB first */
3955 struct shadow_vtlb t = {0};
3956 if ( vtlb_lookup(v, vaddr, &t)
3957 && ((t.flags & (_PAGE_PRESENT|_PAGE_RW))
3958 == (_PAGE_PRESENT|_PAGE_RW)) )
3960 flags = t.flags;
3961 gfn = _gfn(t.frame_number);
3963 else
3965 /* Need to do the full lookup, just in case permissions
3966 * have increased since we cached this entry */
3968 #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */
3970 /* Walk the guest pagetables */
3971 guest_walk_tables(v, vaddr, &gw, 1);
3972 flags = accumulate_guest_flags(v, &gw);
3973 gfn = guest_l1e_get_gfn(gw.eff_l1e);
3974 sh_audit_gw(v, &gw);
3975 unmap_walk(v, &gw);
3977 #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
3978 /* Remember this translation for next time */
3979 t.page_number = vaddr >> PAGE_SHIFT;
3980 t.frame_number = gfn_x(gfn);
3981 t.flags = flags;
3982 vtlb_insert(v, t);
3985 #endif
3986 mfn = gfn_to_mfn(v->domain, gfn);
3988 errcode = PFEC_write_access;
3989 if ( !(flags & _PAGE_PRESENT) )
3990 goto page_fault;
3992 errcode |= PFEC_page_present;
3993 if ( !(flags & _PAGE_RW) )
3994 goto page_fault;
3996 if ( mfn_valid(mfn) )
3998 *mfnp = mfn;
3999 v->arch.paging.last_write_was_pt = !!sh_mfn_is_a_page_table(mfn);
4000 return sh_map_domain_page(mfn) + (vaddr & ~PAGE_MASK);
4002 else
4003 return NULL;
4005 page_fault:
4006 if ( is_hvm_vcpu(v) )
4007 hvm_inject_exception(TRAP_page_fault, errcode, vaddr);
4008 else
4009 propagate_page_fault(vaddr, errcode);
4010 return NULL;
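/* emulate_map_dest() validates the emulated write much as hardware would:
 * try the virtual TLB for a cached present+writable translation, otherwise
 * do a full guest pagetable walk; if the translation is missing or
 * read-only, inject a page fault (HVM) or propagate one (PV) and return
 * NULL; otherwise return a mapping of the destination and record in
 * last_write_was_pt whether the target frame is a shadowed pagetable. */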
4013 static int safe_not_to_verify_write(mfn_t gmfn, void *dst, void *src,
4014 int bytes)
4016 #if (SHADOW_OPTIMIZATIONS & SHOPT_SKIP_VERIFY)
4017 struct page_info *pg = mfn_to_page(gmfn);
4018 if ( !(pg->shadow_flags & SHF_32)
4019 && ((unsigned long)dst & 7) == 0 )
4021 /* Not shadowed 32-bit: aligned 64-bit writes that leave the
4022 * present bit unset are safe to ignore. */
4023 if ( (*(u64*)src & _PAGE_PRESENT) == 0
4024 && (*(u64*)dst & _PAGE_PRESENT) == 0 )
4025 return 1;
4027 else if ( !(pg->shadow_flags & (SHF_PAE|SHF_64))
4028 && ((unsigned long)dst & 3) == 0 )
4030 /* Not shadowed PAE/64-bit: aligned 32-bit writes that leave the
4031 * present bit unset are safe to ignore. */
4032 if ( (*(u32*)src & _PAGE_PRESENT) == 0
4033 && (*(u32*)dst & _PAGE_PRESENT) == 0 )
4034 return 1;
4036 #endif
4037 return 0;
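/* Rationale for SHOPT_SKIP_VERIFY: an aligned write that leaves
 * _PAGE_PRESENT clear in both the old and the new value cannot create a
 * mapping, so the sh_validate_guest_pt_write() revalidation can safely be
 * skipped.  The access width checked (64-bit vs 32-bit) must match how the
 * frame is shadowed, hence the shadow_flags tests above. */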
4041 int
4042 sh_x86_emulate_write(struct vcpu *v, unsigned long vaddr, void *src,
4043 u32 bytes, struct sh_emulate_ctxt *sh_ctxt)
4045 mfn_t mfn;
4046 void *addr;
4047 int skip;
4049 if ( vaddr & (bytes-1) )
4050 return X86EMUL_UNHANDLEABLE;
4052 ASSERT(((vaddr & ~PAGE_MASK) + bytes) <= PAGE_SIZE);
4053 shadow_lock(v->domain);
4055 addr = emulate_map_dest(v, vaddr, sh_ctxt, &mfn);
4056 if ( addr == NULL )
4058 shadow_unlock(v->domain);
4059 return X86EMUL_EXCEPTION;
4062 skip = safe_not_to_verify_write(mfn, addr, src, bytes);
4063 memcpy(addr, src, bytes);
4064 if ( !skip ) sh_validate_guest_pt_write(v, mfn, addr, bytes);
4066 /* If we are writing zeros to this page, might want to unshadow */
4067 if ( likely(bytes >= 4) && (*(u32 *)addr == 0) && is_lo_pte(vaddr) )
4068 check_for_early_unshadow(v, mfn);
4069 else
4070 reset_early_unshadow(v);
4072 paging_mark_dirty(v->domain, mfn_x(mfn));
4074 sh_unmap_domain_page(addr);
4075 shadow_audit_tables(v);
4076 shadow_unlock(v->domain);
4077 return X86EMUL_OKAY;
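/* The three emulation handlers (write, cmpxchg, cmpxchg8b) share the same
 * shape: map the destination under the shadow lock, perform the access,
 * revalidate the shadow unless safe_not_to_verify_write() said the write
 * was harmless, treat a write of zero as a hint that the guest may be
 * recycling the page (early unshadow), mark the frame dirty for log-dirty
 * tracking, then unmap, audit and unlock. */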
4080 int
4081 sh_x86_emulate_cmpxchg(struct vcpu *v, unsigned long vaddr,
4082 unsigned long old, unsigned long new,
4083 unsigned int bytes, struct sh_emulate_ctxt *sh_ctxt)
4085 mfn_t mfn;
4086 void *addr;
4087 unsigned long prev;
4088 int rv = X86EMUL_OKAY, skip;
4090 ASSERT(bytes <= sizeof(unsigned long));
4091 if ( vaddr & (bytes-1) ) /* check alignment before taking the lock */
4093 return X86EMUL_UNHANDLEABLE;
4094 shadow_lock(v->domain);
4096 addr = emulate_map_dest(v, vaddr, sh_ctxt, &mfn);
4097 if ( addr == NULL )
4099 shadow_unlock(v->domain);
4100 return X86EMUL_EXCEPTION;
4103 skip = safe_not_to_verify_write(mfn, &new, &old, bytes);
4105 switch ( bytes )
4107 case 1: prev = cmpxchg(((u8 *)addr), old, new); break;
4108 case 2: prev = cmpxchg(((u16 *)addr), old, new); break;
4109 case 4: prev = cmpxchg(((u32 *)addr), old, new); break;
4110 case 8: prev = cmpxchg(((u64 *)addr), old, new); break;
4111 default:
4112 SHADOW_PRINTK("cmpxchg of size %i is not supported\n", bytes);
4113 prev = ~old;
4116 if ( prev == old )
4118 if ( !skip ) sh_validate_guest_pt_write(v, mfn, addr, bytes);
4120 else
4121 rv = X86EMUL_CMPXCHG_FAILED;
4123 SHADOW_DEBUG(EMULATE, "va %#lx was %#lx expected %#lx"
4124 " wanted %#lx now %#lx bytes %u\n",
4125 vaddr, prev, old, new, *(unsigned long *)addr, bytes);
4127 /* If we are writing zeros to this page, might want to unshadow */
4128 if ( likely(bytes >= 4) && (*(u32 *)addr == 0) && is_lo_pte(vaddr) )
4129 check_for_early_unshadow(v, mfn);
4130 else
4131 reset_early_unshadow(v);
4133 paging_mark_dirty(v->domain, mfn_x(mfn));
4135 sh_unmap_domain_page(addr);
4136 shadow_audit_tables(v);
4137 shadow_unlock(v->domain);
4138 return rv;
4141 int
4142 sh_x86_emulate_cmpxchg8b(struct vcpu *v, unsigned long vaddr,
4143 unsigned long old_lo, unsigned long old_hi,
4144 unsigned long new_lo, unsigned long new_hi,
4145 struct sh_emulate_ctxt *sh_ctxt)
4147 mfn_t mfn;
4148 void *addr;
4149 u64 old, new, prev;
4150 int rv = X86EMUL_OKAY, skip;
4152 if ( vaddr & 7 )
4153 return X86EMUL_UNHANDLEABLE;
4155 shadow_lock(v->domain);
4157 addr = emulate_map_dest(v, vaddr, sh_ctxt, &mfn);
4158 if ( addr == NULL )
4160 shadow_unlock(v->domain);
4161 return X86EMUL_EXCEPTION;
4164 old = (((u64) old_hi) << 32) | (u64) old_lo;
4165 new = (((u64) new_hi) << 32) | (u64) new_lo;
4166 skip = safe_not_to_verify_write(mfn, &new, &old, 8);
4167 prev = cmpxchg(((u64 *)addr), old, new);
4169 if ( prev == old )
4171 if ( !skip ) sh_validate_guest_pt_write(v, mfn, addr, 8);
4173 else
4174 rv = X86EMUL_CMPXCHG_FAILED;
4176 /* If we are writing zeros to this page, might want to unshadow */
4177 if ( *(u32 *)addr == 0 )
4178 check_for_early_unshadow(v, mfn);
4179 else
4180 reset_early_unshadow(v);
4182 paging_mark_dirty(v->domain, mfn_x(mfn));
4184 sh_unmap_domain_page(addr);
4185 shadow_audit_tables(v);
4186 shadow_unlock(v->domain);
4187 return rv;
4191 /**************************************************************************/
4192 /* Audit tools */
4194 #if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES
4196 #define AUDIT_FAIL(_level, _fmt, _a...) do { \
4197 printk("Shadow %u-on-%u audit failed at level %i, index %i\n" \
4198 "gl" #_level "mfn = %" PRI_mfn \
4199 " sl" #_level "mfn = %" PRI_mfn \
4200 " &gl" #_level "e = %p &sl" #_level "e = %p" \
4201 " gl" #_level "e = %" SH_PRI_gpte \
4202 " sl" #_level "e = %" SH_PRI_pte "\nError: " _fmt "\n", \
4203 GUEST_PAGING_LEVELS, SHADOW_PAGING_LEVELS, \
4204 _level, guest_index(gl ## _level ## e), \
4205 mfn_x(gl ## _level ## mfn), mfn_x(sl ## _level ## mfn), \
4206 gl ## _level ## e, sl ## _level ## e, \
4207 gl ## _level ## e->l ## _level, sl ## _level ## e->l ## _level, \
4208 ##_a); \
4209 BUG(); \
4210 done = 1; \
4211 } while (0)
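/* AUDIT_FAIL dumps the offending guest/shadow entry pair and then calls
 * BUG(), so an entries-audit failure is always fatal; the trailing
 * "done = 1" is never reached. */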
4214 static char * sh_audit_flags(struct vcpu *v, int level,
4215 int gflags, int sflags)
4216 /* Common code for auditing flag bits */
4218 if ( (sflags & _PAGE_PRESENT) && !(gflags & _PAGE_PRESENT) )
4219 return "shadow is present but guest is not present";
4220 if ( (sflags & _PAGE_GLOBAL) && !is_hvm_vcpu(v) )
4221 return "global bit set in PV shadow";
4222 if ( level == 2 && (sflags & _PAGE_PSE) )
4223 return "PS bit set in shadow";
4224 #if SHADOW_PAGING_LEVELS == 3
4225 if ( level == 3 ) return NULL; /* All the other bits are blank in a PAE l3 */
4226 #endif
4227 if ( (sflags & _PAGE_PRESENT) && !(gflags & _PAGE_ACCESSED) )
4228 return "accessed bit not propagated";
4229 if ( (level == 1 || (level == 2 && (gflags & _PAGE_PSE)))
4230 && ((sflags & _PAGE_RW) && !(gflags & _PAGE_DIRTY)) )
4231 return "dirty bit not propagated";
4232 if ( (sflags & _PAGE_USER) != (gflags & _PAGE_USER) )
4233 return "user/supervisor bit does not match";
4234 if ( (sflags & _PAGE_NX_BIT) != (gflags & _PAGE_NX_BIT) )
4235 return "NX bit does not match";
4236 if ( (sflags & _PAGE_RW) && !(gflags & _PAGE_RW) )
4237 return "shadow grants write access but guest does not";
4238 return NULL;
4241 static inline mfn_t
4242 audit_gfn_to_mfn(struct vcpu *v, gfn_t gfn, mfn_t gmfn)
4243 /* Convert this gfn to an mfn in the manner appropriate for the
4244 * guest pagetable it's used in (gmfn) */
4246 if ( !shadow_mode_translate(v->domain) )
4247 return _mfn(gfn_x(gfn));
4249 if ( (mfn_to_page(gmfn)->u.inuse.type_info & PGT_type_mask)
4250 != PGT_writable_page )
4251 return _mfn(gfn_x(gfn)); /* This is a paging-disabled shadow */
4252 else
4253 return gfn_to_mfn(v->domain, gfn);
4257 int sh_audit_l1_table(struct vcpu *v, mfn_t sl1mfn, mfn_t x)
4259 guest_l1e_t *gl1e, *gp;
4260 shadow_l1e_t *sl1e;
4261 mfn_t mfn, gmfn, gl1mfn;
4262 gfn_t gfn;
4263 char *s;
4264 int done = 0;
4266 /* Follow the backpointer */
4267 gl1mfn = _mfn(mfn_to_shadow_page(sl1mfn)->backpointer);
4268 gl1e = gp = sh_map_domain_page(gl1mfn);
4269 SHADOW_FOREACH_L1E(sl1mfn, sl1e, &gl1e, done, {
4271 if ( sh_l1e_is_magic(*sl1e) )
4273 #if (SHADOW_OPTIMIZATIONS & SHOPT_FAST_FAULT_PATH) && SHADOW_PAGING_LEVELS > 2
4274 if ( sh_l1e_is_gnp(*sl1e) )
4276 if ( guest_l1e_get_flags(*gl1e) & _PAGE_PRESENT )
4277 AUDIT_FAIL(1, "shadow is GNP magic but guest is present");
4279 else
4281 ASSERT(sh_l1e_is_mmio(*sl1e));
4282 gfn = sh_l1e_mmio_get_gfn(*sl1e);
4283 if ( gfn_x(gfn) != gfn_x(guest_l1e_get_gfn(*gl1e)) )
4284 AUDIT_FAIL(1, "shadow MMIO gfn is %" SH_PRI_gfn
4285 " but guest gfn is %" SH_PRI_gfn,
4286 gfn_x(gfn),
4287 gfn_x(guest_l1e_get_gfn(*gl1e)));
4289 #endif
4291 else
4293 s = sh_audit_flags(v, 1, guest_l1e_get_flags(*gl1e),
4294 shadow_l1e_get_flags(*sl1e));
4295 if ( s ) AUDIT_FAIL(1, "%s", s);
4297 if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS )
4299 gfn = guest_l1e_get_gfn(*gl1e);
4300 mfn = shadow_l1e_get_mfn(*sl1e);
4301 gmfn = audit_gfn_to_mfn(v, gfn, gl1mfn);
4302 if ( mfn_x(gmfn) != mfn_x(mfn) )
4303 AUDIT_FAIL(1, "bad translation: gfn %" SH_PRI_gfn
4304 " --> %" PRI_mfn " != mfn %" PRI_mfn,
4305 gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn));
4308 });
4309 sh_unmap_domain_page(gp);
4310 return done;
4313 int sh_audit_fl1_table(struct vcpu *v, mfn_t sl1mfn, mfn_t x)
4315 guest_l1e_t *gl1e, e;
4316 shadow_l1e_t *sl1e;
4317 mfn_t gl1mfn = _mfn(INVALID_MFN);
4318 int f;
4319 int done = 0;
4321 /* fl1 has no useful backpointer: all we can check are flags */
4322 e = guest_l1e_from_gfn(_gfn(0), 0); gl1e = &e; /* Needed for macro */
4323 SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, done, {
4324 f = shadow_l1e_get_flags(*sl1e);
4325 f &= ~(_PAGE_AVAIL0|_PAGE_AVAIL1|_PAGE_AVAIL2);
4326 if ( !(f == 0
4327 || f == (_PAGE_PRESENT|_PAGE_USER|_PAGE_RW|
4328 _PAGE_ACCESSED|_PAGE_DIRTY)
4329 || f == (_PAGE_PRESENT|_PAGE_USER|_PAGE_ACCESSED|_PAGE_DIRTY)
4330 || sh_l1e_is_magic(*sl1e)) )
4331 AUDIT_FAIL(1, "fl1e has bad flags");
4332 });
4333 return 0;
4336 int sh_audit_l2_table(struct vcpu *v, mfn_t sl2mfn, mfn_t x)
4338 guest_l2e_t *gl2e, *gp;
4339 shadow_l2e_t *sl2e;
4340 mfn_t mfn, gmfn, gl2mfn;
4341 gfn_t gfn;
4342 char *s;
4343 int done = 0;
4345 /* Follow the backpointer */
4346 gl2mfn = _mfn(mfn_to_shadow_page(sl2mfn)->backpointer);
4347 gl2e = gp = sh_map_domain_page(gl2mfn);
4348 SHADOW_FOREACH_L2E(sl2mfn, sl2e, &gl2e, done, v->domain, {
4350 s = sh_audit_flags(v, 2, guest_l2e_get_flags(*gl2e),
4351 shadow_l2e_get_flags(*sl2e));
4352 if ( s ) AUDIT_FAIL(2, "%s", s);
4354 if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS )
4356 gfn = guest_l2e_get_gfn(*gl2e);
4357 mfn = shadow_l2e_get_mfn(*sl2e);
4358 gmfn = (guest_l2e_get_flags(*gl2e) & _PAGE_PSE)
4359 ? get_fl1_shadow_status(v, gfn)
4360 : get_shadow_status(v, audit_gfn_to_mfn(v, gfn, gl2mfn),
4361 SH_type_l1_shadow);
4362 if ( mfn_x(gmfn) != mfn_x(mfn) )
4363 AUDIT_FAIL(2, "bad translation: gfn %" SH_PRI_gfn
4364 " (--> %" PRI_mfn ")"
4365 " --> %" PRI_mfn " != mfn %" PRI_mfn,
4366 gfn_x(gfn),
4367 (guest_l2e_get_flags(*gl2e) & _PAGE_PSE) ? 0
4368 : mfn_x(audit_gfn_to_mfn(v, gfn, gl2mfn)),
4369 mfn_x(gmfn), mfn_x(mfn));
4371 });
4372 sh_unmap_domain_page(gp);
4373 return 0;
4376 #if GUEST_PAGING_LEVELS >= 4
4377 int sh_audit_l3_table(struct vcpu *v, mfn_t sl3mfn, mfn_t x)
4379 guest_l3e_t *gl3e, *gp;
4380 shadow_l3e_t *sl3e;
4381 mfn_t mfn, gmfn, gl3mfn;
4382 gfn_t gfn;
4383 char *s;
4384 int done = 0;
4386 /* Follow the backpointer */
4387 gl3mfn = _mfn(mfn_to_shadow_page(sl3mfn)->backpointer);
4388 gl3e = gp = sh_map_domain_page(gl3mfn);
4389 SHADOW_FOREACH_L3E(sl3mfn, sl3e, &gl3e, done, {
4391 s = sh_audit_flags(v, 3, guest_l3e_get_flags(*gl3e),
4392 shadow_l3e_get_flags(*sl3e));
4393 if ( s ) AUDIT_FAIL(3, "%s", s);
4395 if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS )
4397 gfn = guest_l3e_get_gfn(*gl3e);
4398 mfn = shadow_l3e_get_mfn(*sl3e);
4399 gmfn = get_shadow_status(v, audit_gfn_to_mfn(v, gfn, gl3mfn),
4400 ((GUEST_PAGING_LEVELS == 3 ||
4401 is_pv_32on64_vcpu(v))
4402 && !shadow_mode_external(v->domain)
4403 && (guest_index(gl3e) % 4) == 3)
4404 ? SH_type_l2h_shadow
4405 : SH_type_l2_shadow);
4406 if ( mfn_x(gmfn) != mfn_x(mfn) )
4407 AUDIT_FAIL(3, "bad translation: gfn %" SH_PRI_gfn
4408 " --> %" PRI_mfn " != mfn %" PRI_mfn,
4409 gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn));
4411 });
4412 sh_unmap_domain_page(gp);
4413 return 0;
4416 int sh_audit_l4_table(struct vcpu *v, mfn_t sl4mfn, mfn_t x)
4418 guest_l4e_t *gl4e, *gp;
4419 shadow_l4e_t *sl4e;
4420 mfn_t mfn, gmfn, gl4mfn;
4421 gfn_t gfn;
4422 char *s;
4423 int done = 0;
4425 /* Follow the backpointer */
4426 gl4mfn = _mfn(mfn_to_shadow_page(sl4mfn)->backpointer);
4427 gl4e = gp = sh_map_domain_page(gl4mfn);
4428 SHADOW_FOREACH_L4E(sl4mfn, sl4e, &gl4e, done, v->domain,
4430 s = sh_audit_flags(v, 4, guest_l4e_get_flags(*gl4e),
4431 shadow_l4e_get_flags(*sl4e));
4432 if ( s ) AUDIT_FAIL(4, "%s", s);
4434 if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS )
4436 gfn = guest_l4e_get_gfn(*gl4e);
4437 mfn = shadow_l4e_get_mfn(*sl4e);
4438 gmfn = get_shadow_status(v, audit_gfn_to_mfn(v, gfn, gl4mfn),
4439 SH_type_l3_shadow);
4440 if ( mfn_x(gmfn) != mfn_x(mfn) )
4441 AUDIT_FAIL(4, "bad translation: gfn %" SH_PRI_gfn
4442 " --> %" PRI_mfn " != mfn %" PRI_mfn,
4443 gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn));
4445 });
4446 sh_unmap_domain_page(gp);
4447 return 0;
4449 #endif /* GUEST_PAGING_LEVELS >= 4 */
4452 #undef AUDIT_FAIL
4454 #endif /* Audit code */
4456 /**************************************************************************/
4457 /* Entry points into this mode of the shadow code.
4458 * This will all be mangled by the preprocessor to uniquify everything. */
4459 struct paging_mode sh_paging_mode = {
4460 .page_fault = sh_page_fault,
4461 .invlpg = sh_invlpg,
4462 .gva_to_gfn = sh_gva_to_gfn,
4463 .update_cr3 = sh_update_cr3,
4464 .update_paging_modes = shadow_update_paging_modes,
4465 .write_p2m_entry = shadow_write_p2m_entry,
4466 .write_guest_entry = shadow_write_guest_entry,
4467 .cmpxchg_guest_entry = shadow_cmpxchg_guest_entry,
4468 .guest_map_l1e = sh_guest_map_l1e,
4469 .guest_get_eff_l1e = sh_guest_get_eff_l1e,
4470 .guest_levels = GUEST_PAGING_LEVELS,
4471 .shadow.detach_old_tables = sh_detach_old_tables,
4472 .shadow.x86_emulate_write = sh_x86_emulate_write,
4473 .shadow.x86_emulate_cmpxchg = sh_x86_emulate_cmpxchg,
4474 .shadow.x86_emulate_cmpxchg8b = sh_x86_emulate_cmpxchg8b,
4475 .shadow.make_monitor_table = sh_make_monitor_table,
4476 .shadow.destroy_monitor_table = sh_destroy_monitor_table,
4477 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
4478 .shadow.guess_wrmap = sh_guess_wrmap,
4479 #endif
4480 .shadow.shadow_levels = SHADOW_PAGING_LEVELS,
4481 };
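/* This file is compiled once for each supported combination of
 * GUEST_PAGING_LEVELS and SHADOW_PAGING_LEVELS; the preprocessor renames the
 * entry points above so that every build provides its own sh_paging_mode. */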
4483 /*
4484 * Local variables:
4485 * mode: C
4486 * c-set-style: "BSD"
4487 * c-basic-offset: 4
4488 * indent-tabs-mode: nil
4489 * End:
4490 */