ia64/xen-unstable

view xen/arch/x86/mm/shadow/multi.c @ 12564:2fd223c64fc6

[XEN] Pin l3 shadows of older x86_64 linux guests.
Older x86_64 linux kernels use one l4 table per cpu and context switch by
changing an l4 entry pointing to an l3 table. If we're shadowing them
we need to pin l3 shadows to stop them being torn down on every
context switch. (But don't do this for normal 64bit guests).
Signed-off-by: Tim Deegan <Tim.Deegan@xensource.com>
author Tim Deegan <Tim.Deegan@xensource.com>
date Thu Nov 23 17:46:52 2006 +0000 (2006-11-23)
parents 47a8bb3cd123
children cdd9e366aa59
line source
1 /******************************************************************************
2 * arch/x86/mm/shadow/multi.c
3 *
4 * Simple, mostly-synchronous shadow page tables.
5 * Parts of this code are Copyright (c) 2006 by XenSource Inc.
6 * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
7 * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 */
24 #include <xen/config.h>
25 #include <xen/types.h>
26 #include <xen/mm.h>
27 #include <xen/trace.h>
28 #include <xen/sched.h>
29 #include <xen/perfc.h>
30 #include <xen/domain_page.h>
31 #include <asm/page.h>
32 #include <asm/current.h>
33 #include <asm/shadow.h>
34 #include <asm/flushtlb.h>
35 #include <asm/hvm/hvm.h>
36 #include "private.h"
37 #include "types.h"
39 /* THINGS TO DO LATER:
40 *
41 * TEARDOWN HEURISTICS
42 * Also: have a heuristic for when to destroy a previous paging-mode's
43 * shadows. When a guest is done with its start-of-day 32-bit tables
44 * and reuses the memory we want to drop those shadows. Start with
45 * shadows in a page in two modes as a hint, but beware of clever tricks
46 * like reusing a pagetable for both PAE and 64-bit during boot...
47 *
48 * PAE LINEAR MAPS
49 * Rework shadow_get_l*e() to have the option of using map_domain_page()
50 * instead of linear maps. Add appropriate unmap_l*e calls in the users.
51 * Then we can test the speed difference made by linear maps. If the
52 * map_domain_page() version is OK on PAE, we could maybe allow a lightweight
53 * l3-and-l2h-only shadow mode for PAE PV guests that would allow them
54 * to share l2h pages again.
55 *
56 * GUEST_WALK_TABLES TLB FLUSH COALESCE
57 * guest_walk_tables can do up to three remote TLB flushes as it walks to
58 * the first l1 of a new pagetable. Should coalesce the flushes to the end,
59 * and if we do flush, re-do the walk. If anything has changed, then
60 * pause all the other vcpus and do the walk *again*.
61 *
62 * WP DISABLED
63 * Consider how to implement having the WP bit of CR0 set to 0.
64 * Since we need to be able to cause write faults to pagetables, this might
65 * end up looking like not having the (guest) pagetables present at all in
66 * HVM guests...
67 *
68 * PSE disabled / PSE36
69 * We don't support any modes other than PSE enabled, PSE36 disabled.
70 * Neither of those would be hard to change, but we'd need to be able to
71 * deal with shadows made in one mode and used in another.
72 */
74 #define FETCH_TYPE_PREFETCH 1
75 #define FETCH_TYPE_DEMAND 2
76 #define FETCH_TYPE_WRITE 4
77 typedef enum {
78 ft_prefetch = FETCH_TYPE_PREFETCH,
79 ft_demand_read = FETCH_TYPE_DEMAND,
80 ft_demand_write = FETCH_TYPE_DEMAND | FETCH_TYPE_WRITE,
81 } fetch_type_t;
83 #ifdef DEBUG_TRACE_DUMP
84 static char *fetch_type_names[] = {
85 [ft_prefetch] "prefetch",
86 [ft_demand_read] "demand read",
87 [ft_demand_write] "demand write",
88 };
89 #endif
91 /**************************************************************************/
92 /* Hash table mapping from guest pagetables to shadows
93 *
94 * Normal case: maps the mfn of a guest page to the mfn of its shadow page.
95 * FL1's: maps the *gfn* of the start of a superpage to the mfn of a
96 * shadow L1 which maps its "splinters".
97 */
99 static inline mfn_t
100 get_fl1_shadow_status(struct vcpu *v, gfn_t gfn)
101 /* Look for FL1 shadows in the hash table */
102 {
103 mfn_t smfn = shadow_hash_lookup(v, gfn_x(gfn), SH_type_fl1_shadow);
105 if ( unlikely(shadow_mode_log_dirty(v->domain) && valid_mfn(smfn)) )
106 {
107 struct shadow_page_info *sp = mfn_to_shadow_page(smfn);
108 if ( !(sp->logdirty) )
109 shadow_convert_to_log_dirty(v, smfn);
110 }
112 return smfn;
113 }
115 static inline mfn_t
116 get_shadow_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type)
117 /* Look for shadows in the hash table */
118 {
119 mfn_t smfn = shadow_hash_lookup(v, mfn_x(gmfn), shadow_type);
120 perfc_incrc(shadow_get_shadow_status);
122 if ( unlikely(shadow_mode_log_dirty(v->domain) && valid_mfn(smfn)) )
123 {
124 struct shadow_page_info *sp = mfn_to_shadow_page(smfn);
125 if ( !(sp->logdirty) )
126 shadow_convert_to_log_dirty(v, smfn);
127 }
129 return smfn;
130 }
132 static inline void
133 set_fl1_shadow_status(struct vcpu *v, gfn_t gfn, mfn_t smfn)
134 /* Put an FL1 shadow into the hash table */
135 {
136 SHADOW_PRINTK("gfn=%"SH_PRI_gfn", type=%08x, smfn=%05lx\n",
137 gfn_x(gfn), SH_type_fl1_shadow, mfn_x(smfn));
139 if ( unlikely(shadow_mode_log_dirty(v->domain)) )
140 // mark this shadow as a log dirty shadow...
141 mfn_to_shadow_page(smfn)->logdirty = 1;
142 else
143 mfn_to_shadow_page(smfn)->logdirty = 0;
145 shadow_hash_insert(v, gfn_x(gfn), SH_type_fl1_shadow, smfn);
146 }
148 static inline void
149 set_shadow_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type, mfn_t smfn)
150 /* Put a shadow into the hash table */
151 {
152 struct domain *d = v->domain;
153 int res;
155 SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx, type=%08x, smfn=%05lx\n",
156 d->domain_id, v->vcpu_id, mfn_x(gmfn),
157 shadow_type, mfn_x(smfn));
159 if ( unlikely(shadow_mode_log_dirty(d)) )
160 // mark this shadow as a log dirty shadow...
161 mfn_to_shadow_page(smfn)->logdirty = 1;
162 else
163 mfn_to_shadow_page(smfn)->logdirty = 0;
165 res = get_page(mfn_to_page(gmfn), d);
166 ASSERT(res == 1);
168 shadow_hash_insert(v, mfn_x(gmfn), shadow_type, smfn);
169 }
171 static inline void
172 delete_fl1_shadow_status(struct vcpu *v, gfn_t gfn, mfn_t smfn)
173 /* Remove a shadow from the hash table */
174 {
175 SHADOW_PRINTK("gfn=%"SH_PRI_gfn", type=%08x, smfn=%05lx\n",
176 gfn_x(gfn), SH_type_fl1_shadow, mfn_x(smfn));
177 shadow_hash_delete(v, gfn_x(gfn), SH_type_fl1_shadow, smfn);
178 }
180 static inline void
181 delete_shadow_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type, mfn_t smfn)
182 /* Remove a shadow from the hash table */
183 {
184 SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx, type=%08x, smfn=%05lx\n",
185 v->domain->domain_id, v->vcpu_id,
186 mfn_x(gmfn), shadow_type, mfn_x(smfn));
187 shadow_hash_delete(v, mfn_x(gmfn), shadow_type, smfn);
188 put_page(mfn_to_page(gmfn));
189 }
191 /**************************************************************************/
192 /* CPU feature support querying */
194 static inline int
195 guest_supports_superpages(struct vcpu *v)
196 {
197 /* The _PAGE_PSE bit must be honoured in HVM guests, whenever
198 * CR4.PSE is set or the guest is in PAE or long mode */
199 return (is_hvm_vcpu(v) && (GUEST_PAGING_LEVELS != 2
200 || (hvm_get_guest_ctrl_reg(v, 4) & X86_CR4_PSE)));
201 }
203 static inline int
204 guest_supports_nx(struct vcpu *v)
205 {
206 if ( !is_hvm_vcpu(v) )
207 return cpu_has_nx;
209 // XXX - fix this!
210 return 1;
211 }
214 /**************************************************************************/
215 /* Functions for walking the guest page tables */
218 /* Walk the guest pagetables, filling the walk_t with what we see.
219 * Takes an uninitialised walk_t. The caller must call unmap_walk()
220 * on the walk_t before discarding it or calling guest_walk_tables again.
221 * If "guest_op" is non-zero, we are serving a genuine guest memory access,
222 * and must (a) be under the shadow lock, and (b) remove write access
223 * from any gueat PT pages we see, as we will be using their contents to
224 * perform shadow updates.
225 * Returns 0 for success or non-zero if the guest pagetables are malformed.
226 * N.B. Finding a not-present entry does not cause a non-zero return code. */
227 static inline int
228 guest_walk_tables(struct vcpu *v, unsigned long va, walk_t *gw, int guest_op)
229 {
230 ASSERT(!guest_op || shadow_lock_is_acquired(v->domain));
232 perfc_incrc(shadow_guest_walk);
233 memset(gw, 0, sizeof(*gw));
234 gw->va = va;
236 #if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
237 #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
238 /* Get l4e from the top level table */
239 gw->l4mfn = pagetable_get_mfn(v->arch.guest_table);
240 gw->l4e = (guest_l4e_t *)v->arch.guest_vtable + guest_l4_table_offset(va);
241 /* Walk down to the l3e */
242 if ( !(guest_l4e_get_flags(*gw->l4e) & _PAGE_PRESENT) ) return 0;
243 gw->l3mfn = vcpu_gfn_to_mfn(v, guest_l4e_get_gfn(*gw->l4e));
244 if ( !valid_mfn(gw->l3mfn) ) return 1;
245 /* This mfn is a pagetable: make sure the guest can't write to it. */
246 if ( guest_op && shadow_remove_write_access(v, gw->l3mfn, 3, va) != 0 )
247 flush_tlb_mask(v->domain->domain_dirty_cpumask);
248 gw->l3e = ((guest_l3e_t *)sh_map_domain_page(gw->l3mfn))
249 + guest_l3_table_offset(va);
250 #else /* PAE only... */
251 /* Get l3e from the top level table */
252 gw->l3mfn = pagetable_get_mfn(v->arch.guest_table);
253 gw->l3e = (guest_l3e_t *)v->arch.guest_vtable + guest_l3_table_offset(va);
254 #endif /* PAE or 64... */
255 /* Walk down to the l2e */
256 if ( !(guest_l3e_get_flags(*gw->l3e) & _PAGE_PRESENT) ) return 0;
257 gw->l2mfn = vcpu_gfn_to_mfn(v, guest_l3e_get_gfn(*gw->l3e));
258 if ( !valid_mfn(gw->l2mfn) ) return 1;
259 /* This mfn is a pagetable: make sure the guest can't write to it. */
260 if ( guest_op && shadow_remove_write_access(v, gw->l2mfn, 2, va) != 0 )
261 flush_tlb_mask(v->domain->domain_dirty_cpumask);
262 gw->l2e = ((guest_l2e_t *)sh_map_domain_page(gw->l2mfn))
263 + guest_l2_table_offset(va);
264 #else /* 32-bit only... */
265 /* Get l2e from the top level table */
266 gw->l2mfn = pagetable_get_mfn(v->arch.guest_table);
267 gw->l2e = (guest_l2e_t *)v->arch.guest_vtable + guest_l2_table_offset(va);
268 #endif /* All levels... */
270 if ( !(guest_l2e_get_flags(*gw->l2e) & _PAGE_PRESENT) ) return 0;
271 if ( guest_supports_superpages(v) &&
272 (guest_l2e_get_flags(*gw->l2e) & _PAGE_PSE) )
273 {
274 /* Special case: this guest VA is in a PSE superpage, so there's
275 * no guest l1e. We make one up so that the propagation code
276 * can generate a shadow l1 table. Start with the gfn of the
277 * first 4k-page of the superpage. */
278 gfn_t start = guest_l2e_get_gfn(*gw->l2e);
279 /* Grant full access in the l1e, since all the guest entry's
280 * access controls are enforced in the shadow l2e. This lets
281 * us reflect l2 changes later without touching the l1s. */
282 int flags = (_PAGE_PRESENT|_PAGE_USER|_PAGE_RW|
283 _PAGE_ACCESSED|_PAGE_DIRTY);
284 /* PSE level 2 entries use bit 12 for PAT; propagate it to bit 7
285 * of the level 1 */
286 if ( (guest_l2e_get_flags(*gw->l2e) & _PAGE_PSE_PAT) )
287 flags |= _PAGE_PAT;
288 /* Increment the pfn by the right number of 4k pages.
289 * The ~0x1 is to mask out the PAT bit mentioned above. */
290 start = _gfn((gfn_x(start) & ~0x1) + guest_l1_table_offset(va));
291 gw->eff_l1e = guest_l1e_from_gfn(start, flags);
292 gw->l1e = NULL;
293 gw->l1mfn = _mfn(INVALID_MFN);
294 }
295 else
296 {
297 /* Not a superpage: carry on and find the l1e. */
298 gw->l1mfn = vcpu_gfn_to_mfn(v, guest_l2e_get_gfn(*gw->l2e));
299 if ( !valid_mfn(gw->l1mfn) ) return 1;
300 /* This mfn is a pagetable: make sure the guest can't write to it. */
301 if ( guest_op
302 && shadow_remove_write_access(v, gw->l1mfn, 1, va) != 0 )
303 flush_tlb_mask(v->domain->domain_dirty_cpumask);
304 gw->l1e = ((guest_l1e_t *)sh_map_domain_page(gw->l1mfn))
305 + guest_l1_table_offset(va);
306 gw->eff_l1e = *gw->l1e;
307 }
309 return 0;
310 }
312 /* Given a walk_t, translate the gw->va into the guest's notion of the
313 * corresponding frame number. */
314 static inline gfn_t
315 guest_walk_to_gfn(walk_t *gw)
316 {
317 if ( !(guest_l1e_get_flags(gw->eff_l1e) & _PAGE_PRESENT) )
318 return _gfn(INVALID_GFN);
319 return guest_l1e_get_gfn(gw->eff_l1e);
320 }
322 /* Given a walk_t, translate the gw->va into the guest's notion of the
323 * corresponding physical address. */
324 static inline paddr_t
325 guest_walk_to_gpa(walk_t *gw)
326 {
327 if ( !(guest_l1e_get_flags(gw->eff_l1e) & _PAGE_PRESENT) )
328 return 0;
329 return guest_l1e_get_paddr(gw->eff_l1e) + (gw->va & ~PAGE_MASK);
330 }
333 /* Unmap (and reinitialise) a guest walk.
334 * Call this to dispose of any walk filled in by guest_walk_tables() */
335 static void unmap_walk(struct vcpu *v, walk_t *gw)
336 {
337 #if GUEST_PAGING_LEVELS >= 3
338 #if GUEST_PAGING_LEVELS >= 4
339 if ( gw->l3e != NULL ) sh_unmap_domain_page(gw->l3e);
340 #endif
341 if ( gw->l2e != NULL ) sh_unmap_domain_page(gw->l2e);
342 #endif
343 if ( gw->l1e != NULL ) sh_unmap_domain_page(gw->l1e);
344 #ifdef DEBUG
345 memset(gw, 0, sizeof(*gw));
346 #endif
347 }
350 /* Pretty-print the contents of a guest-walk */
351 static inline void print_gw(walk_t *gw)
352 {
353 SHADOW_PRINTK("GUEST WALK TO %#lx:\n", gw->va);
354 #if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
355 #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
356 SHADOW_PRINTK(" l4mfn=%" SH_PRI_mfn "\n", mfn_x(gw->l4mfn));
357 SHADOW_PRINTK(" l4e=%p\n", gw->l4e);
358 if ( gw->l4e )
359 SHADOW_PRINTK(" *l4e=%" SH_PRI_gpte "\n", gw->l4e->l4);
360 #endif /* PAE or 64... */
361 SHADOW_PRINTK(" l3mfn=%" SH_PRI_mfn "\n", mfn_x(gw->l3mfn));
362 SHADOW_PRINTK(" l3e=%p\n", gw->l3e);
363 if ( gw->l3e )
364 SHADOW_PRINTK(" *l3e=%" SH_PRI_gpte "\n", gw->l3e->l3);
365 #endif /* All levels... */
366 SHADOW_PRINTK(" l2mfn=%" SH_PRI_mfn "\n", mfn_x(gw->l2mfn));
367 SHADOW_PRINTK(" l2e=%p\n", gw->l2e);
368 if ( gw->l2e )
369 SHADOW_PRINTK(" *l2e=%" SH_PRI_gpte "\n", gw->l2e->l2);
370 SHADOW_PRINTK(" l1mfn=%" SH_PRI_mfn "\n", mfn_x(gw->l1mfn));
371 SHADOW_PRINTK(" l1e=%p\n", gw->l1e);
372 if ( gw->l1e )
373 SHADOW_PRINTK(" *l1e=%" SH_PRI_gpte "\n", gw->l1e->l1);
374 SHADOW_PRINTK(" eff_l1e=%" SH_PRI_gpte "\n", gw->eff_l1e.l1);
375 }
378 #if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES
379 /* Lightweight audit: pass all the shadows associated with this guest walk
380 * through the audit mechanisms */
381 static void sh_audit_gw(struct vcpu *v, walk_t *gw)
382 {
383 mfn_t smfn;
385 if ( !(SHADOW_AUDIT_ENABLE) )
386 return;
388 #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
389 if ( valid_mfn(gw->l4mfn)
390 && valid_mfn((smfn = get_shadow_status(v, gw->l4mfn,
391 SH_type_l4_shadow))) )
392 (void) sh_audit_l4_table(v, smfn, _mfn(INVALID_MFN));
393 if ( valid_mfn(gw->l3mfn)
394 && valid_mfn((smfn = get_shadow_status(v, gw->l3mfn,
395 SH_type_l3_shadow))) )
396 (void) sh_audit_l3_table(v, smfn, _mfn(INVALID_MFN));
397 #endif /* PAE or 64... */
398 if ( valid_mfn(gw->l2mfn) )
399 {
400 if ( valid_mfn((smfn = get_shadow_status(v, gw->l2mfn,
401 SH_type_l2_shadow))) )
402 (void) sh_audit_l2_table(v, smfn, _mfn(INVALID_MFN));
403 #if GUEST_PAGING_LEVELS == 3
404 if ( valid_mfn((smfn = get_shadow_status(v, gw->l2mfn,
405 SH_type_l2h_shadow))) )
406 (void) sh_audit_l2_table(v, smfn, _mfn(INVALID_MFN));
407 #endif
408 }
409 if ( valid_mfn(gw->l1mfn)
410 && valid_mfn((smfn = get_shadow_status(v, gw->l1mfn,
411 SH_type_l1_shadow))) )
412 (void) sh_audit_l1_table(v, smfn, _mfn(INVALID_MFN));
413 else if ( gw->l2e
414 && (guest_l2e_get_flags(*gw->l2e) & _PAGE_PSE)
415 && valid_mfn(
416 (smfn = get_fl1_shadow_status(v, guest_l2e_get_gfn(*gw->l2e)))) )
417 (void) sh_audit_fl1_table(v, smfn, _mfn(INVALID_MFN));
418 }
420 #else
421 #define sh_audit_gw(_v, _gw) do {} while(0)
422 #endif /* audit code */
426 /**************************************************************************/
427 /* Function to write to the guest tables, for propagating accessed and
428 * dirty bits from the shadow to the guest.
429 * Takes a guest mfn, a pointer to the guest entry, the level of pagetable,
430 * and an operation type. The guest entry is always passed as an l1e:
431 * since we only ever write flags, that's OK.
432 * Returns the new flag bits of the guest entry. */
434 static u32 guest_set_ad_bits(struct vcpu *v,
435 mfn_t gmfn,
436 guest_l1e_t *ep,
437 unsigned int level,
438 fetch_type_t ft)
439 {
440 u32 flags;
441 int res = 0;
443 ASSERT(ep && !(((unsigned long)ep) & ((sizeof *ep) - 1)));
444 ASSERT(level <= GUEST_PAGING_LEVELS);
445 ASSERT(shadow_lock_is_acquired(v->domain));
447 flags = guest_l1e_get_flags(*ep);
449 /* Only set A and D bits for guest-initiated accesses */
450 if ( !(ft & FETCH_TYPE_DEMAND) )
451 return flags;
453 ASSERT(valid_mfn(gmfn)
454 && (sh_mfn_is_a_page_table(gmfn)
455 || ((mfn_to_page(gmfn)->u.inuse.type_info & PGT_count_mask)
456 == 0)));
458 /* PAE l3s do not have A and D bits */
459 ASSERT(GUEST_PAGING_LEVELS > 3 || level != 3);
461 /* Need the D bit as well for writes, in L1es and PSE L2es. */
462 if ( ft == ft_demand_write
463 && (level == 1 ||
464 (level == 2 && (flags & _PAGE_PSE) && guest_supports_superpages(v))) )
465 {
466 if ( (flags & (_PAGE_DIRTY | _PAGE_ACCESSED))
467 == (_PAGE_DIRTY | _PAGE_ACCESSED) )
468 return flags; /* Guest already has A and D bits set */
469 flags |= _PAGE_DIRTY | _PAGE_ACCESSED;
470 perfc_incrc(shadow_ad_update);
471 }
472 else
473 {
474 if ( flags & _PAGE_ACCESSED )
475 return flags; /* Guest already has A bit set */
476 flags |= _PAGE_ACCESSED;
477 perfc_incrc(shadow_a_update);
478 }
480 /* Set the bit(s) */
481 sh_mark_dirty(v->domain, gmfn);
482 SHADOW_DEBUG(A_AND_D, "gfn = %" SH_PRI_gfn ", "
483 "old flags = %#x, new flags = %#x\n",
484 gfn_x(guest_l1e_get_gfn(*ep)), guest_l1e_get_flags(*ep),
485 flags);
486 *ep = guest_l1e_from_gfn(guest_l1e_get_gfn(*ep), flags);
488 /* Propagate this change to any other shadows of the page
489 * (only necessary if there is more than one shadow) */
490 if ( mfn_to_page(gmfn)->count_info & PGC_page_table )
491 {
492 u32 shflags = mfn_to_page(gmfn)->shadow_flags & SHF_page_type_mask;
493 /* More than one type bit set in shadow-flags? */
494 if ( shflags & ~(1UL << find_first_set_bit(shflags)) )
495 res = __shadow_validate_guest_entry(v, gmfn, ep, sizeof(*ep));
496 }
498 /* We should never need to flush the TLB or recopy PAE entries */
499 ASSERT((res == 0) || (res == SHADOW_SET_CHANGED));
501 return flags;
502 }
504 #if (CONFIG_PAGING_LEVELS == GUEST_PAGING_LEVELS) && (CONFIG_PAGING_LEVELS == SHADOW_PAGING_LEVELS)
505 void *
506 sh_guest_map_l1e(struct vcpu *v, unsigned long addr,
507 unsigned long *gl1mfn)
508 {
509 void *pl1e = NULL;
510 walk_t gw;
512 ASSERT(shadow_mode_translate(v->domain));
514 // XXX -- this is expensive, but it's easy to cobble together...
515 // FIXME!
517 shadow_lock(v->domain);
518 guest_walk_tables(v, addr, &gw, 1);
520 if ( gw.l2e &&
521 (guest_l2e_get_flags(*gw.l2e) & _PAGE_PRESENT) &&
522 !(guest_supports_superpages(v) && (guest_l2e_get_flags(*gw.l2e) & _PAGE_PSE)) )
523 {
524 if ( gl1mfn )
525 *gl1mfn = mfn_x(gw.l1mfn);
526 pl1e = map_domain_page(mfn_x(gw.l1mfn)) +
527 (guest_l1_table_offset(addr) * sizeof(guest_l1e_t));
528 }
530 unmap_walk(v, &gw);
531 shadow_unlock(v->domain);
533 return pl1e;
534 }
536 void
537 sh_guest_get_eff_l1e(struct vcpu *v, unsigned long addr, void *eff_l1e)
538 {
539 walk_t gw;
541 ASSERT(shadow_mode_translate(v->domain));
543 // XXX -- this is expensive, but it's easy to cobble together...
544 // FIXME!
546 shadow_lock(v->domain);
547 guest_walk_tables(v, addr, &gw, 1);
548 *(guest_l1e_t *)eff_l1e = gw.eff_l1e;
549 unmap_walk(v, &gw);
550 shadow_unlock(v->domain);
551 }
552 #endif /* CONFIG==SHADOW==GUEST */
554 /**************************************************************************/
555 /* Functions to compute the correct index into a shadow page, given an
556 * index into the guest page (as returned by guest_get_index()).
557 * This is trivial when the shadow and guest use the same sized PTEs, but
558 * gets more interesting when those sizes are mismatched (e.g. 32-bit guest,
559 * PAE- or 64-bit shadows).
560 *
561 * These functions also increment the shadow mfn, when necessary. When PTE
562 * sizes are mismatched, it takes 2 shadow L1 pages for a single guest L1
563 * page. In this case, we allocate 2 contiguous pages for the shadow L1, and
564 * use simple pointer arithmetic on a pointer to the guest L1e to figure out
565 * which shadow page we really want. Similarly, when PTE sizes are
566 * mismatched, we shadow a guest L2 page with 4 shadow L2 pages. (The easiest
567 * way to see this is: a 32-bit guest L2 page maps 4GB of virtual address
568 * space, while a PAE- or 64-bit shadow L2 page maps 1GB of virtual address
569 * space.)
570 *
571 * For PAE guests, for every 32-bytes of guest L3 page table, we use 64-bytes
572 * of shadow (to store both the shadow, and the info that would normally be
573 * stored in page_info fields). This arrangement allows the shadow and the
574 * "page_info" fields to always be stored in the same page (in fact, in
575 * the same cache line), avoiding an extra call to map_domain_page().
576 */
578 static inline u32
579 guest_index(void *ptr)
580 {
581 return (u32)((unsigned long)ptr & ~PAGE_MASK) / sizeof(guest_l1e_t);
582 }
584 static inline u32
585 shadow_l1_index(mfn_t *smfn, u32 guest_index)
586 {
587 #if (GUEST_PAGING_LEVELS == 2) && (SHADOW_PAGING_LEVELS != 2)
588 *smfn = _mfn(mfn_x(*smfn) +
589 (guest_index / SHADOW_L1_PAGETABLE_ENTRIES));
590 return (guest_index % SHADOW_L1_PAGETABLE_ENTRIES);
591 #else
592 return guest_index;
593 #endif
594 }
596 static inline u32
597 shadow_l2_index(mfn_t *smfn, u32 guest_index)
598 {
599 #if (GUEST_PAGING_LEVELS == 2) && (SHADOW_PAGING_LEVELS != 2)
600 // Because we use 2 shadow l2 entries for each guest entry, the number of
601 // guest entries per shadow page is SHADOW_L2_PAGETABLE_ENTRIES/2
602 //
603 *smfn = _mfn(mfn_x(*smfn) +
604 (guest_index / (SHADOW_L2_PAGETABLE_ENTRIES / 2)));
606 // We multiple by two to get the index of the first of the two entries
607 // used to shadow the specified guest entry.
608 return (guest_index % (SHADOW_L2_PAGETABLE_ENTRIES / 2)) * 2;
609 #else
610 return guest_index;
611 #endif
612 }
614 #if GUEST_PAGING_LEVELS >= 4
616 static inline u32
617 shadow_l3_index(mfn_t *smfn, u32 guest_index)
618 {
619 return guest_index;
620 }
622 static inline u32
623 shadow_l4_index(mfn_t *smfn, u32 guest_index)
624 {
625 return guest_index;
626 }
628 #endif // GUEST_PAGING_LEVELS >= 4
631 /**************************************************************************/
632 /* Function which computes shadow entries from their corresponding guest
633 * entries. This is the "heart" of the shadow code. It operates using
634 * level-1 shadow types, but handles all levels of entry.
635 * Don't call it directly, but use the four wrappers below.
636 */
638 static always_inline void
639 _sh_propagate(struct vcpu *v,
640 void *guest_entry_ptr,
641 mfn_t guest_table_mfn,
642 mfn_t target_mfn,
643 void *shadow_entry_ptr,
644 int level,
645 fetch_type_t ft,
646 int mmio)
647 {
648 guest_l1e_t *gp = guest_entry_ptr;
649 shadow_l1e_t *sp = shadow_entry_ptr;
650 struct domain *d = v->domain;
651 u32 pass_thru_flags;
652 u32 gflags, sflags;
654 /* We don't shadow PAE l3s */
655 ASSERT(GUEST_PAGING_LEVELS > 3 || level != 3);
657 if ( valid_mfn(guest_table_mfn) )
658 /* Handle A and D bit propagation into the guest */
659 gflags = guest_set_ad_bits(v, guest_table_mfn, gp, level, ft);
660 else
661 {
662 /* Must be an fl1e or a prefetch */
663 ASSERT(level==1 || !(ft & FETCH_TYPE_DEMAND));
664 gflags = guest_l1e_get_flags(*gp);
665 }
667 if ( unlikely(!(gflags & _PAGE_PRESENT)) )
668 {
669 /* If a guest l1 entry is not present, shadow with the magic
670 * guest-not-present entry. */
671 if ( level == 1 )
672 *sp = sh_l1e_gnp();
673 else
674 *sp = shadow_l1e_empty();
675 goto done;
676 }
678 if ( level == 1 && mmio )
679 {
680 /* Guest l1e maps MMIO space */
681 *sp = sh_l1e_mmio(guest_l1e_get_gfn(*gp), gflags);
682 goto done;
683 }
685 // Must have a valid target_mfn, unless this is a prefetch. In the
686 // case of a prefetch, an invalid mfn means that we can not usefully
687 // shadow anything, and so we return early.
688 //
689 if ( !valid_mfn(target_mfn) )
690 {
691 ASSERT((ft == ft_prefetch));
692 *sp = shadow_l1e_empty();
693 goto done;
694 }
696 // Propagate bits from the guest to the shadow.
697 // Some of these may be overwritten, below.
698 // Since we know the guest's PRESENT bit is set, we also set the shadow's
699 // SHADOW_PRESENT bit.
700 //
701 pass_thru_flags = (_PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_USER |
702 _PAGE_RW | _PAGE_PRESENT);
703 if ( guest_supports_nx(v) )
704 pass_thru_flags |= _PAGE_NX_BIT;
705 sflags = gflags & pass_thru_flags;
707 // Set the A&D bits for higher level shadows.
708 // Higher level entries do not, strictly speaking, have dirty bits, but
709 // since we use shadow linear tables, each of these entries may, at some
710 // point in time, also serve as a shadow L1 entry.
711 // By setting both the A&D bits in each of these, we eliminate the burden
712 // on the hardware to update these bits on initial accesses.
713 //
714 if ( (level > 1) && !((SHADOW_PAGING_LEVELS == 3) && (level == 3)) )
715 sflags |= _PAGE_ACCESSED | _PAGE_DIRTY;
717 // If the A or D bit has not yet been set in the guest, then we must
718 // prevent the corresponding kind of access.
719 //
720 if ( unlikely(!(gflags & _PAGE_ACCESSED)) )
721 sflags &= ~_PAGE_PRESENT;
723 /* D bits exist in L1es and PSE L2es */
724 if ( unlikely(((level == 1) ||
725 ((level == 2) &&
726 (gflags & _PAGE_PSE) &&
727 guest_supports_superpages(v)))
728 && !(gflags & _PAGE_DIRTY)) )
729 sflags &= ~_PAGE_RW;
731 // shadow_mode_log_dirty support
732 //
733 // Only allow the guest write access to a page a) on a demand fault,
734 // or b) if the page is already marked as dirty.
735 //
736 if ( unlikely((level == 1) && shadow_mode_log_dirty(d)) )
737 {
738 if ( ft & FETCH_TYPE_WRITE )
739 sh_mark_dirty(d, target_mfn);
740 else if ( !sh_mfn_is_dirty(d, target_mfn) )
741 sflags &= ~_PAGE_RW;
742 }
744 // protect guest page tables
745 //
746 if ( unlikely((level == 1) && sh_mfn_is_a_page_table(target_mfn)) )
747 {
748 if ( shadow_mode_trap_reads(d) )
749 {
750 // if we are trapping both reads & writes, then mark this page
751 // as not present...
752 //
753 sflags &= ~_PAGE_PRESENT;
754 }
755 else
756 {
757 // otherwise, just prevent any writes...
758 //
759 sflags &= ~_PAGE_RW;
760 }
761 }
763 // PV guests in 64-bit mode use two different page tables for user vs
764 // supervisor permissions, making the guest's _PAGE_USER bit irrelevant.
765 // It is always shadowed as present...
766 if ( (GUEST_PAGING_LEVELS == 4) && !is_hvm_domain(d) )
767 {
768 sflags |= _PAGE_USER;
769 }
771 *sp = shadow_l1e_from_mfn(target_mfn, sflags);
772 done:
773 SHADOW_DEBUG(PROPAGATE,
774 "%s level %u guest %" SH_PRI_gpte " shadow %" SH_PRI_pte "\n",
775 fetch_type_names[ft], level, gp->l1, sp->l1);
776 }
779 /* These four wrappers give us a little bit of type-safety back around the
780 * use of void-* pointers in _sh_propagate(), and allow the compiler to
781 * optimize out some level checks. */
783 #if GUEST_PAGING_LEVELS >= 4
784 static void
785 l4e_propagate_from_guest(struct vcpu *v,
786 guest_l4e_t *gl4e,
787 mfn_t gl4mfn,
788 mfn_t sl3mfn,
789 shadow_l4e_t *sl4e,
790 fetch_type_t ft)
791 {
792 _sh_propagate(v, gl4e, gl4mfn, sl3mfn, sl4e, 4, ft, 0);
793 }
795 static void
796 l3e_propagate_from_guest(struct vcpu *v,
797 guest_l3e_t *gl3e,
798 mfn_t gl3mfn,
799 mfn_t sl2mfn,
800 shadow_l3e_t *sl3e,
801 fetch_type_t ft)
802 {
803 _sh_propagate(v, gl3e, gl3mfn, sl2mfn, sl3e, 3, ft, 0);
804 }
805 #endif // GUEST_PAGING_LEVELS >= 4
807 static void
808 l2e_propagate_from_guest(struct vcpu *v,
809 guest_l2e_t *gl2e,
810 mfn_t gl2mfn,
811 mfn_t sl1mfn,
812 shadow_l2e_t *sl2e,
813 fetch_type_t ft)
814 {
815 _sh_propagate(v, gl2e, gl2mfn, sl1mfn, sl2e, 2, ft, 0);
816 }
818 static void
819 l1e_propagate_from_guest(struct vcpu *v,
820 guest_l1e_t *gl1e,
821 mfn_t gl1mfn,
822 mfn_t gmfn,
823 shadow_l1e_t *sl1e,
824 fetch_type_t ft,
825 int mmio)
826 {
827 _sh_propagate(v, gl1e, gl1mfn, gmfn, sl1e, 1, ft, mmio);
828 }
831 /**************************************************************************/
832 /* These functions update shadow entries (and do bookkeeping on the shadow
833 * tables they are in). It is intended that they are the only
834 * functions which ever write (non-zero) data onto a shadow page.
835 */
837 static inline void safe_write_entry(void *dst, void *src)
838 /* Copy one PTE safely when processors might be running on the
839 * destination pagetable. This does *not* give safety against
840 * concurrent writes (that's what the shadow lock is for), just
841 * stops the hardware picking up partially written entries. */
842 {
843 volatile unsigned long *d = dst;
844 unsigned long *s = src;
845 ASSERT(!((unsigned long) d & (sizeof (shadow_l1e_t) - 1)));
846 #if CONFIG_PAGING_LEVELS == 3
847 /* In PAE mode, pagetable entries are larger
848 * than machine words, so won't get written atomically. We need to make
849 * sure any other cpu running on these shadows doesn't see a
850 * half-written entry. Do this by marking the entry not-present first,
851 * then writing the high word before the low word. */
852 BUILD_BUG_ON(sizeof (shadow_l1e_t) != 2 * sizeof (unsigned long));
853 d[0] = 0;
854 d[1] = s[1];
855 d[0] = s[0];
856 #else
857 /* In 32-bit and 64-bit, sizeof(pte) == sizeof(ulong) == 1 word,
858 * which will be an atomic write, since the entry is aligned. */
859 BUILD_BUG_ON(sizeof (shadow_l1e_t) != sizeof (unsigned long));
860 *d = *s;
861 #endif
862 }
865 static inline void
866 shadow_write_entries(void *d, void *s, int entries, mfn_t mfn)
867 /* This function does the actual writes to shadow pages.
868 * It must not be called directly, since it doesn't do the bookkeeping
869 * that shadow_set_l*e() functions do. */
870 {
871 shadow_l1e_t *dst = d;
872 shadow_l1e_t *src = s;
873 void *map = NULL;
874 int i;
876 /* Because we mirror access rights at all levels in the shadow, an
877 * l2 (or higher) entry with the RW bit cleared will leave us with
878 * no write access through the linear map.
879 * We detect that by writing to the shadow with copy_to_user() and
880 * using map_domain_page() to get a writeable mapping if we need to. */
881 if ( __copy_to_user(d, d, sizeof (unsigned long)) != 0 )
882 {
883 perfc_incrc(shadow_linear_map_failed);
884 map = sh_map_domain_page(mfn);
885 ASSERT(map != NULL);
886 dst = map + ((unsigned long)dst & (PAGE_SIZE - 1));
887 }
890 for ( i = 0; i < entries; i++ )
891 safe_write_entry(dst++, src++);
893 if ( map != NULL ) sh_unmap_domain_page(map);
894 }
896 static inline int
897 perms_strictly_increased(u32 old_flags, u32 new_flags)
898 /* Given the flags of two entries, are the new flags a strict
899 * increase in rights over the old ones? */
900 {
901 u32 of = old_flags & (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX);
902 u32 nf = new_flags & (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX);
903 /* Flip the NX bit, since it's the only one that decreases rights;
904 * we calculate as if it were an "X" bit. */
905 of ^= _PAGE_NX_BIT;
906 nf ^= _PAGE_NX_BIT;
907 /* If the changed bits are all set in the new flags, then rights strictly
908 * increased between old and new. */
909 return ((of | (of ^ nf)) == nf);
910 }
912 static int inline
913 shadow_get_page_from_l1e(shadow_l1e_t sl1e, struct domain *d)
914 {
915 int res;
916 mfn_t mfn;
917 struct domain *owner;
919 ASSERT(!sh_l1e_is_magic(sl1e));
921 if ( !shadow_mode_refcounts(d) )
922 return 1;
924 res = get_page_from_l1e(sl1e, d);
926 // If a privileged domain is attempting to install a map of a page it does
927 // not own, we let it succeed anyway.
928 //
929 if ( unlikely(!res) &&
930 IS_PRIV(d) &&
931 !shadow_mode_translate(d) &&
932 valid_mfn(mfn = shadow_l1e_get_mfn(sl1e)) &&
933 (owner = page_get_owner(mfn_to_page(mfn))) &&
934 (d != owner) )
935 {
936 res = get_page_from_l1e(sl1e, owner);
937 SHADOW_PRINTK("privileged domain %d installs map of mfn %05lx "
938 "which is owned by domain %d: %s\n",
939 d->domain_id, mfn_x(mfn), owner->domain_id,
940 res ? "success" : "failed");
941 }
943 if ( unlikely(!res) )
944 {
945 perfc_incrc(shadow_get_page_fail);
946 SHADOW_PRINTK("failed: l1e=" SH_PRI_pte "\n");
947 }
949 return res;
950 }
952 static void inline
953 shadow_put_page_from_l1e(shadow_l1e_t sl1e, struct domain *d)
954 {
955 if ( !shadow_mode_refcounts(d) )
956 return;
958 put_page_from_l1e(sl1e, d);
959 }
961 #if GUEST_PAGING_LEVELS >= 4
962 static int shadow_set_l4e(struct vcpu *v,
963 shadow_l4e_t *sl4e,
964 shadow_l4e_t new_sl4e,
965 mfn_t sl4mfn)
966 {
967 int flags = 0, ok;
968 shadow_l4e_t old_sl4e;
969 paddr_t paddr;
970 ASSERT(sl4e != NULL);
971 old_sl4e = *sl4e;
973 if ( old_sl4e.l4 == new_sl4e.l4 ) return 0; /* Nothing to do */
975 paddr = ((((paddr_t)mfn_x(sl4mfn)) << PAGE_SHIFT)
976 | (((unsigned long)sl4e) & ~PAGE_MASK));
978 if ( shadow_l4e_get_flags(new_sl4e) & _PAGE_PRESENT )
979 {
980 /* About to install a new reference */
981 mfn_t sl3mfn = shadow_l4e_get_mfn(new_sl4e);
982 ok = sh_get_ref(v, sl3mfn, paddr);
983 /* Are we pinning l3 shadows to handle wierd linux behaviour? */
984 if ( sh_type_is_pinnable(v, SH_type_l3_64_shadow) )
985 ok |= sh_pin(v, sl3mfn);
986 if ( !ok )
987 {
988 domain_crash(v->domain);
989 return SHADOW_SET_ERROR;
990 }
991 }
993 /* Write the new entry */
994 shadow_write_entries(sl4e, &new_sl4e, 1, sl4mfn);
995 flags |= SHADOW_SET_CHANGED;
997 if ( shadow_l4e_get_flags(old_sl4e) & _PAGE_PRESENT )
998 {
999 /* We lost a reference to an old mfn. */
1000 mfn_t osl3mfn = shadow_l4e_get_mfn(old_sl4e);
1001 if ( (mfn_x(osl3mfn) != mfn_x(shadow_l4e_get_mfn(new_sl4e)))
1002 || !perms_strictly_increased(shadow_l4e_get_flags(old_sl4e),
1003 shadow_l4e_get_flags(new_sl4e)) )
1005 flags |= SHADOW_SET_FLUSH;
1007 sh_put_ref(v, osl3mfn, paddr);
1009 return flags;
1012 static int shadow_set_l3e(struct vcpu *v,
1013 shadow_l3e_t *sl3e,
1014 shadow_l3e_t new_sl3e,
1015 mfn_t sl3mfn)
1017 int flags = 0;
1018 shadow_l3e_t old_sl3e;
1019 paddr_t paddr;
1020 ASSERT(sl3e != NULL);
1021 old_sl3e = *sl3e;
1023 if ( old_sl3e.l3 == new_sl3e.l3 ) return 0; /* Nothing to do */
1025 paddr = ((((paddr_t)mfn_x(sl3mfn)) << PAGE_SHIFT)
1026 | (((unsigned long)sl3e) & ~PAGE_MASK));
1028 if ( shadow_l3e_get_flags(new_sl3e) & _PAGE_PRESENT )
1029 /* About to install a new reference */
1030 if ( !sh_get_ref(v, shadow_l3e_get_mfn(new_sl3e), paddr) )
1032 domain_crash(v->domain);
1033 return SHADOW_SET_ERROR;
1036 /* Write the new entry */
1037 shadow_write_entries(sl3e, &new_sl3e, 1, sl3mfn);
1038 flags |= SHADOW_SET_CHANGED;
1040 if ( shadow_l3e_get_flags(old_sl3e) & _PAGE_PRESENT )
1042 /* We lost a reference to an old mfn. */
1043 mfn_t osl2mfn = shadow_l3e_get_mfn(old_sl3e);
1044 if ( (mfn_x(osl2mfn) != mfn_x(shadow_l3e_get_mfn(new_sl3e))) ||
1045 !perms_strictly_increased(shadow_l3e_get_flags(old_sl3e),
1046 shadow_l3e_get_flags(new_sl3e)) )
1048 flags |= SHADOW_SET_FLUSH;
1050 sh_put_ref(v, osl2mfn, paddr);
1052 return flags;
1054 #endif /* GUEST_PAGING_LEVELS >= 4 */
1056 static int shadow_set_l2e(struct vcpu *v,
1057 shadow_l2e_t *sl2e,
1058 shadow_l2e_t new_sl2e,
1059 mfn_t sl2mfn)
1061 int flags = 0;
1062 shadow_l2e_t old_sl2e;
1063 paddr_t paddr;
1065 #if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2
1066 /* In 2-on-3 we work with pairs of l2es pointing at two-page
1067 * shadows. Reference counting and up-pointers track from the first
1068 * page of the shadow to the first l2e, so make sure that we're
1069 * working with those:
1070 * Align the pointer down so it's pointing at the first of the pair */
1071 sl2e = (shadow_l2e_t *)((unsigned long)sl2e & ~(sizeof(shadow_l2e_t)));
1072 /* Align the mfn of the shadow entry too */
1073 new_sl2e.l2 &= ~(1<<PAGE_SHIFT);
1074 #endif
1076 ASSERT(sl2e != NULL);
1077 old_sl2e = *sl2e;
1079 if ( old_sl2e.l2 == new_sl2e.l2 ) return 0; /* Nothing to do */
1081 paddr = ((((paddr_t)mfn_x(sl2mfn)) << PAGE_SHIFT)
1082 | (((unsigned long)sl2e) & ~PAGE_MASK));
1084 if ( shadow_l2e_get_flags(new_sl2e) & _PAGE_PRESENT )
1085 /* About to install a new reference */
1086 if ( !sh_get_ref(v, shadow_l2e_get_mfn(new_sl2e), paddr) )
1088 domain_crash(v->domain);
1089 return SHADOW_SET_ERROR;
1092 /* Write the new entry */
1093 #if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2
1095 shadow_l2e_t pair[2] = { new_sl2e, new_sl2e };
1096 /* The l1 shadow is two pages long and need to be pointed to by
1097 * two adjacent l1es. The pair have the same flags, but point
1098 * at odd and even MFNs */
1099 ASSERT(!(pair[0].l2 & (1<<PAGE_SHIFT)));
1100 pair[1].l2 |= (1<<PAGE_SHIFT);
1101 shadow_write_entries(sl2e, &pair, 2, sl2mfn);
1103 #else /* normal case */
1104 shadow_write_entries(sl2e, &new_sl2e, 1, sl2mfn);
1105 #endif
1106 flags |= SHADOW_SET_CHANGED;
1108 if ( shadow_l2e_get_flags(old_sl2e) & _PAGE_PRESENT )
1110 /* We lost a reference to an old mfn. */
1111 mfn_t osl1mfn = shadow_l2e_get_mfn(old_sl2e);
1112 if ( (mfn_x(osl1mfn) != mfn_x(shadow_l2e_get_mfn(new_sl2e))) ||
1113 !perms_strictly_increased(shadow_l2e_get_flags(old_sl2e),
1114 shadow_l2e_get_flags(new_sl2e)) )
1116 flags |= SHADOW_SET_FLUSH;
1118 sh_put_ref(v, osl1mfn, paddr);
1120 return flags;
1123 static int shadow_set_l1e(struct vcpu *v,
1124 shadow_l1e_t *sl1e,
1125 shadow_l1e_t new_sl1e,
1126 mfn_t sl1mfn)
1128 int flags = 0;
1129 struct domain *d = v->domain;
1130 shadow_l1e_t old_sl1e;
1131 ASSERT(sl1e != NULL);
1133 old_sl1e = *sl1e;
1135 if ( old_sl1e.l1 == new_sl1e.l1 ) return 0; /* Nothing to do */
1137 if ( (shadow_l1e_get_flags(new_sl1e) & _PAGE_PRESENT)
1138 && !sh_l1e_is_magic(new_sl1e) )
1140 /* About to install a new reference */
1141 if ( shadow_mode_refcounts(d) ) {
1142 if ( shadow_get_page_from_l1e(new_sl1e, d) == 0 )
1144 /* Doesn't look like a pagetable. */
1145 flags |= SHADOW_SET_ERROR;
1146 new_sl1e = shadow_l1e_empty();
1151 /* Write the new entry */
1152 shadow_write_entries(sl1e, &new_sl1e, 1, sl1mfn);
1153 flags |= SHADOW_SET_CHANGED;
1155 if ( (shadow_l1e_get_flags(old_sl1e) & _PAGE_PRESENT)
1156 && !sh_l1e_is_magic(old_sl1e) )
1158 /* We lost a reference to an old mfn. */
1159 /* N.B. Unlike higher-level sets, never need an extra flush
1160 * when writing an l1e. Because it points to the same guest frame
1161 * as the guest l1e did, it's the guest's responsibility to
1162 * trigger a flush later. */
1163 if ( shadow_mode_refcounts(d) )
1165 shadow_put_page_from_l1e(old_sl1e, d);
1168 return flags;
1172 /**************************************************************************/
1173 /* Macros to walk pagetables. These take the shadow of a pagetable and
1174 * walk every "interesting" entry. That is, they don't touch Xen mappings,
1175 * and for 32-bit l2s shadowed onto PAE or 64-bit, they only touch every
1176 * second entry (since pairs of entries are managed together). For multi-page
1177 * shadows they walk all pages.
1179 * Arguments are an MFN, the variable to point to each entry, a variable
1180 * to indicate that we are done (we will shortcut to the end of the scan
1181 * when _done != 0), a variable to indicate that we should avoid Xen mappings,
1182 * and the code.
1184 * WARNING: These macros have side-effects. They change the values of both
1185 * the pointer and the MFN. */
1187 static inline void increment_ptr_to_guest_entry(void *ptr)
1189 if ( ptr )
1191 guest_l1e_t **entry = ptr;
1192 (*entry)++;
1196 /* All kinds of l1: touch all entries */
1197 #define _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code) \
1198 do { \
1199 int _i; \
1200 shadow_l1e_t *_sp = map_shadow_page((_sl1mfn)); \
1201 ASSERT(mfn_to_shadow_page(_sl1mfn)->type == SH_type_l1_shadow \
1202 || mfn_to_shadow_page(_sl1mfn)->type == SH_type_fl1_shadow); \
1203 for ( _i = 0; _i < SHADOW_L1_PAGETABLE_ENTRIES; _i++ ) \
1204 { \
1205 (_sl1e) = _sp + _i; \
1206 if ( shadow_l1e_get_flags(*(_sl1e)) & _PAGE_PRESENT ) \
1207 {_code} \
1208 if ( _done ) break; \
1209 increment_ptr_to_guest_entry(_gl1p); \
1210 } \
1211 unmap_shadow_page(_sp); \
1212 } while (0)
1214 /* 32-bit l1, on PAE or 64-bit shadows: need to walk both pages of shadow */
1215 #if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2
1216 #define SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code) \
1217 do { \
1218 int __done = 0; \
1219 _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, \
1220 ({ (__done = _done); }), _code); \
1221 _sl1mfn = _mfn(mfn_x(_sl1mfn) + 1); \
1222 if ( !__done ) \
1223 _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, \
1224 ({ (__done = _done); }), _code); \
1225 } while (0)
1226 #else /* Everything else; l1 shadows are only one page */
1227 #define SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code) \
1228 _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code)
1229 #endif
1232 #if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2
1234 /* 32-bit l2 on PAE/64: four pages, touch every second entry, and avoid Xen */
1235 #define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _xen, _code) \
1236 do { \
1237 int _i, _j, __done = 0; \
1238 ASSERT(mfn_to_shadow_page(_sl2mfn)->type == SH_type_l2_32_shadow); \
1239 for ( _j = 0; _j < 4 && !__done; _j++ ) \
1240 { \
1241 shadow_l2e_t *_sp = map_shadow_page(_sl2mfn); \
1242 for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i += 2 ) \
1243 if ( (!(_xen)) \
1244 || ((_j * SHADOW_L2_PAGETABLE_ENTRIES) + _i) \
1245 < (HYPERVISOR_VIRT_START >> SHADOW_L2_PAGETABLE_SHIFT) ) \
1246 { \
1247 (_sl2e) = _sp + _i; \
1248 if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \
1249 {_code} \
1250 if ( (__done = (_done)) ) break; \
1251 increment_ptr_to_guest_entry(_gl2p); \
1252 } \
1253 unmap_shadow_page(_sp); \
1254 _sl2mfn = _mfn(mfn_x(_sl2mfn) + 1); \
1255 } \
1256 } while (0)
1258 #elif GUEST_PAGING_LEVELS == 2
1260 /* 32-bit on 32-bit: avoid Xen entries */
1261 #define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _xen, _code) \
1262 do { \
1263 int _i; \
1264 shadow_l2e_t *_sp = map_shadow_page((_sl2mfn)); \
1265 ASSERT(mfn_to_shadow_page(_sl2mfn)->type == SH_type_l2_32_shadow); \
1266 for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i++ ) \
1267 if ( (!(_xen)) \
1268 || \
1269 (_i < (HYPERVISOR_VIRT_START >> SHADOW_L2_PAGETABLE_SHIFT)) ) \
1270 { \
1271 (_sl2e) = _sp + _i; \
1272 if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \
1273 {_code} \
1274 if ( _done ) break; \
1275 increment_ptr_to_guest_entry(_gl2p); \
1276 } \
1277 unmap_shadow_page(_sp); \
1278 } while (0)
1280 #elif GUEST_PAGING_LEVELS == 3
1282 /* PAE: if it's an l2h, don't touch Xen mappings */
1283 #define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _xen, _code) \
1284 do { \
1285 int _i; \
1286 shadow_l2e_t *_sp = map_shadow_page((_sl2mfn)); \
1287 ASSERT(mfn_to_shadow_page(_sl2mfn)->type == SH_type_l2_pae_shadow \
1288 || mfn_to_shadow_page(_sl2mfn)->type == SH_type_l2h_pae_shadow);\
1289 for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i++ ) \
1290 if ( (!(_xen)) \
1291 || mfn_to_shadow_page(_sl2mfn)->type != SH_type_l2h_pae_shadow\
1292 || ((_i + (3 * SHADOW_L2_PAGETABLE_ENTRIES)) \
1293 < (HYPERVISOR_VIRT_START >> SHADOW_L2_PAGETABLE_SHIFT)) ) \
1294 { \
1295 (_sl2e) = _sp + _i; \
1296 if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \
1297 {_code} \
1298 if ( _done ) break; \
1299 increment_ptr_to_guest_entry(_gl2p); \
1300 } \
1301 unmap_shadow_page(_sp); \
1302 } while (0)
1304 #else
1306 /* 64-bit l2: touch all entries */
1307 #define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _xen, _code) \
1308 do { \
1309 int _i; \
1310 shadow_l2e_t *_sp = map_shadow_page((_sl2mfn)); \
1311 ASSERT(mfn_to_shadow_page(_sl2mfn)->type == SH_type_l2_64_shadow); \
1312 for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i++ ) \
1313 { \
1314 (_sl2e) = _sp + _i; \
1315 if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \
1316 {_code} \
1317 if ( _done ) break; \
1318 increment_ptr_to_guest_entry(_gl2p); \
1319 } \
1320 unmap_shadow_page(_sp); \
1321 } while (0)
1323 #endif /* different kinds of l2 */
1325 #if GUEST_PAGING_LEVELS == 4
1327 /* 64-bit l3: touch all entries */
1328 #define SHADOW_FOREACH_L3E(_sl3mfn, _sl3e, _gl3p, _done, _code) \
1329 do { \
1330 int _i; \
1331 shadow_l3e_t *_sp = map_shadow_page((_sl3mfn)); \
1332 ASSERT(mfn_to_shadow_page(_sl3mfn)->type == SH_type_l3_64_shadow); \
1333 for ( _i = 0; _i < SHADOW_L3_PAGETABLE_ENTRIES; _i++ ) \
1334 { \
1335 (_sl3e) = _sp + _i; \
1336 if ( shadow_l3e_get_flags(*(_sl3e)) & _PAGE_PRESENT ) \
1337 {_code} \
1338 if ( _done ) break; \
1339 increment_ptr_to_guest_entry(_gl3p); \
1340 } \
1341 unmap_shadow_page(_sp); \
1342 } while (0)
1344 /* 64-bit l4: avoid Xen mappings */
1345 #define SHADOW_FOREACH_L4E(_sl4mfn, _sl4e, _gl4p, _done, _xen, _code) \
1346 do { \
1347 int _i; \
1348 shadow_l4e_t *_sp = map_shadow_page((_sl4mfn)); \
1349 ASSERT(mfn_to_shadow_page(_sl4mfn)->type == SH_type_l4_64_shadow); \
1350 for ( _i = 0; _i < SHADOW_L4_PAGETABLE_ENTRIES; _i++ ) \
1351 { \
1352 if ( (!(_xen)) || is_guest_l4_slot(_i) ) \
1353 { \
1354 (_sl4e) = _sp + _i; \
1355 if ( shadow_l4e_get_flags(*(_sl4e)) & _PAGE_PRESENT ) \
1356 {_code} \
1357 if ( _done ) break; \
1358 } \
1359 increment_ptr_to_guest_entry(_gl4p); \
1360 } \
1361 unmap_shadow_page(_sp); \
1362 } while (0)
1364 #endif
1368 /**************************************************************************/
1369 /* Functions to install Xen mappings and linear mappings in shadow pages */
1371 // XXX -- this function should probably be moved to shadow-common.c, but that
1372 // probably wants to wait until the shadow types have been moved from
1373 // shadow-types.h to shadow-private.h
1374 //
1375 #if CONFIG_PAGING_LEVELS == 4 && GUEST_PAGING_LEVELS == 4
1376 void sh_install_xen_entries_in_l4(struct vcpu *v, mfn_t gl4mfn, mfn_t sl4mfn)
1378 struct domain *d = v->domain;
1379 shadow_l4e_t *sl4e;
1381 sl4e = sh_map_domain_page(sl4mfn);
1382 ASSERT(sl4e != NULL);
1383 ASSERT(sizeof (l4_pgentry_t) == sizeof (shadow_l4e_t));
1385 /* Copy the common Xen mappings from the idle domain */
1386 memcpy(&sl4e[ROOT_PAGETABLE_FIRST_XEN_SLOT],
1387 &idle_pg_table[ROOT_PAGETABLE_FIRST_XEN_SLOT],
1388 ROOT_PAGETABLE_XEN_SLOTS * sizeof(l4_pgentry_t));
1390 /* Install the per-domain mappings for this domain */
1391 sl4e[shadow_l4_table_offset(PERDOMAIN_VIRT_START)] =
1392 shadow_l4e_from_mfn(page_to_mfn(virt_to_page(d->arch.mm_perdomain_l3)),
1393 __PAGE_HYPERVISOR);
1395 /* Linear mapping */
1396 sl4e[shadow_l4_table_offset(SH_LINEAR_PT_VIRT_START)] =
1397 shadow_l4e_from_mfn(sl4mfn, __PAGE_HYPERVISOR);
1399 if ( shadow_mode_translate(v->domain) && !shadow_mode_external(v->domain) )
1401 // linear tables may not be used with translated PV guests
1402 sl4e[shadow_l4_table_offset(LINEAR_PT_VIRT_START)] =
1403 shadow_l4e_empty();
1405 else
1407 sl4e[shadow_l4_table_offset(LINEAR_PT_VIRT_START)] =
1408 shadow_l4e_from_mfn(gl4mfn, __PAGE_HYPERVISOR);
1411 if ( shadow_mode_translate(v->domain) )
1413 /* install domain-specific P2M table */
1414 sl4e[shadow_l4_table_offset(RO_MPT_VIRT_START)] =
1415 shadow_l4e_from_mfn(pagetable_get_mfn(d->arch.phys_table),
1416 __PAGE_HYPERVISOR);
1419 sh_unmap_domain_page(sl4e);
1421 #endif
1423 #if CONFIG_PAGING_LEVELS == 3 && GUEST_PAGING_LEVELS == 3
1424 // For 3-on-3 PV guests, we need to make sure the xen mappings are in
1425 // place, which means that we need to populate the l2h entry in the l3
1426 // table.
1428 void sh_install_xen_entries_in_l2h(struct vcpu *v,
1429 mfn_t sl2hmfn)
1431 struct domain *d = v->domain;
1432 shadow_l2e_t *sl2e;
1433 int i;
1435 sl2e = sh_map_domain_page(sl2hmfn);
1436 ASSERT(sl2e != NULL);
1437 ASSERT(sizeof (l2_pgentry_t) == sizeof (shadow_l2e_t));
1439 /* Copy the common Xen mappings from the idle domain */
1440 memcpy(&sl2e[L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1)],
1441 &idle_pg_table_l2[L2_PAGETABLE_FIRST_XEN_SLOT],
1442 L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
1444 /* Install the per-domain mappings for this domain */
1445 for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
1446 sl2e[shadow_l2_table_offset(PERDOMAIN_VIRT_START) + i] =
1447 shadow_l2e_from_mfn(
1448 page_to_mfn(virt_to_page(d->arch.mm_perdomain_pt) + i),
1449 __PAGE_HYPERVISOR);
1451 /* We don't set up a linear mapping here because we can't until this
1452 * l2h is installed in an l3e. sh_update_linear_entries() handles
1453 * the linear mappings when CR3 (and so the fourth l3e) is loaded.
1454 * We zero them here, just as a safety measure.
1455 */
1456 for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
1457 sl2e[shadow_l2_table_offset(LINEAR_PT_VIRT_START) + i] =
1458 shadow_l2e_empty();
1459 for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
1460 sl2e[shadow_l2_table_offset(SH_LINEAR_PT_VIRT_START) + i] =
1461 shadow_l2e_empty();
1463 if ( shadow_mode_translate(d) )
1465 /* Install the domain-specific p2m table */
1466 l3_pgentry_t *p2m;
1467 ASSERT(pagetable_get_pfn(d->arch.phys_table) != 0);
1468 p2m = sh_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
1469 for ( i = 0; i < MACHPHYS_MBYTES>>1; i++ )
1471 sl2e[shadow_l2_table_offset(RO_MPT_VIRT_START) + i] =
1472 (l3e_get_flags(p2m[i]) & _PAGE_PRESENT)
1473 ? shadow_l2e_from_mfn(_mfn(l3e_get_pfn(p2m[i])),
1474 __PAGE_HYPERVISOR)
1475 : shadow_l2e_empty();
1477 sh_unmap_domain_page(p2m);
1480 sh_unmap_domain_page(sl2e);
1482 #endif
1485 #if CONFIG_PAGING_LEVELS == 2 && GUEST_PAGING_LEVELS == 2
1486 void sh_install_xen_entries_in_l2(struct vcpu *v, mfn_t gl2mfn, mfn_t sl2mfn)
1488 struct domain *d = v->domain;
1489 shadow_l2e_t *sl2e;
1490 int i;
1492 sl2e = sh_map_domain_page(sl2mfn);
1493 ASSERT(sl2e != NULL);
1494 ASSERT(sizeof (l2_pgentry_t) == sizeof (shadow_l2e_t));
1496 /* Copy the common Xen mappings from the idle domain */
1497 memcpy(&sl2e[L2_PAGETABLE_FIRST_XEN_SLOT],
1498 &idle_pg_table[L2_PAGETABLE_FIRST_XEN_SLOT],
1499 L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
1501 /* Install the per-domain mappings for this domain */
1502 for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
1503 sl2e[shadow_l2_table_offset(PERDOMAIN_VIRT_START) + i] =
1504 shadow_l2e_from_mfn(
1505 page_to_mfn(virt_to_page(d->arch.mm_perdomain_pt) + i),
1506 __PAGE_HYPERVISOR);
1508 /* Linear mapping */
1509 sl2e[shadow_l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
1510 shadow_l2e_from_mfn(sl2mfn, __PAGE_HYPERVISOR);
1512 if ( shadow_mode_translate(v->domain) && !shadow_mode_external(v->domain) )
1514 // linear tables may not be used with translated PV guests
1515 sl2e[shadow_l2_table_offset(LINEAR_PT_VIRT_START)] =
1516 shadow_l2e_empty();
1518 else
1520 sl2e[shadow_l2_table_offset(LINEAR_PT_VIRT_START)] =
1521 shadow_l2e_from_mfn(gl2mfn, __PAGE_HYPERVISOR);
1524 if ( shadow_mode_translate(d) )
1526 /* install domain-specific P2M table */
1527 sl2e[shadow_l2_table_offset(RO_MPT_VIRT_START)] =
1528 shadow_l2e_from_mfn(pagetable_get_mfn(d->arch.phys_table),
1529 __PAGE_HYPERVISOR);
1532 sh_unmap_domain_page(sl2e);
1534 #endif
1538 /**************************************************************************/
1539 /* Create a shadow of a given guest page.
1540 */
1541 static mfn_t
1542 sh_make_shadow(struct vcpu *v, mfn_t gmfn, u32 shadow_type)
1544 mfn_t smfn = shadow_alloc(v->domain, shadow_type, mfn_x(gmfn));
1545 SHADOW_DEBUG(MAKE_SHADOW, "(%05lx, %u)=>%05lx\n",
1546 mfn_x(gmfn), shadow_type, mfn_x(smfn));
1548 if ( shadow_type != SH_type_l2_32_shadow
1549 && shadow_type != SH_type_l2_pae_shadow
1550 && shadow_type != SH_type_l2h_pae_shadow
1551 && shadow_type != SH_type_l4_64_shadow )
1552 /* Lower-level shadow, not yet linked form a higher level */
1553 mfn_to_shadow_page(smfn)->up = 0;
1555 #if GUEST_PAGING_LEVELS == 4
1556 #if (SHADOW_OPTIMIZATIONS & SHOPT_LINUX_L3_TOPLEVEL)
1557 if ( shadow_type == SH_type_l4_64_shadow &&
1558 unlikely(v->domain->arch.shadow.opt_flags & SHOPT_LINUX_L3_TOPLEVEL) )
1560 /* We're shadowing a new l4, but we've been assuming the guest uses
1561 * only one l4 per vcpu and context switches using an l4 entry.
1562 * Count the number of active l4 shadows. If there are enough
1563 * of them, decide that this isn't an old linux guest, and stop
1564 * pinning l3es. This is not very quick but it doesn't happen
1565 * very often. */
1566 struct list_head *l, *t;
1567 struct shadow_page_info *sp;
1568 struct vcpu *v2;
1569 int l4count = 0, vcpus = 0;
1570 list_for_each(l, &v->domain->arch.shadow.pinned_shadows)
1572 sp = list_entry(l, struct shadow_page_info, list);
1573 if ( sp->type == SH_type_l4_64_shadow )
1574 l4count++;
1576 for_each_vcpu ( v->domain, v2 )
1577 vcpus++;
1578 if ( l4count > 2 * vcpus )
1580 /* Unpin all the pinned l3 tables, and don't pin any more. */
1581 list_for_each_safe(l, t, &v->domain->arch.shadow.pinned_shadows)
1583 sp = list_entry(l, struct shadow_page_info, list);
1584 if ( sp->type == SH_type_l3_64_shadow )
1585 sh_unpin(v, shadow_page_to_mfn(sp));
1587 v->domain->arch.shadow.opt_flags &= ~SHOPT_LINUX_L3_TOPLEVEL;
1590 #endif
1591 #endif
1593 // Create the Xen mappings...
1594 if ( !shadow_mode_external(v->domain) )
1596 switch (shadow_type)
1598 #if CONFIG_PAGING_LEVELS == 4 && GUEST_PAGING_LEVELS == 4
1599 case SH_type_l4_shadow:
1600 sh_install_xen_entries_in_l4(v, gmfn, smfn); break;
1601 #endif
1602 #if CONFIG_PAGING_LEVELS == 3 && GUEST_PAGING_LEVELS == 3
1603 case SH_type_l2h_shadow:
1604 sh_install_xen_entries_in_l2h(v, smfn); break;
1605 #endif
1606 #if CONFIG_PAGING_LEVELS == 2 && GUEST_PAGING_LEVELS == 2
1607 case SH_type_l2_shadow:
1608 sh_install_xen_entries_in_l2(v, gmfn, smfn); break;
1609 #endif
1610 default: /* Do nothing */ break;
1614 shadow_promote(v, gmfn, shadow_type);
1615 set_shadow_status(v, gmfn, shadow_type, smfn);
1617 return smfn;
1620 /* Make a splintered superpage shadow */
1621 static mfn_t
1622 make_fl1_shadow(struct vcpu *v, gfn_t gfn)
1624 mfn_t smfn = shadow_alloc(v->domain, SH_type_fl1_shadow,
1625 (unsigned long) gfn_x(gfn));
1627 SHADOW_DEBUG(MAKE_SHADOW, "(%" SH_PRI_gfn ")=>%" SH_PRI_mfn "\n",
1628 gfn_x(gfn), mfn_x(smfn));
1630 set_fl1_shadow_status(v, gfn, smfn);
1631 return smfn;
1635 #if SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS
1636 mfn_t
1637 sh_make_monitor_table(struct vcpu *v)
1640 ASSERT(pagetable_get_pfn(v->arch.monitor_table) == 0);
1642 #if CONFIG_PAGING_LEVELS == 4
1644 struct domain *d = v->domain;
1645 mfn_t m4mfn;
1646 m4mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1647 sh_install_xen_entries_in_l4(v, m4mfn, m4mfn);
1648 /* Remember the level of this table */
1649 mfn_to_page(m4mfn)->shadow_flags = 4;
1650 #if SHADOW_PAGING_LEVELS < 4
1651 // Install a monitor l3 table in slot 0 of the l4 table.
1652 // This is used for shadow linear maps.
1654 mfn_t m3mfn;
1655 l4_pgentry_t *l4e;
1656 m3mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1657 mfn_to_page(m3mfn)->shadow_flags = 3;
1658 l4e = sh_map_domain_page(m4mfn);
1659 l4e[0] = l4e_from_pfn(mfn_x(m3mfn), __PAGE_HYPERVISOR);
1660 sh_unmap_domain_page(l4e);
1662 #endif /* SHADOW_PAGING_LEVELS < 4 */
1663 return m4mfn;
1666 #elif CONFIG_PAGING_LEVELS == 3
1669 struct domain *d = v->domain;
1670 mfn_t m3mfn, m2mfn;
1671 l3_pgentry_t *l3e;
1672 l2_pgentry_t *l2e;
1673 int i;
1675 m3mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1676 /* Remember the level of this table */
1677 mfn_to_page(m3mfn)->shadow_flags = 3;
1679 // Install a monitor l2 table in slot 3 of the l3 table.
1680 // This is used for all Xen entries, including linear maps
1681 m2mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1682 mfn_to_page(m2mfn)->shadow_flags = 2;
1683 l3e = sh_map_domain_page(m3mfn);
1684 l3e[3] = l3e_from_pfn(mfn_x(m2mfn), _PAGE_PRESENT);
1685 sh_install_xen_entries_in_l2h(v, m2mfn);
1686 /* Install the monitor's own linear map */
1687 l2e = sh_map_domain_page(m2mfn);
1688 for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
1689 l2e[l2_table_offset(LINEAR_PT_VIRT_START) + i] =
1690 (l3e_get_flags(l3e[i]) & _PAGE_PRESENT)
1691 ? l2e_from_pfn(l3e_get_pfn(l3e[i]), __PAGE_HYPERVISOR)
1692 : l2e_empty();
1693 sh_unmap_domain_page(l2e);
1694 sh_unmap_domain_page(l3e);
1696 SHADOW_PRINTK("new monitor table: %#lx\n", mfn_x(m3mfn));
1697 return m3mfn;
1700 #elif CONFIG_PAGING_LEVELS == 2
1703 struct domain *d = v->domain;
1704 mfn_t m2mfn;
1705 m2mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1706 sh_install_xen_entries_in_l2(v, m2mfn, m2mfn);
1707 /* Remember the level of this table */
1708 mfn_to_page(m2mfn)->shadow_flags = 2;
1709 return m2mfn;
1712 #else
1713 #error this should not happen
1714 #endif /* CONFIG_PAGING_LEVELS */
1716 #endif /* SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS */
1718 /**************************************************************************/
1719 /* These functions also take a virtual address and return the level-N
1720 * shadow table mfn and entry, but they create the shadow pagetables if
1721 * they are needed. The "demand" argument is non-zero when handling
1722 * a demand fault (so we know what to do about accessed bits &c).
1723 * If the necessary tables are not present in the guest, they return NULL. */
1725 /* N.B. The use of GUEST_PAGING_LEVELS here is correct. If the shadow has
1726 * more levels than the guest, the upper levels are always fixed and do not
1727 * reflect any information from the guest, so we do not use these functions
1728 * to access them. */
1730 #if GUEST_PAGING_LEVELS >= 4
1731 static shadow_l4e_t * shadow_get_and_create_l4e(struct vcpu *v,
1732 walk_t *gw,
1733 mfn_t *sl4mfn)
1735 /* There is always a shadow of the top level table. Get it. */
1736 *sl4mfn = pagetable_get_mfn(v->arch.shadow_table[0]);
1737 /* Reading the top level table is always valid. */
1738 return sh_linear_l4_table(v) + shadow_l4_linear_offset(gw->va);
1741 static shadow_l3e_t * shadow_get_and_create_l3e(struct vcpu *v,
1742 walk_t *gw,
1743 mfn_t *sl3mfn,
1744 fetch_type_t ft)
1746 mfn_t sl4mfn;
1747 shadow_l4e_t *sl4e;
1748 if ( !valid_mfn(gw->l3mfn) ) return NULL; /* No guest page. */
1749 /* Get the l4e */
1750 sl4e = shadow_get_and_create_l4e(v, gw, &sl4mfn);
1751 ASSERT(sl4e != NULL);
1752 if ( shadow_l4e_get_flags(*sl4e) & _PAGE_PRESENT )
1754 *sl3mfn = shadow_l4e_get_mfn(*sl4e);
1755 ASSERT(valid_mfn(*sl3mfn));
1757 else
1759 int r;
1760 shadow_l4e_t new_sl4e;
1761 /* No l3 shadow installed: find and install it. */
1762 *sl3mfn = get_shadow_status(v, gw->l3mfn, SH_type_l3_shadow);
1763 if ( !valid_mfn(*sl3mfn) )
1765 /* No l3 shadow of this page exists at all: make one. */
1766 *sl3mfn = sh_make_shadow(v, gw->l3mfn, SH_type_l3_shadow);
1768 /* Install the new sl3 table in the sl4e */
1769 l4e_propagate_from_guest(v, gw->l4e, gw->l4mfn,
1770 *sl3mfn, &new_sl4e, ft);
1771 r = shadow_set_l4e(v, sl4e, new_sl4e, sl4mfn);
1772 ASSERT((r & SHADOW_SET_FLUSH) == 0);
1773 if ( r & SHADOW_SET_ERROR )
1774 return NULL;
1776 /* Now follow it down a level. Guaranteed to succeed. */
1777 return sh_linear_l3_table(v) + shadow_l3_linear_offset(gw->va);
1779 #endif /* GUEST_PAGING_LEVELS >= 4 */
1782 static shadow_l2e_t * shadow_get_and_create_l2e(struct vcpu *v,
1783 walk_t *gw,
1784 mfn_t *sl2mfn,
1785 fetch_type_t ft)
1787 #if GUEST_PAGING_LEVELS >= 4 /* 64bit... */
1788 mfn_t sl3mfn = _mfn(INVALID_MFN);
1789 shadow_l3e_t *sl3e;
1790 if ( !valid_mfn(gw->l2mfn) ) return NULL; /* No guest page. */
1791 /* Get the l3e */
1792 sl3e = shadow_get_and_create_l3e(v, gw, &sl3mfn, ft);
1793 if ( sl3e == NULL ) return NULL;
1794 if ( shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT )
1796 *sl2mfn = shadow_l3e_get_mfn(*sl3e);
1797 ASSERT(valid_mfn(*sl2mfn));
1799 else
1801 int r;
1802 shadow_l3e_t new_sl3e;
1803 /* No l2 shadow installed: find and install it. */
1804 *sl2mfn = get_shadow_status(v, gw->l2mfn, SH_type_l2_shadow);
1805 if ( !valid_mfn(*sl2mfn) )
1807 /* No l2 shadow of this page exists at all: make one. */
1808 *sl2mfn = sh_make_shadow(v, gw->l2mfn, SH_type_l2_shadow);
1810 /* Install the new sl2 table in the sl3e */
1811 l3e_propagate_from_guest(v, gw->l3e, gw->l3mfn,
1812 *sl2mfn, &new_sl3e, ft);
1813 r = shadow_set_l3e(v, sl3e, new_sl3e, sl3mfn);
1814 ASSERT((r & SHADOW_SET_FLUSH) == 0);
1815 if ( r & SHADOW_SET_ERROR )
1816 return NULL;
1818 /* Now follow it down a level. Guaranteed to succeed. */
1819 return sh_linear_l2_table(v) + shadow_l2_linear_offset(gw->va);
1820 #elif GUEST_PAGING_LEVELS == 3 /* PAE... */
1821 /* We never demand-shadow PAE l3es: they are only created in
1822 * sh_update_cr3(). Check if the relevant sl3e is present. */
1823 shadow_l3e_t *sl3e = ((shadow_l3e_t *)&v->arch.shadow.l3table)
1824 + shadow_l3_linear_offset(gw->va);
1825 if ( !(shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT) )
1826 return NULL;
1827 *sl2mfn = shadow_l3e_get_mfn(*sl3e);
1828 ASSERT(valid_mfn(*sl2mfn));
1829 return sh_linear_l2_table(v) + shadow_l2_linear_offset(gw->va);
1830 #else /* 32bit... */
1831 /* There is always a shadow of the top level table. Get it. */
1832 *sl2mfn = pagetable_get_mfn(v->arch.shadow_table[0]);
1833 /* This next line is important: the guest l2 has a 16k
1834 * shadow, we need to return the right mfn of the four. This
1835 * call will set it for us as a side-effect. */
1836 (void) shadow_l2_index(sl2mfn, guest_index(gw->l2e));
1837 /* Reading the top level table is always valid. */
1838 return sh_linear_l2_table(v) + shadow_l2_linear_offset(gw->va);
1839 #endif
1843 static shadow_l1e_t * shadow_get_and_create_l1e(struct vcpu *v,
1844 walk_t *gw,
1845 mfn_t *sl1mfn,
1846 fetch_type_t ft)
1848 mfn_t sl2mfn;
1849 shadow_l2e_t *sl2e;
1851 /* Get the l2e */
1852 sl2e = shadow_get_and_create_l2e(v, gw, &sl2mfn, ft);
1853 if ( sl2e == NULL ) return NULL;
1854 /* Install the sl1 in the l2e if it wasn't there or if we need to
1855 * re-do it to fix a PSE dirty bit. */
1856 if ( shadow_l2e_get_flags(*sl2e) & _PAGE_PRESENT
1857 && likely(ft != ft_demand_write
1858 || (guest_l2e_get_flags(*gw->l2e) & _PAGE_DIRTY)
1859 || !(guest_l2e_get_flags(*gw->l2e) & _PAGE_PSE)) )
1861 *sl1mfn = shadow_l2e_get_mfn(*sl2e);
1862 ASSERT(valid_mfn(*sl1mfn));
1864 else
1866 shadow_l2e_t new_sl2e;
1867 int r, flags = guest_l2e_get_flags(*gw->l2e);
1868 /* No l1 shadow installed: find and install it. */
1869 if ( !(flags & _PAGE_PRESENT) )
1870 return NULL; /* No guest page. */
1871 if ( guest_supports_superpages(v) && (flags & _PAGE_PSE) )
1873 /* Splintering a superpage */
1874 gfn_t l2gfn = guest_l2e_get_gfn(*gw->l2e);
1875 *sl1mfn = get_fl1_shadow_status(v, l2gfn);
1876 if ( !valid_mfn(*sl1mfn) )
1878 /* No fl1 shadow of this superpage exists at all: make one. */
1879 *sl1mfn = make_fl1_shadow(v, l2gfn);
1882 else
1884 /* Shadowing an actual guest l1 table */
1885 if ( !valid_mfn(gw->l2mfn) ) return NULL; /* No guest page. */
1886 *sl1mfn = get_shadow_status(v, gw->l1mfn, SH_type_l1_shadow);
1887 if ( !valid_mfn(*sl1mfn) )
1889 /* No l1 shadow of this page exists at all: make one. */
1890 *sl1mfn = sh_make_shadow(v, gw->l1mfn, SH_type_l1_shadow);
1893 /* Install the new sl1 table in the sl2e */
1894 l2e_propagate_from_guest(v, gw->l2e, gw->l2mfn,
1895 *sl1mfn, &new_sl2e, ft);
1896 r = shadow_set_l2e(v, sl2e, new_sl2e, sl2mfn);
1897 ASSERT((r & SHADOW_SET_FLUSH) == 0);
1898 if ( r & SHADOW_SET_ERROR )
1899 return NULL;
1900 /* This next line is important: in 32-on-PAE and 32-on-64 modes,
1901 * the guest l1 table has an 8k shadow, and we need to return
1902 * the right mfn of the pair. This call will set it for us as a
1903 * side-effect. (In all other cases, it's a no-op and will be
1904 * compiled out.) */
1905 (void) shadow_l1_index(sl1mfn, guest_l1_table_offset(gw->va));
1907 /* Now follow it down a level. Guaranteed to succeed. */
1908 return sh_linear_l1_table(v) + shadow_l1_linear_offset(gw->va);
1913 /**************************************************************************/
1914 /* Destructors for shadow tables:
1915 * Unregister the shadow, decrement refcounts of any entries present in it,
1916 * and release the memory.
1918 * N.B. These destructors do not clear the contents of the shadows.
1919 * This allows us to delay TLB shootdowns until the page is being reused.
1920 * See shadow_alloc() and shadow_free() for how this is handled.
1921 */
1923 #if GUEST_PAGING_LEVELS >= 4
1924 void sh_destroy_l4_shadow(struct vcpu *v, mfn_t smfn)
1926 shadow_l4e_t *sl4e;
1927 u32 t = mfn_to_shadow_page(smfn)->type;
1928 mfn_t gmfn, sl4mfn;
1929 int xen_mappings;
1931 SHADOW_DEBUG(DESTROY_SHADOW,
1932 "%s(%05lx)\n", __func__, mfn_x(smfn));
1933 ASSERT(t == SH_type_l4_shadow);
1935 /* Record that the guest page isn't shadowed any more (in this type) */
1936 gmfn = _mfn(mfn_to_shadow_page(smfn)->backpointer);
1937 delete_shadow_status(v, gmfn, t, smfn);
1938 shadow_demote(v, gmfn, t);
1939 /* Decrement refcounts of all the old entries */
1940 xen_mappings = (!shadow_mode_external(v->domain));
1941 sl4mfn = smfn;
1942 SHADOW_FOREACH_L4E(sl4mfn, sl4e, 0, 0, xen_mappings, {
1943 if ( shadow_l4e_get_flags(*sl4e) & _PAGE_PRESENT )
1945 sh_put_ref(v, shadow_l4e_get_mfn(*sl4e),
1946 (((paddr_t)mfn_x(sl4mfn)) << PAGE_SHIFT)
1947 | ((unsigned long)sl4e & ~PAGE_MASK));
1949 });
1951 /* Put the memory back in the pool */
1952 shadow_free(v->domain, smfn);
1955 void sh_destroy_l3_shadow(struct vcpu *v, mfn_t smfn)
1957 shadow_l3e_t *sl3e;
1958 u32 t = mfn_to_shadow_page(smfn)->type;
1959 mfn_t gmfn, sl3mfn;
1961 SHADOW_DEBUG(DESTROY_SHADOW,
1962 "%s(%05lx)\n", __func__, mfn_x(smfn));
1963 ASSERT(t == SH_type_l3_shadow);
1965 /* Record that the guest page isn't shadowed any more (in this type) */
1966 gmfn = _mfn(mfn_to_shadow_page(smfn)->backpointer);
1967 delete_shadow_status(v, gmfn, t, smfn);
1968 shadow_demote(v, gmfn, t);
1970 /* Decrement refcounts of all the old entries */
1971 sl3mfn = smfn;
1972 SHADOW_FOREACH_L3E(sl3mfn, sl3e, 0, 0, {
1973 if ( shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT )
1974 sh_put_ref(v, shadow_l3e_get_mfn(*sl3e),
1975 (((paddr_t)mfn_x(sl3mfn)) << PAGE_SHIFT)
1976 | ((unsigned long)sl3e & ~PAGE_MASK));
1977 });
1979 /* Put the memory back in the pool */
1980 shadow_free(v->domain, smfn);
1982 #endif /* GUEST_PAGING_LEVELS >= 4 */
1985 void sh_destroy_l2_shadow(struct vcpu *v, mfn_t smfn)
1987 shadow_l2e_t *sl2e;
1988 u32 t = mfn_to_shadow_page(smfn)->type;
1989 mfn_t gmfn, sl2mfn;
1990 int xen_mappings;
1992 SHADOW_DEBUG(DESTROY_SHADOW,
1993 "%s(%05lx)\n", __func__, mfn_x(smfn));
1994 ASSERT(t == SH_type_l2_shadow
1995 || t == SH_type_l2h_pae_shadow);
1997 /* Record that the guest page isn't shadowed any more (in this type) */
1998 gmfn = _mfn(mfn_to_shadow_page(smfn)->backpointer);
1999 delete_shadow_status(v, gmfn, t, smfn);
2000 shadow_demote(v, gmfn, t);
2002 /* Decrement refcounts of all the old entries */
2003 sl2mfn = smfn;
2004 xen_mappings = (!shadow_mode_external(v->domain) &&
2005 ((GUEST_PAGING_LEVELS == 2) ||
2006 ((GUEST_PAGING_LEVELS == 3) &&
2007 (t == SH_type_l2h_pae_shadow))));
2008 SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, 0, xen_mappings, {
2009 if ( shadow_l2e_get_flags(*sl2e) & _PAGE_PRESENT )
2010 sh_put_ref(v, shadow_l2e_get_mfn(*sl2e),
2011 (((paddr_t)mfn_x(sl2mfn)) << PAGE_SHIFT)
2012 | ((unsigned long)sl2e & ~PAGE_MASK));
2013 });
2015 /* Put the memory back in the pool */
2016 shadow_free(v->domain, smfn);
2019 void sh_destroy_l1_shadow(struct vcpu *v, mfn_t smfn)
2021 struct domain *d = v->domain;
2022 shadow_l1e_t *sl1e;
2023 u32 t = mfn_to_shadow_page(smfn)->type;
2025 SHADOW_DEBUG(DESTROY_SHADOW,
2026 "%s(%05lx)\n", __func__, mfn_x(smfn));
2027 ASSERT(t == SH_type_l1_shadow || t == SH_type_fl1_shadow);
2029 /* Record that the guest page isn't shadowed any more (in this type) */
2030 if ( t == SH_type_fl1_shadow )
2032 gfn_t gfn = _gfn(mfn_to_shadow_page(smfn)->backpointer);
2033 delete_fl1_shadow_status(v, gfn, smfn);
2035 else
2037 mfn_t gmfn = _mfn(mfn_to_shadow_page(smfn)->backpointer);
2038 delete_shadow_status(v, gmfn, t, smfn);
2039 shadow_demote(v, gmfn, t);
2042 if ( shadow_mode_refcounts(d) )
2044 /* Decrement refcounts of all the old entries */
2045 mfn_t sl1mfn = smfn;
2046 SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, 0, {
2047 if ( (shadow_l1e_get_flags(*sl1e) & _PAGE_PRESENT)
2048 && !sh_l1e_is_magic(*sl1e) )
2049 shadow_put_page_from_l1e(*sl1e, d);
2050 });
2053 /* Put the memory back in the pool */
2054 shadow_free(v->domain, smfn);
2057 #if SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS
2058 void sh_destroy_monitor_table(struct vcpu *v, mfn_t mmfn)
2060 struct domain *d = v->domain;
2061 ASSERT(mfn_to_shadow_page(mmfn)->type == SH_type_monitor_table);
2063 #if (CONFIG_PAGING_LEVELS == 4) && (SHADOW_PAGING_LEVELS != 4)
2064 /* Need to destroy the l3 monitor page in slot 0 too */
2066 l4_pgentry_t *l4e = sh_map_domain_page(mmfn);
2067 ASSERT(l4e_get_flags(l4e[0]) & _PAGE_PRESENT);
2068 shadow_free(d, _mfn(l4e_get_pfn(l4e[0])));
2069 sh_unmap_domain_page(l4e);
2071 #elif CONFIG_PAGING_LEVELS == 3
2072 /* Need to destroy the l2 monitor page in slot 4 too */
2074 l3_pgentry_t *l3e = sh_map_domain_page(mmfn);
2075 ASSERT(l3e_get_flags(l3e[3]) & _PAGE_PRESENT);
2076 shadow_free(d, _mfn(l3e_get_pfn(l3e[3])));
2077 sh_unmap_domain_page(l3e);
2079 #endif
2081 /* Put the memory back in the pool */
2082 shadow_free(d, mmfn);
2084 #endif
2086 /**************************************************************************/
2087 /* Functions to destroy non-Xen mappings in a pagetable hierarchy.
2088 * These are called from common code when we are running out of shadow
2089 * memory, and unpinning all the top-level shadows hasn't worked.
2091 * This implementation is pretty crude and slow, but we hope that it won't
2092 * be called very often. */
2094 #if GUEST_PAGING_LEVELS == 2
2096 void sh_unhook_32b_mappings(struct vcpu *v, mfn_t sl2mfn)
2098 shadow_l2e_t *sl2e;
2099 int xen_mappings = !shadow_mode_external(v->domain);
2100 SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, 0, xen_mappings, {
2101 (void) shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn);
2102 });
2105 #elif GUEST_PAGING_LEVELS == 3
2107 void sh_unhook_pae_mappings(struct vcpu *v, mfn_t sl2mfn)
2108 /* Walk a PAE l2 shadow, unhooking entries from all the subshadows */
2110 shadow_l2e_t *sl2e;
2111 int xen_mappings = !shadow_mode_external(v->domain);
2112 SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, 0, xen_mappings, {
2113 (void) shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn);
2114 });
2117 #elif GUEST_PAGING_LEVELS == 4
2119 void sh_unhook_64b_mappings(struct vcpu *v, mfn_t sl4mfn)
2121 shadow_l4e_t *sl4e;
2122 int xen_mappings = !shadow_mode_external(v->domain);
2123 SHADOW_FOREACH_L4E(sl4mfn, sl4e, 0, 0, xen_mappings, {
2124 (void) shadow_set_l4e(v, sl4e, shadow_l4e_empty(), sl4mfn);
2125 });
2128 #endif
2130 /**************************************************************************/
2131 /* Internal translation functions.
2132 * These functions require a pointer to the shadow entry that will be updated.
2133 */
2135 /* These functions take a new guest entry, translate it to shadow and write
2136 * the shadow entry.
2138 * They return the same bitmaps as the shadow_set_lXe() functions.
2139 */
2141 #if GUEST_PAGING_LEVELS >= 4
2142 static int validate_gl4e(struct vcpu *v, void *new_ge, mfn_t sl4mfn, void *se)
2144 shadow_l4e_t new_sl4e;
2145 guest_l4e_t *new_gl4e = new_ge;
2146 shadow_l4e_t *sl4p = se;
2147 mfn_t sl3mfn = _mfn(INVALID_MFN);
2148 int result = 0;
2150 perfc_incrc(shadow_validate_gl4e_calls);
2152 if ( guest_l4e_get_flags(*new_gl4e) & _PAGE_PRESENT )
2154 gfn_t gl3gfn = guest_l4e_get_gfn(*new_gl4e);
2155 mfn_t gl3mfn = vcpu_gfn_to_mfn(v, gl3gfn);
2156 if ( valid_mfn(gl3mfn) )
2157 sl3mfn = get_shadow_status(v, gl3mfn, SH_type_l3_shadow);
2158 else
2159 result |= SHADOW_SET_ERROR;
2161 l4e_propagate_from_guest(v, new_gl4e, _mfn(INVALID_MFN),
2162 sl3mfn, &new_sl4e, ft_prefetch);
2164 // check for updates to xen reserved slots
2165 if ( !shadow_mode_external(v->domain) )
2167 int shadow_index = (((unsigned long)sl4p & ~PAGE_MASK) /
2168 sizeof(shadow_l4e_t));
2169 int reserved_xen_slot = !is_guest_l4_slot(shadow_index);
2171 if ( unlikely(reserved_xen_slot) )
2173 // attempt by the guest to write to a xen reserved slot
2174 //
2175 SHADOW_PRINTK("%s out-of-range update "
2176 "sl4mfn=%05lx index=0x%x val=%" SH_PRI_pte "\n",
2177 __func__, mfn_x(sl4mfn), shadow_index, new_sl4e.l4);
2178 if ( shadow_l4e_get_flags(new_sl4e) & _PAGE_PRESENT )
2180 SHADOW_ERROR("out-of-range l4e update\n");
2181 result |= SHADOW_SET_ERROR;
2184 // do not call shadow_set_l4e...
2185 return result;
2189 result |= shadow_set_l4e(v, sl4p, new_sl4e, sl4mfn);
2190 return result;
2194 static int validate_gl3e(struct vcpu *v, void *new_ge, mfn_t sl3mfn, void *se)
2196 shadow_l3e_t new_sl3e;
2197 guest_l3e_t *new_gl3e = new_ge;
2198 shadow_l3e_t *sl3p = se;
2199 mfn_t sl2mfn = _mfn(INVALID_MFN);
2200 int result = 0;
2202 perfc_incrc(shadow_validate_gl3e_calls);
2204 if ( guest_l3e_get_flags(*new_gl3e) & _PAGE_PRESENT )
2206 gfn_t gl2gfn = guest_l3e_get_gfn(*new_gl3e);
2207 mfn_t gl2mfn = vcpu_gfn_to_mfn(v, gl2gfn);
2208 if ( valid_mfn(gl2mfn) )
2209 sl2mfn = get_shadow_status(v, gl2mfn, SH_type_l2_shadow);
2210 else
2211 result |= SHADOW_SET_ERROR;
2213 l3e_propagate_from_guest(v, new_gl3e, _mfn(INVALID_MFN),
2214 sl2mfn, &new_sl3e, ft_prefetch);
2215 result |= shadow_set_l3e(v, sl3p, new_sl3e, sl3mfn);
2217 return result;
2219 #endif // GUEST_PAGING_LEVELS >= 4
2221 static int validate_gl2e(struct vcpu *v, void *new_ge, mfn_t sl2mfn, void *se)
2223 shadow_l2e_t new_sl2e;
2224 guest_l2e_t *new_gl2e = new_ge;
2225 shadow_l2e_t *sl2p = se;
2226 mfn_t sl1mfn = _mfn(INVALID_MFN);
2227 int result = 0;
2229 perfc_incrc(shadow_validate_gl2e_calls);
2231 if ( guest_l2e_get_flags(*new_gl2e) & _PAGE_PRESENT )
2233 gfn_t gl1gfn = guest_l2e_get_gfn(*new_gl2e);
2234 if ( guest_supports_superpages(v) &&
2235 (guest_l2e_get_flags(*new_gl2e) & _PAGE_PSE) )
2237 // superpage -- need to look up the shadow L1 which holds the
2238 // splitters...
2239 sl1mfn = get_fl1_shadow_status(v, gl1gfn);
2240 #if 0
2241 // XXX - it's possible that we want to do some kind of prefetch
2242 // for superpage fl1's here, but this is *not* on the demand path,
2243 // so we'll hold off trying that for now...
2244 //
2245 if ( !valid_mfn(sl1mfn) )
2246 sl1mfn = make_fl1_shadow(v, gl1gfn);
2247 #endif
2249 else
2251 mfn_t gl1mfn = vcpu_gfn_to_mfn(v, gl1gfn);
2252 if ( valid_mfn(gl1mfn) )
2253 sl1mfn = get_shadow_status(v, gl1mfn, SH_type_l1_shadow);
2254 else
2255 result |= SHADOW_SET_ERROR;
2258 l2e_propagate_from_guest(v, new_gl2e, _mfn(INVALID_MFN),
2259 sl1mfn, &new_sl2e, ft_prefetch);
2261 // check for updates to xen reserved slots in PV guests...
2262 // XXX -- need to revisit this for PV 3-on-4 guests.
2263 //
2264 #if SHADOW_PAGING_LEVELS < 4
2265 #if CONFIG_PAGING_LEVELS == SHADOW_PAGING_LEVELS
2266 if ( !shadow_mode_external(v->domain) )
2268 int shadow_index = (((unsigned long)sl2p & ~PAGE_MASK) /
2269 sizeof(shadow_l2e_t));
2270 int reserved_xen_slot;
2272 #if SHADOW_PAGING_LEVELS == 3
2273 reserved_xen_slot =
2274 ((mfn_to_shadow_page(sl2mfn)->type == SH_type_l2h_pae_shadow) &&
2275 (shadow_index
2276 >= (L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1))));
2277 #else /* SHADOW_PAGING_LEVELS == 2 */
2278 reserved_xen_slot = (shadow_index >= L2_PAGETABLE_FIRST_XEN_SLOT);
2279 #endif
2281 if ( unlikely(reserved_xen_slot) )
2283 // attempt by the guest to write to a xen reserved slot
2284 //
2285 SHADOW_PRINTK("%s out-of-range update "
2286 "sl2mfn=%05lx index=0x%x val=%" SH_PRI_pte "\n",
2287 __func__, mfn_x(sl2mfn), shadow_index, new_sl2e.l2);
2288 if ( shadow_l2e_get_flags(new_sl2e) & _PAGE_PRESENT )
2290 SHADOW_ERROR("out-of-range l2e update\n");
2291 result |= SHADOW_SET_ERROR;
2294 // do not call shadow_set_l2e...
2295 return result;
2298 #endif /* CONFIG_PAGING_LEVELS == SHADOW_PAGING_LEVELS */
2299 #endif /* SHADOW_PAGING_LEVELS < 4 */
2301 result |= shadow_set_l2e(v, sl2p, new_sl2e, sl2mfn);
2303 return result;
2306 static int validate_gl1e(struct vcpu *v, void *new_ge, mfn_t sl1mfn, void *se)
2308 shadow_l1e_t new_sl1e;
2309 guest_l1e_t *new_gl1e = new_ge;
2310 shadow_l1e_t *sl1p = se;
2311 gfn_t gfn;
2312 mfn_t gmfn;
2313 int result = 0, mmio;
2315 perfc_incrc(shadow_validate_gl1e_calls);
2317 gfn = guest_l1e_get_gfn(*new_gl1e);
2318 gmfn = vcpu_gfn_to_mfn(v, gfn);
2320 mmio = (is_hvm_vcpu(v) && shadow_vcpu_mode_translate(v) && !valid_mfn(gmfn));
2321 l1e_propagate_from_guest(v, new_gl1e, _mfn(INVALID_MFN), gmfn, &new_sl1e,
2322 ft_prefetch, mmio);
2324 result |= shadow_set_l1e(v, sl1p, new_sl1e, sl1mfn);
2325 return result;
2329 /**************************************************************************/
2330 /* Functions which translate and install the shadows of arbitrary guest
2331 * entries that we have just seen the guest write. */
2334 static inline int
2335 sh_map_and_validate(struct vcpu *v, mfn_t gmfn,
2336 void *new_gp, u32 size, u32 sh_type,
2337 u32 (*shadow_index)(mfn_t *smfn, u32 idx),
2338 int (*validate_ge)(struct vcpu *v, void *ge,
2339 mfn_t smfn, void *se))
2340 /* Generic function for mapping and validating. */
2342 mfn_t smfn, smfn2, map_mfn;
2343 shadow_l1e_t *sl1p;
2344 u32 shadow_idx, guest_idx;
2345 int result = 0;
2347 /* Align address and size to guest entry boundaries */
2348 size += (unsigned long)new_gp & (sizeof (guest_l1e_t) - 1);
2349 new_gp = (void *)((unsigned long)new_gp & ~(sizeof (guest_l1e_t) - 1));
2350 size = (size + sizeof (guest_l1e_t) - 1) & ~(sizeof (guest_l1e_t) - 1);
2351 ASSERT(size + (((unsigned long)new_gp) & ~PAGE_MASK) <= PAGE_SIZE);
2353 /* Map the shadow page */
2354 smfn = get_shadow_status(v, gmfn, sh_type);
2355 ASSERT(valid_mfn(smfn)); /* Otherwise we would not have been called */
2356 guest_idx = guest_index(new_gp);
2357 map_mfn = smfn;
2358 shadow_idx = shadow_index(&map_mfn, guest_idx);
2359 sl1p = map_shadow_page(map_mfn);
2361 /* Validate one entry at a time */
2362 while ( size )
2364 smfn2 = smfn;
2365 guest_idx = guest_index(new_gp);
2366 shadow_idx = shadow_index(&smfn2, guest_idx);
2367 if ( mfn_x(smfn2) != mfn_x(map_mfn) )
2369 /* We have moved to another page of the shadow */
2370 map_mfn = smfn2;
2371 unmap_shadow_page(sl1p);
2372 sl1p = map_shadow_page(map_mfn);
2374 result |= validate_ge(v,
2375 new_gp,
2376 map_mfn,
2377 &sl1p[shadow_idx]);
2378 size -= sizeof(guest_l1e_t);
2379 new_gp += sizeof(guest_l1e_t);
2381 unmap_shadow_page(sl1p);
2382 return result;
2386 int
2387 sh_map_and_validate_gl4e(struct vcpu *v, mfn_t gl4mfn,
2388 void *new_gl4p, u32 size)
2390 #if GUEST_PAGING_LEVELS >= 4
2391 return sh_map_and_validate(v, gl4mfn, new_gl4p, size,
2392 SH_type_l4_shadow,
2393 shadow_l4_index,
2394 validate_gl4e);
2395 #else // ! GUEST_PAGING_LEVELS >= 4
2396 SHADOW_PRINTK("called in wrong paging mode!\n");
2397 BUG();
2398 return 0;
2399 #endif
2402 int
2403 sh_map_and_validate_gl3e(struct vcpu *v, mfn_t gl3mfn,
2404 void *new_gl3p, u32 size)
2406 #if GUEST_PAGING_LEVELS >= 4
2407 return sh_map_and_validate(v, gl3mfn, new_gl3p, size,
2408 SH_type_l3_shadow,
2409 shadow_l3_index,
2410 validate_gl3e);
2411 #else // ! GUEST_PAGING_LEVELS >= 4
2412 SHADOW_PRINTK("called in wrong paging mode!\n");
2413 BUG();
2414 return 0;
2415 #endif
2418 int
2419 sh_map_and_validate_gl2e(struct vcpu *v, mfn_t gl2mfn,
2420 void *new_gl2p, u32 size)
2422 return sh_map_and_validate(v, gl2mfn, new_gl2p, size,
2423 SH_type_l2_shadow,
2424 shadow_l2_index,
2425 validate_gl2e);
2428 int
2429 sh_map_and_validate_gl2he(struct vcpu *v, mfn_t gl2mfn,
2430 void *new_gl2p, u32 size)
2432 #if GUEST_PAGING_LEVELS == 3
2433 return sh_map_and_validate(v, gl2mfn, new_gl2p, size,
2434 SH_type_l2h_shadow,
2435 shadow_l2_index,
2436 validate_gl2e);
2437 #else /* Non-PAE guests don't have different kinds of l2 table */
2438 SHADOW_PRINTK("called in wrong paging mode!\n");
2439 BUG();
2440 return 0;
2441 #endif
2444 int
2445 sh_map_and_validate_gl1e(struct vcpu *v, mfn_t gl1mfn,
2446 void *new_gl1p, u32 size)
2448 return sh_map_and_validate(v, gl1mfn, new_gl1p, size,
2449 SH_type_l1_shadow,
2450 shadow_l1_index,
2451 validate_gl1e);
2455 /**************************************************************************/
2456 /* Optimization: If we see two emulated writes of zeros to the same
2457 * page-table without another kind of page fault in between, we guess
2458 * that this is a batch of changes (for process destruction) and
2459 * unshadow the page so we don't take a pagefault on every entry. This
2460 * should also make finding writeable mappings of pagetables much
2461 * easier. */
2463 /* Look to see if this is the second emulated write in a row to this
2464 * page, and unshadow/unhook if it is */
2465 static inline void check_for_early_unshadow(struct vcpu *v, mfn_t gmfn)
2467 #if SHADOW_OPTIMIZATIONS & SHOPT_EARLY_UNSHADOW
2468 if ( v->arch.shadow.last_emulated_mfn == mfn_x(gmfn) &&
2469 sh_mfn_is_a_page_table(gmfn) )
2471 u32 flags = mfn_to_page(gmfn)->shadow_flags;
2472 if ( !(flags & (SHF_L2_32|SHF_L2_PAE|SHF_L2H_PAE|SHF_L4_64)) )
2474 perfc_incrc(shadow_early_unshadow);
2475 sh_remove_shadows(v, gmfn, 1, 0 /* Fast, can fail to unshadow */ );
2478 v->arch.shadow.last_emulated_mfn = mfn_x(gmfn);
2479 #endif
2482 /* Stop counting towards early unshadows, as we've seen a real page fault */
2483 static inline void reset_early_unshadow(struct vcpu *v)
2485 #if SHADOW_OPTIMIZATIONS & SHOPT_EARLY_UNSHADOW
2486 v->arch.shadow.last_emulated_mfn = INVALID_MFN;
2487 #endif
2492 /**************************************************************************/
2493 /* Optimization: Prefetch multiple L1 entries. This is called after we have
2494 * demand-faulted a shadow l1e in the fault handler, to see if it's
2495 * worth fetching some more.
2496 */
2498 #if SHADOW_OPTIMIZATIONS & SHOPT_PREFETCH
2500 /* XXX magic number */
2501 #define PREFETCH_DISTANCE 32
2503 static void sh_prefetch(struct vcpu *v, walk_t *gw,
2504 shadow_l1e_t *ptr_sl1e, mfn_t sl1mfn)
2506 int i, dist, mmio;
2507 gfn_t gfn;
2508 mfn_t gmfn;
2509 guest_l1e_t gl1e;
2510 shadow_l1e_t sl1e;
2511 u32 gflags;
2513 /* Prefetch no further than the end of the _shadow_ l1 MFN */
2514 dist = (PAGE_SIZE - ((unsigned long)ptr_sl1e & ~PAGE_MASK)) / sizeof sl1e;
2515 /* And no more than a maximum fetches-per-fault */
2516 if ( dist > PREFETCH_DISTANCE )
2517 dist = PREFETCH_DISTANCE;
2519 for ( i = 1; i < dist ; i++ )
2521 /* No point in prefetching if there's already a shadow */
2522 if ( ptr_sl1e[i].l1 != 0 )
2523 break;
2525 if ( gw->l1e )
2527 /* Normal guest page; grab the next guest entry */
2528 gl1e = gw->l1e[i];
2529 /* Not worth continuing if we hit an entry that will need another
2530 * fault for A/D-bit propagation anyway */
2531 gflags = guest_l1e_get_flags(gl1e);
2532 if ( (gflags & _PAGE_PRESENT)
2533 && (!(gflags & _PAGE_ACCESSED)
2534 || ((gflags & _PAGE_RW) && !(gflags & _PAGE_DIRTY))) )
2535 break;
2537 else
2539 /* Fragmented superpage, unless we've been called wrongly */
2540 ASSERT(guest_l2e_get_flags(*gw->l2e) & _PAGE_PSE);
2541 /* Increment the l1e's GFN by the right number of guest pages */
2542 gl1e = guest_l1e_from_gfn(
2543 _gfn(gfn_x(guest_l1e_get_gfn(gw->eff_l1e)) + i),
2544 guest_l1e_get_flags(gw->eff_l1e));
2547 /* Look at the gfn that the l1e is pointing at */
2548 gfn = guest_l1e_get_gfn(gl1e);
2549 gmfn = vcpu_gfn_to_mfn(v, gfn);
2550 mmio = ( is_hvm_vcpu(v)
2551 && shadow_vcpu_mode_translate(v)
2552 && mmio_space(gfn_to_paddr(gfn)) );
2554 /* Propagate the entry. Safe to use a pointer to our local
2555 * gl1e, since this is not a demand-fetch so there will be no
2556 * write-back to the guest. */
2557 l1e_propagate_from_guest(v, &gl1e, _mfn(INVALID_MFN),
2558 gmfn, &sl1e, ft_prefetch, mmio);
2559 (void) shadow_set_l1e(v, ptr_sl1e + i, sl1e, sl1mfn);
2563 #endif /* SHADOW_OPTIMIZATIONS & SHOPT_PREFETCH */
2566 /**************************************************************************/
2567 /* Entry points into the shadow code */
2569 /* Called from pagefault handler in Xen, and from the HVM trap handlers
2570 * for pagefaults. Returns 1 if this fault was an artefact of the
2571 * shadow code (and the guest should retry) or 0 if it is not (and the
2572 * fault should be handled elsewhere or passed to the guest). */
2574 static int sh_page_fault(struct vcpu *v,
2575 unsigned long va,
2576 struct cpu_user_regs *regs)
2578 struct domain *d = v->domain;
2579 walk_t gw;
2580 u32 accumulated_gflags;
2581 gfn_t gfn;
2582 mfn_t gmfn, sl1mfn=_mfn(0);
2583 shadow_l1e_t sl1e, *ptr_sl1e;
2584 paddr_t gpa;
2585 struct cpu_user_regs emul_regs;
2586 struct x86_emulate_ctxt emul_ctxt;
2587 int r, mmio;
2588 fetch_type_t ft = 0;
2590 SHADOW_PRINTK("d:v=%u:%u va=%#lx err=%u\n",
2591 v->domain->domain_id, v->vcpu_id, va, regs->error_code);
2593 //
2594 // XXX: Need to think about eventually mapping superpages directly in the
2595 // shadow (when possible), as opposed to splintering them into a
2596 // bunch of 4K maps.
2597 //
2599 #if (SHADOW_OPTIMIZATIONS & SHOPT_FAST_FAULT_PATH) && SHADOW_PAGING_LEVELS > 2
2600 if ( (regs->error_code & PFEC_reserved_bit) )
2602 /* The only reasons for reserved bits to be set in shadow entries
2603 * are the two "magic" shadow_l1e entries. */
2604 if ( likely((__copy_from_user(&sl1e,
2605 (sh_linear_l1_table(v)
2606 + shadow_l1_linear_offset(va)),
2607 sizeof(sl1e)) == 0)
2608 && sh_l1e_is_magic(sl1e)) )
2610 if ( sh_l1e_is_gnp(sl1e) )
2612 if ( likely(!is_hvm_domain(d) ||
2613 shadow_vcpu_mode_translate(v)) )
2615 /* Not-present in a guest PT: pass to the guest as
2616 * a not-present fault (by flipping two bits). */
2617 ASSERT(regs->error_code & PFEC_page_present);
2618 regs->error_code ^= (PFEC_reserved_bit|PFEC_page_present);
2619 perfc_incrc(shadow_fault_fast_gnp);
2620 SHADOW_PRINTK("fast path not-present\n");
2621 return 0;
2623 else
2625 /* Not-present in the P2M: MMIO */
2626 gpa = va;
2629 else
2631 /* Magic MMIO marker: extract gfn for MMIO address */
2632 ASSERT(sh_l1e_is_mmio(sl1e));
2633 gpa = (((paddr_t)(gfn_x(sh_l1e_mmio_get_gfn(sl1e))))
2634 << PAGE_SHIFT)
2635 | (va & ~PAGE_MASK);
2637 perfc_incrc(shadow_fault_fast_mmio);
2638 SHADOW_PRINTK("fast path mmio %#"PRIpaddr"\n", gpa);
2639 reset_early_unshadow(v);
2640 handle_mmio(gpa);
2641 return EXCRET_fault_fixed;
2643 else
2645 /* This should be exceptionally rare: another vcpu has fixed
2646 * the tables between the fault and our reading the l1e.
2647 * Fall through to the normal fault handing logic */
2648 perfc_incrc(shadow_fault_fast_fail);
2649 SHADOW_PRINTK("fast path false alarm!\n");
2650 /* Don't pass the reserved-bit bit: if we look at the fault
2651 * below and decide to pass it to the guest, the reserved-bit
2652 * bit won't make sense there. */
2653 regs->error_code &= ~PFEC_reserved_bit;
2656 #endif /* SHOPT_FAST_FAULT_PATH */
2658 shadow_lock(d);
2660 shadow_audit_tables(v);
2662 if ( guest_walk_tables(v, va, &gw, 1) != 0 )
2664 SHADOW_PRINTK("malformed guest pagetable!");
2665 print_gw(&gw);
2668 sh_audit_gw(v, &gw);
2670 // We do not look at the gw->l1e, as that will not exist for superpages.
2671 // Instead, we use the gw->eff_l1e...
2672 //
2673 // We need not check all the levels of the guest page table entries for
2674 // present vs not-present, as the eff_l1e will always be not present if
2675 // one of the higher level entries is not present.
2676 //
2677 if ( unlikely(!(guest_l1e_get_flags(gw.eff_l1e) & _PAGE_PRESENT)) )
2679 if ( is_hvm_domain(d) && !shadow_vcpu_mode_translate(v) )
2681 /* Not present in p2m map, means this is mmio */
2682 gpa = va;
2683 goto mmio;
2686 perfc_incrc(shadow_fault_bail_not_present);
2687 goto not_a_shadow_fault;
2690 // All levels of the guest page table are now known to be present.
2691 accumulated_gflags = accumulate_guest_flags(v, &gw);
2693 // Check for attempts to access supervisor-only pages from user mode,
2694 // i.e. ring 3. Such errors are not caused or dealt with by the shadow
2695 // code.
2696 //
2697 if ( (regs->error_code & PFEC_user_mode) &&
2698 !(accumulated_gflags & _PAGE_USER) )
2700 /* illegal user-mode access to supervisor-only page */
2701 perfc_incrc(shadow_fault_bail_user_supervisor);
2702 goto not_a_shadow_fault;
2705 // Was it a write fault?
2706 ft = ((regs->error_code & PFEC_write_access)
2707 ? ft_demand_write : ft_demand_read);
2708 if ( ft == ft_demand_write )
2710 if ( unlikely(!(accumulated_gflags & _PAGE_RW)) )
2712 perfc_incrc(shadow_fault_bail_ro_mapping);
2713 goto not_a_shadow_fault;
2716 else // must have been either an insn fetch or read fault
2718 // Check for NX bit violations: attempts to execute code that is
2719 // marked "do not execute". Such errors are not caused or dealt with
2720 // by the shadow code.
2721 //
2722 if ( regs->error_code & PFEC_insn_fetch )
2724 if ( accumulated_gflags & _PAGE_NX_BIT )
2726 /* NX prevented this code fetch */
2727 perfc_incrc(shadow_fault_bail_nx);
2728 goto not_a_shadow_fault;
2733 /* What mfn is the guest trying to access? */
2734 gfn = guest_l1e_get_gfn(gw.eff_l1e);
2735 gmfn = vcpu_gfn_to_mfn(v, gfn);
2736 mmio = (is_hvm_domain(d)
2737 && shadow_vcpu_mode_translate(v)
2738 && mmio_space(gfn_to_paddr(gfn)));
2740 if ( !mmio && !valid_mfn(gmfn) )
2742 perfc_incrc(shadow_fault_bail_bad_gfn);
2743 SHADOW_PRINTK("BAD gfn=%"SH_PRI_gfn" gmfn=%"SH_PRI_mfn"\n",
2744 gfn_x(gfn), mfn_x(gmfn));
2745 goto not_a_shadow_fault;
2748 /* Make sure there is enough free shadow memory to build a chain of
2749 * shadow tables: one SHADOW_MAX_ORDER chunk will always be enough
2750 * to allocate all we need. (We never allocate a top-level shadow
2751 * on this path, only a 32b l1, pae l2+1 or 64b l3+2+1) */
2752 shadow_prealloc(d, SHADOW_MAX_ORDER);
2754 /* Acquire the shadow. This must happen before we figure out the rights
2755 * for the shadow entry, since we might promote a page here. */
2756 ptr_sl1e = shadow_get_and_create_l1e(v, &gw, &sl1mfn, ft);
2757 if ( unlikely(ptr_sl1e == NULL) )
2759 /* Couldn't get the sl1e! Since we know the guest entries
2760 * are OK, this can only have been caused by a failed
2761 * shadow_set_l*e(), which will have crashed the guest.
2762 * Get out of the fault handler immediately. */
2763 ASSERT(test_bit(_DOMF_dying, &d->domain_flags));
2764 shadow_unlock(d);
2765 return 0;
2768 /* Calculate the shadow entry and write it */
2769 l1e_propagate_from_guest(v, (gw.l1e) ? gw.l1e : &gw.eff_l1e, gw.l1mfn,
2770 gmfn, &sl1e, ft, mmio);
2771 r = shadow_set_l1e(v, ptr_sl1e, sl1e, sl1mfn);
2773 #if SHADOW_OPTIMIZATIONS & SHOPT_PREFETCH
2774 /* Prefetch some more shadow entries */
2775 sh_prefetch(v, &gw, ptr_sl1e, sl1mfn);
2776 #endif
2778 /* Need to emulate accesses to page tables */
2779 if ( sh_mfn_is_a_page_table(gmfn) )
2781 if ( ft == ft_demand_write )
2783 perfc_incrc(shadow_fault_emulate_write);
2784 goto emulate;
2786 else if ( shadow_mode_trap_reads(d) && ft == ft_demand_read )
2788 perfc_incrc(shadow_fault_emulate_read);
2789 goto emulate;
2793 if ( mmio )
2795 gpa = guest_walk_to_gpa(&gw);
2796 goto mmio;
2799 perfc_incrc(shadow_fault_fixed);
2800 d->arch.shadow.fault_count++;
2801 reset_early_unshadow(v);
2803 done:
2804 sh_audit_gw(v, &gw);
2805 unmap_walk(v, &gw);
2806 SHADOW_PRINTK("fixed\n");
2807 shadow_audit_tables(v);
2808 shadow_unlock(d);
2809 return EXCRET_fault_fixed;
2811 emulate:
2812 /* Take the register set we were called with */
2813 emul_regs = *regs;
2814 if ( is_hvm_domain(d) )
2816 /* Add the guest's segment selectors, rip, rsp. rflags */
2817 hvm_store_cpu_guest_regs(v, &emul_regs, NULL);
2819 emul_ctxt.regs = &emul_regs;
2820 emul_ctxt.cr2 = va;
2821 emul_ctxt.mode = (is_hvm_domain(d) ?
2822 hvm_guest_x86_mode(v) : X86EMUL_MODE_HOST);
2824 SHADOW_PRINTK("emulate: eip=%#lx\n", emul_regs.eip);
2826 v->arch.shadow.propagate_fault = 0;
2828 /*
2829 * We do not emulate user writes. Instead we use them as a hint that the
2830 * page is no longer a page table. This behaviour differs from native, but
2831 * it seems very unlikely that any OS grants user access to page tables.
2832 * We also disallow guest PTE updates from within Xen.
2833 */
2834 if ( (regs->error_code & PFEC_user_mode) || !guest_mode(regs) ||
2835 x86_emulate_memop(&emul_ctxt, &shadow_emulator_ops) )
2837 SHADOW_PRINTK("emulator failure, unshadowing mfn %#lx\n",
2838 mfn_x(gmfn));
2839 perfc_incrc(shadow_fault_emulate_failed);
2840 /* If this is actually a page table, then we have a bug, and need
2841 * to support more operations in the emulator. More likely,
2842 * though, this is a hint that this page should not be shadowed. */
2843 shadow_remove_all_shadows(v, gmfn);
2844 /* This means that actual missing operations will cause the
2845 * guest to loop on the same page fault. */
2846 goto done;
2849 /* Emulation triggered another page fault? */
2850 if ( v->arch.shadow.propagate_fault )
2851 goto not_a_shadow_fault;
2853 /* Emulator has changed the user registers: write back */
2854 if ( is_hvm_domain(d) )
2856 /* Write back the guest's segment selectors, rip, rsp. rflags */
2857 hvm_load_cpu_guest_regs(v, &emul_regs);
2858 /* And don't overwrite those in the caller's regs. */
2859 emul_regs.eip = regs->eip;
2860 emul_regs.cs = regs->cs;
2861 emul_regs.eflags = regs->eflags;
2862 emul_regs.esp = regs->esp;
2863 emul_regs.ss = regs->ss;
2864 emul_regs.es = regs->es;
2865 emul_regs.ds = regs->ds;
2866 emul_regs.fs = regs->fs;
2867 emul_regs.gs = regs->gs;
2869 *regs = emul_regs;
2871 goto done;
2873 mmio:
2874 if ( !guest_mode(regs) )
2875 goto not_a_shadow_fault;
2876 perfc_incrc(shadow_fault_mmio);
2877 sh_audit_gw(v, &gw);
2878 unmap_walk(v, &gw);
2879 SHADOW_PRINTK("mmio %#"PRIpaddr"\n", gpa);
2880 shadow_audit_tables(v);
2881 reset_early_unshadow(v);
2882 shadow_unlock(d);
2883 handle_mmio(gpa);
2884 return EXCRET_fault_fixed;
2886 not_a_shadow_fault:
2887 sh_audit_gw(v, &gw);
2888 unmap_walk(v, &gw);
2889 SHADOW_PRINTK("not a shadow fault\n");
2890 shadow_audit_tables(v);
2891 reset_early_unshadow(v);
2892 shadow_unlock(d);
2893 return 0;
2897 static int
2898 sh_invlpg(struct vcpu *v, unsigned long va)
2899 /* Called when the guest requests an invlpg. Returns 1 if the invlpg
2900 * instruction should be issued on the hardware, or 0 if it's safe not
2901 * to do so. */
2903 shadow_l2e_t sl2e;
2905 perfc_incrc(shadow_invlpg);
2907 /* First check that we can safely read the shadow l2e. SMP/PAE linux can
2908 * run as high as 6% of invlpg calls where we haven't shadowed the l2
2909 * yet. */
2910 #if SHADOW_PAGING_LEVELS == 4
2912 shadow_l3e_t sl3e;
2913 if ( !(shadow_l4e_get_flags(
2914 sh_linear_l4_table(v)[shadow_l4_linear_offset(va)])
2915 & _PAGE_PRESENT) )
2916 return 0;
2917 /* This must still be a copy-from-user because we don't have the
2918 * shadow lock, and the higher-level shadows might disappear
2919 * under our feet. */
2920 if ( __copy_from_user(&sl3e, (sh_linear_l3_table(v)
2921 + shadow_l3_linear_offset(va)),
2922 sizeof (sl3e)) != 0 )
2924 perfc_incrc(shadow_invlpg_fault);
2925 return 0;
2927 if ( (!shadow_l3e_get_flags(sl3e) & _PAGE_PRESENT) )
2928 return 0;
2930 #elif SHADOW_PAGING_LEVELS == 3
2931 if ( !(l3e_get_flags(v->arch.shadow.l3table[shadow_l3_linear_offset(va)])
2932 & _PAGE_PRESENT) )
2933 // no need to flush anything if there's no SL2...
2934 return 0;
2935 #endif
2937 /* This must still be a copy-from-user because we don't have the shadow
2938 * lock, and the higher-level shadows might disappear under our feet. */
2939 if ( __copy_from_user(&sl2e,
2940 sh_linear_l2_table(v) + shadow_l2_linear_offset(va),
2941 sizeof (sl2e)) != 0 )
2943 perfc_incrc(shadow_invlpg_fault);
2944 return 0;
2947 // If there's nothing shadowed for this particular sl2e, then
2948 // there is no need to do an invlpg, either...
2949 //
2950 if ( !(shadow_l2e_get_flags(sl2e) & _PAGE_PRESENT) )
2951 return 0;
2953 // Check to see if the SL2 is a splintered superpage...
2954 // If so, then we'll need to flush the entire TLB (because that's
2955 // easier than invalidating all of the individual 4K pages).
2956 //
2957 if ( mfn_to_shadow_page(shadow_l2e_get_mfn(sl2e))->type
2958 == SH_type_fl1_shadow )
2960 local_flush_tlb();
2961 return 0;
2964 return 1;
2967 static unsigned long
2968 sh_gva_to_gfn(struct vcpu *v, unsigned long va)
2969 /* Called to translate a guest virtual address to what the *guest*
2970 * pagetables would map it to. */
2972 walk_t gw;
2973 gfn_t gfn;
2975 guest_walk_tables(v, va, &gw, 0);
2976 gfn = guest_walk_to_gfn(&gw);
2977 unmap_walk(v, &gw);
2979 return gfn_x(gfn);
2983 static paddr_t
2984 sh_gva_to_gpa(struct vcpu *v, unsigned long va)
2985 /* Called to translate a guest virtual address to what the *guest*
2986 * pagetables would map it to. */
2988 unsigned long gfn = sh_gva_to_gfn(v, va);
2989 if ( gfn == INVALID_GFN )
2990 return 0;
2991 else
2992 return (((paddr_t)gfn) << PAGE_SHIFT) + (va & ~PAGE_MASK);
2996 static inline void
2997 sh_update_linear_entries(struct vcpu *v)
2998 /* Sync up all the linear mappings for this vcpu's pagetables */
3000 struct domain *d = v->domain;
3002 /* Linear pagetables in PV guests
3003 * ------------------------------
3005 * Guest linear pagetables, which map the guest pages, are at
3006 * LINEAR_PT_VIRT_START. Shadow linear pagetables, which map the
3007 * shadows, are at SH_LINEAR_PT_VIRT_START. Most of the time these
3008 * are set up at shadow creation time, but (of course!) the PAE case
3009 * is subtler. Normal linear mappings are made by having an entry
3010 * in the top-level table that points to itself (shadow linear) or
3011 * to the guest top-level table (guest linear). For PAE, to set up
3012 * a linear map requires us to copy the four top-level entries into
3013 * level-2 entries. That means that every time we change a PAE l3e,
3014 * we need to reflect the change into the copy.
3016 * Linear pagetables in HVM guests
3017 * -------------------------------
3019 * For HVM guests, the linear pagetables are installed in the monitor
3020 * tables (since we can't put them in the shadow). Shadow linear
3021 * pagetables, which map the shadows, are at SH_LINEAR_PT_VIRT_START,
3022 * and we use the linear pagetable slot at LINEAR_PT_VIRT_START for
3023 * a linear pagetable of the monitor tables themselves. We have
3024 * the same issue of having to re-copy PAE l3 entries whevever we use
3025 * PAE shadows.
3027 * Because HVM guests run on the same monitor tables regardless of the
3028 * shadow tables in use, the linear mapping of the shadow tables has to
3029 * be updated every time v->arch.shadow_table changes.
3030 */
3032 /* Don't try to update the monitor table if it doesn't exist */
3033 if ( shadow_mode_external(d)
3034 && pagetable_get_pfn(v->arch.monitor_table) == 0 )
3035 return;
3037 #if (CONFIG_PAGING_LEVELS == 4) && (SHADOW_PAGING_LEVELS == 4)
3039 /* For PV, one l4e points at the guest l4, one points at the shadow
3040 * l4. No maintenance required.
3041 * For HVM, just need to update the l4e that points to the shadow l4. */
3043 if ( shadow_mode_external(d) )
3045 /* Use the linear map if we can; otherwise make a new mapping */
3046 if ( v == current )
3048 __linear_l4_table[l4_linear_offset(SH_LINEAR_PT_VIRT_START)] =
3049 l4e_from_pfn(pagetable_get_pfn(v->arch.shadow_table[0]),
3050 __PAGE_HYPERVISOR);
3052 else
3054 l4_pgentry_t *ml4e;
3055 ml4e = sh_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
3056 ml4e[l4_table_offset(SH_LINEAR_PT_VIRT_START)] =
3057 l4e_from_pfn(pagetable_get_pfn(v->arch.shadow_table[0]),
3058 __PAGE_HYPERVISOR);
3059 sh_unmap_domain_page(ml4e);
3063 #elif (CONFIG_PAGING_LEVELS == 4) && (SHADOW_PAGING_LEVELS == 3)
3065 /* This case only exists in HVM. To give ourselves a linear map of the
3066 * shadows, we need to extend a PAE shadow to 4 levels. We do this by
3067 * having a monitor l3 in slot 0 of the monitor l4 table, and
3068 * copying the PAE l3 entries into it. Then, by having the monitor l4e
3069 * for shadow pagetables also point to the monitor l4, we can use it
3070 * to access the shadows. */
3072 if ( shadow_mode_external(d) )
3074 /* Install copies of the shadow l3es into the monitor l3 table.
3075 * The monitor l3 table is hooked into slot 0 of the monitor
3076 * l4 table, so we use l3 linear indices 0 to 3 */
3077 shadow_l3e_t *sl3e;
3078 l3_pgentry_t *ml3e;
3079 mfn_t l3mfn;
3080 int i;
3082 /* Use linear mappings if we can; otherwise make new mappings */
3083 if ( v == current )
3085 ml3e = __linear_l3_table;
3086 l3mfn = _mfn(l4e_get_pfn(__linear_l4_table[0]));
3088 else
3090 l4_pgentry_t *ml4e;
3091 ml4e = sh_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
3092 ASSERT(l4e_get_flags(ml4e[0]) & _PAGE_PRESENT);
3093 l3mfn = _mfn(l4e_get_pfn(ml4e[0]));
3094 ml3e = sh_map_domain_page(l3mfn);
3095 sh_unmap_domain_page(ml4e);
3098 /* Shadow l3 tables are made up by update_cr3 */
3099 sl3e = v->arch.shadow.l3table;
3101 for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
3103 ml3e[i] =
3104 (shadow_l3e_get_flags(sl3e[i]) & _PAGE_PRESENT)
3105 ? l3e_from_pfn(mfn_x(shadow_l3e_get_mfn(sl3e[i])),
3106 __PAGE_HYPERVISOR)
3107 : l3e_empty();
3110 if ( v != current )
3111 sh_unmap_domain_page(ml3e);
3114 #elif CONFIG_PAGING_LEVELS == 3
3116 /* PV: need to copy the guest's l3 entries into the guest-linear-map l2
3117 * entries in the shadow, and the shadow's l3 entries into the
3118 * shadow-linear-map l2 entries in the shadow. This is safe to do
3119 * because Xen does not let guests share high-slot l2 tables between l3s,
3120 * so we know we're not treading on anyone's toes.
3122 * HVM: need to copy the shadow's l3 entries into the
3123 * shadow-linear-map l2 entries in the monitor table. This is safe
3124 * because we have one monitor table for each vcpu. The monitor's
3125 * own l3es don't need to be copied because they never change.
3126 * XXX That might change if we start stuffing things into the rest
3127 * of the monitor's virtual address space.
3128 */
3130 l2_pgentry_t *l2e, new_l2e;
3131 shadow_l3e_t *guest_l3e = NULL, *shadow_l3e;
3132 int i;
3133 int unmap_l2e = 0;
3135 #if GUEST_PAGING_LEVELS == 2
3136 /* Shadow l3 tables were built by update_cr3 */
3137 if ( shadow_mode_external(d) )
3138 shadow_l3e = (shadow_l3e_t *)&v->arch.shadow.l3table;
3139 else
3140 BUG(); /* PV 2-on-3 is not supported yet */
3142 #else /* GUEST_PAGING_LEVELS == 3 */
3144 shadow_l3e = (shadow_l3e_t *)&v->arch.shadow.l3table;
3145 /* Always safe to use guest_vtable, because it's globally mapped */
3146 guest_l3e = v->arch.guest_vtable;
3148 #endif /* GUEST_PAGING_LEVELS */
3150 /* Choose where to write the entries, using linear maps if possible */
3151 if ( shadow_mode_external(d) )
3153 if ( v == current )
3155 /* From the monitor tables, it's safe to use linear maps
3156 * to update monitor l2s */
3157 l2e = __linear_l2_table + (3 * L2_PAGETABLE_ENTRIES);
3159 else
3161 /* Map the monitor table's high l2 */
3162 l3_pgentry_t *l3e;
3163 l3e = sh_map_domain_page(
3164 pagetable_get_mfn(v->arch.monitor_table));
3165 ASSERT(l3e_get_flags(l3e[3]) & _PAGE_PRESENT);
3166 l2e = sh_map_domain_page(_mfn(l3e_get_pfn(l3e[3])));
3167 unmap_l2e = 1;
3168 sh_unmap_domain_page(l3e);
3171 else
3173 /* Map the shadow table's high l2 */
3174 ASSERT(shadow_l3e_get_flags(shadow_l3e[3]) & _PAGE_PRESENT);
3175 l2e = sh_map_domain_page(shadow_l3e_get_mfn(shadow_l3e[3]));
3176 unmap_l2e = 1;
3179 /* Write linear mapping of guest (only in PV, and only when
3180 * not translated). */
3181 if ( !shadow_mode_translate(d) )
3183 for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
3185 new_l2e =
3186 ((shadow_l3e_get_flags(guest_l3e[i]) & _PAGE_PRESENT)
3187 ? l2e_from_pfn(mfn_x(shadow_l3e_get_mfn(guest_l3e[i])),
3188 __PAGE_HYPERVISOR)
3189 : l2e_empty());
3190 safe_write_entry(
3191 &l2e[l2_table_offset(LINEAR_PT_VIRT_START) + i],
3192 &new_l2e);
3196 /* Write linear mapping of shadow. */
3197 for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
3199 new_l2e = (shadow_l3e_get_flags(shadow_l3e[i]) & _PAGE_PRESENT)
3200 ? l2e_from_pfn(mfn_x(shadow_l3e_get_mfn(shadow_l3e[i])),
3201 __PAGE_HYPERVISOR)
3202 : l2e_empty();
3203 safe_write_entry(
3204 &l2e[l2_table_offset(SH_LINEAR_PT_VIRT_START) + i],
3205 &new_l2e);
3208 if ( unmap_l2e )
3209 sh_unmap_domain_page(l2e);
3212 #elif CONFIG_PAGING_LEVELS == 2
3214 /* For PV, one l2e points at the guest l2, one points at the shadow
3215 * l2. No maintenance required.
3216 * For HVM, just need to update the l2e that points to the shadow l2. */
3218 if ( shadow_mode_external(d) )
3220 /* Use the linear map if we can; otherwise make a new mapping */
3221 if ( v == current )
3223 __linear_l2_table[l2_linear_offset(SH_LINEAR_PT_VIRT_START)] =
3224 l2e_from_pfn(pagetable_get_pfn(v->arch.shadow_table[0]),
3225 __PAGE_HYPERVISOR);
3227 else
3229 l2_pgentry_t *ml2e;
3230 ml2e = sh_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
3231 ml2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
3232 l2e_from_pfn(pagetable_get_pfn(v->arch.shadow_table[0]),
3233 __PAGE_HYPERVISOR);
3234 sh_unmap_domain_page(ml2e);
3238 #else
3239 #error this should not happen
3240 #endif
3244 /* Removes vcpu->arch.guest_vtable and vcpu->arch.shadow_table[].
3245 * Does all appropriate management/bookkeeping/refcounting/etc...
3246 */
3247 static void
3248 sh_detach_old_tables(struct vcpu *v)
3250 struct domain *d = v->domain;
3251 mfn_t smfn;
3252 int i = 0;
3254 ////
3255 //// vcpu->arch.guest_vtable
3256 ////
3257 if ( v->arch.guest_vtable )
3259 #if GUEST_PAGING_LEVELS == 4
3260 if ( shadow_mode_external(d) || shadow_mode_translate(d) )
3261 sh_unmap_domain_page_global(v->arch.guest_vtable);
3262 #elif GUEST_PAGING_LEVELS == 3
3263 if ( 1 || shadow_mode_external(d) || shadow_mode_translate(d) )
3264 sh_unmap_domain_page_global(v->arch.guest_vtable);
3265 #elif GUEST_PAGING_LEVELS == 2
3266 if ( shadow_mode_external(d) || shadow_mode_translate(d) )
3267 sh_unmap_domain_page_global(v->arch.guest_vtable);
3268 #endif
3269 v->arch.guest_vtable = NULL;
3272 ////
3273 //// vcpu->arch.shadow_table[]
3274 ////
3277 #if GUEST_PAGING_LEVELS == 3
3278 /* PAE guests have four shadow_table entries */
3279 for ( i = 0 ; i < 4 ; i++ )
3280 #endif
3282 smfn = pagetable_get_mfn(v->arch.shadow_table[i]);
3283 if ( mfn_x(smfn) )
3284 sh_put_ref(v, smfn, 0);
3285 v->arch.shadow_table[i] = pagetable_null();
3289 /* Set up the top-level shadow and install it in slot 'slot' of shadow_table */
3290 static void
3291 sh_set_toplevel_shadow(struct vcpu *v,
3292 int slot,
3293 mfn_t gmfn,
3294 unsigned int root_type)
3296 mfn_t smfn;
3297 struct domain *d = v->domain;
3299 /* Decrement the refcount of the old contents of this slot */
3300 smfn = pagetable_get_mfn(v->arch.shadow_table[slot]);
3301 if ( mfn_x(smfn) )
3302 sh_put_ref(v, smfn, 0);
3304 /* Now figure out the new contents: is this a valid guest MFN? */
3305 if ( !valid_mfn(gmfn) )
3307 SHADOW_PRINTK("%u/%u [%u] invalid gmfn\n",
3308 GUEST_PAGING_LEVELS, SHADOW_PAGING_LEVELS, slot);
3309 v->arch.shadow_table[slot] = pagetable_null();
3310 return;
3313 /* Guest mfn is valid: shadow it and install the shadow */
3314 smfn = get_shadow_status(v, gmfn, root_type);
3315 if ( !valid_mfn(smfn) )
3317 /* Make sure there's enough free shadow memory. */
3318 shadow_prealloc(d, SHADOW_MAX_ORDER);
3319 /* Shadow the page. */
3320 smfn = sh_make_shadow(v, gmfn, root_type);
3322 ASSERT(valid_mfn(smfn));
3324 #if SHADOW_OPTIMIZATIONS & SHOPT_EARLY_UNSHADOW
3325 /* Once again OK to unhook entries from this table if we see fork/exit */
3326 ASSERT(sh_mfn_is_a_page_table(gmfn));
3327 mfn_to_page(gmfn)->shadow_flags &= ~SHF_unhooked_mappings;
3328 #endif
3330 /* Pin the shadow and put it (back) on the list of top-level shadows */
3331 if ( sh_pin(v, smfn) == 0 )
3333 SHADOW_ERROR("can't pin %#lx as toplevel shadow\n", mfn_x(smfn));
3334 domain_crash(v->domain);
3337 /* Take a ref to this page: it will be released in sh_detach_old_tables()
3338 * or the next call to set_toplevel_shadow() */
3339 if ( !sh_get_ref(v, smfn, 0) )
3341 SHADOW_ERROR("can't install %#lx as toplevel shadow\n", mfn_x(smfn));
3342 domain_crash(v->domain);
3345 /* Done. Install it */
3346 SHADOW_PRINTK("%u/%u [%u] gmfn %#"SH_PRI_mfn" smfn %#"SH_PRI_mfn"\n",
3347 GUEST_PAGING_LEVELS, SHADOW_PAGING_LEVELS, slot,
3348 mfn_x(gmfn), mfn_x(smfn));
3349 v->arch.shadow_table[slot] = pagetable_from_mfn(smfn);
3353 static void
3354 sh_update_cr3(struct vcpu *v)
3355 /* Updates vcpu->arch.cr3 after the guest has changed CR3.
3356 * Paravirtual guests should set v->arch.guest_table (and guest_table_user,
3357 * if appropriate).
3358 * HVM guests should also make sure hvm_get_guest_cntl_reg(v, 3) works,
3359 * and read vcpu->arch.hvm_vcpu.hw_cr3 afterwards.
3360 */
3362 struct domain *d = v->domain;
3363 mfn_t gmfn;
3364 #if GUEST_PAGING_LEVELS == 3
3365 u32 guest_idx=0;
3366 #endif
3368 ASSERT(shadow_lock_is_acquired(v->domain));
3369 ASSERT(v->arch.shadow.mode);
3371 ////
3372 //// vcpu->arch.guest_table is already set
3373 ////
3375 #ifndef NDEBUG
3376 /* Double-check that the HVM code has sent us a sane guest_table */
3377 if ( is_hvm_domain(d) )
3379 gfn_t gfn;
3381 ASSERT(shadow_mode_external(d));
3383 // Is paging enabled on this vcpu?
3384 if ( shadow_vcpu_mode_translate(v) )
3386 gfn = _gfn(paddr_to_pfn(hvm_get_guest_ctrl_reg(v, 3)));
3387 gmfn = vcpu_gfn_to_mfn(v, gfn);
3388 ASSERT(valid_mfn(gmfn));
3389 ASSERT(pagetable_get_pfn(v->arch.guest_table) == mfn_x(gmfn));
3391 else
3393 /* Paging disabled: guest_table points at (part of) p2m */
3394 #if SHADOW_PAGING_LEVELS != 3 /* in 3-on-4, guest-table is in slot 0 of p2m */
3395 /* For everything else, they sould be the same */
3396 ASSERT(v->arch.guest_table.pfn == d->arch.phys_table.pfn);
3397 #endif
3400 #endif
3402 SHADOW_PRINTK("d=%u v=%u guest_table=%05lx\n",
3403 d->domain_id, v->vcpu_id,
3404 (unsigned long)pagetable_get_pfn(v->arch.guest_table));
3406 #if GUEST_PAGING_LEVELS == 4
3407 if ( !(v->arch.flags & TF_kernel_mode) )
3408 gmfn = pagetable_get_mfn(v->arch.guest_table_user);
3409 else
3410 #endif
3411 gmfn = pagetable_get_mfn(v->arch.guest_table);
3413 if ( !is_hvm_domain(d) && !test_bit(_VCPUF_initialised, &v->vcpu_flags) )
3415 ASSERT(v->arch.cr3 == 0);
3416 return;
3419 ////
3420 //// vcpu->arch.guest_vtable
3421 ////
3422 #if GUEST_PAGING_LEVELS == 4
3423 if ( shadow_mode_external(d) || shadow_mode_translate(d) )
3425 if ( v->arch.guest_vtable )
3426 sh_unmap_domain_page_global(v->arch.guest_vtable);
3427 v->arch.guest_vtable = sh_map_domain_page_global(gmfn);
3429 else
3430 v->arch.guest_vtable = __linear_l4_table;
3431 #elif GUEST_PAGING_LEVELS == 3
3432 if ( v->arch.guest_vtable )
3433 sh_unmap_domain_page_global(v->arch.guest_vtable);
3434 if ( shadow_mode_external(d) )
3436 if ( shadow_vcpu_mode_translate(v) )
3437 /* Paging enabled: find where in the page the l3 table is */
3438 guest_idx = guest_index((void *)hvm_get_guest_ctrl_reg(v, 3));
3439 else
3440 /* Paging disabled: l3 is at the start of a page (in the p2m) */
3441 guest_idx = 0;
3443 // Ignore the low 2 bits of guest_idx -- they are really just
3444 // cache control.
3445 guest_idx &= ~3;
3447 // XXX - why does this need a global map?
3448 v->arch.guest_vtable =
3449 (guest_l3e_t *)sh_map_domain_page_global(gmfn) + guest_idx;
3451 else
3452 v->arch.guest_vtable = sh_map_domain_page_global(gmfn);
3453 #elif GUEST_PAGING_LEVELS == 2
3454 if ( shadow_mode_external(d) || shadow_mode_translate(d) )
3456 if ( v->arch.guest_vtable )
3457 sh_unmap_domain_page_global(v->arch.guest_vtable);
3458 v->arch.guest_vtable = sh_map_domain_page_global(gmfn);
3460 else
3461 v->arch.guest_vtable = __linear_l2_table;
3462 #else
3463 #error this should never happen
3464 #endif
3466 #if 0
3467 printk("%s %s %d gmfn=%05lx guest_vtable=%p\n",
3468 __func__, __FILE__, __LINE__, gmfn, v->arch.guest_vtable);
3469 #endif
3471 ////
3472 //// vcpu->arch.shadow_table[]
3473 ////
3475 /* We revoke write access to the new guest toplevel page(s) before we
3476 * replace the old shadow pagetable(s), so that we can safely use the
3477 * (old) shadow linear maps in the writeable mapping heuristics. */
3478 #if GUEST_PAGING_LEVELS == 2
3479 if ( shadow_remove_write_access(v, gmfn, 2, 0) != 0 )
3480 flush_tlb_mask(v->domain->domain_dirty_cpumask);
3481 sh_set_toplevel_shadow(v, 0, gmfn, SH_type_l2_shadow);
3482 #elif GUEST_PAGING_LEVELS == 3
3483 /* PAE guests have four shadow_table entries, based on the
3484 * current values of the guest's four l3es. */
3486 int i, flush = 0;
3487 gfn_t gl2gfn;
3488 mfn_t gl2mfn;
3489 guest_l3e_t *gl3e = (guest_l3e_t*)v->arch.guest_vtable;
3490 /* First, make all four entries read-only. */
3491 for ( i = 0; i < 4; i++ )
3493 if ( guest_l3e_get_flags(gl3e[i]) & _PAGE_PRESENT )
3495 gl2gfn = guest_l3e_get_gfn(gl3e[i]);
3496 gl2mfn = vcpu_gfn_to_mfn(v, gl2gfn);
3497 flush |= shadow_remove_write_access(v, gl2mfn, 2, 0);
3500 if ( flush )
3501 flush_tlb_mask(v->domain->domain_dirty_cpumask);
3502 /* Now install the new shadows. */
3503 for ( i = 0; i < 4; i++ )
3505 if ( guest_l3e_get_flags(gl3e[i]) & _PAGE_PRESENT )
3507 gl2gfn = guest_l3e_get_gfn(gl3e[i]);
3508 gl2mfn = vcpu_gfn_to_mfn(v, gl2gfn);
3509 sh_set_toplevel_shadow(v, i, gl2mfn, (i == 3)
3510 ? SH_type_l2h_shadow
3511 : SH_type_l2_shadow);
3515 #elif GUEST_PAGING_LEVELS == 4
3516 if ( shadow_remove_write_access(v, gmfn, 4, 0) != 0 )
3517 flush_tlb_mask(v->domain->domain_dirty_cpumask);
3518 sh_set_toplevel_shadow(v, 0, gmfn, SH_type_l4_shadow);
3519 #else
3520 #error This should never happen
3521 #endif
3523 #if (CONFIG_PAGING_LEVELS == 3) && (GUEST_PAGING_LEVELS == 3)
3524 #endif
3526 ///
3527 /// v->arch.shadow.l3table
3528 ///
3529 #if SHADOW_PAGING_LEVELS == 3
3531 mfn_t smfn;
3532 int i;
3533 for ( i = 0; i < 4; i++ )
3535 #if GUEST_PAGING_LEVELS == 2
3536 /* 2-on-3: make a PAE l3 that points at the four-page l2 */
3537 smfn = _mfn(pagetable_get_pfn(v->arch.shadow_table[0]) + i);
3538 #else
3539 /* 3-on-3: make a PAE l3 that points at the four l2 pages */
3540 smfn = pagetable_get_mfn(v->arch.shadow_table[i]);
3541 #endif
3542 v->arch.shadow.l3table[i] =
3543 (mfn_x(smfn) == 0)
3544 ? shadow_l3e_empty()
3545 : shadow_l3e_from_mfn(smfn, _PAGE_PRESENT);
3548 #endif /* SHADOW_PAGING_LEVELS == 3 */
3551 ///
3552 /// v->arch.cr3
3553 ///
3554 if ( shadow_mode_external(d) )
3556 make_cr3(v, pagetable_get_pfn(v->arch.monitor_table));
3558 else // not shadow_mode_external...
3560 /* We don't support PV except guest == shadow == config levels */
3561 BUG_ON(GUEST_PAGING_LEVELS != SHADOW_PAGING_LEVELS);
3562 #if SHADOW_PAGING_LEVELS == 3
3563 /* 2-on-3 or 3-on-3: Use the PAE shadow l3 table we just fabricated.
3564 * Don't use make_cr3 because (a) we know it's below 4GB, and
3565 * (b) it's not necessarily page-aligned, and make_cr3 takes a pfn */
3566 ASSERT(virt_to_maddr(&v->arch.shadow.l3table) <= 0xffffffe0ULL);
3567 v->arch.cr3 = virt_to_maddr(&v->arch.shadow.l3table);
3568 #else
3569 /* 2-on-2 or 4-on-4: Just use the shadow top-level directly */
3570 make_cr3(v, pagetable_get_pfn(v->arch.shadow_table[0]));
3571 #endif
3575 ///
3576 /// v->arch.hvm_vcpu.hw_cr3
3577 ///
3578 if ( shadow_mode_external(d) )
3580 ASSERT(is_hvm_domain(d));
3581 #if SHADOW_PAGING_LEVELS == 3
3582 /* 2-on-3 or 3-on-3: Use the PAE shadow l3 table we just fabricated */
3583 v->arch.hvm_vcpu.hw_cr3 = virt_to_maddr(&v->arch.shadow.l3table);
3584 #else
3585 /* 2-on-2 or 4-on-4: Just use the shadow top-level directly */
3586 v->arch.hvm_vcpu.hw_cr3 = pagetable_get_paddr(v->arch.shadow_table[0]);
3587 #endif
3590 /* Fix up the linear pagetable mappings */
3591 sh_update_linear_entries(v);
3595 /**************************************************************************/
3596 /* Functions to revoke guest rights */
3598 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
3599 static int sh_guess_wrmap(struct vcpu *v, unsigned long vaddr, mfn_t gmfn)
3600 /* Look up this vaddr in the current shadow and see if it's a writeable
3601 * mapping of this gmfn. If so, remove it. Returns 1 if it worked. */
3603 shadow_l1e_t sl1e, *sl1p;
3604 shadow_l2e_t *sl2p;
3605 #if SHADOW_PAGING_LEVELS >= 3
3606 shadow_l3e_t *sl3p;
3607 #if SHADOW_PAGING_LEVELS >= 4
3608 shadow_l4e_t *sl4p;
3609 #endif
3610 #endif
3611 mfn_t sl1mfn;
3612 int r;
3614 /* Carefully look in the shadow linear map for the l1e we expect */
3615 #if SHADOW_PAGING_LEVELS >= 4
3616 sl4p = sh_linear_l4_table(v) + shadow_l4_linear_offset(vaddr);
3617 if ( !(shadow_l4e_get_flags(*sl4p) & _PAGE_PRESENT) )
3618 return 0;
3619 sl3p = sh_linear_l3_table(v) + shadow_l3_linear_offset(vaddr);
3620 if ( !(shadow_l3e_get_flags(*sl3p) & _PAGE_PRESENT) )
3621 return 0;
3622 #elif SHADOW_PAGING_LEVELS == 3
3623 sl3p = ((shadow_l3e_t *) v->arch.shadow.l3table)
3624 + shadow_l3_linear_offset(vaddr);
3625 if ( !(shadow_l3e_get_flags(*sl3p) & _PAGE_PRESENT) )
3626 return 0;
3627 #endif
3628 sl2p = sh_linear_l2_table(v) + shadow_l2_linear_offset(vaddr);
3629 if ( !(shadow_l2e_get_flags(*sl2p) & _PAGE_PRESENT) )
3630 return 0;
3631 sl1p = sh_linear_l1_table(v) + shadow_l1_linear_offset(vaddr);
3632 sl1e = *sl1p;
3633 if ( ((shadow_l1e_get_flags(sl1e) & (_PAGE_PRESENT|_PAGE_RW))
3634 != (_PAGE_PRESENT|_PAGE_RW))
3635 || (mfn_x(shadow_l1e_get_mfn(sl1e)) != mfn_x(gmfn)) )
3636 return 0;
3638 /* Found it! Need to remove its write permissions. */
3639 sl1mfn = shadow_l2e_get_mfn(*sl2p);
3640 sl1e = shadow_l1e_remove_flags(sl1e, _PAGE_RW);
3641 r = shadow_set_l1e(v, sl1p, sl1e, sl1mfn);
3642 ASSERT( !(r & SHADOW_SET_ERROR) );
3643 return 1;
3645 #endif
3647 int sh_remove_write_access(struct vcpu *v, mfn_t sl1mfn, mfn_t readonly_mfn)
3648 /* Excises all writeable mappings to readonly_mfn from this l1 shadow table */
3650 shadow_l1e_t *sl1e;
3651 int done = 0;
3652 int flags;
3653 mfn_t base_sl1mfn = sl1mfn; /* Because sl1mfn changes in the foreach */
3655 SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, done,
3657 flags = shadow_l1e_get_flags(*sl1e);
3658 if ( (flags & _PAGE_PRESENT)
3659 && (flags & _PAGE_RW)
3660 && (mfn_x(shadow_l1e_get_mfn(*sl1e)) == mfn_x(readonly_mfn)) )
3662 (void) shadow_set_l1e(v, sl1e, shadow_l1e_empty(), sl1mfn);
3663 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
3664 /* Remember the last shadow that we shot a writeable mapping in */
3665 v->arch.shadow.last_writeable_pte_smfn = mfn_x(base_sl1mfn);
3666 #endif
3667 if ( (mfn_to_page(readonly_mfn)->u.inuse.type_info
3668 & PGT_count_mask) == 0 )
3669 /* This breaks us cleanly out of the FOREACH macro */
3670 done = 1;
3672 });
3673 return done;
3677 int sh_remove_all_mappings(struct vcpu *v, mfn_t sl1mfn, mfn_t target_mfn)
3678 /* Excises all mappings to guest frame from this shadow l1 table */
3680 shadow_l1e_t *sl1e;
3681 int done = 0;
3682 int flags;
3684 SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, done,
3686 flags = shadow_l1e_get_flags(*sl1e);
3687 if ( (flags & _PAGE_PRESENT)
3688 && (mfn_x(shadow_l1e_get_mfn(*sl1e)) == mfn_x(target_mfn)) )
3690 (void) shadow_set_l1e(v, sl1e, shadow_l1e_empty(), sl1mfn);
3691 if ( (mfn_to_page(target_mfn)->count_info & PGC_count_mask) == 0 )
3692 /* This breaks us cleanly out of the FOREACH macro */
3693 done = 1;
3695 });
3696 return done;
3699 /**************************************************************************/
3700 /* Functions to excise all pointers to shadows from higher-level shadows. */
3702 void sh_clear_shadow_entry(struct vcpu *v, void *ep, mfn_t smfn)
3703 /* Blank out a single shadow entry */
3705 switch ( mfn_to_shadow_page(smfn)->type )
3707 case SH_type_l1_shadow:
3708 (void) shadow_set_l1e(v, ep, shadow_l1e_empty(), smfn); break;
3709 case SH_type_l2_shadow:
3710 #if GUEST_PAGING_LEVELS == 3
3711 case SH_type_l2h_shadow:
3712 #endif
3713 (void) shadow_set_l2e(v, ep, shadow_l2e_empty(), smfn); break;
3714 #if GUEST_PAGING_LEVELS >= 4
3715 case SH_type_l3_shadow:
3716 (void) shadow_set_l3e(v, ep, shadow_l3e_empty(), smfn); break;
3717 case SH_type_l4_shadow:
3718 (void) shadow_set_l4e(v, ep, shadow_l4e_empty(), smfn); break;
3719 #endif
3720 default: BUG(); /* Called with the wrong kind of shadow. */
3724 int sh_remove_l1_shadow(struct vcpu *v, mfn_t sl2mfn, mfn_t sl1mfn)
3725 /* Remove all mappings of this l1 shadow from this l2 shadow */
3727 shadow_l2e_t *sl2e;
3728 int done = 0;
3729 int flags;
3730 #if GUEST_PAGING_LEVELS != 4
3731 int xen_mappings = !shadow_mode_external(v->domain);
3732 #endif
3734 SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, done, xen_mappings,
3736 flags = shadow_l2e_get_flags(*sl2e);
3737 if ( (flags & _PAGE_PRESENT)
3738 && (mfn_x(shadow_l2e_get_mfn(*sl2e)) == mfn_x(sl1mfn)) )
3740 (void) shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn);
3741 if ( mfn_to_shadow_page(sl1mfn)->type == 0 )
3742 /* This breaks us cleanly out of the FOREACH macro */
3743 done = 1;
3745 });
3746 return done;
3749 #if GUEST_PAGING_LEVELS >= 4
3750 int sh_remove_l2_shadow(struct vcpu *v, mfn_t sl3mfn, mfn_t sl2mfn)
3751 /* Remove all mappings of this l2 shadow from this l3 shadow */
3753 shadow_l3e_t *sl3e;
3754 int done = 0;
3755 int flags;
3757 SHADOW_FOREACH_L3E(sl3mfn, sl3e, 0, done,
3759 flags = shadow_l3e_get_flags(*sl3e);
3760 if ( (flags & _PAGE_PRESENT)
3761 && (mfn_x(shadow_l3e_get_mfn(*sl3e)) == mfn_x(sl2mfn)) )
3763 (void) shadow_set_l3e(v, sl3e, shadow_l3e_empty(), sl3mfn);
3764 if ( mfn_to_shadow_page(sl2mfn)->type == 0 )
3765 /* This breaks us cleanly out of the FOREACH macro */
3766 done = 1;
3768 });
3769 return done;
3772 int sh_remove_l3_shadow(struct vcpu *v, mfn_t sl4mfn, mfn_t sl3mfn)
3773 /* Remove all mappings of this l3 shadow from this l4 shadow */
3775 shadow_l4e_t *sl4e;
3776 int done = 0;
3777 int flags, xen_mappings = !shadow_mode_external(v->domain);
3779 SHADOW_FOREACH_L4E(sl4mfn, sl4e, 0, done, xen_mappings,
3781 flags = shadow_l4e_get_flags(*sl4e);
3782 if ( (flags & _PAGE_PRESENT)
3783 && (mfn_x(shadow_l4e_get_mfn(*sl4e)) == mfn_x(sl3mfn)) )
3785 (void) shadow_set_l4e(v, sl4e, shadow_l4e_empty(), sl4mfn);
3786 if ( mfn_to_shadow_page(sl3mfn)->type == 0 )
3787 /* This breaks us cleanly out of the FOREACH macro */
3788 done = 1;
3790 });
3791 return done;
3793 #endif /* 64bit guest */
3795 /**************************************************************************/
3796 /* Handling HVM guest writes to pagetables */
3798 /* Check that the user is allowed to perform this write.
3799 * Returns a mapped pointer to write to, and the mfn it's on,
3800 * or NULL for error. */
3801 static inline void * emulate_map_dest(struct vcpu *v,
3802 unsigned long vaddr,
3803 struct x86_emulate_ctxt *ctxt,
3804 mfn_t *mfnp)
3806 walk_t gw;
3807 u32 flags;
3808 gfn_t gfn;
3809 mfn_t mfn;
3811 guest_walk_tables(v, vaddr, &gw, 1);
3812 flags = accumulate_guest_flags(v, &gw);
3813 gfn = guest_l1e_get_gfn(gw.eff_l1e);
3814 mfn = vcpu_gfn_to_mfn(v, gfn);
3815 sh_audit_gw(v, &gw);
3816 unmap_walk(v, &gw);
3818 if ( !(flags & _PAGE_PRESENT)
3819 || !(flags & _PAGE_RW)
3820 || (!(flags & _PAGE_USER) && ring_3(ctxt->regs)) )
3822 /* This write would have faulted even on bare metal */
3823 v->arch.shadow.propagate_fault = 1;
3824 return NULL;
3827 if ( !valid_mfn(mfn) )
3829 /* Attempted a write to a bad gfn. This should never happen:
3830 * after all, we're here because this write is to a page table. */
3831 BUG();
3834 ASSERT(sh_mfn_is_a_page_table(mfn));
3835 *mfnp = mfn;
3836 return sh_map_domain_page(mfn) + (vaddr & ~PAGE_MASK);
3839 int
3840 sh_x86_emulate_write(struct vcpu *v, unsigned long vaddr, void *src,
3841 u32 bytes, struct x86_emulate_ctxt *ctxt)
3843 ASSERT(shadow_lock_is_acquired(v->domain));
3844 while ( bytes > 0 )
3846 mfn_t mfn;
3847 int bytes_on_page;
3848 void *addr;
3850 bytes_on_page = PAGE_SIZE - (vaddr & ~PAGE_MASK);
3851 if ( bytes_on_page > bytes )
3852 bytes_on_page = bytes;
3854 if ( (addr = emulate_map_dest(v, vaddr, ctxt, &mfn)) == NULL )
3855 return X86EMUL_PROPAGATE_FAULT;
3856 memcpy(addr, src, bytes_on_page);
3857 shadow_validate_guest_pt_write(v, mfn, addr, bytes_on_page);
3858 bytes -= bytes_on_page;
3859 /* If we are writing zeros to this page, might want to unshadow */
3860 if ( likely(bytes_on_page >= 4) && (*(u32 *)addr == 0) )
3861 check_for_early_unshadow(v, mfn);
3862 sh_unmap_domain_page(addr);
3864 shadow_audit_tables(v);
3865 return X86EMUL_CONTINUE;
3868 int
3869 sh_x86_emulate_cmpxchg(struct vcpu *v, unsigned long vaddr,
3870 unsigned long old, unsigned long new,
3871 unsigned int bytes, struct x86_emulate_ctxt *ctxt)
3873 mfn_t mfn;
3874 void *addr;
3875 unsigned long prev;
3876 int rv = X86EMUL_CONTINUE;
3878 ASSERT(shadow_lock_is_acquired(v->domain));
3879 ASSERT(bytes <= sizeof (unsigned long));
3881 if ( (addr = emulate_map_dest(v, vaddr, ctxt, &mfn)) == NULL )
3882 return X86EMUL_PROPAGATE_FAULT;
3884 switch (bytes)
3886 case 1: prev = cmpxchg(((u8 *)addr), old, new); break;
3887 case 2: prev = cmpxchg(((u16 *)addr), old, new); break;
3888 case 4: prev = cmpxchg(((u32 *)addr), old, new); break;
3889 case 8: prev = cmpxchg(((u64 *)addr), old, new); break;
3890 default:
3891 SHADOW_PRINTK("cmpxchg of size %i is not supported\n", bytes);
3892 prev = ~old;
3895 if ( (prev == old) )
3896 shadow_validate_guest_pt_write(v, mfn, addr, bytes);
3897 else
3898 rv = X86EMUL_CMPXCHG_FAILED;
3900 SHADOW_DEBUG(EMULATE, "va %#lx was %#lx expected %#lx"
3901 " wanted %#lx now %#lx bytes %u\n",
3902 vaddr, prev, old, new, *(unsigned long *)addr, bytes);
3904 /* If we are writing zeros to this page, might want to unshadow */
3905 if ( likely(bytes >= 4) && (*(u32 *)addr == 0) )
3906 check_for_early_unshadow(v, mfn);
3908 sh_unmap_domain_page(addr);
3909 shadow_audit_tables(v);
3910 return rv;
3913 int
3914 sh_x86_emulate_cmpxchg8b(struct vcpu *v, unsigned long vaddr,
3915 unsigned long old_lo, unsigned long old_hi,
3916 unsigned long new_lo, unsigned long new_hi,
3917 struct x86_emulate_ctxt *ctxt)
3919 mfn_t mfn;
3920 void *addr;
3921 u64 old, new, prev;
3922 int rv = X86EMUL_CONTINUE;
3924 ASSERT(shadow_lock_is_acquired(v->domain));
3926 if ( (addr = emulate_map_dest(v, vaddr, ctxt, &mfn)) == NULL )
3927 return X86EMUL_PROPAGATE_FAULT;
3929 old = (((u64) old_hi) << 32) | (u64) old_lo;
3930 new = (((u64) new_hi) << 32) | (u64) new_lo;
3931 prev = cmpxchg(((u64 *)addr), old, new);
3933 if ( (prev == old) )
3934 shadow_validate_guest_pt_write(v, mfn, addr, 8);
3935 else
3936 rv = X86EMUL_CMPXCHG_FAILED;
3938 /* If we are writing zeros to this page, might want to unshadow */
3939 if ( *(u32 *)addr == 0 )
3940 check_for_early_unshadow(v, mfn);
3942 sh_unmap_domain_page(addr);
3943 shadow_audit_tables(v);
3944 return rv;
3948 /**************************************************************************/
3949 /* Audit tools */
3951 #if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES
3953 #define AUDIT_FAIL(_level, _fmt, _a...) do { \
3954 printk("Shadow %u-on-%u audit failed at level %i, index %i\n" \
3955 "gl" #_level "mfn = %" SH_PRI_mfn \
3956 " sl" #_level "mfn = %" SH_PRI_mfn \
3957 " &gl" #_level "e = %p &sl" #_level "e = %p" \
3958 " gl" #_level "e = %" SH_PRI_gpte \
3959 " sl" #_level "e = %" SH_PRI_pte "\nError: " _fmt "\n", \
3960 GUEST_PAGING_LEVELS, SHADOW_PAGING_LEVELS, \
3961 _level, guest_index(gl ## _level ## e), \
3962 mfn_x(gl ## _level ## mfn), mfn_x(sl ## _level ## mfn), \
3963 gl ## _level ## e, sl ## _level ## e, \
3964 gl ## _level ## e->l ## _level, sl ## _level ## e->l ## _level, \
3965 ##_a); \
3966 BUG(); \
3967 done = 1; \
3968 } while (0)
3971 static char * sh_audit_flags(struct vcpu *v, int level,
3972 int gflags, int sflags)
3973 /* Common code for auditing flag bits */
3975 if ( (sflags & _PAGE_PRESENT) && !(gflags & _PAGE_PRESENT) )
3976 return "shadow is present but guest is not present";
3977 if ( (sflags & _PAGE_GLOBAL) && !is_hvm_vcpu(v) )
3978 return "global bit set in PV shadow";
3979 if ( level == 2 && (sflags & _PAGE_PSE) )
3980 return "PS bit set in shadow";
3981 #if SHADOW_PAGING_LEVELS == 3
3982 if ( level == 3 ) return NULL; /* All the other bits are blank in PAEl3 */
3983 #endif
3984 if ( (sflags & _PAGE_PRESENT) && !(gflags & _PAGE_ACCESSED) )
3985 return "accessed bit not propagated";
3986 if ( (level == 1 || (level == 2 && (gflags & _PAGE_PSE)))
3987 && ((sflags & _PAGE_RW) && !(gflags & _PAGE_DIRTY)) )
3988 return "dirty bit not propagated";
3989 if ( (sflags & _PAGE_USER) != (gflags & _PAGE_USER) )
3990 return "user/supervisor bit does not match";
3991 if ( (sflags & _PAGE_NX_BIT) != (gflags & _PAGE_NX_BIT) )
3992 return "NX bit does not match";
3993 if ( (sflags & _PAGE_RW) && !(gflags & _PAGE_RW) )
3994 return "shadow grants write access but guest does not";
3995 return NULL;
3998 static inline mfn_t
3999 audit_gfn_to_mfn(struct vcpu *v, gfn_t gfn, mfn_t gmfn)
4000 /* Convert this gfn to an mfn in the manner appropriate for the
4001 * guest pagetable it's used in (gmfn) */
4003 if ( !shadow_mode_translate(v->domain) )
4004 return _mfn(gfn_x(gfn));
4006 if ( (mfn_to_page(gmfn)->u.inuse.type_info & PGT_type_mask)
4007 != PGT_writable_page )
4008 return _mfn(gfn_x(gfn)); /* This is a paging-disabled shadow */
4009 else
4010 return sh_gfn_to_mfn(v->domain, gfn_x(gfn));
4014 int sh_audit_l1_table(struct vcpu *v, mfn_t sl1mfn, mfn_t x)
4016 guest_l1e_t *gl1e, *gp;
4017 shadow_l1e_t *sl1e;
4018 mfn_t mfn, gmfn, gl1mfn;
4019 gfn_t gfn;
4020 char *s;
4021 int done = 0;
4023 /* Follow the backpointer */
4024 gl1mfn = _mfn(mfn_to_shadow_page(sl1mfn)->backpointer);
4025 gl1e = gp = sh_map_domain_page(gl1mfn);
4026 SHADOW_FOREACH_L1E(sl1mfn, sl1e, &gl1e, done, {
4028 if ( sh_l1e_is_magic(*sl1e) )
4030 #if (SHADOW_OPTIMIZATIONS & SHOPT_FAST_FAULT_PATH) && SHADOW_PAGING_LEVELS > 2
4031 if ( sh_l1e_is_gnp(*sl1e) )
4033 if ( guest_l1e_get_flags(*gl1e) & _PAGE_PRESENT )
4034 AUDIT_FAIL(1, "shadow is GNP magic but guest is present");
4036 else
4038 ASSERT(sh_l1e_is_mmio(*sl1e));
4039 gfn = sh_l1e_mmio_get_gfn(*sl1e);
4040 if ( gfn_x(gfn) != gfn_x(guest_l1e_get_gfn(*gl1e)) )
4041 AUDIT_FAIL(1, "shadow MMIO gfn is %" SH_PRI_gfn
4042 " but guest gfn is %" SH_PRI_gfn,
4043 gfn_x(gfn),
4044 gfn_x(guest_l1e_get_gfn(*gl1e)));
4046 #endif
4048 else
4050 s = sh_audit_flags(v, 1, guest_l1e_get_flags(*gl1e),
4051 shadow_l1e_get_flags(*sl1e));
4052 if ( s ) AUDIT_FAIL(1, "%s", s);
4054 if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS )
4056 gfn = guest_l1e_get_gfn(*gl1e);
4057 mfn = shadow_l1e_get_mfn(*sl1e);
4058 gmfn = audit_gfn_to_mfn(v, gfn, gl1mfn);
4059 if ( mfn_x(gmfn) != mfn_x(mfn) )
4060 AUDIT_FAIL(1, "bad translation: gfn %" SH_PRI_gfn
4061 " --> %" SH_PRI_mfn " != mfn %" SH_PRI_mfn,
4062 gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn));
4065 });
4066 sh_unmap_domain_page(gp);
4067 return done;
4070 int sh_audit_fl1_table(struct vcpu *v, mfn_t sl1mfn, mfn_t x)
4072 guest_l1e_t *gl1e, e;
4073 shadow_l1e_t *sl1e;
4074 mfn_t gl1mfn = _mfn(INVALID_MFN);
4075 int f;
4076 int done = 0;
4078 /* fl1 has no useful backpointer: all we can check are flags */
4079 e = guest_l1e_from_gfn(_gfn(0), 0); gl1e = &e; /* Needed for macro */
4080 SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, done, {
4081 f = shadow_l1e_get_flags(*sl1e);
4082 f &= ~(_PAGE_AVAIL0|_PAGE_AVAIL1|_PAGE_AVAIL2);
4083 if ( !(f == 0
4084 || f == (_PAGE_PRESENT|_PAGE_USER|_PAGE_RW|
4085 _PAGE_ACCESSED|_PAGE_DIRTY)
4086 || f == (_PAGE_PRESENT|_PAGE_USER|_PAGE_ACCESSED|_PAGE_DIRTY)
4087 || sh_l1e_is_magic(*sl1e)) )
4088 AUDIT_FAIL(1, "fl1e has bad flags");
4089 });
4090 return 0;
4093 int sh_audit_l2_table(struct vcpu *v, mfn_t sl2mfn, mfn_t x)
4095 guest_l2e_t *gl2e, *gp;
4096 shadow_l2e_t *sl2e;
4097 mfn_t mfn, gmfn, gl2mfn;
4098 gfn_t gfn;
4099 char *s;
4100 int done = 0;
4101 #if GUEST_PAGING_LEVELS != 4
4102 int xen_mappings = !shadow_mode_external(v->domain);
4103 #endif
4105 /* Follow the backpointer */
4106 gl2mfn = _mfn(mfn_to_shadow_page(sl2mfn)->backpointer);
4107 gl2e = gp = sh_map_domain_page(gl2mfn);
4108 SHADOW_FOREACH_L2E(sl2mfn, sl2e, &gl2e, done, xen_mappings, {
4110 s = sh_audit_flags(v, 2, guest_l2e_get_flags(*gl2e),
4111 shadow_l2e_get_flags(*sl2e));
4112 if ( s ) AUDIT_FAIL(2, "%s", s);
4114 if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS )
4116 gfn = guest_l2e_get_gfn(*gl2e);
4117 mfn = shadow_l2e_get_mfn(*sl2e);
4118 gmfn = (guest_l2e_get_flags(*gl2e) & _PAGE_PSE)
4119 ? get_fl1_shadow_status(v, gfn)
4120 : get_shadow_status(v, audit_gfn_to_mfn(v, gfn, gl2mfn),
4121 SH_type_l1_shadow);
4122 if ( mfn_x(gmfn) != mfn_x(mfn) )
4123 AUDIT_FAIL(2, "bad translation: gfn %" SH_PRI_gfn
4124 " (--> %" SH_PRI_mfn ")"
4125 " --> %" SH_PRI_mfn " != mfn %" SH_PRI_mfn,
4126 gfn_x(gfn),
4127 (guest_l2e_get_flags(*gl2e) & _PAGE_PSE) ? 0
4128 : mfn_x(audit_gfn_to_mfn(v, gfn, gl2mfn)),
4129 mfn_x(gmfn), mfn_x(mfn));
4131 });
4132 sh_unmap_domain_page(gp);
4133 return 0;
4136 #if GUEST_PAGING_LEVELS >= 4
4137 int sh_audit_l3_table(struct vcpu *v, mfn_t sl3mfn, mfn_t x)
4139 guest_l3e_t *gl3e, *gp;
4140 shadow_l3e_t *sl3e;
4141 mfn_t mfn, gmfn, gl3mfn;
4142 gfn_t gfn;
4143 char *s;
4144 int done = 0;
4146 /* Follow the backpointer */
4147 gl3mfn = _mfn(mfn_to_shadow_page(sl3mfn)->backpointer);
4148 gl3e = gp = sh_map_domain_page(gl3mfn);
4149 SHADOW_FOREACH_L3E(sl3mfn, sl3e, &gl3e, done, {
4151 s = sh_audit_flags(v, 3, guest_l3e_get_flags(*gl3e),
4152 shadow_l3e_get_flags(*sl3e));
4153 if ( s ) AUDIT_FAIL(3, "%s", s);
4155 if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS )
4157 gfn = guest_l3e_get_gfn(*gl3e);
4158 mfn = shadow_l3e_get_mfn(*sl3e);
4159 gmfn = get_shadow_status(v, audit_gfn_to_mfn(v, gfn, gl3mfn),
4160 (GUEST_PAGING_LEVELS == 3
4161 && !shadow_mode_external(v->domain)
4162 && (guest_index(gl3e) % 4) == 3)
4163 ? SH_type_l2h_pae_shadow
4164 : SH_type_l2_shadow);
4165 if ( mfn_x(gmfn) != mfn_x(mfn) )
4166 AUDIT_FAIL(3, "bad translation: gfn %" SH_PRI_gfn
4167 " --> %" SH_PRI_mfn " != mfn %" SH_PRI_mfn,
4168 gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn));
4170 });
4171 sh_unmap_domain_page(gp);
4172 return 0;
4175 int sh_audit_l4_table(struct vcpu *v, mfn_t sl4mfn, mfn_t x)
4177 guest_l4e_t *gl4e, *gp;
4178 shadow_l4e_t *sl4e;
4179 mfn_t mfn, gmfn, gl4mfn;
4180 gfn_t gfn;
4181 char *s;
4182 int done = 0;
4183 int xen_mappings = !shadow_mode_external(v->domain);
4185 /* Follow the backpointer */
4186 gl4mfn = _mfn(mfn_to_shadow_page(sl4mfn)->backpointer);
4187 gl4e = gp = sh_map_domain_page(gl4mfn);
4188 SHADOW_FOREACH_L4E(sl4mfn, sl4e, &gl4e, done, xen_mappings,
4190 s = sh_audit_flags(v, 4, guest_l4e_get_flags(*gl4e),
4191 shadow_l4e_get_flags(*sl4e));
4192 if ( s ) AUDIT_FAIL(4, "%s", s);
4194 if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS )
4196 gfn = guest_l4e_get_gfn(*gl4e);
4197 mfn = shadow_l4e_get_mfn(*sl4e);
4198 gmfn = get_shadow_status(v, audit_gfn_to_mfn(v, gfn, gl4mfn),
4199 SH_type_l3_shadow);
4200 if ( mfn_x(gmfn) != mfn_x(mfn) )
4201 AUDIT_FAIL(4, "bad translation: gfn %" SH_PRI_gfn
4202 " --> %" SH_PRI_mfn " != mfn %" SH_PRI_mfn,
4203 gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn));
4205 });
4206 sh_unmap_domain_page(gp);
4207 return 0;
4209 #endif /* GUEST_PAGING_LEVELS >= 4 */
4212 #undef AUDIT_FAIL
4214 #endif /* Audit code */
4216 /**************************************************************************/
4217 /* Entry points into this mode of the shadow code.
4218 * This will all be mangled by the preprocessor to uniquify everything. */
4219 struct shadow_paging_mode sh_paging_mode = {
4220 .page_fault = sh_page_fault,
4221 .invlpg = sh_invlpg,
4222 .gva_to_gpa = sh_gva_to_gpa,
4223 .gva_to_gfn = sh_gva_to_gfn,
4224 .update_cr3 = sh_update_cr3,
4225 .map_and_validate_gl1e = sh_map_and_validate_gl1e,
4226 .map_and_validate_gl2e = sh_map_and_validate_gl2e,
4227 .map_and_validate_gl2he = sh_map_and_validate_gl2he,
4228 .map_and_validate_gl3e = sh_map_and_validate_gl3e,
4229 .map_and_validate_gl4e = sh_map_and_validate_gl4e,
4230 .detach_old_tables = sh_detach_old_tables,
4231 .x86_emulate_write = sh_x86_emulate_write,
4232 .x86_emulate_cmpxchg = sh_x86_emulate_cmpxchg,
4233 .x86_emulate_cmpxchg8b = sh_x86_emulate_cmpxchg8b,
4234 .make_monitor_table = sh_make_monitor_table,
4235 .destroy_monitor_table = sh_destroy_monitor_table,
4236 .guest_map_l1e = sh_guest_map_l1e,
4237 .guest_get_eff_l1e = sh_guest_get_eff_l1e,
4238 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
4239 .guess_wrmap = sh_guess_wrmap,
4240 #endif
4241 .guest_levels = GUEST_PAGING_LEVELS,
4242 .shadow_levels = SHADOW_PAGING_LEVELS,
4243 };
4245 /*
4246 * Local variables:
4247 * mode: C
4248 * c-set-style: "BSD"
4249 * c-basic-offset: 4
4250 * indent-tabs-mode: nil
4251 * End:
4252 */