ia64/xen-unstable

view xen/arch/x86/mm/shadow/multi.c @ 12895:f5121d001d1a

[XEN] Shadow-mode-refcount PTE update fix.

Add back support for emulated PTE updates, which is critical for
shadow_refcount operation.

Signed-off-by: Keir Fraser <keir@xensource.com>
author kaf24@localhost.localdomain
date Sat Dec 09 16:29:52 2006 +0000 (2006-12-09)
parents d3846d6f30d5
children 6b68a3688509
line source
1 /******************************************************************************
2 * arch/x86/mm/shadow/multi.c
3 *
4 * Simple, mostly-synchronous shadow page tables.
5 * Parts of this code are Copyright (c) 2006 by XenSource Inc.
6 * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
7 * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 */
24 #include <xen/config.h>
25 #include <xen/types.h>
26 #include <xen/mm.h>
27 #include <xen/trace.h>
28 #include <xen/sched.h>
29 #include <xen/perfc.h>
30 #include <xen/domain_page.h>
31 #include <asm/page.h>
32 #include <asm/current.h>
33 #include <asm/shadow.h>
34 #include <asm/flushtlb.h>
35 #include <asm/hvm/hvm.h>
36 #include "private.h"
37 #include "types.h"
39 /* THINGS TO DO LATER:
40 *
41 * TEARDOWN HEURISTICS
42 * Also: have a heuristic for when to destroy a previous paging-mode's
43 * shadows. When a guest is done with its start-of-day 32-bit tables
44 * and reuses the memory we want to drop those shadows. Start with
45 * shadows in a page in two modes as a hint, but beware of clever tricks
46 * like reusing a pagetable for both PAE and 64-bit during boot...
47 *
48 * PAE LINEAR MAPS
49 * Rework shadow_get_l*e() to have the option of using map_domain_page()
50 * instead of linear maps. Add appropriate unmap_l*e calls in the users.
51 * Then we can test the speed difference made by linear maps. If the
52 * map_domain_page() version is OK on PAE, we could maybe allow a lightweight
53 * l3-and-l2h-only shadow mode for PAE PV guests that would allow them
54 * to share l2h pages again.
55 *
56 * GUEST_WALK_TABLES TLB FLUSH COALESCE
57 * guest_walk_tables can do up to three remote TLB flushes as it walks to
58 * the first l1 of a new pagetable. Should coalesce the flushes to the end,
59 * and if we do flush, re-do the walk. If anything has changed, then
60 * pause all the other vcpus and do the walk *again*.
61 *
62 * WP DISABLED
63 * Consider how to implement having the WP bit of CR0 set to 0.
64 * Since we need to be able to cause write faults to pagetables, this might
65 * end up looking like not having the (guest) pagetables present at all in
66 * HVM guests...
67 *
68 * PSE disabled / PSE36
69 * We don't support any modes other than PSE enabled, PSE36 disabled.
70 * Neither of those would be hard to change, but we'd need to be able to
71 * deal with shadows made in one mode and used in another.
72 */
74 #define FETCH_TYPE_PREFETCH 1
75 #define FETCH_TYPE_DEMAND 2
76 #define FETCH_TYPE_WRITE 4
77 typedef enum {
78 ft_prefetch = FETCH_TYPE_PREFETCH,
79 ft_demand_read = FETCH_TYPE_DEMAND,
80 ft_demand_write = FETCH_TYPE_DEMAND | FETCH_TYPE_WRITE,
81 } fetch_type_t;
83 #ifdef DEBUG_TRACE_DUMP
84 static char *fetch_type_names[] = {
85 [ft_prefetch] "prefetch",
86 [ft_demand_read] "demand read",
87 [ft_demand_write] "demand write",
88 };
89 #endif
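/* Illustrative note (not in the original file): the fetch-type values are
 * chosen so that simple bit tests work further down, e.g.
 *   ft_demand_write & FETCH_TYPE_DEMAND  -> true  (a demand fetch)
 *   ft_demand_write & FETCH_TYPE_WRITE   -> true  (and a write)
 *   ft_prefetch     & FETCH_TYPE_DEMAND  -> false (prefetches never set A/D)
 * See guest_set_ad_bits() and _sh_propagate() below for the actual tests. */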
91 /**************************************************************************/
92 /* Hash table mapping from guest pagetables to shadows
93 *
94 * Normal case: maps the mfn of a guest page to the mfn of its shadow page.
95 * FL1's: maps the *gfn* of the start of a superpage to the mfn of a
96 * shadow L1 which maps its "splinters".
97 */
99 static inline mfn_t
100 get_fl1_shadow_status(struct vcpu *v, gfn_t gfn)
101 /* Look for FL1 shadows in the hash table */
102 {
103 mfn_t smfn = shadow_hash_lookup(v, gfn_x(gfn), SH_type_fl1_shadow);
105 if ( unlikely(shadow_mode_log_dirty(v->domain) && mfn_valid(smfn)) )
106 {
107 struct shadow_page_info *sp = mfn_to_shadow_page(smfn);
108 if ( !(sp->logdirty) )
109 shadow_convert_to_log_dirty(v, smfn);
110 }
112 return smfn;
113 }
115 static inline mfn_t
116 get_shadow_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type)
117 /* Look for shadows in the hash table */
118 {
119 mfn_t smfn = shadow_hash_lookup(v, mfn_x(gmfn), shadow_type);
120 perfc_incrc(shadow_get_shadow_status);
122 if ( unlikely(shadow_mode_log_dirty(v->domain) && mfn_valid(smfn)) )
123 {
124 struct shadow_page_info *sp = mfn_to_shadow_page(smfn);
125 if ( !(sp->logdirty) )
126 shadow_convert_to_log_dirty(v, smfn);
127 }
129 return smfn;
130 }
132 static inline void
133 set_fl1_shadow_status(struct vcpu *v, gfn_t gfn, mfn_t smfn)
134 /* Put an FL1 shadow into the hash table */
135 {
136 SHADOW_PRINTK("gfn=%"SH_PRI_gfn", type=%08x, smfn=%05lx\n",
137 gfn_x(gfn), SH_type_fl1_shadow, mfn_x(smfn));
139 if ( unlikely(shadow_mode_log_dirty(v->domain)) )
140 // mark this shadow as a log dirty shadow...
141 mfn_to_shadow_page(smfn)->logdirty = 1;
142 else
143 mfn_to_shadow_page(smfn)->logdirty = 0;
145 shadow_hash_insert(v, gfn_x(gfn), SH_type_fl1_shadow, smfn);
146 }
148 static inline void
149 set_shadow_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type, mfn_t smfn)
150 /* Put a shadow into the hash table */
151 {
152 struct domain *d = v->domain;
153 int res;
155 SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx, type=%08x, smfn=%05lx\n",
156 d->domain_id, v->vcpu_id, mfn_x(gmfn),
157 shadow_type, mfn_x(smfn));
159 if ( unlikely(shadow_mode_log_dirty(d)) )
160 // mark this shadow as a log dirty shadow...
161 mfn_to_shadow_page(smfn)->logdirty = 1;
162 else
163 mfn_to_shadow_page(smfn)->logdirty = 0;
165 res = get_page(mfn_to_page(gmfn), d);
166 ASSERT(res == 1);
168 shadow_hash_insert(v, mfn_x(gmfn), shadow_type, smfn);
169 }
171 static inline void
172 delete_fl1_shadow_status(struct vcpu *v, gfn_t gfn, mfn_t smfn)
173 /* Remove a shadow from the hash table */
174 {
175 SHADOW_PRINTK("gfn=%"SH_PRI_gfn", type=%08x, smfn=%05lx\n",
176 gfn_x(gfn), SH_type_fl1_shadow, mfn_x(smfn));
177 shadow_hash_delete(v, gfn_x(gfn), SH_type_fl1_shadow, smfn);
178 }
180 static inline void
181 delete_shadow_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type, mfn_t smfn)
182 /* Remove a shadow from the hash table */
183 {
184 SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx, type=%08x, smfn=%05lx\n",
185 v->domain->domain_id, v->vcpu_id,
186 mfn_x(gmfn), shadow_type, mfn_x(smfn));
187 shadow_hash_delete(v, mfn_x(gmfn), shadow_type, smfn);
188 put_page(mfn_to_page(gmfn));
189 }
191 /**************************************************************************/
192 /* CPU feature support querying */
194 static inline int
195 guest_supports_superpages(struct vcpu *v)
196 {
197 /* The _PAGE_PSE bit must be honoured in HVM guests, whenever
198 * CR4.PSE is set or the guest is in PAE or long mode */
199 return (is_hvm_vcpu(v) && (GUEST_PAGING_LEVELS != 2
200 || (hvm_get_guest_ctrl_reg(v, 4) & X86_CR4_PSE)));
201 }
203 static inline int
204 guest_supports_nx(struct vcpu *v)
205 {
206 if ( !is_hvm_vcpu(v) )
207 return cpu_has_nx;
209 // XXX - fix this!
210 return 1;
211 }
214 /**************************************************************************/
215 /* Functions for walking the guest page tables */
218 /* Walk the guest pagetables, filling the walk_t with what we see.
219 * Takes an uninitialised walk_t. The caller must call unmap_walk()
220 * on the walk_t before discarding it or calling guest_walk_tables again.
221 * If "guest_op" is non-zero, we are serving a genuine guest memory access,
222 * and must (a) be under the shadow lock, and (b) remove write access
223 * from any guest PT pages we see, as we will be using their contents to
224 * perform shadow updates.
225 * Returns 0 for success or non-zero if the guest pagetables are malformed.
226 * N.B. Finding a not-present entry does not cause a non-zero return code. */
227 static inline int
228 guest_walk_tables(struct vcpu *v, unsigned long va, walk_t *gw, int guest_op)
229 {
230 ASSERT(!guest_op || shadow_lock_is_acquired(v->domain));
232 perfc_incrc(shadow_guest_walk);
233 memset(gw, 0, sizeof(*gw));
234 gw->va = va;
236 #if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
237 #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
238 /* Get l4e from the top level table */
239 gw->l4mfn = pagetable_get_mfn(v->arch.guest_table);
240 gw->l4e = (guest_l4e_t *)v->arch.guest_vtable + guest_l4_table_offset(va);
241 /* Walk down to the l3e */
242 if ( !(guest_l4e_get_flags(*gw->l4e) & _PAGE_PRESENT) ) return 0;
243 gw->l3mfn = vcpu_gfn_to_mfn(v, guest_l4e_get_gfn(*gw->l4e));
244 if ( !mfn_valid(gw->l3mfn) ) return 1;
245 /* This mfn is a pagetable: make sure the guest can't write to it. */
246 if ( guest_op && shadow_remove_write_access(v, gw->l3mfn, 3, va) != 0 )
247 flush_tlb_mask(v->domain->domain_dirty_cpumask);
248 gw->l3e = ((guest_l3e_t *)sh_map_domain_page(gw->l3mfn))
249 + guest_l3_table_offset(va);
250 #else /* PAE only... */
251 /* Get l3e from the top level table */
252 gw->l3mfn = pagetable_get_mfn(v->arch.guest_table);
253 gw->l3e = (guest_l3e_t *)v->arch.guest_vtable + guest_l3_table_offset(va);
254 #endif /* PAE or 64... */
255 /* Walk down to the l2e */
256 if ( !(guest_l3e_get_flags(*gw->l3e) & _PAGE_PRESENT) ) return 0;
257 gw->l2mfn = vcpu_gfn_to_mfn(v, guest_l3e_get_gfn(*gw->l3e));
258 if ( !mfn_valid(gw->l2mfn) ) return 1;
259 /* This mfn is a pagetable: make sure the guest can't write to it. */
260 if ( guest_op && shadow_remove_write_access(v, gw->l2mfn, 2, va) != 0 )
261 flush_tlb_mask(v->domain->domain_dirty_cpumask);
262 gw->l2e = ((guest_l2e_t *)sh_map_domain_page(gw->l2mfn))
263 + guest_l2_table_offset(va);
264 #else /* 32-bit only... */
265 /* Get l2e from the top level table */
266 gw->l2mfn = pagetable_get_mfn(v->arch.guest_table);
267 gw->l2e = (guest_l2e_t *)v->arch.guest_vtable + guest_l2_table_offset(va);
268 #endif /* All levels... */
270 if ( !(guest_l2e_get_flags(*gw->l2e) & _PAGE_PRESENT) ) return 0;
271 if ( guest_supports_superpages(v) &&
272 (guest_l2e_get_flags(*gw->l2e) & _PAGE_PSE) )
273 {
274 /* Special case: this guest VA is in a PSE superpage, so there's
275 * no guest l1e. We make one up so that the propagation code
276 * can generate a shadow l1 table. Start with the gfn of the
277 * first 4k-page of the superpage. */
278 gfn_t start = guest_l2e_get_gfn(*gw->l2e);
279 /* Grant full access in the l1e, since all the guest entry's
280 * access controls are enforced in the shadow l2e. This lets
281 * us reflect l2 changes later without touching the l1s. */
282 int flags = (_PAGE_PRESENT|_PAGE_USER|_PAGE_RW|
283 _PAGE_ACCESSED|_PAGE_DIRTY);
284 /* PSE level 2 entries use bit 12 for PAT; propagate it to bit 7
285 * of the level 1 */
286 if ( (guest_l2e_get_flags(*gw->l2e) & _PAGE_PSE_PAT) )
287 flags |= _PAGE_PAT;
288 /* Increment the pfn by the right number of 4k pages.
289 * The ~0x1 is to mask out the PAT bit mentioned above. */
290 start = _gfn((gfn_x(start) & ~0x1) + guest_l1_table_offset(va));
291 gw->eff_l1e = guest_l1e_from_gfn(start, flags);
292 gw->l1e = NULL;
293 gw->l1mfn = _mfn(INVALID_MFN);
294 }
295 else
296 {
297 /* Not a superpage: carry on and find the l1e. */
298 gw->l1mfn = vcpu_gfn_to_mfn(v, guest_l2e_get_gfn(*gw->l2e));
299 if ( !mfn_valid(gw->l1mfn) ) return 1;
300 /* This mfn is a pagetable: make sure the guest can't write to it. */
301 if ( guest_op
302 && shadow_remove_write_access(v, gw->l1mfn, 1, va) != 0 )
303 flush_tlb_mask(v->domain->domain_dirty_cpumask);
304 gw->l1e = ((guest_l1e_t *)sh_map_domain_page(gw->l1mfn))
305 + guest_l1_table_offset(va);
306 gw->eff_l1e = *gw->l1e;
307 }
309 return 0;
310 }
312 /* Given a walk_t, translate the gw->va into the guest's notion of the
313 * corresponding frame number. */
314 static inline gfn_t
315 guest_walk_to_gfn(walk_t *gw)
316 {
317 if ( !(guest_l1e_get_flags(gw->eff_l1e) & _PAGE_PRESENT) )
318 return _gfn(INVALID_GFN);
319 return guest_l1e_get_gfn(gw->eff_l1e);
320 }
322 /* Given a walk_t, translate the gw->va into the guest's notion of the
323 * corresponding physical address. */
324 static inline paddr_t
325 guest_walk_to_gpa(walk_t *gw)
326 {
327 if ( !(guest_l1e_get_flags(gw->eff_l1e) & _PAGE_PRESENT) )
328 return 0;
329 return guest_l1e_get_paddr(gw->eff_l1e) + (gw->va & ~PAGE_MASK);
330 }
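/* Illustrative sketch (not in the original file) of the walk lifecycle that
 * callers such as sh_guest_get_eff_l1e() below follow; "va" and "gw" are
 * placeholder names:
 *
 *     walk_t gw;
 *     shadow_lock(v->domain);
 *     if ( guest_walk_tables(v, va, &gw, 1) == 0 )
 *     {
 *         gfn_t gfn = guest_walk_to_gfn(&gw);   // _gfn(INVALID_GFN) if the
 *                                               // mapping is not present
 *         // ... use gfn, or guest_walk_to_gpa(&gw) ...
 *     }
 *     unmap_walk(v, &gw);            // always dispose of the walk
 *     shadow_unlock(v->domain);
 */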
333 /* Unmap (and reinitialise) a guest walk.
334 * Call this to dispose of any walk filled in by guest_walk_tables() */
335 static void unmap_walk(struct vcpu *v, walk_t *gw)
336 {
337 #if GUEST_PAGING_LEVELS >= 3
338 #if GUEST_PAGING_LEVELS >= 4
339 if ( gw->l3e != NULL ) sh_unmap_domain_page(gw->l3e);
340 #endif
341 if ( gw->l2e != NULL ) sh_unmap_domain_page(gw->l2e);
342 #endif
343 if ( gw->l1e != NULL ) sh_unmap_domain_page(gw->l1e);
344 #ifdef DEBUG
345 memset(gw, 0, sizeof(*gw));
346 #endif
347 }
350 /* Pretty-print the contents of a guest-walk */
351 static inline void print_gw(walk_t *gw)
352 {
353 SHADOW_PRINTK("GUEST WALK TO %#lx:\n", gw->va);
354 #if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
355 #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
356 SHADOW_PRINTK(" l4mfn=%" SH_PRI_mfn "\n", mfn_x(gw->l4mfn));
357 SHADOW_PRINTK(" l4e=%p\n", gw->l4e);
358 if ( gw->l4e )
359 SHADOW_PRINTK(" *l4e=%" SH_PRI_gpte "\n", gw->l4e->l4);
360 #endif /* PAE or 64... */
361 SHADOW_PRINTK(" l3mfn=%" SH_PRI_mfn "\n", mfn_x(gw->l3mfn));
362 SHADOW_PRINTK(" l3e=%p\n", gw->l3e);
363 if ( gw->l3e )
364 SHADOW_PRINTK(" *l3e=%" SH_PRI_gpte "\n", gw->l3e->l3);
365 #endif /* All levels... */
366 SHADOW_PRINTK(" l2mfn=%" SH_PRI_mfn "\n", mfn_x(gw->l2mfn));
367 SHADOW_PRINTK(" l2e=%p\n", gw->l2e);
368 if ( gw->l2e )
369 SHADOW_PRINTK(" *l2e=%" SH_PRI_gpte "\n", gw->l2e->l2);
370 SHADOW_PRINTK(" l1mfn=%" SH_PRI_mfn "\n", mfn_x(gw->l1mfn));
371 SHADOW_PRINTK(" l1e=%p\n", gw->l1e);
372 if ( gw->l1e )
373 SHADOW_PRINTK(" *l1e=%" SH_PRI_gpte "\n", gw->l1e->l1);
374 SHADOW_PRINTK(" eff_l1e=%" SH_PRI_gpte "\n", gw->eff_l1e.l1);
375 }
378 #if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES
379 /* Lightweight audit: pass all the shadows associated with this guest walk
380 * through the audit mechanisms */
381 static void sh_audit_gw(struct vcpu *v, walk_t *gw)
382 {
383 mfn_t smfn;
385 if ( !(SHADOW_AUDIT_ENABLE) )
386 return;
388 #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
389 if ( mfn_valid(gw->l4mfn)
390 && mfn_valid((smfn = get_shadow_status(v, gw->l4mfn,
391 SH_type_l4_shadow))) )
392 (void) sh_audit_l4_table(v, smfn, _mfn(INVALID_MFN));
393 if ( mfn_valid(gw->l3mfn)
394 && mfn_valid((smfn = get_shadow_status(v, gw->l3mfn,
395 SH_type_l3_shadow))) )
396 (void) sh_audit_l3_table(v, smfn, _mfn(INVALID_MFN));
397 #endif /* PAE or 64... */
398 if ( mfn_valid(gw->l2mfn) )
399 {
400 if ( mfn_valid((smfn = get_shadow_status(v, gw->l2mfn,
401 SH_type_l2_shadow))) )
402 (void) sh_audit_l2_table(v, smfn, _mfn(INVALID_MFN));
403 #if GUEST_PAGING_LEVELS == 3
404 if ( mfn_valid((smfn = get_shadow_status(v, gw->l2mfn,
405 SH_type_l2h_shadow))) )
406 (void) sh_audit_l2_table(v, smfn, _mfn(INVALID_MFN));
407 #endif
408 }
409 if ( mfn_valid(gw->l1mfn)
410 && mfn_valid((smfn = get_shadow_status(v, gw->l1mfn,
411 SH_type_l1_shadow))) )
412 (void) sh_audit_l1_table(v, smfn, _mfn(INVALID_MFN));
413 else if ( gw->l2e
414 && (guest_l2e_get_flags(*gw->l2e) & _PAGE_PSE)
415 && mfn_valid(
416 (smfn = get_fl1_shadow_status(v, guest_l2e_get_gfn(*gw->l2e)))) )
417 (void) sh_audit_fl1_table(v, smfn, _mfn(INVALID_MFN));
418 }
420 #else
421 #define sh_audit_gw(_v, _gw) do {} while(0)
422 #endif /* audit code */
426 /**************************************************************************/
427 /* Function to write to the guest tables, for propagating accessed and
428 * dirty bits from the shadow to the guest.
429 * Takes a guest mfn, a pointer to the guest entry, the level of pagetable,
430 * and an operation type. The guest entry is always passed as an l1e:
431 * since we only ever write flags, that's OK.
432 * Returns the new flag bits of the guest entry. */
434 static u32 guest_set_ad_bits(struct vcpu *v,
435 mfn_t gmfn,
436 guest_l1e_t *ep,
437 unsigned int level,
438 fetch_type_t ft)
439 {
440 u32 flags;
441 int res = 0;
443 ASSERT(ep && !(((unsigned long)ep) & ((sizeof *ep) - 1)));
444 ASSERT(level <= GUEST_PAGING_LEVELS);
445 ASSERT(shadow_lock_is_acquired(v->domain));
447 flags = guest_l1e_get_flags(*ep);
449 /* Only set A and D bits for guest-initiated accesses */
450 if ( !(ft & FETCH_TYPE_DEMAND) )
451 return flags;
453 ASSERT(mfn_valid(gmfn)
454 && (sh_mfn_is_a_page_table(gmfn)
455 || ((mfn_to_page(gmfn)->u.inuse.type_info & PGT_count_mask)
456 == 0)));
458 /* PAE l3s do not have A and D bits */
459 ASSERT(GUEST_PAGING_LEVELS > 3 || level != 3);
461 /* Need the D bit as well for writes, in L1es and PSE L2es. */
462 if ( ft == ft_demand_write
463 && (level == 1 ||
464 (level == 2 && (flags & _PAGE_PSE) && guest_supports_superpages(v))) )
465 {
466 if ( (flags & (_PAGE_DIRTY | _PAGE_ACCESSED))
467 == (_PAGE_DIRTY | _PAGE_ACCESSED) )
468 return flags; /* Guest already has A and D bits set */
469 flags |= _PAGE_DIRTY | _PAGE_ACCESSED;
470 perfc_incrc(shadow_ad_update);
471 }
472 else
473 {
474 if ( flags & _PAGE_ACCESSED )
475 return flags; /* Guest already has A bit set */
476 flags |= _PAGE_ACCESSED;
477 perfc_incrc(shadow_a_update);
478 }
480 /* Set the bit(s) */
481 sh_mark_dirty(v->domain, gmfn);
482 SHADOW_DEBUG(A_AND_D, "gfn = %" SH_PRI_gfn ", "
483 "old flags = %#x, new flags = %#x\n",
484 gfn_x(guest_l1e_get_gfn(*ep)), guest_l1e_get_flags(*ep),
485 flags);
486 *ep = guest_l1e_from_gfn(guest_l1e_get_gfn(*ep), flags);
488 /* Propagate this change to any other shadows of the page
489 * (only necessary if there is more than one shadow) */
490 if ( mfn_to_page(gmfn)->count_info & PGC_page_table )
491 {
492 u32 shflags = mfn_to_page(gmfn)->shadow_flags & SHF_page_type_mask;
493 /* More than one type bit set in shadow-flags? */
494 if ( shflags & ~(1UL << find_first_set_bit(shflags)) )
495 res = __shadow_validate_guest_entry(v, gmfn, ep, sizeof(*ep));
496 }
498 /* We should never need to flush the TLB or recopy PAE entries */
499 ASSERT((res == 0) || (res == SHADOW_SET_CHANGED));
501 return flags;
502 }
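/* Quick reference (an illustrative summary of the code above, not in the
 * original file):
 *   ft_prefetch                            -> flags returned unchanged
 *   ft_demand_read                         -> _PAGE_ACCESSED set
 *   ft_demand_write, l1e or PSE l2e        -> _PAGE_ACCESSED | _PAGE_DIRTY set
 *   ft_demand_write, higher-level entries  -> only _PAGE_ACCESSED set
 */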
504 #if (CONFIG_PAGING_LEVELS == GUEST_PAGING_LEVELS) && (CONFIG_PAGING_LEVELS == SHADOW_PAGING_LEVELS)
505 void *
506 sh_guest_map_l1e(struct vcpu *v, unsigned long addr,
507 unsigned long *gl1mfn)
508 {
509 void *pl1e = NULL;
510 walk_t gw;
512 ASSERT(shadow_mode_translate(v->domain));
514 // XXX -- this is expensive, but it's easy to cobble together...
515 // FIXME!
517 shadow_lock(v->domain);
518 guest_walk_tables(v, addr, &gw, 1);
520 if ( gw.l2e &&
521 (guest_l2e_get_flags(*gw.l2e) & _PAGE_PRESENT) &&
522 !(guest_supports_superpages(v) && (guest_l2e_get_flags(*gw.l2e) & _PAGE_PSE)) )
523 {
524 if ( gl1mfn )
525 *gl1mfn = mfn_x(gw.l1mfn);
526 pl1e = map_domain_page(mfn_x(gw.l1mfn)) +
527 (guest_l1_table_offset(addr) * sizeof(guest_l1e_t));
528 }
530 unmap_walk(v, &gw);
531 shadow_unlock(v->domain);
533 return pl1e;
534 }
536 void
537 sh_guest_get_eff_l1e(struct vcpu *v, unsigned long addr, void *eff_l1e)
538 {
539 walk_t gw;
541 ASSERT(shadow_mode_translate(v->domain));
543 // XXX -- this is expensive, but it's easy to cobble together...
544 // FIXME!
546 shadow_lock(v->domain);
547 guest_walk_tables(v, addr, &gw, 1);
548 *(guest_l1e_t *)eff_l1e = gw.eff_l1e;
549 unmap_walk(v, &gw);
550 shadow_unlock(v->domain);
551 }
552 #endif /* CONFIG==SHADOW==GUEST */
554 /**************************************************************************/
555 /* Functions to compute the correct index into a shadow page, given an
556 * index into the guest page (as returned by guest_get_index()).
557 * This is trivial when the shadow and guest use the same sized PTEs, but
558 * gets more interesting when those sizes are mismatched (e.g. 32-bit guest,
559 * PAE- or 64-bit shadows).
560 *
561 * These functions also increment the shadow mfn, when necessary. When PTE
562 * sizes are mismatched, it takes 2 shadow L1 pages for a single guest L1
563 * page. In this case, we allocate 2 contiguous pages for the shadow L1, and
564 * use simple pointer arithmetic on a pointer to the guest L1e to figure out
565 * which shadow page we really want. Similarly, when PTE sizes are
566 * mismatched, we shadow a guest L2 page with 4 shadow L2 pages. (The easiest
567 * way to see this is: a 32-bit guest L2 page maps 4GB of virtual address
568 * space, while a PAE- or 64-bit shadow L2 page maps 1GB of virtual address
569 * space.)
570 *
571 * For PAE guests, for every 32-bytes of guest L3 page table, we use 64-bytes
572 * of shadow (to store both the shadow, and the info that would normally be
573 * stored in page_info fields). This arrangement allows the shadow and the
574 * "page_info" fields to always be stored in the same page (in fact, in
575 * the same cache line), avoiding an extra call to map_domain_page().
576 */
578 static inline u32
579 guest_index(void *ptr)
580 {
581 return (u32)((unsigned long)ptr & ~PAGE_MASK) / sizeof(guest_l1e_t);
582 }
584 static u32
585 shadow_l1_index(mfn_t *smfn, u32 guest_index)
586 {
587 #if (GUEST_PAGING_LEVELS == 2) && (SHADOW_PAGING_LEVELS != 2)
588 *smfn = _mfn(mfn_x(*smfn) +
589 (guest_index / SHADOW_L1_PAGETABLE_ENTRIES));
590 return (guest_index % SHADOW_L1_PAGETABLE_ENTRIES);
591 #else
592 return guest_index;
593 #endif
594 }
596 static u32
597 shadow_l2_index(mfn_t *smfn, u32 guest_index)
598 {
599 #if (GUEST_PAGING_LEVELS == 2) && (SHADOW_PAGING_LEVELS != 2)
600 // Because we use 2 shadow l2 entries for each guest entry, the number of
601 // guest entries per shadow page is SHADOW_L2_PAGETABLE_ENTRIES/2
602 //
603 *smfn = _mfn(mfn_x(*smfn) +
604 (guest_index / (SHADOW_L2_PAGETABLE_ENTRIES / 2)));
606 // We multiply by two to get the index of the first of the two entries
607 // used to shadow the specified guest entry.
608 return (guest_index % (SHADOW_L2_PAGETABLE_ENTRIES / 2)) * 2;
609 #else
610 return guest_index;
611 #endif
612 }
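/* Worked example (illustrative, assuming PAE/64-bit shadows where
 * SHADOW_L1_PAGETABLE_ENTRIES == SHADOW_L2_PAGETABLE_ENTRIES == 512):
 * a 32-bit guest l1 has 1024 4-byte entries, so it is shadowed by two
 * contiguous pages; guest index 700 lands in the second shadow page at
 * index 700 - 512 = 188.  A 32-bit guest l2 also has 1024 entries, but each
 * one becomes a *pair* of shadow l2es, so it needs four shadow pages; guest
 * index 300 gives page 300 / 256 = 1 (the second shadow page) and slot
 * (300 % 256) * 2 = 88, the first entry of the pair. */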
614 #if GUEST_PAGING_LEVELS >= 4
616 static u32
617 shadow_l3_index(mfn_t *smfn, u32 guest_index)
618 {
619 return guest_index;
620 }
622 static u32
623 shadow_l4_index(mfn_t *smfn, u32 guest_index)
624 {
625 return guest_index;
626 }
628 #endif // GUEST_PAGING_LEVELS >= 4
631 /**************************************************************************/
632 /* Function which computes shadow entries from their corresponding guest
633 * entries. This is the "heart" of the shadow code. It operates using
634 * level-1 shadow types, but handles all levels of entry.
635 * Don't call it directly, but use the four wrappers below.
636 */
638 static always_inline void
639 _sh_propagate(struct vcpu *v,
640 void *guest_entry_ptr,
641 mfn_t guest_table_mfn,
642 mfn_t target_mfn,
643 void *shadow_entry_ptr,
644 int level,
645 fetch_type_t ft,
646 int mmio)
647 {
648 guest_l1e_t *gp = guest_entry_ptr;
649 shadow_l1e_t *sp = shadow_entry_ptr;
650 struct domain *d = v->domain;
651 u32 pass_thru_flags;
652 u32 gflags, sflags;
654 /* We don't shadow PAE l3s */
655 ASSERT(GUEST_PAGING_LEVELS > 3 || level != 3);
657 if ( mfn_valid(guest_table_mfn) )
658 /* Handle A and D bit propagation into the guest */
659 gflags = guest_set_ad_bits(v, guest_table_mfn, gp, level, ft);
660 else
661 {
662 /* Must be an fl1e or a prefetch */
663 ASSERT(level==1 || !(ft & FETCH_TYPE_DEMAND));
664 gflags = guest_l1e_get_flags(*gp);
665 }
667 if ( unlikely(!(gflags & _PAGE_PRESENT)) )
668 {
669 /* If a guest l1 entry is not present, shadow with the magic
670 * guest-not-present entry. */
671 if ( level == 1 )
672 *sp = sh_l1e_gnp();
673 else
674 *sp = shadow_l1e_empty();
675 goto done;
676 }
678 if ( level == 1 && mmio )
679 {
680 /* Guest l1e maps MMIO space */
681 *sp = sh_l1e_mmio(guest_l1e_get_gfn(*gp), gflags);
682 goto done;
683 }
685 // Must have a valid target_mfn, unless this is a prefetch. In the
686 // case of a prefetch, an invalid mfn means that we cannot usefully
687 // shadow anything, and so we return early.
688 //
689 if ( !mfn_valid(target_mfn) )
690 {
691 ASSERT((ft == ft_prefetch));
692 *sp = shadow_l1e_empty();
693 goto done;
694 }
696 // Propagate bits from the guest to the shadow.
697 // Some of these may be overwritten, below.
698 // Since we know the guest's PRESENT bit is set, we also set the shadow's
699 // SHADOW_PRESENT bit.
700 //
701 pass_thru_flags = (_PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_USER |
702 _PAGE_RW | _PAGE_PRESENT);
703 if ( guest_supports_nx(v) )
704 pass_thru_flags |= _PAGE_NX_BIT;
705 sflags = gflags & pass_thru_flags;
707 // Set the A&D bits for higher level shadows.
708 // Higher level entries do not, strictly speaking, have dirty bits, but
709 // since we use shadow linear tables, each of these entries may, at some
710 // point in time, also serve as a shadow L1 entry.
711 // By setting both the A&D bits in each of these, we eliminate the burden
712 // on the hardware to update these bits on initial accesses.
713 //
714 if ( (level > 1) && !((SHADOW_PAGING_LEVELS == 3) && (level == 3)) )
715 sflags |= _PAGE_ACCESSED | _PAGE_DIRTY;
717 // If the A or D bit has not yet been set in the guest, then we must
718 // prevent the corresponding kind of access.
719 //
720 if ( unlikely(!(gflags & _PAGE_ACCESSED)) )
721 sflags &= ~_PAGE_PRESENT;
723 /* D bits exist in L1es and PSE L2es */
724 if ( unlikely(((level == 1) ||
725 ((level == 2) &&
726 (gflags & _PAGE_PSE) &&
727 guest_supports_superpages(v)))
728 && !(gflags & _PAGE_DIRTY)) )
729 sflags &= ~_PAGE_RW;
731 // shadow_mode_log_dirty support
732 //
733 // Only allow the guest write access to a page a) on a demand fault,
734 // or b) if the page is already marked as dirty.
735 //
736 if ( unlikely((level == 1) && shadow_mode_log_dirty(d)) )
737 {
738 if ( ft & FETCH_TYPE_WRITE )
739 sh_mark_dirty(d, target_mfn);
740 else if ( !sh_mfn_is_dirty(d, target_mfn) )
741 sflags &= ~_PAGE_RW;
742 }
744 // protect guest page tables
745 //
746 if ( unlikely((level == 1) && sh_mfn_is_a_page_table(target_mfn)) )
747 {
748 if ( shadow_mode_trap_reads(d) )
749 {
750 // if we are trapping both reads & writes, then mark this page
751 // as not present...
752 //
753 sflags &= ~_PAGE_PRESENT;
754 }
755 else
756 {
757 // otherwise, just prevent any writes...
758 //
759 sflags &= ~_PAGE_RW;
760 }
761 }
763 // PV guests in 64-bit mode use two different page tables for user vs
764 // supervisor permissions, making the guest's _PAGE_USER bit irrelevant.
765 // It is always shadowed as present...
766 if ( (GUEST_PAGING_LEVELS == 4) && !is_hvm_domain(d) )
767 {
768 sflags |= _PAGE_USER;
769 }
771 *sp = shadow_l1e_from_mfn(target_mfn, sflags);
772 done:
773 SHADOW_DEBUG(PROPAGATE,
774 "%s level %u guest %" SH_PRI_gpte " shadow %" SH_PRI_pte "\n",
775 fetch_type_names[ft], level, gp->l1, sp->l1);
776 }
779 /* These four wrappers give us a little bit of type-safety back around the
780 * use of void-* pointers in _sh_propagate(), and allow the compiler to
781 * optimize out some level checks. */
783 #if GUEST_PAGING_LEVELS >= 4
784 static void
785 l4e_propagate_from_guest(struct vcpu *v,
786 guest_l4e_t *gl4e,
787 mfn_t gl4mfn,
788 mfn_t sl3mfn,
789 shadow_l4e_t *sl4e,
790 fetch_type_t ft)
791 {
792 _sh_propagate(v, gl4e, gl4mfn, sl3mfn, sl4e, 4, ft, 0);
793 }
795 static void
796 l3e_propagate_from_guest(struct vcpu *v,
797 guest_l3e_t *gl3e,
798 mfn_t gl3mfn,
799 mfn_t sl2mfn,
800 shadow_l3e_t *sl3e,
801 fetch_type_t ft)
802 {
803 _sh_propagate(v, gl3e, gl3mfn, sl2mfn, sl3e, 3, ft, 0);
804 }
805 #endif // GUEST_PAGING_LEVELS >= 4
807 static void
808 l2e_propagate_from_guest(struct vcpu *v,
809 guest_l2e_t *gl2e,
810 mfn_t gl2mfn,
811 mfn_t sl1mfn,
812 shadow_l2e_t *sl2e,
813 fetch_type_t ft)
814 {
815 _sh_propagate(v, gl2e, gl2mfn, sl1mfn, sl2e, 2, ft, 0);
816 }
818 static void
819 l1e_propagate_from_guest(struct vcpu *v,
820 guest_l1e_t *gl1e,
821 mfn_t gl1mfn,
822 mfn_t gmfn,
823 shadow_l1e_t *sl1e,
824 fetch_type_t ft,
825 int mmio)
826 {
827 _sh_propagate(v, gl1e, gl1mfn, gmfn, sl1e, 1, ft, mmio);
828 }
831 /**************************************************************************/
832 /* These functions update shadow entries (and do bookkeeping on the shadow
833 * tables they are in). It is intended that they are the only
834 * functions which ever write (non-zero) data onto a shadow page.
835 */
837 static inline void safe_write_entry(void *dst, void *src)
838 /* Copy one PTE safely when processors might be running on the
839 * destination pagetable. This does *not* give safety against
840 * concurrent writes (that's what the shadow lock is for), just
841 * stops the hardware picking up partially written entries. */
842 {
843 volatile unsigned long *d = dst;
844 unsigned long *s = src;
845 ASSERT(!((unsigned long) d & (sizeof (shadow_l1e_t) - 1)));
846 #if CONFIG_PAGING_LEVELS == 3
847 /* In PAE mode, pagetable entries are larger
848 * than machine words, so won't get written atomically. We need to make
849 * sure any other cpu running on these shadows doesn't see a
850 * half-written entry. Do this by marking the entry not-present first,
851 * then writing the high word before the low word. */
852 BUILD_BUG_ON(sizeof (shadow_l1e_t) != 2 * sizeof (unsigned long));
853 d[0] = 0;
854 wmb();
855 d[1] = s[1];
856 wmb();
857 d[0] = s[0];
858 #else
859 /* In 32-bit and 64-bit, sizeof(pte) == sizeof(ulong) == 1 word,
860 * which will be an atomic write, since the entry is aligned. */
861 BUILD_BUG_ON(sizeof (shadow_l1e_t) != sizeof (unsigned long));
862 *d = *s;
863 #endif
864 }
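/* Illustrative failure mode this ordering avoids (not in the original file):
 * if the low word (which holds the present bit) were written first, a CPU
 * walking the shadow could fetch the new low word paired with the *old*
 * high word and so follow a present entry to a stale frame.  Writing
 * d[0] = 0 first, then the high word, then the low word means any
 * interleaving observes either the old entry, a not-present entry, or the
 * complete new entry. */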
867 static inline void
868 shadow_write_entries(void *d, void *s, int entries, mfn_t mfn)
869 /* This function does the actual writes to shadow pages.
870 * It must not be called directly, since it doesn't do the bookkeeping
871 * that shadow_set_l*e() functions do. */
872 {
873 shadow_l1e_t *dst = d;
874 shadow_l1e_t *src = s;
875 void *map = NULL;
876 int i;
878 /* Because we mirror access rights at all levels in the shadow, an
879 * l2 (or higher) entry with the RW bit cleared will leave us with
880 * no write access through the linear map.
881 * We detect that by writing to the shadow with copy_to_user() and
882 * using map_domain_page() to get a writeable mapping if we need to. */
883 if ( __copy_to_user(d, d, sizeof (unsigned long)) != 0 )
884 {
885 perfc_incrc(shadow_linear_map_failed);
886 map = sh_map_domain_page(mfn);
887 ASSERT(map != NULL);
888 dst = map + ((unsigned long)dst & (PAGE_SIZE - 1));
889 }
892 for ( i = 0; i < entries; i++ )
893 safe_write_entry(dst++, src++);
895 if ( map != NULL ) sh_unmap_domain_page(map);
896 }
898 static inline int
899 perms_strictly_increased(u32 old_flags, u32 new_flags)
900 /* Given the flags of two entries, are the new flags a strict
901 * increase in rights over the old ones? */
902 {
903 u32 of = old_flags & (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX);
904 u32 nf = new_flags & (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX);
905 /* Flip the NX bit, since it's the only one that decreases rights;
906 * we calculate as if it were an "X" bit. */
907 of ^= _PAGE_NX_BIT;
908 nf ^= _PAGE_NX_BIT;
909 /* If the changed bits are all set in the new flags, then rights strictly
910 * increased between old and new. */
911 return ((of | (of ^ nf)) == nf);
912 }
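/* Worked examples (illustrative, not in the original file), writing P/RW/US
 * for _PAGE_PRESENT/_PAGE_RW/_PAGE_USER:
 *   old = P,        new = P|RW        -> 1  (only gained the RW right)
 *   old = P|RW,     new = P           -> 0  (lost RW)
 *   old = P|RW|US,  new = P|RW|US|NX  -> 0  (gaining NX removes execute, so
 *                                            rights did not strictly increase)
 */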
914 static int inline
915 shadow_get_page_from_l1e(shadow_l1e_t sl1e, struct domain *d)
916 {
917 int res;
918 mfn_t mfn;
919 struct domain *owner;
921 ASSERT(!sh_l1e_is_magic(sl1e));
923 if ( !shadow_mode_refcounts(d) )
924 return 1;
926 res = get_page_from_l1e(sl1e, d);
928 // If a privileged domain is attempting to install a map of a page it does
929 // not own, we let it succeed anyway.
930 //
931 if ( unlikely(!res) &&
932 IS_PRIV(d) &&
933 !shadow_mode_translate(d) &&
934 mfn_valid(mfn = shadow_l1e_get_mfn(sl1e)) &&
935 (owner = page_get_owner(mfn_to_page(mfn))) &&
936 (d != owner) )
937 {
938 res = get_page_from_l1e(sl1e, owner);
939 SHADOW_PRINTK("privileged domain %d installs map of mfn %05lx "
940 "which is owned by domain %d: %s\n",
941 d->domain_id, mfn_x(mfn), owner->domain_id,
942 res ? "success" : "failed");
943 }
945 if ( unlikely(!res) )
946 {
947 perfc_incrc(shadow_get_page_fail);
948 SHADOW_PRINTK("failed: l1e=" SH_PRI_pte "\n");
949 }
951 return res;
952 }
954 static void inline
955 shadow_put_page_from_l1e(shadow_l1e_t sl1e, struct domain *d)
956 {
957 if ( !shadow_mode_refcounts(d) )
958 return;
960 put_page_from_l1e(sl1e, d);
961 }
963 #if GUEST_PAGING_LEVELS >= 4
964 static int shadow_set_l4e(struct vcpu *v,
965 shadow_l4e_t *sl4e,
966 shadow_l4e_t new_sl4e,
967 mfn_t sl4mfn)
968 {
969 int flags = 0, ok;
970 shadow_l4e_t old_sl4e;
971 paddr_t paddr;
972 ASSERT(sl4e != NULL);
973 old_sl4e = *sl4e;
975 if ( old_sl4e.l4 == new_sl4e.l4 ) return 0; /* Nothing to do */
977 paddr = ((((paddr_t)mfn_x(sl4mfn)) << PAGE_SHIFT)
978 | (((unsigned long)sl4e) & ~PAGE_MASK));
980 if ( shadow_l4e_get_flags(new_sl4e) & _PAGE_PRESENT )
981 {
982 /* About to install a new reference */
983 mfn_t sl3mfn = shadow_l4e_get_mfn(new_sl4e);
984 ok = sh_get_ref(v, sl3mfn, paddr);
985 /* Are we pinning l3 shadows to handle weird linux behaviour? */
986 if ( sh_type_is_pinnable(v, SH_type_l3_64_shadow) )
987 ok |= sh_pin(v, sl3mfn);
988 if ( !ok )
989 {
990 domain_crash(v->domain);
991 return SHADOW_SET_ERROR;
992 }
993 }
995 /* Write the new entry */
996 shadow_write_entries(sl4e, &new_sl4e, 1, sl4mfn);
997 flags |= SHADOW_SET_CHANGED;
999 if ( shadow_l4e_get_flags(old_sl4e) & _PAGE_PRESENT )
1001 /* We lost a reference to an old mfn. */
1002 mfn_t osl3mfn = shadow_l4e_get_mfn(old_sl4e);
1003 if ( (mfn_x(osl3mfn) != mfn_x(shadow_l4e_get_mfn(new_sl4e)))
1004 || !perms_strictly_increased(shadow_l4e_get_flags(old_sl4e),
1005 shadow_l4e_get_flags(new_sl4e)) )
1007 flags |= SHADOW_SET_FLUSH;
1009 sh_put_ref(v, osl3mfn, paddr);
1011 return flags;
1014 static int shadow_set_l3e(struct vcpu *v,
1015 shadow_l3e_t *sl3e,
1016 shadow_l3e_t new_sl3e,
1017 mfn_t sl3mfn)
1019 int flags = 0;
1020 shadow_l3e_t old_sl3e;
1021 paddr_t paddr;
1022 ASSERT(sl3e != NULL);
1023 old_sl3e = *sl3e;
1025 if ( old_sl3e.l3 == new_sl3e.l3 ) return 0; /* Nothing to do */
1027 paddr = ((((paddr_t)mfn_x(sl3mfn)) << PAGE_SHIFT)
1028 | (((unsigned long)sl3e) & ~PAGE_MASK));
1030 if ( shadow_l3e_get_flags(new_sl3e) & _PAGE_PRESENT )
1031 /* About to install a new reference */
1032 if ( !sh_get_ref(v, shadow_l3e_get_mfn(new_sl3e), paddr) )
1034 domain_crash(v->domain);
1035 return SHADOW_SET_ERROR;
1038 /* Write the new entry */
1039 shadow_write_entries(sl3e, &new_sl3e, 1, sl3mfn);
1040 flags |= SHADOW_SET_CHANGED;
1042 if ( shadow_l3e_get_flags(old_sl3e) & _PAGE_PRESENT )
1044 /* We lost a reference to an old mfn. */
1045 mfn_t osl2mfn = shadow_l3e_get_mfn(old_sl3e);
1046 if ( (mfn_x(osl2mfn) != mfn_x(shadow_l3e_get_mfn(new_sl3e))) ||
1047 !perms_strictly_increased(shadow_l3e_get_flags(old_sl3e),
1048 shadow_l3e_get_flags(new_sl3e)) )
1050 flags |= SHADOW_SET_FLUSH;
1052 sh_put_ref(v, osl2mfn, paddr);
1054 return flags;
1056 #endif /* GUEST_PAGING_LEVELS >= 4 */
1058 static int shadow_set_l2e(struct vcpu *v,
1059 shadow_l2e_t *sl2e,
1060 shadow_l2e_t new_sl2e,
1061 mfn_t sl2mfn)
1063 int flags = 0;
1064 shadow_l2e_t old_sl2e;
1065 paddr_t paddr;
1067 #if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2
1068 /* In 2-on-3 we work with pairs of l2es pointing at two-page
1069 * shadows. Reference counting and up-pointers track from the first
1070 * page of the shadow to the first l2e, so make sure that we're
1071 * working with those:
1072 * Align the pointer down so it's pointing at the first of the pair */
1073 sl2e = (shadow_l2e_t *)((unsigned long)sl2e & ~(sizeof(shadow_l2e_t)));
1074 /* Align the mfn of the shadow entry too */
1075 new_sl2e.l2 &= ~(1<<PAGE_SHIFT);
1076 #endif
1078 ASSERT(sl2e != NULL);
1079 old_sl2e = *sl2e;
1081 if ( old_sl2e.l2 == new_sl2e.l2 ) return 0; /* Nothing to do */
1083 paddr = ((((paddr_t)mfn_x(sl2mfn)) << PAGE_SHIFT)
1084 | (((unsigned long)sl2e) & ~PAGE_MASK));
1086 if ( shadow_l2e_get_flags(new_sl2e) & _PAGE_PRESENT )
1087 /* About to install a new reference */
1088 if ( !sh_get_ref(v, shadow_l2e_get_mfn(new_sl2e), paddr) )
1090 domain_crash(v->domain);
1091 return SHADOW_SET_ERROR;
1094 /* Write the new entry */
1095 #if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2
1097 shadow_l2e_t pair[2] = { new_sl2e, new_sl2e };
1098 /* The l1 shadow is two pages long and needs to be pointed to by
1099 * two adjacent l1es. The pair have the same flags, but point
1100 * at odd and even MFNs */
1101 ASSERT(!(pair[0].l2 & (1<<PAGE_SHIFT)));
1102 pair[1].l2 |= (1<<PAGE_SHIFT);
1103 shadow_write_entries(sl2e, &pair, 2, sl2mfn);
1105 #else /* normal case */
1106 shadow_write_entries(sl2e, &new_sl2e, 1, sl2mfn);
1107 #endif
1108 flags |= SHADOW_SET_CHANGED;
1110 if ( shadow_l2e_get_flags(old_sl2e) & _PAGE_PRESENT )
1112 /* We lost a reference to an old mfn. */
1113 mfn_t osl1mfn = shadow_l2e_get_mfn(old_sl2e);
1114 if ( (mfn_x(osl1mfn) != mfn_x(shadow_l2e_get_mfn(new_sl2e))) ||
1115 !perms_strictly_increased(shadow_l2e_get_flags(old_sl2e),
1116 shadow_l2e_get_flags(new_sl2e)) )
1118 flags |= SHADOW_SET_FLUSH;
1120 sh_put_ref(v, osl1mfn, paddr);
1122 return flags;
1125 static int shadow_set_l1e(struct vcpu *v,
1126 shadow_l1e_t *sl1e,
1127 shadow_l1e_t new_sl1e,
1128 mfn_t sl1mfn)
1130 int flags = 0;
1131 struct domain *d = v->domain;
1132 shadow_l1e_t old_sl1e;
1133 ASSERT(sl1e != NULL);
1135 old_sl1e = *sl1e;
1137 if ( old_sl1e.l1 == new_sl1e.l1 ) return 0; /* Nothing to do */
1139 if ( (shadow_l1e_get_flags(new_sl1e) & _PAGE_PRESENT)
1140 && !sh_l1e_is_magic(new_sl1e) )
1142 /* About to install a new reference */
1143 if ( shadow_mode_refcounts(d) ) {
1144 if ( shadow_get_page_from_l1e(new_sl1e, d) == 0 )
1146 /* Doesn't look like a pagetable. */
1147 flags |= SHADOW_SET_ERROR;
1148 new_sl1e = shadow_l1e_empty();
1153 /* Write the new entry */
1154 shadow_write_entries(sl1e, &new_sl1e, 1, sl1mfn);
1155 flags |= SHADOW_SET_CHANGED;
1157 if ( (shadow_l1e_get_flags(old_sl1e) & _PAGE_PRESENT)
1158 && !sh_l1e_is_magic(old_sl1e) )
1160 /* We lost a reference to an old mfn. */
1161 /* N.B. Unlike higher-level sets, never need an extra flush
1162 * when writing an l1e. Because it points to the same guest frame
1163 * as the guest l1e did, it's the guest's responsibility to
1164 * trigger a flush later. */
1165 if ( shadow_mode_refcounts(d) )
1167 shadow_put_page_from_l1e(old_sl1e, d);
1170 return flags;
1174 /**************************************************************************/
1175 /* Macros to walk pagetables. These take the shadow of a pagetable and
1176 * walk every "interesting" entry. That is, they don't touch Xen mappings,
1177 * and for 32-bit l2s shadowed onto PAE or 64-bit, they only touch every
1178 * second entry (since pairs of entries are managed together). For multi-page
1179 * shadows they walk all pages.
1181 * Arguments are an MFN, the variable to point to each entry, a variable
1182 * to indicate that we are done (we will shortcut to the end of the scan
1183 * when _done != 0), a variable to indicate that we should avoid Xen mappings,
1184 * and the code.
1186 * WARNING: These macros have side-effects. They change the values of both
1187 * the pointer and the MFN. */
1189 static inline void increment_ptr_to_guest_entry(void *ptr)
1191 if ( ptr )
1193 guest_l1e_t **entry = ptr;
1194 (*entry)++;
1198 /* All kinds of l1: touch all entries */
1199 #define _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code) \
1200 do { \
1201 int _i; \
1202 shadow_l1e_t *_sp = map_shadow_page((_sl1mfn)); \
1203 ASSERT(mfn_to_shadow_page(_sl1mfn)->type == SH_type_l1_shadow \
1204 || mfn_to_shadow_page(_sl1mfn)->type == SH_type_fl1_shadow); \
1205 for ( _i = 0; _i < SHADOW_L1_PAGETABLE_ENTRIES; _i++ ) \
1206 { \
1207 (_sl1e) = _sp + _i; \
1208 if ( shadow_l1e_get_flags(*(_sl1e)) & _PAGE_PRESENT ) \
1209 {_code} \
1210 if ( _done ) break; \
1211 increment_ptr_to_guest_entry(_gl1p); \
1212 } \
1213 unmap_shadow_page(_sp); \
1214 } while (0)
1216 /* 32-bit l1, on PAE or 64-bit shadows: need to walk both pages of shadow */
1217 #if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2
1218 #define SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code) \
1219 do { \
1220 int __done = 0; \
1221 _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, \
1222 ({ (__done = _done); }), _code); \
1223 _sl1mfn = _mfn(mfn_x(_sl1mfn) + 1); \
1224 if ( !__done ) \
1225 _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, \
1226 ({ (__done = _done); }), _code); \
1227 } while (0)
1228 #else /* Everything else; l1 shadows are only one page */
1229 #define SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code) \
1230 _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code)
1231 #endif
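/* Illustrative use of the iterator above (a sketch, not in the original
 * file): count the present entries in one l1 shadow.  The macro may advance
 * the mfn variable (see the WARNING above), so pass it a copy:
 *
 *     shadow_l1e_t *sl1e;
 *     mfn_t tmp = sl1mfn;
 *     int present = 0;
 *     SHADOW_FOREACH_L1E(tmp, sl1e, NULL, 0, { present++; });
 */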
1234 #if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2
1236 /* 32-bit l2 on PAE/64: four pages, touch every second entry, and avoid Xen */
1237 #define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _xen, _code) \
1238 do { \
1239 int _i, _j, __done = 0; \
1240 ASSERT(mfn_to_shadow_page(_sl2mfn)->type == SH_type_l2_32_shadow); \
1241 for ( _j = 0; _j < 4 && !__done; _j++ ) \
1242 { \
1243 shadow_l2e_t *_sp = map_shadow_page(_sl2mfn); \
1244 for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i += 2 ) \
1245 if ( (!(_xen)) \
1246 || ((_j * SHADOW_L2_PAGETABLE_ENTRIES) + _i) \
1247 < (HYPERVISOR_VIRT_START >> SHADOW_L2_PAGETABLE_SHIFT) ) \
1248 { \
1249 (_sl2e) = _sp + _i; \
1250 if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \
1251 {_code} \
1252 if ( (__done = (_done)) ) break; \
1253 increment_ptr_to_guest_entry(_gl2p); \
1254 } \
1255 unmap_shadow_page(_sp); \
1256 _sl2mfn = _mfn(mfn_x(_sl2mfn) + 1); \
1257 } \
1258 } while (0)
1260 #elif GUEST_PAGING_LEVELS == 2
1262 /* 32-bit on 32-bit: avoid Xen entries */
1263 #define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _xen, _code) \
1264 do { \
1265 int _i; \
1266 shadow_l2e_t *_sp = map_shadow_page((_sl2mfn)); \
1267 ASSERT(mfn_to_shadow_page(_sl2mfn)->type == SH_type_l2_32_shadow); \
1268 for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i++ ) \
1269 if ( (!(_xen)) \
1270 || \
1271 (_i < (HYPERVISOR_VIRT_START >> SHADOW_L2_PAGETABLE_SHIFT)) ) \
1272 { \
1273 (_sl2e) = _sp + _i; \
1274 if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \
1275 {_code} \
1276 if ( _done ) break; \
1277 increment_ptr_to_guest_entry(_gl2p); \
1278 } \
1279 unmap_shadow_page(_sp); \
1280 } while (0)
1282 #elif GUEST_PAGING_LEVELS == 3
1284 /* PAE: if it's an l2h, don't touch Xen mappings */
1285 #define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _xen, _code) \
1286 do { \
1287 int _i; \
1288 shadow_l2e_t *_sp = map_shadow_page((_sl2mfn)); \
1289 ASSERT(mfn_to_shadow_page(_sl2mfn)->type == SH_type_l2_pae_shadow \
1290 || mfn_to_shadow_page(_sl2mfn)->type == SH_type_l2h_pae_shadow);\
1291 for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i++ ) \
1292 if ( (!(_xen)) \
1293 || mfn_to_shadow_page(_sl2mfn)->type != SH_type_l2h_pae_shadow\
1294 || ((_i + (3 * SHADOW_L2_PAGETABLE_ENTRIES)) \
1295 < (HYPERVISOR_VIRT_START >> SHADOW_L2_PAGETABLE_SHIFT)) ) \
1296 { \
1297 (_sl2e) = _sp + _i; \
1298 if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \
1299 {_code} \
1300 if ( _done ) break; \
1301 increment_ptr_to_guest_entry(_gl2p); \
1302 } \
1303 unmap_shadow_page(_sp); \
1304 } while (0)
1306 #else
1308 /* 64-bit l2: touch all entries */
1309 #define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _xen, _code) \
1310 do { \
1311 int _i; \
1312 shadow_l2e_t *_sp = map_shadow_page((_sl2mfn)); \
1313 ASSERT(mfn_to_shadow_page(_sl2mfn)->type == SH_type_l2_64_shadow); \
1314 for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i++ ) \
1315 { \
1316 (_sl2e) = _sp + _i; \
1317 if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \
1318 {_code} \
1319 if ( _done ) break; \
1320 increment_ptr_to_guest_entry(_gl2p); \
1321 } \
1322 unmap_shadow_page(_sp); \
1323 } while (0)
1325 #endif /* different kinds of l2 */
1327 #if GUEST_PAGING_LEVELS == 4
1329 /* 64-bit l3: touch all entries */
1330 #define SHADOW_FOREACH_L3E(_sl3mfn, _sl3e, _gl3p, _done, _code) \
1331 do { \
1332 int _i; \
1333 shadow_l3e_t *_sp = map_shadow_page((_sl3mfn)); \
1334 ASSERT(mfn_to_shadow_page(_sl3mfn)->type == SH_type_l3_64_shadow); \
1335 for ( _i = 0; _i < SHADOW_L3_PAGETABLE_ENTRIES; _i++ ) \
1336 { \
1337 (_sl3e) = _sp + _i; \
1338 if ( shadow_l3e_get_flags(*(_sl3e)) & _PAGE_PRESENT ) \
1339 {_code} \
1340 if ( _done ) break; \
1341 increment_ptr_to_guest_entry(_gl3p); \
1342 } \
1343 unmap_shadow_page(_sp); \
1344 } while (0)
1346 /* 64-bit l4: avoid Xen mappings */
1347 #define SHADOW_FOREACH_L4E(_sl4mfn, _sl4e, _gl4p, _done, _xen, _code) \
1348 do { \
1349 int _i; \
1350 shadow_l4e_t *_sp = map_shadow_page((_sl4mfn)); \
1351 ASSERT(mfn_to_shadow_page(_sl4mfn)->type == SH_type_l4_64_shadow); \
1352 for ( _i = 0; _i < SHADOW_L4_PAGETABLE_ENTRIES; _i++ ) \
1353 { \
1354 if ( (!(_xen)) || is_guest_l4_slot(_i) ) \
1355 { \
1356 (_sl4e) = _sp + _i; \
1357 if ( shadow_l4e_get_flags(*(_sl4e)) & _PAGE_PRESENT ) \
1358 {_code} \
1359 if ( _done ) break; \
1360 } \
1361 increment_ptr_to_guest_entry(_gl4p); \
1362 } \
1363 unmap_shadow_page(_sp); \
1364 } while (0)
1366 #endif
1370 /**************************************************************************/
1371 /* Functions to install Xen mappings and linear mappings in shadow pages */
1373 // XXX -- this function should probably be moved to shadow-common.c, but that
1374 // probably wants to wait until the shadow types have been moved from
1375 // shadow-types.h to shadow-private.h
1376 //
1377 #if CONFIG_PAGING_LEVELS == 4 && GUEST_PAGING_LEVELS == 4
1378 void sh_install_xen_entries_in_l4(struct vcpu *v, mfn_t gl4mfn, mfn_t sl4mfn)
1380 struct domain *d = v->domain;
1381 shadow_l4e_t *sl4e;
1383 sl4e = sh_map_domain_page(sl4mfn);
1384 ASSERT(sl4e != NULL);
1385 ASSERT(sizeof (l4_pgentry_t) == sizeof (shadow_l4e_t));
1387 /* Copy the common Xen mappings from the idle domain */
1388 memcpy(&sl4e[ROOT_PAGETABLE_FIRST_XEN_SLOT],
1389 &idle_pg_table[ROOT_PAGETABLE_FIRST_XEN_SLOT],
1390 ROOT_PAGETABLE_XEN_SLOTS * sizeof(l4_pgentry_t));
1392 /* Install the per-domain mappings for this domain */
1393 sl4e[shadow_l4_table_offset(PERDOMAIN_VIRT_START)] =
1394 shadow_l4e_from_mfn(page_to_mfn(virt_to_page(d->arch.mm_perdomain_l3)),
1395 __PAGE_HYPERVISOR);
1397 /* Linear mapping */
1398 sl4e[shadow_l4_table_offset(SH_LINEAR_PT_VIRT_START)] =
1399 shadow_l4e_from_mfn(sl4mfn, __PAGE_HYPERVISOR);
1401 if ( shadow_mode_translate(v->domain) && !shadow_mode_external(v->domain) )
1403 // linear tables may not be used with translated PV guests
1404 sl4e[shadow_l4_table_offset(LINEAR_PT_VIRT_START)] =
1405 shadow_l4e_empty();
1407 else
1409 sl4e[shadow_l4_table_offset(LINEAR_PT_VIRT_START)] =
1410 shadow_l4e_from_mfn(gl4mfn, __PAGE_HYPERVISOR);
1413 if ( shadow_mode_translate(v->domain) )
1415 /* install domain-specific P2M table */
1416 sl4e[shadow_l4_table_offset(RO_MPT_VIRT_START)] =
1417 shadow_l4e_from_mfn(pagetable_get_mfn(d->arch.phys_table),
1418 __PAGE_HYPERVISOR);
1421 sh_unmap_domain_page(sl4e);
1423 #endif
1425 #if CONFIG_PAGING_LEVELS == 3 && GUEST_PAGING_LEVELS == 3
1426 // For 3-on-3 PV guests, we need to make sure the xen mappings are in
1427 // place, which means that we need to populate the l2h entry in the l3
1428 // table.
1430 void sh_install_xen_entries_in_l2h(struct vcpu *v,
1431 mfn_t sl2hmfn)
1433 struct domain *d = v->domain;
1434 shadow_l2e_t *sl2e;
1435 int i;
1437 sl2e = sh_map_domain_page(sl2hmfn);
1438 ASSERT(sl2e != NULL);
1439 ASSERT(sizeof (l2_pgentry_t) == sizeof (shadow_l2e_t));
1441 /* Copy the common Xen mappings from the idle domain */
1442 memcpy(&sl2e[L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1)],
1443 &idle_pg_table_l2[L2_PAGETABLE_FIRST_XEN_SLOT],
1444 L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
1446 /* Install the per-domain mappings for this domain */
1447 for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
1448 sl2e[shadow_l2_table_offset(PERDOMAIN_VIRT_START) + i] =
1449 shadow_l2e_from_mfn(
1450 page_to_mfn(virt_to_page(d->arch.mm_perdomain_pt) + i),
1451 __PAGE_HYPERVISOR);
1453 /* We don't set up a linear mapping here because we can't until this
1454 * l2h is installed in an l3e. sh_update_linear_entries() handles
1455 * the linear mappings when CR3 (and so the fourth l3e) is loaded.
1456 * We zero them here, just as a safety measure.
1457 */
1458 for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
1459 sl2e[shadow_l2_table_offset(LINEAR_PT_VIRT_START) + i] =
1460 shadow_l2e_empty();
1461 for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
1462 sl2e[shadow_l2_table_offset(SH_LINEAR_PT_VIRT_START) + i] =
1463 shadow_l2e_empty();
1465 if ( shadow_mode_translate(d) )
1467 /* Install the domain-specific p2m table */
1468 l3_pgentry_t *p2m;
1469 ASSERT(pagetable_get_pfn(d->arch.phys_table) != 0);
1470 p2m = sh_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
1471 for ( i = 0; i < MACHPHYS_MBYTES>>1; i++ )
1473 sl2e[shadow_l2_table_offset(RO_MPT_VIRT_START) + i] =
1474 (l3e_get_flags(p2m[i]) & _PAGE_PRESENT)
1475 ? shadow_l2e_from_mfn(_mfn(l3e_get_pfn(p2m[i])),
1476 __PAGE_HYPERVISOR)
1477 : shadow_l2e_empty();
1479 sh_unmap_domain_page(p2m);
1482 sh_unmap_domain_page(sl2e);
1484 #endif
1487 #if CONFIG_PAGING_LEVELS == 2 && GUEST_PAGING_LEVELS == 2
1488 void sh_install_xen_entries_in_l2(struct vcpu *v, mfn_t gl2mfn, mfn_t sl2mfn)
1490 struct domain *d = v->domain;
1491 shadow_l2e_t *sl2e;
1492 int i;
1494 sl2e = sh_map_domain_page(sl2mfn);
1495 ASSERT(sl2e != NULL);
1496 ASSERT(sizeof (l2_pgentry_t) == sizeof (shadow_l2e_t));
1498 /* Copy the common Xen mappings from the idle domain */
1499 memcpy(&sl2e[L2_PAGETABLE_FIRST_XEN_SLOT],
1500 &idle_pg_table[L2_PAGETABLE_FIRST_XEN_SLOT],
1501 L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
1503 /* Install the per-domain mappings for this domain */
1504 for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
1505 sl2e[shadow_l2_table_offset(PERDOMAIN_VIRT_START) + i] =
1506 shadow_l2e_from_mfn(
1507 page_to_mfn(virt_to_page(d->arch.mm_perdomain_pt) + i),
1508 __PAGE_HYPERVISOR);
1510 /* Linear mapping */
1511 sl2e[shadow_l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
1512 shadow_l2e_from_mfn(sl2mfn, __PAGE_HYPERVISOR);
1514 if ( shadow_mode_translate(v->domain) && !shadow_mode_external(v->domain) )
1516 // linear tables may not be used with translated PV guests
1517 sl2e[shadow_l2_table_offset(LINEAR_PT_VIRT_START)] =
1518 shadow_l2e_empty();
1520 else
1522 sl2e[shadow_l2_table_offset(LINEAR_PT_VIRT_START)] =
1523 shadow_l2e_from_mfn(gl2mfn, __PAGE_HYPERVISOR);
1526 if ( shadow_mode_translate(d) )
1528 /* install domain-specific P2M table */
1529 sl2e[shadow_l2_table_offset(RO_MPT_VIRT_START)] =
1530 shadow_l2e_from_mfn(pagetable_get_mfn(d->arch.phys_table),
1531 __PAGE_HYPERVISOR);
1534 sh_unmap_domain_page(sl2e);
1536 #endif
1540 /**************************************************************************/
1541 /* Create a shadow of a given guest page.
1542 */
1543 static mfn_t
1544 sh_make_shadow(struct vcpu *v, mfn_t gmfn, u32 shadow_type)
1546 mfn_t smfn = shadow_alloc(v->domain, shadow_type, mfn_x(gmfn));
1547 SHADOW_DEBUG(MAKE_SHADOW, "(%05lx, %u)=>%05lx\n",
1548 mfn_x(gmfn), shadow_type, mfn_x(smfn));
1550 if ( shadow_type != SH_type_l2_32_shadow
1551 && shadow_type != SH_type_l2_pae_shadow
1552 && shadow_type != SH_type_l2h_pae_shadow
1553 && shadow_type != SH_type_l4_64_shadow )
1554 /* Lower-level shadow, not yet linked from a higher level */
1555 mfn_to_shadow_page(smfn)->up = 0;
1557 #if GUEST_PAGING_LEVELS == 4
1558 #if (SHADOW_OPTIMIZATIONS & SHOPT_LINUX_L3_TOPLEVEL)
1559 if ( shadow_type == SH_type_l4_64_shadow &&
1560 unlikely(v->domain->arch.shadow.opt_flags & SHOPT_LINUX_L3_TOPLEVEL) )
1562 /* We're shadowing a new l4, but we've been assuming the guest uses
1563 * only one l4 per vcpu and context switches using an l4 entry.
1564 * Count the number of active l4 shadows. If there are enough
1565 * of them, decide that this isn't an old linux guest, and stop
1566 * pinning l3es. This is not very quick but it doesn't happen
1567 * very often. */
1568 struct list_head *l, *t;
1569 struct shadow_page_info *sp;
1570 struct vcpu *v2;
1571 int l4count = 0, vcpus = 0;
1572 list_for_each(l, &v->domain->arch.shadow.pinned_shadows)
1574 sp = list_entry(l, struct shadow_page_info, list);
1575 if ( sp->type == SH_type_l4_64_shadow )
1576 l4count++;
1578 for_each_vcpu ( v->domain, v2 )
1579 vcpus++;
1580 if ( l4count > 2 * vcpus )
1582 /* Unpin all the pinned l3 tables, and don't pin any more. */
1583 list_for_each_safe(l, t, &v->domain->arch.shadow.pinned_shadows)
1585 sp = list_entry(l, struct shadow_page_info, list);
1586 if ( sp->type == SH_type_l3_64_shadow )
1587 sh_unpin(v, shadow_page_to_mfn(sp));
1589 v->domain->arch.shadow.opt_flags &= ~SHOPT_LINUX_L3_TOPLEVEL;
1592 #endif
1593 #endif
1595 // Create the Xen mappings...
1596 if ( !shadow_mode_external(v->domain) )
1598 switch (shadow_type)
1600 #if CONFIG_PAGING_LEVELS == 4 && GUEST_PAGING_LEVELS == 4
1601 case SH_type_l4_shadow:
1602 sh_install_xen_entries_in_l4(v, gmfn, smfn); break;
1603 #endif
1604 #if CONFIG_PAGING_LEVELS == 3 && GUEST_PAGING_LEVELS == 3
1605 case SH_type_l2h_shadow:
1606 sh_install_xen_entries_in_l2h(v, smfn); break;
1607 #endif
1608 #if CONFIG_PAGING_LEVELS == 2 && GUEST_PAGING_LEVELS == 2
1609 case SH_type_l2_shadow:
1610 sh_install_xen_entries_in_l2(v, gmfn, smfn); break;
1611 #endif
1612 default: /* Do nothing */ break;
1616 shadow_promote(v, gmfn, shadow_type);
1617 set_shadow_status(v, gmfn, shadow_type, smfn);
1619 return smfn;
1622 /* Make a splintered superpage shadow */
1623 static mfn_t
1624 make_fl1_shadow(struct vcpu *v, gfn_t gfn)
1626 mfn_t smfn = shadow_alloc(v->domain, SH_type_fl1_shadow,
1627 (unsigned long) gfn_x(gfn));
1629 SHADOW_DEBUG(MAKE_SHADOW, "(%" SH_PRI_gfn ")=>%" SH_PRI_mfn "\n",
1630 gfn_x(gfn), mfn_x(smfn));
1632 set_fl1_shadow_status(v, gfn, smfn);
1633 return smfn;
1637 #if SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS
1638 mfn_t
1639 sh_make_monitor_table(struct vcpu *v)
1642 ASSERT(pagetable_get_pfn(v->arch.monitor_table) == 0);
1644 #if CONFIG_PAGING_LEVELS == 4
1646 struct domain *d = v->domain;
1647 mfn_t m4mfn;
1648 m4mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1649 sh_install_xen_entries_in_l4(v, m4mfn, m4mfn);
1650 /* Remember the level of this table */
1651 mfn_to_page(m4mfn)->shadow_flags = 4;
1652 #if SHADOW_PAGING_LEVELS < 4
1653 // Install a monitor l3 table in slot 0 of the l4 table.
1654 // This is used for shadow linear maps.
1656 mfn_t m3mfn;
1657 l4_pgentry_t *l4e;
1658 m3mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1659 mfn_to_page(m3mfn)->shadow_flags = 3;
1660 l4e = sh_map_domain_page(m4mfn);
1661 l4e[0] = l4e_from_pfn(mfn_x(m3mfn), __PAGE_HYPERVISOR);
1662 sh_unmap_domain_page(l4e);
1664 #endif /* SHADOW_PAGING_LEVELS < 4 */
1665 return m4mfn;
1668 #elif CONFIG_PAGING_LEVELS == 3
1671 struct domain *d = v->domain;
1672 mfn_t m3mfn, m2mfn;
1673 l3_pgentry_t *l3e;
1674 l2_pgentry_t *l2e;
1675 int i;
1677 m3mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1678 /* Remember the level of this table */
1679 mfn_to_page(m3mfn)->shadow_flags = 3;
1681 // Install a monitor l2 table in slot 3 of the l3 table.
1682 // This is used for all Xen entries, including linear maps
1683 m2mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1684 mfn_to_page(m2mfn)->shadow_flags = 2;
1685 l3e = sh_map_domain_page(m3mfn);
1686 l3e[3] = l3e_from_pfn(mfn_x(m2mfn), _PAGE_PRESENT);
1687 sh_install_xen_entries_in_l2h(v, m2mfn);
1688 /* Install the monitor's own linear map */
1689 l2e = sh_map_domain_page(m2mfn);
1690 for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
1691 l2e[l2_table_offset(LINEAR_PT_VIRT_START) + i] =
1692 (l3e_get_flags(l3e[i]) & _PAGE_PRESENT)
1693 ? l2e_from_pfn(l3e_get_pfn(l3e[i]), __PAGE_HYPERVISOR)
1694 : l2e_empty();
1695 sh_unmap_domain_page(l2e);
1696 sh_unmap_domain_page(l3e);
1698 SHADOW_PRINTK("new monitor table: %#lx\n", mfn_x(m3mfn));
1699 return m3mfn;
1702 #elif CONFIG_PAGING_LEVELS == 2
1705 struct domain *d = v->domain;
1706 mfn_t m2mfn;
1707 m2mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1708 sh_install_xen_entries_in_l2(v, m2mfn, m2mfn);
1709 /* Remember the level of this table */
1710 mfn_to_page(m2mfn)->shadow_flags = 2;
1711 return m2mfn;
1714 #else
1715 #error this should not happen
1716 #endif /* CONFIG_PAGING_LEVELS */
1718 #endif /* SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS */
1720 /**************************************************************************/
1721 /* These functions also take a virtual address and return the level-N
1722 * shadow table mfn and entry, but they create the shadow pagetables if
1723 * they are needed. The fetch_type argument tells us whether we are
1724 * handling a demand fault (so we know what to do about accessed bits &c).
1725 * If the necessary tables are not present in the guest, they return NULL. */
1727 /* N.B. The use of GUEST_PAGING_LEVELS here is correct. If the shadow has
1728 * more levels than the guest, the upper levels are always fixed and do not
1729 * reflect any information from the guest, so we do not use these functions
1730 * to access them. */
1732 #if GUEST_PAGING_LEVELS >= 4
1733 static shadow_l4e_t * shadow_get_and_create_l4e(struct vcpu *v,
1734 walk_t *gw,
1735 mfn_t *sl4mfn)
1737 /* There is always a shadow of the top level table. Get it. */
1738 *sl4mfn = pagetable_get_mfn(v->arch.shadow_table[0]);
1739 /* Reading the top level table is always valid. */
1740 return sh_linear_l4_table(v) + shadow_l4_linear_offset(gw->va);
1743 static shadow_l3e_t * shadow_get_and_create_l3e(struct vcpu *v,
1744 walk_t *gw,
1745 mfn_t *sl3mfn,
1746 fetch_type_t ft)
1748 mfn_t sl4mfn;
1749 shadow_l4e_t *sl4e;
1750 if ( !mfn_valid(gw->l3mfn) ) return NULL; /* No guest page. */
1751 /* Get the l4e */
1752 sl4e = shadow_get_and_create_l4e(v, gw, &sl4mfn);
1753 ASSERT(sl4e != NULL);
1754 if ( shadow_l4e_get_flags(*sl4e) & _PAGE_PRESENT )
1756 *sl3mfn = shadow_l4e_get_mfn(*sl4e);
1757 ASSERT(mfn_valid(*sl3mfn));
1759 else
1761 int r;
1762 shadow_l4e_t new_sl4e;
1763 /* No l3 shadow installed: find and install it. */
1764 *sl3mfn = get_shadow_status(v, gw->l3mfn, SH_type_l3_shadow);
1765 if ( !mfn_valid(*sl3mfn) )
1767 /* No l3 shadow of this page exists at all: make one. */
1768 *sl3mfn = sh_make_shadow(v, gw->l3mfn, SH_type_l3_shadow);
1770 /* Install the new sl3 table in the sl4e */
1771 l4e_propagate_from_guest(v, gw->l4e, gw->l4mfn,
1772 *sl3mfn, &new_sl4e, ft);
1773 r = shadow_set_l4e(v, sl4e, new_sl4e, sl4mfn);
1774 ASSERT((r & SHADOW_SET_FLUSH) == 0);
1775 if ( r & SHADOW_SET_ERROR )
1776 return NULL;
1778 /* Now follow it down a level. Guaranteed to succeed. */
1779 return sh_linear_l3_table(v) + shadow_l3_linear_offset(gw->va);
1781 #endif /* GUEST_PAGING_LEVELS >= 4 */
1784 static shadow_l2e_t * shadow_get_and_create_l2e(struct vcpu *v,
1785 walk_t *gw,
1786 mfn_t *sl2mfn,
1787 fetch_type_t ft)
1789 #if GUEST_PAGING_LEVELS >= 4 /* 64bit... */
1790 mfn_t sl3mfn = _mfn(INVALID_MFN);
1791 shadow_l3e_t *sl3e;
1792 if ( !mfn_valid(gw->l2mfn) ) return NULL; /* No guest page. */
1793 /* Get the l3e */
1794 sl3e = shadow_get_and_create_l3e(v, gw, &sl3mfn, ft);
1795 if ( sl3e == NULL ) return NULL;
1796 if ( shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT )
1798 *sl2mfn = shadow_l3e_get_mfn(*sl3e);
1799 ASSERT(mfn_valid(*sl2mfn));
1801 else
1803 int r;
1804 shadow_l3e_t new_sl3e;
1805 /* No l2 shadow installed: find and install it. */
1806 *sl2mfn = get_shadow_status(v, gw->l2mfn, SH_type_l2_shadow);
1807 if ( !mfn_valid(*sl2mfn) )
1809 /* No l2 shadow of this page exists at all: make one. */
1810 *sl2mfn = sh_make_shadow(v, gw->l2mfn, SH_type_l2_shadow);
1812 /* Install the new sl2 table in the sl3e */
1813 l3e_propagate_from_guest(v, gw->l3e, gw->l3mfn,
1814 *sl2mfn, &new_sl3e, ft);
1815 r = shadow_set_l3e(v, sl3e, new_sl3e, sl3mfn);
1816 ASSERT((r & SHADOW_SET_FLUSH) == 0);
1817 if ( r & SHADOW_SET_ERROR )
1818 return NULL;
1820 /* Now follow it down a level. Guaranteed to succeed. */
1821 return sh_linear_l2_table(v) + shadow_l2_linear_offset(gw->va);
1822 #elif GUEST_PAGING_LEVELS == 3 /* PAE... */
1823 /* We never demand-shadow PAE l3es: they are only created in
1824 * sh_update_cr3(). Check if the relevant sl3e is present. */
1825 shadow_l3e_t *sl3e = ((shadow_l3e_t *)&v->arch.shadow.l3table)
1826 + shadow_l3_linear_offset(gw->va);
1827 if ( !(shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT) )
1828 return NULL;
1829 *sl2mfn = shadow_l3e_get_mfn(*sl3e);
1830 ASSERT(mfn_valid(*sl2mfn));
1831 return sh_linear_l2_table(v) + shadow_l2_linear_offset(gw->va);
1832 #else /* 32bit... */
1833 /* There is always a shadow of the top level table. Get it. */
1834 *sl2mfn = pagetable_get_mfn(v->arch.shadow_table[0]);
1835 /* This next line is important: the guest l2 has a 16k
1836 * shadow, so we need to return the right mfn of the four. This
1837 * call will set it for us as a side-effect. */
1838 (void) shadow_l2_index(sl2mfn, guest_index(gw->l2e));
1839 /* Reading the top level table is always valid. */
1840 return sh_linear_l2_table(v) + shadow_l2_linear_offset(gw->va);
1841 #endif
1845 static shadow_l1e_t * shadow_get_and_create_l1e(struct vcpu *v,
1846 walk_t *gw,
1847 mfn_t *sl1mfn,
1848 fetch_type_t ft)
1850 mfn_t sl2mfn;
1851 shadow_l2e_t *sl2e;
1853 /* Get the l2e */
1854 sl2e = shadow_get_and_create_l2e(v, gw, &sl2mfn, ft);
1855 if ( sl2e == NULL ) return NULL;
1856 /* Install the sl1 in the l2e if it wasn't there or if we need to
1857 * re-do it to fix a PSE dirty bit. */
1858 if ( shadow_l2e_get_flags(*sl2e) & _PAGE_PRESENT
1859 && likely(ft != ft_demand_write
1860 || (guest_l2e_get_flags(*gw->l2e) & _PAGE_DIRTY)
1861 || !(guest_l2e_get_flags(*gw->l2e) & _PAGE_PSE)) )
1863 *sl1mfn = shadow_l2e_get_mfn(*sl2e);
1864 ASSERT(mfn_valid(*sl1mfn));
1866 else
1868 shadow_l2e_t new_sl2e;
1869 int r, flags = guest_l2e_get_flags(*gw->l2e);
1870 /* No l1 shadow installed: find and install it. */
1871 if ( !(flags & _PAGE_PRESENT) )
1872 return NULL; /* No guest page. */
1873 if ( guest_supports_superpages(v) && (flags & _PAGE_PSE) )
1875 /* Splintering a superpage */
1876 gfn_t l2gfn = guest_l2e_get_gfn(*gw->l2e);
1877 *sl1mfn = get_fl1_shadow_status(v, l2gfn);
1878 if ( !mfn_valid(*sl1mfn) )
1880 /* No fl1 shadow of this superpage exists at all: make one. */
1881 *sl1mfn = make_fl1_shadow(v, l2gfn);
1884 else
1886 /* Shadowing an actual guest l1 table */
1887 if ( !mfn_valid(gw->l2mfn) ) return NULL; /* No guest page. */
1888 *sl1mfn = get_shadow_status(v, gw->l1mfn, SH_type_l1_shadow);
1889 if ( !mfn_valid(*sl1mfn) )
1891 /* No l1 shadow of this page exists at all: make one. */
1892 *sl1mfn = sh_make_shadow(v, gw->l1mfn, SH_type_l1_shadow);
1895 /* Install the new sl1 table in the sl2e */
1896 l2e_propagate_from_guest(v, gw->l2e, gw->l2mfn,
1897 *sl1mfn, &new_sl2e, ft);
1898 r = shadow_set_l2e(v, sl2e, new_sl2e, sl2mfn);
1899 ASSERT((r & SHADOW_SET_FLUSH) == 0);
1900 if ( r & SHADOW_SET_ERROR )
1901 return NULL;
1902 /* This next line is important: in 32-on-PAE and 32-on-64 modes,
1903 * the guest l1 table has an 8k shadow, and we need to return
1904 * the right mfn of the pair. This call will set it for us as a
1905 * side-effect. (In all other cases, it's a no-op and will be
1906 * compiled out.) */
1907 (void) shadow_l1_index(sl1mfn, guest_l1_table_offset(gw->va));
1909 /* Now follow it down a level. Guaranteed to succeed. */
1910 return sh_linear_l1_table(v) + shadow_l1_linear_offset(gw->va);
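/* Illustrative sketch of the index arithmetic behind the 16k and 8k
 * shadows mentioned above: a 32-bit guest l1 (1024 four-byte entries) is
 * shadowed by two pages of 512 eight-byte entries, and a 32-bit guest l2
 * by four such pages, since each guest l2e expands to two shadow l2es.
 * The sk_* names are stand-ins for illustration only; the real selection
 * is done by shadow_l1_index() and shadow_l2_index(). */
#if 0
#include <assert.h>

#define SK_SHADOW_ENTRIES_PER_PAGE 512   /* 8-byte PAE/64-bit entries */

/* Which page of a two-page shadow l1, and which slot within it. */
static void sk_l1_split(unsigned int guest_index,
                        unsigned int *page, unsigned int *slot)
{
    *page = guest_index / SK_SHADOW_ENTRIES_PER_PAGE;         /* 0 or 1 */
    *slot = guest_index % SK_SHADOW_ENTRIES_PER_PAGE;
}

/* Which page of a four-page shadow l2: each guest l2e becomes two shadow
 * l2es, so only 256 guest entries fit per shadow page. */
static void sk_l2_split(unsigned int guest_index,
                        unsigned int *page, unsigned int *slot)
{
    *page = guest_index / (SK_SHADOW_ENTRIES_PER_PAGE / 2);   /* 0..3 */
    *slot = (guest_index % (SK_SHADOW_ENTRIES_PER_PAGE / 2)) * 2;
}

static void sk_split_check(void)
{
    unsigned int page, slot;
    sk_l1_split(1023, &page, &slot);
    assert(page == 1 && slot == 511);
    sk_l2_split(1023, &page, &slot);
    assert(page == 3 && slot == 510);
}
#endif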
1915 /**************************************************************************/
1916 /* Destructors for shadow tables:
1917 * Unregister the shadow, decrement refcounts of any entries present in it,
1918 * and release the memory.
1920 * N.B. These destructors do not clear the contents of the shadows.
1921 * This allows us to delay TLB shootdowns until the page is being reused.
1922 * See shadow_alloc() and shadow_free() for how this is handled.
1923 */
1925 #if GUEST_PAGING_LEVELS >= 4
1926 void sh_destroy_l4_shadow(struct vcpu *v, mfn_t smfn)
1928 shadow_l4e_t *sl4e;
1929 u32 t = mfn_to_shadow_page(smfn)->type;
1930 mfn_t gmfn, sl4mfn;
1931 int xen_mappings;
1933 SHADOW_DEBUG(DESTROY_SHADOW,
1934 "%s(%05lx)\n", __func__, mfn_x(smfn));
1935 ASSERT(t == SH_type_l4_shadow);
1937 /* Record that the guest page isn't shadowed any more (in this type) */
1938 gmfn = _mfn(mfn_to_shadow_page(smfn)->backpointer);
1939 delete_shadow_status(v, gmfn, t, smfn);
1940 shadow_demote(v, gmfn, t);
1941 /* Decrement refcounts of all the old entries */
1942 xen_mappings = (!shadow_mode_external(v->domain));
1943 sl4mfn = smfn;
1944 SHADOW_FOREACH_L4E(sl4mfn, sl4e, 0, 0, xen_mappings, {
1945 if ( shadow_l4e_get_flags(*sl4e) & _PAGE_PRESENT )
1947 sh_put_ref(v, shadow_l4e_get_mfn(*sl4e),
1948 (((paddr_t)mfn_x(sl4mfn)) << PAGE_SHIFT)
1949 | ((unsigned long)sl4e & ~PAGE_MASK));
1951 });
1953 /* Put the memory back in the pool */
1954 shadow_free(v->domain, smfn);
1957 void sh_destroy_l3_shadow(struct vcpu *v, mfn_t smfn)
1959 shadow_l3e_t *sl3e;
1960 u32 t = mfn_to_shadow_page(smfn)->type;
1961 mfn_t gmfn, sl3mfn;
1963 SHADOW_DEBUG(DESTROY_SHADOW,
1964 "%s(%05lx)\n", __func__, mfn_x(smfn));
1965 ASSERT(t == SH_type_l3_shadow);
1967 /* Record that the guest page isn't shadowed any more (in this type) */
1968 gmfn = _mfn(mfn_to_shadow_page(smfn)->backpointer);
1969 delete_shadow_status(v, gmfn, t, smfn);
1970 shadow_demote(v, gmfn, t);
1972 /* Decrement refcounts of all the old entries */
1973 sl3mfn = smfn;
1974 SHADOW_FOREACH_L3E(sl3mfn, sl3e, 0, 0, {
1975 if ( shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT )
1976 sh_put_ref(v, shadow_l3e_get_mfn(*sl3e),
1977 (((paddr_t)mfn_x(sl3mfn)) << PAGE_SHIFT)
1978 | ((unsigned long)sl3e & ~PAGE_MASK));
1979 });
1981 /* Put the memory back in the pool */
1982 shadow_free(v->domain, smfn);
1984 #endif /* GUEST_PAGING_LEVELS >= 4 */
1987 void sh_destroy_l2_shadow(struct vcpu *v, mfn_t smfn)
1989 shadow_l2e_t *sl2e;
1990 u32 t = mfn_to_shadow_page(smfn)->type;
1991 mfn_t gmfn, sl2mfn;
1992 int xen_mappings;
1994 SHADOW_DEBUG(DESTROY_SHADOW,
1995 "%s(%05lx)\n", __func__, mfn_x(smfn));
1996 ASSERT(t == SH_type_l2_shadow
1997 || t == SH_type_l2h_pae_shadow);
1999 /* Record that the guest page isn't shadowed any more (in this type) */
2000 gmfn = _mfn(mfn_to_shadow_page(smfn)->backpointer);
2001 delete_shadow_status(v, gmfn, t, smfn);
2002 shadow_demote(v, gmfn, t);
2004 /* Decrement refcounts of all the old entries */
2005 sl2mfn = smfn;
2006 xen_mappings = (!shadow_mode_external(v->domain) &&
2007 ((GUEST_PAGING_LEVELS == 2) ||
2008 ((GUEST_PAGING_LEVELS == 3) &&
2009 (t == SH_type_l2h_pae_shadow))));
2010 SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, 0, xen_mappings, {
2011 if ( shadow_l2e_get_flags(*sl2e) & _PAGE_PRESENT )
2012 sh_put_ref(v, shadow_l2e_get_mfn(*sl2e),
2013 (((paddr_t)mfn_x(sl2mfn)) << PAGE_SHIFT)
2014 | ((unsigned long)sl2e & ~PAGE_MASK));
2015 });
2017 /* Put the memory back in the pool */
2018 shadow_free(v->domain, smfn);
2021 void sh_destroy_l1_shadow(struct vcpu *v, mfn_t smfn)
2023 struct domain *d = v->domain;
2024 shadow_l1e_t *sl1e;
2025 u32 t = mfn_to_shadow_page(smfn)->type;
2027 SHADOW_DEBUG(DESTROY_SHADOW,
2028 "%s(%05lx)\n", __func__, mfn_x(smfn));
2029 ASSERT(t == SH_type_l1_shadow || t == SH_type_fl1_shadow);
2031 /* Record that the guest page isn't shadowed any more (in this type) */
2032 if ( t == SH_type_fl1_shadow )
2034 gfn_t gfn = _gfn(mfn_to_shadow_page(smfn)->backpointer);
2035 delete_fl1_shadow_status(v, gfn, smfn);
2037 else
2039 mfn_t gmfn = _mfn(mfn_to_shadow_page(smfn)->backpointer);
2040 delete_shadow_status(v, gmfn, t, smfn);
2041 shadow_demote(v, gmfn, t);
2044 if ( shadow_mode_refcounts(d) )
2046 /* Decrement refcounts of all the old entries */
2047 mfn_t sl1mfn = smfn;
2048 SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, 0, {
2049 if ( (shadow_l1e_get_flags(*sl1e) & _PAGE_PRESENT)
2050 && !sh_l1e_is_magic(*sl1e) )
2051 shadow_put_page_from_l1e(*sl1e, d);
2052 });
2055 /* Put the memory back in the pool */
2056 shadow_free(v->domain, smfn);
2059 #if SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS
2060 void sh_destroy_monitor_table(struct vcpu *v, mfn_t mmfn)
2062 struct domain *d = v->domain;
2063 ASSERT(mfn_to_shadow_page(mmfn)->type == SH_type_monitor_table);
2065 #if (CONFIG_PAGING_LEVELS == 4) && (SHADOW_PAGING_LEVELS != 4)
2066 /* Need to destroy the l3 monitor page in slot 0 too */
2068 l4_pgentry_t *l4e = sh_map_domain_page(mmfn);
2069 ASSERT(l4e_get_flags(l4e[0]) & _PAGE_PRESENT);
2070 shadow_free(d, _mfn(l4e_get_pfn(l4e[0])));
2071 sh_unmap_domain_page(l4e);
2073 #elif CONFIG_PAGING_LEVELS == 3
2074 /* Need to destroy the l2 monitor page in slot 3 too */
2076 l3_pgentry_t *l3e = sh_map_domain_page(mmfn);
2077 ASSERT(l3e_get_flags(l3e[3]) & _PAGE_PRESENT);
2078 shadow_free(d, _mfn(l3e_get_pfn(l3e[3])));
2079 sh_unmap_domain_page(l3e);
2081 #endif
2083 /* Put the memory back in the pool */
2084 shadow_free(d, mmfn);
2086 #endif
2088 /**************************************************************************/
2089 /* Functions to destroy non-Xen mappings in a pagetable hierarchy.
2090 * These are called from common code when we are running out of shadow
2091 * memory, and unpinning all the top-level shadows hasn't worked.
2093 * This implementation is pretty crude and slow, but we hope that it won't
2094 * be called very often. */
2096 #if GUEST_PAGING_LEVELS == 2
2098 void sh_unhook_32b_mappings(struct vcpu *v, mfn_t sl2mfn)
2100 shadow_l2e_t *sl2e;
2101 int xen_mappings = !shadow_mode_external(v->domain);
2102 SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, 0, xen_mappings, {
2103 (void) shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn);
2104 });
2107 #elif GUEST_PAGING_LEVELS == 3
2109 void sh_unhook_pae_mappings(struct vcpu *v, mfn_t sl2mfn)
2110 /* Walk a PAE l2 shadow, unhooking entries from all the subshadows */
2112 shadow_l2e_t *sl2e;
2113 int xen_mappings = !shadow_mode_external(v->domain);
2114 SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, 0, xen_mappings, {
2115 (void) shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn);
2116 });
2119 #elif GUEST_PAGING_LEVELS == 4
2121 void sh_unhook_64b_mappings(struct vcpu *v, mfn_t sl4mfn)
2123 shadow_l4e_t *sl4e;
2124 int xen_mappings = !shadow_mode_external(v->domain);
2125 SHADOW_FOREACH_L4E(sl4mfn, sl4e, 0, 0, xen_mappings, {
2126 (void) shadow_set_l4e(v, sl4e, shadow_l4e_empty(), sl4mfn);
2127 });
2130 #endif
2132 /**************************************************************************/
2133 /* Internal translation functions.
2134 * These functions require a pointer to the shadow entry that will be updated.
2135 */
2137 /* These functions take a new guest entry, translate it to shadow and write
2138 * the shadow entry.
2140 * They return the same bitmaps as the shadow_set_lXe() functions.
2141 */
2143 #if GUEST_PAGING_LEVELS >= 4
2144 static int validate_gl4e(struct vcpu *v, void *new_ge, mfn_t sl4mfn, void *se)
2146 shadow_l4e_t new_sl4e;
2147 guest_l4e_t *new_gl4e = new_ge;
2148 shadow_l4e_t *sl4p = se;
2149 mfn_t sl3mfn = _mfn(INVALID_MFN);
2150 int result = 0;
2152 perfc_incrc(shadow_validate_gl4e_calls);
2154 if ( guest_l4e_get_flags(*new_gl4e) & _PAGE_PRESENT )
2156 gfn_t gl3gfn = guest_l4e_get_gfn(*new_gl4e);
2157 mfn_t gl3mfn = vcpu_gfn_to_mfn(v, gl3gfn);
2158 if ( mfn_valid(gl3mfn) )
2159 sl3mfn = get_shadow_status(v, gl3mfn, SH_type_l3_shadow);
2160 else
2161 result |= SHADOW_SET_ERROR;
2163 l4e_propagate_from_guest(v, new_gl4e, _mfn(INVALID_MFN),
2164 sl3mfn, &new_sl4e, ft_prefetch);
2166 // check for updates to xen reserved slots
2167 if ( !shadow_mode_external(v->domain) )
2169 int shadow_index = (((unsigned long)sl4p & ~PAGE_MASK) /
2170 sizeof(shadow_l4e_t));
2171 int reserved_xen_slot = !is_guest_l4_slot(shadow_index);
2173 if ( unlikely(reserved_xen_slot) )
2175 // attempt by the guest to write to a xen reserved slot
2176 //
2177 SHADOW_PRINTK("%s out-of-range update "
2178 "sl4mfn=%05lx index=0x%x val=%" SH_PRI_pte "\n",
2179 __func__, mfn_x(sl4mfn), shadow_index, new_sl4e.l4);
2180 if ( shadow_l4e_get_flags(new_sl4e) & _PAGE_PRESENT )
2182 SHADOW_ERROR("out-of-range l4e update\n");
2183 result |= SHADOW_SET_ERROR;
2186 // do not call shadow_set_l4e...
2187 return result;
2191 result |= shadow_set_l4e(v, sl4p, new_sl4e, sl4mfn);
2192 return result;
2196 static int validate_gl3e(struct vcpu *v, void *new_ge, mfn_t sl3mfn, void *se)
2198 shadow_l3e_t new_sl3e;
2199 guest_l3e_t *new_gl3e = new_ge;
2200 shadow_l3e_t *sl3p = se;
2201 mfn_t sl2mfn = _mfn(INVALID_MFN);
2202 int result = 0;
2204 perfc_incrc(shadow_validate_gl3e_calls);
2206 if ( guest_l3e_get_flags(*new_gl3e) & _PAGE_PRESENT )
2208 gfn_t gl2gfn = guest_l3e_get_gfn(*new_gl3e);
2209 mfn_t gl2mfn = vcpu_gfn_to_mfn(v, gl2gfn);
2210 if ( mfn_valid(gl2mfn) )
2211 sl2mfn = get_shadow_status(v, gl2mfn, SH_type_l2_shadow);
2212 else
2213 result |= SHADOW_SET_ERROR;
2215 l3e_propagate_from_guest(v, new_gl3e, _mfn(INVALID_MFN),
2216 sl2mfn, &new_sl3e, ft_prefetch);
2217 result |= shadow_set_l3e(v, sl3p, new_sl3e, sl3mfn);
2219 return result;
2221 #endif // GUEST_PAGING_LEVELS >= 4
2223 static int validate_gl2e(struct vcpu *v, void *new_ge, mfn_t sl2mfn, void *se)
2225 shadow_l2e_t new_sl2e;
2226 guest_l2e_t *new_gl2e = new_ge;
2227 shadow_l2e_t *sl2p = se;
2228 mfn_t sl1mfn = _mfn(INVALID_MFN);
2229 int result = 0;
2231 perfc_incrc(shadow_validate_gl2e_calls);
2233 if ( guest_l2e_get_flags(*new_gl2e) & _PAGE_PRESENT )
2235 gfn_t gl1gfn = guest_l2e_get_gfn(*new_gl2e);
2236 if ( guest_supports_superpages(v) &&
2237 (guest_l2e_get_flags(*new_gl2e) & _PAGE_PSE) )
2239 // superpage -- need to look up the shadow L1 which holds the
2240 // splinters...
2241 sl1mfn = get_fl1_shadow_status(v, gl1gfn);
2242 #if 0
2243 // XXX - it's possible that we want to do some kind of prefetch
2244 // for superpage fl1's here, but this is *not* on the demand path,
2245 // so we'll hold off trying that for now...
2246 //
2247 if ( !mfn_valid(sl1mfn) )
2248 sl1mfn = make_fl1_shadow(v, gl1gfn);
2249 #endif
2251 else
2253 mfn_t gl1mfn = vcpu_gfn_to_mfn(v, gl1gfn);
2254 if ( mfn_valid(gl1mfn) )
2255 sl1mfn = get_shadow_status(v, gl1mfn, SH_type_l1_shadow);
2256 else
2257 result |= SHADOW_SET_ERROR;
2260 l2e_propagate_from_guest(v, new_gl2e, _mfn(INVALID_MFN),
2261 sl1mfn, &new_sl2e, ft_prefetch);
2263 // check for updates to xen reserved slots in PV guests...
2264 // XXX -- need to revisit this for PV 3-on-4 guests.
2265 //
2266 #if SHADOW_PAGING_LEVELS < 4
2267 #if CONFIG_PAGING_LEVELS == SHADOW_PAGING_LEVELS
2268 if ( !shadow_mode_external(v->domain) )
2270 int shadow_index = (((unsigned long)sl2p & ~PAGE_MASK) /
2271 sizeof(shadow_l2e_t));
2272 int reserved_xen_slot;
2274 #if SHADOW_PAGING_LEVELS == 3
2275 reserved_xen_slot =
2276 ((mfn_to_shadow_page(sl2mfn)->type == SH_type_l2h_pae_shadow) &&
2277 (shadow_index
2278 >= (L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1))));
2279 #else /* SHADOW_PAGING_LEVELS == 2 */
2280 reserved_xen_slot = (shadow_index >= L2_PAGETABLE_FIRST_XEN_SLOT);
2281 #endif
2283 if ( unlikely(reserved_xen_slot) )
2285 // attempt by the guest to write to a xen reserved slot
2286 //
2287 SHADOW_PRINTK("%s out-of-range update "
2288 "sl2mfn=%05lx index=0x%x val=%" SH_PRI_pte "\n",
2289 __func__, mfn_x(sl2mfn), shadow_index, new_sl2e.l2);
2290 if ( shadow_l2e_get_flags(new_sl2e) & _PAGE_PRESENT )
2292 SHADOW_ERROR("out-of-range l2e update\n");
2293 result |= SHADOW_SET_ERROR;
2296 // do not call shadow_set_l2e...
2297 return result;
2300 #endif /* CONFIG_PAGING_LEVELS == SHADOW_PAGING_LEVELS */
2301 #endif /* SHADOW_PAGING_LEVELS < 4 */
2303 result |= shadow_set_l2e(v, sl2p, new_sl2e, sl2mfn);
2305 return result;
2308 static int validate_gl1e(struct vcpu *v, void *new_ge, mfn_t sl1mfn, void *se)
2310 shadow_l1e_t new_sl1e;
2311 guest_l1e_t *new_gl1e = new_ge;
2312 shadow_l1e_t *sl1p = se;
2313 gfn_t gfn;
2314 mfn_t gmfn;
2315 int result = 0, mmio;
2317 perfc_incrc(shadow_validate_gl1e_calls);
2319 gfn = guest_l1e_get_gfn(*new_gl1e);
2320 gmfn = vcpu_gfn_to_mfn(v, gfn);
2322 mmio = (is_hvm_vcpu(v) && shadow_vcpu_mode_translate(v) && !mfn_valid(gmfn));
2323 l1e_propagate_from_guest(v, new_gl1e, _mfn(INVALID_MFN), gmfn, &new_sl1e,
2324 ft_prefetch, mmio);
2326 result |= shadow_set_l1e(v, sl1p, new_sl1e, sl1mfn);
2327 return result;
2331 /**************************************************************************/
2332 /* Functions which translate and install the shadows of arbitrary guest
2333 * entries that we have just seen the guest write. */
2336 static inline int
2337 sh_map_and_validate(struct vcpu *v, mfn_t gmfn,
2338 void *new_gp, u32 size, u32 sh_type,
2339 u32 (*shadow_index)(mfn_t *smfn, u32 idx),
2340 int (*validate_ge)(struct vcpu *v, void *ge,
2341 mfn_t smfn, void *se))
2342 /* Generic function for mapping and validating. */
2344 mfn_t smfn, smfn2, map_mfn;
2345 shadow_l1e_t *sl1p;
2346 u32 shadow_idx, guest_idx;
2347 int result = 0;
2349 /* Align address and size to guest entry boundaries */
2350 size += (unsigned long)new_gp & (sizeof (guest_l1e_t) - 1);
2351 new_gp = (void *)((unsigned long)new_gp & ~(sizeof (guest_l1e_t) - 1));
2352 size = (size + sizeof (guest_l1e_t) - 1) & ~(sizeof (guest_l1e_t) - 1);
2353 ASSERT(size + (((unsigned long)new_gp) & ~PAGE_MASK) <= PAGE_SIZE);
2355 /* Map the shadow page */
2356 smfn = get_shadow_status(v, gmfn, sh_type);
2357 ASSERT(mfn_valid(smfn)); /* Otherwise we would not have been called */
2358 guest_idx = guest_index(new_gp);
2359 map_mfn = smfn;
2360 shadow_idx = shadow_index(&map_mfn, guest_idx);
2361 sl1p = map_shadow_page(map_mfn);
2363 /* Validate one entry at a time */
2364 while ( size )
2366 smfn2 = smfn;
2367 guest_idx = guest_index(new_gp);
2368 shadow_idx = shadow_index(&smfn2, guest_idx);
2369 if ( mfn_x(smfn2) != mfn_x(map_mfn) )
2371 /* We have moved to another page of the shadow */
2372 map_mfn = smfn2;
2373 unmap_shadow_page(sl1p);
2374 sl1p = map_shadow_page(map_mfn);
2376 result |= validate_ge(v,
2377 new_gp,
2378 map_mfn,
2379 &sl1p[shadow_idx]);
2380 size -= sizeof(guest_l1e_t);
2381 new_gp += sizeof(guest_l1e_t);
2383 unmap_shadow_page(sl1p);
2384 return result;
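/* Illustrative sketch of the alignment done at the top of
 * sh_map_and_validate() above: a write of `size' bytes at `addr' is
 * widened so that it starts and ends on guest-entry boundaries, with
 * entry_size standing in for sizeof(guest_l1e_t).  The sk_* names are
 * stand-ins for illustration only. */
#if 0
#include <assert.h>

static void sk_align_to_entries(unsigned long *addr, unsigned long *size,
                                unsigned long entry_size)
{
    *size += *addr & (entry_size - 1);            /* include the partial head */
    *addr &= ~(entry_size - 1);                   /* align the start down     */
    *size = (*size + entry_size - 1) & ~(entry_size - 1);  /* round length up */
}

static void sk_align_check(void)
{
    unsigned long addr = 0x1003, size = 6;        /* 6-byte write at offset 3 */
    sk_align_to_entries(&addr, &size, 8);
    assert(addr == 0x1000 && size == 16);         /* covers two 8-byte entries */
}
#endif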
2388 int
2389 sh_map_and_validate_gl4e(struct vcpu *v, mfn_t gl4mfn,
2390 void *new_gl4p, u32 size)
2392 #if GUEST_PAGING_LEVELS >= 4
2393 return sh_map_and_validate(v, gl4mfn, new_gl4p, size,
2394 SH_type_l4_shadow,
2395 shadow_l4_index,
2396 validate_gl4e);
2397 #else // ! GUEST_PAGING_LEVELS >= 4
2398 SHADOW_PRINTK("called in wrong paging mode!\n");
2399 BUG();
2400 return 0;
2401 #endif
2404 int
2405 sh_map_and_validate_gl3e(struct vcpu *v, mfn_t gl3mfn,
2406 void *new_gl3p, u32 size)
2408 #if GUEST_PAGING_LEVELS >= 4
2409 return sh_map_and_validate(v, gl3mfn, new_gl3p, size,
2410 SH_type_l3_shadow,
2411 shadow_l3_index,
2412 validate_gl3e);
2413 #else // ! GUEST_PAGING_LEVELS >= 4
2414 SHADOW_PRINTK("called in wrong paging mode!\n");
2415 BUG();
2416 return 0;
2417 #endif
2420 int
2421 sh_map_and_validate_gl2e(struct vcpu *v, mfn_t gl2mfn,
2422 void *new_gl2p, u32 size)
2424 return sh_map_and_validate(v, gl2mfn, new_gl2p, size,
2425 SH_type_l2_shadow,
2426 shadow_l2_index,
2427 validate_gl2e);
2430 int
2431 sh_map_and_validate_gl2he(struct vcpu *v, mfn_t gl2mfn,
2432 void *new_gl2p, u32 size)
2434 #if GUEST_PAGING_LEVELS == 3
2435 return sh_map_and_validate(v, gl2mfn, new_gl2p, size,
2436 SH_type_l2h_shadow,
2437 shadow_l2_index,
2438 validate_gl2e);
2439 #else /* Non-PAE guests don't have different kinds of l2 table */
2440 SHADOW_PRINTK("called in wrong paging mode!\n");
2441 BUG();
2442 return 0;
2443 #endif
2446 int
2447 sh_map_and_validate_gl1e(struct vcpu *v, mfn_t gl1mfn,
2448 void *new_gl1p, u32 size)
2450 return sh_map_and_validate(v, gl1mfn, new_gl1p, size,
2451 SH_type_l1_shadow,
2452 shadow_l1_index,
2453 validate_gl1e);
2457 /**************************************************************************/
2458 /* Optimization: If we see two emulated writes of zeros to the same
2459 * page-table without another kind of page fault in between, we guess
2460 * that this is a batch of changes (for process destruction) and
2461 * unshadow the page so we don't take a pagefault on every entry. This
2462 * should also make finding writeable mappings of pagetables much
2463 * easier. */
2465 /* Look to see if this is the second emulated write in a row to this
2466 * page, and unshadow/unhook if it is */
2467 static inline void check_for_early_unshadow(struct vcpu *v, mfn_t gmfn)
2469 #if SHADOW_OPTIMIZATIONS & SHOPT_EARLY_UNSHADOW
2470 if ( v->arch.shadow.last_emulated_mfn == mfn_x(gmfn) &&
2471 sh_mfn_is_a_page_table(gmfn) )
2473 u32 flags = mfn_to_page(gmfn)->shadow_flags;
2474 if ( !(flags & (SHF_L2_32|SHF_L2_PAE|SHF_L2H_PAE|SHF_L4_64)) )
2476 perfc_incrc(shadow_early_unshadow);
2477 sh_remove_shadows(v, gmfn, 1, 0 /* Fast, can fail to unshadow */ );
2480 v->arch.shadow.last_emulated_mfn = mfn_x(gmfn);
2481 #endif
2484 /* Stop counting towards early unshadows, as we've seen a real page fault */
2485 static inline void reset_early_unshadow(struct vcpu *v)
2487 #if SHADOW_OPTIMIZATIONS & SHOPT_EARLY_UNSHADOW
2488 v->arch.shadow.last_emulated_mfn = INVALID_MFN;
2489 #endif
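/* Illustrative sketch of the heuristic above, reduced to its state
 * machine: one remembered mfn, one event for an emulated pagetable write
 * and one for any other fault.  The second consecutive write to the same
 * mfn triggers the unshadow (the real code adds extra guards, e.g. that
 * the page is still a pagetable).  The sk_* names are stand-ins. */
#if 0
#include <assert.h>

#define SK_INVALID_MFN (~0UL)

static unsigned long sk_last_emulated_mfn = SK_INVALID_MFN;

static int sk_check_for_early_unshadow(unsigned long mfn)
{
    int unshadow = (sk_last_emulated_mfn == mfn);
    sk_last_emulated_mfn = mfn;
    return unshadow;
}

static void sk_reset_early_unshadow(void)
{
    sk_last_emulated_mfn = SK_INVALID_MFN;
}

static void sk_early_unshadow_check(void)
{
    assert(!sk_check_for_early_unshadow(42));  /* first write: just remember  */
    assert(sk_check_for_early_unshadow(42));   /* second in a row: unshadow   */
    sk_reset_early_unshadow();                 /* a real fault breaks the run */
    assert(!sk_check_for_early_unshadow(42));
}
#endif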
2494 /**************************************************************************/
2495 /* Optimization: Prefetch multiple L1 entries. This is called after we have
2496 * demand-faulted a shadow l1e in the fault handler, to see if it's
2497 * worth fetching some more.
2498 */
2500 #if SHADOW_OPTIMIZATIONS & SHOPT_PREFETCH
2502 /* XXX magic number */
2503 #define PREFETCH_DISTANCE 32
2505 static void sh_prefetch(struct vcpu *v, walk_t *gw,
2506 shadow_l1e_t *ptr_sl1e, mfn_t sl1mfn)
2508 int i, dist, mmio;
2509 gfn_t gfn;
2510 mfn_t gmfn;
2511 guest_l1e_t gl1e;
2512 shadow_l1e_t sl1e;
2513 u32 gflags;
2515 /* Prefetch no further than the end of the _shadow_ l1 MFN */
2516 dist = (PAGE_SIZE - ((unsigned long)ptr_sl1e & ~PAGE_MASK)) / sizeof sl1e;
2517 /* And no more than a maximum fetches-per-fault */
2518 if ( dist > PREFETCH_DISTANCE )
2519 dist = PREFETCH_DISTANCE;
2521 for ( i = 1; i < dist ; i++ )
2523 /* No point in prefetching if there's already a shadow */
2524 if ( ptr_sl1e[i].l1 != 0 )
2525 break;
2527 if ( gw->l1e )
2529 /* Normal guest page; grab the next guest entry */
2530 gl1e = gw->l1e[i];
2531 /* Not worth continuing if we hit an entry that will need another
2532 * fault for A/D-bit propagation anyway */
2533 gflags = guest_l1e_get_flags(gl1e);
2534 if ( (gflags & _PAGE_PRESENT)
2535 && (!(gflags & _PAGE_ACCESSED)
2536 || ((gflags & _PAGE_RW) && !(gflags & _PAGE_DIRTY))) )
2537 break;
2539 else
2541 /* Fragmented superpage, unless we've been called wrongly */
2542 ASSERT(guest_l2e_get_flags(*gw->l2e) & _PAGE_PSE);
2543 /* Increment the l1e's GFN by the right number of guest pages */
2544 gl1e = guest_l1e_from_gfn(
2545 _gfn(gfn_x(guest_l1e_get_gfn(gw->eff_l1e)) + i),
2546 guest_l1e_get_flags(gw->eff_l1e));
2549 /* Look at the gfn that the l1e is pointing at */
2550 gfn = guest_l1e_get_gfn(gl1e);
2551 gmfn = vcpu_gfn_to_mfn(v, gfn);
2552 mmio = ( is_hvm_vcpu(v)
2553 && shadow_vcpu_mode_translate(v)
2554 && mmio_space(gfn_to_paddr(gfn)) );
2556 /* Propagate the entry. Safe to use a pointer to our local
2557 * gl1e, since this is not a demand-fetch so there will be no
2558 * write-back to the guest. */
2559 l1e_propagate_from_guest(v, &gl1e, _mfn(INVALID_MFN),
2560 gmfn, &sl1e, ft_prefetch, mmio);
2561 (void) shadow_set_l1e(v, ptr_sl1e + i, sl1e, sl1mfn);
2565 #endif /* SHADOW_OPTIMIZATIONS & SHOPT_PREFETCH */
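/* Illustrative sketch of how sh_prefetch() above bounds its loop: never
 * walk past the end of the shadow l1 page holding the faulting entry, and
 * never consider more than PREFETCH_DISTANCE entries per fault.  The sk_*
 * names and constants are stand-ins (8-byte entries, 4k pages). */
#if 0
#include <assert.h>

#define SK_PAGE_SIZE          4096u
#define SK_ENTRY_SIZE         8u
#define SK_PREFETCH_DISTANCE  32u

static unsigned int sk_prefetch_distance(unsigned long ptr_sl1e)
{
    unsigned int bytes_left = SK_PAGE_SIZE - (ptr_sl1e & (SK_PAGE_SIZE - 1));
    unsigned int dist = bytes_left / SK_ENTRY_SIZE;
    return (dist > SK_PREFETCH_DISTANCE) ? SK_PREFETCH_DISTANCE : dist;
}

static void sk_prefetch_check(void)
{
    /* Faulting entry five slots from the end of its page: five candidates. */
    assert(sk_prefetch_distance(0x2000 + SK_PAGE_SIZE - 5 * SK_ENTRY_SIZE) == 5);
    /* Plenty of room left: capped at the per-fault maximum. */
    assert(sk_prefetch_distance(0x2000) == SK_PREFETCH_DISTANCE);
}
#endif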
2568 /**************************************************************************/
2569 /* Entry points into the shadow code */
2571 /* Called from pagefault handler in Xen, and from the HVM trap handlers
2572 * for pagefaults. Returns 1 if this fault was an artefact of the
2573 * shadow code (and the guest should retry) or 0 if it is not (and the
2574 * fault should be handled elsewhere or passed to the guest). */
2576 static int sh_page_fault(struct vcpu *v,
2577 unsigned long va,
2578 struct cpu_user_regs *regs)
2580 struct domain *d = v->domain;
2581 walk_t gw;
2582 u32 accumulated_gflags;
2583 gfn_t gfn;
2584 mfn_t gmfn, sl1mfn=_mfn(0);
2585 shadow_l1e_t sl1e, *ptr_sl1e;
2586 paddr_t gpa;
2587 struct sh_emulate_ctxt emul_ctxt;
2588 struct x86_emulate_ops *emul_ops;
2589 int r, mmio;
2590 fetch_type_t ft = 0;
2592 SHADOW_PRINTK("d:v=%u:%u va=%#lx err=%u\n",
2593 v->domain->domain_id, v->vcpu_id, va, regs->error_code);
2595 //
2596 // XXX: Need to think about eventually mapping superpages directly in the
2597 // shadow (when possible), as opposed to splintering them into a
2598 // bunch of 4K maps.
2599 //
2601 #if (SHADOW_OPTIMIZATIONS & SHOPT_FAST_FAULT_PATH) && SHADOW_PAGING_LEVELS > 2
2602 if ( (regs->error_code & PFEC_reserved_bit) )
2604 /* The only reasons for reserved bits to be set in shadow entries
2605 * are the two "magic" shadow_l1e entries. */
2606 if ( likely((__copy_from_user(&sl1e,
2607 (sh_linear_l1_table(v)
2608 + shadow_l1_linear_offset(va)),
2609 sizeof(sl1e)) == 0)
2610 && sh_l1e_is_magic(sl1e)) )
2612 if ( sh_l1e_is_gnp(sl1e) )
2614 if ( likely(!is_hvm_domain(d) ||
2615 shadow_vcpu_mode_translate(v)) )
2617 /* Not-present in a guest PT: pass to the guest as
2618 * a not-present fault (by flipping two bits). */
2619 ASSERT(regs->error_code & PFEC_page_present);
2620 regs->error_code ^= (PFEC_reserved_bit|PFEC_page_present);
2621 perfc_incrc(shadow_fault_fast_gnp);
2622 SHADOW_PRINTK("fast path not-present\n");
2623 return 0;
2625 else
2627 /* Not-present in the P2M: MMIO */
2628 gpa = va;
2631 else
2633 /* Magic MMIO marker: extract gfn for MMIO address */
2634 ASSERT(sh_l1e_is_mmio(sl1e));
2635 gpa = (((paddr_t)(gfn_x(sh_l1e_mmio_get_gfn(sl1e))))
2636 << PAGE_SHIFT)
2637 | (va & ~PAGE_MASK);
2639 perfc_incrc(shadow_fault_fast_mmio);
2640 SHADOW_PRINTK("fast path mmio %#"PRIpaddr"\n", gpa);
2641 reset_early_unshadow(v);
2642 handle_mmio(gpa);
2643 return EXCRET_fault_fixed;
2645 else
2647 /* This should be exceptionally rare: another vcpu has fixed
2648 * the tables between the fault and our reading the l1e.
2649 * Fall through to the normal fault handling logic */
2650 perfc_incrc(shadow_fault_fast_fail);
2651 SHADOW_PRINTK("fast path false alarm!\n");
2652 /* Don't pass the reserved-bit bit: if we look at the fault
2653 * below and decide to pass it to the guest, the reserved-bit
2654 * bit won't make sense there. */
2655 regs->error_code &= ~PFEC_reserved_bit;
2658 #endif /* SHOPT_FAST_FAULT_PATH */
2660 shadow_lock(d);
2662 shadow_audit_tables(v);
2664 if ( guest_walk_tables(v, va, &gw, 1) != 0 )
2666 SHADOW_PRINTK("malformed guest pagetable!");
2667 print_gw(&gw);
2670 sh_audit_gw(v, &gw);
2672 // We do not look at the gw->l1e, as that will not exist for superpages.
2673 // Instead, we use the gw->eff_l1e...
2674 //
2675 // We need not check all the levels of the guest page table entries for
2676 // present vs not-present, as the eff_l1e will always be not present if
2677 // one of the higher level entries is not present.
2678 //
2679 if ( unlikely(!(guest_l1e_get_flags(gw.eff_l1e) & _PAGE_PRESENT)) )
2681 if ( is_hvm_domain(d) && !shadow_vcpu_mode_translate(v) )
2683 /* Not present in the p2m map, so this is mmio */
2684 gpa = va;
2685 goto mmio;
2688 perfc_incrc(shadow_fault_bail_not_present);
2689 goto not_a_shadow_fault;
2692 // All levels of the guest page table are now known to be present.
2693 accumulated_gflags = accumulate_guest_flags(v, &gw);
2695 // Check for attempts to access supervisor-only pages from user mode,
2696 // i.e. ring 3. Such errors are not caused or dealt with by the shadow
2697 // code.
2698 //
2699 if ( (regs->error_code & PFEC_user_mode) &&
2700 !(accumulated_gflags & _PAGE_USER) )
2702 /* illegal user-mode access to supervisor-only page */
2703 perfc_incrc(shadow_fault_bail_user_supervisor);
2704 goto not_a_shadow_fault;
2707 // Was it a write fault?
2708 ft = ((regs->error_code & PFEC_write_access)
2709 ? ft_demand_write : ft_demand_read);
2710 if ( ft == ft_demand_write )
2712 if ( unlikely(!(accumulated_gflags & _PAGE_RW)) )
2714 perfc_incrc(shadow_fault_bail_ro_mapping);
2715 goto not_a_shadow_fault;
2718 else // must have been either an insn fetch or read fault
2720 // Check for NX bit violations: attempts to execute code that is
2721 // marked "do not execute". Such errors are not caused or dealt with
2722 // by the shadow code.
2723 //
2724 if ( regs->error_code & PFEC_insn_fetch )
2726 if ( accumulated_gflags & _PAGE_NX_BIT )
2728 /* NX prevented this code fetch */
2729 perfc_incrc(shadow_fault_bail_nx);
2730 goto not_a_shadow_fault;
2735 /* What mfn is the guest trying to access? */
2736 gfn = guest_l1e_get_gfn(gw.eff_l1e);
2737 gmfn = vcpu_gfn_to_mfn(v, gfn);
2738 mmio = (is_hvm_domain(d)
2739 && shadow_vcpu_mode_translate(v)
2740 && mmio_space(gfn_to_paddr(gfn)));
2742 if ( !mmio && !mfn_valid(gmfn) )
2744 perfc_incrc(shadow_fault_bail_bad_gfn);
2745 SHADOW_PRINTK("BAD gfn=%"SH_PRI_gfn" gmfn=%"SH_PRI_mfn"\n",
2746 gfn_x(gfn), mfn_x(gmfn));
2747 goto not_a_shadow_fault;
2750 /* Make sure there is enough free shadow memory to build a chain of
2751 * shadow tables: one SHADOW_MAX_ORDER chunk will always be enough
2752 * to allocate all we need. (We never allocate a top-level shadow
2753 * on this path, only a 32b l1, pae l2+1 or 64b l3+2+1) */
2754 shadow_prealloc(d, SHADOW_MAX_ORDER);
2756 /* Acquire the shadow. This must happen before we figure out the rights
2757 * for the shadow entry, since we might promote a page here. */
2758 ptr_sl1e = shadow_get_and_create_l1e(v, &gw, &sl1mfn, ft);
2759 if ( unlikely(ptr_sl1e == NULL) )
2761 /* Couldn't get the sl1e! Since we know the guest entries
2762 * are OK, this can only have been caused by a failed
2763 * shadow_set_l*e(), which will have crashed the guest.
2764 * Get out of the fault handler immediately. */
2765 ASSERT(test_bit(_DOMF_dying, &d->domain_flags));
2766 unmap_walk(v, &gw);
2767 shadow_unlock(d);
2768 return 0;
2771 /* Calculate the shadow entry and write it */
2772 l1e_propagate_from_guest(v, (gw.l1e) ? gw.l1e : &gw.eff_l1e, gw.l1mfn,
2773 gmfn, &sl1e, ft, mmio);
2774 r = shadow_set_l1e(v, ptr_sl1e, sl1e, sl1mfn);
2776 #if SHADOW_OPTIMIZATIONS & SHOPT_PREFETCH
2777 /* Prefetch some more shadow entries */
2778 sh_prefetch(v, &gw, ptr_sl1e, sl1mfn);
2779 #endif
2781 /* Need to emulate accesses to page tables */
2782 if ( sh_mfn_is_a_page_table(gmfn) )
2784 if ( ft == ft_demand_write )
2786 perfc_incrc(shadow_fault_emulate_write);
2787 goto emulate;
2789 else if ( shadow_mode_trap_reads(d) && ft == ft_demand_read )
2791 perfc_incrc(shadow_fault_emulate_read);
2792 goto emulate;
2796 if ( mmio )
2798 gpa = guest_walk_to_gpa(&gw);
2799 goto mmio;
2802 perfc_incrc(shadow_fault_fixed);
2803 d->arch.shadow.fault_count++;
2804 reset_early_unshadow(v);
2806 done:
2807 sh_audit_gw(v, &gw);
2808 unmap_walk(v, &gw);
2809 SHADOW_PRINTK("fixed\n");
2810 shadow_audit_tables(v);
2811 shadow_unlock(d);
2812 return EXCRET_fault_fixed;
2814 emulate:
2815 if ( !shadow_mode_refcounts(d) || !guest_mode(regs) )
2816 goto not_a_shadow_fault;
2818 if ( is_hvm_domain(d) )
2819 hvm_store_cpu_guest_regs(v, regs, NULL);
2820 SHADOW_PRINTK("emulate: eip=%#lx\n", regs->eip);
2822 emul_ops = shadow_init_emulation(&emul_ctxt, regs);
2824 /*
2825 * We do not emulate user writes. Instead we use them as a hint that the
2826 * page is no longer a page table. This behaviour differs from native, but
2827 * it seems very unlikely that any OS grants user access to page tables.
2828 */
2829 if ( (regs->error_code & PFEC_user_mode) ||
2830 x86_emulate_memop(&emul_ctxt.ctxt, emul_ops) )
2832 SHADOW_PRINTK("emulator failure, unshadowing mfn %#lx\n",
2833 mfn_x(gmfn));
2834 perfc_incrc(shadow_fault_emulate_failed);
2835 /* If this is actually a page table, then we have a bug, and need
2836 * to support more operations in the emulator. More likely,
2837 * though, this is a hint that this page should not be shadowed. */
2838 shadow_remove_all_shadows(v, gmfn);
2841 /* Emulator has changed the user registers: write back */
2842 if ( is_hvm_domain(d) )
2843 hvm_load_cpu_guest_regs(v, regs);
2844 goto done;
2846 mmio:
2847 if ( !guest_mode(regs) )
2848 goto not_a_shadow_fault;
2849 perfc_incrc(shadow_fault_mmio);
2850 sh_audit_gw(v, &gw);
2851 unmap_walk(v, &gw);
2852 SHADOW_PRINTK("mmio %#"PRIpaddr"\n", gpa);
2853 shadow_audit_tables(v);
2854 reset_early_unshadow(v);
2855 shadow_unlock(d);
2856 handle_mmio(gpa);
2857 return EXCRET_fault_fixed;
2859 not_a_shadow_fault:
2860 sh_audit_gw(v, &gw);
2861 unmap_walk(v, &gw);
2862 SHADOW_PRINTK("not a shadow fault\n");
2863 shadow_audit_tables(v);
2864 reset_early_unshadow(v);
2865 shadow_unlock(d);
2866 return 0;
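/* Illustrative sketch of the two fast-path manipulations used near the top
 * of sh_page_fault() above: flipping the error code for a "guest not
 * present" magic entry, and rebuilding a guest-physical address from the
 * gfn stored in a magic MMIO entry.  The SK_PFEC_* values are the
 * architectural x86 error-code bits (P=1, RSVD=8); all sk_* names are
 * stand-ins. */
#if 0
#include <assert.h>
#include <stdint.h>

#define SK_PFEC_page_present  0x01u
#define SK_PFEC_reserved_bit  0x08u
#define SK_PAGE_SHIFT         12

/* The fault arrives with P and RSVD set; flipping both turns it into the
 * ordinary not-present fault the guest expects to see. */
static uint32_t sk_gnp_error_code(uint32_t error_code)
{
    return error_code ^ (SK_PFEC_reserved_bit | SK_PFEC_page_present);
}

/* A magic MMIO entry stores only the gfn; the gpa is the gfn shifted up
 * plus the page offset of the faulting virtual address. */
static uint64_t sk_mmio_gpa(uint64_t gfn, uint64_t va)
{
    return (gfn << SK_PAGE_SHIFT) | (va & ((1ULL << SK_PAGE_SHIFT) - 1));
}

static void sk_fast_path_check(void)
{
    assert(sk_gnp_error_code(0x0b) == 0x02);   /* RSVD|W|P becomes W only */
    assert(sk_mmio_gpa(0xfee00, 0xb0001030UL) == 0xfee00030ULL);
}
#endif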
2870 static int
2871 sh_invlpg(struct vcpu *v, unsigned long va)
2872 /* Called when the guest requests an invlpg. Returns 1 if the invlpg
2873 * instruction should be issued on the hardware, or 0 if it's safe not
2874 * to do so. */
2876 shadow_l2e_t sl2e;
2878 perfc_incrc(shadow_invlpg);
2880 /* First check that we can safely read the shadow l2e. On SMP/PAE linux,
2881 * as many as 6% of invlpg calls can arrive before we have shadowed the
2882 * relevant l2. */
2883 #if SHADOW_PAGING_LEVELS == 4
2885 shadow_l3e_t sl3e;
2886 if ( !(shadow_l4e_get_flags(
2887 sh_linear_l4_table(v)[shadow_l4_linear_offset(va)])
2888 & _PAGE_PRESENT) )
2889 return 0;
2890 /* This must still be a copy-from-user because we don't have the
2891 * shadow lock, and the higher-level shadows might disappear
2892 * under our feet. */
2893 if ( __copy_from_user(&sl3e, (sh_linear_l3_table(v)
2894 + shadow_l3_linear_offset(va)),
2895 sizeof (sl3e)) != 0 )
2897 perfc_incrc(shadow_invlpg_fault);
2898 return 0;
2900 if ( !(shadow_l3e_get_flags(sl3e) & _PAGE_PRESENT) )
2901 return 0;
2903 #elif SHADOW_PAGING_LEVELS == 3
2904 if ( !(l3e_get_flags(v->arch.shadow.l3table[shadow_l3_linear_offset(va)])
2905 & _PAGE_PRESENT) )
2906 // no need to flush anything if there's no SL2...
2907 return 0;
2908 #endif
2910 /* This must still be a copy-from-user because we don't have the shadow
2911 * lock, and the higher-level shadows might disappear under our feet. */
2912 if ( __copy_from_user(&sl2e,
2913 sh_linear_l2_table(v) + shadow_l2_linear_offset(va),
2914 sizeof (sl2e)) != 0 )
2916 perfc_incrc(shadow_invlpg_fault);
2917 return 0;
2920 // If there's nothing shadowed for this particular sl2e, then
2921 // there is no need to do an invlpg, either...
2922 //
2923 if ( !(shadow_l2e_get_flags(sl2e) & _PAGE_PRESENT) )
2924 return 0;
2926 // Check to see if the SL2 is a splintered superpage...
2927 // If so, then we'll need to flush the entire TLB (because that's
2928 // easier than invalidating all of the individual 4K pages).
2929 //
2930 if ( mfn_to_shadow_page(shadow_l2e_get_mfn(sl2e))->type
2931 == SH_type_fl1_shadow )
2933 local_flush_tlb();
2934 return 0;
2937 return 1;
2940 static unsigned long
2941 sh_gva_to_gfn(struct vcpu *v, unsigned long va)
2942 /* Called to translate a guest virtual address to what the *guest*
2943 * pagetables would map it to. */
2945 walk_t gw;
2946 gfn_t gfn;
2948 guest_walk_tables(v, va, &gw, 0);
2949 gfn = guest_walk_to_gfn(&gw);
2950 unmap_walk(v, &gw);
2952 return gfn_x(gfn);
2956 static paddr_t
2957 sh_gva_to_gpa(struct vcpu *v, unsigned long va)
2958 /* Called to translate a guest virtual address to what the *guest*
2959 * pagetables would map it to. */
2961 unsigned long gfn = sh_gva_to_gfn(v, va);
2962 if ( gfn == INVALID_GFN )
2963 return 0;
2964 else
2965 return (((paddr_t)gfn) << PAGE_SHIFT) + (va & ~PAGE_MASK);
2969 static inline void
2970 sh_update_linear_entries(struct vcpu *v)
2971 /* Sync up all the linear mappings for this vcpu's pagetables */
2973 struct domain *d = v->domain;
2975 /* Linear pagetables in PV guests
2976 * ------------------------------
2978 * Guest linear pagetables, which map the guest pages, are at
2979 * LINEAR_PT_VIRT_START. Shadow linear pagetables, which map the
2980 * shadows, are at SH_LINEAR_PT_VIRT_START. Most of the time these
2981 * are set up at shadow creation time, but (of course!) the PAE case
2982 * is subtler. Normal linear mappings are made by having an entry
2983 * in the top-level table that points to itself (shadow linear) or
2984 * to the guest top-level table (guest linear). For PAE, to set up
2985 * a linear map requires us to copy the four top-level entries into
2986 * level-2 entries. That means that every time we change a PAE l3e,
2987 * we need to reflect the change into the copy.
2989 * Linear pagetables in HVM guests
2990 * -------------------------------
2992 * For HVM guests, the linear pagetables are installed in the monitor
2993 * tables (since we can't put them in the shadow). Shadow linear
2994 * pagetables, which map the shadows, are at SH_LINEAR_PT_VIRT_START,
2995 * and we use the linear pagetable slot at LINEAR_PT_VIRT_START for
2996 * a linear pagetable of the monitor tables themselves. We have
2997 * the same issue of having to re-copy PAE l3 entries whenever we use
2998 * PAE shadows.
3000 * Because HVM guests run on the same monitor tables regardless of the
3001 * shadow tables in use, the linear mapping of the shadow tables has to
3002 * be updated every time v->arch.shadow_table changes.
3003 */
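/* Illustrative sketch of the PAE re-copy described above: because the PAE
 * top level has only four entries (and is not a full pagetable page), the
 * usual self-map trick cannot be used at that level, so the four (shadow)
 * l3es are mirrored into four adjacent l2 slots, and that mirror must be
 * refreshed whenever an l3e changes.  sk_l3e[] and sk_l2_mirror[] are
 * stand-ins; the real code below converts each present l3e into an l2e
 * with hypervisor permissions rather than copying it raw. */
#if 0
{
    unsigned long sk_l3e[4];        /* the four top-level entries    */
    unsigned long sk_l2_mirror[4];  /* the l2 slots that mirror them */
    int sk_i;

    for ( sk_i = 0; sk_i < 4; sk_i++ )
        sk_l2_mirror[sk_i] = sk_l3e[sk_i];
}
#endif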
3005 /* Don't try to update the monitor table if it doesn't exist */
3006 if ( shadow_mode_external(d)
3007 && pagetable_get_pfn(v->arch.monitor_table) == 0 )
3008 return;
3010 #if (CONFIG_PAGING_LEVELS == 4) && (SHADOW_PAGING_LEVELS == 4)
3012 /* For PV, one l4e points at the guest l4, one points at the shadow
3013 * l4. No maintenance required.
3014 * For HVM, just need to update the l4e that points to the shadow l4. */
3016 if ( shadow_mode_external(d) )
3018 /* Use the linear map if we can; otherwise make a new mapping */
3019 if ( v == current )
3021 __linear_l4_table[l4_linear_offset(SH_LINEAR_PT_VIRT_START)] =
3022 l4e_from_pfn(pagetable_get_pfn(v->arch.shadow_table[0]),
3023 __PAGE_HYPERVISOR);
3025 else
3027 l4_pgentry_t *ml4e;
3028 ml4e = sh_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
3029 ml4e[l4_table_offset(SH_LINEAR_PT_VIRT_START)] =
3030 l4e_from_pfn(pagetable_get_pfn(v->arch.shadow_table[0]),
3031 __PAGE_HYPERVISOR);
3032 sh_unmap_domain_page(ml4e);
3036 #elif (CONFIG_PAGING_LEVELS == 4) && (SHADOW_PAGING_LEVELS == 3)
3038 /* This case only exists in HVM. To give ourselves a linear map of the
3039 * shadows, we need to extend a PAE shadow to 4 levels. We do this by
3040 * having a monitor l3 in slot 0 of the monitor l4 table, and
3041 * copying the PAE l3 entries into it. Then, by having the monitor l4e
3042 * for shadow pagetables also point to the monitor l4, we can use it
3043 * to access the shadows. */
3045 if ( shadow_mode_external(d) )
3047 /* Install copies of the shadow l3es into the monitor l3 table.
3048 * The monitor l3 table is hooked into slot 0 of the monitor
3049 * l4 table, so we use l3 linear indices 0 to 3 */
3050 shadow_l3e_t *sl3e;
3051 l3_pgentry_t *ml3e;
3052 mfn_t l3mfn;
3053 int i;
3055 /* Use linear mappings if we can; otherwise make new mappings */
3056 if ( v == current )
3058 ml3e = __linear_l3_table;
3059 l3mfn = _mfn(l4e_get_pfn(__linear_l4_table[0]));
3061 else
3063 l4_pgentry_t *ml4e;
3064 ml4e = sh_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
3065 ASSERT(l4e_get_flags(ml4e[0]) & _PAGE_PRESENT);
3066 l3mfn = _mfn(l4e_get_pfn(ml4e[0]));
3067 ml3e = sh_map_domain_page(l3mfn);
3068 sh_unmap_domain_page(ml4e);
3071 /* Shadow l3 tables are made up by update_cr3 */
3072 sl3e = v->arch.shadow.l3table;
3074 for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
3076 ml3e[i] =
3077 (shadow_l3e_get_flags(sl3e[i]) & _PAGE_PRESENT)
3078 ? l3e_from_pfn(mfn_x(shadow_l3e_get_mfn(sl3e[i])),
3079 __PAGE_HYPERVISOR)
3080 : l3e_empty();
3083 if ( v != current )
3084 sh_unmap_domain_page(ml3e);
3087 #elif CONFIG_PAGING_LEVELS == 3
3089 /* PV: need to copy the guest's l3 entries into the guest-linear-map l2
3090 * entries in the shadow, and the shadow's l3 entries into the
3091 * shadow-linear-map l2 entries in the shadow. This is safe to do
3092 * because Xen does not let guests share high-slot l2 tables between l3s,
3093 * so we know we're not treading on anyone's toes.
3095 * HVM: need to copy the shadow's l3 entries into the
3096 * shadow-linear-map l2 entries in the monitor table. This is safe
3097 * because we have one monitor table for each vcpu. The monitor's
3098 * own l3es don't need to be copied because they never change.
3099 * XXX That might change if we start stuffing things into the rest
3100 * of the monitor's virtual address space.
3101 */
3103 l2_pgentry_t *l2e, new_l2e;
3104 shadow_l3e_t *guest_l3e = NULL, *shadow_l3e;
3105 int i;
3106 int unmap_l2e = 0;
3108 #if GUEST_PAGING_LEVELS == 2
3109 /* Shadow l3 tables were built by update_cr3 */
3110 if ( shadow_mode_external(d) )
3111 shadow_l3e = (shadow_l3e_t *)&v->arch.shadow.l3table;
3112 else
3113 BUG(); /* PV 2-on-3 is not supported yet */
3115 #else /* GUEST_PAGING_LEVELS == 3 */
3117 shadow_l3e = (shadow_l3e_t *)&v->arch.shadow.l3table;
3118 /* Always safe to use guest_vtable, because it's globally mapped */
3119 guest_l3e = v->arch.guest_vtable;
3121 #endif /* GUEST_PAGING_LEVELS */
3123 /* Choose where to write the entries, using linear maps if possible */
3124 if ( shadow_mode_external(d) )
3126 if ( v == current )
3128 /* From the monitor tables, it's safe to use linear maps
3129 * to update monitor l2s */
3130 l2e = __linear_l2_table + (3 * L2_PAGETABLE_ENTRIES);
3132 else
3134 /* Map the monitor table's high l2 */
3135 l3_pgentry_t *l3e;
3136 l3e = sh_map_domain_page(
3137 pagetable_get_mfn(v->arch.monitor_table));
3138 ASSERT(l3e_get_flags(l3e[3]) & _PAGE_PRESENT);
3139 l2e = sh_map_domain_page(_mfn(l3e_get_pfn(l3e[3])));
3140 unmap_l2e = 1;
3141 sh_unmap_domain_page(l3e);
3144 else
3146 /* Map the shadow table's high l2 */
3147 ASSERT(shadow_l3e_get_flags(shadow_l3e[3]) & _PAGE_PRESENT);
3148 l2e = sh_map_domain_page(shadow_l3e_get_mfn(shadow_l3e[3]));
3149 unmap_l2e = 1;
3152 /* Write linear mapping of guest (only in PV, and only when
3153 * not translated). */
3154 if ( !shadow_mode_translate(d) )
3156 for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
3158 new_l2e =
3159 ((shadow_l3e_get_flags(guest_l3e[i]) & _PAGE_PRESENT)
3160 ? l2e_from_pfn(mfn_x(shadow_l3e_get_mfn(guest_l3e[i])),
3161 __PAGE_HYPERVISOR)
3162 : l2e_empty());
3163 safe_write_entry(
3164 &l2e[l2_table_offset(LINEAR_PT_VIRT_START) + i],
3165 &new_l2e);
3169 /* Write linear mapping of shadow. */
3170 for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
3172 new_l2e = (shadow_l3e_get_flags(shadow_l3e[i]) & _PAGE_PRESENT)
3173 ? l2e_from_pfn(mfn_x(shadow_l3e_get_mfn(shadow_l3e[i])),
3174 __PAGE_HYPERVISOR)
3175 : l2e_empty();
3176 safe_write_entry(
3177 &l2e[l2_table_offset(SH_LINEAR_PT_VIRT_START) + i],
3178 &new_l2e);
3181 if ( unmap_l2e )
3182 sh_unmap_domain_page(l2e);
3185 #elif CONFIG_PAGING_LEVELS == 2
3187 /* For PV, one l2e points at the guest l2, one points at the shadow
3188 * l2. No maintenance required.
3189 * For HVM, just need to update the l2e that points to the shadow l2. */
3191 if ( shadow_mode_external(d) )
3193 /* Use the linear map if we can; otherwise make a new mapping */
3194 if ( v == current )
3196 __linear_l2_table[l2_linear_offset(SH_LINEAR_PT_VIRT_START)] =
3197 l2e_from_pfn(pagetable_get_pfn(v->arch.shadow_table[0]),
3198 __PAGE_HYPERVISOR);
3200 else
3202 l2_pgentry_t *ml2e;
3203 ml2e = sh_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
3204 ml2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
3205 l2e_from_pfn(pagetable_get_pfn(v->arch.shadow_table[0]),
3206 __PAGE_HYPERVISOR);
3207 sh_unmap_domain_page(ml2e);
3211 #else
3212 #error this should not happen
3213 #endif
3217 /* Removes vcpu->arch.guest_vtable and vcpu->arch.shadow_table[].
3218 * Does all appropriate management/bookkeeping/refcounting/etc...
3219 */
3220 static void
3221 sh_detach_old_tables(struct vcpu *v)
3223 struct domain *d = v->domain;
3224 mfn_t smfn;
3225 int i = 0;
3227 ////
3228 //// vcpu->arch.guest_vtable
3229 ////
3230 if ( v->arch.guest_vtable )
3232 #if GUEST_PAGING_LEVELS == 4
3233 if ( shadow_mode_external(d) || shadow_mode_translate(d) )
3234 sh_unmap_domain_page_global(v->arch.guest_vtable);
3235 #elif GUEST_PAGING_LEVELS == 3
3236 if ( 1 || shadow_mode_external(d) || shadow_mode_translate(d) )
3237 sh_unmap_domain_page_global(v->arch.guest_vtable);
3238 #elif GUEST_PAGING_LEVELS == 2
3239 if ( shadow_mode_external(d) || shadow_mode_translate(d) )
3240 sh_unmap_domain_page_global(v->arch.guest_vtable);
3241 #endif
3242 v->arch.guest_vtable = NULL;
3245 ////
3246 //// vcpu->arch.shadow_table[]
3247 ////
3250 #if GUEST_PAGING_LEVELS == 3
3251 /* PAE guests have four shadow_table entries */
3252 for ( i = 0 ; i < 4 ; i++ )
3253 #endif
3255 smfn = pagetable_get_mfn(v->arch.shadow_table[i]);
3256 if ( mfn_x(smfn) )
3257 sh_put_ref(v, smfn, 0);
3258 v->arch.shadow_table[i] = pagetable_null();
3262 /* Set up the top-level shadow and install it in slot 'slot' of shadow_table */
3263 static void
3264 sh_set_toplevel_shadow(struct vcpu *v,
3265 int slot,
3266 mfn_t gmfn,
3267 unsigned int root_type)
3269 mfn_t smfn;
3270 pagetable_t old_entry, new_entry;
3272 struct domain *d = v->domain;
3274 /* Remember the old contents of this slot */
3275 old_entry = v->arch.shadow_table[slot];
3277 /* Now figure out the new contents: is this a valid guest MFN? */
3278 if ( !mfn_valid(gmfn) )
3280 new_entry = pagetable_null();
3281 goto install_new_entry;
3284 /* Guest mfn is valid: shadow it and install the shadow */
3285 smfn = get_shadow_status(v, gmfn, root_type);
3286 if ( !mfn_valid(smfn) )
3288 /* Make sure there's enough free shadow memory. */
3289 shadow_prealloc(d, SHADOW_MAX_ORDER);
3290 /* Shadow the page. */
3291 smfn = sh_make_shadow(v, gmfn, root_type);
3293 ASSERT(mfn_valid(smfn));
3295 #if SHADOW_OPTIMIZATIONS & SHOPT_EARLY_UNSHADOW
3296 /* Once again OK to unhook entries from this table if we see fork/exit */
3297 ASSERT(sh_mfn_is_a_page_table(gmfn));
3298 mfn_to_page(gmfn)->shadow_flags &= ~SHF_unhooked_mappings;
3299 #endif
3301 /* Pin the shadow and put it (back) on the list of top-level shadows */
3302 if ( sh_pin(v, smfn) == 0 )
3304 SHADOW_ERROR("can't pin %#lx as toplevel shadow\n", mfn_x(smfn));
3305 domain_crash(v->domain);
3308 /* Take a ref to this page: it will be released in sh_detach_old_tables()
3309 * or the next call to sh_set_toplevel_shadow() */
3310 if ( !sh_get_ref(v, smfn, 0) )
3312 SHADOW_ERROR("can't install %#lx as toplevel shadow\n", mfn_x(smfn));
3313 domain_crash(v->domain);
3316 new_entry = pagetable_from_mfn(smfn);
3318 install_new_entry:
3319 /* Done. Install it */
3320 SHADOW_PRINTK("%u/%u [%u] gmfn %#"SH_PRI_mfn" smfn %#"SH_PRI_mfn"\n",
3321 GUEST_PAGING_LEVELS, SHADOW_PAGING_LEVELS, slot,
3322 mfn_x(gmfn), mfn_x(pagetable_get_mfn(new_entry)));
3323 v->arch.shadow_table[slot] = new_entry;
3325 /* Decrement the refcount of the old contents of this slot */
3326 if ( !pagetable_is_null(old_entry) )
3327 sh_put_ref(v, pagetable_get_mfn(old_entry), 0);
3331 static void
3332 sh_update_cr3(struct vcpu *v)
3333 /* Updates vcpu->arch.cr3 after the guest has changed CR3.
3334 * Paravirtual guests should set v->arch.guest_table (and guest_table_user,
3335 * if appropriate).
3336 * HVM guests should also make sure hvm_get_guest_ctrl_reg(v, 3) works,
3337 * and read vcpu->arch.hvm_vcpu.hw_cr3 afterwards.
3338 */
3340 struct domain *d = v->domain;
3341 mfn_t gmfn;
3342 #if GUEST_PAGING_LEVELS == 3
3343 u32 guest_idx=0;
3344 #endif
3346 ASSERT(shadow_lock_is_acquired(v->domain));
3347 ASSERT(v->arch.shadow.mode);
3349 ////
3350 //// vcpu->arch.guest_table is already set
3351 ////
3353 #ifndef NDEBUG
3354 /* Double-check that the HVM code has sent us a sane guest_table */
3355 if ( is_hvm_domain(d) )
3357 gfn_t gfn;
3359 ASSERT(shadow_mode_external(d));
3361 // Is paging enabled on this vcpu?
3362 if ( shadow_vcpu_mode_translate(v) )
3364 gfn = _gfn(paddr_to_pfn(hvm_get_guest_ctrl_reg(v, 3)));
3365 gmfn = vcpu_gfn_to_mfn(v, gfn);
3366 ASSERT(mfn_valid(gmfn));
3367 ASSERT(pagetable_get_pfn(v->arch.guest_table) == mfn_x(gmfn));
3369 else
3371 /* Paging disabled: guest_table points at (part of) p2m */
3372 #if SHADOW_PAGING_LEVELS != 3 /* in 3-on-4, guest-table is in slot 0 of p2m */
3373 /* For everything else, they should be the same */
3374 ASSERT(v->arch.guest_table.pfn == d->arch.phys_table.pfn);
3375 #endif
3378 #endif
3380 SHADOW_PRINTK("d=%u v=%u guest_table=%05lx\n",
3381 d->domain_id, v->vcpu_id,
3382 (unsigned long)pagetable_get_pfn(v->arch.guest_table));
3384 #if GUEST_PAGING_LEVELS == 4
3385 if ( !(v->arch.flags & TF_kernel_mode) )
3386 gmfn = pagetable_get_mfn(v->arch.guest_table_user);
3387 else
3388 #endif
3389 gmfn = pagetable_get_mfn(v->arch.guest_table);
3391 if ( !is_hvm_domain(d) && !test_bit(_VCPUF_initialised, &v->vcpu_flags) )
3393 ASSERT(v->arch.cr3 == 0);
3394 return;
3397 ////
3398 //// vcpu->arch.guest_vtable
3399 ////
3400 #if GUEST_PAGING_LEVELS == 4
3401 if ( shadow_mode_external(d) || shadow_mode_translate(d) )
3403 if ( v->arch.guest_vtable )
3404 sh_unmap_domain_page_global(v->arch.guest_vtable);
3405 v->arch.guest_vtable = sh_map_domain_page_global(gmfn);
3407 else
3408 v->arch.guest_vtable = __linear_l4_table;
3409 #elif GUEST_PAGING_LEVELS == 3
3410 if ( v->arch.guest_vtable )
3411 sh_unmap_domain_page_global(v->arch.guest_vtable);
3412 if ( shadow_mode_external(d) )
3414 if ( shadow_vcpu_mode_translate(v) )
3415 /* Paging enabled: find where in the page the l3 table is */
3416 guest_idx = guest_index((void *)hvm_get_guest_ctrl_reg(v, 3));
3417 else
3418 /* Paging disabled: l3 is at the start of a page (in the p2m) */
3419 guest_idx = 0;
3421 // Ignore the low 2 bits of guest_idx -- they are really just
3422 // cache control.
3423 guest_idx &= ~3;
3425 // XXX - why does this need a global map?
3426 v->arch.guest_vtable =
3427 (guest_l3e_t *)sh_map_domain_page_global(gmfn) + guest_idx;
3429 else
3430 v->arch.guest_vtable = sh_map_domain_page_global(gmfn);
3431 #elif GUEST_PAGING_LEVELS == 2
3432 if ( shadow_mode_external(d) || shadow_mode_translate(d) )
3434 if ( v->arch.guest_vtable )
3435 sh_unmap_domain_page_global(v->arch.guest_vtable);
3436 v->arch.guest_vtable = sh_map_domain_page_global(gmfn);
3438 else
3439 v->arch.guest_vtable = __linear_l2_table;
3440 #else
3441 #error this should never happen
3442 #endif
3444 #if 0
3445 printk("%s %s %d gmfn=%05lx guest_vtable=%p\n",
3446 __func__, __FILE__, __LINE__, gmfn, v->arch.guest_vtable);
3447 #endif
3449 ////
3450 //// vcpu->arch.shadow_table[]
3451 ////
3453 /* We revoke write access to the new guest toplevel page(s) before we
3454 * replace the old shadow pagetable(s), so that we can safely use the
3455 * (old) shadow linear maps in the writeable mapping heuristics. */
3456 #if GUEST_PAGING_LEVELS == 2
3457 if ( shadow_remove_write_access(v, gmfn, 2, 0) != 0 )
3458 flush_tlb_mask(v->domain->domain_dirty_cpumask);
3459 sh_set_toplevel_shadow(v, 0, gmfn, SH_type_l2_shadow);
3460 #elif GUEST_PAGING_LEVELS == 3
3461 /* PAE guests have four shadow_table entries, based on the
3462 * current values of the guest's four l3es. */
3464 int i, flush = 0;
3465 gfn_t gl2gfn;
3466 mfn_t gl2mfn;
3467 guest_l3e_t *gl3e = (guest_l3e_t*)v->arch.guest_vtable;
3468 /* First, make all four entries read-only. */
3469 for ( i = 0; i < 4; i++ )
3471 if ( guest_l3e_get_flags(gl3e[i]) & _PAGE_PRESENT )
3473 gl2gfn = guest_l3e_get_gfn(gl3e[i]);
3474 gl2mfn = vcpu_gfn_to_mfn(v, gl2gfn);
3475 flush |= shadow_remove_write_access(v, gl2mfn, 2, 0);
3478 if ( flush )
3479 flush_tlb_mask(v->domain->domain_dirty_cpumask);
3480 /* Now install the new shadows. */
3481 for ( i = 0; i < 4; i++ )
3483 if ( guest_l3e_get_flags(gl3e[i]) & _PAGE_PRESENT )
3485 gl2gfn = guest_l3e_get_gfn(gl3e[i]);
3486 gl2mfn = vcpu_gfn_to_mfn(v, gl2gfn);
3487 sh_set_toplevel_shadow(v, i, gl2mfn, (i == 3)
3488 ? SH_type_l2h_shadow
3489 : SH_type_l2_shadow);
3493 #elif GUEST_PAGING_LEVELS == 4
3494 if ( shadow_remove_write_access(v, gmfn, 4, 0) != 0 )
3495 flush_tlb_mask(v->domain->domain_dirty_cpumask);
3496 sh_set_toplevel_shadow(v, 0, gmfn, SH_type_l4_shadow);
3497 #else
3498 #error This should never happen
3499 #endif
3501 #if (CONFIG_PAGING_LEVELS == 3) && (GUEST_PAGING_LEVELS == 3)
3502 #endif
3504 ///
3505 /// v->arch.shadow.l3table
3506 ///
3507 #if SHADOW_PAGING_LEVELS == 3
3509 mfn_t smfn;
3510 int i;
3511 for ( i = 0; i < 4; i++ )
3513 #if GUEST_PAGING_LEVELS == 2
3514 /* 2-on-3: make a PAE l3 that points at the four-page l2 */
3515 smfn = _mfn(pagetable_get_pfn(v->arch.shadow_table[0]) + i);
3516 #else
3517 /* 3-on-3: make a PAE l3 that points at the four l2 pages */
3518 smfn = pagetable_get_mfn(v->arch.shadow_table[i]);
3519 #endif
3520 v->arch.shadow.l3table[i] =
3521 (mfn_x(smfn) == 0)
3522 ? shadow_l3e_empty()
3523 : shadow_l3e_from_mfn(smfn, _PAGE_PRESENT);
3526 #endif /* SHADOW_PAGING_LEVELS == 3 */
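/* Note on the 2-on-3 case above: the l2 shadow of a 2-level guest is
 * allocated as four consecutive pages (the "four-page l2"), so adding i
 * to the pfn of shadow_table[0] selects the l2 page that covers the
 * i'th gigabyte of the guest's 4GB address space. */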
3529 ///
3530 /// v->arch.cr3
3531 ///
3532 if ( shadow_mode_external(d) )
3534 make_cr3(v, pagetable_get_pfn(v->arch.monitor_table));
3536 else // not shadow_mode_external...
3538 /* PV guests are only supported when guest, shadow and config paging levels all match */
3539 BUG_ON(GUEST_PAGING_LEVELS != SHADOW_PAGING_LEVELS);
3540 #if SHADOW_PAGING_LEVELS == 3
3541 /* 2-on-3 or 3-on-3: Use the PAE shadow l3 table we just fabricated.
3542 * Don't use make_cr3 here: (a) we already know the table is below 4GB,
3543 * and (b) it's not necessarily page-aligned, whereas make_cr3 takes a pfn */
3544 ASSERT(virt_to_maddr(&v->arch.shadow.l3table) <= 0xffffffe0ULL);
3545 v->arch.cr3 = virt_to_maddr(&v->arch.shadow.l3table);
3546 #else
3547 /* 2-on-2 or 4-on-4: Just use the shadow top-level directly */
3548 make_cr3(v, pagetable_get_pfn(v->arch.shadow_table[0]));
3549 #endif
3553 ///
3554 /// v->arch.hvm_vcpu.hw_cr3
3555 ///
3556 if ( shadow_mode_external(d) )
3558 ASSERT(is_hvm_domain(d));
3559 #if SHADOW_PAGING_LEVELS == 3
3560 /* 2-on-3 or 3-on-3: Use the PAE shadow l3 table we just fabricated */
3561 v->arch.hvm_vcpu.hw_cr3 = virt_to_maddr(&v->arch.shadow.l3table);
3562 #else
3563 /* 2-on-2 or 4-on-4: Just use the shadow top-level directly */
3564 v->arch.hvm_vcpu.hw_cr3 = pagetable_get_paddr(v->arch.shadow_table[0]);
3565 #endif
3568 /* Fix up the linear pagetable mappings */
3569 sh_update_linear_entries(v);
3573 /**************************************************************************/
3574 /* Functions to revoke guest rights */
3576 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
3577 static int sh_guess_wrmap(struct vcpu *v, unsigned long vaddr, mfn_t gmfn)
3578 /* Look up this vaddr in the current shadow and see if it's a writeable
3579 * mapping of this gmfn. If so, remove it. Returns 1 if it worked. */
3581 shadow_l1e_t sl1e, *sl1p;
3582 shadow_l2e_t *sl2p;
3583 #if SHADOW_PAGING_LEVELS >= 3
3584 shadow_l3e_t *sl3p;
3585 #if SHADOW_PAGING_LEVELS >= 4
3586 shadow_l4e_t *sl4p;
3587 #endif
3588 #endif
3589 mfn_t sl1mfn;
3590 int r;
3592 /* Carefully look in the shadow linear map for the l1e we expect */
3593 #if SHADOW_PAGING_LEVELS >= 4
3594 sl4p = sh_linear_l4_table(v) + shadow_l4_linear_offset(vaddr);
3595 if ( !(shadow_l4e_get_flags(*sl4p) & _PAGE_PRESENT) )
3596 return 0;
3597 sl3p = sh_linear_l3_table(v) + shadow_l3_linear_offset(vaddr);
3598 if ( !(shadow_l3e_get_flags(*sl3p) & _PAGE_PRESENT) )
3599 return 0;
3600 #elif SHADOW_PAGING_LEVELS == 3
3601 sl3p = ((shadow_l3e_t *) v->arch.shadow.l3table)
3602 + shadow_l3_linear_offset(vaddr);
3603 if ( !(shadow_l3e_get_flags(*sl3p) & _PAGE_PRESENT) )
3604 return 0;
3605 #endif
3606 sl2p = sh_linear_l2_table(v) + shadow_l2_linear_offset(vaddr);
3607 if ( !(shadow_l2e_get_flags(*sl2p) & _PAGE_PRESENT) )
3608 return 0;
3609 sl1p = sh_linear_l1_table(v) + shadow_l1_linear_offset(vaddr);
3610 sl1e = *sl1p;
3611 if ( ((shadow_l1e_get_flags(sl1e) & (_PAGE_PRESENT|_PAGE_RW))
3612 != (_PAGE_PRESENT|_PAGE_RW))
3613 || (mfn_x(shadow_l1e_get_mfn(sl1e)) != mfn_x(gmfn)) )
3614 return 0;
3616 /* Found it! Need to remove its write permissions. */
3617 sl1mfn = shadow_l2e_get_mfn(*sl2p);
3618 sl1e = shadow_l1e_remove_flags(sl1e, _PAGE_RW);
3619 r = shadow_set_l1e(v, sl1p, sl1e, sl1mfn);
3620 ASSERT( !(r & SHADOW_SET_ERROR) );
3621 return 1;
3623 #endif
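/* A minimal usage sketch for the heuristic above (not built): a caller
 * with a plausible guess at the virtual address of the writeable mapping
 * can try sh_guess_wrmap() before falling back to a full scan of the
 * shadow l1 tables.  'vaddr_hint' is a hypothetical variable; the real
 * caller lives in the common shadow code. */
#if 0
    if ( sh_guess_wrmap(v, vaddr_hint, gmfn) )
        return;                  /* the writeable mapping is already gone */
    /* ...otherwise fall back to brute-force removal... */
#endif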
3625 int sh_remove_write_access(struct vcpu *v, mfn_t sl1mfn, mfn_t readonly_mfn)
3626 /* Excises all writeable mappings to readonly_mfn from this l1 shadow table */
3628 shadow_l1e_t *sl1e;
3629 int done = 0;
3630 int flags;
3631 mfn_t base_sl1mfn = sl1mfn; /* Because sl1mfn changes in the foreach */
3633 SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, done,
3635 flags = shadow_l1e_get_flags(*sl1e);
3636 if ( (flags & _PAGE_PRESENT)
3637 && (flags & _PAGE_RW)
3638 && (mfn_x(shadow_l1e_get_mfn(*sl1e)) == mfn_x(readonly_mfn)) )
3640 shadow_l1e_t ro_sl1e = shadow_l1e_remove_flags(*sl1e, _PAGE_RW);
3641 (void) shadow_set_l1e(v, sl1e, ro_sl1e, sl1mfn);
3642 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
3643 /* Remember the last shadow in which we shot down a writeable mapping */
3644 v->arch.shadow.last_writeable_pte_smfn = mfn_x(base_sl1mfn);
3645 #endif
3646 if ( (mfn_to_page(readonly_mfn)->u.inuse.type_info
3647 & PGT_count_mask) == 0 )
3648 /* This breaks us cleanly out of the FOREACH macro */
3649 done = 1;
3651 });
3652 return done;
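/* For reference, a rough sketch of what the SHADOW_FOREACH_L1E() loop
 * above amounts to; the real macro (defined in the shadow headers) also
 * steps an optional guest-entry pointer in parallel and handles details
 * this sketch glosses over. */
#if 0
    {
        shadow_l1e_t *table = sh_map_domain_page(sl1mfn);
        int i;
        for ( i = 0; i < SHADOW_L1_PAGETABLE_ENTRIES && !done; i++ )
        {
            sl1e = &table[i];
            /* ...loop body supplied by the caller... */
        }
        sh_unmap_domain_page(table);
    }
#endif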
3656 int sh_remove_all_mappings(struct vcpu *v, mfn_t sl1mfn, mfn_t target_mfn)
3657 /* Excises all mappings of the guest frame target_mfn from this shadow l1 table */
3659 shadow_l1e_t *sl1e;
3660 int done = 0;
3661 int flags;
3663 SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, done,
3665 flags = shadow_l1e_get_flags(*sl1e);
3666 if ( (flags & _PAGE_PRESENT)
3667 && (mfn_x(shadow_l1e_get_mfn(*sl1e)) == mfn_x(target_mfn)) )
3669 (void) shadow_set_l1e(v, sl1e, shadow_l1e_empty(), sl1mfn);
3670 if ( (mfn_to_page(target_mfn)->count_info & PGC_count_mask) == 0 )
3671 /* This breaks us cleanly out of the FOREACH macro */
3672 done = 1;
3674 });
3675 return done;
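/* Note the different early-exit tests in the two functions above:
 * sh_remove_write_access() watches the page's *type* count
 * (PGT_count_mask), which for a writable page counts its writeable
 * mappings, while sh_remove_all_mappings() watches the general reference
 * count (PGC_count_mask), since it is removing every mapping. */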
3678 /**************************************************************************/
3679 /* Functions to excise all pointers to shadows from higher-level shadows. */
3681 void sh_clear_shadow_entry(struct vcpu *v, void *ep, mfn_t smfn)
3682 /* Blank out a single shadow entry */
3684 switch ( mfn_to_shadow_page(smfn)->type )
3686 case SH_type_l1_shadow:
3687 (void) shadow_set_l1e(v, ep, shadow_l1e_empty(), smfn); break;
3688 case SH_type_l2_shadow:
3689 #if GUEST_PAGING_LEVELS == 3
3690 case SH_type_l2h_shadow:
3691 #endif
3692 (void) shadow_set_l2e(v, ep, shadow_l2e_empty(), smfn); break;
3693 #if GUEST_PAGING_LEVELS >= 4
3694 case SH_type_l3_shadow:
3695 (void) shadow_set_l3e(v, ep, shadow_l3e_empty(), smfn); break;
3696 case SH_type_l4_shadow:
3697 (void) shadow_set_l4e(v, ep, shadow_l4e_empty(), smfn); break;
3698 #endif
3699 default: BUG(); /* Called with the wrong kind of shadow. */
3703 int sh_remove_l1_shadow(struct vcpu *v, mfn_t sl2mfn, mfn_t sl1mfn)
3704 /* Remove all mappings of this l1 shadow from this l2 shadow */
3706 shadow_l2e_t *sl2e;
3707 int done = 0;
3708 int flags;
3709 #if GUEST_PAGING_LEVELS != 4
3710 int xen_mappings = !shadow_mode_external(v->domain);
3711 #endif
3713 SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, done, xen_mappings,
3715 flags = shadow_l2e_get_flags(*sl2e);
3716 if ( (flags & _PAGE_PRESENT)
3717 && (mfn_x(shadow_l2e_get_mfn(*sl2e)) == mfn_x(sl1mfn)) )
3719 (void) shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn);
3720 if ( mfn_to_shadow_page(sl1mfn)->type == 0 )
3721 /* This breaks us cleanly out of the FOREACH macro */
3722 done = 1;
3724 });
3725 return done;
3728 #if GUEST_PAGING_LEVELS >= 4
3729 int sh_remove_l2_shadow(struct vcpu *v, mfn_t sl3mfn, mfn_t sl2mfn)
3730 /* Remove all mappings of this l2 shadow from this l3 shadow */
3732 shadow_l3e_t *sl3e;
3733 int done = 0;
3734 int flags;
3736 SHADOW_FOREACH_L3E(sl3mfn, sl3e, 0, done,
3738 flags = shadow_l3e_get_flags(*sl3e);
3739 if ( (flags & _PAGE_PRESENT)
3740 && (mfn_x(shadow_l3e_get_mfn(*sl3e)) == mfn_x(sl2mfn)) )
3742 (void) shadow_set_l3e(v, sl3e, shadow_l3e_empty(), sl3mfn);
3743 if ( mfn_to_shadow_page(sl2mfn)->type == 0 )
3744 /* This breaks us cleanly out of the FOREACH macro */
3745 done = 1;
3747 });
3748 return done;
3751 int sh_remove_l3_shadow(struct vcpu *v, mfn_t sl4mfn, mfn_t sl3mfn)
3752 /* Remove all mappings of this l3 shadow from this l4 shadow */
3754 shadow_l4e_t *sl4e;
3755 int done = 0;
3756 int flags, xen_mappings = !shadow_mode_external(v->domain);
3758 SHADOW_FOREACH_L4E(sl4mfn, sl4e, 0, done, xen_mappings,
3760 flags = shadow_l4e_get_flags(*sl4e);
3761 if ( (flags & _PAGE_PRESENT)
3762 && (mfn_x(shadow_l4e_get_mfn(*sl4e)) == mfn_x(sl3mfn)) )
3764 (void) shadow_set_l4e(v, sl4e, shadow_l4e_empty(), sl4mfn);
3765 if ( mfn_to_shadow_page(sl3mfn)->type == 0 )
3766 /* This breaks us cleanly out of the FOREACH macro */
3767 done = 1;
3769 });
3770 return done;
3772 #endif /* 64bit guest */
3774 /**************************************************************************/
3775 /* Handling HVM guest writes to pagetables */
3777 /* Check that the current guest context is allowed to perform this write.
3778 * Returns a mapped pointer to write to, and the mfn it's on,
3779 * or NULL for error. */
3780 static inline void * emulate_map_dest(struct vcpu *v,
3781 unsigned long vaddr,
3782 struct sh_emulate_ctxt *sh_ctxt,
3783 mfn_t *mfnp)
3785 walk_t gw;
3786 u32 flags, errcode;
3787 gfn_t gfn;
3788 mfn_t mfn;
3790 guest_walk_tables(v, vaddr, &gw, 1);
3791 flags = accumulate_guest_flags(v, &gw);
3792 gfn = guest_l1e_get_gfn(gw.eff_l1e);
3793 mfn = vcpu_gfn_to_mfn(v, gfn);
3794 sh_audit_gw(v, &gw);
3795 unmap_walk(v, &gw);
3797 if ( !(flags & _PAGE_PRESENT) )
3799 errcode = 0;
3800 goto page_fault;
3803 if ( !(flags & _PAGE_RW) ||
3804 (!(flags & _PAGE_USER) && ring_3(sh_ctxt->ctxt.regs)) )
3806 errcode = PFEC_page_present;
3807 goto page_fault;
3810 /* Attempted a write to a bad gfn? This should never happen:
3811 * after all, we're here because this write is to a page table. */
3812 BUG_ON(!mfn_valid(mfn));
3814 ASSERT(sh_mfn_is_a_page_table(mfn));
3815 *mfnp = mfn;
3816 return sh_map_domain_page(mfn) + (vaddr & ~PAGE_MASK);
3818 page_fault:
3819 errcode |= PFEC_write_access;
3820 if ( is_hvm_vcpu(v) )
3821 hvm_inject_exception(TRAP_page_fault, errcode, vaddr);
3822 else
3823 propagate_page_fault(vaddr, errcode);
3824 return NULL;
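/* The pointer handed back by emulate_map_dest() is a transient mapping of
 * a single page; the callers below release it with sh_unmap_domain_page()
 * once the emulated write has been validated against the shadows. */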
3827 int
3828 sh_x86_emulate_write(struct vcpu *v, unsigned long vaddr, void *src,
3829 u32 bytes, struct sh_emulate_ctxt *sh_ctxt)
3831 mfn_t mfn;
3832 void *addr;
3834 if ( vaddr & (bytes-1) )
3835 return X86EMUL_UNHANDLEABLE;
3837 ASSERT(shadow_lock_is_acquired(v->domain));
3838 ASSERT(((vaddr & ~PAGE_MASK) + bytes) <= PAGE_SIZE);
3840 if ( (addr = emulate_map_dest(v, vaddr, sh_ctxt, &mfn)) == NULL )
3841 return X86EMUL_PROPAGATE_FAULT;
3843 memcpy(addr, src, bytes);
3844 shadow_validate_guest_pt_write(v, mfn, addr, bytes);
3846 /* If we are writing zeros to this page, the guest is probably tearing the pagetable down, so we might want to unshadow it early */
3847 if ( likely(bytes >= 4) && (*(u32 *)addr == 0) )
3848 check_for_early_unshadow(v, mfn);
3850 sh_unmap_domain_page(addr);
3851 shadow_audit_tables(v);
3852 return X86EMUL_CONTINUE;
3855 int
3856 sh_x86_emulate_cmpxchg(struct vcpu *v, unsigned long vaddr,
3857 unsigned long old, unsigned long new,
3858 unsigned int bytes, struct sh_emulate_ctxt *sh_ctxt)
3860 mfn_t mfn;
3861 void *addr;
3862 unsigned long prev;
3863 int rv = X86EMUL_CONTINUE;
3865 ASSERT(shadow_lock_is_acquired(v->domain));
3866 ASSERT(bytes <= sizeof(unsigned long));
3868 if ( vaddr & (bytes-1) )
3869 return X86EMUL_UNHANDLEABLE;
3871 if ( (addr = emulate_map_dest(v, vaddr, sh_ctxt, &mfn)) == NULL )
3872 return X86EMUL_PROPAGATE_FAULT;
3874 switch ( bytes )
3876 case 1: prev = cmpxchg(((u8 *)addr), old, new); break;
3877 case 2: prev = cmpxchg(((u16 *)addr), old, new); break;
3878 case 4: prev = cmpxchg(((u32 *)addr), old, new); break;
3879 case 8: prev = cmpxchg(((u64 *)addr), old, new); break;
3880 default:
3881 SHADOW_PRINTK("cmpxchg of size %i is not supported\n", bytes);
3882 prev = ~old;
3885 if ( prev == old )
3886 shadow_validate_guest_pt_write(v, mfn, addr, bytes);
3887 else
3888 rv = X86EMUL_CMPXCHG_FAILED;
3890 SHADOW_DEBUG(EMULATE, "va %#lx was %#lx expected %#lx"
3891 " wanted %#lx now %#lx bytes %u\n",
3892 vaddr, prev, old, new, *(unsigned long *)addr, bytes);
3894 /* If we are writing zeros to this page, might want to unshadow */
3895 if ( likely(bytes >= 4) && (*(u32 *)addr == 0) )
3896 check_for_early_unshadow(v, mfn);
3898 sh_unmap_domain_page(addr);
3899 shadow_audit_tables(v);
3900 return rv;
3903 int
3904 sh_x86_emulate_cmpxchg8b(struct vcpu *v, unsigned long vaddr,
3905 unsigned long old_lo, unsigned long old_hi,
3906 unsigned long new_lo, unsigned long new_hi,
3907 struct sh_emulate_ctxt *sh_ctxt)
3909 mfn_t mfn;
3910 void *addr;
3911 u64 old, new, prev;
3912 int rv = X86EMUL_CONTINUE;
3914 ASSERT(shadow_lock_is_acquired(v->domain));
3916 if ( vaddr & 7 )
3917 return X86EMUL_UNHANDLEABLE;
3919 if ( (addr = emulate_map_dest(v, vaddr, sh_ctxt, &mfn)) == NULL )
3920 return X86EMUL_PROPAGATE_FAULT;
3922 old = (((u64) old_hi) << 32) | (u64) old_lo;
3923 new = (((u64) new_hi) << 32) | (u64) new_lo;
3924 prev = cmpxchg(((u64 *)addr), old, new);
3926 if ( prev == old )
3927 shadow_validate_guest_pt_write(v, mfn, addr, 8);
3928 else
3929 rv = X86EMUL_CMPXCHG_FAILED;
3931 /* If we are writing zeros to this page, might want to unshadow */
3932 if ( *(u32 *)addr == 0 )
3933 check_for_early_unshadow(v, mfn);
3935 sh_unmap_domain_page(addr);
3936 shadow_audit_tables(v);
3937 return rv;
3941 /**************************************************************************/
3942 /* Audit tools */
3944 #if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES
3946 #define AUDIT_FAIL(_level, _fmt, _a...) do { \
3947 printk("Shadow %u-on-%u audit failed at level %i, index %i\n" \
3948 "gl" #_level "mfn = %" SH_PRI_mfn \
3949 " sl" #_level "mfn = %" SH_PRI_mfn \
3950 " &gl" #_level "e = %p &sl" #_level "e = %p" \
3951 " gl" #_level "e = %" SH_PRI_gpte \
3952 " sl" #_level "e = %" SH_PRI_pte "\nError: " _fmt "\n", \
3953 GUEST_PAGING_LEVELS, SHADOW_PAGING_LEVELS, \
3954 _level, guest_index(gl ## _level ## e), \
3955 mfn_x(gl ## _level ## mfn), mfn_x(sl ## _level ## mfn), \
3956 gl ## _level ## e, sl ## _level ## e, \
3957 gl ## _level ## e->l ## _level, sl ## _level ## e->l ## _level, \
3958 ##_a); \
3959 BUG(); \
3960 done = 1; \
3961 } while (0)
3964 static char * sh_audit_flags(struct vcpu *v, int level,
3965 int gflags, int sflags)
3966 /* Common code for auditing flag bits */
3968 if ( (sflags & _PAGE_PRESENT) && !(gflags & _PAGE_PRESENT) )
3969 return "shadow is present but guest is not present";
3970 if ( (sflags & _PAGE_GLOBAL) && !is_hvm_vcpu(v) )
3971 return "global bit set in PV shadow";
3972 if ( level == 2 && (sflags & _PAGE_PSE) )
3973 return "PS bit set in shadow";
3974 #if SHADOW_PAGING_LEVELS == 3
3975 if ( level == 3 ) return NULL; /* All the other bits are blank in a PAE l3e */
3976 #endif
3977 if ( (sflags & _PAGE_PRESENT) && !(gflags & _PAGE_ACCESSED) )
3978 return "accessed bit not propagated";
3979 if ( (level == 1 || (level == 2 && (gflags & _PAGE_PSE)))
3980 && ((sflags & _PAGE_RW) && !(gflags & _PAGE_DIRTY)) )
3981 return "dirty bit not propagated";
3982 if ( (sflags & _PAGE_USER) != (gflags & _PAGE_USER) )
3983 return "user/supervisor bit does not match";
3984 if ( (sflags & _PAGE_NX_BIT) != (gflags & _PAGE_NX_BIT) )
3985 return "NX bit does not match";
3986 if ( (sflags & _PAGE_RW) && !(gflags & _PAGE_RW) )
3987 return "shadow grants write access but guest does not";
3988 return NULL;
3991 static inline mfn_t
3992 audit_gfn_to_mfn(struct vcpu *v, gfn_t gfn, mfn_t gmfn)
3993 /* Convert this gfn to an mfn in the manner appropriate for the
3994 * guest pagetable it's used in (gmfn) */
3996 if ( !shadow_mode_translate(v->domain) )
3997 return _mfn(gfn_x(gfn));
3999 if ( (mfn_to_page(gmfn)->u.inuse.type_info & PGT_type_mask)
4000 != PGT_writable_page )
4001 return _mfn(gfn_x(gfn)); /* This is a paging-disabled shadow */
4002 else
4003 return sh_gfn_to_mfn(v->domain, gfn_x(gfn));
4007 int sh_audit_l1_table(struct vcpu *v, mfn_t sl1mfn, mfn_t x)
4009 guest_l1e_t *gl1e, *gp;
4010 shadow_l1e_t *sl1e;
4011 mfn_t mfn, gmfn, gl1mfn;
4012 gfn_t gfn;
4013 char *s;
4014 int done = 0;
4016 /* Follow the backpointer */
4017 gl1mfn = _mfn(mfn_to_shadow_page(sl1mfn)->backpointer);
4018 gl1e = gp = sh_map_domain_page(gl1mfn);
4019 SHADOW_FOREACH_L1E(sl1mfn, sl1e, &gl1e, done, {
4021 if ( sh_l1e_is_magic(*sl1e) )
4023 #if (SHADOW_OPTIMIZATIONS & SHOPT_FAST_FAULT_PATH) && SHADOW_PAGING_LEVELS > 2
4024 if ( sh_l1e_is_gnp(*sl1e) )
4026 if ( guest_l1e_get_flags(*gl1e) & _PAGE_PRESENT )
4027 AUDIT_FAIL(1, "shadow is GNP magic but guest is present");
4029 else
4031 ASSERT(sh_l1e_is_mmio(*sl1e));
4032 gfn = sh_l1e_mmio_get_gfn(*sl1e);
4033 if ( gfn_x(gfn) != gfn_x(guest_l1e_get_gfn(*gl1e)) )
4034 AUDIT_FAIL(1, "shadow MMIO gfn is %" SH_PRI_gfn
4035 " but guest gfn is %" SH_PRI_gfn,
4036 gfn_x(gfn),
4037 gfn_x(guest_l1e_get_gfn(*gl1e)));
4039 #endif
4041 else
4043 s = sh_audit_flags(v, 1, guest_l1e_get_flags(*gl1e),
4044 shadow_l1e_get_flags(*sl1e));
4045 if ( s ) AUDIT_FAIL(1, "%s", s);
4047 if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS )
4049 gfn = guest_l1e_get_gfn(*gl1e);
4050 mfn = shadow_l1e_get_mfn(*sl1e);
4051 gmfn = audit_gfn_to_mfn(v, gfn, gl1mfn);
4052 if ( mfn_x(gmfn) != mfn_x(mfn) )
4053 AUDIT_FAIL(1, "bad translation: gfn %" SH_PRI_gfn
4054 " --> %" SH_PRI_mfn " != mfn %" SH_PRI_mfn,
4055 gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn));
4058 });
4059 sh_unmap_domain_page(gp);
4060 return done;
4063 int sh_audit_fl1_table(struct vcpu *v, mfn_t sl1mfn, mfn_t x)
4065 guest_l1e_t *gl1e, e;
4066 shadow_l1e_t *sl1e;
4067 mfn_t gl1mfn = _mfn(INVALID_MFN);
4068 int f;
4069 int done = 0;
4071 /* fl1 has no useful backpointer: all we can check are flags */
4072 e = guest_l1e_from_gfn(_gfn(0), 0); gl1e = &e; /* Needed for macro */
4073 SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, done, {
4074 f = shadow_l1e_get_flags(*sl1e);
4075 f &= ~(_PAGE_AVAIL0|_PAGE_AVAIL1|_PAGE_AVAIL2);
4076 if ( !(f == 0
4077 || f == (_PAGE_PRESENT|_PAGE_USER|_PAGE_RW|
4078 _PAGE_ACCESSED|_PAGE_DIRTY)
4079 || f == (_PAGE_PRESENT|_PAGE_USER|_PAGE_ACCESSED|_PAGE_DIRTY)
4080 || sh_l1e_is_magic(*sl1e)) )
4081 AUDIT_FAIL(1, "fl1e has bad flags");
4082 });
4083 return 0;
4086 int sh_audit_l2_table(struct vcpu *v, mfn_t sl2mfn, mfn_t x)
4088 guest_l2e_t *gl2e, *gp;
4089 shadow_l2e_t *sl2e;
4090 mfn_t mfn, gmfn, gl2mfn;
4091 gfn_t gfn;
4092 char *s;
4093 int done = 0;
4094 #if GUEST_PAGING_LEVELS != 4
4095 int xen_mappings = !shadow_mode_external(v->domain);
4096 #endif
4098 /* Follow the backpointer */
4099 gl2mfn = _mfn(mfn_to_shadow_page(sl2mfn)->backpointer);
4100 gl2e = gp = sh_map_domain_page(gl2mfn);
4101 SHADOW_FOREACH_L2E(sl2mfn, sl2e, &gl2e, done, xen_mappings, {
4103 s = sh_audit_flags(v, 2, guest_l2e_get_flags(*gl2e),
4104 shadow_l2e_get_flags(*sl2e));
4105 if ( s ) AUDIT_FAIL(2, "%s", s);
4107 if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS )
4109 gfn = guest_l2e_get_gfn(*gl2e);
4110 mfn = shadow_l2e_get_mfn(*sl2e);
4111 gmfn = (guest_l2e_get_flags(*gl2e) & _PAGE_PSE)
4112 ? get_fl1_shadow_status(v, gfn)
4113 : get_shadow_status(v, audit_gfn_to_mfn(v, gfn, gl2mfn),
4114 SH_type_l1_shadow);
4115 if ( mfn_x(gmfn) != mfn_x(mfn) )
4116 AUDIT_FAIL(2, "bad translation: gfn %" SH_PRI_gfn
4117 " (--> %" SH_PRI_mfn ")"
4118 " --> %" SH_PRI_mfn " != mfn %" SH_PRI_mfn,
4119 gfn_x(gfn),
4120 (guest_l2e_get_flags(*gl2e) & _PAGE_PSE) ? 0
4121 : mfn_x(audit_gfn_to_mfn(v, gfn, gl2mfn)),
4122 mfn_x(gmfn), mfn_x(mfn));
4124 });
4125 sh_unmap_domain_page(gp);
4126 return 0;
4129 #if GUEST_PAGING_LEVELS >= 4
4130 int sh_audit_l3_table(struct vcpu *v, mfn_t sl3mfn, mfn_t x)
4132 guest_l3e_t *gl3e, *gp;
4133 shadow_l3e_t *sl3e;
4134 mfn_t mfn, gmfn, gl3mfn;
4135 gfn_t gfn;
4136 char *s;
4137 int done = 0;
4139 /* Follow the backpointer */
4140 gl3mfn = _mfn(mfn_to_shadow_page(sl3mfn)->backpointer);
4141 gl3e = gp = sh_map_domain_page(gl3mfn);
4142 SHADOW_FOREACH_L3E(sl3mfn, sl3e, &gl3e, done, {
4144 s = sh_audit_flags(v, 3, guest_l3e_get_flags(*gl3e),
4145 shadow_l3e_get_flags(*sl3e));
4146 if ( s ) AUDIT_FAIL(3, "%s", s);
4148 if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS )
4150 gfn = guest_l3e_get_gfn(*gl3e);
4151 mfn = shadow_l3e_get_mfn(*sl3e);
4152 gmfn = get_shadow_status(v, audit_gfn_to_mfn(v, gfn, gl3mfn),
4153 (GUEST_PAGING_LEVELS == 3
4154 && !shadow_mode_external(v->domain)
4155 && (guest_index(gl3e) % 4) == 3)
4156 ? SH_type_l2h_pae_shadow
4157 : SH_type_l2_shadow);
4158 if ( mfn_x(gmfn) != mfn_x(mfn) )
4159 AUDIT_FAIL(3, "bad translation: gfn %" SH_PRI_gfn
4160 " --> %" SH_PRI_mfn " != mfn %" SH_PRI_mfn,
4161 gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn));
4163 });
4164 sh_unmap_domain_page(gp);
4165 return 0;
4168 int sh_audit_l4_table(struct vcpu *v, mfn_t sl4mfn, mfn_t x)
4170 guest_l4e_t *gl4e, *gp;
4171 shadow_l4e_t *sl4e;
4172 mfn_t mfn, gmfn, gl4mfn;
4173 gfn_t gfn;
4174 char *s;
4175 int done = 0;
4176 int xen_mappings = !shadow_mode_external(v->domain);
4178 /* Follow the backpointer */
4179 gl4mfn = _mfn(mfn_to_shadow_page(sl4mfn)->backpointer);
4180 gl4e = gp = sh_map_domain_page(gl4mfn);
4181 SHADOW_FOREACH_L4E(sl4mfn, sl4e, &gl4e, done, xen_mappings,
4183 s = sh_audit_flags(v, 4, guest_l4e_get_flags(*gl4e),
4184 shadow_l4e_get_flags(*sl4e));
4185 if ( s ) AUDIT_FAIL(4, "%s", s);
4187 if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS )
4189 gfn = guest_l4e_get_gfn(*gl4e);
4190 mfn = shadow_l4e_get_mfn(*sl4e);
4191 gmfn = get_shadow_status(v, audit_gfn_to_mfn(v, gfn, gl4mfn),
4192 SH_type_l3_shadow);
4193 if ( mfn_x(gmfn) != mfn_x(mfn) )
4194 AUDIT_FAIL(4, "bad translation: gfn %" SH_PRI_gfn
4195 " --> %" SH_PRI_mfn " != mfn %" SH_PRI_mfn,
4196 gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn));
4198 });
4199 sh_unmap_domain_page(gp);
4200 return 0;
4202 #endif /* GUEST_PAGING_LEVELS >= 4 */
4205 #undef AUDIT_FAIL
4207 #endif /* Audit code */
4209 /**************************************************************************/
4210 /* Entry points into this mode of the shadow code.
4211 * This will all be mangled by the preprocessor to uniquify everything. */
4212 struct shadow_paging_mode sh_paging_mode = {
4213 .page_fault = sh_page_fault,
4214 .invlpg = sh_invlpg,
4215 .gva_to_gpa = sh_gva_to_gpa,
4216 .gva_to_gfn = sh_gva_to_gfn,
4217 .update_cr3 = sh_update_cr3,
4218 .map_and_validate_gl1e = sh_map_and_validate_gl1e,
4219 .map_and_validate_gl2e = sh_map_and_validate_gl2e,
4220 .map_and_validate_gl2he = sh_map_and_validate_gl2he,
4221 .map_and_validate_gl3e = sh_map_and_validate_gl3e,
4222 .map_and_validate_gl4e = sh_map_and_validate_gl4e,
4223 .detach_old_tables = sh_detach_old_tables,
4224 .x86_emulate_write = sh_x86_emulate_write,
4225 .x86_emulate_cmpxchg = sh_x86_emulate_cmpxchg,
4226 .x86_emulate_cmpxchg8b = sh_x86_emulate_cmpxchg8b,
4227 .make_monitor_table = sh_make_monitor_table,
4228 .destroy_monitor_table = sh_destroy_monitor_table,
4229 .guest_map_l1e = sh_guest_map_l1e,
4230 .guest_get_eff_l1e = sh_guest_get_eff_l1e,
4231 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
4232 .guess_wrmap = sh_guess_wrmap,
4233 #endif
4234 .guest_levels = GUEST_PAGING_LEVELS,
4235 .shadow_levels = SHADOW_PAGING_LEVELS,
4236 };
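/* A rough illustration of the uniquifying mentioned above (the real macro
 * lives in the shadow-private headers): each symbol in this file has the
 * shadow and guest level counts token-pasted onto its name so that the
 * several per-level builds of multi.c can be linked into one hypervisor
 * without clashing.  EXAMPLE_INTERNAL_NAME is a hypothetical stand-in: */
#if 0
#define EXAMPLE_INTERNAL_NAME(name, shadow_levels, guest_levels) \
    name ## __shadow_ ## shadow_levels ## _guest_ ## guest_levels
/* e.g. in a 3-on-3 build, sh_page_fault would become something like
 * sh_page_fault__shadow_3_guest_3 */
#endif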
4238 /*
4239 * Local variables:
4240 * mode: C
4241 * c-set-style: "BSD"
4242 * c-basic-offset: 4
4243 * indent-tabs-mode: nil
4244 * End:
4245 */