ia64/xen-unstable: xen/arch/x86/mm/shadow/multi.c @ 15863:4633e9604da9

[HVM] Add type information to the p2m map.
This is a base for memory tricks like page sharing, copy-on-write, lazy
allocation etc. It should also make pass-through MMIO easier to
implement in the p2m.
Signed-off-by: Tim Deegan <Tim.Deegan@xensource.com>

author    Tim Deegan <Tim.Deegan@xensource.com>
date      Mon Sep 10 14:42:30 2007 +0100
parents   a53aaea4c698
children  6146bea9e67f
1 /******************************************************************************
2 * arch/x86/mm/shadow/multi.c
3 *
4 * Simple, mostly-synchronous shadow page tables.
5 * Parts of this code are Copyright (c) 2006 by XenSource Inc.
6 * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
7 * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 */
24 #include <xen/config.h>
25 #include <xen/types.h>
26 #include <xen/mm.h>
27 #include <xen/trace.h>
28 #include <xen/sched.h>
29 #include <xen/perfc.h>
30 #include <xen/domain_page.h>
31 #include <asm/page.h>
32 #include <asm/current.h>
33 #include <asm/shadow.h>
34 #include <asm/flushtlb.h>
35 #include <asm/hvm/hvm.h>
36 #include "private.h"
37 #include "types.h"
39 /* THINGS TO DO LATER:
40 *
41 * TEARDOWN HEURISTICS
42 * Also: have a heuristic for when to destroy a previous paging-mode's
43 * shadows. When a guest is done with its start-of-day 32-bit tables
44 * and reuses the memory we want to drop those shadows. Start with
45 * shadows in a page in two modes as a hint, but beware of clever tricks
46 * like reusing a pagetable for both PAE and 64-bit during boot...
47 *
48 * PAE LINEAR MAPS
49 * Rework shadow_get_l*e() to have the option of using map_domain_page()
50 * instead of linear maps. Add appropriate unmap_l*e calls in the users.
51 * Then we can test the speed difference made by linear maps. If the
52 * map_domain_page() version is OK on PAE, we could maybe allow a lightweight
53 * l3-and-l2h-only shadow mode for PAE PV guests that would allow them
54 * to share l2h pages again.
55 *
56 * GUEST_WALK_TABLES TLB FLUSH COALESCE
57 * guest_walk_tables can do up to three remote TLB flushes as it walks to
58 * the first l1 of a new pagetable. Should coalesce the flushes to the end,
59 * and if we do flush, re-do the walk. If anything has changed, then
60 * pause all the other vcpus and do the walk *again*.
61 *
62 * WP DISABLED
63 * Consider how to implement having the WP bit of CR0 set to 0.
64 * Since we need to be able to cause write faults to pagetables, this might
65 * end up looking like not having the (guest) pagetables present at all in
66 * HVM guests...
67 *
68 * PSE disabled / PSE36
69 * We don't support any modes other than PSE enabled, PSE36 disabled.
70 * Neither of those would be hard to change, but we'd need to be able to
71 * deal with shadows made in one mode and used in another.
72 */
74 #define FETCH_TYPE_PREFETCH 1
75 #define FETCH_TYPE_DEMAND 2
76 #define FETCH_TYPE_WRITE 4
77 typedef enum {
78 ft_prefetch = FETCH_TYPE_PREFETCH,
79 ft_demand_read = FETCH_TYPE_DEMAND,
80 ft_demand_write = FETCH_TYPE_DEMAND | FETCH_TYPE_WRITE,
81 } fetch_type_t;
83 #ifdef DEBUG_TRACE_DUMP
84 static char *fetch_type_names[] = {
85 [ft_prefetch] "prefetch",
86 [ft_demand_read] "demand read",
87 [ft_demand_write] "demand write",
88 };
89 #endif
91 /**************************************************************************/
92 /* Hash table mapping from guest pagetables to shadows
93 *
94 * Normal case: maps the mfn of a guest page to the mfn of its shadow page.
95 * FL1's: maps the *gfn* of the start of a superpage to the mfn of a
96 * shadow L1 which maps its "splinters".
97 */
99 static inline mfn_t
100 get_fl1_shadow_status(struct vcpu *v, gfn_t gfn)
101 /* Look for FL1 shadows in the hash table */
102 {
103 mfn_t smfn = shadow_hash_lookup(v, gfn_x(gfn), SH_type_fl1_shadow);
104 return smfn;
105 }
107 static inline mfn_t
108 get_shadow_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type)
109 /* Look for shadows in the hash table */
110 {
111 mfn_t smfn = shadow_hash_lookup(v, mfn_x(gmfn), shadow_type);
112 perfc_incr(shadow_get_shadow_status);
113 return smfn;
114 }
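/* Illustrative sketch (not part of the original file): both kinds of
 * entry live in the same hash table but are keyed differently -- an
 * ordinary shadow by the guest frame's *mfn* plus shadow type, an FL1
 * shadow by the superpage's starting *gfn*.  Hypothetical lookups,
 * with gl1mfn and gl2e standing in for a caller's own variables:
 *
 *     smfn = get_shadow_status(v, gl1mfn, SH_type_l1_shadow);
 *     smfn = get_fl1_shadow_status(v, guest_l2e_get_gfn(*gl2e));
 */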
116 static inline void
117 set_fl1_shadow_status(struct vcpu *v, gfn_t gfn, mfn_t smfn)
118 /* Put an FL1 shadow into the hash table */
119 {
120 SHADOW_PRINTK("gfn=%"SH_PRI_gfn", type=%08x, smfn=%05lx\n",
121 gfn_x(gfn), SH_type_fl1_shadow, mfn_x(smfn));
123 shadow_hash_insert(v, gfn_x(gfn), SH_type_fl1_shadow, smfn);
124 }
126 static inline void
127 set_shadow_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type, mfn_t smfn)
128 /* Put a shadow into the hash table */
129 {
130 struct domain *d = v->domain;
131 int res;
133 SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx, type=%08x, smfn=%05lx\n",
134 d->domain_id, v->vcpu_id, mfn_x(gmfn),
135 shadow_type, mfn_x(smfn));
137 /* 32-on-64 PV guests don't own their l4 pages so can't get_page them */
138 if ( !is_pv_32on64_vcpu(v) || shadow_type != SH_type_l4_64_shadow )
139 {
140 res = get_page(mfn_to_page(gmfn), d);
141 ASSERT(res == 1);
142 }
144 shadow_hash_insert(v, mfn_x(gmfn), shadow_type, smfn);
145 }
147 static inline void
148 delete_fl1_shadow_status(struct vcpu *v, gfn_t gfn, mfn_t smfn)
149 /* Remove a shadow from the hash table */
150 {
151 SHADOW_PRINTK("gfn=%"SH_PRI_gfn", type=%08x, smfn=%05lx\n",
152 gfn_x(gfn), SH_type_fl1_shadow, mfn_x(smfn));
153 shadow_hash_delete(v, gfn_x(gfn), SH_type_fl1_shadow, smfn);
154 }
156 static inline void
157 delete_shadow_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type, mfn_t smfn)
158 /* Remove a shadow from the hash table */
159 {
160 SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx, type=%08x, smfn=%05lx\n",
161 v->domain->domain_id, v->vcpu_id,
162 mfn_x(gmfn), shadow_type, mfn_x(smfn));
163 shadow_hash_delete(v, mfn_x(gmfn), shadow_type, smfn);
164 /* 32-on-64 PV guests don't own their l4 pages; see set_shadow_status */
165 if ( !is_pv_32on64_vcpu(v) || shadow_type != SH_type_l4_64_shadow )
166 put_page(mfn_to_page(gmfn));
167 }
169 /**************************************************************************/
170 /* CPU feature support querying */
172 static inline int
173 guest_supports_superpages(struct vcpu *v)
174 {
175 /* The _PAGE_PSE bit must be honoured in HVM guests, whenever
176 * CR4.PSE is set or the guest is in PAE or long mode.
177 * It's also used in the dummy PT for vcpus with CR4.PG cleared. */
178 return (is_hvm_vcpu(v) &&
179 (GUEST_PAGING_LEVELS != 2
180 || !hvm_paging_enabled(v)
181 || (v->arch.hvm_vcpu.guest_cr[4] & X86_CR4_PSE)));
182 }
184 static inline int
185 guest_supports_nx(struct vcpu *v)
186 {
187 if ( GUEST_PAGING_LEVELS == 2 || !cpu_has_nx )
188 return 0;
189 if ( !is_hvm_vcpu(v) )
190 return 1;
191 return hvm_nx_enabled(v);
192 }
195 /**************************************************************************/
196 /* Functions for walking the guest page tables */
199 /* Walk the guest pagetables, filling the walk_t with what we see.
200 * Takes an uninitialised walk_t. The caller must call unmap_walk()
201 * on the walk_t before discarding it or calling guest_walk_tables again.
202 * If "guest_op" is non-zero, we are serving a genuine guest memory access,
203 * and must (a) be under the shadow lock, and (b) remove write access
204 * from any guest PT pages we see, as we will be using their contents to
205 * perform shadow updates.
206 * Returns 0 for success or non-zero if the guest pagetables are malformed.
207 * N.B. Finding a not-present entry does not cause a non-zero return code. */
208 static inline int
209 guest_walk_tables(struct vcpu *v, unsigned long va, walk_t *gw, int guest_op)
210 {
211 struct domain *d = v->domain;
212 p2m_type_t p2mt;
213 ASSERT(!guest_op || shadow_locked_by_me(d));
215 perfc_incr(shadow_guest_walk);
216 memset(gw, 0, sizeof(*gw));
217 gw->va = va;
219 #if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
220 #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
221 /* Get l4e from the top level table */
222 gw->l4mfn = pagetable_get_mfn(v->arch.guest_table);
223 gw->l4e = (guest_l4e_t *)v->arch.paging.shadow.guest_vtable
224 + guest_l4_table_offset(va);
225 /* Walk down to the l3e */
226 if ( !(guest_l4e_get_flags(*gw->l4e) & _PAGE_PRESENT) ) return 0;
227 gw->l3mfn = gfn_to_mfn(d, guest_l4e_get_gfn(*gw->l4e), &p2mt);
228 if ( !p2m_is_ram(p2mt) ) return 1;
229 ASSERT(mfn_valid(gw->l3mfn));
230 /* This mfn is a pagetable: make sure the guest can't write to it. */
231 if ( guest_op && sh_remove_write_access(v, gw->l3mfn, 3, va) != 0 )
232 flush_tlb_mask(d->domain_dirty_cpumask);
233 gw->l3e = ((guest_l3e_t *)sh_map_domain_page(gw->l3mfn))
234 + guest_l3_table_offset(va);
235 #else /* PAE only... */
236 /* Get l3e from the cache of the guest's top level table */
237 gw->l3e = (guest_l3e_t *)&v->arch.paging.shadow.gl3e[guest_l3_table_offset(va)];
238 #endif /* PAE or 64... */
239 /* Walk down to the l2e */
240 if ( !(guest_l3e_get_flags(*gw->l3e) & _PAGE_PRESENT) ) return 0;
241 gw->l2mfn = gfn_to_mfn(d, guest_l3e_get_gfn(*gw->l3e), &p2mt);
242 if ( !p2m_is_ram(p2mt) ) return 1;
243 ASSERT(mfn_valid(gw->l2mfn));
244 /* This mfn is a pagetable: make sure the guest can't write to it. */
245 if ( guest_op && sh_remove_write_access(v, gw->l2mfn, 2, va) != 0 )
246 flush_tlb_mask(d->domain_dirty_cpumask);
247 gw->l2e = ((guest_l2e_t *)sh_map_domain_page(gw->l2mfn))
248 + guest_l2_table_offset(va);
249 #else /* 32-bit only... */
250 /* Get l2e from the top level table */
251 gw->l2mfn = pagetable_get_mfn(v->arch.guest_table);
252 gw->l2e = (guest_l2e_t *)v->arch.paging.shadow.guest_vtable
253 + guest_l2_table_offset(va);
254 #endif /* All levels... */
256 if ( !(guest_l2e_get_flags(*gw->l2e) & _PAGE_PRESENT) ) return 0;
257 if ( guest_supports_superpages(v) &&
258 (guest_l2e_get_flags(*gw->l2e) & _PAGE_PSE) )
259 {
260 /* Special case: this guest VA is in a PSE superpage, so there's
261 * no guest l1e. We make one up so that the propagation code
262 * can generate a shadow l1 table. Start with the gfn of the
263 * first 4k-page of the superpage. */
264 gfn_t start = guest_l2e_get_gfn(*gw->l2e);
265 /* Grant full access in the l1e, since all the guest entry's
266 * access controls are enforced in the shadow l2e. This lets
267 * us reflect l2 changes later without touching the l1s. */
268 int flags = (_PAGE_PRESENT|_PAGE_USER|_PAGE_RW|
269 _PAGE_ACCESSED|_PAGE_DIRTY);
270 /* PSE level 2 entries use bit 12 for PAT; propagate it to bit 7
271 * of the level 1 */
272 if ( (guest_l2e_get_flags(*gw->l2e) & _PAGE_PSE_PAT) )
273 flags |= _PAGE_PAT;
274 /* Increment the pfn by the right number of 4k pages.
275 * The ~0x1 is to mask out the PAT bit mentioned above. */
276 start = _gfn((gfn_x(start) & ~0x1) + guest_l1_table_offset(va));
277 gw->eff_l1e = guest_l1e_from_gfn(start, flags);
278 gw->l1e = NULL;
279 gw->l1mfn = _mfn(INVALID_MFN);
280 }
281 else
282 {
283 /* Not a superpage: carry on and find the l1e. */
284 gw->l1mfn = gfn_to_mfn(d, guest_l2e_get_gfn(*gw->l2e), &p2mt);
285 if ( !p2m_is_ram(p2mt) ) return 1;
286 ASSERT(mfn_valid(gw->l1mfn));
287 /* This mfn is a pagetable: make sure the guest can't write to it. */
288 if ( guest_op
289 && sh_remove_write_access(v, gw->l1mfn, 1, va) != 0 )
290 flush_tlb_mask(d->domain_dirty_cpumask);
291 gw->l1e = ((guest_l1e_t *)sh_map_domain_page(gw->l1mfn))
292 + guest_l1_table_offset(va);
293 gw->eff_l1e = *gw->l1e;
294 }
296 return 0;
297 }
299 /* Given a walk_t, translate the gw->va into the guest's notion of the
300 * corresponding frame number. */
301 static inline gfn_t
302 guest_walk_to_gfn(walk_t *gw)
303 {
304 if ( !(guest_l1e_get_flags(gw->eff_l1e) & _PAGE_PRESENT) )
305 return _gfn(INVALID_GFN);
306 return guest_l1e_get_gfn(gw->eff_l1e);
307 }
309 /* Given a walk_t, translate the gw->va into the guest's notion of the
310 * corresponding physical address. */
311 static inline paddr_t
312 guest_walk_to_gpa(walk_t *gw)
313 {
314 if ( !(guest_l1e_get_flags(gw->eff_l1e) & _PAGE_PRESENT) )
315 return 0;
316 return guest_l1e_get_paddr(gw->eff_l1e) + (gw->va & ~PAGE_MASK);
317 }
320 /* Unmap (and reinitialise) a guest walk.
321 * Call this to dispose of any walk filled in by guest_walk_tables() */
322 static void unmap_walk(struct vcpu *v, walk_t *gw)
323 {
324 #if GUEST_PAGING_LEVELS >= 3
325 #if GUEST_PAGING_LEVELS >= 4
326 if ( gw->l3e != NULL ) sh_unmap_domain_page(gw->l3e);
327 #endif
328 if ( gw->l2e != NULL ) sh_unmap_domain_page(gw->l2e);
329 #endif
330 if ( gw->l1e != NULL ) sh_unmap_domain_page(gw->l1e);
331 #ifdef DEBUG
332 memset(gw, 0, sizeof(*gw));
333 #endif
334 }
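/* Illustrative sketch (not part of the original file): the intended
 * calling pattern for the walk functions above.  Variable names and the
 * guest_op==0 choice are assumptions for the example only.
 *
 *     walk_t gw;
 *     if ( guest_walk_tables(v, va, &gw, 0) != 0 )
 *     {
 *         // Guest tables were malformed (e.g. an intermediate frame is
 *         // not RAM in the p2m); the walk_t may be partially filled.
 *     }
 *     else
 *     {
 *         gfn_t gfn = guest_walk_to_gfn(&gw);    // INVALID_GFN if not present
 *         paddr_t gpa = guest_walk_to_gpa(&gw);  // 0 if not present
 *         // ... use the translation ...
 *     }
 *     unmap_walk(v, &gw);   // always dispose of the walk before reuse
 */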
337 /* Pretty-print the contents of a guest-walk */
338 static inline void print_gw(walk_t *gw)
339 {
340 SHADOW_PRINTK("GUEST WALK TO %#lx:\n", gw->va);
341 #if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
342 #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
343 SHADOW_PRINTK(" l4mfn=%" PRI_mfn "\n", mfn_x(gw->l4mfn));
344 SHADOW_PRINTK(" l4e=%p\n", gw->l4e);
345 if ( gw->l4e )
346 SHADOW_PRINTK(" *l4e=%" SH_PRI_gpte "\n", gw->l4e->l4);
347 SHADOW_PRINTK(" l3mfn=%" PRI_mfn "\n", mfn_x(gw->l3mfn));
348 #endif /* PAE or 64... */
349 SHADOW_PRINTK(" l3e=%p\n", gw->l3e);
350 if ( gw->l3e )
351 SHADOW_PRINTK(" *l3e=%" SH_PRI_gpte "\n", gw->l3e->l3);
352 #endif /* All levels... */
353 SHADOW_PRINTK(" l2mfn=%" PRI_mfn "\n", mfn_x(gw->l2mfn));
354 SHADOW_PRINTK(" l2e=%p\n", gw->l2e);
355 if ( gw->l2e )
356 SHADOW_PRINTK(" *l2e=%" SH_PRI_gpte "\n", gw->l2e->l2);
357 SHADOW_PRINTK(" l1mfn=%" PRI_mfn "\n", mfn_x(gw->l1mfn));
358 SHADOW_PRINTK(" l1e=%p\n", gw->l1e);
359 if ( gw->l1e )
360 SHADOW_PRINTK(" *l1e=%" SH_PRI_gpte "\n", gw->l1e->l1);
361 SHADOW_PRINTK(" eff_l1e=%" SH_PRI_gpte "\n", gw->eff_l1e.l1);
362 }
365 #if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES
366 /* Lightweight audit: pass all the shadows associated with this guest walk
367 * through the audit mechanisms */
368 static void sh_audit_gw(struct vcpu *v, walk_t *gw)
369 {
370 mfn_t smfn;
372 if ( !(SHADOW_AUDIT_ENABLE) )
373 return;
375 #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
376 if ( mfn_valid(gw->l4mfn)
377 && mfn_valid((smfn = get_shadow_status(v, gw->l4mfn,
378 SH_type_l4_shadow))) )
379 (void) sh_audit_l4_table(v, smfn, _mfn(INVALID_MFN));
380 if ( mfn_valid(gw->l3mfn)
381 && mfn_valid((smfn = get_shadow_status(v, gw->l3mfn,
382 SH_type_l3_shadow))) )
383 (void) sh_audit_l3_table(v, smfn, _mfn(INVALID_MFN));
384 #endif /* PAE or 64... */
385 if ( mfn_valid(gw->l2mfn) )
386 {
387 if ( mfn_valid((smfn = get_shadow_status(v, gw->l2mfn,
388 SH_type_l2_shadow))) )
389 (void) sh_audit_l2_table(v, smfn, _mfn(INVALID_MFN));
390 #if GUEST_PAGING_LEVELS == 3
391 if ( mfn_valid((smfn = get_shadow_status(v, gw->l2mfn,
392 SH_type_l2h_shadow))) )
393 (void) sh_audit_l2_table(v, smfn, _mfn(INVALID_MFN));
394 #endif
395 }
396 if ( mfn_valid(gw->l1mfn)
397 && mfn_valid((smfn = get_shadow_status(v, gw->l1mfn,
398 SH_type_l1_shadow))) )
399 (void) sh_audit_l1_table(v, smfn, _mfn(INVALID_MFN));
400 else if ( gw->l2e
401 && (guest_l2e_get_flags(*gw->l2e) & _PAGE_PSE)
402 && mfn_valid(
403 (smfn = get_fl1_shadow_status(v, guest_l2e_get_gfn(*gw->l2e)))) )
404 (void) sh_audit_fl1_table(v, smfn, _mfn(INVALID_MFN));
405 }
407 #else
408 #define sh_audit_gw(_v, _gw) do {} while(0)
409 #endif /* audit code */
413 /**************************************************************************/
414 /* Function to write to the guest tables, for propagating accessed and
415 * dirty bits from the shadow to the guest.
416 * Takes a guest mfn, a pointer to the guest entry, the level of pagetable,
417 * and an operation type. The guest entry is always passed as an l1e:
418 * since we only ever write flags, that's OK.
419 * Returns the new flag bits of the guest entry. */
421 static u32 guest_set_ad_bits(struct vcpu *v,
422 mfn_t gmfn,
423 guest_l1e_t *ep,
424 unsigned int level,
425 fetch_type_t ft)
426 {
427 u32 flags;
428 int res = 0;
430 ASSERT(ep && !(((unsigned long)ep) & ((sizeof *ep) - 1)));
431 ASSERT(level <= GUEST_PAGING_LEVELS);
432 ASSERT(shadow_locked_by_me(v->domain));
434 flags = guest_l1e_get_flags(*ep);
436 /* Only set A and D bits for guest-initiated accesses */
437 if ( !(ft & FETCH_TYPE_DEMAND) )
438 return flags;
440 ASSERT(mfn_valid(gmfn)
441 && (sh_mfn_is_a_page_table(gmfn)
442 || ((mfn_to_page(gmfn)->u.inuse.type_info & PGT_count_mask)
443 == 0)));
445 /* PAE l3s do not have A and D bits */
446 ASSERT(GUEST_PAGING_LEVELS > 3 || level != 3);
448 /* Need the D bit as well for writes, in L1es and PSE L2es. */
449 if ( ft == ft_demand_write
450 && (level == 1 ||
451 (level == 2 && (flags & _PAGE_PSE) && guest_supports_superpages(v))) )
452 {
453 if ( (flags & (_PAGE_DIRTY | _PAGE_ACCESSED))
454 == (_PAGE_DIRTY | _PAGE_ACCESSED) )
455 return flags; /* Guest already has A and D bits set */
456 flags |= _PAGE_DIRTY | _PAGE_ACCESSED;
457 perfc_incr(shadow_ad_update);
458 }
459 else
460 {
461 if ( flags & _PAGE_ACCESSED )
462 return flags; /* Guest already has A bit set */
463 flags |= _PAGE_ACCESSED;
464 perfc_incr(shadow_a_update);
465 }
467 /* Set the bit(s) */
468 paging_mark_dirty(v->domain, mfn_x(gmfn));
469 SHADOW_DEBUG(A_AND_D, "gfn = %" SH_PRI_gfn ", "
470 "old flags = %#x, new flags = %#x\n",
471 gfn_x(guest_l1e_get_gfn(*ep)), guest_l1e_get_flags(*ep),
472 flags);
473 *ep = guest_l1e_from_gfn(guest_l1e_get_gfn(*ep), flags);
475 /* Propagate this change to any other shadows of the page
476 * (only necessary if there is more than one shadow) */
477 if ( mfn_to_page(gmfn)->count_info & PGC_page_table )
478 {
479 u32 shflags = mfn_to_page(gmfn)->shadow_flags & SHF_page_type_mask;
480 /* More than one type bit set in shadow-flags? */
481 if ( shflags & ~(1UL << find_first_set_bit(shflags)) )
482 res = sh_validate_guest_entry(v, gmfn, ep, sizeof (*ep));
483 }
485 /* We should never need to flush the TLB or recopy PAE entries */
486 ASSERT((res == 0) || (res == SHADOW_SET_CHANGED));
488 return flags;
489 }
491 #if (CONFIG_PAGING_LEVELS == GUEST_PAGING_LEVELS) && (CONFIG_PAGING_LEVELS == SHADOW_PAGING_LEVELS)
492 void *
493 sh_guest_map_l1e(struct vcpu *v, unsigned long addr,
494 unsigned long *gl1mfn)
495 {
496 void *pl1e = NULL;
497 walk_t gw;
499 ASSERT(shadow_mode_translate(v->domain));
501 // XXX -- this is expensive, but it's easy to cobble together...
502 // FIXME!
504 shadow_lock(v->domain);
505 guest_walk_tables(v, addr, &gw, 1);
507 if ( gw.l2e &&
508 (guest_l2e_get_flags(*gw.l2e) & _PAGE_PRESENT) &&
509 !(guest_supports_superpages(v) && (guest_l2e_get_flags(*gw.l2e) & _PAGE_PSE)) )
510 {
511 if ( gl1mfn )
512 *gl1mfn = mfn_x(gw.l1mfn);
513 pl1e = map_domain_page(mfn_x(gw.l1mfn)) +
514 (guest_l1_table_offset(addr) * sizeof(guest_l1e_t));
515 }
517 unmap_walk(v, &gw);
518 shadow_unlock(v->domain);
520 return pl1e;
521 }
523 void
524 sh_guest_get_eff_l1e(struct vcpu *v, unsigned long addr, void *eff_l1e)
525 {
526 walk_t gw;
528 ASSERT(shadow_mode_translate(v->domain));
530 // XXX -- this is expensive, but it's easy to cobble together...
531 // FIXME!
533 shadow_lock(v->domain);
534 guest_walk_tables(v, addr, &gw, 1);
535 *(guest_l1e_t *)eff_l1e = gw.eff_l1e;
536 unmap_walk(v, &gw);
537 shadow_unlock(v->domain);
538 }
539 #endif /* CONFIG==SHADOW==GUEST */
541 /**************************************************************************/
542 /* Functions to compute the correct index into a shadow page, given an
543 * index into the guest page (as returned by guest_get_index()).
544 * This is trivial when the shadow and guest use the same sized PTEs, but
545 * gets more interesting when those sizes are mismatched (e.g. 32-bit guest,
546 * PAE- or 64-bit shadows).
547 *
548 * These functions also increment the shadow mfn, when necessary. When PTE
549 * sizes are mismatched, it takes 2 shadow L1 pages for a single guest L1
550 * page. In this case, we allocate 2 contiguous pages for the shadow L1, and
551 * use simple pointer arithmetic on a pointer to the guest L1e to figure out
552 * which shadow page we really want. Similarly, when PTE sizes are
553 * mismatched, we shadow a guest L2 page with 4 shadow L2 pages. (The easiest
554 * way to see this is: a 32-bit guest L2 page maps 4GB of virtual address
555 * space, while a PAE- or 64-bit shadow L2 page maps 1GB of virtual address
556 * space.)
557 *
558 * For PAE guests, for every 32-bytes of guest L3 page table, we use 64-bytes
559 * of shadow (to store both the shadow, and the info that would normally be
560 * stored in page_info fields). This arrangement allows the shadow and the
561 * "page_info" fields to always be stored in the same page (in fact, in
562 * the same cache line), avoiding an extra call to map_domain_page().
563 */
565 static inline u32
566 guest_index(void *ptr)
567 {
568 return (u32)((unsigned long)ptr & ~PAGE_MASK) / sizeof(guest_l1e_t);
569 }
571 static u32
572 shadow_l1_index(mfn_t *smfn, u32 guest_index)
573 {
574 #if (GUEST_PAGING_LEVELS == 2) && (SHADOW_PAGING_LEVELS != 2)
575 *smfn = _mfn(mfn_x(*smfn) +
576 (guest_index / SHADOW_L1_PAGETABLE_ENTRIES));
577 return (guest_index % SHADOW_L1_PAGETABLE_ENTRIES);
578 #else
579 return guest_index;
580 #endif
581 }
583 static u32
584 shadow_l2_index(mfn_t *smfn, u32 guest_index)
585 {
586 #if (GUEST_PAGING_LEVELS == 2) && (SHADOW_PAGING_LEVELS != 2)
587 // Because we use 2 shadow l2 entries for each guest entry, the number of
588 // guest entries per shadow page is SHADOW_L2_PAGETABLE_ENTRIES/2
589 //
590 *smfn = _mfn(mfn_x(*smfn) +
591 (guest_index / (SHADOW_L2_PAGETABLE_ENTRIES / 2)));
593 // We multiply by two to get the index of the first of the two entries
594 // used to shadow the specified guest entry.
595 return (guest_index % (SHADOW_L2_PAGETABLE_ENTRIES / 2)) * 2;
596 #else
597 return guest_index;
598 #endif
599 }
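/* Worked example (illustrative, assuming PAE/64-bit shadows where
 * SHADOW_L1_PAGETABLE_ENTRIES == SHADOW_L2_PAGETABLE_ENTRIES == 512):
 * a 32-bit guest's L1/L2 pages have 1024 entries each, so
 *   shadow_l1_index(&smfn, 700)  advances smfn by 700/512 = 1 page and
 *                                returns 700 % 512 = 188;
 *   shadow_l2_index(&smfn, 300)  advances smfn by 300/256 = 1 page and
 *                                returns (300 % 256) * 2 = 88, the first
 *                                of the pair of sl2es for that guest entry.
 * In all other configurations both functions are the identity.
 */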
601 #if GUEST_PAGING_LEVELS >= 4
603 static u32
604 shadow_l3_index(mfn_t *smfn, u32 guest_index)
605 {
606 return guest_index;
607 }
609 static u32
610 shadow_l4_index(mfn_t *smfn, u32 guest_index)
611 {
612 return guest_index;
613 }
615 #endif // GUEST_PAGING_LEVELS >= 4
618 /**************************************************************************/
619 /* Function which computes shadow entries from their corresponding guest
620 * entries. This is the "heart" of the shadow code. It operates using
621 * level-1 shadow types, but handles all levels of entry.
622 * Don't call it directly, but use the four wrappers below.
623 */
625 static always_inline void
626 _sh_propagate(struct vcpu *v,
627 void *guest_entry_ptr,
628 mfn_t guest_table_mfn,
629 mfn_t target_mfn,
630 void *shadow_entry_ptr,
631 int level,
632 fetch_type_t ft,
633 p2m_type_t p2mt)
634 {
635 guest_l1e_t *gp = guest_entry_ptr;
636 shadow_l1e_t *sp = shadow_entry_ptr;
637 struct domain *d = v->domain;
638 u32 pass_thru_flags;
639 u32 gflags, sflags;
641 /* We don't shadow PAE l3s */
642 ASSERT(GUEST_PAGING_LEVELS > 3 || level != 3);
644 /* Check there's something for the shadows to map to */
645 if ( !p2m_is_valid(p2mt) )
646 {
647 *sp = shadow_l1e_empty();
648 goto done;
649 }
651 if ( mfn_valid(guest_table_mfn) )
652 /* Handle A and D bit propagation into the guest */
653 gflags = guest_set_ad_bits(v, guest_table_mfn, gp, level, ft);
654 else
655 {
656 /* Must be an fl1e or a prefetch */
657 ASSERT(level==1 || !(ft & FETCH_TYPE_DEMAND));
658 gflags = guest_l1e_get_flags(*gp);
659 }
661 if ( unlikely(!(gflags & _PAGE_PRESENT)) )
662 {
663 /* If a guest l1 entry is not present, shadow with the magic
664 * guest-not-present entry. */
665 if ( level == 1 )
666 *sp = sh_l1e_gnp();
667 else
668 *sp = shadow_l1e_empty();
669 goto done;
670 }
672 if ( level == 1 && p2mt == p2m_mmio_dm )
673 {
674 /* Guest l1e maps emulated MMIO space */
675 *sp = sh_l1e_mmio(guest_l1e_get_gfn(*gp), gflags);
676 if ( !d->arch.paging.shadow.has_fast_mmio_entries )
677 d->arch.paging.shadow.has_fast_mmio_entries = 1;
678 goto done;
679 }
681 // Must have a valid target_mfn unless this is a prefetch. In the
682 // case of a prefetch, an invalid mfn means that we can not usefully
683 // shadow anything, and so we return early.
684 //
685 /* N.B. For pass-through MMIO, either this test needs to be relaxed,
686 * and shadow_set_l1e() trained to handle non-valid MFNs (ugh), or the
687 * MMIO areas need to be added to the frame-table to make them "valid". */
688 if ( !mfn_valid(target_mfn) )
689 {
690 ASSERT((ft == ft_prefetch));
691 *sp = shadow_l1e_empty();
692 goto done;
693 }
695 // Propagate bits from the guest to the shadow.
696 // Some of these may be overwritten, below.
697 // Since we know the guest's PRESENT bit is set, we also set the shadow's
698 // SHADOW_PRESENT bit.
699 //
700 pass_thru_flags = (_PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_USER |
701 _PAGE_RW | _PAGE_PRESENT);
702 if ( guest_supports_nx(v) )
703 pass_thru_flags |= _PAGE_NX_BIT;
704 sflags = gflags & pass_thru_flags;
706 // Set the A&D bits for higher level shadows.
707 // Higher level entries do not, strictly speaking, have dirty bits, but
708 // since we use shadow linear tables, each of these entries may, at some
709 // point in time, also serve as a shadow L1 entry.
710 // By setting both the A&D bits in each of these, we eliminate the burden
711 // on the hardware to update these bits on initial accesses.
712 //
713 if ( (level > 1) && !((SHADOW_PAGING_LEVELS == 3) && (level == 3)) )
714 sflags |= _PAGE_ACCESSED | _PAGE_DIRTY;
716 // If the A or D bit has not yet been set in the guest, then we must
717 // prevent the corresponding kind of access.
718 //
719 if ( unlikely(!(gflags & _PAGE_ACCESSED)) )
720 sflags &= ~_PAGE_PRESENT;
722 /* D bits exist in L1es and PSE L2es */
723 if ( unlikely(((level == 1) ||
724 ((level == 2) &&
725 (gflags & _PAGE_PSE) &&
726 guest_supports_superpages(v)))
727 && !(gflags & _PAGE_DIRTY)) )
728 sflags &= ~_PAGE_RW;
730 // shadow_mode_log_dirty support
731 //
732 // Only allow the guest write access to a page a) on a demand fault,
733 // or b) if the page is already marked as dirty.
734 //
735 // (We handle log-dirty entirely inside the shadow code, without using the
736 // p2m_ram_logdirty p2m type: only HAP uses that.)
737 if ( unlikely((level == 1) && shadow_mode_log_dirty(d)) )
738 {
739 if ( ft & FETCH_TYPE_WRITE )
740 paging_mark_dirty(d, mfn_x(target_mfn));
741 else if ( !sh_mfn_is_dirty(d, target_mfn) )
742 sflags &= ~_PAGE_RW;
743 }
745 /* Read-only memory */
746 if ( p2mt == p2m_ram_ro )
747 sflags &= ~_PAGE_RW;
749 // protect guest page tables
750 //
751 if ( unlikely((level == 1) && sh_mfn_is_a_page_table(target_mfn)) )
752 {
753 if ( shadow_mode_trap_reads(d) )
754 {
755 // if we are trapping both reads & writes, then mark this page
756 // as not present...
757 //
758 sflags &= ~_PAGE_PRESENT;
759 }
760 else
761 {
762 // otherwise, just prevent any writes...
763 //
764 sflags &= ~_PAGE_RW;
765 }
766 }
768 // PV guests in 64-bit mode use two different page tables for user vs
769 // supervisor permissions, making the guest's _PAGE_USER bit irrelevant.
770 // It is always shadowed as present...
771 if ( (GUEST_PAGING_LEVELS == 4) && !is_pv_32on64_domain(d)
772 && !is_hvm_domain(d) )
773 {
774 sflags |= _PAGE_USER;
775 }
777 /* MMIO addresses should never be cached */
778 if ( p2m_is_mmio(p2mt) )
779 sflags |= _PAGE_PCD;
781 *sp = shadow_l1e_from_mfn(target_mfn, sflags);
783 done:
784 SHADOW_DEBUG(PROPAGATE,
785 "%s level %u guest %" SH_PRI_gpte " shadow %" SH_PRI_pte "\n",
786 fetch_type_names[ft], level, gp->l1, sp->l1);
787 }
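/* Worked example (illustrative) of the A/D handling above: a present
 * guest l1e with _PAGE_ACCESSED set but _PAGE_DIRTY clear is shadowed
 * with _PAGE_RW stripped, so the guest's first write faults; the
 * demand-write path then sets the guest D bit (guest_set_ad_bits) and
 * re-propagates an entry with write access.  Similarly, an entry whose
 * A bit is still clear is shadowed not-present until a demand access
 * sets it. */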
790 /* These four wrappers give us a little bit of type-safety back around the
791 * use of void-* pointers in _sh_propagate(), and allow the compiler to
792 * optimize out some level checks. */
794 #if GUEST_PAGING_LEVELS >= 4
795 static void
796 l4e_propagate_from_guest(struct vcpu *v,
797 guest_l4e_t *gl4e,
798 mfn_t gl4mfn,
799 mfn_t sl3mfn,
800 shadow_l4e_t *sl4e,
801 fetch_type_t ft)
802 {
803 _sh_propagate(v, gl4e, gl4mfn, sl3mfn, sl4e, 4, ft, p2m_ram_rw);
804 }
806 static void
807 l3e_propagate_from_guest(struct vcpu *v,
808 guest_l3e_t *gl3e,
809 mfn_t gl3mfn,
810 mfn_t sl2mfn,
811 shadow_l3e_t *sl3e,
812 fetch_type_t ft)
813 {
814 _sh_propagate(v, gl3e, gl3mfn, sl2mfn, sl3e, 3, ft, p2m_ram_rw);
815 }
816 #endif // GUEST_PAGING_LEVELS >= 4
818 static void
819 l2e_propagate_from_guest(struct vcpu *v,
820 guest_l2e_t *gl2e,
821 mfn_t gl2mfn,
822 mfn_t sl1mfn,
823 shadow_l2e_t *sl2e,
824 fetch_type_t ft)
825 {
826 _sh_propagate(v, gl2e, gl2mfn, sl1mfn, sl2e, 2, ft, p2m_ram_rw);
827 }
829 static void
830 l1e_propagate_from_guest(struct vcpu *v,
831 guest_l1e_t *gl1e,
832 mfn_t gl1mfn,
833 mfn_t gmfn,
834 shadow_l1e_t *sl1e,
835 fetch_type_t ft,
836 p2m_type_t p2mt)
837 {
838 _sh_propagate(v, gl1e, gl1mfn, gmfn, sl1e, 1, ft, p2mt);
839 }
842 /**************************************************************************/
843 /* These functions update shadow entries (and do bookkeeping on the shadow
844 * tables they are in). It is intended that they are the only
845 * functions which ever write (non-zero) data onto a shadow page.
846 */
848 static inline void safe_write_entry(void *dst, void *src)
849 /* Copy one PTE safely when processors might be running on the
850 * destination pagetable. This does *not* give safety against
851 * concurrent writes (that's what the shadow lock is for), just
852 * stops the hardware picking up partially written entries. */
853 {
854 volatile unsigned long *d = dst;
855 unsigned long *s = src;
856 ASSERT(!((unsigned long) d & (sizeof (shadow_l1e_t) - 1)));
857 #if CONFIG_PAGING_LEVELS == 3
858 /* In PAE mode, pagetable entries are larger
859 * than machine words, so won't get written atomically. We need to make
860 * sure any other cpu running on these shadows doesn't see a
861 * half-written entry. Do this by marking the entry not-present first,
862 * then writing the high word before the low word. */
863 BUILD_BUG_ON(sizeof (shadow_l1e_t) != 2 * sizeof (unsigned long));
864 d[0] = 0;
865 d[1] = s[1];
866 d[0] = s[0];
867 #else
868 /* In 32-bit and 64-bit, sizeof(pte) == sizeof(ulong) == 1 word,
869 * which will be an atomic write, since the entry is aligned. */
870 BUILD_BUG_ON(sizeof (shadow_l1e_t) != sizeof (unsigned long));
871 *d = *s;
872 #endif
873 }
876 static inline void
877 shadow_write_entries(void *d, void *s, int entries, mfn_t mfn)
878 /* This function does the actual writes to shadow pages.
879 * It must not be called directly, since it doesn't do the bookkeeping
880 * that shadow_set_l*e() functions do. */
881 {
882 shadow_l1e_t *dst = d;
883 shadow_l1e_t *src = s;
884 void *map = NULL;
885 int i;
887 /* Because we mirror access rights at all levels in the shadow, an
888 * l2 (or higher) entry with the RW bit cleared will leave us with
889 * no write access through the linear map.
890 * We detect that by writing to the shadow with copy_to_user() and
891 * using map_domain_page() to get a writeable mapping if we need to. */
892 if ( __copy_to_user(d, d, sizeof (unsigned long)) != 0 )
893 {
894 perfc_incr(shadow_linear_map_failed);
895 map = sh_map_domain_page(mfn);
896 ASSERT(map != NULL);
897 dst = map + ((unsigned long)dst & (PAGE_SIZE - 1));
898 }
901 for ( i = 0; i < entries; i++ )
902 safe_write_entry(dst++, src++);
904 if ( map != NULL ) sh_unmap_domain_page(map);
905 }
907 static inline int
908 perms_strictly_increased(u32 old_flags, u32 new_flags)
909 /* Given the flags of two entries, are the new flags a strict
910 * increase in rights over the old ones? */
911 {
912 u32 of = old_flags & (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX);
913 u32 nf = new_flags & (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX);
914 /* Flip the NX bit, since it's the only one that decreases rights;
915 * we calculate as if it were an "X" bit. */
916 of ^= _PAGE_NX_BIT;
917 nf ^= _PAGE_NX_BIT;
918 /* If the changed bits are all set in the new flags, then rights strictly
919 * increased between old and new. */
920 return ((of | (of ^ nf)) == nf);
921 }
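/* Worked example (illustrative; NX clear in both entries, so the
 * flipped bit cancels out of the comparison):
 *   old = _PAGE_PRESENT, new = _PAGE_PRESENT|_PAGE_RW:
 *     of ^ nf == _PAGE_RW, of | _PAGE_RW == nf   -> rights increased.
 *   old = _PAGE_PRESENT|_PAGE_RW, new = _PAGE_PRESENT:
 *     of ^ nf == _PAGE_RW, of | _PAGE_RW != nf   -> not an increase, so
 *     shadow_set_l*e() below flags SHADOW_SET_FLUSH for this change. */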
923 static int inline
924 shadow_get_page_from_l1e(shadow_l1e_t sl1e, struct domain *d)
925 {
926 int res;
927 mfn_t mfn;
928 struct domain *owner;
930 ASSERT(!sh_l1e_is_magic(sl1e));
932 if ( !shadow_mode_refcounts(d) )
933 return 1;
935 res = get_page_from_l1e(sl1e, d);
937 // If a privileged domain is attempting to install a map of a page it does
938 // not own, we let it succeed anyway.
939 //
940 if ( unlikely(!res) &&
941 IS_PRIV(d) &&
942 !shadow_mode_translate(d) &&
943 mfn_valid(mfn = shadow_l1e_get_mfn(sl1e)) &&
944 (owner = page_get_owner(mfn_to_page(mfn))) &&
945 (d != owner) )
946 {
947 res = get_page_from_l1e(sl1e, owner);
948 SHADOW_PRINTK("privileged domain %d installs map of mfn %05lx "
949 "which is owned by domain %d: %s\n",
950 d->domain_id, mfn_x(mfn), owner->domain_id,
951 res ? "success" : "failed");
952 }
954 if ( unlikely(!res) )
955 {
956 perfc_incr(shadow_get_page_fail);
957 SHADOW_PRINTK("failed: l1e=" SH_PRI_pte "\n");
958 }
960 return res;
961 }
963 static void inline
964 shadow_put_page_from_l1e(shadow_l1e_t sl1e, struct domain *d)
965 {
966 if ( !shadow_mode_refcounts(d) )
967 return;
969 put_page_from_l1e(sl1e, d);
970 }
972 #if GUEST_PAGING_LEVELS >= 4
973 static int shadow_set_l4e(struct vcpu *v,
974 shadow_l4e_t *sl4e,
975 shadow_l4e_t new_sl4e,
976 mfn_t sl4mfn)
977 {
978 int flags = 0, ok;
979 shadow_l4e_t old_sl4e;
980 paddr_t paddr;
981 ASSERT(sl4e != NULL);
982 old_sl4e = *sl4e;
984 if ( old_sl4e.l4 == new_sl4e.l4 ) return 0; /* Nothing to do */
986 paddr = ((((paddr_t)mfn_x(sl4mfn)) << PAGE_SHIFT)
987 | (((unsigned long)sl4e) & ~PAGE_MASK));
989 if ( shadow_l4e_get_flags(new_sl4e) & _PAGE_PRESENT )
990 {
991 /* About to install a new reference */
992 mfn_t sl3mfn = shadow_l4e_get_mfn(new_sl4e);
993 ok = sh_get_ref(v, sl3mfn, paddr);
994 /* Are we pinning l3 shadows to handle weird linux behaviour? */
995 if ( sh_type_is_pinnable(v, SH_type_l3_64_shadow) )
996 ok |= sh_pin(v, sl3mfn);
997 if ( !ok )
998 {
999 domain_crash(v->domain);
1000 return SHADOW_SET_ERROR;
1004 /* Write the new entry */
1005 shadow_write_entries(sl4e, &new_sl4e, 1, sl4mfn);
1006 flags |= SHADOW_SET_CHANGED;
1008 if ( shadow_l4e_get_flags(old_sl4e) & _PAGE_PRESENT )
1010 /* We lost a reference to an old mfn. */
1011 mfn_t osl3mfn = shadow_l4e_get_mfn(old_sl4e);
1012 if ( (mfn_x(osl3mfn) != mfn_x(shadow_l4e_get_mfn(new_sl4e)))
1013 || !perms_strictly_increased(shadow_l4e_get_flags(old_sl4e),
1014 shadow_l4e_get_flags(new_sl4e)) )
1016 flags |= SHADOW_SET_FLUSH;
1018 sh_put_ref(v, osl3mfn, paddr);
1020 return flags;
1023 static int shadow_set_l3e(struct vcpu *v,
1024 shadow_l3e_t *sl3e,
1025 shadow_l3e_t new_sl3e,
1026 mfn_t sl3mfn)
1028 int flags = 0;
1029 shadow_l3e_t old_sl3e;
1030 paddr_t paddr;
1031 ASSERT(sl3e != NULL);
1032 old_sl3e = *sl3e;
1034 if ( old_sl3e.l3 == new_sl3e.l3 ) return 0; /* Nothing to do */
1036 paddr = ((((paddr_t)mfn_x(sl3mfn)) << PAGE_SHIFT)
1037 | (((unsigned long)sl3e) & ~PAGE_MASK));
1039 if ( shadow_l3e_get_flags(new_sl3e) & _PAGE_PRESENT )
1040 /* About to install a new reference */
1041 if ( !sh_get_ref(v, shadow_l3e_get_mfn(new_sl3e), paddr) )
1043 domain_crash(v->domain);
1044 return SHADOW_SET_ERROR;
1047 /* Write the new entry */
1048 shadow_write_entries(sl3e, &new_sl3e, 1, sl3mfn);
1049 flags |= SHADOW_SET_CHANGED;
1051 if ( shadow_l3e_get_flags(old_sl3e) & _PAGE_PRESENT )
1053 /* We lost a reference to an old mfn. */
1054 mfn_t osl2mfn = shadow_l3e_get_mfn(old_sl3e);
1055 if ( (mfn_x(osl2mfn) != mfn_x(shadow_l3e_get_mfn(new_sl3e))) ||
1056 !perms_strictly_increased(shadow_l3e_get_flags(old_sl3e),
1057 shadow_l3e_get_flags(new_sl3e)) )
1059 flags |= SHADOW_SET_FLUSH;
1061 sh_put_ref(v, osl2mfn, paddr);
1063 return flags;
1065 #endif /* GUEST_PAGING_LEVELS >= 4 */
1067 static int shadow_set_l2e(struct vcpu *v,
1068 shadow_l2e_t *sl2e,
1069 shadow_l2e_t new_sl2e,
1070 mfn_t sl2mfn)
1072 int flags = 0;
1073 shadow_l2e_t old_sl2e;
1074 paddr_t paddr;
1076 #if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2
1077 /* In 2-on-3 we work with pairs of l2es pointing at two-page
1078 * shadows. Reference counting and up-pointers track from the first
1079 * page of the shadow to the first l2e, so make sure that we're
1080 * working with those:
1081 * Align the pointer down so it's pointing at the first of the pair */
1082 sl2e = (shadow_l2e_t *)((unsigned long)sl2e & ~(sizeof(shadow_l2e_t)));
1083 /* Align the mfn of the shadow entry too */
1084 new_sl2e.l2 &= ~(1<<PAGE_SHIFT);
1085 #endif
1087 ASSERT(sl2e != NULL);
1088 old_sl2e = *sl2e;
1090 if ( old_sl2e.l2 == new_sl2e.l2 ) return 0; /* Nothing to do */
1092 paddr = ((((paddr_t)mfn_x(sl2mfn)) << PAGE_SHIFT)
1093 | (((unsigned long)sl2e) & ~PAGE_MASK));
1095 if ( shadow_l2e_get_flags(new_sl2e) & _PAGE_PRESENT )
1096 /* About to install a new reference */
1097 if ( !sh_get_ref(v, shadow_l2e_get_mfn(new_sl2e), paddr) )
1099 domain_crash(v->domain);
1100 return SHADOW_SET_ERROR;
1103 /* Write the new entry */
1104 #if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2
1106 shadow_l2e_t pair[2] = { new_sl2e, new_sl2e };
1107 /* The l1 shadow is two pages long and needs to be pointed to by
1108 * two adjacent l2es. The pair have the same flags, but point
1109 * at odd and even MFNs */
1110 ASSERT(!(pair[0].l2 & (1<<PAGE_SHIFT)));
1111 pair[1].l2 |= (1<<PAGE_SHIFT);
1112 shadow_write_entries(sl2e, &pair, 2, sl2mfn);
1114 #else /* normal case */
1115 shadow_write_entries(sl2e, &new_sl2e, 1, sl2mfn);
1116 #endif
1117 flags |= SHADOW_SET_CHANGED;
1119 if ( shadow_l2e_get_flags(old_sl2e) & _PAGE_PRESENT )
1121 /* We lost a reference to an old mfn. */
1122 mfn_t osl1mfn = shadow_l2e_get_mfn(old_sl2e);
1123 if ( (mfn_x(osl1mfn) != mfn_x(shadow_l2e_get_mfn(new_sl2e))) ||
1124 !perms_strictly_increased(shadow_l2e_get_flags(old_sl2e),
1125 shadow_l2e_get_flags(new_sl2e)) )
1127 flags |= SHADOW_SET_FLUSH;
1129 sh_put_ref(v, osl1mfn, paddr);
1131 return flags;
1134 static int shadow_set_l1e(struct vcpu *v,
1135 shadow_l1e_t *sl1e,
1136 shadow_l1e_t new_sl1e,
1137 mfn_t sl1mfn)
1139 int flags = 0;
1140 struct domain *d = v->domain;
1141 shadow_l1e_t old_sl1e;
1142 ASSERT(sl1e != NULL);
1144 old_sl1e = *sl1e;
1146 if ( old_sl1e.l1 == new_sl1e.l1 ) return 0; /* Nothing to do */
1148 if ( (shadow_l1e_get_flags(new_sl1e) & _PAGE_PRESENT)
1149 && !sh_l1e_is_magic(new_sl1e) )
1151 /* About to install a new reference */
1152 if ( shadow_mode_refcounts(d) ) {
1153 if ( shadow_get_page_from_l1e(new_sl1e, d) == 0 )
1155 /* Doesn't look like a pagetable. */
1156 flags |= SHADOW_SET_ERROR;
1157 new_sl1e = shadow_l1e_empty();
1162 /* Write the new entry */
1163 shadow_write_entries(sl1e, &new_sl1e, 1, sl1mfn);
1164 flags |= SHADOW_SET_CHANGED;
1166 if ( (shadow_l1e_get_flags(old_sl1e) & _PAGE_PRESENT)
1167 && !sh_l1e_is_magic(old_sl1e) )
1169 /* We lost a reference to an old mfn. */
1170 /* N.B. Unlike higher-level sets, never need an extra flush
1171 * when writing an l1e. Because it points to the same guest frame
1172 * as the guest l1e did, it's the guest's responsibility to
1173 * trigger a flush later. */
1174 if ( shadow_mode_refcounts(d) )
1176 shadow_put_page_from_l1e(old_sl1e, d);
1179 return flags;
1183 /**************************************************************************/
1184 /* Macros to walk pagetables. These take the shadow of a pagetable and
1185 * walk every "interesting" entry. That is, they don't touch Xen mappings,
1186 * and for 32-bit l2s shadowed onto PAE or 64-bit, they only touch every
1187 * second entry (since pairs of entries are managed together). For multi-page
1188 * shadows they walk all pages.
1190 * Arguments are an MFN, the variable to point to each entry, a variable
1191 * to indicate that we are done (we will shortcut to the end of the scan
1192 * when _done != 0), a variable to indicate that we should avoid Xen mappings,
1193 * and the code.
1195 * WARNING: These macros have side-effects. They change the values of both
1196 * the pointer and the MFN. */
1198 static inline void increment_ptr_to_guest_entry(void *ptr)
1200 if ( ptr )
1202 guest_l1e_t **entry = ptr;
1203 (*entry)++;
1207 /* All kinds of l1: touch all entries */
1208 #define _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code) \
1209 do { \
1210 int _i; \
1211 shadow_l1e_t *_sp = map_shadow_page((_sl1mfn)); \
1212 ASSERT(mfn_to_shadow_page(_sl1mfn)->type == SH_type_l1_shadow \
1213 || mfn_to_shadow_page(_sl1mfn)->type == SH_type_fl1_shadow); \
1214 for ( _i = 0; _i < SHADOW_L1_PAGETABLE_ENTRIES; _i++ ) \
1215 { \
1216 (_sl1e) = _sp + _i; \
1217 if ( shadow_l1e_get_flags(*(_sl1e)) & _PAGE_PRESENT ) \
1218 {_code} \
1219 if ( _done ) break; \
1220 increment_ptr_to_guest_entry(_gl1p); \
1221 } \
1222 unmap_shadow_page(_sp); \
1223 } while (0)
1225 /* 32-bit l1, on PAE or 64-bit shadows: need to walk both pages of shadow */
1226 #if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2
1227 #define SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code) \
1228 do { \
1229 int __done = 0; \
1230 _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, \
1231 ({ (__done = _done); }), _code); \
1232 _sl1mfn = _mfn(mfn_x(_sl1mfn) + 1); \
1233 if ( !__done ) \
1234 _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, \
1235 ({ (__done = _done); }), _code); \
1236 } while (0)
1237 #else /* Everything else; l1 shadows are only one page */
1238 #define SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code) \
1239 _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code)
1240 #endif
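/* Illustrative sketch (hypothetical caller, not part of the original
 * file): the walker macros take a statement block as _code, e.g. to
 * blank every present entry of an l1 shadow:
 *
 *     shadow_l1e_t *sl1e;
 *     int done = 0;
 *     SHADOW_FOREACH_L1E(sl1mfn, sl1e, NULL, done,
 *     {
 *         (void) shadow_set_l1e(v, sl1e, shadow_l1e_empty(), sl1mfn);
 *     });
 *
 * As warned above, the macro may advance sl1mfn and the guest-entry
 * pointer as side effects, so pass copies if the originals are needed
 * again. */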
1243 #if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2
1245 /* 32-bit l2 on PAE/64: four pages, touch every second entry, and avoid Xen */
1246 #define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _dom, _code) \
1247 do { \
1248 int _i, _j, __done = 0; \
1249 int _xen = !shadow_mode_external(_dom); \
1250 ASSERT(mfn_to_shadow_page(_sl2mfn)->type == SH_type_l2_32_shadow); \
1251 for ( _j = 0; _j < 4 && !__done; _j++ ) \
1252 { \
1253 shadow_l2e_t *_sp = map_shadow_page(_sl2mfn); \
1254 for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i += 2 ) \
1255 if ( (!(_xen)) \
1256 || ((_j * SHADOW_L2_PAGETABLE_ENTRIES) + _i) \
1257 < (HYPERVISOR_VIRT_START >> SHADOW_L2_PAGETABLE_SHIFT) ) \
1258 { \
1259 (_sl2e) = _sp + _i; \
1260 if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \
1261 {_code} \
1262 if ( (__done = (_done)) ) break; \
1263 increment_ptr_to_guest_entry(_gl2p); \
1264 } \
1265 unmap_shadow_page(_sp); \
1266 _sl2mfn = _mfn(mfn_x(_sl2mfn) + 1); \
1267 } \
1268 } while (0)
1270 #elif GUEST_PAGING_LEVELS == 2
1272 /* 32-bit on 32-bit: avoid Xen entries */
1273 #define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _dom, _code) \
1274 do { \
1275 int _i; \
1276 int _xen = !shadow_mode_external(_dom); \
1277 shadow_l2e_t *_sp = map_shadow_page((_sl2mfn)); \
1278 ASSERT(mfn_to_shadow_page(_sl2mfn)->type == SH_type_l2_32_shadow); \
1279 for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i++ ) \
1280 if ( (!(_xen)) \
1281 || \
1282 (_i < (HYPERVISOR_VIRT_START >> SHADOW_L2_PAGETABLE_SHIFT)) ) \
1283 { \
1284 (_sl2e) = _sp + _i; \
1285 if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \
1286 {_code} \
1287 if ( _done ) break; \
1288 increment_ptr_to_guest_entry(_gl2p); \
1289 } \
1290 unmap_shadow_page(_sp); \
1291 } while (0)
1293 #elif GUEST_PAGING_LEVELS == 3
1295 /* PAE: if it's an l2h, don't touch Xen mappings */
1296 #define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _dom, _code) \
1297 do { \
1298 int _i; \
1299 int _xen = !shadow_mode_external(_dom); \
1300 shadow_l2e_t *_sp = map_shadow_page((_sl2mfn)); \
1301 ASSERT(mfn_to_shadow_page(_sl2mfn)->type == SH_type_l2_pae_shadow \
1302 || mfn_to_shadow_page(_sl2mfn)->type == SH_type_l2h_pae_shadow);\
1303 for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i++ ) \
1304 if ( (!(_xen)) \
1305 || mfn_to_shadow_page(_sl2mfn)->type != SH_type_l2h_pae_shadow\
1306 || ((_i + (3 * SHADOW_L2_PAGETABLE_ENTRIES)) \
1307 < (HYPERVISOR_VIRT_START >> SHADOW_L2_PAGETABLE_SHIFT)) ) \
1308 { \
1309 (_sl2e) = _sp + _i; \
1310 if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \
1311 {_code} \
1312 if ( _done ) break; \
1313 increment_ptr_to_guest_entry(_gl2p); \
1314 } \
1315 unmap_shadow_page(_sp); \
1316 } while (0)
1318 #else
1320 /* 64-bit l2: touch all entries except for PAE compat guests. */
1321 #define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _dom, _code) \
1322 do { \
1323 int _i; \
1324 int _xen = !shadow_mode_external(_dom); \
1325 shadow_l2e_t *_sp = map_shadow_page((_sl2mfn)); \
1326 ASSERT(mfn_to_shadow_page(_sl2mfn)->type == SH_type_l2_64_shadow || \
1327 mfn_to_shadow_page(_sl2mfn)->type == SH_type_l2h_64_shadow); \
1328 for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i++ ) \
1329 { \
1330 if ( (!(_xen)) \
1331 || !is_pv_32on64_domain(_dom) \
1332 || mfn_to_shadow_page(_sl2mfn)->type != SH_type_l2h_64_shadow \
1333 || (_i < COMPAT_L2_PAGETABLE_FIRST_XEN_SLOT(_dom)) ) \
1334 { \
1335 (_sl2e) = _sp + _i; \
1336 if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \
1337 {_code} \
1338 if ( _done ) break; \
1339 increment_ptr_to_guest_entry(_gl2p); \
1340 } \
1341 } \
1342 unmap_shadow_page(_sp); \
1343 } while (0)
1345 #endif /* different kinds of l2 */
1347 #if GUEST_PAGING_LEVELS == 4
1349 /* 64-bit l3: touch all entries */
1350 #define SHADOW_FOREACH_L3E(_sl3mfn, _sl3e, _gl3p, _done, _code) \
1351 do { \
1352 int _i; \
1353 shadow_l3e_t *_sp = map_shadow_page((_sl3mfn)); \
1354 ASSERT(mfn_to_shadow_page(_sl3mfn)->type == SH_type_l3_64_shadow); \
1355 for ( _i = 0; _i < SHADOW_L3_PAGETABLE_ENTRIES; _i++ ) \
1356 { \
1357 (_sl3e) = _sp + _i; \
1358 if ( shadow_l3e_get_flags(*(_sl3e)) & _PAGE_PRESENT ) \
1359 {_code} \
1360 if ( _done ) break; \
1361 increment_ptr_to_guest_entry(_gl3p); \
1362 } \
1363 unmap_shadow_page(_sp); \
1364 } while (0)
1366 /* 64-bit l4: avoid Xen mappings */
1367 #define SHADOW_FOREACH_L4E(_sl4mfn, _sl4e, _gl4p, _done, _dom, _code) \
1368 do { \
1369 shadow_l4e_t *_sp = map_shadow_page((_sl4mfn)); \
1370 int _xen = !shadow_mode_external(_dom); \
1371 int _i; \
1372 ASSERT(mfn_to_shadow_page(_sl4mfn)->type == SH_type_l4_64_shadow); \
1373 for ( _i = 0; _i < SHADOW_L4_PAGETABLE_ENTRIES; _i++ ) \
1374 { \
1375 if ( (!(_xen)) || is_guest_l4_slot(_dom, _i) ) \
1376 { \
1377 (_sl4e) = _sp + _i; \
1378 if ( shadow_l4e_get_flags(*(_sl4e)) & _PAGE_PRESENT ) \
1379 {_code} \
1380 if ( _done ) break; \
1381 } \
1382 increment_ptr_to_guest_entry(_gl4p); \
1383 } \
1384 unmap_shadow_page(_sp); \
1385 } while (0)
1387 #endif
1391 /**************************************************************************/
1392 /* Functions to install Xen mappings and linear mappings in shadow pages */
1394 // XXX -- this function should probably be moved to shadow-common.c, but that
1395 // probably wants to wait until the shadow types have been moved from
1396 // shadow-types.h to shadow-private.h
1397 //
1398 #if CONFIG_PAGING_LEVELS == 4 && GUEST_PAGING_LEVELS == 4
1399 void sh_install_xen_entries_in_l4(struct vcpu *v, mfn_t gl4mfn, mfn_t sl4mfn)
1401 struct domain *d = v->domain;
1402 shadow_l4e_t *sl4e;
1404 sl4e = sh_map_domain_page(sl4mfn);
1405 ASSERT(sl4e != NULL);
1406 ASSERT(sizeof (l4_pgentry_t) == sizeof (shadow_l4e_t));
1408 /* Copy the common Xen mappings from the idle domain */
1409 memcpy(&sl4e[ROOT_PAGETABLE_FIRST_XEN_SLOT],
1410 &idle_pg_table[ROOT_PAGETABLE_FIRST_XEN_SLOT],
1411 ROOT_PAGETABLE_XEN_SLOTS * sizeof(l4_pgentry_t));
1413 /* Install the per-domain mappings for this domain */
1414 sl4e[shadow_l4_table_offset(PERDOMAIN_VIRT_START)] =
1415 shadow_l4e_from_mfn(page_to_mfn(virt_to_page(d->arch.mm_perdomain_l3)),
1416 __PAGE_HYPERVISOR);
1418 /* Linear mapping */
1419 sl4e[shadow_l4_table_offset(SH_LINEAR_PT_VIRT_START)] =
1420 shadow_l4e_from_mfn(sl4mfn, __PAGE_HYPERVISOR);
1422 if ( shadow_mode_translate(v->domain) && !shadow_mode_external(v->domain) )
1424 // linear tables may not be used with translated PV guests
1425 sl4e[shadow_l4_table_offset(LINEAR_PT_VIRT_START)] =
1426 shadow_l4e_empty();
1428 else
1430 sl4e[shadow_l4_table_offset(LINEAR_PT_VIRT_START)] =
1431 shadow_l4e_from_mfn(gl4mfn, __PAGE_HYPERVISOR);
1434 if ( shadow_mode_translate(v->domain) )
1436 /* install domain-specific P2M table */
1437 sl4e[shadow_l4_table_offset(RO_MPT_VIRT_START)] =
1438 shadow_l4e_from_mfn(pagetable_get_mfn(d->arch.phys_table),
1439 __PAGE_HYPERVISOR);
1442 if ( is_pv_32on64_domain(v->domain) )
1444 /* install compat arg xlat entry */
1445 sl4e[shadow_l4_table_offset(COMPAT_ARG_XLAT_VIRT_BASE)] =
1446 shadow_l4e_from_mfn(
1447 page_to_mfn(virt_to_page(d->arch.mm_arg_xlat_l3)),
1448 __PAGE_HYPERVISOR);
1451 sh_unmap_domain_page(sl4e);
1453 #endif
1455 #if CONFIG_PAGING_LEVELS >= 3 && GUEST_PAGING_LEVELS >= 3
1456 // For 3-on-3 PV guests, we need to make sure the xen mappings are in
1457 // place, which means that we need to populate the l2h entry in the l3
1458 // table.
1460 static void sh_install_xen_entries_in_l2h(struct vcpu *v, mfn_t sl2hmfn)
1462 struct domain *d = v->domain;
1463 shadow_l2e_t *sl2e;
1464 #if CONFIG_PAGING_LEVELS == 3
1465 int i;
1466 #else
1468 if ( !is_pv_32on64_vcpu(v) )
1469 return;
1470 #endif
1472 sl2e = sh_map_domain_page(sl2hmfn);
1473 ASSERT(sl2e != NULL);
1474 ASSERT(sizeof (l2_pgentry_t) == sizeof (shadow_l2e_t));
1476 #if CONFIG_PAGING_LEVELS == 3
1478 /* Copy the common Xen mappings from the idle domain */
1479 memcpy(&sl2e[L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1)],
1480 &idle_pg_table_l2[L2_PAGETABLE_FIRST_XEN_SLOT],
1481 L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
1483 /* Install the per-domain mappings for this domain */
1484 for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
1485 sl2e[shadow_l2_table_offset(PERDOMAIN_VIRT_START) + i] =
1486 shadow_l2e_from_mfn(
1487 page_to_mfn(virt_to_page(d->arch.mm_perdomain_pt) + i),
1488 __PAGE_HYPERVISOR);
1490 /* We don't set up a linear mapping here because we can't until this
1491 * l2h is installed in an l3e. sh_update_linear_entries() handles
1492 * the linear mappings when CR3 (and so the fourth l3e) is loaded.
1493 * We zero them here, just as a safety measure.
1494 */
1495 for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
1496 sl2e[shadow_l2_table_offset(LINEAR_PT_VIRT_START) + i] =
1497 shadow_l2e_empty();
1498 for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
1499 sl2e[shadow_l2_table_offset(SH_LINEAR_PT_VIRT_START) + i] =
1500 shadow_l2e_empty();
1502 if ( shadow_mode_translate(d) )
1504 /* Install the domain-specific p2m table */
1505 l3_pgentry_t *p2m;
1506 ASSERT(pagetable_get_pfn(d->arch.phys_table) != 0);
1507 p2m = sh_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
1508 for ( i = 0; i < MACHPHYS_MBYTES>>1; i++ )
1510 sl2e[shadow_l2_table_offset(RO_MPT_VIRT_START) + i] =
1511 (l3e_get_flags(p2m[i]) & _PAGE_PRESENT)
1512 ? shadow_l2e_from_mfn(_mfn(l3e_get_pfn(p2m[i])),
1513 __PAGE_HYPERVISOR)
1514 : shadow_l2e_empty();
1516 sh_unmap_domain_page(p2m);
1519 #else
1521 /* Copy the common Xen mappings from the idle domain */
1522 memcpy(
1523 &sl2e[COMPAT_L2_PAGETABLE_FIRST_XEN_SLOT(d)],
1524 &compat_idle_pg_table_l2[l2_table_offset(HIRO_COMPAT_MPT_VIRT_START)],
1525 COMPAT_L2_PAGETABLE_XEN_SLOTS(d) * sizeof(*sl2e));
1527 #endif
1529 sh_unmap_domain_page(sl2e);
1531 #endif
1534 #if CONFIG_PAGING_LEVELS == 2 && GUEST_PAGING_LEVELS == 2
1535 void sh_install_xen_entries_in_l2(struct vcpu *v, mfn_t gl2mfn, mfn_t sl2mfn)
1537 struct domain *d = v->domain;
1538 shadow_l2e_t *sl2e;
1539 int i;
1541 sl2e = sh_map_domain_page(sl2mfn);
1542 ASSERT(sl2e != NULL);
1543 ASSERT(sizeof (l2_pgentry_t) == sizeof (shadow_l2e_t));
1545 /* Copy the common Xen mappings from the idle domain */
1546 memcpy(&sl2e[L2_PAGETABLE_FIRST_XEN_SLOT],
1547 &idle_pg_table[L2_PAGETABLE_FIRST_XEN_SLOT],
1548 L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
1550 /* Install the per-domain mappings for this domain */
1551 for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
1552 sl2e[shadow_l2_table_offset(PERDOMAIN_VIRT_START) + i] =
1553 shadow_l2e_from_mfn(
1554 page_to_mfn(virt_to_page(d->arch.mm_perdomain_pt) + i),
1555 __PAGE_HYPERVISOR);
1557 /* Linear mapping */
1558 sl2e[shadow_l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
1559 shadow_l2e_from_mfn(sl2mfn, __PAGE_HYPERVISOR);
1561 if ( shadow_mode_translate(v->domain) && !shadow_mode_external(v->domain) )
1563 // linear tables may not be used with translated PV guests
1564 sl2e[shadow_l2_table_offset(LINEAR_PT_VIRT_START)] =
1565 shadow_l2e_empty();
1567 else
1569 sl2e[shadow_l2_table_offset(LINEAR_PT_VIRT_START)] =
1570 shadow_l2e_from_mfn(gl2mfn, __PAGE_HYPERVISOR);
1573 if ( shadow_mode_translate(d) )
1575 /* install domain-specific P2M table */
1576 sl2e[shadow_l2_table_offset(RO_MPT_VIRT_START)] =
1577 shadow_l2e_from_mfn(pagetable_get_mfn(d->arch.phys_table),
1578 __PAGE_HYPERVISOR);
1581 sh_unmap_domain_page(sl2e);
1583 #endif
1587 /**************************************************************************/
1588 /* Create a shadow of a given guest page.
1589 */
1590 static mfn_t
1591 sh_make_shadow(struct vcpu *v, mfn_t gmfn, u32 shadow_type)
1593 mfn_t smfn = shadow_alloc(v->domain, shadow_type, mfn_x(gmfn));
1594 SHADOW_DEBUG(MAKE_SHADOW, "(%05lx, %u)=>%05lx\n",
1595 mfn_x(gmfn), shadow_type, mfn_x(smfn));
1597 if ( shadow_type != SH_type_l2_32_shadow
1598 && shadow_type != SH_type_l2_pae_shadow
1599 && shadow_type != SH_type_l2h_pae_shadow
1600 && shadow_type != SH_type_l4_64_shadow )
1601 /* Lower-level shadow, not yet linked from a higher level */
1602 mfn_to_shadow_page(smfn)->up = 0;
1604 #if GUEST_PAGING_LEVELS == 4
1605 #if (SHADOW_OPTIMIZATIONS & SHOPT_LINUX_L3_TOPLEVEL)
1606 if ( shadow_type == SH_type_l4_64_shadow &&
1607 unlikely(v->domain->arch.paging.shadow.opt_flags & SHOPT_LINUX_L3_TOPLEVEL) )
1609 /* We're shadowing a new l4, but we've been assuming the guest uses
1610 * only one l4 per vcpu and context switches using an l4 entry.
1611 * Count the number of active l4 shadows. If there are enough
1612 * of them, decide that this isn't an old linux guest, and stop
1613 * pinning l3es. This is not very quick but it doesn't happen
1614 * very often. */
1615 struct list_head *l, *t;
1616 struct shadow_page_info *sp;
1617 struct vcpu *v2;
1618 int l4count = 0, vcpus = 0;
1619 list_for_each(l, &v->domain->arch.paging.shadow.pinned_shadows)
1621 sp = list_entry(l, struct shadow_page_info, list);
1622 if ( sp->type == SH_type_l4_64_shadow )
1623 l4count++;
1625 for_each_vcpu ( v->domain, v2 )
1626 vcpus++;
1627 if ( l4count > 2 * vcpus )
1629 /* Unpin all the pinned l3 tables, and don't pin any more. */
1630 list_for_each_safe(l, t, &v->domain->arch.paging.shadow.pinned_shadows)
1632 sp = list_entry(l, struct shadow_page_info, list);
1633 if ( sp->type == SH_type_l3_64_shadow )
1634 sh_unpin(v, shadow_page_to_mfn(sp));
1636 v->domain->arch.paging.shadow.opt_flags &= ~SHOPT_LINUX_L3_TOPLEVEL;
1639 #endif
1640 #endif
1642 // Create the Xen mappings...
1643 if ( !shadow_mode_external(v->domain) )
1645 switch (shadow_type)
1647 #if CONFIG_PAGING_LEVELS == 4 && GUEST_PAGING_LEVELS == 4
1648 case SH_type_l4_shadow:
1649 sh_install_xen_entries_in_l4(v, gmfn, smfn); break;
1650 #endif
1651 #if CONFIG_PAGING_LEVELS >= 3 && GUEST_PAGING_LEVELS >= 3
1652 case SH_type_l2h_shadow:
1653 sh_install_xen_entries_in_l2h(v, smfn); break;
1654 #endif
1655 #if CONFIG_PAGING_LEVELS == 2 && GUEST_PAGING_LEVELS == 2
1656 case SH_type_l2_shadow:
1657 sh_install_xen_entries_in_l2(v, gmfn, smfn); break;
1658 #endif
1659 default: /* Do nothing */ break;
1663 shadow_promote(v, gmfn, shadow_type);
1664 set_shadow_status(v, gmfn, shadow_type, smfn);
1666 return smfn;
1669 /* Make a splintered superpage shadow */
1670 static mfn_t
1671 make_fl1_shadow(struct vcpu *v, gfn_t gfn)
1673 mfn_t smfn = shadow_alloc(v->domain, SH_type_fl1_shadow,
1674 (unsigned long) gfn_x(gfn));
1676 SHADOW_DEBUG(MAKE_SHADOW, "(%" SH_PRI_gfn ")=>%" PRI_mfn "\n",
1677 gfn_x(gfn), mfn_x(smfn));
1679 set_fl1_shadow_status(v, gfn, smfn);
1680 return smfn;
1684 #if SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS
1685 mfn_t
1686 sh_make_monitor_table(struct vcpu *v)
1688 struct domain *d = v->domain;
1690 ASSERT(pagetable_get_pfn(v->arch.monitor_table) == 0);
1692 /* Guarantee we can get the memory we need */
1693 shadow_prealloc(d, SHADOW_MAX_ORDER);
1695 #if CONFIG_PAGING_LEVELS == 4
1697 mfn_t m4mfn;
1698 m4mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1699 sh_install_xen_entries_in_l4(v, m4mfn, m4mfn);
1700 /* Remember the level of this table */
1701 mfn_to_page(m4mfn)->shadow_flags = 4;
1702 #if SHADOW_PAGING_LEVELS < 4
1703 // Install a monitor l3 table in slot 0 of the l4 table.
1704 // This is used for shadow linear maps.
1706 mfn_t m3mfn;
1707 l4_pgentry_t *l4e;
1708 m3mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1709 mfn_to_page(m3mfn)->shadow_flags = 3;
1710 l4e = sh_map_domain_page(m4mfn);
1711 l4e[0] = l4e_from_pfn(mfn_x(m3mfn), __PAGE_HYPERVISOR);
1712 sh_unmap_domain_page(l4e);
1713 if ( is_pv_32on64_vcpu(v) )
1715 // Install a monitor l2 table in slot 3 of the l3 table.
1716 // This is used for all Xen entries.
1717 mfn_t m2mfn;
1718 l3_pgentry_t *l3e;
1719 m2mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1720 mfn_to_page(m2mfn)->shadow_flags = 2;
1721 l3e = sh_map_domain_page(m3mfn);
1722 l3e[3] = l3e_from_pfn(mfn_x(m2mfn), _PAGE_PRESENT);
1723 sh_install_xen_entries_in_l2h(v, m2mfn);
1724 sh_unmap_domain_page(l3e);
1727 #endif /* SHADOW_PAGING_LEVELS < 4 */
1728 return m4mfn;
1731 #elif CONFIG_PAGING_LEVELS == 3
1734 mfn_t m3mfn, m2mfn;
1735 l3_pgentry_t *l3e;
1736 l2_pgentry_t *l2e;
1737 int i;
1739 m3mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1740 /* Remember the level of this table */
1741 mfn_to_page(m3mfn)->shadow_flags = 3;
1743 // Install a monitor l2 table in slot 3 of the l3 table.
1744 // This is used for all Xen entries, including linear maps
1745 m2mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1746 mfn_to_page(m2mfn)->shadow_flags = 2;
1747 l3e = sh_map_domain_page(m3mfn);
1748 l3e[3] = l3e_from_pfn(mfn_x(m2mfn), _PAGE_PRESENT);
1749 sh_install_xen_entries_in_l2h(v, m2mfn);
1750 /* Install the monitor's own linear map */
1751 l2e = sh_map_domain_page(m2mfn);
1752 for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
1753 l2e[l2_table_offset(LINEAR_PT_VIRT_START) + i] =
1754 (l3e_get_flags(l3e[i]) & _PAGE_PRESENT)
1755 ? l2e_from_pfn(l3e_get_pfn(l3e[i]), __PAGE_HYPERVISOR)
1756 : l2e_empty();
1757 sh_unmap_domain_page(l2e);
1758 sh_unmap_domain_page(l3e);
1760 SHADOW_PRINTK("new monitor table: %#lx\n", mfn_x(m3mfn));
1761 return m3mfn;
1764 #elif CONFIG_PAGING_LEVELS == 2
1767 mfn_t m2mfn;
1768 m2mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1769 sh_install_xen_entries_in_l2(v, m2mfn, m2mfn);
1770 /* Remember the level of this table */
1771 mfn_to_page(m2mfn)->shadow_flags = 2;
1772 return m2mfn;
1775 #else
1776 #error this should not happen
1777 #endif /* CONFIG_PAGING_LEVELS */
1779 #endif /* SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS */
1781 /**************************************************************************/
1782 /* These functions also take a virtual address and return the level-N
1783 * shadow table mfn and entry, but they create the shadow pagetables if
1784 * they are needed. The "demand" argument is non-zero when handling
1785 * a demand fault (so we know what to do about accessed bits &c).
1786 * If the necessary tables are not present in the guest, they return NULL. */
1788 /* N.B. The use of GUEST_PAGING_LEVELS here is correct. If the shadow has
1789 * more levels than the guest, the upper levels are always fixed and do not
1790 * reflect any information from the guest, so we do not use these functions
1791 * to access them. */
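/* A condensed, hedged outline of the pattern every function below follows
 * (fetch-type details and error paths trimmed):
 *
 *   if the upper-level shadow entry is already _PAGE_PRESENT:
 *       take the lower shadow mfn straight from that entry;
 *   else:
 *       look the lower shadow up with get_shadow_status(); if there is
 *       none, create it with sh_make_shadow(); build the new upper entry
 *       with lNe_propagate_from_guest(); install it with shadow_set_lNe()
 *       and bail out on SHADOW_SET_ERROR;
 *   finally, return a pointer to the level-N entry for gw->va through the
 *   shadow linear map. */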
1793 #if GUEST_PAGING_LEVELS >= 4
1794 static shadow_l4e_t * shadow_get_and_create_l4e(struct vcpu *v,
1795 walk_t *gw,
1796 mfn_t *sl4mfn)
1798 /* There is always a shadow of the top level table. Get it. */
1799 *sl4mfn = pagetable_get_mfn(v->arch.shadow_table[0]);
1800 /* Reading the top level table is always valid. */
1801 return sh_linear_l4_table(v) + shadow_l4_linear_offset(gw->va);
1804 static shadow_l3e_t * shadow_get_and_create_l3e(struct vcpu *v,
1805 walk_t *gw,
1806 mfn_t *sl3mfn,
1807 fetch_type_t ft)
1809 mfn_t sl4mfn;
1810 shadow_l4e_t *sl4e;
1811 if ( !mfn_valid(gw->l3mfn) ) return NULL; /* No guest page. */
1812 /* Get the l4e */
1813 sl4e = shadow_get_and_create_l4e(v, gw, &sl4mfn);
1814 ASSERT(sl4e != NULL);
1815 if ( shadow_l4e_get_flags(*sl4e) & _PAGE_PRESENT )
1817 *sl3mfn = shadow_l4e_get_mfn(*sl4e);
1818 ASSERT(mfn_valid(*sl3mfn));
1820 else
1822 int r;
1823 shadow_l4e_t new_sl4e;
1824 /* No l3 shadow installed: find and install it. */
1825 *sl3mfn = get_shadow_status(v, gw->l3mfn, SH_type_l3_shadow);
1826 if ( !mfn_valid(*sl3mfn) )
1828 /* No l3 shadow of this page exists at all: make one. */
1829 *sl3mfn = sh_make_shadow(v, gw->l3mfn, SH_type_l3_shadow);
1831 /* Install the new sl3 table in the sl4e */
1832 l4e_propagate_from_guest(v, gw->l4e, gw->l4mfn,
1833 *sl3mfn, &new_sl4e, ft);
1834 r = shadow_set_l4e(v, sl4e, new_sl4e, sl4mfn);
1835 ASSERT((r & SHADOW_SET_FLUSH) == 0);
1836 if ( r & SHADOW_SET_ERROR )
1837 return NULL;
1839 /* Now follow it down a level. Guaranteed to succeed. */
1840 return sh_linear_l3_table(v) + shadow_l3_linear_offset(gw->va);
1842 #endif /* GUEST_PAGING_LEVELS >= 4 */
1845 static shadow_l2e_t * shadow_get_and_create_l2e(struct vcpu *v,
1846 walk_t *gw,
1847 mfn_t *sl2mfn,
1848 fetch_type_t ft)
1850 #if GUEST_PAGING_LEVELS >= 4 /* 64bit... */
1851 mfn_t sl3mfn = _mfn(INVALID_MFN);
1852 shadow_l3e_t *sl3e;
1853 if ( !mfn_valid(gw->l2mfn) ) return NULL; /* No guest page. */
1854 /* Get the l3e */
1855 sl3e = shadow_get_and_create_l3e(v, gw, &sl3mfn, ft);
1856 if ( sl3e == NULL ) return NULL;
1857 if ( shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT )
1859 *sl2mfn = shadow_l3e_get_mfn(*sl3e);
1860 ASSERT(mfn_valid(*sl2mfn));
1862 else
1864 int r;
1865 shadow_l3e_t new_sl3e;
1866 unsigned int t = SH_type_l2_shadow;
1868 /* Tag compat L2 containing hypervisor (m2p) mappings */
1869 if ( is_pv_32on64_domain(v->domain) &&
1870 guest_l4_table_offset(gw->va) == 0 &&
1871 guest_l3_table_offset(gw->va) == 3 )
1872 t = SH_type_l2h_shadow;
1874 /* No l2 shadow installed: find and install it. */
1875 *sl2mfn = get_shadow_status(v, gw->l2mfn, t);
1876 if ( !mfn_valid(*sl2mfn) )
1878 /* No l2 shadow of this page exists at all: make one. */
1879 *sl2mfn = sh_make_shadow(v, gw->l2mfn, t);
1881 /* Install the new sl2 table in the sl3e */
1882 l3e_propagate_from_guest(v, gw->l3e, gw->l3mfn,
1883 *sl2mfn, &new_sl3e, ft);
1884 r = shadow_set_l3e(v, sl3e, new_sl3e, sl3mfn);
1885 ASSERT((r & SHADOW_SET_FLUSH) == 0);
1886 if ( r & SHADOW_SET_ERROR )
1887 return NULL;
1889 /* Now follow it down a level. Guaranteed to succeed. */
1890 return sh_linear_l2_table(v) + shadow_l2_linear_offset(gw->va);
1891 #elif GUEST_PAGING_LEVELS == 3 /* PAE... */
1892 /* We never demand-shadow PAE l3es: they are only created in
1893 * sh_update_cr3(). Check if the relevant sl3e is present. */
1894 shadow_l3e_t *sl3e = ((shadow_l3e_t *)&v->arch.paging.shadow.l3table)
1895 + shadow_l3_linear_offset(gw->va);
1896 if ( !(shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT) )
1897 return NULL;
1898 *sl2mfn = shadow_l3e_get_mfn(*sl3e);
1899 ASSERT(mfn_valid(*sl2mfn));
1900 return sh_linear_l2_table(v) + shadow_l2_linear_offset(gw->va);
1901 #else /* 32bit... */
1902 /* There is always a shadow of the top level table. Get it. */
1903 *sl2mfn = pagetable_get_mfn(v->arch.shadow_table[0]);
1904 /* This next line is important: the guest l2 has a 16k
1905 * shadow, so we need to return the right mfn of the four. This
1906 * call will set it for us as a side-effect. */
1907 (void) shadow_l2_index(sl2mfn, guest_index(gw->l2e));
1908 /* Reading the top level table is always valid. */
1909 return sh_linear_l2_table(v) + shadow_l2_linear_offset(gw->va);
1910 #endif
1914 static shadow_l1e_t * shadow_get_and_create_l1e(struct vcpu *v,
1915 walk_t *gw,
1916 mfn_t *sl1mfn,
1917 fetch_type_t ft)
1919 mfn_t sl2mfn;
1920 shadow_l2e_t *sl2e;
1922 /* Get the l2e */
1923 sl2e = shadow_get_and_create_l2e(v, gw, &sl2mfn, ft);
1924 if ( sl2e == NULL ) return NULL;
1925 /* Install the sl1 in the l2e if it wasn't there or if we need to
1926 * re-do it to fix a PSE dirty bit. */
1927 if ( shadow_l2e_get_flags(*sl2e) & _PAGE_PRESENT
1928 && likely(ft != ft_demand_write
1929 || (guest_l2e_get_flags(*gw->l2e) & _PAGE_DIRTY)
1930 || !(guest_l2e_get_flags(*gw->l2e) & _PAGE_PSE)) )
1932 *sl1mfn = shadow_l2e_get_mfn(*sl2e);
1933 ASSERT(mfn_valid(*sl1mfn));
1935 else
1937 shadow_l2e_t new_sl2e;
1938 int r, flags = guest_l2e_get_flags(*gw->l2e);
1939 /* No l1 shadow installed: find and install it. */
1940 if ( !(flags & _PAGE_PRESENT) )
1941 return NULL; /* No guest page. */
1942 if ( guest_supports_superpages(v) && (flags & _PAGE_PSE) )
1944 /* Splintering a superpage */
1945 gfn_t l2gfn = guest_l2e_get_gfn(*gw->l2e);
1946 *sl1mfn = get_fl1_shadow_status(v, l2gfn);
1947 if ( !mfn_valid(*sl1mfn) )
1949 /* No fl1 shadow of this superpage exists at all: make one. */
1950 *sl1mfn = make_fl1_shadow(v, l2gfn);
1953 else
1955 /* Shadowing an actual guest l1 table */
1956 if ( !mfn_valid(gw->l1mfn) ) return NULL; /* No guest page. */
1957 *sl1mfn = get_shadow_status(v, gw->l1mfn, SH_type_l1_shadow);
1958 if ( !mfn_valid(*sl1mfn) )
1960 /* No l1 shadow of this page exists at all: make one. */
1961 *sl1mfn = sh_make_shadow(v, gw->l1mfn, SH_type_l1_shadow);
1964 /* Install the new sl1 table in the sl2e */
1965 l2e_propagate_from_guest(v, gw->l2e, gw->l2mfn,
1966 *sl1mfn, &new_sl2e, ft);
1967 r = shadow_set_l2e(v, sl2e, new_sl2e, sl2mfn);
1968 ASSERT((r & SHADOW_SET_FLUSH) == 0);
1969 if ( r & SHADOW_SET_ERROR )
1970 return NULL;
1971 /* This next line is important: in 32-on-PAE and 32-on-64 modes,
1972 * the guest l1 table has an 8k shadow, and we need to return
1973 * the right mfn of the pair. This call will set it for us as a
1974 * side-effect. (In all other cases, it's a no-op and will be
1975 * compiled out.) */
1976 (void) shadow_l1_index(sl1mfn, guest_l1_table_offset(gw->va));
1978 /* Now follow it down a level. Guaranteed to succeed. */
1979 return sh_linear_l1_table(v) + shadow_l1_linear_offset(gw->va);
1984 /**************************************************************************/
1985 /* Destructors for shadow tables:
1986 * Unregister the shadow, decrement refcounts of any entries present in it,
1987 * and release the memory.
1989 * N.B. These destructors do not clear the contents of the shadows.
1990 * This allows us to delay TLB shootdowns until the page is being reused.
1991 * See shadow_alloc() and shadow_free() for how this is handled.
1992 */
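/* A minimal sketch of the back-reference that the destructors below pass
 * to sh_put_ref(): the physical address of the referencing shadow entry,
 * i.e. the shadow table's mfn shifted by PAGE_SHIFT plus the entry's byte
 * offset within its page.  It mirrors the expressions used below and is
 * not called anywhere. */
#if 0
static paddr_t entry_backref_sketch(mfn_t table_mfn, void *entry_ptr)
{
    return (((paddr_t)mfn_x(table_mfn)) << PAGE_SHIFT)
           | ((unsigned long)entry_ptr & ~PAGE_MASK);
}
#endif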
1994 #if GUEST_PAGING_LEVELS >= 4
1995 void sh_destroy_l4_shadow(struct vcpu *v, mfn_t smfn)
1997 shadow_l4e_t *sl4e;
1998 u32 t = mfn_to_shadow_page(smfn)->type;
1999 mfn_t gmfn, sl4mfn;
2001 SHADOW_DEBUG(DESTROY_SHADOW,
2002 "%s(%05lx)\n", __func__, mfn_x(smfn));
2003 ASSERT(t == SH_type_l4_shadow);
2005 /* Record that the guest page isn't shadowed any more (in this type) */
2006 gmfn = _mfn(mfn_to_shadow_page(smfn)->backpointer);
2007 delete_shadow_status(v, gmfn, t, smfn);
2008 shadow_demote(v, gmfn, t);
2009 /* Decrement refcounts of all the old entries */
2010 sl4mfn = smfn;
2011 SHADOW_FOREACH_L4E(sl4mfn, sl4e, 0, 0, v->domain, {
2012 if ( shadow_l4e_get_flags(*sl4e) & _PAGE_PRESENT )
2014 sh_put_ref(v, shadow_l4e_get_mfn(*sl4e),
2015 (((paddr_t)mfn_x(sl4mfn)) << PAGE_SHIFT)
2016 | ((unsigned long)sl4e & ~PAGE_MASK));
2018 });
2020 /* Put the memory back in the pool */
2021 shadow_free(v->domain, smfn);
2024 void sh_destroy_l3_shadow(struct vcpu *v, mfn_t smfn)
2026 shadow_l3e_t *sl3e;
2027 u32 t = mfn_to_shadow_page(smfn)->type;
2028 mfn_t gmfn, sl3mfn;
2030 SHADOW_DEBUG(DESTROY_SHADOW,
2031 "%s(%05lx)\n", __func__, mfn_x(smfn));
2032 ASSERT(t == SH_type_l3_shadow);
2034 /* Record that the guest page isn't shadowed any more (in this type) */
2035 gmfn = _mfn(mfn_to_shadow_page(smfn)->backpointer);
2036 delete_shadow_status(v, gmfn, t, smfn);
2037 shadow_demote(v, gmfn, t);
2039 /* Decrement refcounts of all the old entries */
2040 sl3mfn = smfn;
2041 SHADOW_FOREACH_L3E(sl3mfn, sl3e, 0, 0, {
2042 if ( shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT )
2043 sh_put_ref(v, shadow_l3e_get_mfn(*sl3e),
2044 (((paddr_t)mfn_x(sl3mfn)) << PAGE_SHIFT)
2045 | ((unsigned long)sl3e & ~PAGE_MASK));
2046 });
2048 /* Put the memory back in the pool */
2049 shadow_free(v->domain, smfn);
2051 #endif /* GUEST_PAGING_LEVELS >= 4 */
2054 void sh_destroy_l2_shadow(struct vcpu *v, mfn_t smfn)
2056 shadow_l2e_t *sl2e;
2057 u32 t = mfn_to_shadow_page(smfn)->type;
2058 mfn_t gmfn, sl2mfn;
2060 SHADOW_DEBUG(DESTROY_SHADOW,
2061 "%s(%05lx)\n", __func__, mfn_x(smfn));
2063 #if GUEST_PAGING_LEVELS >= 3
2064 ASSERT(t == SH_type_l2_shadow || t == SH_type_l2h_shadow);
2065 #else
2066 ASSERT(t == SH_type_l2_shadow);
2067 #endif
2069 /* Record that the guest page isn't shadowed any more (in this type) */
2070 gmfn = _mfn(mfn_to_shadow_page(smfn)->backpointer);
2071 delete_shadow_status(v, gmfn, t, smfn);
2072 shadow_demote(v, gmfn, t);
2074 /* Decrement refcounts of all the old entries */
2075 sl2mfn = smfn;
2076 SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, 0, v->domain, {
2077 if ( shadow_l2e_get_flags(*sl2e) & _PAGE_PRESENT )
2078 sh_put_ref(v, shadow_l2e_get_mfn(*sl2e),
2079 (((paddr_t)mfn_x(sl2mfn)) << PAGE_SHIFT)
2080 | ((unsigned long)sl2e & ~PAGE_MASK));
2081 });
2083 /* Put the memory back in the pool */
2084 shadow_free(v->domain, smfn);
2087 void sh_destroy_l1_shadow(struct vcpu *v, mfn_t smfn)
2089 struct domain *d = v->domain;
2090 shadow_l1e_t *sl1e;
2091 u32 t = mfn_to_shadow_page(smfn)->type;
2093 SHADOW_DEBUG(DESTROY_SHADOW,
2094 "%s(%05lx)\n", __func__, mfn_x(smfn));
2095 ASSERT(t == SH_type_l1_shadow || t == SH_type_fl1_shadow);
2097 /* Record that the guest page isn't shadowed any more (in this type) */
2098 if ( t == SH_type_fl1_shadow )
2100 gfn_t gfn = _gfn(mfn_to_shadow_page(smfn)->backpointer);
2101 delete_fl1_shadow_status(v, gfn, smfn);
2103 else
2105 mfn_t gmfn = _mfn(mfn_to_shadow_page(smfn)->backpointer);
2106 delete_shadow_status(v, gmfn, t, smfn);
2107 shadow_demote(v, gmfn, t);
2110 if ( shadow_mode_refcounts(d) )
2112 /* Decrement refcounts of all the old entries */
2113 mfn_t sl1mfn = smfn;
2114 SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, 0, {
2115 if ( (shadow_l1e_get_flags(*sl1e) & _PAGE_PRESENT)
2116 && !sh_l1e_is_magic(*sl1e) )
2117 shadow_put_page_from_l1e(*sl1e, d);
2118 });
2121 /* Put the memory back in the pool */
2122 shadow_free(v->domain, smfn);
2125 #if SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS
2126 void sh_destroy_monitor_table(struct vcpu *v, mfn_t mmfn)
2128 struct domain *d = v->domain;
2129 ASSERT(mfn_to_shadow_page(mmfn)->type == SH_type_monitor_table);
2131 #if (CONFIG_PAGING_LEVELS == 4) && (SHADOW_PAGING_LEVELS != 4)
2132 /* Need to destroy the l3 monitor page in slot 0 too */
2134 mfn_t m3mfn;
2135 l4_pgentry_t *l4e = sh_map_domain_page(mmfn);
2136 ASSERT(l4e_get_flags(l4e[0]) & _PAGE_PRESENT);
2137 m3mfn = _mfn(l4e_get_pfn(l4e[0]));
2138 if ( is_pv_32on64_vcpu(v) )
2140 /* Need to destroy the l2 monitor page in slot 3 too */
2141 l3_pgentry_t *l3e = sh_map_domain_page(m3mfn);
2142 ASSERT(l3e_get_flags(l3e[3]) & _PAGE_PRESENT);
2143 shadow_free(d, _mfn(l3e_get_pfn(l3e[3])));
2144 sh_unmap_domain_page(l3e);
2146 shadow_free(d, m3mfn);
2147 sh_unmap_domain_page(l4e);
2149 #elif CONFIG_PAGING_LEVELS == 3
2150 /* Need to destroy the l2 monitor page in slot 3 too */
2152 l3_pgentry_t *l3e = sh_map_domain_page(mmfn);
2153 ASSERT(l3e_get_flags(l3e[3]) & _PAGE_PRESENT);
2154 shadow_free(d, _mfn(l3e_get_pfn(l3e[3])));
2155 sh_unmap_domain_page(l3e);
2157 #endif
2159 /* Put the memory back in the pool */
2160 shadow_free(d, mmfn);
2162 #endif
2164 /**************************************************************************/
2165 /* Functions to destroy non-Xen mappings in a pagetable hierarchy.
2166 * These are called from common code when we are running out of shadow
2167 * memory, and unpinning all the top-level shadows hasn't worked.
2169 * This implementation is pretty crude and slow, but we hope that it won't
2170 * be called very often. */
2172 #if GUEST_PAGING_LEVELS == 2
2174 void sh_unhook_32b_mappings(struct vcpu *v, mfn_t sl2mfn)
2176 shadow_l2e_t *sl2e;
2177 SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, 0, v->domain, {
2178 (void) shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn);
2179 });
2182 #elif GUEST_PAGING_LEVELS == 3
2184 void sh_unhook_pae_mappings(struct vcpu *v, mfn_t sl2mfn)
2185 /* Walk a PAE l2 shadow, unhooking entries from all the subshadows */
2187 shadow_l2e_t *sl2e;
2188 SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, 0, v->domain, {
2189 (void) shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn);
2190 });
2193 #elif GUEST_PAGING_LEVELS == 4
2195 void sh_unhook_64b_mappings(struct vcpu *v, mfn_t sl4mfn)
2197 shadow_l4e_t *sl4e;
2198 SHADOW_FOREACH_L4E(sl4mfn, sl4e, 0, 0, v->domain, {
2199 (void) shadow_set_l4e(v, sl4e, shadow_l4e_empty(), sl4mfn);
2200 });
2203 #endif
2205 /**************************************************************************/
2206 /* Internal translation functions.
2207 * These functions require a pointer to the shadow entry that will be updated.
2208 */
2210 /* These functions take a new guest entry, translate it to shadow and write
2211 * the shadow entry.
2213 * They return the same bitmaps as the shadow_set_lXe() functions.
2214 */
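/* A hedged usage sketch for the validate_glNe() return value: callers are
 * expected to test the SHADOW_SET_* bits rather than treat it as a
 * boolean.  The exact flush mechanism depends on the caller and is not
 * shown here. */
#if 0
    int r = validate_gl2e(v, new_ge, sl2mfn, se);
    if ( r & SHADOW_SET_ERROR )
        { /* propagation failed; the guest entry is left unshadowed */ }
    if ( r & SHADOW_SET_FLUSH )
        { /* mappings changed: the caller must arrange a TLB flush */ }
#endif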
2216 #if GUEST_PAGING_LEVELS >= 4
2217 static int validate_gl4e(struct vcpu *v, void *new_ge, mfn_t sl4mfn, void *se)
2219 shadow_l4e_t new_sl4e;
2220 guest_l4e_t *new_gl4e = new_ge;
2221 shadow_l4e_t *sl4p = se;
2222 mfn_t sl3mfn = _mfn(INVALID_MFN);
2223 struct domain *d = v->domain;
2224 p2m_type_t p2mt;
2225 int result = 0;
2227 perfc_incr(shadow_validate_gl4e_calls);
2229 if ( guest_l4e_get_flags(*new_gl4e) & _PAGE_PRESENT )
2231 gfn_t gl3gfn = guest_l4e_get_gfn(*new_gl4e);
2232 mfn_t gl3mfn = gfn_to_mfn(d, gl3gfn, &p2mt);
2233 if ( p2m_is_ram(p2mt) )
2234 sl3mfn = get_shadow_status(v, gl3mfn, SH_type_l3_shadow);
2235 else
2236 result |= SHADOW_SET_ERROR;
2238 l4e_propagate_from_guest(v, new_gl4e, _mfn(INVALID_MFN),
2239 sl3mfn, &new_sl4e, ft_prefetch);
2241 // check for updates to xen reserved slots
2242 if ( !shadow_mode_external(d) )
2244 int shadow_index = (((unsigned long)sl4p & ~PAGE_MASK) /
2245 sizeof(shadow_l4e_t));
2246 int reserved_xen_slot = !is_guest_l4_slot(d, shadow_index);
2248 if ( unlikely(reserved_xen_slot) )
2250 // attempt by the guest to write to a xen reserved slot
2251 //
2252 SHADOW_PRINTK("%s out-of-range update "
2253 "sl4mfn=%05lx index=0x%x val=%" SH_PRI_pte "\n",
2254 __func__, mfn_x(sl4mfn), shadow_index, new_sl4e.l4);
2255 if ( shadow_l4e_get_flags(new_sl4e) & _PAGE_PRESENT )
2257 SHADOW_ERROR("out-of-range l4e update\n");
2258 result |= SHADOW_SET_ERROR;
2261 // do not call shadow_set_l4e...
2262 return result;
2266 result |= shadow_set_l4e(v, sl4p, new_sl4e, sl4mfn);
2267 return result;
2271 static int validate_gl3e(struct vcpu *v, void *new_ge, mfn_t sl3mfn, void *se)
2273 shadow_l3e_t new_sl3e;
2274 guest_l3e_t *new_gl3e = new_ge;
2275 shadow_l3e_t *sl3p = se;
2276 mfn_t sl2mfn = _mfn(INVALID_MFN);
2277 p2m_type_t p2mt;
2278 int result = 0;
2280 perfc_incr(shadow_validate_gl3e_calls);
2282 if ( guest_l3e_get_flags(*new_gl3e) & _PAGE_PRESENT )
2284 gfn_t gl2gfn = guest_l3e_get_gfn(*new_gl3e);
2285 mfn_t gl2mfn = gfn_to_mfn(v->domain, gl2gfn, &p2mt);
2286 if ( p2m_is_ram(p2mt) )
2287 sl2mfn = get_shadow_status(v, gl2mfn, SH_type_l2_shadow);
2288 else
2289 result |= SHADOW_SET_ERROR;
2291 l3e_propagate_from_guest(v, new_gl3e, _mfn(INVALID_MFN),
2292 sl2mfn, &new_sl3e, ft_prefetch);
2293 result |= shadow_set_l3e(v, sl3p, new_sl3e, sl3mfn);
2295 return result;
2297 #endif // GUEST_PAGING_LEVELS >= 4
2299 static int validate_gl2e(struct vcpu *v, void *new_ge, mfn_t sl2mfn, void *se)
2301 shadow_l2e_t new_sl2e;
2302 guest_l2e_t *new_gl2e = new_ge;
2303 shadow_l2e_t *sl2p = se;
2304 mfn_t sl1mfn = _mfn(INVALID_MFN);
2305 p2m_type_t p2mt;
2306 int result = 0;
2308 perfc_incr(shadow_validate_gl2e_calls);
2310 if ( guest_l2e_get_flags(*new_gl2e) & _PAGE_PRESENT )
2312 gfn_t gl1gfn = guest_l2e_get_gfn(*new_gl2e);
2313 if ( guest_supports_superpages(v) &&
2314 (guest_l2e_get_flags(*new_gl2e) & _PAGE_PSE) )
2316 // superpage -- need to look up the shadow L1 which holds the
2317 // splitters...
2318 sl1mfn = get_fl1_shadow_status(v, gl1gfn);
2319 #if 0
2320 // XXX - it's possible that we want to do some kind of prefetch
2321 // for superpage fl1's here, but this is *not* on the demand path,
2322 // so we'll hold off trying that for now...
2323 //
2324 if ( !mfn_valid(sl1mfn) )
2325 sl1mfn = make_fl1_shadow(v, gl1gfn);
2326 #endif
2328 else
2330 mfn_t gl1mfn = gfn_to_mfn(v->domain, gl1gfn, &p2mt);
2331 if ( p2m_is_ram(p2mt) )
2332 sl1mfn = get_shadow_status(v, gl1mfn, SH_type_l1_shadow);
2333 else
2334 result |= SHADOW_SET_ERROR;
2337 l2e_propagate_from_guest(v, new_gl2e, _mfn(INVALID_MFN),
2338 sl1mfn, &new_sl2e, ft_prefetch);
2340 // check for updates to xen reserved slots in PV guests...
2341 // XXX -- need to revisit this for PV 3-on-4 guests.
2342 //
2343 #if SHADOW_PAGING_LEVELS < 4
2344 #if CONFIG_PAGING_LEVELS == SHADOW_PAGING_LEVELS
2345 if ( !shadow_mode_external(v->domain) )
2347 int shadow_index = (((unsigned long)sl2p & ~PAGE_MASK) /
2348 sizeof(shadow_l2e_t));
2349 int reserved_xen_slot;
2351 #if SHADOW_PAGING_LEVELS == 3
2352 reserved_xen_slot =
2353 ((mfn_to_shadow_page(sl2mfn)->type == SH_type_l2h_pae_shadow) &&
2354 (shadow_index
2355 >= (L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1))));
2356 #else /* SHADOW_PAGING_LEVELS == 2 */
2357 reserved_xen_slot = (shadow_index >= L2_PAGETABLE_FIRST_XEN_SLOT);
2358 #endif
2360 if ( unlikely(reserved_xen_slot) )
2362 // attempt by the guest to write to a xen reserved slot
2363 //
2364 SHADOW_PRINTK("%s out-of-range update "
2365 "sl2mfn=%05lx index=0x%x val=%" SH_PRI_pte "\n",
2366 __func__, mfn_x(sl2mfn), shadow_index, new_sl2e.l2);
2367 if ( shadow_l2e_get_flags(new_sl2e) & _PAGE_PRESENT )
2369 SHADOW_ERROR("out-of-range l2e update\n");
2370 result |= SHADOW_SET_ERROR;
2373 // do not call shadow_set_l2e...
2374 return result;
2377 #endif /* CONFIG_PAGING_LEVELS == SHADOW_PAGING_LEVELS */
2378 #endif /* SHADOW_PAGING_LEVELS < 4 */
2380 result |= shadow_set_l2e(v, sl2p, new_sl2e, sl2mfn);
2382 return result;
2385 static int validate_gl1e(struct vcpu *v, void *new_ge, mfn_t sl1mfn, void *se)
2387 shadow_l1e_t new_sl1e;
2388 guest_l1e_t *new_gl1e = new_ge;
2389 shadow_l1e_t *sl1p = se;
2390 gfn_t gfn;
2391 mfn_t gmfn;
2392 p2m_type_t p2mt;
2393 int result = 0;
2395 perfc_incr(shadow_validate_gl1e_calls);
2397 gfn = guest_l1e_get_gfn(*new_gl1e);
2398 gmfn = gfn_to_mfn(v->domain, gfn, &p2mt);
2400 l1e_propagate_from_guest(v, new_gl1e, _mfn(INVALID_MFN), gmfn, &new_sl1e,
2401 ft_prefetch, p2mt);
2403 result |= shadow_set_l1e(v, sl1p, new_sl1e, sl1mfn);
2404 return result;
2408 /**************************************************************************/
2409 /* Functions which translate and install the shadows of arbitrary guest
2410 * entries that we have just seen the guest write. */
2413 static inline int
2414 sh_map_and_validate(struct vcpu *v, mfn_t gmfn,
2415 void *new_gp, u32 size, u32 sh_type,
2416 u32 (*shadow_index)(mfn_t *smfn, u32 idx),
2417 int (*validate_ge)(struct vcpu *v, void *ge,
2418 mfn_t smfn, void *se))
2419 /* Generic function for mapping and validating. */
2421 mfn_t smfn, smfn2, map_mfn;
2422 shadow_l1e_t *sl1p;
2423 u32 shadow_idx, guest_idx;
2424 int result = 0;
2426 /* Align address and size to guest entry boundaries */
2427 size += (unsigned long)new_gp & (sizeof (guest_l1e_t) - 1);
2428 new_gp = (void *)((unsigned long)new_gp & ~(sizeof (guest_l1e_t) - 1));
2429 size = (size + sizeof (guest_l1e_t) - 1) & ~(sizeof (guest_l1e_t) - 1);
2430 ASSERT(size + (((unsigned long)new_gp) & ~PAGE_MASK) <= PAGE_SIZE);
2432 /* Map the shadow page */
2433 smfn = get_shadow_status(v, gmfn, sh_type);
2434 ASSERT(mfn_valid(smfn)); /* Otherwise we would not have been called */
2435 guest_idx = guest_index(new_gp);
2436 map_mfn = smfn;
2437 shadow_idx = shadow_index(&map_mfn, guest_idx);
2438 sl1p = map_shadow_page(map_mfn);
2440 /* Validate one entry at a time */
2441 while ( size )
2443 smfn2 = smfn;
2444 guest_idx = guest_index(new_gp);
2445 shadow_idx = shadow_index(&smfn2, guest_idx);
2446 if ( mfn_x(smfn2) != mfn_x(map_mfn) )
2448 /* We have moved to another page of the shadow */
2449 map_mfn = smfn2;
2450 unmap_shadow_page(sl1p);
2451 sl1p = map_shadow_page(map_mfn);
2453 result |= validate_ge(v,
2454 new_gp,
2455 map_mfn,
2456 &sl1p[shadow_idx]);
2457 size -= sizeof(guest_l1e_t);
2458 new_gp += sizeof(guest_l1e_t);
2460 unmap_shadow_page(sl1p);
2461 return result;
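/* Worked example of the alignment step at the top of sh_map_and_validate(),
 * with 8-byte guest entries: a 5-byte write starting at page offset 0x103
 * becomes
 *   size   = 5 + (0x103 & 7)      = 8
 *   new_gp = 0x103 & ~7           = 0x100
 *   size   = (8 + 7) & ~7         = 8
 * so exactly one guest entry (the one at offset 0x100) is re-validated. */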
2465 int
2466 sh_map_and_validate_gl4e(struct vcpu *v, mfn_t gl4mfn,
2467 void *new_gl4p, u32 size)
2469 #if GUEST_PAGING_LEVELS >= 4
2470 return sh_map_and_validate(v, gl4mfn, new_gl4p, size,
2471 SH_type_l4_shadow,
2472 shadow_l4_index,
2473 validate_gl4e);
2474 #else // ! GUEST_PAGING_LEVELS >= 4
2475 SHADOW_PRINTK("called in wrong paging mode!\n");
2476 BUG();
2477 return 0;
2478 #endif
2481 int
2482 sh_map_and_validate_gl3e(struct vcpu *v, mfn_t gl3mfn,
2483 void *new_gl3p, u32 size)
2485 #if GUEST_PAGING_LEVELS >= 4
2486 return sh_map_and_validate(v, gl3mfn, new_gl3p, size,
2487 SH_type_l3_shadow,
2488 shadow_l3_index,
2489 validate_gl3e);
2490 #else // ! GUEST_PAGING_LEVELS >= 4
2491 SHADOW_PRINTK("called in wrong paging mode!\n");
2492 BUG();
2493 return 0;
2494 #endif
2497 int
2498 sh_map_and_validate_gl2e(struct vcpu *v, mfn_t gl2mfn,
2499 void *new_gl2p, u32 size)
2501 return sh_map_and_validate(v, gl2mfn, new_gl2p, size,
2502 SH_type_l2_shadow,
2503 shadow_l2_index,
2504 validate_gl2e);
2507 int
2508 sh_map_and_validate_gl2he(struct vcpu *v, mfn_t gl2mfn,
2509 void *new_gl2p, u32 size)
2511 #if GUEST_PAGING_LEVELS >= 3
2512 return sh_map_and_validate(v, gl2mfn, new_gl2p, size,
2513 SH_type_l2h_shadow,
2514 shadow_l2_index,
2515 validate_gl2e);
2516 #else /* Non-PAE guests don't have different kinds of l2 table */
2517 SHADOW_PRINTK("called in wrong paging mode!\n");
2518 BUG();
2519 return 0;
2520 #endif
2523 int
2524 sh_map_and_validate_gl1e(struct vcpu *v, mfn_t gl1mfn,
2525 void *new_gl1p, u32 size)
2527 return sh_map_and_validate(v, gl1mfn, new_gl1p, size,
2528 SH_type_l1_shadow,
2529 shadow_l1_index,
2530 validate_gl1e);
2534 /**************************************************************************/
2535 /* Optimization: If we see two emulated writes of zeros to the same
2536 * page-table without another kind of page fault in between, we guess
2537 * that this is a batch of changes (for process destruction) and
2538 * unshadow the page so we don't take a pagefault on every entry. This
2539 * should also make finding writeable mappings of pagetables much
2540 * easier. */
2542 /* Look to see if this is the second emulated write in a row to this
2543 * page, and unshadow/unhook if it is */
2544 static inline void check_for_early_unshadow(struct vcpu *v, mfn_t gmfn)
2546 #if SHADOW_OPTIMIZATIONS & SHOPT_EARLY_UNSHADOW
2547 if ( v->arch.paging.shadow.last_emulated_mfn == mfn_x(gmfn) &&
2548 sh_mfn_is_a_page_table(gmfn) )
2550 u32 flags = mfn_to_page(gmfn)->shadow_flags;
2551 if ( !(flags & (SHF_L2_32|SHF_L2_PAE|SHF_L2H_PAE|SHF_L4_64)) )
2553 perfc_incr(shadow_early_unshadow);
2554 sh_remove_shadows(v, gmfn, 0, 0 /* Slow, can fail to unshadow */ );
2557 v->arch.paging.shadow.last_emulated_mfn = mfn_x(gmfn);
2558 #endif
2561 /* Stop counting towards early unshadows, as we've seen a real page fault */
2562 static inline void reset_early_unshadow(struct vcpu *v)
2564 #if SHADOW_OPTIMIZATIONS & SHOPT_EARLY_UNSHADOW
2565 v->arch.paging.shadow.last_emulated_mfn = INVALID_MFN;
2566 #endif
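/* A hedged sketch of how the two helpers above are meant to cooperate:
 * the emulated pagetable-write handlers call check_for_early_unshadow()
 * before each write, and every other fault path calls
 * reset_early_unshadow() (as sh_page_fault() below does), so only
 * back-to-back emulated writes to the same gmfn trigger the early
 * unshadow. */
#if 0
    /* on an emulated pagetable write to gmfn: */
    check_for_early_unshadow(v, gmfn);  /* second hit in a row => unshadow */

    /* on any other kind of fault: */
    reset_early_unshadow(v);            /* start counting from scratch */
#endif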
2571 /**************************************************************************/
2572 /* Optimization: Prefetch multiple L1 entries. This is called after we have
2573 * demand-faulted a shadow l1e in the fault handler, to see if it's
2574 * worth fetching some more.
2575 */
2577 #if SHADOW_OPTIMIZATIONS & SHOPT_PREFETCH
2579 /* XXX magic number */
2580 #define PREFETCH_DISTANCE 32
2582 static void sh_prefetch(struct vcpu *v, walk_t *gw,
2583 shadow_l1e_t *ptr_sl1e, mfn_t sl1mfn)
2585 int i, dist;
2586 gfn_t gfn;
2587 mfn_t gmfn;
2588 guest_l1e_t gl1e;
2589 shadow_l1e_t sl1e;
2590 u32 gflags;
2591 p2m_type_t p2mt;
2593 /* Prefetch no further than the end of the _shadow_ l1 MFN */
2594 dist = (PAGE_SIZE - ((unsigned long)ptr_sl1e & ~PAGE_MASK)) / sizeof sl1e;
2595 /* And no more than a maximum fetches-per-fault */
2596 if ( dist > PREFETCH_DISTANCE )
2597 dist = PREFETCH_DISTANCE;
2599 for ( i = 1; i < dist ; i++ )
2601 /* No point in prefetching if there's already a shadow */
2602 if ( ptr_sl1e[i].l1 != 0 )
2603 break;
2605 if ( gw->l1e )
2607 /* Normal guest page; grab the next guest entry */
2608 gl1e = gw->l1e[i];
2609 /* Not worth continuing if we hit an entry that will need another
2610 * fault for A/D-bit propagation anyway */
2611 gflags = guest_l1e_get_flags(gl1e);
2612 if ( (gflags & _PAGE_PRESENT)
2613 && (!(gflags & _PAGE_ACCESSED)
2614 || ((gflags & _PAGE_RW) && !(gflags & _PAGE_DIRTY))) )
2615 break;
2617 else
2619 /* Fragmented superpage, unless we've been called wrongly */
2620 ASSERT(guest_l2e_get_flags(*gw->l2e) & _PAGE_PSE);
2621 /* Increment the l1e's GFN by the right number of guest pages */
2622 gl1e = guest_l1e_from_gfn(
2623 _gfn(gfn_x(guest_l1e_get_gfn(gw->eff_l1e)) + i),
2624 guest_l1e_get_flags(gw->eff_l1e));
2627 /* Look at the gfn that the l1e is pointing at */
2628 gfn = guest_l1e_get_gfn(gl1e);
2629 gmfn = gfn_to_mfn(v->domain, gfn, &p2mt);
2631 /* Propagate the entry. Safe to use a pointer to our local
2632 * gl1e, since this is not a demand-fetch so there will be no
2633 * write-back to the guest. */
2634 l1e_propagate_from_guest(v, &gl1e, _mfn(INVALID_MFN),
2635 gmfn, &sl1e, ft_prefetch, p2mt);
2636 (void) shadow_set_l1e(v, ptr_sl1e + i, sl1e, sl1mfn);
2640 #endif /* SHADOW_OPTIMIZATIONS & SHOPT_PREFETCH */
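/* Worked example of the prefetch distance computed above, with 8-byte
 * shadow l1 entries: if ptr_sl1e sits at page offset 0xf80, only
 * (0x1000 - 0xf80) / 8 = 16 entries remain before the end of the sl1
 * page, so dist is 16 even though PREFETCH_DISTANCE is 32; at offset
 * 0x200 the same formula gives 448, which is then clamped to 32. */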
2643 /**************************************************************************/
2644 /* Entry points into the shadow code */
2646 /* Called from pagefault handler in Xen, and from the HVM trap handlers
2647 * for pagefaults. Returns 1 if this fault was an artefact of the
2648 * shadow code (and the guest should retry) or 0 if it is not (and the
2649 * fault should be handled elsewhere or passed to the guest). */
2651 static int sh_page_fault(struct vcpu *v,
2652 unsigned long va,
2653 struct cpu_user_regs *regs)
2655 struct domain *d = v->domain;
2656 walk_t gw;
2657 u32 accumulated_gflags;
2658 gfn_t gfn;
2659 mfn_t gmfn, sl1mfn=_mfn(0);
2660 shadow_l1e_t sl1e, *ptr_sl1e;
2661 paddr_t gpa;
2662 struct sh_emulate_ctxt emul_ctxt;
2663 struct x86_emulate_ops *emul_ops;
2664 int r;
2665 fetch_type_t ft = 0;
2666 p2m_type_t p2mt;
2668 SHADOW_PRINTK("d:v=%u:%u va=%#lx err=%u\n",
2669 v->domain->domain_id, v->vcpu_id, va, regs->error_code);
2671 perfc_incr(shadow_fault);
2672 //
2673 // XXX: Need to think about eventually mapping superpages directly in the
2674 // shadow (when possible), as opposed to splintering them into a
2675 // bunch of 4K maps.
2676 //
2678 #if (SHADOW_OPTIMIZATIONS & SHOPT_FAST_FAULT_PATH) && SHADOW_PAGING_LEVELS > 2
2679 if ( (regs->error_code & PFEC_reserved_bit) )
2681 /* The only reason for reserved bits to be set in a shadow entry
2682 * is that it is one of the two "magic" shadow_l1e entries. */
2683 if ( likely((__copy_from_user(&sl1e,
2684 (sh_linear_l1_table(v)
2685 + shadow_l1_linear_offset(va)),
2686 sizeof(sl1e)) == 0)
2687 && sh_l1e_is_magic(sl1e)) )
2689 if ( sh_l1e_is_gnp(sl1e) )
2691 /* Not-present in a guest PT: pass to the guest as
2692 * a not-present fault (by flipping two bits). */
2693 ASSERT(regs->error_code & PFEC_page_present);
2694 regs->error_code ^= (PFEC_reserved_bit|PFEC_page_present);
2695 reset_early_unshadow(v);
2696 perfc_incr(shadow_fault_fast_gnp);
2697 SHADOW_PRINTK("fast path not-present\n");
2698 return 0;
2700 else
2702 /* Magic MMIO marker: extract gfn for MMIO address */
2703 ASSERT(sh_l1e_is_mmio(sl1e));
2704 gpa = (((paddr_t)(gfn_x(sh_l1e_mmio_get_gfn(sl1e))))
2705 << PAGE_SHIFT)
2706 | (va & ~PAGE_MASK);
2708 perfc_incr(shadow_fault_fast_mmio);
2709 SHADOW_PRINTK("fast path mmio %#"PRIpaddr"\n", gpa);
2710 reset_early_unshadow(v);
2711 handle_mmio(gpa);
2712 return EXCRET_fault_fixed;
2714 else
2716 /* This should be exceptionally rare: another vcpu has fixed
2717 * the tables between the fault and our reading the l1e.
2718 * Retry and let the hardware give us the right fault next time. */
2719 perfc_incr(shadow_fault_fast_fail);
2720 SHADOW_PRINTK("fast path false alarm!\n");
2721 return EXCRET_fault_fixed;
2724 #endif /* SHOPT_FAST_FAULT_PATH */
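/* Worked example of the error-code flip in the fast not-present path
 * above, assuming the architectural PFEC bit positions (P = bit 0,
 * RSVD = bit 3): a fault arriving with error_code 0x0b
 * (present|write|reserved) XORed with
 * (PFEC_reserved_bit|PFEC_page_present) = 0x09 becomes 0x02, i.e. a
 * plain not-present write fault as far as the guest can tell. */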
2726 /* Detect if this page fault happened while we were already in Xen
2727 * doing a shadow operation. If that happens, the only thing we can
2728 * do is let Xen's normal fault handlers try to fix it. In any case,
2729 * a diagnostic trace of the fault will be more useful than
2730 * a BUG() when we try to take the lock again. */
2731 if ( unlikely(shadow_locked_by_me(d)) )
2733 SHADOW_ERROR("Recursive shadow fault: lock was taken by %s\n",
2734 d->arch.paging.shadow.locker_function);
2735 return 0;
2738 shadow_lock(d);
2740 shadow_audit_tables(v);
2742 if ( guest_walk_tables(v, va, &gw, 1) != 0 )
2744 SHADOW_PRINTK("malformed guest pagetable\n");
2745 print_gw(&gw);
2748 /* It's possible that the guest has put pagetables in memory that it has
2749 * already used for some special purpose (ioreq pages, or granted pages).
2750 * If that happens we'll have killed the guest already but it's still not
2751 * safe to propagate entries out of the guest PT so get out now. */
2752 if ( unlikely(d->is_shutting_down) )
2754 SHADOW_PRINTK("guest is shutting down\n");
2755 shadow_unlock(d);
2756 return 0;
2759 sh_audit_gw(v, &gw);
2761 // We do not look at the gw->l1e, as that will not exist for superpages.
2762 // Instead, we use the gw->eff_l1e...
2763 //
2764 // We need not check all the levels of the guest page table entries for
2765 // present vs not-present, as the eff_l1e will always be not present if
2766 // one of the higher level entries is not present.
2767 //
2768 if ( unlikely(!(guest_l1e_get_flags(gw.eff_l1e) & _PAGE_PRESENT)) )
2770 perfc_incr(shadow_fault_bail_not_present);
2771 goto not_a_shadow_fault;
2774 // All levels of the guest page table are now known to be present.
2775 accumulated_gflags = accumulate_guest_flags(v, &gw);
2777 // Check for attempts to access supervisor-only pages from user mode,
2778 // i.e. ring 3. Such errors are not caused or dealt with by the shadow
2779 // code.
2780 //
2781 if ( (regs->error_code & PFEC_user_mode) &&
2782 !(accumulated_gflags & _PAGE_USER) )
2784 /* illegal user-mode access to supervisor-only page */
2785 perfc_incr(shadow_fault_bail_user_supervisor);
2786 goto not_a_shadow_fault;
2789 // Was it a write fault?
2790 ft = ((regs->error_code & PFEC_write_access)
2791 ? ft_demand_write : ft_demand_read);
2792 if ( ft == ft_demand_write )
2794 if ( unlikely(!(accumulated_gflags & _PAGE_RW)) )
2796 perfc_incr(shadow_fault_bail_ro_mapping);
2797 goto not_a_shadow_fault;
2800 else // must have been either an insn fetch or read fault
2802 // Check for NX bit violations: attempts to execute code that is
2803 // marked "do not execute". Such errors are not caused or dealt with
2804 // by the shadow code.
2805 //
2806 if ( regs->error_code & PFEC_insn_fetch )
2808 if ( accumulated_gflags & _PAGE_NX_BIT )
2810 /* NX prevented this code fetch */
2811 perfc_incr(shadow_fault_bail_nx);
2812 goto not_a_shadow_fault;
2817 /* What mfn is the guest trying to access? */
2818 gfn = guest_l1e_get_gfn(gw.eff_l1e);
2819 gmfn = gfn_to_mfn(d, gfn, &p2mt);
2821 if ( !p2m_is_valid(p2mt) || (!p2m_is_mmio(p2mt) && !mfn_valid(gmfn)) )
2823 perfc_incr(shadow_fault_bail_bad_gfn);
2824 SHADOW_PRINTK("BAD gfn=%"SH_PRI_gfn" gmfn=%"PRI_mfn"\n",
2825 gfn_x(gfn), mfn_x(gmfn));
2826 goto not_a_shadow_fault;
2829 /* Make sure there is enough free shadow memory to build a chain of
2830 * shadow tables: one SHADOW_MAX_ORDER chunk will always be enough
2831 * to allocate all we need. (We never allocate a top-level shadow
2832 * on this path, only a 32b l1, pae l2+1 or 64b l3+2+1) */
2833 shadow_prealloc(d, SHADOW_MAX_ORDER);
2835 /* Acquire the shadow. This must happen before we figure out the rights
2836 * for the shadow entry, since we might promote a page here. */
2837 ptr_sl1e = shadow_get_and_create_l1e(v, &gw, &sl1mfn, ft);
2838 if ( unlikely(ptr_sl1e == NULL) )
2840 /* Couldn't get the sl1e! Since we know the guest entries
2841 * are OK, this can only have been caused by a failed
2842 * shadow_set_l*e(), which will have crashed the guest.
2843 * Get out of the fault handler immediately. */
2844 ASSERT(d->is_shutting_down);
2845 unmap_walk(v, &gw);
2846 shadow_unlock(d);
2847 return 0;
2850 /* Calculate the shadow entry and write it */
2851 l1e_propagate_from_guest(v, (gw.l1e) ? gw.l1e : &gw.eff_l1e, gw.l1mfn,
2852 gmfn, &sl1e, ft, p2mt);
2853 r = shadow_set_l1e(v, ptr_sl1e, sl1e, sl1mfn);
2855 #if SHADOW_OPTIMIZATIONS & SHOPT_PREFETCH
2856 /* Prefetch some more shadow entries */
2857 sh_prefetch(v, &gw, ptr_sl1e, sl1mfn);
2858 #endif
2860 /* Need to emulate accesses to page tables */
2861 if ( sh_mfn_is_a_page_table(gmfn) )
2863 if ( ft == ft_demand_write )
2865 perfc_incr(shadow_fault_emulate_write);
2866 goto emulate;
2868 else if ( shadow_mode_trap_reads(d) && ft == ft_demand_read )
2870 perfc_incr(shadow_fault_emulate_read);
2871 goto emulate;
2875 /* Need to hand off device-model MMIO and writes to read-only
2876 * memory to the device model */
2877 if ( p2mt == p2m_mmio_dm
2878 || (p2mt == p2m_ram_ro && ft == ft_demand_write) )
2880 gpa = guest_walk_to_gpa(&gw);
2881 goto mmio;
2884 perfc_incr(shadow_fault_fixed);
2885 d->arch.paging.log_dirty.fault_count++;
2886 reset_early_unshadow(v);
2888 done:
2889 sh_audit_gw(v, &gw);
2890 unmap_walk(v, &gw);
2891 SHADOW_PRINTK("fixed\n");
2892 shadow_audit_tables(v);
2893 shadow_unlock(d);
2894 return EXCRET_fault_fixed;
2896 emulate:
2897 if ( !shadow_mode_refcounts(d) || !guest_mode(regs) )
2898 goto not_a_shadow_fault;
2900 /*
2901 * We do not emulate user writes. Instead we use them as a hint that the
2902 * page is no longer a page table. This behaviour differs from native, but
2903 * it seems very unlikely that any OS grants user access to page tables.
2904 */
2905 if ( (regs->error_code & PFEC_user_mode) )
2907 SHADOW_PRINTK("user-mode fault to PT, unshadowing mfn %#lx\n",
2908 mfn_x(gmfn));
2909 perfc_incr(shadow_fault_emulate_failed);
2910 sh_remove_shadows(v, gmfn, 0 /* thorough */, 1 /* must succeed */);
2911 goto done;
2914 if ( is_hvm_domain(d) )
2916 /*
2917 * If we are in the middle of injecting an exception or interrupt then
2918 * we should not emulate: it is not the instruction at %eip that caused
2919 * the fault. Furthermore it is almost certainly the case that the handler
2920 * stack is currently considered to be a page table, so we should
2921 * unshadow the faulting page before exiting.
2922 */
2923 if ( unlikely(hvm_event_pending(v)) )
2925 gdprintk(XENLOG_DEBUG, "write to pagetable during event "
2926 "injection: cr2=%#lx, mfn=%#lx\n",
2927 va, mfn_x(gmfn));
2928 sh_remove_shadows(v, gmfn, 0 /* thorough */, 1 /* must succeed */);
2929 goto done;
2932 hvm_store_cpu_guest_regs(v, regs, NULL);
2935 SHADOW_PRINTK("emulate: eip=%#lx esp=%#lx\n",
2936 (unsigned long)regs->eip, (unsigned long)regs->esp);
2938 /*
2939 * We don't need to hold the lock for the whole emulation; we will
2940 * take it again when we write to the pagetables.
2941 */
2942 sh_audit_gw(v, &gw);
2943 unmap_walk(v, &gw);
2944 shadow_audit_tables(v);
2945 shadow_unlock(d);
2947 emul_ops = shadow_init_emulation(&emul_ctxt, regs);
2949 r = x86_emulate(&emul_ctxt.ctxt, emul_ops);
2951 /*
2952 * NB. We do not unshadow on X86EMUL_EXCEPTION. It's not clear that it
2953 * would be a good unshadow hint. If we *do* decide to unshadow-on-fault
2954 * then it must be 'failable': we cannot require the unshadow to succeed.
2955 */
2956 if ( r == X86EMUL_UNHANDLEABLE )
2958 SHADOW_PRINTK("emulator failure, unshadowing mfn %#lx\n",
2959 mfn_x(gmfn));
2960 perfc_incr(shadow_fault_emulate_failed);
2961 /* If this is actually a page table, then we have a bug, and need
2962 * to support more operations in the emulator. More likely,
2963 * though, this is a hint that this page should not be shadowed. */
2964 shadow_remove_all_shadows(v, gmfn);
2967 #if GUEST_PAGING_LEVELS == 3 /* PAE guest */
2968 if ( r == X86EMUL_OKAY ) {
2969 int i;
2970 /* Emulate up to four extra instructions in the hope of catching
2971 * the "second half" of a 64-bit pagetable write. */
2972 for ( i = 0 ; i < 4 ; i++ )
2974 shadow_continue_emulation(&emul_ctxt, regs);
2975 v->arch.paging.last_write_was_pt = 0;
2976 r = x86_emulate(&emul_ctxt.ctxt, emul_ops);
2977 if ( r == X86EMUL_OKAY )
2979 if ( v->arch.paging.last_write_was_pt )
2981 perfc_incr(shadow_em_ex_pt);
2982 break; /* Don't emulate past the other half of the write */
2984 else
2985 perfc_incr(shadow_em_ex_non_pt);
2987 else
2989 perfc_incr(shadow_em_ex_fail);
2990 break; /* Don't emulate again if we failed! */
2994 #endif /* PAE guest */
2996 /* Emulator has changed the user registers: write back */
2997 if ( is_hvm_domain(d) )
2998 hvm_load_cpu_guest_regs(v, regs);
3000 SHADOW_PRINTK("emulated\n");
3001 return EXCRET_fault_fixed;
3003 mmio:
3004 if ( !guest_mode(regs) )
3005 goto not_a_shadow_fault;
3006 perfc_incr(shadow_fault_mmio);
3007 sh_audit_gw(v, &gw);
3008 unmap_walk(v, &gw);
3009 SHADOW_PRINTK("mmio %#"PRIpaddr"\n", gpa);
3010 shadow_audit_tables(v);
3011 reset_early_unshadow(v);
3012 shadow_unlock(d);
3013 handle_mmio(gpa);
3014 return EXCRET_fault_fixed;
3016 not_a_shadow_fault:
3017 sh_audit_gw(v, &gw);
3018 unmap_walk(v, &gw);
3019 SHADOW_PRINTK("not a shadow fault\n");
3020 shadow_audit_tables(v);
3021 reset_early_unshadow(v);
3022 shadow_unlock(d);
3023 return 0;
3027 static int
3028 sh_invlpg(struct vcpu *v, unsigned long va)
3029 /* Called when the guest requests an invlpg. Returns 1 if the invlpg
3030 * instruction should be issued on the hardware, or 0 if it's safe not
3031 * to do so. */
3033 shadow_l2e_t sl2e;
3035 perfc_incr(shadow_invlpg);
3037 #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
3038 /* No longer safe to use cached gva->gfn translations */
3039 vtlb_flush(v);
3040 #endif
3042 /* First check that we can safely read the shadow l2e. On SMP/PAE linux,
3043 * as many as 6% of invlpg calls can arrive before we have shadowed
3044 * the l2 at all. */
3045 #if SHADOW_PAGING_LEVELS == 4
3047 shadow_l3e_t sl3e;
3048 if ( !(shadow_l4e_get_flags(
3049 sh_linear_l4_table(v)[shadow_l4_linear_offset(va)])
3050 & _PAGE_PRESENT) )
3051 return 0;
3052 /* This must still be a copy-from-user because we don't have the
3053 * shadow lock, and the higher-level shadows might disappear
3054 * under our feet. */
3055 if ( __copy_from_user(&sl3e, (sh_linear_l3_table(v)
3056 + shadow_l3_linear_offset(va)),
3057 sizeof (sl3e)) != 0 )
3059 perfc_incr(shadow_invlpg_fault);
3060 return 0;
3062 if ( !(shadow_l3e_get_flags(sl3e) & _PAGE_PRESENT) )
3063 return 0;
3065 #elif SHADOW_PAGING_LEVELS == 3
3066 if ( !(l3e_get_flags(v->arch.paging.shadow.l3table[shadow_l3_linear_offset(va)])
3067 & _PAGE_PRESENT) )
3068 // no need to flush anything if there's no SL2...
3069 return 0;
3070 #endif
3072 /* This must still be a copy-from-user because we don't have the shadow
3073 * lock, and the higher-level shadows might disappear under our feet. */
3074 if ( __copy_from_user(&sl2e,
3075 sh_linear_l2_table(v) + shadow_l2_linear_offset(va),
3076 sizeof (sl2e)) != 0 )
3078 perfc_incr(shadow_invlpg_fault);
3079 return 0;
3082 // If there's nothing shadowed for this particular sl2e, then
3083 // there is no need to do an invlpg, either...
3084 //
3085 if ( !(shadow_l2e_get_flags(sl2e) & _PAGE_PRESENT) )
3086 return 0;
3088 // Check to see if the SL2 is a splintered superpage...
3089 // If so, then we'll need to flush the entire TLB (because that's
3090 // easier than invalidating all of the individual 4K pages).
3091 //
3092 if ( mfn_to_shadow_page(shadow_l2e_get_mfn(sl2e))->type
3093 == SH_type_fl1_shadow )
3095 local_flush_tlb();
3096 return 0;
3099 return 1;
3103 static unsigned long
3104 sh_gva_to_gfn(struct vcpu *v, unsigned long va)
3105 /* Called to translate a guest virtual address to what the *guest*
3106 * pagetables would map it to. */
3108 walk_t gw;
3109 gfn_t gfn;
3111 #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
3112 struct shadow_vtlb t = {0};
3113 if ( vtlb_lookup(v, va, &t) )
3114 return t.frame_number;
3115 #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */
3117 guest_walk_tables(v, va, &gw, 0);
3118 gfn = guest_walk_to_gfn(&gw);
3120 #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
3121 t.page_number = va >> PAGE_SHIFT;
3122 t.frame_number = gfn_x(gfn);
3123 t.flags = accumulate_guest_flags(v, &gw);
3124 vtlb_insert(v, t);
3125 #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */
3127 unmap_walk(v, &gw);
3128 return gfn_x(gfn);
3132 static inline void
3133 sh_update_linear_entries(struct vcpu *v)
3134 /* Sync up all the linear mappings for this vcpu's pagetables */
3136 struct domain *d = v->domain;
3138 /* Linear pagetables in PV guests
3139 * ------------------------------
3141 * Guest linear pagetables, which map the guest pages, are at
3142 * LINEAR_PT_VIRT_START. Shadow linear pagetables, which map the
3143 * shadows, are at SH_LINEAR_PT_VIRT_START. Most of the time these
3144 * are set up at shadow creation time, but (of course!) the PAE case
3145 * is subtler. Normal linear mappings are made by having an entry
3146 * in the top-level table that points to itself (shadow linear) or
3147 * to the guest top-level table (guest linear). For PAE, to set up
3148 * a linear map requires us to copy the four top-level entries into
3149 * level-2 entries. That means that every time we change a PAE l3e,
3150 * we need to reflect the change into the copy.
3152 * Linear pagetables in HVM guests
3153 * -------------------------------
3155 * For HVM guests, the linear pagetables are installed in the monitor
3156 * tables (since we can't put them in the shadow). Shadow linear
3157 * pagetables, which map the shadows, are at SH_LINEAR_PT_VIRT_START,
3158 * and we use the linear pagetable slot at LINEAR_PT_VIRT_START for
3159 * a linear pagetable of the monitor tables themselves. We have
3160 * the same issue of having to re-copy PAE l3 entries whenever we use
3161 * PAE shadows.
3163 * Because HVM guests run on the same monitor tables regardless of the
3164 * shadow tables in use, the linear mapping of the shadow tables has to
3165 * be updated every time v->arch.shadow_table changes.
3166 */
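/* A worked example of what the shadow linear map buys us, assuming the
 * usual definitions of sh_linear_l1_table() and shadow_l1_linear_offset()
 * (va >> PAGE_SHIFT): once the slot at SH_LINEAR_PT_VIRT_START points at
 * the current shadow top level, the shadow l1e covering a linear address
 * va can be read directly at
 *   SH_LINEAR_PT_VIRT_START + (va >> PAGE_SHIFT) * sizeof(shadow_l1e_t)
 * which is exactly how sh_page_fault() and sh_invlpg() above reach shadow
 * entries without mapping each shadow page by hand. */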
3168 /* Don't try to update the monitor table if it doesn't exist */
3169 if ( shadow_mode_external(d)
3170 && pagetable_get_pfn(v->arch.monitor_table) == 0 )
3171 return;
3173 #if (CONFIG_PAGING_LEVELS == 4) && (SHADOW_PAGING_LEVELS == 4)
3175 /* For PV, one l4e points at the guest l4, one points at the shadow
3176 * l4. No maintenance required.
3177 * For HVM, just need to update the l4e that points to the shadow l4. */
3179 if ( shadow_mode_external(d) )
3181 /* Use the linear map if we can; otherwise make a new mapping */
3182 if ( v == current )
3184 __linear_l4_table[l4_linear_offset(SH_LINEAR_PT_VIRT_START)] =
3185 l4e_from_pfn(pagetable_get_pfn(v->arch.shadow_table[0]),
3186 __PAGE_HYPERVISOR);
3188 else
3190 l4_pgentry_t *ml4e;
3191 ml4e = sh_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
3192 ml4e[l4_table_offset(SH_LINEAR_PT_VIRT_START)] =
3193 l4e_from_pfn(pagetable_get_pfn(v->arch.shadow_table[0]),
3194 __PAGE_HYPERVISOR);
3195 sh_unmap_domain_page(ml4e);
3199 #elif (CONFIG_PAGING_LEVELS == 4) && (SHADOW_PAGING_LEVELS == 3)
3201 /* PV: XXX
3203 * HVM: To give ourselves a linear map of the shadows, we need to
3204 * extend a PAE shadow to 4 levels. We do this by having a monitor
3205 * l3 in slot 0 of the monitor l4 table, and copying the PAE l3
3206 * entries into it. Then, by having the monitor l4e for shadow
3207 * pagetables also point to the monitor l4, we can use it to access
3208 * the shadows.
3209 */
3211 if ( shadow_mode_external(d) )
3213 /* Install copies of the shadow l3es into the monitor l3 table.
3214 * The monitor l3 table is hooked into slot 0 of the monitor
3215 * l4 table, so we use l3 linear indices 0 to 3 */
3216 shadow_l3e_t *sl3e;
3217 l3_pgentry_t *ml3e;
3218 mfn_t l3mfn;
3219 int i;
3221 /* Use linear mappings if we can; otherwise make new mappings */
3222 if ( v == current )
3224 ml3e = __linear_l3_table;
3225 l3mfn = _mfn(l4e_get_pfn(__linear_l4_table[0]));
3227 else
3229 l4_pgentry_t *ml4e;
3230 ml4e = sh_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
3231 ASSERT(l4e_get_flags(ml4e[0]) & _PAGE_PRESENT);
3232 l3mfn = _mfn(l4e_get_pfn(ml4e[0]));
3233 ml3e = sh_map_domain_page(l3mfn);
3234 sh_unmap_domain_page(ml4e);
3237 /* Shadow l3 tables are made up by sh_update_cr3 */
3238 sl3e = v->arch.paging.shadow.l3table;
3240 for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
3242 ml3e[i] =
3243 (shadow_l3e_get_flags(sl3e[i]) & _PAGE_PRESENT)
3244 ? l3e_from_pfn(mfn_x(shadow_l3e_get_mfn(sl3e[i])),
3245 __PAGE_HYPERVISOR)
3246 : l3e_empty();
3249 if ( v != current )
3250 sh_unmap_domain_page(ml3e);
3252 else
3253 domain_crash(d); /* XXX */
3255 #elif CONFIG_PAGING_LEVELS == 3
3257 /* PV: need to copy the guest's l3 entries into the guest-linear-map l2
3258 * entries in the shadow, and the shadow's l3 entries into the
3259 * shadow-linear-map l2 entries in the shadow. This is safe to do
3260 * because Xen does not let guests share high-slot l2 tables between l3s,
3261 * so we know we're not treading on anyone's toes.
3263 * HVM: need to copy the shadow's l3 entries into the
3264 * shadow-linear-map l2 entries in the monitor table. This is safe
3265 * because we have one monitor table for each vcpu. The monitor's
3266 * own l3es don't need to be copied because they never change.
3267 * XXX That might change if we start stuffing things into the rest
3268 * of the monitor's virtual address space.
3269 */
3271 l2_pgentry_t *l2e, new_l2e;
3272 shadow_l3e_t *guest_l3e = NULL, *shadow_l3e;
3273 int i;
3274 int unmap_l2e = 0;
3276 #if GUEST_PAGING_LEVELS == 2
3278 /* Shadow l3 tables were built by sh_update_cr3 */
3279 BUG_ON(!shadow_mode_external(d)); /* PV 2-on-3 is unsupported */
3280 shadow_l3e = (shadow_l3e_t *)&v->arch.paging.shadow.l3table;
3282 #else /* GUEST_PAGING_LEVELS == 3 */
3284 shadow_l3e = (shadow_l3e_t *)&v->arch.paging.shadow.l3table;
3285 guest_l3e = (guest_l3e_t *)&v->arch.paging.shadow.gl3e;
3287 #endif /* GUEST_PAGING_LEVELS */
3289 /* Choose where to write the entries, using linear maps if possible */
3290 if ( shadow_mode_external(d) )
3292 if ( v == current )
3294 /* From the monitor tables, it's safe to use linear maps
3295 * to update monitor l2s */
3296 l2e = __linear_l2_table + (3 * L2_PAGETABLE_ENTRIES);
3298 else
3300 /* Map the monitor table's high l2 */
3301 l3_pgentry_t *l3e;
3302 l3e = sh_map_domain_page(
3303 pagetable_get_mfn(v->arch.monitor_table));
3304 ASSERT(l3e_get_flags(l3e[3]) & _PAGE_PRESENT);
3305 l2e = sh_map_domain_page(_mfn(l3e_get_pfn(l3e[3])));
3306 unmap_l2e = 1;
3307 sh_unmap_domain_page(l3e);
3310 else
3312 /* Map the shadow table's high l2 */
3313 ASSERT(shadow_l3e_get_flags(shadow_l3e[3]) & _PAGE_PRESENT);
3314 l2e = sh_map_domain_page(shadow_l3e_get_mfn(shadow_l3e[3]));
3315 unmap_l2e = 1;
3318 /* Write linear mapping of guest (only in PV, and only when
3319 * not translated). */
3320 if ( !shadow_mode_translate(d) )
3322 for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
3324 new_l2e =
3325 ((shadow_l3e_get_flags(guest_l3e[i]) & _PAGE_PRESENT)
3326 ? l2e_from_pfn(mfn_x(shadow_l3e_get_mfn(guest_l3e[i])),
3327 __PAGE_HYPERVISOR)
3328 : l2e_empty());
3329 safe_write_entry(
3330 &l2e[l2_table_offset(LINEAR_PT_VIRT_START) + i],
3331 &new_l2e);
3335 /* Write linear mapping of shadow. */
3336 for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
3338 new_l2e = (shadow_l3e_get_flags(shadow_l3e[i]) & _PAGE_PRESENT)
3339 ? l2e_from_pfn(mfn_x(shadow_l3e_get_mfn(shadow_l3e[i])),
3340 __PAGE_HYPERVISOR)
3341 : l2e_empty();
3342 safe_write_entry(
3343 &l2e[l2_table_offset(SH_LINEAR_PT_VIRT_START) + i],
3344 &new_l2e);
3347 if ( unmap_l2e )
3348 sh_unmap_domain_page(l2e);
3351 #elif CONFIG_PAGING_LEVELS == 2
3353 /* For PV, one l2e points at the guest l2, one points at the shadow
3354 * l2. No maintenance required.
3355 * For HVM, just need to update the l2e that points to the shadow l2. */
3357 if ( shadow_mode_external(d) )
3359 /* Use the linear map if we can; otherwise make a new mapping */
3360 if ( v == current )
3362 __linear_l2_table[l2_linear_offset(SH_LINEAR_PT_VIRT_START)] =
3363 l2e_from_pfn(pagetable_get_pfn(v->arch.shadow_table[0]),
3364 __PAGE_HYPERVISOR);
3366 else
3368 l2_pgentry_t *ml2e;
3369 ml2e = sh_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
3370 ml2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
3371 l2e_from_pfn(pagetable_get_pfn(v->arch.shadow_table[0]),
3372 __PAGE_HYPERVISOR);
3373 sh_unmap_domain_page(ml2e);
3377 #else
3378 #error this should not happen
3379 #endif
3383 /* Removes vcpu->arch.paging.shadow.guest_vtable and vcpu->arch.shadow_table[].
3384 * Does all appropriate management/bookkeeping/refcounting/etc...
3385 */
3386 static void
3387 sh_detach_old_tables(struct vcpu *v)
3389 mfn_t smfn;
3390 int i = 0;
3392 ////
3393 //// vcpu->arch.paging.shadow.guest_vtable
3394 ////
3396 #if GUEST_PAGING_LEVELS == 3
3397 /* PAE guests don't have a mapping of the guest top-level table */
3398 ASSERT(v->arch.paging.shadow.guest_vtable == NULL);
3399 #else
3400 if ( v->arch.paging.shadow.guest_vtable )
3402 struct domain *d = v->domain;
3403 if ( shadow_mode_external(d) || shadow_mode_translate(d) )
3404 sh_unmap_domain_page_global(v->arch.paging.shadow.guest_vtable);
3405 v->arch.paging.shadow.guest_vtable = NULL;
3407 #endif
3410 ////
3411 //// vcpu->arch.shadow_table[]
3412 ////
3414 #if GUEST_PAGING_LEVELS == 3
3415 /* PAE guests have four shadow_table entries */
3416 for ( i = 0 ; i < 4 ; i++ )
3417 #endif
3419 smfn = pagetable_get_mfn(v->arch.shadow_table[i]);
3420 if ( mfn_x(smfn) )
3421 sh_put_ref(v, smfn, 0);
3422 v->arch.shadow_table[i] = pagetable_null();
3426 /* Set up the top-level shadow and install it in slot 'slot' of shadow_table */
3427 static void
3428 sh_set_toplevel_shadow(struct vcpu *v,
3429 int slot,
3430 mfn_t gmfn,
3431 unsigned int root_type)
3433 mfn_t smfn;
3434 pagetable_t old_entry, new_entry;
3436 struct domain *d = v->domain;
3438 /* Remember the old contents of this slot */
3439 old_entry = v->arch.shadow_table[slot];
3441 /* Now figure out the new contents: is this a valid guest MFN? */
3442 if ( !mfn_valid(gmfn) )
3444 new_entry = pagetable_null();
3445 goto install_new_entry;
3448 /* Guest mfn is valid: shadow it and install the shadow */
3449 smfn = get_shadow_status(v, gmfn, root_type);
3450 if ( !mfn_valid(smfn) )
3452 /* Make sure there's enough free shadow memory. */
3453 shadow_prealloc(d, SHADOW_MAX_ORDER);
3454 /* Shadow the page. */
3455 smfn = sh_make_shadow(v, gmfn, root_type);
3457 ASSERT(mfn_valid(smfn));
3459 #if SHADOW_OPTIMIZATIONS & SHOPT_EARLY_UNSHADOW
3460 /* Once again OK to unhook entries from this table if we see fork/exit */
3461 ASSERT(sh_mfn_is_a_page_table(gmfn));
3462 mfn_to_page(gmfn)->shadow_flags &= ~SHF_unhooked_mappings;
3463 #endif
3465 /* Pin the shadow and put it (back) on the list of pinned shadows */
3466 if ( sh_pin(v, smfn) == 0 )
3468 SHADOW_ERROR("can't pin %#lx as toplevel shadow\n", mfn_x(smfn));
3469 domain_crash(v->domain);
3472 /* Take a ref to this page: it will be released in sh_detach_old_tables()
3473 * or the next call to set_toplevel_shadow() */
3474 if ( !sh_get_ref(v, smfn, 0) )
3476 SHADOW_ERROR("can't install %#lx as toplevel shadow\n", mfn_x(smfn));
3477 domain_crash(v->domain);
3480 new_entry = pagetable_from_mfn(smfn);
3482 install_new_entry:
3483 /* Done. Install it */
3484 SHADOW_PRINTK("%u/%u [%u] gmfn %#"PRI_mfn" smfn %#"PRI_mfn"\n",
3485 GUEST_PAGING_LEVELS, SHADOW_PAGING_LEVELS, slot,
3486 mfn_x(gmfn), mfn_x(pagetable_get_mfn(new_entry)));
3487 v->arch.shadow_table[slot] = new_entry;
3489 /* Decrement the refcount of the old contents of this slot */
3490 if ( !pagetable_is_null(old_entry) )
3491 sh_put_ref(v, pagetable_get_mfn(old_entry), 0);
3495 static void
3496 sh_update_cr3(struct vcpu *v, int do_locking)
3497 /* Updates vcpu->arch.cr3 after the guest has changed CR3.
3498 * Paravirtual guests should set v->arch.guest_table (and guest_table_user,
3499 * if appropriate).
3500 * HVM guests should also make sure hvm_get_guest_cntl_reg(v, 3) works;
3501 * this function will call hvm_update_guest_cr(v, 3) to tell them where the
3502 * shadow tables are.
3503 * If do_locking != 0, assume we are being called from outside the
3504 * shadow code, and must take and release the shadow lock; otherwise
3505 * that is the caller's responsibility.
3506 */
3508 struct domain *d = v->domain;
3509 mfn_t gmfn;
3510 #if GUEST_PAGING_LEVELS == 3
3511 guest_l3e_t *gl3e;
3512 u32 guest_idx=0;
3513 int i;
3514 #endif
3516 /* Don't do anything on an uninitialised vcpu */
3517 if ( !is_hvm_domain(d) && !v->is_initialised )
3519 ASSERT(v->arch.cr3 == 0);
3520 return;
3523 if ( do_locking ) shadow_lock(v->domain);
3525 ASSERT(shadow_locked_by_me(v->domain));
3526 ASSERT(v->arch.paging.mode);
3528 ////
3529 //// vcpu->arch.guest_table is already set
3530 ////
3532 #ifndef NDEBUG
3533 /* Double-check that the HVM code has sent us a sane guest_table */
3534 if ( is_hvm_domain(d) )
3536 ASSERT(shadow_mode_external(d));
3537 if ( hvm_paging_enabled(v) )
3538 ASSERT(pagetable_get_pfn(v->arch.guest_table));
3539 else
3540 ASSERT(v->arch.guest_table.pfn
3541 == d->arch.paging.shadow.unpaged_pagetable.pfn);
3543 #endif
3545 SHADOW_PRINTK("d=%u v=%u guest_table=%05lx\n",
3546 d->domain_id, v->vcpu_id,
3547 (unsigned long)pagetable_get_pfn(v->arch.guest_table));
3549 #if GUEST_PAGING_LEVELS == 4
3550 if ( !(v->arch.flags & TF_kernel_mode) && !is_pv_32on64_vcpu(v) )
3551 gmfn = pagetable_get_mfn(v->arch.guest_table_user);
3552 else
3553 #endif
3554 gmfn = pagetable_get_mfn(v->arch.guest_table);
3557 ////
3558 //// vcpu->arch.paging.shadow.guest_vtable
3559 ////
3560 #if GUEST_PAGING_LEVELS == 4
3561 if ( shadow_mode_external(d) || shadow_mode_translate(d) )
3563 if ( v->arch.paging.shadow.guest_vtable )
3564 sh_unmap_domain_page_global(v->arch.paging.shadow.guest_vtable);
3565 v->arch.paging.shadow.guest_vtable = sh_map_domain_page_global(gmfn);
3566 /* PAGING_LEVELS==4 implies 64-bit, which means that
3567 * map_domain_page_global can't fail */
3568 BUG_ON(v->arch.paging.shadow.guest_vtable == NULL);
3570 else
3571 v->arch.paging.shadow.guest_vtable = __linear_l4_table;
3572 #elif GUEST_PAGING_LEVELS == 3
3573 /* On PAE guests we don't use a mapping of the guest's own top-level
3574 * table. We cache the current state of that table and shadow that,
3575 * until the next CR3 write makes us refresh our cache. */
3576 ASSERT(v->arch.paging.shadow.guest_vtable == NULL);
3578 if ( shadow_mode_external(d) )
3579 /* Find where in the page the l3 table is */
3580 guest_idx = guest_index((void *)v->arch.hvm_vcpu.guest_cr[3]);
3581 else
3582 /* PV guest: l3 is at the start of a page */
3583 guest_idx = 0;
3585 // Ignore the low 2 bits of guest_idx -- they are really just
3586 // cache control.
3587 guest_idx &= ~3;
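    // (For a PAE guest, CR3 bits 3 and 4 are PWT and PCD; after the
    // division by the entry size they land in the low two bits of
    // guest_idx, which is why they are masked off here.)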
3589 gl3e = ((guest_l3e_t *)sh_map_domain_page(gmfn)) + guest_idx;
3590 for ( i = 0; i < 4 ; i++ )
3591 v->arch.paging.shadow.gl3e[i] = gl3e[i];
3592 sh_unmap_domain_page(gl3e);
3593 #elif GUEST_PAGING_LEVELS == 2
3594 if ( shadow_mode_external(d) || shadow_mode_translate(d) )
3596 if ( v->arch.paging.shadow.guest_vtable )
3597 sh_unmap_domain_page_global(v->arch.paging.shadow.guest_vtable);
3598 v->arch.paging.shadow.guest_vtable = sh_map_domain_page_global(gmfn);
3599 /* Does this really need map_domain_page_global? Handle the
3600 * error properly if so. */
3601 BUG_ON(v->arch.paging.shadow.guest_vtable == NULL); /* XXX */
3603 else
3604 v->arch.paging.shadow.guest_vtable = __linear_l2_table;
3605 #else
3606 #error this should never happen
3607 #endif
3609 #if 0
3610 printk("%s %s %d gmfn=%05lx shadow.guest_vtable=%p\n",
3611 __func__, __FILE__, __LINE__, gmfn, v->arch.paging.shadow.guest_vtable);
3612 #endif
3614 ////
3615 //// vcpu->arch.shadow_table[]
3616 ////
3618 /* We revoke write access to the new guest toplevel page(s) before we
3619 * replace the old shadow pagetable(s), so that we can safely use the
3620 * (old) shadow linear maps in the writeable mapping heuristics. */
3621 #if GUEST_PAGING_LEVELS == 2
3622 if ( sh_remove_write_access(v, gmfn, 2, 0) != 0 )
3623 flush_tlb_mask(v->domain->domain_dirty_cpumask);
3624 sh_set_toplevel_shadow(v, 0, gmfn, SH_type_l2_shadow);
3625 #elif GUEST_PAGING_LEVELS == 3
3626 /* PAE guests have four shadow_table entries, based on the
3627 * current values of the guest's four l3es. */
3629 int flush = 0;
3630 gfn_t gl2gfn;
3631 mfn_t gl2mfn;
3632 p2m_type_t p2mt;
3633 guest_l3e_t *gl3e = (guest_l3e_t*)&v->arch.paging.shadow.gl3e;
3634 /* First, make all four entries read-only. */
3635 for ( i = 0; i < 4; i++ )
3637 if ( guest_l3e_get_flags(gl3e[i]) & _PAGE_PRESENT )
3639 gl2gfn = guest_l3e_get_gfn(gl3e[i]);
3640 gl2mfn = gfn_to_mfn(d, gl2gfn, &p2mt);
3641 if ( p2m_is_ram(p2mt) )
3642 flush |= sh_remove_write_access(v, gl2mfn, 2, 0);
3645 if ( flush )
3646 flush_tlb_mask(v->domain->domain_dirty_cpumask);
3647 /* Now install the new shadows. */
3648 for ( i = 0; i < 4; i++ )
3650 if ( guest_l3e_get_flags(gl3e[i]) & _PAGE_PRESENT )
3652 gl2gfn = guest_l3e_get_gfn(gl3e[i]);
3653 gl2mfn = gfn_to_mfn(d, gl2gfn, &p2mt);
3654 if ( p2m_is_ram(p2mt) )
3655 sh_set_toplevel_shadow(v, i, gl2mfn, (i == 3)
3656 ? SH_type_l2h_shadow
3657 : SH_type_l2_shadow);
3658 else
3659 sh_set_toplevel_shadow(v, i, _mfn(INVALID_MFN), 0);
3661 else
3662 sh_set_toplevel_shadow(v, i, _mfn(INVALID_MFN), 0);
3665 #elif GUEST_PAGING_LEVELS == 4
3666 if ( sh_remove_write_access(v, gmfn, 4, 0) != 0 )
3667 flush_tlb_mask(v->domain->domain_dirty_cpumask);
3668 sh_set_toplevel_shadow(v, 0, gmfn, SH_type_l4_shadow);
3669 #else
3670 #error This should never happen
3671 #endif
3673 #if (CONFIG_PAGING_LEVELS == 3) && (GUEST_PAGING_LEVELS == 3)
3674 #endif
3676 ///
3677 /// v->arch.paging.shadow.l3table
3678 ///
3679 #if SHADOW_PAGING_LEVELS == 3
3681 mfn_t smfn;
3682 int i;
3683 for ( i = 0; i < 4; i++ )
3685 #if GUEST_PAGING_LEVELS == 2
3686 /* 2-on-3: make a PAE l3 that points at the four-page l2 */
3687 smfn = _mfn(pagetable_get_pfn(v->arch.shadow_table[0]) + i);
3688 #else
3689 /* 3-on-3: make a PAE l3 that points at the four l2 pages */
3690 smfn = pagetable_get_mfn(v->arch.shadow_table[i]);
3691 #endif
3692 v->arch.paging.shadow.l3table[i] =
3693 (mfn_x(smfn) == 0)
3694 ? shadow_l3e_empty()
3695 : shadow_l3e_from_mfn(smfn, _PAGE_PRESENT);
3698 #endif /* SHADOW_PAGING_LEVELS == 3 */
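    /* Illustrative note (not part of the original file): in the 2-on-3
     * case all four l3 slots point at consecutive pages of the single
     * 4-page l2 shadow installed in shadow_table[0]; in the 3-on-3 case
     * each slot mirrors its own shadow_table[i] entry, and unused slots
     * become empty l3es. */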
3701 ///
3702 /// v->arch.cr3
3703 ///
3704 if ( shadow_mode_external(d) )
3706 make_cr3(v, pagetable_get_pfn(v->arch.monitor_table));
3708 else // not shadow_mode_external...
3710 /* We don't support PV except guest == shadow == config levels */
3711 BUG_ON(GUEST_PAGING_LEVELS != SHADOW_PAGING_LEVELS);
3712 #if SHADOW_PAGING_LEVELS == 3
3713 /* 2-on-3 or 3-on-3: Use the PAE shadow l3 table we just fabricated.
3714 * Don't use make_cr3 because (a) we know it's below 4GB, and
3715 * (b) it's not necessarily page-aligned, and make_cr3 takes a pfn */
3716 ASSERT(virt_to_maddr(&v->arch.paging.shadow.l3table) <= 0xffffffe0ULL);
3717 v->arch.cr3 = virt_to_maddr(&v->arch.paging.shadow.l3table);
3718 #else
3719 /* 2-on-2 or 4-on-4: Just use the shadow top-level directly */
3720 make_cr3(v, pagetable_get_pfn(v->arch.shadow_table[0]));
3721 #endif
3725 ///
3726 /// v->arch.hvm_vcpu.hw_cr[3]
3727 ///
3728 if ( shadow_mode_external(d) )
3730 ASSERT(is_hvm_domain(d));
3731 #if SHADOW_PAGING_LEVELS == 3
3732 /* 2-on-3 or 3-on-3: Use the PAE shadow l3 table we just fabricated */
3733 v->arch.hvm_vcpu.hw_cr[3] =
3734 virt_to_maddr(&v->arch.paging.shadow.l3table);
3735 #else
3736 /* 2-on-2 or 4-on-4: Just use the shadow top-level directly */
3737 v->arch.hvm_vcpu.hw_cr[3] =
3738 pagetable_get_paddr(v->arch.shadow_table[0]);
3739 #endif
3740 hvm_update_guest_cr(v, 3);
3743 /* Fix up the linear pagetable mappings */
3744 sh_update_linear_entries(v);
3746 #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
3747 /* No longer safe to use cached gva->gfn translations */
3748 vtlb_flush(v);
3749 #endif
3751 /* Release the lock, if we took it (otherwise it's the caller's problem) */
3752 if ( do_locking ) shadow_unlock(v->domain);
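/* Illustrative sketch (not part of the original file): the do_locking
 * convention described in the comment at the top of this function: */
#if 0
    sh_update_cr3(v, 1);   /* caller outside the shadow code: take the lock here */
    sh_update_cr3(v, 0);   /* shadow-internal caller: lock already held          */
#endif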
3756 /**************************************************************************/
3757 /* Functions to revoke guest rights */
3759 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
3760 static int sh_guess_wrmap(struct vcpu *v, unsigned long vaddr, mfn_t gmfn)
3761 /* Look up this vaddr in the current shadow and see if it's a writeable
3762 * mapping of this gmfn. If so, remove it. Returns 1 if it worked. */
3764 shadow_l1e_t sl1e, *sl1p;
3765 shadow_l2e_t *sl2p;
3766 #if SHADOW_PAGING_LEVELS >= 3
3767 shadow_l3e_t *sl3p;
3768 #if SHADOW_PAGING_LEVELS >= 4
3769 shadow_l4e_t *sl4p;
3770 #endif
3771 #endif
3772 mfn_t sl1mfn;
3773 int r;
3775 /* Carefully look in the shadow linear map for the l1e we expect */
3776 #if SHADOW_PAGING_LEVELS >= 4
3777 sl4p = sh_linear_l4_table(v) + shadow_l4_linear_offset(vaddr);
3778 if ( !(shadow_l4e_get_flags(*sl4p) & _PAGE_PRESENT) )
3779 return 0;
3780 sl3p = sh_linear_l3_table(v) + shadow_l3_linear_offset(vaddr);
3781 if ( !(shadow_l3e_get_flags(*sl3p) & _PAGE_PRESENT) )
3782 return 0;
3783 #elif SHADOW_PAGING_LEVELS == 3
3784 sl3p = ((shadow_l3e_t *) v->arch.paging.shadow.l3table)
3785 + shadow_l3_linear_offset(vaddr);
3786 if ( !(shadow_l3e_get_flags(*sl3p) & _PAGE_PRESENT) )
3787 return 0;
3788 #endif
3789 sl2p = sh_linear_l2_table(v) + shadow_l2_linear_offset(vaddr);
3790 if ( !(shadow_l2e_get_flags(*sl2p) & _PAGE_PRESENT) )
3791 return 0;
3792 sl1p = sh_linear_l1_table(v) + shadow_l1_linear_offset(vaddr);
3793 sl1e = *sl1p;
3794 if ( ((shadow_l1e_get_flags(sl1e) & (_PAGE_PRESENT|_PAGE_RW))
3795 != (_PAGE_PRESENT|_PAGE_RW))
3796 || (mfn_x(shadow_l1e_get_mfn(sl1e)) != mfn_x(gmfn)) )
3797 return 0;
3799 /* Found it! Need to remove its write permissions. */
3800 sl1mfn = shadow_l2e_get_mfn(*sl2p);
3801 sl1e = shadow_l1e_remove_flags(sl1e, _PAGE_RW);
3802 r = shadow_set_l1e(v, sl1p, sl1e, sl1mfn);
3803 ASSERT( !(r & SHADOW_SET_ERROR) );
3804 return 1;
3806 #endif
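/* Illustrative sketch (not part of the original file): under
 * SHOPT_WRITABLE_HEURISTIC the common shadow code hands sh_guess_wrmap()
 * a linear address it suspects maps gmfn writeably; a hit here can let it
 * avoid a full brute-force search over all shadows.  The variable name
 * below is hypothetical. */
#if 0
    if ( sh_guess_wrmap(v, guessed_vaddr, gmfn) )
        /* found and downgraded a writeable l1e: nothing further to do */;
#endif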
3808 int sh_rm_write_access_from_l1(struct vcpu *v, mfn_t sl1mfn,
3809 mfn_t readonly_mfn)
3810 /* Excises all writeable mappings to readonly_mfn from this l1 shadow table */
3812 shadow_l1e_t *sl1e;
3813 int done = 0;
3814 int flags;
3815 mfn_t base_sl1mfn = sl1mfn; /* Because sl1mfn changes in the foreach */
3817 SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, done,
3819 flags = shadow_l1e_get_flags(*sl1e);
3820 if ( (flags & _PAGE_PRESENT)
3821 && (flags & _PAGE_RW)
3822 && (mfn_x(shadow_l1e_get_mfn(*sl1e)) == mfn_x(readonly_mfn)) )
3824 shadow_l1e_t ro_sl1e = shadow_l1e_remove_flags(*sl1e, _PAGE_RW);
3825 (void) shadow_set_l1e(v, sl1e, ro_sl1e, sl1mfn);
3826 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
3827 /* Remember the last shadow that we shot a writeable mapping in */
3828 v->arch.paging.shadow.last_writeable_pte_smfn = mfn_x(base_sl1mfn);
3829 #endif
3830 if ( (mfn_to_page(readonly_mfn)->u.inuse.type_info
3831 & PGT_count_mask) == 0 )
3832 /* This breaks us cleanly out of the FOREACH macro */
3833 done = 1;
3835 });
3836 return done;
3840 int sh_rm_mappings_from_l1(struct vcpu *v, mfn_t sl1mfn, mfn_t target_mfn)
3841 /* Excises all mappings to guest frame from this shadow l1 table */
3843 shadow_l1e_t *sl1e;
3844 int done = 0;
3845 int flags;
3847 SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, done,
3849 flags = shadow_l1e_get_flags(*sl1e);
3850 if ( (flags & _PAGE_PRESENT)
3851 && (mfn_x(shadow_l1e_get_mfn(*sl1e)) == mfn_x(target_mfn)) )
3853 (void) shadow_set_l1e(v, sl1e, shadow_l1e_empty(), sl1mfn);
3854 if ( (mfn_to_page(target_mfn)->count_info & PGC_count_mask) == 0 )
3855 /* This breaks us cleanly out of the FOREACH macro */
3856 done = 1;
3858 });
3859 return done;
3862 /**************************************************************************/
3863 /* Functions to excise all pointers to shadows from higher-level shadows. */
3865 void sh_clear_shadow_entry(struct vcpu *v, void *ep, mfn_t smfn)
3866 /* Blank out a single shadow entry */
3868 switch ( mfn_to_shadow_page(smfn)->type )
3870 case SH_type_l1_shadow:
3871 (void) shadow_set_l1e(v, ep, shadow_l1e_empty(), smfn); break;
3872 case SH_type_l2_shadow:
3873 #if GUEST_PAGING_LEVELS >= 3
3874 case SH_type_l2h_shadow:
3875 #endif
3876 (void) shadow_set_l2e(v, ep, shadow_l2e_empty(), smfn); break;
3877 #if GUEST_PAGING_LEVELS >= 4
3878 case SH_type_l3_shadow:
3879 (void) shadow_set_l3e(v, ep, shadow_l3e_empty(), smfn); break;
3880 case SH_type_l4_shadow:
3881 (void) shadow_set_l4e(v, ep, shadow_l4e_empty(), smfn); break;
3882 #endif
3883 default: BUG(); /* Called with the wrong kind of shadow. */
3887 int sh_remove_l1_shadow(struct vcpu *v, mfn_t sl2mfn, mfn_t sl1mfn)
3888 /* Remove all mappings of this l1 shadow from this l2 shadow */
3890 shadow_l2e_t *sl2e;
3891 int done = 0;
3892 int flags;
3894 SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, done, v->domain,
3896 flags = shadow_l2e_get_flags(*sl2e);
3897 if ( (flags & _PAGE_PRESENT)
3898 && (mfn_x(shadow_l2e_get_mfn(*sl2e)) == mfn_x(sl1mfn)) )
3900 (void) shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn);
3901 if ( mfn_to_shadow_page(sl1mfn)->type == 0 )
3902 /* This breaks us cleanly out of the FOREACH macro */
3903 done = 1;
3905 });
3906 return done;
3909 #if GUEST_PAGING_LEVELS >= 4
3910 int sh_remove_l2_shadow(struct vcpu *v, mfn_t sl3mfn, mfn_t sl2mfn)
3911 /* Remove all mappings of this l2 shadow from this l3 shadow */
3913 shadow_l3e_t *sl3e;
3914 int done = 0;
3915 int flags;
3917 SHADOW_FOREACH_L3E(sl3mfn, sl3e, 0, done,
3919 flags = shadow_l3e_get_flags(*sl3e);
3920 if ( (flags & _PAGE_PRESENT)
3921 && (mfn_x(shadow_l3e_get_mfn(*sl3e)) == mfn_x(sl2mfn)) )
3923 (void) shadow_set_l3e(v, sl3e, shadow_l3e_empty(), sl3mfn);
3924 if ( mfn_to_shadow_page(sl2mfn)->type == 0 )
3925 /* This breaks us cleanly out of the FOREACH macro */
3926 done = 1;
3928 });
3929 return done;
3932 int sh_remove_l3_shadow(struct vcpu *v, mfn_t sl4mfn, mfn_t sl3mfn)
3933 /* Remove all mappings of this l3 shadow from this l4 shadow */
3935 shadow_l4e_t *sl4e;
3936 int done = 0;
3937 int flags;
3939 SHADOW_FOREACH_L4E(sl4mfn, sl4e, 0, done, v->domain,
3941 flags = shadow_l4e_get_flags(*sl4e);
3942 if ( (flags & _PAGE_PRESENT)
3943 && (mfn_x(shadow_l4e_get_mfn(*sl4e)) == mfn_x(sl3mfn)) )
3945 (void) shadow_set_l4e(v, sl4e, shadow_l4e_empty(), sl4mfn);
3946 if ( mfn_to_shadow_page(sl3mfn)->type == 0 )
3947 /* This breaks us cleanly out of the FOREACH macro */
3948 done = 1;
3950 });
3951 return done;
3953 #endif /* 64bit guest */
3955 /**************************************************************************/
3956 /* Handling HVM guest writes to pagetables */
3958 /* Check that the user is allowed to perform this write.
3959 * Returns a mapped pointer to write to, and the mfn it's on,
3960 * or NULL for error. */
3961 static inline void * emulate_map_dest(struct vcpu *v,
3962 unsigned long vaddr,
3963 struct sh_emulate_ctxt *sh_ctxt,
3964 mfn_t *mfnp)
3966 walk_t gw;
3967 u32 flags, errcode;
3968 gfn_t gfn;
3969 mfn_t mfn;
3970 p2m_type_t p2mt;
3972 /* We don't emulate user-mode writes to page tables */
3973 if ( ring_3(sh_ctxt->ctxt.regs) )
3974 return NULL;
3976 #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
3977 /* Try the virtual TLB first */
3979 struct shadow_vtlb t = {0};
3980 if ( vtlb_lookup(v, vaddr, &t)
3981 && ((t.flags & (_PAGE_PRESENT|_PAGE_RW))
3982 == (_PAGE_PRESENT|_PAGE_RW)) )
3984 flags = t.flags;
3985 gfn = _gfn(t.frame_number);
3987 else
3989 /* Need to do the full lookup, just in case permissions
3990 * have increased since we cached this entry */
3992 #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */
3994 /* Walk the guest pagetables */
3995 guest_walk_tables(v, vaddr, &gw, 1);
3996 flags = accumulate_guest_flags(v, &gw);
3997 gfn = guest_l1e_get_gfn(gw.eff_l1e);
3998 sh_audit_gw(v, &gw);
3999 unmap_walk(v, &gw);
4001 #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
4002 /* Remember this translation for next time */
4003 t.page_number = vaddr >> PAGE_SHIFT;
4004 t.frame_number = gfn_x(gfn);
4005 t.flags = flags;
4006 vtlb_insert(v, t);
4009 #endif
4011 errcode = PFEC_write_access;
4012 if ( !(flags & _PAGE_PRESENT) )
4013 goto page_fault;
4015 errcode |= PFEC_page_present;
4016 if ( !(flags & _PAGE_RW) )
4017 goto page_fault;
4019 mfn = gfn_to_mfn(v->domain, gfn, &p2mt);
4020 if ( p2m_is_ram(p2mt) )
4022 ASSERT(mfn_valid(mfn));
4023 *mfnp = mfn;
4024 v->arch.paging.last_write_was_pt = !!sh_mfn_is_a_page_table(mfn);
4025 return sh_map_domain_page(mfn) + (vaddr & ~PAGE_MASK);
4027 else
4028 return NULL;
4030 page_fault:
4031 if ( is_hvm_vcpu(v) )
4032 hvm_inject_exception(TRAP_page_fault, errcode, vaddr);
4033 else
4034 propagate_page_fault(vaddr, errcode);
4035 return NULL;
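/* Illustrative sketch (not part of the original file): the contract the
 * emulation entry points below rely on -- a non-NULL return is a mapped
 * pointer that must be released with sh_unmap_domain_page(), and a NULL
 * return means the write was refused (with a page fault injected where
 * appropriate): */
#if 0
    addr = emulate_map_dest(v, vaddr, sh_ctxt, &mfn);
    if ( addr == NULL )
        return X86EMUL_EXCEPTION;
    /* ... perform the write through addr, validate, mark dirty ... */
    sh_unmap_domain_page(addr);
#endif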
4038 static int safe_not_to_verify_write(mfn_t gmfn, void *dst, void *src,
4039 int bytes)
4041 #if (SHADOW_OPTIMIZATIONS & SHOPT_SKIP_VERIFY)
4042 struct page_info *pg = mfn_to_page(gmfn);
4043 if ( !(pg->shadow_flags & SHF_32)
4044 && ((unsigned long)dst & 7) == 0 )
4046 /* Not shadowed 32-bit: aligned 64-bit writes that leave the
4047 * present bit unset are safe to ignore. */
4048 if ( (*(u64*)src & _PAGE_PRESENT) == 0
4049 && (*(u64*)dst & _PAGE_PRESENT) == 0 )
4050 return 1;
4052 else if ( !(pg->shadow_flags & (SHF_PAE|SHF_64))
4053 && ((unsigned long)dst & 3) == 0 )
4055 /* Not shadowed PAE/64-bit: aligned 32-bit writes that leave the
4056 * present bit unset are safe to ignore. */
4057 if ( (*(u32*)src & _PAGE_PRESENT) == 0
4058 && (*(u32*)dst & _PAGE_PRESENT) == 0 )
4059 return 1;
4061 #endif
4062 return 0;
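/* Illustrative example (not part of the original file; variable names are
 * hypothetical): the case this optimisation targets is a guest clearing
 * already-empty ptes, e.g. zeroing a fresh pagetable page.  Each aligned
 * write leaves the present bit clear in both the old and new values, so
 * no shadow entry can be affected and revalidation is skipped. */
#if 0
    u64 zero = 0;
    /* returns 1 when gmfn is not shadowed as 32-bit, dst is 8-byte
     * aligned and the value being overwritten was also not present */
    skip = safe_not_to_verify_write(gmfn, dst, &zero, 8);
#endif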
4066 int
4067 sh_x86_emulate_write(struct vcpu *v, unsigned long vaddr, void *src,
4068 u32 bytes, struct sh_emulate_ctxt *sh_ctxt)
4070 mfn_t mfn;
4071 void *addr;
4072 int skip;
4074 if ( vaddr & (bytes-1) )
4075 return X86EMUL_UNHANDLEABLE;
4077 ASSERT(((vaddr & ~PAGE_MASK) + bytes) <= PAGE_SIZE);
4078 shadow_lock(v->domain);
4080 addr = emulate_map_dest(v, vaddr, sh_ctxt, &mfn);
4081 if ( addr == NULL )
4083 shadow_unlock(v->domain);
4084 return X86EMUL_EXCEPTION;
4087 skip = safe_not_to_verify_write(mfn, addr, src, bytes);
4088 memcpy(addr, src, bytes);
4089 if ( !skip ) sh_validate_guest_pt_write(v, mfn, addr, bytes);
4091 /* If we are writing zeros to this page, might want to unshadow */
4092 if ( likely(bytes >= 4) && (*(u32 *)addr == 0) && is_lo_pte(vaddr) )
4093 check_for_early_unshadow(v, mfn);
4094 else
4095 reset_early_unshadow(v);
4097 paging_mark_dirty(v->domain, mfn_x(mfn));
4099 sh_unmap_domain_page(addr);
4100 shadow_audit_tables(v);
4101 shadow_unlock(v->domain);
4102 return X86EMUL_OKAY;
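/* Illustrative sketch (not part of the original file): callers reach this
 * through the .shadow.x86_emulate_write hook at the bottom of this file,
 * with a naturally aligned write that fits in one page -- typically a
 * single guest pte update.  Names below are hypothetical. */
#if 0
    u64 new_gpte = 0;      /* value the emulated instruction would store */
    rc = sh_x86_emulate_write(v, pte_vaddr, &new_gpte, 8, sh_ctxt);
#endif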
4105 int
4106 sh_x86_emulate_cmpxchg(struct vcpu *v, unsigned long vaddr,
4107 unsigned long old, unsigned long new,
4108 unsigned int bytes, struct sh_emulate_ctxt *sh_ctxt)
4110 mfn_t mfn;
4111 void *addr;
4112 unsigned long prev;
4113 int rv = X86EMUL_OKAY, skip;
4115 ASSERT(bytes <= sizeof(unsigned long));
4116 shadow_lock(v->domain);
4118 if ( vaddr & (bytes-1) )
     {
         shadow_unlock(v->domain);  /* don't return with the shadow lock held */
4119     return X86EMUL_UNHANDLEABLE;
     }
4121 addr = emulate_map_dest(v, vaddr, sh_ctxt, &mfn);
4122 if ( addr == NULL )
4124 shadow_unlock(v->domain);
4125 return X86EMUL_EXCEPTION;
4128 skip = safe_not_to_verify_write(mfn, &new, &old, bytes);
4130 switch ( bytes )
4132 case 1: prev = cmpxchg(((u8 *)addr), old, new); break;
4133 case 2: prev = cmpxchg(((u16 *)addr), old, new); break;
4134 case 4: prev = cmpxchg(((u32 *)addr), old, new); break;
4135 case 8: prev = cmpxchg(((u64 *)addr), old, new); break;
4136 default:
4137 SHADOW_PRINTK("cmpxchg of size %i is not supported\n", bytes);
4138 prev = ~old;
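                /* forcing prev != old makes the code below report
                 * X86EMUL_CMPXCHG_FAILED for unsupported sizes */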
4141 if ( prev == old )
4143 if ( !skip ) sh_validate_guest_pt_write(v, mfn, addr, bytes);
4145 else
4146 rv = X86EMUL_CMPXCHG_FAILED;
4148 SHADOW_DEBUG(EMULATE, "va %#lx was %#lx expected %#lx"
4149 " wanted %#lx now %#lx bytes %u\n",
4150 vaddr, prev, old, new, *(unsigned long *)addr, bytes);
4152 /* If we are writing zeros to this page, might want to unshadow */
4153 if ( likely(bytes >= 4) && (*(u32 *)addr == 0) && is_lo_pte(vaddr) )
4154 check_for_early_unshadow(v, mfn);
4155 else
4156 reset_early_unshadow(v);
4158 paging_mark_dirty(v->domain, mfn_x(mfn));
4160 sh_unmap_domain_page(addr);
4161 shadow_audit_tables(v);
4162 shadow_unlock(v->domain);
4163 return rv;
4166 int
4167 sh_x86_emulate_cmpxchg8b(struct vcpu *v, unsigned long vaddr,
4168 unsigned long old_lo, unsigned long old_hi,
4169 unsigned long new_lo, unsigned long new_hi,
4170 struct sh_emulate_ctxt *sh_ctxt)
4172 mfn_t mfn;
4173 void *addr;
4174 u64 old, new, prev;
4175 int rv = X86EMUL_OKAY, skip;
4177 if ( vaddr & 7 )
4178 return X86EMUL_UNHANDLEABLE;
4180 shadow_lock(v->domain);
4182 addr = emulate_map_dest(v, vaddr, sh_ctxt, &mfn);
4183 if ( addr == NULL )
4185 shadow_unlock(v->domain);
4186 return X86EMUL_EXCEPTION;
4189 old = (((u64) old_hi) << 32) | (u64) old_lo;
4190 new = (((u64) new_hi) << 32) | (u64) new_lo;
4191 skip = safe_not_to_verify_write(mfn, &new, &old, 8);
4192 prev = cmpxchg(((u64 *)addr), old, new);
4194 if ( prev == old )
4196 if ( !skip ) sh_validate_guest_pt_write(v, mfn, addr, 8);
4198 else
4199 rv = X86EMUL_CMPXCHG_FAILED;
4201 /* If we are writing zeros to this page, might want to unshadow */
4202 if ( *(u32 *)addr == 0 )
4203 check_for_early_unshadow(v, mfn);
4204 else
4205 reset_early_unshadow(v);
4207 paging_mark_dirty(v->domain, mfn_x(mfn));
4209 sh_unmap_domain_page(addr);
4210 shadow_audit_tables(v);
4211 shadow_unlock(v->domain);
4212 return rv;
4216 /**************************************************************************/
4217 /* Audit tools */
4219 #if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES
4221 #define AUDIT_FAIL(_level, _fmt, _a...) do { \
4222 printk("Shadow %u-on-%u audit failed at level %i, index %i\n" \
4223 "gl" #_level "mfn = %" PRI_mfn \
4224 " sl" #_level "mfn = %" PRI_mfn \
4225 " &gl" #_level "e = %p &sl" #_level "e = %p" \
4226 " gl" #_level "e = %" SH_PRI_gpte \
4227 " sl" #_level "e = %" SH_PRI_pte "\nError: " _fmt "\n", \
4228 GUEST_PAGING_LEVELS, SHADOW_PAGING_LEVELS, \
4229 _level, guest_index(gl ## _level ## e), \
4230 mfn_x(gl ## _level ## mfn), mfn_x(sl ## _level ## mfn), \
4231 gl ## _level ## e, sl ## _level ## e, \
4232 gl ## _level ## e->l ## _level, sl ## _level ## e->l ## _level, \
4233 ##_a); \
4234 BUG(); \
4235 done = 1; \
4236 } while (0)
4239 static char * sh_audit_flags(struct vcpu *v, int level,
4240 int gflags, int sflags)
4241 /* Common code for auditing flag bits */
4243 if ( (sflags & _PAGE_PRESENT) && !(gflags & _PAGE_PRESENT) )
4244 return "shadow is present but guest is not present";
4245 if ( (sflags & _PAGE_GLOBAL) && !is_hvm_vcpu(v) )
4246 return "global bit set in PV shadow";
4247 if ( level == 2 && (sflags & _PAGE_PSE) )
4248 return "PS bit set in shadow";
4249 #if SHADOW_PAGING_LEVELS == 3
4250 if ( level == 3 ) return NULL; /* All the other bits are blank in PAEl3 */
4251 #endif
4252 if ( (sflags & _PAGE_PRESENT) && !(gflags & _PAGE_ACCESSED) )
4253 return "accessed bit not propagated";
4254 if ( (level == 1 || (level == 2 && (gflags & _PAGE_PSE)))
4255 && ((sflags & _PAGE_RW) && !(gflags & _PAGE_DIRTY)) )
4256 return "dirty bit not propagated";
4257 if ( (sflags & _PAGE_USER) != (gflags & _PAGE_USER) )
4258 return "user/supervisor bit does not match";
4259 if ( (sflags & _PAGE_NX_BIT) != (gflags & _PAGE_NX_BIT) )
4260 return "NX bit does not match";
4261 if ( (sflags & _PAGE_RW) && !(gflags & _PAGE_RW) )
4262 return "shadow grants write access but guest does not";
4263 return NULL;
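/* Illustrative note (not part of the original file): the dirty-bit check
 * above catches shadows that grant write access before the guest pte has
 * been marked dirty; the shadow code is expected to withhold _PAGE_RW
 * until it has set the guest's dirty bit, so wherever the shadow is
 * writeable that bit must already have been propagated. */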
4266 static inline mfn_t
4267 audit_gfn_to_mfn(struct vcpu *v, gfn_t gfn, mfn_t gmfn)
4268 /* Convert this gfn to an mfn in the manner appropriate for the
4269 * guest pagetable it's used in (gmfn) */
4271 p2m_type_t p2mt;
4272 if ( !shadow_mode_translate(v->domain) )
4273 return _mfn(gfn_x(gfn));
4275 if ( (mfn_to_page(gmfn)->u.inuse.type_info & PGT_type_mask)
4276 != PGT_writable_page )
4277 return _mfn(gfn_x(gfn)); /* This is a paging-disabled shadow */
4278 else
4279 return gfn_to_mfn(v->domain, gfn, &p2mt);
4283 int sh_audit_l1_table(struct vcpu *v, mfn_t sl1mfn, mfn_t x)
4285 guest_l1e_t *gl1e, *gp;
4286 shadow_l1e_t *sl1e;
4287 mfn_t mfn, gmfn, gl1mfn;
4288 gfn_t gfn;
4289 char *s;
4290 int done = 0;
4292 /* Follow the backpointer */
4293 gl1mfn = _mfn(mfn_to_shadow_page(sl1mfn)->backpointer);
4294 gl1e = gp = sh_map_domain_page(gl1mfn);
4295 SHADOW_FOREACH_L1E(sl1mfn, sl1e, &gl1e, done, {
4297 if ( sh_l1e_is_magic(*sl1e) )
4299 #if (SHADOW_OPTIMIZATIONS & SHOPT_FAST_FAULT_PATH) && SHADOW_PAGING_LEVELS > 2
4300 if ( sh_l1e_is_gnp(*sl1e) )
4302 if ( guest_l1e_get_flags(*gl1e) & _PAGE_PRESENT )
4303 AUDIT_FAIL(1, "shadow is GNP magic but guest is present");
4305 else
4307 ASSERT(sh_l1e_is_mmio(*sl1e));
4308 gfn = sh_l1e_mmio_get_gfn(*sl1e);
4309 if ( gfn_x(gfn) != gfn_x(guest_l1e_get_gfn(*gl1e)) )
4310 AUDIT_FAIL(1, "shadow MMIO gfn is %" SH_PRI_gfn
4311 " but guest gfn is %" SH_PRI_gfn,
4312 gfn_x(gfn),
4313 gfn_x(guest_l1e_get_gfn(*gl1e)));
4315 #endif
4317 else
4319 s = sh_audit_flags(v, 1, guest_l1e_get_flags(*gl1e),
4320 shadow_l1e_get_flags(*sl1e));
4321 if ( s ) AUDIT_FAIL(1, "%s", s);
4323 if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS )
4325 gfn = guest_l1e_get_gfn(*gl1e);
4326 mfn = shadow_l1e_get_mfn(*sl1e);
4327 gmfn = audit_gfn_to_mfn(v, gfn, gl1mfn);
4328 if ( mfn_x(gmfn) != mfn_x(mfn) )
4329 AUDIT_FAIL(1, "bad translation: gfn %" SH_PRI_gfn
4330 " --> %" PRI_mfn " != mfn %" PRI_mfn,
4331 gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn));
4334 });
4335 sh_unmap_domain_page(gp);
4336 return done;
4339 int sh_audit_fl1_table(struct vcpu *v, mfn_t sl1mfn, mfn_t x)
4341 guest_l1e_t *gl1e, e;
4342 shadow_l1e_t *sl1e;
4343 mfn_t gl1mfn = _mfn(INVALID_MFN);
4344 int f;
4345 int done = 0;
4347 /* fl1 has no useful backpointer: all we can check are flags */
4348 e = guest_l1e_from_gfn(_gfn(0), 0); gl1e = &e; /* Needed for macro */
4349 SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, done, {
4350 f = shadow_l1e_get_flags(*sl1e);
4351 f &= ~(_PAGE_AVAIL0|_PAGE_AVAIL1|_PAGE_AVAIL2);
4352 if ( !(f == 0
4353 || f == (_PAGE_PRESENT|_PAGE_USER|_PAGE_RW|
4354 _PAGE_ACCESSED|_PAGE_DIRTY)
4355 || f == (_PAGE_PRESENT|_PAGE_USER|_PAGE_ACCESSED|_PAGE_DIRTY)
4356 || sh_l1e_is_magic(*sl1e)) )
4357 AUDIT_FAIL(1, "fl1e has bad flags");
4358 });
4359 return 0;
4362 int sh_audit_l2_table(struct vcpu *v, mfn_t sl2mfn, mfn_t x)
4364 guest_l2e_t *gl2e, *gp;
4365 shadow_l2e_t *sl2e;
4366 mfn_t mfn, gmfn, gl2mfn;
4367 gfn_t gfn;
4368 char *s;
4369 int done = 0;
4371 /* Follow the backpointer */
4372 gl2mfn = _mfn(mfn_to_shadow_page(sl2mfn)->backpointer);
4373 gl2e = gp = sh_map_domain_page(gl2mfn);
4374 SHADOW_FOREACH_L2E(sl2mfn, sl2e, &gl2e, done, v->domain, {
4376 s = sh_audit_flags(v, 2, guest_l2e_get_flags(*gl2e),
4377 shadow_l2e_get_flags(*sl2e));
4378 if ( s ) AUDIT_FAIL(2, "%s", s);
4380 if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS )
4382 gfn = guest_l2e_get_gfn(*gl2e);
4383 mfn = shadow_l2e_get_mfn(*sl2e);
4384 gmfn = (guest_l2e_get_flags(*gl2e) & _PAGE_PSE)
4385 ? get_fl1_shadow_status(v, gfn)
4386 : get_shadow_status(v, audit_gfn_to_mfn(v, gfn, gl2mfn),
4387 SH_type_l1_shadow);
4388 if ( mfn_x(gmfn) != mfn_x(mfn) )
4389 AUDIT_FAIL(2, "bad translation: gfn %" SH_PRI_gfn
4390 " (--> %" PRI_mfn ")"
4391 " --> %" PRI_mfn " != mfn %" PRI_mfn,
4392 gfn_x(gfn),
4393 (guest_l2e_get_flags(*gl2e) & _PAGE_PSE) ? 0
4394 : mfn_x(audit_gfn_to_mfn(v, gfn, gl2mfn)),
4395 mfn_x(gmfn), mfn_x(mfn));
4397 });
4398 sh_unmap_domain_page(gp);
4399 return 0;
4402 #if GUEST_PAGING_LEVELS >= 4
4403 int sh_audit_l3_table(struct vcpu *v, mfn_t sl3mfn, mfn_t x)
4405 guest_l3e_t *gl3e, *gp;
4406 shadow_l3e_t *sl3e;
4407 mfn_t mfn, gmfn, gl3mfn;
4408 gfn_t gfn;
4409 char *s;
4410 int done = 0;
4412 /* Follow the backpointer */
4413 gl3mfn = _mfn(mfn_to_shadow_page(sl3mfn)->backpointer);
4414 gl3e = gp = sh_map_domain_page(gl3mfn);
4415 SHADOW_FOREACH_L3E(sl3mfn, sl3e, &gl3e, done, {
4417 s = sh_audit_flags(v, 3, guest_l3e_get_flags(*gl3e),
4418 shadow_l3e_get_flags(*sl3e));
4419 if ( s ) AUDIT_FAIL(3, "%s", s);
4421 if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS )
4423 gfn = guest_l3e_get_gfn(*gl3e);
4424 mfn = shadow_l3e_get_mfn(*sl3e);
4425 gmfn = get_shadow_status(v, audit_gfn_to_mfn(v, gfn, gl3mfn),
4426 ((GUEST_PAGING_LEVELS == 3 ||
4427 is_pv_32on64_vcpu(v))
4428 && !shadow_mode_external(v->domain)
4429 && (guest_index(gl3e) % 4) == 3)
4430 ? SH_type_l2h_shadow
4431 : SH_type_l2_shadow);
4432 if ( mfn_x(gmfn) != mfn_x(mfn) )
4433 AUDIT_FAIL(3, "bad translation: gfn %" SH_PRI_gfn
4434 " --> %" PRI_mfn " != mfn %" PRI_mfn,
4435 gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn));
4437 });
4438 sh_unmap_domain_page(gp);
4439 return 0;
4442 int sh_audit_l4_table(struct vcpu *v, mfn_t sl4mfn, mfn_t x)
4444 guest_l4e_t *gl4e, *gp;
4445 shadow_l4e_t *sl4e;
4446 mfn_t mfn, gmfn, gl4mfn;
4447 gfn_t gfn;
4448 char *s;
4449 int done = 0;
4451 /* Follow the backpointer */
4452 gl4mfn = _mfn(mfn_to_shadow_page(sl4mfn)->backpointer);
4453 gl4e = gp = sh_map_domain_page(gl4mfn);
4454 SHADOW_FOREACH_L4E(sl4mfn, sl4e, &gl4e, done, v->domain,
4456 s = sh_audit_flags(v, 4, guest_l4e_get_flags(*gl4e),
4457 shadow_l4e_get_flags(*sl4e));
4458 if ( s ) AUDIT_FAIL(4, "%s", s);
4460 if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS )
4462 gfn = guest_l4e_get_gfn(*gl4e);
4463 mfn = shadow_l4e_get_mfn(*sl4e);
4464 gmfn = get_shadow_status(v, audit_gfn_to_mfn(v, gfn, gl4mfn),
4465 SH_type_l3_shadow);
4466 if ( mfn_x(gmfn) != mfn_x(mfn) )
4467 AUDIT_FAIL(4, "bad translation: gfn %" SH_PRI_gfn
4468 " --> %" PRI_mfn " != mfn %" PRI_mfn,
4469 gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn));
4471 });
4472 sh_unmap_domain_page(gp);
4473 return 0;
4475 #endif /* GUEST_PAGING_LEVELS >= 4 */
4478 #undef AUDIT_FAIL
4480 #endif /* Audit code */
4482 /**************************************************************************/
4483 /* Entry points into this mode of the shadow code.
4484 * This will all be mangled by the preprocessor to uniquify everything. */
4485 struct paging_mode sh_paging_mode = {
4486 .page_fault = sh_page_fault,
4487 .invlpg = sh_invlpg,
4488 .gva_to_gfn = sh_gva_to_gfn,
4489 .update_cr3 = sh_update_cr3,
4490 .update_paging_modes = shadow_update_paging_modes,
4491 .write_p2m_entry = shadow_write_p2m_entry,
4492 .write_guest_entry = shadow_write_guest_entry,
4493 .cmpxchg_guest_entry = shadow_cmpxchg_guest_entry,
4494 .guest_map_l1e = sh_guest_map_l1e,
4495 .guest_get_eff_l1e = sh_guest_get_eff_l1e,
4496 .guest_levels = GUEST_PAGING_LEVELS,
4497 .shadow.detach_old_tables = sh_detach_old_tables,
4498 .shadow.x86_emulate_write = sh_x86_emulate_write,
4499 .shadow.x86_emulate_cmpxchg = sh_x86_emulate_cmpxchg,
4500 .shadow.x86_emulate_cmpxchg8b = sh_x86_emulate_cmpxchg8b,
4501 .shadow.make_monitor_table = sh_make_monitor_table,
4502 .shadow.destroy_monitor_table = sh_destroy_monitor_table,
4503 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
4504 .shadow.guess_wrmap = sh_guess_wrmap,
4505 #endif
4506 .shadow.shadow_levels = SHADOW_PAGING_LEVELS,
4507 };
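/* Illustrative sketch (not part of the original file): this file is
 * compiled once per (guest, shadow) level combination, and the symbol
 * above is uniquified by the shadow code's name-mangling macro so the
 * common code can pick the right mode at runtime.  The macro name and
 * argument order shown here are assumptions based on the surrounding
 * headers, not taken from this file. */
#if 0
    extern struct paging_mode
        SHADOW_INTERNAL_NAME(sh_paging_mode, SHADOW_PAGING_LEVELS,
                             GUEST_PAGING_LEVELS);
#endif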
4509 /*
4510 * Local variables:
4511 * mode: C
4512 * c-set-style: "BSD"
4513 * c-basic-offset: 4
4514 * indent-tabs-mode: nil
4515 * End:
4516 */