ia64/xen-unstable
xen/arch/x86/mm/shadow/multi.c @ 13915:a00b8d3800a8

[XEN] Snapshot PAE l3es when they are shadowed.
We don't update the shadows so we mustn't look at the guest l3es
or we'll be confused by them if they change.
Signed-off-by: Tim Deegan <Tim.Deegan@xensource.com>

author    Tim Deegan <Tim.Deegan@xensource.com>
date      Wed Feb 14 14:46:18 2007 +0000 (2007-02-14)
parents   6daa91dc9247
children  9c2e6f8f3aa7

line source
1 /******************************************************************************
2 * arch/x86/mm/shadow/multi.c
3 *
4 * Simple, mostly-synchronous shadow page tables.
5 * Parts of this code are Copyright (c) 2006 by XenSource Inc.
6 * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
7 * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 */
24 #include <xen/config.h>
25 #include <xen/types.h>
26 #include <xen/mm.h>
27 #include <xen/trace.h>
28 #include <xen/sched.h>
29 #include <xen/perfc.h>
30 #include <xen/domain_page.h>
31 #include <asm/page.h>
32 #include <asm/current.h>
33 #include <asm/shadow.h>
34 #include <asm/flushtlb.h>
35 #include <asm/hvm/hvm.h>
36 #include "private.h"
37 #include "types.h"
39 /* THINGS TO DO LATER:
40 *
41 * TEARDOWN HEURISTICS
42 * Also: have a heuristic for when to destroy a previous paging-mode's
43 * shadows. When a guest is done with its start-of-day 32-bit tables
44 * and reuses the memory we want to drop those shadows. Start with
45 * shadows in a page in two modes as a hint, but beware of clever tricks
46 * like reusing a pagetable for both PAE and 64-bit during boot...
47 *
48 * PAE LINEAR MAPS
49 * Rework shadow_get_l*e() to have the option of using map_domain_page()
50 * instead of linear maps. Add appropriate unmap_l*e calls in the users.
51 * Then we can test the speed difference made by linear maps. If the
52 * map_domain_page() version is OK on PAE, we could maybe allow a lightweight
53 * l3-and-l2h-only shadow mode for PAE PV guests that would allow them
54 * to share l2h pages again.
55 *
56 * GUEST_WALK_TABLES TLB FLUSH COALESCE
57 * guest_walk_tables can do up to three remote TLB flushes as it walks to
58 * the first l1 of a new pagetable. Should coalesce the flushes to the end,
59 * and if we do flush, re-do the walk. If anything has changed, then
60 * pause all the other vcpus and do the walk *again*.
61 *
62 * WP DISABLED
63 * Consider how to implement having the WP bit of CR0 set to 0.
64 * Since we need to be able to cause write faults to pagetables, this might
65 * end up looking like not having the (guest) pagetables present at all in
66 * HVM guests...
67 *
68 * PSE disabled / PSE36
69 * We don't support any modes other than PSE enabled, PSE36 disabled.
70 * Neither of those would be hard to change, but we'd need to be able to
71 * deal with shadows made in one mode and used in another.
72 */
74 #define FETCH_TYPE_PREFETCH 1
75 #define FETCH_TYPE_DEMAND 2
76 #define FETCH_TYPE_WRITE 4
77 typedef enum {
78 ft_prefetch = FETCH_TYPE_PREFETCH,
79 ft_demand_read = FETCH_TYPE_DEMAND,
80 ft_demand_write = FETCH_TYPE_DEMAND | FETCH_TYPE_WRITE,
81 } fetch_type_t;
83 #ifdef DEBUG_TRACE_DUMP
84 static char *fetch_type_names[] = {
85 [ft_prefetch] "prefetch",
86 [ft_demand_read] "demand read",
87 [ft_demand_write] "demand write",
88 };
89 #endif
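/* Illustrative sketch (not part of the original file): the fetch type is a
 * small bitmask, so the code below tests it with '&' rather than '=='.  These
 * hypothetical predicates show the intended use; a demand write satisfies
 * both of them. */
#if 0
static inline int fetch_is_demand(fetch_type_t ft)
{
    return (ft & FETCH_TYPE_DEMAND) != 0; /* true for demand reads and writes */
}
static inline int fetch_is_write(fetch_type_t ft)
{
    return (ft & FETCH_TYPE_WRITE) != 0;  /* true only for ft_demand_write */
}
#endif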
91 /**************************************************************************/
92 /* Hash table mapping from guest pagetables to shadows
93 *
94 * Normal case: maps the mfn of a guest page to the mfn of its shadow page.
95 * FL1's: maps the *gfn* of the start of a superpage to the mfn of a
96 * shadow L1 which maps its "splinters".
97 */
99 static inline mfn_t
100 get_fl1_shadow_status(struct vcpu *v, gfn_t gfn)
101 /* Look for FL1 shadows in the hash table */
102 {
103 mfn_t smfn = shadow_hash_lookup(v, gfn_x(gfn), SH_type_fl1_shadow);
105 if ( unlikely(shadow_mode_log_dirty(v->domain) && mfn_valid(smfn)) )
106 {
107 struct shadow_page_info *sp = mfn_to_shadow_page(smfn);
108 if ( !(sp->logdirty) )
109 shadow_convert_to_log_dirty(v, smfn);
110 }
112 return smfn;
113 }
115 static inline mfn_t
116 get_shadow_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type)
117 /* Look for shadows in the hash table */
118 {
119 mfn_t smfn = shadow_hash_lookup(v, mfn_x(gmfn), shadow_type);
120 perfc_incrc(shadow_get_shadow_status);
122 if ( unlikely(shadow_mode_log_dirty(v->domain) && mfn_valid(smfn)) )
123 {
124 struct shadow_page_info *sp = mfn_to_shadow_page(smfn);
125 if ( !(sp->logdirty) )
126 shadow_convert_to_log_dirty(v, smfn);
127 }
129 return smfn;
130 }
132 static inline void
133 set_fl1_shadow_status(struct vcpu *v, gfn_t gfn, mfn_t smfn)
134 /* Put an FL1 shadow into the hash table */
135 {
136 SHADOW_PRINTK("gfn=%"SH_PRI_gfn", type=%08x, smfn=%05lx\n",
137 gfn_x(gfn), SH_type_fl1_shadow, mfn_x(smfn));
139 if ( unlikely(shadow_mode_log_dirty(v->domain)) )
140 // mark this shadow as a log dirty shadow...
141 mfn_to_shadow_page(smfn)->logdirty = 1;
142 else
143 mfn_to_shadow_page(smfn)->logdirty = 0;
145 shadow_hash_insert(v, gfn_x(gfn), SH_type_fl1_shadow, smfn);
146 }
148 static inline void
149 set_shadow_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type, mfn_t smfn)
150 /* Put a shadow into the hash table */
151 {
152 struct domain *d = v->domain;
153 int res;
155 SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx, type=%08x, smfn=%05lx\n",
156 d->domain_id, v->vcpu_id, mfn_x(gmfn),
157 shadow_type, mfn_x(smfn));
159 if ( unlikely(shadow_mode_log_dirty(d)) )
160 // mark this shadow as a log dirty shadow...
161 mfn_to_shadow_page(smfn)->logdirty = 1;
162 else
163 mfn_to_shadow_page(smfn)->logdirty = 0;
165 res = get_page(mfn_to_page(gmfn), d);
166 ASSERT(res == 1);
168 shadow_hash_insert(v, mfn_x(gmfn), shadow_type, smfn);
169 }
171 static inline void
172 delete_fl1_shadow_status(struct vcpu *v, gfn_t gfn, mfn_t smfn)
173 /* Remove a shadow from the hash table */
174 {
175 SHADOW_PRINTK("gfn=%"SH_PRI_gfn", type=%08x, smfn=%05lx\n",
176 gfn_x(gfn), SH_type_fl1_shadow, mfn_x(smfn));
177 shadow_hash_delete(v, gfn_x(gfn), SH_type_fl1_shadow, smfn);
178 }
180 static inline void
181 delete_shadow_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type, mfn_t smfn)
182 /* Remove a shadow from the hash table */
183 {
184 SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx, type=%08x, smfn=%05lx\n",
185 v->domain->domain_id, v->vcpu_id,
186 mfn_x(gmfn), shadow_type, mfn_x(smfn));
187 shadow_hash_delete(v, mfn_x(gmfn), shadow_type, smfn);
188 put_page(mfn_to_page(gmfn));
189 }
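/* Illustrative sketch (not part of the original file): how the hash helpers
 * above are typically paired.  Look up an existing shadow of a guest
 * pagetable and, if there is none, make one; sh_make_shadow() (defined later
 * in this file) enters the new shadow in the hash via set_shadow_status().
 * The wrapper name here is hypothetical. */
#if 0
static mfn_t get_or_make_shadow(struct vcpu *v, mfn_t gmfn, u32 shadow_type)
{
    mfn_t smfn = get_shadow_status(v, gmfn, shadow_type);
    if ( !mfn_valid(smfn) )
        smfn = sh_make_shadow(v, gmfn, shadow_type);
    return smfn;
}
#endif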
191 /**************************************************************************/
192 /* CPU feature support querying */
194 static inline int
195 guest_supports_superpages(struct vcpu *v)
196 {
197 /* The _PAGE_PSE bit must be honoured in HVM guests, whenever
198 * CR4.PSE is set or the guest is in PAE or long mode */
199 return (is_hvm_vcpu(v) && (GUEST_PAGING_LEVELS != 2
200 || (hvm_get_guest_ctrl_reg(v, 4) & X86_CR4_PSE)));
201 }
203 static inline int
204 guest_supports_nx(struct vcpu *v)
205 {
206 if ( !is_hvm_vcpu(v) )
207 return cpu_has_nx;
209 // XXX - fix this!
210 return 1;
211 }
214 /**************************************************************************/
215 /* Functions for walking the guest page tables */
218 /* Walk the guest pagetables, filling the walk_t with what we see.
219 * Takes an uninitialised walk_t. The caller must call unmap_walk()
220 * on the walk_t before discarding it or calling guest_walk_tables again.
221 * If "guest_op" is non-zero, we are serving a genuine guest memory access,
222 * and must (a) be under the shadow lock, and (b) remove write access
223 * from any guest PT pages we see, as we will be using their contents to
224 * perform shadow updates.
225 * Returns 0 for success or non-zero if the guest pagetables are malformed.
226 * N.B. Finding a not-present entry does not cause a non-zero return code. */
227 static inline int
228 guest_walk_tables(struct vcpu *v, unsigned long va, walk_t *gw, int guest_op)
229 {
230 ASSERT(!guest_op || shadow_locked_by_me(v->domain));
232 perfc_incrc(shadow_guest_walk);
233 memset(gw, 0, sizeof(*gw));
234 gw->va = va;
236 #if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
237 #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
238 /* Get l4e from the top level table */
239 gw->l4mfn = pagetable_get_mfn(v->arch.guest_table);
240 gw->l4e = (guest_l4e_t *)v->arch.paging.shadow.guest_vtable
241 + guest_l4_table_offset(va);
242 /* Walk down to the l3e */
243 if ( !(guest_l4e_get_flags(*gw->l4e) & _PAGE_PRESENT) ) return 0;
244 gw->l3mfn = vcpu_gfn_to_mfn(v, guest_l4e_get_gfn(*gw->l4e));
245 if ( !mfn_valid(gw->l3mfn) ) return 1;
246 /* This mfn is a pagetable: make sure the guest can't write to it. */
247 if ( guest_op && sh_remove_write_access(v, gw->l3mfn, 3, va) != 0 )
248 flush_tlb_mask(v->domain->domain_dirty_cpumask);
249 gw->l3e = ((guest_l3e_t *)sh_map_domain_page(gw->l3mfn))
250 + guest_l3_table_offset(va);
251 #else /* PAE only... */
252 /* Get l3e from the cache of the guest's top level table */
253 gw->l3e = (guest_l3e_t *)&v->arch.paging.shadow.gl3e[guest_l3_table_offset(va)];
254 #endif /* PAE or 64... */
255 /* Walk down to the l2e */
256 if ( !(guest_l3e_get_flags(*gw->l3e) & _PAGE_PRESENT) ) return 0;
257 gw->l2mfn = vcpu_gfn_to_mfn(v, guest_l3e_get_gfn(*gw->l3e));
258 if ( !mfn_valid(gw->l2mfn) ) return 1;
259 /* This mfn is a pagetable: make sure the guest can't write to it. */
260 if ( guest_op && sh_remove_write_access(v, gw->l2mfn, 2, va) != 0 )
261 flush_tlb_mask(v->domain->domain_dirty_cpumask);
262 gw->l2e = ((guest_l2e_t *)sh_map_domain_page(gw->l2mfn))
263 + guest_l2_table_offset(va);
264 #else /* 32-bit only... */
265 /* Get l2e from the top level table */
266 gw->l2mfn = pagetable_get_mfn(v->arch.guest_table);
267 gw->l2e = (guest_l2e_t *)v->arch.paging.shadow.guest_vtable
268 + guest_l2_table_offset(va);
269 #endif /* All levels... */
271 if ( !(guest_l2e_get_flags(*gw->l2e) & _PAGE_PRESENT) ) return 0;
272 if ( guest_supports_superpages(v) &&
273 (guest_l2e_get_flags(*gw->l2e) & _PAGE_PSE) )
274 {
275 /* Special case: this guest VA is in a PSE superpage, so there's
276 * no guest l1e. We make one up so that the propagation code
277 * can generate a shadow l1 table. Start with the gfn of the
278 * first 4k-page of the superpage. */
279 gfn_t start = guest_l2e_get_gfn(*gw->l2e);
280 /* Grant full access in the l1e, since all the guest entry's
281 * access controls are enforced in the shadow l2e. This lets
282 * us reflect l2 changes later without touching the l1s. */
283 int flags = (_PAGE_PRESENT|_PAGE_USER|_PAGE_RW|
284 _PAGE_ACCESSED|_PAGE_DIRTY);
285 /* PSE level 2 entries use bit 12 for PAT; propagate it to bit 7
286 * of the level 1 */
287 if ( (guest_l2e_get_flags(*gw->l2e) & _PAGE_PSE_PAT) )
288 flags |= _PAGE_PAT;
289 /* Increment the pfn by the right number of 4k pages.
290 * The ~0x1 is to mask out the PAT bit mentioned above. */
291 start = _gfn((gfn_x(start) & ~0x1) + guest_l1_table_offset(va));
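/* Worked example (not part of the original file): for a superpage starting
 * at gfn 0x12400 (bit 0 of 'start' may carry the PAT bit masked off above)
 * and a va whose l1 offset is 0x31, the effective l1e gets gfn 0x12431. */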
292 gw->eff_l1e = guest_l1e_from_gfn(start, flags);
293 gw->l1e = NULL;
294 gw->l1mfn = _mfn(INVALID_MFN);
295 }
296 else
297 {
298 /* Not a superpage: carry on and find the l1e. */
299 gw->l1mfn = vcpu_gfn_to_mfn(v, guest_l2e_get_gfn(*gw->l2e));
300 if ( !mfn_valid(gw->l1mfn) ) return 1;
301 /* This mfn is a pagetable: make sure the guest can't write to it. */
302 if ( guest_op
303 && sh_remove_write_access(v, gw->l1mfn, 1, va) != 0 )
304 flush_tlb_mask(v->domain->domain_dirty_cpumask);
305 gw->l1e = ((guest_l1e_t *)sh_map_domain_page(gw->l1mfn))
306 + guest_l1_table_offset(va);
307 gw->eff_l1e = *gw->l1e;
308 }
310 return 0;
311 }
313 /* Given a walk_t, translate the gw->va into the guest's notion of the
314 * corresponding frame number. */
315 static inline gfn_t
316 guest_walk_to_gfn(walk_t *gw)
317 {
318 if ( !(guest_l1e_get_flags(gw->eff_l1e) & _PAGE_PRESENT) )
319 return _gfn(INVALID_GFN);
320 return guest_l1e_get_gfn(gw->eff_l1e);
321 }
323 /* Given a walk_t, translate the gw->va into the guest's notion of the
324 * corresponding physical address. */
325 static inline paddr_t
326 guest_walk_to_gpa(walk_t *gw)
327 {
328 if ( !(guest_l1e_get_flags(gw->eff_l1e) & _PAGE_PRESENT) )
329 return 0;
330 return guest_l1e_get_paddr(gw->eff_l1e) + (gw->va & ~PAGE_MASK);
331 }
334 /* Unmap (and reinitialise) a guest walk.
335 * Call this to dispose of any walk filled in by guest_walk_tables() */
336 static void unmap_walk(struct vcpu *v, walk_t *gw)
337 {
338 #if GUEST_PAGING_LEVELS >= 3
339 #if GUEST_PAGING_LEVELS >= 4
340 if ( gw->l3e != NULL ) sh_unmap_domain_page(gw->l3e);
341 #endif
342 if ( gw->l2e != NULL ) sh_unmap_domain_page(gw->l2e);
343 #endif
344 if ( gw->l1e != NULL ) sh_unmap_domain_page(gw->l1e);
345 #ifdef DEBUG
346 memset(gw, 0, sizeof(*gw));
347 #endif
348 }
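/* Illustrative sketch (not part of the original file): the usual life-cycle
 * of a walk_t.  This hypothetical helper translates a guest virtual address
 * to a guest physical address, returning 0 (as guest_walk_to_gpa() does) when
 * the mapping is not present; note the unconditional unmap_walk(), which is
 * safe even if the walk bailed out early. */
#if 0
static paddr_t sh_example_gva_to_gpa(struct vcpu *v, unsigned long va)
{
    walk_t gw;
    paddr_t gpa = 0;
    if ( guest_walk_tables(v, va, &gw, 0 /* not a guest access */) == 0 )
        gpa = guest_walk_to_gpa(&gw);
    unmap_walk(v, &gw);
    return gpa;
}
#endif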
351 /* Pretty-print the contents of a guest-walk */
352 static inline void print_gw(walk_t *gw)
353 {
354 SHADOW_PRINTK("GUEST WALK TO %#lx:\n", gw->va);
355 #if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
356 #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
357 SHADOW_PRINTK(" l4mfn=%" PRI_mfn "\n", mfn_x(gw->l4mfn));
358 SHADOW_PRINTK(" l4e=%p\n", gw->l4e);
359 if ( gw->l4e )
360 SHADOW_PRINTK(" *l4e=%" SH_PRI_gpte "\n", gw->l4e->l4);
361 SHADOW_PRINTK(" l3mfn=%" PRI_mfn "\n", mfn_x(gw->l3mfn));
362 #endif /* PAE or 64... */
363 SHADOW_PRINTK(" l3e=%p\n", gw->l3e);
364 if ( gw->l3e )
365 SHADOW_PRINTK(" *l3e=%" SH_PRI_gpte "\n", gw->l3e->l3);
366 #endif /* All levels... */
367 SHADOW_PRINTK(" l2mfn=%" PRI_mfn "\n", mfn_x(gw->l2mfn));
368 SHADOW_PRINTK(" l2e=%p\n", gw->l2e);
369 if ( gw->l2e )
370 SHADOW_PRINTK(" *l2e=%" SH_PRI_gpte "\n", gw->l2e->l2);
371 SHADOW_PRINTK(" l1mfn=%" PRI_mfn "\n", mfn_x(gw->l1mfn));
372 SHADOW_PRINTK(" l1e=%p\n", gw->l1e);
373 if ( gw->l1e )
374 SHADOW_PRINTK(" *l1e=%" SH_PRI_gpte "\n", gw->l1e->l1);
375 SHADOW_PRINTK(" eff_l1e=%" SH_PRI_gpte "\n", gw->eff_l1e.l1);
376 }
379 #if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES
380 /* Lightweight audit: pass all the shadows associated with this guest walk
381 * through the audit mechanisms */
382 static void sh_audit_gw(struct vcpu *v, walk_t *gw)
383 {
384 mfn_t smfn;
386 if ( !(SHADOW_AUDIT_ENABLE) )
387 return;
389 #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
390 if ( mfn_valid(gw->l4mfn)
391 && mfn_valid((smfn = get_shadow_status(v, gw->l4mfn,
392 SH_type_l4_shadow))) )
393 (void) sh_audit_l4_table(v, smfn, _mfn(INVALID_MFN));
394 if ( mfn_valid(gw->l3mfn)
395 && mfn_valid((smfn = get_shadow_status(v, gw->l3mfn,
396 SH_type_l3_shadow))) )
397 (void) sh_audit_l3_table(v, smfn, _mfn(INVALID_MFN));
398 #endif /* PAE or 64... */
399 if ( mfn_valid(gw->l2mfn) )
400 {
401 if ( mfn_valid((smfn = get_shadow_status(v, gw->l2mfn,
402 SH_type_l2_shadow))) )
403 (void) sh_audit_l2_table(v, smfn, _mfn(INVALID_MFN));
404 #if GUEST_PAGING_LEVELS == 3
405 if ( mfn_valid((smfn = get_shadow_status(v, gw->l2mfn,
406 SH_type_l2h_shadow))) )
407 (void) sh_audit_l2_table(v, smfn, _mfn(INVALID_MFN));
408 #endif
409 }
410 if ( mfn_valid(gw->l1mfn)
411 && mfn_valid((smfn = get_shadow_status(v, gw->l1mfn,
412 SH_type_l1_shadow))) )
413 (void) sh_audit_l1_table(v, smfn, _mfn(INVALID_MFN));
414 else if ( gw->l2e
415 && (guest_l2e_get_flags(*gw->l2e) & _PAGE_PSE)
416 && mfn_valid(
417 (smfn = get_fl1_shadow_status(v, guest_l2e_get_gfn(*gw->l2e)))) )
418 (void) sh_audit_fl1_table(v, smfn, _mfn(INVALID_MFN));
419 }
421 #else
422 #define sh_audit_gw(_v, _gw) do {} while(0)
423 #endif /* audit code */
427 /**************************************************************************/
428 /* Function to write to the guest tables, for propagating accessed and
429 * dirty bits from the shadow to the guest.
430 * Takes a guest mfn, a pointer to the guest entry, the level of pagetable,
431 * and an operation type. The guest entry is always passed as an l1e:
432 * since we only ever write flags, that's OK.
433 * Returns the new flag bits of the guest entry. */
435 static u32 guest_set_ad_bits(struct vcpu *v,
436 mfn_t gmfn,
437 guest_l1e_t *ep,
438 unsigned int level,
439 fetch_type_t ft)
440 {
441 u32 flags;
442 int res = 0;
444 ASSERT(ep && !(((unsigned long)ep) & ((sizeof *ep) - 1)));
445 ASSERT(level <= GUEST_PAGING_LEVELS);
446 ASSERT(shadow_locked_by_me(v->domain));
448 flags = guest_l1e_get_flags(*ep);
450 /* Only set A and D bits for guest-initiated accesses */
451 if ( !(ft & FETCH_TYPE_DEMAND) )
452 return flags;
454 ASSERT(mfn_valid(gmfn)
455 && (sh_mfn_is_a_page_table(gmfn)
456 || ((mfn_to_page(gmfn)->u.inuse.type_info & PGT_count_mask)
457 == 0)));
459 /* PAE l3s do not have A and D bits */
460 ASSERT(GUEST_PAGING_LEVELS > 3 || level != 3);
462 /* Need the D bit as well for writes, in L1es and PSE L2es. */
463 if ( ft == ft_demand_write
464 && (level == 1 ||
465 (level == 2 && (flags & _PAGE_PSE) && guest_supports_superpages(v))) )
466 {
467 if ( (flags & (_PAGE_DIRTY | _PAGE_ACCESSED))
468 == (_PAGE_DIRTY | _PAGE_ACCESSED) )
469 return flags; /* Guest already has A and D bits set */
470 flags |= _PAGE_DIRTY | _PAGE_ACCESSED;
471 perfc_incrc(shadow_ad_update);
472 }
473 else
474 {
475 if ( flags & _PAGE_ACCESSED )
476 return flags; /* Guest already has A bit set */
477 flags |= _PAGE_ACCESSED;
478 perfc_incrc(shadow_a_update);
479 }
481 /* Set the bit(s) */
482 sh_mark_dirty(v->domain, gmfn);
483 SHADOW_DEBUG(A_AND_D, "gfn = %" SH_PRI_gfn ", "
484 "old flags = %#x, new flags = %#x\n",
485 gfn_x(guest_l1e_get_gfn(*ep)), guest_l1e_get_flags(*ep),
486 flags);
487 *ep = guest_l1e_from_gfn(guest_l1e_get_gfn(*ep), flags);
489 /* Propagate this change to any other shadows of the page
490 * (only necessary if there is more than one shadow) */
491 if ( mfn_to_page(gmfn)->count_info & PGC_page_table )
492 {
493 u32 shflags = mfn_to_page(gmfn)->shadow_flags & SHF_page_type_mask;
494 /* More than one type bit set in shadow-flags? */
495 if ( shflags & ~(1UL << find_first_set_bit(shflags)) )
496 res = sh_validate_guest_entry(v, gmfn, ep, sizeof (*ep));
497 }
499 /* We should never need to flush the TLB or recopy PAE entries */
500 ASSERT((res == 0) || (res == SHADOW_SET_CHANGED));
502 return flags;
503 }
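/* Illustrative example (not part of the original file): a demand write
 * through a present guest l1e that has _PAGE_ACCESSED but not _PAGE_DIRTY
 * returns the old flags with _PAGE_DIRTY added, writes them back into *ep,
 * and marks the guest pagetable frame dirty in the log-dirty bitmap; a
 * prefetch of the same entry returns the flags untouched. */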
505 #if (CONFIG_PAGING_LEVELS == GUEST_PAGING_LEVELS) && (CONFIG_PAGING_LEVELS == SHADOW_PAGING_LEVELS)
506 void *
507 sh_guest_map_l1e(struct vcpu *v, unsigned long addr,
508 unsigned long *gl1mfn)
509 {
510 void *pl1e = NULL;
511 walk_t gw;
513 ASSERT(shadow_mode_translate(v->domain));
515 // XXX -- this is expensive, but it's easy to cobble together...
516 // FIXME!
518 shadow_lock(v->domain);
519 guest_walk_tables(v, addr, &gw, 1);
521 if ( gw.l2e &&
522 (guest_l2e_get_flags(*gw.l2e) & _PAGE_PRESENT) &&
523 !(guest_supports_superpages(v) && (guest_l2e_get_flags(*gw.l2e) & _PAGE_PSE)) )
524 {
525 if ( gl1mfn )
526 *gl1mfn = mfn_x(gw.l1mfn);
527 pl1e = map_domain_page(mfn_x(gw.l1mfn)) +
528 (guest_l1_table_offset(addr) * sizeof(guest_l1e_t));
529 }
531 unmap_walk(v, &gw);
532 shadow_unlock(v->domain);
534 return pl1e;
535 }
537 void
538 sh_guest_get_eff_l1e(struct vcpu *v, unsigned long addr, void *eff_l1e)
539 {
540 walk_t gw;
542 ASSERT(shadow_mode_translate(v->domain));
544 // XXX -- this is expensive, but it's easy to cobble together...
545 // FIXME!
547 shadow_lock(v->domain);
548 guest_walk_tables(v, addr, &gw, 1);
549 *(guest_l1e_t *)eff_l1e = gw.eff_l1e;
550 unmap_walk(v, &gw);
551 shadow_unlock(v->domain);
552 }
553 #endif /* CONFIG==SHADOW==GUEST */
555 /**************************************************************************/
556 /* Functions to compute the correct index into a shadow page, given an
557 * index into the guest page (as returned by guest_index() below).
558 * This is trivial when the shadow and guest use the same sized PTEs, but
559 * gets more interesting when those sizes are mismatched (e.g. 32-bit guest,
560 * PAE- or 64-bit shadows).
561 *
562 * These functions also increment the shadow mfn, when necessary. When PTE
563 * sizes are mismatched, it takes 2 shadow L1 pages for a single guest L1
564 * page. In this case, we allocate 2 contiguous pages for the shadow L1, and
565 * use simple pointer arithmetic on a pointer to the guest L1e to figure out
566 * which shadow page we really want. Similarly, when PTE sizes are
567 * mismatched, we shadow a guest L2 page with 4 shadow L2 pages. (The easiest
568 * way to see this is: a 32-bit guest L2 page maps 4GB of virtual address
569 * space, while a PAE- or 64-bit shadow L2 page maps 1GB of virtual address
570 * space.)
571 *
572 * For PAE guests, for every 32-bytes of guest L3 page table, we use 64-bytes
573 * of shadow (to store both the shadow, and the info that would normally be
574 * stored in page_info fields). This arrangement allows the shadow and the
575 * "page_info" fields to always be stored in the same page (in fact, in
576 * the same cache line), avoiding an extra call to map_domain_page().
577 */
579 static inline u32
580 guest_index(void *ptr)
581 {
582 return (u32)((unsigned long)ptr & ~PAGE_MASK) / sizeof(guest_l1e_t);
583 }
585 static u32
586 shadow_l1_index(mfn_t *smfn, u32 guest_index)
587 {
588 #if (GUEST_PAGING_LEVELS == 2) && (SHADOW_PAGING_LEVELS != 2)
589 *smfn = _mfn(mfn_x(*smfn) +
590 (guest_index / SHADOW_L1_PAGETABLE_ENTRIES));
591 return (guest_index % SHADOW_L1_PAGETABLE_ENTRIES);
592 #else
593 return guest_index;
594 #endif
595 }
597 static u32
598 shadow_l2_index(mfn_t *smfn, u32 guest_index)
599 {
600 #if (GUEST_PAGING_LEVELS == 2) && (SHADOW_PAGING_LEVELS != 2)
601 // Because we use 2 shadow l2 entries for each guest entry, the number of
602 // guest entries per shadow page is SHADOW_L2_PAGETABLE_ENTRIES/2
603 //
604 *smfn = _mfn(mfn_x(*smfn) +
605 (guest_index / (SHADOW_L2_PAGETABLE_ENTRIES / 2)));
607 // We multiply by two to get the index of the first of the two entries
608 // used to shadow the specified guest entry.
609 return (guest_index % (SHADOW_L2_PAGETABLE_ENTRIES / 2)) * 2;
610 #else
611 return guest_index;
612 #endif
613 }
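/* Worked example (not part of the original file), for the 32-bit-guest on
 * PAE/64-bit shadow case, assuming SHADOW_L1/L2_PAGETABLE_ENTRIES == 512:
 *   shadow_l1_index(&smfn, 700) advances smfn by 700/512 = 1 page and
 *   returns 700 % 512 = 188;
 *   shadow_l2_index(&smfn, 700) advances smfn by 700/256 = 2 pages and
 *   returns (700 % 256) * 2 = 376, the first of the pair of sl2es shadowing
 *   guest l2 entry 700.
 * In every other configuration both functions return the index unchanged. */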
615 #if GUEST_PAGING_LEVELS >= 4
617 static u32
618 shadow_l3_index(mfn_t *smfn, u32 guest_index)
619 {
620 return guest_index;
621 }
623 static u32
624 shadow_l4_index(mfn_t *smfn, u32 guest_index)
625 {
626 return guest_index;
627 }
629 #endif // GUEST_PAGING_LEVELS >= 4
632 /**************************************************************************/
633 /* Function which computes shadow entries from their corresponding guest
634 * entries. This is the "heart" of the shadow code. It operates using
635 * level-1 shadow types, but handles all levels of entry.
636 * Don't call it directly, but use the four wrappers below.
637 */
639 static always_inline void
640 _sh_propagate(struct vcpu *v,
641 void *guest_entry_ptr,
642 mfn_t guest_table_mfn,
643 mfn_t target_mfn,
644 void *shadow_entry_ptr,
645 int level,
646 fetch_type_t ft,
647 int mmio)
648 {
649 guest_l1e_t *gp = guest_entry_ptr;
650 shadow_l1e_t *sp = shadow_entry_ptr;
651 struct domain *d = v->domain;
652 u32 pass_thru_flags;
653 u32 gflags, sflags;
655 /* We don't shadow PAE l3s */
656 ASSERT(GUEST_PAGING_LEVELS > 3 || level != 3);
658 if ( mfn_valid(guest_table_mfn) )
659 /* Handle A and D bit propagation into the guest */
660 gflags = guest_set_ad_bits(v, guest_table_mfn, gp, level, ft);
661 else
662 {
663 /* Must be an fl1e or a prefetch */
664 ASSERT(level==1 || !(ft & FETCH_TYPE_DEMAND));
665 gflags = guest_l1e_get_flags(*gp);
666 }
668 if ( unlikely(!(gflags & _PAGE_PRESENT)) )
669 {
670 /* If a guest l1 entry is not present, shadow with the magic
671 * guest-not-present entry. */
672 if ( level == 1 )
673 *sp = sh_l1e_gnp();
674 else
675 *sp = shadow_l1e_empty();
676 goto done;
677 }
679 if ( level == 1 && mmio )
680 {
681 /* Guest l1e maps MMIO space */
682 *sp = sh_l1e_mmio(guest_l1e_get_gfn(*gp), gflags);
683 goto done;
684 }
686 // Must have a valid target_mfn, unless this is a prefetch. In the
687 // case of a prefetch, an invalid mfn means that we can not usefully
688 // shadow anything, and so we return early.
689 //
690 if ( !mfn_valid(target_mfn) )
691 {
692 ASSERT((ft == ft_prefetch));
693 *sp = shadow_l1e_empty();
694 goto done;
695 }
697 // Propagate bits from the guest to the shadow.
698 // Some of these may be overwritten, below.
699 // Since we know the guest's PRESENT bit is set, we also set the shadow's
700 // SHADOW_PRESENT bit.
701 //
702 pass_thru_flags = (_PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_USER |
703 _PAGE_RW | _PAGE_PRESENT);
704 if ( guest_supports_nx(v) )
705 pass_thru_flags |= _PAGE_NX_BIT;
706 sflags = gflags & pass_thru_flags;
708 // Set the A&D bits for higher level shadows.
709 // Higher level entries do not, strictly speaking, have dirty bits, but
710 // since we use shadow linear tables, each of these entries may, at some
711 // point in time, also serve as a shadow L1 entry.
712 // By setting both the A&D bits in each of these, we eliminate the burden
713 // on the hardware to update these bits on initial accesses.
714 //
715 if ( (level > 1) && !((SHADOW_PAGING_LEVELS == 3) && (level == 3)) )
716 sflags |= _PAGE_ACCESSED | _PAGE_DIRTY;
718 // If the A or D bit has not yet been set in the guest, then we must
719 // prevent the corresponding kind of access.
720 //
721 if ( unlikely(!(gflags & _PAGE_ACCESSED)) )
722 sflags &= ~_PAGE_PRESENT;
724 /* D bits exist in L1es and PSE L2es */
725 if ( unlikely(((level == 1) ||
726 ((level == 2) &&
727 (gflags & _PAGE_PSE) &&
728 guest_supports_superpages(v)))
729 && !(gflags & _PAGE_DIRTY)) )
730 sflags &= ~_PAGE_RW;
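/* Illustrative example (not part of the original file): a guest l1e that is
 * present and writable but has A and D clear is shadowed not-present; the
 * first access faults, guest_set_ad_bits() sets A, and the entry is
 * re-shadowed read-only until a write fault sets D as well. */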
732 // shadow_mode_log_dirty support
733 //
734 // Only allow the guest write access to a page a) on a demand fault,
735 // or b) if the page is already marked as dirty.
736 //
737 if ( unlikely((level == 1) && shadow_mode_log_dirty(d)) )
738 {
739 if ( ft & FETCH_TYPE_WRITE )
740 sh_mark_dirty(d, target_mfn);
741 else if ( !sh_mfn_is_dirty(d, target_mfn) )
742 sflags &= ~_PAGE_RW;
743 }
745 // protect guest page tables
746 //
747 if ( unlikely((level == 1) && sh_mfn_is_a_page_table(target_mfn)) )
748 {
749 if ( shadow_mode_trap_reads(d) )
750 {
751 // if we are trapping both reads & writes, then mark this page
752 // as not present...
753 //
754 sflags &= ~_PAGE_PRESENT;
755 }
756 else
757 {
758 // otherwise, just prevent any writes...
759 //
760 sflags &= ~_PAGE_RW;
761 }
762 }
764 // PV guests in 64-bit mode use two different page tables for user vs
765 // supervisor permissions, making the guest's _PAGE_USER bit irrelevant.
766 // It is always shadowed as present...
767 if ( (GUEST_PAGING_LEVELS == 4) && !is_hvm_domain(d) )
768 {
769 sflags |= _PAGE_USER;
770 }
772 *sp = shadow_l1e_from_mfn(target_mfn, sflags);
773 done:
774 SHADOW_DEBUG(PROPAGATE,
775 "%s level %u guest %" SH_PRI_gpte " shadow %" SH_PRI_pte "\n",
776 fetch_type_names[ft], level, gp->l1, sp->l1);
777 }
780 /* These four wrappers give us a little bit of type-safety back around the
781 * use of void-* pointers in _sh_propagate(), and allow the compiler to
782 * optimize out some level checks. */
784 #if GUEST_PAGING_LEVELS >= 4
785 static void
786 l4e_propagate_from_guest(struct vcpu *v,
787 guest_l4e_t *gl4e,
788 mfn_t gl4mfn,
789 mfn_t sl3mfn,
790 shadow_l4e_t *sl4e,
791 fetch_type_t ft)
792 {
793 _sh_propagate(v, gl4e, gl4mfn, sl3mfn, sl4e, 4, ft, 0);
794 }
796 static void
797 l3e_propagate_from_guest(struct vcpu *v,
798 guest_l3e_t *gl3e,
799 mfn_t gl3mfn,
800 mfn_t sl2mfn,
801 shadow_l3e_t *sl3e,
802 fetch_type_t ft)
803 {
804 _sh_propagate(v, gl3e, gl3mfn, sl2mfn, sl3e, 3, ft, 0);
805 }
806 #endif // GUEST_PAGING_LEVELS >= 4
808 static void
809 l2e_propagate_from_guest(struct vcpu *v,
810 guest_l2e_t *gl2e,
811 mfn_t gl2mfn,
812 mfn_t sl1mfn,
813 shadow_l2e_t *sl2e,
814 fetch_type_t ft)
815 {
816 _sh_propagate(v, gl2e, gl2mfn, sl1mfn, sl2e, 2, ft, 0);
817 }
819 static void
820 l1e_propagate_from_guest(struct vcpu *v,
821 guest_l1e_t *gl1e,
822 mfn_t gl1mfn,
823 mfn_t gmfn,
824 shadow_l1e_t *sl1e,
825 fetch_type_t ft,
826 int mmio)
827 {
828 _sh_propagate(v, gl1e, gl1mfn, gmfn, sl1e, 1, ft, mmio);
829 }
832 /**************************************************************************/
833 /* These functions update shadow entries (and do bookkeeping on the shadow
834 * tables they are in). It is intended that they are the only
835 * functions which ever write (non-zero) data onto a shadow page.
836 */
838 static inline void safe_write_entry(void *dst, void *src)
839 /* Copy one PTE safely when processors might be running on the
840 * destination pagetable. This does *not* give safety against
841 * concurrent writes (that's what the shadow lock is for), just
842 * stops the hardware picking up partially written entries. */
843 {
844 volatile unsigned long *d = dst;
845 unsigned long *s = src;
846 ASSERT(!((unsigned long) d & (sizeof (shadow_l1e_t) - 1)));
847 #if CONFIG_PAGING_LEVELS == 3
848 /* In PAE mode, pagetable entries are larger
849 * than machine words, so won't get written atomically. We need to make
850 * sure any other cpu running on these shadows doesn't see a
851 * half-written entry. Do this by marking the entry not-present first,
852 * then writing the high word before the low word. */
853 BUILD_BUG_ON(sizeof (shadow_l1e_t) != 2 * sizeof (unsigned long));
854 d[0] = 0;
855 d[1] = s[1];
856 d[0] = s[0];
857 #else
858 /* In 32-bit and 64-bit, sizeof(pte) == sizeof(ulong) == 1 word,
859 * which will be an atomic write, since the entry is aligned. */
860 BUILD_BUG_ON(sizeof (shadow_l1e_t) != sizeof (unsigned long));
861 *d = *s;
862 #endif
863 }
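/* Illustrative note (not part of the original file): with the PAE ordering
 * above, another processor can observe the old entry, a not-present entry
 * (low word zero), or the complete new entry, but never a mix of old and new
 * words, because the low word, which holds _PAGE_PRESENT, is cleared first
 * and written last. */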
866 static inline void
867 shadow_write_entries(void *d, void *s, int entries, mfn_t mfn)
868 /* This function does the actual writes to shadow pages.
869 * It must not be called directly, since it doesn't do the bookkeeping
870 * that shadow_set_l*e() functions do. */
871 {
872 shadow_l1e_t *dst = d;
873 shadow_l1e_t *src = s;
874 void *map = NULL;
875 int i;
877 /* Because we mirror access rights at all levels in the shadow, an
878 * l2 (or higher) entry with the RW bit cleared will leave us with
879 * no write access through the linear map.
880 * We detect that by writing to the shadow with copy_to_user() and
881 * using map_domain_page() to get a writeable mapping if we need to. */
882 if ( __copy_to_user(d, d, sizeof (unsigned long)) != 0 )
883 {
884 perfc_incrc(shadow_linear_map_failed);
885 map = sh_map_domain_page(mfn);
886 ASSERT(map != NULL);
887 dst = map + ((unsigned long)dst & (PAGE_SIZE - 1));
888 }
891 for ( i = 0; i < entries; i++ )
892 safe_write_entry(dst++, src++);
894 if ( map != NULL ) sh_unmap_domain_page(map);
895 }
897 static inline int
898 perms_strictly_increased(u32 old_flags, u32 new_flags)
899 /* Given the flags of two entries, are the new flags a strict
900 * increase in rights over the old ones? */
901 {
902 u32 of = old_flags & (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX);
903 u32 nf = new_flags & (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX);
904 /* Flip the NX bit, since it's the only one that decreases rights;
905 * we calculate as if it were an "X" bit. */
906 of ^= _PAGE_NX_BIT;
907 nf ^= _PAGE_NX_BIT;
908 /* If the changed bits are all set in the new flags, then rights strictly
909 * increased between old and new. */
910 return ((of | (of ^ nf)) == nf);
911 }
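/* Worked examples (not part of the original file), writing P/RW/US/NX for
 * the four flags considered above:
 *   old = P        new = P|RW   -> 1 (write access added, nothing lost)
 *   old = P|US     new = P|RW   -> 0 (RW gained but USER lost)
 *   old = P|NX     new = P      -> 1 (clearing NX only adds execute rights)
 */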
913 static int inline
914 shadow_get_page_from_l1e(shadow_l1e_t sl1e, struct domain *d)
915 {
916 int res;
917 mfn_t mfn;
918 struct domain *owner;
920 ASSERT(!sh_l1e_is_magic(sl1e));
922 if ( !shadow_mode_refcounts(d) )
923 return 1;
925 res = get_page_from_l1e(sl1e, d);
927 // If a privileged domain is attempting to install a map of a page it does
928 // not own, we let it succeed anyway.
929 //
930 if ( unlikely(!res) &&
931 IS_PRIV(d) &&
932 !shadow_mode_translate(d) &&
933 mfn_valid(mfn = shadow_l1e_get_mfn(sl1e)) &&
934 (owner = page_get_owner(mfn_to_page(mfn))) &&
935 (d != owner) )
936 {
937 res = get_page_from_l1e(sl1e, owner);
938 SHADOW_PRINTK("privileged domain %d installs map of mfn %05lx "
939 "which is owned by domain %d: %s\n",
940 d->domain_id, mfn_x(mfn), owner->domain_id,
941 res ? "success" : "failed");
942 }
944 if ( unlikely(!res) )
945 {
946 perfc_incrc(shadow_get_page_fail);
947 SHADOW_PRINTK("failed: l1e=" SH_PRI_pte "\n", sl1e.l1);
948 }
950 return res;
951 }
953 static void inline
954 shadow_put_page_from_l1e(shadow_l1e_t sl1e, struct domain *d)
955 {
956 if ( !shadow_mode_refcounts(d) )
957 return;
959 put_page_from_l1e(sl1e, d);
960 }
962 #if GUEST_PAGING_LEVELS >= 4
963 static int shadow_set_l4e(struct vcpu *v,
964 shadow_l4e_t *sl4e,
965 shadow_l4e_t new_sl4e,
966 mfn_t sl4mfn)
967 {
968 int flags = 0, ok;
969 shadow_l4e_t old_sl4e;
970 paddr_t paddr;
971 ASSERT(sl4e != NULL);
972 old_sl4e = *sl4e;
974 if ( old_sl4e.l4 == new_sl4e.l4 ) return 0; /* Nothing to do */
976 paddr = ((((paddr_t)mfn_x(sl4mfn)) << PAGE_SHIFT)
977 | (((unsigned long)sl4e) & ~PAGE_MASK));
979 if ( shadow_l4e_get_flags(new_sl4e) & _PAGE_PRESENT )
980 {
981 /* About to install a new reference */
982 mfn_t sl3mfn = shadow_l4e_get_mfn(new_sl4e);
983 ok = sh_get_ref(v, sl3mfn, paddr);
984 /* Are we pinning l3 shadows to handle weird linux behaviour? */
985 if ( sh_type_is_pinnable(v, SH_type_l3_64_shadow) )
986 ok |= sh_pin(v, sl3mfn);
987 if ( !ok )
988 {
989 domain_crash(v->domain);
990 return SHADOW_SET_ERROR;
991 }
992 }
994 /* Write the new entry */
995 shadow_write_entries(sl4e, &new_sl4e, 1, sl4mfn);
996 flags |= SHADOW_SET_CHANGED;
998 if ( shadow_l4e_get_flags(old_sl4e) & _PAGE_PRESENT )
999 {
1000 /* We lost a reference to an old mfn. */
1001 mfn_t osl3mfn = shadow_l4e_get_mfn(old_sl4e);
1002 if ( (mfn_x(osl3mfn) != mfn_x(shadow_l4e_get_mfn(new_sl4e)))
1003 || !perms_strictly_increased(shadow_l4e_get_flags(old_sl4e),
1004 shadow_l4e_get_flags(new_sl4e)) )
1006 flags |= SHADOW_SET_FLUSH;
1008 sh_put_ref(v, osl3mfn, paddr);
1010 return flags;
1013 static int shadow_set_l3e(struct vcpu *v,
1014 shadow_l3e_t *sl3e,
1015 shadow_l3e_t new_sl3e,
1016 mfn_t sl3mfn)
1018 int flags = 0;
1019 shadow_l3e_t old_sl3e;
1020 paddr_t paddr;
1021 ASSERT(sl3e != NULL);
1022 old_sl3e = *sl3e;
1024 if ( old_sl3e.l3 == new_sl3e.l3 ) return 0; /* Nothing to do */
1026 paddr = ((((paddr_t)mfn_x(sl3mfn)) << PAGE_SHIFT)
1027 | (((unsigned long)sl3e) & ~PAGE_MASK));
1029 if ( shadow_l3e_get_flags(new_sl3e) & _PAGE_PRESENT )
1030 /* About to install a new reference */
1031 if ( !sh_get_ref(v, shadow_l3e_get_mfn(new_sl3e), paddr) )
1033 domain_crash(v->domain);
1034 return SHADOW_SET_ERROR;
1037 /* Write the new entry */
1038 shadow_write_entries(sl3e, &new_sl3e, 1, sl3mfn);
1039 flags |= SHADOW_SET_CHANGED;
1041 if ( shadow_l3e_get_flags(old_sl3e) & _PAGE_PRESENT )
1043 /* We lost a reference to an old mfn. */
1044 mfn_t osl2mfn = shadow_l3e_get_mfn(old_sl3e);
1045 if ( (mfn_x(osl2mfn) != mfn_x(shadow_l3e_get_mfn(new_sl3e))) ||
1046 !perms_strictly_increased(shadow_l3e_get_flags(old_sl3e),
1047 shadow_l3e_get_flags(new_sl3e)) )
1049 flags |= SHADOW_SET_FLUSH;
1051 sh_put_ref(v, osl2mfn, paddr);
1053 return flags;
1055 #endif /* GUEST_PAGING_LEVELS >= 4 */
1057 static int shadow_set_l2e(struct vcpu *v,
1058 shadow_l2e_t *sl2e,
1059 shadow_l2e_t new_sl2e,
1060 mfn_t sl2mfn)
1062 int flags = 0;
1063 shadow_l2e_t old_sl2e;
1064 paddr_t paddr;
1066 #if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2
1067 /* In 2-on-3 we work with pairs of l2es pointing at two-page
1068 * shadows. Reference counting and up-pointers track from the first
1069 * page of the shadow to the first l2e, so make sure that we're
1070 * working with those:
1071 * Align the pointer down so it's pointing at the first of the pair */
1072 sl2e = (shadow_l2e_t *)((unsigned long)sl2e & ~(sizeof(shadow_l2e_t)));
1073 /* Align the mfn of the shadow entry too */
1074 new_sl2e.l2 &= ~(1<<PAGE_SHIFT);
1075 #endif
1077 ASSERT(sl2e != NULL);
1078 old_sl2e = *sl2e;
1080 if ( old_sl2e.l2 == new_sl2e.l2 ) return 0; /* Nothing to do */
1082 paddr = ((((paddr_t)mfn_x(sl2mfn)) << PAGE_SHIFT)
1083 | (((unsigned long)sl2e) & ~PAGE_MASK));
1085 if ( shadow_l2e_get_flags(new_sl2e) & _PAGE_PRESENT )
1086 /* About to install a new reference */
1087 if ( !sh_get_ref(v, shadow_l2e_get_mfn(new_sl2e), paddr) )
1089 domain_crash(v->domain);
1090 return SHADOW_SET_ERROR;
1093 /* Write the new entry */
1094 #if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2
1096 shadow_l2e_t pair[2] = { new_sl2e, new_sl2e };
1097 /* The l1 shadow is two pages long and needs to be pointed to by
1098 * two adjacent sl2es. The pair have the same flags, but point
1099 * at the even and odd MFNs respectively */
1100 ASSERT(!(pair[0].l2 & (1<<PAGE_SHIFT)));
1101 pair[1].l2 |= (1<<PAGE_SHIFT);
1102 shadow_write_entries(sl2e, &pair, 2, sl2mfn);
1104 #else /* normal case */
1105 shadow_write_entries(sl2e, &new_sl2e, 1, sl2mfn);
1106 #endif
1107 flags |= SHADOW_SET_CHANGED;
1109 if ( shadow_l2e_get_flags(old_sl2e) & _PAGE_PRESENT )
1111 /* We lost a reference to an old mfn. */
1112 mfn_t osl1mfn = shadow_l2e_get_mfn(old_sl2e);
1113 if ( (mfn_x(osl1mfn) != mfn_x(shadow_l2e_get_mfn(new_sl2e))) ||
1114 !perms_strictly_increased(shadow_l2e_get_flags(old_sl2e),
1115 shadow_l2e_get_flags(new_sl2e)) )
1117 flags |= SHADOW_SET_FLUSH;
1119 sh_put_ref(v, osl1mfn, paddr);
1121 return flags;
1124 static int shadow_set_l1e(struct vcpu *v,
1125 shadow_l1e_t *sl1e,
1126 shadow_l1e_t new_sl1e,
1127 mfn_t sl1mfn)
1129 int flags = 0;
1130 struct domain *d = v->domain;
1131 shadow_l1e_t old_sl1e;
1132 ASSERT(sl1e != NULL);
1134 old_sl1e = *sl1e;
1136 if ( old_sl1e.l1 == new_sl1e.l1 ) return 0; /* Nothing to do */
1138 if ( (shadow_l1e_get_flags(new_sl1e) & _PAGE_PRESENT)
1139 && !sh_l1e_is_magic(new_sl1e) )
1141 /* About to install a new reference */
1142 if ( shadow_mode_refcounts(d) ) {
1143 if ( shadow_get_page_from_l1e(new_sl1e, d) == 0 )
1145 /* Doesn't look like a pagetable. */
1146 flags |= SHADOW_SET_ERROR;
1147 new_sl1e = shadow_l1e_empty();
1152 /* Write the new entry */
1153 shadow_write_entries(sl1e, &new_sl1e, 1, sl1mfn);
1154 flags |= SHADOW_SET_CHANGED;
1156 if ( (shadow_l1e_get_flags(old_sl1e) & _PAGE_PRESENT)
1157 && !sh_l1e_is_magic(old_sl1e) )
1159 /* We lost a reference to an old mfn. */
1160 /* N.B. Unlike higher-level sets, never need an extra flush
1161 * when writing an l1e. Because it points to the same guest frame
1162 * as the guest l1e did, it's the guest's responsibility to
1163 * trigger a flush later. */
1164 if ( shadow_mode_refcounts(d) )
1166 shadow_put_page_from_l1e(old_sl1e, d);
1169 return flags;
1173 /**************************************************************************/
1174 /* Macros to walk pagetables. These take the shadow of a pagetable and
1175 * walk every "interesting" entry. That is, they don't touch Xen mappings,
1176 * and for 32-bit l2s shadowed onto PAE or 64-bit, they only touch every
1177 * second entry (since pairs of entries are managed together). For multi-page
1178 * shadows they walk all pages.
1180 * Arguments are an MFN, the variable to point to each entry, a variable
1181 * to indicate that we are done (we will shortcut to the end of the scan
1182 * when _done != 0), a variable to indicate that we should avoid Xen mappings,
1183 * and the code.
1185 * WARNING: These macros have side-effects. They change the values of both
1186 * the pointer and the MFN. */
1188 static inline void increment_ptr_to_guest_entry(void *ptr)
1190 if ( ptr )
1192 guest_l1e_t **entry = ptr;
1193 (*entry)++;
1197 /* All kinds of l1: touch all entries */
1198 #define _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code) \
1199 do { \
1200 int _i; \
1201 shadow_l1e_t *_sp = map_shadow_page((_sl1mfn)); \
1202 ASSERT(mfn_to_shadow_page(_sl1mfn)->type == SH_type_l1_shadow \
1203 || mfn_to_shadow_page(_sl1mfn)->type == SH_type_fl1_shadow); \
1204 for ( _i = 0; _i < SHADOW_L1_PAGETABLE_ENTRIES; _i++ ) \
1205 { \
1206 (_sl1e) = _sp + _i; \
1207 if ( shadow_l1e_get_flags(*(_sl1e)) & _PAGE_PRESENT ) \
1208 {_code} \
1209 if ( _done ) break; \
1210 increment_ptr_to_guest_entry(_gl1p); \
1211 } \
1212 unmap_shadow_page(_sp); \
1213 } while (0)
1215 /* 32-bit l1, on PAE or 64-bit shadows: need to walk both pages of shadow */
1216 #if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2
1217 #define SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code) \
1218 do { \
1219 int __done = 0; \
1220 _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, \
1221 ({ (__done = _done); }), _code); \
1222 _sl1mfn = _mfn(mfn_x(_sl1mfn) + 1); \
1223 if ( !__done ) \
1224 _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, \
1225 ({ (__done = _done); }), _code); \
1226 } while (0)
1227 #else /* Everything else; l1 shadows are only one page */
1228 #define SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code) \
1229 _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code)
1230 #endif
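/* Illustrative sketch (not part of the original file): how the walker is
 * used by callers in later parts of this file.  This hypothetical helper
 * counts the present entries in an l1 shadow; _gl1p may be NULL when the
 * guest entries are not being walked in step. */
#if 0
static int sh_count_present_sl1es(mfn_t sl1mfn)
{
    shadow_l1e_t *sl1e;
    int done = 0, count = 0;
    SHADOW_FOREACH_L1E(sl1mfn, sl1e, NULL, done, { count++; });
    return count;
}
#endif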
1233 #if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2
1235 /* 32-bit l2 on PAE/64: four pages, touch every second entry, and avoid Xen */
1236 #define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _xen, _code) \
1237 do { \
1238 int _i, _j, __done = 0; \
1239 ASSERT(mfn_to_shadow_page(_sl2mfn)->type == SH_type_l2_32_shadow); \
1240 for ( _j = 0; _j < 4 && !__done; _j++ ) \
1241 { \
1242 shadow_l2e_t *_sp = map_shadow_page(_sl2mfn); \
1243 for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i += 2 ) \
1244 if ( (!(_xen)) \
1245 || ((_j * SHADOW_L2_PAGETABLE_ENTRIES) + _i) \
1246 < (HYPERVISOR_VIRT_START >> SHADOW_L2_PAGETABLE_SHIFT) ) \
1247 { \
1248 (_sl2e) = _sp + _i; \
1249 if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \
1250 {_code} \
1251 if ( (__done = (_done)) ) break; \
1252 increment_ptr_to_guest_entry(_gl2p); \
1253 } \
1254 unmap_shadow_page(_sp); \
1255 _sl2mfn = _mfn(mfn_x(_sl2mfn) + 1); \
1256 } \
1257 } while (0)
1259 #elif GUEST_PAGING_LEVELS == 2
1261 /* 32-bit on 32-bit: avoid Xen entries */
1262 #define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _xen, _code) \
1263 do { \
1264 int _i; \
1265 shadow_l2e_t *_sp = map_shadow_page((_sl2mfn)); \
1266 ASSERT(mfn_to_shadow_page(_sl2mfn)->type == SH_type_l2_32_shadow); \
1267 for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i++ ) \
1268 if ( (!(_xen)) \
1269 || \
1270 (_i < (HYPERVISOR_VIRT_START >> SHADOW_L2_PAGETABLE_SHIFT)) ) \
1271 { \
1272 (_sl2e) = _sp + _i; \
1273 if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \
1274 {_code} \
1275 if ( _done ) break; \
1276 increment_ptr_to_guest_entry(_gl2p); \
1277 } \
1278 unmap_shadow_page(_sp); \
1279 } while (0)
1281 #elif GUEST_PAGING_LEVELS == 3
1283 /* PAE: if it's an l2h, don't touch Xen mappings */
1284 #define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _xen, _code) \
1285 do { \
1286 int _i; \
1287 shadow_l2e_t *_sp = map_shadow_page((_sl2mfn)); \
1288 ASSERT(mfn_to_shadow_page(_sl2mfn)->type == SH_type_l2_pae_shadow \
1289 || mfn_to_shadow_page(_sl2mfn)->type == SH_type_l2h_pae_shadow);\
1290 for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i++ ) \
1291 if ( (!(_xen)) \
1292 || mfn_to_shadow_page(_sl2mfn)->type != SH_type_l2h_pae_shadow\
1293 || ((_i + (3 * SHADOW_L2_PAGETABLE_ENTRIES)) \
1294 < (HYPERVISOR_VIRT_START >> SHADOW_L2_PAGETABLE_SHIFT)) ) \
1295 { \
1296 (_sl2e) = _sp + _i; \
1297 if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \
1298 {_code} \
1299 if ( _done ) break; \
1300 increment_ptr_to_guest_entry(_gl2p); \
1301 } \
1302 unmap_shadow_page(_sp); \
1303 } while (0)
1305 #else
1307 /* 64-bit l2: touch all entries */
1308 #define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _xen, _code) \
1309 do { \
1310 int _i; \
1311 shadow_l2e_t *_sp = map_shadow_page((_sl2mfn)); \
1312 ASSERT(mfn_to_shadow_page(_sl2mfn)->type == SH_type_l2_64_shadow); \
1313 for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i++ ) \
1314 { \
1315 (_sl2e) = _sp + _i; \
1316 if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \
1317 {_code} \
1318 if ( _done ) break; \
1319 increment_ptr_to_guest_entry(_gl2p); \
1320 } \
1321 unmap_shadow_page(_sp); \
1322 } while (0)
1324 #endif /* different kinds of l2 */
1326 #if GUEST_PAGING_LEVELS == 4
1328 /* 64-bit l3: touch all entries */
1329 #define SHADOW_FOREACH_L3E(_sl3mfn, _sl3e, _gl3p, _done, _code) \
1330 do { \
1331 int _i; \
1332 shadow_l3e_t *_sp = map_shadow_page((_sl3mfn)); \
1333 ASSERT(mfn_to_shadow_page(_sl3mfn)->type == SH_type_l3_64_shadow); \
1334 for ( _i = 0; _i < SHADOW_L3_PAGETABLE_ENTRIES; _i++ ) \
1335 { \
1336 (_sl3e) = _sp + _i; \
1337 if ( shadow_l3e_get_flags(*(_sl3e)) & _PAGE_PRESENT ) \
1338 {_code} \
1339 if ( _done ) break; \
1340 increment_ptr_to_guest_entry(_gl3p); \
1341 } \
1342 unmap_shadow_page(_sp); \
1343 } while (0)
1345 /* 64-bit l4: avoid Xen mappings */
1346 #define SHADOW_FOREACH_L4E(_sl4mfn, _sl4e, _gl4p, _done, _xen, _code) \
1347 do { \
1348 int _i; \
1349 shadow_l4e_t *_sp = map_shadow_page((_sl4mfn)); \
1350 ASSERT(mfn_to_shadow_page(_sl4mfn)->type == SH_type_l4_64_shadow); \
1351 for ( _i = 0; _i < SHADOW_L4_PAGETABLE_ENTRIES; _i++ ) \
1352 { \
1353 if ( (!(_xen)) || is_guest_l4_slot(_i) ) \
1354 { \
1355 (_sl4e) = _sp + _i; \
1356 if ( shadow_l4e_get_flags(*(_sl4e)) & _PAGE_PRESENT ) \
1357 {_code} \
1358 if ( _done ) break; \
1359 } \
1360 increment_ptr_to_guest_entry(_gl4p); \
1361 } \
1362 unmap_shadow_page(_sp); \
1363 } while (0)
1365 #endif
1369 /**************************************************************************/
1370 /* Functions to install Xen mappings and linear mappings in shadow pages */
1372 // XXX -- this function should probably be moved to shadow-common.c, but that
1373 // probably wants to wait until the shadow types have been moved from
1374 // shadow-types.h to shadow-private.h
1375 //
1376 #if CONFIG_PAGING_LEVELS == 4 && GUEST_PAGING_LEVELS == 4
1377 void sh_install_xen_entries_in_l4(struct vcpu *v, mfn_t gl4mfn, mfn_t sl4mfn)
1379 struct domain *d = v->domain;
1380 shadow_l4e_t *sl4e;
1382 sl4e = sh_map_domain_page(sl4mfn);
1383 ASSERT(sl4e != NULL);
1384 ASSERT(sizeof (l4_pgentry_t) == sizeof (shadow_l4e_t));
1386 /* Copy the common Xen mappings from the idle domain */
1387 memcpy(&sl4e[ROOT_PAGETABLE_FIRST_XEN_SLOT],
1388 &idle_pg_table[ROOT_PAGETABLE_FIRST_XEN_SLOT],
1389 ROOT_PAGETABLE_XEN_SLOTS * sizeof(l4_pgentry_t));
1391 /* Install the per-domain mappings for this domain */
1392 sl4e[shadow_l4_table_offset(PERDOMAIN_VIRT_START)] =
1393 shadow_l4e_from_mfn(page_to_mfn(virt_to_page(d->arch.mm_perdomain_l3)),
1394 __PAGE_HYPERVISOR);
1396 /* Linear mapping */
1397 sl4e[shadow_l4_table_offset(SH_LINEAR_PT_VIRT_START)] =
1398 shadow_l4e_from_mfn(sl4mfn, __PAGE_HYPERVISOR);
1400 if ( shadow_mode_translate(v->domain) && !shadow_mode_external(v->domain) )
1402 // linear tables may not be used with translated PV guests
1403 sl4e[shadow_l4_table_offset(LINEAR_PT_VIRT_START)] =
1404 shadow_l4e_empty();
1406 else
1408 sl4e[shadow_l4_table_offset(LINEAR_PT_VIRT_START)] =
1409 shadow_l4e_from_mfn(gl4mfn, __PAGE_HYPERVISOR);
1412 if ( shadow_mode_translate(v->domain) )
1414 /* install domain-specific P2M table */
1415 sl4e[shadow_l4_table_offset(RO_MPT_VIRT_START)] =
1416 shadow_l4e_from_mfn(pagetable_get_mfn(d->arch.phys_table),
1417 __PAGE_HYPERVISOR);
1420 sh_unmap_domain_page(sl4e);
1422 #endif
1424 #if (CONFIG_PAGING_LEVELS == 3 || defined(CONFIG_COMPAT)) && GUEST_PAGING_LEVELS == 3
1425 // For 3-on-3 PV guests, we need to make sure the xen mappings are in
1426 // place, which means that we need to populate the l2h entry in the l3
1427 // table.
1429 void sh_install_xen_entries_in_l2h(struct vcpu *v,
1430 mfn_t sl2hmfn)
1432 struct domain *d = v->domain;
1433 shadow_l2e_t *sl2e;
1434 #if CONFIG_PAGING_LEVELS == 3
1435 int i;
1436 #else
1438 if ( !pv_32bit_guest(v) )
1439 return;
1440 #endif
1442 sl2e = sh_map_domain_page(sl2hmfn);
1443 ASSERT(sl2e != NULL);
1444 ASSERT(sizeof (l2_pgentry_t) == sizeof (shadow_l2e_t));
1446 #if CONFIG_PAGING_LEVELS == 3
1448 /* Copy the common Xen mappings from the idle domain */
1449 memcpy(&sl2e[L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1)],
1450 &idle_pg_table_l2[L2_PAGETABLE_FIRST_XEN_SLOT],
1451 L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
1453 /* Install the per-domain mappings for this domain */
1454 for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
1455 sl2e[shadow_l2_table_offset(PERDOMAIN_VIRT_START) + i] =
1456 shadow_l2e_from_mfn(
1457 page_to_mfn(virt_to_page(d->arch.mm_perdomain_pt) + i),
1458 __PAGE_HYPERVISOR);
1460 /* We don't set up a linear mapping here because we can't until this
1461 * l2h is installed in an l3e. sh_update_linear_entries() handles
1462 * the linear mappings when CR3 (and so the fourth l3e) is loaded.
1463 * We zero them here, just as a safety measure.
1464 */
1465 for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
1466 sl2e[shadow_l2_table_offset(LINEAR_PT_VIRT_START) + i] =
1467 shadow_l2e_empty();
1468 for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
1469 sl2e[shadow_l2_table_offset(SH_LINEAR_PT_VIRT_START) + i] =
1470 shadow_l2e_empty();
1472 if ( shadow_mode_translate(d) )
1474 /* Install the domain-specific p2m table */
1475 l3_pgentry_t *p2m;
1476 ASSERT(pagetable_get_pfn(d->arch.phys_table) != 0);
1477 p2m = sh_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
1478 for ( i = 0; i < MACHPHYS_MBYTES>>1; i++ )
1480 sl2e[shadow_l2_table_offset(RO_MPT_VIRT_START) + i] =
1481 (l3e_get_flags(p2m[i]) & _PAGE_PRESENT)
1482 ? shadow_l2e_from_mfn(_mfn(l3e_get_pfn(p2m[i])),
1483 __PAGE_HYPERVISOR)
1484 : shadow_l2e_empty();
1486 sh_unmap_domain_page(p2m);
1489 #else
1491 /* Copy the common Xen mappings from the idle domain */
1492 memcpy(&sl2e[COMPAT_L2_PAGETABLE_FIRST_XEN_SLOT(d)],
1493 &compat_idle_pg_table_l2[l2_table_offset(HIRO_COMPAT_MPT_VIRT_START)],
1494 COMPAT_L2_PAGETABLE_XEN_SLOTS(d) * sizeof(*sl2e));
1496 #endif
1498 sh_unmap_domain_page(sl2e);
1500 #endif
1503 #if CONFIG_PAGING_LEVELS == 2 && GUEST_PAGING_LEVELS == 2
1504 void sh_install_xen_entries_in_l2(struct vcpu *v, mfn_t gl2mfn, mfn_t sl2mfn)
1506 struct domain *d = v->domain;
1507 shadow_l2e_t *sl2e;
1508 int i;
1510 sl2e = sh_map_domain_page(sl2mfn);
1511 ASSERT(sl2e != NULL);
1512 ASSERT(sizeof (l2_pgentry_t) == sizeof (shadow_l2e_t));
1514 /* Copy the common Xen mappings from the idle domain */
1515 memcpy(&sl2e[L2_PAGETABLE_FIRST_XEN_SLOT],
1516 &idle_pg_table[L2_PAGETABLE_FIRST_XEN_SLOT],
1517 L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
1519 /* Install the per-domain mappings for this domain */
1520 for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
1521 sl2e[shadow_l2_table_offset(PERDOMAIN_VIRT_START) + i] =
1522 shadow_l2e_from_mfn(
1523 page_to_mfn(virt_to_page(d->arch.mm_perdomain_pt) + i),
1524 __PAGE_HYPERVISOR);
1526 /* Linear mapping */
1527 sl2e[shadow_l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
1528 shadow_l2e_from_mfn(sl2mfn, __PAGE_HYPERVISOR);
1530 if ( shadow_mode_translate(v->domain) && !shadow_mode_external(v->domain) )
1532 // linear tables may not be used with translated PV guests
1533 sl2e[shadow_l2_table_offset(LINEAR_PT_VIRT_START)] =
1534 shadow_l2e_empty();
1536 else
1538 sl2e[shadow_l2_table_offset(LINEAR_PT_VIRT_START)] =
1539 shadow_l2e_from_mfn(gl2mfn, __PAGE_HYPERVISOR);
1542 if ( shadow_mode_translate(d) )
1544 /* install domain-specific P2M table */
1545 sl2e[shadow_l2_table_offset(RO_MPT_VIRT_START)] =
1546 shadow_l2e_from_mfn(pagetable_get_mfn(d->arch.phys_table),
1547 __PAGE_HYPERVISOR);
1550 sh_unmap_domain_page(sl2e);
1552 #endif
1556 /**************************************************************************/
1557 /* Create a shadow of a given guest page.
1558 */
1559 static mfn_t
1560 sh_make_shadow(struct vcpu *v, mfn_t gmfn, u32 shadow_type)
1562 mfn_t smfn = shadow_alloc(v->domain, shadow_type, mfn_x(gmfn));
1563 SHADOW_DEBUG(MAKE_SHADOW, "(%05lx, %u)=>%05lx\n",
1564 mfn_x(gmfn), shadow_type, mfn_x(smfn));
1566 if ( shadow_type != SH_type_l2_32_shadow
1567 && shadow_type != SH_type_l2_pae_shadow
1568 && shadow_type != SH_type_l2h_pae_shadow
1569 && shadow_type != SH_type_l4_64_shadow )
1570 /* Lower-level shadow, not yet linked from a higher level */
1571 mfn_to_shadow_page(smfn)->up = 0;
1573 #if GUEST_PAGING_LEVELS == 4
1574 #if (SHADOW_OPTIMIZATIONS & SHOPT_LINUX_L3_TOPLEVEL)
1575 if ( shadow_type == SH_type_l4_64_shadow &&
1576 unlikely(v->domain->arch.paging.shadow.opt_flags & SHOPT_LINUX_L3_TOPLEVEL) )
1578 /* We're shadowing a new l4, but we've been assuming the guest uses
1579 * only one l4 per vcpu and context switches using an l4 entry.
1580 * Count the number of active l4 shadows. If there are enough
1581 * of them, decide that this isn't an old linux guest, and stop
1582 * pinning l3es. This is not very quick but it doesn't happen
1583 * very often. */
1584 struct list_head *l, *t;
1585 struct shadow_page_info *sp;
1586 struct vcpu *v2;
1587 int l4count = 0, vcpus = 0;
1588 list_for_each(l, &v->domain->arch.paging.shadow.pinned_shadows)
1590 sp = list_entry(l, struct shadow_page_info, list);
1591 if ( sp->type == SH_type_l4_64_shadow )
1592 l4count++;
1594 for_each_vcpu ( v->domain, v2 )
1595 vcpus++;
1596 if ( l4count > 2 * vcpus )
1598 /* Unpin all the pinned l3 tables, and don't pin any more. */
1599 list_for_each_safe(l, t, &v->domain->arch.paging.shadow.pinned_shadows)
1601 sp = list_entry(l, struct shadow_page_info, list);
1602 if ( sp->type == SH_type_l3_64_shadow )
1603 sh_unpin(v, shadow_page_to_mfn(sp));
1605 v->domain->arch.paging.shadow.opt_flags &= ~SHOPT_LINUX_L3_TOPLEVEL;
1608 #endif
1609 #endif
1611 // Create the Xen mappings...
1612 if ( !shadow_mode_external(v->domain) )
1614 switch (shadow_type)
1616 #if CONFIG_PAGING_LEVELS == 4 && GUEST_PAGING_LEVELS == 4
1617 case SH_type_l4_shadow:
1618 sh_install_xen_entries_in_l4(v, gmfn, smfn); break;
1619 #endif
1620 #if CONFIG_PAGING_LEVELS == 3 && GUEST_PAGING_LEVELS == 3
1621 case SH_type_l2h_shadow:
1622 sh_install_xen_entries_in_l2h(v, smfn); break;
1623 #endif
1624 #if CONFIG_PAGING_LEVELS == 2 && GUEST_PAGING_LEVELS == 2
1625 case SH_type_l2_shadow:
1626 sh_install_xen_entries_in_l2(v, gmfn, smfn); break;
1627 #endif
1628 default: /* Do nothing */ break;
1632 shadow_promote(v, gmfn, shadow_type);
1633 set_shadow_status(v, gmfn, shadow_type, smfn);
1635 return smfn;
1638 /* Make a splintered superpage shadow */
1639 static mfn_t
1640 make_fl1_shadow(struct vcpu *v, gfn_t gfn)
1642 mfn_t smfn = shadow_alloc(v->domain, SH_type_fl1_shadow,
1643 (unsigned long) gfn_x(gfn));
1645 SHADOW_DEBUG(MAKE_SHADOW, "(%" SH_PRI_gfn ")=>%" PRI_mfn "\n",
1646 gfn_x(gfn), mfn_x(smfn));
1648 set_fl1_shadow_status(v, gfn, smfn);
1649 return smfn;
1653 #if SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS
1654 mfn_t
1655 sh_make_monitor_table(struct vcpu *v)
1657 struct domain *d = v->domain;
1659 ASSERT(pagetable_get_pfn(v->arch.monitor_table) == 0);
1661 /* Guarantee we can get the memory we need */
1662 shadow_prealloc(d, SHADOW_MAX_ORDER);
1664 #if CONFIG_PAGING_LEVELS == 4
1666 mfn_t m4mfn;
1667 m4mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1668 sh_install_xen_entries_in_l4(v, m4mfn, m4mfn);
1669 /* Remember the level of this table */
1670 mfn_to_page(m4mfn)->shadow_flags = 4;
1671 #if SHADOW_PAGING_LEVELS < 4
1672 // Install a monitor l3 table in slot 0 of the l4 table.
1673 // This is used for shadow linear maps.
1675 mfn_t m3mfn;
1676 l4_pgentry_t *l4e;
1677 m3mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1678 mfn_to_page(m3mfn)->shadow_flags = 3;
1679 l4e = sh_map_domain_page(m4mfn);
1680 l4e[0] = l4e_from_pfn(mfn_x(m3mfn), __PAGE_HYPERVISOR);
1681 sh_unmap_domain_page(l4e);
1682 if ( pv_32bit_guest(v) )
1684 // Install a monitor l2 table in slot 3 of the l3 table.
1685 // This is used for all Xen entries.
1686 mfn_t m2mfn;
1687 l3_pgentry_t *l3e;
1688 m2mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1689 mfn_to_page(m2mfn)->shadow_flags = 2;
1690 l3e = sh_map_domain_page(m3mfn);
1691 l3e[3] = l3e_from_pfn(mfn_x(m2mfn), _PAGE_PRESENT);
1692 sh_install_xen_entries_in_l2h(v, m2mfn);
1693 sh_unmap_domain_page(l3e);
1696 #endif /* SHADOW_PAGING_LEVELS < 4 */
1697 return m4mfn;
1700 #elif CONFIG_PAGING_LEVELS == 3
1703 mfn_t m3mfn, m2mfn;
1704 l3_pgentry_t *l3e;
1705 l2_pgentry_t *l2e;
1706 int i;
1708 m3mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1709 /* Remember the level of this table */
1710 mfn_to_page(m3mfn)->shadow_flags = 3;
1712 // Install a monitor l2 table in slot 3 of the l3 table.
1713 // This is used for all Xen entries, including linear maps
1714 m2mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1715 mfn_to_page(m2mfn)->shadow_flags = 2;
1716 l3e = sh_map_domain_page(m3mfn);
1717 l3e[3] = l3e_from_pfn(mfn_x(m2mfn), _PAGE_PRESENT);
1718 sh_install_xen_entries_in_l2h(v, m2mfn);
1719 /* Install the monitor's own linear map */
1720 l2e = sh_map_domain_page(m2mfn);
1721 for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
1722 l2e[l2_table_offset(LINEAR_PT_VIRT_START) + i] =
1723 (l3e_get_flags(l3e[i]) & _PAGE_PRESENT)
1724 ? l2e_from_pfn(l3e_get_pfn(l3e[i]), __PAGE_HYPERVISOR)
1725 : l2e_empty();
1726 sh_unmap_domain_page(l2e);
1727 sh_unmap_domain_page(l3e);
1729 SHADOW_PRINTK("new monitor table: %#lx\n", mfn_x(m3mfn));
1730 return m3mfn;
1733 #elif CONFIG_PAGING_LEVELS == 2
1736 mfn_t m2mfn;
1737 m2mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1738 sh_install_xen_entries_in_l2(v, m2mfn, m2mfn);
1739 /* Remember the level of this table */
1740 mfn_to_page(m2mfn)->shadow_flags = 2;
1741 return m2mfn;
1744 #else
1745 #error this should not happen
1746 #endif /* CONFIG_PAGING_LEVELS */
1748 #endif /* SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS */
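/* Rough summary of the monitor-table layouts built above:
 *  - 4-level: a single monitor l4; if the shadows have fewer levels, an
 *    extra monitor l3 goes in slot 0 (and, for 32bit PV guests, an extra
 *    l2 in slot 3 of that l3) to hold the Xen entries and linear maps.
 *  - 3-level: a monitor l3 with a monitor l2 in slot 3 carrying the Xen
 *    entries and the monitor's own linear map.
 *  - 2-level: a single monitor l2 with the Xen entries.
 * sh_destroy_monitor_table() below frees the same extra pages again. */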
1750 /**************************************************************************/
1751 /* These functions also take a virtual address and return the level-N
1752 * shadow table mfn and entry, but they create the shadow pagetables if
1753 * they are needed. The "demand" argument is non-zero when handling
1754 * a demand fault (so we know what to do about accessed bits &c).
1755 * If the necessary tables are not present in the guest, they return NULL. */
1757 /* N.B. The use of GUEST_PAGING_LEVELS here is correct. If the shadow has
1758 * more levels than the guest, the upper levels are always fixed and do not
1759 * reflect any information from the guest, so we do not use these functions
1760 * to access them. */
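/* Typical use (a sketch of the demand-fault path below): after a
 * successful guest_walk_tables(), sh_page_fault() calls
 *
 *     ptr_sl1e = shadow_get_and_create_l1e(v, &gw, &sl1mfn, ft);
 *
 * which in turn pulls in shadow_get_and_create_l2e/l3e/l4e, so that every
 * intermediate shadow table on the path to the faulting va exists before
 * the l1 entry is written. */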
1762 #if GUEST_PAGING_LEVELS >= 4
1763 static shadow_l4e_t * shadow_get_and_create_l4e(struct vcpu *v,
1764 walk_t *gw,
1765 mfn_t *sl4mfn)
1767 /* There is always a shadow of the top level table. Get it. */
1768 *sl4mfn = pagetable_get_mfn(v->arch.shadow_table[0]);
1769 /* Reading the top level table is always valid. */
1770 return sh_linear_l4_table(v) + shadow_l4_linear_offset(gw->va);
1773 static shadow_l3e_t * shadow_get_and_create_l3e(struct vcpu *v,
1774 walk_t *gw,
1775 mfn_t *sl3mfn,
1776 fetch_type_t ft)
1778 mfn_t sl4mfn;
1779 shadow_l4e_t *sl4e;
1780 if ( !mfn_valid(gw->l3mfn) ) return NULL; /* No guest page. */
1781 /* Get the l4e */
1782 sl4e = shadow_get_and_create_l4e(v, gw, &sl4mfn);
1783 ASSERT(sl4e != NULL);
1784 if ( shadow_l4e_get_flags(*sl4e) & _PAGE_PRESENT )
1786 *sl3mfn = shadow_l4e_get_mfn(*sl4e);
1787 ASSERT(mfn_valid(*sl3mfn));
1789 else
1791 int r;
1792 shadow_l4e_t new_sl4e;
1793 /* No l3 shadow installed: find and install it. */
1794 *sl3mfn = get_shadow_status(v, gw->l3mfn, SH_type_l3_shadow);
1795 if ( !mfn_valid(*sl3mfn) )
1797 /* No l3 shadow of this page exists at all: make one. */
1798 *sl3mfn = sh_make_shadow(v, gw->l3mfn, SH_type_l3_shadow);
1800 /* Install the new sl3 table in the sl4e */
1801 l4e_propagate_from_guest(v, gw->l4e, gw->l4mfn,
1802 *sl3mfn, &new_sl4e, ft);
1803 r = shadow_set_l4e(v, sl4e, new_sl4e, sl4mfn);
1804 ASSERT((r & SHADOW_SET_FLUSH) == 0);
1805 if ( r & SHADOW_SET_ERROR )
1806 return NULL;
1808 /* Now follow it down a level. Guaranteed to succeed. */
1809 return sh_linear_l3_table(v) + shadow_l3_linear_offset(gw->va);
1811 #endif /* GUEST_PAGING_LEVELS >= 4 */
1814 static shadow_l2e_t * shadow_get_and_create_l2e(struct vcpu *v,
1815 walk_t *gw,
1816 mfn_t *sl2mfn,
1817 fetch_type_t ft)
1819 #if GUEST_PAGING_LEVELS >= 4 /* 64bit... */
1820 mfn_t sl3mfn = _mfn(INVALID_MFN);
1821 shadow_l3e_t *sl3e;
1822 if ( !mfn_valid(gw->l2mfn) ) return NULL; /* No guest page. */
1823 /* Get the l3e */
1824 sl3e = shadow_get_and_create_l3e(v, gw, &sl3mfn, ft);
1825 if ( sl3e == NULL ) return NULL;
1826 if ( shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT )
1828 *sl2mfn = shadow_l3e_get_mfn(*sl3e);
1829 ASSERT(mfn_valid(*sl2mfn));
1831 else
1833 int r;
1834 shadow_l3e_t new_sl3e;
1835 /* No l2 shadow installed: find and install it. */
1836 *sl2mfn = get_shadow_status(v, gw->l2mfn, SH_type_l2_shadow);
1837 if ( !mfn_valid(*sl2mfn) )
1839 /* No l2 shadow of this page exists at all: make one. */
1840 *sl2mfn = sh_make_shadow(v, gw->l2mfn, SH_type_l2_shadow);
1842 /* Install the new sl2 table in the sl3e */
1843 l3e_propagate_from_guest(v, gw->l3e, gw->l3mfn,
1844 *sl2mfn, &new_sl3e, ft);
1845 r = shadow_set_l3e(v, sl3e, new_sl3e, sl3mfn);
1846 ASSERT((r & SHADOW_SET_FLUSH) == 0);
1847 if ( r & SHADOW_SET_ERROR )
1848 return NULL;
1850 /* Now follow it down a level. Guaranteed to succeed. */
1851 return sh_linear_l2_table(v) + shadow_l2_linear_offset(gw->va);
1852 #elif GUEST_PAGING_LEVELS == 3 /* PAE... */
1853 /* We never demand-shadow PAE l3es: they are only created in
1854 * sh_update_cr3(). Check if the relevant sl3e is present. */
1855 shadow_l3e_t *sl3e = ((shadow_l3e_t *)&v->arch.paging.shadow.l3table)
1856 + shadow_l3_linear_offset(gw->va);
1857 if ( !(shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT) )
1858 return NULL;
1859 *sl2mfn = shadow_l3e_get_mfn(*sl3e);
1860 ASSERT(mfn_valid(*sl2mfn));
1861 return sh_linear_l2_table(v) + shadow_l2_linear_offset(gw->va);
1862 #else /* 32bit... */
1863 /* There is always a shadow of the top level table. Get it. */
1864 *sl2mfn = pagetable_get_mfn(v->arch.shadow_table[0]);
1865 /* This next line is important: the guest l2 has a 16k
1866 * shadow, so we need to return the right mfn of the four. This
1867 * call will set it for us as a side-effect. */
1868 (void) shadow_l2_index(sl2mfn, guest_index(gw->l2e));
1869 /* Reading the top level table is always valid. */
1870 return sh_linear_l2_table(v) + shadow_l2_linear_offset(gw->va);
1871 #endif
1875 static shadow_l1e_t * shadow_get_and_create_l1e(struct vcpu *v,
1876 walk_t *gw,
1877 mfn_t *sl1mfn,
1878 fetch_type_t ft)
1880 mfn_t sl2mfn;
1881 shadow_l2e_t *sl2e;
1883 /* Get the l2e */
1884 sl2e = shadow_get_and_create_l2e(v, gw, &sl2mfn, ft);
1885 if ( sl2e == NULL ) return NULL;
1886 /* Install the sl1 in the l2e if it wasn't there or if we need to
1887 * re-do it to fix a PSE dirty bit. */
1888 if ( shadow_l2e_get_flags(*sl2e) & _PAGE_PRESENT
1889 && likely(ft != ft_demand_write
1890 || (guest_l2e_get_flags(*gw->l2e) & _PAGE_DIRTY)
1891 || !(guest_l2e_get_flags(*gw->l2e) & _PAGE_PSE)) )
1893 *sl1mfn = shadow_l2e_get_mfn(*sl2e);
1894 ASSERT(mfn_valid(*sl1mfn));
1896 else
1898 shadow_l2e_t new_sl2e;
1899 int r, flags = guest_l2e_get_flags(*gw->l2e);
1900 /* No l1 shadow installed: find and install it. */
1901 if ( !(flags & _PAGE_PRESENT) )
1902 return NULL; /* No guest page. */
1903 if ( guest_supports_superpages(v) && (flags & _PAGE_PSE) )
1905 /* Splintering a superpage */
1906 gfn_t l2gfn = guest_l2e_get_gfn(*gw->l2e);
1907 *sl1mfn = get_fl1_shadow_status(v, l2gfn);
1908 if ( !mfn_valid(*sl1mfn) )
1910 /* No fl1 shadow of this superpage exists at all: make one. */
1911 *sl1mfn = make_fl1_shadow(v, l2gfn);
1914 else
1916 /* Shadowing an actual guest l1 table */
1917 if ( !mfn_valid(gw->l1mfn) ) return NULL; /* No guest page. */
1918 *sl1mfn = get_shadow_status(v, gw->l1mfn, SH_type_l1_shadow);
1919 if ( !mfn_valid(*sl1mfn) )
1921 /* No l1 shadow of this page exists at all: make one. */
1922 *sl1mfn = sh_make_shadow(v, gw->l1mfn, SH_type_l1_shadow);
1925 /* Install the new sl1 table in the sl2e */
1926 l2e_propagate_from_guest(v, gw->l2e, gw->l2mfn,
1927 *sl1mfn, &new_sl2e, ft);
1928 r = shadow_set_l2e(v, sl2e, new_sl2e, sl2mfn);
1929 ASSERT((r & SHADOW_SET_FLUSH) == 0);
1930 if ( r & SHADOW_SET_ERROR )
1931 return NULL;
1932 /* This next line is important: in 32-on-PAE and 32-on-64 modes,
1933 * the guest l1 table has an 8k shadow, and we need to return
1934 * the right mfn of the pair. This call will set it for us as a
1935 * side-effect. (In all other cases, it's a no-op and will be
1936 * compiled out.) */
1937 (void) shadow_l1_index(sl1mfn, guest_l1_table_offset(gw->va));
1939 /* Now follow it down a level. Guaranteed to succeed. */
1940 return sh_linear_l1_table(v) + shadow_l1_linear_offset(gw->va);
1945 /**************************************************************************/
1946 /* Destructors for shadow tables:
1947 * Unregister the shadow, decrement refcounts of any entries present in it,
1948 * and release the memory.
1950 * N.B. These destructors do not clear the contents of the shadows.
1951 * This allows us to delay TLB shootdowns until the page is being reused.
1952 * See shadow_alloc() and shadow_free() for how this is handled.
1953 */
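/* Each destructor below follows the same pattern: recover the guest page
 * from the shadow's backpointer, delete_shadow_status() and
 * shadow_demote() it, walk the shadow dropping the reference held by each
 * present entry (sh_put_ref() or shadow_put_page_from_l1e()), and finally
 * shadow_free() the page itself. */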
1955 #if GUEST_PAGING_LEVELS >= 4
1956 void sh_destroy_l4_shadow(struct vcpu *v, mfn_t smfn)
1958 shadow_l4e_t *sl4e;
1959 u32 t = mfn_to_shadow_page(smfn)->type;
1960 mfn_t gmfn, sl4mfn;
1961 int xen_mappings;
1963 SHADOW_DEBUG(DESTROY_SHADOW,
1964 "%s(%05lx)\n", __func__, mfn_x(smfn));
1965 ASSERT(t == SH_type_l4_shadow);
1967 /* Record that the guest page isn't shadowed any more (in this type) */
1968 gmfn = _mfn(mfn_to_shadow_page(smfn)->backpointer);
1969 delete_shadow_status(v, gmfn, t, smfn);
1970 shadow_demote(v, gmfn, t);
1971 /* Decrement refcounts of all the old entries */
1972 xen_mappings = (!shadow_mode_external(v->domain));
1973 sl4mfn = smfn;
1974 SHADOW_FOREACH_L4E(sl4mfn, sl4e, 0, 0, xen_mappings, {
1975 if ( shadow_l4e_get_flags(*sl4e) & _PAGE_PRESENT )
1977 sh_put_ref(v, shadow_l4e_get_mfn(*sl4e),
1978 (((paddr_t)mfn_x(sl4mfn)) << PAGE_SHIFT)
1979 | ((unsigned long)sl4e & ~PAGE_MASK));
1981 });
1983 /* Put the memory back in the pool */
1984 shadow_free(v->domain, smfn);
1987 void sh_destroy_l3_shadow(struct vcpu *v, mfn_t smfn)
1989 shadow_l3e_t *sl3e;
1990 u32 t = mfn_to_shadow_page(smfn)->type;
1991 mfn_t gmfn, sl3mfn;
1993 SHADOW_DEBUG(DESTROY_SHADOW,
1994 "%s(%05lx)\n", __func__, mfn_x(smfn));
1995 ASSERT(t == SH_type_l3_shadow);
1997 /* Record that the guest page isn't shadowed any more (in this type) */
1998 gmfn = _mfn(mfn_to_shadow_page(smfn)->backpointer);
1999 delete_shadow_status(v, gmfn, t, smfn);
2000 shadow_demote(v, gmfn, t);
2002 /* Decrement refcounts of all the old entries */
2003 sl3mfn = smfn;
2004 SHADOW_FOREACH_L3E(sl3mfn, sl3e, 0, 0, {
2005 if ( shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT )
2006 sh_put_ref(v, shadow_l3e_get_mfn(*sl3e),
2007 (((paddr_t)mfn_x(sl3mfn)) << PAGE_SHIFT)
2008 | ((unsigned long)sl3e & ~PAGE_MASK));
2009 });
2011 /* Put the memory back in the pool */
2012 shadow_free(v->domain, smfn);
2014 #endif /* GUEST_PAGING_LEVELS >= 4 */
2017 void sh_destroy_l2_shadow(struct vcpu *v, mfn_t smfn)
2019 shadow_l2e_t *sl2e;
2020 u32 t = mfn_to_shadow_page(smfn)->type;
2021 mfn_t gmfn, sl2mfn;
2022 int xen_mappings;
2024 SHADOW_DEBUG(DESTROY_SHADOW,
2025 "%s(%05lx)\n", __func__, mfn_x(smfn));
2026 ASSERT(t == SH_type_l2_shadow
2027 || t == SH_type_l2h_pae_shadow);
2029 /* Record that the guest page isn't shadowed any more (in this type) */
2030 gmfn = _mfn(mfn_to_shadow_page(smfn)->backpointer);
2031 delete_shadow_status(v, gmfn, t, smfn);
2032 shadow_demote(v, gmfn, t);
2034 /* Decrement refcounts of all the old entries */
2035 sl2mfn = smfn;
2036 xen_mappings = (!shadow_mode_external(v->domain) &&
2037 ((GUEST_PAGING_LEVELS == 2) ||
2038 ((GUEST_PAGING_LEVELS == 3) &&
2039 (t == SH_type_l2h_pae_shadow))));
2040 SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, 0, xen_mappings, {
2041 if ( shadow_l2e_get_flags(*sl2e) & _PAGE_PRESENT )
2042 sh_put_ref(v, shadow_l2e_get_mfn(*sl2e),
2043 (((paddr_t)mfn_x(sl2mfn)) << PAGE_SHIFT)
2044 | ((unsigned long)sl2e & ~PAGE_MASK));
2045 });
2047 /* Put the memory back in the pool */
2048 shadow_free(v->domain, smfn);
2051 void sh_destroy_l1_shadow(struct vcpu *v, mfn_t smfn)
2053 struct domain *d = v->domain;
2054 shadow_l1e_t *sl1e;
2055 u32 t = mfn_to_shadow_page(smfn)->type;
2057 SHADOW_DEBUG(DESTROY_SHADOW,
2058 "%s(%05lx)\n", __func__, mfn_x(smfn));
2059 ASSERT(t == SH_type_l1_shadow || t == SH_type_fl1_shadow);
2061 /* Record that the guest page isn't shadowed any more (in this type) */
2062 if ( t == SH_type_fl1_shadow )
2064 gfn_t gfn = _gfn(mfn_to_shadow_page(smfn)->backpointer);
2065 delete_fl1_shadow_status(v, gfn, smfn);
2067 else
2069 mfn_t gmfn = _mfn(mfn_to_shadow_page(smfn)->backpointer);
2070 delete_shadow_status(v, gmfn, t, smfn);
2071 shadow_demote(v, gmfn, t);
2074 if ( shadow_mode_refcounts(d) )
2076 /* Decrement refcounts of all the old entries */
2077 mfn_t sl1mfn = smfn;
2078 SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, 0, {
2079 if ( (shadow_l1e_get_flags(*sl1e) & _PAGE_PRESENT)
2080 && !sh_l1e_is_magic(*sl1e) )
2081 shadow_put_page_from_l1e(*sl1e, d);
2082 });
2085 /* Put the memory back in the pool */
2086 shadow_free(v->domain, smfn);
2089 #if SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS
2090 void sh_destroy_monitor_table(struct vcpu *v, mfn_t mmfn)
2092 struct domain *d = v->domain;
2093 ASSERT(mfn_to_shadow_page(mmfn)->type == SH_type_monitor_table);
2095 #if (CONFIG_PAGING_LEVELS == 4) && (SHADOW_PAGING_LEVELS != 4)
2096 /* Need to destroy the l3 monitor page in slot 0 too */
2098 mfn_t m3mfn;
2099 l4_pgentry_t *l4e = sh_map_domain_page(mmfn);
2100 ASSERT(l4e_get_flags(l4e[0]) & _PAGE_PRESENT);
2101 m3mfn = _mfn(l4e_get_pfn(l4e[0]));
2102 if ( pv_32bit_guest(v) )
2104 /* Need to destroy the l2 monitor page in slot 3 too */
2105 l3_pgentry_t *l3e = sh_map_domain_page(m3mfn);
2106 ASSERT(l3e_get_flags(l3e[3]) & _PAGE_PRESENT);
2107 shadow_free(d, _mfn(l3e_get_pfn(l3e[3])));
2108 sh_unmap_domain_page(l3e);
2110 shadow_free(d, m3mfn);
2111 sh_unmap_domain_page(l4e);
2113 #elif CONFIG_PAGING_LEVELS == 3
2114 /* Need to destroy the l2 monitor page in slot 3 too */
2116 l3_pgentry_t *l3e = sh_map_domain_page(mmfn);
2117 ASSERT(l3e_get_flags(l3e[3]) & _PAGE_PRESENT);
2118 shadow_free(d, _mfn(l3e_get_pfn(l3e[3])));
2119 sh_unmap_domain_page(l3e);
2121 #endif
2123 /* Put the memory back in the pool */
2124 shadow_free(d, mmfn);
2126 #endif
2128 /**************************************************************************/
2129 /* Functions to destroy non-Xen mappings in a pagetable hierarchy.
2130 * These are called from common code when we are running out of shadow
2131 * memory, and unpinning all the top-level shadows hasn't worked.
2133 * This implementation is pretty crude and slow, but we hope that it won't
2134 * be called very often. */
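/* Note that these unhook by writing empty entries through the normal
 * shadow_set_l2e()/shadow_set_l4e() path rather than clearing the tables
 * directly, so the usual bookkeeping for the removed entries still
 * applies and the lower-level shadows they pointed at can be reclaimed. */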
2136 #if GUEST_PAGING_LEVELS == 2
2138 void sh_unhook_32b_mappings(struct vcpu *v, mfn_t sl2mfn)
2140 shadow_l2e_t *sl2e;
2141 int xen_mappings = !shadow_mode_external(v->domain);
2142 SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, 0, xen_mappings, {
2143 (void) shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn);
2144 });
2147 #elif GUEST_PAGING_LEVELS == 3
2149 void sh_unhook_pae_mappings(struct vcpu *v, mfn_t sl2mfn)
2150 /* Walk a PAE l2 shadow, unhooking entries from all the subshadows */
2152 shadow_l2e_t *sl2e;
2153 int xen_mappings = !shadow_mode_external(v->domain);
2154 SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, 0, xen_mappings, {
2155 (void) shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn);
2156 });
2159 #elif GUEST_PAGING_LEVELS == 4
2161 void sh_unhook_64b_mappings(struct vcpu *v, mfn_t sl4mfn)
2163 shadow_l4e_t *sl4e;
2164 int xen_mappings = !shadow_mode_external(v->domain);
2165 SHADOW_FOREACH_L4E(sl4mfn, sl4e, 0, 0, xen_mappings, {
2166 (void) shadow_set_l4e(v, sl4e, shadow_l4e_empty(), sl4mfn);
2167 });
2170 #endif
2172 /**************************************************************************/
2173 /* Internal translation functions.
2174 * These functions require a pointer to the shadow entry that will be updated.
2175 */
2177 /* These functions take a new guest entry, translate it to shadow and write
2178 * the shadow entry.
2180 * They return the same bitmaps as the shadow_set_lXe() functions.
2181 */
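/* (The bitmaps in question are masks of SHADOW_SET_* flags, e.g.
 * SHADOW_SET_ERROR and SHADOW_SET_FLUSH, which callers accumulate into a
 * 'result' value, as the validate_glNe() functions below do.) */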
2183 #if GUEST_PAGING_LEVELS >= 4
2184 static int validate_gl4e(struct vcpu *v, void *new_ge, mfn_t sl4mfn, void *se)
2186 shadow_l4e_t new_sl4e;
2187 guest_l4e_t *new_gl4e = new_ge;
2188 shadow_l4e_t *sl4p = se;
2189 mfn_t sl3mfn = _mfn(INVALID_MFN);
2190 int result = 0;
2192 perfc_incrc(shadow_validate_gl4e_calls);
2194 if ( guest_l4e_get_flags(*new_gl4e) & _PAGE_PRESENT )
2196 gfn_t gl3gfn = guest_l4e_get_gfn(*new_gl4e);
2197 mfn_t gl3mfn = vcpu_gfn_to_mfn(v, gl3gfn);
2198 if ( mfn_valid(gl3mfn) )
2199 sl3mfn = get_shadow_status(v, gl3mfn, SH_type_l3_shadow);
2200 else
2201 result |= SHADOW_SET_ERROR;
2203 l4e_propagate_from_guest(v, new_gl4e, _mfn(INVALID_MFN),
2204 sl3mfn, &new_sl4e, ft_prefetch);
2206 // check for updates to xen reserved slots
2207 if ( !shadow_mode_external(v->domain) )
2209 int shadow_index = (((unsigned long)sl4p & ~PAGE_MASK) /
2210 sizeof(shadow_l4e_t));
2211 int reserved_xen_slot = !is_guest_l4_slot(shadow_index);
2213 if ( unlikely(reserved_xen_slot) )
2215 // attempt by the guest to write to a xen reserved slot
2216 //
2217 SHADOW_PRINTK("%s out-of-range update "
2218 "sl4mfn=%05lx index=0x%x val=%" SH_PRI_pte "\n",
2219 __func__, mfn_x(sl4mfn), shadow_index, new_sl4e.l4);
2220 if ( shadow_l4e_get_flags(new_sl4e) & _PAGE_PRESENT )
2222 SHADOW_ERROR("out-of-range l4e update\n");
2223 result |= SHADOW_SET_ERROR;
2226 // do not call shadow_set_l4e...
2227 return result;
2231 result |= shadow_set_l4e(v, sl4p, new_sl4e, sl4mfn);
2232 return result;
2236 static int validate_gl3e(struct vcpu *v, void *new_ge, mfn_t sl3mfn, void *se)
2238 shadow_l3e_t new_sl3e;
2239 guest_l3e_t *new_gl3e = new_ge;
2240 shadow_l3e_t *sl3p = se;
2241 mfn_t sl2mfn = _mfn(INVALID_MFN);
2242 int result = 0;
2244 perfc_incrc(shadow_validate_gl3e_calls);
2246 if ( guest_l3e_get_flags(*new_gl3e) & _PAGE_PRESENT )
2248 gfn_t gl2gfn = guest_l3e_get_gfn(*new_gl3e);
2249 mfn_t gl2mfn = vcpu_gfn_to_mfn(v, gl2gfn);
2250 if ( mfn_valid(gl2mfn) )
2251 sl2mfn = get_shadow_status(v, gl2mfn, SH_type_l2_shadow);
2252 else
2253 result |= SHADOW_SET_ERROR;
2255 l3e_propagate_from_guest(v, new_gl3e, _mfn(INVALID_MFN),
2256 sl2mfn, &new_sl3e, ft_prefetch);
2257 result |= shadow_set_l3e(v, sl3p, new_sl3e, sl3mfn);
2259 return result;
2261 #endif // GUEST_PAGING_LEVELS >= 4
2263 static int validate_gl2e(struct vcpu *v, void *new_ge, mfn_t sl2mfn, void *se)
2265 shadow_l2e_t new_sl2e;
2266 guest_l2e_t *new_gl2e = new_ge;
2267 shadow_l2e_t *sl2p = se;
2268 mfn_t sl1mfn = _mfn(INVALID_MFN);
2269 int result = 0;
2271 perfc_incrc(shadow_validate_gl2e_calls);
2273 if ( guest_l2e_get_flags(*new_gl2e) & _PAGE_PRESENT )
2275 gfn_t gl1gfn = guest_l2e_get_gfn(*new_gl2e);
2276 if ( guest_supports_superpages(v) &&
2277 (guest_l2e_get_flags(*new_gl2e) & _PAGE_PSE) )
2279 // superpage -- need to look up the shadow L1 which holds the
2280 // splitters...
2281 sl1mfn = get_fl1_shadow_status(v, gl1gfn);
2282 #if 0
2283 // XXX - it's possible that we want to do some kind of prefetch
2284 // for superpage fl1's here, but this is *not* on the demand path,
2285 // so we'll hold off trying that for now...
2286 //
2287 if ( !mfn_valid(sl1mfn) )
2288 sl1mfn = make_fl1_shadow(v, gl1gfn);
2289 #endif
2291 else
2293 mfn_t gl1mfn = vcpu_gfn_to_mfn(v, gl1gfn);
2294 if ( mfn_valid(gl1mfn) )
2295 sl1mfn = get_shadow_status(v, gl1mfn, SH_type_l1_shadow);
2296 else
2297 result |= SHADOW_SET_ERROR;
2300 l2e_propagate_from_guest(v, new_gl2e, _mfn(INVALID_MFN),
2301 sl1mfn, &new_sl2e, ft_prefetch);
2303 // check for updates to xen reserved slots in PV guests...
2304 // XXX -- need to revisit this for PV 3-on-4 guests.
2305 //
2306 #if SHADOW_PAGING_LEVELS < 4
2307 #if CONFIG_PAGING_LEVELS == SHADOW_PAGING_LEVELS
2308 if ( !shadow_mode_external(v->domain) )
2310 int shadow_index = (((unsigned long)sl2p & ~PAGE_MASK) /
2311 sizeof(shadow_l2e_t));
2312 int reserved_xen_slot;
2314 #if SHADOW_PAGING_LEVELS == 3
2315 reserved_xen_slot =
2316 ((mfn_to_shadow_page(sl2mfn)->type == SH_type_l2h_pae_shadow) &&
2317 (shadow_index
2318 >= (L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1))));
2319 #else /* SHADOW_PAGING_LEVELS == 2 */
2320 reserved_xen_slot = (shadow_index >= L2_PAGETABLE_FIRST_XEN_SLOT);
2321 #endif
2323 if ( unlikely(reserved_xen_slot) )
2325 // attempt by the guest to write to a xen reserved slot
2326 //
2327 SHADOW_PRINTK("%s out-of-range update "
2328 "sl2mfn=%05lx index=0x%x val=%" SH_PRI_pte "\n",
2329 __func__, mfn_x(sl2mfn), shadow_index, new_sl2e.l2);
2330 if ( shadow_l2e_get_flags(new_sl2e) & _PAGE_PRESENT )
2332 SHADOW_ERROR("out-of-range l2e update\n");
2333 result |= SHADOW_SET_ERROR;
2336 // do not call shadow_set_l2e...
2337 return result;
2340 #endif /* CONFIG_PAGING_LEVELS == SHADOW_PAGING_LEVELS */
2341 #endif /* SHADOW_PAGING_LEVELS < 4 */
2343 result |= shadow_set_l2e(v, sl2p, new_sl2e, sl2mfn);
2345 return result;
2348 static int validate_gl1e(struct vcpu *v, void *new_ge, mfn_t sl1mfn, void *se)
2350 shadow_l1e_t new_sl1e;
2351 guest_l1e_t *new_gl1e = new_ge;
2352 shadow_l1e_t *sl1p = se;
2353 gfn_t gfn;
2354 mfn_t gmfn;
2355 int result = 0, mmio;
2357 perfc_incrc(shadow_validate_gl1e_calls);
2359 gfn = guest_l1e_get_gfn(*new_gl1e);
2360 gmfn = vcpu_gfn_to_mfn(v, gfn);
2362 mmio = (is_hvm_vcpu(v) && paging_vcpu_mode_translate(v) && !mfn_valid(gmfn));
2363 l1e_propagate_from_guest(v, new_gl1e, _mfn(INVALID_MFN), gmfn, &new_sl1e,
2364 ft_prefetch, mmio);
2366 result |= shadow_set_l1e(v, sl1p, new_sl1e, sl1mfn);
2367 return result;
2371 /**************************************************************************/
2372 /* Functions which translate and install the shadows of arbitrary guest
2373 * entries that we have just seen the guest write. */
2376 static inline int
2377 sh_map_and_validate(struct vcpu *v, mfn_t gmfn,
2378 void *new_gp, u32 size, u32 sh_type,
2379 u32 (*shadow_index)(mfn_t *smfn, u32 idx),
2380 int (*validate_ge)(struct vcpu *v, void *ge,
2381 mfn_t smfn, void *se))
2382 /* Generic function for mapping and validating. */
2384 mfn_t smfn, smfn2, map_mfn;
2385 shadow_l1e_t *sl1p;
2386 u32 shadow_idx, guest_idx;
2387 int result = 0;
2389 /* Align address and size to guest entry boundaries */
2390 size += (unsigned long)new_gp & (sizeof (guest_l1e_t) - 1);
2391 new_gp = (void *)((unsigned long)new_gp & ~(sizeof (guest_l1e_t) - 1));
2392 size = (size + sizeof (guest_l1e_t) - 1) & ~(sizeof (guest_l1e_t) - 1);
2393 ASSERT(size + (((unsigned long)new_gp) & ~PAGE_MASK) <= PAGE_SIZE);
2395 /* Map the shadow page */
2396 smfn = get_shadow_status(v, gmfn, sh_type);
2397 ASSERT(mfn_valid(smfn)); /* Otherwise we would not have been called */
2398 guest_idx = guest_index(new_gp);
2399 map_mfn = smfn;
2400 shadow_idx = shadow_index(&map_mfn, guest_idx);
2401 sl1p = map_shadow_page(map_mfn);
2403 /* Validate one entry at a time */
2404 while ( size )
2406 smfn2 = smfn;
2407 guest_idx = guest_index(new_gp);
2408 shadow_idx = shadow_index(&smfn2, guest_idx);
2409 if ( mfn_x(smfn2) != mfn_x(map_mfn) )
2411 /* We have moved to another page of the shadow */
2412 map_mfn = smfn2;
2413 unmap_shadow_page(sl1p);
2414 sl1p = map_shadow_page(map_mfn);
2416 result |= validate_ge(v,
2417 new_gp,
2418 map_mfn,
2419 &sl1p[shadow_idx]);
2420 size -= sizeof(guest_l1e_t);
2421 new_gp += sizeof(guest_l1e_t);
2423 unmap_shadow_page(sl1p);
2424 return result;
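/* N.B. The remapping inside the loop above is what handles shadows that
 * span more than one page (e.g. the multi-page l1 and l2 shadows used for
 * 32bit guests on wider shadows, as described in
 * shadow_get_and_create_l1e/l2e): shadow_index() both returns the index
 * and, as a side-effect, switches the mfn to the right page of the shadow. */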
2428 int
2429 sh_map_and_validate_gl4e(struct vcpu *v, mfn_t gl4mfn,
2430 void *new_gl4p, u32 size)
2432 #if GUEST_PAGING_LEVELS >= 4
2433 return sh_map_and_validate(v, gl4mfn, new_gl4p, size,
2434 SH_type_l4_shadow,
2435 shadow_l4_index,
2436 validate_gl4e);
2437 #else // ! GUEST_PAGING_LEVELS >= 4
2438 SHADOW_PRINTK("called in wrong paging mode!\n");
2439 BUG();
2440 return 0;
2441 #endif
2444 int
2445 sh_map_and_validate_gl3e(struct vcpu *v, mfn_t gl3mfn,
2446 void *new_gl3p, u32 size)
2448 #if GUEST_PAGING_LEVELS >= 4
2449 return sh_map_and_validate(v, gl3mfn, new_gl3p, size,
2450 SH_type_l3_shadow,
2451 shadow_l3_index,
2452 validate_gl3e);
2453 #else // ! GUEST_PAGING_LEVELS >= 4
2454 SHADOW_PRINTK("called in wrong paging mode!\n");
2455 BUG();
2456 return 0;
2457 #endif
2460 int
2461 sh_map_and_validate_gl2e(struct vcpu *v, mfn_t gl2mfn,
2462 void *new_gl2p, u32 size)
2464 return sh_map_and_validate(v, gl2mfn, new_gl2p, size,
2465 SH_type_l2_shadow,
2466 shadow_l2_index,
2467 validate_gl2e);
2470 int
2471 sh_map_and_validate_gl2he(struct vcpu *v, mfn_t gl2mfn,
2472 void *new_gl2p, u32 size)
2474 #if GUEST_PAGING_LEVELS == 3
2475 return sh_map_and_validate(v, gl2mfn, new_gl2p, size,
2476 SH_type_l2h_shadow,
2477 shadow_l2_index,
2478 validate_gl2e);
2479 #else /* Non-PAE guests don't have different kinds of l2 table */
2480 SHADOW_PRINTK("called in wrong paging mode!\n");
2481 BUG();
2482 return 0;
2483 #endif
2486 int
2487 sh_map_and_validate_gl1e(struct vcpu *v, mfn_t gl1mfn,
2488 void *new_gl1p, u32 size)
2490 return sh_map_and_validate(v, gl1mfn, new_gl1p, size,
2491 SH_type_l1_shadow,
2492 shadow_l1_index,
2493 validate_gl1e);
2497 /**************************************************************************/
2498 /* Optimization: If we see two emulated writes of zeros to the same
2499 * page-table without another kind of page fault in between, we guess
2500 * that this is a batch of changes (for process destruction) and
2501 * unshadow the page so we don't take a pagefault on every entry. This
2502 * should also make finding writeable mappings of pagetables much
2503 * easier. */
2505 /* Look to see if this is the second emulated write in a row to this
2506 * page, and unshadow/unhook if it is */
2507 static inline void check_for_early_unshadow(struct vcpu *v, mfn_t gmfn)
2509 #if SHADOW_OPTIMIZATIONS & SHOPT_EARLY_UNSHADOW
2510 if ( v->arch.paging.shadow.last_emulated_mfn == mfn_x(gmfn) &&
2511 sh_mfn_is_a_page_table(gmfn) )
2513 u32 flags = mfn_to_page(gmfn)->shadow_flags;
2514 if ( !(flags & (SHF_L2_32|SHF_L2_PAE|SHF_L2H_PAE|SHF_L4_64)) )
2516 perfc_incrc(shadow_early_unshadow);
2517 sh_remove_shadows(v, gmfn, 1, 0 /* Fast, can fail to unshadow */ );
2520 v->arch.paging.shadow.last_emulated_mfn = mfn_x(gmfn);
2521 #endif
2524 /* Stop counting towards early unshadows, as we've seen a real page fault */
2525 static inline void reset_early_unshadow(struct vcpu *v)
2527 #if SHADOW_OPTIMIZATIONS & SHOPT_EARLY_UNSHADOW
2528 v->arch.paging.shadow.last_emulated_mfn = INVALID_MFN;
2529 #endif
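/* Example of the heuristic at work: a guest tearing down a process
 * typically zeroes a whole pagetable with a run of writes.  The first
 * emulated write records the mfn in last_emulated_mfn; a second write to
 * the same (still page-table) mfn triggers sh_remove_shadows(), so the
 * rest of the run lands on an unshadowed page and needs no emulation at
 * all.  Any real page fault in between calls reset_early_unshadow() and
 * restarts the count. */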
2534 /**************************************************************************/
2535 /* Optimization: Prefetch multiple L1 entries. This is called after we have
2536 * demand-faulted a shadow l1e in the fault handler, to see if it's
2537 * worth fetching some more.
2538 */
2540 #if SHADOW_OPTIMIZATIONS & SHOPT_PREFETCH
2542 /* XXX magic number */
2543 #define PREFETCH_DISTANCE 32
2545 static void sh_prefetch(struct vcpu *v, walk_t *gw,
2546 shadow_l1e_t *ptr_sl1e, mfn_t sl1mfn)
2548 int i, dist, mmio;
2549 gfn_t gfn;
2550 mfn_t gmfn;
2551 guest_l1e_t gl1e;
2552 shadow_l1e_t sl1e;
2553 u32 gflags;
2555 /* Prefetch no further than the end of the _shadow_ l1 MFN */
2556 dist = (PAGE_SIZE - ((unsigned long)ptr_sl1e & ~PAGE_MASK)) / sizeof sl1e;
2557 /* And no more than a maximum fetches-per-fault */
2558 if ( dist > PREFETCH_DISTANCE )
2559 dist = PREFETCH_DISTANCE;
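/* Worked example (assuming 8-byte shadow entries): a fault whose sl1e
 * lies at offset 0xf80 in its shadow page leaves (0x1000 - 0xf80)/8 = 16
 * slots to the end of the page, so dist ends up as 16 here rather than
 * the full PREFETCH_DISTANCE of 32. */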
2561 for ( i = 1; i < dist ; i++ )
2563 /* No point in prefetching if there's already a shadow */
2564 if ( ptr_sl1e[i].l1 != 0 )
2565 break;
2567 if ( gw->l1e )
2569 /* Normal guest page; grab the next guest entry */
2570 gl1e = gw->l1e[i];
2571 /* Not worth continuing if we hit an entry that will need another
2572 * fault for A/D-bit propagation anyway */
2573 gflags = guest_l1e_get_flags(gl1e);
2574 if ( (gflags & _PAGE_PRESENT)
2575 && (!(gflags & _PAGE_ACCESSED)
2576 || ((gflags & _PAGE_RW) && !(gflags & _PAGE_DIRTY))) )
2577 break;
2579 else
2581 /* Fragmented superpage, unless we've been called wrongly */
2582 ASSERT(guest_l2e_get_flags(*gw->l2e) & _PAGE_PSE);
2583 /* Increment the l1e's GFN by the right number of guest pages */
2584 gl1e = guest_l1e_from_gfn(
2585 _gfn(gfn_x(guest_l1e_get_gfn(gw->eff_l1e)) + i),
2586 guest_l1e_get_flags(gw->eff_l1e));
2589 /* Look at the gfn that the l1e is pointing at */
2590 gfn = guest_l1e_get_gfn(gl1e);
2591 gmfn = vcpu_gfn_to_mfn(v, gfn);
2592 mmio = ( is_hvm_vcpu(v)
2593 && paging_vcpu_mode_translate(v)
2594 && mmio_space(gfn_to_paddr(gfn)) );
2596 /* Propagate the entry. Safe to use a pointer to our local
2597 * gl1e, since this is not a demand-fetch so there will be no
2598 * write-back to the guest. */
2599 l1e_propagate_from_guest(v, &gl1e, _mfn(INVALID_MFN),
2600 gmfn, &sl1e, ft_prefetch, mmio);
2601 (void) shadow_set_l1e(v, ptr_sl1e + i, sl1e, sl1mfn);
2605 #endif /* SHADOW_OPTIMIZATIONS & SHOPT_PREFETCH */
2608 /**************************************************************************/
2609 /* Entry points into the shadow code */
2611 /* Called from pagefault handler in Xen, and from the HVM trap handlers
2612 * for pagefaults. Returns 1 if this fault was an artefact of the
2613 * shadow code (and the guest should retry) or 0 if it is not (and the
2614 * fault should be handled elsewhere or passed to the guest). */
2616 static int sh_page_fault(struct vcpu *v,
2617 unsigned long va,
2618 struct cpu_user_regs *regs)
2620 struct domain *d = v->domain;
2621 walk_t gw;
2622 u32 accumulated_gflags;
2623 gfn_t gfn;
2624 mfn_t gmfn, sl1mfn=_mfn(0);
2625 shadow_l1e_t sl1e, *ptr_sl1e;
2626 paddr_t gpa;
2627 struct sh_emulate_ctxt emul_ctxt;
2628 struct x86_emulate_ops *emul_ops;
2629 int r, mmio;
2630 fetch_type_t ft = 0;
2632 SHADOW_PRINTK("d:v=%u:%u va=%#lx err=%u\n",
2633 v->domain->domain_id, v->vcpu_id, va, regs->error_code);
2635 perfc_incrc(shadow_fault);
2636 //
2637 // XXX: Need to think about eventually mapping superpages directly in the
2638 // shadow (when possible), as opposed to splintering them into a
2639 // bunch of 4K maps.
2640 //
2642 #if (SHADOW_OPTIMIZATIONS & SHOPT_FAST_FAULT_PATH) && SHADOW_PAGING_LEVELS > 2
2643 if ( (regs->error_code & PFEC_reserved_bit) )
2645 /* The only reasons for reserved bits to be set in shadow entries
2646 * are the two "magic" shadow_l1e entries. */
2647 if ( likely((__copy_from_user(&sl1e,
2648 (sh_linear_l1_table(v)
2649 + shadow_l1_linear_offset(va)),
2650 sizeof(sl1e)) == 0)
2651 && sh_l1e_is_magic(sl1e)) )
2653 if ( sh_l1e_is_gnp(sl1e) )
2655 if ( likely(!is_hvm_domain(d) ||
2656 paging_vcpu_mode_translate(v)) )
2658 /* Not-present in a guest PT: pass to the guest as
2659 * a not-present fault (by flipping two bits). */
2660 ASSERT(regs->error_code & PFEC_page_present);
2661 regs->error_code ^= (PFEC_reserved_bit|PFEC_page_present);
2662 perfc_incrc(shadow_fault_fast_gnp);
2663 SHADOW_PRINTK("fast path not-present\n");
2664 return 0;
2666 else
2668 /* Not-present in the P2M: MMIO */
2669 gpa = va;
2672 else
2674 /* Magic MMIO marker: extract gfn for MMIO address */
2675 ASSERT(sh_l1e_is_mmio(sl1e));
2676 gpa = (((paddr_t)(gfn_x(sh_l1e_mmio_get_gfn(sl1e))))
2677 << PAGE_SHIFT)
2678 | (va & ~PAGE_MASK);
2680 perfc_incrc(shadow_fault_fast_mmio);
2681 SHADOW_PRINTK("fast path mmio %#"PRIpaddr"\n", gpa);
2682 reset_early_unshadow(v);
2683 handle_mmio(gpa);
2684 return EXCRET_fault_fixed;
2686 else
2688 /* This should be exceptionally rare: another vcpu has fixed
2689 * the tables between the fault and our reading the l1e.
2690 * Retry and let the hardware give us the right fault next time. */
2691 perfc_incrc(shadow_fault_fast_fail);
2692 SHADOW_PRINTK("fast path false alarm!\n");
2693 return EXCRET_fault_fixed;
2696 #endif /* SHOPT_FAST_FAULT_PATH */
2698 /* Detect if this page fault happened while we were already in Xen
2699 * doing a shadow operation. If that happens, the only thing we can
2700 * do is let Xen's normal fault handlers try to fix it. In any case,
2701 * a diagnostic trace of the fault will be more useful than
2702 * a BUG() when we try to take the lock again. */
2703 if ( unlikely(shadow_locked_by_me(d)) )
2705 SHADOW_ERROR("Recursive shadow fault: lock was taken by %s\n",
2706 d->arch.paging.shadow.locker_function);
2707 return 0;
2710 shadow_lock(d);
2712 shadow_audit_tables(v);
2714 if ( guest_walk_tables(v, va, &gw, 1) != 0 )
2716 SHADOW_PRINTK("malformed guest pagetable!\n");
2717 print_gw(&gw);
2720 sh_audit_gw(v, &gw);
2722 // We do not look at the gw->l1e, as that will not exist for superpages.
2723 // Instead, we use the gw->eff_l1e...
2724 //
2725 // We need not check all the levels of the guest page table entries for
2726 // present vs not-present, as the eff_l1e will always be not present if
2727 // one of the higher level entries is not present.
2728 //
2729 if ( unlikely(!(guest_l1e_get_flags(gw.eff_l1e) & _PAGE_PRESENT)) )
2731 if ( is_hvm_domain(d) && !paging_vcpu_mode_translate(v) )
2733 /* Not present in p2m map, means this is mmio */
2734 gpa = va;
2735 goto mmio;
2738 perfc_incrc(shadow_fault_bail_not_present);
2739 goto not_a_shadow_fault;
2742 // All levels of the guest page table are now known to be present.
2743 accumulated_gflags = accumulate_guest_flags(v, &gw);
2745 // Check for attempts to access supervisor-only pages from user mode,
2746 // i.e. ring 3. Such errors are not caused or dealt with by the shadow
2747 // code.
2748 //
2749 if ( (regs->error_code & PFEC_user_mode) &&
2750 !(accumulated_gflags & _PAGE_USER) )
2752 /* illegal user-mode access to supervisor-only page */
2753 perfc_incrc(shadow_fault_bail_user_supervisor);
2754 goto not_a_shadow_fault;
2757 // Was it a write fault?
2758 ft = ((regs->error_code & PFEC_write_access)
2759 ? ft_demand_write : ft_demand_read);
2760 if ( ft == ft_demand_write )
2762 if ( unlikely(!(accumulated_gflags & _PAGE_RW)) )
2764 perfc_incrc(shadow_fault_bail_ro_mapping);
2765 goto not_a_shadow_fault;
2768 else // must have been either an insn fetch or read fault
2770 // Check for NX bit violations: attempts to execute code that is
2771 // marked "do not execute". Such errors are not caused or dealt with
2772 // by the shadow code.
2773 //
2774 if ( regs->error_code & PFEC_insn_fetch )
2776 if ( accumulated_gflags & _PAGE_NX_BIT )
2778 /* NX prevented this code fetch */
2779 perfc_incrc(shadow_fault_bail_nx);
2780 goto not_a_shadow_fault;
2785 /* What mfn is the guest trying to access? */
2786 gfn = guest_l1e_get_gfn(gw.eff_l1e);
2787 gmfn = vcpu_gfn_to_mfn(v, gfn);
2788 mmio = (is_hvm_domain(d)
2789 && paging_vcpu_mode_translate(v)
2790 && mmio_space(gfn_to_paddr(gfn)));
2792 if ( !mmio && !mfn_valid(gmfn) )
2794 perfc_incrc(shadow_fault_bail_bad_gfn);
2795 SHADOW_PRINTK("BAD gfn=%"SH_PRI_gfn" gmfn=%"PRI_mfn"\n",
2796 gfn_x(gfn), mfn_x(gmfn));
2797 goto not_a_shadow_fault;
2800 /* Make sure there is enough free shadow memory to build a chain of
2801 * shadow tables: one SHADOW_MAX_ORDER chunk will always be enough
2802 * to allocate all we need. (We never allocate a top-level shadow
2803 * on this path, only a 32b l1, pae l2+1 or 64b l3+2+1) */
2804 shadow_prealloc(d, SHADOW_MAX_ORDER);
2806 /* Acquire the shadow. This must happen before we figure out the rights
2807 * for the shadow entry, since we might promote a page here. */
2808 ptr_sl1e = shadow_get_and_create_l1e(v, &gw, &sl1mfn, ft);
2809 if ( unlikely(ptr_sl1e == NULL) )
2811 /* Couldn't get the sl1e! Since we know the guest entries
2812 * are OK, this can only have been caused by a failed
2813 * shadow_set_l*e(), which will have crashed the guest.
2814 * Get out of the fault handler immediately. */
2815 ASSERT(test_bit(_DOMF_dying, &d->domain_flags));
2816 unmap_walk(v, &gw);
2817 shadow_unlock(d);
2818 return 0;
2821 /* Calculate the shadow entry and write it */
2822 l1e_propagate_from_guest(v, (gw.l1e) ? gw.l1e : &gw.eff_l1e, gw.l1mfn,
2823 gmfn, &sl1e, ft, mmio);
2824 r = shadow_set_l1e(v, ptr_sl1e, sl1e, sl1mfn);
2826 #if SHADOW_OPTIMIZATIONS & SHOPT_PREFETCH
2827 /* Prefetch some more shadow entries */
2828 sh_prefetch(v, &gw, ptr_sl1e, sl1mfn);
2829 #endif
2831 /* Need to emulate accesses to page tables */
2832 if ( sh_mfn_is_a_page_table(gmfn) )
2834 if ( ft == ft_demand_write )
2836 perfc_incrc(shadow_fault_emulate_write);
2837 goto emulate;
2839 else if ( shadow_mode_trap_reads(d) && ft == ft_demand_read )
2841 perfc_incrc(shadow_fault_emulate_read);
2842 goto emulate;
2846 if ( mmio )
2848 gpa = guest_walk_to_gpa(&gw);
2849 goto mmio;
2852 perfc_incrc(shadow_fault_fixed);
2853 d->arch.paging.shadow.fault_count++;
2854 reset_early_unshadow(v);
2856 done:
2857 sh_audit_gw(v, &gw);
2858 unmap_walk(v, &gw);
2859 SHADOW_PRINTK("fixed\n");
2860 shadow_audit_tables(v);
2861 shadow_unlock(d);
2862 return EXCRET_fault_fixed;
2864 emulate:
2865 if ( !shadow_mode_refcounts(d) || !guest_mode(regs) )
2866 goto not_a_shadow_fault;
2868 if ( is_hvm_domain(d) )
2869 hvm_store_cpu_guest_regs(v, regs, NULL);
2870 SHADOW_PRINTK("emulate: eip=%#lx\n", (unsigned long)regs->eip);
2872 emul_ops = shadow_init_emulation(&emul_ctxt, regs);
2874 /*
2875 * We do not emulate user writes. Instead we use them as a hint that the
2876 * page is no longer a page table. This behaviour differs from native, but
2877 * it seems very unlikely that any OS grants user access to page tables.
2878 */
2879 if ( (regs->error_code & PFEC_user_mode) ||
2880 x86_emulate(&emul_ctxt.ctxt, emul_ops) )
2882 SHADOW_PRINTK("emulator failure, unshadowing mfn %#lx\n",
2883 mfn_x(gmfn));
2884 perfc_incrc(shadow_fault_emulate_failed);
2885 /* If this is actually a page table, then we have a bug, and need
2886 * to support more operations in the emulator. More likely,
2887 * though, this is a hint that this page should not be shadowed. */
2888 sh_remove_shadows(v, gmfn, 0 /* thorough */, 1 /* must succeed */);
2891 /* Emulator has changed the user registers: write back */
2892 if ( is_hvm_domain(d) )
2893 hvm_load_cpu_guest_regs(v, regs);
2894 goto done;
2896 mmio:
2897 if ( !guest_mode(regs) )
2898 goto not_a_shadow_fault;
2899 perfc_incrc(shadow_fault_mmio);
2900 sh_audit_gw(v, &gw);
2901 unmap_walk(v, &gw);
2902 SHADOW_PRINTK("mmio %#"PRIpaddr"\n", gpa);
2903 shadow_audit_tables(v);
2904 reset_early_unshadow(v);
2905 shadow_unlock(d);
2906 handle_mmio(gpa);
2907 return EXCRET_fault_fixed;
2909 not_a_shadow_fault:
2910 sh_audit_gw(v, &gw);
2911 unmap_walk(v, &gw);
2912 SHADOW_PRINTK("not a shadow fault\n");
2913 shadow_audit_tables(v);
2914 reset_early_unshadow(v);
2915 shadow_unlock(d);
2916 return 0;
2920 static int
2921 sh_invlpg(struct vcpu *v, unsigned long va)
2922 /* Called when the guest requests an invlpg. Returns 1 if the invlpg
2923 * instruction should be issued on the hardware, or 0 if it's safe not
2924 * to do so. */
2926 shadow_l2e_t sl2e;
2928 perfc_incrc(shadow_invlpg);
2930 /* First check that we can safely read the shadow l2e. On SMP/PAE linux,
2931 * as many as 6% of invlpg calls can arrive before we have shadowed the
2932 * relevant l2. */
2933 #if SHADOW_PAGING_LEVELS == 4
2935 shadow_l3e_t sl3e;
2936 if ( !(shadow_l4e_get_flags(
2937 sh_linear_l4_table(v)[shadow_l4_linear_offset(va)])
2938 & _PAGE_PRESENT) )
2939 return 0;
2940 /* This must still be a copy-from-user because we don't have the
2941 * shadow lock, and the higher-level shadows might disappear
2942 * under our feet. */
2943 if ( __copy_from_user(&sl3e, (sh_linear_l3_table(v)
2944 + shadow_l3_linear_offset(va)),
2945 sizeof (sl3e)) != 0 )
2947 perfc_incrc(shadow_invlpg_fault);
2948 return 0;
2950 if ( !(shadow_l3e_get_flags(sl3e) & _PAGE_PRESENT) )
2951 return 0;
2953 #elif SHADOW_PAGING_LEVELS == 3
2954 if ( !(l3e_get_flags(v->arch.paging.shadow.l3table[shadow_l3_linear_offset(va)])
2955 & _PAGE_PRESENT) )
2956 // no need to flush anything if there's no SL2...
2957 return 0;
2958 #endif
2960 /* This must still be a copy-from-user because we don't have the shadow
2961 * lock, and the higher-level shadows might disappear under our feet. */
2962 if ( __copy_from_user(&sl2e,
2963 sh_linear_l2_table(v) + shadow_l2_linear_offset(va),
2964 sizeof (sl2e)) != 0 )
2966 perfc_incrc(shadow_invlpg_fault);
2967 return 0;
2970 // If there's nothing shadowed for this particular sl2e, then
2971 // there is no need to do an invlpg, either...
2972 //
2973 if ( !(shadow_l2e_get_flags(sl2e) & _PAGE_PRESENT) )
2974 return 0;
2976 // Check to see if the SL2 is a splintered superpage...
2977 // If so, then we'll need to flush the entire TLB (because that's
2978 // easier than invalidating all of the individual 4K pages).
2979 //
2980 if ( mfn_to_shadow_page(shadow_l2e_get_mfn(sl2e))->type
2981 == SH_type_fl1_shadow )
2983 local_flush_tlb();
2984 return 0;
2987 return 1;
2990 static unsigned long
2991 sh_gva_to_gfn(struct vcpu *v, unsigned long va)
2992 /* Called to translate a guest virtual address to what the *guest*
2993 * pagetables would map it to. */
2995 walk_t gw;
2996 gfn_t gfn;
2998 guest_walk_tables(v, va, &gw, 0);
2999 gfn = guest_walk_to_gfn(&gw);
3000 unmap_walk(v, &gw);
3002 return gfn_x(gfn);
3006 static paddr_t
3007 sh_gva_to_gpa(struct vcpu *v, unsigned long va)
3008 /* Called to translate a guest virtual address to the guest physical
3009 * address that the *guest* pagetables would map it to. */
3011 unsigned long gfn = sh_gva_to_gfn(v, va);
3012 if ( gfn == INVALID_GFN )
3013 return 0;
3014 else
3015 return (((paddr_t)gfn) << PAGE_SHIFT) + (va & ~PAGE_MASK);
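/* e.g. if sh_gva_to_gfn() maps va 0xb7654abc to gfn 0x1234, the result is
 * ((paddr_t)0x1234 << PAGE_SHIFT) + 0xabc = 0x1234abc. */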
3019 static inline void
3020 sh_update_linear_entries(struct vcpu *v)
3021 /* Sync up all the linear mappings for this vcpu's pagetables */
3023 struct domain *d = v->domain;
3025 /* Linear pagetables in PV guests
3026 * ------------------------------
3028 * Guest linear pagetables, which map the guest pages, are at
3029 * LINEAR_PT_VIRT_START. Shadow linear pagetables, which map the
3030 * shadows, are at SH_LINEAR_PT_VIRT_START. Most of the time these
3031 * are set up at shadow creation time, but (of course!) the PAE case
3032 * is subtler. Normal linear mappings are made by having an entry
3033 * in the top-level table that points to itself (shadow linear) or
3034 * to the guest top-level table (guest linear). For PAE, to set up
3035 * a linear map requires us to copy the four top-level entries into
3036 * level-2 entries. That means that every time we change a PAE l3e,
3037 * we need to reflect the change into the copy.
3039 * Linear pagetables in HVM guests
3040 * -------------------------------
3042 * For HVM guests, the linear pagetables are installed in the monitor
3043 * tables (since we can't put them in the shadow). Shadow linear
3044 * pagetables, which map the shadows, are at SH_LINEAR_PT_VIRT_START,
3045 * and we use the linear pagetable slot at LINEAR_PT_VIRT_START for
3046 * a linear pagetable of the monitor tables themselves. We have
3047 * the same issue of having to re-copy PAE l3 entries whenever we use
3048 * PAE shadows.
3050 * Because HVM guests run on the same monitor tables regardless of the
3051 * shadow tables in use, the linear mapping of the shadow tables has to
3052 * be updated every time v->arch.shadow_table changes.
3053 */
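/* For the simplest (4-on-4 HVM) case this whole function boils down to a
 * single entry write, roughly:
 *
 *     ml4e[l4_table_offset(SH_LINEAR_PT_VIRT_START)] =
 *         l4e_from_pfn(pagetable_get_pfn(v->arch.shadow_table[0]),
 *                      __PAGE_HYPERVISOR);
 *
 * as the first branch below shows.  The PAE cases are the ones that need
 * the four-entry copying described above. */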
3055 /* Don't try to update the monitor table if it doesn't exist */
3056 if ( shadow_mode_external(d)
3057 && pagetable_get_pfn(v->arch.monitor_table) == 0 )
3058 return;
3060 #if (CONFIG_PAGING_LEVELS == 4) && (SHADOW_PAGING_LEVELS == 4)
3062 /* For PV, one l4e points at the guest l4, one points at the shadow
3063 * l4. No maintenance required.
3064 * For HVM, just need to update the l4e that points to the shadow l4. */
3066 if ( shadow_mode_external(d) )
3068 /* Use the linear map if we can; otherwise make a new mapping */
3069 if ( v == current )
3071 __linear_l4_table[l4_linear_offset(SH_LINEAR_PT_VIRT_START)] =
3072 l4e_from_pfn(pagetable_get_pfn(v->arch.shadow_table[0]),
3073 __PAGE_HYPERVISOR);
3075 else
3077 l4_pgentry_t *ml4e;
3078 ml4e = sh_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
3079 ml4e[l4_table_offset(SH_LINEAR_PT_VIRT_START)] =
3080 l4e_from_pfn(pagetable_get_pfn(v->arch.shadow_table[0]),
3081 __PAGE_HYPERVISOR);
3082 sh_unmap_domain_page(ml4e);
3086 #elif (CONFIG_PAGING_LEVELS == 4) && (SHADOW_PAGING_LEVELS == 3)
3088 /* PV: XXX
3090 * HVM: To give ourselves a linear map of the shadows, we need to
3091 * extend a PAE shadow to 4 levels. We do this by having a monitor
3092 * l3 in slot 0 of the monitor l4 table, and copying the PAE l3
3093 * entries into it. Then, by having the monitor l4e for shadow
3094 * pagetables also point to the monitor l4, we can use it to access
3095 * the shadows.
3096 */
3098 if ( shadow_mode_external(d) )
3100 /* Install copies of the shadow l3es into the monitor l3 table.
3101 * The monitor l3 table is hooked into slot 0 of the monitor
3102 * l4 table, so we use l3 linear indices 0 to 3 */
3103 shadow_l3e_t *sl3e;
3104 l3_pgentry_t *ml3e;
3105 mfn_t l3mfn;
3106 int i;
3108 /* Use linear mappings if we can; otherwise make new mappings */
3109 if ( v == current )
3111 ml3e = __linear_l3_table;
3112 l3mfn = _mfn(l4e_get_pfn(__linear_l4_table[0]));
3114 else
3116 l4_pgentry_t *ml4e;
3117 ml4e = sh_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
3118 ASSERT(l4e_get_flags(ml4e[0]) & _PAGE_PRESENT);
3119 l3mfn = _mfn(l4e_get_pfn(ml4e[0]));
3120 ml3e = sh_map_domain_page(l3mfn);
3121 sh_unmap_domain_page(ml4e);
3124 /* Shadow l3 tables are made up by sh_update_cr3 */
3125 sl3e = v->arch.paging.shadow.l3table;
3127 for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
3129 ml3e[i] =
3130 (shadow_l3e_get_flags(sl3e[i]) & _PAGE_PRESENT)
3131 ? l3e_from_pfn(mfn_x(shadow_l3e_get_mfn(sl3e[i])),
3132 __PAGE_HYPERVISOR)
3133 : l3e_empty();
3136 if ( v != current )
3137 sh_unmap_domain_page(ml3e);
3139 else
3140 domain_crash(d); /* XXX */
3142 #elif CONFIG_PAGING_LEVELS == 3
3144 /* PV: need to copy the guest's l3 entries into the guest-linear-map l2
3145 * entries in the shadow, and the shadow's l3 entries into the
3146 * shadow-linear-map l2 entries in the shadow. This is safe to do
3147 * because Xen does not let guests share high-slot l2 tables between l3s,
3148 * so we know we're not treading on anyone's toes.
3150 * HVM: need to copy the shadow's l3 entries into the
3151 * shadow-linear-map l2 entries in the monitor table. This is safe
3152 * because we have one monitor table for each vcpu. The monitor's
3153 * own l3es don't need to be copied because they never change.
3154 * XXX That might change if we start stuffing things into the rest
3155 * of the monitor's virtual address space.
3156 */
3158 l2_pgentry_t *l2e, new_l2e;
3159 shadow_l3e_t *guest_l3e = NULL, *shadow_l3e;
3160 int i;
3161 int unmap_l2e = 0;
3163 #if GUEST_PAGING_LEVELS == 2
3164 /* Shadow l3 tables were built by sh_update_cr3 */
3165 if ( shadow_mode_external(d) )
3166 shadow_l3e = (shadow_l3e_t *)&v->arch.paging.shadow.l3table;
3167 else
3168 BUG(); /* PV 2-on-3 is not supported yet */
3170 #else /* GUEST_PAGING_LEVELS == 3 */
3172 shadow_l3e = (shadow_l3e_t *)&v->arch.paging.shadow.l3table;
3173 guest_l3e = (guest_l3e_t *)&v->arch.paging.shadow.gl3e;
3175 #endif /* GUEST_PAGING_LEVELS */
3177 /* Choose where to write the entries, using linear maps if possible */
3178 if ( shadow_mode_external(d) )
3180 if ( v == current )
3182 /* From the monitor tables, it's safe to use linear maps
3183 * to update monitor l2s */
3184 l2e = __linear_l2_table + (3 * L2_PAGETABLE_ENTRIES);
3186 else
3188 /* Map the monitor table's high l2 */
3189 l3_pgentry_t *l3e;
3190 l3e = sh_map_domain_page(
3191 pagetable_get_mfn(v->arch.monitor_table));
3192 ASSERT(l3e_get_flags(l3e[3]) & _PAGE_PRESENT);
3193 l2e = sh_map_domain_page(_mfn(l3e_get_pfn(l3e[3])));
3194 unmap_l2e = 1;
3195 sh_unmap_domain_page(l3e);
3198 else
3200 /* Map the shadow table's high l2 */
3201 ASSERT(shadow_l3e_get_flags(shadow_l3e[3]) & _PAGE_PRESENT);
3202 l2e = sh_map_domain_page(shadow_l3e_get_mfn(shadow_l3e[3]));
3203 unmap_l2e = 1;
3206 /* Write linear mapping of guest (only in PV, and only when
3207 * not translated). */
3208 if ( !shadow_mode_translate(d) )
3210 for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
3212 new_l2e =
3213 ((shadow_l3e_get_flags(guest_l3e[i]) & _PAGE_PRESENT)
3214 ? l2e_from_pfn(mfn_x(shadow_l3e_get_mfn(guest_l3e[i])),
3215 __PAGE_HYPERVISOR)
3216 : l2e_empty());
3217 safe_write_entry(
3218 &l2e[l2_table_offset(LINEAR_PT_VIRT_START) + i],
3219 &new_l2e);
3223 /* Write linear mapping of shadow. */
3224 for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
3226 new_l2e = (shadow_l3e_get_flags(shadow_l3e[i]) & _PAGE_PRESENT)
3227 ? l2e_from_pfn(mfn_x(shadow_l3e_get_mfn(shadow_l3e[i])),
3228 __PAGE_HYPERVISOR)
3229 : l2e_empty();
3230 safe_write_entry(
3231 &l2e[l2_table_offset(SH_LINEAR_PT_VIRT_START) + i],
3232 &new_l2e);
3235 if ( unmap_l2e )
3236 sh_unmap_domain_page(l2e);
3239 #elif CONFIG_PAGING_LEVELS == 2
3241 /* For PV, one l2e points at the guest l2, one points at the shadow
3242 * l2. No maintenance required.
3243 * For HVM, just need to update the l2e that points to the shadow l2. */
3245 if ( shadow_mode_external(d) )
3247 /* Use the linear map if we can; otherwise make a new mapping */
3248 if ( v == current )
3250 __linear_l2_table[l2_linear_offset(SH_LINEAR_PT_VIRT_START)] =
3251 l2e_from_pfn(pagetable_get_pfn(v->arch.shadow_table[0]),
3252 __PAGE_HYPERVISOR);
3254 else
3256 l2_pgentry_t *ml2e;
3257 ml2e = sh_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
3258 ml2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
3259 l2e_from_pfn(pagetable_get_pfn(v->arch.shadow_table[0]),
3260 __PAGE_HYPERVISOR);
3261 sh_unmap_domain_page(ml2e);
3265 #else
3266 #error this should not happen
3267 #endif
3271 /* Removes vcpu->arch.paging.shadow.guest_vtable and vcpu->arch.shadow_table[].
3272 * Does all appropriate management/bookkeeping/refcounting/etc...
3273 */
3274 static void
3275 sh_detach_old_tables(struct vcpu *v)
3277 mfn_t smfn;
3278 int i = 0;
3280 ////
3281 //// vcpu->arch.paging.shadow.guest_vtable
3282 ////
3284 #if GUEST_PAGING_LEVELS == 3
3285 /* PAE guests don't have a mapping of the guest top-level table */
3286 ASSERT(v->arch.paging.shadow.guest_vtable == NULL);
3287 #else
3288 if ( v->arch.paging.shadow.guest_vtable )
3290 struct domain *d = v->domain;
3291 if ( shadow_mode_external(d) || shadow_mode_translate(d) )
3292 sh_unmap_domain_page_global(v->arch.paging.shadow.guest_vtable);
3293 v->arch.paging.shadow.guest_vtable = NULL;
3295 #endif
3298 ////
3299 //// vcpu->arch.shadow_table[]
3300 ////
3302 #if GUEST_PAGING_LEVELS == 3
3303 /* PAE guests have four shadow_table entries */
3304 for ( i = 0 ; i < 4 ; i++ )
3305 #endif
3307 smfn = pagetable_get_mfn(v->arch.shadow_table[i]);
3308 if ( mfn_x(smfn) )
3309 sh_put_ref(v, smfn, 0);
3310 v->arch.shadow_table[i] = pagetable_null();
3314 /* Set up the top-level shadow and install it in slot 'slot' of shadow_table */
3315 static void
3316 sh_set_toplevel_shadow(struct vcpu *v,
3317 int slot,
3318 mfn_t gmfn,
3319 unsigned int root_type)
3321 mfn_t smfn;
3322 pagetable_t old_entry, new_entry;
3324 struct domain *d = v->domain;
3326 /* Remember the old contents of this slot */
3327 old_entry = v->arch.shadow_table[slot];
3329 /* Now figure out the new contents: is this a valid guest MFN? */
3330 if ( !mfn_valid(gmfn) )
3332 new_entry = pagetable_null();
3333 goto install_new_entry;
3336 /* Guest mfn is valid: shadow it and install the shadow */
3337 smfn = get_shadow_status(v, gmfn, root_type);
3338 if ( !mfn_valid(smfn) )
3340 /* Make sure there's enough free shadow memory. */
3341 shadow_prealloc(d, SHADOW_MAX_ORDER);
3342 /* Shadow the page. */
3343 smfn = sh_make_shadow(v, gmfn, root_type);
3345 ASSERT(mfn_valid(smfn));
3347 #if SHADOW_OPTIMIZATIONS & SHOPT_EARLY_UNSHADOW
3348 /* Once again OK to unhook entries from this table if we see fork/exit */
3349 ASSERT(sh_mfn_is_a_page_table(gmfn));
3350 mfn_to_page(gmfn)->shadow_flags &= ~SHF_unhooked_mappings;
3351 #endif
3353 /* Pin the shadow and put it (back) on the list of top-level shadows */
3354 if ( sh_pin(v, smfn) == 0 )
3356 SHADOW_ERROR("can't pin %#lx as toplevel shadow\n", mfn_x(smfn));
3357 domain_crash(v->domain);
3360 /* Take a ref to this page: it will be released in sh_detach_old_tables()
3361 * or the next call to set_toplevel_shadow() */
3362 if ( !sh_get_ref(v, smfn, 0) )
3364 SHADOW_ERROR("can't install %#lx as toplevel shadow\n", mfn_x(smfn));
3365 domain_crash(v->domain);
3368 new_entry = pagetable_from_mfn(smfn);
3370 install_new_entry:
3371 /* Done. Install it */
3372 SHADOW_PRINTK("%u/%u [%u] gmfn %#"PRI_mfn" smfn %#"PRI_mfn"\n",
3373 GUEST_PAGING_LEVELS, SHADOW_PAGING_LEVELS, slot,
3374 mfn_x(gmfn), mfn_x(pagetable_get_mfn(new_entry)));
3375 v->arch.shadow_table[slot] = new_entry;
3377 /* Decrement the refcount of the old contents of this slot */
3378 if ( !pagetable_is_null(old_entry) )
3379 sh_put_ref(v, pagetable_get_mfn(old_entry), 0);
3383 static void
3384 sh_update_cr3(struct vcpu *v, int do_locking)
3385 /* Updates vcpu->arch.cr3 after the guest has changed CR3.
3386 * Paravirtual guests should set v->arch.guest_table (and guest_table_user,
3387 * if appropriate).
3388 * HVM guests should also make sure hvm_get_guest_ctrl_reg(v, 3) works;
3389 * this function will call hvm_update_guest_cr3() to tell them where the
3390 * shadow tables are.
3391 * If do_locking != 0, assume we are being called from outside the
3392 * shadow code, and must take and release the shadow lock; otherwise
3393 * that is the caller's responsibility.
3394 */
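/* In outline, the steps below are: refresh our view of the guest's top
 * level (guest_vtable, or the cached PAE gl3es), revoke write access to
 * the new top-level guest page(s), install the new shadow_table[] entries
 * via sh_set_toplevel_shadow(), rebuild the special PAE l3table if we are
 * using 3-level shadows, and finally update v->arch.cr3 (and, for HVM,
 * v->arch.hvm_vcpu.hw_cr3) to point at the result. */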
3396 struct domain *d = v->domain;
3397 mfn_t gmfn;
3398 #if GUEST_PAGING_LEVELS == 3
3399 guest_l3e_t *gl3e;
3400 u32 guest_idx=0;
3401 int i;
3402 #endif
3404 /* Don't do anything on an uninitialised vcpu */
3405 if ( !is_hvm_domain(d) && !test_bit(_VCPUF_initialised, &v->vcpu_flags) )
3407 ASSERT(v->arch.cr3 == 0);
3408 return;
3411 if ( do_locking ) shadow_lock(v->domain);
3413 ASSERT(shadow_locked_by_me(v->domain));
3414 ASSERT(v->arch.paging.mode);
3416 ////
3417 //// vcpu->arch.guest_table is already set
3418 ////
3420 #ifndef NDEBUG
3421 /* Double-check that the HVM code has sent us a sane guest_table */
3422 if ( is_hvm_domain(d) )
3424 gfn_t gfn;
3426 ASSERT(shadow_mode_external(d));
3428 // Is paging enabled on this vcpu?
3429 if ( paging_vcpu_mode_translate(v) )
3431 gfn = _gfn(paddr_to_pfn(hvm_get_guest_ctrl_reg(v, 3)));
3432 gmfn = vcpu_gfn_to_mfn(v, gfn);
3433 ASSERT(mfn_valid(gmfn));
3434 ASSERT(pagetable_get_pfn(v->arch.guest_table) == mfn_x(gmfn));
3436 else
3438 /* Paging disabled: guest_table points at (part of) p2m */
3439 #if SHADOW_PAGING_LEVELS != 3 /* in 3-on-4, guest-table is in slot 0 of p2m */
3440 /* For everything else, they should be the same */
3441 ASSERT(v->arch.guest_table.pfn == d->arch.phys_table.pfn);
3442 #endif
3445 #endif
3447 SHADOW_PRINTK("d=%u v=%u guest_table=%05lx\n",
3448 d->domain_id, v->vcpu_id,
3449 (unsigned long)pagetable_get_pfn(v->arch.guest_table));
3451 #if GUEST_PAGING_LEVELS == 4
3452 if ( !(v->arch.flags & TF_kernel_mode) && !IS_COMPAT(v->domain) )
3453 gmfn = pagetable_get_mfn(v->arch.guest_table_user);
3454 else
3455 #endif
3456 gmfn = pagetable_get_mfn(v->arch.guest_table);
3459 ////
3460 //// vcpu->arch.paging.shadow.guest_vtable
3461 ////
3462 #if GUEST_PAGING_LEVELS == 4
3463 if ( shadow_mode_external(d) || shadow_mode_translate(d) )
3465 if ( v->arch.paging.shadow.guest_vtable )
3466 sh_unmap_domain_page_global(v->arch.paging.shadow.guest_vtable);
3467 v->arch.paging.shadow.guest_vtable = sh_map_domain_page_global(gmfn);
3469 else
3470 v->arch.paging.shadow.guest_vtable = __linear_l4_table;
3471 #elif GUEST_PAGING_LEVELS == 3
3472 /* On PAE guests we don't use a mapping of the guest's own top-level
3473 * table. We cache the current state of that table and shadow that,
3474 * until the next CR3 write makes us refresh our cache. */
3475 ASSERT(v->arch.paging.shadow.guest_vtable == NULL);
3477 if ( shadow_mode_external(d) && paging_vcpu_mode_translate(v) )
3478 /* Paging enabled: find where in the page the l3 table is */
3479 guest_idx = guest_index((void *)hvm_get_guest_ctrl_reg(v, 3));
3480 else
3481 /* Paging disabled or PV: l3 is at the start of a page */
3482 guest_idx = 0;
3484 // Ignore the low 2 bits of guest_idx -- they are really just
3485 // cache control.
3486 guest_idx &= ~3;
3488 gl3e = ((guest_l3e_t *)sh_map_domain_page(gmfn)) + guest_idx;
3489 for ( i = 0; i < 4 ; i++ )
3490 v->arch.paging.shadow.gl3e[i] = gl3e[i];
3491 sh_unmap_domain_page(gl3e);
3492 #elif GUEST_PAGING_LEVELS == 2
3493 if ( shadow_mode_external(d) || shadow_mode_translate(d) )
3495 if ( v->arch.paging.shadow.guest_vtable )
3496 sh_unmap_domain_page_global(v->arch.paging.shadow.guest_vtable);
3497 v->arch.paging.shadow.guest_vtable = sh_map_domain_page_global(gmfn);
3499 else
3500 v->arch.paging.shadow.guest_vtable = __linear_l2_table;
3501 #else
3502 #error this should never happen
3503 #endif
3505 #if 0
3506 printk("%s %s %d gmfn=%05lx shadow.guest_vtable=%p\n",
3507 __func__, __FILE__, __LINE__, gmfn, v->arch.paging.shadow.guest_vtable);
3508 #endif
3510 ////
3511 //// vcpu->arch.shadow_table[]
3512 ////
3514 /* We revoke write access to the new guest toplevel page(s) before we
3515 * replace the old shadow pagetable(s), so that we can safely use the
3516 * (old) shadow linear maps in the writeable mapping heuristics. */
3517 #if GUEST_PAGING_LEVELS == 2
3518 if ( sh_remove_write_access(v, gmfn, 2, 0) != 0 )
3519 flush_tlb_mask(v->domain->domain_dirty_cpumask);
3520 sh_set_toplevel_shadow(v, 0, gmfn, SH_type_l2_shadow);
3521 #elif GUEST_PAGING_LEVELS == 3
3522 /* PAE guests have four shadow_table entries, based on the
3523 * current values of the guest's four l3es. */
3525 int flush = 0;
3526 gfn_t gl2gfn;
3527 mfn_t gl2mfn;
3528 guest_l3e_t *gl3e = (guest_l3e_t*)&v->arch.paging.shadow.gl3e;
3529 /* First, make all four entries read-only. */
3530 for ( i = 0; i < 4; i++ )
3532 if ( guest_l3e_get_flags(gl3e[i]) & _PAGE_PRESENT )
3534 gl2gfn = guest_l3e_get_gfn(gl3e[i]);
3535 gl2mfn = vcpu_gfn_to_mfn(v, gl2gfn);
3536 flush |= sh_remove_write_access(v, gl2mfn, 2, 0);
3539 if ( flush )
3540 flush_tlb_mask(v->domain->domain_dirty_cpumask);
3541 /* Now install the new shadows. */
3542 for ( i = 0; i < 4; i++ )
3544 if ( guest_l3e_get_flags(gl3e[i]) & _PAGE_PRESENT )
3546 gl2gfn = guest_l3e_get_gfn(gl3e[i]);
3547 gl2mfn = vcpu_gfn_to_mfn(v, gl2gfn);
3548 sh_set_toplevel_shadow(v, i, gl2mfn, (i == 3)
3549 ? SH_type_l2h_shadow
3550 : SH_type_l2_shadow);
3552 else
3553 /* The guest is not present: clear out the shadow. */
3554 sh_set_toplevel_shadow(v, i, _mfn(INVALID_MFN), 0);
3557 #elif GUEST_PAGING_LEVELS == 4
3558 if ( sh_remove_write_access(v, gmfn, 4, 0) != 0 )
3559 flush_tlb_mask(v->domain->domain_dirty_cpumask);
3560 sh_set_toplevel_shadow(v, 0, gmfn, SH_type_l4_shadow);
3561 #else
3562 #error This should never happen
3563 #endif
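/* v->arch.shadow_table[] now refers to the new toplevel shadow(s) for the
 * guest table(s) chosen above. */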
3565 #if (CONFIG_PAGING_LEVELS == 3) && (GUEST_PAGING_LEVELS == 3)
3566 #endif
3568 ///
3569 /// v->arch.paging.shadow.l3table
3570 ///
3571 #if SHADOW_PAGING_LEVELS == 3
3573 mfn_t smfn;
3574 int i;
3575 for ( i = 0; i < 4; i++ )
3577 #if GUEST_PAGING_LEVELS == 2
3578 /* 2-on-3: make a PAE l3 that points at the four-page l2 */
3579 smfn = _mfn(pagetable_get_pfn(v->arch.shadow_table[0]) + i);
3580 #else
3581 /* 3-on-3: make a PAE l3 that points at the four l2 pages */
3582 smfn = pagetable_get_mfn(v->arch.shadow_table[i]);
3583 #endif
3584 v->arch.paging.shadow.l3table[i] =
3585 (mfn_x(smfn) == 0)
3586 ? shadow_l3e_empty()
3587 : shadow_l3e_from_mfn(smfn, _PAGE_PRESENT);
3590 #endif /* SHADOW_PAGING_LEVELS == 3 */
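/* v->arch.paging.shadow.l3table is now a complete PAE top level: four l3es,
 * each either empty or pointing at one of the l2 shadow pages installed
 * above. */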
3593 ///
3594 /// v->arch.cr3
3595 ///
3596 if ( shadow_mode_external(d) )
3598 make_cr3(v, pagetable_get_pfn(v->arch.monitor_table));
3600 else // not shadow_mode_external...
3602 /* We don't support PV except guest == shadow == config levels */
3603 BUG_ON(GUEST_PAGING_LEVELS != SHADOW_PAGING_LEVELS);
3604 #if SHADOW_PAGING_LEVELS == 3
3605 /* 2-on-3 or 3-on-3: Use the PAE shadow l3 table we just fabricated.
3606 * Don't use make_cr3 because (a) we know it's below 4GB, and
3607 * (b) it's not necessarily page-aligned, and make_cr3 takes a pfn */
3608 ASSERT(virt_to_maddr(&v->arch.paging.shadow.l3table) <= 0xffffffe0ULL);
3609 v->arch.cr3 = virt_to_maddr(&v->arch.paging.shadow.l3table);
3610 #else
3611 /* 2-on-2 or 4-on-4: Just use the shadow top-level directly */
3612 make_cr3(v, pagetable_get_pfn(v->arch.shadow_table[0]));
3613 #endif
3617 ///
3618 /// v->arch.hvm_vcpu.hw_cr3
3619 ///
3620 if ( shadow_mode_external(d) )
3622 ASSERT(is_hvm_domain(d));
3623 #if SHADOW_PAGING_LEVELS == 3
3624 /* 2-on-3 or 3-on-3: Use the PAE shadow l3 table we just fabricated */
3625 hvm_update_guest_cr3(v, virt_to_maddr(&v->arch.paging.shadow.l3table));
3626 #else
3627 /* 2-on-2 or 4-on-4: Just use the shadow top-level directly */
3628 hvm_update_guest_cr3(v, pagetable_get_paddr(v->arch.shadow_table[0]));
3629 #endif
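/* For an external-mode (HVM) guest two CR3 values are kept: v->arch.cr3
 * (set above to the monitor table) is what Xen itself uses for this vcpu,
 * while hw_cr3 is the value the HVM code loads as the hardware CR3 when
 * the guest runs, pointing at the new shadows. */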
3632 /* Fix up the linear pagetable mappings */
3633 sh_update_linear_entries(v);
3635 /* Release the lock, if we took it (otherwise it's the caller's problem) */
3636 if ( do_locking ) shadow_unlock(v->domain);
3640 /**************************************************************************/
3641 /* Functions to revoke guest rights */
3643 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
3644 static int sh_guess_wrmap(struct vcpu *v, unsigned long vaddr, mfn_t gmfn)
3645 /* Look up this vaddr in the current shadow and see if it's a writeable
3646 * mapping of this gmfn. If so, remove it. Returns 1 if it worked. */
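/* This is the cheap path of the writeable-mapping heuristic: the caller
 * guesses a linear address that might map gmfn, and we fix up just that
 * one shadow l1e instead of scanning every shadow of the frame. */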
3648 shadow_l1e_t sl1e, *sl1p;
3649 shadow_l2e_t *sl2p;
3650 #if SHADOW_PAGING_LEVELS >= 3
3651 shadow_l3e_t *sl3p;
3652 #if SHADOW_PAGING_LEVELS >= 4
3653 shadow_l4e_t *sl4p;
3654 #endif
3655 #endif
3656 mfn_t sl1mfn;
3657 int r;
3659 /* Carefully look in the shadow linear map for the l1e we expect */
3660 #if SHADOW_PAGING_LEVELS >= 4
3661 sl4p = sh_linear_l4_table(v) + shadow_l4_linear_offset(vaddr);
3662 if ( !(shadow_l4e_get_flags(*sl4p) & _PAGE_PRESENT) )
3663 return 0;
3664 sl3p = sh_linear_l3_table(v) + shadow_l3_linear_offset(vaddr);
3665 if ( !(shadow_l3e_get_flags(*sl3p) & _PAGE_PRESENT) )
3666 return 0;
3667 #elif SHADOW_PAGING_LEVELS == 3
3668 sl3p = ((shadow_l3e_t *) v->arch.paging.shadow.l3table)
3669 + shadow_l3_linear_offset(vaddr);
3670 if ( !(shadow_l3e_get_flags(*sl3p) & _PAGE_PRESENT) )
3671 return 0;
3672 #endif
3673 sl2p = sh_linear_l2_table(v) + shadow_l2_linear_offset(vaddr);
3674 if ( !(shadow_l2e_get_flags(*sl2p) & _PAGE_PRESENT) )
3675 return 0;
3676 sl1p = sh_linear_l1_table(v) + shadow_l1_linear_offset(vaddr);
3677 sl1e = *sl1p;
3678 if ( ((shadow_l1e_get_flags(sl1e) & (_PAGE_PRESENT|_PAGE_RW))
3679 != (_PAGE_PRESENT|_PAGE_RW))
3680 || (mfn_x(shadow_l1e_get_mfn(sl1e)) != mfn_x(gmfn)) )
3681 return 0;
3683 /* Found it! Need to remove its write permissions. */
3684 sl1mfn = shadow_l2e_get_mfn(*sl2p);
3685 sl1e = shadow_l1e_remove_flags(sl1e, _PAGE_RW);
3686 r = shadow_set_l1e(v, sl1p, sl1e, sl1mfn);
3687 ASSERT( !(r & SHADOW_SET_ERROR) );
3688 return 1;
3690 #endif
3692 int sh_rm_write_access_from_l1(struct vcpu *v, mfn_t sl1mfn,
3693 mfn_t readonly_mfn)
3694 /* Excises all writeable mappings to readonly_mfn from this l1 shadow table */
3696 shadow_l1e_t *sl1e;
3697 int done = 0;
3698 int flags;
3699 mfn_t base_sl1mfn = sl1mfn; /* Because sl1mfn changes in the foreach */
3701 SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, done,
3703 flags = shadow_l1e_get_flags(*sl1e);
3704 if ( (flags & _PAGE_PRESENT)
3705 && (flags & _PAGE_RW)
3706 && (mfn_x(shadow_l1e_get_mfn(*sl1e)) == mfn_x(readonly_mfn)) )
3708 shadow_l1e_t ro_sl1e = shadow_l1e_remove_flags(*sl1e, _PAGE_RW);
3709 (void) shadow_set_l1e(v, sl1e, ro_sl1e, sl1mfn);
3710 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
3711 /* Remember the last shadow that we shot a writeable mapping in */
3712 v->arch.paging.shadow.last_writeable_pte_smfn = mfn_x(base_sl1mfn);
3713 #endif
3714 if ( (mfn_to_page(readonly_mfn)->u.inuse.type_info
3715 & PGT_count_mask) == 0 )
3716 /* This breaks us cleanly out of the FOREACH macro */
3717 done = 1;
3719 });
3720 return done;
3724 int sh_rm_mappings_from_l1(struct vcpu *v, mfn_t sl1mfn, mfn_t target_mfn)
3725 /* Excises all mappings to guest frame from this shadow l1 table */
3727 shadow_l1e_t *sl1e;
3728 int done = 0;
3729 int flags;
3731 SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, done,
3733 flags = shadow_l1e_get_flags(*sl1e);
3734 if ( (flags & _PAGE_PRESENT)
3735 && (mfn_x(shadow_l1e_get_mfn(*sl1e)) == mfn_x(target_mfn)) )
3737 (void) shadow_set_l1e(v, sl1e, shadow_l1e_empty(), sl1mfn);
3738 if ( (mfn_to_page(target_mfn)->count_info & PGC_count_mask) == 0 )
3739 /* This breaks us cleanly out of the FOREACH macro */
3740 done = 1;
3742 });
3743 return done;
3746 /**************************************************************************/
3747 /* Functions to excise all pointers to shadows from higher-level shadows. */
3749 void sh_clear_shadow_entry(struct vcpu *v, void *ep, mfn_t smfn)
3750 /* Blank out a single shadow entry */
3752 switch ( mfn_to_shadow_page(smfn)->type )
3754 case SH_type_l1_shadow:
3755 (void) shadow_set_l1e(v, ep, shadow_l1e_empty(), smfn); break;
3756 case SH_type_l2_shadow:
3757 #if GUEST_PAGING_LEVELS == 3
3758 case SH_type_l2h_shadow:
3759 #endif
3760 (void) shadow_set_l2e(v, ep, shadow_l2e_empty(), smfn); break;
3761 #if GUEST_PAGING_LEVELS >= 4
3762 case SH_type_l3_shadow:
3763 (void) shadow_set_l3e(v, ep, shadow_l3e_empty(), smfn); break;
3764 case SH_type_l4_shadow:
3765 (void) shadow_set_l4e(v, ep, shadow_l4e_empty(), smfn); break;
3766 #endif
3767 default: BUG(); /* Called with the wrong kind of shadow. */
3771 int sh_remove_l1_shadow(struct vcpu *v, mfn_t sl2mfn, mfn_t sl1mfn)
3772 /* Remove all mappings of this l1 shadow from this l2 shadow */
3774 shadow_l2e_t *sl2e;
3775 int done = 0;
3776 int flags;
3777 #if GUEST_PAGING_LEVELS != 4
3778 int xen_mappings = !shadow_mode_external(v->domain);
3779 #endif
3781 SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, done, xen_mappings,
3783 flags = shadow_l2e_get_flags(*sl2e);
3784 if ( (flags & _PAGE_PRESENT)
3785 && (mfn_x(shadow_l2e_get_mfn(*sl2e)) == mfn_x(sl1mfn)) )
3787 (void) shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn);
3788 if ( mfn_to_shadow_page(sl1mfn)->type == 0 )
3789 /* This breaks us cleanly out of the FOREACH macro */
3790 done = 1;
3792 });
3793 return done;
3796 #if GUEST_PAGING_LEVELS >= 4
3797 int sh_remove_l2_shadow(struct vcpu *v, mfn_t sl3mfn, mfn_t sl2mfn)
3798 /* Remove all mappings of this l2 shadow from this l3 shadow */
3800 shadow_l3e_t *sl3e;
3801 int done = 0;
3802 int flags;
3804 SHADOW_FOREACH_L3E(sl3mfn, sl3e, 0, done,
3806 flags = shadow_l3e_get_flags(*sl3e);
3807 if ( (flags & _PAGE_PRESENT)
3808 && (mfn_x(shadow_l3e_get_mfn(*sl3e)) == mfn_x(sl2mfn)) )
3810 (void) shadow_set_l3e(v, sl3e, shadow_l3e_empty(), sl3mfn);
3811 if ( mfn_to_shadow_page(sl2mfn)->type == 0 )
3812 /* This breaks us cleanly out of the FOREACH macro */
3813 done = 1;
3815 });
3816 return done;
3819 int sh_remove_l3_shadow(struct vcpu *v, mfn_t sl4mfn, mfn_t sl3mfn)
3820 /* Remove all mappings of this l3 shadow from this l4 shadow */
3822 shadow_l4e_t *sl4e;
3823 int done = 0;
3824 int flags, xen_mappings = !shadow_mode_external(v->domain);
3826 SHADOW_FOREACH_L4E(sl4mfn, sl4e, 0, done, xen_mappings,
3828 flags = shadow_l4e_get_flags(*sl4e);
3829 if ( (flags & _PAGE_PRESENT)
3830 && (mfn_x(shadow_l4e_get_mfn(*sl4e)) == mfn_x(sl3mfn)) )
3832 (void) shadow_set_l4e(v, sl4e, shadow_l4e_empty(), sl4mfn);
3833 if ( mfn_to_shadow_page(sl3mfn)->type == 0 )
3834 /* This breaks us cleanly out of the FOREACH macro */
3835 done = 1;
3837 });
3838 return done;
3840 #endif /* 64bit guest */
3842 /**************************************************************************/
3843 /* Handling HVM guest writes to pagetables */
3845 /* Check that the user is allowed to perform this write.
3846 * Returns a mapped pointer to write to, and the mfn it's on,
3847 * or NULL for error. */
3848 static inline void * emulate_map_dest(struct vcpu *v,
3849 unsigned long vaddr,
3850 struct sh_emulate_ctxt *sh_ctxt,
3851 mfn_t *mfnp)
3853 walk_t gw;
3854 u32 flags, errcode;
3855 gfn_t gfn;
3856 mfn_t mfn;
3858 guest_walk_tables(v, vaddr, &gw, 1);
3859 flags = accumulate_guest_flags(v, &gw);
3860 gfn = guest_l1e_get_gfn(gw.eff_l1e);
3861 mfn = vcpu_gfn_to_mfn(v, gfn);
3862 sh_audit_gw(v, &gw);
3863 unmap_walk(v, &gw);
3865 if ( !(flags & _PAGE_PRESENT) )
3867 errcode = 0;
3868 goto page_fault;
3871 if ( !(flags & _PAGE_RW) ||
3872 (!(flags & _PAGE_USER) && ring_3(sh_ctxt->ctxt.regs)) )
3874 errcode = PFEC_page_present;
3875 goto page_fault;
3878 if ( !mfn_valid(mfn) )
3879 return NULL;
3881 *mfnp = mfn;
3882 return sh_map_domain_page(mfn) + (vaddr & ~PAGE_MASK);
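/* The caller is responsible for sh_unmap_domain_page()ing this pointer
 * once the emulated write has been performed. */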
3884 page_fault:
3885 errcode |= PFEC_write_access;
3886 if ( is_hvm_vcpu(v) )
3887 hvm_inject_exception(TRAP_page_fault, errcode, vaddr);
3888 else
3889 propagate_page_fault(vaddr, errcode);
3890 return NULL;
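/* Decide whether an emulated write can skip revalidation of the shadows:
 * returns 1 if the affected entry has its present bit clear both before
 * and after the write, so no shadow entry can be changed by it. */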
3893 static int safe_not_to_verify_write(mfn_t gmfn, void *dst, void *src,
3894 int bytes)
3896 #if (SHADOW_OPTIMIZATIONS & SHOPT_SKIP_VERIFY)
3897 struct page_info *pg = mfn_to_page(gmfn);
3898 if ( !(pg->shadow_flags & SHF_32)
3899 && bytes == 4
3900 && ((unsigned long)dst & 3) == 0 )
3902 /* Not shadowed as a 32-bit pagetable, so its entries are 64 bits wide:
3903 * aligned 4-byte writes that leave the present bit clear in the old and new values are safe to ignore. */
3904 if ( (*(u64*)src & _PAGE_PRESENT) == 0
3905 && (*(u64*)dst & _PAGE_PRESENT) == 0 )
3906 return 1;
3908 else if ( !(pg->shadow_flags & (SHF_PAE|SHF_64))
3909 && bytes == 8
3910 && ((unsigned long)dst & 7) == 0 )
3912 /* Not shadowed as a PAE/64-bit pagetable, so its entries are 32 bits wide:
3913 * aligned 8-byte writes that leave the present bit clear in the old and new values are safe to ignore. */
3914 if ( (*(u32*)src & _PAGE_PRESENT) == 0
3915 && (*(u32*)dst & _PAGE_PRESENT) == 0 )
3916 return 1;
3918 #endif
3919 return 0;
3923 int
3924 sh_x86_emulate_write(struct vcpu *v, unsigned long vaddr, void *src,
3925 u32 bytes, struct sh_emulate_ctxt *sh_ctxt)
3927 mfn_t mfn;
3928 void *addr;
3929 int skip;
3931 if ( vaddr & (bytes-1) )
3932 return X86EMUL_UNHANDLEABLE;
3934 ASSERT(shadow_locked_by_me(v->domain));
3935 ASSERT(((vaddr & ~PAGE_MASK) + bytes) <= PAGE_SIZE);
3937 if ( (addr = emulate_map_dest(v, vaddr, sh_ctxt, &mfn)) == NULL )
3938 return X86EMUL_PROPAGATE_FAULT;
3940 skip = safe_not_to_verify_write(mfn, addr, src, bytes);
3941 memcpy(addr, src, bytes);
3942 if ( !skip ) sh_validate_guest_pt_write(v, mfn, addr, bytes);
3944 /* If we are writing zeros to this page, might want to unshadow */
3945 if ( likely(bytes >= 4) && (*(u32 *)addr == 0) && is_lo_pte(vaddr) )
3946 check_for_early_unshadow(v, mfn);
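/* The early-unshadow heuristic treats a guest writing zeros into a
 * shadowed pagetable as a hint that the page is being recycled, so its
 * shadows can be torn down now rather than emulating every later write. */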
3948 sh_unmap_domain_page(addr);
3949 shadow_audit_tables(v);
3950 return X86EMUL_CONTINUE;
3953 int
3954 sh_x86_emulate_cmpxchg(struct vcpu *v, unsigned long vaddr,
3955 unsigned long old, unsigned long new,
3956 unsigned int bytes, struct sh_emulate_ctxt *sh_ctxt)
3958 mfn_t mfn;
3959 void *addr;
3960 unsigned long prev;
3961 int rv = X86EMUL_CONTINUE, skip;
3963 ASSERT(shadow_locked_by_me(v->domain));
3964 ASSERT(bytes <= sizeof(unsigned long));
3966 if ( vaddr & (bytes-1) )
3967 return X86EMUL_UNHANDLEABLE;
3969 if ( (addr = emulate_map_dest(v, vaddr, sh_ctxt, &mfn)) == NULL )
3970 return X86EMUL_PROPAGATE_FAULT;
3972 skip = safe_not_to_verify_write(mfn, &new, &old, bytes);
3974 switch ( bytes )
3976 case 1: prev = cmpxchg(((u8 *)addr), old, new); break;
3977 case 2: prev = cmpxchg(((u16 *)addr), old, new); break;
3978 case 4: prev = cmpxchg(((u32 *)addr), old, new); break;
3979 case 8: prev = cmpxchg(((u64 *)addr), old, new); break;
3980 default:
3981 SHADOW_PRINTK("cmpxchg of size %i is not supported\n", bytes);
3982 prev = ~old;
3985 if ( prev == old )
3987 if ( !skip ) sh_validate_guest_pt_write(v, mfn, addr, bytes);
3989 else
3990 rv = X86EMUL_CMPXCHG_FAILED;
3992 SHADOW_DEBUG(EMULATE, "va %#lx was %#lx expected %#lx"
3993 " wanted %#lx now %#lx bytes %u\n",
3994 vaddr, prev, old, new, *(unsigned long *)addr, bytes);
3996 /* If we are writing zeros to this page, might want to unshadow */
3997 if ( likely(bytes >= 4) && (*(u32 *)addr == 0) && is_lo_pte(vaddr) )
3998 check_for_early_unshadow(v, mfn);
4000 sh_unmap_domain_page(addr);
4001 shadow_audit_tables(v);
4002 return rv;
4005 int
4006 sh_x86_emulate_cmpxchg8b(struct vcpu *v, unsigned long vaddr,
4007 unsigned long old_lo, unsigned long old_hi,
4008 unsigned long new_lo, unsigned long new_hi,
4009 struct sh_emulate_ctxt *sh_ctxt)
4011 mfn_t mfn;
4012 void *addr;
4013 u64 old, new, prev;
4014 int rv = X86EMUL_CONTINUE, skip;
4016 ASSERT(shadow_locked_by_me(v->domain));
4018 if ( vaddr & 7 )
4019 return X86EMUL_UNHANDLEABLE;
4021 if ( (addr = emulate_map_dest(v, vaddr, sh_ctxt, &mfn)) == NULL )
4022 return X86EMUL_PROPAGATE_FAULT;
4024 old = (((u64) old_hi) << 32) | (u64) old_lo;
4025 new = (((u64) new_hi) << 32) | (u64) new_lo;
4026 skip = safe_not_to_verify_write(mfn, &new, &old, 8);
4027 prev = cmpxchg(((u64 *)addr), old, new);
4029 if ( prev == old )
4031 if ( !skip ) sh_validate_guest_pt_write(v, mfn, addr, 8);
4033 else
4034 rv = X86EMUL_CMPXCHG_FAILED;
4036 /* If we are writing zeros to this page, might want to unshadow */
4037 if ( *(u32 *)addr == 0 )
4038 check_for_early_unshadow(v, mfn);
4040 sh_unmap_domain_page(addr);
4041 shadow_audit_tables(v);
4042 return rv;
4046 /**************************************************************************/
4047 /* Audit tools */
4049 #if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES
4051 #define AUDIT_FAIL(_level, _fmt, _a...) do { \
4052 printk("Shadow %u-on-%u audit failed at level %i, index %i\n" \
4053 "gl" #_level "mfn = %" PRI_mfn \
4054 " sl" #_level "mfn = %" PRI_mfn \
4055 " &gl" #_level "e = %p &sl" #_level "e = %p" \
4056 " gl" #_level "e = %" SH_PRI_gpte \
4057 " sl" #_level "e = %" SH_PRI_pte "\nError: " _fmt "\n", \
4058 GUEST_PAGING_LEVELS, SHADOW_PAGING_LEVELS, \
4059 _level, guest_index(gl ## _level ## e), \
4060 mfn_x(gl ## _level ## mfn), mfn_x(sl ## _level ## mfn), \
4061 gl ## _level ## e, sl ## _level ## e, \
4062 gl ## _level ## e->l ## _level, sl ## _level ## e->l ## _level, \
4063 ##_a); \
4064 BUG(); \
4065 done = 1; \
4066 } while (0)
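/* AUDIT_FAIL expects the enclosing audit function to provide local
 * variables named gl<N>e, sl<N>e, gl<N>mfn, sl<N>mfn and done for the
 * level <N> being checked, as the functions below all do. */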
4069 static char * sh_audit_flags(struct vcpu *v, int level,
4070 int gflags, int sflags)
4071 /* Common code for auditing flag bits */
4073 if ( (sflags & _PAGE_PRESENT) && !(gflags & _PAGE_PRESENT) )
4074 return "shadow is present but guest is not present";
4075 if ( (sflags & _PAGE_GLOBAL) && !is_hvm_vcpu(v) )
4076 return "global bit set in PV shadow";
4077 if ( level == 2 && (sflags & _PAGE_PSE) )
4078 return "PS bit set in shadow";
4079 #if SHADOW_PAGING_LEVELS == 3
4080 if ( level == 3 ) return NULL; /* All the other bits are blank in a PAE l3 */
4081 #endif
4082 if ( (sflags & _PAGE_PRESENT) && !(gflags & _PAGE_ACCESSED) )
4083 return "accessed bit not propagated";
4084 if ( (level == 1 || (level == 2 && (gflags & _PAGE_PSE)))
4085 && ((sflags & _PAGE_RW) && !(gflags & _PAGE_DIRTY)) )
4086 return "dirty bit not propagated";
4087 if ( (sflags & _PAGE_USER) != (gflags & _PAGE_USER) )
4088 return "user/supervisor bit does not match";
4089 if ( (sflags & _PAGE_NX_BIT) != (gflags & _PAGE_NX_BIT) )
4090 return "NX bit does not match";
4091 if ( (sflags & _PAGE_RW) && !(gflags & _PAGE_RW) )
4092 return "shadow grants write access but guest does not";
4093 return NULL;
4096 static inline mfn_t
4097 audit_gfn_to_mfn(struct vcpu *v, gfn_t gfn, mfn_t gmfn)
4098 /* Convert this gfn to an mfn in the manner appropriate for the
4099 * guest pagetable it's used in (gmfn) */
4101 if ( !shadow_mode_translate(v->domain) )
4102 return _mfn(gfn_x(gfn));
4104 if ( (mfn_to_page(gmfn)->u.inuse.type_info & PGT_type_mask)
4105 != PGT_writable_page )
4106 return _mfn(gfn_x(gfn)); /* This is a paging-disabled shadow */
4107 else
4108 return gfn_to_mfn(v->domain, gfn_x(gfn));
4112 int sh_audit_l1_table(struct vcpu *v, mfn_t sl1mfn, mfn_t x)
4114 guest_l1e_t *gl1e, *gp;
4115 shadow_l1e_t *sl1e;
4116 mfn_t mfn, gmfn, gl1mfn;
4117 gfn_t gfn;
4118 char *s;
4119 int done = 0;
4121 /* Follow the backpointer */
4122 gl1mfn = _mfn(mfn_to_shadow_page(sl1mfn)->backpointer);
4123 gl1e = gp = sh_map_domain_page(gl1mfn);
4124 SHADOW_FOREACH_L1E(sl1mfn, sl1e, &gl1e, done, {
4126 if ( sh_l1e_is_magic(*sl1e) )
4128 #if (SHADOW_OPTIMIZATIONS & SHOPT_FAST_FAULT_PATH) && SHADOW_PAGING_LEVELS > 2
4129 if ( sh_l1e_is_gnp(*sl1e) )
4131 if ( guest_l1e_get_flags(*gl1e) & _PAGE_PRESENT )
4132 AUDIT_FAIL(1, "shadow is GNP magic but guest is present");
4134 else
4136 ASSERT(sh_l1e_is_mmio(*sl1e));
4137 gfn = sh_l1e_mmio_get_gfn(*sl1e);
4138 if ( gfn_x(gfn) != gfn_x(guest_l1e_get_gfn(*gl1e)) )
4139 AUDIT_FAIL(1, "shadow MMIO gfn is %" SH_PRI_gfn
4140 " but guest gfn is %" SH_PRI_gfn,
4141 gfn_x(gfn),
4142 gfn_x(guest_l1e_get_gfn(*gl1e)));
4144 #endif
4146 else
4148 s = sh_audit_flags(v, 1, guest_l1e_get_flags(*gl1e),
4149 shadow_l1e_get_flags(*sl1e));
4150 if ( s ) AUDIT_FAIL(1, "%s", s);
4152 if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS )
4154 gfn = guest_l1e_get_gfn(*gl1e);
4155 mfn = shadow_l1e_get_mfn(*sl1e);
4156 gmfn = audit_gfn_to_mfn(v, gfn, gl1mfn);
4157 if ( mfn_x(gmfn) != mfn_x(mfn) )
4158 AUDIT_FAIL(1, "bad translation: gfn %" SH_PRI_gfn
4159 " --> %" PRI_mfn " != mfn %" PRI_mfn,
4160 gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn));
4163 });
4164 sh_unmap_domain_page(gp);
4165 return done;
4168 int sh_audit_fl1_table(struct vcpu *v, mfn_t sl1mfn, mfn_t x)
4170 guest_l1e_t *gl1e, e;
4171 shadow_l1e_t *sl1e;
4172 mfn_t gl1mfn = _mfn(INVALID_MFN);
4173 int f;
4174 int done = 0;
4176 /* fl1 has no useful backpointer: all we can check are flags */
4177 e = guest_l1e_from_gfn(_gfn(0), 0); gl1e = &e; /* Needed for macro */
4178 SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, done, {
4179 f = shadow_l1e_get_flags(*sl1e);
4180 f &= ~(_PAGE_AVAIL0|_PAGE_AVAIL1|_PAGE_AVAIL2);
4181 if ( !(f == 0
4182 || f == (_PAGE_PRESENT|_PAGE_USER|_PAGE_RW|
4183 _PAGE_ACCESSED|_PAGE_DIRTY)
4184 || f == (_PAGE_PRESENT|_PAGE_USER|_PAGE_ACCESSED|_PAGE_DIRTY)
4185 || sh_l1e_is_magic(*sl1e)) )
4186 AUDIT_FAIL(1, "fl1e has bad flags");
4187 });
4188 return 0;
4191 int sh_audit_l2_table(struct vcpu *v, mfn_t sl2mfn, mfn_t x)
4193 guest_l2e_t *gl2e, *gp;
4194 shadow_l2e_t *sl2e;
4195 mfn_t mfn, gmfn, gl2mfn;
4196 gfn_t gfn;
4197 char *s;
4198 int done = 0;
4199 #if GUEST_PAGING_LEVELS != 4
4200 int xen_mappings = !shadow_mode_external(v->domain);
4201 #endif
4203 /* Follow the backpointer */
4204 gl2mfn = _mfn(mfn_to_shadow_page(sl2mfn)->backpointer);
4205 gl2e = gp = sh_map_domain_page(gl2mfn);
4206 SHADOW_FOREACH_L2E(sl2mfn, sl2e, &gl2e, done, xen_mappings, {
4208 s = sh_audit_flags(v, 2, guest_l2e_get_flags(*gl2e),
4209 shadow_l2e_get_flags(*sl2e));
4210 if ( s ) AUDIT_FAIL(2, "%s", s);
4212 if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS )
4214 gfn = guest_l2e_get_gfn(*gl2e);
4215 mfn = shadow_l2e_get_mfn(*sl2e);
4216 gmfn = (guest_l2e_get_flags(*gl2e) & _PAGE_PSE)
4217 ? get_fl1_shadow_status(v, gfn)
4218 : get_shadow_status(v, audit_gfn_to_mfn(v, gfn, gl2mfn),
4219 SH_type_l1_shadow);
4220 if ( mfn_x(gmfn) != mfn_x(mfn) )
4221 AUDIT_FAIL(2, "bad translation: gfn %" SH_PRI_gfn
4222 " (--> %" PRI_mfn ")"
4223 " --> %" PRI_mfn " != mfn %" PRI_mfn,
4224 gfn_x(gfn),
4225 (guest_l2e_get_flags(*gl2e) & _PAGE_PSE) ? 0
4226 : mfn_x(audit_gfn_to_mfn(v, gfn, gl2mfn)),
4227 mfn_x(gmfn), mfn_x(mfn));
4229 });
4230 sh_unmap_domain_page(gp);
4231 return 0;
4234 #if GUEST_PAGING_LEVELS >= 4
4235 int sh_audit_l3_table(struct vcpu *v, mfn_t sl3mfn, mfn_t x)
4237 guest_l3e_t *gl3e, *gp;
4238 shadow_l3e_t *sl3e;
4239 mfn_t mfn, gmfn, gl3mfn;
4240 gfn_t gfn;
4241 char *s;
4242 int done = 0;
4244 /* Follow the backpointer */
4245 gl3mfn = _mfn(mfn_to_shadow_page(sl3mfn)->backpointer);
4246 gl3e = gp = sh_map_domain_page(gl3mfn);
4247 SHADOW_FOREACH_L3E(sl3mfn, sl3e, &gl3e, done, {
4249 s = sh_audit_flags(v, 3, guest_l3e_get_flags(*gl3e),
4250 shadow_l3e_get_flags(*sl3e));
4251 if ( s ) AUDIT_FAIL(3, "%s", s);
4253 if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS )
4255 gfn = guest_l3e_get_gfn(*gl3e);
4256 mfn = shadow_l3e_get_mfn(*sl3e);
4257 gmfn = get_shadow_status(v, audit_gfn_to_mfn(v, gfn, gl3mfn),
4258 (GUEST_PAGING_LEVELS == 3
4259 && !shadow_mode_external(v->domain)
4260 && (guest_index(gl3e) % 4) == 3)
4261 ? SH_type_l2h_pae_shadow
4262 : SH_type_l2_shadow);
4263 if ( mfn_x(gmfn) != mfn_x(mfn) )
4264 AUDIT_FAIL(3, "bad translation: gfn %" SH_PRI_gfn
4265 " --> %" PRI_mfn " != mfn %" PRI_mfn,
4266 gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn));
4268 });
4269 sh_unmap_domain_page(gp);
4270 return 0;
4273 int sh_audit_l4_table(struct vcpu *v, mfn_t sl4mfn, mfn_t x)
4275 guest_l4e_t *gl4e, *gp;
4276 shadow_l4e_t *sl4e;
4277 mfn_t mfn, gmfn, gl4mfn;
4278 gfn_t gfn;
4279 char *s;
4280 int done = 0;
4281 int xen_mappings = !shadow_mode_external(v->domain);
4283 /* Follow the backpointer */
4284 gl4mfn = _mfn(mfn_to_shadow_page(sl4mfn)->backpointer);
4285 gl4e = gp = sh_map_domain_page(gl4mfn);
4286 SHADOW_FOREACH_L4E(sl4mfn, sl4e, &gl4e, done, xen_mappings,
4288 s = sh_audit_flags(v, 4, guest_l4e_get_flags(*gl4e),
4289 shadow_l4e_get_flags(*sl4e));
4290 if ( s ) AUDIT_FAIL(4, "%s", s);
4292 if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS )
4294 gfn = guest_l4e_get_gfn(*gl4e);
4295 mfn = shadow_l4e_get_mfn(*sl4e);
4296 gmfn = get_shadow_status(v, audit_gfn_to_mfn(v, gfn, gl4mfn),
4297 SH_type_l3_shadow);
4298 if ( mfn_x(gmfn) != mfn_x(mfn) )
4299 AUDIT_FAIL(4, "bad translation: gfn %" SH_PRI_gfn
4300 " --> %" PRI_mfn " != mfn %" PRI_mfn,
4301 gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn));
4303 });
4304 sh_unmap_domain_page(gp);
4305 return 0;
4307 #endif /* GUEST_PAGING_LEVELS >= 4 */
4310 #undef AUDIT_FAIL
4312 #endif /* Audit code */
4314 /**************************************************************************/
4315 /* Entry points into this mode of the shadow code.
4316 * This will all be mangled by the preprocessor to uniquify everything. */
4317 struct paging_mode sh_paging_mode = {
4318 .page_fault = sh_page_fault,
4319 .invlpg = sh_invlpg,
4320 .gva_to_gpa = sh_gva_to_gpa,
4321 .gva_to_gfn = sh_gva_to_gfn,
4322 .update_cr3 = sh_update_cr3,
4323 .update_paging_modes = shadow_update_paging_modes,
4324 .write_p2m_entry = shadow_write_p2m_entry,
4325 .write_guest_entry = shadow_write_guest_entry,
4326 .cmpxchg_guest_entry = shadow_cmpxchg_guest_entry,
4327 .guest_map_l1e = sh_guest_map_l1e,
4328 .guest_get_eff_l1e = sh_guest_get_eff_l1e,
4329 .guest_levels = GUEST_PAGING_LEVELS,
4330 .shadow.detach_old_tables = sh_detach_old_tables,
4331 .shadow.x86_emulate_write = sh_x86_emulate_write,
4332 .shadow.x86_emulate_cmpxchg = sh_x86_emulate_cmpxchg,
4333 .shadow.x86_emulate_cmpxchg8b = sh_x86_emulate_cmpxchg8b,
4334 .shadow.make_monitor_table = sh_make_monitor_table,
4335 .shadow.destroy_monitor_table = sh_destroy_monitor_table,
4336 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
4337 .shadow.guess_wrmap = sh_guess_wrmap,
4338 #endif
4339 .shadow.shadow_levels = SHADOW_PAGING_LEVELS,
4340 };
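/* Building this file once per GUEST_PAGING_LEVELS gives a separate,
 * uniquely-named instance of this structure (and of every sh_* function
 * above) for each guest paging mode. */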
4342 /*
4343 * Local variables:
4344 * mode: C
4345 * c-set-style: "BSD"
4346 * c-basic-offset: 4
4347 * indent-tabs-mode: nil
4348 * End:
4349 */