ia64/xen-unstable: xen/arch/x86/mm/shadow/multi.c @ 14080:c0b1a3b54548

hvm: Turn stack-pointer comparison on shadow-emulate path into a heuristic
which we can allow to fail.
Signed-off-by: Keir Fraser <keir@xensource.com>

author    kfraser@localhost.localdomain
date      Thu Feb 22 13:03:49 2007 +0000 (2007-02-22)
parents   3f7e8c763b55
children  9e35371a3caa
1 /******************************************************************************
2 * arch/x86/mm/shadow/multi.c
3 *
4 * Simple, mostly-synchronous shadow page tables.
5 * Parts of this code are Copyright (c) 2006 by XenSource Inc.
6 * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
7 * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 */
24 #include <xen/config.h>
25 #include <xen/types.h>
26 #include <xen/mm.h>
27 #include <xen/trace.h>
28 #include <xen/sched.h>
29 #include <xen/perfc.h>
30 #include <xen/domain_page.h>
31 #include <asm/page.h>
32 #include <asm/current.h>
33 #include <asm/shadow.h>
34 #include <asm/flushtlb.h>
35 #include <asm/hvm/hvm.h>
36 #include "private.h"
37 #include "types.h"
39 /* THINGS TO DO LATER:
40 *
41 * TEARDOWN HEURISTICS
42 * Also: have a heuristic for when to destroy a previous paging-mode's
43 * shadows. When a guest is done with its start-of-day 32-bit tables
44 * and reuses the memory we want to drop those shadows. Start with
45 * shadows in a page in two modes as a hint, but beware of clever tricks
46 * like reusing a pagetable for both PAE and 64-bit during boot...
47 *
48 * PAE LINEAR MAPS
49 * Rework shadow_get_l*e() to have the option of using map_domain_page()
50 * instead of linear maps. Add appropriate unmap_l*e calls in the users.
51 * Then we can test the speed difference made by linear maps. If the
52 * map_domain_page() version is OK on PAE, we could maybe allow a lightweight
53 * l3-and-l2h-only shadow mode for PAE PV guests that would allow them
54 * to share l2h pages again.
55 *
56 * GUEST_WALK_TABLES TLB FLUSH COALESCE
57 * guest_walk_tables can do up to three remote TLB flushes as it walks to
58 * the first l1 of a new pagetable. Should coalesce the flushes to the end,
59 * and if we do flush, re-do the walk. If anything has changed, then
60 * pause all the other vcpus and do the walk *again*.
61 *
62 * WP DISABLED
63 * Consider how to implement having the WP bit of CR0 set to 0.
64 * Since we need to be able to cause write faults to pagetables, this might
65 * end up looking like not having the (guest) pagetables present at all in
66 * HVM guests...
67 *
68 * PSE disabled / PSE36
69 * We don't support any modes other than PSE enabled, PSE36 disabled.
70 * Neither of those would be hard to change, but we'd need to be able to
71 * deal with shadows made in one mode and used in another.
72 */
74 #define FETCH_TYPE_PREFETCH 1
75 #define FETCH_TYPE_DEMAND 2
76 #define FETCH_TYPE_WRITE 4
77 typedef enum {
78 ft_prefetch = FETCH_TYPE_PREFETCH,
79 ft_demand_read = FETCH_TYPE_DEMAND,
80 ft_demand_write = FETCH_TYPE_DEMAND | FETCH_TYPE_WRITE,
81 } fetch_type_t;
83 #ifdef DEBUG_TRACE_DUMP
84 static char *fetch_type_names[] = {
85 [ft_prefetch] "prefetch",
86 [ft_demand_read] "demand read",
87 [ft_demand_write] "demand write",
88 };
89 #endif
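/* Worked example of the encoding above: ft_demand_write is
 * FETCH_TYPE_DEMAND | FETCH_TYPE_WRITE (2 | 4 == 6), so the two tests used
 * below,
 *
 *     (ft & FETCH_TYPE_DEMAND)    // nonzero for demand reads and writes
 *     (ft == ft_demand_write)     // true only for demand writes
 *
 * tell genuine guest accesses apart from prefetches (ft_prefetch == 1 has
 * neither bit set).
 */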
91 /**************************************************************************/
92 /* Hash table mapping from guest pagetables to shadows
93 *
94 * Normal case: maps the mfn of a guest page to the mfn of its shadow page.
95 * FL1's: maps the *gfn* of the start of a superpage to the mfn of a
96 * shadow L1 which maps its "splinters".
97 */
99 static inline mfn_t
100 get_fl1_shadow_status(struct vcpu *v, gfn_t gfn)
101 /* Look for FL1 shadows in the hash table */
102 {
103 mfn_t smfn = shadow_hash_lookup(v, gfn_x(gfn), SH_type_fl1_shadow);
105 if ( unlikely(shadow_mode_log_dirty(v->domain) && mfn_valid(smfn)) )
106 {
107 struct shadow_page_info *sp = mfn_to_shadow_page(smfn);
108 if ( !(sp->logdirty) )
109 shadow_convert_to_log_dirty(v, smfn);
110 }
112 return smfn;
113 }
115 static inline mfn_t
116 get_shadow_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type)
117 /* Look for shadows in the hash table */
118 {
119 mfn_t smfn = shadow_hash_lookup(v, mfn_x(gmfn), shadow_type);
120 perfc_incrc(shadow_get_shadow_status);
122 if ( unlikely(shadow_mode_log_dirty(v->domain) && mfn_valid(smfn)) )
123 {
124 struct shadow_page_info *sp = mfn_to_shadow_page(smfn);
125 if ( !(sp->logdirty) )
126 shadow_convert_to_log_dirty(v, smfn);
127 }
129 return smfn;
130 }
132 static inline void
133 set_fl1_shadow_status(struct vcpu *v, gfn_t gfn, mfn_t smfn)
134 /* Put an FL1 shadow into the hash table */
135 {
136 SHADOW_PRINTK("gfn=%"SH_PRI_gfn", type=%08x, smfn=%05lx\n",
137 gfn_x(gfn), SH_type_fl1_shadow, mfn_x(smfn));
139 if ( unlikely(shadow_mode_log_dirty(v->domain)) )
140 // mark this shadow as a log dirty shadow...
141 mfn_to_shadow_page(smfn)->logdirty = 1;
142 else
143 mfn_to_shadow_page(smfn)->logdirty = 0;
145 shadow_hash_insert(v, gfn_x(gfn), SH_type_fl1_shadow, smfn);
146 }
148 static inline void
149 set_shadow_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type, mfn_t smfn)
150 /* Put a shadow into the hash table */
151 {
152 struct domain *d = v->domain;
153 int res;
155 SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx, type=%08x, smfn=%05lx\n",
156 d->domain_id, v->vcpu_id, mfn_x(gmfn),
157 shadow_type, mfn_x(smfn));
159 if ( unlikely(shadow_mode_log_dirty(d)) )
160 // mark this shadow as a log dirty shadow...
161 mfn_to_shadow_page(smfn)->logdirty = 1;
162 else
163 mfn_to_shadow_page(smfn)->logdirty = 0;
165 #ifdef CONFIG_COMPAT
166 if ( !IS_COMPAT(d) || shadow_type != SH_type_l4_64_shadow )
167 #endif
168 {
169 res = get_page(mfn_to_page(gmfn), d);
170 ASSERT(res == 1);
171 }
173 shadow_hash_insert(v, mfn_x(gmfn), shadow_type, smfn);
174 }
176 static inline void
177 delete_fl1_shadow_status(struct vcpu *v, gfn_t gfn, mfn_t smfn)
178 /* Remove a shadow from the hash table */
179 {
180 SHADOW_PRINTK("gfn=%"SH_PRI_gfn", type=%08x, smfn=%05lx\n",
181 gfn_x(gfn), SH_type_fl1_shadow, mfn_x(smfn));
182 shadow_hash_delete(v, gfn_x(gfn), SH_type_fl1_shadow, smfn);
183 }
185 static inline void
186 delete_shadow_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type, mfn_t smfn)
187 /* Remove a shadow from the hash table */
188 {
189 SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx, type=%08x, smfn=%05lx\n",
190 v->domain->domain_id, v->vcpu_id,
191 mfn_x(gmfn), shadow_type, mfn_x(smfn));
192 shadow_hash_delete(v, mfn_x(gmfn), shadow_type, smfn);
193 #ifdef CONFIG_COMPAT
194 if ( !IS_COMPAT(v->domain) || shadow_type != SH_type_l4_64_shadow )
195 #endif
196 put_page(mfn_to_page(gmfn));
197 }
199 /**************************************************************************/
200 /* CPU feature support querying */
202 static inline int
203 guest_supports_superpages(struct vcpu *v)
204 {
205 /* The _PAGE_PSE bit must be honoured in HVM guests, whenever
206 * CR4.PSE is set or the guest is in PAE or long mode */
207 return (is_hvm_vcpu(v) && (GUEST_PAGING_LEVELS != 2
208 || (hvm_get_guest_ctrl_reg(v, 4) & X86_CR4_PSE)));
209 }
211 static inline int
212 guest_supports_nx(struct vcpu *v)
213 {
214 if ( !is_hvm_vcpu(v) )
215 return cpu_has_nx;
217 // XXX - fix this!
218 return 1;
219 }
222 /**************************************************************************/
223 /* Functions for walking the guest page tables */
226 /* Walk the guest pagetables, filling the walk_t with what we see.
227 * Takes an uninitialised walk_t. The caller must call unmap_walk()
228 * on the walk_t before discarding it or calling guest_walk_tables again.
229 * If "guest_op" is non-zero, we are serving a genuine guest memory access,
230 * and must (a) be under the shadow lock, and (b) remove write access
231 * from any guest PT pages we see, as we will be using their contents to
232 * perform shadow updates.
233 * Returns 0 for success or non-zero if the guest pagetables are malformed.
234 * N.B. Finding a not-present entry does not cause a non-zero return code. */
235 static inline int
236 guest_walk_tables(struct vcpu *v, unsigned long va, walk_t *gw, int guest_op)
237 {
238 ASSERT(!guest_op || shadow_locked_by_me(v->domain));
240 perfc_incrc(shadow_guest_walk);
241 memset(gw, 0, sizeof(*gw));
242 gw->va = va;
244 #if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
245 #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
246 /* Get l4e from the top level table */
247 gw->l4mfn = pagetable_get_mfn(v->arch.guest_table);
248 gw->l4e = (guest_l4e_t *)v->arch.paging.shadow.guest_vtable
249 + guest_l4_table_offset(va);
250 /* Walk down to the l3e */
251 if ( !(guest_l4e_get_flags(*gw->l4e) & _PAGE_PRESENT) ) return 0;
252 gw->l3mfn = vcpu_gfn_to_mfn(v, guest_l4e_get_gfn(*gw->l4e));
253 if ( !mfn_valid(gw->l3mfn) ) return 1;
254 /* This mfn is a pagetable: make sure the guest can't write to it. */
255 if ( guest_op && sh_remove_write_access(v, gw->l3mfn, 3, va) != 0 )
256 flush_tlb_mask(v->domain->domain_dirty_cpumask);
257 gw->l3e = ((guest_l3e_t *)sh_map_domain_page(gw->l3mfn))
258 + guest_l3_table_offset(va);
259 #else /* PAE only... */
260 /* Get l3e from the cache of the guest's top level table */
261 gw->l3e = (guest_l3e_t *)&v->arch.paging.shadow.gl3e[guest_l3_table_offset(va)];
262 #endif /* PAE or 64... */
263 /* Walk down to the l2e */
264 if ( !(guest_l3e_get_flags(*gw->l3e) & _PAGE_PRESENT) ) return 0;
265 gw->l2mfn = vcpu_gfn_to_mfn(v, guest_l3e_get_gfn(*gw->l3e));
266 if ( !mfn_valid(gw->l2mfn) ) return 1;
267 /* This mfn is a pagetable: make sure the guest can't write to it. */
268 if ( guest_op && sh_remove_write_access(v, gw->l2mfn, 2, va) != 0 )
269 flush_tlb_mask(v->domain->domain_dirty_cpumask);
270 gw->l2e = ((guest_l2e_t *)sh_map_domain_page(gw->l2mfn))
271 + guest_l2_table_offset(va);
272 #else /* 32-bit only... */
273 /* Get l2e from the top level table */
274 gw->l2mfn = pagetable_get_mfn(v->arch.guest_table);
275 gw->l2e = (guest_l2e_t *)v->arch.paging.shadow.guest_vtable
276 + guest_l2_table_offset(va);
277 #endif /* All levels... */
279 if ( !(guest_l2e_get_flags(*gw->l2e) & _PAGE_PRESENT) ) return 0;
280 if ( guest_supports_superpages(v) &&
281 (guest_l2e_get_flags(*gw->l2e) & _PAGE_PSE) )
282 {
283 /* Special case: this guest VA is in a PSE superpage, so there's
284 * no guest l1e. We make one up so that the propagation code
285 * can generate a shadow l1 table. Start with the gfn of the
286 * first 4k-page of the superpage. */
287 gfn_t start = guest_l2e_get_gfn(*gw->l2e);
288 /* Grant full access in the l1e, since all the guest entry's
289 * access controls are enforced in the shadow l2e. This lets
290 * us reflect l2 changes later without touching the l1s. */
291 int flags = (_PAGE_PRESENT|_PAGE_USER|_PAGE_RW|
292 _PAGE_ACCESSED|_PAGE_DIRTY);
293 /* PSE level 2 entries use bit 12 for PAT; propagate it to bit 7
294 * of the level 1 */
295 if ( (guest_l2e_get_flags(*gw->l2e) & _PAGE_PSE_PAT) )
296 flags |= _PAGE_PAT;
297 /* Increment the pfn by the right number of 4k pages.
298 * The ~0x1 is to mask out the PAT bit mentioned above. */
299 start = _gfn((gfn_x(start) & ~0x1) + guest_l1_table_offset(va));
300 gw->eff_l1e = guest_l1e_from_gfn(start, flags);
301 gw->l1e = NULL;
302 gw->l1mfn = _mfn(INVALID_MFN);
303 }
304 else
305 {
306 /* Not a superpage: carry on and find the l1e. */
307 gw->l1mfn = vcpu_gfn_to_mfn(v, guest_l2e_get_gfn(*gw->l2e));
308 if ( !mfn_valid(gw->l1mfn) ) return 1;
309 /* This mfn is a pagetable: make sure the guest can't write to it. */
310 if ( guest_op
311 && sh_remove_write_access(v, gw->l1mfn, 1, va) != 0 )
312 flush_tlb_mask(v->domain->domain_dirty_cpumask);
313 gw->l1e = ((guest_l1e_t *)sh_map_domain_page(gw->l1mfn))
314 + guest_l1_table_offset(va);
315 gw->eff_l1e = *gw->l1e;
316 }
318 return 0;
319 }
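/* Worked example of the PSE special case above (the numbers are purely
 * illustrative): if the guest l2e maps a superpage and guest_l2e_get_gfn()
 * returns 0x12345 with the PSE PAT bit set (bit 12 of the l2e, i.e. bit 0
 * of that gfn field), then masking with ~0x1 recovers the superpage's
 * first gfn, 0x12344, and _PAGE_PAT is set in the fabricated flags
 * instead.  For a va whose l1-table offset is 0x67, the effective l1e
 * therefore maps gfn 0x12344 + 0x67 = 0x123ab, just as a real guest l1e
 * would. */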
321 /* Given a walk_t, translate the gw->va into the guest's notion of the
322 * corresponding frame number. */
323 static inline gfn_t
324 guest_walk_to_gfn(walk_t *gw)
325 {
326 if ( !(guest_l1e_get_flags(gw->eff_l1e) & _PAGE_PRESENT) )
327 return _gfn(INVALID_GFN);
328 return guest_l1e_get_gfn(gw->eff_l1e);
329 }
331 /* Given a walk_t, translate the gw->va into the guest's notion of the
332 * corresponding physical address. */
333 static inline paddr_t
334 guest_walk_to_gpa(walk_t *gw)
335 {
336 if ( !(guest_l1e_get_flags(gw->eff_l1e) & _PAGE_PRESENT) )
337 return 0;
338 return guest_l1e_get_paddr(gw->eff_l1e) + (gw->va & ~PAGE_MASK);
339 }
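/* For example, if gw->eff_l1e maps gfn 0x1234 and gw->va is 0xb7654567,
 * guest_walk_to_gfn() returns 0x1234 and guest_walk_to_gpa() returns
 * (0x1234 << PAGE_SHIFT) | 0x567 = 0x1234567, assuming 4k pages.  A
 * not-present effective l1e yields INVALID_GFN and 0 respectively. */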
342 /* Unmap (and reinitialise) a guest walk.
343 * Call this to dispose of any walk filled in by guest_walk_tables() */
344 static void unmap_walk(struct vcpu *v, walk_t *gw)
345 {
346 #if GUEST_PAGING_LEVELS >= 3
347 #if GUEST_PAGING_LEVELS >= 4
348 if ( gw->l3e != NULL ) sh_unmap_domain_page(gw->l3e);
349 #endif
350 if ( gw->l2e != NULL ) sh_unmap_domain_page(gw->l2e);
351 #endif
352 if ( gw->l1e != NULL ) sh_unmap_domain_page(gw->l1e);
353 #ifdef DEBUG
354 memset(gw, 0, sizeof(*gw));
355 #endif
356 }
359 /* Pretty-print the contents of a guest-walk */
360 static inline void print_gw(walk_t *gw)
361 {
362 SHADOW_PRINTK("GUEST WALK TO %#lx:\n", gw->va);
363 #if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
364 #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
365 SHADOW_PRINTK(" l4mfn=%" PRI_mfn "\n", mfn_x(gw->l4mfn));
366 SHADOW_PRINTK(" l4e=%p\n", gw->l4e);
367 if ( gw->l4e )
368 SHADOW_PRINTK(" *l4e=%" SH_PRI_gpte "\n", gw->l4e->l4);
369 SHADOW_PRINTK(" l3mfn=%" PRI_mfn "\n", mfn_x(gw->l3mfn));
370 #endif /* PAE or 64... */
371 SHADOW_PRINTK(" l3e=%p\n", gw->l3e);
372 if ( gw->l3e )
373 SHADOW_PRINTK(" *l3e=%" SH_PRI_gpte "\n", gw->l3e->l3);
374 #endif /* All levels... */
375 SHADOW_PRINTK(" l2mfn=%" PRI_mfn "\n", mfn_x(gw->l2mfn));
376 SHADOW_PRINTK(" l2e=%p\n", gw->l2e);
377 if ( gw->l2e )
378 SHADOW_PRINTK(" *l2e=%" SH_PRI_gpte "\n", gw->l2e->l2);
379 SHADOW_PRINTK(" l1mfn=%" PRI_mfn "\n", mfn_x(gw->l1mfn));
380 SHADOW_PRINTK(" l1e=%p\n", gw->l1e);
381 if ( gw->l1e )
382 SHADOW_PRINTK(" *l1e=%" SH_PRI_gpte "\n", gw->l1e->l1);
383 SHADOW_PRINTK(" eff_l1e=%" SH_PRI_gpte "\n", gw->eff_l1e.l1);
384 }
387 #if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES
388 /* Lightweight audit: pass all the shadows associated with this guest walk
389 * through the audit mechanisms */
390 static void sh_audit_gw(struct vcpu *v, walk_t *gw)
391 {
392 mfn_t smfn;
394 if ( !(SHADOW_AUDIT_ENABLE) )
395 return;
397 #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
398 if ( mfn_valid(gw->l4mfn)
399 && mfn_valid((smfn = get_shadow_status(v, gw->l4mfn,
400 SH_type_l4_shadow))) )
401 (void) sh_audit_l4_table(v, smfn, _mfn(INVALID_MFN));
402 if ( mfn_valid(gw->l3mfn)
403 && mfn_valid((smfn = get_shadow_status(v, gw->l3mfn,
404 SH_type_l3_shadow))) )
405 (void) sh_audit_l3_table(v, smfn, _mfn(INVALID_MFN));
406 #endif /* PAE or 64... */
407 if ( mfn_valid(gw->l2mfn) )
408 {
409 if ( mfn_valid((smfn = get_shadow_status(v, gw->l2mfn,
410 SH_type_l2_shadow))) )
411 (void) sh_audit_l2_table(v, smfn, _mfn(INVALID_MFN));
412 #if GUEST_PAGING_LEVELS == 3
413 if ( mfn_valid((smfn = get_shadow_status(v, gw->l2mfn,
414 SH_type_l2h_shadow))) )
415 (void) sh_audit_l2_table(v, smfn, _mfn(INVALID_MFN));
416 #endif
417 }
418 if ( mfn_valid(gw->l1mfn)
419 && mfn_valid((smfn = get_shadow_status(v, gw->l1mfn,
420 SH_type_l1_shadow))) )
421 (void) sh_audit_l1_table(v, smfn, _mfn(INVALID_MFN));
422 else if ( gw->l2e
423 && (guest_l2e_get_flags(*gw->l2e) & _PAGE_PSE)
424 && mfn_valid(
425 (smfn = get_fl1_shadow_status(v, guest_l2e_get_gfn(*gw->l2e)))) )
426 (void) sh_audit_fl1_table(v, smfn, _mfn(INVALID_MFN));
427 }
429 #else
430 #define sh_audit_gw(_v, _gw) do {} while(0)
431 #endif /* audit code */
435 /**************************************************************************/
436 /* Function to write to the guest tables, for propagating accessed and
437 * dirty bits from the shadow to the guest.
438 * Takes a guest mfn, a pointer to the guest entry, the level of pagetable,
439 * and an operation type. The guest entry is always passed as an l1e:
440 * since we only ever write flags, that's OK.
441 * Returns the new flag bits of the guest entry. */
443 static u32 guest_set_ad_bits(struct vcpu *v,
444 mfn_t gmfn,
445 guest_l1e_t *ep,
446 unsigned int level,
447 fetch_type_t ft)
448 {
449 u32 flags;
450 int res = 0;
452 ASSERT(ep && !(((unsigned long)ep) & ((sizeof *ep) - 1)));
453 ASSERT(level <= GUEST_PAGING_LEVELS);
454 ASSERT(shadow_locked_by_me(v->domain));
456 flags = guest_l1e_get_flags(*ep);
458 /* Only set A and D bits for guest-initiated accesses */
459 if ( !(ft & FETCH_TYPE_DEMAND) )
460 return flags;
462 ASSERT(mfn_valid(gmfn)
463 && (sh_mfn_is_a_page_table(gmfn)
464 || ((mfn_to_page(gmfn)->u.inuse.type_info & PGT_count_mask)
465 == 0)));
467 /* PAE l3s do not have A and D bits */
468 ASSERT(GUEST_PAGING_LEVELS > 3 || level != 3);
470 /* Need the D bit as well for writes, in L1es and PSE L2es. */
471 if ( ft == ft_demand_write
472 && (level == 1 ||
473 (level == 2 && (flags & _PAGE_PSE) && guest_supports_superpages(v))) )
474 {
475 if ( (flags & (_PAGE_DIRTY | _PAGE_ACCESSED))
476 == (_PAGE_DIRTY | _PAGE_ACCESSED) )
477 return flags; /* Guest already has A and D bits set */
478 flags |= _PAGE_DIRTY | _PAGE_ACCESSED;
479 perfc_incrc(shadow_ad_update);
480 }
481 else
482 {
483 if ( flags & _PAGE_ACCESSED )
484 return flags; /* Guest already has A bit set */
485 flags |= _PAGE_ACCESSED;
486 perfc_incrc(shadow_a_update);
487 }
489 /* Set the bit(s) */
490 sh_mark_dirty(v->domain, gmfn);
491 SHADOW_DEBUG(A_AND_D, "gfn = %" SH_PRI_gfn ", "
492 "old flags = %#x, new flags = %#x\n",
493 gfn_x(guest_l1e_get_gfn(*ep)), guest_l1e_get_flags(*ep),
494 flags);
495 *ep = guest_l1e_from_gfn(guest_l1e_get_gfn(*ep), flags);
497 /* Propagate this change to any other shadows of the page
498 * (only necessary if there is more than one shadow) */
499 if ( mfn_to_page(gmfn)->count_info & PGC_page_table )
500 {
501 u32 shflags = mfn_to_page(gmfn)->shadow_flags & SHF_page_type_mask;
502 /* More than one type bit set in shadow-flags? */
503 if ( shflags & ~(1UL << find_first_set_bit(shflags)) )
504 res = sh_validate_guest_entry(v, gmfn, ep, sizeof (*ep));
505 }
507 /* We should never need to flush the TLB or recopy PAE entries */
508 ASSERT((res == 0) || (res == SHADOW_SET_CHANGED));
510 return flags;
511 }
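/* For example: a demand write through an l1e whose guest flags are
 * PRESENT|RW|USER comes back with ACCESSED and DIRTY added; the same
 * demand write reaching a non-PSE l2e, or any demand read, only gains
 * ACCESSED; a prefetch (ft_prefetch) leaves the guest entry untouched. */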
513 #if (CONFIG_PAGING_LEVELS == GUEST_PAGING_LEVELS) && (CONFIG_PAGING_LEVELS == SHADOW_PAGING_LEVELS)
514 void *
515 sh_guest_map_l1e(struct vcpu *v, unsigned long addr,
516 unsigned long *gl1mfn)
517 {
518 void *pl1e = NULL;
519 walk_t gw;
521 ASSERT(shadow_mode_translate(v->domain));
523 // XXX -- this is expensive, but it's easy to cobble together...
524 // FIXME!
526 shadow_lock(v->domain);
527 guest_walk_tables(v, addr, &gw, 1);
529 if ( gw.l2e &&
530 (guest_l2e_get_flags(*gw.l2e) & _PAGE_PRESENT) &&
531 !(guest_supports_superpages(v) && (guest_l2e_get_flags(*gw.l2e) & _PAGE_PSE)) )
532 {
533 if ( gl1mfn )
534 *gl1mfn = mfn_x(gw.l1mfn);
535 pl1e = map_domain_page(mfn_x(gw.l1mfn)) +
536 (guest_l1_table_offset(addr) * sizeof(guest_l1e_t));
537 }
539 unmap_walk(v, &gw);
540 shadow_unlock(v->domain);
542 return pl1e;
543 }
545 void
546 sh_guest_get_eff_l1e(struct vcpu *v, unsigned long addr, void *eff_l1e)
547 {
548 walk_t gw;
550 ASSERT(shadow_mode_translate(v->domain));
552 // XXX -- this is expensive, but it's easy to cobble together...
553 // FIXME!
555 shadow_lock(v->domain);
556 guest_walk_tables(v, addr, &gw, 1);
557 *(guest_l1e_t *)eff_l1e = gw.eff_l1e;
558 unmap_walk(v, &gw);
559 shadow_unlock(v->domain);
560 }
561 #endif /* CONFIG==SHADOW==GUEST */
563 /**************************************************************************/
564 /* Functions to compute the correct index into a shadow page, given an
565 * index into the guest page (as returned by guest_get_index()).
566 * This is trivial when the shadow and guest use the same sized PTEs, but
567 * gets more interesting when those sizes are mismatched (e.g. 32-bit guest,
568 * PAE- or 64-bit shadows).
569 *
570 * These functions also increment the shadow mfn, when necessary. When PTE
571 * sizes are mismatched, it takes 2 shadow L1 pages for a single guest L1
572 * page. In this case, we allocate 2 contiguous pages for the shadow L1, and
573 * use simple pointer arithmetic on a pointer to the guest L1e to figure out
574 * which shadow page we really want. Similarly, when PTE sizes are
575 * mismatched, we shadow a guest L2 page with 4 shadow L2 pages. (The easiest
576 * way to see this is: a 32-bit guest L2 page maps 4GB of virtual address
577 * space, while a PAE- or 64-bit shadow L2 page maps 1GB of virtual address
578 * space.)
579 *
580 * For PAE guests, for every 32-bytes of guest L3 page table, we use 64-bytes
581 * of shadow (to store both the shadow, and the info that would normally be
582 * stored in page_info fields). This arrangement allows the shadow and the
583 * "page_info" fields to always be stored in the same page (in fact, in
584 * the same cache line), avoiding an extra call to map_domain_page().
585 */
587 static inline u32
588 guest_index(void *ptr)
589 {
590 return (u32)((unsigned long)ptr & ~PAGE_MASK) / sizeof(guest_l1e_t);
591 }
593 static u32
594 shadow_l1_index(mfn_t *smfn, u32 guest_index)
595 {
596 #if (GUEST_PAGING_LEVELS == 2) && (SHADOW_PAGING_LEVELS != 2)
597 *smfn = _mfn(mfn_x(*smfn) +
598 (guest_index / SHADOW_L1_PAGETABLE_ENTRIES));
599 return (guest_index % SHADOW_L1_PAGETABLE_ENTRIES);
600 #else
601 return guest_index;
602 #endif
603 }
605 static u32
606 shadow_l2_index(mfn_t *smfn, u32 guest_index)
607 {
608 #if (GUEST_PAGING_LEVELS == 2) && (SHADOW_PAGING_LEVELS != 2)
609 // Because we use 2 shadow l2 entries for each guest entry, the number of
610 // guest entries per shadow page is SHADOW_L2_PAGETABLE_ENTRIES/2
611 //
612 *smfn = _mfn(mfn_x(*smfn) +
613 (guest_index / (SHADOW_L2_PAGETABLE_ENTRIES / 2)));
615 // We multiply by two to get the index of the first of the two entries
616 // used to shadow the specified guest entry.
617 return (guest_index % (SHADOW_L2_PAGETABLE_ENTRIES / 2)) * 2;
618 #else
619 return guest_index;
620 #endif
621 }
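/* Worked example for the mismatched (32-bit guest on PAE/64-bit shadow)
 * case, assuming 512-entry shadow tables (SHADOW_L1_PAGETABLE_ENTRIES ==
 * SHADOW_L2_PAGETABLE_ENTRIES == 512):
 *
 *   guest l1 index 700 -> shadow_l1_index: smfn advances by 700/512 = 1
 *                         page, entry 700 % 512 = 188;
 *   guest l2 index 700 -> shadow_l2_index: smfn advances by 700/256 = 2
 *                         pages, entry (700 % 256) * 2 = 376 (376 and 377
 *                         form the pair shadowing that guest entry).
 */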
623 #if GUEST_PAGING_LEVELS >= 4
625 static u32
626 shadow_l3_index(mfn_t *smfn, u32 guest_index)
627 {
628 return guest_index;
629 }
631 static u32
632 shadow_l4_index(mfn_t *smfn, u32 guest_index)
633 {
634 return guest_index;
635 }
637 #endif // GUEST_PAGING_LEVELS >= 4
640 /**************************************************************************/
641 /* Function which computes shadow entries from their corresponding guest
642 * entries. This is the "heart" of the shadow code. It operates using
643 * level-1 shadow types, but handles all levels of entry.
644 * Don't call it directly, but use the four wrappers below.
645 */
647 static always_inline void
648 _sh_propagate(struct vcpu *v,
649 void *guest_entry_ptr,
650 mfn_t guest_table_mfn,
651 mfn_t target_mfn,
652 void *shadow_entry_ptr,
653 int level,
654 fetch_type_t ft,
655 int mmio)
656 {
657 guest_l1e_t *gp = guest_entry_ptr;
658 shadow_l1e_t *sp = shadow_entry_ptr;
659 struct domain *d = v->domain;
660 u32 pass_thru_flags;
661 u32 gflags, sflags;
663 /* We don't shadow PAE l3s */
664 ASSERT(GUEST_PAGING_LEVELS > 3 || level != 3);
666 if ( mfn_valid(guest_table_mfn) )
667 /* Handle A and D bit propagation into the guest */
668 gflags = guest_set_ad_bits(v, guest_table_mfn, gp, level, ft);
669 else
670 {
671 /* Must be an fl1e or a prefetch */
672 ASSERT(level==1 || !(ft & FETCH_TYPE_DEMAND));
673 gflags = guest_l1e_get_flags(*gp);
674 }
676 if ( unlikely(!(gflags & _PAGE_PRESENT)) )
677 {
678 /* If a guest l1 entry is not present, shadow with the magic
679 * guest-not-present entry. */
680 if ( level == 1 )
681 *sp = sh_l1e_gnp();
682 else
683 *sp = shadow_l1e_empty();
684 goto done;
685 }
687 if ( level == 1 && mmio )
688 {
689 /* Guest l1e maps MMIO space */
690 *sp = sh_l1e_mmio(guest_l1e_get_gfn(*gp), gflags);
691 goto done;
692 }
694 // Must have a valid target_mfn, unless this is a prefetch. In the
695 // case of a prefetch, an invalid mfn means that we can not usefully
696 // shadow anything, and so we return early.
697 //
698 if ( !mfn_valid(target_mfn) )
699 {
700 ASSERT((ft == ft_prefetch));
701 *sp = shadow_l1e_empty();
702 goto done;
703 }
705 // Propagate bits from the guest to the shadow.
706 // Some of these may be overwritten, below.
707 // Since we know the guest's PRESENT bit is set, we also set the shadow's
708 // SHADOW_PRESENT bit.
709 //
710 pass_thru_flags = (_PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_USER |
711 _PAGE_RW | _PAGE_PRESENT);
712 if ( guest_supports_nx(v) )
713 pass_thru_flags |= _PAGE_NX_BIT;
714 sflags = gflags & pass_thru_flags;
716 // Set the A&D bits for higher level shadows.
717 // Higher level entries do not, strictly speaking, have dirty bits, but
718 // since we use shadow linear tables, each of these entries may, at some
719 // point in time, also serve as a shadow L1 entry.
720 // By setting both the A&D bits in each of these, we eliminate the burden
721 // on the hardware to update these bits on initial accesses.
722 //
723 if ( (level > 1) && !((SHADOW_PAGING_LEVELS == 3) && (level == 3)) )
724 sflags |= _PAGE_ACCESSED | _PAGE_DIRTY;
726 // If the A or D bit has not yet been set in the guest, then we must
727 // prevent the corresponding kind of access.
728 //
729 if ( unlikely(!(gflags & _PAGE_ACCESSED)) )
730 sflags &= ~_PAGE_PRESENT;
732 /* D bits exist in L1es and PSE L2es */
733 if ( unlikely(((level == 1) ||
734 ((level == 2) &&
735 (gflags & _PAGE_PSE) &&
736 guest_supports_superpages(v)))
737 && !(gflags & _PAGE_DIRTY)) )
738 sflags &= ~_PAGE_RW;
740 // shadow_mode_log_dirty support
741 //
742 // Only allow the guest write access to a page a) on a demand fault,
743 // or b) if the page is already marked as dirty.
744 //
745 if ( unlikely((level == 1) && shadow_mode_log_dirty(d)) )
746 {
747 if ( ft & FETCH_TYPE_WRITE )
748 sh_mark_dirty(d, target_mfn);
749 else if ( !sh_mfn_is_dirty(d, target_mfn) )
750 sflags &= ~_PAGE_RW;
751 }
753 // protect guest page tables
754 //
755 if ( unlikely((level == 1) && sh_mfn_is_a_page_table(target_mfn)) )
756 {
757 if ( shadow_mode_trap_reads(d) )
758 {
759 // if we are trapping both reads & writes, then mark this page
760 // as not present...
761 //
762 sflags &= ~_PAGE_PRESENT;
763 }
764 else
765 {
766 // otherwise, just prevent any writes...
767 //
768 sflags &= ~_PAGE_RW;
769 }
770 }
772 // PV guests in 64-bit mode use two different page tables for user vs
773 // supervisor permissions, making the guest's _PAGE_USER bit irrelevant.
774 // It is always shadowed as present...
775 if ( (GUEST_PAGING_LEVELS == 4) && !IS_COMPAT(d) && !is_hvm_domain(d) )
776 {
777 sflags |= _PAGE_USER;
778 }
780 *sp = shadow_l1e_from_mfn(target_mfn, sflags);
781 done:
782 SHADOW_DEBUG(PROPAGATE,
783 "%s level %u guest %" SH_PRI_gpte " shadow %" SH_PRI_pte "\n",
784 fetch_type_names[ft], level, gp->l1, sp->l1);
785 }
788 /* These four wrappers give us a little bit of type-safety back around the
789 * use of void-* pointers in _sh_propagate(), and allow the compiler to
790 * optimize out some level checks. */
792 #if GUEST_PAGING_LEVELS >= 4
793 static void
794 l4e_propagate_from_guest(struct vcpu *v,
795 guest_l4e_t *gl4e,
796 mfn_t gl4mfn,
797 mfn_t sl3mfn,
798 shadow_l4e_t *sl4e,
799 fetch_type_t ft)
800 {
801 _sh_propagate(v, gl4e, gl4mfn, sl3mfn, sl4e, 4, ft, 0);
802 }
804 static void
805 l3e_propagate_from_guest(struct vcpu *v,
806 guest_l3e_t *gl3e,
807 mfn_t gl3mfn,
808 mfn_t sl2mfn,
809 shadow_l3e_t *sl3e,
810 fetch_type_t ft)
811 {
812 _sh_propagate(v, gl3e, gl3mfn, sl2mfn, sl3e, 3, ft, 0);
813 }
814 #endif // GUEST_PAGING_LEVELS >= 4
816 static void
817 l2e_propagate_from_guest(struct vcpu *v,
818 guest_l2e_t *gl2e,
819 mfn_t gl2mfn,
820 mfn_t sl1mfn,
821 shadow_l2e_t *sl2e,
822 fetch_type_t ft)
823 {
824 _sh_propagate(v, gl2e, gl2mfn, sl1mfn, sl2e, 2, ft, 0);
825 }
827 static void
828 l1e_propagate_from_guest(struct vcpu *v,
829 guest_l1e_t *gl1e,
830 mfn_t gl1mfn,
831 mfn_t gmfn,
832 shadow_l1e_t *sl1e,
833 fetch_type_t ft,
834 int mmio)
835 {
836 _sh_propagate(v, gl1e, gl1mfn, gmfn, sl1e, 1, ft, mmio);
837 }
840 /**************************************************************************/
841 /* These functions update shadow entries (and do bookkeeping on the shadow
842 * tables they are in). It is intended that they are the only
843 * functions which ever write (non-zero) data onto a shadow page.
844 */
846 static inline void safe_write_entry(void *dst, void *src)
847 /* Copy one PTE safely when processors might be running on the
848 * destination pagetable. This does *not* give safety against
849 * concurrent writes (that's what the shadow lock is for), just
850 * stops the hardware picking up partially written entries. */
851 {
852 volatile unsigned long *d = dst;
853 unsigned long *s = src;
854 ASSERT(!((unsigned long) d & (sizeof (shadow_l1e_t) - 1)));
855 #if CONFIG_PAGING_LEVELS == 3
856 /* In PAE mode, pagetable entries are larger
857 * than machine words, so won't get written atomically. We need to make
858 * sure any other cpu running on these shadows doesn't see a
859 * half-written entry. Do this by marking the entry not-present first,
860 * then writing the high word before the low word. */
861 BUILD_BUG_ON(sizeof (shadow_l1e_t) != 2 * sizeof (unsigned long));
862 d[0] = 0;
863 d[1] = s[1];
864 d[0] = s[0];
865 #else
866 /* In 32-bit and 64-bit, sizeof(pte) == sizeof(ulong) == 1 word,
867 * which will be an atomic write, since the entry is aligned. */
868 BUILD_BUG_ON(sizeof (shadow_l1e_t) != sizeof (unsigned long));
869 *d = *s;
870 #endif
871 }
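/* To see why the PAE ordering above is safe, consider another CPU walking
 * this shadow while the entry is being overwritten.  The only states it
 * can observe are: the complete old entry; a not-present entry (low word,
 * which holds _PAGE_PRESENT, zeroed); a not-present entry whose high word
 * is already new; or the complete new entry.  It can never see a present
 * entry whose two halves come from different values, which is the torn
 * write that the zero-first ordering exists to prevent. */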
874 static inline void
875 shadow_write_entries(void *d, void *s, int entries, mfn_t mfn)
876 /* This function does the actual writes to shadow pages.
877 * It must not be called directly, since it doesn't do the bookkeeping
878 * that shadow_set_l*e() functions do. */
879 {
880 shadow_l1e_t *dst = d;
881 shadow_l1e_t *src = s;
882 void *map = NULL;
883 int i;
885 /* Because we mirror access rights at all levels in the shadow, an
886 * l2 (or higher) entry with the RW bit cleared will leave us with
887 * no write access through the linear map.
888 * We detect that by writing to the shadow with copy_to_user() and
889 * using map_domain_page() to get a writeable mapping if we need to. */
890 if ( __copy_to_user(d, d, sizeof (unsigned long)) != 0 )
891 {
892 perfc_incrc(shadow_linear_map_failed);
893 map = sh_map_domain_page(mfn);
894 ASSERT(map != NULL);
895 dst = map + ((unsigned long)dst & (PAGE_SIZE - 1));
896 }
899 for ( i = 0; i < entries; i++ )
900 safe_write_entry(dst++, src++);
902 if ( map != NULL ) sh_unmap_domain_page(map);
903 }
905 static inline int
906 perms_strictly_increased(u32 old_flags, u32 new_flags)
907 /* Given the flags of two entries, are the new flags a strict
908 * increase in rights over the old ones? */
909 {
910 u32 of = old_flags & (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX);
911 u32 nf = new_flags & (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX);
912 /* Flip the NX bit, since it's the only one that decreases rights;
913 * we calculate as if it were an "X" bit. */
914 of ^= _PAGE_NX_BIT;
915 nf ^= _PAGE_NX_BIT;
916 /* If the changed bits are all set in the new flags, then rights strictly
917 * increased between old and new. */
918 return ((of | (of ^ nf)) == nf);
919 }
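/* Worked example of the check above, writing the flags symbolically:
 *
 *   old = PRESENT,      new = PRESENT|RW:
 *       of^nf = RW, of | (of^nf) = PRESENT|RW == nf           -> increase
 *   old = PRESENT|RW,   new = PRESENT|USER:
 *       of^nf = RW|USER, of | (of^nf) = PRESENT|RW|USER != nf -> not
 *
 * In the second case RW is lost while USER is gained, so the
 * shadow_set_l*e() callers below treat the change as needing a TLB flush.
 * Because NX is flipped first, an entry that gains the NX bit is likewise
 * treated as losing (execute) rights. */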
921 static int inline
922 shadow_get_page_from_l1e(shadow_l1e_t sl1e, struct domain *d)
923 {
924 int res;
925 mfn_t mfn;
926 struct domain *owner;
928 ASSERT(!sh_l1e_is_magic(sl1e));
930 if ( !shadow_mode_refcounts(d) )
931 return 1;
933 res = get_page_from_l1e(sl1e, d);
935 // If a privileged domain is attempting to install a map of a page it does
936 // not own, we let it succeed anyway.
937 //
938 if ( unlikely(!res) &&
939 IS_PRIV(d) &&
940 !shadow_mode_translate(d) &&
941 mfn_valid(mfn = shadow_l1e_get_mfn(sl1e)) &&
942 (owner = page_get_owner(mfn_to_page(mfn))) &&
943 (d != owner) )
944 {
945 res = get_page_from_l1e(sl1e, owner);
946 SHADOW_PRINTK("privileged domain %d installs map of mfn %05lx "
947 "which is owned by domain %d: %s\n",
948 d->domain_id, mfn_x(mfn), owner->domain_id,
949 res ? "success" : "failed");
950 }
952 if ( unlikely(!res) )
953 {
954 perfc_incrc(shadow_get_page_fail);
955 SHADOW_PRINTK("failed: l1e=%" SH_PRI_pte "\n", sl1e.l1);
956 }
958 return res;
959 }
961 static void inline
962 shadow_put_page_from_l1e(shadow_l1e_t sl1e, struct domain *d)
963 {
964 if ( !shadow_mode_refcounts(d) )
965 return;
967 put_page_from_l1e(sl1e, d);
968 }
970 #if GUEST_PAGING_LEVELS >= 4
971 static int shadow_set_l4e(struct vcpu *v,
972 shadow_l4e_t *sl4e,
973 shadow_l4e_t new_sl4e,
974 mfn_t sl4mfn)
975 {
976 int flags = 0, ok;
977 shadow_l4e_t old_sl4e;
978 paddr_t paddr;
979 ASSERT(sl4e != NULL);
980 old_sl4e = *sl4e;
982 if ( old_sl4e.l4 == new_sl4e.l4 ) return 0; /* Nothing to do */
984 paddr = ((((paddr_t)mfn_x(sl4mfn)) << PAGE_SHIFT)
985 | (((unsigned long)sl4e) & ~PAGE_MASK));
987 if ( shadow_l4e_get_flags(new_sl4e) & _PAGE_PRESENT )
988 {
989 /* About to install a new reference */
990 mfn_t sl3mfn = shadow_l4e_get_mfn(new_sl4e);
991 ok = sh_get_ref(v, sl3mfn, paddr);
992 /* Are we pinning l3 shadows to handle weird linux behaviour? */
993 if ( sh_type_is_pinnable(v, SH_type_l3_64_shadow) )
994 ok |= sh_pin(v, sl3mfn);
995 if ( !ok )
996 {
997 domain_crash(v->domain);
998 return SHADOW_SET_ERROR;
999 }
1002 /* Write the new entry */
1003 shadow_write_entries(sl4e, &new_sl4e, 1, sl4mfn);
1004 flags |= SHADOW_SET_CHANGED;
1006 if ( shadow_l4e_get_flags(old_sl4e) & _PAGE_PRESENT )
1008 /* We lost a reference to an old mfn. */
1009 mfn_t osl3mfn = shadow_l4e_get_mfn(old_sl4e);
1010 if ( (mfn_x(osl3mfn) != mfn_x(shadow_l4e_get_mfn(new_sl4e)))
1011 || !perms_strictly_increased(shadow_l4e_get_flags(old_sl4e),
1012 shadow_l4e_get_flags(new_sl4e)) )
1014 flags |= SHADOW_SET_FLUSH;
1016 sh_put_ref(v, osl3mfn, paddr);
1018 return flags;
1021 static int shadow_set_l3e(struct vcpu *v,
1022 shadow_l3e_t *sl3e,
1023 shadow_l3e_t new_sl3e,
1024 mfn_t sl3mfn)
1026 int flags = 0;
1027 shadow_l3e_t old_sl3e;
1028 paddr_t paddr;
1029 ASSERT(sl3e != NULL);
1030 old_sl3e = *sl3e;
1032 if ( old_sl3e.l3 == new_sl3e.l3 ) return 0; /* Nothing to do */
1034 paddr = ((((paddr_t)mfn_x(sl3mfn)) << PAGE_SHIFT)
1035 | (((unsigned long)sl3e) & ~PAGE_MASK));
1037 if ( shadow_l3e_get_flags(new_sl3e) & _PAGE_PRESENT )
1038 /* About to install a new reference */
1039 if ( !sh_get_ref(v, shadow_l3e_get_mfn(new_sl3e), paddr) )
1041 domain_crash(v->domain);
1042 return SHADOW_SET_ERROR;
1045 /* Write the new entry */
1046 shadow_write_entries(sl3e, &new_sl3e, 1, sl3mfn);
1047 flags |= SHADOW_SET_CHANGED;
1049 if ( shadow_l3e_get_flags(old_sl3e) & _PAGE_PRESENT )
1051 /* We lost a reference to an old mfn. */
1052 mfn_t osl2mfn = shadow_l3e_get_mfn(old_sl3e);
1053 if ( (mfn_x(osl2mfn) != mfn_x(shadow_l3e_get_mfn(new_sl3e))) ||
1054 !perms_strictly_increased(shadow_l3e_get_flags(old_sl3e),
1055 shadow_l3e_get_flags(new_sl3e)) )
1057 flags |= SHADOW_SET_FLUSH;
1059 sh_put_ref(v, osl2mfn, paddr);
1061 return flags;
1063 #endif /* GUEST_PAGING_LEVELS >= 4 */
1065 static int shadow_set_l2e(struct vcpu *v,
1066 shadow_l2e_t *sl2e,
1067 shadow_l2e_t new_sl2e,
1068 mfn_t sl2mfn)
1070 int flags = 0;
1071 shadow_l2e_t old_sl2e;
1072 paddr_t paddr;
1074 #if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2
1075 /* In 2-on-3 we work with pairs of l2es pointing at two-page
1076 * shadows. Reference counting and up-pointers track from the first
1077 * page of the shadow to the first l2e, so make sure that we're
1078 * working with those:
1079 * Align the pointer down so it's pointing at the first of the pair */
1080 sl2e = (shadow_l2e_t *)((unsigned long)sl2e & ~(sizeof(shadow_l2e_t)));
1081 /* Align the mfn of the shadow entry too */
1082 new_sl2e.l2 &= ~(1<<PAGE_SHIFT);
1083 #endif
1085 ASSERT(sl2e != NULL);
1086 old_sl2e = *sl2e;
1088 if ( old_sl2e.l2 == new_sl2e.l2 ) return 0; /* Nothing to do */
1090 paddr = ((((paddr_t)mfn_x(sl2mfn)) << PAGE_SHIFT)
1091 | (((unsigned long)sl2e) & ~PAGE_MASK));
1093 if ( shadow_l2e_get_flags(new_sl2e) & _PAGE_PRESENT )
1094 /* About to install a new reference */
1095 if ( !sh_get_ref(v, shadow_l2e_get_mfn(new_sl2e), paddr) )
1097 domain_crash(v->domain);
1098 return SHADOW_SET_ERROR;
1101 /* Write the new entry */
1102 #if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2
1104 shadow_l2e_t pair[2] = { new_sl2e, new_sl2e };
1105 /* The l1 shadow is two pages long and needs to be pointed to by
1106 * two adjacent l2es. The pair have the same flags, but point
1107 * at odd and even MFNs */
1108 ASSERT(!(pair[0].l2 & (1<<PAGE_SHIFT)));
1109 pair[1].l2 |= (1<<PAGE_SHIFT);
1110 shadow_write_entries(sl2e, &pair, 2, sl2mfn);
1112 #else /* normal case */
1113 shadow_write_entries(sl2e, &new_sl2e, 1, sl2mfn);
1114 #endif
1115 flags |= SHADOW_SET_CHANGED;
1117 if ( shadow_l2e_get_flags(old_sl2e) & _PAGE_PRESENT )
1119 /* We lost a reference to an old mfn. */
1120 mfn_t osl1mfn = shadow_l2e_get_mfn(old_sl2e);
1121 if ( (mfn_x(osl1mfn) != mfn_x(shadow_l2e_get_mfn(new_sl2e))) ||
1122 !perms_strictly_increased(shadow_l2e_get_flags(old_sl2e),
1123 shadow_l2e_get_flags(new_sl2e)) )
1125 flags |= SHADOW_SET_FLUSH;
1127 sh_put_ref(v, osl1mfn, paddr);
1129 return flags;
1132 static int shadow_set_l1e(struct vcpu *v,
1133 shadow_l1e_t *sl1e,
1134 shadow_l1e_t new_sl1e,
1135 mfn_t sl1mfn)
1137 int flags = 0;
1138 struct domain *d = v->domain;
1139 shadow_l1e_t old_sl1e;
1140 ASSERT(sl1e != NULL);
1142 old_sl1e = *sl1e;
1144 if ( old_sl1e.l1 == new_sl1e.l1 ) return 0; /* Nothing to do */
1146 if ( (shadow_l1e_get_flags(new_sl1e) & _PAGE_PRESENT)
1147 && !sh_l1e_is_magic(new_sl1e) )
1149 /* About to install a new reference */
1150 if ( shadow_mode_refcounts(d) ) {
1151 if ( shadow_get_page_from_l1e(new_sl1e, d) == 0 )
1153 /* Doesn't look like a pagetable. */
1154 flags |= SHADOW_SET_ERROR;
1155 new_sl1e = shadow_l1e_empty();
1160 /* Write the new entry */
1161 shadow_write_entries(sl1e, &new_sl1e, 1, sl1mfn);
1162 flags |= SHADOW_SET_CHANGED;
1164 if ( (shadow_l1e_get_flags(old_sl1e) & _PAGE_PRESENT)
1165 && !sh_l1e_is_magic(old_sl1e) )
1167 /* We lost a reference to an old mfn. */
1168 /* N.B. Unlike higher-level sets, never need an extra flush
1169 * when writing an l1e. Because it points to the same guest frame
1170 * as the guest l1e did, it's the guest's responsibility to
1171 * trigger a flush later. */
1172 if ( shadow_mode_refcounts(d) )
1174 shadow_put_page_from_l1e(old_sl1e, d);
1177 return flags;
1181 /**************************************************************************/
1182 /* Macros to walk pagetables. These take the shadow of a pagetable and
1183 * walk every "interesting" entry. That is, they don't touch Xen mappings,
1184 * and for 32-bit l2s shadowed onto PAE or 64-bit, they only touch every
1185 * second entry (since pairs of entries are managed together). For multi-page
1186 * shadows they walk all pages.
1188 * Arguments are an MFN, the variable to point to each entry, a variable
1189 * to indicate that we are done (we will shortcut to the end of the scan
1190 * when _done != 0), a variable to indicate that we should avoid Xen mappings,
1191 * and the code.
1193 * WARNING: These macros have side-effects. They change the values of both
1194 * the pointer and the MFN. */
1196 static inline void increment_ptr_to_guest_entry(void *ptr)
1198 if ( ptr )
1200 guest_l1e_t **entry = ptr;
1201 (*entry)++;
1205 /* All kinds of l1: touch all entries */
1206 #define _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code) \
1207 do { \
1208 int _i; \
1209 shadow_l1e_t *_sp = map_shadow_page((_sl1mfn)); \
1210 ASSERT(mfn_to_shadow_page(_sl1mfn)->type == SH_type_l1_shadow \
1211 || mfn_to_shadow_page(_sl1mfn)->type == SH_type_fl1_shadow); \
1212 for ( _i = 0; _i < SHADOW_L1_PAGETABLE_ENTRIES; _i++ ) \
1213 { \
1214 (_sl1e) = _sp + _i; \
1215 if ( shadow_l1e_get_flags(*(_sl1e)) & _PAGE_PRESENT ) \
1216 {_code} \
1217 if ( _done ) break; \
1218 increment_ptr_to_guest_entry(_gl1p); \
1219 } \
1220 unmap_shadow_page(_sp); \
1221 } while (0)
1223 /* 32-bit l1, on PAE or 64-bit shadows: need to walk both pages of shadow */
1224 #if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2
1225 #define SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code) \
1226 do { \
1227 int __done = 0; \
1228 _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, \
1229 ({ (__done = _done); }), _code); \
1230 _sl1mfn = _mfn(mfn_x(_sl1mfn) + 1); \
1231 if ( !__done ) \
1232 _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, \
1233 ({ (__done = _done); }), _code); \
1234 } while (0)
1235 #else /* Everything else; l1 shadows are only one page */
1236 #define SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code) \
1237 _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code)
1238 #endif
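/* A typical invocation looks like this (sketch only: sl1mfn is assumed to
 * hold the mfn of an l1 shadow; the guest-entry pointer and the "done"
 * expression are not needed here, so 0 is passed for both):
 *
 *     shadow_l1e_t *sl1e;
 *     int present = 0;
 *     SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, 0, { present++; });
 *
 * As the warning above says, the macro may advance sl1mfn (two-page l1
 * shadows), so the caller must not rely on its value afterwards. */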
1241 #if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2
1243 /* 32-bit l2 on PAE/64: four pages, touch every second entry, and avoid Xen */
1244 #define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _dom, _code) \
1245 do { \
1246 int _i, _j, __done = 0; \
1247 int _xen = !shadow_mode_external(_dom); \
1248 ASSERT(mfn_to_shadow_page(_sl2mfn)->type == SH_type_l2_32_shadow); \
1249 for ( _j = 0; _j < 4 && !__done; _j++ ) \
1250 { \
1251 shadow_l2e_t *_sp = map_shadow_page(_sl2mfn); \
1252 for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i += 2 ) \
1253 if ( (!(_xen)) \
1254 || ((_j * SHADOW_L2_PAGETABLE_ENTRIES) + _i) \
1255 < (HYPERVISOR_VIRT_START >> SHADOW_L2_PAGETABLE_SHIFT) ) \
1256 { \
1257 (_sl2e) = _sp + _i; \
1258 if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \
1259 {_code} \
1260 if ( (__done = (_done)) ) break; \
1261 increment_ptr_to_guest_entry(_gl2p); \
1262 } \
1263 unmap_shadow_page(_sp); \
1264 _sl2mfn = _mfn(mfn_x(_sl2mfn) + 1); \
1265 } \
1266 } while (0)
1268 #elif GUEST_PAGING_LEVELS == 2
1270 /* 32-bit on 32-bit: avoid Xen entries */
1271 #define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _dom, _code) \
1272 do { \
1273 int _i; \
1274 int _xen = !shadow_mode_external(_dom); \
1275 shadow_l2e_t *_sp = map_shadow_page((_sl2mfn)); \
1276 ASSERT(mfn_to_shadow_page(_sl2mfn)->type == SH_type_l2_32_shadow); \
1277 for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i++ ) \
1278 if ( (!(_xen)) \
1279 || \
1280 (_i < (HYPERVISOR_VIRT_START >> SHADOW_L2_PAGETABLE_SHIFT)) ) \
1281 { \
1282 (_sl2e) = _sp + _i; \
1283 if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \
1284 {_code} \
1285 if ( _done ) break; \
1286 increment_ptr_to_guest_entry(_gl2p); \
1287 } \
1288 unmap_shadow_page(_sp); \
1289 } while (0)
1291 #elif GUEST_PAGING_LEVELS == 3
1293 /* PAE: if it's an l2h, don't touch Xen mappings */
1294 #define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _dom, _code) \
1295 do { \
1296 int _i; \
1297 int _xen = !shadow_mode_external(_dom); \
1298 shadow_l2e_t *_sp = map_shadow_page((_sl2mfn)); \
1299 ASSERT(mfn_to_shadow_page(_sl2mfn)->type == SH_type_l2_pae_shadow \
1300 || mfn_to_shadow_page(_sl2mfn)->type == SH_type_l2h_pae_shadow);\
1301 for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i++ ) \
1302 if ( (!(_xen)) \
1303 || mfn_to_shadow_page(_sl2mfn)->type != SH_type_l2h_pae_shadow\
1304 || ((_i + (3 * SHADOW_L2_PAGETABLE_ENTRIES)) \
1305 < (HYPERVISOR_VIRT_START >> SHADOW_L2_PAGETABLE_SHIFT)) ) \
1306 { \
1307 (_sl2e) = _sp + _i; \
1308 if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \
1309 {_code} \
1310 if ( _done ) break; \
1311 increment_ptr_to_guest_entry(_gl2p); \
1312 } \
1313 unmap_shadow_page(_sp); \
1314 } while (0)
1316 #else
1318 /* 64-bit l2: touch all entries except for PAE compat guests. */
1319 #define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _dom, _code) \
1320 do { \
1321 int _i; \
1322 int _xen = !shadow_mode_external(_dom); \
1323 shadow_l2e_t *_sp = map_shadow_page((_sl2mfn)); \
1324 ASSERT(mfn_to_shadow_page(_sl2mfn)->type == SH_type_l2_64_shadow || \
1325 mfn_to_shadow_page(_sl2mfn)->type == SH_type_l2h_64_shadow); \
1326 for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i++ ) \
1327 { \
1328 if ( (!(_xen)) \
1329 || !IS_COMPAT(_dom) \
1330 || mfn_to_shadow_page(_sl2mfn)->type != SH_type_l2h_64_shadow \
1331 || (_i < COMPAT_L2_PAGETABLE_FIRST_XEN_SLOT(_dom)) ) \
1332 { \
1333 (_sl2e) = _sp + _i; \
1334 if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \
1335 {_code} \
1336 if ( _done ) break; \
1337 increment_ptr_to_guest_entry(_gl2p); \
1338 } \
1339 } \
1340 unmap_shadow_page(_sp); \
1341 } while (0)
1343 #endif /* different kinds of l2 */
1345 #if GUEST_PAGING_LEVELS == 4
1347 /* 64-bit l3: touch all entries */
1348 #define SHADOW_FOREACH_L3E(_sl3mfn, _sl3e, _gl3p, _done, _code) \
1349 do { \
1350 int _i; \
1351 shadow_l3e_t *_sp = map_shadow_page((_sl3mfn)); \
1352 ASSERT(mfn_to_shadow_page(_sl3mfn)->type == SH_type_l3_64_shadow); \
1353 for ( _i = 0; _i < SHADOW_L3_PAGETABLE_ENTRIES; _i++ ) \
1354 { \
1355 (_sl3e) = _sp + _i; \
1356 if ( shadow_l3e_get_flags(*(_sl3e)) & _PAGE_PRESENT ) \
1357 {_code} \
1358 if ( _done ) break; \
1359 increment_ptr_to_guest_entry(_gl3p); \
1360 } \
1361 unmap_shadow_page(_sp); \
1362 } while (0)
1364 /* 64-bit l4: avoid Xen mappings */
1365 #define SHADOW_FOREACH_L4E(_sl4mfn, _sl4e, _gl4p, _done, _dom, _code) \
1366 do { \
1367 shadow_l4e_t *_sp = map_shadow_page((_sl4mfn)); \
1368 int _xen = !shadow_mode_external(_dom); \
1369 int _i; \
1370 ASSERT(mfn_to_shadow_page(_sl4mfn)->type == SH_type_l4_64_shadow); \
1371 for ( _i = 0; _i < SHADOW_L4_PAGETABLE_ENTRIES; _i++ ) \
1372 { \
1373 if ( (!(_xen)) || is_guest_l4_slot(_dom, _i) ) \
1374 { \
1375 (_sl4e) = _sp + _i; \
1376 if ( shadow_l4e_get_flags(*(_sl4e)) & _PAGE_PRESENT ) \
1377 {_code} \
1378 if ( _done ) break; \
1379 } \
1380 increment_ptr_to_guest_entry(_gl4p); \
1381 } \
1382 unmap_shadow_page(_sp); \
1383 } while (0)
1385 #endif
1389 /**************************************************************************/
1390 /* Functions to install Xen mappings and linear mappings in shadow pages */
1392 // XXX -- this function should probably be moved to shadow-common.c, but that
1393 // probably wants to wait until the shadow types have been moved from
1394 // shadow-types.h to shadow-private.h
1395 //
1396 #if CONFIG_PAGING_LEVELS == 4 && GUEST_PAGING_LEVELS == 4
1397 void sh_install_xen_entries_in_l4(struct vcpu *v, mfn_t gl4mfn, mfn_t sl4mfn)
1399 struct domain *d = v->domain;
1400 shadow_l4e_t *sl4e;
1402 sl4e = sh_map_domain_page(sl4mfn);
1403 ASSERT(sl4e != NULL);
1404 ASSERT(sizeof (l4_pgentry_t) == sizeof (shadow_l4e_t));
1406 /* Copy the common Xen mappings from the idle domain */
1407 memcpy(&sl4e[ROOT_PAGETABLE_FIRST_XEN_SLOT],
1408 &idle_pg_table[ROOT_PAGETABLE_FIRST_XEN_SLOT],
1409 ROOT_PAGETABLE_XEN_SLOTS * sizeof(l4_pgentry_t));
1411 /* Install the per-domain mappings for this domain */
1412 sl4e[shadow_l4_table_offset(PERDOMAIN_VIRT_START)] =
1413 shadow_l4e_from_mfn(page_to_mfn(virt_to_page(d->arch.mm_perdomain_l3)),
1414 __PAGE_HYPERVISOR);
1416 /* Linear mapping */
1417 sl4e[shadow_l4_table_offset(SH_LINEAR_PT_VIRT_START)] =
1418 shadow_l4e_from_mfn(sl4mfn, __PAGE_HYPERVISOR);
1420 if ( shadow_mode_translate(v->domain) && !shadow_mode_external(v->domain) )
1422 // linear tables may not be used with translated PV guests
1423 sl4e[shadow_l4_table_offset(LINEAR_PT_VIRT_START)] =
1424 shadow_l4e_empty();
1426 else
1428 sl4e[shadow_l4_table_offset(LINEAR_PT_VIRT_START)] =
1429 shadow_l4e_from_mfn(gl4mfn, __PAGE_HYPERVISOR);
1432 if ( shadow_mode_translate(v->domain) )
1434 /* install domain-specific P2M table */
1435 sl4e[shadow_l4_table_offset(RO_MPT_VIRT_START)] =
1436 shadow_l4e_from_mfn(pagetable_get_mfn(d->arch.phys_table),
1437 __PAGE_HYPERVISOR);
1440 if ( IS_COMPAT(v->domain) )
1442 /* install compat arg xlat entry */
1443 sl4e[shadow_l4_table_offset(COMPAT_ARG_XLAT_VIRT_BASE)] =
1444 shadow_l4e_from_mfn(
1445 page_to_mfn(virt_to_page(d->arch.mm_arg_xlat_l3)),
1446 __PAGE_HYPERVISOR);
1449 sh_unmap_domain_page(sl4e);
1451 #endif
1453 #if CONFIG_PAGING_LEVELS >= 3 && GUEST_PAGING_LEVELS >= 3
1454 // For 3-on-3 PV guests, we need to make sure the xen mappings are in
1455 // place, which means that we need to populate the l2h entry in the l3
1456 // table.
1458 static void sh_install_xen_entries_in_l2h(struct vcpu *v, mfn_t sl2hmfn)
1460 struct domain *d = v->domain;
1461 shadow_l2e_t *sl2e;
1462 #if CONFIG_PAGING_LEVELS == 3
1463 int i;
1464 #else
1466 if ( !pv_32bit_guest(v) )
1467 return;
1468 #endif
1470 sl2e = sh_map_domain_page(sl2hmfn);
1471 ASSERT(sl2e != NULL);
1472 ASSERT(sizeof (l2_pgentry_t) == sizeof (shadow_l2e_t));
1474 #if CONFIG_PAGING_LEVELS == 3
1476 /* Copy the common Xen mappings from the idle domain */
1477 memcpy(&sl2e[L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1)],
1478 &idle_pg_table_l2[L2_PAGETABLE_FIRST_XEN_SLOT],
1479 L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
1481 /* Install the per-domain mappings for this domain */
1482 for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
1483 sl2e[shadow_l2_table_offset(PERDOMAIN_VIRT_START) + i] =
1484 shadow_l2e_from_mfn(
1485 page_to_mfn(virt_to_page(d->arch.mm_perdomain_pt) + i),
1486 __PAGE_HYPERVISOR);
1488 /* We don't set up a linear mapping here because we can't until this
1489 * l2h is installed in an l3e. sh_update_linear_entries() handles
1490 * the linear mappings when CR3 (and so the fourth l3e) is loaded.
1491 * We zero them here, just as a safety measure.
1492 */
1493 for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
1494 sl2e[shadow_l2_table_offset(LINEAR_PT_VIRT_START) + i] =
1495 shadow_l2e_empty();
1496 for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
1497 sl2e[shadow_l2_table_offset(SH_LINEAR_PT_VIRT_START) + i] =
1498 shadow_l2e_empty();
1500 if ( shadow_mode_translate(d) )
1502 /* Install the domain-specific p2m table */
1503 l3_pgentry_t *p2m;
1504 ASSERT(pagetable_get_pfn(d->arch.phys_table) != 0);
1505 p2m = sh_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
1506 for ( i = 0; i < MACHPHYS_MBYTES>>1; i++ )
1508 sl2e[shadow_l2_table_offset(RO_MPT_VIRT_START) + i] =
1509 (l3e_get_flags(p2m[i]) & _PAGE_PRESENT)
1510 ? shadow_l2e_from_mfn(_mfn(l3e_get_pfn(p2m[i])),
1511 __PAGE_HYPERVISOR)
1512 : shadow_l2e_empty();
1514 sh_unmap_domain_page(p2m);
1517 #else
1519 /* Copy the common Xen mappings from the idle domain */
1520 memcpy(
1521 &sl2e[COMPAT_L2_PAGETABLE_FIRST_XEN_SLOT(d)],
1522 &compat_idle_pg_table_l2[l2_table_offset(HIRO_COMPAT_MPT_VIRT_START)],
1523 COMPAT_L2_PAGETABLE_XEN_SLOTS(d) * sizeof(*sl2e));
1525 #endif
1527 sh_unmap_domain_page(sl2e);
1529 #endif
1532 #if CONFIG_PAGING_LEVELS == 2 && GUEST_PAGING_LEVELS == 2
1533 void sh_install_xen_entries_in_l2(struct vcpu *v, mfn_t gl2mfn, mfn_t sl2mfn)
1535 struct domain *d = v->domain;
1536 shadow_l2e_t *sl2e;
1537 int i;
1539 sl2e = sh_map_domain_page(sl2mfn);
1540 ASSERT(sl2e != NULL);
1541 ASSERT(sizeof (l2_pgentry_t) == sizeof (shadow_l2e_t));
1543 /* Copy the common Xen mappings from the idle domain */
1544 memcpy(&sl2e[L2_PAGETABLE_FIRST_XEN_SLOT],
1545 &idle_pg_table[L2_PAGETABLE_FIRST_XEN_SLOT],
1546 L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
1548 /* Install the per-domain mappings for this domain */
1549 for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
1550 sl2e[shadow_l2_table_offset(PERDOMAIN_VIRT_START) + i] =
1551 shadow_l2e_from_mfn(
1552 page_to_mfn(virt_to_page(d->arch.mm_perdomain_pt) + i),
1553 __PAGE_HYPERVISOR);
1555 /* Linear mapping */
1556 sl2e[shadow_l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
1557 shadow_l2e_from_mfn(sl2mfn, __PAGE_HYPERVISOR);
1559 if ( shadow_mode_translate(v->domain) && !shadow_mode_external(v->domain) )
1561 // linear tables may not be used with translated PV guests
1562 sl2e[shadow_l2_table_offset(LINEAR_PT_VIRT_START)] =
1563 shadow_l2e_empty();
1565 else
1567 sl2e[shadow_l2_table_offset(LINEAR_PT_VIRT_START)] =
1568 shadow_l2e_from_mfn(gl2mfn, __PAGE_HYPERVISOR);
1571 if ( shadow_mode_translate(d) )
1573 /* install domain-specific P2M table */
1574 sl2e[shadow_l2_table_offset(RO_MPT_VIRT_START)] =
1575 shadow_l2e_from_mfn(pagetable_get_mfn(d->arch.phys_table),
1576 __PAGE_HYPERVISOR);
1579 sh_unmap_domain_page(sl2e);
1581 #endif
1585 /**************************************************************************/
1586 /* Create a shadow of a given guest page.
1587 */
1588 static mfn_t
1589 sh_make_shadow(struct vcpu *v, mfn_t gmfn, u32 shadow_type)
1591 mfn_t smfn = shadow_alloc(v->domain, shadow_type, mfn_x(gmfn));
1592 SHADOW_DEBUG(MAKE_SHADOW, "(%05lx, %u)=>%05lx\n",
1593 mfn_x(gmfn), shadow_type, mfn_x(smfn));
1595 if ( shadow_type != SH_type_l2_32_shadow
1596 && shadow_type != SH_type_l2_pae_shadow
1597 && shadow_type != SH_type_l2h_pae_shadow
1598 && shadow_type != SH_type_l4_64_shadow )
1599 /* Lower-level shadow, not yet linked from a higher level */
1600 mfn_to_shadow_page(smfn)->up = 0;
1602 #if GUEST_PAGING_LEVELS == 4
1603 #if (SHADOW_OPTIMIZATIONS & SHOPT_LINUX_L3_TOPLEVEL)
1604 if ( shadow_type == SH_type_l4_64_shadow &&
1605 unlikely(v->domain->arch.paging.shadow.opt_flags & SHOPT_LINUX_L3_TOPLEVEL) )
1607 /* We're shadowing a new l4, but we've been assuming the guest uses
1608 * only one l4 per vcpu and context switches using an l4 entry.
1609 * Count the number of active l4 shadows. If there are enough
1610 * of them, decide that this isn't an old linux guest, and stop
1611 * pinning l3es. This is not very quick but it doesn't happen
1612 * very often. */
1613 struct list_head *l, *t;
1614 struct shadow_page_info *sp;
1615 struct vcpu *v2;
1616 int l4count = 0, vcpus = 0;
1617 list_for_each(l, &v->domain->arch.paging.shadow.pinned_shadows)
1619 sp = list_entry(l, struct shadow_page_info, list);
1620 if ( sp->type == SH_type_l4_64_shadow )
1621 l4count++;
1623 for_each_vcpu ( v->domain, v2 )
1624 vcpus++;
1625 if ( l4count > 2 * vcpus )
1627 /* Unpin all the pinned l3 tables, and don't pin any more. */
1628 list_for_each_safe(l, t, &v->domain->arch.paging.shadow.pinned_shadows)
1630 sp = list_entry(l, struct shadow_page_info, list);
1631 if ( sp->type == SH_type_l3_64_shadow )
1632 sh_unpin(v, shadow_page_to_mfn(sp));
1634 v->domain->arch.paging.shadow.opt_flags &= ~SHOPT_LINUX_L3_TOPLEVEL;
1637 #endif
1638 #endif
1640 // Create the Xen mappings...
1641 if ( !shadow_mode_external(v->domain) )
1643 switch (shadow_type)
1645 #if CONFIG_PAGING_LEVELS == 4 && GUEST_PAGING_LEVELS == 4
1646 case SH_type_l4_shadow:
1647 sh_install_xen_entries_in_l4(v, gmfn, smfn); break;
1648 #endif
1649 #if CONFIG_PAGING_LEVELS >= 3 && GUEST_PAGING_LEVELS >= 3
1650 case SH_type_l2h_shadow:
1651 #ifdef CONFIG_COMPAT
1652 ASSERT( IS_COMPAT(v->domain) );
1653 #endif
1654 sh_install_xen_entries_in_l2h(v, smfn); break;
1655 #endif
1656 #if CONFIG_PAGING_LEVELS == 2 && GUEST_PAGING_LEVELS == 2
1657 case SH_type_l2_shadow:
1658 sh_install_xen_entries_in_l2(v, gmfn, smfn); break;
1659 #endif
1660 default: /* Do nothing */ break;
1664 shadow_promote(v, gmfn, shadow_type);
1665 set_shadow_status(v, gmfn, shadow_type, smfn);
1667 return smfn;
1670 /* Make a splintered superpage shadow */
1671 static mfn_t
1672 make_fl1_shadow(struct vcpu *v, gfn_t gfn)
1674 mfn_t smfn = shadow_alloc(v->domain, SH_type_fl1_shadow,
1675 (unsigned long) gfn_x(gfn));
1677 SHADOW_DEBUG(MAKE_SHADOW, "(%" SH_PRI_gfn ")=>%" PRI_mfn "\n",
1678 gfn_x(gfn), mfn_x(smfn));
1680 set_fl1_shadow_status(v, gfn, smfn);
1681 return smfn;
1685 #if SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS
1686 mfn_t
1687 sh_make_monitor_table(struct vcpu *v)
1689 struct domain *d = v->domain;
1691 ASSERT(pagetable_get_pfn(v->arch.monitor_table) == 0);
1693 /* Guarantee we can get the memory we need */
1694 shadow_prealloc(d, SHADOW_MAX_ORDER);
1696 #if CONFIG_PAGING_LEVELS == 4
1698 mfn_t m4mfn;
1699 m4mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1700 sh_install_xen_entries_in_l4(v, m4mfn, m4mfn);
1701 /* Remember the level of this table */
1702 mfn_to_page(m4mfn)->shadow_flags = 4;
1703 #if SHADOW_PAGING_LEVELS < 4
1704 // Install a monitor l3 table in slot 0 of the l4 table.
1705 // This is used for shadow linear maps.
1707 mfn_t m3mfn;
1708 l4_pgentry_t *l4e;
1709 m3mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1710 mfn_to_page(m3mfn)->shadow_flags = 3;
1711 l4e = sh_map_domain_page(m4mfn);
1712 l4e[0] = l4e_from_pfn(mfn_x(m3mfn), __PAGE_HYPERVISOR);
1713 sh_unmap_domain_page(l4e);
1714 if ( pv_32bit_guest(v) )
1716 // Install a monitor l2 table in slot 3 of the l3 table.
1717 // This is used for all Xen entries.
1718 mfn_t m2mfn;
1719 l3_pgentry_t *l3e;
1720 m2mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1721 mfn_to_page(m2mfn)->shadow_flags = 2;
1722 l3e = sh_map_domain_page(m3mfn);
1723 l3e[3] = l3e_from_pfn(mfn_x(m2mfn), _PAGE_PRESENT);
1724 sh_install_xen_entries_in_l2h(v, m2mfn);
1725 sh_unmap_domain_page(l3e);
1728 #endif /* SHADOW_PAGING_LEVELS < 4 */
1729 return m4mfn;
1732 #elif CONFIG_PAGING_LEVELS == 3
1735 mfn_t m3mfn, m2mfn;
1736 l3_pgentry_t *l3e;
1737 l2_pgentry_t *l2e;
1738 int i;
1740 m3mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1741 /* Remember the level of this table */
1742 mfn_to_page(m3mfn)->shadow_flags = 3;
1744 // Install a monitor l2 table in slot 3 of the l3 table.
1745 // This is used for all Xen entries, including linear maps
1746 m2mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1747 mfn_to_page(m2mfn)->shadow_flags = 2;
1748 l3e = sh_map_domain_page(m3mfn);
1749 l3e[3] = l3e_from_pfn(mfn_x(m2mfn), _PAGE_PRESENT);
1750 sh_install_xen_entries_in_l2h(v, m2mfn);
1751 /* Install the monitor's own linear map */
1752 l2e = sh_map_domain_page(m2mfn);
1753 for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
1754 l2e[l2_table_offset(LINEAR_PT_VIRT_START) + i] =
1755 (l3e_get_flags(l3e[i]) & _PAGE_PRESENT)
1756 ? l2e_from_pfn(l3e_get_pfn(l3e[i]), __PAGE_HYPERVISOR)
1757 : l2e_empty();
1758 sh_unmap_domain_page(l2e);
1759 sh_unmap_domain_page(l3e);
1761 SHADOW_PRINTK("new monitor table: %#lx\n", mfn_x(m3mfn));
1762 return m3mfn;
1765 #elif CONFIG_PAGING_LEVELS == 2
1768 mfn_t m2mfn;
1769 m2mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1770 sh_install_xen_entries_in_l2(v, m2mfn, m2mfn);
1771 /* Remember the level of this table */
1772 mfn_to_page(m2mfn)->shadow_flags = 2;
1773 return m2mfn;
1776 #else
1777 #error this should not happen
1778 #endif /* CONFIG_PAGING_LEVELS */
1780 #endif /* SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS */
1782 /**************************************************************************/
1783 /* These functions also take a virtual address and return the level-N
1784 * shadow table mfn and entry, but they create the shadow pagetables if
1785 * they are needed. The "demand" argument is non-zero when handling
1786 * a demand fault (so we know what to do about accessed bits &c).
1787 * If the necessary tables are not present in the guest, they return NULL. */
1789 /* N.B. The use of GUEST_PAGING_LEVELS here is correct. If the shadow has
1790 * more levels than the guest, the upper levels are always fixed and do not
1791 * reflect any information from the guest, so we do not use these functions
1792 * to access them. */
1794 #if GUEST_PAGING_LEVELS >= 4
1795 static shadow_l4e_t * shadow_get_and_create_l4e(struct vcpu *v,
1796 walk_t *gw,
1797 mfn_t *sl4mfn)
1799 /* There is always a shadow of the top level table. Get it. */
1800 *sl4mfn = pagetable_get_mfn(v->arch.shadow_table[0]);
1801 /* Reading the top level table is always valid. */
1802 return sh_linear_l4_table(v) + shadow_l4_linear_offset(gw->va);
1805 static shadow_l3e_t * shadow_get_and_create_l3e(struct vcpu *v,
1806 walk_t *gw,
1807 mfn_t *sl3mfn,
1808 fetch_type_t ft)
1810 mfn_t sl4mfn;
1811 shadow_l4e_t *sl4e;
1812 if ( !mfn_valid(gw->l3mfn) ) return NULL; /* No guest page. */
1813 /* Get the l4e */
1814 sl4e = shadow_get_and_create_l4e(v, gw, &sl4mfn);
1815 ASSERT(sl4e != NULL);
1816 if ( shadow_l4e_get_flags(*sl4e) & _PAGE_PRESENT )
1818 *sl3mfn = shadow_l4e_get_mfn(*sl4e);
1819 ASSERT(mfn_valid(*sl3mfn));
1821 else
1823 int r;
1824 shadow_l4e_t new_sl4e;
1825 /* No l3 shadow installed: find and install it. */
1826 *sl3mfn = get_shadow_status(v, gw->l3mfn, SH_type_l3_shadow);
1827 if ( !mfn_valid(*sl3mfn) )
1829 /* No l3 shadow of this page exists at all: make one. */
1830 *sl3mfn = sh_make_shadow(v, gw->l3mfn, SH_type_l3_shadow);
1832 /* Install the new sl3 table in the sl4e */
1833 l4e_propagate_from_guest(v, gw->l4e, gw->l4mfn,
1834 *sl3mfn, &new_sl4e, ft);
1835 r = shadow_set_l4e(v, sl4e, new_sl4e, sl4mfn);
1836 ASSERT((r & SHADOW_SET_FLUSH) == 0);
1837 if ( r & SHADOW_SET_ERROR )
1838 return NULL;
1840 /* Now follow it down a level. Guaranteed to succeed. */
1841 return sh_linear_l3_table(v) + shadow_l3_linear_offset(gw->va);
1843 #endif /* GUEST_PAGING_LEVELS >= 4 */
1846 static shadow_l2e_t * shadow_get_and_create_l2e(struct vcpu *v,
1847 walk_t *gw,
1848 mfn_t *sl2mfn,
1849 fetch_type_t ft)
1851 #if GUEST_PAGING_LEVELS >= 4 /* 64bit... */
1852 mfn_t sl3mfn = _mfn(INVALID_MFN);
1853 shadow_l3e_t *sl3e;
1854 if ( !mfn_valid(gw->l2mfn) ) return NULL; /* No guest page. */
1855 /* Get the l3e */
1856 sl3e = shadow_get_and_create_l3e(v, gw, &sl3mfn, ft);
1857 if ( sl3e == NULL ) return NULL;
1858 if ( shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT )
1860 *sl2mfn = shadow_l3e_get_mfn(*sl3e);
1861 ASSERT(mfn_valid(*sl2mfn));
1863 else
1865 int r;
1866 shadow_l3e_t new_sl3e;
1867 unsigned int t = SH_type_l2_shadow;
1869 #ifdef CONFIG_COMPAT
1870 /* Tag compat L2 containing hypervisor (m2p) mappings */
1871 if ( IS_COMPAT(v->domain) &&
1872 guest_l4_table_offset(gw->va) == 0 &&
1873 guest_l3_table_offset(gw->va) == 3 )
1874 t = SH_type_l2h_shadow;
1875 #endif
1876 /* No l2 shadow installed: find and install it. */
1877 *sl2mfn = get_shadow_status(v, gw->l2mfn, t);
1878 if ( !mfn_valid(*sl2mfn) )
1880 /* No l2 shadow of this page exists at all: make one. */
1881 *sl2mfn = sh_make_shadow(v, gw->l2mfn, t);
1883 /* Install the new sl2 table in the sl3e */
1884 l3e_propagate_from_guest(v, gw->l3e, gw->l3mfn,
1885 *sl2mfn, &new_sl3e, ft);
1886 r = shadow_set_l3e(v, sl3e, new_sl3e, sl3mfn);
1887 ASSERT((r & SHADOW_SET_FLUSH) == 0);
1888 if ( r & SHADOW_SET_ERROR )
1889 return NULL;
1891 /* Now follow it down a level. Guaranteed to succeed. */
1892 return sh_linear_l2_table(v) + shadow_l2_linear_offset(gw->va);
1893 #elif GUEST_PAGING_LEVELS == 3 /* PAE... */
1894 /* We never demand-shadow PAE l3es: they are only created in
1895 * sh_update_cr3(). Check if the relevant sl3e is present. */
1896 shadow_l3e_t *sl3e = ((shadow_l3e_t *)&v->arch.paging.shadow.l3table)
1897 + shadow_l3_linear_offset(gw->va);
1898 if ( !(shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT) )
1899 return NULL;
1900 *sl2mfn = shadow_l3e_get_mfn(*sl3e);
1901 ASSERT(mfn_valid(*sl2mfn));
1902 return sh_linear_l2_table(v) + shadow_l2_linear_offset(gw->va);
1903 #else /* 32bit... */
1904 /* There is always a shadow of the top level table. Get it. */
1905 *sl2mfn = pagetable_get_mfn(v->arch.shadow_table[0]);
1906 /* This next line is important: the guest l2 has a 16k
1907 * shadow, we need to return the right mfn of the four. This
1908 * call will set it for us as a side-effect. */
1909 (void) shadow_l2_index(sl2mfn, guest_index(gw->l2e));
1910 /* Reading the top level table is always valid. */
1911 return sh_linear_l2_table(v) + shadow_l2_linear_offset(gw->va);
1912 #endif
1916 static shadow_l1e_t * shadow_get_and_create_l1e(struct vcpu *v,
1917 walk_t *gw,
1918 mfn_t *sl1mfn,
1919 fetch_type_t ft)
1921 mfn_t sl2mfn;
1922 shadow_l2e_t *sl2e;
1924 /* Get the l2e */
1925 sl2e = shadow_get_and_create_l2e(v, gw, &sl2mfn, ft);
1926 if ( sl2e == NULL ) return NULL;
1927 /* Install the sl1 in the l2e if it wasn't there or if we need to
1928 * re-do it to fix a PSE dirty bit. */
1929 if ( shadow_l2e_get_flags(*sl2e) & _PAGE_PRESENT
1930 && likely(ft != ft_demand_write
1931 || (guest_l2e_get_flags(*gw->l2e) & _PAGE_DIRTY)
1932 || !(guest_l2e_get_flags(*gw->l2e) & _PAGE_PSE)) )
1934 *sl1mfn = shadow_l2e_get_mfn(*sl2e);
1935 ASSERT(mfn_valid(*sl1mfn));
1937 else
1939 shadow_l2e_t new_sl2e;
1940 int r, flags = guest_l2e_get_flags(*gw->l2e);
1941 /* No l1 shadow installed: find and install it. */
1942 if ( !(flags & _PAGE_PRESENT) )
1943 return NULL; /* No guest page. */
1944 if ( guest_supports_superpages(v) && (flags & _PAGE_PSE) )
1946 /* Splintering a superpage */
1947 gfn_t l2gfn = guest_l2e_get_gfn(*gw->l2e);
1948 *sl1mfn = get_fl1_shadow_status(v, l2gfn);
1949 if ( !mfn_valid(*sl1mfn) )
1951 /* No fl1 shadow of this superpage exists at all: make one. */
1952 *sl1mfn = make_fl1_shadow(v, l2gfn);
1955 else
1957 /* Shadowing an actual guest l1 table */
1958 if ( !mfn_valid(gw->l1mfn) ) return NULL; /* No guest page. */
1959 *sl1mfn = get_shadow_status(v, gw->l1mfn, SH_type_l1_shadow);
1960 if ( !mfn_valid(*sl1mfn) )
1962 /* No l1 shadow of this page exists at all: make one. */
1963 *sl1mfn = sh_make_shadow(v, gw->l1mfn, SH_type_l1_shadow);
1966 /* Install the new sl1 table in the sl2e */
1967 l2e_propagate_from_guest(v, gw->l2e, gw->l2mfn,
1968 *sl1mfn, &new_sl2e, ft);
1969 r = shadow_set_l2e(v, sl2e, new_sl2e, sl2mfn);
1970 ASSERT((r & SHADOW_SET_FLUSH) == 0);
1971 if ( r & SHADOW_SET_ERROR )
1972 return NULL;
1973 /* This next line is important: in 32-on-PAE and 32-on-64 modes,
1974 * the guest l1 table has an 8k shadow, and we need to return
1975 * the right mfn of the pair. This call will set it for us as a
1976 * side-effect. (In all other cases, it's a no-op and will be
1977 * compiled out.) */
1978 (void) shadow_l1_index(sl1mfn, guest_l1_table_offset(gw->va));
1980 /* Now follow it down a level. Guaranteed to succeed. */
1981 return sh_linear_l1_table(v) + shadow_l1_linear_offset(gw->va);
1986 /**************************************************************************/
1987 /* Destructors for shadow tables:
1988 * Unregister the shadow, decrement refcounts of any entries present in it,
1989 * and release the memory.
1991 * N.B. These destructors do not clear the contents of the shadows.
1992 * This allows us to delay TLB shootdowns until the page is being reused.
1993 * See shadow_alloc() and shadow_free() for how this is handled.
1994 */
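/* The destructors below follow broadly the same pattern: delete the
 * (gmfn -> smfn) entry from the shadow status table, demote the guest
 * page, drop the reference held on each _PAGE_PRESENT entry in the
 * shadow, and finally hand the page back to the shadow pool.  (The fl1
 * case skips the demote, and the l1 case only drops entry references
 * when shadow_mode_refcounts() is in use.) */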
1996 #if GUEST_PAGING_LEVELS >= 4
1997 void sh_destroy_l4_shadow(struct vcpu *v, mfn_t smfn)
1999 shadow_l4e_t *sl4e;
2000 u32 t = mfn_to_shadow_page(smfn)->type;
2001 mfn_t gmfn, sl4mfn;
2003 SHADOW_DEBUG(DESTROY_SHADOW,
2004 "%s(%05lx)\n", __func__, mfn_x(smfn));
2005 ASSERT(t == SH_type_l4_shadow);
2007 /* Record that the guest page isn't shadowed any more (in this type) */
2008 gmfn = _mfn(mfn_to_shadow_page(smfn)->backpointer);
2009 delete_shadow_status(v, gmfn, t, smfn);
2010 shadow_demote(v, gmfn, t);
2011 /* Decrement refcounts of all the old entries */
2012 sl4mfn = smfn;
2013 SHADOW_FOREACH_L4E(sl4mfn, sl4e, 0, 0, v->domain, {
2014 if ( shadow_l4e_get_flags(*sl4e) & _PAGE_PRESENT )
2016 sh_put_ref(v, shadow_l4e_get_mfn(*sl4e),
2017 (((paddr_t)mfn_x(sl4mfn)) << PAGE_SHIFT)
2018 | ((unsigned long)sl4e & ~PAGE_MASK));
2020 });
2022 /* Put the memory back in the pool */
2023 shadow_free(v->domain, smfn);
2026 void sh_destroy_l3_shadow(struct vcpu *v, mfn_t smfn)
2028 shadow_l3e_t *sl3e;
2029 u32 t = mfn_to_shadow_page(smfn)->type;
2030 mfn_t gmfn, sl3mfn;
2032 SHADOW_DEBUG(DESTROY_SHADOW,
2033 "%s(%05lx)\n", __func__, mfn_x(smfn));
2034 ASSERT(t == SH_type_l3_shadow);
2036 /* Record that the guest page isn't shadowed any more (in this type) */
2037 gmfn = _mfn(mfn_to_shadow_page(smfn)->backpointer);
2038 delete_shadow_status(v, gmfn, t, smfn);
2039 shadow_demote(v, gmfn, t);
2041 /* Decrement refcounts of all the old entries */
2042 sl3mfn = smfn;
2043 SHADOW_FOREACH_L3E(sl3mfn, sl3e, 0, 0, {
2044 if ( shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT )
2045 sh_put_ref(v, shadow_l3e_get_mfn(*sl3e),
2046 (((paddr_t)mfn_x(sl3mfn)) << PAGE_SHIFT)
2047 | ((unsigned long)sl3e & ~PAGE_MASK));
2048 });
2050 /* Put the memory back in the pool */
2051 shadow_free(v->domain, smfn);
2053 #endif /* GUEST_PAGING_LEVELS >= 4 */
2056 void sh_destroy_l2_shadow(struct vcpu *v, mfn_t smfn)
2058 shadow_l2e_t *sl2e;
2059 u32 t = mfn_to_shadow_page(smfn)->type;
2060 mfn_t gmfn, sl2mfn;
2062 SHADOW_DEBUG(DESTROY_SHADOW,
2063 "%s(%05lx)\n", __func__, mfn_x(smfn));
2065 #if GUEST_PAGING_LEVELS >= 3
2066 ASSERT(t == SH_type_l2_shadow || t == SH_type_l2h_shadow);
2067 #else
2068 ASSERT(t == SH_type_l2_shadow);
2069 #endif
2071 /* Record that the guest page isn't shadowed any more (in this type) */
2072 gmfn = _mfn(mfn_to_shadow_page(smfn)->backpointer);
2073 delete_shadow_status(v, gmfn, t, smfn);
2074 shadow_demote(v, gmfn, t);
2076 /* Decrement refcounts of all the old entries */
2077 sl2mfn = smfn;
2078 SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, 0, v->domain, {
2079 if ( shadow_l2e_get_flags(*sl2e) & _PAGE_PRESENT )
2080 sh_put_ref(v, shadow_l2e_get_mfn(*sl2e),
2081 (((paddr_t)mfn_x(sl2mfn)) << PAGE_SHIFT)
2082 | ((unsigned long)sl2e & ~PAGE_MASK));
2083 });
2085 /* Put the memory back in the pool */
2086 shadow_free(v->domain, smfn);
2089 void sh_destroy_l1_shadow(struct vcpu *v, mfn_t smfn)
2091 struct domain *d = v->domain;
2092 shadow_l1e_t *sl1e;
2093 u32 t = mfn_to_shadow_page(smfn)->type;
2095 SHADOW_DEBUG(DESTROY_SHADOW,
2096 "%s(%05lx)\n", __func__, mfn_x(smfn));
2097 ASSERT(t == SH_type_l1_shadow || t == SH_type_fl1_shadow);
2099 /* Record that the guest page isn't shadowed any more (in this type) */
2100 if ( t == SH_type_fl1_shadow )
2102 gfn_t gfn = _gfn(mfn_to_shadow_page(smfn)->backpointer);
2103 delete_fl1_shadow_status(v, gfn, smfn);
2105 else
2107 mfn_t gmfn = _mfn(mfn_to_shadow_page(smfn)->backpointer);
2108 delete_shadow_status(v, gmfn, t, smfn);
2109 shadow_demote(v, gmfn, t);
2112 if ( shadow_mode_refcounts(d) )
2114 /* Decrement refcounts of all the old entries */
2115 mfn_t sl1mfn = smfn;
2116 SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, 0, {
2117 if ( (shadow_l1e_get_flags(*sl1e) & _PAGE_PRESENT)
2118 && !sh_l1e_is_magic(*sl1e) )
2119 shadow_put_page_from_l1e(*sl1e, d);
2120 });
2123 /* Put the memory back in the pool */
2124 shadow_free(v->domain, smfn);
2127 #if SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS
2128 void sh_destroy_monitor_table(struct vcpu *v, mfn_t mmfn)
2130 struct domain *d = v->domain;
2131 ASSERT(mfn_to_shadow_page(mmfn)->type == SH_type_monitor_table);
2133 #if (CONFIG_PAGING_LEVELS == 4) && (SHADOW_PAGING_LEVELS != 4)
2134 /* Need to destroy the l3 monitor page in slot 0 too */
2136 mfn_t m3mfn;
2137 l4_pgentry_t *l4e = sh_map_domain_page(mmfn);
2138 ASSERT(l4e_get_flags(l4e[0]) & _PAGE_PRESENT);
2139 m3mfn = _mfn(l4e_get_pfn(l4e[0]));
2140 if ( pv_32bit_guest(v) )
2142 /* Need to destroy the l2 monitor page in slot 3 too */
2143 l3_pgentry_t *l3e = sh_map_domain_page(m3mfn);
2144 ASSERT(l3e_get_flags(l3e[3]) & _PAGE_PRESENT);
2145 shadow_free(d, _mfn(l3e_get_pfn(l3e[3])));
2146 sh_unmap_domain_page(l3e);
2148 shadow_free(d, m3mfn);
2149 sh_unmap_domain_page(l4e);
2151 #elif CONFIG_PAGING_LEVELS == 3
2152 /* Need to destroy the l2 monitor page in slot 3 too */
2154 l3_pgentry_t *l3e = sh_map_domain_page(mmfn);
2155 ASSERT(l3e_get_flags(l3e[3]) & _PAGE_PRESENT);
2156 shadow_free(d, _mfn(l3e_get_pfn(l3e[3])));
2157 sh_unmap_domain_page(l3e);
2159 #endif
2161 /* Put the memory back in the pool */
2162 shadow_free(d, mmfn);
2164 #endif
2166 /**************************************************************************/
2167 /* Functions to destroy non-Xen mappings in a pagetable hierarchy.
2168 * These are called from common code when we are running out of shadow
2169 * memory, and unpinning all the top-level shadows hasn't worked.
2171 * This implementation is pretty crude and slow, but we hope that it won't
2172 * be called very often. */
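/* Each unhook function simply writes an empty entry over every slot of
 * the given top-level shadow via shadow_set_lNe(), which takes care of
 * dropping the references held on the lower-level shadows hanging off
 * that table. */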
2174 #if GUEST_PAGING_LEVELS == 2
2176 void sh_unhook_32b_mappings(struct vcpu *v, mfn_t sl2mfn)
2178 shadow_l2e_t *sl2e;
2179 SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, 0, v->domain, {
2180 (void) shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn);
2181 });
2184 #elif GUEST_PAGING_LEVELS == 3
2186 void sh_unhook_pae_mappings(struct vcpu *v, mfn_t sl2mfn)
2187 /* Walk a PAE l2 shadow, unhooking entries from all the subshadows */
2189 shadow_l2e_t *sl2e;
2190 SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, 0, v->domain, {
2191 (void) shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn);
2192 });
2195 #elif GUEST_PAGING_LEVELS == 4
2197 void sh_unhook_64b_mappings(struct vcpu *v, mfn_t sl4mfn)
2199 shadow_l4e_t *sl4e;
2200 SHADOW_FOREACH_L4E(sl4mfn, sl4e, 0, 0, v->domain, {
2201 (void) shadow_set_l4e(v, sl4e, shadow_l4e_empty(), sl4mfn);
2202 });
2205 #endif
2207 /**************************************************************************/
2208 /* Internal translation functions.
2209 * These functions require a pointer to the shadow entry that will be updated.
2210 */
2212 /* These functions take a new guest entry, translate it to shadow and write
2213 * the shadow entry.
2215 * They return the same bitmaps as the shadow_set_lXe() functions.
2216 */
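/* The validate_glNe() functions below share a common shape: look up (but
 * never create) the shadow of the lower-level table that the new guest
 * entry points at, propagate the guest entry into a shadow entry, and
 * write it back with shadow_set_lNe().  In PV guests, writes that land
 * in Xen-reserved l4 or l2 slots are detected and not propagated. */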
2218 #if GUEST_PAGING_LEVELS >= 4
2219 static int validate_gl4e(struct vcpu *v, void *new_ge, mfn_t sl4mfn, void *se)
2221 shadow_l4e_t new_sl4e;
2222 guest_l4e_t *new_gl4e = new_ge;
2223 shadow_l4e_t *sl4p = se;
2224 mfn_t sl3mfn = _mfn(INVALID_MFN);
2225 int result = 0;
2227 perfc_incrc(shadow_validate_gl4e_calls);
2229 if ( guest_l4e_get_flags(*new_gl4e) & _PAGE_PRESENT )
2231 gfn_t gl3gfn = guest_l4e_get_gfn(*new_gl4e);
2232 mfn_t gl3mfn = vcpu_gfn_to_mfn(v, gl3gfn);
2233 if ( mfn_valid(gl3mfn) )
2234 sl3mfn = get_shadow_status(v, gl3mfn, SH_type_l3_shadow);
2235 else
2236 result |= SHADOW_SET_ERROR;
2238 l4e_propagate_from_guest(v, new_gl4e, _mfn(INVALID_MFN),
2239 sl3mfn, &new_sl4e, ft_prefetch);
2241 // check for updates to xen reserved slots
2242 if ( !shadow_mode_external(v->domain) )
2244 int shadow_index = (((unsigned long)sl4p & ~PAGE_MASK) /
2245 sizeof(shadow_l4e_t));
2246 int reserved_xen_slot = !is_guest_l4_slot(v->domain, shadow_index);
2248 if ( unlikely(reserved_xen_slot) )
2250 // attempt by the guest to write to a xen reserved slot
2251 //
2252 SHADOW_PRINTK("%s out-of-range update "
2253 "sl4mfn=%05lx index=0x%x val=%" SH_PRI_pte "\n",
2254 __func__, mfn_x(sl4mfn), shadow_index, new_sl4e.l4);
2255 if ( shadow_l4e_get_flags(new_sl4e) & _PAGE_PRESENT )
2257 SHADOW_ERROR("out-of-range l4e update\n");
2258 result |= SHADOW_SET_ERROR;
2261 // do not call shadow_set_l4e...
2262 return result;
2266 result |= shadow_set_l4e(v, sl4p, new_sl4e, sl4mfn);
2267 return result;
2271 static int validate_gl3e(struct vcpu *v, void *new_ge, mfn_t sl3mfn, void *se)
2273 shadow_l3e_t new_sl3e;
2274 guest_l3e_t *new_gl3e = new_ge;
2275 shadow_l3e_t *sl3p = se;
2276 mfn_t sl2mfn = _mfn(INVALID_MFN);
2277 int result = 0;
2279 perfc_incrc(shadow_validate_gl3e_calls);
2281 if ( guest_l3e_get_flags(*new_gl3e) & _PAGE_PRESENT )
2283 gfn_t gl2gfn = guest_l3e_get_gfn(*new_gl3e);
2284 mfn_t gl2mfn = vcpu_gfn_to_mfn(v, gl2gfn);
2285 if ( mfn_valid(gl2mfn) )
2286 sl2mfn = get_shadow_status(v, gl2mfn, SH_type_l2_shadow);
2287 else
2288 result |= SHADOW_SET_ERROR;
2290 l3e_propagate_from_guest(v, new_gl3e, _mfn(INVALID_MFN),
2291 sl2mfn, &new_sl3e, ft_prefetch);
2292 result |= shadow_set_l3e(v, sl3p, new_sl3e, sl3mfn);
2294 return result;
2296 #endif // GUEST_PAGING_LEVELS >= 4
2298 static int validate_gl2e(struct vcpu *v, void *new_ge, mfn_t sl2mfn, void *se)
2300 shadow_l2e_t new_sl2e;
2301 guest_l2e_t *new_gl2e = new_ge;
2302 shadow_l2e_t *sl2p = se;
2303 mfn_t sl1mfn = _mfn(INVALID_MFN);
2304 int result = 0;
2306 perfc_incrc(shadow_validate_gl2e_calls);
2308 if ( guest_l2e_get_flags(*new_gl2e) & _PAGE_PRESENT )
2310 gfn_t gl1gfn = guest_l2e_get_gfn(*new_gl2e);
2311 if ( guest_supports_superpages(v) &&
2312 (guest_l2e_get_flags(*new_gl2e) & _PAGE_PSE) )
2314 // superpage -- need to look up the shadow L1 which holds the
2315 // splinters...
2316 sl1mfn = get_fl1_shadow_status(v, gl1gfn);
2317 #if 0
2318 // XXX - it's possible that we want to do some kind of prefetch
2319 // for superpage fl1's here, but this is *not* on the demand path,
2320 // so we'll hold off trying that for now...
2321 //
2322 if ( !mfn_valid(sl1mfn) )
2323 sl1mfn = make_fl1_shadow(v, gl1gfn);
2324 #endif
2326 else
2328 mfn_t gl1mfn = vcpu_gfn_to_mfn(v, gl1gfn);
2329 if ( mfn_valid(gl1mfn) )
2330 sl1mfn = get_shadow_status(v, gl1mfn, SH_type_l1_shadow);
2331 else
2332 result |= SHADOW_SET_ERROR;
2335 l2e_propagate_from_guest(v, new_gl2e, _mfn(INVALID_MFN),
2336 sl1mfn, &new_sl2e, ft_prefetch);
2338 // check for updates to xen reserved slots in PV guests...
2339 // XXX -- need to revisit this for PV 3-on-4 guests.
2340 //
2341 #if SHADOW_PAGING_LEVELS < 4
2342 #if CONFIG_PAGING_LEVELS == SHADOW_PAGING_LEVELS
2343 if ( !shadow_mode_external(v->domain) )
2345 int shadow_index = (((unsigned long)sl2p & ~PAGE_MASK) /
2346 sizeof(shadow_l2e_t));
2347 int reserved_xen_slot;
2349 #if SHADOW_PAGING_LEVELS == 3
2350 reserved_xen_slot =
2351 ((mfn_to_shadow_page(sl2mfn)->type == SH_type_l2h_pae_shadow) &&
2352 (shadow_index
2353 >= (L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1))));
2354 #else /* SHADOW_PAGING_LEVELS == 2 */
2355 reserved_xen_slot = (shadow_index >= L2_PAGETABLE_FIRST_XEN_SLOT);
2356 #endif
2358 if ( unlikely(reserved_xen_slot) )
2360 // attempt by the guest to write to a xen reserved slot
2361 //
2362 SHADOW_PRINTK("%s out-of-range update "
2363 "sl2mfn=%05lx index=0x%x val=%" SH_PRI_pte "\n",
2364 __func__, mfn_x(sl2mfn), shadow_index, new_sl2e.l2);
2365 if ( shadow_l2e_get_flags(new_sl2e) & _PAGE_PRESENT )
2367 SHADOW_ERROR("out-of-range l2e update\n");
2368 result |= SHADOW_SET_ERROR;
2371 // do not call shadow_set_l2e...
2372 return result;
2375 #endif /* CONFIG_PAGING_LEVELS == SHADOW_PAGING_LEVELS */
2376 #endif /* SHADOW_PAGING_LEVELS < 4 */
2378 result |= shadow_set_l2e(v, sl2p, new_sl2e, sl2mfn);
2380 return result;
2383 static int validate_gl1e(struct vcpu *v, void *new_ge, mfn_t sl1mfn, void *se)
2385 shadow_l1e_t new_sl1e;
2386 guest_l1e_t *new_gl1e = new_ge;
2387 shadow_l1e_t *sl1p = se;
2388 gfn_t gfn;
2389 mfn_t gmfn;
2390 int result = 0, mmio;
2392 perfc_incrc(shadow_validate_gl1e_calls);
2394 gfn = guest_l1e_get_gfn(*new_gl1e);
2395 gmfn = vcpu_gfn_to_mfn(v, gfn);
2397 mmio = (is_hvm_vcpu(v) && paging_vcpu_mode_translate(v) && !mfn_valid(gmfn));
2398 l1e_propagate_from_guest(v, new_gl1e, _mfn(INVALID_MFN), gmfn, &new_sl1e,
2399 ft_prefetch, mmio);
2401 result |= shadow_set_l1e(v, sl1p, new_sl1e, sl1mfn);
2402 return result;
2406 /**************************************************************************/
2407 /* Functions which translate and install the shadows of arbitrary guest
2408 * entries that we have just seen the guest write. */
2411 static inline int
2412 sh_map_and_validate(struct vcpu *v, mfn_t gmfn,
2413 void *new_gp, u32 size, u32 sh_type,
2414 u32 (*shadow_index)(mfn_t *smfn, u32 idx),
2415 int (*validate_ge)(struct vcpu *v, void *ge,
2416 mfn_t smfn, void *se))
2417 /* Generic function for mapping and validating. */
2419 mfn_t smfn, smfn2, map_mfn;
2420 shadow_l1e_t *sl1p;
2421 u32 shadow_idx, guest_idx;
2422 int result = 0;
2424 /* Align address and size to guest entry boundaries */
2425 size += (unsigned long)new_gp & (sizeof (guest_l1e_t) - 1);
2426 new_gp = (void *)((unsigned long)new_gp & ~(sizeof (guest_l1e_t) - 1));
2427 size = (size + sizeof (guest_l1e_t) - 1) & ~(sizeof (guest_l1e_t) - 1);
2428 ASSERT(size + (((unsigned long)new_gp) & ~PAGE_MASK) <= PAGE_SIZE);
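/* Some shadows span more than one page (the 16k l2 and 8k l1 shadows
 * used for 32-bit guests on wider shadow levels), so shadow_index() may
 * hand back a different map_mfn part-way through the loop below; when
 * that happens we simply remap. */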
2430 /* Map the shadow page */
2431 smfn = get_shadow_status(v, gmfn, sh_type);
2432 ASSERT(mfn_valid(smfn)); /* Otherwise we would not have been called */
2433 guest_idx = guest_index(new_gp);
2434 map_mfn = smfn;
2435 shadow_idx = shadow_index(&map_mfn, guest_idx);
2436 sl1p = map_shadow_page(map_mfn);
2438 /* Validate one entry at a time */
2439 while ( size )
2441 smfn2 = smfn;
2442 guest_idx = guest_index(new_gp);
2443 shadow_idx = shadow_index(&smfn2, guest_idx);
2444 if ( mfn_x(smfn2) != mfn_x(map_mfn) )
2446 /* We have moved to another page of the shadow */
2447 map_mfn = smfn2;
2448 unmap_shadow_page(sl1p);
2449 sl1p = map_shadow_page(map_mfn);
2451 result |= validate_ge(v,
2452 new_gp,
2453 map_mfn,
2454 &sl1p[shadow_idx]);
2455 size -= sizeof(guest_l1e_t);
2456 new_gp += sizeof(guest_l1e_t);
2458 unmap_shadow_page(sl1p);
2459 return result;
2463 int
2464 sh_map_and_validate_gl4e(struct vcpu *v, mfn_t gl4mfn,
2465 void *new_gl4p, u32 size)
2467 #if GUEST_PAGING_LEVELS >= 4
2468 return sh_map_and_validate(v, gl4mfn, new_gl4p, size,
2469 SH_type_l4_shadow,
2470 shadow_l4_index,
2471 validate_gl4e);
2472 #else // ! GUEST_PAGING_LEVELS >= 4
2473 SHADOW_PRINTK("called in wrong paging mode!\n");
2474 BUG();
2475 return 0;
2476 #endif
2479 int
2480 sh_map_and_validate_gl3e(struct vcpu *v, mfn_t gl3mfn,
2481 void *new_gl3p, u32 size)
2483 #if GUEST_PAGING_LEVELS >= 4
2484 return sh_map_and_validate(v, gl3mfn, new_gl3p, size,
2485 SH_type_l3_shadow,
2486 shadow_l3_index,
2487 validate_gl3e);
2488 #else // ! GUEST_PAGING_LEVELS >= 4
2489 SHADOW_PRINTK("called in wrong paging mode!\n");
2490 BUG();
2491 return 0;
2492 #endif
2495 int
2496 sh_map_and_validate_gl2e(struct vcpu *v, mfn_t gl2mfn,
2497 void *new_gl2p, u32 size)
2499 return sh_map_and_validate(v, gl2mfn, new_gl2p, size,
2500 SH_type_l2_shadow,
2501 shadow_l2_index,
2502 validate_gl2e);
2505 int
2506 sh_map_and_validate_gl2he(struct vcpu *v, mfn_t gl2mfn,
2507 void *new_gl2p, u32 size)
2509 #if GUEST_PAGING_LEVELS >= 3
2510 return sh_map_and_validate(v, gl2mfn, new_gl2p, size,
2511 SH_type_l2h_shadow,
2512 shadow_l2_index,
2513 validate_gl2e);
2514 #else /* Non-PAE guests don't have different kinds of l2 table */
2515 SHADOW_PRINTK("called in wrong paging mode!\n");
2516 BUG();
2517 return 0;
2518 #endif
2521 int
2522 sh_map_and_validate_gl1e(struct vcpu *v, mfn_t gl1mfn,
2523 void *new_gl1p, u32 size)
2525 return sh_map_and_validate(v, gl1mfn, new_gl1p, size,
2526 SH_type_l1_shadow,
2527 shadow_l1_index,
2528 validate_gl1e);
2532 /**************************************************************************/
2533 /* Optimization: If we see two emulated writes of zeros to the same
2534 * page-table without another kind of page fault in between, we guess
2535 * that this is a batch of changes (for process destruction) and
2536 * unshadow the page so we don't take a pagefault on every entry. This
2537 * should also make finding writeable mappings of pagetables much
2538 * easier. */
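/* The state for this heuristic is a single per-vcpu field:
 * last_emulated_mfn remembers the target of the previous emulated write,
 * and reset_early_unshadow() clears it whenever a genuine page fault is
 * seen, so only back-to-back emulated writes to the same page-table mfn
 * trigger the unshadow. */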
2540 /* Look to see if this is the second emulated write in a row to this
2541 * page, and unshadow/unhook if it is */
2542 static inline void check_for_early_unshadow(struct vcpu *v, mfn_t gmfn)
2544 #if SHADOW_OPTIMIZATIONS & SHOPT_EARLY_UNSHADOW
2545 if ( v->arch.paging.shadow.last_emulated_mfn == mfn_x(gmfn) &&
2546 sh_mfn_is_a_page_table(gmfn) )
2548 u32 flags = mfn_to_page(gmfn)->shadow_flags;
2549 if ( !(flags & (SHF_L2_32|SHF_L2_PAE|SHF_L2H_PAE|SHF_L4_64)) )
2551 perfc_incrc(shadow_early_unshadow);
2552 sh_remove_shadows(v, gmfn, 1, 0 /* Fast, can fail to unshadow */ );
2555 v->arch.paging.shadow.last_emulated_mfn = mfn_x(gmfn);
2556 #endif
2559 /* Stop counting towards early unshadows, as we've seen a real page fault */
2560 static inline void reset_early_unshadow(struct vcpu *v)
2562 #if SHADOW_OPTIMIZATIONS & SHOPT_EARLY_UNSHADOW
2563 v->arch.paging.shadow.last_emulated_mfn = INVALID_MFN;
2564 #endif
2569 /**************************************************************************/
2570 /* Optimization: Prefetch multiple L1 entries. This is called after we have
2571 * demand-faulted a shadow l1e in the fault handler, to see if it's
2572 * worth fetching some more.
2573 */
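/* Prefetching starts at the slot after the one just faulted on and stops
 * early at: the end of the shadow l1 page, the PREFETCH_DISTANCE cap, a
 * shadow slot that is already filled, or a guest entry that would need
 * another fault anyway to set its accessed/dirty bits. */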
2575 #if SHADOW_OPTIMIZATIONS & SHOPT_PREFETCH
2577 /* XXX magic number */
2578 #define PREFETCH_DISTANCE 32
2580 static void sh_prefetch(struct vcpu *v, walk_t *gw,
2581 shadow_l1e_t *ptr_sl1e, mfn_t sl1mfn)
2583 int i, dist, mmio;
2584 gfn_t gfn;
2585 mfn_t gmfn;
2586 guest_l1e_t gl1e;
2587 shadow_l1e_t sl1e;
2588 u32 gflags;
2590 /* Prefetch no further than the end of the _shadow_ l1 MFN */
2591 dist = (PAGE_SIZE - ((unsigned long)ptr_sl1e & ~PAGE_MASK)) / sizeof sl1e;
2592 /* And no more than a maximum fetches-per-fault */
2593 if ( dist > PREFETCH_DISTANCE )
2594 dist = PREFETCH_DISTANCE;
2596 for ( i = 1; i < dist ; i++ )
2598 /* No point in prefetching if there's already a shadow */
2599 if ( ptr_sl1e[i].l1 != 0 )
2600 break;
2602 if ( gw->l1e )
2604 /* Normal guest page; grab the next guest entry */
2605 gl1e = gw->l1e[i];
2606 /* Not worth continuing if we hit an entry that will need another
2607 * fault for A/D-bit propagation anyway */
2608 gflags = guest_l1e_get_flags(gl1e);
2609 if ( (gflags & _PAGE_PRESENT)
2610 && (!(gflags & _PAGE_ACCESSED)
2611 || ((gflags & _PAGE_RW) && !(gflags & _PAGE_DIRTY))) )
2612 break;
2614 else
2616 /* Fragmented superpage, unless we've been called wrongly */
2617 ASSERT(guest_l2e_get_flags(*gw->l2e) & _PAGE_PSE);
2618 /* Increment the l1e's GFN by the right number of guest pages */
2619 gl1e = guest_l1e_from_gfn(
2620 _gfn(gfn_x(guest_l1e_get_gfn(gw->eff_l1e)) + i),
2621 guest_l1e_get_flags(gw->eff_l1e));
2624 /* Look at the gfn that the l1e is pointing at */
2625 gfn = guest_l1e_get_gfn(gl1e);
2626 gmfn = vcpu_gfn_to_mfn(v, gfn);
2627 mmio = ( is_hvm_vcpu(v)
2628 && paging_vcpu_mode_translate(v)
2629 && mmio_space(gfn_to_paddr(gfn)) );
2631 /* Propagate the entry. Safe to use a pointer to our local
2632 * gl1e, since this is not a demand-fetch so there will be no
2633 * write-back to the guest. */
2634 l1e_propagate_from_guest(v, &gl1e, _mfn(INVALID_MFN),
2635 gmfn, &sl1e, ft_prefetch, mmio);
2636 (void) shadow_set_l1e(v, ptr_sl1e + i, sl1e, sl1mfn);
2640 #endif /* SHADOW_OPTIMIZATIONS & SHOPT_PREFETCH */
2643 /**************************************************************************/
2644 /* Entry points into the shadow code */
2646 /* Called from pagefault handler in Xen, and from the HVM trap handlers
2647 * for pagefaults. Returns 1 if this fault was an artefact of the
2648 * shadow code (and the guest should retry) or 0 if it is not (and the
2649 * fault should be handled elsewhere or passed to the guest). */
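/* Rough flow of sh_page_fault(): a lock-free fast path for faults caused
 * by the "magic" not-present/MMIO shadow l1es; then, under the shadow
 * lock, walk the guest tables, filter out faults the guest must handle
 * itself (not-present, user/supervisor, read-only, NX), install the
 * shadow l1e, and finally branch to the emulate / mmio / fixed exits. */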
2651 static int sh_page_fault(struct vcpu *v,
2652 unsigned long va,
2653 struct cpu_user_regs *regs)
2655 struct domain *d = v->domain;
2656 walk_t gw;
2657 u32 accumulated_gflags;
2658 gfn_t gfn;
2659 mfn_t gmfn, sl1mfn=_mfn(0);
2660 shadow_l1e_t sl1e, *ptr_sl1e;
2661 paddr_t gpa;
2662 struct sh_emulate_ctxt emul_ctxt;
2663 struct x86_emulate_ops *emul_ops;
2664 int r, mmio;
2665 fetch_type_t ft = 0;
2667 SHADOW_PRINTK("d:v=%u:%u va=%#lx err=%u\n",
2668 v->domain->domain_id, v->vcpu_id, va, regs->error_code);
2670 perfc_incrc(shadow_fault);
2671 //
2672 // XXX: Need to think about eventually mapping superpages directly in the
2673 // shadow (when possible), as opposed to splintering them into a
2674 // bunch of 4K maps.
2675 //
2677 #if (SHADOW_OPTIMIZATIONS & SHOPT_FAST_FAULT_PATH) && SHADOW_PAGING_LEVELS > 2
2678 if ( (regs->error_code & PFEC_reserved_bit) )
2680 /* The only reasons for reserved bits to be set in shadow entries
2681 * are the two "magic" shadow_l1e entries. */
2682 if ( likely((__copy_from_user(&sl1e,
2683 (sh_linear_l1_table(v)
2684 + shadow_l1_linear_offset(va)),
2685 sizeof(sl1e)) == 0)
2686 && sh_l1e_is_magic(sl1e)) )
2688 if ( sh_l1e_is_gnp(sl1e) )
2690 if ( likely(!is_hvm_domain(d) ||
2691 paging_vcpu_mode_translate(v)) )
2693 /* Not-present in a guest PT: pass to the guest as
2694 * a not-present fault (by flipping two bits). */
2695 ASSERT(regs->error_code & PFEC_page_present);
2696 regs->error_code ^= (PFEC_reserved_bit|PFEC_page_present);
2697 perfc_incrc(shadow_fault_fast_gnp);
2698 SHADOW_PRINTK("fast path not-present\n");
2699 return 0;
2701 else
2703 /* Not-present in the P2M: MMIO */
2704 gpa = va;
2707 else
2709 /* Magic MMIO marker: extract gfn for MMIO address */
2710 ASSERT(sh_l1e_is_mmio(sl1e));
2711 gpa = (((paddr_t)(gfn_x(sh_l1e_mmio_get_gfn(sl1e))))
2712 << PAGE_SHIFT)
2713 | (va & ~PAGE_MASK);
2715 perfc_incrc(shadow_fault_fast_mmio);
2716 SHADOW_PRINTK("fast path mmio %#"PRIpaddr"\n", gpa);
2717 reset_early_unshadow(v);
2718 handle_mmio(gpa);
2719 return EXCRET_fault_fixed;
2721 else
2723 /* This should be exceptionally rare: another vcpu has fixed
2724 * the tables between the fault and our reading the l1e.
2725 * Retry and let the hardware give us the right fault next time. */
2726 perfc_incrc(shadow_fault_fast_fail);
2727 SHADOW_PRINTK("fast path false alarm!\n");
2728 return EXCRET_fault_fixed;
2731 #endif /* SHOPT_FAST_FAULT_PATH */
2733 /* Detect if this page fault happened while we were already in Xen
2734 * doing a shadow operation. If that happens, the only thing we can
2735 * do is let Xen's normal fault handlers try to fix it. In any case,
2736 * a diagnostic trace of the fault will be more useful than
2737 * a BUG() when we try to take the lock again. */
2738 if ( unlikely(shadow_locked_by_me(d)) )
2740 SHADOW_ERROR("Recursive shadow fault: lock was taken by %s\n",
2741 d->arch.paging.shadow.locker_function);
2742 return 0;
2745 shadow_lock(d);
2747 shadow_audit_tables(v);
2749 if ( guest_walk_tables(v, va, &gw, 1) != 0 )
2751 SHADOW_PRINTK("malformed guest pagetable!");
2752 print_gw(&gw);
2755 sh_audit_gw(v, &gw);
2757 // We do not look at the gw->l1e, as that will not exist for superpages.
2758 // Instead, we use the gw->eff_l1e...
2759 //
2760 // We need not check all the levels of the guest page table entries for
2761 // present vs not-present, as the eff_l1e will always be not present if
2762 // one of the higher level entries is not present.
2763 //
2764 if ( unlikely(!(guest_l1e_get_flags(gw.eff_l1e) & _PAGE_PRESENT)) )
2766 if ( is_hvm_domain(d) && !paging_vcpu_mode_translate(v) )
2768 /* Not present in p2m map, means this is mmio */
2769 gpa = va;
2770 goto mmio;
2773 perfc_incrc(shadow_fault_bail_not_present);
2774 goto not_a_shadow_fault;
2777 // All levels of the guest page table are now known to be present.
2778 accumulated_gflags = accumulate_guest_flags(v, &gw);
2780 // Check for attempts to access supervisor-only pages from user mode,
2781 // i.e. ring 3. Such errors are not caused or dealt with by the shadow
2782 // code.
2783 //
2784 if ( (regs->error_code & PFEC_user_mode) &&
2785 !(accumulated_gflags & _PAGE_USER) )
2787 /* illegal user-mode access to supervisor-only page */
2788 perfc_incrc(shadow_fault_bail_user_supervisor);
2789 goto not_a_shadow_fault;
2792 // Was it a write fault?
2793 ft = ((regs->error_code & PFEC_write_access)
2794 ? ft_demand_write : ft_demand_read);
2795 if ( ft == ft_demand_write )
2797 if ( unlikely(!(accumulated_gflags & _PAGE_RW)) )
2799 perfc_incrc(shadow_fault_bail_ro_mapping);
2800 goto not_a_shadow_fault;
2803 else // must have been either an insn fetch or read fault
2805 // Check for NX bit violations: attempts to execute code that is
2806 // marked "do not execute". Such errors are not caused or dealt with
2807 // by the shadow code.
2808 //
2809 if ( regs->error_code & PFEC_insn_fetch )
2811 if ( accumulated_gflags & _PAGE_NX_BIT )
2813 /* NX prevented this code fetch */
2814 perfc_incrc(shadow_fault_bail_nx);
2815 goto not_a_shadow_fault;
2820 /* What mfn is the guest trying to access? */
2821 gfn = guest_l1e_get_gfn(gw.eff_l1e);
2822 gmfn = vcpu_gfn_to_mfn(v, gfn);
2823 mmio = (is_hvm_domain(d)
2824 && paging_vcpu_mode_translate(v)
2825 && mmio_space(gfn_to_paddr(gfn)));
2827 if ( !mmio && !mfn_valid(gmfn) )
2829 perfc_incrc(shadow_fault_bail_bad_gfn);
2830 SHADOW_PRINTK("BAD gfn=%"SH_PRI_gfn" gmfn=%"PRI_mfn"\n",
2831 gfn_x(gfn), mfn_x(gmfn));
2832 goto not_a_shadow_fault;
2835 /* Make sure there is enough free shadow memory to build a chain of
2836 * shadow tables: one SHADOW_MAX_ORDER chunk will always be enough
2837 * to allocate all we need. (We never allocate a top-level shadow
2838 * on this path, only a 32b l1, pae l2+1 or 64b l3+2+1) */
2839 shadow_prealloc(d, SHADOW_MAX_ORDER);
2841 /* Acquire the shadow. This must happen before we figure out the rights
2842 * for the shadow entry, since we might promote a page here. */
2843 ptr_sl1e = shadow_get_and_create_l1e(v, &gw, &sl1mfn, ft);
2844 if ( unlikely(ptr_sl1e == NULL) )
2846 /* Couldn't get the sl1e! Since we know the guest entries
2847 * are OK, this can only have been caused by a failed
2848 * shadow_set_l*e(), which will have crashed the guest.
2849 * Get out of the fault handler immediately. */
2850 ASSERT(test_bit(_DOMF_dying, &d->domain_flags));
2851 unmap_walk(v, &gw);
2852 shadow_unlock(d);
2853 return 0;
2856 /* Calculate the shadow entry and write it */
2857 l1e_propagate_from_guest(v, (gw.l1e) ? gw.l1e : &gw.eff_l1e, gw.l1mfn,
2858 gmfn, &sl1e, ft, mmio);
2859 r = shadow_set_l1e(v, ptr_sl1e, sl1e, sl1mfn);
2861 #if SHADOW_OPTIMIZATIONS & SHOPT_PREFETCH
2862 /* Prefetch some more shadow entries */
2863 sh_prefetch(v, &gw, ptr_sl1e, sl1mfn);
2864 #endif
2866 /* Need to emulate accesses to page tables */
2867 if ( sh_mfn_is_a_page_table(gmfn) )
2869 if ( ft == ft_demand_write )
2871 perfc_incrc(shadow_fault_emulate_write);
2872 goto emulate;
2874 else if ( shadow_mode_trap_reads(d) && ft == ft_demand_read )
2876 perfc_incrc(shadow_fault_emulate_read);
2877 goto emulate;
2881 if ( mmio )
2883 gpa = guest_walk_to_gpa(&gw);
2884 goto mmio;
2887 perfc_incrc(shadow_fault_fixed);
2888 d->arch.paging.shadow.fault_count++;
2889 reset_early_unshadow(v);
2891 done:
2892 sh_audit_gw(v, &gw);
2893 unmap_walk(v, &gw);
2894 SHADOW_PRINTK("fixed\n");
2895 shadow_audit_tables(v);
2896 shadow_unlock(d);
2897 return EXCRET_fault_fixed;
2899 emulate:
2900 if ( !shadow_mode_refcounts(d) || !guest_mode(regs) )
2901 goto not_a_shadow_fault;
2903 if ( is_hvm_domain(d) )
2905 /*
2906 * If we are in the middle of injecting an exception or interrupt then
2907 * we should not emulate: it is not the instruction at %eip that caused
2908 * the fault. Furthermore it is almost certainly the case that the handler
2909 * stack is currently considered to be a page table, so we should
2910 * unshadow the faulting page before exiting.
2911 */
2912 if ( unlikely(hvm_injection_pending(v)) )
2914 gdprintk(XENLOG_DEBUG, "write to pagetable during event "
2915 "injection: cr2=%#lx, mfn=%#lx\n",
2916 va, mfn_x(gmfn));
2917 sh_remove_shadows(v, gmfn, 0 /* thorough */, 1 /* must succeed */);
2918 goto done;
2921 hvm_store_cpu_guest_regs(v, regs, NULL);
2924 SHADOW_PRINTK("emulate: eip=%#lx esp=%#lx\n",
2925 (unsigned long)regs->eip, (unsigned long)regs->esp);
2927 /*
2928 * Check whether this looks like a stack operation. If so, unshadow the
2929 * faulting page. We can allow this to fail: if it does fail then we
2930 * carry on and emulate, otherwise we bail immediately. Failure is
2931 * tolerated because this is only a heuristic (e.g., stack segment base
2932 * address is ignored).
2933 */
2934 if ( unlikely((va & PAGE_MASK) == (regs->esp & PAGE_MASK)) )
2936 gdprintk(XENLOG_DEBUG, "guest stack is on a shadowed frame: "
2937 "%%esp=%#lx, cr2=%#lx, mfn=%#lx\n",
2938 (unsigned long)regs->esp, va, mfn_x(gmfn));
2939 sh_remove_shadows(v, gmfn, 0 /* thorough */, 0 /* can fail */);
2940 if ( !(mfn_to_page(gmfn)->count_info & PGC_page_table) )
2941 goto done;
2944 emul_ops = shadow_init_emulation(&emul_ctxt, regs);
2946 /*
2947 * We do not emulate user writes. Instead we use them as a hint that the
2948 * page is no longer a page table. This behaviour differs from native, but
2949 * it seems very unlikely that any OS grants user access to page tables.
2950 */
2951 r = X86EMUL_UNHANDLEABLE;
2952 if ( !(regs->error_code & PFEC_user_mode) )
2953 r = x86_emulate(&emul_ctxt.ctxt, emul_ops);
2955 /*
2956 * NB. We do not unshadow on X86EMUL_EXCEPTION. It's not clear that it
2957 * would be a good unshadow hint. If we *do* decide to unshadow-on-fault
2958 * then it must be 'failable': we cannot require the unshadow to succeed.
2959 */
2960 if ( r == X86EMUL_UNHANDLEABLE )
2962 SHADOW_PRINTK("emulator failure, unshadowing mfn %#lx\n",
2963 mfn_x(gmfn));
2964 perfc_incrc(shadow_fault_emulate_failed);
2965 /* If this is actually a page table, then we have a bug, and need
2966 * to support more operations in the emulator. More likely,
2967 * though, this is a hint that this page should not be shadowed. */
2968 sh_remove_shadows(v, gmfn, 0 /* thorough */, 1 /* must succeed */);
2971 /* Emulator has changed the user registers: write back */
2972 if ( is_hvm_domain(d) )
2973 hvm_load_cpu_guest_regs(v, regs);
2974 goto done;
2976 mmio:
2977 if ( !guest_mode(regs) )
2978 goto not_a_shadow_fault;
2979 perfc_incrc(shadow_fault_mmio);
2980 sh_audit_gw(v, &gw);
2981 unmap_walk(v, &gw);
2982 SHADOW_PRINTK("mmio %#"PRIpaddr"\n", gpa);
2983 shadow_audit_tables(v);
2984 reset_early_unshadow(v);
2985 shadow_unlock(d);
2986 handle_mmio(gpa);
2987 return EXCRET_fault_fixed;
2989 not_a_shadow_fault:
2990 sh_audit_gw(v, &gw);
2991 unmap_walk(v, &gw);
2992 SHADOW_PRINTK("not a shadow fault\n");
2993 shadow_audit_tables(v);
2994 reset_early_unshadow(v);
2995 shadow_unlock(d);
2996 return 0;
3000 static int
3001 sh_invlpg(struct vcpu *v, unsigned long va)
3002 /* Called when the guest requests an invlpg. Returns 1 if the invlpg
3003 * instruction should be issued on the hardware, or 0 if it's safe not
3004 * to do so. */
3006 shadow_l2e_t sl2e;
3008 perfc_incrc(shadow_invlpg);
3010 /* First check that we can safely read the shadow l2e. On SMP/PAE linux,
3011 * as many as 6% of invlpg calls can arrive before we have shadowed the
3012 * l2 at all. */
3013 #if SHADOW_PAGING_LEVELS == 4
3015 shadow_l3e_t sl3e;
3016 if ( !(shadow_l4e_get_flags(
3017 sh_linear_l4_table(v)[shadow_l4_linear_offset(va)])
3018 & _PAGE_PRESENT) )
3019 return 0;
3020 /* This must still be a copy-from-user because we don't have the
3021 * shadow lock, and the higher-level shadows might disappear
3022 * under our feet. */
3023 if ( __copy_from_user(&sl3e, (sh_linear_l3_table(v)
3024 + shadow_l3_linear_offset(va)),
3025 sizeof (sl3e)) != 0 )
3027 perfc_incrc(shadow_invlpg_fault);
3028 return 0;
3030 if ( !(shadow_l3e_get_flags(sl3e) & _PAGE_PRESENT) )
3031 return 0;
3033 #elif SHADOW_PAGING_LEVELS == 3
3034 if ( !(l3e_get_flags(v->arch.paging.shadow.l3table[shadow_l3_linear_offset(va)])
3035 & _PAGE_PRESENT) )
3036 // no need to flush anything if there's no SL2...
3037 return 0;
3038 #endif
3040 /* This must still be a copy-from-user because we don't have the shadow
3041 * lock, and the higher-level shadows might disappear under our feet. */
3042 if ( __copy_from_user(&sl2e,
3043 sh_linear_l2_table(v) + shadow_l2_linear_offset(va),
3044 sizeof (sl2e)) != 0 )
3046 perfc_incrc(shadow_invlpg_fault);
3047 return 0;
3050 // If there's nothing shadowed for this particular sl2e, then
3051 // there is no need to do an invlpg, either...
3052 //
3053 if ( !(shadow_l2e_get_flags(sl2e) & _PAGE_PRESENT) )
3054 return 0;
3056 // Check to see if the SL2 is a splintered superpage...
3057 // If so, then we'll need to flush the entire TLB (because that's
3058 // easier than invalidating all of the individual 4K pages).
3059 //
3060 if ( mfn_to_shadow_page(shadow_l2e_get_mfn(sl2e))->type
3061 == SH_type_fl1_shadow )
3063 local_flush_tlb();
3064 return 0;
3067 return 1;
3070 static unsigned long
3071 sh_gva_to_gfn(struct vcpu *v, unsigned long va)
3072 /* Called to translate a guest virtual address to what the *guest*
3073 * pagetables would map it to. */
3075 walk_t gw;
3076 gfn_t gfn;
3078 guest_walk_tables(v, va, &gw, 0);
3079 gfn = guest_walk_to_gfn(&gw);
3080 unmap_walk(v, &gw);
3082 return gfn_x(gfn);
3086 static inline void
3087 sh_update_linear_entries(struct vcpu *v)
3088 /* Sync up all the linear mappings for this vcpu's pagetables */
3090 struct domain *d = v->domain;
3092 /* Linear pagetables in PV guests
3093 * ------------------------------
3095 * Guest linear pagetables, which map the guest pages, are at
3096 * LINEAR_PT_VIRT_START. Shadow linear pagetables, which map the
3097 * shadows, are at SH_LINEAR_PT_VIRT_START. Most of the time these
3098 * are set up at shadow creation time, but (of course!) the PAE case
3099 * is subtler. Normal linear mappings are made by having an entry
3100 * in the top-level table that points to itself (shadow linear) or
3101 * to the guest top-level table (guest linear). For PAE, to set up
3102 * a linear map requires us to copy the four top-level entries into
3103 * level-2 entries. That means that every time we change a PAE l3e,
3104 * we need to reflect the change into the copy.
3106 * Linear pagetables in HVM guests
3107 * -------------------------------
3109 * For HVM guests, the linear pagetables are installed in the monitor
3110 * tables (since we can't put them in the shadow). Shadow linear
3111 * pagetables, which map the shadows, are at SH_LINEAR_PT_VIRT_START,
3112 * and we use the linear pagetable slot at LINEAR_PT_VIRT_START for
3113 * a linear pagetable of the monitor tables themselves. We have
3114 * the same issue of having to re-copy PAE l3 entries whenever we use
3115 * PAE shadows.
3117 * Because HVM guests run on the same monitor tables regardless of the
3118 * shadow tables in use, the linear mapping of the shadow tables has to
3119 * be updated every time v->arch.shadow_table changes.
3120 */
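/* The #if branches below handle each (CONFIG, SHADOW) level pairing in
 * turn: 4-on-4 rewrites a single monitor/linear l4e; 3-on-4 copies the
 * four PAE shadow l3es into the monitor l3 hooked into l4 slot 0; 3-on-3
 * copies guest and shadow l3es into the high l2 slots; and 2-on-2
 * rewrites a single l2e. */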
3122 /* Don't try to update the monitor table if it doesn't exist */
3123 if ( shadow_mode_external(d)
3124 && pagetable_get_pfn(v->arch.monitor_table) == 0 )
3125 return;
3127 #if (CONFIG_PAGING_LEVELS == 4) && (SHADOW_PAGING_LEVELS == 4)
3129 /* For PV, one l4e points at the guest l4, one points at the shadow
3130 * l4. No maintenance required.
3131 * For HVM, just need to update the l4e that points to the shadow l4. */
3133 if ( shadow_mode_external(d) )
3135 /* Use the linear map if we can; otherwise make a new mapping */
3136 if ( v == current )
3138 __linear_l4_table[l4_linear_offset(SH_LINEAR_PT_VIRT_START)] =
3139 l4e_from_pfn(pagetable_get_pfn(v->arch.shadow_table[0]),
3140 __PAGE_HYPERVISOR);
3142 else
3144 l4_pgentry_t *ml4e;
3145 ml4e = sh_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
3146 ml4e[l4_table_offset(SH_LINEAR_PT_VIRT_START)] =
3147 l4e_from_pfn(pagetable_get_pfn(v->arch.shadow_table[0]),
3148 __PAGE_HYPERVISOR);
3149 sh_unmap_domain_page(ml4e);
3153 #elif (CONFIG_PAGING_LEVELS == 4) && (SHADOW_PAGING_LEVELS == 3)
3155 /* PV: XXX
3157 * HVM: To give ourselves a linear map of the shadows, we need to
3158 * extend a PAE shadow to 4 levels. We do this by having a monitor
3159 * l3 in slot 0 of the monitor l4 table, and copying the PAE l3
3160 * entries into it. Then, by having the monitor l4e for shadow
3161 * pagetables also point to the monitor l4, we can use it to access
3162 * the shadows.
3163 */
3165 if ( shadow_mode_external(d) )
3167 /* Install copies of the shadow l3es into the monitor l3 table.
3168 * The monitor l3 table is hooked into slot 0 of the monitor
3169 * l4 table, so we use l3 linear indices 0 to 3 */
3170 shadow_l3e_t *sl3e;
3171 l3_pgentry_t *ml3e;
3172 mfn_t l3mfn;
3173 int i;
3175 /* Use linear mappings if we can; otherwise make new mappings */
3176 if ( v == current )
3178 ml3e = __linear_l3_table;
3179 l3mfn = _mfn(l4e_get_pfn(__linear_l4_table[0]));
3181 else
3183 l4_pgentry_t *ml4e;
3184 ml4e = sh_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
3185 ASSERT(l4e_get_flags(ml4e[0]) & _PAGE_PRESENT);
3186 l3mfn = _mfn(l4e_get_pfn(ml4e[0]));
3187 ml3e = sh_map_domain_page(l3mfn);
3188 sh_unmap_domain_page(ml4e);
3191 /* Shadow l3 tables are made up by sh_update_cr3 */
3192 sl3e = v->arch.paging.shadow.l3table;
3194 for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
3196 ml3e[i] =
3197 (shadow_l3e_get_flags(sl3e[i]) & _PAGE_PRESENT)
3198 ? l3e_from_pfn(mfn_x(shadow_l3e_get_mfn(sl3e[i])),
3199 __PAGE_HYPERVISOR)
3200 : l3e_empty();
3203 if ( v != current )
3204 sh_unmap_domain_page(ml3e);
3206 else
3207 domain_crash(d); /* XXX */
3209 #elif CONFIG_PAGING_LEVELS == 3
3211 /* PV: need to copy the guest's l3 entries into the guest-linear-map l2
3212 * entries in the shadow, and the shadow's l3 entries into the
3213 * shadow-linear-map l2 entries in the shadow. This is safe to do
3214 * because Xen does not let guests share high-slot l2 tables between l3s,
3215 * so we know we're not treading on anyone's toes.
3217 * HVM: need to copy the shadow's l3 entries into the
3218 * shadow-linear-map l2 entries in the monitor table. This is safe
3219 * because we have one monitor table for each vcpu. The monitor's
3220 * own l3es don't need to be copied because they never change.
3221 * XXX That might change if we start stuffing things into the rest
3222 * of the monitor's virtual address space.
3223 */
3225 l2_pgentry_t *l2e, new_l2e;
3226 shadow_l3e_t *guest_l3e = NULL, *shadow_l3e;
3227 int i;
3228 int unmap_l2e = 0;
3230 #if GUEST_PAGING_LEVELS == 2
3232 /* Shadow l3 tables were built by sh_update_cr3 */
3233 BUG_ON(!shadow_mode_external(d)); /* PV 2-on-3 is unsupported */
3234 shadow_l3e = (shadow_l3e_t *)&v->arch.paging.shadow.l3table;
3236 #else /* GUEST_PAGING_LEVELS == 3 */
3238 shadow_l3e = (shadow_l3e_t *)&v->arch.paging.shadow.l3table;
3239 guest_l3e = (guest_l3e_t *)&v->arch.paging.shadow.gl3e;
3241 #endif /* GUEST_PAGING_LEVELS */
3243 /* Choose where to write the entries, using linear maps if possible */
3244 if ( shadow_mode_external(d) )
3246 if ( v == current )
3248 /* From the monitor tables, it's safe to use linear maps
3249 * to update monitor l2s */
3250 l2e = __linear_l2_table + (3 * L2_PAGETABLE_ENTRIES);
3252 else
3254 /* Map the monitor table's high l2 */
3255 l3_pgentry_t *l3e;
3256 l3e = sh_map_domain_page(
3257 pagetable_get_mfn(v->arch.monitor_table));
3258 ASSERT(l3e_get_flags(l3e[3]) & _PAGE_PRESENT);
3259 l2e = sh_map_domain_page(_mfn(l3e_get_pfn(l3e[3])));
3260 unmap_l2e = 1;
3261 sh_unmap_domain_page(l3e);
3264 else
3266 /* Map the shadow table's high l2 */
3267 ASSERT(shadow_l3e_get_flags(shadow_l3e[3]) & _PAGE_PRESENT);
3268 l2e = sh_map_domain_page(shadow_l3e_get_mfn(shadow_l3e[3]));
3269 unmap_l2e = 1;
3272 /* Write linear mapping of guest (only in PV, and only when
3273 * not translated). */
3274 if ( !shadow_mode_translate(d) )
3276 for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
3278 new_l2e =
3279 ((shadow_l3e_get_flags(guest_l3e[i]) & _PAGE_PRESENT)
3280 ? l2e_from_pfn(mfn_x(shadow_l3e_get_mfn(guest_l3e[i])),
3281 __PAGE_HYPERVISOR)
3282 : l2e_empty());
3283 safe_write_entry(
3284 &l2e[l2_table_offset(LINEAR_PT_VIRT_START) + i],
3285 &new_l2e);
3289 /* Write linear mapping of shadow. */
3290 for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
3292 new_l2e = (shadow_l3e_get_flags(shadow_l3e[i]) & _PAGE_PRESENT)
3293 ? l2e_from_pfn(mfn_x(shadow_l3e_get_mfn(shadow_l3e[i])),
3294 __PAGE_HYPERVISOR)
3295 : l2e_empty();
3296 safe_write_entry(
3297 &l2e[l2_table_offset(SH_LINEAR_PT_VIRT_START) + i],
3298 &new_l2e);
3301 if ( unmap_l2e )
3302 sh_unmap_domain_page(l2e);
3305 #elif CONFIG_PAGING_LEVELS == 2
3307 /* For PV, one l2e points at the guest l2, one points at the shadow
3308 * l2. No maintenance required.
3309 * For HVM, just need to update the l2e that points to the shadow l2. */
3311 if ( shadow_mode_external(d) )
3313 /* Use the linear map if we can; otherwise make a new mapping */
3314 if ( v == current )
3316 __linear_l2_table[l2_linear_offset(SH_LINEAR_PT_VIRT_START)] =
3317 l2e_from_pfn(pagetable_get_pfn(v->arch.shadow_table[0]),
3318 __PAGE_HYPERVISOR);
3320 else
3322 l2_pgentry_t *ml2e;
3323 ml2e = sh_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
3324 ml2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
3325 l2e_from_pfn(pagetable_get_pfn(v->arch.shadow_table[0]),
3326 __PAGE_HYPERVISOR);
3327 sh_unmap_domain_page(ml2e);
3331 #else
3332 #error this should not happen
3333 #endif
3337 /* Removes vcpu->arch.paging.shadow.guest_vtable and vcpu->arch.shadow_table[].
3338 * Does all appropriate management/bookkeeping/refcounting/etc...
3339 */
3340 static void
3341 sh_detach_old_tables(struct vcpu *v)
3343 mfn_t smfn;
3344 int i = 0;
3346 ////
3347 //// vcpu->arch.paging.shadow.guest_vtable
3348 ////
3350 #if GUEST_PAGING_LEVELS == 3
3351 /* PAE guests don't have a mapping of the guest top-level table */
3352 ASSERT(v->arch.paging.shadow.guest_vtable == NULL);
3353 #else
3354 if ( v->arch.paging.shadow.guest_vtable )
3356 struct domain *d = v->domain;
3357 if ( shadow_mode_external(d) || shadow_mode_translate(d) )
3358 sh_unmap_domain_page_global(v->arch.paging.shadow.guest_vtable);
3359 v->arch.paging.shadow.guest_vtable = NULL;
3361 #endif
3364 ////
3365 //// vcpu->arch.shadow_table[]
3366 ////
3368 #if GUEST_PAGING_LEVELS == 3
3369 /* PAE guests have four shadow_table entries */
3370 for ( i = 0 ; i < 4 ; i++ )
3371 #endif
3373 smfn = pagetable_get_mfn(v->arch.shadow_table[i]);
3374 if ( mfn_x(smfn) )
3375 sh_put_ref(v, smfn, 0);
3376 v->arch.shadow_table[i] = pagetable_null();
3380 /* Set up the top-level shadow and install it in slot 'slot' of shadow_table */
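/* (It looks up an existing shadow of gmfn or makes a new one, pins it,
 * takes an extra reference for the shadow_table[] slot, installs it, and
 * only then releases the reference held by the slot's old contents.) */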
3381 static void
3382 sh_set_toplevel_shadow(struct vcpu *v,
3383 int slot,
3384 mfn_t gmfn,
3385 unsigned int root_type)
3387 mfn_t smfn;
3388 pagetable_t old_entry, new_entry;
3390 struct domain *d = v->domain;
3392 /* Remember the old contents of this slot */
3393 old_entry = v->arch.shadow_table[slot];
3395 /* Now figure out the new contents: is this a valid guest MFN? */
3396 if ( !mfn_valid(gmfn) )
3398 new_entry = pagetable_null();
3399 goto install_new_entry;
3402 /* Guest mfn is valid: shadow it and install the shadow */
3403 smfn = get_shadow_status(v, gmfn, root_type);
3404 if ( !mfn_valid(smfn) )
3406 /* Make sure there's enough free shadow memory. */
3407 shadow_prealloc(d, SHADOW_MAX_ORDER);
3408 /* Shadow the page. */
3409 smfn = sh_make_shadow(v, gmfn, root_type);
3411 ASSERT(mfn_valid(smfn));
3413 #if SHADOW_OPTIMIZATIONS & SHOPT_EARLY_UNSHADOW
3414 /* Once again OK to unhook entries from this table if we see fork/exit */
3415 #if CONFIG_PAGING_LEVELS == 4
3416 if ( IS_COMPAT(d) )
3417 ASSERT(!sh_mfn_is_a_page_table(gmfn));
3418 else
3419 #endif
3420 ASSERT(sh_mfn_is_a_page_table(gmfn));
3421 mfn_to_page(gmfn)->shadow_flags &= ~SHF_unhooked_mappings;
3422 #endif
3424 /* Pin the shadow and put it (back) on the list of top-level shadows */
3425 if ( sh_pin(v, smfn) == 0 )
3427 SHADOW_ERROR("can't pin %#lx as toplevel shadow\n", mfn_x(smfn));
3428 domain_crash(v->domain);
3431 /* Take a ref to this page: it will be released in sh_detach_old_tables()
3432 * or the next call to set_toplevel_shadow() */
3433 if ( !sh_get_ref(v, smfn, 0) )
3435 SHADOW_ERROR("can't install %#lx as toplevel shadow\n", mfn_x(smfn));
3436 domain_crash(v->domain);
3439 new_entry = pagetable_from_mfn(smfn);
3441 install_new_entry:
3442 /* Done. Install it */
3443 SHADOW_PRINTK("%u/%u [%u] gmfn %#"PRI_mfn" smfn %#"PRI_mfn"\n",
3444 GUEST_PAGING_LEVELS, SHADOW_PAGING_LEVELS, slot,
3445 mfn_x(gmfn), mfn_x(pagetable_get_mfn(new_entry)));
3446 v->arch.shadow_table[slot] = new_entry;
3448 /* Decrement the refcount of the old contents of this slot */
3449 if ( !pagetable_is_null(old_entry) )
3450 sh_put_ref(v, pagetable_get_mfn(old_entry), 0);
3454 static void
3455 sh_update_cr3(struct vcpu *v, int do_locking)
3456 /* Updates vcpu->arch.cr3 after the guest has changed CR3.
3457 * Paravirtual guests should set v->arch.guest_table (and guest_table_user,
3458 * if appropriate).
3459 * HVM guests should also make sure hvm_get_guest_ctrl_reg(v, 3) works;
3460 * this function will call hvm_update_guest_cr3() to tell them where the
3461 * shadow tables are.
3462 * If do_locking != 0, assume we are being called from outside the
3463 * shadow code, and must take and release the shadow lock; otherwise
3464 * that is the caller's responsibility.
3465 */
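/* In outline: re-cache the guest's top-level table (or, for PAE, copy
 * its four l3es into v->arch.paging.shadow.gl3e), revoke write access to
 * the new guest toplevel page(s), and then install fresh top-level
 * shadows with sh_set_toplevel_shadow() -- one per guest l3e in the PAE
 * case. */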
3467 struct domain *d = v->domain;
3468 mfn_t gmfn;
3469 #if GUEST_PAGING_LEVELS == 3
3470 guest_l3e_t *gl3e;
3471 u32 guest_idx=0;
3472 int i;
3473 #endif
3475 /* Don't do anything on an uninitialised vcpu */
3476 if ( !is_hvm_domain(d) && !test_bit(_VCPUF_initialised, &v->vcpu_flags) )
3478 ASSERT(v->arch.cr3 == 0);
3479 return;
3482 if ( do_locking ) shadow_lock(v->domain);
3484 ASSERT(shadow_locked_by_me(v->domain));
3485 ASSERT(v->arch.paging.mode);
3487 ////
3488 //// vcpu->arch.guest_table is already set
3489 ////
3491 #ifndef NDEBUG
3492 /* Double-check that the HVM code has sent us a sane guest_table */
3493 if ( is_hvm_domain(d) )
3495 gfn_t gfn;
3497 ASSERT(shadow_mode_external(d));
3499 // Is paging enabled on this vcpu?
3500 if ( paging_vcpu_mode_translate(v) )
3502 gfn = _gfn(paddr_to_pfn(hvm_get_guest_ctrl_reg(v, 3)));
3503 gmfn = vcpu_gfn_to_mfn(v, gfn);
3504 ASSERT(mfn_valid(gmfn));
3505 ASSERT(pagetable_get_pfn(v->arch.guest_table) == mfn_x(gmfn));
3507 else
3509 /* Paging disabled: guest_table points at (part of) p2m */
3510 #if SHADOW_PAGING_LEVELS != 3 /* in 3-on-4, guest-table is in slot 0 of p2m */
3511 /* For everything else, they should be the same */
3512 ASSERT(v->arch.guest_table.pfn == d->arch.phys_table.pfn);
3513 #endif
3516 #endif
3518 SHADOW_PRINTK("d=%u v=%u guest_table=%05lx\n",
3519 d->domain_id, v->vcpu_id,
3520 (unsigned long)pagetable_get_pfn(v->arch.guest_table));
3522 #if GUEST_PAGING_LEVELS == 4
3523 if ( !(v->arch.flags & TF_kernel_mode) && !IS_COMPAT(v->domain) )
3524 gmfn = pagetable_get_mfn(v->arch.guest_table_user);
3525 else
3526 #endif
3527 gmfn = pagetable_get_mfn(v->arch.guest_table);
3530 ////
3531 //// vcpu->arch.paging.shadow.guest_vtable
3532 ////
3533 #if GUEST_PAGING_LEVELS == 4
3534 if ( shadow_mode_external(d) || shadow_mode_translate(d) )
3536 if ( v->arch.paging.shadow.guest_vtable )
3537 sh_unmap_domain_page_global(v->arch.paging.shadow.guest_vtable);
3538 v->arch.paging.shadow.guest_vtable = sh_map_domain_page_global(gmfn);
3540 else
3541 v->arch.paging.shadow.guest_vtable = __linear_l4_table;
3542 #elif GUEST_PAGING_LEVELS == 3
3543 /* On PAE guests we don't use a mapping of the guest's own top-level
3544 * table. We cache the current state of that table and shadow that,
3545 * until the next CR3 write makes us refresh our cache. */
3546 ASSERT(v->arch.paging.shadow.guest_vtable == NULL);
3548 if ( shadow_mode_external(d) && paging_vcpu_mode_translate(v) )
3549 /* Paging enabled: find where in the page the l3 table is */
3550 guest_idx = guest_index((void *)hvm_get_guest_ctrl_reg(v, 3));
3551 else
3552 /* Paging disabled or PV: l3 is at the start of a page */
3553 guest_idx = 0;
3555 // Ignore the low 2 bits of guest_idx -- they are really just
3556 // cache control.
3557 guest_idx &= ~3;
3559 gl3e = ((guest_l3e_t *)sh_map_domain_page(gmfn)) + guest_idx;
3560 for ( i = 0; i < 4 ; i++ )
3561 v->arch.paging.shadow.gl3e[i] = gl3e[i];
3562 sh_unmap_domain_page(gl3e);
3563 #elif GUEST_PAGING_LEVELS == 2
3564 if ( shadow_mode_external(d) || shadow_mode_translate(d) )
3566 if ( v->arch.paging.shadow.guest_vtable )
3567 sh_unmap_domain_page_global(v->arch.paging.shadow.guest_vtable);
3568 v->arch.paging.shadow.guest_vtable = sh_map_domain_page_global(gmfn);
3570 else
3571 v->arch.paging.shadow.guest_vtable = __linear_l2_table;
3572 #else
3573 #error This should never happen
3574 #endif
3576 #if 0
3577 printk("%s %s %d gmfn=%05lx shadow.guest_vtable=%p\n",
3578 __func__, __FILE__, __LINE__, gmfn, v->arch.paging.shadow.guest_vtable);
3579 #endif
3581 ////
3582 //// vcpu->arch.shadow_table[]
3583 ////
3585 /* We revoke write access to the new guest toplevel page(s) before we
3586 * replace the old shadow pagetable(s), so that we can safely use the
3587 * (old) shadow linear maps in the writeable mapping heuristics. */
3588 #if GUEST_PAGING_LEVELS == 2
3589 if ( sh_remove_write_access(v, gmfn, 2, 0) != 0 )
3590 flush_tlb_mask(v->domain->domain_dirty_cpumask);
3591 sh_set_toplevel_shadow(v, 0, gmfn, SH_type_l2_shadow);
3592 #elif GUEST_PAGING_LEVELS == 3
3593 /* PAE guests have four shadow_table entries, based on the
3594 * current values of the guest's four l3es. */
3596 int flush = 0;
3597 gfn_t gl2gfn;
3598 mfn_t gl2mfn;
3599 guest_l3e_t *gl3e = (guest_l3e_t*)&v->arch.paging.shadow.gl3e;
3600 /* First, make all four entries read-only. */
3601 for ( i = 0; i < 4; i++ )
3603 if ( guest_l3e_get_flags(gl3e[i]) & _PAGE_PRESENT )
3605 gl2gfn = guest_l3e_get_gfn(gl3e[i]);
3606 gl2mfn = vcpu_gfn_to_mfn(v, gl2gfn);
3607 flush |= sh_remove_write_access(v, gl2mfn, 2, 0);
3610 if ( flush )
3611 flush_tlb_mask(v->domain->domain_dirty_cpumask);
3612 /* Now install the new shadows. */
3613 for ( i = 0; i < 4; i++ )
3615 if ( guest_l3e_get_flags(gl3e[i]) & _PAGE_PRESENT )
3617 gl2gfn = guest_l3e_get_gfn(gl3e[i]);
3618 gl2mfn = vcpu_gfn_to_mfn(v, gl2gfn);
3619 sh_set_toplevel_shadow(v, i, gl2mfn, (i == 3)
3620 ? SH_type_l2h_shadow
3621 : SH_type_l2_shadow);
3623 else
3624 /* The guest is not present: clear out the shadow. */
3625 sh_set_toplevel_shadow(v, i, _mfn(INVALID_MFN), 0);
3628 #elif GUEST_PAGING_LEVELS == 4
3629 if ( sh_remove_write_access(v, gmfn, 4, 0) != 0 )
3630 flush_tlb_mask(v->domain->domain_dirty_cpumask);
3631 sh_set_toplevel_shadow(v, 0, gmfn, SH_type_l4_shadow);
3632 #else
3633 #error This should never happen
3634 #endif
3636 #if (CONFIG_PAGING_LEVELS == 3) && (GUEST_PAGING_LEVELS == 3)
3637 #endif
3639 ///
3640 /// v->arch.paging.shadow.l3table
3641 ///
3642 #if SHADOW_PAGING_LEVELS == 3
3644 mfn_t smfn;
3645 int i;
3646 for ( i = 0; i < 4; i++ )
3648 #if GUEST_PAGING_LEVELS == 2
3649 /* 2-on-3: make a PAE l3 that points at the four-page l2 */
3650 smfn = _mfn(pagetable_get_pfn(v->arch.shadow_table[0]) + i);
3651 #else
3652 /* 3-on-3: make a PAE l3 that points at the four l2 pages */
3653 smfn = pagetable_get_mfn(v->arch.shadow_table[i]);
3654 #endif
3655 v->arch.paging.shadow.l3table[i] =
3656 (mfn_x(smfn) == 0)
3657 ? shadow_l3e_empty()
3658 : shadow_l3e_from_mfn(smfn, _PAGE_PRESENT);
3661 #endif /* SHADOW_PAGING_LEVELS == 3 */
3664 ///
3665 /// v->arch.cr3
3666 ///
3667 if ( shadow_mode_external(d) )
3669 make_cr3(v, pagetable_get_pfn(v->arch.monitor_table));
3671 else // not shadow_mode_external...
3673 /* We don't support PV except guest == shadow == config levels */
3674 BUG_ON(GUEST_PAGING_LEVELS != SHADOW_PAGING_LEVELS);
3675 #if SHADOW_PAGING_LEVELS == 3
3676 /* 2-on-3 or 3-on-3: Use the PAE shadow l3 table we just fabricated.
3677 * Don't use make_cr3 because (a) we know it's below 4GB, and
3678 * (b) it's not necessarily page-aligned, and make_cr3 takes a pfn */
3679 ASSERT(virt_to_maddr(&v->arch.paging.shadow.l3table) <= 0xffffffe0ULL);
3680 v->arch.cr3 = virt_to_maddr(&v->arch.paging.shadow.l3table);
3681 #else
3682 /* 2-on-2 or 4-on-4: Just use the shadow top-level directly */
3683 make_cr3(v, pagetable_get_pfn(v->arch.shadow_table[0]));
3684 #endif
3688 ///
3689 /// v->arch.hvm_vcpu.hw_cr3
3690 ///
3691 if ( shadow_mode_external(d) )
3693 ASSERT(is_hvm_domain(d));
3694 #if SHADOW_PAGING_LEVELS == 3
3695 /* 2-on-3 or 3-on-3: Use the PAE shadow l3 table we just fabricated */
3696 hvm_update_guest_cr3(v, virt_to_maddr(&v->arch.paging.shadow.l3table));
3697 #else
3698 /* 2-on-2 or 4-on-4: Just use the shadow top-level directly */
3699 hvm_update_guest_cr3(v, pagetable_get_paddr(v->arch.shadow_table[0]));
3700 #endif
3703 /* Fix up the linear pagetable mappings */
3704 sh_update_linear_entries(v);
3706 /* Release the lock, if we took it (otherwise it's the caller's problem) */
3707 if ( do_locking ) shadow_unlock(v->domain);
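[Editor's sketch] One detail from the PAE path above: guest_idx is masked with ~3 because a PAE top level is a 32-byte-aligned group of four 8-byte l3 entries, and the CR3 bits below that alignment (bits 3 and 4, PWT/PCD) are cache control rather than address, exactly as the in-line comment says. A small self-contained sketch of that index arithmetic, with illustrative names:

    #include <stdint.h>
    #include <stdio.h>

    #define X_PAGE_SIZE 4096UL

    static unsigned int pae_l3_index(uint64_t cr3)
    {
        /* byte offset within the page, divided by sizeof(l3e) == 8 ...       */
        unsigned int idx = (unsigned int)((cr3 & (X_PAGE_SIZE - 1)) >> 3);
        return idx & ~3u;  /* ... rounded down to the 4-entry (32-byte) group */
    }

    int main(void)
    {
        /* 0x18 sets PWT|PCD; both CR3 values resolve to the same group (44). */
        printf("%u %u\n", pae_l3_index(0x1234160ULL), pae_l3_index(0x1234178ULL));
        return 0;
    }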
3711 /**************************************************************************/
3712 /* Functions to revoke guest rights */
3714 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
3715 static int sh_guess_wrmap(struct vcpu *v, unsigned long vaddr, mfn_t gmfn)
3716 /* Look up this vaddr in the current shadow and see if it's a writeable
3717 * mapping of this gmfn. If so, remove it. Returns 1 if it worked. */
3719 shadow_l1e_t sl1e, *sl1p;
3720 shadow_l2e_t *sl2p;
3721 #if SHADOW_PAGING_LEVELS >= 3
3722 shadow_l3e_t *sl3p;
3723 #if SHADOW_PAGING_LEVELS >= 4
3724 shadow_l4e_t *sl4p;
3725 #endif
3726 #endif
3727 mfn_t sl1mfn;
3728 int r;
3730 /* Carefully look in the shadow linear map for the l1e we expect */
3731 #if SHADOW_PAGING_LEVELS >= 4
3732 sl4p = sh_linear_l4_table(v) + shadow_l4_linear_offset(vaddr);
3733 if ( !(shadow_l4e_get_flags(*sl4p) & _PAGE_PRESENT) )
3734 return 0;
3735 sl3p = sh_linear_l3_table(v) + shadow_l3_linear_offset(vaddr);
3736 if ( !(shadow_l3e_get_flags(*sl3p) & _PAGE_PRESENT) )
3737 return 0;
3738 #elif SHADOW_PAGING_LEVELS == 3
3739 sl3p = ((shadow_l3e_t *) v->arch.paging.shadow.l3table)
3740 + shadow_l3_linear_offset(vaddr);
3741 if ( !(shadow_l3e_get_flags(*sl3p) & _PAGE_PRESENT) )
3742 return 0;
3743 #endif
3744 sl2p = sh_linear_l2_table(v) + shadow_l2_linear_offset(vaddr);
3745 if ( !(shadow_l2e_get_flags(*sl2p) & _PAGE_PRESENT) )
3746 return 0;
3747 sl1p = sh_linear_l1_table(v) + shadow_l1_linear_offset(vaddr);
3748 sl1e = *sl1p;
3749 if ( ((shadow_l1e_get_flags(sl1e) & (_PAGE_PRESENT|_PAGE_RW))
3750 != (_PAGE_PRESENT|_PAGE_RW))
3751 || (mfn_x(shadow_l1e_get_mfn(sl1e)) != mfn_x(gmfn)) )
3752 return 0;
3754 /* Found it! Need to remove its write permissions. */
3755 sl1mfn = shadow_l2e_get_mfn(*sl2p);
3756 sl1e = shadow_l1e_remove_flags(sl1e, _PAGE_RW);
3757 r = shadow_set_l1e(v, sl1p, sl1e, sl1mfn);
3758 ASSERT( !(r & SHADOW_SET_ERROR) );
3759 return 1;
3761 #endif
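[Editor's sketch] sh_guess_wrmap above bails out at the first non-present level before it ever inspects the l1e, and only then applies the present+RW+same-frame test. A standalone sketch of that top-down short circuit over plain arrays (toy flag values and types, not the shadow linear map itself):

    #include <stdint.h>
    #include <stddef.h>

    #define X_PRESENT 0x001u
    #define X_RW      0x002u

    /* Each "level" here is just an entry whose low bits are flags. */
    static int is_writeable_mapping(const uint64_t *levels, size_t nlevels,
                                    uint64_t l1e, uint64_t target_mfn)
    {
        size_t i;
        for ( i = 0; i < nlevels; i++ )         /* e.g. l4e, l3e, l2e */
            if ( !(levels[i] & X_PRESENT) )
                return 0;                       /* nothing mapped below here */
        if ( (l1e & (X_PRESENT | X_RW)) != (X_PRESENT | X_RW) )
            return 0;
        return ((l1e & 0x000ffffffffff000ULL) >> 12) == target_mfn;
    }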
3763 int sh_rm_write_access_from_l1(struct vcpu *v, mfn_t sl1mfn,
3764 mfn_t readonly_mfn)
3765 /* Excises all writeable mappings to readonly_mfn from this l1 shadow table */
3767 shadow_l1e_t *sl1e;
3768 int done = 0;
3769 int flags;
3770 mfn_t base_sl1mfn = sl1mfn; /* Because sl1mfn changes in the foreach */
3772 SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, done,
3774 flags = shadow_l1e_get_flags(*sl1e);
3775 if ( (flags & _PAGE_PRESENT)
3776 && (flags & _PAGE_RW)
3777 && (mfn_x(shadow_l1e_get_mfn(*sl1e)) == mfn_x(readonly_mfn)) )
3779 shadow_l1e_t ro_sl1e = shadow_l1e_remove_flags(*sl1e, _PAGE_RW);
3780 (void) shadow_set_l1e(v, sl1e, ro_sl1e, sl1mfn);
3781 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
3782 /* Remember the last shadow that we shot a writeable mapping in */
3783 v->arch.paging.shadow.last_writeable_pte_smfn = mfn_x(base_sl1mfn);
3784 #endif
3785 if ( (mfn_to_page(readonly_mfn)->u.inuse.type_info
3786 & PGT_count_mask) == 0 )
3787 /* This breaks us cleanly out of the FOREACH macro */
3788 done = 1;
3790 });
3791 return done;
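[Editor's sketch] The scan above stops as soon as the frame's writable reference count drops to zero, since there is nothing left to revoke. A simplified standalone version of that loop, with toy flag values and a plain counter standing in for the page type count:

    #include <stdint.h>
    #include <stddef.h>

    #define X_PRESENT   0x001ULL
    #define X_RW        0x002ULL
    #define X_ADDR_MASK 0x000ffffffffff000ULL

    /* remaining_writable stands in for the frame's writable type count. */
    static int strip_writeable(uint64_t *l1, size_t nents,
                               uint64_t target_mfn, int *remaining_writable)
    {
        size_t i;
        for ( i = 0; i < nents; i++ )
        {
            uint64_t e = l1[i];
            if ( (e & (X_PRESENT | X_RW)) == (X_PRESENT | X_RW)
                 && ((e & X_ADDR_MASK) >> 12) == target_mfn )
            {
                l1[i] = e & ~X_RW;             /* downgrade to read-only  */
                if ( --*remaining_writable <= 0 )
                    return 1;                  /* nothing left to revoke  */
            }
        }
        return 0;
    }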
3795 int sh_rm_mappings_from_l1(struct vcpu *v, mfn_t sl1mfn, mfn_t target_mfn)
3796 /* Excises all mappings to guest frame from this shadow l1 table */
3798 shadow_l1e_t *sl1e;
3799 int done = 0;
3800 int flags;
3802 SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, done,
3804 flags = shadow_l1e_get_flags(*sl1e);
3805 if ( (flags & _PAGE_PRESENT)
3806 && (mfn_x(shadow_l1e_get_mfn(*sl1e)) == mfn_x(target_mfn)) )
3808 (void) shadow_set_l1e(v, sl1e, shadow_l1e_empty(), sl1mfn);
3809 if ( (mfn_to_page(target_mfn)->count_info & PGC_count_mask) == 0 )
3810 /* This breaks us cleanly out of the FOREACH macro */
3811 done = 1;
3813 });
3814 return done;
3817 /**************************************************************************/
3818 /* Functions to excise all pointers to shadows from higher-level shadows. */
3820 void sh_clear_shadow_entry(struct vcpu *v, void *ep, mfn_t smfn)
3821 /* Blank out a single shadow entry */
3823 switch ( mfn_to_shadow_page(smfn)->type )
3825 case SH_type_l1_shadow:
3826 (void) shadow_set_l1e(v, ep, shadow_l1e_empty(), smfn); break;
3827 case SH_type_l2_shadow:
3828 #if GUEST_PAGING_LEVELS >= 3
3829 case SH_type_l2h_shadow:
3830 #endif
3831 (void) shadow_set_l2e(v, ep, shadow_l2e_empty(), smfn); break;
3832 #if GUEST_PAGING_LEVELS >= 4
3833 case SH_type_l3_shadow:
3834 (void) shadow_set_l3e(v, ep, shadow_l3e_empty(), smfn); break;
3835 case SH_type_l4_shadow:
3836 (void) shadow_set_l4e(v, ep, shadow_l4e_empty(), smfn); break;
3837 #endif
3838 default: BUG(); /* Called with the wrong kind of shadow. */
3842 int sh_remove_l1_shadow(struct vcpu *v, mfn_t sl2mfn, mfn_t sl1mfn)
3843 /* Remove all mappings of this l1 shadow from this l2 shadow */
3845 shadow_l2e_t *sl2e;
3846 int done = 0;
3847 int flags;
3849 SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, done, v->domain,
3851 flags = shadow_l2e_get_flags(*sl2e);
3852 if ( (flags & _PAGE_PRESENT)
3853 && (mfn_x(shadow_l2e_get_mfn(*sl2e)) == mfn_x(sl1mfn)) )
3855 (void) shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn);
3856 if ( mfn_to_shadow_page(sl1mfn)->type == 0 )
3857 /* This breaks us cleanly out of the FOREACH macro */
3858 done = 1;
3860 });
3861 return done;
3864 #if GUEST_PAGING_LEVELS >= 4
3865 int sh_remove_l2_shadow(struct vcpu *v, mfn_t sl3mfn, mfn_t sl2mfn)
3866 /* Remove all mappings of this l2 shadow from this l3 shadow */
3868 shadow_l3e_t *sl3e;
3869 int done = 0;
3870 int flags;
3872 SHADOW_FOREACH_L3E(sl3mfn, sl3e, 0, done,
3874 flags = shadow_l3e_get_flags(*sl3e);
3875 if ( (flags & _PAGE_PRESENT)
3876 && (mfn_x(shadow_l3e_get_mfn(*sl3e)) == mfn_x(sl2mfn)) )
3878 (void) shadow_set_l3e(v, sl3e, shadow_l3e_empty(), sl3mfn);
3879 if ( mfn_to_shadow_page(sl2mfn)->type == 0 )
3880 /* This breaks us cleanly out of the FOREACH macro */
3881 done = 1;
3883 });
3884 return done;
3887 int sh_remove_l3_shadow(struct vcpu *v, mfn_t sl4mfn, mfn_t sl3mfn)
3888 /* Remove all mappings of this l3 shadow from this l4 shadow */
3890 shadow_l4e_t *sl4e;
3891 int done = 0;
3892 int flags;
3894 SHADOW_FOREACH_L4E(sl4mfn, sl4e, 0, done, v->domain,
3896 flags = shadow_l4e_get_flags(*sl4e);
3897 if ( (flags & _PAGE_PRESENT)
3898 && (mfn_x(shadow_l4e_get_mfn(*sl4e)) == mfn_x(sl3mfn)) )
3900 (void) shadow_set_l4e(v, sl4e, shadow_l4e_empty(), sl4mfn);
3901 if ( mfn_to_shadow_page(sl3mfn)->type == 0 )
3902 /* This breaks us cleanly out of the FOREACH macro */
3903 done = 1;
3905 });
3906 return done;
3908 #endif /* 64bit guest */
3910 /**************************************************************************/
3911 /* Handling HVM guest writes to pagetables */
3913 /* Check that the user is allowed to perform this write.
3914 * Returns a mapped pointer to write to, and the mfn it's on,
3915 * or NULL for error. */
3916 static inline void * emulate_map_dest(struct vcpu *v,
3917 unsigned long vaddr,
3918 struct sh_emulate_ctxt *sh_ctxt,
3919 mfn_t *mfnp)
3921 walk_t gw;
3922 u32 flags, errcode;
3923 gfn_t gfn;
3924 mfn_t mfn;
3926 guest_walk_tables(v, vaddr, &gw, 1);
3927 flags = accumulate_guest_flags(v, &gw);
3928 gfn = guest_l1e_get_gfn(gw.eff_l1e);
3929 mfn = vcpu_gfn_to_mfn(v, gfn);
3930 sh_audit_gw(v, &gw);
3931 unmap_walk(v, &gw);
3933 if ( !(flags & _PAGE_PRESENT) )
3935 errcode = 0;
3936 goto page_fault;
3939 if ( !(flags & _PAGE_RW) ||
3940 (!(flags & _PAGE_USER) && ring_3(sh_ctxt->ctxt.regs)) )
3942 errcode = PFEC_page_present;
3943 goto page_fault;
3946 if ( !mfn_valid(mfn) )
3947 return NULL;
3949 *mfnp = mfn;
3950 return sh_map_domain_page(mfn) + (vaddr & ~PAGE_MASK);
3952 page_fault:
3953 errcode |= PFEC_write_access;
3954 if ( is_hvm_vcpu(v) )
3955 hvm_inject_exception(TRAP_page_fault, errcode, vaddr);
3956 else
3957 propagate_page_fault(vaddr, errcode);
3958 return NULL;
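[Editor's sketch] The error code injected above is assembled from the architectural page-fault error-code bits: bit 1 (write access) is always set on this path, and bit 0 (page present) is added when the walk found a present entry that merely lacked the required permission. A tiny sketch of that composition; the constant names are illustrative:

    #include <stdint.h>

    #define XPFEC_PRESENT (1u << 0)   /* fault on a present entry    */
    #define XPFEC_WRITE   (1u << 1)   /* faulting access was a write */

    static uint32_t emulated_write_fault_code(int permission_fault)
    {
        uint32_t ec = XPFEC_WRITE;          /* this path only emulates writes */
        if ( permission_fault )
            ec |= XPFEC_PRESENT;            /* entry present, access denied   */
        return ec;
    }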
3961 static int safe_not_to_verify_write(mfn_t gmfn, void *dst, void *src,
3962 int bytes)
3964 #if (SHADOW_OPTIMIZATIONS & SHOPT_SKIP_VERIFY)
3965 struct page_info *pg = mfn_to_page(gmfn);
3966 if ( !(pg->shadow_flags & SHF_32)
3967 && ((unsigned long)dst & 7) == 0 )
3969 /* Not shadowed 32-bit: aligned 64-bit writes that leave the
3970 * present bit unset are safe to ignore. */
3971 if ( (*(u64*)src & _PAGE_PRESENT) == 0
3972 && (*(u64*)dst & _PAGE_PRESENT) == 0 )
3973 return 1;
3975 else if ( !(pg->shadow_flags & (SHF_PAE|SHF_64))
3976 && ((unsigned long)dst & 3) == 0 )
3978 /* Not shadowed PAE/64-bit: aligned 32-bit writes that leave the
3979 * present bit unset are safe to ignore. */
3980 if ( (*(u32*)src & _PAGE_PRESENT) == 0
3981 && (*(u32*)dst & _PAGE_PRESENT) == 0 )
3982 return 1;
3984 #endif
3985 return 0;
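[Editor's sketch] The 64-bit arm of the skip heuristic above, in isolation: an aligned write may be left unverified when both the value being overwritten and the value being written have the present bit clear, because neither could be a live mapping (the 32-bit arm is analogous with 4-byte alignment). A minimal sketch with a toy flag value:

    #include <stdint.h>

    #define X_PRESENT 0x1u

    static int safe_to_skip_64bit(const uint64_t *dst, uint64_t new_val)
    {
        return ((uintptr_t)dst & 7) == 0        /* naturally aligned entry   */
               && (*dst & X_PRESENT) == 0       /* old contents not present  */
               && (new_val & X_PRESENT) == 0;   /* new contents not present  */
    }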
3989 int
3990 sh_x86_emulate_write(struct vcpu *v, unsigned long vaddr, void *src,
3991 u32 bytes, struct sh_emulate_ctxt *sh_ctxt)
3993 mfn_t mfn;
3994 void *addr;
3995 int skip;
3997 if ( vaddr & (bytes-1) )
3998 return X86EMUL_UNHANDLEABLE;
4000 ASSERT(shadow_locked_by_me(v->domain));
4001 ASSERT(((vaddr & ~PAGE_MASK) + bytes) <= PAGE_SIZE);
4003 if ( (addr = emulate_map_dest(v, vaddr, sh_ctxt, &mfn)) == NULL )
4004 return X86EMUL_EXCEPTION;
4006 skip = safe_not_to_verify_write(mfn, addr, src, bytes);
4007 memcpy(addr, src, bytes);
4008 if ( !skip ) sh_validate_guest_pt_write(v, mfn, addr, bytes);
4010 /* If we are writing zeros to this page, might want to unshadow */
4011 if ( likely(bytes >= 4) && (*(u32 *)addr == 0) && is_lo_pte(vaddr) )
4012 check_for_early_unshadow(v, mfn);
4014 sh_unmap_domain_page(addr);
4015 shadow_audit_tables(v);
4016 return X86EMUL_OKAY;
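[Editor's sketch] The guard at the top of sh_x86_emulate_write is the standard power-of-two alignment test: for a size that is 1, 2, 4 or 8, an address is size-aligned exactly when (addr & (size - 1)) is zero. A standalone illustration:

    #include <stdio.h>

    static int misaligned(unsigned long vaddr, unsigned int bytes)
    {
        return (vaddr & (bytes - 1)) != 0;   /* bytes assumed to be 1, 2, 4 or 8 */
    }

    int main(void)
    {
        printf("%d %d\n", misaligned(0x1000, 8), misaligned(0x1004, 8)); /* 0 1 */
        return 0;
    }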
4019 int
4020 sh_x86_emulate_cmpxchg(struct vcpu *v, unsigned long vaddr,
4021 unsigned long old, unsigned long new,
4022 unsigned int bytes, struct sh_emulate_ctxt *sh_ctxt)
4024 mfn_t mfn;
4025 void *addr;
4026 unsigned long prev;
4027 int rv = X86EMUL_OKAY, skip;
4029 ASSERT(shadow_locked_by_me(v->domain));
4030 ASSERT(bytes <= sizeof(unsigned long));
4032 if ( vaddr & (bytes-1) )
4033 return X86EMUL_UNHANDLEABLE;
4035 if ( (addr = emulate_map_dest(v, vaddr, sh_ctxt, &mfn)) == NULL )
4036 return X86EMUL_EXCEPTION;
4038 skip = safe_not_to_verify_write(mfn, &new, &old, bytes);
4040 switch ( bytes )
4042 case 1: prev = cmpxchg(((u8 *)addr), old, new); break;
4043 case 2: prev = cmpxchg(((u16 *)addr), old, new); break;
4044 case 4: prev = cmpxchg(((u32 *)addr), old, new); break;
4045 case 8: prev = cmpxchg(((u64 *)addr), old, new); break;
4046 default:
4047 SHADOW_PRINTK("cmpxchg of size %i is not supported\n", bytes);
4048 prev = ~old;
4051 if ( prev == old )
4053 if ( !skip ) sh_validate_guest_pt_write(v, mfn, addr, bytes);
4055 else
4056 rv = X86EMUL_CMPXCHG_FAILED;
4058 SHADOW_DEBUG(EMULATE, "va %#lx was %#lx expected %#lx"
4059 " wanted %#lx now %#lx bytes %u\n",
4060 vaddr, prev, old, new, *(unsigned long *)addr, bytes);
4062 /* If we are writing zeros to this page, might want to unshadow */
4063 if ( likely(bytes >= 4) && (*(u32 *)addr == 0) && is_lo_pte(vaddr) )
4064 check_for_early_unshadow(v, mfn);
4066 sh_unmap_domain_page(addr);
4067 shadow_audit_tables(v);
4068 return rv;
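[Editor's sketch] The success test above relies on the usual compare-and-exchange contract: the store happened only if the returned previous value equals the expected old value. A standalone illustration using the GCC __sync builtin rather than the hypervisor's cmpxchg macro:

    #include <stdio.h>

    int main(void)
    {
        unsigned long word = 5, old = 5, new_val = 9;
        unsigned long prev = __sync_val_compare_and_swap(&word, old, new_val);
        printf("%s, word = %lu\n", (prev == old) ? "exchanged" : "failed", word);
        return 0;
    }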
4071 int
4072 sh_x86_emulate_cmpxchg8b(struct vcpu *v, unsigned long vaddr,
4073 unsigned long old_lo, unsigned long old_hi,
4074 unsigned long new_lo, unsigned long new_hi,
4075 struct sh_emulate_ctxt *sh_ctxt)
4077 mfn_t mfn;
4078 void *addr;
4079 u64 old, new, prev;
4080 int rv = X86EMUL_OKAY, skip;
4082 ASSERT(shadow_locked_by_me(v->domain));
4084 if ( vaddr & 7 )
4085 return X86EMUL_UNHANDLEABLE;
4087 if ( (addr = emulate_map_dest(v, vaddr, sh_ctxt, &mfn)) == NULL )
4088 return X86EMUL_EXCEPTION;
4090 old = (((u64) old_hi) << 32) | (u64) old_lo;
4091 new = (((u64) new_hi) << 32) | (u64) new_lo;
4092 skip = safe_not_to_verify_write(mfn, &new, &old, 8);
4093 prev = cmpxchg(((u64 *)addr), old, new);
4095 if ( prev == old )
4097 if ( !skip ) sh_validate_guest_pt_write(v, mfn, addr, 8);
4099 else
4100 rv = X86EMUL_CMPXCHG_FAILED;
4102 /* If we are writing zeros to this page, might want to unshadow */
4103 if ( *(u32 *)addr == 0 )
4104 check_for_early_unshadow(v, mfn);
4106 sh_unmap_domain_page(addr);
4107 shadow_audit_tables(v);
4108 return rv;
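[Editor's sketch] cmpxchg8b hands the emulator two 32-bit halves, and the function above rebuilds each 64-bit operand as (hi << 32) | lo. A tiny self-contained check of that combination:

    #include <stdint.h>
    #include <assert.h>

    static uint64_t combine64(uint32_t hi, uint32_t lo)
    {
        return ((uint64_t)hi << 32) | lo;   /* cast first, so the shift is 64-bit */
    }

    int main(void)
    {
        assert(combine64(0x12345678u, 0x9abcdef0u) == 0x123456789abcdef0ULL);
        return 0;
    }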
4112 /**************************************************************************/
4113 /* Audit tools */
4115 #if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES
4117 #define AUDIT_FAIL(_level, _fmt, _a...) do { \
4118 printk("Shadow %u-on-%u audit failed at level %i, index %i\n" \
4119 "gl" #_level "mfn = %" PRI_mfn \
4120 " sl" #_level "mfn = %" PRI_mfn \
4121 " &gl" #_level "e = %p &sl" #_level "e = %p" \
4122 " gl" #_level "e = %" SH_PRI_gpte \
4123 " sl" #_level "e = %" SH_PRI_pte "\nError: " _fmt "\n", \
4124 GUEST_PAGING_LEVELS, SHADOW_PAGING_LEVELS, \
4125 _level, guest_index(gl ## _level ## e), \
4126 mfn_x(gl ## _level ## mfn), mfn_x(sl ## _level ## mfn), \
4127 gl ## _level ## e, sl ## _level ## e, \
4128 gl ## _level ## e->l ## _level, sl ## _level ## e->l ## _level, \
4129 ##_a); \
4130 BUG(); \
4131 done = 1; \
4132 } while (0)
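[Editor's sketch] AUDIT_FAIL leans on two preprocessor tricks: #_level stringizes the level number into the format string, and ##_a swallows the trailing comma when no extra arguments are passed (GNU-style named variadic macros, as AUDIT_FAIL itself uses). A toy version showing the same mechanics:

    #include <stdio.h>

    #define REPORT(_level, _fmt, _a...)                                     \
        printf("audit failed at level " #_level ": " _fmt "\n", ##_a)

    int main(void)
    {
        int index = 7;
        REPORT(2, "bad entry at index %d", index); /* "audit failed at level 2: ..." */
        REPORT(1, "no extra arguments");           /* ##_a drops the trailing comma  */
        return 0;
    }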
4135 static char * sh_audit_flags(struct vcpu *v, int level,
4136 int gflags, int sflags)
4137 /* Common code for auditing flag bits */
4139 if ( (sflags & _PAGE_PRESENT) && !(gflags & _PAGE_PRESENT) )
4140 return "shadow is present but guest is not present";
4141 if ( (sflags & _PAGE_GLOBAL) && !is_hvm_vcpu(v) )
4142 return "global bit set in PV shadow";
4143 if ( level == 2 && (sflags & _PAGE_PSE) )
4144 return "PS bit set in shadow";
4145 #if SHADOW_PAGING_LEVELS == 3
4146 if ( level == 3 ) return NULL; /* All the other bits are blank in PAEl3 */
4147 #endif
4148 if ( (sflags & _PAGE_PRESENT) && !(gflags & _PAGE_ACCESSED) )
4149 return "accessed bit not propagated";
4150 if ( (level == 1 || (level == 2 && (gflags & _PAGE_PSE)))
4151 && ((sflags & _PAGE_RW) && !(gflags & _PAGE_DIRTY)) )
4152 return "dirty bit not propagated";
4153 if ( (sflags & _PAGE_USER) != (gflags & _PAGE_USER) )
4154 return "user/supervisor bit does not match";
4155 if ( (sflags & _PAGE_NX_BIT) != (gflags & _PAGE_NX_BIT) )
4156 return "NX bit does not match";
4157 if ( (sflags & _PAGE_RW) && !(gflags & _PAGE_RW) )
4158 return "shadow grants write access but guest does not";
4159 return NULL;
4162 static inline mfn_t
4163 audit_gfn_to_mfn(struct vcpu *v, gfn_t gfn, mfn_t gmfn)
4164 /* Convert this gfn to an mfn in the manner appropriate for the
4165 * guest pagetable it's used in (gmfn) */
4167 if ( !shadow_mode_translate(v->domain) )
4168 return _mfn(gfn_x(gfn));
4170 if ( (mfn_to_page(gmfn)->u.inuse.type_info & PGT_type_mask)
4171 != PGT_writable_page )
4172 return _mfn(gfn_x(gfn)); /* This is a paging-disabled shadow */
4173 else
4174 return gfn_to_mfn(v->domain, gfn_x(gfn));
4178 int sh_audit_l1_table(struct vcpu *v, mfn_t sl1mfn, mfn_t x)
4180 guest_l1e_t *gl1e, *gp;
4181 shadow_l1e_t *sl1e;
4182 mfn_t mfn, gmfn, gl1mfn;
4183 gfn_t gfn;
4184 char *s;
4185 int done = 0;
4187 /* Follow the backpointer */
4188 gl1mfn = _mfn(mfn_to_shadow_page(sl1mfn)->backpointer);
4189 gl1e = gp = sh_map_domain_page(gl1mfn);
4190 SHADOW_FOREACH_L1E(sl1mfn, sl1e, &gl1e, done, {
4192 if ( sh_l1e_is_magic(*sl1e) )
4194 #if (SHADOW_OPTIMIZATIONS & SHOPT_FAST_FAULT_PATH) && SHADOW_PAGING_LEVELS > 2
4195 if ( sh_l1e_is_gnp(*sl1e) )
4197 if ( guest_l1e_get_flags(*gl1e) & _PAGE_PRESENT )
4198 AUDIT_FAIL(1, "shadow is GNP magic but guest is present");
4200 else
4202 ASSERT(sh_l1e_is_mmio(*sl1e));
4203 gfn = sh_l1e_mmio_get_gfn(*sl1e);
4204 if ( gfn_x(gfn) != gfn_x(guest_l1e_get_gfn(*gl1e)) )
4205 AUDIT_FAIL(1, "shadow MMIO gfn is %" SH_PRI_gfn
4206 " but guest gfn is %" SH_PRI_gfn,
4207 gfn_x(gfn),
4208 gfn_x(guest_l1e_get_gfn(*gl1e)));
4210 #endif
4212 else
4214 s = sh_audit_flags(v, 1, guest_l1e_get_flags(*gl1e),
4215 shadow_l1e_get_flags(*sl1e));
4216 if ( s ) AUDIT_FAIL(1, "%s", s);
4218 if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS )
4220 gfn = guest_l1e_get_gfn(*gl1e);
4221 mfn = shadow_l1e_get_mfn(*sl1e);
4222 gmfn = audit_gfn_to_mfn(v, gfn, gl1mfn);
4223 if ( mfn_x(gmfn) != mfn_x(mfn) )
4224 AUDIT_FAIL(1, "bad translation: gfn %" SH_PRI_gfn
4225 " --> %" PRI_mfn " != mfn %" PRI_mfn,
4226 gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn));
4229 });
4230 sh_unmap_domain_page(gp);
4231 return done;
4234 int sh_audit_fl1_table(struct vcpu *v, mfn_t sl1mfn, mfn_t x)
4236 guest_l1e_t *gl1e, e;
4237 shadow_l1e_t *sl1e;
4238 mfn_t gl1mfn = _mfn(INVALID_MFN);
4239 int f;
4240 int done = 0;
4242 /* fl1 has no useful backpointer: all we can check are flags */
4243 e = guest_l1e_from_gfn(_gfn(0), 0); gl1e = &e; /* Needed for macro */
4244 SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, done, {
4245 f = shadow_l1e_get_flags(*sl1e);
4246 f &= ~(_PAGE_AVAIL0|_PAGE_AVAIL1|_PAGE_AVAIL2);
4247 if ( !(f == 0
4248 || f == (_PAGE_PRESENT|_PAGE_USER|_PAGE_RW|
4249 _PAGE_ACCESSED|_PAGE_DIRTY)
4250 || f == (_PAGE_PRESENT|_PAGE_USER|_PAGE_ACCESSED|_PAGE_DIRTY)
4251 || sh_l1e_is_magic(*sl1e)) )
4252 AUDIT_FAIL(1, "fl1e has bad flags");
4253 });
4254 return 0;
4257 int sh_audit_l2_table(struct vcpu *v, mfn_t sl2mfn, mfn_t x)
4259 guest_l2e_t *gl2e, *gp;
4260 shadow_l2e_t *sl2e;
4261 mfn_t mfn, gmfn, gl2mfn;
4262 gfn_t gfn;
4263 char *s;
4264 int done = 0;
4266 /* Follow the backpointer */
4267 gl2mfn = _mfn(mfn_to_shadow_page(sl2mfn)->backpointer);
4268 gl2e = gp = sh_map_domain_page(gl2mfn);
4269 SHADOW_FOREACH_L2E(sl2mfn, sl2e, &gl2e, done, v->domain, {
4271 s = sh_audit_flags(v, 2, guest_l2e_get_flags(*gl2e),
4272 shadow_l2e_get_flags(*sl2e));
4273 if ( s ) AUDIT_FAIL(2, "%s", s);
4275 if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS )
4277 gfn = guest_l2e_get_gfn(*gl2e);
4278 mfn = shadow_l2e_get_mfn(*sl2e);
4279 gmfn = (guest_l2e_get_flags(*gl2e) & _PAGE_PSE)
4280 ? get_fl1_shadow_status(v, gfn)
4281 : get_shadow_status(v, audit_gfn_to_mfn(v, gfn, gl2mfn),
4282 SH_type_l1_shadow);
4283 if ( mfn_x(gmfn) != mfn_x(mfn) )
4284 AUDIT_FAIL(2, "bad translation: gfn %" SH_PRI_gfn
4285 " (--> %" PRI_mfn ")"
4286 " --> %" PRI_mfn " != mfn %" PRI_mfn,
4287 gfn_x(gfn),
4288 (guest_l2e_get_flags(*gl2e) & _PAGE_PSE) ? 0
4289 : mfn_x(audit_gfn_to_mfn(v, gfn, gl2mfn)),
4290 mfn_x(gmfn), mfn_x(mfn));
4292 });
4293 sh_unmap_domain_page(gp);
4294 return 0;
4297 #if GUEST_PAGING_LEVELS >= 4
4298 int sh_audit_l3_table(struct vcpu *v, mfn_t sl3mfn, mfn_t x)
4300 guest_l3e_t *gl3e, *gp;
4301 shadow_l3e_t *sl3e;
4302 mfn_t mfn, gmfn, gl3mfn;
4303 gfn_t gfn;
4304 char *s;
4305 int done = 0;
4307 /* Follow the backpointer */
4308 gl3mfn = _mfn(mfn_to_shadow_page(sl3mfn)->backpointer);
4309 gl3e = gp = sh_map_domain_page(gl3mfn);
4310 SHADOW_FOREACH_L3E(sl3mfn, sl3e, &gl3e, done, {
4312 s = sh_audit_flags(v, 3, guest_l3e_get_flags(*gl3e),
4313 shadow_l3e_get_flags(*sl3e));
4314 if ( s ) AUDIT_FAIL(3, "%s", s);
4316 if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS )
4318 gfn = guest_l3e_get_gfn(*gl3e);
4319 mfn = shadow_l3e_get_mfn(*sl3e);
4320 gmfn = get_shadow_status(v, audit_gfn_to_mfn(v, gfn, gl3mfn),
4321 ((GUEST_PAGING_LEVELS == 3 ||
4322 IS_COMPAT(v->domain))
4323 && !shadow_mode_external(v->domain)
4324 && (guest_index(gl3e) % 4) == 3)
4325 ? SH_type_l2h_shadow
4326 : SH_type_l2_shadow);
4327 if ( mfn_x(gmfn) != mfn_x(mfn) )
4328 AUDIT_FAIL(3, "bad translation: gfn %" SH_PRI_gfn
4329 " --> %" PRI_mfn " != mfn %" PRI_mfn,
4330 gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn));
4332 });
4333 sh_unmap_domain_page(gp);
4334 return 0;
4337 int sh_audit_l4_table(struct vcpu *v, mfn_t sl4mfn, mfn_t x)
4339 guest_l4e_t *gl4e, *gp;
4340 shadow_l4e_t *sl4e;
4341 mfn_t mfn, gmfn, gl4mfn;
4342 gfn_t gfn;
4343 char *s;
4344 int done = 0;
4346 /* Follow the backpointer */
4347 gl4mfn = _mfn(mfn_to_shadow_page(sl4mfn)->backpointer);
4348 gl4e = gp = sh_map_domain_page(gl4mfn);
4349 SHADOW_FOREACH_L4E(sl4mfn, sl4e, &gl4e, done, v->domain,
4351 s = sh_audit_flags(v, 4, guest_l4e_get_flags(*gl4e),
4352 shadow_l4e_get_flags(*sl4e));
4353 if ( s ) AUDIT_FAIL(4, "%s", s);
4355 if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS )
4357 gfn = guest_l4e_get_gfn(*gl4e);
4358 mfn = shadow_l4e_get_mfn(*sl4e);
4359 gmfn = get_shadow_status(v, audit_gfn_to_mfn(v, gfn, gl4mfn),
4360 SH_type_l3_shadow);
4361 if ( mfn_x(gmfn) != mfn_x(mfn) )
4362 AUDIT_FAIL(4, "bad translation: gfn %" SH_PRI_gfn
4363 " --> %" PRI_mfn " != mfn %" PRI_mfn,
4364 gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn));
4366 });
4367 sh_unmap_domain_page(gp);
4368 return 0;
4370 #endif /* GUEST_PAGING_LEVELS >= 4 */
4373 #undef AUDIT_FAIL
4375 #endif /* Audit code */
4377 /**************************************************************************/
4378 /* Entry points into this mode of the shadow code.
4379 * This will all be mangled by the preprocessor to uniquify everything. */
4380 struct paging_mode sh_paging_mode = {
4381 .page_fault = sh_page_fault,
4382 .invlpg = sh_invlpg,
4383 .gva_to_gfn = sh_gva_to_gfn,
4384 .update_cr3 = sh_update_cr3,
4385 .update_paging_modes = shadow_update_paging_modes,
4386 .write_p2m_entry = shadow_write_p2m_entry,
4387 .write_guest_entry = shadow_write_guest_entry,
4388 .cmpxchg_guest_entry = shadow_cmpxchg_guest_entry,
4389 .guest_map_l1e = sh_guest_map_l1e,
4390 .guest_get_eff_l1e = sh_guest_get_eff_l1e,
4391 .guest_levels = GUEST_PAGING_LEVELS,
4392 .shadow.detach_old_tables = sh_detach_old_tables,
4393 .shadow.x86_emulate_write = sh_x86_emulate_write,
4394 .shadow.x86_emulate_cmpxchg = sh_x86_emulate_cmpxchg,
4395 .shadow.x86_emulate_cmpxchg8b = sh_x86_emulate_cmpxchg8b,
4396 .shadow.make_monitor_table = sh_make_monitor_table,
4397 .shadow.destroy_monitor_table = sh_destroy_monitor_table,
4398 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
4399 .shadow.guess_wrmap = sh_guess_wrmap,
4400 #endif
4401 .shadow.shadow_levels = SHADOW_PAGING_LEVELS,
4402 };
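[Editor's sketch] sh_paging_mode is an ops table: a struct of function pointers plus the paging-level constants, filled in with designated initializers, that callers dispatch through; the preprocessor then uniquifies one copy per GUEST/SHADOW level combination. A minimal standalone sketch of the same pattern with hypothetical names:

    #include <stdio.h>

    struct demo_mode {
        int (*page_fault)(unsigned long va);
        unsigned int guest_levels;
    };

    static int demo_page_fault(unsigned long va)
    {
        printf("fault at %#lx\n", va);
        return 0;
    }

    static const struct demo_mode demo_pae_mode = {
        .page_fault   = demo_page_fault,
        .guest_levels = 3,
    };

    int main(void)
    {
        const struct demo_mode *m = &demo_pae_mode;  /* as if selected per-vcpu */
        return m->page_fault(0xdeadb000UL);
    }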
4404 /*
4405 * Local variables:
4406 * mode: C
4407 * c-set-style: "BSD"
4408 * c-basic-offset: 4
4409 * indent-tabs-mode: nil
4410 * End:
4411 */