xen/arch/x86/mm/shadow/multi.c @ 12465:d0e9da9cc84a (direct-io.hg)

[HVM] Disallow PTE updates and MMIO accesses from hypervisor mode
(prevents copy_to/from_guest from causing problems, for example).
Signed-off-by: Keir Fraser <keir@xensource.com>

author    kfraser@localhost.localdomain
date      Thu Nov 16 10:52:03 2006 +0000 (2006-11-16)
parents   a07d6a05792e
children  992723a0ceb1
1 /******************************************************************************
2 * arch/x86/mm/shadow/multi.c
3 *
4 * Simple, mostly-synchronous shadow page tables.
5 * Parts of this code are Copyright (c) 2006 by XenSource Inc.
6 * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
7 * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 */
24 #include <xen/config.h>
25 #include <xen/types.h>
26 #include <xen/mm.h>
27 #include <xen/trace.h>
28 #include <xen/sched.h>
29 #include <xen/perfc.h>
30 #include <xen/domain_page.h>
31 #include <asm/page.h>
32 #include <asm/current.h>
33 #include <asm/shadow.h>
34 #include <asm/flushtlb.h>
35 #include <asm/hvm/hvm.h>
36 #include "private.h"
37 #include "types.h"
39 /* THINGS TO DO LATER:
40 *
41 * TEARDOWN HEURISTICS
42 * Also: have a heuristic for when to destroy a previous paging-mode's
43 * shadows. When a guest is done with its start-of-day 32-bit tables
44 * and reuses the memory we want to drop those shadows. Start with
45 * shadows in a page in two modes as a hint, but beware of clever tricks
46 * like reusing a pagetable for both PAE and 64-bit during boot...
47 *
48 * PAE LINEAR MAPS
49 * Rework shadow_get_l*e() to have the option of using map_domain_page()
50 * instead of linear maps. Add appropriate unmap_l*e calls in the users.
51 * Then we can test the speed difference made by linear maps. If the
52 * map_domain_page() version is OK on PAE, we could maybe allow a lightweight
53 * l3-and-l2h-only shadow mode for PAE PV guests that would allow them
54 * to share l2h pages again.
55 *
56 * GUEST_WALK_TABLES TLB FLUSH COALESCE
57 * guest_walk_tables can do up to three remote TLB flushes as it walks to
58 * the first l1 of a new pagetable. Should coalesce the flushes to the end,
59 * and if we do flush, re-do the walk. If anything has changed, then
60 * pause all the other vcpus and do the walk *again*.
61 *
62 * WP DISABLED
63 * Consider how to implement having the WP bit of CR0 set to 0.
64 * Since we need to be able to cause write faults to pagetables, this might
65 * end up looking like not having the (guest) pagetables present at all in
66 * HVM guests...
67 *
68 * PSE disabled / PSE36
69 * We don't support any modes other than PSE enabled, PSE36 disabled.
70 * Neither of those would be hard to change, but we'd need to be able to
71 * deal with shadows made in one mode and used in another.
72 */
74 #define FETCH_TYPE_PREFETCH 1
75 #define FETCH_TYPE_DEMAND 2
76 #define FETCH_TYPE_WRITE 4
77 typedef enum {
78 ft_prefetch = FETCH_TYPE_PREFETCH,
79 ft_demand_read = FETCH_TYPE_DEMAND,
80 ft_demand_write = FETCH_TYPE_DEMAND | FETCH_TYPE_WRITE,
81 } fetch_type_t;
83 #ifdef DEBUG_TRACE_DUMP
84 static char *fetch_type_names[] = {
85 [ft_prefetch] "prefetch",
86 [ft_demand_read] "demand read",
87 [ft_demand_write] "demand write",
88 };
89 #endif
91 /**************************************************************************/
92 /* Hash table mapping from guest pagetables to shadows
93 *
94 * Normal case: maps the mfn of a guest page to the mfn of its shadow page.
95 * FL1's: maps the *gfn* of the start of a superpage to the mfn of a
96 * shadow L1 which maps its "splinters".
97 */
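/* For example (names here are illustrative): the shadow of a normal guest l1
 * page is found with get_shadow_status(v, gl1mfn, PGC_SH_l1_shadow), keyed by
 * the guest page's mfn; the splintered l1 for a PSE superpage has no guest
 * pagetable page behind it, so it is found with
 * get_fl1_shadow_status(v, start_gfn), keyed by the gfn at which the
 * superpage starts. */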
99 static inline mfn_t
100 get_fl1_shadow_status(struct vcpu *v, gfn_t gfn)
101 /* Look for FL1 shadows in the hash table */
102 {
103 mfn_t smfn = shadow_hash_lookup(v, gfn_x(gfn),
104 PGC_SH_fl1_shadow >> PGC_SH_type_shift);
106 if ( unlikely(shadow_mode_log_dirty(v->domain) && valid_mfn(smfn)) )
107 {
108 struct page_info *page = mfn_to_page(smfn);
109 if ( !(page->count_info & PGC_SH_log_dirty) )
110 shadow_convert_to_log_dirty(v, smfn);
111 }
113 return smfn;
114 }
116 static inline mfn_t
117 get_shadow_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type)
118 /* Look for shadows in the hash table */
119 {
120 mfn_t smfn = shadow_hash_lookup(v, mfn_x(gmfn),
121 shadow_type >> PGC_SH_type_shift);
122 perfc_incrc(shadow_get_shadow_status);
124 if ( unlikely(shadow_mode_log_dirty(v->domain) && valid_mfn(smfn)) )
125 {
126 struct page_info *page = mfn_to_page(smfn);
127 if ( !(page->count_info & PGC_SH_log_dirty) )
128 shadow_convert_to_log_dirty(v, smfn);
129 }
131 return smfn;
132 }
134 static inline void
135 set_fl1_shadow_status(struct vcpu *v, gfn_t gfn, mfn_t smfn)
136 /* Put an FL1 shadow into the hash table */
137 {
138 SHADOW_PRINTK("gfn=%"SH_PRI_gfn", type=%08x, smfn=%05lx\n",
139 gfn_x(gfn), PGC_SH_fl1_shadow, mfn_x(smfn));
141 if ( unlikely(shadow_mode_log_dirty(v->domain)) )
142 // mark this shadow as a log dirty shadow...
143 set_bit(_PGC_SH_log_dirty, &mfn_to_page(smfn)->count_info);
144 else
145 clear_bit(_PGC_SH_log_dirty, &mfn_to_page(smfn)->count_info);
147 shadow_hash_insert(v, gfn_x(gfn),
148 PGC_SH_fl1_shadow >> PGC_SH_type_shift, smfn);
149 }
151 static inline void
152 set_shadow_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type, mfn_t smfn)
153 /* Put a shadow into the hash table */
154 {
155 struct domain *d = v->domain;
156 int res;
158 SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx, type=%08x, smfn=%05lx\n",
159 d->domain_id, v->vcpu_id, mfn_x(gmfn),
160 shadow_type, mfn_x(smfn));
162 if ( unlikely(shadow_mode_log_dirty(d)) )
163 // mark this shadow as a log dirty shadow...
164 set_bit(_PGC_SH_log_dirty, &mfn_to_page(smfn)->count_info);
165 else
166 clear_bit(_PGC_SH_log_dirty, &mfn_to_page(smfn)->count_info);
168 res = get_page(mfn_to_page(gmfn), d);
169 ASSERT(res == 1);
171 shadow_hash_insert(v, mfn_x(gmfn), shadow_type >> PGC_SH_type_shift,
172 smfn);
173 }
175 static inline void
176 delete_fl1_shadow_status(struct vcpu *v, gfn_t gfn, mfn_t smfn)
177 /* Remove a shadow from the hash table */
178 {
179 SHADOW_PRINTK("gfn=%"SH_PRI_gfn", type=%08x, smfn=%05lx\n",
180 gfn_x(gfn), PGC_SH_fl1_shadow, mfn_x(smfn));
181 shadow_hash_delete(v, gfn_x(gfn),
182 PGC_SH_fl1_shadow >> PGC_SH_type_shift, smfn);
183 }
185 static inline void
186 delete_shadow_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type, mfn_t smfn)
187 /* Remove a shadow from the hash table */
188 {
189 SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx, type=%08x, smfn=%05lx\n",
190 v->domain->domain_id, v->vcpu_id,
191 mfn_x(gmfn), shadow_type, mfn_x(smfn));
192 shadow_hash_delete(v, mfn_x(gmfn),
193 shadow_type >> PGC_SH_type_shift, smfn);
194 put_page(mfn_to_page(gmfn));
195 }
197 /**************************************************************************/
198 /* CPU feature support querying */
200 static inline int
201 guest_supports_superpages(struct vcpu *v)
202 {
203 /* The _PAGE_PSE bit must be honoured in HVM guests, whenever
204 * CR4.PSE is set or the guest is in PAE or long mode */
205 return (is_hvm_vcpu(v) && (GUEST_PAGING_LEVELS != 2
206 || (hvm_get_guest_ctrl_reg(v, 4) & X86_CR4_PSE)));
207 }
209 static inline int
210 guest_supports_nx(struct vcpu *v)
211 {
212 if ( !is_hvm_vcpu(v) )
213 return cpu_has_nx;
215 // XXX - fix this!
216 return 1;
217 }
220 /**************************************************************************/
221 /* Functions for walking the guest page tables */
224 /* Walk the guest pagetables, filling the walk_t with what we see.
225 * Takes an uninitialised walk_t. The caller must call unmap_walk()
226 * on the walk_t before discarding it or calling guest_walk_tables again.
227 * If "guest_op" is non-zero, we are serving a genuine guest memory access,
228 * and must (a) be under the shadow lock, and (b) remove write access
229 * from any guest PT pages we see, as we will be using their contents to
230 * perform shadow updates.
231 * Returns 0 for success or non-zero if the guest pagetables are malformed.
232 * N.B. Finding a not-present entry does not cause a non-zero return code. */
233 static inline int
234 guest_walk_tables(struct vcpu *v, unsigned long va, walk_t *gw, int guest_op)
235 {
236 ASSERT(!guest_op || shadow_lock_is_acquired(v->domain));
238 perfc_incrc(shadow_guest_walk);
239 memset(gw, 0, sizeof(*gw));
240 gw->va = va;
242 #if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
243 #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
244 /* Get l4e from the top level table */
245 gw->l4mfn = pagetable_get_mfn(v->arch.guest_table);
246 gw->l4e = (guest_l4e_t *)v->arch.guest_vtable + guest_l4_table_offset(va);
247 /* Walk down to the l3e */
248 if ( !(guest_l4e_get_flags(*gw->l4e) & _PAGE_PRESENT) ) return 0;
249 gw->l3mfn = vcpu_gfn_to_mfn(v, guest_l4e_get_gfn(*gw->l4e));
250 if ( !valid_mfn(gw->l3mfn) ) return 1;
251 /* This mfn is a pagetable: make sure the guest can't write to it. */
252 if ( guest_op && shadow_remove_write_access(v, gw->l3mfn, 3, va) != 0 )
253 flush_tlb_mask(v->domain->domain_dirty_cpumask);
254 gw->l3e = ((guest_l3e_t *)sh_map_domain_page(gw->l3mfn))
255 + guest_l3_table_offset(va);
256 #else /* PAE only... */
257 /* Get l3e from the top level table */
258 gw->l3mfn = pagetable_get_mfn(v->arch.guest_table);
259 gw->l3e = (guest_l3e_t *)v->arch.guest_vtable + guest_l3_table_offset(va);
260 #endif /* PAE or 64... */
261 /* Walk down to the l2e */
262 if ( !(guest_l3e_get_flags(*gw->l3e) & _PAGE_PRESENT) ) return 0;
263 gw->l2mfn = vcpu_gfn_to_mfn(v, guest_l3e_get_gfn(*gw->l3e));
264 if ( !valid_mfn(gw->l2mfn) ) return 1;
265 /* This mfn is a pagetable: make sure the guest can't write to it. */
266 if ( guest_op && shadow_remove_write_access(v, gw->l2mfn, 2, va) != 0 )
267 flush_tlb_mask(v->domain->domain_dirty_cpumask);
268 gw->l2e = ((guest_l2e_t *)sh_map_domain_page(gw->l2mfn))
269 + guest_l2_table_offset(va);
270 #else /* 32-bit only... */
271 /* Get l2e from the top level table */
272 gw->l2mfn = pagetable_get_mfn(v->arch.guest_table);
273 gw->l2e = (guest_l2e_t *)v->arch.guest_vtable + guest_l2_table_offset(va);
274 #endif /* All levels... */
276 if ( !(guest_l2e_get_flags(*gw->l2e) & _PAGE_PRESENT) ) return 0;
277 if ( guest_supports_superpages(v) &&
278 (guest_l2e_get_flags(*gw->l2e) & _PAGE_PSE) )
279 {
280 /* Special case: this guest VA is in a PSE superpage, so there's
281 * no guest l1e. We make one up so that the propagation code
282 * can generate a shadow l1 table. Start with the gfn of the
283 * first 4k-page of the superpage. */
284 gfn_t start = guest_l2e_get_gfn(*gw->l2e);
285 /* Grant full access in the l1e, since all the guest entry's
286 * access controls are enforced in the shadow l2e. This lets
287 * us reflect l2 changes later without touching the l1s. */
288 int flags = (_PAGE_PRESENT|_PAGE_USER|_PAGE_RW|
289 _PAGE_ACCESSED|_PAGE_DIRTY);
290 /* PSE level 2 entries use bit 12 for PAT; propagate it to bit 7
291 * of the level 1 */
292 if ( (guest_l2e_get_flags(*gw->l2e) & _PAGE_PSE_PAT) )
293 flags |= _PAGE_PAT;
294 /* Increment the pfn by the right number of 4k pages.
295 * The ~0x1 is to mask out the PAT bit mentioned above. */
296 start = _gfn((gfn_x(start) & ~0x1) + guest_l1_table_offset(va));
297 gw->eff_l1e = guest_l1e_from_gfn(start, flags);
298 gw->l1e = NULL;
299 gw->l1mfn = _mfn(INVALID_MFN);
300 }
301 else
302 {
303 /* Not a superpage: carry on and find the l1e. */
304 gw->l1mfn = vcpu_gfn_to_mfn(v, guest_l2e_get_gfn(*gw->l2e));
305 if ( !valid_mfn(gw->l1mfn) ) return 1;
306 /* This mfn is a pagetable: make sure the guest can't write to it. */
307 if ( guest_op
308 && shadow_remove_write_access(v, gw->l1mfn, 1, va) != 0 )
309 flush_tlb_mask(v->domain->domain_dirty_cpumask);
310 gw->l1e = ((guest_l1e_t *)sh_map_domain_page(gw->l1mfn))
311 + guest_l1_table_offset(va);
312 gw->eff_l1e = *gw->l1e;
313 }
315 return 0;
316 }
318 /* Given a walk_t, translate the gw->va into the guest's notion of the
319 * corresponding frame number. */
320 static inline gfn_t
321 guest_walk_to_gfn(walk_t *gw)
322 {
323 if ( !(guest_l1e_get_flags(gw->eff_l1e) & _PAGE_PRESENT) )
324 return _gfn(INVALID_GFN);
325 return guest_l1e_get_gfn(gw->eff_l1e);
326 }
328 /* Given a walk_t, translate the gw->va into the guest's notion of the
329 * corresponding physical address. */
330 static inline paddr_t
331 guest_walk_to_gpa(walk_t *gw)
332 {
333 if ( !(guest_l1e_get_flags(gw->eff_l1e) & _PAGE_PRESENT) )
334 return 0;
335 return guest_l1e_get_paddr(gw->eff_l1e) + (gw->va & ~PAGE_MASK);
336 }
339 /* Unmap (and reinitialise) a guest walk.
340 * Call this to dispose of any walk filled in by guest_walk_tables() */
341 static void unmap_walk(struct vcpu *v, walk_t *gw)
342 {
343 #if GUEST_PAGING_LEVELS >= 3
344 #if GUEST_PAGING_LEVELS >= 4
345 if ( gw->l3e != NULL ) sh_unmap_domain_page(gw->l3e);
346 #endif
347 if ( gw->l2e != NULL ) sh_unmap_domain_page(gw->l2e);
348 #endif
349 if ( gw->l1e != NULL ) sh_unmap_domain_page(gw->l1e);
350 #ifdef DEBUG
351 memset(gw, 0, sizeof(*gw));
352 #endif
353 }
356 /* Pretty-print the contents of a guest-walk */
357 static inline void print_gw(walk_t *gw)
358 {
359 SHADOW_PRINTK("GUEST WALK TO %#lx:\n", gw->va);
360 #if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
361 #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
362 SHADOW_PRINTK(" l4mfn=%" SH_PRI_mfn "\n", mfn_x(gw->l4mfn));
363 SHADOW_PRINTK(" l4e=%p\n", gw->l4e);
364 if ( gw->l4e )
365 SHADOW_PRINTK(" *l4e=%" SH_PRI_gpte "\n", gw->l4e->l4);
366 #endif /* PAE or 64... */
367 SHADOW_PRINTK(" l3mfn=%" SH_PRI_mfn "\n", mfn_x(gw->l3mfn));
368 SHADOW_PRINTK(" l3e=%p\n", gw->l3e);
369 if ( gw->l3e )
370 SHADOW_PRINTK(" *l3e=%" SH_PRI_gpte "\n", gw->l3e->l3);
371 #endif /* All levels... */
372 SHADOW_PRINTK(" l2mfn=%" SH_PRI_mfn "\n", mfn_x(gw->l2mfn));
373 SHADOW_PRINTK(" l2e=%p\n", gw->l2e);
374 if ( gw->l2e )
375 SHADOW_PRINTK(" *l2e=%" SH_PRI_gpte "\n", gw->l2e->l2);
376 SHADOW_PRINTK(" l1mfn=%" SH_PRI_mfn "\n", mfn_x(gw->l1mfn));
377 SHADOW_PRINTK(" l1e=%p\n", gw->l1e);
378 if ( gw->l1e )
379 SHADOW_PRINTK(" *l1e=%" SH_PRI_gpte "\n", gw->l1e->l1);
380 SHADOW_PRINTK(" eff_l1e=%" SH_PRI_gpte "\n", gw->eff_l1e.l1);
381 }
384 #if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES
385 /* Lightweight audit: pass all the shadows associated with this guest walk
386 * through the audit mechanisms */
387 static void sh_audit_gw(struct vcpu *v, walk_t *gw)
388 {
389 mfn_t smfn;
391 if ( !(SHADOW_AUDIT_ENABLE) )
392 return;
394 #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
395 if ( valid_mfn(gw->l4mfn)
396 && valid_mfn((smfn = get_shadow_status(v, gw->l4mfn,
397 PGC_SH_l4_shadow))) )
398 (void) sh_audit_l4_table(v, smfn, _mfn(INVALID_MFN));
399 if ( valid_mfn(gw->l3mfn)
400 && valid_mfn((smfn = get_shadow_status(v, gw->l3mfn,
401 PGC_SH_l3_shadow))) )
402 (void) sh_audit_l3_table(v, smfn, _mfn(INVALID_MFN));
403 #endif /* PAE or 64... */
404 if ( valid_mfn(gw->l2mfn) )
405 {
406 if ( valid_mfn((smfn = get_shadow_status(v, gw->l2mfn,
407 PGC_SH_l2_shadow))) )
408 (void) sh_audit_l2_table(v, smfn, _mfn(INVALID_MFN));
409 #if GUEST_PAGING_LEVELS == 3
410 if ( valid_mfn((smfn = get_shadow_status(v, gw->l2mfn,
411 PGC_SH_l2h_shadow))) )
412 (void) sh_audit_l2_table(v, smfn, _mfn(INVALID_MFN));
413 #endif
414 }
415 if ( valid_mfn(gw->l1mfn)
416 && valid_mfn((smfn = get_shadow_status(v, gw->l1mfn,
417 PGC_SH_l1_shadow))) )
418 (void) sh_audit_l1_table(v, smfn, _mfn(INVALID_MFN));
419 else if ( gw->l2e
420 && (guest_l2e_get_flags(*gw->l2e) & _PAGE_PSE)
421 && valid_mfn(
422 (smfn = get_fl1_shadow_status(v, guest_l2e_get_gfn(*gw->l2e)))) )
423 (void) sh_audit_fl1_table(v, smfn, _mfn(INVALID_MFN));
424 }
426 #else
427 #define sh_audit_gw(_v, _gw) do {} while(0)
428 #endif /* audit code */
432 /**************************************************************************/
433 /* Function to write to the guest tables, for propagating accessed and
434 * dirty bits from the shadow to the guest.
435 * Takes a guest mfn, a pointer to the guest entry, the level of pagetable,
436 * and an operation type. The guest entry is always passed as an l1e:
437 * since we only ever write flags, that's OK.
438 * Returns the new flag bits of the guest entry. */
440 static u32 guest_set_ad_bits(struct vcpu *v,
441 mfn_t gmfn,
442 guest_l1e_t *ep,
443 unsigned int level,
444 fetch_type_t ft)
445 {
446 u32 flags;
447 int res = 0;
449 ASSERT(ep && !(((unsigned long)ep) & ((sizeof *ep) - 1)));
450 ASSERT(level <= GUEST_PAGING_LEVELS);
451 ASSERT(shadow_lock_is_acquired(v->domain));
453 flags = guest_l1e_get_flags(*ep);
455 /* Only set A and D bits for guest-initiated accesses */
456 if ( !(ft & FETCH_TYPE_DEMAND) )
457 return flags;
459 ASSERT(valid_mfn(gmfn)
460 && (sh_mfn_is_a_page_table(gmfn)
461 || ((mfn_to_page(gmfn)->u.inuse.type_info & PGT_count_mask)
462 == 0)));
464 /* PAE l3s do not have A and D bits */
465 ASSERT(GUEST_PAGING_LEVELS > 3 || level != 3);
467 /* Need the D bit as well for writes, in L1es and PSE L2es. */
468 if ( ft == ft_demand_write
469 && (level == 1 ||
470 (level == 2 && (flags & _PAGE_PSE) && guest_supports_superpages(v))) )
471 {
472 if ( (flags & (_PAGE_DIRTY | _PAGE_ACCESSED))
473 == (_PAGE_DIRTY | _PAGE_ACCESSED) )
474 return flags; /* Guest already has A and D bits set */
475 flags |= _PAGE_DIRTY | _PAGE_ACCESSED;
476 perfc_incrc(shadow_ad_update);
477 }
478 else
479 {
480 if ( flags & _PAGE_ACCESSED )
481 return flags; /* Guest already has A bit set */
482 flags |= _PAGE_ACCESSED;
483 perfc_incrc(shadow_a_update);
484 }
486 /* Set the bit(s) */
487 sh_mark_dirty(v->domain, gmfn);
488 SHADOW_DEBUG(A_AND_D, "gfn = %" SH_PRI_gfn ", "
489 "old flags = %#x, new flags = %#x\n",
490 gfn_x(guest_l1e_get_gfn(*ep)), guest_l1e_get_flags(*ep),
491 flags);
492 *ep = guest_l1e_from_gfn(guest_l1e_get_gfn(*ep), flags);
494 /* Propagate this change to any other shadows of the page
495 * (only necessary if there is more than one shadow) */
496 if ( mfn_to_page(gmfn)->count_info & PGC_page_table )
497 {
498 u32 shflags = mfn_to_page(gmfn)->shadow_flags & SHF_page_type_mask;
499 /* More than one type bit set in shadow-flags? */
500 if ( shflags & ~(1UL << find_first_set_bit(shflags)) )
501 res = __shadow_validate_guest_entry(v, gmfn, ep, sizeof(*ep));
502 }
504 /* We should never need to flush the TLB or recopy PAE entries */
505 ASSERT((res == 0) || (res == SHADOW_SET_CHANGED));
507 return flags;
508 }
510 #if (CONFIG_PAGING_LEVELS == GUEST_PAGING_LEVELS) && (CONFIG_PAGING_LEVELS == SHADOW_PAGING_LEVELS)
511 void *
512 sh_guest_map_l1e(struct vcpu *v, unsigned long addr,
513 unsigned long *gl1mfn)
514 {
515 void *pl1e = NULL;
516 walk_t gw;
518 ASSERT(shadow_mode_translate(v->domain));
520 // XXX -- this is expensive, but it's easy to cobble together...
521 // FIXME!
523 shadow_lock(v->domain);
524 guest_walk_tables(v, addr, &gw, 1);
526 if ( gw.l2e &&
527 (guest_l2e_get_flags(*gw.l2e) & _PAGE_PRESENT) &&
528 !(guest_supports_superpages(v) && (guest_l2e_get_flags(*gw.l2e) & _PAGE_PSE)) )
529 {
530 if ( gl1mfn )
531 *gl1mfn = mfn_x(gw.l1mfn);
532 pl1e = map_domain_page(mfn_x(gw.l1mfn)) +
533 (guest_l1_table_offset(addr) * sizeof(guest_l1e_t));
534 }
536 unmap_walk(v, &gw);
537 shadow_unlock(v->domain);
539 return pl1e;
540 }
542 void
543 sh_guest_get_eff_l1e(struct vcpu *v, unsigned long addr, void *eff_l1e)
544 {
545 walk_t gw;
547 ASSERT(shadow_mode_translate(v->domain));
549 // XXX -- this is expensive, but it's easy to cobble together...
550 // FIXME!
552 shadow_lock(v->domain);
553 guest_walk_tables(v, addr, &gw, 1);
554 *(guest_l1e_t *)eff_l1e = gw.eff_l1e;
555 unmap_walk(v, &gw);
556 shadow_unlock(v->domain);
557 }
558 #endif /* CONFIG==SHADOW==GUEST */
560 /**************************************************************************/
561 /* Functions to compute the correct index into a shadow page, given an
562 * index into the guest page (as returned by guest_get_index()).
563 * This is trivial when the shadow and guest use the same sized PTEs, but
564 * gets more interesting when those sizes are mismatched (e.g. 32-bit guest,
565 * PAE- or 64-bit shadows).
566 *
567 * These functions also increment the shadow mfn, when necessary. When PTE
568 * sizes are mismatched, it takes 2 shadow L1 pages for a single guest L1
569 * page. In this case, we allocate 2 contiguous pages for the shadow L1, and
570 * use simple pointer arithmetic on a pointer to the guest L1e to figure out
571 * which shadow page we really want. Similarly, when PTE sizes are
572 * mismatched, we shadow a guest L2 page with 4 shadow L2 pages. (The easiest
573 * way to see this is: a 32-bit guest L2 page maps 4GB of virtual address
574 * space, while a PAE- or 64-bit shadow L2 page maps 1GB of virtual address
575 * space.)
576 *
577 * For PAE guests, for every 32-bytes of guest L3 page table, we use 64-bytes
578 * of shadow (to store both the shadow, and the info that would normally be
579 * stored in page_info fields). This arrangement allows the shadow and the
580 * "page_info" fields to always be stored in the same page (in fact, in
581 * the same cache line), avoiding an extra call to map_domain_page().
582 */
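/* Worked example (assuming a 32-bit guest on PAE or 64-bit shadows, i.e.
 * GUEST_PAGING_LEVELS == 2 and SHADOW_PAGING_LEVELS > 2, so guest tables have
 * 1024 4-byte entries and shadow tables have 512 8-byte entries):
 *
 *   shadow_l1_index(&smfn, 700): smfn += 700 / 512 = 1 (second shadow page),
 *                                returns 700 % 512 = 188.
 *   shadow_l2_index(&smfn, 700): 256 guest l2es per shadow page, so
 *                                smfn += 700 / 256 = 2 (third shadow page),
 *                                returns (700 % 256) * 2 = 376, the first of
 *                                the pair of shadow l2es for that guest l2e.
 *
 * When guest and shadow PTE sizes match, both functions return guest_index
 * unchanged and leave the mfn alone. */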
584 static inline u32
585 guest_index(void *ptr)
586 {
587 return (u32)((unsigned long)ptr & ~PAGE_MASK) / sizeof(guest_l1e_t);
588 }
590 static inline u32
591 shadow_l1_index(mfn_t *smfn, u32 guest_index)
592 {
593 #if (GUEST_PAGING_LEVELS == 2) && (SHADOW_PAGING_LEVELS != 2)
594 *smfn = _mfn(mfn_x(*smfn) +
595 (guest_index / SHADOW_L1_PAGETABLE_ENTRIES));
596 return (guest_index % SHADOW_L1_PAGETABLE_ENTRIES);
597 #else
598 return guest_index;
599 #endif
600 }
602 static inline u32
603 shadow_l2_index(mfn_t *smfn, u32 guest_index)
604 {
605 #if (GUEST_PAGING_LEVELS == 2) && (SHADOW_PAGING_LEVELS != 2)
606 // Because we use 2 shadow l2 entries for each guest entry, the number of
607 // guest entries per shadow page is SHADOW_L2_PAGETABLE_ENTRIES/2
608 //
609 *smfn = _mfn(mfn_x(*smfn) +
610 (guest_index / (SHADOW_L2_PAGETABLE_ENTRIES / 2)));
612 // We multiply by two to get the index of the first of the two entries
613 // used to shadow the specified guest entry.
614 return (guest_index % (SHADOW_L2_PAGETABLE_ENTRIES / 2)) * 2;
615 #else
616 return guest_index;
617 #endif
618 }
620 #if GUEST_PAGING_LEVELS >= 4
622 static inline u32
623 shadow_l3_index(mfn_t *smfn, u32 guest_index)
624 {
625 return guest_index;
626 }
628 static inline u32
629 shadow_l4_index(mfn_t *smfn, u32 guest_index)
630 {
631 return guest_index;
632 }
634 #endif // GUEST_PAGING_LEVELS >= 4
637 /**************************************************************************/
638 /* Function which computes shadow entries from their corresponding guest
639 * entries. This is the "heart" of the shadow code. It operates using
640 * level-1 shadow types, but handles all levels of entry.
641 * Don't call it directly, but use the four wrappers below.
642 */
644 static always_inline void
645 _sh_propagate(struct vcpu *v,
646 void *guest_entry_ptr,
647 mfn_t guest_table_mfn,
648 mfn_t target_mfn,
649 void *shadow_entry_ptr,
650 int level,
651 fetch_type_t ft,
652 int mmio)
653 {
654 guest_l1e_t *gp = guest_entry_ptr;
655 shadow_l1e_t *sp = shadow_entry_ptr;
656 struct domain *d = v->domain;
657 u32 pass_thru_flags;
658 u32 gflags, sflags;
660 /* We don't shadow PAE l3s */
661 ASSERT(GUEST_PAGING_LEVELS > 3 || level != 3);
663 if ( valid_mfn(guest_table_mfn) )
664 /* Handle A and D bit propagation into the guest */
665 gflags = guest_set_ad_bits(v, guest_table_mfn, gp, level, ft);
666 else
667 {
668 /* Must be an fl1e or a prefetch */
669 ASSERT(level==1 || !(ft & FETCH_TYPE_DEMAND));
670 gflags = guest_l1e_get_flags(*gp);
671 }
673 if ( unlikely(!(gflags & _PAGE_PRESENT)) )
674 {
675 /* If a guest l1 entry is not present, shadow with the magic
676 * guest-not-present entry. */
677 if ( level == 1 )
678 *sp = sh_l1e_gnp();
679 else
680 *sp = shadow_l1e_empty();
681 goto done;
682 }
684 if ( level == 1 && mmio )
685 {
686 /* Guest l1e maps MMIO space */
687 *sp = sh_l1e_mmio(guest_l1e_get_gfn(*gp), gflags);
688 goto done;
689 }
691 // Must have a valid target_mfn, unless this is a prefetch. In the
692 // case of a prefetch, an invalid mfn means that we can not usefully
693 // shadow anything, and so we return early.
694 //
695 if ( !valid_mfn(target_mfn) )
696 {
697 ASSERT((ft == ft_prefetch));
698 *sp = shadow_l1e_empty();
699 goto done;
700 }
702 // Propagate bits from the guest to the shadow.
703 // Some of these may be overwritten, below.
704 // Since we know the guest's PRESENT bit is set, we also set the shadow's
705 // SHADOW_PRESENT bit.
706 //
707 pass_thru_flags = (_PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_USER |
708 _PAGE_RW | _PAGE_PRESENT);
709 if ( guest_supports_nx(v) )
710 pass_thru_flags |= _PAGE_NX_BIT;
711 sflags = gflags & pass_thru_flags;
713 // Set the A&D bits for higher level shadows.
714 // Higher level entries do not, strictly speaking, have dirty bits, but
715 // since we use shadow linear tables, each of these entries may, at some
716 // point in time, also serve as a shadow L1 entry.
717 // By setting both the A&D bits in each of these, we eliminate the burden
718 // on the hardware to update these bits on initial accesses.
719 //
720 if ( (level > 1) && !((SHADOW_PAGING_LEVELS == 3) && (level == 3)) )
721 sflags |= _PAGE_ACCESSED | _PAGE_DIRTY;
723 // If the A or D bit has not yet been set in the guest, then we must
724 // prevent the corresponding kind of access.
725 //
726 if ( unlikely(!(gflags & _PAGE_ACCESSED)) )
727 sflags &= ~_PAGE_PRESENT;
729 /* D bits exist in L1es and PSE L2es */
730 if ( unlikely(((level == 1) ||
731 ((level == 2) &&
732 (gflags & _PAGE_PSE) &&
733 guest_supports_superpages(v)))
734 && !(gflags & _PAGE_DIRTY)) )
735 sflags &= ~_PAGE_RW;
737 // shadow_mode_log_dirty support
738 //
739 // Only allow the guest write access to a page a) on a demand fault,
740 // or b) if the page is already marked as dirty.
741 //
742 if ( unlikely((level == 1) && shadow_mode_log_dirty(d)) )
743 {
744 if ( ft & FETCH_TYPE_WRITE )
745 sh_mark_dirty(d, target_mfn);
746 else if ( !sh_mfn_is_dirty(d, target_mfn) )
747 sflags &= ~_PAGE_RW;
748 }
750 // protect guest page tables
751 //
752 if ( unlikely((level == 1) && sh_mfn_is_a_page_table(target_mfn)) )
753 {
754 if ( shadow_mode_trap_reads(d) )
755 {
756 // if we are trapping both reads & writes, then mark this page
757 // as not present...
758 //
759 sflags &= ~_PAGE_PRESENT;
760 }
761 else
762 {
763 // otherwise, just prevent any writes...
764 //
765 sflags &= ~_PAGE_RW;
766 }
767 }
769 // PV guests in 64-bit mode use two different page tables for user vs
770 // supervisor permissions, making the guest's _PAGE_USER bit irrelevant.
771 // It is always shadowed as present...
772 if ( (GUEST_PAGING_LEVELS == 4) && !is_hvm_domain(d) )
773 {
774 sflags |= _PAGE_USER;
775 }
777 *sp = shadow_l1e_from_mfn(target_mfn, sflags);
778 done:
779 SHADOW_DEBUG(PROPAGATE,
780 "%s level %u guest %" SH_PRI_gpte " shadow %" SH_PRI_pte "\n",
781 fetch_type_names[ft], level, gp->l1, sp->l1);
782 }
785 /* These four wrappers give us a little bit of type-safety back around the
786 * use of void-* pointers in _sh_propagate(), and allow the compiler to
787 * optimize out some level checks. */
789 #if GUEST_PAGING_LEVELS >= 4
790 static void
791 l4e_propagate_from_guest(struct vcpu *v,
792 guest_l4e_t *gl4e,
793 mfn_t gl4mfn,
794 mfn_t sl3mfn,
795 shadow_l4e_t *sl4e,
796 fetch_type_t ft)
797 {
798 _sh_propagate(v, gl4e, gl4mfn, sl3mfn, sl4e, 4, ft, 0);
799 }
801 static void
802 l3e_propagate_from_guest(struct vcpu *v,
803 guest_l3e_t *gl3e,
804 mfn_t gl3mfn,
805 mfn_t sl2mfn,
806 shadow_l3e_t *sl3e,
807 fetch_type_t ft)
808 {
809 _sh_propagate(v, gl3e, gl3mfn, sl2mfn, sl3e, 3, ft, 0);
810 }
811 #endif // GUEST_PAGING_LEVELS >= 4
813 static void
814 l2e_propagate_from_guest(struct vcpu *v,
815 guest_l2e_t *gl2e,
816 mfn_t gl2mfn,
817 mfn_t sl1mfn,
818 shadow_l2e_t *sl2e,
819 fetch_type_t ft)
820 {
821 _sh_propagate(v, gl2e, gl2mfn, sl1mfn, sl2e, 2, ft, 0);
822 }
824 static void
825 l1e_propagate_from_guest(struct vcpu *v,
826 guest_l1e_t *gl1e,
827 mfn_t gl1mfn,
828 mfn_t gmfn,
829 shadow_l1e_t *sl1e,
830 fetch_type_t ft,
831 int mmio)
832 {
833 _sh_propagate(v, gl1e, gl1mfn, gmfn, sl1e, 1, ft, mmio);
834 }
837 /**************************************************************************/
838 /* These functions update shadow entries (and do bookkeeping on the shadow
839 * tables they are in). It is intended that they are the only
840 * functions which ever write (non-zero) data onto a shadow page.
841 *
842 * They return a set of flags:
843 * SHADOW_SET_CHANGED -- we actually wrote a new value to the shadow.
844 * SHADOW_SET_FLUSH -- the caller must cause a TLB flush.
845 * SHADOW_SET_ERROR -- the input is not a valid entry (for example, if
846 * shadow_get_page_from_l1e() fails).
847 */
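/* Illustrative caller pattern (the get-and-create functions below consume
 * these return flags; the names used here are only an example):
 *
 *     int r = shadow_set_l2e(v, sl2e, new_sl2e, sl2mfn);
 *     if ( r & SHADOW_SET_ERROR )
 *         // the new entry was rejected; an empty entry was written instead
 *         ...
 *     if ( r & SHADOW_SET_FLUSH )
 *         flush_tlb_mask(v->domain->domain_dirty_cpumask);
 */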
849 static inline void safe_write_entry(void *dst, void *src)
850 /* Copy one PTE safely when processors might be running on the
851 * destination pagetable. This does *not* give safety against
852 * concurrent writes (that's what the shadow lock is for), just
853 * stops the hardware picking up partially written entries. */
854 {
855 volatile unsigned long *d = dst;
856 unsigned long *s = src;
857 ASSERT(!((unsigned long) d & (sizeof (shadow_l1e_t) - 1)));
858 #if CONFIG_PAGING_LEVELS == 3
859 /* In PAE mode, pagetable entries are larger
860 * than machine words, so won't get written atomically. We need to make
861 * sure any other cpu running on these shadows doesn't see a
862 * half-written entry. Do this by marking the entry not-present first,
863 * then writing the high word before the low word. */
864 BUILD_BUG_ON(sizeof (shadow_l1e_t) != 2 * sizeof (unsigned long));
865 d[0] = 0;
866 d[1] = s[1];
867 d[0] = s[0];
868 #else
869 /* In 32-bit and 64-bit, sizeof(pte) == sizeof(ulong) == 1 word,
870 * which will be an atomic write, since the entry is aligned. */
871 BUILD_BUG_ON(sizeof (shadow_l1e_t) != sizeof (unsigned long));
872 *d = *s;
873 #endif
874 }
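/* Why the PAE ordering above is safe: the present bit lives in the low word,
 * so a processor reading the entry concurrently sees either the old entry, a
 * not-present entry, or the complete new entry, but never the new high word
 * paired with the old low word. */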
877 static inline void
878 shadow_write_entries(void *d, void *s, int entries, mfn_t mfn)
879 /* This function does the actual writes to shadow pages.
880 * It must not be called directly, since it doesn't do the bookkeeping
881 * that shadow_set_l*e() functions do. */
882 {
883 shadow_l1e_t *dst = d;
884 shadow_l1e_t *src = s;
885 void *map = NULL;
886 int i;
888 /* Because we mirror access rights at all levels in the shadow, an
889 * l2 (or higher) entry with the RW bit cleared will leave us with
890 * no write access through the linear map.
891 * We detect that by writing to the shadow with copy_to_user() and
892 * using map_domain_page() to get a writeable mapping if we need to. */
893 if ( __copy_to_user(d, d, sizeof (unsigned long)) != 0 )
894 {
895 perfc_incrc(shadow_linear_map_failed);
896 map = sh_map_domain_page(mfn);
897 ASSERT(map != NULL);
898 dst = map + ((unsigned long)dst & (PAGE_SIZE - 1));
899 }
902 for ( i = 0; i < entries; i++ )
903 safe_write_entry(dst++, src++);
905 if ( map != NULL ) sh_unmap_domain_page(map);
906 }
908 static inline int
909 perms_strictly_increased(u32 old_flags, u32 new_flags)
910 /* Given the flags of two entries, are the new flags a strict
911 * increase in rights over the old ones? */
912 {
913 u32 of = old_flags & (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX);
914 u32 nf = new_flags & (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX);
915 /* Flip the NX bit, since it's the only one that decreases rights;
916 * we calculate as if it were an "X" bit. */
917 of ^= _PAGE_NX_BIT;
918 nf ^= _PAGE_NX_BIT;
919 /* If the changed bits are all set in the new flags, then rights strictly
920 * increased between old and new. */
921 return ((of | (of ^ nf)) == nf);
922 }
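/* Worked example: going from PRESENT|RW to PRESENT|RW|USER only adds rights,
 * so no flush is needed; going from PRESENT|USER to PRESENT|RW drops USER, so
 * stale TLB entries could still allow user-mode access and a flush is
 * required. NX is flipped before the comparison because setting it *removes*
 * a right (execute). */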
924 static int inline
925 shadow_get_page_from_l1e(shadow_l1e_t sl1e, struct domain *d)
926 {
927 int res;
928 mfn_t mfn;
929 struct domain *owner;
931 ASSERT(!sh_l1e_is_magic(sl1e));
933 if ( !shadow_mode_refcounts(d) )
934 return 1;
936 res = get_page_from_l1e(sl1e, d);
938 // If a privileged domain is attempting to install a map of a page it does
939 // not own, we let it succeed anyway.
940 //
941 if ( unlikely(!res) &&
942 IS_PRIV(d) &&
943 !shadow_mode_translate(d) &&
944 valid_mfn(mfn = shadow_l1e_get_mfn(sl1e)) &&
945 (owner = page_get_owner(mfn_to_page(mfn))) &&
946 (d != owner) )
947 {
948 res = get_page_from_l1e(sl1e, owner);
949 SHADOW_PRINTK("privileged domain %d installs map of mfn %05lx "
950 "which is owned by domain %d: %s\n",
951 d->domain_id, mfn_x(mfn), owner->domain_id,
952 res ? "success" : "failed");
953 }
955 if ( unlikely(!res) )
956 {
957 perfc_incrc(shadow_get_page_fail);
958 SHADOW_PRINTK("failed: l1e=" SH_PRI_pte "\n", sl1e.l1);
959 }
961 return res;
962 }
964 static void inline
965 shadow_put_page_from_l1e(shadow_l1e_t sl1e, struct domain *d)
966 {
967 if ( !shadow_mode_refcounts(d) )
968 return;
970 put_page_from_l1e(sl1e, d);
971 }
973 #if GUEST_PAGING_LEVELS >= 4
974 static int shadow_set_l4e(struct vcpu *v,
975 shadow_l4e_t *sl4e,
976 shadow_l4e_t new_sl4e,
977 mfn_t sl4mfn)
978 {
979 int flags = 0;
980 shadow_l4e_t old_sl4e;
981 paddr_t paddr;
982 ASSERT(sl4e != NULL);
983 old_sl4e = *sl4e;
985 if ( old_sl4e.l4 == new_sl4e.l4 ) return 0; /* Nothing to do */
987 paddr = ((((paddr_t)mfn_x(sl4mfn)) << PAGE_SHIFT)
988 | (((unsigned long)sl4e) & ~PAGE_MASK));
990 if ( shadow_l4e_get_flags(new_sl4e) & _PAGE_PRESENT )
991 {
992 /* About to install a new reference */
993 sh_get_ref(shadow_l4e_get_mfn(new_sl4e), paddr);
994 }
996 /* Write the new entry */
997 shadow_write_entries(sl4e, &new_sl4e, 1, sl4mfn);
998 flags |= SHADOW_SET_CHANGED;
1000 if ( shadow_l4e_get_flags(old_sl4e) & _PAGE_PRESENT )
1002 /* We lost a reference to an old mfn. */
1003 mfn_t osl3mfn = shadow_l4e_get_mfn(old_sl4e);
1004 if ( (mfn_x(osl3mfn) != mfn_x(shadow_l4e_get_mfn(new_sl4e)))
1005 || !perms_strictly_increased(shadow_l4e_get_flags(old_sl4e),
1006 shadow_l4e_get_flags(new_sl4e)) )
1008 flags |= SHADOW_SET_FLUSH;
1010 sh_put_ref(v, osl3mfn, paddr);
1012 return flags;
1015 static int shadow_set_l3e(struct vcpu *v,
1016 shadow_l3e_t *sl3e,
1017 shadow_l3e_t new_sl3e,
1018 mfn_t sl3mfn)
1020 int flags = 0;
1021 shadow_l3e_t old_sl3e;
1022 paddr_t paddr;
1023 ASSERT(sl3e != NULL);
1024 old_sl3e = *sl3e;
1026 if ( old_sl3e.l3 == new_sl3e.l3 ) return 0; /* Nothing to do */
1028 paddr = ((((paddr_t)mfn_x(sl3mfn)) << PAGE_SHIFT)
1029 | (((unsigned long)sl3e) & ~PAGE_MASK));
1031 if ( shadow_l3e_get_flags(new_sl3e) & _PAGE_PRESENT )
1033 /* About to install a new reference */
1034 sh_get_ref(shadow_l3e_get_mfn(new_sl3e), paddr);
1037 /* Write the new entry */
1038 shadow_write_entries(sl3e, &new_sl3e, 1, sl3mfn);
1039 flags |= SHADOW_SET_CHANGED;
1041 if ( shadow_l3e_get_flags(old_sl3e) & _PAGE_PRESENT )
1043 /* We lost a reference to an old mfn. */
1044 mfn_t osl2mfn = shadow_l3e_get_mfn(old_sl3e);
1045 if ( (mfn_x(osl2mfn) != mfn_x(shadow_l3e_get_mfn(new_sl3e))) ||
1046 !perms_strictly_increased(shadow_l3e_get_flags(old_sl3e),
1047 shadow_l3e_get_flags(new_sl3e)) )
1049 flags |= SHADOW_SET_FLUSH;
1051 sh_put_ref(v, osl2mfn, paddr);
1053 return flags;
1055 #endif /* GUEST_PAGING_LEVELS >= 4 */
1057 static int shadow_set_l2e(struct vcpu *v,
1058 shadow_l2e_t *sl2e,
1059 shadow_l2e_t new_sl2e,
1060 mfn_t sl2mfn)
1062 int flags = 0;
1063 shadow_l2e_t old_sl2e;
1064 paddr_t paddr;
1066 #if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2
1067 /* In 2-on-3 we work with pairs of l2es pointing at two-page
1068 * shadows. Reference counting and up-pointers track from the first
1069 * page of the shadow to the first l2e, so make sure that we're
1070 * working with those:
1071 * Align the pointer down so it's pointing at the first of the pair */
1072 sl2e = (shadow_l2e_t *)((unsigned long)sl2e & ~(sizeof(shadow_l2e_t)));
1073 /* Align the mfn of the shadow entry too */
1074 new_sl2e.l2 &= ~(1<<PAGE_SHIFT);
1075 #endif
1077 ASSERT(sl2e != NULL);
1078 old_sl2e = *sl2e;
1080 if ( old_sl2e.l2 == new_sl2e.l2 ) return 0; /* Nothing to do */
1082 paddr = ((((paddr_t)mfn_x(sl2mfn)) << PAGE_SHIFT)
1083 | (((unsigned long)sl2e) & ~PAGE_MASK));
1085 if ( shadow_l2e_get_flags(new_sl2e) & _PAGE_PRESENT )
1087 /* About to install a new reference */
1088 sh_get_ref(shadow_l2e_get_mfn(new_sl2e), paddr);
1091 /* Write the new entry */
1092 #if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2
1094 shadow_l2e_t pair[2] = { new_sl2e, new_sl2e };
1095 /* The l1 shadow is two pages long and needs to be pointed to by
1096 * two adjacent l2es. The pair have the same flags, but point
1097 * at odd and even MFNs */
1098 ASSERT(!(pair[0].l2 & (1<<PAGE_SHIFT)));
1099 pair[1].l2 |= (1<<PAGE_SHIFT);
1100 shadow_write_entries(sl2e, &pair, 2, sl2mfn);
1102 #else /* normal case */
1103 shadow_write_entries(sl2e, &new_sl2e, 1, sl2mfn);
1104 #endif
1105 flags |= SHADOW_SET_CHANGED;
1107 if ( shadow_l2e_get_flags(old_sl2e) & _PAGE_PRESENT )
1109 /* We lost a reference to an old mfn. */
1110 mfn_t osl1mfn = shadow_l2e_get_mfn(old_sl2e);
1111 if ( (mfn_x(osl1mfn) != mfn_x(shadow_l2e_get_mfn(new_sl2e))) ||
1112 !perms_strictly_increased(shadow_l2e_get_flags(old_sl2e),
1113 shadow_l2e_get_flags(new_sl2e)) )
1115 flags |= SHADOW_SET_FLUSH;
1117 sh_put_ref(v, osl1mfn, paddr);
1119 return flags;
1122 static int shadow_set_l1e(struct vcpu *v,
1123 shadow_l1e_t *sl1e,
1124 shadow_l1e_t new_sl1e,
1125 mfn_t sl1mfn)
1127 int flags = 0;
1128 struct domain *d = v->domain;
1129 shadow_l1e_t old_sl1e;
1130 ASSERT(sl1e != NULL);
1132 old_sl1e = *sl1e;
1134 if ( old_sl1e.l1 == new_sl1e.l1 ) return 0; /* Nothing to do */
1136 if ( (shadow_l1e_get_flags(new_sl1e) & _PAGE_PRESENT)
1137 && !sh_l1e_is_magic(new_sl1e) )
1139 /* About to install a new reference */
1140 if ( shadow_mode_refcounts(d) ) {
1141 if ( shadow_get_page_from_l1e(new_sl1e, d) == 0 )
1143 /* Doesn't look like a pagetable. */
1144 flags |= SHADOW_SET_ERROR;
1145 new_sl1e = shadow_l1e_empty();
1150 /* Write the new entry */
1151 shadow_write_entries(sl1e, &new_sl1e, 1, sl1mfn);
1152 flags |= SHADOW_SET_CHANGED;
1154 if ( (shadow_l1e_get_flags(old_sl1e) & _PAGE_PRESENT)
1155 && !sh_l1e_is_magic(old_sl1e) )
1157 /* We lost a reference to an old mfn. */
1158 /* N.B. Unlike higher-level sets, never need an extra flush
1159 * when writing an l1e. Because it points to the same guest frame
1160 * as the guest l1e did, it's the guest's responsibility to
1161 * trigger a flush later. */
1162 if ( shadow_mode_refcounts(d) )
1164 shadow_put_page_from_l1e(old_sl1e, d);
1167 return flags;
1171 /**************************************************************************/
1172 /* Macros to walk pagetables. These take the shadow of a pagetable and
1173 * walk every "interesting" entry. That is, they don't touch Xen mappings,
1174 * and for 32-bit l2s shadowed onto PAE or 64-bit, they only touch every
1175 * second entry (since pairs of entries are managed together). For multi-page
1176 * shadows they walk all pages.
1178 * Arguments are an MFN, the variable to point to each entry, a variable
1179 * to indicate that we are done (we will shortcut to the end of the scan
1180 * when _done != 0), a variable to indicate that we should avoid Xen mappings,
1181 * and the code.
1183 * WARNING: These macros have side-effects. They change the values of both
1184 * the pointer and the MFN. */
1186 static inline void increment_ptr_to_guest_entry(void *ptr)
1188 if ( ptr )
1190 guest_l1e_t **entry = ptr;
1191 (*entry)++;
1195 /* All kinds of l1: touch all entries */
1196 #define _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code) \
1197 do { \
1198 int _i; \
1199 shadow_l1e_t *_sp = map_shadow_page((_sl1mfn)); \
1200 ASSERT((mfn_to_page(_sl1mfn)->count_info & PGC_SH_type_mask) \
1201 == PGC_SH_l1_shadow \
1202 || (mfn_to_page(_sl1mfn)->count_info & PGC_SH_type_mask) \
1203 == PGC_SH_fl1_shadow); \
1204 for ( _i = 0; _i < SHADOW_L1_PAGETABLE_ENTRIES; _i++ ) \
1205 { \
1206 (_sl1e) = _sp + _i; \
1207 if ( shadow_l1e_get_flags(*(_sl1e)) & _PAGE_PRESENT ) \
1208 {_code} \
1209 if ( _done ) break; \
1210 increment_ptr_to_guest_entry(_gl1p); \
1211 } \
1212 unmap_shadow_page(_sp); \
1213 } while (0)
1215 /* 32-bit l1, on PAE or 64-bit shadows: need to walk both pages of shadow */
1216 #if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2
1217 #define SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code) \
1218 do { \
1219 int __done = 0; \
1220 _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, \
1221 ({ (__done = _done); }), _code); \
1222 _sl1mfn = _mfn(mfn_x(_sl1mfn) + 1); \
1223 if ( !__done ) \
1224 _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, \
1225 ({ (__done = _done); }), _code); \
1226 } while (0)
1227 #else /* Everything else; l1 shadows are only one page */
1228 #define SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code) \
1229 _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code)
1230 #endif
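/* Usage sketch (illustrative; the shadow-teardown code later in this file
 * walks l1 shadows in much this way to drop their references):
 *
 *     shadow_l1e_t *sl1e;
 *     SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, 0,
 *     {
 *         shadow_put_page_from_l1e(*sl1e, d);
 *     });
 *
 * Passing 0 for _gl1p skips the guest-entry pointer, and the _code body only
 * runs for present entries. Note that the macro may advance sl1mfn to the
 * second page of a two-page shadow, so its value must not be relied on
 * afterwards. */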
1233 #if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2
1235 /* 32-bit l2 on PAE/64: four pages, touch every second entry, and avoid Xen */
1236 #define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _xen, _code) \
1237 do { \
1238 int _i, _j, __done = 0; \
1239 ASSERT((mfn_to_page(_sl2mfn)->count_info & PGC_SH_type_mask) \
1240 == PGC_SH_l2_32_shadow); \
1241 for ( _j = 0; _j < 4 && !__done; _j++ ) \
1242 { \
1243 shadow_l2e_t *_sp = map_shadow_page(_sl2mfn); \
1244 for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i += 2 ) \
1245 if ( (!(_xen)) \
1246 || ((_j * SHADOW_L2_PAGETABLE_ENTRIES) + _i) \
1247 < (HYPERVISOR_VIRT_START >> SHADOW_L2_PAGETABLE_SHIFT) ) \
1248 { \
1249 (_sl2e) = _sp + _i; \
1250 if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \
1251 {_code} \
1252 if ( (__done = (_done)) ) break; \
1253 increment_ptr_to_guest_entry(_gl2p); \
1254 } \
1255 unmap_shadow_page(_sp); \
1256 _sl2mfn = _mfn(mfn_x(_sl2mfn) + 1); \
1257 } \
1258 } while (0)
1260 #elif GUEST_PAGING_LEVELS == 2
1262 /* 32-bit on 32-bit: avoid Xen entries */
1263 #define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _xen, _code) \
1264 do { \
1265 int _i; \
1266 shadow_l2e_t *_sp = map_shadow_page((_sl2mfn)); \
1267 ASSERT((mfn_to_page(_sl2mfn)->count_info & PGC_SH_type_mask) \
1268 == PGC_SH_l2_32_shadow); \
1269 for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i++ ) \
1270 if ( (!(_xen)) \
1271 || \
1272 (_i < (HYPERVISOR_VIRT_START >> SHADOW_L2_PAGETABLE_SHIFT)) ) \
1273 { \
1274 (_sl2e) = _sp + _i; \
1275 if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \
1276 {_code} \
1277 if ( _done ) break; \
1278 increment_ptr_to_guest_entry(_gl2p); \
1279 } \
1280 unmap_shadow_page(_sp); \
1281 } while (0)
1283 #elif GUEST_PAGING_LEVELS == 3
1285 /* PAE: if it's an l2h, don't touch Xen mappings */
1286 #define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _xen, _code) \
1287 do { \
1288 int _i; \
1289 shadow_l2e_t *_sp = map_shadow_page((_sl2mfn)); \
1290 ASSERT((mfn_to_page(_sl2mfn)->count_info & PGC_SH_type_mask) \
1291 == PGC_SH_l2_pae_shadow \
1292 || (mfn_to_page(_sl2mfn)->count_info & PGC_SH_type_mask) \
1293 == PGC_SH_l2h_pae_shadow); \
1294 for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i++ ) \
1295 if ( (!(_xen)) \
1296 || ((mfn_to_page(_sl2mfn)->count_info & PGC_SH_type_mask) \
1297 != PGC_SH_l2h_pae_shadow) \
1298 || ((_i + (3 * SHADOW_L2_PAGETABLE_ENTRIES)) \
1299 < (HYPERVISOR_VIRT_START >> SHADOW_L2_PAGETABLE_SHIFT)) ) \
1300 { \
1301 (_sl2e) = _sp + _i; \
1302 if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \
1303 {_code} \
1304 if ( _done ) break; \
1305 increment_ptr_to_guest_entry(_gl2p); \
1306 } \
1307 unmap_shadow_page(_sp); \
1308 } while (0)
1310 #else
1312 /* 64-bit l2: touch all entries */
1313 #define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _xen, _code) \
1314 do { \
1315 int _i; \
1316 shadow_l2e_t *_sp = map_shadow_page((_sl2mfn)); \
1317 ASSERT((mfn_to_page(_sl2mfn)->count_info & PGC_SH_type_mask) \
1318 == PGC_SH_l2_64_shadow); \
1319 for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i++ ) \
1320 { \
1321 (_sl2e) = _sp + _i; \
1322 if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \
1323 {_code} \
1324 if ( _done ) break; \
1325 increment_ptr_to_guest_entry(_gl2p); \
1326 } \
1327 unmap_shadow_page(_sp); \
1328 } while (0)
1330 #endif /* different kinds of l2 */
1332 #if GUEST_PAGING_LEVELS == 4
1334 /* 64-bit l3: touch all entries */
1335 #define SHADOW_FOREACH_L3E(_sl3mfn, _sl3e, _gl3p, _done, _code) \
1336 do { \
1337 int _i; \
1338 shadow_l3e_t *_sp = map_shadow_page((_sl3mfn)); \
1339 ASSERT((mfn_to_page(_sl3mfn)->count_info & PGC_SH_type_mask) \
1340 == PGC_SH_l3_64_shadow); \
1341 for ( _i = 0; _i < SHADOW_L3_PAGETABLE_ENTRIES; _i++ ) \
1342 { \
1343 (_sl3e) = _sp + _i; \
1344 if ( shadow_l3e_get_flags(*(_sl3e)) & _PAGE_PRESENT ) \
1345 {_code} \
1346 if ( _done ) break; \
1347 increment_ptr_to_guest_entry(_gl3p); \
1348 } \
1349 unmap_shadow_page(_sp); \
1350 } while (0)
1352 /* 64-bit l4: avoid Xen mappings */
1353 #define SHADOW_FOREACH_L4E(_sl4mfn, _sl4e, _gl4p, _done, _xen, _code) \
1354 do { \
1355 int _i; \
1356 shadow_l4e_t *_sp = map_shadow_page((_sl4mfn)); \
1357 ASSERT((mfn_to_page(_sl4mfn)->count_info & PGC_SH_type_mask) \
1358 == PGC_SH_l4_64_shadow); \
1359 for ( _i = 0; _i < SHADOW_L4_PAGETABLE_ENTRIES; _i++ ) \
1360 { \
1361 if ( (!(_xen)) || is_guest_l4_slot(_i) ) \
1362 { \
1363 (_sl4e) = _sp + _i; \
1364 if ( shadow_l4e_get_flags(*(_sl4e)) & _PAGE_PRESENT ) \
1365 {_code} \
1366 if ( _done ) break; \
1367 } \
1368 increment_ptr_to_guest_entry(_gl4p); \
1369 } \
1370 unmap_shadow_page(_sp); \
1371 } while (0)
1373 #endif
1377 /**************************************************************************/
1378 /* Functions to install Xen mappings and linear mappings in shadow pages */
1380 static mfn_t sh_make_shadow(struct vcpu *v, mfn_t gmfn, u32 shadow_type);
1382 // XXX -- this function should probably be moved to shadow-common.c, but that
1383 // probably wants to wait until the shadow types have been moved from
1384 // shadow-types.h to shadow-private.h
1385 //
1386 #if CONFIG_PAGING_LEVELS == 4 && GUEST_PAGING_LEVELS == 4
1387 void sh_install_xen_entries_in_l4(struct vcpu *v, mfn_t gl4mfn, mfn_t sl4mfn)
1389 struct domain *d = v->domain;
1390 shadow_l4e_t *sl4e;
1392 sl4e = sh_map_domain_page(sl4mfn);
1393 ASSERT(sl4e != NULL);
1394 ASSERT(sizeof (l4_pgentry_t) == sizeof (shadow_l4e_t));
1396 /* Copy the common Xen mappings from the idle domain */
1397 memcpy(&sl4e[ROOT_PAGETABLE_FIRST_XEN_SLOT],
1398 &idle_pg_table[ROOT_PAGETABLE_FIRST_XEN_SLOT],
1399 ROOT_PAGETABLE_XEN_SLOTS * sizeof(l4_pgentry_t));
1401 /* Install the per-domain mappings for this domain */
1402 sl4e[shadow_l4_table_offset(PERDOMAIN_VIRT_START)] =
1403 shadow_l4e_from_mfn(page_to_mfn(virt_to_page(d->arch.mm_perdomain_l3)),
1404 __PAGE_HYPERVISOR);
1406 /* Linear mapping */
1407 sl4e[shadow_l4_table_offset(SH_LINEAR_PT_VIRT_START)] =
1408 shadow_l4e_from_mfn(sl4mfn, __PAGE_HYPERVISOR);
1410 if ( shadow_mode_translate(v->domain) && !shadow_mode_external(v->domain) )
1412 // linear tables may not be used with translated PV guests
1413 sl4e[shadow_l4_table_offset(LINEAR_PT_VIRT_START)] =
1414 shadow_l4e_empty();
1416 else
1418 sl4e[shadow_l4_table_offset(LINEAR_PT_VIRT_START)] =
1419 shadow_l4e_from_mfn(gl4mfn, __PAGE_HYPERVISOR);
1422 if ( shadow_mode_translate(v->domain) )
1424 /* install domain-specific P2M table */
1425 sl4e[shadow_l4_table_offset(RO_MPT_VIRT_START)] =
1426 shadow_l4e_from_mfn(pagetable_get_mfn(d->arch.phys_table),
1427 __PAGE_HYPERVISOR);
1430 sh_unmap_domain_page(sl4e);
1432 #endif
1434 #if CONFIG_PAGING_LEVELS == 3 && GUEST_PAGING_LEVELS == 3
1435 // For 3-on-3 PV guests, we need to make sure the xen mappings are in
1436 // place, which means that we need to populate the l2h entry in the l3
1437 // table.
1439 void sh_install_xen_entries_in_l2h(struct vcpu *v,
1440 mfn_t sl2hmfn)
1442 struct domain *d = v->domain;
1443 shadow_l2e_t *sl2e;
1444 int i;
1446 sl2e = sh_map_domain_page(sl2hmfn);
1447 ASSERT(sl2e != NULL);
1448 ASSERT(sizeof (l2_pgentry_t) == sizeof (shadow_l2e_t));
1450 /* Copy the common Xen mappings from the idle domain */
1451 memcpy(&sl2e[L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1)],
1452 &idle_pg_table_l2[L2_PAGETABLE_FIRST_XEN_SLOT],
1453 L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
1455 /* Install the per-domain mappings for this domain */
1456 for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
1457 sl2e[shadow_l2_table_offset(PERDOMAIN_VIRT_START) + i] =
1458 shadow_l2e_from_mfn(
1459 page_to_mfn(virt_to_page(d->arch.mm_perdomain_pt) + i),
1460 __PAGE_HYPERVISOR);
1462 /* We don't set up a linear mapping here because we can't until this
1463 * l2h is installed in an l3e. sh_update_linear_entries() handles
1464 * the linear mappings when CR3 (and so the fourth l3e) is loaded.
1465 * We zero them here, just as a safety measure.
1466 */
1467 for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
1468 sl2e[shadow_l2_table_offset(LINEAR_PT_VIRT_START) + i] =
1469 shadow_l2e_empty();
1470 for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
1471 sl2e[shadow_l2_table_offset(SH_LINEAR_PT_VIRT_START) + i] =
1472 shadow_l2e_empty();
1474 if ( shadow_mode_translate(d) )
1476 /* Install the domain-specific p2m table */
1477 l3_pgentry_t *p2m;
1478 ASSERT(pagetable_get_pfn(d->arch.phys_table) != 0);
1479 p2m = sh_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
1480 for ( i = 0; i < MACHPHYS_MBYTES>>1; i++ )
1482 sl2e[shadow_l2_table_offset(RO_MPT_VIRT_START) + i] =
1483 (l3e_get_flags(p2m[i]) & _PAGE_PRESENT)
1484 ? shadow_l2e_from_mfn(_mfn(l3e_get_pfn(p2m[i])),
1485 __PAGE_HYPERVISOR)
1486 : shadow_l2e_empty();
1488 sh_unmap_domain_page(p2m);
1491 sh_unmap_domain_page(sl2e);
1493 #endif
1496 #if CONFIG_PAGING_LEVELS == 2 && GUEST_PAGING_LEVELS == 2
1497 void sh_install_xen_entries_in_l2(struct vcpu *v, mfn_t gl2mfn, mfn_t sl2mfn)
1499 struct domain *d = v->domain;
1500 shadow_l2e_t *sl2e;
1501 int i;
1503 sl2e = sh_map_domain_page(sl2mfn);
1504 ASSERT(sl2e != NULL);
1505 ASSERT(sizeof (l2_pgentry_t) == sizeof (shadow_l2e_t));
1507 /* Copy the common Xen mappings from the idle domain */
1508 memcpy(&sl2e[L2_PAGETABLE_FIRST_XEN_SLOT],
1509 &idle_pg_table[L2_PAGETABLE_FIRST_XEN_SLOT],
1510 L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
1512 /* Install the per-domain mappings for this domain */
1513 for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
1514 sl2e[shadow_l2_table_offset(PERDOMAIN_VIRT_START) + i] =
1515 shadow_l2e_from_mfn(
1516 page_to_mfn(virt_to_page(d->arch.mm_perdomain_pt) + i),
1517 __PAGE_HYPERVISOR);
1519 /* Linear mapping */
1520 sl2e[shadow_l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
1521 shadow_l2e_from_mfn(sl2mfn, __PAGE_HYPERVISOR);
1523 if ( shadow_mode_translate(v->domain) && !shadow_mode_external(v->domain) )
1525 // linear tables may not be used with translated PV guests
1526 sl2e[shadow_l2_table_offset(LINEAR_PT_VIRT_START)] =
1527 shadow_l2e_empty();
1529 else
1531 sl2e[shadow_l2_table_offset(LINEAR_PT_VIRT_START)] =
1532 shadow_l2e_from_mfn(gl2mfn, __PAGE_HYPERVISOR);
1535 if ( shadow_mode_translate(d) )
1537 /* install domain-specific P2M table */
1538 sl2e[shadow_l2_table_offset(RO_MPT_VIRT_START)] =
1539 shadow_l2e_from_mfn(pagetable_get_mfn(d->arch.phys_table),
1540 __PAGE_HYPERVISOR);
1543 sh_unmap_domain_page(sl2e);
1545 #endif
1549 /**************************************************************************/
1550 /* Create a shadow of a given guest page.
1551 */
1552 static mfn_t
1553 sh_make_shadow(struct vcpu *v, mfn_t gmfn, u32 shadow_type)
1555 mfn_t smfn = shadow_alloc(v->domain, shadow_type, mfn_x(gmfn));
1556 SHADOW_DEBUG(MAKE_SHADOW, "(%05lx, %u)=>%05lx\n",
1557 mfn_x(gmfn), shadow_type, mfn_x(smfn));
1559 if ( shadow_type != PGC_SH_l2_32_shadow
1560 && shadow_type != PGC_SH_l2_pae_shadow
1561 && shadow_type != PGC_SH_l2h_pae_shadow
1562 && shadow_type != PGC_SH_l4_64_shadow )
1563 /* Lower-level shadow, not yet linked from a higher level */
1564 mfn_to_page(smfn)->up = 0;
1566 // Create the Xen mappings...
1567 if ( !shadow_mode_external(v->domain) )
1569 switch (shadow_type)
1571 #if CONFIG_PAGING_LEVELS == 4 && GUEST_PAGING_LEVELS == 4
1572 case PGC_SH_l4_shadow:
1573 sh_install_xen_entries_in_l4(v, gmfn, smfn); break;
1574 #endif
1575 #if CONFIG_PAGING_LEVELS == 3 && GUEST_PAGING_LEVELS == 3
1576 case PGC_SH_l2h_shadow:
1577 sh_install_xen_entries_in_l2h(v, smfn); break;
1578 #endif
1579 #if CONFIG_PAGING_LEVELS == 2 && GUEST_PAGING_LEVELS == 2
1580 case PGC_SH_l2_shadow:
1581 sh_install_xen_entries_in_l2(v, gmfn, smfn); break;
1582 #endif
1583 default: /* Do nothing */ break;
1587 shadow_promote(v, gmfn, shadow_type);
1588 set_shadow_status(v, gmfn, shadow_type, smfn);
1590 return smfn;
1593 /* Make a splintered superpage shadow */
1594 static mfn_t
1595 make_fl1_shadow(struct vcpu *v, gfn_t gfn)
1597 mfn_t smfn = shadow_alloc(v->domain, PGC_SH_fl1_shadow,
1598 (unsigned long) gfn_x(gfn));
1600 SHADOW_DEBUG(MAKE_SHADOW, "(%" SH_PRI_gfn ")=>%" SH_PRI_mfn "\n",
1601 gfn_x(gfn), mfn_x(smfn));
1603 set_fl1_shadow_status(v, gfn, smfn);
1604 return smfn;
1608 #if SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS
1609 mfn_t
1610 sh_make_monitor_table(struct vcpu *v)
1613 ASSERT(pagetable_get_pfn(v->arch.monitor_table) == 0);
1615 #if CONFIG_PAGING_LEVELS == 4
1617 struct domain *d = v->domain;
1618 mfn_t m4mfn;
1619 m4mfn = shadow_alloc(d, PGC_SH_monitor_table, 0);
1620 sh_install_xen_entries_in_l4(v, m4mfn, m4mfn);
1621 /* Remember the level of this table */
1622 mfn_to_page(m4mfn)->shadow_flags = 4;
1623 #if SHADOW_PAGING_LEVELS < 4
1624 // Install a monitor l3 table in slot 0 of the l4 table.
1625 // This is used for shadow linear maps.
1627 mfn_t m3mfn;
1628 l4_pgentry_t *l4e;
1629 m3mfn = shadow_alloc(d, PGC_SH_monitor_table, 0);
1630 mfn_to_page(m3mfn)->shadow_flags = 3;
1631 l4e = sh_map_domain_page(m4mfn);
1632 l4e[0] = l4e_from_pfn(mfn_x(m3mfn), __PAGE_HYPERVISOR);
1633 sh_unmap_domain_page(l4e);
1635 #endif /* SHADOW_PAGING_LEVELS < 4 */
1636 return m4mfn;
1639 #elif CONFIG_PAGING_LEVELS == 3
1642 struct domain *d = v->domain;
1643 mfn_t m3mfn, m2mfn;
1644 l3_pgentry_t *l3e;
1645 l2_pgentry_t *l2e;
1646 int i;
1648 m3mfn = shadow_alloc(d, PGC_SH_monitor_table, 0);
1649 /* Remember the level of this table */
1650 mfn_to_page(m3mfn)->shadow_flags = 3;
1652 // Install a monitor l2 table in slot 3 of the l3 table.
1653 // This is used for all Xen entries, including linear maps
1654 m2mfn = shadow_alloc(d, PGC_SH_monitor_table, 0);
1655 mfn_to_page(m2mfn)->shadow_flags = 2;
1656 l3e = sh_map_domain_page(m3mfn);
1657 l3e[3] = l3e_from_pfn(mfn_x(m2mfn), _PAGE_PRESENT);
1658 sh_install_xen_entries_in_l2h(v, m2mfn);
1659 /* Install the monitor's own linear map */
1660 l2e = sh_map_domain_page(m2mfn);
1661 for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
1662 l2e[l2_table_offset(LINEAR_PT_VIRT_START) + i] =
1663 (l3e_get_flags(l3e[i]) & _PAGE_PRESENT)
1664 ? l2e_from_pfn(l3e_get_pfn(l3e[i]), __PAGE_HYPERVISOR)
1665 : l2e_empty();
1666 sh_unmap_domain_page(l2e);
1667 sh_unmap_domain_page(l3e);
1669 SHADOW_PRINTK("new monitor table: %#lx\n", mfn_x(m3mfn));
1670 return m3mfn;
1673 #elif CONFIG_PAGING_LEVELS == 2
1676 struct domain *d = v->domain;
1677 mfn_t m2mfn;
1678 m2mfn = shadow_alloc(d, PGC_SH_monitor_table, 0);
1679 sh_install_xen_entries_in_l2(v, m2mfn, m2mfn);
1680 /* Remember the level of this table */
1681 mfn_to_page(m2mfn)->shadow_flags = 2;
1682 return m2mfn;
1685 #else
1686 #error this should not happen
1687 #endif /* CONFIG_PAGING_LEVELS */
1689 #endif /* SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS */
1691 /**************************************************************************/
1692 /* These functions also take a virtual address and return the level-N
1693 * shadow table mfn and entry, but they create the shadow pagetables if
1694 * they are needed. The "demand" argument is non-zero when handling
1695 * a demand fault (so we know what to do about accessed bits &c).
1696 * If the necessary tables are not present in the guest, they return NULL. */
1698 /* N.B. The use of GUEST_PAGING_LEVELS here is correct. If the shadow has
1699 * more levels than the guest, the upper levels are always fixed and do not
1700 * reflect any information from the guest, so we do not use these functions
1701 * to access them. */
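/* A minimal sketch (never compiled) of how the demand-fault path below is
 * expected to use this family of helpers: ask for the l1e and let the
 * helpers recurse upwards, creating any missing intermediate shadows on
 * the way.  The walk_t and fetch type are assumed to come from
 * guest_walk_tables() and the fault decoder. */
#if 0
static shadow_l1e_t *example_demand_get_sl1e(struct vcpu *v, walk_t *gw,
                                             fetch_type_t ft)
{
    mfn_t sl1mfn;
    /* NULL here means the guest's own tables are not present. */
    return shadow_get_and_create_l1e(v, gw, &sl1mfn, ft);
}
#endif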
1703 #if GUEST_PAGING_LEVELS >= 4
1704 static shadow_l4e_t * shadow_get_and_create_l4e(struct vcpu *v,
1705 walk_t *gw,
1706 mfn_t *sl4mfn)
1708 /* There is always a shadow of the top level table. Get it. */
1709 *sl4mfn = pagetable_get_mfn(v->arch.shadow_table[0]);
1710 /* Reading the top level table is always valid. */
1711 return sh_linear_l4_table(v) + shadow_l4_linear_offset(gw->va);
1714 static shadow_l3e_t * shadow_get_and_create_l3e(struct vcpu *v,
1715 walk_t *gw,
1716 mfn_t *sl3mfn,
1717 fetch_type_t ft)
1719 mfn_t sl4mfn;
1720 shadow_l4e_t *sl4e;
1721 if ( !valid_mfn(gw->l3mfn) ) return NULL; /* No guest page. */
1722 /* Get the l4e */
1723 sl4e = shadow_get_and_create_l4e(v, gw, &sl4mfn);
1724 ASSERT(sl4e != NULL);
1725 if ( shadow_l4e_get_flags(*sl4e) & _PAGE_PRESENT )
1727 *sl3mfn = shadow_l4e_get_mfn(*sl4e);
1728 ASSERT(valid_mfn(*sl3mfn));
1730 else
1732 int r;
1733 shadow_l4e_t new_sl4e;
1734 /* No l3 shadow installed: find and install it. */
1735 *sl3mfn = get_shadow_status(v, gw->l3mfn, PGC_SH_l3_shadow);
1736 if ( !valid_mfn(*sl3mfn) )
1738 /* No l3 shadow of this page exists at all: make one. */
1739 *sl3mfn = sh_make_shadow(v, gw->l3mfn, PGC_SH_l3_shadow);
1741 /* Install the new sl3 table in the sl4e */
1742 l4e_propagate_from_guest(v, gw->l4e, gw->l4mfn,
1743 *sl3mfn, &new_sl4e, ft);
1744 r = shadow_set_l4e(v, sl4e, new_sl4e, sl4mfn);
1745 ASSERT((r & SHADOW_SET_FLUSH) == 0);
1747 /* Now follow it down a level. Guaranteed to succeed. */
1748 return sh_linear_l3_table(v) + shadow_l3_linear_offset(gw->va);
1750 #endif /* GUEST_PAGING_LEVELS >= 4 */
1753 static shadow_l2e_t * shadow_get_and_create_l2e(struct vcpu *v,
1754 walk_t *gw,
1755 mfn_t *sl2mfn,
1756 fetch_type_t ft)
1758 #if GUEST_PAGING_LEVELS >= 4 /* 64bit... */
1759 mfn_t sl3mfn = _mfn(INVALID_MFN);
1760 shadow_l3e_t *sl3e;
1761 if ( !valid_mfn(gw->l2mfn) ) return NULL; /* No guest page. */
1762 /* Get the l3e */
1763 sl3e = shadow_get_and_create_l3e(v, gw, &sl3mfn, ft);
1764 ASSERT(sl3e != NULL); /* Since we know guest PT is valid this far */
1765 if ( shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT )
1767 *sl2mfn = shadow_l3e_get_mfn(*sl3e);
1768 ASSERT(valid_mfn(*sl2mfn));
1770 else
1772 int r;
1773 shadow_l3e_t new_sl3e;
1774 /* No l2 shadow installed: find and install it. */
1775 *sl2mfn = get_shadow_status(v, gw->l2mfn, PGC_SH_l2_shadow);
1776 if ( !valid_mfn(*sl2mfn) )
1778 /* No l2 shadow of this page exists at all: make one. */
1779 *sl2mfn = sh_make_shadow(v, gw->l2mfn, PGC_SH_l2_shadow);
1781 /* Install the new sl2 table in the sl3e */
1782 l3e_propagate_from_guest(v, gw->l3e, gw->l3mfn,
1783 *sl2mfn, &new_sl3e, ft);
1784 r = shadow_set_l3e(v, sl3e, new_sl3e, sl3mfn);
1785 ASSERT((r & SHADOW_SET_FLUSH) == 0);
1787 /* Now follow it down a level. Guaranteed to succeed. */
1788 return sh_linear_l2_table(v) + shadow_l2_linear_offset(gw->va);
1789 #elif GUEST_PAGING_LEVELS == 3 /* PAE... */
1790 /* We never demand-shadow PAE l3es: they are only created in
1791 * sh_update_cr3(). Check if the relevant sl3e is present. */
1792 shadow_l3e_t *sl3e = ((shadow_l3e_t *)&v->arch.shadow.l3table)
1793 + shadow_l3_linear_offset(gw->va);
1794 if ( !(shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT) )
1795 return NULL;
1796 *sl2mfn = shadow_l3e_get_mfn(*sl3e);
1797 ASSERT(valid_mfn(*sl2mfn));
1798 return sh_linear_l2_table(v) + shadow_l2_linear_offset(gw->va);
1799 #else /* 32bit... */
1800 /* There is always a shadow of the top level table. Get it. */
1801 *sl2mfn = pagetable_get_mfn(v->arch.shadow_table[0]);
1802 /* This next line is important: the guest l2 has a 16k
1803 * shadow; we need to return the right mfn of the four. This
1804 * call will set it for us as a side-effect. */
1805 (void) shadow_l2_index(sl2mfn, guest_index(gw->l2e));
1806 /* Reading the top level table is always valid. */
1807 return sh_linear_l2_table(v) + shadow_l2_linear_offset(gw->va);
1808 #endif
1812 static shadow_l1e_t * shadow_get_and_create_l1e(struct vcpu *v,
1813 walk_t *gw,
1814 mfn_t *sl1mfn,
1815 fetch_type_t ft)
1817 mfn_t sl2mfn;
1818 shadow_l2e_t *sl2e;
1820 /* Get the l2e */
1821 sl2e = shadow_get_and_create_l2e(v, gw, &sl2mfn, ft);
1822 if ( sl2e == NULL ) return NULL;
1823 /* Install the sl1 in the l2e if it wasn't there or if we need to
1824 * re-do it to fix a PSE dirty bit. */
1825 if ( shadow_l2e_get_flags(*sl2e) & _PAGE_PRESENT
1826 && likely(ft != ft_demand_write
1827 || (guest_l2e_get_flags(*gw->l2e) & _PAGE_DIRTY)
1828 || !(guest_l2e_get_flags(*gw->l2e) & _PAGE_PSE)) )
1830 *sl1mfn = shadow_l2e_get_mfn(*sl2e);
1831 ASSERT(valid_mfn(*sl1mfn));
1833 else
1835 shadow_l2e_t new_sl2e;
1836 int r, flags = guest_l2e_get_flags(*gw->l2e);
1837 /* No l1 shadow installed: find and install it. */
1838 if ( !(flags & _PAGE_PRESENT) )
1839 return NULL; /* No guest page. */
1840 if ( guest_supports_superpages(v) && (flags & _PAGE_PSE) )
1842 /* Splintering a superpage */
1843 gfn_t l2gfn = guest_l2e_get_gfn(*gw->l2e);
1844 *sl1mfn = get_fl1_shadow_status(v, l2gfn);
1845 if ( !valid_mfn(*sl1mfn) )
1847 /* No fl1 shadow of this superpage exists at all: make one. */
1848 *sl1mfn = make_fl1_shadow(v, l2gfn);
1851 else
1853 /* Shadowing an actual guest l1 table */
1854 if ( !valid_mfn(gw->l2mfn) ) return NULL; /* No guest page. */
1855 *sl1mfn = get_shadow_status(v, gw->l1mfn, PGC_SH_l1_shadow);
1856 if ( !valid_mfn(*sl1mfn) )
1858 /* No l1 shadow of this page exists at all: make one. */
1859 *sl1mfn = sh_make_shadow(v, gw->l1mfn, PGC_SH_l1_shadow);
1862 /* Install the new sl1 table in the sl2e */
1863 l2e_propagate_from_guest(v, gw->l2e, gw->l2mfn,
1864 *sl1mfn, &new_sl2e, ft);
1865 r = shadow_set_l2e(v, sl2e, new_sl2e, sl2mfn);
1866 ASSERT((r & SHADOW_SET_FLUSH) == 0);
1867 /* This next line is important: in 32-on-PAE and 32-on-64 modes,
1868 * the guest l1 table has an 8k shadow, and we need to return
1869 * the right mfn of the pair. This call will set it for us as a
1870 * side-effect. (In all other cases, it's a no-op and will be
1871 * compiled out.) */
1872 (void) shadow_l1_index(sl1mfn, guest_l1_table_offset(gw->va));
1874 /* Now follow it down a level. Guaranteed to succeed. */
1875 return sh_linear_l1_table(v) + shadow_l1_linear_offset(gw->va);
1880 /**************************************************************************/
1881 /* Destructors for shadow tables:
1882 * Unregister the shadow, decrement refcounts of any entries present in it,
1883 * and release the memory.
1885 * N.B. These destructors do not clear the contents of the shadows.
1886 * This allows us to delay TLB shootdowns until the page is being reused.
1887 * See shadow_alloc() and shadow_free() for how this is handled.
1888 */
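/* All the destructors below drop their outstanding references with the same
 * encoding of "which entry held the reference": the machine address of the
 * shadow entry, i.e. (shadow mfn << PAGE_SHIFT) | byte-offset-in-page.
 * A hypothetical helper, shown only to make that encoding explicit (the
 * real code open-codes it at each sh_put_ref() call site): */
#if 0
static inline paddr_t example_entry_paddr(mfn_t smfn, void *entry)
{
    return (((paddr_t)mfn_x(smfn)) << PAGE_SHIFT)
           | ((unsigned long)entry & ~PAGE_MASK);
}
#endif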
1890 #if GUEST_PAGING_LEVELS >= 4
1891 void sh_destroy_l4_shadow(struct vcpu *v, mfn_t smfn)
1893 shadow_l4e_t *sl4e;
1894 u32 t = mfn_to_page(smfn)->count_info & PGC_SH_type_mask;
1895 mfn_t gmfn, sl4mfn;
1896 int xen_mappings;
1898 SHADOW_DEBUG(DESTROY_SHADOW,
1899 "%s(%05lx)\n", __func__, mfn_x(smfn));
1900 ASSERT(t == PGC_SH_l4_shadow);
1902 /* Record that the guest page isn't shadowed any more (in this type) */
1903 gmfn = _mfn(mfn_to_page(smfn)->u.inuse.type_info);
1904 delete_shadow_status(v, gmfn, t, smfn);
1905 shadow_demote(v, gmfn, t);
1906 /* Take this shadow off the list of root shadows */
1907 list_del_init(&mfn_to_page(smfn)->list);
1909 /* Decrement refcounts of all the old entries */
1910 xen_mappings = (!shadow_mode_external(v->domain));
1911 sl4mfn = smfn;
1912 SHADOW_FOREACH_L4E(sl4mfn, sl4e, 0, 0, xen_mappings, {
1913 if ( shadow_l4e_get_flags(*sl4e) & _PAGE_PRESENT )
1915 sh_put_ref(v, shadow_l4e_get_mfn(*sl4e),
1916 (((paddr_t)mfn_x(sl4mfn)) << PAGE_SHIFT)
1917 | ((unsigned long)sl4e & ~PAGE_MASK));
1919 });
1921 /* Put the memory back in the pool */
1922 shadow_free(v->domain, smfn);
1925 void sh_destroy_l3_shadow(struct vcpu *v, mfn_t smfn)
1927 shadow_l3e_t *sl3e;
1928 u32 t = mfn_to_page(smfn)->count_info & PGC_SH_type_mask;
1929 mfn_t gmfn, sl3mfn;
1931 SHADOW_DEBUG(DESTROY_SHADOW,
1932 "%s(%05lx)\n", __func__, mfn_x(smfn));
1933 ASSERT(t == PGC_SH_l3_shadow);
1935 /* Record that the guest page isn't shadowed any more (in this type) */
1936 gmfn = _mfn(mfn_to_page(smfn)->u.inuse.type_info);
1937 delete_shadow_status(v, gmfn, t, smfn);
1938 shadow_demote(v, gmfn, t);
1940 /* Decrement refcounts of all the old entries */
1941 sl3mfn = smfn;
1942 SHADOW_FOREACH_L3E(sl3mfn, sl3e, 0, 0, {
1943 if ( shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT )
1944 sh_put_ref(v, shadow_l3e_get_mfn(*sl3e),
1945 (((paddr_t)mfn_x(sl3mfn)) << PAGE_SHIFT)
1946 | ((unsigned long)sl3e & ~PAGE_MASK));
1947 });
1949 /* Put the memory back in the pool */
1950 shadow_free(v->domain, smfn);
1952 #endif /* GUEST_PAGING_LEVELS >= 4 */
1955 void sh_destroy_l2_shadow(struct vcpu *v, mfn_t smfn)
1957 shadow_l2e_t *sl2e;
1958 u32 t = mfn_to_page(smfn)->count_info & PGC_SH_type_mask;
1959 mfn_t gmfn, sl2mfn;
1960 int xen_mappings;
1962 SHADOW_DEBUG(DESTROY_SHADOW,
1963 "%s(%05lx)\n", __func__, mfn_x(smfn));
1964 ASSERT(t == PGC_SH_l2_shadow
1965 || t == PGC_SH_l2h_pae_shadow);
1967 /* Record that the guest page isn't shadowed any more (in this type) */
1968 gmfn = _mfn(mfn_to_page(smfn)->u.inuse.type_info);
1969 delete_shadow_status(v, gmfn, t, smfn);
1970 shadow_demote(v, gmfn, t);
1971 #if (GUEST_PAGING_LEVELS == 2) || (GUEST_PAGING_LEVELS == 3)
1972 /* Take this shadow off the list of root shadows */
1973 list_del_init(&mfn_to_page(smfn)->list);
1974 #endif
1976 /* Decrement refcounts of all the old entries */
1977 sl2mfn = smfn;
1978 xen_mappings = (!shadow_mode_external(v->domain) &&
1979 ((GUEST_PAGING_LEVELS == 2) ||
1980 ((GUEST_PAGING_LEVELS == 3) &&
1981 (t == PGC_SH_l2h_pae_shadow))));
1982 SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, 0, xen_mappings, {
1983 if ( shadow_l2e_get_flags(*sl2e) & _PAGE_PRESENT )
1984 sh_put_ref(v, shadow_l2e_get_mfn(*sl2e),
1985 (((paddr_t)mfn_x(sl2mfn)) << PAGE_SHIFT)
1986 | ((unsigned long)sl2e & ~PAGE_MASK));
1987 });
1989 /* Put the memory back in the pool */
1990 shadow_free(v->domain, smfn);
1993 void sh_destroy_l1_shadow(struct vcpu *v, mfn_t smfn)
1995 struct domain *d = v->domain;
1996 shadow_l1e_t *sl1e;
1997 u32 t = mfn_to_page(smfn)->count_info & PGC_SH_type_mask;
1999 SHADOW_DEBUG(DESTROY_SHADOW,
2000 "%s(%05lx)\n", __func__, mfn_x(smfn));
2001 ASSERT(t == PGC_SH_l1_shadow || t == PGC_SH_fl1_shadow);
2003 /* Record that the guest page isn't shadowed any more (in this type) */
2004 if ( t == PGC_SH_fl1_shadow )
2006 gfn_t gfn = _gfn(mfn_to_page(smfn)->u.inuse.type_info);
2007 delete_fl1_shadow_status(v, gfn, smfn);
2009 else
2011 mfn_t gmfn = _mfn(mfn_to_page(smfn)->u.inuse.type_info);
2012 delete_shadow_status(v, gmfn, t, smfn);
2013 shadow_demote(v, gmfn, t);
2016 if ( shadow_mode_refcounts(d) )
2018 /* Decrement refcounts of all the old entries */
2019 mfn_t sl1mfn = smfn;
2020 SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, 0, {
2021 if ( (shadow_l1e_get_flags(*sl1e) & _PAGE_PRESENT)
2022 && !sh_l1e_is_magic(*sl1e) )
2023 shadow_put_page_from_l1e(*sl1e, d);
2024 });
2027 /* Put the memory back in the pool */
2028 shadow_free(v->domain, smfn);
2031 #if SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS
2032 void sh_destroy_monitor_table(struct vcpu *v, mfn_t mmfn)
2034 struct domain *d = v->domain;
2035 ASSERT((mfn_to_page(mmfn)->count_info & PGC_SH_type_mask)
2036 == PGC_SH_monitor_table);
2038 #if (CONFIG_PAGING_LEVELS == 4) && (SHADOW_PAGING_LEVELS != 4)
2039 /* Need to destroy the l3 monitor page in slot 0 too */
2041 l4_pgentry_t *l4e = sh_map_domain_page(mmfn);
2042 ASSERT(l4e_get_flags(l4e[0]) & _PAGE_PRESENT);
2043 shadow_free(d, _mfn(l4e_get_pfn(l4e[0])));
2044 sh_unmap_domain_page(l4e);
2046 #elif CONFIG_PAGING_LEVELS == 3
2047 /* Need to destroy the l2 monitor page in slot 4 too */
2049 l3_pgentry_t *l3e = sh_map_domain_page(mmfn);
2050 ASSERT(l3e_get_flags(l3e[3]) & _PAGE_PRESENT);
2051 shadow_free(d, _mfn(l3e_get_pfn(l3e[3])));
2052 sh_unmap_domain_page(l3e);
2054 #endif
2056 /* Put the memory back in the pool */
2057 shadow_free(d, mmfn);
2059 #endif
2061 /**************************************************************************/
2062 /* Functions to destroy non-Xen mappings in a pagetable hierarchy.
2063 * These are called from common code when we are running out of shadow
2064 * memory, and unpinning all the top-level shadows hasn't worked.
2066 * This implementation is pretty crude and slow, but we hope that it won't
2067 * be called very often. */
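/* Sketch (never compiled; dispatcher name hypothetical) of the shape of the
 * caller in the common code: pick the unhook routine that matches the
 * guest's paging mode and apply it to each top-level shadow. */
#if 0
static void example_unhook_top_level(struct vcpu *v, mfn_t smfn)
{
#if GUEST_PAGING_LEVELS == 2
    sh_unhook_32b_mappings(v, smfn);
#elif GUEST_PAGING_LEVELS == 3
    sh_unhook_pae_mappings(v, smfn);
#else /* GUEST_PAGING_LEVELS == 4 */
    sh_unhook_64b_mappings(v, smfn);
#endif
}
#endif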
2069 #if GUEST_PAGING_LEVELS == 2
2071 void sh_unhook_32b_mappings(struct vcpu *v, mfn_t sl2mfn)
2073 shadow_l2e_t *sl2e;
2074 int xen_mappings = !shadow_mode_external(v->domain);
2075 SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, 0, xen_mappings, {
2076 (void) shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn);
2077 });
2080 #elif GUEST_PAGING_LEVELS == 3
2082 void sh_unhook_pae_mappings(struct vcpu *v, mfn_t sl2mfn)
2083 /* Walk a PAE l2 shadow, unhooking entries from all the subshadows */
2085 shadow_l2e_t *sl2e;
2086 int xen_mappings = !shadow_mode_external(v->domain);
2087 SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, 0, xen_mappings, {
2088 (void) shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn);
2089 });
2092 #elif GUEST_PAGING_LEVELS == 4
2094 void sh_unhook_64b_mappings(struct vcpu *v, mfn_t sl4mfn)
2096 shadow_l4e_t *sl4e;
2097 int xen_mappings = !shadow_mode_external(v->domain);
2098 SHADOW_FOREACH_L4E(sl4mfn, sl4e, 0, 0, xen_mappings, {
2099 (void) shadow_set_l4e(v, sl4e, shadow_l4e_empty(), sl4mfn);
2100 });
2103 #endif
2105 /**************************************************************************/
2106 /* Internal translation functions.
2107 * These functions require a pointer to the shadow entry that will be updated.
2108 */
2110 /* These functions take a new guest entry, translate it to shadow and write
2111 * the shadow entry.
2113 * They return the same bitmaps as the shadow_set_lXe() functions.
2114 */
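/* Sketch (never compiled) of how a caller is expected to treat the result
 * bitmap: OR the SHADOW_SET_* flags together over a batch of entries and
 * act on them once at the end.  The flush shown is illustrative; the real
 * decisions are taken by the callers in the common shadow code. */
#if 0
static int example_validate_batch(struct vcpu *v, guest_l1e_t *gl1es, int nr,
                                  mfn_t sl1mfn, shadow_l1e_t *sl1es)
{
    int i, result = 0;
    for ( i = 0; i < nr; i++ )
        result |= validate_gl1e(v, &gl1es[i], sl1mfn, &sl1es[i]);
    if ( result & SHADOW_SET_FLUSH )
        flush_tlb_mask(v->domain->domain_dirty_cpumask);
    return result;
}
#endif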
2116 #if GUEST_PAGING_LEVELS >= 4
2117 static int validate_gl4e(struct vcpu *v, void *new_ge, mfn_t sl4mfn, void *se)
2119 shadow_l4e_t new_sl4e;
2120 guest_l4e_t *new_gl4e = new_ge;
2121 shadow_l4e_t *sl4p = se;
2122 mfn_t sl3mfn = _mfn(INVALID_MFN);
2123 int result = 0;
2125 perfc_incrc(shadow_validate_gl4e_calls);
2127 if ( guest_l4e_get_flags(*new_gl4e) & _PAGE_PRESENT )
2129 gfn_t gl3gfn = guest_l4e_get_gfn(*new_gl4e);
2130 mfn_t gl3mfn = vcpu_gfn_to_mfn(v, gl3gfn);
2131 if ( valid_mfn(gl3mfn) )
2132 sl3mfn = get_shadow_status(v, gl3mfn, PGC_SH_l3_shadow);
2133 else
2134 result |= SHADOW_SET_ERROR;
2136 l4e_propagate_from_guest(v, new_gl4e, _mfn(INVALID_MFN),
2137 sl3mfn, &new_sl4e, ft_prefetch);
2139 // check for updates to xen reserved slots
2140 if ( !shadow_mode_external(v->domain) )
2142 int shadow_index = (((unsigned long)sl4p & ~PAGE_MASK) /
2143 sizeof(shadow_l4e_t));
2144 int reserved_xen_slot = !is_guest_l4_slot(shadow_index);
2146 if ( unlikely(reserved_xen_slot) )
2148 // attempt by the guest to write to a xen reserved slot
2149 //
2150 SHADOW_PRINTK("%s out-of-range update "
2151 "sl4mfn=%05lx index=0x%x val=%" SH_PRI_pte "\n",
2152 __func__, mfn_x(sl4mfn), shadow_index, new_sl4e.l4);
2153 if ( shadow_l4e_get_flags(new_sl4e) & _PAGE_PRESENT )
2155 SHADOW_ERROR("out-of-range l4e update\n");
2156 result |= SHADOW_SET_ERROR;
2159 // do not call shadow_set_l4e...
2160 return result;
2164 result |= shadow_set_l4e(v, sl4p, new_sl4e, sl4mfn);
2165 return result;
2169 static int validate_gl3e(struct vcpu *v, void *new_ge, mfn_t sl3mfn, void *se)
2171 shadow_l3e_t new_sl3e;
2172 guest_l3e_t *new_gl3e = new_ge;
2173 shadow_l3e_t *sl3p = se;
2174 mfn_t sl2mfn = _mfn(INVALID_MFN);
2175 int result = 0;
2177 perfc_incrc(shadow_validate_gl3e_calls);
2179 if ( guest_l3e_get_flags(*new_gl3e) & _PAGE_PRESENT )
2181 gfn_t gl2gfn = guest_l3e_get_gfn(*new_gl3e);
2182 mfn_t gl2mfn = vcpu_gfn_to_mfn(v, gl2gfn);
2183 if ( valid_mfn(gl2mfn) )
2184 sl2mfn = get_shadow_status(v, gl2mfn, PGC_SH_l2_shadow);
2185 else
2186 result |= SHADOW_SET_ERROR;
2188 l3e_propagate_from_guest(v, new_gl3e, _mfn(INVALID_MFN),
2189 sl2mfn, &new_sl3e, ft_prefetch);
2190 result |= shadow_set_l3e(v, sl3p, new_sl3e, sl3mfn);
2192 return result;
2194 #endif // GUEST_PAGING_LEVELS >= 4
2196 static int validate_gl2e(struct vcpu *v, void *new_ge, mfn_t sl2mfn, void *se)
2198 shadow_l2e_t new_sl2e;
2199 guest_l2e_t *new_gl2e = new_ge;
2200 shadow_l2e_t *sl2p = se;
2201 mfn_t sl1mfn = _mfn(INVALID_MFN);
2202 int result = 0;
2204 perfc_incrc(shadow_validate_gl2e_calls);
2206 if ( guest_l2e_get_flags(*new_gl2e) & _PAGE_PRESENT )
2208 gfn_t gl1gfn = guest_l2e_get_gfn(*new_gl2e);
2209 if ( guest_supports_superpages(v) &&
2210 (guest_l2e_get_flags(*new_gl2e) & _PAGE_PSE) )
2212 // superpage -- need to look up the shadow L1 which holds the
2213 // splitters...
2214 sl1mfn = get_fl1_shadow_status(v, gl1gfn);
2215 #if 0
2216 // XXX - it's possible that we want to do some kind of prefetch
2217 // for superpage fl1's here, but this is *not* on the demand path,
2218 // so we'll hold off trying that for now...
2219 //
2220 if ( !valid_mfn(sl1mfn) )
2221 sl1mfn = make_fl1_shadow(v, gl1gfn);
2222 #endif
2224 else
2226 mfn_t gl1mfn = vcpu_gfn_to_mfn(v, gl1gfn);
2227 if ( valid_mfn(gl1mfn) )
2228 sl1mfn = get_shadow_status(v, gl1mfn, PGC_SH_l1_shadow);
2229 else
2230 result |= SHADOW_SET_ERROR;
2233 l2e_propagate_from_guest(v, new_gl2e, _mfn(INVALID_MFN),
2234 sl1mfn, &new_sl2e, ft_prefetch);
2236 // check for updates to xen reserved slots in PV guests...
2237 // XXX -- need to revisit this for PV 3-on-4 guests.
2238 //
2239 #if SHADOW_PAGING_LEVELS < 4
2240 #if CONFIG_PAGING_LEVELS == SHADOW_PAGING_LEVELS
2241 if ( !shadow_mode_external(v->domain) )
2243 int shadow_index = (((unsigned long)sl2p & ~PAGE_MASK) /
2244 sizeof(shadow_l2e_t));
2245 int reserved_xen_slot;
2247 #if SHADOW_PAGING_LEVELS == 3
2248 reserved_xen_slot =
2249 (((mfn_to_page(sl2mfn)->count_info & PGC_SH_type_mask)
2250 == PGC_SH_l2h_pae_shadow) &&
2251 (shadow_index
2252 >= (L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1))));
2253 #else /* SHADOW_PAGING_LEVELS == 2 */
2254 reserved_xen_slot = (shadow_index >= L2_PAGETABLE_FIRST_XEN_SLOT);
2255 #endif
2257 if ( unlikely(reserved_xen_slot) )
2259 // attempt by the guest to write to a xen reserved slot
2260 //
2261 SHADOW_PRINTK("%s out-of-range update "
2262 "sl2mfn=%05lx index=0x%x val=%" SH_PRI_pte "\n",
2263 __func__, mfn_x(sl2mfn), shadow_index, new_sl2e.l2);
2264 if ( shadow_l2e_get_flags(new_sl2e) & _PAGE_PRESENT )
2266 SHADOW_ERROR("out-of-range l2e update\n");
2267 result |= SHADOW_SET_ERROR;
2270 // do not call shadow_set_l2e...
2271 return result;
2274 #endif /* CONFIG_PAGING_LEVELS == SHADOW_PAGING_LEVELS */
2275 #endif /* SHADOW_PAGING_LEVELS < 4 */
2277 result |= shadow_set_l2e(v, sl2p, new_sl2e, sl2mfn);
2279 return result;
2282 static int validate_gl1e(struct vcpu *v, void *new_ge, mfn_t sl1mfn, void *se)
2284 shadow_l1e_t new_sl1e;
2285 guest_l1e_t *new_gl1e = new_ge;
2286 shadow_l1e_t *sl1p = se;
2287 gfn_t gfn;
2288 mfn_t gmfn;
2289 int result = 0, mmio;
2291 perfc_incrc(shadow_validate_gl1e_calls);
2293 gfn = guest_l1e_get_gfn(*new_gl1e);
2294 gmfn = vcpu_gfn_to_mfn(v, gfn);
2296 mmio = (is_hvm_vcpu(v) && shadow_vcpu_mode_translate(v) && !valid_mfn(gmfn));
2297 l1e_propagate_from_guest(v, new_gl1e, _mfn(INVALID_MFN), gmfn, &new_sl1e,
2298 ft_prefetch, mmio);
2300 result |= shadow_set_l1e(v, sl1p, new_sl1e, sl1mfn);
2301 return result;
2305 /**************************************************************************/
2306 /* Functions which translate and install the shadows of arbitrary guest
2307 * entries that we have just seen the guest write. */
2310 static inline int
2311 sh_map_and_validate(struct vcpu *v, mfn_t gmfn,
2312 void *new_gp, u32 size, u32 sh_type,
2313 u32 (*shadow_index)(mfn_t *smfn, u32 idx),
2314 int (*validate_ge)(struct vcpu *v, void *ge,
2315 mfn_t smfn, void *se))
2316 /* Generic function for mapping and validating. */
2318 mfn_t smfn, smfn2, map_mfn;
2319 shadow_l1e_t *sl1p;
2320 u32 shadow_idx, guest_idx;
2321 int result = 0;
2323 /* Align address and size to guest entry boundaries */
2324 size += (unsigned long)new_gp & (sizeof (guest_l1e_t) - 1);
2325 new_gp = (void *)((unsigned long)new_gp & ~(sizeof (guest_l1e_t) - 1));
2326 size = (size + sizeof (guest_l1e_t) - 1) & ~(sizeof (guest_l1e_t) - 1);
2327 ASSERT(size + (((unsigned long)new_gp) & ~PAGE_MASK) <= PAGE_SIZE);
2329 /* Map the shadow page */
2330 smfn = get_shadow_status(v, gmfn, sh_type);
2331 ASSERT(valid_mfn(smfn)); /* Otherwise we would not have been called */
2332 guest_idx = guest_index(new_gp);
2333 map_mfn = smfn;
2334 shadow_idx = shadow_index(&map_mfn, guest_idx);
2335 sl1p = map_shadow_page(map_mfn);
2337 /* Validate one entry at a time */
2338 while ( size )
2340 smfn2 = smfn;
2341 guest_idx = guest_index(new_gp);
2342 shadow_idx = shadow_index(&smfn2, guest_idx);
2343 if ( mfn_x(smfn2) != mfn_x(map_mfn) )
2345 /* We have moved to another page of the shadow */
2346 map_mfn = smfn2;
2347 unmap_shadow_page(sl1p);
2348 sl1p = map_shadow_page(map_mfn);
2350 result |= validate_ge(v,
2351 new_gp,
2352 map_mfn,
2353 &sl1p[shadow_idx]);
2354 size -= sizeof(guest_l1e_t);
2355 new_gp += sizeof(guest_l1e_t);
2357 unmap_shadow_page(sl1p);
2358 return result;
2362 int
2363 sh_map_and_validate_gl4e(struct vcpu *v, mfn_t gl4mfn,
2364 void *new_gl4p, u32 size)
2366 #if GUEST_PAGING_LEVELS >= 4
2367 return sh_map_and_validate(v, gl4mfn, new_gl4p, size,
2368 PGC_SH_l4_shadow,
2369 shadow_l4_index,
2370 validate_gl4e);
2371 #else // ! GUEST_PAGING_LEVELS >= 4
2372 SHADOW_PRINTK("called in wrong paging mode!\n");
2373 BUG();
2374 return 0;
2375 #endif
2378 int
2379 sh_map_and_validate_gl3e(struct vcpu *v, mfn_t gl3mfn,
2380 void *new_gl3p, u32 size)
2382 #if GUEST_PAGING_LEVELS >= 4
2383 return sh_map_and_validate(v, gl3mfn, new_gl3p, size,
2384 PGC_SH_l3_shadow,
2385 shadow_l3_index,
2386 validate_gl3e);
2387 #else // ! GUEST_PAGING_LEVELS >= 4
2388 SHADOW_PRINTK("called in wrong paging mode!\n");
2389 BUG();
2390 return 0;
2391 #endif
2394 int
2395 sh_map_and_validate_gl2e(struct vcpu *v, mfn_t gl2mfn,
2396 void *new_gl2p, u32 size)
2398 return sh_map_and_validate(v, gl2mfn, new_gl2p, size,
2399 PGC_SH_l2_shadow,
2400 shadow_l2_index,
2401 validate_gl2e);
2404 int
2405 sh_map_and_validate_gl2he(struct vcpu *v, mfn_t gl2mfn,
2406 void *new_gl2p, u32 size)
2408 #if GUEST_PAGING_LEVELS == 3
2409 return sh_map_and_validate(v, gl2mfn, new_gl2p, size,
2410 PGC_SH_l2h_shadow,
2411 shadow_l2_index,
2412 validate_gl2e);
2413 #else /* Non-PAE guests don't have different kinds of l2 table */
2414 SHADOW_PRINTK("called in wrong paging mode!\n");
2415 BUG();
2416 return 0;
2417 #endif
2420 int
2421 sh_map_and_validate_gl1e(struct vcpu *v, mfn_t gl1mfn,
2422 void *new_gl1p, u32 size)
2424 return sh_map_and_validate(v, gl1mfn, new_gl1p, size,
2425 PGC_SH_l1_shadow,
2426 shadow_l1_index,
2427 validate_gl1e);
2431 /**************************************************************************/
2432 /* Optimization: If we see two emulated writes of zeros to the same
2433 * page-table without another kind of page fault in between, we guess
2434 * that this is a batch of changes (for process destruction) and
2435 * unshadow the page so we don't take a pagefault on every entry. This
2436 * should also make finding writeable mappings of pagetables much
2437 * easier. */
2439 /* Look to see if this is the second emulated write in a row to this
2440 * page, and unshadow/unhook if it is */
2441 static inline void check_for_early_unshadow(struct vcpu *v, mfn_t gmfn)
2443 #if SHADOW_OPTIMIZATIONS & SHOPT_EARLY_UNSHADOW
2444 if ( v->arch.shadow.last_emulated_mfn == mfn_x(gmfn) &&
2445 sh_mfn_is_a_page_table(gmfn) )
2447 u32 flags = mfn_to_page(gmfn)->shadow_flags;
2448 if ( !(flags & (SHF_L2_32|SHF_L2_PAE|SHF_L2H_PAE|SHF_L4_64)) )
2450 perfc_incrc(shadow_early_unshadow);
2451 sh_remove_shadows(v, gmfn, 1, 0 /* Fast, can fail to unshadow */ );
2454 v->arch.shadow.last_emulated_mfn = mfn_x(gmfn);
2455 #endif
2458 /* Stop counting towards early unshadows, as we've seen a real page fault */
2459 static inline void reset_early_unshadow(struct vcpu *v)
2461 #if SHADOW_OPTIMIZATIONS & SHOPT_EARLY_UNSHADOW
2462 v->arch.shadow.last_emulated_mfn = INVALID_MFN;
2463 #endif
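/* Sketch (never compiled) of how the two hooks above are meant to pair up
 * across the fault and emulation paths: */
#if 0
    /* ...on the emulated-write path, after a write of zeros to a pagetable: */
    check_for_early_unshadow(v, gmfn);
    /* ...and whenever a fault is resolved by any other route: */
    reset_early_unshadow(v);
#endif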
2468 /**************************************************************************/
2469 /* Optimization: Prefetch multiple L1 entries. This is called after we have
2470 * demand-faulted a shadow l1e in the fault handler, to see if it's
2471 * worth fetching some more.
2472 */
2474 #if SHADOW_OPTIMIZATIONS & SHOPT_PREFETCH
2476 /* XXX magic number */
2477 #define PREFETCH_DISTANCE 32
2479 static void sh_prefetch(struct vcpu *v, walk_t *gw,
2480 shadow_l1e_t *ptr_sl1e, mfn_t sl1mfn)
2482 int i, dist, mmio;
2483 gfn_t gfn;
2484 mfn_t gmfn;
2485 guest_l1e_t gl1e;
2486 shadow_l1e_t sl1e;
2487 u32 gflags;
2489 /* Prefetch no further than the end of the _shadow_ l1 MFN */
2490 dist = (PAGE_SIZE - ((unsigned long)ptr_sl1e & ~PAGE_MASK)) / sizeof sl1e;
2491 /* And no more than a maximum fetches-per-fault */
2492 if ( dist > PREFETCH_DISTANCE )
2493 dist = PREFETCH_DISTANCE;
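/* Worked example (illustrative): with 4k pages, if the faulting sl1e sits
 * at byte offset 0xff0 of its shadow page and sizeof(sl1e) is 8, then
 * dist = (0x1000 - 0xff0) / 8 == 2, so the loop below prefetches at most
 * one further entry regardless of PREFETCH_DISTANCE. */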
2495 for ( i = 1; i < dist ; i++ )
2497 /* No point in prefetching if there's already a shadow */
2498 if ( ptr_sl1e[i].l1 != 0 )
2499 break;
2501 if ( gw->l1e )
2503 /* Normal guest page; grab the next guest entry */
2504 gl1e = gw->l1e[i];
2505 /* Not worth continuing if we hit an entry that will need another
2506 * fault for A/D-bit propagation anyway */
2507 gflags = guest_l1e_get_flags(gl1e);
2508 if ( (gflags & _PAGE_PRESENT)
2509 && (!(gflags & _PAGE_ACCESSED)
2510 || ((gflags & _PAGE_RW) && !(gflags & _PAGE_DIRTY))) )
2511 break;
2513 else
2515 /* Fragmented superpage, unless we've been called wrongly */
2516 ASSERT(guest_l2e_get_flags(*gw->l2e) & _PAGE_PSE);
2517 /* Increment the l1e's GFN by the right number of guest pages */
2518 gl1e = guest_l1e_from_gfn(
2519 _gfn(gfn_x(guest_l1e_get_gfn(gw->eff_l1e)) + i),
2520 guest_l1e_get_flags(gw->eff_l1e));
2523 /* Look at the gfn that the l1e is pointing at */
2524 gfn = guest_l1e_get_gfn(gl1e);
2525 gmfn = vcpu_gfn_to_mfn(v, gfn);
2526 mmio = ( is_hvm_vcpu(v)
2527 && shadow_vcpu_mode_translate(v)
2528 && mmio_space(gfn_to_paddr(gfn)) );
2530 /* Propagate the entry. Safe to use a pointer to our local
2531 * gl1e, since this is not a demand-fetch so there will be no
2532 * write-back to the guest. */
2533 l1e_propagate_from_guest(v, &gl1e, _mfn(INVALID_MFN),
2534 gmfn, &sl1e, ft_prefetch, mmio);
2535 (void) shadow_set_l1e(v, ptr_sl1e + i, sl1e, sl1mfn);
2539 #endif /* SHADOW_OPTIMIZATIONS & SHOPT_PREFETCH */
2542 /**************************************************************************/
2543 /* Entry points into the shadow code */
2545 /* Called from pagefault handler in Xen, and from the HVM trap handlers
2546 * for pagefaults. Returns 1 if this fault was an artefact of the
2547 * shadow code (and the guest should retry) or 0 if it is not (and the
2548 * fault should be handled elsewhere or passed to the guest). */
2550 static int sh_page_fault(struct vcpu *v,
2551 unsigned long va,
2552 struct cpu_user_regs *regs)
2554 struct domain *d = v->domain;
2555 walk_t gw;
2556 u32 accumulated_gflags;
2557 gfn_t gfn;
2558 mfn_t gmfn, sl1mfn=_mfn(0);
2559 shadow_l1e_t sl1e, *ptr_sl1e;
2560 paddr_t gpa;
2561 struct cpu_user_regs emul_regs;
2562 struct x86_emulate_ctxt emul_ctxt;
2563 int r, mmio;
2564 fetch_type_t ft = 0;
2566 SHADOW_PRINTK("d:v=%u:%u va=%#lx err=%u\n",
2567 v->domain->domain_id, v->vcpu_id, va, regs->error_code);
2569 //
2570 // XXX: Need to think about eventually mapping superpages directly in the
2571 // shadow (when possible), as opposed to splintering them into a
2572 // bunch of 4K maps.
2573 //
2575 #if (SHADOW_OPTIMIZATIONS & SHOPT_FAST_FAULT_PATH) && SHADOW_PAGING_LEVELS > 2
2576 if ( (regs->error_code & PFEC_reserved_bit) )
2578 /* The only reasons for reserved bits to be set in shadow entries
2579 * are the two "magic" shadow_l1e entries. */
2580 if ( likely((__copy_from_user(&sl1e,
2581 (sh_linear_l1_table(v)
2582 + shadow_l1_linear_offset(va)),
2583 sizeof(sl1e)) == 0)
2584 && sh_l1e_is_magic(sl1e)) )
2586 if ( sh_l1e_is_gnp(sl1e) )
2588 if ( likely(!is_hvm_domain(d) ||
2589 shadow_vcpu_mode_translate(v)) )
2591 /* Not-present in a guest PT: pass to the guest as
2592 * a not-present fault (by flipping two bits). */
2593 ASSERT(regs->error_code & PFEC_page_present);
2594 regs->error_code ^= (PFEC_reserved_bit|PFEC_page_present);
2595 perfc_incrc(shadow_fault_fast_gnp);
2596 SHADOW_PRINTK("fast path not-present\n");
2597 return 0;
2599 else
2601 /* Not-present in the P2M: MMIO */
2602 gpa = va;
2605 else
2607 /* Magic MMIO marker: extract gfn for MMIO address */
2608 ASSERT(sh_l1e_is_mmio(sl1e));
2609 gpa = (((paddr_t)(gfn_x(sh_l1e_mmio_get_gfn(sl1e))))
2610 << PAGE_SHIFT)
2611 | (va & ~PAGE_MASK);
2613 perfc_incrc(shadow_fault_fast_mmio);
2614 SHADOW_PRINTK("fast path mmio %#"PRIpaddr"\n", gpa);
2615 reset_early_unshadow(v);
2616 handle_mmio(gpa);
2617 return EXCRET_fault_fixed;
2619 else
2621 /* This should be exceptionally rare: another vcpu has fixed
2622 * the tables between the fault and our reading the l1e.
2623 * Fall through to the normal fault handling logic */
2624 perfc_incrc(shadow_fault_fast_fail);
2625 SHADOW_PRINTK("fast path false alarm!\n");
2626 /* Don't pass the reserved-bit bit: if we look at the fault
2627 * below and decide to pass it to the guest, the reserved-bit
2628 * bit won't make sense there. */
2629 regs->error_code &= ~PFEC_reserved_bit;
2632 #endif /* SHOPT_FAST_FAULT_PATH */
2634 shadow_lock(d);
2636 shadow_audit_tables(v);
2638 if ( guest_walk_tables(v, va, &gw, 1) != 0 )
2640 SHADOW_PRINTK("malformed guest pagetable!\n");
2641 print_gw(&gw);
2644 sh_audit_gw(v, &gw);
2646 // We do not look at the gw->l1e, as that will not exist for superpages.
2647 // Instead, we use the gw->eff_l1e...
2648 //
2649 // We need not check all the levels of the guest page table entries for
2650 // present vs not-present, as the eff_l1e will always be not present if
2651 // one of the higher level entries is not present.
2652 //
2653 if ( unlikely(!(guest_l1e_get_flags(gw.eff_l1e) & _PAGE_PRESENT)) )
2655 if ( is_hvm_domain(d) && !shadow_vcpu_mode_translate(v) )
2657 /* Not present in p2m map, means this is mmio */
2658 gpa = va;
2659 goto mmio;
2662 perfc_incrc(shadow_fault_bail_not_present);
2663 goto not_a_shadow_fault;
2666 // All levels of the guest page table are now known to be present.
2667 accumulated_gflags = accumulate_guest_flags(v, &gw);
2669 // Check for attempts to access supervisor-only pages from user mode,
2670 // i.e. ring 3. Such errors are not caused or dealt with by the shadow
2671 // code.
2672 //
2673 if ( (regs->error_code & PFEC_user_mode) &&
2674 !(accumulated_gflags & _PAGE_USER) )
2676 /* illegal user-mode access to supervisor-only page */
2677 perfc_incrc(shadow_fault_bail_user_supervisor);
2678 goto not_a_shadow_fault;
2681 // Was it a write fault?
2682 ft = ((regs->error_code & PFEC_write_access)
2683 ? ft_demand_write : ft_demand_read);
2684 if ( ft == ft_demand_write )
2686 if ( unlikely(!(accumulated_gflags & _PAGE_RW)) )
2688 perfc_incrc(shadow_fault_bail_ro_mapping);
2689 goto not_a_shadow_fault;
2692 else // must have been either an insn fetch or read fault
2694 // Check for NX bit violations: attempts to execute code that is
2695 // marked "do not execute". Such errors are not caused or dealt with
2696 // by the shadow code.
2697 //
2698 if ( regs->error_code & PFEC_insn_fetch )
2700 if ( accumulated_gflags & _PAGE_NX_BIT )
2702 /* NX prevented this code fetch */
2703 perfc_incrc(shadow_fault_bail_nx);
2704 goto not_a_shadow_fault;
2709 /* What mfn is the guest trying to access? */
2710 gfn = guest_l1e_get_gfn(gw.eff_l1e);
2711 gmfn = vcpu_gfn_to_mfn(v, gfn);
2712 mmio = (is_hvm_domain(d)
2713 && shadow_vcpu_mode_translate(v)
2714 && mmio_space(gfn_to_paddr(gfn)));
2716 if ( !mmio && !valid_mfn(gmfn) )
2718 perfc_incrc(shadow_fault_bail_bad_gfn);
2719 SHADOW_PRINTK("BAD gfn=%"SH_PRI_gfn" gmfn=%"SH_PRI_mfn"\n",
2720 gfn_x(gfn), mfn_x(gmfn));
2721 goto not_a_shadow_fault;
2724 /* Make sure there is enough free shadow memory to build a chain of
2725 * shadow tables: one SHADOW_MAX_ORDER chunk will always be enough
2726 * to allocate all we need. (We never allocate a top-level shadow
2727 * on this path, only a 32b l1, pae l2+1 or 64b l3+2+1) */
2728 shadow_prealloc(d, SHADOW_MAX_ORDER);
2730 /* Acquire the shadow. This must happen before we figure out the rights
2731 * for the shadow entry, since we might promote a page here. */
2732 // XXX -- this code will need to change somewhat if/when the shadow code
2733 // can directly map superpages...
2734 ptr_sl1e = shadow_get_and_create_l1e(v, &gw, &sl1mfn, ft);
2735 ASSERT(ptr_sl1e);
2737 /* Calculate the shadow entry and write it */
2738 l1e_propagate_from_guest(v, (gw.l1e) ? gw.l1e : &gw.eff_l1e, gw.l1mfn,
2739 gmfn, &sl1e, ft, mmio);
2740 r = shadow_set_l1e(v, ptr_sl1e, sl1e, sl1mfn);
2742 #if SHADOW_OPTIMIZATIONS & SHOPT_PREFETCH
2743 /* Prefetch some more shadow entries */
2744 sh_prefetch(v, &gw, ptr_sl1e, sl1mfn);
2745 #endif
2747 /* Need to emulate accesses to page tables */
2748 if ( sh_mfn_is_a_page_table(gmfn) )
2750 if ( ft == ft_demand_write )
2752 perfc_incrc(shadow_fault_emulate_write);
2753 goto emulate;
2755 else if ( shadow_mode_trap_reads(d) && ft == ft_demand_read )
2757 perfc_incrc(shadow_fault_emulate_read);
2758 goto emulate;
2762 if ( mmio )
2764 gpa = guest_walk_to_gpa(&gw);
2765 goto mmio;
2768 perfc_incrc(shadow_fault_fixed);
2769 d->arch.shadow.fault_count++;
2770 reset_early_unshadow(v);
2772 done:
2773 sh_audit_gw(v, &gw);
2774 unmap_walk(v, &gw);
2775 SHADOW_PRINTK("fixed\n");
2776 shadow_audit_tables(v);
2777 shadow_unlock(d);
2778 return EXCRET_fault_fixed;
2780 emulate:
2781 /* Take the register set we were called with */
2782 emul_regs = *regs;
2783 if ( is_hvm_domain(d) )
2786 /* Add the guest's segment selectors, rip, rsp, rflags */
2786 hvm_store_cpu_guest_regs(v, &emul_regs, NULL);
2788 emul_ctxt.regs = &emul_regs;
2789 emul_ctxt.cr2 = va;
2790 emul_ctxt.mode = (is_hvm_domain(d) ?
2791 hvm_guest_x86_mode(v) : X86EMUL_MODE_HOST);
2793 SHADOW_PRINTK("emulate: eip=%#lx\n", emul_regs.eip);
2795 v->arch.shadow.propagate_fault = 0;
2797 /*
2798 * We do not emulate user writes. Instead we use them as a hint that the
2799 * page is no longer a page table. This behaviour differs from native, but
2800 * it seems very unlikely that any OS grants user access to page tables.
2801 * We also disallow guest PTE updates from within Xen.
2802 */
2803 if ( (regs->error_code & PFEC_user_mode) || !guest_mode(regs) ||
2804 x86_emulate_memop(&emul_ctxt, &shadow_emulator_ops) )
2806 SHADOW_PRINTK("emulator failure, unshadowing mfn %#lx\n",
2807 mfn_x(gmfn));
2808 perfc_incrc(shadow_fault_emulate_failed);
2809 /* If this is actually a page table, then we have a bug, and need
2810 * to support more operations in the emulator. More likely,
2811 * though, this is a hint that this page should not be shadowed. */
2812 shadow_remove_all_shadows(v, gmfn);
2813 /* This means that actual missing operations will cause the
2814 * guest to loop on the same page fault. */
2815 goto done;
2818 /* Emulation triggered another page fault? */
2819 if ( v->arch.shadow.propagate_fault )
2820 goto not_a_shadow_fault;
2822 /* Emulator has changed the user registers: write back */
2823 if ( is_hvm_domain(d) )
2826 /* Write back the guest's segment selectors, rip, rsp, rflags */
2826 hvm_load_cpu_guest_regs(v, &emul_regs);
2827 /* And don't overwrite those in the caller's regs. */
2828 emul_regs.eip = regs->eip;
2829 emul_regs.cs = regs->cs;
2830 emul_regs.eflags = regs->eflags;
2831 emul_regs.esp = regs->esp;
2832 emul_regs.ss = regs->ss;
2833 emul_regs.es = regs->es;
2834 emul_regs.ds = regs->ds;
2835 emul_regs.fs = regs->fs;
2836 emul_regs.gs = regs->gs;
2838 *regs = emul_regs;
2840 goto done;
2842 mmio:
2843 if ( !guest_mode(regs) )
2844 goto not_a_shadow_fault;
2845 perfc_incrc(shadow_fault_mmio);
2846 sh_audit_gw(v, &gw);
2847 unmap_walk(v, &gw);
2848 SHADOW_PRINTK("mmio %#"PRIpaddr"\n", gpa);
2849 shadow_audit_tables(v);
2850 reset_early_unshadow(v);
2851 shadow_unlock(d);
2852 handle_mmio(gpa);
2853 return EXCRET_fault_fixed;
2855 not_a_shadow_fault:
2856 sh_audit_gw(v, &gw);
2857 unmap_walk(v, &gw);
2858 SHADOW_PRINTK("not a shadow fault\n");
2859 shadow_audit_tables(v);
2860 reset_early_unshadow(v);
2861 shadow_unlock(d);
2862 return 0;
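/* Sketch (never compiled; handler name hypothetical) of the contract as seen
 * by a caller of sh_page_fault() above: a non-zero return means the shadow
 * code dealt with the fault (fixed the shadows, emulated the access, or sent
 * it to the MMIO handler) and the faulting access should simply be retried;
 * zero means the fault belongs to the guest and should be forwarded to it. */
#if 0
static void example_do_page_fault(struct vcpu *v, unsigned long va,
                                  struct cpu_user_regs *regs)
{
    if ( sh_page_fault(v, va, regs) )
        return;          /* shadow/MMIO fault: just resume the guest */
    /* ...otherwise inject the page fault into the guest... */
}
#endif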
2866 static int
2867 sh_invlpg(struct vcpu *v, unsigned long va)
2868 /* Called when the guest requests an invlpg. Returns 1 if the invlpg
2869 * instruction should be issued on the hardware, or 0 if it's safe not
2870 * to do so. */
2872 shadow_l2e_t sl2e;
2874 perfc_incrc(shadow_invlpg);
2876 /* First check that we can safely read the shadow l2e. SMP/PAE linux can
2877 * run as high as 6% of invlpg calls where we haven't shadowed the l2
2878 * yet. */
2879 #if SHADOW_PAGING_LEVELS == 4
2881 shadow_l3e_t sl3e;
2882 if ( !(shadow_l4e_get_flags(
2883 sh_linear_l4_table(v)[shadow_l4_linear_offset(va)])
2884 & _PAGE_PRESENT) )
2885 return 0;
2886 /* This must still be a copy-from-user because we don't have the
2887 * shadow lock, and the higher-level shadows might disappear
2888 * under our feet. */
2889 if ( __copy_from_user(&sl3e, (sh_linear_l3_table(v)
2890 + shadow_l3_linear_offset(va)),
2891 sizeof (sl3e)) != 0 )
2893 perfc_incrc(shadow_invlpg_fault);
2894 return 0;
2896 if ( !(shadow_l3e_get_flags(sl3e) & _PAGE_PRESENT) )
2897 return 0;
2899 #elif SHADOW_PAGING_LEVELS == 3
2900 if ( !(l3e_get_flags(v->arch.shadow.l3table[shadow_l3_linear_offset(va)])
2901 & _PAGE_PRESENT) )
2902 // no need to flush anything if there's no SL2...
2903 return 0;
2904 #endif
2906 /* This must still be a copy-from-user because we don't have the shadow
2907 * lock, and the higher-level shadows might disappear under our feet. */
2908 if ( __copy_from_user(&sl2e,
2909 sh_linear_l2_table(v) + shadow_l2_linear_offset(va),
2910 sizeof (sl2e)) != 0 )
2912 perfc_incrc(shadow_invlpg_fault);
2913 return 0;
2916 // If there's nothing shadowed for this particular sl2e, then
2917 // there is no need to do an invlpg, either...
2918 //
2919 if ( !(shadow_l2e_get_flags(sl2e) & _PAGE_PRESENT) )
2920 return 0;
2922 // Check to see if the SL2 is a splintered superpage...
2923 // If so, then we'll need to flush the entire TLB (because that's
2924 // easier than invalidating all of the individual 4K pages).
2925 //
2926 if ( (mfn_to_page(shadow_l2e_get_mfn(sl2e))->count_info &
2927 PGC_SH_type_mask) == PGC_SH_fl1_shadow )
2929 local_flush_tlb();
2930 return 0;
2933 return 1;
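/* Sketch (never compiled; wrapper and flush helper are hypothetical) of the
 * intended use of sh_invlpg() above: only spend a real INVLPG when it says
 * one is needed. */
#if 0
static void example_do_invlpg(struct vcpu *v, unsigned long va)
{
    if ( sh_invlpg(v, va) )
        local_flush_tlb_one(va);   /* hypothetical single-entry flush */
}
#endif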
2936 static unsigned long
2937 sh_gva_to_gfn(struct vcpu *v, unsigned long va)
2938 /* Called to translate a guest virtual address to what the *guest*
2939 * pagetables would map it to. */
2941 walk_t gw;
2942 gfn_t gfn;
2944 guest_walk_tables(v, va, &gw, 0);
2945 gfn = guest_walk_to_gfn(&gw);
2946 unmap_walk(v, &gw);
2948 return gfn_x(gfn);
2952 static paddr_t
2953 sh_gva_to_gpa(struct vcpu *v, unsigned long va)
2954 /* Called to translate a guest virtual address to the guest physical
2955 * address that the *guest* pagetables would map it to. */
2957 unsigned long gfn = sh_gva_to_gfn(v, va);
2958 if ( gfn == INVALID_GFN )
2959 return 0;
2960 else
2961 return (((paddr_t)gfn) << PAGE_SHIFT) + (va & ~PAGE_MASK);
2965 static inline void
2966 sh_update_linear_entries(struct vcpu *v)
2967 /* Sync up all the linear mappings for this vcpu's pagetables */
2969 struct domain *d = v->domain;
2971 /* Linear pagetables in PV guests
2972 * ------------------------------
2974 * Guest linear pagetables, which map the guest pages, are at
2975 * LINEAR_PT_VIRT_START. Shadow linear pagetables, which map the
2976 * shadows, are at SH_LINEAR_PT_VIRT_START. Most of the time these
2977 * are set up at shadow creation time, but (of course!) the PAE case
2978 * is subtler. Normal linear mappings are made by having an entry
2979 * in the top-level table that points to itself (shadow linear) or
2980 * to the guest top-level table (guest linear). For PAE, to set up
2981 * a linear map requires us to copy the four top-level entries into
2982 * level-2 entries. That means that every time we change a PAE l3e,
2983 * we need to reflect the change into the copy.
2985 * Linear pagetables in HVM guests
2986 * -------------------------------
2988 * For HVM guests, the linear pagetables are installed in the monitor
2989 * tables (since we can't put them in the shadow). Shadow linear
2990 * pagetables, which map the shadows, are at SH_LINEAR_PT_VIRT_START,
2991 * and we use the linear pagetable slot at LINEAR_PT_VIRT_START for
2992 * a linear pagetable of the monitor tables themselves. We have
2993 * the same issue of having to re-copy PAE l3 entries whenever we use
2994 * PAE shadows.
2996 * Because HVM guests run on the same monitor tables regardless of the
2997 * shadow tables in use, the linear mapping of the shadow tables has to
2998 * be updated every time v->arch.shadow_table changes.
2999 */
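/* Illustrative sketch (never compiled): the "self-pointing" trick the
 * comment above describes.  Hooking a top-level entry back onto its own
 * table makes the whole pagetable hierarchy readable as ordinary virtual
 * memory at LINEAR_PT_VIRT_START.  'l4table' and 'l4mfn' here are
 * hypothetical stand-ins for whichever top level is in use. */
#if 0
    l4table[l4_table_offset(LINEAR_PT_VIRT_START)] =
        l4e_from_pfn(mfn_x(l4mfn), __PAGE_HYPERVISOR);
#endif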
3001 /* Don't try to update the monitor table if it doesn't exist */
3002 if ( shadow_mode_external(d)
3003 && pagetable_get_pfn(v->arch.monitor_table) == 0 )
3004 return;
3006 #if (CONFIG_PAGING_LEVELS == 4) && (SHADOW_PAGING_LEVELS == 4)
3008 /* For PV, one l4e points at the guest l4, one points at the shadow
3009 * l4. No maintenance required.
3010 * For HVM, just need to update the l4e that points to the shadow l4. */
3012 if ( shadow_mode_external(d) )
3014 /* Use the linear map if we can; otherwise make a new mapping */
3015 if ( v == current )
3017 __linear_l4_table[l4_linear_offset(SH_LINEAR_PT_VIRT_START)] =
3018 l4e_from_pfn(pagetable_get_pfn(v->arch.shadow_table[0]),
3019 __PAGE_HYPERVISOR);
3021 else
3023 l4_pgentry_t *ml4e;
3024 ml4e = sh_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
3025 ml4e[l4_table_offset(SH_LINEAR_PT_VIRT_START)] =
3026 l4e_from_pfn(pagetable_get_pfn(v->arch.shadow_table[0]),
3027 __PAGE_HYPERVISOR);
3028 sh_unmap_domain_page(ml4e);
3032 #elif (CONFIG_PAGING_LEVELS == 4) && (SHADOW_PAGING_LEVELS == 3)
3034 /* This case only exists in HVM. To give ourselves a linear map of the
3035 * shadows, we need to extend a PAE shadow to 4 levels. We do this by
3036 * having a monitor l3 in slot 0 of the monitor l4 table, and
3037 * copying the PAE l3 entries into it. Then, by having the monitor l4e
3038 * for shadow pagetables also point to the monitor l4, we can use it
3039 * to access the shadows. */
3041 if ( shadow_mode_external(d) )
3043 /* Install copies of the shadow l3es into the monitor l3 table.
3044 * The monitor l3 table is hooked into slot 0 of the monitor
3045 * l4 table, so we use l3 linear indices 0 to 3 */
3046 shadow_l3e_t *sl3e;
3047 l3_pgentry_t *ml3e;
3048 mfn_t l3mfn;
3049 int i;
3051 /* Use linear mappings if we can; otherwise make new mappings */
3052 if ( v == current )
3054 ml3e = __linear_l3_table;
3055 l3mfn = _mfn(l4e_get_pfn(__linear_l4_table[0]));
3057 else
3059 l4_pgentry_t *ml4e;
3060 ml4e = sh_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
3061 ASSERT(l4e_get_flags(ml4e[0]) & _PAGE_PRESENT);
3062 l3mfn = _mfn(l4e_get_pfn(ml4e[0]));
3063 ml3e = sh_map_domain_page(l3mfn);
3064 sh_unmap_domain_page(ml4e);
3067 /* Shadow l3 tables are made up by update_cr3 */
3068 sl3e = v->arch.shadow.l3table;
3070 for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
3072 ml3e[i] =
3073 (shadow_l3e_get_flags(sl3e[i]) & _PAGE_PRESENT)
3074 ? l3e_from_pfn(mfn_x(shadow_l3e_get_mfn(sl3e[i])),
3075 __PAGE_HYPERVISOR)
3076 : l3e_empty();
3079 if ( v != current )
3080 sh_unmap_domain_page(ml3e);
3083 #elif CONFIG_PAGING_LEVELS == 3
3085 /* PV: need to copy the guest's l3 entries into the guest-linear-map l2
3086 * entries in the shadow, and the shadow's l3 entries into the
3087 * shadow-linear-map l2 entries in the shadow. This is safe to do
3088 * because Xen does not let guests share high-slot l2 tables between l3s,
3089 * so we know we're not treading on anyone's toes.
3091 * HVM: need to copy the shadow's l3 entries into the
3092 * shadow-linear-map l2 entries in the monitor table. This is safe
3093 * because we have one monitor table for each vcpu. The monitor's
3094 * own l3es don't need to be copied because they never change.
3095 * XXX That might change if we start stuffing things into the rest
3096 * of the monitor's virtual address space.
3097 */
3099 l2_pgentry_t *l2e, new_l2e;
3100 shadow_l3e_t *guest_l3e = NULL, *shadow_l3e;
3101 int i;
3102 int unmap_l2e = 0;
3104 #if GUEST_PAGING_LEVELS == 2
3105 /* Shadow l3 tables were built by update_cr3 */
3106 if ( shadow_mode_external(d) )
3107 shadow_l3e = (shadow_l3e_t *)&v->arch.shadow.l3table;
3108 else
3109 BUG(); /* PV 2-on-3 is not supported yet */
3111 #else /* GUEST_PAGING_LEVELS == 3 */
3113 shadow_l3e = (shadow_l3e_t *)&v->arch.shadow.l3table;
3114 /* Always safe to use guest_vtable, because it's globally mapped */
3115 guest_l3e = v->arch.guest_vtable;
3117 #endif /* GUEST_PAGING_LEVELS */
3119 /* Choose where to write the entries, using linear maps if possible */
3120 if ( shadow_mode_external(d) )
3122 if ( v == current )
3124 /* From the monitor tables, it's safe to use linear maps
3125 * to update monitor l2s */
3126 l2e = __linear_l2_table + (3 * L2_PAGETABLE_ENTRIES);
3128 else
3130 /* Map the monitor table's high l2 */
3131 l3_pgentry_t *l3e;
3132 l3e = sh_map_domain_page(
3133 pagetable_get_mfn(v->arch.monitor_table));
3134 ASSERT(l3e_get_flags(l3e[3]) & _PAGE_PRESENT);
3135 l2e = sh_map_domain_page(_mfn(l3e_get_pfn(l3e[3])));
3136 unmap_l2e = 1;
3137 sh_unmap_domain_page(l3e);
3140 else
3142 /* Map the shadow table's high l2 */
3143 ASSERT(shadow_l3e_get_flags(shadow_l3e[3]) & _PAGE_PRESENT);
3144 l2e = sh_map_domain_page(shadow_l3e_get_mfn(shadow_l3e[3]));
3145 unmap_l2e = 1;
3148 /* Write linear mapping of guest (only in PV, and only when
3149 * not translated). */
3150 if ( !shadow_mode_translate(d) )
3152 for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
3154 new_l2e =
3155 ((shadow_l3e_get_flags(guest_l3e[i]) & _PAGE_PRESENT)
3156 ? l2e_from_pfn(mfn_x(shadow_l3e_get_mfn(guest_l3e[i])),
3157 __PAGE_HYPERVISOR)
3158 : l2e_empty());
3159 safe_write_entry(
3160 &l2e[l2_table_offset(LINEAR_PT_VIRT_START) + i],
3161 &new_l2e);
3165 /* Write linear mapping of shadow. */
3166 for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
3168 new_l2e = (shadow_l3e_get_flags(shadow_l3e[i]) & _PAGE_PRESENT)
3169 ? l2e_from_pfn(mfn_x(shadow_l3e_get_mfn(shadow_l3e[i])),
3170 __PAGE_HYPERVISOR)
3171 : l2e_empty();
3172 safe_write_entry(
3173 &l2e[l2_table_offset(SH_LINEAR_PT_VIRT_START) + i],
3174 &new_l2e);
3177 if ( unmap_l2e )
3178 sh_unmap_domain_page(l2e);
3181 #elif CONFIG_PAGING_LEVELS == 2
3183 /* For PV, one l2e points at the guest l2, one points at the shadow
3184 * l2. No maintenance required.
3185 * For HVM, just need to update the l2e that points to the shadow l2. */
3187 if ( shadow_mode_external(d) )
3189 /* Use the linear map if we can; otherwise make a new mapping */
3190 if ( v == current )
3192 __linear_l2_table[l2_linear_offset(SH_LINEAR_PT_VIRT_START)] =
3193 l2e_from_pfn(pagetable_get_pfn(v->arch.shadow_table[0]),
3194 __PAGE_HYPERVISOR);
3196 else
3198 l2_pgentry_t *ml2e;
3199 ml2e = sh_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
3200 ml2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
3201 l2e_from_pfn(pagetable_get_pfn(v->arch.shadow_table[0]),
3202 __PAGE_HYPERVISOR);
3203 sh_unmap_domain_page(ml2e);
3207 #else
3208 #error this should not happen
3209 #endif
3213 /* Removes vcpu->arch.guest_vtable and vcpu->arch.shadow_table[].
3214 * Does all appropriate management/bookkeeping/refcounting/etc...
3215 */
3216 static void
3217 sh_detach_old_tables(struct vcpu *v)
3219 struct domain *d = v->domain;
3220 mfn_t smfn;
3221 int i = 0;
3223 ////
3224 //// vcpu->arch.guest_vtable
3225 ////
3226 if ( v->arch.guest_vtable )
3228 #if GUEST_PAGING_LEVELS == 4
3229 if ( shadow_mode_external(d) || shadow_mode_translate(d) )
3230 sh_unmap_domain_page_global(v->arch.guest_vtable);
3231 #elif GUEST_PAGING_LEVELS == 3
3232 if ( 1 || shadow_mode_external(d) || shadow_mode_translate(d) )
3233 sh_unmap_domain_page_global(v->arch.guest_vtable);
3234 #elif GUEST_PAGING_LEVELS == 2
3235 if ( shadow_mode_external(d) || shadow_mode_translate(d) )
3236 sh_unmap_domain_page_global(v->arch.guest_vtable);
3237 #endif
3238 v->arch.guest_vtable = NULL;
3241 ////
3242 //// vcpu->arch.shadow_table[]
3243 ////
3246 #if GUEST_PAGING_LEVELS == 3
3247 /* PAE guests have four shadow_table entries */
3248 for ( i = 0 ; i < 4 ; i++ )
3249 #endif
3251 smfn = pagetable_get_mfn(v->arch.shadow_table[i]);
3252 if ( mfn_x(smfn) )
3253 sh_put_ref(v, smfn, 0);
3254 v->arch.shadow_table[i] = pagetable_null();
3258 /* Set up the top-level shadow and install it in slot 'slot' of shadow_table */
3259 static void
3260 sh_set_toplevel_shadow(struct vcpu *v,
3261 int slot,
3262 mfn_t gmfn,
3263 unsigned int root_type)
3265 mfn_t smfn = get_shadow_status(v, gmfn, root_type);
3266 struct domain *d = v->domain;
3267 ASSERT(pagetable_is_null(v->arch.shadow_table[slot]));
3268 if ( valid_mfn(smfn) )
3270 /* Pull this root shadow to the front of the list of roots. */
3271 list_del(&mfn_to_page(smfn)->list);
3272 list_add(&mfn_to_page(smfn)->list, &d->arch.shadow.toplevel_shadows);
3274 else
3276 /* This guest MFN is a pagetable. Must revoke write access
3277 * (and can't use heuristics because we have no linear map here). */
3278 if ( shadow_remove_write_access(v, gmfn, 0, 0) != 0 )
3279 flush_tlb_mask(v->domain->domain_dirty_cpumask);
3280 /* Make sure there's enough free shadow memory. */
3281 shadow_prealloc(d, SHADOW_MAX_ORDER);
3282 /* Shadow the page. */
3283 smfn = sh_make_shadow(v, gmfn, root_type);
3284 list_add(&mfn_to_page(smfn)->list, &d->arch.shadow.toplevel_shadows);
3286 ASSERT(valid_mfn(smfn));
3288 #if SHADOW_OPTIMIZATIONS & SHOPT_EARLY_UNSHADOW
3289 /* Once again OK to unhook entries from this table if we see fork/exit */
3290 ASSERT(sh_mfn_is_a_page_table(gmfn));
3291 mfn_to_page(gmfn)->shadow_flags &= ~SHF_unhooked_mappings;
3292 #endif
3294 /* Take a ref to this page: it will be released in sh_detach_old_tables. */
3295 sh_get_ref(smfn, 0);
3296 sh_pin(smfn);
3298 /* Done. Install it */
3299 SHADOW_PRINTK("%u/%u [%u] gmfn %#"SH_PRI_mfn" smfn %#"SH_PRI_mfn"\n",
3300 GUEST_PAGING_LEVELS, SHADOW_PAGING_LEVELS, slot,
3301 mfn_x(gmfn), mfn_x(smfn));
3302 v->arch.shadow_table[slot] = pagetable_from_mfn(smfn);
3306 static void
3307 sh_update_cr3(struct vcpu *v)
3308 /* Updates vcpu->arch.cr3 after the guest has changed CR3.
3309 * Paravirtual guests should set v->arch.guest_table (and guest_table_user,
3310 * if appropriate).
3311 * HVM guests should also make sure hvm_get_guest_ctrl_reg(v, 3) works,
3312 * and read vcpu->arch.hvm_vcpu.hw_cr3 afterwards.
3313 */
3315 struct domain *d = v->domain;
3316 mfn_t gmfn;
3317 #if GUEST_PAGING_LEVELS == 3
3318 u32 guest_idx=0;
3319 #endif
3321 ASSERT(shadow_lock_is_acquired(v->domain));
3322 ASSERT(v->arch.shadow.mode);
3324 ////
3325 //// vcpu->arch.guest_table is already set
3326 ////
3328 #ifndef NDEBUG
3329 /* Double-check that the HVM code has sent us a sane guest_table */
3330 if ( is_hvm_domain(d) )
3332 gfn_t gfn;
3334 ASSERT(shadow_mode_external(d));
3336 // Is paging enabled on this vcpu?
3337 if ( shadow_vcpu_mode_translate(v) )
3339 gfn = _gfn(paddr_to_pfn(hvm_get_guest_ctrl_reg(v, 3)));
3340 gmfn = vcpu_gfn_to_mfn(v, gfn);
3341 ASSERT(valid_mfn(gmfn));
3342 ASSERT(pagetable_get_pfn(v->arch.guest_table) == mfn_x(gmfn));
3344 else
3346 /* Paging disabled: guest_table points at (part of) p2m */
3347 #if SHADOW_PAGING_LEVELS != 3 /* in 3-on-4, guest-table is in slot 0 of p2m */
3348 /* For everything else, they should be the same */
3349 ASSERT(v->arch.guest_table.pfn == d->arch.phys_table.pfn);
3350 #endif
3353 #endif
3355 SHADOW_PRINTK("d=%u v=%u guest_table=%05lx\n",
3356 d->domain_id, v->vcpu_id,
3357 (unsigned long)pagetable_get_pfn(v->arch.guest_table));
3359 #if GUEST_PAGING_LEVELS == 4
3360 if ( !(v->arch.flags & TF_kernel_mode) )
3361 gmfn = pagetable_get_mfn(v->arch.guest_table_user);
3362 else
3363 #endif
3364 gmfn = pagetable_get_mfn(v->arch.guest_table);
3366 sh_detach_old_tables(v);
3368 if ( !is_hvm_domain(d) && !test_bit(_VCPUF_initialised, &v->vcpu_flags) )
3370 ASSERT(v->arch.cr3 == 0);
3371 return;
3374 ////
3375 //// vcpu->arch.guest_vtable
3376 ////
3377 #if GUEST_PAGING_LEVELS == 4
3378 if ( shadow_mode_external(d) || shadow_mode_translate(d) )
3379 v->arch.guest_vtable = sh_map_domain_page_global(gmfn);
3380 else
3381 v->arch.guest_vtable = __linear_l4_table;
3382 #elif GUEST_PAGING_LEVELS == 3
3383 if ( shadow_mode_external(d) )
3385 if ( shadow_vcpu_mode_translate(v) )
3386 /* Paging enabled: find where in the page the l3 table is */
3387 guest_idx = guest_index((void *)hvm_get_guest_ctrl_reg(v, 3));
3388 else
3389 /* Paging disabled: l3 is at the start of a page (in the p2m) */
3390 guest_idx = 0;
3392 // Ignore the low 2 bits of guest_idx -- they are really just
3393 // cache control.
3394 guest_idx &= ~3;
3396 // XXX - why does this need a global map?
3397 v->arch.guest_vtable =
3398 (guest_l3e_t *)sh_map_domain_page_global(gmfn) + guest_idx;
3400 else
3401 v->arch.guest_vtable = sh_map_domain_page_global(gmfn);
3402 #elif GUEST_PAGING_LEVELS == 2
3403 if ( shadow_mode_external(d) || shadow_mode_translate(d) )
3404 v->arch.guest_vtable = sh_map_domain_page_global(gmfn);
3405 else
3406 v->arch.guest_vtable = __linear_l2_table;
3407 #else
3408 #error this should never happen
3409 #endif
3411 #if 0
3412 printk("%s %s %d gmfn=%05lx guest_vtable=%p\n",
3413 __func__, __FILE__, __LINE__, gmfn, v->arch.guest_vtable);
3414 #endif
3416 ////
3417 //// vcpu->arch.shadow_table[]
3418 ////
3420 #if GUEST_PAGING_LEVELS == 2
3421 sh_set_toplevel_shadow(v, 0, gmfn, PGC_SH_l2_shadow);
3422 #elif GUEST_PAGING_LEVELS == 3
3423 /* PAE guests have four shadow_table entries, based on the
3424 * current values of the guest's four l3es. */
3426 int i;
3427 guest_l3e_t *gl3e = (guest_l3e_t*)v->arch.guest_vtable;
3428 for ( i = 0; i < 4; i++ )
3430 ASSERT(pagetable_is_null(v->arch.shadow_table[i]));
3431 if ( guest_l3e_get_flags(gl3e[i]) & _PAGE_PRESENT )
3433 gfn_t gl2gfn = guest_l3e_get_gfn(gl3e[i]);
3434 mfn_t gl2mfn = vcpu_gfn_to_mfn(v, gl2gfn);
3435 if ( valid_mfn(gl2mfn) )
3436 sh_set_toplevel_shadow(v, i, gl2mfn, (i == 3)
3437 ? PGC_SH_l2h_shadow
3438 : PGC_SH_l2_shadow);
3442 #elif GUEST_PAGING_LEVELS == 4
3443 sh_set_toplevel_shadow(v, 0, gmfn, PGC_SH_l4_shadow);
3444 #else
3445 #error This should never happen
3446 #endif
3448 #if (CONFIG_PAGING_LEVELS == 3) && (GUEST_PAGING_LEVELS == 3)
3449 #endif
3451 ///
3452 /// v->arch.shadow.l3table
3453 ///
3454 #if SHADOW_PAGING_LEVELS == 3
3456 mfn_t smfn;
3457 int i;
3458 for ( i = 0; i < 4; i++ )
3460 #if GUEST_PAGING_LEVELS == 2
3461 /* 2-on-3: make a PAE l3 that points at the four-page l2 */
3462 smfn = _mfn(pagetable_get_pfn(v->arch.shadow_table[0]) + i);
3463 #else
3464 /* 3-on-3: make a PAE l3 that points at the four l2 pages */
3465 smfn = pagetable_get_mfn(v->arch.shadow_table[i]);
3466 #endif
3467 v->arch.shadow.l3table[i] =
3468 (mfn_x(smfn) == 0)
3469 ? shadow_l3e_empty()
3470 : shadow_l3e_from_mfn(smfn, _PAGE_PRESENT);
3473 #endif /* SHADOW_PAGING_LEVELS == 3 */
3476 ///
3477 /// v->arch.cr3
3478 ///
3479 if ( shadow_mode_external(d) )
3481 make_cr3(v, pagetable_get_pfn(v->arch.monitor_table));
3483 else // not shadow_mode_external...
3485 /* We don't support PV except guest == shadow == config levels */
3486 BUG_ON(GUEST_PAGING_LEVELS != SHADOW_PAGING_LEVELS);
3487 #if SHADOW_PAGING_LEVELS == 3
3488 /* 2-on-3 or 3-on-3: Use the PAE shadow l3 table we just fabricated.
3489 * Don't use make_cr3 because (a) we know it's below 4GB, and
3490 * (b) it's not necessarily page-aligned, and make_cr3 takes a pfn */
3491 ASSERT(virt_to_maddr(&v->arch.shadow.l3table) <= 0xffffffe0ULL);
3492 v->arch.cr3 = virt_to_maddr(&v->arch.shadow.l3table);
3493 #else
3494 /* 2-on-2 or 4-on-4: Just use the shadow top-level directly */
3495 make_cr3(v, pagetable_get_pfn(v->arch.shadow_table[0]));
3496 #endif
3500 ///
3501 /// v->arch.hvm_vcpu.hw_cr3
3502 ///
3503 if ( shadow_mode_external(d) )
3505 ASSERT(is_hvm_domain(d));
3506 #if SHADOW_PAGING_LEVELS == 3
3507 /* 2-on-3 or 3-on-3: Use the PAE shadow l3 table we just fabricated */
3508 v->arch.hvm_vcpu.hw_cr3 = virt_to_maddr(&v->arch.shadow.l3table);
3509 #else
3510 /* 2-on-2 or 4-on-4: Just use the shadow top-level directly */
3511 v->arch.hvm_vcpu.hw_cr3 = pagetable_get_paddr(v->arch.shadow_table[0]);
3512 #endif
3515 /* Fix up the linear pagetable mappings */
3516 sh_update_linear_entries(v);
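/* To recap the three things set up above: v->arch.cr3 is the value Xen
 * itself loads into CR3 when it runs this vcpu (the monitor table in
 * external/HVM modes, otherwise the shadow top level or the PAE l3 just
 * fabricated); v->arch.hvm_vcpu.hw_cr3, set only for external modes, is
 * the CR3 the guest's hardware paging context runs on (the shadow top
 * level or the PAE l3); and sh_update_linear_entries() re-points the
 * linear self-mappings at the new shadow tables. */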
3520 /**************************************************************************/
3521 /* Functions to revoke guest rights */
3523 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
3524 static int sh_guess_wrmap(struct vcpu *v, unsigned long vaddr, mfn_t gmfn)
3525 /* Look up this vaddr in the current shadow and see if it's a writeable
3526 * mapping of this gmfn. If so, remove it. Returns 1 if it worked. */
3528 shadow_l1e_t sl1e, *sl1p;
3529 shadow_l2e_t *sl2p;
3530 #if GUEST_PAGING_LEVELS >= 3
3531 shadow_l3e_t *sl3p;
3532 #if GUEST_PAGING_LEVELS >= 4
3533 shadow_l4e_t *sl4p;
3534 #endif
3535 #endif
3536 mfn_t sl1mfn;
3539 /* Carefully look in the shadow linear map for the l1e we expect */
3540 #if GUEST_PAGING_LEVELS >= 4
3541 sl4p = sh_linear_l4_table(v) + shadow_l4_linear_offset(vaddr);
3542 if ( !(shadow_l4e_get_flags(*sl4p) & _PAGE_PRESENT) )
3543 return 0;
3544 sl3p = sh_linear_l3_table(v) + shadow_l3_linear_offset(vaddr);
3545 if ( !(shadow_l3e_get_flags(*sl3p) & _PAGE_PRESENT) )
3546 return 0;
3547 #elif GUEST_PAGING_LEVELS == 3
3548 sl3p = ((shadow_l3e_t *) v->arch.shadow.l3table)
3549 + shadow_l3_linear_offset(vaddr);
3550 if ( !(shadow_l3e_get_flags(*sl3p) & _PAGE_PRESENT) )
3551 return 0;
3552 #endif
3553 sl2p = sh_linear_l2_table(v) + shadow_l2_linear_offset(vaddr);
3554 if ( !(shadow_l2e_get_flags(*sl2p) & _PAGE_PRESENT) )
3555 return 0;
3556 sl1p = sh_linear_l1_table(v) + shadow_l1_linear_offset(vaddr);
3557 sl1e = *sl1p;
3558 if ( ((shadow_l1e_get_flags(sl1e) & (_PAGE_PRESENT|_PAGE_RW))
3559 != (_PAGE_PRESENT|_PAGE_RW))
3560 || (mfn_x(shadow_l1e_get_mfn(sl1e)) != mfn_x(gmfn)) )
3561 return 0;
3563 /* Found it! Need to remove its write permissions. */
3564 sl1mfn = shadow_l2e_get_mfn(*sl2p);
3565 sl1e = shadow_l1e_remove_flags(sl1e, _PAGE_RW);
3566 shadow_set_l1e(v, sl1p, sl1e, sl1mfn);
3567 return 1;
3569 #endif
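/* Intended use of sh_guess_wrmap() (a sketch; the actual caller lives in
 * the generic shadow code and the names below are assumptions): before
 * brute-force scanning every l1 shadow for writeable mappings of gmfn,
 * the caller can try a guessed linear address cheaply through the
 * .guess_wrmap hook wired up at the bottom of this file:
 *
 *     if ( v->arch.shadow.mode->guess_wrmap(v, guessed_vaddr, gmfn) )
 *         ... // writeable PTE found through the linear map and shot cheaply
 *     else
 *         ... // fall back to sh_remove_write_access() on each l1 shadow
 *             // until the writeable refcount drops to zero.
 */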
3571 int sh_remove_write_access(struct vcpu *v, mfn_t sl1mfn, mfn_t readonly_mfn)
3572 /* Excises all writeable mappings to readonly_mfn from this l1 shadow table */
3574 shadow_l1e_t *sl1e;
3575 int done = 0;
3576 int flags;
3577 mfn_t base_sl1mfn = sl1mfn; /* Because sl1mfn changes in the foreach */
3579 SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, done,
3581 flags = shadow_l1e_get_flags(*sl1e);
3582 if ( (flags & _PAGE_PRESENT)
3583 && (flags & _PAGE_RW)
3584 && (mfn_x(shadow_l1e_get_mfn(*sl1e)) == mfn_x(readonly_mfn)) )
3586 shadow_set_l1e(v, sl1e, shadow_l1e_empty(), sl1mfn);
3587 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
3588 /* Remember the last shadow that we shot a writeable mapping in */
3589 v->arch.shadow.last_writeable_pte_smfn = mfn_x(base_sl1mfn);
3590 #endif
3591 if ( (mfn_to_page(readonly_mfn)->u.inuse.type_info
3592 & PGT_count_mask) == 0 )
3593 /* This breaks us cleanly out of the FOREACH macro */
3594 done = 1;
3596 });
3597 return done;
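/* Note on the SHOPT_WRITABLE_HEURISTIC bookkeeping above: recording the
 * l1 shadow in last_writeable_pte_smfn lets the generic revocation code
 * try that particular shadow first the next time it has to hunt down a
 * writeable mapping, before falling back to a full scan of all shadows. */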
3601 int sh_remove_all_mappings(struct vcpu *v, mfn_t sl1mfn, mfn_t target_mfn)
3602 /* Excises all mappings to guest frame from this shadow l1 table */
3604 shadow_l1e_t *sl1e;
3605 int done = 0;
3606 int flags;
3608 SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, done,
3610 flags = shadow_l1e_get_flags(*sl1e);
3611 if ( (flags & _PAGE_PRESENT)
3612 && (mfn_x(shadow_l1e_get_mfn(*sl1e)) == mfn_x(target_mfn)) )
3614 shadow_set_l1e(v, sl1e, shadow_l1e_empty(), sl1mfn);
3615 if ( (mfn_to_page(target_mfn)->count_info & PGC_count_mask) == 0 )
3616 /* This breaks us cleanly out of the FOREACH macro */
3617 done = 1;
3619 });
3620 return done;
3623 /**************************************************************************/
3624 /* Functions to excise all pointers to shadows from higher-level shadows. */
3626 void sh_clear_shadow_entry(struct vcpu *v, void *ep, mfn_t smfn)
3627 /* Blank out a single shadow entry */
3629 switch (mfn_to_page(smfn)->count_info & PGC_SH_type_mask)
3631 case PGC_SH_l1_shadow:
3632 shadow_set_l1e(v, ep, shadow_l1e_empty(), smfn); break;
3633 case PGC_SH_l2_shadow:
3634 #if GUEST_PAGING_LEVELS == 3
3635 case PGC_SH_l2h_shadow:
3636 #endif
3637 shadow_set_l2e(v, ep, shadow_l2e_empty(), smfn); break;
3638 #if GUEST_PAGING_LEVELS >= 4
3639 case PGC_SH_l3_shadow:
3640 shadow_set_l3e(v, ep, shadow_l3e_empty(), smfn); break;
3641 case PGC_SH_l4_shadow:
3642 shadow_set_l4e(v, ep, shadow_l4e_empty(), smfn); break;
3643 #endif
3644 default: BUG(); /* Called with the wrong kind of shadow. */
3648 int sh_remove_l1_shadow(struct vcpu *v, mfn_t sl2mfn, mfn_t sl1mfn)
3649 /* Remove all mappings of this l1 shadow from this l2 shadow */
3651 shadow_l2e_t *sl2e;
3652 int done = 0;
3653 int flags;
3654 #if GUEST_PAGING_LEVELS != 4
3655 int xen_mappings = !shadow_mode_external(v->domain);
3656 #endif
3658 SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, done, xen_mappings,
3660 flags = shadow_l2e_get_flags(*sl2e);
3661 if ( (flags & _PAGE_PRESENT)
3662 && (mfn_x(shadow_l2e_get_mfn(*sl2e)) == mfn_x(sl1mfn)) )
3664 shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn);
3665 if ( (mfn_to_page(sl1mfn)->count_info & PGC_SH_type_mask) == 0 )
3666 /* This breaks us cleanly out of the FOREACH macro */
3667 done = 1;
3669 });
3670 return done;
3673 #if GUEST_PAGING_LEVELS >= 4
3674 int sh_remove_l2_shadow(struct vcpu *v, mfn_t sl3mfn, mfn_t sl2mfn)
3675 /* Remove all mappings of this l2 shadow from this l3 shadow */
3677 shadow_l3e_t *sl3e;
3678 int done = 0;
3679 int flags;
3681 SHADOW_FOREACH_L3E(sl3mfn, sl3e, 0, done,
3683 flags = shadow_l3e_get_flags(*sl3e);
3684 if ( (flags & _PAGE_PRESENT)
3685 && (mfn_x(shadow_l3e_get_mfn(*sl3e)) == mfn_x(sl2mfn)) )
3687 shadow_set_l3e(v, sl3e, shadow_l3e_empty(), sl3mfn);
3688 if ( (mfn_to_page(sl2mfn)->count_info & PGC_SH_type_mask) == 0 )
3689 /* This breaks us cleanly out of the FOREACH macro */
3690 done = 1;
3692 });
3693 return done;
3696 int sh_remove_l3_shadow(struct vcpu *v, mfn_t sl4mfn, mfn_t sl3mfn)
3697 /* Remove all mappings of this l3 shadow from this l4 shadow */
3699 shadow_l4e_t *sl4e;
3700 int done = 0;
3701 int flags, xen_mappings = !shadow_mode_external(v->domain);
3703 SHADOW_FOREACH_L4E(sl4mfn, sl4e, 0, done, xen_mappings,
3705 flags = shadow_l4e_get_flags(*sl4e);
3706 if ( (flags & _PAGE_PRESENT)
3707 && (mfn_x(shadow_l4e_get_mfn(*sl4e)) == mfn_x(sl3mfn)) )
3709 shadow_set_l4e(v, sl4e, shadow_l4e_empty(), sl4mfn);
3710 if ( (mfn_to_page(sl3mfn)->count_info & PGC_SH_type_mask) == 0 )
3711 /* This breaks us cleanly out of the FOREACH macro */
3712 done = 1;
3714 });
3715 return done;
3717 #endif /* 64bit guest */
3719 /**************************************************************************/
3720 /* Handling HVM guest writes to pagetables */
3722 /* Check that the user is allowed to perform this write.
3723 * Returns a mapped pointer to write to, and the mfn it's on,
3724 * or NULL for error. */
3725 static inline void * emulate_map_dest(struct vcpu *v,
3726 unsigned long vaddr,
3727 struct x86_emulate_ctxt *ctxt,
3728 mfn_t *mfnp)
3730 walk_t gw;
3731 u32 flags;
3732 gfn_t gfn;
3733 mfn_t mfn;
3735 guest_walk_tables(v, vaddr, &gw, 1);
3736 flags = accumulate_guest_flags(v, &gw);
3737 gfn = guest_l1e_get_gfn(gw.eff_l1e);
3738 mfn = vcpu_gfn_to_mfn(v, gfn);
3739 sh_audit_gw(v, &gw);
3740 unmap_walk(v, &gw);
3742 if ( !(flags & _PAGE_PRESENT)
3743 || !(flags & _PAGE_RW)
3744 || (!(flags & _PAGE_USER) && ring_3(ctxt->regs)) )
3746 /* This write would have faulted even on bare metal */
3747 v->arch.shadow.propagate_fault = 1;
3748 return NULL;
3751 if ( !valid_mfn(mfn) )
3753 /* Attempted a write to a bad gfn. This should never happen:
3754 * after all, we're here because this write is to a page table. */
3755 BUG();
3758 ASSERT(sh_mfn_is_a_page_table(mfn));
3759 *mfnp = mfn;
3760 return sh_map_domain_page(mfn) + (vaddr & ~PAGE_MASK);
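/* Worked example of the value returned above (assuming 4kB pages, so
 * vaddr & ~PAGE_MASK is the offset within the page): for a write to
 * vaddr 0xb8001a30 the caller gets the domain-page mapping of mfn plus
 * offset 0xa30, i.e. a pointer it can write through directly and must
 * later release with sh_unmap_domain_page(). */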
3763 int
3764 sh_x86_emulate_write(struct vcpu *v, unsigned long vaddr, void *src,
3765 u32 bytes, struct x86_emulate_ctxt *ctxt)
3767 ASSERT(shadow_lock_is_acquired(v->domain));
3768 while ( bytes > 0 )
3770 mfn_t mfn;
3771 int bytes_on_page;
3772 void *addr;
3774 bytes_on_page = PAGE_SIZE - (vaddr & ~PAGE_MASK);
3775 if ( bytes_on_page > bytes )
3776 bytes_on_page = bytes;
3778 if ( (addr = emulate_map_dest(v, vaddr, ctxt, &mfn)) == NULL )
3779 return X86EMUL_PROPAGATE_FAULT;
3780 memcpy(addr, src, bytes_on_page);
3781 shadow_validate_guest_pt_write(v, mfn, addr, bytes_on_page);
3782 bytes -= bytes_on_page;
3783 /* If we are writing zeros to this page, might want to unshadow */
3784 if ( likely(bytes_on_page >= 4) && (*(u32 *)addr == 0) )
3785 check_for_early_unshadow(v, mfn);
3786 sh_unmap_domain_page(addr);
3788 shadow_audit_tables(v);
3789 return X86EMUL_CONTINUE;
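/* Worked example for the loop above (4kB pages assumed): an 8-byte
 * emulated write to a vaddr with page offset 0xa30 computes
 * bytes_on_page = 0x1000 - 0xa30 = 0x5d0, which is then clamped to 8,
 * so the whole write is mapped, copied, validated against the shadows
 * and unmapped in one pass before X86EMUL_CONTINUE is returned. */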
3792 int
3793 sh_x86_emulate_cmpxchg(struct vcpu *v, unsigned long vaddr,
3794 unsigned long old, unsigned long new,
3795 unsigned int bytes, struct x86_emulate_ctxt *ctxt)
3797 mfn_t mfn;
3798 void *addr;
3799 unsigned long prev;
3800 int rv = X86EMUL_CONTINUE;
3802 ASSERT(shadow_lock_is_acquired(v->domain));
3803 ASSERT(bytes <= sizeof (unsigned long));
3805 if ( (addr = emulate_map_dest(v, vaddr, ctxt, &mfn)) == NULL )
3806 return X86EMUL_PROPAGATE_FAULT;
3808 switch (bytes)
3810 case 1: prev = cmpxchg(((u8 *)addr), old, new); break;
3811 case 2: prev = cmpxchg(((u16 *)addr), old, new); break;
3812 case 4: prev = cmpxchg(((u32 *)addr), old, new); break;
3813 case 8: prev = cmpxchg(((u64 *)addr), old, new); break;
3814 default:
3815 SHADOW_PRINTK("cmpxchg of size %i is not supported\n", bytes);
3816 prev = ~old;
3819 if ( (prev == old) )
3820 shadow_validate_guest_pt_write(v, mfn, addr, bytes);
3821 else
3822 rv = X86EMUL_CMPXCHG_FAILED;
3824 SHADOW_DEBUG(EMULATE, "va %#lx was %#lx expected %#lx"
3825 " wanted %#lx now %#lx bytes %u\n",
3826 vaddr, prev, old, new, *(unsigned long *)addr, bytes);
3828 /* If we are writing zeros to this page, might want to unshadow */
3829 if ( likely(bytes >= 4) && (*(u32 *)addr == 0) )
3830 check_for_early_unshadow(v, mfn);
3832 sh_unmap_domain_page(addr);
3833 shadow_audit_tables(v);
3834 check_for_early_unshadow(v, mfn);
3835 return rv;
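/* Two notes on the function above: the default: arm of the switch sets
 * prev = ~old, which can never equal old, so an unsupported operand size
 * is reported to the emulator as a failed compare rather than silently
 * ignored; and because validation only happens when prev == old, the
 * shadows are only revalidated when the cmpxchg actually stored the new
 * value. */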
3838 int
3839 sh_x86_emulate_cmpxchg8b(struct vcpu *v, unsigned long vaddr,
3840 unsigned long old_lo, unsigned long old_hi,
3841 unsigned long new_lo, unsigned long new_hi,
3842 struct x86_emulate_ctxt *ctxt)
3844 mfn_t mfn;
3845 void *addr;
3846 u64 old, new, prev;
3847 int rv = X86EMUL_CONTINUE;
3849 ASSERT(shadow_lock_is_acquired(v->domain));
3851 if ( (addr = emulate_map_dest(v, vaddr, ctxt, &mfn)) == NULL )
3852 return X86EMUL_PROPAGATE_FAULT;
3854 old = (((u64) old_hi) << 32) | (u64) old_lo;
3855 new = (((u64) new_hi) << 32) | (u64) new_lo;
3856 prev = cmpxchg(((u64 *)addr), old, new);
3858 if ( (prev == old) )
3859 shadow_validate_guest_pt_write(v, mfn, addr, 8);
3860 else
3861 rv = X86EMUL_CMPXCHG_FAILED;
3863 /* If we are writing zeros to this page, might want to unshadow */
3864 if ( *(u32 *)addr == 0 )
3865 check_for_early_unshadow(v, mfn);
3867 sh_unmap_domain_page(addr);
3868 shadow_audit_tables(v);
3869 check_for_early_unshadow(v, mfn);
3870 return rv;
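/* Worked example of the operand packing above: with old_lo = 0x11223344
 * and old_hi = 0xaabbccdd, old becomes 0xaabbccdd11223344; the emulated
 * CMPXCHG8B succeeds (and the write is validated against the shadows)
 * only if the 64-bit value currently at addr equals that, otherwise
 * rv is X86EMUL_CMPXCHG_FAILED and guest memory is left untouched. */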
3874 /**************************************************************************/
3875 /* Audit tools */
3877 #if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES
3879 #define AUDIT_FAIL(_level, _fmt, _a...) do { \
3880 printk("Shadow %u-on-%u audit failed at level %i, index %i\n" \
3881 "gl" #_level "mfn = %" SH_PRI_mfn \
3882 " sl" #_level "mfn = %" SH_PRI_mfn \
3883 " &gl" #_level "e = %p &sl" #_level "e = %p" \
3884 " gl" #_level "e = %" SH_PRI_gpte \
3885 " sl" #_level "e = %" SH_PRI_pte "\nError: " _fmt "\n", \
3886 GUEST_PAGING_LEVELS, SHADOW_PAGING_LEVELS, \
3887 _level, guest_index(gl ## _level ## e), \
3888 mfn_x(gl ## _level ## mfn), mfn_x(sl ## _level ## mfn), \
3889 gl ## _level ## e, sl ## _level ## e, \
3890 gl ## _level ## e->l ## _level, sl ## _level ## e->l ## _level, \
3891 ##_a); \
3892 BUG(); \
3893 done = 1; \
3894 } while (0)
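/* Example of how the token-pasting in AUDIT_FAIL resolves: at level 2,
 * AUDIT_FAIL(2, "%s", s) prints the guest/shadow level pair,
 * guest_index(gl2e), mfn_x(gl2mfn), mfn_x(sl2mfn), the gl2e/sl2e
 * pointers and the raw entries gl2e->l2 and sl2e->l2, then BUG()s;
 * done = 1 is there to break out of the enclosing SHADOW_FOREACH loop. */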
3897 static char * sh_audit_flags(struct vcpu *v, int level,
3898 int gflags, int sflags)
3899 /* Common code for auditing flag bits */
3901 if ( (sflags & _PAGE_PRESENT) && !(gflags & _PAGE_PRESENT) )
3902 return "shadow is present but guest is not present";
3903 if ( (sflags & _PAGE_GLOBAL) && !is_hvm_vcpu(v) )
3904 return "global bit set in PV shadow";
3905 if ( (level == 1 || (level == 2 && (gflags & _PAGE_PSE)))
3906 && ((sflags & _PAGE_DIRTY) && !(gflags & _PAGE_DIRTY)) )
3907 return "dirty bit not propagated";
3908 if ( level == 2 && (sflags & _PAGE_PSE) )
3909 return "PS bit set in shadow";
3910 #if SHADOW_PAGING_LEVELS == 3
3911 if ( level == 3 ) return NULL; /* All the other bits are blank in the PAE l3 */
3912 #endif
3913 if ( (sflags & _PAGE_USER) != (gflags & _PAGE_USER) )
3914 return "user/supervisor bit does not match";
3915 if ( (sflags & _PAGE_NX_BIT) != (gflags & _PAGE_NX_BIT) )
3916 return "NX bit does not match";
3917 if ( (sflags & _PAGE_RW) && !(gflags & _PAGE_RW) )
3918 return "shadow grants write access but guest does not";
3919 if ( (sflags & _PAGE_ACCESSED) && !(gflags & _PAGE_ACCESSED) )
3920 return "accessed bit not propagated";
3921 return NULL;
3924 static inline mfn_t
3925 audit_gfn_to_mfn(struct vcpu *v, gfn_t gfn, mfn_t gmfn)
3926 /* Convert this gfn to an mfn in the manner appropriate for the
3927 * guest pagetable it's used in (gmfn) */
3929 if ( !shadow_mode_translate(v->domain) )
3930 return _mfn(gfn_x(gfn));
3932 if ( (mfn_to_page(gmfn)->u.inuse.type_info & PGT_type_mask)
3933 != PGT_writable_page )
3934 return _mfn(gfn_x(gfn)); /* This is a paging-disabled shadow */
3935 else
3936 return sh_gfn_to_mfn(v->domain, gfn_x(gfn));
3940 int sh_audit_l1_table(struct vcpu *v, mfn_t sl1mfn, mfn_t x)
3942 guest_l1e_t *gl1e, *gp;
3943 shadow_l1e_t *sl1e;
3944 mfn_t mfn, gmfn, gl1mfn;
3945 gfn_t gfn;
3946 char *s;
3947 int done = 0;
3949 /* Follow the backpointer */
3950 gl1mfn = _mfn(mfn_to_page(sl1mfn)->u.inuse.type_info);
3951 gl1e = gp = sh_map_domain_page(gl1mfn);
3952 SHADOW_FOREACH_L1E(sl1mfn, sl1e, &gl1e, done, {
3954 if ( sh_l1e_is_magic(*sl1e) )
3956 #if (SHADOW_OPTIMIZATIONS & SHOPT_FAST_FAULT_PATH) && SHADOW_PAGING_LEVELS > 2
3957 if ( sh_l1e_is_gnp(*sl1e) )
3959 if ( guest_l1e_get_flags(*gl1e) & _PAGE_PRESENT )
3960 AUDIT_FAIL(1, "shadow is GNP magic but guest is present");
3962 else
3964 ASSERT(sh_l1e_is_mmio(*sl1e));
3965 gfn = sh_l1e_mmio_get_gfn(*sl1e);
3966 if ( gfn_x(gfn) != gfn_x(guest_l1e_get_gfn(*gl1e)) )
3967 AUDIT_FAIL(1, "shadow MMIO gfn is %" SH_PRI_gfn
3968 " but guest gfn is %" SH_PRI_gfn,
3969 gfn_x(gfn),
3970 gfn_x(guest_l1e_get_gfn(*gl1e)));
3972 #endif
3974 else
3976 s = sh_audit_flags(v, 1, guest_l1e_get_flags(*gl1e),
3977 shadow_l1e_get_flags(*sl1e));
3978 if ( s ) AUDIT_FAIL(1, "%s", s);
3980 if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS )
3982 gfn = guest_l1e_get_gfn(*gl1e);
3983 mfn = shadow_l1e_get_mfn(*sl1e);
3984 gmfn = audit_gfn_to_mfn(v, gfn, gl1mfn);
3985 if ( mfn_x(gmfn) != mfn_x(mfn) )
3986 AUDIT_FAIL(1, "bad translation: gfn %" SH_PRI_gfn
3987 " --> %" SH_PRI_mfn " != mfn %" SH_PRI_mfn,
3988 gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn));
3991 });
3992 sh_unmap_domain_page(gp);
3993 return done;
3996 int sh_audit_fl1_table(struct vcpu *v, mfn_t sl1mfn, mfn_t x)
3998 guest_l1e_t *gl1e, e;
3999 shadow_l1e_t *sl1e;
4000 mfn_t gl1mfn = _mfn(INVALID_MFN);
4001 int f;
4002 int done = 0;
4004 /* fl1 has no useful backpointer: all we can check are flags */
4005 e = guest_l1e_from_gfn(_gfn(0), 0); gl1e = &e; /* Needed for macro */
4006 SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, done, {
4007 f = shadow_l1e_get_flags(*sl1e);
4008 f &= ~(_PAGE_AVAIL0|_PAGE_AVAIL1|_PAGE_AVAIL2);
4009 if ( !(f == 0
4010 || f == (_PAGE_PRESENT|_PAGE_USER|_PAGE_RW|
4011 _PAGE_ACCESSED|_PAGE_DIRTY)
4012 || f == (_PAGE_PRESENT|_PAGE_USER|_PAGE_ACCESSED|_PAGE_DIRTY)
4013 || sh_l1e_is_magic(*sl1e)) )
4014 AUDIT_FAIL(1, "fl1e has bad flags");
4015 });
4016 return 0;
4019 int sh_audit_l2_table(struct vcpu *v, mfn_t sl2mfn, mfn_t x)
4021 guest_l2e_t *gl2e, *gp;
4022 shadow_l2e_t *sl2e;
4023 mfn_t mfn, gmfn, gl2mfn;
4024 gfn_t gfn;
4025 char *s;
4026 int done = 0;
4027 #if GUEST_PAGING_LEVELS != 4
4028 int xen_mappings = !shadow_mode_external(v->domain);
4029 #endif
4031 /* Follow the backpointer */
4032 gl2mfn = _mfn(mfn_to_page(sl2mfn)->u.inuse.type_info);
4033 gl2e = gp = sh_map_domain_page(gl2mfn);
4034 SHADOW_FOREACH_L2E(sl2mfn, sl2e, &gl2e, done, xen_mappings, {
4036 s = sh_audit_flags(v, 2, guest_l2e_get_flags(*gl2e),
4037 shadow_l2e_get_flags(*sl2e));
4038 if ( s ) AUDIT_FAIL(2, "%s", s);
4040 if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS )
4042 gfn = guest_l2e_get_gfn(*gl2e);
4043 mfn = shadow_l2e_get_mfn(*sl2e);
4044 gmfn = (guest_l2e_get_flags(*gl2e) & _PAGE_PSE)
4045 ? get_fl1_shadow_status(v, gfn)
4046 : get_shadow_status(v, audit_gfn_to_mfn(v, gfn, gl2mfn),
4047 PGC_SH_l1_shadow);
4048 if ( mfn_x(gmfn) != mfn_x(mfn) )
4049 AUDIT_FAIL(2, "bad translation: gfn %" SH_PRI_gfn
4050 " (--> %" SH_PRI_mfn ")"
4051 " --> %" SH_PRI_mfn " != mfn %" SH_PRI_mfn,
4052 gfn_x(gfn),
4053 (guest_l2e_get_flags(*gl2e) & _PAGE_PSE) ? 0
4054 : mfn_x(audit_gfn_to_mfn(v, gfn, gl2mfn)),
4055 mfn_x(gmfn), mfn_x(mfn));
4057 });
4058 sh_unmap_domain_page(gp);
4059 return 0;
4062 #if GUEST_PAGING_LEVELS >= 4
4063 int sh_audit_l3_table(struct vcpu *v, mfn_t sl3mfn, mfn_t x)
4065 guest_l3e_t *gl3e, *gp;
4066 shadow_l3e_t *sl3e;
4067 mfn_t mfn, gmfn, gl3mfn;
4068 gfn_t gfn;
4069 char *s;
4070 int done = 0;
4072 /* Follow the backpointer */
4073 gl3mfn = _mfn(mfn_to_page(sl3mfn)->u.inuse.type_info);
4074 gl3e = gp = sh_map_domain_page(gl3mfn);
4075 SHADOW_FOREACH_L3E(sl3mfn, sl3e, &gl3e, done, {
4077 s = sh_audit_flags(v, 3, guest_l3e_get_flags(*gl3e),
4078 shadow_l3e_get_flags(*sl3e));
4079 if ( s ) AUDIT_FAIL(3, "%s", s);
4081 if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS )
4083 gfn = guest_l3e_get_gfn(*gl3e);
4084 mfn = shadow_l3e_get_mfn(*sl3e);
4085 gmfn = get_shadow_status(v, audit_gfn_to_mfn(v, gfn, gl3mfn),
4086 (GUEST_PAGING_LEVELS == 3
4087 && !shadow_mode_external(v->domain)
4088 && (guest_index(gl3e) % 4) == 3)
4089 ? PGC_SH_l2h_pae_shadow
4090 : PGC_SH_l2_shadow);
4091 if ( mfn_x(gmfn) != mfn_x(mfn) )
4092 AUDIT_FAIL(3, "bad translation: gfn %" SH_PRI_gfn
4093 " --> %" SH_PRI_mfn " != mfn %" SH_PRI_mfn,
4094 gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn));
4096 });
4097 sh_unmap_domain_page(gp);
4098 return 0;
4101 int sh_audit_l4_table(struct vcpu *v, mfn_t sl4mfn, mfn_t x)
4103 guest_l4e_t *gl4e, *gp;
4104 shadow_l4e_t *sl4e;
4105 mfn_t mfn, gmfn, gl4mfn;
4106 gfn_t gfn;
4107 char *s;
4108 int done = 0;
4109 int xen_mappings = !shadow_mode_external(v->domain);
4111 /* Follow the backpointer */
4112 gl4mfn = _mfn(mfn_to_page(sl4mfn)->u.inuse.type_info);
4113 gl4e = gp = sh_map_domain_page(gl4mfn);
4114 SHADOW_FOREACH_L4E(sl4mfn, sl4e, &gl4e, done, xen_mappings,
4116 s = sh_audit_flags(v, 4, guest_l4e_get_flags(*gl4e),
4117 shadow_l4e_get_flags(*sl4e));
4118 if ( s ) AUDIT_FAIL(4, "%s", s);
4120 if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS )
4122 gfn = guest_l4e_get_gfn(*gl4e);
4123 mfn = shadow_l4e_get_mfn(*sl4e);
4124 gmfn = get_shadow_status(v, audit_gfn_to_mfn(v, gfn, gl4mfn),
4125 PGC_SH_l3_shadow);
4126 if ( mfn_x(gmfn) != mfn_x(mfn) )
4127 AUDIT_FAIL(4, "bad translation: gfn %" SH_PRI_gfn
4128 " --> %" SH_PRI_mfn " != mfn %" SH_PRI_mfn,
4129 gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn));
4131 });
4132 sh_unmap_domain_page(gp);
4133 return 0;
4135 #endif /* GUEST_PAGING_LEVELS >= 4 */
4138 #undef AUDIT_FAIL
4140 #endif /* Audit code */
4142 /**************************************************************************/
4143 /* Entry points into this mode of the shadow code.
4144 * This will all be mangled by the preprocessor to uniquify everything. */
4145 struct shadow_paging_mode sh_paging_mode = {
4146 .page_fault = sh_page_fault,
4147 .invlpg = sh_invlpg,
4148 .gva_to_gpa = sh_gva_to_gpa,
4149 .gva_to_gfn = sh_gva_to_gfn,
4150 .update_cr3 = sh_update_cr3,
4151 .map_and_validate_gl1e = sh_map_and_validate_gl1e,
4152 .map_and_validate_gl2e = sh_map_and_validate_gl2e,
4153 .map_and_validate_gl2he = sh_map_and_validate_gl2he,
4154 .map_and_validate_gl3e = sh_map_and_validate_gl3e,
4155 .map_and_validate_gl4e = sh_map_and_validate_gl4e,
4156 .detach_old_tables = sh_detach_old_tables,
4157 .x86_emulate_write = sh_x86_emulate_write,
4158 .x86_emulate_cmpxchg = sh_x86_emulate_cmpxchg,
4159 .x86_emulate_cmpxchg8b = sh_x86_emulate_cmpxchg8b,
4160 .make_monitor_table = sh_make_monitor_table,
4161 .destroy_monitor_table = sh_destroy_monitor_table,
4162 .guest_map_l1e = sh_guest_map_l1e,
4163 .guest_get_eff_l1e = sh_guest_get_eff_l1e,
4164 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
4165 .guess_wrmap = sh_guess_wrmap,
4166 #endif
4167 .guest_levels = GUEST_PAGING_LEVELS,
4168 .shadow_levels = SHADOW_PAGING_LEVELS,
4169 };
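/* A concrete picture of the "mangling" mentioned above: this file is
 * compiled once per supported (guest levels, shadow levels) pair, and
 * types.h renames the externally visible symbols per mode, so this
 * structure and the sh_* entry points end up as per-mode symbols
 * (something like sh_paging_mode__guest_3_shadow_3; the exact spelling
 * belongs to the SHADOW_INTERNAL_NAME machinery).  The rest of the
 * hypervisor only ever calls through v->arch.shadow.mode->page_fault()
 * and friends, so the right instance is selected at runtime when the
 * generic shadow code picks a paging mode for the vcpu. */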
4171 /*
4172 * Local variables:
4173 * mode: C
4174 * c-set-style: "BSD"
4175 * c-basic-offset: 4
4176 * indent-tabs-mode: nil
4177 * End:
4178 */