ia64/xen-unstable

view xen/arch/x86/mm/shadow/common.c @ 12564:2fd223c64fc6

[XEN] Pin l3 shadows of older x86_64 linux guests.
Older x86_64 linux kernels use one l4 table per cpu and context switch by
changing an l4 entry pointing to an l3 table. If we're shadowing them
we need to pin l3 shadows to stop them being torn down on every
context switch. (But don't do this for normal 64bit guests).
Signed-off-by: Tim Deegan <Tim.Deegan@xensource.com>
author Tim Deegan <Tim.Deegan@xensource.com>
date Thu Nov 23 17:46:52 2006 +0000 (2006-11-23)
parents 47a8bb3cd123
children b4baf35cff11
1 /******************************************************************************
2 * arch/x86/mm/shadow/common.c
3 *
4 * Shadow code that does not need to be multiply compiled.
5 * Parts of this code are Copyright (c) 2006 by XenSource Inc.
6 * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
7 * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 */
24 #include <xen/config.h>
25 #include <xen/types.h>
26 #include <xen/mm.h>
27 #include <xen/trace.h>
28 #include <xen/sched.h>
29 #include <xen/perfc.h>
30 #include <xen/irq.h>
31 #include <xen/domain_page.h>
32 #include <xen/guest_access.h>
33 #include <xen/keyhandler.h>
34 #include <asm/event.h>
35 #include <asm/page.h>
36 #include <asm/current.h>
37 #include <asm/flushtlb.h>
38 #include <asm/shadow.h>
39 #include "private.h"
41 #if SHADOW_AUDIT
42 int shadow_audit_enable = 0;
44 static void shadow_audit_key(unsigned char key)
45 {
46 shadow_audit_enable = !shadow_audit_enable;
47 printk("%s shadow_audit_enable=%d\n",
48 __func__, shadow_audit_enable);
49 }
51 static int __init shadow_audit_key_init(void)
52 {
53 register_keyhandler(
54 'O', shadow_audit_key, "toggle shadow audits");
55 return 0;
56 }
57 __initcall(shadow_audit_key_init);
58 #endif /* SHADOW_AUDIT */
60 static void sh_free_log_dirty_bitmap(struct domain *d);
62 int _shadow_mode_refcounts(struct domain *d)
63 {
64 return shadow_mode_refcounts(d);
65 }
68 /**************************************************************************/
69 /* x86 emulator support for the shadow code
70 */
72 static int
73 sh_x86_emulate_read_std(unsigned long addr,
74 unsigned long *val,
75 unsigned int bytes,
76 struct x86_emulate_ctxt *ctxt)
77 {
78 *val = 0;
79 // XXX -- this is WRONG.
80 // It entirely ignores the permissions in the page tables.
81 // In this case, that is only a user vs supervisor access check.
82 //
83 if ( hvm_copy_from_guest_virt(val, addr, bytes) == 0 )
84 {
85 #if 0
86 struct vcpu *v = current;
87 SHADOW_PRINTK("d=%u v=%u a=%#lx v=%#lx bytes=%u\n",
88 v->domain->domain_id, v->vcpu_id,
89 addr, *val, bytes);
90 #endif
91 return X86EMUL_CONTINUE;
92 }
94 /* If we got here, there was nothing mapped here, or a bad GFN
95 * was mapped here. This should never happen: we're here because
96 * of a write fault at the end of the instruction we're emulating. */
97 SHADOW_PRINTK("read failed to va %#lx\n", addr);
98 return X86EMUL_PROPAGATE_FAULT;
99 }
101 static int
102 sh_x86_emulate_write_std(unsigned long addr,
103 unsigned long val,
104 unsigned int bytes,
105 struct x86_emulate_ctxt *ctxt)
106 {
107 #if 0
108 struct vcpu *v = current;
109 SHADOW_PRINTK("d=%u v=%u a=%#lx v=%#lx bytes=%u\n",
110 v->domain->domain_id, v->vcpu_id, addr, val, bytes);
111 #endif
113 // XXX -- this is WRONG.
114 // It entirely ignores the permissions in the page tables.
115 // In this case, that includes user vs supervisor, and
116 // write access.
117 //
118 if ( hvm_copy_to_guest_virt(addr, &val, bytes) == 0 )
119 return X86EMUL_CONTINUE;
121 /* If we got here, there was nothing mapped here, or a bad GFN
122 * was mapped here. This should never happen: we're here because
123 * of a write fault at the end of the instruction we're emulating,
124 * which should be handled by sh_x86_emulate_write_emulated. */
125 SHADOW_PRINTK("write failed to va %#lx\n", addr);
126 return X86EMUL_PROPAGATE_FAULT;
127 }
129 static int
130 sh_x86_emulate_write_emulated(unsigned long addr,
131 unsigned long val,
132 unsigned int bytes,
133 struct x86_emulate_ctxt *ctxt)
134 {
135 struct vcpu *v = current;
136 #if 0
137 SHADOW_PRINTK("d=%u v=%u a=%#lx v=%#lx bytes=%u\n",
138 v->domain->domain_id, v->vcpu_id, addr, val, bytes);
139 #endif
140 return v->arch.shadow.mode->x86_emulate_write(v, addr, &val, bytes, ctxt);
141 }
143 static int
144 sh_x86_emulate_cmpxchg_emulated(unsigned long addr,
145 unsigned long old,
146 unsigned long new,
147 unsigned int bytes,
148 struct x86_emulate_ctxt *ctxt)
149 {
150 struct vcpu *v = current;
151 #if 0
152 SHADOW_PRINTK("d=%u v=%u a=%#lx o?=%#lx n:=%#lx bytes=%u\n",
153 v->domain->domain_id, v->vcpu_id, addr, old, new, bytes);
154 #endif
155 return v->arch.shadow.mode->x86_emulate_cmpxchg(v, addr, old, new,
156 bytes, ctxt);
157 }
159 static int
160 sh_x86_emulate_cmpxchg8b_emulated(unsigned long addr,
161 unsigned long old_lo,
162 unsigned long old_hi,
163 unsigned long new_lo,
164 unsigned long new_hi,
165 struct x86_emulate_ctxt *ctxt)
166 {
167 struct vcpu *v = current;
168 #if 0
169 SHADOW_PRINTK("d=%u v=%u a=%#lx o?=%#lx:%lx n:=%#lx:%lx\n",
170 v->domain->domain_id, v->vcpu_id, addr, old_hi, old_lo,
171 new_hi, new_lo, ctxt);
172 #endif
173 return v->arch.shadow.mode->x86_emulate_cmpxchg8b(v, addr, old_lo, old_hi,
174 new_lo, new_hi, ctxt);
175 }
178 struct x86_emulate_ops shadow_emulator_ops = {
179 .read_std = sh_x86_emulate_read_std,
180 .write_std = sh_x86_emulate_write_std,
181 .read_emulated = sh_x86_emulate_read_std,
182 .write_emulated = sh_x86_emulate_write_emulated,
183 .cmpxchg_emulated = sh_x86_emulate_cmpxchg_emulated,
184 .cmpxchg8b_emulated = sh_x86_emulate_cmpxchg8b_emulated,
185 };
187 /**************************************************************************/
188 /* Code for "promoting" a guest page to the point where the shadow code is
189 * willing to let it be treated as a guest page table. This generally
190 * involves making sure there are no writable mappings available to the guest
191 * for this page.
192 */
193 void shadow_promote(struct vcpu *v, mfn_t gmfn, unsigned int type)
194 {
195 struct page_info *page = mfn_to_page(gmfn);
197 ASSERT(valid_mfn(gmfn));
199 /* We should never try to promote a gmfn that has writeable mappings */
200 ASSERT(shadow_remove_write_access(v, gmfn, 0, 0) == 0);
202 /* Is the page already shadowed? */
203 if ( !test_and_set_bit(_PGC_page_table, &page->count_info) )
204 page->shadow_flags = 0;
206 ASSERT(!test_bit(type, &page->shadow_flags));
207 set_bit(type, &page->shadow_flags);
208 }
210 void shadow_demote(struct vcpu *v, mfn_t gmfn, u32 type)
211 {
212 struct page_info *page = mfn_to_page(gmfn);
214 ASSERT(test_bit(_PGC_page_table, &page->count_info));
215 ASSERT(test_bit(type, &page->shadow_flags));
217 clear_bit(type, &page->shadow_flags);
219 if ( (page->shadow_flags & SHF_page_type_mask) == 0 )
220 {
221 /* tlbflush timestamp field is valid again */
222 page->tlbflush_timestamp = tlbflush_current_time();
223 clear_bit(_PGC_page_table, &page->count_info);
224 }
225 }
227 /**************************************************************************/
228 /* Validate a pagetable change from the guest and update the shadows.
229 * Returns a bitmask of SHADOW_SET_* flags. */
231 int
232 __shadow_validate_guest_entry(struct vcpu *v, mfn_t gmfn,
233 void *entry, u32 size)
234 {
235 int result = 0;
236 struct page_info *page = mfn_to_page(gmfn);
238 sh_mark_dirty(v->domain, gmfn);
240 // Determine which types of shadows are affected, and update each.
241 //
242 // Always validate L1s before L2s to prevent another cpu with a linear
243 // mapping of this gmfn from seeing a walk that results from
244 // using the new L2 value and the old L1 value. (It is OK for such a
245 // guest to see a walk that uses the old L2 value with the new L1 value,
246 // as hardware could behave this way if one level of the pagewalk occurs
247 // before the store, and the next level of the pagewalk occurs after the
248 // store.
249 //
250 // Ditto for L2s before L3s, etc.
251 //
253 if ( !(page->count_info & PGC_page_table) )
254 return 0; /* Not shadowed at all */
256 #if CONFIG_PAGING_LEVELS == 2
257 if ( page->shadow_flags & SHF_L1_32 )
258 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 2, 2)
259 (v, gmfn, entry, size);
260 #else
261 if ( page->shadow_flags & SHF_L1_32 )
262 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 3, 2)
263 (v, gmfn, entry, size);
264 #endif
266 #if CONFIG_PAGING_LEVELS == 2
267 if ( page->shadow_flags & SHF_L2_32 )
268 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 2, 2)
269 (v, gmfn, entry, size);
270 #else
271 if ( page->shadow_flags & SHF_L2_32 )
272 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 3, 2)
273 (v, gmfn, entry, size);
274 #endif
276 #if CONFIG_PAGING_LEVELS >= 3
277 if ( page->shadow_flags & SHF_L1_PAE )
278 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 3, 3)
279 (v, gmfn, entry, size);
280 if ( page->shadow_flags & SHF_L2_PAE )
281 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 3, 3)
282 (v, gmfn, entry, size);
283 if ( page->shadow_flags & SHF_L2H_PAE )
284 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2he, 3, 3)
285 (v, gmfn, entry, size);
286 #else /* 32-bit non-PAE hypervisor does not support PAE guests */
287 ASSERT((page->shadow_flags & (SHF_L2H_PAE|SHF_L2_PAE|SHF_L1_PAE)) == 0);
288 #endif
290 #if CONFIG_PAGING_LEVELS >= 4
291 if ( page->shadow_flags & SHF_L1_64 )
292 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 4, 4)
293 (v, gmfn, entry, size);
294 if ( page->shadow_flags & SHF_L2_64 )
295 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 4, 4)
296 (v, gmfn, entry, size);
297 if ( page->shadow_flags & SHF_L3_64 )
298 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl3e, 4, 4)
299 (v, gmfn, entry, size);
300 if ( page->shadow_flags & SHF_L4_64 )
301 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl4e, 4, 4)
302 (v, gmfn, entry, size);
303 #else /* 32-bit/PAE hypervisor does not support 64-bit guests */
304 ASSERT((page->shadow_flags
305 & (SHF_L4_64|SHF_L3_64|SHF_L2_64|SHF_L1_64)) == 0);
306 #endif
308 return result;
309 }
312 int
313 shadow_validate_guest_entry(struct vcpu *v, mfn_t gmfn, void *entry)
314 /* This is the entry point from hypercalls. It returns a bitmask of all the
315 * results of shadow_set_l*e() calls, so the caller knows to do TLB flushes. */
316 {
317 int rc;
319 ASSERT(shadow_lock_is_acquired(v->domain));
320 rc = __shadow_validate_guest_entry(v, gmfn, entry, sizeof(l1_pgentry_t));
321 shadow_audit_tables(v);
322 return rc;
323 }
325 void
326 shadow_validate_guest_pt_write(struct vcpu *v, mfn_t gmfn,
327 void *entry, u32 size)
328 /* This is the entry point for emulated writes to pagetables in HVM guests and
329 * PV translated guests.
330 */
331 {
332 struct domain *d = v->domain;
333 int rc;
335 ASSERT(shadow_lock_is_acquired(v->domain));
336 rc = __shadow_validate_guest_entry(v, gmfn, entry, size);
337 if ( rc & SHADOW_SET_FLUSH )
338 /* Need to flush TLBs to pick up shadow PT changes */
339 flush_tlb_mask(d->domain_dirty_cpumask);
340 if ( rc & SHADOW_SET_ERROR )
341 {
342 /* This page is probably not a pagetable any more: tear it out of the
343 * shadows, along with any tables that reference it.
344 * Since the validate call above will have made a "safe" (i.e. zero)
345 * shadow entry, we can let the domain live even if we can't fully
346 * unshadow the page. */
347 sh_remove_shadows(v, gmfn, 0, 0);
348 }
349 }
352 /**************************************************************************/
353 /* Memory management for shadow pages. */
355 /* Allocating shadow pages
356 * -----------------------
357 *
358 * Most shadow pages are allocated singly, but there is one case where
359 * we need to allocate multiple pages together: shadowing 32-bit guest
360 * tables on PAE or 64-bit shadows. A 32-bit guest l1 table covers 4MB
361 * of virtual address space, and needs to be shadowed by two PAE/64-bit
362 * l1 tables (covering 2MB of virtual address space each). Similarly, a
363 * 32-bit guest l2 table (4GB va) needs to be shadowed by four
364 * PAE/64-bit l2 tables (1GB va each). These multi-page shadows are
365 * contiguous and aligned; functions for handling offsets into them are
366 * defined in shadow.c (shadow_l1_index() etc.)
367 *
368 * This table shows the allocation behaviour of the different modes:
369 *
370 * Xen paging 32b pae pae 64b 64b 64b
371 * Guest paging 32b 32b pae 32b pae 64b
372 * PV or HVM * HVM * HVM HVM *
373 * Shadow paging 32b pae pae pae pae 64b
374 *
375 * sl1 size 4k 8k 4k 8k 4k 4k
376 * sl2 size 4k 16k 4k 16k 4k 4k
377 * sl3 size - - - - - 4k
378 * sl4 size - - - - - 4k
379 *
380 * We allocate memory from xen in four-page units and break them down
381 * with a simple buddy allocator. Can't use the xen allocator to handle
382 * this as it only works for contiguous zones, and a domain's shadow
383 * pool is made of fragments.
384 *
385 * In HVM guests, the p2m table is built out of shadow pages, and we provide
386 * a function for the p2m management to steal pages, in max-order chunks, from
387 * the free pool. We don't provide for giving them back, yet.
388 */
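/* A minimal sketch (illustrative only; the real helpers are the
 * shadow_l?_index() functions mentioned above): locating the shadow slot
 * for entry gi of a 32-bit guest l2 when it is shadowed in PAE format.
 * The guest l2 has 1024 entries covering 4MB each; its shadow is four
 * contiguous pages of 512 entries covering 2MB each, so each guest entry
 * is backed by two consecutive shadow entries. */
static inline unsigned int example_sl2_page(unsigned int gi)
{
    return (gi * 2) / 512; /* which of the four contiguous shadow pages */
}
static inline unsigned int example_sl2_slot(unsigned int gi)
{
    return (gi * 2) % 512; /* first of the two slots backing this entry */
}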
390 /* Figure out the least acceptable quantity of shadow memory.
391 * The minimum memory requirement for always being able to free up a
392 * chunk of memory is very small -- only three max-order chunks per
393 * vcpu to hold the top level shadows and pages with Xen mappings in them.
394 *
395 * But for a guest to be guaranteed to successfully execute a single
396 * instruction, we must be able to map a large number (about thirty) of VAs
397 * at the same time, which means that to guarantee progress, we must
398 * allow for more than ninety allocated pages per vcpu. We round that
399 * up to 128 pages, or half a megabyte per vcpu. */
400 unsigned int shadow_min_acceptable_pages(struct domain *d)
401 {
402 u32 vcpu_count = 0;
403 struct vcpu *v;
405 for_each_vcpu(d, v)
406 vcpu_count++;
408 return (vcpu_count * 128);
409 }
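/* Worked example of the bound above: a domain with 4 vcpus gets
 * 4 * 128 == 512 pages as its minimum, i.e. 512 * 4KB == 2MB in total,
 * which is the "half a megabyte per vcpu" quoted in the comment. */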
411 /* Figure out the order of allocation needed for a given shadow type */
412 static inline u32
413 shadow_order(unsigned int shadow_type)
414 {
415 #if CONFIG_PAGING_LEVELS > 2
416 static const u32 type_to_order[16] = {
417 0, /* SH_type_none */
418 1, /* SH_type_l1_32_shadow */
419 1, /* SH_type_fl1_32_shadow */
420 2, /* SH_type_l2_32_shadow */
421 0, /* SH_type_l1_pae_shadow */
422 0, /* SH_type_fl1_pae_shadow */
423 0, /* SH_type_l2_pae_shadow */
424 0, /* SH_type_l2h_pae_shadow */
425 0, /* SH_type_l1_64_shadow */
426 0, /* SH_type_fl1_64_shadow */
427 0, /* SH_type_l2_64_shadow */
428 0, /* SH_type_l3_64_shadow */
429 0, /* SH_type_l4_64_shadow */
430 2, /* SH_type_p2m_table */
431 0 /* SH_type_monitor_table */
432 };
433 ASSERT(shadow_type < 16);
434 return type_to_order[shadow_type];
435 #else /* 32-bit Xen only ever shadows 32-bit guests on 32-bit shadows. */
436 return 0;
437 #endif
438 }
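/* Example: shadow_order(SH_type_l2_32_shadow) == 2, i.e. 1<<2 == 4
 * contiguous pages (the 16k "sl2 size" in the table above), while all
 * PAE and 64-bit shadow types are single pages of order 0. */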
441 /* Do we have a free chunk of at least this order? */
442 static inline int chunk_is_available(struct domain *d, int order)
443 {
444 int i;
446 for ( i = order; i <= SHADOW_MAX_ORDER; i++ )
447 if ( !list_empty(&d->arch.shadow.freelists[i]) )
448 return 1;
449 return 0;
450 }
452 /* Dispatcher function: call the per-mode function that will unhook the
453 * non-Xen mappings in this top-level shadow mfn */
454 void shadow_unhook_mappings(struct vcpu *v, mfn_t smfn)
455 {
456 struct shadow_page_info *sp = mfn_to_shadow_page(smfn);
457 switch ( sp->type )
458 {
459 case SH_type_l2_32_shadow:
460 #if CONFIG_PAGING_LEVELS == 2
461 SHADOW_INTERNAL_NAME(sh_unhook_32b_mappings,2,2)(v,smfn);
462 #else
463 SHADOW_INTERNAL_NAME(sh_unhook_32b_mappings,3,2)(v,smfn);
464 #endif
465 break;
466 #if CONFIG_PAGING_LEVELS >= 3
467 case SH_type_l2_pae_shadow:
468 case SH_type_l2h_pae_shadow:
469 SHADOW_INTERNAL_NAME(sh_unhook_pae_mappings,3,3)(v,smfn);
470 break;
471 #endif
472 #if CONFIG_PAGING_LEVELS >= 4
473 case SH_type_l4_64_shadow:
474 SHADOW_INTERNAL_NAME(sh_unhook_64b_mappings,4,4)(v,smfn);
475 break;
476 #endif
477 default:
478 SHADOW_PRINTK("top-level shadow has bad type %08x\n", sp->type);
479 BUG();
480 }
481 }
484 /* Make sure there is at least one chunk of the required order available
485 * in the shadow page pool. This must be called before any calls to
486 * shadow_alloc(). Since this will free existing shadows to make room,
487 * it must be called early enough to avoid freeing shadows that the
488 * caller is currently working on. */
489 void shadow_prealloc(struct domain *d, unsigned int order)
490 {
491 /* Need a vcpu for calling unpins; for now, since we don't have
492 * per-vcpu shadows, any will do */
493 struct vcpu *v, *v2;
494 struct list_head *l, *t;
495 struct shadow_page_info *sp;
496 cpumask_t flushmask = CPU_MASK_NONE;
497 mfn_t smfn;
498 int i;
500 if ( chunk_is_available(d, order) ) return;
502 v = current;
503 if ( v->domain != d )
504 v = d->vcpu[0];
505 ASSERT(v != NULL);
507 /* Stage one: walk the list of pinned pages, unpinning them */
508 perfc_incrc(shadow_prealloc_1);
509 list_for_each_backwards_safe(l, t, &d->arch.shadow.pinned_shadows)
510 {
511 sp = list_entry(l, struct shadow_page_info, list);
512 smfn = shadow_page_to_mfn(sp);
514 /* Unpin this top-level shadow */
515 sh_unpin(v, smfn);
517 /* See if that freed up a chunk of appropriate size */
518 if ( chunk_is_available(d, order) ) return;
519 }
521 /* Stage two: all shadow pages are in use in hierarchies that are
522 * loaded in cr3 on some vcpu. Walk them, unhooking the non-Xen
523 * mappings. */
524 perfc_incrc(shadow_prealloc_2);
526 for_each_vcpu(d, v2)
527 for ( i = 0 ; i < 4 ; i++ )
528 {
529 if ( !pagetable_is_null(v2->arch.shadow_table[i]) )
530 {
531 shadow_unhook_mappings(v,
532 pagetable_get_mfn(v2->arch.shadow_table[i]));
533 cpus_or(flushmask, v2->vcpu_dirty_cpumask, flushmask);
535 /* See if that freed up a chunk of appropriate size */
536 if ( chunk_is_available(d, order) )
537 {
538 flush_tlb_mask(flushmask);
539 return;
540 }
541 }
542 }
544 /* Nothing more we can do: all remaining shadows are of pages that
545 * hold Xen mappings for some vcpu. This can never happen. */
546 SHADOW_PRINTK("Can't pre-allocate %i shadow pages!\n"
547 " shadow pages total = %u, free = %u, p2m=%u\n",
548 1 << order,
549 d->arch.shadow.total_pages,
550 d->arch.shadow.free_pages,
551 d->arch.shadow.p2m_pages);
552 BUG();
553 }
555 /* Deliberately free all the memory we can: this will tear down all of
556 * this domain's shadows */
557 static void shadow_blow_tables(struct domain *d)
558 {
559 struct list_head *l, *t;
560 struct shadow_page_info *sp;
561 struct vcpu *v = d->vcpu[0];
562 mfn_t smfn;
563 int i;
565 /* Pass one: unpin all pinned pages */
566 list_for_each_backwards_safe(l,t, &d->arch.shadow.pinned_shadows)
567 {
568 sp = list_entry(l, struct shadow_page_info, list);
569 smfn = shadow_page_to_mfn(sp);
570 sh_unpin(v, smfn);
571 }
573 /* Second pass: unhook entries of in-use shadows */
574 for_each_vcpu(d, v)
575 for ( i = 0 ; i < 4 ; i++ )
576 if ( !pagetable_is_null(v->arch.shadow_table[i]) )
577 shadow_unhook_mappings(v,
578 pagetable_get_mfn(v->arch.shadow_table[i]));
580 /* Make sure everyone sees the unshadowings */
581 flush_tlb_mask(d->domain_dirty_cpumask);
582 }
585 #ifndef NDEBUG
586 /* Blow all shadows of all shadowed domains: this can be used to cause the
587 * guest's pagetables to be re-shadowed if we suspect that the shadows
588 * have somehow got out of sync */
589 static void shadow_blow_all_tables(unsigned char c)
590 {
591 struct domain *d;
592 printk("'%c' pressed -> blowing all shadow tables\n", c);
593 for_each_domain(d)
594 if ( shadow_mode_enabled(d) && d->vcpu[0] != NULL )
595 {
596 shadow_lock(d);
597 shadow_blow_tables(d);
598 shadow_unlock(d);
599 }
600 }
602 /* Register this function in the Xen console keypress table */
603 static __init int shadow_blow_tables_keyhandler_init(void)
604 {
605 register_keyhandler('S', shadow_blow_all_tables,"reset shadow pagetables");
606 return 0;
607 }
608 __initcall(shadow_blow_tables_keyhandler_init);
609 #endif /* !NDEBUG */
611 /* Allocate another shadow's worth of (contiguous, aligned) pages,
612 * and fill in the type and backpointer fields of their page_infos.
613 * Never fails to allocate. */
614 mfn_t shadow_alloc(struct domain *d,
615 u32 shadow_type,
616 unsigned long backpointer)
617 {
618 struct shadow_page_info *sp = NULL;
619 unsigned int order = shadow_order(shadow_type);
620 cpumask_t mask;
621 void *p;
622 int i;
624 ASSERT(shadow_lock_is_acquired(d));
625 ASSERT(order <= SHADOW_MAX_ORDER);
626 ASSERT(shadow_type != SH_type_none);
627 perfc_incrc(shadow_alloc);
629 /* Find smallest order which can satisfy the request. */
630 for ( i = order; i <= SHADOW_MAX_ORDER; i++ )
631 if ( !list_empty(&d->arch.shadow.freelists[i]) )
632 {
633 sp = list_entry(d->arch.shadow.freelists[i].next,
634 struct shadow_page_info, list);
635 list_del(&sp->list);
637 /* We may have to halve the chunk a number of times. */
638 while ( i != order )
639 {
640 i--;
641 sp->order = i;
642 list_add_tail(&sp->list, &d->arch.shadow.freelists[i]);
643 sp += 1 << i;
644 }
645 d->arch.shadow.free_pages -= 1 << order;
647 /* Init page info fields and clear the pages */
648 for ( i = 0; i < 1<<order ; i++ )
649 {
650 /* Before we overwrite the old contents of this page,
651 * we need to be sure that no TLB holds a pointer to it. */
652 mask = d->domain_dirty_cpumask;
653 tlbflush_filter(mask, sp[i].tlbflush_timestamp);
654 if ( unlikely(!cpus_empty(mask)) )
655 {
656 perfc_incrc(shadow_alloc_tlbflush);
657 flush_tlb_mask(mask);
658 }
659 /* Now safe to clear the page for reuse */
660 p = sh_map_domain_page(shadow_page_to_mfn(sp+i));
661 ASSERT(p != NULL);
662 clear_page(p);
663 sh_unmap_domain_page(p);
664 INIT_LIST_HEAD(&sp[i].list);
665 sp[i].type = shadow_type;
666 sp[i].pinned = 0;
667 sp[i].logdirty = 0;
668 sp[i].count = 0;
669 sp[i].backpointer = backpointer;
670 sp[i].next_shadow = NULL;
671 perfc_incr(shadow_alloc_count);
672 }
673 return shadow_page_to_mfn(sp);
674 }
676 /* If we get here, we failed to allocate. This should never happen.
677 * It means that we didn't call shadow_prealloc() correctly before
678 * we allocated. We can't recover by calling prealloc here, because
679 * we might free up higher-level pages that the caller is working on. */
680 SHADOW_PRINTK("Can't allocate %i shadow pages!\n", 1 << order);
681 BUG();
682 }
685 /* Return some shadow pages to the pool. */
686 void shadow_free(struct domain *d, mfn_t smfn)
687 {
688 struct shadow_page_info *sp = mfn_to_shadow_page(smfn);
689 u32 shadow_type;
690 unsigned long order;
691 unsigned long mask;
692 int i;
694 ASSERT(shadow_lock_is_acquired(d));
695 perfc_incrc(shadow_free);
697 shadow_type = sp->type;
698 ASSERT(shadow_type != SH_type_none);
699 ASSERT(shadow_type != SH_type_p2m_table);
700 order = shadow_order(shadow_type);
702 d->arch.shadow.free_pages += 1 << order;
704 for ( i = 0; i < 1<<order; i++ )
705 {
706 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
707 struct vcpu *v;
708 for_each_vcpu(d, v)
709 {
710 /* No longer safe to look for a writeable mapping in this shadow */
711 if ( v->arch.shadow.last_writeable_pte_smfn == mfn_x(smfn) + i )
712 v->arch.shadow.last_writeable_pte_smfn = 0;
713 }
714 #endif
715 /* Strip out the type: this is now a free shadow page */
716 sp[i].type = 0;
717 /* Remember the TLB timestamp so we will know whether to flush
718 * TLBs when we reuse the page. Because the destructors leave the
719 * contents of the pages in place, we can delay TLB flushes until
720 * just before the allocator hands the page out again. */
721 sp[i].tlbflush_timestamp = tlbflush_current_time();
722 perfc_decr(shadow_alloc_count);
723 }
725 /* Merge chunks as far as possible. */
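/* Buddy arithmetic: the buddy of a 2^order-page block at shadow-page
 * index x is the block at x ^ (1 << order). Testing bit 'order' of the
 * mfn below tells us whether that buddy lies just before (bit set) or
 * just after (bit clear) this block; if it is free and of the same
 * order, the two coalesce into a block of the next order and we retry. */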
726 while ( order < SHADOW_MAX_ORDER )
727 {
728 mask = 1 << order;
729 if ( (mfn_x(shadow_page_to_mfn(sp)) & mask) ) {
730 /* Merge with predecessor block? */
731 if ( ((sp-mask)->type != PGT_none) || ((sp-mask)->order != order) )
732 break;
733 list_del(&(sp-mask)->list);
734 sp -= mask;
735 } else {
736 /* Merge with successor block? */
737 if ( ((sp+mask)->type != PGT_none) || ((sp+mask)->order != order) )
738 break;
739 list_del(&(sp+mask)->list);
740 }
741 order++;
742 }
744 sp->order = order;
745 list_add_tail(&sp->list, &d->arch.shadow.freelists[order]);
746 }
748 /* Divert some memory from the pool to be used by the p2m mapping.
749 * This action is irreversible: the p2m mapping only ever grows.
750 * That's OK because the p2m table only exists for translated domains,
751 * and those domains can't ever turn off shadow mode.
752 * Also, we only ever allocate a max-order chunk, so as to preserve
753 * the invariant that shadow_prealloc() always works.
754 * Returns 0 iff it can't get a chunk (the caller should then
755 * free up some pages in domheap and call set_sh_allocation);
756 * returns non-zero on success.
757 */
758 static int
759 shadow_alloc_p2m_pages(struct domain *d)
760 {
761 struct page_info *pg;
762 u32 i;
763 ASSERT(shadow_lock_is_acquired(d));
765 if ( d->arch.shadow.total_pages
766 < (shadow_min_acceptable_pages(d) + (1<<SHADOW_MAX_ORDER)) )
767 return 0; /* Not enough shadow memory: need to increase it first */
769 pg = mfn_to_page(shadow_alloc(d, SH_type_p2m_table, 0));
770 d->arch.shadow.p2m_pages += (1<<SHADOW_MAX_ORDER);
771 d->arch.shadow.total_pages -= (1<<SHADOW_MAX_ORDER);
772 for (i = 0; i < (1<<SHADOW_MAX_ORDER); i++)
773 {
774 /* Unlike shadow pages, mark p2m pages as owned by the domain.
775 * Marking the domain as the owner would normally allow the guest to
776 * create mappings of these pages, but these p2m pages will never be
777 * in the domain's guest-physical address space, and so that is not
778 * believed to be a concern.
779 */
780 page_set_owner(&pg[i], d);
781 list_add_tail(&pg[i].list, &d->arch.shadow.p2m_freelist);
782 }
783 return 1;
784 }
786 // Returns 0 if no memory is available...
787 mfn_t
788 shadow_alloc_p2m_page(struct domain *d)
789 {
790 struct list_head *entry;
791 struct page_info *pg;
792 mfn_t mfn;
793 void *p;
795 if ( list_empty(&d->arch.shadow.p2m_freelist) &&
796 !shadow_alloc_p2m_pages(d) )
797 return _mfn(0);
798 entry = d->arch.shadow.p2m_freelist.next;
799 list_del(entry);
800 list_add_tail(entry, &d->arch.shadow.p2m_inuse);
801 pg = list_entry(entry, struct page_info, list);
802 pg->count_info = 1;
803 mfn = page_to_mfn(pg);
804 p = sh_map_domain_page(mfn);
805 clear_page(p);
806 sh_unmap_domain_page(p);
808 return mfn;
809 }
811 #if CONFIG_PAGING_LEVELS == 3
812 static void p2m_install_entry_in_monitors(struct domain *d,
813 l3_pgentry_t *l3e)
814 /* Special case, only used for external-mode domains on PAE hosts:
815 * update the mapping of the p2m table. Once again, this is trivial in
816 * other paging modes (one top-level entry points to the top-level p2m,
817 * no maintenance needed), but PAE makes life difficult by needing a
818 * copy the eight l3es of the p2m table in eight l2h slots in the
819 * monitor table. This function makes fresh copies when a p2m l3e
820 * changes. */
821 {
822 l2_pgentry_t *ml2e;
823 struct vcpu *v;
824 unsigned int index;
826 index = ((unsigned long)l3e & ~PAGE_MASK) / sizeof(l3_pgentry_t);
827 ASSERT(index < MACHPHYS_MBYTES>>1);
829 for_each_vcpu(d, v)
830 {
831 if ( pagetable_get_pfn(v->arch.monitor_table) == 0 )
832 continue;
833 ASSERT(shadow_mode_external(v->domain));
835 SHADOW_DEBUG(P2M, "d=%u v=%u index=%u mfn=%#lx\n",
836 d->domain_id, v->vcpu_id, index, l3e_get_pfn(*l3e));
838 if ( v == current ) /* OK to use linear map of monitor_table */
839 ml2e = __linear_l2_table + l2_linear_offset(RO_MPT_VIRT_START);
840 else
841 {
842 l3_pgentry_t *ml3e;
843 ml3e = sh_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
844 ASSERT(l3e_get_flags(ml3e[3]) & _PAGE_PRESENT);
845 ml2e = sh_map_domain_page(_mfn(l3e_get_pfn(ml3e[3])));
846 ml2e += l2_table_offset(RO_MPT_VIRT_START);
847 sh_unmap_domain_page(ml3e);
848 }
849 ml2e[index] = l2e_from_pfn(l3e_get_pfn(*l3e), __PAGE_HYPERVISOR);
850 if ( v != current )
851 sh_unmap_domain_page(ml2e);
852 }
853 }
854 #endif
856 // Find the next level's P2M entry, checking for out-of-range gfn's...
857 // Returns NULL on error.
858 //
859 static l1_pgentry_t *
860 p2m_find_entry(void *table, unsigned long *gfn_remainder,
861 unsigned long gfn, u32 shift, u32 max)
862 {
863 u32 index;
865 index = *gfn_remainder >> shift;
866 if ( index >= max )
867 {
868 SHADOW_DEBUG(P2M, "gfn=0x%lx out of range "
869 "(gfn_remainder=0x%lx shift=%d index=0x%x max=0x%x)\n",
870 gfn, *gfn_remainder, shift, index, max);
871 return NULL;
872 }
873 *gfn_remainder &= (1 << shift) - 1;
874 return (l1_pgentry_t *)table + index;
875 }
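/* For example, with 4-level paging shadow_set_p2m_entry() below walks
 * the p2m using shifts of 39-12 == 27 (l4), 30-12 == 18 (l3) and
 * 21-12 == 9 (l2), with a final leaf lookup at shift 0: each step
 * consumes 9 bits of the gfn and indexes a 512-entry table. */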
877 // Walk one level of the P2M table, allocating a new table if required.
878 // Returns 0 on error.
879 //
880 static int
881 p2m_next_level(struct domain *d, mfn_t *table_mfn, void **table,
882 unsigned long *gfn_remainder, unsigned long gfn, u32 shift,
883 u32 max, unsigned long type)
884 {
885 l1_pgentry_t *p2m_entry;
886 void *next;
888 if ( !(p2m_entry = p2m_find_entry(*table, gfn_remainder, gfn,
889 shift, max)) )
890 return 0;
892 if ( !(l1e_get_flags(*p2m_entry) & _PAGE_PRESENT) )
893 {
894 mfn_t mfn = shadow_alloc_p2m_page(d);
895 if ( mfn_x(mfn) == 0 )
896 return 0;
897 *p2m_entry = l1e_from_pfn(mfn_x(mfn), __PAGE_HYPERVISOR|_PAGE_USER);
898 mfn_to_page(mfn)->u.inuse.type_info = type | 1 | PGT_validated;
899 mfn_to_page(mfn)->count_info = 1;
900 #if CONFIG_PAGING_LEVELS == 3
901 if (type == PGT_l2_page_table)
902 {
903 struct vcpu *v;
904 /* We have written to the p2m l3: need to sync the per-vcpu
905 * copies of it in the monitor tables */
906 p2m_install_entry_in_monitors(d, (l3_pgentry_t *)p2m_entry);
907 /* Also, any vcpus running on shadows of the p2m need to
908 * reload their CR3s so the change propagates to the shadow */
909 ASSERT(shadow_lock_is_acquired(d));
910 for_each_vcpu(d, v)
911 {
912 if ( pagetable_get_pfn(v->arch.guest_table)
913 == pagetable_get_pfn(d->arch.phys_table)
914 && v->arch.shadow.mode != NULL )
915 v->arch.shadow.mode->update_cr3(v);
916 }
917 }
918 #endif
919 /* The P2M can be shadowed: keep the shadows synced */
920 if ( d->vcpu[0] != NULL )
921 (void)__shadow_validate_guest_entry(d->vcpu[0], *table_mfn,
922 p2m_entry, sizeof *p2m_entry);
923 }
924 *table_mfn = _mfn(l1e_get_pfn(*p2m_entry));
925 next = sh_map_domain_page(*table_mfn);
926 sh_unmap_domain_page(*table);
927 *table = next;
929 return 1;
930 }
932 // Returns 0 on error (out of memory)
933 int
934 shadow_set_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn)
935 {
936 // XXX -- this might be able to be faster iff current->domain == d
937 mfn_t table_mfn = pagetable_get_mfn(d->arch.phys_table);
938 void *table = sh_map_domain_page(table_mfn);
939 unsigned long gfn_remainder = gfn;
940 l1_pgentry_t *p2m_entry;
942 #if CONFIG_PAGING_LEVELS >= 4
943 if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn,
944 L4_PAGETABLE_SHIFT - PAGE_SHIFT,
945 L4_PAGETABLE_ENTRIES, PGT_l3_page_table) )
946 return 0;
947 #endif
948 #if CONFIG_PAGING_LEVELS >= 3
949 // When using PAE Xen, we only allow 33 bits of pseudo-physical
950 // address in translated guests (i.e. 8 GBytes). This restriction
951 // comes from wanting to map the P2M table into the 16MB RO_MPT hole
952 // in Xen's address space for translated PV guests.
953 //
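// Worked example of that limit: 8GB of guest-physical space is 2^33
// bytes == 2^21 4k frames; at 8 bytes per PAE l1 entry the leaf level
// of the p2m is 2^21 * 8 == 16MB, which exactly fills the RO_MPT hole.
//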
954 if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn,
955 L3_PAGETABLE_SHIFT - PAGE_SHIFT,
956 (CONFIG_PAGING_LEVELS == 3
957 ? 8
958 : L3_PAGETABLE_ENTRIES),
959 PGT_l2_page_table) )
960 return 0;
961 #endif
962 if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn,
963 L2_PAGETABLE_SHIFT - PAGE_SHIFT,
964 L2_PAGETABLE_ENTRIES, PGT_l1_page_table) )
965 return 0;
967 p2m_entry = p2m_find_entry(table, &gfn_remainder, gfn,
968 0, L1_PAGETABLE_ENTRIES);
969 ASSERT(p2m_entry);
970 if ( valid_mfn(mfn) )
971 *p2m_entry = l1e_from_pfn(mfn_x(mfn), __PAGE_HYPERVISOR|_PAGE_USER);
972 else
973 *p2m_entry = l1e_empty();
975 /* Track the highest gfn for which we have ever had a valid mapping */
976 if ( valid_mfn(mfn) && (gfn > d->arch.max_mapped_pfn) )
977 d->arch.max_mapped_pfn = gfn;
979 /* The P2M can be shadowed: keep the shadows synced */
980 if ( d->vcpu[0] != NULL )
981 (void)__shadow_validate_guest_entry(
982 d->vcpu[0], table_mfn, p2m_entry, sizeof(*p2m_entry));
984 sh_unmap_domain_page(table);
986 return 1;
987 }
989 // Allocate a new p2m table for a domain.
990 //
991 // The structure of the p2m table is that of a pagetable for xen (i.e. it is
992 // controlled by CONFIG_PAGING_LEVELS).
993 //
994 // Returns 0 if p2m table could not be initialized
995 //
996 static int
997 shadow_alloc_p2m_table(struct domain *d)
998 {
999 mfn_t p2m_top, mfn;
1000 struct list_head *entry;
1001 struct page_info *page;
1002 unsigned int page_count = 0;
1003 unsigned long gfn;
1005 SHADOW_PRINTK("allocating p2m table\n");
1006 ASSERT(pagetable_get_pfn(d->arch.phys_table) == 0);
1008 p2m_top = shadow_alloc_p2m_page(d);
1009 mfn_to_page(p2m_top)->count_info = 1;
1010 mfn_to_page(p2m_top)->u.inuse.type_info =
1011 #if CONFIG_PAGING_LEVELS == 4
1012 PGT_l4_page_table
1013 #elif CONFIG_PAGING_LEVELS == 3
1014 PGT_l3_page_table
1015 #elif CONFIG_PAGING_LEVELS == 2
1016 PGT_l2_page_table
1017 #endif
1018 | 1 | PGT_validated;
1020 if ( mfn_x(p2m_top) == 0 )
1021 return 0;
1023 d->arch.phys_table = pagetable_from_mfn(p2m_top);
1025 SHADOW_PRINTK("populating p2m table\n");
1027 /* Initialise physmap tables for slot zero. Other code assumes this. */
1028 gfn = 0;
1029 mfn = _mfn(INVALID_MFN);
1030 if ( !shadow_set_p2m_entry(d, gfn, mfn) )
1031 goto error;
1033 for ( entry = d->page_list.next;
1034 entry != &d->page_list;
1035 entry = entry->next )
1037 page = list_entry(entry, struct page_info, list);
1038 mfn = page_to_mfn(page);
1039 gfn = get_gpfn_from_mfn(mfn_x(mfn));
1040 page_count++;
1041 if (
1042 #ifdef __x86_64__
1043 (gfn != 0x5555555555555555L)
1044 #else
1045 (gfn != 0x55555555L)
1046 #endif
1047 && gfn != INVALID_M2P_ENTRY
1048 && !shadow_set_p2m_entry(d, gfn, mfn) )
1049 goto error;
1052 SHADOW_PRINTK("p2m table initialised (%u pages)\n", page_count);
1053 return 1;
1055 error:
1056 SHADOW_PRINTK("failed to initialize p2m table, gfn=%05lx, mfn=%"
1057 SH_PRI_mfn "\n", gfn, mfn_x(mfn));
1058 return 0;
1061 mfn_t
1062 sh_gfn_to_mfn_foreign(struct domain *d, unsigned long gpfn)
1063 /* Read another domain's p2m entries */
1065 mfn_t mfn;
1066 unsigned long addr = gpfn << PAGE_SHIFT;
1067 l2_pgentry_t *l2e;
1068 l1_pgentry_t *l1e;
1070 ASSERT(shadow_mode_translate(d));
1071 mfn = pagetable_get_mfn(d->arch.phys_table);
1074 if ( gpfn > d->arch.max_mapped_pfn )
1075 /* This pfn is higher than the highest the p2m map currently holds */
1076 return _mfn(INVALID_MFN);
1078 #if CONFIG_PAGING_LEVELS >= 4
1080 l4_pgentry_t *l4e = sh_map_domain_page(mfn);
1081 l4e += l4_table_offset(addr);
1082 if ( (l4e_get_flags(*l4e) & _PAGE_PRESENT) == 0 )
1084 sh_unmap_domain_page(l4e);
1085 return _mfn(INVALID_MFN);
1087 mfn = _mfn(l4e_get_pfn(*l4e));
1088 sh_unmap_domain_page(l4e);
1090 #endif
1091 #if CONFIG_PAGING_LEVELS >= 3
1093 l3_pgentry_t *l3e = sh_map_domain_page(mfn);
1094 l3e += l3_table_offset(addr);
1095 if ( (l3e_get_flags(*l3e) & _PAGE_PRESENT) == 0 )
1097 sh_unmap_domain_page(l3e);
1098 return _mfn(INVALID_MFN);
1100 mfn = _mfn(l3e_get_pfn(*l3e));
1101 sh_unmap_domain_page(l3e);
1103 #endif
1105 l2e = sh_map_domain_page(mfn);
1106 l2e += l2_table_offset(addr);
1107 if ( (l2e_get_flags(*l2e) & _PAGE_PRESENT) == 0 )
1109 sh_unmap_domain_page(l2e);
1110 return _mfn(INVALID_MFN);
1112 mfn = _mfn(l2e_get_pfn(*l2e));
1113 sh_unmap_domain_page(l2e);
1115 l1e = sh_map_domain_page(mfn);
1116 l1e += l1_table_offset(addr);
1117 if ( (l1e_get_flags(*l1e) & _PAGE_PRESENT) == 0 )
1119 sh_unmap_domain_page(l1e);
1120 return _mfn(INVALID_MFN);
1122 mfn = _mfn(l1e_get_pfn(*l1e));
1123 sh_unmap_domain_page(l1e);
1125 return mfn;
1128 unsigned long
1129 shadow_gfn_to_mfn_foreign(unsigned long gpfn)
1130 {
1131 return mfn_x(sh_gfn_to_mfn_foreign(current->domain, gpfn));
1132 }
1135 static void shadow_p2m_teardown(struct domain *d)
1136 /* Return all the p2m pages to Xen.
1137 * We know we don't have any extra mappings to these pages */
1139 struct list_head *entry, *n;
1140 struct page_info *pg;
1142 d->arch.phys_table = pagetable_null();
1144 list_for_each_safe(entry, n, &d->arch.shadow.p2m_inuse)
1146 pg = list_entry(entry, struct page_info, list);
1147 list_del(entry);
1148 /* Should have just the one ref we gave it in alloc_p2m_page() */
1149 if ( (pg->count_info & PGC_count_mask) != 1 )
1151 SHADOW_PRINTK("Odd p2m page count c=%#x t=%"PRtype_info"\n",
1152 pg->count_info, pg->u.inuse.type_info);
1154 ASSERT(page_get_owner(pg) == d);
1155 /* Free should not decrement domain's total allocation, since
1156 * these pages were allocated without an owner. */
1157 page_set_owner(pg, NULL);
1158 free_domheap_pages(pg, 0);
1159 d->arch.shadow.p2m_pages--;
1160 perfc_decr(shadow_alloc_count);
1162 list_for_each_safe(entry, n, &d->arch.shadow.p2m_freelist)
1164 list_del(entry);
1165 pg = list_entry(entry, struct page_info, list);
1166 ASSERT(page_get_owner(pg) == d);
1167 /* Free should not decrement domain's total allocation. */
1168 page_set_owner(pg, NULL);
1169 free_domheap_pages(pg, 0);
1170 d->arch.shadow.p2m_pages--;
1171 perfc_decr(shadow_alloc_count);
1173 ASSERT(d->arch.shadow.p2m_pages == 0);
1176 /* Set the pool of shadow pages to the required number of pages.
1177 * Input will be rounded up to at least shadow_min_acceptable_pages(),
1178 * plus space for the p2m table.
1179 * Returns 0 for success, non-zero for failure. */
1180 static unsigned int set_sh_allocation(struct domain *d,
1181 unsigned int pages,
1182 int *preempted)
1184 struct shadow_page_info *sp;
1185 unsigned int lower_bound;
1186 int j;
1188 ASSERT(shadow_lock_is_acquired(d));
1190 /* Don't allocate less than the minimum acceptable, plus one page per
1191 * megabyte of RAM (for the p2m table) */
1192 lower_bound = shadow_min_acceptable_pages(d) + (d->tot_pages / 256);
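/* Worked example: 256 4k pages make up 1MB, so (d->tot_pages / 256) adds
 * one shadow page per megabyte of guest RAM. A 512MB, 1-vcpu domain
 * therefore gets a floor of 128 + 512 == 640 pages (2.5MB). */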
1193 if ( pages > 0 && pages < lower_bound )
1194 pages = lower_bound;
1195 /* Round up to largest block size */
1196 pages = (pages + ((1<<SHADOW_MAX_ORDER)-1)) & ~((1<<SHADOW_MAX_ORDER)-1);
1198 SHADOW_PRINTK("current %i target %i\n",
1199 d->arch.shadow.total_pages, pages);
1201 while ( d->arch.shadow.total_pages != pages )
1203 if ( d->arch.shadow.total_pages < pages )
1205 /* Need to allocate more memory from domheap */
1206 sp = (struct shadow_page_info *)
1207 alloc_domheap_pages(NULL, SHADOW_MAX_ORDER, 0);
1208 if ( sp == NULL )
1210 SHADOW_PRINTK("failed to allocate shadow pages.\n");
1211 return -ENOMEM;
1213 d->arch.shadow.free_pages += 1<<SHADOW_MAX_ORDER;
1214 d->arch.shadow.total_pages += 1<<SHADOW_MAX_ORDER;
1215 for ( j = 0; j < 1<<SHADOW_MAX_ORDER; j++ )
1217 sp[j].type = 0;
1218 sp[j].pinned = 0;
1219 sp[j].logdirty = 0;
1220 sp[j].count = 0;
1221 sp[j].mbz = 0;
1222 sp[j].tlbflush_timestamp = 0; /* Not in any TLB */
1224 sp->order = SHADOW_MAX_ORDER;
1225 list_add_tail(&sp->list,
1226 &d->arch.shadow.freelists[SHADOW_MAX_ORDER]);
1228 else if ( d->arch.shadow.total_pages > pages )
1230 /* Need to return memory to domheap */
1231 shadow_prealloc(d, SHADOW_MAX_ORDER);
1232 ASSERT(!list_empty(&d->arch.shadow.freelists[SHADOW_MAX_ORDER]));
1233 sp = list_entry(d->arch.shadow.freelists[SHADOW_MAX_ORDER].next,
1234 struct shadow_page_info, list);
1235 list_del(&sp->list);
1236 d->arch.shadow.free_pages -= 1<<SHADOW_MAX_ORDER;
1237 d->arch.shadow.total_pages -= 1<<SHADOW_MAX_ORDER;
1238 free_domheap_pages((struct page_info *)sp, SHADOW_MAX_ORDER);
1241 /* Check to see if we need to yield and try again */
1242 if ( preempted && hypercall_preempt_check() )
1244 *preempted = 1;
1245 return 0;
1249 return 0;
1252 unsigned int shadow_set_allocation(struct domain *d,
1253 unsigned int megabytes,
1254 int *preempted)
1255 /* Hypercall interface to set the shadow memory allocation */
1257 unsigned int rv;
1258 shadow_lock(d);
1259 rv = set_sh_allocation(d, megabytes << (20 - PAGE_SHIFT), preempted);
1260 SHADOW_PRINTK("dom %u allocation now %u pages (%u MB)\n",
1261 d->domain_id,
1262 d->arch.shadow.total_pages,
1263 shadow_get_allocation(d));
1264 shadow_unlock(d);
1265 return rv;
1268 /**************************************************************************/
1269 /* Hash table for storing the guest->shadow mappings.
1270 * The table itself is an array of pointers to shadows; the shadows are then
1271 * threaded on a singly-linked list of shadows with the same hash value */
1273 #define SHADOW_HASH_BUCKETS 251
1274 /* Other possibly useful primes are 509, 1021, 2039, 4093, 8191, 16381 */
1276 /* Hash function that takes a gfn or mfn, plus another byte of type info */
1277 typedef u32 key_t;
1278 static inline key_t sh_hash(unsigned long n, unsigned int t)
1279 {
1280 unsigned char *p = (unsigned char *)&n;
1281 key_t k = t;
1282 int i;
1283 for ( i = 0; i < sizeof(n) ; i++ ) k = (u32)p[i] + (k<<6) + (k<<16) - k;
1284 return k % SHADOW_HASH_BUCKETS;
1285 }
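/* Illustrative note: the loop above is the sdbm string hash,
 * k = byte + (k << 6) + (k << 16) - k (i.e. k = byte + k * 65599),
 * run over the bytes of the gfn/mfn, seeded with the shadow type and
 * reduced modulo the prime bucket count. */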
1287 #if SHADOW_AUDIT & (SHADOW_AUDIT_HASH|SHADOW_AUDIT_HASH_FULL)
1289 /* Before we get to the mechanism, define a pair of audit functions
1290 * that sanity-check the contents of the hash table. */
1291 static void sh_hash_audit_bucket(struct domain *d, int bucket)
1292 /* Audit one bucket of the hash table */
1294 struct shadow_page_info *sp, *x;
1296 if ( !(SHADOW_AUDIT_ENABLE) )
1297 return;
1299 sp = d->arch.shadow.hash_table[bucket];
1300 while ( sp )
1302 /* Not a shadow? */
1303 BUG_ON( sp->mbz != 0 );
1304 /* Bogus type? */
1305 BUG_ON( sp->type == 0 );
1306 BUG_ON( sp->type > SH_type_max_shadow );
1307 /* Wrong bucket? */
1308 BUG_ON( sh_hash(sp->backpointer, sp->type) != bucket );
1309 /* Duplicate entry? */
1310 for ( x = sp->next_shadow; x; x = x->next_shadow )
1311 BUG_ON( x->backpointer == sp->backpointer && x->type == sp->type );
1312 /* Follow the backpointer to the guest pagetable */
1313 if ( sp->type != SH_type_fl1_32_shadow
1314 && sp->type != SH_type_fl1_pae_shadow
1315 && sp->type != SH_type_fl1_64_shadow )
1317 struct page_info *gpg = mfn_to_page(_mfn(sp->backpointer));
1318 /* Bad shadow flags on guest page? */
1319 BUG_ON( !(gpg->shadow_flags & (1<<sp->type)) );
1320 /* Bad type count on guest page? */
1321 if ( (gpg->u.inuse.type_info & PGT_type_mask) == PGT_writable_page
1322 && (gpg->u.inuse.type_info & PGT_count_mask) != 0 )
1324 SHADOW_ERROR("MFN %#lx shadowed (by %#"SH_PRI_mfn")"
1325 " but has typecount %#lx\n",
1326 sp->backpointer, mfn_x(shadow_page_to_mfn(sp)),
1327 gpg->u.inuse.type_info);
1328 BUG();
1331 /* That entry was OK; on we go */
1332 sp = sp->next_shadow;
1336 #else
1337 #define sh_hash_audit_bucket(_d, _b) do {} while(0)
1338 #endif /* Hashtable bucket audit */
1341 #if SHADOW_AUDIT & SHADOW_AUDIT_HASH_FULL
1343 static void sh_hash_audit(struct domain *d)
1344 /* Full audit: audit every bucket in the table */
1346 int i;
1348 if ( !(SHADOW_AUDIT_ENABLE) )
1349 return;
1351 for ( i = 0; i < SHADOW_HASH_BUCKETS; i++ )
1353 sh_hash_audit_bucket(d, i);
1357 #else
1358 #define sh_hash_audit(_d) do {} while(0)
1359 #endif /* Hashtable bucket audit */
1361 /* Allocate and initialise the table itself.
1362 * Returns 0 for success, 1 for error. */
1363 static int shadow_hash_alloc(struct domain *d)
1365 struct shadow_page_info **table;
1367 ASSERT(shadow_lock_is_acquired(d));
1368 ASSERT(!d->arch.shadow.hash_table);
1370 table = xmalloc_array(struct shadow_page_info *, SHADOW_HASH_BUCKETS);
1371 if ( !table ) return 1;
1372 memset(table, 0,
1373 SHADOW_HASH_BUCKETS * sizeof (struct shadow_page_info *));
1374 d->arch.shadow.hash_table = table;
1375 return 0;
1378 /* Tear down the hash table and return all memory to Xen.
1379 * This function does not care whether the table is populated. */
1380 static void shadow_hash_teardown(struct domain *d)
1382 ASSERT(shadow_lock_is_acquired(d));
1383 ASSERT(d->arch.shadow.hash_table);
1385 xfree(d->arch.shadow.hash_table);
1386 d->arch.shadow.hash_table = NULL;
1390 mfn_t shadow_hash_lookup(struct vcpu *v, unsigned long n, unsigned int t)
1391 /* Find an entry in the hash table. Returns the MFN of the shadow,
1392 * or INVALID_MFN if it doesn't exist */
1394 struct domain *d = v->domain;
1395 struct shadow_page_info *sp, *prev;
1396 key_t key;
1398 ASSERT(shadow_lock_is_acquired(d));
1399 ASSERT(d->arch.shadow.hash_table);
1400 ASSERT(t);
1402 sh_hash_audit(d);
1404 perfc_incrc(shadow_hash_lookups);
1405 key = sh_hash(n, t);
1406 sh_hash_audit_bucket(d, key);
1408 sp = d->arch.shadow.hash_table[key];
1409 prev = NULL;
1410 while(sp)
1412 if ( sp->backpointer == n && sp->type == t )
1414 /* Pull-to-front if 'sp' isn't already the head item */
1415 if ( unlikely(sp != d->arch.shadow.hash_table[key]) )
1417 if ( unlikely(d->arch.shadow.hash_walking != 0) )
1418 /* Can't reorder: someone is walking the hash chains */
1419 return shadow_page_to_mfn(sp);
1420 else
1422 ASSERT(prev);
1423 /* Delete sp from the list */
1424 prev->next_shadow = sp->next_shadow;
1425 /* Re-insert it at the head of the list */
1426 sp->next_shadow = d->arch.shadow.hash_table[key];
1427 d->arch.shadow.hash_table[key] = sp;
1430 else
1432 perfc_incrc(shadow_hash_lookup_head);
1434 return shadow_page_to_mfn(sp);
1436 prev = sp;
1437 sp = sp->next_shadow;
1440 perfc_incrc(shadow_hash_lookup_miss);
1441 return _mfn(INVALID_MFN);
1444 void shadow_hash_insert(struct vcpu *v, unsigned long n, unsigned int t,
1445 mfn_t smfn)
1446 /* Put a mapping (n,t)->smfn into the hash table */
1448 struct domain *d = v->domain;
1449 struct shadow_page_info *sp;
1450 key_t key;
1452 ASSERT(shadow_lock_is_acquired(d));
1453 ASSERT(d->arch.shadow.hash_table);
1454 ASSERT(t);
1456 sh_hash_audit(d);
1458 perfc_incrc(shadow_hash_inserts);
1459 key = sh_hash(n, t);
1460 sh_hash_audit_bucket(d, key);
1462 /* Insert this shadow at the top of the bucket */
1463 sp = mfn_to_shadow_page(smfn);
1464 sp->next_shadow = d->arch.shadow.hash_table[key];
1465 d->arch.shadow.hash_table[key] = sp;
1467 sh_hash_audit_bucket(d, key);
1470 void shadow_hash_delete(struct vcpu *v, unsigned long n, unsigned int t,
1471 mfn_t smfn)
1472 /* Excise the mapping (n,t)->smfn from the hash table */
1474 struct domain *d = v->domain;
1475 struct shadow_page_info *sp, *x;
1476 key_t key;
1478 ASSERT(shadow_lock_is_acquired(d));
1479 ASSERT(d->arch.shadow.hash_table);
1480 ASSERT(t);
1482 sh_hash_audit(d);
1484 perfc_incrc(shadow_hash_deletes);
1485 key = sh_hash(n, t);
1486 sh_hash_audit_bucket(d, key);
1488 sp = mfn_to_shadow_page(smfn);
1489 if ( d->arch.shadow.hash_table[key] == sp )
1490 /* Easy case: we're deleting the head item. */
1491 d->arch.shadow.hash_table[key] = sp->next_shadow;
1492 else
1494 /* Need to search for the one we want */
1495 x = d->arch.shadow.hash_table[key];
1496 while ( 1 )
1498 ASSERT(x); /* We can't have hit the end, since our target is
1499 * still in the chain somewhere... */
1500 if ( x->next_shadow == sp )
1502 x->next_shadow = sp->next_shadow;
1503 break;
1505 x = x->next_shadow;
1508 sp->next_shadow = NULL;
1510 sh_hash_audit_bucket(d, key);
1513 typedef int (*hash_callback_t)(struct vcpu *v, mfn_t smfn, mfn_t other_mfn);
1515 static void hash_foreach(struct vcpu *v,
1516 unsigned int callback_mask,
1517 hash_callback_t callbacks[],
1518 mfn_t callback_mfn)
1519 /* Walk the hash table looking at the types of the entries and
1520 * calling the appropriate callback function for each entry.
1521 * The mask determines which shadow types we call back for, and the array
1522 * of callbacks tells us which function to call.
1523 * Any callback may return non-zero to let us skip the rest of the scan.
1525 * WARNING: Callbacks MUST NOT add or remove hash entries unless they
1526 * then return non-zero to terminate the scan. */
1528 int i, done = 0;
1529 struct domain *d = v->domain;
1530 struct shadow_page_info *x;
1532 /* Say we're here, to stop hash-lookups reordering the chains */
1533 ASSERT(shadow_lock_is_acquired(d));
1534 ASSERT(d->arch.shadow.hash_walking == 0);
1535 d->arch.shadow.hash_walking = 1;
1537 for ( i = 0; i < SHADOW_HASH_BUCKETS; i++ )
1539 /* WARNING: This is not safe against changes to the hash table.
1540 * The callback *must* return non-zero if it has inserted or
1541 * deleted anything from the hash (lookups are OK, though). */
1542 for ( x = d->arch.shadow.hash_table[i]; x; x = x->next_shadow )
1544 if ( callback_mask & (1 << x->type) )
1546 ASSERT(x->type <= 15);
1547 ASSERT(callbacks[x->type] != NULL);
1548 done = callbacks[x->type](v, shadow_page_to_mfn(x),
1549 callback_mfn);
1550 if ( done ) break;
1553 if ( done ) break;
1555 d->arch.shadow.hash_walking = 0;
1559 /**************************************************************************/
1560 /* Destroy a shadow page: simple dispatcher to call the per-type destructor
1561 * which will decrement refcounts appropriately and return memory to the
1562 * free pool. */
1564 void sh_destroy_shadow(struct vcpu *v, mfn_t smfn)
1566 struct shadow_page_info *sp = mfn_to_shadow_page(smfn);
1567 unsigned int t = sp->type;
1570 SHADOW_PRINTK("smfn=%#lx\n", mfn_x(smfn));
1572 /* Double-check, if we can, that the shadowed page belongs to this
1573 * domain, (by following the back-pointer). */
1574 ASSERT(t == SH_type_fl1_32_shadow ||
1575 t == SH_type_fl1_pae_shadow ||
1576 t == SH_type_fl1_64_shadow ||
1577 t == SH_type_monitor_table ||
1578 (page_get_owner(mfn_to_page(_mfn(sp->backpointer)))
1579 == v->domain));
1581 /* The down-shifts here are so that the switch statement is on nice
1582 * small numbers that the compiler will enjoy */
1583 switch ( t )
1585 #if CONFIG_PAGING_LEVELS == 2
1586 case SH_type_l1_32_shadow:
1587 case SH_type_fl1_32_shadow:
1588 SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 2, 2)(v, smfn);
1589 break;
1590 case SH_type_l2_32_shadow:
1591 SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 2, 2)(v, smfn);
1592 break;
1593 #else /* PAE or 64bit */
1594 case SH_type_l1_32_shadow:
1595 case SH_type_fl1_32_shadow:
1596 SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 3, 2)(v, smfn);
1597 break;
1598 case SH_type_l2_32_shadow:
1599 SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 3, 2)(v, smfn);
1600 break;
1601 #endif
1603 #if CONFIG_PAGING_LEVELS >= 3
1604 case SH_type_l1_pae_shadow:
1605 case SH_type_fl1_pae_shadow:
1606 SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 3, 3)(v, smfn);
1607 break;
1608 case SH_type_l2_pae_shadow:
1609 case SH_type_l2h_pae_shadow:
1610 SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 3, 3)(v, smfn);
1611 break;
1612 #endif
1614 #if CONFIG_PAGING_LEVELS >= 4
1615 case SH_type_l1_64_shadow:
1616 case SH_type_fl1_64_shadow:
1617 SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 4, 4)(v, smfn);
1618 break;
1619 case SH_type_l2_64_shadow:
1620 SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 4, 4)(v, smfn);
1621 break;
1622 case SH_type_l3_64_shadow:
1623 SHADOW_INTERNAL_NAME(sh_destroy_l3_shadow, 4, 4)(v, smfn);
1624 break;
1625 case SH_type_l4_64_shadow:
1626 SHADOW_INTERNAL_NAME(sh_destroy_l4_shadow, 4, 4)(v, smfn);
1627 break;
1628 #endif
1629 default:
1630 SHADOW_PRINTK("tried to destroy shadow of bad type %08lx\n",
1631 (unsigned long)t);
1632 BUG();
1636 /**************************************************************************/
1637 /* Remove all writeable mappings of a guest frame from the shadow tables
1638 * Returns non-zero if we need to flush TLBs.
1639 * level and fault_addr describe how we found this to be a pagetable;
1640 * level==0 means we have some other reason for revoking write access.*/
1642 int shadow_remove_write_access(struct vcpu *v, mfn_t gmfn,
1643 unsigned int level,
1644 unsigned long fault_addr)
1646 /* Dispatch table for getting per-type functions */
1647 static hash_callback_t callbacks[16] = {
1648 NULL, /* none */
1649 #if CONFIG_PAGING_LEVELS == 2
1650 SHADOW_INTERNAL_NAME(sh_remove_write_access,2,2), /* l1_32 */
1651 SHADOW_INTERNAL_NAME(sh_remove_write_access,2,2), /* fl1_32 */
1652 #else
1653 SHADOW_INTERNAL_NAME(sh_remove_write_access,3,2), /* l1_32 */
1654 SHADOW_INTERNAL_NAME(sh_remove_write_access,3,2), /* fl1_32 */
1655 #endif
1656 NULL, /* l2_32 */
1657 #if CONFIG_PAGING_LEVELS >= 3
1658 SHADOW_INTERNAL_NAME(sh_remove_write_access,3,3), /* l1_pae */
1659 SHADOW_INTERNAL_NAME(sh_remove_write_access,3,3), /* fl1_pae */
1660 #else
1661 NULL, /* l1_pae */
1662 NULL, /* fl1_pae */
1663 #endif
1664 NULL, /* l2_pae */
1665 NULL, /* l2h_pae */
1666 #if CONFIG_PAGING_LEVELS >= 4
1667 SHADOW_INTERNAL_NAME(sh_remove_write_access,4,4), /* l1_64 */
1668 SHADOW_INTERNAL_NAME(sh_remove_write_access,4,4), /* fl1_64 */
1669 #else
1670 NULL, /* l1_64 */
1671 NULL, /* fl1_64 */
1672 #endif
1673 NULL, /* l2_64 */
1674 NULL, /* l3_64 */
1675 NULL, /* l4_64 */
1676 NULL, /* p2m */
1677 NULL /* unused */
1678 };
1680 static unsigned int callback_mask =
1681 1 << SH_type_l1_32_shadow
1682 | 1 << SH_type_fl1_32_shadow
1683 | 1 << SH_type_l1_pae_shadow
1684 | 1 << SH_type_fl1_pae_shadow
1685 | 1 << SH_type_l1_64_shadow
1686 | 1 << SH_type_fl1_64_shadow
1688 struct page_info *pg = mfn_to_page(gmfn);
1690 ASSERT(shadow_lock_is_acquired(v->domain));
1692 /* Only remove writable mappings if we are doing shadow refcounts.
1693 * In guest refcounting, we trust Xen to already be restricting
1694 * all the writes to the guest page tables, so we do not need to
1695 * do more. */
1696 if ( !shadow_mode_refcounts(v->domain) )
1697 return 0;
1699 /* Early exit if it's already a pagetable, or otherwise not writeable */
1700 if ( sh_mfn_is_a_page_table(gmfn)
1701 || (pg->u.inuse.type_info & PGT_count_mask) == 0 )
1702 return 0;
1704 perfc_incrc(shadow_writeable);
1706 /* If this isn't a "normal" writeable page, the domain is trying to
1707 * put pagetables in special memory of some kind. We can't allow that. */
1708 if ( (pg->u.inuse.type_info & PGT_type_mask) != PGT_writable_page )
1710 SHADOW_ERROR("can't remove write access to mfn %lx, type_info is %"
1711 PRtype_info "\n",
1712 mfn_x(gmfn), mfn_to_page(gmfn)->u.inuse.type_info);
1713 domain_crash(v->domain);
1716 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
1717 if ( v == current && level != 0 )
1719 unsigned long gfn;
1720 /* Heuristic: there is likely to be only one writeable mapping,
1721 * and that mapping is likely to be in the current pagetable,
1722 * in the guest's linear map (on non-HIGHPTE linux and windows)*/
1724 #define GUESS(_a, _h) do { \
1725 if ( v->arch.shadow.mode->guess_wrmap(v, (_a), gmfn) ) \
1726 perfc_incrc(shadow_writeable_h_ ## _h); \
1727 if ( (pg->u.inuse.type_info & PGT_count_mask) == 0 ) \
1728 return 1; \
1729 } while (0)
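/* Where the shifts in the guesses below come from: in a linear
 * pagetable mapping the PTE for virtual address va lives at
 * base + (va / PAGE_SIZE) * sizeof(PTE). With 4-byte 2-level PTEs
 * that is base + (va >> 10); with 8-byte PAE/64-bit entries it is
 * (va >> 9) for l1es, (va >> 18) for l2es and (va >> 27) for l3es. */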
1732 if ( v->arch.shadow.mode->guest_levels == 2 )
1734 if ( level == 1 )
1735 /* 32bit non-PAE w2k3: linear map at 0xC0000000 */
1736 GUESS(0xC0000000UL + (fault_addr >> 10), 1);
1738 /* Linux lowmem: first 896MB is mapped 1-to-1 above 0xC0000000 */
1739 if ((gfn = sh_mfn_to_gfn(v->domain, gmfn)) < 0x38000 )
1740 GUESS(0xC0000000UL + (gfn << PAGE_SHIFT), 4);
1743 #if CONFIG_PAGING_LEVELS >= 3
1744 else if ( v->arch.shadow.mode->guest_levels == 3 )
1746 /* 32bit PAE w2k3: linear map at 0xC0000000 */
1747 switch ( level )
1749 case 1: GUESS(0xC0000000UL + (fault_addr >> 9), 2); break;
1750 case 2: GUESS(0xC0600000UL + (fault_addr >> 18), 2); break;
1753 /* Linux lowmem: first 896MB is mapped 1-to-1 above 0xC0000000 */
1754 if ((gfn = sh_mfn_to_gfn(v->domain, gmfn)) < 0x38000 )
1755 GUESS(0xC0000000UL + (gfn << PAGE_SHIFT), 4);
1757 #if CONFIG_PAGING_LEVELS >= 4
1758 else if ( v->arch.shadow.mode->guest_levels == 4 )
1760 /* 64bit w2k3: linear map at 0x0000070000000000 */
1761 switch ( level )
1763 case 1: GUESS(0x70000000000UL + (fault_addr >> 9), 3); break;
1764 case 2: GUESS(0x70380000000UL + (fault_addr >> 18), 3); break;
1765 case 3: GUESS(0x70381C00000UL + (fault_addr >> 27), 3); break;
1768 /* 64bit Linux direct map at 0xffff810000000000; older kernels
1769 * had it at 0x0000010000000000UL */
1770 gfn = sh_mfn_to_gfn(v->domain, gmfn);
1771 GUESS(0xffff810000000000UL + (gfn << PAGE_SHIFT), 4);
1772 GUESS(0x0000010000000000UL + (gfn << PAGE_SHIFT), 4);
1774 #endif /* CONFIG_PAGING_LEVELS >= 4 */
1775 #endif /* CONFIG_PAGING_LEVELS >= 3 */
1777 #undef GUESS
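/* Worked example of the guess arithmetic above (an illustration only;
 * the base addresses come from the guest layouts named in the comments).
 * If a guest keeps a linear map of its pagetables at linear_base, the l1
 * entry covering virtual address va sits at
 *     linear_base + (va >> PAGE_SHIFT) * sizeof(l1 entry),
 * i.e. linear_base + (va >> 10) with 4-byte PTEs (2-level guests) and
 * linear_base + (va >> 9) with 8-byte PTEs (PAE and 64-bit guests).
 * For instance, on a 2-level w2k3 guest a fault at 0xb0001000 yields the
 * guess 0xc0000000 + (0xb0001000 >> 10) = 0xc02c0004, the address of the
 * PTE mapping that page.  The l2 and l3 guesses shift by 9 more bits per
 * level because each higher-level entry covers 512 times as much address
 * space. */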
1780 if ( (pg->u.inuse.type_info & PGT_count_mask) == 0 )
1781 return 1;
1783 /* Second heuristic: on HIGHPTE linux, there are two particular PTEs
1784 * (entries in the fixmap) where linux maps its pagetables. Since
1785 * we expect to hit them most of the time, we start the search for
1786 * the writeable mapping by looking at the same MFN where the last
1787 * brute-force search succeeded. */
1789 if ( v->arch.shadow.last_writeable_pte_smfn != 0 )
1791 unsigned long old_count = (pg->u.inuse.type_info & PGT_count_mask);
1792 mfn_t last_smfn = _mfn(v->arch.shadow.last_writeable_pte_smfn);
1793 int shtype = mfn_to_shadow_page(last_smfn)->type;
1795 if ( callbacks[shtype] )
1796 callbacks[shtype](v, last_smfn, gmfn);
1798 if ( (pg->u.inuse.type_info & PGT_count_mask) != old_count )
1799 perfc_incrc(shadow_writeable_h_5);
1802 if ( (pg->u.inuse.type_info & PGT_count_mask) == 0 )
1803 return 1;
1805 #endif /* SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC */
1807 /* Brute-force search of all the shadows, by walking the hash */
1808 perfc_incrc(shadow_writeable_bf);
1809 hash_foreach(v, callback_mask, callbacks, gmfn);
1811 /* If that didn't catch the mapping, something is very wrong */
1812 if ( (mfn_to_page(gmfn)->u.inuse.type_info & PGT_count_mask) != 0 )
1814 SHADOW_ERROR("can't find all writeable mappings of mfn %lx: "
1815 "%lu left\n", mfn_x(gmfn),
1816 (mfn_to_page(gmfn)->u.inuse.type_info&PGT_count_mask));
1817 domain_crash(v->domain);
1820 /* We killed at least one writeable mapping, so must flush TLBs. */
1821 return 1;
1826 /**************************************************************************/
1827 /* Remove all mappings of a guest frame from the shadow tables.
1828 * Returns non-zero if we need to flush TLBs. */
1830 int shadow_remove_all_mappings(struct vcpu *v, mfn_t gmfn)
1832 struct page_info *page = mfn_to_page(gmfn);
1833 int expected_count;
1835 /* Dispatch table for getting per-type functions */
1836 static hash_callback_t callbacks[16] = {
1837 NULL, /* none */
1838 #if CONFIG_PAGING_LEVELS == 2
1839 SHADOW_INTERNAL_NAME(sh_remove_all_mappings,2,2), /* l1_32 */
1840 SHADOW_INTERNAL_NAME(sh_remove_all_mappings,2,2), /* fl1_32 */
1841 #else
1842 SHADOW_INTERNAL_NAME(sh_remove_all_mappings,3,2), /* l1_32 */
1843 SHADOW_INTERNAL_NAME(sh_remove_all_mappings,3,2), /* fl1_32 */
1844 #endif
1845 NULL, /* l2_32 */
1846 #if CONFIG_PAGING_LEVELS >= 3
1847 SHADOW_INTERNAL_NAME(sh_remove_all_mappings,3,3), /* l1_pae */
1848 SHADOW_INTERNAL_NAME(sh_remove_all_mappings,3,3), /* fl1_pae */
1849 #else
1850 NULL, /* l1_pae */
1851 NULL, /* fl1_pae */
1852 #endif
1853 NULL, /* l2_pae */
1854 NULL, /* l2h_pae */
1855 #if CONFIG_PAGING_LEVELS >= 4
1856 SHADOW_INTERNAL_NAME(sh_remove_all_mappings,4,4), /* l1_64 */
1857 SHADOW_INTERNAL_NAME(sh_remove_all_mappings,4,4), /* fl1_64 */
1858 #else
1859 NULL, /* l1_64 */
1860 NULL, /* fl1_64 */
1861 #endif
1862 NULL, /* l2_64 */
1863 NULL, /* l3_64 */
1864 NULL, /* l4_64 */
1865 NULL, /* p2m */
1866 NULL /* unused */
1867 };
1869 static unsigned int callback_mask =
1870 1 << SH_type_l1_32_shadow
1871 | 1 << SH_type_fl1_32_shadow
1872 | 1 << SH_type_l1_pae_shadow
1873 | 1 << SH_type_fl1_pae_shadow
1874 | 1 << SH_type_l1_64_shadow
1875 | 1 << SH_type_fl1_64_shadow
1878 perfc_incrc(shadow_mappings);
1879 if ( (page->count_info & PGC_count_mask) == 0 )
1880 return 0;
1882 ASSERT(shadow_lock_is_acquired(v->domain));
1884 /* XXX TODO:
1885 * Heuristics for finding the (probably) single mapping of this gmfn */
1887 /* Brute-force search of all the shadows, by walking the hash */
1888 perfc_incrc(shadow_mappings_bf);
1889 hash_foreach(v, callback_mask, callbacks, gmfn);
1891 /* If that didn't catch the mapping, something is very wrong */
1892 expected_count = (page->count_info & PGC_allocated) ? 1 : 0;
1893 if ( (page->count_info & PGC_count_mask) != expected_count )
1895 /* Don't complain if we're in HVM and there's one extra mapping:
1896 * The qemu helper process has an untyped mapping of this dom's RAM */
1897 if ( !(shadow_mode_external(v->domain)
1898 && (page->count_info & PGC_count_mask) <= 2
1899 && (page->u.inuse.type_info & PGT_count_mask) == 0) )
1901 SHADOW_ERROR("can't find all mappings of mfn %lx: "
1902 "c=%08x t=%08lx\n", mfn_x(gmfn),
1903 page->count_info, page->u.inuse.type_info);
1907 /* We killed at least one mapping, so must flush TLBs. */
1908 return 1;
1912 /**************************************************************************/
1913 /* Remove all shadows of a guest frame from the shadow tables */
1915 static int sh_remove_shadow_via_pointer(struct vcpu *v, mfn_t smfn)
1916 /* Follow this shadow's up-pointer, if it has one, and remove the reference
1917 * found there. Returns 1 if that was the only reference to this shadow */
1919 struct shadow_page_info *sp = mfn_to_shadow_page(smfn);
1920 mfn_t pmfn;
1921 void *vaddr;
1922 int rc;
1924 ASSERT(sp->type > 0);
1925 ASSERT(sp->type < SH_type_max_shadow);
1926 ASSERT(sp->type != SH_type_l2_32_shadow);
1927 ASSERT(sp->type != SH_type_l2_pae_shadow);
1928 ASSERT(sp->type != SH_type_l2h_pae_shadow);
1929 ASSERT(sp->type != SH_type_l4_64_shadow);
1931 if (sp->up == 0) return 0;
1932 pmfn = _mfn(sp->up >> PAGE_SHIFT);
1933 ASSERT(valid_mfn(pmfn));
1934 vaddr = sh_map_domain_page(pmfn);
1935 ASSERT(vaddr);
1936 vaddr += sp->up & (PAGE_SIZE-1);
1937 ASSERT(l1e_get_pfn(*(l1_pgentry_t *)vaddr) == mfn_x(smfn));
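/* Decode sketch for the up-pointer used above: sp->up packs the parent
 * shadow's mfn and the byte offset of the referencing entry as
 * (parent_mfn << PAGE_SHIFT) | offset.  As a purely illustrative
 * example, a reference held in 8-byte entry 5 of a parent shadow at mfn
 * 0x1234 would be stored as (0x1234 << PAGE_SHIFT) + 5*8 = 0x1234028,
 * so pmfn decodes to 0x1234 and vaddr ends up pointing at that entry. */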
1939 /* Is this the only reference to this shadow? */
1940 rc = (sp->count == 1) ? 1 : 0;
1942 /* Blank the offending entry */
1943 switch (sp->type)
1945 case SH_type_l1_32_shadow:
1946 case SH_type_l2_32_shadow:
1947 #if CONFIG_PAGING_LEVELS == 2
1948 SHADOW_INTERNAL_NAME(sh_clear_shadow_entry,2,2)(v, vaddr, pmfn);
1949 #else
1950 SHADOW_INTERNAL_NAME(sh_clear_shadow_entry,3,2)(v, vaddr, pmfn);
1951 #endif
1952 break;
1953 #if CONFIG_PAGING_LEVELS >=3
1954 case SH_type_l1_pae_shadow:
1955 case SH_type_l2_pae_shadow:
1956 case SH_type_l2h_pae_shadow:
1957 SHADOW_INTERNAL_NAME(sh_clear_shadow_entry,3,3)(v, vaddr, pmfn);
1958 break;
1959 #if CONFIG_PAGING_LEVELS >= 4
1960 case SH_type_l1_64_shadow:
1961 case SH_type_l2_64_shadow:
1962 case SH_type_l3_64_shadow:
1963 case SH_type_l4_64_shadow:
1964 SHADOW_INTERNAL_NAME(sh_clear_shadow_entry,4,4)(v, vaddr, pmfn);
1965 break;
1966 #endif
1967 #endif
1968 default: BUG(); /* Some weird unknown shadow type */
1971 sh_unmap_domain_page(vaddr);
1972 if ( rc )
1973 perfc_incrc(shadow_up_pointer);
1974 else
1975 perfc_incrc(shadow_unshadow_bf);
1977 return rc;
1980 void sh_remove_shadows(struct vcpu *v, mfn_t gmfn, int fast, int all)
1981 /* Remove the shadows of this guest page.
1982 * If fast != 0, just try the quick heuristic, which will remove
1983 * at most one reference to each shadow of the page. Otherwise, walk
1984 * all the shadow tables looking for refs to shadows of this gmfn.
1985 * If all != 0, kill the domain if we can't find all the shadows.
1986 * (all != 0 implies fast == 0)
1987 */
1989 struct page_info *pg;
1990 mfn_t smfn;
1991 u32 sh_flags;
1992 unsigned char t;
1994 /* Dispatch table for getting per-type functions: each level must
1995 * be called with the function to remove a lower-level shadow. */
1996 static hash_callback_t callbacks[16] = {
1997 NULL, /* none */
1998 NULL, /* l1_32 */
1999 NULL, /* fl1_32 */
2000 #if CONFIG_PAGING_LEVELS == 2
2001 SHADOW_INTERNAL_NAME(sh_remove_l1_shadow,2,2), /* l2_32 */
2002 #else
2003 SHADOW_INTERNAL_NAME(sh_remove_l1_shadow,3,2), /* l2_32 */
2004 #endif
2005 NULL, /* l1_pae */
2006 NULL, /* fl1_pae */
2007 #if CONFIG_PAGING_LEVELS >= 3
2008 SHADOW_INTERNAL_NAME(sh_remove_l1_shadow,3,3), /* l2_pae */
2009 SHADOW_INTERNAL_NAME(sh_remove_l1_shadow,3,3), /* l2h_pae */
2010 #else
2011 NULL, /* l2_pae */
2012 NULL, /* l2h_pae */
2013 #endif
2014 NULL, /* l1_64 */
2015 NULL, /* fl1_64 */
2016 #if CONFIG_PAGING_LEVELS >= 4
2017 SHADOW_INTERNAL_NAME(sh_remove_l1_shadow,4,4), /* l2_64 */
2018 SHADOW_INTERNAL_NAME(sh_remove_l2_shadow,4,4), /* l3_64 */
2019 SHADOW_INTERNAL_NAME(sh_remove_l3_shadow,4,4), /* l4_64 */
2020 #else
2021 NULL, /* l2_64 */
2022 NULL, /* l3_64 */
2023 NULL, /* l4_64 */
2024 #endif
2025 NULL, /* p2m */
2026 NULL /* unused */
2027 };
2029 /* Another lookup table, for choosing which mask to use */
2030 static unsigned int masks[16] = {
2031 0, /* none */
2032 1 << SH_type_l2_32_shadow, /* l1_32 */
2033 0, /* fl1_32 */
2034 0, /* l2_32 */
2035 ((1 << SH_type_l2h_pae_shadow)
2036 | (1 << SH_type_l2_pae_shadow)), /* l1_pae */
2037 0, /* fl1_pae */
2038 0, /* l2_pae */
2039 0, /* l2h_pae */
2040 1 << SH_type_l2_64_shadow, /* l1_64 */
2041 0, /* fl1_64 */
2042 1 << SH_type_l3_64_shadow, /* l2_64 */
2043 1 << SH_type_l4_64_shadow, /* l3_64 */
2044 0, /* l4_64 */
2045 0, /* p2m */
2046 0 /* unused */
2047 };
2049 ASSERT(shadow_lock_is_acquired(v->domain));
2050 ASSERT(!(all && fast));
2052 pg = mfn_to_page(gmfn);
2054 /* Bail out now if the page is not shadowed */
2055 if ( (pg->count_info & PGC_page_table) == 0 )
2056 return;
2058 SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx\n",
2059 v->domain->domain_id, v->vcpu_id, mfn_x(gmfn));
2061 /* Search for this shadow in all appropriate shadows */
2062 perfc_incrc(shadow_unshadow);
2063 sh_flags = pg->shadow_flags;
2065 /* Lower-level shadows need to be excised from upper-level shadows.
2066 * This call to hash_foreach() looks dangerous but is in fact OK: each
2067 * call will remove at most one shadow, and terminate immediately when
2068 * it does remove it, so we never walk the hash after doing a deletion. */
2069 #define DO_UNSHADOW(_type) do { \
2070 t = (_type); \
2071 smfn = shadow_hash_lookup(v, mfn_x(gmfn), t); \
2072 if ( sh_type_is_pinnable(v, t) ) \
2073 sh_unpin(v, smfn); \
2074 else \
2075 sh_remove_shadow_via_pointer(v, smfn); \
2076 if ( (pg->count_info & PGC_page_table) && !fast ) \
2077 hash_foreach(v, masks[t], callbacks, smfn); \
2078 } while (0)
2080 if ( sh_flags & SHF_L1_32 ) DO_UNSHADOW(SH_type_l1_32_shadow);
2081 if ( sh_flags & SHF_L2_32 ) DO_UNSHADOW(SH_type_l2_32_shadow);
2082 #if CONFIG_PAGING_LEVELS >= 3
2083 if ( sh_flags & SHF_L1_PAE ) DO_UNSHADOW(SH_type_l1_pae_shadow);
2084 if ( sh_flags & SHF_L2_PAE ) DO_UNSHADOW(SH_type_l2_pae_shadow);
2085 if ( sh_flags & SHF_L2H_PAE ) DO_UNSHADOW(SH_type_l2h_pae_shadow);
2086 #if CONFIG_PAGING_LEVELS >= 4
2087 if ( sh_flags & SHF_L1_64 ) DO_UNSHADOW(SH_type_l1_64_shadow);
2088 if ( sh_flags & SHF_L2_64 ) DO_UNSHADOW(SH_type_l2_64_shadow);
2089 if ( sh_flags & SHF_L3_64 ) DO_UNSHADOW(SH_type_l3_64_shadow);
2090 if ( sh_flags & SHF_L4_64 ) DO_UNSHADOW(SH_type_l4_64_shadow);
2091 #endif
2092 #endif
2094 #undef DO_UNSHADOW
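/* Example of how DO_UNSHADOW and masks[] work together, using types from
 * the tables above: for an l1_pae shadow of gmfn, DO_UNSHADOW first
 * drops the single reference recorded in the shadow's up-pointer (or
 * unpins it, for pinnable types); if the page is still flagged
 * PGC_page_table and this is not a "fast" call, hash_foreach() then runs
 * with masks[SH_type_l1_pae_shadow], i.e. over all l2_pae and l2h_pae
 * shadows, calling SHADOW_INTERNAL_NAME(sh_remove_l1_shadow,3,3) on each
 * to excise any remaining references to that l1 shadow. */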
2096 /* If that didn't catch the shadows, something is wrong */
2097 if ( !fast && (pg->count_info & PGC_page_table) )
2099 SHADOW_ERROR("can't find all shadows of mfn %05lx "
2100 "(shadow_flags=%08lx)\n",
2101 mfn_x(gmfn), pg->shadow_flags);
2102 if ( all )
2103 domain_crash(v->domain);
2106 /* Need to flush TLBs now, so that linear maps are safe next time we
2107 * take a fault. */
2108 flush_tlb_mask(v->domain->domain_dirty_cpumask);
2111 void
2112 shadow_remove_all_shadows_and_parents(struct vcpu *v, mfn_t gmfn)
2113 /* Even harsher: this is an HVM page that we think is no longer a pagetable.
2114 * Unshadow it, and recursively unshadow pages that reference it. */
2116 shadow_remove_all_shadows(v, gmfn);
2117 /* XXX TODO:
2118 * Rework this hashtable walker to return a linked-list of all
2119 * the shadows it modified, then do breadth-first recursion
2120 * to find the way up to higher-level tables and unshadow them too.
2122 * The current code (just tearing down each page's shadows as we
2123 * detect that it is not a pagetable) is correct, but very slow.
2124 * It means extra emulated writes and slows down removal of mappings. */
2127 /**************************************************************************/
2129 void sh_update_paging_modes(struct vcpu *v)
2131 struct domain *d = v->domain;
2132 struct shadow_paging_mode *old_mode = v->arch.shadow.mode;
2133 mfn_t old_guest_table;
2135 ASSERT(shadow_lock_is_acquired(d));
2137 // Valid transitions handled by this function:
2138 // - For PV guests:
2139 // - after a shadow mode has been changed
2140 // - For HVM guests:
2141 // - after a shadow mode has been changed
2142 // - changes in CR0.PG, CR4.PAE, CR4.PSE, or CR4.PGE
2143 //
2145 // First, tear down any old shadow tables held by this vcpu.
2146 //
2147 shadow_detach_old_tables(v);
2149 if ( !is_hvm_domain(d) )
2151 ///
2152 /// PV guest
2153 ///
2154 #if CONFIG_PAGING_LEVELS == 4
2155 if ( pv_32bit_guest(v) )
2156 v->arch.shadow.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,4,3);
2157 else
2158 v->arch.shadow.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,4,4);
2159 #elif CONFIG_PAGING_LEVELS == 3
2160 v->arch.shadow.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,3,3);
2161 #elif CONFIG_PAGING_LEVELS == 2
2162 v->arch.shadow.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,2,2);
2163 #else
2164 #error unexpected paging mode
2165 #endif
2166 v->arch.shadow.translate_enabled = !!shadow_mode_translate(d);
2168 else
2170 ///
2171 /// HVM guest
2172 ///
2173 ASSERT(shadow_mode_translate(d));
2174 ASSERT(shadow_mode_external(d));
2176 v->arch.shadow.translate_enabled = !!hvm_paging_enabled(v);
2177 if ( !v->arch.shadow.translate_enabled )
2179 /* Set v->arch.guest_table to use the p2m map, and choose
2180 * the appropriate shadow mode */
2181 old_guest_table = pagetable_get_mfn(v->arch.guest_table);
2182 #if CONFIG_PAGING_LEVELS == 2
2183 v->arch.guest_table =
2184 pagetable_from_pfn(pagetable_get_pfn(d->arch.phys_table));
2185 v->arch.shadow.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,2,2);
2186 #elif CONFIG_PAGING_LEVELS == 3
2187 v->arch.guest_table =
2188 pagetable_from_pfn(pagetable_get_pfn(d->arch.phys_table));
2189 v->arch.shadow.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,3,3);
2190 #else /* CONFIG_PAGING_LEVELS == 4 */
2192 l4_pgentry_t *l4e;
2193 /* Use the start of the first l3 table as a PAE l3 */
2194 ASSERT(pagetable_get_pfn(d->arch.phys_table) != 0);
2195 l4e = sh_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
2196 ASSERT(l4e_get_flags(l4e[0]) & _PAGE_PRESENT);
2197 v->arch.guest_table =
2198 pagetable_from_pfn(l4e_get_pfn(l4e[0]));
2199 sh_unmap_domain_page(l4e);
2201 v->arch.shadow.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,3,3);
2202 #endif
2203 /* Fix up refcounts on guest_table */
2204 get_page(mfn_to_page(pagetable_get_mfn(v->arch.guest_table)), d);
2205 if ( mfn_x(old_guest_table) != 0 )
2206 put_page(mfn_to_page(old_guest_table));
2208 else
2210 #ifdef __x86_64__
2211 if ( hvm_long_mode_enabled(v) )
2213 // long mode guest...
2214 v->arch.shadow.mode =
2215 &SHADOW_INTERNAL_NAME(sh_paging_mode, 4, 4);
2217 else
2218 #endif
2219 if ( hvm_pae_enabled(v) )
2221 #if CONFIG_PAGING_LEVELS >= 3
2222 // 32-bit PAE mode guest...
2223 v->arch.shadow.mode =
2224 &SHADOW_INTERNAL_NAME(sh_paging_mode, 3, 3);
2225 #else
2226 SHADOW_ERROR("PAE not supported in 32-bit Xen\n");
2227 domain_crash(d);
2228 return;
2229 #endif
2231 else
2233 // 32-bit 2 level guest...
2234 #if CONFIG_PAGING_LEVELS >= 3
2235 v->arch.shadow.mode =
2236 &SHADOW_INTERNAL_NAME(sh_paging_mode, 3, 2);
2237 #else
2238 v->arch.shadow.mode =
2239 &SHADOW_INTERNAL_NAME(sh_paging_mode, 2, 2);
2240 #endif
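/* Summary of the choices above for an HVM guest with paging enabled
 * (reading SHADOW_INTERNAL_NAME(sh_paging_mode,S,G) as S shadow levels
 * for a G-level guest): long mode uses (4,4); PAE without long mode uses
 * (3,3); a plain 2-level 32-bit guest is shadowed with 3 levels when Xen
 * itself has 3 or more paging levels, and with (2,2) on a 2-level Xen. */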
2244 if ( pagetable_get_pfn(v->arch.monitor_table) == 0 )
2246 mfn_t mmfn = shadow_make_monitor_table(v);
2247 v->arch.monitor_table = pagetable_from_mfn(mmfn);
2248 v->arch.monitor_vtable = sh_map_domain_page(mmfn);
2251 if ( v->arch.shadow.mode != old_mode )
2253 SHADOW_PRINTK("new paging mode: d=%u v=%u pe=%d g=%u s=%u "
2254 "(was g=%u s=%u)\n",
2255 d->domain_id, v->vcpu_id,
2256 is_hvm_domain(d) ? !!hvm_paging_enabled(v) : 1,
2257 v->arch.shadow.mode->guest_levels,
2258 v->arch.shadow.mode->shadow_levels,
2259 old_mode ? old_mode->guest_levels : 0,
2260 old_mode ? old_mode->shadow_levels : 0);
2261 if ( old_mode &&
2262 (v->arch.shadow.mode->shadow_levels !=
2263 old_mode->shadow_levels) )
2265 /* Need to make a new monitor table for the new mode */
2266 mfn_t new_mfn, old_mfn;
2268 if ( v != current )
2270 SHADOW_ERROR("Some third party (d=%u v=%u) is changing "
2271 "this HVM vcpu's (d=%u v=%u) paging mode!\n",
2272 current->domain->domain_id, current->vcpu_id,
2273 v->domain->domain_id, v->vcpu_id);
2274 domain_crash(v->domain);
2275 return;
2278 sh_unmap_domain_page(v->arch.monitor_vtable);
2279 old_mfn = pagetable_get_mfn(v->arch.monitor_table);
2280 v->arch.monitor_table = pagetable_null();
2281 new_mfn = v->arch.shadow.mode->make_monitor_table(v);
2282 v->arch.monitor_table = pagetable_from_mfn(new_mfn);
2283 v->arch.monitor_vtable = sh_map_domain_page(new_mfn);
2284 SHADOW_PRINTK("new monitor table %"SH_PRI_mfn "\n",
2285 mfn_x(new_mfn));
2287 /* Don't be running on the old monitor table when we
2288 * pull it down! Switch CR3, and warn the HVM code that
2289 * its host cr3 has changed. */
2290 make_cr3(v, mfn_x(new_mfn));
2291 write_ptbase(v);
2292 hvm_update_host_cr3(v);
2293 old_mode->destroy_monitor_table(v, old_mfn);
2297 // XXX -- Need to deal with changes in CR4.PSE and CR4.PGE.
2298 // These are HARD: think about the case where two CPUs have
2299 // different values for CR4.PSE and CR4.PGE at the same time.
2300 // This *does* happen, at least for CR4.PGE...
2303 v->arch.shadow.mode->update_cr3(v);
2306 /**************************************************************************/
2307 /* Turning on and off shadow features */
2309 static void sh_new_mode(struct domain *d, u32 new_mode)
2310 /* Inform all the vcpus that the shadow mode has been changed */
2312 struct vcpu *v;
2314 ASSERT(shadow_lock_is_acquired(d));
2315 ASSERT(d != current->domain);
2316 d->arch.shadow.mode = new_mode;
2317 if ( new_mode & SHM2_translate )
2318 shadow_audit_p2m(d);
2319 for_each_vcpu(d, v)
2320 sh_update_paging_modes(v);
2323 int shadow_enable(struct domain *d, u32 mode)
2324 /* Turn on "permanent" shadow features: external, translate, refcount.
2325 * Can only be called once on a domain, and these features cannot be
2326 * disabled.
2327 * Returns 0 for success, -errno for failure. */
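/* A note on the mode bits, following the sanity checks enforced below:
 * SHM2_translate is only valid together with SHM2_refcounts, and
 * SHM2_external only together with SHM2_translate.  So, for example,
 * the "translate" domctl path passes SHM2_refcounts|SHM2_translate, and
 * a fully external (HVM-style) setup would add SHM2_external on top of
 * those two; SHM2_enable itself is OR'd in unconditionally just below. */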
2329 unsigned int old_pages;
2330 int rv = 0;
2332 mode |= SHM2_enable;
2334 domain_pause(d);
2335 shadow_lock(d);
2337 /* Sanity check the arguments */
2338 if ( (d == current->domain) ||
2339 shadow_mode_enabled(d) ||
2340 ((mode & SHM2_translate) && !(mode & SHM2_refcounts)) ||
2341 ((mode & SHM2_external) && !(mode & SHM2_translate)) )
2343 rv = -EINVAL;
2344 goto out;
2347 // XXX -- eventually would like to require that all memory be allocated
2348 // *after* shadow_enabled() is called... So here, we would test to make
2349 // sure that d->page_list is empty.
2350 #if 0
2351 spin_lock(&d->page_alloc_lock);
2352 if ( !list_empty(&d->page_list) )
2354 spin_unlock(&d->page_alloc_lock);
2355 rv = -EINVAL;
2356 goto out;
2358 spin_unlock(&d->page_alloc_lock);
2359 #endif
2361 /* Init the shadow memory allocation if the user hasn't done so */
2362 old_pages = d->arch.shadow.total_pages;
2363 if ( old_pages == 0 )
2364 if ( set_sh_allocation(d, 256, NULL) != 0 ) /* Use at least 1MB */
2366 set_sh_allocation(d, 0, NULL);
2367 rv = -ENOMEM;
2368 goto out;
2371 /* Init the hash table */
2372 if ( shadow_hash_alloc(d) != 0 )
2374 set_sh_allocation(d, old_pages, NULL);
2375 rv = -ENOMEM;
2376 goto out;
2379 /* Init the P2M table */
2380 if ( mode & SHM2_translate )
2381 if ( !shadow_alloc_p2m_table(d) )
2383 shadow_hash_teardown(d);
2384 set_sh_allocation(d, old_pages, NULL);
2385 shadow_p2m_teardown(d);
2386 rv = -ENOMEM;
2387 goto out;
2390 #if (SHADOW_OPTIMIZATIONS & SHOPT_LINUX_L3_TOPLEVEL)
2391 /* We assume we're dealing with an older 64bit linux guest until we
2392 * see the guest use more than one l4 per vcpu. */
2393 d->arch.shadow.opt_flags = SHOPT_LINUX_L3_TOPLEVEL;
2394 #endif
2396 /* Update the bits */
2397 sh_new_mode(d, mode);
2398 shadow_audit_p2m(d);
2399 out:
2400 shadow_unlock(d);
2401 domain_unpause(d);
2402 return rv;
2405 void shadow_teardown(struct domain *d)
2406 /* Destroy the shadow pagetables of this domain and free its shadow memory.
2407 * Should only be called for dying domains. */
2409 struct vcpu *v;
2410 mfn_t mfn;
2412 ASSERT(test_bit(_DOMF_dying, &d->domain_flags));
2413 ASSERT(d != current->domain);
2415 if ( !shadow_lock_is_acquired(d) )
2416 shadow_lock(d); /* Keep various asserts happy */
2418 if ( shadow_mode_enabled(d) )
2420 /* Release the shadow and monitor tables held by each vcpu */
2421 for_each_vcpu(d, v)
2423 shadow_detach_old_tables(v);
2424 if ( shadow_mode_external(d) )
2426 mfn = pagetable_get_mfn(v->arch.monitor_table);
2427 if ( valid_mfn(mfn) && (mfn_x(mfn) != 0) )
2428 shadow_destroy_monitor_table(v, mfn);
2429 v->arch.monitor_table = pagetable_null();
2434 if ( d->arch.shadow.total_pages != 0 )
2436 SHADOW_PRINTK("teardown of domain %u starts."
2437 " Shadow pages total = %u, free = %u, p2m=%u\n",
2438 d->domain_id,
2439 d->arch.shadow.total_pages,
2440 d->arch.shadow.free_pages,
2441 d->arch.shadow.p2m_pages);
2442 /* Destroy all the shadows and release memory to domheap */
2443 set_sh_allocation(d, 0, NULL);
2444 /* Release the hash table back to xenheap */
2445 if (d->arch.shadow.hash_table)
2446 shadow_hash_teardown(d);
2447 /* Release the log-dirty bitmap of dirtied pages */
2448 sh_free_log_dirty_bitmap(d);
2449 /* Should not have any more memory held */
2450 SHADOW_PRINTK("teardown done."
2451 " Shadow pages total = %u, free = %u, p2m=%u\n",
2452 d->arch.shadow.total_pages,
2453 d->arch.shadow.free_pages,
2454 d->arch.shadow.p2m_pages);
2455 ASSERT(d->arch.shadow.total_pages == 0);
2458 /* We leave the "permanent" shadow modes enabled, but clear the
2459 * log-dirty mode bit. We don't want any more mark_dirty()
2460 * calls now that we've torn down the bitmap */
2461 d->arch.shadow.mode &= ~SHM2_log_dirty;
2463 shadow_unlock(d);
2466 void shadow_final_teardown(struct domain *d)
2467 /* Called by arch_domain_destroy(), when it's safe to pull down the p2m map. */
2470 SHADOW_PRINTK("dom %u final teardown starts."
2471 " Shadow pages total = %u, free = %u, p2m=%u\n",
2472 d->domain_id,
2473 d->arch.shadow.total_pages,
2474 d->arch.shadow.free_pages,
2475 d->arch.shadow.p2m_pages);
2477 /* Double-check that the domain didn't have any shadow memory.
2478 * It is possible for a domain that never got domain_kill()ed
2479 * to get here with its shadow allocation intact. */
2480 if ( d->arch.shadow.total_pages != 0 )
2481 shadow_teardown(d);
2483 /* It is now safe to pull down the p2m map. */
2484 if ( d->arch.shadow.p2m_pages != 0 )
2485 shadow_p2m_teardown(d);
2487 SHADOW_PRINTK("dom %u final teardown done."
2488 " Shadow pages total = %u, free = %u, p2m=%u\n",
2489 d->domain_id,
2490 d->arch.shadow.total_pages,
2491 d->arch.shadow.free_pages,
2492 d->arch.shadow.p2m_pages);
2495 static int shadow_one_bit_enable(struct domain *d, u32 mode)
2496 /* Turn on a single shadow mode feature */
2498 ASSERT(shadow_lock_is_acquired(d));
2500 /* Sanity check the call */
2501 if ( d == current->domain || (d->arch.shadow.mode & mode) )
2503 return -EINVAL;
2506 if ( d->arch.shadow.mode == 0 )
2508 /* Init the shadow memory allocation and the hash table */
2509 if ( set_sh_allocation(d, 1, NULL) != 0
2510 || shadow_hash_alloc(d) != 0 )
2512 set_sh_allocation(d, 0, NULL);
2513 return -ENOMEM;
2517 /* Update the bits */
2518 sh_new_mode(d, d->arch.shadow.mode | mode);
2520 return 0;
2523 static int shadow_one_bit_disable(struct domain *d, u32 mode)
2524 /* Turn off a single shadow mode feature */
2526 struct vcpu *v;
2527 ASSERT(shadow_lock_is_acquired(d));
2529 /* Sanity check the call */
2530 if ( d == current->domain || !(d->arch.shadow.mode & mode) )
2532 return -EINVAL;
2535 /* Update the bits */
2536 sh_new_mode(d, d->arch.shadow.mode & ~mode);
2537 if ( d->arch.shadow.mode == 0 )
2539 /* Get this domain off shadows */
2540 SHADOW_PRINTK("un-shadowing of domain %u starts."
2541 " Shadow pages total = %u, free = %u, p2m=%u\n",
2542 d->domain_id,
2543 d->arch.shadow.total_pages,
2544 d->arch.shadow.free_pages,
2545 d->arch.shadow.p2m_pages);
2546 for_each_vcpu(d, v)
2548 shadow_detach_old_tables(v);
2549 #if CONFIG_PAGING_LEVELS == 4
2550 if ( !(v->arch.flags & TF_kernel_mode) )
2551 make_cr3(v, pagetable_get_pfn(v->arch.guest_table_user));
2552 else
2553 #endif
2554 make_cr3(v, pagetable_get_pfn(v->arch.guest_table));
2558 /* Pull down the memory allocation */
2559 if ( set_sh_allocation(d, 0, NULL) != 0 )
2561 // XXX - How can this occur?
2562 // Seems like a bug to return an error now that we've
2563 // disabled the relevant shadow mode.
2564 //
2565 return -ENOMEM;
2567 shadow_hash_teardown(d);
2568 SHADOW_PRINTK("un-shadowing of domain %u done."
2569 " Shadow pages total = %u, free = %u, p2m=%u\n",
2570 d->domain_id,
2571 d->arch.shadow.total_pages,
2572 d->arch.shadow.free_pages,
2573 d->arch.shadow.p2m_pages);
2576 return 0;
2579 /* Enable/disable ops for the "test" and "log-dirty" modes */
2580 int shadow_test_enable(struct domain *d)
2582 int ret;
2584 domain_pause(d);
2585 shadow_lock(d);
2587 if ( shadow_mode_enabled(d) )
2589 SHADOW_ERROR("Don't support enabling test mode"
2590 " on already shadowed doms\n");
2591 ret = -EINVAL;
2592 goto out;
2595 ret = shadow_one_bit_enable(d, SHM2_enable);
2596 out:
2597 shadow_unlock(d);
2598 domain_unpause(d);
2600 return ret;
2603 int shadow_test_disable(struct domain *d)
2605 int ret;
2607 domain_pause(d);
2608 shadow_lock(d);
2609 ret = shadow_one_bit_disable(d, SHM2_enable);
2610 shadow_unlock(d);
2611 domain_unpause(d);
2613 return ret;
2616 static int
2617 sh_alloc_log_dirty_bitmap(struct domain *d)
2619 ASSERT(d->arch.shadow.dirty_bitmap == NULL);
2620 d->arch.shadow.dirty_bitmap_size =
2621 (d->shared_info->arch.max_pfn + (BITS_PER_LONG - 1)) &
2622 ~(BITS_PER_LONG - 1);
2623 d->arch.shadow.dirty_bitmap =
2624 xmalloc_array(unsigned long,
2625 d->arch.shadow.dirty_bitmap_size / BITS_PER_LONG);
2626 if ( d->arch.shadow.dirty_bitmap == NULL )
2628 d->arch.shadow.dirty_bitmap_size = 0;
2629 return -ENOMEM;
2631 memset(d->arch.shadow.dirty_bitmap, 0, d->arch.shadow.dirty_bitmap_size/8);
2633 return 0;
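/* Worked example of the sizing above, assuming BITS_PER_LONG == 64: with
 * max_pfn = 100000 the bitmap is rounded up to 100032 bits, so
 * dirty_bitmap_size = 100032, xmalloc_array() allocates 100032/64 = 1563
 * unsigned longs, and the memset() clears 100032/8 = 12504 bytes.
 * Rounding up to whole longs keeps both the bit operations in
 * sh_do_mark_dirty() and the chunked copy in shadow_log_dirty_op()
 * inside the allocation. */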
2636 static void
2637 sh_free_log_dirty_bitmap(struct domain *d)
2639 d->arch.shadow.dirty_bitmap_size = 0;
2640 if ( d->arch.shadow.dirty_bitmap )
2642 xfree(d->arch.shadow.dirty_bitmap);
2643 d->arch.shadow.dirty_bitmap = NULL;
2647 static int shadow_log_dirty_enable(struct domain *d)
2649 int ret;
2651 domain_pause(d);
2652 shadow_lock(d);
2654 if ( shadow_mode_log_dirty(d) )
2656 ret = -EINVAL;
2657 goto out;
2660 if ( shadow_mode_enabled(d) )
2662 SHADOW_ERROR("Don't (yet) support enabling log-dirty"
2663 " on already shadowed doms\n");
2664 ret = -EINVAL;
2665 goto out;
2668 ret = sh_alloc_log_dirty_bitmap(d);
2669 if ( ret != 0 )
2671 sh_free_log_dirty_bitmap(d);
2672 goto out;
2675 ret = shadow_one_bit_enable(d, SHM2_log_dirty);
2676 if ( ret != 0 )
2677 sh_free_log_dirty_bitmap(d);
2679 out:
2680 shadow_unlock(d);
2681 domain_unpause(d);
2682 return ret;
2685 static int shadow_log_dirty_disable(struct domain *d)
2687 int ret;
2689 domain_pause(d);
2690 shadow_lock(d);
2691 ret = shadow_one_bit_disable(d, SHM2_log_dirty);
2692 if ( !shadow_mode_log_dirty(d) )
2693 sh_free_log_dirty_bitmap(d);
2694 shadow_unlock(d);
2695 domain_unpause(d);
2697 return ret;
2700 /**************************************************************************/
2701 /* P2M map manipulations */
2703 static void
2704 sh_p2m_remove_page(struct domain *d, unsigned long gfn, unsigned long mfn)
2706 struct vcpu *v;
2708 if ( !shadow_mode_translate(d) )
2709 return;
2711 v = current;
2712 if ( v->domain != d )
2713 v = d->vcpu[0];
2715 SHADOW_DEBUG(P2M, "removing gfn=%#lx mfn=%#lx\n", gfn, mfn);
2717 ASSERT(mfn_x(sh_gfn_to_mfn(d, gfn)) == mfn);
2718 //ASSERT(sh_mfn_to_gfn(d, mfn) == gfn);
2720 if ( v != NULL )
2722 shadow_remove_all_shadows_and_parents(v, _mfn(mfn));
2723 if ( shadow_remove_all_mappings(v, _mfn(mfn)) )
2724 flush_tlb_mask(d->domain_dirty_cpumask);
2727 shadow_set_p2m_entry(d, gfn, _mfn(INVALID_MFN));
2728 set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY);
2731 void
2732 shadow_guest_physmap_remove_page(struct domain *d, unsigned long gfn,
2733 unsigned long mfn)
2735 shadow_lock(d);
2736 shadow_audit_p2m(d);
2737 sh_p2m_remove_page(d, gfn, mfn);
2738 shadow_audit_p2m(d);
2739 shadow_unlock(d);
2742 void
2743 shadow_guest_physmap_add_page(struct domain *d, unsigned long gfn,
2744 unsigned long mfn)
2746 unsigned long ogfn;
2747 mfn_t omfn;
2749 if ( !shadow_mode_translate(d) )
2750 return;
2752 shadow_lock(d);
2753 shadow_audit_p2m(d);
2755 SHADOW_DEBUG(P2M, "adding gfn=%#lx mfn=%#lx\n", gfn, mfn);
2757 omfn = sh_gfn_to_mfn(d, gfn);
2758 if ( valid_mfn(omfn) )
2760 /* Get rid of the old mapping, especially any shadows */
2761 struct vcpu *v = current;
2762 if ( v->domain != d )
2763 v = d->vcpu[0];
2764 if ( v != NULL )
2766 shadow_remove_all_shadows_and_parents(v, omfn);
2767 if ( shadow_remove_all_mappings(v, omfn) )
2768 flush_tlb_mask(d->domain_dirty_cpumask);
2770 set_gpfn_from_mfn(mfn_x(omfn), INVALID_M2P_ENTRY);
2773 ogfn = sh_mfn_to_gfn(d, _mfn(mfn));
2774 if (
2775 #ifdef __x86_64__
2776 (ogfn != 0x5555555555555555L)
2777 #else
2778 (ogfn != 0x55555555L)
2779 #endif
2780 && (ogfn != INVALID_M2P_ENTRY)
2781 && (ogfn != gfn) )
2783 /* This machine frame is already mapped at another physical address */
2784 SHADOW_DEBUG(P2M, "aliased! mfn=%#lx, old gfn=%#lx, new gfn=%#lx\n",
2785 mfn, ogfn, gfn);
2786 if ( valid_mfn(omfn = sh_gfn_to_mfn(d, ogfn)) )
2788 SHADOW_DEBUG(P2M, "old gfn=%#lx -> mfn %#lx\n",
2789 ogfn , mfn_x(omfn));
2790 if ( mfn_x(omfn) == mfn )
2791 sh_p2m_remove_page(d, ogfn, mfn);
2795 shadow_set_p2m_entry(d, gfn, _mfn(mfn));
2796 set_gpfn_from_mfn(mfn, gfn);
2797 shadow_audit_p2m(d);
2798 shadow_unlock(d);
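/* Illustrative walk-through of the add path above: suppose mfn M is
 * currently mapped at gfn G1 and is now being mapped at gfn G2.  First,
 * whatever frame previously backed G2 has its shadows and mappings torn
 * down and its m2p entry invalidated.  Next, the m2p lookup for M
 * returns G1, and if p2m(G1) still points at M that stale entry is
 * removed via sh_p2m_remove_page().  Finally p2m(G2) is set to M and
 * m2p(M) to G2, so the two maps agree again. */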
2801 /**************************************************************************/
2802 /* Log-dirty mode support */
2804 /* Convert a shadow to log-dirty mode. */
2805 void shadow_convert_to_log_dirty(struct vcpu *v, mfn_t smfn)
2807 BUG();
2811 /* Read a domain's log-dirty bitmap and stats.
2812 * If the operation is a CLEAN, clear the bitmap and stats as well. */
2813 static int shadow_log_dirty_op(
2814 struct domain *d, struct xen_domctl_shadow_op *sc)
2816 int i, rv = 0, clean = 0;
2818 domain_pause(d);
2819 shadow_lock(d);
2821 clean = (sc->op == XEN_DOMCTL_SHADOW_OP_CLEAN);
2823 SHADOW_DEBUG(LOGDIRTY, "log-dirty %s: dom %u faults=%u dirty=%u\n",
2824 (clean) ? "clean" : "peek",
2825 d->domain_id,
2826 d->arch.shadow.fault_count,
2827 d->arch.shadow.dirty_count);
2829 sc->stats.fault_count = d->arch.shadow.fault_count;
2830 sc->stats.dirty_count = d->arch.shadow.dirty_count;
2832 if ( clean )
2834 /* Need to revoke write access to the domain's pages again.
2835 * In future, we'll have a less heavy-handed approach to this,
2836 * but for now, we just unshadow everything except Xen. */
2837 shadow_blow_tables(d);
2839 d->arch.shadow.fault_count = 0;
2840 d->arch.shadow.dirty_count = 0;
2843 if ( guest_handle_is_null(sc->dirty_bitmap) ||
2844 (d->arch.shadow.dirty_bitmap == NULL) )
2846 rv = -EINVAL;
2847 goto out;
2850 if ( sc->pages > d->arch.shadow.dirty_bitmap_size )
2851 sc->pages = d->arch.shadow.dirty_bitmap_size;
2853 #define CHUNK (8*1024) /* Transfer and clear in 1kB chunks for L1 cache. */
2854 for ( i = 0; i < sc->pages; i += CHUNK )
2856 int bytes = ((((sc->pages - i) > CHUNK)
2857 ? CHUNK
2858 : (sc->pages - i)) + 7) / 8;
2860 if ( copy_to_guest_offset(
2861 sc->dirty_bitmap,
2862 i/(8*sizeof(unsigned long)),
2863 d->arch.shadow.dirty_bitmap + (i/(8*sizeof(unsigned long))),
2864 (bytes + sizeof(unsigned long) - 1) / sizeof(unsigned long)) )
2866 rv = -EINVAL;
2867 goto out;
2870 if ( clean )
2871 memset(d->arch.shadow.dirty_bitmap + (i/(8*sizeof(unsigned long))),
2872 0, bytes);
2874 #undef CHUNK
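/* Worked example of the chunking above, assuming 64-bit longs: with
 * sc->pages = 20000 the loop runs for i = 0, 8192 and 16384.  The first
 * two iterations copy 1024 bytes (128 longs) each, at long offsets 0 and
 * 128; the final iteration covers 3616 bits, so bytes = 452, rounded up
 * to 57 longs.  Copying whole longs may include a few trailing bits past
 * sc->pages, which stays within the source bitmap because it is
 * allocated and cleared in whole longs. */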
2876 out:
2877 shadow_unlock(d);
2878 domain_unpause(d);
2879 return rv;
2883 /* Mark a page as dirty */
2884 void sh_do_mark_dirty(struct domain *d, mfn_t gmfn)
2886 unsigned long pfn;
2888 ASSERT(shadow_lock_is_acquired(d));
2889 ASSERT(shadow_mode_log_dirty(d));
2891 if ( !valid_mfn(gmfn) )
2892 return;
2894 ASSERT(d->arch.shadow.dirty_bitmap != NULL);
2896 /* We /really/ mean PFN here, even for non-translated guests. */
2897 pfn = get_gpfn_from_mfn(mfn_x(gmfn));
2899 /*
2900 * Values with the MSB set denote MFNs that aren't really part of the
2901 * domain's pseudo-physical memory map (e.g., the shared info frame).
2902 * Nothing to do here...
2903 */
2904 if ( unlikely(!VALID_M2P(pfn)) )
2905 return;
2907 /* N.B. Can use non-atomic TAS because protected by shadow_lock. */
2908 if ( likely(pfn < d->arch.shadow.dirty_bitmap_size) )
2910 if ( !__test_and_set_bit(pfn, d->arch.shadow.dirty_bitmap) )
2912 SHADOW_DEBUG(LOGDIRTY,
2913 "marked mfn %" SH_PRI_mfn " (pfn=%lx), dom %d\n",
2914 mfn_x(gmfn), pfn, d->domain_id);
2915 d->arch.shadow.dirty_count++;
2918 else
2920 SHADOW_PRINTK("mark_dirty OOR! "
2921 "mfn=%" SH_PRI_mfn " pfn=%lx max=%x (dom %d)\n"
2922 "owner=%d c=%08x t=%" PRtype_info "\n",
2923 mfn_x(gmfn),
2924 pfn,
2925 d->arch.shadow.dirty_bitmap_size,
2926 d->domain_id,
2927 (page_get_owner(mfn_to_page(gmfn))
2928 ? page_get_owner(mfn_to_page(gmfn))->domain_id
2929 : -1),
2930 mfn_to_page(gmfn)->count_info,
2931 mfn_to_page(gmfn)->u.inuse.type_info);
2936 /**************************************************************************/
2937 /* Shadow-control XEN_DOMCTL dispatcher */
2939 int shadow_domctl(struct domain *d,
2940 xen_domctl_shadow_op_t *sc,
2941 XEN_GUEST_HANDLE(xen_domctl_t) u_domctl)
2943 int rc, preempted = 0;
2945 if ( unlikely(d == current->domain) )
2947 gdprintk(XENLOG_INFO, "Don't try to do a shadow op on yourself!\n");
2948 return -EINVAL;
2951 switch ( sc->op )
2953 case XEN_DOMCTL_SHADOW_OP_OFF:
2954 if ( shadow_mode_log_dirty(d) )
2955 if ( (rc = shadow_log_dirty_disable(d)) != 0 )
2956 return rc;
2957 if ( is_hvm_domain(d) )
2958 return -EINVAL;
2959 if ( d->arch.shadow.mode & SHM2_enable )
2960 if ( (rc = shadow_test_disable(d)) != 0 )
2961 return rc;
2962 return 0;
2964 case XEN_DOMCTL_SHADOW_OP_ENABLE_TEST:
2965 return shadow_test_enable(d);
2967 case XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY:
2968 return shadow_log_dirty_enable(d);
2970 case XEN_DOMCTL_SHADOW_OP_ENABLE_TRANSLATE:
2971 return shadow_enable(d, SHM2_refcounts|SHM2_translate);
2973 case XEN_DOMCTL_SHADOW_OP_CLEAN:
2974 case XEN_DOMCTL_SHADOW_OP_PEEK:
2975 return shadow_log_dirty_op(d, sc);
2977 case XEN_DOMCTL_SHADOW_OP_ENABLE:
2978 if ( sc->mode & XEN_DOMCTL_SHADOW_ENABLE_LOG_DIRTY )
2979 return shadow_log_dirty_enable(d);
2980 return shadow_enable(d, sc->mode << SHM2_shift);
2982 case XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION:
2983 sc->mb = shadow_get_allocation(d);
2984 return 0;
2986 case XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION:
2987 rc = shadow_set_allocation(d, sc->mb, &preempted);
2988 if ( preempted )
2989 /* Not finished. Set up to re-run the call. */
2990 rc = hypercall_create_continuation(
2991 __HYPERVISOR_domctl, "h", u_domctl);
2992 else
2993 /* Finished. Return the new allocation */
2994 sc->mb = shadow_get_allocation(d);
2995 return rc;
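/* Note on the continuation above: if shadow_set_allocation() is
 * preempted it stops part-way through resizing the pool, and
 * hypercall_create_continuation() arranges for the same domctl to be
 * re-issued, so the remaining pages are handled on the next pass; the
 * caller just sees the hypercall take several trips through this case
 * before sc->mb is finally reported back. */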
2997 default:
2998 SHADOW_ERROR("Bad shadow op %u\n", sc->op);
2999 return -EINVAL;
3004 /**************************************************************************/
3005 /* Auditing shadow tables */
3007 #if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_FULL
3009 void shadow_audit_tables(struct vcpu *v)
3011 /* Dispatch table for getting per-type functions */
3012 static hash_callback_t callbacks[16] = {
3013 NULL, /* none */
3014 #if CONFIG_PAGING_LEVELS == 2
3015 SHADOW_INTERNAL_NAME(sh_audit_l1_table,2,2), /* l1_32 */
3016 SHADOW_INTERNAL_NAME(sh_audit_fl1_table,2,2), /* fl1_32 */
3017 SHADOW_INTERNAL_NAME(sh_audit_l2_table,2,2), /* l2_32 */
3018 #else
3019 SHADOW_INTERNAL_NAME(sh_audit_l1_table,3,2), /* l1_32 */
3020 SHADOW_INTERNAL_NAME(sh_audit_fl1_table,3,2), /* fl1_32 */
3021 SHADOW_INTERNAL_NAME(sh_audit_l2_table,3,2), /* l2_32 */
3022 SHADOW_INTERNAL_NAME(sh_audit_l1_table,3,3), /* l1_pae */
3023 SHADOW_INTERNAL_NAME(sh_audit_fl1_table,3,3), /* fl1_pae */
3024 SHADOW_INTERNAL_NAME(sh_audit_l2_table,3,3), /* l2_pae */
3025 SHADOW_INTERNAL_NAME(sh_audit_l2_table,3,3), /* l2h_pae */
3026 #if CONFIG_PAGING_LEVELS >= 4
3027 SHADOW_INTERNAL_NAME(sh_audit_l1_table,4,4), /* l1_64 */
3028 SHADOW_INTERNAL_NAME(sh_audit_fl1_table,4,4), /* fl1_64 */
3029 SHADOW_INTERNAL_NAME(sh_audit_l2_table,4,4), /* l2_64 */
3030 SHADOW_INTERNAL_NAME(sh_audit_l3_table,4,4), /* l3_64 */
3031 SHADOW_INTERNAL_NAME(sh_audit_l4_table,4,4), /* l4_64 */
3032 #endif /* CONFIG_PAGING_LEVELS >= 4 */
3033 #endif /* CONFIG_PAGING_LEVELS > 2 */
3034 NULL /* All the rest */
3035 };
3036 unsigned int mask;
3038 if ( !(SHADOW_AUDIT_ENABLE) )
3039 return;
3041 if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_FULL )
3042 mask = ~1; /* Audit every table in the system */
3043 else
3045 /* Audit only the current mode's tables */
3046 switch ( v->arch.shadow.mode->guest_levels )
3048 case 2: mask = (SHF_L1_32|SHF_FL1_32|SHF_L2_32); break;
3049 case 3: mask = (SHF_L1_PAE|SHF_FL1_PAE|SHF_L2_PAE
3050 |SHF_L2H_PAE); break;
3051 case 4: mask = (SHF_L1_64|SHF_FL1_64|SHF_L2_64
3052 |SHF_L3_64|SHF_L4_64); break;
3053 default: BUG();
3057 hash_foreach(v, ~1, callbacks, _mfn(INVALID_MFN));
3060 #endif /* Shadow audit */
3063 /**************************************************************************/
3064 /* Auditing p2m tables */
3066 #if SHADOW_AUDIT & SHADOW_AUDIT_P2M
3068 void shadow_audit_p2m(struct domain *d)
3070 struct list_head *entry;
3071 struct page_info *page;
3072 struct domain *od;
3073 unsigned long mfn, gfn, m2pfn, lp2mfn = 0;
3074 mfn_t p2mfn;
3075 unsigned long orphans_d = 0, orphans_i = 0, mpbad = 0, pmbad = 0;
3076 int test_linear;
3078 if ( !(SHADOW_AUDIT_ENABLE) || !shadow_mode_translate(d) )
3079 return;
3081 //SHADOW_PRINTK("p2m audit starts\n");
3083 test_linear = ( (d == current->domain) && current->arch.monitor_vtable );
3084 if ( test_linear )
3085 local_flush_tlb();
3087 /* Audit part one: walk the domain's page allocation list, checking
3088 * the m2p entries. */
3089 for ( entry = d->page_list.next;
3090 entry != &d->page_list;
3091 entry = entry->next )
3093 page = list_entry(entry, struct page_info, list);
3094 mfn = mfn_x(page_to_mfn(page));
3096 // SHADOW_PRINTK("auditing guest page, mfn=%#lx\n", mfn);
3098 od = page_get_owner(page);
3100 if ( od != d )
3102 SHADOW_PRINTK("wrong owner %#lx -> %p(%u) != %p(%u)\n",
3103 mfn, od, (od?od->domain_id:-1), d, d->domain_id);
3104 continue;
3107 gfn = get_gpfn_from_mfn(mfn);
3108 if ( gfn == INVALID_M2P_ENTRY )
3110 orphans_i++;
3111 //SHADOW_PRINTK("orphaned guest page: mfn=%#lx has invalid gfn\n",
3112 // mfn);
3113 continue;
3116 if ( gfn == 0x55555555 )
3118 orphans_d++;
3119 //SHADOW_PRINTK("orphaned guest page: mfn=%#lx has debug gfn\n",
3120 // mfn);
3121 continue;
3124 p2mfn = sh_gfn_to_mfn_foreign(d, gfn);
3125 if ( mfn_x(p2mfn) != mfn )
3127 mpbad++;
3128 SHADOW_PRINTK("map mismatch mfn %#lx -> gfn %#lx -> mfn %#lx"
3129 " (-> gfn %#lx)\n",
3130 mfn, gfn, mfn_x(p2mfn),
3131 (mfn_valid(p2mfn)
3132 ? get_gpfn_from_mfn(mfn_x(p2mfn))
3133 : -1u));
3134 /* This m2p entry is stale: the domain has another frame in
3135 * this physical slot. No great disaster, but for neatness,
3136 * blow away the m2p entry. */
3137 set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY);
3140 if ( test_linear && (gfn <= d->arch.max_mapped_pfn) )
3142 lp2mfn = gfn_to_mfn_current(gfn);
3143 if ( mfn_x(lp2mfn) != mfn_x(p2mfn) )
3145 SHADOW_PRINTK("linear mismatch gfn %#lx -> mfn %#lx "
3146 "(!= mfn %#lx)\n", gfn,
3147 mfn_x(lp2mfn), mfn_x(p2mfn));
3151 // SHADOW_PRINTK("OK: mfn=%#lx, gfn=%#lx, p2mfn=%#lx, lp2mfn=%#lx\n",
3152 // mfn, gfn, p2mfn, lp2mfn);
3155 /* Audit part two: walk the domain's p2m table, checking the entries. */
3156 if ( pagetable_get_pfn(d->arch.phys_table) != 0 )
3158 l2_pgentry_t *l2e;
3159 l1_pgentry_t *l1e;
3160 int i1, i2;
3162 #if CONFIG_PAGING_LEVELS == 4
3163 l4_pgentry_t *l4e;
3164 l3_pgentry_t *l3e;
3165 int i3, i4;
3166 l4e = sh_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
3167 #elif CONFIG_PAGING_LEVELS == 3
3168 l3_pgentry_t *l3e;
3169 int i3;
3170 l3e = sh_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
3171 #else /* CONFIG_PAGING_LEVELS == 2 */
3172 l2e = sh_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
3173 #endif
3175 gfn = 0;
3176 #if CONFIG_PAGING_LEVELS >= 3
3177 #if CONFIG_PAGING_LEVELS >= 4
3178 for ( i4 = 0; i4 < L4_PAGETABLE_ENTRIES; i4++ )
3180 if ( !(l4e_get_flags(l4e[i4]) & _PAGE_PRESENT) )
3182 gfn += 1 << (L4_PAGETABLE_SHIFT - PAGE_SHIFT);
3183 continue;
3185 l3e = sh_map_domain_page(_mfn(l4e_get_pfn(l4e[i4])));
3186 #endif /* now at levels 3 or 4... */
3187 for ( i3 = 0;
3188 i3 < ((CONFIG_PAGING_LEVELS==4) ? L3_PAGETABLE_ENTRIES : 8);
3189 i3++ )
3191 if ( !(l3e_get_flags(l3e[i3]) & _PAGE_PRESENT) )
3193 gfn += 1 << (L3_PAGETABLE_SHIFT - PAGE_SHIFT);
3194 continue;
3196 l2e = sh_map_domain_page(_mfn(l3e_get_pfn(l3e[i3])));
3197 #endif /* all levels... */
3198 for ( i2 = 0; i2 < L2_PAGETABLE_ENTRIES; i2++ )
3200 if ( !(l2e_get_flags(l2e[i2]) & _PAGE_PRESENT) )
3202 gfn += 1 << (L2_PAGETABLE_SHIFT - PAGE_SHIFT);
3203 continue;
3205 l1e = sh_map_domain_page(_mfn(l2e_get_pfn(l2e[i2])));
3207 for ( i1 = 0; i1 < L1_PAGETABLE_ENTRIES; i1++, gfn++ )
3209 if ( !(l1e_get_flags(l1e[i1]) & _PAGE_PRESENT) )
3210 continue;
3211 mfn = l1e_get_pfn(l1e[i1]);
3212 ASSERT(valid_mfn(_mfn(mfn)));
3213 m2pfn = get_gpfn_from_mfn(mfn);
3214 if ( m2pfn != gfn )
3216 pmbad++;
3217 SHADOW_PRINTK("mismatch: gfn %#lx -> mfn %#lx"
3218 " -> gfn %#lx\n", gfn, mfn, m2pfn);
3219 BUG();
3222 sh_unmap_domain_page(l1e);
3224 #if CONFIG_PAGING_LEVELS >= 3
3225 sh_unmap_domain_page(l2e);
3227 #if CONFIG_PAGING_LEVELS >= 4
3228 sh_unmap_domain_page(l3e);
3230 #endif
3231 #endif
3233 #if CONFIG_PAGING_LEVELS == 4
3234 sh_unmap_domain_page(l4e);
3235 #elif CONFIG_PAGING_LEVELS == 3
3236 sh_unmap_domain_page(l3e);
3237 #else /* CONFIG_PAGING_LEVELS == 2 */
3238 sh_unmap_domain_page(l2e);
3239 #endif
3243 //SHADOW_PRINTK("p2m audit complete\n");
3244 //if ( orphans_i | orphans_d | mpbad | pmbad )
3245 // SHADOW_PRINTK("p2m audit found %lu orphans (%lu inval %lu debug)\n",
3246 // orphans_i + orphans_d, orphans_i, orphans_d,
3247 if ( mpbad | pmbad )
3248 SHADOW_PRINTK("p2m audit found %lu odd p2m, %lu bad m2p entries\n",
3249 pmbad, mpbad);
3252 #endif /* p2m audit */
3254 /*
3255 * Local variables:
3256 * mode: C
3257 * c-set-style: "BSD"
3258 * c-basic-offset: 4
3259 * indent-tabs-mode: nil
3260 * End:
3261 */