xen/arch/x86/mm/shadow/common.c @ 12717:697b0203e68f
author: Tim Deegan <Tim.Deegan@xensource.com>
date:   Fri Dec 01 09:28:14 2006 +0000

[XEN] Fix error paths in p2m insertion code
Signed-off-by: Tim Deegan <Tim.Deegan@xensource.com>

line source
1 /******************************************************************************
2 * arch/x86/mm/shadow/common.c
3 *
4 * Shadow code that does not need to be multiply compiled.
5 * Parts of this code are Copyright (c) 2006 by XenSource Inc.
6 * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
7 * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 */
24 #include <xen/config.h>
25 #include <xen/types.h>
26 #include <xen/mm.h>
27 #include <xen/trace.h>
28 #include <xen/sched.h>
29 #include <xen/perfc.h>
30 #include <xen/irq.h>
31 #include <xen/domain_page.h>
32 #include <xen/guest_access.h>
33 #include <xen/keyhandler.h>
34 #include <asm/event.h>
35 #include <asm/page.h>
36 #include <asm/current.h>
37 #include <asm/flushtlb.h>
38 #include <asm/shadow.h>
39 #include "private.h"
41 #if SHADOW_AUDIT
42 int shadow_audit_enable = 0;
44 static void shadow_audit_key(unsigned char key)
45 {
46 shadow_audit_enable = !shadow_audit_enable;
47 printk("%s shadow_audit_enable=%d\n",
48 __func__, shadow_audit_enable);
49 }
51 static int __init shadow_audit_key_init(void)
52 {
53 register_keyhandler(
54 'O', shadow_audit_key, "toggle shadow audits");
55 return 0;
56 }
57 __initcall(shadow_audit_key_init);
58 #endif /* SHADOW_AUDIT */
60 static void sh_free_log_dirty_bitmap(struct domain *d);
62 int _shadow_mode_refcounts(struct domain *d)
63 {
64 return shadow_mode_refcounts(d);
65 }
68 /**************************************************************************/
69 /* x86 emulator support for the shadow code
70 */
72 static int hvm_translate_linear_addr(
73 enum x86_segment seg,
74 unsigned long offset,
75 unsigned int bytes,
76 unsigned int is_write,
77 unsigned long *paddr)
78 {
79 struct segment_register creg, dreg;
80 unsigned long limit, addr = offset;
81 uint32_t last_byte;
83 hvm_get_segment_register(current, x86_seg_cs, &creg);
84 hvm_get_segment_register(current, seg, &dreg);
86 if ( !creg.attr.fields.l || !hvm_long_mode_enabled(current) )
87 {
88 /*
89 * COMPATIBILITY MODE: Apply segment checks and add base.
90 */
92 /* If this is a store, is the segment a writable data segment? */
93 if ( is_write && ((dreg.attr.fields.type & 0xa) != 0x2) )
94 goto gpf;
96 /* Calculate the segment limit, including granularity flag. */
97 limit = dreg.limit;
98 if ( dreg.attr.fields.g )
99 limit = (limit << 12) | 0xfff;
101 last_byte = offset + bytes - 1;
103 /* Is this a grows-down data segment? Special limit check if so. */
104 if ( (dreg.attr.fields.type & 0xc) == 0x4 )
105 {
106 /* Is upper limit 0xFFFF or 0xFFFFFFFF? */
107 if ( !dreg.attr.fields.db )
108 last_byte = (uint16_t)last_byte;
110 /* Check first byte and last byte against respective bounds. */
111 if ( (offset <= limit) || (last_byte < offset) )
112 goto gpf;
113 }
114 else if ( (last_byte > limit) || (last_byte < offset) )
115 goto gpf; /* last byte is beyond limit or wraps 0xFFFFFFFF */
117 /*
118 * Hardware truncates to 32 bits in compatibility mode.
119 * It does not truncate to 16 bits in 16-bit address-size mode.
120 */
121 addr = (uint32_t)(addr + dreg.base);
122 }
123 else if ( (seg == x86_seg_fs) || (seg == x86_seg_gs) )
124 {
125 /*
126 * LONG MODE: FS and GS add a segment base.
127 */
128 addr += dreg.base;
129 }
131 *paddr = addr;
132 return 0;
134 gpf:
135 /* Inject #GP(0). */
136 hvm_inject_exception(TRAP_gp_fault, 0);
137 return X86EMUL_PROPAGATE_FAULT;
138 }
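/*
 * Standalone sketch (not part of this file) of the compatibility-mode
 * segment limit check performed by hvm_translate_linear_addr() above:
 * scale the limit by the granularity bit, handle grows-down data
 * segments, and reject accesses that exceed the limit or wrap.  All
 * names below are hypothetical; compile separately with a C99 compiler.
 */
#include <stdint.h>
#include <stdio.h>

struct seg {
    uint32_t limit;      /* raw 20-bit limit from the descriptor */
    int g;               /* granularity: 1 => limit is in 4K units */
    int expand_down;     /* 1 => grows-down data segment */
    int db;              /* default/big: 0 => 64K upper bound, 1 => 4G */
};

/* Return 1 if [offset, offset+bytes) is inside the segment, 0 if it faults. */
static int seg_access_ok(const struct seg *s, uint32_t offset, uint32_t bytes)
{
    uint32_t limit = s->limit;
    uint32_t last = offset + bytes - 1;

    if ( s->g )                      /* scale page-granular limits */
        limit = (limit << 12) | 0xfff;

    if ( s->expand_down )
    {
        /* Valid offsets lie strictly above the limit, up to 64K or 4G. */
        uint32_t upper = s->db ? 0xffffffffu : 0xffffu;
        return (offset > limit) && (last <= upper) && (last >= offset);
    }

    /* Normal segment: the last byte must not exceed the limit or wrap. */
    return (last <= limit) && (last >= offset);
}

int main(void)
{
    struct seg code4g = { .limit = 0xfffff, .g = 1, .expand_down = 0, .db = 1 };
    printf("%d\n", seg_access_ok(&code4g, 0x1000, 8));     /* 1: inside limit */
    printf("%d\n", seg_access_ok(&code4g, 0xffffffff, 2)); /* 0: wraps around */
    return 0;
}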
140 static int
141 sh_x86_emulate_read(enum x86_segment seg,
142 unsigned long offset,
143 unsigned long *val,
144 unsigned int bytes,
145 struct x86_emulate_ctxt *ctxt)
146 {
147 unsigned long addr;
148 int rc;
150 rc = hvm_translate_linear_addr(seg, offset, bytes, 0, &addr);
151 if ( rc )
152 return rc;
154 *val = 0;
155 // XXX -- this is WRONG.
156 // It entirely ignores the permissions in the page tables.
157 // In this case, that is only a user vs supervisor access check.
158 //
159 if ( hvm_copy_from_guest_virt(val, addr, bytes) == 0 )
160 {
161 #if 0
162 struct vcpu *v = current;
163 SHADOW_PRINTK("d=%u v=%u a=%#lx v=%#lx bytes=%u\n",
164 v->domain->domain_id, v->vcpu_id,
165 addr, *val, bytes);
166 #endif
167 return X86EMUL_CONTINUE;
168 }
170 /* If we got here, there was nothing mapped here, or a bad GFN
171 * was mapped here. This should never happen: we're here because
172 * of a write fault at the end of the instruction we're emulating. */
173 SHADOW_PRINTK("read failed to va %#lx\n", addr);
174 return X86EMUL_PROPAGATE_FAULT;
175 }
177 static int
178 sh_x86_emulate_write(enum x86_segment seg,
179 unsigned long offset,
180 unsigned long val,
181 unsigned int bytes,
182 struct x86_emulate_ctxt *ctxt)
183 {
184 struct vcpu *v = current;
185 unsigned long addr;
186 int rc;
188 rc = hvm_translate_linear_addr(seg, offset, bytes, 1, &addr);
189 if ( rc )
190 return rc;
192 #if 0
193 SHADOW_PRINTK("d=%u v=%u a=%#lx v=%#lx bytes=%u\n",
194 v->domain->domain_id, v->vcpu_id, addr, val, bytes);
195 #endif
196 return v->arch.shadow.mode->x86_emulate_write(v, addr, &val, bytes, ctxt);
197 }
199 static int
200 sh_x86_emulate_cmpxchg(enum x86_segment seg,
201 unsigned long offset,
202 unsigned long old,
203 unsigned long new,
204 unsigned int bytes,
205 struct x86_emulate_ctxt *ctxt)
206 {
207 struct vcpu *v = current;
208 unsigned long addr;
209 int rc;
211 rc = hvm_translate_linear_addr(seg, offset, bytes, 1, &addr);
212 if ( rc )
213 return rc;
215 #if 0
216 SHADOW_PRINTK("d=%u v=%u a=%#lx o?=%#lx n:=%#lx bytes=%u\n",
217 v->domain->domain_id, v->vcpu_id, addr, old, new, bytes);
218 #endif
219 return v->arch.shadow.mode->x86_emulate_cmpxchg(v, addr, old, new,
220 bytes, ctxt);
221 }
223 static int
224 sh_x86_emulate_cmpxchg8b(enum x86_segment seg,
225 unsigned long offset,
226 unsigned long old_lo,
227 unsigned long old_hi,
228 unsigned long new_lo,
229 unsigned long new_hi,
230 struct x86_emulate_ctxt *ctxt)
231 {
232 struct vcpu *v = current;
233 unsigned long addr;
234 int rc;
236 rc = hvm_translate_linear_addr(seg, offset, 8, 1, &addr);
237 if ( rc )
238 return rc;
240 #if 0
241 SHADOW_PRINTK("d=%u v=%u a=%#lx o?=%#lx:%lx n:=%#lx:%lx\n",
242 v->domain->domain_id, v->vcpu_id, addr, old_hi, old_lo,
243 new_hi, new_lo, ctxt);
244 #endif
245 return v->arch.shadow.mode->x86_emulate_cmpxchg8b(v, addr, old_lo, old_hi,
246 new_lo, new_hi, ctxt);
247 }
250 struct x86_emulate_ops shadow_emulator_ops = {
251 .read = sh_x86_emulate_read,
252 .write = sh_x86_emulate_write,
253 .cmpxchg = sh_x86_emulate_cmpxchg,
254 .cmpxchg8b = sh_x86_emulate_cmpxchg8b,
255 };
257 /**************************************************************************/
258 /* Code for "promoting" a guest page to the point where the shadow code is
259 * willing to let it be treated as a guest page table. This generally
260 * involves making sure there are no writable mappings available to the guest
261 * for this page.
262 */
263 void shadow_promote(struct vcpu *v, mfn_t gmfn, unsigned int type)
264 {
265 struct page_info *page = mfn_to_page(gmfn);
267 ASSERT(mfn_valid(gmfn));
269 /* We should never try to promote a gmfn that has writeable mappings */
270 ASSERT(shadow_remove_write_access(v, gmfn, 0, 0) == 0);
272 /* Is the page already shadowed? */
273 if ( !test_and_set_bit(_PGC_page_table, &page->count_info) )
274 page->shadow_flags = 0;
276 ASSERT(!test_bit(type, &page->shadow_flags));
277 set_bit(type, &page->shadow_flags);
278 }
280 void shadow_demote(struct vcpu *v, mfn_t gmfn, u32 type)
281 {
282 struct page_info *page = mfn_to_page(gmfn);
284 ASSERT(test_bit(_PGC_page_table, &page->count_info));
285 ASSERT(test_bit(type, &page->shadow_flags));
287 clear_bit(type, &page->shadow_flags);
289 if ( (page->shadow_flags & SHF_page_type_mask) == 0 )
290 {
291 /* tlbflush timestamp field is valid again */
292 page->tlbflush_timestamp = tlbflush_current_time();
293 clear_bit(_PGC_page_table, &page->count_info);
294 }
295 }
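/*
 * Standalone sketch (not part of this file) of the promote/demote pattern
 * above: an atomically tested "is this page a pagetable?" bit plus a
 * per-shadow-type flag word that is reset by the first promoter and, once
 * empty again, drops the pagetable bit.  All names are hypothetical and
 * the flag word here is not lock-protected as it is in the real code.
 */
#include <stdatomic.h>
#include <stdint.h>
#include <assert.h>

#define PG_PAGE_TABLE  (1u << 0)       /* stand-in for _PGC_page_table */

struct demo_page {
    atomic_uint count_info;            /* holds the "is a pagetable" bit */
    uint32_t shadow_flags;             /* one bit per shadow type */
};

static void demo_promote(struct demo_page *pg, unsigned int type_bit)
{
    /* The first promoter sees the bit clear and resets the type flags. */
    unsigned int old = atomic_fetch_or(&pg->count_info, PG_PAGE_TABLE);
    if ( !(old & PG_PAGE_TABLE) )
        pg->shadow_flags = 0;

    assert(!(pg->shadow_flags & (1u << type_bit)));
    pg->shadow_flags |= 1u << type_bit;
}

static void demo_demote(struct demo_page *pg, unsigned int type_bit)
{
    pg->shadow_flags &= ~(1u << type_bit);
    if ( pg->shadow_flags == 0 )       /* last shadow gone: not a pagetable */
        atomic_fetch_and(&pg->count_info, ~PG_PAGE_TABLE);
}

int main(void)
{
    struct demo_page pg = { .count_info = 0, .shadow_flags = 0 };
    demo_promote(&pg, 3);
    demo_demote(&pg, 3);
    return 0;
}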
297 /**************************************************************************/
298 /* Validate a pagetable change from the guest and update the shadows.
299 * Returns a bitmask of SHADOW_SET_* flags. */
301 int
302 __shadow_validate_guest_entry(struct vcpu *v, mfn_t gmfn,
303 void *entry, u32 size)
304 {
305 int result = 0;
306 struct page_info *page = mfn_to_page(gmfn);
308 sh_mark_dirty(v->domain, gmfn);
310 // Determine which types of shadows are affected, and update each.
311 //
312 // Always validate L1s before L2s to prevent another cpu with a linear
313 // mapping of this gmfn from seeing a walk that results from
314 // using the new L2 value and the old L1 value. (It is OK for such a
315 // guest to see a walk that uses the old L2 value with the new L1 value,
316 // as hardware could behave this way if one level of the pagewalk occurs
317 // before the store, and the next level of the pagewalk occurs after the
318 // store.
319 //
320 // Ditto for L2s before L3s, etc.
321 //
323 if ( !(page->count_info & PGC_page_table) )
324 return 0; /* Not shadowed at all */
326 #if CONFIG_PAGING_LEVELS == 2
327 if ( page->shadow_flags & SHF_L1_32 )
328 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 2, 2)
329 (v, gmfn, entry, size);
330 #else
331 if ( page->shadow_flags & SHF_L1_32 )
332 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 3, 2)
333 (v, gmfn, entry, size);
334 #endif
336 #if CONFIG_PAGING_LEVELS == 2
337 if ( page->shadow_flags & SHF_L2_32 )
338 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 2, 2)
339 (v, gmfn, entry, size);
340 #else
341 if ( page->shadow_flags & SHF_L2_32 )
342 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 3, 2)
343 (v, gmfn, entry, size);
344 #endif
346 #if CONFIG_PAGING_LEVELS >= 3
347 if ( page->shadow_flags & SHF_L1_PAE )
348 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 3, 3)
349 (v, gmfn, entry, size);
350 if ( page->shadow_flags & SHF_L2_PAE )
351 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 3, 3)
352 (v, gmfn, entry, size);
353 if ( page->shadow_flags & SHF_L2H_PAE )
354 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2he, 3, 3)
355 (v, gmfn, entry, size);
356 #else /* 32-bit non-PAE hypervisor does not support PAE guests */
357 ASSERT((page->shadow_flags & (SHF_L2H_PAE|SHF_L2_PAE|SHF_L1_PAE)) == 0);
358 #endif
360 #if CONFIG_PAGING_LEVELS >= 4
361 if ( page->shadow_flags & SHF_L1_64 )
362 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 4, 4)
363 (v, gmfn, entry, size);
364 if ( page->shadow_flags & SHF_L2_64 )
365 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 4, 4)
366 (v, gmfn, entry, size);
367 if ( page->shadow_flags & SHF_L3_64 )
368 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl3e, 4, 4)
369 (v, gmfn, entry, size);
370 if ( page->shadow_flags & SHF_L4_64 )
371 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl4e, 4, 4)
372 (v, gmfn, entry, size);
373 #else /* 32-bit/PAE hypervisor does not support 64-bit guests */
374 ASSERT((page->shadow_flags
375 & (SHF_L4_64|SHF_L3_64|SHF_L2_64|SHF_L1_64)) == 0);
376 #endif
378 return result;
379 }
382 int
383 shadow_validate_guest_entry(struct vcpu *v, mfn_t gmfn, void *entry)
384 /* This is the entry point from hypercalls. It returns a bitmask of all the
385 * results of shadow_set_l*e() calls, so the caller knows to do TLB flushes. */
386 {
387 int rc;
389 ASSERT(shadow_lock_is_acquired(v->domain));
390 rc = __shadow_validate_guest_entry(v, gmfn, entry, sizeof(l1_pgentry_t));
391 shadow_audit_tables(v);
392 return rc;
393 }
395 void
396 shadow_validate_guest_pt_write(struct vcpu *v, mfn_t gmfn,
397 void *entry, u32 size)
398 /* This is the entry point for emulated writes to pagetables in HVM guests and
399 * PV translated guests.
400 */
401 {
402 struct domain *d = v->domain;
403 int rc;
405 ASSERT(shadow_lock_is_acquired(v->domain));
406 rc = __shadow_validate_guest_entry(v, gmfn, entry, size);
407 if ( rc & SHADOW_SET_FLUSH )
408 /* Need to flush TLBs to pick up shadow PT changes */
409 flush_tlb_mask(d->domain_dirty_cpumask);
410 if ( rc & SHADOW_SET_ERROR )
411 {
412 /* This page is probably not a pagetable any more: tear it out of the
413 * shadows, along with any tables that reference it.
414 * Since the validate call above will have made a "safe" (i.e. zero)
415 * shadow entry, we can let the domain live even if we can't fully
416 * unshadow the page. */
417 sh_remove_shadows(v, gmfn, 0, 0);
418 }
419 }
422 /**************************************************************************/
423 /* Memory management for shadow pages. */
425 /* Allocating shadow pages
426 * -----------------------
427 *
428 * Most shadow pages are allocated singly, but there is one case where
429 * we need to allocate multiple pages together: shadowing 32-bit guest
430 * tables on PAE or 64-bit shadows. A 32-bit guest l1 table covers 4MB
431 * of virtual address space, and needs to be shadowed by two PAE/64-bit
432 * l1 tables (covering 2MB of virtual address space each). Similarly, a
433 * 32-bit guest l2 table (4GB va) needs to be shadowed by four
434 * PAE/64-bit l2 tables (1GB va each). These multi-page shadows are
435 * contiguous and aligned; functions for handling offsets into them are
436 * defined in shadow.c (shadow_l1_index() etc.)
437 *
438 * This table shows the allocation behaviour of the different modes:
439 *
440 * Xen paging 32b pae pae 64b 64b 64b
441 * Guest paging 32b 32b pae 32b pae 64b
442 * PV or HVM * HVM * HVM HVM *
443 * Shadow paging 32b pae pae pae pae 64b
444 *
445 * sl1 size 4k 8k 4k 8k 4k 4k
446 * sl2 size 4k 16k 4k 16k 4k 4k
447 * sl3 size - - - - - 4k
448 * sl4 size - - - - - 4k
449 *
450 * We allocate memory from xen in four-page units and break them down
451 * with a simple buddy allocator. Can't use the xen allocator to handle
452 * this as it only works for contiguous zones, and a domain's shadow
453 * pool is made of fragments.
454 *
455 * In HVM guests, the p2m table is built out of shadow pages, and we provide
456 * a function for the p2m management to steal pages, in max-order chunks, from
457 * the free pool. We don't provide for giving them back, yet.
458 */
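/*
 * Standalone sketch (not part of this file) of the simple buddy allocator
 * described above, as used by shadow_alloc()/shadow_free() below:
 * allocations are split down from larger free chunks, and freed chunks
 * are merged with their buddy when possible.  It manages page *indices*
 * in a tiny fixed pool, and the merge step only checks the head of each
 * free list (a simplification; the real code checks type/order fields).
 * All names are hypothetical.
 */
#include <stdio.h>

#define MAX_ORDER   2                 /* chunks of 1, 2 or 4 pages */
#define POOL_PAGES  (1 << MAX_ORDER)  /* one max-order chunk */

static int free_head[MAX_ORDER + 1];  /* head page index per order, -1 = empty */
static int next_free[POOL_PAGES];     /* singly-linked free lists */

static void push(int order, int idx)
{
    next_free[idx] = free_head[order];
    free_head[order] = idx;
}

static int pop(int order)
{
    int idx = free_head[order];
    if ( idx >= 0 )
        free_head[order] = next_free[idx];
    return idx;
}

static int buddy_alloc(int order)
{
    int i, idx;
    for ( i = order; i <= MAX_ORDER; i++ )        /* smallest chunk that fits */
        if ( (idx = pop(i)) >= 0 )
        {
            while ( i-- > order )                 /* split, freeing each upper half */
                push(i, idx + (1 << i));
            return idx;
        }
    return -1;                                    /* out of memory */
}

static void buddy_free(int idx, int order)
{
    while ( order < MAX_ORDER )
    {
        int buddy = idx ^ (1 << order);
        if ( free_head[order] != buddy )          /* buddy not free: stop merging */
            break;
        pop(order);                               /* absorb the buddy */
        idx &= ~(1 << order);                     /* keep the lower index */
        order++;
    }
    push(order, idx);
}

int main(void)
{
    int i, a, b;
    for ( i = 0; i <= MAX_ORDER; i++ ) free_head[i] = -1;
    push(MAX_ORDER, 0);                           /* whole pool is one free chunk */

    a = buddy_alloc(0);                           /* splits 4 pages -> 1+1+2 */
    b = buddy_alloc(1);
    printf("a=%d b=%d\n", a, b);
    buddy_free(a, 0);                             /* merges back to one chunk */
    buddy_free(b, 1);
    return 0;
}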
460 /* Figure out the least acceptable quantity of shadow memory.
461 * The minimum memory requirement for always being able to free up a
462 * chunk of memory is very small -- only three max-order chunks per
463 * vcpu to hold the top level shadows and pages with Xen mappings in them.
464 *
465 * But for a guest to be guaranteed to successfully execute a single
466 * instruction, we must be able to map a large number (about thirty) VAs
467 * at the same time, which means that to guarantee progress, we must
468 * allow for more than ninety allocated pages per vcpu. We round that
469 * up to 128 pages, or half a megabyte per vcpu. */
470 unsigned int shadow_min_acceptable_pages(struct domain *d)
471 {
472 u32 vcpu_count = 0;
473 struct vcpu *v;
475 for_each_vcpu(d, v)
476 vcpu_count++;
478 return (vcpu_count * 128);
479 }
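/*
 * Standalone sketch (not part of this file): the sizing rule above in
 * isolation -- 128 shadow pages per vcpu, i.e. half a megabyte each,
 * assuming 4 KB pages.  The vcpu count below is chosen arbitrarily.
 */
#include <stdio.h>

int main(void)
{
    unsigned int vcpus = 4;
    unsigned int pages = vcpus * 128;   /* as in shadow_min_acceptable_pages() */
    printf("%u vcpus -> %u pages (%u KB)\n", vcpus, pages, pages * 4);
    return 0;
}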
481 /* Figure out the order of allocation needed for a given shadow type */
482 static inline u32
483 shadow_order(unsigned int shadow_type)
484 {
485 #if CONFIG_PAGING_LEVELS > 2
486 static const u32 type_to_order[16] = {
487 0, /* SH_type_none */
488 1, /* SH_type_l1_32_shadow */
489 1, /* SH_type_fl1_32_shadow */
490 2, /* SH_type_l2_32_shadow */
491 0, /* SH_type_l1_pae_shadow */
492 0, /* SH_type_fl1_pae_shadow */
493 0, /* SH_type_l2_pae_shadow */
494 0, /* SH_type_l2h_pae_shadow */
495 0, /* SH_type_l1_64_shadow */
496 0, /* SH_type_fl1_64_shadow */
497 0, /* SH_type_l2_64_shadow */
498 0, /* SH_type_l3_64_shadow */
499 0, /* SH_type_l4_64_shadow */
500 2, /* SH_type_p2m_table */
501 0 /* SH_type_monitor_table */
502 };
503 ASSERT(shadow_type < 16);
504 return type_to_order[shadow_type];
505 #else /* 32-bit Xen only ever shadows 32-bit guests on 32-bit shadows. */
506 return 0;
507 #endif
508 }
511 /* Do we have a free chunk of at least this order? */
512 static inline int chunk_is_available(struct domain *d, int order)
513 {
514 int i;
516 for ( i = order; i <= SHADOW_MAX_ORDER; i++ )
517 if ( !list_empty(&d->arch.shadow.freelists[i]) )
518 return 1;
519 return 0;
520 }
522 /* Dispatcher function: call the per-mode function that will unhook the
523 * non-Xen mappings in this top-level shadow mfn */
524 void shadow_unhook_mappings(struct vcpu *v, mfn_t smfn)
525 {
526 struct shadow_page_info *sp = mfn_to_shadow_page(smfn);
527 switch ( sp->type )
528 {
529 case SH_type_l2_32_shadow:
530 #if CONFIG_PAGING_LEVELS == 2
531 SHADOW_INTERNAL_NAME(sh_unhook_32b_mappings,2,2)(v,smfn);
532 #else
533 SHADOW_INTERNAL_NAME(sh_unhook_32b_mappings,3,2)(v,smfn);
534 #endif
535 break;
536 #if CONFIG_PAGING_LEVELS >= 3
537 case SH_type_l2_pae_shadow:
538 case SH_type_l2h_pae_shadow:
539 SHADOW_INTERNAL_NAME(sh_unhook_pae_mappings,3,3)(v,smfn);
540 break;
541 #endif
542 #if CONFIG_PAGING_LEVELS >= 4
543 case SH_type_l4_64_shadow:
544 SHADOW_INTERNAL_NAME(sh_unhook_64b_mappings,4,4)(v,smfn);
545 break;
546 #endif
547 default:
548 SHADOW_PRINTK("top-level shadow has bad type %08x\n", sp->type);
549 BUG();
550 }
551 }
554 /* Make sure there is at least one chunk of the required order available
555 * in the shadow page pool. This must be called before any calls to
556 * shadow_alloc(). Since this will free existing shadows to make room,
557 * it must be called early enough to avoid freeing shadows that the
558 * caller is currently working on. */
559 void shadow_prealloc(struct domain *d, unsigned int order)
560 {
561 /* Need a vcpu for calling unpins; for now, since we don't have
562 * per-vcpu shadows, any will do */
563 struct vcpu *v, *v2;
564 struct list_head *l, *t;
565 struct shadow_page_info *sp;
566 cpumask_t flushmask = CPU_MASK_NONE;
567 mfn_t smfn;
568 int i;
570 if ( chunk_is_available(d, order) ) return;
572 v = current;
573 if ( v->domain != d )
574 v = d->vcpu[0];
575 ASSERT(v != NULL);
577 /* Stage one: walk the list of pinned pages, unpinning them */
578 perfc_incrc(shadow_prealloc_1);
579 list_for_each_backwards_safe(l, t, &d->arch.shadow.pinned_shadows)
580 {
581 sp = list_entry(l, struct shadow_page_info, list);
582 smfn = shadow_page_to_mfn(sp);
584 /* Unpin this top-level shadow */
585 sh_unpin(v, smfn);
587 /* See if that freed up a chunk of appropriate size */
588 if ( chunk_is_available(d, order) ) return;
589 }
591 /* Stage two: all shadow pages are in use in hierarchies that are
592 * loaded in cr3 on some vcpu. Walk them, unhooking the non-Xen
593 * mappings. */
594 perfc_incrc(shadow_prealloc_2);
596 for_each_vcpu(d, v2)
597 for ( i = 0 ; i < 4 ; i++ )
598 {
599 if ( !pagetable_is_null(v2->arch.shadow_table[i]) )
600 {
601 shadow_unhook_mappings(v,
602 pagetable_get_mfn(v2->arch.shadow_table[i]));
603 cpus_or(flushmask, v2->vcpu_dirty_cpumask, flushmask);
605 /* See if that freed up a chunk of appropriate size */
606 if ( chunk_is_available(d, order) )
607 {
608 flush_tlb_mask(flushmask);
609 return;
610 }
611 }
612 }
614 /* Nothing more we can do: all remaining shadows are of pages that
615 * hold Xen mappings for some vcpu. This can never happen. */
616 SHADOW_PRINTK("Can't pre-allocate %i shadow pages!\n"
617 " shadow pages total = %u, free = %u, p2m=%u\n",
618 1 << order,
619 d->arch.shadow.total_pages,
620 d->arch.shadow.free_pages,
621 d->arch.shadow.p2m_pages);
622 BUG();
623 }
625 /* Deliberately free all the memory we can: this will tear down all of
626 * this domain's shadows */
627 static void shadow_blow_tables(struct domain *d)
628 {
629 struct list_head *l, *t;
630 struct shadow_page_info *sp;
631 struct vcpu *v = d->vcpu[0];
632 mfn_t smfn;
633 int i;
635 /* Pass one: unpin all pinned pages */
636 list_for_each_backwards_safe(l,t, &d->arch.shadow.pinned_shadows)
637 {
638 sp = list_entry(l, struct shadow_page_info, list);
639 smfn = shadow_page_to_mfn(sp);
640 sh_unpin(v, smfn);
641 }
643 /* Second pass: unhook entries of in-use shadows */
644 for_each_vcpu(d, v)
645 for ( i = 0 ; i < 4 ; i++ )
646 if ( !pagetable_is_null(v->arch.shadow_table[i]) )
647 shadow_unhook_mappings(v,
648 pagetable_get_mfn(v->arch.shadow_table[i]));
650 /* Make sure everyone sees the unshadowings */
651 flush_tlb_mask(d->domain_dirty_cpumask);
652 }
655 #ifndef NDEBUG
656 /* Blow all shadows of all shadowed domains: this can be used to cause the
657 * guest's pagetables to be re-shadowed if we suspect that the shadows
658 * have somehow got out of sync */
659 static void shadow_blow_all_tables(unsigned char c)
660 {
661 struct domain *d;
662 printk("'%c' pressed -> blowing all shadow tables\n", c);
663 for_each_domain(d)
664 if ( shadow_mode_enabled(d) && d->vcpu[0] != NULL )
665 {
666 shadow_lock(d);
667 shadow_blow_tables(d);
668 shadow_unlock(d);
669 }
670 }
672 /* Register this function in the Xen console keypress table */
673 static __init int shadow_blow_tables_keyhandler_init(void)
674 {
675 register_keyhandler('S', shadow_blow_all_tables,"reset shadow pagetables");
676 return 0;
677 }
678 __initcall(shadow_blow_tables_keyhandler_init);
679 #endif /* !NDEBUG */
681 /* Allocate another shadow's worth of (contiguous, aligned) pages,
682 * and fill in the type and backpointer fields of their page_infos.
683 * Never fails to allocate. */
684 mfn_t shadow_alloc(struct domain *d,
685 u32 shadow_type,
686 unsigned long backpointer)
687 {
688 struct shadow_page_info *sp = NULL;
689 unsigned int order = shadow_order(shadow_type);
690 cpumask_t mask;
691 void *p;
692 int i;
694 ASSERT(shadow_lock_is_acquired(d));
695 ASSERT(order <= SHADOW_MAX_ORDER);
696 ASSERT(shadow_type != SH_type_none);
697 perfc_incrc(shadow_alloc);
699 /* Find smallest order which can satisfy the request. */
700 for ( i = order; i <= SHADOW_MAX_ORDER; i++ )
701 if ( !list_empty(&d->arch.shadow.freelists[i]) )
702 {
703 sp = list_entry(d->arch.shadow.freelists[i].next,
704 struct shadow_page_info, list);
705 list_del(&sp->list);
707 /* We may have to halve the chunk a number of times. */
708 while ( i != order )
709 {
710 i--;
711 sp->order = i;
712 list_add_tail(&sp->list, &d->arch.shadow.freelists[i]);
713 sp += 1 << i;
714 }
715 d->arch.shadow.free_pages -= 1 << order;
717 /* Init page info fields and clear the pages */
718 for ( i = 0; i < 1<<order ; i++ )
719 {
720 /* Before we overwrite the old contents of this page,
721 * we need to be sure that no TLB holds a pointer to it. */
722 mask = d->domain_dirty_cpumask;
723 tlbflush_filter(mask, sp[i].tlbflush_timestamp);
724 if ( unlikely(!cpus_empty(mask)) )
725 {
726 perfc_incrc(shadow_alloc_tlbflush);
727 flush_tlb_mask(mask);
728 }
729 /* Now safe to clear the page for reuse */
730 p = sh_map_domain_page(shadow_page_to_mfn(sp+i));
731 ASSERT(p != NULL);
732 clear_page(p);
733 sh_unmap_domain_page(p);
734 INIT_LIST_HEAD(&sp[i].list);
735 sp[i].type = shadow_type;
736 sp[i].pinned = 0;
737 sp[i].logdirty = 0;
738 sp[i].count = 0;
739 sp[i].backpointer = backpointer;
740 sp[i].next_shadow = NULL;
741 perfc_incr(shadow_alloc_count);
742 }
743 return shadow_page_to_mfn(sp);
744 }
746 /* If we get here, we failed to allocate. This should never happen.
747 * It means that we didn't call shadow_prealloc() correctly before
748 * we allocated. We can't recover by calling prealloc here, because
749 * we might free up higher-level pages that the caller is working on. */
750 SHADOW_PRINTK("Can't allocate %i shadow pages!\n", 1 << order);
751 BUG();
752 }
755 /* Return some shadow pages to the pool. */
756 void shadow_free(struct domain *d, mfn_t smfn)
757 {
758 struct shadow_page_info *sp = mfn_to_shadow_page(smfn);
759 u32 shadow_type;
760 unsigned long order;
761 unsigned long mask;
762 int i;
764 ASSERT(shadow_lock_is_acquired(d));
765 perfc_incrc(shadow_free);
767 shadow_type = sp->type;
768 ASSERT(shadow_type != SH_type_none);
769 ASSERT(shadow_type != SH_type_p2m_table);
770 order = shadow_order(shadow_type);
772 d->arch.shadow.free_pages += 1 << order;
774 for ( i = 0; i < 1<<order; i++ )
775 {
776 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
777 struct vcpu *v;
778 for_each_vcpu(d, v)
779 {
780 /* No longer safe to look for a writeable mapping in this shadow */
781 if ( v->arch.shadow.last_writeable_pte_smfn == mfn_x(smfn) + i )
782 v->arch.shadow.last_writeable_pte_smfn = 0;
783 }
784 #endif
785 /* Strip out the type: this is now a free shadow page */
786 sp[i].type = 0;
787 /* Remember the TLB timestamp so we will know whether to flush
788 * TLBs when we reuse the page. Because the destructors leave the
789 * contents of the pages in place, we can delay TLB flushes until
790 * just before the allocator hands the page out again. */
791 sp[i].tlbflush_timestamp = tlbflush_current_time();
792 perfc_decr(shadow_alloc_count);
793 }
795 /* Merge chunks as far as possible. */
796 while ( order < SHADOW_MAX_ORDER )
797 {
798 mask = 1 << order;
799 if ( (mfn_x(shadow_page_to_mfn(sp)) & mask) ) {
800 /* Merge with predecessor block? */
801 if ( ((sp-mask)->type != PGT_none) || ((sp-mask)->order != order) )
802 break;
803 list_del(&(sp-mask)->list);
804 sp -= mask;
805 } else {
806 /* Merge with successor block? */
807 if ( ((sp+mask)->type != PGT_none) || ((sp+mask)->order != order) )
808 break;
809 list_del(&(sp+mask)->list);
810 }
811 order++;
812 }
814 sp->order = order;
815 list_add_tail(&sp->list, &d->arch.shadow.freelists[order]);
816 }
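/*
 * Standalone sketch (not part of this file) of the deferred-TLB-flush
 * bookkeeping used by shadow_free()/shadow_alloc() above: a page freed at
 * logical time T only needs a flush on CPUs whose last flush happened at
 * or before T, and the check is deferred until the page is reused.
 * The names and the logical time source are hypothetical.
 */
#include <stdio.h>

#define NCPUS 4

static unsigned long now;                 /* logical clock, bumped on flush */
static unsigned long last_flush[NCPUS];   /* per-CPU time of last TLB flush */

static void flush_cpu(int cpu) { last_flush[cpu] = ++now; }

/* At free time just record the timestamp; nothing is flushed yet. */
static unsigned long record_free(void) { return now; }

/* On reuse, flush only CPUs that have not flushed since the free. */
static void flush_if_needed(unsigned long freed_at)
{
    int cpu;
    for ( cpu = 0; cpu < NCPUS; cpu++ )
        if ( last_flush[cpu] <= freed_at )
        {
            printf("flushing cpu %d\n", cpu);
            flush_cpu(cpu);
        }
}

int main(void)
{
    unsigned long stamp = record_free();  /* page freed at the current time */
    flush_cpu(1);                         /* cpu 1 flushes for other reasons */
    flush_if_needed(stamp);               /* cpus 0, 2, 3 still need a flush */
    return 0;
}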
818 /* Divert some memory from the pool to be used by the p2m mapping.
819 * This action is irreversible: the p2m mapping only ever grows.
820 * That's OK because the p2m table only exists for translated domains,
821 * and those domains can't ever turn off shadow mode.
822 * Also, we only ever allocate a max-order chunk, so as to preserve
823 * the invariant that shadow_prealloc() always works.
824 * Returns 0 iff it can't get a chunk (the caller should then
825 * free up some pages in domheap and call set_sh_allocation);
826 * returns non-zero on success.
827 */
828 static int
829 shadow_alloc_p2m_pages(struct domain *d)
830 {
831 struct page_info *pg;
832 u32 i;
833 ASSERT(shadow_lock_is_acquired(d));
835 if ( d->arch.shadow.total_pages
836 < (shadow_min_acceptable_pages(d) + (1<<SHADOW_MAX_ORDER)) )
837 return 0; /* Not enough shadow memory: need to increase it first */
839 pg = mfn_to_page(shadow_alloc(d, SH_type_p2m_table, 0));
840 d->arch.shadow.p2m_pages += (1<<SHADOW_MAX_ORDER);
841 d->arch.shadow.total_pages -= (1<<SHADOW_MAX_ORDER);
842 for (i = 0; i < (1<<SHADOW_MAX_ORDER); i++)
843 {
844 /* Unlike shadow pages, mark p2m pages as owned by the domain.
845 * Marking the domain as the owner would normally allow the guest to
846 * create mappings of these pages, but these p2m pages will never be
847 * in the domain's guest-physical address space, and so that is not
848 * believed to be a concern.
849 */
850 page_set_owner(&pg[i], d);
851 list_add_tail(&pg[i].list, &d->arch.shadow.p2m_freelist);
852 }
853 return 1;
854 }
856 // Returns 0 if no memory is available...
857 mfn_t
858 shadow_alloc_p2m_page(struct domain *d)
859 {
860 struct list_head *entry;
861 struct page_info *pg;
862 mfn_t mfn;
863 void *p;
865 if ( list_empty(&d->arch.shadow.p2m_freelist) &&
866 !shadow_alloc_p2m_pages(d) )
867 return _mfn(0);
868 entry = d->arch.shadow.p2m_freelist.next;
869 list_del(entry);
870 list_add_tail(entry, &d->arch.shadow.p2m_inuse);
871 pg = list_entry(entry, struct page_info, list);
872 pg->count_info = 1;
873 mfn = page_to_mfn(pg);
874 p = sh_map_domain_page(mfn);
875 clear_page(p);
876 sh_unmap_domain_page(p);
878 return mfn;
879 }
881 #if CONFIG_PAGING_LEVELS == 3
882 static void p2m_install_entry_in_monitors(struct domain *d,
883 l3_pgentry_t *l3e)
884 /* Special case, only used for external-mode domains on PAE hosts:
885 * update the mapping of the p2m table. Once again, this is trivial in
886 * other paging modes (one top-level entry points to the top-level p2m,
887 * no maintenance needed), but PAE makes life difficult by needing a
888 * copy of the eight l3es of the p2m table in eight l2h slots in the
889 * monitor table. This function makes fresh copies when a p2m l3e
890 * changes. */
891 {
892 l2_pgentry_t *ml2e;
893 struct vcpu *v;
894 unsigned int index;
896 index = ((unsigned long)l3e & ~PAGE_MASK) / sizeof(l3_pgentry_t);
897 ASSERT(index < MACHPHYS_MBYTES>>1);
899 for_each_vcpu(d, v)
900 {
901 if ( pagetable_get_pfn(v->arch.monitor_table) == 0 )
902 continue;
903 ASSERT(shadow_mode_external(v->domain));
905 SHADOW_DEBUG(P2M, "d=%u v=%u index=%u mfn=%#lx\n",
906 d->domain_id, v->vcpu_id, index, l3e_get_pfn(*l3e));
908 if ( v == current ) /* OK to use linear map of monitor_table */
909 ml2e = __linear_l2_table + l2_linear_offset(RO_MPT_VIRT_START);
910 else
911 {
912 l3_pgentry_t *ml3e;
913 ml3e = sh_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
914 ASSERT(l3e_get_flags(ml3e[3]) & _PAGE_PRESENT);
915 ml2e = sh_map_domain_page(_mfn(l3e_get_pfn(ml3e[3])));
916 ml2e += l2_table_offset(RO_MPT_VIRT_START);
917 sh_unmap_domain_page(ml3e);
918 }
919 ml2e[index] = l2e_from_pfn(l3e_get_pfn(*l3e), __PAGE_HYPERVISOR);
920 if ( v != current )
921 sh_unmap_domain_page(ml2e);
922 }
923 }
924 #endif
926 // Find the next level's P2M entry, checking for out-of-range gfn's...
927 // Returns NULL on error.
928 //
929 static l1_pgentry_t *
930 p2m_find_entry(void *table, unsigned long *gfn_remainder,
931 unsigned long gfn, u32 shift, u32 max)
932 {
933 u32 index;
935 index = *gfn_remainder >> shift;
936 if ( index >= max )
937 {
938 SHADOW_DEBUG(P2M, "gfn=0x%lx out of range "
939 "(gfn_remainder=0x%lx shift=%d index=0x%x max=0x%x)\n",
940 gfn, *gfn_remainder, shift, index, max);
941 return NULL;
942 }
943 *gfn_remainder &= (1 << shift) - 1;
944 return (l1_pgentry_t *)table + index;
945 }
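/*
 * Standalone sketch (not part of this file) of the index/remainder
 * arithmetic used by p2m_find_entry() above: at each level the gfn
 * remainder is split into a table index (the bits at and above 'shift')
 * and a new remainder (the bits below it).  The level layout and names
 * below are hypothetical, chosen only for illustration.
 */
#include <stdio.h>

struct level { unsigned int shift, entries; };

int main(void)
{
    /* A 3-level toy layout with 9+9+9 bits of index. */
    struct level lvl[] = { { 18, 512 }, { 9, 512 }, { 0, 512 } };
    unsigned long gfn = 0x12345, remainder = gfn;
    unsigned int i;

    for ( i = 0; i < 3; i++ )
    {
        unsigned int index = remainder >> lvl[i].shift;
        if ( index >= lvl[i].entries )            /* out-of-range gfn */
        {
            printf("gfn %#lx out of range at level %u\n", gfn, i);
            return 1;
        }
        if ( lvl[i].shift )
            remainder &= (1ul << lvl[i].shift) - 1;
        printf("level %u: index %u, remainder %#lx\n", i, index, remainder);
    }
    return 0;
}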
947 // Walk one level of the P2M table, allocating a new table if required.
948 // Returns 0 on error.
949 //
950 static int
951 p2m_next_level(struct domain *d, mfn_t *table_mfn, void **table,
952 unsigned long *gfn_remainder, unsigned long gfn, u32 shift,
953 u32 max, unsigned long type)
954 {
955 l1_pgentry_t *p2m_entry;
956 void *next;
958 if ( !(p2m_entry = p2m_find_entry(*table, gfn_remainder, gfn,
959 shift, max)) )
960 return 0;
962 if ( !(l1e_get_flags(*p2m_entry) & _PAGE_PRESENT) )
963 {
964 mfn_t mfn = shadow_alloc_p2m_page(d);
965 if ( mfn_x(mfn) == 0 )
966 return 0;
967 *p2m_entry = l1e_from_pfn(mfn_x(mfn), __PAGE_HYPERVISOR|_PAGE_USER);
968 mfn_to_page(mfn)->u.inuse.type_info = type | 1 | PGT_validated;
969 mfn_to_page(mfn)->count_info = 1;
970 #if CONFIG_PAGING_LEVELS == 3
971 if (type == PGT_l2_page_table)
972 {
973 struct vcpu *v;
974 /* We have written to the p2m l3: need to sync the per-vcpu
975 * copies of it in the monitor tables */
976 p2m_install_entry_in_monitors(d, (l3_pgentry_t *)p2m_entry);
977 /* Also, any vcpus running on shadows of the p2m need to
978 * reload their CR3s so the change propagates to the shadow */
979 ASSERT(shadow_lock_is_acquired(d));
980 for_each_vcpu(d, v)
981 {
982 if ( pagetable_get_pfn(v->arch.guest_table)
983 == pagetable_get_pfn(d->arch.phys_table)
984 && v->arch.shadow.mode != NULL )
985 v->arch.shadow.mode->update_cr3(v);
986 }
987 }
988 #endif
989 /* The P2M can be shadowed: keep the shadows synced */
990 if ( d->vcpu[0] != NULL )
991 (void)__shadow_validate_guest_entry(d->vcpu[0], *table_mfn,
992 p2m_entry, sizeof *p2m_entry);
993 }
994 *table_mfn = _mfn(l1e_get_pfn(*p2m_entry));
995 next = sh_map_domain_page(*table_mfn);
996 sh_unmap_domain_page(*table);
997 *table = next;
999 return 1;
1002 // Returns 0 on error (out of memory)
1003 int
1004 shadow_set_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn)
1006 // XXX -- this might be able to be faster iff current->domain == d
1007 mfn_t table_mfn = pagetable_get_mfn(d->arch.phys_table);
1008 void *table = sh_map_domain_page(table_mfn);
1009 unsigned long gfn_remainder = gfn;
1010 l1_pgentry_t *p2m_entry;
1011 int rv=0;
1013 #if CONFIG_PAGING_LEVELS >= 4
1014 if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn,
1015 L4_PAGETABLE_SHIFT - PAGE_SHIFT,
1016 L4_PAGETABLE_ENTRIES, PGT_l3_page_table) )
1017 goto out;
1018 #endif
1019 #if CONFIG_PAGING_LEVELS >= 3
1020 // When using PAE Xen, we only allow 33 bits of pseudo-physical
1021 // address in translated guests (i.e. 8 GBytes). This restriction
1022 // comes from wanting to map the P2M table into the 16MB RO_MPT hole
1023 // in Xen's address space for translated PV guests.
1024 //
1025 if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn,
1026 L3_PAGETABLE_SHIFT - PAGE_SHIFT,
1027 (CONFIG_PAGING_LEVELS == 3
1028 ? 8
1029 : L3_PAGETABLE_ENTRIES),
1030 PGT_l2_page_table) )
1031 goto out;
1032 #endif
1033 if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn,
1034 L2_PAGETABLE_SHIFT - PAGE_SHIFT,
1035 L2_PAGETABLE_ENTRIES, PGT_l1_page_table) )
1036 goto out;
1038 p2m_entry = p2m_find_entry(table, &gfn_remainder, gfn,
1039 0, L1_PAGETABLE_ENTRIES);
1040 ASSERT(p2m_entry);
1041 if ( mfn_valid(mfn) )
1042 *p2m_entry = l1e_from_pfn(mfn_x(mfn), __PAGE_HYPERVISOR|_PAGE_USER);
1043 else
1044 *p2m_entry = l1e_empty();
1046 /* Track the highest gfn for which we have ever had a valid mapping */
1047 if ( mfn_valid(mfn) && (gfn > d->arch.max_mapped_pfn) )
1048 d->arch.max_mapped_pfn = gfn;
1050 /* The P2M can be shadowed: keep the shadows synced */
1051 if ( d->vcpu[0] != NULL )
1052 (void)__shadow_validate_guest_entry(
1053 d->vcpu[0], table_mfn, p2m_entry, sizeof(*p2m_entry));
1055 /* Success */
1056 rv = 1;
1058 out:
1059 sh_unmap_domain_page(table);
1060 return rv;
1063 // Allocate a new p2m table for a domain.
1064 //
1065 // The structure of the p2m table is that of a pagetable for xen (i.e. it is
1066 // controlled by CONFIG_PAGING_LEVELS).
1067 //
1068 // Returns 0 if p2m table could not be initialized
1069 //
1070 static int
1071 shadow_alloc_p2m_table(struct domain *d)
1073 mfn_t p2m_top, mfn;
1074 struct list_head *entry;
1075 struct page_info *page;
1076 unsigned int page_count = 0;
1077 unsigned long gfn;
1079 SHADOW_PRINTK("allocating p2m table\n");
1080 ASSERT(pagetable_get_pfn(d->arch.phys_table) == 0);
1082 p2m_top = shadow_alloc_p2m_page(d);
1083 mfn_to_page(p2m_top)->count_info = 1;
1084 mfn_to_page(p2m_top)->u.inuse.type_info =
1085 #if CONFIG_PAGING_LEVELS == 4
1086 PGT_l4_page_table
1087 #elif CONFIG_PAGING_LEVELS == 3
1088 PGT_l3_page_table
1089 #elif CONFIG_PAGING_LEVELS == 2
1090 PGT_l2_page_table
1091 #endif
1092 | 1 | PGT_validated;
1094 if ( mfn_x(p2m_top) == 0 )
1095 return 0;
1097 d->arch.phys_table = pagetable_from_mfn(p2m_top);
1099 SHADOW_PRINTK("populating p2m table\n");
1101 /* Initialise physmap tables for slot zero. Other code assumes this. */
1102 gfn = 0;
1103 mfn = _mfn(INVALID_MFN);
1104 if ( !shadow_set_p2m_entry(d, gfn, mfn) )
1105 goto error;
1107 for ( entry = d->page_list.next;
1108 entry != &d->page_list;
1109 entry = entry->next )
1111 page = list_entry(entry, struct page_info, list);
1112 mfn = page_to_mfn(page);
1113 gfn = get_gpfn_from_mfn(mfn_x(mfn));
1114 page_count++;
1115 if (
1116 #ifdef __x86_64__
1117 (gfn != 0x5555555555555555L)
1118 #else
1119 (gfn != 0x55555555L)
1120 #endif
1121 && gfn != INVALID_M2P_ENTRY
1122 && !shadow_set_p2m_entry(d, gfn, mfn) )
1123 goto error;
1126 SHADOW_PRINTK("p2m table initialised (%u pages)\n", page_count);
1127 return 1;
1129 error:
1130 SHADOW_PRINTK("failed to initialize p2m table, gfn=%05lx, mfn=%"
1131 SH_PRI_mfn "\n", gfn, mfn_x(mfn));
1132 return 0;
1135 mfn_t
1136 sh_gfn_to_mfn_foreign(struct domain *d, unsigned long gpfn)
1137 /* Read another domain's p2m entries */
1139 mfn_t mfn;
1140 unsigned long addr = gpfn << PAGE_SHIFT;
1141 l2_pgentry_t *l2e;
1142 l1_pgentry_t *l1e;
1144 ASSERT(shadow_mode_translate(d));
1145 mfn = pagetable_get_mfn(d->arch.phys_table);
1148 if ( gpfn > d->arch.max_mapped_pfn )
1149 /* This pfn is higher than the highest the p2m map currently holds */
1150 return _mfn(INVALID_MFN);
1152 #if CONFIG_PAGING_LEVELS >= 4
1154 l4_pgentry_t *l4e = sh_map_domain_page(mfn);
1155 l4e += l4_table_offset(addr);
1156 if ( (l4e_get_flags(*l4e) & _PAGE_PRESENT) == 0 )
1158 sh_unmap_domain_page(l4e);
1159 return _mfn(INVALID_MFN);
1161 mfn = _mfn(l4e_get_pfn(*l4e));
1162 sh_unmap_domain_page(l4e);
1164 #endif
1165 #if CONFIG_PAGING_LEVELS >= 3
1167 l3_pgentry_t *l3e = sh_map_domain_page(mfn);
1168 l3e += l3_table_offset(addr);
1169 if ( (l3e_get_flags(*l3e) & _PAGE_PRESENT) == 0 )
1171 sh_unmap_domain_page(l3e);
1172 return _mfn(INVALID_MFN);
1174 mfn = _mfn(l3e_get_pfn(*l3e));
1175 sh_unmap_domain_page(l3e);
1177 #endif
1179 l2e = sh_map_domain_page(mfn);
1180 l2e += l2_table_offset(addr);
1181 if ( (l2e_get_flags(*l2e) & _PAGE_PRESENT) == 0 )
1183 sh_unmap_domain_page(l2e);
1184 return _mfn(INVALID_MFN);
1186 mfn = _mfn(l2e_get_pfn(*l2e));
1187 sh_unmap_domain_page(l2e);
1189 l1e = sh_map_domain_page(mfn);
1190 l1e += l1_table_offset(addr);
1191 if ( (l1e_get_flags(*l1e) & _PAGE_PRESENT) == 0 )
1193 sh_unmap_domain_page(l1e);
1194 return _mfn(INVALID_MFN);
1196 mfn = _mfn(l1e_get_pfn(*l1e));
1197 sh_unmap_domain_page(l1e);
1199 return mfn;
1202 unsigned long
1203 shadow_gfn_to_mfn_foreign(unsigned long gpfn)
1205 return mfn_x(sh_gfn_to_mfn_foreign(current->domain, gpfn));
1209 static void shadow_p2m_teardown(struct domain *d)
1210 /* Return all the p2m pages to Xen.
1211 * We know we don't have any extra mappings to these pages */
1213 struct list_head *entry, *n;
1214 struct page_info *pg;
1216 d->arch.phys_table = pagetable_null();
1218 list_for_each_safe(entry, n, &d->arch.shadow.p2m_inuse)
1220 pg = list_entry(entry, struct page_info, list);
1221 list_del(entry);
1222 /* Should have just the one ref we gave it in alloc_p2m_page() */
1223 if ( (pg->count_info & PGC_count_mask) != 1 )
1225 SHADOW_PRINTK("Odd p2m page count c=%#x t=%"PRtype_info"\n",
1226 pg->count_info, pg->u.inuse.type_info);
1228 ASSERT(page_get_owner(pg) == d);
1229 /* Free should not decrement domain's total allocation, since
1230 * these pages were allocated without an owner. */
1231 page_set_owner(pg, NULL);
1232 free_domheap_pages(pg, 0);
1233 d->arch.shadow.p2m_pages--;
1234 perfc_decr(shadow_alloc_count);
1236 list_for_each_safe(entry, n, &d->arch.shadow.p2m_freelist)
1238 list_del(entry);
1239 pg = list_entry(entry, struct page_info, list);
1240 ASSERT(page_get_owner(pg) == d);
1241 /* Free should not decrement domain's total allocation. */
1242 page_set_owner(pg, NULL);
1243 free_domheap_pages(pg, 0);
1244 d->arch.shadow.p2m_pages--;
1245 perfc_decr(shadow_alloc_count);
1247 ASSERT(d->arch.shadow.p2m_pages == 0);
1250 /* Set the pool of shadow pages to the required number of pages.
1251 * Input will be rounded up to at least shadow_min_acceptable_pages(),
1252 * plus space for the p2m table.
1253 * Returns 0 for success, non-zero for failure. */
1254 static unsigned int set_sh_allocation(struct domain *d,
1255 unsigned int pages,
1256 int *preempted)
1258 struct shadow_page_info *sp;
1259 unsigned int lower_bound;
1260 int j;
1262 ASSERT(shadow_lock_is_acquired(d));
1264 /* Don't allocate less than the minimum acceptable, plus one page per
1265 * megabyte of RAM (for the p2m table) */
1266 lower_bound = shadow_min_acceptable_pages(d) + (d->tot_pages / 256);
1267 if ( pages > 0 && pages < lower_bound )
1268 pages = lower_bound;
1269 /* Round up to largest block size */
1270 pages = (pages + ((1<<SHADOW_MAX_ORDER)-1)) & ~((1<<SHADOW_MAX_ORDER)-1);
1272 SHADOW_PRINTK("current %i target %i\n",
1273 d->arch.shadow.total_pages, pages);
1275 while ( d->arch.shadow.total_pages != pages )
1277 if ( d->arch.shadow.total_pages < pages )
1279 /* Need to allocate more memory from domheap */
1280 sp = (struct shadow_page_info *)
1281 alloc_domheap_pages(NULL, SHADOW_MAX_ORDER, 0);
1282 if ( sp == NULL )
1284 SHADOW_PRINTK("failed to allocate shadow pages.\n");
1285 return -ENOMEM;
1287 d->arch.shadow.free_pages += 1<<SHADOW_MAX_ORDER;
1288 d->arch.shadow.total_pages += 1<<SHADOW_MAX_ORDER;
1289 for ( j = 0; j < 1<<SHADOW_MAX_ORDER; j++ )
1291 sp[j].type = 0;
1292 sp[j].pinned = 0;
1293 sp[j].logdirty = 0;
1294 sp[j].count = 0;
1295 sp[j].mbz = 0;
1296 sp[j].tlbflush_timestamp = 0; /* Not in any TLB */
1298 sp->order = SHADOW_MAX_ORDER;
1299 list_add_tail(&sp->list,
1300 &d->arch.shadow.freelists[SHADOW_MAX_ORDER]);
1302 else if ( d->arch.shadow.total_pages > pages )
1304 /* Need to return memory to domheap */
1305 shadow_prealloc(d, SHADOW_MAX_ORDER);
1306 ASSERT(!list_empty(&d->arch.shadow.freelists[SHADOW_MAX_ORDER]));
1307 sp = list_entry(d->arch.shadow.freelists[SHADOW_MAX_ORDER].next,
1308 struct shadow_page_info, list);
1309 list_del(&sp->list);
1310 d->arch.shadow.free_pages -= 1<<SHADOW_MAX_ORDER;
1311 d->arch.shadow.total_pages -= 1<<SHADOW_MAX_ORDER;
1312 free_domheap_pages((struct page_info *)sp, SHADOW_MAX_ORDER);
1315 /* Check to see if we need to yield and try again */
1316 if ( preempted && hypercall_preempt_check() )
1318 *preempted = 1;
1319 return 0;
1323 return 0;
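/*
 * Standalone sketch (not part of this file): the rounding used above when
 * sizing the pool -- the target page count is rounded up to a whole number
 * of max-order chunks with an add-then-mask, here using 4-page (order 2)
 * chunks.  Names are hypothetical.
 */
#include <stdio.h>

#define DEMO_MAX_ORDER 2

static unsigned int round_to_chunks(unsigned int pages)
{
    unsigned int chunk = 1u << DEMO_MAX_ORDER;
    return (pages + chunk - 1) & ~(chunk - 1);
}

int main(void)
{
    printf("%u %u %u\n", round_to_chunks(1), round_to_chunks(4),
           round_to_chunks(9));                   /* prints: 4 4 12 */
    return 0;
}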
1326 unsigned int shadow_set_allocation(struct domain *d,
1327 unsigned int megabytes,
1328 int *preempted)
1329 /* Hypercall interface to set the shadow memory allocation */
1331 unsigned int rv;
1332 shadow_lock(d);
1333 rv = set_sh_allocation(d, megabytes << (20 - PAGE_SHIFT), preempted);
1334 SHADOW_PRINTK("dom %u allocation now %u pages (%u MB)\n",
1335 d->domain_id,
1336 d->arch.shadow.total_pages,
1337 shadow_get_allocation(d));
1338 shadow_unlock(d);
1339 return rv;
1342 /**************************************************************************/
1343 /* Hash table for storing the guest->shadow mappings.
1344 * The table itself is an array of pointers to shadows; the shadows are then
1345 * threaded on a singly-linked list of shadows with the same hash value */
1347 #define SHADOW_HASH_BUCKETS 251
1348 /* Other possibly useful primes are 509, 1021, 2039, 4093, 8191, 16381 */
1350 /* Hash function that takes a gfn or mfn, plus another byte of type info */
1351 typedef u32 key_t;
1352 static inline key_t sh_hash(unsigned long n, unsigned int t)
1354 unsigned char *p = (unsigned char *)&n;
1355 key_t k = t;
1356 int i;
1357 for ( i = 0; i < sizeof(n) ; i++ ) k = (u32)p[i] + (k<<6) + (k<<16) - k;
1358 return k % SHADOW_HASH_BUCKETS;
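/*
 * Standalone sketch (not part of this file): the bucket hash above in
 * isolation -- an sdbm-style mix of the gfn/mfn bytes seeded with the
 * shadow type, reduced modulo a prime bucket count.  Only the demo names
 * and sample inputs below are invented.
 */
#include <stdio.h>
#include <stdint.h>

#define DEMO_HASH_BUCKETS 251

static uint32_t demo_hash(unsigned long n, unsigned int t)
{
    unsigned char *p = (unsigned char *)&n;
    uint32_t k = t;
    unsigned int i;

    for ( i = 0; i < sizeof(n); i++ )
        k = (uint32_t)p[i] + (k << 6) + (k << 16) - k;
    return k % DEMO_HASH_BUCKETS;
}

int main(void)
{
    unsigned long mfn;
    /* Consecutive mfns of the same type should spread across buckets. */
    for ( mfn = 0x1000; mfn < 0x1008; mfn++ )
        printf("mfn %#lx -> bucket %u\n", mfn, demo_hash(mfn, 1));
    return 0;
}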
1361 #if SHADOW_AUDIT & (SHADOW_AUDIT_HASH|SHADOW_AUDIT_HASH_FULL)
1363 /* Before we get to the mechanism, define a pair of audit functions
1364 * that sanity-check the contents of the hash table. */
1365 static void sh_hash_audit_bucket(struct domain *d, int bucket)
1366 /* Audit one bucket of the hash table */
1368 struct shadow_page_info *sp, *x;
1370 if ( !(SHADOW_AUDIT_ENABLE) )
1371 return;
1373 sp = d->arch.shadow.hash_table[bucket];
1374 while ( sp )
1376 /* Not a shadow? */
1377 BUG_ON( sp->mbz != 0 );
1378 /* Bogus type? */
1379 BUG_ON( sp->type == 0 );
1380 BUG_ON( sp->type > SH_type_max_shadow );
1381 /* Wrong bucket? */
1382 BUG_ON( sh_hash(sp->backpointer, sp->type) != bucket );
1383 /* Duplicate entry? */
1384 for ( x = sp->next_shadow; x; x = x->next_shadow )
1385 BUG_ON( x->backpointer == sp->backpointer && x->type == sp->type );
1386 /* Follow the backpointer to the guest pagetable */
1387 if ( sp->type != SH_type_fl1_32_shadow
1388 && sp->type != SH_type_fl1_pae_shadow
1389 && sp->type != SH_type_fl1_64_shadow )
1391 struct page_info *gpg = mfn_to_page(_mfn(sp->backpointer));
1392 /* Bad shadow flags on guest page? */
1393 BUG_ON( !(gpg->shadow_flags & (1<<sp->type)) );
1394 /* Bad type count on guest page? */
1395 if ( (gpg->u.inuse.type_info & PGT_type_mask) == PGT_writable_page
1396 && (gpg->u.inuse.type_info & PGT_count_mask) != 0 )
1398 SHADOW_ERROR("MFN %#lx shadowed (by %#"SH_PRI_mfn")"
1399 " but has typecount %#lx\n",
1400 sp->backpointer, mfn_x(shadow_page_to_mfn(sp)),
1401 gpg->u.inuse.type_info);
1402 BUG();
1405 /* That entry was OK; on we go */
1406 sp = sp->next_shadow;
1410 #else
1411 #define sh_hash_audit_bucket(_d, _b) do {} while(0)
1412 #endif /* Hashtable bucket audit */
1415 #if SHADOW_AUDIT & SHADOW_AUDIT_HASH_FULL
1417 static void sh_hash_audit(struct domain *d)
1418 /* Full audit: audit every bucket in the table */
1420 int i;
1422 if ( !(SHADOW_AUDIT_ENABLE) )
1423 return;
1425 for ( i = 0; i < SHADOW_HASH_BUCKETS; i++ )
1427 sh_hash_audit_bucket(d, i);
1431 #else
1432 #define sh_hash_audit(_d) do {} while(0)
1433 #endif /* Hashtable bucket audit */
1435 /* Allocate and initialise the table itself.
1436 * Returns 0 for success, 1 for error. */
1437 static int shadow_hash_alloc(struct domain *d)
1439 struct shadow_page_info **table;
1441 ASSERT(shadow_lock_is_acquired(d));
1442 ASSERT(!d->arch.shadow.hash_table);
1444 table = xmalloc_array(struct shadow_page_info *, SHADOW_HASH_BUCKETS);
1445 if ( !table ) return 1;
1446 memset(table, 0,
1447 SHADOW_HASH_BUCKETS * sizeof (struct shadow_page_info *));
1448 d->arch.shadow.hash_table = table;
1449 return 0;
1452 /* Tear down the hash table and return all memory to Xen.
1453 * This function does not care whether the table is populated. */
1454 static void shadow_hash_teardown(struct domain *d)
1456 ASSERT(shadow_lock_is_acquired(d));
1457 ASSERT(d->arch.shadow.hash_table);
1459 xfree(d->arch.shadow.hash_table);
1460 d->arch.shadow.hash_table = NULL;
1464 mfn_t shadow_hash_lookup(struct vcpu *v, unsigned long n, unsigned int t)
1465 /* Find an entry in the hash table. Returns the MFN of the shadow,
1466 * or INVALID_MFN if it doesn't exist */
1468 struct domain *d = v->domain;
1469 struct shadow_page_info *sp, *prev;
1470 key_t key;
1472 ASSERT(shadow_lock_is_acquired(d));
1473 ASSERT(d->arch.shadow.hash_table);
1474 ASSERT(t);
1476 sh_hash_audit(d);
1478 perfc_incrc(shadow_hash_lookups);
1479 key = sh_hash(n, t);
1480 sh_hash_audit_bucket(d, key);
1482 sp = d->arch.shadow.hash_table[key];
1483 prev = NULL;
1484 while(sp)
1486 if ( sp->backpointer == n && sp->type == t )
1488 /* Pull-to-front if 'sp' isn't already the head item */
1489 if ( unlikely(sp != d->arch.shadow.hash_table[key]) )
1491 if ( unlikely(d->arch.shadow.hash_walking != 0) )
1492 /* Can't reorder: someone is walking the hash chains */
1493 return shadow_page_to_mfn(sp);
1494 else
1496 ASSERT(prev);
1497 /* Delete sp from the list */
1498 prev->next_shadow = sp->next_shadow;
1499 /* Re-insert it at the head of the list */
1500 sp->next_shadow = d->arch.shadow.hash_table[key];
1501 d->arch.shadow.hash_table[key] = sp;
1504 else
1506 perfc_incrc(shadow_hash_lookup_head);
1508 return shadow_page_to_mfn(sp);
1510 prev = sp;
1511 sp = sp->next_shadow;
1514 perfc_incrc(shadow_hash_lookup_miss);
1515 return _mfn(INVALID_MFN);
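/*
 * Standalone sketch (not part of this file) of the pull-to-front lookup
 * used by shadow_hash_lookup() above: on a hit that is not already at the
 * head of its singly-linked chain, the entry is unlinked and re-inserted
 * at the head so hot entries stay cheap to find.  (The real code skips
 * the reordering while a walk of the chains is in progress.)  The names
 * below are hypothetical.
 */
#include <stdio.h>
#include <stddef.h>

struct node {
    unsigned long key;
    struct node *next;
};

static struct node *lookup_mtf(struct node **head, unsigned long key)
{
    struct node *n = *head, *prev = NULL;

    for ( ; n != NULL; prev = n, n = n->next )
    {
        if ( n->key != key )
            continue;
        if ( prev != NULL )            /* hit, but not at the head: move it */
        {
            prev->next = n->next;
            n->next = *head;
            *head = n;
        }
        return n;
    }
    return NULL;
}

int main(void)
{
    struct node c = { 3, NULL }, b = { 2, &c }, a = { 1, &b };
    struct node *head = &a;

    lookup_mtf(&head, 3);              /* chain becomes 3, 1, 2 */
    for ( struct node *n = head; n; n = n->next )
        printf("%lu ", n->key);
    printf("\n");
    return 0;
}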
1518 void shadow_hash_insert(struct vcpu *v, unsigned long n, unsigned int t,
1519 mfn_t smfn)
1520 /* Put a mapping (n,t)->smfn into the hash table */
1522 struct domain *d = v->domain;
1523 struct shadow_page_info *sp;
1524 key_t key;
1526 ASSERT(shadow_lock_is_acquired(d));
1527 ASSERT(d->arch.shadow.hash_table);
1528 ASSERT(t);
1530 sh_hash_audit(d);
1532 perfc_incrc(shadow_hash_inserts);
1533 key = sh_hash(n, t);
1534 sh_hash_audit_bucket(d, key);
1536 /* Insert this shadow at the top of the bucket */
1537 sp = mfn_to_shadow_page(smfn);
1538 sp->next_shadow = d->arch.shadow.hash_table[key];
1539 d->arch.shadow.hash_table[key] = sp;
1541 sh_hash_audit_bucket(d, key);
1544 void shadow_hash_delete(struct vcpu *v, unsigned long n, unsigned int t,
1545 mfn_t smfn)
1546 /* Excise the mapping (n,t)->smfn from the hash table */
1548 struct domain *d = v->domain;
1549 struct shadow_page_info *sp, *x;
1550 key_t key;
1552 ASSERT(shadow_lock_is_acquired(d));
1553 ASSERT(d->arch.shadow.hash_table);
1554 ASSERT(t);
1556 sh_hash_audit(d);
1558 perfc_incrc(shadow_hash_deletes);
1559 key = sh_hash(n, t);
1560 sh_hash_audit_bucket(d, key);
1562 sp = mfn_to_shadow_page(smfn);
1563 if ( d->arch.shadow.hash_table[key] == sp )
1564 /* Easy case: we're deleting the head item. */
1565 d->arch.shadow.hash_table[key] = sp->next_shadow;
1566 else
1568 /* Need to search for the one we want */
1569 x = d->arch.shadow.hash_table[key];
1570 while ( 1 )
1572 ASSERT(x); /* We can't have hit the end, since our target is
1573 * still in the chain somewhere... */
1574 if ( x->next_shadow == sp )
1576 x->next_shadow = sp->next_shadow;
1577 break;
1579 x = x->next_shadow;
1582 sp->next_shadow = NULL;
1584 sh_hash_audit_bucket(d, key);
1587 typedef int (*hash_callback_t)(struct vcpu *v, mfn_t smfn, mfn_t other_mfn);
1589 static void hash_foreach(struct vcpu *v,
1590 unsigned int callback_mask,
1591 hash_callback_t callbacks[],
1592 mfn_t callback_mfn)
1593 /* Walk the hash table looking at the types of the entries and
1594 * calling the appropriate callback function for each entry.
1595 * The mask determines which shadow types we call back for, and the array
1596 * of callbacks tells us which function to call.
1597 * Any callback may return non-zero to let us skip the rest of the scan.
1599 * WARNING: Callbacks MUST NOT add or remove hash entries unless they
1600 * then return non-zero to terminate the scan. */
1602 int i, done = 0;
1603 struct domain *d = v->domain;
1604 struct shadow_page_info *x;
1606 /* Say we're here, to stop hash-lookups reordering the chains */
1607 ASSERT(shadow_lock_is_acquired(d));
1608 ASSERT(d->arch.shadow.hash_walking == 0);
1609 d->arch.shadow.hash_walking = 1;
1611 for ( i = 0; i < SHADOW_HASH_BUCKETS; i++ )
1613 /* WARNING: This is not safe against changes to the hash table.
1614 * The callback *must* return non-zero if it has inserted or
1615 * deleted anything from the hash (lookups are OK, though). */
1616 for ( x = d->arch.shadow.hash_table[i]; x; x = x->next_shadow )
1618 if ( callback_mask & (1 << x->type) )
1620 ASSERT(x->type <= 15);
1621 ASSERT(callbacks[x->type] != NULL);
1622 done = callbacks[x->type](v, shadow_page_to_mfn(x),
1623 callback_mfn);
1624 if ( done ) break;
1627 if ( done ) break;
1629 d->arch.shadow.hash_walking = 0;
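/*
 * Standalone sketch (not part of this file) of the mask-plus-table
 * dispatch used by hash_foreach() above: each entry type is a small
 * integer, the mask selects which types to visit, and the table maps a
 * type to its handler; a non-zero return from a handler stops the walk.
 * All names below are hypothetical.
 */
#include <stdio.h>

enum { T_NONE, T_L1, T_L2, T_MAX };

typedef int (*handler_t)(unsigned int idx);

static int show(unsigned int idx) { printf("entry %u\n", idx); return 0; }

static handler_t handlers[T_MAX] = { NULL, show, show };

static void foreach_typed(const unsigned int *types, unsigned int n,
                          unsigned int mask)
{
    unsigned int i;
    for ( i = 0; i < n; i++ )
        if ( (mask & (1u << types[i])) && handlers[types[i]](i) )
            break;                     /* a callback may stop the walk early */
}

int main(void)
{
    unsigned int types[] = { T_L1, T_NONE, T_L2, T_L1 };
    foreach_typed(types, 4, (1u << T_L1) | (1u << T_L2));
    return 0;
}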
1633 /**************************************************************************/
1634 /* Destroy a shadow page: simple dispatcher to call the per-type destructor
1635 * which will decrement refcounts appropriately and return memory to the
1636 * free pool. */
1638 void sh_destroy_shadow(struct vcpu *v, mfn_t smfn)
1640 struct shadow_page_info *sp = mfn_to_shadow_page(smfn);
1641 unsigned int t = sp->type;
1644 SHADOW_PRINTK("smfn=%#lx\n", mfn_x(smfn));
1646 /* Double-check, if we can, that the shadowed page belongs to this
1647 * domain, (by following the back-pointer). */
1648 ASSERT(t == SH_type_fl1_32_shadow ||
1649 t == SH_type_fl1_pae_shadow ||
1650 t == SH_type_fl1_64_shadow ||
1651 t == SH_type_monitor_table ||
1652 (page_get_owner(mfn_to_page(_mfn(sp->backpointer)))
1653 == v->domain));
1655 /* The down-shifts here are so that the switch statement is on nice
1656 * small numbers that the compiler will enjoy */
1657 switch ( t )
1659 #if CONFIG_PAGING_LEVELS == 2
1660 case SH_type_l1_32_shadow:
1661 case SH_type_fl1_32_shadow:
1662 SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 2, 2)(v, smfn);
1663 break;
1664 case SH_type_l2_32_shadow:
1665 SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 2, 2)(v, smfn);
1666 break;
1667 #else /* PAE or 64bit */
1668 case SH_type_l1_32_shadow:
1669 case SH_type_fl1_32_shadow:
1670 SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 3, 2)(v, smfn);
1671 break;
1672 case SH_type_l2_32_shadow:
1673 SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 3, 2)(v, smfn);
1674 break;
1675 #endif
1677 #if CONFIG_PAGING_LEVELS >= 3
1678 case SH_type_l1_pae_shadow:
1679 case SH_type_fl1_pae_shadow:
1680 SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 3, 3)(v, smfn);
1681 break;
1682 case SH_type_l2_pae_shadow:
1683 case SH_type_l2h_pae_shadow:
1684 SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 3, 3)(v, smfn);
1685 break;
1686 #endif
1688 #if CONFIG_PAGING_LEVELS >= 4
1689 case SH_type_l1_64_shadow:
1690 case SH_type_fl1_64_shadow:
1691 SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 4, 4)(v, smfn);
1692 break;
1693 case SH_type_l2_64_shadow:
1694 SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 4, 4)(v, smfn);
1695 break;
1696 case SH_type_l3_64_shadow:
1697 SHADOW_INTERNAL_NAME(sh_destroy_l3_shadow, 4, 4)(v, smfn);
1698 break;
1699 case SH_type_l4_64_shadow:
1700 SHADOW_INTERNAL_NAME(sh_destroy_l4_shadow, 4, 4)(v, smfn);
1701 break;
1702 #endif
1703 default:
1704 SHADOW_PRINTK("tried to destroy shadow of bad type %08lx\n",
1705 (unsigned long)t);
1706 BUG();
1710 /**************************************************************************/
1711 /* Remove all writeable mappings of a guest frame from the shadow tables
1712 * Returns non-zero if we need to flush TLBs.
1713 * level and fault_addr describe how we found this to be a pagetable;
1714 * level==0 means we have some other reason for revoking write access.*/
1716 int shadow_remove_write_access(struct vcpu *v, mfn_t gmfn,
1717 unsigned int level,
1718 unsigned long fault_addr)
1720 /* Dispatch table for getting per-type functions */
1721 static hash_callback_t callbacks[16] = {
1722 NULL, /* none */
1723 #if CONFIG_PAGING_LEVELS == 2
1724 SHADOW_INTERNAL_NAME(sh_remove_write_access,2,2), /* l1_32 */
1725 SHADOW_INTERNAL_NAME(sh_remove_write_access,2,2), /* fl1_32 */
1726 #else
1727 SHADOW_INTERNAL_NAME(sh_remove_write_access,3,2), /* l1_32 */
1728 SHADOW_INTERNAL_NAME(sh_remove_write_access,3,2), /* fl1_32 */
1729 #endif
1730 NULL, /* l2_32 */
1731 #if CONFIG_PAGING_LEVELS >= 3
1732 SHADOW_INTERNAL_NAME(sh_remove_write_access,3,3), /* l1_pae */
1733 SHADOW_INTERNAL_NAME(sh_remove_write_access,3,3), /* fl1_pae */
1734 #else
1735 NULL, /* l1_pae */
1736 NULL, /* fl1_pae */
1737 #endif
1738 NULL, /* l2_pae */
1739 NULL, /* l2h_pae */
1740 #if CONFIG_PAGING_LEVELS >= 4
1741 SHADOW_INTERNAL_NAME(sh_remove_write_access,4,4), /* l1_64 */
1742 SHADOW_INTERNAL_NAME(sh_remove_write_access,4,4), /* fl1_64 */
1743 #else
1744 NULL, /* l1_64 */
1745 NULL, /* fl1_64 */
1746 #endif
1747 NULL, /* l2_64 */
1748 NULL, /* l3_64 */
1749 NULL, /* l4_64 */
1750 NULL, /* p2m */
1751 NULL /* unused */
1752 };
1754 static unsigned int callback_mask =
1755 1 << SH_type_l1_32_shadow
1756 | 1 << SH_type_fl1_32_shadow
1757 | 1 << SH_type_l1_pae_shadow
1758 | 1 << SH_type_fl1_pae_shadow
1759 | 1 << SH_type_l1_64_shadow
1760 | 1 << SH_type_fl1_64_shadow
1762 struct page_info *pg = mfn_to_page(gmfn);
1764 ASSERT(shadow_lock_is_acquired(v->domain));
1766 /* Only remove writable mappings if we are doing shadow refcounts.
1767 * In guest refcounting, we trust Xen to already be restricting
1768 * all the writes to the guest page tables, so we do not need to
1769 * do more. */
1770 if ( !shadow_mode_refcounts(v->domain) )
1771 return 0;
1773 /* Early exit if it's already a pagetable, or otherwise not writeable */
1774 if ( sh_mfn_is_a_page_table(gmfn)
1775 || (pg->u.inuse.type_info & PGT_count_mask) == 0 )
1776 return 0;
1778 perfc_incrc(shadow_writeable);
1780 /* If this isn't a "normal" writeable page, the domain is trying to
1781 * put pagetables in special memory of some kind. We can't allow that. */
1782 if ( (pg->u.inuse.type_info & PGT_type_mask) != PGT_writable_page )
1784 SHADOW_ERROR("can't remove write access to mfn %lx, type_info is %"
1785 PRtype_info "\n",
1786 mfn_x(gmfn), mfn_to_page(gmfn)->u.inuse.type_info);
1787 domain_crash(v->domain);
1790 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
1791 if ( v == current && level != 0 )
1793 unsigned long gfn;
1794 /* Heuristic: there is likely to be only one writeable mapping,
1795 * and that mapping is likely to be in the current pagetable,
1796 * in the guest's linear map (on non-HIGHPTE linux and windows)*/
1798 #define GUESS(_a, _h) do { \
1799 if ( v->arch.shadow.mode->guess_wrmap(v, (_a), gmfn) ) \
1800 perfc_incrc(shadow_writeable_h_ ## _h); \
1801 if ( (pg->u.inuse.type_info & PGT_count_mask) == 0 ) \
1802 return 1; \
1803 } while (0)
1806 if ( v->arch.shadow.mode->guest_levels == 2 )
1808 if ( level == 1 )
1809 /* 32bit non-PAE w2k3: linear map at 0xC0000000 */
1810 GUESS(0xC0000000UL + (fault_addr >> 10), 1);
1812 /* Linux lowmem: first 896MB is mapped 1-to-1 above 0xC0000000 */
1813 if ((gfn = sh_mfn_to_gfn(v->domain, gmfn)) < 0x38000 )
1814 GUESS(0xC0000000UL + (gfn << PAGE_SHIFT), 4);
1817 #if CONFIG_PAGING_LEVELS >= 3
1818 else if ( v->arch.shadow.mode->guest_levels == 3 )
1820 /* 32bit PAE w2k3: linear map at 0xC0000000 */
1821 switch ( level )
1823 case 1: GUESS(0xC0000000UL + (fault_addr >> 9), 2); break;
1824 case 2: GUESS(0xC0600000UL + (fault_addr >> 18), 2); break;
1827 /* Linux lowmem: first 896MB is mapped 1-to-1 above 0xC0000000 */
1828 if ((gfn = sh_mfn_to_gfn(v->domain, gmfn)) < 0x38000 )
1829 GUESS(0xC0000000UL + (gfn << PAGE_SHIFT), 4);
1831 #if CONFIG_PAGING_LEVELS >= 4
1832 else if ( v->arch.shadow.mode->guest_levels == 4 )
1834 /* 64bit w2k3: linear map at 0x0000070000000000 */
1835 switch ( level )
1837 case 1: GUESS(0x70000000000UL + (fault_addr >> 9), 3); break;
1838 case 2: GUESS(0x70380000000UL + (fault_addr >> 18), 3); break;
1839 case 3: GUESS(0x70381C00000UL + (fault_addr >> 27), 3); break;
1842 /* 64bit Linux direct map at 0xffff810000000000; older kernels
1843 * had it at 0x0000010000000000UL */
1844 gfn = sh_mfn_to_gfn(v->domain, gmfn);
1845 GUESS(0xffff810000000000UL + (gfn << PAGE_SHIFT), 4);
1846 GUESS(0x0000010000000000UL + (gfn << PAGE_SHIFT), 4);
1848 #endif /* CONFIG_PAGING_LEVELS >= 4 */
1849 #endif /* CONFIG_PAGING_LEVELS >= 3 */
1851 #undef GUESS
1854 if ( (pg->u.inuse.type_info & PGT_count_mask) == 0 )
1855 return 1;
1857 /* Second heuristic: on HIGHPTE linux, there are two particular PTEs
1858 * (entries in the fixmap) where linux maps its pagetables. Since
1859 * we expect to hit them most of the time, we start the search for
1860 * the writeable mapping by looking at the same MFN where the last
1861 * brute-force search succeeded. */
1863 if ( v->arch.shadow.last_writeable_pte_smfn != 0 )
1865 unsigned long old_count = (pg->u.inuse.type_info & PGT_count_mask);
1866 mfn_t last_smfn = _mfn(v->arch.shadow.last_writeable_pte_smfn);
1867 int shtype = mfn_to_shadow_page(last_smfn)->type;
1869 if ( callbacks[shtype] )
1870 callbacks[shtype](v, last_smfn, gmfn);
1872 if ( (pg->u.inuse.type_info & PGT_count_mask) != old_count )
1873 perfc_incrc(shadow_writeable_h_5);
1876 if ( (pg->u.inuse.type_info & PGT_count_mask) == 0 )
1877 return 1;
1879 #endif /* SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC */
1881 /* Brute-force search of all the shadows, by walking the hash */
1882 perfc_incrc(shadow_writeable_bf);
1883 hash_foreach(v, callback_mask, callbacks, gmfn);
1885 /* If that didn't catch the mapping, something is very wrong */
1886 if ( (mfn_to_page(gmfn)->u.inuse.type_info & PGT_count_mask) != 0 )
1888 SHADOW_ERROR("can't find all writeable mappings of mfn %lx: "
1889 "%lu left\n", mfn_x(gmfn),
1890 (mfn_to_page(gmfn)->u.inuse.type_info&PGT_count_mask));
1891 domain_crash(v->domain);
1894 /* We killed at least one writeable mapping, so must flush TLBs. */
1895 return 1;
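/* Illustration (sketch, not in the original file): the dispatch pattern the
* callbacks[]/callback_mask tables above rely on. hash_foreach() is assumed
* to visit each shadow page 'sp' and, when that page's type bit is set in
* the mask, call the matching per-type function with the shadow's own mfn
* ('sp' and 'smfn' are placeholder names): */
#if 0
if ( callback_mask & (1u << sp->type) )
callbacks[sp->type](v, smfn, gmfn);
#endif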
1900 /**************************************************************************/
1901 /* Remove all mappings of a guest frame from the shadow tables.
1902 * Returns non-zero if we need to flush TLBs. */
1904 int shadow_remove_all_mappings(struct vcpu *v, mfn_t gmfn)
1906 struct page_info *page = mfn_to_page(gmfn);
1907 int expected_count;
1909 /* Dispatch table for getting per-type functions */
1910 static hash_callback_t callbacks[16] = {
1911 NULL, /* none */
1912 #if CONFIG_PAGING_LEVELS == 2
1913 SHADOW_INTERNAL_NAME(sh_remove_all_mappings,2,2), /* l1_32 */
1914 SHADOW_INTERNAL_NAME(sh_remove_all_mappings,2,2), /* fl1_32 */
1915 #else
1916 SHADOW_INTERNAL_NAME(sh_remove_all_mappings,3,2), /* l1_32 */
1917 SHADOW_INTERNAL_NAME(sh_remove_all_mappings,3,2), /* fl1_32 */
1918 #endif
1919 NULL, /* l2_32 */
1920 #if CONFIG_PAGING_LEVELS >= 3
1921 SHADOW_INTERNAL_NAME(sh_remove_all_mappings,3,3), /* l1_pae */
1922 SHADOW_INTERNAL_NAME(sh_remove_all_mappings,3,3), /* fl1_pae */
1923 #else
1924 NULL, /* l1_pae */
1925 NULL, /* fl1_pae */
1926 #endif
1927 NULL, /* l2_pae */
1928 NULL, /* l2h_pae */
1929 #if CONFIG_PAGING_LEVELS >= 4
1930 SHADOW_INTERNAL_NAME(sh_remove_all_mappings,4,4), /* l1_64 */
1931 SHADOW_INTERNAL_NAME(sh_remove_all_mappings,4,4), /* fl1_64 */
1932 #else
1933 NULL, /* l1_64 */
1934 NULL, /* fl1_64 */
1935 #endif
1936 NULL, /* l2_64 */
1937 NULL, /* l3_64 */
1938 NULL, /* l4_64 */
1939 NULL, /* p2m */
1940 NULL /* unused */
1941 };
1943 static unsigned int callback_mask =
1944 1 << SH_type_l1_32_shadow
1945 | 1 << SH_type_fl1_32_shadow
1946 | 1 << SH_type_l1_pae_shadow
1947 | 1 << SH_type_fl1_pae_shadow
1948 | 1 << SH_type_l1_64_shadow
1949 | 1 << SH_type_fl1_64_shadow
1952 perfc_incrc(shadow_mappings);
1953 if ( (page->count_info & PGC_count_mask) == 0 )
1954 return 0;
1956 ASSERT(shadow_lock_is_acquired(v->domain));
1958 /* XXX TODO:
1959 * Heuristics for finding the (probably) single mapping of this gmfn */
1961 /* Brute-force search of all the shadows, by walking the hash */
1962 perfc_incrc(shadow_mappings_bf);
1963 hash_foreach(v, callback_mask, callbacks, gmfn);
1965 /* If that didn't catch the mapping, something is very wrong */
1966 expected_count = (page->count_info & PGC_allocated) ? 1 : 0;
1967 if ( (page->count_info & PGC_count_mask) != expected_count )
1969 /* Don't complain if we're in HVM and there's one extra mapping:
1970 * The qemu helper process has an untyped mapping of this dom's RAM */
1971 if ( !(shadow_mode_external(v->domain)
1972 && (page->count_info & PGC_count_mask) <= 2
1973 && (page->u.inuse.type_info & PGT_count_mask) == 0) )
1975 SHADOW_ERROR("can't find all mappings of mfn %lx: "
1976 "c=%08x t=%08lx\n", mfn_x(gmfn),
1977 page->count_info, page->u.inuse.type_info);
1981 /* We killed at least one mapping, so must flush TLBs. */
1982 return 1;
1986 /**************************************************************************/
1987 /* Remove all shadows of a guest frame from the shadow tables */
1989 static int sh_remove_shadow_via_pointer(struct vcpu *v, mfn_t smfn)
1990 /* Follow this shadow's up-pointer, if it has one, and remove the reference
1991 * found there. Returns 1 if that was the only reference to this shadow */
1993 struct shadow_page_info *sp = mfn_to_shadow_page(smfn);
1994 mfn_t pmfn;
1995 void *vaddr;
1996 int rc;
1998 ASSERT(sp->type > 0);
1999 ASSERT(sp->type < SH_type_max_shadow);
2000 ASSERT(sp->type != SH_type_l2_32_shadow);
2001 ASSERT(sp->type != SH_type_l2_pae_shadow);
2002 ASSERT(sp->type != SH_type_l2h_pae_shadow);
2003 ASSERT(sp->type != SH_type_l4_64_shadow);
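/* sp->up packs the location of the single expected reference to this shadow:
* the mfn of the shadow table holding the reference in the upper bits, and
* the byte offset of the referencing entry within that page in the low
* PAGE_SHIFT bits, as decoded below. */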
2005 if (sp->up == 0) return 0;
2006 pmfn = _mfn(sp->up >> PAGE_SHIFT);
2007 ASSERT(mfn_valid(pmfn));
2008 vaddr = sh_map_domain_page(pmfn);
2009 ASSERT(vaddr);
2010 vaddr += sp->up & (PAGE_SIZE-1);
2011 ASSERT(l1e_get_pfn(*(l1_pgentry_t *)vaddr) == mfn_x(smfn));
2013 /* Is this the only reference to this shadow? */
2014 rc = (sp->count == 1) ? 1 : 0;
2016 /* Blank the offending entry */
2017 switch (sp->type)
2019 case SH_type_l1_32_shadow:
2020 case SH_type_l2_32_shadow:
2021 #if CONFIG_PAGING_LEVELS == 2
2022 SHADOW_INTERNAL_NAME(sh_clear_shadow_entry,2,2)(v, vaddr, pmfn);
2023 #else
2024 SHADOW_INTERNAL_NAME(sh_clear_shadow_entry,3,2)(v, vaddr, pmfn);
2025 #endif
2026 break;
2027 #if CONFIG_PAGING_LEVELS >=3
2028 case SH_type_l1_pae_shadow:
2029 case SH_type_l2_pae_shadow:
2030 case SH_type_l2h_pae_shadow:
2031 SHADOW_INTERNAL_NAME(sh_clear_shadow_entry,3,3)(v, vaddr, pmfn);
2032 break;
2033 #if CONFIG_PAGING_LEVELS >= 4
2034 case SH_type_l1_64_shadow:
2035 case SH_type_l2_64_shadow:
2036 case SH_type_l3_64_shadow:
2037 case SH_type_l4_64_shadow:
2038 SHADOW_INTERNAL_NAME(sh_clear_shadow_entry,4,4)(v, vaddr, pmfn);
2039 break;
2040 #endif
2041 #endif
2042 default: BUG(); /* Some weird unknown shadow type */
2045 sh_unmap_domain_page(vaddr);
2046 if ( rc )
2047 perfc_incrc(shadow_up_pointer);
2048 else
2049 perfc_incrc(shadow_unshadow_bf);
2051 return rc;
2054 void sh_remove_shadows(struct vcpu *v, mfn_t gmfn, int fast, int all)
2055 /* Remove the shadows of this guest page.
2056 * If fast != 0, just try the quick heuristic, which will remove
2057 * at most one reference to each shadow of the page. Otherwise, walk
2058 * all the shadow tables looking for refs to shadows of this gmfn.
2059 * If all != 0, kill the domain if we can't find all the shadows.
2060 * (all != 0 implies fast == 0)
2061 */
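/* (shadow_remove_all_shadows(), used later in this file, is assumed to be a
* wrapper that calls this with fast == 0, all == 1 -- i.e. be thorough and
* must succeed.) */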
2063 struct page_info *pg;
2064 mfn_t smfn;
2065 u32 sh_flags;
2066 unsigned char t;
2068 /* Dispatch table for getting per-type functions: each level must
2069 * be called with the function to remove a lower-level shadow. */
2070 static hash_callback_t callbacks[16] = {
2071 NULL, /* none */
2072 NULL, /* l1_32 */
2073 NULL, /* fl1_32 */
2074 #if CONFIG_PAGING_LEVELS == 2
2075 SHADOW_INTERNAL_NAME(sh_remove_l1_shadow,2,2), /* l2_32 */
2076 #else
2077 SHADOW_INTERNAL_NAME(sh_remove_l1_shadow,3,2), /* l2_32 */
2078 #endif
2079 NULL, /* l1_pae */
2080 NULL, /* fl1_pae */
2081 #if CONFIG_PAGING_LEVELS >= 3
2082 SHADOW_INTERNAL_NAME(sh_remove_l1_shadow,3,3), /* l2_pae */
2083 SHADOW_INTERNAL_NAME(sh_remove_l1_shadow,3,3), /* l2h_pae */
2084 #else
2085 NULL, /* l2_pae */
2086 NULL, /* l2h_pae */
2087 #endif
2088 NULL, /* l1_64 */
2089 NULL, /* fl1_64 */
2090 #if CONFIG_PAGING_LEVELS >= 4
2091 SHADOW_INTERNAL_NAME(sh_remove_l1_shadow,4,4), /* l2_64 */
2092 SHADOW_INTERNAL_NAME(sh_remove_l2_shadow,4,4), /* l3_64 */
2093 SHADOW_INTERNAL_NAME(sh_remove_l3_shadow,4,4), /* l4_64 */
2094 #else
2095 NULL, /* l2_64 */
2096 NULL, /* l3_64 */
2097 NULL, /* l4_64 */
2098 #endif
2099 NULL, /* p2m */
2100 NULL /* unused */
2101 };
2103 /* Another lookup table, for choosing which mask to use */
2104 static unsigned int masks[16] = {
2105 0, /* none */
2106 1 << SH_type_l2_32_shadow, /* l1_32 */
2107 0, /* fl1_32 */
2108 0, /* l2_32 */
2109 ((1 << SH_type_l2h_pae_shadow)
2110 | (1 << SH_type_l2_pae_shadow)), /* l1_pae */
2111 0, /* fl1_pae */
2112 0, /* l2_pae */
2113 0, /* l2h_pae */
2114 1 << SH_type_l2_64_shadow, /* l1_64 */
2115 0, /* fl1_64 */
2116 1 << SH_type_l3_64_shadow, /* l2_64 */
2117 1 << SH_type_l4_64_shadow, /* l3_64 */
2118 0, /* l4_64 */
2119 0, /* p2m */
2120 0 /* unused */
2121 };
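/* masks[t] is the set of shadow types that can hold a reference to a shadow
* of type t -- i.e. the table types one level above it. For example, an
* l1_pae shadow can only be referenced from l2_pae or l2h_pae shadows, so
* unshadowing an l1_pae only needs to walk those. */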
2123 ASSERT(shadow_lock_is_acquired(v->domain));
2124 ASSERT(!(all && fast));
2126 pg = mfn_to_page(gmfn);
2128 /* Bail out now if the page is not shadowed */
2129 if ( (pg->count_info & PGC_page_table) == 0 )
2130 return;
2132 SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx\n",
2133 v->domain->domain_id, v->vcpu_id, mfn_x(gmfn));
2135 /* Search for this shadow in all appropriate shadows */
2136 perfc_incrc(shadow_unshadow);
2137 sh_flags = pg->shadow_flags;
2139 /* Lower-level shadows need to be excised from upper-level shadows.
2140 * This call to hash_foreach() looks dangerous but is in fact OK: each
2141 * call will remove at most one shadow, and terminate immediately when
2142 * it does remove it, so we never walk the hash after doing a deletion. */
2143 #define DO_UNSHADOW(_type) do { \
2144 t = (_type); \
2145 smfn = shadow_hash_lookup(v, mfn_x(gmfn), t); \
2146 if ( sh_type_is_pinnable(v, t) ) \
2147 sh_unpin(v, smfn); \
2148 else \
2149 sh_remove_shadow_via_pointer(v, smfn); \
2150 if ( (pg->count_info & PGC_page_table) && !fast ) \
2151 hash_foreach(v, masks[t], callbacks, smfn); \
2152 } while (0)
2154 if ( sh_flags & SHF_L1_32 ) DO_UNSHADOW(SH_type_l1_32_shadow);
2155 if ( sh_flags & SHF_L2_32 ) DO_UNSHADOW(SH_type_l2_32_shadow);
2156 #if CONFIG_PAGING_LEVELS >= 3
2157 if ( sh_flags & SHF_L1_PAE ) DO_UNSHADOW(SH_type_l1_pae_shadow);
2158 if ( sh_flags & SHF_L2_PAE ) DO_UNSHADOW(SH_type_l2_pae_shadow);
2159 if ( sh_flags & SHF_L2H_PAE ) DO_UNSHADOW(SH_type_l2h_pae_shadow);
2160 #if CONFIG_PAGING_LEVELS >= 4
2161 if ( sh_flags & SHF_L1_64 ) DO_UNSHADOW(SH_type_l1_64_shadow);
2162 if ( sh_flags & SHF_L2_64 ) DO_UNSHADOW(SH_type_l2_64_shadow);
2163 if ( sh_flags & SHF_L3_64 ) DO_UNSHADOW(SH_type_l3_64_shadow);
2164 if ( sh_flags & SHF_L4_64 ) DO_UNSHADOW(SH_type_l4_64_shadow);
2165 #endif
2166 #endif
2168 #undef DO_UNSHADOW
2170 /* If that didn't catch the shadows, something is wrong */
2171 if ( !fast && (pg->count_info & PGC_page_table) )
2173 SHADOW_ERROR("can't find all shadows of mfn %05lx "
2174 "(shadow_flags=%08lx)\n",
2175 mfn_x(gmfn), pg->shadow_flags);
2176 if ( all )
2177 domain_crash(v->domain);
2180 /* Need to flush TLBs now, so that linear maps are safe next time we
2181 * take a fault. */
2182 flush_tlb_mask(v->domain->domain_dirty_cpumask);
2185 void
2186 shadow_remove_all_shadows_and_parents(struct vcpu *v, mfn_t gmfn)
2187 /* Even harsher: this is an HVM page that we think is no longer a pagetable.
2188 * Unshadow it, and recursively unshadow pages that reference it. */
2190 shadow_remove_all_shadows(v, gmfn);
2191 /* XXX TODO:
2192 * Rework this hashtable walker to return a linked-list of all
2193 * the shadows it modified, then do breadth-first recursion
2194 * to find the way up to higher-level tables and unshadow them too.
2196 * The current code (just tearing down each page's shadows as we
2197 * detect that it is not a pagetable) is correct, but very slow.
2198 * It means extra emulated writes and slows down removal of mappings. */
2201 /**************************************************************************/
2203 void sh_update_paging_modes(struct vcpu *v)
2205 struct domain *d = v->domain;
2206 struct shadow_paging_mode *old_mode = v->arch.shadow.mode;
2207 mfn_t old_guest_table;
2209 ASSERT(shadow_lock_is_acquired(d));
2211 // Valid transitions handled by this function:
2212 // - For PV guests:
2213 // - after a shadow mode has been changed
2214 // - For HVM guests:
2215 // - after a shadow mode has been changed
2216 // - changes in CR0.PG, CR4.PAE, CR4.PSE, or CR4.PGE
2217 //
2219 // First, tear down any old shadow tables held by this vcpu.
2220 //
2221 shadow_detach_old_tables(v);
2223 if ( !is_hvm_domain(d) )
2225 ///
2226 /// PV guest
2227 ///
2228 #if CONFIG_PAGING_LEVELS == 4
2229 if ( pv_32bit_guest(v) )
2230 v->arch.shadow.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,4,3);
2231 else
2232 v->arch.shadow.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,4,4);
2233 #elif CONFIG_PAGING_LEVELS == 3
2234 v->arch.shadow.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,3,3);
2235 #elif CONFIG_PAGING_LEVELS == 2
2236 v->arch.shadow.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,2,2);
2237 #else
2238 #error unexpected paging mode
2239 #endif
2240 v->arch.shadow.translate_enabled = !!shadow_mode_translate(d);
2242 else
2244 ///
2245 /// HVM guest
2246 ///
2247 ASSERT(shadow_mode_translate(d));
2248 ASSERT(shadow_mode_external(d));
2250 v->arch.shadow.translate_enabled = !!hvm_paging_enabled(v);
2251 if ( !v->arch.shadow.translate_enabled )
2253 /* Set v->arch.guest_table to use the p2m map, and choose
2254 * the appropriate shadow mode */
2255 old_guest_table = pagetable_get_mfn(v->arch.guest_table);
2256 #if CONFIG_PAGING_LEVELS == 2
2257 v->arch.guest_table =
2258 pagetable_from_pfn(pagetable_get_pfn(d->arch.phys_table));
2259 v->arch.shadow.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,2,2);
2260 #elif CONFIG_PAGING_LEVELS == 3
2261 v->arch.guest_table =
2262 pagetable_from_pfn(pagetable_get_pfn(d->arch.phys_table));
2263 v->arch.shadow.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,3,3);
2264 #else /* CONFIG_PAGING_LEVELS == 4 */
2266 l4_pgentry_t *l4e;
2267 /* Use the start of the first l3 table as a PAE l3 */
2268 ASSERT(pagetable_get_pfn(d->arch.phys_table) != 0);
2269 l4e = sh_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
2270 ASSERT(l4e_get_flags(l4e[0]) & _PAGE_PRESENT);
2271 v->arch.guest_table =
2272 pagetable_from_pfn(l4e_get_pfn(l4e[0]));
2273 sh_unmap_domain_page(l4e);
2275 v->arch.shadow.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,3,3);
2276 #endif
2277 /* Fix up refcounts on guest_table */
2278 get_page(mfn_to_page(pagetable_get_mfn(v->arch.guest_table)), d);
2279 if ( mfn_x(old_guest_table) != 0 )
2280 put_page(mfn_to_page(old_guest_table));
2282 else
2284 #ifdef __x86_64__
2285 if ( hvm_long_mode_enabled(v) )
2287 // long mode guest...
2288 v->arch.shadow.mode =
2289 &SHADOW_INTERNAL_NAME(sh_paging_mode, 4, 4);
2291 else
2292 #endif
2293 if ( hvm_pae_enabled(v) )
2295 #if CONFIG_PAGING_LEVELS >= 3
2296 // 32-bit PAE mode guest...
2297 v->arch.shadow.mode =
2298 &SHADOW_INTERNAL_NAME(sh_paging_mode, 3, 3);
2299 #else
2300 SHADOW_ERROR("PAE not supported in 32-bit Xen\n");
2301 domain_crash(d);
2302 return;
2303 #endif
2305 else
2307 // 32-bit 2 level guest...
2308 #if CONFIG_PAGING_LEVELS >= 3
2309 v->arch.shadow.mode =
2310 &SHADOW_INTERNAL_NAME(sh_paging_mode, 3, 2);
2311 #else
2312 v->arch.shadow.mode =
2313 &SHADOW_INTERNAL_NAME(sh_paging_mode, 2, 2);
2314 #endif
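/* Summary of the selection above for a paging HVM guest: long mode runs on
* 4-level shadows of a 4-level guest; PAE runs on 3-level shadows of a
* 3-level guest; a 2-level guest is shadowed with 3 levels whenever Xen
* itself has 3 or more paging levels, and with 2 levels only on 2-level Xen. */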
2318 if ( pagetable_get_pfn(v->arch.monitor_table) == 0 )
2320 mfn_t mmfn = shadow_make_monitor_table(v);
2321 v->arch.monitor_table = pagetable_from_mfn(mmfn);
2322 v->arch.monitor_vtable = sh_map_domain_page(mmfn);
2325 if ( v->arch.shadow.mode != old_mode )
2327 SHADOW_PRINTK("new paging mode: d=%u v=%u pe=%d g=%u s=%u "
2328 "(was g=%u s=%u)\n",
2329 d->domain_id, v->vcpu_id,
2330 is_hvm_domain(d) ? !!hvm_paging_enabled(v) : 1,
2331 v->arch.shadow.mode->guest_levels,
2332 v->arch.shadow.mode->shadow_levels,
2333 old_mode ? old_mode->guest_levels : 0,
2334 old_mode ? old_mode->shadow_levels : 0);
2335 if ( old_mode &&
2336 (v->arch.shadow.mode->shadow_levels !=
2337 old_mode->shadow_levels) )
2339 /* Need to make a new monitor table for the new mode */
2340 mfn_t new_mfn, old_mfn;
2342 if ( v != current )
2344 SHADOW_ERROR("Some third party (d=%u v=%u) is changing "
2345 "this HVM vcpu's (d=%u v=%u) paging mode!\n",
2346 current->domain->domain_id, current->vcpu_id,
2347 v->domain->domain_id, v->vcpu_id);
2348 domain_crash(v->domain);
2349 return;
2352 sh_unmap_domain_page(v->arch.monitor_vtable);
2353 old_mfn = pagetable_get_mfn(v->arch.monitor_table);
2354 v->arch.monitor_table = pagetable_null();
2355 new_mfn = v->arch.shadow.mode->make_monitor_table(v);
2356 v->arch.monitor_table = pagetable_from_mfn(new_mfn);
2357 v->arch.monitor_vtable = sh_map_domain_page(new_mfn);
2358 SHADOW_PRINTK("new monitor table %"SH_PRI_mfn "\n",
2359 mfn_x(new_mfn));
2361 /* Don't be running on the old monitor table when we
2362 * pull it down! Switch CR3, and warn the HVM code that
2363 * its host cr3 has changed. */
2364 make_cr3(v, mfn_x(new_mfn));
2365 write_ptbase(v);
2366 hvm_update_host_cr3(v);
2367 old_mode->destroy_monitor_table(v, old_mfn);
2371 // XXX -- Need to deal with changes in CR4.PSE and CR4.PGE.
2372 // These are HARD: think about the case where two CPUs have
2373 // different values for CR4.PSE and CR4.PGE at the same time.
2374 // This *does* happen, at least for CR4.PGE...
2377 v->arch.shadow.mode->update_cr3(v);
2380 /**************************************************************************/
2381 /* Turning on and off shadow features */
2383 static void sh_new_mode(struct domain *d, u32 new_mode)
2384 /* Inform all the vcpus that the shadow mode has been changed */
2386 struct vcpu *v;
2388 ASSERT(shadow_lock_is_acquired(d));
2389 ASSERT(d != current->domain);
2390 d->arch.shadow.mode = new_mode;
2391 if ( new_mode & SHM2_translate )
2392 shadow_audit_p2m(d);
2393 for_each_vcpu(d, v)
2394 sh_update_paging_modes(v);
2397 int shadow_enable(struct domain *d, u32 mode)
2398 /* Turn on "permanent" shadow features: external, translate, refcount.
2399 * Can only be called once on a domain, and these features cannot be
2400 * disabled.
2401 * Returns 0 for success, -errno for failure. */
2403 unsigned int old_pages;
2404 int rv = 0;
2406 mode |= SHM2_enable;
2408 domain_pause(d);
2409 shadow_lock(d);
2411 /* Sanity check the arguments */
2412 if ( (d == current->domain) ||
2413 shadow_mode_enabled(d) ||
2414 ((mode & SHM2_translate) && !(mode & SHM2_refcounts)) ||
2415 ((mode & SHM2_external) && !(mode & SHM2_translate)) )
2417 rv = -EINVAL;
2418 goto out;
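/* The checks above encode the valid mode lattice: SHM2_translate requires
* SHM2_refcounts, and SHM2_external requires SHM2_translate. A full HVM
* domain is therefore assumed to be set up with something like
* shadow_enable(d, SHM2_refcounts | SHM2_translate | SHM2_external). */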
2421 // XXX -- eventually would like to require that all memory be allocated
2422 // *after* shadow_enabled() is called... So here, we would test to make
2423 // sure that d->page_list is empty.
2424 #if 0
2425 spin_lock(&d->page_alloc_lock);
2426 if ( !list_empty(&d->page_list) )
2428 spin_unlock(&d->page_alloc_lock);
2429 rv = -EINVAL;
2430 goto out;
2432 spin_unlock(&d->page_alloc_lock);
2433 #endif
2435 /* Init the shadow memory allocation if the user hasn't done so */
2436 old_pages = d->arch.shadow.total_pages;
2437 if ( old_pages == 0 )
2438 if ( set_sh_allocation(d, 256, NULL) != 0 ) /* Use at least 1MB */
2440 set_sh_allocation(d, 0, NULL);
2441 rv = -ENOMEM;
2442 goto out;
2445 /* Init the hash table */
2446 if ( shadow_hash_alloc(d) != 0 )
2448 set_sh_allocation(d, old_pages, NULL);
2449 rv = -ENOMEM;
2450 goto out;
2453 /* Init the P2M table */
2454 if ( mode & SHM2_translate )
2455 if ( !shadow_alloc_p2m_table(d) )
2457 shadow_hash_teardown(d);
2458 set_sh_allocation(d, old_pages, NULL);
2459 shadow_p2m_teardown(d);
2460 rv = -ENOMEM;
2461 goto out;
2464 #if (SHADOW_OPTIMIZATIONS & SHOPT_LINUX_L3_TOPLEVEL)
2465 /* We assume we're dealing with an older 64bit linux guest until we
2466 * see the guest use more than one l4 per vcpu. */
2467 d->arch.shadow.opt_flags = SHOPT_LINUX_L3_TOPLEVEL;
2468 #endif
2470 /* Update the bits */
2471 sh_new_mode(d, mode);
2472 shadow_audit_p2m(d);
2473 out:
2474 shadow_unlock(d);
2475 domain_unpause(d);
2476 return rv;
2479 void shadow_teardown(struct domain *d)
2480 /* Destroy the shadow pagetables of this domain and free its shadow memory.
2481 * Should only be called for dying domains. */
2483 struct vcpu *v;
2484 mfn_t mfn;
2486 ASSERT(test_bit(_DOMF_dying, &d->domain_flags));
2487 ASSERT(d != current->domain);
2489 if ( !shadow_lock_is_acquired(d) )
2490 shadow_lock(d); /* Keep various asserts happy */
2492 if ( shadow_mode_enabled(d) )
2494 /* Release the shadow and monitor tables held by each vcpu */
2495 for_each_vcpu(d, v)
2497 shadow_detach_old_tables(v);
2498 if ( shadow_mode_external(d) )
2500 mfn = pagetable_get_mfn(v->arch.monitor_table);
2501 if ( mfn_valid(mfn) && (mfn_x(mfn) != 0) )
2502 shadow_destroy_monitor_table(v, mfn);
2503 v->arch.monitor_table = pagetable_null();
2508 if ( d->arch.shadow.total_pages != 0 )
2510 SHADOW_PRINTK("teardown of domain %u starts."
2511 " Shadow pages total = %u, free = %u, p2m=%u\n",
2512 d->domain_id,
2513 d->arch.shadow.total_pages,
2514 d->arch.shadow.free_pages,
2515 d->arch.shadow.p2m_pages);
2516 /* Destroy all the shadows and release memory to domheap */
2517 set_sh_allocation(d, 0, NULL);
2518 /* Release the hash table back to xenheap */
2519 if (d->arch.shadow.hash_table)
2520 shadow_hash_teardown(d);
2521 /* Release the log-dirty bitmap of dirtied pages */
2522 sh_free_log_dirty_bitmap(d);
2523 /* Should not have any more memory held */
2524 SHADOW_PRINTK("teardown done."
2525 " Shadow pages total = %u, free = %u, p2m=%u\n",
2526 d->arch.shadow.total_pages,
2527 d->arch.shadow.free_pages,
2528 d->arch.shadow.p2m_pages);
2529 ASSERT(d->arch.shadow.total_pages == 0);
2532 /* We leave the "permanent" shadow modes enabled, but clear the
2533 * log-dirty mode bit. We don't want any more mark_dirty()
2534 * calls now that we've torn down the bitmap */
2535 d->arch.shadow.mode &= ~SHM2_log_dirty;
2537 shadow_unlock(d);
2540 void shadow_final_teardown(struct domain *d)
2541 /* Called by arch_domain_destroy(), when it's safe to pull down the p2m map. */
2544 SHADOW_PRINTK("dom %u final teardown starts."
2545 " Shadow pages total = %u, free = %u, p2m=%u\n",
2546 d->domain_id,
2547 d->arch.shadow.total_pages,
2548 d->arch.shadow.free_pages,
2549 d->arch.shadow.p2m_pages);
2551 /* Double-check that the domain didn't have any shadow memory.
2552 * It is possible for a domain that never got domain_kill()ed
2553 * to get here with its shadow allocation intact. */
2554 if ( d->arch.shadow.total_pages != 0 )
2555 shadow_teardown(d);
2557 /* It is now safe to pull down the p2m map. */
2558 if ( d->arch.shadow.p2m_pages != 0 )
2559 shadow_p2m_teardown(d);
2561 SHADOW_PRINTK("dom %u final teardown done."
2562 " Shadow pages total = %u, free = %u, p2m=%u\n",
2563 d->domain_id,
2564 d->arch.shadow.total_pages,
2565 d->arch.shadow.free_pages,
2566 d->arch.shadow.p2m_pages);
2569 static int shadow_one_bit_enable(struct domain *d, u32 mode)
2570 /* Turn on a single shadow mode feature */
2572 ASSERT(shadow_lock_is_acquired(d));
2574 /* Sanity check the call */
2575 if ( d == current->domain || (d->arch.shadow.mode & mode) )
2577 return -EINVAL;
2580 if ( d->arch.shadow.mode == 0 )
2582 /* Init the shadow memory allocation and the hash table */
2583 if ( set_sh_allocation(d, 1, NULL) != 0
2584 || shadow_hash_alloc(d) != 0 )
2586 set_sh_allocation(d, 0, NULL);
2587 return -ENOMEM;
2591 /* Update the bits */
2592 sh_new_mode(d, d->arch.shadow.mode | mode);
2594 return 0;
2597 static int shadow_one_bit_disable(struct domain *d, u32 mode)
2598 /* Turn off a single shadow mode feature */
2600 struct vcpu *v;
2601 ASSERT(shadow_lock_is_acquired(d));
2603 /* Sanity check the call */
2604 if ( d == current->domain || !(d->arch.shadow.mode & mode) )
2606 return -EINVAL;
2609 /* Update the bits */
2610 sh_new_mode(d, d->arch.shadow.mode & ~mode);
2611 if ( d->arch.shadow.mode == 0 )
2613 /* Get this domain off shadows */
2614 SHADOW_PRINTK("un-shadowing of domain %u starts."
2615 " Shadow pages total = %u, free = %u, p2m=%u\n",
2616 d->domain_id,
2617 d->arch.shadow.total_pages,
2618 d->arch.shadow.free_pages,
2619 d->arch.shadow.p2m_pages);
2620 for_each_vcpu(d, v)
2622 shadow_detach_old_tables(v);
2623 #if CONFIG_PAGING_LEVELS == 4
2624 if ( !(v->arch.flags & TF_kernel_mode) )
2625 make_cr3(v, pagetable_get_pfn(v->arch.guest_table_user));
2626 else
2627 #endif
2628 make_cr3(v, pagetable_get_pfn(v->arch.guest_table));
2632 /* Pull down the memory allocation */
2633 if ( set_sh_allocation(d, 0, NULL) != 0 )
2635 // XXX - How can this occur?
2636 // Seems like a bug to return an error now that we've
2637 // disabled the relevant shadow mode.
2638 //
2639 return -ENOMEM;
2641 shadow_hash_teardown(d);
2642 SHADOW_PRINTK("un-shadowing of domain %u done."
2643 " Shadow pages total = %u, free = %u, p2m=%u\n",
2644 d->domain_id,
2645 d->arch.shadow.total_pages,
2646 d->arch.shadow.free_pages,
2647 d->arch.shadow.p2m_pages);
2650 return 0;
2653 /* Enable/disable ops for the "test" and "log-dirty" modes */
2654 int shadow_test_enable(struct domain *d)
2656 int ret;
2658 domain_pause(d);
2659 shadow_lock(d);
2661 if ( shadow_mode_enabled(d) )
2663 SHADOW_ERROR("Don't support enabling test mode"
2664 " on already shadowed doms\n");
2665 ret = -EINVAL;
2666 goto out;
2669 ret = shadow_one_bit_enable(d, SHM2_enable);
2670 out:
2671 shadow_unlock(d);
2672 domain_unpause(d);
2674 return ret;
2677 int shadow_test_disable(struct domain *d)
2679 int ret;
2681 domain_pause(d);
2682 shadow_lock(d);
2683 ret = shadow_one_bit_disable(d, SHM2_enable);
2684 shadow_unlock(d);
2685 domain_unpause(d);
2687 return ret;
2690 static int
2691 sh_alloc_log_dirty_bitmap(struct domain *d)
2693 ASSERT(d->arch.shadow.dirty_bitmap == NULL);
2694 d->arch.shadow.dirty_bitmap_size =
2695 (d->shared_info->arch.max_pfn + (BITS_PER_LONG - 1)) &
2696 ~(BITS_PER_LONG - 1);
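/* Worked example: with max_pfn == 0x10000 (256MB of guest memory) and
* BITS_PER_LONG == 64, dirty_bitmap_size is 65536 bits, so the allocation
* below is 1024 longs == 8kB of bitmap. */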
2697 d->arch.shadow.dirty_bitmap =
2698 xmalloc_array(unsigned long,
2699 d->arch.shadow.dirty_bitmap_size / BITS_PER_LONG);
2700 if ( d->arch.shadow.dirty_bitmap == NULL )
2702 d->arch.shadow.dirty_bitmap_size = 0;
2703 return -ENOMEM;
2705 memset(d->arch.shadow.dirty_bitmap, 0, d->arch.shadow.dirty_bitmap_size/8);
2707 return 0;
2710 static void
2711 sh_free_log_dirty_bitmap(struct domain *d)
2713 d->arch.shadow.dirty_bitmap_size = 0;
2714 if ( d->arch.shadow.dirty_bitmap )
2716 xfree(d->arch.shadow.dirty_bitmap);
2717 d->arch.shadow.dirty_bitmap = NULL;
2721 static int shadow_log_dirty_enable(struct domain *d)
2723 int ret;
2725 domain_pause(d);
2726 shadow_lock(d);
2728 if ( shadow_mode_log_dirty(d) )
2730 ret = -EINVAL;
2731 goto out;
2734 if ( shadow_mode_enabled(d) )
2736 SHADOW_ERROR("Don't (yet) support enabling log-dirty"
2737 " on already shadowed doms\n");
2738 ret = -EINVAL;
2739 goto out;
2742 ret = sh_alloc_log_dirty_bitmap(d);
2743 if ( ret != 0 )
2745 sh_free_log_dirty_bitmap(d);
2746 goto out;
2749 ret = shadow_one_bit_enable(d, SHM2_log_dirty);
2750 if ( ret != 0 )
2751 sh_free_log_dirty_bitmap(d);
2753 out:
2754 shadow_unlock(d);
2755 domain_unpause(d);
2756 return ret;
2759 static int shadow_log_dirty_disable(struct domain *d)
2761 int ret;
2763 domain_pause(d);
2764 shadow_lock(d);
2765 ret = shadow_one_bit_disable(d, SHM2_log_dirty);
2766 if ( !shadow_mode_log_dirty(d) )
2767 sh_free_log_dirty_bitmap(d);
2768 shadow_unlock(d);
2769 domain_unpause(d);
2771 return ret;
2774 /**************************************************************************/
2775 /* P2M map manipulations */
2777 static void
2778 sh_p2m_remove_page(struct domain *d, unsigned long gfn, unsigned long mfn)
2780 struct vcpu *v;
2782 if ( !shadow_mode_translate(d) )
2783 return;
2785 v = current;
2786 if ( v->domain != d )
2787 v = d->vcpu[0];
2789 SHADOW_DEBUG(P2M, "removing gfn=%#lx mfn=%#lx\n", gfn, mfn);
2791 ASSERT(mfn_x(sh_gfn_to_mfn(d, gfn)) == mfn);
2792 //ASSERT(sh_mfn_to_gfn(d, mfn) == gfn);
2794 if ( v != NULL )
2796 shadow_remove_all_shadows_and_parents(v, _mfn(mfn));
2797 if ( shadow_remove_all_mappings(v, _mfn(mfn)) )
2798 flush_tlb_mask(d->domain_dirty_cpumask);
2801 shadow_set_p2m_entry(d, gfn, _mfn(INVALID_MFN));
2802 set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY);
2805 void
2806 shadow_guest_physmap_remove_page(struct domain *d, unsigned long gfn,
2807 unsigned long mfn)
2809 shadow_lock(d);
2810 shadow_audit_p2m(d);
2811 sh_p2m_remove_page(d, gfn, mfn);
2812 shadow_audit_p2m(d);
2813 shadow_unlock(d);
2816 void
2817 shadow_guest_physmap_add_page(struct domain *d, unsigned long gfn,
2818 unsigned long mfn)
2820 unsigned long ogfn;
2821 mfn_t omfn;
2823 if ( !shadow_mode_translate(d) )
2824 return;
2826 shadow_lock(d);
2827 shadow_audit_p2m(d);
2829 SHADOW_DEBUG(P2M, "adding gfn=%#lx mfn=%#lx\n", gfn, mfn);
2831 omfn = sh_gfn_to_mfn(d, gfn);
2832 if ( mfn_valid(omfn) )
2834 /* Get rid of the old mapping, especially any shadows */
2835 struct vcpu *v = current;
2836 if ( v->domain != d )
2837 v = d->vcpu[0];
2838 if ( v != NULL )
2840 shadow_remove_all_shadows_and_parents(v, omfn);
2841 if ( shadow_remove_all_mappings(v, omfn) )
2842 flush_tlb_mask(d->domain_dirty_cpumask);
2844 set_gpfn_from_mfn(mfn_x(omfn), INVALID_M2P_ENTRY);
2847 ogfn = sh_mfn_to_gfn(d, _mfn(mfn));
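/* An ogfn of 0x5555... is assumed to be the pattern the M2P table is
* initialised with, i.e. an mfn that was never assigned a gfn; it is
* treated like INVALID_M2P_ENTRY and does not trigger the aliasing
* handling below. */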
2848 if (
2849 #ifdef __x86_64__
2850 (ogfn != 0x5555555555555555L)
2851 #else
2852 (ogfn != 0x55555555L)
2853 #endif
2854 && (ogfn != INVALID_M2P_ENTRY)
2855 && (ogfn != gfn) )
2857 /* This machine frame is already mapped at another physical address */
2858 SHADOW_DEBUG(P2M, "aliased! mfn=%#lx, old gfn=%#lx, new gfn=%#lx\n",
2859 mfn, ogfn, gfn);
2860 if ( mfn_valid(omfn = sh_gfn_to_mfn(d, ogfn)) )
2862 SHADOW_DEBUG(P2M, "old gfn=%#lx -> mfn %#lx\n",
2863 ogfn , mfn_x(omfn));
2864 if ( mfn_x(omfn) == mfn )
2865 sh_p2m_remove_page(d, ogfn, mfn);
2869 shadow_set_p2m_entry(d, gfn, _mfn(mfn));
2870 set_gpfn_from_mfn(mfn, gfn);
2871 shadow_audit_p2m(d);
2872 shadow_unlock(d);
2875 /**************************************************************************/
2876 /* Log-dirty mode support */
2878 /* Convert a shadow to log-dirty mode. */
2879 void shadow_convert_to_log_dirty(struct vcpu *v, mfn_t smfn)
2881 BUG();
2885 /* Read a domain's log-dirty bitmap and stats.
2886 * If the operation is a CLEAN, clear the bitmap and stats as well. */
2887 static int shadow_log_dirty_op(
2888 struct domain *d, struct xen_domctl_shadow_op *sc)
2890 int i, rv = 0, clean = 0;
2892 domain_pause(d);
2893 shadow_lock(d);
2895 clean = (sc->op == XEN_DOMCTL_SHADOW_OP_CLEAN);
2897 SHADOW_DEBUG(LOGDIRTY, "log-dirty %s: dom %u faults=%u dirty=%u\n",
2898 (clean) ? "clean" : "peek",
2899 d->domain_id,
2900 d->arch.shadow.fault_count,
2901 d->arch.shadow.dirty_count);
2903 sc->stats.fault_count = d->arch.shadow.fault_count;
2904 sc->stats.dirty_count = d->arch.shadow.dirty_count;
2906 if ( clean )
2908 /* Need to revoke write access to the domain's pages again.
2909 * In future, we'll have a less heavy-handed approach to this,
2910 * but for now, we just unshadow everything except Xen. */
2911 shadow_blow_tables(d);
2913 d->arch.shadow.fault_count = 0;
2914 d->arch.shadow.dirty_count = 0;
2917 if ( guest_handle_is_null(sc->dirty_bitmap) ||
2918 (d->arch.shadow.dirty_bitmap == NULL) )
2920 rv = -EINVAL;
2921 goto out;
2924 if ( sc->pages > d->arch.shadow.dirty_bitmap_size )
2925 sc->pages = d->arch.shadow.dirty_bitmap_size;
2927 #define CHUNK (8*1024) /* Transfer and clear in 1kB chunks for L1 cache. */
2928 for ( i = 0; i < sc->pages; i += CHUNK )
2930 int bytes = ((((sc->pages - i) > CHUNK)
2931 ? CHUNK
2932 : (sc->pages - i)) + 7) / 8;
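/* The guest handle appears to be an array of unsigned long, so the offset
* and length passed to copy_to_guest_offset() below are converted from
* bytes to longs (rounding the length up); a full CHUNK of 8*1024 pages is
* 1024 bytes, i.e. 128 longs on a 64-bit build. */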
2934 if ( copy_to_guest_offset(
2935 sc->dirty_bitmap,
2936 i/(8*sizeof(unsigned long)),
2937 d->arch.shadow.dirty_bitmap + (i/(8*sizeof(unsigned long))),
2938 (bytes + sizeof(unsigned long) - 1) / sizeof(unsigned long)) )
2940 rv = -EINVAL;
2941 goto out;
2944 if ( clean )
2945 memset(d->arch.shadow.dirty_bitmap + (i/(8*sizeof(unsigned long))),
2946 0, bytes);
2948 #undef CHUNK
2950 out:
2951 shadow_unlock(d);
2952 domain_unpause(d);
2953 return rv;
2957 /* Mark a page as dirty */
2958 void sh_do_mark_dirty(struct domain *d, mfn_t gmfn)
2960 unsigned long pfn;
2962 ASSERT(shadow_lock_is_acquired(d));
2963 ASSERT(shadow_mode_log_dirty(d));
2965 if ( !mfn_valid(gmfn) )
2966 return;
2968 ASSERT(d->arch.shadow.dirty_bitmap != NULL);
2970 /* We /really/ mean PFN here, even for non-translated guests. */
2971 pfn = get_gpfn_from_mfn(mfn_x(gmfn));
2973 /*
2974 * Values with the MSB set denote MFNs that aren't really part of the
2975 * domain's pseudo-physical memory map (e.g., the shared info frame).
2976 * Nothing to do here...
2977 */
2978 if ( unlikely(!VALID_M2P(pfn)) )
2979 return;
2981 /* N.B. Can use non-atomic TAS because protected by shadow_lock. */
2982 if ( likely(pfn < d->arch.shadow.dirty_bitmap_size) )
2984 if ( !__test_and_set_bit(pfn, d->arch.shadow.dirty_bitmap) )
2986 SHADOW_DEBUG(LOGDIRTY,
2987 "marked mfn %" SH_PRI_mfn " (pfn=%lx), dom %d\n",
2988 mfn_x(gmfn), pfn, d->domain_id);
2989 d->arch.shadow.dirty_count++;
2992 else
2994 SHADOW_PRINTK("mark_dirty OOR! "
2995 "mfn=%" SH_PRI_mfn " pfn=%lx max=%x (dom %d)\n"
2996 "owner=%d c=%08x t=%" PRtype_info "\n",
2997 mfn_x(gmfn),
2998 pfn,
2999 d->arch.shadow.dirty_bitmap_size,
3000 d->domain_id,
3001 (page_get_owner(mfn_to_page(gmfn))
3002 ? page_get_owner(mfn_to_page(gmfn))->domain_id
3003 : -1),
3004 mfn_to_page(gmfn)->count_info,
3005 mfn_to_page(gmfn)->u.inuse.type_info);
3010 /**************************************************************************/
3011 /* Shadow-control XEN_DOMCTL dispatcher */
3013 int shadow_domctl(struct domain *d,
3014 xen_domctl_shadow_op_t *sc,
3015 XEN_GUEST_HANDLE(xen_domctl_t) u_domctl)
3017 int rc, preempted = 0;
3019 if ( unlikely(d == current->domain) )
3021 gdprintk(XENLOG_INFO, "Don't try to do a shadow op on yourself!\n");
3022 return -EINVAL;
3025 switch ( sc->op )
3027 case XEN_DOMCTL_SHADOW_OP_OFF:
3028 if ( shadow_mode_log_dirty(d) )
3029 if ( (rc = shadow_log_dirty_disable(d)) != 0 )
3030 return rc;
3031 if ( is_hvm_domain(d) )
3032 return -EINVAL;
3033 if ( d->arch.shadow.mode & SHM2_enable )
3034 if ( (rc = shadow_test_disable(d)) != 0 )
3035 return rc;
3036 return 0;
3038 case XEN_DOMCTL_SHADOW_OP_ENABLE_TEST:
3039 return shadow_test_enable(d);
3041 case XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY:
3042 return shadow_log_dirty_enable(d);
3044 case XEN_DOMCTL_SHADOW_OP_ENABLE_TRANSLATE:
3045 return shadow_enable(d, SHM2_refcounts|SHM2_translate);
3047 case XEN_DOMCTL_SHADOW_OP_CLEAN:
3048 case XEN_DOMCTL_SHADOW_OP_PEEK:
3049 return shadow_log_dirty_op(d, sc);
3051 case XEN_DOMCTL_SHADOW_OP_ENABLE:
3052 if ( sc->mode & XEN_DOMCTL_SHADOW_ENABLE_LOG_DIRTY )
3053 return shadow_log_dirty_enable(d);
3054 return shadow_enable(d, sc->mode << SHM2_shift);
3056 case XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION:
3057 sc->mb = shadow_get_allocation(d);
3058 return 0;
3060 case XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION:
3061 rc = shadow_set_allocation(d, sc->mb, &preempted);
3062 if ( preempted )
3063 /* Not finished. Set up to re-run the call. */
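/* hypercall_create_continuation() arranges for this same domctl hypercall
* to be restarted from guest context, so the remaining allocation work is
* picked up on the next pass. */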
3064 rc = hypercall_create_continuation(
3065 __HYPERVISOR_domctl, "h", u_domctl);
3066 else
3067 /* Finished. Return the new allocation */
3068 sc->mb = shadow_get_allocation(d);
3069 return rc;
3071 default:
3072 SHADOW_ERROR("Bad shadow op %u\n", sc->op);
3073 return -EINVAL;
3078 /**************************************************************************/
3079 /* Auditing shadow tables */
3081 #if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_FULL
3083 void shadow_audit_tables(struct vcpu *v)
3085 /* Dispatch table for getting per-type functions */
3086 static hash_callback_t callbacks[16] = {
3087 NULL, /* none */
3088 #if CONFIG_PAGING_LEVELS == 2
3089 SHADOW_INTERNAL_NAME(sh_audit_l1_table,2,2), /* l1_32 */
3090 SHADOW_INTERNAL_NAME(sh_audit_fl1_table,2,2), /* fl1_32 */
3091 SHADOW_INTERNAL_NAME(sh_audit_l2_table,2,2), /* l2_32 */
3092 #else
3093 SHADOW_INTERNAL_NAME(sh_audit_l1_table,3,2), /* l1_32 */
3094 SHADOW_INTERNAL_NAME(sh_audit_fl1_table,3,2), /* fl1_32 */
3095 SHADOW_INTERNAL_NAME(sh_audit_l2_table,3,2), /* l2_32 */
3096 SHADOW_INTERNAL_NAME(sh_audit_l1_table,3,3), /* l1_pae */
3097 SHADOW_INTERNAL_NAME(sh_audit_fl1_table,3,3), /* fl1_pae */
3098 SHADOW_INTERNAL_NAME(sh_audit_l2_table,3,3), /* l2_pae */
3099 SHADOW_INTERNAL_NAME(sh_audit_l2_table,3,3), /* l2h_pae */
3100 #if CONFIG_PAGING_LEVELS >= 4
3101 SHADOW_INTERNAL_NAME(sh_audit_l1_table,4,4), /* l1_64 */
3102 SHADOW_INTERNAL_NAME(sh_audit_fl1_table,4,4), /* fl1_64 */
3103 SHADOW_INTERNAL_NAME(sh_audit_l2_table,4,4), /* l2_64 */
3104 SHADOW_INTERNAL_NAME(sh_audit_l3_table,4,4), /* l3_64 */
3105 SHADOW_INTERNAL_NAME(sh_audit_l4_table,4,4), /* l4_64 */
3106 #endif /* CONFIG_PAGING_LEVELS >= 4 */
3107 #endif /* CONFIG_PAGING_LEVELS > 2 */
3108 NULL /* All the rest */
3109 };
3110 unsigned int mask;
3112 if ( !(SHADOW_AUDIT_ENABLE) )
3113 return;
3115 if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_FULL )
3116 mask = ~1; /* Audit every table in the system */
3117 else
3119 /* Audit only the current mode's tables */
3120 switch ( v->arch.shadow.mode->guest_levels )
3122 case 2: mask = (SHF_L1_32|SHF_FL1_32|SHF_L2_32); break;
3123 case 3: mask = (SHF_L1_PAE|SHF_FL1_PAE|SHF_L2_PAE
3124 |SHF_L2H_PAE); break;
3125 case 4: mask = (SHF_L1_64|SHF_FL1_64|SHF_L2_64
3126 |SHF_L3_64|SHF_L4_64); break;
3127 default: BUG();
3131 hash_foreach(v, mask, callbacks, _mfn(INVALID_MFN));
3134 #endif /* Shadow audit */
3137 /**************************************************************************/
3138 /* Auditing p2m tables */
3140 #if SHADOW_AUDIT & SHADOW_AUDIT_P2M
3142 void shadow_audit_p2m(struct domain *d)
3144 struct list_head *entry;
3145 struct page_info *page;
3146 struct domain *od;
3147 unsigned long mfn, gfn, m2pfn, lp2mfn = 0;
3148 mfn_t p2mfn;
3149 unsigned long orphans_d = 0, orphans_i = 0, mpbad = 0, pmbad = 0;
3150 int test_linear;
3152 if ( !(SHADOW_AUDIT_ENABLE) || !shadow_mode_translate(d) )
3153 return;
3155 //SHADOW_PRINTK("p2m audit starts\n");
3157 test_linear = ( (d == current->domain) && current->arch.monitor_vtable );
3158 if ( test_linear )
3159 local_flush_tlb();
3161 /* Audit part one: walk the domain's page allocation list, checking
3162 * the m2p entries. */
3163 for ( entry = d->page_list.next;
3164 entry != &d->page_list;
3165 entry = entry->next )
3167 page = list_entry(entry, struct page_info, list);
3168 mfn = mfn_x(page_to_mfn(page));
3170 // SHADOW_PRINTK("auditing guest page, mfn=%#lx\n", mfn);
3172 od = page_get_owner(page);
3174 if ( od != d )
3176 SHADOW_PRINTK("wrong owner %#lx -> %p(%u) != %p(%u)\n",
3177 mfn, od, (od?od->domain_id:-1), d, d->domain_id);
3178 continue;
3181 gfn = get_gpfn_from_mfn(mfn);
3182 if ( gfn == INVALID_M2P_ENTRY )
3184 orphans_i++;
3185 //SHADOW_PRINTK("orphaned guest page: mfn=%#lx has invalid gfn\n",
3186 // mfn);
3187 continue;
3190 if ( gfn == 0x55555555 )
3192 orphans_d++;
3193 //SHADOW_PRINTK("orphaned guest page: mfn=%#lx has debug gfn\n",
3194 // mfn);
3195 continue;
3198 p2mfn = sh_gfn_to_mfn_foreign(d, gfn);
3199 if ( mfn_x(p2mfn) != mfn )
3201 mpbad++;
3202 SHADOW_PRINTK("map mismatch mfn %#lx -> gfn %#lx -> mfn %#lx"
3203 " (-> gfn %#lx)\n",
3204 mfn, gfn, mfn_x(p2mfn),
3205 (mfn_valid(p2mfn)
3206 ? get_gpfn_from_mfn(mfn_x(p2mfn))
3207 : -1u));
3208 /* This m2p entry is stale: the domain has another frame in
3209 * this physical slot. No great disaster, but for neatness,
3210 * blow away the m2p entry. */
3211 set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY);
3214 if ( test_linear && (gfn <= d->arch.max_mapped_pfn) )
3216 lp2mfn = gfn_to_mfn_current(gfn);
3217 if ( mfn_x(lp2mfn) != mfn_x(p2mfn) )
3219 SHADOW_PRINTK("linear mismatch gfn %#lx -> mfn %#lx "
3220 "(!= mfn %#lx)\n", gfn,
3221 mfn_x(lp2mfn), mfn_x(p2mfn));
3225 // SHADOW_PRINTK("OK: mfn=%#lx, gfn=%#lx, p2mfn=%#lx, lp2mfn=%#lx\n",
3226 // mfn, gfn, p2mfn, lp2mfn);
3229 /* Audit part two: walk the domain's p2m table, checking the entries. */
3230 if ( pagetable_get_pfn(d->arch.phys_table) != 0 )
3232 l2_pgentry_t *l2e;
3233 l1_pgentry_t *l1e;
3234 int i1, i2;
3236 #if CONFIG_PAGING_LEVELS == 4
3237 l4_pgentry_t *l4e;
3238 l3_pgentry_t *l3e;
3239 int i3, i4;
3240 l4e = sh_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
3241 #elif CONFIG_PAGING_LEVELS == 3
3242 l3_pgentry_t *l3e;
3243 int i3;
3244 l3e = sh_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
3245 #else /* CONFIG_PAGING_LEVELS == 2 */
3246 l2e = sh_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
3247 #endif
3249 gfn = 0;
3250 #if CONFIG_PAGING_LEVELS >= 3
3251 #if CONFIG_PAGING_LEVELS >= 4
3252 for ( i4 = 0; i4 < L4_PAGETABLE_ENTRIES; i4++ )
3254 if ( !(l4e_get_flags(l4e[i4]) & _PAGE_PRESENT) )
3256 gfn += 1 << (L4_PAGETABLE_SHIFT - PAGE_SHIFT);
3257 continue;
3259 l3e = sh_map_domain_page(_mfn(l4e_get_pfn(l4e[i4])));
3260 #endif /* now at levels 3 or 4... */
3261 for ( i3 = 0;
3262 i3 < ((CONFIG_PAGING_LEVELS==4) ? L3_PAGETABLE_ENTRIES : 8);
3263 i3++ )
3265 if ( !(l3e_get_flags(l3e[i3]) & _PAGE_PRESENT) )
3267 gfn += 1 << (L3_PAGETABLE_SHIFT - PAGE_SHIFT);
3268 continue;
3270 l2e = sh_map_domain_page(_mfn(l3e_get_pfn(l3e[i3])));
3271 #endif /* all levels... */
3272 for ( i2 = 0; i2 < L2_PAGETABLE_ENTRIES; i2++ )
3274 if ( !(l2e_get_flags(l2e[i2]) & _PAGE_PRESENT) )
3276 gfn += 1 << (L2_PAGETABLE_SHIFT - PAGE_SHIFT);
3277 continue;
3279 l1e = sh_map_domain_page(_mfn(l2e_get_pfn(l2e[i2])));
3281 for ( i1 = 0; i1 < L1_PAGETABLE_ENTRIES; i1++, gfn++ )
3283 if ( !(l1e_get_flags(l1e[i1]) & _PAGE_PRESENT) )
3284 continue;
3285 mfn = l1e_get_pfn(l1e[i1]);
3286 ASSERT(mfn_valid(_mfn(mfn)));
3287 m2pfn = get_gpfn_from_mfn(mfn);
3288 if ( m2pfn != gfn )
3290 pmbad++;
3291 SHADOW_PRINTK("mismatch: gfn %#lx -> mfn %#lx"
3292 " -> gfn %#lx\n", gfn, mfn, m2pfn);
3293 BUG();
3296 sh_unmap_domain_page(l1e);
3298 #if CONFIG_PAGING_LEVELS >= 3
3299 sh_unmap_domain_page(l2e);
3301 #if CONFIG_PAGING_LEVELS >= 4
3302 sh_unmap_domain_page(l3e);
3304 #endif
3305 #endif
3307 #if CONFIG_PAGING_LEVELS == 4
3308 sh_unmap_domain_page(l4e);
3309 #elif CONFIG_PAGING_LEVELS == 3
3310 sh_unmap_domain_page(l3e);
3311 #else /* CONFIG_PAGING_LEVELS == 2 */
3312 sh_unmap_domain_page(l2e);
3313 #endif
3317 //SHADOW_PRINTK("p2m audit complete\n");
3318 //if ( orphans_i | orphans_d | mpbad | pmbad )
3319 // SHADOW_PRINTK("p2m audit found %lu orphans (%lu inval %lu debug)\n",
3320 // orphans_i + orphans_d, orphans_i, orphans_d,
3321 if ( mpbad | pmbad )
3322 SHADOW_PRINTK("p2m audit found %lu odd p2m, %lu bad m2p entries\n",
3323 pmbad, mpbad);
3326 #endif /* p2m audit */
3328 /*
3329 * Local variables:
3330 * mode: C
3331 * c-set-style: "BSD"
3332 * c-basic-offset: 4
3333 * indent-tabs-mode: nil
3334 * End:
3335 */