ia64/xen-unstable

view xen/arch/x86/mm/shadow/common.c @ 13224:25723963a6b6

Minor cleanups to shadow logdirty peek / clean.

Signed-off-by: Steven Hand <steven@xensource.com>

author   Steven Hand <steven@xensource.com>
date     Fri Dec 29 14:25:03 2006 +0000 (2006-12-29)
parents  c75d6f2aad7a
children 711c31232d71
line source
1 /******************************************************************************
2 * arch/x86/mm/shadow/common.c
3 *
4 * Shadow code that does not need to be multiply compiled.
5 * Parts of this code are Copyright (c) 2006 by XenSource Inc.
6 * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
7 * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 */
24 #include <xen/config.h>
25 #include <xen/types.h>
26 #include <xen/mm.h>
27 #include <xen/trace.h>
28 #include <xen/sched.h>
29 #include <xen/perfc.h>
30 #include <xen/irq.h>
31 #include <xen/domain_page.h>
32 #include <xen/guest_access.h>
33 #include <xen/keyhandler.h>
34 #include <asm/event.h>
35 #include <asm/page.h>
36 #include <asm/current.h>
37 #include <asm/flushtlb.h>
38 #include <asm/shadow.h>
39 #include "private.h"
42 /* Set up the shadow-specific parts of a domain struct at start of day.
43 * Called for every domain from arch_domain_create() */
44 void shadow_domain_init(struct domain *d)
45 {
46 int i;
47 shadow_lock_init(d);
48 for ( i = 0; i <= SHADOW_MAX_ORDER; i++ )
49 INIT_LIST_HEAD(&d->arch.shadow.freelists[i]);
50 INIT_LIST_HEAD(&d->arch.shadow.p2m_freelist);
51 INIT_LIST_HEAD(&d->arch.shadow.p2m_inuse);
52 INIT_LIST_HEAD(&d->arch.shadow.pinned_shadows);
53 }
56 #if SHADOW_AUDIT
57 int shadow_audit_enable = 0;
59 static void shadow_audit_key(unsigned char key)
60 {
61 shadow_audit_enable = !shadow_audit_enable;
62 printk("%s shadow_audit_enable=%d\n",
63 __func__, shadow_audit_enable);
64 }
66 static int __init shadow_audit_key_init(void)
67 {
68 register_keyhandler(
69 'O', shadow_audit_key, "toggle shadow audits");
70 return 0;
71 }
72 __initcall(shadow_audit_key_init);
73 #endif /* SHADOW_AUDIT */
75 static void sh_free_log_dirty_bitmap(struct domain *d);
77 int _shadow_mode_refcounts(struct domain *d)
78 {
79 return shadow_mode_refcounts(d);
80 }
83 /**************************************************************************/
84 /* x86 emulator support for the shadow code
85 */
87 struct segment_register *hvm_get_seg_reg(
88 enum x86_segment seg, struct sh_emulate_ctxt *sh_ctxt)
89 {
90 struct segment_register *seg_reg = &sh_ctxt->seg_reg[seg];
91 if ( !__test_and_set_bit(seg, &sh_ctxt->valid_seg_regs) )
92 hvm_get_segment_register(current, seg, seg_reg);
93 return seg_reg;
94 }
96 enum hvm_access_type {
97 hvm_access_insn_fetch, hvm_access_read, hvm_access_write
98 };
100 static int hvm_translate_linear_addr(
101 enum x86_segment seg,
102 unsigned long offset,
103 unsigned int bytes,
104 enum hvm_access_type access_type,
105 struct sh_emulate_ctxt *sh_ctxt,
106 unsigned long *paddr)
107 {
108 struct segment_register *reg = hvm_get_seg_reg(seg, sh_ctxt);
109 unsigned long limit, addr = offset;
110 uint32_t last_byte;
112 if ( sh_ctxt->ctxt.mode != X86EMUL_MODE_PROT64 )
113 {
114 /*
115 * COMPATIBILITY MODE: Apply segment checks and add base.
116 */
118 switch ( access_type )
119 {
120 case hvm_access_read:
121 if ( (reg->attr.fields.type & 0xa) == 0x8 )
122 goto gpf; /* execute-only code segment */
123 break;
124 case hvm_access_write:
125 if ( (reg->attr.fields.type & 0xa) != 0x2 )
126 goto gpf; /* not a writable data segment */
127 break;
128 default:
129 break;
130 }
132 /* Calculate the segment limit, including granularity flag. */
133 limit = reg->limit;
134 if ( reg->attr.fields.g )
135 limit = (limit << 12) | 0xfff;
137 last_byte = offset + bytes - 1;
139 /* Is this a grows-down data segment? Special limit check if so. */
140 if ( (reg->attr.fields.type & 0xc) == 0x4 )
141 {
142 /* Is upper limit 0xFFFF or 0xFFFFFFFF? */
143 if ( !reg->attr.fields.db )
144 last_byte = (uint16_t)last_byte;
146 /* Check first byte and last byte against respective bounds. */
147 if ( (offset <= limit) || (last_byte < offset) )
148 goto gpf;
149 }
150 else if ( (last_byte > limit) || (last_byte < offset) )
151 goto gpf; /* last byte is beyond limit or wraps 0xFFFFFFFF */
153 /*
154 * Hardware truncates to 32 bits in compatibility mode.
155 * It does not truncate to 16 bits in 16-bit address-size mode.
156 */
157 addr = (uint32_t)(addr + reg->base);
158 }
159 else
160 {
161 /*
162 * LONG MODE: FS and GS add segment base. Addresses must be canonical.
163 */
165 if ( (seg == x86_seg_fs) || (seg == x86_seg_gs) )
166 addr += reg->base;
168 if ( !is_canonical_address(addr) )
169 goto gpf;
170 }
172 *paddr = addr;
173 return 0;
175 gpf:
176 /* Inject #GP(0). */
177 hvm_inject_exception(TRAP_gp_fault, 0, 0);
178 return X86EMUL_PROPAGATE_FAULT;
179 }
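/* Illustrative sketch (not part of the original file): the segment-limit
 * arithmetic used above, pulled out as a standalone check. The helper name
 * and parameters are hypothetical stand-ins for reg->attr.fields.g,
 * reg->attr.fields.db and the grows-down type test; it returns 1 if the
 * access [offset, offset+bytes) passes the limit checks, 0 if it would #GP. */
static inline int sketch_limit_check(unsigned long offset, unsigned int bytes,
                                     unsigned long raw_limit, int g_bit,
                                     int db_bit, int grows_down)
{
    unsigned long limit = raw_limit;
    unsigned long last_byte = offset + bytes - 1;

    if ( g_bit )                        /* limit is counted in 4k units */
        limit = (limit << 12) | 0xfff;

    if ( grows_down )
    {
        /* Valid offsets lie strictly above the limit, up to 64k or 4GB. */
        if ( !db_bit )
            last_byte = (uint16_t)last_byte;
        return (offset > limit) && (last_byte >= offset);
    }

    /* Normal segment: last byte must be inside the limit and must not wrap. */
    return (last_byte <= limit) && (last_byte >= offset);
}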
181 static int
182 hvm_read(enum x86_segment seg,
183 unsigned long offset,
184 unsigned long *val,
185 unsigned int bytes,
186 enum hvm_access_type access_type,
187 struct sh_emulate_ctxt *sh_ctxt)
188 {
189 unsigned long addr;
190 int rc, errcode;
192 rc = hvm_translate_linear_addr(
193 seg, offset, bytes, access_type, sh_ctxt, &addr);
194 if ( rc )
195 return rc;
197 *val = 0;
198 // XXX -- this is WRONG.
199 // It entirely ignores the permissions in the page tables.
200 // In this case, that is only a user vs supervisor access check.
201 //
202 if ( (rc = hvm_copy_from_guest_virt(val, addr, bytes)) == 0 )
203 return X86EMUL_CONTINUE;
205 /* If we got here, there was nothing mapped here, or a bad GFN
206 * was mapped here. Hand the fault back to the guest: inject a
207 * page fault for the address we failed to read. */
208 SHADOW_PRINTK("read failed to va %#lx\n", addr);
209 errcode = ring_3(sh_ctxt->ctxt.regs) ? PFEC_user_mode : 0;
210 if ( access_type == hvm_access_insn_fetch )
211 errcode |= PFEC_insn_fetch;
212 hvm_inject_exception(TRAP_page_fault, errcode, addr + bytes - rc);
213 return X86EMUL_PROPAGATE_FAULT;
214 }
216 static int
217 hvm_emulate_read(enum x86_segment seg,
218 unsigned long offset,
219 unsigned long *val,
220 unsigned int bytes,
221 struct x86_emulate_ctxt *ctxt)
222 {
223 return hvm_read(seg, offset, val, bytes, hvm_access_read,
224 container_of(ctxt, struct sh_emulate_ctxt, ctxt));
225 }
227 static int
228 hvm_emulate_insn_fetch(enum x86_segment seg,
229 unsigned long offset,
230 unsigned long *val,
231 unsigned int bytes,
232 struct x86_emulate_ctxt *ctxt)
233 {
234 struct sh_emulate_ctxt *sh_ctxt =
235 container_of(ctxt, struct sh_emulate_ctxt, ctxt);
236 unsigned int insn_off = offset - ctxt->regs->eip;
238 /* Fall back if requested bytes are not in the prefetch cache. */
239 if ( unlikely((insn_off + bytes) > sh_ctxt->insn_buf_bytes) )
240 return hvm_read(seg, offset, val, bytes,
241 hvm_access_insn_fetch, sh_ctxt);
243 /* Hit the cache. Simple memcpy. */
244 *val = 0;
245 memcpy(val, &sh_ctxt->insn_buf[insn_off], bytes);
246 return X86EMUL_CONTINUE;
247 }
249 static int
250 hvm_emulate_write(enum x86_segment seg,
251 unsigned long offset,
252 unsigned long val,
253 unsigned int bytes,
254 struct x86_emulate_ctxt *ctxt)
255 {
256 struct sh_emulate_ctxt *sh_ctxt =
257 container_of(ctxt, struct sh_emulate_ctxt, ctxt);
258 struct vcpu *v = current;
259 unsigned long addr;
260 int rc;
262 rc = hvm_translate_linear_addr(
263 seg, offset, bytes, hvm_access_write, sh_ctxt, &addr);
264 if ( rc )
265 return rc;
267 return v->arch.shadow.mode->x86_emulate_write(
268 v, addr, &val, bytes, sh_ctxt);
269 }
271 static int
272 hvm_emulate_cmpxchg(enum x86_segment seg,
273 unsigned long offset,
274 unsigned long old,
275 unsigned long new,
276 unsigned int bytes,
277 struct x86_emulate_ctxt *ctxt)
278 {
279 struct sh_emulate_ctxt *sh_ctxt =
280 container_of(ctxt, struct sh_emulate_ctxt, ctxt);
281 struct vcpu *v = current;
282 unsigned long addr;
283 int rc;
285 rc = hvm_translate_linear_addr(
286 seg, offset, bytes, hvm_access_write, sh_ctxt, &addr);
287 if ( rc )
288 return rc;
290 return v->arch.shadow.mode->x86_emulate_cmpxchg(
291 v, addr, old, new, bytes, sh_ctxt);
292 }
294 static int
295 hvm_emulate_cmpxchg8b(enum x86_segment seg,
296 unsigned long offset,
297 unsigned long old_lo,
298 unsigned long old_hi,
299 unsigned long new_lo,
300 unsigned long new_hi,
301 struct x86_emulate_ctxt *ctxt)
302 {
303 struct sh_emulate_ctxt *sh_ctxt =
304 container_of(ctxt, struct sh_emulate_ctxt, ctxt);
305 struct vcpu *v = current;
306 unsigned long addr;
307 int rc;
309 rc = hvm_translate_linear_addr(
310 seg, offset, 8, hvm_access_write, sh_ctxt, &addr);
311 if ( rc )
312 return rc;
314 return v->arch.shadow.mode->x86_emulate_cmpxchg8b(
315 v, addr, old_lo, old_hi, new_lo, new_hi, sh_ctxt);
316 }
318 static struct x86_emulate_ops hvm_shadow_emulator_ops = {
319 .read = hvm_emulate_read,
320 .insn_fetch = hvm_emulate_insn_fetch,
321 .write = hvm_emulate_write,
322 .cmpxchg = hvm_emulate_cmpxchg,
323 .cmpxchg8b = hvm_emulate_cmpxchg8b,
324 };
326 static int
327 pv_emulate_read(enum x86_segment seg,
328 unsigned long offset,
329 unsigned long *val,
330 unsigned int bytes,
331 struct x86_emulate_ctxt *ctxt)
332 {
333 unsigned int rc;
335 *val = 0;
336 if ( (rc = copy_from_user((void *)val, (void *)offset, bytes)) != 0 )
337 {
338 propagate_page_fault(offset + bytes - rc, 0); /* read fault */
339 return X86EMUL_PROPAGATE_FAULT;
340 }
342 return X86EMUL_CONTINUE;
343 }
345 static int
346 pv_emulate_write(enum x86_segment seg,
347 unsigned long offset,
348 unsigned long val,
349 unsigned int bytes,
350 struct x86_emulate_ctxt *ctxt)
351 {
352 struct sh_emulate_ctxt *sh_ctxt =
353 container_of(ctxt, struct sh_emulate_ctxt, ctxt);
354 struct vcpu *v = current;
355 return v->arch.shadow.mode->x86_emulate_write(
356 v, offset, &val, bytes, sh_ctxt);
357 }
359 static int
360 pv_emulate_cmpxchg(enum x86_segment seg,
361 unsigned long offset,
362 unsigned long old,
363 unsigned long new,
364 unsigned int bytes,
365 struct x86_emulate_ctxt *ctxt)
366 {
367 struct sh_emulate_ctxt *sh_ctxt =
368 container_of(ctxt, struct sh_emulate_ctxt, ctxt);
369 struct vcpu *v = current;
370 return v->arch.shadow.mode->x86_emulate_cmpxchg(
371 v, offset, old, new, bytes, sh_ctxt);
372 }
374 static int
375 pv_emulate_cmpxchg8b(enum x86_segment seg,
376 unsigned long offset,
377 unsigned long old_lo,
378 unsigned long old_hi,
379 unsigned long new_lo,
380 unsigned long new_hi,
381 struct x86_emulate_ctxt *ctxt)
382 {
383 struct sh_emulate_ctxt *sh_ctxt =
384 container_of(ctxt, struct sh_emulate_ctxt, ctxt);
385 struct vcpu *v = current;
386 return v->arch.shadow.mode->x86_emulate_cmpxchg8b(
387 v, offset, old_lo, old_hi, new_lo, new_hi, sh_ctxt);
388 }
390 static struct x86_emulate_ops pv_shadow_emulator_ops = {
391 .read = pv_emulate_read,
392 .insn_fetch = pv_emulate_read,
393 .write = pv_emulate_write,
394 .cmpxchg = pv_emulate_cmpxchg,
395 .cmpxchg8b = pv_emulate_cmpxchg8b,
396 };
398 struct x86_emulate_ops *shadow_init_emulation(
399 struct sh_emulate_ctxt *sh_ctxt, struct cpu_user_regs *regs)
400 {
401 struct segment_register *creg;
402 struct vcpu *v = current;
403 unsigned long addr;
405 sh_ctxt->ctxt.regs = regs;
407 if ( !is_hvm_vcpu(v) )
408 {
409 sh_ctxt->ctxt.mode = X86EMUL_MODE_HOST;
410 return &pv_shadow_emulator_ops;
411 }
413 /* Segment cache initialisation. Primed with CS. */
414 sh_ctxt->valid_seg_regs = 0;
415 creg = hvm_get_seg_reg(x86_seg_cs, sh_ctxt);
417 /* Work out the emulation mode. */
418 if ( hvm_long_mode_enabled(v) )
419 sh_ctxt->ctxt.mode = creg->attr.fields.l ?
420 X86EMUL_MODE_PROT64 : X86EMUL_MODE_PROT32;
421 else if ( regs->eflags & X86_EFLAGS_VM )
422 sh_ctxt->ctxt.mode = X86EMUL_MODE_REAL;
423 else
424 sh_ctxt->ctxt.mode = creg->attr.fields.db ?
425 X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
427 /* Attempt to prefetch whole instruction. */
428 sh_ctxt->insn_buf_bytes =
429 (!hvm_translate_linear_addr(
430 x86_seg_cs, regs->eip, sizeof(sh_ctxt->insn_buf),
431 hvm_access_insn_fetch, sh_ctxt, &addr) &&
432 !hvm_copy_from_guest_virt(
433 sh_ctxt->insn_buf, addr, sizeof(sh_ctxt->insn_buf)))
434 ? sizeof(sh_ctxt->insn_buf) : 0;
436 return &hvm_shadow_emulator_ops;
437 }
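/* Illustrative caller pattern (a sketch, not the exact code in multi.c):
 * the emulation path builds a shadow emulation context, asks this function
 * for the matching ops table, and hands both to the x86 emulator:
 *
 *     struct sh_emulate_ctxt emul_ctxt;
 *     struct x86_emulate_ops *emul_ops;
 *
 *     emul_ops = shadow_init_emulation(&emul_ctxt, regs);
 *     ... pass &emul_ctxt.ctxt and emul_ops to the emulator ...
 */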
439 /**************************************************************************/
440 /* Code for "promoting" a guest page to the point where the shadow code is
441 * willing to let it be treated as a guest page table. This generally
442 * involves making sure there are no writable mappings available to the guest
443 * for this page.
444 */
445 void shadow_promote(struct vcpu *v, mfn_t gmfn, unsigned int type)
446 {
447 struct page_info *page = mfn_to_page(gmfn);
449 ASSERT(mfn_valid(gmfn));
451 /* We should never try to promote a gmfn that has writeable mappings */
452 ASSERT(sh_remove_write_access(v, gmfn, 0, 0) == 0);
454 /* Is the page already shadowed? */
455 if ( !test_and_set_bit(_PGC_page_table, &page->count_info) )
456 page->shadow_flags = 0;
458 ASSERT(!test_bit(type, &page->shadow_flags));
459 set_bit(type, &page->shadow_flags);
460 }
462 void shadow_demote(struct vcpu *v, mfn_t gmfn, u32 type)
463 {
464 struct page_info *page = mfn_to_page(gmfn);
466 ASSERT(test_bit(_PGC_page_table, &page->count_info));
467 ASSERT(test_bit(type, &page->shadow_flags));
469 clear_bit(type, &page->shadow_flags);
471 if ( (page->shadow_flags & SHF_page_type_mask) == 0 )
472 {
473 /* tlbflush timestamp field is valid again */
474 page->tlbflush_timestamp = tlbflush_current_time();
475 clear_bit(_PGC_page_table, &page->count_info);
476 }
477 }
479 /**************************************************************************/
480 /* Validate a pagetable change from the guest and update the shadows.
481 * Returns a bitmask of SHADOW_SET_* flags. */
483 int
484 sh_validate_guest_entry(struct vcpu *v, mfn_t gmfn, void *entry, u32 size)
485 {
486 int result = 0;
487 struct page_info *page = mfn_to_page(gmfn);
489 sh_mark_dirty(v->domain, gmfn);
491 // Determine which types of shadows are affected, and update each.
492 //
493 // Always validate L1s before L2s to prevent another cpu with a linear
494 // mapping of this gmfn from seeing a walk that results from
495 // using the new L2 value and the old L1 value. (It is OK for such a
496 // guest to see a walk that uses the old L2 value with the new L1 value,
497 // as hardware could behave this way if one level of the pagewalk occurs
498 // before the store, and the next level of the pagewalk occurs after the
499 // store.
500 //
501 // Ditto for L2s before L3s, etc.
502 //
504 if ( !(page->count_info & PGC_page_table) )
505 return 0; /* Not shadowed at all */
507 #if CONFIG_PAGING_LEVELS == 2
508 if ( page->shadow_flags & SHF_L1_32 )
509 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 2, 2)
510 (v, gmfn, entry, size);
511 #else
512 if ( page->shadow_flags & SHF_L1_32 )
513 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 3, 2)
514 (v, gmfn, entry, size);
515 #endif
517 #if CONFIG_PAGING_LEVELS == 2
518 if ( page->shadow_flags & SHF_L2_32 )
519 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 2, 2)
520 (v, gmfn, entry, size);
521 #else
522 if ( page->shadow_flags & SHF_L2_32 )
523 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 3, 2)
524 (v, gmfn, entry, size);
525 #endif
527 #if CONFIG_PAGING_LEVELS >= 3
528 if ( page->shadow_flags & SHF_L1_PAE )
529 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 3, 3)
530 (v, gmfn, entry, size);
531 if ( page->shadow_flags & SHF_L2_PAE )
532 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 3, 3)
533 (v, gmfn, entry, size);
534 if ( page->shadow_flags & SHF_L2H_PAE )
535 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2he, 3, 3)
536 (v, gmfn, entry, size);
537 #else /* 32-bit non-PAE hypervisor does not support PAE guests */
538 ASSERT((page->shadow_flags & (SHF_L2H_PAE|SHF_L2_PAE|SHF_L1_PAE)) == 0);
539 #endif
541 #if CONFIG_PAGING_LEVELS >= 4
542 if ( page->shadow_flags & SHF_L1_64 )
543 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 4, 4)
544 (v, gmfn, entry, size);
545 if ( page->shadow_flags & SHF_L2_64 )
546 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 4, 4)
547 (v, gmfn, entry, size);
548 if ( page->shadow_flags & SHF_L3_64 )
549 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl3e, 4, 4)
550 (v, gmfn, entry, size);
551 if ( page->shadow_flags & SHF_L4_64 )
552 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl4e, 4, 4)
553 (v, gmfn, entry, size);
554 #else /* 32-bit/PAE hypervisor does not support 64-bit guests */
555 ASSERT((page->shadow_flags
556 & (SHF_L4_64|SHF_L3_64|SHF_L2_64|SHF_L1_64)) == 0);
557 #endif
559 return result;
560 }
563 void
564 sh_validate_guest_pt_write(struct vcpu *v, mfn_t gmfn,
565 void *entry, u32 size)
566 /* This is the entry point for emulated writes to pagetables in HVM guests and
567 * PV translated guests.
568 */
569 {
570 struct domain *d = v->domain;
571 int rc;
573 ASSERT(shadow_locked_by_me(v->domain));
574 rc = sh_validate_guest_entry(v, gmfn, entry, size);
575 if ( rc & SHADOW_SET_FLUSH )
576 /* Need to flush TLBs to pick up shadow PT changes */
577 flush_tlb_mask(d->domain_dirty_cpumask);
578 if ( rc & SHADOW_SET_ERROR )
579 {
580 /* This page is probably not a pagetable any more: tear it out of the
581 * shadows, along with any tables that reference it.
582 * Since the validate call above will have made a "safe" (i.e. zero)
583 * shadow entry, we can let the domain live even if we can't fully
584 * unshadow the page. */
585 sh_remove_shadows(v, gmfn, 0, 0);
586 }
587 }
589 int shadow_write_guest_entry(struct vcpu *v, intpte_t *p,
590 intpte_t new, mfn_t gmfn)
591 /* Write a new value into the guest pagetable, and update the shadows
592 * appropriately. Returns 0 if we page-faulted, 1 for success. */
593 {
594 int failed;
595 shadow_lock(v->domain);
596 failed = __copy_to_user(p, &new, sizeof(new));
597 if ( failed != sizeof(new) )
598 sh_validate_guest_entry(v, gmfn, p, sizeof(new));
599 shadow_unlock(v->domain);
600 return (failed == 0);
601 }
603 int shadow_cmpxchg_guest_entry(struct vcpu *v, intpte_t *p,
604 intpte_t *old, intpte_t new, mfn_t gmfn)
605 /* Cmpxchg a new value into the guest pagetable, and update the shadows
606 * appropriately. Returns 0 if we page-faulted, 1 if not.
607 * N.B. caller should check the value of "old" to see if the
608 * cmpxchg itself was successful. */
609 {
610 int failed;
611 intpte_t t = *old;
612 shadow_lock(v->domain);
613 failed = cmpxchg_user(p, t, new);
614 if ( t == *old )
615 sh_validate_guest_entry(v, gmfn, p, sizeof(new));
616 *old = t;
617 shadow_unlock(v->domain);
618 return (failed == 0);
619 }
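/* Illustrative caller pattern for the helper above (a sketch, not code from
 * the Xen tree), where "prev" is the value the caller expects the guest
 * entry to hold:
 *
 *     intpte_t prev = ...;
 *     intpte_t seen = prev;
 *     if ( !shadow_cmpxchg_guest_entry(v, p, &seen, new, gmfn) )
 *         ... we page-faulted writing the guest entry ...
 *     else if ( seen != prev )
 *         ... cmpxchg lost a race: the entry held "seen", nothing written ...
 *     else
 *         ... success: the entry now holds "new" and the shadows are synced ...
 */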
622 /**************************************************************************/
623 /* Memory management for shadow pages. */
625 /* Allocating shadow pages
626 * -----------------------
627 *
628 * Most shadow pages are allocated singly, but there is one case where
629 * we need to allocate multiple pages together: shadowing 32-bit guest
630 * tables on PAE or 64-bit shadows. A 32-bit guest l1 table covers 4MB
631 * of virtual address space, and needs to be shadowed by two PAE/64-bit
632 * l1 tables (covering 2MB of virtual address space each). Similarly, a
633 * 32-bit guest l2 table (4GB va) needs to be shadowed by four
634 * PAE/64-bit l2 tables (1GB va each). These multi-page shadows are
635 * contiguous and aligned; functions for handling offsets into them are
636 * defined in shadow.c (shadow_l1_index() etc.)
637 *
638 * This table shows the allocation behaviour of the different modes:
639 *
640 * Xen paging      32b  pae  pae  64b  64b  64b
641 * Guest paging    32b  32b  pae  32b  pae  64b
642 * PV or HVM        *   HVM   *   HVM  HVM   *
643 * Shadow paging   32b  pae  pae  pae  pae  64b
644 *
645 * sl1 size         4k   8k   4k   8k   4k   4k
646 * sl2 size         4k  16k   4k  16k   4k   4k
647 * sl3 size          -    -    -    -    -   4k
648 * sl4 size          -    -    -    -    -   4k
649 *
650 * We allocate memory from xen in four-page units and break them down
651 * with a simple buddy allocator. Can't use the xen allocator to handle
652 * this as it only works for contiguous zones, and a domain's shadow
653 * pool is made of fragments.
654 *
655 * In HVM guests, the p2m table is built out of shadow pages, and we provide
656 * a function for the p2m management to steal pages, in max-order chunks, from
657 * the free pool. We don't provide for giving them back, yet.
658 */
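/* Illustrative sketch of the multi-page case described above: entry i of a
 * 32-bit guest l1 (1024 entries, 4MB) lands in one of the two contiguous
 * PAE/64-bit shadow l1 pages (512 entries each). This is roughly what the
 * shadow_l1_index() helper mentioned above computes; the names here are
 * hypothetical. */
static inline unsigned int sketch_sl1_page(unsigned int gl1_index)
{
    return gl1_index / 512;   /* which of the two 4k shadow pages */
}
static inline unsigned int sketch_sl1_slot(unsigned int gl1_index)
{
    return gl1_index % 512;   /* slot within that shadow page */
}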
660 /* Figure out the least acceptable quantity of shadow memory.
661 * The minimum memory requirement for always being able to free up a
662 * chunk of memory is very small -- only three max-order chunks per
663 * vcpu to hold the top level shadows and pages with Xen mappings in them.
664 *
665 * But for a guest to be guaranteed to successfully execute a single
666 * instruction, we must be able to map a large number (about thirty) VAs
667 * at the same time, which means that to guarantee progress, we must
668 * allow for more than ninety allocated pages per vcpu. We round that
669 * up to 128 pages, or half a megabyte per vcpu. */
670 unsigned int shadow_min_acceptable_pages(struct domain *d)
671 {
672 u32 vcpu_count = 0;
673 struct vcpu *v;
675 for_each_vcpu(d, v)
676 vcpu_count++;
678 return (vcpu_count * 128);
679 }
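/* Worked example of the bound above: a 4-vcpu domain gets a floor of
 * 4 * 128 = 512 shadow pages (2MB); sh_set_allocation() later adds the
 * per-megabyte p2m allowance on top of this minimum. */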
681 /* Figure out the order of allocation needed for a given shadow type */
682 static inline u32
683 shadow_order(unsigned int shadow_type)
684 {
685 #if CONFIG_PAGING_LEVELS > 2
686 static const u32 type_to_order[16] = {
687 0, /* SH_type_none */
688 1, /* SH_type_l1_32_shadow */
689 1, /* SH_type_fl1_32_shadow */
690 2, /* SH_type_l2_32_shadow */
691 0, /* SH_type_l1_pae_shadow */
692 0, /* SH_type_fl1_pae_shadow */
693 0, /* SH_type_l2_pae_shadow */
694 0, /* SH_type_l2h_pae_shadow */
695 0, /* SH_type_l1_64_shadow */
696 0, /* SH_type_fl1_64_shadow */
697 0, /* SH_type_l2_64_shadow */
698 0, /* SH_type_l3_64_shadow */
699 0, /* SH_type_l4_64_shadow */
700 2, /* SH_type_p2m_table */
701 0 /* SH_type_monitor_table */
702 };
703 ASSERT(shadow_type < 16);
704 return type_to_order[shadow_type];
705 #else /* 32-bit Xen only ever shadows 32-bit guests on 32-bit shadows. */
706 return 0;
707 #endif
708 }
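/* Example of the mapping above: shadow_order(SH_type_l2_32_shadow) == 2,
 * i.e. a 16k allocation -- the "four-page units" the allocator comment
 * earlier refers to -- and the 32-bit l1/fl1 shadows are order 1 (8k),
 * while every PAE/64-bit shadow type is a single 4k page (order 0). */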
711 /* Do we have a free chunk of at least this order? */
712 static inline int chunk_is_available(struct domain *d, int order)
713 {
714 int i;
716 for ( i = order; i <= SHADOW_MAX_ORDER; i++ )
717 if ( !list_empty(&d->arch.shadow.freelists[i]) )
718 return 1;
719 return 0;
720 }
722 /* Dispatcher function: call the per-mode function that will unhook the
723 * non-Xen mappings in this top-level shadow mfn */
724 void shadow_unhook_mappings(struct vcpu *v, mfn_t smfn)
725 {
726 struct shadow_page_info *sp = mfn_to_shadow_page(smfn);
727 switch ( sp->type )
728 {
729 case SH_type_l2_32_shadow:
730 #if CONFIG_PAGING_LEVELS == 2
731 SHADOW_INTERNAL_NAME(sh_unhook_32b_mappings,2,2)(v,smfn);
732 #else
733 SHADOW_INTERNAL_NAME(sh_unhook_32b_mappings,3,2)(v,smfn);
734 #endif
735 break;
736 #if CONFIG_PAGING_LEVELS >= 3
737 case SH_type_l2_pae_shadow:
738 case SH_type_l2h_pae_shadow:
739 SHADOW_INTERNAL_NAME(sh_unhook_pae_mappings,3,3)(v,smfn);
740 break;
741 #endif
742 #if CONFIG_PAGING_LEVELS >= 4
743 case SH_type_l4_64_shadow:
744 SHADOW_INTERNAL_NAME(sh_unhook_64b_mappings,4,4)(v,smfn);
745 break;
746 #endif
747 default:
748 SHADOW_PRINTK("top-level shadow has bad type %08x\n", sp->type);
749 BUG();
750 }
751 }
754 /* Make sure there is at least one chunk of the required order available
755 * in the shadow page pool. This must be called before any calls to
756 * shadow_alloc(). Since this will free existing shadows to make room,
757 * it must be called early enough to avoid freeing shadows that the
758 * caller is currently working on. */
759 void shadow_prealloc(struct domain *d, unsigned int order)
760 {
761 /* Need a vcpu for calling unpins; for now, since we don't have
762 * per-vcpu shadows, any will do */
763 struct vcpu *v, *v2;
764 struct list_head *l, *t;
765 struct shadow_page_info *sp;
766 cpumask_t flushmask = CPU_MASK_NONE;
767 mfn_t smfn;
768 int i;
770 if ( chunk_is_available(d, order) ) return;
772 v = current;
773 if ( v->domain != d )
774 v = d->vcpu[0];
775 ASSERT(v != NULL);
777 /* Stage one: walk the list of pinned pages, unpinning them */
778 perfc_incrc(shadow_prealloc_1);
779 list_for_each_backwards_safe(l, t, &d->arch.shadow.pinned_shadows)
780 {
781 sp = list_entry(l, struct shadow_page_info, list);
782 smfn = shadow_page_to_mfn(sp);
784 /* Unpin this top-level shadow */
785 sh_unpin(v, smfn);
787 /* See if that freed up a chunk of appropriate size */
788 if ( chunk_is_available(d, order) ) return;
789 }
791 /* Stage two: all shadow pages are in use in hierarchies that are
792 * loaded in cr3 on some vcpu. Walk them, unhooking the non-Xen
793 * mappings. */
794 perfc_incrc(shadow_prealloc_2);
796 for_each_vcpu(d, v2)
797 for ( i = 0 ; i < 4 ; i++ )
798 {
799 if ( !pagetable_is_null(v2->arch.shadow_table[i]) )
800 {
801 shadow_unhook_mappings(v,
802 pagetable_get_mfn(v2->arch.shadow_table[i]));
803 cpus_or(flushmask, v2->vcpu_dirty_cpumask, flushmask);
805 /* See if that freed up a chunk of appropriate size */
806 if ( chunk_is_available(d, order) )
807 {
808 flush_tlb_mask(flushmask);
809 return;
810 }
811 }
812 }
814 /* Nothing more we can do: all remaining shadows are of pages that
815 * hold Xen mappings for some vcpu. This can never happen. */
816 SHADOW_PRINTK("Can't pre-allocate %i shadow pages!\n"
817 " shadow pages total = %u, free = %u, p2m=%u\n",
818 1 << order,
819 d->arch.shadow.total_pages,
820 d->arch.shadow.free_pages,
821 d->arch.shadow.p2m_pages);
822 BUG();
823 }
825 /* Deliberately free all the memory we can: this will tear down all of
826 * this domain's shadows */
827 static void shadow_blow_tables(struct domain *d)
828 {
829 struct list_head *l, *t;
830 struct shadow_page_info *sp;
831 struct vcpu *v = d->vcpu[0];
832 mfn_t smfn;
833 int i;
835 /* Pass one: unpin all pinned pages */
836 list_for_each_backwards_safe(l,t, &d->arch.shadow.pinned_shadows)
837 {
838 sp = list_entry(l, struct shadow_page_info, list);
839 smfn = shadow_page_to_mfn(sp);
840 sh_unpin(v, smfn);
841 }
843 /* Second pass: unhook entries of in-use shadows */
844 for_each_vcpu(d, v)
845 for ( i = 0 ; i < 4 ; i++ )
846 if ( !pagetable_is_null(v->arch.shadow_table[i]) )
847 shadow_unhook_mappings(v,
848 pagetable_get_mfn(v->arch.shadow_table[i]));
850 /* Make sure everyone sees the unshadowings */
851 flush_tlb_mask(d->domain_dirty_cpumask);
852 }
855 #ifndef NDEBUG
856 /* Blow all shadows of all shadowed domains: this can be used to cause the
857 * guest's pagetables to be re-shadowed if we suspect that the shadows
858 * have somehow got out of sync */
859 static void shadow_blow_all_tables(unsigned char c)
860 {
861 struct domain *d;
862 printk("'%c' pressed -> blowing all shadow tables\n", c);
863 for_each_domain(d)
864 if ( shadow_mode_enabled(d) && d->vcpu[0] != NULL )
865 {
866 shadow_lock(d);
867 shadow_blow_tables(d);
868 shadow_unlock(d);
869 }
870 }
872 /* Register this function in the Xen console keypress table */
873 static __init int shadow_blow_tables_keyhandler_init(void)
874 {
875 register_keyhandler('S', shadow_blow_all_tables,"reset shadow pagetables");
876 return 0;
877 }
878 __initcall(shadow_blow_tables_keyhandler_init);
879 #endif /* !NDEBUG */
881 /* Allocate another shadow's worth of (contiguous, aligned) pages,
882 * and fill in the type and backpointer fields of their page_infos.
883 * Never fails to allocate. */
884 mfn_t shadow_alloc(struct domain *d,
885 u32 shadow_type,
886 unsigned long backpointer)
887 {
888 struct shadow_page_info *sp = NULL;
889 unsigned int order = shadow_order(shadow_type);
890 cpumask_t mask;
891 void *p;
892 int i;
894 ASSERT(shadow_locked_by_me(d));
895 ASSERT(order <= SHADOW_MAX_ORDER);
896 ASSERT(shadow_type != SH_type_none);
897 perfc_incrc(shadow_alloc);
899 /* Find smallest order which can satisfy the request. */
900 for ( i = order; i <= SHADOW_MAX_ORDER; i++ )
901 if ( !list_empty(&d->arch.shadow.freelists[i]) )
902 {
903 sp = list_entry(d->arch.shadow.freelists[i].next,
904 struct shadow_page_info, list);
905 list_del(&sp->list);
907 /* We may have to halve the chunk a number of times. */
908 while ( i != order )
909 {
910 i--;
911 sp->order = i;
912 list_add_tail(&sp->list, &d->arch.shadow.freelists[i]);
913 sp += 1 << i;
914 }
915 d->arch.shadow.free_pages -= 1 << order;
917 /* Init page info fields and clear the pages */
918 for ( i = 0; i < 1<<order ; i++ )
919 {
920 /* Before we overwrite the old contents of this page,
921 * we need to be sure that no TLB holds a pointer to it. */
922 mask = d->domain_dirty_cpumask;
923 tlbflush_filter(mask, sp[i].tlbflush_timestamp);
924 if ( unlikely(!cpus_empty(mask)) )
925 {
926 perfc_incrc(shadow_alloc_tlbflush);
927 flush_tlb_mask(mask);
928 }
929 /* Now safe to clear the page for reuse */
930 p = sh_map_domain_page(shadow_page_to_mfn(sp+i));
931 ASSERT(p != NULL);
932 clear_page(p);
933 sh_unmap_domain_page(p);
934 INIT_LIST_HEAD(&sp[i].list);
935 sp[i].type = shadow_type;
936 sp[i].pinned = 0;
937 sp[i].logdirty = 0;
938 sp[i].count = 0;
939 sp[i].backpointer = backpointer;
940 sp[i].next_shadow = NULL;
941 perfc_incr(shadow_alloc_count);
942 }
943 return shadow_page_to_mfn(sp);
944 }
946 /* If we get here, we failed to allocate. This should never happen.
947 * It means that we didn't call shadow_prealloc() correctly before
948 * we allocated. We can't recover by calling prealloc here, because
949 * we might free up higher-level pages that the caller is working on. */
950 SHADOW_PRINTK("Can't allocate %i shadow pages!\n", 1 << order);
951 BUG();
952 }
955 /* Return some shadow pages to the pool. */
956 void shadow_free(struct domain *d, mfn_t smfn)
957 {
958 struct shadow_page_info *sp = mfn_to_shadow_page(smfn);
959 u32 shadow_type;
960 unsigned long order;
961 unsigned long mask;
962 int i;
964 ASSERT(shadow_locked_by_me(d));
965 perfc_incrc(shadow_free);
967 shadow_type = sp->type;
968 ASSERT(shadow_type != SH_type_none);
969 ASSERT(shadow_type != SH_type_p2m_table);
970 order = shadow_order(shadow_type);
972 d->arch.shadow.free_pages += 1 << order;
974 for ( i = 0; i < 1<<order; i++ )
975 {
976 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
977 struct vcpu *v;
978 for_each_vcpu(d, v)
979 {
980 /* No longer safe to look for a writeable mapping in this shadow */
981 if ( v->arch.shadow.last_writeable_pte_smfn == mfn_x(smfn) + i )
982 v->arch.shadow.last_writeable_pte_smfn = 0;
983 }
984 #endif
985 /* Strip out the type: this is now a free shadow page */
986 sp[i].type = 0;
987 /* Remember the TLB timestamp so we will know whether to flush
988 * TLBs when we reuse the page. Because the destructors leave the
989 * contents of the pages in place, we can delay TLB flushes until
990 * just before the allocator hands the page out again. */
991 sp[i].tlbflush_timestamp = tlbflush_current_time();
992 perfc_decr(shadow_alloc_count);
993 }
995 /* Merge chunks as far as possible. */
996 while ( order < SHADOW_MAX_ORDER )
997 {
998 mask = 1 << order;
999 if ( (mfn_x(shadow_page_to_mfn(sp)) & mask) ) {
1000 /* Merge with predecessor block? */
1001 if ( ((sp-mask)->type != PGT_none) || ((sp-mask)->order != order) )
1002 break;
1003 list_del(&(sp-mask)->list);
1004 sp -= mask;
1005 } else {
1006 /* Merge with successor block? */
1007 if ( ((sp+mask)->type != PGT_none) || ((sp+mask)->order != order) )
1008 break;
1009 list_del(&(sp+mask)->list);
1010 }
1011 order++;
1012 }
1014 sp->order = order;
1015 list_add_tail(&sp->list, &d->arch.shadow.freelists[order]);
1016 }
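/* Illustrative sketch of the buddy arithmetic in the merge loop above: a
 * free block of 2^order pages starting at mfn has its buddy at
 * mfn ^ (1 << order), so the (mfn & (1 << order)) test simply asks whether
 * that buddy lies below us (predecessor) or above us (successor). The
 * helper name is hypothetical. */
static inline unsigned long sketch_buddy_mfn(unsigned long mfn, unsigned int order)
{
    return mfn ^ (1UL << order);
}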
1018 /* Divert some memory from the pool to be used by the p2m mapping.
1019 * This action is irreversible: the p2m mapping only ever grows.
1020 * That's OK because the p2m table only exists for translated domains,
1021 * and those domains can't ever turn off shadow mode.
1022 * Also, we only ever allocate a max-order chunk, so as to preserve
1023 * the invariant that shadow_prealloc() always works.
1024 * Returns 0 iff it can't get a chunk (the caller should then
1025 * free up some pages in domheap and call sh_set_allocation);
1026 * returns non-zero on success.
1027 */
1028 static int
1029 shadow_alloc_p2m_pages(struct domain *d)
1030 {
1031 struct page_info *pg;
1032 u32 i;
1033 ASSERT(shadow_locked_by_me(d));
1035 if ( d->arch.shadow.total_pages
1036 < (shadow_min_acceptable_pages(d) + (1<<SHADOW_MAX_ORDER)) )
1037 return 0; /* Not enough shadow memory: need to increase it first */
1039 pg = mfn_to_page(shadow_alloc(d, SH_type_p2m_table, 0));
1040 d->arch.shadow.p2m_pages += (1<<SHADOW_MAX_ORDER);
1041 d->arch.shadow.total_pages -= (1<<SHADOW_MAX_ORDER);
1042 for (i = 0; i < (1<<SHADOW_MAX_ORDER); i++)
1043 {
1044 /* Unlike shadow pages, mark p2m pages as owned by the domain.
1045 * Marking the domain as the owner would normally allow the guest to
1046 * create mappings of these pages, but these p2m pages will never be
1047 * in the domain's guest-physical address space, and so that is not
1048 * believed to be a concern.
1049 */
1050 page_set_owner(&pg[i], d);
1051 list_add_tail(&pg[i].list, &d->arch.shadow.p2m_freelist);
1052 }
1053 return 1;
1054 }
1056 // Returns 0 if no memory is available...
1057 mfn_t
1058 shadow_alloc_p2m_page(struct domain *d)
1059 {
1060 struct list_head *entry;
1061 struct page_info *pg;
1062 mfn_t mfn;
1063 void *p;
1065 if ( list_empty(&d->arch.shadow.p2m_freelist) &&
1066 !shadow_alloc_p2m_pages(d) )
1067 return _mfn(0);
1068 entry = d->arch.shadow.p2m_freelist.next;
1069 list_del(entry);
1070 list_add_tail(entry, &d->arch.shadow.p2m_inuse);
1071 pg = list_entry(entry, struct page_info, list);
1072 pg->count_info = 1;
1073 mfn = page_to_mfn(pg);
1074 p = sh_map_domain_page(mfn);
1075 clear_page(p);
1076 sh_unmap_domain_page(p);
1078 return mfn;
1079 }
1081 #if CONFIG_PAGING_LEVELS == 3
1082 static void p2m_install_entry_in_monitors(struct domain *d,
1083 l3_pgentry_t *l3e)
1084 /* Special case, only used for external-mode domains on PAE hosts:
1085 * update the mapping of the p2m table. Once again, this is trivial in
1086 * other paging modes (one top-level entry points to the top-level p2m,
1087 * no maintenance needed), but PAE makes life difficult by needing a
1088 * copy of the eight l3es of the p2m table in eight l2h slots in the
1089 * monitor table. This function makes fresh copies when a p2m l3e
1090 * changes. */
1091 {
1092 l2_pgentry_t *ml2e;
1093 struct vcpu *v;
1094 unsigned int index;
1096 index = ((unsigned long)l3e & ~PAGE_MASK) / sizeof(l3_pgentry_t);
1097 ASSERT(index < MACHPHYS_MBYTES>>1);
1099 for_each_vcpu(d, v)
1100 {
1101 if ( pagetable_get_pfn(v->arch.monitor_table) == 0 )
1102 continue;
1103 ASSERT(shadow_mode_external(v->domain));
1105 SHADOW_DEBUG(P2M, "d=%u v=%u index=%u mfn=%#lx\n",
1106 d->domain_id, v->vcpu_id, index, l3e_get_pfn(*l3e));
1108 if ( v == current ) /* OK to use linear map of monitor_table */
1109 ml2e = __linear_l2_table + l2_linear_offset(RO_MPT_VIRT_START);
1110 else
1111 {
1112 l3_pgentry_t *ml3e;
1113 ml3e = sh_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
1114 ASSERT(l3e_get_flags(ml3e[3]) & _PAGE_PRESENT);
1115 ml2e = sh_map_domain_page(_mfn(l3e_get_pfn(ml3e[3])));
1116 ml2e += l2_table_offset(RO_MPT_VIRT_START);
1117 sh_unmap_domain_page(ml3e);
1118 }
1119 ml2e[index] = l2e_from_pfn(l3e_get_pfn(*l3e), __PAGE_HYPERVISOR);
1120 if ( v != current )
1121 sh_unmap_domain_page(ml2e);
1122 }
1123 }
1124 #endif
1126 // Find the next level's P2M entry, checking for out-of-range gfn's...
1127 // Returns NULL on error.
1128 //
1129 static l1_pgentry_t *
1130 p2m_find_entry(void *table, unsigned long *gfn_remainder,
1131 unsigned long gfn, u32 shift, u32 max)
1132 {
1133 u32 index;
1135 index = *gfn_remainder >> shift;
1136 if ( index >= max )
1137 {
1138 SHADOW_DEBUG(P2M, "gfn=0x%lx out of range "
1139 "(gfn_remainder=0x%lx shift=%d index=0x%x max=0x%x)\n",
1140 gfn, *gfn_remainder, shift, index, max);
1141 return NULL;
1142 }
1143 *gfn_remainder &= (1 << shift) - 1;
1144 return (l1_pgentry_t *)table + index;
1145 }
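/* Worked example of the arithmetic above for the PAE/64-bit levels, where
 * each level below the top consumes 9 bits of the gfn
 * (L2_PAGETABLE_SHIFT - PAGE_SHIFT == 9): for gfn 0x12345,
 *     l2 index = 0x12345 >> 9 = 0x91
 *     l1 index = 0x12345 & 0x1ff = 0x145
 * p2m_find_entry() computes the same thing, but keeps a running
 * gfn_remainder so each level's leftover bits feed the next call. */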
1147 // Walk one level of the P2M table, allocating a new table if required.
1148 // Returns 0 on error.
1149 //
1150 static int
1151 p2m_next_level(struct domain *d, mfn_t *table_mfn, void **table,
1152 unsigned long *gfn_remainder, unsigned long gfn, u32 shift,
1153 u32 max, unsigned long type)
1154 {
1155 l1_pgentry_t *p2m_entry;
1156 void *next;
1158 if ( !(p2m_entry = p2m_find_entry(*table, gfn_remainder, gfn,
1159 shift, max)) )
1160 return 0;
1162 if ( !(l1e_get_flags(*p2m_entry) & _PAGE_PRESENT) )
1163 {
1164 mfn_t mfn = shadow_alloc_p2m_page(d);
1165 if ( mfn_x(mfn) == 0 )
1166 return 0;
1167 *p2m_entry = l1e_from_pfn(mfn_x(mfn), __PAGE_HYPERVISOR|_PAGE_USER);
1168 mfn_to_page(mfn)->u.inuse.type_info = type | 1 | PGT_validated;
1169 mfn_to_page(mfn)->count_info = 1;
1170 #if CONFIG_PAGING_LEVELS == 3
1171 if (type == PGT_l2_page_table)
1172 {
1173 struct vcpu *v;
1174 /* We have written to the p2m l3: need to sync the per-vcpu
1175 * copies of it in the monitor tables */
1176 p2m_install_entry_in_monitors(d, (l3_pgentry_t *)p2m_entry);
1177 /* Also, any vcpus running on shadows of the p2m need to
1178 * reload their CR3s so the change propagates to the shadow */
1179 ASSERT(shadow_locked_by_me(d));
1180 for_each_vcpu(d, v)
1181 {
1182 if ( pagetable_get_pfn(v->arch.guest_table)
1183 == pagetable_get_pfn(d->arch.phys_table)
1184 && v->arch.shadow.mode != NULL )
1185 v->arch.shadow.mode->update_cr3(v, 0);
1186 }
1187 }
1188 #endif
1189 /* The P2M can be shadowed: keep the shadows synced */
1190 if ( d->vcpu[0] != NULL )
1191 (void)sh_validate_guest_entry(d->vcpu[0], *table_mfn,
1192 p2m_entry, sizeof *p2m_entry);
1193 }
1194 *table_mfn = _mfn(l1e_get_pfn(*p2m_entry));
1195 next = sh_map_domain_page(*table_mfn);
1196 sh_unmap_domain_page(*table);
1197 *table = next;
1199 return 1;
1200 }
1202 // Returns 0 on error (out of memory)
1203 int
1204 shadow_set_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn)
1205 {
1206 // XXX -- this might be able to be faster iff current->domain == d
1207 mfn_t table_mfn = pagetable_get_mfn(d->arch.phys_table);
1208 void *table = sh_map_domain_page(table_mfn);
1209 unsigned long gfn_remainder = gfn;
1210 l1_pgentry_t *p2m_entry;
1211 int rv=0;
1213 #if CONFIG_PAGING_LEVELS >= 4
1214 if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn,
1215 L4_PAGETABLE_SHIFT - PAGE_SHIFT,
1216 L4_PAGETABLE_ENTRIES, PGT_l3_page_table) )
1217 goto out;
1218 #endif
1219 #if CONFIG_PAGING_LEVELS >= 3
1220 // When using PAE Xen, we only allow 33 bits of pseudo-physical
1221 // address in translated guests (i.e. 8 GBytes). This restriction
1222 // comes from wanting to map the P2M table into the 16MB RO_MPT hole
1223 // in Xen's address space for translated PV guests.
1224 //
1225 if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn,
1226 L3_PAGETABLE_SHIFT - PAGE_SHIFT,
1227 (CONFIG_PAGING_LEVELS == 3
1228 ? 8
1229 : L3_PAGETABLE_ENTRIES),
1230 PGT_l2_page_table) )
1231 goto out;
1232 #endif
1233 if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn,
1234 L2_PAGETABLE_SHIFT - PAGE_SHIFT,
1235 L2_PAGETABLE_ENTRIES, PGT_l1_page_table) )
1236 goto out;
1238 p2m_entry = p2m_find_entry(table, &gfn_remainder, gfn,
1239 0, L1_PAGETABLE_ENTRIES);
1240 ASSERT(p2m_entry);
1241 if ( mfn_valid(mfn) )
1242 *p2m_entry = l1e_from_pfn(mfn_x(mfn), __PAGE_HYPERVISOR|_PAGE_USER);
1243 else
1244 *p2m_entry = l1e_empty();
1246 /* Track the highest gfn for which we have ever had a valid mapping */
1247 if ( mfn_valid(mfn) && (gfn > d->arch.max_mapped_pfn) )
1248 d->arch.max_mapped_pfn = gfn;
1250 /* The P2M can be shadowed: keep the shadows synced */
1251 if ( d->vcpu[0] != NULL )
1252 (void)sh_validate_guest_entry(d->vcpu[0], table_mfn,
1253 p2m_entry, sizeof(*p2m_entry));
1255 /* Success */
1256 rv = 1;
1258 out:
1259 sh_unmap_domain_page(table);
1260 return rv;
1261 }
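/* Worked arithmetic for the PAE restriction noted above: the translated
 * guest's p2m root gets 8 l3 slots, each covering 512 l2 entries x 512 l1
 * entries of 4k mappings, so the largest usable guest-physical space is
 * 8 * 512 * 512 = 2^21 frames = 8GBytes, i.e. the 33 bits of
 * pseudo-physical address the comment above describes. */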
1263 // Allocate a new p2m table for a domain.
1264 //
1265 // The structure of the p2m table is that of a pagetable for xen (i.e. it is
1266 // controlled by CONFIG_PAGING_LEVELS).
1267 //
1268 // Returns 0 if p2m table could not be initialized
1269 //
1270 static int
1271 shadow_alloc_p2m_table(struct domain *d)
1272 {
1273 mfn_t p2m_top, mfn;
1274 struct list_head *entry;
1275 struct page_info *page;
1276 unsigned int page_count = 0;
1277 unsigned long gfn;
1279 SHADOW_PRINTK("allocating p2m table\n");
1280 ASSERT(pagetable_get_pfn(d->arch.phys_table) == 0);
1282 p2m_top = shadow_alloc_p2m_page(d);
1283 mfn_to_page(p2m_top)->count_info = 1;
1284 mfn_to_page(p2m_top)->u.inuse.type_info =
1285 #if CONFIG_PAGING_LEVELS == 4
1286 PGT_l4_page_table
1287 #elif CONFIG_PAGING_LEVELS == 3
1288 PGT_l3_page_table
1289 #elif CONFIG_PAGING_LEVELS == 2
1290 PGT_l2_page_table
1291 #endif
1292 | 1 | PGT_validated;
1294 if ( mfn_x(p2m_top) == 0 )
1295 return 0;
1297 d->arch.phys_table = pagetable_from_mfn(p2m_top);
1299 SHADOW_PRINTK("populating p2m table\n");
1301 /* Initialise physmap tables for slot zero. Other code assumes this. */
1302 gfn = 0;
1303 mfn = _mfn(INVALID_MFN);
1304 if ( !shadow_set_p2m_entry(d, gfn, mfn) )
1305 goto error;
1307 for ( entry = d->page_list.next;
1308 entry != &d->page_list;
1309 entry = entry->next )
1310 {
1311 page = list_entry(entry, struct page_info, list);
1312 mfn = page_to_mfn(page);
1313 gfn = get_gpfn_from_mfn(mfn_x(mfn));
1314 page_count++;
1315 if (
1316 #ifdef __x86_64__
1317 (gfn != 0x5555555555555555L)
1318 #else
1319 (gfn != 0x55555555L)
1320 #endif
1321 && gfn != INVALID_M2P_ENTRY
1322 && !shadow_set_p2m_entry(d, gfn, mfn) )
1323 goto error;
1324 }
1326 SHADOW_PRINTK("p2m table initialised (%u pages)\n", page_count);
1327 return 1;
1329 error:
1330 SHADOW_PRINTK("failed to initialize p2m table, gfn=%05lx, mfn=%"
1331 SH_PRI_mfn "\n", gfn, mfn_x(mfn));
1332 return 0;
1333 }
1335 mfn_t
1336 sh_gfn_to_mfn_foreign(struct domain *d, unsigned long gpfn)
1337 /* Read another domain's p2m entries */
1338 {
1339 mfn_t mfn;
1340 paddr_t addr = ((paddr_t)gpfn) << PAGE_SHIFT;
1341 l2_pgentry_t *l2e;
1342 l1_pgentry_t *l1e;
1344 ASSERT(shadow_mode_translate(d));
1345 mfn = pagetable_get_mfn(d->arch.phys_table);
1348 if ( gpfn > d->arch.max_mapped_pfn )
1349 /* This pfn is higher than the highest the p2m map currently holds */
1350 return _mfn(INVALID_MFN);
1352 #if CONFIG_PAGING_LEVELS >= 4
1353 {
1354 l4_pgentry_t *l4e = sh_map_domain_page(mfn);
1355 l4e += l4_table_offset(addr);
1356 if ( (l4e_get_flags(*l4e) & _PAGE_PRESENT) == 0 )
1357 {
1358 sh_unmap_domain_page(l4e);
1359 return _mfn(INVALID_MFN);
1360 }
1361 mfn = _mfn(l4e_get_pfn(*l4e));
1362 sh_unmap_domain_page(l4e);
1363 }
1364 #endif
1365 #if CONFIG_PAGING_LEVELS >= 3
1366 {
1367 l3_pgentry_t *l3e = sh_map_domain_page(mfn);
1368 #if CONFIG_PAGING_LEVELS == 3
1369 /* On PAE hosts the p2m has eight l3 entries, not four (see
1370 * shadow_set_p2m_entry()) so we can't use l3_table_offset.
1371 * Instead, just count the number of l3es from zero. It's safe
1372 * to do this because we already checked that the gfn is within
1373 * the bounds of the p2m. */
1374 l3e += (addr >> L3_PAGETABLE_SHIFT);
1375 #else
1376 l3e += l3_table_offset(addr);
1377 #endif
1378 if ( (l3e_get_flags(*l3e) & _PAGE_PRESENT) == 0 )
1379 {
1380 sh_unmap_domain_page(l3e);
1381 return _mfn(INVALID_MFN);
1382 }
1383 mfn = _mfn(l3e_get_pfn(*l3e));
1384 sh_unmap_domain_page(l3e);
1385 }
1386 #endif
1388 l2e = sh_map_domain_page(mfn);
1389 l2e += l2_table_offset(addr);
1390 if ( (l2e_get_flags(*l2e) & _PAGE_PRESENT) == 0 )
1391 {
1392 sh_unmap_domain_page(l2e);
1393 return _mfn(INVALID_MFN);
1394 }
1395 mfn = _mfn(l2e_get_pfn(*l2e));
1396 sh_unmap_domain_page(l2e);
1398 l1e = sh_map_domain_page(mfn);
1399 l1e += l1_table_offset(addr);
1400 if ( (l1e_get_flags(*l1e) & _PAGE_PRESENT) == 0 )
1401 {
1402 sh_unmap_domain_page(l1e);
1403 return _mfn(INVALID_MFN);
1404 }
1405 mfn = _mfn(l1e_get_pfn(*l1e));
1406 sh_unmap_domain_page(l1e);
1408 return mfn;
1409 }
1411 unsigned long
1412 shadow_gfn_to_mfn_foreign(unsigned long gpfn)
1413 {
1414 return mfn_x(sh_gfn_to_mfn_foreign(current->domain, gpfn));
1415 }
1418 static void shadow_p2m_teardown(struct domain *d)
1419 /* Return all the p2m pages to Xen.
1420 * We know we don't have any extra mappings to these pages */
1422 struct list_head *entry, *n;
1423 struct page_info *pg;
1425 d->arch.phys_table = pagetable_null();
1427 list_for_each_safe(entry, n, &d->arch.shadow.p2m_inuse)
1429 pg = list_entry(entry, struct page_info, list);
1430 list_del(entry);
1431 /* Should have just the one ref we gave it in alloc_p2m_page() */
1432 if ( (pg->count_info & PGC_count_mask) != 1 )
1434 SHADOW_PRINTK("Odd p2m page count c=%#x t=%"PRtype_info"\n",
1435 pg->count_info, pg->u.inuse.type_info);
1437 ASSERT(page_get_owner(pg) == d);
1438 /* Free should not decrement domain's total allocation, since
1439 * these pages were allocated without an owner. */
1440 page_set_owner(pg, NULL);
1441 free_domheap_pages(pg, 0);
1442 d->arch.shadow.p2m_pages--;
1443 perfc_decr(shadow_alloc_count);
1445 list_for_each_safe(entry, n, &d->arch.shadow.p2m_freelist)
1447 list_del(entry);
1448 pg = list_entry(entry, struct page_info, list);
1449 ASSERT(page_get_owner(pg) == d);
1450 /* Free should not decrement domain's total allocation. */
1451 page_set_owner(pg, NULL);
1452 free_domheap_pages(pg, 0);
1453 d->arch.shadow.p2m_pages--;
1454 perfc_decr(shadow_alloc_count);
1456 ASSERT(d->arch.shadow.p2m_pages == 0);
1459 /* Set the pool of shadow pages to the required number of pages.
1460 * Input will be rounded up to at least shadow_min_acceptable_pages(),
1461 * plus space for the p2m table.
1462 * Returns 0 for success, non-zero for failure. */
1463 static unsigned int sh_set_allocation(struct domain *d,
1464 unsigned int pages,
1465 int *preempted)
1467 struct shadow_page_info *sp;
1468 unsigned int lower_bound;
1469 int j;
1471 ASSERT(shadow_locked_by_me(d));
1473 /* Don't allocate less than the minimum acceptable, plus one page per
1474 * megabyte of RAM (for the p2m table) */
1475 lower_bound = shadow_min_acceptable_pages(d) + (d->tot_pages / 256);
1476 if ( pages > 0 && pages < lower_bound )
1477 pages = lower_bound;
1478 /* Round up to largest block size */
1479 pages = (pages + ((1<<SHADOW_MAX_ORDER)-1)) & ~((1<<SHADOW_MAX_ORDER)-1);
1481 SHADOW_PRINTK("current %i target %i\n",
1482 d->arch.shadow.total_pages, pages);
1484 while ( d->arch.shadow.total_pages != pages )
1486 if ( d->arch.shadow.total_pages < pages )
1488 /* Need to allocate more memory from domheap */
1489 sp = (struct shadow_page_info *)
1490 alloc_domheap_pages(NULL, SHADOW_MAX_ORDER, 0);
1491 if ( sp == NULL )
1493 SHADOW_PRINTK("failed to allocate shadow pages.\n");
1494 return -ENOMEM;
1496 d->arch.shadow.free_pages += 1<<SHADOW_MAX_ORDER;
1497 d->arch.shadow.total_pages += 1<<SHADOW_MAX_ORDER;
1498 for ( j = 0; j < 1<<SHADOW_MAX_ORDER; j++ )
1500 sp[j].type = 0;
1501 sp[j].pinned = 0;
1502 sp[j].logdirty = 0;
1503 sp[j].count = 0;
1504 sp[j].mbz = 0;
1505 sp[j].tlbflush_timestamp = 0; /* Not in any TLB */
1507 sp->order = SHADOW_MAX_ORDER;
1508 list_add_tail(&sp->list,
1509 &d->arch.shadow.freelists[SHADOW_MAX_ORDER]);
1511 else if ( d->arch.shadow.total_pages > pages )
1513 /* Need to return memory to domheap */
1514 shadow_prealloc(d, SHADOW_MAX_ORDER);
1515 ASSERT(!list_empty(&d->arch.shadow.freelists[SHADOW_MAX_ORDER]));
1516 sp = list_entry(d->arch.shadow.freelists[SHADOW_MAX_ORDER].next,
1517 struct shadow_page_info, list);
1518 list_del(&sp->list);
1519 d->arch.shadow.free_pages -= 1<<SHADOW_MAX_ORDER;
1520 d->arch.shadow.total_pages -= 1<<SHADOW_MAX_ORDER;
1521 free_domheap_pages((struct page_info *)sp, SHADOW_MAX_ORDER);
1524 /* Check to see if we need to yield and try again */
1525 if ( preempted && hypercall_preempt_check() )
1527 *preempted = 1;
1528 return 0;
1532 return 0;
1535 /* Return the size of the shadow pool, rounded up to the nearest MB */
1536 static unsigned int shadow_get_allocation(struct domain *d)
1537 {
1538 unsigned int pg = d->arch.shadow.total_pages;
1539 return ((pg >> (20 - PAGE_SHIFT))
1540 + ((pg & ((1 << (20 - PAGE_SHIFT)) - 1)) ? 1 : 0));
1543 /**************************************************************************/
1544 /* Hash table for storing the guest->shadow mappings.
1545 * The table itself is an array of pointers to shadows; the shadows are then
1546 * threaded on a singly-linked list of shadows with the same hash value */
1548 #define SHADOW_HASH_BUCKETS 251
1549 /* Other possibly useful primes are 509, 1021, 2039, 4093, 8191, 16381 */
1551 /* Hash function that takes a gfn or mfn, plus another byte of type info */
1552 typedef u32 key_t;
1553 static inline key_t sh_hash(unsigned long n, unsigned int t)
1555 unsigned char *p = (unsigned char *)&n;
1556 key_t k = t;
1557 int i;
1558 for ( i = 0; i < sizeof(n) ; i++ ) k = (u32)p[i] + (k<<6) + (k<<16) - k;
1559 return k % SHADOW_HASH_BUCKETS;
1562 #if SHADOW_AUDIT & (SHADOW_AUDIT_HASH|SHADOW_AUDIT_HASH_FULL)
1564 /* Before we get to the mechanism, define a pair of audit functions
1565 * that sanity-check the contents of the hash table. */
1566 static void sh_hash_audit_bucket(struct domain *d, int bucket)
1567 /* Audit one bucket of the hash table */
1569 struct shadow_page_info *sp, *x;
1571 if ( !(SHADOW_AUDIT_ENABLE) )
1572 return;
1574 sp = d->arch.shadow.hash_table[bucket];
1575 while ( sp )
1577 /* Not a shadow? */
1578 BUG_ON( sp->mbz != 0 );
1579 /* Bogus type? */
1580 BUG_ON( sp->type == 0 );
1581 BUG_ON( sp->type > SH_type_max_shadow );
1582 /* Wrong bucket? */
1583 BUG_ON( sh_hash(sp->backpointer, sp->type) != bucket );
1584 /* Duplicate entry? */
1585 for ( x = sp->next_shadow; x; x = x->next_shadow )
1586 BUG_ON( x->backpointer == sp->backpointer && x->type == sp->type );
1587 /* Follow the backpointer to the guest pagetable */
1588 if ( sp->type != SH_type_fl1_32_shadow
1589 && sp->type != SH_type_fl1_pae_shadow
1590 && sp->type != SH_type_fl1_64_shadow )
1592 struct page_info *gpg = mfn_to_page(_mfn(sp->backpointer));
1593 /* Bad shadow flags on guest page? */
1594 BUG_ON( !(gpg->shadow_flags & (1<<sp->type)) );
1595 /* Bad type count on guest page? */
1596 if ( (gpg->u.inuse.type_info & PGT_type_mask) == PGT_writable_page
1597 && (gpg->u.inuse.type_info & PGT_count_mask) != 0 )
1599 SHADOW_ERROR("MFN %#lx shadowed (by %#"SH_PRI_mfn")"
1600 " but has typecount %#lx\n",
1601 sp->backpointer, mfn_x(shadow_page_to_mfn(sp)),
1602 gpg->u.inuse.type_info);
1603 BUG();
1606 /* That entry was OK; on we go */
1607 sp = sp->next_shadow;
1611 #else
1612 #define sh_hash_audit_bucket(_d, _b) do {} while(0)
1613 #endif /* Hashtable bucket audit */
1616 #if SHADOW_AUDIT & SHADOW_AUDIT_HASH_FULL
1618 static void sh_hash_audit(struct domain *d)
1619 /* Full audit: audit every bucket in the table */
1621 int i;
1623 if ( !(SHADOW_AUDIT_ENABLE) )
1624 return;
1626 for ( i = 0; i < SHADOW_HASH_BUCKETS; i++ )
1628 sh_hash_audit_bucket(d, i);
1632 #else
1633 #define sh_hash_audit(_d) do {} while(0)
1634 #endif /* Hashtable bucket audit */
1636 /* Allocate and initialise the table itself.
1637 * Returns 0 for success, 1 for error. */
1638 static int shadow_hash_alloc(struct domain *d)
1640 struct shadow_page_info **table;
1642 ASSERT(shadow_locked_by_me(d));
1643 ASSERT(!d->arch.shadow.hash_table);
1645 table = xmalloc_array(struct shadow_page_info *, SHADOW_HASH_BUCKETS);
1646 if ( !table ) return 1;
1647 memset(table, 0,
1648 SHADOW_HASH_BUCKETS * sizeof (struct shadow_page_info *));
1649 d->arch.shadow.hash_table = table;
1650 return 0;
1653 /* Tear down the hash table and return all memory to Xen.
1654 * This function does not care whether the table is populated. */
1655 static void shadow_hash_teardown(struct domain *d)
1657 ASSERT(shadow_locked_by_me(d));
1658 ASSERT(d->arch.shadow.hash_table);
1660 xfree(d->arch.shadow.hash_table);
1661 d->arch.shadow.hash_table = NULL;
1665 mfn_t shadow_hash_lookup(struct vcpu *v, unsigned long n, unsigned int t)
1666 /* Find an entry in the hash table. Returns the MFN of the shadow,
1667 * or INVALID_MFN if it doesn't exist */
1669 struct domain *d = v->domain;
1670 struct shadow_page_info *sp, *prev;
1671 key_t key;
1673 ASSERT(shadow_locked_by_me(d));
1674 ASSERT(d->arch.shadow.hash_table);
1675 ASSERT(t);
1677 sh_hash_audit(d);
1679 perfc_incrc(shadow_hash_lookups);
1680 key = sh_hash(n, t);
1681 sh_hash_audit_bucket(d, key);
1683 sp = d->arch.shadow.hash_table[key];
1684 prev = NULL;
1685 while(sp)
1687 if ( sp->backpointer == n && sp->type == t )
1689 /* Pull-to-front if 'sp' isn't already the head item */
1690 if ( unlikely(sp != d->arch.shadow.hash_table[key]) )
1692 if ( unlikely(d->arch.shadow.hash_walking != 0) )
1693 /* Can't reorder: someone is walking the hash chains */
1694 return shadow_page_to_mfn(sp);
1695 else
1697 ASSERT(prev);
1698 /* Delete sp from the list */
1699 prev->next_shadow = sp->next_shadow;
1700 /* Re-insert it at the head of the list */
1701 sp->next_shadow = d->arch.shadow.hash_table[key];
1702 d->arch.shadow.hash_table[key] = sp;
1705 else
1707 perfc_incrc(shadow_hash_lookup_head);
1709 return shadow_page_to_mfn(sp);
1711 prev = sp;
1712 sp = sp->next_shadow;
1715 perfc_incrc(shadow_hash_lookup_miss);
1716 return _mfn(INVALID_MFN);
1719 void shadow_hash_insert(struct vcpu *v, unsigned long n, unsigned int t,
1720 mfn_t smfn)
1721 /* Put a mapping (n,t)->smfn into the hash table */
1723 struct domain *d = v->domain;
1724 struct shadow_page_info *sp;
1725 key_t key;
1727 ASSERT(shadow_locked_by_me(d));
1728 ASSERT(d->arch.shadow.hash_table);
1729 ASSERT(t);
1731 sh_hash_audit(d);
1733 perfc_incrc(shadow_hash_inserts);
1734 key = sh_hash(n, t);
1735 sh_hash_audit_bucket(d, key);
1737 /* Insert this shadow at the top of the bucket */
1738 sp = mfn_to_shadow_page(smfn);
1739 sp->next_shadow = d->arch.shadow.hash_table[key];
1740 d->arch.shadow.hash_table[key] = sp;
1742 sh_hash_audit_bucket(d, key);
1745 void shadow_hash_delete(struct vcpu *v, unsigned long n, unsigned int t,
1746 mfn_t smfn)
1747 /* Excise the mapping (n,t)->smfn from the hash table */
1749 struct domain *d = v->domain;
1750 struct shadow_page_info *sp, *x;
1751 key_t key;
1753 ASSERT(shadow_locked_by_me(d));
1754 ASSERT(d->arch.shadow.hash_table);
1755 ASSERT(t);
1757 sh_hash_audit(d);
1759 perfc_incrc(shadow_hash_deletes);
1760 key = sh_hash(n, t);
1761 sh_hash_audit_bucket(d, key);
1763 sp = mfn_to_shadow_page(smfn);
1764 if ( d->arch.shadow.hash_table[key] == sp )
1765 /* Easy case: we're deleting the head item. */
1766 d->arch.shadow.hash_table[key] = sp->next_shadow;
1767 else
1769 /* Need to search for the one we want */
1770 x = d->arch.shadow.hash_table[key];
1771 while ( 1 )
1773 ASSERT(x); /* We can't have hit the end, since our target is
1774 * still in the chain somewhere... */
1775 if ( x->next_shadow == sp )
1777 x->next_shadow = sp->next_shadow;
1778 break;
1780 x = x->next_shadow;
1783 sp->next_shadow = NULL;
1785 sh_hash_audit_bucket(d, key);
1788 typedef int (*hash_callback_t)(struct vcpu *v, mfn_t smfn, mfn_t other_mfn);
1790 static void hash_foreach(struct vcpu *v,
1791 unsigned int callback_mask,
1792 hash_callback_t callbacks[],
1793 mfn_t callback_mfn)
1794 /* Walk the hash table looking at the types of the entries and
1795 * calling the appropriate callback function for each entry.
1796 * The mask determines which shadow types we call back for, and the array
1797 * of callbacks tells us which function to call.
1798 * Any callback may return non-zero to let us skip the rest of the scan.
1800 * WARNING: Callbacks MUST NOT add or remove hash entries unless they
1801 * then return non-zero to terminate the scan. */
1803 int i, done = 0;
1804 struct domain *d = v->domain;
1805 struct shadow_page_info *x;
1807 /* Say we're here, to stop hash-lookups reordering the chains */
1808 ASSERT(shadow_locked_by_me(d));
1809 ASSERT(d->arch.shadow.hash_walking == 0);
1810 d->arch.shadow.hash_walking = 1;
1812 for ( i = 0; i < SHADOW_HASH_BUCKETS; i++ )
1814 /* WARNING: This is not safe against changes to the hash table.
1815 * The callback *must* return non-zero if it has inserted or
1816 * deleted anything from the hash (lookups are OK, though). */
1817 for ( x = d->arch.shadow.hash_table[i]; x; x = x->next_shadow )
1819 if ( callback_mask & (1 << x->type) )
1821 ASSERT(x->type <= 15);
1822 ASSERT(callbacks[x->type] != NULL);
1823 done = callbacks[x->type](v, shadow_page_to_mfn(x),
1824 callback_mfn);
1825 if ( done ) break;
1828 if ( done ) break;
1830 d->arch.shadow.hash_walking = 0;
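/* The callers of hash_foreach() below all follow the same pattern: a
 * 16-entry dispatch table indexed by shadow type, plus a bitmask of the
 * types they care about, e.g.
 *   callback_mask = 1 << SH_type_l1_32_shadow | 1 << SH_type_fl1_32_shadow;
 * Only shadows whose type bit is set in the mask get their callback run. */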
1834 /**************************************************************************/
1835 /* Destroy a shadow page: simple dispatcher to call the per-type destructor
1836 * which will decrement refcounts appropriately and return memory to the
1837 * free pool. */
1839 void sh_destroy_shadow(struct vcpu *v, mfn_t smfn)
1841 struct shadow_page_info *sp = mfn_to_shadow_page(smfn);
1842 unsigned int t = sp->type;
1845 SHADOW_PRINTK("smfn=%#lx\n", mfn_x(smfn));
1847 /* Double-check, if we can, that the shadowed page belongs to this
1848 * domain (by following the back-pointer). */
1849 ASSERT(t == SH_type_fl1_32_shadow ||
1850 t == SH_type_fl1_pae_shadow ||
1851 t == SH_type_fl1_64_shadow ||
1852 t == SH_type_monitor_table ||
1853 (page_get_owner(mfn_to_page(_mfn(sp->backpointer)))
1854 == v->domain));
1856 /* The shadow-type values here are already nice small numbers, so the
1857 * switch statement is something the compiler will enjoy */
1858 switch ( t )
1860 #if CONFIG_PAGING_LEVELS == 2
1861 case SH_type_l1_32_shadow:
1862 case SH_type_fl1_32_shadow:
1863 SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 2, 2)(v, smfn);
1864 break;
1865 case SH_type_l2_32_shadow:
1866 SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 2, 2)(v, smfn);
1867 break;
1868 #else /* PAE or 64bit */
1869 case SH_type_l1_32_shadow:
1870 case SH_type_fl1_32_shadow:
1871 SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 3, 2)(v, smfn);
1872 break;
1873 case SH_type_l2_32_shadow:
1874 SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 3, 2)(v, smfn);
1875 break;
1876 #endif
1878 #if CONFIG_PAGING_LEVELS >= 3
1879 case SH_type_l1_pae_shadow:
1880 case SH_type_fl1_pae_shadow:
1881 SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 3, 3)(v, smfn);
1882 break;
1883 case SH_type_l2_pae_shadow:
1884 case SH_type_l2h_pae_shadow:
1885 SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 3, 3)(v, smfn);
1886 break;
1887 #endif
1889 #if CONFIG_PAGING_LEVELS >= 4
1890 case SH_type_l1_64_shadow:
1891 case SH_type_fl1_64_shadow:
1892 SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 4, 4)(v, smfn);
1893 break;
1894 case SH_type_l2_64_shadow:
1895 SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 4, 4)(v, smfn);
1896 break;
1897 case SH_type_l3_64_shadow:
1898 SHADOW_INTERNAL_NAME(sh_destroy_l3_shadow, 4, 4)(v, smfn);
1899 break;
1900 case SH_type_l4_64_shadow:
1901 SHADOW_INTERNAL_NAME(sh_destroy_l4_shadow, 4, 4)(v, smfn);
1902 break;
1903 #endif
1904 default:
1905 SHADOW_PRINTK("tried to destroy shadow of bad type %08lx\n",
1906 (unsigned long)t);
1907 BUG();
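/* In the SHADOW_INTERNAL_NAME(fn, s, g) dispatches above, the first numeric
 * argument is the number of shadow paging levels and the second the number
 * of guest paging levels: (3,2), for instance, is the PAE-format shadow
 * that a PAE or 64-bit hypervisor builds for a 2-level (non-PAE 32-bit)
 * guest. */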
1911 /**************************************************************************/
1912 /* Remove all writeable mappings of a guest frame from the shadow tables
1913 * Returns non-zero if we need to flush TLBs.
1914 * level and fault_addr describe how we found this to be a pagetable;
1915 * level==0 means we have some other reason for revoking write access.*/
1917 int sh_remove_write_access(struct vcpu *v, mfn_t gmfn,
1918 unsigned int level,
1919 unsigned long fault_addr)
1921 /* Dispatch table for getting per-type functions */
1922 static hash_callback_t callbacks[16] = {
1923 NULL, /* none */
1924 #if CONFIG_PAGING_LEVELS == 2
1925 SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1,2,2), /* l1_32 */
1926 SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1,2,2), /* fl1_32 */
1927 #else
1928 SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1,3,2), /* l1_32 */
1929 SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1,3,2), /* fl1_32 */
1930 #endif
1931 NULL, /* l2_32 */
1932 #if CONFIG_PAGING_LEVELS >= 3
1933 SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1,3,3), /* l1_pae */
1934 SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1,3,3), /* fl1_pae */
1935 #else
1936 NULL, /* l1_pae */
1937 NULL, /* fl1_pae */
1938 #endif
1939 NULL, /* l2_pae */
1940 NULL, /* l2h_pae */
1941 #if CONFIG_PAGING_LEVELS >= 4
1942 SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1,4,4), /* l1_64 */
1943 SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1,4,4), /* fl1_64 */
1944 #else
1945 NULL, /* l1_64 */
1946 NULL, /* fl1_64 */
1947 #endif
1948 NULL, /* l2_64 */
1949 NULL, /* l3_64 */
1950 NULL, /* l4_64 */
1951 NULL, /* p2m */
1952 NULL /* unused */
1953 };
1955 static unsigned int callback_mask =
1956 1 << SH_type_l1_32_shadow
1957 | 1 << SH_type_fl1_32_shadow
1958 | 1 << SH_type_l1_pae_shadow
1959 | 1 << SH_type_fl1_pae_shadow
1960 | 1 << SH_type_l1_64_shadow
1961 | 1 << SH_type_fl1_64_shadow
1963 struct page_info *pg = mfn_to_page(gmfn);
1965 ASSERT(shadow_locked_by_me(v->domain));
1967 /* Only remove writable mappings if we are doing shadow refcounts.
1968 * In guest refcounting, we trust Xen to already be restricting
1969 * all the writes to the guest page tables, so we do not need to
1970 * do more. */
1971 if ( !shadow_mode_refcounts(v->domain) )
1972 return 0;
1974 /* Early exit if it's already a pagetable, or otherwise not writeable */
1975 if ( sh_mfn_is_a_page_table(gmfn)
1976 || (pg->u.inuse.type_info & PGT_count_mask) == 0 )
1977 return 0;
1979 perfc_incrc(shadow_writeable);
1981 /* If this isn't a "normal" writeable page, the domain is trying to
1982 * put pagetables in special memory of some kind. We can't allow that. */
1983 if ( (pg->u.inuse.type_info & PGT_type_mask) != PGT_writable_page )
1985 SHADOW_ERROR("can't remove write access to mfn %lx, type_info is %"
1986 PRtype_info "\n",
1987 mfn_x(gmfn), mfn_to_page(gmfn)->u.inuse.type_info);
1988 domain_crash(v->domain);
1991 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
1992 if ( v == current && level != 0 )
1994 unsigned long gfn;
1995 /* Heuristic: there is likely to be only one writeable mapping,
1996 * and that mapping is likely to be in the current pagetable,
1997 * in the guest's linear map (on non-HIGHPTE linux and windows)*/
1999 #define GUESS(_a, _h) do { \
2000 if ( v->arch.shadow.mode->guess_wrmap(v, (_a), gmfn) ) \
2001 perfc_incrc(shadow_writeable_h_ ## _h); \
2002 if ( (pg->u.inuse.type_info & PGT_count_mask) == 0 ) \
2003 return 1; \
2004 } while (0)
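/* The shifts in the guesses below fall straight out of the linear-map
 * layout: the L1 entry covering virtual address va sits at
 *   linear_base + (va >> PAGE_SHIFT) * sizeof(entry)
 * i.e. linear_base + (va >> 10) for 4-byte non-PAE entries and
 * linear_base + (va >> 9) for 8-byte PAE/64-bit entries.  The L2 and L3
 * guesses use (va >> 18) and (va >> 27) by the same arithmetic. */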
2007 if ( v->arch.shadow.mode->guest_levels == 2 )
2009 if ( level == 1 )
2010 /* 32bit non-PAE w2k3: linear map at 0xC0000000 */
2011 GUESS(0xC0000000UL + (fault_addr >> 10), 1);
2013 /* Linux lowmem: first 896MB is mapped 1-to-1 above 0xC0000000 */
2014 if ((gfn = sh_mfn_to_gfn(v->domain, gmfn)) < 0x38000 )
2015 GUESS(0xC0000000UL + (gfn << PAGE_SHIFT), 4);
2018 #if CONFIG_PAGING_LEVELS >= 3
2019 else if ( v->arch.shadow.mode->guest_levels == 3 )
2021 /* 32bit PAE w2k3: linear map at 0xC0000000 */
2022 switch ( level )
2024 case 1: GUESS(0xC0000000UL + (fault_addr >> 9), 2); break;
2025 case 2: GUESS(0xC0600000UL + (fault_addr >> 18), 2); break;
2028 /* Linux lowmem: first 896MB is mapped 1-to-1 above 0xC0000000 */
2029 if ((gfn = sh_mfn_to_gfn(v->domain, gmfn)) < 0x38000 )
2030 GUESS(0xC0000000UL + (gfn << PAGE_SHIFT), 4);
2032 #if CONFIG_PAGING_LEVELS >= 4
2033 else if ( v->arch.shadow.mode->guest_levels == 4 )
2035 /* 64bit w2k3: linear map at 0x0000070000000000 */
2036 switch ( level )
2038 case 1: GUESS(0x70000000000UL + (fault_addr >> 9), 3); break;
2039 case 2: GUESS(0x70380000000UL + (fault_addr >> 18), 3); break;
2040 case 3: GUESS(0x70381C00000UL + (fault_addr >> 27), 3); break;
2043 /* 64bit Linux direct map at 0xffff810000000000; older kernels
2044 * had it at 0x0000010000000000UL */
2045 gfn = sh_mfn_to_gfn(v->domain, gmfn);
2046 GUESS(0xffff810000000000UL + (gfn << PAGE_SHIFT), 4);
2047 GUESS(0x0000010000000000UL + (gfn << PAGE_SHIFT), 4);
2049 #endif /* CONFIG_PAGING_LEVELS >= 4 */
2050 #endif /* CONFIG_PAGING_LEVELS >= 3 */
2052 #undef GUESS
2055 if ( (pg->u.inuse.type_info & PGT_count_mask) == 0 )
2056 return 1;
2058 /* Second heuristic: on HIGHPTE linux, there are two particular PTEs
2059 * (entries in the fixmap) where linux maps its pagetables. Since
2060 * we expect to hit them most of the time, we start the search for
2061 * the writeable mapping by looking at the same MFN where the last
2062 * brute-force search succeeded. */
2064 if ( v->arch.shadow.last_writeable_pte_smfn != 0 )
2066 unsigned long old_count = (pg->u.inuse.type_info & PGT_count_mask);
2067 mfn_t last_smfn = _mfn(v->arch.shadow.last_writeable_pte_smfn);
2068 int shtype = mfn_to_shadow_page(last_smfn)->type;
2070 if ( callbacks[shtype] )
2071 callbacks[shtype](v, last_smfn, gmfn);
2073 if ( (pg->u.inuse.type_info & PGT_count_mask) != old_count )
2074 perfc_incrc(shadow_writeable_h_5);
2077 if ( (pg->u.inuse.type_info & PGT_count_mask) == 0 )
2078 return 1;
2080 #endif /* SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC */
2082 /* Brute-force search of all the shadows, by walking the hash */
2083 perfc_incrc(shadow_writeable_bf);
2084 hash_foreach(v, callback_mask, callbacks, gmfn);
2086 /* If that didn't catch the mapping, something is very wrong */
2087 if ( (mfn_to_page(gmfn)->u.inuse.type_info & PGT_count_mask) != 0 )
2089 SHADOW_ERROR("can't find all writeable mappings of mfn %lx: "
2090 "%lu left\n", mfn_x(gmfn),
2091 (mfn_to_page(gmfn)->u.inuse.type_info&PGT_count_mask));
2092 domain_crash(v->domain);
2095 /* We killed at least one writeable mapping, so must flush TLBs. */
2096 return 1;
2101 /**************************************************************************/
2102 /* Remove all mappings of a guest frame from the shadow tables.
2103 * Returns non-zero if we need to flush TLBs. */
2105 int sh_remove_all_mappings(struct vcpu *v, mfn_t gmfn)
2107 struct page_info *page = mfn_to_page(gmfn);
2108 int expected_count, do_locking;
2110 /* Dispatch table for getting per-type functions */
2111 static hash_callback_t callbacks[16] = {
2112 NULL, /* none */
2113 #if CONFIG_PAGING_LEVELS == 2
2114 SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1,2,2), /* l1_32 */
2115 SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1,2,2), /* fl1_32 */
2116 #else
2117 SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1,3,2), /* l1_32 */
2118 SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1,3,2), /* fl1_32 */
2119 #endif
2120 NULL, /* l2_32 */
2121 #if CONFIG_PAGING_LEVELS >= 3
2122 SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1,3,3), /* l1_pae */
2123 SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1,3,3), /* fl1_pae */
2124 #else
2125 NULL, /* l1_pae */
2126 NULL, /* fl1_pae */
2127 #endif
2128 NULL, /* l2_pae */
2129 NULL, /* l2h_pae */
2130 #if CONFIG_PAGING_LEVELS >= 4
2131 SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1,4,4), /* l1_64 */
2132 SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1,4,4), /* fl1_64 */
2133 #else
2134 NULL, /* l1_64 */
2135 NULL, /* fl1_64 */
2136 #endif
2137 NULL, /* l2_64 */
2138 NULL, /* l3_64 */
2139 NULL, /* l4_64 */
2140 NULL, /* p2m */
2141 NULL /* unused */
2142 };
2144 static unsigned int callback_mask =
2145 1 << SH_type_l1_32_shadow
2146 | 1 << SH_type_fl1_32_shadow
2147 | 1 << SH_type_l1_pae_shadow
2148 | 1 << SH_type_fl1_pae_shadow
2149 | 1 << SH_type_l1_64_shadow
2150 | 1 << SH_type_fl1_64_shadow
2153 perfc_incrc(shadow_mappings);
2154 if ( (page->count_info & PGC_count_mask) == 0 )
2155 return 0;
2157 /* Although this is an externally visible function, we do not know
2158 * whether the shadow lock will be held when it is called (since it
2159 * can be called via put_page_type when we clear a shadow l1e).
2160 * If the lock isn't held, take it for the duration of the call. */
2161 do_locking = !shadow_locked_by_me(v->domain);
2162 if ( do_locking ) shadow_lock(v->domain);
2164 /* XXX TODO:
2165 * Heuristics for finding the (probably) single mapping of this gmfn */
2167 /* Brute-force search of all the shadows, by walking the hash */
2168 perfc_incrc(shadow_mappings_bf);
2169 hash_foreach(v, callback_mask, callbacks, gmfn);
2171 /* If that didn't catch the mapping, something is very wrong */
2172 expected_count = (page->count_info & PGC_allocated) ? 1 : 0;
2173 if ( (page->count_info & PGC_count_mask) != expected_count )
2175 /* Don't complain if we're in HVM and there's one extra mapping:
2176 * The qemu helper process has an untyped mapping of this dom's RAM */
2177 if ( !(shadow_mode_external(v->domain)
2178 && (page->count_info & PGC_count_mask) <= 2
2179 && (page->u.inuse.type_info & PGT_count_mask) == 0) )
2181 SHADOW_ERROR("can't find all mappings of mfn %lx: "
2182 "c=%08x t=%08lx\n", mfn_x(gmfn),
2183 page->count_info, page->u.inuse.type_info);
2187 if ( do_locking ) shadow_unlock(v->domain);
2189 /* We killed at least one mapping, so must flush TLBs. */
2190 return 1;
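/* Note on expected_count above: a page that is still PGC_allocated keeps
 * one general reference from its allocation, so that is all we expect to
 * find once every shadow mapping is gone; external (HVM) domains are
 * allowed one further untyped reference for the device model's mapping of
 * guest RAM. */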
2194 /**************************************************************************/
2195 /* Remove all shadows of a guest frame from the shadow tables */
2197 static int sh_remove_shadow_via_pointer(struct vcpu *v, mfn_t smfn)
2198 /* Follow this shadow's up-pointer, if it has one, and remove the reference
2199 * found there. Returns 1 if that was the only reference to this shadow */
2201 struct shadow_page_info *sp = mfn_to_shadow_page(smfn);
2202 mfn_t pmfn;
2203 void *vaddr;
2204 int rc;
2206 ASSERT(sp->type > 0);
2207 ASSERT(sp->type < SH_type_max_shadow);
2208 ASSERT(sp->type != SH_type_l2_32_shadow);
2209 ASSERT(sp->type != SH_type_l2_pae_shadow);
2210 ASSERT(sp->type != SH_type_l2h_pae_shadow);
2211 ASSERT(sp->type != SH_type_l4_64_shadow);
2213 if (sp->up == 0) return 0;
2214 pmfn = _mfn(sp->up >> PAGE_SHIFT);
2215 ASSERT(mfn_valid(pmfn));
2216 vaddr = sh_map_domain_page(pmfn);
2217 ASSERT(vaddr);
2218 vaddr += sp->up & (PAGE_SIZE-1);
2219 ASSERT(l1e_get_pfn(*(l1_pgentry_t *)vaddr) == mfn_x(smfn));
2221 /* Is this the only reference to this shadow? */
2222 rc = (sp->count == 1) ? 1 : 0;
2224 /* Blank the offending entry */
2225 switch (sp->type)
2227 case SH_type_l1_32_shadow:
2228 case SH_type_l2_32_shadow:
2229 #if CONFIG_PAGING_LEVELS == 2
2230 SHADOW_INTERNAL_NAME(sh_clear_shadow_entry,2,2)(v, vaddr, pmfn);
2231 #else
2232 SHADOW_INTERNAL_NAME(sh_clear_shadow_entry,3,2)(v, vaddr, pmfn);
2233 #endif
2234 break;
2235 #if CONFIG_PAGING_LEVELS >=3
2236 case SH_type_l1_pae_shadow:
2237 case SH_type_l2_pae_shadow:
2238 case SH_type_l2h_pae_shadow:
2239 SHADOW_INTERNAL_NAME(sh_clear_shadow_entry,3,3)(v, vaddr, pmfn);
2240 break;
2241 #if CONFIG_PAGING_LEVELS >= 4
2242 case SH_type_l1_64_shadow:
2243 case SH_type_l2_64_shadow:
2244 case SH_type_l3_64_shadow:
2245 case SH_type_l4_64_shadow:
2246 SHADOW_INTERNAL_NAME(sh_clear_shadow_entry,4,4)(v, vaddr, pmfn);
2247 break;
2248 #endif
2249 #endif
2250 default: BUG(); /* Some weird unknown shadow type */
2253 sh_unmap_domain_page(vaddr);
2254 if ( rc )
2255 perfc_incrc(shadow_up_pointer);
2256 else
2257 perfc_incrc(shadow_unshadow_bf);
2259 return rc;
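/* sp->up, used above, packs the machine address of the shadow entry that
 * references this shadow: the containing shadow page's mfn in the upper
 * bits and the byte offset of the entry in the low PAGE_SHIFT bits.
 * Blanking that one entry suffices whenever sp->count shows it was the
 * only reference. */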
2262 void sh_remove_shadows(struct vcpu *v, mfn_t gmfn, int fast, int all)
2263 /* Remove the shadows of this guest page.
2264 * If fast != 0, just try the quick heuristic, which will remove
2265 * at most one reference to each shadow of the page. Otherwise, walk
2266 * all the shadow tables looking for refs to shadows of this gmfn.
2267 * If all != 0, kill the domain if we can't find all the shadows.
2268 * (all != 0 implies fast == 0)
2269 */
2271 struct page_info *pg = mfn_to_page(gmfn);
2272 mfn_t smfn;
2273 u32 sh_flags;
2274 int do_locking;
2275 unsigned char t;
2277 /* Dispatch table for getting per-type functions: each level must
2278 * be called with the function to remove a lower-level shadow. */
2279 static hash_callback_t callbacks[16] = {
2280 NULL, /* none */
2281 NULL, /* l1_32 */
2282 NULL, /* fl1_32 */
2283 #if CONFIG_PAGING_LEVELS == 2
2284 SHADOW_INTERNAL_NAME(sh_remove_l1_shadow,2,2), /* l2_32 */
2285 #else
2286 SHADOW_INTERNAL_NAME(sh_remove_l1_shadow,3,2), /* l2_32 */
2287 #endif
2288 NULL, /* l1_pae */
2289 NULL, /* fl1_pae */
2290 #if CONFIG_PAGING_LEVELS >= 3
2291 SHADOW_INTERNAL_NAME(sh_remove_l1_shadow,3,3), /* l2_pae */
2292 SHADOW_INTERNAL_NAME(sh_remove_l1_shadow,3,3), /* l2h_pae */
2293 #else
2294 NULL, /* l2_pae */
2295 NULL, /* l2h_pae */
2296 #endif
2297 NULL, /* l1_64 */
2298 NULL, /* fl1_64 */
2299 #if CONFIG_PAGING_LEVELS >= 4
2300 SHADOW_INTERNAL_NAME(sh_remove_l1_shadow,4,4), /* l2_64 */
2301 SHADOW_INTERNAL_NAME(sh_remove_l2_shadow,4,4), /* l3_64 */
2302 SHADOW_INTERNAL_NAME(sh_remove_l3_shadow,4,4), /* l4_64 */
2303 #else
2304 NULL, /* l2_64 */
2305 NULL, /* l3_64 */
2306 NULL, /* l4_64 */
2307 #endif
2308 NULL, /* p2m */
2309 NULL /* unused */
2310 };
2312 /* Another lookup table, for choosing which mask to use */
2313 static unsigned int masks[16] = {
2314 0, /* none */
2315 1 << SH_type_l2_32_shadow, /* l1_32 */
2316 0, /* fl1_32 */
2317 0, /* l2_32 */
2318 ((1 << SH_type_l2h_pae_shadow)
2319 | (1 << SH_type_l2_pae_shadow)), /* l1_pae */
2320 0, /* fl1_pae */
2321 0, /* l2_pae */
2322 0, /* l2h_pae */
2323 1 << SH_type_l2_64_shadow, /* l1_64 */
2324 0, /* fl1_64 */
2325 1 << SH_type_l3_64_shadow, /* l2_64 */
2326 1 << SH_type_l4_64_shadow, /* l3_64 */
2327 0, /* l4_64 */
2328 0, /* p2m */
2329 0 /* unused */
2330 };
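/* masks[t] lists the shadow types that may hold a reference to a type-t
 * shadow, i.e. its possible parents: an l1_pae shadow, for example, can be
 * referenced from either an l2_pae or an l2h_pae shadow, so both bits are
 * set in its entry. */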
2332 ASSERT(!(all && fast));
2334 /* Bail out now if the page is not shadowed */
2335 if ( (pg->count_info & PGC_page_table) == 0 )
2336 return;
2338 /* Although this is an externally visible function, we do not know
2339 * whether the shadow lock will be held when it is called (since it
2340 * can be called via put_page_type when we clear a shadow l1e).
2341 * If the lock isn't held, take it for the duration of the call. */
2342 do_locking = !shadow_locked_by_me(v->domain);
2343 if ( do_locking ) shadow_lock(v->domain);
2345 SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx\n",
2346 v->domain->domain_id, v->vcpu_id, mfn_x(gmfn));
2348 /* Search for this shadow in all appropriate shadows */
2349 perfc_incrc(shadow_unshadow);
2350 sh_flags = pg->shadow_flags;
2352 /* Lower-level shadows need to be excised from upper-level shadows.
2353 * This call to hash_foreach() looks dangerous but is in fact OK: each
2354 * call will remove at most one shadow, and terminate immediately when
2355 * it does remove it, so we never walk the hash after doing a deletion. */
2356 #define DO_UNSHADOW(_type) do { \
2357 t = (_type); \
2358 smfn = shadow_hash_lookup(v, mfn_x(gmfn), t); \
2359 if ( sh_type_is_pinnable(v, t) ) \
2360 sh_unpin(v, smfn); \
2361 else \
2362 sh_remove_shadow_via_pointer(v, smfn); \
2363 if ( (pg->count_info & PGC_page_table) && !fast ) \
2364 hash_foreach(v, masks[t], callbacks, smfn); \
2365 } while (0)
2367 if ( sh_flags & SHF_L1_32 ) DO_UNSHADOW(SH_type_l1_32_shadow);
2368 if ( sh_flags & SHF_L2_32 ) DO_UNSHADOW(SH_type_l2_32_shadow);
2369 #if CONFIG_PAGING_LEVELS >= 3
2370 if ( sh_flags & SHF_L1_PAE ) DO_UNSHADOW(SH_type_l1_pae_shadow);
2371 if ( sh_flags & SHF_L2_PAE ) DO_UNSHADOW(SH_type_l2_pae_shadow);
2372 if ( sh_flags & SHF_L2H_PAE ) DO_UNSHADOW(SH_type_l2h_pae_shadow);
2373 #if CONFIG_PAGING_LEVELS >= 4
2374 if ( sh_flags & SHF_L1_64 ) DO_UNSHADOW(SH_type_l1_64_shadow);
2375 if ( sh_flags & SHF_L2_64 ) DO_UNSHADOW(SH_type_l2_64_shadow);
2376 if ( sh_flags & SHF_L3_64 ) DO_UNSHADOW(SH_type_l3_64_shadow);
2377 if ( sh_flags & SHF_L4_64 ) DO_UNSHADOW(SH_type_l4_64_shadow);
2378 #endif
2379 #endif
2381 #undef DO_UNSHADOW
2383 /* If that didn't catch the shadows, something is wrong */
2384 if ( !fast && (pg->count_info & PGC_page_table) )
2386 SHADOW_ERROR("can't find all shadows of mfn %05lx "
2387 "(shadow_flags=%08lx)\n",
2388 mfn_x(gmfn), pg->shadow_flags);
2389 if ( all )
2390 domain_crash(v->domain);
2393 /* Need to flush TLBs now, so that linear maps are safe next time we
2394 * take a fault. */
2395 flush_tlb_mask(v->domain->domain_dirty_cpumask);
2397 if ( do_locking ) shadow_unlock(v->domain);
2400 static void
2401 sh_remove_all_shadows_and_parents(struct vcpu *v, mfn_t gmfn)
2402 /* Even harsher: this is an HVM page that we think is no longer a pagetable.
2403 * Unshadow it, and recursively unshadow pages that reference it. */
2405 sh_remove_shadows(v, gmfn, 0, 1);
2406 /* XXX TODO:
2407 * Rework this hashtable walker to return a linked-list of all
2408 * the shadows it modified, then do breadth-first recursion
2409 * to find the way up to higher-level tables and unshadow them too.
2411 * The current code (just tearing down each page's shadows as we
2412 * detect that it is not a pagetable) is correct, but very slow.
2413 * It means extra emulated writes and slows down removal of mappings. */
2416 /**************************************************************************/
2418 static void sh_update_paging_modes(struct vcpu *v)
2420 struct domain *d = v->domain;
2421 struct shadow_paging_mode *old_mode = v->arch.shadow.mode;
2422 mfn_t old_guest_table;
2424 ASSERT(shadow_locked_by_me(d));
2426 // Valid transitions handled by this function:
2427 // - For PV guests:
2428 // - after a shadow mode has been changed
2429 // - For HVM guests:
2430 // - after a shadow mode has been changed
2431 // - changes in CR0.PG, CR4.PAE, CR4.PSE, or CR4.PGE
2432 //
2434 // First, tear down any old shadow tables held by this vcpu.
2435 //
2436 if ( v->arch.shadow.mode )
2437 v->arch.shadow.mode->detach_old_tables(v);
2439 if ( !is_hvm_domain(d) )
2441 ///
2442 /// PV guest
2443 ///
2444 #if CONFIG_PAGING_LEVELS == 4
2445 /* When 32-on-64 PV guests are supported, they must choose
2446 * a different mode here */
2447 v->arch.shadow.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,4,4);
2448 #elif CONFIG_PAGING_LEVELS == 3
2449 v->arch.shadow.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,3,3);
2450 #elif CONFIG_PAGING_LEVELS == 2
2451 v->arch.shadow.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,2,2);
2452 #else
2453 #error unexpected paging mode
2454 #endif
2455 v->arch.shadow.translate_enabled = !!shadow_mode_translate(d);
2457 else
2459 ///
2460 /// HVM guest
2461 ///
2462 ASSERT(shadow_mode_translate(d));
2463 ASSERT(shadow_mode_external(d));
2465 v->arch.shadow.translate_enabled = !!hvm_paging_enabled(v);
2466 if ( !v->arch.shadow.translate_enabled )
2468 /* Set v->arch.guest_table to use the p2m map, and choose
2469 * the appropriate shadow mode */
2470 old_guest_table = pagetable_get_mfn(v->arch.guest_table);
2471 #if CONFIG_PAGING_LEVELS == 2
2472 v->arch.guest_table =
2473 pagetable_from_pfn(pagetable_get_pfn(d->arch.phys_table));
2474 v->arch.shadow.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,2,2);
2475 #elif CONFIG_PAGING_LEVELS == 3
2476 v->arch.guest_table =
2477 pagetable_from_pfn(pagetable_get_pfn(d->arch.phys_table));
2478 v->arch.shadow.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,3,3);
2479 #else /* CONFIG_PAGING_LEVELS == 4 */
2481 l4_pgentry_t *l4e;
2482 /* Use the start of the first l3 table as a PAE l3 */
2483 ASSERT(pagetable_get_pfn(d->arch.phys_table) != 0);
2484 l4e = sh_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
2485 ASSERT(l4e_get_flags(l4e[0]) & _PAGE_PRESENT);
2486 v->arch.guest_table =
2487 pagetable_from_pfn(l4e_get_pfn(l4e[0]));
2488 sh_unmap_domain_page(l4e);
2490 v->arch.shadow.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,3,3);
2491 #endif
2492 /* Fix up refcounts on guest_table */
2493 get_page(mfn_to_page(pagetable_get_mfn(v->arch.guest_table)), d);
2494 if ( mfn_x(old_guest_table) != 0 )
2495 put_page(mfn_to_page(old_guest_table));
2497 else
2499 #ifdef __x86_64__
2500 if ( hvm_long_mode_enabled(v) )
2502 // long mode guest...
2503 v->arch.shadow.mode =
2504 &SHADOW_INTERNAL_NAME(sh_paging_mode, 4, 4);
2506 else
2507 #endif
2508 if ( hvm_pae_enabled(v) )
2510 #if CONFIG_PAGING_LEVELS >= 3
2511 // 32-bit PAE mode guest...
2512 v->arch.shadow.mode =
2513 &SHADOW_INTERNAL_NAME(sh_paging_mode, 3, 3);
2514 #else
2515 SHADOW_ERROR("PAE not supported in 32-bit Xen\n");
2516 domain_crash(d);
2517 return;
2518 #endif
2520 else
2522 // 32-bit 2 level guest...
2523 #if CONFIG_PAGING_LEVELS >= 3
2524 v->arch.shadow.mode =
2525 &SHADOW_INTERNAL_NAME(sh_paging_mode, 3, 2);
2526 #else
2527 v->arch.shadow.mode =
2528 &SHADOW_INTERNAL_NAME(sh_paging_mode, 2, 2);
2529 #endif
2533 if ( pagetable_is_null(v->arch.monitor_table) )
2535 mfn_t mmfn = v->arch.shadow.mode->make_monitor_table(v);
2536 v->arch.monitor_table = pagetable_from_mfn(mmfn);
2537 make_cr3(v, mfn_x(mmfn));
2538 hvm_update_host_cr3(v);
2541 if ( v->arch.shadow.mode != old_mode )
2543 SHADOW_PRINTK("new paging mode: d=%u v=%u pe=%d g=%u s=%u "
2544 "(was g=%u s=%u)\n",
2545 d->domain_id, v->vcpu_id,
2546 is_hvm_domain(d) ? !!hvm_paging_enabled(v) : 1,
2547 v->arch.shadow.mode->guest_levels,
2548 v->arch.shadow.mode->shadow_levels,
2549 old_mode ? old_mode->guest_levels : 0,
2550 old_mode ? old_mode->shadow_levels : 0);
2551 if ( old_mode &&
2552 (v->arch.shadow.mode->shadow_levels !=
2553 old_mode->shadow_levels) )
2555 /* Need to make a new monitor table for the new mode */
2556 mfn_t new_mfn, old_mfn;
2558 if ( v != current )
2560 SHADOW_ERROR("Some third party (d=%u v=%u) is changing "
2561 "this HVM vcpu's (d=%u v=%u) paging mode!\n",
2562 current->domain->domain_id, current->vcpu_id,
2563 v->domain->domain_id, v->vcpu_id);
2564 domain_crash(v->domain);
2565 return;
2568 old_mfn = pagetable_get_mfn(v->arch.monitor_table);
2569 v->arch.monitor_table = pagetable_null();
2570 new_mfn = v->arch.shadow.mode->make_monitor_table(v);
2571 v->arch.monitor_table = pagetable_from_mfn(new_mfn);
2572 SHADOW_PRINTK("new monitor table %"SH_PRI_mfn "\n",
2573 mfn_x(new_mfn));
2575 /* Don't be running on the old monitor table when we
2576 * pull it down! Switch CR3, and warn the HVM code that
2577 * its host cr3 has changed. */
2578 make_cr3(v, mfn_x(new_mfn));
2579 write_ptbase(v);
2580 hvm_update_host_cr3(v);
2581 old_mode->destroy_monitor_table(v, old_mfn);
2585 // XXX -- Need to deal with changes in CR4.PSE and CR4.PGE.
2586 // These are HARD: think about the case where two CPUs have
2587 // different values for CR4.PSE and CR4.PGE at the same time.
2588 // This *does* happen, at least for CR4.PGE...
2591 v->arch.shadow.mode->update_cr3(v, 0);
2594 void shadow_update_paging_modes(struct vcpu *v)
2596 shadow_lock(v->domain);
2597 sh_update_paging_modes(v);
2598 shadow_unlock(v->domain);
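/* This is the usual pairing in this file: the shadow_*() entry point takes
 * the shadow lock and the sh_*() worker assumes it is already held
 * (compare shadow_mark_dirty()/sh_mark_dirty() further down). */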
2601 /**************************************************************************/
2602 /* Turning on and off shadow features */
2604 static void sh_new_mode(struct domain *d, u32 new_mode)
2605 /* Inform all the vcpus that the shadow mode has been changed */
2607 struct vcpu *v;
2609 ASSERT(shadow_locked_by_me(d));
2610 ASSERT(d != current->domain);
2611 d->arch.shadow.mode = new_mode;
2612 if ( new_mode & SHM2_translate )
2613 shadow_audit_p2m(d);
2614 for_each_vcpu(d, v)
2615 sh_update_paging_modes(v);
2618 int shadow_enable(struct domain *d, u32 mode)
2619 /* Turn on "permanent" shadow features: external, translate, refcount.
2620 * Can only be called once on a domain, and these features cannot be
2621 * disabled.
2622 * Returns 0 for success, -errno for failure. */
2624 unsigned int old_pages;
2625 int rv = 0;
2627 mode |= SHM2_enable;
2629 domain_pause(d);
2630 shadow_lock(d);
2632 /* Sanity check the arguments */
2633 if ( (d == current->domain) ||
2634 shadow_mode_enabled(d) ||
2635 ((mode & SHM2_translate) && !(mode & SHM2_refcounts)) ||
2636 ((mode & SHM2_external) && !(mode & SHM2_translate)) )
2638 rv = -EINVAL;
2639 goto out;
2642 // XXX -- eventually would like to require that all memory be allocated
2643 // *after* shadow_enabled() is called... So here, we would test to make
2644 // sure that d->page_list is empty.
2645 #if 0
2646 spin_lock(&d->page_alloc_lock);
2647 if ( !list_empty(&d->page_list) )
2649 spin_unlock(&d->page_alloc_lock);
2650 rv = -EINVAL;
2651 goto out;
2653 spin_unlock(&d->page_alloc_lock);
2654 #endif
2656 /* Init the shadow memory allocation if the user hasn't done so */
2657 old_pages = d->arch.shadow.total_pages;
2658 if ( old_pages == 0 )
2659 if ( sh_set_allocation(d, 256, NULL) != 0 ) /* Use at least 1MB */
2661 sh_set_allocation(d, 0, NULL);
2662 rv = -ENOMEM;
2663 goto out;
2666 /* Init the hash table */
2667 if ( shadow_hash_alloc(d) != 0 )
2669 sh_set_allocation(d, old_pages, NULL);
2670 rv = -ENOMEM;
2671 goto out;
2674 /* Init the P2M table */
2675 if ( mode & SHM2_translate )
2676 if ( !shadow_alloc_p2m_table(d) )
2678 shadow_hash_teardown(d);
2679 sh_set_allocation(d, old_pages, NULL);
2680 shadow_p2m_teardown(d);
2681 rv = -ENOMEM;
2682 goto out;
2685 #if (SHADOW_OPTIMIZATIONS & SHOPT_LINUX_L3_TOPLEVEL)
2686 /* We assume we're dealing with an older 64bit linux guest until we
2687 * see the guest use more than one l4 per vcpu. */
2688 d->arch.shadow.opt_flags = SHOPT_LINUX_L3_TOPLEVEL;
2689 #endif
2691 /* Update the bits */
2692 sh_new_mode(d, mode);
2693 shadow_audit_p2m(d);
2694 out:
2695 shadow_unlock(d);
2696 domain_unpause(d);
2697 return rv;
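/* For reference, the sanity checks above encode the dependencies between
 * the mode bits: SHM2_translate requires SHM2_refcounts, and SHM2_external
 * requires SHM2_translate.  An HVM domain, which must run external and
 * translated (see the ASSERTs in sh_update_paging_modes()), therefore ends
 * up with refcounts, translate, external and enable all set. */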
2700 void shadow_teardown(struct domain *d)
2701 /* Destroy the shadow pagetables of this domain and free its shadow memory.
2702 * Should only be called for dying domains. */
2704 struct vcpu *v;
2705 mfn_t mfn;
2707 ASSERT(test_bit(_DOMF_dying, &d->domain_flags));
2708 ASSERT(d != current->domain);
2710 if ( !shadow_locked_by_me(d) )
2711 shadow_lock(d); /* Keep various asserts happy */
2713 if ( shadow_mode_enabled(d) )
2715 /* Release the shadow and monitor tables held by each vcpu */
2716 for_each_vcpu(d, v)
2718 if ( v->arch.shadow.mode )
2720 v->arch.shadow.mode->detach_old_tables(v);
2721 if ( shadow_mode_external(d) )
2723 mfn = pagetable_get_mfn(v->arch.monitor_table);
2724 if ( mfn_valid(mfn) && (mfn_x(mfn) != 0) )
2725 v->arch.shadow.mode->destroy_monitor_table(v, mfn);
2726 v->arch.monitor_table = pagetable_null();
2732 if ( d->arch.shadow.total_pages != 0 )
2734 SHADOW_PRINTK("teardown of domain %u starts."
2735 " Shadow pages total = %u, free = %u, p2m=%u\n",
2736 d->domain_id,
2737 d->arch.shadow.total_pages,
2738 d->arch.shadow.free_pages,
2739 d->arch.shadow.p2m_pages);
2740 /* Destroy all the shadows and release memory to domheap */
2741 sh_set_allocation(d, 0, NULL);
2742 /* Release the hash table back to xenheap */
2743 if (d->arch.shadow.hash_table)
2744 shadow_hash_teardown(d);
2745 /* Release the log-dirty bitmap of dirtied pages */
2746 sh_free_log_dirty_bitmap(d);
2747 /* Should not have any more memory held */
2748 SHADOW_PRINTK("teardown done."
2749 " Shadow pages total = %u, free = %u, p2m=%u\n",
2750 d->arch.shadow.total_pages,
2751 d->arch.shadow.free_pages,
2752 d->arch.shadow.p2m_pages);
2753 ASSERT(d->arch.shadow.total_pages == 0);
2756 /* We leave the "permanent" shadow modes enabled, but clear the
2757 * log-dirty mode bit. We don't want any more mark_dirty()
2758 * calls now that we've torn down the bitmap */
2759 d->arch.shadow.mode &= ~SHM2_log_dirty;
2761 shadow_unlock(d);
2764 void shadow_final_teardown(struct domain *d)
2765 /* Called by arch_domain_destroy(), when it's safe to pull down the p2m map. */
2768 SHADOW_PRINTK("dom %u final teardown starts."
2769 " Shadow pages total = %u, free = %u, p2m=%u\n",
2770 d->domain_id,
2771 d->arch.shadow.total_pages,
2772 d->arch.shadow.free_pages,
2773 d->arch.shadow.p2m_pages);
2775 /* Double-check that the domain didn't have any shadow memory.
2776 * It is possible for a domain that never got domain_kill()ed
2777 * to get here with its shadow allocation intact. */
2778 if ( d->arch.shadow.total_pages != 0 )
2779 shadow_teardown(d);
2781 /* It is now safe to pull down the p2m map. */
2782 if ( d->arch.shadow.p2m_pages != 0 )
2783 shadow_p2m_teardown(d);
2785 SHADOW_PRINTK("dom %u final teardown done."
2786 " Shadow pages total = %u, free = %u, p2m=%u\n",
2787 d->domain_id,
2788 d->arch.shadow.total_pages,
2789 d->arch.shadow.free_pages,
2790 d->arch.shadow.p2m_pages);
2793 static int shadow_one_bit_enable(struct domain *d, u32 mode)
2794 /* Turn on a single shadow mode feature */
2796 ASSERT(shadow_locked_by_me(d));
2798 /* Sanity check the call */
2799 if ( d == current->domain || (d->arch.shadow.mode & mode) )
2801 return -EINVAL;
2804 if ( d->arch.shadow.mode == 0 )
2806 /* Init the shadow memory allocation and the hash table */
2807 if ( sh_set_allocation(d, 1, NULL) != 0
2808 || shadow_hash_alloc(d) != 0 )
2810 sh_set_allocation(d, 0, NULL);
2811 return -ENOMEM;
2815 /* Update the bits */
2816 sh_new_mode(d, d->arch.shadow.mode | mode);
2818 return 0;
2821 static int shadow_one_bit_disable(struct domain *d, u32 mode)
2822 /* Turn off a single shadow mode feature */
2824 struct vcpu *v;
2825 ASSERT(shadow_locked_by_me(d));
2827 /* Sanity check the call */
2828 if ( d == current->domain || !(d->arch.shadow.mode & mode) )
2830 return -EINVAL;
2833 /* Update the bits */
2834 sh_new_mode(d, d->arch.shadow.mode & ~mode);
2835 if ( d->arch.shadow.mode == 0 )
2837 /* Get this domain off shadows */
2838 SHADOW_PRINTK("un-shadowing of domain %u starts."
2839 " Shadow pages total = %u, free = %u, p2m=%u\n",
2840 d->domain_id,
2841 d->arch.shadow.total_pages,
2842 d->arch.shadow.free_pages,
2843 d->arch.shadow.p2m_pages);
2844 for_each_vcpu(d, v)
2846 if ( v->arch.shadow.mode )
2847 v->arch.shadow.mode->detach_old_tables(v);
2848 #if CONFIG_PAGING_LEVELS == 4
2849 if ( !(v->arch.flags & TF_kernel_mode) )
2850 make_cr3(v, pagetable_get_pfn(v->arch.guest_table_user));
2851 else
2852 #endif
2853 make_cr3(v, pagetable_get_pfn(v->arch.guest_table));
2857 /* Pull down the memory allocation */
2858 if ( sh_set_allocation(d, 0, NULL) != 0 )
2860 // XXX - How can this occur?
2861 // Seems like a bug to return an error now that we've
2862 // disabled the relevant shadow mode.
2863 //
2864 return -ENOMEM;
2866 shadow_hash_teardown(d);
2867 SHADOW_PRINTK("un-shadowing of domain %u done."
2868 " Shadow pages total = %u, free = %u, p2m=%u\n",
2869 d->domain_id,
2870 d->arch.shadow.total_pages,
2871 d->arch.shadow.free_pages,
2872 d->arch.shadow.p2m_pages);
2875 return 0;
2878 /* Enable/disable ops for the "test" and "log-dirty" modes */
2879 static int shadow_test_enable(struct domain *d)
2881 int ret;
2883 domain_pause(d);
2884 shadow_lock(d);
2886 if ( shadow_mode_enabled(d) )
2888 SHADOW_ERROR("Don't support enabling test mode"
2889 " on already shadowed doms\n");
2890 ret = -EINVAL;
2891 goto out;
2894 ret = shadow_one_bit_enable(d, SHM2_enable);
2895 out:
2896 shadow_unlock(d);
2897 domain_unpause(d);
2899 return ret;
2902 static int shadow_test_disable(struct domain *d)
2904 int ret;
2906 domain_pause(d);
2907 shadow_lock(d);
2908 ret = shadow_one_bit_disable(d, SHM2_enable);
2909 shadow_unlock(d);
2910 domain_unpause(d);
2912 return ret;
2915 static int
2916 sh_alloc_log_dirty_bitmap(struct domain *d)
2918 ASSERT(d->arch.shadow.dirty_bitmap == NULL);
2919 d->arch.shadow.dirty_bitmap_size =
2920 (d->shared_info->arch.max_pfn + (BITS_PER_LONG - 1)) &
2921 ~(BITS_PER_LONG - 1);
2922 d->arch.shadow.dirty_bitmap =
2923 xmalloc_array(unsigned long,
2924 d->arch.shadow.dirty_bitmap_size / BITS_PER_LONG);
2925 if ( d->arch.shadow.dirty_bitmap == NULL )
2927 d->arch.shadow.dirty_bitmap_size = 0;
2928 return -ENOMEM;
2930 memset(d->arch.shadow.dirty_bitmap, 0, d->arch.shadow.dirty_bitmap_size/8);
2932 return 0;
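/* The bitmap holds one bit per guest pfn, rounded up to a whole number of
 * unsigned longs.  As a rough example, a guest with max_pfn = 0x40000
 * (1GB of 4kB pages) needs 0x40000 bits, i.e. 32kB of xenheap from
 * xmalloc_array(). */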
2935 static void
2936 sh_free_log_dirty_bitmap(struct domain *d)
2938 d->arch.shadow.dirty_bitmap_size = 0;
2939 if ( d->arch.shadow.dirty_bitmap )
2941 xfree(d->arch.shadow.dirty_bitmap);
2942 d->arch.shadow.dirty_bitmap = NULL;
2946 static int shadow_log_dirty_enable(struct domain *d)
2948 int ret;
2950 domain_pause(d);
2951 shadow_lock(d);
2953 if ( shadow_mode_log_dirty(d) )
2955 ret = -EINVAL;
2956 goto out;
2959 if ( shadow_mode_enabled(d) )
2961 SHADOW_ERROR("Don't (yet) support enabling log-dirty"
2962 " on already shadowed doms\n");
2963 ret = -EINVAL;
2964 goto out;
2967 ret = sh_alloc_log_dirty_bitmap(d);
2968 if ( ret != 0 )
2970 sh_free_log_dirty_bitmap(d);
2971 goto out;
2974 ret = shadow_one_bit_enable(d, SHM2_log_dirty);
2975 if ( ret != 0 )
2976 sh_free_log_dirty_bitmap(d);
2978 out:
2979 shadow_unlock(d);
2980 domain_unpause(d);
2981 return ret;
2984 static int shadow_log_dirty_disable(struct domain *d)
2986 int ret;
2988 domain_pause(d);
2989 shadow_lock(d);
2990 ret = shadow_one_bit_disable(d, SHM2_log_dirty);
2991 if ( !shadow_mode_log_dirty(d) )
2992 sh_free_log_dirty_bitmap(d);
2993 shadow_unlock(d);
2994 domain_unpause(d);
2996 return ret;
2999 /**************************************************************************/
3000 /* P2M map manipulations */
3002 static void
3003 sh_p2m_remove_page(struct domain *d, unsigned long gfn, unsigned long mfn)
3005 struct vcpu *v;
3007 if ( !shadow_mode_translate(d) )
3008 return;
3010 v = current;
3011 if ( v->domain != d )
3012 v = d->vcpu[0];
3014 SHADOW_DEBUG(P2M, "removing gfn=%#lx mfn=%#lx\n", gfn, mfn);
3016 ASSERT(mfn_x(sh_gfn_to_mfn(d, gfn)) == mfn);
3017 //ASSERT(sh_mfn_to_gfn(d, mfn) == gfn);
3019 if ( v != NULL )
3021 sh_remove_all_shadows_and_parents(v, _mfn(mfn));
3022 if ( sh_remove_all_mappings(v, _mfn(mfn)) )
3023 flush_tlb_mask(d->domain_dirty_cpumask);
3026 shadow_set_p2m_entry(d, gfn, _mfn(INVALID_MFN));
3027 set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY);
3030 void
3031 shadow_guest_physmap_remove_page(struct domain *d, unsigned long gfn,
3032 unsigned long mfn)
3034 shadow_lock(d);
3035 shadow_audit_p2m(d);
3036 sh_p2m_remove_page(d, gfn, mfn);
3037 shadow_audit_p2m(d);
3038 shadow_unlock(d);
3041 void
3042 shadow_guest_physmap_add_page(struct domain *d, unsigned long gfn,
3043 unsigned long mfn)
3045 unsigned long ogfn;
3046 mfn_t omfn;
3048 if ( !shadow_mode_translate(d) )
3049 return;
3051 shadow_lock(d);
3052 shadow_audit_p2m(d);
3054 SHADOW_DEBUG(P2M, "adding gfn=%#lx mfn=%#lx\n", gfn, mfn);
3056 omfn = sh_gfn_to_mfn(d, gfn);
3057 if ( mfn_valid(omfn) )
3059 /* Get rid of the old mapping, especially any shadows */
3060 struct vcpu *v = current;
3061 if ( v->domain != d )
3062 v = d->vcpu[0];
3063 if ( v != NULL )
3065 sh_remove_all_shadows_and_parents(v, omfn);
3066 if ( sh_remove_all_mappings(v, omfn) )
3067 flush_tlb_mask(d->domain_dirty_cpumask);
3069 set_gpfn_from_mfn(mfn_x(omfn), INVALID_M2P_ENTRY);
3072 ogfn = sh_mfn_to_gfn(d, _mfn(mfn));
3073 if (
3074 #ifdef __x86_64__
3075 (ogfn != 0x5555555555555555L)
3076 #else
3077 (ogfn != 0x55555555L)
3078 #endif
3079 && (ogfn != INVALID_M2P_ENTRY)
3080 && (ogfn != gfn) )
3082 /* This machine frame is already mapped at another physical address */
3083 SHADOW_DEBUG(P2M, "aliased! mfn=%#lx, old gfn=%#lx, new gfn=%#lx\n",
3084 mfn, ogfn, gfn);
3085 if ( mfn_valid(omfn = sh_gfn_to_mfn(d, ogfn)) )
3087 SHADOW_DEBUG(P2M, "old gfn=%#lx -> mfn %#lx\n",
3088 ogfn , mfn_x(omfn));
3089 if ( mfn_x(omfn) == mfn )
3090 sh_p2m_remove_page(d, ogfn, mfn);
3094 shadow_set_p2m_entry(d, gfn, _mfn(mfn));
3095 set_gpfn_from_mfn(mfn, gfn);
3097 #if (SHADOW_OPTIMIZATIONS & SHOPT_FAST_FAULT_PATH)
3098 /* If we're doing FAST_FAULT_PATH, then shadow mode may have
3099 cached the fact that this is an mmio region in the shadow
3100 page tables. Blow the tables away to remove the cache.
3101 This is pretty heavy handed, but this is a rare operation
3102 (it might happen a dozen times during boot and then never
3103 again), so it doesn't matter too much. */
3104 shadow_blow_tables(d);
3105 #endif
3107 shadow_audit_p2m(d);
3108 shadow_unlock(d);
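/* The 0x55555555 / 0x5555555555555555 value tested above is, presumably,
 * the poison pattern the machine-to-phys table holds before a real gfn has
 * ever been entered for the mfn (the p2m audit below calls it the "debug
 * gfn"); it is treated like INVALID_M2P_ENTRY when checking for aliases. */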
3111 /**************************************************************************/
3112 /* Log-dirty mode support */
3114 /* Convert a shadow to log-dirty mode. */
3115 void shadow_convert_to_log_dirty(struct vcpu *v, mfn_t smfn)
3117 BUG();
3121 /* Read a domain's log-dirty bitmap and stats.
3122 * If the operation is a CLEAN, clear the bitmap and stats as well. */
3123 static int shadow_log_dirty_op(
3124 struct domain *d, struct xen_domctl_shadow_op *sc)
3126 int i, rv = 0, clean = 0, peek = 1;
3128 domain_pause(d);
3129 shadow_lock(d);
3131 clean = (sc->op == XEN_DOMCTL_SHADOW_OP_CLEAN);
3133 SHADOW_DEBUG(LOGDIRTY, "log-dirty %s: dom %u faults=%u dirty=%u\n",
3134 (clean) ? "clean" : "peek",
3135 d->domain_id,
3136 d->arch.shadow.fault_count,
3137 d->arch.shadow.dirty_count);
3139 sc->stats.fault_count = d->arch.shadow.fault_count;
3140 sc->stats.dirty_count = d->arch.shadow.dirty_count;
3142 if ( clean )
3144 /* Need to revoke write access to the domain's pages again.
3145 * In future, we'll have a less heavy-handed approach to this,
3146 * but for now, we just unshadow everything except Xen. */
3147 shadow_blow_tables(d);
3149 d->arch.shadow.fault_count = 0;
3150 d->arch.shadow.dirty_count = 0;
3153 if ( guest_handle_is_null(sc->dirty_bitmap) )
3154 /* caller may have wanted just to clean the state or access stats. */
3155 peek = 0;
3157 if ( (peek || clean) && (d->arch.shadow.dirty_bitmap == NULL) )
3159 rv = -EINVAL; /* perhaps should be ENOMEM? */
3160 goto out;
3163 if ( sc->pages > d->arch.shadow.dirty_bitmap_size )
3164 sc->pages = d->arch.shadow.dirty_bitmap_size;
3166 #define CHUNK (8*1024) /* Transfer and clear in 1kB chunks for L1 cache. */
3167 for ( i = 0; i < sc->pages; i += CHUNK )
3169 int bytes = ((((sc->pages - i) > CHUNK)
3170 ? CHUNK
3171 : (sc->pages - i)) + 7) / 8;
3173 if ( likely(peek) )
3175 if ( copy_to_guest_offset(
3176 sc->dirty_bitmap,
3177 i/(8*sizeof(unsigned long)),
3178 d->arch.shadow.dirty_bitmap+(i/(8*sizeof(unsigned long))),
3179 (bytes+sizeof(unsigned long)-1) / sizeof(unsigned long)) )
3181 rv = -EFAULT;
3182 goto out;
3186 if ( clean )
3187 memset(d->arch.shadow.dirty_bitmap + (i/(8*sizeof(unsigned long))),
3188 0, bytes);
3190 #undef CHUNK
3192 out:
3193 shadow_unlock(d);
3194 domain_unpause(d);
3195 return rv;
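/* The chunking above works in units of page-bits: CHUNK is 8192 pages per
 * iteration, i.e. at most 1kB of bitmap copied (and optionally cleared)
 * per pass, and the guest-handle offset is measured in unsigned longs,
 * hence the i/(8*sizeof(unsigned long)) arithmetic. */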
3199 /* Mark a page as dirty */
3200 void sh_mark_dirty(struct domain *d, mfn_t gmfn)
3202 unsigned long pfn;
3204 ASSERT(shadow_locked_by_me(d));
3206 if ( !shadow_mode_log_dirty(d) || !mfn_valid(gmfn) )
3207 return;
3209 ASSERT(d->arch.shadow.dirty_bitmap != NULL);
3211 /* We /really/ mean PFN here, even for non-translated guests. */
3212 pfn = get_gpfn_from_mfn(mfn_x(gmfn));
3214 /*
3215 * Values with the MSB set denote MFNs that aren't really part of the
3216 * domain's pseudo-physical memory map (e.g., the shared info frame).
3217 * Nothing to do here...
3218 */
3219 if ( unlikely(!VALID_M2P(pfn)) )
3220 return;
3222 /* N.B. Can use non-atomic TAS because protected by shadow_lock. */
3223 if ( likely(pfn < d->arch.shadow.dirty_bitmap_size) )
3225 if ( !__test_and_set_bit(pfn, d->arch.shadow.dirty_bitmap) )
3227 SHADOW_DEBUG(LOGDIRTY,
3228 "marked mfn %" SH_PRI_mfn " (pfn=%lx), dom %d\n",
3229 mfn_x(gmfn), pfn, d->domain_id);
3230 d->arch.shadow.dirty_count++;
3233 else
3235 SHADOW_PRINTK("mark_dirty OOR! "
3236 "mfn=%" SH_PRI_mfn " pfn=%lx max=%x (dom %d)\n"
3237 "owner=%d c=%08x t=%" PRtype_info "\n",
3238 mfn_x(gmfn),
3239 pfn,
3240 d->arch.shadow.dirty_bitmap_size,
3241 d->domain_id,
3242 (page_get_owner(mfn_to_page(gmfn))
3243 ? page_get_owner(mfn_to_page(gmfn))->domain_id
3244 : -1),
3245 mfn_to_page(gmfn)->count_info,
3246 mfn_to_page(gmfn)->u.inuse.type_info);
3250 void shadow_mark_dirty(struct domain *d, mfn_t gmfn)
3252 shadow_lock(d);
3253 sh_mark_dirty(d, gmfn);
3254 shadow_unlock(d);
3257 /**************************************************************************/
3258 /* Shadow-control XEN_DOMCTL dispatcher */
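/* Unit notes for the ops below: sc->mb is in megabytes and is converted to
 * pages with "<< (20 - PAGE_SHIFT)", i.e. 256 4kB pages per MB; a
 * preempted SET_ALLOCATION is restarted transparently via
 * hypercall_create_continuation(). */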
3260 int shadow_domctl(struct domain *d,
3261 xen_domctl_shadow_op_t *sc,
3262 XEN_GUEST_HANDLE(xen_domctl_t) u_domctl)
3264 int rc, preempted = 0;
3266 if ( unlikely(d == current->domain) )
3268 gdprintk(XENLOG_INFO, "Don't try to do a shadow op on yourself!\n");
3269 return -EINVAL;
3272 switch ( sc->op )
3274 case XEN_DOMCTL_SHADOW_OP_OFF:
3275 if ( shadow_mode_log_dirty(d) )
3276 if ( (rc = shadow_log_dirty_disable(d)) != 0 )
3277 return rc;
3278 if ( is_hvm_domain(d) )
3279 return -EINVAL;
3280 if ( d->arch.shadow.mode & SHM2_enable )
3281 if ( (rc = shadow_test_disable(d)) != 0 )
3282 return rc;
3283 return 0;
3285 case XEN_DOMCTL_SHADOW_OP_ENABLE_TEST:
3286 return shadow_test_enable(d);
3288 case XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY:
3289 return shadow_log_dirty_enable(d);
3291 case XEN_DOMCTL_SHADOW_OP_ENABLE_TRANSLATE:
3292 return shadow_enable(d, SHM2_refcounts|SHM2_translate);
3294 case XEN_DOMCTL_SHADOW_OP_CLEAN:
3295 case XEN_DOMCTL_SHADOW_OP_PEEK:
3296 return shadow_log_dirty_op(d, sc);
3298 case XEN_DOMCTL_SHADOW_OP_ENABLE:
3299 if ( sc->mode & XEN_DOMCTL_SHADOW_ENABLE_LOG_DIRTY )
3300 return shadow_log_dirty_enable(d);
3301 return shadow_enable(d, sc->mode << SHM2_shift);
3303 case XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION:
3304 sc->mb = shadow_get_allocation(d);
3305 return 0;
3307 case XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION:
3308 shadow_lock(d);
3309 rc = sh_set_allocation(d, sc->mb << (20 - PAGE_SHIFT), &preempted);
3310 shadow_unlock(d);
3311 if ( preempted )
3312 /* Not finished. Set up to re-run the call. */
3313 rc = hypercall_create_continuation(
3314 __HYPERVISOR_domctl, "h", u_domctl);
3315 else
3316 /* Finished. Return the new allocation */
3317 sc->mb = shadow_get_allocation(d);
3318 return rc;
3320 default:
3321 SHADOW_ERROR("Bad shadow op %u\n", sc->op);
3322 return -EINVAL;
3327 /**************************************************************************/
3328 /* Auditing shadow tables */
3330 #if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_FULL
3332 void shadow_audit_tables(struct vcpu *v)
3334 /* Dispatch table for getting per-type functions */
3335 static hash_callback_t callbacks[16] = {
3336 NULL, /* none */
3337 #if CONFIG_PAGING_LEVELS == 2
3338 SHADOW_INTERNAL_NAME(sh_audit_l1_table,2,2), /* l1_32 */
3339 SHADOW_INTERNAL_NAME(sh_audit_fl1_table,2,2), /* fl1_32 */
3340 SHADOW_INTERNAL_NAME(sh_audit_l2_table,2,2), /* l2_32 */
3341 #else
3342 SHADOW_INTERNAL_NAME(sh_audit_l1_table,3,2), /* l1_32 */
3343 SHADOW_INTERNAL_NAME(sh_audit_fl1_table,3,2), /* fl1_32 */
3344 SHADOW_INTERNAL_NAME(sh_audit_l2_table,3,2), /* l2_32 */
3345 SHADOW_INTERNAL_NAME(sh_audit_l1_table,3,3), /* l1_pae */
3346 SHADOW_INTERNAL_NAME(sh_audit_fl1_table,3,3), /* fl1_pae */
3347 SHADOW_INTERNAL_NAME(sh_audit_l2_table,3,3), /* l2_pae */
3348 SHADOW_INTERNAL_NAME(sh_audit_l2_table,3,3), /* l2h_pae */
3349 #if CONFIG_PAGING_LEVELS >= 4
3350 SHADOW_INTERNAL_NAME(sh_audit_l1_table,4,4), /* l1_64 */
3351 SHADOW_INTERNAL_NAME(sh_audit_fl1_table,4,4), /* fl1_64 */
3352 SHADOW_INTERNAL_NAME(sh_audit_l2_table,4,4), /* l2_64 */
3353 SHADOW_INTERNAL_NAME(sh_audit_l3_table,4,4), /* l3_64 */
3354 SHADOW_INTERNAL_NAME(sh_audit_l4_table,4,4), /* l4_64 */
3355 #endif /* CONFIG_PAGING_LEVELS >= 4 */
3356 #endif /* CONFIG_PAGING_LEVELS > 2 */
3357 NULL /* All the rest */
3358 };
3359 unsigned int mask;
3361 if ( !(SHADOW_AUDIT_ENABLE) )
3362 return;
3364 if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_FULL )
3365 mask = ~1; /* Audit every table in the system */
3366 else
3368 /* Audit only the current mode's tables */
3369 switch ( v->arch.shadow.mode->guest_levels )
3371 case 2: mask = (SHF_L1_32|SHF_FL1_32|SHF_L2_32); break;
3372 case 3: mask = (SHF_L1_PAE|SHF_FL1_PAE|SHF_L2_PAE
3373 |SHF_L2H_PAE); break;
3374 case 4: mask = (SHF_L1_64|SHF_FL1_64|SHF_L2_64
3375 |SHF_L3_64|SHF_L4_64); break;
3376 default: BUG();
3380 hash_foreach(v, mask, callbacks, _mfn(INVALID_MFN));
3383 #endif /* Shadow audit */
3386 /**************************************************************************/
3387 /* Auditing p2m tables */
3389 #if SHADOW_AUDIT & SHADOW_AUDIT_P2M
3391 void shadow_audit_p2m(struct domain *d)
3393 struct list_head *entry;
3394 struct page_info *page;
3395 struct domain *od;
3396 unsigned long mfn, gfn, m2pfn, lp2mfn = 0;
3397 mfn_t p2mfn;
3398 unsigned long orphans_d = 0, orphans_i = 0, mpbad = 0, pmbad = 0;
3399 int test_linear;
3401 if ( !(SHADOW_AUDIT_ENABLE) || !shadow_mode_translate(d) )
3402 return;
3404 //SHADOW_PRINTK("p2m audit starts\n");
3406 test_linear = ( (d == current->domain)
3407 && !pagetable_is_null(current->arch.monitor_table) );
3408 if ( test_linear )
3409 local_flush_tlb();
3411 /* Audit part one: walk the domain's page allocation list, checking
3412 * the m2p entries. */
3413 for ( entry = d->page_list.next;
3414 entry != &d->page_list;
3415 entry = entry->next )
3417 page = list_entry(entry, struct page_info, list);
3418 mfn = mfn_x(page_to_mfn(page));
3420 // SHADOW_PRINTK("auditing guest page, mfn=%#lx\n", mfn);
3422 od = page_get_owner(page);
3424 if ( od != d )
3426 SHADOW_PRINTK("wrong owner %#lx -> %p(%u) != %p(%u)\n",
3427 mfn, od, (od?od->domain_id:-1), d, d->domain_id);
3428 continue;
3431 gfn = get_gpfn_from_mfn(mfn);
3432 if ( gfn == INVALID_M2P_ENTRY )
3434 orphans_i++;
3435 //SHADOW_PRINTK("orphaned guest page: mfn=%#lx has invalid gfn\n",
3436 // mfn);
3437 continue;
3440 if ( gfn == 0x55555555 )
3442 orphans_d++;
3443 //SHADOW_PRINTK("orphaned guest page: mfn=%#lx has debug gfn\n",
3444 // mfn);
3445 continue;
3448 p2mfn = sh_gfn_to_mfn_foreign(d, gfn);
3449 if ( mfn_x(p2mfn) != mfn )
3451 mpbad++;
3452 SHADOW_PRINTK("map mismatch mfn %#lx -> gfn %#lx -> mfn %#lx"
3453 " (-> gfn %#lx)\n",
3454 mfn, gfn, mfn_x(p2mfn),
3455 (mfn_valid(p2mfn)
3456 ? get_gpfn_from_mfn(mfn_x(p2mfn))
3457 : -1u));
3458 /* This m2p entry is stale: the domain has another frame in
3459 * this physical slot. No great disaster, but for neatness,
3460 * blow away the m2p entry. */
3461 set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY);
3464 if ( test_linear && (gfn <= d->arch.max_mapped_pfn) )
3466 lp2mfn = gfn_to_mfn_current(gfn);
3467 if ( mfn_x(lp2mfn) != mfn_x(p2mfn) )
3469 SHADOW_PRINTK("linear mismatch gfn %#lx -> mfn %#lx "
3470 "(!= mfn %#lx)\n", gfn,
3471 mfn_x(lp2mfn), mfn_x(p2mfn));
3475 // SHADOW_PRINTK("OK: mfn=%#lx, gfn=%#lx, p2mfn=%#lx, lp2mfn=%#lx\n",
3476 // mfn, gfn, p2mfn, lp2mfn);
3479 /* Audit part two: walk the domain's p2m table, checking the entries. */
3480 if ( pagetable_get_pfn(d->arch.phys_table) != 0 )
3482 l2_pgentry_t *l2e;
3483 l1_pgentry_t *l1e;
3484 int i1, i2;
3486 #if CONFIG_PAGING_LEVELS == 4
3487 l4_pgentry_t *l4e;
3488 l3_pgentry_t *l3e;
3489 int i3, i4;
3490 l4e = sh_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
3491 #elif CONFIG_PAGING_LEVELS == 3
3492 l3_pgentry_t *l3e;
3493 int i3;
3494 l3e = sh_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
3495 #else /* CONFIG_PAGING_LEVELS == 2 */
3496 l2e = sh_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
3497 #endif
3499 gfn = 0;
3500 #if CONFIG_PAGING_LEVELS >= 3
3501 #if CONFIG_PAGING_LEVELS >= 4
3502 for ( i4 = 0; i4 < L4_PAGETABLE_ENTRIES; i4++ )
3504 if ( !(l4e_get_flags(l4e[i4]) & _PAGE_PRESENT) )
3506 gfn += 1 << (L4_PAGETABLE_SHIFT - PAGE_SHIFT);
3507 continue;
3509 l3e = sh_map_domain_page(_mfn(l4e_get_pfn(l4e[i4])));
3510 #endif /* now at levels 3 or 4... */
3511 for ( i3 = 0;
3512 i3 < ((CONFIG_PAGING_LEVELS==4) ? L3_PAGETABLE_ENTRIES : 8);
3513 i3++ )
3515 if ( !(l3e_get_flags(l3e[i3]) & _PAGE_PRESENT) )
3517 gfn += 1 << (L3_PAGETABLE_SHIFT - PAGE_SHIFT);
3518 continue;
3520 l2e = sh_map_domain_page(_mfn(l3e_get_pfn(l3e[i3])));
3521 #endif /* all levels... */
3522 for ( i2 = 0; i2 < L2_PAGETABLE_ENTRIES; i2++ )
3524 if ( !(l2e_get_flags(l2e[i2]) & _PAGE_PRESENT) )
3526 gfn += 1 << (L2_PAGETABLE_SHIFT - PAGE_SHIFT);
3527 continue;
3529 l1e = sh_map_domain_page(_mfn(l2e_get_pfn(l2e[i2])));
3531 for ( i1 = 0; i1 < L1_PAGETABLE_ENTRIES; i1++, gfn++ )
3533 if ( !(l1e_get_flags(l1e[i1]) & _PAGE_PRESENT) )
3534 continue;
3535 mfn = l1e_get_pfn(l1e[i1]);
3536 ASSERT(mfn_valid(_mfn(mfn)));
3537 m2pfn = get_gpfn_from_mfn(mfn);
3538 if ( m2pfn != gfn )
3540 pmbad++;
3541 SHADOW_PRINTK("mismatch: gfn %#lx -> mfn %#lx"
3542 " -> gfn %#lx\n", gfn, mfn, m2pfn);
3543 BUG();
3546 sh_unmap_domain_page(l1e);
3548 #if CONFIG_PAGING_LEVELS >= 3
3549 sh_unmap_domain_page(l2e);
3551 #if CONFIG_PAGING_LEVELS >= 4
3552 sh_unmap_domain_page(l3e);
3554 #endif
3555 #endif
3557 #if CONFIG_PAGING_LEVELS == 4
3558 sh_unmap_domain_page(l4e);
3559 #elif CONFIG_PAGING_LEVELS == 3
3560 sh_unmap_domain_page(l3e);
3561 #else /* CONFIG_PAGING_LEVELS == 2 */
3562 sh_unmap_domain_page(l2e);
3563 #endif
3567 //SHADOW_PRINTK("p2m audit complete\n");
3568 //if ( orphans_i | orphans_d | mpbad | pmbad )
3569 // SHADOW_PRINTK("p2m audit found %lu orphans (%lu inval %lu debug)\n",
3570 // orphans_i + orphans_d, orphans_i, orphans_d,
3571 if ( mpbad | pmbad )
3572 SHADOW_PRINTK("p2m audit found %lu odd p2m, %lu bad m2p entries\n",
3573 pmbad, mpbad);
3576 #endif /* p2m audit */
3578 /*
3579 * Local variables:
3580 * mode: C
3581 * c-set-style: "BSD"
3582 * c-basic-offset: 4
3583 * indent-tabs-mode: nil
3584 * End:
3585 */