ia64/xen-unstable: xen/arch/x86/mm/shadow/common.c @ 13547:a514ae6bc150

[HVM] Save/restore cleanups 06: Let dom0 change domU's paging mode

Signed-off-by: Zhai Edwin <edwin.zhai@intel.com>
Signed-off-by: Tim Deegan <Tim.Deegan@xensource.com>

author    Tim Deegan <Tim.Deegan@xensource.com>
date      Sat Jan 20 11:40:52 2007 +0000
parents   f78cca1e57a2
children  30af6cfdb05c
1 /******************************************************************************
2 * arch/x86/mm/shadow/common.c
3 *
4 * Shadow code that does not need to be multiply compiled.
5 * Parts of this code are Copyright (c) 2006 by XenSource Inc.
6 * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
7 * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 */
24 #include <xen/config.h>
25 #include <xen/types.h>
26 #include <xen/mm.h>
27 #include <xen/trace.h>
28 #include <xen/sched.h>
29 #include <xen/perfc.h>
30 #include <xen/irq.h>
31 #include <xen/domain_page.h>
32 #include <xen/guest_access.h>
33 #include <xen/keyhandler.h>
34 #include <asm/event.h>
35 #include <asm/page.h>
36 #include <asm/current.h>
37 #include <asm/flushtlb.h>
38 #include <asm/shadow.h>
39 #include <asm/shared.h>
40 #include "private.h"
43 /* Set up the shadow-specific parts of a domain struct at start of day.
44 * Called for every domain from arch_domain_create() */
45 void shadow_domain_init(struct domain *d)
46 {
47 int i;
48 shadow_lock_init(d);
49 for ( i = 0; i <= SHADOW_MAX_ORDER; i++ )
50 INIT_LIST_HEAD(&d->arch.shadow.freelists[i]);
51 INIT_LIST_HEAD(&d->arch.shadow.p2m_freelist);
52 INIT_LIST_HEAD(&d->arch.shadow.p2m_inuse);
53 INIT_LIST_HEAD(&d->arch.shadow.pinned_shadows);
54 }
57 #if SHADOW_AUDIT
58 int shadow_audit_enable = 0;
60 static void shadow_audit_key(unsigned char key)
61 {
62 shadow_audit_enable = !shadow_audit_enable;
63 printk("%s shadow_audit_enable=%d\n",
64 __func__, shadow_audit_enable);
65 }
67 static int __init shadow_audit_key_init(void)
68 {
69 register_keyhandler(
70 'O', shadow_audit_key, "toggle shadow audits");
71 return 0;
72 }
73 __initcall(shadow_audit_key_init);
74 #endif /* SHADOW_AUDIT */
76 static void sh_free_log_dirty_bitmap(struct domain *d);
78 int _shadow_mode_refcounts(struct domain *d)
79 {
80 return shadow_mode_refcounts(d);
81 }
84 /**************************************************************************/
85 /* x86 emulator support for the shadow code
86 */
88 struct segment_register *hvm_get_seg_reg(
89 enum x86_segment seg, struct sh_emulate_ctxt *sh_ctxt)
90 {
91 struct segment_register *seg_reg = &sh_ctxt->seg_reg[seg];
92 if ( !__test_and_set_bit(seg, &sh_ctxt->valid_seg_regs) )
93 hvm_get_segment_register(current, seg, seg_reg);
94 return seg_reg;
95 }
97 enum hvm_access_type {
98 hvm_access_insn_fetch, hvm_access_read, hvm_access_write
99 };
101 static int hvm_translate_linear_addr(
102 enum x86_segment seg,
103 unsigned long offset,
104 unsigned int bytes,
105 enum hvm_access_type access_type,
106 struct sh_emulate_ctxt *sh_ctxt,
107 unsigned long *paddr)
108 {
109 struct segment_register *reg = hvm_get_seg_reg(seg, sh_ctxt);
110 unsigned long limit, addr = offset;
111 uint32_t last_byte;
113 if ( sh_ctxt->ctxt.addr_size != 64 )
114 {
115 /*
116 * COMPATIBILITY MODE: Apply segment checks and add base.
117 */
119 switch ( access_type )
120 {
121 case hvm_access_read:
122 if ( (reg->attr.fields.type & 0xa) == 0x8 )
123 goto gpf; /* execute-only code segment */
124 break;
125 case hvm_access_write:
126 if ( (reg->attr.fields.type & 0xa) != 0x2 )
127 goto gpf; /* not a writable data segment */
128 break;
129 default:
130 break;
131 }
133 /* Calculate the segment limit, including granularity flag. */
134 limit = reg->limit;
135 if ( reg->attr.fields.g )
136 limit = (limit << 12) | 0xfff;
138 last_byte = offset + bytes - 1;
140 /* Is this a grows-down data segment? Special limit check if so. */
141 if ( (reg->attr.fields.type & 0xc) == 0x4 )
142 {
143 /* Is upper limit 0xFFFF or 0xFFFFFFFF? */
144 if ( !reg->attr.fields.db )
145 last_byte = (uint16_t)last_byte;
147 /* Check first byte and last byte against respective bounds. */
148 if ( (offset <= limit) || (last_byte < offset) )
149 goto gpf;
150 }
151 else if ( (last_byte > limit) || (last_byte < offset) )
152 goto gpf; /* last byte is beyond limit or wraps 0xFFFFFFFF */
154 /*
155 * Hardware truncates to 32 bits in compatibility mode.
156 * It does not truncate to 16 bits in 16-bit address-size mode.
157 */
158 addr = (uint32_t)(addr + reg->base);
159 }
160 else
161 {
162 /*
163 * LONG MODE: FS and GS add segment base. Addresses must be canonical.
164 */
166 if ( (seg == x86_seg_fs) || (seg == x86_seg_gs) )
167 addr += reg->base;
169 if ( !is_canonical_address(addr) )
170 goto gpf;
171 }
173 *paddr = addr;
174 return 0;
176 gpf:
177 /* Inject #GP(0). */
178 hvm_inject_exception(TRAP_gp_fault, 0, 0);
179 return X86EMUL_PROPAGATE_FAULT;
180 }
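/* --------------------------------------------------------------------
 * Editor's note: the sketch below is NOT part of the original file.  It
 * restates the compatibility-mode limit check from
 * hvm_translate_linear_addr() against plain C types so it can be read
 * (and compiled) in isolation.  "struct seg_sketch" and "seg_range_ok"
 * are hypothetical names; the access-type checks and #GP injection are
 * deliberately left out.
 * -------------------------------------------------------------------- */
#include <stdint.h>

struct seg_sketch {
    uint32_t base;        /* segment base (unused in the range check) */
    uint32_t limit;       /* raw 20-bit limit from the descriptor */
    int g;                /* granularity: 1 => limit counts 4K pages */
    int expand_down;      /* grows-down data segment */
    int db;               /* D/B: 1 => 32-bit upper bound, 0 => 16-bit */
};

/* Return 1 if the range [offset, offset+bytes-1] lies inside the segment. */
static int seg_range_ok(const struct seg_sketch *s,
                        uint32_t offset, uint32_t bytes)
{
    uint32_t limit = s->g ? ((s->limit << 12) | 0xfff) : s->limit;
    uint32_t last_byte = offset + bytes - 1;

    if ( s->expand_down )
    {
        /* Valid offsets live above the limit, up to 0xFFFF or 0xFFFFFFFF
         * depending on the D/B bit; also reject wrap-around. */
        if ( !s->db )
            last_byte = (uint16_t)last_byte;
        return (offset > limit) && (last_byte >= offset);
    }

    /* Normal segment: the last byte must not pass the limit or wrap. */
    return (last_byte <= limit) && (last_byte >= offset);
}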
182 static int
183 hvm_read(enum x86_segment seg,
184 unsigned long offset,
185 unsigned long *val,
186 unsigned int bytes,
187 enum hvm_access_type access_type,
188 struct sh_emulate_ctxt *sh_ctxt)
189 {
190 unsigned long addr;
191 int rc, errcode;
193 rc = hvm_translate_linear_addr(
194 seg, offset, bytes, access_type, sh_ctxt, &addr);
195 if ( rc )
196 return rc;
198 *val = 0;
199 // XXX -- this is WRONG.
200 // It entirely ignores the permissions in the page tables.
201 // In this case, that is only a user vs supervisor access check.
202 //
203 if ( (rc = hvm_copy_from_guest_virt(val, addr, bytes)) == 0 )
204 return X86EMUL_CONTINUE;
206 /* If we got here, there was nothing mapped here, or a bad GFN
207 * was mapped here. This should never happen: we're here because
208 * of a write fault at the end of the instruction we're emulating. */
209 SHADOW_PRINTK("read failed to va %#lx\n", addr);
210 errcode = ring_3(sh_ctxt->ctxt.regs) ? PFEC_user_mode : 0;
211 if ( access_type == hvm_access_insn_fetch )
212 errcode |= PFEC_insn_fetch;
213 hvm_inject_exception(TRAP_page_fault, errcode, addr + bytes - rc);
214 return X86EMUL_PROPAGATE_FAULT;
215 }
217 static int
218 hvm_emulate_read(enum x86_segment seg,
219 unsigned long offset,
220 unsigned long *val,
221 unsigned int bytes,
222 struct x86_emulate_ctxt *ctxt)
223 {
224 return hvm_read(seg, offset, val, bytes, hvm_access_read,
225 container_of(ctxt, struct sh_emulate_ctxt, ctxt));
226 }
228 static int
229 hvm_emulate_insn_fetch(enum x86_segment seg,
230 unsigned long offset,
231 unsigned long *val,
232 unsigned int bytes,
233 struct x86_emulate_ctxt *ctxt)
234 {
235 struct sh_emulate_ctxt *sh_ctxt =
236 container_of(ctxt, struct sh_emulate_ctxt, ctxt);
237 unsigned int insn_off = offset - ctxt->regs->eip;
239 /* Fall back if requested bytes are not in the prefetch cache. */
240 if ( unlikely((insn_off + bytes) > sh_ctxt->insn_buf_bytes) )
241 return hvm_read(seg, offset, val, bytes,
242 hvm_access_insn_fetch, sh_ctxt);
244 /* Hit the cache. Simple memcpy. */
245 *val = 0;
246 memcpy(val, &sh_ctxt->insn_buf[insn_off], bytes);
247 return X86EMUL_CONTINUE;
248 }
250 static int
251 hvm_emulate_write(enum x86_segment seg,
252 unsigned long offset,
253 unsigned long val,
254 unsigned int bytes,
255 struct x86_emulate_ctxt *ctxt)
256 {
257 struct sh_emulate_ctxt *sh_ctxt =
258 container_of(ctxt, struct sh_emulate_ctxt, ctxt);
259 struct vcpu *v = current;
260 unsigned long addr;
261 int rc;
263 rc = hvm_translate_linear_addr(
264 seg, offset, bytes, hvm_access_write, sh_ctxt, &addr);
265 if ( rc )
266 return rc;
268 return v->arch.shadow.mode->x86_emulate_write(
269 v, addr, &val, bytes, sh_ctxt);
270 }
272 static int
273 hvm_emulate_cmpxchg(enum x86_segment seg,
274 unsigned long offset,
275 unsigned long old,
276 unsigned long new,
277 unsigned int bytes,
278 struct x86_emulate_ctxt *ctxt)
279 {
280 struct sh_emulate_ctxt *sh_ctxt =
281 container_of(ctxt, struct sh_emulate_ctxt, ctxt);
282 struct vcpu *v = current;
283 unsigned long addr;
284 int rc;
286 rc = hvm_translate_linear_addr(
287 seg, offset, bytes, hvm_access_write, sh_ctxt, &addr);
288 if ( rc )
289 return rc;
291 return v->arch.shadow.mode->x86_emulate_cmpxchg(
292 v, addr, old, new, bytes, sh_ctxt);
293 }
295 static int
296 hvm_emulate_cmpxchg8b(enum x86_segment seg,
297 unsigned long offset,
298 unsigned long old_lo,
299 unsigned long old_hi,
300 unsigned long new_lo,
301 unsigned long new_hi,
302 struct x86_emulate_ctxt *ctxt)
303 {
304 struct sh_emulate_ctxt *sh_ctxt =
305 container_of(ctxt, struct sh_emulate_ctxt, ctxt);
306 struct vcpu *v = current;
307 unsigned long addr;
308 int rc;
310 rc = hvm_translate_linear_addr(
311 seg, offset, 8, hvm_access_write, sh_ctxt, &addr);
312 if ( rc )
313 return rc;
315 return v->arch.shadow.mode->x86_emulate_cmpxchg8b(
316 v, addr, old_lo, old_hi, new_lo, new_hi, sh_ctxt);
317 }
319 static struct x86_emulate_ops hvm_shadow_emulator_ops = {
320 .read = hvm_emulate_read,
321 .insn_fetch = hvm_emulate_insn_fetch,
322 .write = hvm_emulate_write,
323 .cmpxchg = hvm_emulate_cmpxchg,
324 .cmpxchg8b = hvm_emulate_cmpxchg8b,
325 };
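/* Editor's note: the short sketch below is NOT part of the original file.
 * It shows, with hypothetical demo_* names, how an ops table like
 * hvm_shadow_emulator_ops is consumed: the emulator core only ever sees the
 * function-pointer interface, so the same core serves both the HVM and PV
 * callback sets chosen by shadow_init_emulation() further down. */
struct demo_emulate_ops {
    int (*read)(unsigned long addr, unsigned long *val, unsigned int bytes);
    int (*write)(unsigned long addr, unsigned long val, unsigned int bytes);
};

/* A core helper that works against whichever table it is handed. */
static int demo_emulate_mov(const struct demo_emulate_ops *ops,
                            unsigned long src, unsigned long dst,
                            unsigned int bytes)
{
    unsigned long val = 0;
    int rc = ops->read(src, &val, bytes);
    if ( rc != 0 )
        return rc;
    return ops->write(dst, val, bytes);
}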
327 static int
328 pv_emulate_read(enum x86_segment seg,
329 unsigned long offset,
330 unsigned long *val,
331 unsigned int bytes,
332 struct x86_emulate_ctxt *ctxt)
333 {
334 unsigned int rc;
336 *val = 0;
337 if ( (rc = copy_from_user((void *)val, (void *)offset, bytes)) != 0 )
338 {
339 propagate_page_fault(offset + bytes - rc, 0); /* read fault */
340 return X86EMUL_PROPAGATE_FAULT;
341 }
343 return X86EMUL_CONTINUE;
344 }
346 static int
347 pv_emulate_write(enum x86_segment seg,
348 unsigned long offset,
349 unsigned long val,
350 unsigned int bytes,
351 struct x86_emulate_ctxt *ctxt)
352 {
353 struct sh_emulate_ctxt *sh_ctxt =
354 container_of(ctxt, struct sh_emulate_ctxt, ctxt);
355 struct vcpu *v = current;
356 return v->arch.shadow.mode->x86_emulate_write(
357 v, offset, &val, bytes, sh_ctxt);
358 }
360 static int
361 pv_emulate_cmpxchg(enum x86_segment seg,
362 unsigned long offset,
363 unsigned long old,
364 unsigned long new,
365 unsigned int bytes,
366 struct x86_emulate_ctxt *ctxt)
367 {
368 struct sh_emulate_ctxt *sh_ctxt =
369 container_of(ctxt, struct sh_emulate_ctxt, ctxt);
370 struct vcpu *v = current;
371 return v->arch.shadow.mode->x86_emulate_cmpxchg(
372 v, offset, old, new, bytes, sh_ctxt);
373 }
375 static int
376 pv_emulate_cmpxchg8b(enum x86_segment seg,
377 unsigned long offset,
378 unsigned long old_lo,
379 unsigned long old_hi,
380 unsigned long new_lo,
381 unsigned long new_hi,
382 struct x86_emulate_ctxt *ctxt)
383 {
384 struct sh_emulate_ctxt *sh_ctxt =
385 container_of(ctxt, struct sh_emulate_ctxt, ctxt);
386 struct vcpu *v = current;
387 return v->arch.shadow.mode->x86_emulate_cmpxchg8b(
388 v, offset, old_lo, old_hi, new_lo, new_hi, sh_ctxt);
389 }
391 static struct x86_emulate_ops pv_shadow_emulator_ops = {
392 .read = pv_emulate_read,
393 .insn_fetch = pv_emulate_read,
394 .write = pv_emulate_write,
395 .cmpxchg = pv_emulate_cmpxchg,
396 .cmpxchg8b = pv_emulate_cmpxchg8b,
397 };
399 struct x86_emulate_ops *shadow_init_emulation(
400 struct sh_emulate_ctxt *sh_ctxt, struct cpu_user_regs *regs)
401 {
402 struct segment_register *creg, *sreg;
403 struct vcpu *v = current;
404 unsigned long addr;
406 sh_ctxt->ctxt.regs = regs;
408 if ( !is_hvm_vcpu(v) )
409 {
410 sh_ctxt->ctxt.addr_size = sh_ctxt->ctxt.sp_size = BITS_PER_LONG;
411 return &pv_shadow_emulator_ops;
412 }
414 /* Segment cache initialisation. Primed with CS. */
415 sh_ctxt->valid_seg_regs = 0;
416 creg = hvm_get_seg_reg(x86_seg_cs, sh_ctxt);
418 /* Work out the emulation mode. */
419 if ( hvm_long_mode_enabled(v) && creg->attr.fields.l )
420 {
421 sh_ctxt->ctxt.addr_size = sh_ctxt->ctxt.sp_size = 64;
422 }
423 else if ( regs->eflags & X86_EFLAGS_VM )
424 {
425 sh_ctxt->ctxt.addr_size = sh_ctxt->ctxt.sp_size = 16;
426 }
427 else
428 {
429 sreg = hvm_get_seg_reg(x86_seg_ss, sh_ctxt);
430 sh_ctxt->ctxt.addr_size = creg->attr.fields.db ? 32 : 16;
431 sh_ctxt->ctxt.sp_size = sreg->attr.fields.db ? 32 : 16;
432 }
434 /* Attempt to prefetch whole instruction. */
435 sh_ctxt->insn_buf_bytes =
436 (!hvm_translate_linear_addr(
437 x86_seg_cs, regs->eip, sizeof(sh_ctxt->insn_buf),
438 hvm_access_insn_fetch, sh_ctxt, &addr) &&
439 !hvm_copy_from_guest_virt(
440 sh_ctxt->insn_buf, addr, sizeof(sh_ctxt->insn_buf)))
441 ? sizeof(sh_ctxt->insn_buf) : 0;
443 return &hvm_shadow_emulator_ops;
444 }
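/* Editor's note: NOT part of the original file.  A standalone restatement of
 * the mode calculation above: 64-bit when long mode and CS.L are both set,
 * 16-bit for virtual-8086, otherwise CS.D/B selects 16 vs 32 bits (the same
 * rule, applied to SS.D/B, gives the stack-pointer size).  The demo_* name
 * is hypothetical. */
static unsigned int demo_addr_size(int long_mode_enabled, int cs_l,
                                   int eflags_vm, int cs_db)
{
    if ( long_mode_enabled && cs_l )
        return 64;
    if ( eflags_vm )
        return 16;
    return cs_db ? 32 : 16;
}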
446 /**************************************************************************/
447 /* Code for "promoting" a guest page to the point where the shadow code is
448 * willing to let it be treated as a guest page table. This generally
449 * involves making sure there are no writable mappings available to the guest
450 * for this page.
451 */
452 void shadow_promote(struct vcpu *v, mfn_t gmfn, unsigned int type)
453 {
454 struct page_info *page = mfn_to_page(gmfn);
456 ASSERT(mfn_valid(gmfn));
458 /* We should never try to promote a gmfn that has writeable mappings */
459 ASSERT(sh_remove_write_access(v, gmfn, 0, 0) == 0);
461 /* Is the page already shadowed? */
462 if ( !test_and_set_bit(_PGC_page_table, &page->count_info) )
463 page->shadow_flags = 0;
465 ASSERT(!test_bit(type, &page->shadow_flags));
466 set_bit(type, &page->shadow_flags);
467 }
469 void shadow_demote(struct vcpu *v, mfn_t gmfn, u32 type)
470 {
471 struct page_info *page = mfn_to_page(gmfn);
473 ASSERT(test_bit(_PGC_page_table, &page->count_info));
474 ASSERT(test_bit(type, &page->shadow_flags));
476 clear_bit(type, &page->shadow_flags);
478 if ( (page->shadow_flags & SHF_page_type_mask) == 0 )
479 {
480 /* tlbflush timestamp field is valid again */
481 page->tlbflush_timestamp = tlbflush_current_time();
482 clear_bit(_PGC_page_table, &page->count_info);
483 }
484 }
486 /**************************************************************************/
487 /* Validate a pagetable change from the guest and update the shadows.
488 * Returns a bitmask of SHADOW_SET_* flags. */
490 int
491 sh_validate_guest_entry(struct vcpu *v, mfn_t gmfn, void *entry, u32 size)
492 {
493 int result = 0;
494 struct page_info *page = mfn_to_page(gmfn);
496 sh_mark_dirty(v->domain, gmfn);
498 // Determine which types of shadows are affected, and update each.
499 //
500 // Always validate L1s before L2s to prevent another cpu with a linear
501 // mapping of this gmfn from seeing a walk that results from
502 // using the new L2 value and the old L1 value. (It is OK for such a
503 // guest to see a walk that uses the old L2 value with the new L1 value,
504 // as hardware could behave this way if one level of the pagewalk occurs
505 // before the store, and the next level of the pagewalk occurs after the
506 // store.)
507 //
508 // Ditto for L2s before L3s, etc.
509 //
511 if ( !(page->count_info & PGC_page_table) )
512 return 0; /* Not shadowed at all */
514 #if CONFIG_PAGING_LEVELS == 2
515 if ( page->shadow_flags & SHF_L1_32 )
516 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 2, 2)
517 (v, gmfn, entry, size);
518 #else
519 if ( page->shadow_flags & SHF_L1_32 )
520 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 3, 2)
521 (v, gmfn, entry, size);
522 #endif
524 #if CONFIG_PAGING_LEVELS == 2
525 if ( page->shadow_flags & SHF_L2_32 )
526 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 2, 2)
527 (v, gmfn, entry, size);
528 #else
529 if ( page->shadow_flags & SHF_L2_32 )
530 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 3, 2)
531 (v, gmfn, entry, size);
532 #endif
534 #if CONFIG_PAGING_LEVELS >= 3
535 if ( page->shadow_flags & SHF_L1_PAE )
536 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 3, 3)
537 (v, gmfn, entry, size);
538 if ( page->shadow_flags & SHF_L2_PAE )
539 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 3, 3)
540 (v, gmfn, entry, size);
541 if ( page->shadow_flags & SHF_L2H_PAE )
542 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2he, 3, 3)
543 (v, gmfn, entry, size);
544 #else /* 32-bit non-PAE hypervisor does not support PAE guests */
545 ASSERT((page->shadow_flags & (SHF_L2H_PAE|SHF_L2_PAE|SHF_L1_PAE)) == 0);
546 #endif
548 #if CONFIG_PAGING_LEVELS >= 4
549 if ( page->shadow_flags & SHF_L1_64 )
550 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 4, 4)
551 (v, gmfn, entry, size);
552 if ( page->shadow_flags & SHF_L2_64 )
553 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 4, 4)
554 (v, gmfn, entry, size);
555 if ( page->shadow_flags & SHF_L3_64 )
556 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl3e, 4, 4)
557 (v, gmfn, entry, size);
558 if ( page->shadow_flags & SHF_L4_64 )
559 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl4e, 4, 4)
560 (v, gmfn, entry, size);
561 #else /* 32-bit/PAE hypervisor does not support 64-bit guests */
562 ASSERT((page->shadow_flags
563 & (SHF_L4_64|SHF_L3_64|SHF_L2_64|SHF_L1_64)) == 0);
564 #endif
566 return result;
567 }
570 void
571 sh_validate_guest_pt_write(struct vcpu *v, mfn_t gmfn,
572 void *entry, u32 size)
573 /* This is the entry point for emulated writes to pagetables in HVM guests and
574 * PV translated guests.
575 */
576 {
577 struct domain *d = v->domain;
578 int rc;
580 ASSERT(shadow_locked_by_me(v->domain));
581 rc = sh_validate_guest_entry(v, gmfn, entry, size);
582 if ( rc & SHADOW_SET_FLUSH )
583 /* Need to flush TLBs to pick up shadow PT changes */
584 flush_tlb_mask(d->domain_dirty_cpumask);
585 if ( rc & SHADOW_SET_ERROR )
586 {
587 /* This page is probably not a pagetable any more: tear it out of the
588 * shadows, along with any tables that reference it.
589 * Since the validate call above will have made a "safe" (i.e. zero)
590 * shadow entry, we can let the domain live even if we can't fully
591 * unshadow the page. */
592 sh_remove_shadows(v, gmfn, 0, 0);
593 }
594 }
596 int shadow_write_guest_entry(struct vcpu *v, intpte_t *p,
597 intpte_t new, mfn_t gmfn)
598 /* Write a new value into the guest pagetable, and update the shadows
599 * appropriately. Returns 0 if we page-faulted, 1 for success. */
600 {
601 int failed;
602 shadow_lock(v->domain);
603 failed = __copy_to_user(p, &new, sizeof(new));
604 if ( failed != sizeof(new) )
605 sh_validate_guest_entry(v, gmfn, p, sizeof(new));
606 shadow_unlock(v->domain);
607 return (failed == 0);
608 }
610 int shadow_cmpxchg_guest_entry(struct vcpu *v, intpte_t *p,
611 intpte_t *old, intpte_t new, mfn_t gmfn)
612 /* Cmpxchg a new value into the guest pagetable, and update the shadows
613 * appropriately. Returns 0 if we page-faulted, 1 if not.
614 * N.B. caller should check the value of "old" to see if the
615 * cmpxchg itself was successful. */
616 {
617 int failed;
618 intpte_t t = *old;
619 shadow_lock(v->domain);
620 failed = cmpxchg_user(p, t, new);
621 if ( t == *old )
622 sh_validate_guest_entry(v, gmfn, p, sizeof(new));
623 *old = t;
624 shadow_unlock(v->domain);
625 return (failed == 0);
626 }
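/* Editor's note: NOT part of the original file.  An illustrative caller
 * pattern for shadow_cmpxchg_guest_entry() above: the return value only says
 * whether the guest access faulted; whether the swap actually happened must
 * be judged by comparing the returned "old" value with what the caller
 * expected.  The demo_* name is hypothetical. */
static int demo_update_pte(struct vcpu *v, intpte_t *p, mfn_t gmfn,
                           intpte_t expected, intpte_t new_val)
{
    intpte_t old = expected;
    if ( !shadow_cmpxchg_guest_entry(v, p, &old, new_val, gmfn) )
        return 0;                 /* faulted while touching the entry */
    return (old == expected);     /* 1 iff the cmpxchg took effect */
}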
629 /**************************************************************************/
630 /* Memory management for shadow pages. */
632 /* Allocating shadow pages
633 * -----------------------
634 *
635 * Most shadow pages are allocated singly, but there is one case where
636 * we need to allocate multiple pages together: shadowing 32-bit guest
637 * tables on PAE or 64-bit shadows. A 32-bit guest l1 table covers 4MB
638 * of virtual address space, and needs to be shadowed by two PAE/64-bit
639 * l1 tables (covering 2MB of virtual address space each). Similarly, a
640 * 32-bit guest l2 table (4GB va) needs to be shadowed by four
641 * PAE/64-bit l2 tables (1GB va each). These multi-page shadows are
642 * contiguous and aligned; functions for handling offsets into them are
643 * defined in shadow.c (shadow_l1_index() etc.)
644 *
645 * This table shows the allocation behaviour of the different modes:
646 *
647 * Xen paging 32b pae pae 64b 64b 64b
648 * Guest paging 32b 32b pae 32b pae 64b
649 * PV or HVM * HVM * HVM HVM *
650 * Shadow paging 32b pae pae pae pae 64b
651 *
652 * sl1 size 4k 8k 4k 8k 4k 4k
653 * sl2 size 4k 16k 4k 16k 4k 4k
654 * sl3 size - - - - - 4k
655 * sl4 size - - - - - 4k
656 *
657 * We allocate memory from xen in four-page units and break them down
658 * with a simple buddy allocator. Can't use the xen allocator to handle
659 * this as it only works for contiguous zones, and a domain's shadow
660 * pool is made of fragments.
661 *
662 * In HVM guests, the p2m table is built out of shadow pages, and we provide
663 * a function for the p2m management to steal pages, in max-order chunks, from
664 * the free pool. We don't provide for giving them back, yet.
665 */
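/* Editor's note: NOT part of the original file.  The buddy bookkeeping
 * described above hinges on one relation: because chunks of size 1<<order
 * are naturally aligned, a chunk's buddy differs from it only in bit 'order'
 * of its page number.  shadow_alloc() repeatedly halves a larger free chunk
 * (keeping the lower half, returning the upper halves to the free lists) and
 * shadow_free() uses the same bit test to decide whether to merge with its
 * predecessor or successor.  Minimal sketch, hypothetical demo_* name: */
static unsigned long demo_buddy_of(unsigned long pfn, unsigned int order)
{
    return pfn ^ (1UL << order);   /* e.g. order 2: 0x1234 <-> 0x1230 */
}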
667 /* Figure out the least acceptable quantity of shadow memory.
668 * The minimum memory requirement for always being able to free up a
669 * chunk of memory is very small -- only three max-order chunks per
670 * vcpu to hold the top level shadows and pages with Xen mappings in them.
671 *
672 * But for a guest to be guaranteed to successfully execute a single
673 * instruction, we must be able to map a large number (about thirty) of VAs
674 * at the same time, which means that to guarantee progress, we must
675 * allow for more than ninety allocated pages per vcpu. We round that
676 * up to 128 pages, or half a megabyte per vcpu. */
677 unsigned int shadow_min_acceptable_pages(struct domain *d)
678 {
679 u32 vcpu_count = 0;
680 struct vcpu *v;
682 for_each_vcpu(d, v)
683 vcpu_count++;
685 return (vcpu_count * 128);
686 }
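/* Editor's note: NOT part of the original file.  With 4K pages, the figure
 * above is 128 * 4KB = 512KB per vcpu; e.g. a 4-vcpu domain has a floor of
 * 512 shadow pages (2MB) before the p2m allowance is added.  Minimal sketch,
 * assuming 4K pages, with a hypothetical demo_* name: */
static unsigned long demo_min_shadow_bytes(unsigned int vcpus)
{
    return (unsigned long)vcpus * 128 * 4096;
}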
688 /* Figure out the order of allocation needed for a given shadow type */
689 static inline u32
690 shadow_order(unsigned int shadow_type)
691 {
692 #if CONFIG_PAGING_LEVELS > 2
693 static const u32 type_to_order[16] = {
694 0, /* SH_type_none */
695 1, /* SH_type_l1_32_shadow */
696 1, /* SH_type_fl1_32_shadow */
697 2, /* SH_type_l2_32_shadow */
698 0, /* SH_type_l1_pae_shadow */
699 0, /* SH_type_fl1_pae_shadow */
700 0, /* SH_type_l2_pae_shadow */
701 0, /* SH_type_l2h_pae_shadow */
702 0, /* SH_type_l1_64_shadow */
703 0, /* SH_type_fl1_64_shadow */
704 0, /* SH_type_l2_64_shadow */
705 0, /* SH_type_l3_64_shadow */
706 0, /* SH_type_l4_64_shadow */
707 2, /* SH_type_p2m_table */
708 0 /* SH_type_monitor_table */
709 };
710 ASSERT(shadow_type < 16);
711 return type_to_order[shadow_type];
712 #else /* 32-bit Xen only ever shadows 32-bit guests on 32-bit shadows. */
713 return 0;
714 #endif
715 }
718 /* Do we have a free chunk of at least this order? */
719 static inline int chunk_is_available(struct domain *d, int order)
720 {
721 int i;
723 for ( i = order; i <= SHADOW_MAX_ORDER; i++ )
724 if ( !list_empty(&d->arch.shadow.freelists[i]) )
725 return 1;
726 return 0;
727 }
729 /* Dispatcher function: call the per-mode function that will unhook the
730 * non-Xen mappings in this top-level shadow mfn */
731 void shadow_unhook_mappings(struct vcpu *v, mfn_t smfn)
732 {
733 struct shadow_page_info *sp = mfn_to_shadow_page(smfn);
734 switch ( sp->type )
735 {
736 case SH_type_l2_32_shadow:
737 #if CONFIG_PAGING_LEVELS == 2
738 SHADOW_INTERNAL_NAME(sh_unhook_32b_mappings,2,2)(v,smfn);
739 #else
740 SHADOW_INTERNAL_NAME(sh_unhook_32b_mappings,3,2)(v,smfn);
741 #endif
742 break;
743 #if CONFIG_PAGING_LEVELS >= 3
744 case SH_type_l2_pae_shadow:
745 case SH_type_l2h_pae_shadow:
746 SHADOW_INTERNAL_NAME(sh_unhook_pae_mappings,3,3)(v,smfn);
747 break;
748 #endif
749 #if CONFIG_PAGING_LEVELS >= 4
750 case SH_type_l4_64_shadow:
751 SHADOW_INTERNAL_NAME(sh_unhook_64b_mappings,4,4)(v,smfn);
752 break;
753 #endif
754 default:
755 SHADOW_PRINTK("top-level shadow has bad type %08x\n", sp->type);
756 BUG();
757 }
758 }
761 /* Make sure there is at least one chunk of the required order available
762 * in the shadow page pool. This must be called before any calls to
763 * shadow_alloc(). Since this will free existing shadows to make room,
764 * it must be called early enough to avoid freeing shadows that the
765 * caller is currently working on. */
766 void shadow_prealloc(struct domain *d, unsigned int order)
767 {
768 /* Need a vcpu for calling unpins; for now, since we don't have
769 * per-vcpu shadows, any will do */
770 struct vcpu *v, *v2;
771 struct list_head *l, *t;
772 struct shadow_page_info *sp;
773 cpumask_t flushmask = CPU_MASK_NONE;
774 mfn_t smfn;
775 int i;
777 if ( chunk_is_available(d, order) ) return;
779 v = current;
780 if ( v->domain != d )
781 v = d->vcpu[0];
782 ASSERT(v != NULL);
784 /* Stage one: walk the list of pinned pages, unpinning them */
785 perfc_incrc(shadow_prealloc_1);
786 list_for_each_backwards_safe(l, t, &d->arch.shadow.pinned_shadows)
787 {
788 sp = list_entry(l, struct shadow_page_info, list);
789 smfn = shadow_page_to_mfn(sp);
791 /* Unpin this top-level shadow */
792 sh_unpin(v, smfn);
794 /* See if that freed up a chunk of appropriate size */
795 if ( chunk_is_available(d, order) ) return;
796 }
798 /* Stage two: all shadow pages are in use in hierarchies that are
799 * loaded in cr3 on some vcpu. Walk them, unhooking the non-Xen
800 * mappings. */
801 perfc_incrc(shadow_prealloc_2);
803 for_each_vcpu(d, v2)
804 for ( i = 0 ; i < 4 ; i++ )
805 {
806 if ( !pagetable_is_null(v2->arch.shadow_table[i]) )
807 {
808 shadow_unhook_mappings(v,
809 pagetable_get_mfn(v2->arch.shadow_table[i]));
810 cpus_or(flushmask, v2->vcpu_dirty_cpumask, flushmask);
812 /* See if that freed up a chunk of appropriate size */
813 if ( chunk_is_available(d, order) )
814 {
815 flush_tlb_mask(flushmask);
816 return;
817 }
818 }
819 }
821 /* Nothing more we can do: all remaining shadows are of pages that
822 * hold Xen mappings for some vcpu. This can never happen. */
823 SHADOW_PRINTK("Can't pre-allocate %i shadow pages!\n"
824 " shadow pages total = %u, free = %u, p2m=%u\n",
825 1 << order,
826 d->arch.shadow.total_pages,
827 d->arch.shadow.free_pages,
828 d->arch.shadow.p2m_pages);
829 BUG();
830 }
832 /* Deliberately free all the memory we can: this will tear down all of
833 * this domain's shadows */
834 static void shadow_blow_tables(struct domain *d)
835 {
836 struct list_head *l, *t;
837 struct shadow_page_info *sp;
838 struct vcpu *v = d->vcpu[0];
839 mfn_t smfn;
840 int i;
842 /* Pass one: unpin all pinned pages */
843 list_for_each_backwards_safe(l,t, &d->arch.shadow.pinned_shadows)
844 {
845 sp = list_entry(l, struct shadow_page_info, list);
846 smfn = shadow_page_to_mfn(sp);
847 sh_unpin(v, smfn);
848 }
850 /* Second pass: unhook entries of in-use shadows */
851 for_each_vcpu(d, v)
852 for ( i = 0 ; i < 4 ; i++ )
853 if ( !pagetable_is_null(v->arch.shadow_table[i]) )
854 shadow_unhook_mappings(v,
855 pagetable_get_mfn(v->arch.shadow_table[i]));
857 /* Make sure everyone sees the unshadowings */
858 flush_tlb_mask(d->domain_dirty_cpumask);
859 }
862 #ifndef NDEBUG
863 /* Blow all shadows of all shadowed domains: this can be used to cause the
864 * guest's pagetables to be re-shadowed if we suspect that the shadows
865 * have somehow got out of sync */
866 static void shadow_blow_all_tables(unsigned char c)
867 {
868 struct domain *d;
869 printk("'%c' pressed -> blowing all shadow tables\n", c);
870 for_each_domain(d)
871 if ( shadow_mode_enabled(d) && d->vcpu[0] != NULL )
872 {
873 shadow_lock(d);
874 shadow_blow_tables(d);
875 shadow_unlock(d);
876 }
877 }
879 /* Register this function in the Xen console keypress table */
880 static __init int shadow_blow_tables_keyhandler_init(void)
881 {
882 register_keyhandler('S', shadow_blow_all_tables,"reset shadow pagetables");
883 return 0;
884 }
885 __initcall(shadow_blow_tables_keyhandler_init);
886 #endif /* !NDEBUG */
888 /* Allocate another shadow's worth of (contiguous, aligned) pages,
889 * and fill in the type and backpointer fields of their page_infos.
890 * Never fails to allocate. */
891 mfn_t shadow_alloc(struct domain *d,
892 u32 shadow_type,
893 unsigned long backpointer)
894 {
895 struct shadow_page_info *sp = NULL;
896 unsigned int order = shadow_order(shadow_type);
897 cpumask_t mask;
898 void *p;
899 int i;
901 ASSERT(shadow_locked_by_me(d));
902 ASSERT(order <= SHADOW_MAX_ORDER);
903 ASSERT(shadow_type != SH_type_none);
904 perfc_incrc(shadow_alloc);
906 /* Find smallest order which can satisfy the request. */
907 for ( i = order; i <= SHADOW_MAX_ORDER; i++ )
908 if ( !list_empty(&d->arch.shadow.freelists[i]) )
909 {
910 sp = list_entry(d->arch.shadow.freelists[i].next,
911 struct shadow_page_info, list);
912 list_del(&sp->list);
914 /* We may have to halve the chunk a number of times. */
915 while ( i != order )
916 {
917 i--;
918 sp->order = i;
919 list_add_tail(&sp->list, &d->arch.shadow.freelists[i]);
920 sp += 1 << i;
921 }
922 d->arch.shadow.free_pages -= 1 << order;
924 /* Init page info fields and clear the pages */
925 for ( i = 0; i < 1<<order ; i++ )
926 {
927 /* Before we overwrite the old contents of this page,
928 * we need to be sure that no TLB holds a pointer to it. */
929 mask = d->domain_dirty_cpumask;
930 tlbflush_filter(mask, sp[i].tlbflush_timestamp);
931 if ( unlikely(!cpus_empty(mask)) )
932 {
933 perfc_incrc(shadow_alloc_tlbflush);
934 flush_tlb_mask(mask);
935 }
936 /* Now safe to clear the page for reuse */
937 p = sh_map_domain_page(shadow_page_to_mfn(sp+i));
938 ASSERT(p != NULL);
939 clear_page(p);
940 sh_unmap_domain_page(p);
941 INIT_LIST_HEAD(&sp[i].list);
942 sp[i].type = shadow_type;
943 sp[i].pinned = 0;
944 sp[i].logdirty = 0;
945 sp[i].count = 0;
946 sp[i].backpointer = backpointer;
947 sp[i].next_shadow = NULL;
948 perfc_incr(shadow_alloc_count);
949 }
950 return shadow_page_to_mfn(sp);
951 }
953 /* If we get here, we failed to allocate. This should never happen.
954 * It means that we didn't call shadow_prealloc() correctly before
955 * we allocated. We can't recover by calling prealloc here, because
956 * we might free up higher-level pages that the caller is working on. */
957 SHADOW_PRINTK("Can't allocate %i shadow pages!\n", 1 << order);
958 BUG();
959 }
962 /* Return some shadow pages to the pool. */
963 void shadow_free(struct domain *d, mfn_t smfn)
964 {
965 struct shadow_page_info *sp = mfn_to_shadow_page(smfn);
966 u32 shadow_type;
967 unsigned long order;
968 unsigned long mask;
969 int i;
971 ASSERT(shadow_locked_by_me(d));
972 perfc_incrc(shadow_free);
974 shadow_type = sp->type;
975 ASSERT(shadow_type != SH_type_none);
976 ASSERT(shadow_type != SH_type_p2m_table);
977 order = shadow_order(shadow_type);
979 d->arch.shadow.free_pages += 1 << order;
981 for ( i = 0; i < 1<<order; i++ )
982 {
983 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
984 struct vcpu *v;
985 for_each_vcpu(d, v)
986 {
987 /* No longer safe to look for a writeable mapping in this shadow */
988 if ( v->arch.shadow.last_writeable_pte_smfn == mfn_x(smfn) + i )
989 v->arch.shadow.last_writeable_pte_smfn = 0;
990 }
991 #endif
992 /* Strip out the type: this is now a free shadow page */
993 sp[i].type = 0;
994 /* Remember the TLB timestamp so we will know whether to flush
995 * TLBs when we reuse the page. Because the destructors leave the
996 * contents of the pages in place, we can delay TLB flushes until
997 * just before the allocator hands the page out again. */
998 sp[i].tlbflush_timestamp = tlbflush_current_time();
999 perfc_decr(shadow_alloc_count);
1000 }
1002 /* Merge chunks as far as possible. */
1003 while ( order < SHADOW_MAX_ORDER )
1004 {
1005 mask = 1 << order;
1006 if ( (mfn_x(shadow_page_to_mfn(sp)) & mask) ) {
1007 /* Merge with predecessor block? */
1008 if ( ((sp-mask)->type != PGT_none) || ((sp-mask)->order != order) )
1009 break;
1010 list_del(&(sp-mask)->list);
1011 sp -= mask;
1012 } else {
1013 /* Merge with successor block? */
1014 if ( ((sp+mask)->type != PGT_none) || ((sp+mask)->order != order) )
1015 break;
1016 list_del(&(sp+mask)->list);
1017 }
1018 order++;
1019 }
1021 sp->order = order;
1022 list_add_tail(&sp->list, &d->arch.shadow.freelists[order]);
1023 }
1025 /* Divert some memory from the pool to be used by the p2m mapping.
1026 * This action is irreversible: the p2m mapping only ever grows.
1027 * That's OK because the p2m table only exists for translated domains,
1028 * and those domains can't ever turn off shadow mode.
1029 * Also, we only ever allocate a max-order chunk, so as to preserve
1030 * the invariant that shadow_prealloc() always works.
1031 * Returns 0 iff it can't get a chunk (the caller should then
1032 * free up some pages in domheap and call sh_set_allocation);
1033 * returns non-zero on success.
1034 */
1035 static int
1036 shadow_alloc_p2m_pages(struct domain *d)
1038 struct page_info *pg;
1039 u32 i;
1040 ASSERT(shadow_locked_by_me(d));
1042 if ( d->arch.shadow.total_pages
1043 < (shadow_min_acceptable_pages(d) + (1<<SHADOW_MAX_ORDER)) )
1044 return 0; /* Not enough shadow memory: need to increase it first */
1046 pg = mfn_to_page(shadow_alloc(d, SH_type_p2m_table, 0));
1047 d->arch.shadow.p2m_pages += (1<<SHADOW_MAX_ORDER);
1048 d->arch.shadow.total_pages -= (1<<SHADOW_MAX_ORDER);
1049 for (i = 0; i < (1<<SHADOW_MAX_ORDER); i++)
1051 /* Unlike shadow pages, mark p2m pages as owned by the domain.
1052 * Marking the domain as the owner would normally allow the guest to
1053 * create mappings of these pages, but these p2m pages will never be
1054 * in the domain's guest-physical address space, and so that is not
1055 * believed to be a concern.
1056 */
1057 page_set_owner(&pg[i], d);
1058 list_add_tail(&pg[i].list, &d->arch.shadow.p2m_freelist);
1060 return 1;
1063 // Returns 0 if no memory is available...
1064 mfn_t
1065 shadow_alloc_p2m_page(struct domain *d)
1067 struct list_head *entry;
1068 struct page_info *pg;
1069 mfn_t mfn;
1070 void *p;
1072 if ( list_empty(&d->arch.shadow.p2m_freelist) &&
1073 !shadow_alloc_p2m_pages(d) )
1074 return _mfn(0);
1075 entry = d->arch.shadow.p2m_freelist.next;
1076 list_del(entry);
1077 list_add_tail(entry, &d->arch.shadow.p2m_inuse);
1078 pg = list_entry(entry, struct page_info, list);
1079 pg->count_info = 1;
1080 mfn = page_to_mfn(pg);
1081 p = sh_map_domain_page(mfn);
1082 clear_page(p);
1083 sh_unmap_domain_page(p);
1085 return mfn;
1088 #if CONFIG_PAGING_LEVELS == 3
1089 static void p2m_install_entry_in_monitors(struct domain *d,
1090 l3_pgentry_t *l3e)
1091 /* Special case, only used for external-mode domains on PAE hosts:
1092 * update the mapping of the p2m table. Once again, this is trivial in
1093 * other paging modes (one top-level entry points to the top-level p2m,
1094 * no maintenance needed), but PAE makes life difficult by needing a
1095 * copy of the eight l3es of the p2m table in eight l2h slots in the
1096 * monitor table. This function makes fresh copies when a p2m l3e
1097 * changes. */
1099 l2_pgentry_t *ml2e;
1100 struct vcpu *v;
1101 unsigned int index;
1103 index = ((unsigned long)l3e & ~PAGE_MASK) / sizeof(l3_pgentry_t);
1104 ASSERT(index < MACHPHYS_MBYTES>>1);
1106 for_each_vcpu(d, v)
1108 if ( pagetable_get_pfn(v->arch.monitor_table) == 0 )
1109 continue;
1110 ASSERT(shadow_mode_external(v->domain));
1112 SHADOW_DEBUG(P2M, "d=%u v=%u index=%u mfn=%#lx\n",
1113 d->domain_id, v->vcpu_id, index, l3e_get_pfn(*l3e));
1115 if ( v == current ) /* OK to use linear map of monitor_table */
1116 ml2e = __linear_l2_table + l2_linear_offset(RO_MPT_VIRT_START);
1117 else
1119 l3_pgentry_t *ml3e;
1120 ml3e = sh_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
1121 ASSERT(l3e_get_flags(ml3e[3]) & _PAGE_PRESENT);
1122 ml2e = sh_map_domain_page(_mfn(l3e_get_pfn(ml3e[3])));
1123 ml2e += l2_table_offset(RO_MPT_VIRT_START);
1124 sh_unmap_domain_page(ml3e);
1126 ml2e[index] = l2e_from_pfn(l3e_get_pfn(*l3e), __PAGE_HYPERVISOR);
1127 if ( v != current )
1128 sh_unmap_domain_page(ml2e);
1131 #endif
1133 // Find the next level's P2M entry, checking for out-of-range gfn's...
1134 // Returns NULL on error.
1135 //
1136 static l1_pgentry_t *
1137 p2m_find_entry(void *table, unsigned long *gfn_remainder,
1138 unsigned long gfn, u32 shift, u32 max)
1139 {
1140 u32 index;
1142 index = *gfn_remainder >> shift;
1143 if ( index >= max )
1144 {
1145 SHADOW_DEBUG(P2M, "gfn=0x%lx out of range "
1146 "(gfn_remainder=0x%lx shift=%d index=0x%x max=0x%x)\n",
1147 gfn, *gfn_remainder, shift, index, max);
1148 return NULL;
1149 }
1150 *gfn_remainder &= (1 << shift) - 1;
1151 return (l1_pgentry_t *)table + index;
1152 }
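/* Editor's note: NOT part of the original file.  A standalone restatement of
 * the shift/mask arithmetic used by p2m_find_entry() and p2m_next_level():
 * the callers pass LX_PAGETABLE_SHIFT - PAGE_SHIFT, i.e. 27/18/9/0 on the
 * 4-level (64-bit) layout, so each level consumes 9 bits of the gfn.  The
 * range check against each level's entry count is elided here; demo_* names
 * are hypothetical. */
static void demo_p2m_indices(unsigned long gfn, unsigned int index[4])
{
    static const unsigned int shift[4] = { 27, 18, 9, 0 };  /* L4..L1 */
    unsigned long remainder = gfn;
    int i;

    for ( i = 0; i < 4; i++ )
    {
        index[i] = remainder >> shift[i];       /* entry at this level */
        remainder &= (1UL << shift[i]) - 1;     /* bits left for lower levels */
    }
}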
1154 // Walk one level of the P2M table, allocating a new table if required.
1155 // Returns 0 on error.
1156 //
1157 static int
1158 p2m_next_level(struct domain *d, mfn_t *table_mfn, void **table,
1159 unsigned long *gfn_remainder, unsigned long gfn, u32 shift,
1160 u32 max, unsigned long type)
1162 l1_pgentry_t *p2m_entry;
1163 void *next;
1165 if ( !(p2m_entry = p2m_find_entry(*table, gfn_remainder, gfn,
1166 shift, max)) )
1167 return 0;
1169 if ( !(l1e_get_flags(*p2m_entry) & _PAGE_PRESENT) )
1171 mfn_t mfn = shadow_alloc_p2m_page(d);
1172 if ( mfn_x(mfn) == 0 )
1173 return 0;
1174 *p2m_entry = l1e_from_pfn(mfn_x(mfn), __PAGE_HYPERVISOR|_PAGE_USER);
1175 mfn_to_page(mfn)->u.inuse.type_info = type | 1 | PGT_validated;
1176 mfn_to_page(mfn)->count_info = 1;
1177 #if CONFIG_PAGING_LEVELS == 3
1178 if (type == PGT_l2_page_table)
1180 struct vcpu *v;
1181 /* We have written to the p2m l3: need to sync the per-vcpu
1182 * copies of it in the monitor tables */
1183 p2m_install_entry_in_monitors(d, (l3_pgentry_t *)p2m_entry);
1184 /* Also, any vcpus running on shadows of the p2m need to
1185 * reload their CR3s so the change propagates to the shadow */
1186 ASSERT(shadow_locked_by_me(d));
1187 for_each_vcpu(d, v)
1189 if ( pagetable_get_pfn(v->arch.guest_table)
1190 == pagetable_get_pfn(d->arch.phys_table)
1191 && v->arch.shadow.mode != NULL )
1192 v->arch.shadow.mode->update_cr3(v, 0);
1195 #endif
1196 /* The P2M can be shadowed: keep the shadows synced */
1197 if ( d->vcpu[0] != NULL )
1198 (void)sh_validate_guest_entry(d->vcpu[0], *table_mfn,
1199 p2m_entry, sizeof *p2m_entry);
1201 *table_mfn = _mfn(l1e_get_pfn(*p2m_entry));
1202 next = sh_map_domain_page(*table_mfn);
1203 sh_unmap_domain_page(*table);
1204 *table = next;
1206 return 1;
1209 // Returns 0 on error (out of memory)
1210 int
1211 shadow_set_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn)
1213 // XXX -- this might be able to be faster iff current->domain == d
1214 mfn_t table_mfn = pagetable_get_mfn(d->arch.phys_table);
1215 void *table = sh_map_domain_page(table_mfn);
1216 unsigned long gfn_remainder = gfn;
1217 l1_pgentry_t *p2m_entry;
1218 int rv=0;
1220 #if CONFIG_PAGING_LEVELS >= 4
1221 if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn,
1222 L4_PAGETABLE_SHIFT - PAGE_SHIFT,
1223 L4_PAGETABLE_ENTRIES, PGT_l3_page_table) )
1224 goto out;
1225 #endif
1226 #if CONFIG_PAGING_LEVELS >= 3
1227 // When using PAE Xen, we only allow 33 bits of pseudo-physical
1228 // address in translated guests (i.e. 8 GBytes). This restriction
1229 // comes from wanting to map the P2M table into the 16MB RO_MPT hole
1230 // in Xen's address space for translated PV guests.
1231 //
1232 if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn,
1233 L3_PAGETABLE_SHIFT - PAGE_SHIFT,
1234 (CONFIG_PAGING_LEVELS == 3
1235 ? 8
1236 : L3_PAGETABLE_ENTRIES),
1237 PGT_l2_page_table) )
1238 goto out;
1239 #endif
1240 if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn,
1241 L2_PAGETABLE_SHIFT - PAGE_SHIFT,
1242 L2_PAGETABLE_ENTRIES, PGT_l1_page_table) )
1243 goto out;
1245 p2m_entry = p2m_find_entry(table, &gfn_remainder, gfn,
1246 0, L1_PAGETABLE_ENTRIES);
1247 ASSERT(p2m_entry);
1248 if ( mfn_valid(mfn) )
1249 *p2m_entry = l1e_from_pfn(mfn_x(mfn), __PAGE_HYPERVISOR|_PAGE_USER);
1250 else
1251 *p2m_entry = l1e_empty();
1253 /* Track the highest gfn for which we have ever had a valid mapping */
1254 if ( mfn_valid(mfn) && (gfn > d->arch.max_mapped_pfn) )
1255 d->arch.max_mapped_pfn = gfn;
1257 /* The P2M can be shadowed: keep the shadows synced */
1258 if ( d->vcpu[0] != NULL )
1259 (void)sh_validate_guest_entry(d->vcpu[0], table_mfn,
1260 p2m_entry, sizeof(*p2m_entry));
1262 /* Success */
1263 rv = 1;
1265 out:
1266 sh_unmap_domain_page(table);
1267 return rv;
1270 // Allocate a new p2m table for a domain.
1271 //
1272 // The structure of the p2m table is that of a pagetable for xen (i.e. it is
1273 // controlled by CONFIG_PAGING_LEVELS).
1274 //
1275 // Returns 0 if p2m table could not be initialized
1276 //
1277 static int
1278 shadow_alloc_p2m_table(struct domain *d)
1280 mfn_t p2m_top, mfn;
1281 struct list_head *entry;
1282 struct page_info *page;
1283 unsigned int page_count = 0;
1284 unsigned long gfn;
1286 SHADOW_PRINTK("allocating p2m table\n");
1287 ASSERT(pagetable_get_pfn(d->arch.phys_table) == 0);
1289 p2m_top = shadow_alloc_p2m_page(d);
1290 mfn_to_page(p2m_top)->count_info = 1;
1291 mfn_to_page(p2m_top)->u.inuse.type_info =
1292 #if CONFIG_PAGING_LEVELS == 4
1293 PGT_l4_page_table
1294 #elif CONFIG_PAGING_LEVELS == 3
1295 PGT_l3_page_table
1296 #elif CONFIG_PAGING_LEVELS == 2
1297 PGT_l2_page_table
1298 #endif
1299 | 1 | PGT_validated;
1301 if ( mfn_x(p2m_top) == 0 )
1302 return 0;
1304 d->arch.phys_table = pagetable_from_mfn(p2m_top);
1306 SHADOW_PRINTK("populating p2m table\n");
1308 /* Initialise physmap tables for slot zero. Other code assumes this. */
1309 gfn = 0;
1310 mfn = _mfn(INVALID_MFN);
1311 if ( !shadow_set_p2m_entry(d, gfn, mfn) )
1312 goto error;
1314 /* Build a p2m map that matches the m2p entries for this domain's
1315 * allocated pages. Skip any pages that have an explicitly invalid
1316 * or obviously bogus m2p entry. */
1317 for ( entry = d->page_list.next;
1318 entry != &d->page_list;
1319 entry = entry->next )
1321 page = list_entry(entry, struct page_info, list);
1322 mfn = page_to_mfn(page);
1323 gfn = get_gpfn_from_mfn(mfn_x(mfn));
1324 page_count++;
1325 if (
1326 #ifdef __x86_64__
1327 (gfn != 0x5555555555555555L)
1328 #else
1329 (gfn != 0x55555555L)
1330 #endif
1331 && gfn != INVALID_M2P_ENTRY
1332 && (gfn <
1333 (RO_MPT_VIRT_END - RO_MPT_VIRT_START) / sizeof (l1_pgentry_t))
1334 && !shadow_set_p2m_entry(d, gfn, mfn) )
1335 goto error;
1338 SHADOW_PRINTK("p2m table initialised (%u pages)\n", page_count);
1339 return 1;
1341 error:
1342 SHADOW_PRINTK("failed to initialize p2m table, gfn=%05lx, mfn=%"
1343 SH_PRI_mfn "\n", gfn, mfn_x(mfn));
1344 return 0;
1347 mfn_t
1348 sh_gfn_to_mfn_foreign(struct domain *d, unsigned long gpfn)
1349 /* Read another domain's p2m entries */
1351 mfn_t mfn;
1352 paddr_t addr = ((paddr_t)gpfn) << PAGE_SHIFT;
1353 l2_pgentry_t *l2e;
1354 l1_pgentry_t *l1e;
1356 ASSERT(shadow_mode_translate(d));
1357 mfn = pagetable_get_mfn(d->arch.phys_table);
1360 if ( gpfn > d->arch.max_mapped_pfn )
1361 /* This pfn is higher than the highest the p2m map currently holds */
1362 return _mfn(INVALID_MFN);
1364 #if CONFIG_PAGING_LEVELS >= 4
1366 l4_pgentry_t *l4e = sh_map_domain_page(mfn);
1367 l4e += l4_table_offset(addr);
1368 if ( (l4e_get_flags(*l4e) & _PAGE_PRESENT) == 0 )
1370 sh_unmap_domain_page(l4e);
1371 return _mfn(INVALID_MFN);
1373 mfn = _mfn(l4e_get_pfn(*l4e));
1374 sh_unmap_domain_page(l4e);
1376 #endif
1377 #if CONFIG_PAGING_LEVELS >= 3
1379 l3_pgentry_t *l3e = sh_map_domain_page(mfn);
1380 #if CONFIG_PAGING_LEVELS == 3
1381 /* On PAE hosts the p2m has eight l3 entries, not four (see
1382 * shadow_set_p2m_entry()) so we can't use l3_table_offset.
1383 * Instead, just count the number of l3es from zero. It's safe
1384 * to do this because we already checked that the gfn is within
1385 * the bounds of the p2m. */
1386 l3e += (addr >> L3_PAGETABLE_SHIFT);
1387 #else
1388 l3e += l3_table_offset(addr);
1389 #endif
1390 if ( (l3e_get_flags(*l3e) & _PAGE_PRESENT) == 0 )
1392 sh_unmap_domain_page(l3e);
1393 return _mfn(INVALID_MFN);
1395 mfn = _mfn(l3e_get_pfn(*l3e));
1396 sh_unmap_domain_page(l3e);
1398 #endif
1400 l2e = sh_map_domain_page(mfn);
1401 l2e += l2_table_offset(addr);
1402 if ( (l2e_get_flags(*l2e) & _PAGE_PRESENT) == 0 )
1404 sh_unmap_domain_page(l2e);
1405 return _mfn(INVALID_MFN);
1407 mfn = _mfn(l2e_get_pfn(*l2e));
1408 sh_unmap_domain_page(l2e);
1410 l1e = sh_map_domain_page(mfn);
1411 l1e += l1_table_offset(addr);
1412 if ( (l1e_get_flags(*l1e) & _PAGE_PRESENT) == 0 )
1414 sh_unmap_domain_page(l1e);
1415 return _mfn(INVALID_MFN);
1417 mfn = _mfn(l1e_get_pfn(*l1e));
1418 sh_unmap_domain_page(l1e);
1420 return mfn;
1423 unsigned long
1424 shadow_gfn_to_mfn_foreign(unsigned long gpfn)
1426 return mfn_x(sh_gfn_to_mfn_foreign(current->domain, gpfn));
1430 static void shadow_p2m_teardown(struct domain *d)
1431 /* Return all the p2m pages to Xen.
1432 * We know we don't have any extra mappings to these pages */
1434 struct list_head *entry, *n;
1435 struct page_info *pg;
1437 d->arch.phys_table = pagetable_null();
1439 list_for_each_safe(entry, n, &d->arch.shadow.p2m_inuse)
1441 pg = list_entry(entry, struct page_info, list);
1442 list_del(entry);
1443 /* Should have just the one ref we gave it in alloc_p2m_page() */
1444 if ( (pg->count_info & PGC_count_mask) != 1 )
1446 SHADOW_PRINTK("Odd p2m page count c=%#x t=%"PRtype_info"\n",
1447 pg->count_info, pg->u.inuse.type_info);
1449 ASSERT(page_get_owner(pg) == d);
1450 /* Free should not decrement domain's total allocation, since
1451 * these pages were allocated without an owner. */
1452 page_set_owner(pg, NULL);
1453 free_domheap_pages(pg, 0);
1454 d->arch.shadow.p2m_pages--;
1455 perfc_decr(shadow_alloc_count);
1457 list_for_each_safe(entry, n, &d->arch.shadow.p2m_freelist)
1459 list_del(entry);
1460 pg = list_entry(entry, struct page_info, list);
1461 ASSERT(page_get_owner(pg) == d);
1462 /* Free should not decrement domain's total allocation. */
1463 page_set_owner(pg, NULL);
1464 free_domheap_pages(pg, 0);
1465 d->arch.shadow.p2m_pages--;
1466 perfc_decr(shadow_alloc_count);
1468 ASSERT(d->arch.shadow.p2m_pages == 0);
1471 /* Set the pool of shadow pages to the required number of pages.
1472 * Input will be rounded up to at least shadow_min_acceptable_pages(),
1473 * plus space for the p2m table.
1474 * Returns 0 for success, non-zero for failure. */
1475 static unsigned int sh_set_allocation(struct domain *d,
1476 unsigned int pages,
1477 int *preempted)
1479 struct shadow_page_info *sp;
1480 unsigned int lower_bound;
1481 int j;
1483 ASSERT(shadow_locked_by_me(d));
1485 /* Don't allocate less than the minimum acceptable, plus one page per
1486 * megabyte of RAM (for the p2m table) */
1487 lower_bound = shadow_min_acceptable_pages(d) + (d->tot_pages / 256);
1488 if ( pages > 0 && pages < lower_bound )
1489 pages = lower_bound;
1490 /* Round up to largest block size */
1491 pages = (pages + ((1<<SHADOW_MAX_ORDER)-1)) & ~((1<<SHADOW_MAX_ORDER)-1);
1493 SHADOW_PRINTK("current %i target %i\n",
1494 d->arch.shadow.total_pages, pages);
1496 while ( d->arch.shadow.total_pages != pages )
1498 if ( d->arch.shadow.total_pages < pages )
1500 /* Need to allocate more memory from domheap */
1501 sp = (struct shadow_page_info *)
1502 alloc_domheap_pages(NULL, SHADOW_MAX_ORDER, 0);
1503 if ( sp == NULL )
1505 SHADOW_PRINTK("failed to allocate shadow pages.\n");
1506 return -ENOMEM;
1508 d->arch.shadow.free_pages += 1<<SHADOW_MAX_ORDER;
1509 d->arch.shadow.total_pages += 1<<SHADOW_MAX_ORDER;
1510 for ( j = 0; j < 1<<SHADOW_MAX_ORDER; j++ )
1512 sp[j].type = 0;
1513 sp[j].pinned = 0;
1514 sp[j].logdirty = 0;
1515 sp[j].count = 0;
1516 sp[j].mbz = 0;
1517 sp[j].tlbflush_timestamp = 0; /* Not in any TLB */
1519 sp->order = SHADOW_MAX_ORDER;
1520 list_add_tail(&sp->list,
1521 &d->arch.shadow.freelists[SHADOW_MAX_ORDER]);
1523 else if ( d->arch.shadow.total_pages > pages )
1525 /* Need to return memory to domheap */
1526 shadow_prealloc(d, SHADOW_MAX_ORDER);
1527 ASSERT(!list_empty(&d->arch.shadow.freelists[SHADOW_MAX_ORDER]));
1528 sp = list_entry(d->arch.shadow.freelists[SHADOW_MAX_ORDER].next,
1529 struct shadow_page_info, list);
1530 list_del(&sp->list);
1531 d->arch.shadow.free_pages -= 1<<SHADOW_MAX_ORDER;
1532 d->arch.shadow.total_pages -= 1<<SHADOW_MAX_ORDER;
1533 free_domheap_pages((struct page_info *)sp, SHADOW_MAX_ORDER);
1536 /* Check to see if we need to yield and try again */
1537 if ( preempted && hypercall_preempt_check() )
1539 *preempted = 1;
1540 return 0;
1544 return 0;
1547 /* Return the size of the shadow pool, rounded up to the nearest MB */
1548 static unsigned int shadow_get_allocation(struct domain *d)
1549 {
1550 unsigned int pg = d->arch.shadow.total_pages;
1551 return ((pg >> (20 - PAGE_SHIFT))
1552 + ((pg & ((1 << (20 - PAGE_SHIFT)) - 1)) ? 1 : 0));
1553 }
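/* Editor's note: NOT part of the original file.  With 4K pages,
 * 20 - PAGE_SHIFT == 8, so the expression above is simply a ceiling division
 * by 256 pages-per-MB: 256 pages -> 1MB, 257 pages -> 2MB.  Minimal sketch,
 * assuming 4K pages, with a hypothetical demo_* name: */
static unsigned int demo_pages_to_mb(unsigned int pages)
{
    return (pages + 255) / 256;
}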
1555 /**************************************************************************/
1556 /* Hash table for storing the guest->shadow mappings.
1557 * The table itself is an array of pointers to shadows; the shadows are then
1558 * threaded on a singly-linked list of shadows with the same hash value */
1560 #define SHADOW_HASH_BUCKETS 251
1561 /* Other possibly useful primes are 509, 1021, 2039, 4093, 8191, 16381 */
1563 /* Hash function that takes a gfn or mfn, plus another byte of type info */
1564 typedef u32 key_t;
1565 static inline key_t sh_hash(unsigned long n, unsigned int t)
1566 {
1567 unsigned char *p = (unsigned char *)&n;
1568 key_t k = t;
1569 int i;
1570 for ( i = 0; i < sizeof(n) ; i++ ) k = (u32)p[i] + (k<<6) + (k<<16) - k;
1571 return k % SHADOW_HASH_BUCKETS;
1572 }
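/* Editor's note: NOT part of the original file.  The same mixing step as
 * sh_hash() above, written against standard types so it can be exercised in
 * isolation: it is the classic "sdbm" byte hash, seeded with the shadow type
 * and reduced modulo the bucket count.  The demo_* name is hypothetical. */
#include <stdint.h>
static uint32_t demo_sh_hash(unsigned long n, unsigned int t,
                             unsigned int buckets)
{
    const unsigned char *p = (const unsigned char *)&n;
    uint32_t k = t;
    unsigned int i;

    for ( i = 0; i < sizeof(n); i++ )
        k = (uint32_t)p[i] + (k << 6) + (k << 16) - k;
    return k % buckets;
}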
1574 #if SHADOW_AUDIT & (SHADOW_AUDIT_HASH|SHADOW_AUDIT_HASH_FULL)
1576 /* Before we get to the mechanism, define a pair of audit functions
1577 * that sanity-check the contents of the hash table. */
1578 static void sh_hash_audit_bucket(struct domain *d, int bucket)
1579 /* Audit one bucket of the hash table */
1581 struct shadow_page_info *sp, *x;
1583 if ( !(SHADOW_AUDIT_ENABLE) )
1584 return;
1586 sp = d->arch.shadow.hash_table[bucket];
1587 while ( sp )
1589 /* Not a shadow? */
1590 BUG_ON( sp->mbz != 0 );
1591 /* Bogus type? */
1592 BUG_ON( sp->type == 0 );
1593 BUG_ON( sp->type > SH_type_max_shadow );
1594 /* Wrong bucket? */
1595 BUG_ON( sh_hash(sp->backpointer, sp->type) != bucket );
1596 /* Duplicate entry? */
1597 for ( x = sp->next_shadow; x; x = x->next_shadow )
1598 BUG_ON( x->backpointer == sp->backpointer && x->type == sp->type );
1599 /* Follow the backpointer to the guest pagetable */
1600 if ( sp->type != SH_type_fl1_32_shadow
1601 && sp->type != SH_type_fl1_pae_shadow
1602 && sp->type != SH_type_fl1_64_shadow )
1604 struct page_info *gpg = mfn_to_page(_mfn(sp->backpointer));
1605 /* Bad shadow flags on guest page? */
1606 BUG_ON( !(gpg->shadow_flags & (1<<sp->type)) );
1607 /* Bad type count on guest page? */
1608 if ( (gpg->u.inuse.type_info & PGT_type_mask) == PGT_writable_page
1609 && (gpg->u.inuse.type_info & PGT_count_mask) != 0 )
1611 SHADOW_ERROR("MFN %#lx shadowed (by %#"SH_PRI_mfn")"
1612 " but has typecount %#lx\n",
1613 sp->backpointer, mfn_x(shadow_page_to_mfn(sp)),
1614 gpg->u.inuse.type_info);
1615 BUG();
1618 /* That entry was OK; on we go */
1619 sp = sp->next_shadow;
1623 #else
1624 #define sh_hash_audit_bucket(_d, _b) do {} while(0)
1625 #endif /* Hashtable bucket audit */
1628 #if SHADOW_AUDIT & SHADOW_AUDIT_HASH_FULL
1630 static void sh_hash_audit(struct domain *d)
1631 /* Full audit: audit every bucket in the table */
1633 int i;
1635 if ( !(SHADOW_AUDIT_ENABLE) )
1636 return;
1638 for ( i = 0; i < SHADOW_HASH_BUCKETS; i++ )
1640 sh_hash_audit_bucket(d, i);
1644 #else
1645 #define sh_hash_audit(_d) do {} while(0)
1646 #endif /* Hashtable bucket audit */
1648 /* Allocate and initialise the table itself.
1649 * Returns 0 for success, 1 for error. */
1650 static int shadow_hash_alloc(struct domain *d)
1652 struct shadow_page_info **table;
1654 ASSERT(shadow_locked_by_me(d));
1655 ASSERT(!d->arch.shadow.hash_table);
1657 table = xmalloc_array(struct shadow_page_info *, SHADOW_HASH_BUCKETS);
1658 if ( !table ) return 1;
1659 memset(table, 0,
1660 SHADOW_HASH_BUCKETS * sizeof (struct shadow_page_info *));
1661 d->arch.shadow.hash_table = table;
1662 return 0;
1665 /* Tear down the hash table and return all memory to Xen.
1666 * This function does not care whether the table is populated. */
1667 static void shadow_hash_teardown(struct domain *d)
1669 ASSERT(shadow_locked_by_me(d));
1670 ASSERT(d->arch.shadow.hash_table);
1672 xfree(d->arch.shadow.hash_table);
1673 d->arch.shadow.hash_table = NULL;
1677 mfn_t shadow_hash_lookup(struct vcpu *v, unsigned long n, unsigned int t)
1678 /* Find an entry in the hash table. Returns the MFN of the shadow,
1679 * or INVALID_MFN if it doesn't exist */
1681 struct domain *d = v->domain;
1682 struct shadow_page_info *sp, *prev;
1683 key_t key;
1685 ASSERT(shadow_locked_by_me(d));
1686 ASSERT(d->arch.shadow.hash_table);
1687 ASSERT(t);
1689 sh_hash_audit(d);
1691 perfc_incrc(shadow_hash_lookups);
1692 key = sh_hash(n, t);
1693 sh_hash_audit_bucket(d, key);
1695 sp = d->arch.shadow.hash_table[key];
1696 prev = NULL;
1697 while(sp)
1699 if ( sp->backpointer == n && sp->type == t )
1701 /* Pull-to-front if 'sp' isn't already the head item */
1702 if ( unlikely(sp != d->arch.shadow.hash_table[key]) )
1704 if ( unlikely(d->arch.shadow.hash_walking != 0) )
1705 /* Can't reorder: someone is walking the hash chains */
1706 return shadow_page_to_mfn(sp);
1707 else
1709 ASSERT(prev);
1710 /* Delete sp from the list */
1711 prev->next_shadow = sp->next_shadow;
1712 /* Re-insert it at the head of the list */
1713 sp->next_shadow = d->arch.shadow.hash_table[key];
1714 d->arch.shadow.hash_table[key] = sp;
1717 else
1719 perfc_incrc(shadow_hash_lookup_head);
1721 return shadow_page_to_mfn(sp);
1723 prev = sp;
1724 sp = sp->next_shadow;
1727 perfc_incrc(shadow_hash_lookup_miss);
1728 return _mfn(INVALID_MFN);
1729 }
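/* Editor's note: NOT part of the original file.  A generic sketch of the
 * move-to-front step used by shadow_hash_lookup() above, on a plain
 * singly-linked chain: pulling a hit to the head keeps frequently-used
 * shadows cheap to find.  The real code skips the reorder while
 * hash_foreach() is walking the chains (hash_walking != 0).  demo_* names
 * are hypothetical. */
struct demo_node { unsigned long key; struct demo_node *next; };

static struct demo_node *demo_lookup_mtf(struct demo_node **head,
                                         unsigned long key)
{
    struct demo_node *n = *head, *prev = NULL;

    while ( n != NULL )
    {
        if ( n->key == key )
        {
            if ( prev != NULL )          /* not already the head item */
            {
                prev->next = n->next;    /* unlink from its position */
                n->next = *head;         /* re-insert at the front */
                *head = n;
            }
            return n;
        }
        prev = n;
        n = n->next;
    }
    return NULL;
}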
1731 void shadow_hash_insert(struct vcpu *v, unsigned long n, unsigned int t,
1732 mfn_t smfn)
1733 /* Put a mapping (n,t)->smfn into the hash table */
1735 struct domain *d = v->domain;
1736 struct shadow_page_info *sp;
1737 key_t key;
1739 ASSERT(shadow_locked_by_me(d));
1740 ASSERT(d->arch.shadow.hash_table);
1741 ASSERT(t);
1743 sh_hash_audit(d);
1745 perfc_incrc(shadow_hash_inserts);
1746 key = sh_hash(n, t);
1747 sh_hash_audit_bucket(d, key);
1749 /* Insert this shadow at the top of the bucket */
1750 sp = mfn_to_shadow_page(smfn);
1751 sp->next_shadow = d->arch.shadow.hash_table[key];
1752 d->arch.shadow.hash_table[key] = sp;
1754 sh_hash_audit_bucket(d, key);
1757 void shadow_hash_delete(struct vcpu *v, unsigned long n, unsigned int t,
1758 mfn_t smfn)
1759 /* Excise the mapping (n,t)->smfn from the hash table */
1761 struct domain *d = v->domain;
1762 struct shadow_page_info *sp, *x;
1763 key_t key;
1765 ASSERT(shadow_locked_by_me(d));
1766 ASSERT(d->arch.shadow.hash_table);
1767 ASSERT(t);
1769 sh_hash_audit(d);
1771 perfc_incrc(shadow_hash_deletes);
1772 key = sh_hash(n, t);
1773 sh_hash_audit_bucket(d, key);
1775 sp = mfn_to_shadow_page(smfn);
1776 if ( d->arch.shadow.hash_table[key] == sp )
1777 /* Easy case: we're deleting the head item. */
1778 d->arch.shadow.hash_table[key] = sp->next_shadow;
1779 else
1781 /* Need to search for the one we want */
1782 x = d->arch.shadow.hash_table[key];
1783 while ( 1 )
1785 ASSERT(x); /* We can't have hit the end, since our target is
1786 * still in the chain somewhere... */
1787 if ( x->next_shadow == sp )
1789 x->next_shadow = sp->next_shadow;
1790 break;
1792 x = x->next_shadow;
1795 sp->next_shadow = NULL;
1797 sh_hash_audit_bucket(d, key);
1800 typedef int (*hash_callback_t)(struct vcpu *v, mfn_t smfn, mfn_t other_mfn);
1802 static void hash_foreach(struct vcpu *v,
1803 unsigned int callback_mask,
1804 hash_callback_t callbacks[],
1805 mfn_t callback_mfn)
1806 /* Walk the hash table looking at the types of the entries and
1807 * calling the appropriate callback function for each entry.
1808 * The mask determines which shadow types we call back for, and the array
1809 * of callbacks tells us which function to call.
1810 * Any callback may return non-zero to let us skip the rest of the scan.
1812 * WARNING: Callbacks MUST NOT add or remove hash entries unless they
1813 * then return non-zero to terminate the scan. */
1815 int i, done = 0;
1816 struct domain *d = v->domain;
1817 struct shadow_page_info *x;
1819 /* Say we're here, to stop hash-lookups reordering the chains */
1820 ASSERT(shadow_locked_by_me(d));
1821 ASSERT(d->arch.shadow.hash_walking == 0);
1822 d->arch.shadow.hash_walking = 1;
1824 for ( i = 0; i < SHADOW_HASH_BUCKETS; i++ )
1826 /* WARNING: This is not safe against changes to the hash table.
1827 * The callback *must* return non-zero if it has inserted or
1828 * deleted anything from the hash (lookups are OK, though). */
1829 for ( x = d->arch.shadow.hash_table[i]; x; x = x->next_shadow )
1831 if ( callback_mask & (1 << x->type) )
1833 ASSERT(x->type <= 15);
1834 ASSERT(callbacks[x->type] != NULL);
1835 done = callbacks[x->type](v, shadow_page_to_mfn(x),
1836 callback_mfn);
1837 if ( done ) break;
1840 if ( done ) break;
1842 d->arch.shadow.hash_walking = 0;
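/* A note on the callback_mask convention used with hash_foreach(): the
 * mask is a bitmap indexed by shadow type, so (1 << x->type) tests
 * membership directly.  Purely as an illustration (the exact numbers come
 * from the SH_type_* enum, whose ordering is implied by the callbacks[]
 * tables below: none=0, l1_32=1, fl1_32=2, ...), a mask selecting the six
 * l1/fl1 flavours would be (1<<1)|(1<<2)|(1<<4)|(1<<5)|(1<<8)|(1<<9),
 * i.e. 0x336. */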
1846 /**************************************************************************/
1847 /* Destroy a shadow page: simple dispatcher to call the per-type destructor
1848 * which will decrement refcounts appropriately and return memory to the
1849 * free pool. */
1851 void sh_destroy_shadow(struct vcpu *v, mfn_t smfn)
1853 struct shadow_page_info *sp = mfn_to_shadow_page(smfn);
1854 unsigned int t = sp->type;
1857 SHADOW_PRINTK("smfn=%#lx\n", mfn_x(smfn));
1859 /* Double-check, if we can, that the shadowed page belongs to this
1860 * domain (by following the back-pointer). */
1861 ASSERT(t == SH_type_fl1_32_shadow ||
1862 t == SH_type_fl1_pae_shadow ||
1863 t == SH_type_fl1_64_shadow ||
1864 t == SH_type_monitor_table ||
1865 (page_get_owner(mfn_to_page(_mfn(sp->backpointer)))
1866 == v->domain));
1868 /* The shadow type values are already small numbers, so the switch
1869 * statement below is on something the compiler will enjoy */
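/* SHADOW_INTERNAL_NAME(fn, shadow_levels, guest_levels) picks the
 * per-mode build of fn: for example (sh_destroy_l1_shadow, 3, 2) below is
 * the copy compiled for 3-level shadows of a 2-level guest, which is why
 * the 32bit cases differ depending on CONFIG_PAGING_LEVELS. */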
1870 switch ( t )
1872 #if CONFIG_PAGING_LEVELS == 2
1873 case SH_type_l1_32_shadow:
1874 case SH_type_fl1_32_shadow:
1875 SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 2, 2)(v, smfn);
1876 break;
1877 case SH_type_l2_32_shadow:
1878 SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 2, 2)(v, smfn);
1879 break;
1880 #else /* PAE or 64bit */
1881 case SH_type_l1_32_shadow:
1882 case SH_type_fl1_32_shadow:
1883 SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 3, 2)(v, smfn);
1884 break;
1885 case SH_type_l2_32_shadow:
1886 SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 3, 2)(v, smfn);
1887 break;
1888 #endif
1890 #if CONFIG_PAGING_LEVELS >= 3
1891 case SH_type_l1_pae_shadow:
1892 case SH_type_fl1_pae_shadow:
1893 SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 3, 3)(v, smfn);
1894 break;
1895 case SH_type_l2_pae_shadow:
1896 case SH_type_l2h_pae_shadow:
1897 SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 3, 3)(v, smfn);
1898 break;
1899 #endif
1901 #if CONFIG_PAGING_LEVELS >= 4
1902 case SH_type_l1_64_shadow:
1903 case SH_type_fl1_64_shadow:
1904 SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 4, 4)(v, smfn);
1905 break;
1906 case SH_type_l2_64_shadow:
1907 SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 4, 4)(v, smfn);
1908 break;
1909 case SH_type_l3_64_shadow:
1910 SHADOW_INTERNAL_NAME(sh_destroy_l3_shadow, 4, 4)(v, smfn);
1911 break;
1912 case SH_type_l4_64_shadow:
1913 SHADOW_INTERNAL_NAME(sh_destroy_l4_shadow, 4, 4)(v, smfn);
1914 break;
1915 #endif
1916 default:
1917 SHADOW_PRINTK("tried to destroy shadow of bad type %08lx\n",
1918 (unsigned long)t);
1919 BUG();
1923 /**************************************************************************/
1924 /* Remove all writeable mappings of a guest frame from the shadow tables
1925 * Returns non-zero if we need to flush TLBs.
1926 * level and fault_addr describe how we found this to be a pagetable;
1927 * level==0 means we have some other reason for revoking write access.*/
1929 int sh_remove_write_access(struct vcpu *v, mfn_t gmfn,
1930 unsigned int level,
1931 unsigned long fault_addr)
1933 /* Dispatch table for getting per-type functions */
1934 static hash_callback_t callbacks[16] = {
1935 NULL, /* none */
1936 #if CONFIG_PAGING_LEVELS == 2
1937 SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1,2,2), /* l1_32 */
1938 SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1,2,2), /* fl1_32 */
1939 #else
1940 SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1,3,2), /* l1_32 */
1941 SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1,3,2), /* fl1_32 */
1942 #endif
1943 NULL, /* l2_32 */
1944 #if CONFIG_PAGING_LEVELS >= 3
1945 SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1,3,3), /* l1_pae */
1946 SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1,3,3), /* fl1_pae */
1947 #else
1948 NULL, /* l1_pae */
1949 NULL, /* fl1_pae */
1950 #endif
1951 NULL, /* l2_pae */
1952 NULL, /* l2h_pae */
1953 #if CONFIG_PAGING_LEVELS >= 4
1954 SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1,4,4), /* l1_64 */
1955 SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1,4,4), /* fl1_64 */
1956 #else
1957 NULL, /* l1_64 */
1958 NULL, /* fl1_64 */
1959 #endif
1960 NULL, /* l2_64 */
1961 NULL, /* l3_64 */
1962 NULL, /* l4_64 */
1963 NULL, /* p2m */
1964 NULL /* unused */
1965 };
1967 static unsigned int callback_mask =
1968 1 << SH_type_l1_32_shadow
1969 | 1 << SH_type_fl1_32_shadow
1970 | 1 << SH_type_l1_pae_shadow
1971 | 1 << SH_type_fl1_pae_shadow
1972 | 1 << SH_type_l1_64_shadow
1973 | 1 << SH_type_fl1_64_shadow
1975 struct page_info *pg = mfn_to_page(gmfn);
1977 ASSERT(shadow_locked_by_me(v->domain));
1979 /* Only remove writable mappings if we are doing shadow refcounts.
1980 * In guest refcounting, we trust Xen to already be restricting
1981 * all the writes to the guest page tables, so we do not need to
1982 * do more. */
1983 if ( !shadow_mode_refcounts(v->domain) )
1984 return 0;
1986 /* Early exit if it's already a pagetable, or otherwise not writeable */
1987 if ( sh_mfn_is_a_page_table(gmfn)
1988 || (pg->u.inuse.type_info & PGT_count_mask) == 0 )
1989 return 0;
1991 perfc_incrc(shadow_writeable);
1993 /* If this isn't a "normal" writeable page, the domain is trying to
1994 * put pagetables in special memory of some kind. We can't allow that. */
1995 if ( (pg->u.inuse.type_info & PGT_type_mask) != PGT_writable_page )
1997 SHADOW_ERROR("can't remove write access to mfn %lx, type_info is %"
1998 PRtype_info "\n",
1999 mfn_x(gmfn), mfn_to_page(gmfn)->u.inuse.type_info);
2000 domain_crash(v->domain);
2003 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
2004 if ( v == current && level != 0 )
2006 unsigned long gfn;
2007 /* Heuristic: there is likely to be only one writeable mapping,
2008 * and that mapping is likely to be in the current pagetable,
2009 * in the guest's linear map (on non-HIGHPTE linux and windows)*/
2011 #define GUESS(_a, _h) do { \
2012 if ( v->arch.shadow.mode->guess_wrmap(v, (_a), gmfn) ) \
2013 perfc_incrc(shadow_writeable_h_ ## _h); \
2014 if ( (pg->u.inuse.type_info & PGT_count_mask) == 0 ) \
2015 return 1; \
2016 } while (0)
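/* GUESS(addr, h) probes one candidate virtual address with the
 * mode-specific guess_wrmap() hook, counting a hit in the matching perf
 * counter when it finds a mapping there; if the page's writable type
 * count has dropped to zero we return straight away and skip the
 * brute-force hash walk below. */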
2019 if ( v->arch.shadow.mode->guest_levels == 2 )
2021 if ( level == 1 )
2022 /* 32bit non-PAE w2k3: linear map at 0xC0000000 */
2023 GUESS(0xC0000000UL + (fault_addr >> 10), 1);
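/* The shift comes from the linear-map layout: a 32bit non-PAE PTE is 4
 * bytes and maps a 4kB page, so the PTE for address va lives at
 * base + (va >> 12) * 4 == base + (va >> 10).  The PAE and 64bit guesses
 * below use >> 9 for 8-byte l1 entries, and >> 18 / >> 27 for the l2 and
 * l3 levels. */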
2025 /* Linux lowmem: first 896MB is mapped 1-to-1 above 0xC0000000 */
2026 if ((gfn = sh_mfn_to_gfn(v->domain, gmfn)) < 0x38000 )
2027 GUESS(0xC0000000UL + (gfn << PAGE_SHIFT), 4);
2030 #if CONFIG_PAGING_LEVELS >= 3
2031 else if ( v->arch.shadow.mode->guest_levels == 3 )
2033 /* 32bit PAE w2k3: linear map at 0xC0000000 */
2034 switch ( level )
2036 case 1: GUESS(0xC0000000UL + (fault_addr >> 9), 2); break;
2037 case 2: GUESS(0xC0600000UL + (fault_addr >> 18), 2); break;
2040 /* Linux lowmem: first 896MB is mapped 1-to-1 above 0xC0000000 */
2041 if ((gfn = sh_mfn_to_gfn(v->domain, gmfn)) < 0x38000 )
2042 GUESS(0xC0000000UL + (gfn << PAGE_SHIFT), 4);
2044 #if CONFIG_PAGING_LEVELS >= 4
2045 else if ( v->arch.shadow.mode->guest_levels == 4 )
2047 /* 64bit w2k3: linear map at 0x0000070000000000 */
2048 switch ( level )
2050 case 1: GUESS(0x70000000000UL + (fault_addr >> 9), 3); break;
2051 case 2: GUESS(0x70380000000UL + (fault_addr >> 18), 3); break;
2052 case 3: GUESS(0x70381C00000UL + (fault_addr >> 27), 3); break;
2055 /* 64bit Linux direct map at 0xffff810000000000; older kernels
2056 * had it at 0x0000010000000000UL */
2057 gfn = sh_mfn_to_gfn(v->domain, gmfn);
2058 GUESS(0xffff810000000000UL + (gfn << PAGE_SHIFT), 4);
2059 GUESS(0x0000010000000000UL + (gfn << PAGE_SHIFT), 4);
2061 #endif /* CONFIG_PAGING_LEVELS >= 4 */
2062 #endif /* CONFIG_PAGING_LEVELS >= 3 */
2064 #undef GUESS
2067 if ( (pg->u.inuse.type_info & PGT_count_mask) == 0 )
2068 return 1;
2070 /* Second heuristic: on HIGHPTE linux, there are two particular PTEs
2071 * (entries in the fixmap) where linux maps its pagetables. Since
2072 * we expect to hit them most of the time, we start the search for
2073 * the writeable mapping by looking at the same MFN where the last
2074 * brute-force search succeeded. */
2076 if ( v->arch.shadow.last_writeable_pte_smfn != 0 )
2078 unsigned long old_count = (pg->u.inuse.type_info & PGT_count_mask);
2079 mfn_t last_smfn = _mfn(v->arch.shadow.last_writeable_pte_smfn);
2080 int shtype = mfn_to_shadow_page(last_smfn)->type;
2082 if ( callbacks[shtype] )
2083 callbacks[shtype](v, last_smfn, gmfn);
2085 if ( (pg->u.inuse.type_info & PGT_count_mask) != old_count )
2086 perfc_incrc(shadow_writeable_h_5);
2089 if ( (pg->u.inuse.type_info & PGT_count_mask) == 0 )
2090 return 1;
2092 #endif /* SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC */
2094 /* Brute-force search of all the shadows, by walking the hash */
2095 perfc_incrc(shadow_writeable_bf);
2096 hash_foreach(v, callback_mask, callbacks, gmfn);
2098 /* If that didn't catch the mapping, something is very wrong */
2099 if ( (mfn_to_page(gmfn)->u.inuse.type_info & PGT_count_mask) != 0 )
2101 SHADOW_ERROR("can't find all writeable mappings of mfn %lx: "
2102 "%lu left\n", mfn_x(gmfn),
2103 (mfn_to_page(gmfn)->u.inuse.type_info&PGT_count_mask));
2104 domain_crash(v->domain);
2107 /* We killed at least one writeable mapping, so must flush TLBs. */
2108 return 1;
2113 /**************************************************************************/
2114 /* Remove all mappings of a guest frame from the shadow tables.
2115 * Returns non-zero if we need to flush TLBs. */
2117 int sh_remove_all_mappings(struct vcpu *v, mfn_t gmfn)
2119 struct page_info *page = mfn_to_page(gmfn);
2120 int expected_count, do_locking;
2122 /* Dispatch table for getting per-type functions */
2123 static hash_callback_t callbacks[16] = {
2124 NULL, /* none */
2125 #if CONFIG_PAGING_LEVELS == 2
2126 SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1,2,2), /* l1_32 */
2127 SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1,2,2), /* fl1_32 */
2128 #else
2129 SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1,3,2), /* l1_32 */
2130 SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1,3,2), /* fl1_32 */
2131 #endif
2132 NULL, /* l2_32 */
2133 #if CONFIG_PAGING_LEVELS >= 3
2134 SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1,3,3), /* l1_pae */
2135 SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1,3,3), /* fl1_pae */
2136 #else
2137 NULL, /* l1_pae */
2138 NULL, /* fl1_pae */
2139 #endif
2140 NULL, /* l2_pae */
2141 NULL, /* l2h_pae */
2142 #if CONFIG_PAGING_LEVELS >= 4
2143 SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1,4,4), /* l1_64 */
2144 SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1,4,4), /* fl1_64 */
2145 #else
2146 NULL, /* l1_64 */
2147 NULL, /* fl1_64 */
2148 #endif
2149 NULL, /* l2_64 */
2150 NULL, /* l3_64 */
2151 NULL, /* l4_64 */
2152 NULL, /* p2m */
2153 NULL /* unused */
2154 };
2156 static unsigned int callback_mask =
2157 1 << SH_type_l1_32_shadow
2158 | 1 << SH_type_fl1_32_shadow
2159 | 1 << SH_type_l1_pae_shadow
2160 | 1 << SH_type_fl1_pae_shadow
2161 | 1 << SH_type_l1_64_shadow
2162 | 1 << SH_type_fl1_64_shadow
2165 perfc_incrc(shadow_mappings);
2166 if ( (page->count_info & PGC_count_mask) == 0 )
2167 return 0;
2169 /* Although this is an externally visible function, we do not know
2170 * whether the shadow lock will be held when it is called (since it
2171 * can be called via put_page_type when we clear a shadow l1e).
2172 * If the lock isn't held, take it for the duration of the call. */
2173 do_locking = !shadow_locked_by_me(v->domain);
2174 if ( do_locking ) shadow_lock(v->domain);
2176 /* XXX TODO:
2177 * Heuristics for finding the (probably) single mapping of this gmfn */
2179 /* Brute-force search of all the shadows, by walking the hash */
2180 perfc_incrc(shadow_mappings_bf);
2181 hash_foreach(v, callback_mask, callbacks, gmfn);
2183 /* If that didn't catch the mapping, something is very wrong */
2184 expected_count = (page->count_info & PGC_allocated) ? 1 : 0;
2185 if ( (page->count_info & PGC_count_mask) != expected_count )
2187 /* Don't complain if we're in HVM and there are some extra mappings:
2188 * The qemu helper process has an untyped mapping of this dom's RAM
2189 * and the HVM restore program takes another. */
2190 if ( !(shadow_mode_external(v->domain)
2191 && (page->count_info & PGC_count_mask) <= 3
2192 && (page->u.inuse.type_info & PGT_count_mask) == 0) )
2194 SHADOW_ERROR("can't find all mappings of mfn %lx: "
2195 "c=%08x t=%08lx\n", mfn_x(gmfn),
2196 page->count_info, page->u.inuse.type_info);
2200 if ( do_locking ) shadow_unlock(v->domain);
2202 /* We killed at least one mapping, so must flush TLBs. */
2203 return 1;
2207 /**************************************************************************/
2208 /* Remove all shadows of a guest frame from the shadow tables */
2210 static int sh_remove_shadow_via_pointer(struct vcpu *v, mfn_t smfn)
2211 /* Follow this shadow's up-pointer, if it has one, and remove the reference
2212 * found there. Returns 1 if that was the only reference to this shadow */
2214 struct shadow_page_info *sp = mfn_to_shadow_page(smfn);
2215 mfn_t pmfn;
2216 void *vaddr;
2217 int rc;
2219 ASSERT(sp->type > 0);
2220 ASSERT(sp->type < SH_type_max_shadow);
2221 ASSERT(sp->type != SH_type_l2_32_shadow);
2222 ASSERT(sp->type != SH_type_l2_pae_shadow);
2223 ASSERT(sp->type != SH_type_l2h_pae_shadow);
2224 ASSERT(sp->type != SH_type_l4_64_shadow);
2226 if (sp->up == 0) return 0;
2227 pmfn = _mfn(sp->up >> PAGE_SHIFT);
2228 ASSERT(mfn_valid(pmfn));
2229 vaddr = sh_map_domain_page(pmfn);
2230 ASSERT(vaddr);
2231 vaddr += sp->up & (PAGE_SIZE-1);
2232 ASSERT(l1e_get_pfn(*(l1_pgentry_t *)vaddr) == mfn_x(smfn));
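/* sp->up, decoded above, packs the machine address of the one shadow
 * entry known to reference this shadow: the containing shadow's frame
 * number in the high bits and the byte offset of the entry in the low
 * bits, i.e. roughly (mfn << PAGE_SHIFT) | offset. */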
2234 /* Is this the only reference to this shadow? */
2235 rc = (sp->count == 1) ? 1 : 0;
2237 /* Blank the offending entry */
2238 switch (sp->type)
2240 case SH_type_l1_32_shadow:
2241 case SH_type_l2_32_shadow:
2242 #if CONFIG_PAGING_LEVELS == 2
2243 SHADOW_INTERNAL_NAME(sh_clear_shadow_entry,2,2)(v, vaddr, pmfn);
2244 #else
2245 SHADOW_INTERNAL_NAME(sh_clear_shadow_entry,3,2)(v, vaddr, pmfn);
2246 #endif
2247 break;
2248 #if CONFIG_PAGING_LEVELS >=3
2249 case SH_type_l1_pae_shadow:
2250 case SH_type_l2_pae_shadow:
2251 case SH_type_l2h_pae_shadow:
2252 SHADOW_INTERNAL_NAME(sh_clear_shadow_entry,3,3)(v, vaddr, pmfn);
2253 break;
2254 #if CONFIG_PAGING_LEVELS >= 4
2255 case SH_type_l1_64_shadow:
2256 case SH_type_l2_64_shadow:
2257 case SH_type_l3_64_shadow:
2258 case SH_type_l4_64_shadow:
2259 SHADOW_INTERNAL_NAME(sh_clear_shadow_entry,4,4)(v, vaddr, pmfn);
2260 break;
2261 #endif
2262 #endif
2263 default: BUG(); /* Some weird unknown shadow type */
2266 sh_unmap_domain_page(vaddr);
2267 if ( rc )
2268 perfc_incrc(shadow_up_pointer);
2269 else
2270 perfc_incrc(shadow_unshadow_bf);
2272 return rc;
2275 void sh_remove_shadows(struct vcpu *v, mfn_t gmfn, int fast, int all)
2276 /* Remove the shadows of this guest page.
2277 * If fast != 0, just try the quick heuristic, which will remove
2278 * at most one reference to each shadow of the page. Otherwise, walk
2279 * all the shadow tables looking for refs to shadows of this gmfn.
2280 * If all != 0, kill the domain if we can't find all the shadows.
2281 * (all != 0 implies fast == 0)
2282 */
2284 struct page_info *pg = mfn_to_page(gmfn);
2285 mfn_t smfn;
2286 u32 sh_flags;
2287 int do_locking;
2288 unsigned char t;
2290 /* Dispatch table for getting per-type functions: each level must
2291 * be called with the function to remove a lower-level shadow. */
2292 static hash_callback_t callbacks[16] = {
2293 NULL, /* none */
2294 NULL, /* l1_32 */
2295 NULL, /* fl1_32 */
2296 #if CONFIG_PAGING_LEVELS == 2
2297 SHADOW_INTERNAL_NAME(sh_remove_l1_shadow,2,2), /* l2_32 */
2298 #else
2299 SHADOW_INTERNAL_NAME(sh_remove_l1_shadow,3,2), /* l2_32 */
2300 #endif
2301 NULL, /* l1_pae */
2302 NULL, /* fl1_pae */
2303 #if CONFIG_PAGING_LEVELS >= 3
2304 SHADOW_INTERNAL_NAME(sh_remove_l1_shadow,3,3), /* l2_pae */
2305 SHADOW_INTERNAL_NAME(sh_remove_l1_shadow,3,3), /* l2h_pae */
2306 #else
2307 NULL, /* l2_pae */
2308 NULL, /* l2h_pae */
2309 #endif
2310 NULL, /* l1_64 */
2311 NULL, /* fl1_64 */
2312 #if CONFIG_PAGING_LEVELS >= 4
2313 SHADOW_INTERNAL_NAME(sh_remove_l1_shadow,4,4), /* l2_64 */
2314 SHADOW_INTERNAL_NAME(sh_remove_l2_shadow,4,4), /* l3_64 */
2315 SHADOW_INTERNAL_NAME(sh_remove_l3_shadow,4,4), /* l4_64 */
2316 #else
2317 NULL, /* l2_64 */
2318 NULL, /* l3_64 */
2319 NULL, /* l4_64 */
2320 #endif
2321 NULL, /* p2m */
2322 NULL /* unused */
2323 };
2325 /* Another lookup table, for choosing which mask to use */
2326 static unsigned int masks[16] = {
2327 0, /* none */
2328 1 << SH_type_l2_32_shadow, /* l1_32 */
2329 0, /* fl1_32 */
2330 0, /* l2_32 */
2331 ((1 << SH_type_l2h_pae_shadow)
2332 | (1 << SH_type_l2_pae_shadow)), /* l1_pae */
2333 0, /* fl1_pae */
2334 0, /* l2_pae */
2335 0, /* l2h_pae */
2336 1 << SH_type_l2_64_shadow, /* l1_64 */
2337 0, /* fl1_64 */
2338 1 << SH_type_l3_64_shadow, /* l2_64 */
2339 1 << SH_type_l4_64_shadow, /* l3_64 */
2340 0, /* l4_64 */
2341 0, /* p2m */
2342 0 /* unused */
2343 };
2345 ASSERT(!(all && fast));
2347 /* Bail out now if the page is not shadowed */
2348 if ( (pg->count_info & PGC_page_table) == 0 )
2349 return;
2351 /* Although this is an externally visible function, we do not know
2352 * whether the shadow lock will be held when it is called (since it
2353 * can be called via put_page_type when we clear a shadow l1e).
2354 * If the lock isn't held, take it for the duration of the call. */
2355 do_locking = !shadow_locked_by_me(v->domain);
2356 if ( do_locking ) shadow_lock(v->domain);
2358 SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx\n",
2359 v->domain->domain_id, v->vcpu_id, mfn_x(gmfn));
2361 /* Search for this shadow in all appropriate shadows */
2362 perfc_incrc(shadow_unshadow);
2363 sh_flags = pg->shadow_flags;
2365 /* Lower-level shadows need to be excised from upper-level shadows.
2366 * This call to hash_foreach() looks dangerous but is in fact OK: each
2367 * call will remove at most one shadow, and terminate immediately when
2368 * it does remove it, so we never walk the hash after doing a deletion. */
2369 #define DO_UNSHADOW(_type) do { \
2370 t = (_type); \
2371 smfn = shadow_hash_lookup(v, mfn_x(gmfn), t); \
2372 if ( sh_type_is_pinnable(v, t) ) \
2373 sh_unpin(v, smfn); \
2374 else \
2375 sh_remove_shadow_via_pointer(v, smfn); \
2376 if ( (pg->count_info & PGC_page_table) && !fast ) \
2377 hash_foreach(v, masks[t], callbacks, smfn); \
2378 } while (0)
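/* DO_UNSHADOW(t) looks up this gmfn's shadow of type t and drops the easy
 * reference first: pinnable shadow types are unpinned, anything else has
 * its single known reference removed via the up-pointer.  Only if the
 * page is still flagged as a pagetable, and we are not in 'fast' mode,
 * does it fall back to the full hash walk, using masks[t] to visit the
 * relevant upper-level shadows. */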
2380 if ( sh_flags & SHF_L1_32 ) DO_UNSHADOW(SH_type_l1_32_shadow);
2381 if ( sh_flags & SHF_L2_32 ) DO_UNSHADOW(SH_type_l2_32_shadow);
2382 #if CONFIG_PAGING_LEVELS >= 3
2383 if ( sh_flags & SHF_L1_PAE ) DO_UNSHADOW(SH_type_l1_pae_shadow);
2384 if ( sh_flags & SHF_L2_PAE ) DO_UNSHADOW(SH_type_l2_pae_shadow);
2385 if ( sh_flags & SHF_L2H_PAE ) DO_UNSHADOW(SH_type_l2h_pae_shadow);
2386 #if CONFIG_PAGING_LEVELS >= 4
2387 if ( sh_flags & SHF_L1_64 ) DO_UNSHADOW(SH_type_l1_64_shadow);
2388 if ( sh_flags & SHF_L2_64 ) DO_UNSHADOW(SH_type_l2_64_shadow);
2389 if ( sh_flags & SHF_L3_64 ) DO_UNSHADOW(SH_type_l3_64_shadow);
2390 if ( sh_flags & SHF_L4_64 ) DO_UNSHADOW(SH_type_l4_64_shadow);
2391 #endif
2392 #endif
2394 #undef DO_UNSHADOW
2396 /* If that didn't catch the shadows, something is wrong */
2397 if ( !fast && (pg->count_info & PGC_page_table) )
2399 SHADOW_ERROR("can't find all shadows of mfn %05lx "
2400 "(shadow_flags=%08lx)\n",
2401 mfn_x(gmfn), pg->shadow_flags);
2402 if ( all )
2403 domain_crash(v->domain);
2406 /* Need to flush TLBs now, so that linear maps are safe next time we
2407 * take a fault. */
2408 flush_tlb_mask(v->domain->domain_dirty_cpumask);
2410 if ( do_locking ) shadow_unlock(v->domain);
2413 static void
2414 sh_remove_all_shadows_and_parents(struct vcpu *v, mfn_t gmfn)
2415 /* Even harsher: this is an HVM page that we think is no longer a pagetable.
2416 * Unshadow it, and recursively unshadow pages that reference it. */
2418 sh_remove_shadows(v, gmfn, 0, 1);
2419 /* XXX TODO:
2420 * Rework this hashtable walker to return a linked-list of all
2421 * the shadows it modified, then do breadth-first recursion
2422 * to find the way up to higher-level tables and unshadow them too.
2424 * The current code (just tearing down each page's shadows as we
2425 * detect that it is not a pagetable) is correct, but very slow.
2426 * It means extra emulated writes and slows down removal of mappings. */
2429 /**************************************************************************/
2431 static void sh_update_paging_modes(struct vcpu *v)
2433 struct domain *d = v->domain;
2434 struct shadow_paging_mode *old_mode = v->arch.shadow.mode;
2435 mfn_t old_guest_table;
2437 ASSERT(shadow_locked_by_me(d));
2439 // Valid transitions handled by this function:
2440 // - For PV guests:
2441 // - after a shadow mode has been changed
2442 // - For HVM guests:
2443 // - after a shadow mode has been changed
2444 // - changes in CR0.PG, CR4.PAE, CR4.PSE, or CR4.PGE
2445 //
2447 // First, tear down any old shadow tables held by this vcpu.
2448 //
2449 if ( v->arch.shadow.mode )
2450 v->arch.shadow.mode->detach_old_tables(v);
2452 if ( !is_hvm_domain(d) )
2454 ///
2455 /// PV guest
2456 ///
2457 #if CONFIG_PAGING_LEVELS == 4
2458 if ( pv_32bit_guest(v) )
2459 v->arch.shadow.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,3,3);
2460 else
2461 v->arch.shadow.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,4,4);
2462 #elif CONFIG_PAGING_LEVELS == 3
2463 v->arch.shadow.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,3,3);
2464 #elif CONFIG_PAGING_LEVELS == 2
2465 v->arch.shadow.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,2,2);
2466 #else
2467 #error unexpected paging mode
2468 #endif
2469 v->arch.shadow.translate_enabled = !!shadow_mode_translate(d);
2471 else
2473 ///
2474 /// HVM guest
2475 ///
2476 ASSERT(shadow_mode_translate(d));
2477 ASSERT(shadow_mode_external(d));
2479 v->arch.shadow.translate_enabled = !!hvm_paging_enabled(v);
2480 if ( !v->arch.shadow.translate_enabled )
2482 /* Set v->arch.guest_table to use the p2m map, and choose
2483 * the appropriate shadow mode */
2484 old_guest_table = pagetable_get_mfn(v->arch.guest_table);
2485 #if CONFIG_PAGING_LEVELS == 2
2486 v->arch.guest_table =
2487 pagetable_from_pfn(pagetable_get_pfn(d->arch.phys_table));
2488 v->arch.shadow.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,2,2);
2489 #elif CONFIG_PAGING_LEVELS == 3
2490 v->arch.guest_table =
2491 pagetable_from_pfn(pagetable_get_pfn(d->arch.phys_table));
2492 v->arch.shadow.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,3,3);
2493 #else /* CONFIG_PAGING_LEVELS == 4 */
2495 l4_pgentry_t *l4e;
2496 /* Use the start of the first l3 table as a PAE l3 */
2497 ASSERT(pagetable_get_pfn(d->arch.phys_table) != 0);
2498 l4e = sh_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
2499 ASSERT(l4e_get_flags(l4e[0]) & _PAGE_PRESENT);
2500 v->arch.guest_table =
2501 pagetable_from_pfn(l4e_get_pfn(l4e[0]));
2502 sh_unmap_domain_page(l4e);
2504 v->arch.shadow.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,3,3);
2505 #endif
2506 /* Fix up refcounts on guest_table */
2507 get_page(mfn_to_page(pagetable_get_mfn(v->arch.guest_table)), d);
2508 if ( mfn_x(old_guest_table) != 0 )
2509 put_page(mfn_to_page(old_guest_table));
2511 else
2513 #ifdef __x86_64__
2514 if ( hvm_long_mode_enabled(v) )
2516 // long mode guest...
2517 v->arch.shadow.mode =
2518 &SHADOW_INTERNAL_NAME(sh_paging_mode, 4, 4);
2520 else
2521 #endif
2522 if ( hvm_pae_enabled(v) )
2524 #if CONFIG_PAGING_LEVELS >= 3
2525 // 32-bit PAE mode guest...
2526 v->arch.shadow.mode =
2527 &SHADOW_INTERNAL_NAME(sh_paging_mode, 3, 3);
2528 #else
2529 SHADOW_ERROR("PAE not supported in 32-bit Xen\n");
2530 domain_crash(d);
2531 return;
2532 #endif
2534 else
2536 // 32-bit 2 level guest...
2537 #if CONFIG_PAGING_LEVELS >= 3
2538 v->arch.shadow.mode =
2539 &SHADOW_INTERNAL_NAME(sh_paging_mode, 3, 2);
2540 #else
2541 v->arch.shadow.mode =
2542 &SHADOW_INTERNAL_NAME(sh_paging_mode, 2, 2);
2543 #endif
2547 if ( pagetable_is_null(v->arch.monitor_table) )
2549 mfn_t mmfn = v->arch.shadow.mode->make_monitor_table(v);
2550 v->arch.monitor_table = pagetable_from_mfn(mmfn);
2551 make_cr3(v, mfn_x(mmfn));
2552 hvm_update_host_cr3(v);
2555 if ( v->arch.shadow.mode != old_mode )
2557 SHADOW_PRINTK("new paging mode: d=%u v=%u pe=%d g=%u s=%u "
2558 "(was g=%u s=%u)\n",
2559 d->domain_id, v->vcpu_id,
2560 is_hvm_domain(d) ? !!hvm_paging_enabled(v) : 1,
2561 v->arch.shadow.mode->guest_levels,
2562 v->arch.shadow.mode->shadow_levels,
2563 old_mode ? old_mode->guest_levels : 0,
2564 old_mode ? old_mode->shadow_levels : 0);
2565 if ( old_mode &&
2566 (v->arch.shadow.mode->shadow_levels !=
2567 old_mode->shadow_levels) )
2569 /* Need to make a new monitor table for the new mode */
2570 mfn_t new_mfn, old_mfn;
2572 if ( v != current && vcpu_runnable(v) )
2574 SHADOW_ERROR("Some third party (d=%u v=%u) is changing "
2575 "this HVM vcpu's (d=%u v=%u) paging mode "
2576 "while it is running.\n",
2577 current->domain->domain_id, current->vcpu_id,
2578 v->domain->domain_id, v->vcpu_id);
2579 /* It's not safe to do that because we can't change
2580 * the host CR3 for a running domain */
2581 domain_crash(v->domain);
2582 return;
2585 old_mfn = pagetable_get_mfn(v->arch.monitor_table);
2586 v->arch.monitor_table = pagetable_null();
2587 new_mfn = v->arch.shadow.mode->make_monitor_table(v);
2588 v->arch.monitor_table = pagetable_from_mfn(new_mfn);
2589 SHADOW_PRINTK("new monitor table %"SH_PRI_mfn "\n",
2590 mfn_x(new_mfn));
2592 /* Don't be running on the old monitor table when we
2593 * pull it down! Switch CR3, and warn the HVM code that
2594 * its host cr3 has changed. */
2595 make_cr3(v, mfn_x(new_mfn));
2596 if ( v == current )
2597 write_ptbase(v);
2598 hvm_update_host_cr3(v);
2599 old_mode->destroy_monitor_table(v, old_mfn);
2603 // XXX -- Need to deal with changes in CR4.PSE and CR4.PGE.
2604 // These are HARD: think about the case where two CPUs have
2605 // different values for CR4.PSE and CR4.PGE at the same time.
2606 // This *does* happen, at least for CR4.PGE...
2609 v->arch.shadow.mode->update_cr3(v, 0);
2612 void shadow_update_paging_modes(struct vcpu *v)
2614 shadow_lock(v->domain);
2615 sh_update_paging_modes(v);
2616 shadow_unlock(v->domain);
2619 /**************************************************************************/
2620 /* Turning on and off shadow features */
2622 static void sh_new_mode(struct domain *d, u32 new_mode)
2623 /* Inform all the vcpus that the shadow mode has been changed */
2625 struct vcpu *v;
2627 ASSERT(shadow_locked_by_me(d));
2628 ASSERT(d != current->domain);
2629 d->arch.shadow.mode = new_mode;
2630 if ( new_mode & SHM2_translate )
2631 shadow_audit_p2m(d);
2632 for_each_vcpu(d, v)
2633 sh_update_paging_modes(v);
2636 int shadow_enable(struct domain *d, u32 mode)
2637 /* Turn on "permanent" shadow features: external, translate, refcount.
2638 * Can only be called once on a domain, and these features cannot be
2639 * disabled.
2640 * Returns 0 for success, -errno for failure. */
2642 unsigned int old_pages;
2643 int rv = 0;
2645 mode |= SHM2_enable;
2647 domain_pause(d);
2648 shadow_lock(d);
2650 /* Sanity check the arguments */
2651 if ( (d == current->domain) ||
2652 shadow_mode_enabled(d) ||
2653 ((mode & SHM2_translate) && !(mode & SHM2_refcounts)) ||
2654 ((mode & SHM2_external) && !(mode & SHM2_translate)) )
2656 rv = -EINVAL;
2657 goto out;
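/* In other words the accepted mode combinations nest: external implies
 * translate, and translate implies refcounts.  The TRANSLATE domctl below
 * passes SHM2_refcounts|SHM2_translate; a fully external (HVM-style)
 * caller would presumably add SHM2_external on top of those two. */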
2660 // XXX -- eventually would like to require that all memory be allocated
2661 // *after* shadow_enabled() is called... So here, we would test to make
2662 // sure that d->page_list is empty.
2663 #if 0
2664 spin_lock(&d->page_alloc_lock);
2665 if ( !list_empty(&d->page_list) )
2667 spin_unlock(&d->page_alloc_lock);
2668 rv = -EINVAL;
2669 goto out;
2671 spin_unlock(&d->page_alloc_lock);
2672 #endif
2674 /* Init the shadow memory allocation if the user hasn't done so */
2675 old_pages = d->arch.shadow.total_pages;
2676 if ( old_pages == 0 )
2677 if ( sh_set_allocation(d, 256, NULL) != 0 ) /* Use at least 1MB */
2679 sh_set_allocation(d, 0, NULL);
2680 rv = -ENOMEM;
2681 goto out;
2684 /* Init the hash table */
2685 if ( shadow_hash_alloc(d) != 0 )
2687 sh_set_allocation(d, old_pages, NULL);
2688 rv = -ENOMEM;
2689 goto out;
2692 /* Init the P2M table */
2693 if ( mode & SHM2_translate )
2694 if ( !shadow_alloc_p2m_table(d) )
2696 shadow_hash_teardown(d);
2697 sh_set_allocation(d, old_pages, NULL);
2698 shadow_p2m_teardown(d);
2699 rv = -ENOMEM;
2700 goto out;
2703 #if (SHADOW_OPTIMIZATIONS & SHOPT_LINUX_L3_TOPLEVEL)
2704 /* We assume we're dealing with an older 64bit linux guest until we
2705 * see the guest use more than one l4 per vcpu. */
2706 d->arch.shadow.opt_flags = SHOPT_LINUX_L3_TOPLEVEL;
2707 #endif
2709 /* Update the bits */
2710 sh_new_mode(d, mode);
2711 shadow_audit_p2m(d);
2712 out:
2713 shadow_unlock(d);
2714 domain_unpause(d);
2715 return rv;
2718 void shadow_teardown(struct domain *d)
2719 /* Destroy the shadow pagetables of this domain and free its shadow memory.
2720 * Should only be called for dying domains. */
2722 struct vcpu *v;
2723 mfn_t mfn;
2725 ASSERT(test_bit(_DOMF_dying, &d->domain_flags));
2726 ASSERT(d != current->domain);
2728 if ( !shadow_locked_by_me(d) )
2729 shadow_lock(d); /* Keep various asserts happy */
2731 if ( shadow_mode_enabled(d) )
2733 /* Release the shadow and monitor tables held by each vcpu */
2734 for_each_vcpu(d, v)
2736 if ( v->arch.shadow.mode )
2738 v->arch.shadow.mode->detach_old_tables(v);
2739 if ( shadow_mode_external(d) )
2741 mfn = pagetable_get_mfn(v->arch.monitor_table);
2742 if ( mfn_valid(mfn) && (mfn_x(mfn) != 0) )
2743 v->arch.shadow.mode->destroy_monitor_table(v, mfn);
2744 v->arch.monitor_table = pagetable_null();
2750 if ( d->arch.shadow.total_pages != 0 )
2752 SHADOW_PRINTK("teardown of domain %u starts."
2753 " Shadow pages total = %u, free = %u, p2m=%u\n",
2754 d->domain_id,
2755 d->arch.shadow.total_pages,
2756 d->arch.shadow.free_pages,
2757 d->arch.shadow.p2m_pages);
2758 /* Destroy all the shadows and release memory to domheap */
2759 sh_set_allocation(d, 0, NULL);
2760 /* Release the hash table back to xenheap */
2761 if (d->arch.shadow.hash_table)
2762 shadow_hash_teardown(d);
2763 /* Release the log-dirty bitmap of dirtied pages */
2764 sh_free_log_dirty_bitmap(d);
2765 /* Should not have any more memory held */
2766 SHADOW_PRINTK("teardown done."
2767 " Shadow pages total = %u, free = %u, p2m=%u\n",
2768 d->arch.shadow.total_pages,
2769 d->arch.shadow.free_pages,
2770 d->arch.shadow.p2m_pages);
2771 ASSERT(d->arch.shadow.total_pages == 0);
2774 /* We leave the "permanent" shadow modes enabled, but clear the
2775 * log-dirty mode bit. We don't want any more mark_dirty()
2776 * calls now that we've torn down the bitmap */
2777 d->arch.shadow.mode &= ~SHM2_log_dirty;
2779 shadow_unlock(d);
2782 void shadow_final_teardown(struct domain *d)
2783 /* Called by arch_domain_destroy(), when it's safe to pull down the p2m map. */
2786 SHADOW_PRINTK("dom %u final teardown starts."
2787 " Shadow pages total = %u, free = %u, p2m=%u\n",
2788 d->domain_id,
2789 d->arch.shadow.total_pages,
2790 d->arch.shadow.free_pages,
2791 d->arch.shadow.p2m_pages);
2793 /* Double-check that the domain didn't have any shadow memory.
2794 * It is possible for a domain that never got domain_kill()ed
2795 * to get here with its shadow allocation intact. */
2796 if ( d->arch.shadow.total_pages != 0 )
2797 shadow_teardown(d);
2799 /* It is now safe to pull down the p2m map. */
2800 if ( d->arch.shadow.p2m_pages != 0 )
2801 shadow_p2m_teardown(d);
2803 SHADOW_PRINTK("dom %u final teardown done."
2804 " Shadow pages total = %u, free = %u, p2m=%u\n",
2805 d->domain_id,
2806 d->arch.shadow.total_pages,
2807 d->arch.shadow.free_pages,
2808 d->arch.shadow.p2m_pages);
2811 static int shadow_one_bit_enable(struct domain *d, u32 mode)
2812 /* Turn on a single shadow mode feature */
2814 ASSERT(shadow_locked_by_me(d));
2816 /* Sanity check the call */
2817 if ( d == current->domain || (d->arch.shadow.mode & mode) )
2819 return -EINVAL;
2822 if ( d->arch.shadow.mode == 0 )
2824 /* Init the shadow memory allocation and the hash table */
2825 if ( sh_set_allocation(d, 1, NULL) != 0
2826 || shadow_hash_alloc(d) != 0 )
2828 sh_set_allocation(d, 0, NULL);
2829 return -ENOMEM;
2833 /* Update the bits */
2834 sh_new_mode(d, d->arch.shadow.mode | mode);
2836 return 0;
2839 static int shadow_one_bit_disable(struct domain *d, u32 mode)
2840 /* Turn off a single shadow mode feature */
2842 struct vcpu *v;
2843 ASSERT(shadow_locked_by_me(d));
2845 /* Sanity check the call */
2846 if ( d == current->domain || !(d->arch.shadow.mode & mode) )
2848 return -EINVAL;
2851 /* Update the bits */
2852 sh_new_mode(d, d->arch.shadow.mode & ~mode);
2853 if ( d->arch.shadow.mode == 0 )
2855 /* Get this domain off shadows */
2856 SHADOW_PRINTK("un-shadowing of domain %u starts."
2857 " Shadow pages total = %u, free = %u, p2m=%u\n",
2858 d->domain_id,
2859 d->arch.shadow.total_pages,
2860 d->arch.shadow.free_pages,
2861 d->arch.shadow.p2m_pages);
2862 for_each_vcpu(d, v)
2864 if ( v->arch.shadow.mode )
2865 v->arch.shadow.mode->detach_old_tables(v);
2866 #if CONFIG_PAGING_LEVELS == 4
2867 if ( !(v->arch.flags & TF_kernel_mode) )
2868 make_cr3(v, pagetable_get_pfn(v->arch.guest_table_user));
2869 else
2870 #endif
2871 make_cr3(v, pagetable_get_pfn(v->arch.guest_table));
2875 /* Pull down the memory allocation */
2876 if ( sh_set_allocation(d, 0, NULL) != 0 )
2878 // XXX - How can this occur?
2879 // Seems like a bug to return an error now that we've
2880 // disabled the relevant shadow mode.
2881 //
2882 return -ENOMEM;
2884 shadow_hash_teardown(d);
2885 SHADOW_PRINTK("un-shadowing of domain %u done."
2886 " Shadow pages total = %u, free = %u, p2m=%u\n",
2887 d->domain_id,
2888 d->arch.shadow.total_pages,
2889 d->arch.shadow.free_pages,
2890 d->arch.shadow.p2m_pages);
2893 return 0;
2896 /* Enable/disable ops for the "test" and "log-dirty" modes */
2897 static int shadow_test_enable(struct domain *d)
2899 int ret;
2901 domain_pause(d);
2902 shadow_lock(d);
2904 if ( shadow_mode_enabled(d) )
2906 SHADOW_ERROR("Don't support enabling test mode"
2907 " on already shadowed doms\n");
2908 ret = -EINVAL;
2909 goto out;
2912 ret = shadow_one_bit_enable(d, SHM2_enable);
2913 out:
2914 shadow_unlock(d);
2915 domain_unpause(d);
2917 return ret;
2920 static int shadow_test_disable(struct domain *d)
2922 int ret;
2924 domain_pause(d);
2925 shadow_lock(d);
2926 ret = shadow_one_bit_disable(d, SHM2_enable);
2927 shadow_unlock(d);
2928 domain_unpause(d);
2930 return ret;
2933 static int
2934 sh_alloc_log_dirty_bitmap(struct domain *d)
2936 ASSERT(d->arch.shadow.dirty_bitmap == NULL);
2937 d->arch.shadow.dirty_bitmap_size =
2938 (arch_get_max_pfn(d) + (BITS_PER_LONG - 1)) &
2939 ~(BITS_PER_LONG - 1);
2940 d->arch.shadow.dirty_bitmap =
2941 xmalloc_array(unsigned long,
2942 d->arch.shadow.dirty_bitmap_size / BITS_PER_LONG);
2943 if ( d->arch.shadow.dirty_bitmap == NULL )
2945 d->arch.shadow.dirty_bitmap_size = 0;
2946 return -ENOMEM;
2948 memset(d->arch.shadow.dirty_bitmap, 0, d->arch.shadow.dirty_bitmap_size/8);
2950 return 0;
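/* Sizing note for sh_alloc_log_dirty_bitmap(): dirty_bitmap_size is the
 * maximum PFN rounded up to a multiple of BITS_PER_LONG, so the array
 * holds exactly size/BITS_PER_LONG longs and the memset clears size/8
 * bytes.  Purely as an illustration, a 64-bit build with a maximum PFN of
 * 0x10000 keeps the size at 0x10000 bits = 1024 longs = 8kB. */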
2953 static void
2954 sh_free_log_dirty_bitmap(struct domain *d)
2956 d->arch.shadow.dirty_bitmap_size = 0;
2957 if ( d->arch.shadow.dirty_bitmap )
2959 xfree(d->arch.shadow.dirty_bitmap);
2960 d->arch.shadow.dirty_bitmap = NULL;
2964 static int shadow_log_dirty_enable(struct domain *d)
2966 int ret;
2968 domain_pause(d);
2969 shadow_lock(d);
2971 if ( shadow_mode_log_dirty(d) )
2973 ret = -EINVAL;
2974 goto out;
2977 if ( shadow_mode_enabled(d) )
2979 SHADOW_ERROR("Don't (yet) support enabling log-dirty"
2980 " on already shadowed doms\n");
2981 ret = -EINVAL;
2982 goto out;
2985 ret = sh_alloc_log_dirty_bitmap(d);
2986 if ( ret != 0 )
2988 sh_free_log_dirty_bitmap(d);
2989 goto out;
2992 ret = shadow_one_bit_enable(d, SHM2_log_dirty);
2993 if ( ret != 0 )
2994 sh_free_log_dirty_bitmap(d);
2996 out:
2997 shadow_unlock(d);
2998 domain_unpause(d);
2999 return ret;
3002 static int shadow_log_dirty_disable(struct domain *d)
3004 int ret;
3006 domain_pause(d);
3007 shadow_lock(d);
3008 ret = shadow_one_bit_disable(d, SHM2_log_dirty);
3009 if ( !shadow_mode_log_dirty(d) )
3010 sh_free_log_dirty_bitmap(d);
3011 shadow_unlock(d);
3012 domain_unpause(d);
3014 return ret;
3017 /**************************************************************************/
3018 /* P2M map manipulations */
3020 static void
3021 sh_p2m_remove_page(struct domain *d, unsigned long gfn, unsigned long mfn)
3023 struct vcpu *v;
3025 if ( !shadow_mode_translate(d) )
3026 return;
3028 v = current;
3029 if ( v->domain != d )
3030 v = d->vcpu[0];
3032 SHADOW_DEBUG(P2M, "removing gfn=%#lx mfn=%#lx\n", gfn, mfn);
3034 ASSERT(mfn_x(sh_gfn_to_mfn(d, gfn)) == mfn);
3035 //ASSERT(sh_mfn_to_gfn(d, mfn) == gfn);
3037 if ( v != NULL )
3039 sh_remove_all_shadows_and_parents(v, _mfn(mfn));
3040 if ( sh_remove_all_mappings(v, _mfn(mfn)) )
3041 flush_tlb_mask(d->domain_dirty_cpumask);
3044 shadow_set_p2m_entry(d, gfn, _mfn(INVALID_MFN));
3045 set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY);
3048 void
3049 shadow_guest_physmap_remove_page(struct domain *d, unsigned long gfn,
3050 unsigned long mfn)
3052 shadow_lock(d);
3053 shadow_audit_p2m(d);
3054 sh_p2m_remove_page(d, gfn, mfn);
3055 shadow_audit_p2m(d);
3056 shadow_unlock(d);
3059 void
3060 shadow_guest_physmap_add_page(struct domain *d, unsigned long gfn,
3061 unsigned long mfn)
3063 unsigned long ogfn;
3064 mfn_t omfn;
3066 if ( !shadow_mode_translate(d) )
3067 return;
3069 shadow_lock(d);
3070 shadow_audit_p2m(d);
3072 SHADOW_DEBUG(P2M, "adding gfn=%#lx mfn=%#lx\n", gfn, mfn);
3074 omfn = sh_gfn_to_mfn(d, gfn);
3075 if ( mfn_valid(omfn) )
3077 /* Get rid of the old mapping, especially any shadows */
3078 struct vcpu *v = current;
3079 if ( v->domain != d )
3080 v = d->vcpu[0];
3081 if ( v != NULL )
3083 sh_remove_all_shadows_and_parents(v, omfn);
3084 if ( sh_remove_all_mappings(v, omfn) )
3085 flush_tlb_mask(d->domain_dirty_cpumask);
3087 set_gpfn_from_mfn(mfn_x(omfn), INVALID_M2P_ENTRY);
3090 ogfn = sh_mfn_to_gfn(d, _mfn(mfn));
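/* The 0x5555... constants tested below appear to be the fill pattern for
 * M2P entries that were never initialised; such values are treated the
 * same way as INVALID_M2P_ENTRY rather than as a genuine alias. */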
3091 if (
3092 #ifdef __x86_64__
3093 (ogfn != 0x5555555555555555L)
3094 #else
3095 (ogfn != 0x55555555L)
3096 #endif
3097 && (ogfn != INVALID_M2P_ENTRY)
3098 && (ogfn != gfn) )
3100 /* This machine frame is already mapped at another physical address */
3101 SHADOW_DEBUG(P2M, "aliased! mfn=%#lx, old gfn=%#lx, new gfn=%#lx\n",
3102 mfn, ogfn, gfn);
3103 if ( mfn_valid(omfn = sh_gfn_to_mfn(d, ogfn)) )
3105 SHADOW_DEBUG(P2M, "old gfn=%#lx -> mfn %#lx\n",
3106 ogfn , mfn_x(omfn));
3107 if ( mfn_x(omfn) == mfn )
3108 sh_p2m_remove_page(d, ogfn, mfn);
3112 shadow_set_p2m_entry(d, gfn, _mfn(mfn));
3113 set_gpfn_from_mfn(mfn, gfn);
3115 #if (SHADOW_OPTIMIZATIONS & SHOPT_FAST_FAULT_PATH)
3116 /* If we're doing FAST_FAULT_PATH, then shadow mode may have
3117 cached the fact that this is an mmio region in the shadow
3118 page tables. Blow the tables away to remove the cache.
3119 This is pretty heavy handed, but this is a rare operation
3120 (it might happen a dozen times during boot and then never
3121 again), so it doesn't matter too much. */
3122 shadow_blow_tables(d);
3123 #endif
3125 shadow_audit_p2m(d);
3126 shadow_unlock(d);
3129 /**************************************************************************/
3130 /* Log-dirty mode support */
3132 /* Convert a shadow to log-dirty mode. */
3133 void shadow_convert_to_log_dirty(struct vcpu *v, mfn_t smfn)
3135 BUG();
3139 /* Read a domain's log-dirty bitmap and stats.
3140 * If the operation is a CLEAN, clear the bitmap and stats as well. */
3141 static int shadow_log_dirty_op(
3142 struct domain *d, struct xen_domctl_shadow_op *sc)
3144 int i, rv = 0, clean = 0, peek = 1;
3146 domain_pause(d);
3147 shadow_lock(d);
3149 clean = (sc->op == XEN_DOMCTL_SHADOW_OP_CLEAN);
3151 SHADOW_DEBUG(LOGDIRTY, "log-dirty %s: dom %u faults=%u dirty=%u\n",
3152 (clean) ? "clean" : "peek",
3153 d->domain_id,
3154 d->arch.shadow.fault_count,
3155 d->arch.shadow.dirty_count);
3157 sc->stats.fault_count = d->arch.shadow.fault_count;
3158 sc->stats.dirty_count = d->arch.shadow.dirty_count;
3160 if ( clean )
3162 /* Need to revoke write access to the domain's pages again.
3163 * In future, we'll have a less heavy-handed approach to this,
3164 * but for now, we just unshadow everything except Xen. */
3165 shadow_blow_tables(d);
3167 d->arch.shadow.fault_count = 0;
3168 d->arch.shadow.dirty_count = 0;
3171 if ( guest_handle_is_null(sc->dirty_bitmap) )
3172 /* caller may have wanted just to clean the state or access stats. */
3173 peek = 0;
3175 if ( (peek || clean) && (d->arch.shadow.dirty_bitmap == NULL) )
3177 rv = -EINVAL; /* perhaps should be ENOMEM? */
3178 goto out;
3181 if ( sc->pages > d->arch.shadow.dirty_bitmap_size )
3182 sc->pages = d->arch.shadow.dirty_bitmap_size;
3184 #define CHUNK (8*1024) /* Transfer and clear in 1kB chunks for L1 cache. */
3185 for ( i = 0; i < sc->pages; i += CHUNK )
3187 int bytes = ((((sc->pages - i) > CHUNK)
3188 ? CHUNK
3189 : (sc->pages - i)) + 7) / 8;
3191 if ( likely(peek) )
3193 if ( copy_to_guest_offset(
3194 sc->dirty_bitmap,
3195 i/(8*sizeof(unsigned long)),
3196 d->arch.shadow.dirty_bitmap+(i/(8*sizeof(unsigned long))),
3197 (bytes+sizeof(unsigned long)-1) / sizeof(unsigned long)) )
3199 rv = -EFAULT;
3200 goto out;
3204 if ( clean )
3205 memset(d->arch.shadow.dirty_bitmap + (i/(8*sizeof(unsigned long))),
3206 0, bytes);
3208 #undef CHUNK
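/* Chunking arithmetic for the loop above: CHUNK is 8*1024 page-bits per
 * iteration, so each pass copies or clears at most (8192 + 7) / 8 = 1024
 * bytes (the "1kB chunks" mentioned where CHUNK is defined).  A partial
 * final chunk just rounds the remaining (sc->pages - i) bits up to whole
 * bytes. */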
3210 out:
3211 shadow_unlock(d);
3212 domain_unpause(d);
3213 return rv;
3217 /* Mark a page as dirty */
3218 void sh_mark_dirty(struct domain *d, mfn_t gmfn)
3220 unsigned long pfn;
3222 ASSERT(shadow_locked_by_me(d));
3224 if ( !shadow_mode_log_dirty(d) || !mfn_valid(gmfn) )
3225 return;
3227 ASSERT(d->arch.shadow.dirty_bitmap != NULL);
3229 /* We /really/ mean PFN here, even for non-translated guests. */
3230 pfn = get_gpfn_from_mfn(mfn_x(gmfn));
3232 /*
3233 * Values with the MSB set denote MFNs that aren't really part of the
3234 * domain's pseudo-physical memory map (e.g., the shared info frame).
3235 * Nothing to do here...
3236 */
3237 if ( unlikely(!VALID_M2P(pfn)) )
3238 return;
3240 /* N.B. Can use non-atomic TAS because protected by shadow_lock. */
3241 if ( likely(pfn < d->arch.shadow.dirty_bitmap_size) )
3243 if ( !__test_and_set_bit(pfn, d->arch.shadow.dirty_bitmap) )
3245 SHADOW_DEBUG(LOGDIRTY,
3246 "marked mfn %" SH_PRI_mfn " (pfn=%lx), dom %d\n",
3247 mfn_x(gmfn), pfn, d->domain_id);
3248 d->arch.shadow.dirty_count++;
3251 else
3253 SHADOW_PRINTK("mark_dirty OOR! "
3254 "mfn=%" SH_PRI_mfn " pfn=%lx max=%x (dom %d)\n"
3255 "owner=%d c=%08x t=%" PRtype_info "\n",
3256 mfn_x(gmfn),
3257 pfn,
3258 d->arch.shadow.dirty_bitmap_size,
3259 d->domain_id,
3260 (page_get_owner(mfn_to_page(gmfn))
3261 ? page_get_owner(mfn_to_page(gmfn))->domain_id
3262 : -1),
3263 mfn_to_page(gmfn)->count_info,
3264 mfn_to_page(gmfn)->u.inuse.type_info);
3268 void shadow_mark_dirty(struct domain *d, mfn_t gmfn)
3270 shadow_lock(d);
3271 sh_mark_dirty(d, gmfn);
3272 shadow_unlock(d);
3275 /**************************************************************************/
3276 /* Shadow-control XEN_DOMCTL dispatcher */
3278 int shadow_domctl(struct domain *d,
3279 xen_domctl_shadow_op_t *sc,
3280 XEN_GUEST_HANDLE(void) u_domctl)
3282 int rc, preempted = 0;
3284 if ( unlikely(d == current->domain) )
3286 gdprintk(XENLOG_INFO, "Don't try to do a shadow op on yourself!\n");
3287 return -EINVAL;
3290 switch ( sc->op )
3292 case XEN_DOMCTL_SHADOW_OP_OFF:
3293 if ( shadow_mode_log_dirty(d) )
3294 if ( (rc = shadow_log_dirty_disable(d)) != 0 )
3295 return rc;
3296 if ( is_hvm_domain(d) )
3297 return -EINVAL;
3298 if ( d->arch.shadow.mode & SHM2_enable )
3299 if ( (rc = shadow_test_disable(d)) != 0 )
3300 return rc;
3301 return 0;
3303 case XEN_DOMCTL_SHADOW_OP_ENABLE_TEST:
3304 return shadow_test_enable(d);
3306 case XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY:
3307 return shadow_log_dirty_enable(d);
3309 case XEN_DOMCTL_SHADOW_OP_ENABLE_TRANSLATE:
3310 return shadow_enable(d, SHM2_refcounts|SHM2_translate);
3312 case XEN_DOMCTL_SHADOW_OP_CLEAN:
3313 case XEN_DOMCTL_SHADOW_OP_PEEK:
3314 return shadow_log_dirty_op(d, sc);
3316 case XEN_DOMCTL_SHADOW_OP_ENABLE:
3317 if ( sc->mode & XEN_DOMCTL_SHADOW_ENABLE_LOG_DIRTY )
3318 return shadow_log_dirty_enable(d);
3319 return shadow_enable(d, sc->mode << SHM2_shift);
3321 case XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION:
3322 sc->mb = shadow_get_allocation(d);
3323 return 0;
3325 case XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION:
3326 shadow_lock(d);
3327 if ( sc->mb == 0 && shadow_mode_enabled(d) )
3329 /* Can't set the allocation to zero unless the domain stops using
3330 * shadow pagetables first */
3331 SHADOW_ERROR("Can't set shadow allocation to zero, domain %u"
3332 " is still using shadows.\n", d->domain_id);
3333 shadow_unlock(d);
3334 return -EINVAL;
3336 rc = sh_set_allocation(d, sc->mb << (20 - PAGE_SHIFT), &preempted);
3337 shadow_unlock(d);
3338 if ( preempted )
3339 /* Not finished. Set up to re-run the call. */
3340 rc = hypercall_create_continuation(
3341 __HYPERVISOR_domctl, "h", u_domctl);
3342 else
3343 /* Finished. Return the new allocation */
3344 sc->mb = shadow_get_allocation(d);
3345 return rc;
3347 default:
3348 SHADOW_ERROR("Bad shadow op %u\n", sc->op);
3349 return -EINVAL;
3354 /**************************************************************************/
3355 /* Auditing shadow tables */
3357 #if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_FULL
3359 void shadow_audit_tables(struct vcpu *v)
3361 /* Dispatch table for getting per-type functions */
3362 static hash_callback_t callbacks[16] = {
3363 NULL, /* none */
3364 #if CONFIG_PAGING_LEVELS == 2
3365 SHADOW_INTERNAL_NAME(sh_audit_l1_table,2,2), /* l1_32 */
3366 SHADOW_INTERNAL_NAME(sh_audit_fl1_table,2,2), /* fl1_32 */
3367 SHADOW_INTERNAL_NAME(sh_audit_l2_table,2,2), /* l2_32 */
3368 #else
3369 SHADOW_INTERNAL_NAME(sh_audit_l1_table,3,2), /* l1_32 */
3370 SHADOW_INTERNAL_NAME(sh_audit_fl1_table,3,2), /* fl1_32 */
3371 SHADOW_INTERNAL_NAME(sh_audit_l2_table,3,2), /* l2_32 */
3372 SHADOW_INTERNAL_NAME(sh_audit_l1_table,3,3), /* l1_pae */
3373 SHADOW_INTERNAL_NAME(sh_audit_fl1_table,3,3), /* fl1_pae */
3374 SHADOW_INTERNAL_NAME(sh_audit_l2_table,3,3), /* l2_pae */
3375 SHADOW_INTERNAL_NAME(sh_audit_l2_table,3,3), /* l2h_pae */
3376 #if CONFIG_PAGING_LEVELS >= 4
3377 SHADOW_INTERNAL_NAME(sh_audit_l1_table,4,4), /* l1_64 */
3378 SHADOW_INTERNAL_NAME(sh_audit_fl1_table,4,4), /* fl1_64 */
3379 SHADOW_INTERNAL_NAME(sh_audit_l2_table,4,4), /* l2_64 */
3380 SHADOW_INTERNAL_NAME(sh_audit_l3_table,4,4), /* l3_64 */
3381 SHADOW_INTERNAL_NAME(sh_audit_l4_table,4,4), /* l4_64 */
3382 #endif /* CONFIG_PAGING_LEVELS >= 4 */
3383 #endif /* CONFIG_PAGING_LEVELS > 2 */
3384 NULL /* All the rest */
3385 };
3386 unsigned int mask;
3388 if ( !(SHADOW_AUDIT_ENABLE) )
3389 return;
3391 if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_FULL )
3392 mask = ~1; /* Audit every table in the system */
3393 else
3395 /* Audit only the current mode's tables */
3396 switch ( v->arch.shadow.mode->guest_levels )
3398 case 2: mask = (SHF_L1_32|SHF_FL1_32|SHF_L2_32); break;
3399 case 3: mask = (SHF_L1_PAE|SHF_FL1_PAE|SHF_L2_PAE
3400 |SHF_L2H_PAE); break;
3401 case 4: mask = (SHF_L1_64|SHF_FL1_64|SHF_L2_64
3402 |SHF_L3_64|SHF_L4_64); break;
3403 default: BUG();
3407 hash_foreach(v, ~1, callbacks, _mfn(INVALID_MFN));
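/* Note that the walk above passes ~1 (every type except 'none') rather
 * than the mode-specific mask computed just before it; with
 * SHADOW_AUDIT_ENTRIES_FULL compiled in, the runtime check always takes
 * the full-audit branch, so the two values agree anyway. */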
3410 #endif /* Shadow audit */
3413 /**************************************************************************/
3414 /* Auditing p2m tables */
3416 #if SHADOW_AUDIT & SHADOW_AUDIT_P2M
3418 void shadow_audit_p2m(struct domain *d)
3420 struct list_head *entry;
3421 struct page_info *page;
3422 struct domain *od;
3423 unsigned long mfn, gfn, m2pfn, lp2mfn = 0;
3424 mfn_t p2mfn;
3425 unsigned long orphans_d = 0, orphans_i = 0, mpbad = 0, pmbad = 0;
3426 int test_linear;
3428 if ( !(SHADOW_AUDIT_ENABLE) || !shadow_mode_translate(d) )
3429 return;
3431 //SHADOW_PRINTK("p2m audit starts\n");
3433 test_linear = ( (d == current->domain)
3434 && !pagetable_is_null(current->arch.monitor_table) );
3435 if ( test_linear )
3436 local_flush_tlb();
3438 /* Audit part one: walk the domain's page allocation list, checking
3439 * the m2p entries. */
3440 for ( entry = d->page_list.next;
3441 entry != &d->page_list;
3442 entry = entry->next )
3444 page = list_entry(entry, struct page_info, list);
3445 mfn = mfn_x(page_to_mfn(page));
3447 // SHADOW_PRINTK("auditing guest page, mfn=%#lx\n", mfn);
3449 od = page_get_owner(page);
3451 if ( od != d )
3453 SHADOW_PRINTK("wrong owner %#lx -> %p(%u) != %p(%u)\n",
3454 mfn, od, (od?od->domain_id:-1), d, d->domain_id);
3455 continue;
3458 gfn = get_gpfn_from_mfn(mfn);
3459 if ( gfn == INVALID_M2P_ENTRY )
3461 orphans_i++;
3462 //SHADOW_PRINTK("orphaned guest page: mfn=%#lx has invalid gfn\n",
3463 // mfn);
3464 continue;
3467 if ( gfn == 0x55555555 )
3469 orphans_d++;
3470 //SHADOW_PRINTK("orphaned guest page: mfn=%#lx has debug gfn\n",
3471 // mfn);
3472 continue;
3475 p2mfn = sh_gfn_to_mfn_foreign(d, gfn);
3476 if ( mfn_x(p2mfn) != mfn )
3478 mpbad++;
3479 SHADOW_PRINTK("map mismatch mfn %#lx -> gfn %#lx -> mfn %#lx"
3480 " (-> gfn %#lx)\n",
3481 mfn, gfn, mfn_x(p2mfn),
3482 (mfn_valid(p2mfn)
3483 ? get_gpfn_from_mfn(mfn_x(p2mfn))
3484 : -1u));
3485 /* This m2p entry is stale: the domain has another frame in
3486 * this physical slot. No great disaster, but for neatness,
3487 * blow away the m2p entry. */
3488 set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY);
3491 if ( test_linear && (gfn <= d->arch.max_mapped_pfn) )
3493 lp2mfn = gfn_to_mfn_current(gfn);
3494 if ( mfn_x(lp2mfn) != mfn_x(p2mfn) )
3496 SHADOW_PRINTK("linear mismatch gfn %#lx -> mfn %#lx "
3497 "(!= mfn %#lx)\n", gfn,
3498 mfn_x(lp2mfn), mfn_x(p2mfn));
3502 // SHADOW_PRINTK("OK: mfn=%#lx, gfn=%#lx, p2mfn=%#lx, lp2mfn=%#lx\n",
3503 // mfn, gfn, p2mfn, lp2mfn);
3506 /* Audit part two: walk the domain's p2m table, checking the entries. */
3507 if ( pagetable_get_pfn(d->arch.phys_table) != 0 )
3509 l2_pgentry_t *l2e;
3510 l1_pgentry_t *l1e;
3511 int i1, i2;
3513 #if CONFIG_PAGING_LEVELS == 4
3514 l4_pgentry_t *l4e;
3515 l3_pgentry_t *l3e;
3516 int i3, i4;
3517 l4e = sh_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
3518 #elif CONFIG_PAGING_LEVELS == 3
3519 l3_pgentry_t *l3e;
3520 int i3;
3521 l3e = sh_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
3522 #else /* CONFIG_PAGING_LEVELS == 2 */
3523 l2e = sh_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
3524 #endif
3526 gfn = 0;
3527 #if CONFIG_PAGING_LEVELS >= 3
3528 #if CONFIG_PAGING_LEVELS >= 4
3529 for ( i4 = 0; i4 < L4_PAGETABLE_ENTRIES; i4++ )
3531 if ( !(l4e_get_flags(l4e[i4]) & _PAGE_PRESENT) )
3533 gfn += 1 << (L4_PAGETABLE_SHIFT - PAGE_SHIFT);
3534 continue;
3536 l3e = sh_map_domain_page(_mfn(l4e_get_pfn(l4e[i4])));
3537 #endif /* now at levels 3 or 4... */
3538 for ( i3 = 0;
3539 i3 < ((CONFIG_PAGING_LEVELS==4) ? L3_PAGETABLE_ENTRIES : 8);
3540 i3++ )
3542 if ( !(l3e_get_flags(l3e[i3]) & _PAGE_PRESENT) )
3544 gfn += 1 << (L3_PAGETABLE_SHIFT - PAGE_SHIFT);
3545 continue;
3547 l2e = sh_map_domain_page(_mfn(l3e_get_pfn(l3e[i3])));
3548 #endif /* all levels... */
3549 for ( i2 = 0; i2 < L2_PAGETABLE_ENTRIES; i2++ )
3551 if ( !(l2e_get_flags(l2e[i2]) & _PAGE_PRESENT) )
3553 gfn += 1 << (L2_PAGETABLE_SHIFT - PAGE_SHIFT);
3554 continue;
3556 l1e = sh_map_domain_page(_mfn(l2e_get_pfn(l2e[i2])));
3558 for ( i1 = 0; i1 < L1_PAGETABLE_ENTRIES; i1++, gfn++ )
3560 if ( !(l1e_get_flags(l1e[i1]) & _PAGE_PRESENT) )
3561 continue;
3562 mfn = l1e_get_pfn(l1e[i1]);
3563 ASSERT(mfn_valid(_mfn(mfn)));
3564 m2pfn = get_gpfn_from_mfn(mfn);
3565 if ( m2pfn != gfn )
3567 pmbad++;
3568 SHADOW_PRINTK("mismatch: gfn %#lx -> mfn %#lx"
3569 " -> gfn %#lx\n", gfn, mfn, m2pfn);
3570 BUG();
3573 sh_unmap_domain_page(l1e);
3575 #if CONFIG_PAGING_LEVELS >= 3
3576 sh_unmap_domain_page(l2e);
3578 #if CONFIG_PAGING_LEVELS >= 4
3579 sh_unmap_domain_page(l3e);
3581 #endif
3582 #endif
3584 #if CONFIG_PAGING_LEVELS == 4
3585 sh_unmap_domain_page(l4e);
3586 #elif CONFIG_PAGING_LEVELS == 3
3587 sh_unmap_domain_page(l3e);
3588 #else /* CONFIG_PAGING_LEVELS == 2 */
3589 sh_unmap_domain_page(l2e);
3590 #endif
3594 //SHADOW_PRINTK("p2m audit complete\n");
3595 //if ( orphans_i | orphans_d | mpbad | pmbad )
3596 // SHADOW_PRINTK("p2m audit found %lu orphans (%lu inval %lu debug)\n",
3597 // orphans_i + orphans_d, orphans_i, orphans_d,
3598 if ( mpbad | pmbad )
3599 SHADOW_PRINTK("p2m audit found %lu odd p2m, %lu bad m2p entries\n",
3600 pmbad, mpbad);
3603 #endif /* p2m audit */
3605 /*
3606 * Local variables:
3607 * mode: C
3608 * c-set-style: "BSD"
3609 * c-basic-offset: 4
3610 * indent-tabs-mode: nil
3611 * End:
3612 */