ia64/xen-unstable

view xen/arch/x86/mm/shadow/common.c @ 12895:f5121d001d1a

[XEN] Shadow-mode-refcount PTE update fix.

Add back in support for emulated PTE updates, which is critical for
shadow_refcount operation.

Signed-off-by: Keir Fraser <keir@xensource.com>
author kaf24@localhost.localdomain
date Sat Dec 09 16:29:52 2006 +0000 (2006-12-09)
parents 963a02c040f6
children aabceba1dbc5
line source
1 /******************************************************************************
2 * arch/x86/mm/shadow/common.c
3 *
4 * Shadow code that does not need to be multiply compiled.
5 * Parts of this code are Copyright (c) 2006 by XenSource Inc.
6 * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
7 * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 */
24 #include <xen/config.h>
25 #include <xen/types.h>
26 #include <xen/mm.h>
27 #include <xen/trace.h>
28 #include <xen/sched.h>
29 #include <xen/perfc.h>
30 #include <xen/irq.h>
31 #include <xen/domain_page.h>
32 #include <xen/guest_access.h>
33 #include <xen/keyhandler.h>
34 #include <asm/event.h>
35 #include <asm/page.h>
36 #include <asm/current.h>
37 #include <asm/flushtlb.h>
38 #include <asm/shadow.h>
39 #include "private.h"
41 #if SHADOW_AUDIT
42 int shadow_audit_enable = 0;
44 static void shadow_audit_key(unsigned char key)
45 {
46 shadow_audit_enable = !shadow_audit_enable;
47 printk("%s shadow_audit_enable=%d\n",
48 __func__, shadow_audit_enable);
49 }
51 static int __init shadow_audit_key_init(void)
52 {
53 register_keyhandler(
54 'O', shadow_audit_key, "toggle shadow audits");
55 return 0;
56 }
57 __initcall(shadow_audit_key_init);
58 #endif /* SHADOW_AUDIT */
60 static void sh_free_log_dirty_bitmap(struct domain *d);
62 int _shadow_mode_refcounts(struct domain *d)
63 {
64 return shadow_mode_refcounts(d);
65 }
68 /**************************************************************************/
69 /* x86 emulator support for the shadow code
70 */
72 struct segment_register *hvm_get_seg_reg(
73 enum x86_segment seg, struct sh_emulate_ctxt *sh_ctxt)
74 {
75 struct segment_register *seg_reg = &sh_ctxt->seg_reg[seg];
76 if ( !__test_and_set_bit(seg, &sh_ctxt->valid_seg_regs) )
77 hvm_get_segment_register(current, seg, seg_reg);
78 return seg_reg;
79 }
81 enum hvm_access_type {
82 hvm_access_insn_fetch, hvm_access_read, hvm_access_write
83 };
85 static int hvm_translate_linear_addr(
86 enum x86_segment seg,
87 unsigned long offset,
88 unsigned int bytes,
89 enum hvm_access_type access_type,
90 struct sh_emulate_ctxt *sh_ctxt,
91 unsigned long *paddr)
92 {
93 struct segment_register *reg = hvm_get_seg_reg(seg, sh_ctxt);
94 unsigned long limit, addr = offset;
95 uint32_t last_byte;
97 if ( sh_ctxt->ctxt.mode != X86EMUL_MODE_PROT64 )
98 {
99 /*
100 * COMPATIBILITY MODE: Apply segment checks and add base.
101 */
103 switch ( access_type )
104 {
105 case hvm_access_read:
106 if ( (reg->attr.fields.type & 0xa) == 0x8 )
107 goto gpf; /* execute-only code segment */
108 break;
109 case hvm_access_write:
110 if ( (reg->attr.fields.type & 0xa) != 0x2 )
111 goto gpf; /* not a writable data segment */
112 break;
113 default:
114 break;
115 }
117 /* Calculate the segment limit, including granularity flag. */
118 limit = reg->limit;
119 if ( reg->attr.fields.g )
120 limit = (limit << 12) | 0xfff;
122 last_byte = offset + bytes - 1;
124 /* Is this a grows-down data segment? Special limit check if so. */
125 if ( (reg->attr.fields.type & 0xc) == 0x4 )
126 {
127 /* Is upper limit 0xFFFF or 0xFFFFFFFF? */
128 if ( !reg->attr.fields.db )
129 last_byte = (uint16_t)last_byte;
131 /* Check first byte and last byte against respective bounds. */
132 if ( (offset <= limit) || (last_byte < offset) )
133 goto gpf;
134 }
135 else if ( (last_byte > limit) || (last_byte < offset) )
136 goto gpf; /* last byte is beyond limit or wraps 0xFFFFFFFF */
138 /*
139 * Hardware truncates to 32 bits in compatibility mode.
140 * It does not truncate to 16 bits in 16-bit address-size mode.
141 */
142 addr = (uint32_t)(addr + reg->base);
143 }
144 else
145 {
146 /*
147 * LONG MODE: FS and GS add segment base. Addresses must be canonical.
148 */
150 if ( (seg == x86_seg_fs) || (seg == x86_seg_gs) )
151 addr += reg->base;
153 if ( !is_canonical_address(addr) )
154 goto gpf;
155 }
157 *paddr = addr;
158 return 0;
160 gpf:
161 /* Inject #GP(0). */
162 hvm_inject_exception(TRAP_gp_fault, 0, 0);
163 return X86EMUL_PROPAGATE_FAULT;
164 }
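Most of hvm_translate_linear_addr() is ordinary base-plus-limit arithmetic; the subtle case is the grows-down data segment ((type & 0xc) == 0x4), where the valid offsets lie strictly above the limit, up to 0xFFFF or 0xFFFFFFFF depending on the D/B bit. Below is a self-contained sketch of just the limit checks, mirroring the logic above; struct fake_seg and access_ok are made-up names for illustration only, not part of this file.

#include <stdint.h>

/* Illustrative only: mirrors the expand-up vs expand-down limit checks
 * performed by hvm_translate_linear_addr().  'seg' is a simplified,
 * hypothetical descriptor, not the Xen segment_register. */
struct fake_seg {
    uint32_t limit;      /* already scaled by the granularity bit */
    int grows_down;      /* (attr.type & 0xc) == 0x4 */
    int big;             /* D/B bit: 1 => 32-bit upper bound */
};

static int access_ok(const struct fake_seg *seg,
                     uint32_t offset, uint32_t bytes)
{
    uint32_t last = offset + bytes - 1;

    if ( seg->grows_down )
    {
        if ( !seg->big )
            last = (uint16_t)last;
        /* Valid range is (limit, 0xFFFF or 0xFFFFFFFF]. */
        return (offset > seg->limit) && (last >= offset);
    }
    /* Normal segment: stay within [0, limit] and do not wrap. */
    return (last <= seg->limit) && (last >= offset);
}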
166 static int
167 hvm_read(enum x86_segment seg,
168 unsigned long offset,
169 unsigned long *val,
170 unsigned int bytes,
171 enum hvm_access_type access_type,
172 struct sh_emulate_ctxt *sh_ctxt)
173 {
174 unsigned long addr;
175 int rc, errcode;
177 rc = hvm_translate_linear_addr(
178 seg, offset, bytes, access_type, sh_ctxt, &addr);
179 if ( rc )
180 return rc;
182 *val = 0;
183 // XXX -- this is WRONG.
184 // It entirely ignores the permissions in the page tables.
185 // In this case, that is only a user vs supervisor access check.
186 //
187 if ( (rc = hvm_copy_from_guest_virt(val, addr, bytes)) == 0 )
188 return X86EMUL_CONTINUE;
190 /* If we got here, there was nothing mapped here, or a bad GFN
191 * was mapped here. This should never happen: we're here because
192 * of a write fault at the end of the instruction we're emulating. */
193 SHADOW_PRINTK("read failed to va %#lx\n", addr);
194 errcode = ring_3(sh_ctxt->ctxt.regs) ? PFEC_user_mode : 0;
195 if ( access_type == hvm_access_insn_fetch )
196 errcode |= PFEC_insn_fetch;
197 hvm_inject_exception(TRAP_page_fault, errcode, addr + bytes - rc);
198 return X86EMUL_PROPAGATE_FAULT;
199 }
201 static int
202 hvm_emulate_read(enum x86_segment seg,
203 unsigned long offset,
204 unsigned long *val,
205 unsigned int bytes,
206 struct x86_emulate_ctxt *ctxt)
207 {
208 return hvm_read(seg, offset, val, bytes, hvm_access_read,
209 container_of(ctxt, struct sh_emulate_ctxt, ctxt));
210 }
212 static int
213 hvm_emulate_insn_fetch(enum x86_segment seg,
214 unsigned long offset,
215 unsigned long *val,
216 unsigned int bytes,
217 struct x86_emulate_ctxt *ctxt)
218 {
219 struct sh_emulate_ctxt *sh_ctxt =
220 container_of(ctxt, struct sh_emulate_ctxt, ctxt);
221 unsigned int insn_off = offset - ctxt->regs->eip;
223 /* Fall back if requested bytes are not in the prefetch cache. */
224 if ( unlikely((insn_off + bytes) > sh_ctxt->insn_buf_bytes) )
225 return hvm_read(seg, offset, val, bytes,
226 hvm_access_insn_fetch, sh_ctxt);
228 /* Hit the cache. Simple memcpy. */
229 *val = 0;
230 memcpy(val, &sh_ctxt->insn_buf[insn_off], bytes);
231 return X86EMUL_CONTINUE;
232 }
234 static int
235 hvm_emulate_write(enum x86_segment seg,
236 unsigned long offset,
237 unsigned long val,
238 unsigned int bytes,
239 struct x86_emulate_ctxt *ctxt)
240 {
241 struct sh_emulate_ctxt *sh_ctxt =
242 container_of(ctxt, struct sh_emulate_ctxt, ctxt);
243 struct vcpu *v = current;
244 unsigned long addr;
245 int rc;
247 rc = hvm_translate_linear_addr(
248 seg, offset, bytes, hvm_access_write, sh_ctxt, &addr);
249 if ( rc )
250 return rc;
252 return v->arch.shadow.mode->x86_emulate_write(
253 v, addr, &val, bytes, sh_ctxt);
254 }
256 static int
257 hvm_emulate_cmpxchg(enum x86_segment seg,
258 unsigned long offset,
259 unsigned long old,
260 unsigned long new,
261 unsigned int bytes,
262 struct x86_emulate_ctxt *ctxt)
263 {
264 struct sh_emulate_ctxt *sh_ctxt =
265 container_of(ctxt, struct sh_emulate_ctxt, ctxt);
266 struct vcpu *v = current;
267 unsigned long addr;
268 int rc;
270 rc = hvm_translate_linear_addr(
271 seg, offset, bytes, hvm_access_write, sh_ctxt, &addr);
272 if ( rc )
273 return rc;
275 return v->arch.shadow.mode->x86_emulate_cmpxchg(
276 v, addr, old, new, bytes, sh_ctxt);
277 }
279 static int
280 hvm_emulate_cmpxchg8b(enum x86_segment seg,
281 unsigned long offset,
282 unsigned long old_lo,
283 unsigned long old_hi,
284 unsigned long new_lo,
285 unsigned long new_hi,
286 struct x86_emulate_ctxt *ctxt)
287 {
288 struct sh_emulate_ctxt *sh_ctxt =
289 container_of(ctxt, struct sh_emulate_ctxt, ctxt);
290 struct vcpu *v = current;
291 unsigned long addr;
292 int rc;
294 rc = hvm_translate_linear_addr(
295 seg, offset, 8, hvm_access_write, sh_ctxt, &addr);
296 if ( rc )
297 return rc;
299 return v->arch.shadow.mode->x86_emulate_cmpxchg8b(
300 v, addr, old_lo, old_hi, new_lo, new_hi, sh_ctxt);
301 }
303 static struct x86_emulate_ops hvm_shadow_emulator_ops = {
304 .read = hvm_emulate_read,
305 .insn_fetch = hvm_emulate_insn_fetch,
306 .write = hvm_emulate_write,
307 .cmpxchg = hvm_emulate_cmpxchg,
308 .cmpxchg8b = hvm_emulate_cmpxchg8b,
309 };
311 static int
312 pv_emulate_read(enum x86_segment seg,
313 unsigned long offset,
314 unsigned long *val,
315 unsigned int bytes,
316 struct x86_emulate_ctxt *ctxt)
317 {
318 unsigned int rc;
320 *val = 0;
321 if ( (rc = copy_from_user((void *)val, (void *)offset, bytes)) != 0 )
322 {
323 propagate_page_fault(offset + bytes - rc, 0); /* read fault */
324 return X86EMUL_PROPAGATE_FAULT;
325 }
327 return X86EMUL_CONTINUE;
328 }
330 static int
331 pv_emulate_write(enum x86_segment seg,
332 unsigned long offset,
333 unsigned long val,
334 unsigned int bytes,
335 struct x86_emulate_ctxt *ctxt)
336 {
337 struct sh_emulate_ctxt *sh_ctxt =
338 container_of(ctxt, struct sh_emulate_ctxt, ctxt);
339 struct vcpu *v = current;
340 return v->arch.shadow.mode->x86_emulate_write(
341 v, offset, &val, bytes, sh_ctxt);
342 }
344 static int
345 pv_emulate_cmpxchg(enum x86_segment seg,
346 unsigned long offset,
347 unsigned long old,
348 unsigned long new,
349 unsigned int bytes,
350 struct x86_emulate_ctxt *ctxt)
351 {
352 struct sh_emulate_ctxt *sh_ctxt =
353 container_of(ctxt, struct sh_emulate_ctxt, ctxt);
354 struct vcpu *v = current;
355 return v->arch.shadow.mode->x86_emulate_cmpxchg(
356 v, offset, old, new, bytes, sh_ctxt);
357 }
359 static int
360 pv_emulate_cmpxchg8b(enum x86_segment seg,
361 unsigned long offset,
362 unsigned long old_lo,
363 unsigned long old_hi,
364 unsigned long new_lo,
365 unsigned long new_hi,
366 struct x86_emulate_ctxt *ctxt)
367 {
368 struct sh_emulate_ctxt *sh_ctxt =
369 container_of(ctxt, struct sh_emulate_ctxt, ctxt);
370 struct vcpu *v = current;
371 return v->arch.shadow.mode->x86_emulate_cmpxchg8b(
372 v, offset, old_lo, old_hi, new_lo, new_hi, sh_ctxt);
373 }
375 static struct x86_emulate_ops pv_shadow_emulator_ops = {
376 .read = pv_emulate_read,
377 .insn_fetch = pv_emulate_read,
378 .write = pv_emulate_write,
379 .cmpxchg = pv_emulate_cmpxchg,
380 .cmpxchg8b = pv_emulate_cmpxchg8b,
381 };
383 struct x86_emulate_ops *shadow_init_emulation(
384 struct sh_emulate_ctxt *sh_ctxt, struct cpu_user_regs *regs)
385 {
386 struct segment_register *creg;
387 struct vcpu *v = current;
388 unsigned long addr;
390 sh_ctxt->ctxt.regs = regs;
392 if ( !is_hvm_vcpu(v) )
393 {
394 sh_ctxt->ctxt.mode = X86EMUL_MODE_HOST;
395 return &pv_shadow_emulator_ops;
396 }
398 /* Segment cache initialisation. Primed with CS. */
399 sh_ctxt->valid_seg_regs = 0;
400 creg = hvm_get_seg_reg(x86_seg_cs, sh_ctxt);
402 /* Work out the emulation mode. */
403 if ( hvm_long_mode_enabled(v) )
404 sh_ctxt->ctxt.mode = creg->attr.fields.l ?
405 X86EMUL_MODE_PROT64 : X86EMUL_MODE_PROT32;
406 else if ( regs->eflags & X86_EFLAGS_VM )
407 sh_ctxt->ctxt.mode = X86EMUL_MODE_REAL;
408 else
409 sh_ctxt->ctxt.mode = creg->attr.fields.db ?
410 X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
412 /* Attempt to prefetch whole instruction. */
413 sh_ctxt->insn_buf_bytes =
414 (!hvm_translate_linear_addr(
415 x86_seg_cs, regs->eip, sizeof(sh_ctxt->insn_buf),
416 hvm_access_insn_fetch, sh_ctxt, &addr) &&
417 !hvm_copy_from_guest_virt(
418 sh_ctxt->insn_buf, addr, sizeof(sh_ctxt->insn_buf)))
419 ? sizeof(sh_ctxt->insn_buf) : 0;
421 return &hvm_shadow_emulator_ops;
422 }
424 /**************************************************************************/
425 /* Code for "promoting" a guest page to the point where the shadow code is
426 * willing to let it be treated as a guest page table. This generally
427 * involves making sure there are no writable mappings available to the guest
428 * for this page.
429 */
430 void shadow_promote(struct vcpu *v, mfn_t gmfn, unsigned int type)
431 {
432 struct page_info *page = mfn_to_page(gmfn);
434 ASSERT(mfn_valid(gmfn));
436 /* We should never try to promote a gmfn that has writeable mappings */
437 ASSERT(shadow_remove_write_access(v, gmfn, 0, 0) == 0);
439 /* Is the page already shadowed? */
440 if ( !test_and_set_bit(_PGC_page_table, &page->count_info) )
441 page->shadow_flags = 0;
443 ASSERT(!test_bit(type, &page->shadow_flags));
444 set_bit(type, &page->shadow_flags);
445 }
447 void shadow_demote(struct vcpu *v, mfn_t gmfn, u32 type)
448 {
449 struct page_info *page = mfn_to_page(gmfn);
451 ASSERT(test_bit(_PGC_page_table, &page->count_info));
452 ASSERT(test_bit(type, &page->shadow_flags));
454 clear_bit(type, &page->shadow_flags);
456 if ( (page->shadow_flags & SHF_page_type_mask) == 0 )
457 {
458 /* tlbflush timestamp field is valid again */
459 page->tlbflush_timestamp = tlbflush_current_time();
460 clear_bit(_PGC_page_table, &page->count_info);
461 }
462 }
464 /**************************************************************************/
465 /* Validate a pagetable change from the guest and update the shadows.
466 * Returns a bitmask of SHADOW_SET_* flags. */
468 int
469 __shadow_validate_guest_entry(struct vcpu *v, mfn_t gmfn,
470 void *entry, u32 size)
471 {
472 int result = 0;
473 struct page_info *page = mfn_to_page(gmfn);
475 sh_mark_dirty(v->domain, gmfn);
477 // Determine which types of shadows are affected, and update each.
478 //
479 // Always validate L1s before L2s to prevent another cpu with a linear
480 // mapping of this gmfn from seeing a walk that results from
481 // using the new L2 value and the old L1 value. (It is OK for such a
482 // guest to see a walk that uses the old L2 value with the new L1 value,
483 // as hardware could behave this way if one level of the pagewalk occurs
484 // before the store, and the next level of the pagewalk occurs after the
485 // store.)
486 //
487 // Ditto for L2s before L3s, etc.
488 //
490 if ( !(page->count_info & PGC_page_table) )
491 return 0; /* Not shadowed at all */
493 #if CONFIG_PAGING_LEVELS == 2
494 if ( page->shadow_flags & SHF_L1_32 )
495 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 2, 2)
496 (v, gmfn, entry, size);
497 #else
498 if ( page->shadow_flags & SHF_L1_32 )
499 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 3, 2)
500 (v, gmfn, entry, size);
501 #endif
503 #if CONFIG_PAGING_LEVELS == 2
504 if ( page->shadow_flags & SHF_L2_32 )
505 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 2, 2)
506 (v, gmfn, entry, size);
507 #else
508 if ( page->shadow_flags & SHF_L2_32 )
509 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 3, 2)
510 (v, gmfn, entry, size);
511 #endif
513 #if CONFIG_PAGING_LEVELS >= 3
514 if ( page->shadow_flags & SHF_L1_PAE )
515 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 3, 3)
516 (v, gmfn, entry, size);
517 if ( page->shadow_flags & SHF_L2_PAE )
518 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 3, 3)
519 (v, gmfn, entry, size);
520 if ( page->shadow_flags & SHF_L2H_PAE )
521 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2he, 3, 3)
522 (v, gmfn, entry, size);
523 #else /* 32-bit non-PAE hypervisor does not support PAE guests */
524 ASSERT((page->shadow_flags & (SHF_L2H_PAE|SHF_L2_PAE|SHF_L1_PAE)) == 0);
525 #endif
527 #if CONFIG_PAGING_LEVELS >= 4
528 if ( page->shadow_flags & SHF_L1_64 )
529 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 4, 4)
530 (v, gmfn, entry, size);
531 if ( page->shadow_flags & SHF_L2_64 )
532 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 4, 4)
533 (v, gmfn, entry, size);
534 if ( page->shadow_flags & SHF_L3_64 )
535 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl3e, 4, 4)
536 (v, gmfn, entry, size);
537 if ( page->shadow_flags & SHF_L4_64 )
538 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl4e, 4, 4)
539 (v, gmfn, entry, size);
540 #else /* 32-bit/PAE hypervisor does not support 64-bit guests */
541 ASSERT((page->shadow_flags
542 & (SHF_L4_64|SHF_L3_64|SHF_L2_64|SHF_L1_64)) == 0);
543 #endif
545 return result;
546 }
549 int
550 shadow_validate_guest_entry(struct vcpu *v, mfn_t gmfn, void *entry)
551 /* This is the entry point from hypercalls. It returns a bitmask of all the
552 * results of shadow_set_l*e() calls, so the caller knows to do TLB flushes. */
553 {
554 int rc;
556 ASSERT(shadow_lock_is_acquired(v->domain));
557 rc = __shadow_validate_guest_entry(v, gmfn, entry, sizeof(l1_pgentry_t));
558 shadow_audit_tables(v);
559 return rc;
560 }
562 void
563 shadow_validate_guest_pt_write(struct vcpu *v, mfn_t gmfn,
564 void *entry, u32 size)
565 /* This is the entry point for emulated writes to pagetables in HVM guests and
566 * PV translated guests.
567 */
568 {
569 struct domain *d = v->domain;
570 int rc;
572 ASSERT(shadow_lock_is_acquired(v->domain));
573 rc = __shadow_validate_guest_entry(v, gmfn, entry, size);
574 if ( rc & SHADOW_SET_FLUSH )
575 /* Need to flush TLBs to pick up shadow PT changes */
576 flush_tlb_mask(d->domain_dirty_cpumask);
577 if ( rc & SHADOW_SET_ERROR )
578 {
579 /* This page is probably not a pagetable any more: tear it out of the
580 * shadows, along with any tables that reference it.
581 * Since the validate call above will have made a "safe" (i.e. zero)
582 * shadow entry, we can let the domain live even if we can't fully
583 * unshadow the page. */
584 sh_remove_shadows(v, gmfn, 0, 0);
585 }
586 }
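shadow_validate_guest_pt_write() is what makes the emulated PTE updates restored by this changeset work: the per-mode x86_emulate_write/cmpxchg handlers invoked through the emulator ops above are expected to perform the guest write and then call it so the shadows are resynced. The real handlers live in the per-mode code (multi.c) and are not shown here; the following is only a rough sketch of that calling pattern, and emulate_map_dest() is a hypothetical helper standing in for the guest-walk and map step.

/* Hypothetical helper: walk the guest tables for 'vaddr', check that it
 * maps a shadowed pagetable, and return a mapping of the target
 * (NULL on failure), filling in *gmfnp. */
void *emulate_map_dest(struct vcpu *v, unsigned long vaddr,
                       struct sh_emulate_ctxt *sh_ctxt, mfn_t *gmfnp);

/* Rough sketch only: the real per-mode handler also runs under the shadow
 * lock and copes with unaligned or cross-page writes. */
static int sketch_x86_emulate_write(struct vcpu *v, unsigned long vaddr,
                                    void *src, u32 bytes,
                                    struct sh_emulate_ctxt *sh_ctxt)
{
    mfn_t gmfn;
    void *addr = emulate_map_dest(v, vaddr, sh_ctxt, &gmfn);

    if ( addr == NULL )
        return X86EMUL_PROPAGATE_FAULT;

    memcpy(addr, src, bytes);                             /* guest write   */
    shadow_validate_guest_pt_write(v, gmfn, addr, bytes); /* resync shadow */

    sh_unmap_domain_page(addr);
    return X86EMUL_CONTINUE;
}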
589 /**************************************************************************/
590 /* Memory management for shadow pages. */
592 /* Allocating shadow pages
593 * -----------------------
594 *
595 * Most shadow pages are allocated singly, but there is one case where
596 * we need to allocate multiple pages together: shadowing 32-bit guest
597 * tables on PAE or 64-bit shadows. A 32-bit guest l1 table covers 4MB
598 * of virtual address space, and needs to be shadowed by two PAE/64-bit
599 * l1 tables (covering 2MB of virtual address space each). Similarly, a
600 * 32-bit guest l2 table (4GB va) needs to be shadowed by four
601 * PAE/64-bit l2 tables (1GB va each). These multi-page shadows are
602 * contiguous and aligned; functions for handling offsets into them are
603 * defined in shadow.c (shadow_l1_index() etc.)
604 *
605 * This table shows the allocation behaviour of the different modes:
606 *
607 * Xen paging     32b  pae  pae  64b  64b  64b
608 * Guest paging   32b  32b  pae  32b  pae  64b
609 * PV or HVM        *  HVM    *  HVM  HVM    *
610 * Shadow paging  32b  pae  pae  pae  pae  64b
611 *
612 * sl1 size        4k   8k   4k   8k   4k   4k
613 * sl2 size        4k  16k   4k  16k   4k   4k
614 * sl3 size         -    -    -    -    -   4k
615 * sl4 size         -    -    -    -    -   4k
616 *
617 * We allocate memory from xen in four-page units and break them down
618 * with a simple buddy allocator. Can't use the xen allocator to handle
619 * this as it only works for contiguous zones, and a domain's shadow
620 * pool is made of fragments.
621 *
622 * In HVM guests, the p2m table is built out of shadow pages, and we provide
623 * a function for the p2m management to steal pages, in max-order chunks, from
624 * the free pool. We don't provide for giving them back, yet.
625 */
627 /* Figure out the least acceptable quantity of shadow memory.
628 * The minimum memory requirement for always being able to free up a
629 * chunk of memory is very small -- only three max-order chunks per
630 * vcpu to hold the top level shadows and pages with Xen mappings in them.
631 *
632 * But for a guest to be guaranteed to successfully execute a single
633 * instruction, we must be able to map a large number (about thirty) VAs
634 * at the same time, which means that to guarantee progress, we must
635 * allow for more than ninety allocated pages per vcpu. We round that
636 * up to 128 pages, or half a megabyte per vcpu. */
637 unsigned int shadow_min_acceptable_pages(struct domain *d)
638 {
639 u32 vcpu_count = 0;
640 struct vcpu *v;
642 for_each_vcpu(d, v)
643 vcpu_count++;
645 return (vcpu_count * 128);
646 }
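For concreteness: with 4 KiB pages, 128 pages per vcpu is 128 x 4 KiB = 512 KiB, the "half a megabyte per vcpu" the comment promises. A 4-vcpu domain therefore can never shrink its shadow pool below 512 pages (2 MiB), before the extra page-per-megabyte-of-RAM that set_sh_allocation() later adds for the p2m table.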
648 /* Figure out the order of allocation needed for a given shadow type */
649 static inline u32
650 shadow_order(unsigned int shadow_type)
651 {
652 #if CONFIG_PAGING_LEVELS > 2
653 static const u32 type_to_order[16] = {
654 0, /* SH_type_none */
655 1, /* SH_type_l1_32_shadow */
656 1, /* SH_type_fl1_32_shadow */
657 2, /* SH_type_l2_32_shadow */
658 0, /* SH_type_l1_pae_shadow */
659 0, /* SH_type_fl1_pae_shadow */
660 0, /* SH_type_l2_pae_shadow */
661 0, /* SH_type_l2h_pae_shadow */
662 0, /* SH_type_l1_64_shadow */
663 0, /* SH_type_fl1_64_shadow */
664 0, /* SH_type_l2_64_shadow */
665 0, /* SH_type_l3_64_shadow */
666 0, /* SH_type_l4_64_shadow */
667 2, /* SH_type_p2m_table */
668 0 /* SH_type_monitor_table */
669 };
670 ASSERT(shadow_type < 16);
671 return type_to_order[shadow_type];
672 #else /* 32-bit Xen only ever shadows 32-bit guests on 32-bit shadows. */
673 return 0;
674 #endif
675 }
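Reading the table against the allocation comment above: on PAE or 64-bit Xen, a 32-bit guest l1 shadow is order 1 (two contiguous pages, 8 KiB) and a 32-bit guest l2 shadow is order 2 (four pages, 16 KiB), matching the sl1/sl2 rows; p2m tables are also taken as order-2 (max-order) chunks, and every other shadow type is a single 4 KiB page.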
678 /* Do we have a free chunk of at least this order? */
679 static inline int chunk_is_available(struct domain *d, int order)
680 {
681 int i;
683 for ( i = order; i <= SHADOW_MAX_ORDER; i++ )
684 if ( !list_empty(&d->arch.shadow.freelists[i]) )
685 return 1;
686 return 0;
687 }
689 /* Dispatcher function: call the per-mode function that will unhook the
690 * non-Xen mappings in this top-level shadow mfn */
691 void shadow_unhook_mappings(struct vcpu *v, mfn_t smfn)
692 {
693 struct shadow_page_info *sp = mfn_to_shadow_page(smfn);
694 switch ( sp->type )
695 {
696 case SH_type_l2_32_shadow:
697 #if CONFIG_PAGING_LEVELS == 2
698 SHADOW_INTERNAL_NAME(sh_unhook_32b_mappings,2,2)(v,smfn);
699 #else
700 SHADOW_INTERNAL_NAME(sh_unhook_32b_mappings,3,2)(v,smfn);
701 #endif
702 break;
703 #if CONFIG_PAGING_LEVELS >= 3
704 case SH_type_l2_pae_shadow:
705 case SH_type_l2h_pae_shadow:
706 SHADOW_INTERNAL_NAME(sh_unhook_pae_mappings,3,3)(v,smfn);
707 break;
708 #endif
709 #if CONFIG_PAGING_LEVELS >= 4
710 case SH_type_l4_64_shadow:
711 SHADOW_INTERNAL_NAME(sh_unhook_64b_mappings,4,4)(v,smfn);
712 break;
713 #endif
714 default:
715 SHADOW_PRINTK("top-level shadow has bad type %08x\n", sp->type);
716 BUG();
717 }
718 }
721 /* Make sure there is at least one chunk of the required order available
722 * in the shadow page pool. This must be called before any calls to
723 * shadow_alloc(). Since this will free existing shadows to make room,
724 * it must be called early enough to avoid freeing shadows that the
725 * caller is currently working on. */
726 void shadow_prealloc(struct domain *d, unsigned int order)
727 {
728 /* Need a vcpu for calling unpins; for now, since we don't have
729 * per-vcpu shadows, any will do */
730 struct vcpu *v, *v2;
731 struct list_head *l, *t;
732 struct shadow_page_info *sp;
733 cpumask_t flushmask = CPU_MASK_NONE;
734 mfn_t smfn;
735 int i;
737 if ( chunk_is_available(d, order) ) return;
739 v = current;
740 if ( v->domain != d )
741 v = d->vcpu[0];
742 ASSERT(v != NULL);
744 /* Stage one: walk the list of pinned pages, unpinning them */
745 perfc_incrc(shadow_prealloc_1);
746 list_for_each_backwards_safe(l, t, &d->arch.shadow.pinned_shadows)
747 {
748 sp = list_entry(l, struct shadow_page_info, list);
749 smfn = shadow_page_to_mfn(sp);
751 /* Unpin this top-level shadow */
752 sh_unpin(v, smfn);
754 /* See if that freed up a chunk of appropriate size */
755 if ( chunk_is_available(d, order) ) return;
756 }
758 /* Stage two: all shadow pages are in use in hierarchies that are
759 * loaded in cr3 on some vcpu. Walk them, unhooking the non-Xen
760 * mappings. */
761 perfc_incrc(shadow_prealloc_2);
763 for_each_vcpu(d, v2)
764 for ( i = 0 ; i < 4 ; i++ )
765 {
766 if ( !pagetable_is_null(v2->arch.shadow_table[i]) )
767 {
768 shadow_unhook_mappings(v,
769 pagetable_get_mfn(v2->arch.shadow_table[i]));
770 cpus_or(flushmask, v2->vcpu_dirty_cpumask, flushmask);
772 /* See if that freed up a chunk of appropriate size */
773 if ( chunk_is_available(d, order) )
774 {
775 flush_tlb_mask(flushmask);
776 return;
777 }
778 }
779 }
781 /* Nothing more we can do: all remaining shadows are of pages that
782 * hold Xen mappings for some vcpu. This can never happen. */
783 SHADOW_PRINTK("Can't pre-allocate %i shadow pages!\n"
784 " shadow pages total = %u, free = %u, p2m=%u\n",
785 1 << order,
786 d->arch.shadow.total_pages,
787 d->arch.shadow.free_pages,
788 d->arch.shadow.p2m_pages);
789 BUG();
790 }
792 /* Deliberately free all the memory we can: this will tear down all of
793 * this domain's shadows */
794 static void shadow_blow_tables(struct domain *d)
795 {
796 struct list_head *l, *t;
797 struct shadow_page_info *sp;
798 struct vcpu *v = d->vcpu[0];
799 mfn_t smfn;
800 int i;
802 /* Pass one: unpin all pinned pages */
803 list_for_each_backwards_safe(l,t, &d->arch.shadow.pinned_shadows)
804 {
805 sp = list_entry(l, struct shadow_page_info, list);
806 smfn = shadow_page_to_mfn(sp);
807 sh_unpin(v, smfn);
808 }
810 /* Second pass: unhook entries of in-use shadows */
811 for_each_vcpu(d, v)
812 for ( i = 0 ; i < 4 ; i++ )
813 if ( !pagetable_is_null(v->arch.shadow_table[i]) )
814 shadow_unhook_mappings(v,
815 pagetable_get_mfn(v->arch.shadow_table[i]));
817 /* Make sure everyone sees the unshadowings */
818 flush_tlb_mask(d->domain_dirty_cpumask);
819 }
822 #ifndef NDEBUG
823 /* Blow all shadows of all shadowed domains: this can be used to cause the
824 * guest's pagetables to be re-shadowed if we suspect that the shadows
825 * have somehow got out of sync */
826 static void shadow_blow_all_tables(unsigned char c)
827 {
828 struct domain *d;
829 printk("'%c' pressed -> blowing all shadow tables\n", c);
830 for_each_domain(d)
831 if ( shadow_mode_enabled(d) && d->vcpu[0] != NULL )
832 {
833 shadow_lock(d);
834 shadow_blow_tables(d);
835 shadow_unlock(d);
836 }
837 }
839 /* Register this function in the Xen console keypress table */
840 static __init int shadow_blow_tables_keyhandler_init(void)
841 {
842 register_keyhandler('S', shadow_blow_all_tables,"reset shadow pagetables");
843 return 0;
844 }
845 __initcall(shadow_blow_tables_keyhandler_init);
846 #endif /* !NDEBUG */
848 /* Allocate another shadow's worth of (contiguous, aligned) pages,
849 * and fill in the type and backpointer fields of their page_infos.
850 * Never fails to allocate. */
851 mfn_t shadow_alloc(struct domain *d,
852 u32 shadow_type,
853 unsigned long backpointer)
854 {
855 struct shadow_page_info *sp = NULL;
856 unsigned int order = shadow_order(shadow_type);
857 cpumask_t mask;
858 void *p;
859 int i;
861 ASSERT(shadow_lock_is_acquired(d));
862 ASSERT(order <= SHADOW_MAX_ORDER);
863 ASSERT(shadow_type != SH_type_none);
864 perfc_incrc(shadow_alloc);
866 /* Find smallest order which can satisfy the request. */
867 for ( i = order; i <= SHADOW_MAX_ORDER; i++ )
868 if ( !list_empty(&d->arch.shadow.freelists[i]) )
869 {
870 sp = list_entry(d->arch.shadow.freelists[i].next,
871 struct shadow_page_info, list);
872 list_del(&sp->list);
874 /* We may have to halve the chunk a number of times. */
875 while ( i != order )
876 {
877 i--;
878 sp->order = i;
879 list_add_tail(&sp->list, &d->arch.shadow.freelists[i]);
880 sp += 1 << i;
881 }
882 d->arch.shadow.free_pages -= 1 << order;
884 /* Init page info fields and clear the pages */
885 for ( i = 0; i < 1<<order ; i++ )
886 {
887 /* Before we overwrite the old contents of this page,
888 * we need to be sure that no TLB holds a pointer to it. */
889 mask = d->domain_dirty_cpumask;
890 tlbflush_filter(mask, sp[i].tlbflush_timestamp);
891 if ( unlikely(!cpus_empty(mask)) )
892 {
893 perfc_incrc(shadow_alloc_tlbflush);
894 flush_tlb_mask(mask);
895 }
896 /* Now safe to clear the page for reuse */
897 p = sh_map_domain_page(shadow_page_to_mfn(sp+i));
898 ASSERT(p != NULL);
899 clear_page(p);
900 sh_unmap_domain_page(p);
901 INIT_LIST_HEAD(&sp[i].list);
902 sp[i].type = shadow_type;
903 sp[i].pinned = 0;
904 sp[i].logdirty = 0;
905 sp[i].count = 0;
906 sp[i].backpointer = backpointer;
907 sp[i].next_shadow = NULL;
908 perfc_incr(shadow_alloc_count);
909 }
910 return shadow_page_to_mfn(sp);
911 }
913 /* If we get here, we failed to allocate. This should never happen.
914 * It means that we didn't call shadow_prealloc() correctly before
915 * we allocated. We can't recover by calling prealloc here, because
916 * we might free up higher-level pages that the caller is working on. */
917 SHADOW_PRINTK("Can't allocate %i shadow pages!\n", 1 << order);
918 BUG();
919 }
922 /* Return some shadow pages to the pool. */
923 void shadow_free(struct domain *d, mfn_t smfn)
924 {
925 struct shadow_page_info *sp = mfn_to_shadow_page(smfn);
926 u32 shadow_type;
927 unsigned long order;
928 unsigned long mask;
929 int i;
931 ASSERT(shadow_lock_is_acquired(d));
932 perfc_incrc(shadow_free);
934 shadow_type = sp->type;
935 ASSERT(shadow_type != SH_type_none);
936 ASSERT(shadow_type != SH_type_p2m_table);
937 order = shadow_order(shadow_type);
939 d->arch.shadow.free_pages += 1 << order;
941 for ( i = 0; i < 1<<order; i++ )
942 {
943 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
944 struct vcpu *v;
945 for_each_vcpu(d, v)
946 {
947 /* No longer safe to look for a writeable mapping in this shadow */
948 if ( v->arch.shadow.last_writeable_pte_smfn == mfn_x(smfn) + i )
949 v->arch.shadow.last_writeable_pte_smfn = 0;
950 }
951 #endif
952 /* Strip out the type: this is now a free shadow page */
953 sp[i].type = 0;
954 /* Remember the TLB timestamp so we will know whether to flush
955 * TLBs when we reuse the page. Because the destructors leave the
956 * contents of the pages in place, we can delay TLB flushes until
957 * just before the allocator hands the page out again. */
958 sp[i].tlbflush_timestamp = tlbflush_current_time();
959 perfc_decr(shadow_alloc_count);
960 }
962 /* Merge chunks as far as possible. */
963 while ( order < SHADOW_MAX_ORDER )
964 {
965 mask = 1 << order;
966 if ( (mfn_x(shadow_page_to_mfn(sp)) & mask) ) {
967 /* Merge with predecessor block? */
968 if ( ((sp-mask)->type != PGT_none) || ((sp-mask)->order != order) )
969 break;
970 list_del(&(sp-mask)->list);
971 sp -= mask;
972 } else {
973 /* Merge with successor block? */
974 if ( ((sp+mask)->type != PGT_none) || ((sp+mask)->order != order) )
975 break;
976 list_del(&(sp+mask)->list);
977 }
978 order++;
979 }
981 sp->order = order;
982 list_add_tail(&sp->list, &d->arch.shadow.freelists[order]);
983 }
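The merge loop above is standard binary-buddy coalescing: the buddy of a free block of order o starts at the block's own MFN with bit o flipped, and two free buddies of equal order fuse into one block of order o+1, repeated up to SHADOW_MAX_ORDER. A self-contained toy version of the same step follows, for illustration only; free_order[], POOL_PAGES and MAX_ORDER are made-up bookkeeping, not the Xen structures.

#include <stdio.h>

/* Toy binary-buddy coalescing (not Xen code).  free_order[pfn] is the
 * order of a free block starting at pfn, or -1 if none starts there. */
#define MAX_ORDER  2
#define POOL_PAGES 16
static int free_order[POOL_PAGES];

static unsigned coalesce(unsigned pfn, unsigned order)
{
    while ( order < MAX_ORDER )
    {
        unsigned buddy = pfn ^ (1u << order);     /* flip bit 'order'       */
        if ( buddy >= POOL_PAGES || free_order[buddy] != (int)order )
            break;                                /* buddy not free / wrong order */
        free_order[buddy] = -1;                   /* absorb the buddy       */
        pfn &= ~(1u << order);                    /* merged block starts low */
        order++;
    }
    free_order[pfn] = (int)order;                 /* re-file merged block   */
    return order;
}

int main(void)
{
    for ( int i = 0; i < POOL_PAGES; i++ ) free_order[i] = -1;
    free_order[4] = 0;                            /* page 4 already free    */
    printf("freeing page 5 yields an order-%u block\n", coalesce(5, 0));
    return 0;
}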
985 /* Divert some memory from the pool to be used by the p2m mapping.
986 * This action is irreversible: the p2m mapping only ever grows.
987 * That's OK because the p2m table only exists for translated domains,
988 * and those domains can't ever turn off shadow mode.
989 * Also, we only ever allocate a max-order chunk, so as to preserve
990 * the invariant that shadow_prealloc() always works.
991 * Returns 0 iff it can't get a chunk (the caller should then
992 * free up some pages in domheap and call set_sh_allocation);
993 * returns non-zero on success.
994 */
995 static int
996 shadow_alloc_p2m_pages(struct domain *d)
997 {
998 struct page_info *pg;
999 u32 i;
1000 ASSERT(shadow_lock_is_acquired(d));
1002 if ( d->arch.shadow.total_pages
1003 < (shadow_min_acceptable_pages(d) + (1<<SHADOW_MAX_ORDER)) )
1004 return 0; /* Not enough shadow memory: need to increase it first */
1006 pg = mfn_to_page(shadow_alloc(d, SH_type_p2m_table, 0));
1007 d->arch.shadow.p2m_pages += (1<<SHADOW_MAX_ORDER);
1008 d->arch.shadow.total_pages -= (1<<SHADOW_MAX_ORDER);
1009 for (i = 0; i < (1<<SHADOW_MAX_ORDER); i++)
1011 /* Unlike shadow pages, mark p2m pages as owned by the domain.
1012 * Marking the domain as the owner would normally allow the guest to
1013 * create mappings of these pages, but these p2m pages will never be
1014 * in the domain's guest-physical address space, and so that is not
1015 * believed to be a concern.
1016 */
1017 page_set_owner(&pg[i], d);
1018 list_add_tail(&pg[i].list, &d->arch.shadow.p2m_freelist);
1020 return 1;
1023 // Returns 0 if no memory is available...
1024 mfn_t
1025 shadow_alloc_p2m_page(struct domain *d)
1027 struct list_head *entry;
1028 struct page_info *pg;
1029 mfn_t mfn;
1030 void *p;
1032 if ( list_empty(&d->arch.shadow.p2m_freelist) &&
1033 !shadow_alloc_p2m_pages(d) )
1034 return _mfn(0);
1035 entry = d->arch.shadow.p2m_freelist.next;
1036 list_del(entry);
1037 list_add_tail(entry, &d->arch.shadow.p2m_inuse);
1038 pg = list_entry(entry, struct page_info, list);
1039 pg->count_info = 1;
1040 mfn = page_to_mfn(pg);
1041 p = sh_map_domain_page(mfn);
1042 clear_page(p);
1043 sh_unmap_domain_page(p);
1045 return mfn;
1048 #if CONFIG_PAGING_LEVELS == 3
1049 static void p2m_install_entry_in_monitors(struct domain *d,
1050 l3_pgentry_t *l3e)
1051 /* Special case, only used for external-mode domains on PAE hosts:
1052 * update the mapping of the p2m table. Once again, this is trivial in
1053 * other paging modes (one top-level entry points to the top-level p2m,
1054 * no maintenance needed), but PAE makes life difficult by needing a
1055 * copy the eight l3es of the p2m table in eight l2h slots in the
1056 * monitor table. This function makes fresh copies when a p2m l3e
1057 * changes. */
1059 l2_pgentry_t *ml2e;
1060 struct vcpu *v;
1061 unsigned int index;
1063 index = ((unsigned long)l3e & ~PAGE_MASK) / sizeof(l3_pgentry_t);
1064 ASSERT(index < MACHPHYS_MBYTES>>1);
1066 for_each_vcpu(d, v)
1068 if ( pagetable_get_pfn(v->arch.monitor_table) == 0 )
1069 continue;
1070 ASSERT(shadow_mode_external(v->domain));
1072 SHADOW_DEBUG(P2M, "d=%u v=%u index=%u mfn=%#lx\n",
1073 d->domain_id, v->vcpu_id, index, l3e_get_pfn(*l3e));
1075 if ( v == current ) /* OK to use linear map of monitor_table */
1076 ml2e = __linear_l2_table + l2_linear_offset(RO_MPT_VIRT_START);
1077 else
1079 l3_pgentry_t *ml3e;
1080 ml3e = sh_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
1081 ASSERT(l3e_get_flags(ml3e[3]) & _PAGE_PRESENT);
1082 ml2e = sh_map_domain_page(_mfn(l3e_get_pfn(ml3e[3])));
1083 ml2e += l2_table_offset(RO_MPT_VIRT_START);
1084 sh_unmap_domain_page(ml3e);
1086 ml2e[index] = l2e_from_pfn(l3e_get_pfn(*l3e), __PAGE_HYPERVISOR);
1087 if ( v != current )
1088 sh_unmap_domain_page(ml2e);
1091 #endif
1093 // Find the next level's P2M entry, checking for out-of-range gfn's...
1094 // Returns NULL on error.
1095 //
1096 static l1_pgentry_t *
1097 p2m_find_entry(void *table, unsigned long *gfn_remainder,
1098 unsigned long gfn, u32 shift, u32 max)
1100 u32 index;
1102 index = *gfn_remainder >> shift;
1103 if ( index >= max )
1105 SHADOW_DEBUG(P2M, "gfn=0x%lx out of range "
1106 "(gfn_remainder=0x%lx shift=%d index=0x%x max=0x%x)\n",
1107 gfn, *gfn_remainder, shift, index, max);
1108 return NULL;
1110 *gfn_remainder &= (1 << shift) - 1;
1111 return (l1_pgentry_t *)table + index;
1114 // Walk one level of the P2M table, allocating a new table if required.
1115 // Returns 0 on error.
1116 //
1117 static int
1118 p2m_next_level(struct domain *d, mfn_t *table_mfn, void **table,
1119 unsigned long *gfn_remainder, unsigned long gfn, u32 shift,
1120 u32 max, unsigned long type)
1122 l1_pgentry_t *p2m_entry;
1123 void *next;
1125 if ( !(p2m_entry = p2m_find_entry(*table, gfn_remainder, gfn,
1126 shift, max)) )
1127 return 0;
1129 if ( !(l1e_get_flags(*p2m_entry) & _PAGE_PRESENT) )
1131 mfn_t mfn = shadow_alloc_p2m_page(d);
1132 if ( mfn_x(mfn) == 0 )
1133 return 0;
1134 *p2m_entry = l1e_from_pfn(mfn_x(mfn), __PAGE_HYPERVISOR|_PAGE_USER);
1135 mfn_to_page(mfn)->u.inuse.type_info = type | 1 | PGT_validated;
1136 mfn_to_page(mfn)->count_info = 1;
1137 #if CONFIG_PAGING_LEVELS == 3
1138 if (type == PGT_l2_page_table)
1140 struct vcpu *v;
1141 /* We have written to the p2m l3: need to sync the per-vcpu
1142 * copies of it in the monitor tables */
1143 p2m_install_entry_in_monitors(d, (l3_pgentry_t *)p2m_entry);
1144 /* Also, any vcpus running on shadows of the p2m need to
1145 * reload their CR3s so the change propagates to the shadow */
1146 ASSERT(shadow_lock_is_acquired(d));
1147 for_each_vcpu(d, v)
1149 if ( pagetable_get_pfn(v->arch.guest_table)
1150 == pagetable_get_pfn(d->arch.phys_table)
1151 && v->arch.shadow.mode != NULL )
1152 v->arch.shadow.mode->update_cr3(v);
1155 #endif
1156 /* The P2M can be shadowed: keep the shadows synced */
1157 if ( d->vcpu[0] != NULL )
1158 (void)__shadow_validate_guest_entry(d->vcpu[0], *table_mfn,
1159 p2m_entry, sizeof *p2m_entry);
1161 *table_mfn = _mfn(l1e_get_pfn(*p2m_entry));
1162 next = sh_map_domain_page(*table_mfn);
1163 sh_unmap_domain_page(*table);
1164 *table = next;
1166 return 1;
1169 // Returns 0 on error (out of memory)
1170 int
1171 shadow_set_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn)
1173 // XXX -- this might be able to be faster iff current->domain == d
1174 mfn_t table_mfn = pagetable_get_mfn(d->arch.phys_table);
1175 void *table = sh_map_domain_page(table_mfn);
1176 unsigned long gfn_remainder = gfn;
1177 l1_pgentry_t *p2m_entry;
1178 int rv=0;
1180 #if CONFIG_PAGING_LEVELS >= 4
1181 if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn,
1182 L4_PAGETABLE_SHIFT - PAGE_SHIFT,
1183 L4_PAGETABLE_ENTRIES, PGT_l3_page_table) )
1184 goto out;
1185 #endif
1186 #if CONFIG_PAGING_LEVELS >= 3
1187 // When using PAE Xen, we only allow 33 bits of pseudo-physical
1188 // address in translated guests (i.e. 8 GBytes). This restriction
1189 // comes from wanting to map the P2M table into the 16MB RO_MPT hole
1190 // in Xen's address space for translated PV guests.
1191 //
1192 if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn,
1193 L3_PAGETABLE_SHIFT - PAGE_SHIFT,
1194 (CONFIG_PAGING_LEVELS == 3
1195 ? 8
1196 : L3_PAGETABLE_ENTRIES),
1197 PGT_l2_page_table) )
1198 goto out;
1199 #endif
1200 if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn,
1201 L2_PAGETABLE_SHIFT - PAGE_SHIFT,
1202 L2_PAGETABLE_ENTRIES, PGT_l1_page_table) )
1203 goto out;
1205 p2m_entry = p2m_find_entry(table, &gfn_remainder, gfn,
1206 0, L1_PAGETABLE_ENTRIES);
1207 ASSERT(p2m_entry);
1208 if ( mfn_valid(mfn) )
1209 *p2m_entry = l1e_from_pfn(mfn_x(mfn), __PAGE_HYPERVISOR|_PAGE_USER);
1210 else
1211 *p2m_entry = l1e_empty();
1213 /* Track the highest gfn for which we have ever had a valid mapping */
1214 if ( mfn_valid(mfn) && (gfn > d->arch.max_mapped_pfn) )
1215 d->arch.max_mapped_pfn = gfn;
1217 /* The P2M can be shadowed: keep the shadows synced */
1218 if ( d->vcpu[0] != NULL )
1219 (void)__shadow_validate_guest_entry(
1220 d->vcpu[0], table_mfn, p2m_entry, sizeof(*p2m_entry));
1222 /* Success */
1223 rv = 1;
1225 out:
1226 sh_unmap_domain_page(table);
1227 return rv;
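For reference, the 8 GByte limit mentioned in the PAE comment inside shadow_set_p2m_entry() falls out of the size of the RO_MPT window: 16 MB of mapped p2m l1 tables holds 16 MB / 8 bytes per PAE entry = 2^21 entries, and 2^21 entries x 4 KiB per frame = 2^33 bytes = 8 GB of guest-physical space (the "33 bits" the comment refers to). Equivalently, each of the eight p2m l3 slots copied into the monitor tables covers 1 GB.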
1230 // Allocate a new p2m table for a domain.
1231 //
1232 // The structure of the p2m table is that of a pagetable for xen (i.e. it is
1233 // controlled by CONFIG_PAGING_LEVELS).
1234 //
1235 // Returns 0 if p2m table could not be initialized
1236 //
1237 static int
1238 shadow_alloc_p2m_table(struct domain *d)
1240 mfn_t p2m_top, mfn;
1241 struct list_head *entry;
1242 struct page_info *page;
1243 unsigned int page_count = 0;
1244 unsigned long gfn;
1246 SHADOW_PRINTK("allocating p2m table\n");
1247 ASSERT(pagetable_get_pfn(d->arch.phys_table) == 0);
1249 p2m_top = shadow_alloc_p2m_page(d);
1250 mfn_to_page(p2m_top)->count_info = 1;
1251 mfn_to_page(p2m_top)->u.inuse.type_info =
1252 #if CONFIG_PAGING_LEVELS == 4
1253 PGT_l4_page_table
1254 #elif CONFIG_PAGING_LEVELS == 3
1255 PGT_l3_page_table
1256 #elif CONFIG_PAGING_LEVELS == 2
1257 PGT_l2_page_table
1258 #endif
1259 | 1 | PGT_validated;
1261 if ( mfn_x(p2m_top) == 0 )
1262 return 0;
1264 d->arch.phys_table = pagetable_from_mfn(p2m_top);
1266 SHADOW_PRINTK("populating p2m table\n");
1268 /* Initialise physmap tables for slot zero. Other code assumes this. */
1269 gfn = 0;
1270 mfn = _mfn(INVALID_MFN);
1271 if ( !shadow_set_p2m_entry(d, gfn, mfn) )
1272 goto error;
1274 for ( entry = d->page_list.next;
1275 entry != &d->page_list;
1276 entry = entry->next )
1278 page = list_entry(entry, struct page_info, list);
1279 mfn = page_to_mfn(page);
1280 gfn = get_gpfn_from_mfn(mfn_x(mfn));
1281 page_count++;
1282 if (
1283 #ifdef __x86_64__
1284 (gfn != 0x5555555555555555L)
1285 #else
1286 (gfn != 0x55555555L)
1287 #endif
1288 && gfn != INVALID_M2P_ENTRY
1289 && !shadow_set_p2m_entry(d, gfn, mfn) )
1290 goto error;
1293 SHADOW_PRINTK("p2m table initialised (%u pages)\n", page_count);
1294 return 1;
1296 error:
1297 SHADOW_PRINTK("failed to initialize p2m table, gfn=%05lx, mfn=%"
1298 SH_PRI_mfn "\n", gfn, mfn_x(mfn));
1299 return 0;
1302 mfn_t
1303 sh_gfn_to_mfn_foreign(struct domain *d, unsigned long gpfn)
1304 /* Read another domain's p2m entries */
1306 mfn_t mfn;
1307 paddr_t addr = ((paddr_t)gpfn) << PAGE_SHIFT;
1308 l2_pgentry_t *l2e;
1309 l1_pgentry_t *l1e;
1311 ASSERT(shadow_mode_translate(d));
1312 mfn = pagetable_get_mfn(d->arch.phys_table);
1315 if ( gpfn > d->arch.max_mapped_pfn )
1316 /* This pfn is higher than the highest the p2m map currently holds */
1317 return _mfn(INVALID_MFN);
1319 #if CONFIG_PAGING_LEVELS >= 4
1321 l4_pgentry_t *l4e = sh_map_domain_page(mfn);
1322 l4e += l4_table_offset(addr);
1323 if ( (l4e_get_flags(*l4e) & _PAGE_PRESENT) == 0 )
1325 sh_unmap_domain_page(l4e);
1326 return _mfn(INVALID_MFN);
1328 mfn = _mfn(l4e_get_pfn(*l4e));
1329 sh_unmap_domain_page(l4e);
1331 #endif
1332 #if CONFIG_PAGING_LEVELS >= 3
1334 l3_pgentry_t *l3e = sh_map_domain_page(mfn);
1335 #if CONFIG_PAGING_LEVELS == 3
1336 /* On PAE hosts the p2m has eight l3 entries, not four (see
1337 * shadow_set_p2m_entry()) so we can't use l3_table_offset.
1338 * Instead, just count the number of l3es from zero. It's safe
1339 * to do this because we already checked that the gfn is within
1340 * the bounds of the p2m. */
1341 l3e += (addr >> L3_PAGETABLE_SHIFT);
1342 #else
1343 l3e += l3_table_offset(addr);
1344 #endif
1345 if ( (l3e_get_flags(*l3e) & _PAGE_PRESENT) == 0 )
1347 sh_unmap_domain_page(l3e);
1348 return _mfn(INVALID_MFN);
1350 mfn = _mfn(l3e_get_pfn(*l3e));
1351 sh_unmap_domain_page(l3e);
1353 #endif
1355 l2e = sh_map_domain_page(mfn);
1356 l2e += l2_table_offset(addr);
1357 if ( (l2e_get_flags(*l2e) & _PAGE_PRESENT) == 0 )
1359 sh_unmap_domain_page(l2e);
1360 return _mfn(INVALID_MFN);
1362 mfn = _mfn(l2e_get_pfn(*l2e));
1363 sh_unmap_domain_page(l2e);
1365 l1e = sh_map_domain_page(mfn);
1366 l1e += l1_table_offset(addr);
1367 if ( (l1e_get_flags(*l1e) & _PAGE_PRESENT) == 0 )
1369 sh_unmap_domain_page(l1e);
1370 return _mfn(INVALID_MFN);
1372 mfn = _mfn(l1e_get_pfn(*l1e));
1373 sh_unmap_domain_page(l1e);
1375 return mfn;
1378 unsigned long
1379 shadow_gfn_to_mfn_foreign(unsigned long gpfn)
1381 return mfn_x(sh_gfn_to_mfn_foreign(current->domain, gpfn));
1385 static void shadow_p2m_teardown(struct domain *d)
1386 /* Return all the p2m pages to Xen.
1387 * We know we don't have any extra mappings to these pages */
1389 struct list_head *entry, *n;
1390 struct page_info *pg;
1392 d->arch.phys_table = pagetable_null();
1394 list_for_each_safe(entry, n, &d->arch.shadow.p2m_inuse)
1396 pg = list_entry(entry, struct page_info, list);
1397 list_del(entry);
1398 /* Should have just the one ref we gave it in alloc_p2m_page() */
1399 if ( (pg->count_info & PGC_count_mask) != 1 )
1401 SHADOW_PRINTK("Odd p2m page count c=%#x t=%"PRtype_info"\n",
1402 pg->count_info, pg->u.inuse.type_info);
1404 ASSERT(page_get_owner(pg) == d);
1405 /* Free should not decrement domain's total allocation, since
1406 * these pages were allocated without an owner. */
1407 page_set_owner(pg, NULL);
1408 free_domheap_pages(pg, 0);
1409 d->arch.shadow.p2m_pages--;
1410 perfc_decr(shadow_alloc_count);
1412 list_for_each_safe(entry, n, &d->arch.shadow.p2m_freelist)
1414 list_del(entry);
1415 pg = list_entry(entry, struct page_info, list);
1416 ASSERT(page_get_owner(pg) == d);
1417 /* Free should not decrement domain's total allocation. */
1418 page_set_owner(pg, NULL);
1419 free_domheap_pages(pg, 0);
1420 d->arch.shadow.p2m_pages--;
1421 perfc_decr(shadow_alloc_count);
1423 ASSERT(d->arch.shadow.p2m_pages == 0);
1426 /* Set the pool of shadow pages to the required number of pages.
1427 * Input will be rounded up to at least shadow_min_acceptable_pages(),
1428 * plus space for the p2m table.
1429 * Returns 0 for success, non-zero for failure. */
1430 static unsigned int set_sh_allocation(struct domain *d,
1431 unsigned int pages,
1432 int *preempted)
1434 struct shadow_page_info *sp;
1435 unsigned int lower_bound;
1436 int j;
1438 ASSERT(shadow_lock_is_acquired(d));
1440 /* Don't allocate less than the minimum acceptable, plus one page per
1441 * megabyte of RAM (for the p2m table) */
1442 lower_bound = shadow_min_acceptable_pages(d) + (d->tot_pages / 256);
1443 if ( pages > 0 && pages < lower_bound )
1444 pages = lower_bound;
1445 /* Round up to largest block size */
1446 pages = (pages + ((1<<SHADOW_MAX_ORDER)-1)) & ~((1<<SHADOW_MAX_ORDER)-1);
1448 SHADOW_PRINTK("current %i target %i\n",
1449 d->arch.shadow.total_pages, pages);
1451 while ( d->arch.shadow.total_pages != pages )
1453 if ( d->arch.shadow.total_pages < pages )
1455 /* Need to allocate more memory from domheap */
1456 sp = (struct shadow_page_info *)
1457 alloc_domheap_pages(NULL, SHADOW_MAX_ORDER, 0);
1458 if ( sp == NULL )
1460 SHADOW_PRINTK("failed to allocate shadow pages.\n");
1461 return -ENOMEM;
1463 d->arch.shadow.free_pages += 1<<SHADOW_MAX_ORDER;
1464 d->arch.shadow.total_pages += 1<<SHADOW_MAX_ORDER;
1465 for ( j = 0; j < 1<<SHADOW_MAX_ORDER; j++ )
1467 sp[j].type = 0;
1468 sp[j].pinned = 0;
1469 sp[j].logdirty = 0;
1470 sp[j].count = 0;
1471 sp[j].mbz = 0;
1472 sp[j].tlbflush_timestamp = 0; /* Not in any TLB */
1474 sp->order = SHADOW_MAX_ORDER;
1475 list_add_tail(&sp->list,
1476 &d->arch.shadow.freelists[SHADOW_MAX_ORDER]);
1478 else if ( d->arch.shadow.total_pages > pages )
1480 /* Need to return memory to domheap */
1481 shadow_prealloc(d, SHADOW_MAX_ORDER);
1482 ASSERT(!list_empty(&d->arch.shadow.freelists[SHADOW_MAX_ORDER]));
1483 sp = list_entry(d->arch.shadow.freelists[SHADOW_MAX_ORDER].next,
1484 struct shadow_page_info, list);
1485 list_del(&sp->list);
1486 d->arch.shadow.free_pages -= 1<<SHADOW_MAX_ORDER;
1487 d->arch.shadow.total_pages -= 1<<SHADOW_MAX_ORDER;
1488 free_domheap_pages((struct page_info *)sp, SHADOW_MAX_ORDER);
1491 /* Check to see if we need to yield and try again */
1492 if ( preempted && hypercall_preempt_check() )
1494 *preempted = 1;
1495 return 0;
1499 return 0;
1502 unsigned int shadow_set_allocation(struct domain *d,
1503 unsigned int megabytes,
1504 int *preempted)
1505 /* Hypercall interface to set the shadow memory allocation */
1507 unsigned int rv;
1508 shadow_lock(d);
1509 rv = set_sh_allocation(d, megabytes << (20 - PAGE_SHIFT), preempted);
1510 SHADOW_PRINTK("dom %u allocation now %u pages (%u MB)\n",
1511 d->domain_id,
1512 d->arch.shadow.total_pages,
1513 shadow_get_allocation(d));
1514 shadow_unlock(d);
1515 return rv;
1518 /**************************************************************************/
1519 /* Hash table for storing the guest->shadow mappings.
1520 * The table itself is an array of pointers to shadows; the shadows are then
1521 * threaded on a singly-linked list of shadows with the same hash value */
1523 #define SHADOW_HASH_BUCKETS 251
1524 /* Other possibly useful primes are 509, 1021, 2039, 4093, 8191, 16381 */
1526 /* Hash function that takes a gfn or mfn, plus another byte of type info */
1527 typedef u32 key_t;
1528 static inline key_t sh_hash(unsigned long n, unsigned int t)
1530 unsigned char *p = (unsigned char *)&n;
1531 key_t k = t;
1532 int i;
1533 for ( i = 0; i < sizeof(n) ; i++ ) k = (u32)p[i] + (k<<6) + (k<<16) - k;
1534 return k % SHADOW_HASH_BUCKETS;
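The per-byte update above, k = p[i] + (k<<6) + (k<<16) - k, is the classic sdbm string hash: (k<<6) + (k<<16) - k equals k * 65599, so each byte folds in as k = k*65599 + byte before the final reduction modulo the prime bucket count. An equivalent standalone restatement is shown below for clarity; it is illustrative, not a drop-in replacement, though it uses the same seed (the shadow type) and the same host byte order as the (unsigned char *)&n access above.

#include <stdint.h>
#include <string.h>

#define HASH_BUCKETS 251   /* same prime as SHADOW_HASH_BUCKETS */

/* Equivalent form of sh_hash(): sdbm-style multiply-and-add, seeded with
 * the shadow type, over the in-memory bytes of n. */
static uint32_t sh_hash_equiv(unsigned long n, unsigned int t)
{
    unsigned char p[sizeof(n)];
    uint32_t k = t;

    memcpy(p, &n, sizeof(n));
    for ( unsigned i = 0; i < sizeof(n); i++ )
        k = k * 65599u + p[i];         /* == p[i] + (k<<6) + (k<<16) - k */
    return k % HASH_BUCKETS;
}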
1537 #if SHADOW_AUDIT & (SHADOW_AUDIT_HASH|SHADOW_AUDIT_HASH_FULL)
1539 /* Before we get to the mechanism, define a pair of audit functions
1540 * that sanity-check the contents of the hash table. */
1541 static void sh_hash_audit_bucket(struct domain *d, int bucket)
1542 /* Audit one bucket of the hash table */
1544 struct shadow_page_info *sp, *x;
1546 if ( !(SHADOW_AUDIT_ENABLE) )
1547 return;
1549 sp = d->arch.shadow.hash_table[bucket];
1550 while ( sp )
1552 /* Not a shadow? */
1553 BUG_ON( sp->mbz != 0 );
1554 /* Bogus type? */
1555 BUG_ON( sp->type == 0 );
1556 BUG_ON( sp->type > SH_type_max_shadow );
1557 /* Wrong bucket? */
1558 BUG_ON( sh_hash(sp->backpointer, sp->type) != bucket );
1559 /* Duplicate entry? */
1560 for ( x = sp->next_shadow; x; x = x->next_shadow )
1561 BUG_ON( x->backpointer == sp->backpointer && x->type == sp->type );
1562 /* Follow the backpointer to the guest pagetable */
1563 if ( sp->type != SH_type_fl1_32_shadow
1564 && sp->type != SH_type_fl1_pae_shadow
1565 && sp->type != SH_type_fl1_64_shadow )
1567 struct page_info *gpg = mfn_to_page(_mfn(sp->backpointer));
1568 /* Bad shadow flags on guest page? */
1569 BUG_ON( !(gpg->shadow_flags & (1<<sp->type)) );
1570 /* Bad type count on guest page? */
1571 if ( (gpg->u.inuse.type_info & PGT_type_mask) == PGT_writable_page
1572 && (gpg->u.inuse.type_info & PGT_count_mask) != 0 )
1574 SHADOW_ERROR("MFN %#lx shadowed (by %#"SH_PRI_mfn")"
1575 " but has typecount %#lx\n",
1576 sp->backpointer, mfn_x(shadow_page_to_mfn(sp)),
1577 gpg->u.inuse.type_info);
1578 BUG();
1581 /* That entry was OK; on we go */
1582 sp = sp->next_shadow;
1586 #else
1587 #define sh_hash_audit_bucket(_d, _b) do {} while(0)
1588 #endif /* Hashtable bucket audit */
1591 #if SHADOW_AUDIT & SHADOW_AUDIT_HASH_FULL
1593 static void sh_hash_audit(struct domain *d)
1594 /* Full audit: audit every bucket in the table */
1596 int i;
1598 if ( !(SHADOW_AUDIT_ENABLE) )
1599 return;
1601 for ( i = 0; i < SHADOW_HASH_BUCKETS; i++ )
1603 sh_hash_audit_bucket(d, i);
1607 #else
1608 #define sh_hash_audit(_d) do {} while(0)
1609 #endif /* Hashtable bucket audit */
1611 /* Allocate and initialise the table itself.
1612 * Returns 0 for success, 1 for error. */
1613 static int shadow_hash_alloc(struct domain *d)
1615 struct shadow_page_info **table;
1617 ASSERT(shadow_lock_is_acquired(d));
1618 ASSERT(!d->arch.shadow.hash_table);
1620 table = xmalloc_array(struct shadow_page_info *, SHADOW_HASH_BUCKETS);
1621 if ( !table ) return 1;
1622 memset(table, 0,
1623 SHADOW_HASH_BUCKETS * sizeof (struct shadow_page_info *));
1624 d->arch.shadow.hash_table = table;
1625 return 0;
1628 /* Tear down the hash table and return all memory to Xen.
1629 * This function does not care whether the table is populated. */
1630 static void shadow_hash_teardown(struct domain *d)
1632 ASSERT(shadow_lock_is_acquired(d));
1633 ASSERT(d->arch.shadow.hash_table);
1635 xfree(d->arch.shadow.hash_table);
1636 d->arch.shadow.hash_table = NULL;
1640 mfn_t shadow_hash_lookup(struct vcpu *v, unsigned long n, unsigned int t)
1641 /* Find an entry in the hash table. Returns the MFN of the shadow,
1642 * or INVALID_MFN if it doesn't exist */
1644 struct domain *d = v->domain;
1645 struct shadow_page_info *sp, *prev;
1646 key_t key;
1648 ASSERT(shadow_lock_is_acquired(d));
1649 ASSERT(d->arch.shadow.hash_table);
1650 ASSERT(t);
1652 sh_hash_audit(d);
1654 perfc_incrc(shadow_hash_lookups);
1655 key = sh_hash(n, t);
1656 sh_hash_audit_bucket(d, key);
1658 sp = d->arch.shadow.hash_table[key];
1659 prev = NULL;
1660 while(sp)
1662 if ( sp->backpointer == n && sp->type == t )
1664 /* Pull-to-front if 'sp' isn't already the head item */
1665 if ( unlikely(sp != d->arch.shadow.hash_table[key]) )
1667 if ( unlikely(d->arch.shadow.hash_walking != 0) )
1668 /* Can't reorder: someone is walking the hash chains */
1669 return shadow_page_to_mfn(sp);
1670 else
1672 ASSERT(prev);
1673 /* Delete sp from the list */
1674 prev->next_shadow = sp->next_shadow;
1675 /* Re-insert it at the head of the list */
1676 sp->next_shadow = d->arch.shadow.hash_table[key];
1677 d->arch.shadow.hash_table[key] = sp;
1680 else
1682 perfc_incrc(shadow_hash_lookup_head);
1684 return shadow_page_to_mfn(sp);
1686 prev = sp;
1687 sp = sp->next_shadow;
1690 perfc_incrc(shadow_hash_lookup_miss);
1691 return _mfn(INVALID_MFN);
1694 void shadow_hash_insert(struct vcpu *v, unsigned long n, unsigned int t,
1695 mfn_t smfn)
1696 /* Put a mapping (n,t)->smfn into the hash table */
1698 struct domain *d = v->domain;
1699 struct shadow_page_info *sp;
1700 key_t key;
1702 ASSERT(shadow_lock_is_acquired(d));
1703 ASSERT(d->arch.shadow.hash_table);
1704 ASSERT(t);
1706 sh_hash_audit(d);
1708 perfc_incrc(shadow_hash_inserts);
1709 key = sh_hash(n, t);
1710 sh_hash_audit_bucket(d, key);
1712 /* Insert this shadow at the top of the bucket */
1713 sp = mfn_to_shadow_page(smfn);
1714 sp->next_shadow = d->arch.shadow.hash_table[key];
1715 d->arch.shadow.hash_table[key] = sp;
1717 sh_hash_audit_bucket(d, key);
1720 void shadow_hash_delete(struct vcpu *v, unsigned long n, unsigned int t,
1721 mfn_t smfn)
1722 /* Excise the mapping (n,t)->smfn from the hash table */
1724 struct domain *d = v->domain;
1725 struct shadow_page_info *sp, *x;
1726 key_t key;
1728 ASSERT(shadow_lock_is_acquired(d));
1729 ASSERT(d->arch.shadow.hash_table);
1730 ASSERT(t);
1732 sh_hash_audit(d);
1734 perfc_incrc(shadow_hash_deletes);
1735 key = sh_hash(n, t);
1736 sh_hash_audit_bucket(d, key);
1738 sp = mfn_to_shadow_page(smfn);
1739 if ( d->arch.shadow.hash_table[key] == sp )
1740 /* Easy case: we're deleting the head item. */
1741 d->arch.shadow.hash_table[key] = sp->next_shadow;
1742 else
1744 /* Need to search for the one we want */
1745 x = d->arch.shadow.hash_table[key];
1746 while ( 1 )
1748 ASSERT(x); /* We can't have hit the end, since our target is
1749 * still in the chain somewhere... */
1750 if ( x->next_shadow == sp )
1752 x->next_shadow = sp->next_shadow;
1753 break;
1755 x = x->next_shadow;
1758 sp->next_shadow = NULL;
1760 sh_hash_audit_bucket(d, key);
1763 typedef int (*hash_callback_t)(struct vcpu *v, mfn_t smfn, mfn_t other_mfn);
1765 static void hash_foreach(struct vcpu *v,
1766 unsigned int callback_mask,
1767 hash_callback_t callbacks[],
1768 mfn_t callback_mfn)
1769 /* Walk the hash table looking at the types of the entries and
1770 * calling the appropriate callback function for each entry.
1771 * The mask determines which shadow types we call back for, and the array
1772 * of callbacks tells us which function to call.
1773 * Any callback may return non-zero to let us skip the rest of the scan.
1775 * WARNING: Callbacks MUST NOT add or remove hash entries unless they
1776 * then return non-zero to terminate the scan. */
1778 int i, done = 0;
1779 struct domain *d = v->domain;
1780 struct shadow_page_info *x;
1782 /* Say we're here, to stop hash-lookups reordering the chains */
1783 ASSERT(shadow_lock_is_acquired(d));
1784 ASSERT(d->arch.shadow.hash_walking == 0);
1785 d->arch.shadow.hash_walking = 1;
1787 for ( i = 0; i < SHADOW_HASH_BUCKETS; i++ )
1789 /* WARNING: This is not safe against changes to the hash table.
1790 * The callback *must* return non-zero if it has inserted or
1791 * deleted anything from the hash (lookups are OK, though). */
1792 for ( x = d->arch.shadow.hash_table[i]; x; x = x->next_shadow )
1794 if ( callback_mask & (1 << x->type) )
1796 ASSERT(x->type <= 15);
1797 ASSERT(callbacks[x->type] != NULL);
1798 done = callbacks[x->type](v, shadow_page_to_mfn(x),
1799 callback_mfn);
1800 if ( done ) break;
1803 if ( done ) break;
1805 d->arch.shadow.hash_walking = 0;
1809 /**************************************************************************/
1810 /* Destroy a shadow page: simple dispatcher to call the per-type destructor
1811 * which will decrement refcounts appropriately and return memory to the
1812 * free pool. */
1814 void sh_destroy_shadow(struct vcpu *v, mfn_t smfn)
1816 struct shadow_page_info *sp = mfn_to_shadow_page(smfn);
1817 unsigned int t = sp->type;
1820 SHADOW_PRINTK("smfn=%#lx\n", mfn_x(smfn));
1822 /* Double-check, if we can, that the shadowed page belongs to this
1823 * domain, (by following the back-pointer). */
1824 ASSERT(t == SH_type_fl1_32_shadow ||
1825 t == SH_type_fl1_pae_shadow ||
1826 t == SH_type_fl1_64_shadow ||
1827 t == SH_type_monitor_table ||
1828 (page_get_owner(mfn_to_page(_mfn(sp->backpointer)))
1829 == v->domain));
1831 /* The down-shifts here are so that the switch statement is on nice
1832 * small numbers that the compiler will enjoy */
1833 switch ( t )
1835 #if CONFIG_PAGING_LEVELS == 2
1836 case SH_type_l1_32_shadow:
1837 case SH_type_fl1_32_shadow:
1838 SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 2, 2)(v, smfn);
1839 break;
1840 case SH_type_l2_32_shadow:
1841 SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 2, 2)(v, smfn);
1842 break;
1843 #else /* PAE or 64bit */
1844 case SH_type_l1_32_shadow:
1845 case SH_type_fl1_32_shadow:
1846 SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 3, 2)(v, smfn);
1847 break;
1848 case SH_type_l2_32_shadow:
1849 SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 3, 2)(v, smfn);
1850 break;
1851 #endif
1853 #if CONFIG_PAGING_LEVELS >= 3
1854 case SH_type_l1_pae_shadow:
1855 case SH_type_fl1_pae_shadow:
1856 SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 3, 3)(v, smfn);
1857 break;
1858 case SH_type_l2_pae_shadow:
1859 case SH_type_l2h_pae_shadow:
1860 SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 3, 3)(v, smfn);
1861 break;
1862 #endif
1864 #if CONFIG_PAGING_LEVELS >= 4
1865 case SH_type_l1_64_shadow:
1866 case SH_type_fl1_64_shadow:
1867 SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 4, 4)(v, smfn);
1868 break;
1869 case SH_type_l2_64_shadow:
1870 SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 4, 4)(v, smfn);
1871 break;
1872 case SH_type_l3_64_shadow:
1873 SHADOW_INTERNAL_NAME(sh_destroy_l3_shadow, 4, 4)(v, smfn);
1874 break;
1875 case SH_type_l4_64_shadow:
1876 SHADOW_INTERNAL_NAME(sh_destroy_l4_shadow, 4, 4)(v, smfn);
1877 break;
1878 #endif
1879 default:
1880 SHADOW_PRINTK("tried to destroy shadow of bad type %08lx\n",
1881 (unsigned long)t);
1882 BUG();
1886 /**************************************************************************/
1887 /* Remove all writeable mappings of a guest frame from the shadow tables
1888 * Returns non-zero if we need to flush TLBs.
1889 * level and fault_addr describe how we found this to be a pagetable;
1890 * level==0 means we have some other reason for revoking write access.*/
1892 int shadow_remove_write_access(struct vcpu *v, mfn_t gmfn,
1893 unsigned int level,
1894 unsigned long fault_addr)
1896 /* Dispatch table for getting per-type functions */
1897 static hash_callback_t callbacks[16] = {
1898 NULL, /* none */
1899 #if CONFIG_PAGING_LEVELS == 2
1900 SHADOW_INTERNAL_NAME(sh_remove_write_access,2,2), /* l1_32 */
1901 SHADOW_INTERNAL_NAME(sh_remove_write_access,2,2), /* fl1_32 */
1902 #else
1903 SHADOW_INTERNAL_NAME(sh_remove_write_access,3,2), /* l1_32 */
1904 SHADOW_INTERNAL_NAME(sh_remove_write_access,3,2), /* fl1_32 */
1905 #endif
1906 NULL, /* l2_32 */
1907 #if CONFIG_PAGING_LEVELS >= 3
1908 SHADOW_INTERNAL_NAME(sh_remove_write_access,3,3), /* l1_pae */
1909 SHADOW_INTERNAL_NAME(sh_remove_write_access,3,3), /* fl1_pae */
1910 #else
1911 NULL, /* l1_pae */
1912 NULL, /* fl1_pae */
1913 #endif
1914 NULL, /* l2_pae */
1915 NULL, /* l2h_pae */
1916 #if CONFIG_PAGING_LEVELS >= 4
1917 SHADOW_INTERNAL_NAME(sh_remove_write_access,4,4), /* l1_64 */
1918 SHADOW_INTERNAL_NAME(sh_remove_write_access,4,4), /* fl1_64 */
1919 #else
1920 NULL, /* l1_64 */
1921 NULL, /* fl1_64 */
1922 #endif
1923 NULL, /* l2_64 */
1924 NULL, /* l3_64 */
1925 NULL, /* l4_64 */
1926 NULL, /* p2m */
1927 NULL /* unused */
1928 };
1930 static unsigned int callback_mask =
1931 1 << SH_type_l1_32_shadow
1932 | 1 << SH_type_fl1_32_shadow
1933 | 1 << SH_type_l1_pae_shadow
1934 | 1 << SH_type_fl1_pae_shadow
1935 | 1 << SH_type_l1_64_shadow
1936 | 1 << SH_type_fl1_64_shadow
1938 struct page_info *pg = mfn_to_page(gmfn);
1940 ASSERT(shadow_lock_is_acquired(v->domain));
1942 /* Only remove writable mappings if we are doing shadow refcounts.
1943 * In guest refcounting, we trust Xen to already be restricting
1944 * all the writes to the guest page tables, so we do not need to
1945 * do more. */
1946 if ( !shadow_mode_refcounts(v->domain) )
1947 return 0;
1949 /* Early exit if it's already a pagetable, or otherwise not writeable */
1950 if ( sh_mfn_is_a_page_table(gmfn)
1951 || (pg->u.inuse.type_info & PGT_count_mask) == 0 )
1952 return 0;
1954 perfc_incrc(shadow_writeable);
1956 /* If this isn't a "normal" writeable page, the domain is trying to
1957 * put pagetables in special memory of some kind. We can't allow that. */
1958 if ( (pg->u.inuse.type_info & PGT_type_mask) != PGT_writable_page )
1960 SHADOW_ERROR("can't remove write access to mfn %lx, type_info is %"
1961 PRtype_info "\n",
1962 mfn_x(gmfn), mfn_to_page(gmfn)->u.inuse.type_info);
1963 domain_crash(v->domain);
1966 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
1967 if ( v == current && level != 0 )
1969 unsigned long gfn;
1970 /* Heuristic: there is likely to be only one writeable mapping,
1971 * and that mapping is likely to be in the current pagetable,
1972 * in the guest's linear map (on non-HIGHPTE linux and windows)*/
1974 #define GUESS(_a, _h) do { \
1975 if ( v->arch.shadow.mode->guess_wrmap(v, (_a), gmfn) ) \
1976 perfc_incrc(shadow_writeable_h_ ## _h); \
1977 if ( (pg->u.inuse.type_info & PGT_count_mask) == 0 ) \
1978 return 1; \
1979 } while (0)
1982 if ( v->arch.shadow.mode->guest_levels == 2 )
1984 if ( level == 1 )
1985 /* 32bit non-PAE w2k3: linear map at 0xC0000000 */
1986 GUESS(0xC0000000UL + (fault_addr >> 10), 1);
1988 /* Linux lowmem: first 896MB is mapped 1-to-1 above 0xC0000000 */
1989 if ((gfn = sh_mfn_to_gfn(v->domain, gmfn)) < 0x38000 )
1990 GUESS(0xC0000000UL + (gfn << PAGE_SHIFT), 4);
1993 #if CONFIG_PAGING_LEVELS >= 3
1994 else if ( v->arch.shadow.mode->guest_levels == 3 )
1996 /* 32bit PAE w2k3: linear map at 0xC0000000 */
1997 switch ( level )
1999 case 1: GUESS(0xC0000000UL + (fault_addr >> 9), 2); break;
2000 case 2: GUESS(0xC0600000UL + (fault_addr >> 18), 2); break;
2003 /* Linux lowmem: first 896MB is mapped 1-to-1 above 0xC0000000 */
2004 if ((gfn = sh_mfn_to_gfn(v->domain, gmfn)) < 0x38000 )
2005 GUESS(0xC0000000UL + (gfn << PAGE_SHIFT), 4);
2007 #if CONFIG_PAGING_LEVELS >= 4
2008 else if ( v->arch.shadow.mode->guest_levels == 4 )
2010 /* 64bit w2k3: linear map at 0x0000070000000000 */
2011 switch ( level )
2013 case 1: GUESS(0x70000000000UL + (fault_addr >> 9), 3); break;
2014 case 2: GUESS(0x70380000000UL + (fault_addr >> 18), 3); break;
2015 case 3: GUESS(0x70381C00000UL + (fault_addr >> 27), 3); break;
2018 /* 64bit Linux direct map at 0xffff810000000000; older kernels
2019 * had it at 0x0000010000000000UL */
2020 gfn = sh_mfn_to_gfn(v->domain, gmfn);
2021 GUESS(0xffff810000000000UL + (gfn << PAGE_SHIFT), 4);
2022 GUESS(0x0000010000000000UL + (gfn << PAGE_SHIFT), 4);
2024 #endif /* CONFIG_PAGING_LEVELS >= 4 */
2025 #endif /* CONFIG_PAGING_LEVELS >= 3 */
2027 #undef GUESS
2030 if ( (pg->u.inuse.type_info & PGT_count_mask) == 0 )
2031 return 1;
2033 /* Second heuristic: on HIGHPTE linux, there are two particular PTEs
2034 * (entries in the fixmap) where linux maps its pagetables. Since
2035 * we expect to hit them most of the time, we start the search for
2036 * the writeable mapping by looking at the same MFN where the last
2037 * brute-force search succeeded. */
2039 if ( v->arch.shadow.last_writeable_pte_smfn != 0 )
2041 unsigned long old_count = (pg->u.inuse.type_info & PGT_count_mask);
2042 mfn_t last_smfn = _mfn(v->arch.shadow.last_writeable_pte_smfn);
2043 int shtype = mfn_to_shadow_page(last_smfn)->type;
2045 if ( callbacks[shtype] )
2046 callbacks[shtype](v, last_smfn, gmfn);
2048 if ( (pg->u.inuse.type_info & PGT_count_mask) != old_count )
2049 perfc_incrc(shadow_writeable_h_5);
2052 if ( (pg->u.inuse.type_info & PGT_count_mask) == 0 )
2053 return 1;
2055 #endif /* SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC */
2057 /* Brute-force search of all the shadows, by walking the hash */
2058 perfc_incrc(shadow_writeable_bf);
2059 hash_foreach(v, callback_mask, callbacks, gmfn);
2061 /* If that didn't catch the mapping, something is very wrong */
2062 if ( (mfn_to_page(gmfn)->u.inuse.type_info & PGT_count_mask) != 0 )
2064 SHADOW_ERROR("can't find all writeable mappings of mfn %lx: "
2065 "%lu left\n", mfn_x(gmfn),
2066 (mfn_to_page(gmfn)->u.inuse.type_info&PGT_count_mask));
2067 domain_crash(v->domain);
2070 /* We killed at least one writeable mapping, so must flush TLBs. */
2071 return 1;
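/* The arithmetic behind the GUESS() heuristics above, worked through for the
 * guest linear maps (the addresses come straight from the code; this is only
 * an illustration):
 *  - 2-level guests: L1 entries are 4 bytes and each maps 4kB, so the PTE for
 *    virtual address A lives near linear_base + (A >> 12) * 4, hence the
 *    0xC0000000 + (fault_addr >> 10) guess.
 *  - 3/4-level guests: entries are 8 bytes, giving (A >> 12) * 8, i.e. the
 *    (fault_addr >> 9) guesses; L2 entries each map 2MB, giving (A >> 21) * 8,
 *    i.e. the (fault_addr >> 18) guesses; L3 entries each map 1GB, giving
 *    (A >> 30) * 8, i.e. the (fault_addr >> 27) guess.
 * Any low bits left over just land inside the guessed page, so they do not
 * change which mapping guess_wrmap() examines. */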
2076 /**************************************************************************/
2077 /* Remove all mappings of a guest frame from the shadow tables.
2078 * Returns non-zero if we need to flush TLBs. */
2080 int shadow_remove_all_mappings(struct vcpu *v, mfn_t gmfn)
2082 struct page_info *page = mfn_to_page(gmfn);
2083 int expected_count;
2085 /* Dispatch table for getting per-type functions */
2086 static hash_callback_t callbacks[16] = {
2087 NULL, /* none */
2088 #if CONFIG_PAGING_LEVELS == 2
2089 SHADOW_INTERNAL_NAME(sh_remove_all_mappings,2,2), /* l1_32 */
2090 SHADOW_INTERNAL_NAME(sh_remove_all_mappings,2,2), /* fl1_32 */
2091 #else
2092 SHADOW_INTERNAL_NAME(sh_remove_all_mappings,3,2), /* l1_32 */
2093 SHADOW_INTERNAL_NAME(sh_remove_all_mappings,3,2), /* fl1_32 */
2094 #endif
2095 NULL, /* l2_32 */
2096 #if CONFIG_PAGING_LEVELS >= 3
2097 SHADOW_INTERNAL_NAME(sh_remove_all_mappings,3,3), /* l1_pae */
2098 SHADOW_INTERNAL_NAME(sh_remove_all_mappings,3,3), /* fl1_pae */
2099 #else
2100 NULL, /* l1_pae */
2101 NULL, /* fl1_pae */
2102 #endif
2103 NULL, /* l2_pae */
2104 NULL, /* l2h_pae */
2105 #if CONFIG_PAGING_LEVELS >= 4
2106 SHADOW_INTERNAL_NAME(sh_remove_all_mappings,4,4), /* l1_64 */
2107 SHADOW_INTERNAL_NAME(sh_remove_all_mappings,4,4), /* fl1_64 */
2108 #else
2109 NULL, /* l1_64 */
2110 NULL, /* fl1_64 */
2111 #endif
2112 NULL, /* l2_64 */
2113 NULL, /* l3_64 */
2114 NULL, /* l4_64 */
2115 NULL, /* p2m */
2116 NULL /* unused */
2117 };
2119 static unsigned int callback_mask =
2120 1 << SH_type_l1_32_shadow
2121 | 1 << SH_type_fl1_32_shadow
2122 | 1 << SH_type_l1_pae_shadow
2123 | 1 << SH_type_fl1_pae_shadow
2124 | 1 << SH_type_l1_64_shadow
2125 | 1 << SH_type_fl1_64_shadow
2128 perfc_incrc(shadow_mappings);
2129 if ( (page->count_info & PGC_count_mask) == 0 )
2130 return 0;
2132 ASSERT(shadow_lock_is_acquired(v->domain));
2134 /* XXX TODO:
2135 * Heuristics for finding the (probably) single mapping of this gmfn */
2137 /* Brute-force search of all the shadows, by walking the hash */
2138 perfc_incrc(shadow_mappings_bf);
2139 hash_foreach(v, callback_mask, callbacks, gmfn);
2141 /* If that didn't catch the mapping, something is very wrong */
2142 expected_count = (page->count_info & PGC_allocated) ? 1 : 0;
2143 if ( (page->count_info & PGC_count_mask) != expected_count )
2145 /* Don't complain if we're in HVM and there's one extra mapping:
2146 * The qemu helper process has an untyped mapping of this dom's RAM */
2147 if ( !(shadow_mode_external(v->domain)
2148 && (page->count_info & PGC_count_mask) <= 2
2149 && (page->u.inuse.type_info & PGT_count_mask) == 0) )
2151 SHADOW_ERROR("can't find all mappings of mfn %lx: "
2152 "c=%08x t=%08lx\n", mfn_x(gmfn),
2153 page->count_info, page->u.inuse.type_info);
2157 /* We killed at least one mapping, so must flush TLBs. */
2158 return 1;
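/* What the expected_count check above encodes: once all shadow mappings are
 * gone, the only general references left should be the allocation reference
 * (PGC_allocated => expect 1, otherwise 0).  In external (HVM) mode one extra
 * untyped reference is tolerated because the device-model process keeps its
 * own mapping of the domain's RAM, which is why the error path quietly
 * accepts (count <= 2 && type count == 0) there. */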
2162 /**************************************************************************/
2163 /* Remove all shadows of a guest frame from the shadow tables */
2165 static int sh_remove_shadow_via_pointer(struct vcpu *v, mfn_t smfn)
2166 /* Follow this shadow's up-pointer, if it has one, and remove the reference
2167 * found there. Returns 1 if that was the only reference to this shadow */
2169 struct shadow_page_info *sp = mfn_to_shadow_page(smfn);
2170 mfn_t pmfn;
2171 void *vaddr;
2172 int rc;
2174 ASSERT(sp->type > 0);
2175 ASSERT(sp->type < SH_type_max_shadow);
2176 ASSERT(sp->type != SH_type_l2_32_shadow);
2177 ASSERT(sp->type != SH_type_l2_pae_shadow);
2178 ASSERT(sp->type != SH_type_l2h_pae_shadow);
2179 ASSERT(sp->type != SH_type_l4_64_shadow);
2181 if (sp->up == 0) return 0;
2182 pmfn = _mfn(sp->up >> PAGE_SHIFT);
2183 ASSERT(mfn_valid(pmfn));
2184 vaddr = sh_map_domain_page(pmfn);
2185 ASSERT(vaddr);
2186 vaddr += sp->up & (PAGE_SIZE-1);
2187 ASSERT(l1e_get_pfn(*(l1_pgentry_t *)vaddr) == mfn_x(smfn));
2189 /* Is this the only reference to this shadow? */
2190 rc = (sp->count == 1) ? 1 : 0;
2192 /* Blank the offending entry */
2193 switch (sp->type)
2195 case SH_type_l1_32_shadow:
2196 case SH_type_l2_32_shadow:
2197 #if CONFIG_PAGING_LEVELS == 2
2198 SHADOW_INTERNAL_NAME(sh_clear_shadow_entry,2,2)(v, vaddr, pmfn);
2199 #else
2200 SHADOW_INTERNAL_NAME(sh_clear_shadow_entry,3,2)(v, vaddr, pmfn);
2201 #endif
2202 break;
2203 #if CONFIG_PAGING_LEVELS >=3
2204 case SH_type_l1_pae_shadow:
2205 case SH_type_l2_pae_shadow:
2206 case SH_type_l2h_pae_shadow:
2207 SHADOW_INTERNAL_NAME(sh_clear_shadow_entry,3,3)(v, vaddr, pmfn);
2208 break;
2209 #if CONFIG_PAGING_LEVELS >= 4
2210 case SH_type_l1_64_shadow:
2211 case SH_type_l2_64_shadow:
2212 case SH_type_l3_64_shadow:
2213 case SH_type_l4_64_shadow:
2214 SHADOW_INTERNAL_NAME(sh_clear_shadow_entry,4,4)(v, vaddr, pmfn);
2215 break;
2216 #endif
2217 #endif
2218 default: BUG(); /* Some weird unknown shadow type */
2221 sh_unmap_domain_page(vaddr);
2222 if ( rc )
2223 perfc_incrc(shadow_up_pointer);
2224 else
2225 perfc_incrc(shadow_unshadow_bf);
2227 return rc;
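/* A worked example of the up-pointer encoding consumed above.  sp->up packs
 * the MFN of the shadow page holding the reference together with the byte
 * offset of the referencing entry inside that page:
 *
 *     up     = (mfn_x(pmfn) << PAGE_SHIFT) | byte_offset;
 *     pmfn   = _mfn(up >> PAGE_SHIFT);
 *     offset = up & (PAGE_SIZE - 1);
 *
 * e.g. (made-up numbers) up == 0x1a3ff8 decodes to parent mfn 0x1a3 and
 * offset 0xff8, i.e. the last 8-byte entry of that shadow page. */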
2230 void sh_remove_shadows(struct vcpu *v, mfn_t gmfn, int fast, int all)
2231 /* Remove the shadows of this guest page.
2232 * If fast != 0, just try the quick heuristic, which will remove
2233 * at most one reference to each shadow of the page. Otherwise, walk
2234 * all the shadow tables looking for refs to shadows of this gmfn.
2235 * If all != 0, kill the domain if we can't find all the shadows.
2236 * (all != 0 implies fast == 0)
2237 */
2239 struct page_info *pg;
2240 mfn_t smfn;
2241 u32 sh_flags;
2242 unsigned char t;
2244 /* Dispatch table for getting per-type functions: each level must
2245 * be called with the function to remove a lower-level shadow. */
2246 static hash_callback_t callbacks[16] = {
2247 NULL, /* none */
2248 NULL, /* l1_32 */
2249 NULL, /* fl1_32 */
2250 #if CONFIG_PAGING_LEVELS == 2
2251 SHADOW_INTERNAL_NAME(sh_remove_l1_shadow,2,2), /* l2_32 */
2252 #else
2253 SHADOW_INTERNAL_NAME(sh_remove_l1_shadow,3,2), /* l2_32 */
2254 #endif
2255 NULL, /* l1_pae */
2256 NULL, /* fl1_pae */
2257 #if CONFIG_PAGING_LEVELS >= 3
2258 SHADOW_INTERNAL_NAME(sh_remove_l1_shadow,3,3), /* l2_pae */
2259 SHADOW_INTERNAL_NAME(sh_remove_l1_shadow,3,3), /* l2h_pae */
2260 #else
2261 NULL, /* l2_pae */
2262 NULL, /* l2h_pae */
2263 #endif
2264 NULL, /* l1_64 */
2265 NULL, /* fl1_64 */
2266 #if CONFIG_PAGING_LEVELS >= 4
2267 SHADOW_INTERNAL_NAME(sh_remove_l1_shadow,4,4), /* l2_64 */
2268 SHADOW_INTERNAL_NAME(sh_remove_l2_shadow,4,4), /* l3_64 */
2269 SHADOW_INTERNAL_NAME(sh_remove_l3_shadow,4,4), /* l4_64 */
2270 #else
2271 NULL, /* l2_64 */
2272 NULL, /* l3_64 */
2273 NULL, /* l4_64 */
2274 #endif
2275 NULL, /* p2m */
2276 NULL /* unused */
2277 };
2279 /* Another lookup table, for choosing which mask to use */
2280 static unsigned int masks[16] = {
2281 0, /* none */
2282 1 << SH_type_l2_32_shadow, /* l1_32 */
2283 0, /* fl1_32 */
2284 0, /* l2_32 */
2285 ((1 << SH_type_l2h_pae_shadow)
2286 | (1 << SH_type_l2_pae_shadow)), /* l1_pae */
2287 0, /* fl1_pae */
2288 0, /* l2_pae */
2289 0, /* l2h_pae */
2290 1 << SH_type_l2_64_shadow, /* l1_64 */
2291 0, /* fl1_64 */
2292 1 << SH_type_l3_64_shadow, /* l2_64 */
2293 1 << SH_type_l4_64_shadow, /* l3_64 */
2294 0, /* l4_64 */
2295 0, /* p2m */
2296 0 /* unused */
2297 };
2299 ASSERT(shadow_lock_is_acquired(v->domain));
2300 ASSERT(!(all && fast));
2302 pg = mfn_to_page(gmfn);
2304 /* Bail out now if the page is not shadowed */
2305 if ( (pg->count_info & PGC_page_table) == 0 )
2306 return;
2308 SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx\n",
2309 v->domain->domain_id, v->vcpu_id, mfn_x(gmfn));
2311 /* Search for this shadow in all appropriate shadows */
2312 perfc_incrc(shadow_unshadow);
2313 sh_flags = pg->shadow_flags;
2315 /* Lower-level shadows need to be excised from upper-level shadows.
2316 * This call to hash_foreach() looks dangerous but is in fact OK: each
2317 * call will remove at most one shadow, and terminate immediately when
2318 * it does remove it, so we never walk the hash after doing a deletion. */
2319 #define DO_UNSHADOW(_type) do { \
2320 t = (_type); \
2321 smfn = shadow_hash_lookup(v, mfn_x(gmfn), t); \
2322 if ( sh_type_is_pinnable(v, t) ) \
2323 sh_unpin(v, smfn); \
2324 else \
2325 sh_remove_shadow_via_pointer(v, smfn); \
2326 if ( (pg->count_info & PGC_page_table) && !fast ) \
2327 hash_foreach(v, masks[t], callbacks, smfn); \
2328 } while (0)
2330 if ( sh_flags & SHF_L1_32 ) DO_UNSHADOW(SH_type_l1_32_shadow);
2331 if ( sh_flags & SHF_L2_32 ) DO_UNSHADOW(SH_type_l2_32_shadow);
2332 #if CONFIG_PAGING_LEVELS >= 3
2333 if ( sh_flags & SHF_L1_PAE ) DO_UNSHADOW(SH_type_l1_pae_shadow);
2334 if ( sh_flags & SHF_L2_PAE ) DO_UNSHADOW(SH_type_l2_pae_shadow);
2335 if ( sh_flags & SHF_L2H_PAE ) DO_UNSHADOW(SH_type_l2h_pae_shadow);
2336 #if CONFIG_PAGING_LEVELS >= 4
2337 if ( sh_flags & SHF_L1_64 ) DO_UNSHADOW(SH_type_l1_64_shadow);
2338 if ( sh_flags & SHF_L2_64 ) DO_UNSHADOW(SH_type_l2_64_shadow);
2339 if ( sh_flags & SHF_L3_64 ) DO_UNSHADOW(SH_type_l3_64_shadow);
2340 if ( sh_flags & SHF_L4_64 ) DO_UNSHADOW(SH_type_l4_64_shadow);
2341 #endif
2342 #endif
2344 #undef DO_UNSHADOW
2346 /* If that didn't catch the shadows, something is wrong */
2347 if ( !fast && (pg->count_info & PGC_page_table) )
2349 SHADOW_ERROR("can't find all shadows of mfn %05lx "
2350 "(shadow_flags=%08lx)\n",
2351 mfn_x(gmfn), pg->shadow_flags);
2352 if ( all )
2353 domain_crash(v->domain);
2356 /* Need to flush TLBs now, so that linear maps are safe next time we
2357 * take a fault. */
2358 flush_tlb_mask(v->domain->domain_dirty_cpumask);
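/* How DO_UNSHADOW() and the masks[] table combine, taking an l1_pae shadow as
 * the example: the shadow is found by hash lookup and either unpinned or
 * unhooked via its up-pointer; then, because masks[SH_type_l1_pae_shadow]
 * selects the l2_pae and l2h_pae types, the hash_foreach() pass calls
 * sh_remove_l1_shadow() on every such l2 shadow that might still reference
 * it.  Each callback removes at most one reference and terminates the walk
 * when it does, which is what makes modifying the hash from inside the walk
 * safe here. */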
2361 void
2362 shadow_remove_all_shadows_and_parents(struct vcpu *v, mfn_t gmfn)
2363 /* Even harsher: this is an HVM page that we think is no longer a pagetable.
2364 * Unshadow it, and recursively unshadow pages that reference it. */
2366 shadow_remove_all_shadows(v, gmfn);
2367 /* XXX TODO:
2368 * Rework this hashtable walker to return a linked-list of all
2369 * the shadows it modified, then do breadth-first recursion
2370 * to find the way up to higher-level tables and unshadow them too.
2372 * The current code (just tearing down each page's shadows as we
2373 * detect that it is not a pagetable) is correct, but very slow.
2374 * It means extra emulated writes and slows down removal of mappings. */
2377 /**************************************************************************/
2379 void sh_update_paging_modes(struct vcpu *v)
2381 struct domain *d = v->domain;
2382 struct shadow_paging_mode *old_mode = v->arch.shadow.mode;
2383 mfn_t old_guest_table;
2385 ASSERT(shadow_lock_is_acquired(d));
2387 // Valid transitions handled by this function:
2388 // - For PV guests:
2389 // - after a shadow mode has been changed
2390 // - For HVM guests:
2391 // - after a shadow mode has been changed
2392 // - changes in CR0.PG, CR4.PAE, CR4.PSE, or CR4.PGE
2393 //
2395 // First, tear down any old shadow tables held by this vcpu.
2396 //
2397 shadow_detach_old_tables(v);
2399 if ( !is_hvm_domain(d) )
2401 ///
2402 /// PV guest
2403 ///
2404 #if CONFIG_PAGING_LEVELS == 4
2405 if ( pv_32bit_guest(v) )
2406 v->arch.shadow.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,4,3);
2407 else
2408 v->arch.shadow.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,4,4);
2409 #elif CONFIG_PAGING_LEVELS == 3
2410 v->arch.shadow.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,3,3);
2411 #elif CONFIG_PAGING_LEVELS == 2
2412 v->arch.shadow.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,2,2);
2413 #else
2414 #error unexpected paging mode
2415 #endif
2416 v->arch.shadow.translate_enabled = !!shadow_mode_translate(d);
2418 else
2420 ///
2421 /// HVM guest
2422 ///
2423 ASSERT(shadow_mode_translate(d));
2424 ASSERT(shadow_mode_external(d));
2426 v->arch.shadow.translate_enabled = !!hvm_paging_enabled(v);
2427 if ( !v->arch.shadow.translate_enabled )
2429 /* Set v->arch.guest_table to use the p2m map, and choose
2430 * the appropriate shadow mode */
2431 old_guest_table = pagetable_get_mfn(v->arch.guest_table);
2432 #if CONFIG_PAGING_LEVELS == 2
2433 v->arch.guest_table =
2434 pagetable_from_pfn(pagetable_get_pfn(d->arch.phys_table));
2435 v->arch.shadow.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,2,2);
2436 #elif CONFIG_PAGING_LEVELS == 3
2437 v->arch.guest_table =
2438 pagetable_from_pfn(pagetable_get_pfn(d->arch.phys_table));
2439 v->arch.shadow.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,3,3);
2440 #else /* CONFIG_PAGING_LEVELS == 4 */
2442 l4_pgentry_t *l4e;
2443 /* Use the start of the first l3 table as a PAE l3 */
2444 ASSERT(pagetable_get_pfn(d->arch.phys_table) != 0);
2445 l4e = sh_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
2446 ASSERT(l4e_get_flags(l4e[0]) & _PAGE_PRESENT);
2447 v->arch.guest_table =
2448 pagetable_from_pfn(l4e_get_pfn(l4e[0]));
2449 sh_unmap_domain_page(l4e);
2451 v->arch.shadow.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,3,3);
2452 #endif
2453 /* Fix up refcounts on guest_table */
2454 get_page(mfn_to_page(pagetable_get_mfn(v->arch.guest_table)), d);
2455 if ( mfn_x(old_guest_table) != 0 )
2456 put_page(mfn_to_page(old_guest_table));
2458 else
2460 #ifdef __x86_64__
2461 if ( hvm_long_mode_enabled(v) )
2463 // long mode guest...
2464 v->arch.shadow.mode =
2465 &SHADOW_INTERNAL_NAME(sh_paging_mode, 4, 4);
2467 else
2468 #endif
2469 if ( hvm_pae_enabled(v) )
2471 #if CONFIG_PAGING_LEVELS >= 3
2472 // 32-bit PAE mode guest...
2473 v->arch.shadow.mode =
2474 &SHADOW_INTERNAL_NAME(sh_paging_mode, 3, 3);
2475 #else
2476 SHADOW_ERROR("PAE not supported in 32-bit Xen\n");
2477 domain_crash(d);
2478 return;
2479 #endif
2481 else
2483 // 32-bit 2 level guest...
2484 #if CONFIG_PAGING_LEVELS >= 3
2485 v->arch.shadow.mode =
2486 &SHADOW_INTERNAL_NAME(sh_paging_mode, 3, 2);
2487 #else
2488 v->arch.shadow.mode =
2489 &SHADOW_INTERNAL_NAME(sh_paging_mode, 2, 2);
2490 #endif
2494 if ( pagetable_is_null(v->arch.monitor_table) )
2496 mfn_t mmfn = shadow_make_monitor_table(v);
2497 v->arch.monitor_table = pagetable_from_mfn(mmfn);
2500 if ( v->arch.shadow.mode != old_mode )
2502 SHADOW_PRINTK("new paging mode: d=%u v=%u pe=%d g=%u s=%u "
2503 "(was g=%u s=%u)\n",
2504 d->domain_id, v->vcpu_id,
2505 is_hvm_domain(d) ? !!hvm_paging_enabled(v) : 1,
2506 v->arch.shadow.mode->guest_levels,
2507 v->arch.shadow.mode->shadow_levels,
2508 old_mode ? old_mode->guest_levels : 0,
2509 old_mode ? old_mode->shadow_levels : 0);
2510 if ( old_mode &&
2511 (v->arch.shadow.mode->shadow_levels !=
2512 old_mode->shadow_levels) )
2514 /* Need to make a new monitor table for the new mode */
2515 mfn_t new_mfn, old_mfn;
2517 if ( v != current )
2519 SHADOW_ERROR("Some third party (d=%u v=%u) is changing "
2520 "this HVM vcpu's (d=%u v=%u) paging mode!\n",
2521 current->domain->domain_id, current->vcpu_id,
2522 v->domain->domain_id, v->vcpu_id);
2523 domain_crash(v->domain);
2524 return;
2527 old_mfn = pagetable_get_mfn(v->arch.monitor_table);
2528 v->arch.monitor_table = pagetable_null();
2529 new_mfn = v->arch.shadow.mode->make_monitor_table(v);
2530 v->arch.monitor_table = pagetable_from_mfn(new_mfn);
2531 SHADOW_PRINTK("new monitor table %"SH_PRI_mfn "\n",
2532 mfn_x(new_mfn));
2534 /* Don't be running on the old monitor table when we
2535 * pull it down! Switch CR3, and warn the HVM code that
2536 * its host cr3 has changed. */
2537 make_cr3(v, mfn_x(new_mfn));
2538 write_ptbase(v);
2539 hvm_update_host_cr3(v);
2540 old_mode->destroy_monitor_table(v, old_mfn);
2544 // XXX -- Need to deal with changes in CR4.PSE and CR4.PGE.
2545 // These are HARD: think about the case where two CPU's have
2546 // different values for CR4.PSE and CR4.PGE at the same time.
2547 // This *does* happen, at least for CR4.PGE...
2550 v->arch.shadow.mode->update_cr3(v);
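/* Reading the SHADOW_INTERNAL_NAME(sh_paging_mode, s, g) selections above:
 * the first number is the format of the shadow pagetables and the second is
 * the guest's own paging format.  For instance, a 32-bit non-PAE HVM guest
 * on a PAE or 64-bit hypervisor gets mode (3,2), i.e. PAE-format shadows
 * backing a 2-level guest, via the "32-bit 2 level guest" branch. */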
2553 /**************************************************************************/
2554 /* Turning on and off shadow features */
2556 static void sh_new_mode(struct domain *d, u32 new_mode)
2557 /* Inform all the vcpus that the shadow mode has been changed */
2559 struct vcpu *v;
2561 ASSERT(shadow_lock_is_acquired(d));
2562 ASSERT(d != current->domain);
2563 d->arch.shadow.mode = new_mode;
2564 if ( new_mode & SHM2_translate )
2565 shadow_audit_p2m(d);
2566 for_each_vcpu(d, v)
2567 sh_update_paging_modes(v);
2570 int shadow_enable(struct domain *d, u32 mode)
2571 /* Turn on "permanent" shadow features: external, translate, refcount.
2572 * Can only be called once on a domain, and these features cannot be
2573 * disabled.
2574 * Returns 0 for success, -errno for failure. */
2576 unsigned int old_pages;
2577 int rv = 0;
2579 mode |= SHM2_enable;
2581 domain_pause(d);
2582 shadow_lock(d);
2584 /* Sanity check the arguments */
2585 if ( (d == current->domain) ||
2586 shadow_mode_enabled(d) ||
2587 ((mode & SHM2_translate) && !(mode & SHM2_refcounts)) ||
2588 ((mode & SHM2_external) && !(mode & SHM2_translate)) )
2590 rv = -EINVAL;
2591 goto out;
2594 // XXX -- eventually would like to require that all memory be allocated
2595 // *after* shadow_enable() is called... So here, we would test to make
2596 // sure that d->page_list is empty.
2597 #if 0
2598 spin_lock(&d->page_alloc_lock);
2599 if ( !list_empty(&d->page_list) )
2601 spin_unlock(&d->page_alloc_lock);
2602 rv = -EINVAL;
2603 goto out;
2605 spin_unlock(&d->page_alloc_lock);
2606 #endif
2608 /* Init the shadow memory allocation if the user hasn't done so */
2609 old_pages = d->arch.shadow.total_pages;
2610 if ( old_pages == 0 )
2611 if ( set_sh_allocation(d, 256, NULL) != 0 ) /* Use at least 1MB */
2613 set_sh_allocation(d, 0, NULL);
2614 rv = -ENOMEM;
2615 goto out;
2618 /* Init the hash table */
2619 if ( shadow_hash_alloc(d) != 0 )
2621 set_sh_allocation(d, old_pages, NULL);
2622 rv = -ENOMEM;
2623 goto out;
2626 /* Init the P2M table */
2627 if ( mode & SHM2_translate )
2628 if ( !shadow_alloc_p2m_table(d) )
2630 shadow_hash_teardown(d);
2631 set_sh_allocation(d, old_pages, NULL);
2632 shadow_p2m_teardown(d);
2633 rv = -ENOMEM;
2634 goto out;
2637 #if (SHADOW_OPTIMIZATIONS & SHOPT_LINUX_L3_TOPLEVEL)
2638 /* We assume we're dealing with an older 64bit linux guest until we
2639 * see the guest use more than one l4 per vcpu. */
2640 d->arch.shadow.opt_flags = SHOPT_LINUX_L3_TOPLEVEL;
2641 #endif
2643 /* Update the bits */
2644 sh_new_mode(d, mode);
2645 shadow_audit_p2m(d);
2646 out:
2647 shadow_unlock(d);
2648 domain_unpause(d);
2649 return rv;
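/* The mode bits accepted here nest: SHM2_translate requires SHM2_refcounts,
 * and SHM2_external requires SHM2_translate (see the sanity check above).
 * A rough sketch of the two usual combinations, inferred from that check and
 * from the domctl dispatcher later in this file rather than from the actual
 * call sites:
 *
 *     shadow_enable(d, SHM2_refcounts | SHM2_translate);
 *     shadow_enable(d, SHM2_refcounts | SHM2_translate | SHM2_external);
 *
 * SHM2_enable itself is OR'ed in by shadow_enable(). */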
2652 void shadow_teardown(struct domain *d)
2653 /* Destroy the shadow pagetables of this domain and free its shadow memory.
2654 * Should only be called for dying domains. */
2656 struct vcpu *v;
2657 mfn_t mfn;
2659 ASSERT(test_bit(_DOMF_dying, &d->domain_flags));
2660 ASSERT(d != current->domain);
2662 if ( !shadow_lock_is_acquired(d) )
2663 shadow_lock(d); /* Keep various asserts happy */
2665 if ( shadow_mode_enabled(d) )
2667 /* Release the shadow and monitor tables held by each vcpu */
2668 for_each_vcpu(d, v)
2670 shadow_detach_old_tables(v);
2671 if ( shadow_mode_external(d) )
2673 mfn = pagetable_get_mfn(v->arch.monitor_table);
2674 if ( mfn_valid(mfn) && (mfn_x(mfn) != 0) )
2675 shadow_destroy_monitor_table(v, mfn);
2676 v->arch.monitor_table = pagetable_null();
2681 if ( d->arch.shadow.total_pages != 0 )
2683 SHADOW_PRINTK("teardown of domain %u starts."
2684 " Shadow pages total = %u, free = %u, p2m=%u\n",
2685 d->domain_id,
2686 d->arch.shadow.total_pages,
2687 d->arch.shadow.free_pages,
2688 d->arch.shadow.p2m_pages);
2689 /* Destroy all the shadows and release memory to domheap */
2690 set_sh_allocation(d, 0, NULL);
2691 /* Release the hash table back to xenheap */
2692 if (d->arch.shadow.hash_table)
2693 shadow_hash_teardown(d);
2694 /* Release the log-dirty bitmap of dirtied pages */
2695 sh_free_log_dirty_bitmap(d);
2696 /* Should not have any more memory held */
2697 SHADOW_PRINTK("teardown done."
2698 " Shadow pages total = %u, free = %u, p2m=%u\n",
2699 d->arch.shadow.total_pages,
2700 d->arch.shadow.free_pages,
2701 d->arch.shadow.p2m_pages);
2702 ASSERT(d->arch.shadow.total_pages == 0);
2705 /* We leave the "permanent" shadow modes enabled, but clear the
2706 * log-dirty mode bit. We don't want any more mark_dirty()
2707 * calls now that we've torn down the bitmap */
2708 d->arch.shadow.mode &= ~SHM2_log_dirty;
2710 shadow_unlock(d);
2713 void shadow_final_teardown(struct domain *d)
2714 /* Called by arch_domain_destroy(), when it's safe to pull down the p2m map. */
2717 SHADOW_PRINTK("dom %u final teardown starts."
2718 " Shadow pages total = %u, free = %u, p2m=%u\n",
2719 d->domain_id,
2720 d->arch.shadow.total_pages,
2721 d->arch.shadow.free_pages,
2722 d->arch.shadow.p2m_pages);
2724 /* Double-check that the domain didn't have any shadow memory.
2725 * It is possible for a domain that never got domain_kill()ed
2726 * to get here with its shadow allocation intact. */
2727 if ( d->arch.shadow.total_pages != 0 )
2728 shadow_teardown(d);
2730 /* It is now safe to pull down the p2m map. */
2731 if ( d->arch.shadow.p2m_pages != 0 )
2732 shadow_p2m_teardown(d);
2734 SHADOW_PRINTK("dom %u final teardown done."
2735 " Shadow pages total = %u, free = %u, p2m=%u\n",
2736 d->domain_id,
2737 d->arch.shadow.total_pages,
2738 d->arch.shadow.free_pages,
2739 d->arch.shadow.p2m_pages);
2742 static int shadow_one_bit_enable(struct domain *d, u32 mode)
2743 /* Turn on a single shadow mode feature */
2745 ASSERT(shadow_lock_is_acquired(d));
2747 /* Sanity check the call */
2748 if ( d == current->domain || (d->arch.shadow.mode & mode) )
2750 return -EINVAL;
2753 if ( d->arch.shadow.mode == 0 )
2755 /* Init the shadow memory allocation and the hash table */
2756 if ( set_sh_allocation(d, 1, NULL) != 0
2757 || shadow_hash_alloc(d) != 0 )
2759 set_sh_allocation(d, 0, NULL);
2760 return -ENOMEM;
2764 /* Update the bits */
2765 sh_new_mode(d, d->arch.shadow.mode | mode);
2767 return 0;
2770 static int shadow_one_bit_disable(struct domain *d, u32 mode)
2771 /* Turn off a single shadow mode feature */
2773 struct vcpu *v;
2774 ASSERT(shadow_lock_is_acquired(d));
2776 /* Sanity check the call */
2777 if ( d == current->domain || !(d->arch.shadow.mode & mode) )
2779 return -EINVAL;
2782 /* Update the bits */
2783 sh_new_mode(d, d->arch.shadow.mode & ~mode);
2784 if ( d->arch.shadow.mode == 0 )
2786 /* Get this domain off shadows */
2787 SHADOW_PRINTK("un-shadowing of domain %u starts."
2788 " Shadow pages total = %u, free = %u, p2m=%u\n",
2789 d->domain_id,
2790 d->arch.shadow.total_pages,
2791 d->arch.shadow.free_pages,
2792 d->arch.shadow.p2m_pages);
2793 for_each_vcpu(d, v)
2795 shadow_detach_old_tables(v);
2796 #if CONFIG_PAGING_LEVELS == 4
2797 if ( !(v->arch.flags & TF_kernel_mode) )
2798 make_cr3(v, pagetable_get_pfn(v->arch.guest_table_user));
2799 else
2800 #endif
2801 make_cr3(v, pagetable_get_pfn(v->arch.guest_table));
2805 /* Pull down the memory allocation */
2806 if ( set_sh_allocation(d, 0, NULL) != 0 )
2808 // XXX - How can this occur?
2809 // Seems like a bug to return an error now that we've
2810 // disabled the relevant shadow mode.
2811 //
2812 return -ENOMEM;
2814 shadow_hash_teardown(d);
2815 SHADOW_PRINTK("un-shadowing of domain %u done."
2816 " Shadow pages total = %u, free = %u, p2m=%u\n",
2817 d->domain_id,
2818 d->arch.shadow.total_pages,
2819 d->arch.shadow.free_pages,
2820 d->arch.shadow.p2m_pages);
2823 return 0;
2826 /* Enable/disable ops for the "test" and "log-dirty" modes */
2827 int shadow_test_enable(struct domain *d)
2829 int ret;
2831 domain_pause(d);
2832 shadow_lock(d);
2834 if ( shadow_mode_enabled(d) )
2836 SHADOW_ERROR("Don't support enabling test mode"
2837 " on already shadowed doms\n");
2838 ret = -EINVAL;
2839 goto out;
2842 ret = shadow_one_bit_enable(d, SHM2_enable);
2843 out:
2844 shadow_unlock(d);
2845 domain_unpause(d);
2847 return ret;
2850 int shadow_test_disable(struct domain *d)
2852 int ret;
2854 domain_pause(d);
2855 shadow_lock(d);
2856 ret = shadow_one_bit_disable(d, SHM2_enable);
2857 shadow_unlock(d);
2858 domain_unpause(d);
2860 return ret;
2863 static int
2864 sh_alloc_log_dirty_bitmap(struct domain *d)
2866 ASSERT(d->arch.shadow.dirty_bitmap == NULL);
2867 d->arch.shadow.dirty_bitmap_size =
2868 (d->shared_info->arch.max_pfn + (BITS_PER_LONG - 1)) &
2869 ~(BITS_PER_LONG - 1);
2870 d->arch.shadow.dirty_bitmap =
2871 xmalloc_array(unsigned long,
2872 d->arch.shadow.dirty_bitmap_size / BITS_PER_LONG);
2873 if ( d->arch.shadow.dirty_bitmap == NULL )
2875 d->arch.shadow.dirty_bitmap_size = 0;
2876 return -ENOMEM;
2878 memset(d->arch.shadow.dirty_bitmap, 0, d->arch.shadow.dirty_bitmap_size/8);
2880 return 0;
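/* A worked example of the sizing arithmetic above, assuming a 64-bit build
 * (BITS_PER_LONG == 64) and max_pfn == 100000: (100000 + 63) & ~63 == 100032
 * bits, so xmalloc_array() hands back 100032/64 == 1563 unsigned longs and
 * the memset() clears 100032/8 == 12504 bytes.  Rounding up to whole longs
 * matters because the copy in shadow_log_dirty_op() below works in units of
 * unsigned long. */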
2883 static void
2884 sh_free_log_dirty_bitmap(struct domain *d)
2886 d->arch.shadow.dirty_bitmap_size = 0;
2887 if ( d->arch.shadow.dirty_bitmap )
2889 xfree(d->arch.shadow.dirty_bitmap);
2890 d->arch.shadow.dirty_bitmap = NULL;
2894 static int shadow_log_dirty_enable(struct domain *d)
2896 int ret;
2898 domain_pause(d);
2899 shadow_lock(d);
2901 if ( shadow_mode_log_dirty(d) )
2903 ret = -EINVAL;
2904 goto out;
2907 if ( shadow_mode_enabled(d) )
2909 SHADOW_ERROR("Don't (yet) support enabling log-dirty"
2910 " on already shadowed doms\n");
2911 ret = -EINVAL;
2912 goto out;
2915 ret = sh_alloc_log_dirty_bitmap(d);
2916 if ( ret != 0 )
2918 sh_free_log_dirty_bitmap(d);
2919 goto out;
2922 ret = shadow_one_bit_enable(d, SHM2_log_dirty);
2923 if ( ret != 0 )
2924 sh_free_log_dirty_bitmap(d);
2926 out:
2927 shadow_unlock(d);
2928 domain_unpause(d);
2929 return ret;
2932 static int shadow_log_dirty_disable(struct domain *d)
2934 int ret;
2936 domain_pause(d);
2937 shadow_lock(d);
2938 ret = shadow_one_bit_disable(d, SHM2_log_dirty);
2939 if ( !shadow_mode_log_dirty(d) )
2940 sh_free_log_dirty_bitmap(d);
2941 shadow_unlock(d);
2942 domain_unpause(d);
2944 return ret;
2947 /**************************************************************************/
2948 /* P2M map manipulations */
2950 static void
2951 sh_p2m_remove_page(struct domain *d, unsigned long gfn, unsigned long mfn)
2953 struct vcpu *v;
2955 if ( !shadow_mode_translate(d) )
2956 return;
2958 v = current;
2959 if ( v->domain != d )
2960 v = d->vcpu[0];
2962 SHADOW_DEBUG(P2M, "removing gfn=%#lx mfn=%#lx\n", gfn, mfn);
2964 ASSERT(mfn_x(sh_gfn_to_mfn(d, gfn)) == mfn);
2965 //ASSERT(sh_mfn_to_gfn(d, mfn) == gfn);
2967 if ( v != NULL )
2969 shadow_remove_all_shadows_and_parents(v, _mfn(mfn));
2970 if ( shadow_remove_all_mappings(v, _mfn(mfn)) )
2971 flush_tlb_mask(d->domain_dirty_cpumask);
2974 shadow_set_p2m_entry(d, gfn, _mfn(INVALID_MFN));
2975 set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY);
2978 void
2979 shadow_guest_physmap_remove_page(struct domain *d, unsigned long gfn,
2980 unsigned long mfn)
2982 shadow_lock(d);
2983 shadow_audit_p2m(d);
2984 sh_p2m_remove_page(d, gfn, mfn);
2985 shadow_audit_p2m(d);
2986 shadow_unlock(d);
2989 void
2990 shadow_guest_physmap_add_page(struct domain *d, unsigned long gfn,
2991 unsigned long mfn)
2993 unsigned long ogfn;
2994 mfn_t omfn;
2996 if ( !shadow_mode_translate(d) )
2997 return;
2999 shadow_lock(d);
3000 shadow_audit_p2m(d);
3002 SHADOW_DEBUG(P2M, "adding gfn=%#lx mfn=%#lx\n", gfn, mfn);
3004 omfn = sh_gfn_to_mfn(d, gfn);
3005 if ( mfn_valid(omfn) )
3007 /* Get rid of the old mapping, especially any shadows */
3008 struct vcpu *v = current;
3009 if ( v->domain != d )
3010 v = d->vcpu[0];
3011 if ( v != NULL )
3013 shadow_remove_all_shadows_and_parents(v, omfn);
3014 if ( shadow_remove_all_mappings(v, omfn) )
3015 flush_tlb_mask(d->domain_dirty_cpumask);
3017 set_gpfn_from_mfn(mfn_x(omfn), INVALID_M2P_ENTRY);
3020 ogfn = sh_mfn_to_gfn(d, _mfn(mfn));
3021 if (
3022 #ifdef __x86_64__
3023 (ogfn != 0x5555555555555555L)
3024 #else
3025 (ogfn != 0x55555555L)
3026 #endif
3027 && (ogfn != INVALID_M2P_ENTRY)
3028 && (ogfn != gfn) )
3030 /* This machine frame is already mapped at another physical address */
3031 SHADOW_DEBUG(P2M, "aliased! mfn=%#lx, old gfn=%#lx, new gfn=%#lx\n",
3032 mfn, ogfn, gfn);
3033 if ( mfn_valid(omfn = sh_gfn_to_mfn(d, ogfn)) )
3035 SHADOW_DEBUG(P2M, "old gfn=%#lx -> mfn %#lx\n",
3036 ogfn , mfn_x(omfn));
3037 if ( mfn_x(omfn) == mfn )
3038 sh_p2m_remove_page(d, ogfn, mfn);
3042 shadow_set_p2m_entry(d, gfn, _mfn(mfn));
3043 set_gpfn_from_mfn(mfn, gfn);
3044 shadow_audit_p2m(d);
3045 shadow_unlock(d);
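/* The 0x5555...5555 value tested above appears to be the "never written"
 * pattern for M2P entries: the p2m audit later in this file treats a gfn of
 * 0x55555555 as a debug value rather than a real mapping, so an mfn whose
 * M2P slot still holds it is not treated as aliased here.  (That the M2P
 * table starts out filled with this byte pattern is an inference from these
 * checks, not something stated in this file.) */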
3048 /**************************************************************************/
3049 /* Log-dirty mode support */
3051 /* Convert a shadow to log-dirty mode. */
3052 void shadow_convert_to_log_dirty(struct vcpu *v, mfn_t smfn)
3054 BUG();
3058 /* Read a domain's log-dirty bitmap and stats.
3059 * If the operation is a CLEAN, clear the bitmap and stats as well. */
3060 static int shadow_log_dirty_op(
3061 struct domain *d, struct xen_domctl_shadow_op *sc)
3063 int i, rv = 0, clean = 0;
3065 domain_pause(d);
3066 shadow_lock(d);
3068 clean = (sc->op == XEN_DOMCTL_SHADOW_OP_CLEAN);
3070 SHADOW_DEBUG(LOGDIRTY, "log-dirty %s: dom %u faults=%u dirty=%u\n",
3071 (clean) ? "clean" : "peek",
3072 d->domain_id,
3073 d->arch.shadow.fault_count,
3074 d->arch.shadow.dirty_count);
3076 sc->stats.fault_count = d->arch.shadow.fault_count;
3077 sc->stats.dirty_count = d->arch.shadow.dirty_count;
3079 if ( clean )
3081 /* Need to revoke write access to the domain's pages again.
3082 * In future, we'll have a less heavy-handed approach to this,
3083 * but for now, we just unshadow everything except Xen. */
3084 shadow_blow_tables(d);
3086 d->arch.shadow.fault_count = 0;
3087 d->arch.shadow.dirty_count = 0;
3090 if ( guest_handle_is_null(sc->dirty_bitmap) ||
3091 (d->arch.shadow.dirty_bitmap == NULL) )
3093 rv = -EINVAL;
3094 goto out;
3097 if ( sc->pages > d->arch.shadow.dirty_bitmap_size )
3098 sc->pages = d->arch.shadow.dirty_bitmap_size;
3100 #define CHUNK (8*1024) /* Transfer and clear in 1kB chunks for L1 cache. */
3101 for ( i = 0; i < sc->pages; i += CHUNK )
3103 int bytes = ((((sc->pages - i) > CHUNK)
3104 ? CHUNK
3105 : (sc->pages - i)) + 7) / 8;
3107 if ( copy_to_guest_offset(
3108 sc->dirty_bitmap,
3109 i/(8*sizeof(unsigned long)),
3110 d->arch.shadow.dirty_bitmap + (i/(8*sizeof(unsigned long))),
3111 (bytes + sizeof(unsigned long) - 1) / sizeof(unsigned long)) )
3113 rv = -EINVAL;
3114 goto out;
3117 if ( clean )
3118 memset(d->arch.shadow.dirty_bitmap + (i/(8*sizeof(unsigned long))),
3119 0, bytes);
3121 #undef CHUNK
3123 out:
3124 shadow_unlock(d);
3125 domain_unpause(d);
3126 return rv; /* propagate -EINVAL from the error paths above */
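/* The CHUNK arithmetic above, worked through: CHUNK is 8*1024 page-bits per
 * iteration, i.e. CHUNK/8 == 1024 bytes (1kB) of bitmap copied, and
 * optionally cleared, per pass, matching the "1kB chunks for L1 cache"
 * comment.  Both the guest-handle offset i/(8*sizeof(unsigned long)) and the
 * (bytes + sizeof(unsigned long) - 1) / sizeof(unsigned long) length are
 * expressed in unsigned longs, since dirty_bitmap is an array of longs. */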
3130 /* Mark a page as dirty */
3131 void sh_do_mark_dirty(struct domain *d, mfn_t gmfn)
3133 unsigned long pfn;
3135 ASSERT(shadow_lock_is_acquired(d));
3136 ASSERT(shadow_mode_log_dirty(d));
3138 if ( !mfn_valid(gmfn) )
3139 return;
3141 ASSERT(d->arch.shadow.dirty_bitmap != NULL);
3143 /* We /really/ mean PFN here, even for non-translated guests. */
3144 pfn = get_gpfn_from_mfn(mfn_x(gmfn));
3146 /*
3147 * Values with the MSB set denote MFNs that aren't really part of the
3148 * domain's pseudo-physical memory map (e.g., the shared info frame).
3149 * Nothing to do here...
3150 */
3151 if ( unlikely(!VALID_M2P(pfn)) )
3152 return;
3154 /* N.B. Can use non-atomic TAS because protected by shadow_lock. */
3155 if ( likely(pfn < d->arch.shadow.dirty_bitmap_size) )
3157 if ( !__test_and_set_bit(pfn, d->arch.shadow.dirty_bitmap) )
3159 SHADOW_DEBUG(LOGDIRTY,
3160 "marked mfn %" SH_PRI_mfn " (pfn=%lx), dom %d\n",
3161 mfn_x(gmfn), pfn, d->domain_id);
3162 d->arch.shadow.dirty_count++;
3165 else
3167 SHADOW_PRINTK("mark_dirty OOR! "
3168 "mfn=%" SH_PRI_mfn " pfn=%lx max=%x (dom %d)\n"
3169 "owner=%d c=%08x t=%" PRtype_info "\n",
3170 mfn_x(gmfn),
3171 pfn,
3172 d->arch.shadow.dirty_bitmap_size,
3173 d->domain_id,
3174 (page_get_owner(mfn_to_page(gmfn))
3175 ? page_get_owner(mfn_to_page(gmfn))->domain_id
3176 : -1),
3177 mfn_to_page(gmfn)->count_info,
3178 mfn_to_page(gmfn)->u.inuse.type_info);
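/* A small worked example of the marking path above, assuming a 64-bit build:
 * for pfn 0x12345, __test_and_set_bit() touches bit (0x12345 & 63) == 5 of
 * word 0x12345/64 == 0x48d in the dirty bitmap, and dirty_count is bumped
 * only if that bit was previously clear.  The non-atomic bit operation is
 * safe because, as noted above, every caller holds the shadow lock. */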
3183 /**************************************************************************/
3184 /* Shadow-control XEN_DOMCTL dispatcher */
3186 int shadow_domctl(struct domain *d,
3187 xen_domctl_shadow_op_t *sc,
3188 XEN_GUEST_HANDLE(xen_domctl_t) u_domctl)
3190 int rc, preempted = 0;
3192 if ( unlikely(d == current->domain) )
3194 gdprintk(XENLOG_INFO, "Don't try to do a shadow op on yourself!\n");
3195 return -EINVAL;
3198 switch ( sc->op )
3200 case XEN_DOMCTL_SHADOW_OP_OFF:
3201 if ( shadow_mode_log_dirty(d) )
3202 if ( (rc = shadow_log_dirty_disable(d)) != 0 )
3203 return rc;
3204 if ( is_hvm_domain(d) )
3205 return -EINVAL;
3206 if ( d->arch.shadow.mode & SHM2_enable )
3207 if ( (rc = shadow_test_disable(d)) != 0 )
3208 return rc;
3209 return 0;
3211 case XEN_DOMCTL_SHADOW_OP_ENABLE_TEST:
3212 return shadow_test_enable(d);
3214 case XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY:
3215 return shadow_log_dirty_enable(d);
3217 case XEN_DOMCTL_SHADOW_OP_ENABLE_TRANSLATE:
3218 return shadow_enable(d, SHM2_refcounts|SHM2_translate);
3220 case XEN_DOMCTL_SHADOW_OP_CLEAN:
3221 case XEN_DOMCTL_SHADOW_OP_PEEK:
3222 return shadow_log_dirty_op(d, sc);
3224 case XEN_DOMCTL_SHADOW_OP_ENABLE:
3225 if ( sc->mode & XEN_DOMCTL_SHADOW_ENABLE_LOG_DIRTY )
3226 return shadow_log_dirty_enable(d);
3227 return shadow_enable(d, sc->mode << SHM2_shift);
3229 case XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION:
3230 sc->mb = shadow_get_allocation(d);
3231 return 0;
3233 case XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION:
3234 rc = shadow_set_allocation(d, sc->mb, &preempted);
3235 if ( preempted )
3236 /* Not finished. Set up to re-run the call. */
3237 rc = hypercall_create_continuation(
3238 __HYPERVISOR_domctl, "h", u_domctl);
3239 else
3240 /* Finished. Return the new allocation */
3241 sc->mb = shadow_get_allocation(d);
3242 return rc;
3244 default:
3245 SHADOW_ERROR("Bad shadow op %u\n", sc->op);
3246 return -EINVAL;
3251 /**************************************************************************/
3252 /* Auditing shadow tables */
3254 #if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_FULL
3256 void shadow_audit_tables(struct vcpu *v)
3258 /* Dispatch table for getting per-type functions */
3259 static hash_callback_t callbacks[16] = {
3260 NULL, /* none */
3261 #if CONFIG_PAGING_LEVELS == 2
3262 SHADOW_INTERNAL_NAME(sh_audit_l1_table,2,2), /* l1_32 */
3263 SHADOW_INTERNAL_NAME(sh_audit_fl1_table,2,2), /* fl1_32 */
3264 SHADOW_INTERNAL_NAME(sh_audit_l2_table,2,2), /* l2_32 */
3265 #else
3266 SHADOW_INTERNAL_NAME(sh_audit_l1_table,3,2), /* l1_32 */
3267 SHADOW_INTERNAL_NAME(sh_audit_fl1_table,3,2), /* fl1_32 */
3268 SHADOW_INTERNAL_NAME(sh_audit_l2_table,3,2), /* l2_32 */
3269 SHADOW_INTERNAL_NAME(sh_audit_l1_table,3,3), /* l1_pae */
3270 SHADOW_INTERNAL_NAME(sh_audit_fl1_table,3,3), /* fl1_pae */
3271 SHADOW_INTERNAL_NAME(sh_audit_l2_table,3,3), /* l2_pae */
3272 SHADOW_INTERNAL_NAME(sh_audit_l2_table,3,3), /* l2h_pae */
3273 #if CONFIG_PAGING_LEVELS >= 4
3274 SHADOW_INTERNAL_NAME(sh_audit_l1_table,4,4), /* l1_64 */
3275 SHADOW_INTERNAL_NAME(sh_audit_fl1_table,4,4), /* fl1_64 */
3276 SHADOW_INTERNAL_NAME(sh_audit_l2_table,4,4), /* l2_64 */
3277 SHADOW_INTERNAL_NAME(sh_audit_l3_table,4,4), /* l3_64 */
3278 SHADOW_INTERNAL_NAME(sh_audit_l4_table,4,4), /* l4_64 */
3279 #endif /* CONFIG_PAGING_LEVELS >= 4 */
3280 #endif /* CONFIG_PAGING_LEVELS > 2 */
3281 NULL /* All the rest */
3282 };
3283 unsigned int mask;
3285 if ( !(SHADOW_AUDIT_ENABLE) )
3286 return;
3288 if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_FULL )
3289 mask = ~1; /* Audit every table in the system */
3290 else
3292 /* Audit only the current mode's tables */
3293 switch ( v->arch.shadow.mode->guest_levels )
3295 case 2: mask = (SHF_L1_32|SHF_FL1_32|SHF_L2_32); break;
3296 case 3: mask = (SHF_L1_PAE|SHF_FL1_PAE|SHF_L2_PAE
3297 |SHF_L2H_PAE); break;
3298 case 4: mask = (SHF_L1_64|SHF_FL1_64|SHF_L2_64
3299 |SHF_L3_64|SHF_L4_64); break;
3300 default: BUG();
3304 hash_foreach(v, ~1, callbacks, _mfn(INVALID_MFN));
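    /* Note that the call above passes ~1 rather than the mode-specific mask
     * computed just before it, so every shadow type in the system is audited
     * regardless of the current guest mode; mask is effectively unused. */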
3307 #endif /* Shadow audit */
3310 /**************************************************************************/
3311 /* Auditing p2m tables */
3313 #if SHADOW_AUDIT & SHADOW_AUDIT_P2M
3315 void shadow_audit_p2m(struct domain *d)
3317 struct list_head *entry;
3318 struct page_info *page;
3319 struct domain *od;
3320 unsigned long mfn, gfn, m2pfn, lp2mfn = 0;
3321 mfn_t p2mfn;
3322 unsigned long orphans_d = 0, orphans_i = 0, mpbad = 0, pmbad = 0;
3323 int test_linear;
3325 if ( !(SHADOW_AUDIT_ENABLE) || !shadow_mode_translate(d) )
3326 return;
3328 //SHADOW_PRINTK("p2m audit starts\n");
3330 test_linear = ( (d == current->domain)
3331 && !pagetable_is_null(current->arch.monitor_table) );
3332 if ( test_linear )
3333 local_flush_tlb();
3335 /* Audit part one: walk the domain's page allocation list, checking
3336 * the m2p entries. */
3337 for ( entry = d->page_list.next;
3338 entry != &d->page_list;
3339 entry = entry->next )
3341 page = list_entry(entry, struct page_info, list);
3342 mfn = mfn_x(page_to_mfn(page));
3344 // SHADOW_PRINTK("auditing guest page, mfn=%#lx\n", mfn);
3346 od = page_get_owner(page);
3348 if ( od != d )
3350 SHADOW_PRINTK("wrong owner %#lx -> %p(%u) != %p(%u)\n",
3351 mfn, od, (od?od->domain_id:-1), d, d->domain_id);
3352 continue;
3355 gfn = get_gpfn_from_mfn(mfn);
3356 if ( gfn == INVALID_M2P_ENTRY )
3358 orphans_i++;
3359 //SHADOW_PRINTK("orphaned guest page: mfn=%#lx has invalid gfn\n",
3360 // mfn);
3361 continue;
3364 if ( gfn == 0x55555555 )
3366 orphans_d++;
3367 //SHADOW_PRINTK("orphaned guest page: mfn=%#lx has debug gfn\n",
3368 // mfn);
3369 continue;
3372 p2mfn = sh_gfn_to_mfn_foreign(d, gfn);
3373 if ( mfn_x(p2mfn) != mfn )
3375 mpbad++;
3376 SHADOW_PRINTK("map mismatch mfn %#lx -> gfn %#lx -> mfn %#lx"
3377 " (-> gfn %#lx)\n",
3378 mfn, gfn, mfn_x(p2mfn),
3379 (mfn_valid(p2mfn)
3380 ? get_gpfn_from_mfn(mfn_x(p2mfn))
3381 : -1u));
3382 /* This m2p entry is stale: the domain has another frame in
3383 * this physical slot. No great disaster, but for neatness,
3384 * blow away the m2p entry. */
3385 set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY);
3388 if ( test_linear && (gfn <= d->arch.max_mapped_pfn) )
3390 lp2mfn = gfn_to_mfn_current(gfn);
3391 if ( mfn_x(lp2mfn) != mfn_x(p2mfn) )
3393 SHADOW_PRINTK("linear mismatch gfn %#lx -> mfn %#lx "
3394 "(!= mfn %#lx)\n", gfn,
3395 mfn_x(lp2mfn), mfn_x(p2mfn));
3399 // SHADOW_PRINTK("OK: mfn=%#lx, gfn=%#lx, p2mfn=%#lx, lp2mfn=%#lx\n",
3400 // mfn, gfn, p2mfn, lp2mfn);
3403 /* Audit part two: walk the domain's p2m table, checking the entries. */
3404 if ( pagetable_get_pfn(d->arch.phys_table) != 0 )
3406 l2_pgentry_t *l2e;
3407 l1_pgentry_t *l1e;
3408 int i1, i2;
3410 #if CONFIG_PAGING_LEVELS == 4
3411 l4_pgentry_t *l4e;
3412 l3_pgentry_t *l3e;
3413 int i3, i4;
3414 l4e = sh_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
3415 #elif CONFIG_PAGING_LEVELS == 3
3416 l3_pgentry_t *l3e;
3417 int i3;
3418 l3e = sh_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
3419 #else /* CONFIG_PAGING_LEVELS == 2 */
3420 l2e = sh_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
3421 #endif
3423 gfn = 0;
3424 #if CONFIG_PAGING_LEVELS >= 3
3425 #if CONFIG_PAGING_LEVELS >= 4
3426 for ( i4 = 0; i4 < L4_PAGETABLE_ENTRIES; i4++ )
3428 if ( !(l4e_get_flags(l4e[i4]) & _PAGE_PRESENT) )
3430 gfn += 1 << (L4_PAGETABLE_SHIFT - PAGE_SHIFT);
3431 continue;
3433 l3e = sh_map_domain_page(_mfn(l4e_get_pfn(l4e[i4])));
3434 #endif /* now at levels 3 or 4... */
3435 for ( i3 = 0;
3436 i3 < ((CONFIG_PAGING_LEVELS==4) ? L3_PAGETABLE_ENTRIES : 8);
3437 i3++ )
3439 if ( !(l3e_get_flags(l3e[i3]) & _PAGE_PRESENT) )
3441 gfn += 1 << (L3_PAGETABLE_SHIFT - PAGE_SHIFT);
3442 continue;
3444 l2e = sh_map_domain_page(_mfn(l3e_get_pfn(l3e[i3])));
3445 #endif /* all levels... */
3446 for ( i2 = 0; i2 < L2_PAGETABLE_ENTRIES; i2++ )
3448 if ( !(l2e_get_flags(l2e[i2]) & _PAGE_PRESENT) )
3450 gfn += 1 << (L2_PAGETABLE_SHIFT - PAGE_SHIFT);
3451 continue;
3453 l1e = sh_map_domain_page(_mfn(l2e_get_pfn(l2e[i2])));
3455 for ( i1 = 0; i1 < L1_PAGETABLE_ENTRIES; i1++, gfn++ )
3457 if ( !(l1e_get_flags(l1e[i1]) & _PAGE_PRESENT) )
3458 continue;
3459 mfn = l1e_get_pfn(l1e[i1]);
3460 ASSERT(mfn_valid(_mfn(mfn)));
3461 m2pfn = get_gpfn_from_mfn(mfn);
3462 if ( m2pfn != gfn )
3464 pmbad++;
3465 SHADOW_PRINTK("mismatch: gfn %#lx -> mfn %#lx"
3466 " -> gfn %#lx\n", gfn, mfn, m2pfn);
3467 BUG();
3470 sh_unmap_domain_page(l1e);
3472 #if CONFIG_PAGING_LEVELS >= 3
3473 sh_unmap_domain_page(l2e);
3475 #if CONFIG_PAGING_LEVELS >= 4
3476 sh_unmap_domain_page(l3e);
3478 #endif
3479 #endif
3481 #if CONFIG_PAGING_LEVELS == 4
3482 sh_unmap_domain_page(l4e);
3483 #elif CONFIG_PAGING_LEVELS == 3
3484 sh_unmap_domain_page(l3e);
3485 #else /* CONFIG_PAGING_LEVELS == 2 */
3486 sh_unmap_domain_page(l2e);
3487 #endif
3491 //SHADOW_PRINTK("p2m audit complete\n");
3492 //if ( orphans_i | orphans_d | mpbad | pmbad )
3493 // SHADOW_PRINTK("p2m audit found %lu orphans (%lu inval %lu debug)\n",
3494 // orphans_i + orphans_d, orphans_i, orphans_d,
3495 if ( mpbad | pmbad )
3496 SHADOW_PRINTK("p2m audit found %lu odd p2m, %lu bad m2p entries\n",
3497 pmbad, mpbad);
3500 #endif /* p2m audit */
3502 /*
3503 * Local variables:
3504 * mode: C
3505 * c-set-style: "BSD"
3506 * c-basic-offset: 4
3507 * indent-tabs-mode: nil
3508 * End:
3509 */