ia64/xen-unstable

view xen/arch/x86/mm/shadow/common.c @ 12813:963a02c040f6

[XEN] remove unused monitor_vtable mapping.
Signed-off-by: Tim Deegan <Tim.Deegan@xensource.com>
author Tim Deegan <Tim.Deegan@xensource.com>
date Wed Dec 06 12:13:22 2006 +0000 (2006-12-06)
parents a467eb0c5596
children f5121d001d1a
1 /******************************************************************************
2 * arch/x86/mm/shadow/common.c
3 *
4 * Shadow code that does not need to be multiply compiled.
5 * Parts of this code are Copyright (c) 2006 by XenSource Inc.
6 * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
7 * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 */
24 #include <xen/config.h>
25 #include <xen/types.h>
26 #include <xen/mm.h>
27 #include <xen/trace.h>
28 #include <xen/sched.h>
29 #include <xen/perfc.h>
30 #include <xen/irq.h>
31 #include <xen/domain_page.h>
32 #include <xen/guest_access.h>
33 #include <xen/keyhandler.h>
34 #include <asm/event.h>
35 #include <asm/page.h>
36 #include <asm/current.h>
37 #include <asm/flushtlb.h>
38 #include <asm/shadow.h>
39 #include "private.h"
41 #if SHADOW_AUDIT
42 int shadow_audit_enable = 0;
44 static void shadow_audit_key(unsigned char key)
45 {
46 shadow_audit_enable = !shadow_audit_enable;
47 printk("%s shadow_audit_enable=%d\n",
48 __func__, shadow_audit_enable);
49 }
51 static int __init shadow_audit_key_init(void)
52 {
53 register_keyhandler(
54 'O', shadow_audit_key, "toggle shadow audits");
55 return 0;
56 }
57 __initcall(shadow_audit_key_init);
58 #endif /* SHADOW_AUDIT */
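/* Note: when SHADOW_AUDIT is compiled in, pressing 'O' on the Xen console
 * (the keyhandler registered above) toggles shadow_audit_enable at
 * run time. */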
60 static void sh_free_log_dirty_bitmap(struct domain *d);
62 int _shadow_mode_refcounts(struct domain *d)
63 {
64 return shadow_mode_refcounts(d);
65 }
68 /**************************************************************************/
69 /* x86 emulator support for the shadow code
70 */
72 struct segment_register *hvm_get_seg_reg(
73 enum x86_segment seg, struct sh_emulate_ctxt *sh_ctxt)
74 {
75 struct segment_register *seg_reg = &sh_ctxt->seg_reg[seg];
76 if ( !__test_and_set_bit(seg, &sh_ctxt->valid_seg_regs) )
77 hvm_get_segment_register(current, seg, seg_reg);
78 return seg_reg;
79 }
81 enum hvm_access_type {
82 hvm_access_insn_fetch, hvm_access_read, hvm_access_write
83 };
85 static int hvm_translate_linear_addr(
86 enum x86_segment seg,
87 unsigned long offset,
88 unsigned int bytes,
89 enum hvm_access_type access_type,
90 struct sh_emulate_ctxt *sh_ctxt,
91 unsigned long *paddr)
92 {
93 struct segment_register *reg = hvm_get_seg_reg(seg, sh_ctxt);
94 unsigned long limit, addr = offset;
95 uint32_t last_byte;
97 if ( sh_ctxt->ctxt.mode != X86EMUL_MODE_PROT64 )
98 {
99 /*
100 * COMPATIBILITY MODE: Apply segment checks and add base.
101 */
103 switch ( access_type )
104 {
105 case hvm_access_read:
106 if ( (reg->attr.fields.type & 0xa) == 0x8 )
107 goto gpf; /* execute-only code segment */
108 break;
109 case hvm_access_write:
110 if ( (reg->attr.fields.type & 0xa) != 0x2 )
111 goto gpf; /* not a writable data segment */
112 break;
113 default:
114 break;
115 }
117 /* Calculate the segment limit, including granularity flag. */
118 limit = reg->limit;
119 if ( reg->attr.fields.g )
120 limit = (limit << 12) | 0xfff;
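/* e.g. a limit field of 0x000ff with the granularity bit set expands to an
 * effective limit of 0xfffff (1MB); with G clear it stays at 0xff. */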
122 last_byte = offset + bytes - 1;
124 /* Is this a grows-down data segment? Special limit check if so. */
125 if ( (reg->attr.fields.type & 0xc) == 0x4 )
126 {
127 /* Is upper limit 0xFFFF or 0xFFFFFFFF? */
128 if ( !reg->attr.fields.db )
129 last_byte = (uint16_t)last_byte;
131 /* Check first byte and last byte against respective bounds. */
132 if ( (offset <= limit) || (last_byte < offset) )
133 goto gpf;
134 }
135 else if ( (last_byte > limit) || (last_byte < offset) )
136 goto gpf; /* last byte is beyond limit or wraps 0xFFFFFFFF */
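/* For the expand-down case above, valid offsets are those strictly greater
 * than `limit' (up to 0xffff or 0xffffffff depending on the D/B bit), which
 * is why the fault condition there is offset <= limit rather than
 * offset > limit. */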
138 /*
139 * Hardware truncates to 32 bits in compatibility mode.
140 * It does not truncate to 16 bits in 16-bit address-size mode.
141 */
142 addr = (uint32_t)(addr + reg->base);
143 }
144 else
145 {
146 /*
147 * LONG MODE: FS and GS add segment base. Addresses must be canonical.
148 */
150 if ( (seg == x86_seg_fs) || (seg == x86_seg_gs) )
151 addr += reg->base;
153 if ( !is_canonical_address(addr) )
154 goto gpf;
155 }
157 *paddr = addr;
158 return 0;
160 gpf:
161 /* Inject #GP(0). */
162 hvm_inject_exception(TRAP_gp_fault, 0, 0);
163 return X86EMUL_PROPAGATE_FAULT;
164 }
166 static int
167 hvm_read(enum x86_segment seg,
168 unsigned long offset,
169 unsigned long *val,
170 unsigned int bytes,
171 enum hvm_access_type access_type,
172 struct sh_emulate_ctxt *sh_ctxt)
173 {
174 unsigned long addr;
175 int rc, errcode;
177 rc = hvm_translate_linear_addr(
178 seg, offset, bytes, access_type, sh_ctxt, &addr);
179 if ( rc )
180 return rc;
182 *val = 0;
183 // XXX -- this is WRONG.
184 // It entirely ignores the permissions in the page tables.
185 // In this case, that is only a user vs supervisor access check.
186 //
187 if ( (rc = hvm_copy_from_guest_virt(val, addr, bytes)) == 0 )
188 {
189 #if 0
190 struct vcpu *v = current;
191 SHADOW_PRINTK("d=%u v=%u a=%#lx v=%#lx bytes=%u\n",
192 v->domain->domain_id, v->vcpu_id,
193 addr, *val, bytes);
194 #endif
195 return X86EMUL_CONTINUE;
196 }
198 /* If we got here, there was nothing mapped here, or a bad GFN
199 * was mapped here. This should never happen: we're here because
200 * of a write fault at the end of the instruction we're emulating. */
201 SHADOW_PRINTK("read failed to va %#lx\n", addr);
202 errcode = ring_3(sh_ctxt->ctxt.regs) ? PFEC_user_mode : 0;
203 if ( access_type == hvm_access_insn_fetch )
204 errcode |= PFEC_insn_fetch;
205 hvm_inject_exception(TRAP_page_fault, errcode, addr + bytes - rc);
206 return X86EMUL_PROPAGATE_FAULT;
207 }
209 void shadow_init_emulation(struct sh_emulate_ctxt *sh_ctxt,
210 struct cpu_user_regs *regs)
211 {
212 struct segment_register *creg;
213 struct vcpu *v = current;
214 unsigned long addr;
216 sh_ctxt->ctxt.regs = regs;
218 /* Segment cache initialisation. Primed with CS. */
219 sh_ctxt->valid_seg_regs = 0;
220 creg = hvm_get_seg_reg(x86_seg_cs, sh_ctxt);
222 /* Work out the emulation mode. */
223 if ( hvm_long_mode_enabled(v) )
224 sh_ctxt->ctxt.mode = creg->attr.fields.l ?
225 X86EMUL_MODE_PROT64 : X86EMUL_MODE_PROT32;
226 else if ( regs->eflags & X86_EFLAGS_VM )
227 sh_ctxt->ctxt.mode = X86EMUL_MODE_REAL;
228 else
229 sh_ctxt->ctxt.mode = creg->attr.fields.db ?
230 X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
232 /* Attempt to prefetch whole instruction. */
233 sh_ctxt->insn_buf_bytes =
234 (!hvm_translate_linear_addr(
235 x86_seg_cs, regs->eip, sizeof(sh_ctxt->insn_buf),
236 hvm_access_insn_fetch, sh_ctxt, &addr) &&
237 !hvm_copy_from_guest_virt(
238 sh_ctxt->insn_buf, addr, sizeof(sh_ctxt->insn_buf)))
239 ? sizeof(sh_ctxt->insn_buf) : 0;
240 }
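/* If the prefetch above fails (the translation faults or the code page is
 * not mapped), insn_buf_bytes is left at zero and the insn_fetch callback
 * below simply falls back to hvm_read() for the bytes it needs. */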
242 static int
243 sh_x86_emulate_read(enum x86_segment seg,
244 unsigned long offset,
245 unsigned long *val,
246 unsigned int bytes,
247 struct x86_emulate_ctxt *ctxt)
248 {
249 return hvm_read(seg, offset, val, bytes, hvm_access_read,
250 container_of(ctxt, struct sh_emulate_ctxt, ctxt));
251 }
253 static int
254 sh_x86_emulate_insn_fetch(enum x86_segment seg,
255 unsigned long offset,
256 unsigned long *val,
257 unsigned int bytes,
258 struct x86_emulate_ctxt *ctxt)
259 {
260 struct sh_emulate_ctxt *sh_ctxt =
261 container_of(ctxt, struct sh_emulate_ctxt, ctxt);
262 unsigned int insn_off = offset - ctxt->regs->eip;
264 /* Fall back if requested bytes are not in the prefetch cache. */
265 if ( unlikely((insn_off + bytes) > sh_ctxt->insn_buf_bytes) )
266 return hvm_read(seg, offset, val, bytes,
267 hvm_access_insn_fetch, sh_ctxt);
269 /* Hit the cache. Simple memcpy. */
270 *val = 0;
271 memcpy(val, &sh_ctxt->insn_buf[insn_off], bytes);
272 return X86EMUL_CONTINUE;
273 }
275 static int
276 sh_x86_emulate_write(enum x86_segment seg,
277 unsigned long offset,
278 unsigned long val,
279 unsigned int bytes,
280 struct x86_emulate_ctxt *ctxt)
281 {
282 struct sh_emulate_ctxt *sh_ctxt =
283 container_of(ctxt, struct sh_emulate_ctxt, ctxt);
284 struct vcpu *v = current;
285 unsigned long addr;
286 int rc;
288 rc = hvm_translate_linear_addr(
289 seg, offset, bytes, hvm_access_write, sh_ctxt, &addr);
290 if ( rc )
291 return rc;
293 #if 0
294 SHADOW_PRINTK("d=%u v=%u a=%#lx v=%#lx bytes=%u\n",
295 v->domain->domain_id, v->vcpu_id, addr, val, bytes);
296 #endif
297 return v->arch.shadow.mode->x86_emulate_write(
298 v, addr, &val, bytes, sh_ctxt);
299 }
301 static int
302 sh_x86_emulate_cmpxchg(enum x86_segment seg,
303 unsigned long offset,
304 unsigned long old,
305 unsigned long new,
306 unsigned int bytes,
307 struct x86_emulate_ctxt *ctxt)
308 {
309 struct sh_emulate_ctxt *sh_ctxt =
310 container_of(ctxt, struct sh_emulate_ctxt, ctxt);
311 struct vcpu *v = current;
312 unsigned long addr;
313 int rc;
315 rc = hvm_translate_linear_addr(
316 seg, offset, bytes, hvm_access_write, sh_ctxt, &addr);
317 if ( rc )
318 return rc;
320 #if 0
321 SHADOW_PRINTK("d=%u v=%u a=%#lx o?=%#lx n:=%#lx bytes=%u\n",
322 v->domain->domain_id, v->vcpu_id, addr, old, new, bytes);
323 #endif
324 return v->arch.shadow.mode->x86_emulate_cmpxchg(
325 v, addr, old, new, bytes, sh_ctxt);
326 }
328 static int
329 sh_x86_emulate_cmpxchg8b(enum x86_segment seg,
330 unsigned long offset,
331 unsigned long old_lo,
332 unsigned long old_hi,
333 unsigned long new_lo,
334 unsigned long new_hi,
335 struct x86_emulate_ctxt *ctxt)
336 {
337 struct sh_emulate_ctxt *sh_ctxt =
338 container_of(ctxt, struct sh_emulate_ctxt, ctxt);
339 struct vcpu *v = current;
340 unsigned long addr;
341 int rc;
343 rc = hvm_translate_linear_addr(
344 seg, offset, 8, hvm_access_write, sh_ctxt, &addr);
345 if ( rc )
346 return rc;
348 #if 0
349 SHADOW_PRINTK("d=%u v=%u a=%#lx o?=%#lx:%lx n:=%#lx:%lx\n",
350 v->domain->domain_id, v->vcpu_id, addr, old_hi, old_lo,
351 new_hi, new_lo, ctxt);
352 #endif
353 return v->arch.shadow.mode->x86_emulate_cmpxchg8b(
354 v, addr, old_lo, old_hi, new_lo, new_hi, sh_ctxt);
355 }
358 struct x86_emulate_ops shadow_emulator_ops = {
359 .read = sh_x86_emulate_read,
360 .insn_fetch = sh_x86_emulate_insn_fetch,
361 .write = sh_x86_emulate_write,
362 .cmpxchg = sh_x86_emulate_cmpxchg,
363 .cmpxchg8b = sh_x86_emulate_cmpxchg8b,
364 };
366 /**************************************************************************/
367 /* Code for "promoting" a guest page to the point where the shadow code is
368 * willing to let it be treated as a guest page table. This generally
369 * involves making sure there are no writable mappings available to the guest
370 * for this page.
371 */
372 void shadow_promote(struct vcpu *v, mfn_t gmfn, unsigned int type)
373 {
374 struct page_info *page = mfn_to_page(gmfn);
376 ASSERT(mfn_valid(gmfn));
378 /* We should never try to promote a gmfn that has writeable mappings */
379 ASSERT(shadow_remove_write_access(v, gmfn, 0, 0) == 0);
381 /* Is the page already shadowed? */
382 if ( !test_and_set_bit(_PGC_page_table, &page->count_info) )
383 page->shadow_flags = 0;
385 ASSERT(!test_bit(type, &page->shadow_flags));
386 set_bit(type, &page->shadow_flags);
387 }
389 void shadow_demote(struct vcpu *v, mfn_t gmfn, u32 type)
390 {
391 struct page_info *page = mfn_to_page(gmfn);
393 ASSERT(test_bit(_PGC_page_table, &page->count_info));
394 ASSERT(test_bit(type, &page->shadow_flags));
396 clear_bit(type, &page->shadow_flags);
398 if ( (page->shadow_flags & SHF_page_type_mask) == 0 )
399 {
400 /* tlbflush timestamp field is valid again */
401 page->tlbflush_timestamp = tlbflush_current_time();
402 clear_bit(_PGC_page_table, &page->count_info);
403 }
404 }
406 /**************************************************************************/
407 /* Validate a pagetable change from the guest and update the shadows.
408 * Returns a bitmask of SHADOW_SET_* flags. */
410 int
411 __shadow_validate_guest_entry(struct vcpu *v, mfn_t gmfn,
412 void *entry, u32 size)
413 {
414 int result = 0;
415 struct page_info *page = mfn_to_page(gmfn);
417 sh_mark_dirty(v->domain, gmfn);
419 // Determine which types of shadows are affected, and update each.
420 //
421 // Always validate L1s before L2s to prevent another cpu with a linear
422 // mapping of this gmfn from seeing a walk that results from
423 // using the new L2 value and the old L1 value. (It is OK for such a
424 // guest to see a walk that uses the old L2 value with the new L1 value,
425 // as hardware could behave this way if one level of the pagewalk occurs
426 // before the store, and the next level of the pagewalk occurs after the
427 // store.)
428 //
429 // Ditto for L2s before L3s, etc.
430 //
432 if ( !(page->count_info & PGC_page_table) )
433 return 0; /* Not shadowed at all */
435 #if CONFIG_PAGING_LEVELS == 2
436 if ( page->shadow_flags & SHF_L1_32 )
437 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 2, 2)
438 (v, gmfn, entry, size);
439 #else
440 if ( page->shadow_flags & SHF_L1_32 )
441 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 3, 2)
442 (v, gmfn, entry, size);
443 #endif
445 #if CONFIG_PAGING_LEVELS == 2
446 if ( page->shadow_flags & SHF_L2_32 )
447 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 2, 2)
448 (v, gmfn, entry, size);
449 #else
450 if ( page->shadow_flags & SHF_L2_32 )
451 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 3, 2)
452 (v, gmfn, entry, size);
453 #endif
455 #if CONFIG_PAGING_LEVELS >= 3
456 if ( page->shadow_flags & SHF_L1_PAE )
457 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 3, 3)
458 (v, gmfn, entry, size);
459 if ( page->shadow_flags & SHF_L2_PAE )
460 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 3, 3)
461 (v, gmfn, entry, size);
462 if ( page->shadow_flags & SHF_L2H_PAE )
463 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2he, 3, 3)
464 (v, gmfn, entry, size);
465 #else /* 32-bit non-PAE hypervisor does not support PAE guests */
466 ASSERT((page->shadow_flags & (SHF_L2H_PAE|SHF_L2_PAE|SHF_L1_PAE)) == 0);
467 #endif
469 #if CONFIG_PAGING_LEVELS >= 4
470 if ( page->shadow_flags & SHF_L1_64 )
471 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 4, 4)
472 (v, gmfn, entry, size);
473 if ( page->shadow_flags & SHF_L2_64 )
474 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 4, 4)
475 (v, gmfn, entry, size);
476 if ( page->shadow_flags & SHF_L3_64 )
477 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl3e, 4, 4)
478 (v, gmfn, entry, size);
479 if ( page->shadow_flags & SHF_L4_64 )
480 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl4e, 4, 4)
481 (v, gmfn, entry, size);
482 #else /* 32-bit/PAE hypervisor does not support 64-bit guests */
483 ASSERT((page->shadow_flags
484 & (SHF_L4_64|SHF_L3_64|SHF_L2_64|SHF_L1_64)) == 0);
485 #endif
487 return result;
488 }
491 int
492 shadow_validate_guest_entry(struct vcpu *v, mfn_t gmfn, void *entry)
493 /* This is the entry point from hypercalls. It returns a bitmask of all the
494 * results of shadow_set_l*e() calls, so the caller knows to do TLB flushes. */
495 {
496 int rc;
498 ASSERT(shadow_lock_is_acquired(v->domain));
499 rc = __shadow_validate_guest_entry(v, gmfn, entry, sizeof(l1_pgentry_t));
500 shadow_audit_tables(v);
501 return rc;
502 }
504 void
505 shadow_validate_guest_pt_write(struct vcpu *v, mfn_t gmfn,
506 void *entry, u32 size)
507 /* This is the entry point for emulated writes to pagetables in HVM guests and
508 * PV translated guests.
509 */
510 {
511 struct domain *d = v->domain;
512 int rc;
514 ASSERT(shadow_lock_is_acquired(v->domain));
515 rc = __shadow_validate_guest_entry(v, gmfn, entry, size);
516 if ( rc & SHADOW_SET_FLUSH )
517 /* Need to flush TLBs to pick up shadow PT changes */
518 flush_tlb_mask(d->domain_dirty_cpumask);
519 if ( rc & SHADOW_SET_ERROR )
520 {
521 /* This page is probably not a pagetable any more: tear it out of the
522 * shadows, along with any tables that reference it.
523 * Since the validate call above will have made a "safe" (i.e. zero)
524 * shadow entry, we can let the domain live even if we can't fully
525 * unshadow the page. */
526 sh_remove_shadows(v, gmfn, 0, 0);
527 }
528 }
531 /**************************************************************************/
532 /* Memory management for shadow pages. */
534 /* Allocating shadow pages
535 * -----------------------
536 *
537 * Most shadow pages are allocated singly, but there is one case where
538 * we need to allocate multiple pages together: shadowing 32-bit guest
539 * tables on PAE or 64-bit shadows. A 32-bit guest l1 table covers 4MB
540 * of virtual address space, and needs to be shadowed by two PAE/64-bit
541 * l1 tables (covering 2MB of virtual address space each). Similarly, a
542 * 32-bit guest l2 table (4GB va) needs to be shadowed by four
543 * PAE/64-bit l2 tables (1GB va each). These multi-page shadows are
544 * contiguous and aligned; functions for handling offsets into them are
545 * defined in shadow.c (shadow_l1_index() etc.)
546 *
547 * This table shows the allocation behaviour of the different modes:
548 *
549 * Xen paging 32b pae pae 64b 64b 64b
550 * Guest paging 32b 32b pae 32b pae 64b
551 * PV or HVM * HVM * HVM HVM *
552 * Shadow paging 32b pae pae pae pae 64b
553 *
554 * sl1 size 4k 8k 4k 8k 4k 4k
555 * sl2 size 4k 16k 4k 16k 4k 4k
556 * sl3 size - - - - - 4k
557 * sl4 size - - - - - 4k
558 *
559 * We allocate memory from xen in four-page units and break them down
560 * with a simple buddy allocator. Can't use the xen allocator to handle
561 * this as it only works for contiguous zones, and a domain's shadow
562 * pool is made of fragments.
563 *
564 * In HVM guests, the p2m table is built out of shadow pages, and we provide
565 * a function for the p2m management to steal pages, in max-order chunks, from
566 * the free pool. We don't provide for giving them back, yet.
567 */
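/* Worked example of the table above: a 32-bit guest has 4-byte PTEs, while
 * PAE/64-bit shadows use 8-byte PTEs. A 4k guest l1 (1024 entries, 4MB of
 * VA) therefore needs two 4k shadow pages (8k), and a guest l2 (4GB of VA)
 * needs four contiguous shadow pages (16k), matching the sl1/sl2 sizes in
 * the "32b guest on pae/64b Xen" columns. */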
569 /* Figure out the least acceptable quantity of shadow memory.
570 * The minimum memory requirement for always being able to free up a
571 * chunk of memory is very small -- only three max-order chunks per
572 * vcpu to hold the top level shadows and pages with Xen mappings in them.
573 *
574 * But for a guest to be guaranteed to successfully execute a single
575 * instruction, we must be able to map a large number (about thirty) VAs
576 * at the same time, which means that to guarantee progress, we must
577 * allow for more than ninety allocated pages per vcpu. We round that
578 * up to 128 pages, or half a megabyte per vcpu. */
579 unsigned int shadow_min_acceptable_pages(struct domain *d)
580 {
581 u32 vcpu_count = 0;
582 struct vcpu *v;
584 for_each_vcpu(d, v)
585 vcpu_count++;
587 return (vcpu_count * 128);
588 }
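/* e.g. a 4-vcpu domain has a floor of 4 * 128 = 512 pages (2MB);
 * set_sh_allocation() below rounds any smaller non-zero request up to at
 * least this figure, plus space for the p2m table. */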
590 /* Figure out the order of allocation needed for a given shadow type */
591 static inline u32
592 shadow_order(unsigned int shadow_type)
593 {
594 #if CONFIG_PAGING_LEVELS > 2
595 static const u32 type_to_order[16] = {
596 0, /* SH_type_none */
597 1, /* SH_type_l1_32_shadow */
598 1, /* SH_type_fl1_32_shadow */
599 2, /* SH_type_l2_32_shadow */
600 0, /* SH_type_l1_pae_shadow */
601 0, /* SH_type_fl1_pae_shadow */
602 0, /* SH_type_l2_pae_shadow */
603 0, /* SH_type_l2h_pae_shadow */
604 0, /* SH_type_l1_64_shadow */
605 0, /* SH_type_fl1_64_shadow */
606 0, /* SH_type_l2_64_shadow */
607 0, /* SH_type_l3_64_shadow */
608 0, /* SH_type_l4_64_shadow */
609 2, /* SH_type_p2m_table */
610 0 /* SH_type_monitor_table */
611 };
612 ASSERT(shadow_type < 16);
613 return type_to_order[shadow_type];
614 #else /* 32-bit Xen only ever shadows 32-bit guests on 32-bit shadows. */
615 return 0;
616 #endif
617 }
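/* e.g. SH_type_l2_32_shadow maps to order 2, i.e. a contiguous, aligned
 * four-page (16k) allocation -- matching the 16k sl2 size quoted in the
 * table above for 32-bit guests on PAE/64-bit shadows. */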
620 /* Do we have a free chunk of at least this order? */
621 static inline int chunk_is_available(struct domain *d, int order)
622 {
623 int i;
625 for ( i = order; i <= SHADOW_MAX_ORDER; i++ )
626 if ( !list_empty(&d->arch.shadow.freelists[i]) )
627 return 1;
628 return 0;
629 }
631 /* Dispatcher function: call the per-mode function that will unhook the
632 * non-Xen mappings in this top-level shadow mfn */
633 void shadow_unhook_mappings(struct vcpu *v, mfn_t smfn)
634 {
635 struct shadow_page_info *sp = mfn_to_shadow_page(smfn);
636 switch ( sp->type )
637 {
638 case SH_type_l2_32_shadow:
639 #if CONFIG_PAGING_LEVELS == 2
640 SHADOW_INTERNAL_NAME(sh_unhook_32b_mappings,2,2)(v,smfn);
641 #else
642 SHADOW_INTERNAL_NAME(sh_unhook_32b_mappings,3,2)(v,smfn);
643 #endif
644 break;
645 #if CONFIG_PAGING_LEVELS >= 3
646 case SH_type_l2_pae_shadow:
647 case SH_type_l2h_pae_shadow:
648 SHADOW_INTERNAL_NAME(sh_unhook_pae_mappings,3,3)(v,smfn);
649 break;
650 #endif
651 #if CONFIG_PAGING_LEVELS >= 4
652 case SH_type_l4_64_shadow:
653 SHADOW_INTERNAL_NAME(sh_unhook_64b_mappings,4,4)(v,smfn);
654 break;
655 #endif
656 default:
657 SHADOW_PRINTK("top-level shadow has bad type %08x\n", sp->type);
658 BUG();
659 }
660 }
663 /* Make sure there is at least one chunk of the required order available
664 * in the shadow page pool. This must be called before any calls to
665 * shadow_alloc(). Since this will free existing shadows to make room,
666 * it must be called early enough to avoid freeing shadows that the
667 * caller is currently working on. */
668 void shadow_prealloc(struct domain *d, unsigned int order)
669 {
670 /* Need a vcpu for calling unpins; for now, since we don't have
671 * per-vcpu shadows, any will do */
672 struct vcpu *v, *v2;
673 struct list_head *l, *t;
674 struct shadow_page_info *sp;
675 cpumask_t flushmask = CPU_MASK_NONE;
676 mfn_t smfn;
677 int i;
679 if ( chunk_is_available(d, order) ) return;
681 v = current;
682 if ( v->domain != d )
683 v = d->vcpu[0];
684 ASSERT(v != NULL);
686 /* Stage one: walk the list of pinned pages, unpinning them */
687 perfc_incrc(shadow_prealloc_1);
688 list_for_each_backwards_safe(l, t, &d->arch.shadow.pinned_shadows)
689 {
690 sp = list_entry(l, struct shadow_page_info, list);
691 smfn = shadow_page_to_mfn(sp);
693 /* Unpin this top-level shadow */
694 sh_unpin(v, smfn);
696 /* See if that freed up a chunk of appropriate size */
697 if ( chunk_is_available(d, order) ) return;
698 }
700 /* Stage two: all shadow pages are in use in hierarchies that are
701 * loaded in cr3 on some vcpu. Walk them, unhooking the non-Xen
702 * mappings. */
703 perfc_incrc(shadow_prealloc_2);
705 for_each_vcpu(d, v2)
706 for ( i = 0 ; i < 4 ; i++ )
707 {
708 if ( !pagetable_is_null(v2->arch.shadow_table[i]) )
709 {
710 shadow_unhook_mappings(v,
711 pagetable_get_mfn(v2->arch.shadow_table[i]));
712 cpus_or(flushmask, v2->vcpu_dirty_cpumask, flushmask);
714 /* See if that freed up a chunk of appropriate size */
715 if ( chunk_is_available(d, order) )
716 {
717 flush_tlb_mask(flushmask);
718 return;
719 }
720 }
721 }
723 /* Nothing more we can do: all remaining shadows are of pages that
724 * hold Xen mappings for some vcpu. This can never happen. */
725 SHADOW_PRINTK("Can't pre-allocate %i shadow pages!\n"
726 " shadow pages total = %u, free = %u, p2m=%u\n",
727 1 << order,
728 d->arch.shadow.total_pages,
729 d->arch.shadow.free_pages,
730 d->arch.shadow.p2m_pages);
731 BUG();
732 }
734 /* Deliberately free all the memory we can: this will tear down all of
735 * this domain's shadows */
736 static void shadow_blow_tables(struct domain *d)
737 {
738 struct list_head *l, *t;
739 struct shadow_page_info *sp;
740 struct vcpu *v = d->vcpu[0];
741 mfn_t smfn;
742 int i;
744 /* Pass one: unpin all pinned pages */
745 list_for_each_backwards_safe(l,t, &d->arch.shadow.pinned_shadows)
746 {
747 sp = list_entry(l, struct shadow_page_info, list);
748 smfn = shadow_page_to_mfn(sp);
749 sh_unpin(v, smfn);
750 }
752 /* Second pass: unhook entries of in-use shadows */
753 for_each_vcpu(d, v)
754 for ( i = 0 ; i < 4 ; i++ )
755 if ( !pagetable_is_null(v->arch.shadow_table[i]) )
756 shadow_unhook_mappings(v,
757 pagetable_get_mfn(v->arch.shadow_table[i]));
759 /* Make sure everyone sees the unshadowings */
760 flush_tlb_mask(d->domain_dirty_cpumask);
761 }
764 #ifndef NDEBUG
765 /* Blow all shadows of all shadowed domains: this can be used to cause the
766 * guest's pagetables to be re-shadowed if we suspect that the shadows
767 * have somehow got out of sync */
768 static void shadow_blow_all_tables(unsigned char c)
769 {
770 struct domain *d;
771 printk("'%c' pressed -> blowing all shadow tables\n", c);
772 for_each_domain(d)
773 if ( shadow_mode_enabled(d) && d->vcpu[0] != NULL )
774 {
775 shadow_lock(d);
776 shadow_blow_tables(d);
777 shadow_unlock(d);
778 }
779 }
781 /* Register this function in the Xen console keypress table */
782 static __init int shadow_blow_tables_keyhandler_init(void)
783 {
784 register_keyhandler('S', shadow_blow_all_tables,"reset shadow pagetables");
785 return 0;
786 }
787 __initcall(shadow_blow_tables_keyhandler_init);
788 #endif /* !NDEBUG */
790 /* Allocate another shadow's worth of (contiguous, aligned) pages,
791 * and fill in the type and backpointer fields of their page_infos.
792 * Never fails to allocate. */
793 mfn_t shadow_alloc(struct domain *d,
794 u32 shadow_type,
795 unsigned long backpointer)
796 {
797 struct shadow_page_info *sp = NULL;
798 unsigned int order = shadow_order(shadow_type);
799 cpumask_t mask;
800 void *p;
801 int i;
803 ASSERT(shadow_lock_is_acquired(d));
804 ASSERT(order <= SHADOW_MAX_ORDER);
805 ASSERT(shadow_type != SH_type_none);
806 perfc_incrc(shadow_alloc);
808 /* Find smallest order which can satisfy the request. */
809 for ( i = order; i <= SHADOW_MAX_ORDER; i++ )
810 if ( !list_empty(&d->arch.shadow.freelists[i]) )
811 {
812 sp = list_entry(d->arch.shadow.freelists[i].next,
813 struct shadow_page_info, list);
814 list_del(&sp->list);
816 /* We may have to halve the chunk a number of times. */
817 while ( i != order )
818 {
819 i--;
820 sp->order = i;
821 list_add_tail(&sp->list, &d->arch.shadow.freelists[i]);
822 sp += 1 << i;
823 }
824 d->arch.shadow.free_pages -= 1 << order;
826 /* Init page info fields and clear the pages */
827 for ( i = 0; i < 1<<order ; i++ )
828 {
829 /* Before we overwrite the old contents of this page,
830 * we need to be sure that no TLB holds a pointer to it. */
831 mask = d->domain_dirty_cpumask;
832 tlbflush_filter(mask, sp[i].tlbflush_timestamp);
833 if ( unlikely(!cpus_empty(mask)) )
834 {
835 perfc_incrc(shadow_alloc_tlbflush);
836 flush_tlb_mask(mask);
837 }
838 /* Now safe to clear the page for reuse */
839 p = sh_map_domain_page(shadow_page_to_mfn(sp+i));
840 ASSERT(p != NULL);
841 clear_page(p);
842 sh_unmap_domain_page(p);
843 INIT_LIST_HEAD(&sp[i].list);
844 sp[i].type = shadow_type;
845 sp[i].pinned = 0;
846 sp[i].logdirty = 0;
847 sp[i].count = 0;
848 sp[i].backpointer = backpointer;
849 sp[i].next_shadow = NULL;
850 perfc_incr(shadow_alloc_count);
851 }
852 return shadow_page_to_mfn(sp);
853 }
855 /* If we get here, we failed to allocate. This should never happen.
856 * It means that we didn't call shadow_prealloc() correctly before
857 * we allocated. We can't recover by calling prealloc here, because
858 * we might free up higher-level pages that the caller is working on. */
859 SHADOW_PRINTK("Can't allocate %i shadow pages!\n", 1 << order);
860 BUG();
861 }
864 /* Return some shadow pages to the pool. */
865 void shadow_free(struct domain *d, mfn_t smfn)
866 {
867 struct shadow_page_info *sp = mfn_to_shadow_page(smfn);
868 u32 shadow_type;
869 unsigned long order;
870 unsigned long mask;
871 int i;
873 ASSERT(shadow_lock_is_acquired(d));
874 perfc_incrc(shadow_free);
876 shadow_type = sp->type;
877 ASSERT(shadow_type != SH_type_none);
878 ASSERT(shadow_type != SH_type_p2m_table);
879 order = shadow_order(shadow_type);
881 d->arch.shadow.free_pages += 1 << order;
883 for ( i = 0; i < 1<<order; i++ )
884 {
885 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
886 struct vcpu *v;
887 for_each_vcpu(d, v)
888 {
889 /* No longer safe to look for a writeable mapping in this shadow */
890 if ( v->arch.shadow.last_writeable_pte_smfn == mfn_x(smfn) + i )
891 v->arch.shadow.last_writeable_pte_smfn = 0;
892 }
893 #endif
894 /* Strip out the type: this is now a free shadow page */
895 sp[i].type = 0;
896 /* Remember the TLB timestamp so we will know whether to flush
897 * TLBs when we reuse the page. Because the destructors leave the
898 * contents of the pages in place, we can delay TLB flushes until
899 * just before the allocator hands the page out again. */
900 sp[i].tlbflush_timestamp = tlbflush_current_time();
901 perfc_decr(shadow_alloc_count);
902 }
904 /* Merge chunks as far as possible. */
905 while ( order < SHADOW_MAX_ORDER )
906 {
907 mask = 1 << order;
908 if ( (mfn_x(shadow_page_to_mfn(sp)) & mask) ) {
909 /* Merge with predecessor block? */
910 if ( ((sp-mask)->type != PGT_none) || ((sp-mask)->order != order) )
911 break;
912 list_del(&(sp-mask)->list);
913 sp -= mask;
914 } else {
915 /* Merge with successor block? */
916 if ( ((sp+mask)->type != PGT_none) || ((sp+mask)->order != order) )
917 break;
918 list_del(&(sp+mask)->list);
919 }
920 order++;
921 }
923 sp->order = order;
924 list_add_tail(&sp->list, &d->arch.shadow.freelists[order]);
925 }
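/* Illustration of the merge loop above: when an order-0 page whose mfn has
 * bit 0 set is freed, its buddy is the page one mfn below it; if that buddy
 * is also free (type == PGT_none) and of the same order, the two coalesce
 * into an order-1 block, and the test repeats until SHADOW_MAX_ORDER is
 * reached or a buddy is found to be still in use. */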
927 /* Divert some memory from the pool to be used by the p2m mapping.
928 * This action is irreversible: the p2m mapping only ever grows.
929 * That's OK because the p2m table only exists for translated domains,
930 * and those domains can't ever turn off shadow mode.
931 * Also, we only ever allocate a max-order chunk, so as to preserve
932 * the invariant that shadow_prealloc() always works.
933 * Returns 0 iff it can't get a chunk (the caller should then
934 * free up some pages in domheap and call set_sh_allocation);
935 * returns non-zero on success.
936 */
937 static int
938 shadow_alloc_p2m_pages(struct domain *d)
939 {
939 {
940 struct page_info *pg;
941 u32 i;
942 ASSERT(shadow_lock_is_acquired(d));
944 if ( d->arch.shadow.total_pages
945 < (shadow_min_acceptable_pages(d) + (1<<SHADOW_MAX_ORDER)) )
946 return 0; /* Not enough shadow memory: need to increase it first */
948 pg = mfn_to_page(shadow_alloc(d, SH_type_p2m_table, 0));
949 d->arch.shadow.p2m_pages += (1<<SHADOW_MAX_ORDER);
950 d->arch.shadow.total_pages -= (1<<SHADOW_MAX_ORDER);
951 for (i = 0; i < (1<<SHADOW_MAX_ORDER); i++)
952 {
952 {
953 /* Unlike shadow pages, mark p2m pages as owned by the domain.
954 * Marking the domain as the owner would normally allow the guest to
955 * create mappings of these pages, but these p2m pages will never be
956 * in the domain's guest-physical address space, and so that is not
957 * believed to be a concern.
958 */
959 page_set_owner(&pg[i], d);
960 list_add_tail(&pg[i].list, &d->arch.shadow.p2m_freelist);
961 }
962 return 1;
963 }
965 // Returns 0 if no memory is available...
966 mfn_t
967 shadow_alloc_p2m_page(struct domain *d)
968 {
968 {
969 struct list_head *entry;
970 struct page_info *pg;
971 mfn_t mfn;
972 void *p;
974 if ( list_empty(&d->arch.shadow.p2m_freelist) &&
975 !shadow_alloc_p2m_pages(d) )
976 return _mfn(0);
977 entry = d->arch.shadow.p2m_freelist.next;
978 list_del(entry);
979 list_add_tail(entry, &d->arch.shadow.p2m_inuse);
980 pg = list_entry(entry, struct page_info, list);
981 pg->count_info = 1;
982 mfn = page_to_mfn(pg);
983 p = sh_map_domain_page(mfn);
984 clear_page(p);
985 sh_unmap_domain_page(p);
987 return mfn;
988 }
990 #if CONFIG_PAGING_LEVELS == 3
991 static void p2m_install_entry_in_monitors(struct domain *d,
992 l3_pgentry_t *l3e)
993 /* Special case, only used for external-mode domains on PAE hosts:
994 * update the mapping of the p2m table. Once again, this is trivial in
995 * other paging modes (one top-level entry points to the top-level p2m,
996 * no maintenance needed), but PAE makes life difficult by needing a
997 * copy of the eight l3es of the p2m table in eight l2h slots in the
998 * monitor table. This function makes fresh copies when a p2m l3e
999 * changes. */
1001 l2_pgentry_t *ml2e;
1002 struct vcpu *v;
1003 unsigned int index;
1005 index = ((unsigned long)l3e & ~PAGE_MASK) / sizeof(l3_pgentry_t);
1006 ASSERT(index < MACHPHYS_MBYTES>>1);
1008 for_each_vcpu(d, v)
1010 if ( pagetable_get_pfn(v->arch.monitor_table) == 0 )
1011 continue;
1012 ASSERT(shadow_mode_external(v->domain));
1014 SHADOW_DEBUG(P2M, "d=%u v=%u index=%u mfn=%#lx\n",
1015 d->domain_id, v->vcpu_id, index, l3e_get_pfn(*l3e));
1017 if ( v == current ) /* OK to use linear map of monitor_table */
1018 ml2e = __linear_l2_table + l2_linear_offset(RO_MPT_VIRT_START);
1019 else
1021 l3_pgentry_t *ml3e;
1022 ml3e = sh_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
1023 ASSERT(l3e_get_flags(ml3e[3]) & _PAGE_PRESENT);
1024 ml2e = sh_map_domain_page(_mfn(l3e_get_pfn(ml3e[3])));
1025 ml2e += l2_table_offset(RO_MPT_VIRT_START);
1026 sh_unmap_domain_page(ml3e);
1028 ml2e[index] = l2e_from_pfn(l3e_get_pfn(*l3e), __PAGE_HYPERVISOR);
1029 if ( v != current )
1030 sh_unmap_domain_page(ml2e);
1033 #endif
1035 // Find the next level's P2M entry, checking for out-of-range gfn's...
1036 // Returns NULL on error.
1037 //
1038 static l1_pgentry_t *
1039 p2m_find_entry(void *table, unsigned long *gfn_remainder,
1040 unsigned long gfn, u32 shift, u32 max)
1041 {
1042 u32 index;
1044 index = *gfn_remainder >> shift;
1045 if ( index >= max )
1046 {
1047 SHADOW_DEBUG(P2M, "gfn=0x%lx out of range "
1048 "(gfn_remainder=0x%lx shift=%d index=0x%x max=0x%x)\n",
1049 gfn, *gfn_remainder, shift, index, max);
1050 return NULL;
1051 }
1052 *gfn_remainder &= (1 << shift) - 1;
1053 return (l1_pgentry_t *)table + index;
1054 }
1056 // Walk one level of the P2M table, allocating a new table if required.
1057 // Returns 0 on error.
1058 //
1059 static int
1060 p2m_next_level(struct domain *d, mfn_t *table_mfn, void **table,
1061 unsigned long *gfn_remainder, unsigned long gfn, u32 shift,
1062 u32 max, unsigned long type)
1064 l1_pgentry_t *p2m_entry;
1065 void *next;
1067 if ( !(p2m_entry = p2m_find_entry(*table, gfn_remainder, gfn,
1068 shift, max)) )
1069 return 0;
1071 if ( !(l1e_get_flags(*p2m_entry) & _PAGE_PRESENT) )
1073 mfn_t mfn = shadow_alloc_p2m_page(d);
1074 if ( mfn_x(mfn) == 0 )
1075 return 0;
1076 *p2m_entry = l1e_from_pfn(mfn_x(mfn), __PAGE_HYPERVISOR|_PAGE_USER);
1077 mfn_to_page(mfn)->u.inuse.type_info = type | 1 | PGT_validated;
1078 mfn_to_page(mfn)->count_info = 1;
1079 #if CONFIG_PAGING_LEVELS == 3
1080 if (type == PGT_l2_page_table)
1082 struct vcpu *v;
1083 /* We have written to the p2m l3: need to sync the per-vcpu
1084 * copies of it in the monitor tables */
1085 p2m_install_entry_in_monitors(d, (l3_pgentry_t *)p2m_entry);
1086 /* Also, any vcpus running on shadows of the p2m need to
1087 * reload their CR3s so the change propagates to the shadow */
1088 ASSERT(shadow_lock_is_acquired(d));
1089 for_each_vcpu(d, v)
1091 if ( pagetable_get_pfn(v->arch.guest_table)
1092 == pagetable_get_pfn(d->arch.phys_table)
1093 && v->arch.shadow.mode != NULL )
1094 v->arch.shadow.mode->update_cr3(v);
1097 #endif
1098 /* The P2M can be shadowed: keep the shadows synced */
1099 if ( d->vcpu[0] != NULL )
1100 (void)__shadow_validate_guest_entry(d->vcpu[0], *table_mfn,
1101 p2m_entry, sizeof *p2m_entry);
1103 *table_mfn = _mfn(l1e_get_pfn(*p2m_entry));
1104 next = sh_map_domain_page(*table_mfn);
1105 sh_unmap_domain_page(*table);
1106 *table = next;
1108 return 1;
1111 // Returns 0 on error (out of memory)
1112 int
1113 shadow_set_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn)
1115 // XXX -- this might be able to be faster iff current->domain == d
1116 mfn_t table_mfn = pagetable_get_mfn(d->arch.phys_table);
1117 void *table = sh_map_domain_page(table_mfn);
1118 unsigned long gfn_remainder = gfn;
1119 l1_pgentry_t *p2m_entry;
1120 int rv=0;
1122 #if CONFIG_PAGING_LEVELS >= 4
1123 if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn,
1124 L4_PAGETABLE_SHIFT - PAGE_SHIFT,
1125 L4_PAGETABLE_ENTRIES, PGT_l3_page_table) )
1126 goto out;
1127 #endif
1128 #if CONFIG_PAGING_LEVELS >= 3
1129 // When using PAE Xen, we only allow 33 bits of pseudo-physical
1130 // address in translated guests (i.e. 8 GBytes). This restriction
1131 // comes from wanting to map the P2M table into the 16MB RO_MPT hole
1132 // in Xen's address space for translated PV guests.
1133 //
1134 if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn,
1135 L3_PAGETABLE_SHIFT - PAGE_SHIFT,
1136 (CONFIG_PAGING_LEVELS == 3
1137 ? 8
1138 : L3_PAGETABLE_ENTRIES),
1139 PGT_l2_page_table) )
1140 goto out;
1141 #endif
1142 if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn,
1143 L2_PAGETABLE_SHIFT - PAGE_SHIFT,
1144 L2_PAGETABLE_ENTRIES, PGT_l1_page_table) )
1145 goto out;
1147 p2m_entry = p2m_find_entry(table, &gfn_remainder, gfn,
1148 0, L1_PAGETABLE_ENTRIES);
1149 ASSERT(p2m_entry);
1150 if ( mfn_valid(mfn) )
1151 *p2m_entry = l1e_from_pfn(mfn_x(mfn), __PAGE_HYPERVISOR|_PAGE_USER);
1152 else
1153 *p2m_entry = l1e_empty();
1155 /* Track the highest gfn for which we have ever had a valid mapping */
1156 if ( mfn_valid(mfn) && (gfn > d->arch.max_mapped_pfn) )
1157 d->arch.max_mapped_pfn = gfn;
1159 /* The P2M can be shadowed: keep the shadows synced */
1160 if ( d->vcpu[0] != NULL )
1161 (void)__shadow_validate_guest_entry(
1162 d->vcpu[0], table_mfn, p2m_entry, sizeof(*p2m_entry));
1164 /* Success */
1165 rv = 1;
1167 out:
1168 sh_unmap_domain_page(table);
1169 return rv;
1172 // Allocate a new p2m table for a domain.
1173 //
1174 // The structure of the p2m table is that of a pagetable for xen (i.e. it is
1175 // controlled by CONFIG_PAGING_LEVELS).
1176 //
1177 // Returns 0 if p2m table could not be initialized
1178 //
1179 static int
1180 shadow_alloc_p2m_table(struct domain *d)
1182 mfn_t p2m_top, mfn;
1183 struct list_head *entry;
1184 struct page_info *page;
1185 unsigned int page_count = 0;
1186 unsigned long gfn;
1188 SHADOW_PRINTK("allocating p2m table\n");
1189 ASSERT(pagetable_get_pfn(d->arch.phys_table) == 0);
1191 p2m_top = shadow_alloc_p2m_page(d);
1192 mfn_to_page(p2m_top)->count_info = 1;
1193 mfn_to_page(p2m_top)->u.inuse.type_info =
1194 #if CONFIG_PAGING_LEVELS == 4
1195 PGT_l4_page_table
1196 #elif CONFIG_PAGING_LEVELS == 3
1197 PGT_l3_page_table
1198 #elif CONFIG_PAGING_LEVELS == 2
1199 PGT_l2_page_table
1200 #endif
1201 | 1 | PGT_validated;
1203 if ( mfn_x(p2m_top) == 0 )
1204 return 0;
1206 d->arch.phys_table = pagetable_from_mfn(p2m_top);
1208 SHADOW_PRINTK("populating p2m table\n");
1210 /* Initialise physmap tables for slot zero. Other code assumes this. */
1211 gfn = 0;
1212 mfn = _mfn(INVALID_MFN);
1213 if ( !shadow_set_p2m_entry(d, gfn, mfn) )
1214 goto error;
1216 for ( entry = d->page_list.next;
1217 entry != &d->page_list;
1218 entry = entry->next )
1220 page = list_entry(entry, struct page_info, list);
1221 mfn = page_to_mfn(page);
1222 gfn = get_gpfn_from_mfn(mfn_x(mfn));
1223 page_count++;
1224 if (
1225 #ifdef __x86_64__
1226 (gfn != 0x5555555555555555L)
1227 #else
1228 (gfn != 0x55555555L)
1229 #endif
1230 && gfn != INVALID_M2P_ENTRY
1231 && !shadow_set_p2m_entry(d, gfn, mfn) )
1232 goto error;
1235 SHADOW_PRINTK("p2m table initialised (%u pages)\n", page_count);
1236 return 1;
1238 error:
1239 SHADOW_PRINTK("failed to initialize p2m table, gfn=%05lx, mfn=%"
1240 SH_PRI_mfn "\n", gfn, mfn_x(mfn));
1241 return 0;
1244 mfn_t
1245 sh_gfn_to_mfn_foreign(struct domain *d, unsigned long gpfn)
1246 /* Read another domain's p2m entries */
1248 mfn_t mfn;
1249 paddr_t addr = ((paddr_t)gpfn) << PAGE_SHIFT;
1250 l2_pgentry_t *l2e;
1251 l1_pgentry_t *l1e;
1253 ASSERT(shadow_mode_translate(d));
1254 mfn = pagetable_get_mfn(d->arch.phys_table);
1257 if ( gpfn > d->arch.max_mapped_pfn )
1258 /* This pfn is higher than the highest the p2m map currently holds */
1259 return _mfn(INVALID_MFN);
1261 #if CONFIG_PAGING_LEVELS >= 4
1263 l4_pgentry_t *l4e = sh_map_domain_page(mfn);
1264 l4e += l4_table_offset(addr);
1265 if ( (l4e_get_flags(*l4e) & _PAGE_PRESENT) == 0 )
1267 sh_unmap_domain_page(l4e);
1268 return _mfn(INVALID_MFN);
1270 mfn = _mfn(l4e_get_pfn(*l4e));
1271 sh_unmap_domain_page(l4e);
1273 #endif
1274 #if CONFIG_PAGING_LEVELS >= 3
1276 l3_pgentry_t *l3e = sh_map_domain_page(mfn);
1277 #if CONFIG_PAGING_LEVELS == 3
1278 /* On PAE hosts the p2m has eight l3 entries, not four (see
1279 * shadow_set_p2m_entry()) so we can't use l3_table_offset.
1280 * Instead, just count the number of l3es from zero. It's safe
1281 * to do this because we already checked that the gfn is within
1282 * the bounds of the p2m. */
1283 l3e += (addr >> L3_PAGETABLE_SHIFT);
1284 #else
1285 l3e += l3_table_offset(addr);
1286 #endif
1287 if ( (l3e_get_flags(*l3e) & _PAGE_PRESENT) == 0 )
1289 sh_unmap_domain_page(l3e);
1290 return _mfn(INVALID_MFN);
1292 mfn = _mfn(l3e_get_pfn(*l3e));
1293 sh_unmap_domain_page(l3e);
1295 #endif
1297 l2e = sh_map_domain_page(mfn);
1298 l2e += l2_table_offset(addr);
1299 if ( (l2e_get_flags(*l2e) & _PAGE_PRESENT) == 0 )
1301 sh_unmap_domain_page(l2e);
1302 return _mfn(INVALID_MFN);
1304 mfn = _mfn(l2e_get_pfn(*l2e));
1305 sh_unmap_domain_page(l2e);
1307 l1e = sh_map_domain_page(mfn);
1308 l1e += l1_table_offset(addr);
1309 if ( (l1e_get_flags(*l1e) & _PAGE_PRESENT) == 0 )
1311 sh_unmap_domain_page(l1e);
1312 return _mfn(INVALID_MFN);
1314 mfn = _mfn(l1e_get_pfn(*l1e));
1315 sh_unmap_domain_page(l1e);
1317 return mfn;
1320 unsigned long
1321 shadow_gfn_to_mfn_foreign(unsigned long gpfn)
1322 {
1323 return mfn_x(sh_gfn_to_mfn_foreign(current->domain, gpfn));
1324 }
1327 static void shadow_p2m_teardown(struct domain *d)
1328 /* Return all the p2m pages to Xen.
1329 * We know we don't have any extra mappings to these pages */
1331 struct list_head *entry, *n;
1332 struct page_info *pg;
1334 d->arch.phys_table = pagetable_null();
1336 list_for_each_safe(entry, n, &d->arch.shadow.p2m_inuse)
1338 pg = list_entry(entry, struct page_info, list);
1339 list_del(entry);
1340 /* Should have just the one ref we gave it in alloc_p2m_page() */
1341 if ( (pg->count_info & PGC_count_mask) != 1 )
1343 SHADOW_PRINTK("Odd p2m page count c=%#x t=%"PRtype_info"\n",
1344 pg->count_info, pg->u.inuse.type_info);
1346 ASSERT(page_get_owner(pg) == d);
1347 /* Free should not decrement domain's total allocation, since
1348 * these pages were allocated without an owner. */
1349 page_set_owner(pg, NULL);
1350 free_domheap_pages(pg, 0);
1351 d->arch.shadow.p2m_pages--;
1352 perfc_decr(shadow_alloc_count);
1354 list_for_each_safe(entry, n, &d->arch.shadow.p2m_freelist)
1356 list_del(entry);
1357 pg = list_entry(entry, struct page_info, list);
1358 ASSERT(page_get_owner(pg) == d);
1359 /* Free should not decrement domain's total allocation. */
1360 page_set_owner(pg, NULL);
1361 free_domheap_pages(pg, 0);
1362 d->arch.shadow.p2m_pages--;
1363 perfc_decr(shadow_alloc_count);
1365 ASSERT(d->arch.shadow.p2m_pages == 0);
1368 /* Set the pool of shadow pages to the required number of pages.
1369 * Input will be rounded up to at least shadow_min_acceptable_pages(),
1370 * plus space for the p2m table.
1371 * Returns 0 for success, non-zero for failure. */
1372 static unsigned int set_sh_allocation(struct domain *d,
1373 unsigned int pages,
1374 int *preempted)
1376 struct shadow_page_info *sp;
1377 unsigned int lower_bound;
1378 int j;
1380 ASSERT(shadow_lock_is_acquired(d));
1382 /* Don't allocate less than the minimum acceptable, plus one page per
1383 * megabyte of RAM (for the p2m table) */
1384 lower_bound = shadow_min_acceptable_pages(d) + (d->tot_pages / 256);
1385 if ( pages > 0 && pages < lower_bound )
1386 pages = lower_bound;
1387 /* Round up to largest block size */
1388 pages = (pages + ((1<<SHADOW_MAX_ORDER)-1)) & ~((1<<SHADOW_MAX_ORDER)-1);
1390 SHADOW_PRINTK("current %i target %i\n",
1391 d->arch.shadow.total_pages, pages);
1393 while ( d->arch.shadow.total_pages != pages )
1395 if ( d->arch.shadow.total_pages < pages )
1397 /* Need to allocate more memory from domheap */
1398 sp = (struct shadow_page_info *)
1399 alloc_domheap_pages(NULL, SHADOW_MAX_ORDER, 0);
1400 if ( sp == NULL )
1402 SHADOW_PRINTK("failed to allocate shadow pages.\n");
1403 return -ENOMEM;
1405 d->arch.shadow.free_pages += 1<<SHADOW_MAX_ORDER;
1406 d->arch.shadow.total_pages += 1<<SHADOW_MAX_ORDER;
1407 for ( j = 0; j < 1<<SHADOW_MAX_ORDER; j++ )
1409 sp[j].type = 0;
1410 sp[j].pinned = 0;
1411 sp[j].logdirty = 0;
1412 sp[j].count = 0;
1413 sp[j].mbz = 0;
1414 sp[j].tlbflush_timestamp = 0; /* Not in any TLB */
1416 sp->order = SHADOW_MAX_ORDER;
1417 list_add_tail(&sp->list,
1418 &d->arch.shadow.freelists[SHADOW_MAX_ORDER]);
1420 else if ( d->arch.shadow.total_pages > pages )
1422 /* Need to return memory to domheap */
1423 shadow_prealloc(d, SHADOW_MAX_ORDER);
1424 ASSERT(!list_empty(&d->arch.shadow.freelists[SHADOW_MAX_ORDER]));
1425 sp = list_entry(d->arch.shadow.freelists[SHADOW_MAX_ORDER].next,
1426 struct shadow_page_info, list);
1427 list_del(&sp->list);
1428 d->arch.shadow.free_pages -= 1<<SHADOW_MAX_ORDER;
1429 d->arch.shadow.total_pages -= 1<<SHADOW_MAX_ORDER;
1430 free_domheap_pages((struct page_info *)sp, SHADOW_MAX_ORDER);
1433 /* Check to see if we need to yield and try again */
1434 if ( preempted && hypercall_preempt_check() )
1436 *preempted = 1;
1437 return 0;
1441 return 0;
1444 unsigned int shadow_set_allocation(struct domain *d,
1445 unsigned int megabytes,
1446 int *preempted)
1447 /* Hypercall interface to set the shadow memory allocation */
1449 unsigned int rv;
1450 shadow_lock(d);
1451 rv = set_sh_allocation(d, megabytes << (20 - PAGE_SHIFT), preempted);
1452 SHADOW_PRINTK("dom %u allocation now %u pages (%u MB)\n",
1453 d->domain_id,
1454 d->arch.shadow.total_pages,
1455 shadow_get_allocation(d));
1456 shadow_unlock(d);
1457 return rv;
1460 /**************************************************************************/
1461 /* Hash table for storing the guest->shadow mappings.
1462 * The table itself is an array of pointers to shadows; the shadows are then
1463 * threaded on a singly-linked list of shadows with the same hash value */
1465 #define SHADOW_HASH_BUCKETS 251
1466 /* Other possibly useful primes are 509, 1021, 2039, 4093, 8191, 16381 */
1468 /* Hash function that takes a gfn or mfn, plus another byte of type info */
1469 typedef u32 key_t;
1470 static inline key_t sh_hash(unsigned long n, unsigned int t)
1471 {
1472 unsigned char *p = (unsigned char *)&n;
1473 key_t k = t;
1474 int i;
1475 for ( i = 0; i < sizeof(n) ; i++ ) k = (u32)p[i] + (k<<6) + (k<<16) - k;
1476 return k % SHADOW_HASH_BUCKETS;
1477 }
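/* The loop above is essentially the sdbm string hash,
 * k = c + (k << 6) + (k << 16) - k, run over the bytes of the frame number
 * with the shadow type as the seed, then reduced mod 251 buckets. */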
1479 #if SHADOW_AUDIT & (SHADOW_AUDIT_HASH|SHADOW_AUDIT_HASH_FULL)
1481 /* Before we get to the mechanism, define a pair of audit functions
1482 * that sanity-check the contents of the hash table. */
1483 static void sh_hash_audit_bucket(struct domain *d, int bucket)
1484 /* Audit one bucket of the hash table */
1486 struct shadow_page_info *sp, *x;
1488 if ( !(SHADOW_AUDIT_ENABLE) )
1489 return;
1491 sp = d->arch.shadow.hash_table[bucket];
1492 while ( sp )
1494 /* Not a shadow? */
1495 BUG_ON( sp->mbz != 0 );
1496 /* Bogus type? */
1497 BUG_ON( sp->type == 0 );
1498 BUG_ON( sp->type > SH_type_max_shadow );
1499 /* Wrong bucket? */
1500 BUG_ON( sh_hash(sp->backpointer, sp->type) != bucket );
1501 /* Duplicate entry? */
1502 for ( x = sp->next_shadow; x; x = x->next_shadow )
1503 BUG_ON( x->backpointer == sp->backpointer && x->type == sp->type );
1504 /* Follow the backpointer to the guest pagetable */
1505 if ( sp->type != SH_type_fl1_32_shadow
1506 && sp->type != SH_type_fl1_pae_shadow
1507 && sp->type != SH_type_fl1_64_shadow )
1509 struct page_info *gpg = mfn_to_page(_mfn(sp->backpointer));
1510 /* Bad shadow flags on guest page? */
1511 BUG_ON( !(gpg->shadow_flags & (1<<sp->type)) );
1512 /* Bad type count on guest page? */
1513 if ( (gpg->u.inuse.type_info & PGT_type_mask) == PGT_writable_page
1514 && (gpg->u.inuse.type_info & PGT_count_mask) != 0 )
1516 SHADOW_ERROR("MFN %#lx shadowed (by %#"SH_PRI_mfn")"
1517 " but has typecount %#lx\n",
1518 sp->backpointer, mfn_x(shadow_page_to_mfn(sp)),
1519 gpg->u.inuse.type_info);
1520 BUG();
1523 /* That entry was OK; on we go */
1524 sp = sp->next_shadow;
1528 #else
1529 #define sh_hash_audit_bucket(_d, _b) do {} while(0)
1530 #endif /* Hashtable bucket audit */
1533 #if SHADOW_AUDIT & SHADOW_AUDIT_HASH_FULL
1535 static void sh_hash_audit(struct domain *d)
1536 /* Full audit: audit every bucket in the table */
1538 int i;
1540 if ( !(SHADOW_AUDIT_ENABLE) )
1541 return;
1543 for ( i = 0; i < SHADOW_HASH_BUCKETS; i++ )
1545 sh_hash_audit_bucket(d, i);
1549 #else
1550 #define sh_hash_audit(_d) do {} while(0)
1551 #endif /* Hashtable bucket audit */
1553 /* Allocate and initialise the table itself.
1554 * Returns 0 for success, 1 for error. */
1555 static int shadow_hash_alloc(struct domain *d)
1556 {
1557 struct shadow_page_info **table;
1559 ASSERT(shadow_lock_is_acquired(d));
1560 ASSERT(!d->arch.shadow.hash_table);
1562 table = xmalloc_array(struct shadow_page_info *, SHADOW_HASH_BUCKETS);
1563 if ( !table ) return 1;
1564 memset(table, 0,
1565 SHADOW_HASH_BUCKETS * sizeof (struct shadow_page_info *));
1566 d->arch.shadow.hash_table = table;
1567 return 0;
1568 }
1570 /* Tear down the hash table and return all memory to Xen.
1571 * This function does not care whether the table is populated. */
1572 static void shadow_hash_teardown(struct domain *d)
1573 {
1574 ASSERT(shadow_lock_is_acquired(d));
1575 ASSERT(d->arch.shadow.hash_table);
1577 xfree(d->arch.shadow.hash_table);
1578 d->arch.shadow.hash_table = NULL;
1579 }
1582 mfn_t shadow_hash_lookup(struct vcpu *v, unsigned long n, unsigned int t)
1583 /* Find an entry in the hash table. Returns the MFN of the shadow,
1584 * or INVALID_MFN if it doesn't exist */
1586 struct domain *d = v->domain;
1587 struct shadow_page_info *sp, *prev;
1588 key_t key;
1590 ASSERT(shadow_lock_is_acquired(d));
1591 ASSERT(d->arch.shadow.hash_table);
1592 ASSERT(t);
1594 sh_hash_audit(d);
1596 perfc_incrc(shadow_hash_lookups);
1597 key = sh_hash(n, t);
1598 sh_hash_audit_bucket(d, key);
1600 sp = d->arch.shadow.hash_table[key];
1601 prev = NULL;
1602 while(sp)
1604 if ( sp->backpointer == n && sp->type == t )
1606 /* Pull-to-front if 'sp' isn't already the head item */
1607 if ( unlikely(sp != d->arch.shadow.hash_table[key]) )
1609 if ( unlikely(d->arch.shadow.hash_walking != 0) )
1610 /* Can't reorder: someone is walking the hash chains */
1611 return shadow_page_to_mfn(sp);
1612 else
1614 ASSERT(prev);
1615 /* Delete sp from the list */
1616 prev->next_shadow = sp->next_shadow;
1617 /* Re-insert it at the head of the list */
1618 sp->next_shadow = d->arch.shadow.hash_table[key];
1619 d->arch.shadow.hash_table[key] = sp;
1622 else
1624 perfc_incrc(shadow_hash_lookup_head);
1626 return shadow_page_to_mfn(sp);
1628 prev = sp;
1629 sp = sp->next_shadow;
1632 perfc_incrc(shadow_hash_lookup_miss);
1633 return _mfn(INVALID_MFN);
1636 void shadow_hash_insert(struct vcpu *v, unsigned long n, unsigned int t,
1637 mfn_t smfn)
1638 /* Put a mapping (n,t)->smfn into the hash table */
1640 struct domain *d = v->domain;
1641 struct shadow_page_info *sp;
1642 key_t key;
1644 ASSERT(shadow_lock_is_acquired(d));
1645 ASSERT(d->arch.shadow.hash_table);
1646 ASSERT(t);
1648 sh_hash_audit(d);
1650 perfc_incrc(shadow_hash_inserts);
1651 key = sh_hash(n, t);
1652 sh_hash_audit_bucket(d, key);
1654 /* Insert this shadow at the top of the bucket */
1655 sp = mfn_to_shadow_page(smfn);
1656 sp->next_shadow = d->arch.shadow.hash_table[key];
1657 d->arch.shadow.hash_table[key] = sp;
1659 sh_hash_audit_bucket(d, key);
1662 void shadow_hash_delete(struct vcpu *v, unsigned long n, unsigned int t,
1663 mfn_t smfn)
1664 /* Excise the mapping (n,t)->smfn from the hash table */
1666 struct domain *d = v->domain;
1667 struct shadow_page_info *sp, *x;
1668 key_t key;
1670 ASSERT(shadow_lock_is_acquired(d));
1671 ASSERT(d->arch.shadow.hash_table);
1672 ASSERT(t);
1674 sh_hash_audit(d);
1676 perfc_incrc(shadow_hash_deletes);
1677 key = sh_hash(n, t);
1678 sh_hash_audit_bucket(d, key);
1680 sp = mfn_to_shadow_page(smfn);
1681 if ( d->arch.shadow.hash_table[key] == sp )
1682 /* Easy case: we're deleting the head item. */
1683 d->arch.shadow.hash_table[key] = sp->next_shadow;
1684 else
1686 /* Need to search for the one we want */
1687 x = d->arch.shadow.hash_table[key];
1688 while ( 1 )
1690 ASSERT(x); /* We can't have hit the end, since our target is
1691 * still in the chain somewhere... */
1692 if ( x->next_shadow == sp )
1694 x->next_shadow = sp->next_shadow;
1695 break;
1697 x = x->next_shadow;
1700 sp->next_shadow = NULL;
1702 sh_hash_audit_bucket(d, key);
1705 typedef int (*hash_callback_t)(struct vcpu *v, mfn_t smfn, mfn_t other_mfn);
1707 static void hash_foreach(struct vcpu *v,
1708 unsigned int callback_mask,
1709 hash_callback_t callbacks[],
1710 mfn_t callback_mfn)
1711 /* Walk the hash table looking at the types of the entries and
1712 * calling the appropriate callback function for each entry.
1713 * The mask determines which shadow types we call back for, and the array
1714 * of callbacks tells us which function to call.
1715 * Any callback may return non-zero to let us skip the rest of the scan.
1717 * WARNING: Callbacks MUST NOT add or remove hash entries unless they
1718 * then return non-zero to terminate the scan. */
1720 int i, done = 0;
1721 struct domain *d = v->domain;
1722 struct shadow_page_info *x;
1724 /* Say we're here, to stop hash-lookups reordering the chains */
1725 ASSERT(shadow_lock_is_acquired(d));
1726 ASSERT(d->arch.shadow.hash_walking == 0);
1727 d->arch.shadow.hash_walking = 1;
1729 for ( i = 0; i < SHADOW_HASH_BUCKETS; i++ )
1731 /* WARNING: This is not safe against changes to the hash table.
1732 * The callback *must* return non-zero if it has inserted or
1733 * deleted anything from the hash (lookups are OK, though). */
1734 for ( x = d->arch.shadow.hash_table[i]; x; x = x->next_shadow )
1736 if ( callback_mask & (1 << x->type) )
1738 ASSERT(x->type <= 15);
1739 ASSERT(callbacks[x->type] != NULL);
1740 done = callbacks[x->type](v, shadow_page_to_mfn(x),
1741 callback_mfn);
1742 if ( done ) break;
1745 if ( done ) break;
1747 d->arch.shadow.hash_walking = 0;
1751 /**************************************************************************/
1752 /* Destroy a shadow page: simple dispatcher to call the per-type destructor
1753 * which will decrement refcounts appropriately and return memory to the
1754 * free pool. */
1756 void sh_destroy_shadow(struct vcpu *v, mfn_t smfn)
1758 struct shadow_page_info *sp = mfn_to_shadow_page(smfn);
1759 unsigned int t = sp->type;
1762 SHADOW_PRINTK("smfn=%#lx\n", mfn_x(smfn));
1764 /* Double-check, if we can, that the shadowed page belongs to this
1765 * domain, (by following the back-pointer). */
1766 ASSERT(t == SH_type_fl1_32_shadow ||
1767 t == SH_type_fl1_pae_shadow ||
1768 t == SH_type_fl1_64_shadow ||
1769 t == SH_type_monitor_table ||
1770 (page_get_owner(mfn_to_page(_mfn(sp->backpointer)))
1771 == v->domain));
1773 /* The down-shifts here are so that the switch statement is on nice
1774 * small numbers that the compiler will enjoy */
1775 switch ( t )
1777 #if CONFIG_PAGING_LEVELS == 2
1778 case SH_type_l1_32_shadow:
1779 case SH_type_fl1_32_shadow:
1780 SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 2, 2)(v, smfn);
1781 break;
1782 case SH_type_l2_32_shadow:
1783 SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 2, 2)(v, smfn);
1784 break;
1785 #else /* PAE or 64bit */
1786 case SH_type_l1_32_shadow:
1787 case SH_type_fl1_32_shadow:
1788 SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 3, 2)(v, smfn);
1789 break;
1790 case SH_type_l2_32_shadow:
1791 SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 3, 2)(v, smfn);
1792 break;
1793 #endif
1795 #if CONFIG_PAGING_LEVELS >= 3
1796 case SH_type_l1_pae_shadow:
1797 case SH_type_fl1_pae_shadow:
1798 SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 3, 3)(v, smfn);
1799 break;
1800 case SH_type_l2_pae_shadow:
1801 case SH_type_l2h_pae_shadow:
1802 SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 3, 3)(v, smfn);
1803 break;
1804 #endif
1806 #if CONFIG_PAGING_LEVELS >= 4
1807 case SH_type_l1_64_shadow:
1808 case SH_type_fl1_64_shadow:
1809 SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 4, 4)(v, smfn);
1810 break;
1811 case SH_type_l2_64_shadow:
1812 SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 4, 4)(v, smfn);
1813 break;
1814 case SH_type_l3_64_shadow:
1815 SHADOW_INTERNAL_NAME(sh_destroy_l3_shadow, 4, 4)(v, smfn);
1816 break;
1817 case SH_type_l4_64_shadow:
1818 SHADOW_INTERNAL_NAME(sh_destroy_l4_shadow, 4, 4)(v, smfn);
1819 break;
1820 #endif
1821 default:
1822 SHADOW_PRINTK("tried to destroy shadow of bad type %08lx\n",
1823 (unsigned long)t);
1824 BUG();
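/* Note on the dispatch above: SHADOW_INTERNAL_NAME(fn, shadow_levels,
 * guest_levels) names the variant of fn that was compiled for that
 * particular (shadow, guest) paging-level pair, so sh_destroy_shadow()
 * only has to map each shadow type onto the matching per-level
 * destructor, e.g. SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow,3,2) for an
 * l1 shadow of a 2-level guest on a PAE/64-bit build. */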
1828 /**************************************************************************/
1829 /* Remove all writeable mappings of a guest frame from the shadow tables
1830 * Returns non-zero if we need to flush TLBs.
1831 * level and fault_addr describe how we found this to be a pagetable;
1832 * level==0 means we have some other reason for revoking write access.*/
1834 int shadow_remove_write_access(struct vcpu *v, mfn_t gmfn,
1835 unsigned int level,
1836 unsigned long fault_addr)
1838 /* Dispatch table for getting per-type functions */
1839 static hash_callback_t callbacks[16] = {
1840 NULL, /* none */
1841 #if CONFIG_PAGING_LEVELS == 2
1842 SHADOW_INTERNAL_NAME(sh_remove_write_access,2,2), /* l1_32 */
1843 SHADOW_INTERNAL_NAME(sh_remove_write_access,2,2), /* fl1_32 */
1844 #else
1845 SHADOW_INTERNAL_NAME(sh_remove_write_access,3,2), /* l1_32 */
1846 SHADOW_INTERNAL_NAME(sh_remove_write_access,3,2), /* fl1_32 */
1847 #endif
1848 NULL, /* l2_32 */
1849 #if CONFIG_PAGING_LEVELS >= 3
1850 SHADOW_INTERNAL_NAME(sh_remove_write_access,3,3), /* l1_pae */
1851 SHADOW_INTERNAL_NAME(sh_remove_write_access,3,3), /* fl1_pae */
1852 #else
1853 NULL, /* l1_pae */
1854 NULL, /* fl1_pae */
1855 #endif
1856 NULL, /* l2_pae */
1857 NULL, /* l2h_pae */
1858 #if CONFIG_PAGING_LEVELS >= 4
1859 SHADOW_INTERNAL_NAME(sh_remove_write_access,4,4), /* l1_64 */
1860 SHADOW_INTERNAL_NAME(sh_remove_write_access,4,4), /* fl1_64 */
1861 #else
1862 NULL, /* l1_64 */
1863 NULL, /* fl1_64 */
1864 #endif
1865 NULL, /* l2_64 */
1866 NULL, /* l3_64 */
1867 NULL, /* l4_64 */
1868 NULL, /* p2m */
1869 NULL /* unused */
1870 };
1872 static unsigned int callback_mask =
1873 1 << SH_type_l1_32_shadow
1874 | 1 << SH_type_fl1_32_shadow
1875 | 1 << SH_type_l1_pae_shadow
1876 | 1 << SH_type_fl1_pae_shadow
1877 | 1 << SH_type_l1_64_shadow
1878 | 1 << SH_type_fl1_64_shadow
1880 struct page_info *pg = mfn_to_page(gmfn);
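/* Only leaf shadows (l1/fl1 of each paging flavour) can hold the PTEs that
 * grant the guest write access to gmfn, which is why the dispatch table
 * above has NULL in every other slot and callback_mask names only the six
 * l1/fl1 types: the brute-force hash walk never needs to visit l2 and
 * higher shadows. */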
1882 ASSERT(shadow_lock_is_acquired(v->domain));
1884 /* Only remove writable mappings if we are doing shadow refcounts.
1885 * In guest refcounting, we trust Xen to already be restricting
1886 * all the writes to the guest page tables, so we do not need to
1887 * do more. */
1888 if ( !shadow_mode_refcounts(v->domain) )
1889 return 0;
1891 /* Early exit if it's already a pagetable, or otherwise not writeable */
1892 if ( sh_mfn_is_a_page_table(gmfn)
1893 || (pg->u.inuse.type_info & PGT_count_mask) == 0 )
1894 return 0;
1896 perfc_incrc(shadow_writeable);
1898 /* If this isn't a "normal" writeable page, the domain is trying to
1899 * put pagetables in special memory of some kind. We can't allow that. */
1900 if ( (pg->u.inuse.type_info & PGT_type_mask) != PGT_writable_page )
1902 SHADOW_ERROR("can't remove write access to mfn %lx, type_info is %"
1903 PRtype_info "\n",
1904 mfn_x(gmfn), mfn_to_page(gmfn)->u.inuse.type_info);
1905 domain_crash(v->domain);
1908 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
1909 if ( v == current && level != 0 )
1911 unsigned long gfn;
1912 /* Heuristic: there is likely to be only one writeable mapping,
1913 * and that mapping is likely to be in the current pagetable,
1914 * in the guest's linear map (on non-HIGHPTE linux and windows)*/
1916 #define GUESS(_a, _h) do { \
1917 if ( v->arch.shadow.mode->guess_wrmap(v, (_a), gmfn) ) \
1918 perfc_incrc(shadow_writeable_h_ ## _h); \
1919 if ( (pg->u.inuse.type_info & PGT_count_mask) == 0 ) \
1920 return 1; \
1921 } while (0)
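/* The shift constants in the GUESS()es below fall out of linear-map
 * arithmetic: a level-N entry for virtual address va lives at
 * map_base + (va >> LEVEL_SHIFT) * entry_size, i.e.
 * map_base + (va >> (LEVEL_SHIFT - log2(entry_size))).  Hence ">> 10" for
 * 4-byte 32-bit l1 entries (12 - 2), ">> 9" for 8-byte PAE/64-bit l1
 * entries (12 - 3), ">> 18" for PAE l2 entries (21 - 3) and ">> 27" for
 * 64-bit l3 entries (30 - 3).  Each GUESS() bails out with "flush needed"
 * as soon as the frame has no writeable mappings left. */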
1924 if ( v->arch.shadow.mode->guest_levels == 2 )
1926 if ( level == 1 )
1927 /* 32bit non-PAE w2k3: linear map at 0xC0000000 */
1928 GUESS(0xC0000000UL + (fault_addr >> 10), 1);
1930 /* Linux lowmem: first 896MB is mapped 1-to-1 above 0xC0000000 */
1931 if ((gfn = sh_mfn_to_gfn(v->domain, gmfn)) < 0x38000 )
1932 GUESS(0xC0000000UL + (gfn << PAGE_SHIFT), 4);
1935 #if CONFIG_PAGING_LEVELS >= 3
1936 else if ( v->arch.shadow.mode->guest_levels == 3 )
1938 /* 32bit PAE w2k3: linear map at 0xC0000000 */
1939 switch ( level )
1941 case 1: GUESS(0xC0000000UL + (fault_addr >> 9), 2); break;
1942 case 2: GUESS(0xC0600000UL + (fault_addr >> 18), 2); break;
1945 /* Linux lowmem: first 896MB is mapped 1-to-1 above 0xC0000000 */
1946 if ((gfn = sh_mfn_to_gfn(v->domain, gmfn)) < 0x38000 )
1947 GUESS(0xC0000000UL + (gfn << PAGE_SHIFT), 4);
1949 #if CONFIG_PAGING_LEVELS >= 4
1950 else if ( v->arch.shadow.mode->guest_levels == 4 )
1952 /* 64bit w2k3: linear map at 0x0000070000000000 */
1953 switch ( level )
1955 case 1: GUESS(0x70000000000UL + (fault_addr >> 9), 3); break;
1956 case 2: GUESS(0x70380000000UL + (fault_addr >> 18), 3); break;
1957 case 3: GUESS(0x70381C00000UL + (fault_addr >> 27), 3); break;
1960 /* 64bit Linux direct map at 0xffff810000000000; older kernels
1961 * had it at 0x0000010000000000UL */
1962 gfn = sh_mfn_to_gfn(v->domain, gmfn);
1963 GUESS(0xffff810000000000UL + (gfn << PAGE_SHIFT), 4);
1964 GUESS(0x0000010000000000UL + (gfn << PAGE_SHIFT), 4);
1966 #endif /* CONFIG_PAGING_LEVELS >= 4 */
1967 #endif /* CONFIG_PAGING_LEVELS >= 3 */
1969 #undef GUESS
1972 if ( (pg->u.inuse.type_info & PGT_count_mask) == 0 )
1973 return 1;
1975 /* Second heuristic: on HIGHPTE linux, there are two particular PTEs
1976 * (entries in the fixmap) where linux maps its pagetables. Since
1977 * we expect to hit them most of the time, we start the search for
1978 * the writeable mapping by looking at the same MFN where the last
1979 * brute-force search succeeded. */
1981 if ( v->arch.shadow.last_writeable_pte_smfn != 0 )
1983 unsigned long old_count = (pg->u.inuse.type_info & PGT_count_mask);
1984 mfn_t last_smfn = _mfn(v->arch.shadow.last_writeable_pte_smfn);
1985 int shtype = mfn_to_shadow_page(last_smfn)->type;
1987 if ( callbacks[shtype] )
1988 callbacks[shtype](v, last_smfn, gmfn);
1990 if ( (pg->u.inuse.type_info & PGT_count_mask) != old_count )
1991 perfc_incrc(shadow_writeable_h_5);
1994 if ( (pg->u.inuse.type_info & PGT_count_mask) == 0 )
1995 return 1;
1997 #endif /* SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC */
1999 /* Brute-force search of all the shadows, by walking the hash */
2000 perfc_incrc(shadow_writeable_bf);
2001 hash_foreach(v, callback_mask, callbacks, gmfn);
2003 /* If that didn't catch the mapping, something is very wrong */
2004 if ( (mfn_to_page(gmfn)->u.inuse.type_info & PGT_count_mask) != 0 )
2006 SHADOW_ERROR("can't find all writeable mappings of mfn %lx: "
2007 "%lu left\n", mfn_x(gmfn),
2008 (mfn_to_page(gmfn)->u.inuse.type_info&PGT_count_mask));
2009 domain_crash(v->domain);
2012 /* We killed at least one writeable mapping, so must flush TLBs. */
2013 return 1;
2018 /**************************************************************************/
2019 /* Remove all mappings of a guest frame from the shadow tables.
2020 * Returns non-zero if we need to flush TLBs. */
2022 int shadow_remove_all_mappings(struct vcpu *v, mfn_t gmfn)
2024 struct page_info *page = mfn_to_page(gmfn);
2025 int expected_count;
2027 /* Dispatch table for getting per-type functions */
2028 static hash_callback_t callbacks[16] = {
2029 NULL, /* none */
2030 #if CONFIG_PAGING_LEVELS == 2
2031 SHADOW_INTERNAL_NAME(sh_remove_all_mappings,2,2), /* l1_32 */
2032 SHADOW_INTERNAL_NAME(sh_remove_all_mappings,2,2), /* fl1_32 */
2033 #else
2034 SHADOW_INTERNAL_NAME(sh_remove_all_mappings,3,2), /* l1_32 */
2035 SHADOW_INTERNAL_NAME(sh_remove_all_mappings,3,2), /* fl1_32 */
2036 #endif
2037 NULL, /* l2_32 */
2038 #if CONFIG_PAGING_LEVELS >= 3
2039 SHADOW_INTERNAL_NAME(sh_remove_all_mappings,3,3), /* l1_pae */
2040 SHADOW_INTERNAL_NAME(sh_remove_all_mappings,3,3), /* fl1_pae */
2041 #else
2042 NULL, /* l1_pae */
2043 NULL, /* fl1_pae */
2044 #endif
2045 NULL, /* l2_pae */
2046 NULL, /* l2h_pae */
2047 #if CONFIG_PAGING_LEVELS >= 4
2048 SHADOW_INTERNAL_NAME(sh_remove_all_mappings,4,4), /* l1_64 */
2049 SHADOW_INTERNAL_NAME(sh_remove_all_mappings,4,4), /* fl1_64 */
2050 #else
2051 NULL, /* l1_64 */
2052 NULL, /* fl1_64 */
2053 #endif
2054 NULL, /* l2_64 */
2055 NULL, /* l3_64 */
2056 NULL, /* l4_64 */
2057 NULL, /* p2m */
2058 NULL /* unused */
2059 };
2061 static unsigned int callback_mask =
2062 1 << SH_type_l1_32_shadow
2063 | 1 << SH_type_fl1_32_shadow
2064 | 1 << SH_type_l1_pae_shadow
2065 | 1 << SH_type_fl1_pae_shadow
2066 | 1 << SH_type_l1_64_shadow
2067 | 1 << SH_type_fl1_64_shadow
2070 perfc_incrc(shadow_mappings);
2071 if ( (page->count_info & PGC_count_mask) == 0 )
2072 return 0;
2074 ASSERT(shadow_lock_is_acquired(v->domain));
2076 /* XXX TODO:
2077 * Heuristics for finding the (probably) single mapping of this gmfn */
2079 /* Brute-force search of all the shadows, by walking the hash */
2080 perfc_incrc(shadow_mappings_bf);
2081 hash_foreach(v, callback_mask, callbacks, gmfn);
2083 /* If that didn't catch the mapping, something is very wrong */
2084 expected_count = (page->count_info & PGC_allocated) ? 1 : 0;
2085 if ( (page->count_info & PGC_count_mask) != expected_count )
2087 /* Don't complain if we're in HVM and there's one extra mapping:
2088 * The qemu helper process has an untyped mapping of this dom's RAM */
2089 if ( !(shadow_mode_external(v->domain)
2090 && (page->count_info & PGC_count_mask) <= 2
2091 && (page->u.inuse.type_info & PGT_count_mask) == 0) )
2093 SHADOW_ERROR("can't find all mappings of mfn %lx: "
2094 "c=%08x t=%08lx\n", mfn_x(gmfn),
2095 page->count_info, page->u.inuse.type_info);
2099 /* We killed at least one mapping, so must flush TLBs. */
2100 return 1;
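/* Accounting note for the mapping-count check above: a page that still has
 * PGC_allocated set keeps one reference from the allocation itself, so
 * expected_count is 1 rather than 0; and in external (HVM) mode one extra
 * untyped reference from the device model is tolerated without complaint. */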
2104 /**************************************************************************/
2105 /* Remove all shadows of a guest frame from the shadow tables */
2107 static int sh_remove_shadow_via_pointer(struct vcpu *v, mfn_t smfn)
2108 /* Follow this shadow's up-pointer, if it has one, and remove the reference
2109 * found there. Returns 1 if that was the only reference to this shadow */
2111 struct shadow_page_info *sp = mfn_to_shadow_page(smfn);
2112 mfn_t pmfn;
2113 void *vaddr;
2114 int rc;
2116 ASSERT(sp->type > 0);
2117 ASSERT(sp->type < SH_type_max_shadow);
2118 ASSERT(sp->type != SH_type_l2_32_shadow);
2119 ASSERT(sp->type != SH_type_l2_pae_shadow);
2120 ASSERT(sp->type != SH_type_l2h_pae_shadow);
2121 ASSERT(sp->type != SH_type_l4_64_shadow);
2123 if (sp->up == 0) return 0;
2124 pmfn = _mfn(sp->up >> PAGE_SHIFT);
2125 ASSERT(mfn_valid(pmfn));
2126 vaddr = sh_map_domain_page(pmfn);
2127 ASSERT(vaddr);
2128 vaddr += sp->up & (PAGE_SIZE-1);
2129 ASSERT(l1e_get_pfn(*(l1_pgentry_t *)vaddr) == mfn_x(smfn));
2131 /* Is this the only reference to this shadow? */
2132 rc = (sp->count == 1) ? 1 : 0;
2134 /* Blank the offending entry */
2135 switch (sp->type)
2137 case SH_type_l1_32_shadow:
2138 case SH_type_l2_32_shadow:
2139 #if CONFIG_PAGING_LEVELS == 2
2140 SHADOW_INTERNAL_NAME(sh_clear_shadow_entry,2,2)(v, vaddr, pmfn);
2141 #else
2142 SHADOW_INTERNAL_NAME(sh_clear_shadow_entry,3,2)(v, vaddr, pmfn);
2143 #endif
2144 break;
2145 #if CONFIG_PAGING_LEVELS >=3
2146 case SH_type_l1_pae_shadow:
2147 case SH_type_l2_pae_shadow:
2148 case SH_type_l2h_pae_shadow:
2149 SHADOW_INTERNAL_NAME(sh_clear_shadow_entry,3,3)(v, vaddr, pmfn);
2150 break;
2151 #if CONFIG_PAGING_LEVELS >= 4
2152 case SH_type_l1_64_shadow:
2153 case SH_type_l2_64_shadow:
2154 case SH_type_l3_64_shadow:
2155 case SH_type_l4_64_shadow:
2156 SHADOW_INTERNAL_NAME(sh_clear_shadow_entry,4,4)(v, vaddr, pmfn);
2157 break;
2158 #endif
2159 #endif
2160 default: BUG(); /* Some weird unknown shadow type */
2163 sh_unmap_domain_page(vaddr);
2164 if ( rc )
2165 perfc_incrc(shadow_up_pointer);
2166 else
2167 perfc_incrc(shadow_unshadow_bf);
2169 return rc;
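/* The up-pointer decoded above packs two things into sp->up: the MFN of
 * the (single) higher-level shadow that references this shadow in the
 * bits above PAGE_SHIFT, and the byte offset of the referencing entry
 * within that page in the low bits -- hence the ">> PAGE_SHIFT" and
 * "& (PAGE_SIZE-1)" when locating the entry to blank.  The ASSERTs
 * exclude the top-level (pinnable) types, which are not expected to be
 * referenced from other shadows and so carry no meaningful up-pointer. */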
2172 void sh_remove_shadows(struct vcpu *v, mfn_t gmfn, int fast, int all)
2173 /* Remove the shadows of this guest page.
2174 * If fast != 0, just try the quick heuristic, which will remove
2175 * at most one reference to each shadow of the page. Otherwise, walk
2176 * all the shadow tables looking for refs to shadows of this gmfn.
2177 * If all != 0, kill the domain if we can't find all the shadows.
2178 * (all != 0 implies fast == 0)
2179 */
2181 struct page_info *pg;
2182 mfn_t smfn;
2183 u32 sh_flags;
2184 unsigned char t;
2186 /* Dispatch table for getting per-type functions: each level must
2187 * be called with the function to remove a lower-level shadow. */
2188 static hash_callback_t callbacks[16] = {
2189 NULL, /* none */
2190 NULL, /* l1_32 */
2191 NULL, /* fl1_32 */
2192 #if CONFIG_PAGING_LEVELS == 2
2193 SHADOW_INTERNAL_NAME(sh_remove_l1_shadow,2,2), /* l2_32 */
2194 #else
2195 SHADOW_INTERNAL_NAME(sh_remove_l1_shadow,3,2), /* l2_32 */
2196 #endif
2197 NULL, /* l1_pae */
2198 NULL, /* fl1_pae */
2199 #if CONFIG_PAGING_LEVELS >= 3
2200 SHADOW_INTERNAL_NAME(sh_remove_l1_shadow,3,3), /* l2_pae */
2201 SHADOW_INTERNAL_NAME(sh_remove_l1_shadow,3,3), /* l2h_pae */
2202 #else
2203 NULL, /* l2_pae */
2204 NULL, /* l2h_pae */
2205 #endif
2206 NULL, /* l1_64 */
2207 NULL, /* fl1_64 */
2208 #if CONFIG_PAGING_LEVELS >= 4
2209 SHADOW_INTERNAL_NAME(sh_remove_l1_shadow,4,4), /* l2_64 */
2210 SHADOW_INTERNAL_NAME(sh_remove_l2_shadow,4,4), /* l3_64 */
2211 SHADOW_INTERNAL_NAME(sh_remove_l3_shadow,4,4), /* l4_64 */
2212 #else
2213 NULL, /* l2_64 */
2214 NULL, /* l3_64 */
2215 NULL, /* l4_64 */
2216 #endif
2217 NULL, /* p2m */
2218 NULL /* unused */
2219 };
2221 /* Another lookup table, for choosing which mask to use */
2222 static unsigned int masks[16] = {
2223 0, /* none */
2224 1 << SH_type_l2_32_shadow, /* l1_32 */
2225 0, /* fl1_32 */
2226 0, /* l2_32 */
2227 ((1 << SH_type_l2h_pae_shadow)
2228 | (1 << SH_type_l2_pae_shadow)), /* l1_pae */
2229 0, /* fl1_pae */
2230 0, /* l2_pae */
2231 0, /* l2h_pae */
2232 1 << SH_type_l2_64_shadow, /* l1_64 */
2233 0, /* fl1_64 */
2234 1 << SH_type_l3_64_shadow, /* l2_64 */
2235 1 << SH_type_l4_64_shadow, /* l3_64 */
2236 0, /* l4_64 */
2237 0, /* p2m */
2238 0 /* unused */
2239 };
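/* masks[t] lists the shadow types that can contain a reference to a shadow
 * of type t (e.g. an l1_64 shadow can only be referenced from an l2_64
 * shadow), so after unpinning or following the up-pointer, DO_UNSHADOW
 * below only has to sweep those upper-level shadows with the matching
 * sh_remove_*_shadow callback. */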
2241 ASSERT(shadow_lock_is_acquired(v->domain));
2242 ASSERT(!(all && fast));
2244 pg = mfn_to_page(gmfn);
2246 /* Bail out now if the page is not shadowed */
2247 if ( (pg->count_info & PGC_page_table) == 0 )
2248 return;
2250 SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx\n",
2251 v->domain->domain_id, v->vcpu_id, mfn_x(gmfn));
2253 /* Search for this shadow in all appropriate shadows */
2254 perfc_incrc(shadow_unshadow);
2255 sh_flags = pg->shadow_flags;
2257 /* Lower-level shadows need to be excised from upper-level shadows.
2258 * This call to hash_foreach() looks dangerous but is in fact OK: each
2259 * call will remove at most one shadow, and terminate immediately when
2260 * it does remove it, so we never walk the hash after doing a deletion. */
2261 #define DO_UNSHADOW(_type) do { \
2262 t = (_type); \
2263 smfn = shadow_hash_lookup(v, mfn_x(gmfn), t); \
2264 if ( sh_type_is_pinnable(v, t) ) \
2265 sh_unpin(v, smfn); \
2266 else \
2267 sh_remove_shadow_via_pointer(v, smfn); \
2268 if ( (pg->count_info & PGC_page_table) && !fast ) \
2269 hash_foreach(v, masks[t], callbacks, smfn); \
2270 } while (0)
2272 if ( sh_flags & SHF_L1_32 ) DO_UNSHADOW(SH_type_l1_32_shadow);
2273 if ( sh_flags & SHF_L2_32 ) DO_UNSHADOW(SH_type_l2_32_shadow);
2274 #if CONFIG_PAGING_LEVELS >= 3
2275 if ( sh_flags & SHF_L1_PAE ) DO_UNSHADOW(SH_type_l1_pae_shadow);
2276 if ( sh_flags & SHF_L2_PAE ) DO_UNSHADOW(SH_type_l2_pae_shadow);
2277 if ( sh_flags & SHF_L2H_PAE ) DO_UNSHADOW(SH_type_l2h_pae_shadow);
2278 #if CONFIG_PAGING_LEVELS >= 4
2279 if ( sh_flags & SHF_L1_64 ) DO_UNSHADOW(SH_type_l1_64_shadow);
2280 if ( sh_flags & SHF_L2_64 ) DO_UNSHADOW(SH_type_l2_64_shadow);
2281 if ( sh_flags & SHF_L3_64 ) DO_UNSHADOW(SH_type_l3_64_shadow);
2282 if ( sh_flags & SHF_L4_64 ) DO_UNSHADOW(SH_type_l4_64_shadow);
2283 #endif
2284 #endif
2286 #undef DO_UNSHADOW
2288 /* If that didn't catch the shadows, something is wrong */
2289 if ( !fast && (pg->count_info & PGC_page_table) )
2291 SHADOW_ERROR("can't find all shadows of mfn %05lx "
2292 "(shadow_flags=%08lx)\n",
2293 mfn_x(gmfn), pg->shadow_flags);
2294 if ( all )
2295 domain_crash(v->domain);
2298 /* Need to flush TLBs now, so that linear maps are safe next time we
2299 * take a fault. */
2300 flush_tlb_mask(v->domain->domain_dirty_cpumask);
2303 void
2304 shadow_remove_all_shadows_and_parents(struct vcpu *v, mfn_t gmfn)
2305 /* Even harsher: this is an HVM page that we think is no longer a pagetable.
2306 * Unshadow it, and recursively unshadow pages that reference it. */
2308 shadow_remove_all_shadows(v, gmfn);
2309 /* XXX TODO:
2310 * Rework this hashtable walker to return a linked-list of all
2311 * the shadows it modified, then do breadth-first recursion
2312 * to find the way up to higher-level tables and unshadow them too.
2314 * The current code (just tearing down each page's shadows as we
2315 * detect that it is not a pagetable) is correct, but very slow.
2316 * It means extra emulated writes and slows down removal of mappings. */
2319 /**************************************************************************/
2321 void sh_update_paging_modes(struct vcpu *v)
2323 struct domain *d = v->domain;
2324 struct shadow_paging_mode *old_mode = v->arch.shadow.mode;
2325 mfn_t old_guest_table;
2327 ASSERT(shadow_lock_is_acquired(d));
2329 // Valid transitions handled by this function:
2330 // - For PV guests:
2331 // - after a shadow mode has been changed
2332 // - For HVM guests:
2333 // - after a shadow mode has been changed
2334 // - changes in CR0.PG, CR4.PAE, CR4.PSE, or CR4.PGE
2335 //
2337 // First, tear down any old shadow tables held by this vcpu.
2338 //
2339 shadow_detach_old_tables(v);
2341 if ( !is_hvm_domain(d) )
2343 ///
2344 /// PV guest
2345 ///
2346 #if CONFIG_PAGING_LEVELS == 4
2347 if ( pv_32bit_guest(v) )
2348 v->arch.shadow.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,4,3);
2349 else
2350 v->arch.shadow.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,4,4);
2351 #elif CONFIG_PAGING_LEVELS == 3
2352 v->arch.shadow.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,3,3);
2353 #elif CONFIG_PAGING_LEVELS == 2
2354 v->arch.shadow.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,2,2);
2355 #else
2356 #error unexpected paging mode
2357 #endif
2358 v->arch.shadow.translate_enabled = !!shadow_mode_translate(d);
2360 else
2362 ///
2363 /// HVM guest
2364 ///
2365 ASSERT(shadow_mode_translate(d));
2366 ASSERT(shadow_mode_external(d));
2368 v->arch.shadow.translate_enabled = !!hvm_paging_enabled(v);
2369 if ( !v->arch.shadow.translate_enabled )
2371 /* Set v->arch.guest_table to use the p2m map, and choose
2372 * the appropriate shadow mode */
2373 old_guest_table = pagetable_get_mfn(v->arch.guest_table);
2374 #if CONFIG_PAGING_LEVELS == 2
2375 v->arch.guest_table =
2376 pagetable_from_pfn(pagetable_get_pfn(d->arch.phys_table));
2377 v->arch.shadow.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,2,2);
2378 #elif CONFIG_PAGING_LEVELS == 3
2379 v->arch.guest_table =
2380 pagetable_from_pfn(pagetable_get_pfn(d->arch.phys_table));
2381 v->arch.shadow.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,3,3);
2382 #else /* CONFIG_PAGING_LEVELS == 4 */
2384 l4_pgentry_t *l4e;
2385 /* Use the start of the first l3 table as a PAE l3 */
2386 ASSERT(pagetable_get_pfn(d->arch.phys_table) != 0);
2387 l4e = sh_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
2388 ASSERT(l4e_get_flags(l4e[0]) & _PAGE_PRESENT);
2389 v->arch.guest_table =
2390 pagetable_from_pfn(l4e_get_pfn(l4e[0]));
2391 sh_unmap_domain_page(l4e);
2393 v->arch.shadow.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,3,3);
2394 #endif
2395 /* Fix up refcounts on guest_table */
2396 get_page(mfn_to_page(pagetable_get_mfn(v->arch.guest_table)), d);
2397 if ( mfn_x(old_guest_table) != 0 )
2398 put_page(mfn_to_page(old_guest_table));
2400 else
2402 #ifdef __x86_64__
2403 if ( hvm_long_mode_enabled(v) )
2405 // long mode guest...
2406 v->arch.shadow.mode =
2407 &SHADOW_INTERNAL_NAME(sh_paging_mode, 4, 4);
2409 else
2410 #endif
2411 if ( hvm_pae_enabled(v) )
2413 #if CONFIG_PAGING_LEVELS >= 3
2414 // 32-bit PAE mode guest...
2415 v->arch.shadow.mode =
2416 &SHADOW_INTERNAL_NAME(sh_paging_mode, 3, 3);
2417 #else
2418 SHADOW_ERROR("PAE not supported in 32-bit Xen\n");
2419 domain_crash(d);
2420 return;
2421 #endif
2423 else
2425 // 32-bit 2 level guest...
2426 #if CONFIG_PAGING_LEVELS >= 3
2427 v->arch.shadow.mode =
2428 &SHADOW_INTERNAL_NAME(sh_paging_mode, 3, 2);
2429 #else
2430 v->arch.shadow.mode =
2431 &SHADOW_INTERNAL_NAME(sh_paging_mode, 2, 2);
2432 #endif
2436 if ( pagetable_is_null(v->arch.monitor_table) )
2438 mfn_t mmfn = shadow_make_monitor_table(v);
2439 v->arch.monitor_table = pagetable_from_mfn(mmfn);
2442 if ( v->arch.shadow.mode != old_mode )
2444 SHADOW_PRINTK("new paging mode: d=%u v=%u pe=%d g=%u s=%u "
2445 "(was g=%u s=%u)\n",
2446 d->domain_id, v->vcpu_id,
2447 is_hvm_domain(d) ? !!hvm_paging_enabled(v) : 1,
2448 v->arch.shadow.mode->guest_levels,
2449 v->arch.shadow.mode->shadow_levels,
2450 old_mode ? old_mode->guest_levels : 0,
2451 old_mode ? old_mode->shadow_levels : 0);
2452 if ( old_mode &&
2453 (v->arch.shadow.mode->shadow_levels !=
2454 old_mode->shadow_levels) )
2456 /* Need to make a new monitor table for the new mode */
2457 mfn_t new_mfn, old_mfn;
2459 if ( v != current )
2461 SHADOW_ERROR("Some third party (d=%u v=%u) is changing "
2462 "this HVM vcpu's (d=%u v=%u) paging mode!\n",
2463 current->domain->domain_id, current->vcpu_id,
2464 v->domain->domain_id, v->vcpu_id);
2465 domain_crash(v->domain);
2466 return;
2469 old_mfn = pagetable_get_mfn(v->arch.monitor_table);
2470 v->arch.monitor_table = pagetable_null();
2471 new_mfn = v->arch.shadow.mode->make_monitor_table(v);
2472 v->arch.monitor_table = pagetable_from_mfn(new_mfn);
2473 SHADOW_PRINTK("new monitor table %"SH_PRI_mfn "\n",
2474 mfn_x(new_mfn));
2476 /* Don't be running on the old monitor table when we
2477 * pull it down! Switch CR3, and warn the HVM code that
2478 * its host cr3 has changed. */
2479 make_cr3(v, mfn_x(new_mfn));
2480 write_ptbase(v);
2481 hvm_update_host_cr3(v);
2482 old_mode->destroy_monitor_table(v, old_mfn);
2486 // XXX -- Need to deal with changes in CR4.PSE and CR4.PGE.
2487 // These are HARD: think about the case where two CPU's have
2488 // different values for CR4.PSE and CR4.PGE at the same time.
2489 // This *does* happen, at least for CR4.PGE...
2492 v->arch.shadow.mode->update_cr3(v);
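/* Summary of the mode selection above: SHADOW_INTERNAL_NAME(sh_paging_mode,
 * S, G) is the mode structure for S shadow levels over G guest levels.
 * PV guests get a fixed choice per build (4,4 or 4,3 on 64-bit Xen, 3,3 on
 * PAE, 2,2 on 2-level); HVM guests are switched between 4,4 / 3,3 / 3,2
 * (or 2,2) as they toggle paging, PAE and long mode.  When the number of
 * shadow levels changes, the monitor table has to be rebuilt and CR3
 * switched to the new one before the old table is torn down. */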
2495 /**************************************************************************/
2496 /* Turning on and off shadow features */
2498 static void sh_new_mode(struct domain *d, u32 new_mode)
2499 /* Inform all the vcpus that the shadow mode has been changed */
2501 struct vcpu *v;
2503 ASSERT(shadow_lock_is_acquired(d));
2504 ASSERT(d != current->domain);
2505 d->arch.shadow.mode = new_mode;
2506 if ( new_mode & SHM2_translate )
2507 shadow_audit_p2m(d);
2508 for_each_vcpu(d, v)
2509 sh_update_paging_modes(v);
2512 int shadow_enable(struct domain *d, u32 mode)
2513 /* Turn on "permanent" shadow features: external, translate, refcount.
2514 * Can only be called once on a domain, and these features cannot be
2515 * disabled.
2516 * Returns 0 for success, -errno for failure. */
2518 unsigned int old_pages;
2519 int rv = 0;
2521 mode |= SHM2_enable;
2523 domain_pause(d);
2524 shadow_lock(d);
2526 /* Sanity check the arguments */
2527 if ( (d == current->domain) ||
2528 shadow_mode_enabled(d) ||
2529 ((mode & SHM2_translate) && !(mode & SHM2_refcounts)) ||
2530 ((mode & SHM2_external) && !(mode & SHM2_translate)) )
2532 rv = -EINVAL;
2533 goto out;
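/* The sanity check above encodes the dependency chain between the mode
 * bits: external mode requires translate, and translate requires
 * refcounts, while SHM2_enable is always or'ed in.  Callers therefore pass
 * e.g. SHM2_refcounts|SHM2_translate, as shadow_domctl does for
 * ENABLE_TRANSLATE below. */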
2536 // XXX -- eventually would like to require that all memory be allocated
2537 // *after* shadow_enable() is called... So here, we would test to make
2538 // sure that d->page_list is empty.
2539 #if 0
2540 spin_lock(&d->page_alloc_lock);
2541 if ( !list_empty(&d->page_list) )
2543 spin_unlock(&d->page_alloc_lock);
2544 rv = -EINVAL;
2545 goto out;
2547 spin_unlock(&d->page_alloc_lock);
2548 #endif
2550 /* Init the shadow memory allocation if the user hasn't done so */
2551 old_pages = d->arch.shadow.total_pages;
2552 if ( old_pages == 0 )
2553 if ( set_sh_allocation(d, 256, NULL) != 0 ) /* Use at least 1MB */
2555 set_sh_allocation(d, 0, NULL);
2556 rv = -ENOMEM;
2557 goto out;
2560 /* Init the hash table */
2561 if ( shadow_hash_alloc(d) != 0 )
2563 set_sh_allocation(d, old_pages, NULL);
2564 rv = -ENOMEM;
2565 goto out;
2568 /* Init the P2M table */
2569 if ( mode & SHM2_translate )
2570 if ( !shadow_alloc_p2m_table(d) )
2572 shadow_hash_teardown(d);
2573 set_sh_allocation(d, old_pages, NULL);
2574 shadow_p2m_teardown(d);
2575 rv = -ENOMEM;
2576 goto out;
2579 #if (SHADOW_OPTIMIZATIONS & SHOPT_LINUX_L3_TOPLEVEL)
2580 /* We assume we're dealing with an older 64bit linux guest until we
2581 * see the guest use more than one l4 per vcpu. */
2582 d->arch.shadow.opt_flags = SHOPT_LINUX_L3_TOPLEVEL;
2583 #endif
2585 /* Update the bits */
2586 sh_new_mode(d, mode);
2587 shadow_audit_p2m(d);
2588 out:
2589 shadow_unlock(d);
2590 domain_unpause(d);
2591 return rv;
2594 void shadow_teardown(struct domain *d)
2595 /* Destroy the shadow pagetables of this domain and free its shadow memory.
2596 * Should only be called for dying domains. */
2598 struct vcpu *v;
2599 mfn_t mfn;
2601 ASSERT(test_bit(_DOMF_dying, &d->domain_flags));
2602 ASSERT(d != current->domain);
2604 if ( !shadow_lock_is_acquired(d) )
2605 shadow_lock(d); /* Keep various asserts happy */
2607 if ( shadow_mode_enabled(d) )
2609 /* Release the shadow and monitor tables held by each vcpu */
2610 for_each_vcpu(d, v)
2612 shadow_detach_old_tables(v);
2613 if ( shadow_mode_external(d) )
2615 mfn = pagetable_get_mfn(v->arch.monitor_table);
2616 if ( mfn_valid(mfn) && (mfn_x(mfn) != 0) )
2617 shadow_destroy_monitor_table(v, mfn);
2618 v->arch.monitor_table = pagetable_null();
2623 if ( d->arch.shadow.total_pages != 0 )
2625 SHADOW_PRINTK("teardown of domain %u starts."
2626 " Shadow pages total = %u, free = %u, p2m=%u\n",
2627 d->domain_id,
2628 d->arch.shadow.total_pages,
2629 d->arch.shadow.free_pages,
2630 d->arch.shadow.p2m_pages);
2631 /* Destroy all the shadows and release memory to domheap */
2632 set_sh_allocation(d, 0, NULL);
2633 /* Release the hash table back to xenheap */
2634 if (d->arch.shadow.hash_table)
2635 shadow_hash_teardown(d);
2636 /* Release the log-dirty bitmap of dirtied pages */
2637 sh_free_log_dirty_bitmap(d);
2638 /* Should not have any more memory held */
2639 SHADOW_PRINTK("teardown done."
2640 " Shadow pages total = %u, free = %u, p2m=%u\n",
2641 d->arch.shadow.total_pages,
2642 d->arch.shadow.free_pages,
2643 d->arch.shadow.p2m_pages);
2644 ASSERT(d->arch.shadow.total_pages == 0);
2647 /* We leave the "permanent" shadow modes enabled, but clear the
2648 * log-dirty mode bit. We don't want any more mark_dirty()
2649 * calls now that we've torn down the bitmap */
2650 d->arch.shadow.mode &= ~SHM2_log_dirty;
2652 shadow_unlock(d);
2655 void shadow_final_teardown(struct domain *d)
2656 /* Called by arch_domain_destroy(), when it's safe to pull down the p2m map. */
2659 SHADOW_PRINTK("dom %u final teardown starts."
2660 " Shadow pages total = %u, free = %u, p2m=%u\n",
2661 d->domain_id,
2662 d->arch.shadow.total_pages,
2663 d->arch.shadow.free_pages,
2664 d->arch.shadow.p2m_pages);
2666 /* Double-check that the domain didn't have any shadow memory.
2667 * It is possible for a domain that never got domain_kill()ed
2668 * to get here with its shadow allocation intact. */
2669 if ( d->arch.shadow.total_pages != 0 )
2670 shadow_teardown(d);
2672 /* It is now safe to pull down the p2m map. */
2673 if ( d->arch.shadow.p2m_pages != 0 )
2674 shadow_p2m_teardown(d);
2676 SHADOW_PRINTK("dom %u final teardown done."
2677 " Shadow pages total = %u, free = %u, p2m=%u\n",
2678 d->domain_id,
2679 d->arch.shadow.total_pages,
2680 d->arch.shadow.free_pages,
2681 d->arch.shadow.p2m_pages);
2684 static int shadow_one_bit_enable(struct domain *d, u32 mode)
2685 /* Turn on a single shadow mode feature */
2687 ASSERT(shadow_lock_is_acquired(d));
2689 /* Sanity check the call */
2690 if ( d == current->domain || (d->arch.shadow.mode & mode) )
2692 return -EINVAL;
2695 if ( d->arch.shadow.mode == 0 )
2697 /* Init the shadow memory allocation and the hash table */
2698 if ( set_sh_allocation(d, 1, NULL) != 0
2699 || shadow_hash_alloc(d) != 0 )
2701 set_sh_allocation(d, 0, NULL);
2702 return -ENOMEM;
2706 /* Update the bits */
2707 sh_new_mode(d, d->arch.shadow.mode | mode);
2709 return 0;
2712 static int shadow_one_bit_disable(struct domain *d, u32 mode)
2713 /* Turn off a single shadow mode feature */
2715 struct vcpu *v;
2716 ASSERT(shadow_lock_is_acquired(d));
2718 /* Sanity check the call */
2719 if ( d == current->domain || !(d->arch.shadow.mode & mode) )
2721 return -EINVAL;
2724 /* Update the bits */
2725 sh_new_mode(d, d->arch.shadow.mode & ~mode);
2726 if ( d->arch.shadow.mode == 0 )
2728 /* Get this domain off shadows */
2729 SHADOW_PRINTK("un-shadowing of domain %u starts."
2730 " Shadow pages total = %u, free = %u, p2m=%u\n",
2731 d->domain_id,
2732 d->arch.shadow.total_pages,
2733 d->arch.shadow.free_pages,
2734 d->arch.shadow.p2m_pages);
2735 for_each_vcpu(d, v)
2737 shadow_detach_old_tables(v);
2738 #if CONFIG_PAGING_LEVELS == 4
2739 if ( !(v->arch.flags & TF_kernel_mode) )
2740 make_cr3(v, pagetable_get_pfn(v->arch.guest_table_user));
2741 else
2742 #endif
2743 make_cr3(v, pagetable_get_pfn(v->arch.guest_table));
2747 /* Pull down the memory allocation */
2748 if ( set_sh_allocation(d, 0, NULL) != 0 )
2750 // XXX - How can this occur?
2751 // Seems like a bug to return an error now that we've
2752 // disabled the relevant shadow mode.
2753 //
2754 return -ENOMEM;
2756 shadow_hash_teardown(d);
2757 SHADOW_PRINTK("un-shadowing of domain %u done."
2758 " Shadow pages total = %u, free = %u, p2m=%u\n",
2759 d->domain_id,
2760 d->arch.shadow.total_pages,
2761 d->arch.shadow.free_pages,
2762 d->arch.shadow.p2m_pages);
2765 return 0;
2768 /* Enable/disable ops for the "test" and "log-dirty" modes */
2769 int shadow_test_enable(struct domain *d)
2771 int ret;
2773 domain_pause(d);
2774 shadow_lock(d);
2776 if ( shadow_mode_enabled(d) )
2778 SHADOW_ERROR("Don't support enabling test mode"
2779 " on already shadowed doms\n");
2780 ret = -EINVAL;
2781 goto out;
2784 ret = shadow_one_bit_enable(d, SHM2_enable);
2785 out:
2786 shadow_unlock(d);
2787 domain_unpause(d);
2789 return ret;
2792 int shadow_test_disable(struct domain *d)
2794 int ret;
2796 domain_pause(d);
2797 shadow_lock(d);
2798 ret = shadow_one_bit_disable(d, SHM2_enable);
2799 shadow_unlock(d);
2800 domain_unpause(d);
2802 return ret;
2805 static int
2806 sh_alloc_log_dirty_bitmap(struct domain *d)
2808 ASSERT(d->arch.shadow.dirty_bitmap == NULL);
2809 d->arch.shadow.dirty_bitmap_size =
2810 (d->shared_info->arch.max_pfn + (BITS_PER_LONG - 1)) &
2811 ~(BITS_PER_LONG - 1);
2812 d->arch.shadow.dirty_bitmap =
2813 xmalloc_array(unsigned long,
2814 d->arch.shadow.dirty_bitmap_size / BITS_PER_LONG);
2815 if ( d->arch.shadow.dirty_bitmap == NULL )
2817 d->arch.shadow.dirty_bitmap_size = 0;
2818 return -ENOMEM;
2820 memset(d->arch.shadow.dirty_bitmap, 0, d->arch.shadow.dirty_bitmap_size/8);
2822 return 0;
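/* Size arithmetic above, for reference: max_pfn is rounded up to a whole
 * number of longs' worth of bits, the array holds
 * dirty_bitmap_size / BITS_PER_LONG longs, and the memset clears
 * dirty_bitmap_size / 8 bytes -- the same amount.  E.g. with
 * BITS_PER_LONG == 64 and max_pfn == 100, the bitmap is 128 bits:
 * two longs, 16 bytes. */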
2825 static void
2826 sh_free_log_dirty_bitmap(struct domain *d)
2828 d->arch.shadow.dirty_bitmap_size = 0;
2829 if ( d->arch.shadow.dirty_bitmap )
2831 xfree(d->arch.shadow.dirty_bitmap);
2832 d->arch.shadow.dirty_bitmap = NULL;
2836 static int shadow_log_dirty_enable(struct domain *d)
2838 int ret;
2840 domain_pause(d);
2841 shadow_lock(d);
2843 if ( shadow_mode_log_dirty(d) )
2845 ret = -EINVAL;
2846 goto out;
2849 if ( shadow_mode_enabled(d) )
2851 SHADOW_ERROR("Don't (yet) support enabling log-dirty"
2852 " on already shadowed doms\n");
2853 ret = -EINVAL;
2854 goto out;
2857 ret = sh_alloc_log_dirty_bitmap(d);
2858 if ( ret != 0 )
2860 sh_free_log_dirty_bitmap(d);
2861 goto out;
2864 ret = shadow_one_bit_enable(d, SHM2_log_dirty);
2865 if ( ret != 0 )
2866 sh_free_log_dirty_bitmap(d);
2868 out:
2869 shadow_unlock(d);
2870 domain_unpause(d);
2871 return ret;
2874 static int shadow_log_dirty_disable(struct domain *d)
2876 int ret;
2878 domain_pause(d);
2879 shadow_lock(d);
2880 ret = shadow_one_bit_disable(d, SHM2_log_dirty);
2881 if ( !shadow_mode_log_dirty(d) )
2882 sh_free_log_dirty_bitmap(d);
2883 shadow_unlock(d);
2884 domain_unpause(d);
2886 return ret;
2889 /**************************************************************************/
2890 /* P2M map manipulations */
2892 static void
2893 sh_p2m_remove_page(struct domain *d, unsigned long gfn, unsigned long mfn)
2895 struct vcpu *v;
2897 if ( !shadow_mode_translate(d) )
2898 return;
2900 v = current;
2901 if ( v->domain != d )
2902 v = d->vcpu[0];
2904 SHADOW_DEBUG(P2M, "removing gfn=%#lx mfn=%#lx\n", gfn, mfn);
2906 ASSERT(mfn_x(sh_gfn_to_mfn(d, gfn)) == mfn);
2907 //ASSERT(sh_mfn_to_gfn(d, mfn) == gfn);
2909 if ( v != NULL )
2911 shadow_remove_all_shadows_and_parents(v, _mfn(mfn));
2912 if ( shadow_remove_all_mappings(v, _mfn(mfn)) )
2913 flush_tlb_mask(d->domain_dirty_cpumask);
2916 shadow_set_p2m_entry(d, gfn, _mfn(INVALID_MFN));
2917 set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY);
2920 void
2921 shadow_guest_physmap_remove_page(struct domain *d, unsigned long gfn,
2922 unsigned long mfn)
2924 shadow_lock(d);
2925 shadow_audit_p2m(d);
2926 sh_p2m_remove_page(d, gfn, mfn);
2927 shadow_audit_p2m(d);
2928 shadow_unlock(d);
2931 void
2932 shadow_guest_physmap_add_page(struct domain *d, unsigned long gfn,
2933 unsigned long mfn)
2935 unsigned long ogfn;
2936 mfn_t omfn;
2938 if ( !shadow_mode_translate(d) )
2939 return;
2941 shadow_lock(d);
2942 shadow_audit_p2m(d);
2944 SHADOW_DEBUG(P2M, "adding gfn=%#lx mfn=%#lx\n", gfn, mfn);
2946 omfn = sh_gfn_to_mfn(d, gfn);
2947 if ( mfn_valid(omfn) )
2949 /* Get rid of the old mapping, especially any shadows */
2950 struct vcpu *v = current;
2951 if ( v->domain != d )
2952 v = d->vcpu[0];
2953 if ( v != NULL )
2955 shadow_remove_all_shadows_and_parents(v, omfn);
2956 if ( shadow_remove_all_mappings(v, omfn) )
2957 flush_tlb_mask(d->domain_dirty_cpumask);
2959 set_gpfn_from_mfn(mfn_x(omfn), INVALID_M2P_ENTRY);
2962 ogfn = sh_mfn_to_gfn(d, _mfn(mfn));
2963 if (
2964 #ifdef __x86_64__
2965 (ogfn != 0x5555555555555555L)
2966 #else
2967 (ogfn != 0x55555555L)
2968 #endif
2969 && (ogfn != INVALID_M2P_ENTRY)
2970 && (ogfn != gfn) )
2972 /* This machine frame is already mapped at another physical address */
2973 SHADOW_DEBUG(P2M, "aliased! mfn=%#lx, old gfn=%#lx, new gfn=%#lx\n",
2974 mfn, ogfn, gfn);
2975 if ( mfn_valid(omfn = sh_gfn_to_mfn(d, ogfn)) )
2977 SHADOW_DEBUG(P2M, "old gfn=%#lx -> mfn %#lx\n",
2978 ogfn , mfn_x(omfn));
2979 if ( mfn_x(omfn) == mfn )
2980 sh_p2m_remove_page(d, ogfn, mfn);
2984 shadow_set_p2m_entry(d, gfn, _mfn(mfn));
2985 set_gpfn_from_mfn(mfn, gfn);
2986 shadow_audit_p2m(d);
2987 shadow_unlock(d);
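/* The 0x55555555 / 0x5555555555555555 values tested above appear to be the
 * fill pattern the machine-to-phys table holds before a frame has been
 * given a real gfn; the p2m audit below treats gfn == 0x55555555 the same
 * way, as a "debug" orphan rather than a genuine alias. */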
2990 /**************************************************************************/
2991 /* Log-dirty mode support */
2993 /* Convert a shadow to log-dirty mode. */
2994 void shadow_convert_to_log_dirty(struct vcpu *v, mfn_t smfn)
2996 BUG();
3000 /* Read a domain's log-dirty bitmap and stats.
3001 * If the operation is a CLEAN, clear the bitmap and stats as well. */
3002 static int shadow_log_dirty_op(
3003 struct domain *d, struct xen_domctl_shadow_op *sc)
3005 int i, rv = 0, clean = 0;
3007 domain_pause(d);
3008 shadow_lock(d);
3010 clean = (sc->op == XEN_DOMCTL_SHADOW_OP_CLEAN);
3012 SHADOW_DEBUG(LOGDIRTY, "log-dirty %s: dom %u faults=%u dirty=%u\n",
3013 (clean) ? "clean" : "peek",
3014 d->domain_id,
3015 d->arch.shadow.fault_count,
3016 d->arch.shadow.dirty_count);
3018 sc->stats.fault_count = d->arch.shadow.fault_count;
3019 sc->stats.dirty_count = d->arch.shadow.dirty_count;
3021 if ( clean )
3023 /* Need to revoke write access to the domain's pages again.
3024 * In future, we'll have a less heavy-handed approach to this,
3025 * but for now, we just unshadow everything except Xen. */
3026 shadow_blow_tables(d);
3028 d->arch.shadow.fault_count = 0;
3029 d->arch.shadow.dirty_count = 0;
3032 if ( guest_handle_is_null(sc->dirty_bitmap) ||
3033 (d->arch.shadow.dirty_bitmap == NULL) )
3035 rv = -EINVAL;
3036 goto out;
3039 if ( sc->pages > d->arch.shadow.dirty_bitmap_size )
3040 sc->pages = d->arch.shadow.dirty_bitmap_size;
3042 #define CHUNK (8*1024) /* Transfer and clear in 1kB chunks for L1 cache. */
3043 for ( i = 0; i < sc->pages; i += CHUNK )
3045 int bytes = ((((sc->pages - i) > CHUNK)
3046 ? CHUNK
3047 : (sc->pages - i)) + 7) / 8;
3049 if ( copy_to_guest_offset(
3050 sc->dirty_bitmap,
3051 i/(8*sizeof(unsigned long)),
3052 d->arch.shadow.dirty_bitmap + (i/(8*sizeof(unsigned long))),
3053 (bytes + sizeof(unsigned long) - 1) / sizeof(unsigned long)) )
3055 rv = -EINVAL;
3056 goto out;
3059 if ( clean )
3060 memset(d->arch.shadow.dirty_bitmap + (i/(8*sizeof(unsigned long))),
3061 0, bytes);
3063 #undef CHUNK
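/* The loop above moves the bitmap out in CHUNK-bit (8*1024 bits == 1kB)
 * chunks; "bytes" is the remaining portion rounded up to whole bytes.
 * copy_to_guest_offset() counts in array elements (unsigned long here),
 * so the offset is i/(8*sizeof(unsigned long)) and the length is "bytes"
 * rounded up to a whole number of longs.  When cleaning, each chunk is
 * zeroed immediately after it has been copied out. */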
3065 out:
3066 shadow_unlock(d);
3067 domain_unpause(d);
3068 return rv;
3072 /* Mark a page as dirty */
3073 void sh_do_mark_dirty(struct domain *d, mfn_t gmfn)
3075 unsigned long pfn;
3077 ASSERT(shadow_lock_is_acquired(d));
3078 ASSERT(shadow_mode_log_dirty(d));
3080 if ( !mfn_valid(gmfn) )
3081 return;
3083 ASSERT(d->arch.shadow.dirty_bitmap != NULL);
3085 /* We /really/ mean PFN here, even for non-translated guests. */
3086 pfn = get_gpfn_from_mfn(mfn_x(gmfn));
3088 /*
3089 * Values with the MSB set denote MFNs that aren't really part of the
3090 * domain's pseudo-physical memory map (e.g., the shared info frame).
3091 * Nothing to do here...
3092 */
3093 if ( unlikely(!VALID_M2P(pfn)) )
3094 return;
3096 /* N.B. Can use non-atomic TAS because protected by shadow_lock. */
3097 if ( likely(pfn < d->arch.shadow.dirty_bitmap_size) )
3099 if ( !__test_and_set_bit(pfn, d->arch.shadow.dirty_bitmap) )
3101 SHADOW_DEBUG(LOGDIRTY,
3102 "marked mfn %" SH_PRI_mfn " (pfn=%lx), dom %d\n",
3103 mfn_x(gmfn), pfn, d->domain_id);
3104 d->arch.shadow.dirty_count++;
3107 else
3109 SHADOW_PRINTK("mark_dirty OOR! "
3110 "mfn=%" SH_PRI_mfn " pfn=%lx max=%x (dom %d)\n"
3111 "owner=%d c=%08x t=%" PRtype_info "\n",
3112 mfn_x(gmfn),
3113 pfn,
3114 d->arch.shadow.dirty_bitmap_size,
3115 d->domain_id,
3116 (page_get_owner(mfn_to_page(gmfn))
3117 ? page_get_owner(mfn_to_page(gmfn))->domain_id
3118 : -1),
3119 mfn_to_page(gmfn)->count_info,
3120 mfn_to_page(gmfn)->u.inuse.type_info);
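/* Notes on the fast path above: the non-atomic __test_and_set_bit() is
 * safe because the shadow lock serialises all writers, and dirty_count is
 * only bumped when the bit actually changes, so it counts distinct pages
 * dirtied since the last CLEAN rather than individual faults. */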
3125 /**************************************************************************/
3126 /* Shadow-control XEN_DOMCTL dispatcher */
3128 int shadow_domctl(struct domain *d,
3129 xen_domctl_shadow_op_t *sc,
3130 XEN_GUEST_HANDLE(xen_domctl_t) u_domctl)
3132 int rc, preempted = 0;
3134 if ( unlikely(d == current->domain) )
3136 gdprintk(XENLOG_INFO, "Don't try to do a shadow op on yourself!\n");
3137 return -EINVAL;
3140 switch ( sc->op )
3142 case XEN_DOMCTL_SHADOW_OP_OFF:
3143 if ( shadow_mode_log_dirty(d) )
3144 if ( (rc = shadow_log_dirty_disable(d)) != 0 )
3145 return rc;
3146 if ( is_hvm_domain(d) )
3147 return -EINVAL;
3148 if ( d->arch.shadow.mode & SHM2_enable )
3149 if ( (rc = shadow_test_disable(d)) != 0 )
3150 return rc;
3151 return 0;
3153 case XEN_DOMCTL_SHADOW_OP_ENABLE_TEST:
3154 return shadow_test_enable(d);
3156 case XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY:
3157 return shadow_log_dirty_enable(d);
3159 case XEN_DOMCTL_SHADOW_OP_ENABLE_TRANSLATE:
3160 return shadow_enable(d, SHM2_refcounts|SHM2_translate);
3162 case XEN_DOMCTL_SHADOW_OP_CLEAN:
3163 case XEN_DOMCTL_SHADOW_OP_PEEK:
3164 return shadow_log_dirty_op(d, sc);
3166 case XEN_DOMCTL_SHADOW_OP_ENABLE:
3167 if ( sc->mode & XEN_DOMCTL_SHADOW_ENABLE_LOG_DIRTY )
3168 return shadow_log_dirty_enable(d);
3169 return shadow_enable(d, sc->mode << SHM2_shift);
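/* The shift above relies on the public XEN_DOMCTL_SHADOW_ENABLE_* flags
 * being laid out so that "<< SHM2_shift" lines them up with the internal
 * SHM2_* mode bits; LOG_DIRTY is intercepted first because it also needs
 * the dirty bitmap set up by shadow_log_dirty_enable(). */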
3171 case XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION:
3172 sc->mb = shadow_get_allocation(d);
3173 return 0;
3175 case XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION:
3176 rc = shadow_set_allocation(d, sc->mb, &preempted);
3177 if ( preempted )
3178 /* Not finished. Set up to re-run the call. */
3179 rc = hypercall_create_continuation(
3180 __HYPERVISOR_domctl, "h", u_domctl);
3181 else
3182 /* Finished. Return the new allocation */
3183 sc->mb = shadow_get_allocation(d);
3184 return rc;
3186 default:
3187 SHADOW_ERROR("Bad shadow op %u\n", sc->op);
3188 return -EINVAL;
3193 /**************************************************************************/
3194 /* Auditing shadow tables */
3196 #if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_FULL
3198 void shadow_audit_tables(struct vcpu *v)
3200 /* Dispatch table for getting per-type functions */
3201 static hash_callback_t callbacks[16] = {
3202 NULL, /* none */
3203 #if CONFIG_PAGING_LEVELS == 2
3204 SHADOW_INTERNAL_NAME(sh_audit_l1_table,2,2), /* l1_32 */
3205 SHADOW_INTERNAL_NAME(sh_audit_fl1_table,2,2), /* fl1_32 */
3206 SHADOW_INTERNAL_NAME(sh_audit_l2_table,2,2), /* l2_32 */
3207 #else
3208 SHADOW_INTERNAL_NAME(sh_audit_l1_table,3,2), /* l1_32 */
3209 SHADOW_INTERNAL_NAME(sh_audit_fl1_table,3,2), /* fl1_32 */
3210 SHADOW_INTERNAL_NAME(sh_audit_l2_table,3,2), /* l2_32 */
3211 SHADOW_INTERNAL_NAME(sh_audit_l1_table,3,3), /* l1_pae */
3212 SHADOW_INTERNAL_NAME(sh_audit_fl1_table,3,3), /* fl1_pae */
3213 SHADOW_INTERNAL_NAME(sh_audit_l2_table,3,3), /* l2_pae */
3214 SHADOW_INTERNAL_NAME(sh_audit_l2_table,3,3), /* l2h_pae */
3215 #if CONFIG_PAGING_LEVELS >= 4
3216 SHADOW_INTERNAL_NAME(sh_audit_l1_table,4,4), /* l1_64 */
3217 SHADOW_INTERNAL_NAME(sh_audit_fl1_table,4,4), /* fl1_64 */
3218 SHADOW_INTERNAL_NAME(sh_audit_l2_table,4,4), /* l2_64 */
3219 SHADOW_INTERNAL_NAME(sh_audit_l3_table,4,4), /* l3_64 */
3220 SHADOW_INTERNAL_NAME(sh_audit_l4_table,4,4), /* l4_64 */
3221 #endif /* CONFIG_PAGING_LEVELS >= 4 */
3222 #endif /* CONFIG_PAGING_LEVELS > 2 */
3223 NULL /* All the rest */
3224 };
3225 unsigned int mask;
3227 if ( !(SHADOW_AUDIT_ENABLE) )
3228 return;
3230 if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_FULL )
3231 mask = ~1; /* Audit every table in the system */
3232 else
3234 /* Audit only the current mode's tables */
3235 switch ( v->arch.shadow.mode->guest_levels )
3237 case 2: mask = (SHF_L1_32|SHF_FL1_32|SHF_L2_32); break;
3238 case 3: mask = (SHF_L1_PAE|SHF_FL1_PAE|SHF_L2_PAE
3239 |SHF_L2H_PAE); break;
3240 case 4: mask = (SHF_L1_64|SHF_FL1_64|SHF_L2_64
3241 |SHF_L3_64|SHF_L4_64); break;
3242 default: BUG();
3246 hash_foreach(v, mask, callbacks, _mfn(INVALID_MFN));
3249 #endif /* Shadow audit */
3252 /**************************************************************************/
3253 /* Auditing p2m tables */
3255 #if SHADOW_AUDIT & SHADOW_AUDIT_P2M
3257 void shadow_audit_p2m(struct domain *d)
3259 struct list_head *entry;
3260 struct page_info *page;
3261 struct domain *od;
3262 unsigned long mfn, gfn, m2pfn, lp2mfn = 0;
3263 mfn_t p2mfn;
3264 unsigned long orphans_d = 0, orphans_i = 0, mpbad = 0, pmbad = 0;
3265 int test_linear;
3267 if ( !(SHADOW_AUDIT_ENABLE) || !shadow_mode_translate(d) )
3268 return;
3270 //SHADOW_PRINTK("p2m audit starts\n");
3272 test_linear = ( (d == current->domain)
3273 && !pagetable_is_null(current->arch.monitor_table) );
3274 if ( test_linear )
3275 local_flush_tlb();
3277 /* Audit part one: walk the domain's page allocation list, checking
3278 * the m2p entries. */
3279 for ( entry = d->page_list.next;
3280 entry != &d->page_list;
3281 entry = entry->next )
3283 page = list_entry(entry, struct page_info, list);
3284 mfn = mfn_x(page_to_mfn(page));
3286 // SHADOW_PRINTK("auditing guest page, mfn=%#lx\n", mfn);
3288 od = page_get_owner(page);
3290 if ( od != d )
3292 SHADOW_PRINTK("wrong owner %#lx -> %p(%u) != %p(%u)\n",
3293 mfn, od, (od?od->domain_id:-1), d, d->domain_id);
3294 continue;
3297 gfn = get_gpfn_from_mfn(mfn);
3298 if ( gfn == INVALID_M2P_ENTRY )
3300 orphans_i++;
3301 //SHADOW_PRINTK("orphaned guest page: mfn=%#lx has invalid gfn\n",
3302 // mfn);
3303 continue;
3306 if ( gfn == 0x55555555 )
3308 orphans_d++;
3309 //SHADOW_PRINTK("orphaned guest page: mfn=%#lx has debug gfn\n",
3310 // mfn);
3311 continue;
3314 p2mfn = sh_gfn_to_mfn_foreign(d, gfn);
3315 if ( mfn_x(p2mfn) != mfn )
3317 mpbad++;
3318 SHADOW_PRINTK("map mismatch mfn %#lx -> gfn %#lx -> mfn %#lx"
3319 " (-> gfn %#lx)\n",
3320 mfn, gfn, mfn_x(p2mfn),
3321 (mfn_valid(p2mfn)
3322 ? get_gpfn_from_mfn(mfn_x(p2mfn))
3323 : -1u));
3324 /* This m2p entry is stale: the domain has another frame in
3325 * this physical slot. No great disaster, but for neatness,
3326 * blow away the m2p entry. */
3327 set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY);
3330 if ( test_linear && (gfn <= d->arch.max_mapped_pfn) )
3332 lp2mfn = gfn_to_mfn_current(gfn);
3333 if ( mfn_x(lp2mfn) != mfn_x(p2mfn) )
3335 SHADOW_PRINTK("linear mismatch gfn %#lx -> mfn %#lx "
3336 "(!= mfn %#lx)\n", gfn,
3337 mfn_x(lp2mfn), mfn_x(p2mfn));
3341 // SHADOW_PRINTK("OK: mfn=%#lx, gfn=%#lx, p2mfn=%#lx, lp2mfn=%#lx\n",
3342 // mfn, gfn, p2mfn, lp2mfn);
3345 /* Audit part two: walk the domain's p2m table, checking the entries. */
3346 if ( pagetable_get_pfn(d->arch.phys_table) != 0 )
3348 l2_pgentry_t *l2e;
3349 l1_pgentry_t *l1e;
3350 int i1, i2;
3352 #if CONFIG_PAGING_LEVELS == 4
3353 l4_pgentry_t *l4e;
3354 l3_pgentry_t *l3e;
3355 int i3, i4;
3356 l4e = sh_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
3357 #elif CONFIG_PAGING_LEVELS == 3
3358 l3_pgentry_t *l3e;
3359 int i3;
3360 l3e = sh_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
3361 #else /* CONFIG_PAGING_LEVELS == 2 */
3362 l2e = sh_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
3363 #endif
3365 gfn = 0;
3366 #if CONFIG_PAGING_LEVELS >= 3
3367 #if CONFIG_PAGING_LEVELS >= 4
3368 for ( i4 = 0; i4 < L4_PAGETABLE_ENTRIES; i4++ )
3370 if ( !(l4e_get_flags(l4e[i4]) & _PAGE_PRESENT) )
3372 gfn += 1 << (L4_PAGETABLE_SHIFT - PAGE_SHIFT);
3373 continue;
3375 l3e = sh_map_domain_page(_mfn(l4e_get_pfn(l4e[i4])));
3376 #endif /* now at levels 3 or 4... */
3377 for ( i3 = 0;
3378 i3 < ((CONFIG_PAGING_LEVELS==4) ? L3_PAGETABLE_ENTRIES : 8);
3379 i3++ )
3381 if ( !(l3e_get_flags(l3e[i3]) & _PAGE_PRESENT) )
3383 gfn += 1 << (L3_PAGETABLE_SHIFT - PAGE_SHIFT);
3384 continue;
3386 l2e = sh_map_domain_page(_mfn(l3e_get_pfn(l3e[i3])));
3387 #endif /* all levels... */
3388 for ( i2 = 0; i2 < L2_PAGETABLE_ENTRIES; i2++ )
3390 if ( !(l2e_get_flags(l2e[i2]) & _PAGE_PRESENT) )
3392 gfn += 1 << (L2_PAGETABLE_SHIFT - PAGE_SHIFT);
3393 continue;
3395 l1e = sh_map_domain_page(_mfn(l2e_get_pfn(l2e[i2])));
3397 for ( i1 = 0; i1 < L1_PAGETABLE_ENTRIES; i1++, gfn++ )
3399 if ( !(l1e_get_flags(l1e[i1]) & _PAGE_PRESENT) )
3400 continue;
3401 mfn = l1e_get_pfn(l1e[i1]);
3402 ASSERT(mfn_valid(_mfn(mfn)));
3403 m2pfn = get_gpfn_from_mfn(mfn);
3404 if ( m2pfn != gfn )
3406 pmbad++;
3407 SHADOW_PRINTK("mismatch: gfn %#lx -> mfn %#lx"
3408 " -> gfn %#lx\n", gfn, mfn, m2pfn);
3409 BUG();
3412 sh_unmap_domain_page(l1e);
3414 #if CONFIG_PAGING_LEVELS >= 3
3415 sh_unmap_domain_page(l2e);
3417 #if CONFIG_PAGING_LEVELS >= 4
3418 sh_unmap_domain_page(l3e);
3420 #endif
3421 #endif
3423 #if CONFIG_PAGING_LEVELS == 4
3424 sh_unmap_domain_page(l4e);
3425 #elif CONFIG_PAGING_LEVELS == 3
3426 sh_unmap_domain_page(l3e);
3427 #else /* CONFIG_PAGING_LEVELS == 2 */
3428 sh_unmap_domain_page(l2e);
3429 #endif
3433 //SHADOW_PRINTK("p2m audit complete\n");
3434 //if ( orphans_i | orphans_d | mpbad | pmbad )
3435 // SHADOW_PRINTK("p2m audit found %lu orphans (%lu inval %lu debug)\n",
3436 // orphans_i + orphans_d, orphans_i, orphans_d);
3437 if ( mpbad | pmbad )
3438 SHADOW_PRINTK("p2m audit found %lu odd p2m, %lu bad m2p entries\n",
3439 pmbad, mpbad);
3442 #endif /* p2m audit */
3444 /*
3445 * Local variables:
3446 * mode: C
3447 * c-set-style: "BSD"
3448 * c-basic-offset: 4
3449 * indent-tabs-mode: nil
3450 * End:
3451 */