xen/arch/x86/mm/shadow/common.c @ 15812:86a154e1ef5d

[HVM] Shadow: don't shadow the p2m table.
For HVM vcpus with paging disabled, we used to shadow the p2m table,
and skip the p2m lookup to go from gfn to mfn. Instead, we now
provide a simple pagetable that gives a one-to-one mapping of 4GB, and
shadow that, making the translations from gfn to mfn via the p2m.
This removes the paging-disabled special-case code from the shadow
fault handler, and allows us to expand the p2m interface, since all HVM
translations now go through the same p2m lookups.
Signed-off-by: Tim Deegan <Tim.Deegan@xensource.com>
author Tim Deegan <Tim.Deegan@xensource.com>
date Fri Aug 31 11:06:22 2007 +0100 (2007-08-31)
parents 9fd5becfba6b
children 4633e9604da9
1 /******************************************************************************
2 * arch/x86/mm/shadow/common.c
3 *
4 * Shadow code that does not need to be multiply compiled.
5 * Parts of this code are Copyright (c) 2006 by XenSource Inc.
6 * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
7 * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 */
24 #include <xen/config.h>
25 #include <xen/types.h>
26 #include <xen/mm.h>
27 #include <xen/trace.h>
28 #include <xen/sched.h>
29 #include <xen/perfc.h>
30 #include <xen/irq.h>
31 #include <xen/domain_page.h>
32 #include <xen/guest_access.h>
33 #include <xen/keyhandler.h>
34 #include <asm/event.h>
35 #include <asm/page.h>
36 #include <asm/current.h>
37 #include <asm/flushtlb.h>
38 #include <asm/shadow.h>
39 #include "private.h"
42 /* Set up the shadow-specific parts of a domain struct at start of day.
43 * Called for every domain from arch_domain_create() */
44 void shadow_domain_init(struct domain *d)
45 {
46 int i;
47 shadow_lock_init(d);
48 for ( i = 0; i <= SHADOW_MAX_ORDER; i++ )
49 INIT_LIST_HEAD(&d->arch.paging.shadow.freelists[i]);
50 INIT_LIST_HEAD(&d->arch.paging.shadow.p2m_freelist);
51 INIT_LIST_HEAD(&d->arch.paging.shadow.pinned_shadows);
53 /* Use shadow pagetables for log-dirty support */
54 paging_log_dirty_init(d, shadow_enable_log_dirty,
55 shadow_disable_log_dirty, shadow_clean_dirty_bitmap);
56 }
58 /* Set up the shadow-specific parts of a vcpu struct. Note: The most important
59 * job is to initialize the update_paging_modes() function pointer, which is
60 * used to initialize the rest of the resources. Therefore, it does not
61 * matter which mode v->arch.paging.mode initially points to, as long as it can
62 * be compiled.
63 */
64 void shadow_vcpu_init(struct vcpu *v)
65 {
66 #if CONFIG_PAGING_LEVELS == 4
67 v->arch.paging.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,3,3);
68 #elif CONFIG_PAGING_LEVELS == 3
69 v->arch.paging.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,3,3);
70 #elif CONFIG_PAGING_LEVELS == 2
71 v->arch.paging.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,2,2);
72 #endif
73 }
75 #if SHADOW_AUDIT
76 int shadow_audit_enable = 0;
78 static void shadow_audit_key(unsigned char key)
79 {
80 shadow_audit_enable = !shadow_audit_enable;
81 printk("%s shadow_audit_enable=%d\n",
82 __func__, shadow_audit_enable);
83 }
85 static int __init shadow_audit_key_init(void)
86 {
87 register_keyhandler(
88 'O', shadow_audit_key, "toggle shadow audits");
89 return 0;
90 }
91 __initcall(shadow_audit_key_init);
92 #endif /* SHADOW_AUDIT */
94 int _shadow_mode_refcounts(struct domain *d)
95 {
96 return shadow_mode_refcounts(d);
97 }
100 /**************************************************************************/
101 /* x86 emulator support for the shadow code
102 */
104 struct segment_register *hvm_get_seg_reg(
105 enum x86_segment seg, struct sh_emulate_ctxt *sh_ctxt)
106 {
107 struct segment_register *seg_reg = &sh_ctxt->seg_reg[seg];
108 if ( !__test_and_set_bit(seg, &sh_ctxt->valid_seg_regs) )
109 hvm_get_segment_register(current, seg, seg_reg);
110 return seg_reg;
111 }
113 enum hvm_access_type {
114 hvm_access_insn_fetch, hvm_access_read, hvm_access_write
115 };
117 static int hvm_translate_linear_addr(
118 enum x86_segment seg,
119 unsigned long offset,
120 unsigned int bytes,
121 enum hvm_access_type access_type,
122 struct sh_emulate_ctxt *sh_ctxt,
123 unsigned long *paddr)
124 {
125 struct segment_register *reg = hvm_get_seg_reg(seg, sh_ctxt);
126 unsigned long limit, addr = offset;
127 uint32_t last_byte;
129 if ( sh_ctxt->ctxt.addr_size != 64 )
130 {
131 /*
132 * COMPATIBILITY MODE: Apply segment checks and add base.
133 */
135 switch ( access_type )
136 {
137 case hvm_access_read:
138 if ( (reg->attr.fields.type & 0xa) == 0x8 )
139 goto gpf; /* execute-only code segment */
140 break;
141 case hvm_access_write:
142 if ( (reg->attr.fields.type & 0xa) != 0x2 )
143 goto gpf; /* not a writable data segment */
144 break;
145 default:
146 break;
147 }
149 /* Calculate the segment limit, including granularity flag. */
150 limit = reg->limit;
151 if ( reg->attr.fields.g )
152 limit = (limit << 12) | 0xfff;
154 last_byte = offset + bytes - 1;
156 /* Is this a grows-down data segment? Special limit check if so. */
157 if ( (reg->attr.fields.type & 0xc) == 0x4 )
158 {
159 /* Is upper limit 0xFFFF or 0xFFFFFFFF? */
160 if ( !reg->attr.fields.db )
161 last_byte = (uint16_t)last_byte;
163 /* Check first byte and last byte against respective bounds. */
164 if ( (offset <= limit) || (last_byte < offset) )
165 goto gpf;
166 }
167 else if ( (last_byte > limit) || (last_byte < offset) )
168 goto gpf; /* last byte is beyond limit or wraps 0xFFFFFFFF */
170 /*
171 * Hardware truncates to 32 bits in compatibility mode.
172 * It does not truncate to 16 bits in 16-bit address-size mode.
173 */
174 addr = (uint32_t)(addr + reg->base);
175 }
176 else
177 {
178 /*
179 * LONG MODE: FS and GS add segment base. Addresses must be canonical.
180 */
182 if ( (seg == x86_seg_fs) || (seg == x86_seg_gs) )
183 addr += reg->base;
185 if ( !is_canonical_address(addr) )
186 goto gpf;
187 }
189 *paddr = addr;
190 return 0;
192 gpf:
193 /* Inject #GP(0). */
194 hvm_inject_exception(TRAP_gp_fault, 0, 0);
195 return X86EMUL_EXCEPTION;
196 }
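The grows-down case above is the subtle one: for an expand-down data segment the offsets at or below the scaled limit are the invalid ones, and the valid range runs from limit+1 up to the 16- or 32-bit top of the segment. A small stand-alone sketch of the same predicate, using made-up example values rather than anything from this file:

/* Illustrative sketch of the expand-down limit check above; the
 * values are examples, not taken from the hypervisor. */
#include <assert.h>
#include <stdint.h>

static int expand_down_faults(uint32_t offset, uint32_t bytes, uint32_t limit)
{
    uint32_t last_byte = offset + bytes - 1;
    /* Fault if the first byte lies in the hole at or below the limit,
     * or if the access wraps around the top of the segment. */
    return (offset <= limit) || (last_byte < offset);
}

int main(void)
{
    /* With a scaled limit of 0x0fff, offsets 0x1000 and up are valid. */
    assert(expand_down_faults(0x0800, 4, 0x0fff));      /* at/below the limit */
    assert(!expand_down_faults(0x1000, 4, 0x0fff));     /* just above it      */
    assert(expand_down_faults(0xfffffffeU, 4, 0x0fff)); /* wraps past 2^32    */
    return 0;
}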
198 static int
199 hvm_read(enum x86_segment seg,
200 unsigned long offset,
201 unsigned long *val,
202 unsigned int bytes,
203 enum hvm_access_type access_type,
204 struct sh_emulate_ctxt *sh_ctxt)
205 {
206 unsigned long addr;
207 int rc, errcode;
209 rc = hvm_translate_linear_addr(
210 seg, offset, bytes, access_type, sh_ctxt, &addr);
211 if ( rc )
212 return rc;
214 *val = 0;
215 // XXX -- this is WRONG.
216 // It entirely ignores the permissions in the page tables.
217 // In this case, that is only a user vs supervisor access check.
218 //
219 if ( (rc = hvm_copy_from_guest_virt(val, addr, bytes)) == 0 )
220 return X86EMUL_OKAY;
222 /* If we got here, there was nothing mapped here, or a bad GFN
223 * was mapped here. This should never happen: we're here because
224 * of a fault at the end of the instruction we're emulating. */
225 SHADOW_PRINTK("read failed to va %#lx\n", addr);
226 errcode = ring_3(sh_ctxt->ctxt.regs) ? PFEC_user_mode : 0;
227 if ( access_type == hvm_access_insn_fetch )
228 errcode |= PFEC_insn_fetch;
229 hvm_inject_exception(TRAP_page_fault, errcode, addr + bytes - rc);
230 return X86EMUL_EXCEPTION;
231 }
233 static int
234 hvm_emulate_read(enum x86_segment seg,
235 unsigned long offset,
236 unsigned long *val,
237 unsigned int bytes,
238 struct x86_emulate_ctxt *ctxt)
239 {
240 return hvm_read(seg, offset, val, bytes, hvm_access_read,
241 container_of(ctxt, struct sh_emulate_ctxt, ctxt));
242 }
244 static int
245 hvm_emulate_insn_fetch(enum x86_segment seg,
246 unsigned long offset,
247 unsigned long *val,
248 unsigned int bytes,
249 struct x86_emulate_ctxt *ctxt)
250 {
251 struct sh_emulate_ctxt *sh_ctxt =
252 container_of(ctxt, struct sh_emulate_ctxt, ctxt);
253 unsigned int insn_off = offset - sh_ctxt->insn_buf_eip;
255 /* Fall back if requested bytes are not in the prefetch cache. */
256 if ( unlikely((insn_off + bytes) > sh_ctxt->insn_buf_bytes) )
257 return hvm_read(seg, offset, val, bytes,
258 hvm_access_insn_fetch, sh_ctxt);
260 /* Hit the cache. Simple memcpy. */
261 *val = 0;
262 memcpy(val, &sh_ctxt->insn_buf[insn_off], bytes);
263 return X86EMUL_OKAY;
264 }
266 static int
267 hvm_emulate_write(enum x86_segment seg,
268 unsigned long offset,
269 unsigned long val,
270 unsigned int bytes,
271 struct x86_emulate_ctxt *ctxt)
272 {
273 struct sh_emulate_ctxt *sh_ctxt =
274 container_of(ctxt, struct sh_emulate_ctxt, ctxt);
275 struct vcpu *v = current;
276 unsigned long addr;
277 int rc;
279 /* How many emulations could we save if we unshadowed on stack writes? */
280 if ( seg == x86_seg_ss )
281 perfc_incr(shadow_fault_emulate_stack);
283 rc = hvm_translate_linear_addr(
284 seg, offset, bytes, hvm_access_write, sh_ctxt, &addr);
285 if ( rc )
286 return rc;
288 return v->arch.paging.mode->shadow.x86_emulate_write(
289 v, addr, &val, bytes, sh_ctxt);
290 }
292 static int
293 hvm_emulate_cmpxchg(enum x86_segment seg,
294 unsigned long offset,
295 unsigned long old,
296 unsigned long new,
297 unsigned int bytes,
298 struct x86_emulate_ctxt *ctxt)
299 {
300 struct sh_emulate_ctxt *sh_ctxt =
301 container_of(ctxt, struct sh_emulate_ctxt, ctxt);
302 struct vcpu *v = current;
303 unsigned long addr;
304 int rc;
306 rc = hvm_translate_linear_addr(
307 seg, offset, bytes, hvm_access_write, sh_ctxt, &addr);
308 if ( rc )
309 return rc;
311 return v->arch.paging.mode->shadow.x86_emulate_cmpxchg(
312 v, addr, old, new, bytes, sh_ctxt);
313 }
315 static int
316 hvm_emulate_cmpxchg8b(enum x86_segment seg,
317 unsigned long offset,
318 unsigned long old_lo,
319 unsigned long old_hi,
320 unsigned long new_lo,
321 unsigned long new_hi,
322 struct x86_emulate_ctxt *ctxt)
323 {
324 struct sh_emulate_ctxt *sh_ctxt =
325 container_of(ctxt, struct sh_emulate_ctxt, ctxt);
326 struct vcpu *v = current;
327 unsigned long addr;
328 int rc;
330 rc = hvm_translate_linear_addr(
331 seg, offset, 8, hvm_access_write, sh_ctxt, &addr);
332 if ( rc )
333 return rc;
335 return v->arch.paging.mode->shadow.x86_emulate_cmpxchg8b(
336 v, addr, old_lo, old_hi, new_lo, new_hi, sh_ctxt);
337 }
339 static struct x86_emulate_ops hvm_shadow_emulator_ops = {
340 .read = hvm_emulate_read,
341 .insn_fetch = hvm_emulate_insn_fetch,
342 .write = hvm_emulate_write,
343 .cmpxchg = hvm_emulate_cmpxchg,
344 .cmpxchg8b = hvm_emulate_cmpxchg8b,
345 };
347 static int
348 pv_emulate_read(enum x86_segment seg,
349 unsigned long offset,
350 unsigned long *val,
351 unsigned int bytes,
352 struct x86_emulate_ctxt *ctxt)
353 {
354 unsigned int rc;
356 *val = 0;
357 if ( (rc = copy_from_user((void *)val, (void *)offset, bytes)) != 0 )
358 {
359 propagate_page_fault(offset + bytes - rc, 0); /* read fault */
360 return X86EMUL_EXCEPTION;
361 }
363 return X86EMUL_OKAY;
364 }
366 static int
367 pv_emulate_write(enum x86_segment seg,
368 unsigned long offset,
369 unsigned long val,
370 unsigned int bytes,
371 struct x86_emulate_ctxt *ctxt)
372 {
373 struct sh_emulate_ctxt *sh_ctxt =
374 container_of(ctxt, struct sh_emulate_ctxt, ctxt);
375 struct vcpu *v = current;
376 return v->arch.paging.mode->shadow.x86_emulate_write(
377 v, offset, &val, bytes, sh_ctxt);
378 }
380 static int
381 pv_emulate_cmpxchg(enum x86_segment seg,
382 unsigned long offset,
383 unsigned long old,
384 unsigned long new,
385 unsigned int bytes,
386 struct x86_emulate_ctxt *ctxt)
387 {
388 struct sh_emulate_ctxt *sh_ctxt =
389 container_of(ctxt, struct sh_emulate_ctxt, ctxt);
390 struct vcpu *v = current;
391 return v->arch.paging.mode->shadow.x86_emulate_cmpxchg(
392 v, offset, old, new, bytes, sh_ctxt);
393 }
395 static int
396 pv_emulate_cmpxchg8b(enum x86_segment seg,
397 unsigned long offset,
398 unsigned long old_lo,
399 unsigned long old_hi,
400 unsigned long new_lo,
401 unsigned long new_hi,
402 struct x86_emulate_ctxt *ctxt)
403 {
404 struct sh_emulate_ctxt *sh_ctxt =
405 container_of(ctxt, struct sh_emulate_ctxt, ctxt);
406 struct vcpu *v = current;
407 return v->arch.paging.mode->shadow.x86_emulate_cmpxchg8b(
408 v, offset, old_lo, old_hi, new_lo, new_hi, sh_ctxt);
409 }
411 static struct x86_emulate_ops pv_shadow_emulator_ops = {
412 .read = pv_emulate_read,
413 .insn_fetch = pv_emulate_read,
414 .write = pv_emulate_write,
415 .cmpxchg = pv_emulate_cmpxchg,
416 .cmpxchg8b = pv_emulate_cmpxchg8b,
417 };
419 struct x86_emulate_ops *shadow_init_emulation(
420 struct sh_emulate_ctxt *sh_ctxt, struct cpu_user_regs *regs)
421 {
422 struct segment_register *creg, *sreg;
423 struct vcpu *v = current;
424 unsigned long addr;
426 sh_ctxt->ctxt.regs = regs;
428 if ( !is_hvm_vcpu(v) )
429 {
430 sh_ctxt->ctxt.addr_size = sh_ctxt->ctxt.sp_size = BITS_PER_LONG;
431 return &pv_shadow_emulator_ops;
432 }
434 /* Segment cache initialisation. Primed with CS. */
435 sh_ctxt->valid_seg_regs = 0;
436 creg = hvm_get_seg_reg(x86_seg_cs, sh_ctxt);
438 /* Work out the emulation mode. */
439 if ( hvm_long_mode_enabled(v) && creg->attr.fields.l )
440 {
441 sh_ctxt->ctxt.addr_size = sh_ctxt->ctxt.sp_size = 64;
442 }
443 else if ( regs->eflags & X86_EFLAGS_VM )
444 {
445 sh_ctxt->ctxt.addr_size = sh_ctxt->ctxt.sp_size = 16;
446 }
447 else
448 {
449 sreg = hvm_get_seg_reg(x86_seg_ss, sh_ctxt);
450 sh_ctxt->ctxt.addr_size = creg->attr.fields.db ? 32 : 16;
451 sh_ctxt->ctxt.sp_size = sreg->attr.fields.db ? 32 : 16;
452 }
454 /* Attempt to prefetch whole instruction. */
455 sh_ctxt->insn_buf_eip = regs->eip;
456 sh_ctxt->insn_buf_bytes =
457 (!hvm_translate_linear_addr(
458 x86_seg_cs, regs->eip, sizeof(sh_ctxt->insn_buf),
459 hvm_access_insn_fetch, sh_ctxt, &addr) &&
460 !hvm_copy_from_guest_virt(
461 sh_ctxt->insn_buf, addr, sizeof(sh_ctxt->insn_buf)))
462 ? sizeof(sh_ctxt->insn_buf) : 0;
464 return &hvm_shadow_emulator_ops;
465 }
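A caller, such as the emulation path of the shadow fault handler, is expected to build a struct sh_emulate_ctxt on its stack, pick up the ops table from shadow_init_emulation(), and hand the embedded x86_emulate_ctxt to the instruction emulator. A rough sketch of that pattern follows; the x86_emulate() entry point and the surrounding calling convention are assumed rather than copied from this file, so treat it as illustrative only:

/* Illustrative sketch of a caller; not a copy of the real fault handler. */
static int example_emulate_one(struct cpu_user_regs *regs)
{
    struct sh_emulate_ctxt emul_ctxt;
    struct x86_emulate_ops *emul_ops;
    int r;

    /* Prime the segment cache, pick PV or HVM ops, prefetch the insn. */
    emul_ops = shadow_init_emulation(&emul_ctxt, regs);

    /* x86_emulate() is the generic emulator entry point (assumed here).
     * Writes and cmpxchgs come back through the ops table above, which
     * validates the affected shadows. */
    r = x86_emulate(&emul_ctxt.ctxt, emul_ops);

    return (r == X86EMUL_OKAY);
}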
467 /* Update an initialized emulation context to prepare for the next
468 * instruction */
469 void shadow_continue_emulation(struct sh_emulate_ctxt *sh_ctxt,
470 struct cpu_user_regs *regs)
471 {
472 struct vcpu *v = current;
473 unsigned long addr, diff;
475 /* We don't refetch the segment bases, because we don't emulate
476 * writes to segment registers */
478 if ( is_hvm_vcpu(v) )
479 {
480 diff = regs->eip - sh_ctxt->insn_buf_eip;
481 if ( diff > sh_ctxt->insn_buf_bytes )
482 {
483 /* Prefetch more bytes. */
484 sh_ctxt->insn_buf_bytes =
485 (!hvm_translate_linear_addr(
486 x86_seg_cs, regs->eip, sizeof(sh_ctxt->insn_buf),
487 hvm_access_insn_fetch, sh_ctxt, &addr) &&
488 !hvm_copy_from_guest_virt(
489 sh_ctxt->insn_buf, addr, sizeof(sh_ctxt->insn_buf)))
490 ? sizeof(sh_ctxt->insn_buf) : 0;
491 sh_ctxt->insn_buf_eip = regs->eip;
492 }
493 }
494 }
496 /**************************************************************************/
497 /* Code for "promoting" a guest page to the point where the shadow code is
498 * willing to let it be treated as a guest page table. This generally
499 * involves making sure there are no writable mappings available to the guest
500 * for this page.
501 */
502 void shadow_promote(struct vcpu *v, mfn_t gmfn, unsigned int type)
503 {
504 struct page_info *page = mfn_to_page(gmfn);
506 ASSERT(mfn_valid(gmfn));
508 /* We should never try to promote a gmfn that has writeable mappings */
509 ASSERT((page->u.inuse.type_info & PGT_type_mask) != PGT_writable_page
510 || (page->u.inuse.type_info & PGT_count_mask) == 0
511 || v->domain->is_shutting_down);
513 /* Is the page already shadowed? */
514 if ( !test_and_set_bit(_PGC_page_table, &page->count_info) )
515 page->shadow_flags = 0;
517 ASSERT(!test_bit(type, &page->shadow_flags));
518 set_bit(type, &page->shadow_flags);
519 }
521 void shadow_demote(struct vcpu *v, mfn_t gmfn, u32 type)
522 {
523 struct page_info *page = mfn_to_page(gmfn);
525 ASSERT(test_bit(_PGC_page_table, &page->count_info));
526 ASSERT(test_bit(type, &page->shadow_flags));
528 clear_bit(type, &page->shadow_flags);
530 if ( (page->shadow_flags & SHF_page_type_mask) == 0 )
531 {
532 /* tlbflush timestamp field is valid again */
533 page->tlbflush_timestamp = tlbflush_current_time();
534 clear_bit(_PGC_page_table, &page->count_info);
535 }
536 }
538 /**************************************************************************/
539 /* Validate a pagetable change from the guest and update the shadows.
540 * Returns a bitmask of SHADOW_SET_* flags. */
542 int
543 sh_validate_guest_entry(struct vcpu *v, mfn_t gmfn, void *entry, u32 size)
544 {
545 int result = 0;
546 struct page_info *page = mfn_to_page(gmfn);
548 paging_mark_dirty(v->domain, mfn_x(gmfn));
550 // Determine which types of shadows are affected, and update each.
551 //
552 // Always validate L1s before L2s to prevent another cpu with a linear
553 // mapping of this gmfn from seeing a walk that results from
554 // using the new L2 value and the old L1 value. (It is OK for such a
555 // guest to see a walk that uses the old L2 value with the new L1 value,
556 // as hardware could behave this way if one level of the pagewalk occurs
557 // before the store, and the next level of the pagewalk occurs after the
558 // store.)
559 //
560 // Ditto for L2s before L3s, etc.
561 //
563 if ( !(page->count_info & PGC_page_table) )
564 return 0; /* Not shadowed at all */
566 #if CONFIG_PAGING_LEVELS == 2
567 if ( page->shadow_flags & SHF_L1_32 )
568 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 2, 2)
569 (v, gmfn, entry, size);
570 #else
571 if ( page->shadow_flags & SHF_L1_32 )
572 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 3, 2)
573 (v, gmfn, entry, size);
574 #endif
576 #if CONFIG_PAGING_LEVELS == 2
577 if ( page->shadow_flags & SHF_L2_32 )
578 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 2, 2)
579 (v, gmfn, entry, size);
580 #else
581 if ( page->shadow_flags & SHF_L2_32 )
582 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 3, 2)
583 (v, gmfn, entry, size);
584 #endif
586 #if CONFIG_PAGING_LEVELS >= 3
587 if ( page->shadow_flags & SHF_L1_PAE )
588 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 3, 3)
589 (v, gmfn, entry, size);
590 if ( page->shadow_flags & SHF_L2_PAE )
591 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 3, 3)
592 (v, gmfn, entry, size);
593 if ( page->shadow_flags & SHF_L2H_PAE )
594 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2he, 3, 3)
595 (v, gmfn, entry, size);
596 #else /* 32-bit non-PAE hypervisor does not support PAE guests */
597 ASSERT((page->shadow_flags & (SHF_L2H_PAE|SHF_L2_PAE|SHF_L1_PAE)) == 0);
598 #endif
600 #if CONFIG_PAGING_LEVELS >= 4
601 if ( page->shadow_flags & SHF_L1_64 )
602 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 4, 4)
603 (v, gmfn, entry, size);
604 if ( page->shadow_flags & SHF_L2_64 )
605 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 4, 4)
606 (v, gmfn, entry, size);
607 if ( page->shadow_flags & SHF_L2H_64 )
608 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2he, 4, 4)
609 (v, gmfn, entry, size);
610 if ( page->shadow_flags & SHF_L3_64 )
611 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl3e, 4, 4)
612 (v, gmfn, entry, size);
613 if ( page->shadow_flags & SHF_L4_64 )
614 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl4e, 4, 4)
615 (v, gmfn, entry, size);
616 #else /* 32-bit/PAE hypervisor does not support 64-bit guests */
617 ASSERT((page->shadow_flags
618 & (SHF_L4_64|SHF_L3_64|SHF_L2H_64|SHF_L2_64|SHF_L1_64)) == 0);
619 #endif
621 return result;
622 }
625 void
626 sh_validate_guest_pt_write(struct vcpu *v, mfn_t gmfn,
627 void *entry, u32 size)
628 /* This is the entry point for emulated writes to pagetables in HVM guests and
629 * PV translated guests.
630 */
631 {
632 struct domain *d = v->domain;
633 int rc;
635 ASSERT(shadow_locked_by_me(v->domain));
636 rc = sh_validate_guest_entry(v, gmfn, entry, size);
637 if ( rc & SHADOW_SET_FLUSH )
638 /* Need to flush TLBs to pick up shadow PT changes */
639 flush_tlb_mask(d->domain_dirty_cpumask);
640 if ( rc & SHADOW_SET_ERROR )
641 {
642 /* This page is probably not a pagetable any more: tear it out of the
643 * shadows, along with any tables that reference it.
644 * Since the validate call above will have made a "safe" (i.e. zero)
645 * shadow entry, we can let the domain live even if we can't fully
646 * unshadow the page. */
647 sh_remove_shadows(v, gmfn, 0, 0);
648 }
649 }
651 int shadow_write_guest_entry(struct vcpu *v, intpte_t *p,
652 intpte_t new, mfn_t gmfn)
653 /* Write a new value into the guest pagetable, and update the shadows
654 * appropriately. Returns 0 if we page-faulted, 1 for success. */
655 {
656 int failed;
657 shadow_lock(v->domain);
658 failed = __copy_to_user(p, &new, sizeof(new));
659 if ( failed != sizeof(new) )
660 sh_validate_guest_entry(v, gmfn, p, sizeof(new));
661 shadow_unlock(v->domain);
662 return (failed == 0);
663 }
665 int shadow_cmpxchg_guest_entry(struct vcpu *v, intpte_t *p,
666 intpte_t *old, intpte_t new, mfn_t gmfn)
667 /* Cmpxchg a new value into the guest pagetable, and update the shadows
668 * appropriately. Returns 0 if we page-faulted, 1 if not.
669 * N.B. caller should check the value of "old" to see if the
670 * cmpxchg itself was successful. */
671 {
672 int failed;
673 intpte_t t = *old;
674 shadow_lock(v->domain);
675 failed = cmpxchg_user(p, t, new);
676 if ( t == *old )
677 sh_validate_guest_entry(v, gmfn, p, sizeof(new));
678 *old = t;
679 shadow_unlock(v->domain);
680 return (failed == 0);
681 }
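These two helpers are the shadow side of the paging interface for guest pagetable updates: the caller supplies a mapped pointer to the guest entry plus the MFN of the table it lives in, and the helpers perform the write under the shadow lock and revalidate the affected shadows. A small illustrative wrapper is sketched below; the wrapper name and the calling convention around it are assumptions, not taken from this file:

/* Illustrative only: how a pagetable-update path might drive the helpers
 * above. 'p' is an already-mapped guest PTE, 'gmfn' the MFN of the guest
 * table containing it. */
static int example_update_entry(struct vcpu *v, intpte_t *p,
                                intpte_t old, intpte_t new, mfn_t gmfn)
{
    intpte_t t = old;

    /* Try an atomic compare-and-exchange; the shadows are revalidated
     * inside the helper if the guest entry really changed. */
    if ( !shadow_cmpxchg_guest_entry(v, p, &t, new, gmfn) )
        return 0;          /* faulted while touching the entry */

    /* The cmpxchg itself may still have lost a race with another vcpu. */
    return (t == old);
}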
684 /**************************************************************************/
685 /* Memory management for shadow pages. */
687 /* Allocating shadow pages
688 * -----------------------
689 *
690 * Most shadow pages are allocated singly, but there is one case where
691 * we need to allocate multiple pages together: shadowing 32-bit guest
692 * tables on PAE or 64-bit shadows. A 32-bit guest l1 table covers 4MB
693 * of virtual address space, and needs to be shadowed by two PAE/64-bit
694 * l1 tables (covering 2MB of virtual address space each). Similarly, a
695 * 32-bit guest l2 table (4GB va) needs to be shadowed by four
696 * PAE/64-bit l2 tables (1GB va each). These multi-page shadows are
697 * contiguous and aligned; functions for handling offsets into them are
698 * defined in shadow.c (shadow_l1_index() etc.)
699 *
700 * This table shows the allocation behaviour of the different modes:
701 *
702 * Xen paging      32b   pae   pae   64b   64b   64b
703 * Guest paging    32b   32b   pae   32b   pae   64b
704 * PV or HVM        *    HVM    *    HVM   HVM    *
705 * Shadow paging   32b   pae   pae   pae   pae   64b
706 *
707 * sl1 size         4k    8k    4k    8k    4k    4k
708 * sl2 size         4k   16k    4k   16k    4k    4k
709 * sl3 size          -     -     -     -     -    4k
710 * sl4 size          -     -     -     -     -    4k
711 *
712 * We allocate memory from xen in four-page units and break them down
713 * with a simple buddy allocator. Can't use the xen allocator to handle
714 * this as it only works for contiguous zones, and a domain's shadow
715 * pool is made of fragments.
716 *
717 * In HVM guests, the p2m table is built out of shadow pages, and we provide
718 * a function for the p2m management to steal pages, in max-order chunks, from
719 * the free pool. We don't provide for giving them back, yet.
720 */
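The multi-page rows in the table above follow directly from the entry counts: a 32-bit guest l1 holds 1024 4-byte entries covering 4MB, while a PAE/64-bit shadow l1 holds 512 8-byte entries covering 2MB, so two contiguous shadow pages (order 1) are needed; a 32-bit guest l2 spans 4GB against 1GB for a PAE/64-bit l2, hence four pages (order 2). A throwaway arithmetic sketch of that, independent of the Xen types above:

/* Back-of-the-envelope check of the sl1/sl2 sizes quoted above;
 * purely illustrative, not part of the hypervisor. */
#include <stdio.h>

int main(void)
{
    unsigned long long page = 4096ULL;
    unsigned long long guest_l1_span  = 1024 * page;           /* 1024 x 4-byte PTEs -> 4MB */
    unsigned long long shadow_l1_span =  512 * page;           /*  512 x 8-byte PTEs -> 2MB */
    unsigned long long guest_l2_span  = 1024 * guest_l1_span;  /* 32-bit l2 maps 4GB */
    unsigned long long shadow_l2_span =  512 * shadow_l1_span; /* PAE/64-bit l2 maps 1GB */

    printf("sl1 pages per 32-bit gl1: %llu\n", guest_l1_span / shadow_l1_span); /* 2 -> order 1 */
    printf("sl2 pages per 32-bit gl2: %llu\n", guest_l2_span / shadow_l2_span); /* 4 -> order 2 */
    return 0;
}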
722 /* Figure out the least acceptable quantity of shadow memory.
723 * The minimum memory requirement for always being able to free up a
724 * chunk of memory is very small -- only three max-order chunks per
725 * vcpu to hold the top level shadows and pages with Xen mappings in them.
726 *
727 * But for a guest to be guaranteed to successfully execute a single
728 * instruction, we must be able to map a large number (about thirty) of VAs
729 * at the same time, which means that to guarantee progress, we must
730 * allow for more than ninety allocated pages per vcpu. We round that
731 * up to 128 pages, or half a megabyte per vcpu. */
732 unsigned int shadow_min_acceptable_pages(struct domain *d)
733 {
734 u32 vcpu_count = 0;
735 struct vcpu *v;
737 for_each_vcpu(d, v)
738 vcpu_count++;
740 return (vcpu_count * 128);
741 }
743 /* Figure out the order of allocation needed for a given shadow type */
744 static inline u32
745 shadow_order(unsigned int shadow_type)
746 {
747 #if CONFIG_PAGING_LEVELS > 2
748 static const u32 type_to_order[SH_type_unused] = {
749 0, /* SH_type_none */
750 1, /* SH_type_l1_32_shadow */
751 1, /* SH_type_fl1_32_shadow */
752 2, /* SH_type_l2_32_shadow */
753 0, /* SH_type_l1_pae_shadow */
754 0, /* SH_type_fl1_pae_shadow */
755 0, /* SH_type_l2_pae_shadow */
756 0, /* SH_type_l2h_pae_shadow */
757 0, /* SH_type_l1_64_shadow */
758 0, /* SH_type_fl1_64_shadow */
759 0, /* SH_type_l2_64_shadow */
760 0, /* SH_type_l2h_64_shadow */
761 0, /* SH_type_l3_64_shadow */
762 0, /* SH_type_l4_64_shadow */
763 2, /* SH_type_p2m_table */
764 0 /* SH_type_monitor_table */
765 };
766 ASSERT(shadow_type < SH_type_unused);
767 return type_to_order[shadow_type];
768 #else /* 32-bit Xen only ever shadows 32-bit guests on 32-bit shadows. */
769 return 0;
770 #endif
771 }
774 /* Do we have a free chunk of at least this order? */
775 static inline int chunk_is_available(struct domain *d, int order)
776 {
777 int i;
779 for ( i = order; i <= SHADOW_MAX_ORDER; i++ )
780 if ( !list_empty(&d->arch.paging.shadow.freelists[i]) )
781 return 1;
782 return 0;
783 }
785 /* Dispatcher function: call the per-mode function that will unhook the
786 * non-Xen mappings in this top-level shadow mfn */
787 void shadow_unhook_mappings(struct vcpu *v, mfn_t smfn)
788 {
789 struct shadow_page_info *sp = mfn_to_shadow_page(smfn);
790 switch ( sp->type )
791 {
792 case SH_type_l2_32_shadow:
793 #if CONFIG_PAGING_LEVELS == 2
794 SHADOW_INTERNAL_NAME(sh_unhook_32b_mappings,2,2)(v,smfn);
795 #else
796 SHADOW_INTERNAL_NAME(sh_unhook_32b_mappings,3,2)(v,smfn);
797 #endif
798 break;
799 #if CONFIG_PAGING_LEVELS >= 3
800 case SH_type_l2_pae_shadow:
801 case SH_type_l2h_pae_shadow:
802 SHADOW_INTERNAL_NAME(sh_unhook_pae_mappings,3,3)(v,smfn);
803 break;
804 #endif
805 #if CONFIG_PAGING_LEVELS >= 4
806 case SH_type_l4_64_shadow:
807 SHADOW_INTERNAL_NAME(sh_unhook_64b_mappings,4,4)(v,smfn);
808 break;
809 #endif
810 default:
811 SHADOW_PRINTK("top-level shadow has bad type %08x\n", sp->type);
812 BUG();
813 }
814 }
817 /* Make sure there is at least one chunk of the required order available
818 * in the shadow page pool. This must be called before any calls to
819 * shadow_alloc(). Since this will free existing shadows to make room,
820 * it must be called early enough to avoid freeing shadows that the
821 * caller is currently working on. */
822 void shadow_prealloc(struct domain *d, unsigned int order)
823 {
824 /* Need a vcpu for calling unpins; for now, since we don't have
825 * per-vcpu shadows, any will do */
826 struct vcpu *v, *v2;
827 struct list_head *l, *t;
828 struct shadow_page_info *sp;
829 cpumask_t flushmask = CPU_MASK_NONE;
830 mfn_t smfn;
831 int i;
833 if ( chunk_is_available(d, order) ) return;
835 v = current;
836 if ( v->domain != d )
837 v = d->vcpu[0];
838 ASSERT(v != NULL); /* Shouldn't have enabled shadows if we've no vcpus */
840 /* Stage one: walk the list of pinned pages, unpinning them */
841 perfc_incr(shadow_prealloc_1);
842 list_for_each_backwards_safe(l, t, &d->arch.paging.shadow.pinned_shadows)
843 {
844 sp = list_entry(l, struct shadow_page_info, list);
845 smfn = shadow_page_to_mfn(sp);
847 /* Unpin this top-level shadow */
848 sh_unpin(v, smfn);
850 /* See if that freed up a chunk of appropriate size */
851 if ( chunk_is_available(d, order) ) return;
852 }
854 /* Stage two: all shadow pages are in use in hierarchies that are
855 * loaded in cr3 on some vcpu. Walk them, unhooking the non-Xen
856 * mappings. */
857 perfc_incr(shadow_prealloc_2);
859 for_each_vcpu(d, v2)
860 for ( i = 0 ; i < 4 ; i++ )
861 {
862 if ( !pagetable_is_null(v2->arch.shadow_table[i]) )
863 {
864 shadow_unhook_mappings(v,
865 pagetable_get_mfn(v2->arch.shadow_table[i]));
866 cpus_or(flushmask, v2->vcpu_dirty_cpumask, flushmask);
868 /* See if that freed up a chunk of appropriate size */
869 if ( chunk_is_available(d, order) )
870 {
871 flush_tlb_mask(flushmask);
872 return;
873 }
874 }
875 }
877 /* Nothing more we can do: all remaining shadows are of pages that
878 * hold Xen mappings for some vcpu. This should never happen. */
879 SHADOW_PRINTK("Can't pre-allocate %i shadow pages!\n"
880 " shadow pages total = %u, free = %u, p2m=%u\n",
881 1 << order,
882 d->arch.paging.shadow.total_pages,
883 d->arch.paging.shadow.free_pages,
884 d->arch.paging.shadow.p2m_pages);
885 BUG();
886 }
888 /* Deliberately free all the memory we can: this will tear down all of
889 * this domain's shadows */
890 static void shadow_blow_tables(struct domain *d)
891 {
892 struct list_head *l, *t;
893 struct shadow_page_info *sp;
894 struct vcpu *v = d->vcpu[0];
895 mfn_t smfn;
896 int i;
898 ASSERT(v != NULL);
900 /* Pass one: unpin all pinned pages */
901 list_for_each_backwards_safe(l,t, &d->arch.paging.shadow.pinned_shadows)
902 {
903 sp = list_entry(l, struct shadow_page_info, list);
904 smfn = shadow_page_to_mfn(sp);
905 sh_unpin(v, smfn);
906 }
908 /* Second pass: unhook entries of in-use shadows */
909 for_each_vcpu(d, v)
910 for ( i = 0 ; i < 4 ; i++ )
911 if ( !pagetable_is_null(v->arch.shadow_table[i]) )
912 shadow_unhook_mappings(v,
913 pagetable_get_mfn(v->arch.shadow_table[i]));
915 /* Make sure everyone sees the unshadowings */
916 flush_tlb_mask(d->domain_dirty_cpumask);
917 }
920 #ifndef NDEBUG
921 /* Blow all shadows of all shadowed domains: this can be used to cause the
922 * guest's pagetables to be re-shadowed if we suspect that the shadows
923 * have somehow got out of sync */
924 static void shadow_blow_all_tables(unsigned char c)
925 {
926 struct domain *d;
927 printk("'%c' pressed -> blowing all shadow tables\n", c);
928 rcu_read_lock(&domlist_read_lock);
929 for_each_domain(d)
930 {
931 if ( shadow_mode_enabled(d) && d->vcpu[0] != NULL )
932 {
933 shadow_lock(d);
934 shadow_blow_tables(d);
935 shadow_unlock(d);
936 }
937 }
938 rcu_read_unlock(&domlist_read_lock);
939 }
941 /* Register this function in the Xen console keypress table */
942 static __init int shadow_blow_tables_keyhandler_init(void)
943 {
944 register_keyhandler('S', shadow_blow_all_tables,"reset shadow pagetables");
945 return 0;
946 }
947 __initcall(shadow_blow_tables_keyhandler_init);
948 #endif /* !NDEBUG */
950 /* Allocate another shadow's worth of (contiguous, aligned) pages,
951 * and fill in the type and backpointer fields of their page_infos.
952 * Never fails to allocate. */
953 mfn_t shadow_alloc(struct domain *d,
954 u32 shadow_type,
955 unsigned long backpointer)
956 {
957 struct shadow_page_info *sp = NULL;
958 unsigned int order = shadow_order(shadow_type);
959 cpumask_t mask;
960 void *p;
961 int i;
963 ASSERT(shadow_locked_by_me(d));
964 ASSERT(order <= SHADOW_MAX_ORDER);
965 ASSERT(shadow_type != SH_type_none);
966 perfc_incr(shadow_alloc);
968 /* Find smallest order which can satisfy the request. */
969 for ( i = order; i <= SHADOW_MAX_ORDER; i++ )
970 if ( !list_empty(&d->arch.paging.shadow.freelists[i]) )
971 goto found;
973 /* If we get here, we failed to allocate. This should never happen.
974 * It means that we didn't call shadow_prealloc() correctly before
975 * we allocated. We can't recover by calling prealloc here, because
976 * we might free up higher-level pages that the caller is working on. */
977 SHADOW_PRINTK("Can't allocate %i shadow pages!\n", 1 << order);
978 BUG();
980 found:
981 sp = list_entry(d->arch.paging.shadow.freelists[i].next,
982 struct shadow_page_info, list);
983 list_del(&sp->list);
985 /* We may have to halve the chunk a number of times. */
986 while ( i != order )
987 {
988 i--;
989 sp->order = i;
990 list_add_tail(&sp->list, &d->arch.paging.shadow.freelists[i]);
991 sp += 1 << i;
992 }
993 d->arch.paging.shadow.free_pages -= 1 << order;
995 /* Init page info fields and clear the pages */
996 for ( i = 0; i < 1<<order ; i++ )
997 {
998 /* Before we overwrite the old contents of this page,
999 * we need to be sure that no TLB holds a pointer to it. */
1000 mask = d->domain_dirty_cpumask;
1001 tlbflush_filter(mask, sp[i].tlbflush_timestamp);
1002 if ( unlikely(!cpus_empty(mask)) )
1004 perfc_incr(shadow_alloc_tlbflush);
1005 flush_tlb_mask(mask);
1007 /* Now safe to clear the page for reuse */
1008 p = sh_map_domain_page(shadow_page_to_mfn(sp+i));
1009 ASSERT(p != NULL);
1010 clear_page(p);
1011 sh_unmap_domain_page(p);
1012 INIT_LIST_HEAD(&sp[i].list);
1013 sp[i].type = shadow_type;
1014 sp[i].pinned = 0;
1015 sp[i].count = 0;
1016 sp[i].backpointer = backpointer;
1017 sp[i].next_shadow = NULL;
1018 perfc_incr(shadow_alloc_count);
1020 return shadow_page_to_mfn(sp);
1024 /* Return some shadow pages to the pool. */
1025 void shadow_free(struct domain *d, mfn_t smfn)
1027 struct shadow_page_info *sp = mfn_to_shadow_page(smfn);
1028 u32 shadow_type;
1029 unsigned long order;
1030 unsigned long mask;
1031 int i;
1033 ASSERT(shadow_locked_by_me(d));
1034 perfc_incr(shadow_free);
1036 shadow_type = sp->type;
1037 ASSERT(shadow_type != SH_type_none);
1038 ASSERT(shadow_type != SH_type_p2m_table);
1039 order = shadow_order(shadow_type);
1041 d->arch.paging.shadow.free_pages += 1 << order;
1043 for ( i = 0; i < 1<<order; i++ )
1045 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
1046 struct vcpu *v;
1047 for_each_vcpu(d, v)
1049 /* No longer safe to look for a writeable mapping in this shadow */
1050 if ( v->arch.paging.shadow.last_writeable_pte_smfn == mfn_x(smfn) + i )
1051 v->arch.paging.shadow.last_writeable_pte_smfn = 0;
1053 #endif
1054 /* Strip out the type: this is now a free shadow page */
1055 sp[i].type = 0;
1056 /* Remember the TLB timestamp so we will know whether to flush
1057 * TLBs when we reuse the page. Because the destructors leave the
1058 * contents of the pages in place, we can delay TLB flushes until
1059 * just before the allocator hands the page out again. */
1060 sp[i].tlbflush_timestamp = tlbflush_current_time();
1061 perfc_decr(shadow_alloc_count);
1064 /* Merge chunks as far as possible. */
1065 while ( order < SHADOW_MAX_ORDER )
1067 mask = 1 << order;
1068 if ( (mfn_x(shadow_page_to_mfn(sp)) & mask) ) {
1069 /* Merge with predecessor block? */
1070 if ( ((sp-mask)->type != PGT_none) || ((sp-mask)->order != order) )
1071 break;
1072 list_del(&(sp-mask)->list);
1073 sp -= mask;
1074 } else {
1075 /* Merge with successor block? */
1076 if ( ((sp+mask)->type != PGT_none) || ((sp+mask)->order != order) )
1077 break;
1078 list_del(&(sp+mask)->list);
1080 order++;
1083 sp->order = order;
1084 list_add_tail(&sp->list, &d->arch.paging.shadow.freelists[order]);
1087 /* Divert some memory from the pool to be used by the p2m mapping.
1088 * This action is irreversible: the p2m mapping only ever grows.
1089 * That's OK because the p2m table only exists for translated domains,
1090 * and those domains can't ever turn off shadow mode.
1091 * Also, we only ever allocate a max-order chunk, so as to preserve
1092 * the invariant that shadow_prealloc() always works.
1093 * Returns 0 iff it can't get a chunk (the caller should then
1094 * free up some pages in domheap and call sh_set_allocation);
1095 * returns non-zero on success.
1096 */
1097 static int
1098 sh_alloc_p2m_pages(struct domain *d)
1100 struct page_info *pg;
1101 u32 i;
1102 ASSERT(shadow_locked_by_me(d));
1104 if ( d->arch.paging.shadow.total_pages
1105 < (shadow_min_acceptable_pages(d) + (1<<SHADOW_MAX_ORDER)) )
1106 return 0; /* Not enough shadow memory: need to increase it first */
1108 pg = mfn_to_page(shadow_alloc(d, SH_type_p2m_table, 0));
1109 d->arch.paging.shadow.p2m_pages += (1<<SHADOW_MAX_ORDER);
1110 d->arch.paging.shadow.total_pages -= (1<<SHADOW_MAX_ORDER);
1111 for (i = 0; i < (1<<SHADOW_MAX_ORDER); i++)
1113 /* Unlike shadow pages, mark p2m pages as owned by the domain.
1114 * Marking the domain as the owner would normally allow the guest to
1115 * create mappings of these pages, but these p2m pages will never be
1116 * in the domain's guest-physical address space, and so that is not
1117 * believed to be a concern.
1118 */
1119 page_set_owner(&pg[i], d);
1120 pg[i].count_info = 1;
1121 list_add_tail(&pg[i].list, &d->arch.paging.shadow.p2m_freelist);
1123 return 1;
1126 // Returns 0 if no memory is available...
1127 struct page_info *
1128 shadow_alloc_p2m_page(struct domain *d)
1130 struct list_head *entry;
1131 struct page_info *pg;
1132 mfn_t mfn;
1133 void *p;
1135 shadow_lock(d);
1137 if ( list_empty(&d->arch.paging.shadow.p2m_freelist) &&
1138 !sh_alloc_p2m_pages(d) )
1140 shadow_unlock(d);
1141 return NULL;
1143 entry = d->arch.paging.shadow.p2m_freelist.next;
1144 list_del(entry);
1146 shadow_unlock(d);
1148 pg = list_entry(entry, struct page_info, list);
1149 mfn = page_to_mfn(pg);
1150 p = sh_map_domain_page(mfn);
1151 clear_page(p);
1152 sh_unmap_domain_page(p);
1154 return pg;
1157 void
1158 shadow_free_p2m_page(struct domain *d, struct page_info *pg)
1160 ASSERT(page_get_owner(pg) == d);
1161 /* Should have just the one ref we gave it in alloc_p2m_page() */
1162 if ( (pg->count_info & PGC_count_mask) != 1 )
1164 SHADOW_ERROR("Odd p2m page count c=%#x t=%"PRtype_info"\n",
1165 pg->count_info, pg->u.inuse.type_info);
1167 pg->count_info = 0;
1168 /* Free should not decrement domain's total allocation, since
1169 * these pages were allocated without an owner. */
1170 page_set_owner(pg, NULL);
1171 free_domheap_pages(pg, 0);
1172 d->arch.paging.shadow.p2m_pages--;
1173 perfc_decr(shadow_alloc_count);
1176 #if CONFIG_PAGING_LEVELS == 3
1177 static void p2m_install_entry_in_monitors(struct domain *d,
1178 l3_pgentry_t *l3e)
1179 /* Special case, only used for external-mode domains on PAE hosts:
1180 * update the mapping of the p2m table. Once again, this is trivial in
1181 * other paging modes (one top-level entry points to the top-level p2m,
1182 * no maintenance needed), but PAE makes life difficult by needing a
1183 * copy of the eight l3es of the p2m table in eight l2h slots in the
1184 * monitor table. This function makes fresh copies when a p2m l3e
1185 * changes. */
1187 l2_pgentry_t *ml2e;
1188 struct vcpu *v;
1189 unsigned int index;
1191 index = ((unsigned long)l3e & ~PAGE_MASK) / sizeof(l3_pgentry_t);
1192 ASSERT(index < MACHPHYS_MBYTES>>1);
1194 for_each_vcpu(d, v)
1196 if ( pagetable_get_pfn(v->arch.monitor_table) == 0 )
1197 continue;
1198 ASSERT(shadow_mode_external(v->domain));
1200 SHADOW_DEBUG(P2M, "d=%u v=%u index=%u mfn=%#lx\n",
1201 d->domain_id, v->vcpu_id, index, l3e_get_pfn(*l3e));
1203 if ( v == current ) /* OK to use linear map of monitor_table */
1204 ml2e = __linear_l2_table + l2_linear_offset(RO_MPT_VIRT_START);
1205 else
1207 l3_pgentry_t *ml3e;
1208 ml3e = sh_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
1209 ASSERT(l3e_get_flags(ml3e[3]) & _PAGE_PRESENT);
1210 ml2e = sh_map_domain_page(_mfn(l3e_get_pfn(ml3e[3])));
1211 ml2e += l2_table_offset(RO_MPT_VIRT_START);
1212 sh_unmap_domain_page(ml3e);
1214 ml2e[index] = l2e_from_pfn(l3e_get_pfn(*l3e), __PAGE_HYPERVISOR);
1215 if ( v != current )
1216 sh_unmap_domain_page(ml2e);
1219 #endif
1221 /* Set the pool of shadow pages to the required number of pages.
1222 * Input will be rounded up to at least shadow_min_acceptable_pages(),
1223 * plus space for the p2m table.
1224 * Returns 0 for success, non-zero for failure. */
1225 static unsigned int sh_set_allocation(struct domain *d,
1226 unsigned int pages,
1227 int *preempted)
1229 struct shadow_page_info *sp;
1230 unsigned int lower_bound;
1231 int j;
1233 ASSERT(shadow_locked_by_me(d));
1235 /* Don't allocate less than the minimum acceptable, plus one page per
1236 * megabyte of RAM (for the p2m table) */
1237 lower_bound = shadow_min_acceptable_pages(d) + (d->tot_pages / 256);
1238 if ( pages > 0 && pages < lower_bound )
1239 pages = lower_bound;
1240 /* Round up to largest block size */
1241 pages = (pages + ((1<<SHADOW_MAX_ORDER)-1)) & ~((1<<SHADOW_MAX_ORDER)-1);
1243 SHADOW_PRINTK("current %i target %i\n",
1244 d->arch.paging.shadow.total_pages, pages);
1246 while ( d->arch.paging.shadow.total_pages != pages )
1248 if ( d->arch.paging.shadow.total_pages < pages )
1250 /* Need to allocate more memory from domheap */
1251 sp = (struct shadow_page_info *)
1252 alloc_domheap_pages(NULL, SHADOW_MAX_ORDER, 0);
1253 if ( sp == NULL )
1255 SHADOW_PRINTK("failed to allocate shadow pages.\n");
1256 return -ENOMEM;
1258 d->arch.paging.shadow.free_pages += 1<<SHADOW_MAX_ORDER;
1259 d->arch.paging.shadow.total_pages += 1<<SHADOW_MAX_ORDER;
1260 for ( j = 0; j < 1<<SHADOW_MAX_ORDER; j++ )
1262 sp[j].type = 0;
1263 sp[j].pinned = 0;
1264 sp[j].count = 0;
1265 sp[j].mbz = 0;
1266 sp[j].tlbflush_timestamp = 0; /* Not in any TLB */
1268 sp->order = SHADOW_MAX_ORDER;
1269 list_add_tail(&sp->list,
1270 &d->arch.paging.shadow.freelists[SHADOW_MAX_ORDER]);
1272 else if ( d->arch.paging.shadow.total_pages > pages )
1274 /* Need to return memory to domheap */
1275 shadow_prealloc(d, SHADOW_MAX_ORDER);
1276 ASSERT(!list_empty(&d->arch.paging.shadow.freelists[SHADOW_MAX_ORDER]));
1277 sp = list_entry(d->arch.paging.shadow.freelists[SHADOW_MAX_ORDER].next,
1278 struct shadow_page_info, list);
1279 list_del(&sp->list);
1280 d->arch.paging.shadow.free_pages -= 1<<SHADOW_MAX_ORDER;
1281 d->arch.paging.shadow.total_pages -= 1<<SHADOW_MAX_ORDER;
1282 free_domheap_pages((struct page_info *)sp, SHADOW_MAX_ORDER);
1285 /* Check to see if we need to yield and try again */
1286 if ( preempted && hypercall_preempt_check() )
1288 *preempted = 1;
1289 return 0;
1293 return 0;
1296 /* Return the size of the shadow pool, rounded up to the nearest MB */
1297 static unsigned int shadow_get_allocation(struct domain *d)
1299 unsigned int pg = d->arch.paging.shadow.total_pages;
1300 return ((pg >> (20 - PAGE_SHIFT))
1301 + ((pg & ((1 << (20 - PAGE_SHIFT)) - 1)) ? 1 : 0));
1304 /**************************************************************************/
1305 /* Hash table for storing the guest->shadow mappings.
1306 * The table itself is an array of pointers to shadows; the shadows are then
1307 * threaded on a singly-linked list of shadows with the same hash value */
1309 #define SHADOW_HASH_BUCKETS 251
1310 /* Other possibly useful primes are 509, 1021, 2039, 4093, 8191, 16381 */
1312 /* Hash function that takes a gfn or mfn, plus another byte of type info */
1313 typedef u32 key_t;
1314 static inline key_t sh_hash(unsigned long n, unsigned int t)
1316 unsigned char *p = (unsigned char *)&n;
1317 key_t k = t;
1318 int i;
1319 for ( i = 0; i < sizeof(n) ; i++ ) k = (u32)p[i] + (k<<6) + (k<<16) - k;
1320 return k % SHADOW_HASH_BUCKETS;
1323 #if SHADOW_AUDIT & (SHADOW_AUDIT_HASH|SHADOW_AUDIT_HASH_FULL)
1325 /* Before we get to the mechanism, define a pair of audit functions
1326 * that sanity-check the contents of the hash table. */
1327 static void sh_hash_audit_bucket(struct domain *d, int bucket)
1328 /* Audit one bucket of the hash table */
1330 struct shadow_page_info *sp, *x;
1332 if ( !(SHADOW_AUDIT_ENABLE) )
1333 return;
1335 sp = d->arch.paging.shadow.hash_table[bucket];
1336 while ( sp )
1338 /* Not a shadow? */
1339 BUG_ON( sp->mbz != 0 );
1340 /* Bogus type? */
1341 BUG_ON( sp->type == 0 );
1342 BUG_ON( sp->type > SH_type_max_shadow );
1343 /* Wrong bucket? */
1344 BUG_ON( sh_hash(sp->backpointer, sp->type) != bucket );
1345 /* Duplicate entry? */
1346 for ( x = sp->next_shadow; x; x = x->next_shadow )
1347 BUG_ON( x->backpointer == sp->backpointer && x->type == sp->type );
1348 /* Follow the backpointer to the guest pagetable */
1349 if ( sp->type != SH_type_fl1_32_shadow
1350 && sp->type != SH_type_fl1_pae_shadow
1351 && sp->type != SH_type_fl1_64_shadow )
1353 struct page_info *gpg = mfn_to_page(_mfn(sp->backpointer));
1354 /* Bad shadow flags on guest page? */
1355 BUG_ON( !(gpg->shadow_flags & (1<<sp->type)) );
1356 /* Bad type count on guest page? */
1357 if ( (gpg->u.inuse.type_info & PGT_type_mask) == PGT_writable_page
1358 && (gpg->u.inuse.type_info & PGT_count_mask) != 0 )
1360 SHADOW_ERROR("MFN %#lx shadowed (by %#"PRI_mfn")"
1361 " but has typecount %#lx\n",
1362 sp->backpointer, mfn_x(shadow_page_to_mfn(sp)),
1363 gpg->u.inuse.type_info);
1364 BUG();
1367 /* That entry was OK; on we go */
1368 sp = sp->next_shadow;
1372 #else
1373 #define sh_hash_audit_bucket(_d, _b) do {} while(0)
1374 #endif /* Hashtable bucket audit */
1377 #if SHADOW_AUDIT & SHADOW_AUDIT_HASH_FULL
1379 static void sh_hash_audit(struct domain *d)
1380 /* Full audit: audit every bucket in the table */
1382 int i;
1384 if ( !(SHADOW_AUDIT_ENABLE) )
1385 return;
1387 for ( i = 0; i < SHADOW_HASH_BUCKETS; i++ )
1389 sh_hash_audit_bucket(d, i);
1393 #else
1394 #define sh_hash_audit(_d) do {} while(0)
1395 #endif /* Hashtable bucket audit */
1397 /* Allocate and initialise the table itself.
1398 * Returns 0 for success, 1 for error. */
1399 static int shadow_hash_alloc(struct domain *d)
1401 struct shadow_page_info **table;
1403 ASSERT(shadow_locked_by_me(d));
1404 ASSERT(!d->arch.paging.shadow.hash_table);
1406 table = xmalloc_array(struct shadow_page_info *, SHADOW_HASH_BUCKETS);
1407 if ( !table ) return 1;
1408 memset(table, 0,
1409 SHADOW_HASH_BUCKETS * sizeof (struct shadow_page_info *));
1410 d->arch.paging.shadow.hash_table = table;
1411 return 0;
1414 /* Tear down the hash table and return all memory to Xen.
1415 * This function does not care whether the table is populated. */
1416 static void shadow_hash_teardown(struct domain *d)
1418 ASSERT(shadow_locked_by_me(d));
1419 ASSERT(d->arch.paging.shadow.hash_table);
1421 xfree(d->arch.paging.shadow.hash_table);
1422 d->arch.paging.shadow.hash_table = NULL;
1426 mfn_t shadow_hash_lookup(struct vcpu *v, unsigned long n, unsigned int t)
1427 /* Find an entry in the hash table. Returns the MFN of the shadow,
1428 * or INVALID_MFN if it doesn't exist */
1430 struct domain *d = v->domain;
1431 struct shadow_page_info *sp, *prev;
1432 key_t key;
1434 ASSERT(shadow_locked_by_me(d));
1435 ASSERT(d->arch.paging.shadow.hash_table);
1436 ASSERT(t);
1438 sh_hash_audit(d);
1440 perfc_incr(shadow_hash_lookups);
1441 key = sh_hash(n, t);
1442 sh_hash_audit_bucket(d, key);
1444 sp = d->arch.paging.shadow.hash_table[key];
1445 prev = NULL;
1446 while(sp)
1448 if ( sp->backpointer == n && sp->type == t )
1450 /* Pull-to-front if 'sp' isn't already the head item */
1451 if ( unlikely(sp != d->arch.paging.shadow.hash_table[key]) )
1453 if ( unlikely(d->arch.paging.shadow.hash_walking != 0) )
1454 /* Can't reorder: someone is walking the hash chains */
1455 return shadow_page_to_mfn(sp);
1456 else
1458 ASSERT(prev);
1459 /* Delete sp from the list */
1460 prev->next_shadow = sp->next_shadow;
1461 /* Re-insert it at the head of the list */
1462 sp->next_shadow = d->arch.paging.shadow.hash_table[key];
1463 d->arch.paging.shadow.hash_table[key] = sp;
1466 else
1468 perfc_incr(shadow_hash_lookup_head);
1470 return shadow_page_to_mfn(sp);
1472 prev = sp;
1473 sp = sp->next_shadow;
1476 perfc_incr(shadow_hash_lookup_miss);
1477 return _mfn(INVALID_MFN);
1480 void shadow_hash_insert(struct vcpu *v, unsigned long n, unsigned int t,
1481 mfn_t smfn)
1482 /* Put a mapping (n,t)->smfn into the hash table */
1484 struct domain *d = v->domain;
1485 struct shadow_page_info *sp;
1486 key_t key;
1488 ASSERT(shadow_locked_by_me(d));
1489 ASSERT(d->arch.paging.shadow.hash_table);
1490 ASSERT(t);
1492 sh_hash_audit(d);
1494 perfc_incr(shadow_hash_inserts);
1495 key = sh_hash(n, t);
1496 sh_hash_audit_bucket(d, key);
1498 /* Insert this shadow at the top of the bucket */
1499 sp = mfn_to_shadow_page(smfn);
1500 sp->next_shadow = d->arch.paging.shadow.hash_table[key];
1501 d->arch.paging.shadow.hash_table[key] = sp;
1503 sh_hash_audit_bucket(d, key);
1506 void shadow_hash_delete(struct vcpu *v, unsigned long n, unsigned int t,
1507 mfn_t smfn)
1508 /* Excise the mapping (n,t)->smfn from the hash table */
1510 struct domain *d = v->domain;
1511 struct shadow_page_info *sp, *x;
1512 key_t key;
1514 ASSERT(shadow_locked_by_me(d));
1515 ASSERT(d->arch.paging.shadow.hash_table);
1516 ASSERT(t);
1518 sh_hash_audit(d);
1520 perfc_incr(shadow_hash_deletes);
1521 key = sh_hash(n, t);
1522 sh_hash_audit_bucket(d, key);
1524 sp = mfn_to_shadow_page(smfn);
1525 if ( d->arch.paging.shadow.hash_table[key] == sp )
1526 /* Easy case: we're deleting the head item. */
1527 d->arch.paging.shadow.hash_table[key] = sp->next_shadow;
1528 else
1530 /* Need to search for the one we want */
1531 x = d->arch.paging.shadow.hash_table[key];
1532 while ( 1 )
1534 ASSERT(x); /* We can't have hit the end, since our target is
1535 * still in the chain somewhere... */
1536 if ( x->next_shadow == sp )
1538 x->next_shadow = sp->next_shadow;
1539 break;
1541 x = x->next_shadow;
1544 sp->next_shadow = NULL;
1546 sh_hash_audit_bucket(d, key);
1549 typedef int (*hash_callback_t)(struct vcpu *v, mfn_t smfn, mfn_t other_mfn);
1551 static void hash_foreach(struct vcpu *v,
1552 unsigned int callback_mask,
1553 hash_callback_t callbacks[],
1554 mfn_t callback_mfn)
1555 /* Walk the hash table looking at the types of the entries and
1556 * calling the appropriate callback function for each entry.
1557 * The mask determines which shadow types we call back for, and the array
1558 * of callbacks tells us which function to call.
1559 * Any callback may return non-zero to let us skip the rest of the scan.
1561 * WARNING: Callbacks MUST NOT add or remove hash entries unless they
1562 * then return non-zero to terminate the scan. */
1564 int i, done = 0;
1565 struct domain *d = v->domain;
1566 struct shadow_page_info *x;
1568 /* Say we're here, to stop hash-lookups reordering the chains */
1569 ASSERT(shadow_locked_by_me(d));
1570 ASSERT(d->arch.paging.shadow.hash_walking == 0);
1571 d->arch.paging.shadow.hash_walking = 1;
1573 for ( i = 0; i < SHADOW_HASH_BUCKETS; i++ )
1575 /* WARNING: This is not safe against changes to the hash table.
1576 * The callback *must* return non-zero if it has inserted or
1577 * deleted anything from the hash (lookups are OK, though). */
1578 for ( x = d->arch.paging.shadow.hash_table[i]; x; x = x->next_shadow )
1580 if ( callback_mask & (1 << x->type) )
1582 ASSERT(x->type <= 15);
1583 ASSERT(callbacks[x->type] != NULL);
1584 done = callbacks[x->type](v, shadow_page_to_mfn(x),
1585 callback_mfn);
1586 if ( done ) break;
1589 if ( done ) break;
1591 d->arch.paging.shadow.hash_walking = 0;
1595 /**************************************************************************/
1596 /* Destroy a shadow page: simple dispatcher to call the per-type destructor
1597 * which will decrement refcounts appropriately and return memory to the
1598 * free pool. */
1600 void sh_destroy_shadow(struct vcpu *v, mfn_t smfn)
1602 struct shadow_page_info *sp = mfn_to_shadow_page(smfn);
1603 unsigned int t = sp->type;
1606 SHADOW_PRINTK("smfn=%#lx\n", mfn_x(smfn));
1608 /* Double-check, if we can, that the shadowed page belongs to this
1609 * domain, (by following the back-pointer). */
1610 ASSERT(t == SH_type_fl1_32_shadow ||
1611 t == SH_type_fl1_pae_shadow ||
1612 t == SH_type_fl1_64_shadow ||
1613 t == SH_type_monitor_table ||
1614 (is_pv_32on64_vcpu(v) && t == SH_type_l4_64_shadow) ||
1615 (page_get_owner(mfn_to_page(_mfn(sp->backpointer)))
1616 == v->domain));
1618 /* The down-shifts here are so that the switch statement is on nice
1619 * small numbers that the compiler will enjoy */
1620 switch ( t )
1622 #if CONFIG_PAGING_LEVELS == 2
1623 case SH_type_l1_32_shadow:
1624 case SH_type_fl1_32_shadow:
1625 SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 2, 2)(v, smfn);
1626 break;
1627 case SH_type_l2_32_shadow:
1628 SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 2, 2)(v, smfn);
1629 break;
1630 #else /* PAE or 64bit */
1631 case SH_type_l1_32_shadow:
1632 case SH_type_fl1_32_shadow:
1633 SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 3, 2)(v, smfn);
1634 break;
1635 case SH_type_l2_32_shadow:
1636 SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 3, 2)(v, smfn);
1637 break;
1638 #endif
1640 #if CONFIG_PAGING_LEVELS >= 3
1641 case SH_type_l1_pae_shadow:
1642 case SH_type_fl1_pae_shadow:
1643 SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 3, 3)(v, smfn);
1644 break;
1645 case SH_type_l2_pae_shadow:
1646 case SH_type_l2h_pae_shadow:
1647 SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 3, 3)(v, smfn);
1648 break;
1649 #endif
1651 #if CONFIG_PAGING_LEVELS >= 4
1652 case SH_type_l1_64_shadow:
1653 case SH_type_fl1_64_shadow:
1654 SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 4, 4)(v, smfn);
1655 break;
1656 case SH_type_l2h_64_shadow:
1657 ASSERT(is_pv_32on64_vcpu(v));
1658 /* Fall through... */
1659 case SH_type_l2_64_shadow:
1660 SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 4, 4)(v, smfn);
1661 break;
1662 case SH_type_l3_64_shadow:
1663 SHADOW_INTERNAL_NAME(sh_destroy_l3_shadow, 4, 4)(v, smfn);
1664 break;
1665 case SH_type_l4_64_shadow:
1666 SHADOW_INTERNAL_NAME(sh_destroy_l4_shadow, 4, 4)(v, smfn);
1667 break;
1668 #endif
1669 default:
1670 SHADOW_PRINTK("tried to destroy shadow of bad type %08lx\n",
1671 (unsigned long)t);
1672 BUG();
1676 /**************************************************************************/
1677 /* Remove all writeable mappings of a guest frame from the shadow tables
1678 * Returns non-zero if we need to flush TLBs.
1679 * level and fault_addr describe how we found this to be a pagetable;
1680 * level==0 means we have some other reason for revoking write access. */
1682 int sh_remove_write_access(struct vcpu *v, mfn_t gmfn,
1683 unsigned int level,
1684 unsigned long fault_addr)
1686 /* Dispatch table for getting per-type functions */
1687 static hash_callback_t callbacks[SH_type_unused] = {
1688 NULL, /* none */
1689 #if CONFIG_PAGING_LEVELS == 2
1690 SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1,2,2), /* l1_32 */
1691 SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1,2,2), /* fl1_32 */
1692 #else
1693 SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1,3,2), /* l1_32 */
1694 SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1,3,2), /* fl1_32 */
1695 #endif
1696 NULL, /* l2_32 */
1697 #if CONFIG_PAGING_LEVELS >= 3
1698 SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1,3,3), /* l1_pae */
1699 SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1,3,3), /* fl1_pae */
1700 #else
1701 NULL, /* l1_pae */
1702 NULL, /* fl1_pae */
1703 #endif
1704 NULL, /* l2_pae */
1705 NULL, /* l2h_pae */
1706 #if CONFIG_PAGING_LEVELS >= 4
1707 SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1,4,4), /* l1_64 */
1708 SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1,4,4), /* fl1_64 */
1709 #else
1710 NULL, /* l1_64 */
1711 NULL, /* fl1_64 */
1712 #endif
1713 NULL, /* l2_64 */
1714 NULL, /* l2h_64 */
1715 NULL, /* l3_64 */
1716 NULL, /* l4_64 */
1717 NULL, /* p2m */
1718 NULL /* unused */
1719 };
1721 static unsigned int callback_mask =
1722 1 << SH_type_l1_32_shadow
1723 | 1 << SH_type_fl1_32_shadow
1724 | 1 << SH_type_l1_pae_shadow
1725 | 1 << SH_type_fl1_pae_shadow
1726 | 1 << SH_type_l1_64_shadow
1727 | 1 << SH_type_fl1_64_shadow
1729 struct page_info *pg = mfn_to_page(gmfn);
1731 ASSERT(shadow_locked_by_me(v->domain));
1733 /* Only remove writable mappings if we are doing shadow refcounts.
1734 * In guest refcounting, we trust Xen to already be restricting
1735 * all the writes to the guest page tables, so we do not need to
1736 * do more. */
1737 if ( !shadow_mode_refcounts(v->domain) )
1738 return 0;
1740 /* Early exit if it's already a pagetable, or otherwise not writeable */
1741 if ( sh_mfn_is_a_page_table(gmfn)
1742 || (pg->u.inuse.type_info & PGT_count_mask) == 0 )
1743 return 0;
1745 perfc_incr(shadow_writeable);
1747 /* If this isn't a "normal" writeable page, the domain is trying to
1748 * put pagetables in special memory of some kind. We can't allow that. */
1749 if ( (pg->u.inuse.type_info & PGT_type_mask) != PGT_writable_page )
1751 SHADOW_ERROR("can't remove write access to mfn %lx, type_info is %"
1752 PRtype_info "\n",
1753 mfn_x(gmfn), mfn_to_page(gmfn)->u.inuse.type_info);
1754 domain_crash(v->domain);
1757 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
1758 if ( v == current && level != 0 )
1760 unsigned long gfn;
1761 /* Heuristic: there is likely to be only one writeable mapping,
1762 * and that mapping is likely to be in the current pagetable,
1763 * in the guest's linear map (on non-HIGHPTE linux and windows) */
1765 #define GUESS(_a, _h) do { \
1766 if ( v->arch.paging.mode->shadow.guess_wrmap(v, (_a), gmfn) ) \
1767 perfc_incr(shadow_writeable_h_ ## _h); \
1768 if ( (pg->u.inuse.type_info & PGT_count_mask) == 0 ) \
1769 return 1; \
1770 } while (0)
1773 if ( v->arch.paging.mode->guest_levels == 2 )
1775 if ( level == 1 )
1776 /* 32bit non-PAE w2k3: linear map at 0xC0000000 */
1777 GUESS(0xC0000000UL + (fault_addr >> 10), 1);
1779 /* Linux lowmem: first 896MB is mapped 1-to-1 above 0xC0000000 */
1780 if ((gfn = mfn_to_gfn(v->domain, gmfn)) < 0x38000 )
1781 GUESS(0xC0000000UL + (gfn << PAGE_SHIFT), 4);
1784 #if CONFIG_PAGING_LEVELS >= 3
1785 else if ( v->arch.paging.mode->guest_levels == 3 )
1787 /* 32bit PAE w2k3: linear map at 0xC0000000 */
1788 switch ( level )
1790 case 1: GUESS(0xC0000000UL + (fault_addr >> 9), 2); break;
1791 case 2: GUESS(0xC0600000UL + (fault_addr >> 18), 2); break;
1794 /* Linux lowmem: first 896MB is mapped 1-to-1 above 0xC0000000 */
1795 if ((gfn = mfn_to_gfn(v->domain, gmfn)) < 0x38000 )
1796 GUESS(0xC0000000UL + (gfn << PAGE_SHIFT), 4);
1798 #if CONFIG_PAGING_LEVELS >= 4
1799 else if ( v->arch.paging.mode->guest_levels == 4 )
1801 /* 64bit w2k3: linear map at 0xfffff68000000000 */
1802 switch ( level )
1804 case 1: GUESS(0xfffff68000000000UL
1805 + ((fault_addr & VADDR_MASK) >> 9), 3); break;
1806 case 2: GUESS(0xfffff6fb40000000UL
1807 + ((fault_addr & VADDR_MASK) >> 18), 3); break;
1808 case 3: GUESS(0xfffff6fb7da00000UL
1809 + ((fault_addr & VADDR_MASK) >> 27), 3); break;
1812 /* 64bit Linux direct map at 0xffff810000000000; older kernels
1813 * had it at 0x0000010000000000UL */
1814 gfn = mfn_to_gfn(v->domain, gmfn);
1815 GUESS(0xffff810000000000UL + (gfn << PAGE_SHIFT), 4);
1816 GUESS(0x0000010000000000UL + (gfn << PAGE_SHIFT), 4);
1818 #endif /* CONFIG_PAGING_LEVELS >= 4 */
1819 #endif /* CONFIG_PAGING_LEVELS >= 3 */
1821 #undef GUESS
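/* The GUESS() addresses above are just linear-map arithmetic: with the
 * guest pagetables self-mapped at base B, the level-1 entry covering
 * virtual address A sits at B + (A >> 12) * entry_size, i.e. B + (A >> 10)
 * for 4-byte non-PAE entries and B + (A >> 9) for 8-byte PAE/64-bit
 * entries; each higher level divides by a further 512 (>> 18, >> 27). */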
1824 if ( (pg->u.inuse.type_info & PGT_count_mask) == 0 )
1825 return 1;
1827 /* Second heuristic: on HIGHPTE linux, there are two particular PTEs
1828 * (entries in the fixmap) where linux maps its pagetables. Since
1829 * we expect to hit them most of the time, we start the search for
1830 * the writeable mapping by looking at the same MFN where the last
1831 * brute-force search succeeded. */
1833 if ( v->arch.paging.shadow.last_writeable_pte_smfn != 0 )
1835 unsigned long old_count = (pg->u.inuse.type_info & PGT_count_mask);
1836 mfn_t last_smfn = _mfn(v->arch.paging.shadow.last_writeable_pte_smfn);
1837 int shtype = mfn_to_shadow_page(last_smfn)->type;
1839 if ( callbacks[shtype] )
1840 callbacks[shtype](v, last_smfn, gmfn);
1842 if ( (pg->u.inuse.type_info & PGT_count_mask) != old_count )
1843 perfc_incr(shadow_writeable_h_5);
1846 if ( (pg->u.inuse.type_info & PGT_count_mask) == 0 )
1847 return 1;
1849 #endif /* SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC */
1851 /* Brute-force search of all the shadows, by walking the hash */
1852 perfc_incr(shadow_writeable_bf);
1853 hash_foreach(v, callback_mask, callbacks, gmfn);
1855 /* If that didn't catch the mapping, then there's some non-pagetable
1856 * mapping -- ioreq page, grant mapping, &c. */
1857 if ( (mfn_to_page(gmfn)->u.inuse.type_info & PGT_count_mask) != 0 )
1859 SHADOW_ERROR("can't remove write access to mfn %lx: guest has "
1860 "%lu special-use mappings of it\n", mfn_x(gmfn),
1861 (mfn_to_page(gmfn)->u.inuse.type_info&PGT_count_mask));
1862 domain_crash(v->domain);
1865 /* We killed at least one writeable mapping, so must flush TLBs. */
1866 return 1;
1871 /**************************************************************************/
1872 /* Remove all mappings of a guest frame from the shadow tables.
1873 * Returns non-zero if we need to flush TLBs. */
1875 int sh_remove_all_mappings(struct vcpu *v, mfn_t gmfn)
1877 struct page_info *page = mfn_to_page(gmfn);
1878 int expected_count, do_locking;
1880 /* Dispatch table for getting per-type functions */
1881 static hash_callback_t callbacks[SH_type_unused] = {
1882 NULL, /* none */
1883 #if CONFIG_PAGING_LEVELS == 2
1884 SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1,2,2), /* l1_32 */
1885 SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1,2,2), /* fl1_32 */
1886 #else
1887 SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1,3,2), /* l1_32 */
1888 SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1,3,2), /* fl1_32 */
1889 #endif
1890 NULL, /* l2_32 */
1891 #if CONFIG_PAGING_LEVELS >= 3
1892 SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1,3,3), /* l1_pae */
1893 SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1,3,3), /* fl1_pae */
1894 #else
1895 NULL, /* l1_pae */
1896 NULL, /* fl1_pae */
1897 #endif
1898 NULL, /* l2_pae */
1899 NULL, /* l2h_pae */
1900 #if CONFIG_PAGING_LEVELS >= 4
1901 SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1,4,4), /* l1_64 */
1902 SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1,4,4), /* fl1_64 */
1903 #else
1904 NULL, /* l1_64 */
1905 NULL, /* fl1_64 */
1906 #endif
1907 NULL, /* l2_64 */
1908 NULL, /* l2h_64 */
1909 NULL, /* l3_64 */
1910 NULL, /* l4_64 */
1911 NULL, /* p2m */
1912 NULL /* unused */
1913 };
1915 static unsigned int callback_mask =
1916 1 << SH_type_l1_32_shadow
1917 | 1 << SH_type_fl1_32_shadow
1918 | 1 << SH_type_l1_pae_shadow
1919 | 1 << SH_type_fl1_pae_shadow
1920 | 1 << SH_type_l1_64_shadow
1921 | 1 << SH_type_fl1_64_shadow
1924 perfc_incr(shadow_mappings);
1925 if ( (page->count_info & PGC_count_mask) == 0 )
1926 return 0;
1928 /* Although this is an externally visible function, we do not know
1929 * whether the shadow lock will be held when it is called (since it
1930 * can be called via put_page_type when we clear a shadow l1e).
1931 * If the lock isn't held, take it for the duration of the call. */
1932 do_locking = !shadow_locked_by_me(v->domain);
1933 if ( do_locking ) shadow_lock(v->domain);
1935 /* XXX TODO:
1936 * Heuristics for finding the (probably) single mapping of this gmfn */
1938 /* Brute-force search of all the shadows, by walking the hash */
1939 perfc_incr(shadow_mappings_bf);
1940 hash_foreach(v, callback_mask, callbacks, gmfn);
1942 /* If that didn't catch the mapping, something is very wrong */
1943 expected_count = (page->count_info & PGC_allocated) ? 1 : 0;
1944 if ( (page->count_info & PGC_count_mask) != expected_count )
1946 /* Don't complain if we're in HVM and there are some extra mappings:
1947 * The qemu helper process has an untyped mapping of this dom's RAM
1948 * and the HVM restore program takes another. */
1949 if ( !(shadow_mode_external(v->domain)
1950 && (page->count_info & PGC_count_mask) <= 3
1951 && (page->u.inuse.type_info & PGT_count_mask) == 0) )
1953 SHADOW_ERROR("can't find all mappings of mfn %lx: "
1954 "c=%08x t=%08lx\n", mfn_x(gmfn),
1955 page->count_info, page->u.inuse.type_info);
1959 if ( do_locking ) shadow_unlock(v->domain);
1961 /* We killed at least one mapping, so must flush TLBs. */
1962 return 1;
1966 /**************************************************************************/
1967 /* Remove all shadows of a guest frame from the shadow tables */
1969 static int sh_remove_shadow_via_pointer(struct vcpu *v, mfn_t smfn)
1970 /* Follow this shadow's up-pointer, if it has one, and remove the reference
1971 * found there. Returns 1 if that was the only reference to this shadow */
1973 struct shadow_page_info *sp = mfn_to_shadow_page(smfn);
1974 mfn_t pmfn;
1975 void *vaddr;
1976 int rc;
1978 ASSERT(sp->type > 0);
1979 ASSERT(sp->type < SH_type_max_shadow);
1980 ASSERT(sp->type != SH_type_l2_32_shadow);
1981 ASSERT(sp->type != SH_type_l2_pae_shadow);
1982 ASSERT(sp->type != SH_type_l2h_pae_shadow);
1983 ASSERT(sp->type != SH_type_l4_64_shadow);
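/* The types excluded by the ASSERTs above are top-level shadows, which are
 * referenced from pinning lists or a vcpu's top-level slots rather than
 * from a single parent entry.  For everything else, sp->up packs the
 * machine address of the one shadow entry that references this shadow:
 * frame number in the upper bits, byte offset in the low 12 bits. */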
1985 if (sp->up == 0) return 0;
1986 pmfn = _mfn(sp->up >> PAGE_SHIFT);
1987 ASSERT(mfn_valid(pmfn));
1988 vaddr = sh_map_domain_page(pmfn);
1989 ASSERT(vaddr);
1990 vaddr += sp->up & (PAGE_SIZE-1);
1991 ASSERT(l1e_get_pfn(*(l1_pgentry_t *)vaddr) == mfn_x(smfn));
1993 /* Is this the only reference to this shadow? */
1994 rc = (sp->count == 1) ? 1 : 0;
1996 /* Blank the offending entry */
1997 switch (sp->type)
1999 case SH_type_l1_32_shadow:
2000 case SH_type_l2_32_shadow:
2001 #if CONFIG_PAGING_LEVELS == 2
2002 SHADOW_INTERNAL_NAME(sh_clear_shadow_entry,2,2)(v, vaddr, pmfn);
2003 #else
2004 SHADOW_INTERNAL_NAME(sh_clear_shadow_entry,3,2)(v, vaddr, pmfn);
2005 #endif
2006 break;
2007 #if CONFIG_PAGING_LEVELS >=3
2008 case SH_type_l1_pae_shadow:
2009 case SH_type_l2_pae_shadow:
2010 case SH_type_l2h_pae_shadow:
2011 SHADOW_INTERNAL_NAME(sh_clear_shadow_entry,3,3)(v, vaddr, pmfn);
2012 break;
2013 #if CONFIG_PAGING_LEVELS >= 4
2014 case SH_type_l1_64_shadow:
2015 case SH_type_l2_64_shadow:
2016 case SH_type_l2h_64_shadow:
2017 case SH_type_l3_64_shadow:
2018 case SH_type_l4_64_shadow:
2019 SHADOW_INTERNAL_NAME(sh_clear_shadow_entry,4,4)(v, vaddr, pmfn);
2020 break;
2021 #endif
2022 #endif
2023 default: BUG(); /* Some weird unknown shadow type */
2026 sh_unmap_domain_page(vaddr);
2027 if ( rc )
2028 perfc_incr(shadow_up_pointer);
2029 else
2030 perfc_incr(shadow_unshadow_bf);
2032 return rc;
2035 void sh_remove_shadows(struct vcpu *v, mfn_t gmfn, int fast, int all)
2036 /* Remove the shadows of this guest page.
2037 * If fast != 0, just try the quick heuristic, which will remove
2038 * at most one reference to each shadow of the page. Otherwise, walk
2039 * all the shadow tables looking for refs to shadows of this gmfn.
2040 * If all != 0, kill the domain if we can't find all the shadows.
2041 * (all != 0 implies fast == 0)
2042 */
2044 struct page_info *pg = mfn_to_page(gmfn);
2045 mfn_t smfn;
2046 u32 sh_flags;
2047 int do_locking;
2048 unsigned char t;
2050 /* Dispatch table for getting per-type functions: each level must
2051 * be called with the function to remove a lower-level shadow. */
2052 static hash_callback_t callbacks[SH_type_unused] = {
2053 NULL, /* none */
2054 NULL, /* l1_32 */
2055 NULL, /* fl1_32 */
2056 #if CONFIG_PAGING_LEVELS == 2
2057 SHADOW_INTERNAL_NAME(sh_remove_l1_shadow,2,2), /* l2_32 */
2058 #else
2059 SHADOW_INTERNAL_NAME(sh_remove_l1_shadow,3,2), /* l2_32 */
2060 #endif
2061 NULL, /* l1_pae */
2062 NULL, /* fl1_pae */
2063 #if CONFIG_PAGING_LEVELS >= 3
2064 SHADOW_INTERNAL_NAME(sh_remove_l1_shadow,3,3), /* l2_pae */
2065 SHADOW_INTERNAL_NAME(sh_remove_l1_shadow,3,3), /* l2h_pae */
2066 #else
2067 NULL, /* l2_pae */
2068 NULL, /* l2h_pae */
2069 #endif
2070 NULL, /* l1_64 */
2071 NULL, /* fl1_64 */
2072 #if CONFIG_PAGING_LEVELS >= 4
2073 SHADOW_INTERNAL_NAME(sh_remove_l1_shadow,4,4), /* l2_64 */
2074 SHADOW_INTERNAL_NAME(sh_remove_l1_shadow,4,4), /* l2h_64 */
2075 SHADOW_INTERNAL_NAME(sh_remove_l2_shadow,4,4), /* l3_64 */
2076 SHADOW_INTERNAL_NAME(sh_remove_l3_shadow,4,4), /* l4_64 */
2077 #else
2078 NULL, /* l2_64 */
2079 NULL, /* l2h_64 */
2080 NULL, /* l3_64 */
2081 NULL, /* l4_64 */
2082 #endif
2083 NULL, /* p2m */
2084 NULL /* unused */
2085 };
2087 /* Another lookup table, for choosing which mask to use */
2088 static unsigned int masks[SH_type_unused] = {
2089 0, /* none */
2090 1 << SH_type_l2_32_shadow, /* l1_32 */
2091 0, /* fl1_32 */
2092 0, /* l2_32 */
2093 ((1 << SH_type_l2h_pae_shadow)
2094 | (1 << SH_type_l2_pae_shadow)), /* l1_pae */
2095 0, /* fl1_pae */
2096 0, /* l2_pae */
2097 0, /* l2h_pae */
2098 ((1 << SH_type_l2h_64_shadow)
2099 | (1 << SH_type_l2_64_shadow)), /* l1_64 */
2100 0, /* fl1_64 */
2101 1 << SH_type_l3_64_shadow, /* l2_64 */
2102 1 << SH_type_l3_64_shadow, /* l2h_64 */
2103 1 << SH_type_l4_64_shadow, /* l3_64 */
2104 0, /* l4_64 */
2105 0, /* p2m */
2106 0 /* unused */
2107 };
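/* masks[t] selects which higher-level shadow types hash_foreach() must
 * search for references to a type-t shadow: an l1_32 can only be pointed
 * to by an l2_32, an l1_pae by an l2_pae or l2h_pae, an l2_64/l2h_64 by
 * an l3_64, and so on.  fl1 shadows and top-level shadows have no parent
 * entries to excise, hence the zero masks. */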
2109 ASSERT(!(all && fast));
2111 /* Although this is an externally visible function, we do not know
2112 * whether the shadow lock will be held when it is called (since it
2113 * can be called via put_page_type when we clear a shadow l1e).
2114 * If the lock isn't held, take it for the duration of the call. */
2115 do_locking = !shadow_locked_by_me(v->domain);
2116 if ( do_locking ) shadow_lock(v->domain);
2118 SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx\n",
2119 v->domain->domain_id, v->vcpu_id, mfn_x(gmfn));
2121 /* Bail out now if the page is not shadowed */
2122 if ( (pg->count_info & PGC_page_table) == 0 )
2124 if ( do_locking ) shadow_unlock(v->domain);
2125 return;
2128 /* Search for this shadow in all appropriate shadows */
2129 perfc_incr(shadow_unshadow);
2130 sh_flags = pg->shadow_flags;
2132 /* Lower-level shadows need to be excised from upper-level shadows.
2133 * This call to hash_foreach() looks dangerous but is in fact OK: each
2134 * call will remove at most one shadow, and terminate immediately when
2135 * it does remove it, so we never walk the hash after doing a deletion. */
2136 #define DO_UNSHADOW(_type) do { \
2137 t = (_type); \
2138 smfn = shadow_hash_lookup(v, mfn_x(gmfn), t); \
2139 if ( unlikely(!mfn_valid(smfn)) ) \
2140 { \
2141 SHADOW_ERROR(": gmfn %#lx has flags 0x%"PRIx32 \
2142 " but no type-0x%"PRIx32" shadow\n", \
2143 mfn_x(gmfn), sh_flags, t); \
2144 break; \
2145 } \
2146 if ( sh_type_is_pinnable(v, t) ) \
2147 sh_unpin(v, smfn); \
2148 else \
2149 sh_remove_shadow_via_pointer(v, smfn); \
2150 if ( (pg->count_info & PGC_page_table) && !fast ) \
2151 hash_foreach(v, masks[t], callbacks, smfn); \
2152 } while (0)
2154 if ( sh_flags & SHF_L1_32 ) DO_UNSHADOW(SH_type_l1_32_shadow);
2155 if ( sh_flags & SHF_L2_32 ) DO_UNSHADOW(SH_type_l2_32_shadow);
2156 #if CONFIG_PAGING_LEVELS >= 3
2157 if ( sh_flags & SHF_L1_PAE ) DO_UNSHADOW(SH_type_l1_pae_shadow);
2158 if ( sh_flags & SHF_L2_PAE ) DO_UNSHADOW(SH_type_l2_pae_shadow);
2159 if ( sh_flags & SHF_L2H_PAE ) DO_UNSHADOW(SH_type_l2h_pae_shadow);
2160 #if CONFIG_PAGING_LEVELS >= 4
2161 if ( sh_flags & SHF_L1_64 ) DO_UNSHADOW(SH_type_l1_64_shadow);
2162 if ( sh_flags & SHF_L2_64 ) DO_UNSHADOW(SH_type_l2_64_shadow);
2163 if ( sh_flags & SHF_L2H_64 ) DO_UNSHADOW(SH_type_l2h_64_shadow);
2164 if ( sh_flags & SHF_L3_64 ) DO_UNSHADOW(SH_type_l3_64_shadow);
2165 if ( sh_flags & SHF_L4_64 ) DO_UNSHADOW(SH_type_l4_64_shadow);
2166 #endif
2167 #endif
2169 #undef DO_UNSHADOW
2171 /* If that didn't catch the shadows, something is wrong */
2172 if ( !fast && (pg->count_info & PGC_page_table) )
2174 SHADOW_ERROR("can't find all shadows of mfn %05lx "
2175 "(shadow_flags=%08lx)\n",
2176 mfn_x(gmfn), pg->shadow_flags);
2177 if ( all )
2178 domain_crash(v->domain);
2181 /* Need to flush TLBs now, so that linear maps are safe next time we
2182 * take a fault. */
2183 flush_tlb_mask(v->domain->domain_dirty_cpumask);
2185 if ( do_locking ) shadow_unlock(v->domain);
2188 static void
2189 sh_remove_all_shadows_and_parents(struct vcpu *v, mfn_t gmfn)
2190 /* Even harsher: this is an HVM page that we think is no longer a pagetable.
2191 * Unshadow it, and recursively unshadow pages that reference it. */
2193 sh_remove_shadows(v, gmfn, 0, 1);
2194 /* XXX TODO:
2195 * Rework this hashtable walker to return a linked-list of all
2196 * the shadows it modified, then do breadth-first recursion
2197 * to find the way up to higher-level tables and unshadow them too.
2199 * The current code (just tearing down each page's shadows as we
2200 * detect that it is not a pagetable) is correct, but very slow.
2201 * It means extra emulated writes and slows down removal of mappings. */
2204 /**************************************************************************/
2206 static void sh_update_paging_modes(struct vcpu *v)
2208 struct domain *d = v->domain;
2209 struct paging_mode *old_mode = v->arch.paging.mode;
2211 ASSERT(shadow_locked_by_me(d));
2213 #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
2214 /* Make sure this vcpu has a virtual TLB array allocated */
2215 if ( unlikely(!v->arch.paging.vtlb) )
2217 v->arch.paging.vtlb = xmalloc_array(struct shadow_vtlb, VTLB_ENTRIES);
2218 if ( unlikely(!v->arch.paging.vtlb) )
2220 SHADOW_ERROR("Could not allocate vTLB space for dom %u vcpu %u\n",
2221 d->domain_id, v->vcpu_id);
2222 domain_crash(v->domain);
2223 return;
2225 memset(v->arch.paging.vtlb, 0,
2226 VTLB_ENTRIES * sizeof (struct shadow_vtlb));
2227 spin_lock_init(&v->arch.paging.vtlb_lock);
2229 #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */
2231 // Valid transitions handled by this function:
2232 // - For PV guests:
2233 // - after a shadow mode has been changed
2234 // - For HVM guests:
2235 // - after a shadow mode has been changed
2236 // - changes in CR0.PG, CR4.PAE, CR4.PSE, or CR4.PGE
2237 //
2239 // First, tear down any old shadow tables held by this vcpu.
2240 //
2241 if ( v->arch.paging.mode )
2242 v->arch.paging.mode->shadow.detach_old_tables(v);
2244 if ( !is_hvm_domain(d) )
2246 ///
2247 /// PV guest
2248 ///
2249 #if CONFIG_PAGING_LEVELS == 4
2250 v->arch.paging.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,4,4);
2251 #elif CONFIG_PAGING_LEVELS == 3
2252 v->arch.paging.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,3,3);
2253 #elif CONFIG_PAGING_LEVELS == 2
2254 v->arch.paging.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,2,2);
2255 #else
2256 #error unexpected paging mode
2257 #endif
2259 else
2261 ///
2262 /// HVM guest
2263 ///
2264 ASSERT(shadow_mode_translate(d));
2265 ASSERT(shadow_mode_external(d));
2267 if ( !hvm_paging_enabled(v) )
2269 /* When the guest has CR0.PG clear, we provide a 32-bit, non-PAE
2270 * pagetable for it, mapping 4 GB one-to-one using a single l2
2271 * page of 1024 superpage mappings */
2272 v->arch.guest_table = d->arch.paging.shadow.unpaged_pagetable;
2273 #if CONFIG_PAGING_LEVELS >= 3
2274 v->arch.paging.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode, 3, 2);
2275 #else
2276 v->arch.paging.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode, 2, 2);
2277 #endif
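/* v->arch.guest_table now points at the 1024-entry superpage table that
 * shadow_enable() builds for HVM domains (see the is_hvm_domain() block
 * there) and records in d->arch.paging.shadow.unpaged_pagetable; the
 * 3-on-2 or 2-on-2 mode then shadows that table like any other guest
 * pagetable. */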
2279 else
2281 #ifdef __x86_64__
2282 if ( hvm_long_mode_enabled(v) )
2284 // long mode guest...
2285 v->arch.paging.mode =
2286 &SHADOW_INTERNAL_NAME(sh_paging_mode, 4, 4);
2288 else
2289 #endif
2290 if ( hvm_pae_enabled(v) )
2292 #if CONFIG_PAGING_LEVELS >= 3
2293 // 32-bit PAE mode guest...
2294 v->arch.paging.mode =
2295 &SHADOW_INTERNAL_NAME(sh_paging_mode, 3, 3);
2296 #else
2297 SHADOW_ERROR("PAE not supported in 32-bit Xen\n");
2298 domain_crash(d);
2299 return;
2300 #endif
2302 else
2304 // 32-bit 2 level guest...
2305 #if CONFIG_PAGING_LEVELS >= 3
2306 v->arch.paging.mode =
2307 &SHADOW_INTERNAL_NAME(sh_paging_mode, 3, 2);
2308 #else
2309 v->arch.paging.mode =
2310 &SHADOW_INTERNAL_NAME(sh_paging_mode, 2, 2);
2311 #endif
2315 if ( pagetable_is_null(v->arch.monitor_table) )
2317 mfn_t mmfn = v->arch.paging.mode->shadow.make_monitor_table(v);
2318 v->arch.monitor_table = pagetable_from_mfn(mmfn);
2319 make_cr3(v, mfn_x(mmfn));
2320 hvm_update_host_cr3(v);
2323 if ( v->arch.paging.mode != old_mode )
2325 SHADOW_PRINTK("new paging mode: d=%u v=%u pe=%d g=%u s=%u "
2326 "(was g=%u s=%u)\n",
2327 d->domain_id, v->vcpu_id,
2328 is_hvm_domain(d) ? hvm_paging_enabled(v) : 1,
2329 v->arch.paging.mode->guest_levels,
2330 v->arch.paging.mode->shadow.shadow_levels,
2331 old_mode ? old_mode->guest_levels : 0,
2332 old_mode ? old_mode->shadow.shadow_levels : 0);
2333 if ( old_mode &&
2334 (v->arch.paging.mode->shadow.shadow_levels !=
2335 old_mode->shadow.shadow_levels) )
2337 /* Need to make a new monitor table for the new mode */
2338 mfn_t new_mfn, old_mfn;
2340 if ( v != current && vcpu_runnable(v) )
2342 SHADOW_ERROR("Some third party (d=%u v=%u) is changing "
2343 "this HVM vcpu's (d=%u v=%u) paging mode "
2344 "while it is running.\n",
2345 current->domain->domain_id, current->vcpu_id,
2346 v->domain->domain_id, v->vcpu_id);
2347 /* It's not safe to do that because we can't change
2348 * the host CR3 for a running domain */
2349 domain_crash(v->domain);
2350 return;
2353 old_mfn = pagetable_get_mfn(v->arch.monitor_table);
2354 v->arch.monitor_table = pagetable_null();
2355 new_mfn = v->arch.paging.mode->shadow.make_monitor_table(v);
2356 v->arch.monitor_table = pagetable_from_mfn(new_mfn);
2357 SHADOW_PRINTK("new monitor table %"PRI_mfn "\n",
2358 mfn_x(new_mfn));
2360 /* Don't be running on the old monitor table when we
2361 * pull it down! Switch CR3, and warn the HVM code that
2362 * its host cr3 has changed. */
2363 make_cr3(v, mfn_x(new_mfn));
2364 if ( v == current )
2365 write_ptbase(v);
2366 hvm_update_host_cr3(v);
2367 old_mode->shadow.destroy_monitor_table(v, old_mfn);
2371 // XXX -- Need to deal with changes in CR4.PSE and CR4.PGE.
2372 // These are HARD: think about the case where two CPU's have
2373 // different values for CR4.PSE and CR4.PGE at the same time.
2374 // This *does* happen, at least for CR4.PGE...
2377 v->arch.paging.mode->update_cr3(v, 0);
2380 void shadow_update_paging_modes(struct vcpu *v)
2382 shadow_lock(v->domain);
2383 sh_update_paging_modes(v);
2384 shadow_unlock(v->domain);
2387 /**************************************************************************/
2388 /* Turning on and off shadow features */
2390 static void sh_new_mode(struct domain *d, u32 new_mode)
2391 /* Inform all the vcpus that the shadow mode has been changed */
2393 struct vcpu *v;
2395 ASSERT(shadow_locked_by_me(d));
2396 ASSERT(d != current->domain);
2397 d->arch.paging.mode = new_mode;
2398 for_each_vcpu(d, v)
2399 sh_update_paging_modes(v);
2402 int shadow_enable(struct domain *d, u32 mode)
2403 /* Turn on "permanent" shadow features: external, translate, refcount.
2404 * Can only be called once on a domain, and these features cannot be
2405 * disabled.
2406 * Returns 0 for success, -errno for failure. */
2408 unsigned int old_pages;
2409 struct page_info *pg = NULL;
2410 uint32_t *e;
2411 int i, rv = 0;
2413 mode |= PG_SH_enable;
2415 domain_pause(d);
2417 /* Sanity check the arguments */
2418 if ( (d == current->domain) ||
2419 shadow_mode_enabled(d) ||
2420 ((mode & PG_translate) && !(mode & PG_refcounts)) ||
2421 ((mode & PG_external) && !(mode & PG_translate)) )
2423 rv = -EINVAL;
2424 goto out_unlocked;
2427 /* Init the shadow memory allocation if the user hasn't done so */
2428 old_pages = d->arch.paging.shadow.total_pages;
2429 if ( old_pages == 0 )
2431 unsigned int r;
2432 shadow_lock(d);
2433 r = sh_set_allocation(d, 256, NULL); /* Use at least 1MB */
2434 shadow_unlock(d);
2435 if ( r != 0 )
2437 sh_set_allocation(d, 0, NULL);
2438 rv = -ENOMEM;
2439 goto out_unlocked;
2443 /* Init the P2M table. Must be done before we take the shadow lock
2444 * to avoid possible deadlock. */
2445 if ( mode & PG_translate )
2447 rv = p2m_alloc_table(d, shadow_alloc_p2m_page, shadow_free_p2m_page);
2448 if (rv != 0)
2449 goto out_unlocked;
2452 /* HVM domains need an extra pagetable for vcpus that think they
2453 * have paging disabled */
2454 if ( is_hvm_domain(d) )
2456 /* Get a single page from the shadow pool. Take it via the
2457 * P2M interface to make freeing it simpler afterwards. */
2458 pg = shadow_alloc_p2m_page(d);
2459 if ( pg == NULL )
2461 rv = -ENOMEM;
2462 goto out_unlocked;
2464 /* Fill it with 32-bit, non-PAE superpage entries, each mapping 4MB
2465 * of virtual address space onto the same physical address range */
2466 e = sh_map_domain_page(page_to_mfn(pg));
2467 for ( i = 0; i < PAGE_SIZE / sizeof(*e); i++ )
2468 e[i] = ((0x400000U * i)
2469 | _PAGE_PRESENT | _PAGE_RW | _PAGE_USER
2470 | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
2471 sh_unmap_domain_page(e);
2472 pg->u.inuse.type_info = PGT_l2_page_table | 1 | PGT_validated;
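/* Each of the 1024 entries maps a 4MB superpage (_PAGE_PSE) at virtual
 * 4MB*i onto physical 4MB*i, so the table covers exactly 1024 * 4MB = 4GB
 * one-to-one, which is what a vcpu running with paging disabled expects. */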
2475 shadow_lock(d);
2477 /* Sanity check again with the lock held */
2478 if ( shadow_mode_enabled(d) )
2480 rv = -EINVAL;
2481 goto out_locked;
2484 /* Init the hash table */
2485 if ( shadow_hash_alloc(d) != 0 )
2487 rv = -ENOMEM;
2488 goto out_locked;
2491 #if (SHADOW_OPTIMIZATIONS & SHOPT_LINUX_L3_TOPLEVEL)
2492 /* We assume we're dealing with an older 64bit linux guest until we
2493 * see the guest use more than one l4 per vcpu. */
2494 d->arch.paging.shadow.opt_flags = SHOPT_LINUX_L3_TOPLEVEL;
2495 #endif
2497 /* Record the 1-to-1 pagetable we just made */
2498 if ( is_hvm_domain(d) )
2499 d->arch.paging.shadow.unpaged_pagetable = pagetable_from_page(pg);
2501 /* Update the bits */
2502 sh_new_mode(d, mode);
2504 out_locked:
2505 shadow_unlock(d);
2506 out_unlocked:
2507 if ( rv != 0 && !pagetable_is_null(d->arch.phys_table) )
2508 p2m_teardown(d);
2509 if ( rv != 0 && pg != NULL )
2510 shadow_free_p2m_page(d, pg);
2511 domain_unpause(d);
2512 return rv;
2515 void shadow_teardown(struct domain *d)
2516 /* Destroy the shadow pagetables of this domain and free its shadow memory.
2517 * Should only be called for dying domains. */
2519 struct vcpu *v;
2520 mfn_t mfn;
2521 struct list_head *entry, *n;
2522 struct page_info *pg;
2524 ASSERT(d->is_dying);
2525 ASSERT(d != current->domain);
2527 if ( !shadow_locked_by_me(d) )
2528 shadow_lock(d); /* Keep various asserts happy */
2530 if ( shadow_mode_enabled(d) )
2532 /* Release the shadow and monitor tables held by each vcpu */
2533 for_each_vcpu(d, v)
2535 if ( v->arch.paging.mode )
2537 v->arch.paging.mode->shadow.detach_old_tables(v);
2538 if ( shadow_mode_external(d) )
2540 mfn = pagetable_get_mfn(v->arch.monitor_table);
2541 if ( mfn_valid(mfn) && (mfn_x(mfn) != 0) )
2542 v->arch.paging.mode->shadow.destroy_monitor_table(v, mfn);
2543 v->arch.monitor_table = pagetable_null();
2549 #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
2550 /* Free the virtual-TLB array attached to each vcpu */
2551 for_each_vcpu(d, v)
2553 if ( v->arch.paging.vtlb )
2555 xfree(v->arch.paging.vtlb);
2556 v->arch.paging.vtlb = NULL;
2559 #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */
2561 list_for_each_safe(entry, n, &d->arch.paging.shadow.p2m_freelist)
2563 list_del(entry);
2564 pg = list_entry(entry, struct page_info, list);
2565 shadow_free_p2m_page(d, pg);
2568 if ( d->arch.paging.shadow.total_pages != 0 )
2570 SHADOW_PRINTK("teardown of domain %u starts."
2571 " Shadow pages total = %u, free = %u, p2m=%u\n",
2572 d->domain_id,
2573 d->arch.paging.shadow.total_pages,
2574 d->arch.paging.shadow.free_pages,
2575 d->arch.paging.shadow.p2m_pages);
2576 /* Destroy all the shadows and release memory to domheap */
2577 sh_set_allocation(d, 0, NULL);
2578 /* Release the hash table back to xenheap */
2579 if (d->arch.paging.shadow.hash_table)
2580 shadow_hash_teardown(d);
2581 /* Should not have any more memory held */
2582 SHADOW_PRINTK("teardown done."
2583 " Shadow pages total = %u, free = %u, p2m=%u\n",
2584 d->arch.paging.shadow.total_pages,
2585 d->arch.paging.shadow.free_pages,
2586 d->arch.paging.shadow.p2m_pages);
2587 ASSERT(d->arch.paging.shadow.total_pages == 0);
2590 /* Free the non-paged-vcpus pagetable; must happen after we've
2591 * destroyed any shadows of it or sh_destroy_shadow will get confused. */
2592 if ( !pagetable_is_null(d->arch.paging.shadow.unpaged_pagetable) )
2594 for_each_vcpu(d, v)
2596 ASSERT(is_hvm_vcpu(v));
2597 if ( !hvm_paging_enabled(v) )
2598 v->arch.guest_table = pagetable_null();
2600 shadow_free_p2m_page(d,
2601 pagetable_get_page(d->arch.paging.shadow.unpaged_pagetable));
2602 d->arch.paging.shadow.unpaged_pagetable = pagetable_null();
2605 /* We leave the "permanent" shadow modes enabled, but clear the
2606 * log-dirty mode bit. We don't want any more mark_dirty()
2607 * calls now that we've torn down the bitmap */
2608 d->arch.paging.mode &= ~PG_log_dirty;
2610 shadow_unlock(d);
2613 void shadow_final_teardown(struct domain *d)
2614 /* Called by arch_domain_destroy(), when it's safe to pull down the p2m map. */
2616 SHADOW_PRINTK("dom %u final teardown starts."
2617 " Shadow pages total = %u, free = %u, p2m=%u\n",
2618 d->domain_id,
2619 d->arch.paging.shadow.total_pages,
2620 d->arch.paging.shadow.free_pages,
2621 d->arch.paging.shadow.p2m_pages);
2623 /* Double-check that the domain didn't have any shadow memory.
2624 * It is possible for a domain that never got domain_kill()ed
2625 * to get here with its shadow allocation intact. */
2626 if ( d->arch.paging.shadow.total_pages != 0 )
2627 shadow_teardown(d);
2629 /* It is now safe to pull down the p2m map. */
2630 p2m_teardown(d);
2632 SHADOW_PRINTK("dom %u final teardown done."
2633 " Shadow pages total = %u, free = %u, p2m=%u\n",
2634 d->domain_id,
2635 d->arch.paging.shadow.total_pages,
2636 d->arch.paging.shadow.free_pages,
2637 d->arch.paging.shadow.p2m_pages);
2640 static int shadow_one_bit_enable(struct domain *d, u32 mode)
2641 /* Turn on a single shadow mode feature */
2643 ASSERT(shadow_locked_by_me(d));
2645 /* Sanity check the call */
2646 if ( d == current->domain || (d->arch.paging.mode & mode) == mode )
2648 return -EINVAL;
2651 mode |= PG_SH_enable;
2653 if ( d->arch.paging.mode == 0 )
2655 /* Init the shadow memory allocation and the hash table */
2656 if ( sh_set_allocation(d, 1, NULL) != 0
2657 || shadow_hash_alloc(d) != 0 )
2659 sh_set_allocation(d, 0, NULL);
2660 return -ENOMEM;
2664 /* Update the bits */
2665 sh_new_mode(d, d->arch.paging.mode | mode);
2667 return 0;
2670 static int shadow_one_bit_disable(struct domain *d, u32 mode)
2671 /* Turn off a single shadow mode feature */
2673 struct vcpu *v;
2674 ASSERT(shadow_locked_by_me(d));
2676 /* Sanity check the call */
2677 if ( d == current->domain || !((d->arch.paging.mode & mode) == mode) )
2679 return -EINVAL;
2682 /* Update the bits */
2683 sh_new_mode(d, d->arch.paging.mode & ~mode);
2684 if ( d->arch.paging.mode == 0 )
2686 /* Get this domain off shadows */
2687 SHADOW_PRINTK("un-shadowing of domain %u starts."
2688 " Shadow pages total = %u, free = %u, p2m=%u\n",
2689 d->domain_id,
2690 d->arch.paging.shadow.total_pages,
2691 d->arch.paging.shadow.free_pages,
2692 d->arch.paging.shadow.p2m_pages);
2693 for_each_vcpu(d, v)
2695 if ( v->arch.paging.mode )
2696 v->arch.paging.mode->shadow.detach_old_tables(v);
2697 #if CONFIG_PAGING_LEVELS == 4
2698 if ( !(v->arch.flags & TF_kernel_mode) )
2699 make_cr3(v, pagetable_get_pfn(v->arch.guest_table_user));
2700 else
2701 #endif
2702 make_cr3(v, pagetable_get_pfn(v->arch.guest_table));
2706 /* Pull down the memory allocation */
2707 if ( sh_set_allocation(d, 0, NULL) != 0 )
2709 // XXX - How can this occur?
2710 // Seems like a bug to return an error now that we've
2711 // disabled the relevant shadow mode.
2712 //
2713 return -ENOMEM;
2715 shadow_hash_teardown(d);
2716 SHADOW_PRINTK("un-shadowing of domain %u done."
2717 " Shadow pages total = %u, free = %u, p2m=%u\n",
2718 d->domain_id,
2719 d->arch.paging.shadow.total_pages,
2720 d->arch.paging.shadow.free_pages,
2721 d->arch.paging.shadow.p2m_pages);
2724 return 0;
2727 /* Enable/disable ops for the "test" and "log-dirty" modes */
2728 static int shadow_test_enable(struct domain *d)
2730 int ret;
2732 domain_pause(d);
2733 shadow_lock(d);
2734 ret = shadow_one_bit_enable(d, PG_SH_enable);
2735 shadow_unlock(d);
2736 domain_unpause(d);
2738 return ret;
2741 static int shadow_test_disable(struct domain *d)
2743 int ret;
2745 domain_pause(d);
2746 shadow_lock(d);
2747 ret = shadow_one_bit_disable(d, PG_SH_enable);
2748 shadow_unlock(d);
2749 domain_unpause(d);
2751 return ret;
2754 /**************************************************************************/
2755 /* P2M map manipulations */
2757 /* Shadow-specific code which should be called when a P2M table entry is
2758 * updated with new content. It is responsible for updating the entry, as
2759 * well as for other shadow processing jobs.
2760 */
2761 void
2762 shadow_write_p2m_entry(struct vcpu *v, unsigned long gfn,
2763 l1_pgentry_t *p, mfn_t table_mfn,
2764 l1_pgentry_t new, unsigned int level)
2766 struct domain *d = v->domain;
2767 mfn_t mfn;
2769 shadow_lock(d);
2771 /* handle physmap_add and physmap_remove */
2772 mfn = gfn_to_mfn(d, gfn);
2773 if ( v != NULL && level == 1 && mfn_valid(mfn) ) {
2774 sh_remove_all_shadows_and_parents(v, mfn);
2775 if ( sh_remove_all_mappings(v, mfn) )
2776 flush_tlb_mask(d->domain_dirty_cpumask);
2779 /* update the entry with new content */
2780 safe_write_pte(p, new);
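/* Ordering matters here: the shadows and writeable mappings of the mfn
 * previously installed at this gfn are torn down above, before
 * safe_write_pte() publishes the new entry, so no vcpu can keep using a
 * shadow built from the stale gfn->mfn translation. */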
2782 /* install P2M in monitors for PAE Xen */
2783 #if CONFIG_PAGING_LEVELS == 3
2784 if ( level == 3 ) {
2785 struct vcpu *v;
2786 /* We have written to the p2m l3: need to sync the per-vcpu
2787 * copies of it in the monitor tables */
2788 p2m_install_entry_in_monitors(d, (l3_pgentry_t *)p);
2789 /* Also, any vcpus running on shadows of the p2m need to
2790 * reload their CR3s so the change propagates to the shadow */
2791 for_each_vcpu(d, v) {
2792 if ( pagetable_get_pfn(v->arch.guest_table)
2793 == pagetable_get_pfn(d->arch.phys_table)
2794 && v->arch.paging.mode != NULL )
2795 v->arch.paging.mode->update_cr3(v, 0);
2798 #endif
2800 #if (SHADOW_OPTIMIZATIONS & SHOPT_FAST_FAULT_PATH)
2801 /* If we're doing FAST_FAULT_PATH, then shadow mode may have
2802 cached the fact that this is an mmio region in the shadow
2803 page tables. Blow the tables away to remove the cache.
2804 This is pretty heavy handed, but this is a rare operation
2805 (it might happen a dozen times during boot and then never
2806 again), so it doesn't matter too much. */
2807 if ( d->arch.paging.shadow.has_fast_mmio_entries )
2809 shadow_blow_tables(d);
2810 d->arch.paging.shadow.has_fast_mmio_entries = 0;
2812 #endif
2814 shadow_unlock(d);
2817 /**************************************************************************/
2818 /* Log-dirty mode support */
2820 /* Shadow-specific code which is called in paging_log_dirty_enable().
2821 * Returns 0 if no problem is found.
2822 */
2823 int shadow_enable_log_dirty(struct domain *d)
2825 int ret;
2827 /* shadow lock is required here */
2828 shadow_lock(d);
2829 if ( shadow_mode_enabled(d) )
2831 /* This domain already has some shadows: need to clear them out
2832 * of the way to make sure that all references to guest memory are
2833 * properly write-protected */
2834 shadow_blow_tables(d);
2837 #if (SHADOW_OPTIMIZATIONS & SHOPT_LINUX_L3_TOPLEVEL)
2838 /* 32bit PV guests on 64bit xen behave like older 64bit linux: they
2839 * change an l4e instead of cr3 to switch tables. Give them the
2840 * same optimization */
2841 if ( is_pv_32on64_domain(d) )
2842 d->arch.paging.shadow.opt_flags = SHOPT_LINUX_L3_TOPLEVEL;
2843 #endif
2845 ret = shadow_one_bit_enable(d, PG_log_dirty);
2846 shadow_unlock(d);
2848 return ret;
2851 /* Shadow-specific code which is called in paging_log_dirty_disable() */
2852 int shadow_disable_log_dirty(struct domain *d)
2854 int ret;
2856 /* shadow lock is required here */
2857 shadow_lock(d);
2858 ret = shadow_one_bit_disable(d, PG_log_dirty);
2859 shadow_unlock(d);
2861 return ret;
2864 /* This function is called when we CLEAN log dirty bitmap. See
2865 * paging_log_dirty_op() for details.
2866 */
2867 void shadow_clean_dirty_bitmap(struct domain *d)
2869 shadow_lock(d);
2870 /* Need to revoke write access to the domain's pages again.
2871 * In future, we'll have a less heavy-handed approach to this,
2872 * but for now, we just unshadow everything except Xen. */
2873 shadow_blow_tables(d);
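/* Discarding every shadow means the next guest write to any page must
 * fault back into the shadow code, which is where pages get marked dirty
 * again, so a freshly cleaned bitmap picks up all subsequent writes. */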
2874 shadow_unlock(d);
2876 /**************************************************************************/
2877 /* Shadow-control XEN_DOMCTL dispatcher */
2879 int shadow_domctl(struct domain *d,
2880 xen_domctl_shadow_op_t *sc,
2881 XEN_GUEST_HANDLE(void) u_domctl)
2883 int rc, preempted = 0;
2885 switch ( sc->op )
2887 case XEN_DOMCTL_SHADOW_OP_OFF:
2888 if ( d->arch.paging.mode == PG_SH_enable )
2889 if ( (rc = shadow_test_disable(d)) != 0 )
2890 return rc;
2891 return 0;
2893 case XEN_DOMCTL_SHADOW_OP_ENABLE_TEST:
2894 return shadow_test_enable(d);
2896 case XEN_DOMCTL_SHADOW_OP_ENABLE_TRANSLATE:
2897 return shadow_enable(d, PG_refcounts|PG_translate);
2899 case XEN_DOMCTL_SHADOW_OP_ENABLE:
2900 return shadow_enable(d, sc->mode << PG_mode_shift);
2902 case XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION:
2903 sc->mb = shadow_get_allocation(d);
2904 return 0;
2906 case XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION:
2907 shadow_lock(d);
2908 if ( sc->mb == 0 && shadow_mode_enabled(d) )
2910 /* Can't set the allocation to zero unless the domain stops using
2911 * shadow pagetables first */
2912 SHADOW_ERROR("Can't set shadow allocation to zero, domain %u"
2913 " is still using shadows.\n", d->domain_id);
2914 shadow_unlock(d);
2915 return -EINVAL;
2917 rc = sh_set_allocation(d, sc->mb << (20 - PAGE_SHIFT), &preempted);
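/* sc->mb is in megabytes; shifting by (20 - PAGE_SHIFT) converts it to the
 * page count that sh_set_allocation() works in (256 pages per MB with 4kB
 * pages -- cf. the 1MB / 256-page minimum used in shadow_enable()). */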
2918 shadow_unlock(d);
2919 if ( preempted )
2920 /* Not finished. Set up to re-run the call. */
2921 rc = hypercall_create_continuation(
2922 __HYPERVISOR_domctl, "h", u_domctl);
2923 else
2924 /* Finished. Return the new allocation */
2925 sc->mb = shadow_get_allocation(d);
2926 return rc;
2928 default:
2929 SHADOW_ERROR("Bad shadow op %u\n", sc->op);
2930 return -EINVAL;
2935 /**************************************************************************/
2936 /* Auditing shadow tables */
2938 #if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_FULL
2940 void shadow_audit_tables(struct vcpu *v)
2942 /* Dispatch table for getting per-type functions */
2943 static hash_callback_t callbacks[SH_type_unused] = {
2944 NULL, /* none */
2945 #if CONFIG_PAGING_LEVELS == 2
2946 SHADOW_INTERNAL_NAME(sh_audit_l1_table,2,2), /* l1_32 */
2947 SHADOW_INTERNAL_NAME(sh_audit_fl1_table,2,2), /* fl1_32 */
2948 SHADOW_INTERNAL_NAME(sh_audit_l2_table,2,2), /* l2_32 */
2949 #else
2950 SHADOW_INTERNAL_NAME(sh_audit_l1_table,3,2), /* l1_32 */
2951 SHADOW_INTERNAL_NAME(sh_audit_fl1_table,3,2), /* fl1_32 */
2952 SHADOW_INTERNAL_NAME(sh_audit_l2_table,3,2), /* l2_32 */
2953 SHADOW_INTERNAL_NAME(sh_audit_l1_table,3,3), /* l1_pae */
2954 SHADOW_INTERNAL_NAME(sh_audit_fl1_table,3,3), /* fl1_pae */
2955 SHADOW_INTERNAL_NAME(sh_audit_l2_table,3,3), /* l2_pae */
2956 SHADOW_INTERNAL_NAME(sh_audit_l2_table,3,3), /* l2h_pae */
2957 #if CONFIG_PAGING_LEVELS >= 4
2958 SHADOW_INTERNAL_NAME(sh_audit_l1_table,4,4), /* l1_64 */
2959 SHADOW_INTERNAL_NAME(sh_audit_fl1_table,4,4), /* fl1_64 */
2960 SHADOW_INTERNAL_NAME(sh_audit_l2_table,4,4), /* l2_64 */
2961 SHADOW_INTERNAL_NAME(sh_audit_l2_table,4,4), /* l2h_64 */
2962 SHADOW_INTERNAL_NAME(sh_audit_l3_table,4,4), /* l3_64 */
2963 SHADOW_INTERNAL_NAME(sh_audit_l4_table,4,4), /* l4_64 */
2964 #endif /* CONFIG_PAGING_LEVELS >= 4 */
2965 #endif /* CONFIG_PAGING_LEVELS > 2 */
2966 NULL /* All the rest */
2967 };
2968 unsigned int mask;
2970 if ( !(SHADOW_AUDIT_ENABLE) )
2971 return;
2973 if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_FULL )
2974 mask = ~1; /* Audit every table in the system */
2975 else
2977 /* Audit only the current mode's tables */
2978 switch ( v->arch.paging.mode->guest_levels )
2980 case 2: mask = (SHF_L1_32|SHF_FL1_32|SHF_L2_32); break;
2981 case 3: mask = (SHF_L1_PAE|SHF_FL1_PAE|SHF_L2_PAE
2982 |SHF_L2H_PAE); break;
2983 case 4: mask = (SHF_L1_64|SHF_FL1_64|SHF_L2_64
2984 |SHF_L3_64|SHF_L4_64); break;
2985 default: BUG();
2989 hash_foreach(v, ~1, callbacks, _mfn(INVALID_MFN));
2992 #endif /* Shadow audit */
2994 /*
2995 * Local variables:
2996 * mode: C
2997 * c-set-style: "BSD"
2998 * c-basic-offset: 4
2999 * indent-tabs-mode: nil
3000 * End:
3001 */