ia64/xen-unstable: xen/arch/x86/mm/shadow/common.c @ 16989:92734271810a

vmx realmode: Emulate protected-mode transition while CS and SS have
bad selector values (bottom two bits non-zero).

Allows opensuse 10.3 install CD to boot. Unfortunately SUSE Linux 10.1
install CD still fails to work...

Signed-off-by: Keir Fraser <keir.fraser@citrix.com>
Author:   Keir Fraser <keir.fraser@citrix.com>
Date:     Tue Feb 05 15:45:10 2008 +0000 (2008-02-05)
Parents:  e818c24cec03
Children: 03d13b696027

line source
1 /******************************************************************************
2 * arch/x86/mm/shadow/common.c
3 *
4 * Shadow code that does not need to be multiply compiled.
5 * Parts of this code are Copyright (c) 2006 by XenSource Inc.
6 * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
7 * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 */
24 #include <xen/config.h>
25 #include <xen/types.h>
26 #include <xen/mm.h>
27 #include <xen/trace.h>
28 #include <xen/sched.h>
29 #include <xen/perfc.h>
30 #include <xen/irq.h>
31 #include <xen/domain_page.h>
32 #include <xen/guest_access.h>
33 #include <xen/keyhandler.h>
34 #include <asm/event.h>
35 #include <asm/page.h>
36 #include <asm/current.h>
37 #include <asm/flushtlb.h>
38 #include <asm/shadow.h>
39 #include "private.h"
42 /* Set up the shadow-specific parts of a domain struct at start of day.
43 * Called for every domain from arch_domain_create() */
44 void shadow_domain_init(struct domain *d)
45 {
46 int i;
47 shadow_lock_init(d);
48 for ( i = 0; i <= SHADOW_MAX_ORDER; i++ )
49 INIT_LIST_HEAD(&d->arch.paging.shadow.freelists[i]);
50 INIT_LIST_HEAD(&d->arch.paging.shadow.p2m_freelist);
51 INIT_LIST_HEAD(&d->arch.paging.shadow.pinned_shadows);
53 /* Use shadow pagetables for log-dirty support */
54 paging_log_dirty_init(d, shadow_enable_log_dirty,
55 shadow_disable_log_dirty, shadow_clean_dirty_bitmap);
56 }
58 /* Set up the shadow-specific parts of a vcpu struct. Note: The most important
59 * job is to initialize the update_paging_modes() function pointer, which is
60 * used to initialize the rest of the resources. Therefore, it really does not
61 * matter which mode v->arch.paging.mode points to initially, as long as it can
62 * be compiled.
63 */
64 void shadow_vcpu_init(struct vcpu *v)
65 {
66 #if CONFIG_PAGING_LEVELS == 4
67 v->arch.paging.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,3,3);
68 #elif CONFIG_PAGING_LEVELS == 3
69 v->arch.paging.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,3,3);
70 #elif CONFIG_PAGING_LEVELS == 2
71 v->arch.paging.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,2,2);
72 #endif
73 }
75 #if SHADOW_AUDIT
76 int shadow_audit_enable = 0;
78 static void shadow_audit_key(unsigned char key)
79 {
80 shadow_audit_enable = !shadow_audit_enable;
81 printk("%s shadow_audit_enable=%d\n",
82 __func__, shadow_audit_enable);
83 }
85 static int __init shadow_audit_key_init(void)
86 {
87 register_keyhandler(
88 'O', shadow_audit_key, "toggle shadow audits");
89 return 0;
90 }
91 __initcall(shadow_audit_key_init);
92 #endif /* SHADOW_AUDIT */
94 int _shadow_mode_refcounts(struct domain *d)
95 {
96 return shadow_mode_refcounts(d);
97 }
100 /**************************************************************************/
101 /* x86 emulator support for the shadow code
102 */
104 struct segment_register *hvm_get_seg_reg(
105 enum x86_segment seg, struct sh_emulate_ctxt *sh_ctxt)
106 {
107 struct segment_register *seg_reg = &sh_ctxt->seg_reg[seg];
108 if ( !__test_and_set_bit(seg, &sh_ctxt->valid_seg_regs) )
109 hvm_get_segment_register(current, seg, seg_reg);
110 return seg_reg;
111 }
113 static int hvm_translate_linear_addr(
114 enum x86_segment seg,
115 unsigned long offset,
116 unsigned int bytes,
117 enum hvm_access_type access_type,
118 struct sh_emulate_ctxt *sh_ctxt,
119 unsigned long *paddr)
120 {
121 struct segment_register *reg = hvm_get_seg_reg(seg, sh_ctxt);
122 int okay;
124 okay = hvm_virtual_to_linear_addr(
125 seg, reg, offset, bytes, access_type, sh_ctxt->ctxt.addr_size, paddr);
127 if ( !okay )
128 {
129 hvm_inject_exception(TRAP_gp_fault, 0, 0);
130 return X86EMUL_EXCEPTION;
131 }
133 return 0;
134 }
136 static int
137 hvm_read(enum x86_segment seg,
138 unsigned long offset,
139 unsigned long *val,
140 unsigned int bytes,
141 enum hvm_access_type access_type,
142 struct sh_emulate_ctxt *sh_ctxt)
143 {
144 unsigned long addr;
145 int rc;
147 rc = hvm_translate_linear_addr(
148 seg, offset, bytes, access_type, sh_ctxt, &addr);
149 if ( rc )
150 return rc;
152 *val = 0;
154 if ( access_type == hvm_access_insn_fetch )
155 rc = hvm_fetch_from_guest_virt(val, addr, bytes);
156 else
157 rc = hvm_copy_from_guest_virt(val, addr, bytes);
159 switch ( rc )
160 {
161 case HVMCOPY_okay:
162 return X86EMUL_OKAY;
163 case HVMCOPY_bad_gva_to_gfn:
164 return X86EMUL_EXCEPTION;
165 default:
166 break;
167 }
169 return X86EMUL_UNHANDLEABLE;
170 }
172 static int
173 hvm_emulate_read(enum x86_segment seg,
174 unsigned long offset,
175 unsigned long *val,
176 unsigned int bytes,
177 struct x86_emulate_ctxt *ctxt)
178 {
179 if ( !is_x86_user_segment(seg) )
180 return X86EMUL_UNHANDLEABLE;
181 return hvm_read(seg, offset, val, bytes, hvm_access_read,
182 container_of(ctxt, struct sh_emulate_ctxt, ctxt));
183 }
185 static int
186 hvm_emulate_insn_fetch(enum x86_segment seg,
187 unsigned long offset,
188 unsigned long *val,
189 unsigned int bytes,
190 struct x86_emulate_ctxt *ctxt)
191 {
192 struct sh_emulate_ctxt *sh_ctxt =
193 container_of(ctxt, struct sh_emulate_ctxt, ctxt);
194 unsigned int insn_off = offset - sh_ctxt->insn_buf_eip;
196 ASSERT(seg == x86_seg_cs);
198 /* Fall back if requested bytes are not in the prefetch cache. */
199 if ( unlikely((insn_off + bytes) > sh_ctxt->insn_buf_bytes) )
200 return hvm_read(seg, offset, val, bytes,
201 hvm_access_insn_fetch, sh_ctxt);
203 /* Hit the cache. Simple memcpy. */
204 *val = 0;
205 memcpy(val, &sh_ctxt->insn_buf[insn_off], bytes);
206 return X86EMUL_OKAY;
207 }
209 static int
210 hvm_emulate_write(enum x86_segment seg,
211 unsigned long offset,
212 unsigned long val,
213 unsigned int bytes,
214 struct x86_emulate_ctxt *ctxt)
215 {
216 struct sh_emulate_ctxt *sh_ctxt =
217 container_of(ctxt, struct sh_emulate_ctxt, ctxt);
218 struct vcpu *v = current;
219 unsigned long addr;
220 int rc;
222 if ( !is_x86_user_segment(seg) )
223 return X86EMUL_UNHANDLEABLE;
225 /* How many emulations could we save if we unshadowed on stack writes? */
226 if ( seg == x86_seg_ss )
227 perfc_incr(shadow_fault_emulate_stack);
229 rc = hvm_translate_linear_addr(
230 seg, offset, bytes, hvm_access_write, sh_ctxt, &addr);
231 if ( rc )
232 return rc;
234 return v->arch.paging.mode->shadow.x86_emulate_write(
235 v, addr, &val, bytes, sh_ctxt);
236 }
238 static int
239 hvm_emulate_cmpxchg(enum x86_segment seg,
240 unsigned long offset,
241 unsigned long old,
242 unsigned long new,
243 unsigned int bytes,
244 struct x86_emulate_ctxt *ctxt)
245 {
246 struct sh_emulate_ctxt *sh_ctxt =
247 container_of(ctxt, struct sh_emulate_ctxt, ctxt);
248 struct vcpu *v = current;
249 unsigned long addr;
250 int rc;
252 if ( !is_x86_user_segment(seg) )
253 return X86EMUL_UNHANDLEABLE;
255 rc = hvm_translate_linear_addr(
256 seg, offset, bytes, hvm_access_write, sh_ctxt, &addr);
257 if ( rc )
258 return rc;
260 return v->arch.paging.mode->shadow.x86_emulate_cmpxchg(
261 v, addr, old, new, bytes, sh_ctxt);
262 }
264 static int
265 hvm_emulate_cmpxchg8b(enum x86_segment seg,
266 unsigned long offset,
267 unsigned long old_lo,
268 unsigned long old_hi,
269 unsigned long new_lo,
270 unsigned long new_hi,
271 struct x86_emulate_ctxt *ctxt)
272 {
273 struct sh_emulate_ctxt *sh_ctxt =
274 container_of(ctxt, struct sh_emulate_ctxt, ctxt);
275 struct vcpu *v = current;
276 unsigned long addr;
277 int rc;
279 if ( !is_x86_user_segment(seg) )
280 return X86EMUL_UNHANDLEABLE;
282 rc = hvm_translate_linear_addr(
283 seg, offset, 8, hvm_access_write, sh_ctxt, &addr);
284 if ( rc )
285 return rc;
287 return v->arch.paging.mode->shadow.x86_emulate_cmpxchg8b(
288 v, addr, old_lo, old_hi, new_lo, new_hi, sh_ctxt);
289 }
291 static struct x86_emulate_ops hvm_shadow_emulator_ops = {
292 .read = hvm_emulate_read,
293 .insn_fetch = hvm_emulate_insn_fetch,
294 .write = hvm_emulate_write,
295 .cmpxchg = hvm_emulate_cmpxchg,
296 .cmpxchg8b = hvm_emulate_cmpxchg8b,
297 };
299 static int
300 pv_emulate_read(enum x86_segment seg,
301 unsigned long offset,
302 unsigned long *val,
303 unsigned int bytes,
304 struct x86_emulate_ctxt *ctxt)
305 {
306 unsigned int rc;
308 if ( !is_x86_user_segment(seg) )
309 return X86EMUL_UNHANDLEABLE;
311 *val = 0;
312 if ( (rc = copy_from_user((void *)val, (void *)offset, bytes)) != 0 )
313 {
314 propagate_page_fault(offset + bytes - rc, 0); /* read fault */
315 return X86EMUL_EXCEPTION;
316 }
318 return X86EMUL_OKAY;
319 }
321 static int
322 pv_emulate_write(enum x86_segment seg,
323 unsigned long offset,
324 unsigned long val,
325 unsigned int bytes,
326 struct x86_emulate_ctxt *ctxt)
327 {
328 struct sh_emulate_ctxt *sh_ctxt =
329 container_of(ctxt, struct sh_emulate_ctxt, ctxt);
330 struct vcpu *v = current;
331 if ( !is_x86_user_segment(seg) )
332 return X86EMUL_UNHANDLEABLE;
333 return v->arch.paging.mode->shadow.x86_emulate_write(
334 v, offset, &val, bytes, sh_ctxt);
335 }
337 static int
338 pv_emulate_cmpxchg(enum x86_segment seg,
339 unsigned long offset,
340 unsigned long old,
341 unsigned long new,
342 unsigned int bytes,
343 struct x86_emulate_ctxt *ctxt)
344 {
345 struct sh_emulate_ctxt *sh_ctxt =
346 container_of(ctxt, struct sh_emulate_ctxt, ctxt);
347 struct vcpu *v = current;
348 if ( !is_x86_user_segment(seg) )
349 return X86EMUL_UNHANDLEABLE;
350 return v->arch.paging.mode->shadow.x86_emulate_cmpxchg(
351 v, offset, old, new, bytes, sh_ctxt);
352 }
354 static int
355 pv_emulate_cmpxchg8b(enum x86_segment seg,
356 unsigned long offset,
357 unsigned long old_lo,
358 unsigned long old_hi,
359 unsigned long new_lo,
360 unsigned long new_hi,
361 struct x86_emulate_ctxt *ctxt)
362 {
363 struct sh_emulate_ctxt *sh_ctxt =
364 container_of(ctxt, struct sh_emulate_ctxt, ctxt);
365 struct vcpu *v = current;
366 if ( !is_x86_user_segment(seg) )
367 return X86EMUL_UNHANDLEABLE;
368 return v->arch.paging.mode->shadow.x86_emulate_cmpxchg8b(
369 v, offset, old_lo, old_hi, new_lo, new_hi, sh_ctxt);
370 }
372 static struct x86_emulate_ops pv_shadow_emulator_ops = {
373 .read = pv_emulate_read,
374 .insn_fetch = pv_emulate_read,
375 .write = pv_emulate_write,
376 .cmpxchg = pv_emulate_cmpxchg,
377 .cmpxchg8b = pv_emulate_cmpxchg8b,
378 };
380 struct x86_emulate_ops *shadow_init_emulation(
381 struct sh_emulate_ctxt *sh_ctxt, struct cpu_user_regs *regs)
382 {
383 struct segment_register *creg, *sreg;
384 struct vcpu *v = current;
385 unsigned long addr;
387 sh_ctxt->ctxt.regs = regs;
389 if ( !is_hvm_vcpu(v) )
390 {
391 sh_ctxt->ctxt.addr_size = sh_ctxt->ctxt.sp_size = BITS_PER_LONG;
392 return &pv_shadow_emulator_ops;
393 }
395 /* Segment cache initialisation. Primed with CS. */
396 sh_ctxt->valid_seg_regs = 0;
397 creg = hvm_get_seg_reg(x86_seg_cs, sh_ctxt);
399 /* Work out the emulation mode. */
400 if ( hvm_long_mode_enabled(v) && creg->attr.fields.l )
401 {
402 sh_ctxt->ctxt.addr_size = sh_ctxt->ctxt.sp_size = 64;
403 }
404 else if ( regs->eflags & X86_EFLAGS_VM )
405 {
406 sh_ctxt->ctxt.addr_size = sh_ctxt->ctxt.sp_size = 16;
407 }
408 else
409 {
410 sreg = hvm_get_seg_reg(x86_seg_ss, sh_ctxt);
411 sh_ctxt->ctxt.addr_size = creg->attr.fields.db ? 32 : 16;
412 sh_ctxt->ctxt.sp_size = sreg->attr.fields.db ? 32 : 16;
413 }
415 /* Attempt to prefetch whole instruction. */
416 sh_ctxt->insn_buf_eip = regs->eip;
417 sh_ctxt->insn_buf_bytes =
418 (!hvm_translate_linear_addr(
419 x86_seg_cs, regs->eip, sizeof(sh_ctxt->insn_buf),
420 hvm_access_insn_fetch, sh_ctxt, &addr) &&
421 !hvm_fetch_from_guest_virt_nofault(
422 sh_ctxt->insn_buf, addr, sizeof(sh_ctxt->insn_buf)))
423 ? sizeof(sh_ctxt->insn_buf) : 0;
425 return &hvm_shadow_emulator_ops;
426 }
428 /* Update an initialized emulation context to prepare for the next
429 * instruction */
430 void shadow_continue_emulation(struct sh_emulate_ctxt *sh_ctxt,
431 struct cpu_user_regs *regs)
432 {
433 struct vcpu *v = current;
434 unsigned long addr, diff;
436 /* We don't refetch the segment bases, because we don't emulate
437 * writes to segment registers */
439 if ( is_hvm_vcpu(v) )
440 {
441 diff = regs->eip - sh_ctxt->insn_buf_eip;
442 if ( diff > sh_ctxt->insn_buf_bytes )
443 {
444 /* Prefetch more bytes. */
445 sh_ctxt->insn_buf_bytes =
446 (!hvm_translate_linear_addr(
447 x86_seg_cs, regs->eip, sizeof(sh_ctxt->insn_buf),
448 hvm_access_insn_fetch, sh_ctxt, &addr) &&
449 !hvm_fetch_from_guest_virt_nofault(
450 sh_ctxt->insn_buf, addr, sizeof(sh_ctxt->insn_buf)))
451 ? sizeof(sh_ctxt->insn_buf) : 0;
452 sh_ctxt->insn_buf_eip = regs->eip;
453 }
454 }
455 }
457 /**************************************************************************/
458 /* Code for "promoting" a guest page to the point where the shadow code is
459 * willing to let it be treated as a guest page table. This generally
460 * involves making sure there are no writable mappings available to the guest
461 * for this page.
462 */
463 void shadow_promote(struct vcpu *v, mfn_t gmfn, unsigned int type)
464 {
465 struct page_info *page = mfn_to_page(gmfn);
467 ASSERT(mfn_valid(gmfn));
469 /* We should never try to promote a gmfn that has writeable mappings */
470 ASSERT((page->u.inuse.type_info & PGT_type_mask) != PGT_writable_page
471 || (page->u.inuse.type_info & PGT_count_mask) == 0
472 || v->domain->is_shutting_down);
474 /* Is the page already shadowed? */
475 if ( !test_and_set_bit(_PGC_page_table, &page->count_info) )
476 page->shadow_flags = 0;
478 ASSERT(!test_bit(type, &page->shadow_flags));
479 set_bit(type, &page->shadow_flags);
480 }
482 void shadow_demote(struct vcpu *v, mfn_t gmfn, u32 type)
483 {
484 struct page_info *page = mfn_to_page(gmfn);
486 ASSERT(test_bit(_PGC_page_table, &page->count_info));
487 ASSERT(test_bit(type, &page->shadow_flags));
489 clear_bit(type, &page->shadow_flags);
491 if ( (page->shadow_flags & SHF_page_type_mask) == 0 )
492 {
493 /* tlbflush timestamp field is valid again */
494 page->tlbflush_timestamp = tlbflush_current_time();
495 clear_bit(_PGC_page_table, &page->count_info);
496 }
497 }
499 /**************************************************************************/
500 /* Validate a pagetable change from the guest and update the shadows.
501 * Returns a bitmask of SHADOW_SET_* flags. */
503 int
504 sh_validate_guest_entry(struct vcpu *v, mfn_t gmfn, void *entry, u32 size)
505 {
506 int result = 0;
507 struct page_info *page = mfn_to_page(gmfn);
509 paging_mark_dirty(v->domain, mfn_x(gmfn));
511 // Determine which types of shadows are affected, and update each.
512 //
513 // Always validate L1s before L2s to prevent another cpu with a linear
514 // mapping of this gmfn from seeing a walk that results from
515 // using the new L2 value and the old L1 value. (It is OK for such a
516 // guest to see a walk that uses the old L2 value with the new L1 value,
517 // as hardware could behave this way if one level of the pagewalk occurs
518 // before the store, and the next level of the pagewalk occurs after the
519 // store.)
520 //
521 // Ditto for L2s before L3s, etc.
522 //
524 if ( !(page->count_info & PGC_page_table) )
525 return 0; /* Not shadowed at all */
527 #if CONFIG_PAGING_LEVELS == 2
528 if ( page->shadow_flags & SHF_L1_32 )
529 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 2, 2)
530 (v, gmfn, entry, size);
531 #else
532 if ( page->shadow_flags & SHF_L1_32 )
533 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 3, 2)
534 (v, gmfn, entry, size);
535 #endif
537 #if CONFIG_PAGING_LEVELS == 2
538 if ( page->shadow_flags & SHF_L2_32 )
539 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 2, 2)
540 (v, gmfn, entry, size);
541 #else
542 if ( page->shadow_flags & SHF_L2_32 )
543 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 3, 2)
544 (v, gmfn, entry, size);
545 #endif
547 #if CONFIG_PAGING_LEVELS >= 3
548 if ( page->shadow_flags & SHF_L1_PAE )
549 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 3, 3)
550 (v, gmfn, entry, size);
551 if ( page->shadow_flags & SHF_L2_PAE )
552 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 3, 3)
553 (v, gmfn, entry, size);
554 if ( page->shadow_flags & SHF_L2H_PAE )
555 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2he, 3, 3)
556 (v, gmfn, entry, size);
557 #else /* 32-bit non-PAE hypervisor does not support PAE guests */
558 ASSERT((page->shadow_flags & (SHF_L2H_PAE|SHF_L2_PAE|SHF_L1_PAE)) == 0);
559 #endif
561 #if CONFIG_PAGING_LEVELS >= 4
562 if ( page->shadow_flags & SHF_L1_64 )
563 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 4, 4)
564 (v, gmfn, entry, size);
565 if ( page->shadow_flags & SHF_L2_64 )
566 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 4, 4)
567 (v, gmfn, entry, size);
568 if ( page->shadow_flags & SHF_L2H_64 )
569 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2he, 4, 4)
570 (v, gmfn, entry, size);
571 if ( page->shadow_flags & SHF_L3_64 )
572 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl3e, 4, 4)
573 (v, gmfn, entry, size);
574 if ( page->shadow_flags & SHF_L4_64 )
575 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl4e, 4, 4)
576 (v, gmfn, entry, size);
577 #else /* 32-bit/PAE hypervisor does not support 64-bit guests */
578 ASSERT((page->shadow_flags
579 & (SHF_L4_64|SHF_L3_64|SHF_L2H_64|SHF_L2_64|SHF_L1_64)) == 0);
580 #endif
582 return result;
583 }
586 void
587 sh_validate_guest_pt_write(struct vcpu *v, mfn_t gmfn,
588 void *entry, u32 size)
589 /* This is the entry point for emulated writes to pagetables in HVM guests and
590 * PV translated guests.
591 */
592 {
593 struct domain *d = v->domain;
594 int rc;
596 ASSERT(shadow_locked_by_me(v->domain));
597 rc = sh_validate_guest_entry(v, gmfn, entry, size);
598 if ( rc & SHADOW_SET_FLUSH )
599 /* Need to flush TLBs to pick up shadow PT changes */
600 flush_tlb_mask(d->domain_dirty_cpumask);
601 if ( rc & SHADOW_SET_ERROR )
602 {
603 /* This page is probably not a pagetable any more: tear it out of the
604 * shadows, along with any tables that reference it.
605 * Since the validate call above will have made a "safe" (i.e. zero)
606 * shadow entry, we can let the domain live even if we can't fully
607 * unshadow the page. */
608 sh_remove_shadows(v, gmfn, 0, 0);
609 }
610 }
612 int shadow_write_guest_entry(struct vcpu *v, intpte_t *p,
613 intpte_t new, mfn_t gmfn)
614 /* Write a new value into the guest pagetable, and update the shadows
615 * appropriately. Returns 0 if we page-faulted, 1 for success. */
616 {
617 int failed;
618 shadow_lock(v->domain);
619 failed = __copy_to_user(p, &new, sizeof(new));
620 if ( failed != sizeof(new) )
621 sh_validate_guest_entry(v, gmfn, p, sizeof(new));
622 shadow_unlock(v->domain);
623 return (failed == 0);
624 }
626 int shadow_cmpxchg_guest_entry(struct vcpu *v, intpte_t *p,
627 intpte_t *old, intpte_t new, mfn_t gmfn)
628 /* Cmpxchg a new value into the guest pagetable, and update the shadows
629 * appropriately. Returns 0 if we page-faulted, 1 if not.
630 * N.B. caller should check the value of "old" to see if the
631 * cmpxchg itself was successful. */
632 {
633 int failed;
634 intpte_t t = *old;
635 shadow_lock(v->domain);
636 failed = cmpxchg_user(p, t, new);
637 if ( t == *old )
638 sh_validate_guest_entry(v, gmfn, p, sizeof(new));
639 *old = t;
640 shadow_unlock(v->domain);
641 return (failed == 0);
642 }
645 /**************************************************************************/
646 /* Memory management for shadow pages. */
648 /* Allocating shadow pages
649 * -----------------------
650 *
651 * Most shadow pages are allocated singly, but there is one case where
652 * we need to allocate multiple pages together: shadowing 32-bit guest
653 * tables on PAE or 64-bit shadows. A 32-bit guest l1 table covers 4MB
654 * of virtual address space, and needs to be shadowed by two PAE/64-bit
655 * l1 tables (covering 2MB of virtual address space each). Similarly, a
656 * 32-bit guest l2 table (4GB va) needs to be shadowed by four
657 * PAE/64-bit l2 tables (1GB va each). These multi-page shadows are
658 * contiguous and aligned; functions for handling offsets into them are
659 * defined in shadow.c (shadow_l1_index() etc.)
660 *
661 * This table shows the allocation behaviour of the different modes:
662 *
663 * Xen paging 32b pae pae 64b 64b 64b
664 * Guest paging 32b 32b pae 32b pae 64b
665 * PV or HVM * HVM * HVM HVM *
666 * Shadow paging 32b pae pae pae pae 64b
667 *
668 * sl1 size 4k 8k 4k 8k 4k 4k
669 * sl2 size 4k 16k 4k 16k 4k 4k
670 * sl3 size - - - - - 4k
671 * sl4 size - - - - - 4k
672 *
673 * We allocate memory from xen in four-page units and break them down
674 * with a simple buddy allocator. Can't use the xen allocator to handle
675 * this as it only works for contiguous zones, and a domain's shadow
676 * pool is made of fragments.
677 *
678 * In HVM guests, the p2m table is built out of shadow pages, and we provide
679 * a function for the p2m management to steal pages, in max-order chunks, from
680 * the free pool. We don't provide for giving them back, yet.
681 */
683 /* Figure out the least acceptable quantity of shadow memory.
684 * The minimum memory requirement for always being able to free up a
685 * chunk of memory is very small -- only three max-order chunks per
686 * vcpu to hold the top level shadows and pages with Xen mappings in them.
687 *
688 * But for a guest to be guaranteed to successfully execute a single
689 * instruction, we must be able to map a large number (about thirty) of VAs
690 * at the same time, which means that to guarantee progress, we must
691 * allow for more than ninety allocated pages per vcpu. We round that
692 * up to 128 pages, or half a megabyte per vcpu. */
693 static unsigned int shadow_min_acceptable_pages(struct domain *d)
694 {
695 u32 vcpu_count = 0;
696 struct vcpu *v;
698 for_each_vcpu(d, v)
699 vcpu_count++;
701 return (vcpu_count * 128);
702 }
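/* Worked example (editorial note, not part of the original file): at 128
 * pages per vcpu, a 4-vcpu domain needs at least 4 * 128 = 512 shadow
 * pages, i.e. 2MB, before the pool is considered minimally provisioned. */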
704 /* Figure out the order of allocation needed for a given shadow type */
705 static inline u32
706 shadow_order(unsigned int shadow_type)
707 {
708 #if CONFIG_PAGING_LEVELS > 2
709 static const u32 type_to_order[SH_type_unused] = {
710 0, /* SH_type_none */
711 1, /* SH_type_l1_32_shadow */
712 1, /* SH_type_fl1_32_shadow */
713 2, /* SH_type_l2_32_shadow */
714 0, /* SH_type_l1_pae_shadow */
715 0, /* SH_type_fl1_pae_shadow */
716 0, /* SH_type_l2_pae_shadow */
717 0, /* SH_type_l2h_pae_shadow */
718 0, /* SH_type_l1_64_shadow */
719 0, /* SH_type_fl1_64_shadow */
720 0, /* SH_type_l2_64_shadow */
721 0, /* SH_type_l2h_64_shadow */
722 0, /* SH_type_l3_64_shadow */
723 0, /* SH_type_l4_64_shadow */
724 2, /* SH_type_p2m_table */
725 0 /* SH_type_monitor_table */
726 };
727 ASSERT(shadow_type < SH_type_unused);
728 return type_to_order[shadow_type];
729 #else /* 32-bit Xen only ever shadows 32-bit guests on 32-bit shadows. */
730 return 0;
731 #endif
732 }
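/* Illustrative sketch (editorial addition, not part of the original file):
 * the order above is log2 of the number of contiguous pages a shadow
 * occupies, matching the allocation table in the comment block earlier.
 * The helper name below is hypothetical, for clarity only. */
static inline unsigned int sh_example_pages_for_shadow(unsigned int shadow_type)
{
    /* e.g. 1 << 2 == 4 pages for a 32-bit guest l2 shadow on a PAE/64-bit
     * hypervisor; 1 page for all the single-page shadow types. */
    return 1u << shadow_order(shadow_type);
}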
734 static inline unsigned int
735 shadow_max_order(struct domain *d)
736 {
737 return is_hvm_domain(d) ? SHADOW_MAX_ORDER : 0;
738 }
740 /* Do we have a total of count pages of the requested order free? */
741 static inline int space_is_available(
742 struct domain *d,
743 unsigned int order,
744 unsigned int count)
745 {
746 for ( ; order <= shadow_max_order(d); ++order )
747 {
748 unsigned int n = count;
749 const struct list_head *p;
751 list_for_each ( p, &d->arch.paging.shadow.freelists[order] )
752 if ( --n == 0 )
753 return 1;
754 count = (count + 1) >> 1;
755 }
757 return 0;
758 }
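/* Illustrative sketch (editorial addition, not part of the original file):
 * the halving of 'count' above reflects the buddy structure: one
 * order-(o+1) block can be split into two order-o blocks, so a request
 * for N order-o pages is also satisfied by ceil(N/2) order-(o+1) blocks.
 * Hypothetical helper, for clarity only. */
static inline unsigned int sh_example_higher_order_blocks(unsigned int count)
{
    /* e.g. 3 order-o pages fit inside 2 blocks of order o+1 */
    return (count + 1) >> 1;
}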
760 /* Dispatcher function: call the per-mode function that will unhook the
761 * non-Xen mappings in this top-level shadow mfn */
762 static void shadow_unhook_mappings(struct vcpu *v, mfn_t smfn)
763 {
764 struct shadow_page_info *sp = mfn_to_shadow_page(smfn);
765 switch ( sp->type )
766 {
767 case SH_type_l2_32_shadow:
768 #if CONFIG_PAGING_LEVELS == 2
769 SHADOW_INTERNAL_NAME(sh_unhook_32b_mappings,2,2)(v,smfn);
770 #else
771 SHADOW_INTERNAL_NAME(sh_unhook_32b_mappings,3,2)(v,smfn);
772 #endif
773 break;
774 #if CONFIG_PAGING_LEVELS >= 3
775 case SH_type_l2_pae_shadow:
776 case SH_type_l2h_pae_shadow:
777 SHADOW_INTERNAL_NAME(sh_unhook_pae_mappings,3,3)(v,smfn);
778 break;
779 #endif
780 #if CONFIG_PAGING_LEVELS >= 4
781 case SH_type_l4_64_shadow:
782 SHADOW_INTERNAL_NAME(sh_unhook_64b_mappings,4,4)(v,smfn);
783 break;
784 #endif
785 default:
786 SHADOW_ERROR("top-level shadow has bad type %08x\n", sp->type);
787 BUG();
788 }
789 }
792 /* Make sure there are at least count order-sized pages
793 * available in the shadow page pool. */
794 static void _shadow_prealloc(
795 struct domain *d,
796 unsigned int order,
797 unsigned int count)
798 {
799 /* Need a vcpu for calling unpins; for now, since we don't have
800 * per-vcpu shadows, any will do */
801 struct vcpu *v, *v2;
802 struct list_head *l, *t;
803 struct shadow_page_info *sp;
804 cpumask_t flushmask = CPU_MASK_NONE;
805 mfn_t smfn;
806 int i;
808 ASSERT(order <= shadow_max_order(d));
809 if ( space_is_available(d, order, count) ) return;
811 v = current;
812 if ( v->domain != d )
813 v = d->vcpu[0];
814 ASSERT(v != NULL); /* Shouldn't have enabled shadows if we've no vcpus */
816 /* Stage one: walk the list of pinned pages, unpinning them */
817 perfc_incr(shadow_prealloc_1);
818 list_for_each_backwards_safe(l, t, &d->arch.paging.shadow.pinned_shadows)
819 {
820 sp = list_entry(l, struct shadow_page_info, list);
821 smfn = shadow_page_to_mfn(sp);
823 /* Unpin this top-level shadow */
824 sh_unpin(v, smfn);
826 /* See if that freed up enough space */
827 if ( space_is_available(d, order, count) ) return;
828 }
830 /* Stage two: all shadow pages are in use in hierarchies that are
831 * loaded in cr3 on some vcpu. Walk them, unhooking the non-Xen
832 * mappings. */
833 perfc_incr(shadow_prealloc_2);
835 for_each_vcpu(d, v2)
836 for ( i = 0 ; i < 4 ; i++ )
837 {
838 if ( !pagetable_is_null(v2->arch.shadow_table[i]) )
839 {
840 shadow_unhook_mappings(v,
841 pagetable_get_mfn(v2->arch.shadow_table[i]));
842 cpus_or(flushmask, v2->vcpu_dirty_cpumask, flushmask);
844 /* See if that freed up enough space */
845 if ( space_is_available(d, order, count) )
846 {
847 flush_tlb_mask(flushmask);
848 return;
849 }
850 }
851 }
853 /* Nothing more we can do: all remaining shadows are of pages that
854 * hold Xen mappings for some vcpu. This can never happen. */
855 SHADOW_ERROR("Can't pre-allocate %u order-%u shadow pages!\n"
856 " shadow pages total = %u, free = %u, p2m=%u\n",
857 count, order,
858 d->arch.paging.shadow.total_pages,
859 d->arch.paging.shadow.free_pages,
860 d->arch.paging.shadow.p2m_pages);
861 BUG();
862 }
864 /* Make sure there are at least count pages of the order according to
865 * type available in the shadow page pool.
866 * This must be called before any calls to shadow_alloc(). Since this
867 * will free existing shadows to make room, it must be called early enough
868 * to avoid freeing shadows that the caller is currently working on. */
869 void shadow_prealloc(struct domain *d, u32 type, unsigned int count)
870 {
871 ASSERT(type != SH_type_p2m_table);
872 return _shadow_prealloc(d, shadow_order(type), count);
873 }
875 /* Deliberately free all the memory we can: this will tear down all of
876 * this domain's shadows */
877 static void shadow_blow_tables(struct domain *d)
878 {
879 struct list_head *l, *t;
880 struct shadow_page_info *sp;
881 struct vcpu *v = d->vcpu[0];
882 mfn_t smfn;
883 int i;
885 ASSERT(v != NULL);
887 /* Pass one: unpin all pinned pages */
888 list_for_each_backwards_safe(l,t, &d->arch.paging.shadow.pinned_shadows)
889 {
890 sp = list_entry(l, struct shadow_page_info, list);
891 smfn = shadow_page_to_mfn(sp);
892 sh_unpin(v, smfn);
893 }
895 /* Second pass: unhook entries of in-use shadows */
896 for_each_vcpu(d, v)
897 for ( i = 0 ; i < 4 ; i++ )
898 if ( !pagetable_is_null(v->arch.shadow_table[i]) )
899 shadow_unhook_mappings(v,
900 pagetable_get_mfn(v->arch.shadow_table[i]));
902 /* Make sure everyone sees the unshadowings */
903 flush_tlb_mask(d->domain_dirty_cpumask);
904 }
906 void shadow_blow_tables_per_domain(struct domain *d)
907 {
908 if ( shadow_mode_enabled(d) && d->vcpu[0] != NULL ) {
909 shadow_lock(d);
910 shadow_blow_tables(d);
911 shadow_unlock(d);
912 }
913 }
915 #ifndef NDEBUG
916 /* Blow all shadows of all shadowed domains: this can be used to cause the
917 * guest's pagetables to be re-shadowed if we suspect that the shadows
918 * have somehow got out of sync */
919 static void shadow_blow_all_tables(unsigned char c)
920 {
921 struct domain *d;
922 printk("'%c' pressed -> blowing all shadow tables\n", c);
923 rcu_read_lock(&domlist_read_lock);
924 for_each_domain(d)
925 {
926 if ( shadow_mode_enabled(d) && d->vcpu[0] != NULL )
927 {
928 shadow_lock(d);
929 shadow_blow_tables(d);
930 shadow_unlock(d);
931 }
932 }
933 rcu_read_unlock(&domlist_read_lock);
934 }
936 /* Register this function in the Xen console keypress table */
937 static __init int shadow_blow_tables_keyhandler_init(void)
938 {
939 register_keyhandler('S', shadow_blow_all_tables,"reset shadow pagetables");
940 return 0;
941 }
942 __initcall(shadow_blow_tables_keyhandler_init);
943 #endif /* !NDEBUG */
945 /* Allocate another shadow's worth of (contiguous, aligned) pages,
946 * and fill in the type and backpointer fields of their page_infos.
947 * Never fails to allocate. */
948 mfn_t shadow_alloc(struct domain *d,
949 u32 shadow_type,
950 unsigned long backpointer)
951 {
952 struct shadow_page_info *sp = NULL;
953 unsigned int order = shadow_order(shadow_type);
954 cpumask_t mask;
955 void *p;
956 int i;
958 ASSERT(shadow_locked_by_me(d));
959 if (shadow_type == SH_type_p2m_table && order > shadow_max_order(d))
960 order = shadow_max_order(d);
961 ASSERT(order <= shadow_max_order(d));
962 ASSERT(shadow_type != SH_type_none);
963 perfc_incr(shadow_alloc);
965 /* Find smallest order which can satisfy the request. */
966 for ( i = order; i <= SHADOW_MAX_ORDER; i++ )
967 if ( !list_empty(&d->arch.paging.shadow.freelists[i]) )
968 goto found;
970 /* If we get here, we failed to allocate. This should never happen.
971 * It means that we didn't call shadow_prealloc() correctly before
972 * we allocated. We can't recover by calling prealloc here, because
973 * we might free up higher-level pages that the caller is working on. */
974 SHADOW_ERROR("Can't allocate %i shadow pages!\n", 1 << order);
975 BUG();
977 found:
978 sp = list_entry(d->arch.paging.shadow.freelists[i].next,
979 struct shadow_page_info, list);
980 list_del(&sp->list);
982 /* We may have to halve the chunk a number of times. */
983 while ( i != order )
984 {
985 i--;
986 sp->order = i;
987 list_add_tail(&sp->list, &d->arch.paging.shadow.freelists[i]);
988 sp += 1 << i;
989 }
990 d->arch.paging.shadow.free_pages -= 1 << order;
992 /* Init page info fields and clear the pages */
993 for ( i = 0; i < 1<<order ; i++ )
994 {
995 /* Before we overwrite the old contents of this page,
996 * we need to be sure that no TLB holds a pointer to it. */
997 mask = d->domain_dirty_cpumask;
998 tlbflush_filter(mask, sp[i].tlbflush_timestamp);
999 if ( unlikely(!cpus_empty(mask)) )
1000 {
1001 perfc_incr(shadow_alloc_tlbflush);
1002 flush_tlb_mask(mask);
1003 }
1004 /* Now safe to clear the page for reuse */
1005 p = sh_map_domain_page(shadow_page_to_mfn(sp+i));
1006 ASSERT(p != NULL);
1007 clear_page(p);
1008 sh_unmap_domain_page(p);
1009 INIT_LIST_HEAD(&sp[i].list);
1010 sp[i].type = shadow_type;
1011 sp[i].pinned = 0;
1012 sp[i].count = 0;
1013 sp[i].backpointer = backpointer;
1014 sp[i].next_shadow = NULL;
1015 perfc_incr(shadow_alloc_count);
1016 }
1017 return shadow_page_to_mfn(sp);
1018 }
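/* Illustrative usage sketch (editorial addition, not part of the original
 * file): shadow_alloc() never fails, because callers hold the shadow lock
 * and reserve space with shadow_prealloc() beforehand.  Hypothetical
 * helper, for clarity only. */
static inline mfn_t sh_example_alloc_one_shadow(struct domain *d,
                                                u32 shadow_type,
                                                unsigned long backpointer)
{
    ASSERT(shadow_locked_by_me(d));
    /* Free existing shadows if necessary so the allocation below cannot fail */
    shadow_prealloc(d, shadow_type, 1);
    return shadow_alloc(d, shadow_type, backpointer);
}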
1021 /* Return some shadow pages to the pool. */
1022 void shadow_free(struct domain *d, mfn_t smfn)
1023 {
1024 struct shadow_page_info *sp = mfn_to_shadow_page(smfn);
1025 u32 shadow_type;
1026 unsigned long order;
1027 unsigned long mask;
1028 int i;
1030 ASSERT(shadow_locked_by_me(d));
1031 perfc_incr(shadow_free);
1033 shadow_type = sp->type;
1034 ASSERT(shadow_type != SH_type_none);
1035 ASSERT(shadow_type != SH_type_p2m_table);
1036 order = shadow_order(shadow_type);
1038 d->arch.paging.shadow.free_pages += 1 << order;
1040 for ( i = 0; i < 1<<order; i++ )
1041 {
1042 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
1043 struct vcpu *v;
1044 for_each_vcpu(d, v)
1045 {
1046 /* No longer safe to look for a writeable mapping in this shadow */
1047 if ( v->arch.paging.shadow.last_writeable_pte_smfn == mfn_x(smfn) + i )
1048 v->arch.paging.shadow.last_writeable_pte_smfn = 0;
1049 }
1050 #endif
1051 /* Strip out the type: this is now a free shadow page */
1052 sp[i].type = 0;
1053 /* Remember the TLB timestamp so we will know whether to flush
1054 * TLBs when we reuse the page. Because the destructors leave the
1055 * contents of the pages in place, we can delay TLB flushes until
1056 * just before the allocator hands the page out again. */
1057 sp[i].tlbflush_timestamp = tlbflush_current_time();
1058 perfc_decr(shadow_alloc_count);
1059 }
1061 /* Merge chunks as far as possible. */
1062 for ( ; order < shadow_max_order(d); ++order )
1063 {
1064 mask = 1 << order;
1065 if ( (mfn_x(shadow_page_to_mfn(sp)) & mask) ) {
1066 /* Merge with predecessor block? */
1067 if ( ((sp-mask)->type != PGT_none) || ((sp-mask)->order != order) )
1068 break;
1069 list_del(&(sp-mask)->list);
1070 sp -= mask;
1071 } else {
1072 /* Merge with successor block? */
1073 if ( ((sp+mask)->type != PGT_none) || ((sp+mask)->order != order) )
1074 break;
1075 list_del(&(sp+mask)->list);
1076 }
1077 }
1079 sp->order = order;
1080 list_add_tail(&sp->list, &d->arch.paging.shadow.freelists[order]);
1081 }
1083 /* Divert some memory from the pool to be used by the p2m mapping.
1084 * This action is irreversible: the p2m mapping only ever grows.
1085 * That's OK because the p2m table only exists for translated domains,
1086 * and those domains can't ever turn off shadow mode.
1087 * Also, we only ever allocate a max-order chunk, so as to preserve
1088 * the invariant that shadow_prealloc() always works.
1089 * Returns 0 iff it can't get a chunk (the caller should then
1090 * free up some pages in domheap and call sh_set_allocation);
1091 * returns non-zero on success.
1092 */
1093 static int
1094 sh_alloc_p2m_pages(struct domain *d)
1095 {
1096 struct page_info *pg;
1097 u32 i;
1098 unsigned int order = shadow_max_order(d);
1100 ASSERT(shadow_locked_by_me(d));
1102 if ( d->arch.paging.shadow.total_pages
1103 < (shadow_min_acceptable_pages(d) + (1 << order)) )
1104 return 0; /* Not enough shadow memory: need to increase it first */
1106 pg = mfn_to_page(shadow_alloc(d, SH_type_p2m_table, 0));
1107 d->arch.paging.shadow.p2m_pages += (1 << order);
1108 d->arch.paging.shadow.total_pages -= (1 << order);
1109 for (i = 0; i < (1U << order); i++)
1110 {
1111 /* Unlike shadow pages, mark p2m pages as owned by the domain.
1112 * Marking the domain as the owner would normally allow the guest to
1113 * create mappings of these pages, but these p2m pages will never be
1114 * in the domain's guest-physical address space, and so that is not
1115 * believed to be a concern.
1116 */
1117 page_set_owner(&pg[i], d);
1118 pg[i].count_info = 1;
1119 list_add_tail(&pg[i].list, &d->arch.paging.shadow.p2m_freelist);
1120 }
1121 return 1;
1122 }
1124 // Returns 0 if no memory is available...
1125 static struct page_info *
1126 shadow_alloc_p2m_page(struct domain *d)
1127 {
1128 struct list_head *entry;
1129 struct page_info *pg;
1130 mfn_t mfn;
1131 void *p;
1133 shadow_lock(d);
1135 if ( list_empty(&d->arch.paging.shadow.p2m_freelist) &&
1136 !sh_alloc_p2m_pages(d) )
1137 {
1138 shadow_unlock(d);
1139 return NULL;
1140 }
1141 entry = d->arch.paging.shadow.p2m_freelist.next;
1142 list_del(entry);
1144 shadow_unlock(d);
1146 pg = list_entry(entry, struct page_info, list);
1147 mfn = page_to_mfn(pg);
1148 p = sh_map_domain_page(mfn);
1149 clear_page(p);
1150 sh_unmap_domain_page(p);
1152 return pg;
1153 }
1155 static void
1156 shadow_free_p2m_page(struct domain *d, struct page_info *pg)
1157 {
1158 ASSERT(page_get_owner(pg) == d);
1159 /* Should have just the one ref we gave it in alloc_p2m_page() */
1160 if ( (pg->count_info & PGC_count_mask) != 1 )
1161 {
1162 SHADOW_ERROR("Odd p2m page count c=%#x t=%"PRtype_info"\n",
1163 pg->count_info, pg->u.inuse.type_info);
1164 }
1165 pg->count_info = 0;
1166 /* Free should not decrement domain's total allocation, since
1167 * these pages were allocated without an owner. */
1168 page_set_owner(pg, NULL);
1169 free_domheap_pages(pg, 0);
1170 d->arch.paging.shadow.p2m_pages--;
1171 perfc_decr(shadow_alloc_count);
1172 }
1174 #if CONFIG_PAGING_LEVELS == 3
1175 static void p2m_install_entry_in_monitors(struct domain *d,
1176 l3_pgentry_t *l3e)
1177 /* Special case, only used for external-mode domains on PAE hosts:
1178 * update the mapping of the p2m table. Once again, this is trivial in
1179 * other paging modes (one top-level entry points to the top-level p2m,
1180 * no maintenance needed), but PAE makes life difficult by needing a
1181 * copy of the eight l3es of the p2m table in eight l2h slots in the
1182 * monitor table. This function makes fresh copies when a p2m l3e
1183 * changes. */
1184 {
1185 l2_pgentry_t *ml2e;
1186 struct vcpu *v;
1187 unsigned int index;
1189 index = ((unsigned long)l3e & ~PAGE_MASK) / sizeof(l3_pgentry_t);
1190 ASSERT(index < MACHPHYS_MBYTES>>1);
1192 for_each_vcpu(d, v)
1193 {
1194 if ( pagetable_get_pfn(v->arch.monitor_table) == 0 )
1195 continue;
1196 ASSERT(shadow_mode_external(v->domain));
1198 SHADOW_DEBUG(P2M, "d=%u v=%u index=%u mfn=%#lx\n",
1199 d->domain_id, v->vcpu_id, index, l3e_get_pfn(*l3e));
1201 if ( v == current ) /* OK to use linear map of monitor_table */
1202 ml2e = __linear_l2_table + l2_linear_offset(RO_MPT_VIRT_START);
1203 else
1204 {
1205 l3_pgentry_t *ml3e;
1206 ml3e = sh_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
1207 ASSERT(l3e_get_flags(ml3e[3]) & _PAGE_PRESENT);
1208 ml2e = sh_map_domain_page(_mfn(l3e_get_pfn(ml3e[3])));
1209 ml2e += l2_table_offset(RO_MPT_VIRT_START);
1210 sh_unmap_domain_page(ml3e);
1211 }
1212 ml2e[index] = l2e_from_pfn(l3e_get_pfn(*l3e), __PAGE_HYPERVISOR);
1213 if ( v != current )
1214 sh_unmap_domain_page(ml2e);
1215 }
1216 }
1217 #endif
1219 /* Set the pool of shadow pages to the required number of pages.
1220 * Input will be rounded up to at least shadow_min_acceptable_pages(),
1221 * plus space for the p2m table.
1222 * Returns 0 for success, non-zero for failure. */
1223 static unsigned int sh_set_allocation(struct domain *d,
1224 unsigned int pages,
1225 int *preempted)
1226 {
1227 struct shadow_page_info *sp;
1228 unsigned int lower_bound;
1229 unsigned int j, order = shadow_max_order(d);
1231 ASSERT(shadow_locked_by_me(d));
1233 /* Don't allocate less than the minimum acceptable, plus one page per
1234 * megabyte of RAM (for the p2m table) */
1235 lower_bound = shadow_min_acceptable_pages(d) + (d->tot_pages / 256);
1236 if ( pages > 0 && pages < lower_bound )
1237 pages = lower_bound;
1238 /* Round up to largest block size */
1239 pages = (pages + ((1<<SHADOW_MAX_ORDER)-1)) & ~((1<<SHADOW_MAX_ORDER)-1);
1241 SHADOW_PRINTK("current %i target %i\n",
1242 d->arch.paging.shadow.total_pages, pages);
1244 while ( d->arch.paging.shadow.total_pages != pages )
1245 {
1246 if ( d->arch.paging.shadow.total_pages < pages )
1247 {
1248 /* Need to allocate more memory from domheap */
1249 sp = (struct shadow_page_info *)
1250 alloc_domheap_pages(NULL, order, 0);
1251 if ( sp == NULL )
1252 {
1253 SHADOW_PRINTK("failed to allocate shadow pages.\n");
1254 return -ENOMEM;
1255 }
1256 d->arch.paging.shadow.free_pages += 1 << order;
1257 d->arch.paging.shadow.total_pages += 1 << order;
1258 for ( j = 0; j < 1U << order; j++ )
1259 {
1260 sp[j].type = 0;
1261 sp[j].pinned = 0;
1262 sp[j].count = 0;
1263 sp[j].mbz = 0;
1264 sp[j].tlbflush_timestamp = 0; /* Not in any TLB */
1265 }
1266 sp->order = order;
1267 list_add_tail(&sp->list, &d->arch.paging.shadow.freelists[order]);
1268 }
1269 else if ( d->arch.paging.shadow.total_pages > pages )
1270 {
1271 /* Need to return memory to domheap */
1272 _shadow_prealloc(d, order, 1);
1273 ASSERT(!list_empty(&d->arch.paging.shadow.freelists[order]));
1274 sp = list_entry(d->arch.paging.shadow.freelists[order].next,
1275 struct shadow_page_info, list);
1276 list_del(&sp->list);
1277 d->arch.paging.shadow.free_pages -= 1 << order;
1278 d->arch.paging.shadow.total_pages -= 1 << order;
1279 free_domheap_pages((struct page_info *)sp, order);
1280 }
1282 /* Check to see if we need to yield and try again */
1283 if ( preempted && hypercall_preempt_check() )
1284 {
1285 *preempted = 1;
1286 return 0;
1287 }
1288 }
1290 return 0;
1291 }
1293 /* Return the size of the shadow pool, rounded up to the nearest MB */
1294 static unsigned int shadow_get_allocation(struct domain *d)
1295 {
1296 unsigned int pg = d->arch.paging.shadow.total_pages;
1297 return ((pg >> (20 - PAGE_SHIFT))
1298 + ((pg & ((1 << (20 - PAGE_SHIFT)) - 1)) ? 1 : 0));
1299 }
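/* Worked example (editorial note, not part of the original file): with 4k
 * pages, 20 - PAGE_SHIFT == 8, so 256 pages make up one MB; a pool of 300
 * pages therefore reports 2MB (one whole MB plus a partial one, rounded up). */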
1301 /**************************************************************************/
1302 /* Hash table for storing the guest->shadow mappings.
1303 * The table itself is an array of pointers to shadows; the shadows are then
1304 * threaded on a singly-linked list of shadows with the same hash value */
1306 #define SHADOW_HASH_BUCKETS 251
1307 /* Other possibly useful primes are 509, 1021, 2039, 4093, 8191, 16381 */
1309 /* Hash function that takes a gfn or mfn, plus another byte of type info */
1310 typedef u32 key_t;
1311 static inline key_t sh_hash(unsigned long n, unsigned int t)
1312 {
1313 unsigned char *p = (unsigned char *)&n;
1314 key_t k = t;
1315 int i;
1316 for ( i = 0; i < sizeof(n) ; i++ ) k = (u32)p[i] + (k<<6) + (k<<16) - k;
1317 return k % SHADOW_HASH_BUCKETS;
1318 }
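/* Illustrative sketch (editorial addition, not part of the original file):
 * the guest->shadow hash is keyed on (backpointer, shadow type), so e.g.
 * the bucket holding the PAE l1 shadow of a guest frame is found as below.
 * Hypothetical helper, for clarity only. */
static inline key_t sh_example_bucket_of(mfn_t gmfn, unsigned int shadow_type)
{
    /* Result is always in the range [0, SHADOW_HASH_BUCKETS) */
    return sh_hash(mfn_x(gmfn), shadow_type);
}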
1320 #if SHADOW_AUDIT & (SHADOW_AUDIT_HASH|SHADOW_AUDIT_HASH_FULL)
1322 /* Before we get to the mechanism, define a pair of audit functions
1323 * that sanity-check the contents of the hash table. */
1324 static void sh_hash_audit_bucket(struct domain *d, int bucket)
1325 /* Audit one bucket of the hash table */
1326 {
1327 struct shadow_page_info *sp, *x;
1329 if ( !(SHADOW_AUDIT_ENABLE) )
1330 return;
1332 sp = d->arch.paging.shadow.hash_table[bucket];
1333 while ( sp )
1334 {
1335 /* Not a shadow? */
1336 BUG_ON( sp->mbz != 0 );
1337 /* Bogus type? */
1338 BUG_ON( sp->type == 0 );
1339 BUG_ON( sp->type > SH_type_max_shadow );
1340 /* Wrong bucket? */
1341 BUG_ON( sh_hash(sp->backpointer, sp->type) != bucket );
1342 /* Duplicate entry? */
1343 for ( x = sp->next_shadow; x; x = x->next_shadow )
1344 BUG_ON( x->backpointer == sp->backpointer && x->type == sp->type );
1345 /* Follow the backpointer to the guest pagetable */
1346 if ( sp->type != SH_type_fl1_32_shadow
1347 && sp->type != SH_type_fl1_pae_shadow
1348 && sp->type != SH_type_fl1_64_shadow )
1349 {
1350 struct page_info *gpg = mfn_to_page(_mfn(sp->backpointer));
1351 /* Bad shadow flags on guest page? */
1352 BUG_ON( !(gpg->shadow_flags & (1<<sp->type)) );
1353 /* Bad type count on guest page? */
1354 if ( (gpg->u.inuse.type_info & PGT_type_mask) == PGT_writable_page
1355 && (gpg->u.inuse.type_info & PGT_count_mask) != 0 )
1356 {
1357 SHADOW_ERROR("MFN %#lx shadowed (by %#"PRI_mfn")"
1358 " but has typecount %#lx\n",
1359 sp->backpointer, mfn_x(shadow_page_to_mfn(sp)),
1360 gpg->u.inuse.type_info);
1361 BUG();
1362 }
1363 }
1364 /* That entry was OK; on we go */
1365 sp = sp->next_shadow;
1366 }
1367 }
1369 #else
1370 #define sh_hash_audit_bucket(_d, _b) do {} while(0)
1371 #endif /* Hashtable bucket audit */
1374 #if SHADOW_AUDIT & SHADOW_AUDIT_HASH_FULL
1376 static void sh_hash_audit(struct domain *d)
1377 /* Full audit: audit every bucket in the table */
1378 {
1379 int i;
1381 if ( !(SHADOW_AUDIT_ENABLE) )
1382 return;
1384 for ( i = 0; i < SHADOW_HASH_BUCKETS; i++ )
1385 {
1386 sh_hash_audit_bucket(d, i);
1387 }
1388 }
1390 #else
1391 #define sh_hash_audit(_d) do {} while(0)
1392 #endif /* Hashtable bucket audit */
1394 /* Allocate and initialise the table itself.
1395 * Returns 0 for success, 1 for error. */
1396 static int shadow_hash_alloc(struct domain *d)
1397 {
1398 struct shadow_page_info **table;
1400 ASSERT(shadow_locked_by_me(d));
1401 ASSERT(!d->arch.paging.shadow.hash_table);
1403 table = xmalloc_array(struct shadow_page_info *, SHADOW_HASH_BUCKETS);
1404 if ( !table ) return 1;
1405 memset(table, 0,
1406 SHADOW_HASH_BUCKETS * sizeof (struct shadow_page_info *));
1407 d->arch.paging.shadow.hash_table = table;
1408 return 0;
1409 }
1411 /* Tear down the hash table and return all memory to Xen.
1412 * This function does not care whether the table is populated. */
1413 static void shadow_hash_teardown(struct domain *d)
1414 {
1415 ASSERT(shadow_locked_by_me(d));
1416 ASSERT(d->arch.paging.shadow.hash_table);
1418 xfree(d->arch.paging.shadow.hash_table);
1419 d->arch.paging.shadow.hash_table = NULL;
1420 }
1423 mfn_t shadow_hash_lookup(struct vcpu *v, unsigned long n, unsigned int t)
1424 /* Find an entry in the hash table. Returns the MFN of the shadow,
1425 * or INVALID_MFN if it doesn't exist */
1426 {
1427 struct domain *d = v->domain;
1428 struct shadow_page_info *sp, *prev;
1429 key_t key;
1431 ASSERT(shadow_locked_by_me(d));
1432 ASSERT(d->arch.paging.shadow.hash_table);
1433 ASSERT(t);
1435 sh_hash_audit(d);
1437 perfc_incr(shadow_hash_lookups);
1438 key = sh_hash(n, t);
1439 sh_hash_audit_bucket(d, key);
1441 sp = d->arch.paging.shadow.hash_table[key];
1442 prev = NULL;
1443 while(sp)
1444 {
1445 if ( sp->backpointer == n && sp->type == t )
1446 {
1447 /* Pull-to-front if 'sp' isn't already the head item */
1448 if ( unlikely(sp != d->arch.paging.shadow.hash_table[key]) )
1449 {
1450 if ( unlikely(d->arch.paging.shadow.hash_walking != 0) )
1451 /* Can't reorder: someone is walking the hash chains */
1452 return shadow_page_to_mfn(sp);
1453 else
1454 {
1455 ASSERT(prev);
1456 /* Delete sp from the list */
1457 prev->next_shadow = sp->next_shadow;
1458 /* Re-insert it at the head of the list */
1459 sp->next_shadow = d->arch.paging.shadow.hash_table[key];
1460 d->arch.paging.shadow.hash_table[key] = sp;
1461 }
1462 }
1463 else
1464 {
1465 perfc_incr(shadow_hash_lookup_head);
1466 }
1467 return shadow_page_to_mfn(sp);
1468 }
1469 prev = sp;
1470 sp = sp->next_shadow;
1471 }
1473 perfc_incr(shadow_hash_lookup_miss);
1474 return _mfn(INVALID_MFN);
1475 }
1477 void shadow_hash_insert(struct vcpu *v, unsigned long n, unsigned int t,
1478 mfn_t smfn)
1479 /* Put a mapping (n,t)->smfn into the hash table */
1480 {
1481 struct domain *d = v->domain;
1482 struct shadow_page_info *sp;
1483 key_t key;
1485 ASSERT(shadow_locked_by_me(d));
1486 ASSERT(d->arch.paging.shadow.hash_table);
1487 ASSERT(t);
1489 sh_hash_audit(d);
1491 perfc_incr(shadow_hash_inserts);
1492 key = sh_hash(n, t);
1493 sh_hash_audit_bucket(d, key);
1495 /* Insert this shadow at the top of the bucket */
1496 sp = mfn_to_shadow_page(smfn);
1497 sp->next_shadow = d->arch.paging.shadow.hash_table[key];
1498 d->arch.paging.shadow.hash_table[key] = sp;
1500 sh_hash_audit_bucket(d, key);
1501 }
1503 void shadow_hash_delete(struct vcpu *v, unsigned long n, unsigned int t,
1504 mfn_t smfn)
1505 /* Excise the mapping (n,t)->smfn from the hash table */
1506 {
1507 struct domain *d = v->domain;
1508 struct shadow_page_info *sp, *x;
1509 key_t key;
1511 ASSERT(shadow_locked_by_me(d));
1512 ASSERT(d->arch.paging.shadow.hash_table);
1513 ASSERT(t);
1515 sh_hash_audit(d);
1517 perfc_incr(shadow_hash_deletes);
1518 key = sh_hash(n, t);
1519 sh_hash_audit_bucket(d, key);
1521 sp = mfn_to_shadow_page(smfn);
1522 if ( d->arch.paging.shadow.hash_table[key] == sp )
1523 /* Easy case: we're deleting the head item. */
1524 d->arch.paging.shadow.hash_table[key] = sp->next_shadow;
1525 else
1526 {
1527 /* Need to search for the one we want */
1528 x = d->arch.paging.shadow.hash_table[key];
1529 while ( 1 )
1530 {
1531 ASSERT(x); /* We can't have hit the end, since our target is
1532 * still in the chain somewhere... */
1533 if ( x->next_shadow == sp )
1534 {
1535 x->next_shadow = sp->next_shadow;
1536 break;
1537 }
1538 x = x->next_shadow;
1539 }
1540 }
1541 sp->next_shadow = NULL;
1543 sh_hash_audit_bucket(d, key);
1544 }
1546 typedef int (*hash_callback_t)(struct vcpu *v, mfn_t smfn, mfn_t other_mfn);
1548 static void hash_foreach(struct vcpu *v,
1549 unsigned int callback_mask,
1550 hash_callback_t callbacks[],
1551 mfn_t callback_mfn)
1552 /* Walk the hash table looking at the types of the entries and
1553 * calling the appropriate callback function for each entry.
1554 * The mask determines which shadow types we call back for, and the array
1555 * of callbacks tells us which function to call.
1556 * Any callback may return non-zero to let us skip the rest of the scan.
1558 * WARNING: Callbacks MUST NOT add or remove hash entries unless they
1559 * then return non-zero to terminate the scan. */
1560 {
1561 int i, done = 0;
1562 struct domain *d = v->domain;
1563 struct shadow_page_info *x;
1565 /* Say we're here, to stop hash-lookups reordering the chains */
1566 ASSERT(shadow_locked_by_me(d));
1567 ASSERT(d->arch.paging.shadow.hash_walking == 0);
1568 d->arch.paging.shadow.hash_walking = 1;
1570 for ( i = 0; i < SHADOW_HASH_BUCKETS; i++ )
1571 {
1572 /* WARNING: This is not safe against changes to the hash table.
1573 * The callback *must* return non-zero if it has inserted or
1574 * deleted anything from the hash (lookups are OK, though). */
1575 for ( x = d->arch.paging.shadow.hash_table[i]; x; x = x->next_shadow )
1576 {
1577 if ( callback_mask & (1 << x->type) )
1578 {
1579 ASSERT(x->type <= 15);
1580 ASSERT(callbacks[x->type] != NULL);
1581 done = callbacks[x->type](v, shadow_page_to_mfn(x),
1582 callback_mfn);
1583 if ( done ) break;
1584 }
1585 }
1586 if ( done ) break;
1587 }
1588 d->arch.paging.shadow.hash_walking = 0;
1589 }
1592 /**************************************************************************/
1593 /* Destroy a shadow page: simple dispatcher to call the per-type destructor
1594 * which will decrement refcounts appropriately and return memory to the
1595 * free pool. */
1597 void sh_destroy_shadow(struct vcpu *v, mfn_t smfn)
1598 {
1599 struct shadow_page_info *sp = mfn_to_shadow_page(smfn);
1600 unsigned int t = sp->type;
1603 SHADOW_PRINTK("smfn=%#lx\n", mfn_x(smfn));
1605 /* Double-check, if we can, that the shadowed page belongs to this
1606 * domain, (by following the back-pointer). */
1607 ASSERT(t == SH_type_fl1_32_shadow ||
1608 t == SH_type_fl1_pae_shadow ||
1609 t == SH_type_fl1_64_shadow ||
1610 t == SH_type_monitor_table ||
1611 (is_pv_32on64_vcpu(v) && t == SH_type_l4_64_shadow) ||
1612 (page_get_owner(mfn_to_page(_mfn(sp->backpointer)))
1613 == v->domain));
1615 /* The down-shifts here are so that the switch statement is on nice
1616 * small numbers that the compiler will enjoy */
1617 switch ( t )
1618 {
1619 #if CONFIG_PAGING_LEVELS == 2
1620 case SH_type_l1_32_shadow:
1621 case SH_type_fl1_32_shadow:
1622 SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 2, 2)(v, smfn);
1623 break;
1624 case SH_type_l2_32_shadow:
1625 SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 2, 2)(v, smfn);
1626 break;
1627 #else /* PAE or 64bit */
1628 case SH_type_l1_32_shadow:
1629 case SH_type_fl1_32_shadow:
1630 SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 3, 2)(v, smfn);
1631 break;
1632 case SH_type_l2_32_shadow:
1633 SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 3, 2)(v, smfn);
1634 break;
1635 #endif
1637 #if CONFIG_PAGING_LEVELS >= 3
1638 case SH_type_l1_pae_shadow:
1639 case SH_type_fl1_pae_shadow:
1640 SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 3, 3)(v, smfn);
1641 break;
1642 case SH_type_l2_pae_shadow:
1643 case SH_type_l2h_pae_shadow:
1644 SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 3, 3)(v, smfn);
1645 break;
1646 #endif
1648 #if CONFIG_PAGING_LEVELS >= 4
1649 case SH_type_l1_64_shadow:
1650 case SH_type_fl1_64_shadow:
1651 SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 4, 4)(v, smfn);
1652 break;
1653 case SH_type_l2h_64_shadow:
1654 ASSERT(is_pv_32on64_vcpu(v));
1655 /* Fall through... */
1656 case SH_type_l2_64_shadow:
1657 SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 4, 4)(v, smfn);
1658 break;
1659 case SH_type_l3_64_shadow:
1660 SHADOW_INTERNAL_NAME(sh_destroy_l3_shadow, 4, 4)(v, smfn);
1661 break;
1662 case SH_type_l4_64_shadow:
1663 SHADOW_INTERNAL_NAME(sh_destroy_l4_shadow, 4, 4)(v, smfn);
1664 break;
1665 #endif
1666 default:
1667 SHADOW_ERROR("tried to destroy shadow of bad type %08lx\n",
1668 (unsigned long)t);
1669 BUG();
1670 }
1671 }
1673 /**************************************************************************/
1674 /* Remove all writeable mappings of a guest frame from the shadow tables
1675 * Returns non-zero if we need to flush TLBs.
1676 * level and fault_addr describe how we found this to be a pagetable;
1677 * level==0 means we have some other reason for revoking write access. */
1679 int sh_remove_write_access(struct vcpu *v, mfn_t gmfn,
1680 unsigned int level,
1681 unsigned long fault_addr)
1682 {
1683 /* Dispatch table for getting per-type functions */
1684 static hash_callback_t callbacks[SH_type_unused] = {
1685 NULL, /* none */
1686 #if CONFIG_PAGING_LEVELS == 2
1687 SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1,2,2), /* l1_32 */
1688 SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1,2,2), /* fl1_32 */
1689 #else
1690 SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1,3,2), /* l1_32 */
1691 SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1,3,2), /* fl1_32 */
1692 #endif
1693 NULL, /* l2_32 */
1694 #if CONFIG_PAGING_LEVELS >= 3
1695 SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1,3,3), /* l1_pae */
1696 SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1,3,3), /* fl1_pae */
1697 #else
1698 NULL, /* l1_pae */
1699 NULL, /* fl1_pae */
1700 #endif
1701 NULL, /* l2_pae */
1702 NULL, /* l2h_pae */
1703 #if CONFIG_PAGING_LEVELS >= 4
1704 SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1,4,4), /* l1_64 */
1705 SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1,4,4), /* fl1_64 */
1706 #else
1707 NULL, /* l1_64 */
1708 NULL, /* fl1_64 */
1709 #endif
1710 NULL, /* l2_64 */
1711 NULL, /* l2h_64 */
1712 NULL, /* l3_64 */
1713 NULL, /* l4_64 */
1714 NULL, /* p2m */
1715 NULL /* unused */
1716 };
1718 static unsigned int callback_mask =
1719 1 << SH_type_l1_32_shadow
1720 | 1 << SH_type_fl1_32_shadow
1721 | 1 << SH_type_l1_pae_shadow
1722 | 1 << SH_type_fl1_pae_shadow
1723 | 1 << SH_type_l1_64_shadow
1724 | 1 << SH_type_fl1_64_shadow
1725 ;
1726 struct page_info *pg = mfn_to_page(gmfn);
1728 ASSERT(shadow_locked_by_me(v->domain));
1730 /* Only remove writable mappings if we are doing shadow refcounts.
1731 * In guest refcounting, we trust Xen to already be restricting
1732 * all the writes to the guest page tables, so we do not need to
1733 * do more. */
1734 if ( !shadow_mode_refcounts(v->domain) )
1735 return 0;
1737 /* Early exit if it's already a pagetable, or otherwise not writeable */
1738 if ( sh_mfn_is_a_page_table(gmfn)
1739 || (pg->u.inuse.type_info & PGT_count_mask) == 0 )
1740 return 0;
1742 perfc_incr(shadow_writeable);
1744 /* If this isn't a "normal" writeable page, the domain is trying to
1745 * put pagetables in special memory of some kind. We can't allow that. */
1746 if ( (pg->u.inuse.type_info & PGT_type_mask) != PGT_writable_page )
1747 {
1748 SHADOW_ERROR("can't remove write access to mfn %lx, type_info is %"
1749 PRtype_info "\n",
1750 mfn_x(gmfn), mfn_to_page(gmfn)->u.inuse.type_info);
1751 domain_crash(v->domain);
1752 }
1754 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
1755 if ( v == current && level != 0 )
1756 {
1757 unsigned long gfn;
1758 /* Heuristic: there is likely to be only one writeable mapping,
1759 * and that mapping is likely to be in the current pagetable,
1760 * in the guest's linear map (on non-HIGHPTE linux and windows)*/
1762 #define GUESS(_a, _h) do { \
1763 if ( v->arch.paging.mode->shadow.guess_wrmap(v, (_a), gmfn) ) \
1764 perfc_incr(shadow_writeable_h_ ## _h); \
1765 if ( (pg->u.inuse.type_info & PGT_count_mask) == 0 ) \
1766 return 1; \
1767 } while (0)
1770 if ( v->arch.paging.mode->guest_levels == 2 )
1771 {
1772 if ( level == 1 )
1773 /* 32bit non-PAE w2k3: linear map at 0xC0000000 */
1774 GUESS(0xC0000000UL + (fault_addr >> 10), 1);
1776 /* Linux lowmem: first 896MB is mapped 1-to-1 above 0xC0000000 */
1777 if ((gfn = mfn_to_gfn(v->domain, gmfn)) < 0x38000 )
1778 GUESS(0xC0000000UL + (gfn << PAGE_SHIFT), 4);
1779 }
1781 #if CONFIG_PAGING_LEVELS >= 3
1782 else if ( v->arch.paging.mode->guest_levels == 3 )
1783 {
1784 /* 32bit PAE w2k3: linear map at 0xC0000000 */
1785 switch ( level )
1787 case 1: GUESS(0xC0000000UL + (fault_addr >> 9), 2); break;
1788 case 2: GUESS(0xC0600000UL + (fault_addr >> 18), 2); break;
1791 /* Linux lowmem: first 896MB is mapped 1-to-1 above 0xC0000000 */
1792 if ((gfn = mfn_to_gfn(v->domain, gmfn)) < 0x38000 )
1793 GUESS(0xC0000000UL + (gfn << PAGE_SHIFT), 4);
1795 #if CONFIG_PAGING_LEVELS >= 4
1796 else if ( v->arch.paging.mode->guest_levels == 4 )
1798 /* 64bit w2k3: linear map at 0xfffff68000000000 */
1799 switch ( level )
1801 case 1: GUESS(0xfffff68000000000UL
1802 + ((fault_addr & VADDR_MASK) >> 9), 3); break;
1803 case 2: GUESS(0xfffff6fb40000000UL
1804 + ((fault_addr & VADDR_MASK) >> 18), 3); break;
1805 case 3: GUESS(0xfffff6fb7da00000UL
1806 + ((fault_addr & VADDR_MASK) >> 27), 3); break;
1809 /* 64bit Linux direct map at 0xffff810000000000; older kernels
1810 * had it at 0x0000010000000000UL */
1811 gfn = mfn_to_gfn(v->domain, gmfn);
1812 GUESS(0xffff810000000000UL + (gfn << PAGE_SHIFT), 4);
1813 GUESS(0x0000010000000000UL + (gfn << PAGE_SHIFT), 4);
1815 #endif /* CONFIG_PAGING_LEVELS >= 4 */
1816 #endif /* CONFIG_PAGING_LEVELS >= 3 */
1818 #undef GUESS
1821 if ( (pg->u.inuse.type_info & PGT_count_mask) == 0 )
1822 return 1;
1824 /* Second heuristic: on HIGHPTE linux, there are two particular PTEs
1825 * (entries in the fixmap) where linux maps its pagetables. Since
1826 * we expect to hit them most of the time, we start the search for
1827 * the writeable mapping by looking at the same MFN where the last
1828 * brute-force search succeeded. */
1830 if ( v->arch.paging.shadow.last_writeable_pte_smfn != 0 )
1832 unsigned long old_count = (pg->u.inuse.type_info & PGT_count_mask);
1833 mfn_t last_smfn = _mfn(v->arch.paging.shadow.last_writeable_pte_smfn);
1834 int shtype = mfn_to_shadow_page(last_smfn)->type;
1836 if ( callbacks[shtype] )
1837 callbacks[shtype](v, last_smfn, gmfn);
1839 if ( (pg->u.inuse.type_info & PGT_count_mask) != old_count )
1840 perfc_incr(shadow_writeable_h_5);
1843 if ( (pg->u.inuse.type_info & PGT_count_mask) == 0 )
1844 return 1;
1846 #endif /* SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC */
1848 /* Brute-force search of all the shadows, by walking the hash */
1849 perfc_incr(shadow_writeable_bf);
1850 hash_foreach(v, callback_mask, callbacks, gmfn);
1852 /* If that didn't catch the mapping, then there's some non-pagetable
1853 * mapping -- ioreq page, grant mapping, &c. */
1854 if ( (mfn_to_page(gmfn)->u.inuse.type_info & PGT_count_mask) != 0 )
1856 SHADOW_ERROR("can't remove write access to mfn %lx: guest has "
1857 "%lu special-use mappings of it\n", mfn_x(gmfn),
1858 (mfn_to_page(gmfn)->u.inuse.type_info&PGT_count_mask));
1859 domain_crash(v->domain);
1862 /* We killed at least one writeable mapping, so must flush TLBs. */
1863 return 1;
1868 /**************************************************************************/
1869 /* Remove all mappings of a guest frame from the shadow tables.
1870 * Returns non-zero if we need to flush TLBs. */
1872 int sh_remove_all_mappings(struct vcpu *v, mfn_t gmfn)
1874 struct page_info *page = mfn_to_page(gmfn);
1875 int expected_count, do_locking;
1877 /* Dispatch table for getting per-type functions */
1878 static hash_callback_t callbacks[SH_type_unused] = {
1879 NULL, /* none */
1880 #if CONFIG_PAGING_LEVELS == 2
1881 SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1,2,2), /* l1_32 */
1882 SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1,2,2), /* fl1_32 */
1883 #else
1884 SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1,3,2), /* l1_32 */
1885 SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1,3,2), /* fl1_32 */
1886 #endif
1887 NULL, /* l2_32 */
1888 #if CONFIG_PAGING_LEVELS >= 3
1889 SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1,3,3), /* l1_pae */
1890 SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1,3,3), /* fl1_pae */
1891 #else
1892 NULL, /* l1_pae */
1893 NULL, /* fl1_pae */
1894 #endif
1895 NULL, /* l2_pae */
1896 NULL, /* l2h_pae */
1897 #if CONFIG_PAGING_LEVELS >= 4
1898 SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1,4,4), /* l1_64 */
1899 SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1,4,4), /* fl1_64 */
1900 #else
1901 NULL, /* l1_64 */
1902 NULL, /* fl1_64 */
1903 #endif
1904 NULL, /* l2_64 */
1905 NULL, /* l2h_64 */
1906 NULL, /* l3_64 */
1907 NULL, /* l4_64 */
1908 NULL, /* p2m */
1909 NULL /* unused */
1910 };
1912 static unsigned int callback_mask =
1913 1 << SH_type_l1_32_shadow
1914 | 1 << SH_type_fl1_32_shadow
1915 | 1 << SH_type_l1_pae_shadow
1916 | 1 << SH_type_fl1_pae_shadow
1917 | 1 << SH_type_l1_64_shadow
1918 | 1 << SH_type_fl1_64_shadow
1921 perfc_incr(shadow_mappings);
1922 if ( (page->count_info & PGC_count_mask) == 0 )
1923 return 0;
1925 /* Although this is an externally visible function, we do not know
1926 * whether the shadow lock will be held when it is called (since it
1927 * can be called via put_page_type when we clear a shadow l1e).
1928 * If the lock isn't held, take it for the duration of the call. */
1929 do_locking = !shadow_locked_by_me(v->domain);
1930 if ( do_locking ) shadow_lock(v->domain);
1932 /* XXX TODO:
1933 * Heuristics for finding the (probably) single mapping of this gmfn */
1935 /* Brute-force search of all the shadows, by walking the hash */
1936 perfc_incr(shadow_mappings_bf);
1937 hash_foreach(v, callback_mask, callbacks, gmfn);
1939 /* If that didn't catch the mapping, something is very wrong */
1940 expected_count = (page->count_info & PGC_allocated) ? 1 : 0;
1941 if ( (page->count_info & PGC_count_mask) != expected_count )
1943 /* Don't complain if we're in HVM and there are some extra mappings: the
1944 * qemu helper process has an untyped mapping of this dom's RAM and the HVM
1945 * restore program takes another (hence up to 3 untyped refs are allowed). */
1946 if ( !(shadow_mode_external(v->domain)
1947 && (page->count_info & PGC_count_mask) <= 3
1948 && (page->u.inuse.type_info & PGT_count_mask) == 0) )
1950 SHADOW_ERROR("can't find all mappings of mfn %lx: "
1951 "c=%08x t=%08lx\n", mfn_x(gmfn),
1952 page->count_info, page->u.inuse.type_info);
1956 if ( do_locking ) shadow_unlock(v->domain);
1958 /* We killed at least one mapping, so must flush TLBs. */
1959 return 1;
1963 /**************************************************************************/
1964 /* Remove all shadows of a guest frame from the shadow tables */
1966 static int sh_remove_shadow_via_pointer(struct vcpu *v, mfn_t smfn)
1967 /* Follow this shadow's up-pointer, if it has one, and remove the reference
1968 * found there. Returns 1 if that was the only reference to this shadow */
1970 struct shadow_page_info *sp = mfn_to_shadow_page(smfn);
1971 mfn_t pmfn;
1972 void *vaddr;
1973 int rc;
1975 ASSERT(sp->type > 0);
1976 ASSERT(sp->type < SH_type_max_shadow);
1977 ASSERT(sp->type != SH_type_l2_32_shadow);
1978 ASSERT(sp->type != SH_type_l2_pae_shadow);
1979 ASSERT(sp->type != SH_type_l2h_pae_shadow);
1980 ASSERT(sp->type != SH_type_l4_64_shadow);
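/* sp->up encodes the single "up" reference to this shadow: the MFN of the
 * parent shadow page in the high bits (up >> PAGE_SHIFT) and the byte
 * offset of the entry that points at us in the low bits
 * (up & (PAGE_SIZE-1)), as unpacked just below. */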
1982 if (sp->up == 0) return 0;
1983 pmfn = _mfn(sp->up >> PAGE_SHIFT);
1984 ASSERT(mfn_valid(pmfn));
1985 vaddr = sh_map_domain_page(pmfn);
1986 ASSERT(vaddr);
1987 vaddr += sp->up & (PAGE_SIZE-1);
1988 ASSERT(l1e_get_pfn(*(l1_pgentry_t *)vaddr) == mfn_x(smfn));
1990 /* Is this the only reference to this shadow? */
1991 rc = (sp->count == 1) ? 1 : 0;
1993 /* Blank the offending entry */
1994 switch (sp->type)
1996 case SH_type_l1_32_shadow:
1997 case SH_type_l2_32_shadow:
1998 #if CONFIG_PAGING_LEVELS == 2
1999 SHADOW_INTERNAL_NAME(sh_clear_shadow_entry,2,2)(v, vaddr, pmfn);
2000 #else
2001 SHADOW_INTERNAL_NAME(sh_clear_shadow_entry,3,2)(v, vaddr, pmfn);
2002 #endif
2003 break;
2004 #if CONFIG_PAGING_LEVELS >=3
2005 case SH_type_l1_pae_shadow:
2006 case SH_type_l2_pae_shadow:
2007 case SH_type_l2h_pae_shadow:
2008 SHADOW_INTERNAL_NAME(sh_clear_shadow_entry,3,3)(v, vaddr, pmfn);
2009 break;
2010 #if CONFIG_PAGING_LEVELS >= 4
2011 case SH_type_l1_64_shadow:
2012 case SH_type_l2_64_shadow:
2013 case SH_type_l2h_64_shadow:
2014 case SH_type_l3_64_shadow:
2015 case SH_type_l4_64_shadow:
2016 SHADOW_INTERNAL_NAME(sh_clear_shadow_entry,4,4)(v, vaddr, pmfn);
2017 break;
2018 #endif
2019 #endif
2020 default: BUG(); /* Some weird unknown shadow type */
2023 sh_unmap_domain_page(vaddr);
2024 if ( rc )
2025 perfc_incr(shadow_up_pointer);
2026 else
2027 perfc_incr(shadow_unshadow_bf);
2029 return rc;
2032 void sh_remove_shadows(struct vcpu *v, mfn_t gmfn, int fast, int all)
2033 /* Remove the shadows of this guest page.
2034 * If fast != 0, just try the quick heuristic, which will remove
2035 * at most one reference to each shadow of the page. Otherwise, walk
2036 * all the shadow tables looking for refs to shadows of this gmfn.
2037 * If all != 0, kill the domain if we can't find all the shadows.
2038 * (all != 0 implies fast == 0)
2039 */
2041 struct page_info *pg = mfn_to_page(gmfn);
2042 mfn_t smfn;
2043 int do_locking;
2044 unsigned char t;
2046 /* Dispatch table for getting per-type functions: each level must
2047 * be called with the function to remove a lower-level shadow. */
2048 static hash_callback_t callbacks[SH_type_unused] = {
2049 NULL, /* none */
2050 NULL, /* l1_32 */
2051 NULL, /* fl1_32 */
2052 #if CONFIG_PAGING_LEVELS == 2
2053 SHADOW_INTERNAL_NAME(sh_remove_l1_shadow,2,2), /* l2_32 */
2054 #else
2055 SHADOW_INTERNAL_NAME(sh_remove_l1_shadow,3,2), /* l2_32 */
2056 #endif
2057 NULL, /* l1_pae */
2058 NULL, /* fl1_pae */
2059 #if CONFIG_PAGING_LEVELS >= 3
2060 SHADOW_INTERNAL_NAME(sh_remove_l1_shadow,3,3), /* l2_pae */
2061 SHADOW_INTERNAL_NAME(sh_remove_l1_shadow,3,3), /* l2h_pae */
2062 #else
2063 NULL, /* l2_pae */
2064 NULL, /* l2h_pae */
2065 #endif
2066 NULL, /* l1_64 */
2067 NULL, /* fl1_64 */
2068 #if CONFIG_PAGING_LEVELS >= 4
2069 SHADOW_INTERNAL_NAME(sh_remove_l1_shadow,4,4), /* l2_64 */
2070 SHADOW_INTERNAL_NAME(sh_remove_l1_shadow,4,4), /* l2h_64 */
2071 SHADOW_INTERNAL_NAME(sh_remove_l2_shadow,4,4), /* l3_64 */
2072 SHADOW_INTERNAL_NAME(sh_remove_l3_shadow,4,4), /* l4_64 */
2073 #else
2074 NULL, /* l2_64 */
2075 NULL, /* l2h_64 */
2076 NULL, /* l3_64 */
2077 NULL, /* l4_64 */
2078 #endif
2079 NULL, /* p2m */
2080 NULL /* unused */
2081 };
2083 /* Another lookup table, for choosing which mask to use */
2084 static unsigned int masks[SH_type_unused] = {
2085 0, /* none */
2086 1 << SH_type_l2_32_shadow, /* l1_32 */
2087 0, /* fl1_32 */
2088 0, /* l2_32 */
2089 ((1 << SH_type_l2h_pae_shadow)
2090 | (1 << SH_type_l2_pae_shadow)), /* l1_pae */
2091 0, /* fl1_pae */
2092 0, /* l2_pae */
2093 0, /* l2h_pae */
2094 ((1 << SH_type_l2h_64_shadow)
2095 | (1 << SH_type_l2_64_shadow)), /* l1_64 */
2096 0, /* fl1_64 */
2097 1 << SH_type_l3_64_shadow, /* l2_64 */
2098 1 << SH_type_l3_64_shadow, /* l2h_64 */
2099 1 << SH_type_l4_64_shadow, /* l3_64 */
2100 0, /* l4_64 */
2101 0, /* p2m */
2102 0 /* unused */
2103 };
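/* masks[t] is the set of shadow types that can hold a reference to a
 * type-t shadow (i.e. its possible parents), so the brute-force hash walk
 * in DO_UNSHADOW below only has to visit those types. */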
2105 ASSERT(!(all && fast));
2107 /* Although this is an externally visible function, we do not know
2108 * whether the shadow lock will be held when it is called (since it
2109 * can be called via put_page_type when we clear a shadow l1e).
2110 * If the lock isn't held, take it for the duration of the call. */
2111 do_locking = !shadow_locked_by_me(v->domain);
2112 if ( do_locking ) shadow_lock(v->domain);
2114 SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx\n",
2115 v->domain->domain_id, v->vcpu_id, mfn_x(gmfn));
2117 /* Bail out now if the page is not shadowed */
2118 if ( (pg->count_info & PGC_page_table) == 0 )
2120 if ( do_locking ) shadow_unlock(v->domain);
2121 return;
2124 /* Search for this shadow in all appropriate shadows */
2125 perfc_incr(shadow_unshadow);
2127 /* Lower-level shadows need to be excised from upper-level shadows.
2128 * This call to hash_foreach() looks dangerous but is in fact OK: each
2129 * call will remove at most one shadow, and terminate immediately when
2130 * it does remove it, so we never walk the hash after doing a deletion. */
2131 #define DO_UNSHADOW(_type) do { \
2132 t = (_type); \
2133 if( !(pg->count_info & PGC_page_table) \
2134 || !(pg->shadow_flags & (1 << t)) ) \
2135 break; \
2136 smfn = shadow_hash_lookup(v, mfn_x(gmfn), t); \
2137 if ( unlikely(!mfn_valid(smfn)) ) \
2138 { \
2139 SHADOW_ERROR(": gmfn %#lx has flags 0x%"PRIx32 \
2140 " but no type-0x%"PRIx32" shadow\n", \
2141 mfn_x(gmfn), (uint32_t)pg->shadow_flags, t); \
2142 break; \
2143 } \
2144 if ( sh_type_is_pinnable(v, t) ) \
2145 sh_unpin(v, smfn); \
2146 else \
2147 sh_remove_shadow_via_pointer(v, smfn); \
2148 if( !fast \
2149 && (pg->count_info & PGC_page_table) \
2150 && (pg->shadow_flags & (1 << t)) ) \
2151 hash_foreach(v, masks[t], callbacks, smfn); \
2152 } while (0)
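/* DO_UNSHADOW(t): if gmfn has a type-t shadow, look it up in the hash and
 * drop its main reference (unpin it if it is a pinnable top-level shadow,
 * otherwise clear the parent entry found via its up-pointer); then, unless
 * we are in fast mode, sweep any remaining parents with hash_foreach()
 * restricted to masks[t]. */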
2154 DO_UNSHADOW(SH_type_l2_32_shadow);
2155 DO_UNSHADOW(SH_type_l1_32_shadow);
2156 #if CONFIG_PAGING_LEVELS >= 3
2157 DO_UNSHADOW(SH_type_l2h_pae_shadow);
2158 DO_UNSHADOW(SH_type_l2_pae_shadow);
2159 DO_UNSHADOW(SH_type_l1_pae_shadow);
2160 #if CONFIG_PAGING_LEVELS >= 4
2161 DO_UNSHADOW(SH_type_l4_64_shadow);
2162 DO_UNSHADOW(SH_type_l3_64_shadow);
2163 DO_UNSHADOW(SH_type_l2h_64_shadow);
2164 DO_UNSHADOW(SH_type_l2_64_shadow);
2165 DO_UNSHADOW(SH_type_l1_64_shadow);
2166 #endif
2167 #endif
2169 #undef DO_UNSHADOW
2171 /* If that didn't catch the shadows, something is wrong */
2172 if ( !fast && (pg->count_info & PGC_page_table) )
2174 SHADOW_ERROR("can't find all shadows of mfn %05lx "
2175 "(shadow_flags=%08lx)\n",
2176 mfn_x(gmfn), pg->shadow_flags);
2177 if ( all )
2178 domain_crash(v->domain);
2181 /* Need to flush TLBs now, so that linear maps are safe next time we
2182 * take a fault. */
2183 flush_tlb_mask(v->domain->domain_dirty_cpumask);
2185 if ( do_locking ) shadow_unlock(v->domain);
2188 static void
2189 sh_remove_all_shadows_and_parents(struct vcpu *v, mfn_t gmfn)
2190 /* Even harsher: this is an HVM page that we think is no longer a pagetable.
2191 * Unshadow it, and recursively unshadow pages that reference it. */
2193 sh_remove_shadows(v, gmfn, 0, 1);
2194 /* XXX TODO:
2195 * Rework this hashtable walker to return a linked-list of all
2196 * the shadows it modified, then do breadth-first recursion
2197 * to find the way up to higher-level tables and unshadow them too.
2199 * The current code (just tearing down each page's shadows as we
2200 * detect that it is not a pagetable) is correct, but very slow.
2201 * It means extra emulated writes and slows down removal of mappings. */
2204 /**************************************************************************/
2206 static void sh_update_paging_modes(struct vcpu *v)
2208 struct domain *d = v->domain;
2209 struct paging_mode *old_mode = v->arch.paging.mode;
2211 ASSERT(shadow_locked_by_me(d));
2213 #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
2214 /* Make sure this vcpu has a virtual TLB array allocated */
2215 if ( unlikely(!v->arch.paging.vtlb) )
2217 v->arch.paging.vtlb = xmalloc_array(struct shadow_vtlb, VTLB_ENTRIES);
2218 if ( unlikely(!v->arch.paging.vtlb) )
2220 SHADOW_ERROR("Could not allocate vTLB space for dom %u vcpu %u\n",
2221 d->domain_id, v->vcpu_id);
2222 domain_crash(v->domain);
2223 return;
2225 memset(v->arch.paging.vtlb, 0,
2226 VTLB_ENTRIES * sizeof (struct shadow_vtlb));
2227 spin_lock_init(&v->arch.paging.vtlb_lock);
2229 #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */
2231 // Valid transitions handled by this function:
2232 // - For PV guests:
2233 // - after a shadow mode has been changed
2234 // - For HVM guests:
2235 // - after a shadow mode has been changed
2236 // - changes in CR0.PG, CR4.PAE, CR4.PSE, or CR4.PGE
2237 //
2239 // First, tear down any old shadow tables held by this vcpu.
2240 //
2241 if ( v->arch.paging.mode )
2242 v->arch.paging.mode->shadow.detach_old_tables(v);
2244 if ( !is_hvm_domain(d) )
2246 ///
2247 /// PV guest
2248 ///
2249 #if CONFIG_PAGING_LEVELS == 4
2250 v->arch.paging.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,4,4);
2251 #elif CONFIG_PAGING_LEVELS == 3
2252 v->arch.paging.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,3,3);
2253 #elif CONFIG_PAGING_LEVELS == 2
2254 v->arch.paging.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,2,2);
2255 #else
2256 #error unexpected paging mode
2257 #endif
2259 else
2261 ///
2262 /// HVM guest
2263 ///
2264 ASSERT(shadow_mode_translate(d));
2265 ASSERT(shadow_mode_external(d));
2267 if ( !hvm_paging_enabled(v) )
2269 /* When the guest has CR0.PG clear, we provide a 32-bit, non-PAE
2270 * pagetable for it, mapping 4 GB one-to-one using a single l2
2271 * page of 1024 superpage mappings */
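/* (That unpaged pagetable is built in shadow_enable() below: a single page
 * of 32-bit PSE entries identity-mapping the bottom 4GB.) */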
2272 v->arch.guest_table = d->arch.paging.shadow.unpaged_pagetable;
2273 #if CONFIG_PAGING_LEVELS >= 3
2274 v->arch.paging.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode, 3, 2);
2275 #else
2276 v->arch.paging.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode, 2, 2);
2277 #endif
2279 else
2281 #ifdef __x86_64__
2282 if ( hvm_long_mode_enabled(v) )
2284 // long mode guest...
2285 v->arch.paging.mode =
2286 &SHADOW_INTERNAL_NAME(sh_paging_mode, 4, 4);
2288 else
2289 #endif
2290 if ( hvm_pae_enabled(v) )
2292 #if CONFIG_PAGING_LEVELS >= 3
2293 // 32-bit PAE mode guest...
2294 v->arch.paging.mode =
2295 &SHADOW_INTERNAL_NAME(sh_paging_mode, 3, 3);
2296 #else
2297 SHADOW_ERROR("PAE not supported in 32-bit Xen\n");
2298 domain_crash(d);
2299 return;
2300 #endif
2302 else
2304 // 32-bit 2 level guest...
2305 #if CONFIG_PAGING_LEVELS >= 3
2306 v->arch.paging.mode =
2307 &SHADOW_INTERNAL_NAME(sh_paging_mode, 3, 2);
2308 #else
2309 v->arch.paging.mode =
2310 &SHADOW_INTERNAL_NAME(sh_paging_mode, 2, 2);
2311 #endif
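/* External-mode (HVM) vcpus need a Xen-owned "monitor" pagetable for the
 * hypervisor to run on while servicing them; build one the first time it
 * is needed and tell the HVM code where the new host CR3 lives. */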
2315 if ( pagetable_is_null(v->arch.monitor_table) )
2317 mfn_t mmfn = v->arch.paging.mode->shadow.make_monitor_table(v);
2318 v->arch.monitor_table = pagetable_from_mfn(mmfn);
2319 make_cr3(v, mfn_x(mmfn));
2320 hvm_update_host_cr3(v);
2323 if ( v->arch.paging.mode != old_mode )
2325 SHADOW_PRINTK("new paging mode: d=%u v=%u pe=%d g=%u s=%u "
2326 "(was g=%u s=%u)\n",
2327 d->domain_id, v->vcpu_id,
2328 is_hvm_domain(d) ? hvm_paging_enabled(v) : 1,
2329 v->arch.paging.mode->guest_levels,
2330 v->arch.paging.mode->shadow.shadow_levels,
2331 old_mode ? old_mode->guest_levels : 0,
2332 old_mode ? old_mode->shadow.shadow_levels : 0);
2333 if ( old_mode &&
2334 (v->arch.paging.mode->shadow.shadow_levels !=
2335 old_mode->shadow.shadow_levels) )
2337 /* Need to make a new monitor table for the new mode */
2338 mfn_t new_mfn, old_mfn;
2340 if ( v != current && vcpu_runnable(v) )
2342 SHADOW_ERROR("Some third party (d=%u v=%u) is changing "
2343 "this HVM vcpu's (d=%u v=%u) paging mode "
2344 "while it is running.\n",
2345 current->domain->domain_id, current->vcpu_id,
2346 v->domain->domain_id, v->vcpu_id);
2347 /* It's not safe to do that because we can't change
2348 * the host CR3 for a running domain */
2349 domain_crash(v->domain);
2350 return;
2353 old_mfn = pagetable_get_mfn(v->arch.monitor_table);
2354 v->arch.monitor_table = pagetable_null();
2355 new_mfn = v->arch.paging.mode->shadow.make_monitor_table(v);
2356 v->arch.monitor_table = pagetable_from_mfn(new_mfn);
2357 SHADOW_PRINTK("new monitor table %"PRI_mfn "\n",
2358 mfn_x(new_mfn));
2360 /* Don't be running on the old monitor table when we
2361 * pull it down! Switch CR3, and warn the HVM code that
2362 * its host cr3 has changed. */
2363 make_cr3(v, mfn_x(new_mfn));
2364 if ( v == current )
2365 write_ptbase(v);
2366 hvm_update_host_cr3(v);
2367 old_mode->shadow.destroy_monitor_table(v, old_mfn);
2371 // XXX -- Need to deal with changes in CR4.PSE and CR4.PGE.
2372 // These are HARD: think about the case where two CPUs have
2373 // different values for CR4.PSE and CR4.PGE at the same time.
2374 // This *does* happen, at least for CR4.PGE...
2377 v->arch.paging.mode->update_cr3(v, 0);
2380 void shadow_update_paging_modes(struct vcpu *v)
2382 shadow_lock(v->domain);
2383 sh_update_paging_modes(v);
2384 shadow_unlock(v->domain);
2387 /**************************************************************************/
2388 /* Turning on and off shadow features */
2390 static void sh_new_mode(struct domain *d, u32 new_mode)
2391 /* Inform all the vcpus that the shadow mode has been changed */
2393 struct vcpu *v;
2395 ASSERT(shadow_locked_by_me(d));
2396 ASSERT(d != current->domain);
2397 d->arch.paging.mode = new_mode;
2398 for_each_vcpu(d, v)
2399 sh_update_paging_modes(v);
2402 int shadow_enable(struct domain *d, u32 mode)
2403 /* Turn on "permanent" shadow features: external, translate, refcount.
2404 * Can only be called once on a domain, and these features cannot be
2405 * disabled.
2406 * Returns 0 for success, -errno for failure. */
2408 unsigned int old_pages;
2409 struct page_info *pg = NULL;
2410 uint32_t *e;
2411 int i, rv = 0;
2413 mode |= PG_SH_enable;
2415 domain_pause(d);
2417 /* Sanity check the arguments */
2418 if ( (d == current->domain) ||
2419 shadow_mode_enabled(d) ||
2420 ((mode & PG_translate) && !(mode & PG_refcounts)) ||
2421 ((mode & PG_external) && !(mode & PG_translate)) )
2423 rv = -EINVAL;
2424 goto out_unlocked;
2427 /* Init the shadow memory allocation if the user hasn't done so */
2428 old_pages = d->arch.paging.shadow.total_pages;
2429 if ( old_pages == 0 )
2431 unsigned int r;
2432 shadow_lock(d);
2433 r = sh_set_allocation(d, 256, NULL); /* Use at least 1MB */
2434 if ( r != 0 )
2436 sh_set_allocation(d, 0, NULL);
2437 rv = -ENOMEM;
2438 goto out_locked;
2440 shadow_unlock(d);
2443 /* Init the P2M table. Must be done before we take the shadow lock
2444 * to avoid possible deadlock. */
2445 if ( mode & PG_translate )
2447 rv = p2m_alloc_table(d, shadow_alloc_p2m_page, shadow_free_p2m_page);
2448 if (rv != 0)
2449 goto out_unlocked;
2452 /* HVM domains need an extra pagetable for vcpus that think they
2453 * have paging disabled */
2454 if ( is_hvm_domain(d) )
2456 /* Get a single page from the shadow pool. Take it via the
2457 * P2M interface to make freeing it simpler afterwards. */
2458 pg = shadow_alloc_p2m_page(d);
2459 if ( pg == NULL )
2461 rv = -ENOMEM;
2462 goto out_unlocked;
2464 /* Fill it with 32-bit, non-PAE superpage entries, each mapping 4MB
2465 * of virtual address space onto the same physical address range */
2466 e = sh_map_domain_page(page_to_mfn(pg));
2467 for ( i = 0; i < PAGE_SIZE / sizeof(*e); i++ )
2468 e[i] = ((0x400000U * i)
2469 | _PAGE_PRESENT | _PAGE_RW | _PAGE_USER
2470 | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
2471 sh_unmap_domain_page(e);
2472 pg->u.inuse.type_info = PGT_l2_page_table | 1 | PGT_validated;
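/* (The type_info above marks the page as an already-validated l2 pagetable
 * holding one type reference, so the shadow code treats it like any other
 * guest l2.) */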
2475 shadow_lock(d);
2477 /* Sanity check again with the lock held */
2478 if ( shadow_mode_enabled(d) )
2480 rv = -EINVAL;
2481 goto out_locked;
2484 /* Init the hash table */
2485 if ( shadow_hash_alloc(d) != 0 )
2487 rv = -ENOMEM;
2488 goto out_locked;
2491 #if (SHADOW_OPTIMIZATIONS & SHOPT_LINUX_L3_TOPLEVEL)
2492 /* We assume we're dealing with an older 64bit linux guest until we
2493 * see the guest use more than one l4 per vcpu. */
2494 d->arch.paging.shadow.opt_flags = SHOPT_LINUX_L3_TOPLEVEL;
2495 #endif
2497 /* Record the 1-to-1 pagetable we just made */
2498 if ( is_hvm_domain(d) )
2499 d->arch.paging.shadow.unpaged_pagetable = pagetable_from_page(pg);
2501 /* Update the bits */
2502 sh_new_mode(d, mode);
2504 out_locked:
2505 shadow_unlock(d);
2506 out_unlocked:
2507 if ( rv != 0 && !pagetable_is_null(d->arch.phys_table) )
2508 p2m_teardown(d);
2509 if ( rv != 0 && pg != NULL )
2510 shadow_free_p2m_page(d, pg);
2511 domain_unpause(d);
2512 return rv;
2515 void shadow_teardown(struct domain *d)
2516 /* Destroy the shadow pagetables of this domain and free its shadow memory.
2517 * Should only be called for dying domains. */
2519 struct vcpu *v;
2520 mfn_t mfn;
2521 struct list_head *entry, *n;
2522 struct page_info *pg;
2524 ASSERT(d->is_dying);
2525 ASSERT(d != current->domain);
2527 if ( !shadow_locked_by_me(d) )
2528 shadow_lock(d); /* Keep various asserts happy */
2530 if ( shadow_mode_enabled(d) )
2532 /* Release the shadow and monitor tables held by each vcpu */
2533 for_each_vcpu(d, v)
2535 if ( v->arch.paging.mode )
2537 v->arch.paging.mode->shadow.detach_old_tables(v);
2538 if ( shadow_mode_external(d) )
2540 mfn = pagetable_get_mfn(v->arch.monitor_table);
2541 if ( mfn_valid(mfn) && (mfn_x(mfn) != 0) )
2542 v->arch.paging.mode->shadow.destroy_monitor_table(v, mfn);
2543 v->arch.monitor_table = pagetable_null();
2549 #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
2550 /* Free the virtual-TLB array attached to each vcpu */
2551 for_each_vcpu(d, v)
2553 if ( v->arch.paging.vtlb )
2555 xfree(v->arch.paging.vtlb);
2556 v->arch.paging.vtlb = NULL;
2559 #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */
2561 list_for_each_safe(entry, n, &d->arch.paging.shadow.p2m_freelist)
2563 list_del(entry);
2564 pg = list_entry(entry, struct page_info, list);
2565 shadow_free_p2m_page(d, pg);
2568 if ( d->arch.paging.shadow.total_pages != 0 )
2570 SHADOW_PRINTK("teardown of domain %u starts."
2571 " Shadow pages total = %u, free = %u, p2m=%u\n",
2572 d->domain_id,
2573 d->arch.paging.shadow.total_pages,
2574 d->arch.paging.shadow.free_pages,
2575 d->arch.paging.shadow.p2m_pages);
2576 /* Destroy all the shadows and release memory to domheap */
2577 sh_set_allocation(d, 0, NULL);
2578 /* Release the hash table back to xenheap */
2579 if (d->arch.paging.shadow.hash_table)
2580 shadow_hash_teardown(d);
2581 /* Should not have any more memory held */
2582 SHADOW_PRINTK("teardown done."
2583 " Shadow pages total = %u, free = %u, p2m=%u\n",
2584 d->arch.paging.shadow.total_pages,
2585 d->arch.paging.shadow.free_pages,
2586 d->arch.paging.shadow.p2m_pages);
2587 ASSERT(d->arch.paging.shadow.total_pages == 0);
2590 /* Free the non-paged-vcpus pagetable; must happen after we've
2591 * destroyed any shadows of it or sh_destroy_shadow will get confused. */
2592 if ( !pagetable_is_null(d->arch.paging.shadow.unpaged_pagetable) )
2594 for_each_vcpu(d, v)
2596 ASSERT(is_hvm_vcpu(v));
2597 if ( !hvm_paging_enabled(v) )
2598 v->arch.guest_table = pagetable_null();
2600 shadow_free_p2m_page(d,
2601 pagetable_get_page(d->arch.paging.shadow.unpaged_pagetable));
2602 d->arch.paging.shadow.unpaged_pagetable = pagetable_null();
2605 /* We leave the "permanent" shadow modes enabled, but clear the
2606 * log-dirty mode bit. We don't want any more mark_dirty()
2607 * calls now that we've torn down the bitmap */
2608 d->arch.paging.mode &= ~PG_log_dirty;
2610 shadow_unlock(d);
2613 void shadow_final_teardown(struct domain *d)
2614 /* Called by arch_domain_destroy(), when it's safe to pull down the p2m map. */
2616 SHADOW_PRINTK("dom %u final teardown starts."
2617 " Shadow pages total = %u, free = %u, p2m=%u\n",
2618 d->domain_id,
2619 d->arch.paging.shadow.total_pages,
2620 d->arch.paging.shadow.free_pages,
2621 d->arch.paging.shadow.p2m_pages);
2623 /* Double-check that the domain didn't have any shadow memory.
2624 * It is possible for a domain that never got domain_kill()ed
2625 * to get here with its shadow allocation intact. */
2626 if ( d->arch.paging.shadow.total_pages != 0 )
2627 shadow_teardown(d);
2629 /* It is now safe to pull down the p2m map. */
2630 p2m_teardown(d);
2632 SHADOW_PRINTK("dom %u final teardown done."
2633 " Shadow pages total = %u, free = %u, p2m=%u\n",
2634 d->domain_id,
2635 d->arch.paging.shadow.total_pages,
2636 d->arch.paging.shadow.free_pages,
2637 d->arch.paging.shadow.p2m_pages);
2640 static int shadow_one_bit_enable(struct domain *d, u32 mode)
2641 /* Turn on a single shadow mode feature */
2643 ASSERT(shadow_locked_by_me(d));
2645 /* Sanity check the call */
2646 if ( d == current->domain || (d->arch.paging.mode & mode) == mode )
2648 return -EINVAL;
2651 mode |= PG_SH_enable;
2653 if ( d->arch.paging.mode == 0 )
2655 /* Init the shadow memory allocation and the hash table */
2656 if ( sh_set_allocation(d, 1, NULL) != 0
2657 || shadow_hash_alloc(d) != 0 )
2659 sh_set_allocation(d, 0, NULL);
2660 return -ENOMEM;
2664 /* Update the bits */
2665 sh_new_mode(d, d->arch.paging.mode | mode);
2667 return 0;
2670 static int shadow_one_bit_disable(struct domain *d, u32 mode)
2671 /* Turn off a single shadow mode feature */
2673 struct vcpu *v;
2674 ASSERT(shadow_locked_by_me(d));
2676 /* Sanity check the call */
2677 if ( d == current->domain || !((d->arch.paging.mode & mode) == mode) )
2679 return -EINVAL;
2682 /* Update the bits */
2683 sh_new_mode(d, d->arch.paging.mode & ~mode);
2684 if ( d->arch.paging.mode == 0 )
2686 /* Get this domain off shadows */
2687 SHADOW_PRINTK("un-shadowing of domain %u starts."
2688 " Shadow pages total = %u, free = %u, p2m=%u\n",
2689 d->domain_id,
2690 d->arch.paging.shadow.total_pages,
2691 d->arch.paging.shadow.free_pages,
2692 d->arch.paging.shadow.p2m_pages);
2693 for_each_vcpu(d, v)
2695 if ( v->arch.paging.mode )
2696 v->arch.paging.mode->shadow.detach_old_tables(v);
2697 #if CONFIG_PAGING_LEVELS == 4
2698 if ( !(v->arch.flags & TF_kernel_mode) )
2699 make_cr3(v, pagetable_get_pfn(v->arch.guest_table_user));
2700 else
2701 #endif
2702 make_cr3(v, pagetable_get_pfn(v->arch.guest_table));
2706 /* Pull down the memory allocation */
2707 if ( sh_set_allocation(d, 0, NULL) != 0 )
2709 // XXX - How can this occur?
2710 // Seems like a bug to return an error now that we've
2711 // disabled the relevant shadow mode.
2712 //
2713 return -ENOMEM;
2715 shadow_hash_teardown(d);
2716 SHADOW_PRINTK("un-shadowing of domain %u done."
2717 " Shadow pages total = %u, free = %u, p2m=%u\n",
2718 d->domain_id,
2719 d->arch.paging.shadow.total_pages,
2720 d->arch.paging.shadow.free_pages,
2721 d->arch.paging.shadow.p2m_pages);
2724 return 0;
2727 /* Enable/disable ops for the "test" and "log-dirty" modes */
2728 static int shadow_test_enable(struct domain *d)
2730 int ret;
2732 domain_pause(d);
2733 shadow_lock(d);
2734 ret = shadow_one_bit_enable(d, PG_SH_enable);
2735 shadow_unlock(d);
2736 domain_unpause(d);
2738 return ret;
2741 static int shadow_test_disable(struct domain *d)
2743 int ret;
2745 domain_pause(d);
2746 shadow_lock(d);
2747 ret = shadow_one_bit_disable(d, PG_SH_enable);
2748 shadow_unlock(d);
2749 domain_unpause(d);
2751 return ret;
2754 /**************************************************************************/
2755 /* P2M map manipulations */
2757 /* Shadow-specific code which should be called when a P2M table entry is
2758 * updated with new content. It is responsible for updating the entry, as
2759 * well as for other shadow processing jobs.
2760 */
2761 void
2762 shadow_write_p2m_entry(struct vcpu *v, unsigned long gfn,
2763 l1_pgentry_t *p, mfn_t table_mfn,
2764 l1_pgentry_t new, unsigned int level)
2766 struct domain *d = v->domain;
2768 shadow_lock(d);
2770 /* If we're removing an MFN from the p2m, remove it from the shadows too */
2771 if ( level == 1 )
2773 mfn_t mfn = _mfn(l1e_get_pfn(*p));
2774 p2m_type_t p2mt = p2m_flags_to_type(l1e_get_flags(*p));
2775 if ( p2m_is_valid(p2mt) && mfn_valid(mfn) )
2777 sh_remove_all_shadows_and_parents(v, mfn);
2778 if ( sh_remove_all_mappings(v, mfn) )
2779 flush_tlb_mask(d->domain_dirty_cpumask);
2783 /* Update the entry with new content */
2784 safe_write_pte(p, new);
2786 /* install P2M in monitors for PAE Xen */
2787 #if CONFIG_PAGING_LEVELS == 3
2788 if ( level == 3 )
2789 /* We have written to the p2m l3: need to sync the per-vcpu
2790 * copies of it in the monitor tables */
2791 p2m_install_entry_in_monitors(d, (l3_pgentry_t *)p);
2792 #endif
2794 #if (SHADOW_OPTIMIZATIONS & SHOPT_FAST_FAULT_PATH)
2795 /* If we're doing FAST_FAULT_PATH, then shadow mode may have
2796 cached the fact that this is an mmio region in the shadow
2797 page tables. Blow the tables away to remove the cache.
2798 This is pretty heavy handed, but this is a rare operation
2799 (it might happen a dozen times during boot and then never
2800 again), so it doesn't matter too much. */
2801 if ( d->arch.paging.shadow.has_fast_mmio_entries )
2803 shadow_blow_tables(d);
2804 d->arch.paging.shadow.has_fast_mmio_entries = 0;
2806 #endif
2808 shadow_unlock(d);
2811 /**************************************************************************/
2812 /* Log-dirty mode support */
2814 /* Shadow-specific code which is called in paging_log_dirty_enable().
2815 * Returns 0 if no problem is found.
2816 */
2817 int shadow_enable_log_dirty(struct domain *d)
2819 int ret;
2821 /* shadow lock is required here */
2822 shadow_lock(d);
2823 if ( shadow_mode_enabled(d) )
2825 /* This domain already has some shadows: need to clear them out
2826 * of the way to make sure that all references to guest memory are
2827 * properly write-protected */
2828 shadow_blow_tables(d);
2831 #if (SHADOW_OPTIMIZATIONS & SHOPT_LINUX_L3_TOPLEVEL)
2832 /* 32bit PV guests on 64bit xen behave like older 64bit linux: they
2833 * change an l4e instead of cr3 to switch tables. Give them the
2834 * same optimization */
2835 if ( is_pv_32on64_domain(d) )
2836 d->arch.paging.shadow.opt_flags = SHOPT_LINUX_L3_TOPLEVEL;
2837 #endif
2839 ret = shadow_one_bit_enable(d, PG_log_dirty);
2840 shadow_unlock(d);
2842 return ret;
2845 /* Shadow-specific code which is called in paging_log_dirty_disable() */
2846 int shadow_disable_log_dirty(struct domain *d)
2848 int ret;
2850 /* shadow lock is required here */
2851 shadow_lock(d);
2852 ret = shadow_one_bit_disable(d, PG_log_dirty);
2853 shadow_unlock(d);
2855 return ret;
2858 /* This function is called when we CLEAN the log-dirty bitmap. See
2859 * paging_log_dirty_op() for details.
2860 */
2861 void shadow_clean_dirty_bitmap(struct domain *d)
2863 shadow_lock(d);
2864 /* Need to revoke write access to the domain's pages again.
2865 * In future, we'll have a less heavy-handed approach to this,
2866 * but for now, we just unshadow everything except Xen. */
2867 shadow_blow_tables(d);
2868 shadow_unlock(d);
2870 /**************************************************************************/
2871 /* Shadow-control XEN_DOMCTL dispatcher */
2873 int shadow_domctl(struct domain *d,
2874 xen_domctl_shadow_op_t *sc,
2875 XEN_GUEST_HANDLE(void) u_domctl)
2877 int rc, preempted = 0;
2879 switch ( sc->op )
2881 case XEN_DOMCTL_SHADOW_OP_OFF:
2882 if ( d->arch.paging.mode == PG_SH_enable )
2883 if ( (rc = shadow_test_disable(d)) != 0 )
2884 return rc;
2885 return 0;
2887 case XEN_DOMCTL_SHADOW_OP_ENABLE_TEST:
2888 return shadow_test_enable(d);
2890 case XEN_DOMCTL_SHADOW_OP_ENABLE_TRANSLATE:
2891 return shadow_enable(d, PG_refcounts|PG_translate);
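/* For the general ENABLE op below, sc->mode carries XEN_DOMCTL_SHADOW_ENABLE_*
 * flags, which are expected to line up with the internal PG_* mode bits once
 * shifted left by PG_mode_shift. */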
2893 case XEN_DOMCTL_SHADOW_OP_ENABLE:
2894 return shadow_enable(d, sc->mode << PG_mode_shift);
2896 case XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION:
2897 sc->mb = shadow_get_allocation(d);
2898 return 0;
2900 case XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION:
2901 shadow_lock(d);
2902 if ( sc->mb == 0 && shadow_mode_enabled(d) )
2904 /* Can't set the allocation to zero unless the domain stops using
2905 * shadow pagetables first */
2906 SHADOW_ERROR("Can't set shadow allocation to zero, domain %u"
2907 " is still using shadows.\n", d->domain_id);
2908 shadow_unlock(d);
2909 return -EINVAL;
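/* sc->mb is in megabytes; shifting by (20 - PAGE_SHIFT) converts it to a
 * page count (256 pages per MB with 4k pages, matching the 256-page
 * minimum used in shadow_enable() above). */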
2911 rc = sh_set_allocation(d, sc->mb << (20 - PAGE_SHIFT), &preempted);
2912 shadow_unlock(d);
2913 if ( preempted )
2914 /* Not finished. Set up to re-run the call. */
2915 rc = hypercall_create_continuation(
2916 __HYPERVISOR_domctl, "h", u_domctl);
2917 else
2918 /* Finished. Return the new allocation */
2919 sc->mb = shadow_get_allocation(d);
2920 return rc;
2922 default:
2923 SHADOW_ERROR("Bad shadow op %u\n", sc->op);
2924 return -EINVAL;
2929 /**************************************************************************/
2930 /* Auditing shadow tables */
2932 #if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_FULL
2934 void shadow_audit_tables(struct vcpu *v)
2936 /* Dispatch table for getting per-type functions */
2937 static hash_callback_t callbacks[SH_type_unused] = {
2938 NULL, /* none */
2939 #if CONFIG_PAGING_LEVELS == 2
2940 SHADOW_INTERNAL_NAME(sh_audit_l1_table,2,2), /* l1_32 */
2941 SHADOW_INTERNAL_NAME(sh_audit_fl1_table,2,2), /* fl1_32 */
2942 SHADOW_INTERNAL_NAME(sh_audit_l2_table,2,2), /* l2_32 */
2943 #else
2944 SHADOW_INTERNAL_NAME(sh_audit_l1_table,3,2), /* l1_32 */
2945 SHADOW_INTERNAL_NAME(sh_audit_fl1_table,3,2), /* fl1_32 */
2946 SHADOW_INTERNAL_NAME(sh_audit_l2_table,3,2), /* l2_32 */
2947 SHADOW_INTERNAL_NAME(sh_audit_l1_table,3,3), /* l1_pae */
2948 SHADOW_INTERNAL_NAME(sh_audit_fl1_table,3,3), /* fl1_pae */
2949 SHADOW_INTERNAL_NAME(sh_audit_l2_table,3,3), /* l2_pae */
2950 SHADOW_INTERNAL_NAME(sh_audit_l2_table,3,3), /* l2h_pae */
2951 #if CONFIG_PAGING_LEVELS >= 4
2952 SHADOW_INTERNAL_NAME(sh_audit_l1_table,4,4), /* l1_64 */
2953 SHADOW_INTERNAL_NAME(sh_audit_fl1_table,4,4), /* fl1_64 */
2954 SHADOW_INTERNAL_NAME(sh_audit_l2_table,4,4), /* l2_64 */
2955 SHADOW_INTERNAL_NAME(sh_audit_l2_table,4,4), /* l2h_64 */
2956 SHADOW_INTERNAL_NAME(sh_audit_l3_table,4,4), /* l3_64 */
2957 SHADOW_INTERNAL_NAME(sh_audit_l4_table,4,4), /* l4_64 */
2958 #endif /* CONFIG_PAGING_LEVELS >= 4 */
2959 #endif /* CONFIG_PAGING_LEVELS > 2 */
2960 NULL /* All the rest */
2961 };
2962 unsigned int mask;
2964 if ( !(SHADOW_AUDIT_ENABLE) )
2965 return;
2967 if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_FULL )
2968 mask = ~1; /* Audit every table in the system */
2969 else
2971 /* Audit only the current mode's tables */
2972 switch ( v->arch.paging.mode->guest_levels )
2974 case 2: mask = (SHF_L1_32|SHF_FL1_32|SHF_L2_32); break;
2975 case 3: mask = (SHF_L1_PAE|SHF_FL1_PAE|SHF_L2_PAE
2976 |SHF_L2H_PAE); break;
2977 case 4: mask = (SHF_L1_64|SHF_FL1_64|SHF_L2_64
2978 |SHF_L3_64|SHF_L4_64); break;
2979 default: BUG();
2983 hash_foreach(v, ~1, callbacks, _mfn(INVALID_MFN));
2986 #endif /* Shadow audit */
2988 /*
2989 * Local variables:
2990 * mode: C
2991 * c-set-style: "BSD"
2992 * c-basic-offset: 4
2993 * indent-tabs-mode: nil
2994 * End:
2995 */