ia64/xen-unstable: xen/arch/x86/mm/shadow/common.c @ 17423:5ffd167d7772

x86: Suppress scary console message from sh_remove_shadows()
except in cases where the guest's behaviour is unrecoverable.

Signed-off-by: Tim Deegan <Tim.Deegan@citrix.com>
Author:   Keir Fraser <keir.fraser@citrix.com>
Date:     Wed Apr 09 16:04:10 2008 +0100
Parent:   57febe0264e1
Children: 6271ba3bb4b6
1 /******************************************************************************
2 * arch/x86/mm/shadow/common.c
3 *
4 * Shadow code that does not need to be multiply compiled.
5 * Parts of this code are Copyright (c) 2006 by XenSource Inc.
6 * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
7 * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 */
24 #include <xen/config.h>
25 #include <xen/types.h>
26 #include <xen/mm.h>
27 #include <xen/trace.h>
28 #include <xen/sched.h>
29 #include <xen/perfc.h>
30 #include <xen/irq.h>
31 #include <xen/domain_page.h>
32 #include <xen/guest_access.h>
33 #include <xen/keyhandler.h>
34 #include <asm/event.h>
35 #include <asm/page.h>
36 #include <asm/current.h>
37 #include <asm/flushtlb.h>
38 #include <asm/shadow.h>
39 #include <xen/numa.h>
40 #include "private.h"
43 /* Set up the shadow-specific parts of a domain struct at start of day.
44 * Called for every domain from arch_domain_create() */
45 void shadow_domain_init(struct domain *d)
46 {
47 int i;
48 shadow_lock_init(d);
49 for ( i = 0; i <= SHADOW_MAX_ORDER; i++ )
50 INIT_LIST_HEAD(&d->arch.paging.shadow.freelists[i]);
51 INIT_LIST_HEAD(&d->arch.paging.shadow.p2m_freelist);
52 INIT_LIST_HEAD(&d->arch.paging.shadow.pinned_shadows);
54 /* Use shadow pagetables for log-dirty support */
55 paging_log_dirty_init(d, shadow_enable_log_dirty,
56 shadow_disable_log_dirty, shadow_clean_dirty_bitmap);
57 }
59 /* Set up the shadow-specific parts of a vcpu struct. Note: The most important
60 * job is to initialize the update_paging_modes() function pointer, which is
61 * used to initialize the rest of the resources. Therefore, it really does not
62 * matter which mode v->arch.paging.mode points to initially, as long as it can
63 * be compiled.
64 */
65 void shadow_vcpu_init(struct vcpu *v)
66 {
67 #if CONFIG_PAGING_LEVELS == 4
68 v->arch.paging.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,3,3);
69 #elif CONFIG_PAGING_LEVELS == 3
70 v->arch.paging.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,3,3);
71 #elif CONFIG_PAGING_LEVELS == 2
72 v->arch.paging.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,2,2);
73 #endif
74 }
76 #if SHADOW_AUDIT
77 int shadow_audit_enable = 0;
79 static void shadow_audit_key(unsigned char key)
80 {
81 shadow_audit_enable = !shadow_audit_enable;
82 printk("%s shadow_audit_enable=%d\n",
83 __func__, shadow_audit_enable);
84 }
86 static int __init shadow_audit_key_init(void)
87 {
88 register_keyhandler(
89 'O', shadow_audit_key, "toggle shadow audits");
90 return 0;
91 }
92 __initcall(shadow_audit_key_init);
93 #endif /* SHADOW_AUDIT */
95 int _shadow_mode_refcounts(struct domain *d)
96 {
97 return shadow_mode_refcounts(d);
98 }
101 /**************************************************************************/
102 /* x86 emulator support for the shadow code
103 */
105 struct segment_register *hvm_get_seg_reg(
106 enum x86_segment seg, struct sh_emulate_ctxt *sh_ctxt)
107 {
108 struct segment_register *seg_reg = &sh_ctxt->seg_reg[seg];
109 if ( !__test_and_set_bit(seg, &sh_ctxt->valid_seg_regs) )
110 hvm_get_segment_register(current, seg, seg_reg);
111 return seg_reg;
112 }
114 static int hvm_translate_linear_addr(
115 enum x86_segment seg,
116 unsigned long offset,
117 unsigned int bytes,
118 enum hvm_access_type access_type,
119 struct sh_emulate_ctxt *sh_ctxt,
120 unsigned long *paddr)
121 {
122 struct segment_register *reg = hvm_get_seg_reg(seg, sh_ctxt);
123 int okay;
125 okay = hvm_virtual_to_linear_addr(
126 seg, reg, offset, bytes, access_type, sh_ctxt->ctxt.addr_size, paddr);
128 if ( !okay )
129 {
130 hvm_inject_exception(TRAP_gp_fault, 0, 0);
131 return X86EMUL_EXCEPTION;
132 }
134 return 0;
135 }
137 static int
138 hvm_read(enum x86_segment seg,
139 unsigned long offset,
140 unsigned long *val,
141 unsigned int bytes,
142 enum hvm_access_type access_type,
143 struct sh_emulate_ctxt *sh_ctxt)
144 {
145 unsigned long addr;
146 int rc;
148 rc = hvm_translate_linear_addr(
149 seg, offset, bytes, access_type, sh_ctxt, &addr);
150 if ( rc )
151 return rc;
153 *val = 0;
155 if ( access_type == hvm_access_insn_fetch )
156 rc = hvm_fetch_from_guest_virt(val, addr, bytes, 0);
157 else
158 rc = hvm_copy_from_guest_virt(val, addr, bytes, 0);
160 switch ( rc )
161 {
162 case HVMCOPY_okay:
163 return X86EMUL_OKAY;
164 case HVMCOPY_bad_gva_to_gfn:
165 return X86EMUL_EXCEPTION;
166 default:
167 break;
168 }
170 return X86EMUL_UNHANDLEABLE;
171 }
173 static int
174 hvm_emulate_read(enum x86_segment seg,
175 unsigned long offset,
176 unsigned long *val,
177 unsigned int bytes,
178 struct x86_emulate_ctxt *ctxt)
179 {
180 if ( !is_x86_user_segment(seg) )
181 return X86EMUL_UNHANDLEABLE;
182 return hvm_read(seg, offset, val, bytes, hvm_access_read,
183 container_of(ctxt, struct sh_emulate_ctxt, ctxt));
184 }
186 static int
187 hvm_emulate_insn_fetch(enum x86_segment seg,
188 unsigned long offset,
189 unsigned long *val,
190 unsigned int bytes,
191 struct x86_emulate_ctxt *ctxt)
192 {
193 struct sh_emulate_ctxt *sh_ctxt =
194 container_of(ctxt, struct sh_emulate_ctxt, ctxt);
195 unsigned int insn_off = offset - sh_ctxt->insn_buf_eip;
197 ASSERT(seg == x86_seg_cs);
199 /* Fall back if requested bytes are not in the prefetch cache. */
200 if ( unlikely((insn_off + bytes) > sh_ctxt->insn_buf_bytes) )
201 return hvm_read(seg, offset, val, bytes,
202 hvm_access_insn_fetch, sh_ctxt);
204 /* Hit the cache. Simple memcpy. */
205 *val = 0;
206 memcpy(val, &sh_ctxt->insn_buf[insn_off], bytes);
207 return X86EMUL_OKAY;
208 }
210 static int
211 hvm_emulate_write(enum x86_segment seg,
212 unsigned long offset,
213 unsigned long val,
214 unsigned int bytes,
215 struct x86_emulate_ctxt *ctxt)
216 {
217 struct sh_emulate_ctxt *sh_ctxt =
218 container_of(ctxt, struct sh_emulate_ctxt, ctxt);
219 struct vcpu *v = current;
220 unsigned long addr;
221 int rc;
223 if ( !is_x86_user_segment(seg) )
224 return X86EMUL_UNHANDLEABLE;
226 /* How many emulations could we save if we unshadowed on stack writes? */
227 if ( seg == x86_seg_ss )
228 perfc_incr(shadow_fault_emulate_stack);
230 rc = hvm_translate_linear_addr(
231 seg, offset, bytes, hvm_access_write, sh_ctxt, &addr);
232 if ( rc )
233 return rc;
235 return v->arch.paging.mode->shadow.x86_emulate_write(
236 v, addr, &val, bytes, sh_ctxt);
237 }
239 static int
240 hvm_emulate_cmpxchg(enum x86_segment seg,
241 unsigned long offset,
242 unsigned long old,
243 unsigned long new,
244 unsigned int bytes,
245 struct x86_emulate_ctxt *ctxt)
246 {
247 struct sh_emulate_ctxt *sh_ctxt =
248 container_of(ctxt, struct sh_emulate_ctxt, ctxt);
249 struct vcpu *v = current;
250 unsigned long addr;
251 int rc;
253 if ( !is_x86_user_segment(seg) )
254 return X86EMUL_UNHANDLEABLE;
256 rc = hvm_translate_linear_addr(
257 seg, offset, bytes, hvm_access_write, sh_ctxt, &addr);
258 if ( rc )
259 return rc;
261 return v->arch.paging.mode->shadow.x86_emulate_cmpxchg(
262 v, addr, old, new, bytes, sh_ctxt);
263 }
265 static int
266 hvm_emulate_cmpxchg8b(enum x86_segment seg,
267 unsigned long offset,
268 unsigned long old_lo,
269 unsigned long old_hi,
270 unsigned long new_lo,
271 unsigned long new_hi,
272 struct x86_emulate_ctxt *ctxt)
273 {
274 struct sh_emulate_ctxt *sh_ctxt =
275 container_of(ctxt, struct sh_emulate_ctxt, ctxt);
276 struct vcpu *v = current;
277 unsigned long addr;
278 int rc;
280 if ( !is_x86_user_segment(seg) )
281 return X86EMUL_UNHANDLEABLE;
283 rc = hvm_translate_linear_addr(
284 seg, offset, 8, hvm_access_write, sh_ctxt, &addr);
285 if ( rc )
286 return rc;
288 return v->arch.paging.mode->shadow.x86_emulate_cmpxchg8b(
289 v, addr, old_lo, old_hi, new_lo, new_hi, sh_ctxt);
290 }
292 static struct x86_emulate_ops hvm_shadow_emulator_ops = {
293 .read = hvm_emulate_read,
294 .insn_fetch = hvm_emulate_insn_fetch,
295 .write = hvm_emulate_write,
296 .cmpxchg = hvm_emulate_cmpxchg,
297 .cmpxchg8b = hvm_emulate_cmpxchg8b,
298 };
300 static int
301 pv_emulate_read(enum x86_segment seg,
302 unsigned long offset,
303 unsigned long *val,
304 unsigned int bytes,
305 struct x86_emulate_ctxt *ctxt)
306 {
307 unsigned int rc;
309 if ( !is_x86_user_segment(seg) )
310 return X86EMUL_UNHANDLEABLE;
312 *val = 0;
313 if ( (rc = copy_from_user((void *)val, (void *)offset, bytes)) != 0 )
314 {
315 propagate_page_fault(offset + bytes - rc, 0); /* read fault */
316 return X86EMUL_EXCEPTION;
317 }
319 return X86EMUL_OKAY;
320 }
322 static int
323 pv_emulate_write(enum x86_segment seg,
324 unsigned long offset,
325 unsigned long val,
326 unsigned int bytes,
327 struct x86_emulate_ctxt *ctxt)
328 {
329 struct sh_emulate_ctxt *sh_ctxt =
330 container_of(ctxt, struct sh_emulate_ctxt, ctxt);
331 struct vcpu *v = current;
332 if ( !is_x86_user_segment(seg) )
333 return X86EMUL_UNHANDLEABLE;
334 return v->arch.paging.mode->shadow.x86_emulate_write(
335 v, offset, &val, bytes, sh_ctxt);
336 }
338 static int
339 pv_emulate_cmpxchg(enum x86_segment seg,
340 unsigned long offset,
341 unsigned long old,
342 unsigned long new,
343 unsigned int bytes,
344 struct x86_emulate_ctxt *ctxt)
345 {
346 struct sh_emulate_ctxt *sh_ctxt =
347 container_of(ctxt, struct sh_emulate_ctxt, ctxt);
348 struct vcpu *v = current;
349 if ( !is_x86_user_segment(seg) )
350 return X86EMUL_UNHANDLEABLE;
351 return v->arch.paging.mode->shadow.x86_emulate_cmpxchg(
352 v, offset, old, new, bytes, sh_ctxt);
353 }
355 static int
356 pv_emulate_cmpxchg8b(enum x86_segment seg,
357 unsigned long offset,
358 unsigned long old_lo,
359 unsigned long old_hi,
360 unsigned long new_lo,
361 unsigned long new_hi,
362 struct x86_emulate_ctxt *ctxt)
363 {
364 struct sh_emulate_ctxt *sh_ctxt =
365 container_of(ctxt, struct sh_emulate_ctxt, ctxt);
366 struct vcpu *v = current;
367 if ( !is_x86_user_segment(seg) )
368 return X86EMUL_UNHANDLEABLE;
369 return v->arch.paging.mode->shadow.x86_emulate_cmpxchg8b(
370 v, offset, old_lo, old_hi, new_lo, new_hi, sh_ctxt);
371 }
373 static struct x86_emulate_ops pv_shadow_emulator_ops = {
374 .read = pv_emulate_read,
375 .insn_fetch = pv_emulate_read,
376 .write = pv_emulate_write,
377 .cmpxchg = pv_emulate_cmpxchg,
378 .cmpxchg8b = pv_emulate_cmpxchg8b,
379 };
381 struct x86_emulate_ops *shadow_init_emulation(
382 struct sh_emulate_ctxt *sh_ctxt, struct cpu_user_regs *regs)
383 {
384 struct segment_register *creg, *sreg;
385 struct vcpu *v = current;
386 unsigned long addr;
388 sh_ctxt->ctxt.regs = regs;
389 sh_ctxt->ctxt.force_writeback = 0;
391 if ( !is_hvm_vcpu(v) )
392 {
393 sh_ctxt->ctxt.addr_size = sh_ctxt->ctxt.sp_size = BITS_PER_LONG;
394 return &pv_shadow_emulator_ops;
395 }
397 /* Segment cache initialisation. Primed with CS. */
398 sh_ctxt->valid_seg_regs = 0;
399 creg = hvm_get_seg_reg(x86_seg_cs, sh_ctxt);
401 /* Work out the emulation mode. */
402 if ( hvm_long_mode_enabled(v) && creg->attr.fields.l )
403 {
404 sh_ctxt->ctxt.addr_size = sh_ctxt->ctxt.sp_size = 64;
405 }
406 else
407 {
408 sreg = hvm_get_seg_reg(x86_seg_ss, sh_ctxt);
409 sh_ctxt->ctxt.addr_size = creg->attr.fields.db ? 32 : 16;
410 sh_ctxt->ctxt.sp_size = sreg->attr.fields.db ? 32 : 16;
411 }
413 /* Attempt to prefetch whole instruction. */
414 sh_ctxt->insn_buf_eip = regs->eip;
415 sh_ctxt->insn_buf_bytes =
416 (!hvm_translate_linear_addr(
417 x86_seg_cs, regs->eip, sizeof(sh_ctxt->insn_buf),
418 hvm_access_insn_fetch, sh_ctxt, &addr) &&
419 !hvm_fetch_from_guest_virt_nofault(
420 sh_ctxt->insn_buf, addr, sizeof(sh_ctxt->insn_buf), 0))
421 ? sizeof(sh_ctxt->insn_buf) : 0;
423 return &hvm_shadow_emulator_ops;
424 }
426 /* Update an initialized emulation context to prepare for the next
427 * instruction */
428 void shadow_continue_emulation(struct sh_emulate_ctxt *sh_ctxt,
429 struct cpu_user_regs *regs)
430 {
431 struct vcpu *v = current;
432 unsigned long addr, diff;
434 /* We don't refetch the segment bases, because we don't emulate
435 * writes to segment registers */
437 if ( is_hvm_vcpu(v) )
438 {
439 diff = regs->eip - sh_ctxt->insn_buf_eip;
440 if ( diff > sh_ctxt->insn_buf_bytes )
441 {
442 /* Prefetch more bytes. */
443 sh_ctxt->insn_buf_bytes =
444 (!hvm_translate_linear_addr(
445 x86_seg_cs, regs->eip, sizeof(sh_ctxt->insn_buf),
446 hvm_access_insn_fetch, sh_ctxt, &addr) &&
447 !hvm_fetch_from_guest_virt_nofault(
448 sh_ctxt->insn_buf, addr, sizeof(sh_ctxt->insn_buf), 0))
449 ? sizeof(sh_ctxt->insn_buf) : 0;
450 sh_ctxt->insn_buf_eip = regs->eip;
451 }
452 }
453 }
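/* Illustrative sketch, not part of this file: roughly how a caller (the
 * shadow page-fault handler in multi.c) is expected to drive the hooks
 * above. The wrapper function and its error handling are assumptions for
 * illustration only. */
#if 0 /* example only */
static void example_emulate_current_insn(struct vcpu *v,
                                         struct cpu_user_regs *regs)
{
    struct sh_emulate_ctxt emul_ctxt;
    struct x86_emulate_ops *emul_ops;

    /* Pick PV or HVM ops and prime the segment/prefetch caches. */
    emul_ops = shadow_init_emulation(&emul_ctxt, regs);

    if ( x86_emulate(&emul_ctxt.ctxt, emul_ops) == X86EMUL_UNHANDLEABLE )
        return; /* caller may unshadow the page and retry the guest */

    /* Before emulating a further instruction, refresh the prefetch cache. */
    shadow_continue_emulation(&emul_ctxt, regs);
}
#endif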
455 /**************************************************************************/
456 /* Code for "promoting" a guest page to the point where the shadow code is
457 * willing to let it be treated as a guest page table. This generally
458 * involves making sure there are no writable mappings available to the guest
459 * for this page.
460 */
461 void shadow_promote(struct vcpu *v, mfn_t gmfn, unsigned int type)
462 {
463 struct page_info *page = mfn_to_page(gmfn);
465 ASSERT(mfn_valid(gmfn));
467 /* We should never try to promote a gmfn that has writeable mappings */
468 ASSERT((page->u.inuse.type_info & PGT_type_mask) != PGT_writable_page
469 || (page->u.inuse.type_info & PGT_count_mask) == 0
470 || v->domain->is_shutting_down);
472 /* Is the page already shadowed? */
473 if ( !test_and_set_bit(_PGC_page_table, &page->count_info) )
474 page->shadow_flags = 0;
476 ASSERT(!test_bit(type, &page->shadow_flags));
477 set_bit(type, &page->shadow_flags);
478 }
480 void shadow_demote(struct vcpu *v, mfn_t gmfn, u32 type)
481 {
482 struct page_info *page = mfn_to_page(gmfn);
484 ASSERT(test_bit(_PGC_page_table, &page->count_info));
485 ASSERT(test_bit(type, &page->shadow_flags));
487 clear_bit(type, &page->shadow_flags);
489 if ( (page->shadow_flags & SHF_page_type_mask) == 0 )
490 {
491 /* tlbflush timestamp field is valid again */
492 page->tlbflush_timestamp = tlbflush_current_time();
493 clear_bit(_PGC_page_table, &page->count_info);
494 }
495 }
497 /**************************************************************************/
498 /* Validate a pagetable change from the guest and update the shadows.
499 * Returns a bitmask of SHADOW_SET_* flags. */
501 int
502 sh_validate_guest_entry(struct vcpu *v, mfn_t gmfn, void *entry, u32 size)
503 {
504 int result = 0;
505 struct page_info *page = mfn_to_page(gmfn);
507 paging_mark_dirty(v->domain, mfn_x(gmfn));
509 // Determine which types of shadows are affected, and update each.
510 //
511 // Always validate L1s before L2s to prevent another cpu with a linear
512 // mapping of this gmfn from seeing a walk that results from
513 // using the new L2 value and the old L1 value. (It is OK for such a
514 // guest to see a walk that uses the old L2 value with the new L1 value,
515 // as hardware could behave this way if one level of the pagewalk occurs
516 // before the store, and the next level of the pagewalk occurs after the
517 // store.
518 //
519 // Ditto for L2s before L3s, etc.
520 //
522 if ( !(page->count_info & PGC_page_table) )
523 return 0; /* Not shadowed at all */
525 #if CONFIG_PAGING_LEVELS == 2
526 if ( page->shadow_flags & SHF_L1_32 )
527 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 2, 2)
528 (v, gmfn, entry, size);
529 #else
530 if ( page->shadow_flags & SHF_L1_32 )
531 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 3, 2)
532 (v, gmfn, entry, size);
533 #endif
535 #if CONFIG_PAGING_LEVELS == 2
536 if ( page->shadow_flags & SHF_L2_32 )
537 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 2, 2)
538 (v, gmfn, entry, size);
539 #else
540 if ( page->shadow_flags & SHF_L2_32 )
541 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 3, 2)
542 (v, gmfn, entry, size);
543 #endif
545 #if CONFIG_PAGING_LEVELS >= 3
546 if ( page->shadow_flags & SHF_L1_PAE )
547 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 3, 3)
548 (v, gmfn, entry, size);
549 if ( page->shadow_flags & SHF_L2_PAE )
550 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 3, 3)
551 (v, gmfn, entry, size);
552 if ( page->shadow_flags & SHF_L2H_PAE )
553 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2he, 3, 3)
554 (v, gmfn, entry, size);
555 #else /* 32-bit non-PAE hypervisor does not support PAE guests */
556 ASSERT((page->shadow_flags & (SHF_L2H_PAE|SHF_L2_PAE|SHF_L1_PAE)) == 0);
557 #endif
559 #if CONFIG_PAGING_LEVELS >= 4
560 if ( page->shadow_flags & SHF_L1_64 )
561 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 4, 4)
562 (v, gmfn, entry, size);
563 if ( page->shadow_flags & SHF_L2_64 )
564 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 4, 4)
565 (v, gmfn, entry, size);
566 if ( page->shadow_flags & SHF_L2H_64 )
567 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2he, 4, 4)
568 (v, gmfn, entry, size);
569 if ( page->shadow_flags & SHF_L3_64 )
570 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl3e, 4, 4)
571 (v, gmfn, entry, size);
572 if ( page->shadow_flags & SHF_L4_64 )
573 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl4e, 4, 4)
574 (v, gmfn, entry, size);
575 #else /* 32-bit/PAE hypervisor does not support 64-bit guests */
576 ASSERT((page->shadow_flags
577 & (SHF_L4_64|SHF_L3_64|SHF_L2H_64|SHF_L2_64|SHF_L1_64)) == 0);
578 #endif
580 return result;
581 }
584 void
585 sh_validate_guest_pt_write(struct vcpu *v, mfn_t gmfn,
586 void *entry, u32 size)
587 /* This is the entry point for emulated writes to pagetables in HVM guests and
588 * PV translated guests.
589 */
590 {
591 struct domain *d = v->domain;
592 int rc;
594 ASSERT(shadow_locked_by_me(v->domain));
595 rc = sh_validate_guest_entry(v, gmfn, entry, size);
596 if ( rc & SHADOW_SET_FLUSH )
597 /* Need to flush TLBs to pick up shadow PT changes */
598 flush_tlb_mask(d->domain_dirty_cpumask);
599 if ( rc & SHADOW_SET_ERROR )
600 {
601 /* This page is probably not a pagetable any more: tear it out of the
602 * shadows, along with any tables that reference it.
603 * Since the validate call above will have made a "safe" (i.e. zero)
604 * shadow entry, we can let the domain live even if we can't fully
605 * unshadow the page. */
606 sh_remove_shadows(v, gmfn, 0, 0);
607 }
608 }
610 int shadow_write_guest_entry(struct vcpu *v, intpte_t *p,
611 intpte_t new, mfn_t gmfn)
612 /* Write a new value into the guest pagetable, and update the shadows
613 * appropriately. Returns 0 if we page-faulted, 1 for success. */
614 {
615 int failed;
616 shadow_lock(v->domain);
617 failed = __copy_to_user(p, &new, sizeof(new));
618 if ( failed != sizeof(new) )
619 sh_validate_guest_entry(v, gmfn, p, sizeof(new));
620 shadow_unlock(v->domain);
621 return (failed == 0);
622 }
624 int shadow_cmpxchg_guest_entry(struct vcpu *v, intpte_t *p,
625 intpte_t *old, intpte_t new, mfn_t gmfn)
626 /* Cmpxchg a new value into the guest pagetable, and update the shadows
627 * appropriately. Returns 0 if we page-faulted, 1 if not.
628 * N.B. caller should check the value of "old" to see if the
629 * cmpxchg itself was successful. */
630 {
631 int failed;
632 intpte_t t = *old;
633 shadow_lock(v->domain);
634 failed = cmpxchg_user(p, t, new);
635 if ( t == *old )
636 sh_validate_guest_entry(v, gmfn, p, sizeof(new));
637 *old = t;
638 shadow_unlock(v->domain);
639 return (failed == 0);
640 }
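/* Illustrative sketch, not part of this file: how a caller is expected to
 * use the two hooks above when updating a guest PTE. The wrapper below is
 * hypothetical; real callers reach these through the paging interface. */
#if 0 /* example only */
static int example_update_guest_pte(struct vcpu *v, intpte_t *p,
                                    intpte_t old, intpte_t new, mfn_t gmfn)
{
    intpte_t t = old;

    /* Returns 0 if the access to the guest PTE page-faulted. */
    if ( !shadow_cmpxchg_guest_entry(v, p, &t, new, gmfn) )
        return 0;

    /* The cmpxchg only took effect if 't' still holds the expected value. */
    return (t == old);
}
#endif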
643 /**************************************************************************/
644 /* Memory management for shadow pages. */
646 /* Allocating shadow pages
647 * -----------------------
648 *
649 * Most shadow pages are allocated singly, but there is one case where
650 * we need to allocate multiple pages together: shadowing 32-bit guest
651 * tables on PAE or 64-bit shadows. A 32-bit guest l1 table covers 4MB
652 * of virtual address space, and needs to be shadowed by two PAE/64-bit
653 * l1 tables (covering 2MB of virtual address space each). Similarly, a
654 * 32-bit guest l2 table (4GB va) needs to be shadowed by four
655 * PAE/64-bit l2 tables (1GB va each). These multi-page shadows are
656 * contiguous and aligned; functions for handling offsets into them are
657 * defined in shadow.c (shadow_l1_index() etc.)
658 *
659 * This table shows the allocation behaviour of the different modes:
660 *
661 * Xen paging        32b   pae   pae   64b   64b   64b
662 * Guest paging      32b   32b   pae   32b   pae   64b
663 * PV or HVM          *    HVM    *    HVM   HVM    *
664 * Shadow paging     32b   pae   pae   pae   pae   64b
665 *
666 * sl1 size           4k    8k    4k    8k    4k    4k
667 * sl2 size           4k   16k    4k   16k    4k    4k
668 * sl3 size            -     -     -     -     -    4k
669 * sl4 size            -     -     -     -     -    4k
670 *
671 * We allocate memory from xen in four-page units and break them down
672 * with a simple buddy allocator. Can't use the xen allocator to handle
673 * this as it only works for contiguous zones, and a domain's shadow
674 * pool is made of fragments.
675 *
676 * In HVM guests, the p2m table is built out of shadow pages, and we provide
677 * a function for the p2m management to steal pages, in max-order chunks, from
678 * the free pool. We don't provide for giving them back, yet.
679 */
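/* Worked example of the table above (illustration, not part of this file):
 * under a 64-bit hypervisor, a 32-bit guest l2 is shadowed by four
 * contiguous PAE/64-bit l2 pages, i.e. shadow_order(SH_type_l2_32_shadow)
 * is 2, so one allocation hands back 1 << 2 = 4 pages -- the 16k "sl2 size"
 * entries in the table. */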
681 /* Figure out the least acceptable quantity of shadow memory.
682 * The minimum memory requirement for always being able to free up a
683 * chunk of memory is very small -- only three max-order chunks per
684 * vcpu to hold the top level shadows and pages with Xen mappings in them.
685 *
686 * But for a guest to be guaranteed to successfully execute a single
687 * instruction, we must be able to map a large number of VAs (about thirty)
688 * at the same time, which means that to guarantee progress, we must
689 * allow for more than ninety allocated pages per vcpu. We round that
690 * up to 128 pages, or half a megabyte per vcpu. */
691 static unsigned int shadow_min_acceptable_pages(struct domain *d)
692 {
693 u32 vcpu_count = 0;
694 struct vcpu *v;
696 for_each_vcpu(d, v)
697 vcpu_count++;
699 return (vcpu_count * 128);
700 }
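/* Worked example (illustration, not part of this file): a domain with 4
 * vcpus gets a floor of 4 * 128 = 512 shadow pages, i.e. 512 * 4k = 2MB,
 * matching the "half a megabyte per vcpu" figure in the comment above. */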
702 /* Figure out the order of allocation needed for a given shadow type */
703 static inline u32
704 shadow_order(unsigned int shadow_type)
705 {
706 #if CONFIG_PAGING_LEVELS > 2
707 static const u32 type_to_order[SH_type_unused] = {
708 0, /* SH_type_none */
709 1, /* SH_type_l1_32_shadow */
710 1, /* SH_type_fl1_32_shadow */
711 2, /* SH_type_l2_32_shadow */
712 0, /* SH_type_l1_pae_shadow */
713 0, /* SH_type_fl1_pae_shadow */
714 0, /* SH_type_l2_pae_shadow */
715 0, /* SH_type_l2h_pae_shadow */
716 0, /* SH_type_l1_64_shadow */
717 0, /* SH_type_fl1_64_shadow */
718 0, /* SH_type_l2_64_shadow */
719 0, /* SH_type_l2h_64_shadow */
720 0, /* SH_type_l3_64_shadow */
721 0, /* SH_type_l4_64_shadow */
722 2, /* SH_type_p2m_table */
723 0 /* SH_type_monitor_table */
724 };
725 ASSERT(shadow_type < SH_type_unused);
726 return type_to_order[shadow_type];
727 #else /* 32-bit Xen only ever shadows 32-bit guests on 32-bit shadows. */
728 return 0;
729 #endif
730 }
732 static inline unsigned int
733 shadow_max_order(struct domain *d)
734 {
735 return is_hvm_domain(d) ? SHADOW_MAX_ORDER : 0;
736 }
738 /* Do we have a total of count pages of the requested order free? */
739 static inline int space_is_available(
740 struct domain *d,
741 unsigned int order,
742 unsigned int count)
743 {
744 for ( ; order <= shadow_max_order(d); ++order )
745 {
746 unsigned int n = count;
747 const struct list_head *p;
749 list_for_each ( p, &d->arch.paging.shadow.freelists[order] )
750 if ( --n == 0 )
751 return 1;
752 count = (count + 1) >> 1;
753 }
755 return 0;
756 }
758 /* Dispatcher function: call the per-mode function that will unhook the
759 * non-Xen mappings in this top-level shadow mfn */
760 static void shadow_unhook_mappings(struct vcpu *v, mfn_t smfn)
761 {
762 struct shadow_page_info *sp = mfn_to_shadow_page(smfn);
763 switch ( sp->type )
764 {
765 case SH_type_l2_32_shadow:
766 #if CONFIG_PAGING_LEVELS == 2
767 SHADOW_INTERNAL_NAME(sh_unhook_32b_mappings,2,2)(v,smfn);
768 #else
769 SHADOW_INTERNAL_NAME(sh_unhook_32b_mappings,3,2)(v,smfn);
770 #endif
771 break;
772 #if CONFIG_PAGING_LEVELS >= 3
773 case SH_type_l2_pae_shadow:
774 case SH_type_l2h_pae_shadow:
775 SHADOW_INTERNAL_NAME(sh_unhook_pae_mappings,3,3)(v,smfn);
776 break;
777 #endif
778 #if CONFIG_PAGING_LEVELS >= 4
779 case SH_type_l4_64_shadow:
780 SHADOW_INTERNAL_NAME(sh_unhook_64b_mappings,4,4)(v,smfn);
781 break;
782 #endif
783 default:
784 SHADOW_ERROR("top-level shadow has bad type %08x\n", sp->type);
785 BUG();
786 }
787 }
790 /* Make sure there are at least count order-sized pages
791 * available in the shadow page pool. */
792 static void _shadow_prealloc(
793 struct domain *d,
794 unsigned int order,
795 unsigned int count)
796 {
797 /* Need a vcpu for calling unpins; for now, since we don't have
798 * per-vcpu shadows, any will do */
799 struct vcpu *v, *v2;
800 struct list_head *l, *t;
801 struct shadow_page_info *sp;
802 cpumask_t flushmask = CPU_MASK_NONE;
803 mfn_t smfn;
804 int i;
806 ASSERT(order <= shadow_max_order(d));
807 if ( space_is_available(d, order, count) ) return;
809 v = current;
810 if ( v->domain != d )
811 v = d->vcpu[0];
812 ASSERT(v != NULL); /* Shouldn't have enabled shadows if we've no vcpus */
814 /* Stage one: walk the list of pinned pages, unpinning them */
815 perfc_incr(shadow_prealloc_1);
816 list_for_each_backwards_safe(l, t, &d->arch.paging.shadow.pinned_shadows)
817 {
818 sp = list_entry(l, struct shadow_page_info, list);
819 smfn = shadow_page_to_mfn(sp);
821 /* Unpin this top-level shadow */
822 sh_unpin(v, smfn);
824 /* See if that freed up enough space */
825 if ( space_is_available(d, order, count) ) return;
826 }
828 /* Stage two: all shadow pages are in use in hierarchies that are
829 * loaded in cr3 on some vcpu. Walk them, unhooking the non-Xen
830 * mappings. */
831 perfc_incr(shadow_prealloc_2);
833 for_each_vcpu(d, v2)
834 for ( i = 0 ; i < 4 ; i++ )
835 {
836 if ( !pagetable_is_null(v2->arch.shadow_table[i]) )
837 {
838 shadow_unhook_mappings(v,
839 pagetable_get_mfn(v2->arch.shadow_table[i]));
840 cpus_or(flushmask, v2->vcpu_dirty_cpumask, flushmask);
842 /* See if that freed up enough space */
843 if ( space_is_available(d, order, count) )
844 {
845 flush_tlb_mask(flushmask);
846 return;
847 }
848 }
849 }
851 /* Nothing more we can do: all remaining shadows are of pages that
852 * hold Xen mappings for some vcpu. This should never happen. */
853 SHADOW_ERROR("Can't pre-allocate %u order-%u shadow pages!\n"
854 " shadow pages total = %u, free = %u, p2m=%u\n",
855 count, order,
856 d->arch.paging.shadow.total_pages,
857 d->arch.paging.shadow.free_pages,
858 d->arch.paging.shadow.p2m_pages);
859 BUG();
860 }
862 /* Make sure there are at least count pages of the order according to
863 * type available in the shadow page pool.
864 * This must be called before any calls to shadow_alloc(). Since this
865 * will free existing shadows to make room, it must be called early enough
866 * to avoid freeing shadows that the caller is currently working on. */
867 void shadow_prealloc(struct domain *d, u32 type, unsigned int count)
868 {
869 ASSERT(type != SH_type_p2m_table);
870 return _shadow_prealloc(d, shadow_order(type), count);
871 }
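/* Illustrative sketch, not part of this file: shadow_prealloc() is meant to
 * be paired with shadow_alloc() under the shadow lock, along these lines
 * (the wrapper is hypothetical; the real call sites are in multi.c). */
#if 0 /* example only */
static mfn_t example_alloc_shadow(struct vcpu *v, mfn_t gmfn, u32 shadow_type)
{
    struct domain *d = v->domain;

    ASSERT(shadow_locked_by_me(d));
    /* May unpin or unhook existing shadows to make room... */
    shadow_prealloc(d, shadow_type, 1);
    /* ...so that this call can never fail. */
    return shadow_alloc(d, shadow_type, mfn_x(gmfn));
}
#endif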
873 /* Deliberately free all the memory we can: this will tear down all of
874 * this domain's shadows */
875 static void shadow_blow_tables(struct domain *d)
876 {
877 struct list_head *l, *t;
878 struct shadow_page_info *sp;
879 struct vcpu *v = d->vcpu[0];
880 mfn_t smfn;
881 int i;
883 ASSERT(v != NULL);
885 /* Pass one: unpin all pinned pages */
886 list_for_each_backwards_safe(l,t, &d->arch.paging.shadow.pinned_shadows)
887 {
888 sp = list_entry(l, struct shadow_page_info, list);
889 smfn = shadow_page_to_mfn(sp);
890 sh_unpin(v, smfn);
891 }
893 /* Second pass: unhook entries of in-use shadows */
894 for_each_vcpu(d, v)
895 for ( i = 0 ; i < 4 ; i++ )
896 if ( !pagetable_is_null(v->arch.shadow_table[i]) )
897 shadow_unhook_mappings(v,
898 pagetable_get_mfn(v->arch.shadow_table[i]));
900 /* Make sure everyone sees the unshadowings */
901 flush_tlb_mask(d->domain_dirty_cpumask);
902 }
904 void shadow_blow_tables_per_domain(struct domain *d)
905 {
906 if ( shadow_mode_enabled(d) && d->vcpu[0] != NULL ) {
907 shadow_lock(d);
908 shadow_blow_tables(d);
909 shadow_unlock(d);
910 }
911 }
913 #ifndef NDEBUG
914 /* Blow all shadows of all shadowed domains: this can be used to cause the
915 * guest's pagetables to be re-shadowed if we suspect that the shadows
916 * have somehow got out of sync */
917 static void shadow_blow_all_tables(unsigned char c)
918 {
919 struct domain *d;
920 printk("'%c' pressed -> blowing all shadow tables\n", c);
921 rcu_read_lock(&domlist_read_lock);
922 for_each_domain(d)
923 {
924 if ( shadow_mode_enabled(d) && d->vcpu[0] != NULL )
925 {
926 shadow_lock(d);
927 shadow_blow_tables(d);
928 shadow_unlock(d);
929 }
930 }
931 rcu_read_unlock(&domlist_read_lock);
932 }
934 /* Register this function in the Xen console keypress table */
935 static __init int shadow_blow_tables_keyhandler_init(void)
936 {
937 register_keyhandler('S', shadow_blow_all_tables,"reset shadow pagetables");
938 return 0;
939 }
940 __initcall(shadow_blow_tables_keyhandler_init);
941 #endif /* !NDEBUG */
943 /* Allocate another shadow's worth of (contiguous, aligned) pages,
944 * and fill in the type and backpointer fields of their page_infos.
945 * Never fails to allocate. */
946 mfn_t shadow_alloc(struct domain *d,
947 u32 shadow_type,
948 unsigned long backpointer)
949 {
950 struct shadow_page_info *sp = NULL;
951 unsigned int order = shadow_order(shadow_type);
952 cpumask_t mask;
953 void *p;
954 int i;
956 ASSERT(shadow_locked_by_me(d));
957 if (shadow_type == SH_type_p2m_table && order > shadow_max_order(d))
958 order = shadow_max_order(d);
959 ASSERT(order <= shadow_max_order(d));
960 ASSERT(shadow_type != SH_type_none);
961 perfc_incr(shadow_alloc);
963 /* Find smallest order which can satisfy the request. */
964 for ( i = order; i <= SHADOW_MAX_ORDER; i++ )
965 if ( !list_empty(&d->arch.paging.shadow.freelists[i]) )
966 goto found;
968 /* If we get here, we failed to allocate. This should never happen.
969 * It means that we didn't call shadow_prealloc() correctly before
970 * we allocated. We can't recover by calling prealloc here, because
971 * we might free up higher-level pages that the caller is working on. */
972 SHADOW_ERROR("Can't allocate %i shadow pages!\n", 1 << order);
973 BUG();
975 found:
976 sp = list_entry(d->arch.paging.shadow.freelists[i].next,
977 struct shadow_page_info, list);
978 list_del(&sp->list);
980 /* We may have to halve the chunk a number of times. */
981 while ( i != order )
982 {
983 i--;
984 sp->order = i;
985 list_add_tail(&sp->list, &d->arch.paging.shadow.freelists[i]);
986 sp += 1 << i;
987 }
988 d->arch.paging.shadow.free_pages -= 1 << order;
990 /* Init page info fields and clear the pages */
991 for ( i = 0; i < 1<<order ; i++ )
992 {
993 /* Before we overwrite the old contents of this page,
994 * we need to be sure that no TLB holds a pointer to it. */
995 mask = d->domain_dirty_cpumask;
996 tlbflush_filter(mask, sp[i].tlbflush_timestamp);
997 if ( unlikely(!cpus_empty(mask)) )
998 {
999 perfc_incr(shadow_alloc_tlbflush);
1000 flush_tlb_mask(mask);
1002 /* Now safe to clear the page for reuse */
1003 p = sh_map_domain_page(shadow_page_to_mfn(sp+i));
1004 ASSERT(p != NULL);
1005 clear_page(p);
1006 sh_unmap_domain_page(p);
1007 INIT_LIST_HEAD(&sp[i].list);
1008 sp[i].type = shadow_type;
1009 sp[i].pinned = 0;
1010 sp[i].count = 0;
1011 sp[i].backpointer = backpointer;
1012 sp[i].next_shadow = NULL;
1013 perfc_incr(shadow_alloc_count);
1015 return shadow_page_to_mfn(sp);
1019 /* Return some shadow pages to the pool. */
1020 void shadow_free(struct domain *d, mfn_t smfn)
1022 struct shadow_page_info *sp = mfn_to_shadow_page(smfn);
1023 u32 shadow_type;
1024 unsigned long order;
1025 unsigned long mask;
1026 int i;
1028 ASSERT(shadow_locked_by_me(d));
1029 perfc_incr(shadow_free);
1031 shadow_type = sp->type;
1032 ASSERT(shadow_type != SH_type_none);
1033 ASSERT(shadow_type != SH_type_p2m_table);
1034 order = shadow_order(shadow_type);
1036 d->arch.paging.shadow.free_pages += 1 << order;
1038 for ( i = 0; i < 1<<order; i++ )
1040 #if SHADOW_OPTIMIZATIONS & (SHOPT_WRITABLE_HEURISTIC | SHOPT_FAST_EMULATION)
1041 struct vcpu *v;
1042 for_each_vcpu(d, v)
1044 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
1045 /* No longer safe to look for a writeable mapping in this shadow */
1046 if ( v->arch.paging.shadow.last_writeable_pte_smfn == mfn_x(smfn) + i )
1047 v->arch.paging.shadow.last_writeable_pte_smfn = 0;
1048 #endif
1049 #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
1050 v->arch.paging.last_write_emul_ok = 0;
1051 #endif
1053 #endif
1054 /* Strip out the type: this is now a free shadow page */
1055 sp[i].type = 0;
1056 /* Remember the TLB timestamp so we will know whether to flush
1057 * TLBs when we reuse the page. Because the destructors leave the
1058 * contents of the pages in place, we can delay TLB flushes until
1059 * just before the allocator hands the page out again. */
1060 sp[i].tlbflush_timestamp = tlbflush_current_time();
1061 perfc_decr(shadow_alloc_count);
1064 /* Merge chunks as far as possible. */
1065 for ( ; order < shadow_max_order(d); ++order )
1067 mask = 1 << order;
1068 if ( (mfn_x(shadow_page_to_mfn(sp)) & mask) ) {
1069 /* Merge with predecessor block? */
1070 if ( ((sp-mask)->type != PGT_none) || ((sp-mask)->order != order) )
1071 break;
1072 list_del(&(sp-mask)->list);
1073 sp -= mask;
1074 } else {
1075 /* Merge with successor block? */
1076 if ( ((sp+mask)->type != PGT_none) || ((sp+mask)->order != order) )
1077 break;
1078 list_del(&(sp+mask)->list);
1082 sp->order = order;
1083 list_add_tail(&sp->list, &d->arch.paging.shadow.freelists[order]);
1086 /* Divert some memory from the pool to be used by the p2m mapping.
1087 * This action is irreversible: the p2m mapping only ever grows.
1088 * That's OK because the p2m table only exists for translated domains,
1089 * and those domains can't ever turn off shadow mode.
1090 * Also, we only ever allocate a max-order chunk, so as to preserve
1091 * the invariant that shadow_prealloc() always works.
1092 * Returns 0 iff it can't get a chunk (the caller should then
1093 * free up some pages in domheap and call sh_set_allocation);
1094 * returns non-zero on success.
1095 */
1096 static int
1097 sh_alloc_p2m_pages(struct domain *d)
1099 struct page_info *pg;
1100 u32 i;
1101 unsigned int order = shadow_max_order(d);
1103 ASSERT(shadow_locked_by_me(d));
1105 if ( d->arch.paging.shadow.total_pages
1106 < (shadow_min_acceptable_pages(d) + (1 << order)) )
1107 return 0; /* Not enough shadow memory: need to increase it first */
1109 pg = mfn_to_page(shadow_alloc(d, SH_type_p2m_table, 0));
1110 d->arch.paging.shadow.p2m_pages += (1 << order);
1111 d->arch.paging.shadow.total_pages -= (1 << order);
1112 for (i = 0; i < (1U << order); i++)
1114 /* Unlike shadow pages, mark p2m pages as owned by the domain.
1115 * Marking the domain as the owner would normally allow the guest to
1116 * create mappings of these pages, but these p2m pages will never be
1117 * in the domain's guest-physical address space, and so that is not
1118 * believed to be a concern.
1119 */
1120 page_set_owner(&pg[i], d);
1121 pg[i].count_info = 1;
1122 list_add_tail(&pg[i].list, &d->arch.paging.shadow.p2m_freelist);
1124 return 1;
1127 // Returns NULL if no memory is available...
1128 static struct page_info *
1129 shadow_alloc_p2m_page(struct domain *d)
1131 struct list_head *entry;
1132 struct page_info *pg;
1133 mfn_t mfn;
1134 void *p;
1136 shadow_lock(d);
1138 if ( list_empty(&d->arch.paging.shadow.p2m_freelist) &&
1139 !sh_alloc_p2m_pages(d) )
1141 shadow_unlock(d);
1142 return NULL;
1144 entry = d->arch.paging.shadow.p2m_freelist.next;
1145 list_del(entry);
1147 shadow_unlock(d);
1149 pg = list_entry(entry, struct page_info, list);
1150 mfn = page_to_mfn(pg);
1151 p = sh_map_domain_page(mfn);
1152 clear_page(p);
1153 sh_unmap_domain_page(p);
1155 return pg;
1158 static void
1159 shadow_free_p2m_page(struct domain *d, struct page_info *pg)
1161 ASSERT(page_get_owner(pg) == d);
1162 /* Should have just the one ref we gave it in alloc_p2m_page() */
1163 if ( (pg->count_info & PGC_count_mask) != 1 )
1165 SHADOW_ERROR("Odd p2m page count c=%#x t=%"PRtype_info"\n",
1166 pg->count_info, pg->u.inuse.type_info);
1168 pg->count_info = 0;
1169 /* Free should not decrement domain's total allocation, since
1170 * these pages were allocated without an owner. */
1171 page_set_owner(pg, NULL);
1172 free_domheap_pages(pg, 0);
1173 d->arch.paging.shadow.p2m_pages--;
1174 perfc_decr(shadow_alloc_count);
1177 #if CONFIG_PAGING_LEVELS == 3
1178 static void p2m_install_entry_in_monitors(struct domain *d,
1179 l3_pgentry_t *l3e)
1180 /* Special case, only used for external-mode domains on PAE hosts:
1181 * update the mapping of the p2m table. Once again, this is trivial in
1182 * other paging modes (one top-level entry points to the top-level p2m,
1184 * no maintenance needed), but PAE makes life difficult by needing to
1185 * copy the eight l3es of the p2m table into eight l2h slots in the
1185 * monitor table. This function makes fresh copies when a p2m l3e
1186 * changes. */
1188 l2_pgentry_t *ml2e;
1189 struct vcpu *v;
1190 unsigned int index;
1192 index = ((unsigned long)l3e & ~PAGE_MASK) / sizeof(l3_pgentry_t);
1193 ASSERT(index < MACHPHYS_MBYTES>>1);
1195 for_each_vcpu(d, v)
1197 if ( pagetable_get_pfn(v->arch.monitor_table) == 0 )
1198 continue;
1199 ASSERT(shadow_mode_external(v->domain));
1201 SHADOW_DEBUG(P2M, "d=%u v=%u index=%u mfn=%#lx\n",
1202 d->domain_id, v->vcpu_id, index, l3e_get_pfn(*l3e));
1204 if ( v == current ) /* OK to use linear map of monitor_table */
1205 ml2e = __linear_l2_table + l2_linear_offset(RO_MPT_VIRT_START);
1206 else
1208 l3_pgentry_t *ml3e;
1209 ml3e = sh_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
1210 ASSERT(l3e_get_flags(ml3e[3]) & _PAGE_PRESENT);
1211 ml2e = sh_map_domain_page(_mfn(l3e_get_pfn(ml3e[3])));
1212 ml2e += l2_table_offset(RO_MPT_VIRT_START);
1213 sh_unmap_domain_page(ml3e);
1215 ml2e[index] = l2e_from_pfn(l3e_get_pfn(*l3e), __PAGE_HYPERVISOR);
1216 if ( v != current )
1217 sh_unmap_domain_page(ml2e);
1220 #endif
1222 /* Set the pool of shadow pages to the required number of pages.
1223 * Input will be rounded up to at least shadow_min_acceptable_pages(),
1224 * plus space for the p2m table.
1225 * Returns 0 for success, non-zero for failure. */
1226 static unsigned int sh_set_allocation(struct domain *d,
1227 unsigned int pages,
1228 int *preempted)
1230 struct shadow_page_info *sp;
1231 unsigned int lower_bound;
1232 unsigned int j, order = shadow_max_order(d);
1234 ASSERT(shadow_locked_by_me(d));
1236 /* Don't allocate less than the minimum acceptable, plus one page per
1237 * megabyte of RAM (for the p2m table) */
1238 lower_bound = shadow_min_acceptable_pages(d) + (d->tot_pages / 256);
1239 if ( pages > 0 && pages < lower_bound )
1240 pages = lower_bound;
1241 /* Round up to largest block size */
1242 pages = (pages + ((1<<SHADOW_MAX_ORDER)-1)) & ~((1<<SHADOW_MAX_ORDER)-1);
1244 SHADOW_PRINTK("current %i target %i\n",
1245 d->arch.paging.shadow.total_pages, pages);
1247 while ( d->arch.paging.shadow.total_pages != pages )
1249 if ( d->arch.paging.shadow.total_pages < pages )
1251 /* Need to allocate more memory from domheap */
1252 sp = (struct shadow_page_info *)
1253 alloc_domheap_pages(NULL, order, MEMF_node(domain_to_node(d)));
1254 if ( sp == NULL )
1256 SHADOW_PRINTK("failed to allocate shadow pages.\n");
1257 return -ENOMEM;
1259 d->arch.paging.shadow.free_pages += 1 << order;
1260 d->arch.paging.shadow.total_pages += 1 << order;
1261 for ( j = 0; j < 1U << order; j++ )
1263 sp[j].type = 0;
1264 sp[j].pinned = 0;
1265 sp[j].count = 0;
1266 sp[j].mbz = 0;
1267 sp[j].tlbflush_timestamp = 0; /* Not in any TLB */
1269 sp->order = order;
1270 list_add_tail(&sp->list, &d->arch.paging.shadow.freelists[order]);
1272 else if ( d->arch.paging.shadow.total_pages > pages )
1274 /* Need to return memory to domheap */
1275 _shadow_prealloc(d, order, 1);
1276 ASSERT(!list_empty(&d->arch.paging.shadow.freelists[order]));
1277 sp = list_entry(d->arch.paging.shadow.freelists[order].next,
1278 struct shadow_page_info, list);
1279 list_del(&sp->list);
1280 d->arch.paging.shadow.free_pages -= 1 << order;
1281 d->arch.paging.shadow.total_pages -= 1 << order;
1282 free_domheap_pages((struct page_info *)sp, order);
1285 /* Check to see if we need to yield and try again */
1286 if ( preempted && hypercall_preempt_check() )
1288 *preempted = 1;
1289 return 0;
1293 return 0;
1296 /* Return the size of the shadow pool, rounded up to the nearest MB */
1297 static unsigned int shadow_get_allocation(struct domain *d)
1299 unsigned int pg = d->arch.paging.shadow.total_pages;
1300 return ((pg >> (20 - PAGE_SHIFT))
1301 + ((pg & ((1 << (20 - PAGE_SHIFT)) - 1)) ? 1 : 0));
1304 /**************************************************************************/
1305 /* Hash table for storing the guest->shadow mappings.
1306 * The table itself is an array of pointers to shadows; the shadows are then
1307 * threaded on a singly-linked list of shadows with the same hash value */
1309 #define SHADOW_HASH_BUCKETS 251
1310 /* Other possibly useful primes are 509, 1021, 2039, 4093, 8191, 16381 */
1312 /* Hash function that takes a gfn or mfn, plus another byte of type info */
1313 typedef u32 key_t;
1314 static inline key_t sh_hash(unsigned long n, unsigned int t)
1316 unsigned char *p = (unsigned char *)&n;
1317 key_t k = t;
1318 int i;
1319 for ( i = 0; i < sizeof(n) ; i++ ) k = (u32)p[i] + (k<<6) + (k<<16) - k;
1320 return k % SHADOW_HASH_BUCKETS;
1323 #if SHADOW_AUDIT & (SHADOW_AUDIT_HASH|SHADOW_AUDIT_HASH_FULL)
1325 /* Before we get to the mechanism, define a pair of audit functions
1326 * that sanity-check the contents of the hash table. */
1327 static void sh_hash_audit_bucket(struct domain *d, int bucket)
1328 /* Audit one bucket of the hash table */
1330 struct shadow_page_info *sp, *x;
1332 if ( !(SHADOW_AUDIT_ENABLE) )
1333 return;
1335 sp = d->arch.paging.shadow.hash_table[bucket];
1336 while ( sp )
1338 /* Not a shadow? */
1339 BUG_ON( sp->mbz != 0 );
1340 /* Bogus type? */
1341 BUG_ON( sp->type == 0 );
1342 BUG_ON( sp->type > SH_type_max_shadow );
1343 /* Wrong bucket? */
1344 BUG_ON( sh_hash(sp->backpointer, sp->type) != bucket );
1345 /* Duplicate entry? */
1346 for ( x = sp->next_shadow; x; x = x->next_shadow )
1347 BUG_ON( x->backpointer == sp->backpointer && x->type == sp->type );
1348 /* Follow the backpointer to the guest pagetable */
1349 if ( sp->type != SH_type_fl1_32_shadow
1350 && sp->type != SH_type_fl1_pae_shadow
1351 && sp->type != SH_type_fl1_64_shadow )
1353 struct page_info *gpg = mfn_to_page(_mfn(sp->backpointer));
1354 /* Bad shadow flags on guest page? */
1355 BUG_ON( !(gpg->shadow_flags & (1<<sp->type)) );
1356 /* Bad type count on guest page? */
1357 if ( (gpg->u.inuse.type_info & PGT_type_mask) == PGT_writable_page
1358 && (gpg->u.inuse.type_info & PGT_count_mask) != 0 )
1360 SHADOW_ERROR("MFN %#lx shadowed (by %#"PRI_mfn")"
1361 " but has typecount %#lx\n",
1362 sp->backpointer, mfn_x(shadow_page_to_mfn(sp)),
1363 gpg->u.inuse.type_info);
1364 BUG();
1367 /* That entry was OK; on we go */
1368 sp = sp->next_shadow;
1372 #else
1373 #define sh_hash_audit_bucket(_d, _b) do {} while(0)
1374 #endif /* Hashtable bucket audit */
1377 #if SHADOW_AUDIT & SHADOW_AUDIT_HASH_FULL
1379 static void sh_hash_audit(struct domain *d)
1380 /* Full audit: audit every bucket in the table */
1382 int i;
1384 if ( !(SHADOW_AUDIT_ENABLE) )
1385 return;
1387 for ( i = 0; i < SHADOW_HASH_BUCKETS; i++ )
1389 sh_hash_audit_bucket(d, i);
1393 #else
1394 #define sh_hash_audit(_d) do {} while(0)
1395 #endif /* Hashtable bucket audit */
1397 /* Allocate and initialise the table itself.
1398 * Returns 0 for success, 1 for error. */
1399 static int shadow_hash_alloc(struct domain *d)
1401 struct shadow_page_info **table;
1403 ASSERT(shadow_locked_by_me(d));
1404 ASSERT(!d->arch.paging.shadow.hash_table);
1406 table = xmalloc_array(struct shadow_page_info *, SHADOW_HASH_BUCKETS);
1407 if ( !table ) return 1;
1408 memset(table, 0,
1409 SHADOW_HASH_BUCKETS * sizeof (struct shadow_page_info *));
1410 d->arch.paging.shadow.hash_table = table;
1411 return 0;
1414 /* Tear down the hash table and return all memory to Xen.
1415 * This function does not care whether the table is populated. */
1416 static void shadow_hash_teardown(struct domain *d)
1418 ASSERT(shadow_locked_by_me(d));
1419 ASSERT(d->arch.paging.shadow.hash_table);
1421 xfree(d->arch.paging.shadow.hash_table);
1422 d->arch.paging.shadow.hash_table = NULL;
1426 mfn_t shadow_hash_lookup(struct vcpu *v, unsigned long n, unsigned int t)
1427 /* Find an entry in the hash table. Returns the MFN of the shadow,
1428 * or INVALID_MFN if it doesn't exist */
1430 struct domain *d = v->domain;
1431 struct shadow_page_info *sp, *prev;
1432 key_t key;
1434 ASSERT(shadow_locked_by_me(d));
1435 ASSERT(d->arch.paging.shadow.hash_table);
1436 ASSERT(t);
1438 sh_hash_audit(d);
1440 perfc_incr(shadow_hash_lookups);
1441 key = sh_hash(n, t);
1442 sh_hash_audit_bucket(d, key);
1444 sp = d->arch.paging.shadow.hash_table[key];
1445 prev = NULL;
1446 while(sp)
1448 if ( sp->backpointer == n && sp->type == t )
1450 /* Pull-to-front if 'sp' isn't already the head item */
1451 if ( unlikely(sp != d->arch.paging.shadow.hash_table[key]) )
1453 if ( unlikely(d->arch.paging.shadow.hash_walking != 0) )
1454 /* Can't reorder: someone is walking the hash chains */
1455 return shadow_page_to_mfn(sp);
1456 else
1458 ASSERT(prev);
1459 /* Delete sp from the list */
1460 prev->next_shadow = sp->next_shadow;
1461 /* Re-insert it at the head of the list */
1462 sp->next_shadow = d->arch.paging.shadow.hash_table[key];
1463 d->arch.paging.shadow.hash_table[key] = sp;
1466 else
1468 perfc_incr(shadow_hash_lookup_head);
1470 return shadow_page_to_mfn(sp);
1472 prev = sp;
1473 sp = sp->next_shadow;
1476 perfc_incr(shadow_hash_lookup_miss);
1477 return _mfn(INVALID_MFN);
1480 void shadow_hash_insert(struct vcpu *v, unsigned long n, unsigned int t,
1481 mfn_t smfn)
1482 /* Put a mapping (n,t)->smfn into the hash table */
1484 struct domain *d = v->domain;
1485 struct shadow_page_info *sp;
1486 key_t key;
1488 ASSERT(shadow_locked_by_me(d));
1489 ASSERT(d->arch.paging.shadow.hash_table);
1490 ASSERT(t);
1492 sh_hash_audit(d);
1494 perfc_incr(shadow_hash_inserts);
1495 key = sh_hash(n, t);
1496 sh_hash_audit_bucket(d, key);
1498 /* Insert this shadow at the top of the bucket */
1499 sp = mfn_to_shadow_page(smfn);
1500 sp->next_shadow = d->arch.paging.shadow.hash_table[key];
1501 d->arch.paging.shadow.hash_table[key] = sp;
1503 sh_hash_audit_bucket(d, key);
1506 void shadow_hash_delete(struct vcpu *v, unsigned long n, unsigned int t,
1507 mfn_t smfn)
1508 /* Excise the mapping (n,t)->smfn from the hash table */
1510 struct domain *d = v->domain;
1511 struct shadow_page_info *sp, *x;
1512 key_t key;
1514 ASSERT(shadow_locked_by_me(d));
1515 ASSERT(d->arch.paging.shadow.hash_table);
1516 ASSERT(t);
1518 sh_hash_audit(d);
1520 perfc_incr(shadow_hash_deletes);
1521 key = sh_hash(n, t);
1522 sh_hash_audit_bucket(d, key);
1524 sp = mfn_to_shadow_page(smfn);
1525 if ( d->arch.paging.shadow.hash_table[key] == sp )
1526 /* Easy case: we're deleting the head item. */
1527 d->arch.paging.shadow.hash_table[key] = sp->next_shadow;
1528 else
1530 /* Need to search for the one we want */
1531 x = d->arch.paging.shadow.hash_table[key];
1532 while ( 1 )
1534 ASSERT(x); /* We can't have hit the end, since our target is
1535 * still in the chain somewhere... */
1536 if ( x->next_shadow == sp )
1538 x->next_shadow = sp->next_shadow;
1539 break;
1541 x = x->next_shadow;
1544 sp->next_shadow = NULL;
1546 sh_hash_audit_bucket(d, key);
1547 }
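/* Illustrative sketch, not part of this file: typical use of the hash
 * functions above, along the lines of the get/set_shadow_status helpers
 * in multi.c. The wrapper is hypothetical and omits promote/demote and
 * refcounting. */
#if 0 /* example only */
static mfn_t example_lookup_or_create(struct vcpu *v, mfn_t gmfn, u32 type)
{
    mfn_t smfn = shadow_hash_lookup(v, mfn_x(gmfn), type);

    if ( !mfn_valid(smfn) )
    {
        smfn = shadow_alloc(v->domain, type, mfn_x(gmfn));
        shadow_hash_insert(v, mfn_x(gmfn), type, smfn);
    }
    return smfn;
}
#endif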
1549 typedef int (*hash_callback_t)(struct vcpu *v, mfn_t smfn, mfn_t other_mfn);
1551 static void hash_foreach(struct vcpu *v,
1552 unsigned int callback_mask,
1553 hash_callback_t callbacks[],
1554 mfn_t callback_mfn)
1555 /* Walk the hash table looking at the types of the entries and
1556 * calling the appropriate callback function for each entry.
1557 * The mask determines which shadow types we call back for, and the array
1558 * of callbacks tells us which function to call.
1559 * Any callback may return non-zero to let us skip the rest of the scan.
1561 * WARNING: Callbacks MUST NOT add or remove hash entries unless they
1562 * then return non-zero to terminate the scan. */
1564 int i, done = 0;
1565 struct domain *d = v->domain;
1566 struct shadow_page_info *x;
1568 /* Say we're here, to stop hash-lookups reordering the chains */
1569 ASSERT(shadow_locked_by_me(d));
1570 ASSERT(d->arch.paging.shadow.hash_walking == 0);
1571 d->arch.paging.shadow.hash_walking = 1;
1573 for ( i = 0; i < SHADOW_HASH_BUCKETS; i++ )
1575 /* WARNING: This is not safe against changes to the hash table.
1576 * The callback *must* return non-zero if it has inserted or
1577 * deleted anything from the hash (lookups are OK, though). */
1578 for ( x = d->arch.paging.shadow.hash_table[i]; x; x = x->next_shadow )
1580 if ( callback_mask & (1 << x->type) )
1582 ASSERT(x->type <= 15);
1583 ASSERT(callbacks[x->type] != NULL);
1584 done = callbacks[x->type](v, shadow_page_to_mfn(x),
1585 callback_mfn);
1586 if ( done ) break;
1589 if ( done ) break;
1591 d->arch.paging.shadow.hash_walking = 0;
1595 /**************************************************************************/
1596 /* Destroy a shadow page: simple dispatcher to call the per-type destructor
1597 * which will decrement refcounts appropriately and return memory to the
1598 * free pool. */
1600 void sh_destroy_shadow(struct vcpu *v, mfn_t smfn)
1602 struct shadow_page_info *sp = mfn_to_shadow_page(smfn);
1603 unsigned int t = sp->type;
1606 SHADOW_PRINTK("smfn=%#lx\n", mfn_x(smfn));
1608 /* Double-check, if we can, that the shadowed page belongs to this
1609 * domain, (by following the back-pointer). */
1610 ASSERT(t == SH_type_fl1_32_shadow ||
1611 t == SH_type_fl1_pae_shadow ||
1612 t == SH_type_fl1_64_shadow ||
1613 t == SH_type_monitor_table ||
1614 (is_pv_32on64_vcpu(v) && t == SH_type_l4_64_shadow) ||
1615 (page_get_owner(mfn_to_page(_mfn(sp->backpointer)))
1616 == v->domain));
1618 /* The down-shifts here are so that the switch statement is on nice
1619 * small numbers that the compiler will enjoy */
1620 switch ( t )
1622 #if CONFIG_PAGING_LEVELS == 2
1623 case SH_type_l1_32_shadow:
1624 case SH_type_fl1_32_shadow:
1625 SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 2, 2)(v, smfn);
1626 break;
1627 case SH_type_l2_32_shadow:
1628 SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 2, 2)(v, smfn);
1629 break;
1630 #else /* PAE or 64bit */
1631 case SH_type_l1_32_shadow:
1632 case SH_type_fl1_32_shadow:
1633 SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 3, 2)(v, smfn);
1634 break;
1635 case SH_type_l2_32_shadow:
1636 SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 3, 2)(v, smfn);
1637 break;
1638 #endif
1640 #if CONFIG_PAGING_LEVELS >= 3
1641 case SH_type_l1_pae_shadow:
1642 case SH_type_fl1_pae_shadow:
1643 SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 3, 3)(v, smfn);
1644 break;
1645 case SH_type_l2_pae_shadow:
1646 case SH_type_l2h_pae_shadow:
1647 SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 3, 3)(v, smfn);
1648 break;
1649 #endif
1651 #if CONFIG_PAGING_LEVELS >= 4
1652 case SH_type_l1_64_shadow:
1653 case SH_type_fl1_64_shadow:
1654 SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 4, 4)(v, smfn);
1655 break;
1656 case SH_type_l2h_64_shadow:
1657 ASSERT(is_pv_32on64_vcpu(v));
1658 /* Fall through... */
1659 case SH_type_l2_64_shadow:
1660 SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 4, 4)(v, smfn);
1661 break;
1662 case SH_type_l3_64_shadow:
1663 SHADOW_INTERNAL_NAME(sh_destroy_l3_shadow, 4, 4)(v, smfn);
1664 break;
1665 case SH_type_l4_64_shadow:
1666 SHADOW_INTERNAL_NAME(sh_destroy_l4_shadow, 4, 4)(v, smfn);
1667 break;
1668 #endif
1669 default:
1670 SHADOW_ERROR("tried to destroy shadow of bad type %08lx\n",
1671 (unsigned long)t);
1672 BUG();
1676 /**************************************************************************/
1677 /* Remove all writeable mappings of a guest frame from the shadow tables
1678 * Returns non-zero if we need to flush TLBs.
1679 * level and fault_addr describe how we found this to be a pagetable;
1680 * level==0 means we have some other reason for revoking write access.*/
1682 int sh_remove_write_access(struct vcpu *v, mfn_t gmfn,
1683 unsigned int level,
1684 unsigned long fault_addr)
1686 /* Dispatch table for getting per-type functions */
1687 static hash_callback_t callbacks[SH_type_unused] = {
1688 NULL, /* none */
1689 #if CONFIG_PAGING_LEVELS == 2
1690 SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1,2,2), /* l1_32 */
1691 SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1,2,2), /* fl1_32 */
1692 #else
1693 SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1,3,2), /* l1_32 */
1694 SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1,3,2), /* fl1_32 */
1695 #endif
1696 NULL, /* l2_32 */
1697 #if CONFIG_PAGING_LEVELS >= 3
1698 SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1,3,3), /* l1_pae */
1699 SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1,3,3), /* fl1_pae */
1700 #else
1701 NULL, /* l1_pae */
1702 NULL, /* fl1_pae */
1703 #endif
1704 NULL, /* l2_pae */
1705 NULL, /* l2h_pae */
1706 #if CONFIG_PAGING_LEVELS >= 4
1707 SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1,4,4), /* l1_64 */
1708 SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1,4,4), /* fl1_64 */
1709 #else
1710 NULL, /* l1_64 */
1711 NULL, /* fl1_64 */
1712 #endif
1713 NULL, /* l2_64 */
1714 NULL, /* l2h_64 */
1715 NULL, /* l3_64 */
1716 NULL, /* l4_64 */
1717 NULL, /* p2m */
1718 NULL /* unused */
1719 };
1721 static unsigned int callback_mask =
1722 1 << SH_type_l1_32_shadow
1723 | 1 << SH_type_fl1_32_shadow
1724 | 1 << SH_type_l1_pae_shadow
1725 | 1 << SH_type_fl1_pae_shadow
1726 | 1 << SH_type_l1_64_shadow
1727 | 1 << SH_type_fl1_64_shadow
1729 struct page_info *pg = mfn_to_page(gmfn);
1731 ASSERT(shadow_locked_by_me(v->domain));
1733 /* Only remove writable mappings if we are doing shadow refcounts.
1734 * In guest refcounting, we trust Xen to already be restricting
1735 * all the writes to the guest page tables, so we do not need to
1736 * do more. */
1737 if ( !shadow_mode_refcounts(v->domain) )
1738 return 0;
1740 /* Early exit if it's already a pagetable, or otherwise not writeable */
1741 if ( sh_mfn_is_a_page_table(gmfn)
1742 || (pg->u.inuse.type_info & PGT_count_mask) == 0 )
1743 return 0;
1745 perfc_incr(shadow_writeable);
1747 /* If this isn't a "normal" writeable page, the domain is trying to
1748 * put pagetables in special memory of some kind. We can't allow that. */
1749 if ( (pg->u.inuse.type_info & PGT_type_mask) != PGT_writable_page )
1751 SHADOW_ERROR("can't remove write access to mfn %lx, type_info is %"
1752 PRtype_info "\n",
1753 mfn_x(gmfn), mfn_to_page(gmfn)->u.inuse.type_info);
1754 domain_crash(v->domain);
1757 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
1758 if ( v == current && level != 0 )
1760 unsigned long gfn;
1761 /* Heuristic: there is likely to be only one writeable mapping,
1762 * and that mapping is likely to be in the current pagetable,
1763 * in the guest's linear map (on non-HIGHPTE linux and windows)*/
1765 #define GUESS(_a, _h) do { \
1766 if ( v->arch.paging.mode->shadow.guess_wrmap(v, (_a), gmfn) ) \
1767 perfc_incr(shadow_writeable_h_ ## _h); \
1768 if ( (pg->u.inuse.type_info & PGT_count_mask) == 0 ) \
1769 return 1; \
1770 } while (0)
1773 if ( v->arch.paging.mode->guest_levels == 2 )
1775 if ( level == 1 )
1776 /* 32bit non-PAE w2k3: linear map at 0xC0000000 */
1777 GUESS(0xC0000000UL + (fault_addr >> 10), 1);
1779 /* Linux lowmem: first 896MB is mapped 1-to-1 above 0xC0000000 */
1780 if ((gfn = mfn_to_gfn(v->domain, gmfn)) < 0x38000 )
1781 GUESS(0xC0000000UL + (gfn << PAGE_SHIFT), 4);
1784 #if CONFIG_PAGING_LEVELS >= 3
1785 else if ( v->arch.paging.mode->guest_levels == 3 )
1787 /* 32bit PAE w2k3: linear map at 0xC0000000 */
1788 switch ( level )
1790 case 1: GUESS(0xC0000000UL + (fault_addr >> 9), 2); break;
1791 case 2: GUESS(0xC0600000UL + (fault_addr >> 18), 2); break;
1794 /* Linux lowmem: first 896MB is mapped 1-to-1 above 0xC0000000 */
1795 if ((gfn = mfn_to_gfn(v->domain, gmfn)) < 0x38000 )
1796 GUESS(0xC0000000UL + (gfn << PAGE_SHIFT), 4);
1798 #if CONFIG_PAGING_LEVELS >= 4
1799 else if ( v->arch.paging.mode->guest_levels == 4 )
1801 /* 64bit w2k3: linear map at 0xfffff68000000000 */
1802 switch ( level )
1804 case 1: GUESS(0xfffff68000000000UL
1805 + ((fault_addr & VADDR_MASK) >> 9), 3); break;
1806 case 2: GUESS(0xfffff6fb40000000UL
1807 + ((fault_addr & VADDR_MASK) >> 18), 3); break;
1808 case 3: GUESS(0xfffff6fb7da00000UL
1809 + ((fault_addr & VADDR_MASK) >> 27), 3); break;
1812 /* 64bit Linux direct map at 0xffff810000000000; older kernels
1813 * had it at 0x0000010000000000UL */
1814 gfn = mfn_to_gfn(v->domain, gmfn);
1815 GUESS(0xffff810000000000UL + (gfn << PAGE_SHIFT), 4);
1816 GUESS(0x0000010000000000UL + (gfn << PAGE_SHIFT), 4);
1818 #endif /* CONFIG_PAGING_LEVELS >= 4 */
1819 #endif /* CONFIG_PAGING_LEVELS >= 3 */
1821 #undef GUESS
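/* [Worked example added for clarity; the constants come from the code above.
 * The shift amounts are PTE-size arithmetic: a 4-byte non-PAE PTE covering a
 * 4kB page sits at linear_base + (fault_addr >> 12) * 4, i.e. >> 10; an
 * 8-byte PAE/64-bit l1 entry gives >> 9; an 8-byte l2 entry covering 2MB
 * gives >> 18; an 8-byte l3 entry covering 1GB gives >> 27.  For instance a
 * non-PAE level-1 fault at 0x00401000 yields the guess
 *     0xC0000000UL + (0x00401000 >> 10) == 0xC0001004
 * i.e. the linear-map slot of the PTE that maps the faulting address.] */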
1824 if ( (pg->u.inuse.type_info & PGT_count_mask) == 0 )
1825 return 1;
1827 /* Second heuristic: on HIGHPTE linux, there are two particular PTEs
1828 * (entries in the fixmap) where linux maps its pagetables. Since
1829 * we expect to hit them most of the time, we start the search for
1830 * the writeable mapping by looking at the same MFN where the last
1831 * brute-force search succeeded. */
1833 if ( v->arch.paging.shadow.last_writeable_pte_smfn != 0 )
1835 unsigned long old_count = (pg->u.inuse.type_info & PGT_count_mask);
1836 mfn_t last_smfn = _mfn(v->arch.paging.shadow.last_writeable_pte_smfn);
1837 int shtype = mfn_to_shadow_page(last_smfn)->type;
1839 if ( callbacks[shtype] )
1840 callbacks[shtype](v, last_smfn, gmfn);
1842 if ( (pg->u.inuse.type_info & PGT_count_mask) != old_count )
1843 perfc_incr(shadow_writeable_h_5);
1846 if ( (pg->u.inuse.type_info & PGT_count_mask) == 0 )
1847 return 1;
1849 #endif /* SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC */
1851 /* Brute-force search of all the shadows, by walking the hash */
1852 perfc_incr(shadow_writeable_bf);
1853 hash_foreach(v, callback_mask, callbacks, gmfn);
1855 /* If that didn't catch the mapping, then there's some non-pagetable
1856 * mapping -- ioreq page, grant mapping, &c. */
1857 if ( (mfn_to_page(gmfn)->u.inuse.type_info & PGT_count_mask) != 0 )
1859 SHADOW_ERROR("can't remove write access to mfn %lx: guest has "
1860 "%lu special-use mappings of it\n", mfn_x(gmfn),
1861 (mfn_to_page(gmfn)->u.inuse.type_info&PGT_count_mask));
1862 domain_crash(v->domain);
1865 /* We killed at least one writeable mapping, so must flush TLBs. */
1866 return 1;
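/* [Illustrative sketch, not a caller from this file: since a non-zero return
 * means at least one writable PTE was zapped, a typical caller revoking
 * write access before shadowing a page would be expected to follow up with
 *
 *     if ( sh_remove_write_access(v, gmfn, 1, fault_addr) )
 *         flush_tlb_mask(v->domain->domain_dirty_cpumask);
 *
 * the TLB flush is the caller's job, not this function's.] */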
1871 /**************************************************************************/
1872 /* Remove all mappings of a guest frame from the shadow tables.
1873 * Returns non-zero if we need to flush TLBs. */
1875 int sh_remove_all_mappings(struct vcpu *v, mfn_t gmfn)
1877 struct page_info *page = mfn_to_page(gmfn);
1878 int expected_count, do_locking;
1880 /* Dispatch table for getting per-type functions */
1881 static hash_callback_t callbacks[SH_type_unused] = {
1882 NULL, /* none */
1883 #if CONFIG_PAGING_LEVELS == 2
1884 SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1,2,2), /* l1_32 */
1885 SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1,2,2), /* fl1_32 */
1886 #else
1887 SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1,3,2), /* l1_32 */
1888 SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1,3,2), /* fl1_32 */
1889 #endif
1890 NULL, /* l2_32 */
1891 #if CONFIG_PAGING_LEVELS >= 3
1892 SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1,3,3), /* l1_pae */
1893 SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1,3,3), /* fl1_pae */
1894 #else
1895 NULL, /* l1_pae */
1896 NULL, /* fl1_pae */
1897 #endif
1898 NULL, /* l2_pae */
1899 NULL, /* l2h_pae */
1900 #if CONFIG_PAGING_LEVELS >= 4
1901 SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1,4,4), /* l1_64 */
1902 SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1,4,4), /* fl1_64 */
1903 #else
1904 NULL, /* l1_64 */
1905 NULL, /* fl1_64 */
1906 #endif
1907 NULL, /* l2_64 */
1908 NULL, /* l2h_64 */
1909 NULL, /* l3_64 */
1910 NULL, /* l4_64 */
1911 NULL, /* p2m */
1912 NULL /* unused */
1913 };
1915 static unsigned int callback_mask =
1916 1 << SH_type_l1_32_shadow
1917 | 1 << SH_type_fl1_32_shadow
1918 | 1 << SH_type_l1_pae_shadow
1919 | 1 << SH_type_fl1_pae_shadow
1920 | 1 << SH_type_l1_64_shadow
1921 | 1 << SH_type_fl1_64_shadow
1924 perfc_incr(shadow_mappings);
1925 if ( (page->count_info & PGC_count_mask) == 0 )
1926 return 0;
1928 /* Although this is an externally visible function, we do not know
1929 * whether the shadow lock will be held when it is called (since it
1930 * can be called via put_page_type when we clear a shadow l1e).
1931 * If the lock isn't held, take it for the duration of the call. */
1932 do_locking = !shadow_locked_by_me(v->domain);
1933 if ( do_locking ) shadow_lock(v->domain);
1935 /* XXX TODO:
1936 * Heuristics for finding the (probably) single mapping of this gmfn */
1938 /* Brute-force search of all the shadows, by walking the hash */
1939 perfc_incr(shadow_mappings_bf);
1940 hash_foreach(v, callback_mask, callbacks, gmfn);
1942 /* If that didn't catch the mapping, something is very wrong */
1943 expected_count = (page->count_info & PGC_allocated) ? 1 : 0;
1944 if ( (page->count_info & PGC_count_mask) != expected_count )
1946 /* Don't complain if we're in HVM and there are some extra mappings:
1947 * The qemu helper process has an untyped mapping of this dom's RAM
1948 * and the HVM restore program takes another. */
1949 if ( !(shadow_mode_external(v->domain)
1950 && (page->count_info & PGC_count_mask) <= 3
1951 && (page->u.inuse.type_info & PGT_count_mask) == 0) )
1953 SHADOW_ERROR("can't find all mappings of mfn %lx: "
1954 "c=%08x t=%08lx\n", mfn_x(gmfn),
1955 page->count_info, page->u.inuse.type_info);
1959 if ( do_locking ) shadow_unlock(v->domain);
1961 /* We killed at least one mapping, so must flush TLBs. */
1962 return 1;
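/* [Added cross-reference: shadow_write_p2m_entry() later in this file shows
 * the intended calling pattern for the function above --
 *
 *     if ( sh_remove_all_mappings(v, mfn) )
 *         flush_tlb_mask(d->domain_dirty_cpumask);
 *
 * -- as with write-access removal, the TLB flush is left to the caller.] */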
1966 /**************************************************************************/
1967 /* Remove all shadows of a guest frame from the shadow tables */
1969 static int sh_remove_shadow_via_pointer(struct vcpu *v, mfn_t smfn)
1970 /* Follow this shadow's up-pointer, if it has one, and remove the reference
1971 * found there. Returns 1 if that was the only reference to this shadow */
1973 struct shadow_page_info *sp = mfn_to_shadow_page(smfn);
1974 mfn_t pmfn;
1975 void *vaddr;
1976 int rc;
1978 ASSERT(sp->type > 0);
1979 ASSERT(sp->type < SH_type_max_shadow);
1980 ASSERT(sp->type != SH_type_l2_32_shadow);
1981 ASSERT(sp->type != SH_type_l2_pae_shadow);
1982 ASSERT(sp->type != SH_type_l2h_pae_shadow);
1983 ASSERT(sp->type != SH_type_l4_64_shadow);
1985 if (sp->up == 0) return 0;
1986 pmfn = _mfn(sp->up >> PAGE_SHIFT);
1987 ASSERT(mfn_valid(pmfn));
1988 vaddr = sh_map_domain_page(pmfn);
1989 ASSERT(vaddr);
1990 vaddr += sp->up & (PAGE_SIZE-1);
1991 ASSERT(l1e_get_pfn(*(l1_pgentry_t *)vaddr) == mfn_x(smfn));
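/* [Added explanatory sketch: sp->up holds the machine address of the single
 * shadow entry that references this shadow, i.e. roughly
 *
 *     sp->up = (mfn_x(parent_smfn) << PAGE_SHIFT) | entry_offset;
 *
 * which is why the code above recovers the parent MFN with >> PAGE_SHIFT and
 * the in-page offset with & (PAGE_SIZE-1).  How the field is maintained is
 * not shown here.] */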
1993 /* Is this the only reference to this shadow? */
1994 rc = (sp->count == 1) ? 1 : 0;
1996 /* Blank the offending entry */
1997 switch (sp->type)
1999 case SH_type_l1_32_shadow:
2000 case SH_type_l2_32_shadow:
2001 #if CONFIG_PAGING_LEVELS == 2
2002 SHADOW_INTERNAL_NAME(sh_clear_shadow_entry,2,2)(v, vaddr, pmfn);
2003 #else
2004 SHADOW_INTERNAL_NAME(sh_clear_shadow_entry,3,2)(v, vaddr, pmfn);
2005 #endif
2006 break;
2007 #if CONFIG_PAGING_LEVELS >=3
2008 case SH_type_l1_pae_shadow:
2009 case SH_type_l2_pae_shadow:
2010 case SH_type_l2h_pae_shadow:
2011 SHADOW_INTERNAL_NAME(sh_clear_shadow_entry,3,3)(v, vaddr, pmfn);
2012 break;
2013 #if CONFIG_PAGING_LEVELS >= 4
2014 case SH_type_l1_64_shadow:
2015 case SH_type_l2_64_shadow:
2016 case SH_type_l2h_64_shadow:
2017 case SH_type_l3_64_shadow:
2018 case SH_type_l4_64_shadow:
2019 SHADOW_INTERNAL_NAME(sh_clear_shadow_entry,4,4)(v, vaddr, pmfn);
2020 break;
2021 #endif
2022 #endif
2023 default: BUG(); /* Some weird unknown shadow type */
2026 sh_unmap_domain_page(vaddr);
2027 if ( rc )
2028 perfc_incr(shadow_up_pointer);
2029 else
2030 perfc_incr(shadow_unshadow_bf);
2032 return rc;
2035 void sh_remove_shadows(struct vcpu *v, mfn_t gmfn, int fast, int all)
2036 /* Remove the shadows of this guest page.
2037 * If fast != 0, just try the quick heuristic, which will remove
2038 * at most one reference to each shadow of the page. Otherwise, walk
2039 * all the shadow tables looking for refs to shadows of this gmfn.
2040 * If all != 0, kill the domain if we can't find all the shadows.
2041 * (all != 0 implies fast == 0)
2042 */
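/* [Illustrative note derived from the comment above: the two supported call
 * patterns are
 *
 *     sh_remove_shadows(v, gmfn, 1, 0);  // opportunistic: heuristic only
 *     sh_remove_shadows(v, gmfn, 0, 1);  // thorough: must find everything,
 *                                        // else the domain is crashed
 *
 * sh_remove_all_shadows_and_parents() below uses the thorough form.] */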
2044 struct page_info *pg = mfn_to_page(gmfn);
2045 mfn_t smfn;
2046 int do_locking;
2047 unsigned char t;
2049 /* Dispatch table for getting per-type functions: each level must
2050 * be called with the function to remove a lower-level shadow. */
2051 static hash_callback_t callbacks[SH_type_unused] = {
2052 NULL, /* none */
2053 NULL, /* l1_32 */
2054 NULL, /* fl1_32 */
2055 #if CONFIG_PAGING_LEVELS == 2
2056 SHADOW_INTERNAL_NAME(sh_remove_l1_shadow,2,2), /* l2_32 */
2057 #else
2058 SHADOW_INTERNAL_NAME(sh_remove_l1_shadow,3,2), /* l2_32 */
2059 #endif
2060 NULL, /* l1_pae */
2061 NULL, /* fl1_pae */
2062 #if CONFIG_PAGING_LEVELS >= 3
2063 SHADOW_INTERNAL_NAME(sh_remove_l1_shadow,3,3), /* l2_pae */
2064 SHADOW_INTERNAL_NAME(sh_remove_l1_shadow,3,3), /* l2h_pae */
2065 #else
2066 NULL, /* l2_pae */
2067 NULL, /* l2h_pae */
2068 #endif
2069 NULL, /* l1_64 */
2070 NULL, /* fl1_64 */
2071 #if CONFIG_PAGING_LEVELS >= 4
2072 SHADOW_INTERNAL_NAME(sh_remove_l1_shadow,4,4), /* l2_64 */
2073 SHADOW_INTERNAL_NAME(sh_remove_l1_shadow,4,4), /* l2h_64 */
2074 SHADOW_INTERNAL_NAME(sh_remove_l2_shadow,4,4), /* l3_64 */
2075 SHADOW_INTERNAL_NAME(sh_remove_l3_shadow,4,4), /* l4_64 */
2076 #else
2077 NULL, /* l2_64 */
2078 NULL, /* l2h_64 */
2079 NULL, /* l3_64 */
2080 NULL, /* l4_64 */
2081 #endif
2082 NULL, /* p2m */
2083 NULL /* unused */
2084 };
2086 /* Another lookup table, for choosing which mask to use */
2087 static unsigned int masks[SH_type_unused] = {
2088 0, /* none */
2089 1 << SH_type_l2_32_shadow, /* l1_32 */
2090 0, /* fl1_32 */
2091 0, /* l2_32 */
2092 ((1 << SH_type_l2h_pae_shadow)
2093 | (1 << SH_type_l2_pae_shadow)), /* l1_pae */
2094 0, /* fl1_pae */
2095 0, /* l2_pae */
2096 0, /* l2h_pae */
2097 ((1 << SH_type_l2h_64_shadow)
2098 | (1 << SH_type_l2_64_shadow)), /* l1_64 */
2099 0, /* fl1_64 */
2100 1 << SH_type_l3_64_shadow, /* l2_64 */
2101 1 << SH_type_l3_64_shadow, /* l2h_64 */
2102 1 << SH_type_l4_64_shadow, /* l3_64 */
2103 0, /* l4_64 */
2104 0, /* p2m */
2105 0 /* unused */
2106 };
2108 ASSERT(!(all && fast));
2110 /* Although this is an externally visible function, we do not know
2111 * whether the shadow lock will be held when it is called (since it
2112 * can be called via put_page_type when we clear a shadow l1e).
2113 * If the lock isn't held, take it for the duration of the call. */
2114 do_locking = !shadow_locked_by_me(v->domain);
2115 if ( do_locking ) shadow_lock(v->domain);
2117 SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx\n",
2118 v->domain->domain_id, v->vcpu_id, mfn_x(gmfn));
2120 /* Bail out now if the page is not shadowed */
2121 if ( (pg->count_info & PGC_page_table) == 0 )
2123 if ( do_locking ) shadow_unlock(v->domain);
2124 return;
2127 /* Search for this shadow in all appropriate shadows */
2128 perfc_incr(shadow_unshadow);
2130 /* Lower-level shadows need to be excised from upper-level shadows.
2131 * This call to hash_foreach() looks dangerous but is in fact OK: each
2132 * call will remove at most one shadow, and terminate immediately when
2133 * it does remove it, so we never walk the hash after doing a deletion. */
2134 #define DO_UNSHADOW(_type) do { \
2135 t = (_type); \
2136 if( !(pg->count_info & PGC_page_table) \
2137 || !(pg->shadow_flags & (1 << t)) ) \
2138 break; \
2139 smfn = shadow_hash_lookup(v, mfn_x(gmfn), t); \
2140 if ( unlikely(!mfn_valid(smfn)) ) \
2141 { \
2142 SHADOW_ERROR(": gmfn %#lx has flags 0x%"PRIx32 \
2143 " but no type-0x%"PRIx32" shadow\n", \
2144 mfn_x(gmfn), (uint32_t)pg->shadow_flags, t); \
2145 break; \
2146 } \
2147 if ( sh_type_is_pinnable(v, t) ) \
2148 sh_unpin(v, smfn); \
2149 else \
2150 sh_remove_shadow_via_pointer(v, smfn); \
2151 if( !fast \
2152 && (pg->count_info & PGC_page_table) \
2153 && (pg->shadow_flags & (1 << t)) ) \
2154 hash_foreach(v, masks[t], callbacks, smfn); \
2155 } while (0)
2157 DO_UNSHADOW(SH_type_l2_32_shadow);
2158 DO_UNSHADOW(SH_type_l1_32_shadow);
2159 #if CONFIG_PAGING_LEVELS >= 3
2160 DO_UNSHADOW(SH_type_l2h_pae_shadow);
2161 DO_UNSHADOW(SH_type_l2_pae_shadow);
2162 DO_UNSHADOW(SH_type_l1_pae_shadow);
2163 #if CONFIG_PAGING_LEVELS >= 4
2164 DO_UNSHADOW(SH_type_l4_64_shadow);
2165 DO_UNSHADOW(SH_type_l3_64_shadow);
2166 DO_UNSHADOW(SH_type_l2h_64_shadow);
2167 DO_UNSHADOW(SH_type_l2_64_shadow);
2168 DO_UNSHADOW(SH_type_l1_64_shadow);
2169 #endif
2170 #endif
2172 #undef DO_UNSHADOW
2174 /* If that didn't catch the shadows, something is wrong */
2175 if ( !fast && all && (pg->count_info & PGC_page_table) )
2177 SHADOW_ERROR("can't find all shadows of mfn %05lx "
2178 "(shadow_flags=%08lx)\n",
2179 mfn_x(gmfn), pg->shadow_flags);
2180 domain_crash(v->domain);
2183 /* Need to flush TLBs now, so that linear maps are safe next time we
2184 * take a fault. */
2185 flush_tlb_mask(v->domain->domain_dirty_cpumask);
2187 if ( do_locking ) shadow_unlock(v->domain);
2190 static void
2191 sh_remove_all_shadows_and_parents(struct vcpu *v, mfn_t gmfn)
2192 /* Even harsher: this is an HVM page that we think is no longer a pagetable.
2193 * Unshadow it, and recursively unshadow pages that reference it. */
2195 sh_remove_shadows(v, gmfn, 0, 1);
2196 /* XXX TODO:
2197 * Rework this hashtable walker to return a linked-list of all
2198 * the shadows it modified, then do breadth-first recursion
2199 * to find the way up to higher-level tables and unshadow them too.
2201 * The current code (just tearing down each page's shadows as we
2202 * detect that it is not a pagetable) is correct, but very slow.
2203 * It means extra emulated writes and slows down removal of mappings. */
2206 /**************************************************************************/
2208 static void sh_update_paging_modes(struct vcpu *v)
2210 struct domain *d = v->domain;
2211 struct paging_mode *old_mode = v->arch.paging.mode;
2213 ASSERT(shadow_locked_by_me(d));
2215 #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
2216 /* Make sure this vcpu has a virtual TLB array allocated */
2217 if ( unlikely(!v->arch.paging.vtlb) )
2219 v->arch.paging.vtlb = xmalloc_array(struct shadow_vtlb, VTLB_ENTRIES);
2220 if ( unlikely(!v->arch.paging.vtlb) )
2222 SHADOW_ERROR("Could not allocate vTLB space for dom %u vcpu %u\n",
2223 d->domain_id, v->vcpu_id);
2224 domain_crash(v->domain);
2225 return;
2227 memset(v->arch.paging.vtlb, 0,
2228 VTLB_ENTRIES * sizeof (struct shadow_vtlb));
2229 spin_lock_init(&v->arch.paging.vtlb_lock);
2231 #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */
2233 // Valid transitions handled by this function:
2234 // - For PV guests:
2235 // - after a shadow mode has been changed
2236 // - For HVM guests:
2237 // - after a shadow mode has been changed
2238 // - changes in CR0.PG, CR4.PAE, CR4.PSE, or CR4.PGE
2239 //
2241 // First, tear down any old shadow tables held by this vcpu.
2242 //
2243 if ( v->arch.paging.mode )
2244 v->arch.paging.mode->shadow.detach_old_tables(v);
2246 if ( !is_hvm_domain(d) )
2248 ///
2249 /// PV guest
2250 ///
2251 #if CONFIG_PAGING_LEVELS == 4
2252 v->arch.paging.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,4,4);
2253 #elif CONFIG_PAGING_LEVELS == 3
2254 v->arch.paging.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,3,3);
2255 #elif CONFIG_PAGING_LEVELS == 2
2256 v->arch.paging.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,2,2);
2257 #else
2258 #error unexpected paging mode
2259 #endif
2261 else
2263 ///
2264 /// HVM guest
2265 ///
2266 ASSERT(shadow_mode_translate(d));
2267 ASSERT(shadow_mode_external(d));
2269 if ( !hvm_paging_enabled(v) )
2271 /* When the guest has CR0.PG clear, we provide a 32-bit, non-PAE
2272 * pagetable for it, mapping 4 GB one-to-one using a single l2
2273 * page of 1024 superpage mappings */
2274 v->arch.guest_table = d->arch.paging.shadow.unpaged_pagetable;
2275 #if CONFIG_PAGING_LEVELS >= 3
2276 v->arch.paging.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode, 3, 2);
2277 #else
2278 v->arch.paging.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode, 2, 2);
2279 #endif
2281 else
2283 #ifdef __x86_64__
2284 if ( hvm_long_mode_enabled(v) )
2286 // long mode guest...
2287 v->arch.paging.mode =
2288 &SHADOW_INTERNAL_NAME(sh_paging_mode, 4, 4);
2290 else
2291 #endif
2292 if ( hvm_pae_enabled(v) )
2294 #if CONFIG_PAGING_LEVELS >= 3
2295 // 32-bit PAE mode guest...
2296 v->arch.paging.mode =
2297 &SHADOW_INTERNAL_NAME(sh_paging_mode, 3, 3);
2298 #else
2299 SHADOW_ERROR("PAE not supported in 32-bit Xen\n");
2300 domain_crash(d);
2301 return;
2302 #endif
2304 else
2306 // 32-bit 2 level guest...
2307 #if CONFIG_PAGING_LEVELS >= 3
2308 v->arch.paging.mode =
2309 &SHADOW_INTERNAL_NAME(sh_paging_mode, 3, 2);
2310 #else
2311 v->arch.paging.mode =
2312 &SHADOW_INTERNAL_NAME(sh_paging_mode, 2, 2);
2313 #endif
2317 if ( pagetable_is_null(v->arch.monitor_table) )
2319 mfn_t mmfn = v->arch.paging.mode->shadow.make_monitor_table(v);
2320 v->arch.monitor_table = pagetable_from_mfn(mmfn);
2321 make_cr3(v, mfn_x(mmfn));
2322 hvm_update_host_cr3(v);
2325 if ( v->arch.paging.mode != old_mode )
2327 SHADOW_PRINTK("new paging mode: d=%u v=%u pe=%d g=%u s=%u "
2328 "(was g=%u s=%u)\n",
2329 d->domain_id, v->vcpu_id,
2330 is_hvm_domain(d) ? hvm_paging_enabled(v) : 1,
2331 v->arch.paging.mode->guest_levels,
2332 v->arch.paging.mode->shadow.shadow_levels,
2333 old_mode ? old_mode->guest_levels : 0,
2334 old_mode ? old_mode->shadow.shadow_levels : 0);
2335 if ( old_mode &&
2336 (v->arch.paging.mode->shadow.shadow_levels !=
2337 old_mode->shadow.shadow_levels) )
2339 /* Need to make a new monitor table for the new mode */
2340 mfn_t new_mfn, old_mfn;
2342 if ( v != current && vcpu_runnable(v) )
2344 SHADOW_ERROR("Some third party (d=%u v=%u) is changing "
2345 "this HVM vcpu's (d=%u v=%u) paging mode "
2346 "while it is running.\n",
2347 current->domain->domain_id, current->vcpu_id,
2348 v->domain->domain_id, v->vcpu_id);
2349 /* It's not safe to do that because we can't change
2350 * the host CR3 for a running domain */
2351 domain_crash(v->domain);
2352 return;
2355 old_mfn = pagetable_get_mfn(v->arch.monitor_table);
2356 v->arch.monitor_table = pagetable_null();
2357 new_mfn = v->arch.paging.mode->shadow.make_monitor_table(v);
2358 v->arch.monitor_table = pagetable_from_mfn(new_mfn);
2359 SHADOW_PRINTK("new monitor table %"PRI_mfn "\n",
2360 mfn_x(new_mfn));
2362 /* Don't be running on the old monitor table when we
2363 * pull it down! Switch CR3, and warn the HVM code that
2364 * its host cr3 has changed. */
2365 make_cr3(v, mfn_x(new_mfn));
2366 if ( v == current )
2367 write_ptbase(v);
2368 hvm_update_host_cr3(v);
2369 old_mode->shadow.destroy_monitor_table(v, old_mfn);
2373 // XXX -- Need to deal with changes in CR4.PSE and CR4.PGE.
2374 // These are HARD: think about the case where two CPUs have
2375 // different values for CR4.PSE and CR4.PGE at the same time.
2376 // This *does* happen, at least for CR4.PGE...
2379 v->arch.paging.mode->update_cr3(v, 0);
2382 void shadow_update_paging_modes(struct vcpu *v)
2384 shadow_lock(v->domain);
2385 sh_update_paging_modes(v);
2386 shadow_unlock(v->domain);
2389 /**************************************************************************/
2390 /* Turning on and off shadow features */
2392 static void sh_new_mode(struct domain *d, u32 new_mode)
2393 /* Inform all the vcpus that the shadow mode has been changed */
2395 struct vcpu *v;
2397 ASSERT(shadow_locked_by_me(d));
2398 ASSERT(d != current->domain);
2399 d->arch.paging.mode = new_mode;
2400 for_each_vcpu(d, v)
2401 sh_update_paging_modes(v);
2404 int shadow_enable(struct domain *d, u32 mode)
2405 /* Turn on "permanent" shadow features: external, translate, refcount.
2406 * Can only be called once on a domain, and these features cannot be
2407 * disabled.
2408 * Returns 0 for success, -errno for failure. */
2410 unsigned int old_pages;
2411 struct page_info *pg = NULL;
2412 uint32_t *e;
2413 int i, rv = 0;
2415 mode |= PG_SH_enable;
2417 domain_pause(d);
2419 /* Sanity check the arguments */
2420 if ( (d == current->domain) ||
2421 shadow_mode_enabled(d) ||
2422 ((mode & PG_translate) && !(mode & PG_refcounts)) ||
2423 ((mode & PG_external) && !(mode & PG_translate)) )
2425 rv = -EINVAL;
2426 goto out_unlocked;
2429 /* Init the shadow memory allocation if the user hasn't done so */
2430 old_pages = d->arch.paging.shadow.total_pages;
2431 if ( old_pages == 0 )
2433 unsigned int r;
2434 shadow_lock(d);
2435 r = sh_set_allocation(d, 256, NULL); /* Use at least 1MB */
2436 if ( r != 0 )
2438 sh_set_allocation(d, 0, NULL);
2439 rv = -ENOMEM;
2440 goto out_locked;
2442 shadow_unlock(d);
2445 /* Init the P2M table. Must be done before we take the shadow lock
2446 * to avoid possible deadlock. */
2447 if ( mode & PG_translate )
2449 rv = p2m_alloc_table(d, shadow_alloc_p2m_page, shadow_free_p2m_page);
2450 if (rv != 0)
2451 goto out_unlocked;
2454 /* HVM domains need an extra pagetable for vcpus that think they
2455 * have paging disabled */
2456 if ( is_hvm_domain(d) )
2458 /* Get a single page from the shadow pool. Take it via the
2459 * P2M interface to make freeing it simpler afterwards. */
2460 pg = shadow_alloc_p2m_page(d);
2461 if ( pg == NULL )
2463 rv = -ENOMEM;
2464 goto out_unlocked;
2466 /* Fill it with 32-bit, non-PAE superpage entries, each mapping 4MB
2467 * of virtual address space onto the same physical address range */
2468 e = sh_map_domain_page(page_to_mfn(pg));
2469 for ( i = 0; i < PAGE_SIZE / sizeof(*e); i++ )
2470 e[i] = ((0x400000U * i)
2471 | _PAGE_PRESENT | _PAGE_RW | _PAGE_USER
2472 | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
2473 sh_unmap_domain_page(e);
2474 pg->u.inuse.type_info = PGT_l2_page_table | 1 | PGT_validated;
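/* [Worked example added for clarity, using the standard x86 flag values
 * (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_PSE
 * == 0xe7): the loop above sets, for instance,
 *
 *     e[1] = 0x00400000 | 0xe7;
 *
 * a 4MB PSE superpage mapping virtual 0x00400000-0x007fffff one-to-one onto
 * the same guest-physical range, and similarly for the other 1023 slots.] */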
2477 shadow_lock(d);
2479 /* Sanity check again with the lock held */
2480 if ( shadow_mode_enabled(d) )
2482 rv = -EINVAL;
2483 goto out_locked;
2486 /* Init the hash table */
2487 if ( shadow_hash_alloc(d) != 0 )
2489 rv = -ENOMEM;
2490 goto out_locked;
2493 #if (SHADOW_OPTIMIZATIONS & SHOPT_LINUX_L3_TOPLEVEL)
2494 /* We assume we're dealing with an older 64bit linux guest until we
2495 * see the guest use more than one l4 per vcpu. */
2496 d->arch.paging.shadow.opt_flags = SHOPT_LINUX_L3_TOPLEVEL;
2497 #endif
2499 /* Record the 1-to-1 pagetable we just made */
2500 if ( is_hvm_domain(d) )
2501 d->arch.paging.shadow.unpaged_pagetable = pagetable_from_page(pg);
2503 /* Update the bits */
2504 sh_new_mode(d, mode);
2506 out_locked:
2507 shadow_unlock(d);
2508 out_unlocked:
2509 if ( rv != 0 && !pagetable_is_null(d->arch.phys_table) )
2510 p2m_teardown(d);
2511 if ( rv != 0 && pg != NULL )
2512 shadow_free_p2m_page(d, pg);
2513 domain_unpause(d);
2514 return rv;
2517 void shadow_teardown(struct domain *d)
2518 /* Destroy the shadow pagetables of this domain and free its shadow memory.
2519 * Should only be called for dying domains. */
2521 struct vcpu *v;
2522 mfn_t mfn;
2523 struct list_head *entry, *n;
2524 struct page_info *pg;
2526 ASSERT(d->is_dying);
2527 ASSERT(d != current->domain);
2529 if ( !shadow_locked_by_me(d) )
2530 shadow_lock(d); /* Keep various asserts happy */
2532 if ( shadow_mode_enabled(d) )
2534 /* Release the shadow and monitor tables held by each vcpu */
2535 for_each_vcpu(d, v)
2537 if ( v->arch.paging.mode )
2539 v->arch.paging.mode->shadow.detach_old_tables(v);
2540 if ( shadow_mode_external(d) )
2542 mfn = pagetable_get_mfn(v->arch.monitor_table);
2543 if ( mfn_valid(mfn) && (mfn_x(mfn) != 0) )
2544 v->arch.paging.mode->shadow.destroy_monitor_table(v, mfn);
2545 v->arch.monitor_table = pagetable_null();
2551 #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
2552 /* Free the virtual-TLB array attached to each vcpu */
2553 for_each_vcpu(d, v)
2555 if ( v->arch.paging.vtlb )
2557 xfree(v->arch.paging.vtlb);
2558 v->arch.paging.vtlb = NULL;
2561 #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */
2563 list_for_each_safe(entry, n, &d->arch.paging.shadow.p2m_freelist)
2565 list_del(entry);
2566 pg = list_entry(entry, struct page_info, list);
2567 shadow_free_p2m_page(d, pg);
2570 if ( d->arch.paging.shadow.total_pages != 0 )
2572 SHADOW_PRINTK("teardown of domain %u starts."
2573 " Shadow pages total = %u, free = %u, p2m=%u\n",
2574 d->domain_id,
2575 d->arch.paging.shadow.total_pages,
2576 d->arch.paging.shadow.free_pages,
2577 d->arch.paging.shadow.p2m_pages);
2578 /* Destroy all the shadows and release memory to domheap */
2579 sh_set_allocation(d, 0, NULL);
2580 /* Release the hash table back to xenheap */
2581 if (d->arch.paging.shadow.hash_table)
2582 shadow_hash_teardown(d);
2583 /* Should not have any more memory held */
2584 SHADOW_PRINTK("teardown done."
2585 " Shadow pages total = %u, free = %u, p2m=%u\n",
2586 d->arch.paging.shadow.total_pages,
2587 d->arch.paging.shadow.free_pages,
2588 d->arch.paging.shadow.p2m_pages);
2589 ASSERT(d->arch.paging.shadow.total_pages == 0);
2592 /* Free the non-paged-vcpus pagetable; must happen after we've
2593 * destroyed any shadows of it or sh_destroy_shadow will get confused. */
2594 if ( !pagetable_is_null(d->arch.paging.shadow.unpaged_pagetable) )
2596 for_each_vcpu(d, v)
2598 ASSERT(is_hvm_vcpu(v));
2599 if ( !hvm_paging_enabled(v) )
2600 v->arch.guest_table = pagetable_null();
2602 shadow_free_p2m_page(d,
2603 pagetable_get_page(d->arch.paging.shadow.unpaged_pagetable));
2604 d->arch.paging.shadow.unpaged_pagetable = pagetable_null();
2607 /* We leave the "permanent" shadow modes enabled, but clear the
2608 * log-dirty mode bit. We don't want any more mark_dirty()
2609 * calls now that we've torn down the bitmap */
2610 d->arch.paging.mode &= ~PG_log_dirty;
2612 shadow_unlock(d);
2615 void shadow_final_teardown(struct domain *d)
2616 /* Called by arch_domain_destroy(), when it's safe to pull down the p2m map. */
2618 SHADOW_PRINTK("dom %u final teardown starts."
2619 " Shadow pages total = %u, free = %u, p2m=%u\n",
2620 d->domain_id,
2621 d->arch.paging.shadow.total_pages,
2622 d->arch.paging.shadow.free_pages,
2623 d->arch.paging.shadow.p2m_pages);
2625 /* Double-check that the domain didn't have any shadow memory.
2626 * It is possible for a domain that never got domain_kill()ed
2627 * to get here with its shadow allocation intact. */
2628 if ( d->arch.paging.shadow.total_pages != 0 )
2629 shadow_teardown(d);
2631 /* It is now safe to pull down the p2m map. */
2632 p2m_teardown(d);
2634 SHADOW_PRINTK("dom %u final teardown done."
2635 " Shadow pages total = %u, free = %u, p2m=%u\n",
2636 d->domain_id,
2637 d->arch.paging.shadow.total_pages,
2638 d->arch.paging.shadow.free_pages,
2639 d->arch.paging.shadow.p2m_pages);
2642 static int shadow_one_bit_enable(struct domain *d, u32 mode)
2643 /* Turn on a single shadow mode feature */
2645 ASSERT(shadow_locked_by_me(d));
2647 /* Sanity check the call */
2648 if ( d == current->domain || (d->arch.paging.mode & mode) == mode )
2650 return -EINVAL;
2653 mode |= PG_SH_enable;
2655 if ( d->arch.paging.mode == 0 )
2657 /* Init the shadow memory allocation and the hash table */
2658 if ( sh_set_allocation(d, 1, NULL) != 0
2659 || shadow_hash_alloc(d) != 0 )
2661 sh_set_allocation(d, 0, NULL);
2662 return -ENOMEM;
2666 /* Update the bits */
2667 sh_new_mode(d, d->arch.paging.mode | mode);
2669 return 0;
2672 static int shadow_one_bit_disable(struct domain *d, u32 mode)
2673 /* Turn off a single shadow mode feature */
2675 struct vcpu *v;
2676 ASSERT(shadow_locked_by_me(d));
2678 /* Sanity check the call */
2679 if ( d == current->domain || !((d->arch.paging.mode & mode) == mode) )
2681 return -EINVAL;
2684 /* Update the bits */
2685 sh_new_mode(d, d->arch.paging.mode & ~mode);
2686 if ( d->arch.paging.mode == 0 )
2688 /* Get this domain off shadows */
2689 SHADOW_PRINTK("un-shadowing of domain %u starts."
2690 " Shadow pages total = %u, free = %u, p2m=%u\n",
2691 d->domain_id,
2692 d->arch.paging.shadow.total_pages,
2693 d->arch.paging.shadow.free_pages,
2694 d->arch.paging.shadow.p2m_pages);
2695 for_each_vcpu(d, v)
2697 if ( v->arch.paging.mode )
2698 v->arch.paging.mode->shadow.detach_old_tables(v);
2699 #if CONFIG_PAGING_LEVELS == 4
2700 if ( !(v->arch.flags & TF_kernel_mode) )
2701 make_cr3(v, pagetable_get_pfn(v->arch.guest_table_user));
2702 else
2703 #endif
2704 make_cr3(v, pagetable_get_pfn(v->arch.guest_table));
2708 /* Pull down the memory allocation */
2709 if ( sh_set_allocation(d, 0, NULL) != 0 )
2711 // XXX - How can this occur?
2712 // Seems like a bug to return an error now that we've
2713 // disabled the relevant shadow mode.
2714 //
2715 return -ENOMEM;
2717 shadow_hash_teardown(d);
2718 SHADOW_PRINTK("un-shadowing of domain %u done."
2719 " Shadow pages total = %u, free = %u, p2m=%u\n",
2720 d->domain_id,
2721 d->arch.paging.shadow.total_pages,
2722 d->arch.paging.shadow.free_pages,
2723 d->arch.paging.shadow.p2m_pages);
2726 return 0;
2729 /* Enable/disable ops for the "test" and "log-dirty" modes */
2730 static int shadow_test_enable(struct domain *d)
2732 int ret;
2734 domain_pause(d);
2735 shadow_lock(d);
2736 ret = shadow_one_bit_enable(d, PG_SH_enable);
2737 shadow_unlock(d);
2738 domain_unpause(d);
2740 return ret;
2743 static int shadow_test_disable(struct domain *d)
2745 int ret;
2747 domain_pause(d);
2748 shadow_lock(d);
2749 ret = shadow_one_bit_disable(d, PG_SH_enable);
2750 shadow_unlock(d);
2751 domain_unpause(d);
2753 return ret;
2756 /**************************************************************************/
2757 /* P2M map manipulations */
2759 /* Shadow-specific code which should be called when a P2M table entry is updated
2760 * with new content. It is responsible for updating the entry, as well as for
2761 * any other shadow processing the change requires.
2762 */
2763 void
2764 shadow_write_p2m_entry(struct vcpu *v, unsigned long gfn,
2765 l1_pgentry_t *p, mfn_t table_mfn,
2766 l1_pgentry_t new, unsigned int level)
2768 struct domain *d = v->domain;
2770 shadow_lock(d);
2772 /* If we're removing an MFN from the p2m, remove it from the shadows too */
2773 if ( level == 1 )
2775 mfn_t mfn = _mfn(l1e_get_pfn(*p));
2776 p2m_type_t p2mt = p2m_flags_to_type(l1e_get_flags(*p));
2777 if ( p2m_is_valid(p2mt) && mfn_valid(mfn) )
2779 sh_remove_all_shadows_and_parents(v, mfn);
2780 if ( sh_remove_all_mappings(v, mfn) )
2781 flush_tlb_mask(d->domain_dirty_cpumask);
2785 /* Update the entry with new content */
2786 safe_write_pte(p, new);
2788 /* install P2M in monitors for PAE Xen */
2789 #if CONFIG_PAGING_LEVELS == 3
2790 if ( level == 3 )
2791 /* We have written to the p2m l3: need to sync the per-vcpu
2792 * copies of it in the monitor tables */
2793 p2m_install_entry_in_monitors(d, (l3_pgentry_t *)p);
2794 #endif
2796 #if (SHADOW_OPTIMIZATIONS & SHOPT_FAST_FAULT_PATH)
2797 /* If we're doing FAST_FAULT_PATH, then shadow mode may have
2798 cached the fact that this is an mmio region in the shadow
2799 page tables. Blow the tables away to remove the cache.
2800 This is pretty heavy handed, but this is a rare operation
2801 (it might happen a dozen times during boot and then never
2802 again), so it doesn't matter too much. */
2803 if ( d->arch.paging.shadow.has_fast_mmio_entries )
2805 shadow_blow_tables(d);
2806 d->arch.paging.shadow.has_fast_mmio_entries = 0;
2808 #endif
2810 shadow_unlock(d);
2813 /**************************************************************************/
2814 /* Log-dirty mode support */
2816 /* Shadow-specific code which is called in paging_log_dirty_enable().
2817 * Returns 0 if no problem is found.
2818 */
2819 int shadow_enable_log_dirty(struct domain *d)
2821 int ret;
2823 /* shadow lock is required here */
2824 shadow_lock(d);
2825 if ( shadow_mode_enabled(d) )
2827 /* This domain already has some shadows: need to clear them out
2828 * of the way to make sure that all references to guest memory are
2829 * properly write-protected */
2830 shadow_blow_tables(d);
2833 #if (SHADOW_OPTIMIZATIONS & SHOPT_LINUX_L3_TOPLEVEL)
2834 /* 32bit PV guests on 64bit xen behave like older 64bit linux: they
2835 * change an l4e instead of cr3 to switch tables. Give them the
2836 * same optimization */
2837 if ( is_pv_32on64_domain(d) )
2838 d->arch.paging.shadow.opt_flags = SHOPT_LINUX_L3_TOPLEVEL;
2839 #endif
2841 ret = shadow_one_bit_enable(d, PG_log_dirty);
2842 shadow_unlock(d);
2844 return ret;
2847 /* Shadow-specific code which is called in paging_log_dirty_disable() */
2848 int shadow_disable_log_dirty(struct domain *d)
2850 int ret;
2852 /* shadow lock is required here */
2853 shadow_lock(d);
2854 ret = shadow_one_bit_disable(d, PG_log_dirty);
2855 shadow_unlock(d);
2857 return ret;
2860 /* This function is called when we CLEAN log dirty bitmap. See
2861 * paging_log_dirty_op() for details.
2862 */
2863 void shadow_clean_dirty_bitmap(struct domain *d)
2865 shadow_lock(d);
2866 /* Need to revoke write access to the domain's pages again.
2867 * In future, we'll have a less heavy-handed approach to this,
2868 * but for now, we just unshadow everything except Xen. */
2869 shadow_blow_tables(d);
2870 shadow_unlock(d);
2872 /**************************************************************************/
2873 /* Shadow-control XEN_DOMCTL dispatcher */
2875 int shadow_domctl(struct domain *d,
2876 xen_domctl_shadow_op_t *sc,
2877 XEN_GUEST_HANDLE(void) u_domctl)
2879 int rc, preempted = 0;
2881 switch ( sc->op )
2883 case XEN_DOMCTL_SHADOW_OP_OFF:
2884 if ( d->arch.paging.mode == PG_SH_enable )
2885 if ( (rc = shadow_test_disable(d)) != 0 )
2886 return rc;
2887 return 0;
2889 case XEN_DOMCTL_SHADOW_OP_ENABLE_TEST:
2890 return shadow_test_enable(d);
2892 case XEN_DOMCTL_SHADOW_OP_ENABLE_TRANSLATE:
2893 return shadow_enable(d, PG_refcounts|PG_translate);
2895 case XEN_DOMCTL_SHADOW_OP_ENABLE:
2896 return shadow_enable(d, sc->mode << PG_mode_shift);
2898 case XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION:
2899 sc->mb = shadow_get_allocation(d);
2900 return 0;
2902 case XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION:
2903 shadow_lock(d);
2904 if ( sc->mb == 0 && shadow_mode_enabled(d) )
2906 /* Can't set the allocation to zero unless the domain stops using
2907 * shadow pagetables first */
2908 SHADOW_ERROR("Can't set shadow allocation to zero, domain %u"
2909 " is still using shadows.\n", d->domain_id);
2910 shadow_unlock(d);
2911 return -EINVAL;
2913 rc = sh_set_allocation(d, sc->mb << (20 - PAGE_SHIFT), &preempted);
2914 shadow_unlock(d);
2915 if ( preempted )
2916 /* Not finished. Set up to re-run the call. */
2917 rc = hypercall_create_continuation(
2918 __HYPERVISOR_domctl, "h", u_domctl);
2919 else
2920 /* Finished. Return the new allocation */
2921 sc->mb = shadow_get_allocation(d);
2922 return rc;
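/* [Added note: sc->mb << (20 - PAGE_SHIFT) converts megabytes to 4kB pages,
 * so e.g. a request of sc->mb == 4 becomes 4 << 8 == 1024 pages passed to
 * sh_set_allocation().] */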
2924 default:
2925 SHADOW_ERROR("Bad shadow op %u\n", sc->op);
2926 return -EINVAL;
2931 /**************************************************************************/
2932 /* Auditing shadow tables */
2934 #if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_FULL
2936 void shadow_audit_tables(struct vcpu *v)
2938 /* Dispatch table for getting per-type functions */
2939 static hash_callback_t callbacks[SH_type_unused] = {
2940 NULL, /* none */
2941 #if CONFIG_PAGING_LEVELS == 2
2942 SHADOW_INTERNAL_NAME(sh_audit_l1_table,2,2), /* l1_32 */
2943 SHADOW_INTERNAL_NAME(sh_audit_fl1_table,2,2), /* fl1_32 */
2944 SHADOW_INTERNAL_NAME(sh_audit_l2_table,2,2), /* l2_32 */
2945 #else
2946 SHADOW_INTERNAL_NAME(sh_audit_l1_table,3,2), /* l1_32 */
2947 SHADOW_INTERNAL_NAME(sh_audit_fl1_table,3,2), /* fl1_32 */
2948 SHADOW_INTERNAL_NAME(sh_audit_l2_table,3,2), /* l2_32 */
2949 SHADOW_INTERNAL_NAME(sh_audit_l1_table,3,3), /* l1_pae */
2950 SHADOW_INTERNAL_NAME(sh_audit_fl1_table,3,3), /* fl1_pae */
2951 SHADOW_INTERNAL_NAME(sh_audit_l2_table,3,3), /* l2_pae */
2952 SHADOW_INTERNAL_NAME(sh_audit_l2_table,3,3), /* l2h_pae */
2953 #if CONFIG_PAGING_LEVELS >= 4
2954 SHADOW_INTERNAL_NAME(sh_audit_l1_table,4,4), /* l1_64 */
2955 SHADOW_INTERNAL_NAME(sh_audit_fl1_table,4,4), /* fl1_64 */
2956 SHADOW_INTERNAL_NAME(sh_audit_l2_table,4,4), /* l2_64 */
2957 SHADOW_INTERNAL_NAME(sh_audit_l2_table,4,4), /* l2h_64 */
2958 SHADOW_INTERNAL_NAME(sh_audit_l3_table,4,4), /* l3_64 */
2959 SHADOW_INTERNAL_NAME(sh_audit_l4_table,4,4), /* l4_64 */
2960 #endif /* CONFIG_PAGING_LEVELS >= 4 */
2961 #endif /* CONFIG_PAGING_LEVELS > 2 */
2962 NULL /* All the rest */
2963 };
2964 unsigned int mask;
2966 if ( !(SHADOW_AUDIT_ENABLE) )
2967 return;
2969 if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_FULL )
2970 mask = ~1; /* Audit every table in the system */
2971 else
2973 /* Audit only the current mode's tables */
2974 switch ( v->arch.paging.mode->guest_levels )
2976 case 2: mask = (SHF_L1_32|SHF_FL1_32|SHF_L2_32); break;
2977 case 3: mask = (SHF_L1_PAE|SHF_FL1_PAE|SHF_L2_PAE
2978 |SHF_L2H_PAE); break;
2979 case 4: mask = (SHF_L1_64|SHF_FL1_64|SHF_L2_64
2980 |SHF_L3_64|SHF_L4_64); break;
2981 default: BUG();
2985 hash_foreach(v, ~1, callbacks, _mfn(INVALID_MFN));
2988 #endif /* Shadow audit */
2990 /*
2991 * Local variables:
2992 * mode: C
2993 * c-set-style: "BSD"
2994 * c-basic-offset: 4
2995 * indent-tabs-mode: nil
2996 * End:
2997 */