ia64/xen-unstable

view xen/arch/x86/mm/shadow/common.c @ 19738:8dd5c3cae086

x86 hvm: move dirty_vram into struct hvm_domain

Signed-off-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
author Keir Fraser <keir.fraser@citrix.com>
date Fri Jun 05 14:04:03 2009 +0100 (2009-06-05)
parents 822ea2bf0c54
children 2f9e1348aa98
line source
1 /******************************************************************************
2 * arch/x86/mm/shadow/common.c
3 *
4 * Shadow code that does not need to be multiply compiled.
5 * Parts of this code are Copyright (c) 2006 by XenSource Inc.
6 * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
7 * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 */
24 #include <xen/config.h>
25 #include <xen/types.h>
26 #include <xen/mm.h>
27 #include <xen/trace.h>
28 #include <xen/sched.h>
29 #include <xen/perfc.h>
30 #include <xen/irq.h>
31 #include <xen/domain_page.h>
32 #include <xen/guest_access.h>
33 #include <xen/keyhandler.h>
34 #include <asm/event.h>
35 #include <asm/page.h>
36 #include <asm/current.h>
37 #include <asm/flushtlb.h>
38 #include <asm/shadow.h>
39 #include <xen/numa.h>
40 #include "private.h"
42 DEFINE_PER_CPU(uint32_t,trace_shadow_path_flags);
44 /* Set up the shadow-specific parts of a domain struct at start of day.
45 * Called for every domain from arch_domain_create() */
46 void shadow_domain_init(struct domain *d)
47 {
48 int i;
49 shadow_lock_init(d);
50 for ( i = 0; i <= SHADOW_MAX_ORDER; i++ )
51 INIT_PAGE_LIST_HEAD(&d->arch.paging.shadow.freelists[i]);
52 INIT_PAGE_LIST_HEAD(&d->arch.paging.shadow.p2m_freelist);
53 INIT_PAGE_LIST_HEAD(&d->arch.paging.shadow.pinned_shadows);
55 /* Use shadow pagetables for log-dirty support */
56 paging_log_dirty_init(d, shadow_enable_log_dirty,
57 shadow_disable_log_dirty, shadow_clean_dirty_bitmap);
59 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
60 d->arch.paging.shadow.oos_active = 0;
61 #endif
62 }
64 /* Setup the shadow-specfic parts of a vcpu struct. Note: The most important
65 * job is to initialize the update_paging_modes() function pointer, which is
66 * used to initialized the rest of resources. Therefore, it really does not
67 * matter to have v->arch.paging.mode pointing to any mode, as long as it can
68 * be compiled.
69 */
70 void shadow_vcpu_init(struct vcpu *v)
71 {
72 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
73 int i, j;
75 for ( i = 0; i < SHADOW_OOS_PAGES; i++ )
76 {
77 v->arch.paging.shadow.oos[i] = _mfn(INVALID_MFN);
78 v->arch.paging.shadow.oos_snapshot[i] = _mfn(INVALID_MFN);
79 for ( j = 0; j < SHADOW_OOS_FIXUPS; j++ )
80 v->arch.paging.shadow.oos_fixup[i].smfn[j] = _mfn(INVALID_MFN);
81 }
82 #endif
84 v->arch.paging.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode, 3);
85 }
87 #if SHADOW_AUDIT
88 int shadow_audit_enable = 0;
90 static void shadow_audit_key(unsigned char key)
91 {
92 shadow_audit_enable = !shadow_audit_enable;
93 printk("%s shadow_audit_enable=%d\n",
94 __func__, shadow_audit_enable);
95 }
97 static int __init shadow_audit_key_init(void)
98 {
99 register_keyhandler(
100 'O', shadow_audit_key, "toggle shadow audits");
101 return 0;
102 }
103 __initcall(shadow_audit_key_init);
104 #endif /* SHADOW_AUDIT */
106 int _shadow_mode_refcounts(struct domain *d)
107 {
108 return shadow_mode_refcounts(d);
109 }
112 /**************************************************************************/
113 /* x86 emulator support for the shadow code
114 */
116 struct segment_register *hvm_get_seg_reg(
117 enum x86_segment seg, struct sh_emulate_ctxt *sh_ctxt)
118 {
119 struct segment_register *seg_reg = &sh_ctxt->seg_reg[seg];
120 if ( !__test_and_set_bit(seg, &sh_ctxt->valid_seg_regs) )
121 hvm_get_segment_register(current, seg, seg_reg);
122 return seg_reg;
123 }
125 static int hvm_translate_linear_addr(
126 enum x86_segment seg,
127 unsigned long offset,
128 unsigned int bytes,
129 enum hvm_access_type access_type,
130 struct sh_emulate_ctxt *sh_ctxt,
131 unsigned long *paddr)
132 {
133 struct segment_register *reg = hvm_get_seg_reg(seg, sh_ctxt);
134 int okay;
136 okay = hvm_virtual_to_linear_addr(
137 seg, reg, offset, bytes, access_type, sh_ctxt->ctxt.addr_size, paddr);
139 if ( !okay )
140 {
141 hvm_inject_exception(TRAP_gp_fault, 0, 0);
142 return X86EMUL_EXCEPTION;
143 }
145 return 0;
146 }
148 static int
149 hvm_read(enum x86_segment seg,
150 unsigned long offset,
151 void *p_data,
152 unsigned int bytes,
153 enum hvm_access_type access_type,
154 struct sh_emulate_ctxt *sh_ctxt)
155 {
156 unsigned long addr;
157 int rc;
159 rc = hvm_translate_linear_addr(
160 seg, offset, bytes, access_type, sh_ctxt, &addr);
161 if ( rc )
162 return rc;
164 if ( access_type == hvm_access_insn_fetch )
165 rc = hvm_fetch_from_guest_virt(p_data, addr, bytes, 0);
166 else
167 rc = hvm_copy_from_guest_virt(p_data, addr, bytes, 0);
169 switch ( rc )
170 {
171 case HVMCOPY_okay:
172 return X86EMUL_OKAY;
173 case HVMCOPY_bad_gva_to_gfn:
174 return X86EMUL_EXCEPTION;
175 default:
176 break;
177 }
179 return X86EMUL_UNHANDLEABLE;
180 }
182 static int
183 hvm_emulate_read(enum x86_segment seg,
184 unsigned long offset,
185 void *p_data,
186 unsigned int bytes,
187 struct x86_emulate_ctxt *ctxt)
188 {
189 if ( !is_x86_user_segment(seg) )
190 return X86EMUL_UNHANDLEABLE;
191 return hvm_read(seg, offset, p_data, bytes, hvm_access_read,
192 container_of(ctxt, struct sh_emulate_ctxt, ctxt));
193 }
195 static int
196 hvm_emulate_insn_fetch(enum x86_segment seg,
197 unsigned long offset,
198 void *p_data,
199 unsigned int bytes,
200 struct x86_emulate_ctxt *ctxt)
201 {
202 struct sh_emulate_ctxt *sh_ctxt =
203 container_of(ctxt, struct sh_emulate_ctxt, ctxt);
204 unsigned int insn_off = offset - sh_ctxt->insn_buf_eip;
206 ASSERT(seg == x86_seg_cs);
208 /* Fall back if requested bytes are not in the prefetch cache. */
209 if ( unlikely((insn_off + bytes) > sh_ctxt->insn_buf_bytes) )
210 return hvm_read(seg, offset, p_data, bytes,
211 hvm_access_insn_fetch, sh_ctxt);
213 /* Hit the cache. Simple memcpy. */
214 memcpy(p_data, &sh_ctxt->insn_buf[insn_off], bytes);
215 return X86EMUL_OKAY;
216 }
218 static int
219 hvm_emulate_write(enum x86_segment seg,
220 unsigned long offset,
221 void *p_data,
222 unsigned int bytes,
223 struct x86_emulate_ctxt *ctxt)
224 {
225 struct sh_emulate_ctxt *sh_ctxt =
226 container_of(ctxt, struct sh_emulate_ctxt, ctxt);
227 struct vcpu *v = current;
228 unsigned long addr;
229 int rc;
231 if ( !is_x86_user_segment(seg) )
232 return X86EMUL_UNHANDLEABLE;
234 /* How many emulations could we save if we unshadowed on stack writes? */
235 if ( seg == x86_seg_ss )
236 perfc_incr(shadow_fault_emulate_stack);
238 rc = hvm_translate_linear_addr(
239 seg, offset, bytes, hvm_access_write, sh_ctxt, &addr);
240 if ( rc )
241 return rc;
243 return v->arch.paging.mode->shadow.x86_emulate_write(
244 v, addr, p_data, bytes, sh_ctxt);
245 }
247 static int
248 hvm_emulate_cmpxchg(enum x86_segment seg,
249 unsigned long offset,
250 void *p_old,
251 void *p_new,
252 unsigned int bytes,
253 struct x86_emulate_ctxt *ctxt)
254 {
255 struct sh_emulate_ctxt *sh_ctxt =
256 container_of(ctxt, struct sh_emulate_ctxt, ctxt);
257 struct vcpu *v = current;
258 unsigned long addr, old[2], new[2];
259 int rc;
261 if ( !is_x86_user_segment(seg) )
262 return X86EMUL_UNHANDLEABLE;
264 rc = hvm_translate_linear_addr(
265 seg, offset, bytes, hvm_access_write, sh_ctxt, &addr);
266 if ( rc )
267 return rc;
269 old[0] = new[0] = 0;
270 memcpy(old, p_old, bytes);
271 memcpy(new, p_new, bytes);
273 if ( bytes <= sizeof(long) )
274 return v->arch.paging.mode->shadow.x86_emulate_cmpxchg(
275 v, addr, old[0], new[0], bytes, sh_ctxt);
277 #ifdef __i386__
278 if ( bytes == 8 )
279 return v->arch.paging.mode->shadow.x86_emulate_cmpxchg8b(
280 v, addr, old[0], old[1], new[0], new[1], sh_ctxt);
281 #endif
283 return X86EMUL_UNHANDLEABLE;
284 }
286 static struct x86_emulate_ops hvm_shadow_emulator_ops = {
287 .read = hvm_emulate_read,
288 .insn_fetch = hvm_emulate_insn_fetch,
289 .write = hvm_emulate_write,
290 .cmpxchg = hvm_emulate_cmpxchg,
291 };
293 static int
294 pv_emulate_read(enum x86_segment seg,
295 unsigned long offset,
296 void *p_data,
297 unsigned int bytes,
298 struct x86_emulate_ctxt *ctxt)
299 {
300 unsigned int rc;
302 if ( !is_x86_user_segment(seg) )
303 return X86EMUL_UNHANDLEABLE;
305 if ( (rc = copy_from_user(p_data, (void *)offset, bytes)) != 0 )
306 {
307 propagate_page_fault(offset + bytes - rc, 0); /* read fault */
308 return X86EMUL_EXCEPTION;
309 }
311 return X86EMUL_OKAY;
312 }
314 static int
315 pv_emulate_write(enum x86_segment seg,
316 unsigned long offset,
317 void *p_data,
318 unsigned int bytes,
319 struct x86_emulate_ctxt *ctxt)
320 {
321 struct sh_emulate_ctxt *sh_ctxt =
322 container_of(ctxt, struct sh_emulate_ctxt, ctxt);
323 struct vcpu *v = current;
324 if ( !is_x86_user_segment(seg) )
325 return X86EMUL_UNHANDLEABLE;
326 return v->arch.paging.mode->shadow.x86_emulate_write(
327 v, offset, p_data, bytes, sh_ctxt);
328 }
330 static int
331 pv_emulate_cmpxchg(enum x86_segment seg,
332 unsigned long offset,
333 void *p_old,
334 void *p_new,
335 unsigned int bytes,
336 struct x86_emulate_ctxt *ctxt)
337 {
338 struct sh_emulate_ctxt *sh_ctxt =
339 container_of(ctxt, struct sh_emulate_ctxt, ctxt);
340 unsigned long old[2], new[2];
341 struct vcpu *v = current;
343 if ( !is_x86_user_segment(seg) )
344 return X86EMUL_UNHANDLEABLE;
346 old[0] = new[0] = 0;
347 memcpy(old, p_old, bytes);
348 memcpy(new, p_new, bytes);
350 if ( bytes <= sizeof(long) )
351 return v->arch.paging.mode->shadow.x86_emulate_cmpxchg(
352 v, offset, old[0], new[0], bytes, sh_ctxt);
354 #ifdef __i386__
355 if ( bytes == 8 )
356 return v->arch.paging.mode->shadow.x86_emulate_cmpxchg8b(
357 v, offset, old[0], old[1], new[0], new[1], sh_ctxt);
358 #endif
360 return X86EMUL_UNHANDLEABLE;
361 }
363 static struct x86_emulate_ops pv_shadow_emulator_ops = {
364 .read = pv_emulate_read,
365 .insn_fetch = pv_emulate_read,
366 .write = pv_emulate_write,
367 .cmpxchg = pv_emulate_cmpxchg,
368 };
370 struct x86_emulate_ops *shadow_init_emulation(
371 struct sh_emulate_ctxt *sh_ctxt, struct cpu_user_regs *regs)
372 {
373 struct segment_register *creg, *sreg;
374 struct vcpu *v = current;
375 unsigned long addr;
377 sh_ctxt->ctxt.regs = regs;
378 sh_ctxt->ctxt.force_writeback = 0;
380 if ( !is_hvm_vcpu(v) )
381 {
382 sh_ctxt->ctxt.addr_size = sh_ctxt->ctxt.sp_size = BITS_PER_LONG;
383 return &pv_shadow_emulator_ops;
384 }
386 /* Segment cache initialisation. Primed with CS. */
387 sh_ctxt->valid_seg_regs = 0;
388 creg = hvm_get_seg_reg(x86_seg_cs, sh_ctxt);
390 /* Work out the emulation mode. */
391 if ( hvm_long_mode_enabled(v) && creg->attr.fields.l )
392 {
393 sh_ctxt->ctxt.addr_size = sh_ctxt->ctxt.sp_size = 64;
394 }
395 else
396 {
397 sreg = hvm_get_seg_reg(x86_seg_ss, sh_ctxt);
398 sh_ctxt->ctxt.addr_size = creg->attr.fields.db ? 32 : 16;
399 sh_ctxt->ctxt.sp_size = sreg->attr.fields.db ? 32 : 16;
400 }
402 /* Attempt to prefetch whole instruction. */
403 sh_ctxt->insn_buf_eip = regs->eip;
404 sh_ctxt->insn_buf_bytes =
405 (!hvm_translate_linear_addr(
406 x86_seg_cs, regs->eip, sizeof(sh_ctxt->insn_buf),
407 hvm_access_insn_fetch, sh_ctxt, &addr) &&
408 !hvm_fetch_from_guest_virt_nofault(
409 sh_ctxt->insn_buf, addr, sizeof(sh_ctxt->insn_buf), 0))
410 ? sizeof(sh_ctxt->insn_buf) : 0;
412 return &hvm_shadow_emulator_ops;
413 }
415 /* Update an initialized emulation context to prepare for the next
416 * instruction */
417 void shadow_continue_emulation(struct sh_emulate_ctxt *sh_ctxt,
418 struct cpu_user_regs *regs)
419 {
420 struct vcpu *v = current;
421 unsigned long addr, diff;
423 /* We don't refetch the segment bases, because we don't emulate
424 * writes to segment registers */
426 if ( is_hvm_vcpu(v) )
427 {
428 diff = regs->eip - sh_ctxt->insn_buf_eip;
429 if ( diff > sh_ctxt->insn_buf_bytes )
430 {
431 /* Prefetch more bytes. */
432 sh_ctxt->insn_buf_bytes =
433 (!hvm_translate_linear_addr(
434 x86_seg_cs, regs->eip, sizeof(sh_ctxt->insn_buf),
435 hvm_access_insn_fetch, sh_ctxt, &addr) &&
436 !hvm_fetch_from_guest_virt_nofault(
437 sh_ctxt->insn_buf, addr, sizeof(sh_ctxt->insn_buf), 0))
438 ? sizeof(sh_ctxt->insn_buf) : 0;
439 sh_ctxt->insn_buf_eip = regs->eip;
440 }
441 }
442 }
445 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
446 /**************************************************************************/
447 /* Out-of-sync shadows. */
449 /* From time to time, we let a shadowed pagetable page go out of sync
450 * with its shadow: the guest is allowed to write directly to the page,
451 * and those writes are not synchronously reflected in the shadow.
452 * This lets us avoid many emulations if the guest is writing a lot to a
453 * pagetable, but it relaxes a pretty important invariant in the shadow
454 * pagetable design. Therefore, some rules:
455 *
456 * 1. Only L1 pagetables may go out of sync: any page that is shadowed
457 * at at higher level must be synchronously updated. This makes
458 * using linear shadow pagetables much less dangerous.
459 * That means that: (a) unsyncing code needs to check for higher-level
460 * shadows, and (b) promotion code needs to resync.
461 *
462 * 2. All shadow operations on a guest page require the page to be brought
463 * back into sync before proceeding. This must be done under the
464 * shadow lock so that the page is guaranteed to remain synced until
465 * the operation completes.
466 *
467 * Exceptions to this rule: the pagefault and invlpg handlers may
468 * update only one entry on an out-of-sync page without resyncing it.
469 *
470 * 3. Operations on shadows that do not start from a guest page need to
471 * be aware that they may be handling an out-of-sync shadow.
472 *
473 * 4. Operations that do not normally take the shadow lock (fast-path
474 * #PF handler, INVLPG) must fall back to a locking, syncing version
475 * if they see an out-of-sync table.
476 *
477 * 5. Operations corresponding to guest TLB flushes (MOV CR3, INVLPG)
478 * must explicitly resync all relevant pages or update their
479 * shadows.
480 *
481 * Currently out-of-sync pages are listed in a simple open-addressed
482 * hash table with a second chance (must resist temptation to radically
483 * over-engineer hash tables...) The virtual address of the access
484 * which caused us to unsync the page is also kept in the hash table, as
485 * a hint for finding the writable mappings later.
486 *
487 * We keep a hash per vcpu, because we want as much as possible to do
488 * the re-sync on the save vcpu we did the unsync on, so the VA hint
489 * will be valid.
490 */
493 #if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_FULL
494 static void sh_oos_audit(struct domain *d)
495 {
496 int idx, expected_idx, expected_idx_alt;
497 struct page_info *pg;
498 struct vcpu *v;
500 for_each_vcpu(d, v)
501 {
502 for ( idx = 0; idx < SHADOW_OOS_PAGES; idx++ )
503 {
504 mfn_t *oos = v->arch.paging.shadow.oos;
505 if ( !mfn_valid(oos[idx]) )
506 continue;
508 expected_idx = mfn_x(oos[idx]) % SHADOW_OOS_PAGES;
509 expected_idx_alt = ((expected_idx + 1) % SHADOW_OOS_PAGES);
510 if ( idx != expected_idx && idx != expected_idx_alt )
511 {
512 printk("%s: idx %d contains gmfn %lx, expected at %d or %d.\n",
513 __func__, idx, mfn_x(oos[idx]),
514 expected_idx, expected_idx_alt);
515 BUG();
516 }
517 pg = mfn_to_page(oos[idx]);
518 if ( !(pg->count_info & PGC_page_table) )
519 {
520 printk("%s: idx %x gmfn %lx not a pt (count %"PRIx32")\n",
521 __func__, idx, mfn_x(oos[idx]), pg->count_info);
522 BUG();
523 }
524 if ( !(pg->shadow_flags & SHF_out_of_sync) )
525 {
526 printk("%s: idx %x gmfn %lx not marked oos (flags %lx)\n",
527 __func__, idx, mfn_x(oos[idx]), pg->shadow_flags);
528 BUG();
529 }
530 if ( (pg->shadow_flags & SHF_page_type_mask & ~SHF_L1_ANY) )
531 {
532 printk("%s: idx %x gmfn %lx shadowed as non-l1 (flags %lx)\n",
533 __func__, idx, mfn_x(oos[idx]), pg->shadow_flags);
534 BUG();
535 }
536 }
537 }
538 }
539 #endif
541 #if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES
542 void oos_audit_hash_is_present(struct domain *d, mfn_t gmfn)
543 {
544 int idx;
545 struct vcpu *v;
546 mfn_t *oos;
548 ASSERT(mfn_is_out_of_sync(gmfn));
550 for_each_vcpu(d, v)
551 {
552 oos = v->arch.paging.shadow.oos;
553 idx = mfn_x(gmfn) % SHADOW_OOS_PAGES;
554 if ( mfn_x(oos[idx]) != mfn_x(gmfn) )
555 idx = (idx + 1) % SHADOW_OOS_PAGES;
557 if ( mfn_x(oos[idx]) == mfn_x(gmfn) )
558 return;
559 }
561 SHADOW_ERROR("gmfn %lx marked OOS but not in hash table\n", mfn_x(gmfn));
562 BUG();
563 }
564 #endif
566 /* Update the shadow, but keep the page out of sync. */
567 static inline void _sh_resync_l1(struct vcpu *v, mfn_t gmfn, mfn_t snpmfn)
568 {
569 struct page_info *pg = mfn_to_page(gmfn);
571 ASSERT(mfn_valid(gmfn));
572 ASSERT(page_is_out_of_sync(pg));
574 /* Call out to the appropriate per-mode resyncing function */
575 if ( pg->shadow_flags & SHF_L1_32 )
576 SHADOW_INTERNAL_NAME(sh_resync_l1, 2)(v, gmfn, snpmfn);
577 else if ( pg->shadow_flags & SHF_L1_PAE )
578 SHADOW_INTERNAL_NAME(sh_resync_l1, 3)(v, gmfn, snpmfn);
579 #if CONFIG_PAGING_LEVELS >= 4
580 else if ( pg->shadow_flags & SHF_L1_64 )
581 SHADOW_INTERNAL_NAME(sh_resync_l1, 4)(v, gmfn, snpmfn);
582 #endif
583 }
586 /*
587 * Fixup arrays: We limit the maximum number of writable mappings to
588 * SHADOW_OOS_FIXUPS and store enough information to remove them
589 * quickly on resync.
590 */
592 static inline int oos_fixup_flush_gmfn(struct vcpu *v, mfn_t gmfn,
593 struct oos_fixup *fixup)
594 {
595 int i;
596 for ( i = 0; i < SHADOW_OOS_FIXUPS; i++ )
597 {
598 if ( mfn_x(fixup->smfn[i]) != INVALID_MFN )
599 {
600 sh_remove_write_access_from_sl1p(v, gmfn,
601 fixup->smfn[i],
602 fixup->off[i]);
603 fixup->smfn[i] = _mfn(INVALID_MFN);
604 }
605 }
607 /* Always flush the TLBs. See comment on oos_fixup_add(). */
608 return 1;
609 }
611 void oos_fixup_add(struct vcpu *v, mfn_t gmfn,
612 mfn_t smfn, unsigned long off)
613 {
614 int idx, next;
615 mfn_t *oos;
616 struct oos_fixup *oos_fixup;
617 struct domain *d = v->domain;
619 perfc_incr(shadow_oos_fixup_add);
621 for_each_vcpu(d, v)
622 {
623 oos = v->arch.paging.shadow.oos;
624 oos_fixup = v->arch.paging.shadow.oos_fixup;
625 idx = mfn_x(gmfn) % SHADOW_OOS_PAGES;
626 if ( mfn_x(oos[idx]) != mfn_x(gmfn) )
627 idx = (idx + 1) % SHADOW_OOS_PAGES;
628 if ( mfn_x(oos[idx]) == mfn_x(gmfn) )
629 {
630 int i;
631 for ( i = 0; i < SHADOW_OOS_FIXUPS; i++ )
632 {
633 if ( mfn_valid(oos_fixup[idx].smfn[i])
634 && (mfn_x(oos_fixup[idx].smfn[i]) == mfn_x(smfn))
635 && (oos_fixup[idx].off[i] == off) )
636 return;
637 }
639 next = oos_fixup[idx].next;
641 if ( mfn_x(oos_fixup[idx].smfn[next]) != INVALID_MFN )
642 {
643 TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_OOS_FIXUP_EVICT);
645 /* Reuse this slot and remove current writable mapping. */
646 sh_remove_write_access_from_sl1p(v, gmfn,
647 oos_fixup[idx].smfn[next],
648 oos_fixup[idx].off[next]);
649 perfc_incr(shadow_oos_fixup_evict);
650 /* We should flush the TLBs now, because we removed a
651 writable mapping, but since the shadow is already
652 OOS we have no problem if another vcpu write to
653 this page table. We just have to be very careful to
654 *always* flush the tlbs on resync. */
655 }
657 oos_fixup[idx].smfn[next] = smfn;
658 oos_fixup[idx].off[next] = off;
659 oos_fixup[idx].next = (next + 1) % SHADOW_OOS_FIXUPS;
661 TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_OOS_FIXUP_ADD);
662 return;
663 }
664 }
666 SHADOW_ERROR("gmfn %lx was OOS but not in hash table\n", mfn_x(gmfn));
667 BUG();
668 }
670 static int oos_remove_write_access(struct vcpu *v, mfn_t gmfn,
671 struct oos_fixup *fixup)
672 {
673 int ftlb = 0;
675 ftlb |= oos_fixup_flush_gmfn(v, gmfn, fixup);
677 switch ( sh_remove_write_access(v, gmfn, 0, 0) )
678 {
679 default:
680 case 0:
681 break;
683 case 1:
684 ftlb |= 1;
685 break;
687 case -1:
688 /* An unfindable writeable typecount has appeared, probably via a
689 * grant table entry: can't shoot the mapping, so try to unshadow
690 * the page. If that doesn't work either, the guest is granting
691 * his pagetables and must be killed after all.
692 * This will flush the tlb, so we can return with no worries. */
693 sh_remove_shadows(v, gmfn, 0 /* Be thorough */, 1 /* Must succeed */);
694 return 1;
695 }
697 if ( ftlb )
698 flush_tlb_mask(&v->domain->domain_dirty_cpumask);
700 return 0;
701 }
704 static inline void trace_resync(int event, mfn_t gmfn)
705 {
706 if ( tb_init_done )
707 {
708 /* Convert gmfn to gfn */
709 unsigned long gfn = mfn_to_gfn(current->domain, gmfn);
710 __trace_var(event, 0/*!tsc*/, sizeof(gfn), (unsigned char*)&gfn);
711 }
712 }
714 /* Pull all the entries on an out-of-sync page back into sync. */
715 static void _sh_resync(struct vcpu *v, mfn_t gmfn,
716 struct oos_fixup *fixup, mfn_t snp)
717 {
718 struct page_info *pg = mfn_to_page(gmfn);
720 ASSERT(shadow_locked_by_me(v->domain));
721 ASSERT(mfn_is_out_of_sync(gmfn));
722 /* Guest page must be shadowed *only* as L1 when out of sync. */
723 ASSERT(!(mfn_to_page(gmfn)->shadow_flags & SHF_page_type_mask
724 & ~SHF_L1_ANY));
725 ASSERT(!sh_page_has_multiple_shadows(mfn_to_page(gmfn)));
727 SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx\n",
728 v->domain->domain_id, v->vcpu_id, mfn_x(gmfn));
730 /* Need to pull write access so the page *stays* in sync. */
731 if ( oos_remove_write_access(v, gmfn, fixup) )
732 {
733 /* Page has been unshadowed. */
734 return;
735 }
737 /* No more writable mappings of this page, please */
738 pg->shadow_flags &= ~SHF_oos_may_write;
740 /* Update the shadows with current guest entries. */
741 _sh_resync_l1(v, gmfn, snp);
743 /* Now we know all the entries are synced, and will stay that way */
744 pg->shadow_flags &= ~SHF_out_of_sync;
745 perfc_incr(shadow_resync);
746 trace_resync(TRC_SHADOW_RESYNC_FULL, gmfn);
747 }
750 /* Add an MFN to the list of out-of-sync guest pagetables */
751 static void oos_hash_add(struct vcpu *v, mfn_t gmfn)
752 {
753 int i, idx, oidx, swap = 0;
754 void *gptr, *gsnpptr;
755 mfn_t *oos = v->arch.paging.shadow.oos;
756 mfn_t *oos_snapshot = v->arch.paging.shadow.oos_snapshot;
757 struct oos_fixup *oos_fixup = v->arch.paging.shadow.oos_fixup;
758 struct oos_fixup fixup = { .next = 0 };
760 for (i = 0; i < SHADOW_OOS_FIXUPS; i++ )
761 fixup.smfn[i] = _mfn(INVALID_MFN);
763 idx = mfn_x(gmfn) % SHADOW_OOS_PAGES;
764 oidx = idx;
766 if ( mfn_valid(oos[idx])
767 && (mfn_x(oos[idx]) % SHADOW_OOS_PAGES) == idx )
768 {
769 /* Punt the current occupant into the next slot */
770 SWAP(oos[idx], gmfn);
771 SWAP(oos_fixup[idx], fixup);
772 swap = 1;
773 idx = (idx + 1) % SHADOW_OOS_PAGES;
774 }
775 if ( mfn_valid(oos[idx]) )
776 {
777 /* Crush the current occupant. */
778 _sh_resync(v, oos[idx], &oos_fixup[idx], oos_snapshot[idx]);
779 perfc_incr(shadow_unsync_evict);
780 }
781 oos[idx] = gmfn;
782 oos_fixup[idx] = fixup;
784 if ( swap )
785 SWAP(oos_snapshot[idx], oos_snapshot[oidx]);
787 gptr = sh_map_domain_page(oos[oidx]);
788 gsnpptr = sh_map_domain_page(oos_snapshot[oidx]);
789 memcpy(gsnpptr, gptr, PAGE_SIZE);
790 sh_unmap_domain_page(gptr);
791 sh_unmap_domain_page(gsnpptr);
792 }
794 /* Remove an MFN from the list of out-of-sync guest pagetables */
795 static void oos_hash_remove(struct vcpu *v, mfn_t gmfn)
796 {
797 int idx;
798 mfn_t *oos;
799 struct domain *d = v->domain;
801 SHADOW_PRINTK("D%dV%d gmfn %lx\n",
802 v->domain->domain_id, v->vcpu_id, mfn_x(gmfn));
804 for_each_vcpu(d, v)
805 {
806 oos = v->arch.paging.shadow.oos;
807 idx = mfn_x(gmfn) % SHADOW_OOS_PAGES;
808 if ( mfn_x(oos[idx]) != mfn_x(gmfn) )
809 idx = (idx + 1) % SHADOW_OOS_PAGES;
810 if ( mfn_x(oos[idx]) == mfn_x(gmfn) )
811 {
812 oos[idx] = _mfn(INVALID_MFN);
813 return;
814 }
815 }
817 SHADOW_ERROR("gmfn %lx was OOS but not in hash table\n", mfn_x(gmfn));
818 BUG();
819 }
821 mfn_t oos_snapshot_lookup(struct vcpu *v, mfn_t gmfn)
822 {
823 int idx;
824 mfn_t *oos;
825 mfn_t *oos_snapshot;
826 struct domain *d = v->domain;
828 for_each_vcpu(d, v)
829 {
830 oos = v->arch.paging.shadow.oos;
831 oos_snapshot = v->arch.paging.shadow.oos_snapshot;
832 idx = mfn_x(gmfn) % SHADOW_OOS_PAGES;
833 if ( mfn_x(oos[idx]) != mfn_x(gmfn) )
834 idx = (idx + 1) % SHADOW_OOS_PAGES;
835 if ( mfn_x(oos[idx]) == mfn_x(gmfn) )
836 {
837 return oos_snapshot[idx];
838 }
839 }
841 SHADOW_ERROR("gmfn %lx was OOS but not in hash table\n", mfn_x(gmfn));
842 BUG();
843 return _mfn(INVALID_MFN);
844 }
846 /* Pull a single guest page back into sync */
847 void sh_resync(struct vcpu *v, mfn_t gmfn)
848 {
849 int idx;
850 mfn_t *oos;
851 mfn_t *oos_snapshot;
852 struct oos_fixup *oos_fixup;
853 struct domain *d = v->domain;
855 for_each_vcpu(d, v)
856 {
857 oos = v->arch.paging.shadow.oos;
858 oos_fixup = v->arch.paging.shadow.oos_fixup;
859 oos_snapshot = v->arch.paging.shadow.oos_snapshot;
860 idx = mfn_x(gmfn) % SHADOW_OOS_PAGES;
861 if ( mfn_x(oos[idx]) != mfn_x(gmfn) )
862 idx = (idx + 1) % SHADOW_OOS_PAGES;
864 if ( mfn_x(oos[idx]) == mfn_x(gmfn) )
865 {
866 _sh_resync(v, gmfn, &oos_fixup[idx], oos_snapshot[idx]);
867 oos[idx] = _mfn(INVALID_MFN);
868 return;
869 }
870 }
872 SHADOW_ERROR("gmfn %lx was OOS but not in hash table\n", mfn_x(gmfn));
873 BUG();
874 }
876 /* Figure out whether it's definitely safe not to sync this l1 table,
877 * by making a call out to the mode in which that shadow was made. */
878 static int sh_skip_sync(struct vcpu *v, mfn_t gl1mfn)
879 {
880 struct page_info *pg = mfn_to_page(gl1mfn);
881 if ( pg->shadow_flags & SHF_L1_32 )
882 return SHADOW_INTERNAL_NAME(sh_safe_not_to_sync, 2)(v, gl1mfn);
883 else if ( pg->shadow_flags & SHF_L1_PAE )
884 return SHADOW_INTERNAL_NAME(sh_safe_not_to_sync, 3)(v, gl1mfn);
885 #if CONFIG_PAGING_LEVELS >= 4
886 else if ( pg->shadow_flags & SHF_L1_64 )
887 return SHADOW_INTERNAL_NAME(sh_safe_not_to_sync, 4)(v, gl1mfn);
888 #endif
889 SHADOW_ERROR("gmfn 0x%lx was OOS but not shadowed as an l1.\n",
890 mfn_x(gl1mfn));
891 BUG();
892 return 0; /* BUG() is no longer __attribute__((noreturn)). */
893 }
896 /* Pull all out-of-sync pages back into sync. Pages brought out of sync
897 * on other vcpus are allowed to remain out of sync, but their contents
898 * will be made safe (TLB flush semantics); pages unsynced by this vcpu
899 * are brought back into sync and write-protected. If skip != 0, we try
900 * to avoid resyncing at all if we think we can get away with it. */
901 void sh_resync_all(struct vcpu *v, int skip, int this, int others, int do_locking)
902 {
903 int idx;
904 struct vcpu *other;
905 mfn_t *oos = v->arch.paging.shadow.oos;
906 mfn_t *oos_snapshot = v->arch.paging.shadow.oos_snapshot;
907 struct oos_fixup *oos_fixup = v->arch.paging.shadow.oos_fixup;
909 SHADOW_PRINTK("d=%d, v=%d\n", v->domain->domain_id, v->vcpu_id);
911 ASSERT(do_locking || shadow_locked_by_me(v->domain));
913 if ( !this )
914 goto resync_others;
916 if ( do_locking )
917 shadow_lock(v->domain);
919 /* First: resync all of this vcpu's oos pages */
920 for ( idx = 0; idx < SHADOW_OOS_PAGES; idx++ )
921 if ( mfn_valid(oos[idx]) )
922 {
923 /* Write-protect and sync contents */
924 _sh_resync(v, oos[idx], &oos_fixup[idx], oos_snapshot[idx]);
925 oos[idx] = _mfn(INVALID_MFN);
926 }
928 if ( do_locking )
929 shadow_unlock(v->domain);
931 resync_others:
932 if ( !others )
933 return;
935 /* Second: make all *other* vcpus' oos pages safe. */
936 for_each_vcpu(v->domain, other)
937 {
938 if ( v == other )
939 continue;
941 if ( do_locking )
942 shadow_lock(v->domain);
944 oos = other->arch.paging.shadow.oos;
945 oos_fixup = other->arch.paging.shadow.oos_fixup;
946 oos_snapshot = other->arch.paging.shadow.oos_snapshot;
948 for ( idx = 0; idx < SHADOW_OOS_PAGES; idx++ )
949 {
950 if ( !mfn_valid(oos[idx]) )
951 continue;
953 if ( skip )
954 {
955 /* Update the shadows and leave the page OOS. */
956 if ( sh_skip_sync(v, oos[idx]) )
957 continue;
958 trace_resync(TRC_SHADOW_RESYNC_ONLY, oos[idx]);
959 _sh_resync_l1(other, oos[idx], oos_snapshot[idx]);
960 }
961 else
962 {
963 /* Write-protect and sync contents */
964 _sh_resync(other, oos[idx], &oos_fixup[idx], oos_snapshot[idx]);
965 oos[idx] = _mfn(INVALID_MFN);
966 }
967 }
969 if ( do_locking )
970 shadow_unlock(v->domain);
971 }
972 }
974 /* Allow a shadowed page to go out of sync. Unsyncs are traced in
975 * multi.c:sh_page_fault() */
976 int sh_unsync(struct vcpu *v, mfn_t gmfn)
977 {
978 struct page_info *pg;
980 ASSERT(shadow_locked_by_me(v->domain));
982 SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx\n",
983 v->domain->domain_id, v->vcpu_id, mfn_x(gmfn));
985 pg = mfn_to_page(gmfn);
987 /* Guest page must be shadowed *only* as L1 and *only* once when out
988 * of sync. Also, get out now if it's already out of sync.
989 * Also, can't safely unsync if some vcpus have paging disabled.*/
990 if ( pg->shadow_flags &
991 ((SHF_page_type_mask & ~SHF_L1_ANY) | SHF_out_of_sync)
992 || sh_page_has_multiple_shadows(pg)
993 || !is_hvm_domain(v->domain)
994 || !v->domain->arch.paging.shadow.oos_active )
995 return 0;
997 pg->shadow_flags |= SHF_out_of_sync|SHF_oos_may_write;
998 oos_hash_add(v, gmfn);
999 perfc_incr(shadow_unsync);
1000 TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_UNSYNC);
1001 return 1;
1004 #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) */
1007 /**************************************************************************/
1008 /* Code for "promoting" a guest page to the point where the shadow code is
1009 * willing to let it be treated as a guest page table. This generally
1010 * involves making sure there are no writable mappings available to the guest
1011 * for this page.
1012 */
1013 void shadow_promote(struct vcpu *v, mfn_t gmfn, unsigned int type)
1015 struct page_info *page = mfn_to_page(gmfn);
1017 ASSERT(mfn_valid(gmfn));
1019 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
1020 /* Is the page already shadowed and out of sync? */
1021 if ( page_is_out_of_sync(page) )
1022 sh_resync(v, gmfn);
1023 #endif
1025 /* We should never try to promote a gmfn that has writeable mappings */
1026 ASSERT((page->u.inuse.type_info & PGT_type_mask) != PGT_writable_page
1027 || (page->u.inuse.type_info & PGT_count_mask) == 0
1028 || v->domain->is_shutting_down);
1030 /* Is the page already shadowed? */
1031 if ( !test_and_set_bit(_PGC_page_table, &page->count_info) )
1032 page->shadow_flags = 0;
1034 ASSERT(!test_bit(type, &page->shadow_flags));
1035 set_bit(type, &page->shadow_flags);
1036 TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_PROMOTE);
1039 void shadow_demote(struct vcpu *v, mfn_t gmfn, u32 type)
1041 struct page_info *page = mfn_to_page(gmfn);
1043 ASSERT(test_bit(_PGC_page_table, &page->count_info));
1044 ASSERT(test_bit(type, &page->shadow_flags));
1046 clear_bit(type, &page->shadow_flags);
1048 if ( (page->shadow_flags & SHF_page_type_mask) == 0 )
1050 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
1051 /* Was the page out of sync? */
1052 if ( page_is_out_of_sync(page) )
1054 oos_hash_remove(v, gmfn);
1056 #endif
1057 clear_bit(_PGC_page_table, &page->count_info);
1060 TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_DEMOTE);
1063 /**************************************************************************/
1064 /* Validate a pagetable change from the guest and update the shadows.
1065 * Returns a bitmask of SHADOW_SET_* flags. */
1067 int
1068 sh_validate_guest_entry(struct vcpu *v, mfn_t gmfn, void *entry, u32 size)
1070 int result = 0;
1071 struct page_info *page = mfn_to_page(gmfn);
1073 paging_mark_dirty(v->domain, mfn_x(gmfn));
1075 // Determine which types of shadows are affected, and update each.
1076 //
1077 // Always validate L1s before L2s to prevent another cpu with a linear
1078 // mapping of this gmfn from seeing a walk that results from
1079 // using the new L2 value and the old L1 value. (It is OK for such a
1080 // guest to see a walk that uses the old L2 value with the new L1 value,
1081 // as hardware could behave this way if one level of the pagewalk occurs
1082 // before the store, and the next level of the pagewalk occurs after the
1083 // store.
1084 //
1085 // Ditto for L2s before L3s, etc.
1086 //
1088 if ( !(page->count_info & PGC_page_table) )
1089 return 0; /* Not shadowed at all */
1091 if ( page->shadow_flags & SHF_L1_32 )
1092 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 2)
1093 (v, gmfn, entry, size);
1094 if ( page->shadow_flags & SHF_L2_32 )
1095 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 2)
1096 (v, gmfn, entry, size);
1098 if ( page->shadow_flags & SHF_L1_PAE )
1099 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 3)
1100 (v, gmfn, entry, size);
1101 if ( page->shadow_flags & SHF_L2_PAE )
1102 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 3)
1103 (v, gmfn, entry, size);
1104 if ( page->shadow_flags & SHF_L2H_PAE )
1105 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2he, 3)
1106 (v, gmfn, entry, size);
1108 #if CONFIG_PAGING_LEVELS >= 4
1109 if ( page->shadow_flags & SHF_L1_64 )
1110 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 4)
1111 (v, gmfn, entry, size);
1112 if ( page->shadow_flags & SHF_L2_64 )
1113 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 4)
1114 (v, gmfn, entry, size);
1115 if ( page->shadow_flags & SHF_L2H_64 )
1116 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2he, 4)
1117 (v, gmfn, entry, size);
1118 if ( page->shadow_flags & SHF_L3_64 )
1119 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl3e, 4)
1120 (v, gmfn, entry, size);
1121 if ( page->shadow_flags & SHF_L4_64 )
1122 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl4e, 4)
1123 (v, gmfn, entry, size);
1124 #else /* 32-bit hypervisor does not support 64-bit guests */
1125 ASSERT((page->shadow_flags
1126 & (SHF_L4_64|SHF_L3_64|SHF_L2H_64|SHF_L2_64|SHF_L1_64)) == 0);
1127 #endif
1128 this_cpu(trace_shadow_path_flags) |= (result<<(TRCE_SFLAG_SET_CHANGED));
1130 return result;
1134 void
1135 sh_validate_guest_pt_write(struct vcpu *v, mfn_t gmfn,
1136 void *entry, u32 size)
1137 /* This is the entry point for emulated writes to pagetables in HVM guests and
1138 * PV translated guests.
1139 */
1141 struct domain *d = v->domain;
1142 int rc;
1144 ASSERT(shadow_locked_by_me(v->domain));
1145 rc = sh_validate_guest_entry(v, gmfn, entry, size);
1146 if ( rc & SHADOW_SET_FLUSH )
1147 /* Need to flush TLBs to pick up shadow PT changes */
1148 flush_tlb_mask(&d->domain_dirty_cpumask);
1149 if ( rc & SHADOW_SET_ERROR )
1151 /* This page is probably not a pagetable any more: tear it out of the
1152 * shadows, along with any tables that reference it.
1153 * Since the validate call above will have made a "safe" (i.e. zero)
1154 * shadow entry, we can let the domain live even if we can't fully
1155 * unshadow the page. */
1156 sh_remove_shadows(v, gmfn, 0, 0);
1160 int shadow_write_guest_entry(struct vcpu *v, intpte_t *p,
1161 intpte_t new, mfn_t gmfn)
1162 /* Write a new value into the guest pagetable, and update the shadows
1163 * appropriately. Returns 0 if we page-faulted, 1 for success. */
1165 int failed;
1166 shadow_lock(v->domain);
1167 failed = __copy_to_user(p, &new, sizeof(new));
1168 if ( failed != sizeof(new) )
1169 sh_validate_guest_entry(v, gmfn, p, sizeof(new));
1170 shadow_unlock(v->domain);
1171 return (failed == 0);
1174 int shadow_cmpxchg_guest_entry(struct vcpu *v, intpte_t *p,
1175 intpte_t *old, intpte_t new, mfn_t gmfn)
1176 /* Cmpxchg a new value into the guest pagetable, and update the shadows
1177 * appropriately. Returns 0 if we page-faulted, 1 if not.
1178 * N.B. caller should check the value of "old" to see if the
1179 * cmpxchg itself was successful. */
1181 int failed;
1182 intpte_t t = *old;
1183 shadow_lock(v->domain);
1184 failed = cmpxchg_user(p, t, new);
1185 if ( t == *old )
1186 sh_validate_guest_entry(v, gmfn, p, sizeof(new));
1187 *old = t;
1188 shadow_unlock(v->domain);
1189 return (failed == 0);
1193 /**************************************************************************/
1194 /* Memory management for shadow pages. */
1196 /* Allocating shadow pages
1197 * -----------------------
1199 * Most shadow pages are allocated singly, but there is one case where
1200 * we need to allocate multiple pages together: shadowing 32-bit guest
1201 * tables on PAE or 64-bit shadows. A 32-bit guest l1 table covers 4MB
1202 * of virtual address space, and needs to be shadowed by two PAE/64-bit
1203 * l1 tables (covering 2MB of virtual address space each). Similarly, a
1204 * 32-bit guest l2 table (4GB va) needs to be shadowed by four
1205 * PAE/64-bit l2 tables (1GB va each). These multi-page shadows are
1206 * contiguous and aligned; functions for handling offsets into them are
1207 * defined in shadow.c (shadow_l1_index() etc.)
1209 * This table shows the allocation behaviour of the different modes:
1211 * Xen paging pae pae 64b 64b 64b
1212 * Guest paging 32b pae 32b pae 64b
1213 * PV or HVM HVM * HVM HVM *
1214 * Shadow paging pae pae pae pae 64b
1216 * sl1 size 8k 4k 8k 4k 4k
1217 * sl2 size 16k 4k 16k 4k 4k
1218 * sl3 size - - - - 4k
1219 * sl4 size - - - - 4k
1221 * We allocate memory from xen in four-page units and break them down
1222 * with a simple buddy allocator. Can't use the xen allocator to handle
1223 * this as it only works for contiguous zones, and a domain's shadow
1224 * pool is made of fragments.
1226 * In HVM guests, the p2m table is built out of shadow pages, and we provide
1227 * a function for the p2m management to steal pages, in max-order chunks, from
1228 * the free pool. We don't provide for giving them back, yet.
1229 */
1231 /* Figure out the least acceptable quantity of shadow memory.
1232 * The minimum memory requirement for always being able to free up a
1233 * chunk of memory is very small -- only three max-order chunks per
1234 * vcpu to hold the top level shadows and pages with Xen mappings in them.
1236 * But for a guest to be guaranteed to successfully execute a single
1237 * instruction, we must be able to map a large number (about thirty) VAs
1238 * at the same time, which means that to guarantee progress, we must
1239 * allow for more than ninety allocated pages per vcpu. We round that
1240 * up to 128 pages, or half a megabyte per vcpu. */
1241 static unsigned int shadow_min_acceptable_pages(struct domain *d)
1243 u32 vcpu_count = 0;
1244 struct vcpu *v;
1246 for_each_vcpu(d, v)
1247 vcpu_count++;
1249 return (vcpu_count * 128);
1252 /* Figure out the order of allocation needed for a given shadow type */
1253 static inline u32
1254 shadow_order(unsigned int shadow_type)
1256 static const u32 type_to_order[SH_type_unused] = {
1257 0, /* SH_type_none */
1258 1, /* SH_type_l1_32_shadow */
1259 1, /* SH_type_fl1_32_shadow */
1260 2, /* SH_type_l2_32_shadow */
1261 0, /* SH_type_l1_pae_shadow */
1262 0, /* SH_type_fl1_pae_shadow */
1263 0, /* SH_type_l2_pae_shadow */
1264 0, /* SH_type_l2h_pae_shadow */
1265 0, /* SH_type_l1_64_shadow */
1266 0, /* SH_type_fl1_64_shadow */
1267 0, /* SH_type_l2_64_shadow */
1268 0, /* SH_type_l2h_64_shadow */
1269 0, /* SH_type_l3_64_shadow */
1270 0, /* SH_type_l4_64_shadow */
1271 2, /* SH_type_p2m_table */
1272 0, /* SH_type_monitor_table */
1273 0 /* SH_type_oos_snapshot */
1274 };
1275 ASSERT(shadow_type < SH_type_unused);
1276 return type_to_order[shadow_type];
1279 static inline unsigned int
1280 shadow_max_order(struct domain *d)
1282 return is_hvm_domain(d) ? SHADOW_MAX_ORDER : 0;
1285 /* Do we have at total of count pages of the requested order free? */
1286 static inline int space_is_available(
1287 struct domain *d,
1288 unsigned int order,
1289 unsigned int count)
1291 for ( ; order <= shadow_max_order(d); ++order )
1293 unsigned int n = count;
1294 const struct page_info *sp;
1296 page_list_for_each ( sp, &d->arch.paging.shadow.freelists[order] )
1297 if ( --n == 0 )
1298 return 1;
1299 count = (count + 1) >> 1;
1302 return 0;
1305 /* Dispatcher function: call the per-mode function that will unhook the
1306 * non-Xen mappings in this top-level shadow mfn */
1307 static void shadow_unhook_mappings(struct vcpu *v, mfn_t smfn)
1309 struct page_info *sp = mfn_to_page(smfn);
1310 switch ( sp->u.sh.type )
1312 case SH_type_l2_32_shadow:
1313 SHADOW_INTERNAL_NAME(sh_unhook_32b_mappings, 2)(v,smfn);
1314 break;
1315 case SH_type_l2_pae_shadow:
1316 case SH_type_l2h_pae_shadow:
1317 SHADOW_INTERNAL_NAME(sh_unhook_pae_mappings, 3)(v,smfn);
1318 break;
1319 #if CONFIG_PAGING_LEVELS >= 4
1320 case SH_type_l4_64_shadow:
1321 SHADOW_INTERNAL_NAME(sh_unhook_64b_mappings, 4)(v,smfn);
1322 break;
1323 #endif
1324 default:
1325 SHADOW_ERROR("top-level shadow has bad type %08x\n", sp->u.sh.type);
1326 BUG();
1330 static inline void trace_shadow_prealloc_unpin(struct domain *d, mfn_t smfn)
1332 if ( tb_init_done )
1334 /* Convert smfn to gfn */
1335 unsigned long gfn;
1336 ASSERT(mfn_valid(smfn));
1337 gfn = mfn_to_gfn(d, _mfn(mfn_to_page(smfn)->v.sh.back));
1338 __trace_var(TRC_SHADOW_PREALLOC_UNPIN, 0/*!tsc*/,
1339 sizeof(gfn), (unsigned char*)&gfn);
1343 /* Make sure there are at least count order-sized pages
1344 * available in the shadow page pool. */
1345 static void _shadow_prealloc(
1346 struct domain *d,
1347 unsigned int order,
1348 unsigned int count)
1350 /* Need a vpcu for calling unpins; for now, since we don't have
1351 * per-vcpu shadows, any will do */
1352 struct vcpu *v, *v2;
1353 struct page_info *sp, *t;
1354 mfn_t smfn;
1355 int i;
1357 ASSERT(order <= shadow_max_order(d));
1358 if ( space_is_available(d, order, count) ) return;
1360 v = current;
1361 if ( v->domain != d )
1362 v = d->vcpu[0];
1363 ASSERT(v != NULL); /* Shouldn't have enabled shadows if we've no vcpus */
1365 /* Stage one: walk the list of pinned pages, unpinning them */
1366 perfc_incr(shadow_prealloc_1);
1367 page_list_for_each_safe_reverse(sp, t, &d->arch.paging.shadow.pinned_shadows)
1369 smfn = page_to_mfn(sp);
1371 /* Unpin this top-level shadow */
1372 trace_shadow_prealloc_unpin(d, smfn);
1373 sh_unpin(v, smfn);
1375 /* See if that freed up enough space */
1376 if ( space_is_available(d, order, count) ) return;
1379 /* Stage two: all shadow pages are in use in hierarchies that are
1380 * loaded in cr3 on some vcpu. Walk them, unhooking the non-Xen
1381 * mappings. */
1382 perfc_incr(shadow_prealloc_2);
1384 for_each_vcpu(d, v2)
1385 for ( i = 0 ; i < 4 ; i++ )
1387 if ( !pagetable_is_null(v2->arch.shadow_table[i]) )
1389 TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_PREALLOC_UNHOOK);
1390 shadow_unhook_mappings(v,
1391 pagetable_get_mfn(v2->arch.shadow_table[i]));
1393 /* See if that freed up enough space */
1394 if ( space_is_available(d, order, count) )
1396 flush_tlb_mask(&d->domain_dirty_cpumask);
1397 return;
1402 /* Nothing more we can do: all remaining shadows are of pages that
1403 * hold Xen mappings for some vcpu. This can never happen. */
1404 SHADOW_ERROR("Can't pre-allocate %u order-%u shadow pages!\n"
1405 " shadow pages total = %u, free = %u, p2m=%u\n",
1406 count, order,
1407 d->arch.paging.shadow.total_pages,
1408 d->arch.paging.shadow.free_pages,
1409 d->arch.paging.shadow.p2m_pages);
1410 BUG();
1413 /* Make sure there are at least count pages of the order according to
1414 * type available in the shadow page pool.
1415 * This must be called before any calls to shadow_alloc(). Since this
1416 * will free existing shadows to make room, it must be called early enough
1417 * to avoid freeing shadows that the caller is currently working on. */
1418 void shadow_prealloc(struct domain *d, u32 type, unsigned int count)
1420 ASSERT(type != SH_type_p2m_table);
1421 return _shadow_prealloc(d, shadow_order(type), count);
1424 /* Deliberately free all the memory we can: this will tear down all of
1425 * this domain's shadows */
1426 static void shadow_blow_tables(struct domain *d)
1428 struct page_info *sp, *t;
1429 struct vcpu *v = d->vcpu[0];
1430 mfn_t smfn;
1431 int i;
1433 ASSERT(v != NULL);
1435 /* Pass one: unpin all pinned pages */
1436 page_list_for_each_safe_reverse(sp, t, &d->arch.paging.shadow.pinned_shadows)
1438 smfn = page_to_mfn(sp);
1439 sh_unpin(v, smfn);
1442 /* Second pass: unhook entries of in-use shadows */
1443 for_each_vcpu(d, v)
1444 for ( i = 0 ; i < 4 ; i++ )
1445 if ( !pagetable_is_null(v->arch.shadow_table[i]) )
1446 shadow_unhook_mappings(v,
1447 pagetable_get_mfn(v->arch.shadow_table[i]));
1449 /* Make sure everyone sees the unshadowings */
1450 flush_tlb_mask(&d->domain_dirty_cpumask);
1453 void shadow_blow_tables_per_domain(struct domain *d)
1455 if ( shadow_mode_enabled(d) && d->vcpu[0] != NULL ) {
1456 shadow_lock(d);
1457 shadow_blow_tables(d);
1458 shadow_unlock(d);
1462 #ifndef NDEBUG
1463 /* Blow all shadows of all shadowed domains: this can be used to cause the
1464 * guest's pagetables to be re-shadowed if we suspect that the shadows
1465 * have somehow got out of sync */
1466 static void shadow_blow_all_tables(unsigned char c)
1468 struct domain *d;
1469 printk("'%c' pressed -> blowing all shadow tables\n", c);
1470 rcu_read_lock(&domlist_read_lock);
1471 for_each_domain(d)
1473 if ( shadow_mode_enabled(d) && d->vcpu[0] != NULL )
1475 shadow_lock(d);
1476 shadow_blow_tables(d);
1477 shadow_unlock(d);
1480 rcu_read_unlock(&domlist_read_lock);
1483 /* Register this function in the Xen console keypress table */
1484 static __init int shadow_blow_tables_keyhandler_init(void)
1486 register_keyhandler('S', shadow_blow_all_tables,"reset shadow pagetables");
1487 return 0;
1489 __initcall(shadow_blow_tables_keyhandler_init);
1490 #endif /* !NDEBUG */
1492 static inline struct page_info *
1493 next_shadow(const struct page_info *sp)
1495 return sp->next_shadow ? mfn_to_page(_mfn(sp->next_shadow)) : NULL;
1498 static inline void
1499 set_next_shadow(struct page_info *sp, struct page_info *next)
1501 sp->next_shadow = next ? mfn_x(page_to_mfn(next)) : 0;
1504 /* Allocate another shadow's worth of (contiguous, aligned) pages,
1505 * and fill in the type and backpointer fields of their page_infos.
1506 * Never fails to allocate. */
1507 mfn_t shadow_alloc(struct domain *d,
1508 u32 shadow_type,
1509 unsigned long backpointer)
1511 struct page_info *sp = NULL;
1512 unsigned int order = shadow_order(shadow_type);
1513 cpumask_t mask;
1514 void *p;
1515 int i;
1517 ASSERT(shadow_locked_by_me(d));
1518 if (shadow_type == SH_type_p2m_table && order > shadow_max_order(d))
1519 order = shadow_max_order(d);
1520 ASSERT(order <= shadow_max_order(d));
1521 ASSERT(shadow_type != SH_type_none);
1522 perfc_incr(shadow_alloc);
1524 /* Find smallest order which can satisfy the request. */
1525 for ( i = order; i <= SHADOW_MAX_ORDER; i++ )
1526 if ( (sp = page_list_remove_head(&d->arch.paging.shadow.freelists[i])) )
1527 goto found;
1529 /* If we get here, we failed to allocate. This should never happen.
1530 * It means that we didn't call shadow_prealloc() correctly before
1531 * we allocated. We can't recover by calling prealloc here, because
1532 * we might free up higher-level pages that the caller is working on. */
1533 SHADOW_ERROR("Can't allocate %i shadow pages!\n", 1 << order);
1534 BUG();
1536 found:
1537 /* We may have to halve the chunk a number of times. */
1538 while ( i != order )
1540 i--;
1541 sp->v.free.order = i;
1542 page_list_add_tail(sp, &d->arch.paging.shadow.freelists[i]);
1543 sp += 1 << i;
1545 d->arch.paging.shadow.free_pages -= 1 << order;
1547 /* Init page info fields and clear the pages */
1548 for ( i = 0; i < 1<<order ; i++ )
1550 /* Before we overwrite the old contents of this page,
1551 * we need to be sure that no TLB holds a pointer to it. */
1552 mask = d->domain_dirty_cpumask;
1553 tlbflush_filter(mask, sp[i].tlbflush_timestamp);
1554 if ( unlikely(!cpus_empty(mask)) )
1556 perfc_incr(shadow_alloc_tlbflush);
1557 flush_tlb_mask(&mask);
1559 /* Now safe to clear the page for reuse */
1560 p = sh_map_domain_page(page_to_mfn(sp+i));
1561 ASSERT(p != NULL);
1562 clear_page(p);
1563 sh_unmap_domain_page(p);
1564 INIT_PAGE_LIST_ENTRY(&sp[i].list);
1565 sp[i].u.sh.type = shadow_type;
1566 sp[i].u.sh.pinned = 0;
1567 sp[i].u.sh.count = 0;
1568 sp[i].v.sh.back = backpointer;
1569 set_next_shadow(&sp[i], NULL);
1570 perfc_incr(shadow_alloc_count);
1572 return page_to_mfn(sp);
1576 /* Return some shadow pages to the pool. */
1577 void shadow_free(struct domain *d, mfn_t smfn)
1579 struct page_info *sp = mfn_to_page(smfn);
1580 u32 shadow_type;
1581 unsigned long order;
1582 unsigned long mask;
1583 int i;
1585 ASSERT(shadow_locked_by_me(d));
1586 perfc_incr(shadow_free);
1588 shadow_type = sp->u.sh.type;
1589 ASSERT(shadow_type != SH_type_none);
1590 ASSERT(shadow_type != SH_type_p2m_table);
1591 order = shadow_order(shadow_type);
1593 d->arch.paging.shadow.free_pages += 1 << order;
1595 for ( i = 0; i < 1<<order; i++ )
1597 #if SHADOW_OPTIMIZATIONS & (SHOPT_WRITABLE_HEURISTIC | SHOPT_FAST_EMULATION)
1598 struct vcpu *v;
1599 for_each_vcpu(d, v)
1601 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
1602 /* No longer safe to look for a writeable mapping in this shadow */
1603 if ( v->arch.paging.shadow.last_writeable_pte_smfn == mfn_x(smfn) + i )
1604 v->arch.paging.shadow.last_writeable_pte_smfn = 0;
1605 #endif
1606 #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
1607 v->arch.paging.last_write_emul_ok = 0;
1608 #endif
1610 #endif
1611 /* Strip out the type: this is now a free shadow page */
1612 sp[i].u.sh.type = 0;
1613 /* Remember the TLB timestamp so we will know whether to flush
1614 * TLBs when we reuse the page. Because the destructors leave the
1615 * contents of the pages in place, we can delay TLB flushes until
1616 * just before the allocator hands the page out again. */
1617 sp[i].tlbflush_timestamp = tlbflush_current_time();
1618 perfc_decr(shadow_alloc_count);
1621 /* Merge chunks as far as possible. */
1622 for ( ; order < shadow_max_order(d); ++order )
1624 mask = 1 << order;
1625 if ( (mfn_x(page_to_mfn(sp)) & mask) ) {
1626 /* Merge with predecessor block? */
1627 if ( ((sp-mask)->u.sh.type != PGT_none) ||
1628 ((sp-mask)->v.free.order != order) )
1629 break;
1630 sp -= mask;
1631 page_list_del(sp, &d->arch.paging.shadow.freelists[order]);
1632 } else {
1633 /* Merge with successor block? */
1634 if ( ((sp+mask)->u.sh.type != PGT_none) ||
1635 ((sp+mask)->v.free.order != order) )
1636 break;
1637 page_list_del(sp + mask, &d->arch.paging.shadow.freelists[order]);
1641 sp->v.free.order = order;
1642 page_list_add_tail(sp, &d->arch.paging.shadow.freelists[order]);
1645 /* Divert some memory from the pool to be used by the p2m mapping.
1646 * This action is irreversible: the p2m mapping only ever grows.
1647 * That's OK because the p2m table only exists for translated domains,
1648 * and those domains can't ever turn off shadow mode.
1649 * Also, we only ever allocate a max-order chunk, so as to preserve
1650 * the invariant that shadow_prealloc() always works.
1651 * Returns 0 iff it can't get a chunk (the caller should then
1652 * free up some pages in domheap and call sh_set_allocation);
1653 * returns non-zero on success.
1654 */
1655 static int
1656 sh_alloc_p2m_pages(struct domain *d)
1658 struct page_info *pg;
1659 u32 i;
1660 unsigned int order = shadow_max_order(d);
1662 ASSERT(shadow_locked_by_me(d));
1664 if ( d->arch.paging.shadow.total_pages
1665 < (shadow_min_acceptable_pages(d) + (1 << order)) )
1666 return 0; /* Not enough shadow memory: need to increase it first */
1668 pg = mfn_to_page(shadow_alloc(d, SH_type_p2m_table, 0));
1669 d->arch.paging.shadow.p2m_pages += (1 << order);
1670 d->arch.paging.shadow.total_pages -= (1 << order);
1671 for (i = 0; i < (1U << order); i++)
1673 /* Unlike shadow pages, mark p2m pages as owned by the domain.
1674 * Marking the domain as the owner would normally allow the guest to
1675 * create mappings of these pages, but these p2m pages will never be
1676 * in the domain's guest-physical address space, and so that is not
1677 * believed to be a concern.
1678 */
1679 page_set_owner(&pg[i], d);
1680 pg[i].count_info |= 1;
1681 page_list_add_tail(&pg[i], &d->arch.paging.shadow.p2m_freelist);
1683 return 1;
1686 // Returns 0 if no memory is available...
1687 static struct page_info *
1688 shadow_alloc_p2m_page(struct domain *d)
1690 struct page_info *pg;
1691 mfn_t mfn;
1692 void *p;
1694 shadow_lock(d);
1696 if ( page_list_empty(&d->arch.paging.shadow.p2m_freelist) &&
1697 !sh_alloc_p2m_pages(d) )
1699 shadow_unlock(d);
1700 return NULL;
1702 pg = page_list_remove_head(&d->arch.paging.shadow.p2m_freelist);
1704 shadow_unlock(d);
1706 mfn = page_to_mfn(pg);
1707 p = sh_map_domain_page(mfn);
1708 clear_page(p);
1709 sh_unmap_domain_page(p);
1711 return pg;
1714 static void
1715 shadow_free_p2m_page(struct domain *d, struct page_info *pg)
1717 ASSERT(page_get_owner(pg) == d);
1718 /* Should have just the one ref we gave it in alloc_p2m_page() */
1719 if ( (pg->count_info & PGC_count_mask) != 1 )
1721 SHADOW_ERROR("Odd p2m page count c=%#lx t=%"PRtype_info"\n",
1722 pg->count_info, pg->u.inuse.type_info);
1724 pg->count_info &= ~PGC_count_mask;
1725 /* Free should not decrement domain's total allocation, since
1726 * these pages were allocated without an owner. */
1727 page_set_owner(pg, NULL);
1728 free_domheap_pages(pg, 0);
1729 d->arch.paging.shadow.p2m_pages--;
1730 perfc_decr(shadow_alloc_count);
1733 #if CONFIG_PAGING_LEVELS == 3
1734 static void p2m_install_entry_in_monitors(struct domain *d,
1735 l3_pgentry_t *l3e)
1736 /* Special case, only used for external-mode domains on PAE hosts:
1737 * update the mapping of the p2m table. Once again, this is trivial in
1738 * other paging modes (one top-level entry points to the top-level p2m,
1739 * no maintenance needed), but PAE makes life difficult by needing a
1740 * copy the eight l3es of the p2m table in eight l2h slots in the
1741 * monitor table. This function makes fresh copies when a p2m l3e
1742 * changes. */
1744 l2_pgentry_t *ml2e;
1745 struct vcpu *v;
1746 unsigned int index;
1748 index = ((unsigned long)l3e & ~PAGE_MASK) / sizeof(l3_pgentry_t);
1749 ASSERT(index < MACHPHYS_MBYTES>>1);
1751 for_each_vcpu(d, v)
1753 if ( pagetable_get_pfn(v->arch.monitor_table) == 0 )
1754 continue;
1755 ASSERT(shadow_mode_external(v->domain));
1757 SHADOW_DEBUG(P2M, "d=%u v=%u index=%u mfn=%#lx\n",
1758 d->domain_id, v->vcpu_id, index, l3e_get_pfn(*l3e));
1760 if ( v == current ) /* OK to use linear map of monitor_table */
1761 ml2e = __linear_l2_table + l2_linear_offset(RO_MPT_VIRT_START);
1762 else
1764 l3_pgentry_t *ml3e;
1765 ml3e = sh_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
1766 ASSERT(l3e_get_flags(ml3e[3]) & _PAGE_PRESENT);
1767 ml2e = sh_map_domain_page(_mfn(l3e_get_pfn(ml3e[3])));
1768 ml2e += l2_table_offset(RO_MPT_VIRT_START);
1769 sh_unmap_domain_page(ml3e);
1771 ml2e[index] = l2e_from_pfn(l3e_get_pfn(*l3e), __PAGE_HYPERVISOR);
1772 if ( v != current )
1773 sh_unmap_domain_page(ml2e);
1776 #endif
1778 /* Set the pool of shadow pages to the required number of pages.
1779 * Input will be rounded up to at least shadow_min_acceptable_pages(),
1780 * plus space for the p2m table.
1781 * Returns 0 for success, non-zero for failure. */
1782 static unsigned int sh_set_allocation(struct domain *d,
1783 unsigned int pages,
1784 int *preempted)
1786 struct page_info *sp;
1787 unsigned int lower_bound;
1788 unsigned int j, order = shadow_max_order(d);
1790 ASSERT(shadow_locked_by_me(d));
1792 /* Don't allocate less than the minimum acceptable, plus one page per
1793 * megabyte of RAM (for the p2m table) */
1794 lower_bound = shadow_min_acceptable_pages(d) + (d->tot_pages / 256);
1795 if ( pages > 0 && pages < lower_bound )
1796 pages = lower_bound;
1797 /* Round up to largest block size */
1798 pages = (pages + ((1<<SHADOW_MAX_ORDER)-1)) & ~((1<<SHADOW_MAX_ORDER)-1);
1800 SHADOW_PRINTK("current %i target %i\n",
1801 d->arch.paging.shadow.total_pages, pages);
1803 while ( d->arch.paging.shadow.total_pages != pages )
1805 if ( d->arch.paging.shadow.total_pages < pages )
1807 /* Need to allocate more memory from domheap */
1808 sp = (struct page_info *)
1809 alloc_domheap_pages(NULL, order, MEMF_node(domain_to_node(d)));
1810 if ( sp == NULL )
1812 SHADOW_PRINTK("failed to allocate shadow pages.\n");
1813 return -ENOMEM;
1815 d->arch.paging.shadow.free_pages += 1 << order;
1816 d->arch.paging.shadow.total_pages += 1 << order;
1817 for ( j = 0; j < 1U << order; j++ )
1819 sp[j].u.sh.type = 0;
1820 sp[j].u.sh.pinned = 0;
1821 sp[j].u.sh.count = 0;
1822 sp[j].tlbflush_timestamp = 0; /* Not in any TLB */
1824 sp->v.free.order = order;
1825 page_list_add_tail(sp, &d->arch.paging.shadow.freelists[order]);
1827 else if ( d->arch.paging.shadow.total_pages > pages )
1829 /* Need to return memory to domheap */
1830 _shadow_prealloc(d, order, 1);
1831 sp = page_list_remove_head(&d->arch.paging.shadow.freelists[order]);
1832 ASSERT(sp);
1833 /*
1834 * The pages were allocated anonymously, but the owner field
1835 * gets overwritten normally, so need to clear it here.
1836 */
1837 for ( j = 0; j < 1U << order; j++ )
1838 page_set_owner(&((struct page_info *)sp)[j], NULL);
1839 d->arch.paging.shadow.free_pages -= 1 << order;
1840 d->arch.paging.shadow.total_pages -= 1 << order;
1841 free_domheap_pages((struct page_info *)sp, order);
1844 /* Check to see if we need to yield and try again */
1845 if ( preempted && hypercall_preempt_check() )
1847 *preempted = 1;
1848 return 0;
1852 return 0;
1855 /* Return the size of the shadow pool, rounded up to the nearest MB */
1856 static unsigned int shadow_get_allocation(struct domain *d)
1858 unsigned int pg = d->arch.paging.shadow.total_pages;
1859 return ((pg >> (20 - PAGE_SHIFT))
1860 + ((pg & ((1 << (20 - PAGE_SHIFT)) - 1)) ? 1 : 0));
1863 /**************************************************************************/
1864 /* Hash table for storing the guest->shadow mappings.
1865 * The table itself is an array of pointers to shadows; the shadows are then
1866 * threaded on a singly-linked list of shadows with the same hash value */
1868 #define SHADOW_HASH_BUCKETS 251
1869 /* Other possibly useful primes are 509, 1021, 2039, 4093, 8191, 16381 */
1871 /* Hash function that takes a gfn or mfn, plus another byte of type info */
1872 typedef u32 key_t;
1873 static inline key_t sh_hash(unsigned long n, unsigned int t)
1875 unsigned char *p = (unsigned char *)&n;
1876 key_t k = t;
1877 int i;
1878 for ( i = 0; i < sizeof(n) ; i++ ) k = (u32)p[i] + (k<<6) + (k<<16) - k;
1879 return k % SHADOW_HASH_BUCKETS;
1882 #if SHADOW_AUDIT & (SHADOW_AUDIT_HASH|SHADOW_AUDIT_HASH_FULL)
1884 /* Before we get to the mechanism, define a pair of audit functions
1885 * that sanity-check the contents of the hash table. */
1886 static void sh_hash_audit_bucket(struct domain *d, int bucket)
1887 /* Audit one bucket of the hash table */
1889 struct page_info *sp, *x;
1891 if ( !(SHADOW_AUDIT_ENABLE) )
1892 return;
1894 sp = d->arch.paging.shadow.hash_table[bucket];
1895 while ( sp )
1897 /* Not a shadow? */
1898 BUG_ON( (sp->count_info & PGC_count_mask )!= 0 ) ;
1899 /* Bogus type? */
1900 BUG_ON( sp->u.sh.type == 0 );
1901 BUG_ON( sp->u.sh.type > SH_type_max_shadow );
1902 /* Wrong bucket? */
1903 BUG_ON( sh_hash(sp->v.sh.back, sp->u.sh.type) != bucket );
1904 /* Duplicate entry? */
1905 for ( x = next_shadow(sp); x; x = next_shadow(x) )
1906 BUG_ON( x->v.sh.back == sp->v.sh.back &&
1907 x->u.sh.type == sp->u.sh.type );
1908 /* Follow the backpointer to the guest pagetable */
1909 if ( sp->u.sh.type != SH_type_fl1_32_shadow
1910 && sp->u.sh.type != SH_type_fl1_pae_shadow
1911 && sp->u.sh.type != SH_type_fl1_64_shadow )
1913 struct page_info *gpg = mfn_to_page(_mfn(sp->v.sh.back));
1914 /* Bad shadow flags on guest page? */
1915 BUG_ON( !(gpg->shadow_flags & (1<<sp->u.sh.type)) );
1916 /* Bad type count on guest page? */
1917 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
1918 if ( sp->u.sh.type == SH_type_l1_32_shadow
1919 || sp->u.sh.type == SH_type_l1_pae_shadow
1920 || sp->u.sh.type == SH_type_l1_64_shadow )
1922 if ( (gpg->u.inuse.type_info & PGT_type_mask) == PGT_writable_page
1923 && (gpg->u.inuse.type_info & PGT_count_mask) != 0 )
1925 if ( !page_is_out_of_sync(gpg) )
1927 SHADOW_ERROR("MFN %#"PRpgmfn" shadowed (by %#"PRI_mfn")"
1928 " and not OOS but has typecount %#lx\n",
1929 sp->v.sh.back,
1930 mfn_x(page_to_mfn(sp)),
1931 gpg->u.inuse.type_info);
1932 BUG();
1936 else /* Not an l1 */
1937 #endif
1938 if ( (gpg->u.inuse.type_info & PGT_type_mask) == PGT_writable_page
1939 && (gpg->u.inuse.type_info & PGT_count_mask) != 0 )
1941 SHADOW_ERROR("MFN %#"PRpgmfn" shadowed (by %#"PRI_mfn")"
1942 " but has typecount %#lx\n",
1943 sp->v.sh.back, mfn_x(page_to_mfn(sp)),
1944 gpg->u.inuse.type_info);
1945 BUG();
1948 /* That entry was OK; on we go */
1949 sp = next_shadow(sp);
1953 #else
1954 #define sh_hash_audit_bucket(_d, _b) do {} while(0)
1955 #endif /* Hashtable bucket audit */
1958 #if SHADOW_AUDIT & SHADOW_AUDIT_HASH_FULL
1960 static void sh_hash_audit(struct domain *d)
1961 /* Full audit: audit every bucket in the table */
1963 int i;
1965 if ( !(SHADOW_AUDIT_ENABLE) )
1966 return;
1968 for ( i = 0; i < SHADOW_HASH_BUCKETS; i++ )
1970 sh_hash_audit_bucket(d, i);
1974 #else
1975 #define sh_hash_audit(_d) do {} while(0)
1976 #endif /* Hashtable bucket audit */
1978 /* Allocate and initialise the table itself.
1979 * Returns 0 for success, 1 for error. */
1980 static int shadow_hash_alloc(struct domain *d)
1982 struct page_info **table;
1984 ASSERT(shadow_locked_by_me(d));
1985 ASSERT(!d->arch.paging.shadow.hash_table);
1987 table = xmalloc_array(struct page_info *, SHADOW_HASH_BUCKETS);
1988 if ( !table ) return 1;
1989 memset(table, 0,
1990 SHADOW_HASH_BUCKETS * sizeof (struct page_info *));
1991 d->arch.paging.shadow.hash_table = table;
1992 return 0;
1995 /* Tear down the hash table and return all memory to Xen.
1996 * This function does not care whether the table is populated. */
1997 static void shadow_hash_teardown(struct domain *d)
1999 ASSERT(shadow_locked_by_me(d));
2000 ASSERT(d->arch.paging.shadow.hash_table);
2002 xfree(d->arch.paging.shadow.hash_table);
2003 d->arch.paging.shadow.hash_table = NULL;
2007 mfn_t shadow_hash_lookup(struct vcpu *v, unsigned long n, unsigned int t)
2008 /* Find an entry in the hash table. Returns the MFN of the shadow,
2009 * or INVALID_MFN if it doesn't exist */
2011 struct domain *d = v->domain;
2012 struct page_info *sp, *prev;
2013 key_t key;
2015 ASSERT(shadow_locked_by_me(d));
2016 ASSERT(d->arch.paging.shadow.hash_table);
2017 ASSERT(t);
2019 sh_hash_audit(d);
2021 perfc_incr(shadow_hash_lookups);
2022 key = sh_hash(n, t);
2023 sh_hash_audit_bucket(d, key);
2025 sp = d->arch.paging.shadow.hash_table[key];
2026 prev = NULL;
2027 while(sp)
2029 if ( sp->v.sh.back == n && sp->u.sh.type == t )
2031 /* Pull-to-front if 'sp' isn't already the head item */
2032 if ( unlikely(sp != d->arch.paging.shadow.hash_table[key]) )
2034 if ( unlikely(d->arch.paging.shadow.hash_walking != 0) )
2035 /* Can't reorder: someone is walking the hash chains */
2036 return page_to_mfn(sp);
2037 else
2039 ASSERT(prev);
2040 /* Delete sp from the list */
2041 prev->next_shadow = sp->next_shadow;
2042 /* Re-insert it at the head of the list */
2043 set_next_shadow(sp, d->arch.paging.shadow.hash_table[key]);
2044 d->arch.paging.shadow.hash_table[key] = sp;
2047 else
2049 perfc_incr(shadow_hash_lookup_head);
2051 return page_to_mfn(sp);
2053 prev = sp;
2054 sp = next_shadow(sp);
2057 perfc_incr(shadow_hash_lookup_miss);
2058 return _mfn(INVALID_MFN);
2061 void shadow_hash_insert(struct vcpu *v, unsigned long n, unsigned int t,
2062 mfn_t smfn)
2063 /* Put a mapping (n,t)->smfn into the hash table */
2065 struct domain *d = v->domain;
2066 struct page_info *sp;
2067 key_t key;
2069 ASSERT(shadow_locked_by_me(d));
2070 ASSERT(d->arch.paging.shadow.hash_table);
2071 ASSERT(t);
2073 sh_hash_audit(d);
2075 perfc_incr(shadow_hash_inserts);
2076 key = sh_hash(n, t);
2077 sh_hash_audit_bucket(d, key);
2079 /* Insert this shadow at the top of the bucket */
2080 sp = mfn_to_page(smfn);
2081 set_next_shadow(sp, d->arch.paging.shadow.hash_table[key]);
2082 d->arch.paging.shadow.hash_table[key] = sp;
2084 sh_hash_audit_bucket(d, key);
2087 void shadow_hash_delete(struct vcpu *v, unsigned long n, unsigned int t,
2088 mfn_t smfn)
2089 /* Excise the mapping (n,t)->smfn from the hash table */
2091 struct domain *d = v->domain;
2092 struct page_info *sp, *x;
2093 key_t key;
2095 ASSERT(shadow_locked_by_me(d));
2096 ASSERT(d->arch.paging.shadow.hash_table);
2097 ASSERT(t);
2099 sh_hash_audit(d);
2101 perfc_incr(shadow_hash_deletes);
2102 key = sh_hash(n, t);
2103 sh_hash_audit_bucket(d, key);
2105 sp = mfn_to_page(smfn);
2106 if ( d->arch.paging.shadow.hash_table[key] == sp )
2107 /* Easy case: we're deleting the head item. */
2108 d->arch.paging.shadow.hash_table[key] = next_shadow(sp);
2109 else
2111 /* Need to search for the one we want */
2112 x = d->arch.paging.shadow.hash_table[key];
2113 while ( 1 )
2115 ASSERT(x); /* We can't have hit the end, since our target is
2116 * still in the chain somehwere... */
2117 if ( next_shadow(x) == sp )
2119 x->next_shadow = sp->next_shadow;
2120 break;
2122 x = next_shadow(x);
2125 set_next_shadow(sp, NULL);
2127 sh_hash_audit_bucket(d, key);
2130 typedef int (*hash_callback_t)(struct vcpu *v, mfn_t smfn, mfn_t other_mfn);
2132 static void hash_foreach(struct vcpu *v,
2133 unsigned int callback_mask,
2134 hash_callback_t callbacks[],
2135 mfn_t callback_mfn)
2136 /* Walk the hash table looking at the types of the entries and
2137 * calling the appropriate callback function for each entry.
2138 * The mask determines which shadow types we call back for, and the array
2139 * of callbacks tells us which function to call.
2140 * Any callback may return non-zero to let us skip the rest of the scan.
2142 * WARNING: Callbacks MUST NOT add or remove hash entries unless they
2143 * then return non-zero to terminate the scan. */
2145 int i, done = 0;
2146 struct domain *d = v->domain;
2147 struct page_info *x;
2149 /* Say we're here, to stop hash-lookups reordering the chains */
2150 ASSERT(shadow_locked_by_me(d));
2151 ASSERT(d->arch.paging.shadow.hash_walking == 0);
2152 d->arch.paging.shadow.hash_walking = 1;
2154 for ( i = 0; i < SHADOW_HASH_BUCKETS; i++ )
2156 /* WARNING: This is not safe against changes to the hash table.
2157 * The callback *must* return non-zero if it has inserted or
2158 * deleted anything from the hash (lookups are OK, though). */
2159 for ( x = d->arch.paging.shadow.hash_table[i]; x; x = next_shadow(x) )
2161 if ( callback_mask & (1 << x->u.sh.type) )
2163 ASSERT(x->u.sh.type <= 15);
2164 ASSERT(callbacks[x->u.sh.type] != NULL);
2165 done = callbacks[x->u.sh.type](v, page_to_mfn(x),
2166 callback_mfn);
2167 if ( done ) break;
2170 if ( done ) break;
2172 d->arch.paging.shadow.hash_walking = 0;
2176 /**************************************************************************/
2177 /* Destroy a shadow page: simple dispatcher to call the per-type destructor
2178 * which will decrement refcounts appropriately and return memory to the
2179 * free pool. */
2181 void sh_destroy_shadow(struct vcpu *v, mfn_t smfn)
2183 struct page_info *sp = mfn_to_page(smfn);
2184 unsigned int t = sp->u.sh.type;
2187 SHADOW_PRINTK("smfn=%#lx\n", mfn_x(smfn));
2189 /* Double-check, if we can, that the shadowed page belongs to this
2190 * domain, (by following the back-pointer). */
2191 ASSERT(t == SH_type_fl1_32_shadow ||
2192 t == SH_type_fl1_pae_shadow ||
2193 t == SH_type_fl1_64_shadow ||
2194 t == SH_type_monitor_table ||
2195 (is_pv_32on64_vcpu(v) && t == SH_type_l4_64_shadow) ||
2196 (page_get_owner(mfn_to_page(_mfn(sp->v.sh.back)))
2197 == v->domain));
2199 /* The down-shifts here are so that the switch statement is on nice
2200 * small numbers that the compiler will enjoy */
2201 switch ( t )
2203 case SH_type_l1_32_shadow:
2204 case SH_type_fl1_32_shadow:
2205 SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 2)(v, smfn);
2206 break;
2207 case SH_type_l2_32_shadow:
2208 SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 2)(v, smfn);
2209 break;
2211 case SH_type_l1_pae_shadow:
2212 case SH_type_fl1_pae_shadow:
2213 SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 3)(v, smfn);
2214 break;
2215 case SH_type_l2_pae_shadow:
2216 case SH_type_l2h_pae_shadow:
2217 SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 3)(v, smfn);
2218 break;
2220 #if CONFIG_PAGING_LEVELS >= 4
2221 case SH_type_l1_64_shadow:
2222 case SH_type_fl1_64_shadow:
2223 SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 4)(v, smfn);
2224 break;
2225 case SH_type_l2h_64_shadow:
2226 ASSERT(is_pv_32on64_vcpu(v));
2227 /* Fall through... */
2228 case SH_type_l2_64_shadow:
2229 SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 4)(v, smfn);
2230 break;
2231 case SH_type_l3_64_shadow:
2232 SHADOW_INTERNAL_NAME(sh_destroy_l3_shadow, 4)(v, smfn);
2233 break;
2234 case SH_type_l4_64_shadow:
2235 SHADOW_INTERNAL_NAME(sh_destroy_l4_shadow, 4)(v, smfn);
2236 break;
2237 #endif
2238 default:
2239 SHADOW_ERROR("tried to destroy shadow of bad type %08lx\n",
2240 (unsigned long)t);
2241 BUG();
2245 static inline void trace_shadow_wrmap_bf(mfn_t gmfn)
2247 if ( tb_init_done )
2249 /* Convert gmfn to gfn */
2250 unsigned long gfn = mfn_to_gfn(current->domain, gmfn);
2251 __trace_var(TRC_SHADOW_WRMAP_BF, 0/*!tsc*/, sizeof(gfn), (unsigned char*)&gfn);
2255 /**************************************************************************/
2256 /* Remove all writeable mappings of a guest frame from the shadow tables
2257 * Returns non-zero if we need to flush TLBs.
2258 * level and fault_addr desribe how we found this to be a pagetable;
2259 * level==0 means we have some other reason for revoking write access.
2260 * If level==0 we are allowed to fail, returning -1. */
2262 int sh_remove_write_access(struct vcpu *v, mfn_t gmfn,
2263 unsigned int level,
2264 unsigned long fault_addr)
2266 /* Dispatch table for getting per-type functions */
2267 static hash_callback_t callbacks[SH_type_unused] = {
2268 NULL, /* none */
2269 SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1, 2), /* l1_32 */
2270 SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1, 2), /* fl1_32 */
2271 NULL, /* l2_32 */
2272 SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1, 3), /* l1_pae */
2273 SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1, 3), /* fl1_pae */
2274 NULL, /* l2_pae */
2275 NULL, /* l2h_pae */
2276 #if CONFIG_PAGING_LEVELS >= 4
2277 SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1, 4), /* l1_64 */
2278 SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1, 4), /* fl1_64 */
2279 #else
2280 NULL, /* l1_64 */
2281 NULL, /* fl1_64 */
2282 #endif
2283 NULL, /* l2_64 */
2284 NULL, /* l2h_64 */
2285 NULL, /* l3_64 */
2286 NULL, /* l4_64 */
2287 NULL, /* p2m */
2288 NULL /* unused */
2289 };
2291 static unsigned int callback_mask =
2292 1 << SH_type_l1_32_shadow
2293 | 1 << SH_type_fl1_32_shadow
2294 | 1 << SH_type_l1_pae_shadow
2295 | 1 << SH_type_fl1_pae_shadow
2296 | 1 << SH_type_l1_64_shadow
2297 | 1 << SH_type_fl1_64_shadow
2299 struct page_info *pg = mfn_to_page(gmfn);
2301 ASSERT(shadow_locked_by_me(v->domain));
2303 /* Only remove writable mappings if we are doing shadow refcounts.
2304 * In guest refcounting, we trust Xen to already be restricting
2305 * all the writes to the guest page tables, so we do not need to
2306 * do more. */
2307 if ( !shadow_mode_refcounts(v->domain) )
2308 return 0;
2310 /* Early exit if it's already a pagetable, or otherwise not writeable */
2311 if ( (sh_mfn_is_a_page_table(gmfn)
2312 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
2313 /* Unless they've been allowed to go out of sync with their shadows */
2314 && !mfn_oos_may_write(gmfn)
2315 #endif
2317 || (pg->u.inuse.type_info & PGT_count_mask) == 0 )
2318 return 0;
2320 TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_WRMAP);
2322 perfc_incr(shadow_writeable);
2324 /* If this isn't a "normal" writeable page, the domain is trying to
2325 * put pagetables in special memory of some kind. We can't allow that. */
2326 if ( (pg->u.inuse.type_info & PGT_type_mask) != PGT_writable_page )
2328 SHADOW_ERROR("can't remove write access to mfn %lx, type_info is %"
2329 PRtype_info "\n",
2330 mfn_x(gmfn), mfn_to_page(gmfn)->u.inuse.type_info);
2331 domain_crash(v->domain);
2334 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
2335 if ( v == current )
2337 unsigned long gfn;
2338 /* Heuristic: there is likely to be only one writeable mapping,
2339 * and that mapping is likely to be in the current pagetable,
2340 * in the guest's linear map (on non-HIGHPTE linux and windows)*/
2342 #define GUESS(_a, _h) do { \
2343 if ( v->arch.paging.mode->shadow.guess_wrmap(v, (_a), gmfn) ) \
2344 perfc_incr(shadow_writeable_h_ ## _h); \
2345 if ( (pg->u.inuse.type_info & PGT_count_mask) == 0 ) \
2346 { \
2347 TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_WRMAP_GUESS_FOUND); \
2348 return 1; \
2349 } \
2350 } while (0)
2352 if ( v->arch.paging.mode->guest_levels == 2 )
2354 if ( level == 1 )
2355 /* 32bit non-PAE w2k3: linear map at 0xC0000000 */
2356 GUESS(0xC0000000UL + (fault_addr >> 10), 1);
2358 /* Linux lowmem: first 896MB is mapped 1-to-1 above 0xC0000000 */
2359 if ((gfn = mfn_to_gfn(v->domain, gmfn)) < 0x38000 )
2360 GUESS(0xC0000000UL + (gfn << PAGE_SHIFT), 4);
2362 /* FreeBSD: Linear map at 0xBFC00000 */
2363 if ( level == 1 )
2364 GUESS(0xBFC00000UL
2365 + ((fault_addr & VADDR_MASK) >> 10), 6);
2367 else if ( v->arch.paging.mode->guest_levels == 3 )
2369 /* 32bit PAE w2k3: linear map at 0xC0000000 */
2370 switch ( level )
2372 case 1: GUESS(0xC0000000UL + (fault_addr >> 9), 2); break;
2373 case 2: GUESS(0xC0600000UL + (fault_addr >> 18), 2); break;
2376 /* Linux lowmem: first 896MB is mapped 1-to-1 above 0xC0000000 */
2377 if ((gfn = mfn_to_gfn(v->domain, gmfn)) < 0x38000 )
2378 GUESS(0xC0000000UL + (gfn << PAGE_SHIFT), 4);
2380 /* FreeBSD PAE: Linear map at 0xBF800000 */
2381 switch ( level )
2383 case 1: GUESS(0xBF800000UL
2384 + ((fault_addr & VADDR_MASK) >> 9), 6); break;
2385 case 2: GUESS(0xBFDFC000UL
2386 + ((fault_addr & VADDR_MASK) >> 18), 6); break;
2389 #if CONFIG_PAGING_LEVELS >= 4
2390 else if ( v->arch.paging.mode->guest_levels == 4 )
2392 /* 64bit w2k3: linear map at 0xfffff68000000000 */
2393 switch ( level )
2395 case 1: GUESS(0xfffff68000000000UL
2396 + ((fault_addr & VADDR_MASK) >> 9), 3); break;
2397 case 2: GUESS(0xfffff6fb40000000UL
2398 + ((fault_addr & VADDR_MASK) >> 18), 3); break;
2399 case 3: GUESS(0xfffff6fb7da00000UL
2400 + ((fault_addr & VADDR_MASK) >> 27), 3); break;
2403 /* 64bit Linux direct map at 0xffff880000000000; older kernels
2404 * had it at 0xffff810000000000, and older kernels yet had it
2405 * at 0x0000010000000000UL */
2406 gfn = mfn_to_gfn(v->domain, gmfn);
2407 GUESS(0xffff880000000000UL + (gfn << PAGE_SHIFT), 4);
2408 GUESS(0xffff810000000000UL + (gfn << PAGE_SHIFT), 4);
2409 GUESS(0x0000010000000000UL + (gfn << PAGE_SHIFT), 4);
2411 /*
2412 * 64bit Solaris kernel page map at
2413 * kpm_vbase; 0xfffffe0000000000UL
2414 */
2415 GUESS(0xfffffe0000000000UL + (gfn << PAGE_SHIFT), 4);
2417 /* FreeBSD 64bit: linear map 0xffff800000000000 */
2418 switch ( level )
2420 case 1: GUESS(0xffff800000000000
2421 + ((fault_addr & VADDR_MASK) >> 9), 6); break;
2422 case 2: GUESS(0xffff804000000000UL
2423 + ((fault_addr & VADDR_MASK) >> 18), 6); break;
2424 case 3: GUESS(0xffff804020000000UL
2425 + ((fault_addr & VADDR_MASK) >> 27), 6); break;
2427 /* FreeBSD 64bit: direct map at 0xffffff0000000000 */
2428 GUESS(0xffffff0000000000 + (gfn << PAGE_SHIFT), 6);
2430 #endif /* CONFIG_PAGING_LEVELS >= 4 */
2432 #undef GUESS
2435 if ( (pg->u.inuse.type_info & PGT_count_mask) == 0 )
2436 return 1;
2438 /* Second heuristic: on HIGHPTE linux, there are two particular PTEs
2439 * (entries in the fixmap) where linux maps its pagetables. Since
2440 * we expect to hit them most of the time, we start the search for
2441 * the writeable mapping by looking at the same MFN where the last
2442 * brute-force search succeeded. */
2444 if ( v->arch.paging.shadow.last_writeable_pte_smfn != 0 )
2446 unsigned long old_count = (pg->u.inuse.type_info & PGT_count_mask);
2447 mfn_t last_smfn = _mfn(v->arch.paging.shadow.last_writeable_pte_smfn);
2448 int shtype = mfn_to_page(last_smfn)->u.sh.type;
2450 if ( callbacks[shtype] )
2451 callbacks[shtype](v, last_smfn, gmfn);
2453 if ( (pg->u.inuse.type_info & PGT_count_mask) != old_count )
2454 perfc_incr(shadow_writeable_h_5);
2457 if ( (pg->u.inuse.type_info & PGT_count_mask) == 0 )
2458 return 1;
2460 #endif /* SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC */
2462 /* Brute-force search of all the shadows, by walking the hash */
2463 trace_shadow_wrmap_bf(gmfn);
2464 if ( level == 0 )
2465 perfc_incr(shadow_writeable_bf_1);
2466 else
2467 perfc_incr(shadow_writeable_bf);
2468 hash_foreach(v, callback_mask, callbacks, gmfn);
2470 /* If that didn't catch the mapping, then there's some non-pagetable
2471 * mapping -- ioreq page, grant mapping, &c. */
2472 if ( (mfn_to_page(gmfn)->u.inuse.type_info & PGT_count_mask) != 0 )
2474 if ( level == 0 )
2475 return -1;
2477 SHADOW_ERROR("can't remove write access to mfn %lx: guest has "
2478 "%lu special-use mappings of it\n", mfn_x(gmfn),
2479 (mfn_to_page(gmfn)->u.inuse.type_info&PGT_count_mask));
2480 domain_crash(v->domain);
2483 /* We killed at least one writeable mapping, so must flush TLBs. */
2484 return 1;
2487 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
2488 int sh_remove_write_access_from_sl1p(struct vcpu *v, mfn_t gmfn,
2489 mfn_t smfn, unsigned long off)
2491 struct page_info *sp = mfn_to_page(smfn);
2493 ASSERT(mfn_valid(smfn));
2494 ASSERT(mfn_valid(gmfn));
2496 if ( sp->u.sh.type == SH_type_l1_32_shadow
2497 || sp->u.sh.type == SH_type_fl1_32_shadow )
2499 return SHADOW_INTERNAL_NAME(sh_rm_write_access_from_sl1p,2)
2500 (v, gmfn, smfn, off);
2502 #if CONFIG_PAGING_LEVELS >= 3
2503 else if ( sp->u.sh.type == SH_type_l1_pae_shadow
2504 || sp->u.sh.type == SH_type_fl1_pae_shadow )
2505 return SHADOW_INTERNAL_NAME(sh_rm_write_access_from_sl1p,3)
2506 (v, gmfn, smfn, off);
2507 #if CONFIG_PAGING_LEVELS >= 4
2508 else if ( sp->u.sh.type == SH_type_l1_64_shadow
2509 || sp->u.sh.type == SH_type_fl1_64_shadow )
2510 return SHADOW_INTERNAL_NAME(sh_rm_write_access_from_sl1p,4)
2511 (v, gmfn, smfn, off);
2512 #endif
2513 #endif
2515 return 0;
2517 #endif
2519 /**************************************************************************/
2520 /* Remove all mappings of a guest frame from the shadow tables.
2521 * Returns non-zero if we need to flush TLBs. */
2523 int sh_remove_all_mappings(struct vcpu *v, mfn_t gmfn)
2525 struct page_info *page = mfn_to_page(gmfn);
2526 int expected_count, do_locking;
2528 /* Dispatch table for getting per-type functions */
2529 static hash_callback_t callbacks[SH_type_unused] = {
2530 NULL, /* none */
2531 SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1, 2), /* l1_32 */
2532 SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1, 2), /* fl1_32 */
2533 NULL, /* l2_32 */
2534 SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1, 3), /* l1_pae */
2535 SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1, 3), /* fl1_pae */
2536 NULL, /* l2_pae */
2537 NULL, /* l2h_pae */
2538 #if CONFIG_PAGING_LEVELS >= 4
2539 SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1, 4), /* l1_64 */
2540 SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1, 4), /* fl1_64 */
2541 #else
2542 NULL, /* l1_64 */
2543 NULL, /* fl1_64 */
2544 #endif
2545 NULL, /* l2_64 */
2546 NULL, /* l2h_64 */
2547 NULL, /* l3_64 */
2548 NULL, /* l4_64 */
2549 NULL, /* p2m */
2550 NULL /* unused */
2551 };
2553 static unsigned int callback_mask =
2554 1 << SH_type_l1_32_shadow
2555 | 1 << SH_type_fl1_32_shadow
2556 | 1 << SH_type_l1_pae_shadow
2557 | 1 << SH_type_fl1_pae_shadow
2558 | 1 << SH_type_l1_64_shadow
2559 | 1 << SH_type_fl1_64_shadow
2562 perfc_incr(shadow_mappings);
2563 if ( (page->count_info & PGC_count_mask) == 0 )
2564 return 0;
2566 /* Although this is an externally visible function, we do not know
2567 * whether the shadow lock will be held when it is called (since it
2568 * can be called via put_page_type when we clear a shadow l1e).
2569 * If the lock isn't held, take it for the duration of the call. */
2570 do_locking = !shadow_locked_by_me(v->domain);
2571 if ( do_locking ) shadow_lock(v->domain);
2573 /* XXX TODO:
2574 * Heuristics for finding the (probably) single mapping of this gmfn */
2576 /* Brute-force search of all the shadows, by walking the hash */
2577 perfc_incr(shadow_mappings_bf);
2578 hash_foreach(v, callback_mask, callbacks, gmfn);
2580 /* If that didn't catch the mapping, something is very wrong */
2581 expected_count = (page->count_info & PGC_allocated) ? 1 : 0;
2582 if ( (page->count_info & PGC_count_mask) != expected_count )
2584 /* Don't complain if we're in HVM and there are some extra mappings:
2585 * The qemu helper process has an untyped mapping of this dom's RAM
2586 * and the HVM restore program takes another. */
2587 if ( !(shadow_mode_external(v->domain)
2588 && (page->count_info & PGC_count_mask) <= 3
2589 && (page->u.inuse.type_info & PGT_count_mask) == 0) )
2591 SHADOW_ERROR("can't find all mappings of mfn %lx: "
2592 "c=%08lx t=%08lx\n", mfn_x(gmfn),
2593 page->count_info, page->u.inuse.type_info);
2597 if ( do_locking ) shadow_unlock(v->domain);
2599 /* We killed at least one mapping, so must flush TLBs. */
2600 return 1;
2604 /**************************************************************************/
2605 /* Remove all shadows of a guest frame from the shadow tables */
2607 static int sh_remove_shadow_via_pointer(struct vcpu *v, mfn_t smfn)
2608 /* Follow this shadow's up-pointer, if it has one, and remove the reference
2609 * found there. Returns 1 if that was the only reference to this shadow */
2611 struct page_info *sp = mfn_to_page(smfn);
2612 mfn_t pmfn;
2613 void *vaddr;
2614 int rc;
2616 ASSERT(sp->u.sh.type > 0);
2617 ASSERT(sp->u.sh.type < SH_type_max_shadow);
2618 ASSERT(sp->u.sh.type != SH_type_l2_32_shadow);
2619 ASSERT(sp->u.sh.type != SH_type_l2_pae_shadow);
2620 ASSERT(sp->u.sh.type != SH_type_l2h_pae_shadow);
2621 ASSERT(sp->u.sh.type != SH_type_l4_64_shadow);
2623 if (sp->up == 0) return 0;
2624 pmfn = _mfn(sp->up >> PAGE_SHIFT);
2625 ASSERT(mfn_valid(pmfn));
2626 vaddr = sh_map_domain_page(pmfn);
2627 ASSERT(vaddr);
2628 vaddr += sp->up & (PAGE_SIZE-1);
2629 ASSERT(l1e_get_pfn(*(l1_pgentry_t *)vaddr) == mfn_x(smfn));
2631 /* Is this the only reference to this shadow? */
2632 rc = (sp->u.sh.count == 1) ? 1 : 0;
2634 /* Blank the offending entry */
2635 switch (sp->u.sh.type)
2637 case SH_type_l1_32_shadow:
2638 case SH_type_l2_32_shadow:
2639 SHADOW_INTERNAL_NAME(sh_clear_shadow_entry, 2)(v, vaddr, pmfn);
2640 break;
2641 case SH_type_l1_pae_shadow:
2642 case SH_type_l2_pae_shadow:
2643 case SH_type_l2h_pae_shadow:
2644 SHADOW_INTERNAL_NAME(sh_clear_shadow_entry, 3)(v, vaddr, pmfn);
2645 break;
2646 #if CONFIG_PAGING_LEVELS >= 4
2647 case SH_type_l1_64_shadow:
2648 case SH_type_l2_64_shadow:
2649 case SH_type_l2h_64_shadow:
2650 case SH_type_l3_64_shadow:
2651 case SH_type_l4_64_shadow:
2652 SHADOW_INTERNAL_NAME(sh_clear_shadow_entry, 4)(v, vaddr, pmfn);
2653 break;
2654 #endif
2655 default: BUG(); /* Some wierd unknown shadow type */
2658 sh_unmap_domain_page(vaddr);
2659 if ( rc )
2660 perfc_incr(shadow_up_pointer);
2661 else
2662 perfc_incr(shadow_unshadow_bf);
2664 return rc;
2667 void sh_remove_shadows(struct vcpu *v, mfn_t gmfn, int fast, int all)
2668 /* Remove the shadows of this guest page.
2669 * If fast != 0, just try the quick heuristic, which will remove
2670 * at most one reference to each shadow of the page. Otherwise, walk
2671 * all the shadow tables looking for refs to shadows of this gmfn.
2672 * If all != 0, kill the domain if we can't find all the shadows.
2673 * (all != 0 implies fast == 0)
2674 */
2676 struct page_info *pg = mfn_to_page(gmfn);
2677 mfn_t smfn;
2678 int do_locking;
2679 unsigned char t;
2681 /* Dispatch table for getting per-type functions: each level must
2682 * be called with the function to remove a lower-level shadow. */
2683 static hash_callback_t callbacks[SH_type_unused] = {
2684 NULL, /* none */
2685 NULL, /* l1_32 */
2686 NULL, /* fl1_32 */
2687 SHADOW_INTERNAL_NAME(sh_remove_l1_shadow, 2), /* l2_32 */
2688 NULL, /* l1_pae */
2689 NULL, /* fl1_pae */
2690 SHADOW_INTERNAL_NAME(sh_remove_l1_shadow, 3), /* l2_pae */
2691 SHADOW_INTERNAL_NAME(sh_remove_l1_shadow, 3), /* l2h_pae */
2692 NULL, /* l1_64 */
2693 NULL, /* fl1_64 */
2694 #if CONFIG_PAGING_LEVELS >= 4
2695 SHADOW_INTERNAL_NAME(sh_remove_l1_shadow, 4), /* l2_64 */
2696 SHADOW_INTERNAL_NAME(sh_remove_l1_shadow, 4), /* l2h_64 */
2697 SHADOW_INTERNAL_NAME(sh_remove_l2_shadow, 4), /* l3_64 */
2698 SHADOW_INTERNAL_NAME(sh_remove_l3_shadow, 4), /* l4_64 */
2699 #else
2700 NULL, /* l2_64 */
2701 NULL, /* l2h_64 */
2702 NULL, /* l3_64 */
2703 NULL, /* l4_64 */
2704 #endif
2705 NULL, /* p2m */
2706 NULL /* unused */
2707 };
2709 /* Another lookup table, for choosing which mask to use */
2710 static unsigned int masks[SH_type_unused] = {
2711 0, /* none */
2712 1 << SH_type_l2_32_shadow, /* l1_32 */
2713 0, /* fl1_32 */
2714 0, /* l2_32 */
2715 ((1 << SH_type_l2h_pae_shadow)
2716 | (1 << SH_type_l2_pae_shadow)), /* l1_pae */
2717 0, /* fl1_pae */
2718 0, /* l2_pae */
2719 0, /* l2h_pae */
2720 ((1 << SH_type_l2h_64_shadow)
2721 | (1 << SH_type_l2_64_shadow)), /* l1_64 */
2722 0, /* fl1_64 */
2723 1 << SH_type_l3_64_shadow, /* l2_64 */
2724 1 << SH_type_l3_64_shadow, /* l2h_64 */
2725 1 << SH_type_l4_64_shadow, /* l3_64 */
2726 0, /* l4_64 */
2727 0, /* p2m */
2728 0 /* unused */
2729 };
2731 ASSERT(!(all && fast));
2733 /* Although this is an externally visible function, we do not know
2734 * whether the shadow lock will be held when it is called (since it
2735 * can be called via put_page_type when we clear a shadow l1e).
2736 * If the lock isn't held, take it for the duration of the call. */
2737 do_locking = !shadow_locked_by_me(v->domain);
2738 if ( do_locking ) shadow_lock(v->domain);
2740 SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx\n",
2741 v->domain->domain_id, v->vcpu_id, mfn_x(gmfn));
2743 /* Bail out now if the page is not shadowed */
2744 if ( (pg->count_info & PGC_page_table) == 0 )
2746 if ( do_locking ) shadow_unlock(v->domain);
2747 return;
2750 /* Search for this shadow in all appropriate shadows */
2751 perfc_incr(shadow_unshadow);
2753 /* Lower-level shadows need to be excised from upper-level shadows.
2754 * This call to hash_foreach() looks dangerous but is in fact OK: each
2755 * call will remove at most one shadow, and terminate immediately when
2756 * it does remove it, so we never walk the hash after doing a deletion. */
2757 #define DO_UNSHADOW(_type) do { \
2758 t = (_type); \
2759 if( !(pg->count_info & PGC_page_table) \
2760 || !(pg->shadow_flags & (1 << t)) ) \
2761 break; \
2762 smfn = shadow_hash_lookup(v, mfn_x(gmfn), t); \
2763 if ( unlikely(!mfn_valid(smfn)) ) \
2764 { \
2765 SHADOW_ERROR(": gmfn %#lx has flags 0x%"PRIx32 \
2766 " but no type-0x%"PRIx32" shadow\n", \
2767 mfn_x(gmfn), (uint32_t)pg->shadow_flags, t); \
2768 break; \
2769 } \
2770 if ( sh_type_is_pinnable(v, t) ) \
2771 sh_unpin(v, smfn); \
2772 else \
2773 sh_remove_shadow_via_pointer(v, smfn); \
2774 if( !fast \
2775 && (pg->count_info & PGC_page_table) \
2776 && (pg->shadow_flags & (1 << t)) ) \
2777 hash_foreach(v, masks[t], callbacks, smfn); \
2778 } while (0)
2780 DO_UNSHADOW(SH_type_l2_32_shadow);
2781 DO_UNSHADOW(SH_type_l1_32_shadow);
2782 DO_UNSHADOW(SH_type_l2h_pae_shadow);
2783 DO_UNSHADOW(SH_type_l2_pae_shadow);
2784 DO_UNSHADOW(SH_type_l1_pae_shadow);
2785 #if CONFIG_PAGING_LEVELS >= 4
2786 DO_UNSHADOW(SH_type_l4_64_shadow);
2787 DO_UNSHADOW(SH_type_l3_64_shadow);
2788 DO_UNSHADOW(SH_type_l2h_64_shadow);
2789 DO_UNSHADOW(SH_type_l2_64_shadow);
2790 DO_UNSHADOW(SH_type_l1_64_shadow);
2791 #endif
2793 #undef DO_UNSHADOW
2795 /* If that didn't catch the shadows, something is wrong */
2796 if ( !fast && all && (pg->count_info & PGC_page_table) )
2798 SHADOW_ERROR("can't find all shadows of mfn %05lx "
2799 "(shadow_flags=%08x)\n",
2800 mfn_x(gmfn), pg->shadow_flags);
2801 domain_crash(v->domain);
2804 /* Need to flush TLBs now, so that linear maps are safe next time we
2805 * take a fault. */
2806 flush_tlb_mask(&v->domain->domain_dirty_cpumask);
2808 if ( do_locking ) shadow_unlock(v->domain);
2811 static void
2812 sh_remove_all_shadows_and_parents(struct vcpu *v, mfn_t gmfn)
2813 /* Even harsher: this is a HVM page that we thing is no longer a pagetable.
2814 * Unshadow it, and recursively unshadow pages that reference it. */
2816 sh_remove_shadows(v, gmfn, 0, 1);
2817 /* XXX TODO:
2818 * Rework this hashtable walker to return a linked-list of all
2819 * the shadows it modified, then do breadth-first recursion
2820 * to find the way up to higher-level tables and unshadow them too.
2822 * The current code (just tearing down each page's shadows as we
2823 * detect that it is not a pagetable) is correct, but very slow.
2824 * It means extra emulated writes and slows down removal of mappings. */
2827 /**************************************************************************/
2829 static void sh_update_paging_modes(struct vcpu *v)
2831 struct domain *d = v->domain;
2832 struct paging_mode *old_mode = v->arch.paging.mode;
2834 ASSERT(shadow_locked_by_me(d));
2836 #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
2837 /* Make sure this vcpu has a virtual TLB array allocated */
2838 if ( unlikely(!v->arch.paging.vtlb) )
2840 v->arch.paging.vtlb = xmalloc_array(struct shadow_vtlb, VTLB_ENTRIES);
2841 if ( unlikely(!v->arch.paging.vtlb) )
2843 SHADOW_ERROR("Could not allocate vTLB space for dom %u vcpu %u\n",
2844 d->domain_id, v->vcpu_id);
2845 domain_crash(v->domain);
2846 return;
2848 memset(v->arch.paging.vtlb, 0,
2849 VTLB_ENTRIES * sizeof (struct shadow_vtlb));
2850 spin_lock_init(&v->arch.paging.vtlb_lock);
2852 #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */
2854 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
2855 if ( mfn_x(v->arch.paging.shadow.oos_snapshot[0]) == INVALID_MFN )
2857 int i;
2858 for(i = 0; i < SHADOW_OOS_PAGES; i++)
2860 shadow_prealloc(d, SH_type_oos_snapshot, 1);
2861 v->arch.paging.shadow.oos_snapshot[i] =
2862 shadow_alloc(d, SH_type_oos_snapshot, 0);
2865 #endif /* OOS */
2867 // Valid transitions handled by this function:
2868 // - For PV guests:
2869 // - after a shadow mode has been changed
2870 // - For HVM guests:
2871 // - after a shadow mode has been changed
2872 // - changes in CR0.PG, CR4.PAE, CR4.PSE, or CR4.PGE
2873 //
2875 // First, tear down any old shadow tables held by this vcpu.
2876 //
2877 if ( v->arch.paging.mode )
2878 v->arch.paging.mode->shadow.detach_old_tables(v);
2880 if ( !is_hvm_domain(d) )
2882 ///
2883 /// PV guest
2884 ///
2885 #if CONFIG_PAGING_LEVELS == 4
2886 v->arch.paging.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode, 4);
2887 #else /* CONFIG_PAGING_LEVELS == 3 */
2888 v->arch.paging.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode, 3);
2889 #endif
2891 else
2893 ///
2894 /// HVM guest
2895 ///
2896 ASSERT(shadow_mode_translate(d));
2897 ASSERT(shadow_mode_external(d));
2899 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
2900 /* Need to resync all our pages now, because if a page goes out
2901 * of sync with paging enabled and is resynced with paging
2902 * disabled, the resync will go wrong. */
2903 shadow_resync_all(v, 0);
2904 #endif /* OOS */
2906 if ( !hvm_paging_enabled(v) )
2908 /* When the guest has CR0.PG clear, we provide a 32-bit, non-PAE
2909 * pagetable for it, mapping 4 GB one-to-one using a single l2
2910 * page of 1024 superpage mappings */
2911 v->arch.guest_table = d->arch.paging.shadow.unpaged_pagetable;
2912 v->arch.paging.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode, 2);
2914 else
2916 #ifdef __x86_64__
2917 if ( hvm_long_mode_enabled(v) )
2919 // long mode guest...
2920 v->arch.paging.mode =
2921 &SHADOW_INTERNAL_NAME(sh_paging_mode, 4);
2923 else
2924 #endif
2925 if ( hvm_pae_enabled(v) )
2927 // 32-bit PAE mode guest...
2928 v->arch.paging.mode =
2929 &SHADOW_INTERNAL_NAME(sh_paging_mode, 3);
2931 else
2933 // 32-bit 2 level guest...
2934 v->arch.paging.mode =
2935 &SHADOW_INTERNAL_NAME(sh_paging_mode, 2);
2939 if ( pagetable_is_null(v->arch.monitor_table) )
2941 mfn_t mmfn = v->arch.paging.mode->shadow.make_monitor_table(v);
2942 v->arch.monitor_table = pagetable_from_mfn(mmfn);
2943 make_cr3(v, mfn_x(mmfn));
2944 hvm_update_host_cr3(v);
2947 if ( v->arch.paging.mode != old_mode )
2949 SHADOW_PRINTK("new paging mode: d=%u v=%u pe=%d gl=%u "
2950 "(was g=%u s=%u)\n",
2951 d->domain_id, v->vcpu_id,
2952 is_hvm_domain(d) ? hvm_paging_enabled(v) : 1,
2953 v->arch.paging.mode->guest_levels,
2954 v->arch.paging.mode->shadow.shadow_levels,
2955 old_mode ? old_mode->guest_levels : 0,
2956 old_mode ? old_mode->shadow.shadow_levels : 0);
2957 if ( old_mode &&
2958 (v->arch.paging.mode->shadow.shadow_levels !=
2959 old_mode->shadow.shadow_levels) )
2961 /* Need to make a new monitor table for the new mode */
2962 mfn_t new_mfn, old_mfn;
2964 if ( v != current && vcpu_runnable(v) )
2966 SHADOW_ERROR("Some third party (d=%u v=%u) is changing "
2967 "this HVM vcpu's (d=%u v=%u) paging mode "
2968 "while it is running.\n",
2969 current->domain->domain_id, current->vcpu_id,
2970 v->domain->domain_id, v->vcpu_id);
2971 /* It's not safe to do that because we can't change
2972 * the host CR3 for a running domain */
2973 domain_crash(v->domain);
2974 return;
2977 old_mfn = pagetable_get_mfn(v->arch.monitor_table);
2978 v->arch.monitor_table = pagetable_null();
2979 new_mfn = v->arch.paging.mode->shadow.make_monitor_table(v);
2980 v->arch.monitor_table = pagetable_from_mfn(new_mfn);
2981 SHADOW_PRINTK("new monitor table %"PRI_mfn "\n",
2982 mfn_x(new_mfn));
2984 /* Don't be running on the old monitor table when we
2985 * pull it down! Switch CR3, and warn the HVM code that
2986 * its host cr3 has changed. */
2987 make_cr3(v, mfn_x(new_mfn));
2988 if ( v == current )
2989 write_ptbase(v);
2990 hvm_update_host_cr3(v);
2991 old_mode->shadow.destroy_monitor_table(v, old_mfn);
2995 // XXX -- Need to deal with changes in CR4.PSE and CR4.PGE.
2996 // These are HARD: think about the case where two CPU's have
2997 // different values for CR4.PSE and CR4.PGE at the same time.
2998 // This *does* happen, at least for CR4.PGE...
3001 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
3002 /* We need to check that all the vcpus have paging enabled to
3003 * unsync PTs. */
3004 if ( is_hvm_domain(d) )
3006 int pe = 1;
3007 struct vcpu *vptr;
3009 for_each_vcpu(d, vptr)
3011 if ( !hvm_paging_enabled(vptr) )
3013 pe = 0;
3014 break;
3018 d->arch.paging.shadow.oos_active = pe;
3020 #endif /* OOS */
3022 v->arch.paging.mode->update_cr3(v, 0);
3025 void shadow_update_paging_modes(struct vcpu *v)
3027 shadow_lock(v->domain);
3028 sh_update_paging_modes(v);
3029 shadow_unlock(v->domain);
3032 /**************************************************************************/
3033 /* Turning on and off shadow features */
3035 static void sh_new_mode(struct domain *d, u32 new_mode)
3036 /* Inform all the vcpus that the shadow mode has been changed */
3038 struct vcpu *v;
3040 ASSERT(shadow_locked_by_me(d));
3041 ASSERT(d != current->domain);
3042 d->arch.paging.mode = new_mode;
3043 for_each_vcpu(d, v)
3044 sh_update_paging_modes(v);
3047 int shadow_enable(struct domain *d, u32 mode)
3048 /* Turn on "permanent" shadow features: external, translate, refcount.
3049 * Can only be called once on a domain, and these features cannot be
3050 * disabled.
3051 * Returns 0 for success, -errno for failure. */
3053 unsigned int old_pages;
3054 struct page_info *pg = NULL;
3055 uint32_t *e;
3056 int i, rv = 0;
3058 mode |= PG_SH_enable;
3060 domain_pause(d);
3062 /* Sanity check the arguments */
3063 if ( (d == current->domain) ||
3064 shadow_mode_enabled(d) ||
3065 ((mode & PG_translate) && !(mode & PG_refcounts)) ||
3066 ((mode & PG_external) && !(mode & PG_translate)) )
3068 rv = -EINVAL;
3069 goto out_unlocked;
3072 /* Init the shadow memory allocation if the user hasn't done so */
3073 old_pages = d->arch.paging.shadow.total_pages;
3074 if ( old_pages == 0 )
3076 unsigned int r;
3077 shadow_lock(d);
3078 r = sh_set_allocation(d, 256, NULL); /* Use at least 1MB */
3079 if ( r != 0 )
3081 sh_set_allocation(d, 0, NULL);
3082 rv = -ENOMEM;
3083 goto out_locked;
3085 shadow_unlock(d);
3088 /* Init the P2M table. Must be done before we take the shadow lock
3089 * to avoid possible deadlock. */
3090 if ( mode & PG_translate )
3092 rv = p2m_alloc_table(d, shadow_alloc_p2m_page, shadow_free_p2m_page);
3093 if (rv != 0)
3094 goto out_unlocked;
3097 /* HVM domains need an extra pagetable for vcpus that think they
3098 * have paging disabled */
3099 if ( is_hvm_domain(d) )
3101 /* Get a single page from the shadow pool. Take it via the
3102 * P2M interface to make freeing it simpler afterwards. */
3103 pg = shadow_alloc_p2m_page(d);
3104 if ( pg == NULL )
3106 rv = -ENOMEM;
3107 goto out_unlocked;
3109 /* Fill it with 32-bit, non-PAE superpage entries, each mapping 4MB
3110 * of virtual address space onto the same physical address range */
3111 e = sh_map_domain_page(page_to_mfn(pg));
3112 for ( i = 0; i < PAGE_SIZE / sizeof(*e); i++ )
3113 e[i] = ((0x400000U * i)
3114 | _PAGE_PRESENT | _PAGE_RW | _PAGE_USER
3115 | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
3116 sh_unmap_domain_page(e);
3117 pg->u.inuse.type_info = PGT_l2_page_table | 1 | PGT_validated;
3120 shadow_lock(d);
3122 /* Sanity check again with the lock held */
3123 if ( shadow_mode_enabled(d) )
3125 rv = -EINVAL;
3126 goto out_locked;
3129 /* Init the hash table */
3130 if ( shadow_hash_alloc(d) != 0 )
3132 rv = -ENOMEM;
3133 goto out_locked;
3136 #if (SHADOW_OPTIMIZATIONS & SHOPT_LINUX_L3_TOPLEVEL)
3137 /* We assume we're dealing with an older 64bit linux guest until we
3138 * see the guest use more than one l4 per vcpu. */
3139 d->arch.paging.shadow.opt_flags = SHOPT_LINUX_L3_TOPLEVEL;
3140 #endif
3142 /* Record the 1-to-1 pagetable we just made */
3143 if ( is_hvm_domain(d) )
3144 d->arch.paging.shadow.unpaged_pagetable = pagetable_from_page(pg);
3146 /* Update the bits */
3147 sh_new_mode(d, mode);
3149 out_locked:
3150 shadow_unlock(d);
3151 out_unlocked:
3152 if ( rv != 0 && !pagetable_is_null(d->arch.phys_table) )
3153 p2m_teardown(d);
3154 if ( rv != 0 && pg != NULL )
3155 shadow_free_p2m_page(d, pg);
3156 domain_unpause(d);
3157 return rv;
3160 void shadow_teardown(struct domain *d)
3161 /* Destroy the shadow pagetables of this domain and free its shadow memory.
3162 * Should only be called for dying domains. */
3164 struct vcpu *v;
3165 mfn_t mfn;
3166 struct page_info *pg;
3168 ASSERT(d->is_dying);
3169 ASSERT(d != current->domain);
3171 if ( !shadow_locked_by_me(d) )
3172 shadow_lock(d); /* Keep various asserts happy */
3174 if ( shadow_mode_enabled(d) )
3176 /* Release the shadow and monitor tables held by each vcpu */
3177 for_each_vcpu(d, v)
3179 if ( v->arch.paging.mode )
3181 v->arch.paging.mode->shadow.detach_old_tables(v);
3182 if ( shadow_mode_external(d) )
3184 mfn = pagetable_get_mfn(v->arch.monitor_table);
3185 if ( mfn_valid(mfn) && (mfn_x(mfn) != 0) )
3186 v->arch.paging.mode->shadow.destroy_monitor_table(v, mfn);
3187 v->arch.monitor_table = pagetable_null();
3193 #if (SHADOW_OPTIMIZATIONS & (SHOPT_VIRTUAL_TLB|SHOPT_OUT_OF_SYNC))
3194 /* Free the virtual-TLB array attached to each vcpu */
3195 for_each_vcpu(d, v)
3197 #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
3198 if ( v->arch.paging.vtlb )
3200 xfree(v->arch.paging.vtlb);
3201 v->arch.paging.vtlb = NULL;
3203 #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */
3205 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
3207 int i;
3208 mfn_t *oos_snapshot = v->arch.paging.shadow.oos_snapshot;
3209 for(i = 0; i < SHADOW_OOS_PAGES; i++)
3210 if ( mfn_valid(oos_snapshot[i]) )
3211 shadow_free(d, oos_snapshot[i]);
3213 #endif /* OOS */
3215 #endif /* (SHADOW_OPTIMIZATIONS & (SHOPT_VIRTUAL_TLB|SHOPT_OUT_OF_SYNC)) */
3217 while ( (pg = page_list_remove_head(&d->arch.paging.shadow.p2m_freelist)) )
3218 shadow_free_p2m_page(d, pg);
3220 if ( d->arch.paging.shadow.total_pages != 0 )
3222 SHADOW_PRINTK("teardown of domain %u starts."
3223 " Shadow pages total = %u, free = %u, p2m=%u\n",
3224 d->domain_id,
3225 d->arch.paging.shadow.total_pages,
3226 d->arch.paging.shadow.free_pages,
3227 d->arch.paging.shadow.p2m_pages);
3228 /* Destroy all the shadows and release memory to domheap */
3229 sh_set_allocation(d, 0, NULL);
3230 /* Release the hash table back to xenheap */
3231 if (d->arch.paging.shadow.hash_table)
3232 shadow_hash_teardown(d);
3233 /* Should not have any more memory held */
3234 SHADOW_PRINTK("teardown done."
3235 " Shadow pages total = %u, free = %u, p2m=%u\n",
3236 d->arch.paging.shadow.total_pages,
3237 d->arch.paging.shadow.free_pages,
3238 d->arch.paging.shadow.p2m_pages);
3239 ASSERT(d->arch.paging.shadow.total_pages == 0);
3242 /* Free the non-paged-vcpus pagetable; must happen after we've
3243 * destroyed any shadows of it or sh_destroy_shadow will get confused. */
3244 if ( !pagetable_is_null(d->arch.paging.shadow.unpaged_pagetable) )
3246 for_each_vcpu(d, v)
3248 ASSERT(is_hvm_vcpu(v));
3249 if ( !hvm_paging_enabled(v) )
3250 v->arch.guest_table = pagetable_null();
3252 shadow_free_p2m_page(d,
3253 pagetable_get_page(d->arch.paging.shadow.unpaged_pagetable));
3254 d->arch.paging.shadow.unpaged_pagetable = pagetable_null();
3257 /* We leave the "permanent" shadow modes enabled, but clear the
3258 * log-dirty mode bit. We don't want any more mark_dirty()
3259 * calls now that we've torn down the bitmap */
3260 d->arch.paging.mode &= ~PG_log_dirty;
3262 if (d->arch.hvm_domain.dirty_vram) {
3263 xfree(d->arch.hvm_domain.dirty_vram->sl1ma);
3264 xfree(d->arch.hvm_domain.dirty_vram->dirty_bitmap);
3265 xfree(d->arch.hvm_domain.dirty_vram);
3266 d->arch.hvm_domain.dirty_vram = NULL;
3269 shadow_unlock(d);
3272 void shadow_final_teardown(struct domain *d)
3273 /* Called by arch_domain_destroy(), when it's safe to pull down the p2m map. */
3275 SHADOW_PRINTK("dom %u final teardown starts."
3276 " Shadow pages total = %u, free = %u, p2m=%u\n",
3277 d->domain_id,
3278 d->arch.paging.shadow.total_pages,
3279 d->arch.paging.shadow.free_pages,
3280 d->arch.paging.shadow.p2m_pages);
3282 /* Double-check that the domain didn't have any shadow memory.
3283 * It is possible for a domain that never got domain_kill()ed
3284 * to get here with its shadow allocation intact. */
3285 if ( d->arch.paging.shadow.total_pages != 0 )
3286 shadow_teardown(d);
3288 /* It is now safe to pull down the p2m map. */
3289 p2m_teardown(d);
3291 SHADOW_PRINTK("dom %u final teardown done."
3292 " Shadow pages total = %u, free = %u, p2m=%u\n",
3293 d->domain_id,
3294 d->arch.paging.shadow.total_pages,
3295 d->arch.paging.shadow.free_pages,
3296 d->arch.paging.shadow.p2m_pages);
3299 static int shadow_one_bit_enable(struct domain *d, u32 mode)
3300 /* Turn on a single shadow mode feature */
3302 ASSERT(shadow_locked_by_me(d));
3304 /* Sanity check the call */
3305 if ( d == current->domain || (d->arch.paging.mode & mode) == mode )
3307 return -EINVAL;
3310 mode |= PG_SH_enable;
3312 if ( d->arch.paging.mode == 0 )
3314 /* Init the shadow memory allocation and the hash table */
3315 if ( sh_set_allocation(d, 1, NULL) != 0
3316 || shadow_hash_alloc(d) != 0 )
3318 sh_set_allocation(d, 0, NULL);
3319 return -ENOMEM;
3323 /* Update the bits */
3324 sh_new_mode(d, d->arch.paging.mode | mode);
3326 return 0;
3329 static int shadow_one_bit_disable(struct domain *d, u32 mode)
3330 /* Turn off a single shadow mode feature */
3332 struct vcpu *v;
3333 ASSERT(shadow_locked_by_me(d));
3335 /* Sanity check the call */
3336 if ( d == current->domain || !((d->arch.paging.mode & mode) == mode) )
3338 return -EINVAL;
3341 /* Update the bits */
3342 sh_new_mode(d, d->arch.paging.mode & ~mode);
3343 if ( d->arch.paging.mode == 0 )
3345 /* Get this domain off shadows */
3346 SHADOW_PRINTK("un-shadowing of domain %u starts."
3347 " Shadow pages total = %u, free = %u, p2m=%u\n",
3348 d->domain_id,
3349 d->arch.paging.shadow.total_pages,
3350 d->arch.paging.shadow.free_pages,
3351 d->arch.paging.shadow.p2m_pages);
3352 for_each_vcpu(d, v)
3354 if ( v->arch.paging.mode )
3355 v->arch.paging.mode->shadow.detach_old_tables(v);
3356 #if CONFIG_PAGING_LEVELS == 4
3357 if ( !(v->arch.flags & TF_kernel_mode) )
3358 make_cr3(v, pagetable_get_pfn(v->arch.guest_table_user));
3359 else
3360 #endif
3361 make_cr3(v, pagetable_get_pfn(v->arch.guest_table));
3365 /* Pull down the memory allocation */
3366 if ( sh_set_allocation(d, 0, NULL) != 0 )
3368 // XXX - How can this occur?
3369 // Seems like a bug to return an error now that we've
3370 // disabled the relevant shadow mode.
3371 //
3372 return -ENOMEM;
3374 shadow_hash_teardown(d);
3375 SHADOW_PRINTK("un-shadowing of domain %u done."
3376 " Shadow pages total = %u, free = %u, p2m=%u\n",
3377 d->domain_id,
3378 d->arch.paging.shadow.total_pages,
3379 d->arch.paging.shadow.free_pages,
3380 d->arch.paging.shadow.p2m_pages);
3383 return 0;
3386 /* Enable/disable ops for the "test" and "log-dirty" modes */
3387 static int shadow_test_enable(struct domain *d)
3389 int ret;
3391 domain_pause(d);
3392 shadow_lock(d);
3393 ret = shadow_one_bit_enable(d, PG_SH_enable);
3394 shadow_unlock(d);
3395 domain_unpause(d);
3397 return ret;
3400 static int shadow_test_disable(struct domain *d)
3402 int ret;
3404 domain_pause(d);
3405 shadow_lock(d);
3406 ret = shadow_one_bit_disable(d, PG_SH_enable);
3407 shadow_unlock(d);
3408 domain_unpause(d);
3410 return ret;
3413 /**************************************************************************/
3414 /* P2M map manipulations */
3416 /* shadow specific code which should be called when P2M table entry is updated
3417 * with new content. It is responsible for update the entry, as well as other
3418 * shadow processing jobs.
3419 */
3420 void
3421 shadow_write_p2m_entry(struct vcpu *v, unsigned long gfn,
3422 l1_pgentry_t *p, mfn_t table_mfn,
3423 l1_pgentry_t new, unsigned int level)
3425 struct domain *d = v->domain;
3427 shadow_lock(d);
3429 /* If we're removing an MFN from the p2m, remove it from the shadows too */
3430 if ( level == 1 )
3432 mfn_t mfn = _mfn(l1e_get_pfn(*p));
3433 p2m_type_t p2mt = p2m_flags_to_type(l1e_get_flags(*p));
3434 if ( p2m_is_valid(p2mt) && mfn_valid(mfn) )
3436 sh_remove_all_shadows_and_parents(v, mfn);
3437 if ( sh_remove_all_mappings(v, mfn) )
3438 flush_tlb_mask(&d->domain_dirty_cpumask);
3442 /* If we're removing a superpage mapping from the p2m, we need to check
3443 * all the pages covered by it. If they're still there in the new
3444 * scheme, that's OK, but otherwise they must be unshadowed. */
3445 if ( level == 2 && (l1e_get_flags(*p) & _PAGE_PRESENT) &&
3446 (l1e_get_flags(*p) & _PAGE_PSE) )
3448 unsigned int i;
3449 cpumask_t flushmask;
3450 mfn_t omfn = _mfn(l1e_get_pfn(*p));
3451 mfn_t nmfn = _mfn(l1e_get_pfn(new));
3452 l1_pgentry_t *npte = NULL;
3453 p2m_type_t p2mt = p2m_flags_to_type(l1e_get_flags(*p));
3454 if ( p2m_is_valid(p2mt) && mfn_valid(omfn) )
3456 cpus_clear(flushmask);
3458 /* If we're replacing a superpage with a normal L1 page, map it */
3459 if ( (l1e_get_flags(new) & _PAGE_PRESENT)
3460 && !(l1e_get_flags(new) & _PAGE_PSE)
3461 && mfn_valid(nmfn) )
3462 npte = map_domain_page(mfn_x(nmfn));
3464 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
3466 if ( !npte
3467 || !p2m_is_ram(p2m_flags_to_type(l1e_get_flags(npte[i])))
3468 || l1e_get_pfn(npte[i]) != mfn_x(omfn) )
3470 /* This GFN->MFN mapping has gone away */
3471 sh_remove_all_shadows_and_parents(v, omfn);
3472 if ( sh_remove_all_mappings(v, omfn) )
3473 cpus_or(flushmask, flushmask, d->domain_dirty_cpumask);
3475 omfn = _mfn(mfn_x(omfn) + 1);
3477 flush_tlb_mask(&flushmask);
3479 if ( npte )
3480 unmap_domain_page(npte);
3484 /* Update the entry with new content */
3485 safe_write_pte(p, new);
3487 /* install P2M in monitors for PAE Xen */
3488 #if CONFIG_PAGING_LEVELS == 3
3489 if ( level == 3 )
3490 /* We have written to the p2m l3: need to sync the per-vcpu
3491 * copies of it in the monitor tables */
3492 p2m_install_entry_in_monitors(d, (l3_pgentry_t *)p);
3493 #endif
3495 #if (SHADOW_OPTIMIZATIONS & SHOPT_FAST_FAULT_PATH)
3496 /* If we're doing FAST_FAULT_PATH, then shadow mode may have
3497 cached the fact that this is an mmio region in the shadow
3498 page tables. Blow the tables away to remove the cache.
3499 This is pretty heavy handed, but this is a rare operation
3500 (it might happen a dozen times during boot and then never
3501 again), so it doesn't matter too much. */
3502 if ( d->arch.paging.shadow.has_fast_mmio_entries )
3504 shadow_blow_tables(d);
3505 d->arch.paging.shadow.has_fast_mmio_entries = 0;
3507 #endif
3509 shadow_unlock(d);
3512 /**************************************************************************/
3513 /* Log-dirty mode support */
3515 /* Shadow specific code which is called in paging_log_dirty_enable().
3516 * Return 0 if no problem found.
3517 */
3518 int shadow_enable_log_dirty(struct domain *d)
3520 int ret;
3522 /* shadow lock is required here */
3523 shadow_lock(d);
3524 if ( shadow_mode_enabled(d) )
3526 /* This domain already has some shadows: need to clear them out
3527 * of the way to make sure that all references to guest memory are
3528 * properly write-protected */
3529 shadow_blow_tables(d);
3532 #if (SHADOW_OPTIMIZATIONS & SHOPT_LINUX_L3_TOPLEVEL)
3533 /* 32bit PV guests on 64bit xen behave like older 64bit linux: they
3534 * change an l4e instead of cr3 to switch tables. Give them the
3535 * same optimization */
3536 if ( is_pv_32on64_domain(d) )
3537 d->arch.paging.shadow.opt_flags = SHOPT_LINUX_L3_TOPLEVEL;
3538 #endif
3540 ret = shadow_one_bit_enable(d, PG_log_dirty);
3541 shadow_unlock(d);
3543 return ret;
3546 /* shadow specfic code which is called in paging_log_dirty_disable() */
3547 int shadow_disable_log_dirty(struct domain *d)
3549 int ret;
3551 /* shadow lock is required here */
3552 shadow_lock(d);
3553 ret = shadow_one_bit_disable(d, PG_log_dirty);
3554 shadow_unlock(d);
3556 return ret;
3559 /* This function is called when we CLEAN log dirty bitmap. See
3560 * paging_log_dirty_op() for details.
3561 */
3562 void shadow_clean_dirty_bitmap(struct domain *d)
3564 shadow_lock(d);
3565 /* Need to revoke write access to the domain's pages again.
3566 * In future, we'll have a less heavy-handed approach to this,
3567 * but for now, we just unshadow everything except Xen. */
3568 shadow_blow_tables(d);
3569 shadow_unlock(d);
3573 /**************************************************************************/
3574 /* VRAM dirty tracking support */
3575 int shadow_track_dirty_vram(struct domain *d,
3576 unsigned long begin_pfn,
3577 unsigned long nr,
3578 XEN_GUEST_HANDLE_64(uint8) dirty_bitmap)
3580 int rc;
3581 unsigned long end_pfn = begin_pfn + nr;
3582 unsigned long dirty_size = (nr + 7) / 8;
3583 int flush_tlb = 0;
3584 unsigned long i;
3585 p2m_type_t t;
3586 struct sh_dirty_vram *dirty_vram = d->arch.hvm_domain.dirty_vram;
3588 if (end_pfn < begin_pfn
3589 || begin_pfn > d->arch.p2m->max_mapped_pfn
3590 || end_pfn >= d->arch.p2m->max_mapped_pfn)
3591 return -EINVAL;
3593 shadow_lock(d);
3595 if ( dirty_vram && (!nr ||
3596 ( begin_pfn != dirty_vram->begin_pfn
3597 || end_pfn != dirty_vram->end_pfn )) )
3599 /* Different tracking, tear the previous down. */
3600 gdprintk(XENLOG_INFO, "stopping tracking VRAM %lx - %lx\n", dirty_vram->begin_pfn, dirty_vram->end_pfn);
3601 xfree(dirty_vram->sl1ma);
3602 xfree(dirty_vram->dirty_bitmap);
3603 xfree(dirty_vram);
3604 dirty_vram = d->arch.hvm_domain.dirty_vram = NULL;
3607 if ( !nr )
3609 rc = 0;
3610 goto out;
3613 /* This should happen seldomly (Video mode change),
3614 * no need to be careful. */
3615 if ( !dirty_vram )
3617 /* Just recount from start. */
3618 for ( i = begin_pfn; i < end_pfn; i++ ) {
3619 mfn_t mfn = gfn_to_mfn(d, i, &t);
3620 if (mfn_x(mfn) != INVALID_MFN)
3621 flush_tlb |= sh_remove_all_mappings(d->vcpu[0], mfn);
3624 gdprintk(XENLOG_INFO, "tracking VRAM %lx - %lx\n", begin_pfn, end_pfn);
3626 rc = -ENOMEM;
3627 if ( (dirty_vram = xmalloc(struct sh_dirty_vram)) == NULL )
3628 goto out;
3629 dirty_vram->begin_pfn = begin_pfn;
3630 dirty_vram->end_pfn = end_pfn;
3631 d->arch.hvm_domain.dirty_vram = dirty_vram;
3633 if ( (dirty_vram->sl1ma = xmalloc_array(paddr_t, nr)) == NULL )
3634 goto out_dirty_vram;
3635 memset(dirty_vram->sl1ma, ~0, sizeof(paddr_t) * nr);
3637 if ( (dirty_vram->dirty_bitmap = xmalloc_array(uint8_t, dirty_size)) == NULL )
3638 goto out_sl1ma;
3639 memset(dirty_vram->dirty_bitmap, 0, dirty_size);
3641 dirty_vram->last_dirty = NOW();
3643 /* Tell the caller that this time we could not track dirty bits. */
3644 rc = -ENODATA;
3646 else if (dirty_vram->last_dirty == -1)
3648 /* still completely clean, just copy our empty bitmap */
3649 rc = -EFAULT;
3650 if ( copy_to_guest(dirty_bitmap, dirty_vram->dirty_bitmap, dirty_size) == 0 )
3651 rc = 0;
3653 else
3655 #ifdef __i386__
3656 unsigned long map_mfn = INVALID_MFN;
3657 void *map_sl1p = NULL;
3658 #endif
3660 /* Iterate over VRAM to track dirty bits. */
3661 for ( i = 0; i < nr; i++ ) {
3662 mfn_t mfn = gfn_to_mfn(d, begin_pfn + i, &t);
3663 struct page_info *page;
3664 int dirty = 0;
3665 paddr_t sl1ma = dirty_vram->sl1ma[i];
3667 if (mfn_x(mfn) == INVALID_MFN)
3669 dirty = 1;
3671 else
3673 page = mfn_to_page(mfn);
3674 switch (page->u.inuse.type_info & PGT_count_mask)
3676 case 0:
3677 /* No guest reference, nothing to track. */
3678 break;
3679 case 1:
3680 /* One guest reference. */
3681 if ( sl1ma == INVALID_PADDR )
3683 /* We don't know which sl1e points to this, too bad. */
3684 dirty = 1;
3685 /* TODO: Heuristics for finding the single mapping of
3686 * this gmfn */
3687 flush_tlb |= sh_remove_all_mappings(d->vcpu[0], mfn);
3689 else
3691 /* Hopefully the most common case: only one mapping,
3692 * whose dirty bit we can use. */
3693 l1_pgentry_t *sl1e;
3694 #ifdef __i386__
3695 void *sl1p = map_sl1p;
3696 unsigned long sl1mfn = paddr_to_pfn(sl1ma);
3698 if ( sl1mfn != map_mfn ) {
3699 if ( map_sl1p )
3700 sh_unmap_domain_page(map_sl1p);
3701 map_sl1p = sl1p = sh_map_domain_page(_mfn(sl1mfn));
3702 map_mfn = sl1mfn;
3704 sl1e = sl1p + (sl1ma & ~PAGE_MASK);
3705 #else
3706 sl1e = maddr_to_virt(sl1ma);
3707 #endif
3709 if ( l1e_get_flags(*sl1e) & _PAGE_DIRTY )
3711 dirty = 1;
3712 /* Note: this is atomic, so we may clear a
3713 * _PAGE_ACCESSED set by another processor. */
3714 l1e_remove_flags(*sl1e, _PAGE_DIRTY);
3715 flush_tlb = 1;
3718 break;
3719 default:
3720 /* More than one guest reference,
3721 * we don't afford tracking that. */
3722 dirty = 1;
3723 break;
3727 if ( dirty )
3729 dirty_vram->dirty_bitmap[i / 8] |= 1 << (i % 8);
3730 dirty_vram->last_dirty = NOW();
3734 #ifdef __i386__
3735 if ( map_sl1p )
3736 sh_unmap_domain_page(map_sl1p);
3737 #endif
3739 rc = -EFAULT;
3740 if ( copy_to_guest(dirty_bitmap, dirty_vram->dirty_bitmap, dirty_size) == 0 ) {
3741 memset(dirty_vram->dirty_bitmap, 0, dirty_size);
3742 if (dirty_vram->last_dirty + SECONDS(2) < NOW())
3744 /* was clean for more than two seconds, try to disable guest
3745 * write access */
3746 for ( i = begin_pfn; i < end_pfn; i++ ) {
3747 mfn_t mfn = gfn_to_mfn(d, i, &t);
3748 if (mfn_x(mfn) != INVALID_MFN)
3749 flush_tlb |= sh_remove_write_access(d->vcpu[0], mfn, 1, 0);
3751 dirty_vram->last_dirty = -1;
3753 rc = 0;
3756 if ( flush_tlb )
3757 flush_tlb_mask(&d->domain_dirty_cpumask);
3758 goto out;
3760 out_sl1ma:
3761 xfree(dirty_vram->sl1ma);
3762 out_dirty_vram:
3763 xfree(dirty_vram);
3764 dirty_vram = d->arch.hvm_domain.dirty_vram = NULL;
3766 out:
3767 shadow_unlock(d);
3768 return rc;
3771 /**************************************************************************/
3772 /* Shadow-control XEN_DOMCTL dispatcher */
3774 int shadow_domctl(struct domain *d,
3775 xen_domctl_shadow_op_t *sc,
3776 XEN_GUEST_HANDLE(void) u_domctl)
3778 int rc, preempted = 0;
3780 switch ( sc->op )
3782 case XEN_DOMCTL_SHADOW_OP_OFF:
3783 if ( d->arch.paging.mode == PG_SH_enable )
3784 if ( (rc = shadow_test_disable(d)) != 0 )
3785 return rc;
3786 return 0;
3788 case XEN_DOMCTL_SHADOW_OP_ENABLE_TEST:
3789 return shadow_test_enable(d);
3791 case XEN_DOMCTL_SHADOW_OP_ENABLE_TRANSLATE:
3792 return shadow_enable(d, PG_refcounts|PG_translate);
3794 case XEN_DOMCTL_SHADOW_OP_ENABLE:
3795 return shadow_enable(d, sc->mode << PG_mode_shift);
3797 case XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION:
3798 sc->mb = shadow_get_allocation(d);
3799 return 0;
3801 case XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION:
3802 shadow_lock(d);
3803 if ( sc->mb == 0 && shadow_mode_enabled(d) )
3805 /* Can't set the allocation to zero unless the domain stops using
3806 * shadow pagetables first */
3807 SHADOW_ERROR("Can't set shadow allocation to zero, domain %u"
3808 " is still using shadows.\n", d->domain_id);
3809 shadow_unlock(d);
3810 return -EINVAL;
3812 rc = sh_set_allocation(d, sc->mb << (20 - PAGE_SHIFT), &preempted);
3813 shadow_unlock(d);
3814 if ( preempted )
3815 /* Not finished. Set up to re-run the call. */
3816 rc = hypercall_create_continuation(
3817 __HYPERVISOR_domctl, "h", u_domctl);
3818 else
3819 /* Finished. Return the new allocation */
3820 sc->mb = shadow_get_allocation(d);
3821 return rc;
3823 default:
3824 SHADOW_ERROR("Bad shadow op %u\n", sc->op);
3825 return -EINVAL;
3830 /**************************************************************************/
3831 /* Auditing shadow tables */
3833 #if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_FULL
3835 void shadow_audit_tables(struct vcpu *v)
3837 /* Dispatch table for getting per-type functions */
3838 static hash_callback_t callbacks[SH_type_unused] = {
3839 NULL, /* none */
3840 SHADOW_INTERNAL_NAME(sh_audit_l1_table, 2), /* l1_32 */
3841 SHADOW_INTERNAL_NAME(sh_audit_fl1_table, 2), /* fl1_32 */
3842 SHADOW_INTERNAL_NAME(sh_audit_l2_table, 2), /* l2_32 */
3843 SHADOW_INTERNAL_NAME(sh_audit_l1_table, 3), /* l1_pae */
3844 SHADOW_INTERNAL_NAME(sh_audit_fl1_table, 3), /* fl1_pae */
3845 SHADOW_INTERNAL_NAME(sh_audit_l2_table, 3), /* l2_pae */
3846 SHADOW_INTERNAL_NAME(sh_audit_l2_table, 3), /* l2h_pae */
3847 #if CONFIG_PAGING_LEVELS >= 4
3848 SHADOW_INTERNAL_NAME(sh_audit_l1_table, 4), /* l1_64 */
3849 SHADOW_INTERNAL_NAME(sh_audit_fl1_table, 4), /* fl1_64 */
3850 SHADOW_INTERNAL_NAME(sh_audit_l2_table, 4), /* l2_64 */
3851 SHADOW_INTERNAL_NAME(sh_audit_l2_table, 4), /* l2h_64 */
3852 SHADOW_INTERNAL_NAME(sh_audit_l3_table, 4), /* l3_64 */
3853 SHADOW_INTERNAL_NAME(sh_audit_l4_table, 4), /* l4_64 */
3854 #endif /* CONFIG_PAGING_LEVELS >= 4 */
3855 NULL /* All the rest */
3856 };
3857 unsigned int mask;
3859 if ( !(SHADOW_AUDIT_ENABLE) )
3860 return;
3862 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
3863 sh_oos_audit(v->domain);
3864 #endif
3866 if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_FULL )
3867 mask = ~1; /* Audit every table in the system */
3868 else
3870 /* Audit only the current mode's tables */
3871 switch ( v->arch.paging.mode->guest_levels )
3873 case 2: mask = (SHF_L1_32|SHF_FL1_32|SHF_L2_32); break;
3874 case 3: mask = (SHF_L1_PAE|SHF_FL1_PAE|SHF_L2_PAE
3875 |SHF_L2H_PAE); break;
3876 case 4: mask = (SHF_L1_64|SHF_FL1_64|SHF_L2_64
3877 |SHF_L3_64|SHF_L4_64); break;
3878 default: BUG();
3882 hash_foreach(v, ~1, callbacks, _mfn(INVALID_MFN));
3885 #endif /* Shadow audit */
3887 /*
3888 * Local variables:
3889 * mode: C
3890 * c-set-style: "BSD"
3891 * c-basic-offset: 4
3892 * indent-tabs-mode: nil
3893 * End:
3894 */