ia64/xen-unstable: xen/arch/x86/mm/shadow/common.c @ 18454:74621a2add54

xentrace 5/7: Additional tracing for the shadow code.

Signed-off-by: George Dunlap <george.dunlap@eu.citrix.com>
Signed-off-by: Trolle Selander <trolle.selander@eu.citrix.com>

Author:   Keir Fraser <keir.fraser@citrix.com>
Date:     Mon Sep 08 15:58:04 2008 +0100 (2008-09-08)
Parents:  d96bf4cd0f37
Children: fa2adc7fb996
1 /******************************************************************************
2 * arch/x86/mm/shadow/common.c
3 *
4 * Shadow code that does not need to be multiply compiled.
5 * Parts of this code are Copyright (c) 2006 by XenSource Inc.
6 * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
7 * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 */
24 #include <xen/config.h>
25 #include <xen/types.h>
26 #include <xen/mm.h>
27 #include <xen/trace.h>
28 #include <xen/sched.h>
29 #include <xen/perfc.h>
30 #include <xen/irq.h>
31 #include <xen/domain_page.h>
32 #include <xen/guest_access.h>
33 #include <xen/keyhandler.h>
34 #include <asm/event.h>
35 #include <asm/page.h>
36 #include <asm/current.h>
37 #include <asm/flushtlb.h>
38 #include <asm/shadow.h>
39 #include <xen/numa.h>
40 #include "private.h"
42 DEFINE_PER_CPU(uint32_t,trace_shadow_path_flags);
44 /* Set up the shadow-specific parts of a domain struct at start of day.
45 * Called for every domain from arch_domain_create() */
46 void shadow_domain_init(struct domain *d)
47 {
48 int i;
49 shadow_lock_init(d);
50 for ( i = 0; i <= SHADOW_MAX_ORDER; i++ )
51 INIT_LIST_HEAD(&d->arch.paging.shadow.freelists[i]);
52 INIT_LIST_HEAD(&d->arch.paging.shadow.p2m_freelist);
53 INIT_LIST_HEAD(&d->arch.paging.shadow.pinned_shadows);
55 /* Use shadow pagetables for log-dirty support */
56 paging_log_dirty_init(d, shadow_enable_log_dirty,
57 shadow_disable_log_dirty, shadow_clean_dirty_bitmap);
59 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
60 d->arch.paging.shadow.oos_active = 0;
61 #endif
62 }
64 /* Set up the shadow-specific parts of a vcpu struct. Note: The most important
65 * job is to initialize the update_paging_modes() function pointer, which is
66 * used to initialize the rest of the resources. Therefore, it really does not
67 * matter to have v->arch.paging.mode pointing to any mode, as long as it can
68 * be compiled.
69 */
70 void shadow_vcpu_init(struct vcpu *v)
71 {
72 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
73 int i, j;
75 for ( i = 0; i < SHADOW_OOS_PAGES; i++ )
76 {
77 v->arch.paging.shadow.oos[i] = _mfn(INVALID_MFN);
78 v->arch.paging.shadow.oos_snapshot[i] = _mfn(INVALID_MFN);
79 for ( j = 0; j < SHADOW_OOS_FIXUPS; j++ )
80 v->arch.paging.shadow.oos_fixup[i].smfn[j] = _mfn(INVALID_MFN);
81 }
82 #endif
84 v->arch.paging.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode, 3);
85 }
87 #if SHADOW_AUDIT
88 int shadow_audit_enable = 0;
90 static void shadow_audit_key(unsigned char key)
91 {
92 shadow_audit_enable = !shadow_audit_enable;
93 printk("%s shadow_audit_enable=%d\n",
94 __func__, shadow_audit_enable);
95 }
97 static int __init shadow_audit_key_init(void)
98 {
99 register_keyhandler(
100 'O', shadow_audit_key, "toggle shadow audits");
101 return 0;
102 }
103 __initcall(shadow_audit_key_init);
104 #endif /* SHADOW_AUDIT */
106 int _shadow_mode_refcounts(struct domain *d)
107 {
108 return shadow_mode_refcounts(d);
109 }
112 /**************************************************************************/
113 /* x86 emulator support for the shadow code
114 */
116 struct segment_register *hvm_get_seg_reg(
117 enum x86_segment seg, struct sh_emulate_ctxt *sh_ctxt)
118 {
119 struct segment_register *seg_reg = &sh_ctxt->seg_reg[seg];
120 if ( !__test_and_set_bit(seg, &sh_ctxt->valid_seg_regs) )
121 hvm_get_segment_register(current, seg, seg_reg);
122 return seg_reg;
123 }
125 static int hvm_translate_linear_addr(
126 enum x86_segment seg,
127 unsigned long offset,
128 unsigned int bytes,
129 enum hvm_access_type access_type,
130 struct sh_emulate_ctxt *sh_ctxt,
131 unsigned long *paddr)
132 {
133 struct segment_register *reg = hvm_get_seg_reg(seg, sh_ctxt);
134 int okay;
136 okay = hvm_virtual_to_linear_addr(
137 seg, reg, offset, bytes, access_type, sh_ctxt->ctxt.addr_size, paddr);
139 if ( !okay )
140 {
141 hvm_inject_exception(TRAP_gp_fault, 0, 0);
142 return X86EMUL_EXCEPTION;
143 }
145 return 0;
146 }
148 static int
149 hvm_read(enum x86_segment seg,
150 unsigned long offset,
151 void *p_data,
152 unsigned int bytes,
153 enum hvm_access_type access_type,
154 struct sh_emulate_ctxt *sh_ctxt)
155 {
156 unsigned long addr;
157 int rc;
159 rc = hvm_translate_linear_addr(
160 seg, offset, bytes, access_type, sh_ctxt, &addr);
161 if ( rc )
162 return rc;
164 if ( access_type == hvm_access_insn_fetch )
165 rc = hvm_fetch_from_guest_virt(p_data, addr, bytes, 0);
166 else
167 rc = hvm_copy_from_guest_virt(p_data, addr, bytes, 0);
169 switch ( rc )
170 {
171 case HVMCOPY_okay:
172 return X86EMUL_OKAY;
173 case HVMCOPY_bad_gva_to_gfn:
174 return X86EMUL_EXCEPTION;
175 default:
176 break;
177 }
179 return X86EMUL_UNHANDLEABLE;
180 }
182 static int
183 hvm_emulate_read(enum x86_segment seg,
184 unsigned long offset,
185 void *p_data,
186 unsigned int bytes,
187 struct x86_emulate_ctxt *ctxt)
188 {
189 if ( !is_x86_user_segment(seg) )
190 return X86EMUL_UNHANDLEABLE;
191 return hvm_read(seg, offset, p_data, bytes, hvm_access_read,
192 container_of(ctxt, struct sh_emulate_ctxt, ctxt));
193 }
195 static int
196 hvm_emulate_insn_fetch(enum x86_segment seg,
197 unsigned long offset,
198 void *p_data,
199 unsigned int bytes,
200 struct x86_emulate_ctxt *ctxt)
201 {
202 struct sh_emulate_ctxt *sh_ctxt =
203 container_of(ctxt, struct sh_emulate_ctxt, ctxt);
204 unsigned int insn_off = offset - sh_ctxt->insn_buf_eip;
206 ASSERT(seg == x86_seg_cs);
208 /* Fall back if requested bytes are not in the prefetch cache. */
209 if ( unlikely((insn_off + bytes) > sh_ctxt->insn_buf_bytes) )
210 return hvm_read(seg, offset, p_data, bytes,
211 hvm_access_insn_fetch, sh_ctxt);
213 /* Hit the cache. Simple memcpy. */
214 memcpy(p_data, &sh_ctxt->insn_buf[insn_off], bytes);
215 return X86EMUL_OKAY;
216 }
218 static int
219 hvm_emulate_write(enum x86_segment seg,
220 unsigned long offset,
221 void *p_data,
222 unsigned int bytes,
223 struct x86_emulate_ctxt *ctxt)
224 {
225 struct sh_emulate_ctxt *sh_ctxt =
226 container_of(ctxt, struct sh_emulate_ctxt, ctxt);
227 struct vcpu *v = current;
228 unsigned long addr;
229 int rc;
231 if ( !is_x86_user_segment(seg) )
232 return X86EMUL_UNHANDLEABLE;
234 /* How many emulations could we save if we unshadowed on stack writes? */
235 if ( seg == x86_seg_ss )
236 perfc_incr(shadow_fault_emulate_stack);
238 rc = hvm_translate_linear_addr(
239 seg, offset, bytes, hvm_access_write, sh_ctxt, &addr);
240 if ( rc )
241 return rc;
243 return v->arch.paging.mode->shadow.x86_emulate_write(
244 v, addr, p_data, bytes, sh_ctxt);
245 }
247 static int
248 hvm_emulate_cmpxchg(enum x86_segment seg,
249 unsigned long offset,
250 void *p_old,
251 void *p_new,
252 unsigned int bytes,
253 struct x86_emulate_ctxt *ctxt)
254 {
255 struct sh_emulate_ctxt *sh_ctxt =
256 container_of(ctxt, struct sh_emulate_ctxt, ctxt);
257 struct vcpu *v = current;
258 unsigned long addr, old[2], new[2];
259 int rc;
261 if ( !is_x86_user_segment(seg) )
262 return X86EMUL_UNHANDLEABLE;
264 rc = hvm_translate_linear_addr(
265 seg, offset, bytes, hvm_access_write, sh_ctxt, &addr);
266 if ( rc )
267 return rc;
269 old[0] = new[0] = 0;
270 memcpy(old, p_old, bytes);
271 memcpy(new, p_new, bytes);
273 if ( bytes <= sizeof(long) )
274 return v->arch.paging.mode->shadow.x86_emulate_cmpxchg(
275 v, addr, old[0], new[0], bytes, sh_ctxt);
277 #ifdef __i386__
278 if ( bytes == 8 )
279 return v->arch.paging.mode->shadow.x86_emulate_cmpxchg8b(
280 v, addr, old[0], old[1], new[0], new[1], sh_ctxt);
281 #endif
283 return X86EMUL_UNHANDLEABLE;
284 }
286 static struct x86_emulate_ops hvm_shadow_emulator_ops = {
287 .read = hvm_emulate_read,
288 .insn_fetch = hvm_emulate_insn_fetch,
289 .write = hvm_emulate_write,
290 .cmpxchg = hvm_emulate_cmpxchg,
291 };
293 static int
294 pv_emulate_read(enum x86_segment seg,
295 unsigned long offset,
296 void *p_data,
297 unsigned int bytes,
298 struct x86_emulate_ctxt *ctxt)
299 {
300 unsigned int rc;
302 if ( !is_x86_user_segment(seg) )
303 return X86EMUL_UNHANDLEABLE;
305 if ( (rc = copy_from_user(p_data, (void *)offset, bytes)) != 0 )
306 {
307 propagate_page_fault(offset + bytes - rc, 0); /* read fault */
308 return X86EMUL_EXCEPTION;
309 }
311 return X86EMUL_OKAY;
312 }
314 static int
315 pv_emulate_write(enum x86_segment seg,
316 unsigned long offset,
317 void *p_data,
318 unsigned int bytes,
319 struct x86_emulate_ctxt *ctxt)
320 {
321 struct sh_emulate_ctxt *sh_ctxt =
322 container_of(ctxt, struct sh_emulate_ctxt, ctxt);
323 struct vcpu *v = current;
324 if ( !is_x86_user_segment(seg) )
325 return X86EMUL_UNHANDLEABLE;
326 return v->arch.paging.mode->shadow.x86_emulate_write(
327 v, offset, p_data, bytes, sh_ctxt);
328 }
330 static int
331 pv_emulate_cmpxchg(enum x86_segment seg,
332 unsigned long offset,
333 void *p_old,
334 void *p_new,
335 unsigned int bytes,
336 struct x86_emulate_ctxt *ctxt)
337 {
338 struct sh_emulate_ctxt *sh_ctxt =
339 container_of(ctxt, struct sh_emulate_ctxt, ctxt);
340 unsigned long old[2], new[2];
341 struct vcpu *v = current;
343 if ( !is_x86_user_segment(seg) )
344 return X86EMUL_UNHANDLEABLE;
346 old[0] = new[0] = 0;
347 memcpy(old, p_old, bytes);
348 memcpy(new, p_new, bytes);
350 if ( bytes <= sizeof(long) )
351 return v->arch.paging.mode->shadow.x86_emulate_cmpxchg(
352 v, offset, old[0], new[0], bytes, sh_ctxt);
354 #ifdef __i386__
355 if ( bytes == 8 )
356 return v->arch.paging.mode->shadow.x86_emulate_cmpxchg8b(
357 v, offset, old[0], old[1], new[0], new[1], sh_ctxt);
358 #endif
360 return X86EMUL_UNHANDLEABLE;
361 }
363 static struct x86_emulate_ops pv_shadow_emulator_ops = {
364 .read = pv_emulate_read,
365 .insn_fetch = pv_emulate_read,
366 .write = pv_emulate_write,
367 .cmpxchg = pv_emulate_cmpxchg,
368 };
370 struct x86_emulate_ops *shadow_init_emulation(
371 struct sh_emulate_ctxt *sh_ctxt, struct cpu_user_regs *regs)
372 {
373 struct segment_register *creg, *sreg;
374 struct vcpu *v = current;
375 unsigned long addr;
377 sh_ctxt->ctxt.regs = regs;
378 sh_ctxt->ctxt.force_writeback = 0;
380 if ( !is_hvm_vcpu(v) )
381 {
382 sh_ctxt->ctxt.addr_size = sh_ctxt->ctxt.sp_size = BITS_PER_LONG;
383 return &pv_shadow_emulator_ops;
384 }
386 /* Segment cache initialisation. Primed with CS. */
387 sh_ctxt->valid_seg_regs = 0;
388 creg = hvm_get_seg_reg(x86_seg_cs, sh_ctxt);
390 /* Work out the emulation mode. */
391 if ( hvm_long_mode_enabled(v) && creg->attr.fields.l )
392 {
393 sh_ctxt->ctxt.addr_size = sh_ctxt->ctxt.sp_size = 64;
394 }
395 else
396 {
397 sreg = hvm_get_seg_reg(x86_seg_ss, sh_ctxt);
398 sh_ctxt->ctxt.addr_size = creg->attr.fields.db ? 32 : 16;
399 sh_ctxt->ctxt.sp_size = sreg->attr.fields.db ? 32 : 16;
400 }
402 /* Attempt to prefetch whole instruction. */
403 sh_ctxt->insn_buf_eip = regs->eip;
404 sh_ctxt->insn_buf_bytes =
405 (!hvm_translate_linear_addr(
406 x86_seg_cs, regs->eip, sizeof(sh_ctxt->insn_buf),
407 hvm_access_insn_fetch, sh_ctxt, &addr) &&
408 !hvm_fetch_from_guest_virt_nofault(
409 sh_ctxt->insn_buf, addr, sizeof(sh_ctxt->insn_buf), 0))
410 ? sizeof(sh_ctxt->insn_buf) : 0;
412 return &hvm_shadow_emulator_ops;
413 }
415 /* Update an initialized emulation context to prepare for the next
416 * instruction */
417 void shadow_continue_emulation(struct sh_emulate_ctxt *sh_ctxt,
418 struct cpu_user_regs *regs)
419 {
420 struct vcpu *v = current;
421 unsigned long addr, diff;
423 /* We don't refetch the segment bases, because we don't emulate
424 * writes to segment registers */
426 if ( is_hvm_vcpu(v) )
427 {
428 diff = regs->eip - sh_ctxt->insn_buf_eip;
429 if ( diff > sh_ctxt->insn_buf_bytes )
430 {
431 /* Prefetch more bytes. */
432 sh_ctxt->insn_buf_bytes =
433 (!hvm_translate_linear_addr(
434 x86_seg_cs, regs->eip, sizeof(sh_ctxt->insn_buf),
435 hvm_access_insn_fetch, sh_ctxt, &addr) &&
436 !hvm_fetch_from_guest_virt_nofault(
437 sh_ctxt->insn_buf, addr, sizeof(sh_ctxt->insn_buf), 0))
438 ? sizeof(sh_ctxt->insn_buf) : 0;
439 sh_ctxt->insn_buf_eip = regs->eip;
440 }
441 }
442 }
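/* Editorial sketch (not part of the original file): roughly how a caller
 * such as sh_page_fault() in multi.c is expected to drive the two helpers
 * above: prime the context once, emulate, then call
 * shadow_continue_emulation() before emulating a follow-on instruction.
 * It assumes the x86_emulate(ctxt, ops) entry point from x86_emulate.h;
 * the wrapper name sketch_emulate_one is hypothetical. */
#if 0
static void sketch_emulate_one(struct cpu_user_regs *regs)
{
    struct sh_emulate_ctxt emul_ctxt;
    struct x86_emulate_ops *emul_ops;
    int r;

    /* Set up the segment cache, address sizes and instruction prefetch. */
    emul_ops = shadow_init_emulation(&emul_ctxt, regs);

    /* Emulate one instruction against the shadow-aware callbacks. */
    r = x86_emulate(&emul_ctxt.ctxt, emul_ops);

    if ( r == X86EMUL_OKAY )
    {
        /* Refresh only what may have changed (eip / prefetch buffer)
         * rather than re-priming the whole context. */
        shadow_continue_emulation(&emul_ctxt, regs);
        r = x86_emulate(&emul_ctxt.ctxt, emul_ops);
    }
}
#endif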
445 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
446 /**************************************************************************/
447 /* Out-of-sync shadows. */
449 /* From time to time, we let a shadowed pagetable page go out of sync
450 * with its shadow: the guest is allowed to write directly to the page,
451 * and those writes are not synchronously reflected in the shadow.
452 * This lets us avoid many emulations if the guest is writing a lot to a
453 * pagetable, but it relaxes a pretty important invariant in the shadow
454 * pagetable design. Therefore, some rules:
455 *
456 * 1. Only L1 pagetables may go out of sync: any page that is shadowed
457 * at a higher level must be synchronously updated. This makes
458 * using linear shadow pagetables much less dangerous.
459 * That means that: (a) unsyncing code needs to check for higher-level
460 * shadows, and (b) promotion code needs to resync.
461 *
462 * 2. All shadow operations on a guest page require the page to be brought
463 * back into sync before proceeding. This must be done under the
464 * shadow lock so that the page is guaranteed to remain synced until
465 * the operation completes.
466 *
467 * Exceptions to this rule: the pagefault and invlpg handlers may
468 * update only one entry on an out-of-sync page without resyncing it.
469 *
470 * 3. Operations on shadows that do not start from a guest page need to
471 * be aware that they may be handling an out-of-sync shadow.
472 *
473 * 4. Operations that do not normally take the shadow lock (fast-path
474 * #PF handler, INVLPG) must fall back to a locking, syncing version
475 * if they see an out-of-sync table.
476 *
477 * 5. Operations corresponding to guest TLB flushes (MOV CR3, INVLPG)
478 * must explicitly resync all relevant pages or update their
479 * shadows.
480 *
481 * Currently out-of-sync pages are listed in a simple open-addressed
482 * hash table with a second chance (must resist temptation to radically
483 * over-engineer hash tables...) The virtual address of the access
484 * which caused us to unsync the page is also kept in the hash table, as
485 * a hint for finding the writable mappings later.
486 *
487 * We keep a hash per vcpu, because we want as much as possible to do
488 * the re-sync on the same vcpu that did the unsync, so the VA hint
489 * will be valid.
490 */
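/* Editorial sketch (not part of the original file): the two-slot,
 * "second chance" probe described above, exactly as the lookup and
 * removal helpers later in this section use it. The helper name
 * oos_hash_probe is hypothetical. */
#if 0
static int oos_hash_probe(struct vcpu *v, mfn_t gmfn)
{
    mfn_t *oos = v->arch.paging.shadow.oos;
    int idx = mfn_x(gmfn) % SHADOW_OOS_PAGES;   /* natural slot... */

    if ( mfn_x(oos[idx]) != mfn_x(gmfn) )
        idx = (idx + 1) % SHADOW_OOS_PAGES;     /* ...or its second chance */

    return (mfn_x(oos[idx]) == mfn_x(gmfn)) ? idx : -1;
}
#endif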
493 #if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_FULL
494 static void sh_oos_audit(struct domain *d)
495 {
496 int idx, expected_idx, expected_idx_alt;
497 struct page_info *pg;
498 struct vcpu *v;
500 for_each_vcpu(d, v)
501 {
502 for ( idx = 0; idx < SHADOW_OOS_PAGES; idx++ )
503 {
504 mfn_t *oos = v->arch.paging.shadow.oos;
505 if ( !mfn_valid(oos[idx]) )
506 continue;
508 expected_idx = mfn_x(oos[idx]) % SHADOW_OOS_PAGES;
509 expected_idx_alt = ((expected_idx + 1) % SHADOW_OOS_PAGES);
510 if ( idx != expected_idx && idx != expected_idx_alt )
511 {
512 printk("%s: idx %d contains gmfn %lx, expected at %d or %d.\n",
513 __func__, idx, mfn_x(oos[idx]),
514 expected_idx, expected_idx_alt);
515 BUG();
516 }
517 pg = mfn_to_page(oos[idx]);
518 if ( !(pg->count_info & PGC_page_table) )
519 {
520 printk("%s: idx %x gmfn %lx not a pt (count %"PRIx32")\n",
521 __func__, idx, mfn_x(oos[idx]), pg->count_info);
522 BUG();
523 }
524 if ( !(pg->shadow_flags & SHF_out_of_sync) )
525 {
526 printk("%s: idx %x gmfn %lx not marked oos (flags %lx)\n",
527 __func__, idx, mfn_x(oos[idx]), pg->shadow_flags);
528 BUG();
529 }
530 if ( (pg->shadow_flags & SHF_page_type_mask & ~SHF_L1_ANY) )
531 {
532 printk("%s: idx %x gmfn %lx shadowed as non-l1 (flags %lx)\n",
533 __func__, idx, mfn_x(oos[idx]), pg->shadow_flags);
534 BUG();
535 }
536 }
537 }
538 }
539 #endif
541 #if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES
542 void oos_audit_hash_is_present(struct domain *d, mfn_t gmfn)
543 {
544 int idx;
545 struct vcpu *v;
546 mfn_t *oos;
548 ASSERT(mfn_is_out_of_sync(gmfn));
550 for_each_vcpu(d, v)
551 {
552 oos = v->arch.paging.shadow.oos;
553 idx = mfn_x(gmfn) % SHADOW_OOS_PAGES;
554 if ( mfn_x(oos[idx]) != mfn_x(gmfn) )
555 idx = (idx + 1) % SHADOW_OOS_PAGES;
557 if ( mfn_x(oos[idx]) == mfn_x(gmfn) )
558 return;
559 }
561 SHADOW_ERROR("gmfn %lx marked OOS but not in hash table\n", mfn_x(gmfn));
562 BUG();
563 }
564 #endif
566 /* Update the shadow, but keep the page out of sync. */
567 static inline void _sh_resync_l1(struct vcpu *v, mfn_t gmfn, mfn_t snpmfn)
568 {
569 struct page_info *pg = mfn_to_page(gmfn);
571 ASSERT(mfn_valid(gmfn));
572 ASSERT(page_is_out_of_sync(pg));
574 /* Call out to the appropriate per-mode resyncing function */
575 if ( pg->shadow_flags & SHF_L1_32 )
576 SHADOW_INTERNAL_NAME(sh_resync_l1, 2)(v, gmfn, snpmfn);
577 else if ( pg->shadow_flags & SHF_L1_PAE )
578 SHADOW_INTERNAL_NAME(sh_resync_l1, 3)(v, gmfn, snpmfn);
579 #if CONFIG_PAGING_LEVELS >= 4
580 else if ( pg->shadow_flags & SHF_L1_64 )
581 SHADOW_INTERNAL_NAME(sh_resync_l1, 4)(v, gmfn, snpmfn);
582 #endif
583 }
586 /*
587 * Fixup arrays: We limit the maximum number of writable mappings to
588 * SHADOW_OOS_FIXUPS and store enough information to remove them
589 * quickly on resync.
590 */
592 static inline int oos_fixup_flush_gmfn(struct vcpu *v, mfn_t gmfn,
593 struct oos_fixup *fixup)
594 {
595 int i;
596 for ( i = 0; i < SHADOW_OOS_FIXUPS; i++ )
597 {
598 if ( mfn_x(fixup->smfn[i]) != INVALID_MFN )
599 {
600 sh_remove_write_access_from_sl1p(v, gmfn,
601 fixup->smfn[i],
602 fixup->off[i]);
603 fixup->smfn[i] = _mfn(INVALID_MFN);
604 }
605 }
607 /* Always flush the TLBs. See comment on oos_fixup_add(). */
608 return 1;
609 }
611 void oos_fixup_add(struct vcpu *v, mfn_t gmfn,
612 mfn_t smfn, unsigned long off)
613 {
614 int idx, next;
615 mfn_t *oos;
616 struct oos_fixup *oos_fixup;
617 struct domain *d = v->domain;
619 perfc_incr(shadow_oos_fixup_add);
621 for_each_vcpu(d, v)
622 {
623 oos = v->arch.paging.shadow.oos;
624 oos_fixup = v->arch.paging.shadow.oos_fixup;
625 idx = mfn_x(gmfn) % SHADOW_OOS_PAGES;
626 if ( mfn_x(oos[idx]) != mfn_x(gmfn) )
627 idx = (idx + 1) % SHADOW_OOS_PAGES;
628 if ( mfn_x(oos[idx]) == mfn_x(gmfn) )
629 {
630 next = oos_fixup[idx].next;
632 if ( mfn_x(oos_fixup[idx].smfn[next]) != INVALID_MFN )
633 {
634 TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_OOS_FIXUP_EVICT);
636 /* Reuse this slot and remove current writable mapping. */
637 sh_remove_write_access_from_sl1p(v, gmfn,
638 oos_fixup[idx].smfn[next],
639 oos_fixup[idx].off[next]);
640 perfc_incr(shadow_oos_fixup_evict);
641 /* We should flush the TLBs now, because we removed a
642 writable mapping, but since the shadow is already
643 OOS we have no problem if another vcpu writes to
644 this page table. We just have to be very careful to
645 *always* flush the TLBs on resync. */
646 }
648 oos_fixup[idx].smfn[next] = smfn;
649 oos_fixup[idx].off[next] = off;
650 oos_fixup[idx].next = (next + 1) % SHADOW_OOS_FIXUPS;
652 TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_OOS_FIXUP_ADD);
653 return;
654 }
655 }
657 SHADOW_ERROR("gmfn %lx was OOS but not in hash table\n", mfn_x(gmfn));
658 BUG();
659 }
661 static int oos_remove_write_access(struct vcpu *v, mfn_t gmfn,
662 struct oos_fixup *fixup)
663 {
664 int ftlb = 0;
666 ftlb |= oos_fixup_flush_gmfn(v, gmfn, fixup);
668 switch ( sh_remove_write_access(v, gmfn, 0, 0) )
669 {
670 default:
671 case 0:
672 break;
674 case 1:
675 ftlb |= 1;
676 break;
678 case -1:
679 /* An unfindable writeable typecount has appeared, probably via a
680 * grant table entry: can't shoot the mapping, so try to unshadow
681 * the page. If that doesn't work either, the guest is granting
682 * out its pagetables and must be killed after all.
683 * This will flush the TLB, so we can return with no worries. */
684 sh_remove_shadows(v, gmfn, 0 /* Be thorough */, 1 /* Must succeed */);
685 return 1;
686 }
688 if ( ftlb )
689 flush_tlb_mask(v->domain->domain_dirty_cpumask);
691 return 0;
692 }
695 static inline void trace_resync(int event, mfn_t gmfn)
696 {
697 if ( tb_init_done )
698 {
699 /* Convert gmfn to gfn */
700 unsigned long gfn = mfn_to_gfn(current->domain, gmfn);
701 __trace_var(event, 0/*!tsc*/, sizeof(gfn), (unsigned char*)&gfn);
702 }
703 }
705 /* Pull all the entries on an out-of-sync page back into sync. */
706 static void _sh_resync(struct vcpu *v, mfn_t gmfn,
707 struct oos_fixup *fixup, mfn_t snp)
708 {
709 struct page_info *pg = mfn_to_page(gmfn);
711 ASSERT(shadow_locked_by_me(v->domain));
712 ASSERT(mfn_is_out_of_sync(gmfn));
713 /* Guest page must be shadowed *only* as L1 when out of sync. */
714 ASSERT(!(mfn_to_page(gmfn)->shadow_flags & SHF_page_type_mask
715 & ~SHF_L1_ANY));
716 ASSERT(!sh_page_has_multiple_shadows(mfn_to_page(gmfn)));
718 SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx, va=%lx\n",
719 v->domain->domain_id, v->vcpu_id, mfn_x(gmfn), va);
721 /* Need to pull write access so the page *stays* in sync. */
722 if ( oos_remove_write_access(v, gmfn, fixup) )
723 {
724 /* Page has been unshadowed. */
725 return;
726 }
728 /* No more writable mappings of this page, please */
729 pg->shadow_flags &= ~SHF_oos_may_write;
731 /* Update the shadows with current guest entries. */
732 _sh_resync_l1(v, gmfn, snp);
734 /* Now we know all the entries are synced, and will stay that way */
735 pg->shadow_flags &= ~SHF_out_of_sync;
736 perfc_incr(shadow_resync);
737 trace_resync(TRC_SHADOW_RESYNC_FULL, gmfn);
738 }
741 /* Add an MFN to the list of out-of-sync guest pagetables */
742 static void oos_hash_add(struct vcpu *v, mfn_t gmfn)
743 {
744 int i, idx, oidx, swap = 0;
745 void *gptr, *gsnpptr;
746 mfn_t *oos = v->arch.paging.shadow.oos;
747 mfn_t *oos_snapshot = v->arch.paging.shadow.oos_snapshot;
748 struct oos_fixup *oos_fixup = v->arch.paging.shadow.oos_fixup;
749 struct oos_fixup fixup = { .next = 0 };
751 for (i = 0; i < SHADOW_OOS_FIXUPS; i++ )
752 fixup.smfn[i] = _mfn(INVALID_MFN);
754 idx = mfn_x(gmfn) % SHADOW_OOS_PAGES;
755 oidx = idx;
757 if ( mfn_valid(oos[idx])
758 && (mfn_x(oos[idx]) % SHADOW_OOS_PAGES) == idx )
759 {
760 /* Punt the current occupant into the next slot */
761 SWAP(oos[idx], gmfn);
762 SWAP(oos_fixup[idx], fixup);
763 swap = 1;
764 idx = (idx + 1) % SHADOW_OOS_PAGES;
765 }
766 if ( mfn_valid(oos[idx]) )
767 {
768 /* Crush the current occupant. */
769 _sh_resync(v, oos[idx], &oos_fixup[idx], oos_snapshot[idx]);
770 perfc_incr(shadow_unsync_evict);
771 }
772 oos[idx] = gmfn;
773 oos_fixup[idx] = fixup;
775 if ( swap )
776 SWAP(oos_snapshot[idx], oos_snapshot[oidx]);
778 gptr = sh_map_domain_page(oos[oidx]);
779 gsnpptr = sh_map_domain_page(oos_snapshot[oidx]);
780 memcpy(gsnpptr, gptr, PAGE_SIZE);
781 sh_unmap_domain_page(gptr);
782 sh_unmap_domain_page(gsnpptr);
783 }
785 /* Remove an MFN from the list of out-of-sync guest pagetables */
786 static void oos_hash_remove(struct vcpu *v, mfn_t gmfn)
787 {
788 int idx;
789 mfn_t *oos;
790 struct domain *d = v->domain;
792 SHADOW_PRINTK("D%dV%d gmfn %lx\n",
793 v->domain->domain_id, v->vcpu_id, mfn_x(gmfn));
795 for_each_vcpu(d, v)
796 {
797 oos = v->arch.paging.shadow.oos;
798 idx = mfn_x(gmfn) % SHADOW_OOS_PAGES;
799 if ( mfn_x(oos[idx]) != mfn_x(gmfn) )
800 idx = (idx + 1) % SHADOW_OOS_PAGES;
801 if ( mfn_x(oos[idx]) == mfn_x(gmfn) )
802 {
803 oos[idx] = _mfn(INVALID_MFN);
804 return;
805 }
806 }
808 SHADOW_ERROR("gmfn %lx was OOS but not in hash table\n", mfn_x(gmfn));
809 BUG();
810 }
812 mfn_t oos_snapshot_lookup(struct vcpu *v, mfn_t gmfn)
813 {
814 int idx;
815 mfn_t *oos;
816 mfn_t *oos_snapshot;
817 struct domain *d = v->domain;
819 for_each_vcpu(d, v)
820 {
821 oos = v->arch.paging.shadow.oos;
822 oos_snapshot = v->arch.paging.shadow.oos_snapshot;
823 idx = mfn_x(gmfn) % SHADOW_OOS_PAGES;
824 if ( mfn_x(oos[idx]) != mfn_x(gmfn) )
825 idx = (idx + 1) % SHADOW_OOS_PAGES;
826 if ( mfn_x(oos[idx]) == mfn_x(gmfn) )
827 {
828 return oos_snapshot[idx];
829 }
830 }
832 SHADOW_ERROR("gmfn %lx was OOS but not in hash table\n", mfn_x(gmfn));
833 BUG();
834 return _mfn(INVALID_MFN);
835 }
837 /* Pull a single guest page back into sync */
838 void sh_resync(struct vcpu *v, mfn_t gmfn)
839 {
840 int idx;
841 mfn_t *oos;
842 mfn_t *oos_snapshot;
843 struct oos_fixup *oos_fixup;
844 struct domain *d = v->domain;
846 for_each_vcpu(d, v)
847 {
848 oos = v->arch.paging.shadow.oos;
849 oos_fixup = v->arch.paging.shadow.oos_fixup;
850 oos_snapshot = v->arch.paging.shadow.oos_snapshot;
851 idx = mfn_x(gmfn) % SHADOW_OOS_PAGES;
852 if ( mfn_x(oos[idx]) != mfn_x(gmfn) )
853 idx = (idx + 1) % SHADOW_OOS_PAGES;
855 if ( mfn_x(oos[idx]) == mfn_x(gmfn) )
856 {
857 _sh_resync(v, gmfn, &oos_fixup[idx], oos_snapshot[idx]);
858 oos[idx] = _mfn(INVALID_MFN);
859 return;
860 }
861 }
863 SHADOW_ERROR("gmfn %lx was OOS but not in hash table\n", mfn_x(gmfn));
864 BUG();
865 }
867 /* Figure out whether it's definitely safe not to sync this l1 table,
868 * by making a call out to the mode in which that shadow was made. */
869 static int sh_skip_sync(struct vcpu *v, mfn_t gl1mfn)
870 {
871 struct page_info *pg = mfn_to_page(gl1mfn);
872 if ( pg->shadow_flags & SHF_L1_32 )
873 return SHADOW_INTERNAL_NAME(sh_safe_not_to_sync, 2)(v, gl1mfn);
874 else if ( pg->shadow_flags & SHF_L1_PAE )
875 return SHADOW_INTERNAL_NAME(sh_safe_not_to_sync, 3)(v, gl1mfn);
876 #if CONFIG_PAGING_LEVELS >= 4
877 else if ( pg->shadow_flags & SHF_L1_64 )
878 return SHADOW_INTERNAL_NAME(sh_safe_not_to_sync, 4)(v, gl1mfn);
879 #endif
880 SHADOW_ERROR("gmfn 0x%lx was OOS but not shadowed as an l1.\n",
881 mfn_x(gl1mfn));
882 BUG();
883 return 0; /* BUG() is no longer __attribute__((noreturn)). */
884 }
887 /* Pull all out-of-sync pages back into sync. Pages brought out of sync
888 * on other vcpus are allowed to remain out of sync, but their contents
889 * will be made safe (TLB flush semantics); pages unsynced by this vcpu
890 * are brought back into sync and write-protected. If skip != 0, we try
891 * to avoid resyncing at all if we think we can get away with it. */
892 void sh_resync_all(struct vcpu *v, int skip, int this, int others, int do_locking)
893 {
894 int idx;
895 struct vcpu *other;
896 mfn_t *oos = v->arch.paging.shadow.oos;
897 mfn_t *oos_snapshot = v->arch.paging.shadow.oos_snapshot;
898 struct oos_fixup *oos_fixup = v->arch.paging.shadow.oos_fixup;
900 SHADOW_PRINTK("d=%d, v=%d\n", v->domain->domain_id, v->vcpu_id);
902 ASSERT(do_locking || shadow_locked_by_me(v->domain));
904 if ( !this )
905 goto resync_others;
907 if ( do_locking )
908 shadow_lock(v->domain);
910 /* First: resync all of this vcpu's oos pages */
911 for ( idx = 0; idx < SHADOW_OOS_PAGES; idx++ )
912 if ( mfn_valid(oos[idx]) )
913 {
914 /* Write-protect and sync contents */
915 _sh_resync(v, oos[idx], &oos_fixup[idx], oos_snapshot[idx]);
916 oos[idx] = _mfn(INVALID_MFN);
917 }
919 if ( do_locking )
920 shadow_unlock(v->domain);
922 resync_others:
923 if ( !others )
924 return;
926 /* Second: make all *other* vcpus' oos pages safe. */
927 for_each_vcpu(v->domain, other)
928 {
929 if ( v == other )
930 continue;
932 if ( do_locking )
933 shadow_lock(v->domain);
935 oos = other->arch.paging.shadow.oos;
936 oos_fixup = other->arch.paging.shadow.oos_fixup;
937 oos_snapshot = other->arch.paging.shadow.oos_snapshot;
939 for ( idx = 0; idx < SHADOW_OOS_PAGES; idx++ )
940 {
941 if ( !mfn_valid(oos[idx]) )
942 continue;
944 if ( skip )
945 {
946 /* Update the shadows and leave the page OOS. */
947 if ( sh_skip_sync(v, oos[idx]) )
948 continue;
949 trace_resync(TRC_SHADOW_RESYNC_ONLY, oos[idx]);
950 _sh_resync_l1(other, oos[idx], oos_snapshot[idx]);
951 }
952 else
953 {
954 /* Write-protect and sync contents */
955 _sh_resync(other, oos[idx], &oos_fixup[idx], oos_snapshot[idx]);
956 oos[idx] = _mfn(INVALID_MFN);
957 }
958 }
960 if ( do_locking )
961 shadow_unlock(v->domain);
962 }
963 }
965 /* Allow a shadowed page to go out of sync. Unsyncs are traced in
966 * multi.c:sh_page_fault() */
967 int sh_unsync(struct vcpu *v, mfn_t gmfn)
968 {
969 struct page_info *pg;
971 ASSERT(shadow_locked_by_me(v->domain));
973 SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx va %lx\n",
974 v->domain->domain_id, v->vcpu_id, mfn_x(gmfn), va);
976 pg = mfn_to_page(gmfn);
978 /* Guest page must be shadowed *only* as L1 and *only* once when out
979 * of sync. Also, get out now if it's already out of sync.
980 * Finally, we can't safely unsync if some vcpus have paging disabled. */
981 if ( pg->shadow_flags &
982 ((SHF_page_type_mask & ~SHF_L1_ANY) | SHF_out_of_sync)
983 || sh_page_has_multiple_shadows(pg)
984 || !is_hvm_domain(v->domain)
985 || !v->domain->arch.paging.shadow.oos_active )
986 return 0;
988 pg->shadow_flags |= SHF_out_of_sync|SHF_oos_may_write;
989 oos_hash_add(v, gmfn);
990 perfc_incr(shadow_unsync);
991 TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_UNSYNC);
992 return 1;
993 }
995 #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) */
998 /**************************************************************************/
999 /* Code for "promoting" a guest page to the point where the shadow code is
1000 * willing to let it be treated as a guest page table. This generally
1001 * involves making sure there are no writable mappings available to the guest
1002 * for this page.
1003 */
1004 void shadow_promote(struct vcpu *v, mfn_t gmfn, unsigned int type)
1006 struct page_info *page = mfn_to_page(gmfn);
1008 ASSERT(mfn_valid(gmfn));
1010 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
1011 /* Is the page already shadowed and out of sync? */
1012 if ( page_is_out_of_sync(page) )
1013 sh_resync(v, gmfn);
1014 #endif
1016 /* We should never try to promote a gmfn that has writeable mappings */
1017 ASSERT((page->u.inuse.type_info & PGT_type_mask) != PGT_writable_page
1018 || (page->u.inuse.type_info & PGT_count_mask) == 0
1019 || v->domain->is_shutting_down);
1021 /* Is the page already shadowed? */
1022 if ( !test_and_set_bit(_PGC_page_table, &page->count_info) )
1023 page->shadow_flags = 0;
1025 ASSERT(!test_bit(type, &page->shadow_flags));
1026 set_bit(type, &page->shadow_flags);
1027 TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_PROMOTE);
1030 void shadow_demote(struct vcpu *v, mfn_t gmfn, u32 type)
1032 struct page_info *page = mfn_to_page(gmfn);
1034 ASSERT(test_bit(_PGC_page_table, &page->count_info));
1035 ASSERT(test_bit(type, &page->shadow_flags));
1037 clear_bit(type, &page->shadow_flags);
1039 if ( (page->shadow_flags & SHF_page_type_mask) == 0 )
1041 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
1042 /* Was the page out of sync? */
1043 if ( page_is_out_of_sync(page) )
1045 oos_hash_remove(v, gmfn);
1047 #endif
1048 clear_bit(_PGC_page_table, &page->count_info);
1051 TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_DEMOTE);
1054 /**************************************************************************/
1055 /* Validate a pagetable change from the guest and update the shadows.
1056 * Returns a bitmask of SHADOW_SET_* flags. */
1058 int
1059 sh_validate_guest_entry(struct vcpu *v, mfn_t gmfn, void *entry, u32 size)
1061 int result = 0;
1062 struct page_info *page = mfn_to_page(gmfn);
1064 paging_mark_dirty(v->domain, mfn_x(gmfn));
1066 // Determine which types of shadows are affected, and update each.
1067 //
1068 // Always validate L1s before L2s to prevent another cpu with a linear
1069 // mapping of this gmfn from seeing a walk that results from
1070 // using the new L2 value and the old L1 value. (It is OK for such a
1071 // guest to see a walk that uses the old L2 value with the new L1 value,
1072 // as hardware could behave this way if one level of the pagewalk occurs
1073 // before the store, and the next level of the pagewalk occurs after the
1074 // store.)
1075 //
1076 // Ditto for L2s before L3s, etc.
1077 //
1079 if ( !(page->count_info & PGC_page_table) )
1080 return 0; /* Not shadowed at all */
1082 if ( page->shadow_flags & SHF_L1_32 )
1083 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 2)
1084 (v, gmfn, entry, size);
1085 if ( page->shadow_flags & SHF_L2_32 )
1086 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 2)
1087 (v, gmfn, entry, size);
1089 if ( page->shadow_flags & SHF_L1_PAE )
1090 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 3)
1091 (v, gmfn, entry, size);
1092 if ( page->shadow_flags & SHF_L2_PAE )
1093 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 3)
1094 (v, gmfn, entry, size);
1095 if ( page->shadow_flags & SHF_L2H_PAE )
1096 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2he, 3)
1097 (v, gmfn, entry, size);
1099 #if CONFIG_PAGING_LEVELS >= 4
1100 if ( page->shadow_flags & SHF_L1_64 )
1101 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 4)
1102 (v, gmfn, entry, size);
1103 if ( page->shadow_flags & SHF_L2_64 )
1104 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 4)
1105 (v, gmfn, entry, size);
1106 if ( page->shadow_flags & SHF_L2H_64 )
1107 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2he, 4)
1108 (v, gmfn, entry, size);
1109 if ( page->shadow_flags & SHF_L3_64 )
1110 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl3e, 4)
1111 (v, gmfn, entry, size);
1112 if ( page->shadow_flags & SHF_L4_64 )
1113 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl4e, 4)
1114 (v, gmfn, entry, size);
1115 #else /* 32-bit hypervisor does not support 64-bit guests */
1116 ASSERT((page->shadow_flags
1117 & (SHF_L4_64|SHF_L3_64|SHF_L2H_64|SHF_L2_64|SHF_L1_64)) == 0);
1118 #endif
1119 this_cpu(trace_shadow_path_flags) |= (result<<(TRCE_SFLAG_SET_CHANGED));
1121 return result;
1125 void
1126 sh_validate_guest_pt_write(struct vcpu *v, mfn_t gmfn,
1127 void *entry, u32 size)
1128 /* This is the entry point for emulated writes to pagetables in HVM guests and
1129 * PV translated guests.
1130 */
1132 struct domain *d = v->domain;
1133 int rc;
1135 ASSERT(shadow_locked_by_me(v->domain));
1136 rc = sh_validate_guest_entry(v, gmfn, entry, size);
1137 if ( rc & SHADOW_SET_FLUSH )
1138 /* Need to flush TLBs to pick up shadow PT changes */
1139 flush_tlb_mask(d->domain_dirty_cpumask);
1140 if ( rc & SHADOW_SET_ERROR )
1142 /* This page is probably not a pagetable any more: tear it out of the
1143 * shadows, along with any tables that reference it.
1144 * Since the validate call above will have made a "safe" (i.e. zero)
1145 * shadow entry, we can let the domain live even if we can't fully
1146 * unshadow the page. */
1147 sh_remove_shadows(v, gmfn, 0, 0);
1151 int shadow_write_guest_entry(struct vcpu *v, intpte_t *p,
1152 intpte_t new, mfn_t gmfn)
1153 /* Write a new value into the guest pagetable, and update the shadows
1154 * appropriately. Returns 0 if we page-faulted, 1 for success. */
1156 int failed;
1157 shadow_lock(v->domain);
1158 failed = __copy_to_user(p, &new, sizeof(new));
1159 if ( failed != sizeof(new) )
1160 sh_validate_guest_entry(v, gmfn, p, sizeof(new));
1161 shadow_unlock(v->domain);
1162 return (failed == 0);
1165 int shadow_cmpxchg_guest_entry(struct vcpu *v, intpte_t *p,
1166 intpte_t *old, intpte_t new, mfn_t gmfn)
1167 /* Cmpxchg a new value into the guest pagetable, and update the shadows
1168 * appropriately. Returns 0 if we page-faulted, 1 if not.
1169 * N.B. caller should check the value of "old" to see if the
1170 * cmpxchg itself was successful. */
1172 int failed;
1173 intpte_t t = *old;
1174 shadow_lock(v->domain);
1175 failed = cmpxchg_user(p, t, new);
1176 if ( t == *old )
1177 sh_validate_guest_entry(v, gmfn, p, sizeof(new));
1178 *old = t;
1179 shadow_unlock(v->domain);
1180 return (failed == 0);
1181 }
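/* Editorial sketch (not part of the original file): how a caller checks
 * both results of shadow_cmpxchg_guest_entry() above, the fault/no-fault
 * return value and, separately, whether the compare-exchange actually hit.
 * The wrapper name sketch_update_gpte is hypothetical. */
#if 0
static int sketch_update_gpte(struct vcpu *v, intpte_t *p,
                              intpte_t old, intpte_t new, mfn_t gmfn)
{
    intpte_t t = old;

    /* 0 means we faulted while touching the guest entry. */
    if ( !shadow_cmpxchg_guest_entry(v, p, &t, new, gmfn) )
        return 0;

    /* No fault; the cmpxchg itself succeeded only if the value found
     * (now in t) was the value we expected. */
    return (t == old);
}
#endif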
1184 /**************************************************************************/
1185 /* Memory management for shadow pages. */
1187 /* Allocating shadow pages
1188 * -----------------------
1190 * Most shadow pages are allocated singly, but there is one case where
1191 * we need to allocate multiple pages together: shadowing 32-bit guest
1192 * tables on PAE or 64-bit shadows. A 32-bit guest l1 table covers 4MB
1193 * of virtual address space, and needs to be shadowed by two PAE/64-bit
1194 * l1 tables (covering 2MB of virtual address space each). Similarly, a
1195 * 32-bit guest l2 table (4GB va) needs to be shadowed by four
1196 * PAE/64-bit l2 tables (1GB va each). These multi-page shadows are
1197 * contiguous and aligned; functions for handling offsets into them are
1198 * defined in shadow.c (shadow_l1_index() etc.)
1200 * This table shows the allocation behaviour of the different modes:
1202 * Xen paging      pae   pae   64b   64b   64b
1203 * Guest paging    32b   pae   32b   pae   64b
1204 * PV or HVM       HVM    *    HVM   HVM    *
1205 * Shadow paging   pae   pae   pae   pae   64b
1207 * sl1 size         8k    4k    8k    4k    4k
1208 * sl2 size        16k    4k   16k    4k    4k
1209 * sl3 size         -     -     -     -     4k
1210 * sl4 size         -     -     -     -     4k
1212 * We allocate memory from xen in four-page units and break them down
1213 * with a simple buddy allocator. Can't use the xen allocator to handle
1214 * this as it only works for contiguous zones, and a domain's shadow
1215 * pool is made of fragments.
1217 * In HVM guests, the p2m table is built out of shadow pages, and we provide
1218 * a function for the p2m management to steal pages, in max-order chunks, from
1219 * the free pool. We don't provide for giving them back, yet.
1220 */
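/* Editorial sketch (not part of the original file): the coverage arithmetic
 * behind the table above, assuming 4k pages. A 32-bit guest l1 has 1024
 * entries (4MB of VA) while a PAE/64-bit shadow l1 holds only 512 (2MB),
 * hence the two-page (order 1) sl1 and, by the same argument over 4GB,
 * the four-page (order 2) sl2 in the type_to_order[] table further down. */
#if 0
static inline unsigned int sketch_l1_32_shadow_pages(void)
{
    unsigned long guest_va_covered  = 1024UL * PAGE_SIZE;  /* 32b guest l1: 4MB */
    unsigned long shadow_va_covered =  512UL * PAGE_SIZE;  /* PAE/64b l1: 2MB   */
    return guest_va_covered / shadow_va_covered;           /* 2 pages => order 1 */
}
#endif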
1222 /* Figure out the least acceptable quantity of shadow memory.
1223 * The minimum memory requirement for always being able to free up a
1224 * chunk of memory is very small -- only three max-order chunks per
1225 * vcpu to hold the top level shadows and pages with Xen mappings in them.
1227 * But for a guest to be guaranteed to successfully execute a single
1228 * instruction, we must be able to map a large number (about thirty) VAs
1229 * at the same time, which means that to guarantee progress, we must
1230 * allow for more than ninety allocated pages per vcpu. We round that
1231 * up to 128 pages, or half a megabyte per vcpu. */
1232 static unsigned int shadow_min_acceptable_pages(struct domain *d)
1234 u32 vcpu_count = 0;
1235 struct vcpu *v;
1237 for_each_vcpu(d, v)
1238 vcpu_count++;
1240 return (vcpu_count * 128);
1241 }
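/* Editorial worked example (not part of the original file): with the
 * 128-pages-per-vcpu floor above, a 4-vcpu domain needs at least 512
 * shadow pages, i.e. 2MB; sh_set_allocation() below then adds one page
 * per MB of guest RAM (d->tot_pages / 256) before rounding up to the
 * allocation chunk size. */
#if 0
unsigned int floor_pages  = 4 /* vcpus */ * 128;                        /* 512 pages */
unsigned long floor_bytes = (unsigned long)floor_pages << PAGE_SHIFT;   /* 2MB */
#endif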
1243 /* Figure out the order of allocation needed for a given shadow type */
1244 static inline u32
1245 shadow_order(unsigned int shadow_type)
1247 static const u32 type_to_order[SH_type_unused] = {
1248 0, /* SH_type_none */
1249 1, /* SH_type_l1_32_shadow */
1250 1, /* SH_type_fl1_32_shadow */
1251 2, /* SH_type_l2_32_shadow */
1252 0, /* SH_type_l1_pae_shadow */
1253 0, /* SH_type_fl1_pae_shadow */
1254 0, /* SH_type_l2_pae_shadow */
1255 0, /* SH_type_l2h_pae_shadow */
1256 0, /* SH_type_l1_64_shadow */
1257 0, /* SH_type_fl1_64_shadow */
1258 0, /* SH_type_l2_64_shadow */
1259 0, /* SH_type_l2h_64_shadow */
1260 0, /* SH_type_l3_64_shadow */
1261 0, /* SH_type_l4_64_shadow */
1262 2, /* SH_type_p2m_table */
1263 0, /* SH_type_monitor_table */
1264 0 /* SH_type_oos_snapshot */
1265 };
1266 ASSERT(shadow_type < SH_type_unused);
1267 return type_to_order[shadow_type];
1270 static inline unsigned int
1271 shadow_max_order(struct domain *d)
1273 return is_hvm_domain(d) ? SHADOW_MAX_ORDER : 0;
1276 /* Do we have a total of count pages of the requested order free? */
1277 static inline int space_is_available(
1278 struct domain *d,
1279 unsigned int order,
1280 unsigned int count)
1282 for ( ; order <= shadow_max_order(d); ++order )
1284 unsigned int n = count;
1285 const struct list_head *p;
1287 list_for_each ( p, &d->arch.paging.shadow.freelists[order] )
1288 if ( --n == 0 )
1289 return 1;
1290 count = (count + 1) >> 1;
1293 return 0;
1296 /* Dispatcher function: call the per-mode function that will unhook the
1297 * non-Xen mappings in this top-level shadow mfn */
1298 static void shadow_unhook_mappings(struct vcpu *v, mfn_t smfn)
1300 struct shadow_page_info *sp = mfn_to_shadow_page(smfn);
1301 switch ( sp->type )
1303 case SH_type_l2_32_shadow:
1304 SHADOW_INTERNAL_NAME(sh_unhook_32b_mappings, 2)(v,smfn);
1305 break;
1306 case SH_type_l2_pae_shadow:
1307 case SH_type_l2h_pae_shadow:
1308 SHADOW_INTERNAL_NAME(sh_unhook_pae_mappings, 3)(v,smfn);
1309 break;
1310 #if CONFIG_PAGING_LEVELS >= 4
1311 case SH_type_l4_64_shadow:
1312 SHADOW_INTERNAL_NAME(sh_unhook_64b_mappings, 4)(v,smfn);
1313 break;
1314 #endif
1315 default:
1316 SHADOW_ERROR("top-level shadow has bad type %08x\n", sp->type);
1317 BUG();
1321 static inline void trace_shadow_prealloc_unpin(struct domain *d, mfn_t smfn)
1323 if ( tb_init_done )
1325 /* Convert smfn to gfn */
1326 unsigned long gfn;
1327 ASSERT(mfn_valid(smfn));
1328 gfn = mfn_to_gfn(d, _mfn(mfn_to_shadow_page(smfn)->backpointer));
1329 __trace_var(TRC_SHADOW_PREALLOC_UNPIN, 0/*!tsc*/,
1330 sizeof(gfn), (unsigned char*)&gfn);
1334 /* Make sure there are at least count order-sized pages
1335 * available in the shadow page pool. */
1336 static void _shadow_prealloc(
1337 struct domain *d,
1338 unsigned int order,
1339 unsigned int count)
1341 /* Need a vcpu for calling unpins; for now, since we don't have
1342 * per-vcpu shadows, any will do */
1343 struct vcpu *v, *v2;
1344 struct list_head *l, *t;
1345 struct shadow_page_info *sp;
1346 mfn_t smfn;
1347 int i;
1349 ASSERT(order <= shadow_max_order(d));
1350 if ( space_is_available(d, order, count) ) return;
1352 v = current;
1353 if ( v->domain != d )
1354 v = d->vcpu[0];
1355 ASSERT(v != NULL); /* Shouldn't have enabled shadows if we've no vcpus */
1357 /* Stage one: walk the list of pinned pages, unpinning them */
1358 perfc_incr(shadow_prealloc_1);
1359 list_for_each_backwards_safe(l, t, &d->arch.paging.shadow.pinned_shadows)
1361 sp = list_entry(l, struct shadow_page_info, list);
1362 smfn = shadow_page_to_mfn(sp);
1364 /* Unpin this top-level shadow */
1365 trace_shadow_prealloc_unpin(d, smfn);
1366 sh_unpin(v, smfn);
1368 /* See if that freed up enough space */
1369 if ( space_is_available(d, order, count) ) return;
1372 /* Stage two: all shadow pages are in use in hierarchies that are
1373 * loaded in cr3 on some vcpu. Walk them, unhooking the non-Xen
1374 * mappings. */
1375 perfc_incr(shadow_prealloc_2);
1377 for_each_vcpu(d, v2)
1378 for ( i = 0 ; i < 4 ; i++ )
1380 if ( !pagetable_is_null(v2->arch.shadow_table[i]) )
1382 TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_PREALLOC_UNHOOK);
1383 shadow_unhook_mappings(v,
1384 pagetable_get_mfn(v2->arch.shadow_table[i]));
1386 /* See if that freed up enough space */
1387 if ( space_is_available(d, order, count) )
1389 flush_tlb_mask(d->domain_dirty_cpumask);
1390 return;
1395 /* Nothing more we can do: all remaining shadows are of pages that
1396 * hold Xen mappings for some vcpu. This can never happen. */
1397 SHADOW_ERROR("Can't pre-allocate %u order-%u shadow pages!\n"
1398 " shadow pages total = %u, free = %u, p2m=%u\n",
1399 count, order,
1400 d->arch.paging.shadow.total_pages,
1401 d->arch.paging.shadow.free_pages,
1402 d->arch.paging.shadow.p2m_pages);
1403 BUG();
1406 /* Make sure there are at least count pages of the order according to
1407 * type available in the shadow page pool.
1408 * This must be called before any calls to shadow_alloc(). Since this
1409 * will free existing shadows to make room, it must be called early enough
1410 * to avoid freeing shadows that the caller is currently working on. */
1411 void shadow_prealloc(struct domain *d, u32 type, unsigned int count)
1413 ASSERT(type != SH_type_p2m_table);
1414 return _shadow_prealloc(d, shadow_order(type), count);
1415 }
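/* Editorial sketch (not part of the original file): the calling convention
 * described above, reserve space for the shadow you are about to build,
 * under the shadow lock, and only then call shadow_alloc(). This mirrors
 * the shadow-creation paths in multi.c; the function name
 * sketch_make_shadow is hypothetical. */
#if 0
static mfn_t sketch_make_shadow(struct vcpu *v, mfn_t gmfn, u32 shadow_type)
{
    struct domain *d = v->domain;

    ASSERT(shadow_locked_by_me(d));

    /* May tear down other shadows to make room, so call it before we
     * start relying on any partially-built state. */
    shadow_prealloc(d, shadow_type, 1);

    /* Guaranteed to succeed once the prealloc above has returned. */
    return shadow_alloc(d, shadow_type, mfn_x(gmfn));
}
#endif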
1417 /* Deliberately free all the memory we can: this will tear down all of
1418 * this domain's shadows */
1419 static void shadow_blow_tables(struct domain *d)
1421 struct list_head *l, *t;
1422 struct shadow_page_info *sp;
1423 struct vcpu *v = d->vcpu[0];
1424 mfn_t smfn;
1425 int i;
1427 ASSERT(v != NULL);
1429 /* Pass one: unpin all pinned pages */
1430 list_for_each_backwards_safe(l,t, &d->arch.paging.shadow.pinned_shadows)
1432 sp = list_entry(l, struct shadow_page_info, list);
1433 smfn = shadow_page_to_mfn(sp);
1434 sh_unpin(v, smfn);
1437 /* Second pass: unhook entries of in-use shadows */
1438 for_each_vcpu(d, v)
1439 for ( i = 0 ; i < 4 ; i++ )
1440 if ( !pagetable_is_null(v->arch.shadow_table[i]) )
1441 shadow_unhook_mappings(v,
1442 pagetable_get_mfn(v->arch.shadow_table[i]));
1444 /* Make sure everyone sees the unshadowings */
1445 flush_tlb_mask(d->domain_dirty_cpumask);
1448 void shadow_blow_tables_per_domain(struct domain *d)
1450 if ( shadow_mode_enabled(d) && d->vcpu[0] != NULL ) {
1451 shadow_lock(d);
1452 shadow_blow_tables(d);
1453 shadow_unlock(d);
1457 #ifndef NDEBUG
1458 /* Blow all shadows of all shadowed domains: this can be used to cause the
1459 * guest's pagetables to be re-shadowed if we suspect that the shadows
1460 * have somehow got out of sync */
1461 static void shadow_blow_all_tables(unsigned char c)
1463 struct domain *d;
1464 printk("'%c' pressed -> blowing all shadow tables\n", c);
1465 rcu_read_lock(&domlist_read_lock);
1466 for_each_domain(d)
1468 if ( shadow_mode_enabled(d) && d->vcpu[0] != NULL )
1470 shadow_lock(d);
1471 shadow_blow_tables(d);
1472 shadow_unlock(d);
1475 rcu_read_unlock(&domlist_read_lock);
1478 /* Register this function in the Xen console keypress table */
1479 static __init int shadow_blow_tables_keyhandler_init(void)
1481 register_keyhandler('S', shadow_blow_all_tables,"reset shadow pagetables");
1482 return 0;
1484 __initcall(shadow_blow_tables_keyhandler_init);
1485 #endif /* !NDEBUG */
1487 /* Allocate another shadow's worth of (contiguous, aligned) pages,
1488 * and fill in the type and backpointer fields of their page_infos.
1489 * Never fails to allocate. */
1490 mfn_t shadow_alloc(struct domain *d,
1491 u32 shadow_type,
1492 unsigned long backpointer)
1494 struct shadow_page_info *sp = NULL;
1495 unsigned int order = shadow_order(shadow_type);
1496 cpumask_t mask;
1497 void *p;
1498 int i;
1500 ASSERT(shadow_locked_by_me(d));
1501 if (shadow_type == SH_type_p2m_table && order > shadow_max_order(d))
1502 order = shadow_max_order(d);
1503 ASSERT(order <= shadow_max_order(d));
1504 ASSERT(shadow_type != SH_type_none);
1505 perfc_incr(shadow_alloc);
1507 /* Find smallest order which can satisfy the request. */
1508 for ( i = order; i <= SHADOW_MAX_ORDER; i++ )
1509 if ( !list_empty(&d->arch.paging.shadow.freelists[i]) )
1510 goto found;
1512 /* If we get here, we failed to allocate. This should never happen.
1513 * It means that we didn't call shadow_prealloc() correctly before
1514 * we allocated. We can't recover by calling prealloc here, because
1515 * we might free up higher-level pages that the caller is working on. */
1516 SHADOW_ERROR("Can't allocate %i shadow pages!\n", 1 << order);
1517 BUG();
1519 found:
1520 sp = list_entry(d->arch.paging.shadow.freelists[i].next,
1521 struct shadow_page_info, list);
1522 list_del(&sp->list);
1524 /* We may have to halve the chunk a number of times. */
1525 while ( i != order )
1527 i--;
1528 sp->order = i;
1529 list_add_tail(&sp->list, &d->arch.paging.shadow.freelists[i]);
1530 sp += 1 << i;
1532 d->arch.paging.shadow.free_pages -= 1 << order;
1534 /* Init page info fields and clear the pages */
1535 for ( i = 0; i < 1<<order ; i++ )
1537 /* Before we overwrite the old contents of this page,
1538 * we need to be sure that no TLB holds a pointer to it. */
1539 mask = d->domain_dirty_cpumask;
1540 tlbflush_filter(mask, sp[i].tlbflush_timestamp);
1541 if ( unlikely(!cpus_empty(mask)) )
1543 perfc_incr(shadow_alloc_tlbflush);
1544 flush_tlb_mask(mask);
1546 /* Now safe to clear the page for reuse */
1547 p = sh_map_domain_page(shadow_page_to_mfn(sp+i));
1548 ASSERT(p != NULL);
1549 clear_page(p);
1550 sh_unmap_domain_page(p);
1551 INIT_LIST_HEAD(&sp[i].list);
1552 sp[i].type = shadow_type;
1553 sp[i].pinned = 0;
1554 sp[i].count = 0;
1555 sp[i].backpointer = backpointer;
1556 sp[i].next_shadow = NULL;
1557 perfc_incr(shadow_alloc_count);
1559 return shadow_page_to_mfn(sp);
1563 /* Return some shadow pages to the pool. */
1564 void shadow_free(struct domain *d, mfn_t smfn)
1566 struct shadow_page_info *sp = mfn_to_shadow_page(smfn);
1567 u32 shadow_type;
1568 unsigned long order;
1569 unsigned long mask;
1570 int i;
1572 ASSERT(shadow_locked_by_me(d));
1573 perfc_incr(shadow_free);
1575 shadow_type = sp->type;
1576 ASSERT(shadow_type != SH_type_none);
1577 ASSERT(shadow_type != SH_type_p2m_table);
1578 order = shadow_order(shadow_type);
1580 d->arch.paging.shadow.free_pages += 1 << order;
1582 for ( i = 0; i < 1<<order; i++ )
1584 #if SHADOW_OPTIMIZATIONS & (SHOPT_WRITABLE_HEURISTIC | SHOPT_FAST_EMULATION)
1585 struct vcpu *v;
1586 for_each_vcpu(d, v)
1588 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
1589 /* No longer safe to look for a writeable mapping in this shadow */
1590 if ( v->arch.paging.shadow.last_writeable_pte_smfn == mfn_x(smfn) + i )
1591 v->arch.paging.shadow.last_writeable_pte_smfn = 0;
1592 #endif
1593 #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
1594 v->arch.paging.last_write_emul_ok = 0;
1595 #endif
1597 #endif
1598 /* Strip out the type: this is now a free shadow page */
1599 sp[i].type = 0;
1600 /* Remember the TLB timestamp so we will know whether to flush
1601 * TLBs when we reuse the page. Because the destructors leave the
1602 * contents of the pages in place, we can delay TLB flushes until
1603 * just before the allocator hands the page out again. */
1604 sp[i].tlbflush_timestamp = tlbflush_current_time();
1605 perfc_decr(shadow_alloc_count);
1608 /* Merge chunks as far as possible. */
1609 for ( ; order < shadow_max_order(d); ++order )
1611 mask = 1 << order;
1612 if ( (mfn_x(shadow_page_to_mfn(sp)) & mask) ) {
1613 /* Merge with predecessor block? */
1614 if ( ((sp-mask)->type != PGT_none) || ((sp-mask)->order != order) )
1615 break;
1616 list_del(&(sp-mask)->list);
1617 sp -= mask;
1618 } else {
1619 /* Merge with successor block? */
1620 if ( ((sp+mask)->type != PGT_none) || ((sp+mask)->order != order) )
1621 break;
1622 list_del(&(sp+mask)->list);
1626 sp->order = order;
1627 list_add_tail(&sp->list, &d->arch.paging.shadow.freelists[order]);
1630 /* Divert some memory from the pool to be used by the p2m mapping.
1631 * This action is irreversible: the p2m mapping only ever grows.
1632 * That's OK because the p2m table only exists for translated domains,
1633 * and those domains can't ever turn off shadow mode.
1634 * Also, we only ever allocate a max-order chunk, so as to preserve
1635 * the invariant that shadow_prealloc() always works.
1636 * Returns 0 iff it can't get a chunk (the caller should then
1637 * free up some pages in domheap and call sh_set_allocation);
1638 * returns non-zero on success.
1639 */
1640 static int
1641 sh_alloc_p2m_pages(struct domain *d)
1643 struct page_info *pg;
1644 u32 i;
1645 unsigned int order = shadow_max_order(d);
1647 ASSERT(shadow_locked_by_me(d));
1649 if ( d->arch.paging.shadow.total_pages
1650 < (shadow_min_acceptable_pages(d) + (1 << order)) )
1651 return 0; /* Not enough shadow memory: need to increase it first */
1653 pg = mfn_to_page(shadow_alloc(d, SH_type_p2m_table, 0));
1654 d->arch.paging.shadow.p2m_pages += (1 << order);
1655 d->arch.paging.shadow.total_pages -= (1 << order);
1656 for (i = 0; i < (1U << order); i++)
1658 /* Unlike shadow pages, mark p2m pages as owned by the domain.
1659 * Marking the domain as the owner would normally allow the guest to
1660 * create mappings of these pages, but these p2m pages will never be
1661 * in the domain's guest-physical address space, and so that is not
1662 * believed to be a concern.
1663 */
1664 page_set_owner(&pg[i], d);
1665 pg[i].count_info = 1;
1666 list_add_tail(&pg[i].list, &d->arch.paging.shadow.p2m_freelist);
1668 return 1;
1671 // Returns 0 if no memory is available...
1672 static struct page_info *
1673 shadow_alloc_p2m_page(struct domain *d)
1675 struct list_head *entry;
1676 struct page_info *pg;
1677 mfn_t mfn;
1678 void *p;
1680 shadow_lock(d);
1682 if ( list_empty(&d->arch.paging.shadow.p2m_freelist) &&
1683 !sh_alloc_p2m_pages(d) )
1685 shadow_unlock(d);
1686 return NULL;
1688 entry = d->arch.paging.shadow.p2m_freelist.next;
1689 list_del(entry);
1691 shadow_unlock(d);
1693 pg = list_entry(entry, struct page_info, list);
1694 mfn = page_to_mfn(pg);
1695 p = sh_map_domain_page(mfn);
1696 clear_page(p);
1697 sh_unmap_domain_page(p);
1699 return pg;
1702 static void
1703 shadow_free_p2m_page(struct domain *d, struct page_info *pg)
1705 ASSERT(page_get_owner(pg) == d);
1706 /* Should have just the one ref we gave it in alloc_p2m_page() */
1707 if ( (pg->count_info & PGC_count_mask) != 1 )
1709 SHADOW_ERROR("Odd p2m page count c=%#x t=%"PRtype_info"\n",
1710 pg->count_info, pg->u.inuse.type_info);
1712 pg->count_info = 0;
1713 /* Free should not decrement domain's total allocation, since
1714 * these pages were allocated without an owner. */
1715 page_set_owner(pg, NULL);
1716 #if defined(__x86_64__)
1717 spin_lock_init(&pg->lock);
1718 #endif
1719 free_domheap_pages(pg, 0);
1720 d->arch.paging.shadow.p2m_pages--;
1721 perfc_decr(shadow_alloc_count);
1724 #if CONFIG_PAGING_LEVELS == 3
1725 static void p2m_install_entry_in_monitors(struct domain *d,
1726 l3_pgentry_t *l3e)
1727 /* Special case, only used for external-mode domains on PAE hosts:
1728 * update the mapping of the p2m table. Once again, this is trivial in
1729 * other paging modes (one top-level entry points to the top-level p2m,
1730 * no maintenance needed), but PAE makes life difficult by needing a
1731 * copy the eight l3es of the p2m table in eight l2h slots in the
1732 * monitor table. This function makes fresh copies when a p2m l3e
1733 * changes. */
1735 l2_pgentry_t *ml2e;
1736 struct vcpu *v;
1737 unsigned int index;
1739 index = ((unsigned long)l3e & ~PAGE_MASK) / sizeof(l3_pgentry_t);
1740 ASSERT(index < MACHPHYS_MBYTES>>1);
1742 for_each_vcpu(d, v)
1744 if ( pagetable_get_pfn(v->arch.monitor_table) == 0 )
1745 continue;
1746 ASSERT(shadow_mode_external(v->domain));
1748 SHADOW_DEBUG(P2M, "d=%u v=%u index=%u mfn=%#lx\n",
1749 d->domain_id, v->vcpu_id, index, l3e_get_pfn(*l3e));
1751 if ( v == current ) /* OK to use linear map of monitor_table */
1752 ml2e = __linear_l2_table + l2_linear_offset(RO_MPT_VIRT_START);
1753 else
1755 l3_pgentry_t *ml3e;
1756 ml3e = sh_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
1757 ASSERT(l3e_get_flags(ml3e[3]) & _PAGE_PRESENT);
1758 ml2e = sh_map_domain_page(_mfn(l3e_get_pfn(ml3e[3])));
1759 ml2e += l2_table_offset(RO_MPT_VIRT_START);
1760 sh_unmap_domain_page(ml3e);
1762 ml2e[index] = l2e_from_pfn(l3e_get_pfn(*l3e), __PAGE_HYPERVISOR);
1763 if ( v != current )
1764 sh_unmap_domain_page(ml2e);
1767 #endif
1769 /* Set the pool of shadow pages to the required number of pages.
1770 * Input will be rounded up to at least shadow_min_acceptable_pages(),
1771 * plus space for the p2m table.
1772 * Returns 0 for success, non-zero for failure. */
1773 static unsigned int sh_set_allocation(struct domain *d,
1774 unsigned int pages,
1775 int *preempted)
1777 struct shadow_page_info *sp;
1778 unsigned int lower_bound;
1779 unsigned int j, order = shadow_max_order(d);
1781 ASSERT(shadow_locked_by_me(d));
1783 /* Don't allocate less than the minimum acceptable, plus one page per
1784 * megabyte of RAM (for the p2m table) */
1785 lower_bound = shadow_min_acceptable_pages(d) + (d->tot_pages / 256);
1786 if ( pages > 0 && pages < lower_bound )
1787 pages = lower_bound;
1788 /* Round up to largest block size */
1789 pages = (pages + ((1<<SHADOW_MAX_ORDER)-1)) & ~((1<<SHADOW_MAX_ORDER)-1);
1791 SHADOW_PRINTK("current %i target %i\n",
1792 d->arch.paging.shadow.total_pages, pages);
1794 while ( d->arch.paging.shadow.total_pages != pages )
1796 if ( d->arch.paging.shadow.total_pages < pages )
1798 /* Need to allocate more memory from domheap */
1799 sp = (struct shadow_page_info *)
1800 alloc_domheap_pages(NULL, order, MEMF_node(domain_to_node(d)));
1801 if ( sp == NULL )
1803 SHADOW_PRINTK("failed to allocate shadow pages.\n");
1804 return -ENOMEM;
1806 d->arch.paging.shadow.free_pages += 1 << order;
1807 d->arch.paging.shadow.total_pages += 1 << order;
1808 for ( j = 0; j < 1U << order; j++ )
1810 sp[j].type = 0;
1811 sp[j].pinned = 0;
1812 sp[j].count = 0;
1813 sp[j].mbz = 0;
1814 sp[j].tlbflush_timestamp = 0; /* Not in any TLB */
1816 sp->order = order;
1817 list_add_tail(&sp->list, &d->arch.paging.shadow.freelists[order]);
1819 else if ( d->arch.paging.shadow.total_pages > pages )
1821 /* Need to return memory to domheap */
1822 _shadow_prealloc(d, order, 1);
1823 ASSERT(!list_empty(&d->arch.paging.shadow.freelists[order]));
1824 sp = list_entry(d->arch.paging.shadow.freelists[order].next,
1825 struct shadow_page_info, list);
1826 list_del(&sp->list);
1827 #if defined(__x86_64__)
1828 /*
1829 * Re-instate lock field which we overwrite with shadow_page_info.
1830 * This was safe, since the lock is only used on guest pages.
1831 */
1832 for ( j = 0; j < 1U << order; j++ )
1833 spin_lock_init(&((struct page_info *)sp)[j].lock);
1834 #endif
1835 d->arch.paging.shadow.free_pages -= 1 << order;
1836 d->arch.paging.shadow.total_pages -= 1 << order;
1837 free_domheap_pages((struct page_info *)sp, order);
1840 /* Check to see if we need to yield and try again */
1841 if ( preempted && hypercall_preempt_check() )
1843 *preempted = 1;
1844 return 0;
1848 return 0;
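/* [Editor's illustrative sketch -- not part of common.c]
 * The "Round up to largest block size" line in sh_set_allocation() above
 * uses the usual power-of-two trick: (x + m - 1) & ~(m - 1) rounds x up to
 * the next multiple of m, provided m is a power of two.  A minimal
 * standalone check, with a hypothetical order of 2 (4-page blocks): */
#include <assert.h>

static unsigned int round_up_to_block(unsigned int pages, unsigned int order)
{
    unsigned int block = 1u << order;
    return (pages + (block - 1)) & ~(block - 1);
}

int main(void)
{
    assert(round_up_to_block(0, 2) == 0);
    assert(round_up_to_block(1, 2) == 4);
    assert(round_up_to_block(4, 2) == 4);
    assert(round_up_to_block(5, 2) == 8);
    return 0;
}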
1851 /* Return the size of the shadow pool, rounded up to the nearest MB */
1852 static unsigned int shadow_get_allocation(struct domain *d)
1854 unsigned int pg = d->arch.paging.shadow.total_pages;
1855 return ((pg >> (20 - PAGE_SHIFT))
1856 + ((pg & ((1 << (20 - PAGE_SHIFT)) - 1)) ? 1 : 0));
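/* [Editor's illustrative sketch -- not part of common.c]
 * With 4KiB pages (PAGE_SHIFT == 12 on x86) there are 1 << (20 - 12) == 256
 * pages per MiB, so shadow_get_allocation() above is simply
 * ceil(total_pages / 256).  A hypothetical standalone version, assuming the
 * x86 page size, with a couple of spot checks: */
#include <assert.h>

#define EX_PAGE_SHIFT   12                           /* assumed x86 value */
#define EX_PAGES_PER_MB (1u << (20 - EX_PAGE_SHIFT))

static unsigned int pages_to_mb_rounded_up(unsigned int pg)
{
    return (pg >> (20 - EX_PAGE_SHIFT))
         + ((pg & (EX_PAGES_PER_MB - 1)) ? 1 : 0);
}

int main(void)
{
    assert(pages_to_mb_rounded_up(0)   == 0);
    assert(pages_to_mb_rounded_up(1)   == 1);   /* one page rounds up to 1MB */
    assert(pages_to_mb_rounded_up(256) == 1);   /* exactly 1MB */
    assert(pages_to_mb_rounded_up(257) == 2);   /* just over 1MB */
    return 0;
}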
1859 /**************************************************************************/
1860 /* Hash table for storing the guest->shadow mappings.
1861 * The table itself is an array of pointers to shadows; the shadows are then
1862 * threaded on a singly-linked list of shadows with the same hash value */
1864 #define SHADOW_HASH_BUCKETS 251
1865 /* Other possibly useful primes are 509, 1021, 2039, 4093, 8191, 16381 */
1867 /* Hash function that takes a gfn or mfn, plus another byte of type info */
1868 typedef u32 key_t;
1869 static inline key_t sh_hash(unsigned long n, unsigned int t)
1871 unsigned char *p = (unsigned char *)&n;
1872 key_t k = t;
1873 int i;
1874 for ( i = 0; i < sizeof(n) ; i++ ) k = (u32)p[i] + (k<<6) + (k<<16) - k;
1875 return k % SHADOW_HASH_BUCKETS;
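/* [Editor's illustrative sketch -- not part of common.c]
 * The per-byte step above, k = p[i] + (k<<6) + (k<<16) - k, is the classic
 * sdbm-style hash: the shifts are a cheap way of multiplying the old key by
 * 65599 (= 64 + 65536 - 1) before adding the next byte.  A minimal
 * standalone check of that equivalence (unsigned overflow wraps identically
 * in both forms): */
#include <assert.h>
#include <stdint.h>

static uint32_t hash_step_shifts(uint32_t k, unsigned char b)
{
    return (uint32_t)b + (k << 6) + (k << 16) - k;   /* as in sh_hash() */
}

static uint32_t hash_step_multiply(uint32_t k, unsigned char b)
{
    return (uint32_t)b + k * 65599u;                 /* same thing, spelt out */
}

int main(void)
{
    uint32_t k1 = 7 /* type byte */, k2 = 7;
    unsigned char bytes[] = { 0x12, 0x34, 0x56, 0x78 };
    for ( unsigned int i = 0; i < sizeof(bytes); i++ )
    {
        k1 = hash_step_shifts(k1, bytes[i]);
        k2 = hash_step_multiply(k2, bytes[i]);
        assert(k1 == k2);
    }
    return 0;
}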
1878 #if SHADOW_AUDIT & (SHADOW_AUDIT_HASH|SHADOW_AUDIT_HASH_FULL)
1880 /* Before we get to the mechanism, define a pair of audit functions
1881 * that sanity-check the contents of the hash table. */
1882 static void sh_hash_audit_bucket(struct domain *d, int bucket)
1883 /* Audit one bucket of the hash table */
1885 struct shadow_page_info *sp, *x;
1887 if ( !(SHADOW_AUDIT_ENABLE) )
1888 return;
1890 sp = d->arch.paging.shadow.hash_table[bucket];
1891 while ( sp )
1893 /* Not a shadow? */
1894 BUG_ON( sp->mbz != 0 );
1895 /* Bogus type? */
1896 BUG_ON( sp->type == 0 );
1897 BUG_ON( sp->type > SH_type_max_shadow );
1898 /* Wrong bucket? */
1899 BUG_ON( sh_hash(sp->backpointer, sp->type) != bucket );
1900 /* Duplicate entry? */
1901 for ( x = sp->next_shadow; x; x = x->next_shadow )
1902 BUG_ON( x->backpointer == sp->backpointer && x->type == sp->type );
1903 /* Follow the backpointer to the guest pagetable */
1904 if ( sp->type != SH_type_fl1_32_shadow
1905 && sp->type != SH_type_fl1_pae_shadow
1906 && sp->type != SH_type_fl1_64_shadow )
1908 struct page_info *gpg = mfn_to_page(_mfn(sp->backpointer));
1909 /* Bad shadow flags on guest page? */
1910 BUG_ON( !(gpg->shadow_flags & (1<<sp->type)) );
1911 /* Bad type count on guest page? */
1912 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
1913 if ( sp->type == SH_type_l1_32_shadow
1914 || sp->type == SH_type_l1_pae_shadow
1915 || sp->type == SH_type_l1_64_shadow )
1917 if ( (gpg->u.inuse.type_info & PGT_type_mask) == PGT_writable_page
1918 && (gpg->u.inuse.type_info & PGT_count_mask) != 0 )
1920 if ( !page_is_out_of_sync(gpg) )
1922 SHADOW_ERROR("MFN %#lx shadowed (by %#"PRI_mfn")"
1923 " and not OOS but has typecount %#lx\n",
1924 sp->backpointer,
1925 mfn_x(shadow_page_to_mfn(sp)),
1926 gpg->u.inuse.type_info);
1927 BUG();
1931 else /* Not an l1 */
1932 #endif
1933 if ( (gpg->u.inuse.type_info & PGT_type_mask) == PGT_writable_page
1934 && (gpg->u.inuse.type_info & PGT_count_mask) != 0 )
1936 SHADOW_ERROR("MFN %#lx shadowed (by %#"PRI_mfn")"
1937 " but has typecount %#lx\n",
1938 sp->backpointer, mfn_x(shadow_page_to_mfn(sp)),
1939 gpg->u.inuse.type_info);
1940 BUG();
1943 /* That entry was OK; on we go */
1944 sp = sp->next_shadow;
1948 #else
1949 #define sh_hash_audit_bucket(_d, _b) do {} while(0)
1950 #endif /* Hashtable bucket audit */
1953 #if SHADOW_AUDIT & SHADOW_AUDIT_HASH_FULL
1955 static void sh_hash_audit(struct domain *d)
1956 /* Full audit: audit every bucket in the table */
1958 int i;
1960 if ( !(SHADOW_AUDIT_ENABLE) )
1961 return;
1963 for ( i = 0; i < SHADOW_HASH_BUCKETS; i++ )
1965 sh_hash_audit_bucket(d, i);
1969 #else
1970 #define sh_hash_audit(_d) do {} while(0)
1971 #endif /* Hashtable bucket audit */
1973 /* Allocate and initialise the table itself.
1974 * Returns 0 for success, 1 for error. */
1975 static int shadow_hash_alloc(struct domain *d)
1977 struct shadow_page_info **table;
1979 ASSERT(shadow_locked_by_me(d));
1980 ASSERT(!d->arch.paging.shadow.hash_table);
1982 table = xmalloc_array(struct shadow_page_info *, SHADOW_HASH_BUCKETS);
1983 if ( !table ) return 1;
1984 memset(table, 0,
1985 SHADOW_HASH_BUCKETS * sizeof (struct shadow_page_info *));
1986 d->arch.paging.shadow.hash_table = table;
1987 return 0;
1990 /* Tear down the hash table and return all memory to Xen.
1991 * This function does not care whether the table is populated. */
1992 static void shadow_hash_teardown(struct domain *d)
1994 ASSERT(shadow_locked_by_me(d));
1995 ASSERT(d->arch.paging.shadow.hash_table);
1997 xfree(d->arch.paging.shadow.hash_table);
1998 d->arch.paging.shadow.hash_table = NULL;
2002 mfn_t shadow_hash_lookup(struct vcpu *v, unsigned long n, unsigned int t)
2003 /* Find an entry in the hash table. Returns the MFN of the shadow,
2004 * or INVALID_MFN if it doesn't exist */
2006 struct domain *d = v->domain;
2007 struct shadow_page_info *sp, *prev;
2008 key_t key;
2010 ASSERT(shadow_locked_by_me(d));
2011 ASSERT(d->arch.paging.shadow.hash_table);
2012 ASSERT(t);
2014 sh_hash_audit(d);
2016 perfc_incr(shadow_hash_lookups);
2017 key = sh_hash(n, t);
2018 sh_hash_audit_bucket(d, key);
2020 sp = d->arch.paging.shadow.hash_table[key];
2021 prev = NULL;
2022 while(sp)
2024 if ( sp->backpointer == n && sp->type == t )
2026 /* Pull-to-front if 'sp' isn't already the head item */
2027 if ( unlikely(sp != d->arch.paging.shadow.hash_table[key]) )
2029 if ( unlikely(d->arch.paging.shadow.hash_walking != 0) )
2030 /* Can't reorder: someone is walking the hash chains */
2031 return shadow_page_to_mfn(sp);
2032 else
2034 ASSERT(prev);
2035 /* Delete sp from the list */
2036 prev->next_shadow = sp->next_shadow;
2037 /* Re-insert it at the head of the list */
2038 sp->next_shadow = d->arch.paging.shadow.hash_table[key];
2039 d->arch.paging.shadow.hash_table[key] = sp;
2042 else
2044 perfc_incr(shadow_hash_lookup_head);
2046 return shadow_page_to_mfn(sp);
2048 prev = sp;
2049 sp = sp->next_shadow;
2052 perfc_incr(shadow_hash_lookup_miss);
2053 return _mfn(INVALID_MFN);
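/* [Editor's illustrative sketch -- not part of common.c]
 * The pull-to-front in shadow_hash_lookup() above is plain move-to-front on
 * a singly linked chain: once the node and its predecessor are known,
 * unlink it and push it back on at the head, so frequently looked-up
 * shadows stay near the front of their bucket.  The same operation on a
 * toy list with hypothetical names: */
#include <assert.h>
#include <stddef.h>

struct node { int key; struct node *next; };

/* Find the node with the given key and move it to the head of *headp. */
static struct node *find_and_pull_to_front(struct node **headp, int key)
{
    struct node *prev = NULL, *cur = *headp;

    for ( ; cur != NULL; prev = cur, cur = cur->next )
    {
        if ( cur->key != key )
            continue;
        if ( prev != NULL )            /* not already the head item */
        {
            prev->next = cur->next;    /* delete it from the list ... */
            cur->next = *headp;        /* ... and re-insert at the head */
            *headp = cur;
        }
        return cur;
    }
    return NULL;                       /* miss */
}

int main(void)
{
    struct node c = { 3, NULL }, b = { 2, &c }, a = { 1, &b };
    struct node *head = &a;
    assert(find_and_pull_to_front(&head, 3) == &c);
    assert(head == &c && c.next == &a && b.next == NULL);
    return 0;
}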
2056 void shadow_hash_insert(struct vcpu *v, unsigned long n, unsigned int t,
2057 mfn_t smfn)
2058 /* Put a mapping (n,t)->smfn into the hash table */
2060 struct domain *d = v->domain;
2061 struct shadow_page_info *sp;
2062 key_t key;
2064 ASSERT(shadow_locked_by_me(d));
2065 ASSERT(d->arch.paging.shadow.hash_table);
2066 ASSERT(t);
2068 sh_hash_audit(d);
2070 perfc_incr(shadow_hash_inserts);
2071 key = sh_hash(n, t);
2072 sh_hash_audit_bucket(d, key);
2074 /* Insert this shadow at the top of the bucket */
2075 sp = mfn_to_shadow_page(smfn);
2076 sp->next_shadow = d->arch.paging.shadow.hash_table[key];
2077 d->arch.paging.shadow.hash_table[key] = sp;
2079 sh_hash_audit_bucket(d, key);
2082 void shadow_hash_delete(struct vcpu *v, unsigned long n, unsigned int t,
2083 mfn_t smfn)
2084 /* Excise the mapping (n,t)->smfn from the hash table */
2086 struct domain *d = v->domain;
2087 struct shadow_page_info *sp, *x;
2088 key_t key;
2090 ASSERT(shadow_locked_by_me(d));
2091 ASSERT(d->arch.paging.shadow.hash_table);
2092 ASSERT(t);
2094 sh_hash_audit(d);
2096 perfc_incr(shadow_hash_deletes);
2097 key = sh_hash(n, t);
2098 sh_hash_audit_bucket(d, key);
2100 sp = mfn_to_shadow_page(smfn);
2101 if ( d->arch.paging.shadow.hash_table[key] == sp )
2102 /* Easy case: we're deleting the head item. */
2103 d->arch.paging.shadow.hash_table[key] = sp->next_shadow;
2104 else
2106 /* Need to search for the one we want */
2107 x = d->arch.paging.shadow.hash_table[key];
2108 while ( 1 )
2110 ASSERT(x); /* We can't have hit the end, since our target is
2111 * still in the chain somewhere... */
2112 if ( x->next_shadow == sp )
2114 x->next_shadow = sp->next_shadow;
2115 break;
2117 x = x->next_shadow;
2120 sp->next_shadow = NULL;
2122 sh_hash_audit_bucket(d, key);
2125 typedef int (*hash_callback_t)(struct vcpu *v, mfn_t smfn, mfn_t other_mfn);
2127 static void hash_foreach(struct vcpu *v,
2128 unsigned int callback_mask,
2129 hash_callback_t callbacks[],
2130 mfn_t callback_mfn)
2131 /* Walk the hash table looking at the types of the entries and
2132 * calling the appropriate callback function for each entry.
2133 * The mask determines which shadow types we call back for, and the array
2134 * of callbacks tells us which function to call.
2135 * Any callback may return non-zero to let us skip the rest of the scan.
2137 * WARNING: Callbacks MUST NOT add or remove hash entries unless they
2138 * then return non-zero to terminate the scan. */
2140 int i, done = 0;
2141 struct domain *d = v->domain;
2142 struct shadow_page_info *x;
2144 /* Say we're here, to stop hash-lookups reordering the chains */
2145 ASSERT(shadow_locked_by_me(d));
2146 ASSERT(d->arch.paging.shadow.hash_walking == 0);
2147 d->arch.paging.shadow.hash_walking = 1;
2149 for ( i = 0; i < SHADOW_HASH_BUCKETS; i++ )
2151 /* WARNING: This is not safe against changes to the hash table.
2152 * The callback *must* return non-zero if it has inserted or
2153 * deleted anything from the hash (lookups are OK, though). */
2154 for ( x = d->arch.paging.shadow.hash_table[i]; x; x = x->next_shadow )
2156 if ( callback_mask & (1 << x->type) )
2158 ASSERT(x->type <= 15);
2159 ASSERT(callbacks[x->type] != NULL);
2160 done = callbacks[x->type](v, shadow_page_to_mfn(x),
2161 callback_mfn);
2162 if ( done ) break;
2165 if ( done ) break;
2167 d->arch.paging.shadow.hash_walking = 0;
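/* [Editor's illustrative sketch -- not part of common.c]
 * The mask test and table lookup in hash_foreach() above are a standard
 * pattern: a bitmask selects which entry types we care about, and an array
 * of function pointers indexed by type supplies the handler.  A
 * stripped-down standalone version with hypothetical types 0..3: */
#include <assert.h>
#include <stddef.h>

typedef int (*handler_t)(int item);

static int count_calls;
static int count_one(int item) { (void)item; count_calls++; return 0; }

static void foreach_selected(const int *types, size_t n,
                             unsigned int mask, handler_t handlers[])
{
    for ( size_t i = 0; i < n; i++ )
        if ( (mask & (1u << types[i])) && handlers[types[i]] )
            if ( handlers[types[i]](types[i]) )   /* non-zero ends the scan */
                break;
}

int main(void)
{
    int types[] = { 1, 2, 3, 1 };
    handler_t handlers[4] = { NULL, count_one, NULL, count_one };
    foreach_selected(types, 4, (1u << 1) | (1u << 3), handlers);
    assert(count_calls == 3);   /* two type-1 items plus the type-3 item */
    return 0;
}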
2171 /**************************************************************************/
2172 /* Destroy a shadow page: simple dispatcher to call the per-type destructor
2173 * which will decrement refcounts appropriately and return memory to the
2174 * free pool. */
2176 void sh_destroy_shadow(struct vcpu *v, mfn_t smfn)
2178 struct shadow_page_info *sp = mfn_to_shadow_page(smfn);
2179 unsigned int t = sp->type;
2182 SHADOW_PRINTK("smfn=%#lx\n", mfn_x(smfn));
2184 /* Double-check, if we can, that the shadowed page belongs to this
2185 * domain (by following the back-pointer). */
2186 ASSERT(t == SH_type_fl1_32_shadow ||
2187 t == SH_type_fl1_pae_shadow ||
2188 t == SH_type_fl1_64_shadow ||
2189 t == SH_type_monitor_table ||
2190 (is_pv_32on64_vcpu(v) && t == SH_type_l4_64_shadow) ||
2191 (page_get_owner(mfn_to_page(_mfn(sp->backpointer)))
2192 == v->domain));
2194 /* The shadow type numbers are small, so the switch statement below
2195 * dispatches cheaply */
2196 switch ( t )
2198 case SH_type_l1_32_shadow:
2199 case SH_type_fl1_32_shadow:
2200 SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 2)(v, smfn);
2201 break;
2202 case SH_type_l2_32_shadow:
2203 SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 2)(v, smfn);
2204 break;
2206 case SH_type_l1_pae_shadow:
2207 case SH_type_fl1_pae_shadow:
2208 SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 3)(v, smfn);
2209 break;
2210 case SH_type_l2_pae_shadow:
2211 case SH_type_l2h_pae_shadow:
2212 SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 3)(v, smfn);
2213 break;
2215 #if CONFIG_PAGING_LEVELS >= 4
2216 case SH_type_l1_64_shadow:
2217 case SH_type_fl1_64_shadow:
2218 SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 4)(v, smfn);
2219 break;
2220 case SH_type_l2h_64_shadow:
2221 ASSERT(is_pv_32on64_vcpu(v));
2222 /* Fall through... */
2223 case SH_type_l2_64_shadow:
2224 SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 4)(v, smfn);
2225 break;
2226 case SH_type_l3_64_shadow:
2227 SHADOW_INTERNAL_NAME(sh_destroy_l3_shadow, 4)(v, smfn);
2228 break;
2229 case SH_type_l4_64_shadow:
2230 SHADOW_INTERNAL_NAME(sh_destroy_l4_shadow, 4)(v, smfn);
2231 break;
2232 #endif
2233 default:
2234 SHADOW_ERROR("tried to destroy shadow of bad type %08lx\n",
2235 (unsigned long)t);
2236 BUG();
2240 static inline void trace_shadow_wrmap_bf(mfn_t gmfn)
2242 if ( tb_init_done )
2244 /* Convert gmfn to gfn */
2245 unsigned long gfn = mfn_to_gfn(current->domain, gmfn);
2246 __trace_var(TRC_SHADOW_WRMAP_BF, 0/*!tsc*/, sizeof(gfn), (unsigned char*)&gfn);
2250 /**************************************************************************/
2251 /* Remove all writeable mappings of a guest frame from the shadow tables
2252 * Returns non-zero if we need to flush TLBs.
2253 * level and fault_addr describe how we found this to be a pagetable;
2254 * level==0 means we have some other reason for revoking write access.
2255 * If level==0 we are allowed to fail, returning -1. */
2257 int sh_remove_write_access(struct vcpu *v, mfn_t gmfn,
2258 unsigned int level,
2259 unsigned long fault_addr)
2261 /* Dispatch table for getting per-type functions */
2262 static hash_callback_t callbacks[SH_type_unused] = {
2263 NULL, /* none */
2264 SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1, 2), /* l1_32 */
2265 SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1, 2), /* fl1_32 */
2266 NULL, /* l2_32 */
2267 SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1, 3), /* l1_pae */
2268 SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1, 3), /* fl1_pae */
2269 NULL, /* l2_pae */
2270 NULL, /* l2h_pae */
2271 #if CONFIG_PAGING_LEVELS >= 4
2272 SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1, 4), /* l1_64 */
2273 SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1, 4), /* fl1_64 */
2274 #else
2275 NULL, /* l1_64 */
2276 NULL, /* fl1_64 */
2277 #endif
2278 NULL, /* l2_64 */
2279 NULL, /* l2h_64 */
2280 NULL, /* l3_64 */
2281 NULL, /* l4_64 */
2282 NULL, /* p2m */
2283 NULL /* unused */
2284 };
2286 static unsigned int callback_mask =
2287 1 << SH_type_l1_32_shadow
2288 | 1 << SH_type_fl1_32_shadow
2289 | 1 << SH_type_l1_pae_shadow
2290 | 1 << SH_type_fl1_pae_shadow
2291 | 1 << SH_type_l1_64_shadow
2292 | 1 << SH_type_fl1_64_shadow
2294 struct page_info *pg = mfn_to_page(gmfn);
2296 ASSERT(shadow_locked_by_me(v->domain));
2298 /* Only remove writable mappings if we are doing shadow refcounts.
2299 * In guest refcounting, we trust Xen to already be restricting
2300 * all the writes to the guest page tables, so we do not need to
2301 * do more. */
2302 if ( !shadow_mode_refcounts(v->domain) )
2303 return 0;
2305 /* Early exit if it's already a pagetable, or otherwise not writeable */
2306 if ( (sh_mfn_is_a_page_table(gmfn)
2307 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
2308 /* Unless they've been allowed to go out of sync with their shadows */
2309 && !mfn_oos_may_write(gmfn)
2310 #endif
2312 || (pg->u.inuse.type_info & PGT_count_mask) == 0 )
2313 return 0;
2315 TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_WRMAP);
2317 perfc_incr(shadow_writeable);
2319 /* If this isn't a "normal" writeable page, the domain is trying to
2320 * put pagetables in special memory of some kind. We can't allow that. */
2321 if ( (pg->u.inuse.type_info & PGT_type_mask) != PGT_writable_page )
2323 SHADOW_ERROR("can't remove write access to mfn %lx, type_info is %"
2324 PRtype_info "\n",
2325 mfn_x(gmfn), mfn_to_page(gmfn)->u.inuse.type_info);
2326 domain_crash(v->domain);
2329 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
2330 if ( v == current )
2332 unsigned long gfn;
2333 /* Heuristic: there is likely to be only one writeable mapping,
2334 * and that mapping is likely to be in the current pagetable,
2335 * in the guest's linear map (on non-HIGHPTE linux and windows) */
2337 #define GUESS(_a, _h) do { \
2338 if ( v->arch.paging.mode->shadow.guess_wrmap(v, (_a), gmfn) ) \
2339 perfc_incr(shadow_writeable_h_ ## _h); \
2340 if ( (pg->u.inuse.type_info & PGT_count_mask) == 0 ) \
2341 { \
2342 TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_WRMAP_GUESS_FOUND); \
2343 return 1; \
2344 } \
2345 } while (0)
2347 if ( level == 0 && fault_addr )
2348 GUESS(fault_addr, 6);
2350 if ( v->arch.paging.mode->guest_levels == 2 )
2352 if ( level == 1 )
2353 /* 32bit non-PAE w2k3: linear map at 0xC0000000 */
2354 GUESS(0xC0000000UL + (fault_addr >> 10), 1);
2356 /* Linux lowmem: first 896MB is mapped 1-to-1 above 0xC0000000 */
2357 if ((gfn = mfn_to_gfn(v->domain, gmfn)) < 0x38000 )
2358 GUESS(0xC0000000UL + (gfn << PAGE_SHIFT), 4);
2361 else if ( v->arch.paging.mode->guest_levels == 3 )
2363 /* 32bit PAE w2k3: linear map at 0xC0000000 */
2364 switch ( level )
2366 case 1: GUESS(0xC0000000UL + (fault_addr >> 9), 2); break;
2367 case 2: GUESS(0xC0600000UL + (fault_addr >> 18), 2); break;
2370 /* Linux lowmem: first 896MB is mapped 1-to-1 above 0xC0000000 */
2371 if ((gfn = mfn_to_gfn(v->domain, gmfn)) < 0x38000 )
2372 GUESS(0xC0000000UL + (gfn << PAGE_SHIFT), 4);
2374 #if CONFIG_PAGING_LEVELS >= 4
2375 else if ( v->arch.paging.mode->guest_levels == 4 )
2377 /* 64bit w2k3: linear map at 0xfffff68000000000 */
2378 switch ( level )
2380 case 1: GUESS(0xfffff68000000000UL
2381 + ((fault_addr & VADDR_MASK) >> 9), 3); break;
2382 case 2: GUESS(0xfffff6fb40000000UL
2383 + ((fault_addr & VADDR_MASK) >> 18), 3); break;
2384 case 3: GUESS(0xfffff6fb7da00000UL
2385 + ((fault_addr & VADDR_MASK) >> 27), 3); break;
2388 /* 64bit Linux direct map at 0xffff810000000000; older kernels
2389 * had it at 0x0000010000000000UL */
2390 gfn = mfn_to_gfn(v->domain, gmfn);
2391 GUESS(0xffff810000000000UL + (gfn << PAGE_SHIFT), 4);
2392 GUESS(0x0000010000000000UL + (gfn << PAGE_SHIFT), 4);
2393 /*
2394 * 64bit Solaris kernel page map at
2395 * kpm_vbase; 0xfffffe0000000000UL
2396 */
2397 GUESS(0xfffffe0000000000UL + (gfn << PAGE_SHIFT), 4);
2399 #endif /* CONFIG_PAGING_LEVELS >= 4 */
2401 #undef GUESS
2404 if ( (pg->u.inuse.type_info & PGT_count_mask) == 0 )
2405 return 1;
2407 /* Second heuristic: on HIGHPTE linux, there are two particular PTEs
2408 * (entries in the fixmap) where linux maps its pagetables. Since
2409 * we expect to hit them most of the time, we start the search for
2410 * the writeable mapping by looking at the same MFN where the last
2411 * brute-force search succeeded. */
2413 if ( v->arch.paging.shadow.last_writeable_pte_smfn != 0 )
2415 unsigned long old_count = (pg->u.inuse.type_info & PGT_count_mask);
2416 mfn_t last_smfn = _mfn(v->arch.paging.shadow.last_writeable_pte_smfn);
2417 int shtype = mfn_to_shadow_page(last_smfn)->type;
2419 if ( callbacks[shtype] )
2420 callbacks[shtype](v, last_smfn, gmfn);
2422 if ( (pg->u.inuse.type_info & PGT_count_mask) != old_count )
2423 perfc_incr(shadow_writeable_h_5);
2426 if ( (pg->u.inuse.type_info & PGT_count_mask) == 0 )
2427 return 1;
2429 #endif /* SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC */
2431 /* Brute-force search of all the shadows, by walking the hash */
2432 trace_shadow_wrmap_bf(gmfn);
2433 if ( level == 0 )
2434 perfc_incr(shadow_writeable_bf_1);
2435 else
2436 perfc_incr(shadow_writeable_bf);
2437 hash_foreach(v, callback_mask, callbacks, gmfn);
2439 /* If that didn't catch the mapping, then there's some non-pagetable
2440 * mapping -- ioreq page, grant mapping, &c. */
2441 if ( (mfn_to_page(gmfn)->u.inuse.type_info & PGT_count_mask) != 0 )
2443 if ( level == 0 )
2444 return -1;
2446 SHADOW_ERROR("can't remove write access to mfn %lx: guest has "
2447 "%lu special-use mappings of it\n", mfn_x(gmfn),
2448 (mfn_to_page(gmfn)->u.inuse.type_info&PGT_count_mask));
2449 domain_crash(v->domain);
2452 /* We killed at least one writeable mapping, so must flush TLBs. */
2453 return 1;
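/* [Editor's illustrative sketch -- not part of common.c]
 * The GUESS() addresses in the heuristic above all come from linear
 * ("recursive") pagetable mappings: if a guest maps its own pagetables at a
 * fixed virtual base, the l1 entry for address va sits at
 * base + (va >> 12) * entry_size, which is base + (va >> 10) for 4-byte
 * entries and base + (va >> 9) for 8-byte PAE/long-mode entries -- exactly
 * the shifts used above.  A standalone statement of that arithmetic,
 * assuming 4KiB pages: */
#include <assert.h>
#include <stdint.h>

#define EX_PAGE_SHIFT 12   /* assumed 4KiB pages */

static uint64_t linear_pte_addr(uint64_t base, uint64_t va,
                                unsigned int entry_size)
{
    return base + (va >> EX_PAGE_SHIFT) * entry_size;
}

int main(void)
{
    uint64_t base = 0xC0000000ull, va = 0x00345678ull;

    /* 4-byte entries: base + (va >> 10) with the low two bits dropped */
    assert(linear_pte_addr(base, va, 4) == base + ((va >> 10) & ~3ull));
    /* 8-byte entries: base + (va >> 9) with the low three bits dropped */
    assert(linear_pte_addr(base, va, 8) == base + ((va >> 9) & ~7ull));
    return 0;
}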
2456 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
2457 int sh_remove_write_access_from_sl1p(struct vcpu *v, mfn_t gmfn,
2458 mfn_t smfn, unsigned long off)
2460 struct shadow_page_info *sp = mfn_to_shadow_page(smfn);
2462 ASSERT(mfn_valid(smfn));
2463 ASSERT(mfn_valid(gmfn));
2465 if ( sp->type == SH_type_l1_32_shadow )
2467 return SHADOW_INTERNAL_NAME(sh_rm_write_access_from_sl1p,2)
2468 (v, gmfn, smfn, off);
2470 #if CONFIG_PAGING_LEVELS >= 3
2471 else if ( sp->type == SH_type_l1_pae_shadow )
2472 return SHADOW_INTERNAL_NAME(sh_rm_write_access_from_sl1p,3)
2473 (v, gmfn, smfn, off);
2474 #if CONFIG_PAGING_LEVELS >= 4
2475 else if ( sp->type == SH_type_l1_64_shadow )
2476 return SHADOW_INTERNAL_NAME(sh_rm_write_access_from_sl1p,4)
2477 (v, gmfn, smfn, off);
2478 #endif
2479 #endif
2481 return 0;
2483 #endif
2485 /**************************************************************************/
2486 /* Remove all mappings of a guest frame from the shadow tables.
2487 * Returns non-zero if we need to flush TLBs. */
2489 int sh_remove_all_mappings(struct vcpu *v, mfn_t gmfn)
2491 struct page_info *page = mfn_to_page(gmfn);
2492 int expected_count, do_locking;
2494 /* Dispatch table for getting per-type functions */
2495 static hash_callback_t callbacks[SH_type_unused] = {
2496 NULL, /* none */
2497 SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1, 2), /* l1_32 */
2498 SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1, 2), /* fl1_32 */
2499 NULL, /* l2_32 */
2500 SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1, 3), /* l1_pae */
2501 SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1, 3), /* fl1_pae */
2502 NULL, /* l2_pae */
2503 NULL, /* l2h_pae */
2504 #if CONFIG_PAGING_LEVELS >= 4
2505 SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1, 4), /* l1_64 */
2506 SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1, 4), /* fl1_64 */
2507 #else
2508 NULL, /* l1_64 */
2509 NULL, /* fl1_64 */
2510 #endif
2511 NULL, /* l2_64 */
2512 NULL, /* l2h_64 */
2513 NULL, /* l3_64 */
2514 NULL, /* l4_64 */
2515 NULL, /* p2m */
2516 NULL /* unused */
2517 };
2519 static unsigned int callback_mask =
2520 1 << SH_type_l1_32_shadow
2521 | 1 << SH_type_fl1_32_shadow
2522 | 1 << SH_type_l1_pae_shadow
2523 | 1 << SH_type_fl1_pae_shadow
2524 | 1 << SH_type_l1_64_shadow
2525 | 1 << SH_type_fl1_64_shadow
2528 perfc_incr(shadow_mappings);
2529 if ( (page->count_info & PGC_count_mask) == 0 )
2530 return 0;
2532 /* Although this is an externally visible function, we do not know
2533 * whether the shadow lock will be held when it is called (since it
2534 * can be called via put_page_type when we clear a shadow l1e).
2535 * If the lock isn't held, take it for the duration of the call. */
2536 do_locking = !shadow_locked_by_me(v->domain);
2537 if ( do_locking ) shadow_lock(v->domain);
2539 /* XXX TODO:
2540 * Heuristics for finding the (probably) single mapping of this gmfn */
2542 /* Brute-force search of all the shadows, by walking the hash */
2543 perfc_incr(shadow_mappings_bf);
2544 hash_foreach(v, callback_mask, callbacks, gmfn);
2546 /* If that didn't catch the mapping, something is very wrong */
2547 expected_count = (page->count_info & PGC_allocated) ? 1 : 0;
2548 if ( (page->count_info & PGC_count_mask) != expected_count )
2550 /* Don't complain if we're in HVM and there are some extra mappings:
2551 * The qemu helper process has an untyped mapping of this dom's RAM
2552 * and the HVM restore program takes another. */
2553 if ( !(shadow_mode_external(v->domain)
2554 && (page->count_info & PGC_count_mask) <= 3
2555 && (page->u.inuse.type_info & PGT_count_mask) == 0) )
2557 SHADOW_ERROR("can't find all mappings of mfn %lx: "
2558 "c=%08x t=%08lx\n", mfn_x(gmfn),
2559 page->count_info, page->u.inuse.type_info);
2563 if ( do_locking ) shadow_unlock(v->domain);
2565 /* We killed at least one mapping, so must flush TLBs. */
2566 return 1;
2570 /**************************************************************************/
2571 /* Remove all shadows of a guest frame from the shadow tables */
2573 static int sh_remove_shadow_via_pointer(struct vcpu *v, mfn_t smfn)
2574 /* Follow this shadow's up-pointer, if it has one, and remove the reference
2575 * found there. Returns 1 if that was the only reference to this shadow */
2577 struct shadow_page_info *sp = mfn_to_shadow_page(smfn);
2578 mfn_t pmfn;
2579 void *vaddr;
2580 int rc;
2582 ASSERT(sp->type > 0);
2583 ASSERT(sp->type < SH_type_max_shadow);
2584 ASSERT(sp->type != SH_type_l2_32_shadow);
2585 ASSERT(sp->type != SH_type_l2_pae_shadow);
2586 ASSERT(sp->type != SH_type_l2h_pae_shadow);
2587 ASSERT(sp->type != SH_type_l4_64_shadow);
2589 if (sp->up == 0) return 0;
2590 pmfn = _mfn(sp->up >> PAGE_SHIFT);
2591 ASSERT(mfn_valid(pmfn));
2592 vaddr = sh_map_domain_page(pmfn);
2593 ASSERT(vaddr);
2594 vaddr += sp->up & (PAGE_SIZE-1);
2595 ASSERT(l1e_get_pfn(*(l1_pgentry_t *)vaddr) == mfn_x(smfn));
2597 /* Is this the only reference to this shadow? */
2598 rc = (sp->count == 1) ? 1 : 0;
2600 /* Blank the offending entry */
2601 switch (sp->type)
2603 case SH_type_l1_32_shadow:
2604 case SH_type_l2_32_shadow:
2605 SHADOW_INTERNAL_NAME(sh_clear_shadow_entry, 2)(v, vaddr, pmfn);
2606 break;
2607 case SH_type_l1_pae_shadow:
2608 case SH_type_l2_pae_shadow:
2609 case SH_type_l2h_pae_shadow:
2610 SHADOW_INTERNAL_NAME(sh_clear_shadow_entry, 3)(v, vaddr, pmfn);
2611 break;
2612 #if CONFIG_PAGING_LEVELS >= 4
2613 case SH_type_l1_64_shadow:
2614 case SH_type_l2_64_shadow:
2615 case SH_type_l2h_64_shadow:
2616 case SH_type_l3_64_shadow:
2617 case SH_type_l4_64_shadow:
2618 SHADOW_INTERNAL_NAME(sh_clear_shadow_entry, 4)(v, vaddr, pmfn);
2619 break;
2620 #endif
2621 default: BUG(); /* Some weird unknown shadow type */
2624 sh_unmap_domain_page(vaddr);
2625 if ( rc )
2626 perfc_incr(shadow_up_pointer);
2627 else
2628 perfc_incr(shadow_unshadow_bf);
2630 return rc;
2633 void sh_remove_shadows(struct vcpu *v, mfn_t gmfn, int fast, int all)
2634 /* Remove the shadows of this guest page.
2635 * If fast != 0, just try the quick heuristic, which will remove
2636 * at most one reference to each shadow of the page. Otherwise, walk
2637 * all the shadow tables looking for refs to shadows of this gmfn.
2638 * If all != 0, kill the domain if we can't find all the shadows.
2639 * (all != 0 implies fast == 0)
2640 */
2642 struct page_info *pg = mfn_to_page(gmfn);
2643 mfn_t smfn;
2644 int do_locking;
2645 unsigned char t;
2647 /* Dispatch table for getting per-type functions: each level must
2648 * be called with the function to remove a lower-level shadow. */
2649 static hash_callback_t callbacks[SH_type_unused] = {
2650 NULL, /* none */
2651 NULL, /* l1_32 */
2652 NULL, /* fl1_32 */
2653 SHADOW_INTERNAL_NAME(sh_remove_l1_shadow, 2), /* l2_32 */
2654 NULL, /* l1_pae */
2655 NULL, /* fl1_pae */
2656 SHADOW_INTERNAL_NAME(sh_remove_l1_shadow, 3), /* l2_pae */
2657 SHADOW_INTERNAL_NAME(sh_remove_l1_shadow, 3), /* l2h_pae */
2658 NULL, /* l1_64 */
2659 NULL, /* fl1_64 */
2660 #if CONFIG_PAGING_LEVELS >= 4
2661 SHADOW_INTERNAL_NAME(sh_remove_l1_shadow, 4), /* l2_64 */
2662 SHADOW_INTERNAL_NAME(sh_remove_l1_shadow, 4), /* l2h_64 */
2663 SHADOW_INTERNAL_NAME(sh_remove_l2_shadow, 4), /* l3_64 */
2664 SHADOW_INTERNAL_NAME(sh_remove_l3_shadow, 4), /* l4_64 */
2665 #else
2666 NULL, /* l2_64 */
2667 NULL, /* l2h_64 */
2668 NULL, /* l3_64 */
2669 NULL, /* l4_64 */
2670 #endif
2671 NULL, /* p2m */
2672 NULL /* unused */
2673 };
2675 /* Another lookup table, for choosing which mask to use */
2676 static unsigned int masks[SH_type_unused] = {
2677 0, /* none */
2678 1 << SH_type_l2_32_shadow, /* l1_32 */
2679 0, /* fl1_32 */
2680 0, /* l2_32 */
2681 ((1 << SH_type_l2h_pae_shadow)
2682 | (1 << SH_type_l2_pae_shadow)), /* l1_pae */
2683 0, /* fl1_pae */
2684 0, /* l2_pae */
2685 0, /* l2h_pae */
2686 ((1 << SH_type_l2h_64_shadow)
2687 | (1 << SH_type_l2_64_shadow)), /* l1_64 */
2688 0, /* fl1_64 */
2689 1 << SH_type_l3_64_shadow, /* l2_64 */
2690 1 << SH_type_l3_64_shadow, /* l2h_64 */
2691 1 << SH_type_l4_64_shadow, /* l3_64 */
2692 0, /* l4_64 */
2693 0, /* p2m */
2694 0 /* unused */
2695 };
2697 ASSERT(!(all && fast));
2699 /* Although this is an externally visible function, we do not know
2700 * whether the shadow lock will be held when it is called (since it
2701 * can be called via put_page_type when we clear a shadow l1e).
2702 * If the lock isn't held, take it for the duration of the call. */
2703 do_locking = !shadow_locked_by_me(v->domain);
2704 if ( do_locking ) shadow_lock(v->domain);
2706 SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx\n",
2707 v->domain->domain_id, v->vcpu_id, mfn_x(gmfn));
2709 /* Bail out now if the page is not shadowed */
2710 if ( (pg->count_info & PGC_page_table) == 0 )
2712 if ( do_locking ) shadow_unlock(v->domain);
2713 return;
2716 /* Search for this shadow in all appropriate shadows */
2717 perfc_incr(shadow_unshadow);
2719 /* Lower-level shadows need to be excised from upper-level shadows.
2720 * This call to hash_foreach() looks dangerous but is in fact OK: each
2721 * call will remove at most one shadow, and terminate immediately when
2722 * it does remove it, so we never walk the hash after doing a deletion. */
2723 #define DO_UNSHADOW(_type) do { \
2724 t = (_type); \
2725 if( !(pg->count_info & PGC_page_table) \
2726 || !(pg->shadow_flags & (1 << t)) ) \
2727 break; \
2728 smfn = shadow_hash_lookup(v, mfn_x(gmfn), t); \
2729 if ( unlikely(!mfn_valid(smfn)) ) \
2730 { \
2731 SHADOW_ERROR(": gmfn %#lx has flags 0x%"PRIx32 \
2732 " but no type-0x%"PRIx32" shadow\n", \
2733 mfn_x(gmfn), (uint32_t)pg->shadow_flags, t); \
2734 break; \
2735 } \
2736 if ( sh_type_is_pinnable(v, t) ) \
2737 sh_unpin(v, smfn); \
2738 else \
2739 sh_remove_shadow_via_pointer(v, smfn); \
2740 if( !fast \
2741 && (pg->count_info & PGC_page_table) \
2742 && (pg->shadow_flags & (1 << t)) ) \
2743 hash_foreach(v, masks[t], callbacks, smfn); \
2744 } while (0)
2746 DO_UNSHADOW(SH_type_l2_32_shadow);
2747 DO_UNSHADOW(SH_type_l1_32_shadow);
2748 DO_UNSHADOW(SH_type_l2h_pae_shadow);
2749 DO_UNSHADOW(SH_type_l2_pae_shadow);
2750 DO_UNSHADOW(SH_type_l1_pae_shadow);
2751 #if CONFIG_PAGING_LEVELS >= 4
2752 DO_UNSHADOW(SH_type_l4_64_shadow);
2753 DO_UNSHADOW(SH_type_l3_64_shadow);
2754 DO_UNSHADOW(SH_type_l2h_64_shadow);
2755 DO_UNSHADOW(SH_type_l2_64_shadow);
2756 DO_UNSHADOW(SH_type_l1_64_shadow);
2757 #endif
2759 #undef DO_UNSHADOW
2761 /* If that didn't catch the shadows, something is wrong */
2762 if ( !fast && all && (pg->count_info & PGC_page_table) )
2764 SHADOW_ERROR("can't find all shadows of mfn %05lx "
2765 "(shadow_flags=%08x)\n",
2766 mfn_x(gmfn), pg->shadow_flags);
2767 domain_crash(v->domain);
2770 /* Need to flush TLBs now, so that linear maps are safe next time we
2771 * take a fault. */
2772 flush_tlb_mask(v->domain->domain_dirty_cpumask);
2774 if ( do_locking ) shadow_unlock(v->domain);
2777 static void
2778 sh_remove_all_shadows_and_parents(struct vcpu *v, mfn_t gmfn)
2779 /* Even harsher: this is an HVM page that we think is no longer a pagetable.
2780 * Unshadow it, and recursively unshadow pages that reference it. */
2782 sh_remove_shadows(v, gmfn, 0, 1);
2783 /* XXX TODO:
2784 * Rework this hashtable walker to return a linked-list of all
2785 * the shadows it modified, then do breadth-first recursion
2786 * to find the way up to higher-level tables and unshadow them too.
2788 * The current code (just tearing down each page's shadows as we
2789 * detect that it is not a pagetable) is correct, but very slow.
2790 * It means extra emulated writes and slows down removal of mappings. */
2793 /**************************************************************************/
2795 static void sh_update_paging_modes(struct vcpu *v)
2797 struct domain *d = v->domain;
2798 struct paging_mode *old_mode = v->arch.paging.mode;
2800 ASSERT(shadow_locked_by_me(d));
2802 #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
2803 /* Make sure this vcpu has a virtual TLB array allocated */
2804 if ( unlikely(!v->arch.paging.vtlb) )
2806 v->arch.paging.vtlb = xmalloc_array(struct shadow_vtlb, VTLB_ENTRIES);
2807 if ( unlikely(!v->arch.paging.vtlb) )
2809 SHADOW_ERROR("Could not allocate vTLB space for dom %u vcpu %u\n",
2810 d->domain_id, v->vcpu_id);
2811 domain_crash(v->domain);
2812 return;
2814 memset(v->arch.paging.vtlb, 0,
2815 VTLB_ENTRIES * sizeof (struct shadow_vtlb));
2816 spin_lock_init(&v->arch.paging.vtlb_lock);
2818 #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */
2820 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
2821 if ( mfn_x(v->arch.paging.shadow.oos_snapshot[0]) == INVALID_MFN )
2823 int i;
2824 for(i = 0; i < SHADOW_OOS_PAGES; i++)
2826 shadow_prealloc(d, SH_type_oos_snapshot, 1);
2827 v->arch.paging.shadow.oos_snapshot[i] =
2828 shadow_alloc(d, SH_type_oos_snapshot, 0);
2831 #endif /* OOS */
2833 // Valid transitions handled by this function:
2834 // - For PV guests:
2835 // - after a shadow mode has been changed
2836 // - For HVM guests:
2837 // - after a shadow mode has been changed
2838 // - changes in CR0.PG, CR4.PAE, CR4.PSE, or CR4.PGE
2839 //
2841 // First, tear down any old shadow tables held by this vcpu.
2842 //
2843 if ( v->arch.paging.mode )
2844 v->arch.paging.mode->shadow.detach_old_tables(v);
2846 if ( !is_hvm_domain(d) )
2848 ///
2849 /// PV guest
2850 ///
2851 #if CONFIG_PAGING_LEVELS == 4
2852 v->arch.paging.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode, 4);
2853 #else /* CONFIG_PAGING_LEVELS == 3 */
2854 v->arch.paging.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode, 3);
2855 #endif
2857 else
2859 ///
2860 /// HVM guest
2861 ///
2862 ASSERT(shadow_mode_translate(d));
2863 ASSERT(shadow_mode_external(d));
2865 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
2866 /* Need to resync all our pages now, because if a page goes out
2867 * of sync with paging enabled and is resynced with paging
2868 * disabled, the resync will go wrong. */
2869 shadow_resync_all(v, 0);
2870 #endif /* OOS */
2872 if ( !hvm_paging_enabled(v) )
2874 /* When the guest has CR0.PG clear, we provide a 32-bit, non-PAE
2875 * pagetable for it, mapping 4 GB one-to-one using a single l2
2876 * page of 1024 superpage mappings */
2877 v->arch.guest_table = d->arch.paging.shadow.unpaged_pagetable;
2878 v->arch.paging.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode, 2);
2880 else
2882 #ifdef __x86_64__
2883 if ( hvm_long_mode_enabled(v) )
2885 // long mode guest...
2886 v->arch.paging.mode =
2887 &SHADOW_INTERNAL_NAME(sh_paging_mode, 4);
2889 else
2890 #endif
2891 if ( hvm_pae_enabled(v) )
2893 // 32-bit PAE mode guest...
2894 v->arch.paging.mode =
2895 &SHADOW_INTERNAL_NAME(sh_paging_mode, 3);
2897 else
2899 // 32-bit 2 level guest...
2900 v->arch.paging.mode =
2901 &SHADOW_INTERNAL_NAME(sh_paging_mode, 2);
2905 if ( pagetable_is_null(v->arch.monitor_table) )
2907 mfn_t mmfn = v->arch.paging.mode->shadow.make_monitor_table(v);
2908 v->arch.monitor_table = pagetable_from_mfn(mmfn);
2909 make_cr3(v, mfn_x(mmfn));
2910 hvm_update_host_cr3(v);
2913 if ( v->arch.paging.mode != old_mode )
2915 SHADOW_PRINTK("new paging mode: d=%u v=%u pe=%d gl=%u "
2916 "(was g=%u s=%u)\n",
2917 d->domain_id, v->vcpu_id,
2918 is_hvm_domain(d) ? hvm_paging_enabled(v) : 1,
2919 v->arch.paging.mode->guest_levels,
2920 v->arch.paging.mode->shadow.shadow_levels,
2921 old_mode ? old_mode->guest_levels : 0,
2922 old_mode ? old_mode->shadow.shadow_levels : 0);
2923 if ( old_mode &&
2924 (v->arch.paging.mode->shadow.shadow_levels !=
2925 old_mode->shadow.shadow_levels) )
2927 /* Need to make a new monitor table for the new mode */
2928 mfn_t new_mfn, old_mfn;
2930 if ( v != current && vcpu_runnable(v) )
2932 SHADOW_ERROR("Some third party (d=%u v=%u) is changing "
2933 "this HVM vcpu's (d=%u v=%u) paging mode "
2934 "while it is running.\n",
2935 current->domain->domain_id, current->vcpu_id,
2936 v->domain->domain_id, v->vcpu_id);
2937 /* It's not safe to do that because we can't change
2938 * the host CR3 for a running domain */
2939 domain_crash(v->domain);
2940 return;
2943 old_mfn = pagetable_get_mfn(v->arch.monitor_table);
2944 v->arch.monitor_table = pagetable_null();
2945 new_mfn = v->arch.paging.mode->shadow.make_monitor_table(v);
2946 v->arch.monitor_table = pagetable_from_mfn(new_mfn);
2947 SHADOW_PRINTK("new monitor table %"PRI_mfn "\n",
2948 mfn_x(new_mfn));
2950 /* Don't be running on the old monitor table when we
2951 * pull it down! Switch CR3, and warn the HVM code that
2952 * its host cr3 has changed. */
2953 make_cr3(v, mfn_x(new_mfn));
2954 if ( v == current )
2955 write_ptbase(v);
2956 hvm_update_host_cr3(v);
2957 old_mode->shadow.destroy_monitor_table(v, old_mfn);
2961 // XXX -- Need to deal with changes in CR4.PSE and CR4.PGE.
2962 // These are HARD: think about the case where two CPU's have
2963 // different values for CR4.PSE and CR4.PGE at the same time.
2964 // This *does* happen, at least for CR4.PGE...
2967 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
2968 /* We need to check that all the vcpus have paging enabled to
2969 * unsync PTs. */
2970 if ( is_hvm_domain(d) )
2972 int pe = 1;
2973 struct vcpu *vptr;
2975 for_each_vcpu(d, vptr)
2977 if ( !hvm_paging_enabled(vptr) )
2979 pe = 0;
2980 break;
2984 d->arch.paging.shadow.oos_active = pe;
2986 #endif /* OOS */
2988 v->arch.paging.mode->update_cr3(v, 0);
2991 void shadow_update_paging_modes(struct vcpu *v)
2993 shadow_lock(v->domain);
2994 sh_update_paging_modes(v);
2995 shadow_unlock(v->domain);
2998 /**************************************************************************/
2999 /* Turning on and off shadow features */
3001 static void sh_new_mode(struct domain *d, u32 new_mode)
3002 /* Inform all the vcpus that the shadow mode has been changed */
3004 struct vcpu *v;
3006 ASSERT(shadow_locked_by_me(d));
3007 ASSERT(d != current->domain);
3008 d->arch.paging.mode = new_mode;
3009 for_each_vcpu(d, v)
3010 sh_update_paging_modes(v);
3013 int shadow_enable(struct domain *d, u32 mode)
3014 /* Turn on "permanent" shadow features: external, translate, refcount.
3015 * Can only be called once on a domain, and these features cannot be
3016 * disabled.
3017 * Returns 0 for success, -errno for failure. */
3019 unsigned int old_pages;
3020 struct page_info *pg = NULL;
3021 uint32_t *e;
3022 int i, rv = 0;
3024 mode |= PG_SH_enable;
3026 domain_pause(d);
3028 /* Sanity check the arguments */
3029 if ( (d == current->domain) ||
3030 shadow_mode_enabled(d) ||
3031 ((mode & PG_translate) && !(mode & PG_refcounts)) ||
3032 ((mode & PG_external) && !(mode & PG_translate)) )
3034 rv = -EINVAL;
3035 goto out_unlocked;
3038 /* Init the shadow memory allocation if the user hasn't done so */
3039 old_pages = d->arch.paging.shadow.total_pages;
3040 if ( old_pages == 0 )
3042 unsigned int r;
3043 shadow_lock(d);
3044 r = sh_set_allocation(d, 256, NULL); /* Use at least 1MB */
3045 if ( r != 0 )
3047 sh_set_allocation(d, 0, NULL);
3048 rv = -ENOMEM;
3049 goto out_locked;
3051 shadow_unlock(d);
3054 /* Init the P2M table. Must be done before we take the shadow lock
3055 * to avoid possible deadlock. */
3056 if ( mode & PG_translate )
3058 rv = p2m_alloc_table(d, shadow_alloc_p2m_page, shadow_free_p2m_page);
3059 if (rv != 0)
3060 goto out_unlocked;
3063 /* HVM domains need an extra pagetable for vcpus that think they
3064 * have paging disabled */
3065 if ( is_hvm_domain(d) )
3067 /* Get a single page from the shadow pool. Take it via the
3068 * P2M interface to make freeing it simpler afterwards. */
3069 pg = shadow_alloc_p2m_page(d);
3070 if ( pg == NULL )
3072 rv = -ENOMEM;
3073 goto out_unlocked;
3075 /* Fill it with 32-bit, non-PAE superpage entries, each mapping 4MB
3076 * of virtual address space onto the same physical address range */
3077 e = sh_map_domain_page(page_to_mfn(pg));
3078 for ( i = 0; i < PAGE_SIZE / sizeof(*e); i++ )
3079 e[i] = ((0x400000U * i)
3080 | _PAGE_PRESENT | _PAGE_RW | _PAGE_USER
3081 | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
3082 sh_unmap_domain_page(e);
3083 pg->u.inuse.type_info = PGT_l2_page_table | 1 | PGT_validated;
3086 shadow_lock(d);
3088 /* Sanity check again with the lock held */
3089 if ( shadow_mode_enabled(d) )
3091 rv = -EINVAL;
3092 goto out_locked;
3095 /* Init the hash table */
3096 if ( shadow_hash_alloc(d) != 0 )
3098 rv = -ENOMEM;
3099 goto out_locked;
3102 #if (SHADOW_OPTIMIZATIONS & SHOPT_LINUX_L3_TOPLEVEL)
3103 /* We assume we're dealing with an older 64bit linux guest until we
3104 * see the guest use more than one l4 per vcpu. */
3105 d->arch.paging.shadow.opt_flags = SHOPT_LINUX_L3_TOPLEVEL;
3106 #endif
3108 /* Record the 1-to-1 pagetable we just made */
3109 if ( is_hvm_domain(d) )
3110 d->arch.paging.shadow.unpaged_pagetable = pagetable_from_page(pg);
3112 /* Update the bits */
3113 sh_new_mode(d, mode);
3115 out_locked:
3116 shadow_unlock(d);
3117 out_unlocked:
3118 if ( rv != 0 && !pagetable_is_null(d->arch.phys_table) )
3119 p2m_teardown(d);
3120 if ( rv != 0 && pg != NULL )
3121 shadow_free_p2m_page(d, pg);
3122 domain_unpause(d);
3123 return rv;
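/* [Editor's illustrative sketch -- not part of common.c]
 * The loop in shadow_enable() above writes 1024 4MB (PSE) entries: entry i
 * maps virtual addresses [i*4MB, (i+1)*4MB) one-to-one onto the same
 * physical range, so the single l2 page covers the whole 4GB.  A quick
 * standalone check of that arithmetic, with the flag bits left out: */
#include <assert.h>
#include <stdint.h>

#define EX_SUPERPAGE_SIZE 0x400000u   /* 4MB per non-PAE l2 superpage */
#define EX_NR_ENTRIES     1024u       /* entries per l2 page */

int main(void)
{
    for ( uint32_t i = 0; i < EX_NR_ENTRIES; i++ )
    {
        uint32_t frame_base = EX_SUPERPAGE_SIZE * i;  /* as in e[i] above */
        /* Entry i is selected by bits 31..22 of the virtual address, so a
         * one-to-one mapping needs the frame base to share those bits. */
        assert((frame_base >> 22) == i);
    }
    /* 1024 entries x 4MB == 4GB of one-to-one mappings. */
    assert((uint64_t)EX_NR_ENTRIES * EX_SUPERPAGE_SIZE == 0x100000000ull);
    return 0;
}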
3126 void shadow_teardown(struct domain *d)
3127 /* Destroy the shadow pagetables of this domain and free its shadow memory.
3128 * Should only be called for dying domains. */
3130 struct vcpu *v;
3131 mfn_t mfn;
3132 struct list_head *entry, *n;
3133 struct page_info *pg;
3135 ASSERT(d->is_dying);
3136 ASSERT(d != current->domain);
3138 if ( !shadow_locked_by_me(d) )
3139 shadow_lock(d); /* Keep various asserts happy */
3141 if ( shadow_mode_enabled(d) )
3143 /* Release the shadow and monitor tables held by each vcpu */
3144 for_each_vcpu(d, v)
3146 if ( v->arch.paging.mode )
3148 v->arch.paging.mode->shadow.detach_old_tables(v);
3149 if ( shadow_mode_external(d) )
3151 mfn = pagetable_get_mfn(v->arch.monitor_table);
3152 if ( mfn_valid(mfn) && (mfn_x(mfn) != 0) )
3153 v->arch.paging.mode->shadow.destroy_monitor_table(v, mfn);
3154 v->arch.monitor_table = pagetable_null();
3160 #if (SHADOW_OPTIMIZATIONS & (SHOPT_VIRTUAL_TLB|SHOPT_OUT_OF_SYNC))
3161 /* Free the virtual-TLB array attached to each vcpu */
3162 for_each_vcpu(d, v)
3164 #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
3165 if ( v->arch.paging.vtlb )
3167 xfree(v->arch.paging.vtlb);
3168 v->arch.paging.vtlb = NULL;
3170 #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */
3172 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
3174 int i;
3175 mfn_t *oos_snapshot = v->arch.paging.shadow.oos_snapshot;
3176 for(i = 0; i < SHADOW_OOS_PAGES; i++)
3177 if ( mfn_valid(oos_snapshot[i]) )
3178 shadow_free(d, oos_snapshot[i]);
3180 #endif /* OOS */
3182 #endif /* (SHADOW_OPTIMIZATIONS & (SHOPT_VIRTUAL_TLB|SHOPT_OUT_OF_SYNC)) */
3184 list_for_each_safe(entry, n, &d->arch.paging.shadow.p2m_freelist)
3186 list_del(entry);
3187 pg = list_entry(entry, struct page_info, list);
3188 shadow_free_p2m_page(d, pg);
3191 if ( d->arch.paging.shadow.total_pages != 0 )
3193 SHADOW_PRINTK("teardown of domain %u starts."
3194 " Shadow pages total = %u, free = %u, p2m=%u\n",
3195 d->domain_id,
3196 d->arch.paging.shadow.total_pages,
3197 d->arch.paging.shadow.free_pages,
3198 d->arch.paging.shadow.p2m_pages);
3199 /* Destroy all the shadows and release memory to domheap */
3200 sh_set_allocation(d, 0, NULL);
3201 /* Release the hash table back to xenheap */
3202 if (d->arch.paging.shadow.hash_table)
3203 shadow_hash_teardown(d);
3204 /* Should not have any more memory held */
3205 SHADOW_PRINTK("teardown done."
3206 " Shadow pages total = %u, free = %u, p2m=%u\n",
3207 d->arch.paging.shadow.total_pages,
3208 d->arch.paging.shadow.free_pages,
3209 d->arch.paging.shadow.p2m_pages);
3210 ASSERT(d->arch.paging.shadow.total_pages == 0);
3213 /* Free the non-paged-vcpus pagetable; must happen after we've
3214 * destroyed any shadows of it or sh_destroy_shadow will get confused. */
3215 if ( !pagetable_is_null(d->arch.paging.shadow.unpaged_pagetable) )
3217 for_each_vcpu(d, v)
3219 ASSERT(is_hvm_vcpu(v));
3220 if ( !hvm_paging_enabled(v) )
3221 v->arch.guest_table = pagetable_null();
3223 shadow_free_p2m_page(d,
3224 pagetable_get_page(d->arch.paging.shadow.unpaged_pagetable));
3225 d->arch.paging.shadow.unpaged_pagetable = pagetable_null();
3228 /* We leave the "permanent" shadow modes enabled, but clear the
3229 * log-dirty mode bit. We don't want any more mark_dirty()
3230 * calls now that we've torn down the bitmap */
3231 d->arch.paging.mode &= ~PG_log_dirty;
3233 if (d->dirty_vram) {
3234 xfree(d->dirty_vram->sl1ma);
3235 xfree(d->dirty_vram->dirty_bitmap);
3236 xfree(d->dirty_vram);
3237 d->dirty_vram = NULL;
3240 shadow_unlock(d);
3243 void shadow_final_teardown(struct domain *d)
3244 /* Called by arch_domain_destroy(), when it's safe to pull down the p2m map. */
3246 SHADOW_PRINTK("dom %u final teardown starts."
3247 " Shadow pages total = %u, free = %u, p2m=%u\n",
3248 d->domain_id,
3249 d->arch.paging.shadow.total_pages,
3250 d->arch.paging.shadow.free_pages,
3251 d->arch.paging.shadow.p2m_pages);
3253 /* Double-check that the domain didn't have any shadow memory.
3254 * It is possible for a domain that never got domain_kill()ed
3255 * to get here with its shadow allocation intact. */
3256 if ( d->arch.paging.shadow.total_pages != 0 )
3257 shadow_teardown(d);
3259 /* It is now safe to pull down the p2m map. */
3260 p2m_teardown(d);
3262 SHADOW_PRINTK("dom %u final teardown done."
3263 " Shadow pages total = %u, free = %u, p2m=%u\n",
3264 d->domain_id,
3265 d->arch.paging.shadow.total_pages,
3266 d->arch.paging.shadow.free_pages,
3267 d->arch.paging.shadow.p2m_pages);
3270 static int shadow_one_bit_enable(struct domain *d, u32 mode)
3271 /* Turn on a single shadow mode feature */
3273 ASSERT(shadow_locked_by_me(d));
3275 /* Sanity check the call */
3276 if ( d == current->domain || (d->arch.paging.mode & mode) == mode )
3278 return -EINVAL;
3281 mode |= PG_SH_enable;
3283 if ( d->arch.paging.mode == 0 )
3285 /* Init the shadow memory allocation and the hash table */
3286 if ( sh_set_allocation(d, 1, NULL) != 0
3287 || shadow_hash_alloc(d) != 0 )
3289 sh_set_allocation(d, 0, NULL);
3290 return -ENOMEM;
3294 /* Update the bits */
3295 sh_new_mode(d, d->arch.paging.mode | mode);
3297 return 0;
3300 static int shadow_one_bit_disable(struct domain *d, u32 mode)
3301 /* Turn off a single shadow mode feature */
3303 struct vcpu *v;
3304 ASSERT(shadow_locked_by_me(d));
3306 /* Sanity check the call */
3307 if ( d == current->domain || !((d->arch.paging.mode & mode) == mode) )
3309 return -EINVAL;
3312 /* Update the bits */
3313 sh_new_mode(d, d->arch.paging.mode & ~mode);
3314 if ( d->arch.paging.mode == 0 )
3316 /* Get this domain off shadows */
3317 SHADOW_PRINTK("un-shadowing of domain %u starts."
3318 " Shadow pages total = %u, free = %u, p2m=%u\n",
3319 d->domain_id,
3320 d->arch.paging.shadow.total_pages,
3321 d->arch.paging.shadow.free_pages,
3322 d->arch.paging.shadow.p2m_pages);
3323 for_each_vcpu(d, v)
3325 if ( v->arch.paging.mode )
3326 v->arch.paging.mode->shadow.detach_old_tables(v);
3327 #if CONFIG_PAGING_LEVELS == 4
3328 if ( !(v->arch.flags & TF_kernel_mode) )
3329 make_cr3(v, pagetable_get_pfn(v->arch.guest_table_user));
3330 else
3331 #endif
3332 make_cr3(v, pagetable_get_pfn(v->arch.guest_table));
3336 /* Pull down the memory allocation */
3337 if ( sh_set_allocation(d, 0, NULL) != 0 )
3339 // XXX - How can this occur?
3340 // Seems like a bug to return an error now that we've
3341 // disabled the relevant shadow mode.
3342 //
3343 return -ENOMEM;
3345 shadow_hash_teardown(d);
3346 SHADOW_PRINTK("un-shadowing of domain %u done."
3347 " Shadow pages total = %u, free = %u, p2m=%u\n",
3348 d->domain_id,
3349 d->arch.paging.shadow.total_pages,
3350 d->arch.paging.shadow.free_pages,
3351 d->arch.paging.shadow.p2m_pages);
3354 return 0;
3357 /* Enable/disable ops for the "test" and "log-dirty" modes */
3358 static int shadow_test_enable(struct domain *d)
3360 int ret;
3362 domain_pause(d);
3363 shadow_lock(d);
3364 ret = shadow_one_bit_enable(d, PG_SH_enable);
3365 shadow_unlock(d);
3366 domain_unpause(d);
3368 return ret;
3371 static int shadow_test_disable(struct domain *d)
3373 int ret;
3375 domain_pause(d);
3376 shadow_lock(d);
3377 ret = shadow_one_bit_disable(d, PG_SH_enable);
3378 shadow_unlock(d);
3379 domain_unpause(d);
3381 return ret;
3384 /**************************************************************************/
3385 /* P2M map manipulations */
3387 /* shadow specific code which should be called when P2M table entry is updated
3388 * with new content. It is responsible for updating the entry, as well as other
3389 * shadow processing jobs.
3390 */
3391 void
3392 shadow_write_p2m_entry(struct vcpu *v, unsigned long gfn,
3393 l1_pgentry_t *p, mfn_t table_mfn,
3394 l1_pgentry_t new, unsigned int level)
3396 struct domain *d = v->domain;
3398 shadow_lock(d);
3400 /* If we're removing an MFN from the p2m, remove it from the shadows too */
3401 if ( level == 1 )
3403 mfn_t mfn = _mfn(l1e_get_pfn(*p));
3404 p2m_type_t p2mt = p2m_flags_to_type(l1e_get_flags(*p));
3405 if ( p2m_is_valid(p2mt) && mfn_valid(mfn) )
3407 sh_remove_all_shadows_and_parents(v, mfn);
3408 if ( sh_remove_all_mappings(v, mfn) )
3409 flush_tlb_mask(d->domain_dirty_cpumask);
3413 /* If we're removing a superpage mapping from the p2m, we need to check
3414 * all the pages covered by it. If they're still there in the new
3415 * scheme, that's OK, but otherwise they must be unshadowed. */
3416 if ( level == 2 && (l1e_get_flags(*p) & _PAGE_PRESENT) &&
3417 (l1e_get_flags(*p) & _PAGE_PSE) )
3419 unsigned int i;
3420 cpumask_t flushmask;
3421 mfn_t omfn = _mfn(l1e_get_pfn(*p));
3422 mfn_t nmfn = _mfn(l1e_get_pfn(new));
3423 l1_pgentry_t *npte = NULL;
3424 p2m_type_t p2mt = p2m_flags_to_type(l1e_get_flags(*p));
3425 if ( p2m_is_valid(p2mt) && mfn_valid(omfn) )
3427 cpus_clear(flushmask);
3429 /* If we're replacing a superpage with a normal L1 page, map it */
3430 if ( (l1e_get_flags(new) & _PAGE_PRESENT)
3431 && !(l1e_get_flags(new) & _PAGE_PSE)
3432 && mfn_valid(nmfn) )
3433 npte = map_domain_page(mfn_x(nmfn));
3435 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
3437 if ( !npte
3438 || !p2m_is_ram(p2m_flags_to_type(l1e_get_flags(npte[i])))
3439 || l1e_get_pfn(npte[i]) != mfn_x(omfn) )
3441 /* This GFN->MFN mapping has gone away */
3442 sh_remove_all_shadows_and_parents(v, omfn);
3443 if ( sh_remove_all_mappings(v, omfn) )
3444 cpus_or(flushmask, flushmask, d->domain_dirty_cpumask);
3446 omfn = _mfn(mfn_x(omfn) + 1);
3448 flush_tlb_mask(flushmask);
3450 if ( npte )
3451 unmap_domain_page(npte);
3455 /* Update the entry with new content */
3456 safe_write_pte(p, new);
3458 /* install P2M in monitors for PAE Xen */
3459 #if CONFIG_PAGING_LEVELS == 3
3460 if ( level == 3 )
3461 /* We have written to the p2m l3: need to sync the per-vcpu
3462 * copies of it in the monitor tables */
3463 p2m_install_entry_in_monitors(d, (l3_pgentry_t *)p);
3464 #endif
3466 #if (SHADOW_OPTIMIZATIONS & SHOPT_FAST_FAULT_PATH)
3467 /* If we're doing FAST_FAULT_PATH, then shadow mode may have
3468 cached the fact that this is an mmio region in the shadow
3469 page tables. Blow the tables away to remove the cache.
3470 This is pretty heavy handed, but this is a rare operation
3471 (it might happen a dozen times during boot and then never
3472 again), so it doesn't matter too much. */
3473 if ( d->arch.paging.shadow.has_fast_mmio_entries )
3475 shadow_blow_tables(d);
3476 d->arch.paging.shadow.has_fast_mmio_entries = 0;
3478 #endif
3480 shadow_unlock(d);
3483 /**************************************************************************/
3484 /* Log-dirty mode support */
3486 /* Shadow specific code which is called in paging_log_dirty_enable().
3487 * Return 0 if no problem found.
3488 */
3489 int shadow_enable_log_dirty(struct domain *d)
3491 int ret;
3493 /* shadow lock is required here */
3494 shadow_lock(d);
3495 if ( shadow_mode_enabled(d) )
3497 /* This domain already has some shadows: need to clear them out
3498 * of the way to make sure that all references to guest memory are
3499 * properly write-protected */
3500 shadow_blow_tables(d);
3503 #if (SHADOW_OPTIMIZATIONS & SHOPT_LINUX_L3_TOPLEVEL)
3504 /* 32bit PV guests on 64bit xen behave like older 64bit linux: they
3505 * change an l4e instead of cr3 to switch tables. Give them the
3506 * same optimization */
3507 if ( is_pv_32on64_domain(d) )
3508 d->arch.paging.shadow.opt_flags = SHOPT_LINUX_L3_TOPLEVEL;
3509 #endif
3511 ret = shadow_one_bit_enable(d, PG_log_dirty);
3512 shadow_unlock(d);
3514 return ret;
3517 /* shadow specific code which is called in paging_log_dirty_disable() */
3518 int shadow_disable_log_dirty(struct domain *d)
3520 int ret;
3522 /* shadow lock is required here */
3523 shadow_lock(d);
3524 ret = shadow_one_bit_disable(d, PG_log_dirty);
3525 shadow_unlock(d);
3527 return ret;
3530 /* This function is called when we CLEAN log dirty bitmap. See
3531 * paging_log_dirty_op() for details.
3532 */
3533 void shadow_clean_dirty_bitmap(struct domain *d)
3535 shadow_lock(d);
3536 /* Need to revoke write access to the domain's pages again.
3537 * In future, we'll have a less heavy-handed approach to this,
3538 * but for now, we just unshadow everything except Xen. */
3539 shadow_blow_tables(d);
3540 shadow_unlock(d);
3544 /**************************************************************************/
3545 /* VRAM dirty tracking support */
3546 int shadow_track_dirty_vram(struct domain *d,
3547 unsigned long begin_pfn,
3548 unsigned long nr,
3549 XEN_GUEST_HANDLE_64(uint8) dirty_bitmap)
3551 int rc;
3552 unsigned long end_pfn = begin_pfn + nr;
3553 unsigned long dirty_size = (nr + 7) / 8;
3554 int flush_tlb = 0;
3555 unsigned long i;
3556 p2m_type_t t;
3558 if (end_pfn < begin_pfn
3559 || begin_pfn > d->arch.p2m->max_mapped_pfn
3560 || end_pfn >= d->arch.p2m->max_mapped_pfn)
3561 return -EINVAL;
3563 shadow_lock(d);
3565 if ( d->dirty_vram && (!nr ||
3566 ( begin_pfn != d->dirty_vram->begin_pfn
3567 || end_pfn != d->dirty_vram->end_pfn )) )
3569 /* Different tracking, tear the previous down. */
3570 gdprintk(XENLOG_INFO, "stopping tracking VRAM %lx - %lx\n", d->dirty_vram->begin_pfn, d->dirty_vram->end_pfn);
3571 xfree(d->dirty_vram->sl1ma);
3572 xfree(d->dirty_vram->dirty_bitmap);
3573 xfree(d->dirty_vram);
3574 d->dirty_vram = NULL;
3577 if ( !nr )
3579 rc = 0;
3580 goto out;
3583 /* This should only happen rarely (video mode change),
3584 * so no need to be careful. */
3585 if ( !d->dirty_vram )
3587 /* Just recount from start. */
3588 for ( i = begin_pfn; i < end_pfn; i++ ) {
3589 mfn_t mfn = gfn_to_mfn(d, i, &t);
3590 if (mfn_x(mfn) != INVALID_MFN)
3591 flush_tlb |= sh_remove_all_mappings(d->vcpu[0], mfn);
3594 gdprintk(XENLOG_INFO, "tracking VRAM %lx - %lx\n", begin_pfn, end_pfn);
3596 rc = -ENOMEM;
3597 if ( (d->dirty_vram = xmalloc(struct sh_dirty_vram)) == NULL )
3598 goto out;
3599 d->dirty_vram->begin_pfn = begin_pfn;
3600 d->dirty_vram->end_pfn = end_pfn;
3602 if ( (d->dirty_vram->sl1ma = xmalloc_array(paddr_t, nr)) == NULL )
3603 goto out_dirty_vram;
3604 memset(d->dirty_vram->sl1ma, ~0, sizeof(paddr_t) * nr);
3606 if ( (d->dirty_vram->dirty_bitmap = xmalloc_array(uint8_t, dirty_size)) == NULL )
3607 goto out_sl1ma;
3608 memset(d->dirty_vram->dirty_bitmap, 0, dirty_size);
3610 d->dirty_vram->last_dirty = NOW();
3612 /* Tell the caller that this time we could not track dirty bits. */
3613 rc = -ENODATA;
3615 else if (d->dirty_vram->last_dirty == -1)
3617 /* still completely clean, just copy our empty bitmap */
3618 rc = -EFAULT;
3619 if ( copy_to_guest(dirty_bitmap, d->dirty_vram->dirty_bitmap, dirty_size) == 0 )
3620 rc = 0;
3622 else
3624 #ifdef __i386__
3625 unsigned long map_mfn = INVALID_MFN;
3626 void *map_sl1p = NULL;
3627 #endif
3629 /* Iterate over VRAM to track dirty bits. */
3630 for ( i = 0; i < nr; i++ ) {
3631 mfn_t mfn = gfn_to_mfn(d, begin_pfn + i, &t);
3632 struct page_info *page;
3633 u32 count_info;
3634 int dirty = 0;
3635 paddr_t sl1ma = d->dirty_vram->sl1ma[i];
3637 if (mfn_x(mfn) == INVALID_MFN)
3639 dirty = 1;
3641 else
3643 page = mfn_to_page(mfn);
3644 count_info = page->u.inuse.type_info & PGT_count_mask;
3645 switch (count_info)
3647 case 0:
3648 /* No guest reference, nothing to track. */
3649 break;
3650 case 1:
3651 /* One guest reference. */
3652 if ( sl1ma == INVALID_PADDR )
3654 /* We don't know which sl1e points to this, too bad. */
3655 dirty = 1;
3656 /* TODO: Heuristics for finding the single mapping of
3657 * this gmfn */
3658 flush_tlb |= sh_remove_all_mappings(d->vcpu[0], mfn);
3660 else
3662 /* Hopefully the most common case: only one mapping,
3663 * whose dirty bit we can use. */
3664 l1_pgentry_t *sl1e;
3665 #ifdef __i386__
3666 void *sl1p = map_sl1p;
3667 unsigned long sl1mfn = paddr_to_pfn(sl1ma);
3669 if ( sl1mfn != map_mfn ) {
3670 if ( map_sl1p )
3671 sh_unmap_domain_page(map_sl1p);
3672 map_sl1p = sl1p = sh_map_domain_page(_mfn(sl1mfn));
3673 map_mfn = sl1mfn;
3674 }
3675 sl1e = sl1p + (sl1ma & ~PAGE_MASK);
3676 #else
3677 sl1e = maddr_to_virt(sl1ma);
3678 #endif
3680 if ( l1e_get_flags(*sl1e) & _PAGE_DIRTY )
3681 {
3682 dirty = 1;
3683 /* Note: this is not atomic, so we may clear a
3684 * _PAGE_ACCESSED set by another processor. */
3685 l1e_remove_flags(*sl1e, _PAGE_DIRTY);
3686 flush_tlb = 1;
3687 }
3688 }
3689 break;
3690 default:
3691 /* More than one guest reference,
3692 * we cannot afford to track that. */
3693 dirty = 1;
3694 break;
3695 }
3696 }
3698 if ( dirty )
3699 {
3700 d->dirty_vram->dirty_bitmap[i / 8] |= 1 << (i % 8);
3701 d->dirty_vram->last_dirty = NOW();
3702 }
3703 }
3705 #ifdef __i386__
3706 if ( map_sl1p )
3707 sh_unmap_domain_page(map_sl1p);
3708 #endif
3710 rc = -EFAULT;
3711 if ( copy_to_guest(dirty_bitmap, d->dirty_vram->dirty_bitmap, dirty_size) == 0 ) {
3712 memset(d->dirty_vram->dirty_bitmap, 0, dirty_size);
3713 if (d->dirty_vram->last_dirty + SECONDS(2) < NOW())
3714 {
3715 /* It was clean for more than two seconds, so try to disable
3716 * guest write access. */
3717 for ( i = begin_pfn; i < end_pfn; i++ ) {
3718 mfn_t mfn = gfn_to_mfn(d, i, &t);
3719 if (mfn_x(mfn) != INVALID_MFN)
3720 flush_tlb |= sh_remove_write_access(d->vcpu[0], mfn, 1, 0);
3721 }
3722 d->dirty_vram->last_dirty = -1;
3723 }
3724 rc = 0;
3725 }
3726 }
3727 if ( flush_tlb )
3728 flush_tlb_mask(d->domain_dirty_cpumask);
3729 goto out;
3731 out_sl1ma:
3732 xfree(d->dirty_vram->sl1ma);
3733 out_dirty_vram:
3734 xfree(d->dirty_vram);
3735 d->dirty_vram = NULL;
3737 out:
3738 shadow_unlock(d);
3739 return rc;
3740 }
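The bitmap handed back through dirty_bitmap packs one bit per page in the tracked range: page begin_pfn + i is reported in bit (i % 8) of byte (i / 8), and the buffer is dirty_size = (nr + 7) / 8 bytes long. A minimal sketch of the consuming side, assuming only that layout (report_dirty_pages and its use of stdio are illustrative, not part of the Xen interface):

#include <stdint.h>
#include <stdio.h>

/* Walk a dirty-VRAM bitmap of (nr + 7) / 8 bytes: bit i set means page
 * begin_pfn + i was written since the previous query. */
static void report_dirty_pages(unsigned long begin_pfn, unsigned long nr,
                               const uint8_t *bitmap)
{
    unsigned long i;

    for ( i = 0; i < nr; i++ )
        if ( bitmap[i / 8] & (1 << (i % 8)) )
            printf("pfn %#lx is dirty\n", begin_pfn + i);
}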
3742 /**************************************************************************/
3743 /* Shadow-control XEN_DOMCTL dispatcher */
3745 int shadow_domctl(struct domain *d,
3746 xen_domctl_shadow_op_t *sc,
3747 XEN_GUEST_HANDLE(void) u_domctl)
3748 {
3749 int rc, preempted = 0;
3751 switch ( sc->op )
3752 {
3753 case XEN_DOMCTL_SHADOW_OP_OFF:
3754 if ( d->arch.paging.mode == PG_SH_enable )
3755 if ( (rc = shadow_test_disable(d)) != 0 )
3756 return rc;
3757 return 0;
3759 case XEN_DOMCTL_SHADOW_OP_ENABLE_TEST:
3760 return shadow_test_enable(d);
3762 case XEN_DOMCTL_SHADOW_OP_ENABLE_TRANSLATE:
3763 return shadow_enable(d, PG_refcounts|PG_translate);
3765 case XEN_DOMCTL_SHADOW_OP_ENABLE:
3766 return shadow_enable(d, sc->mode << PG_mode_shift);
3768 case XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION:
3769 sc->mb = shadow_get_allocation(d);
3770 return 0;
3772 case XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION:
3773 shadow_lock(d);
3774 if ( sc->mb == 0 && shadow_mode_enabled(d) )
3775 {
3776 /* Can't set the allocation to zero unless the domain stops using
3777 * shadow pagetables first */
3778 SHADOW_ERROR("Can't set shadow allocation to zero, domain %u"
3779 " is still using shadows.\n", d->domain_id);
3780 shadow_unlock(d);
3781 return -EINVAL;
3782 }
3783 rc = sh_set_allocation(d, sc->mb << (20 - PAGE_SHIFT), &preempted);
3784 shadow_unlock(d);
3785 if ( preempted )
3786 /* Not finished. Set up to re-run the call. */
3787 rc = hypercall_create_continuation(
3788 __HYPERVISOR_domctl, "h", u_domctl);
3789 else
3790 /* Finished. Return the new allocation. */
3791 sc->mb = shadow_get_allocation(d);
3792 return rc;
3794 default:
3795 SHADOW_ERROR("Bad shadow op %u\n", sc->op);
3796 return -EINVAL;
3797 }
3798 }
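For XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION, sc->mb carries the requested shadow pool size in MiB and sh_set_allocation() is given a page count of sc->mb << (20 - PAGE_SHIFT). A worked sketch of just that conversion, assuming 4 KiB pages (PAGE_SHIFT == 12 is taken here only for illustration):

/* One MiB is 1 << (20 - 12) == 256 4-KiB pages, so a request of
 * sc->mb == 32 becomes 32 << 8 == 8192 shadow pages. */
#define EXAMPLE_PAGE_SHIFT 12

static unsigned long shadow_mb_to_pages(unsigned long mb)
{
    return mb << (20 - EXAMPLE_PAGE_SHIFT);
}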
3801 /**************************************************************************/
3802 /* Auditing shadow tables */
3804 #if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_FULL
3806 void shadow_audit_tables(struct vcpu *v)
3807 {
3808 /* Dispatch table for getting per-type functions */
3809 static hash_callback_t callbacks[SH_type_unused] = {
3810 NULL, /* none */
3811 SHADOW_INTERNAL_NAME(sh_audit_l1_table, 2), /* l1_32 */
3812 SHADOW_INTERNAL_NAME(sh_audit_fl1_table, 2), /* fl1_32 */
3813 SHADOW_INTERNAL_NAME(sh_audit_l2_table, 2), /* l2_32 */
3814 SHADOW_INTERNAL_NAME(sh_audit_l1_table, 3), /* l1_pae */
3815 SHADOW_INTERNAL_NAME(sh_audit_fl1_table, 3), /* fl1_pae */
3816 SHADOW_INTERNAL_NAME(sh_audit_l2_table, 3), /* l2_pae */
3817 SHADOW_INTERNAL_NAME(sh_audit_l2_table, 3), /* l2h_pae */
3818 #if CONFIG_PAGING_LEVELS >= 4
3819 SHADOW_INTERNAL_NAME(sh_audit_l1_table, 4), /* l1_64 */
3820 SHADOW_INTERNAL_NAME(sh_audit_fl1_table, 4), /* fl1_64 */
3821 SHADOW_INTERNAL_NAME(sh_audit_l2_table, 4), /* l2_64 */
3822 SHADOW_INTERNAL_NAME(sh_audit_l2_table, 4), /* l2h_64 */
3823 SHADOW_INTERNAL_NAME(sh_audit_l3_table, 4), /* l3_64 */
3824 SHADOW_INTERNAL_NAME(sh_audit_l4_table, 4), /* l4_64 */
3825 #endif /* CONFIG_PAGING_LEVELS >= 4 */
3826 NULL /* All the rest */
3827 };
3828 unsigned int mask;
3830 if ( !(SHADOW_AUDIT_ENABLE) )
3831 return;
3833 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
3834 sh_oos_audit(v->domain);
3835 #endif
3837 if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_FULL )
3838 mask = ~1; /* Audit every table in the system */
3839 else
3840 {
3841 /* Audit only the current mode's tables */
3842 switch ( v->arch.paging.mode->guest_levels )
3843 {
3844 case 2: mask = (SHF_L1_32|SHF_FL1_32|SHF_L2_32); break;
3845 case 3: mask = (SHF_L1_PAE|SHF_FL1_PAE|SHF_L2_PAE
3846 |SHF_L2H_PAE); break;
3847 case 4: mask = (SHF_L1_64|SHF_FL1_64|SHF_L2_64
3848 |SHF_L3_64|SHF_L4_64); break;
3849 default: BUG();
3850 }
3851 }
3853 hash_foreach(v, ~1, callbacks, _mfn(INVALID_MFN));
3854 }
3856 #endif /* Shadow audit */
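shadow_audit_tables() above builds a per-type callback table and a bitmask of the shadow types relevant to the current guest mode, then walks the shadow pages and dispatches on type. A simplified, self-contained sketch of that table-plus-mask dispatch pattern (the enum, mask macro and walker below are invented for illustration and are not Xen's hash_foreach()):

#include <stddef.h>

enum { EX_type_none, EX_type_l1, EX_type_l2, EX_type_max };
#define EX_SHF(t) (1u << (t))

typedef void (*ex_audit_fn_t)(unsigned long mfn);

static void ex_audit_l1(unsigned long mfn) { (void)mfn; /* check L1 entries here */ }
static void ex_audit_l2(unsigned long mfn) { (void)mfn; /* check L2 entries here */ }

/* Per-type audit functions, analogous to the callbacks[] table above. */
static ex_audit_fn_t ex_callbacks[EX_type_max] = {
    [EX_type_l1] = ex_audit_l1,
    [EX_type_l2] = ex_audit_l2,
};

/* Visit each (mfn, type) pair, calling the audit function only for
 * the types selected by 'mask'. */
static void ex_audit_all(const unsigned long *mfns, const int *types,
                         size_t count, unsigned int mask)
{
    size_t i;

    for ( i = 0; i < count; i++ )
        if ( (mask & EX_SHF(types[i])) && ex_callbacks[types[i]] )
            ex_callbacks[types[i]](mfns[i]);
}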
3858 /*
3859 * Local variables:
3860 * mode: C
3861 * c-set-style: "BSD"
3862 * c-basic-offset: 4
3863 * indent-tabs-mode: nil
3864 * End:
3865 */