1 /******************************************************************************
2 * arch/x86/mm/shadow/common.c
3 *
4 * Shadow code that does not need to be multiply compiled.
5 * Parts of this code are Copyright (c) 2006 by XenSource Inc.
6 * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
7 * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 */
24 #include <xen/config.h>
25 #include <xen/types.h>
26 #include <xen/mm.h>
27 #include <xen/trace.h>
28 #include <xen/sched.h>
29 #include <xen/perfc.h>
30 #include <xen/irq.h>
31 #include <xen/domain_page.h>
32 #include <xen/guest_access.h>
33 #include <xen/keyhandler.h>
34 #include <asm/event.h>
35 #include <asm/page.h>
36 #include <asm/current.h>
37 #include <asm/flushtlb.h>
38 #include <asm/shadow.h>
39 #include <xen/numa.h>
40 #include "private.h"
42 DEFINE_PER_CPU(uint32_t,trace_shadow_path_flags);
44 /* Set up the shadow-specific parts of a domain struct at start of day.
45 * Called for every domain from arch_domain_create() */
46 void shadow_domain_init(struct domain *d)
47 {
48 int i;
49 shadow_lock_init(d);
50 for ( i = 0; i <= SHADOW_MAX_ORDER; i++ )
51 INIT_LIST_HEAD(&d->arch.paging.shadow.freelists[i]);
52 INIT_LIST_HEAD(&d->arch.paging.shadow.p2m_freelist);
53 INIT_LIST_HEAD(&d->arch.paging.shadow.pinned_shadows);
55 /* Use shadow pagetables for log-dirty support */
56 paging_log_dirty_init(d, shadow_enable_log_dirty,
57 shadow_disable_log_dirty, shadow_clean_dirty_bitmap);
59 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
60 d->arch.paging.shadow.oos_active = 0;
61 #endif
62 }
64 /* Set up the shadow-specific parts of a vcpu struct. Note: the most important
65 * job is to initialize the update_paging_modes() function pointer, which is
66 * used to initialize the rest of the resources. Therefore, it does not really
67 * matter which mode v->arch.paging.mode points to initially, as long as it is
68 * one that can be compiled.
69 */
70 void shadow_vcpu_init(struct vcpu *v)
71 {
72 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
73 int i, j;
75 for ( i = 0; i < SHADOW_OOS_PAGES; i++ )
76 {
77 v->arch.paging.shadow.oos[i] = _mfn(INVALID_MFN);
78 v->arch.paging.shadow.oos_snapshot[i] = _mfn(INVALID_MFN);
79 for ( j = 0; j < SHADOW_OOS_FIXUPS; j++ )
80 v->arch.paging.shadow.oos_fixup[i].smfn[j] = _mfn(INVALID_MFN);
81 }
82 #endif
84 v->arch.paging.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode, 3);
85 }
87 #if SHADOW_AUDIT
88 int shadow_audit_enable = 0;
90 static void shadow_audit_key(unsigned char key)
91 {
92 shadow_audit_enable = !shadow_audit_enable;
93 printk("%s shadow_audit_enable=%d\n",
94 __func__, shadow_audit_enable);
95 }
97 static int __init shadow_audit_key_init(void)
98 {
99 register_keyhandler(
100 'O', shadow_audit_key, "toggle shadow audits");
101 return 0;
102 }
103 __initcall(shadow_audit_key_init);
104 #endif /* SHADOW_AUDIT */
106 int _shadow_mode_refcounts(struct domain *d)
107 {
108 return shadow_mode_refcounts(d);
109 }
112 /**************************************************************************/
113 /* x86 emulator support for the shadow code
114 */
116 struct segment_register *hvm_get_seg_reg(
117 enum x86_segment seg, struct sh_emulate_ctxt *sh_ctxt)
118 {
119 struct segment_register *seg_reg = &sh_ctxt->seg_reg[seg];
120 if ( !__test_and_set_bit(seg, &sh_ctxt->valid_seg_regs) )
121 hvm_get_segment_register(current, seg, seg_reg);
122 return seg_reg;
123 }
125 static int hvm_translate_linear_addr(
126 enum x86_segment seg,
127 unsigned long offset,
128 unsigned int bytes,
129 enum hvm_access_type access_type,
130 struct sh_emulate_ctxt *sh_ctxt,
131 unsigned long *paddr)
132 {
133 struct segment_register *reg = hvm_get_seg_reg(seg, sh_ctxt);
134 int okay;
136 okay = hvm_virtual_to_linear_addr(
137 seg, reg, offset, bytes, access_type, sh_ctxt->ctxt.addr_size, paddr);
139 if ( !okay )
140 {
141 hvm_inject_exception(TRAP_gp_fault, 0, 0);
142 return X86EMUL_EXCEPTION;
143 }
145 return 0;
146 }
148 static int
149 hvm_read(enum x86_segment seg,
150 unsigned long offset,
151 void *p_data,
152 unsigned int bytes,
153 enum hvm_access_type access_type,
154 struct sh_emulate_ctxt *sh_ctxt)
155 {
156 unsigned long addr;
157 int rc;
159 rc = hvm_translate_linear_addr(
160 seg, offset, bytes, access_type, sh_ctxt, &addr);
161 if ( rc )
162 return rc;
164 if ( access_type == hvm_access_insn_fetch )
165 rc = hvm_fetch_from_guest_virt(p_data, addr, bytes, 0);
166 else
167 rc = hvm_copy_from_guest_virt(p_data, addr, bytes, 0);
169 switch ( rc )
170 {
171 case HVMCOPY_okay:
172 return X86EMUL_OKAY;
173 case HVMCOPY_bad_gva_to_gfn:
174 return X86EMUL_EXCEPTION;
175 default:
176 break;
177 }
179 return X86EMUL_UNHANDLEABLE;
180 }
182 static int
183 hvm_emulate_read(enum x86_segment seg,
184 unsigned long offset,
185 void *p_data,
186 unsigned int bytes,
187 struct x86_emulate_ctxt *ctxt)
188 {
189 if ( !is_x86_user_segment(seg) )
190 return X86EMUL_UNHANDLEABLE;
191 return hvm_read(seg, offset, p_data, bytes, hvm_access_read,
192 container_of(ctxt, struct sh_emulate_ctxt, ctxt));
193 }
195 static int
196 hvm_emulate_insn_fetch(enum x86_segment seg,
197 unsigned long offset,
198 void *p_data,
199 unsigned int bytes,
200 struct x86_emulate_ctxt *ctxt)
201 {
202 struct sh_emulate_ctxt *sh_ctxt =
203 container_of(ctxt, struct sh_emulate_ctxt, ctxt);
204 unsigned int insn_off = offset - sh_ctxt->insn_buf_eip;
206 ASSERT(seg == x86_seg_cs);
208 /* Fall back if requested bytes are not in the prefetch cache. */
209 if ( unlikely((insn_off + bytes) > sh_ctxt->insn_buf_bytes) )
210 return hvm_read(seg, offset, p_data, bytes,
211 hvm_access_insn_fetch, sh_ctxt);
213 /* Hit the cache. Simple memcpy. */
214 memcpy(p_data, &sh_ctxt->insn_buf[insn_off], bytes);
215 return X86EMUL_OKAY;
216 }
218 static int
219 hvm_emulate_write(enum x86_segment seg,
220 unsigned long offset,
221 void *p_data,
222 unsigned int bytes,
223 struct x86_emulate_ctxt *ctxt)
224 {
225 struct sh_emulate_ctxt *sh_ctxt =
226 container_of(ctxt, struct sh_emulate_ctxt, ctxt);
227 struct vcpu *v = current;
228 unsigned long addr;
229 int rc;
231 if ( !is_x86_user_segment(seg) )
232 return X86EMUL_UNHANDLEABLE;
234 /* How many emulations could we save if we unshadowed on stack writes? */
235 if ( seg == x86_seg_ss )
236 perfc_incr(shadow_fault_emulate_stack);
238 rc = hvm_translate_linear_addr(
239 seg, offset, bytes, hvm_access_write, sh_ctxt, &addr);
240 if ( rc )
241 return rc;
243 return v->arch.paging.mode->shadow.x86_emulate_write(
244 v, addr, p_data, bytes, sh_ctxt);
245 }
247 static int
248 hvm_emulate_cmpxchg(enum x86_segment seg,
249 unsigned long offset,
250 void *p_old,
251 void *p_new,
252 unsigned int bytes,
253 struct x86_emulate_ctxt *ctxt)
254 {
255 struct sh_emulate_ctxt *sh_ctxt =
256 container_of(ctxt, struct sh_emulate_ctxt, ctxt);
257 struct vcpu *v = current;
258 unsigned long addr, old[2], new[2];
259 int rc;
261 if ( !is_x86_user_segment(seg) )
262 return X86EMUL_UNHANDLEABLE;
264 rc = hvm_translate_linear_addr(
265 seg, offset, bytes, hvm_access_write, sh_ctxt, &addr);
266 if ( rc )
267 return rc;
269 old[0] = new[0] = 0;
270 memcpy(old, p_old, bytes);
271 memcpy(new, p_new, bytes);
273 if ( bytes <= sizeof(long) )
274 return v->arch.paging.mode->shadow.x86_emulate_cmpxchg(
275 v, addr, old[0], new[0], bytes, sh_ctxt);
277 #ifdef __i386__
278 if ( bytes == 8 )
279 return v->arch.paging.mode->shadow.x86_emulate_cmpxchg8b(
280 v, addr, old[0], old[1], new[0], new[1], sh_ctxt);
281 #endif
283 return X86EMUL_UNHANDLEABLE;
284 }
286 static struct x86_emulate_ops hvm_shadow_emulator_ops = {
287 .read = hvm_emulate_read,
288 .insn_fetch = hvm_emulate_insn_fetch,
289 .write = hvm_emulate_write,
290 .cmpxchg = hvm_emulate_cmpxchg,
291 };
293 static int
294 pv_emulate_read(enum x86_segment seg,
295 unsigned long offset,
296 void *p_data,
297 unsigned int bytes,
298 struct x86_emulate_ctxt *ctxt)
299 {
300 unsigned int rc;
302 if ( !is_x86_user_segment(seg) )
303 return X86EMUL_UNHANDLEABLE;
305 if ( (rc = copy_from_user(p_data, (void *)offset, bytes)) != 0 )
306 {
307 propagate_page_fault(offset + bytes - rc, 0); /* read fault */
308 return X86EMUL_EXCEPTION;
309 }
311 return X86EMUL_OKAY;
312 }
314 static int
315 pv_emulate_write(enum x86_segment seg,
316 unsigned long offset,
317 void *p_data,
318 unsigned int bytes,
319 struct x86_emulate_ctxt *ctxt)
320 {
321 struct sh_emulate_ctxt *sh_ctxt =
322 container_of(ctxt, struct sh_emulate_ctxt, ctxt);
323 struct vcpu *v = current;
324 if ( !is_x86_user_segment(seg) )
325 return X86EMUL_UNHANDLEABLE;
326 return v->arch.paging.mode->shadow.x86_emulate_write(
327 v, offset, p_data, bytes, sh_ctxt);
328 }
330 static int
331 pv_emulate_cmpxchg(enum x86_segment seg,
332 unsigned long offset,
333 void *p_old,
334 void *p_new,
335 unsigned int bytes,
336 struct x86_emulate_ctxt *ctxt)
337 {
338 struct sh_emulate_ctxt *sh_ctxt =
339 container_of(ctxt, struct sh_emulate_ctxt, ctxt);
340 unsigned long old[2], new[2];
341 struct vcpu *v = current;
343 if ( !is_x86_user_segment(seg) )
344 return X86EMUL_UNHANDLEABLE;
346 old[0] = new[0] = 0;
347 memcpy(old, p_old, bytes);
348 memcpy(new, p_new, bytes);
350 if ( bytes <= sizeof(long) )
351 return v->arch.paging.mode->shadow.x86_emulate_cmpxchg(
352 v, offset, old[0], new[0], bytes, sh_ctxt);
354 #ifdef __i386__
355 if ( bytes == 8 )
356 return v->arch.paging.mode->shadow.x86_emulate_cmpxchg8b(
357 v, offset, old[0], old[1], new[0], new[1], sh_ctxt);
358 #endif
360 return X86EMUL_UNHANDLEABLE;
361 }
363 static struct x86_emulate_ops pv_shadow_emulator_ops = {
364 .read = pv_emulate_read,
365 .insn_fetch = pv_emulate_read,
366 .write = pv_emulate_write,
367 .cmpxchg = pv_emulate_cmpxchg,
368 };
370 struct x86_emulate_ops *shadow_init_emulation(
371 struct sh_emulate_ctxt *sh_ctxt, struct cpu_user_regs *regs)
372 {
373 struct segment_register *creg, *sreg;
374 struct vcpu *v = current;
375 unsigned long addr;
377 sh_ctxt->ctxt.regs = regs;
378 sh_ctxt->ctxt.force_writeback = 0;
380 if ( !is_hvm_vcpu(v) )
381 {
382 sh_ctxt->ctxt.addr_size = sh_ctxt->ctxt.sp_size = BITS_PER_LONG;
383 return &pv_shadow_emulator_ops;
384 }
386 /* Segment cache initialisation. Primed with CS. */
387 sh_ctxt->valid_seg_regs = 0;
388 creg = hvm_get_seg_reg(x86_seg_cs, sh_ctxt);
390 /* Work out the emulation mode. */
391 if ( hvm_long_mode_enabled(v) && creg->attr.fields.l )
392 {
393 sh_ctxt->ctxt.addr_size = sh_ctxt->ctxt.sp_size = 64;
394 }
395 else
396 {
397 sreg = hvm_get_seg_reg(x86_seg_ss, sh_ctxt);
398 sh_ctxt->ctxt.addr_size = creg->attr.fields.db ? 32 : 16;
399 sh_ctxt->ctxt.sp_size = sreg->attr.fields.db ? 32 : 16;
400 }
402 /* Attempt to prefetch whole instruction. */
403 sh_ctxt->insn_buf_eip = regs->eip;
404 sh_ctxt->insn_buf_bytes =
405 (!hvm_translate_linear_addr(
406 x86_seg_cs, regs->eip, sizeof(sh_ctxt->insn_buf),
407 hvm_access_insn_fetch, sh_ctxt, &addr) &&
408 !hvm_fetch_from_guest_virt_nofault(
409 sh_ctxt->insn_buf, addr, sizeof(sh_ctxt->insn_buf), 0))
410 ? sizeof(sh_ctxt->insn_buf) : 0;
412 return &hvm_shadow_emulator_ops;
413 }
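/* Illustrative usage sketch (not part of the original file): roughly how a
 * caller such as the shadow page-fault handler drives the hooks above.
 * The helper name is hypothetical; 'regs' stands for the guest register
 * state that triggered emulation. */
#if 0
static int shadow_emulate_one_sketch(struct cpu_user_regs *regs)
{
    struct sh_emulate_ctxt emul_ctxt;
    struct x86_emulate_ops *emul_ops;

    /* Pick PV or HVM hooks and prefetch the current instruction. */
    emul_ops = shadow_init_emulation(&emul_ctxt, regs);
    /* Let the generic emulator call back into read/write/cmpxchg above. */
    return x86_emulate(&emul_ctxt.ctxt, emul_ops);
}
#endif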
415 /* Update an initialized emulation context to prepare for the next
416 * instruction */
417 void shadow_continue_emulation(struct sh_emulate_ctxt *sh_ctxt,
418 struct cpu_user_regs *regs)
419 {
420 struct vcpu *v = current;
421 unsigned long addr, diff;
423 /* We don't refetch the segment bases, because we don't emulate
424 * writes to segment registers */
426 if ( is_hvm_vcpu(v) )
427 {
428 diff = regs->eip - sh_ctxt->insn_buf_eip;
429 if ( diff > sh_ctxt->insn_buf_bytes )
430 {
431 /* Prefetch more bytes. */
432 sh_ctxt->insn_buf_bytes =
433 (!hvm_translate_linear_addr(
434 x86_seg_cs, regs->eip, sizeof(sh_ctxt->insn_buf),
435 hvm_access_insn_fetch, sh_ctxt, &addr) &&
436 !hvm_fetch_from_guest_virt_nofault(
437 sh_ctxt->insn_buf, addr, sizeof(sh_ctxt->insn_buf), 0))
438 ? sizeof(sh_ctxt->insn_buf) : 0;
439 sh_ctxt->insn_buf_eip = regs->eip;
440 }
441 }
442 }
445 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
446 /**************************************************************************/
447 /* Out-of-sync shadows. */
449 /* From time to time, we let a shadowed pagetable page go out of sync
450 * with its shadow: the guest is allowed to write directly to the page,
451 * and those writes are not synchronously reflected in the shadow.
452 * This lets us avoid many emulations if the guest is writing a lot to a
453 * pagetable, but it relaxes a pretty important invariant in the shadow
454 * pagetable design. Therefore, some rules:
455 *
456 * 1. Only L1 pagetables may go out of sync: any page that is shadowed
457 * at a higher level must be synchronously updated. This makes
458 * using linear shadow pagetables much less dangerous.
459 * That means that: (a) unsyncing code needs to check for higher-level
460 * shadows, and (b) promotion code needs to resync.
461 *
462 * 2. All shadow operations on a guest page require the page to be brought
463 * back into sync before proceeding. This must be done under the
464 * shadow lock so that the page is guaranteed to remain synced until
465 * the operation completes.
466 *
467 * Exceptions to this rule: the pagefault and invlpg handlers may
468 * update only one entry on an out-of-sync page without resyncing it.
469 *
470 * 3. Operations on shadows that do not start from a guest page need to
471 * be aware that they may be handling an out-of-sync shadow.
472 *
473 * 4. Operations that do not normally take the shadow lock (fast-path
474 * #PF handler, INVLPG) must fall back to a locking, syncing version
475 * if they see an out-of-sync table.
476 *
477 * 5. Operations corresponding to guest TLB flushes (MOV CR3, INVLPG)
478 * must explicitly resync all relevant pages or update their
479 * shadows.
480 *
481 * Currently out-of-sync pages are listed in a simple open-addressed
482 * hash table with a second chance (must resist temptation to radically
483 * over-engineer hash tables...) The virtual address of the access
484 * which caused us to unsync the page is also kept in the hash table, as
485 * a hint for finding the writable mappings later.
486 *
487 * We keep a hash per vcpu, because we want as much as possible to do
488 * the re-sync on the same vcpu we did the unsync on, so the VA hint
489 * will be valid.
490 */
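/* Illustrative sketch (not part of the original file): because of the
 * second-chance scheme, an entry lives either in its home slot
 * (mfn % SHADOW_OOS_PAGES) or in the slot after it, so a lookup needs
 * at most two probes.  The helper name is hypothetical. */
#if 0
static int oos_hash_probe_sketch(const mfn_t *oos, mfn_t gmfn)
{
    int idx = mfn_x(gmfn) % SHADOW_OOS_PAGES;

    if ( mfn_x(oos[idx]) != mfn_x(gmfn) )
        idx = (idx + 1) % SHADOW_OOS_PAGES;    /* second-chance slot */
    return (mfn_x(oos[idx]) == mfn_x(gmfn)) ? idx : -1;
}
#endif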
493 #if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_FULL
494 static void sh_oos_audit(struct domain *d)
495 {
496 int idx, expected_idx, expected_idx_alt;
497 struct page_info *pg;
498 struct vcpu *v;
500 for_each_vcpu(d, v)
501 {
502 for ( idx = 0; idx < SHADOW_OOS_PAGES; idx++ )
503 {
504 mfn_t *oos = v->arch.paging.shadow.oos;
505 if ( !mfn_valid(oos[idx]) )
506 continue;
508 expected_idx = mfn_x(oos[idx]) % SHADOW_OOS_PAGES;
509 expected_idx_alt = ((expected_idx + 1) % SHADOW_OOS_PAGES);
510 if ( idx != expected_idx && idx != expected_idx_alt )
511 {
512 printk("%s: idx %d contains gmfn %lx, expected at %d or %d.\n",
513 __func__, idx, mfn_x(oos[idx]),
514 expected_idx, expected_idx_alt);
515 BUG();
516 }
517 pg = mfn_to_page(oos[idx]);
518 if ( !(pg->count_info & PGC_page_table) )
519 {
520 printk("%s: idx %x gmfn %lx not a pt (count %"PRIx32")\n",
521 __func__, idx, mfn_x(oos[idx]), pg->count_info);
522 BUG();
523 }
524 if ( !(pg->shadow_flags & SHF_out_of_sync) )
525 {
526 printk("%s: idx %x gmfn %lx not marked oos (flags %lx)\n",
527 __func__, idx, mfn_x(oos[idx]), pg->shadow_flags);
528 BUG();
529 }
530 if ( (pg->shadow_flags & SHF_page_type_mask & ~SHF_L1_ANY) )
531 {
532 printk("%s: idx %x gmfn %lx shadowed as non-l1 (flags %lx)\n",
533 __func__, idx, mfn_x(oos[idx]), pg->shadow_flags);
534 BUG();
535 }
536 }
537 }
538 }
539 #endif
541 #if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES
542 void oos_audit_hash_is_present(struct domain *d, mfn_t gmfn)
543 {
544 int idx;
545 struct vcpu *v;
546 mfn_t *oos;
548 ASSERT(mfn_is_out_of_sync(gmfn));
550 for_each_vcpu(d, v)
551 {
552 oos = v->arch.paging.shadow.oos;
553 idx = mfn_x(gmfn) % SHADOW_OOS_PAGES;
554 if ( mfn_x(oos[idx]) != mfn_x(gmfn) )
555 idx = (idx + 1) % SHADOW_OOS_PAGES;
557 if ( mfn_x(oos[idx]) == mfn_x(gmfn) )
558 return;
559 }
561 SHADOW_ERROR("gmfn %lx marked OOS but not in hash table\n", mfn_x(gmfn));
562 BUG();
563 }
564 #endif
566 /* Update the shadow, but keep the page out of sync. */
567 static inline void _sh_resync_l1(struct vcpu *v, mfn_t gmfn, mfn_t snpmfn)
568 {
569 struct page_info *pg = mfn_to_page(gmfn);
571 ASSERT(mfn_valid(gmfn));
572 ASSERT(page_is_out_of_sync(pg));
574 /* Call out to the appropriate per-mode resyncing function */
575 if ( pg->shadow_flags & SHF_L1_32 )
576 SHADOW_INTERNAL_NAME(sh_resync_l1, 2)(v, gmfn, snpmfn);
577 else if ( pg->shadow_flags & SHF_L1_PAE )
578 SHADOW_INTERNAL_NAME(sh_resync_l1, 3)(v, gmfn, snpmfn);
579 #if CONFIG_PAGING_LEVELS >= 4
580 else if ( pg->shadow_flags & SHF_L1_64 )
581 SHADOW_INTERNAL_NAME(sh_resync_l1, 4)(v, gmfn, snpmfn);
582 #endif
583 }
586 /*
587 * Fixup arrays: We limit the maximum number of writable mappings to
588 * SHADOW_OOS_FIXUPS and store enough information to remove them
589 * quickly on resync.
590 */
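/* Illustrative sketch (not part of the original file): the per-page fixup
 * record manipulated below looks roughly like this (the real definition
 * lives in a header); slots are reused round-robin via 'next', which is
 * why resync must always flush TLBs. */
#if 0
struct oos_fixup_sketch {
    int next;                              /* next slot to (re)use */
    mfn_t smfn[SHADOW_OOS_FIXUPS];         /* shadow L1s holding writable PTEs */
    unsigned long off[SHADOW_OOS_FIXUPS];  /* offsets of those PTEs */
};
#endif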
592 static inline int oos_fixup_flush_gmfn(struct vcpu *v, mfn_t gmfn,
593 struct oos_fixup *fixup)
594 {
595 int i;
596 for ( i = 0; i < SHADOW_OOS_FIXUPS; i++ )
597 {
598 if ( mfn_x(fixup->smfn[i]) != INVALID_MFN )
599 {
600 sh_remove_write_access_from_sl1p(v, gmfn,
601 fixup->smfn[i],
602 fixup->off[i]);
603 fixup->smfn[i] = _mfn(INVALID_MFN);
604 }
605 }
607 /* Always flush the TLBs. See comment on oos_fixup_add(). */
608 return 1;
609 }
611 void oos_fixup_add(struct vcpu *v, mfn_t gmfn,
612 mfn_t smfn, unsigned long off)
613 {
614 int idx, next;
615 mfn_t *oos;
616 struct oos_fixup *oos_fixup;
617 struct domain *d = v->domain;
619 perfc_incr(shadow_oos_fixup_add);
621 for_each_vcpu(d, v)
622 {
623 oos = v->arch.paging.shadow.oos;
624 oos_fixup = v->arch.paging.shadow.oos_fixup;
625 idx = mfn_x(gmfn) % SHADOW_OOS_PAGES;
626 if ( mfn_x(oos[idx]) != mfn_x(gmfn) )
627 idx = (idx + 1) % SHADOW_OOS_PAGES;
628 if ( mfn_x(oos[idx]) == mfn_x(gmfn) )
629 {
630 next = oos_fixup[idx].next;
632 if ( mfn_x(oos_fixup[idx].smfn[next]) != INVALID_MFN )
633 {
634 TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_OOS_FIXUP_EVICT);
636 /* Reuse this slot and remove current writable mapping. */
637 sh_remove_write_access_from_sl1p(v, gmfn,
638 oos_fixup[idx].smfn[next],
639 oos_fixup[idx].off[next]);
640 perfc_incr(shadow_oos_fixup_evict);
641 /* We should flush the TLBs now, because we removed a
642 writable mapping, but since the shadow is already
643 OOS we have no problem if another vcpu writes to
644 this page table. We just have to be very careful to
645 *always* flush the TLBs on resync. */
646 }
648 oos_fixup[idx].smfn[next] = smfn;
649 oos_fixup[idx].off[next] = off;
650 oos_fixup[idx].next = (next + 1) % SHADOW_OOS_FIXUPS;
652 TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_OOS_FIXUP_ADD);
653 return;
654 }
655 }
657 SHADOW_ERROR("gmfn %lx was OOS but not in hash table\n", mfn_x(gmfn));
658 BUG();
659 }
661 static int oos_remove_write_access(struct vcpu *v, mfn_t gmfn,
662 struct oos_fixup *fixup)
663 {
664 int ftlb = 0;
666 ftlb |= oos_fixup_flush_gmfn(v, gmfn, fixup);
668 switch ( sh_remove_write_access(v, gmfn, 0, 0) )
669 {
670 default:
671 case 0:
672 break;
674 case 1:
675 ftlb |= 1;
676 break;
678 case -1:
679 /* An unfindable writeable typecount has appeared, probably via a
680 * grant table entry: can't shoot the mapping, so try to unshadow
681 * the page. If that doesn't work either, the guest is granting out
682 * its own pagetables and must be killed after all.
683 * This will flush the TLB, so we can return with no worries. */
684 sh_remove_shadows(v, gmfn, 0 /* Be thorough */, 1 /* Must succeed */);
685 return 1;
686 }
688 if ( ftlb )
689 flush_tlb_mask(v->domain->domain_dirty_cpumask);
691 return 0;
692 }
695 static inline void trace_resync(int event, mfn_t gmfn)
696 {
697 if ( tb_init_done )
698 {
699 /* Convert gmfn to gfn */
700 unsigned long gfn = mfn_to_gfn(current->domain, gmfn);
701 __trace_var(event, 0/*!tsc*/, sizeof(gfn), (unsigned char*)&gfn);
702 }
703 }
705 /* Pull all the entries on an out-of-sync page back into sync. */
706 static void _sh_resync(struct vcpu *v, mfn_t gmfn,
707 struct oos_fixup *fixup, mfn_t snp)
708 {
709 struct page_info *pg = mfn_to_page(gmfn);
711 ASSERT(shadow_locked_by_me(v->domain));
712 ASSERT(mfn_is_out_of_sync(gmfn));
713 /* Guest page must be shadowed *only* as L1 when out of sync. */
714 ASSERT(!(mfn_to_page(gmfn)->shadow_flags & SHF_page_type_mask
715 & ~SHF_L1_ANY));
716 ASSERT(!sh_page_has_multiple_shadows(mfn_to_page(gmfn)));
718 SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx\n",
719 v->domain->domain_id, v->vcpu_id, mfn_x(gmfn));
721 /* Need to pull write access so the page *stays* in sync. */
722 if ( oos_remove_write_access(v, gmfn, fixup) )
723 {
724 /* Page has been unshadowed. */
725 return;
726 }
728 /* No more writable mappings of this page, please */
729 pg->shadow_flags &= ~SHF_oos_may_write;
731 /* Update the shadows with current guest entries. */
732 _sh_resync_l1(v, gmfn, snp);
734 /* Now we know all the entries are synced, and will stay that way */
735 pg->shadow_flags &= ~SHF_out_of_sync;
736 perfc_incr(shadow_resync);
737 trace_resync(TRC_SHADOW_RESYNC_FULL, gmfn);
738 }
741 /* Add an MFN to the list of out-of-sync guest pagetables */
742 static void oos_hash_add(struct vcpu *v, mfn_t gmfn)
743 {
744 int i, idx, oidx, swap = 0;
745 void *gptr, *gsnpptr;
746 mfn_t *oos = v->arch.paging.shadow.oos;
747 mfn_t *oos_snapshot = v->arch.paging.shadow.oos_snapshot;
748 struct oos_fixup *oos_fixup = v->arch.paging.shadow.oos_fixup;
749 struct oos_fixup fixup = { .next = 0 };
751 for (i = 0; i < SHADOW_OOS_FIXUPS; i++ )
752 fixup.smfn[i] = _mfn(INVALID_MFN);
754 idx = mfn_x(gmfn) % SHADOW_OOS_PAGES;
755 oidx = idx;
757 if ( mfn_valid(oos[idx])
758 && (mfn_x(oos[idx]) % SHADOW_OOS_PAGES) == idx )
759 {
760 /* Punt the current occupant into the next slot */
761 SWAP(oos[idx], gmfn);
762 SWAP(oos_fixup[idx], fixup);
763 swap = 1;
764 idx = (idx + 1) % SHADOW_OOS_PAGES;
765 }
766 if ( mfn_valid(oos[idx]) )
767 {
768 /* Crush the current occupant. */
769 _sh_resync(v, oos[idx], &oos_fixup[idx], oos_snapshot[idx]);
770 perfc_incr(shadow_unsync_evict);
771 }
772 oos[idx] = gmfn;
773 oos_fixup[idx] = fixup;
775 if ( swap )
776 SWAP(oos_snapshot[idx], oos_snapshot[oidx]);
778 gptr = sh_map_domain_page(oos[oidx]);
779 gsnpptr = sh_map_domain_page(oos_snapshot[oidx]);
780 memcpy(gsnpptr, gptr, PAGE_SIZE);
781 sh_unmap_domain_page(gptr);
782 sh_unmap_domain_page(gsnpptr);
783 }
785 /* Remove an MFN from the list of out-of-sync guest pagetables */
786 static void oos_hash_remove(struct vcpu *v, mfn_t gmfn)
787 {
788 int idx;
789 mfn_t *oos;
790 struct domain *d = v->domain;
792 SHADOW_PRINTK("D%dV%d gmfn %lx\n",
793 v->domain->domain_id, v->vcpu_id, mfn_x(gmfn));
795 for_each_vcpu(d, v)
796 {
797 oos = v->arch.paging.shadow.oos;
798 idx = mfn_x(gmfn) % SHADOW_OOS_PAGES;
799 if ( mfn_x(oos[idx]) != mfn_x(gmfn) )
800 idx = (idx + 1) % SHADOW_OOS_PAGES;
801 if ( mfn_x(oos[idx]) == mfn_x(gmfn) )
802 {
803 oos[idx] = _mfn(INVALID_MFN);
804 return;
805 }
806 }
808 SHADOW_ERROR("gmfn %lx was OOS but not in hash table\n", mfn_x(gmfn));
809 BUG();
810 }
812 mfn_t oos_snapshot_lookup(struct vcpu *v, mfn_t gmfn)
813 {
814 int idx;
815 mfn_t *oos;
816 mfn_t *oos_snapshot;
817 struct domain *d = v->domain;
819 for_each_vcpu(d, v)
820 {
821 oos = v->arch.paging.shadow.oos;
822 oos_snapshot = v->arch.paging.shadow.oos_snapshot;
823 idx = mfn_x(gmfn) % SHADOW_OOS_PAGES;
824 if ( mfn_x(oos[idx]) != mfn_x(gmfn) )
825 idx = (idx + 1) % SHADOW_OOS_PAGES;
826 if ( mfn_x(oos[idx]) == mfn_x(gmfn) )
827 {
828 return oos_snapshot[idx];
829 }
830 }
832 SHADOW_ERROR("gmfn %lx was OOS but not in hash table\n", mfn_x(gmfn));
833 BUG();
834 return _mfn(INVALID_MFN);
835 }
837 /* Pull a single guest page back into sync */
838 void sh_resync(struct vcpu *v, mfn_t gmfn)
839 {
840 int idx;
841 mfn_t *oos;
842 mfn_t *oos_snapshot;
843 struct oos_fixup *oos_fixup;
844 struct domain *d = v->domain;
846 for_each_vcpu(d, v)
847 {
848 oos = v->arch.paging.shadow.oos;
849 oos_fixup = v->arch.paging.shadow.oos_fixup;
850 oos_snapshot = v->arch.paging.shadow.oos_snapshot;
851 idx = mfn_x(gmfn) % SHADOW_OOS_PAGES;
852 if ( mfn_x(oos[idx]) != mfn_x(gmfn) )
853 idx = (idx + 1) % SHADOW_OOS_PAGES;
855 if ( mfn_x(oos[idx]) == mfn_x(gmfn) )
856 {
857 _sh_resync(v, gmfn, &oos_fixup[idx], oos_snapshot[idx]);
858 oos[idx] = _mfn(INVALID_MFN);
859 return;
860 }
861 }
863 SHADOW_ERROR("gmfn %lx was OOS but not in hash table\n", mfn_x(gmfn));
864 BUG();
865 }
867 /* Figure out whether it's definitely safe not to sync this l1 table,
868 * by making a call out to the mode in which that shadow was made. */
869 static int sh_skip_sync(struct vcpu *v, mfn_t gl1mfn)
870 {
871 struct page_info *pg = mfn_to_page(gl1mfn);
872 if ( pg->shadow_flags & SHF_L1_32 )
873 return SHADOW_INTERNAL_NAME(sh_safe_not_to_sync, 2)(v, gl1mfn);
874 else if ( pg->shadow_flags & SHF_L1_PAE )
875 return SHADOW_INTERNAL_NAME(sh_safe_not_to_sync, 3)(v, gl1mfn);
876 #if CONFIG_PAGING_LEVELS >= 4
877 else if ( pg->shadow_flags & SHF_L1_64 )
878 return SHADOW_INTERNAL_NAME(sh_safe_not_to_sync, 4)(v, gl1mfn);
879 #endif
880 SHADOW_ERROR("gmfn 0x%lx was OOS but not shadowed as an l1.\n",
881 mfn_x(gl1mfn));
882 BUG();
883 return 0; /* BUG() is no longer __attribute__((noreturn)). */
884 }
887 /* Pull all out-of-sync pages back into sync. Pages brought out of sync
888 * on other vcpus are allowed to remain out of sync, but their contents
889 * will be made safe (TLB flush semantics); pages unsynced by this vcpu
890 * are brought back into sync and write-protected. If skip != 0, we try
891 * to avoid resyncing at all if we think we can get away with it. */
892 void sh_resync_all(struct vcpu *v, int skip, int this, int others, int do_locking)
893 {
894 int idx;
895 struct vcpu *other;
896 mfn_t *oos = v->arch.paging.shadow.oos;
897 mfn_t *oos_snapshot = v->arch.paging.shadow.oos_snapshot;
898 struct oos_fixup *oos_fixup = v->arch.paging.shadow.oos_fixup;
900 SHADOW_PRINTK("d=%d, v=%d\n", v->domain->domain_id, v->vcpu_id);
902 ASSERT(do_locking || shadow_locked_by_me(v->domain));
904 if ( !this )
905 goto resync_others;
907 if ( do_locking )
908 shadow_lock(v->domain);
910 /* First: resync all of this vcpu's oos pages */
911 for ( idx = 0; idx < SHADOW_OOS_PAGES; idx++ )
912 if ( mfn_valid(oos[idx]) )
913 {
914 /* Write-protect and sync contents */
915 _sh_resync(v, oos[idx], &oos_fixup[idx], oos_snapshot[idx]);
916 oos[idx] = _mfn(INVALID_MFN);
917 }
919 if ( do_locking )
920 shadow_unlock(v->domain);
922 resync_others:
923 if ( !others )
924 return;
926 /* Second: make all *other* vcpus' oos pages safe. */
927 for_each_vcpu(v->domain, other)
928 {
929 if ( v == other )
930 continue;
932 if ( do_locking )
933 shadow_lock(v->domain);
935 oos = other->arch.paging.shadow.oos;
936 oos_fixup = other->arch.paging.shadow.oos_fixup;
937 oos_snapshot = other->arch.paging.shadow.oos_snapshot;
939 for ( idx = 0; idx < SHADOW_OOS_PAGES; idx++ )
940 {
941 if ( !mfn_valid(oos[idx]) )
942 continue;
944 if ( skip )
945 {
946 /* Update the shadows and leave the page OOS. */
947 if ( sh_skip_sync(v, oos[idx]) )
948 continue;
949 trace_resync(TRC_SHADOW_RESYNC_ONLY, oos[idx]);
950 _sh_resync_l1(other, oos[idx], oos_snapshot[idx]);
951 }
952 else
953 {
954 /* Write-protect and sync contents */
955 _sh_resync(other, oos[idx], &oos_fixup[idx], oos_snapshot[idx]);
956 oos[idx] = _mfn(INVALID_MFN);
957 }
958 }
960 if ( do_locking )
961 shadow_unlock(v->domain);
962 }
963 }
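/* Illustrative usage sketch (not part of the original file): a caller
 * handling a guest TLB flush might bring this vcpu's pages fully back
 * into sync while only making other vcpus' pages safe, e.g.: */
#if 0
sh_resync_all(v, 0 /* skip */, 1 /* this */, 1 /* others */, 1 /* do_locking */);
#endif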
965 /* Allow a shadowed page to go out of sync. Unsyncs are traced in
966 * multi.c:sh_page_fault() */
967 int sh_unsync(struct vcpu *v, mfn_t gmfn)
968 {
969 struct page_info *pg;
971 ASSERT(shadow_locked_by_me(v->domain));
973 SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx\n",
974 v->domain->domain_id, v->vcpu_id, mfn_x(gmfn));
976 pg = mfn_to_page(gmfn);
978 /* Guest page must be shadowed *only* as L1 and *only* once when out
979 * of sync. Also, get out now if it's already out of sync,
980 * and we can't safely unsync if some vcpus have paging disabled. */
981 if ( pg->shadow_flags &
982 ((SHF_page_type_mask & ~SHF_L1_ANY) | SHF_out_of_sync)
983 || sh_page_has_multiple_shadows(pg)
984 || !is_hvm_domain(v->domain)
985 || !v->domain->arch.paging.shadow.oos_active )
986 return 0;
988 pg->shadow_flags |= SHF_out_of_sync|SHF_oos_may_write;
989 oos_hash_add(v, gmfn);
990 perfc_incr(shadow_unsync);
991 TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_UNSYNC);
992 return 1;
993 }
995 #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) */
998 /**************************************************************************/
999 /* Code for "promoting" a guest page to the point where the shadow code is
1000 * willing to let it be treated as a guest page table. This generally
1001 * involves making sure there are no writable mappings available to the guest
1002 * for this page.
1003 */
1004 void shadow_promote(struct vcpu *v, mfn_t gmfn, unsigned int type)
1006 struct page_info *page = mfn_to_page(gmfn);
1008 ASSERT(mfn_valid(gmfn));
1010 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
1011 /* Is the page already shadowed and out of sync? */
1012 if ( page_is_out_of_sync(page) )
1013 sh_resync(v, gmfn);
1014 #endif
1016 /* We should never try to promote a gmfn that has writeable mappings */
1017 ASSERT((page->u.inuse.type_info & PGT_type_mask) != PGT_writable_page
1018 || (page->u.inuse.type_info & PGT_count_mask) == 0
1019 || v->domain->is_shutting_down);
1021 /* Is the page already shadowed? */
1022 if ( !test_and_set_bit(_PGC_page_table, &page->count_info) )
1023 page->shadow_flags = 0;
1025 ASSERT(!test_bit(type, &page->shadow_flags));
1026 set_bit(type, &page->shadow_flags);
1027 TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_PROMOTE);
1030 void shadow_demote(struct vcpu *v, mfn_t gmfn, u32 type)
1032 struct page_info *page = mfn_to_page(gmfn);
1034 ASSERT(test_bit(_PGC_page_table, &page->count_info));
1035 ASSERT(test_bit(type, &page->shadow_flags));
1037 clear_bit(type, &page->shadow_flags);
1039 if ( (page->shadow_flags & SHF_page_type_mask) == 0 )
1041 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
1042 /* Was the page out of sync? */
1043 if ( page_is_out_of_sync(page) )
1045 oos_hash_remove(v, gmfn);
1047 #endif
1048 clear_bit(_PGC_page_table, &page->count_info);
1051 TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_DEMOTE);
1054 /**************************************************************************/
1055 /* Validate a pagetable change from the guest and update the shadows.
1056 * Returns a bitmask of SHADOW_SET_* flags. */
1058 int
1059 sh_validate_guest_entry(struct vcpu *v, mfn_t gmfn, void *entry, u32 size)
1061 int result = 0;
1062 struct page_info *page = mfn_to_page(gmfn);
1064 paging_mark_dirty(v->domain, mfn_x(gmfn));
1066 // Determine which types of shadows are affected, and update each.
1067 //
1068 // Always validate L1s before L2s to prevent another cpu with a linear
1069 // mapping of this gmfn from seeing a walk that results from
1070 // using the new L2 value and the old L1 value. (It is OK for such a
1071 // guest to see a walk that uses the old L2 value with the new L1 value,
1072 // as hardware could behave this way if one level of the pagewalk occurs
1073 // before the store, and the next level of the pagewalk occurs after the
1074 // store.)
1075 //
1076 // Ditto for L2s before L3s, etc.
1077 //
1079 if ( !(page->count_info & PGC_page_table) )
1080 return 0; /* Not shadowed at all */
1082 if ( page->shadow_flags & SHF_L1_32 )
1083 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 2)
1084 (v, gmfn, entry, size);
1085 if ( page->shadow_flags & SHF_L2_32 )
1086 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 2)
1087 (v, gmfn, entry, size);
1089 if ( page->shadow_flags & SHF_L1_PAE )
1090 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 3)
1091 (v, gmfn, entry, size);
1092 if ( page->shadow_flags & SHF_L2_PAE )
1093 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 3)
1094 (v, gmfn, entry, size);
1095 if ( page->shadow_flags & SHF_L2H_PAE )
1096 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2he, 3)
1097 (v, gmfn, entry, size);
1099 #if CONFIG_PAGING_LEVELS >= 4
1100 if ( page->shadow_flags & SHF_L1_64 )
1101 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 4)
1102 (v, gmfn, entry, size);
1103 if ( page->shadow_flags & SHF_L2_64 )
1104 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 4)
1105 (v, gmfn, entry, size);
1106 if ( page->shadow_flags & SHF_L2H_64 )
1107 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2he, 4)
1108 (v, gmfn, entry, size);
1109 if ( page->shadow_flags & SHF_L3_64 )
1110 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl3e, 4)
1111 (v, gmfn, entry, size);
1112 if ( page->shadow_flags & SHF_L4_64 )
1113 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl4e, 4)
1114 (v, gmfn, entry, size);
1115 #else /* 32-bit hypervisor does not support 64-bit guests */
1116 ASSERT((page->shadow_flags
1117 & (SHF_L4_64|SHF_L3_64|SHF_L2H_64|SHF_L2_64|SHF_L1_64)) == 0);
1118 #endif
1119 this_cpu(trace_shadow_path_flags) |= (result<<(TRCE_SFLAG_SET_CHANGED));
1121 return result;
1125 void
1126 sh_validate_guest_pt_write(struct vcpu *v, mfn_t gmfn,
1127 void *entry, u32 size)
1128 /* This is the entry point for emulated writes to pagetables in HVM guests and
1129 * PV translated guests.
1130 */
1132 struct domain *d = v->domain;
1133 int rc;
1135 ASSERT(shadow_locked_by_me(v->domain));
1136 rc = sh_validate_guest_entry(v, gmfn, entry, size);
1137 if ( rc & SHADOW_SET_FLUSH )
1138 /* Need to flush TLBs to pick up shadow PT changes */
1139 flush_tlb_mask(d->domain_dirty_cpumask);
1140 if ( rc & SHADOW_SET_ERROR )
1142 /* This page is probably not a pagetable any more: tear it out of the
1143 * shadows, along with any tables that reference it.
1144 * Since the validate call above will have made a "safe" (i.e. zero)
1145 * shadow entry, we can let the domain live even if we can't fully
1146 * unshadow the page. */
1147 sh_remove_shadows(v, gmfn, 0, 0);
1151 int shadow_write_guest_entry(struct vcpu *v, intpte_t *p,
1152 intpte_t new, mfn_t gmfn)
1153 /* Write a new value into the guest pagetable, and update the shadows
1154 * appropriately. Returns 0 if we page-faulted, 1 for success. */
1156 int failed;
1157 shadow_lock(v->domain);
1158 failed = __copy_to_user(p, &new, sizeof(new));
1159 if ( failed != sizeof(new) )
1160 sh_validate_guest_entry(v, gmfn, p, sizeof(new));
1161 shadow_unlock(v->domain);
1162 return (failed == 0);
1165 int shadow_cmpxchg_guest_entry(struct vcpu *v, intpte_t *p,
1166 intpte_t *old, intpte_t new, mfn_t gmfn)
1167 /* Cmpxchg a new value into the guest pagetable, and update the shadows
1168 * appropriately. Returns 0 if we page-faulted, 1 if not.
1169 * N.B. caller should check the value of "old" to see if the
1170 * cmpxchg itself was successful. */
1172 int failed;
1173 intpte_t t = *old;
1174 shadow_lock(v->domain);
1175 failed = cmpxchg_user(p, t, new);
1176 if ( t == *old )
1177 sh_validate_guest_entry(v, gmfn, p, sizeof(new));
1178 *old = t;
1179 shadow_unlock(v->domain);
1180 return (failed == 0);
1184 /**************************************************************************/
1185 /* Memory management for shadow pages. */
1187 /* Allocating shadow pages
1188 * -----------------------
1190 * Most shadow pages are allocated singly, but there is one case where
1191 * we need to allocate multiple pages together: shadowing 32-bit guest
1192 * tables on PAE or 64-bit shadows. A 32-bit guest l1 table covers 4MB
1193 * of virtual address space, and needs to be shadowed by two PAE/64-bit
1194 * l1 tables (covering 2MB of virtual address space each). Similarly, a
1195 * 32-bit guest l2 table (4GB va) needs to be shadowed by four
1196 * PAE/64-bit l2 tables (1GB va each). These multi-page shadows are
1197 * contiguous and aligned; functions for handling offsets into them are
1198 * defined in shadow.c (shadow_l1_index() etc.)
1200 * This table shows the allocation behaviour of the different modes:
1202 * Xen paging      pae   pae   64b   64b   64b
1203 * Guest paging    32b   pae   32b   pae   64b
1204 * PV or HVM       HVM    *    HVM   HVM    *
1205 * Shadow paging   pae   pae   pae   pae   64b
1207 * sl1 size         8k    4k    8k    4k    4k
1208 * sl2 size        16k    4k   16k    4k    4k
1209 * sl3 size         -     -     -     -     4k
1210 * sl4 size         -     -     -     -     4k
1212 * We allocate memory from xen in four-page units and break them down
1213 * with a simple buddy allocator. Can't use the xen allocator to handle
1214 * this as it only works for contiguous zones, and a domain's shadow
1215 * pool is made of fragments.
1217 * In HVM guests, the p2m table is built out of shadow pages, and we provide
1218 * a function for the p2m management to steal pages, in max-order chunks, from
1219 * the free pool. We don't provide for giving them back, yet.
1220 */
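/* Illustrative sketch (not part of the original file): how the table above
 * maps onto allocation orders.  A 32-bit guest l2 needs a 16k shadow, i.e.
 * four contiguous pages, so shadow_order() (defined below) gives order 2;
 * an 8k sl1 is order 1; single-page shadows are order 0. */
#if 0
static void shadow_order_sketch(void)
{
    ASSERT(shadow_order(SH_type_l2_32_shadow) == 2);   /* 16k: 4 pages */
    ASSERT(shadow_order(SH_type_l1_32_shadow) == 1);   /*  8k: 2 pages */
    ASSERT(shadow_order(SH_type_l1_pae_shadow) == 0);  /*  4k: 1 page  */
}
#endif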
1222 /* Figure out the least acceptable quantity of shadow memory.
1223 * The minimum memory requirement for always being able to free up a
1224 * chunk of memory is very small -- only three max-order chunks per
1225 * vcpu to hold the top level shadows and pages with Xen mappings in them.
1227 * But for a guest to be guaranteed to successfully execute a single
1228 * instruction, we must be able to map a large number (about thirty) VAs
1229 * at the same time, which means that to guarantee progress, we must
1230 * allow for more than ninety allocated pages per vcpu. We round that
1231 * up to 128 pages, or half a megabyte per vcpu. */
1232 static unsigned int shadow_min_acceptable_pages(struct domain *d)
1233 {
1234 u32 vcpu_count = 0;
1235 struct vcpu *v;
1237 for_each_vcpu(d, v)
1238 vcpu_count++;
1240 return (vcpu_count * 128);
1241 }
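/* Illustrative arithmetic (not part of the original file): 128 pages per
 * vcpu at 4k per page is 128 * 4k = 512k, the "half a megabyte per vcpu"
 * mentioned above; e.g. a 4-vcpu guest gets a floor of 512 shadow pages. */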
1243 /* Figure out the order of allocation needed for a given shadow type */
1244 static inline u32
1245 shadow_order(unsigned int shadow_type)
1247 static const u32 type_to_order[SH_type_unused] = {
1248 0, /* SH_type_none */
1249 1, /* SH_type_l1_32_shadow */
1250 1, /* SH_type_fl1_32_shadow */
1251 2, /* SH_type_l2_32_shadow */
1252 0, /* SH_type_l1_pae_shadow */
1253 0, /* SH_type_fl1_pae_shadow */
1254 0, /* SH_type_l2_pae_shadow */
1255 0, /* SH_type_l2h_pae_shadow */
1256 0, /* SH_type_l1_64_shadow */
1257 0, /* SH_type_fl1_64_shadow */
1258 0, /* SH_type_l2_64_shadow */
1259 0, /* SH_type_l2h_64_shadow */
1260 0, /* SH_type_l3_64_shadow */
1261 0, /* SH_type_l4_64_shadow */
1262 2, /* SH_type_p2m_table */
1263 0, /* SH_type_monitor_table */
1264 0 /* SH_type_oos_snapshot */
1265 };
1266 ASSERT(shadow_type < SH_type_unused);
1267 return type_to_order[shadow_type];
1270 static inline unsigned int
1271 shadow_max_order(struct domain *d)
1273 return is_hvm_domain(d) ? SHADOW_MAX_ORDER : 0;
1276 /* Do we have at total of count pages of the requested order free? */
1277 static inline int space_is_available(
1278 struct domain *d,
1279 unsigned int order,
1280 unsigned int count)
1282 for ( ; order <= shadow_max_order(d); ++order )
1284 unsigned int n = count;
1285 const struct list_head *p;
1287 list_for_each ( p, &d->arch.paging.shadow.freelists[order] )
1288 if ( --n == 0 )
1289 return 1;
1290 count = (count + 1) >> 1;
1293 return 0;
1296 /* Dispatcher function: call the per-mode function that will unhook the
1297 * non-Xen mappings in this top-level shadow mfn */
1298 static void shadow_unhook_mappings(struct vcpu *v, mfn_t smfn)
1300 struct shadow_page_info *sp = mfn_to_shadow_page(smfn);
1301 switch ( sp->type )
1303 case SH_type_l2_32_shadow:
1304 SHADOW_INTERNAL_NAME(sh_unhook_32b_mappings, 2)(v,smfn);
1305 break;
1306 case SH_type_l2_pae_shadow:
1307 case SH_type_l2h_pae_shadow:
1308 SHADOW_INTERNAL_NAME(sh_unhook_pae_mappings, 3)(v,smfn);
1309 break;
1310 #if CONFIG_PAGING_LEVELS >= 4
1311 case SH_type_l4_64_shadow:
1312 SHADOW_INTERNAL_NAME(sh_unhook_64b_mappings, 4)(v,smfn);
1313 break;
1314 #endif
1315 default:
1316 SHADOW_ERROR("top-level shadow has bad type %08x\n", sp->type);
1317 BUG();
1321 static inline void trace_shadow_prealloc_unpin(struct domain *d, mfn_t smfn)
1323 if ( tb_init_done )
1325 /* Convert smfn to gfn */
1326 unsigned long gfn;
1327 ASSERT(mfn_valid(smfn));
1328 gfn = mfn_to_gfn(d, _mfn(mfn_to_shadow_page(smfn)->backpointer));
1329 __trace_var(TRC_SHADOW_PREALLOC_UNPIN, 0/*!tsc*/,
1330 sizeof(gfn), (unsigned char*)&gfn);
1334 /* Make sure there are at least count order-sized pages
1335 * available in the shadow page pool. */
1336 static void _shadow_prealloc(
1337 struct domain *d,
1338 unsigned int order,
1339 unsigned int count)
1341 /* Need a vcpu for calling unpins; for now, since we don't have
1342 * per-vcpu shadows, any will do */
1343 struct vcpu *v, *v2;
1344 struct list_head *l, *t;
1345 struct shadow_page_info *sp;
1346 mfn_t smfn;
1347 int i;
1349 ASSERT(order <= shadow_max_order(d));
1350 if ( space_is_available(d, order, count) ) return;
1352 v = current;
1353 if ( v->domain != d )
1354 v = d->vcpu[0];
1355 ASSERT(v != NULL); /* Shouldn't have enabled shadows if we've no vcpus */
1357 /* Stage one: walk the list of pinned pages, unpinning them */
1358 perfc_incr(shadow_prealloc_1);
1359 list_for_each_backwards_safe(l, t, &d->arch.paging.shadow.pinned_shadows)
1361 sp = list_entry(l, struct shadow_page_info, list);
1362 smfn = shadow_page_to_mfn(sp);
1364 /* Unpin this top-level shadow */
1365 trace_shadow_prealloc_unpin(d, smfn);
1366 sh_unpin(v, smfn);
1368 /* See if that freed up enough space */
1369 if ( space_is_available(d, order, count) ) return;
1372 /* Stage two: all shadow pages are in use in hierarchies that are
1373 * loaded in cr3 on some vcpu. Walk them, unhooking the non-Xen
1374 * mappings. */
1375 perfc_incr(shadow_prealloc_2);
1377 for_each_vcpu(d, v2)
1378 for ( i = 0 ; i < 4 ; i++ )
1380 if ( !pagetable_is_null(v2->arch.shadow_table[i]) )
1382 TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_PREALLOC_UNHOOK);
1383 shadow_unhook_mappings(v,
1384 pagetable_get_mfn(v2->arch.shadow_table[i]));
1386 /* See if that freed up enough space */
1387 if ( space_is_available(d, order, count) )
1389 flush_tlb_mask(d->domain_dirty_cpumask);
1390 return;
1395 /* Nothing more we can do: all remaining shadows are of pages that
1396 * hold Xen mappings for some vcpu. This can never happen. */
1397 SHADOW_ERROR("Can't pre-allocate %u order-%u shadow pages!\n"
1398 " shadow pages total = %u, free = %u, p2m=%u\n",
1399 count, order,
1400 d->arch.paging.shadow.total_pages,
1401 d->arch.paging.shadow.free_pages,
1402 d->arch.paging.shadow.p2m_pages);
1403 BUG();
1406 /* Make sure there are at least count pages of the order according to
1407 * type available in the shadow page pool.
1408 * This must be called before any calls to shadow_alloc(). Since this
1409 * will free existing shadows to make room, it must be called early enough
1410 * to avoid freeing shadows that the caller is currently working on. */
1411 void shadow_prealloc(struct domain *d, u32 type, unsigned int count)
1413 ASSERT(type != SH_type_p2m_table);
1414 return _shadow_prealloc(d, shadow_order(type), count);
1417 /* Deliberately free all the memory we can: this will tear down all of
1418 * this domain's shadows */
1419 static void shadow_blow_tables(struct domain *d)
1421 struct list_head *l, *t;
1422 struct shadow_page_info *sp;
1423 struct vcpu *v = d->vcpu[0];
1424 mfn_t smfn;
1425 int i;
1427 ASSERT(v != NULL);
1429 /* Pass one: unpin all pinned pages */
1430 list_for_each_backwards_safe(l,t, &d->arch.paging.shadow.pinned_shadows)
1432 sp = list_entry(l, struct shadow_page_info, list);
1433 smfn = shadow_page_to_mfn(sp);
1434 sh_unpin(v, smfn);
1437 /* Second pass: unhook entries of in-use shadows */
1438 for_each_vcpu(d, v)
1439 for ( i = 0 ; i < 4 ; i++ )
1440 if ( !pagetable_is_null(v->arch.shadow_table[i]) )
1441 shadow_unhook_mappings(v,
1442 pagetable_get_mfn(v->arch.shadow_table[i]));
1444 /* Make sure everyone sees the unshadowings */
1445 flush_tlb_mask(d->domain_dirty_cpumask);
1448 void shadow_blow_tables_per_domain(struct domain *d)
1450 if ( shadow_mode_enabled(d) && d->vcpu[0] != NULL ) {
1451 shadow_lock(d);
1452 shadow_blow_tables(d);
1453 shadow_unlock(d);
1457 #ifndef NDEBUG
1458 /* Blow all shadows of all shadowed domains: this can be used to cause the
1459 * guest's pagetables to be re-shadowed if we suspect that the shadows
1460 * have somehow got out of sync */
1461 static void shadow_blow_all_tables(unsigned char c)
1463 struct domain *d;
1464 printk("'%c' pressed -> blowing all shadow tables\n", c);
1465 rcu_read_lock(&domlist_read_lock);
1466 for_each_domain(d)
1468 if ( shadow_mode_enabled(d) && d->vcpu[0] != NULL )
1470 shadow_lock(d);
1471 shadow_blow_tables(d);
1472 shadow_unlock(d);
1475 rcu_read_unlock(&domlist_read_lock);
1478 /* Register this function in the Xen console keypress table */
1479 static __init int shadow_blow_tables_keyhandler_init(void)
1481 register_keyhandler('S', shadow_blow_all_tables,"reset shadow pagetables");
1482 return 0;
1484 __initcall(shadow_blow_tables_keyhandler_init);
1485 #endif /* !NDEBUG */
1487 /* Allocate another shadow's worth of (contiguous, aligned) pages,
1488 * and fill in the type and backpointer fields of their page_infos.
1489 * Never fails to allocate. */
1490 mfn_t shadow_alloc(struct domain *d,
1491 u32 shadow_type,
1492 unsigned long backpointer)
1494 struct shadow_page_info *sp = NULL;
1495 unsigned int order = shadow_order(shadow_type);
1496 cpumask_t mask;
1497 void *p;
1498 int i;
1500 ASSERT(shadow_locked_by_me(d));
1501 if (shadow_type == SH_type_p2m_table && order > shadow_max_order(d))
1502 order = shadow_max_order(d);
1503 ASSERT(order <= shadow_max_order(d));
1504 ASSERT(shadow_type != SH_type_none);
1505 perfc_incr(shadow_alloc);
1507 /* Find smallest order which can satisfy the request. */
1508 for ( i = order; i <= SHADOW_MAX_ORDER; i++ )
1509 if ( !list_empty(&d->arch.paging.shadow.freelists[i]) )
1510 goto found;
1512 /* If we get here, we failed to allocate. This should never happen.
1513 * It means that we didn't call shadow_prealloc() correctly before
1514 * we allocated. We can't recover by calling prealloc here, because
1515 * we might free up higher-level pages that the caller is working on. */
1516 SHADOW_ERROR("Can't allocate %i shadow pages!\n", 1 << order);
1517 BUG();
1519 found:
1520 sp = list_entry(d->arch.paging.shadow.freelists[i].next,
1521 struct shadow_page_info, list);
1522 list_del(&sp->list);
1524 /* We may have to halve the chunk a number of times. */
1525 while ( i != order )
1527 i--;
1528 sp->order = i;
1529 list_add_tail(&sp->list, &d->arch.paging.shadow.freelists[i]);
1530 sp += 1 << i;
1532 d->arch.paging.shadow.free_pages -= 1 << order;
1534 /* Init page info fields and clear the pages */
1535 for ( i = 0; i < 1<<order ; i++ )
1537 /* Before we overwrite the old contents of this page,
1538 * we need to be sure that no TLB holds a pointer to it. */
1539 mask = d->domain_dirty_cpumask;
1540 tlbflush_filter(mask, sp[i].tlbflush_timestamp);
1541 if ( unlikely(!cpus_empty(mask)) )
1543 perfc_incr(shadow_alloc_tlbflush);
1544 flush_tlb_mask(mask);
1546 /* Now safe to clear the page for reuse */
1547 p = sh_map_domain_page(shadow_page_to_mfn(sp+i));
1548 ASSERT(p != NULL);
1549 clear_page(p);
1550 sh_unmap_domain_page(p);
1551 INIT_LIST_HEAD(&sp[i].list);
1552 sp[i].type = shadow_type;
1553 sp[i].pinned = 0;
1554 sp[i].count = 0;
1555 sp[i].backpointer = backpointer;
1556 sp[i].next_shadow = NULL;
1557 perfc_incr(shadow_alloc_count);
1559 return shadow_page_to_mfn(sp);
1563 /* Return some shadow pages to the pool. */
1564 void shadow_free(struct domain *d, mfn_t smfn)
1566 struct shadow_page_info *sp = mfn_to_shadow_page(smfn);
1567 u32 shadow_type;
1568 unsigned long order;
1569 unsigned long mask;
1570 int i;
1572 ASSERT(shadow_locked_by_me(d));
1573 perfc_incr(shadow_free);
1575 shadow_type = sp->type;
1576 ASSERT(shadow_type != SH_type_none);
1577 ASSERT(shadow_type != SH_type_p2m_table);
1578 order = shadow_order(shadow_type);
1580 d->arch.paging.shadow.free_pages += 1 << order;
1582 for ( i = 0; i < 1<<order; i++ )
1584 #if SHADOW_OPTIMIZATIONS & (SHOPT_WRITABLE_HEURISTIC | SHOPT_FAST_EMULATION)
1585 struct vcpu *v;
1586 for_each_vcpu(d, v)
1588 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
1589 /* No longer safe to look for a writeable mapping in this shadow */
1590 if ( v->arch.paging.shadow.last_writeable_pte_smfn == mfn_x(smfn) + i )
1591 v->arch.paging.shadow.last_writeable_pte_smfn = 0;
1592 #endif
1593 #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
1594 v->arch.paging.last_write_emul_ok = 0;
1595 #endif
1597 #endif
1598 /* Strip out the type: this is now a free shadow page */
1599 sp[i].type = 0;
1600 /* Remember the TLB timestamp so we will know whether to flush
1601 * TLBs when we reuse the page. Because the destructors leave the
1602 * contents of the pages in place, we can delay TLB flushes until
1603 * just before the allocator hands the page out again. */
1604 sp[i].tlbflush_timestamp = tlbflush_current_time();
1605 perfc_decr(shadow_alloc_count);
1608 /* Merge chunks as far as possible. */
1609 for ( ; order < shadow_max_order(d); ++order )
1611 mask = 1 << order;
1612 if ( (mfn_x(shadow_page_to_mfn(sp)) & mask) ) {
1613 /* Merge with predecessor block? */
1614 if ( ((sp-mask)->type != PGT_none) || ((sp-mask)->order != order) )
1615 break;
1616 list_del(&(sp-mask)->list);
1617 sp -= mask;
1618 } else {
1619 /* Merge with successor block? */
1620 if ( ((sp+mask)->type != PGT_none) || ((sp+mask)->order != order) )
1621 break;
1622 list_del(&(sp+mask)->list);
1626 sp->order = order;
1627 list_add_tail(&sp->list, &d->arch.paging.shadow.freelists[order]);
1630 /* Divert some memory from the pool to be used by the p2m mapping.
1631 * This action is irreversible: the p2m mapping only ever grows.
1632 * That's OK because the p2m table only exists for translated domains,
1633 * and those domains can't ever turn off shadow mode.
1634 * Also, we only ever allocate a max-order chunk, so as to preserve
1635 * the invariant that shadow_prealloc() always works.
1636 * Returns 0 iff it can't get a chunk (the caller should then
1637 * free up some pages in domheap and call sh_set_allocation);
1638 * returns non-zero on success.
1639 */
1640 static int
1641 sh_alloc_p2m_pages(struct domain *d)
1643 struct page_info *pg;
1644 u32 i;
1645 unsigned int order = shadow_max_order(d);
1647 ASSERT(shadow_locked_by_me(d));
1649 if ( d->arch.paging.shadow.total_pages
1650 < (shadow_min_acceptable_pages(d) + (1 << order)) )
1651 return 0; /* Not enough shadow memory: need to increase it first */
1653 pg = mfn_to_page(shadow_alloc(d, SH_type_p2m_table, 0));
1654 d->arch.paging.shadow.p2m_pages += (1 << order);
1655 d->arch.paging.shadow.total_pages -= (1 << order);
1656 for (i = 0; i < (1U << order); i++)
1658 /* Unlike shadow pages, mark p2m pages as owned by the domain.
1659 * Marking the domain as the owner would normally allow the guest to
1660 * create mappings of these pages, but these p2m pages will never be
1661 * in the domain's guest-physical address space, and so that is not
1662 * believed to be a concern.
1663 */
1664 page_set_owner(&pg[i], d);
1665 pg[i].count_info = 1;
1666 list_add_tail(&pg[i].list, &d->arch.paging.shadow.p2m_freelist);
1668 return 1;
1671 // Returns NULL if no memory is available...
1672 static struct page_info *
1673 shadow_alloc_p2m_page(struct domain *d)
1675 struct list_head *entry;
1676 struct page_info *pg;
1677 mfn_t mfn;
1678 void *p;
1680 shadow_lock(d);
1682 if ( list_empty(&d->arch.paging.shadow.p2m_freelist) &&
1683 !sh_alloc_p2m_pages(d) )
1685 shadow_unlock(d);
1686 return NULL;
1688 entry = d->arch.paging.shadow.p2m_freelist.next;
1689 list_del(entry);
1691 shadow_unlock(d);
1693 pg = list_entry(entry, struct page_info, list);
1694 mfn = page_to_mfn(pg);
1695 p = sh_map_domain_page(mfn);
1696 clear_page(p);
1697 sh_unmap_domain_page(p);
1699 return pg;
1702 static void
1703 shadow_free_p2m_page(struct domain *d, struct page_info *pg)
1705 ASSERT(page_get_owner(pg) == d);
1706 /* Should have just the one ref we gave it in alloc_p2m_page() */
1707 if ( (pg->count_info & PGC_count_mask) != 1 )
1709 SHADOW_ERROR("Odd p2m page count c=%#x t=%"PRtype_info"\n",
1710 pg->count_info, pg->u.inuse.type_info);
1712 pg->count_info = 0;
1713 /* Free should not decrement domain's total allocation, since
1714 * these pages were allocated without an owner. */
1715 page_set_owner(pg, NULL);
1716 #if defined(__x86_64__)
1717 spin_lock_init(&pg->lock);
1718 #endif
1719 free_domheap_pages(pg, 0);
1720 d->arch.paging.shadow.p2m_pages--;
1721 perfc_decr(shadow_alloc_count);
1724 #if CONFIG_PAGING_LEVELS == 3
1725 static void p2m_install_entry_in_monitors(struct domain *d,
1726 l3_pgentry_t *l3e)
1727 /* Special case, only used for external-mode domains on PAE hosts:
1728 * update the mapping of the p2m table. Once again, this is trivial in
1729 * other paging modes (one top-level entry points to the top-level p2m,
1730 * no maintenance needed), but PAE makes life difficult by needing to
1731 * copy the eight l3es of the p2m table into eight l2h slots in the
1732 * monitor table. This function makes fresh copies when a p2m l3e
1733 * changes. */
1735 l2_pgentry_t *ml2e;
1736 struct vcpu *v;
1737 unsigned int index;
1739 index = ((unsigned long)l3e & ~PAGE_MASK) / sizeof(l3_pgentry_t);
1740 ASSERT(index < MACHPHYS_MBYTES>>1);
1742 for_each_vcpu(d, v)
1744 if ( pagetable_get_pfn(v->arch.monitor_table) == 0 )
1745 continue;
1746 ASSERT(shadow_mode_external(v->domain));
1748 SHADOW_DEBUG(P2M, "d=%u v=%u index=%u mfn=%#lx\n",
1749 d->domain_id, v->vcpu_id, index, l3e_get_pfn(*l3e));
1751 if ( v == current ) /* OK to use linear map of monitor_table */
1752 ml2e = __linear_l2_table + l2_linear_offset(RO_MPT_VIRT_START);
1753 else
1755 l3_pgentry_t *ml3e;
1756 ml3e = sh_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
1757 ASSERT(l3e_get_flags(ml3e[3]) & _PAGE_PRESENT);
1758 ml2e = sh_map_domain_page(_mfn(l3e_get_pfn(ml3e[3])));
1759 ml2e += l2_table_offset(RO_MPT_VIRT_START);
1760 sh_unmap_domain_page(ml3e);
1762 ml2e[index] = l2e_from_pfn(l3e_get_pfn(*l3e), __PAGE_HYPERVISOR);
1763 if ( v != current )
1764 sh_unmap_domain_page(ml2e);
1767 #endif
1769 /* Set the pool of shadow pages to the required number of pages.
1770 * Input will be rounded up to at least shadow_min_acceptable_pages(),
1771 * plus space for the p2m table.
1772 * Returns 0 for success, non-zero for failure. */
1773 static unsigned int sh_set_allocation(struct domain *d,
1774 unsigned int pages,
1775 int *preempted)
1777 struct shadow_page_info *sp;
1778 unsigned int lower_bound;
1779 unsigned int j, order = shadow_max_order(d);
1781 ASSERT(shadow_locked_by_me(d));
1783 /* Don't allocate less than the minimum acceptable, plus one page per
1784 * megabyte of RAM (for the p2m table) */
1785 lower_bound = shadow_min_acceptable_pages(d) + (d->tot_pages / 256);
1786 if ( pages > 0 && pages < lower_bound )
1787 pages = lower_bound;
1788 /* Round up to largest block size */
1789 pages = (pages + ((1<<SHADOW_MAX_ORDER)-1)) & ~((1<<SHADOW_MAX_ORDER)-1);
1791 SHADOW_PRINTK("current %i target %i\n",
1792 d->arch.paging.shadow.total_pages, pages);
1794 while ( d->arch.paging.shadow.total_pages != pages )
1796 if ( d->arch.paging.shadow.total_pages < pages )
1798 /* Need to allocate more memory from domheap */
1799 sp = (struct shadow_page_info *)
1800 alloc_domheap_pages(NULL, order, MEMF_node(domain_to_node(d)));
1801 if ( sp == NULL )
1803 SHADOW_PRINTK("failed to allocate shadow pages.\n");
1804 return -ENOMEM;
1806 d->arch.paging.shadow.free_pages += 1 << order;
1807 d->arch.paging.shadow.total_pages += 1 << order;
1808 for ( j = 0; j < 1U << order; j++ )
1810 sp[j].type = 0;
1811 sp[j].pinned = 0;
1812 sp[j].count = 0;
1813 sp[j].mbz = 0;
1814 sp[j].tlbflush_timestamp = 0; /* Not in any TLB */
1816 sp->order = order;
1817 list_add_tail(&sp->list, &d->arch.paging.shadow.freelists[order]);
1819 else if ( d->arch.paging.shadow.total_pages > pages )
1821 /* Need to return memory to domheap */
1822 _shadow_prealloc(d, order, 1);
1823 ASSERT(!list_empty(&d->arch.paging.shadow.freelists[order]));
1824 sp = list_entry(d->arch.paging.shadow.freelists[order].next,
1825 struct shadow_page_info, list);
1826 list_del(&sp->list);
1827 #if defined(__x86_64__)
1828 /*
1829 * Re-instate the lock field, which we overwrote with shadow_page_info.
1830 * This was safe, since the lock is only used on guest pages.
1831 */
1832 for ( j = 0; j < 1U << order; j++ )
1833 spin_lock_init(&((struct page_info *)sp)[j].lock);
1834 #endif
1835 d->arch.paging.shadow.free_pages -= 1 << order;
1836 d->arch.paging.shadow.total_pages -= 1 << order;
1837 free_domheap_pages((struct page_info *)sp, order);
1840 /* Check to see if we need to yield and try again */
1841 if ( preempted && hypercall_preempt_check() )
1843 *preempted = 1;
1844 return 0;
1848 return 0;
1851 /* Return the size of the shadow pool, rounded up to the nearest MB */
1852 static unsigned int shadow_get_allocation(struct domain *d)
1854 unsigned int pg = d->arch.paging.shadow.total_pages;
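/* With 4kB pages, 20 - PAGE_SHIFT == 8: divide by the 256 pages that make
 * up one MB, rounding up if there is any remainder */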
1855 return ((pg >> (20 - PAGE_SHIFT))
1856 + ((pg & ((1 << (20 - PAGE_SHIFT)) - 1)) ? 1 : 0));
1859 /**************************************************************************/
1860 /* Hash table for storing the guest->shadow mappings.
1861 * The table itself is an array of pointers to shadows; the shadows are then
1862 * threaded on a singly-linked list of shadows with the same hash value */
1864 #define SHADOW_HASH_BUCKETS 251
1865 /* Other possibly useful primes are 509, 1021, 2039, 4093, 8191, 16381 */
1867 /* Hash function that takes a gfn or mfn, plus another byte of type info */
1868 typedef u32 key_t;
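/* The per-byte mixing step below, k = byte + (k<<6) + (k<<16) - k, is the
 * well-known "sdbm" string hash; seeding it with the shadow type means
 * different shadow types of the same frame usually land in different
 * buckets */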
1869 static inline key_t sh_hash(unsigned long n, unsigned int t)
1871 unsigned char *p = (unsigned char *)&n;
1872 key_t k = t;
1873 int i;
1874 for ( i = 0; i < sizeof(n) ; i++ ) k = (u32)p[i] + (k<<6) + (k<<16) - k;
1875 return k % SHADOW_HASH_BUCKETS;
1878 #if SHADOW_AUDIT & (SHADOW_AUDIT_HASH|SHADOW_AUDIT_HASH_FULL)
1880 /* Before we get to the mechanism, define a pair of audit functions
1881 * that sanity-check the contents of the hash table. */
1882 static void sh_hash_audit_bucket(struct domain *d, int bucket)
1883 /* Audit one bucket of the hash table */
1885 struct shadow_page_info *sp, *x;
1887 if ( !(SHADOW_AUDIT_ENABLE) )
1888 return;
1890 sp = d->arch.paging.shadow.hash_table[bucket];
1891 while ( sp )
1893 /* Not a shadow? */
1894 BUG_ON( sp->mbz != 0 );
1895 /* Bogus type? */
1896 BUG_ON( sp->type == 0 );
1897 BUG_ON( sp->type > SH_type_max_shadow );
1898 /* Wrong bucket? */
1899 BUG_ON( sh_hash(sp->backpointer, sp->type) != bucket );
1900 /* Duplicate entry? */
1901 for ( x = sp->next_shadow; x; x = x->next_shadow )
1902 BUG_ON( x->backpointer == sp->backpointer && x->type == sp->type );
1903 /* Follow the backpointer to the guest pagetable */
1904 if ( sp->type != SH_type_fl1_32_shadow
1905 && sp->type != SH_type_fl1_pae_shadow
1906 && sp->type != SH_type_fl1_64_shadow )
1908 struct page_info *gpg = mfn_to_page(_mfn(sp->backpointer));
1909 /* Bad shadow flags on guest page? */
1910 BUG_ON( !(gpg->shadow_flags & (1<<sp->type)) );
1911 /* Bad type count on guest page? */
1912 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
1913 if ( sp->type == SH_type_l1_32_shadow
1914 || sp->type == SH_type_l1_pae_shadow
1915 || sp->type == SH_type_l1_64_shadow )
1917 if ( (gpg->u.inuse.type_info & PGT_type_mask) == PGT_writable_page
1918 && (gpg->u.inuse.type_info & PGT_count_mask) != 0 )
1920 if ( !page_is_out_of_sync(gpg) )
1922 SHADOW_ERROR("MFN %#lx shadowed (by %#"PRI_mfn")"
1923 " and not OOS but has typecount %#lx\n",
1924 sp->backpointer,
1925 mfn_x(shadow_page_to_mfn(sp)),
1926 gpg->u.inuse.type_info);
1927 BUG();
1931 else /* Not an l1 */
1932 #endif
1933 if ( (gpg->u.inuse.type_info & PGT_type_mask) == PGT_writable_page
1934 && (gpg->u.inuse.type_info & PGT_count_mask) != 0 )
1936 SHADOW_ERROR("MFN %#lx shadowed (by %#"PRI_mfn")"
1937 " but has typecount %#lx\n",
1938 sp->backpointer, mfn_x(shadow_page_to_mfn(sp)),
1939 gpg->u.inuse.type_info);
1940 BUG();
1943 /* That entry was OK; on we go */
1944 sp = sp->next_shadow;
1948 #else
1949 #define sh_hash_audit_bucket(_d, _b) do {} while(0)
1950 #endif /* Hashtable bucket audit */
1953 #if SHADOW_AUDIT & SHADOW_AUDIT_HASH_FULL
1955 static void sh_hash_audit(struct domain *d)
1956 /* Full audit: audit every bucket in the table */
1958 int i;
1960 if ( !(SHADOW_AUDIT_ENABLE) )
1961 return;
1963 for ( i = 0; i < SHADOW_HASH_BUCKETS; i++ )
1965 sh_hash_audit_bucket(d, i);
1969 #else
1970 #define sh_hash_audit(_d) do {} while(0)
1971 #endif /* Hashtable full audit */
1973 /* Allocate and initialise the table itself.
1974 * Returns 0 for success, 1 for error. */
1975 static int shadow_hash_alloc(struct domain *d)
1977 struct shadow_page_info **table;
1979 ASSERT(shadow_locked_by_me(d));
1980 ASSERT(!d->arch.paging.shadow.hash_table);
1982 table = xmalloc_array(struct shadow_page_info *, SHADOW_HASH_BUCKETS);
1983 if ( !table ) return 1;
1984 memset(table, 0,
1985 SHADOW_HASH_BUCKETS * sizeof (struct shadow_page_info *));
1986 d->arch.paging.shadow.hash_table = table;
1987 return 0;
1990 /* Tear down the hash table and return all memory to Xen.
1991 * This function does not care whether the table is populated. */
1992 static void shadow_hash_teardown(struct domain *d)
1994 ASSERT(shadow_locked_by_me(d));
1995 ASSERT(d->arch.paging.shadow.hash_table);
1997 xfree(d->arch.paging.shadow.hash_table);
1998 d->arch.paging.shadow.hash_table = NULL;
2002 mfn_t shadow_hash_lookup(struct vcpu *v, unsigned long n, unsigned int t)
2003 /* Find an entry in the hash table. Returns the MFN of the shadow,
2004 * or INVALID_MFN if it doesn't exist */
2006 struct domain *d = v->domain;
2007 struct shadow_page_info *sp, *prev;
2008 key_t key;
2010 ASSERT(shadow_locked_by_me(d));
2011 ASSERT(d->arch.paging.shadow.hash_table);
2012 ASSERT(t);
2014 sh_hash_audit(d);
2016 perfc_incr(shadow_hash_lookups);
2017 key = sh_hash(n, t);
2018 sh_hash_audit_bucket(d, key);
2020 sp = d->arch.paging.shadow.hash_table[key];
2021 prev = NULL;
2022 while(sp)
2024 if ( sp->backpointer == n && sp->type == t )
2026 /* Pull-to-front if 'sp' isn't already the head item */
2027 if ( unlikely(sp != d->arch.paging.shadow.hash_table[key]) )
2029 if ( unlikely(d->arch.paging.shadow.hash_walking != 0) )
2030 /* Can't reorder: someone is walking the hash chains */
2031 return shadow_page_to_mfn(sp);
2032 else
2034 ASSERT(prev);
2035 /* Delete sp from the list */
2036 prev->next_shadow = sp->next_shadow;
2037 /* Re-insert it at the head of the list */
2038 sp->next_shadow = d->arch.paging.shadow.hash_table[key];
2039 d->arch.paging.shadow.hash_table[key] = sp;
2042 else
2044 perfc_incr(shadow_hash_lookup_head);
2046 return shadow_page_to_mfn(sp);
2048 prev = sp;
2049 sp = sp->next_shadow;
2052 perfc_incr(shadow_hash_lookup_miss);
2053 return _mfn(INVALID_MFN);
2056 void shadow_hash_insert(struct vcpu *v, unsigned long n, unsigned int t,
2057 mfn_t smfn)
2058 /* Put a mapping (n,t)->smfn into the hash table */
2060 struct domain *d = v->domain;
2061 struct shadow_page_info *sp;
2062 key_t key;
2064 ASSERT(shadow_locked_by_me(d));
2065 ASSERT(d->arch.paging.shadow.hash_table);
2066 ASSERT(t);
2068 sh_hash_audit(d);
2070 perfc_incr(shadow_hash_inserts);
2071 key = sh_hash(n, t);
2072 sh_hash_audit_bucket(d, key);
2074 /* Insert this shadow at the top of the bucket */
2075 sp = mfn_to_shadow_page(smfn);
2076 sp->next_shadow = d->arch.paging.shadow.hash_table[key];
2077 d->arch.paging.shadow.hash_table[key] = sp;
2079 sh_hash_audit_bucket(d, key);
2082 void shadow_hash_delete(struct vcpu *v, unsigned long n, unsigned int t,
2083 mfn_t smfn)
2084 /* Excise the mapping (n,t)->smfn from the hash table */
2086 struct domain *d = v->domain;
2087 struct shadow_page_info *sp, *x;
2088 key_t key;
2090 ASSERT(shadow_locked_by_me(d));
2091 ASSERT(d->arch.paging.shadow.hash_table);
2092 ASSERT(t);
2094 sh_hash_audit(d);
2096 perfc_incr(shadow_hash_deletes);
2097 key = sh_hash(n, t);
2098 sh_hash_audit_bucket(d, key);
2100 sp = mfn_to_shadow_page(smfn);
2101 if ( d->arch.paging.shadow.hash_table[key] == sp )
2102 /* Easy case: we're deleting the head item. */
2103 d->arch.paging.shadow.hash_table[key] = sp->next_shadow;
2104 else
2106 /* Need to search for the one we want */
2107 x = d->arch.paging.shadow.hash_table[key];
2108 while ( 1 )
2110 ASSERT(x); /* We can't have hit the end, since our target is
2111 * still in the chain somewhere... */
2112 if ( x->next_shadow == sp )
2114 x->next_shadow = sp->next_shadow;
2115 break;
2117 x = x->next_shadow;
2120 sp->next_shadow = NULL;
2122 sh_hash_audit_bucket(d, key);
2125 typedef int (*hash_callback_t)(struct vcpu *v, mfn_t smfn, mfn_t other_mfn);
2127 static void hash_foreach(struct vcpu *v,
2128 unsigned int callback_mask,
2129 hash_callback_t callbacks[],
2130 mfn_t callback_mfn)
2131 /* Walk the hash table looking at the types of the entries and
2132 * calling the appropriate callback function for each entry.
2133 * The mask determines which shadow types we call back for, and the array
2134 * of callbacks tells us which function to call.
2135 * Any callback may return non-zero to let us skip the rest of the scan.
2137 * WARNING: Callbacks MUST NOT add or remove hash entries unless they
2138 * then return non-zero to terminate the scan. */
2140 int i, done = 0;
2141 struct domain *d = v->domain;
2142 struct shadow_page_info *x;
2144 /* Say we're here, to stop hash-lookups reordering the chains */
2145 ASSERT(shadow_locked_by_me(d));
2146 ASSERT(d->arch.paging.shadow.hash_walking == 0);
2147 d->arch.paging.shadow.hash_walking = 1;
2149 for ( i = 0; i < SHADOW_HASH_BUCKETS; i++ )
2151 /* WARNING: This is not safe against changes to the hash table.
2152 * The callback *must* return non-zero if it has inserted or
2153 * deleted anything from the hash (lookups are OK, though). */
2154 for ( x = d->arch.paging.shadow.hash_table[i]; x; x = x->next_shadow )
2156 if ( callback_mask & (1 << x->type) )
2158 ASSERT(x->type <= 15);
2159 ASSERT(callbacks[x->type] != NULL);
2160 done = callbacks[x->type](v, shadow_page_to_mfn(x),
2161 callback_mfn);
2162 if ( done ) break;
2165 if ( done ) break;
2167 d->arch.paging.shadow.hash_walking = 0;
2171 /**************************************************************************/
2172 /* Destroy a shadow page: simple dispatcher to call the per-type destructor
2173 * which will decrement refcounts appropriately and return memory to the
2174 * free pool. */
2176 void sh_destroy_shadow(struct vcpu *v, mfn_t smfn)
2178 struct shadow_page_info *sp = mfn_to_shadow_page(smfn);
2179 unsigned int t = sp->type;
2182 SHADOW_PRINTK("smfn=%#lx\n", mfn_x(smfn));
2184 /* Double-check, if we can, that the shadowed page belongs to this
2185 * domain, (by following the back-pointer). */
2186 ASSERT(t == SH_type_fl1_32_shadow ||
2187 t == SH_type_fl1_pae_shadow ||
2188 t == SH_type_fl1_64_shadow ||
2189 t == SH_type_monitor_table ||
2190 (is_pv_32on64_vcpu(v) && t == SH_type_l4_64_shadow) ||
2191 (page_get_owner(mfn_to_page(_mfn(sp->backpointer)))
2192 == v->domain));
2194 /* The shadow type is already a small number, so the switch statement
2195 * below is on nice small numbers that the compiler will enjoy */
2196 switch ( t )
2198 case SH_type_l1_32_shadow:
2199 case SH_type_fl1_32_shadow:
2200 SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 2)(v, smfn);
2201 break;
2202 case SH_type_l2_32_shadow:
2203 SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 2)(v, smfn);
2204 break;
2206 case SH_type_l1_pae_shadow:
2207 case SH_type_fl1_pae_shadow:
2208 SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 3)(v, smfn);
2209 break;
2210 case SH_type_l2_pae_shadow:
2211 case SH_type_l2h_pae_shadow:
2212 SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 3)(v, smfn);
2213 break;
2215 #if CONFIG_PAGING_LEVELS >= 4
2216 case SH_type_l1_64_shadow:
2217 case SH_type_fl1_64_shadow:
2218 SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 4)(v, smfn);
2219 break;
2220 case SH_type_l2h_64_shadow:
2221 ASSERT(is_pv_32on64_vcpu(v));
2222 /* Fall through... */
2223 case SH_type_l2_64_shadow:
2224 SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 4)(v, smfn);
2225 break;
2226 case SH_type_l3_64_shadow:
2227 SHADOW_INTERNAL_NAME(sh_destroy_l3_shadow, 4)(v, smfn);
2228 break;
2229 case SH_type_l4_64_shadow:
2230 SHADOW_INTERNAL_NAME(sh_destroy_l4_shadow, 4)(v, smfn);
2231 break;
2232 #endif
2233 default:
2234 SHADOW_ERROR("tried to destroy shadow of bad type %08lx\n",
2235 (unsigned long)t);
2236 BUG();
2240 static inline void trace_shadow_wrmap_bf(mfn_t gmfn)
2242 if ( tb_init_done )
2244 /* Convert gmfn to gfn */
2245 unsigned long gfn = mfn_to_gfn(current->domain, gmfn);
2246 __trace_var(TRC_SHADOW_WRMAP_BF, 0/*!tsc*/, sizeof(gfn), (unsigned char*)&gfn);
2250 /**************************************************************************/
2251 /* Remove all writeable mappings of a guest frame from the shadow tables
2252 * Returns non-zero if we need to flush TLBs.
2253 * level and fault_addr describe how we found this to be a pagetable;
2254 * level==0 means we have some other reason for revoking write access.
2255 * If level==0 we are allowed to fail, returning -1. */
2257 int sh_remove_write_access(struct vcpu *v, mfn_t gmfn,
2258 unsigned int level,
2259 unsigned long fault_addr)
2261 /* Dispatch table for getting per-type functions */
2262 static hash_callback_t callbacks[SH_type_unused] = {
2263 NULL, /* none */
2264 SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1, 2), /* l1_32 */
2265 SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1, 2), /* fl1_32 */
2266 NULL, /* l2_32 */
2267 SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1, 3), /* l1_pae */
2268 SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1, 3), /* fl1_pae */
2269 NULL, /* l2_pae */
2270 NULL, /* l2h_pae */
2271 #if CONFIG_PAGING_LEVELS >= 4
2272 SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1, 4), /* l1_64 */
2273 SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1, 4), /* fl1_64 */
2274 #else
2275 NULL, /* l1_64 */
2276 NULL, /* fl1_64 */
2277 #endif
2278 NULL, /* l2_64 */
2279 NULL, /* l2h_64 */
2280 NULL, /* l3_64 */
2281 NULL, /* l4_64 */
2282 NULL, /* p2m */
2283 NULL /* unused */
2284 };
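/* Writable mappings of a guest frame can live only in (f)l1 shadow entries,
 * so those are the only shadow types that need a callback or a mask bit */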
2286 static unsigned int callback_mask =
2287 1 << SH_type_l1_32_shadow
2288 | 1 << SH_type_fl1_32_shadow
2289 | 1 << SH_type_l1_pae_shadow
2290 | 1 << SH_type_fl1_pae_shadow
2291 | 1 << SH_type_l1_64_shadow
2292 | 1 << SH_type_fl1_64_shadow
2294 struct page_info *pg = mfn_to_page(gmfn);
2296 ASSERT(shadow_locked_by_me(v->domain));
2298 /* Only remove writable mappings if we are doing shadow refcounts.
2299 * In guest refcounting, we trust Xen to already be restricting
2300 * all the writes to the guest page tables, so we do not need to
2301 * do more. */
2302 if ( !shadow_mode_refcounts(v->domain) )
2303 return 0;
2305 /* Early exit if it's already a pagetable, or otherwise not writeable */
2306 if ( (sh_mfn_is_a_page_table(gmfn)
2307 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
2308 /* Unless they've been allowed to go out of sync with their shadows */
2309 && !mfn_oos_may_write(gmfn)
2310 #endif
2312 || (pg->u.inuse.type_info & PGT_count_mask) == 0 )
2313 return 0;
2315 TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_WRMAP);
2317 perfc_incr(shadow_writeable);
2319 /* If this isn't a "normal" writeable page, the domain is trying to
2320 * put pagetables in special memory of some kind. We can't allow that. */
2321 if ( (pg->u.inuse.type_info & PGT_type_mask) != PGT_writable_page )
2323 SHADOW_ERROR("can't remove write access to mfn %lx, type_info is %"
2324 PRtype_info "\n",
2325 mfn_x(gmfn), mfn_to_page(gmfn)->u.inuse.type_info);
2326 domain_crash(v->domain);
2329 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
2330 if ( v == current )
2332 unsigned long gfn;
2333 /* Heuristic: there is likely to be only one writeable mapping,
2334 * and that mapping is likely to be in the current pagetable,
2335 * in the guest's linear map (on non-HIGHPTE linux and windows)*/
2337 #define GUESS(_a, _h) do { \
2338 if ( v->arch.paging.mode->shadow.guess_wrmap(v, (_a), gmfn) ) \
2339 perfc_incr(shadow_writeable_h_ ## _h); \
2340 if ( (pg->u.inuse.type_info & PGT_count_mask) == 0 ) \
2341 { \
2342 TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_WRMAP_GUESS_FOUND); \
2343 return 1; \
2344 } \
2345 } while (0)
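/* Each GUESS() probes one candidate virtual address: if guess_wrmap() finds
 * and removes a writable mapping of gmfn there and the writable refcount
 * has dropped to zero, we are done and can return straight away */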
2347 if ( level == 0 && fault_addr )
2348 GUESS(fault_addr, 6);
2350 if ( v->arch.paging.mode->guest_levels == 2 )
2352 if ( level == 1 )
2353 /* 32bit non-PAE w2k3: linear map at 0xC0000000 */
2354 GUESS(0xC0000000UL + (fault_addr >> 10), 1);
2356 /* Linux lowmem: first 896MB is mapped 1-to-1 above 0xC0000000 */
2357 if ((gfn = mfn_to_gfn(v->domain, gmfn)) < 0x38000 )
2358 GUESS(0xC0000000UL + (gfn << PAGE_SHIFT), 4);
2361 else if ( v->arch.paging.mode->guest_levels == 3 )
2363 /* 32bit PAE w2k3: linear map at 0xC0000000 */
2364 switch ( level )
2366 case 1: GUESS(0xC0000000UL + (fault_addr >> 9), 2); break;
2367 case 2: GUESS(0xC0600000UL + (fault_addr >> 18), 2); break;
2370 /* Linux lowmem: first 896MB is mapped 1-to-1 above 0xC0000000 */
2371 if ((gfn = mfn_to_gfn(v->domain, gmfn)) < 0x38000 )
2372 GUESS(0xC0000000UL + (gfn << PAGE_SHIFT), 4);
2374 #if CONFIG_PAGING_LEVELS >= 4
2375 else if ( v->arch.paging.mode->guest_levels == 4 )
2377 /* 64bit w2k3: linear map at 0xfffff68000000000 */
2378 switch ( level )
2380 case 1: GUESS(0xfffff68000000000UL
2381 + ((fault_addr & VADDR_MASK) >> 9), 3); break;
2382 case 2: GUESS(0xfffff6fb40000000UL
2383 + ((fault_addr & VADDR_MASK) >> 18), 3); break;
2384 case 3: GUESS(0xfffff6fb7da00000UL
2385 + ((fault_addr & VADDR_MASK) >> 27), 3); break;
2388 /* 64bit Linux direct map at 0xffff880000000000; older kernels
2389 * had it at 0xffff810000000000, and older kernels yet had it
2390 * at 0x0000010000000000UL */
2391 gfn = mfn_to_gfn(v->domain, gmfn);
2392 GUESS(0xffff880000000000UL + (gfn << PAGE_SHIFT), 4);
2393 GUESS(0xffff810000000000UL + (gfn << PAGE_SHIFT), 4);
2394 GUESS(0x0000010000000000UL + (gfn << PAGE_SHIFT), 4);
2395 /*
2396 * 64bit Solaris kernel page map at
2397 * kpm_vbase; 0xfffffe0000000000UL
2398 */
2399 GUESS(0xfffffe0000000000UL + (gfn << PAGE_SHIFT), 4);
2401 #endif /* CONFIG_PAGING_LEVELS >= 4 */
2403 #undef GUESS
2406 if ( (pg->u.inuse.type_info & PGT_count_mask) == 0 )
2407 return 1;
2409 /* Second heuristic: on HIGHPTE linux, there are two particular PTEs
2410 * (entries in the fixmap) where linux maps its pagetables. Since
2411 * we expect to hit them most of the time, we start the search for
2412 * the writeable mapping by looking at the same MFN where the last
2413 * brute-force search succeeded. */
2415 if ( v->arch.paging.shadow.last_writeable_pte_smfn != 0 )
2417 unsigned long old_count = (pg->u.inuse.type_info & PGT_count_mask);
2418 mfn_t last_smfn = _mfn(v->arch.paging.shadow.last_writeable_pte_smfn);
2419 int shtype = mfn_to_shadow_page(last_smfn)->type;
2421 if ( callbacks[shtype] )
2422 callbacks[shtype](v, last_smfn, gmfn);
2424 if ( (pg->u.inuse.type_info & PGT_count_mask) != old_count )
2425 perfc_incr(shadow_writeable_h_5);
2428 if ( (pg->u.inuse.type_info & PGT_count_mask) == 0 )
2429 return 1;
2431 #endif /* SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC */
2433 /* Brute-force search of all the shadows, by walking the hash */
2434 trace_shadow_wrmap_bf(gmfn);
2435 if ( level == 0 )
2436 perfc_incr(shadow_writeable_bf_1);
2437 else
2438 perfc_incr(shadow_writeable_bf);
2439 hash_foreach(v, callback_mask, callbacks, gmfn);
2441 /* If that didn't catch the mapping, then there's some non-pagetable
2442 * mapping -- ioreq page, grant mapping, &c. */
2443 if ( (mfn_to_page(gmfn)->u.inuse.type_info & PGT_count_mask) != 0 )
2445 if ( level == 0 )
2446 return -1;
2448 SHADOW_ERROR("can't remove write access to mfn %lx: guest has "
2449 "%lu special-use mappings of it\n", mfn_x(gmfn),
2450 (mfn_to_page(gmfn)->u.inuse.type_info&PGT_count_mask));
2451 domain_crash(v->domain);
2454 /* We killed at least one writeable mapping, so must flush TLBs. */
2455 return 1;
2458 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
2459 int sh_remove_write_access_from_sl1p(struct vcpu *v, mfn_t gmfn,
2460 mfn_t smfn, unsigned long off)
2462 struct shadow_page_info *sp = mfn_to_shadow_page(smfn);
2464 ASSERT(mfn_valid(smfn));
2465 ASSERT(mfn_valid(gmfn));
2467 if ( sp->type == SH_type_l1_32_shadow )
2469 return SHADOW_INTERNAL_NAME(sh_rm_write_access_from_sl1p,2)
2470 (v, gmfn, smfn, off);
2472 #if CONFIG_PAGING_LEVELS >= 3
2473 else if ( sp->type == SH_type_l1_pae_shadow )
2474 return SHADOW_INTERNAL_NAME(sh_rm_write_access_from_sl1p,3)
2475 (v, gmfn, smfn, off);
2476 #if CONFIG_PAGING_LEVELS >= 4
2477 else if ( sp->type == SH_type_l1_64_shadow )
2478 return SHADOW_INTERNAL_NAME(sh_rm_write_access_from_sl1p,4)
2479 (v, gmfn, smfn, off);
2480 #endif
2481 #endif
2483 return 0;
2485 #endif
2487 /**************************************************************************/
2488 /* Remove all mappings of a guest frame from the shadow tables.
2489 * Returns non-zero if we need to flush TLBs. */
2491 int sh_remove_all_mappings(struct vcpu *v, mfn_t gmfn)
2493 struct page_info *page = mfn_to_page(gmfn);
2494 int expected_count, do_locking;
2496 /* Dispatch table for getting per-type functions */
2497 static hash_callback_t callbacks[SH_type_unused] = {
2498 NULL, /* none */
2499 SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1, 2), /* l1_32 */
2500 SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1, 2), /* fl1_32 */
2501 NULL, /* l2_32 */
2502 SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1, 3), /* l1_pae */
2503 SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1, 3), /* fl1_pae */
2504 NULL, /* l2_pae */
2505 NULL, /* l2h_pae */
2506 #if CONFIG_PAGING_LEVELS >= 4
2507 SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1, 4), /* l1_64 */
2508 SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1, 4), /* fl1_64 */
2509 #else
2510 NULL, /* l1_64 */
2511 NULL, /* fl1_64 */
2512 #endif
2513 NULL, /* l2_64 */
2514 NULL, /* l2h_64 */
2515 NULL, /* l3_64 */
2516 NULL, /* l4_64 */
2517 NULL, /* p2m */
2518 NULL /* unused */
2519 };
2521 static unsigned int callback_mask =
2522 1 << SH_type_l1_32_shadow
2523 | 1 << SH_type_fl1_32_shadow
2524 | 1 << SH_type_l1_pae_shadow
2525 | 1 << SH_type_fl1_pae_shadow
2526 | 1 << SH_type_l1_64_shadow
2527 | 1 << SH_type_fl1_64_shadow
2530 perfc_incr(shadow_mappings);
2531 if ( (page->count_info & PGC_count_mask) == 0 )
2532 return 0;
2534 /* Although this is an externally visible function, we do not know
2535 * whether the shadow lock will be held when it is called (since it
2536 * can be called via put_page_type when we clear a shadow l1e).
2537 * If the lock isn't held, take it for the duration of the call. */
2538 do_locking = !shadow_locked_by_me(v->domain);
2539 if ( do_locking ) shadow_lock(v->domain);
2541 /* XXX TODO:
2542 * Heuristics for finding the (probably) single mapping of this gmfn */
2544 /* Brute-force search of all the shadows, by walking the hash */
2545 perfc_incr(shadow_mappings_bf);
2546 hash_foreach(v, callback_mask, callbacks, gmfn);
2548 /* If that didn't catch the mapping, something is very wrong */
2549 expected_count = (page->count_info & PGC_allocated) ? 1 : 0;
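/* A page still allocated to the guest keeps exactly one general reference
 * (for the allocation itself) once all other mappings are gone */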
2550 if ( (page->count_info & PGC_count_mask) != expected_count )
2552 /* Don't complain if we're in HVM and there are some extra mappings:
2553 * The qemu helper process has an untyped mapping of this dom's RAM
2554 * and the HVM restore program takes another. */
2555 if ( !(shadow_mode_external(v->domain)
2556 && (page->count_info & PGC_count_mask) <= 3
2557 && (page->u.inuse.type_info & PGT_count_mask) == 0) )
2559 SHADOW_ERROR("can't find all mappings of mfn %lx: "
2560 "c=%08x t=%08lx\n", mfn_x(gmfn),
2561 page->count_info, page->u.inuse.type_info);
2565 if ( do_locking ) shadow_unlock(v->domain);
2567 /* We killed at least one mapping, so must flush TLBs. */
2568 return 1;
2572 /**************************************************************************/
2573 /* Remove all shadows of a guest frame from the shadow tables */
2575 static int sh_remove_shadow_via_pointer(struct vcpu *v, mfn_t smfn)
2576 /* Follow this shadow's up-pointer, if it has one, and remove the reference
2577 * found there. Returns 1 if that was the only reference to this shadow */
2579 struct shadow_page_info *sp = mfn_to_shadow_page(smfn);
2580 mfn_t pmfn;
2581 void *vaddr;
2582 int rc;
2584 ASSERT(sp->type > 0);
2585 ASSERT(sp->type < SH_type_max_shadow);
2586 ASSERT(sp->type != SH_type_l2_32_shadow);
2587 ASSERT(sp->type != SH_type_l2_pae_shadow);
2588 ASSERT(sp->type != SH_type_l2h_pae_shadow);
2589 ASSERT(sp->type != SH_type_l4_64_shadow);
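/* sp->up packs the location of the single parent entry that references this
 * shadow: the parent shadow's MFN in the upper bits and the byte offset of
 * the entry within that page in the low bits */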
2591 if (sp->up == 0) return 0;
2592 pmfn = _mfn(sp->up >> PAGE_SHIFT);
2593 ASSERT(mfn_valid(pmfn));
2594 vaddr = sh_map_domain_page(pmfn);
2595 ASSERT(vaddr);
2596 vaddr += sp->up & (PAGE_SIZE-1);
2597 ASSERT(l1e_get_pfn(*(l1_pgentry_t *)vaddr) == mfn_x(smfn));
2599 /* Is this the only reference to this shadow? */
2600 rc = (sp->count == 1) ? 1 : 0;
2602 /* Blank the offending entry */
2603 switch (sp->type)
2605 case SH_type_l1_32_shadow:
2606 case SH_type_l2_32_shadow:
2607 SHADOW_INTERNAL_NAME(sh_clear_shadow_entry, 2)(v, vaddr, pmfn);
2608 break;
2609 case SH_type_l1_pae_shadow:
2610 case SH_type_l2_pae_shadow:
2611 case SH_type_l2h_pae_shadow:
2612 SHADOW_INTERNAL_NAME(sh_clear_shadow_entry, 3)(v, vaddr, pmfn);
2613 break;
2614 #if CONFIG_PAGING_LEVELS >= 4
2615 case SH_type_l1_64_shadow:
2616 case SH_type_l2_64_shadow:
2617 case SH_type_l2h_64_shadow:
2618 case SH_type_l3_64_shadow:
2619 case SH_type_l4_64_shadow:
2620 SHADOW_INTERNAL_NAME(sh_clear_shadow_entry, 4)(v, vaddr, pmfn);
2621 break;
2622 #endif
2623 default: BUG(); /* Some weird unknown shadow type */
2626 sh_unmap_domain_page(vaddr);
2627 if ( rc )
2628 perfc_incr(shadow_up_pointer);
2629 else
2630 perfc_incr(shadow_unshadow_bf);
2632 return rc;
2635 void sh_remove_shadows(struct vcpu *v, mfn_t gmfn, int fast, int all)
2636 /* Remove the shadows of this guest page.
2637 * If fast != 0, just try the quick heuristic, which will remove
2638 * at most one reference to each shadow of the page. Otherwise, walk
2639 * all the shadow tables looking for refs to shadows of this gmfn.
2640 * If all != 0, kill the domain if we can't find all the shadows.
2641 * (all != 0 implies fast == 0)
2642 */
2644 struct page_info *pg = mfn_to_page(gmfn);
2645 mfn_t smfn;
2646 int do_locking;
2647 unsigned char t;
2649 /* Dispatch table for getting per-type functions: each level must
2650 * be called with the function to remove a lower-level shadow. */
2651 static hash_callback_t callbacks[SH_type_unused] = {
2652 NULL, /* none */
2653 NULL, /* l1_32 */
2654 NULL, /* fl1_32 */
2655 SHADOW_INTERNAL_NAME(sh_remove_l1_shadow, 2), /* l2_32 */
2656 NULL, /* l1_pae */
2657 NULL, /* fl1_pae */
2658 SHADOW_INTERNAL_NAME(sh_remove_l1_shadow, 3), /* l2_pae */
2659 SHADOW_INTERNAL_NAME(sh_remove_l1_shadow, 3), /* l2h_pae */
2660 NULL, /* l1_64 */
2661 NULL, /* fl1_64 */
2662 #if CONFIG_PAGING_LEVELS >= 4
2663 SHADOW_INTERNAL_NAME(sh_remove_l1_shadow, 4), /* l2_64 */
2664 SHADOW_INTERNAL_NAME(sh_remove_l1_shadow, 4), /* l2h_64 */
2665 SHADOW_INTERNAL_NAME(sh_remove_l2_shadow, 4), /* l3_64 */
2666 SHADOW_INTERNAL_NAME(sh_remove_l3_shadow, 4), /* l4_64 */
2667 #else
2668 NULL, /* l2_64 */
2669 NULL, /* l2h_64 */
2670 NULL, /* l3_64 */
2671 NULL, /* l4_64 */
2672 #endif
2673 NULL, /* p2m */
2674 NULL /* unused */
2675 };
2677 /* Another lookup table, for choosing which mask to use */
2678 static unsigned int masks[SH_type_unused] = {
2679 0, /* none */
2680 1 << SH_type_l2_32_shadow, /* l1_32 */
2681 0, /* fl1_32 */
2682 0, /* l2_32 */
2683 ((1 << SH_type_l2h_pae_shadow)
2684 | (1 << SH_type_l2_pae_shadow)), /* l1_pae */
2685 0, /* fl1_pae */
2686 0, /* l2_pae */
2687 0, /* l2h_pae */
2688 ((1 << SH_type_l2h_64_shadow)
2689 | (1 << SH_type_l2_64_shadow)), /* l1_64 */
2690 0, /* fl1_64 */
2691 1 << SH_type_l3_64_shadow, /* l2_64 */
2692 1 << SH_type_l3_64_shadow, /* l2h_64 */
2693 1 << SH_type_l4_64_shadow, /* l3_64 */
2694 0, /* l4_64 */
2695 0, /* p2m */
2696 0 /* unused */
2697 };
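/* masks[t] is the set of higher-level shadow types whose entries may
 * reference a type-t shadow, i.e. the types hash_foreach() must scan when
 * excising that shadow from its parents */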
2699 ASSERT(!(all && fast));
2701 /* Although this is an externally visible function, we do not know
2702 * whether the shadow lock will be held when it is called (since it
2703 * can be called via put_page_type when we clear a shadow l1e).
2704 * If the lock isn't held, take it for the duration of the call. */
2705 do_locking = !shadow_locked_by_me(v->domain);
2706 if ( do_locking ) shadow_lock(v->domain);
2708 SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx\n",
2709 v->domain->domain_id, v->vcpu_id, mfn_x(gmfn));
2711 /* Bail out now if the page is not shadowed */
2712 if ( (pg->count_info & PGC_page_table) == 0 )
2714 if ( do_locking ) shadow_unlock(v->domain);
2715 return;
2718 /* Search for this shadow in all appropriate shadows */
2719 perfc_incr(shadow_unshadow);
2721 /* Lower-level shadows need to be excised from upper-level shadows.
2722 * This call to hash_foreach() looks dangerous but is in fact OK: each
2723 * call will remove at most one shadow, and terminate immediately when
2724 * it does remove it, so we never walk the hash after doing a deletion. */
2725 #define DO_UNSHADOW(_type) do { \
2726 t = (_type); \
2727 if( !(pg->count_info & PGC_page_table) \
2728 || !(pg->shadow_flags & (1 << t)) ) \
2729 break; \
2730 smfn = shadow_hash_lookup(v, mfn_x(gmfn), t); \
2731 if ( unlikely(!mfn_valid(smfn)) ) \
2732 { \
2733 SHADOW_ERROR(": gmfn %#lx has flags 0x%"PRIx32 \
2734 " but no type-0x%"PRIx32" shadow\n", \
2735 mfn_x(gmfn), (uint32_t)pg->shadow_flags, t); \
2736 break; \
2737 } \
2738 if ( sh_type_is_pinnable(v, t) ) \
2739 sh_unpin(v, smfn); \
2740 else \
2741 sh_remove_shadow_via_pointer(v, smfn); \
2742 if( !fast \
2743 && (pg->count_info & PGC_page_table) \
2744 && (pg->shadow_flags & (1 << t)) ) \
2745 hash_foreach(v, masks[t], callbacks, smfn); \
2746 } while (0)
2748 DO_UNSHADOW(SH_type_l2_32_shadow);
2749 DO_UNSHADOW(SH_type_l1_32_shadow);
2750 DO_UNSHADOW(SH_type_l2h_pae_shadow);
2751 DO_UNSHADOW(SH_type_l2_pae_shadow);
2752 DO_UNSHADOW(SH_type_l1_pae_shadow);
2753 #if CONFIG_PAGING_LEVELS >= 4
2754 DO_UNSHADOW(SH_type_l4_64_shadow);
2755 DO_UNSHADOW(SH_type_l3_64_shadow);
2756 DO_UNSHADOW(SH_type_l2h_64_shadow);
2757 DO_UNSHADOW(SH_type_l2_64_shadow);
2758 DO_UNSHADOW(SH_type_l1_64_shadow);
2759 #endif
2761 #undef DO_UNSHADOW
2763 /* If that didn't catch the shadows, something is wrong */
2764 if ( !fast && all && (pg->count_info & PGC_page_table) )
2766 SHADOW_ERROR("can't find all shadows of mfn %05lx "
2767 "(shadow_flags=%08x)\n",
2768 mfn_x(gmfn), pg->shadow_flags);
2769 domain_crash(v->domain);
2772 /* Need to flush TLBs now, so that linear maps are safe next time we
2773 * take a fault. */
2774 flush_tlb_mask(v->domain->domain_dirty_cpumask);
2776 if ( do_locking ) shadow_unlock(v->domain);
2779 static void
2780 sh_remove_all_shadows_and_parents(struct vcpu *v, mfn_t gmfn)
2781 /* Even harsher: this is an HVM page that we think is no longer a pagetable.
2782 * Unshadow it, and recursively unshadow pages that reference it. */
2784 sh_remove_shadows(v, gmfn, 0, 1);
2785 /* XXX TODO:
2786 * Rework this hashtable walker to return a linked-list of all
2787 * the shadows it modified, then do breadth-first recursion
2788 * to find the way up to higher-level tables and unshadow them too.
2790 * The current code (just tearing down each page's shadows as we
2791 * detect that it is not a pagetable) is correct, but very slow.
2792 * It means extra emulated writes and slows down removal of mappings. */
2795 /**************************************************************************/
2797 static void sh_update_paging_modes(struct vcpu *v)
2799 struct domain *d = v->domain;
2800 struct paging_mode *old_mode = v->arch.paging.mode;
2802 ASSERT(shadow_locked_by_me(d));
2804 #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
2805 /* Make sure this vcpu has a virtual TLB array allocated */
2806 if ( unlikely(!v->arch.paging.vtlb) )
2808 v->arch.paging.vtlb = xmalloc_array(struct shadow_vtlb, VTLB_ENTRIES);
2809 if ( unlikely(!v->arch.paging.vtlb) )
2811 SHADOW_ERROR("Could not allocate vTLB space for dom %u vcpu %u\n",
2812 d->domain_id, v->vcpu_id);
2813 domain_crash(v->domain);
2814 return;
2816 memset(v->arch.paging.vtlb, 0,
2817 VTLB_ENTRIES * sizeof (struct shadow_vtlb));
2818 spin_lock_init(&v->arch.paging.vtlb_lock);
2820 #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */
2822 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
2823 if ( mfn_x(v->arch.paging.shadow.oos_snapshot[0]) == INVALID_MFN )
2825 int i;
2826 for(i = 0; i < SHADOW_OOS_PAGES; i++)
2828 shadow_prealloc(d, SH_type_oos_snapshot, 1);
2829 v->arch.paging.shadow.oos_snapshot[i] =
2830 shadow_alloc(d, SH_type_oos_snapshot, 0);
2833 #endif /* OOS */
2835 // Valid transitions handled by this function:
2836 // - For PV guests:
2837 // - after a shadow mode has been changed
2838 // - For HVM guests:
2839 // - after a shadow mode has been changed
2840 // - changes in CR0.PG, CR4.PAE, CR4.PSE, or CR4.PGE
2841 //
2843 // First, tear down any old shadow tables held by this vcpu.
2844 //
2845 if ( v->arch.paging.mode )
2846 v->arch.paging.mode->shadow.detach_old_tables(v);
2848 if ( !is_hvm_domain(d) )
2850 ///
2851 /// PV guest
2852 ///
2853 #if CONFIG_PAGING_LEVELS == 4
2854 v->arch.paging.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode, 4);
2855 #else /* CONFIG_PAGING_LEVELS == 3 */
2856 v->arch.paging.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode, 3);
2857 #endif
2859 else
2861 ///
2862 /// HVM guest
2863 ///
2864 ASSERT(shadow_mode_translate(d));
2865 ASSERT(shadow_mode_external(d));
2867 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
2868 /* Need to resync all our pages now, because if a page goes out
2869 * of sync with paging enabled and is resynced with paging
2870 * disabled, the resync will go wrong. */
2871 shadow_resync_all(v, 0);
2872 #endif /* OOS */
2874 if ( !hvm_paging_enabled(v) )
2876 /* When the guest has CR0.PG clear, we provide a 32-bit, non-PAE
2877 * pagetable for it, mapping 4 GB one-to-one using a single l2
2878 * page of 1024 superpage mappings */
2879 v->arch.guest_table = d->arch.paging.shadow.unpaged_pagetable;
2880 v->arch.paging.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode, 2);
2882 else
2884 #ifdef __x86_64__
2885 if ( hvm_long_mode_enabled(v) )
2887 // long mode guest...
2888 v->arch.paging.mode =
2889 &SHADOW_INTERNAL_NAME(sh_paging_mode, 4);
2891 else
2892 #endif
2893 if ( hvm_pae_enabled(v) )
2895 // 32-bit PAE mode guest...
2896 v->arch.paging.mode =
2897 &SHADOW_INTERNAL_NAME(sh_paging_mode, 3);
2899 else
2901 // 32-bit 2 level guest...
2902 v->arch.paging.mode =
2903 &SHADOW_INTERNAL_NAME(sh_paging_mode, 2);
2907 if ( pagetable_is_null(v->arch.monitor_table) )
2909 mfn_t mmfn = v->arch.paging.mode->shadow.make_monitor_table(v);
2910 v->arch.monitor_table = pagetable_from_mfn(mmfn);
2911 make_cr3(v, mfn_x(mmfn));
2912 hvm_update_host_cr3(v);
2915 if ( v->arch.paging.mode != old_mode )
2917 SHADOW_PRINTK("new paging mode: d=%u v=%u pe=%d gl=%u "
2918 "(was g=%u s=%u)\n",
2919 d->domain_id, v->vcpu_id,
2920 is_hvm_domain(d) ? hvm_paging_enabled(v) : 1,
2921 v->arch.paging.mode->guest_levels,
2922 v->arch.paging.mode->shadow.shadow_levels,
2923 old_mode ? old_mode->guest_levels : 0,
2924 old_mode ? old_mode->shadow.shadow_levels : 0);
2925 if ( old_mode &&
2926 (v->arch.paging.mode->shadow.shadow_levels !=
2927 old_mode->shadow.shadow_levels) )
2929 /* Need to make a new monitor table for the new mode */
2930 mfn_t new_mfn, old_mfn;
2932 if ( v != current && vcpu_runnable(v) )
2934 SHADOW_ERROR("Some third party (d=%u v=%u) is changing "
2935 "this HVM vcpu's (d=%u v=%u) paging mode "
2936 "while it is running.\n",
2937 current->domain->domain_id, current->vcpu_id,
2938 v->domain->domain_id, v->vcpu_id);
2939 /* It's not safe to do that because we can't change
2940 * the host CR3 for a running domain */
2941 domain_crash(v->domain);
2942 return;
2945 old_mfn = pagetable_get_mfn(v->arch.monitor_table);
2946 v->arch.monitor_table = pagetable_null();
2947 new_mfn = v->arch.paging.mode->shadow.make_monitor_table(v);
2948 v->arch.monitor_table = pagetable_from_mfn(new_mfn);
2949 SHADOW_PRINTK("new monitor table %"PRI_mfn "\n",
2950 mfn_x(new_mfn));
2952 /* Don't be running on the old monitor table when we
2953 * pull it down! Switch CR3, and warn the HVM code that
2954 * its host cr3 has changed. */
2955 make_cr3(v, mfn_x(new_mfn));
2956 if ( v == current )
2957 write_ptbase(v);
2958 hvm_update_host_cr3(v);
2959 old_mode->shadow.destroy_monitor_table(v, old_mfn);
2963 // XXX -- Need to deal with changes in CR4.PSE and CR4.PGE.
2964 // These are HARD: think about the case where two CPU's have
2965 // different values for CR4.PSE and CR4.PGE at the same time.
2966 // This *does* happen, at least for CR4.PGE...
2969 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
2970 /* We need to check that all the vcpus have paging enabled to
2971 * unsync PTs. */
2972 if ( is_hvm_domain(d) )
2974 int pe = 1;
2975 struct vcpu *vptr;
2977 for_each_vcpu(d, vptr)
2979 if ( !hvm_paging_enabled(vptr) )
2981 pe = 0;
2982 break;
2986 d->arch.paging.shadow.oos_active = pe;
2988 #endif /* OOS */
2990 v->arch.paging.mode->update_cr3(v, 0);
2993 void shadow_update_paging_modes(struct vcpu *v)
2995 shadow_lock(v->domain);
2996 sh_update_paging_modes(v);
2997 shadow_unlock(v->domain);
3000 /**************************************************************************/
3001 /* Turning on and off shadow features */
3003 static void sh_new_mode(struct domain *d, u32 new_mode)
3004 /* Inform all the vcpus that the shadow mode has been changed */
3006 struct vcpu *v;
3008 ASSERT(shadow_locked_by_me(d));
3009 ASSERT(d != current->domain);
3010 d->arch.paging.mode = new_mode;
3011 for_each_vcpu(d, v)
3012 sh_update_paging_modes(v);
3015 int shadow_enable(struct domain *d, u32 mode)
3016 /* Turn on "permanent" shadow features: external, translate, refcount.
3017 * Can only be called once on a domain, and these features cannot be
3018 * disabled.
3019 * Returns 0 for success, -errno for failure. */
3021 unsigned int old_pages;
3022 struct page_info *pg = NULL;
3023 uint32_t *e;
3024 int i, rv = 0;
3026 mode |= PG_SH_enable;
3028 domain_pause(d);
3030 /* Sanity check the arguments */
3031 if ( (d == current->domain) ||
3032 shadow_mode_enabled(d) ||
3033 ((mode & PG_translate) && !(mode & PG_refcounts)) ||
3034 ((mode & PG_external) && !(mode & PG_translate)) )
3036 rv = -EINVAL;
3037 goto out_unlocked;
3040 /* Init the shadow memory allocation if the user hasn't done so */
3041 old_pages = d->arch.paging.shadow.total_pages;
3042 if ( old_pages == 0 )
3044 unsigned int r;
3045 shadow_lock(d);
3046 r = sh_set_allocation(d, 256, NULL); /* Use at least 1MB */
3047 if ( r != 0 )
3049 sh_set_allocation(d, 0, NULL);
3050 rv = -ENOMEM;
3051 goto out_locked;
3053 shadow_unlock(d);
3056 /* Init the P2M table. Must be done before we take the shadow lock
3057 * to avoid possible deadlock. */
3058 if ( mode & PG_translate )
3060 rv = p2m_alloc_table(d, shadow_alloc_p2m_page, shadow_free_p2m_page);
3061 if (rv != 0)
3062 goto out_unlocked;
3065 /* HVM domains need an extra pagetable for vcpus that think they
3066 * have paging disabled */
3067 if ( is_hvm_domain(d) )
3069 /* Get a single page from the shadow pool. Take it via the
3070 * P2M interface to make freeing it simpler afterwards. */
3071 pg = shadow_alloc_p2m_page(d);
3072 if ( pg == NULL )
3074 rv = -ENOMEM;
3075 goto out_unlocked;
3077 /* Fill it with 32-bit, non-PAE superpage entries, each mapping 4MB
3078 * of virtual address space onto the same physical address range */
3079 e = sh_map_domain_page(page_to_mfn(pg));
3080 for ( i = 0; i < PAGE_SIZE / sizeof(*e); i++ )
3081 e[i] = ((0x400000U * i)
3082 | _PAGE_PRESENT | _PAGE_RW | _PAGE_USER
3083 | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
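/* For example, with the usual x86 flag values entry 1 becomes 0x4000e7: a
 * present, writable, user, accessed, dirty 4MB superpage mapping guest
 * physical addresses 4MB-8MB onto themselves */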
3084 sh_unmap_domain_page(e);
3085 pg->u.inuse.type_info = PGT_l2_page_table | 1 | PGT_validated;
3088 shadow_lock(d);
3090 /* Sanity check again with the lock held */
3091 if ( shadow_mode_enabled(d) )
3093 rv = -EINVAL;
3094 goto out_locked;
3097 /* Init the hash table */
3098 if ( shadow_hash_alloc(d) != 0 )
3100 rv = -ENOMEM;
3101 goto out_locked;
3104 #if (SHADOW_OPTIMIZATIONS & SHOPT_LINUX_L3_TOPLEVEL)
3105 /* We assume we're dealing with an older 64bit linux guest until we
3106 * see the guest use more than one l4 per vcpu. */
3107 d->arch.paging.shadow.opt_flags = SHOPT_LINUX_L3_TOPLEVEL;
3108 #endif
3110 /* Record the 1-to-1 pagetable we just made */
3111 if ( is_hvm_domain(d) )
3112 d->arch.paging.shadow.unpaged_pagetable = pagetable_from_page(pg);
3114 /* Update the bits */
3115 sh_new_mode(d, mode);
3117 out_locked:
3118 shadow_unlock(d);
3119 out_unlocked:
3120 if ( rv != 0 && !pagetable_is_null(d->arch.phys_table) )
3121 p2m_teardown(d);
3122 if ( rv != 0 && pg != NULL )
3123 shadow_free_p2m_page(d, pg);
3124 domain_unpause(d);
3125 return rv;
3128 void shadow_teardown(struct domain *d)
3129 /* Destroy the shadow pagetables of this domain and free its shadow memory.
3130 * Should only be called for dying domains. */
3132 struct vcpu *v;
3133 mfn_t mfn;
3134 struct list_head *entry, *n;
3135 struct page_info *pg;
3137 ASSERT(d->is_dying);
3138 ASSERT(d != current->domain);
3140 if ( !shadow_locked_by_me(d) )
3141 shadow_lock(d); /* Keep various asserts happy */
3143 if ( shadow_mode_enabled(d) )
3145 /* Release the shadow and monitor tables held by each vcpu */
3146 for_each_vcpu(d, v)
3148 if ( v->arch.paging.mode )
3150 v->arch.paging.mode->shadow.detach_old_tables(v);
3151 if ( shadow_mode_external(d) )
3153 mfn = pagetable_get_mfn(v->arch.monitor_table);
3154 if ( mfn_valid(mfn) && (mfn_x(mfn) != 0) )
3155 v->arch.paging.mode->shadow.destroy_monitor_table(v, mfn);
3156 v->arch.monitor_table = pagetable_null();
3162 #if (SHADOW_OPTIMIZATIONS & (SHOPT_VIRTUAL_TLB|SHOPT_OUT_OF_SYNC))
3163 /* Free the virtual-TLB array attached to each vcpu */
3164 for_each_vcpu(d, v)
3166 #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
3167 if ( v->arch.paging.vtlb )
3169 xfree(v->arch.paging.vtlb);
3170 v->arch.paging.vtlb = NULL;
3172 #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */
3174 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
3176 int i;
3177 mfn_t *oos_snapshot = v->arch.paging.shadow.oos_snapshot;
3178 for(i = 0; i < SHADOW_OOS_PAGES; i++)
3179 if ( mfn_valid(oos_snapshot[i]) )
3180 shadow_free(d, oos_snapshot[i]);
3182 #endif /* OOS */
3184 #endif /* (SHADOW_OPTIMIZATIONS & (SHOPT_VIRTUAL_TLB|SHOPT_OUT_OF_SYNC)) */
3186 list_for_each_safe(entry, n, &d->arch.paging.shadow.p2m_freelist)
3188 list_del(entry);
3189 pg = list_entry(entry, struct page_info, list);
3190 shadow_free_p2m_page(d, pg);
3193 if ( d->arch.paging.shadow.total_pages != 0 )
3195 SHADOW_PRINTK("teardown of domain %u starts."
3196 " Shadow pages total = %u, free = %u, p2m=%u\n",
3197 d->domain_id,
3198 d->arch.paging.shadow.total_pages,
3199 d->arch.paging.shadow.free_pages,
3200 d->arch.paging.shadow.p2m_pages);
3201 /* Destroy all the shadows and release memory to domheap */
3202 sh_set_allocation(d, 0, NULL);
3203 /* Release the hash table back to xenheap */
3204 if (d->arch.paging.shadow.hash_table)
3205 shadow_hash_teardown(d);
3206 /* Should not have any more memory held */
3207 SHADOW_PRINTK("teardown done."
3208 " Shadow pages total = %u, free = %u, p2m=%u\n",
3209 d->arch.paging.shadow.total_pages,
3210 d->arch.paging.shadow.free_pages,
3211 d->arch.paging.shadow.p2m_pages);
3212 ASSERT(d->arch.paging.shadow.total_pages == 0);
3215 /* Free the non-paged-vcpus pagetable; must happen after we've
3216 * destroyed any shadows of it or sh_destroy_shadow will get confused. */
3217 if ( !pagetable_is_null(d->arch.paging.shadow.unpaged_pagetable) )
3219 for_each_vcpu(d, v)
3221 ASSERT(is_hvm_vcpu(v));
3222 if ( !hvm_paging_enabled(v) )
3223 v->arch.guest_table = pagetable_null();
3225 shadow_free_p2m_page(d,
3226 pagetable_get_page(d->arch.paging.shadow.unpaged_pagetable));
3227 d->arch.paging.shadow.unpaged_pagetable = pagetable_null();
3230 /* We leave the "permanent" shadow modes enabled, but clear the
3231 * log-dirty mode bit. We don't want any more mark_dirty()
3232 * calls now that we've torn down the bitmap */
3233 d->arch.paging.mode &= ~PG_log_dirty;
3235 if (d->dirty_vram) {
3236 xfree(d->dirty_vram->sl1ma);
3237 xfree(d->dirty_vram->dirty_bitmap);
3238 xfree(d->dirty_vram);
3239 d->dirty_vram = NULL;
3242 shadow_unlock(d);
3245 void shadow_final_teardown(struct domain *d)
3246 /* Called by arch_domain_destroy(), when it's safe to pull down the p2m map. */
3248 SHADOW_PRINTK("dom %u final teardown starts."
3249 " Shadow pages total = %u, free = %u, p2m=%u\n",
3250 d->domain_id,
3251 d->arch.paging.shadow.total_pages,
3252 d->arch.paging.shadow.free_pages,
3253 d->arch.paging.shadow.p2m_pages);
3255 /* Double-check that the domain didn't have any shadow memory.
3256 * It is possible for a domain that never got domain_kill()ed
3257 * to get here with its shadow allocation intact. */
3258 if ( d->arch.paging.shadow.total_pages != 0 )
3259 shadow_teardown(d);
3261 /* It is now safe to pull down the p2m map. */
3262 p2m_teardown(d);
3264 SHADOW_PRINTK("dom %u final teardown done."
3265 " Shadow pages total = %u, free = %u, p2m=%u\n",
3266 d->domain_id,
3267 d->arch.paging.shadow.total_pages,
3268 d->arch.paging.shadow.free_pages,
3269 d->arch.paging.shadow.p2m_pages);
3272 static int shadow_one_bit_enable(struct domain *d, u32 mode)
3273 /* Turn on a single shadow mode feature */
3275 ASSERT(shadow_locked_by_me(d));
3277 /* Sanity check the call */
3278 if ( d == current->domain || (d->arch.paging.mode & mode) == mode )
3280 return -EINVAL;
3283 mode |= PG_SH_enable;
3285 if ( d->arch.paging.mode == 0 )
3287 /* Init the shadow memory allocation and the hash table */
3288 if ( sh_set_allocation(d, 1, NULL) != 0
3289 || shadow_hash_alloc(d) != 0 )
3291 sh_set_allocation(d, 0, NULL);
3292 return -ENOMEM;
3296 /* Update the bits */
3297 sh_new_mode(d, d->arch.paging.mode | mode);
3299 return 0;
3302 static int shadow_one_bit_disable(struct domain *d, u32 mode)
3303 /* Turn off a single shadow mode feature */
3305 struct vcpu *v;
3306 ASSERT(shadow_locked_by_me(d));
3308 /* Sanity check the call */
3309 if ( d == current->domain || !((d->arch.paging.mode & mode) == mode) )
3311 return -EINVAL;
3314 /* Update the bits */
3315 sh_new_mode(d, d->arch.paging.mode & ~mode);
3316 if ( d->arch.paging.mode == 0 )
3318 /* Get this domain off shadows */
3319 SHADOW_PRINTK("un-shadowing of domain %u starts."
3320 " Shadow pages total = %u, free = %u, p2m=%u\n",
3321 d->domain_id,
3322 d->arch.paging.shadow.total_pages,
3323 d->arch.paging.shadow.free_pages,
3324 d->arch.paging.shadow.p2m_pages);
3325 for_each_vcpu(d, v)
3327 if ( v->arch.paging.mode )
3328 v->arch.paging.mode->shadow.detach_old_tables(v);
3329 #if CONFIG_PAGING_LEVELS == 4
3330 if ( !(v->arch.flags & TF_kernel_mode) )
3331 make_cr3(v, pagetable_get_pfn(v->arch.guest_table_user));
3332 else
3333 #endif
3334 make_cr3(v, pagetable_get_pfn(v->arch.guest_table));
3338 /* Pull down the memory allocation */
3339 if ( sh_set_allocation(d, 0, NULL) != 0 )
3341 // XXX - How can this occur?
3342 // Seems like a bug to return an error now that we've
3343 // disabled the relevant shadow mode.
3344 //
3345 return -ENOMEM;
3347 shadow_hash_teardown(d);
3348 SHADOW_PRINTK("un-shadowing of domain %u done."
3349 " Shadow pages total = %u, free = %u, p2m=%u\n",
3350 d->domain_id,
3351 d->arch.paging.shadow.total_pages,
3352 d->arch.paging.shadow.free_pages,
3353 d->arch.paging.shadow.p2m_pages);
3356 return 0;
3359 /* Enable/disable ops for the "test" and "log-dirty" modes */
3360 static int shadow_test_enable(struct domain *d)
3362 int ret;
3364 domain_pause(d);
3365 shadow_lock(d);
3366 ret = shadow_one_bit_enable(d, PG_SH_enable);
3367 shadow_unlock(d);
3368 domain_unpause(d);
3370 return ret;
3373 static int shadow_test_disable(struct domain *d)
3375 int ret;
3377 domain_pause(d);
3378 shadow_lock(d);
3379 ret = shadow_one_bit_disable(d, PG_SH_enable);
3380 shadow_unlock(d);
3381 domain_unpause(d);
3383 return ret;
3386 /**************************************************************************/
3387 /* P2M map manipulations */
3389 /* shadow specific code which should be called when P2M table entry is updated
3390 * with new content. It is responsible for updating the entry, as well as other
3391 * shadow processing jobs.
3392 */
3393 void
3394 shadow_write_p2m_entry(struct vcpu *v, unsigned long gfn,
3395 l1_pgentry_t *p, mfn_t table_mfn,
3396 l1_pgentry_t new, unsigned int level)
3398 struct domain *d = v->domain;
3400 shadow_lock(d);
3402 /* If we're removing an MFN from the p2m, remove it from the shadows too */
3403 if ( level == 1 )
3405 mfn_t mfn = _mfn(l1e_get_pfn(*p));
3406 p2m_type_t p2mt = p2m_flags_to_type(l1e_get_flags(*p));
3407 if ( p2m_is_valid(p2mt) && mfn_valid(mfn) )
3409 sh_remove_all_shadows_and_parents(v, mfn);
3410 if ( sh_remove_all_mappings(v, mfn) )
3411 flush_tlb_mask(d->domain_dirty_cpumask);
3415 /* If we're removing a superpage mapping from the p2m, we need to check
3416 * all the pages covered by it. If they're still there in the new
3417 * scheme, that's OK, but otherwise they must be unshadowed. */
3418 if ( level == 2 && (l1e_get_flags(*p) & _PAGE_PRESENT) &&
3419 (l1e_get_flags(*p) & _PAGE_PSE) )
3421 unsigned int i;
3422 cpumask_t flushmask;
3423 mfn_t omfn = _mfn(l1e_get_pfn(*p));
3424 mfn_t nmfn = _mfn(l1e_get_pfn(new));
3425 l1_pgentry_t *npte = NULL;
3426 p2m_type_t p2mt = p2m_flags_to_type(l1e_get_flags(*p));
3427 if ( p2m_is_valid(p2mt) && mfn_valid(omfn) )
3429 cpus_clear(flushmask);
3431 /* If we're replacing a superpage with a normal L1 page, map it */
3432 if ( (l1e_get_flags(new) & _PAGE_PRESENT)
3433 && !(l1e_get_flags(new) & _PAGE_PSE)
3434 && mfn_valid(nmfn) )
3435 npte = map_domain_page(mfn_x(nmfn));
3437 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
3439 if ( !npte
3440 || !p2m_is_ram(p2m_flags_to_type(l1e_get_flags(npte[i])))
3441 || l1e_get_pfn(npte[i]) != mfn_x(omfn) )
3443 /* This GFN->MFN mapping has gone away */
3444 sh_remove_all_shadows_and_parents(v, omfn);
3445 if ( sh_remove_all_mappings(v, omfn) )
3446 cpus_or(flushmask, flushmask, d->domain_dirty_cpumask);
3448 omfn = _mfn(mfn_x(omfn) + 1);
3450 flush_tlb_mask(flushmask);
3452 if ( npte )
3453 unmap_domain_page(npte);
3457 /* Update the entry with new content */
3458 safe_write_pte(p, new);
3460 /* install P2M in monitors for PAE Xen */
3461 #if CONFIG_PAGING_LEVELS == 3
3462 if ( level == 3 )
3463 /* We have written to the p2m l3: need to sync the per-vcpu
3464 * copies of it in the monitor tables */
3465 p2m_install_entry_in_monitors(d, (l3_pgentry_t *)p);
3466 #endif
3468 #if (SHADOW_OPTIMIZATIONS & SHOPT_FAST_FAULT_PATH)
3469 /* If we're doing FAST_FAULT_PATH, then shadow mode may have
3470 cached the fact that this is an mmio region in the shadow
3471 page tables. Blow the tables away to remove the cache.
3472 This is pretty heavy handed, but this is a rare operation
3473 (it might happen a dozen times during boot and then never
3474 again), so it doesn't matter too much. */
3475 if ( d->arch.paging.shadow.has_fast_mmio_entries )
3477 shadow_blow_tables(d);
3478 d->arch.paging.shadow.has_fast_mmio_entries = 0;
3480 #endif
3482 shadow_unlock(d);
3485 /**************************************************************************/
3486 /* Log-dirty mode support */
3488 /* Shadow specific code which is called in paging_log_dirty_enable().
3489 * Return 0 if no problem found.
3490 */
3491 int shadow_enable_log_dirty(struct domain *d)
3493 int ret;
3495 /* shadow lock is required here */
3496 shadow_lock(d);
3497 if ( shadow_mode_enabled(d) )
3499 /* This domain already has some shadows: need to clear them out
3500 * of the way to make sure that all references to guest memory are
3501 * properly write-protected */
3502 shadow_blow_tables(d);
3505 #if (SHADOW_OPTIMIZATIONS & SHOPT_LINUX_L3_TOPLEVEL)
3506 /* 32bit PV guests on 64bit xen behave like older 64bit linux: they
3507 * change an l4e instead of cr3 to switch tables. Give them the
3508 * same optimization */
3509 if ( is_pv_32on64_domain(d) )
3510 d->arch.paging.shadow.opt_flags = SHOPT_LINUX_L3_TOPLEVEL;
3511 #endif
3513 ret = shadow_one_bit_enable(d, PG_log_dirty);
3514 shadow_unlock(d);
3516 return ret;
3519 /* shadow specific code which is called in paging_log_dirty_disable() */
3520 int shadow_disable_log_dirty(struct domain *d)
3522 int ret;
3524 /* shadow lock is required here */
3525 shadow_lock(d);
3526 ret = shadow_one_bit_disable(d, PG_log_dirty);
3527 shadow_unlock(d);
3529 return ret;
3532 /* This function is called when we CLEAN log dirty bitmap. See
3533 * paging_log_dirty_op() for details.
3534 */
3535 void shadow_clean_dirty_bitmap(struct domain *d)
3537 shadow_lock(d);
3538 /* Need to revoke write access to the domain's pages again.
3539 * In future, we'll have a less heavy-handed approach to this,
3540 * but for now, we just unshadow everything except Xen. */
3541 shadow_blow_tables(d);
3542 shadow_unlock(d);
3546 /**************************************************************************/
3547 /* VRAM dirty tracking support */
3548 int shadow_track_dirty_vram(struct domain *d,
3549 unsigned long begin_pfn,
3550 unsigned long nr,
3551 XEN_GUEST_HANDLE_64(uint8) dirty_bitmap)
3553 int rc;
3554 unsigned long end_pfn = begin_pfn + nr;
3555 unsigned long dirty_size = (nr + 7) / 8;
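/* one bit of dirty_bitmap per pfn in the range, rounded up to whole bytes */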
3556 int flush_tlb = 0;
3557 unsigned long i;
3558 p2m_type_t t;
3560 if (end_pfn < begin_pfn
3561 || begin_pfn > d->arch.p2m->max_mapped_pfn
3562 || end_pfn >= d->arch.p2m->max_mapped_pfn)
3563 return -EINVAL;
3565 shadow_lock(d);
3567 if ( d->dirty_vram && (!nr ||
3568 ( begin_pfn != d->dirty_vram->begin_pfn
3569 || end_pfn != d->dirty_vram->end_pfn )) )
3571 /* Different tracking, tear the previous down. */
3572 gdprintk(XENLOG_INFO, "stopping tracking VRAM %lx - %lx\n", d->dirty_vram->begin_pfn, d->dirty_vram->end_pfn);
3573 xfree(d->dirty_vram->sl1ma);
3574 xfree(d->dirty_vram->dirty_bitmap);
3575 xfree(d->dirty_vram);
3576 d->dirty_vram = NULL;
3579 if ( !nr )
3581 rc = 0;
3582 goto out;
3585 /* This should happen rarely (video mode change),
3586 * no need to be careful. */
3587 if ( !d->dirty_vram )
3589 /* Just recount from start. */
3590 for ( i = begin_pfn; i < end_pfn; i++ ) {
3591 mfn_t mfn = gfn_to_mfn(d, i, &t);
3592 if (mfn_x(mfn) != INVALID_MFN)
3593 flush_tlb |= sh_remove_all_mappings(d->vcpu[0], mfn);
3596 gdprintk(XENLOG_INFO, "tracking VRAM %lx - %lx\n", begin_pfn, end_pfn);
3598 rc = -ENOMEM;
3599 if ( (d->dirty_vram = xmalloc(struct sh_dirty_vram)) == NULL )
3600 goto out;
3601 d->dirty_vram->begin_pfn = begin_pfn;
3602 d->dirty_vram->end_pfn = end_pfn;
3604 if ( (d->dirty_vram->sl1ma = xmalloc_array(paddr_t, nr)) == NULL )
3605 goto out_dirty_vram;
3606 memset(d->dirty_vram->sl1ma, ~0, sizeof(paddr_t) * nr);
3608 if ( (d->dirty_vram->dirty_bitmap = xmalloc_array(uint8_t, dirty_size)) == NULL )
3609 goto out_sl1ma;
3610 memset(d->dirty_vram->dirty_bitmap, 0, dirty_size);
3612 d->dirty_vram->last_dirty = NOW();
3614 /* Tell the caller that this time we could not track dirty bits. */
3615 rc = -ENODATA;
3617 else if (d->dirty_vram->last_dirty == -1)
3619 /* still completely clean, just copy our empty bitmap */
3620 rc = -EFAULT;
3621 if ( copy_to_guest(dirty_bitmap, d->dirty_vram->dirty_bitmap, dirty_size) == 0 )
3622 rc = 0;
3624 else
3626 #ifdef __i386__
3627 unsigned long map_mfn = INVALID_MFN;
3628 void *map_sl1p = NULL;
3629 #endif
3631 /* Iterate over VRAM to track dirty bits. */
3632 for ( i = 0; i < nr; i++ ) {
3633 mfn_t mfn = gfn_to_mfn(d, begin_pfn + i, &t);
3634 struct page_info *page;
3635 u32 count_info;
3636 int dirty = 0;
3637 paddr_t sl1ma = d->dirty_vram->sl1ma[i];
3639 if (mfn_x(mfn) == INVALID_MFN)
3641 dirty = 1;
3643 else
3645 page = mfn_to_page(mfn);
3646 count_info = page->u.inuse.type_info & PGT_count_mask;
3647 switch (count_info)
3649 case 0:
3650 /* No guest reference, nothing to track. */
3651 break;
3652 case 1:
3653 /* One guest reference. */
3654 if ( sl1ma == INVALID_PADDR )
3656 /* We don't know which sl1e points to this, too bad. */
3657 dirty = 1;
3658 /* TODO: Heuristics for finding the single mapping of
3659 * this gmfn */
3660 flush_tlb |= sh_remove_all_mappings(d->vcpu[0], mfn);
3662 else
3664 /* Hopefully the most common case: only one mapping,
3665 * whose dirty bit we can use. */
3666 l1_pgentry_t *sl1e;
3667 #ifdef __i386__
3668 void *sl1p = map_sl1p;
3669 unsigned long sl1mfn = paddr_to_pfn(sl1ma);
3671 if ( sl1mfn != map_mfn ) {
3672 if ( map_sl1p )
3673 sh_unmap_domain_page(map_sl1p);
3674 map_sl1p = sl1p = sh_map_domain_page(_mfn(sl1mfn));
3675 map_mfn = sl1mfn;
3677 sl1e = sl1p + (sl1ma & ~PAGE_MASK);
3678 #else
3679 sl1e = maddr_to_virt(sl1ma);
3680 #endif
3682 if ( l1e_get_flags(*sl1e) & _PAGE_DIRTY )
3684 dirty = 1;
3685 /* Note: this is a plain (non-locked) update, so it may lose a
3686 * _PAGE_ACCESSED bit set concurrently by another processor. */
3687 l1e_remove_flags(*sl1e, _PAGE_DIRTY);
3688 flush_tlb = 1;
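/* The TLB flush is needed because a stale cached translation would
 * let the guest keep writing without _PAGE_DIRTY (just cleared above)
 * ever being set again in this sl1e. */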
3691 break;
3692 default:
3693 /* More than one guest reference: tracking every mapping's dirty
3694 * bit would be too expensive, so just assume the page is dirty. */
3695 dirty = 1;
3696 break;
3700 if ( dirty )
3702 d->dirty_vram->dirty_bitmap[i / 8] |= 1 << (i % 8);
3703 d->dirty_vram->last_dirty = NOW();
3707 #ifdef __i386__
3708 if ( map_sl1p )
3709 sh_unmap_domain_page(map_sl1p);
3710 #endif
3712 rc = -EFAULT;
3713 if ( copy_to_guest(dirty_bitmap, d->dirty_vram->dirty_bitmap, dirty_size) == 0 ) {
3714 memset(d->dirty_vram->dirty_bitmap, 0, dirty_size);
3715 if (d->dirty_vram->last_dirty + SECONDS(2) < NOW())
3717 /* The region has been clean for more than two seconds: try to
3718 * revoke guest write access so that further writes must fault. */
3719 for ( i = begin_pfn; i < end_pfn; i++ ) {
3720 mfn_t mfn = gfn_to_mfn(d, i, &t);
3721 if (mfn_x(mfn) != INVALID_MFN)
3722 flush_tlb |= sh_remove_write_access(d->vcpu[0], mfn, 1, 0);
3724 d->dirty_vram->last_dirty = -1;
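/* last_dirty == -1 is the "completely clean" marker tested earlier in
 * this function: while it holds, later calls can return the all-zero
 * bitmap without rescanning the shadow entries. */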
3726 rc = 0;
3729 if ( flush_tlb )
3730 flush_tlb_mask(d->domain_dirty_cpumask);
3731 goto out;
3733 out_sl1ma:
3734 xfree(d->dirty_vram->sl1ma);
3735 out_dirty_vram:
3736 xfree(d->dirty_vram);
3737 d->dirty_vram = NULL;
3739 out:
3740 shadow_unlock(d);
3741 return rc;
3744 /**************************************************************************/
3745 /* Shadow-control XEN_DOMCTL dispatcher */
3747 int shadow_domctl(struct domain *d,
3748 xen_domctl_shadow_op_t *sc,
3749 XEN_GUEST_HANDLE(void) u_domctl)
3751 int rc, preempted = 0;
3753 switch ( sc->op )
3755 case XEN_DOMCTL_SHADOW_OP_OFF:
3756 if ( d->arch.paging.mode == PG_SH_enable )
3757 if ( (rc = shadow_test_disable(d)) != 0 )
3758 return rc;
3759 return 0;
3761 case XEN_DOMCTL_SHADOW_OP_ENABLE_TEST:
3762 return shadow_test_enable(d);
3764 case XEN_DOMCTL_SHADOW_OP_ENABLE_TRANSLATE:
3765 return shadow_enable(d, PG_refcounts|PG_translate);
3767 case XEN_DOMCTL_SHADOW_OP_ENABLE:
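/* The toolstack's mode flags are shifted into the hypervisor's
 * internal PG_* paging flags; PG_mode_shift is the offset between the
 * two encodings. */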
3768 return shadow_enable(d, sc->mode << PG_mode_shift);
3770 case XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION:
3771 sc->mb = shadow_get_allocation(d);
3772 return 0;
3774 case XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION:
3775 shadow_lock(d);
3776 if ( sc->mb == 0 && shadow_mode_enabled(d) )
3778 /* Can't set the allocation to zero unless the domain stops using
3779 * shadow pagetables first */
3780 SHADOW_ERROR("Can't set shadow allocation to zero, domain %u"
3781 " is still using shadows.\n", d->domain_id);
3782 shadow_unlock(d);
3783 return -EINVAL;
3785 rc = sh_set_allocation(d, sc->mb << (20 - PAGE_SHIFT), &preempted);
3786 shadow_unlock(d);
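/* sh_set_allocation() may have been preempted part-way through a
 * large change; in that case a hypercall continuation re-issues this
 * domctl when the guest next runs, so the work proceeds in
 * preemptible chunks. */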
3787 if ( preempted )
3788 /* Not finished. Set up to re-run the call. */
3789 rc = hypercall_create_continuation(
3790 __HYPERVISOR_domctl, "h", u_domctl);
3791 else
3792 /* Finished. Return the new allocation */
3793 sc->mb = shadow_get_allocation(d);
3794 return rc;
3796 default:
3797 SHADOW_ERROR("Bad shadow op %u\n", sc->op);
3798 return -EINVAL;
3803 /**************************************************************************/
3804 /* Auditing shadow tables */
3806 #if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_FULL
3808 void shadow_audit_tables(struct vcpu *v)
3810 /* Dispatch table for getting per-type functions */
3811 static hash_callback_t callbacks[SH_type_unused] = {
3812 NULL, /* none */
3813 SHADOW_INTERNAL_NAME(sh_audit_l1_table, 2), /* l1_32 */
3814 SHADOW_INTERNAL_NAME(sh_audit_fl1_table, 2), /* fl1_32 */
3815 SHADOW_INTERNAL_NAME(sh_audit_l2_table, 2), /* l2_32 */
3816 SHADOW_INTERNAL_NAME(sh_audit_l1_table, 3), /* l1_pae */
3817 SHADOW_INTERNAL_NAME(sh_audit_fl1_table, 3), /* fl1_pae */
3818 SHADOW_INTERNAL_NAME(sh_audit_l2_table, 3), /* l2_pae */
3819 SHADOW_INTERNAL_NAME(sh_audit_l2_table, 3), /* l2h_pae */
3820 #if CONFIG_PAGING_LEVELS >= 4
3821 SHADOW_INTERNAL_NAME(sh_audit_l1_table, 4), /* l1_64 */
3822 SHADOW_INTERNAL_NAME(sh_audit_fl1_table, 4), /* fl1_64 */
3823 SHADOW_INTERNAL_NAME(sh_audit_l2_table, 4), /* l2_64 */
3824 SHADOW_INTERNAL_NAME(sh_audit_l2_table, 4), /* l2h_64 */
3825 SHADOW_INTERNAL_NAME(sh_audit_l3_table, 4), /* l3_64 */
3826 SHADOW_INTERNAL_NAME(sh_audit_l4_table, 4), /* l4_64 */
3827 #endif /* CONFIG_PAGING_LEVELS >= 4 */
3828 NULL /* All the rest */
3829 };
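/* hash_foreach() below walks the shadow hash table and calls the
 * matching per-type audit function from this dispatch table for each
 * shadow page it visits. */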
3830 unsigned int mask;
3832 if ( !(SHADOW_AUDIT_ENABLE) )
3833 return;
3835 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
3836 sh_oos_audit(v->domain);
3837 #endif
3839 if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_FULL )
3840 mask = ~1; /* Audit every table in the system */
3841 else
3843 /* Audit only the current mode's tables */
3844 switch ( v->arch.paging.mode->guest_levels )
3846 case 2: mask = (SHF_L1_32|SHF_FL1_32|SHF_L2_32); break;
3847 case 3: mask = (SHF_L1_PAE|SHF_FL1_PAE|SHF_L2_PAE
3848 |SHF_L2H_PAE); break;
3849 case 4: mask = (SHF_L1_64|SHF_FL1_64|SHF_L2_64
3850 |SHF_L3_64|SHF_L4_64); break;
3851 default: BUG();
3855 hash_foreach(v, mask, callbacks, _mfn(INVALID_MFN));
3858 #endif /* Shadow audit */
3860 /*
3861 * Local variables:
3862 * mode: C
3863 * c-set-style: "BSD"
3864 * c-basic-offset: 4
3865 * indent-tabs-mode: nil
3866 * End:
3867 */