ia64/xen-unstable

view xen/arch/x86/mm/shadow/common.c @ 17913:7eab6475c60a

shadow: Re-initialise page_info's lock field when freeing shadow pages.

Fixes boot failure of xenU after destroying a HVM guest.

Signed-off-by: Dongxiao Xu <dongxiao.xu@intel.com>
Signed-off-by: Keir Fraser <keir.fraser@citrix.com>
author Keir Fraser <keir.fraser@citrix.com>
date Fri Jun 27 14:38:41 2008 +0100 (2008-06-27)
parents e5ae980fe337
children c33a40b4c22b
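For context: the "Re-initialise page_info's lock field" fix described above
corresponds to the x86-64 branch of sh_set_allocation() later in this file.
Shadow pages overlay struct shadow_page_info on top of struct page_info,
which clobbers the page lock field, so the lock is re-initialised before the
pages are handed back to the domain heap. A minimal sketch of that step (the
real code is in sh_set_allocation() below):

    for ( j = 0; j < 1U << order; j++ )
        spin_lock_init(&((struct page_info *)sp)[j].lock);
    free_domheap_pages((struct page_info *)sp, order);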
1 /******************************************************************************
2 * arch/x86/mm/shadow/common.c
3 *
4 * Shadow code that does not need to be multiply compiled.
5 * Parts of this code are Copyright (c) 2006 by XenSource Inc.
6 * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
7 * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 */
24 #include <xen/config.h>
25 #include <xen/types.h>
26 #include <xen/mm.h>
27 #include <xen/trace.h>
28 #include <xen/sched.h>
29 #include <xen/perfc.h>
30 #include <xen/irq.h>
31 #include <xen/domain_page.h>
32 #include <xen/guest_access.h>
33 #include <xen/keyhandler.h>
34 #include <asm/event.h>
35 #include <asm/page.h>
36 #include <asm/current.h>
37 #include <asm/flushtlb.h>
38 #include <asm/shadow.h>
39 #include <xen/numa.h>
40 #include "private.h"
43 /* Set up the shadow-specific parts of a domain struct at start of day.
44 * Called for every domain from arch_domain_create() */
45 void shadow_domain_init(struct domain *d)
46 {
47 int i;
48 shadow_lock_init(d);
49 for ( i = 0; i <= SHADOW_MAX_ORDER; i++ )
50 INIT_LIST_HEAD(&d->arch.paging.shadow.freelists[i]);
51 INIT_LIST_HEAD(&d->arch.paging.shadow.p2m_freelist);
52 INIT_LIST_HEAD(&d->arch.paging.shadow.pinned_shadows);
54 /* Use shadow pagetables for log-dirty support */
55 paging_log_dirty_init(d, shadow_enable_log_dirty,
56 shadow_disable_log_dirty, shadow_clean_dirty_bitmap);
58 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
59 d->arch.paging.shadow.oos_active = 0;
60 #endif
61 }
63 /* Set up the shadow-specific parts of a vcpu struct. Note: the most
64 * important job is to initialize the update_paging_modes() function pointer,
65 * which is used to initialize the rest of the resources. Therefore, it does
66 * not really matter which mode v->arch.paging.mode points to initially, as
67 * long as it is one that is compiled in.
68 */
69 void shadow_vcpu_init(struct vcpu *v)
70 {
71 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
72 int i;
74 for ( i = 0; i < SHADOW_OOS_PAGES; i++ )
75 {
76 v->arch.paging.shadow.oos[i] = _mfn(INVALID_MFN);
77 v->arch.paging.shadow.oos_snapshot[i] = _mfn(INVALID_MFN);
78 }
79 #endif
81 v->arch.paging.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode, 3);
82 }
84 #if SHADOW_AUDIT
85 int shadow_audit_enable = 0;
87 static void shadow_audit_key(unsigned char key)
88 {
89 shadow_audit_enable = !shadow_audit_enable;
90 printk("%s shadow_audit_enable=%d\n",
91 __func__, shadow_audit_enable);
92 }
94 static int __init shadow_audit_key_init(void)
95 {
96 register_keyhandler(
97 'O', shadow_audit_key, "toggle shadow audits");
98 return 0;
99 }
100 __initcall(shadow_audit_key_init);
101 #endif /* SHADOW_AUDIT */
103 int _shadow_mode_refcounts(struct domain *d)
104 {
105 return shadow_mode_refcounts(d);
106 }
109 /**************************************************************************/
110 /* x86 emulator support for the shadow code
111 */
113 struct segment_register *hvm_get_seg_reg(
114 enum x86_segment seg, struct sh_emulate_ctxt *sh_ctxt)
115 {
116 struct segment_register *seg_reg = &sh_ctxt->seg_reg[seg];
117 if ( !__test_and_set_bit(seg, &sh_ctxt->valid_seg_regs) )
118 hvm_get_segment_register(current, seg, seg_reg);
119 return seg_reg;
120 }
122 static int hvm_translate_linear_addr(
123 enum x86_segment seg,
124 unsigned long offset,
125 unsigned int bytes,
126 enum hvm_access_type access_type,
127 struct sh_emulate_ctxt *sh_ctxt,
128 unsigned long *paddr)
129 {
130 struct segment_register *reg = hvm_get_seg_reg(seg, sh_ctxt);
131 int okay;
133 okay = hvm_virtual_to_linear_addr(
134 seg, reg, offset, bytes, access_type, sh_ctxt->ctxt.addr_size, paddr);
136 if ( !okay )
137 {
138 hvm_inject_exception(TRAP_gp_fault, 0, 0);
139 return X86EMUL_EXCEPTION;
140 }
142 return 0;
143 }
145 static int
146 hvm_read(enum x86_segment seg,
147 unsigned long offset,
148 unsigned long *val,
149 unsigned int bytes,
150 enum hvm_access_type access_type,
151 struct sh_emulate_ctxt *sh_ctxt)
152 {
153 unsigned long addr;
154 int rc;
156 rc = hvm_translate_linear_addr(
157 seg, offset, bytes, access_type, sh_ctxt, &addr);
158 if ( rc )
159 return rc;
161 *val = 0;
163 if ( access_type == hvm_access_insn_fetch )
164 rc = hvm_fetch_from_guest_virt(val, addr, bytes, 0);
165 else
166 rc = hvm_copy_from_guest_virt(val, addr, bytes, 0);
168 switch ( rc )
169 {
170 case HVMCOPY_okay:
171 return X86EMUL_OKAY;
172 case HVMCOPY_bad_gva_to_gfn:
173 return X86EMUL_EXCEPTION;
174 default:
175 break;
176 }
178 return X86EMUL_UNHANDLEABLE;
179 }
181 static int
182 hvm_emulate_read(enum x86_segment seg,
183 unsigned long offset,
184 unsigned long *val,
185 unsigned int bytes,
186 struct x86_emulate_ctxt *ctxt)
187 {
188 if ( !is_x86_user_segment(seg) )
189 return X86EMUL_UNHANDLEABLE;
190 return hvm_read(seg, offset, val, bytes, hvm_access_read,
191 container_of(ctxt, struct sh_emulate_ctxt, ctxt));
192 }
194 static int
195 hvm_emulate_insn_fetch(enum x86_segment seg,
196 unsigned long offset,
197 unsigned long *val,
198 unsigned int bytes,
199 struct x86_emulate_ctxt *ctxt)
200 {
201 struct sh_emulate_ctxt *sh_ctxt =
202 container_of(ctxt, struct sh_emulate_ctxt, ctxt);
203 unsigned int insn_off = offset - sh_ctxt->insn_buf_eip;
205 ASSERT(seg == x86_seg_cs);
207 /* Fall back if requested bytes are not in the prefetch cache. */
208 if ( unlikely((insn_off + bytes) > sh_ctxt->insn_buf_bytes) )
209 return hvm_read(seg, offset, val, bytes,
210 hvm_access_insn_fetch, sh_ctxt);
212 /* Hit the cache. Simple memcpy. */
213 *val = 0;
214 memcpy(val, &sh_ctxt->insn_buf[insn_off], bytes);
215 return X86EMUL_OKAY;
216 }
218 static int
219 hvm_emulate_write(enum x86_segment seg,
220 unsigned long offset,
221 unsigned long val,
222 unsigned int bytes,
223 struct x86_emulate_ctxt *ctxt)
224 {
225 struct sh_emulate_ctxt *sh_ctxt =
226 container_of(ctxt, struct sh_emulate_ctxt, ctxt);
227 struct vcpu *v = current;
228 unsigned long addr;
229 int rc;
231 if ( !is_x86_user_segment(seg) )
232 return X86EMUL_UNHANDLEABLE;
234 /* How many emulations could we save if we unshadowed on stack writes? */
235 if ( seg == x86_seg_ss )
236 perfc_incr(shadow_fault_emulate_stack);
238 rc = hvm_translate_linear_addr(
239 seg, offset, bytes, hvm_access_write, sh_ctxt, &addr);
240 if ( rc )
241 return rc;
243 return v->arch.paging.mode->shadow.x86_emulate_write(
244 v, addr, &val, bytes, sh_ctxt);
245 }
247 static int
248 hvm_emulate_cmpxchg(enum x86_segment seg,
249 unsigned long offset,
250 void *p_old,
251 void *p_new,
252 unsigned int bytes,
253 struct x86_emulate_ctxt *ctxt)
254 {
255 struct sh_emulate_ctxt *sh_ctxt =
256 container_of(ctxt, struct sh_emulate_ctxt, ctxt);
257 struct vcpu *v = current;
258 unsigned long addr, old[2], new[2];
259 int rc;
261 if ( !is_x86_user_segment(seg) )
262 return X86EMUL_UNHANDLEABLE;
264 rc = hvm_translate_linear_addr(
265 seg, offset, bytes, hvm_access_write, sh_ctxt, &addr);
266 if ( rc )
267 return rc;
269 old[0] = new[0] = 0;
270 memcpy(old, p_old, bytes);
271 memcpy(new, p_new, bytes);
273 if ( bytes <= sizeof(long) )
274 return v->arch.paging.mode->shadow.x86_emulate_cmpxchg(
275 v, addr, old[0], new[0], bytes, sh_ctxt);
277 #ifdef __i386__
278 if ( bytes == 8 )
279 return v->arch.paging.mode->shadow.x86_emulate_cmpxchg8b(
280 v, addr, old[0], old[1], new[0], new[1], sh_ctxt);
281 #endif
283 return X86EMUL_UNHANDLEABLE;
284 }
286 static struct x86_emulate_ops hvm_shadow_emulator_ops = {
287 .read = hvm_emulate_read,
288 .insn_fetch = hvm_emulate_insn_fetch,
289 .write = hvm_emulate_write,
290 .cmpxchg = hvm_emulate_cmpxchg,
291 };
293 static int
294 pv_emulate_read(enum x86_segment seg,
295 unsigned long offset,
296 unsigned long *val,
297 unsigned int bytes,
298 struct x86_emulate_ctxt *ctxt)
299 {
300 unsigned int rc;
302 if ( !is_x86_user_segment(seg) )
303 return X86EMUL_UNHANDLEABLE;
305 *val = 0;
306 if ( (rc = copy_from_user((void *)val, (void *)offset, bytes)) != 0 )
307 {
308 propagate_page_fault(offset + bytes - rc, 0); /* read fault */
309 return X86EMUL_EXCEPTION;
310 }
312 return X86EMUL_OKAY;
313 }
315 static int
316 pv_emulate_write(enum x86_segment seg,
317 unsigned long offset,
318 unsigned long val,
319 unsigned int bytes,
320 struct x86_emulate_ctxt *ctxt)
321 {
322 struct sh_emulate_ctxt *sh_ctxt =
323 container_of(ctxt, struct sh_emulate_ctxt, ctxt);
324 struct vcpu *v = current;
325 if ( !is_x86_user_segment(seg) )
326 return X86EMUL_UNHANDLEABLE;
327 return v->arch.paging.mode->shadow.x86_emulate_write(
328 v, offset, &val, bytes, sh_ctxt);
329 }
331 static int
332 pv_emulate_cmpxchg(enum x86_segment seg,
333 unsigned long offset,
334 void *p_old,
335 void *p_new,
336 unsigned int bytes,
337 struct x86_emulate_ctxt *ctxt)
338 {
339 struct sh_emulate_ctxt *sh_ctxt =
340 container_of(ctxt, struct sh_emulate_ctxt, ctxt);
341 unsigned long old[2], new[2];
342 struct vcpu *v = current;
344 if ( !is_x86_user_segment(seg) )
345 return X86EMUL_UNHANDLEABLE;
347 old[0] = new[0] = 0;
348 memcpy(old, p_old, bytes);
349 memcpy(new, p_new, bytes);
351 if ( bytes <= sizeof(long) )
352 return v->arch.paging.mode->shadow.x86_emulate_cmpxchg(
353 v, offset, old[0], new[0], bytes, sh_ctxt);
355 #ifdef __i386__
356 if ( bytes == 8 )
357 return v->arch.paging.mode->shadow.x86_emulate_cmpxchg8b(
358 v, offset, old[0], old[1], new[0], new[1], sh_ctxt);
359 #endif
361 return X86EMUL_UNHANDLEABLE;
362 }
364 static struct x86_emulate_ops pv_shadow_emulator_ops = {
365 .read = pv_emulate_read,
366 .insn_fetch = pv_emulate_read,
367 .write = pv_emulate_write,
368 .cmpxchg = pv_emulate_cmpxchg,
369 };
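/* Editorial note, not part of the original source: shadow_init_emulation()
 * below hands back one of the two ops tables above, depending on whether the
 * current vcpu is HVM or PV.  The shadow fault/emulation paths (in multi.c)
 * then feed it to the instruction emulator, roughly as follows -- a sketch,
 * not an exact call site:
 *
 *     struct sh_emulate_ctxt emul_ctxt;
 *     struct x86_emulate_ops *emul_ops;
 *
 *     emul_ops = shadow_init_emulation(&emul_ctxt, regs);
 *     r = x86_emulate(&emul_ctxt.ctxt, emul_ops);
 */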
371 struct x86_emulate_ops *shadow_init_emulation(
372 struct sh_emulate_ctxt *sh_ctxt, struct cpu_user_regs *regs)
373 {
374 struct segment_register *creg, *sreg;
375 struct vcpu *v = current;
376 unsigned long addr;
378 sh_ctxt->ctxt.regs = regs;
379 sh_ctxt->ctxt.force_writeback = 0;
381 if ( !is_hvm_vcpu(v) )
382 {
383 sh_ctxt->ctxt.addr_size = sh_ctxt->ctxt.sp_size = BITS_PER_LONG;
384 return &pv_shadow_emulator_ops;
385 }
387 /* Segment cache initialisation. Primed with CS. */
388 sh_ctxt->valid_seg_regs = 0;
389 creg = hvm_get_seg_reg(x86_seg_cs, sh_ctxt);
391 /* Work out the emulation mode. */
392 if ( hvm_long_mode_enabled(v) && creg->attr.fields.l )
393 {
394 sh_ctxt->ctxt.addr_size = sh_ctxt->ctxt.sp_size = 64;
395 }
396 else
397 {
398 sreg = hvm_get_seg_reg(x86_seg_ss, sh_ctxt);
399 sh_ctxt->ctxt.addr_size = creg->attr.fields.db ? 32 : 16;
400 sh_ctxt->ctxt.sp_size = sreg->attr.fields.db ? 32 : 16;
401 }
403 /* Attempt to prefetch whole instruction. */
404 sh_ctxt->insn_buf_eip = regs->eip;
405 sh_ctxt->insn_buf_bytes =
406 (!hvm_translate_linear_addr(
407 x86_seg_cs, regs->eip, sizeof(sh_ctxt->insn_buf),
408 hvm_access_insn_fetch, sh_ctxt, &addr) &&
409 !hvm_fetch_from_guest_virt_nofault(
410 sh_ctxt->insn_buf, addr, sizeof(sh_ctxt->insn_buf), 0))
411 ? sizeof(sh_ctxt->insn_buf) : 0;
413 return &hvm_shadow_emulator_ops;
414 }
416 /* Update an initialized emulation context to prepare for the next
417 * instruction */
418 void shadow_continue_emulation(struct sh_emulate_ctxt *sh_ctxt,
419 struct cpu_user_regs *regs)
420 {
421 struct vcpu *v = current;
422 unsigned long addr, diff;
424 /* We don't refetch the segment bases, because we don't emulate
425 * writes to segment registers */
427 if ( is_hvm_vcpu(v) )
428 {
429 diff = regs->eip - sh_ctxt->insn_buf_eip;
430 if ( diff > sh_ctxt->insn_buf_bytes )
431 {
432 /* Prefetch more bytes. */
433 sh_ctxt->insn_buf_bytes =
434 (!hvm_translate_linear_addr(
435 x86_seg_cs, regs->eip, sizeof(sh_ctxt->insn_buf),
436 hvm_access_insn_fetch, sh_ctxt, &addr) &&
437 !hvm_fetch_from_guest_virt_nofault(
438 sh_ctxt->insn_buf, addr, sizeof(sh_ctxt->insn_buf), 0))
439 ? sizeof(sh_ctxt->insn_buf) : 0;
440 sh_ctxt->insn_buf_eip = regs->eip;
441 }
442 }
443 }
446 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
447 /**************************************************************************/
448 /* Out-of-sync shadows. */
450 /* From time to time, we let a shadowed pagetable page go out of sync
451 * with its shadow: the guest is allowed to write directly to the page,
452 * and those writes are not synchronously reflected in the shadow.
453 * This lets us avoid many emulations if the guest is writing a lot to a
454 * pagetable, but it relaxes a pretty important invariant in the shadow
455 * pagetable design. Therefore, some rules:
456 *
457 * 1. Only L1 pagetables may go out of sync: any page that is shadowed
458 * at a higher level must be synchronously updated. This makes
459 * using linear shadow pagetables much less dangerous.
460 * That means that: (a) unsyncing code needs to check for higher-level
461 * shadows, and (b) promotion code needs to resync.
462 *
463 * 2. All shadow operations on a guest page require the page to be brought
464 * back into sync before proceeding. This must be done under the
465 * shadow lock so that the page is guaranteed to remain synced until
466 * the operation completes.
467 *
468 * Exceptions to this rule: the pagefault and invlpg handlers may
469 * update only one entry on an out-of-sync page without resyncing it.
470 *
471 * 3. Operations on shadows that do not start from a guest page need to
472 * be aware that they may be handling an out-of-sync shadow.
473 *
474 * 4. Operations that do not normally take the shadow lock (fast-path
475 * #PF handler, INVLPG) must fall back to a locking, syncing version
476 * if they see an out-of-sync table.
477 *
478 * 5. Operations corresponding to guest TLB flushes (MOV CR3, INVLPG)
479 * must explicitly resync all relevant pages or update their
480 * shadows.
481 *
482 * Currently out-of-sync pages are listed in a simple open-addressed
483 * hash table with a second chance (must resist temptation to radically
484 * over-engineer hash tables...) The virtual address of the access
485 * which caused us to unsync the page is also kept in the hash table, as
486 * a hint for finding the writable mappings later.
487 *
488 * We keep a hash per vcpu, because we want as much as possible to do
489 * the re-sync on the same vcpu we did the unsync on, so the VA hint
490 * will be valid.
491 */
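/* Editorial note, not part of the original source: the per-vcpu OOS hash
 * described above is a small open-addressed table with a single alternate
 * slot (the "second chance"), so every lookup in this file probes at most
 * two slots, along these lines:
 *
 *     idx = mfn_x(gmfn) % SHADOW_OOS_PAGES;
 *     if ( mfn_x(oos[idx]) != mfn_x(gmfn) )
 *         idx = (idx + 1) % SHADOW_OOS_PAGES;
 *
 * A hit means oos[idx] equals gmfn; see oos_hash_add(), oos_hash_remove()
 * and sh_resync() below for the real uses of this pattern. */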
494 #if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_FULL
495 static void sh_oos_audit(struct domain *d)
496 {
497 int idx, expected_idx, expected_idx_alt;
498 struct page_info *pg;
499 struct vcpu *v;
501 for_each_vcpu(d, v)
502 {
503 for ( idx = 0; idx < SHADOW_OOS_PAGES; idx++ )
504 {
505 mfn_t *oos = v->arch.paging.shadow.oos;
506 if ( !mfn_valid(oos[idx]) )
507 continue;
509 expected_idx = mfn_x(oos[idx]) % SHADOW_OOS_PAGES;
510 expected_idx_alt = ((expected_idx + 1) % SHADOW_OOS_PAGES);
511 if ( idx != expected_idx && idx != expected_idx_alt )
512 {
513 printk("%s: idx %d contains gmfn %lx, expected at %d or %d.\n",
514 __func__, idx, mfn_x(oos[idx]),
515 expected_idx, expected_idx_alt);
516 BUG();
517 }
518 pg = mfn_to_page(oos[idx]);
519 if ( !(pg->count_info & PGC_page_table) )
520 {
521 printk("%s: idx %x gmfn %lx not a pt (count %"PRIx32")\n",
522 __func__, idx, mfn_x(oos[idx]), pg->count_info);
523 BUG();
524 }
525 if ( !(pg->shadow_flags & SHF_out_of_sync) )
526 {
527 printk("%s: idx %x gmfn %lx not marked oos (flags %lx)\n",
528 __func__, idx, mfn_x(oos[idx]), pg->shadow_flags);
529 BUG();
530 }
531 if ( (pg->shadow_flags & SHF_page_type_mask & ~SHF_L1_ANY) )
532 {
533 printk("%s: idx %x gmfn %lx shadowed as non-l1 (flags %lx)\n",
534 __func__, idx, mfn_x(oos[idx]), pg->shadow_flags);
535 BUG();
536 }
537 }
538 }
539 }
540 #endif
542 #if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES
543 void oos_audit_hash_is_present(struct domain *d, mfn_t gmfn)
544 {
545 int idx;
546 struct vcpu *v;
547 mfn_t *oos;
549 ASSERT(mfn_is_out_of_sync(gmfn));
551 for_each_vcpu(d, v)
552 {
553 oos = v->arch.paging.shadow.oos;
554 idx = mfn_x(gmfn) % SHADOW_OOS_PAGES;
555 if ( mfn_x(oos[idx]) != mfn_x(gmfn) )
556 idx = (idx + 1) % SHADOW_OOS_PAGES;
558 if ( mfn_x(oos[idx]) == mfn_x(gmfn) )
559 return;
560 }
562 SHADOW_ERROR("gmfn %lx marked OOS but not in hash table\n", mfn_x(gmfn));
563 BUG();
564 }
565 #endif
567 /* Update the shadow, but keep the page out of sync. */
568 static inline void _sh_resync_l1(struct vcpu *v, mfn_t gmfn, mfn_t snpmfn)
569 {
570 struct page_info *pg = mfn_to_page(gmfn);
572 ASSERT(mfn_valid(gmfn));
573 ASSERT(page_is_out_of_sync(pg));
575 /* Call out to the appropriate per-mode resyncing function */
576 if ( pg->shadow_flags & SHF_L1_32 )
577 SHADOW_INTERNAL_NAME(sh_resync_l1, 2)(v, gmfn, snpmfn);
578 else if ( pg->shadow_flags & SHF_L1_PAE )
579 SHADOW_INTERNAL_NAME(sh_resync_l1, 3)(v, gmfn, snpmfn);
580 #if CONFIG_PAGING_LEVELS >= 4
581 else if ( pg->shadow_flags & SHF_L1_64 )
582 SHADOW_INTERNAL_NAME(sh_resync_l1, 4)(v, gmfn, snpmfn);
583 #endif
584 }
586 #define _FIXUP_IDX(_b, _i) ((_b) * SHADOW_OOS_FT_HASH + (_i))
588 void oos_fixup_add(struct vcpu *v, mfn_t gmfn,
589 mfn_t smfn, unsigned long off)
590 {
591 int idx, i, free = 0, free_slot = 0;
592 struct oos_fixup *fixups = v->arch.paging.shadow.oos_fixups;
594 idx = mfn_x(gmfn) % SHADOW_OOS_FT_HASH;
595 for ( i = 0; i < SHADOW_OOS_FT_ENTRIES; i++ )
596 {
597 if ( !mfn_valid(fixups[_FIXUP_IDX(idx, i)].gmfn)
598 || !mfn_is_out_of_sync(fixups[_FIXUP_IDX(idx, i)].gmfn) )
599 {
600 free = 1;
601 free_slot = _FIXUP_IDX(idx, i);
602 }
603 else if ( (mfn_x(fixups[_FIXUP_IDX(idx, i)].gmfn) == mfn_x(gmfn))
604 && (mfn_x(fixups[_FIXUP_IDX(idx, i)].smfn) == mfn_x(smfn))
605 && (fixups[_FIXUP_IDX(idx, i)].off == off) )
606 {
607 perfc_incr(shadow_oos_fixup_no_add);
608 return;
609 }
610 }
612 if ( free )
613 {
614 if ( !v->arch.paging.shadow.oos_fixup_used )
615 v->arch.paging.shadow.oos_fixup_used = 1;
616 fixups[free_slot].gmfn = gmfn;
617 fixups[free_slot].smfn = smfn;
618 fixups[free_slot].off = off;
619 perfc_incr(shadow_oos_fixup_add_ok);
620 return;
621 }
624 perfc_incr(shadow_oos_fixup_add_fail);
625 }
627 void oos_fixup_remove(struct vcpu *v, mfn_t gmfn)
628 {
629 int idx, i;
630 struct domain *d = v->domain;
632 perfc_incr(shadow_oos_fixup_remove);
634 /* If the domain is dying we might get called when deallocating
635 * the shadows. Fixup tables are already freed so exit now. */
636 if ( d->is_dying )
637 return;
639 idx = mfn_x(gmfn) % SHADOW_OOS_FT_HASH;
640 for_each_vcpu(d, v)
641 {
642 struct oos_fixup *fixups = v->arch.paging.shadow.oos_fixups;
643 for ( i = 0; i < SHADOW_OOS_FT_ENTRIES; i++ )
644 if ( mfn_x(fixups[_FIXUP_IDX(idx, i)].gmfn) == mfn_x(gmfn) )
645 fixups[_FIXUP_IDX(idx, i)].gmfn = _mfn(INVALID_MFN);
646 }
647 }
649 int oos_fixup_flush(struct vcpu *v)
650 {
651 int i, rc = 0;
652 struct oos_fixup *fixups = v->arch.paging.shadow.oos_fixups;
654 perfc_incr(shadow_oos_fixup_flush);
656 if ( !v->arch.paging.shadow.oos_fixup_used )
657 return 0;
659 for ( i = 0; i < SHADOW_OOS_FT_HASH * SHADOW_OOS_FT_ENTRIES; i++ )
660 {
661 if ( mfn_valid(fixups[i].gmfn) )
662 {
663 if ( mfn_is_out_of_sync(fixups[i].gmfn) )
664 rc |= sh_remove_write_access_from_sl1p(v, fixups[i].gmfn,
665 fixups[i].smfn,
666 fixups[i].off);
667 fixups[i].gmfn = _mfn(INVALID_MFN);
668 }
669 }
671 v->arch.paging.shadow.oos_fixup_used = 0;
673 return rc;
674 }
676 int oos_fixup_flush_gmfn(struct vcpu *v, mfn_t gmfn)
677 {
678 int idx, i, rc = 0;
679 struct domain *d = v->domain;
681 perfc_incr(shadow_oos_fixup_flush_gmfn);
683 idx = mfn_x(gmfn) % SHADOW_OOS_FT_HASH;
684 for_each_vcpu(d, v)
685 {
686 struct oos_fixup *fixups = v->arch.paging.shadow.oos_fixups;
688 for ( i = 0; i < SHADOW_OOS_FT_ENTRIES; i++ )
689 {
690 if ( mfn_x(fixups[_FIXUP_IDX(idx, i)].gmfn) != mfn_x(gmfn) )
691 continue;
693 rc |= sh_remove_write_access_from_sl1p(v,
694 fixups[_FIXUP_IDX(idx,i)].gmfn,
695 fixups[_FIXUP_IDX(idx,i)].smfn,
696 fixups[_FIXUP_IDX(idx,i)].off);
698 fixups[_FIXUP_IDX(idx,i)].gmfn = _mfn(INVALID_MFN);
699 }
700 }
702 return rc;
703 }
705 static int oos_remove_write_access(struct vcpu *v, mfn_t gmfn, unsigned long va)
706 {
707 int ftlb = 0;
709 ftlb |= oos_fixup_flush_gmfn(v, gmfn);
711 switch ( sh_remove_write_access(v, gmfn, 0, va) )
712 {
713 default:
714 case 0:
715 break;
717 case 1:
718 ftlb |= 1;
719 break;
721 case -1:
722 /* An unfindable writeable typecount has appeared, probably via a
723 * grant table entry: can't shoot the mapping, so try to unshadow
724 * the page. If that doesn't work either, the guest is granting access to
725 * its own pagetables and must be killed after all.
726 * This will flush the tlb, so we can return with no worries. */
727 sh_remove_shadows(v, gmfn, 0 /* Be thorough */, 1 /* Must succeed */);
728 return 1;
729 }
731 if ( ftlb )
732 flush_tlb_mask(v->domain->domain_dirty_cpumask);
734 return 0;
735 }
738 /* Pull all the entries on an out-of-sync page back into sync. */
739 static void _sh_resync(struct vcpu *v, mfn_t gmfn, unsigned long va, mfn_t snp)
740 {
741 struct page_info *pg = mfn_to_page(gmfn);
743 ASSERT(shadow_locked_by_me(v->domain));
744 ASSERT(mfn_is_out_of_sync(gmfn));
745 /* Guest page must be shadowed *only* as L1 when out of sync. */
746 ASSERT(!(mfn_to_page(gmfn)->shadow_flags & SHF_page_type_mask
747 & ~SHF_L1_ANY));
748 ASSERT(!sh_page_has_multiple_shadows(mfn_to_page(gmfn)));
750 SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx, va=%lx\n",
751 v->domain->domain_id, v->vcpu_id, mfn_x(gmfn), va);
753 /* Need to pull write access so the page *stays* in sync. */
754 if ( oos_remove_write_access(v, gmfn, va) )
755 {
756 /* Page has been unshadowed. */
757 return;
758 }
760 /* No more writable mappings of this page, please */
761 pg->shadow_flags &= ~SHF_oos_may_write;
763 /* Update the shadows with current guest entries. */
764 _sh_resync_l1(v, gmfn, snp);
766 /* Now we know all the entries are synced, and will stay that way */
767 pg->shadow_flags &= ~SHF_out_of_sync;
768 perfc_incr(shadow_resync);
769 }
772 /* Add an MFN to the list of out-of-sync guest pagetables */
773 static void oos_hash_add(struct vcpu *v, mfn_t gmfn, unsigned long va)
774 {
775 int idx, oidx, swap = 0;
776 void *gptr, *gsnpptr;
777 mfn_t *oos = v->arch.paging.shadow.oos;
778 unsigned long *oos_va = v->arch.paging.shadow.oos_va;
779 mfn_t *oos_snapshot = v->arch.paging.shadow.oos_snapshot;
781 idx = mfn_x(gmfn) % SHADOW_OOS_PAGES;
782 oidx = idx;
784 if ( mfn_valid(oos[idx])
785 && (mfn_x(oos[idx]) % SHADOW_OOS_PAGES) == idx )
786 {
787 /* Punt the current occupant into the next slot */
788 SWAP(oos[idx], gmfn);
789 SWAP(oos_va[idx], va);
790 swap = 1;
791 idx = (idx + 1) % SHADOW_OOS_PAGES;
792 }
793 if ( mfn_valid(oos[idx]) )
794 {
795 /* Crush the current occupant. */
796 _sh_resync(v, oos[idx], oos_va[idx], oos_snapshot[idx]);
797 perfc_incr(shadow_unsync_evict);
798 }
799 oos[idx] = gmfn;
800 oos_va[idx] = va;
802 if ( swap )
803 SWAP(oos_snapshot[idx], oos_snapshot[oidx]);
805 gptr = sh_map_domain_page(oos[oidx]);
806 gsnpptr = sh_map_domain_page(oos_snapshot[oidx]);
807 memcpy(gsnpptr, gptr, PAGE_SIZE);
808 sh_unmap_domain_page(gptr);
809 sh_unmap_domain_page(gsnpptr);
810 }
812 /* Remove an MFN from the list of out-of-sync guest pagetables */
813 static void oos_hash_remove(struct vcpu *v, mfn_t gmfn)
814 {
815 int idx;
816 mfn_t *oos;
817 struct domain *d = v->domain;
819 SHADOW_PRINTK("D%dV%d gmfn %lx\n",
820 v->domain->domain_id, v->vcpu_id, mfn_x(gmfn));
822 for_each_vcpu(d, v)
823 {
824 oos = v->arch.paging.shadow.oos;
825 idx = mfn_x(gmfn) % SHADOW_OOS_PAGES;
826 if ( mfn_x(oos[idx]) != mfn_x(gmfn) )
827 idx = (idx + 1) % SHADOW_OOS_PAGES;
828 if ( mfn_x(oos[idx]) == mfn_x(gmfn) )
829 {
830 oos[idx] = _mfn(INVALID_MFN);
831 return;
832 }
833 }
835 SHADOW_ERROR("gmfn %lx was OOS but not in hash table\n", mfn_x(gmfn));
836 BUG();
837 }
839 mfn_t oos_snapshot_lookup(struct vcpu *v, mfn_t gmfn)
840 {
841 int idx;
842 mfn_t *oos;
843 mfn_t *oos_snapshot;
844 struct domain *d = v->domain;
846 for_each_vcpu(d, v)
847 {
848 oos = v->arch.paging.shadow.oos;
849 oos_snapshot = v->arch.paging.shadow.oos_snapshot;
850 idx = mfn_x(gmfn) % SHADOW_OOS_PAGES;
851 if ( mfn_x(oos[idx]) != mfn_x(gmfn) )
852 idx = (idx + 1) % SHADOW_OOS_PAGES;
853 if ( mfn_x(oos[idx]) == mfn_x(gmfn) )
854 {
855 return oos_snapshot[idx];
856 }
857 }
859 SHADOW_ERROR("gmfn %lx was OOS but not in hash table\n", mfn_x(gmfn));
860 BUG();
861 return _mfn(INVALID_MFN);
862 }
864 /* Pull a single guest page back into sync */
865 void sh_resync(struct vcpu *v, mfn_t gmfn)
866 {
867 int idx;
868 mfn_t *oos;
869 unsigned long *oos_va;
870 mfn_t *oos_snapshot;
871 struct domain *d = v->domain;
873 for_each_vcpu(d, v)
874 {
875 oos = v->arch.paging.shadow.oos;
876 oos_va = v->arch.paging.shadow.oos_va;
877 oos_snapshot = v->arch.paging.shadow.oos_snapshot;
878 idx = mfn_x(gmfn) % SHADOW_OOS_PAGES;
879 if ( mfn_x(oos[idx]) != mfn_x(gmfn) )
880 idx = (idx + 1) % SHADOW_OOS_PAGES;
882 if ( mfn_x(oos[idx]) == mfn_x(gmfn) )
883 {
884 _sh_resync(v, gmfn, oos_va[idx], oos_snapshot[idx]);
885 oos[idx] = _mfn(INVALID_MFN);
886 return;
887 }
888 }
890 SHADOW_ERROR("gmfn %lx was OOS but not in hash table\n", mfn_x(gmfn));
891 BUG();
892 }
894 /* Figure out whether it's definitely safe not to sync this l1 table,
895 * by making a call out to the mode in which that shadow was made. */
896 static int sh_skip_sync(struct vcpu *v, mfn_t gl1mfn)
897 {
898 struct page_info *pg = mfn_to_page(gl1mfn);
899 if ( pg->shadow_flags & SHF_L1_32 )
900 return SHADOW_INTERNAL_NAME(sh_safe_not_to_sync, 2)(v, gl1mfn);
901 else if ( pg->shadow_flags & SHF_L1_PAE )
902 return SHADOW_INTERNAL_NAME(sh_safe_not_to_sync, 3)(v, gl1mfn);
903 #if CONFIG_PAGING_LEVELS >= 4
904 else if ( pg->shadow_flags & SHF_L1_64 )
905 return SHADOW_INTERNAL_NAME(sh_safe_not_to_sync, 4)(v, gl1mfn);
906 #endif
907 SHADOW_ERROR("gmfn 0x%lx was OOS but not shadowed as an l1.\n",
908 mfn_x(gl1mfn));
909 BUG();
910 return 0; /* BUG() is no longer __attribute__((noreturn)). */
911 }
914 /* Pull all out-of-sync pages back into sync. Pages brought out of sync
915 * on other vcpus are allowed to remain out of sync, but their contents
916 * will be made safe (TLB flush semantics); pages unsynced by this vcpu
917 * are brought back into sync and write-protected. If skip != 0, we try
918 * to avoid resyncing at all if we think we can get away with it. */
919 void sh_resync_all(struct vcpu *v, int skip, int this, int others, int do_locking)
920 {
921 int idx;
922 struct vcpu *other;
923 mfn_t *oos = v->arch.paging.shadow.oos;
924 unsigned long *oos_va = v->arch.paging.shadow.oos_va;
925 mfn_t *oos_snapshot = v->arch.paging.shadow.oos_snapshot;
927 SHADOW_PRINTK("d=%d, v=%d\n", v->domain->domain_id, v->vcpu_id);
929 ASSERT(do_locking || shadow_locked_by_me(v->domain));
931 if ( !this )
932 goto resync_others;
934 if ( do_locking )
935 shadow_lock(v->domain);
937 if ( oos_fixup_flush(v) )
938 flush_tlb_mask(v->domain->domain_dirty_cpumask);
940 /* First: resync all of this vcpu's oos pages */
941 for ( idx = 0; idx < SHADOW_OOS_PAGES; idx++ )
942 if ( mfn_valid(oos[idx]) )
943 {
944 /* Write-protect and sync contents */
945 _sh_resync(v, oos[idx], oos_va[idx], oos_snapshot[idx]);
946 oos[idx] = _mfn(INVALID_MFN);
947 }
949 if ( do_locking )
950 shadow_unlock(v->domain);
952 resync_others:
953 if ( !others )
954 return;
956 /* Second: make all *other* vcpus' oos pages safe. */
957 for_each_vcpu(v->domain, other)
958 {
959 if ( v == other )
960 continue;
962 if ( do_locking )
963 shadow_lock(v->domain);
965 oos = other->arch.paging.shadow.oos;
966 oos_va = other->arch.paging.shadow.oos_va;
967 oos_snapshot = other->arch.paging.shadow.oos_snapshot;
968 for ( idx = 0; idx < SHADOW_OOS_PAGES; idx++ )
969 {
970 if ( !mfn_valid(oos[idx]) )
971 continue;
973 if ( skip )
974 {
975 /* Update the shadows and leave the page OOS. */
976 if ( sh_skip_sync(v, oos[idx]) )
977 continue;
978 _sh_resync_l1(other, oos[idx], oos_snapshot[idx]);
979 }
980 else
981 {
982 /* Write-protect and sync contents */
983 _sh_resync(other, oos[idx], oos_va[idx], oos_snapshot[idx]);
984 oos[idx] = _mfn(INVALID_MFN);
985 }
986 }
988 if ( do_locking )
989 shadow_unlock(v->domain);
990 }
991 }
993 /* Allow a shadowed page to go out of sync */
994 int sh_unsync(struct vcpu *v, mfn_t gmfn, unsigned long va)
995 {
996 struct page_info *pg;
998 ASSERT(shadow_locked_by_me(v->domain));
1000 SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx va %lx\n",
1001 v->domain->domain_id, v->vcpu_id, mfn_x(gmfn), va);
1003 pg = mfn_to_page(gmfn);
1005 /* Guest page must be shadowed *only* as L1 and *only* once when out
1006 * of sync. Also, get out now if it's already out of sync, and note that
1007 * we can't safely unsync if some vcpus have paging disabled. */
1008 if ( pg->shadow_flags &
1009 ((SHF_page_type_mask & ~SHF_L1_ANY) | SHF_out_of_sync)
1010 || sh_page_has_multiple_shadows(pg)
1011 || !is_hvm_domain(v->domain)
1012 || !v->domain->arch.paging.shadow.oos_active )
1013 return 0;
1015 pg->shadow_flags |= SHF_out_of_sync|SHF_oos_may_write;
1016 oos_hash_add(v, gmfn, va);
1017 perfc_incr(shadow_unsync);
1018 return 1;
1021 #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) */
1024 /**************************************************************************/
1025 /* Code for "promoting" a guest page to the point where the shadow code is
1026 * willing to let it be treated as a guest page table. This generally
1027 * involves making sure there are no writable mappings available to the guest
1028 * for this page.
1029 */
1030 void shadow_promote(struct vcpu *v, mfn_t gmfn, unsigned int type)
1032 struct page_info *page = mfn_to_page(gmfn);
1034 ASSERT(mfn_valid(gmfn));
1036 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
1037 /* Is the page already shadowed and out of sync? */
1038 if ( page_is_out_of_sync(page) )
1039 sh_resync(v, gmfn);
1040 #endif
1042 /* We should never try to promote a gmfn that has writeable mappings */
1043 ASSERT((page->u.inuse.type_info & PGT_type_mask) != PGT_writable_page
1044 || (page->u.inuse.type_info & PGT_count_mask) == 0
1045 || v->domain->is_shutting_down);
1047 /* Is the page already shadowed? */
1048 if ( !test_and_set_bit(_PGC_page_table, &page->count_info) )
1049 page->shadow_flags = 0;
1051 ASSERT(!test_bit(type, &page->shadow_flags));
1052 set_bit(type, &page->shadow_flags);
1055 void shadow_demote(struct vcpu *v, mfn_t gmfn, u32 type)
1057 struct page_info *page = mfn_to_page(gmfn);
1059 ASSERT(test_bit(_PGC_page_table, &page->count_info));
1060 ASSERT(test_bit(type, &page->shadow_flags));
1062 clear_bit(type, &page->shadow_flags);
1064 if ( (page->shadow_flags & SHF_page_type_mask) == 0 )
1066 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
1067 /* Was the page out of sync? */
1068 if ( page_is_out_of_sync(page) )
1070 oos_hash_remove(v, gmfn);
1071 oos_fixup_remove(v, gmfn);
1073 #endif
1074 clear_bit(_PGC_page_table, &page->count_info);
1078 /**************************************************************************/
1079 /* Validate a pagetable change from the guest and update the shadows.
1080 * Returns a bitmask of SHADOW_SET_* flags. */
1082 int
1083 sh_validate_guest_entry(struct vcpu *v, mfn_t gmfn, void *entry, u32 size)
1085 int result = 0;
1086 struct page_info *page = mfn_to_page(gmfn);
1088 paging_mark_dirty(v->domain, mfn_x(gmfn));
1090 // Determine which types of shadows are affected, and update each.
1091 //
1092 // Always validate L1s before L2s to prevent another cpu with a linear
1093 // mapping of this gmfn from seeing a walk that results from
1094 // using the new L2 value and the old L1 value. (It is OK for such a
1095 // guest to see a walk that uses the old L2 value with the new L1 value,
1096 // as hardware could behave this way if one level of the pagewalk occurs
1097 // before the store, and the next level of the pagewalk occurs after the
1098 // store.)
1099 //
1100 // Ditto for L2s before L3s, etc.
1101 //
1103 if ( !(page->count_info & PGC_page_table) )
1104 return 0; /* Not shadowed at all */
1106 if ( page->shadow_flags & SHF_L1_32 )
1107 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 2)
1108 (v, gmfn, entry, size);
1109 if ( page->shadow_flags & SHF_L2_32 )
1110 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 2)
1111 (v, gmfn, entry, size);
1113 if ( page->shadow_flags & SHF_L1_PAE )
1114 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 3)
1115 (v, gmfn, entry, size);
1116 if ( page->shadow_flags & SHF_L2_PAE )
1117 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 3)
1118 (v, gmfn, entry, size);
1119 if ( page->shadow_flags & SHF_L2H_PAE )
1120 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2he, 3)
1121 (v, gmfn, entry, size);
1123 #if CONFIG_PAGING_LEVELS >= 4
1124 if ( page->shadow_flags & SHF_L1_64 )
1125 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 4)
1126 (v, gmfn, entry, size);
1127 if ( page->shadow_flags & SHF_L2_64 )
1128 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 4)
1129 (v, gmfn, entry, size);
1130 if ( page->shadow_flags & SHF_L2H_64 )
1131 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2he, 4)
1132 (v, gmfn, entry, size);
1133 if ( page->shadow_flags & SHF_L3_64 )
1134 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl3e, 4)
1135 (v, gmfn, entry, size);
1136 if ( page->shadow_flags & SHF_L4_64 )
1137 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl4e, 4)
1138 (v, gmfn, entry, size);
1139 #else /* 32-bit hypervisor does not support 64-bit guests */
1140 ASSERT((page->shadow_flags
1141 & (SHF_L4_64|SHF_L3_64|SHF_L2H_64|SHF_L2_64|SHF_L1_64)) == 0);
1142 #endif
1144 return result;
1148 void
1149 sh_validate_guest_pt_write(struct vcpu *v, mfn_t gmfn,
1150 void *entry, u32 size)
1151 /* This is the entry point for emulated writes to pagetables in HVM guests and
1152 * PV translated guests.
1153 */
1155 struct domain *d = v->domain;
1156 int rc;
1158 ASSERT(shadow_locked_by_me(v->domain));
1159 rc = sh_validate_guest_entry(v, gmfn, entry, size);
1160 if ( rc & SHADOW_SET_FLUSH )
1161 /* Need to flush TLBs to pick up shadow PT changes */
1162 flush_tlb_mask(d->domain_dirty_cpumask);
1163 if ( rc & SHADOW_SET_ERROR )
1165 /* This page is probably not a pagetable any more: tear it out of the
1166 * shadows, along with any tables that reference it.
1167 * Since the validate call above will have made a "safe" (i.e. zero)
1168 * shadow entry, we can let the domain live even if we can't fully
1169 * unshadow the page. */
1170 sh_remove_shadows(v, gmfn, 0, 0);
1174 int shadow_write_guest_entry(struct vcpu *v, intpte_t *p,
1175 intpte_t new, mfn_t gmfn)
1176 /* Write a new value into the guest pagetable, and update the shadows
1177 * appropriately. Returns 0 if we page-faulted, 1 for success. */
1179 int failed;
1180 shadow_lock(v->domain);
1181 failed = __copy_to_user(p, &new, sizeof(new));
1182 if ( failed != sizeof(new) )
1183 sh_validate_guest_entry(v, gmfn, p, sizeof(new));
1184 shadow_unlock(v->domain);
1185 return (failed == 0);
1188 int shadow_cmpxchg_guest_entry(struct vcpu *v, intpte_t *p,
1189 intpte_t *old, intpte_t new, mfn_t gmfn)
1190 /* Cmpxchg a new value into the guest pagetable, and update the shadows
1191 * appropriately. Returns 0 if we page-faulted, 1 if not.
1192 * N.B. caller should check the value of "old" to see if the
1193 * cmpxchg itself was successful. */
1195 int failed;
1196 intpte_t t = *old;
1197 shadow_lock(v->domain);
1198 failed = cmpxchg_user(p, t, new);
1199 if ( t == *old )
1200 sh_validate_guest_entry(v, gmfn, p, sizeof(new));
1201 *old = t;
1202 shadow_unlock(v->domain);
1203 return (failed == 0);
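/* Editorial note, not part of the original source: a caller of the helper
 * above passes its expected value in via *old and gets the value actually
 * observed in the pagetable back in *old.  Roughly (names are illustrative):
 *
 *     intpte_t seen = expected;
 *     int ok = shadow_cmpxchg_guest_entry(v, p, &seen, new, gmfn);
 *
 * ok == 0 means the access page-faulted; otherwise the cmpxchg itself
 * succeeded if and only if seen == expected. */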
1207 /**************************************************************************/
1208 /* Memory management for shadow pages. */
1210 /* Allocating shadow pages
1211 * -----------------------
1213 * Most shadow pages are allocated singly, but there is one case where
1214 * we need to allocate multiple pages together: shadowing 32-bit guest
1215 * tables on PAE or 64-bit shadows. A 32-bit guest l1 table covers 4MB
1216 * of virtual address space, and needs to be shadowed by two PAE/64-bit
1217 * l1 tables (covering 2MB of virtual address space each). Similarly, a
1218 * 32-bit guest l2 table (4GB va) needs to be shadowed by four
1219 * PAE/64-bit l2 tables (1GB va each). These multi-page shadows are
1220 * contiguous and aligned; functions for handling offsets into them are
1221 * defined in shadow.c (shadow_l1_index() etc.)
1223 * This table shows the allocation behaviour of the different modes:
1225 * Xen paging       pae   pae   64b   64b   64b
1226 * Guest paging     32b   pae   32b   pae   64b
1227 * PV or HVM        HVM    *    HVM   HVM    *
1228 * Shadow paging    pae   pae   pae   pae   64b
1230 * sl1 size          8k    4k    8k    4k    4k
1231 * sl2 size         16k    4k   16k    4k    4k
1232 * sl3 size          -     -     -     -     4k
1233 * sl4 size          -     -     -     -     4k
1235 * We allocate memory from xen in four-page units and break them down
1236 * with a simple buddy allocator. Can't use the xen allocator to handle
1237 * this as it only works for contiguous zones, and a domain's shadow
1238 * pool is made of fragments.
1240 * In HVM guests, the p2m table is built out of shadow pages, and we provide
1241 * a function for the p2m management to steal pages, in max-order chunks, from
1242 * the free pool. We don't provide for giving them back, yet.
1243 */
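/* Editorial note, not part of the original source: as a worked example of the
 * table above, shadowing a 32-bit guest on a 64-bit hypervisor means each
 * guest l1 needs an order-1 (2-page, 8k) shadow and each guest l2 needs an
 * order-2 (4-page, 16k) shadow; that is exactly what the type_to_order[]
 * table in shadow_order() below encodes for SH_type_l1_32_shadow and
 * SH_type_l2_32_shadow. */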
1245 /* Figure out the least acceptable quantity of shadow memory.
1246 * The minimum memory requirement for always being able to free up a
1247 * chunk of memory is very small -- only three max-order chunks per
1248 * vcpu to hold the top level shadows and pages with Xen mappings in them.
1250 * But for a guest to be guaranteed to successfully execute a single
1251 * instruction, we must be able to map a large number (about thirty) VAs
1252 * at the same time, which means that to guarantee progress, we must
1253 * allow for more than ninety allocated pages per vcpu. We round that
1254 * up to 128 pages, or half a megabyte per vcpu. */
1255 static unsigned int shadow_min_acceptable_pages(struct domain *d)
1257 u32 vcpu_count = 0;
1258 struct vcpu *v;
1260 for_each_vcpu(d, v)
1261 vcpu_count++;
1263 return (vcpu_count * 128);
1266 /* Figure out the order of allocation needed for a given shadow type */
1267 static inline u32
1268 shadow_order(unsigned int shadow_type)
1270 static const u32 type_to_order[SH_type_unused] = {
1271 0, /* SH_type_none */
1272 1, /* SH_type_l1_32_shadow */
1273 1, /* SH_type_fl1_32_shadow */
1274 2, /* SH_type_l2_32_shadow */
1275 0, /* SH_type_l1_pae_shadow */
1276 0, /* SH_type_fl1_pae_shadow */
1277 0, /* SH_type_l2_pae_shadow */
1278 0, /* SH_type_l2h_pae_shadow */
1279 0, /* SH_type_l1_64_shadow */
1280 0, /* SH_type_fl1_64_shadow */
1281 0, /* SH_type_l2_64_shadow */
1282 0, /* SH_type_l2h_64_shadow */
1283 0, /* SH_type_l3_64_shadow */
1284 0, /* SH_type_l4_64_shadow */
1285 2, /* SH_type_p2m_table */
1286 0, /* SH_type_monitor_table */
1287 0 /* SH_type_oos_snapshot */
1288 };
1289 ASSERT(shadow_type < SH_type_unused);
1290 return type_to_order[shadow_type];
1293 static inline unsigned int
1294 shadow_max_order(struct domain *d)
1296 return is_hvm_domain(d) ? SHADOW_MAX_ORDER : 0;
1299 /* Do we have a total of count pages of the requested order free? */
1300 static inline int space_is_available(
1301 struct domain *d,
1302 unsigned int order,
1303 unsigned int count)
1305 for ( ; order <= shadow_max_order(d); ++order )
1307 unsigned int n = count;
1308 const struct list_head *p;
1310 list_for_each ( p, &d->arch.paging.shadow.freelists[order] )
1311 if ( --n == 0 )
1312 return 1;
1313 count = (count + 1) >> 1;
1316 return 0;
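/* Editorial note, not part of the original source: the halving of count in
 * space_is_available() above reflects the buddy structure of the free lists.
 * A request for N chunks of order k can also be met by ceil(N/2) chunks of
 * order k+1, since each larger chunk splits into two smaller ones; e.g. a
 * need for 3 order-0 pages is satisfied by 2 free order-1 chunks. */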
1319 /* Dispatcher function: call the per-mode function that will unhook the
1320 * non-Xen mappings in this top-level shadow mfn */
1321 static void shadow_unhook_mappings(struct vcpu *v, mfn_t smfn)
1323 struct shadow_page_info *sp = mfn_to_shadow_page(smfn);
1324 switch ( sp->type )
1326 case SH_type_l2_32_shadow:
1327 SHADOW_INTERNAL_NAME(sh_unhook_32b_mappings, 2)(v,smfn);
1328 break;
1329 case SH_type_l2_pae_shadow:
1330 case SH_type_l2h_pae_shadow:
1331 SHADOW_INTERNAL_NAME(sh_unhook_pae_mappings, 3)(v,smfn);
1332 break;
1333 #if CONFIG_PAGING_LEVELS >= 4
1334 case SH_type_l4_64_shadow:
1335 SHADOW_INTERNAL_NAME(sh_unhook_64b_mappings, 4)(v,smfn);
1336 break;
1337 #endif
1338 default:
1339 SHADOW_ERROR("top-level shadow has bad type %08x\n", sp->type);
1340 BUG();
1345 /* Make sure there are at least count order-sized pages
1346 * available in the shadow page pool. */
1347 static void _shadow_prealloc(
1348 struct domain *d,
1349 unsigned int order,
1350 unsigned int count)
1352 /* Need a vcpu for calling unpins; for now, since we don't have
1353 * per-vcpu shadows, any will do */
1354 struct vcpu *v, *v2;
1355 struct list_head *l, *t;
1356 struct shadow_page_info *sp;
1357 mfn_t smfn;
1358 int i;
1360 ASSERT(order <= shadow_max_order(d));
1361 if ( space_is_available(d, order, count) ) return;
1363 v = current;
1364 if ( v->domain != d )
1365 v = d->vcpu[0];
1366 ASSERT(v != NULL); /* Shouldn't have enabled shadows if we've no vcpus */
1368 /* Stage one: walk the list of pinned pages, unpinning them */
1369 perfc_incr(shadow_prealloc_1);
1370 list_for_each_backwards_safe(l, t, &d->arch.paging.shadow.pinned_shadows)
1372 sp = list_entry(l, struct shadow_page_info, list);
1373 smfn = shadow_page_to_mfn(sp);
1375 /* Unpin this top-level shadow */
1376 sh_unpin(v, smfn);
1378 /* See if that freed up enough space */
1379 if ( space_is_available(d, order, count) ) return;
1382 /* Stage two: all shadow pages are in use in hierarchies that are
1383 * loaded in cr3 on some vcpu. Walk them, unhooking the non-Xen
1384 * mappings. */
1385 perfc_incr(shadow_prealloc_2);
1387 for_each_vcpu(d, v2)
1388 for ( i = 0 ; i < 4 ; i++ )
1390 if ( !pagetable_is_null(v2->arch.shadow_table[i]) )
1392 shadow_unhook_mappings(v,
1393 pagetable_get_mfn(v2->arch.shadow_table[i]));
1395 /* See if that freed up enough space */
1396 if ( space_is_available(d, order, count) )
1398 flush_tlb_mask(d->domain_dirty_cpumask);
1399 return;
1404 /* Nothing more we can do: all remaining shadows are of pages that
1405 * hold Xen mappings for some vcpu. This can never happen. */
1406 SHADOW_ERROR("Can't pre-allocate %u order-%u shadow pages!\n"
1407 " shadow pages total = %u, free = %u, p2m=%u\n",
1408 count, order,
1409 d->arch.paging.shadow.total_pages,
1410 d->arch.paging.shadow.free_pages,
1411 d->arch.paging.shadow.p2m_pages);
1412 BUG();
1415 /* Make sure there are at least count pages of the order according to
1416 * type available in the shadow page pool.
1417 * This must be called before any calls to shadow_alloc(). Since this
1418 * will free existing shadows to make room, it must be called early enough
1419 * to avoid freeing shadows that the caller is currently working on. */
1420 void shadow_prealloc(struct domain *d, u32 type, unsigned int count)
1422 ASSERT(type != SH_type_p2m_table);
1423 return _shadow_prealloc(d, shadow_order(type), count);
1426 /* Deliberately free all the memory we can: this will tear down all of
1427 * this domain's shadows */
1428 static void shadow_blow_tables(struct domain *d)
1430 struct list_head *l, *t;
1431 struct shadow_page_info *sp;
1432 struct vcpu *v = d->vcpu[0];
1433 mfn_t smfn;
1434 int i;
1436 ASSERT(v != NULL);
1438 /* Pass one: unpin all pinned pages */
1439 list_for_each_backwards_safe(l,t, &d->arch.paging.shadow.pinned_shadows)
1441 sp = list_entry(l, struct shadow_page_info, list);
1442 smfn = shadow_page_to_mfn(sp);
1443 sh_unpin(v, smfn);
1446 /* Second pass: unhook entries of in-use shadows */
1447 for_each_vcpu(d, v)
1448 for ( i = 0 ; i < 4 ; i++ )
1449 if ( !pagetable_is_null(v->arch.shadow_table[i]) )
1450 shadow_unhook_mappings(v,
1451 pagetable_get_mfn(v->arch.shadow_table[i]));
1453 /* Make sure everyone sees the unshadowings */
1454 flush_tlb_mask(d->domain_dirty_cpumask);
1457 void shadow_blow_tables_per_domain(struct domain *d)
1459 if ( shadow_mode_enabled(d) && d->vcpu[0] != NULL ) {
1460 shadow_lock(d);
1461 shadow_blow_tables(d);
1462 shadow_unlock(d);
1466 #ifndef NDEBUG
1467 /* Blow all shadows of all shadowed domains: this can be used to cause the
1468 * guest's pagetables to be re-shadowed if we suspect that the shadows
1469 * have somehow got out of sync */
1470 static void shadow_blow_all_tables(unsigned char c)
1472 struct domain *d;
1473 printk("'%c' pressed -> blowing all shadow tables\n", c);
1474 rcu_read_lock(&domlist_read_lock);
1475 for_each_domain(d)
1477 if ( shadow_mode_enabled(d) && d->vcpu[0] != NULL )
1479 shadow_lock(d);
1480 shadow_blow_tables(d);
1481 shadow_unlock(d);
1484 rcu_read_unlock(&domlist_read_lock);
1487 /* Register this function in the Xen console keypress table */
1488 static __init int shadow_blow_tables_keyhandler_init(void)
1490 register_keyhandler('S', shadow_blow_all_tables,"reset shadow pagetables");
1491 return 0;
1493 __initcall(shadow_blow_tables_keyhandler_init);
1494 #endif /* !NDEBUG */
1496 /* Allocate another shadow's worth of (contiguous, aligned) pages,
1497 * and fill in the type and backpointer fields of their page_infos.
1498 * Never fails to allocate. */
1499 mfn_t shadow_alloc(struct domain *d,
1500 u32 shadow_type,
1501 unsigned long backpointer)
1503 struct shadow_page_info *sp = NULL;
1504 unsigned int order = shadow_order(shadow_type);
1505 cpumask_t mask;
1506 void *p;
1507 int i;
1509 ASSERT(shadow_locked_by_me(d));
1510 if (shadow_type == SH_type_p2m_table && order > shadow_max_order(d))
1511 order = shadow_max_order(d);
1512 ASSERT(order <= shadow_max_order(d));
1513 ASSERT(shadow_type != SH_type_none);
1514 perfc_incr(shadow_alloc);
1516 /* Find smallest order which can satisfy the request. */
1517 for ( i = order; i <= SHADOW_MAX_ORDER; i++ )
1518 if ( !list_empty(&d->arch.paging.shadow.freelists[i]) )
1519 goto found;
1521 /* If we get here, we failed to allocate. This should never happen.
1522 * It means that we didn't call shadow_prealloc() correctly before
1523 * we allocated. We can't recover by calling prealloc here, because
1524 * we might free up higher-level pages that the caller is working on. */
1525 SHADOW_ERROR("Can't allocate %i shadow pages!\n", 1 << order);
1526 BUG();
1528 found:
1529 sp = list_entry(d->arch.paging.shadow.freelists[i].next,
1530 struct shadow_page_info, list);
1531 list_del(&sp->list);
1533 /* We may have to halve the chunk a number of times. */
1534 while ( i != order )
1536 i--;
1537 sp->order = i;
1538 list_add_tail(&sp->list, &d->arch.paging.shadow.freelists[i]);
1539 sp += 1 << i;
1541 d->arch.paging.shadow.free_pages -= 1 << order;
1543 /* Init page info fields and clear the pages */
1544 for ( i = 0; i < 1<<order ; i++ )
1546 /* Before we overwrite the old contents of this page,
1547 * we need to be sure that no TLB holds a pointer to it. */
1548 mask = d->domain_dirty_cpumask;
1549 tlbflush_filter(mask, sp[i].tlbflush_timestamp);
1550 if ( unlikely(!cpus_empty(mask)) )
1552 perfc_incr(shadow_alloc_tlbflush);
1553 flush_tlb_mask(mask);
1555 /* Now safe to clear the page for reuse */
1556 p = sh_map_domain_page(shadow_page_to_mfn(sp+i));
1557 ASSERT(p != NULL);
1558 clear_page(p);
1559 sh_unmap_domain_page(p);
1560 INIT_LIST_HEAD(&sp[i].list);
1561 sp[i].type = shadow_type;
1562 sp[i].pinned = 0;
1563 sp[i].count = 0;
1564 sp[i].backpointer = backpointer;
1565 sp[i].next_shadow = NULL;
1566 perfc_incr(shadow_alloc_count);
1568 return shadow_page_to_mfn(sp);
1572 /* Return some shadow pages to the pool. */
1573 void shadow_free(struct domain *d, mfn_t smfn)
1575 struct shadow_page_info *sp = mfn_to_shadow_page(smfn);
1576 u32 shadow_type;
1577 unsigned long order;
1578 unsigned long mask;
1579 int i;
1581 ASSERT(shadow_locked_by_me(d));
1582 perfc_incr(shadow_free);
1584 shadow_type = sp->type;
1585 ASSERT(shadow_type != SH_type_none);
1586 ASSERT(shadow_type != SH_type_p2m_table);
1587 order = shadow_order(shadow_type);
1589 d->arch.paging.shadow.free_pages += 1 << order;
1591 for ( i = 0; i < 1<<order; i++ )
1593 #if SHADOW_OPTIMIZATIONS & (SHOPT_WRITABLE_HEURISTIC | SHOPT_FAST_EMULATION)
1594 struct vcpu *v;
1595 for_each_vcpu(d, v)
1597 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
1598 /* No longer safe to look for a writeable mapping in this shadow */
1599 if ( v->arch.paging.shadow.last_writeable_pte_smfn == mfn_x(smfn) + i )
1600 v->arch.paging.shadow.last_writeable_pte_smfn = 0;
1601 #endif
1602 #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
1603 v->arch.paging.last_write_emul_ok = 0;
1604 #endif
1606 #endif
1607 /* Strip out the type: this is now a free shadow page */
1608 sp[i].type = 0;
1609 /* Remember the TLB timestamp so we will know whether to flush
1610 * TLBs when we reuse the page. Because the destructors leave the
1611 * contents of the pages in place, we can delay TLB flushes until
1612 * just before the allocator hands the page out again. */
1613 sp[i].tlbflush_timestamp = tlbflush_current_time();
1614 perfc_decr(shadow_alloc_count);
1617 /* Merge chunks as far as possible. */
1618 for ( ; order < shadow_max_order(d); ++order )
1620 mask = 1 << order;
1621 if ( (mfn_x(shadow_page_to_mfn(sp)) & mask) ) {
1622 /* Merge with predecessor block? */
1623 if ( ((sp-mask)->type != PGT_none) || ((sp-mask)->order != order) )
1624 break;
1625 list_del(&(sp-mask)->list);
1626 sp -= mask;
1627 } else {
1628 /* Merge with successor block? */
1629 if ( ((sp+mask)->type != PGT_none) || ((sp+mask)->order != order) )
1630 break;
1631 list_del(&(sp+mask)->list);
1635 sp->order = order;
1636 list_add_tail(&sp->list, &d->arch.paging.shadow.freelists[order]);
1639 /* Divert some memory from the pool to be used by the p2m mapping.
1640 * This action is irreversible: the p2m mapping only ever grows.
1641 * That's OK because the p2m table only exists for translated domains,
1642 * and those domains can't ever turn off shadow mode.
1643 * Also, we only ever allocate a max-order chunk, so as to preserve
1644 * the invariant that shadow_prealloc() always works.
1645 * Returns 0 iff it can't get a chunk (the caller should then
1646 * free up some pages in domheap and call sh_set_allocation);
1647 * returns non-zero on success.
1648 */
1649 static int
1650 sh_alloc_p2m_pages(struct domain *d)
1652 struct page_info *pg;
1653 u32 i;
1654 unsigned int order = shadow_max_order(d);
1656 ASSERT(shadow_locked_by_me(d));
1658 if ( d->arch.paging.shadow.total_pages
1659 < (shadow_min_acceptable_pages(d) + (1 << order)) )
1660 return 0; /* Not enough shadow memory: need to increase it first */
1662 pg = mfn_to_page(shadow_alloc(d, SH_type_p2m_table, 0));
1663 d->arch.paging.shadow.p2m_pages += (1 << order);
1664 d->arch.paging.shadow.total_pages -= (1 << order);
1665 for (i = 0; i < (1U << order); i++)
1667 /* Unlike shadow pages, mark p2m pages as owned by the domain.
1668 * Marking the domain as the owner would normally allow the guest to
1669 * create mappings of these pages, but these p2m pages will never be
1670 * in the domain's guest-physical address space, and so that is not
1671 * believed to be a concern.
1672 */
1673 page_set_owner(&pg[i], d);
1674 pg[i].count_info = 1;
1675 list_add_tail(&pg[i].list, &d->arch.paging.shadow.p2m_freelist);
1677 return 1;
1680 // Returns 0 if no memory is available...
1681 static struct page_info *
1682 shadow_alloc_p2m_page(struct domain *d)
1684 struct list_head *entry;
1685 struct page_info *pg;
1686 mfn_t mfn;
1687 void *p;
1689 shadow_lock(d);
1691 if ( list_empty(&d->arch.paging.shadow.p2m_freelist) &&
1692 !sh_alloc_p2m_pages(d) )
1694 shadow_unlock(d);
1695 return NULL;
1697 entry = d->arch.paging.shadow.p2m_freelist.next;
1698 list_del(entry);
1700 shadow_unlock(d);
1702 pg = list_entry(entry, struct page_info, list);
1703 mfn = page_to_mfn(pg);
1704 p = sh_map_domain_page(mfn);
1705 clear_page(p);
1706 sh_unmap_domain_page(p);
1708 return pg;
1711 static void
1712 shadow_free_p2m_page(struct domain *d, struct page_info *pg)
1714 ASSERT(page_get_owner(pg) == d);
1715 /* Should have just the one ref we gave it in alloc_p2m_page() */
1716 if ( (pg->count_info & PGC_count_mask) != 1 )
1718 SHADOW_ERROR("Odd p2m page count c=%#x t=%"PRtype_info"\n",
1719 pg->count_info, pg->u.inuse.type_info);
1721 pg->count_info = 0;
1722 /* Free should not decrement domain's total allocation, since
1723 * these pages were allocated without an owner. */
1724 page_set_owner(pg, NULL);
1725 free_domheap_pages(pg, 0);
1726 d->arch.paging.shadow.p2m_pages--;
1727 perfc_decr(shadow_alloc_count);
1730 #if CONFIG_PAGING_LEVELS == 3
1731 static void p2m_install_entry_in_monitors(struct domain *d,
1732 l3_pgentry_t *l3e)
1733 /* Special case, only used for external-mode domains on PAE hosts:
1734 * update the mapping of the p2m table. Once again, this is trivial in
1735 * other paging modes (one top-level entry points to the top-level p2m,
1736 * no maintenance needed), but PAE makes life difficult by needing a
1737 * copy of the eight l3es of the p2m table in eight l2h slots in the
1738 * monitor table. This function makes fresh copies when a p2m l3e
1739 * changes. */
1741 l2_pgentry_t *ml2e;
1742 struct vcpu *v;
1743 unsigned int index;
1745 index = ((unsigned long)l3e & ~PAGE_MASK) / sizeof(l3_pgentry_t);
1746 ASSERT(index < MACHPHYS_MBYTES>>1);
1748 for_each_vcpu(d, v)
1750 if ( pagetable_get_pfn(v->arch.monitor_table) == 0 )
1751 continue;
1752 ASSERT(shadow_mode_external(v->domain));
1754 SHADOW_DEBUG(P2M, "d=%u v=%u index=%u mfn=%#lx\n",
1755 d->domain_id, v->vcpu_id, index, l3e_get_pfn(*l3e));
1757 if ( v == current ) /* OK to use linear map of monitor_table */
1758 ml2e = __linear_l2_table + l2_linear_offset(RO_MPT_VIRT_START);
1759 else
1761 l3_pgentry_t *ml3e;
1762 ml3e = sh_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
1763 ASSERT(l3e_get_flags(ml3e[3]) & _PAGE_PRESENT);
1764 ml2e = sh_map_domain_page(_mfn(l3e_get_pfn(ml3e[3])));
1765 ml2e += l2_table_offset(RO_MPT_VIRT_START);
1766 sh_unmap_domain_page(ml3e);
1768 ml2e[index] = l2e_from_pfn(l3e_get_pfn(*l3e), __PAGE_HYPERVISOR);
1769 if ( v != current )
1770 sh_unmap_domain_page(ml2e);
1773 #endif
1775 /* Set the pool of shadow pages to the required number of pages.
1776 * Input will be rounded up to at least shadow_min_acceptable_pages(),
1777 * plus space for the p2m table.
1778 * Returns 0 for success, non-zero for failure. */
1779 static unsigned int sh_set_allocation(struct domain *d,
1780 unsigned int pages,
1781 int *preempted)
1783 struct shadow_page_info *sp;
1784 unsigned int lower_bound;
1785 unsigned int j, order = shadow_max_order(d);
1787 ASSERT(shadow_locked_by_me(d));
1789 /* Don't allocate less than the minimum acceptable, plus one page per
1790 * megabyte of RAM (for the p2m table) */
1791 lower_bound = shadow_min_acceptable_pages(d) + (d->tot_pages / 256);
1792 if ( pages > 0 && pages < lower_bound )
1793 pages = lower_bound;
1794 /* Round up to largest block size */
1795 pages = (pages + ((1<<SHADOW_MAX_ORDER)-1)) & ~((1<<SHADOW_MAX_ORDER)-1);
1797 SHADOW_PRINTK("current %i target %i\n",
1798 d->arch.paging.shadow.total_pages, pages);
1800 while ( d->arch.paging.shadow.total_pages != pages )
1802 if ( d->arch.paging.shadow.total_pages < pages )
1804 /* Need to allocate more memory from domheap */
1805 sp = (struct shadow_page_info *)
1806 alloc_domheap_pages(NULL, order, MEMF_node(domain_to_node(d)));
1807 if ( sp == NULL )
1809 SHADOW_PRINTK("failed to allocate shadow pages.\n");
1810 return -ENOMEM;
1812 d->arch.paging.shadow.free_pages += 1 << order;
1813 d->arch.paging.shadow.total_pages += 1 << order;
1814 for ( j = 0; j < 1U << order; j++ )
1816 sp[j].type = 0;
1817 sp[j].pinned = 0;
1818 sp[j].count = 0;
1819 sp[j].mbz = 0;
1820 sp[j].tlbflush_timestamp = 0; /* Not in any TLB */
1822 sp->order = order;
1823 list_add_tail(&sp->list, &d->arch.paging.shadow.freelists[order]);
1825 else if ( d->arch.paging.shadow.total_pages > pages )
1827 /* Need to return memory to domheap */
1828 _shadow_prealloc(d, order, 1);
1829 ASSERT(!list_empty(&d->arch.paging.shadow.freelists[order]));
1830 sp = list_entry(d->arch.paging.shadow.freelists[order].next,
1831 struct shadow_page_info, list);
1832 list_del(&sp->list);
1833 #if defined(__x86_64__)
1834 /*
1835 * Re-instate lock field which we overwrite with shadow_page_info.
1836 * This was safe, since the lock is only used on guest pages.
1837 */
1838 for ( j = 0; j < 1U << order; j++ )
1839 spin_lock_init(&((struct page_info *)sp)[j].lock);
1840 #endif
1841 d->arch.paging.shadow.free_pages -= 1 << order;
1842 d->arch.paging.shadow.total_pages -= 1 << order;
1843 free_domheap_pages((struct page_info *)sp, order);
1846 /* Check to see if we need to yield and try again */
1847 if ( preempted && hypercall_preempt_check() )
1849 *preempted = 1;
1850 return 0;
1854 return 0;
1857 /* Return the size of the shadow pool, rounded up to the nearest MB */
1858 static unsigned int shadow_get_allocation(struct domain *d)
1860 unsigned int pg = d->arch.paging.shadow.total_pages;
1861 return ((pg >> (20 - PAGE_SHIFT))
1862 + ((pg & ((1 << (20 - PAGE_SHIFT)) - 1)) ? 1 : 0));
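/* Example (illustrative): with 4kB pages, 20 - PAGE_SHIFT == 8, i.e. 256
 * pages per MB.  A pool of 300 pages reports (300 >> 8) == 1, plus 1 for the
 * 44-page remainder, i.e. 2MB. */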
1865 /**************************************************************************/
1866 /* Hash table for storing the guest->shadow mappings.
1867 * The table itself is an array of pointers to shadows; the shadows are then
1868 * threaded on a singly-linked list of shadows with the same hash value */
1870 #define SHADOW_HASH_BUCKETS 251
1871 /* Other possibly useful primes are 509, 1021, 2039, 4093, 8191, 16381 */
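/* Layout sketch (illustrative, not part of the build): hash_table is an array
 * of SHADOW_HASH_BUCKETS list heads, and colliding shadows are chained through
 * their next_shadow fields.  A plain walk of one bucket looks like:
 *
 *     struct shadow_page_info *sp;
 *     for ( sp = d->arch.paging.shadow.hash_table[key]; sp; sp = sp->next_shadow )
 *         if ( sp->backpointer == n && sp->type == t )
 *             return shadow_page_to_mfn(sp);
 *
 * shadow_hash_lookup() below adds pull-to-front reordering on top of this. */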
1873 /* Hash function that takes a gfn or mfn, plus another byte of type info */
1874 typedef u32 key_t;
1875 static inline key_t sh_hash(unsigned long n, unsigned int t)
1877 unsigned char *p = (unsigned char *)&n;
1878 key_t k = t;
1879 int i;
1880 for ( i = 0; i < sizeof(n) ; i++ ) k = (u32)p[i] + (k<<6) + (k<<16) - k;
1881 return k % SHADOW_HASH_BUCKETS;
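/* Equivalent closed form (sketch): the loop above is the classic sdbm string
 * hash, folding each byte of n into a key seeded with the type:
 *
 *     key_t k = t;
 *     for ( i = 0; i < sizeof(n); i++ )
 *         k = (u32)p[i] + 65599u * k;
 *     return k % SHADOW_HASH_BUCKETS;
 *
 * since (k<<6) + (k<<16) - k == 65599*k in 32-bit arithmetic. */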
1884 #if SHADOW_AUDIT & (SHADOW_AUDIT_HASH|SHADOW_AUDIT_HASH_FULL)
1886 /* Before we get to the mechanism, define a pair of audit functions
1887 * that sanity-check the contents of the hash table. */
1888 static void sh_hash_audit_bucket(struct domain *d, int bucket)
1889 /* Audit one bucket of the hash table */
1891 struct shadow_page_info *sp, *x;
1893 if ( !(SHADOW_AUDIT_ENABLE) )
1894 return;
1896 sp = d->arch.paging.shadow.hash_table[bucket];
1897 while ( sp )
1899 /* Not a shadow? */
1900 BUG_ON( sp->mbz != 0 );
1901 /* Bogus type? */
1902 BUG_ON( sp->type == 0 );
1903 BUG_ON( sp->type > SH_type_max_shadow );
1904 /* Wrong bucket? */
1905 BUG_ON( sh_hash(sp->backpointer, sp->type) != bucket );
1906 /* Duplicate entry? */
1907 for ( x = sp->next_shadow; x; x = x->next_shadow )
1908 BUG_ON( x->backpointer == sp->backpointer && x->type == sp->type );
1909 /* Follow the backpointer to the guest pagetable */
1910 if ( sp->type != SH_type_fl1_32_shadow
1911 && sp->type != SH_type_fl1_pae_shadow
1912 && sp->type != SH_type_fl1_64_shadow )
1914 struct page_info *gpg = mfn_to_page(_mfn(sp->backpointer));
1915 /* Bad shadow flags on guest page? */
1916 BUG_ON( !(gpg->shadow_flags & (1<<sp->type)) );
1917 /* Bad type count on guest page? */
1918 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
1919 if ( sp->type == SH_type_l1_32_shadow
1920 || sp->type == SH_type_l1_pae_shadow
1921 || sp->type == SH_type_l1_64_shadow )
1923 if ( (gpg->u.inuse.type_info & PGT_type_mask) == PGT_writable_page
1924 && (gpg->u.inuse.type_info & PGT_count_mask) != 0 )
1926 if ( !page_is_out_of_sync(gpg) )
1928 SHADOW_ERROR("MFN %#lx shadowed (by %#"PRI_mfn")"
1929 " and not OOS but has typecount %#lx\n",
1930 sp->backpointer,
1931 mfn_x(shadow_page_to_mfn(sp)),
1932 gpg->u.inuse.type_info);
1933 BUG();
1937 else /* Not an l1 */
1938 #endif
1939 if ( (gpg->u.inuse.type_info & PGT_type_mask) == PGT_writable_page
1940 && (gpg->u.inuse.type_info & PGT_count_mask) != 0 )
1942 SHADOW_ERROR("MFN %#lx shadowed (by %#"PRI_mfn")"
1943 " but has typecount %#lx\n",
1944 sp->backpointer, mfn_x(shadow_page_to_mfn(sp)),
1945 gpg->u.inuse.type_info);
1946 BUG();
1949 /* That entry was OK; on we go */
1950 sp = sp->next_shadow;
1954 #else
1955 #define sh_hash_audit_bucket(_d, _b) do {} while(0)
1956 #endif /* Hashtable bucket audit */
1959 #if SHADOW_AUDIT & SHADOW_AUDIT_HASH_FULL
1961 static void sh_hash_audit(struct domain *d)
1962 /* Full audit: audit every bucket in the table */
1964 int i;
1966 if ( !(SHADOW_AUDIT_ENABLE) )
1967 return;
1969 for ( i = 0; i < SHADOW_HASH_BUCKETS; i++ )
1971 sh_hash_audit_bucket(d, i);
1975 #else
1976 #define sh_hash_audit(_d) do {} while(0)
1977 #endif /* Hashtable bucket audit */
1979 /* Allocate and initialise the table itself.
1980 * Returns 0 for success, 1 for error. */
1981 static int shadow_hash_alloc(struct domain *d)
1983 struct shadow_page_info **table;
1985 ASSERT(shadow_locked_by_me(d));
1986 ASSERT(!d->arch.paging.shadow.hash_table);
1988 table = xmalloc_array(struct shadow_page_info *, SHADOW_HASH_BUCKETS);
1989 if ( !table ) return 1;
1990 memset(table, 0,
1991 SHADOW_HASH_BUCKETS * sizeof (struct shadow_page_info *));
1992 d->arch.paging.shadow.hash_table = table;
1993 return 0;
1996 /* Tear down the hash table and return all memory to Xen.
1997 * This function does not care whether the table is populated. */
1998 static void shadow_hash_teardown(struct domain *d)
2000 ASSERT(shadow_locked_by_me(d));
2001 ASSERT(d->arch.paging.shadow.hash_table);
2003 xfree(d->arch.paging.shadow.hash_table);
2004 d->arch.paging.shadow.hash_table = NULL;
2008 mfn_t shadow_hash_lookup(struct vcpu *v, unsigned long n, unsigned int t)
2009 /* Find an entry in the hash table. Returns the MFN of the shadow,
2010 * or INVALID_MFN if it doesn't exist */
2012 struct domain *d = v->domain;
2013 struct shadow_page_info *sp, *prev;
2014 key_t key;
2016 ASSERT(shadow_locked_by_me(d));
2017 ASSERT(d->arch.paging.shadow.hash_table);
2018 ASSERT(t);
2020 sh_hash_audit(d);
2022 perfc_incr(shadow_hash_lookups);
2023 key = sh_hash(n, t);
2024 sh_hash_audit_bucket(d, key);
2026 sp = d->arch.paging.shadow.hash_table[key];
2027 prev = NULL;
2028 while(sp)
2030 if ( sp->backpointer == n && sp->type == t )
2032 /* Pull-to-front if 'sp' isn't already the head item */
2033 if ( unlikely(sp != d->arch.paging.shadow.hash_table[key]) )
2035 if ( unlikely(d->arch.paging.shadow.hash_walking != 0) )
2036 /* Can't reorder: someone is walking the hash chains */
2037 return shadow_page_to_mfn(sp);
2038 else
2040 ASSERT(prev);
2041 /* Delete sp from the list */
2042 prev->next_shadow = sp->next_shadow;
2043 /* Re-insert it at the head of the list */
2044 sp->next_shadow = d->arch.paging.shadow.hash_table[key];
2045 d->arch.paging.shadow.hash_table[key] = sp;
2048 else
2050 perfc_incr(shadow_hash_lookup_head);
2052 return shadow_page_to_mfn(sp);
2054 prev = sp;
2055 sp = sp->next_shadow;
2058 perfc_incr(shadow_hash_lookup_miss);
2059 return _mfn(INVALID_MFN);
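/* Usage sketch (illustrative, not part of the build): the per-mode code in
 * multi.c drives these three functions roughly as follows when it needs the
 * shadow of a guest l1 page (SH_type_l1_pae_shadow is just an example type):
 *
 *     smfn = shadow_hash_lookup(v, mfn_x(gmfn), SH_type_l1_pae_shadow);
 *     if ( !mfn_valid(smfn) )
 *     {
 *         smfn = shadow_alloc(d, SH_type_l1_pae_shadow, mfn_x(gmfn));
 *         shadow_hash_insert(v, mfn_x(gmfn), SH_type_l1_pae_shadow, smfn);
 *     }
 *     ... use the shadow ...
 *     shadow_hash_delete(v, mfn_x(gmfn), SH_type_l1_pae_shadow, smfn);
 */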
2062 void shadow_hash_insert(struct vcpu *v, unsigned long n, unsigned int t,
2063 mfn_t smfn)
2064 /* Put a mapping (n,t)->smfn into the hash table */
2066 struct domain *d = v->domain;
2067 struct shadow_page_info *sp;
2068 key_t key;
2070 ASSERT(shadow_locked_by_me(d));
2071 ASSERT(d->arch.paging.shadow.hash_table);
2072 ASSERT(t);
2074 sh_hash_audit(d);
2076 perfc_incr(shadow_hash_inserts);
2077 key = sh_hash(n, t);
2078 sh_hash_audit_bucket(d, key);
2080 /* Insert this shadow at the top of the bucket */
2081 sp = mfn_to_shadow_page(smfn);
2082 sp->next_shadow = d->arch.paging.shadow.hash_table[key];
2083 d->arch.paging.shadow.hash_table[key] = sp;
2085 sh_hash_audit_bucket(d, key);
2088 void shadow_hash_delete(struct vcpu *v, unsigned long n, unsigned int t,
2089 mfn_t smfn)
2090 /* Excise the mapping (n,t)->smfn from the hash table */
2092 struct domain *d = v->domain;
2093 struct shadow_page_info *sp, *x;
2094 key_t key;
2096 ASSERT(shadow_locked_by_me(d));
2097 ASSERT(d->arch.paging.shadow.hash_table);
2098 ASSERT(t);
2100 sh_hash_audit(d);
2102 perfc_incr(shadow_hash_deletes);
2103 key = sh_hash(n, t);
2104 sh_hash_audit_bucket(d, key);
2106 sp = mfn_to_shadow_page(smfn);
2107 if ( d->arch.paging.shadow.hash_table[key] == sp )
2108 /* Easy case: we're deleting the head item. */
2109 d->arch.paging.shadow.hash_table[key] = sp->next_shadow;
2110 else
2112 /* Need to search for the one we want */
2113 x = d->arch.paging.shadow.hash_table[key];
2114 while ( 1 )
2116 ASSERT(x); /* We can't have hit the end, since our target is
2117 * still in the chain somewhere... */
2118 if ( x->next_shadow == sp )
2120 x->next_shadow = sp->next_shadow;
2121 break;
2123 x = x->next_shadow;
2126 sp->next_shadow = NULL;
2128 sh_hash_audit_bucket(d, key);
2131 typedef int (*hash_callback_t)(struct vcpu *v, mfn_t smfn, mfn_t other_mfn);
2133 static void hash_foreach(struct vcpu *v,
2134 unsigned int callback_mask,
2135 hash_callback_t callbacks[],
2136 mfn_t callback_mfn)
2137 /* Walk the hash table looking at the types of the entries and
2138 * calling the appropriate callback function for each entry.
2139 * The mask determines which shadow types we call back for, and the array
2140 * of callbacks tells us which function to call.
2141 * Any callback may return non-zero to let us skip the rest of the scan.
2143 * WARNING: Callbacks MUST NOT add or remove hash entries unless they
2144 * then return non-zero to terminate the scan. */
2146 int i, done = 0;
2147 struct domain *d = v->domain;
2148 struct shadow_page_info *x;
2150 /* Say we're here, to stop hash-lookups reordering the chains */
2151 ASSERT(shadow_locked_by_me(d));
2152 ASSERT(d->arch.paging.shadow.hash_walking == 0);
2153 d->arch.paging.shadow.hash_walking = 1;
2155 for ( i = 0; i < SHADOW_HASH_BUCKETS; i++ )
2157 /* WARNING: This is not safe against changes to the hash table.
2158 * The callback *must* return non-zero if it has inserted or
2159 * deleted anything from the hash (lookups are OK, though). */
2160 for ( x = d->arch.paging.shadow.hash_table[i]; x; x = x->next_shadow )
2162 if ( callback_mask & (1 << x->type) )
2164 ASSERT(x->type <= 15);
2165 ASSERT(callbacks[x->type] != NULL);
2166 done = callbacks[x->type](v, shadow_page_to_mfn(x),
2167 callback_mfn);
2168 if ( done ) break;
2171 if ( done ) break;
2173 d->arch.paging.shadow.hash_walking = 0;
2177 /**************************************************************************/
2178 /* Destroy a shadow page: simple dispatcher to call the per-type destructor
2179 * which will decrement refcounts appropriately and return memory to the
2180 * free pool. */
2182 void sh_destroy_shadow(struct vcpu *v, mfn_t smfn)
2184 struct shadow_page_info *sp = mfn_to_shadow_page(smfn);
2185 unsigned int t = sp->type;
2188 SHADOW_PRINTK("smfn=%#lx\n", mfn_x(smfn));
2190 /* Double-check, if we can, that the shadowed page belongs to this
2191 * domain, (by following the back-pointer). */
2192 ASSERT(t == SH_type_fl1_32_shadow ||
2193 t == SH_type_fl1_pae_shadow ||
2194 t == SH_type_fl1_64_shadow ||
2195 t == SH_type_monitor_table ||
2196 (is_pv_32on64_vcpu(v) && t == SH_type_l4_64_shadow) ||
2197 (page_get_owner(mfn_to_page(_mfn(sp->backpointer)))
2198 == v->domain));
2200 /* The down-shifts here are so that the switch statement is on nice
2201 * small numbers that the compiler will enjoy */
2202 switch ( t )
2204 case SH_type_l1_32_shadow:
2205 case SH_type_fl1_32_shadow:
2206 SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 2)(v, smfn);
2207 break;
2208 case SH_type_l2_32_shadow:
2209 SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 2)(v, smfn);
2210 break;
2212 case SH_type_l1_pae_shadow:
2213 case SH_type_fl1_pae_shadow:
2214 SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 3)(v, smfn);
2215 break;
2216 case SH_type_l2_pae_shadow:
2217 case SH_type_l2h_pae_shadow:
2218 SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 3)(v, smfn);
2219 break;
2221 #if CONFIG_PAGING_LEVELS >= 4
2222 case SH_type_l1_64_shadow:
2223 case SH_type_fl1_64_shadow:
2224 SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 4)(v, smfn);
2225 break;
2226 case SH_type_l2h_64_shadow:
2227 ASSERT(is_pv_32on64_vcpu(v));
2228 /* Fall through... */
2229 case SH_type_l2_64_shadow:
2230 SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 4)(v, smfn);
2231 break;
2232 case SH_type_l3_64_shadow:
2233 SHADOW_INTERNAL_NAME(sh_destroy_l3_shadow, 4)(v, smfn);
2234 break;
2235 case SH_type_l4_64_shadow:
2236 SHADOW_INTERNAL_NAME(sh_destroy_l4_shadow, 4)(v, smfn);
2237 break;
2238 #endif
2239 default:
2240 SHADOW_ERROR("tried to destroy shadow of bad type %08lx\n",
2241 (unsigned long)t);
2242 BUG();
2246 /**************************************************************************/
2247 /* Remove all writeable mappings of a guest frame from the shadow tables
2248 * Returns non-zero if we need to flush TLBs.
2249 * level and fault_addr describe how we found this to be a pagetable;
2250 * level==0 means we have some other reason for revoking write access.
2251 * If level==0 we are allowed to fail, returning -1. */
2253 int sh_remove_write_access(struct vcpu *v, mfn_t gmfn,
2254 unsigned int level,
2255 unsigned long fault_addr)
2257 /* Dispatch table for getting per-type functions */
2258 static hash_callback_t callbacks[SH_type_unused] = {
2259 NULL, /* none */
2260 SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1, 2), /* l1_32 */
2261 SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1, 2), /* fl1_32 */
2262 NULL, /* l2_32 */
2263 SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1, 3), /* l1_pae */
2264 SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1, 3), /* fl1_pae */
2265 NULL, /* l2_pae */
2266 NULL, /* l2h_pae */
2267 #if CONFIG_PAGING_LEVELS >= 4
2268 SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1, 4), /* l1_64 */
2269 SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1, 4), /* fl1_64 */
2270 #else
2271 NULL, /* l1_64 */
2272 NULL, /* fl1_64 */
2273 #endif
2274 NULL, /* l2_64 */
2275 NULL, /* l2h_64 */
2276 NULL, /* l3_64 */
2277 NULL, /* l4_64 */
2278 NULL, /* p2m */
2279 NULL /* unused */
2280 };
2282 static unsigned int callback_mask =
2283 1 << SH_type_l1_32_shadow
2284 | 1 << SH_type_fl1_32_shadow
2285 | 1 << SH_type_l1_pae_shadow
2286 | 1 << SH_type_fl1_pae_shadow
2287 | 1 << SH_type_l1_64_shadow
2288 | 1 << SH_type_fl1_64_shadow
2290 struct page_info *pg = mfn_to_page(gmfn);
2292 ASSERT(shadow_locked_by_me(v->domain));
2294 /* Only remove writable mappings if we are doing shadow refcounts.
2295 * In guest refcounting, we trust Xen to already be restricting
2296 * all the writes to the guest page tables, so we do not need to
2297 * do more. */
2298 if ( !shadow_mode_refcounts(v->domain) )
2299 return 0;
2301 /* Early exit if it's already a pagetable, or otherwise not writeable */
2302 if ( (sh_mfn_is_a_page_table(gmfn)
2303 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
2304 /* Unless they've been allowed to go out of sync with their shadows */
2305 && !mfn_oos_may_write(gmfn)
2306 #endif
2308 || (pg->u.inuse.type_info & PGT_count_mask) == 0 )
2309 return 0;
2311 perfc_incr(shadow_writeable);
2313 /* If this isn't a "normal" writeable page, the domain is trying to
2314 * put pagetables in special memory of some kind. We can't allow that. */
2315 if ( (pg->u.inuse.type_info & PGT_type_mask) != PGT_writable_page )
2317 SHADOW_ERROR("can't remove write access to mfn %lx, type_info is %"
2318 PRtype_info "\n",
2319 mfn_x(gmfn), mfn_to_page(gmfn)->u.inuse.type_info);
2320 domain_crash(v->domain);
2323 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
2324 if ( v == current )
2326 unsigned long gfn;
2327 /* Heuristic: there is likely to be only one writeable mapping,
2328 * and that mapping is likely to be in the current pagetable,
2329 * in the guest's linear map (on non-HIGHPTE linux and windows) */
2331 #define GUESS(_a, _h) do { \
2332 if ( v->arch.paging.mode->shadow.guess_wrmap(v, (_a), gmfn) ) \
2333 perfc_incr(shadow_writeable_h_ ## _h); \
2334 if ( (pg->u.inuse.type_info & PGT_count_mask) == 0 ) \
2335 return 1; \
2336 } while (0)
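/* Worked example for the 2-level case below (illustrative): with the guest's
 * pagetable self-map at 0xC0000000, the guest PTE that maps linear address va
 * sits at 0xC0000000 + (va >> 12) * 4 == 0xC0000000 + (va >> 10).  So a fault
 * at va 0x00401000 makes GUESS() probe 0xC0001004 for the single writeable
 * l1e of gmfn. */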
2338 if ( level == 0 && fault_addr )
2339 GUESS(fault_addr, 6);
2341 if ( v->arch.paging.mode->guest_levels == 2 )
2343 if ( level == 1 )
2344 /* 32bit non-PAE w2k3: linear map at 0xC0000000 */
2345 GUESS(0xC0000000UL + (fault_addr >> 10), 1);
2347 /* Linux lowmem: first 896MB is mapped 1-to-1 above 0xC0000000 */
2348 if ((gfn = mfn_to_gfn(v->domain, gmfn)) < 0x38000 )
2349 GUESS(0xC0000000UL + (gfn << PAGE_SHIFT), 4);
2352 else if ( v->arch.paging.mode->guest_levels == 3 )
2354 /* 32bit PAE w2k3: linear map at 0xC0000000 */
2355 switch ( level )
2357 case 1: GUESS(0xC0000000UL + (fault_addr >> 9), 2); break;
2358 case 2: GUESS(0xC0600000UL + (fault_addr >> 18), 2); break;
2361 /* Linux lowmem: first 896MB is mapped 1-to-1 above 0xC0000000 */
2362 if ((gfn = mfn_to_gfn(v->domain, gmfn)) < 0x38000 )
2363 GUESS(0xC0000000UL + (gfn << PAGE_SHIFT), 4);
2365 #if CONFIG_PAGING_LEVELS >= 4
2366 else if ( v->arch.paging.mode->guest_levels == 4 )
2368 /* 64bit w2k3: linear map at 0xfffff68000000000 */
2369 switch ( level )
2371 case 1: GUESS(0xfffff68000000000UL
2372 + ((fault_addr & VADDR_MASK) >> 9), 3); break;
2373 case 2: GUESS(0xfffff6fb40000000UL
2374 + ((fault_addr & VADDR_MASK) >> 18), 3); break;
2375 case 3: GUESS(0xfffff6fb7da00000UL
2376 + ((fault_addr & VADDR_MASK) >> 27), 3); break;
2379 /* 64bit Linux direct map at 0xffff810000000000; older kernels
2380 * had it at 0x0000010000000000UL */
2381 gfn = mfn_to_gfn(v->domain, gmfn);
2382 GUESS(0xffff810000000000UL + (gfn << PAGE_SHIFT), 4);
2383 GUESS(0x0000010000000000UL + (gfn << PAGE_SHIFT), 4);
2384 /*
2385 * 64bit Solaris kernel page map at
2386 * kpm_vbase; 0xfffffe0000000000UL
2387 */
2388 GUESS(0xfffffe0000000000UL + (gfn << PAGE_SHIFT), 4);
2390 #endif /* CONFIG_PAGING_LEVELS >= 4 */
2392 #undef GUESS
2395 if ( (pg->u.inuse.type_info & PGT_count_mask) == 0 )
2396 return 1;
2398 /* Second heuristic: on HIGHPTE linux, there are two particular PTEs
2399 * (entries in the fixmap) where linux maps its pagetables. Since
2400 * we expect to hit them most of the time, we start the search for
2401 * the writeable mapping by looking at the same MFN where the last
2402 * brute-force search succeeded. */
2404 if ( v->arch.paging.shadow.last_writeable_pte_smfn != 0 )
2406 unsigned long old_count = (pg->u.inuse.type_info & PGT_count_mask);
2407 mfn_t last_smfn = _mfn(v->arch.paging.shadow.last_writeable_pte_smfn);
2408 int shtype = mfn_to_shadow_page(last_smfn)->type;
2410 if ( callbacks[shtype] )
2411 callbacks[shtype](v, last_smfn, gmfn);
2413 if ( (pg->u.inuse.type_info & PGT_count_mask) != old_count )
2414 perfc_incr(shadow_writeable_h_5);
2417 if ( (pg->u.inuse.type_info & PGT_count_mask) == 0 )
2418 return 1;
2420 #endif /* SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC */
2422 /* Brute-force search of all the shadows, by walking the hash */
2423 if ( level == 0 )
2424 perfc_incr(shadow_writeable_bf_1);
2425 else
2426 perfc_incr(shadow_writeable_bf);
2427 hash_foreach(v, callback_mask, callbacks, gmfn);
2429 /* If that didn't catch the mapping, then there's some non-pagetable
2430 * mapping -- ioreq page, grant mapping, &c. */
2431 if ( (mfn_to_page(gmfn)->u.inuse.type_info & PGT_count_mask) != 0 )
2433 if ( level == 0 )
2434 return -1;
2436 SHADOW_ERROR("can't remove write access to mfn %lx: guest has "
2437 "%lu special-use mappings of it\n", mfn_x(gmfn),
2438 (mfn_to_page(gmfn)->u.inuse.type_info&PGT_count_mask));
2439 domain_crash(v->domain);
2442 /* We killed at least one writeable mapping, so must flush TLBs. */
2443 return 1;
2446 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
2447 int sh_remove_write_access_from_sl1p(struct vcpu *v, mfn_t gmfn,
2448 mfn_t smfn, unsigned long off)
2450 struct shadow_page_info *sp = mfn_to_shadow_page(smfn);
2452 ASSERT(mfn_valid(smfn));
2453 ASSERT(mfn_valid(gmfn));
2455 if ( sp->type == SH_type_l1_32_shadow )
2457 return SHADOW_INTERNAL_NAME(sh_rm_write_access_from_sl1p,2)
2458 (v, gmfn, smfn, off);
2460 #if CONFIG_PAGING_LEVELS >= 3
2461 else if ( sp->type == SH_type_l1_pae_shadow )
2462 return SHADOW_INTERNAL_NAME(sh_rm_write_access_from_sl1p,3)
2463 (v, gmfn, smfn, off);
2464 #if CONFIG_PAGING_LEVELS >= 4
2465 else if ( sp->type == SH_type_l1_64_shadow )
2466 return SHADOW_INTERNAL_NAME(sh_rm_write_access_from_sl1p,4)
2467 (v, gmfn, smfn, off);
2468 #endif
2469 #endif
2471 return 0;
2473 #endif
2475 /**************************************************************************/
2476 /* Remove all mappings of a guest frame from the shadow tables.
2477 * Returns non-zero if we need to flush TLBs. */
2479 int sh_remove_all_mappings(struct vcpu *v, mfn_t gmfn)
2481 struct page_info *page = mfn_to_page(gmfn);
2482 int expected_count, do_locking;
2484 /* Dispatch table for getting per-type functions */
2485 static hash_callback_t callbacks[SH_type_unused] = {
2486 NULL, /* none */
2487 SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1, 2), /* l1_32 */
2488 SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1, 2), /* fl1_32 */
2489 NULL, /* l2_32 */
2490 SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1, 3), /* l1_pae */
2491 SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1, 3), /* fl1_pae */
2492 NULL, /* l2_pae */
2493 NULL, /* l2h_pae */
2494 #if CONFIG_PAGING_LEVELS >= 4
2495 SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1, 4), /* l1_64 */
2496 SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1, 4), /* fl1_64 */
2497 #else
2498 NULL, /* l1_64 */
2499 NULL, /* fl1_64 */
2500 #endif
2501 NULL, /* l2_64 */
2502 NULL, /* l2h_64 */
2503 NULL, /* l3_64 */
2504 NULL, /* l4_64 */
2505 NULL, /* p2m */
2506 NULL /* unused */
2507 };
2509 static unsigned int callback_mask =
2510 1 << SH_type_l1_32_shadow
2511 | 1 << SH_type_fl1_32_shadow
2512 | 1 << SH_type_l1_pae_shadow
2513 | 1 << SH_type_fl1_pae_shadow
2514 | 1 << SH_type_l1_64_shadow
2515 | 1 << SH_type_fl1_64_shadow
2518 perfc_incr(shadow_mappings);
2519 if ( (page->count_info & PGC_count_mask) == 0 )
2520 return 0;
2522 /* Although this is an externally visible function, we do not know
2523 * whether the shadow lock will be held when it is called (since it
2524 * can be called via put_page_type when we clear a shadow l1e).
2525 * If the lock isn't held, take it for the duration of the call. */
2526 do_locking = !shadow_locked_by_me(v->domain);
2527 if ( do_locking ) shadow_lock(v->domain);
2529 /* XXX TODO:
2530 * Heuristics for finding the (probably) single mapping of this gmfn */
2532 /* Brute-force search of all the shadows, by walking the hash */
2533 perfc_incr(shadow_mappings_bf);
2534 hash_foreach(v, callback_mask, callbacks, gmfn);
2536 /* If that didn't catch the mapping, something is very wrong */
2537 expected_count = (page->count_info & PGC_allocated) ? 1 : 0;
2538 if ( (page->count_info & PGC_count_mask) != expected_count )
2540 /* Don't complain if we're in HVM and there are some extra mappings:
2541 * The qemu helper process has an untyped mapping of this dom's RAM
2542 * and the HVM restore program takes another. */
2543 if ( !(shadow_mode_external(v->domain)
2544 && (page->count_info & PGC_count_mask) <= 3
2545 && (page->u.inuse.type_info & PGT_count_mask) == 0) )
2547 SHADOW_ERROR("can't find all mappings of mfn %lx: "
2548 "c=%08x t=%08lx\n", mfn_x(gmfn),
2549 page->count_info, page->u.inuse.type_info);
2553 if ( do_locking ) shadow_unlock(v->domain);
2555 /* We killed at least one mapping, so must flush TLBs. */
2556 return 1;
2560 /**************************************************************************/
2561 /* Remove all shadows of a guest frame from the shadow tables */
2563 static int sh_remove_shadow_via_pointer(struct vcpu *v, mfn_t smfn)
2564 /* Follow this shadow's up-pointer, if it has one, and remove the reference
2565 * found there. Returns 1 if that was the only reference to this shadow */
2567 struct shadow_page_info *sp = mfn_to_shadow_page(smfn);
2568 mfn_t pmfn;
2569 void *vaddr;
2570 int rc;
2572 ASSERT(sp->type > 0);
2573 ASSERT(sp->type < SH_type_max_shadow);
2574 ASSERT(sp->type != SH_type_l2_32_shadow);
2575 ASSERT(sp->type != SH_type_l2_pae_shadow);
2576 ASSERT(sp->type != SH_type_l2h_pae_shadow);
2577 ASSERT(sp->type != SH_type_l4_64_shadow);
2579 if (sp->up == 0) return 0;
2580 pmfn = _mfn(sp->up >> PAGE_SHIFT);
2581 ASSERT(mfn_valid(pmfn));
2582 vaddr = sh_map_domain_page(pmfn);
2583 ASSERT(vaddr);
2584 vaddr += sp->up & (PAGE_SIZE-1);
2585 ASSERT(l1e_get_pfn(*(l1_pgentry_t *)vaddr) == mfn_x(smfn));
2587 /* Is this the only reference to this shadow? */
2588 rc = (sp->count == 1) ? 1 : 0;
2590 /* Blank the offending entry */
2591 switch (sp->type)
2593 case SH_type_l1_32_shadow:
2594 case SH_type_l2_32_shadow:
2595 SHADOW_INTERNAL_NAME(sh_clear_shadow_entry, 2)(v, vaddr, pmfn);
2596 break;
2597 case SH_type_l1_pae_shadow:
2598 case SH_type_l2_pae_shadow:
2599 case SH_type_l2h_pae_shadow:
2600 SHADOW_INTERNAL_NAME(sh_clear_shadow_entry, 3)(v, vaddr, pmfn);
2601 break;
2602 #if CONFIG_PAGING_LEVELS >= 4
2603 case SH_type_l1_64_shadow:
2604 case SH_type_l2_64_shadow:
2605 case SH_type_l2h_64_shadow:
2606 case SH_type_l3_64_shadow:
2607 case SH_type_l4_64_shadow:
2608 SHADOW_INTERNAL_NAME(sh_clear_shadow_entry, 4)(v, vaddr, pmfn);
2609 break;
2610 #endif
2611 default: BUG(); /* Some weird unknown shadow type */
2614 sh_unmap_domain_page(vaddr);
2615 if ( rc )
2616 perfc_incr(shadow_up_pointer);
2617 else
2618 perfc_incr(shadow_unshadow_bf);
2620 return rc;
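/* Encoding note (illustrative example): sp->up packs the whole back-reference
 * into one word, the machine address of the referencing shadow entry.  E.g.
 * up == 0x12345678 means the entry lives in shadow page mfn 0x12345
 * (up >> PAGE_SHIFT) at byte offset 0x678 (up & (PAGE_SIZE-1)), which is
 * exactly where the switch above blanks the offending entry. */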
2623 void sh_remove_shadows(struct vcpu *v, mfn_t gmfn, int fast, int all)
2624 /* Remove the shadows of this guest page.
2625 * If fast != 0, just try the quick heuristic, which will remove
2626 * at most one reference to each shadow of the page. Otherwise, walk
2627 * all the shadow tables looking for refs to shadows of this gmfn.
2628 * If all != 0, kill the domain if we can't find all the shadows.
2629 * (all != 0 implies fast == 0)
2630 */
2632 struct page_info *pg = mfn_to_page(gmfn);
2633 mfn_t smfn;
2634 int do_locking;
2635 unsigned char t;
2637 /* Dispatch table for getting per-type functions: each level must
2638 * be called with the function to remove a lower-level shadow. */
2639 static hash_callback_t callbacks[SH_type_unused] = {
2640 NULL, /* none */
2641 NULL, /* l1_32 */
2642 NULL, /* fl1_32 */
2643 SHADOW_INTERNAL_NAME(sh_remove_l1_shadow, 2), /* l2_32 */
2644 NULL, /* l1_pae */
2645 NULL, /* fl1_pae */
2646 SHADOW_INTERNAL_NAME(sh_remove_l1_shadow, 3), /* l2_pae */
2647 SHADOW_INTERNAL_NAME(sh_remove_l1_shadow, 3), /* l2h_pae */
2648 NULL, /* l1_64 */
2649 NULL, /* fl1_64 */
2650 #if CONFIG_PAGING_LEVELS >= 4
2651 SHADOW_INTERNAL_NAME(sh_remove_l1_shadow, 4), /* l2_64 */
2652 SHADOW_INTERNAL_NAME(sh_remove_l1_shadow, 4), /* l2h_64 */
2653 SHADOW_INTERNAL_NAME(sh_remove_l2_shadow, 4), /* l3_64 */
2654 SHADOW_INTERNAL_NAME(sh_remove_l3_shadow, 4), /* l4_64 */
2655 #else
2656 NULL, /* l2_64 */
2657 NULL, /* l2h_64 */
2658 NULL, /* l3_64 */
2659 NULL, /* l4_64 */
2660 #endif
2661 NULL, /* p2m */
2662 NULL /* unused */
2663 };
2665 /* Another lookup table, for choosing which mask to use */
2666 static unsigned int masks[SH_type_unused] = {
2667 0, /* none */
2668 1 << SH_type_l2_32_shadow, /* l1_32 */
2669 0, /* fl1_32 */
2670 0, /* l2_32 */
2671 ((1 << SH_type_l2h_pae_shadow)
2672 | (1 << SH_type_l2_pae_shadow)), /* l1_pae */
2673 0, /* fl1_pae */
2674 0, /* l2_pae */
2675 0, /* l2h_pae */
2676 ((1 << SH_type_l2h_64_shadow)
2677 | (1 << SH_type_l2_64_shadow)), /* l1_64 */
2678 0, /* fl1_64 */
2679 1 << SH_type_l3_64_shadow, /* l2_64 */
2680 1 << SH_type_l3_64_shadow, /* l2h_64 */
2681 1 << SH_type_l4_64_shadow, /* l3_64 */
2682 0, /* l4_64 */
2683 0, /* p2m */
2684 0 /* unused */
2685 };
2687 ASSERT(!(all && fast));
2689 /* Although this is an externally visible function, we do not know
2690 * whether the shadow lock will be held when it is called (since it
2691 * can be called via put_page_type when we clear a shadow l1e).
2692 * If the lock isn't held, take it for the duration of the call. */
2693 do_locking = !shadow_locked_by_me(v->domain);
2694 if ( do_locking ) shadow_lock(v->domain);
2696 SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx\n",
2697 v->domain->domain_id, v->vcpu_id, mfn_x(gmfn));
2699 /* Bail out now if the page is not shadowed */
2700 if ( (pg->count_info & PGC_page_table) == 0 )
2702 if ( do_locking ) shadow_unlock(v->domain);
2703 return;
2706 /* Search for this shadow in all appropriate shadows */
2707 perfc_incr(shadow_unshadow);
2709 /* Lower-level shadows need to be excised from upper-level shadows.
2710 * This call to hash_foreach() looks dangerous but is in fact OK: each
2711 * call will remove at most one shadow, and terminate immediately when
2712 * it does remove it, so we never walk the hash after doing a deletion. */
2713 #define DO_UNSHADOW(_type) do { \
2714 t = (_type); \
2715 if( !(pg->count_info & PGC_page_table) \
2716 || !(pg->shadow_flags & (1 << t)) ) \
2717 break; \
2718 smfn = shadow_hash_lookup(v, mfn_x(gmfn), t); \
2719 if ( unlikely(!mfn_valid(smfn)) ) \
2720 { \
2721 SHADOW_ERROR(": gmfn %#lx has flags 0x%"PRIx32 \
2722 " but no type-0x%"PRIx32" shadow\n", \
2723 mfn_x(gmfn), (uint32_t)pg->shadow_flags, t); \
2724 break; \
2725 } \
2726 if ( sh_type_is_pinnable(v, t) ) \
2727 sh_unpin(v, smfn); \
2728 else \
2729 sh_remove_shadow_via_pointer(v, smfn); \
2730 if( !fast \
2731 && (pg->count_info & PGC_page_table) \
2732 && (pg->shadow_flags & (1 << t)) ) \
2733 hash_foreach(v, masks[t], callbacks, smfn); \
2734 } while (0)
2736 DO_UNSHADOW(SH_type_l2_32_shadow);
2737 DO_UNSHADOW(SH_type_l1_32_shadow);
2738 DO_UNSHADOW(SH_type_l2h_pae_shadow);
2739 DO_UNSHADOW(SH_type_l2_pae_shadow);
2740 DO_UNSHADOW(SH_type_l1_pae_shadow);
2741 #if CONFIG_PAGING_LEVELS >= 4
2742 DO_UNSHADOW(SH_type_l4_64_shadow);
2743 DO_UNSHADOW(SH_type_l3_64_shadow);
2744 DO_UNSHADOW(SH_type_l2h_64_shadow);
2745 DO_UNSHADOW(SH_type_l2_64_shadow);
2746 DO_UNSHADOW(SH_type_l1_64_shadow);
2747 #endif
2749 #undef DO_UNSHADOW
2751 /* If that didn't catch the shadows, something is wrong */
2752 if ( !fast && all && (pg->count_info & PGC_page_table) )
2754 SHADOW_ERROR("can't find all shadows of mfn %05lx "
2755 "(shadow_flags=%08x)\n",
2756 mfn_x(gmfn), pg->shadow_flags);
2757 domain_crash(v->domain);
2760 /* Need to flush TLBs now, so that linear maps are safe next time we
2761 * take a fault. */
2762 flush_tlb_mask(v->domain->domain_dirty_cpumask);
2764 if ( do_locking ) shadow_unlock(v->domain);
2767 static void
2768 sh_remove_all_shadows_and_parents(struct vcpu *v, mfn_t gmfn)
2769 /* Even harsher: this is an HVM page that we think is no longer a pagetable.
2770 * Unshadow it, and recursively unshadow pages that reference it. */
2772 sh_remove_shadows(v, gmfn, 0, 1);
2773 /* XXX TODO:
2774 * Rework this hashtable walker to return a linked-list of all
2775 * the shadows it modified, then do breadth-first recursion
2776 * to find the way up to higher-level tables and unshadow them too.
2778 * The current code (just tearing down each page's shadows as we
2779 * detect that it is not a pagetable) is correct, but very slow.
2780 * It means extra emulated writes and slows down removal of mappings. */
2783 /**************************************************************************/
2785 static void sh_update_paging_modes(struct vcpu *v)
2787 struct domain *d = v->domain;
2788 struct paging_mode *old_mode = v->arch.paging.mode;
2790 ASSERT(shadow_locked_by_me(d));
2792 #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
2793 /* Make sure this vcpu has a virtual TLB array allocated */
2794 if ( unlikely(!v->arch.paging.vtlb) )
2796 v->arch.paging.vtlb = xmalloc_array(struct shadow_vtlb, VTLB_ENTRIES);
2797 if ( unlikely(!v->arch.paging.vtlb) )
2799 SHADOW_ERROR("Could not allocate vTLB space for dom %u vcpu %u\n",
2800 d->domain_id, v->vcpu_id);
2801 domain_crash(v->domain);
2802 return;
2804 memset(v->arch.paging.vtlb, 0,
2805 VTLB_ENTRIES * sizeof (struct shadow_vtlb));
2806 spin_lock_init(&v->arch.paging.vtlb_lock);
2808 #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */
2810 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
2811 if ( v->arch.paging.shadow.oos_fixups == NULL )
2813 int i;
2814 v->arch.paging.shadow.oos_fixups =
2815 alloc_xenheap_pages(SHADOW_OOS_FT_ORDER);
2816 if ( v->arch.paging.shadow.oos_fixups == NULL )
2818 SHADOW_ERROR("Could not allocate OOS fixup table"
2819 " for dom %u vcpu %u\n",
2820 v->domain->domain_id, v->vcpu_id);
2821 domain_crash(v->domain);
2822 return;
2824 for ( i = 0; i < SHADOW_OOS_FT_HASH * SHADOW_OOS_FT_ENTRIES; i++ )
2825 v->arch.paging.shadow.oos_fixups[i].gmfn = _mfn(INVALID_MFN);
2828 if ( mfn_x(v->arch.paging.shadow.oos_snapshot[0]) == INVALID_MFN )
2830 int i;
2831 for(i = 0; i < SHADOW_OOS_PAGES; i++)
2833 shadow_prealloc(d, SH_type_oos_snapshot, 1);
2834 v->arch.paging.shadow.oos_snapshot[i] =
2835 shadow_alloc(d, SH_type_oos_snapshot, 0);
2838 #endif /* OOS */
2840 // Valid transitions handled by this function:
2841 // - For PV guests:
2842 // - after a shadow mode has been changed
2843 // - For HVM guests:
2844 // - after a shadow mode has been changed
2845 // - changes in CR0.PG, CR4.PAE, CR4.PSE, or CR4.PGE
2846 //
2848 // First, tear down any old shadow tables held by this vcpu.
2849 //
2850 if ( v->arch.paging.mode )
2851 v->arch.paging.mode->shadow.detach_old_tables(v);
2853 if ( !is_hvm_domain(d) )
2855 ///
2856 /// PV guest
2857 ///
2858 #if CONFIG_PAGING_LEVELS == 4
2859 v->arch.paging.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode, 4);
2860 #else /* CONFIG_PAGING_LEVELS == 3 */
2861 v->arch.paging.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode, 3);
2862 #endif
2864 else
2866 ///
2867 /// HVM guest
2868 ///
2869 ASSERT(shadow_mode_translate(d));
2870 ASSERT(shadow_mode_external(d));
2872 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
2873 /* Need to resync all our pages now, because if a page goes out
2874 * of sync with paging enabled and is resynced with paging
2875 * disabled, the resync will go wrong. */
2876 shadow_resync_all(v, 0);
2877 #endif /* OOS */
2879 if ( !hvm_paging_enabled(v) )
2881 /* When the guest has CR0.PG clear, we provide a 32-bit, non-PAE
2882 * pagetable for it, mapping 4 GB one-to-one using a single l2
2883 * page of 1024 superpage mappings */
2884 v->arch.guest_table = d->arch.paging.shadow.unpaged_pagetable;
2885 v->arch.paging.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode, 2);
2887 else
2889 #ifdef __x86_64__
2890 if ( hvm_long_mode_enabled(v) )
2892 // long mode guest...
2893 v->arch.paging.mode =
2894 &SHADOW_INTERNAL_NAME(sh_paging_mode, 4);
2896 else
2897 #endif
2898 if ( hvm_pae_enabled(v) )
2900 // 32-bit PAE mode guest...
2901 v->arch.paging.mode =
2902 &SHADOW_INTERNAL_NAME(sh_paging_mode, 3);
2904 else
2906 // 32-bit 2 level guest...
2907 v->arch.paging.mode =
2908 &SHADOW_INTERNAL_NAME(sh_paging_mode, 2);
2912 if ( pagetable_is_null(v->arch.monitor_table) )
2914 mfn_t mmfn = v->arch.paging.mode->shadow.make_monitor_table(v);
2915 v->arch.monitor_table = pagetable_from_mfn(mmfn);
2916 make_cr3(v, mfn_x(mmfn));
2917 hvm_update_host_cr3(v);
2920 if ( v->arch.paging.mode != old_mode )
2922 SHADOW_PRINTK("new paging mode: d=%u v=%u pe=%d gl=%u "
2923 "(was g=%u s=%u)\n",
2924 d->domain_id, v->vcpu_id,
2925 is_hvm_domain(d) ? hvm_paging_enabled(v) : 1,
2926 v->arch.paging.mode->guest_levels,
2927 v->arch.paging.mode->shadow.shadow_levels,
2928 old_mode ? old_mode->guest_levels : 0,
2929 old_mode ? old_mode->shadow.shadow_levels : 0);
2930 if ( old_mode &&
2931 (v->arch.paging.mode->shadow.shadow_levels !=
2932 old_mode->shadow.shadow_levels) )
2934 /* Need to make a new monitor table for the new mode */
2935 mfn_t new_mfn, old_mfn;
2937 if ( v != current && vcpu_runnable(v) )
2939 SHADOW_ERROR("Some third party (d=%u v=%u) is changing "
2940 "this HVM vcpu's (d=%u v=%u) paging mode "
2941 "while it is running.\n",
2942 current->domain->domain_id, current->vcpu_id,
2943 v->domain->domain_id, v->vcpu_id);
2944 /* It's not safe to do that because we can't change
2945 * the host CR3 for a running domain */
2946 domain_crash(v->domain);
2947 return;
2950 old_mfn = pagetable_get_mfn(v->arch.monitor_table);
2951 v->arch.monitor_table = pagetable_null();
2952 new_mfn = v->arch.paging.mode->shadow.make_monitor_table(v);
2953 v->arch.monitor_table = pagetable_from_mfn(new_mfn);
2954 SHADOW_PRINTK("new monitor table %"PRI_mfn "\n",
2955 mfn_x(new_mfn));
2957 /* Don't be running on the old monitor table when we
2958 * pull it down! Switch CR3, and warn the HVM code that
2959 * its host cr3 has changed. */
2960 make_cr3(v, mfn_x(new_mfn));
2961 if ( v == current )
2962 write_ptbase(v);
2963 hvm_update_host_cr3(v);
2964 old_mode->shadow.destroy_monitor_table(v, old_mfn);
2968 // XXX -- Need to deal with changes in CR4.PSE and CR4.PGE.
2969 // These are HARD: think about the case where two CPU's have
2970 // different values for CR4.PSE and CR4.PGE at the same time.
2971 // This *does* happen, at least for CR4.PGE...
2974 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
2975 /* We need to check that all the vcpus have paging enabled to
2976 * unsync PTs. */
2977 if ( is_hvm_domain(d) )
2979 int pe = 1;
2980 struct vcpu *vptr;
2982 for_each_vcpu(d, vptr)
2984 if ( !hvm_paging_enabled(vptr) )
2986 pe = 0;
2987 break;
2991 d->arch.paging.shadow.oos_active = pe;
2993 #endif /* OOS */
2995 v->arch.paging.mode->update_cr3(v, 0);
2998 void shadow_update_paging_modes(struct vcpu *v)
3000 shadow_lock(v->domain);
3001 sh_update_paging_modes(v);
3002 shadow_unlock(v->domain);
3005 /**************************************************************************/
3006 /* Turning on and off shadow features */
3008 static void sh_new_mode(struct domain *d, u32 new_mode)
3009 /* Inform all the vcpus that the shadow mode has been changed */
3011 struct vcpu *v;
3013 ASSERT(shadow_locked_by_me(d));
3014 ASSERT(d != current->domain);
3015 d->arch.paging.mode = new_mode;
3016 for_each_vcpu(d, v)
3017 sh_update_paging_modes(v);
3020 int shadow_enable(struct domain *d, u32 mode)
3021 /* Turn on "permanent" shadow features: external, translate, refcount.
3022 * Can only be called once on a domain, and these features cannot be
3023 * disabled.
3024 * Returns 0 for success, -errno for failure. */
3026 unsigned int old_pages;
3027 struct page_info *pg = NULL;
3028 uint32_t *e;
3029 int i, rv = 0;
3031 mode |= PG_SH_enable;
3033 domain_pause(d);
3035 /* Sanity check the arguments */
3036 if ( (d == current->domain) ||
3037 shadow_mode_enabled(d) ||
3038 ((mode & PG_translate) && !(mode & PG_refcounts)) ||
3039 ((mode & PG_external) && !(mode & PG_translate)) )
3041 rv = -EINVAL;
3042 goto out_unlocked;
3045 /* Init the shadow memory allocation if the user hasn't done so */
3046 old_pages = d->arch.paging.shadow.total_pages;
3047 if ( old_pages == 0 )
3049 unsigned int r;
3050 shadow_lock(d);
3051 r = sh_set_allocation(d, 256, NULL); /* Use at least 1MB */
3052 if ( r != 0 )
3054 sh_set_allocation(d, 0, NULL);
3055 rv = -ENOMEM;
3056 goto out_locked;
3058 shadow_unlock(d);
3061 /* Init the P2M table. Must be done before we take the shadow lock
3062 * to avoid possible deadlock. */
3063 if ( mode & PG_translate )
3065 rv = p2m_alloc_table(d, shadow_alloc_p2m_page, shadow_free_p2m_page);
3066 if (rv != 0)
3067 goto out_unlocked;
3070 /* HVM domains need an extra pagetable for vcpus that think they
3071 * have paging disabled */
3072 if ( is_hvm_domain(d) )
3074 /* Get a single page from the shadow pool. Take it via the
3075 * P2M interface to make freeing it simpler afterwards. */
3076 pg = shadow_alloc_p2m_page(d);
3077 if ( pg == NULL )
3079 rv = -ENOMEM;
3080 goto out_unlocked;
3082 /* Fill it with 32-bit, non-PAE superpage entries, each mapping 4MB
3083 * of virtual address space onto the same physical address range */
3084 e = sh_map_domain_page(page_to_mfn(pg));
3085 for ( i = 0; i < PAGE_SIZE / sizeof(*e); i++ )
3086 e[i] = ((0x400000U * i)
3087 | _PAGE_PRESENT | _PAGE_RW | _PAGE_USER
3088 | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
3089 sh_unmap_domain_page(e);
3090 pg->u.inuse.type_info = PGT_l2_page_table | 1 | PGT_validated;
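/* Worked example (illustrative): each of the 1024 l2 entries is a 4MB PSE
 * superpage mapping its own slot of the 4GB space one-to-one, e.g.
 *     e[0] == 0x000000E7   (virtual 0-4MB   -> guest-physical 0-4MB)
 *     e[1] == 0x004000E7   (virtual 4MB-8MB -> guest-physical 4MB-8MB)
 * where 0xE7 is PRESENT|RW|USER|ACCESSED|DIRTY|PSE. */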
3093 shadow_lock(d);
3095 /* Sanity check again with the lock held */
3096 if ( shadow_mode_enabled(d) )
3098 rv = -EINVAL;
3099 goto out_locked;
3102 /* Init the hash table */
3103 if ( shadow_hash_alloc(d) != 0 )
3105 rv = -ENOMEM;
3106 goto out_locked;
3109 #if (SHADOW_OPTIMIZATIONS & SHOPT_LINUX_L3_TOPLEVEL)
3110 /* We assume we're dealing with an older 64bit linux guest until we
3111 * see the guest use more than one l4 per vcpu. */
3112 d->arch.paging.shadow.opt_flags = SHOPT_LINUX_L3_TOPLEVEL;
3113 #endif
3115 /* Record the 1-to-1 pagetable we just made */
3116 if ( is_hvm_domain(d) )
3117 d->arch.paging.shadow.unpaged_pagetable = pagetable_from_page(pg);
3119 /* Update the bits */
3120 sh_new_mode(d, mode);
3122 out_locked:
3123 shadow_unlock(d);
3124 out_unlocked:
3125 if ( rv != 0 && !pagetable_is_null(d->arch.phys_table) )
3126 p2m_teardown(d);
3127 if ( rv != 0 && pg != NULL )
3128 shadow_free_p2m_page(d, pg);
3129 domain_unpause(d);
3130 return rv;
3133 void shadow_teardown(struct domain *d)
3134 /* Destroy the shadow pagetables of this domain and free its shadow memory.
3135 * Should only be called for dying domains. */
3137 struct vcpu *v;
3138 mfn_t mfn;
3139 struct list_head *entry, *n;
3140 struct page_info *pg;
3142 ASSERT(d->is_dying);
3143 ASSERT(d != current->domain);
3145 if ( !shadow_locked_by_me(d) )
3146 shadow_lock(d); /* Keep various asserts happy */
3148 if ( shadow_mode_enabled(d) )
3150 /* Release the shadow and monitor tables held by each vcpu */
3151 for_each_vcpu(d, v)
3153 if ( v->arch.paging.mode )
3155 v->arch.paging.mode->shadow.detach_old_tables(v);
3156 if ( shadow_mode_external(d) )
3158 mfn = pagetable_get_mfn(v->arch.monitor_table);
3159 if ( mfn_valid(mfn) && (mfn_x(mfn) != 0) )
3160 v->arch.paging.mode->shadow.destroy_monitor_table(v, mfn);
3161 v->arch.monitor_table = pagetable_null();
3167 #if (SHADOW_OPTIMIZATIONS & (SHOPT_VIRTUAL_TLB|SHOPT_OUT_OF_SYNC))
3168 /* Free the virtual-TLB array attached to each vcpu */
3169 for_each_vcpu(d, v)
3171 #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
3172 if ( v->arch.paging.vtlb )
3174 xfree(v->arch.paging.vtlb);
3175 v->arch.paging.vtlb = NULL;
3177 #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */
3179 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
3180 if ( v->arch.paging.shadow.oos_fixups )
3182 free_xenheap_pages(v->arch.paging.shadow.oos_fixups,
3183 SHADOW_OOS_FT_ORDER);
3184 v->arch.paging.shadow.oos_fixups = NULL;
3188 int i;
3189 mfn_t *oos_snapshot = v->arch.paging.shadow.oos_snapshot;
3190 for(i = 0; i < SHADOW_OOS_PAGES; i++)
3191 if ( mfn_valid(oos_snapshot[i]) )
3192 shadow_free(d, oos_snapshot[i]);
3194 #endif /* OOS */
3196 #endif /* (SHADOW_OPTIMIZATIONS & (SHOPT_VIRTUAL_TLB|SHOPT_OUT_OF_SYNC)) */
3198 list_for_each_safe(entry, n, &d->arch.paging.shadow.p2m_freelist)
3200 list_del(entry);
3201 pg = list_entry(entry, struct page_info, list);
3202 shadow_free_p2m_page(d, pg);
3205 if ( d->arch.paging.shadow.total_pages != 0 )
3207 SHADOW_PRINTK("teardown of domain %u starts."
3208 " Shadow pages total = %u, free = %u, p2m=%u\n",
3209 d->domain_id,
3210 d->arch.paging.shadow.total_pages,
3211 d->arch.paging.shadow.free_pages,
3212 d->arch.paging.shadow.p2m_pages);
3213 /* Destroy all the shadows and release memory to domheap */
3214 sh_set_allocation(d, 0, NULL);
3215 /* Release the hash table back to xenheap */
3216 if (d->arch.paging.shadow.hash_table)
3217 shadow_hash_teardown(d);
3218 /* Should not have any more memory held */
3219 SHADOW_PRINTK("teardown done."
3220 " Shadow pages total = %u, free = %u, p2m=%u\n",
3221 d->arch.paging.shadow.total_pages,
3222 d->arch.paging.shadow.free_pages,
3223 d->arch.paging.shadow.p2m_pages);
3224 ASSERT(d->arch.paging.shadow.total_pages == 0);
3227 /* Free the non-paged-vcpus pagetable; must happen after we've
3228 * destroyed any shadows of it or sh_destroy_shadow will get confused. */
3229 if ( !pagetable_is_null(d->arch.paging.shadow.unpaged_pagetable) )
3231 for_each_vcpu(d, v)
3233 ASSERT(is_hvm_vcpu(v));
3234 if ( !hvm_paging_enabled(v) )
3235 v->arch.guest_table = pagetable_null();
3237 shadow_free_p2m_page(d,
3238 pagetable_get_page(d->arch.paging.shadow.unpaged_pagetable));
3239 d->arch.paging.shadow.unpaged_pagetable = pagetable_null();
3242 /* We leave the "permanent" shadow modes enabled, but clear the
3243 * log-dirty mode bit. We don't want any more mark_dirty()
3244 * calls now that we've torn down the bitmap */
3245 d->arch.paging.mode &= ~PG_log_dirty;
3247 if (d->dirty_vram) {
3248 xfree(d->dirty_vram->sl1ma);
3249 xfree(d->dirty_vram->dirty_bitmap);
3250 xfree(d->dirty_vram);
3251 d->dirty_vram = NULL;
3254 shadow_unlock(d);
3257 void shadow_final_teardown(struct domain *d)
3258 /* Called by arch_domain_destroy(), when it's safe to pull down the p2m map. */
3260 SHADOW_PRINTK("dom %u final teardown starts."
3261 " Shadow pages total = %u, free = %u, p2m=%u\n",
3262 d->domain_id,
3263 d->arch.paging.shadow.total_pages,
3264 d->arch.paging.shadow.free_pages,
3265 d->arch.paging.shadow.p2m_pages);
3267 /* Double-check that the domain didn't have any shadow memory.
3268 * It is possible for a domain that never got domain_kill()ed
3269 * to get here with its shadow allocation intact. */
3270 if ( d->arch.paging.shadow.total_pages != 0 )
3271 shadow_teardown(d);
3273 /* It is now safe to pull down the p2m map. */
3274 p2m_teardown(d);
3276 SHADOW_PRINTK("dom %u final teardown done."
3277 " Shadow pages total = %u, free = %u, p2m=%u\n",
3278 d->domain_id,
3279 d->arch.paging.shadow.total_pages,
3280 d->arch.paging.shadow.free_pages,
3281 d->arch.paging.shadow.p2m_pages);
3284 static int shadow_one_bit_enable(struct domain *d, u32 mode)
3285 /* Turn on a single shadow mode feature */
3287 ASSERT(shadow_locked_by_me(d));
3289 /* Sanity check the call */
3290 if ( d == current->domain || (d->arch.paging.mode & mode) == mode )
3292 return -EINVAL;
3295 mode |= PG_SH_enable;
3297 if ( d->arch.paging.mode == 0 )
3299 /* Init the shadow memory allocation and the hash table */
3300 if ( sh_set_allocation(d, 1, NULL) != 0
3301 || shadow_hash_alloc(d) != 0 )
3303 sh_set_allocation(d, 0, NULL);
3304 return -ENOMEM;
3308 /* Update the bits */
3309 sh_new_mode(d, d->arch.paging.mode | mode);
3311 return 0;
3314 static int shadow_one_bit_disable(struct domain *d, u32 mode)
3315 /* Turn off a single shadow mode feature */
3317 struct vcpu *v;
3318 ASSERT(shadow_locked_by_me(d));
3320 /* Sanity check the call */
3321 if ( d == current->domain || !((d->arch.paging.mode & mode) == mode) )
3323 return -EINVAL;
3326 /* Update the bits */
3327 sh_new_mode(d, d->arch.paging.mode & ~mode);
3328 if ( d->arch.paging.mode == 0 )
3330 /* Get this domain off shadows */
3331 SHADOW_PRINTK("un-shadowing of domain %u starts."
3332 " Shadow pages total = %u, free = %u, p2m=%u\n",
3333 d->domain_id,
3334 d->arch.paging.shadow.total_pages,
3335 d->arch.paging.shadow.free_pages,
3336 d->arch.paging.shadow.p2m_pages);
3337 for_each_vcpu(d, v)
3339 if ( v->arch.paging.mode )
3340 v->arch.paging.mode->shadow.detach_old_tables(v);
3341 #if CONFIG_PAGING_LEVELS == 4
3342 if ( !(v->arch.flags & TF_kernel_mode) )
3343 make_cr3(v, pagetable_get_pfn(v->arch.guest_table_user));
3344 else
3345 #endif
3346 make_cr3(v, pagetable_get_pfn(v->arch.guest_table));
3350 /* Pull down the memory allocation */
3351 if ( sh_set_allocation(d, 0, NULL) != 0 )
3353 // XXX - How can this occur?
3354 // Seems like a bug to return an error now that we've
3355 // disabled the relevant shadow mode.
3356 //
3357 return -ENOMEM;
3359 shadow_hash_teardown(d);
3360 SHADOW_PRINTK("un-shadowing of domain %u done."
3361 " Shadow pages total = %u, free = %u, p2m=%u\n",
3362 d->domain_id,
3363 d->arch.paging.shadow.total_pages,
3364 d->arch.paging.shadow.free_pages,
3365 d->arch.paging.shadow.p2m_pages);
3368 return 0;
3371 /* Enable/disable ops for the "test" and "log-dirty" modes */
3372 static int shadow_test_enable(struct domain *d)
3374 int ret;
3376 domain_pause(d);
3377 shadow_lock(d);
3378 ret = shadow_one_bit_enable(d, PG_SH_enable);
3379 shadow_unlock(d);
3380 domain_unpause(d);
3382 return ret;
3385 static int shadow_test_disable(struct domain *d)
3387 int ret;
3389 domain_pause(d);
3390 shadow_lock(d);
3391 ret = shadow_one_bit_disable(d, PG_SH_enable);
3392 shadow_unlock(d);
3393 domain_unpause(d);
3395 return ret;
3398 /**************************************************************************/
3399 /* P2M map manipulations */
3401 /* Shadow-specific code which should be called when a P2M table entry is updated
3402 * with new content. It is responsible for updating the entry, as well as for other
3403 * shadow processing jobs.
3404 */
3405 void
3406 shadow_write_p2m_entry(struct vcpu *v, unsigned long gfn,
3407 l1_pgentry_t *p, mfn_t table_mfn,
3408 l1_pgentry_t new, unsigned int level)
3410 struct domain *d = v->domain;
3412 shadow_lock(d);
3414 /* If we're removing an MFN from the p2m, remove it from the shadows too */
3415 if ( level == 1 )
3417 mfn_t mfn = _mfn(l1e_get_pfn(*p));
3418 p2m_type_t p2mt = p2m_flags_to_type(l1e_get_flags(*p));
3419 if ( p2m_is_valid(p2mt) && mfn_valid(mfn) )
3421 sh_remove_all_shadows_and_parents(v, mfn);
3422 if ( sh_remove_all_mappings(v, mfn) )
3423 flush_tlb_mask(d->domain_dirty_cpumask);
3427 /* Update the entry with new content */
3428 safe_write_pte(p, new);
3430 /* install P2M in monitors for PAE Xen */
3431 #if CONFIG_PAGING_LEVELS == 3
3432 if ( level == 3 )
3433 /* We have written to the p2m l3: need to sync the per-vcpu
3434 * copies of it in the monitor tables */
3435 p2m_install_entry_in_monitors(d, (l3_pgentry_t *)p);
3436 #endif
3438 #if (SHADOW_OPTIMIZATIONS & SHOPT_FAST_FAULT_PATH)
3439 /* If we're doing FAST_FAULT_PATH, then shadow mode may have
3440 cached the fact that this is an mmio region in the shadow
3441 page tables. Blow the tables away to remove the cache.
3442 This is pretty heavy handed, but this is a rare operation
3443 (it might happen a dozen times during boot and then never
3444 again), so it doesn't matter too much. */
3445 if ( d->arch.paging.shadow.has_fast_mmio_entries )
3447 shadow_blow_tables(d);
3448 d->arch.paging.shadow.has_fast_mmio_entries = 0;
3450 #endif
3452 shadow_unlock(d);
3455 /**************************************************************************/
3456 /* Log-dirty mode support */
3458 /* Shadow-specific code which is called in paging_log_dirty_enable().
3459 * Returns 0 if no problem is found.
3460 */
3461 int shadow_enable_log_dirty(struct domain *d)
3463 int ret;
3465 /* shadow lock is required here */
3466 shadow_lock(d);
3467 if ( shadow_mode_enabled(d) )
3469 /* This domain already has some shadows: need to clear them out
3470 * of the way to make sure that all references to guest memory are
3471 * properly write-protected */
3472 shadow_blow_tables(d);
3475 #if (SHADOW_OPTIMIZATIONS & SHOPT_LINUX_L3_TOPLEVEL)
3476 /* 32bit PV guests on 64bit xen behave like older 64bit linux: they
3477 * change an l4e instead of cr3 to switch tables. Give them the
3478 * same optimization */
3479 if ( is_pv_32on64_domain(d) )
3480 d->arch.paging.shadow.opt_flags = SHOPT_LINUX_L3_TOPLEVEL;
3481 #endif
3483 ret = shadow_one_bit_enable(d, PG_log_dirty);
3484 shadow_unlock(d);
3486 return ret;
3489 /* Shadow-specific code which is called in paging_log_dirty_disable() */
3490 int shadow_disable_log_dirty(struct domain *d)
3492 int ret;
3494 /* shadow lock is required here */
3495 shadow_lock(d);
3496 ret = shadow_one_bit_disable(d, PG_log_dirty);
3497 shadow_unlock(d);
3499 return ret;
3502 /* This function is called when we CLEAN log dirty bitmap. See
3503 * paging_log_dirty_op() for details.
3504 */
3505 void shadow_clean_dirty_bitmap(struct domain *d)
3507 shadow_lock(d);
3508 /* Need to revoke write access to the domain's pages again.
3509 * In future, we'll have a less heavy-handed approach to this,
3510 * but for now, we just unshadow everything except Xen. */
3511 shadow_blow_tables(d);
3512 shadow_unlock(d);
3516 /**************************************************************************/
3517 /* VRAM dirty tracking support */
3518 int shadow_track_dirty_vram(struct domain *d,
3519 unsigned long begin_pfn,
3520 unsigned long nr,
3521 XEN_GUEST_HANDLE_64(uint8) dirty_bitmap)
3523 int rc;
3524 unsigned long end_pfn = begin_pfn + nr;
3525 unsigned long dirty_size = (nr + 7) / 8;
3526 int flush_tlb = 0;
3527 unsigned long i;
3528 p2m_type_t t;
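/* Example (illustrative): the bitmap has one bit per pfn in the tracked
 * range, hence (nr + 7) / 8 bytes; e.g. an 8MB framebuffer is nr == 2048
 * pfns and needs a 256-byte dirty bitmap. */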
3530 if (end_pfn < begin_pfn
3531 || begin_pfn > d->arch.p2m->max_mapped_pfn
3532 || end_pfn >= d->arch.p2m->max_mapped_pfn)
3533 return -EINVAL;
3535 shadow_lock(d);
3537 if ( d->dirty_vram && (!nr ||
3538 ( begin_pfn != d->dirty_vram->begin_pfn
3539 || end_pfn != d->dirty_vram->end_pfn )) )
3541 /* Different tracking, tear the previous down. */
3542 gdprintk(XENLOG_INFO, "stopping tracking VRAM %lx - %lx\n", d->dirty_vram->begin_pfn, d->dirty_vram->end_pfn);
3543 xfree(d->dirty_vram->sl1ma);
3544 xfree(d->dirty_vram->dirty_bitmap);
3545 xfree(d->dirty_vram);
3546 d->dirty_vram = NULL;
3549 if ( !nr )
3551 rc = 0;
3552 goto out;
3555 /* This should happen only rarely (on a video mode change),
3556 * so there is no need to be clever here. */
3557 if ( !d->dirty_vram )
3559 /* Just recount from start. */
3560 for ( i = begin_pfn; i < end_pfn; i++ ) {
3561 mfn_t mfn = gfn_to_mfn(d, i, &t);
3562 if (mfn_x(mfn) != INVALID_MFN)
3563 flush_tlb |= sh_remove_all_mappings(d->vcpu[0], mfn);
3566 gdprintk(XENLOG_INFO, "tracking VRAM %lx - %lx\n", begin_pfn, end_pfn);
3568 rc = -ENOMEM;
3569 if ( (d->dirty_vram = xmalloc(struct sh_dirty_vram)) == NULL )
3570 goto out;
3571 d->dirty_vram->begin_pfn = begin_pfn;
3572 d->dirty_vram->end_pfn = end_pfn;
3574 if ( (d->dirty_vram->sl1ma = xmalloc_array(paddr_t, nr)) == NULL )
3575 goto out_dirty_vram;
3576 memset(d->dirty_vram->sl1ma, ~0, sizeof(paddr_t) * nr);
3578 if ( (d->dirty_vram->dirty_bitmap = xmalloc_array(uint8_t, dirty_size)) == NULL )
3579 goto out_sl1ma;
3580 memset(d->dirty_vram->dirty_bitmap, 0, dirty_size);
3582 d->dirty_vram->last_dirty = NOW();
3584 /* Tell the caller that this time we could not track dirty bits. */
3585 rc = -ENODATA;
3587 else if (d->dirty_vram->last_dirty == -1)
3589 /* still completely clean, just copy our empty bitmap */
3590 rc = -EFAULT;
3591 if ( copy_to_guest(dirty_bitmap, d->dirty_vram->dirty_bitmap, dirty_size) == 0 )
3592 rc = 0;
3594 else
3596 #ifdef __i386__
3597 unsigned long map_mfn = INVALID_MFN;
3598 void *map_sl1p = NULL;
3599 #endif
3601 /* Iterate over VRAM to track dirty bits. */
3602 for ( i = 0; i < nr; i++ ) {
3603 mfn_t mfn = gfn_to_mfn(d, begin_pfn + i, &t);
3604 struct page_info *page;
3605 u32 count_info;
3606 int dirty = 0;
3607 paddr_t sl1ma = d->dirty_vram->sl1ma[i];
3609 if (mfn_x(mfn) == INVALID_MFN)
3611 dirty = 1;
3613 else
3615 page = mfn_to_page(mfn);
3616 count_info = page->u.inuse.type_info & PGT_count_mask;
3617 switch (count_info)
3619 case 0:
3620 /* No guest reference, nothing to track. */
3621 break;
3622 case 1:
3623 /* One guest reference. */
3624 if ( sl1ma == INVALID_PADDR )
3626 /* We don't know which sl1e points to this, too bad. */
3627 dirty = 1;
3628 /* TODO: Heuristics for finding the single mapping of
3629 * this gmfn */
3630 flush_tlb |= sh_remove_all_mappings(d->vcpu[0], mfn);
3632 else
3634 /* Hopefully the most common case: only one mapping,
3635 * whose dirty bit we can use. */
3636 l1_pgentry_t *sl1e;
3637 #ifdef __i386__
3638 void *sl1p = map_sl1p;
3639 unsigned long sl1mfn = paddr_to_pfn(sl1ma);
3641 if ( sl1mfn != map_mfn ) {
3642 if ( map_sl1p )
3643 sh_unmap_domain_page(map_sl1p);
3644 map_sl1p = sl1p = sh_map_domain_page(_mfn(sl1mfn));
3645 map_mfn = sl1mfn;
3647 sl1e = sl1p + (sl1ma & ~PAGE_MASK);
3648 #else
3649 sl1e = maddr_to_virt(sl1ma);
3650 #endif
3652 if ( l1e_get_flags(*sl1e) & _PAGE_DIRTY )
3654 dirty = 1;
3655 /* Note: this is atomic, so we may clear a
3656 * _PAGE_ACCESSED set by another processor. */
3657 l1e_remove_flags(*sl1e, _PAGE_DIRTY);
3658 flush_tlb = 1;
3661 break;
3662 default:
3663 /* More than one guest reference:
3664 * we cannot afford to track that. */
3665 dirty = 1;
3666 break;
3670 if ( dirty )
3672 d->dirty_vram->dirty_bitmap[i / 8] |= 1 << (i % 8);
3673 d->dirty_vram->last_dirty = NOW();
3677 #ifdef __i386__
3678 if ( map_sl1p )
3679 sh_unmap_domain_page(map_sl1p);
3680 #endif
3682 rc = -EFAULT;
3683 if ( copy_to_guest(dirty_bitmap, d->dirty_vram->dirty_bitmap, dirty_size) == 0 ) {
3684 memset(d->dirty_vram->dirty_bitmap, 0, dirty_size);
3685 if (d->dirty_vram->last_dirty + SECONDS(2) < NOW())
3687 /* was clean for more than two seconds, try to disable guest
3688 * write access */
3689 for ( i = begin_pfn; i < end_pfn; i++ ) {
3690 mfn_t mfn = gfn_to_mfn(d, i, &t);
3691 if (mfn_x(mfn) != INVALID_MFN)
3692 flush_tlb |= sh_remove_write_access(d->vcpu[0], mfn, 1, 0);
3694 d->dirty_vram->last_dirty = -1;
3696 rc = 0;
3697 }
3698 }
3699 if ( flush_tlb )
3700 flush_tlb_mask(d->domain_dirty_cpumask);
3701 goto out;
3703 out_sl1ma:
3704 xfree(d->dirty_vram->sl1ma);
3705 out_dirty_vram:
3706 xfree(d->dirty_vram);
3707 d->dirty_vram = NULL;
3709 out:
3710 shadow_unlock(d);
3711 return rc;
3712 }
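/* Editorial sketch, not part of the original file: the dirty-VRAM tracking
 * code above records dirty pfns with "dirty_bitmap[i / 8] |= 1 << (i % 8)",
 * i.e. one bit per pfn in the tracked range, least-significant bit first
 * within each byte. A caller that has copied the bitmap out of the
 * hypervisor could decode it with a helper along these lines (the name is
 * illustrative only). */
static inline int vram_pfn_is_dirty(const uint8_t *dirty_bitmap, unsigned long i)
{
    /* Mirrors the bit layout used by the tracking loop above. */
    return (dirty_bitmap[i / 8] >> (i % 8)) & 1;
}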
3714 /**************************************************************************/
3715 /* Shadow-control XEN_DOMCTL dispatcher */
3717 int shadow_domctl(struct domain *d,
3718 xen_domctl_shadow_op_t *sc,
3719 XEN_GUEST_HANDLE(void) u_domctl)
3720 {
3721 int rc, preempted = 0;
3723 switch ( sc->op )
3724 {
3725 case XEN_DOMCTL_SHADOW_OP_OFF:
3726 if ( d->arch.paging.mode == PG_SH_enable )
3727 if ( (rc = shadow_test_disable(d)) != 0 )
3728 return rc;
3729 return 0;
3731 case XEN_DOMCTL_SHADOW_OP_ENABLE_TEST:
3732 return shadow_test_enable(d);
3734 case XEN_DOMCTL_SHADOW_OP_ENABLE_TRANSLATE:
3735 return shadow_enable(d, PG_refcounts|PG_translate);
3737 case XEN_DOMCTL_SHADOW_OP_ENABLE:
3738 return shadow_enable(d, sc->mode << PG_mode_shift);
3740 case XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION:
3741 sc->mb = shadow_get_allocation(d);
3742 return 0;
3744 case XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION:
3745 shadow_lock(d);
3746 if ( sc->mb == 0 && shadow_mode_enabled(d) )
3747 {
3748 /* Can't set the allocation to zero unless the domain stops using
3749 * shadow pagetables first */
3750 SHADOW_ERROR("Can't set shadow allocation to zero, domain %u"
3751 " is still using shadows.\n", d->domain_id);
3752 shadow_unlock(d);
3753 return -EINVAL;
3754 }
3755 rc = sh_set_allocation(d, sc->mb << (20 - PAGE_SHIFT), &preempted);
3756 shadow_unlock(d);
3757 if ( preempted )
3758 /* Not finished. Set up to re-run the call. */
3759 rc = hypercall_create_continuation(
3760 __HYPERVISOR_domctl, "h", u_domctl);
3761 else
3762 /* Finished. Return the new allocation */
3763 sc->mb = shadow_get_allocation(d);
3764 return rc;
3766 default:
3767 SHADOW_ERROR("Bad shadow op %u\n", sc->op);
3768 return -EINVAL;
3769 }
3770 }
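/* Editorial sketch, not part of the original file: the SET_ALLOCATION case
 * above converts the caller-supplied size in MB into a page count with
 * "sc->mb << (20 - PAGE_SHIFT)". On x86 with 4 KiB pages (PAGE_SHIFT == 12)
 * that is a left shift by 8, i.e. 256 pages per MB, so e.g. sc->mb == 16
 * requests 4096 shadow pages. The helper below only illustrates the
 * arithmetic; it is not part of the hypervisor interface. */
static inline unsigned long shadow_mb_to_pages(unsigned long mb)
{
    return mb << (20 - 12 /* PAGE_SHIFT for 4 KiB pages */);
}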
3773 /**************************************************************************/
3774 /* Auditing shadow tables */
3776 #if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_FULL
3778 void shadow_audit_tables(struct vcpu *v)
3779 {
3780 /* Dispatch table for getting per-type functions */
3781 static hash_callback_t callbacks[SH_type_unused] = {
3782 NULL, /* none */
3783 SHADOW_INTERNAL_NAME(sh_audit_l1_table, 2), /* l1_32 */
3784 SHADOW_INTERNAL_NAME(sh_audit_fl1_table, 2), /* fl1_32 */
3785 SHADOW_INTERNAL_NAME(sh_audit_l2_table, 2), /* l2_32 */
3786 SHADOW_INTERNAL_NAME(sh_audit_l1_table, 3), /* l1_pae */
3787 SHADOW_INTERNAL_NAME(sh_audit_fl1_table, 3), /* fl1_pae */
3788 SHADOW_INTERNAL_NAME(sh_audit_l2_table, 3), /* l2_pae */
3789 SHADOW_INTERNAL_NAME(sh_audit_l2_table, 3), /* l2h_pae */
3790 #if CONFIG_PAGING_LEVELS >= 4
3791 SHADOW_INTERNAL_NAME(sh_audit_l1_table, 4), /* l1_64 */
3792 SHADOW_INTERNAL_NAME(sh_audit_fl1_table, 4), /* fl1_64 */
3793 SHADOW_INTERNAL_NAME(sh_audit_l2_table, 4), /* l2_64 */
3794 SHADOW_INTERNAL_NAME(sh_audit_l2_table, 4), /* l2h_64 */
3795 SHADOW_INTERNAL_NAME(sh_audit_l3_table, 4), /* l3_64 */
3796 SHADOW_INTERNAL_NAME(sh_audit_l4_table, 4), /* l4_64 */
3797 #endif /* CONFIG_PAGING_LEVELS >= 4 */
3798 NULL /* All the rest */
3799 };
3800 unsigned int mask;
3802 if ( !(SHADOW_AUDIT_ENABLE) )
3803 return;
3805 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
3806 sh_oos_audit(v->domain);
3807 #endif
3809 if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_FULL )
3810 mask = ~1; /* Audit every table in the system */
3811 else
3812 {
3813 /* Audit only the current mode's tables */
3814 switch ( v->arch.paging.mode->guest_levels )
3815 {
3816 case 2: mask = (SHF_L1_32|SHF_FL1_32|SHF_L2_32); break;
3817 case 3: mask = (SHF_L1_PAE|SHF_FL1_PAE|SHF_L2_PAE
3818 |SHF_L2H_PAE); break;
3819 case 4: mask = (SHF_L1_64|SHF_FL1_64|SHF_L2_64
3820 |SHF_L3_64|SHF_L4_64); break;
3821 default: BUG();
3822 }
3823 }
3825 hash_foreach(v, ~1, callbacks, _mfn(INVALID_MFN));
3826 }
3828 #endif /* Shadow audit */
3830 /*
3831 * Local variables:
3832 * mode: C
3833 * c-set-style: "BSD"
3834 * c-basic-offset: 4
3835 * indent-tabs-mode: nil
3836 * End:
3837 */