direct-io.hg: view xen/arch/x86/mm/shadow/common.c @ 11648:5f42b4824e45

[XEN] Fix interaction between tlbflush timestamp and shadow flags
Signed-off-by: Tim Deegan <Tim.Deegan@xensource.com>

author   Tim Deegan <tim.deegan@xensource.com>
date     Thu Sep 28 17:09:11 2006 +0100 (2006-09-28)
parents  0e9055d69f12
children b6ee084892da

line source
1 /******************************************************************************
2 * arch/x86/mm/shadow/common.c
3 *
4 * Shadow code that does not need to be multiply compiled.
5 * Parts of this code are Copyright (c) 2006 by XenSource Inc.
6 * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
7 * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 */
24 #include <xen/config.h>
25 #include <xen/types.h>
26 #include <xen/mm.h>
27 #include <xen/trace.h>
28 #include <xen/sched.h>
29 #include <xen/perfc.h>
30 #include <xen/irq.h>
31 #include <xen/domain_page.h>
32 #include <xen/guest_access.h>
33 #include <xen/keyhandler.h>
34 #include <asm/event.h>
35 #include <asm/page.h>
36 #include <asm/current.h>
37 #include <asm/flushtlb.h>
38 #include <asm/shadow.h>
39 #include "private.h"
41 #if SHADOW_AUDIT
42 int shadow_audit_enable = 0;
44 static void shadow_audit_key(unsigned char key)
45 {
46 shadow_audit_enable = !shadow_audit_enable;
47 printk("%s shadow_audit_enable=%d\n",
48 __func__, shadow_audit_enable);
49 }
51 static int __init shadow_audit_key_init(void)
52 {
53 register_keyhandler(
54 'O', shadow_audit_key, "toggle shadow audits");
55 return 0;
56 }
57 __initcall(shadow_audit_key_init);
58 #endif /* SHADOW_AUDIT */
60 static void sh_free_log_dirty_bitmap(struct domain *d);
62 int _shadow_mode_refcounts(struct domain *d)
63 {
64 return shadow_mode_refcounts(d);
65 }
68 /**************************************************************************/
69 /* x86 emulator support for the shadow code
70 */
72 static int
73 sh_x86_emulate_read_std(unsigned long addr,
74 unsigned long *val,
75 unsigned int bytes,
76 struct x86_emulate_ctxt *ctxt)
77 {
78 struct vcpu *v = current;
79 if ( hvm_guest(v) )
80 {
81 *val = 0;
82 // XXX -- this is WRONG.
83 // It entirely ignores the permissions in the page tables.
84 // In this case, that is only a user vs supervisor access check.
85 //
86 if ( hvm_copy(val, addr, bytes, HVM_COPY_IN) )
87 {
88 #if 0
89 SHADOW_PRINTK("d=%u v=%u a=%#lx v=%#lx bytes=%u\n",
90 v->domain->domain_id, v->vcpu_id,
91 addr, *val, bytes);
92 #endif
93 return X86EMUL_CONTINUE;
94 }
96 /* If we got here, there was nothing mapped here, or a bad GFN
97 * was mapped here. This should never happen: we're here because
98 * of a write fault at the end of the instruction we're emulating. */
99 SHADOW_PRINTK("read failed to va %#lx\n", addr);
100 return X86EMUL_PROPAGATE_FAULT;
101 }
102 else
103 {
104 SHADOW_PRINTK("this operation is not emulated yet\n");
105 return X86EMUL_UNHANDLEABLE;
106 }
107 }
109 static int
110 sh_x86_emulate_write_std(unsigned long addr,
111 unsigned long val,
112 unsigned int bytes,
113 struct x86_emulate_ctxt *ctxt)
114 {
115 struct vcpu *v = current;
116 #if 0
117 SHADOW_PRINTK("d=%u v=%u a=%#lx v=%#lx bytes=%u\n",
118 v->domain->domain_id, v->vcpu_id, addr, val, bytes);
119 #endif
120 if ( hvm_guest(v) )
121 {
122 // XXX -- this is WRONG.
123 // It entirely ignores the permissions in the page tables.
124 // In this case, that includes user vs supervisor, and
125 // write access.
126 //
127 if ( hvm_copy(&val, addr, bytes, HVM_COPY_OUT) )
128 return X86EMUL_CONTINUE;
130 /* If we got here, there was nothing mapped here, or a bad GFN
131 * was mapped here. This should never happen: we're here because
132 * of a write fault at the end of the instruction we're emulating,
133 * which should be handled by sh_x86_emulate_write_emulated. */
134 SHADOW_PRINTK("write failed to va %#lx\n", addr);
135 return X86EMUL_PROPAGATE_FAULT;
136 }
137 else
138 {
139 SHADOW_PRINTK("this operation is not emulated yet\n");
140 return X86EMUL_UNHANDLEABLE;
141 }
142 }
144 static int
145 sh_x86_emulate_write_emulated(unsigned long addr,
146 unsigned long val,
147 unsigned int bytes,
148 struct x86_emulate_ctxt *ctxt)
149 {
150 struct vcpu *v = current;
151 #if 0
152 SHADOW_PRINTK("d=%u v=%u a=%#lx v=%#lx bytes=%u\n",
153 v->domain->domain_id, v->vcpu_id, addr, val, bytes);
154 #endif
155 if ( hvm_guest(v) )
156 {
157 return v->arch.shadow.mode->x86_emulate_write(v, addr, &val, bytes, ctxt);
158 }
159 else
160 {
161 SHADOW_PRINTK("this operation is not emulated yet\n");
162 return X86EMUL_UNHANDLEABLE;
163 }
164 }
166 static int
167 sh_x86_emulate_cmpxchg_emulated(unsigned long addr,
168 unsigned long old,
169 unsigned long new,
170 unsigned int bytes,
171 struct x86_emulate_ctxt *ctxt)
172 {
173 struct vcpu *v = current;
174 #if 0
175 SHADOW_PRINTK("d=%u v=%u a=%#lx o?=%#lx n:=%#lx bytes=%u\n",
176 v->domain->domain_id, v->vcpu_id, addr, old, new, bytes);
177 #endif
178 if ( hvm_guest(v) )
179 {
180 return v->arch.shadow.mode->x86_emulate_cmpxchg(v, addr, old, new,
181 bytes, ctxt);
182 }
183 else
184 {
185 SHADOW_PRINTK("this operation is not emulated yet\n");
186 return X86EMUL_UNHANDLEABLE;
187 }
188 }
190 static int
191 sh_x86_emulate_cmpxchg8b_emulated(unsigned long addr,
192 unsigned long old_lo,
193 unsigned long old_hi,
194 unsigned long new_lo,
195 unsigned long new_hi,
196 struct x86_emulate_ctxt *ctxt)
197 {
198 struct vcpu *v = current;
199 #if 0
200 SHADOW_PRINTK("d=%u v=%u a=%#lx o?=%#lx:%lx n:=%#lx:%lx\n",
201 v->domain->domain_id, v->vcpu_id, addr, old_hi, old_lo,
202 new_hi, new_lo, ctxt);
203 #endif
204 if ( hvm_guest(v) )
205 {
206 return v->arch.shadow.mode->x86_emulate_cmpxchg8b(v, addr, old_lo, old_hi,
207 new_lo, new_hi, ctxt);
208 }
209 else
210 {
211 SHADOW_PRINTK("this operation is not emulated yet\n");
212 return X86EMUL_UNHANDLEABLE;
213 }
214 }
217 struct x86_emulate_ops shadow_emulator_ops = {
218 .read_std = sh_x86_emulate_read_std,
219 .write_std = sh_x86_emulate_write_std,
220 .read_emulated = sh_x86_emulate_read_std,
221 .write_emulated = sh_x86_emulate_write_emulated,
222 .cmpxchg_emulated = sh_x86_emulate_cmpxchg_emulated,
223 .cmpxchg8b_emulated = sh_x86_emulate_cmpxchg8b_emulated,
224 };
226 /**************************************************************************/
227 /* Code for "promoting" a guest page to the point where the shadow code is
228 * willing to let it be treated as a guest page table. This generally
229 * involves making sure there are no writable mappings available to the guest
230 * for this page.
231 */
232 void shadow_promote(struct vcpu *v, mfn_t gmfn, u32 type)
233 {
234 struct page_info *page = mfn_to_page(gmfn);
236 ASSERT(valid_mfn(gmfn));
238 /* We should never try to promote a gmfn that has writeable mappings */
239 ASSERT(shadow_remove_write_access(v, gmfn, 0, 0) == 0);
241 /* Is the page already shadowed? */
242 if ( !test_and_set_bit(_PGC_page_table, &page->count_info) )
243 page->shadow_flags = 0;
245 ASSERT(!test_bit(type >> PGC_SH_type_shift, &page->shadow_flags));
246 set_bit(type >> PGC_SH_type_shift, &page->shadow_flags);
247 }
249 void shadow_demote(struct vcpu *v, mfn_t gmfn, u32 type)
250 {
251 struct page_info *page = mfn_to_page(gmfn);
253 ASSERT(test_bit(_PGC_page_table, &page->count_info));
254 ASSERT(test_bit(type >> PGC_SH_type_shift, &page->shadow_flags));
256 clear_bit(type >> PGC_SH_type_shift, &page->shadow_flags);
258 if ( (page->shadow_flags & SHF_page_type_mask) == 0 )
259 {
260 /* tlbflush timestamp field is valid again */
261 page->tlbflush_timestamp = tlbflush_current_time();
262 clear_bit(_PGC_page_table, &page->count_info);
263 }
264 }
266 /**************************************************************************/
267 /* Validate a pagetable change from the guest and update the shadows.
268 * Returns a bitmask of SHADOW_SET_* flags. */
270 static int
271 __shadow_validate_guest_entry(struct vcpu *v, mfn_t gmfn,
272 void *entry, u32 size)
273 {
274 int result = 0;
275 struct page_info *page = mfn_to_page(gmfn);
277 sh_mark_dirty(v->domain, gmfn);
279 // Determine which types of shadows are affected, and update each.
280 //
281 // Always validate L1s before L2s to prevent another cpu with a linear
282 // mapping of this gmfn from seeing a walk that results from
283 // using the new L2 value and the old L1 value. (It is OK for such a
284 // guest to see a walk that uses the old L2 value with the new L1 value,
285 // as hardware could behave this way if one level of the pagewalk occurs
286 // before the store, and the next level of the pagewalk occurs after the
287 // store.)
288 //
289 // Ditto for L2s before L3s, etc.
290 //
292 if ( !(page->count_info & PGC_page_table) )
293 return 0; /* Not shadowed at all */
295 #if CONFIG_PAGING_LEVELS == 2
296 if ( page->shadow_flags & SHF_L1_32 )
297 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 2, 2)
298 (v, gmfn, entry, size);
299 #else
300 if ( page->shadow_flags & SHF_L1_32 )
301 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 3, 2)
302 (v, gmfn, entry, size);
303 #endif
305 #if CONFIG_PAGING_LEVELS == 2
306 if ( page->shadow_flags & SHF_L2_32 )
307 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 2, 2)
308 (v, gmfn, entry, size);
309 #else
310 if ( page->shadow_flags & SHF_L2_32 )
311 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 3, 2)
312 (v, gmfn, entry, size);
313 #endif
315 #if CONFIG_PAGING_LEVELS >= 3
316 if ( page->shadow_flags & SHF_L1_PAE )
317 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 3, 3)
318 (v, gmfn, entry, size);
319 if ( page->shadow_flags & SHF_L2_PAE )
320 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 3, 3)
321 (v, gmfn, entry, size);
322 if ( page->shadow_flags & SHF_L2H_PAE )
323 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2he, 3, 3)
324 (v, gmfn, entry, size);
325 if ( page->shadow_flags & SHF_L3_PAE )
326 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl3e, 3, 3)
327 (v, gmfn, entry, size);
328 #else /* 32-bit non-PAE hypervisor does not support PAE guests */
329 ASSERT((page->shadow_flags & (SHF_L3_PAE|SHF_L2_PAE|SHF_L1_PAE)) == 0);
330 #endif
332 #if CONFIG_PAGING_LEVELS >= 4
333 if ( page->shadow_flags & SHF_L1_64 )
334 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 4, 4)
335 (v, gmfn, entry, size);
336 if ( page->shadow_flags & SHF_L2_64 )
337 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 4, 4)
338 (v, gmfn, entry, size);
339 if ( page->shadow_flags & SHF_L3_64 )
340 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl3e, 4, 4)
341 (v, gmfn, entry, size);
342 if ( page->shadow_flags & SHF_L4_64 )
343 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl4e, 4, 4)
344 (v, gmfn, entry, size);
345 #else /* 32-bit/PAE hypervisor does not support 64-bit guests */
346 ASSERT((page->shadow_flags
347 & (SHF_L4_64|SHF_L3_64|SHF_L2_64|SHF_L1_64)) == 0);
348 #endif
350 return result;
351 }
354 int
355 shadow_validate_guest_entry(struct vcpu *v, mfn_t gmfn, void *entry)
356 /* This is the entry point from hypercalls. It returns a bitmask of all the
357 * results of shadow_set_l*e() calls, so the caller knows to do TLB flushes. */
358 {
359 int rc;
361 ASSERT(shadow_lock_is_acquired(v->domain));
362 rc = __shadow_validate_guest_entry(v, gmfn, entry, sizeof(l1_pgentry_t));
363 shadow_audit_tables(v);
364 return rc;
365 }
367 void
368 shadow_validate_guest_pt_write(struct vcpu *v, mfn_t gmfn,
369 void *entry, u32 size)
370 /* This is the entry point for emulated writes to pagetables in HVM guests */
371 {
372 struct domain *d = v->domain;
373 int rc;
375 ASSERT(shadow_lock_is_acquired(v->domain));
376 rc = __shadow_validate_guest_entry(v, gmfn, entry, size);
377 if ( rc & SHADOW_SET_FLUSH )
378 /* Need to flush TLBs to pick up shadow PT changes */
379 flush_tlb_mask(d->domain_dirty_cpumask);
380 if ( rc & SHADOW_SET_ERROR )
381 {
382 /* This page is probably not a pagetable any more: tear it out of the
383 * shadows, along with any tables that reference it */
384 shadow_remove_all_shadows_and_parents(v, gmfn);
385 }
386 }
389 /**************************************************************************/
390 /* Memory management for shadow pages. */
392 /* Meaning of the count_info field in shadow pages
393 * ----------------------------------------------
394 *
395 * A count of all references to this page from other shadow pages and
396 * guest CR3s (a.k.a. v->arch.shadow.table).
397 *
398 * The top bits hold the shadow type and the pinned bit. Top-level
399 * shadows are pinned so that they don't disappear when not in a CR3
400 * somewhere.
401 *
402 * We don't need to use get|put_page for this as the updates are all
403 * protected by the shadow lock. We can't use get|put_page for this
404 * as the size of the count on shadow pages is different from that on
405 * normal guest pages.
406 */
408 /* Meaning of the type_info field in shadow pages
409 * ----------------------------------------------
410 *
411 * type_info use depends on the shadow type (from count_info)
412 *
413 * PGC_SH_none : This page is in the shadow free pool. type_info holds
414 * the chunk order for our freelist allocator.
415 *
416 * PGC_SH_l*_shadow : This page is in use as a shadow. type_info
417 * holds the mfn of the guest page being shadowed,
418 *
419 * PGC_SH_fl1_*_shadow : This page is being used to shatter a superpage.
420 * type_info holds the gfn being shattered.
421 *
422 * PGC_SH_monitor_table : This page is part of a monitor table.
423 * type_info is not used.
424 */
426 /* Meaning of the _domain field in shadow pages
427 * --------------------------------------------
428 *
429 * In shadow pages, this field will always have its least significant bit
430 * set. This ensures that all attempts to get_page() will fail (as all
431 * valid pickled domain pointers have a zero for their least significant bit).
432 * Instead, the remaining upper bits are used to record the shadow generation
433 * counter when the shadow was created.
434 */
436 /* Meaning of the shadow_flags field
437 * ----------------------------------
438 *
439 * In guest pages that are shadowed, one bit for each kind of shadow they have.
440 *
441 * In shadow pages, will be used for holding a representation of the populated
442 * entries in this shadow (either a min/max, or a bitmap, or ...)
443 *
444 * In monitor-table pages, holds the level of the particular page (to save
445 * spilling the shadow types into an extra bit by having three types of monitor
446 * page).
447 */
449 /* Meaning of the list_head struct in shadow pages
450 * -----------------------------------------------
451 *
452 * In free shadow pages, this is used to hold the free-lists of chunks.
453 *
454 * In top-level shadow tables, this holds a linked-list of all top-level
455 * shadows (used for recovering memory and destroying shadows).
456 *
457 * In lower-level shadows, this holds the physical address of a higher-level
458 * shadow entry that holds a reference to this shadow (or zero).
459 */
461 /* Allocating shadow pages
462 * -----------------------
463 *
464 * Most shadow pages are allocated singly, but there are two cases where we
465 * need to allocate multiple pages together.
466 *
467 * 1: Shadowing 32-bit guest tables on PAE or 64-bit shadows.
468 * A 32-bit guest l1 table covers 4MB of virtual address space,
469 * and needs to be shadowed by two PAE/64-bit l1 tables (covering 2MB
470 * of virtual address space each). Similarly, a 32-bit guest l2 table
471 * (4GB va) needs to be shadowed by four PAE/64-bit l2 tables (1GB va
472 * each). These multi-page shadows are contiguous and aligned;
473 * functions for handling offsets into them are defined in shadow.c
474 * (shadow_l1_index() etc.)
475 *
476 * 2: Shadowing PAE top-level pages. Each guest page that contains
477 * any PAE top-level pages requires two shadow pages to shadow it.
478 * They contain alternating l3 tables and pae_l3_bookkeeping structs.
479 *
480 * This table shows the allocation behaviour of the different modes:
481 *
482 * Xen paging 32b pae pae 64b 64b 64b
483 * Guest paging 32b 32b pae 32b pae 64b
484 * PV or HVM * HVM * HVM HVM *
485 * Shadow paging 32b pae pae pae pae 64b
486 *
487 * sl1 size 4k 8k 4k 8k 4k 4k
488 * sl2 size 4k 16k 4k 16k 4k 4k
489 * sl3 size - - 8k - 8k 4k
490 * sl4 size - - - - - 4k
491 *
492 * We allocate memory from xen in four-page units and break them down
493 * with a simple buddy allocator. Can't use the xen allocator to handle
494 * this as it only works for contiguous zones, and a domain's shadow
495 * pool is made of fragments.
496 *
497 * In HVM guests, the p2m table is built out of shadow pages, and we provide
498 * a function for the p2m management to steal pages, in max-order chunks, from
499 * the free pool. We don't provide for giving them back, yet.
500 */
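/* The split-to-order behaviour described above (four-page chunks from Xen,
 * halved by a simple buddy-style allocator until the requested order is
 * reached) can be modelled in isolation.  This is a minimal, self-contained
 * sketch, not the Xen allocator: it only counts free chunks per order
 * instead of keeping them on list_heads, and TOY_MAX_ORDER/toy_alloc are
 * illustration-only names assuming a max order of 2 (four pages). */
#include <stdio.h>

#define TOY_MAX_ORDER 2                 /* assumed: four-page chunks */

static unsigned int freelist[TOY_MAX_ORDER + 1];  /* free chunks at each order */

/* Take one chunk of 'order', splitting a larger chunk if necessary.
 * Mirrors the "while ( i != order )" halving loop in shadow_alloc() below. */
static int toy_alloc(unsigned int order)
{
    unsigned int i;
    for ( i = order; i <= TOY_MAX_ORDER; i++ )
        if ( freelist[i] )
            break;
    if ( i > TOY_MAX_ORDER )
        return -1;                      /* nothing free: prealloc was skipped */
    freelist[i]--;
    while ( i != order )
    {
        i--;
        freelist[i]++;                  /* put the unused half back at order i */
    }
    return 0;
}

int main(void)
{
    freelist[TOY_MAX_ORDER] = 1;        /* one four-page chunk from Xen */
    toy_alloc(0);                       /* take a single page */
    printf("after order-0 alloc: order0=%u order1=%u order2=%u\n",
           freelist[0], freelist[1], freelist[2]);   /* prints: 1 1 0 */
    return 0;
}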
502 /* Figure out the least acceptable quantity of shadow memory.
503 * The minimum memory requirement for always being able to free up a
504 * chunk of memory is very small -- only three max-order chunks per
505 * vcpu to hold the top level shadows and pages with Xen mappings in them.
506 *
507 * But for a guest to be guaranteed to successfully execute a single
508 * instruction, we must be able to map a large number (about thirty) VAs
509 * at the same time, which means that to guarantee progress, we must
510 * allow for more than ninety allocated pages per vcpu. We round that
511 * up to 128 pages, or half a megabyte per vcpu. */
512 unsigned int shadow_min_acceptable_pages(struct domain *d)
513 {
514 u32 vcpu_count = 0;
515 struct vcpu *v;
517 for_each_vcpu(d, v)
518 vcpu_count++;
520 return (vcpu_count * 128);
521 }
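/* A worked example of the arithmetic above: at 128 pages (512kB) of shadow
 * memory per vcpu, a 4-vcpu domain must keep at least 512 pages (2MB) to be
 * guaranteed to make progress.  A minimal standalone check of that figure
 * (the vcpu count is just an example): */
#include <stdio.h>

int main(void)
{
    unsigned int vcpus = 4;                      /* example domain */
    unsigned int min_pages = vcpus * 128;        /* as in shadow_min_acceptable_pages() */
    printf("%u vcpus -> %u pages -> %u kB\n",
           vcpus, min_pages, min_pages * 4);     /* 4kB pages: 512 pages = 2048 kB */
    return 0;
}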
523 /* Using the type_info field to store freelist order */
524 #define SH_PFN_ORDER(_p) ((_p)->u.inuse.type_info)
525 #define SH_SET_PFN_ORDER(_p, _o) \
526 do { (_p)->u.inuse.type_info = (_o); } while (0)
529 /* Figure out the order of allocation needed for a given shadow type */
530 static inline u32
531 shadow_order(u32 shadow_type)
532 {
533 #if CONFIG_PAGING_LEVELS > 2
534 static const u32 type_to_order[16] = {
535 0, /* PGC_SH_none */
536 1, /* PGC_SH_l1_32_shadow */
537 1, /* PGC_SH_fl1_32_shadow */
538 2, /* PGC_SH_l2_32_shadow */
539 0, /* PGC_SH_l1_pae_shadow */
540 0, /* PGC_SH_fl1_pae_shadow */
541 0, /* PGC_SH_l2_pae_shadow */
542 0, /* PGC_SH_l2h_pae_shadow */
543 1, /* PGC_SH_l3_pae_shadow */
544 0, /* PGC_SH_l1_64_shadow */
545 0, /* PGC_SH_fl1_64_shadow */
546 0, /* PGC_SH_l2_64_shadow */
547 0, /* PGC_SH_l3_64_shadow */
548 0, /* PGC_SH_l4_64_shadow */
549 2, /* PGC_SH_p2m_table */
550 0 /* PGC_SH_monitor_table */
551 };
552 u32 type = (shadow_type & PGC_SH_type_mask) >> PGC_SH_type_shift;
553 return type_to_order[type];
554 #else /* 32-bit Xen only ever shadows 32-bit guests on 32-bit shadows. */
555 return 0;
556 #endif
557 }
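/* The orders in type_to_order[] above are log2 of the number of 4kB pages
 * per shadow, so they reproduce the size table in the "Allocating shadow
 * pages" comment: order 1 -> 8kB (sl1 for 32-bit guests on PAE shadows),
 * order 2 -> 16kB (sl2 for 32-bit guests).  A minimal standalone check of
 * that arithmetic: */
#include <stdio.h>

int main(void)
{
    unsigned int order;
    for ( order = 0; order <= 2; order++ )
        printf("order %u -> %u pages -> %u kB\n",
               order, 1u << order, (1u << order) * 4);
    return 0;
}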
560 /* Do we have a free chunk of at least this order? */
561 static inline int chunk_is_available(struct domain *d, int order)
562 {
563 int i;
565 for ( i = order; i <= SHADOW_MAX_ORDER; i++ )
566 if ( !list_empty(&d->arch.shadow.freelists[i]) )
567 return 1;
568 return 0;
569 }
571 /* Dispatcher function: call the per-mode function that will unhook the
572 * non-Xen mappings in this top-level shadow mfn */
573 void shadow_unhook_mappings(struct vcpu *v, mfn_t smfn)
574 {
575 struct page_info *pg = mfn_to_page(smfn);
576 switch ( (pg->count_info & PGC_SH_type_mask) >> PGC_SH_type_shift )
577 {
578 case PGC_SH_l2_32_shadow >> PGC_SH_type_shift:
579 #if CONFIG_PAGING_LEVELS == 2
580 SHADOW_INTERNAL_NAME(sh_unhook_32b_mappings,2,2)(v,smfn);
581 #else
582 SHADOW_INTERNAL_NAME(sh_unhook_32b_mappings,3,2)(v,smfn);
583 #endif
584 break;
585 #if CONFIG_PAGING_LEVELS >= 3
586 case PGC_SH_l3_pae_shadow >> PGC_SH_type_shift:
587 SHADOW_INTERNAL_NAME(sh_unhook_pae_mappings,3,3)(v,smfn);
588 break;
589 #endif
590 #if CONFIG_PAGING_LEVELS >= 4
591 case PGC_SH_l4_64_shadow >> PGC_SH_type_shift:
592 SHADOW_INTERNAL_NAME(sh_unhook_64b_mappings,4,4)(v,smfn);
593 break;
594 #endif
595 default:
596 SHADOW_PRINTK("top-level shadow has bad type %08lx\n",
597 (unsigned long)((pg->count_info & PGC_SH_type_mask)
598 >> PGC_SH_type_shift));
599 BUG();
600 }
601 }
604 /* Make sure there is at least one chunk of the required order available
605 * in the shadow page pool. This must be called before any calls to
606 * shadow_alloc(). Since this will free existing shadows to make room,
607 * it must be called early enough to avoid freeing shadows that the
608 * caller is currently working on. */
609 void shadow_prealloc(struct domain *d, unsigned int order)
610 {
611 /* Need a vcpu for calling unpins; for now, since we don't have
612 * per-vcpu shadows, any will do */
613 struct vcpu *v = d->vcpu[0];
614 struct list_head *l, *t;
615 struct page_info *pg;
616 mfn_t smfn;
618 if ( chunk_is_available(d, order) ) return;
620 /* Stage one: walk the list of top-level pages, unpinning them */
621 perfc_incrc(shadow_prealloc_1);
622 list_for_each_backwards_safe(l, t, &d->arch.shadow.toplevel_shadows)
623 {
624 pg = list_entry(l, struct page_info, list);
625 smfn = page_to_mfn(pg);
627 #if CONFIG_PAGING_LEVELS >= 3
628 if ( (pg->count_info & PGC_SH_type_mask) == PGC_SH_l3_pae_shadow )
629 {
630 /* For PAE, we need to unpin each subshadow on this shadow */
631 SHADOW_INTERNAL_NAME(sh_unpin_all_l3_subshadows,3,3)(v, smfn);
632 }
633 else
634 #endif /* 32-bit code always takes this branch */
635 {
636 /* Unpin this top-level shadow */
637 sh_unpin(v, smfn);
638 }
640 /* See if that freed up a chunk of appropriate size */
641 if ( chunk_is_available(d, order) ) return;
642 }
644 /* Stage two: all shadow pages are in use in hierarchies that are
645 * loaded in cr3 on some vcpu. Walk them, unhooking the non-Xen
646 * mappings. */
647 perfc_incrc(shadow_prealloc_2);
648 v = current;
649 if ( v->domain != d )
650 v = d->vcpu[0];
651 /* Walk the list from the tail: recently used toplevels have been pulled
652 * to the head */
653 list_for_each_backwards_safe(l, t, &d->arch.shadow.toplevel_shadows)
654 {
655 pg = list_entry(l, struct page_info, list);
656 smfn = page_to_mfn(pg);
657 shadow_unhook_mappings(v, smfn);
659 /* Need to flush TLB if we've altered our own tables */
660 if ( !shadow_mode_external(d)
661 && pagetable_get_pfn(current->arch.shadow_table) == mfn_x(smfn) )
662 local_flush_tlb();
664 /* See if that freed up a chunk of appropriate size */
665 if ( chunk_is_available(d, order) ) return;
666 }
668 /* Nothing more we can do: all remaining shadows are of pages that
669 * hold Xen mappings for some vcpu. This can never happen. */
670 SHADOW_PRINTK("Can't pre-allocate %i shadow pages!\n"
671 " shadow pages total = %u, free = %u, p2m=%u\n",
672 1 << order,
673 d->arch.shadow.total_pages,
674 d->arch.shadow.free_pages,
675 d->arch.shadow.p2m_pages);
676 BUG();
677 }
680 /* Allocate another shadow's worth of (contiguous, aligned) pages,
681 * and fill in the type and backpointer fields of their page_infos.
682 * Never fails to allocate. */
683 mfn_t shadow_alloc(struct domain *d,
684 u32 shadow_type,
685 unsigned long backpointer)
686 {
687 struct page_info *pg = NULL;
688 unsigned int order = shadow_order(shadow_type);
689 cpumask_t mask;
690 void *p;
691 int i;
693 ASSERT(shadow_lock_is_acquired(d));
694 ASSERT(order <= SHADOW_MAX_ORDER);
695 ASSERT(shadow_type != PGC_SH_none);
696 perfc_incrc(shadow_alloc);
698 /* Find smallest order which can satisfy the request. */
699 for ( i = order; i <= SHADOW_MAX_ORDER; i++ )
700 if ( !list_empty(&d->arch.shadow.freelists[i]) )
701 {
702 pg = list_entry(d->arch.shadow.freelists[i].next,
703 struct page_info, list);
704 list_del(&pg->list);
706 /* We may have to halve the chunk a number of times. */
707 while ( i != order )
708 {
709 i--;
710 SH_SET_PFN_ORDER(pg, i);
711 list_add_tail(&pg->list, &d->arch.shadow.freelists[i]);
712 pg += 1 << i;
713 }
714 d->arch.shadow.free_pages -= 1 << order;
716 /* Init page info fields and clear the pages */
717 for ( i = 0; i < 1<<order ; i++ )
718 {
719 pg[i].u.inuse.type_info = backpointer;
720 pg[i].count_info = shadow_type;
721 pg[i].shadow_flags = 0;
722 INIT_LIST_HEAD(&pg[i].list);
723 /* Before we overwrite the old contents of this page,
724 * we need to be sure that no TLB holds a pointer to it. */
725 mask = d->domain_dirty_cpumask;
726 tlbflush_filter(mask, pg[i].tlbflush_timestamp);
727 if ( unlikely(!cpus_empty(mask)) )
728 {
729 perfc_incrc(shadow_alloc_tlbflush);
730 flush_tlb_mask(mask);
731 }
732 /* Now safe to clear the page for reuse */
733 p = sh_map_domain_page(page_to_mfn(pg+i));
734 ASSERT(p != NULL);
735 clear_page(p);
736 sh_unmap_domain_page(p);
737 perfc_incr(shadow_alloc_count);
738 }
739 return page_to_mfn(pg);
740 }
742 /* If we get here, we failed to allocate. This should never happen.
743 * It means that we didn't call shadow_prealloc() correctly before
744 * we allocated. We can't recover by calling prealloc here, because
745 * we might free up higher-level pages that the caller is working on. */
746 SHADOW_PRINTK("Can't allocate %i shadow pages!\n", 1 << order);
747 BUG();
748 }
751 /* Return some shadow pages to the pool. */
752 void shadow_free(struct domain *d, mfn_t smfn)
753 {
754 struct page_info *pg = mfn_to_page(smfn);
755 u32 shadow_type;
756 unsigned long order;
757 unsigned long mask;
758 int i;
760 ASSERT(shadow_lock_is_acquired(d));
761 perfc_incrc(shadow_free);
763 shadow_type = pg->count_info & PGC_SH_type_mask;
764 ASSERT(shadow_type != PGC_SH_none);
765 ASSERT(shadow_type != PGC_SH_p2m_table);
766 order = shadow_order(shadow_type);
768 d->arch.shadow.free_pages += 1 << order;
770 for ( i = 0; i < 1<<order; i++ )
771 {
772 /* Strip out the type: this is now a free shadow page */
773 pg[i].count_info = 0;
774 /* Remember the TLB timestamp so we will know whether to flush
775 * TLBs when we reuse the page. Because the destructors leave the
776 * contents of the pages in place, we can delay TLB flushes until
777 * just before the allocator hands the page out again. */
778 pg[i].tlbflush_timestamp = tlbflush_current_time();
779 perfc_decr(shadow_alloc_count);
780 }
782 /* Merge chunks as far as possible. */
783 while ( order < SHADOW_MAX_ORDER )
784 {
785 mask = 1 << order;
786 if ( (mfn_x(page_to_mfn(pg)) & mask) ) {
787 /* Merge with predecessor block? */
788 if ( (((pg-mask)->count_info & PGC_SH_type_mask) != PGT_none)
789 || (SH_PFN_ORDER(pg-mask) != order) )
790 break;
791 list_del(&(pg-mask)->list);
792 pg -= mask;
793 } else {
794 /* Merge with successor block? */
795 if ( (((pg+mask)->count_info & PGC_SH_type_mask) != PGT_none)
796 || (SH_PFN_ORDER(pg+mask) != order) )
797 break;
798 list_del(&(pg+mask)->list);
799 }
800 order++;
801 }
803 SH_SET_PFN_ORDER(pg, order);
804 list_add_tail(&pg->list, &d->arch.shadow.freelists[order]);
805 }
807 /* Divert some memory from the pool to be used by the p2m mapping.
808 * This action is irreversible: the p2m mapping only ever grows.
809 * That's OK because the p2m table only exists for external domains,
810 * and those domains can't ever turn off shadow mode.
811 * Also, we only ever allocate a max-order chunk, so as to preserve
812 * the invariant that shadow_prealloc() always works.
813 * Returns 0 iff it can't get a chunk (the caller should then
814 * free up some pages in domheap and call set_sh_allocation);
815 * returns non-zero on success.
816 */
817 static int
818 shadow_alloc_p2m_pages(struct domain *d)
819 {
820 struct page_info *pg;
821 u32 i;
822 ASSERT(shadow_lock_is_acquired(d));
824 if ( d->arch.shadow.total_pages
825 < (shadow_min_acceptable_pages(d) + (1<<SHADOW_MAX_ORDER)) )
826 return 0; /* Not enough shadow memory: need to increase it first */
828 pg = mfn_to_page(shadow_alloc(d, PGC_SH_p2m_table, 0));
829 d->arch.shadow.p2m_pages += (1<<SHADOW_MAX_ORDER);
830 d->arch.shadow.total_pages -= (1<<SHADOW_MAX_ORDER);
831 for (i = 0; i < (1<<SHADOW_MAX_ORDER); i++)
832 {
833 /* Unlike shadow pages, mark p2m pages as owned by the domain */
834 page_set_owner(&pg[i], d);
835 list_add_tail(&pg[i].list, &d->arch.shadow.p2m_freelist);
836 }
837 return 1;
838 }
840 // Returns 0 if no memory is available...
841 mfn_t
842 shadow_alloc_p2m_page(struct domain *d)
843 {
844 struct list_head *entry;
845 mfn_t mfn;
846 void *p;
848 if ( list_empty(&d->arch.shadow.p2m_freelist) &&
849 !shadow_alloc_p2m_pages(d) )
850 return _mfn(0);
851 entry = d->arch.shadow.p2m_freelist.next;
852 list_del(entry);
853 list_add_tail(entry, &d->arch.shadow.p2m_inuse);
854 mfn = page_to_mfn(list_entry(entry, struct page_info, list));
855 sh_get_ref(mfn, 0);
856 p = sh_map_domain_page(mfn);
857 clear_page(p);
858 sh_unmap_domain_page(p);
860 return mfn;
861 }
863 #if CONFIG_PAGING_LEVELS == 3
864 static void p2m_install_entry_in_monitors(struct domain *d,
865 l3_pgentry_t *l3e)
866 /* Special case, only used for external-mode domains on PAE hosts:
867 * update the mapping of the p2m table. Once again, this is trivial in
868 * other paging modes (one top-level entry points to the top-level p2m,
869 * no maintenance needed), but PAE makes life difficult by needing a
870 * copy of the eight l3es of the p2m table in eight l2h slots in the
871 * monitor table. This function makes fresh copies when a p2m l3e
872 * changes. */
873 {
874 l2_pgentry_t *ml2e;
875 struct vcpu *v;
876 unsigned int index;
878 index = ((unsigned long)l3e & ~PAGE_MASK) / sizeof(l3_pgentry_t);
879 ASSERT(index < MACHPHYS_MBYTES>>1);
881 for_each_vcpu(d, v)
882 {
883 if ( pagetable_get_pfn(v->arch.monitor_table) == 0 )
884 continue;
885 ASSERT(shadow_mode_external(v->domain));
887 SHADOW_DEBUG(P2M, "d=%u v=%u index=%u mfn=%#lx\n",
888 d->domain_id, v->vcpu_id, index, l3e_get_pfn(*l3e));
890 if ( v == current ) /* OK to use linear map of monitor_table */
891 ml2e = __linear_l2_table + l2_linear_offset(RO_MPT_VIRT_START);
892 else
893 {
894 l3_pgentry_t *ml3e;
895 ml3e = sh_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
896 ASSERT(l3e_get_flags(ml3e[3]) & _PAGE_PRESENT);
897 ml2e = sh_map_domain_page(_mfn(l3e_get_pfn(ml3e[3])));
898 ml2e += l2_table_offset(RO_MPT_VIRT_START);
899 sh_unmap_domain_page(ml3e);
900 }
901 ml2e[index] = l2e_from_pfn(l3e_get_pfn(*l3e), __PAGE_HYPERVISOR);
902 if ( v != current )
903 sh_unmap_domain_page(ml2e);
904 }
905 }
906 #endif
908 // Find the next level's P2M entry, checking for out-of-range gfn's...
909 // Returns NULL on error.
910 //
911 static l1_pgentry_t *
912 p2m_find_entry(void *table, unsigned long *gfn_remainder,
913 unsigned long gfn, u32 shift, u32 max)
914 {
915 u32 index;
917 index = *gfn_remainder >> shift;
918 if ( index >= max )
919 {
920 SHADOW_DEBUG(P2M, "gfn=0x%lx out of range "
921 "(gfn_remainder=0x%lx shift=%d index=0x%x max=0x%x)\n",
922 gfn, *gfn_remainder, shift, index, max);
923 return NULL;
924 }
925 *gfn_remainder &= (1 << shift) - 1;
926 return (l1_pgentry_t *)table + index;
927 }
929 // Walk one level of the P2M table, allocating a new table if required.
930 // Returns 0 on error.
931 //
932 static int
933 p2m_next_level(struct domain *d, mfn_t *table_mfn, void **table,
934 unsigned long *gfn_remainder, unsigned long gfn, u32 shift,
935 u32 max, unsigned long type)
936 {
937 l1_pgentry_t *p2m_entry;
938 void *next;
940 if ( !(p2m_entry = p2m_find_entry(*table, gfn_remainder, gfn,
941 shift, max)) )
942 return 0;
944 if ( !(l1e_get_flags(*p2m_entry) & _PAGE_PRESENT) )
945 {
946 mfn_t mfn = shadow_alloc_p2m_page(d);
947 if ( mfn_x(mfn) == 0 )
948 return 0;
949 *p2m_entry = l1e_from_pfn(mfn_x(mfn), __PAGE_HYPERVISOR|_PAGE_USER);
950 mfn_to_page(mfn)->u.inuse.type_info = type | 1 | PGT_validated;
951 mfn_to_page(mfn)->count_info = 1;
952 #if CONFIG_PAGING_LEVELS == 3
953 if (type == PGT_l2_page_table)
954 {
955 /* We have written to the p2m l3: need to sync the per-vcpu
956 * copies of it in the monitor tables */
957 p2m_install_entry_in_monitors(d, (l3_pgentry_t *)p2m_entry);
958 }
959 #endif
960 /* The P2M can be shadowed: keep the shadows synced */
961 if ( d->vcpu[0] )
962 (void)__shadow_validate_guest_entry(d->vcpu[0], *table_mfn,
963 p2m_entry, sizeof *p2m_entry);
964 }
965 *table_mfn = _mfn(l1e_get_pfn(*p2m_entry));
966 next = sh_map_domain_page(*table_mfn);
967 sh_unmap_domain_page(*table);
968 *table = next;
970 return 1;
971 }
973 // Returns 0 on error (out of memory)
974 int
975 shadow_set_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn)
976 {
977 // XXX -- this might be able to be faster iff current->domain == d
978 mfn_t table_mfn = pagetable_get_mfn(d->arch.phys_table);
979 void *table = sh_map_domain_page(table_mfn);
980 unsigned long gfn_remainder = gfn;
981 l1_pgentry_t *p2m_entry;
983 #if CONFIG_PAGING_LEVELS >= 4
984 if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn,
985 L4_PAGETABLE_SHIFT - PAGE_SHIFT,
986 L4_PAGETABLE_ENTRIES, PGT_l3_page_table) )
987 return 0;
988 #endif
989 #if CONFIG_PAGING_LEVELS >= 3
990 // When using PAE Xen, we only allow 33 bits of pseudo-physical
991 // address in translated guests (i.e. 8 GBytes). This restriction
992 // comes from wanting to map the P2M table into the 16MB RO_MPT hole
993 // in Xen's address space for translated PV guests.
994 //
995 if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn,
996 L3_PAGETABLE_SHIFT - PAGE_SHIFT,
997 (CONFIG_PAGING_LEVELS == 3
998 ? 8
999 : L3_PAGETABLE_ENTRIES),
1000 PGT_l2_page_table) )
1001 return 0;
1002 #endif
1003 if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn,
1004 L2_PAGETABLE_SHIFT - PAGE_SHIFT,
1005 L2_PAGETABLE_ENTRIES, PGT_l1_page_table) )
1006 return 0;
1008 p2m_entry = p2m_find_entry(table, &gfn_remainder, gfn,
1009 0, L1_PAGETABLE_ENTRIES);
1010 ASSERT(p2m_entry);
1011 if ( valid_mfn(mfn) )
1012 *p2m_entry = l1e_from_pfn(mfn_x(mfn), __PAGE_HYPERVISOR|_PAGE_USER);
1013 else
1014 *p2m_entry = l1e_empty();
1016 /* The P2M can be shadowed: keep the shadows synced */
1017 (void) __shadow_validate_guest_entry(d->vcpu[0], table_mfn,
1018 p2m_entry, sizeof *p2m_entry);
1020 sh_unmap_domain_page(table);
1022 return 1;
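/* shadow_set_p2m_entry() above peels one page-table index off the gfn at
 * each level, using shifts of (Lk_PAGETABLE_SHIFT - PAGE_SHIFT) and keeping
 * the low bits as the remainder, exactly as p2m_next_level()/p2m_find_entry()
 * do.  A minimal standalone sketch of that decomposition, assuming the
 * standard x86-64 4-level values (PAGE_SHIFT 12, L2 21, L3 30, L4 39); the
 * example gfn is arbitrary: */
#include <stdio.h>

int main(void)
{
    unsigned long gfn = 0x123456UL;            /* example pseudo-physical frame */
    unsigned long remainder = gfn;
    unsigned int shifts[] = { 39 - 12, 30 - 12, 21 - 12, 0 };   /* L4, L3, L2, L1 */
    const char *level[] = { "L4", "L3", "L2", "L1" };
    unsigned int i;

    for ( i = 0; i < 4; i++ )
    {
        unsigned long index = remainder >> shifts[i];
        remainder &= (1UL << shifts[i]) - 1;   /* as in p2m_find_entry() */
        printf("%s index = %lu\n", level[i], index);
    }
    return 0;
}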
1025 // Allocate a new p2m table for a domain.
1026 //
1027 // The structure of the p2m table is that of a pagetable for xen (i.e. it is
1028 // controlled by CONFIG_PAGING_LEVELS).
1029 //
1030 // Returns 0 if p2m table could not be initialized
1031 //
1032 static int
1033 shadow_alloc_p2m_table(struct domain *d)
1035 mfn_t p2m_top;
1036 struct list_head *entry;
1037 unsigned int page_count = 0;
1039 SHADOW_PRINTK("allocating p2m table\n");
1040 ASSERT(pagetable_get_pfn(d->arch.phys_table) == 0);
1042 p2m_top = shadow_alloc_p2m_page(d);
1043 mfn_to_page(p2m_top)->count_info = 1;
1044 mfn_to_page(p2m_top)->u.inuse.type_info =
1045 #if CONFIG_PAGING_LEVELS == 4
1046 PGT_l4_page_table
1047 #elif CONFIG_PAGING_LEVELS == 3
1048 PGT_l3_page_table
1049 #elif CONFIG_PAGING_LEVELS == 2
1050 PGT_l2_page_table
1051 #endif
1052 | 1 | PGT_validated;
1054 if ( mfn_x(p2m_top) == 0 )
1055 return 0;
1057 d->arch.phys_table = pagetable_from_mfn(p2m_top);
1059 SHADOW_PRINTK("populating p2m table\n");
1061 for ( entry = d->page_list.next;
1062 entry != &d->page_list;
1063 entry = entry->next )
1065 struct page_info *page = list_entry(entry, struct page_info, list);
1066 mfn_t mfn = page_to_mfn(page);
1067 unsigned long gfn = get_gpfn_from_mfn(mfn_x(mfn));
1068 page_count++;
1069 if (
1070 #ifdef __x86_64__
1071 (gfn != 0x5555555555555555L)
1072 #else
1073 (gfn != 0x55555555L)
1074 #endif
1075 && gfn != INVALID_M2P_ENTRY
1076 && !shadow_set_p2m_entry(d, gfn, mfn) )
1078 SHADOW_PRINTK("failed to initialize p2m table, gfn=%05lx, mfn=%" SH_PRI_mfn "\n",
1079 gfn, mfn_x(mfn));
1080 return 0;
1084 SHADOW_PRINTK("p2m table initialised (%u pages)\n", page_count);
1085 return 1;
1088 mfn_t
1089 sh_gfn_to_mfn_foreign(struct domain *d, unsigned long gpfn)
1090 /* Read another domain's p2m entries */
1092 mfn_t mfn;
1093 unsigned long addr = gpfn << PAGE_SHIFT;
1094 l2_pgentry_t *l2e;
1095 l1_pgentry_t *l1e;
1097 ASSERT(shadow_mode_translate(d));
1098 mfn = pagetable_get_mfn(d->arch.phys_table);
1101 #if CONFIG_PAGING_LEVELS > 2
1102 if ( gpfn >= (RO_MPT_VIRT_END-RO_MPT_VIRT_START) / sizeof(l1_pgentry_t) )
1103 /* This pfn is higher than the p2m map can hold */
1104 return _mfn(INVALID_MFN);
1105 #endif
1108 #if CONFIG_PAGING_LEVELS >= 4
1110 l4_pgentry_t *l4e = sh_map_domain_page(mfn);
1111 l4e += l4_table_offset(addr);
1112 if ( (l4e_get_flags(*l4e) & _PAGE_PRESENT) == 0 )
1114 sh_unmap_domain_page(l4e);
1115 return _mfn(INVALID_MFN);
1117 mfn = _mfn(l4e_get_pfn(*l4e));
1118 sh_unmap_domain_page(l4e);
1120 #endif
1121 #if CONFIG_PAGING_LEVELS >= 3
1123 l3_pgentry_t *l3e = sh_map_domain_page(mfn);
1124 l3e += l3_table_offset(addr);
1125 if ( (l3e_get_flags(*l3e) & _PAGE_PRESENT) == 0 )
1127 sh_unmap_domain_page(l3e);
1128 return _mfn(INVALID_MFN);
1130 mfn = _mfn(l3e_get_pfn(*l3e));
1131 sh_unmap_domain_page(l3e);
1133 #endif
1135 l2e = sh_map_domain_page(mfn);
1136 l2e += l2_table_offset(addr);
1137 if ( (l2e_get_flags(*l2e) & _PAGE_PRESENT) == 0 )
1139 sh_unmap_domain_page(l2e);
1140 return _mfn(INVALID_MFN);
1142 mfn = _mfn(l2e_get_pfn(*l2e));
1143 sh_unmap_domain_page(l2e);
1145 l1e = sh_map_domain_page(mfn);
1146 l1e += l1_table_offset(addr);
1147 if ( (l1e_get_flags(*l1e) & _PAGE_PRESENT) == 0 )
1149 sh_unmap_domain_page(l1e);
1150 return _mfn(INVALID_MFN);
1152 mfn = _mfn(l1e_get_pfn(*l1e));
1153 sh_unmap_domain_page(l1e);
1155 return mfn;
1158 unsigned long
1159 shadow_gfn_to_mfn_foreign(unsigned long gpfn)
1161 return mfn_x(sh_gfn_to_mfn_foreign(current->domain, gpfn));
1165 static void shadow_p2m_teardown(struct domain *d)
1166 /* Return all the p2m pages to Xen.
1167 * We know we don't have any extra mappings to these pages */
1169 struct list_head *entry, *n;
1170 struct page_info *pg;
1172 d->arch.phys_table = pagetable_null();
1174 list_for_each_safe(entry, n, &d->arch.shadow.p2m_inuse)
1176 pg = list_entry(entry, struct page_info, list);
1177 list_del(entry);
1178 /* Should have just the one ref we gave it in alloc_p2m_page() */
1179 if ( (pg->count_info & PGC_SH_count_mask) != 1 )
1181 SHADOW_PRINTK("Odd p2m page count c=%#x t=%"PRtype_info"\n",
1182 pg->count_info, pg->u.inuse.type_info);
1184 ASSERT(page_get_owner(pg) == d);
1185 /* Free should not decrement domain's total allocation, since
1186 * these pages were allocated without an owner. */
1187 page_set_owner(pg, NULL);
1188 free_domheap_pages(pg, 0);
1189 d->arch.shadow.p2m_pages--;
1190 perfc_decr(shadow_alloc_count);
1192 list_for_each_safe(entry, n, &d->arch.shadow.p2m_freelist)
1194 list_del(entry);
1195 pg = list_entry(entry, struct page_info, list);
1196 ASSERT(page_get_owner(pg) == d);
1197 /* Free should not decrement domain's total allocation. */
1198 page_set_owner(pg, NULL);
1199 free_domheap_pages(pg, 0);
1200 d->arch.shadow.p2m_pages--;
1201 perfc_decr(shadow_alloc_count);
1203 ASSERT(d->arch.shadow.p2m_pages == 0);
1206 /* Set the pool of shadow pages to the required number of pages.
1207 * Input will be rounded up to at least shadow_min_acceptable_pages(),
1208 * plus space for the p2m table.
1209 * Returns 0 for success, non-zero for failure. */
1210 static unsigned int set_sh_allocation(struct domain *d,
1211 unsigned int pages,
1212 int *preempted)
1214 struct page_info *pg;
1215 unsigned int lower_bound;
1216 int j;
1218 ASSERT(shadow_lock_is_acquired(d));
1220 /* Don't allocate less than the minimum acceptable, plus one page per
1221 * megabyte of RAM (for the p2m table) */
1222 lower_bound = shadow_min_acceptable_pages(d) + (d->tot_pages / 256);
1223 if ( pages > 0 && pages < lower_bound )
1224 pages = lower_bound;
1225 /* Round up to largest block size */
1226 pages = (pages + ((1<<SHADOW_MAX_ORDER)-1)) & ~((1<<SHADOW_MAX_ORDER)-1);
1228 SHADOW_PRINTK("current %i target %i\n",
1229 d->arch.shadow.total_pages, pages);
1231 while ( d->arch.shadow.total_pages != pages )
1233 if ( d->arch.shadow.total_pages < pages )
1235 /* Need to allocate more memory from domheap */
1236 pg = alloc_domheap_pages(NULL, SHADOW_MAX_ORDER, 0);
1237 if ( pg == NULL )
1239 SHADOW_PRINTK("failed to allocate shadow pages.\n");
1240 return -ENOMEM;
1242 d->arch.shadow.free_pages += 1<<SHADOW_MAX_ORDER;
1243 d->arch.shadow.total_pages += 1<<SHADOW_MAX_ORDER;
1244 for ( j = 0; j < 1<<SHADOW_MAX_ORDER; j++ )
1246 pg[j].u.inuse.type_info = 0; /* Free page */
1247 pg[j].tlbflush_timestamp = 0; /* Not in any TLB */
1249 SH_SET_PFN_ORDER(pg, SHADOW_MAX_ORDER);
1250 list_add_tail(&pg->list,
1251 &d->arch.shadow.freelists[SHADOW_MAX_ORDER]);
1253 else if ( d->arch.shadow.total_pages > pages )
1255 /* Need to return memory to domheap */
1256 shadow_prealloc(d, SHADOW_MAX_ORDER);
1257 ASSERT(!list_empty(&d->arch.shadow.freelists[SHADOW_MAX_ORDER]));
1258 pg = list_entry(d->arch.shadow.freelists[SHADOW_MAX_ORDER].next,
1259 struct page_info, list);
1260 list_del(&pg->list);
1261 d->arch.shadow.free_pages -= 1<<SHADOW_MAX_ORDER;
1262 d->arch.shadow.total_pages -= 1<<SHADOW_MAX_ORDER;
1263 free_domheap_pages(pg, SHADOW_MAX_ORDER);
1266 /* Check to see if we need to yield and try again */
1267 if ( preempted && hypercall_preempt_check() )
1269 *preempted = 1;
1270 return 0;
1274 return 0;
1277 unsigned int shadow_set_allocation(struct domain *d,
1278 unsigned int megabytes,
1279 int *preempted)
1280 /* Hypercall interface to set the shadow memory allocation */
1282 unsigned int rv;
1283 shadow_lock(d);
1284 rv = set_sh_allocation(d, megabytes << (20 - PAGE_SHIFT), preempted);
1285 SHADOW_PRINTK("dom %u allocation now %u pages (%u MB)\n",
1286 d->domain_id,
1287 d->arch.shadow.total_pages,
1288 shadow_get_allocation(d));
1289 shadow_unlock(d);
1290 return rv;
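/* The megabytes-to-pages conversion above is just a shift: with 4kB pages
 * (PAGE_SHIFT == 12), megabytes << (20 - 12) == megabytes * 256, the same
 * 256-pages-per-MB ratio used for the p2m overhead in set_sh_allocation()'s
 * lower bound.  A minimal check of that arithmetic (32MB is an example): */
#include <stdio.h>

int main(void)
{
    unsigned int megabytes = 32;                     /* example allocation */
    unsigned int pages = megabytes << (20 - 12);     /* 4kB pages */
    printf("%u MB -> %u pages\n", megabytes, pages); /* 32 MB -> 8192 pages */
    return 0;
}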
1293 /**************************************************************************/
1294 /* Hash table for storing the guest->shadow mappings */
1296 /* Hash function that takes a gfn or mfn, plus another byte of type info */
1297 typedef u32 key_t;
1298 static inline key_t sh_hash(unsigned long n, u8 t)
1300 unsigned char *p = (unsigned char *)&n;
1301 key_t k = t;
1302 int i;
1303 for ( i = 0; i < sizeof(n) ; i++ ) k = (u32)p[i] + (k<<6) + (k<<16) - k;
1304 return k;
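/* sh_hash() above folds each byte of the frame number into a key seeded
 * with the type byte; the bucket is then key % SHADOW_HASH_BUCKETS (the
 * constant lives in private.h).  A standalone copy showing how an (n, t)
 * pair picks a bucket; TOY_HASH_BUCKETS and the example inputs are
 * illustration-only, not the real constants: */
#include <stdio.h>
#include <stdint.h>

#define TOY_HASH_BUCKETS 251u           /* assumed bucket count, for illustration */

static uint32_t toy_sh_hash(unsigned long n, uint8_t t)
{
    unsigned char *p = (unsigned char *)&n;
    uint32_t k = t;
    unsigned int i;
    for ( i = 0; i < sizeof(n); i++ )
        k = (uint32_t)p[i] + (k << 6) + (k << 16) - k;
    return k;
}

int main(void)
{
    unsigned long mfn = 0x1a2b3;        /* example frame number */
    uint8_t type = 8;                   /* example shadow type index */
    uint32_t key = toy_sh_hash(mfn, type);
    printf("key=%#x bucket=%u\n", key, key % TOY_HASH_BUCKETS);
    return 0;
}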
1307 #if SHADOW_AUDIT & (SHADOW_AUDIT_HASH|SHADOW_AUDIT_HASH_FULL)
1309 /* Before we get to the mechanism, define a pair of audit functions
1310 * that sanity-check the contents of the hash table. */
1311 static void sh_hash_audit_bucket(struct domain *d, int bucket)
1312 /* Audit one bucket of the hash table */
1314 struct shadow_hash_entry *e, *x;
1315 struct page_info *pg;
1317 if ( !(SHADOW_AUDIT_ENABLE) )
1318 return;
1320 e = &d->arch.shadow.hash_table[bucket];
1321 if ( e->t == 0 ) return; /* Bucket is empty */
1322 while ( e )
1324 /* Empty link? */
1325 BUG_ON( e->t == 0 );
1326 /* Bogus type? */
1327 BUG_ON( e->t > (PGC_SH_max_shadow >> PGC_SH_type_shift) );
1328 /* Wrong bucket? */
1329 BUG_ON( sh_hash(e->n, e->t) % SHADOW_HASH_BUCKETS != bucket );
1330 /* Duplicate entry? */
1331 for ( x = e->next; x; x = x->next )
1332 BUG_ON( x->n == e->n && x->t == e->t );
1333 /* Bogus MFN? */
1334 BUG_ON( !valid_mfn(e->smfn) );
1335 pg = mfn_to_page(e->smfn);
1336 /* Not a shadow? */
1337 BUG_ON( page_get_owner(pg) != 0 );
1338 /* Wrong kind of shadow? */
1339 BUG_ON( (pg->count_info & PGC_SH_type_mask) >> PGC_SH_type_shift
1340 != e->t );
1341 /* Bad backlink? */
1342 BUG_ON( pg->u.inuse.type_info != e->n );
1343 if ( e->t != (PGC_SH_fl1_32_shadow >> PGC_SH_type_shift)
1344 && e->t != (PGC_SH_fl1_pae_shadow >> PGC_SH_type_shift)
1345 && e->t != (PGC_SH_fl1_64_shadow >> PGC_SH_type_shift) )
1347 /* Bad shadow flags on guest page? */
1348 BUG_ON( !(mfn_to_page(_mfn(e->n))->shadow_flags & (1<<e->t)) );
1350 /* That entry was OK; on we go */
1351 e = e->next;
1355 #else
1356 #define sh_hash_audit_bucket(_d, _b)
1357 #endif /* Hashtable bucket audit */
1360 #if SHADOW_AUDIT & SHADOW_AUDIT_HASH_FULL
1362 static void sh_hash_audit(struct domain *d)
1363 /* Full audit: audit every bucket in the table */
1365 int i;
1367 if ( !(SHADOW_AUDIT_ENABLE) )
1368 return;
1370 for ( i = 0; i < SHADOW_HASH_BUCKETS; i++ )
1372 sh_hash_audit_bucket(d, i);
1376 #else
1377 #define sh_hash_audit(_d)
1378 #endif /* Hashtable bucket audit */
1380 /* Memory management interface for bucket allocation.
1381 * These ought to come out of shadow memory, but at least on 32-bit
1382 * machines we are forced to allocate them from xenheap so that we can
1383 * address them. */
1384 static struct shadow_hash_entry *sh_alloc_hash_entry(struct domain *d)
1386 struct shadow_hash_entry *extra, *x;
1387 int i;
1389 /* We need to allocate a new node. Ensure the free list is not empty.
1390 * Allocate new entries in units the same size as the original table. */
1391 if ( unlikely(d->arch.shadow.hash_freelist == NULL) )
1393 size_t sz = sizeof(void *) + (SHADOW_HASH_BUCKETS * sizeof(*x));
1394 extra = xmalloc_bytes(sz);
1396 if ( extra == NULL )
1398 /* No memory left! */
1399 SHADOW_ERROR("xmalloc() failed when allocating hash buckets.\n");
1400 domain_crash_synchronous();
1402 memset(extra, 0, sz);
1404 /* Record the allocation block so it can be correctly freed later. */
1405 *((struct shadow_hash_entry **)&extra[SHADOW_HASH_BUCKETS]) =
1406 d->arch.shadow.hash_allocations;
1407 d->arch.shadow.hash_allocations = &extra[0];
1409 /* Thread a free chain through the newly-allocated nodes. */
1410 for ( i = 0; i < (SHADOW_HASH_BUCKETS - 1); i++ )
1411 extra[i].next = &extra[i+1];
1412 extra[i].next = NULL;
1414 /* Add the new nodes to the free list. */
1415 d->arch.shadow.hash_freelist = &extra[0];
1418 /* Allocate a new node from the free list. */
1419 x = d->arch.shadow.hash_freelist;
1420 d->arch.shadow.hash_freelist = x->next;
1421 return x;
1424 static void sh_free_hash_entry(struct domain *d, struct shadow_hash_entry *e)
1426 /* Mark the bucket as empty and return it to the free list */
1427 e->t = 0;
1428 e->next = d->arch.shadow.hash_freelist;
1429 d->arch.shadow.hash_freelist = e;
1433 /* Allocate and initialise the table itself.
1434 * Returns 0 for success, 1 for error. */
1435 static int shadow_hash_alloc(struct domain *d)
1437 struct shadow_hash_entry *table;
1439 ASSERT(shadow_lock_is_acquired(d));
1440 ASSERT(!d->arch.shadow.hash_table);
1442 table = xmalloc_array(struct shadow_hash_entry, SHADOW_HASH_BUCKETS);
1443 if ( !table ) return 1;
1444 memset(table, 0,
1445 SHADOW_HASH_BUCKETS * sizeof (struct shadow_hash_entry));
1446 d->arch.shadow.hash_table = table;
1447 return 0;
1450 /* Tear down the hash table and return all memory to Xen.
1451 * This function does not care whether the table is populated. */
1452 static void shadow_hash_teardown(struct domain *d)
1454 struct shadow_hash_entry *a, *n;
1456 ASSERT(shadow_lock_is_acquired(d));
1457 ASSERT(d->arch.shadow.hash_table);
1459 /* Return the table itself */
1460 xfree(d->arch.shadow.hash_table);
1461 d->arch.shadow.hash_table = NULL;
1463 /* Return any extra allocations */
1464 a = d->arch.shadow.hash_allocations;
1465 while ( a )
1467 /* We stored a linked-list pointer at the end of each allocation */
1468 n = *((struct shadow_hash_entry **)(&a[SHADOW_HASH_BUCKETS]));
1469 xfree(a);
1470 a = n;
1472 d->arch.shadow.hash_allocations = NULL;
1473 d->arch.shadow.hash_freelist = NULL;
1477 mfn_t shadow_hash_lookup(struct vcpu *v, unsigned long n, u8 t)
1478 /* Find an entry in the hash table. Returns the MFN of the shadow,
1479 * or INVALID_MFN if it doesn't exist */
1481 struct domain *d = v->domain;
1482 struct shadow_hash_entry *p, *x, *head;
1483 key_t key;
1485 ASSERT(shadow_lock_is_acquired(d));
1486 ASSERT(d->arch.shadow.hash_table);
1487 ASSERT(t);
1489 sh_hash_audit(d);
1491 perfc_incrc(shadow_hash_lookups);
1492 key = sh_hash(n, t);
1494 x = head = &d->arch.shadow.hash_table[key % SHADOW_HASH_BUCKETS];
1495 p = NULL;
1497 sh_hash_audit_bucket(d, key % SHADOW_HASH_BUCKETS);
1499 do
1501 ASSERT(x->t || ((x == head) && (x->next == NULL)));
1503 if ( x->n == n && x->t == t )
1505 /* Pull-to-front if 'x' isn't already the head item */
1506 if ( unlikely(x != head) )
1508 if ( unlikely(d->arch.shadow.hash_walking != 0) )
1509 /* Can't reorder: someone is walking the hash chains */
1510 return x->smfn;
1511 else
1513 /* Delete 'x' from list and reinsert after head. */
1514 p->next = x->next;
1515 x->next = head->next;
1516 head->next = x;
1518 /* Swap 'x' contents with head contents. */
1519 SWAP(head->n, x->n);
1520 SWAP(head->t, x->t);
1521 SWAP(head->smfn, x->smfn);
1524 else
1526 perfc_incrc(shadow_hash_lookup_head);
1528 return head->smfn;
1531 p = x;
1532 x = x->next;
1534 while ( x != NULL );
1536 perfc_incrc(shadow_hash_lookup_miss);
1537 return _mfn(INVALID_MFN);
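/* shadow_hash_lookup() above pulls a hit to the front of its bucket chain
 * (unless someone is walking the chains).  The reordering itself can be
 * shown on a plain singly-linked list; this is a minimal sketch with
 * illustration-only names.  The real code swaps the head's contents instead
 * of relinking the head entry, because each bucket head is embedded in the
 * hash table array; this toy list has no embedded head, so a relink is
 * enough. */
#include <stdio.h>

struct node { int n; struct node *next; };

/* Find 'n'; on a hit that is not already the head, move it to the front. */
static struct node *lookup_mtf(struct node **head, int n)
{
    struct node *p = NULL, *x = *head;
    while ( x != NULL )
    {
        if ( x->n == n )
        {
            if ( p != NULL )            /* not already the head: relink */
            {
                p->next = x->next;
                x->next = *head;
                *head = x;
            }
            return x;
        }
        p = x;
        x = x->next;
    }
    return NULL;                        /* miss */
}

int main(void)
{
    struct node c = { 3, NULL }, b = { 2, &c }, a = { 1, &b };
    struct node *head = &a;
    lookup_mtf(&head, 3);
    for ( struct node *x = head; x; x = x->next )
        printf("%d ", x->n);            /* prints: 3 1 2 */
    printf("\n");
    return 0;
}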
1540 void shadow_hash_insert(struct vcpu *v, unsigned long n, u8 t, mfn_t smfn)
1541 /* Put a mapping (n,t)->smfn into the hash table */
1543 struct domain *d = v->domain;
1544 struct shadow_hash_entry *x, *head;
1545 key_t key;
1547 ASSERT(shadow_lock_is_acquired(d));
1548 ASSERT(d->arch.shadow.hash_table);
1549 ASSERT(t);
1551 sh_hash_audit(d);
1553 perfc_incrc(shadow_hash_inserts);
1554 key = sh_hash(n, t);
1556 head = &d->arch.shadow.hash_table[key % SHADOW_HASH_BUCKETS];
1558 sh_hash_audit_bucket(d, key % SHADOW_HASH_BUCKETS);
1560 /* If the bucket is empty then insert the new page as the head item. */
1561 if ( head->t == 0 )
1563 head->n = n;
1564 head->t = t;
1565 head->smfn = smfn;
1566 ASSERT(head->next == NULL);
1568 else
1570 /* Insert a new entry directly after the head item. */
1571 x = sh_alloc_hash_entry(d);
1572 x->n = n;
1573 x->t = t;
1574 x->smfn = smfn;
1575 x->next = head->next;
1576 head->next = x;
1579 sh_hash_audit_bucket(d, key % SHADOW_HASH_BUCKETS);
1582 void shadow_hash_delete(struct vcpu *v, unsigned long n, u8 t, mfn_t smfn)
1583 /* Excise the mapping (n,t)->smfn from the hash table */
1585 struct domain *d = v->domain;
1586 struct shadow_hash_entry *p, *x, *head;
1587 key_t key;
1589 ASSERT(shadow_lock_is_acquired(d));
1590 ASSERT(d->arch.shadow.hash_table);
1591 ASSERT(t);
1593 sh_hash_audit(d);
1595 perfc_incrc(shadow_hash_deletes);
1596 key = sh_hash(n, t);
1598 head = &d->arch.shadow.hash_table[key % SHADOW_HASH_BUCKETS];
1600 sh_hash_audit_bucket(d, key % SHADOW_HASH_BUCKETS);
1602 /* Match on head item? */
1603 if ( head->n == n && head->t == t )
1605 if ( (x = head->next) != NULL )
1607 /* Overwrite head with contents of following node. */
1608 head->n = x->n;
1609 head->t = x->t;
1610 head->smfn = x->smfn;
1612 /* Delete following node. */
1613 head->next = x->next;
1614 sh_free_hash_entry(d, x);
1616 else
1618 /* This bucket is now empty. Initialise the head node. */
1619 head->t = 0;
1622 else
1624 /* Not at the head; need to walk the chain */
1625 p = head;
1626 x = head->next;
1628 while(1)
1630 ASSERT(x); /* We can't have hit the end, since our target is
1631 * still in the chain somewhere... */
1632 if ( x->n == n && x->t == t )
1634 /* Delete matching node. */
1635 p->next = x->next;
1636 sh_free_hash_entry(d, x);
1637 break;
1639 p = x;
1640 x = x->next;
1644 sh_hash_audit_bucket(d, key % SHADOW_HASH_BUCKETS);
1647 typedef int (*hash_callback_t)(struct vcpu *v, mfn_t smfn, mfn_t other_mfn);
1649 static void hash_foreach(struct vcpu *v,
1650 unsigned int callback_mask,
1651 hash_callback_t callbacks[],
1652 mfn_t callback_mfn)
1653 /* Walk the hash table looking at the types of the entries and
1654 * calling the appropriate callback function for each entry.
1655 * The mask determines which shadow types we call back for, and the array
1656 * of callbacks tells us which function to call.
1657 * Any callback may return non-zero to let us skip the rest of the scan.
1659 * WARNING: Callbacks MUST NOT add or remove hash entries unless they
1660 * then return non-zero to terminate the scan. */
1662 int i, done = 0;
1663 struct domain *d = v->domain;
1664 struct shadow_hash_entry *x;
1666 /* Say we're here, to stop hash-lookups reordering the chains */
1667 ASSERT(shadow_lock_is_acquired(d));
1668 ASSERT(d->arch.shadow.hash_walking == 0);
1669 d->arch.shadow.hash_walking = 1;
1671 callback_mask &= ~1; /* Never attempt to call back on empty buckets */
1672 for ( i = 0; i < SHADOW_HASH_BUCKETS; i++ )
1674 /* WARNING: This is not safe against changes to the hash table.
1675 * The callback *must* return non-zero if it has inserted or
1676 * deleted anything from the hash (lookups are OK, though). */
1677 for ( x = &d->arch.shadow.hash_table[i]; x; x = x->next )
1679 if ( callback_mask & (1 << x->t) )
1681 ASSERT(x->t <= 15);
1682 ASSERT(callbacks[x->t] != NULL);
1683 if ( (done = callbacks[x->t](v, x->smfn, callback_mfn)) != 0 )
1684 break;
1687 if ( done ) break;
1689 d->arch.shadow.hash_walking = 0;
1693 /**************************************************************************/
1694 /* Destroy a shadow page: simple dispatcher to call the per-type destructor
1695 * which will decrement refcounts appropriately and return memory to the
1696 * free pool. */
1698 void sh_destroy_shadow(struct vcpu *v, mfn_t smfn)
1700 struct page_info *pg = mfn_to_page(smfn);
1701 u32 t = pg->count_info & PGC_SH_type_mask;
1704 SHADOW_PRINTK("smfn=%#lx\n", mfn_x(smfn));
1706 /* Double-check, if we can, that the shadowed page belongs to this
1707 * domain, (by following the back-pointer). */
1708 ASSERT(t == PGC_SH_fl1_32_shadow ||
1709 t == PGC_SH_fl1_pae_shadow ||
1710 t == PGC_SH_fl1_64_shadow ||
1711 t == PGC_SH_monitor_table ||
1712 (page_get_owner(mfn_to_page(_mfn(pg->u.inuse.type_info)))
1713 == v->domain));
1715 /* The down-shifts here are so that the switch statement is on nice
1716 * small numbers that the compiler will enjoy */
1717 switch ( t >> PGC_SH_type_shift )
1719 #if CONFIG_PAGING_LEVELS == 2
1720 case PGC_SH_l1_32_shadow >> PGC_SH_type_shift:
1721 case PGC_SH_fl1_32_shadow >> PGC_SH_type_shift:
1722 SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 2, 2)(v, smfn);
1723 break;
1724 case PGC_SH_l2_32_shadow >> PGC_SH_type_shift:
1725 SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 2, 2)(v, smfn);
1726 break;
1727 #else /* PAE or 64bit */
1728 case PGC_SH_l1_32_shadow >> PGC_SH_type_shift:
1729 case PGC_SH_fl1_32_shadow >> PGC_SH_type_shift:
1730 SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 3, 2)(v, smfn);
1731 break;
1732 case PGC_SH_l2_32_shadow >> PGC_SH_type_shift:
1733 SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 3, 2)(v, smfn);
1734 break;
1735 #endif
1737 #if CONFIG_PAGING_LEVELS >= 3
1738 case PGC_SH_l1_pae_shadow >> PGC_SH_type_shift:
1739 case PGC_SH_fl1_pae_shadow >> PGC_SH_type_shift:
1740 SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 3, 3)(v, smfn);
1741 break;
1742 case PGC_SH_l2_pae_shadow >> PGC_SH_type_shift:
1743 case PGC_SH_l2h_pae_shadow >> PGC_SH_type_shift:
1744 SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 3, 3)(v, smfn);
1745 break;
1746 case PGC_SH_l3_pae_shadow >> PGC_SH_type_shift:
1747 SHADOW_INTERNAL_NAME(sh_destroy_l3_shadow, 3, 3)(v, smfn);
1748 break;
1749 #endif
1751 #if CONFIG_PAGING_LEVELS >= 4
1752 case PGC_SH_l1_64_shadow >> PGC_SH_type_shift:
1753 case PGC_SH_fl1_64_shadow >> PGC_SH_type_shift:
1754 SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 4, 4)(v, smfn);
1755 break;
1756 case PGC_SH_l2_64_shadow >> PGC_SH_type_shift:
1757 SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 4, 4)(v, smfn);
1758 break;
1759 case PGC_SH_l3_64_shadow >> PGC_SH_type_shift:
1760 SHADOW_INTERNAL_NAME(sh_destroy_l3_shadow, 4, 4)(v, smfn);
1761 break;
1762 case PGC_SH_l4_64_shadow >> PGC_SH_type_shift:
1763 SHADOW_INTERNAL_NAME(sh_destroy_l4_shadow, 4, 4)(v, smfn);
1764 break;
1765 #endif
1766 default:
1767 SHADOW_PRINTK("tried to destroy shadow of bad type %08lx\n",
1768 (unsigned long)t);
1769 BUG();
1773 /**************************************************************************/
1774 /* Remove all writeable mappings of a guest frame from the shadow tables
1775 * Returns non-zero if we need to flush TLBs.
1776 * level and fault_addr describe how we found this to be a pagetable;
1777 * level==0 means we have some other reason for revoking write access. */
1779 int shadow_remove_write_access(struct vcpu *v, mfn_t gmfn,
1780 unsigned int level,
1781 unsigned long fault_addr)
1783 /* Dispatch table for getting per-type functions */
1784 static hash_callback_t callbacks[16] = {
1785 NULL, /* none */
1786 #if CONFIG_PAGING_LEVELS == 2
1787 SHADOW_INTERNAL_NAME(sh_remove_write_access,2,2), /* l1_32 */
1788 SHADOW_INTERNAL_NAME(sh_remove_write_access,2,2), /* fl1_32 */
1789 #else
1790 SHADOW_INTERNAL_NAME(sh_remove_write_access,3,2), /* l1_32 */
1791 SHADOW_INTERNAL_NAME(sh_remove_write_access,3,2), /* fl1_32 */
1792 #endif
1793 NULL, /* l2_32 */
1794 #if CONFIG_PAGING_LEVELS >= 3
1795 SHADOW_INTERNAL_NAME(sh_remove_write_access,3,3), /* l1_pae */
1796 SHADOW_INTERNAL_NAME(sh_remove_write_access,3,3), /* fl1_pae */
1797 #else
1798 NULL, /* l1_pae */
1799 NULL, /* fl1_pae */
1800 #endif
1801 NULL, /* l2_pae */
1802 NULL, /* l2h_pae */
1803 NULL, /* l3_pae */
1804 #if CONFIG_PAGING_LEVELS >= 4
1805 SHADOW_INTERNAL_NAME(sh_remove_write_access,4,4), /* l1_64 */
1806 SHADOW_INTERNAL_NAME(sh_remove_write_access,4,4), /* fl1_64 */
1807 #else
1808 NULL, /* l1_64 */
1809 NULL, /* fl1_64 */
1810 #endif
1811 NULL, /* l2_64 */
1812 NULL, /* l3_64 */
1813 NULL, /* l4_64 */
1814 NULL, /* p2m */
1815 NULL /* unused */
1816 };
1818 static unsigned int callback_mask =
1819 1 << (PGC_SH_l1_32_shadow >> PGC_SH_type_shift)
1820 | 1 << (PGC_SH_fl1_32_shadow >> PGC_SH_type_shift)
1821 | 1 << (PGC_SH_l1_pae_shadow >> PGC_SH_type_shift)
1822 | 1 << (PGC_SH_fl1_pae_shadow >> PGC_SH_type_shift)
1823 | 1 << (PGC_SH_l1_64_shadow >> PGC_SH_type_shift)
1824 | 1 << (PGC_SH_fl1_64_shadow >> PGC_SH_type_shift)
1826 struct page_info *pg = mfn_to_page(gmfn);
1828 ASSERT(shadow_lock_is_acquired(v->domain));
1830 /* Only remove writable mappings if we are doing shadow refcounts.
1831 * In guest refcounting, we trust Xen to already be restricting
1832 * all the writes to the guest page tables, so we do not need to
1833 * do more. */
1834 if ( !shadow_mode_refcounts(v->domain) )
1835 return 0;
1837 /* Early exit if it's already a pagetable, or otherwise not writeable */
1838 if ( sh_mfn_is_a_page_table(gmfn)
1839 || (pg->u.inuse.type_info & PGT_count_mask) == 0 )
1840 return 0;
1842 perfc_incrc(shadow_writeable);
1844 /* If this isn't a "normal" writeable page, the domain is trying to
1845 * put pagetables in special memory of some kind. We can't allow that. */
1846 if ( (pg->u.inuse.type_info & PGT_type_mask) != PGT_writable_page )
1848 SHADOW_ERROR("can't remove write access to mfn %lx, type_info is %"
1849 PRtype_info "\n",
1850 mfn_x(gmfn), mfn_to_page(gmfn)->u.inuse.type_info);
1851 domain_crash(v->domain);
1854 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
1855 if ( v == current && level != 0 )
1857 unsigned long gfn;
1858 /* Heuristic: there is likely to be only one writeable mapping,
1859 * and that mapping is likely to be in the current pagetable,
1860 * either in the guest's linear map (linux, windows) or in a
1861 * magic slot used to map high memory regions (linux HIGHPTE) */
1863 #define GUESS(_a, _h) do { \
1864 if ( v->arch.shadow.mode->guess_wrmap(v, (_a), gmfn) ) \
1865 perfc_incrc(shadow_writeable_h_ ## _h); \
1866 if ( (pg->u.inuse.type_info & PGT_count_mask) == 0 ) \
1867 return 1; \
1868 } while (0)
1871 if ( v->arch.shadow.mode->guest_levels == 2 )
1873 if ( level == 1 )
1874 /* 32bit non-PAE w2k3: linear map at 0xC0000000 */
1875 GUESS(0xC0000000UL + (fault_addr >> 10), 1);
1877 /* Linux lowmem: first 896MB is mapped 1-to-1 above 0xC0000000 */
1878 if ((gfn = sh_mfn_to_gfn(v->domain, gmfn)) < 0x38000 )
1879 GUESS(0xC0000000UL + (gfn << PAGE_SHIFT), 4);
1882 #if CONFIG_PAGING_LEVELS >= 3
1883 else if ( v->arch.shadow.mode->guest_levels == 3 )
1885 /* 32bit PAE w2k3: linear map at 0xC0000000 */
1886 switch ( level )
1888 case 1: GUESS(0xC0000000UL + (fault_addr >> 9), 2); break;
1889 case 2: GUESS(0xC0600000UL + (fault_addr >> 18), 2); break;
1892 /* Linux lowmem: first 896MB is mapped 1-to-1 above 0xC0000000 */
1893 if ((gfn = sh_mfn_to_gfn(v->domain, gmfn)) < 0x38000 )
1894 GUESS(0xC0000000UL + (gfn << PAGE_SHIFT), 4);
1896 #if CONFIG_PAGING_LEVELS >= 4
1897 else if ( v->arch.shadow.mode->guest_levels == 4 )
1899 /* 64bit w2k3: linear map at 0x0000070000000000 */
1900 switch ( level )
1902 case 1: GUESS(0x70000000000UL + (fault_addr >> 9), 3); break;
1903 case 2: GUESS(0x70380000000UL + (fault_addr >> 18), 3); break;
1904 case 3: GUESS(0x70381C00000UL + (fault_addr >> 27), 3); break;
1907 /* Linux direct map at 0xffff810000000000 */
1908 gfn = sh_mfn_to_gfn(v->domain, gmfn);
1909 GUESS(0xffff810000000000UL + (gfn << PAGE_SHIFT), 4);
1911 #endif /* CONFIG_PAGING_LEVELS >= 4 */
1912 #endif /* CONFIG_PAGING_LEVELS >= 3 */
1914 #undef GUESS
1917 #endif
1919 /* Brute-force search of all the shadows, by walking the hash */
1920 perfc_incrc(shadow_writeable_bf);
1921 hash_foreach(v, callback_mask, callbacks, gmfn);
1923 /* If that didn't catch the mapping, something is very wrong */
1924 if ( (mfn_to_page(gmfn)->u.inuse.type_info & PGT_count_mask) != 0 )
1926 SHADOW_ERROR("can't find all writeable mappings of mfn %lx: "
1927 "%lu left\n", mfn_x(gmfn),
1928 (mfn_to_page(gmfn)->u.inuse.type_info&PGT_count_mask));
1929 domain_crash(v->domain);
1932 /* We killed at least one writeable mapping, so must flush TLBs. */
1933 return 1;
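/* Illustrative sketch, not from the Xen tree: the shape of the GUESS
 * heuristic above in a self-contained userspace analogue.  A few "likely"
 * virtual addresses are tried first; after each attempt the remaining
 * writable-mapping count is re-checked so we can return early, and only
 * if the guesses miss do we fall back to a full brute-force walk.  All
 * demo_* names and the guessed addresses are invented for illustration. */

#include <stdio.h>
#include <stdbool.h>

#define DEMO_NMAPPINGS 8

struct demo_frame {
    int writable_count;                       /* writable PTEs still pointing at it */
    unsigned long mappings[DEMO_NMAPPINGS];   /* vaddrs that map it (0 = unused slot) */
};

/* Try to revoke a writable mapping at one guessed address */
static bool demo_guess_wrmap(struct demo_frame *f, unsigned long va)
{
    for ( int i = 0; i < DEMO_NMAPPINGS; i++ )
        if ( f->mappings[i] == va )
        {
            f->mappings[i] = 0;
            f->writable_count--;
            return true;
        }
    return false;
}

static void demo_remove_write_access(struct demo_frame *f, unsigned long fault_addr)
{
    /* Heuristic guesses: e.g. a linear-map slot derived from the faulting
     * address, then a fixed direct-map style address */
    unsigned long guesses[] = { 0xC0000000UL + (fault_addr >> 10),
                                0xC0000000UL + 0x1000UL };
    for ( unsigned int i = 0; i < sizeof(guesses) / sizeof(guesses[0]); i++ )
    {
        demo_guess_wrmap(f, guesses[i]);
        if ( f->writable_count == 0 )
            return;                           /* early exit: all mappings gone */
    }
    /* Brute force: walk every mapping we know about */
    for ( int i = 0; i < DEMO_NMAPPINGS; i++ )
        if ( f->mappings[i] != 0 )
        {
            f->mappings[i] = 0;
            f->writable_count--;
        }
}

int main(void)
{
    struct demo_frame f = { .writable_count = 1,
                            .mappings = { 0xC0000000UL + (0x1234000UL >> 10) } };
    demo_remove_write_access(&f, 0x1234000UL);
    printf("remaining writable mappings: %d\n", f.writable_count);
    return 0;
}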
1938 /**************************************************************************/
1939 /* Remove all mappings of a guest frame from the shadow tables.
1940 * Returns non-zero if we need to flush TLBs. */
1942 int shadow_remove_all_mappings(struct vcpu *v, mfn_t gmfn)
1944 struct page_info *page = mfn_to_page(gmfn);
1945 int expected_count;
1947 /* Dispatch table for getting per-type functions */
1948 static hash_callback_t callbacks[16] = {
1949 NULL, /* none */
1950 #if CONFIG_PAGING_LEVELS == 2
1951 SHADOW_INTERNAL_NAME(sh_remove_all_mappings,2,2), /* l1_32 */
1952 SHADOW_INTERNAL_NAME(sh_remove_all_mappings,2,2), /* fl1_32 */
1953 #else
1954 SHADOW_INTERNAL_NAME(sh_remove_all_mappings,3,2), /* l1_32 */
1955 SHADOW_INTERNAL_NAME(sh_remove_all_mappings,3,2), /* fl1_32 */
1956 #endif
1957 NULL, /* l2_32 */
1958 #if CONFIG_PAGING_LEVELS >= 3
1959 SHADOW_INTERNAL_NAME(sh_remove_all_mappings,3,3), /* l1_pae */
1960 SHADOW_INTERNAL_NAME(sh_remove_all_mappings,3,3), /* fl1_pae */
1961 #else
1962 NULL, /* l1_pae */
1963 NULL, /* fl1_pae */
1964 #endif
1965 NULL, /* l2_pae */
1966 NULL, /* l2h_pae */
1967 NULL, /* l3_pae */
1968 #if CONFIG_PAGING_LEVELS >= 4
1969 SHADOW_INTERNAL_NAME(sh_remove_all_mappings,4,4), /* l1_64 */
1970 SHADOW_INTERNAL_NAME(sh_remove_all_mappings,4,4), /* fl1_64 */
1971 #else
1972 NULL, /* l1_64 */
1973 NULL, /* fl1_64 */
1974 #endif
1975 NULL, /* l2_64 */
1976 NULL, /* l3_64 */
1977 NULL, /* l4_64 */
1978 NULL, /* p2m */
1979 NULL /* unused */
1980 };
1982 static unsigned int callback_mask =
1983 1 << (PGC_SH_l1_32_shadow >> PGC_SH_type_shift)
1984 | 1 << (PGC_SH_fl1_32_shadow >> PGC_SH_type_shift)
1985 | 1 << (PGC_SH_l1_pae_shadow >> PGC_SH_type_shift)
1986 | 1 << (PGC_SH_fl1_pae_shadow >> PGC_SH_type_shift)
1987 | 1 << (PGC_SH_l1_64_shadow >> PGC_SH_type_shift)
1988 | 1 << (PGC_SH_fl1_64_shadow >> PGC_SH_type_shift)
1991 perfc_incrc(shadow_mappings);
1992 if ( (page->count_info & PGC_count_mask) == 0 )
1993 return 0;
1995 ASSERT(shadow_lock_is_acquired(v->domain));
1997 /* XXX TODO:
1998 * Heuristics for finding the (probably) single mapping of this gmfn */
2000 /* Brute-force search of all the shadows, by walking the hash */
2001 perfc_incrc(shadow_mappings_bf);
2002 hash_foreach(v, callback_mask, callbacks, gmfn);
2004 /* If that didn't catch the mapping, something is very wrong */
2005 expected_count = (page->count_info & PGC_allocated) ? 1 : 0;
2006 if ( (page->count_info & PGC_count_mask) != expected_count )
2008 /* Don't complain if we're in HVM and there's one extra mapping:
2009 * The qemu helper process has an untyped mapping of this dom's RAM */
2010 if ( !(shadow_mode_external(v->domain)
2011 && (page->count_info & PGC_count_mask) <= 2
2012 && (page->u.inuse.type_info & PGT_count_mask) == 0) )
2014 SHADOW_ERROR("can't find all mappings of mfn %lx: "
2015 "c=%08x t=%08lx\n", mfn_x(gmfn),
2016 page->count_info, page->u.inuse.type_info);
2020 /* We killed at least one mapping, so must flush TLBs. */
2021 return 1;
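/* Illustrative sketch, not from the Xen tree: the callbacks[]/callback_mask
 * pattern used by this function and by shadow_remove_write_access() above,
 * in miniature.  A walker visits every entry of a table, checks the entry's
 * type bit against a caller-supplied mask, and calls the per-type callback
 * only for matching types.  The types and names are invented. */

#include <stdio.h>

enum demo_type { DT_NONE = 0, DT_L1 = 1, DT_L2 = 2, DT_NTYPES = 16 };

struct demo_entry { enum demo_type type; unsigned long smfn; };

typedef void (*demo_cb_t)(struct demo_entry *e, unsigned long gmfn);

static void fix_l1(struct demo_entry *e, unsigned long gmfn)
{
    printf("scan l1 shadow %#lx for mappings of gmfn %#lx\n", e->smfn, gmfn);
}

static void demo_foreach(struct demo_entry *tab, int n, unsigned int mask,
                         demo_cb_t cbs[DT_NTYPES], unsigned long gmfn)
{
    for ( int i = 0; i < n; i++ )
        if ( (mask & (1u << tab[i].type)) && cbs[tab[i].type] )
            cbs[tab[i].type](&tab[i], gmfn);
}

int main(void)
{
    struct demo_entry tab[] = { { DT_L1, 0x100 }, { DT_L2, 0x200 }, { DT_L1, 0x300 } };
    demo_cb_t cbs[DT_NTYPES] = { [DT_L1] = fix_l1 };  /* only l1 shadows hold PTEs */
    unsigned int mask = 1u << DT_L1;                  /* ...so only visit those */
    demo_foreach(tab, 3, mask, cbs, 0xabc);
    return 0;
}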
2025 /**************************************************************************/
2026 /* Remove all shadows of a guest frame from the shadow tables */
2028 static int sh_remove_shadow_via_pointer(struct vcpu *v, mfn_t smfn)
2029 /* Follow this shadow's up-pointer, if it has one, and remove the reference
2030 * found there. Returns 1 if that was the only reference to this shadow */
2032 struct page_info *pg = mfn_to_page(smfn);
2033 mfn_t pmfn;
2034 void *vaddr;
2035 int rc;
2037 ASSERT((pg->count_info & PGC_SH_type_mask) > 0);
2038 ASSERT((pg->count_info & PGC_SH_type_mask) < PGC_SH_max_shadow);
2039 ASSERT((pg->count_info & PGC_SH_type_mask) != PGC_SH_l2_32_shadow);
2040 ASSERT((pg->count_info & PGC_SH_type_mask) != PGC_SH_l3_pae_shadow);
2041 ASSERT((pg->count_info & PGC_SH_type_mask) != PGC_SH_l4_64_shadow);
2043 if (pg->up == 0) return 0;
2044 pmfn = _mfn(pg->up >> PAGE_SHIFT);
2045 ASSERT(valid_mfn(pmfn));
2046 vaddr = sh_map_domain_page(pmfn);
2047 ASSERT(vaddr);
2048 vaddr += pg->up & (PAGE_SIZE-1);
2049 ASSERT(l1e_get_pfn(*(l1_pgentry_t *)vaddr) == mfn_x(smfn));
2051 /* Is this the only reference to this shadow? */
2052 rc = ((pg->count_info & PGC_SH_count_mask) == 1) ? 1 : 0;
2054 /* Blank the offending entry */
2055 switch ((pg->count_info & PGC_SH_type_mask))
2057 case PGC_SH_l1_32_shadow:
2058 case PGC_SH_l2_32_shadow:
2059 #if CONFIG_PAGING_LEVELS == 2
2060 SHADOW_INTERNAL_NAME(sh_clear_shadow_entry,2,2)(v, vaddr, pmfn);
2061 #else
2062 SHADOW_INTERNAL_NAME(sh_clear_shadow_entry,3,2)(v, vaddr, pmfn);
2063 #endif
2064 break;
2065 #if CONFIG_PAGING_LEVELS >=3
2066 case PGC_SH_l1_pae_shadow:
2067 case PGC_SH_l2_pae_shadow:
2068 case PGC_SH_l2h_pae_shadow:
2069 case PGC_SH_l3_pae_shadow:
2070 SHADOW_INTERNAL_NAME(sh_clear_shadow_entry,3,3)(v, vaddr, pmfn);
2071 break;
2072 #if CONFIG_PAGING_LEVELS >= 4
2073 case PGC_SH_l1_64_shadow:
2074 case PGC_SH_l2_64_shadow:
2075 case PGC_SH_l3_64_shadow:
2076 case PGC_SH_l4_64_shadow:
2077 SHADOW_INTERNAL_NAME(sh_clear_shadow_entry,4,4)(v, vaddr, pmfn);
2078 break;
2079 #endif
2080 #endif
2081 default: BUG(); /* Some weird unknown shadow type */
2084 sh_unmap_domain_page(vaddr);
2085 if ( rc )
2086 perfc_incrc(shadow_up_pointer);
2087 else
2088 perfc_incrc(shadow_unshadow_bf);
2090 return rc;
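/* Illustrative sketch, not from the Xen tree: the encoding that makes the
 * up-pointer trick above work.  A single word packs the parent shadow's
 * frame number and the byte offset of the referencing entry: the low
 * PAGE_SHIFT bits hold the offset within the parent page, the remaining
 * bits hold the frame number.  The demo_* helpers are invented. */

#include <stdio.h>
#include <assert.h>

#define DEMO_PAGE_SHIFT 12
#define DEMO_PAGE_SIZE  (1UL << DEMO_PAGE_SHIFT)

static unsigned long demo_pack_up(unsigned long parent_mfn, unsigned long offset)
{
    assert(offset < DEMO_PAGE_SIZE);
    return (parent_mfn << DEMO_PAGE_SHIFT) | offset;
}

int main(void)
{
    unsigned long up = demo_pack_up(0x1a2b3, 0x7f8);
    unsigned long parent_mfn = up >> DEMO_PAGE_SHIFT;      /* which parent frame  */
    unsigned long offset     = up & (DEMO_PAGE_SIZE - 1);  /* which entry in it   */
    printf("parent mfn %#lx, entry offset %#lx\n", parent_mfn, offset);
    return 0;
}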
2093 void sh_remove_shadows(struct vcpu *v, mfn_t gmfn, int all)
2094 /* Remove the shadows of this guest page.
2095 * If all != 0, find all shadows, if necessary by walking the tables.
2096 * Otherwise, just try the (much faster) heuristics, which will remove
2097 * at most one reference to each shadow of the page. */
2099 struct page_info *pg;
2100 mfn_t smfn;
2101 u32 sh_flags;
2102 unsigned char t;
2104 /* Dispatch table for getting per-type functions: each level must
2105 * be called with the function to remove a lower-level shadow. */
2106 static hash_callback_t callbacks[16] = {
2107 NULL, /* none */
2108 NULL, /* l1_32 */
2109 NULL, /* fl1_32 */
2110 #if CONFIG_PAGING_LEVELS == 2
2111 SHADOW_INTERNAL_NAME(sh_remove_l1_shadow,2,2), /* l2_32 */
2112 #else
2113 SHADOW_INTERNAL_NAME(sh_remove_l1_shadow,3,2), /* l2_32 */
2114 #endif
2115 NULL, /* l1_pae */
2116 NULL, /* fl1_pae */
2117 #if CONFIG_PAGING_LEVELS >= 3
2118 SHADOW_INTERNAL_NAME(sh_remove_l1_shadow,3,3), /* l2_pae */
2119 SHADOW_INTERNAL_NAME(sh_remove_l1_shadow,3,3), /* l2h_pae */
2120 SHADOW_INTERNAL_NAME(sh_remove_l2_shadow,3,3), /* l3_pae */
2121 #else
2122 NULL, /* l2_pae */
2123 NULL, /* l2h_pae */
2124 NULL, /* l3_pae */
2125 #endif
2126 NULL, /* l1_64 */
2127 NULL, /* fl1_64 */
2128 #if CONFIG_PAGING_LEVELS >= 4
2129 SHADOW_INTERNAL_NAME(sh_remove_l1_shadow,4,4), /* l2_64 */
2130 SHADOW_INTERNAL_NAME(sh_remove_l2_shadow,4,4), /* l3_64 */
2131 SHADOW_INTERNAL_NAME(sh_remove_l3_shadow,4,4), /* l4_64 */
2132 #else
2133 NULL, /* l2_64 */
2134 NULL, /* l3_64 */
2135 NULL, /* l4_64 */
2136 #endif
2137 NULL, /* p2m */
2138 NULL /* unused */
2139 };
2141 /* Another lookup table, for choosing which mask to use */
2142 static unsigned int masks[16] = {
2143 0, /* none */
2144 1 << (PGC_SH_l2_32_shadow >> PGC_SH_type_shift), /* l1_32 */
2145 0, /* fl1_32 */
2146 0, /* l2_32 */
2147 ((1 << (PGC_SH_l2h_pae_shadow >> PGC_SH_type_shift))
2148 | (1 << (PGC_SH_l2_pae_shadow >> PGC_SH_type_shift))), /* l1_pae */
2149 0, /* fl1_pae */
2150 1 << (PGC_SH_l3_pae_shadow >> PGC_SH_type_shift), /* l2_pae */
2151 1 << (PGC_SH_l3_pae_shadow >> PGC_SH_type_shift), /* l2h_pae */
2152 0, /* l3_pae */
2153 1 << (PGC_SH_l2_64_shadow >> PGC_SH_type_shift), /* l1_64 */
2154 0, /* fl1_64 */
2155 1 << (PGC_SH_l3_64_shadow >> PGC_SH_type_shift), /* l2_64 */
2156 1 << (PGC_SH_l4_64_shadow >> PGC_SH_type_shift), /* l3_64 */
2157 0, /* l4_64 */
2158 0, /* p2m */
2159 0 /* unused */
2160 };
2162 ASSERT(shadow_lock_is_acquired(v->domain));
2164 pg = mfn_to_page(gmfn);
2166 /* Bail out now if the page is not shadowed */
2167 if ( (pg->count_info & PGC_page_table) == 0 )
2168 return;
2170 SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx\n",
2171 v->domain->domain_id, v->vcpu_id, mfn_x(gmfn));
2173 /* Search for this shadow in all appropriate shadows */
2174 perfc_incrc(shadow_unshadow);
2175 sh_flags = pg->shadow_flags;
2177 /* Lower-level shadows need to be excised from upper-level shadows.
2178 * This call to hash_foreach() looks dangerous but is in fact OK: each
2179 * call will remove at most one shadow, and terminate immediately when
2180 * it does remove it, so we never walk the hash after doing a deletion. */
2181 #define DO_UNSHADOW(_type) do { \
2182 t = (_type) >> PGC_SH_type_shift; \
2183 smfn = shadow_hash_lookup(v, mfn_x(gmfn), t); \
2184 if ( !sh_remove_shadow_via_pointer(v, smfn) && all ) \
2185 hash_foreach(v, masks[t], callbacks, smfn); \
2186 } while (0)
2188 /* Top-level shadows need to be unpinned */
2189 #define DO_UNPIN(_type) do { \
2190 t = (_type) >> PGC_SH_type_shift; \
2191 smfn = shadow_hash_lookup(v, mfn_x(gmfn), t); \
2192 if ( mfn_to_page(smfn)->count_info & PGC_SH_pinned ) \
2193 sh_unpin(v, smfn); \
2194 if ( (_type) == PGC_SH_l3_pae_shadow ) \
2195 SHADOW_INTERNAL_NAME(sh_unpin_all_l3_subshadows,3,3)(v, smfn); \
2196 } while (0)
2198 if ( sh_flags & SHF_L1_32 ) DO_UNSHADOW(PGC_SH_l1_32_shadow);
2199 if ( sh_flags & SHF_L2_32 ) DO_UNPIN(PGC_SH_l2_32_shadow);
2200 #if CONFIG_PAGING_LEVELS >= 3
2201 if ( sh_flags & SHF_L1_PAE ) DO_UNSHADOW(PGC_SH_l1_pae_shadow);
2202 if ( sh_flags & SHF_L2_PAE ) DO_UNSHADOW(PGC_SH_l2_pae_shadow);
2203 if ( sh_flags & SHF_L2H_PAE ) DO_UNSHADOW(PGC_SH_l2h_pae_shadow);
2204 if ( sh_flags & SHF_L3_PAE ) DO_UNPIN(PGC_SH_l3_pae_shadow);
2205 #if CONFIG_PAGING_LEVELS >= 4
2206 if ( sh_flags & SHF_L1_64 ) DO_UNSHADOW(PGC_SH_l1_64_shadow);
2207 if ( sh_flags & SHF_L2_64 ) DO_UNSHADOW(PGC_SH_l2_64_shadow);
2208 if ( sh_flags & SHF_L3_64 ) DO_UNSHADOW(PGC_SH_l3_64_shadow);
2209 if ( sh_flags & SHF_L4_64 ) DO_UNPIN(PGC_SH_l4_64_shadow);
2210 #endif
2211 #endif
2213 #undef DO_UNSHADOW
2214 #undef DO_UNPIN
2217 #if CONFIG_PAGING_LEVELS > 2
2218 /* We may have caused some PAE l3 entries to change: need to
2219 * fix up the copies of them in various places */
2220 if ( sh_flags & (SHF_L2_PAE|SHF_L2H_PAE) )
2221 sh_pae_recopy(v->domain);
2222 #endif
2224 /* If that didn't catch the shadows, something is wrong */
2225 if ( all && (pg->count_info & PGC_page_table) )
2227 SHADOW_ERROR("can't find all shadows of mfn %05lx (shadow_flags=%08x)\n",
2228 mfn_x(gmfn), pg->shadow_flags);
2229 domain_crash(v->domain);
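/* Illustrative sketch, not from the Xen tree: the shadow_flags idea in
 * miniature.  A per-page bitfield records which kinds of shadow currently
 * exist for the page, so the unshadow path above only looks up the types
 * whose bit is set instead of probing every possible type.  The DF_* flag
 * names are invented. */

#include <stdio.h>

#define DF_L1  (1u << 0)
#define DF_L2  (1u << 1)
#define DF_L3  (1u << 2)

static void demo_unshadow_type(unsigned int bit)
{
    printf("removing shadows of type bit %u\n", bit);
}

static void demo_remove_shadows(unsigned int shadow_flags)
{
    for ( unsigned int bit = 0; bit < 3; bit++ )
        if ( shadow_flags & (1u << bit) )
            demo_unshadow_type(bit);     /* only types that actually exist */
}

int main(void)
{
    demo_remove_shadows(DF_L1 | DF_L3);  /* page has l1 and l3 shadows only */
    return 0;
}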
2233 void
2234 shadow_remove_all_shadows_and_parents(struct vcpu *v, mfn_t gmfn)
2235 /* Even harsher: this is an HVM page that we think is no longer a pagetable.
2236 * Unshadow it, and recursively unshadow pages that reference it. */
2238 shadow_remove_all_shadows(v, gmfn);
2239 /* XXX TODO:
2240 * Rework this hashtable walker to return a linked-list of all
2241 * the shadows it modified, then do breadth-first recursion
2242 * to find the way up to higher-level tables and unshadow them too.
2244 * The current code (just tearing down each page's shadows as we
2245 * detect that it is not a pagetable) is correct, but very slow.
2246 * It means extra emulated writes and slows down removal of mappings. */
2249 /**************************************************************************/
2251 void sh_update_paging_modes(struct vcpu *v)
2253 struct domain *d = v->domain;
2254 struct shadow_paging_mode *old_mode = v->arch.shadow.mode;
2255 mfn_t old_guest_table;
2257 ASSERT(shadow_lock_is_acquired(d));
2259 // Valid transitions handled by this function:
2260 // - For PV guests:
2261 // - after a shadow mode has been changed
2262 // - For HVM guests:
2263 // - after a shadow mode has been changed
2264 // - changes in CR0.PG, CR4.PAE, CR4.PSE, or CR4.PGE
2265 //
2267 // Avoid determining the current shadow mode for uninitialized CPUs, as
2268 // we can not yet determine whether it is an HVM or PV domain.
2269 //
2270 if ( !test_bit(_VCPUF_initialised, &v->vcpu_flags) )
2272 printk("%s: postponing determination of shadow mode\n", __func__);
2273 return;
2276 // First, tear down any old shadow tables held by this vcpu.
2277 //
2278 shadow_detach_old_tables(v);
2280 if ( !hvm_guest(v) )
2282 ///
2283 /// PV guest
2284 ///
2285 #if CONFIG_PAGING_LEVELS == 4
2286 if ( pv_32bit_guest(v) )
2287 v->arch.shadow.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,4,3);
2288 else
2289 v->arch.shadow.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,4,4);
2290 #elif CONFIG_PAGING_LEVELS == 3
2291 v->arch.shadow.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,3,3);
2292 #elif CONFIG_PAGING_LEVELS == 2
2293 v->arch.shadow.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,2,2);
2294 #else
2295 #error unexpected paging mode
2296 #endif
2298 else
2300 ///
2301 /// HVM guest
2302 ///
2303 ASSERT(shadow_mode_translate(d));
2304 ASSERT(shadow_mode_external(d));
2306 v->arch.shadow.hvm_paging_enabled = !!hvm_paging_enabled(v);
2307 if ( !v->arch.shadow.hvm_paging_enabled )
2310 /* Set v->arch.guest_table to use the p2m map, and choose
2311 * the appropriate shadow mode */
2312 old_guest_table = pagetable_get_mfn(v->arch.guest_table);
2313 #if CONFIG_PAGING_LEVELS == 2
2314 v->arch.guest_table =
2315 pagetable_from_pfn(pagetable_get_pfn(d->arch.phys_table));
2316 v->arch.shadow.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,2,2);
2317 #elif CONFIG_PAGING_LEVELS == 3
2318 v->arch.guest_table =
2319 pagetable_from_pfn(pagetable_get_pfn(d->arch.phys_table));
2320 v->arch.shadow.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,3,3);
2321 #else /* CONFIG_PAGING_LEVELS == 4 */
2323 l4_pgentry_t *l4e;
2324 /* Use the start of the first l3 table as a PAE l3 */
2325 ASSERT(pagetable_get_pfn(d->arch.phys_table) != 0);
2326 l4e = sh_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
2327 ASSERT(l4e_get_flags(l4e[0]) & _PAGE_PRESENT);
2328 v->arch.guest_table =
2329 pagetable_from_pfn(l4e_get_pfn(l4e[0]));
2330 sh_unmap_domain_page(l4e);
2332 v->arch.shadow.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,3,3);
2333 #endif
2334 /* Fix up refcounts on guest_table */
2335 get_page(mfn_to_page(pagetable_get_mfn(v->arch.guest_table)), d);
2336 if ( mfn_x(old_guest_table) != 0 )
2337 put_page(mfn_to_page(old_guest_table));
2339 else
2341 #ifdef __x86_64__
2342 if ( hvm_long_mode_enabled(v) )
2344 // long mode guest...
2345 v->arch.shadow.mode =
2346 &SHADOW_INTERNAL_NAME(sh_paging_mode, 4, 4);
2348 else
2349 #endif
2350 if ( hvm_pae_enabled(v) )
2352 #if CONFIG_PAGING_LEVELS >= 3
2353 // 32-bit PAE mode guest...
2354 v->arch.shadow.mode =
2355 &SHADOW_INTERNAL_NAME(sh_paging_mode, 3, 3);
2356 #else
2357 SHADOW_ERROR("PAE not supported in 32-bit Xen\n");
2358 domain_crash(d);
2359 return;
2360 #endif
2362 else
2364 // 32-bit 2 level guest...
2365 #if CONFIG_PAGING_LEVELS >= 3
2366 v->arch.shadow.mode =
2367 &SHADOW_INTERNAL_NAME(sh_paging_mode, 3, 2);
2368 #else
2369 v->arch.shadow.mode =
2370 &SHADOW_INTERNAL_NAME(sh_paging_mode, 2, 2);
2371 #endif
2375 if ( pagetable_get_pfn(v->arch.monitor_table) == 0 )
2377 mfn_t mmfn = shadow_make_monitor_table(v);
2378 v->arch.monitor_table = pagetable_from_mfn(mmfn);
2379 v->arch.monitor_vtable = sh_map_domain_page(mmfn);
2382 if ( v->arch.shadow.mode != old_mode )
2384 SHADOW_PRINTK("new paging mode: d=%u v=%u g=%u s=%u "
2385 "(was g=%u s=%u)\n",
2386 d->domain_id, v->vcpu_id,
2387 v->arch.shadow.mode->guest_levels,
2388 v->arch.shadow.mode->shadow_levels,
2389 old_mode ? old_mode->guest_levels : 0,
2390 old_mode ? old_mode->shadow_levels : 0);
2391 if ( old_mode &&
2392 (v->arch.shadow.mode->shadow_levels !=
2393 old_mode->shadow_levels) )
2395 /* Need to make a new monitor table for the new mode */
2396 mfn_t new_mfn, old_mfn;
2398 if ( v != current )
2400 SHADOW_ERROR("Some third party (d=%u v=%u) is changing "
2401 "this HVM vcpu's (d=%u v=%u) paging mode!\n",
2402 current->domain->domain_id, current->vcpu_id,
2403 v->domain->domain_id, v->vcpu_id);
2404 domain_crash(v->domain);
2405 return;
2408 sh_unmap_domain_page(v->arch.monitor_vtable);
2409 old_mfn = pagetable_get_mfn(v->arch.monitor_table);
2410 v->arch.monitor_table = pagetable_null();
2411 new_mfn = v->arch.shadow.mode->make_monitor_table(v);
2412 v->arch.monitor_table = pagetable_from_mfn(new_mfn);
2413 v->arch.monitor_vtable = sh_map_domain_page(new_mfn);
2414 SHADOW_PRINTK("new monitor table %"SH_PRI_mfn "\n",
2415 mfn_x(new_mfn));
2417 /* Don't be running on the old monitor table when we
2418 * pull it down! Switch CR3, and warn the HVM code that
2419 * its host cr3 has changed. */
2420 make_cr3(v, mfn_x(new_mfn));
2421 write_ptbase(v);
2422 hvm_update_host_cr3(v);
2423 old_mode->destroy_monitor_table(v, old_mfn);
2427 // XXX -- Need to deal with changes in CR4.PSE and CR4.PGE.
2428 // These are HARD: think about the case where two CPU's have
2429 // different values for CR4.PSE and CR4.PGE at the same time.
2430 // This *does* happen, at least for CR4.PGE...
2433 v->arch.shadow.mode->update_cr3(v);
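/* Illustrative sketch, not from the Xen tree: the HVM mode-selection logic
 * above reduced to a pure function.  Long mode implies 4 guest pagetable
 * levels, PAE implies 3, plain 32-bit paging implies 2; the real code also
 * substitutes the p2m map as the guest table when paging is disabled and
 * picks 2 or 3 levels depending on how the hypervisor was built.  The
 * struct and helper names are invented. */

#include <stdio.h>
#include <stdbool.h>

struct demo_vcpu_state {
    bool paging_enabled;   /* CR0.PG   */
    bool long_mode;        /* EFER.LMA */
    bool pae;              /* CR4.PAE  */
};

static int demo_guest_levels(const struct demo_vcpu_state *s)
{
    if ( !s->paging_enabled )
        return 2;          /* simplified: run on a p2m-backed table */
    if ( s->long_mode )
        return 4;
    if ( s->pae )
        return 3;
    return 2;
}

int main(void)
{
    struct demo_vcpu_state s = { .paging_enabled = true, .long_mode = false, .pae = true };
    printf("guest pagetable levels: %d\n", demo_guest_levels(&s));
    return 0;
}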
2436 /**************************************************************************/
2437 /* Turning on and off shadow features */
2439 static void sh_new_mode(struct domain *d, u32 new_mode)
2440 /* Inform all the vcpus that the shadow mode has been changed */
2442 struct vcpu *v;
2444 ASSERT(shadow_lock_is_acquired(d));
2445 ASSERT(d != current->domain);
2446 d->arch.shadow.mode = new_mode;
2447 if ( new_mode & SHM2_translate )
2448 shadow_audit_p2m(d);
2449 for_each_vcpu(d, v)
2450 sh_update_paging_modes(v);
2453 static int shadow_enable(struct domain *d, u32 mode)
2454 /* Turn on "permanent" shadow features: external, translate, refcount.
2455 * Can only be called once on a domain, and these features cannot be
2456 * disabled.
2457 * Returns 0 for success, -errno for failure. */
2459 unsigned int old_pages;
2460 int rv = 0;
2462 mode |= SHM2_enable;
2464 domain_pause(d);
2465 shadow_lock(d);
2467 /* Sanity check the arguments */
2468 if ( (d == current->domain) ||
2469 shadow_mode_enabled(d) ||
2470 ((mode & SHM2_external) && !(mode & SHM2_translate)) )
2472 rv = -EINVAL;
2473 goto out;
2476 // XXX -- eventually would like to require that all memory be allocated
2477 // *after* shadow_enabled() is called... So here, we would test to make
2478 // sure that d->page_list is empty.
2479 #if 0
2480 spin_lock(&d->page_alloc_lock);
2481 if ( !list_empty(&d->page_list) )
2483 spin_unlock(&d->page_alloc_lock);
2484 rv = -EINVAL;
2485 goto out;
2487 spin_unlock(&d->page_alloc_lock);
2488 #endif
2490 /* Init the shadow memory allocation if the user hasn't done so */
2491 old_pages = d->arch.shadow.total_pages;
2492 if ( old_pages == 0 )
2493 if ( set_sh_allocation(d, 256, NULL) != 0 ) /* Use at least 1MB */
2495 set_sh_allocation(d, 0, NULL);
2496 rv = -ENOMEM;
2497 goto out;
2500 /* Init the hash table */
2501 if ( shadow_hash_alloc(d) != 0 )
2503 set_sh_allocation(d, old_pages, NULL);
2504 rv = -ENOMEM;
2505 goto out;
2508 /* Init the P2M table */
2509 if ( mode & SHM2_translate )
2510 if ( !shadow_alloc_p2m_table(d) )
2512 shadow_hash_teardown(d);
2513 set_sh_allocation(d, old_pages, NULL);
2514 shadow_p2m_teardown(d);
2515 rv = -ENOMEM;
2516 goto out;
2519 /* Update the bits */
2520 sh_new_mode(d, mode);
2521 shadow_audit_p2m(d);
2522 out:
2523 shadow_unlock(d);
2524 domain_unpause(d);
2525 return rv;
2528 void shadow_teardown(struct domain *d)
2529 /* Destroy the shadow pagetables of this domain and free its shadow memory.
2530 * Should only be called for dying domains. */
2532 struct vcpu *v;
2533 mfn_t mfn;
2535 ASSERT(test_bit(_DOMF_dying, &d->domain_flags));
2536 ASSERT(d != current->domain);
2538 if ( !shadow_lock_is_acquired(d) )
2539 shadow_lock(d); /* Keep various asserts happy */
2541 if ( shadow_mode_enabled(d) )
2543 /* Release the shadow and monitor tables held by each vcpu */
2544 for_each_vcpu(d, v)
2546 shadow_detach_old_tables(v);
2547 if ( shadow_mode_external(d) )
2549 mfn = pagetable_get_mfn(v->arch.monitor_table);
2550 if ( valid_mfn(mfn) && (mfn_x(mfn) != 0) )
2551 shadow_destroy_monitor_table(v, mfn);
2552 v->arch.monitor_table = pagetable_null();
2557 if ( d->arch.shadow.total_pages != 0 )
2559 SHADOW_PRINTK("teardown of domain %u starts."
2560 " Shadow pages total = %u, free = %u, p2m=%u\n",
2561 d->domain_id,
2562 d->arch.shadow.total_pages,
2563 d->arch.shadow.free_pages,
2564 d->arch.shadow.p2m_pages);
2565 /* Destroy all the shadows and release memory to domheap */
2566 set_sh_allocation(d, 0, NULL);
2567 /* Release the hash table back to xenheap */
2568 if (d->arch.shadow.hash_table)
2569 shadow_hash_teardown(d);
2570 /* Release the log-dirty bitmap of dirtied pages */
2571 sh_free_log_dirty_bitmap(d);
2572 /* Should not have any more memory held */
2573 SHADOW_PRINTK("teardown done."
2574 " Shadow pages total = %u, free = %u, p2m=%u\n",
2575 d->arch.shadow.total_pages,
2576 d->arch.shadow.free_pages,
2577 d->arch.shadow.p2m_pages);
2578 ASSERT(d->arch.shadow.total_pages == 0);
2581 /* We leave the "permanent" shadow modes enabled, but clear the
2582 * log-dirty mode bit. We don't want any more mark_dirty()
2583 * calls now that we've torn down the bitmap */
2584 d->arch.shadow.mode &= ~SHM2_log_dirty;
2586 shadow_unlock(d);
2589 void shadow_final_teardown(struct domain *d)
2590 /* Called by arch_domain_destroy(), when it's safe to pull down the p2m map. */
2593 SHADOW_PRINTK("dom %u final teardown starts."
2594 " Shadow pages total = %u, free = %u, p2m=%u\n",
2595 d->domain_id,
2596 d->arch.shadow.total_pages,
2597 d->arch.shadow.free_pages,
2598 d->arch.shadow.p2m_pages);
2600 /* Double-check that the domain didn't have any shadow memory.
2601 * It is possible for a domain that never got domain_kill()ed
2602 * to get here with its shadow allocation intact. */
2603 if ( d->arch.shadow.total_pages != 0 )
2604 shadow_teardown(d);
2606 /* It is now safe to pull down the p2m map. */
2607 if ( d->arch.shadow.p2m_pages != 0 )
2608 shadow_p2m_teardown(d);
2610 SHADOW_PRINTK("dom %u final teardown done."
2611 " Shadow pages total = %u, free = %u, p2m=%u\n",
2612 d->domain_id,
2613 d->arch.shadow.total_pages,
2614 d->arch.shadow.free_pages,
2615 d->arch.shadow.p2m_pages);
2618 static int shadow_one_bit_enable(struct domain *d, u32 mode)
2619 /* Turn on a single shadow mode feature */
2621 ASSERT(shadow_lock_is_acquired(d));
2623 /* Sanity check the call */
2624 if ( d == current->domain || (d->arch.shadow.mode & mode) )
2626 return -EINVAL;
2629 if ( d->arch.shadow.mode == 0 )
2631 /* Init the shadow memory allocation and the hash table */
2632 if ( set_sh_allocation(d, 1, NULL) != 0
2633 || shadow_hash_alloc(d) != 0 )
2635 set_sh_allocation(d, 0, NULL);
2636 return -ENOMEM;
2640 /* Update the bits */
2641 sh_new_mode(d, d->arch.shadow.mode | mode);
2643 return 0;
2646 static int shadow_one_bit_disable(struct domain *d, u32 mode)
2647 /* Turn off a single shadow mode feature */
2649 struct vcpu *v;
2650 ASSERT(shadow_lock_is_acquired(d));
2652 /* Sanity check the call */
2653 if ( d == current->domain || !(d->arch.shadow.mode & mode) )
2655 return -EINVAL;
2658 /* Update the bits */
2659 sh_new_mode(d, d->arch.shadow.mode & ~mode);
2660 if ( d->arch.shadow.mode == 0 )
2662 /* Get this domain off shadows */
2663 SHADOW_PRINTK("un-shadowing of domain %u starts."
2664 " Shadow pages total = %u, free = %u, p2m=%u\n",
2665 d->domain_id,
2666 d->arch.shadow.total_pages,
2667 d->arch.shadow.free_pages,
2668 d->arch.shadow.p2m_pages);
2669 for_each_vcpu(d, v)
2671 shadow_detach_old_tables(v);
2672 #if CONFIG_PAGING_LEVELS == 4
2673 if ( !(v->arch.flags & TF_kernel_mode) )
2674 make_cr3(v, pagetable_get_pfn(v->arch.guest_table_user));
2675 else
2676 #endif
2677 make_cr3(v, pagetable_get_pfn(v->arch.guest_table));
2681 /* Pull down the memory allocation */
2682 if ( set_sh_allocation(d, 0, NULL) != 0 )
2684 // XXX - How can this occur?
2685 // Seems like a bug to return an error now that we've
2686 // disabled the relevant shadow mode.
2687 //
2688 return -ENOMEM;
2690 shadow_hash_teardown(d);
2691 SHADOW_PRINTK("un-shadowing of domain %u done."
2692 " Shadow pages total = %u, free = %u, p2m=%u\n",
2693 d->domain_id,
2694 d->arch.shadow.total_pages,
2695 d->arch.shadow.free_pages,
2696 d->arch.shadow.p2m_pages);
2699 return 0;
2702 /* Enable/disable ops for the "test" and "log-dirty" modes */
2703 int shadow_test_enable(struct domain *d)
2705 int ret;
2707 domain_pause(d);
2708 shadow_lock(d);
2710 if ( shadow_mode_enabled(d) )
2712 SHADOW_ERROR("Don't support enabling test mode"
2713 "on already shadowed doms\n");
2714 ret = -EINVAL;
2715 goto out;
2718 ret = shadow_one_bit_enable(d, SHM2_enable);
2719 out:
2720 shadow_unlock(d);
2721 domain_unpause(d);
2723 return ret;
2726 int shadow_test_disable(struct domain *d)
2728 int ret;
2730 domain_pause(d);
2731 shadow_lock(d);
2732 ret = shadow_one_bit_disable(d, SHM2_enable);
2733 shadow_unlock(d);
2734 domain_unpause(d);
2736 return ret;
2739 static int
2740 sh_alloc_log_dirty_bitmap(struct domain *d)
2742 ASSERT(d->arch.shadow.dirty_bitmap == NULL);
2743 d->arch.shadow.dirty_bitmap_size =
2744 (d->shared_info->arch.max_pfn + (BITS_PER_LONG - 1)) &
2745 ~(BITS_PER_LONG - 1);
2746 d->arch.shadow.dirty_bitmap =
2747 xmalloc_array(unsigned long,
2748 d->arch.shadow.dirty_bitmap_size / BITS_PER_LONG);
2749 if ( d->arch.shadow.dirty_bitmap == NULL )
2751 d->arch.shadow.dirty_bitmap_size = 0;
2752 return -ENOMEM;
2754 memset(d->arch.shadow.dirty_bitmap, 0, d->arch.shadow.dirty_bitmap_size/8);
2756 return 0;
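/* Illustrative sketch, not from the Xen tree: the bitmap-sizing arithmetic
 * above as a standalone helper.  The number of tracked pages is rounded up
 * to a multiple of the word size so the bitmap can live in an array of
 * unsigned long; the helper name is invented. */

#include <stdio.h>
#include <stdlib.h>
#include <limits.h>

#define DEMO_BITS_PER_LONG (CHAR_BIT * sizeof(unsigned long))

static unsigned long *demo_alloc_dirty_bitmap(unsigned long max_pfn,
                                              unsigned long *size_bits)
{
    unsigned long bits = (max_pfn + (DEMO_BITS_PER_LONG - 1))
                         & ~(DEMO_BITS_PER_LONG - 1);
    unsigned long *bm = calloc(bits / DEMO_BITS_PER_LONG, sizeof(unsigned long));
    *size_bits = bm ? bits : 0;
    return bm;
}

int main(void)
{
    unsigned long size_bits;
    unsigned long *bm = demo_alloc_dirty_bitmap(1000, &size_bits);
    printf("bitmap of %lu bits (%lu words)\n",
           size_bits, size_bits / (unsigned long)DEMO_BITS_PER_LONG);
    free(bm);
    return 0;
}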
2759 static void
2760 sh_free_log_dirty_bitmap(struct domain *d)
2762 d->arch.shadow.dirty_bitmap_size = 0;
2763 if ( d->arch.shadow.dirty_bitmap )
2765 xfree(d->arch.shadow.dirty_bitmap);
2766 d->arch.shadow.dirty_bitmap = NULL;
2770 static int shadow_log_dirty_enable(struct domain *d)
2772 int ret;
2774 domain_pause(d);
2775 shadow_lock(d);
2777 if ( shadow_mode_log_dirty(d) )
2779 ret = -EINVAL;
2780 goto out;
2783 if ( shadow_mode_enabled(d) )
2785 SHADOW_ERROR("Don't (yet) support enabling log-dirty"
2786 "on already shadowed doms\n");
2787 ret = -EINVAL;
2788 goto out;
2791 ret = sh_alloc_log_dirty_bitmap(d);
2792 if ( ret != 0 )
2794 sh_free_log_dirty_bitmap(d);
2795 goto out;
2798 ret = shadow_one_bit_enable(d, SHM2_log_dirty);
2799 if ( ret != 0 )
2800 sh_free_log_dirty_bitmap(d);
2802 out:
2803 shadow_unlock(d);
2804 domain_unpause(d);
2805 return ret;
2808 static int shadow_log_dirty_disable(struct domain *d)
2810 int ret;
2812 domain_pause(d);
2813 shadow_lock(d);
2814 ret = shadow_one_bit_disable(d, SHM2_log_dirty);
2815 if ( !shadow_mode_log_dirty(d) )
2816 sh_free_log_dirty_bitmap(d);
2817 shadow_unlock(d);
2818 domain_unpause(d);
2820 return ret;
2823 /**************************************************************************/
2824 /* P2M map manipulations */
2826 static void
2827 sh_p2m_remove_page(struct domain *d, unsigned long gfn, unsigned long mfn)
2829 struct vcpu *v;
2831 if ( !shadow_mode_translate(d) )
2832 return;
2834 v = current;
2835 if ( v->domain != d )
2836 v = d->vcpu[0];
2839 SHADOW_DEBUG(P2M, "removing gfn=%#lx mfn=%#lx\n", gfn, mfn);
2841 ASSERT(mfn_x(sh_gfn_to_mfn(d, gfn)) == mfn);
2842 //ASSERT(sh_mfn_to_gfn(d, mfn) == gfn);
2844 shadow_remove_all_shadows_and_parents(v, _mfn(mfn));
2845 if ( shadow_remove_all_mappings(v, _mfn(mfn)) )
2846 flush_tlb_mask(d->domain_dirty_cpumask);
2847 shadow_set_p2m_entry(d, gfn, _mfn(INVALID_MFN));
2848 set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY);
2851 void
2852 shadow_guest_physmap_remove_page(struct domain *d, unsigned long gfn,
2853 unsigned long mfn)
2855 shadow_lock(d);
2856 shadow_audit_p2m(d);
2857 sh_p2m_remove_page(d, gfn, mfn);
2858 shadow_audit_p2m(d);
2859 shadow_unlock(d);
2862 void
2863 shadow_guest_physmap_add_page(struct domain *d, unsigned long gfn,
2864 unsigned long mfn)
2866 struct vcpu *v;
2867 unsigned long ogfn;
2868 mfn_t omfn;
2870 if ( !shadow_mode_translate(d) )
2871 return;
2873 v = current;
2874 if ( v->domain != d )
2875 v = d->vcpu[0];
2877 shadow_lock(d);
2878 shadow_audit_p2m(d);
2880 SHADOW_DEBUG(P2M, "adding gfn=%#lx mfn=%#lx\n", gfn, mfn);
2882 omfn = sh_gfn_to_mfn(d, gfn);
2883 if ( valid_mfn(omfn) )
2885 /* Get rid of the old mapping, especially any shadows */
2886 shadow_remove_all_shadows_and_parents(v, omfn);
2887 if ( shadow_remove_all_mappings(v, omfn) )
2888 flush_tlb_mask(d->domain_dirty_cpumask);
2889 set_gpfn_from_mfn(mfn_x(omfn), INVALID_M2P_ENTRY);
2892 ogfn = sh_mfn_to_gfn(d, _mfn(mfn));
2893 if (
2894 #ifdef __x86_64__
2895 (ogfn != 0x5555555555555555L)
2896 #else
2897 (ogfn != 0x55555555L)
2898 #endif
2899 && (ogfn != INVALID_M2P_ENTRY)
2900 && (ogfn != gfn) )
2902 /* This machine frame is already mapped at another physical address */
2903 SHADOW_DEBUG(P2M, "aliased! mfn=%#lx, old gfn=%#lx, new gfn=%#lx\n",
2904 mfn, ogfn, gfn);
2905 if ( valid_mfn(omfn = sh_gfn_to_mfn(d, ogfn)) )
2907 SHADOW_DEBUG(P2M, "old gfn=%#lx -> mfn %#lx\n",
2908 ogfn , mfn_x(omfn));
2909 if ( mfn_x(omfn) == mfn )
2910 sh_p2m_remove_page(d, ogfn, mfn);
2914 shadow_set_p2m_entry(d, gfn, _mfn(mfn));
2915 set_gpfn_from_mfn(mfn, gfn);
2916 shadow_audit_p2m(d);
2917 shadow_unlock(d);
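/* Illustrative sketch, not from the Xen tree: the two-way bookkeeping that
 * the physmap add path above maintains, with two plain arrays standing in
 * for the p2m (gfn -> mfn) and m2p (mfn -> gfn) tables.  Any stale entry in
 * either direction is cleared before the new pair is written.  DEMO_INVALID,
 * the array sizes and the helper name are invented. */

#include <stdio.h>

#define DEMO_INVALID (~0UL)
#define DEMO_NFRAMES 16

static unsigned long p2m[DEMO_NFRAMES];   /* gfn -> mfn */
static unsigned long m2p[DEMO_NFRAMES];   /* mfn -> gfn */

static void demo_physmap_add(unsigned long gfn, unsigned long mfn)
{
    unsigned long omfn = p2m[gfn];
    if ( omfn != DEMO_INVALID )
        m2p[omfn] = DEMO_INVALID;         /* gfn had an old frame: drop its m2p */

    unsigned long ogfn = m2p[mfn];
    if ( ogfn != DEMO_INVALID && ogfn != gfn )
        p2m[ogfn] = DEMO_INVALID;         /* frame was mapped at another gfn */

    p2m[gfn] = mfn;                       /* finally, set both directions */
    m2p[mfn] = gfn;
}

int main(void)
{
    for ( int i = 0; i < DEMO_NFRAMES; i++ )
        p2m[i] = m2p[i] = DEMO_INVALID;
    demo_physmap_add(3, 7);
    demo_physmap_add(4, 7);               /* re-map frame 7 at a new gfn */
    printf("p2m[3]=%#lx p2m[4]=%#lx m2p[7]=%#lx\n", p2m[3], p2m[4], m2p[7]);
    return 0;
}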
2920 /**************************************************************************/
2921 /* Log-dirty mode support */
2923 /* Convert a shadow to log-dirty mode. */
2924 void shadow_convert_to_log_dirty(struct vcpu *v, mfn_t smfn)
2926 BUG();
2930 /* Read a domain's log-dirty bitmap and stats.
2931 * If the operation is a CLEAN, clear the bitmap and stats as well. */
2932 static int shadow_log_dirty_op(
2933 struct domain *d, struct xen_domctl_shadow_op *sc)
2935 int i, rv = 0, clean = 0;
2937 domain_pause(d);
2938 shadow_lock(d);
2940 clean = (sc->op == XEN_DOMCTL_SHADOW_OP_CLEAN);
2942 SHADOW_DEBUG(LOGDIRTY, "log-dirty %s: dom %u faults=%u dirty=%u\n",
2943 (clean) ? "clean" : "peek",
2944 d->domain_id,
2945 d->arch.shadow.fault_count,
2946 d->arch.shadow.dirty_count);
2948 sc->stats.fault_count = d->arch.shadow.fault_count;
2949 sc->stats.dirty_count = d->arch.shadow.dirty_count;
2951 if ( clean )
2953 struct list_head *l, *t;
2954 struct page_info *pg;
2956 /* Need to revoke write access to the domain's pages again.
2957 * In future, we'll have a less heavy-handed approach to this,
2958 * but for now, we just unshadow everything except Xen. */
2959 list_for_each_safe(l, t, &d->arch.shadow.toplevel_shadows)
2961 pg = list_entry(l, struct page_info, list);
2962 shadow_unhook_mappings(d->vcpu[0], page_to_mfn(pg));
2965 d->arch.shadow.fault_count = 0;
2966 d->arch.shadow.dirty_count = 0;
2969 if ( guest_handle_is_null(sc->dirty_bitmap) ||
2970 (d->arch.shadow.dirty_bitmap == NULL) )
2972 rv = -EINVAL;
2973 goto out;
2976 if ( sc->pages > d->arch.shadow.dirty_bitmap_size )
2977 sc->pages = d->arch.shadow.dirty_bitmap_size;
2979 #define CHUNK (8*1024) /* Transfer and clear in 1kB chunks for L1 cache. */
2980 for ( i = 0; i < sc->pages; i += CHUNK )
2982 int bytes = ((((sc->pages - i) > CHUNK)
2983 ? CHUNK
2984 : (sc->pages - i)) + 7) / 8;
2986 if ( copy_to_guest_offset(
2987 sc->dirty_bitmap,
2988 i/(8*sizeof(unsigned long)),
2989 d->arch.shadow.dirty_bitmap + (i/(8*sizeof(unsigned long))),
2990 (bytes + sizeof(unsigned long) - 1) / sizeof(unsigned long)) )
2992 rv = -EINVAL;
2993 goto out;
2996 if ( clean )
2997 memset(d->arch.shadow.dirty_bitmap + (i/(8*sizeof(unsigned long))),
2998 0, bytes);
3000 #undef CHUNK
3002 out:
3003 shadow_unlock(d);
3004 domain_unpause(d);
3005 return rv;
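/* Illustrative sketch, not from the Xen tree: the CHUNK-ed copy loop above
 * in userspace form.  The bitmap is transferred in fixed-size pieces so
 * each pass touches a bounded amount of data, and the byte count for the
 * final, partial piece is rounded up from the number of bits left.  The
 * demo_* names are invented. */

#include <stdio.h>
#include <string.h>

#define DEMO_CHUNK (8 * 1024)                             /* bits per pass */

static void demo_copy_dirty_bitmap(unsigned char *dst, unsigned char *src,
                                   unsigned long pages, int clean)
{
    for ( unsigned long i = 0; i < pages; i += DEMO_CHUNK )
    {
        unsigned long chunk_bits = (pages - i > DEMO_CHUNK) ? DEMO_CHUNK
                                                            : (pages - i);
        unsigned long bytes = (chunk_bits + 7) / 8;       /* round up to bytes */
        memcpy(dst + i / 8, src + i / 8, bytes);
        if ( clean )
            memset(src + i / 8, 0, bytes);                /* clear after copying */
    }
}

int main(void)
{
    static unsigned char src[4096], dst[4096];
    src[0] = 0x81;                                        /* pages 0 and 7 dirty */
    demo_copy_dirty_bitmap(dst, src, 20000, 1);
    printf("dst[0]=%#x src[0]=%#x\n", dst[0], src[0]);
    return 0;
}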
3009 /* Mark a page as dirty */
3010 void sh_do_mark_dirty(struct domain *d, mfn_t gmfn)
3012 unsigned long pfn;
3014 ASSERT(shadow_lock_is_acquired(d));
3015 ASSERT(shadow_mode_log_dirty(d));
3017 if ( !valid_mfn(gmfn) )
3018 return;
3020 ASSERT(d->arch.shadow.dirty_bitmap != NULL);
3022 /* We /really/ mean PFN here, even for non-translated guests. */
3023 pfn = get_gpfn_from_mfn(mfn_x(gmfn));
3025 /*
3026 * Values with the MSB set denote MFNs that aren't really part of the
3027 * domain's pseudo-physical memory map (e.g., the shared info frame).
3028 * Nothing to do here...
3029 */
3030 if ( unlikely(!VALID_M2P(pfn)) )
3031 return;
3033 /* N.B. Can use non-atomic TAS because protected by shadow_lock. */
3034 if ( likely(pfn < d->arch.shadow.dirty_bitmap_size) )
3036 if ( !__test_and_set_bit(pfn, d->arch.shadow.dirty_bitmap) )
3038 SHADOW_DEBUG(LOGDIRTY,
3039 "marked mfn %" SH_PRI_mfn " (pfn=%lx), dom %d\n",
3040 mfn_x(gmfn), pfn, d->domain_id);
3041 d->arch.shadow.dirty_count++;
3044 else
3046 SHADOW_PRINTK("mark_dirty OOR! "
3047 "mfn=%" SH_PRI_mfn " pfn=%lx max=%x (dom %d)\n"
3048 "owner=%d c=%08x t=%" PRtype_info "\n",
3049 mfn_x(gmfn),
3050 pfn,
3051 d->arch.shadow.dirty_bitmap_size,
3052 d->domain_id,
3053 (page_get_owner(mfn_to_page(gmfn))
3054 ? page_get_owner(mfn_to_page(gmfn))->domain_id
3055 : -1),
3056 mfn_to_page(gmfn)->count_info,
3057 mfn_to_page(gmfn)->u.inuse.type_info);
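/* Illustrative sketch, not from the Xen tree: the mark-dirty fast path
 * above as a standalone helper.  The pfn is bounds-checked against the
 * bitmap size, and a non-atomic test-and-set is used on the assumption
 * that the caller holds a lock (the shadow lock in the code above), with
 * the dirty counter bumped only on the first marking.  All demo_* names
 * are invented. */

#include <stdio.h>
#include <limits.h>

#define DEMO_BPL (CHAR_BIT * sizeof(unsigned long))

static unsigned long demo_bitmap[4];
static unsigned long demo_bitmap_bits = 4 * DEMO_BPL;
static unsigned long demo_dirty_count;

static void demo_mark_dirty(unsigned long pfn)
{
    if ( pfn >= demo_bitmap_bits )
        return;                            /* out of range: nothing to do */
    unsigned long *word = &demo_bitmap[pfn / DEMO_BPL];
    unsigned long bit = 1UL << (pfn % DEMO_BPL);
    if ( !(*word & bit) )                  /* non-atomic test-and-set */
    {
        *word |= bit;
        demo_dirty_count++;                /* count first-time dirtying only */
    }
}

int main(void)
{
    demo_mark_dirty(5);
    demo_mark_dirty(5);                    /* second mark of the same pfn is a no-op */
    demo_mark_dirty(70);
    printf("dirty pages: %lu\n", demo_dirty_count);
    return 0;
}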
3062 /**************************************************************************/
3063 /* Shadow-control XEN_DOMCTL dispatcher */
3065 int shadow_domctl(struct domain *d,
3066 xen_domctl_shadow_op_t *sc,
3067 XEN_GUEST_HANDLE(xen_domctl_t) u_domctl)
3069 int rc, preempted = 0;
3071 if ( unlikely(d == current->domain) )
3073 DPRINTK("Don't try to do a shadow op on yourself!\n");
3074 return -EINVAL;
3077 switch ( sc->op )
3079 case XEN_DOMCTL_SHADOW_OP_OFF:
3080 if ( shadow_mode_log_dirty(d) )
3081 if ( (rc = shadow_log_dirty_disable(d)) != 0 )
3082 return rc;
3083 if ( d->arch.shadow.mode & SHM2_enable )
3084 if ( (rc = shadow_test_disable(d)) != 0 )
3085 return rc;
3086 return 0;
3088 case XEN_DOMCTL_SHADOW_OP_ENABLE_TEST:
3089 return shadow_test_enable(d);
3091 case XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY:
3092 return shadow_log_dirty_enable(d);
3094 case XEN_DOMCTL_SHADOW_OP_ENABLE_TRANSLATE:
3095 return shadow_enable(d, SHM2_refcounts|SHM2_translate);
3097 case XEN_DOMCTL_SHADOW_OP_CLEAN:
3098 case XEN_DOMCTL_SHADOW_OP_PEEK:
3099 return shadow_log_dirty_op(d, sc);
3101 case XEN_DOMCTL_SHADOW_OP_ENABLE:
3102 if ( sc->mode & XEN_DOMCTL_SHADOW_ENABLE_LOG_DIRTY )
3103 return shadow_log_dirty_enable(d);
3104 return shadow_enable(d, sc->mode << SHM2_shift);
3106 case XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION:
3107 sc->mb = shadow_get_allocation(d);
3108 return 0;
3110 case XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION:
3111 rc = shadow_set_allocation(d, sc->mb, &preempted);
3112 if ( preempted )
3113 /* Not finished. Set up to re-run the call. */
3114 rc = hypercall_create_continuation(
3115 __HYPERVISOR_domctl, "h", u_domctl);
3116 else
3117 /* Finished. Return the new allocation */
3118 sc->mb = shadow_get_allocation(d);
3119 return rc;
3121 default:
3122 SHADOW_ERROR("Bad shadow op %u\n", sc->op);
3123 return -EINVAL;
3128 /**************************************************************************/
3129 /* Auditing shadow tables */
3131 #if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_FULL
3133 void shadow_audit_tables(struct vcpu *v)
3135 /* Dispatch table for getting per-type functions */
3136 static hash_callback_t callbacks[16] = {
3137 NULL, /* none */
3138 #if CONFIG_PAGING_LEVELS == 2
3139 SHADOW_INTERNAL_NAME(sh_audit_l1_table,2,2), /* l1_32 */
3140 SHADOW_INTERNAL_NAME(sh_audit_fl1_table,2,2), /* fl1_32 */
3141 SHADOW_INTERNAL_NAME(sh_audit_l2_table,2,2), /* l2_32 */
3142 #else
3143 SHADOW_INTERNAL_NAME(sh_audit_l1_table,3,2), /* l1_32 */
3144 SHADOW_INTERNAL_NAME(sh_audit_fl1_table,3,2), /* fl1_32 */
3145 SHADOW_INTERNAL_NAME(sh_audit_l2_table,3,2), /* l2_32 */
3146 SHADOW_INTERNAL_NAME(sh_audit_l1_table,3,3), /* l1_pae */
3147 SHADOW_INTERNAL_NAME(sh_audit_fl1_table,3,3), /* fl1_pae */
3148 SHADOW_INTERNAL_NAME(sh_audit_l2_table,3,3), /* l2_pae */
3149 SHADOW_INTERNAL_NAME(sh_audit_l2_table,3,3), /* l2h_pae */
3150 SHADOW_INTERNAL_NAME(sh_audit_l3_table,3,3), /* l3_pae */
3151 #if CONFIG_PAGING_LEVELS >= 4
3152 SHADOW_INTERNAL_NAME(sh_audit_l1_table,4,4), /* l1_64 */
3153 SHADOW_INTERNAL_NAME(sh_audit_fl1_table,4,4), /* fl1_64 */
3154 SHADOW_INTERNAL_NAME(sh_audit_l2_table,4,4), /* l2_64 */
3155 SHADOW_INTERNAL_NAME(sh_audit_l3_table,4,4), /* l3_64 */
3156 SHADOW_INTERNAL_NAME(sh_audit_l4_table,4,4), /* l4_64 */
3157 #endif /* CONFIG_PAGING_LEVELS >= 4 */
3158 #endif /* CONFIG_PAGING_LEVELS > 2 */
3159 NULL /* All the rest */
3160 };
3161 unsigned int mask;
3163 if ( !(SHADOW_AUDIT_ENABLE) )
3164 return;
3166 if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_FULL )
3167 mask = ~1; /* Audit every table in the system */
3168 else
3170 /* Audit only the current mode's tables */
3171 switch ( v->arch.shadow.mode->guest_levels )
3173 case 2: mask = (SHF_L1_32|SHF_FL1_32|SHF_L2_32); break;
3174 case 3: mask = (SHF_L1_PAE|SHF_FL1_PAE|SHF_L2_PAE
3175 |SHF_L2H_PAE|SHF_L3_PAE); break;
3176 case 4: mask = (SHF_L1_64|SHF_FL1_64|SHF_L2_64
3177 |SHF_L3_64|SHF_L4_64); break;
3178 default: BUG();
3182 hash_foreach(v, mask, callbacks, _mfn(INVALID_MFN));
3185 #endif /* Shadow audit */
3188 /**************************************************************************/
3189 /* Auditing p2m tables */
3191 #if SHADOW_AUDIT & SHADOW_AUDIT_P2M
3193 void shadow_audit_p2m(struct domain *d)
3195 struct list_head *entry;
3196 struct page_info *page;
3197 struct domain *od;
3198 unsigned long mfn, gfn, m2pfn, lp2mfn = 0;
3199 mfn_t p2mfn;
3200 unsigned long orphans_d = 0, orphans_i = 0, mpbad = 0, pmbad = 0;
3201 int test_linear;
3203 if ( !(SHADOW_AUDIT_ENABLE) || !shadow_mode_translate(d) )
3204 return;
3206 //SHADOW_PRINTK("p2m audit starts\n");
3208 test_linear = ( (d == current->domain) && current->arch.monitor_vtable );
3209 if ( test_linear )
3210 local_flush_tlb();
3212 /* Audit part one: walk the domain's page allocation list, checking
3213 * the m2p entries. */
3214 for ( entry = d->page_list.next;
3215 entry != &d->page_list;
3216 entry = entry->next )
3218 page = list_entry(entry, struct page_info, list);
3219 mfn = mfn_x(page_to_mfn(page));
3221 // SHADOW_PRINTK("auditing guest page, mfn=%#lx\n", mfn);
3223 od = page_get_owner(page);
3225 if ( od != d )
3227 SHADOW_PRINTK("wrong owner %#lx -> %p(%u) != %p(%u)\n",
3228 mfn, od, (od?od->domain_id:-1), d, d->domain_id);
3229 continue;
3232 gfn = get_gpfn_from_mfn(mfn);
3233 if ( gfn == INVALID_M2P_ENTRY )
3235 orphans_i++;
3236 //SHADOW_PRINTK("orphaned guest page: mfn=%#lx has invalid gfn\n",
3237 // mfn);
3238 continue;
3241 if ( gfn == 0x55555555 )
3243 orphans_d++;
3244 //SHADOW_PRINTK("orphaned guest page: mfn=%#lx has debug gfn\n",
3245 // mfn);
3246 continue;
3249 p2mfn = sh_gfn_to_mfn_foreign(d, gfn);
3250 if ( mfn_x(p2mfn) != mfn )
3252 mpbad++;
3253 SHADOW_PRINTK("map mismatch mfn %#lx -> gfn %#lx -> mfn %#lx"
3254 " (-> gfn %#lx)\n",
3255 mfn, gfn, mfn_x(p2mfn),
3256 (mfn_valid(p2mfn)
3257 ? get_gpfn_from_mfn(mfn_x(p2mfn))
3258 : -1u));
3259 /* This m2p entry is stale: the domain has another frame in
3260 * this physical slot. No great disaster, but for neatness,
3261 * blow away the m2p entry. */
3262 set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY);
3265 if ( test_linear )
3267 lp2mfn = get_mfn_from_gpfn(gfn);
3268 if ( lp2mfn != mfn_x(p2mfn) )
3270 SHADOW_PRINTK("linear mismatch gfn %#lx -> mfn %#lx "
3271 "(!= mfn %#lx)\n", gfn, lp2mfn, p2mfn);
3275 // SHADOW_PRINTK("OK: mfn=%#lx, gfn=%#lx, p2mfn=%#lx, lp2mfn=%#lx\n",
3276 // mfn, gfn, p2mfn, lp2mfn);
3279 /* Audit part two: walk the domain's p2m table, checking the entries. */
3280 if ( pagetable_get_pfn(d->arch.phys_table) != 0 )
3282 l2_pgentry_t *l2e;
3283 l1_pgentry_t *l1e;
3284 int i1, i2;
3286 #if CONFIG_PAGING_LEVELS == 4
3287 l4_pgentry_t *l4e;
3288 l3_pgentry_t *l3e;
3289 int i3, i4;
3290 l4e = sh_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
3291 #elif CONFIG_PAGING_LEVELS == 3
3292 l3_pgentry_t *l3e;
3293 int i3;
3294 l3e = sh_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
3295 #else /* CONFIG_PAGING_LEVELS == 2 */
3296 l2e = sh_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
3297 #endif
3299 gfn = 0;
3300 #if CONFIG_PAGING_LEVELS >= 3
3301 #if CONFIG_PAGING_LEVELS >= 4
3302 for ( i4 = 0; i4 < L4_PAGETABLE_ENTRIES; i4++ )
3304 if ( !(l4e_get_flags(l4e[i4]) & _PAGE_PRESENT) )
3306 gfn += 1 << (L4_PAGETABLE_SHIFT - PAGE_SHIFT);
3307 continue;
3309 l3e = sh_map_domain_page(_mfn(l4e_get_pfn(l4e[i4])));
3310 #endif /* now at levels 3 or 4... */
3311 for ( i3 = 0;
3312 i3 < ((CONFIG_PAGING_LEVELS==4) ? L3_PAGETABLE_ENTRIES : 8);
3313 i3++ )
3315 if ( !(l3e_get_flags(l3e[i3]) & _PAGE_PRESENT) )
3317 gfn += 1 << (L3_PAGETABLE_SHIFT - PAGE_SHIFT);
3318 continue;
3320 l2e = sh_map_domain_page(_mfn(l3e_get_pfn(l3e[i3])));
3321 #endif /* all levels... */
3322 for ( i2 = 0; i2 < L2_PAGETABLE_ENTRIES; i2++ )
3324 if ( !(l2e_get_flags(l2e[i2]) & _PAGE_PRESENT) )
3326 gfn += 1 << (L2_PAGETABLE_SHIFT - PAGE_SHIFT);
3327 continue;
3329 l1e = sh_map_domain_page(_mfn(l2e_get_pfn(l2e[i2])));
3331 for ( i1 = 0; i1 < L1_PAGETABLE_ENTRIES; i1++, gfn++ )
3333 if ( !(l1e_get_flags(l1e[i1]) & _PAGE_PRESENT) )
3334 continue;
3335 mfn = l1e_get_pfn(l1e[i1]);
3336 ASSERT(valid_mfn(_mfn(mfn)));
3337 m2pfn = get_gpfn_from_mfn(mfn);
3338 if ( m2pfn != gfn )
3340 pmbad++;
3341 SHADOW_PRINTK("mismatch: gfn %#lx -> mfn %#lx"
3342 " -> gfn %#lx\n", gfn, mfn, m2pfn);
3343 BUG();
3346 sh_unmap_domain_page(l1e);
3348 #if CONFIG_PAGING_LEVELS >= 3
3349 sh_unmap_domain_page(l2e);
3351 #if CONFIG_PAGING_LEVELS >= 4
3352 sh_unmap_domain_page(l3e);
3354 #endif
3355 #endif
3357 #if CONFIG_PAGING_LEVELS == 4
3358 sh_unmap_domain_page(l4e);
3359 #elif CONFIG_PAGING_LEVELS == 3
3360 sh_unmap_domain_page(l3e);
3361 #else /* CONFIG_PAGING_LEVELS == 2 */
3362 sh_unmap_domain_page(l2e);
3363 #endif
3367 //SHADOW_PRINTK("p2m audit complete\n");
3368 //if ( orphans_i | orphans_d | mpbad | pmbad )
3369 // SHADOW_PRINTK("p2m audit found %lu orphans (%lu inval %lu debug)\n",
3370 // orphans_i + orphans_d, orphans_i, orphans_d,
3371 if ( mpbad | pmbad )
3372 SHADOW_PRINTK("p2m audit found %lu odd p2m, %lu bad m2p entries\n",
3373 pmbad, mpbad);
3376 #endif /* p2m audit */
3378 /*
3379 * Local variables:
3380 * mode: C
3381 * c-set-style: "BSD"
3382 * c-basic-offset: 4
3383 * indent-tabs-mode: nil
3384 * End:
3385 */