ia64/xen-unstable

view xen/arch/x86/mm/shadow/common.c @ 11831:8631433e5195

[SHADOW] Trivial whitespace fix to a couple of printf format strings.
From: Brendan Cully <brendan@cs.ubc.ca>
Signed-off-by: Keir Fraser <keir@xensource.com>
author kfraser@localhost.localdomain
date Sun Oct 15 09:21:49 2006 +0100 (2006-10-15)
parents 058f4a2a8642
children 22885e4c1275
line source
1 /******************************************************************************
2 * arch/x86/mm/shadow/common.c
3 *
4 * Shadow code that does not need to be multiply compiled.
5 * Parts of this code are Copyright (c) 2006 by XenSource Inc.
6 * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
7 * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 */
24 #include <xen/config.h>
25 #include <xen/types.h>
26 #include <xen/mm.h>
27 #include <xen/trace.h>
28 #include <xen/sched.h>
29 #include <xen/perfc.h>
30 #include <xen/irq.h>
31 #include <xen/domain_page.h>
32 #include <xen/guest_access.h>
33 #include <xen/keyhandler.h>
34 #include <asm/event.h>
35 #include <asm/page.h>
36 #include <asm/current.h>
37 #include <asm/flushtlb.h>
38 #include <asm/shadow.h>
39 #include "private.h"
41 #if SHADOW_AUDIT
42 int shadow_audit_enable = 0;
44 static void shadow_audit_key(unsigned char key)
45 {
46 shadow_audit_enable = !shadow_audit_enable;
47 printk("%s shadow_audit_enable=%d\n",
48 __func__, shadow_audit_enable);
49 }
51 static int __init shadow_audit_key_init(void)
52 {
53 register_keyhandler(
54 'O', shadow_audit_key, "toggle shadow audits");
55 return 0;
56 }
57 __initcall(shadow_audit_key_init);
58 #endif /* SHADOW_AUDIT */
60 static void sh_free_log_dirty_bitmap(struct domain *d);
62 int _shadow_mode_refcounts(struct domain *d)
63 {
64 return shadow_mode_refcounts(d);
65 }
68 /**************************************************************************/
69 /* x86 emulator support for the shadow code
70 */
72 static int
73 sh_x86_emulate_read_std(unsigned long addr,
74 unsigned long *val,
75 unsigned int bytes,
76 struct x86_emulate_ctxt *ctxt)
77 {
78 *val = 0;
79 // XXX -- this is WRONG.
80 // It entirely ignores the permissions in the page tables.
81 // In this case, that is only a user vs supervisor access check.
82 //
83 if ( hvm_copy_from_guest_virt(val, addr, bytes) == 0 )
84 {
85 #if 0
86 struct vcpu *v = current;
87 SHADOW_PRINTK("d=%u v=%u a=%#lx v=%#lx bytes=%u\n",
88 v->domain->domain_id, v->vcpu_id,
89 addr, *val, bytes);
90 #endif
91 return X86EMUL_CONTINUE;
92 }
94 /* If we got here, there was nothing mapped here, or a bad GFN
95 * was mapped here. This should never happen: we're here because
96 * of a write fault at the end of the instruction we're emulating. */
97 SHADOW_PRINTK("read failed to va %#lx\n", addr);
98 return X86EMUL_PROPAGATE_FAULT;
99 }
101 static int
102 sh_x86_emulate_write_std(unsigned long addr,
103 unsigned long val,
104 unsigned int bytes,
105 struct x86_emulate_ctxt *ctxt)
106 {
107 #if 0
108 struct vcpu *v = current;
109 SHADOW_PRINTK("d=%u v=%u a=%#lx v=%#lx bytes=%u\n",
110 v->domain->domain_id, v->vcpu_id, addr, val, bytes);
111 #endif
113 // XXX -- this is WRONG.
114 // It entirely ignores the permissions in the page tables.
115 // In this case, that includes user vs supervisor, and
116 // write access.
117 //
118 if ( hvm_copy_to_guest_virt(addr, &val, bytes) == 0 )
119 return X86EMUL_CONTINUE;
121 /* If we got here, there was nothing mapped here, or a bad GFN
122 * was mapped here. This should never happen: we're here because
123 * of a write fault at the end of the instruction we're emulating,
124 * which should be handled by sh_x86_emulate_write_emulated. */
125 SHADOW_PRINTK("write failed to va %#lx\n", addr);
126 return X86EMUL_PROPAGATE_FAULT;
127 }
129 static int
130 sh_x86_emulate_write_emulated(unsigned long addr,
131 unsigned long val,
132 unsigned int bytes,
133 struct x86_emulate_ctxt *ctxt)
134 {
135 struct vcpu *v = current;
136 #if 0
137 SHADOW_PRINTK("d=%u v=%u a=%#lx v=%#lx bytes=%u\n",
138 v->domain->domain_id, v->vcpu_id, addr, val, bytes);
139 #endif
140 return v->arch.shadow.mode->x86_emulate_write(v, addr, &val, bytes, ctxt);
141 }
143 static int
144 sh_x86_emulate_cmpxchg_emulated(unsigned long addr,
145 unsigned long old,
146 unsigned long new,
147 unsigned int bytes,
148 struct x86_emulate_ctxt *ctxt)
149 {
150 struct vcpu *v = current;
151 #if 0
152 SHADOW_PRINTK("d=%u v=%u a=%#lx o?=%#lx n:=%#lx bytes=%u\n",
153 v->domain->domain_id, v->vcpu_id, addr, old, new, bytes);
154 #endif
155 return v->arch.shadow.mode->x86_emulate_cmpxchg(v, addr, old, new,
156 bytes, ctxt);
157 }
159 static int
160 sh_x86_emulate_cmpxchg8b_emulated(unsigned long addr,
161 unsigned long old_lo,
162 unsigned long old_hi,
163 unsigned long new_lo,
164 unsigned long new_hi,
165 struct x86_emulate_ctxt *ctxt)
166 {
167 struct vcpu *v = current;
168 #if 0
169 SHADOW_PRINTK("d=%u v=%u a=%#lx o?=%#lx:%lx n:=%#lx:%lx\n",
170 v->domain->domain_id, v->vcpu_id, addr, old_hi, old_lo,
171 new_hi, new_lo, ctxt);
172 #endif
173 return v->arch.shadow.mode->x86_emulate_cmpxchg8b(v, addr, old_lo, old_hi,
174 new_lo, new_hi, ctxt);
175 }
178 struct x86_emulate_ops shadow_emulator_ops = {
179 .read_std = sh_x86_emulate_read_std,
180 .write_std = sh_x86_emulate_write_std,
181 .read_emulated = sh_x86_emulate_read_std,
182 .write_emulated = sh_x86_emulate_write_emulated,
183 .cmpxchg_emulated = sh_x86_emulate_cmpxchg_emulated,
184 .cmpxchg8b_emulated = sh_x86_emulate_cmpxchg8b_emulated,
185 };
187 /**************************************************************************/
188 /* Code for "promoting" a guest page to the point where the shadow code is
189 * willing to let it be treated as a guest page table. This generally
190 * involves making sure there are no writable mappings available to the guest
191 * for this page.
192 */
193 void shadow_promote(struct vcpu *v, mfn_t gmfn, u32 type)
194 {
195 struct page_info *page = mfn_to_page(gmfn);
197 ASSERT(valid_mfn(gmfn));
199 /* We should never try to promote a gmfn that has writeable mappings */
200 ASSERT(shadow_remove_write_access(v, gmfn, 0, 0) == 0);
202 /* Is the page already shadowed? */
203 if ( !test_and_set_bit(_PGC_page_table, &page->count_info) )
204 page->shadow_flags = 0;
206 ASSERT(!test_bit(type >> PGC_SH_type_shift, &page->shadow_flags));
207 set_bit(type >> PGC_SH_type_shift, &page->shadow_flags);
208 }
210 void shadow_demote(struct vcpu *v, mfn_t gmfn, u32 type)
211 {
212 struct page_info *page = mfn_to_page(gmfn);
214 ASSERT(test_bit(_PGC_page_table, &page->count_info));
215 ASSERT(test_bit(type >> PGC_SH_type_shift, &page->shadow_flags));
217 clear_bit(type >> PGC_SH_type_shift, &page->shadow_flags);
219 if ( (page->shadow_flags & SHF_page_type_mask) == 0 )
220 {
221 /* tlbflush timestamp field is valid again */
222 page->tlbflush_timestamp = tlbflush_current_time();
223 clear_bit(_PGC_page_table, &page->count_info);
224 }
225 }
227 /**************************************************************************/
228 /* Validate a pagetable change from the guest and update the shadows.
229 * Returns a bitmask of SHADOW_SET_* flags. */
231 int
232 __shadow_validate_guest_entry(struct vcpu *v, mfn_t gmfn,
233 void *entry, u32 size)
234 {
235 int result = 0;
236 struct page_info *page = mfn_to_page(gmfn);
238 sh_mark_dirty(v->domain, gmfn);
240 // Determine which types of shadows are affected, and update each.
241 //
242 // Always validate L1s before L2s to prevent another cpu with a linear
243 // mapping of this gmfn from seeing a walk that results from
244 // using the new L2 value and the old L1 value. (It is OK for such a
245 // guest to see a walk that uses the old L2 value with the new L1 value,
246 // as hardware could behave this way if one level of the pagewalk occurs
247 // before the store, and the next level of the pagewalk occurs after the
248 // store.
249 //
250 // Ditto for L2s before L3s, etc.
251 //
253 if ( !(page->count_info & PGC_page_table) )
254 return 0; /* Not shadowed at all */
256 #if CONFIG_PAGING_LEVELS == 2
257 if ( page->shadow_flags & SHF_L1_32 )
258 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 2, 2)
259 (v, gmfn, entry, size);
260 #else
261 if ( page->shadow_flags & SHF_L1_32 )
262 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 3, 2)
263 (v, gmfn, entry, size);
264 #endif
266 #if CONFIG_PAGING_LEVELS == 2
267 if ( page->shadow_flags & SHF_L2_32 )
268 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 2, 2)
269 (v, gmfn, entry, size);
270 #else
271 if ( page->shadow_flags & SHF_L2_32 )
272 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 3, 2)
273 (v, gmfn, entry, size);
274 #endif
276 #if CONFIG_PAGING_LEVELS >= 3
277 if ( page->shadow_flags & SHF_L1_PAE )
278 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 3, 3)
279 (v, gmfn, entry, size);
280 if ( page->shadow_flags & SHF_L2_PAE )
281 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 3, 3)
282 (v, gmfn, entry, size);
283 if ( page->shadow_flags & SHF_L2H_PAE )
284 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2he, 3, 3)
285 (v, gmfn, entry, size);
286 if ( page->shadow_flags & SHF_L3_PAE )
287 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl3e, 3, 3)
288 (v, gmfn, entry, size);
289 #else /* 32-bit non-PAE hypervisor does not support PAE guests */
290 ASSERT((page->shadow_flags & (SHF_L3_PAE|SHF_L2_PAE|SHF_L1_PAE)) == 0);
291 #endif
293 #if CONFIG_PAGING_LEVELS >= 4
294 if ( page->shadow_flags & SHF_L1_64 )
295 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 4, 4)
296 (v, gmfn, entry, size);
297 if ( page->shadow_flags & SHF_L2_64 )
298 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 4, 4)
299 (v, gmfn, entry, size);
300 if ( page->shadow_flags & SHF_L3_64 )
301 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl3e, 4, 4)
302 (v, gmfn, entry, size);
303 if ( page->shadow_flags & SHF_L4_64 )
304 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl4e, 4, 4)
305 (v, gmfn, entry, size);
306 #else /* 32-bit/PAE hypervisor does not support 64-bit guests */
307 ASSERT((page->shadow_flags
308 & (SHF_L4_64|SHF_L3_64|SHF_L2_64|SHF_L1_64)) == 0);
309 #endif
311 return result;
312 }
315 int
316 shadow_validate_guest_entry(struct vcpu *v, mfn_t gmfn, void *entry)
317 /* This is the entry point from hypercalls. It returns a bitmask of all the
318 * results of shadow_set_l*e() calls, so the caller knows to do TLB flushes. */
319 {
320 int rc;
322 ASSERT(shadow_lock_is_acquired(v->domain));
323 rc = __shadow_validate_guest_entry(v, gmfn, entry, sizeof(l1_pgentry_t));
324 shadow_audit_tables(v);
325 return rc;
326 }
328 void
329 shadow_validate_guest_pt_write(struct vcpu *v, mfn_t gmfn,
330 void *entry, u32 size)
331 /* This is the entry point for emulated writes to pagetables in HVM guests and
332 * PV translated guests.
333 */
334 {
335 struct domain *d = v->domain;
336 int rc;
338 ASSERT(shadow_lock_is_acquired(v->domain));
339 rc = __shadow_validate_guest_entry(v, gmfn, entry, size);
340 if ( rc & SHADOW_SET_FLUSH )
341 /* Need to flush TLBs to pick up shadow PT changes */
342 flush_tlb_mask(d->domain_dirty_cpumask);
343 if ( rc & SHADOW_SET_ERROR )
344 {
345 /* This page is probably not a pagetable any more: tear it out of the
346 * shadows, along with any tables that reference it */
347 shadow_remove_all_shadows_and_parents(v, gmfn);
348 }
349 }
352 /**************************************************************************/
353 /* Memory management for shadow pages. */
355 /* Meaning of the count_info field in shadow pages
356 * ----------------------------------------------
357 *
358 * A count of all references to this page from other shadow pages and
359 * guest CR3s (a.k.a. v->arch.shadow.table).
360 *
361 * The top bits hold the shadow type and the pinned bit. Top-level
362 * shadows are pinned so that they don't disappear when not in a CR3
363 * somewhere.
364 *
365 * We don't need to use get|put_page for this as the updates are all
366 * protected by the shadow lock. We can't use get|put_page for this
367 * as the size of the count on shadow pages is different from that on
368 * normal guest pages.
369 */
371 /* Meaning of the type_info field in shadow pages
372 * ----------------------------------------------
373 *
374 * type_info use depends on the shadow type (from count_info)
375 *
376 * PGC_SH_none : This page is in the shadow free pool. type_info holds
377 * the chunk order for our freelist allocator.
378 *
379 * PGC_SH_l*_shadow : This page is in use as a shadow. type_info
380 * holds the mfn of the guest page being shadowed,
381 *
382 * PGC_SH_fl1_*_shadow : This page is being used to shatter a superpage.
383 * type_info holds the gfn being shattered.
384 *
385 * PGC_SH_monitor_table : This page is part of a monitor table.
386 * type_info is not used.
387 */
389 /* Meaning of the _domain field in shadow pages
390 * --------------------------------------------
391 *
392 * In shadow pages, this field will always have its least significant bit
393 * set. This ensures that all attempts to get_page() will fail (as all
394 * valid pickled domain pointers have a zero for their least significant bit).
395 * Instead, the remaining upper bits are used to record the shadow generation
396 * counter when the shadow was created.
397 */
399 /* Meaning of the shadow_flags field
400 * ----------------------------------
401 *
402 * In guest pages that are shadowed, one bit for each kind of shadow they have.
403 *
404 * In shadow pages, will be used for holding a representation of the populated
405 * entries in this shadow (either a min/max, or a bitmap, or ...)
406 *
407 * In monitor-table pages, holds the level of the particular page (to save
408 * spilling the shadow types into an extra bit by having three types of monitor
409 * page).
410 */
412 /* Meaning of the list_head struct in shadow pages
413 * -----------------------------------------------
414 *
415 * In free shadow pages, this is used to hold the free-lists of chunks.
416 *
417 * In top-level shadow tables, this holds a linked-list of all top-level
418 * shadows (used for recovering memory and destroying shadows).
419 *
420 * In lower-level shadows, this holds the physical address of a higher-level
421 * shadow entry that holds a reference to this shadow (or zero).
422 */
424 /* Allocating shadow pages
425 * -----------------------
426 *
427 * Most shadow pages are allocated singly, but there are two cases where we
428 * need to allocate multiple pages together.
429 *
430 * 1: Shadowing 32-bit guest tables on PAE or 64-bit shadows.
431 * A 32-bit guest l1 table covers 4MB of virtuial address space,
432 * and needs to be shadowed by two PAE/64-bit l1 tables (covering 2MB
433 * of virtual address space each). Similarly, a 32-bit guest l2 table
434 * (4GB va) needs to be shadowed by four PAE/64-bit l2 tables (1GB va
435 * each). These multi-page shadows are contiguous and aligned;
436 * functions for handling offsets into them are defined in shadow.c
437 * (shadow_l1_index() etc.)
438 *
439 * 2: Shadowing PAE top-level pages. Each guest page that contains
440 * any PAE top-level pages requires two shadow pages to shadow it.
441 * They contain alternating l3 tables and pae_l3_bookkeeping structs.
442 *
443 * This table shows the allocation behaviour of the different modes:
444 *
445 * Xen paging 32b pae pae 64b 64b 64b
446 * Guest paging 32b 32b pae 32b pae 64b
447 * PV or HVM * HVM * HVM HVM *
448 * Shadow paging 32b pae pae pae pae 64b
449 *
450 * sl1 size 4k 8k 4k 8k 4k 4k
451 * sl2 size 4k 16k 4k 16k 4k 4k
452 * sl3 size - - 8k - 8k 4k
453 * sl4 size - - - - - 4k
454 *
455 * We allocate memory from xen in four-page units and break them down
456 * with a simple buddy allocator. Can't use the xen allocator to handle
457 * this as it only works for contiguous zones, and a domain's shadow
458 * pool is made of fragments.
459 *
460 * In HVM guests, the p2m table is built out of shadow pages, and we provide
461 * a function for the p2m management to steal pages, in max-order chunks, from
462 * the free pool. We don't provide for giving them back, yet.
463 */
465 /* Figure out the least acceptable quantity of shadow memory.
466 * The minimum memory requirement for always being able to free up a
467 * chunk of memory is very small -- only three max-order chunks per
468 * vcpu to hold the top level shadows and pages with Xen mappings in them.
469 *
470 * But for a guest to be guaranteed to successfully execute a single
471 * instruction, we must be able to map a large number (about thirty) VAs
472 * at the same time, which means that to guarantee progress, we must
473 * allow for more than ninety allocated pages per vcpu. We round that
474 * up to 128 pages, or half a megabyte per vcpu. */
475 unsigned int shadow_min_acceptable_pages(struct domain *d)
476 {
477 u32 vcpu_count = 0;
478 struct vcpu *v;
480 for_each_vcpu(d, v)
481 vcpu_count++;
483 return (vcpu_count * 128);
484 }
486 /* Using the type_info field to store freelist order */
487 #define SH_PFN_ORDER(_p) ((_p)->u.inuse.type_info)
488 #define SH_SET_PFN_ORDER(_p, _o) \
489 do { (_p)->u.inuse.type_info = (_o); } while (0)
492 /* Figure out the order of allocation needed for a given shadow type */
493 static inline u32
494 shadow_order(u32 shadow_type)
495 {
496 #if CONFIG_PAGING_LEVELS > 2
497 static const u32 type_to_order[16] = {
498 0, /* PGC_SH_none */
499 1, /* PGC_SH_l1_32_shadow */
500 1, /* PGC_SH_fl1_32_shadow */
501 2, /* PGC_SH_l2_32_shadow */
502 0, /* PGC_SH_l1_pae_shadow */
503 0, /* PGC_SH_fl1_pae_shadow */
504 0, /* PGC_SH_l2_pae_shadow */
505 0, /* PGC_SH_l2h_pae_shadow */
506 1, /* PGC_SH_l3_pae_shadow */
507 0, /* PGC_SH_l1_64_shadow */
508 0, /* PGC_SH_fl1_64_shadow */
509 0, /* PGC_SH_l2_64_shadow */
510 0, /* PGC_SH_l3_64_shadow */
511 0, /* PGC_SH_l4_64_shadow */
512 2, /* PGC_SH_p2m_table */
513 0 /* PGC_SH_monitor_table */
514 };
515 u32 type = (shadow_type & PGC_SH_type_mask) >> PGC_SH_type_shift;
516 return type_to_order[type];
517 #else /* 32-bit Xen only ever shadows 32-bit guests on 32-bit shadows. */
518 return 0;
519 #endif
520 }
523 /* Do we have a free chunk of at least this order? */
524 static inline int chunk_is_available(struct domain *d, int order)
525 {
526 int i;
528 for ( i = order; i <= SHADOW_MAX_ORDER; i++ )
529 if ( !list_empty(&d->arch.shadow.freelists[i]) )
530 return 1;
531 return 0;
532 }
534 /* Dispatcher function: call the per-mode function that will unhook the
535 * non-Xen mappings in this top-level shadow mfn */
536 void shadow_unhook_mappings(struct vcpu *v, mfn_t smfn)
537 {
538 struct page_info *pg = mfn_to_page(smfn);
539 switch ( (pg->count_info & PGC_SH_type_mask) >> PGC_SH_type_shift )
540 {
541 case PGC_SH_l2_32_shadow >> PGC_SH_type_shift:
542 #if CONFIG_PAGING_LEVELS == 2
543 SHADOW_INTERNAL_NAME(sh_unhook_32b_mappings,2,2)(v,smfn);
544 #else
545 SHADOW_INTERNAL_NAME(sh_unhook_32b_mappings,3,2)(v,smfn);
546 #endif
547 break;
548 #if CONFIG_PAGING_LEVELS >= 3
549 case PGC_SH_l3_pae_shadow >> PGC_SH_type_shift:
550 SHADOW_INTERNAL_NAME(sh_unhook_pae_mappings,3,3)(v,smfn);
551 break;
552 #endif
553 #if CONFIG_PAGING_LEVELS >= 4
554 case PGC_SH_l4_64_shadow >> PGC_SH_type_shift:
555 SHADOW_INTERNAL_NAME(sh_unhook_64b_mappings,4,4)(v,smfn);
556 break;
557 #endif
558 default:
559 SHADOW_PRINTK("top-level shadow has bad type %08lx\n",
560 (unsigned long)((pg->count_info & PGC_SH_type_mask)
561 >> PGC_SH_type_shift));
562 BUG();
563 }
564 }
567 /* Make sure there is at least one chunk of the required order available
568 * in the shadow page pool. This must be called before any calls to
569 * shadow_alloc(). Since this will free existing shadows to make room,
570 * it must be called early enough to avoid freeing shadows that the
571 * caller is currently working on. */
572 void shadow_prealloc(struct domain *d, unsigned int order)
573 {
574 /* Need a vpcu for calling unpins; for now, since we don't have
575 * per-vcpu shadows, any will do */
576 struct vcpu *v = d->vcpu[0];
577 struct list_head *l, *t;
578 struct page_info *pg;
579 mfn_t smfn;
581 if ( chunk_is_available(d, order) ) return;
583 /* Stage one: walk the list of top-level pages, unpinning them */
584 perfc_incrc(shadow_prealloc_1);
585 list_for_each_backwards_safe(l, t, &d->arch.shadow.toplevel_shadows)
586 {
587 pg = list_entry(l, struct page_info, list);
588 smfn = page_to_mfn(pg);
590 #if CONFIG_PAGING_LEVELS >= 3
591 if ( (pg->count_info & PGC_SH_type_mask) == PGC_SH_l3_pae_shadow )
592 {
593 /* For PAE, we need to unpin each subshadow on this shadow */
594 SHADOW_INTERNAL_NAME(sh_unpin_all_l3_subshadows,3,3)(v, smfn);
595 }
596 else
597 #endif /* 32-bit code always takes this branch */
598 {
599 /* Unpin this top-level shadow */
600 sh_unpin(v, smfn);
601 }
603 /* See if that freed up a chunk of appropriate size */
604 if ( chunk_is_available(d, order) ) return;
605 }
607 /* Stage two: all shadow pages are in use in hierarchies that are
608 * loaded in cr3 on some vcpu. Walk them, unhooking the non-Xen
609 * mappings. */
610 perfc_incrc(shadow_prealloc_2);
611 v = current;
612 if ( v->domain != d )
613 v = d->vcpu[0];
614 /* Walk the list from the tail: recently used toplevels have been pulled
615 * to the head */
616 list_for_each_backwards_safe(l, t, &d->arch.shadow.toplevel_shadows)
617 {
618 pg = list_entry(l, struct page_info, list);
619 smfn = page_to_mfn(pg);
620 shadow_unhook_mappings(v, smfn);
622 /* Need to flush TLB if we've altered our own tables */
623 if ( !shadow_mode_external(d)
624 && pagetable_get_pfn(current->arch.shadow_table) == mfn_x(smfn) )
625 local_flush_tlb();
627 /* See if that freed up a chunk of appropriate size */
628 if ( chunk_is_available(d, order) ) return;
629 }
631 /* Nothing more we can do: all remaining shadows are of pages that
632 * hold Xen mappings for some vcpu. This can never happen. */
633 SHADOW_PRINTK("Can't pre-allocate %i shadow pages!\n"
634 " shadow pages total = %u, free = %u, p2m=%u\n",
635 1 << order,
636 d->arch.shadow.total_pages,
637 d->arch.shadow.free_pages,
638 d->arch.shadow.p2m_pages);
639 BUG();
640 }
643 /* Allocate another shadow's worth of (contiguous, aligned) pages,
644 * and fill in the type and backpointer fields of their page_infos.
645 * Never fails to allocate. */
646 mfn_t shadow_alloc(struct domain *d,
647 u32 shadow_type,
648 unsigned long backpointer)
649 {
650 struct page_info *pg = NULL;
651 unsigned int order = shadow_order(shadow_type);
652 cpumask_t mask;
653 void *p;
654 int i;
656 ASSERT(shadow_lock_is_acquired(d));
657 ASSERT(order <= SHADOW_MAX_ORDER);
658 ASSERT(shadow_type != PGC_SH_none);
659 perfc_incrc(shadow_alloc);
661 /* Find smallest order which can satisfy the request. */
662 for ( i = order; i <= SHADOW_MAX_ORDER; i++ )
663 if ( !list_empty(&d->arch.shadow.freelists[i]) )
664 {
665 pg = list_entry(d->arch.shadow.freelists[i].next,
666 struct page_info, list);
667 list_del(&pg->list);
669 /* We may have to halve the chunk a number of times. */
670 while ( i != order )
671 {
672 i--;
673 SH_SET_PFN_ORDER(pg, i);
674 list_add_tail(&pg->list, &d->arch.shadow.freelists[i]);
675 pg += 1 << i;
676 }
677 d->arch.shadow.free_pages -= 1 << order;
679 /* Init page info fields and clear the pages */
680 for ( i = 0; i < 1<<order ; i++ )
681 {
682 pg[i].u.inuse.type_info = backpointer;
683 pg[i].count_info = shadow_type;
684 pg[i].shadow_flags = 0;
685 INIT_LIST_HEAD(&pg[i].list);
686 /* Before we overwrite the old contents of this page,
687 * we need to be sure that no TLB holds a pointer to it. */
688 mask = d->domain_dirty_cpumask;
689 tlbflush_filter(mask, pg[i].tlbflush_timestamp);
690 if ( unlikely(!cpus_empty(mask)) )
691 {
692 perfc_incrc(shadow_alloc_tlbflush);
693 flush_tlb_mask(mask);
694 }
695 /* Now safe to clear the page for reuse */
696 p = sh_map_domain_page(page_to_mfn(pg+i));
697 ASSERT(p != NULL);
698 clear_page(p);
699 sh_unmap_domain_page(p);
700 perfc_incr(shadow_alloc_count);
701 }
702 return page_to_mfn(pg);
703 }
705 /* If we get here, we failed to allocate. This should never happen.
706 * It means that we didn't call shadow_prealloc() correctly before
707 * we allocated. We can't recover by calling prealloc here, because
708 * we might free up higher-level pages that the caller is working on. */
709 SHADOW_PRINTK("Can't allocate %i shadow pages!\n", 1 << order);
710 BUG();
711 }
714 /* Return some shadow pages to the pool. */
715 void shadow_free(struct domain *d, mfn_t smfn)
716 {
717 struct page_info *pg = mfn_to_page(smfn);
718 u32 shadow_type;
719 unsigned long order;
720 unsigned long mask;
721 int i;
723 ASSERT(shadow_lock_is_acquired(d));
724 perfc_incrc(shadow_free);
726 shadow_type = pg->count_info & PGC_SH_type_mask;
727 ASSERT(shadow_type != PGC_SH_none);
728 ASSERT(shadow_type != PGC_SH_p2m_table);
729 order = shadow_order(shadow_type);
731 d->arch.shadow.free_pages += 1 << order;
733 for ( i = 0; i < 1<<order; i++ )
734 {
735 /* Strip out the type: this is now a free shadow page */
736 pg[i].count_info = 0;
737 /* Remember the TLB timestamp so we will know whether to flush
738 * TLBs when we reuse the page. Because the destructors leave the
739 * contents of the pages in place, we can delay TLB flushes until
740 * just before the allocator hands the page out again. */
741 pg[i].tlbflush_timestamp = tlbflush_current_time();
742 perfc_decr(shadow_alloc_count);
743 }
745 /* Merge chunks as far as possible. */
746 while ( order < SHADOW_MAX_ORDER )
747 {
748 mask = 1 << order;
749 if ( (mfn_x(page_to_mfn(pg)) & mask) ) {
750 /* Merge with predecessor block? */
751 if ( (((pg-mask)->count_info & PGC_SH_type_mask) != PGT_none)
752 || (SH_PFN_ORDER(pg-mask) != order) )
753 break;
754 list_del(&(pg-mask)->list);
755 pg -= mask;
756 } else {
757 /* Merge with successor block? */
758 if ( (((pg+mask)->count_info & PGC_SH_type_mask) != PGT_none)
759 || (SH_PFN_ORDER(pg+mask) != order) )
760 break;
761 list_del(&(pg+mask)->list);
762 }
763 order++;
764 }
766 SH_SET_PFN_ORDER(pg, order);
767 list_add_tail(&pg->list, &d->arch.shadow.freelists[order]);
768 }
770 /* Divert some memory from the pool to be used by the p2m mapping.
771 * This action is irreversible: the p2m mapping only ever grows.
772 * That's OK because the p2m table only exists for translated domains,
773 * and those domains can't ever turn off shadow mode.
774 * Also, we only ever allocate a max-order chunk, so as to preserve
775 * the invariant that shadow_prealloc() always works.
776 * Returns 0 iff it can't get a chunk (the caller should then
777 * free up some pages in domheap and call set_sh_allocation);
778 * returns non-zero on success.
779 */
780 static int
781 shadow_alloc_p2m_pages(struct domain *d)
782 {
783 struct page_info *pg;
784 u32 i;
785 ASSERT(shadow_lock_is_acquired(d));
787 if ( d->arch.shadow.total_pages
788 < (shadow_min_acceptable_pages(d) + (1<<SHADOW_MAX_ORDER)) )
789 return 0; /* Not enough shadow memory: need to increase it first */
791 pg = mfn_to_page(shadow_alloc(d, PGC_SH_p2m_table, 0));
792 d->arch.shadow.p2m_pages += (1<<SHADOW_MAX_ORDER);
793 d->arch.shadow.total_pages -= (1<<SHADOW_MAX_ORDER);
794 for (i = 0; i < (1<<SHADOW_MAX_ORDER); i++)
795 {
796 /* Unlike shadow pages, mark p2m pages as owned by the domain.
797 * Marking the domain as the owner would normally allow the guest to
798 * create mappings of these pages, but these p2m pages will never be
799 * in the domain's guest-physical address space, and so that is not
800 * believed to be a concern.
801 */
802 page_set_owner(&pg[i], d);
803 list_add_tail(&pg[i].list, &d->arch.shadow.p2m_freelist);
804 }
805 return 1;
806 }
808 // Returns 0 if no memory is available...
809 mfn_t
810 shadow_alloc_p2m_page(struct domain *d)
811 {
812 struct list_head *entry;
813 mfn_t mfn;
814 void *p;
816 if ( list_empty(&d->arch.shadow.p2m_freelist) &&
817 !shadow_alloc_p2m_pages(d) )
818 return _mfn(0);
819 entry = d->arch.shadow.p2m_freelist.next;
820 list_del(entry);
821 list_add_tail(entry, &d->arch.shadow.p2m_inuse);
822 mfn = page_to_mfn(list_entry(entry, struct page_info, list));
823 sh_get_ref(mfn, 0);
824 p = sh_map_domain_page(mfn);
825 clear_page(p);
826 sh_unmap_domain_page(p);
828 return mfn;
829 }
831 #if CONFIG_PAGING_LEVELS == 3
832 static void p2m_install_entry_in_monitors(struct domain *d,
833 l3_pgentry_t *l3e)
834 /* Special case, only used for external-mode domains on PAE hosts:
835 * update the mapping of the p2m table. Once again, this is trivial in
836 * other paging modes (one top-level entry points to the top-level p2m,
837 * no maintenance needed), but PAE makes life difficult by needing a
838 * copy the eight l3es of the p2m table in eight l2h slots in the
839 * monitor table. This function makes fresh copies when a p2m l3e
840 * changes. */
841 {
842 l2_pgentry_t *ml2e;
843 struct vcpu *v;
844 unsigned int index;
846 index = ((unsigned long)l3e & ~PAGE_MASK) / sizeof(l3_pgentry_t);
847 ASSERT(index < MACHPHYS_MBYTES>>1);
849 for_each_vcpu(d, v)
850 {
851 if ( pagetable_get_pfn(v->arch.monitor_table) == 0 )
852 continue;
853 ASSERT(shadow_mode_external(v->domain));
855 SHADOW_DEBUG(P2M, "d=%u v=%u index=%u mfn=%#lx\n",
856 d->domain_id, v->vcpu_id, index, l3e_get_pfn(*l3e));
858 if ( v == current ) /* OK to use linear map of monitor_table */
859 ml2e = __linear_l2_table + l2_linear_offset(RO_MPT_VIRT_START);
860 else
861 {
862 l3_pgentry_t *ml3e;
863 ml3e = sh_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
864 ASSERT(l3e_get_flags(ml3e[3]) & _PAGE_PRESENT);
865 ml2e = sh_map_domain_page(_mfn(l3e_get_pfn(ml3e[3])));
866 ml2e += l2_table_offset(RO_MPT_VIRT_START);
867 sh_unmap_domain_page(ml3e);
868 }
869 ml2e[index] = l2e_from_pfn(l3e_get_pfn(*l3e), __PAGE_HYPERVISOR);
870 if ( v != current )
871 sh_unmap_domain_page(ml2e);
872 }
873 }
874 #endif
876 // Find the next level's P2M entry, checking for out-of-range gfn's...
877 // Returns NULL on error.
878 //
879 static l1_pgentry_t *
880 p2m_find_entry(void *table, unsigned long *gfn_remainder,
881 unsigned long gfn, u32 shift, u32 max)
882 {
883 u32 index;
885 index = *gfn_remainder >> shift;
886 if ( index >= max )
887 {
888 SHADOW_DEBUG(P2M, "gfn=0x%lx out of range "
889 "(gfn_remainder=0x%lx shift=%d index=0x%x max=0x%x)\n",
890 gfn, *gfn_remainder, shift, index, max);
891 return NULL;
892 }
893 *gfn_remainder &= (1 << shift) - 1;
894 return (l1_pgentry_t *)table + index;
895 }
897 // Walk one level of the P2M table, allocating a new table if required.
898 // Returns 0 on error.
899 //
900 static int
901 p2m_next_level(struct domain *d, mfn_t *table_mfn, void **table,
902 unsigned long *gfn_remainder, unsigned long gfn, u32 shift,
903 u32 max, unsigned long type)
904 {
905 l1_pgentry_t *p2m_entry;
906 void *next;
908 if ( !(p2m_entry = p2m_find_entry(*table, gfn_remainder, gfn,
909 shift, max)) )
910 return 0;
912 if ( !(l1e_get_flags(*p2m_entry) & _PAGE_PRESENT) )
913 {
914 mfn_t mfn = shadow_alloc_p2m_page(d);
915 if ( mfn_x(mfn) == 0 )
916 return 0;
917 *p2m_entry = l1e_from_pfn(mfn_x(mfn), __PAGE_HYPERVISOR|_PAGE_USER);
918 mfn_to_page(mfn)->u.inuse.type_info = type | 1 | PGT_validated;
919 mfn_to_page(mfn)->count_info = 1;
920 #if CONFIG_PAGING_LEVELS == 3
921 if (type == PGT_l2_page_table)
922 {
923 /* We have written to the p2m l3: need to sync the per-vcpu
924 * copies of it in the monitor tables */
925 p2m_install_entry_in_monitors(d, (l3_pgentry_t *)p2m_entry);
926 }
927 #endif
928 /* The P2M can be shadowed: keep the shadows synced */
929 if ( d->vcpu[0] )
930 (void)__shadow_validate_guest_entry(d->vcpu[0], *table_mfn,
931 p2m_entry, sizeof *p2m_entry);
932 }
933 *table_mfn = _mfn(l1e_get_pfn(*p2m_entry));
934 next = sh_map_domain_page(*table_mfn);
935 sh_unmap_domain_page(*table);
936 *table = next;
938 return 1;
939 }
941 // Returns 0 on error (out of memory)
942 int
943 shadow_set_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn)
944 {
945 // XXX -- this might be able to be faster iff current->domain == d
946 mfn_t table_mfn = pagetable_get_mfn(d->arch.phys_table);
947 void *table = sh_map_domain_page(table_mfn);
948 unsigned long gfn_remainder = gfn;
949 l1_pgentry_t *p2m_entry;
951 #if CONFIG_PAGING_LEVELS >= 4
952 if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn,
953 L4_PAGETABLE_SHIFT - PAGE_SHIFT,
954 L4_PAGETABLE_ENTRIES, PGT_l3_page_table) )
955 return 0;
956 #endif
957 #if CONFIG_PAGING_LEVELS >= 3
958 // When using PAE Xen, we only allow 33 bits of pseudo-physical
959 // address in translated guests (i.e. 8 GBytes). This restriction
960 // comes from wanting to map the P2M table into the 16MB RO_MPT hole
961 // in Xen's address space for translated PV guests.
962 //
963 if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn,
964 L3_PAGETABLE_SHIFT - PAGE_SHIFT,
965 (CONFIG_PAGING_LEVELS == 3
966 ? 8
967 : L3_PAGETABLE_ENTRIES),
968 PGT_l2_page_table) )
969 return 0;
970 #endif
971 if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn,
972 L2_PAGETABLE_SHIFT - PAGE_SHIFT,
973 L2_PAGETABLE_ENTRIES, PGT_l1_page_table) )
974 return 0;
976 p2m_entry = p2m_find_entry(table, &gfn_remainder, gfn,
977 0, L1_PAGETABLE_ENTRIES);
978 ASSERT(p2m_entry);
979 if ( valid_mfn(mfn) )
980 *p2m_entry = l1e_from_pfn(mfn_x(mfn), __PAGE_HYPERVISOR|_PAGE_USER);
981 else
982 *p2m_entry = l1e_empty();
984 /* The P2M can be shadowed: keep the shadows synced */
985 (void) __shadow_validate_guest_entry(d->vcpu[0], table_mfn,
986 p2m_entry, sizeof *p2m_entry);
988 sh_unmap_domain_page(table);
990 return 1;
991 }
993 // Allocate a new p2m table for a domain.
994 //
995 // The structure of the p2m table is that of a pagetable for xen (i.e. it is
996 // controlled by CONFIG_PAGING_LEVELS).
997 //
998 // Returns 0 if p2m table could not be initialized
999 //
1000 static int
1001 shadow_alloc_p2m_table(struct domain *d)
1003 mfn_t p2m_top;
1004 struct list_head *entry;
1005 unsigned int page_count = 0;
1007 SHADOW_PRINTK("allocating p2m table\n");
1008 ASSERT(pagetable_get_pfn(d->arch.phys_table) == 0);
1010 p2m_top = shadow_alloc_p2m_page(d);
1011 mfn_to_page(p2m_top)->count_info = 1;
1012 mfn_to_page(p2m_top)->u.inuse.type_info =
1013 #if CONFIG_PAGING_LEVELS == 4
1014 PGT_l4_page_table
1015 #elif CONFIG_PAGING_LEVELS == 3
1016 PGT_l3_page_table
1017 #elif CONFIG_PAGING_LEVELS == 2
1018 PGT_l2_page_table
1019 #endif
1020 | 1 | PGT_validated;
1022 if ( mfn_x(p2m_top) == 0 )
1023 return 0;
1025 d->arch.phys_table = pagetable_from_mfn(p2m_top);
1027 SHADOW_PRINTK("populating p2m table\n");
1029 for ( entry = d->page_list.next;
1030 entry != &d->page_list;
1031 entry = entry->next )
1033 struct page_info *page = list_entry(entry, struct page_info, list);
1034 mfn_t mfn = page_to_mfn(page);
1035 unsigned long gfn = get_gpfn_from_mfn(mfn_x(mfn));
1036 page_count++;
1037 if (
1038 #ifdef __x86_64__
1039 (gfn != 0x5555555555555555L)
1040 #else
1041 (gfn != 0x55555555L)
1042 #endif
1043 && gfn != INVALID_M2P_ENTRY
1044 && !shadow_set_p2m_entry(d, gfn, mfn) )
1046 SHADOW_PRINTK("failed to initialize p2m table, gfn=%05lx, mfn=%" SH_PRI_mfn "\n",
1047 gfn, mfn_x(mfn));
1048 return 0;
1052 SHADOW_PRINTK("p2m table initialised (%u pages)\n", page_count);
1053 return 1;
1056 mfn_t
1057 sh_gfn_to_mfn_foreign(struct domain *d, unsigned long gpfn)
1058 /* Read another domain's p2m entries */
1060 mfn_t mfn;
1061 unsigned long addr = gpfn << PAGE_SHIFT;
1062 l2_pgentry_t *l2e;
1063 l1_pgentry_t *l1e;
1065 ASSERT(shadow_mode_translate(d));
1066 mfn = pagetable_get_mfn(d->arch.phys_table);
1069 #if CONFIG_PAGING_LEVELS > 2
1070 if ( gpfn >= (RO_MPT_VIRT_END-RO_MPT_VIRT_START) / sizeof(l1_pgentry_t) )
1071 /* This pfn is higher than the p2m map can hold */
1072 return _mfn(INVALID_MFN);
1073 #endif
1076 #if CONFIG_PAGING_LEVELS >= 4
1078 l4_pgentry_t *l4e = sh_map_domain_page(mfn);
1079 l4e += l4_table_offset(addr);
1080 if ( (l4e_get_flags(*l4e) & _PAGE_PRESENT) == 0 )
1082 sh_unmap_domain_page(l4e);
1083 return _mfn(INVALID_MFN);
1085 mfn = _mfn(l4e_get_pfn(*l4e));
1086 sh_unmap_domain_page(l4e);
1088 #endif
1089 #if CONFIG_PAGING_LEVELS >= 3
1091 l3_pgentry_t *l3e = sh_map_domain_page(mfn);
1092 l3e += l3_table_offset(addr);
1093 if ( (l3e_get_flags(*l3e) & _PAGE_PRESENT) == 0 )
1095 sh_unmap_domain_page(l3e);
1096 return _mfn(INVALID_MFN);
1098 mfn = _mfn(l3e_get_pfn(*l3e));
1099 sh_unmap_domain_page(l3e);
1101 #endif
1103 l2e = sh_map_domain_page(mfn);
1104 l2e += l2_table_offset(addr);
1105 if ( (l2e_get_flags(*l2e) & _PAGE_PRESENT) == 0 )
1107 sh_unmap_domain_page(l2e);
1108 return _mfn(INVALID_MFN);
1110 mfn = _mfn(l2e_get_pfn(*l2e));
1111 sh_unmap_domain_page(l2e);
1113 l1e = sh_map_domain_page(mfn);
1114 l1e += l1_table_offset(addr);
1115 if ( (l1e_get_flags(*l1e) & _PAGE_PRESENT) == 0 )
1117 sh_unmap_domain_page(l1e);
1118 return _mfn(INVALID_MFN);
1120 mfn = _mfn(l1e_get_pfn(*l1e));
1121 sh_unmap_domain_page(l1e);
1123 return mfn;
1126 unsigned long
1127 shadow_gfn_to_mfn_foreign(unsigned long gpfn)
1129 return mfn_x(sh_gfn_to_mfn_foreign(current->domain, gpfn));
1133 static void shadow_p2m_teardown(struct domain *d)
1134 /* Return all the p2m pages to Xen.
1135 * We know we don't have any extra mappings to these pages */
1137 struct list_head *entry, *n;
1138 struct page_info *pg;
1140 d->arch.phys_table = pagetable_null();
1142 list_for_each_safe(entry, n, &d->arch.shadow.p2m_inuse)
1144 pg = list_entry(entry, struct page_info, list);
1145 list_del(entry);
1146 /* Should have just the one ref we gave it in alloc_p2m_page() */
1147 if ( (pg->count_info & PGC_SH_count_mask) != 1 )
1149 SHADOW_PRINTK("Odd p2m page count c=%#x t=%"PRtype_info"\n",
1150 pg->count_info, pg->u.inuse.type_info);
1152 ASSERT(page_get_owner(pg) == d);
1153 /* Free should not decrement domain's total allocation, since
1154 * these pages were allocated without an owner. */
1155 page_set_owner(pg, NULL);
1156 free_domheap_pages(pg, 0);
1157 d->arch.shadow.p2m_pages--;
1158 perfc_decr(shadow_alloc_count);
1160 list_for_each_safe(entry, n, &d->arch.shadow.p2m_freelist)
1162 list_del(entry);
1163 pg = list_entry(entry, struct page_info, list);
1164 ASSERT(page_get_owner(pg) == d);
1165 /* Free should not decrement domain's total allocation. */
1166 page_set_owner(pg, NULL);
1167 free_domheap_pages(pg, 0);
1168 d->arch.shadow.p2m_pages--;
1169 perfc_decr(shadow_alloc_count);
1171 ASSERT(d->arch.shadow.p2m_pages == 0);
1174 /* Set the pool of shadow pages to the required number of pages.
1175 * Input will be rounded up to at least shadow_min_acceptable_pages(),
1176 * plus space for the p2m table.
1177 * Returns 0 for success, non-zero for failure. */
1178 static unsigned int set_sh_allocation(struct domain *d,
1179 unsigned int pages,
1180 int *preempted)
1182 struct page_info *pg;
1183 unsigned int lower_bound;
1184 int j;
1186 ASSERT(shadow_lock_is_acquired(d));
1188 /* Don't allocate less than the minimum acceptable, plus one page per
1189 * megabyte of RAM (for the p2m table) */
1190 lower_bound = shadow_min_acceptable_pages(d) + (d->tot_pages / 256);
1191 if ( pages > 0 && pages < lower_bound )
1192 pages = lower_bound;
1193 /* Round up to largest block size */
1194 pages = (pages + ((1<<SHADOW_MAX_ORDER)-1)) & ~((1<<SHADOW_MAX_ORDER)-1);
1196 SHADOW_PRINTK("current %i target %i\n",
1197 d->arch.shadow.total_pages, pages);
1199 while ( d->arch.shadow.total_pages != pages )
1201 if ( d->arch.shadow.total_pages < pages )
1203 /* Need to allocate more memory from domheap */
1204 pg = alloc_domheap_pages(NULL, SHADOW_MAX_ORDER, 0);
1205 if ( pg == NULL )
1207 SHADOW_PRINTK("failed to allocate shadow pages.\n");
1208 return -ENOMEM;
1210 d->arch.shadow.free_pages += 1<<SHADOW_MAX_ORDER;
1211 d->arch.shadow.total_pages += 1<<SHADOW_MAX_ORDER;
1212 for ( j = 0; j < 1<<SHADOW_MAX_ORDER; j++ )
1214 pg[j].u.inuse.type_info = 0; /* Free page */
1215 pg[j].tlbflush_timestamp = 0; /* Not in any TLB */
1217 SH_SET_PFN_ORDER(pg, SHADOW_MAX_ORDER);
1218 list_add_tail(&pg->list,
1219 &d->arch.shadow.freelists[SHADOW_MAX_ORDER]);
1221 else if ( d->arch.shadow.total_pages > pages )
1223 /* Need to return memory to domheap */
1224 shadow_prealloc(d, SHADOW_MAX_ORDER);
1225 ASSERT(!list_empty(&d->arch.shadow.freelists[SHADOW_MAX_ORDER]));
1226 pg = list_entry(d->arch.shadow.freelists[SHADOW_MAX_ORDER].next,
1227 struct page_info, list);
1228 list_del(&pg->list);
1229 d->arch.shadow.free_pages -= 1<<SHADOW_MAX_ORDER;
1230 d->arch.shadow.total_pages -= 1<<SHADOW_MAX_ORDER;
1231 free_domheap_pages(pg, SHADOW_MAX_ORDER);
1234 /* Check to see if we need to yield and try again */
1235 if ( preempted && hypercall_preempt_check() )
1237 *preempted = 1;
1238 return 0;
1242 return 0;
1245 unsigned int shadow_set_allocation(struct domain *d,
1246 unsigned int megabytes,
1247 int *preempted)
1248 /* Hypercall interface to set the shadow memory allocation */
1250 unsigned int rv;
1251 shadow_lock(d);
1252 rv = set_sh_allocation(d, megabytes << (20 - PAGE_SHIFT), preempted);
1253 SHADOW_PRINTK("dom %u allocation now %u pages (%u MB)\n",
1254 d->domain_id,
1255 d->arch.shadow.total_pages,
1256 shadow_get_allocation(d));
1257 shadow_unlock(d);
1258 return rv;
1261 /**************************************************************************/
1262 /* Hash table for storing the guest->shadow mappings */
1264 /* Hash function that takes a gfn or mfn, plus another byte of type info */
1265 typedef u32 key_t;
1266 static inline key_t sh_hash(unsigned long n, u8 t)
1268 unsigned char *p = (unsigned char *)&n;
1269 key_t k = t;
1270 int i;
1271 for ( i = 0; i < sizeof(n) ; i++ ) k = (u32)p[i] + (k<<6) + (k<<16) - k;
1272 return k;
1275 #if SHADOW_AUDIT & (SHADOW_AUDIT_HASH|SHADOW_AUDIT_HASH_FULL)
1277 /* Before we get to the mechanism, define a pair of audit functions
1278 * that sanity-check the contents of the hash table. */
1279 static void sh_hash_audit_bucket(struct domain *d, int bucket)
1280 /* Audit one bucket of the hash table */
1282 struct shadow_hash_entry *e, *x;
1283 struct page_info *pg;
1285 if ( !(SHADOW_AUDIT_ENABLE) )
1286 return;
1288 e = &d->arch.shadow.hash_table[bucket];
1289 if ( e->t == 0 ) return; /* Bucket is empty */
1290 while ( e )
1292 /* Empty link? */
1293 BUG_ON( e->t == 0 );
1294 /* Bogus type? */
1295 BUG_ON( e->t > (PGC_SH_max_shadow >> PGC_SH_type_shift) );
1296 /* Wrong bucket? */
1297 BUG_ON( sh_hash(e->n, e->t) % SHADOW_HASH_BUCKETS != bucket );
1298 /* Duplicate entry? */
1299 for ( x = e->next; x; x = x->next )
1300 BUG_ON( x->n == e->n && x->t == e->t );
1301 /* Bogus MFN? */
1302 BUG_ON( !valid_mfn(e->smfn) );
1303 pg = mfn_to_page(e->smfn);
1304 /* Not a shadow? */
1305 BUG_ON( page_get_owner(pg) != 0 );
1306 /* Wrong kind of shadow? */
1307 BUG_ON( (pg->count_info & PGC_SH_type_mask) >> PGC_SH_type_shift
1308 != e->t );
1309 /* Bad backlink? */
1310 BUG_ON( pg->u.inuse.type_info != e->n );
1311 if ( e->t != (PGC_SH_fl1_32_shadow >> PGC_SH_type_shift)
1312 && e->t != (PGC_SH_fl1_pae_shadow >> PGC_SH_type_shift)
1313 && e->t != (PGC_SH_fl1_64_shadow >> PGC_SH_type_shift) )
1315 /* Bad shadow flags on guest page? */
1316 BUG_ON( !(mfn_to_page(_mfn(e->n))->shadow_flags & (1<<e->t)) );
1318 /* That entry was OK; on we go */
1319 e = e->next;
1323 #else
1324 #define sh_hash_audit_bucket(_d, _b)
1325 #endif /* Hashtable bucket audit */
1328 #if SHADOW_AUDIT & SHADOW_AUDIT_HASH_FULL
1330 static void sh_hash_audit(struct domain *d)
1331 /* Full audit: audit every bucket in the table */
1333 int i;
1335 if ( !(SHADOW_AUDIT_ENABLE) )
1336 return;
1338 for ( i = 0; i < SHADOW_HASH_BUCKETS; i++ )
1340 sh_hash_audit_bucket(d, i);
1344 #else
1345 #define sh_hash_audit(_d)
1346 #endif /* Hashtable bucket audit */
1348 /* Memory management interface for bucket allocation.
1349 * These ought to come out of shadow memory, but at least on 32-bit
1350 * machines we are forced to allocate them from xenheap so that we can
1351 * address them. */
1352 static struct shadow_hash_entry *sh_alloc_hash_entry(struct domain *d)
1354 struct shadow_hash_entry *extra, *x;
1355 int i;
1357 /* We need to allocate a new node. Ensure the free list is not empty.
1358 * Allocate new entries in units the same size as the original table. */
1359 if ( unlikely(d->arch.shadow.hash_freelist == NULL) )
1361 size_t sz = sizeof(void *) + (SHADOW_HASH_BUCKETS * sizeof(*x));
1362 extra = xmalloc_bytes(sz);
1364 if ( extra == NULL )
1366 /* No memory left! */
1367 SHADOW_ERROR("xmalloc() failed when allocating hash buckets.\n");
1368 domain_crash_synchronous();
1370 memset(extra, 0, sz);
1372 /* Record the allocation block so it can be correctly freed later. */
1373 *((struct shadow_hash_entry **)&extra[SHADOW_HASH_BUCKETS]) =
1374 d->arch.shadow.hash_allocations;
1375 d->arch.shadow.hash_allocations = &extra[0];
1377 /* Thread a free chain through the newly-allocated nodes. */
1378 for ( i = 0; i < (SHADOW_HASH_BUCKETS - 1); i++ )
1379 extra[i].next = &extra[i+1];
1380 extra[i].next = NULL;
1382 /* Add the new nodes to the free list. */
1383 d->arch.shadow.hash_freelist = &extra[0];
1386 /* Allocate a new node from the free list. */
1387 x = d->arch.shadow.hash_freelist;
1388 d->arch.shadow.hash_freelist = x->next;
1389 return x;
1392 static void sh_free_hash_entry(struct domain *d, struct shadow_hash_entry *e)
1394 /* Mark the bucket as empty and return it to the free list */
1395 e->t = 0;
1396 e->next = d->arch.shadow.hash_freelist;
1397 d->arch.shadow.hash_freelist = e;
1401 /* Allocate and initialise the table itself.
1402 * Returns 0 for success, 1 for error. */
1403 static int shadow_hash_alloc(struct domain *d)
1405 struct shadow_hash_entry *table;
1407 ASSERT(shadow_lock_is_acquired(d));
1408 ASSERT(!d->arch.shadow.hash_table);
1410 table = xmalloc_array(struct shadow_hash_entry, SHADOW_HASH_BUCKETS);
1411 if ( !table ) return 1;
1412 memset(table, 0,
1413 SHADOW_HASH_BUCKETS * sizeof (struct shadow_hash_entry));
1414 d->arch.shadow.hash_table = table;
1415 return 0;
1418 /* Tear down the hash table and return all memory to Xen.
1419 * This function does not care whether the table is populated. */
1420 static void shadow_hash_teardown(struct domain *d)
1422 struct shadow_hash_entry *a, *n;
1424 ASSERT(shadow_lock_is_acquired(d));
1425 ASSERT(d->arch.shadow.hash_table);
1427 /* Return the table itself */
1428 xfree(d->arch.shadow.hash_table);
1429 d->arch.shadow.hash_table = NULL;
1431 /* Return any extra allocations */
1432 a = d->arch.shadow.hash_allocations;
1433 while ( a )
1435 /* We stored a linked-list pointer at the end of each allocation */
1436 n = *((struct shadow_hash_entry **)(&a[SHADOW_HASH_BUCKETS]));
1437 xfree(a);
1438 a = n;
1440 d->arch.shadow.hash_allocations = NULL;
1441 d->arch.shadow.hash_freelist = NULL;
1445 mfn_t shadow_hash_lookup(struct vcpu *v, unsigned long n, u8 t)
1446 /* Find an entry in the hash table. Returns the MFN of the shadow,
1447 * or INVALID_MFN if it doesn't exist */
1449 struct domain *d = v->domain;
1450 struct shadow_hash_entry *p, *x, *head;
1451 key_t key;
1453 ASSERT(shadow_lock_is_acquired(d));
1454 ASSERT(d->arch.shadow.hash_table);
1455 ASSERT(t);
1457 sh_hash_audit(d);
1459 perfc_incrc(shadow_hash_lookups);
1460 key = sh_hash(n, t);
1462 x = head = &d->arch.shadow.hash_table[key % SHADOW_HASH_BUCKETS];
1463 p = NULL;
1465 sh_hash_audit_bucket(d, key % SHADOW_HASH_BUCKETS);
1467 do
1469 ASSERT(x->t || ((x == head) && (x->next == NULL)));
1471 if ( x->n == n && x->t == t )
1473 /* Pull-to-front if 'x' isn't already the head item */
1474 if ( unlikely(x != head) )
1476 if ( unlikely(d->arch.shadow.hash_walking != 0) )
1477 /* Can't reorder: someone is walking the hash chains */
1478 return x->smfn;
1479 else
1481 /* Delete 'x' from list and reinsert after head. */
1482 p->next = x->next;
1483 x->next = head->next;
1484 head->next = x;
1486 /* Swap 'x' contents with head contents. */
1487 SWAP(head->n, x->n);
1488 SWAP(head->t, x->t);
1489 SWAP(head->smfn, x->smfn);
1492 else
1494 perfc_incrc(shadow_hash_lookup_head);
1496 return head->smfn;
1499 p = x;
1500 x = x->next;
1502 while ( x != NULL );
1504 perfc_incrc(shadow_hash_lookup_miss);
1505 return _mfn(INVALID_MFN);
1508 void shadow_hash_insert(struct vcpu *v, unsigned long n, u8 t, mfn_t smfn)
1509 /* Put a mapping (n,t)->smfn into the hash table */
1511 struct domain *d = v->domain;
1512 struct shadow_hash_entry *x, *head;
1513 key_t key;
1515 ASSERT(shadow_lock_is_acquired(d));
1516 ASSERT(d->arch.shadow.hash_table);
1517 ASSERT(t);
1519 sh_hash_audit(d);
1521 perfc_incrc(shadow_hash_inserts);
1522 key = sh_hash(n, t);
1524 head = &d->arch.shadow.hash_table[key % SHADOW_HASH_BUCKETS];
1526 sh_hash_audit_bucket(d, key % SHADOW_HASH_BUCKETS);
1528 /* If the bucket is empty then insert the new page as the head item. */
1529 if ( head->t == 0 )
1531 head->n = n;
1532 head->t = t;
1533 head->smfn = smfn;
1534 ASSERT(head->next == NULL);
1536 else
1538 /* Insert a new entry directly after the head item. */
1539 x = sh_alloc_hash_entry(d);
1540 x->n = n;
1541 x->t = t;
1542 x->smfn = smfn;
1543 x->next = head->next;
1544 head->next = x;
1547 sh_hash_audit_bucket(d, key % SHADOW_HASH_BUCKETS);
1550 void shadow_hash_delete(struct vcpu *v, unsigned long n, u8 t, mfn_t smfn)
1551 /* Excise the mapping (n,t)->smfn from the hash table */
1553 struct domain *d = v->domain;
1554 struct shadow_hash_entry *p, *x, *head;
1555 key_t key;
1557 ASSERT(shadow_lock_is_acquired(d));
1558 ASSERT(d->arch.shadow.hash_table);
1559 ASSERT(t);
1561 sh_hash_audit(d);
1563 perfc_incrc(shadow_hash_deletes);
1564 key = sh_hash(n, t);
1566 head = &d->arch.shadow.hash_table[key % SHADOW_HASH_BUCKETS];
1568 sh_hash_audit_bucket(d, key % SHADOW_HASH_BUCKETS);
1570 /* Match on head item? */
1571 if ( head->n == n && head->t == t )
1573 if ( (x = head->next) != NULL )
1575 /* Overwrite head with contents of following node. */
1576 head->n = x->n;
1577 head->t = x->t;
1578 head->smfn = x->smfn;
1580 /* Delete following node. */
1581 head->next = x->next;
1582 sh_free_hash_entry(d, x);
1584 else
1586 /* This bucket is now empty. Initialise the head node. */
1587 head->t = 0;
1590 else
1592 /* Not at the head; need to walk the chain */
1593 p = head;
1594 x = head->next;
1596 while(1)
1598 ASSERT(x); /* We can't have hit the end, since our target is
1599 * still in the chain somehwere... */
1600 if ( x->n == n && x->t == t )
1602 /* Delete matching node. */
1603 p->next = x->next;
1604 sh_free_hash_entry(d, x);
1605 break;
1607 p = x;
1608 x = x->next;
1612 sh_hash_audit_bucket(d, key % SHADOW_HASH_BUCKETS);
1615 typedef int (*hash_callback_t)(struct vcpu *v, mfn_t smfn, mfn_t other_mfn);
1617 static void hash_foreach(struct vcpu *v,
1618 unsigned int callback_mask,
1619 hash_callback_t callbacks[],
1620 mfn_t callback_mfn)
1621 /* Walk the hash table looking at the types of the entries and
1622 * calling the appropriate callback function for each entry.
1623 * The mask determines which shadow types we call back for, and the array
1624 * of callbacks tells us which function to call.
1625 * Any callback may return non-zero to let us skip the rest of the scan.
1627 * WARNING: Callbacks MUST NOT add or remove hash entries unless they
1628 * then return non-zero to terminate the scan. */
1630 int i, done = 0;
1631 struct domain *d = v->domain;
1632 struct shadow_hash_entry *x;
1634 /* Say we're here, to stop hash-lookups reordering the chains */
1635 ASSERT(shadow_lock_is_acquired(d));
1636 ASSERT(d->arch.shadow.hash_walking == 0);
1637 d->arch.shadow.hash_walking = 1;
1639 callback_mask &= ~1; /* Never attempt to call back on empty buckets */
1640 for ( i = 0; i < SHADOW_HASH_BUCKETS; i++ )
1642 /* WARNING: This is not safe against changes to the hash table.
1643 * The callback *must* return non-zero if it has inserted or
1644 * deleted anything from the hash (lookups are OK, though). */
1645 for ( x = &d->arch.shadow.hash_table[i]; x; x = x->next )
1647 if ( callback_mask & (1 << x->t) )
1649 ASSERT(x->t <= 15);
1650 ASSERT(callbacks[x->t] != NULL);
1651 if ( (done = callbacks[x->t](v, x->smfn, callback_mfn)) != 0 )
1652 break;
1655 if ( done ) break;
1657 d->arch.shadow.hash_walking = 0;
1661 /**************************************************************************/
1662 /* Destroy a shadow page: simple dispatcher to call the per-type destructor
1663 * which will decrement refcounts appropriately and return memory to the
1664 * free pool. */
1666 void sh_destroy_shadow(struct vcpu *v, mfn_t smfn)
1668 struct page_info *pg = mfn_to_page(smfn);
1669 u32 t = pg->count_info & PGC_SH_type_mask;
1672 SHADOW_PRINTK("smfn=%#lx\n", mfn_x(smfn));
1674 /* Double-check, if we can, that the shadowed page belongs to this
1675 * domain, (by following the back-pointer). */
1676 ASSERT(t == PGC_SH_fl1_32_shadow ||
1677 t == PGC_SH_fl1_pae_shadow ||
1678 t == PGC_SH_fl1_64_shadow ||
1679 t == PGC_SH_monitor_table ||
1680 (page_get_owner(mfn_to_page(_mfn(pg->u.inuse.type_info)))
1681 == v->domain));
1683 /* The down-shifts here are so that the switch statement is on nice
1684 * small numbers that the compiler will enjoy */
1685 switch ( t >> PGC_SH_type_shift )
1687 #if CONFIG_PAGING_LEVELS == 2
1688 case PGC_SH_l1_32_shadow >> PGC_SH_type_shift:
1689 case PGC_SH_fl1_32_shadow >> PGC_SH_type_shift:
1690 SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 2, 2)(v, smfn);
1691 break;
1692 case PGC_SH_l2_32_shadow >> PGC_SH_type_shift:
1693 SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 2, 2)(v, smfn);
1694 break;
1695 #else /* PAE or 64bit */
1696 case PGC_SH_l1_32_shadow >> PGC_SH_type_shift:
1697 case PGC_SH_fl1_32_shadow >> PGC_SH_type_shift:
1698 SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 3, 2)(v, smfn);
1699 break;
1700 case PGC_SH_l2_32_shadow >> PGC_SH_type_shift:
1701 SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 3, 2)(v, smfn);
1702 break;
1703 #endif
1705 #if CONFIG_PAGING_LEVELS >= 3
1706 case PGC_SH_l1_pae_shadow >> PGC_SH_type_shift:
1707 case PGC_SH_fl1_pae_shadow >> PGC_SH_type_shift:
1708 SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 3, 3)(v, smfn);
1709 break;
1710 case PGC_SH_l2_pae_shadow >> PGC_SH_type_shift:
1711 case PGC_SH_l2h_pae_shadow >> PGC_SH_type_shift:
1712 SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 3, 3)(v, smfn);
1713 break;
1714 case PGC_SH_l3_pae_shadow >> PGC_SH_type_shift:
1715 SHADOW_INTERNAL_NAME(sh_destroy_l3_shadow, 3, 3)(v, smfn);
1716 break;
1717 #endif
1719 #if CONFIG_PAGING_LEVELS >= 4
1720 case PGC_SH_l1_64_shadow >> PGC_SH_type_shift:
1721 case PGC_SH_fl1_64_shadow >> PGC_SH_type_shift:
1722 SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 4, 4)(v, smfn);
1723 break;
1724 case PGC_SH_l2_64_shadow >> PGC_SH_type_shift:
1725 SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 4, 4)(v, smfn);
1726 break;
1727 case PGC_SH_l3_64_shadow >> PGC_SH_type_shift:
1728 SHADOW_INTERNAL_NAME(sh_destroy_l3_shadow, 4, 4)(v, smfn);
1729 break;
1730 case PGC_SH_l4_64_shadow >> PGC_SH_type_shift:
1731 SHADOW_INTERNAL_NAME(sh_destroy_l4_shadow, 4, 4)(v, smfn);
1732 break;
1733 #endif
1734 default:
1735 SHADOW_PRINTK("tried to destroy shadow of bad type %08lx\n",
1736 (unsigned long)t);
1737 BUG();
1741 /**************************************************************************/
1742 /* Remove all writeable mappings of a guest frame from the shadow tables
1743 * Returns non-zero if we need to flush TLBs.
1744 * level and fault_addr desribe how we found this to be a pagetable;
1745 * level==0 means we have some other reason for revoking write access.*/
1747 int shadow_remove_write_access(struct vcpu *v, mfn_t gmfn,
1748 unsigned int level,
1749 unsigned long fault_addr)
1751 /* Dispatch table for getting per-type functions */
1752 static hash_callback_t callbacks[16] = {
1753 NULL, /* none */
1754 #if CONFIG_PAGING_LEVELS == 2
1755 SHADOW_INTERNAL_NAME(sh_remove_write_access,2,2), /* l1_32 */
1756 SHADOW_INTERNAL_NAME(sh_remove_write_access,2,2), /* fl1_32 */
1757 #else
1758 SHADOW_INTERNAL_NAME(sh_remove_write_access,3,2), /* l1_32 */
1759 SHADOW_INTERNAL_NAME(sh_remove_write_access,3,2), /* fl1_32 */
1760 #endif
1761 NULL, /* l2_32 */
1762 #if CONFIG_PAGING_LEVELS >= 3
1763 SHADOW_INTERNAL_NAME(sh_remove_write_access,3,3), /* l1_pae */
1764 SHADOW_INTERNAL_NAME(sh_remove_write_access,3,3), /* fl1_pae */
1765 #else
1766 NULL, /* l1_pae */
1767 NULL, /* fl1_pae */
1768 #endif
1769 NULL, /* l2_pae */
1770 NULL, /* l2h_pae */
1771 NULL, /* l3_pae */
1772 #if CONFIG_PAGING_LEVELS >= 4
1773 SHADOW_INTERNAL_NAME(sh_remove_write_access,4,4), /* l1_64 */
1774 SHADOW_INTERNAL_NAME(sh_remove_write_access,4,4), /* fl1_64 */
1775 #else
1776 NULL, /* l1_64 */
1777 NULL, /* fl1_64 */
1778 #endif
1779 NULL, /* l2_64 */
1780 NULL, /* l3_64 */
1781 NULL, /* l4_64 */
1782 NULL, /* p2m */
1783 NULL /* unused */
1784 };
1786 static unsigned int callback_mask =
1787 1 << (PGC_SH_l1_32_shadow >> PGC_SH_type_shift)
1788 | 1 << (PGC_SH_fl1_32_shadow >> PGC_SH_type_shift)
1789 | 1 << (PGC_SH_l1_pae_shadow >> PGC_SH_type_shift)
1790 | 1 << (PGC_SH_fl1_pae_shadow >> PGC_SH_type_shift)
1791 | 1 << (PGC_SH_l1_64_shadow >> PGC_SH_type_shift)
1792 | 1 << (PGC_SH_fl1_64_shadow >> PGC_SH_type_shift)
1794 struct page_info *pg = mfn_to_page(gmfn);
1796 ASSERT(shadow_lock_is_acquired(v->domain));
1798 /* Only remove writable mappings if we are doing shadow refcounts.
1799 * In guest refcounting, we trust Xen to already be restricting
1800 * all the writes to the guest page tables, so we do not need to
1801 * do more. */
1802 if ( !shadow_mode_refcounts(v->domain) )
1803 return 0;
1805 /* Early exit if it's already a pagetable, or otherwise not writeable */
1806 if ( sh_mfn_is_a_page_table(gmfn)
1807 || (pg->u.inuse.type_info & PGT_count_mask) == 0 )
1808 return 0;
1810 perfc_incrc(shadow_writeable);
1812 /* If this isn't a "normal" writeable page, the domain is trying to
1813 * put pagetables in special memory of some kind. We can't allow that. */
1814 if ( (pg->u.inuse.type_info & PGT_type_mask) != PGT_writable_page )
1816 SHADOW_ERROR("can't remove write access to mfn %lx, type_info is %"
1817 PRtype_info "\n",
1818 mfn_x(gmfn), mfn_to_page(gmfn)->u.inuse.type_info);
1819 domain_crash(v->domain);
1822 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
1823 if ( v == current && level != 0 )
1825 unsigned long gfn;
1826 /* Heuristic: there is likely to be only one writeable mapping,
1827 * and that mapping is likely to be in the current pagetable,
1828 * either in the guest's linear map (linux, windows) or in a
1829 * magic slot used to map high memory regions (linux HIGHTPTE) */
1831 #define GUESS(_a, _h) do { \
1832 if ( v->arch.shadow.mode->guess_wrmap(v, (_a), gmfn) ) \
1833 perfc_incrc(shadow_writeable_h_ ## _h); \
1834 if ( (pg->u.inuse.type_info & PGT_count_mask) == 0 ) \
1835 return 1; \
1836 } while (0)
1839 if ( v->arch.shadow.mode->guest_levels == 2 )
1841 if ( level == 1 )
1842 /* 32bit non-PAE w2k3: linear map at 0xC0000000 */
1843 GUESS(0xC0000000UL + (fault_addr >> 10), 1);
1845 /* Linux lowmem: first 896MB is mapped 1-to-1 above 0xC0000000 */
1846 if ((gfn = sh_mfn_to_gfn(v->domain, gmfn)) < 0x38000 )
1847 GUESS(0xC0000000UL + (gfn << PAGE_SHIFT), 4);
1850 #if CONFIG_PAGING_LEVELS >= 3
1851 else if ( v->arch.shadow.mode->guest_levels == 3 )
1853 /* 32bit PAE w2k3: linear map at 0xC0000000 */
1854 switch ( level )
1856 case 1: GUESS(0xC0000000UL + (fault_addr >> 9), 2); break;
1857 case 2: GUESS(0xC0600000UL + (fault_addr >> 18), 2); break;
1860 /* Linux lowmem: first 896MB is mapped 1-to-1 above 0xC0000000 */
1861 if ((gfn = sh_mfn_to_gfn(v->domain, gmfn)) < 0x38000 )
1862 GUESS(0xC0000000UL + (gfn << PAGE_SHIFT), 4);
1864 #if CONFIG_PAGING_LEVELS >= 4
1865 else if ( v->arch.shadow.mode->guest_levels == 4 )
1867 /* 64bit w2k3: linear map at 0x0000070000000000 */
1868 switch ( level )
1870 case 1: GUESS(0x70000000000UL + (fault_addr >> 9), 3); break;
1871 case 2: GUESS(0x70380000000UL + (fault_addr >> 18), 3); break;
1872 case 3: GUESS(0x70381C00000UL + (fault_addr >> 27), 3); break;
1875 /* Linux direct map at 0xffff810000000000 */
1876 gfn = sh_mfn_to_gfn(v->domain, gmfn);
1877 GUESS(0xffff810000000000UL + (gfn << PAGE_SHIFT), 4);
1879 #endif /* CONFIG_PAGING_LEVELS >= 4 */
1880 #endif /* CONFIG_PAGING_LEVELS >= 3 */
1882 #undef GUESS
1885 #endif
1887 /* Brute-force search of all the shadows, by walking the hash */
1888 perfc_incrc(shadow_writeable_bf);
1889 hash_foreach(v, callback_mask, callbacks, gmfn);
1891 /* If that didn't catch the mapping, something is very wrong */
1892 if ( (mfn_to_page(gmfn)->u.inuse.type_info & PGT_count_mask) != 0 )
1894 SHADOW_ERROR("can't find all writeable mappings of mfn %lx: "
1895 "%lu left\n", mfn_x(gmfn),
1896 (mfn_to_page(gmfn)->u.inuse.type_info&PGT_count_mask));
1897 domain_crash(v->domain);
1900 /* We killed at least one writeable mapping, so must flush TLBs. */
1901 return 1;
1906 /**************************************************************************/
1907 /* Remove all mappings of a guest frame from the shadow tables.
1908 * Returns non-zero if we need to flush TLBs. */
1910 int shadow_remove_all_mappings(struct vcpu *v, mfn_t gmfn)
1912 struct page_info *page = mfn_to_page(gmfn);
1913 int expected_count;
1915 /* Dispatch table for getting per-type functions */
1916 static hash_callback_t callbacks[16] = {
1917 NULL, /* none */
1918 #if CONFIG_PAGING_LEVELS == 2
1919 SHADOW_INTERNAL_NAME(sh_remove_all_mappings,2,2), /* l1_32 */
1920 SHADOW_INTERNAL_NAME(sh_remove_all_mappings,2,2), /* fl1_32 */
1921 #else
1922 SHADOW_INTERNAL_NAME(sh_remove_all_mappings,3,2), /* l1_32 */
1923 SHADOW_INTERNAL_NAME(sh_remove_all_mappings,3,2), /* fl1_32 */
1924 #endif
1925 NULL, /* l2_32 */
1926 #if CONFIG_PAGING_LEVELS >= 3
1927 SHADOW_INTERNAL_NAME(sh_remove_all_mappings,3,3), /* l1_pae */
1928 SHADOW_INTERNAL_NAME(sh_remove_all_mappings,3,3), /* fl1_pae */
1929 #else
1930 NULL, /* l1_pae */
1931 NULL, /* fl1_pae */
1932 #endif
1933 NULL, /* l2_pae */
1934 NULL, /* l2h_pae */
1935 NULL, /* l3_pae */
1936 #if CONFIG_PAGING_LEVELS >= 4
1937 SHADOW_INTERNAL_NAME(sh_remove_all_mappings,4,4), /* l1_64 */
1938 SHADOW_INTERNAL_NAME(sh_remove_all_mappings,4,4), /* fl1_64 */
1939 #else
1940 NULL, /* l1_64 */
1941 NULL, /* fl1_64 */
1942 #endif
1943 NULL, /* l2_64 */
1944 NULL, /* l3_64 */
1945 NULL, /* l4_64 */
1946 NULL, /* p2m */
1947 NULL /* unused */
1948 };
1950 static unsigned int callback_mask =
1951 1 << (PGC_SH_l1_32_shadow >> PGC_SH_type_shift)
1952 | 1 << (PGC_SH_fl1_32_shadow >> PGC_SH_type_shift)
1953 | 1 << (PGC_SH_l1_pae_shadow >> PGC_SH_type_shift)
1954 | 1 << (PGC_SH_fl1_pae_shadow >> PGC_SH_type_shift)
1955 | 1 << (PGC_SH_l1_64_shadow >> PGC_SH_type_shift)
1956 | 1 << (PGC_SH_fl1_64_shadow >> PGC_SH_type_shift)
1959 perfc_incrc(shadow_mappings);
1960 if ( (page->count_info & PGC_count_mask) == 0 )
1961 return 0;
1963 ASSERT(shadow_lock_is_acquired(v->domain));
1965 /* XXX TODO:
1966 * Heuristics for finding the (probably) single mapping of this gmfn */
1968 /* Brute-force search of all the shadows, by walking the hash */
1969 perfc_incrc(shadow_mappings_bf);
1970 hash_foreach(v, callback_mask, callbacks, gmfn);
1972 /* If that didn't catch the mapping, something is very wrong */
1973 expected_count = (page->count_info & PGC_allocated) ? 1 : 0;
1974 if ( (page->count_info & PGC_count_mask) != expected_count )
1976 /* Don't complain if we're in HVM and there's one extra mapping:
1977 * The qemu helper process has an untyped mapping of this dom's RAM */
1978 if ( !(shadow_mode_external(v->domain)
1979 && (page->count_info & PGC_count_mask) <= 2
1980 && (page->u.inuse.type_info & PGT_count_mask) == 0) )
1982 SHADOW_ERROR("can't find all mappings of mfn %lx: "
1983 "c=%08x t=%08lx\n", mfn_x(gmfn),
1984 page->count_info, page->u.inuse.type_info);
1988 /* We killed at least one mapping, so must flush TLBs. */
1989 return 1;
1993 /**************************************************************************/
1994 /* Remove all shadows of a guest frame from the shadow tables */
1996 static int sh_remove_shadow_via_pointer(struct vcpu *v, mfn_t smfn)
1997 /* Follow this shadow's up-pointer, if it has one, and remove the reference
1998 * found there. Returns 1 if that was the only reference to this shadow */
2000 struct page_info *pg = mfn_to_page(smfn);
2001 mfn_t pmfn;
2002 void *vaddr;
2003 int rc;
2005 ASSERT((pg->count_info & PGC_SH_type_mask) > 0);
2006 ASSERT((pg->count_info & PGC_SH_type_mask) < PGC_SH_max_shadow);
2007 ASSERT((pg->count_info & PGC_SH_type_mask) != PGC_SH_l2_32_shadow);
2008 ASSERT((pg->count_info & PGC_SH_type_mask) != PGC_SH_l3_pae_shadow);
2009 ASSERT((pg->count_info & PGC_SH_type_mask) != PGC_SH_l4_64_shadow);
2011 if (pg->up == 0) return 0;
2012 pmfn = _mfn(pg->up >> PAGE_SHIFT);
2013 ASSERT(valid_mfn(pmfn));
2014 vaddr = sh_map_domain_page(pmfn);
2015 ASSERT(vaddr);
2016 vaddr += pg->up & (PAGE_SIZE-1);
2017 ASSERT(l1e_get_pfn(*(l1_pgentry_t *)vaddr) == mfn_x(smfn));
2019 /* Is this the only reference to this shadow? */
2020 rc = ((pg->count_info & PGC_SH_count_mask) == 1) ? 1 : 0;
2022 /* Blank the offending entry */
2023 switch ((pg->count_info & PGC_SH_type_mask))
2025 case PGC_SH_l1_32_shadow:
2026 case PGC_SH_l2_32_shadow:
2027 #if CONFIG_PAGING_LEVELS == 2
2028 SHADOW_INTERNAL_NAME(sh_clear_shadow_entry,2,2)(v, vaddr, pmfn);
2029 #else
2030 SHADOW_INTERNAL_NAME(sh_clear_shadow_entry,3,2)(v, vaddr, pmfn);
2031 #endif
2032 break;
2033 #if CONFIG_PAGING_LEVELS >=3
2034 case PGC_SH_l1_pae_shadow:
2035 case PGC_SH_l2_pae_shadow:
2036 case PGC_SH_l2h_pae_shadow:
2037 case PGC_SH_l3_pae_shadow:
2038 SHADOW_INTERNAL_NAME(sh_clear_shadow_entry,3,3)(v, vaddr, pmfn);
2039 break;
2040 #if CONFIG_PAGING_LEVELS >= 4
2041 case PGC_SH_l1_64_shadow:
2042 case PGC_SH_l2_64_shadow:
2043 case PGC_SH_l3_64_shadow:
2044 case PGC_SH_l4_64_shadow:
2045 SHADOW_INTERNAL_NAME(sh_clear_shadow_entry,4,4)(v, vaddr, pmfn);
2046 break;
2047 #endif
2048 #endif
2049 default: BUG(); /* Some wierd unknown shadow type */
2052 sh_unmap_domain_page(vaddr);
2053 if ( rc )
2054 perfc_incrc(shadow_up_pointer);
2055 else
2056 perfc_incrc(shadow_unshadow_bf);
2058 return rc;
2061 void sh_remove_shadows(struct vcpu *v, mfn_t gmfn, int all)
2062 /* Remove the shadows of this guest page.
2063 * If all != 0, find all shadows, if necessary by walking the tables.
2064 * Otherwise, just try the (much faster) heuristics, which will remove
2065 * at most one reference to each shadow of the page. */
2067 struct page_info *pg;
2068 mfn_t smfn;
2069 u32 sh_flags;
2070 unsigned char t;
2072 /* Dispatch table for getting per-type functions: each level must
2073 * be called with the function to remove a lower-level shadow. */
2074 static hash_callback_t callbacks[16] = {
2075 NULL, /* none */
2076 NULL, /* l1_32 */
2077 NULL, /* fl1_32 */
2078 #if CONFIG_PAGING_LEVELS == 2
2079 SHADOW_INTERNAL_NAME(sh_remove_l1_shadow,2,2), /* l2_32 */
2080 #else
2081 SHADOW_INTERNAL_NAME(sh_remove_l1_shadow,3,2), /* l2_32 */
2082 #endif
2083 NULL, /* l1_pae */
2084 NULL, /* fl1_pae */
2085 #if CONFIG_PAGING_LEVELS >= 3
2086 SHADOW_INTERNAL_NAME(sh_remove_l1_shadow,3,3), /* l2_pae */
2087 SHADOW_INTERNAL_NAME(sh_remove_l1_shadow,3,3), /* l2h_pae */
2088 SHADOW_INTERNAL_NAME(sh_remove_l2_shadow,3,3), /* l3_pae */
2089 #else
2090 NULL, /* l2_pae */
2091 NULL, /* l2h_pae */
2092 NULL, /* l3_pae */
2093 #endif
2094 NULL, /* l1_64 */
2095 NULL, /* fl1_64 */
2096 #if CONFIG_PAGING_LEVELS >= 4
2097 SHADOW_INTERNAL_NAME(sh_remove_l1_shadow,4,4), /* l2_64 */
2098 SHADOW_INTERNAL_NAME(sh_remove_l2_shadow,4,4), /* l3_64 */
2099 SHADOW_INTERNAL_NAME(sh_remove_l3_shadow,4,4), /* l4_64 */
2100 #else
2101 NULL, /* l2_64 */
2102 NULL, /* l3_64 */
2103 NULL, /* l4_64 */
2104 #endif
2105 NULL, /* p2m */
2106 NULL /* unused */
2107 };
2109 /* Another lookup table, for choosing which mask to use */
2110 static unsigned int masks[16] = {
2111 0, /* none */
2112 1 << (PGC_SH_l2_32_shadow >> PGC_SH_type_shift), /* l1_32 */
2113 0, /* fl1_32 */
2114 0, /* l2_32 */
2115 ((1 << (PGC_SH_l2h_pae_shadow >> PGC_SH_type_shift))
2116 | (1 << (PGC_SH_l2_pae_shadow >> PGC_SH_type_shift))), /* l1_pae */
2117 0, /* fl1_pae */
2118 1 << (PGC_SH_l3_pae_shadow >> PGC_SH_type_shift), /* l2_pae */
2119 1 << (PGC_SH_l3_pae_shadow >> PGC_SH_type_shift), /* l2h_pae */
2120 0, /* l3_pae */
2121 1 << (PGC_SH_l2_64_shadow >> PGC_SH_type_shift), /* l1_64 */
2122 0, /* fl1_64 */
2123 1 << (PGC_SH_l3_64_shadow >> PGC_SH_type_shift), /* l2_64 */
2124 1 << (PGC_SH_l4_64_shadow >> PGC_SH_type_shift), /* l3_64 */
2125 0, /* l4_64 */
2126 0, /* p2m */
2127 0 /* unused */
2128 };
2130 ASSERT(shadow_lock_is_acquired(v->domain));
2132 pg = mfn_to_page(gmfn);
2134 /* Bail out now if the page is not shadowed */
2135 if ( (pg->count_info & PGC_page_table) == 0 )
2136 return;
2138 SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx\n",
2139 v->domain->domain_id, v->vcpu_id, mfn_x(gmfn));
2141 /* Search for this shadow in all appropriate shadows */
2142 perfc_incrc(shadow_unshadow);
2143 sh_flags = pg->shadow_flags;
2145 /* Lower-level shadows need to be excised from upper-level shadows.
2146 * This call to hash_foreach() looks dangerous but is in fact OK: each
2147 * call will remove at most one shadow, and terminate immediately when
2148 * it does remove it, so we never walk the hash after doing a deletion. */
2149 #define DO_UNSHADOW(_type) do { \
2150 t = (_type) >> PGC_SH_type_shift; \
2151 smfn = shadow_hash_lookup(v, mfn_x(gmfn), t); \
2152 if ( !sh_remove_shadow_via_pointer(v, smfn) && all ) \
2153 hash_foreach(v, masks[t], callbacks, smfn); \
2154 } while (0)
2156 /* Top-level shadows need to be unpinned */
2157 #define DO_UNPIN(_type) do { \
2158 t = (_type) >> PGC_SH_type_shift; \
2159 smfn = shadow_hash_lookup(v, mfn_x(gmfn), t); \
2160 if ( mfn_to_page(smfn)->count_info & PGC_SH_pinned ) \
2161 sh_unpin(v, smfn); \
2162 if ( (_type) == PGC_SH_l3_pae_shadow ) \
2163 SHADOW_INTERNAL_NAME(sh_unpin_all_l3_subshadows,3,3)(v, smfn); \
2164 } while (0)
2166 if ( sh_flags & SHF_L1_32 ) DO_UNSHADOW(PGC_SH_l1_32_shadow);
2167 if ( sh_flags & SHF_L2_32 ) DO_UNPIN(PGC_SH_l2_32_shadow);
2168 #if CONFIG_PAGING_LEVELS >= 3
2169 if ( sh_flags & SHF_L1_PAE ) DO_UNSHADOW(PGC_SH_l1_pae_shadow);
2170 if ( sh_flags & SHF_L2_PAE ) DO_UNSHADOW(PGC_SH_l2_pae_shadow);
2171 if ( sh_flags & SHF_L2H_PAE ) DO_UNSHADOW(PGC_SH_l2h_pae_shadow);
2172 if ( sh_flags & SHF_L3_PAE ) DO_UNPIN(PGC_SH_l3_pae_shadow);
2173 #if CONFIG_PAGING_LEVELS >= 4
2174 if ( sh_flags & SHF_L1_64 ) DO_UNSHADOW(PGC_SH_l1_64_shadow);
2175 if ( sh_flags & SHF_L2_64 ) DO_UNSHADOW(PGC_SH_l2_64_shadow);
2176 if ( sh_flags & SHF_L3_64 ) DO_UNSHADOW(PGC_SH_l3_64_shadow);
2177 if ( sh_flags & SHF_L4_64 ) DO_UNPIN(PGC_SH_l4_64_shadow);
2178 #endif
2179 #endif
2181 #undef DO_UNSHADOW
2182 #undef DO_UNPIN
2185 #if CONFIG_PAGING_LEVELS > 2
2186 /* We may have caused some PAE l3 entries to change: need to
2187 * fix up the copies of them in various places */
2188 if ( sh_flags & (SHF_L2_PAE|SHF_L2H_PAE) )
2189 sh_pae_recopy(v->domain);
2190 #endif
2192 /* If that didn't catch the shadows, something is wrong */
2193 if ( all && (pg->count_info & PGC_page_table) )
2195 SHADOW_ERROR("can't find all shadows of mfn %05lx (shadow_flags=%08x)\n",
2196 mfn_x(gmfn), pg->shadow_flags);
2197 domain_crash(v->domain);
2201 void
2202 shadow_remove_all_shadows_and_parents(struct vcpu *v, mfn_t gmfn)
2203 /* Even harsher: this is a HVM page that we thing is no longer a pagetable.
2204 * Unshadow it, and recursively unshadow pages that reference it. */
2206 shadow_remove_all_shadows(v, gmfn);
2207 /* XXX TODO:
2208 * Rework this hashtable walker to return a linked-list of all
2209 * the shadows it modified, then do breadth-first recursion
2210 * to find the way up to higher-level tables and unshadow them too.
2212 * The current code (just tearing down each page's shadows as we
2213 * detect that it is not a pagetable) is correct, but very slow.
2214 * It means extra emulated writes and slows down removal of mappings. */
2217 /**************************************************************************/
2219 void sh_update_paging_modes(struct vcpu *v)
2221 struct domain *d = v->domain;
2222 struct shadow_paging_mode *old_mode = v->arch.shadow.mode;
2223 mfn_t old_guest_table;
2225 ASSERT(shadow_lock_is_acquired(d));
2227 // Valid transitions handled by this function:
2228 // - For PV guests:
2229 // - after a shadow mode has been changed
2230 // - For HVM guests:
2231 // - after a shadow mode has been changed
2232 // - changes in CR0.PG, CR4.PAE, CR4.PSE, or CR4.PGE
2233 //
2235 // Avoid determining the current shadow mode for uninitialized CPUs, as
2236 // we can not yet determine whether it is an HVM or PV domain.
2237 //
2238 if ( !test_bit(_VCPUF_initialised, &v->vcpu_flags) )
2240 SHADOW_PRINTK("%s: postponing determination of shadow mode\n", __func__);
2241 return;
2244 // First, tear down any old shadow tables held by this vcpu.
2245 //
2246 shadow_detach_old_tables(v);
2248 if ( !hvm_guest(v) )
2250 ///
2251 /// PV guest
2252 ///
2253 #if CONFIG_PAGING_LEVELS == 4
2254 if ( pv_32bit_guest(v) )
2255 v->arch.shadow.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,4,3);
2256 else
2257 v->arch.shadow.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,4,4);
2258 #elif CONFIG_PAGING_LEVELS == 3
2259 v->arch.shadow.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,3,3);
2260 #elif CONFIG_PAGING_LEVELS == 2
2261 v->arch.shadow.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,2,2);
2262 #else
2263 #error unexpected paging mode
2264 #endif
2265 v->arch.shadow.translate_enabled = !!shadow_mode_translate(d);
2267 else
2269 ///
2270 /// HVM guest
2271 ///
2272 ASSERT(shadow_mode_translate(d));
2273 ASSERT(shadow_mode_external(d));
2275 v->arch.shadow.translate_enabled = !!hvm_paging_enabled(v);
2276 if ( !v->arch.shadow.translate_enabled )
2279 /* Set v->arch.guest_table to use the p2m map, and choose
2280 * the appropriate shadow mode */
2281 old_guest_table = pagetable_get_mfn(v->arch.guest_table);
2282 #if CONFIG_PAGING_LEVELS == 2
2283 v->arch.guest_table =
2284 pagetable_from_pfn(pagetable_get_pfn(d->arch.phys_table));
2285 v->arch.shadow.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,2,2);
2286 #elif CONFIG_PAGING_LEVELS == 3
2287 v->arch.guest_table =
2288 pagetable_from_pfn(pagetable_get_pfn(d->arch.phys_table));
2289 v->arch.shadow.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,3,3);
2290 #else /* CONFIG_PAGING_LEVELS == 4 */
2292 l4_pgentry_t *l4e;
2293 /* Use the start of the first l3 table as a PAE l3 */
2294 ASSERT(pagetable_get_pfn(d->arch.phys_table) != 0);
2295 l4e = sh_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
2296 ASSERT(l4e_get_flags(l4e[0]) & _PAGE_PRESENT);
2297 v->arch.guest_table =
2298 pagetable_from_pfn(l4e_get_pfn(l4e[0]));
2299 sh_unmap_domain_page(l4e);
2301 v->arch.shadow.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,3,3);
2302 #endif
2303 /* Fix up refcounts on guest_table */
2304 get_page(mfn_to_page(pagetable_get_mfn(v->arch.guest_table)), d);
2305 if ( mfn_x(old_guest_table) != 0 )
2306 put_page(mfn_to_page(old_guest_table));
2308 else
2310 #ifdef __x86_64__
2311 if ( hvm_long_mode_enabled(v) )
2313 // long mode guest...
2314 v->arch.shadow.mode =
2315 &SHADOW_INTERNAL_NAME(sh_paging_mode, 4, 4);
2317 else
2318 #endif
2319 if ( hvm_pae_enabled(v) )
2321 #if CONFIG_PAGING_LEVELS >= 3
2322 // 32-bit PAE mode guest...
2323 v->arch.shadow.mode =
2324 &SHADOW_INTERNAL_NAME(sh_paging_mode, 3, 3);
2325 #else
2326 SHADOW_ERROR("PAE not supported in 32-bit Xen\n");
2327 domain_crash(d);
2328 return;
2329 #endif
2331 else
2333 // 32-bit 2 level guest...
2334 #if CONFIG_PAGING_LEVELS >= 3
2335 v->arch.shadow.mode =
2336 &SHADOW_INTERNAL_NAME(sh_paging_mode, 3, 2);
2337 #else
2338 v->arch.shadow.mode =
2339 &SHADOW_INTERNAL_NAME(sh_paging_mode, 2, 2);
2340 #endif
2344 if ( pagetable_get_pfn(v->arch.monitor_table) == 0 )
2346 mfn_t mmfn = shadow_make_monitor_table(v);
2347 v->arch.monitor_table = pagetable_from_mfn(mmfn);
2348 v->arch.monitor_vtable = sh_map_domain_page(mmfn);
2351 if ( v->arch.shadow.mode != old_mode )
2353 SHADOW_PRINTK("new paging mode: d=%u v=%u pe=%d g=%u s=%u "
2354 "(was g=%u s=%u)\n",
2355 d->domain_id, v->vcpu_id,
2356 hvm_guest(v) ? !!hvm_paging_enabled(v) : 1,
2357 v->arch.shadow.mode->guest_levels,
2358 v->arch.shadow.mode->shadow_levels,
2359 old_mode ? old_mode->guest_levels : 0,
2360 old_mode ? old_mode->shadow_levels : 0);
2361 if ( old_mode &&
2362 (v->arch.shadow.mode->shadow_levels !=
2363 old_mode->shadow_levels) )
2365 /* Need to make a new monitor table for the new mode */
2366 mfn_t new_mfn, old_mfn;
2368 if ( v != current )
2370 SHADOW_ERROR("Some third party (d=%u v=%u) is changing "
2371 "this HVM vcpu's (d=%u v=%u) paging mode!\n",
2372 current->domain->domain_id, current->vcpu_id,
2373 v->domain->domain_id, v->vcpu_id);
2374 domain_crash(v->domain);
2375 return;
2378 sh_unmap_domain_page(v->arch.monitor_vtable);
2379 old_mfn = pagetable_get_mfn(v->arch.monitor_table);
2380 v->arch.monitor_table = pagetable_null();
2381 new_mfn = v->arch.shadow.mode->make_monitor_table(v);
2382 v->arch.monitor_table = pagetable_from_mfn(new_mfn);
2383 v->arch.monitor_vtable = sh_map_domain_page(new_mfn);
2384 SHADOW_PRINTK("new monitor table %"SH_PRI_mfn "\n",
2385 mfn_x(new_mfn));
2387 /* Don't be running on the old monitor table when we
2388 * pull it down! Switch CR3, and warn the HVM code that
2389 * its host cr3 has changed. */
2390 make_cr3(v, mfn_x(new_mfn));
2391 write_ptbase(v);
2392 hvm_update_host_cr3(v);
2393 old_mode->destroy_monitor_table(v, old_mfn);
2397 // XXX -- Need to deal with changes in CR4.PSE and CR4.PGE.
2398 // These are HARD: think about the case where two CPU's have
2399 // different values for CR4.PSE and CR4.PGE at the same time.
2400 // This *does* happen, at least for CR4.PGE...
2403 v->arch.shadow.mode->update_cr3(v);
2406 /**************************************************************************/
2407 /* Turning on and off shadow features */
2409 static void sh_new_mode(struct domain *d, u32 new_mode)
2410 /* Inform all the vcpus that the shadow mode has been changed */
2412 struct vcpu *v;
2414 ASSERT(shadow_lock_is_acquired(d));
2415 ASSERT(d != current->domain);
2416 d->arch.shadow.mode = new_mode;
2417 if ( new_mode & SHM2_translate )
2418 shadow_audit_p2m(d);
2419 for_each_vcpu(d, v)
2420 sh_update_paging_modes(v);
2423 static int shadow_enable(struct domain *d, u32 mode)
2424 /* Turn on "permanent" shadow features: external, translate, refcount.
2425 * Can only be called once on a domain, and these features cannot be
2426 * disabled.
2427 * Returns 0 for success, -errno for failure. */
2429 unsigned int old_pages;
2430 int rv = 0;
2432 mode |= SHM2_enable;
2434 domain_pause(d);
2435 shadow_lock(d);
2437 /* Sanity check the arguments */
2438 if ( (d == current->domain) ||
2439 shadow_mode_enabled(d) ||
2440 ((mode & SHM2_translate) && !(mode & SHM2_refcounts)) ||
2441 ((mode & SHM2_external) && !(mode & SHM2_translate)) )
2443 rv = -EINVAL;
2444 goto out;
2447 // XXX -- eventually would like to require that all memory be allocated
2448 // *after* shadow_enabled() is called... So here, we would test to make
2449 // sure that d->page_list is empty.
2450 #if 0
2451 spin_lock(&d->page_alloc_lock);
2452 if ( !list_empty(&d->page_list) )
2454 spin_unlock(&d->page_alloc_lock);
2455 rv = -EINVAL;
2456 goto out;
2458 spin_unlock(&d->page_alloc_lock);
2459 #endif
2461 /* Init the shadow memory allocation if the user hasn't done so */
2462 old_pages = d->arch.shadow.total_pages;
2463 if ( old_pages == 0 )
2464 if ( set_sh_allocation(d, 256, NULL) != 0 ) /* Use at least 1MB */
2466 set_sh_allocation(d, 0, NULL);
2467 rv = -ENOMEM;
2468 goto out;
2471 /* Init the hash table */
2472 if ( shadow_hash_alloc(d) != 0 )
2474 set_sh_allocation(d, old_pages, NULL);
2475 rv = -ENOMEM;
2476 goto out;
2479 /* Init the P2M table */
2480 if ( mode & SHM2_translate )
2481 if ( !shadow_alloc_p2m_table(d) )
2483 shadow_hash_teardown(d);
2484 set_sh_allocation(d, old_pages, NULL);
2485 shadow_p2m_teardown(d);
2486 rv = -ENOMEM;
2487 goto out;
2490 /* Update the bits */
2491 sh_new_mode(d, mode);
2492 shadow_audit_p2m(d);
2493 out:
2494 shadow_unlock(d);
2495 domain_unpause(d);
2496 return rv;
2499 void shadow_teardown(struct domain *d)
2500 /* Destroy the shadow pagetables of this domain and free its shadow memory.
2501 * Should only be called for dying domains. */
2503 struct vcpu *v;
2504 mfn_t mfn;
2506 ASSERT(test_bit(_DOMF_dying, &d->domain_flags));
2507 ASSERT(d != current->domain);
2509 if ( !shadow_lock_is_acquired(d) )
2510 shadow_lock(d); /* Keep various asserts happy */
2512 if ( shadow_mode_enabled(d) )
2514 /* Release the shadow and monitor tables held by each vcpu */
2515 for_each_vcpu(d, v)
2517 shadow_detach_old_tables(v);
2518 if ( shadow_mode_external(d) )
2520 mfn = pagetable_get_mfn(v->arch.monitor_table);
2521 if ( valid_mfn(mfn) && (mfn_x(mfn) != 0) )
2522 shadow_destroy_monitor_table(v, mfn);
2523 v->arch.monitor_table = pagetable_null();
2528 if ( d->arch.shadow.total_pages != 0 )
2530 SHADOW_PRINTK("teardown of domain %u starts."
2531 " Shadow pages total = %u, free = %u, p2m=%u\n",
2532 d->domain_id,
2533 d->arch.shadow.total_pages,
2534 d->arch.shadow.free_pages,
2535 d->arch.shadow.p2m_pages);
2536 /* Destroy all the shadows and release memory to domheap */
2537 set_sh_allocation(d, 0, NULL);
2538 /* Release the hash table back to xenheap */
2539 if (d->arch.shadow.hash_table)
2540 shadow_hash_teardown(d);
2541 /* Release the log-dirty bitmap of dirtied pages */
2542 sh_free_log_dirty_bitmap(d);
2543 /* Should not have any more memory held */
2544 SHADOW_PRINTK("teardown done."
2545 " Shadow pages total = %u, free = %u, p2m=%u\n",
2546 d->arch.shadow.total_pages,
2547 d->arch.shadow.free_pages,
2548 d->arch.shadow.p2m_pages);
2549 ASSERT(d->arch.shadow.total_pages == 0);
2552 /* We leave the "permanent" shadow modes enabled, but clear the
2553 * log-dirty mode bit. We don't want any more mark_dirty()
2554 * calls now that we've torn down the bitmap */
2555 d->arch.shadow.mode &= ~SHM2_log_dirty;
2557 shadow_unlock(d);
2560 void shadow_final_teardown(struct domain *d)
2561 /* Called by arch_domain_destroy(), when it's safe to pull down the p2m map. */
2564 SHADOW_PRINTK("dom %u final teardown starts."
2565 " Shadow pages total = %u, free = %u, p2m=%u\n",
2566 d->domain_id,
2567 d->arch.shadow.total_pages,
2568 d->arch.shadow.free_pages,
2569 d->arch.shadow.p2m_pages);
2571 /* Double-check that the domain didn't have any shadow memory.
2572 * It is possible for a domain that never got domain_kill()ed
2573 * to get here with its shadow allocation intact. */
2574 if ( d->arch.shadow.total_pages != 0 )
2575 shadow_teardown(d);
2577 /* It is now safe to pull down the p2m map. */
2578 if ( d->arch.shadow.p2m_pages != 0 )
2579 shadow_p2m_teardown(d);
2581 SHADOW_PRINTK("dom %u final teardown done."
2582 " Shadow pages total = %u, free = %u, p2m=%u\n",
2583 d->domain_id,
2584 d->arch.shadow.total_pages,
2585 d->arch.shadow.free_pages,
2586 d->arch.shadow.p2m_pages);
2589 static int shadow_one_bit_enable(struct domain *d, u32 mode)
2590 /* Turn on a single shadow mode feature */
2592 ASSERT(shadow_lock_is_acquired(d));
2594 /* Sanity check the call */
2595 if ( d == current->domain || (d->arch.shadow.mode & mode) )
2597 return -EINVAL;
2600 if ( d->arch.shadow.mode == 0 )
2602 /* Init the shadow memory allocation and the hash table */
2603 if ( set_sh_allocation(d, 1, NULL) != 0
2604 || shadow_hash_alloc(d) != 0 )
2606 set_sh_allocation(d, 0, NULL);
2607 return -ENOMEM;
2611 /* Update the bits */
2612 sh_new_mode(d, d->arch.shadow.mode | mode);
2614 return 0;
2617 static int shadow_one_bit_disable(struct domain *d, u32 mode)
2618 /* Turn off a single shadow mode feature */
2620 struct vcpu *v;
2621 ASSERT(shadow_lock_is_acquired(d));
2623 /* Sanity check the call */
2624 if ( d == current->domain || !(d->arch.shadow.mode & mode) )
2626 return -EINVAL;
2629 /* Update the bits */
2630 sh_new_mode(d, d->arch.shadow.mode & ~mode);
2631 if ( d->arch.shadow.mode == 0 )
2633 /* Get this domain off shadows */
2634 SHADOW_PRINTK("un-shadowing of domain %u starts."
2635 " Shadow pages total = %u, free = %u, p2m=%u\n",
2636 d->domain_id,
2637 d->arch.shadow.total_pages,
2638 d->arch.shadow.free_pages,
2639 d->arch.shadow.p2m_pages);
2640 for_each_vcpu(d, v)
2642 shadow_detach_old_tables(v);
2643 #if CONFIG_PAGING_LEVELS == 4
2644 if ( !(v->arch.flags & TF_kernel_mode) )
2645 make_cr3(v, pagetable_get_pfn(v->arch.guest_table_user));
2646 else
2647 #endif
2648 make_cr3(v, pagetable_get_pfn(v->arch.guest_table));
2652 /* Pull down the memory allocation */
2653 if ( set_sh_allocation(d, 0, NULL) != 0 )
2655 // XXX - How can this occur?
2656 // Seems like a bug to return an error now that we've
2657 // disabled the relevant shadow mode.
2658 //
2659 return -ENOMEM;
2661 shadow_hash_teardown(d);
2662 SHADOW_PRINTK("un-shadowing of domain %u done."
2663 " Shadow pages total = %u, free = %u, p2m=%u\n",
2664 d->domain_id,
2665 d->arch.shadow.total_pages,
2666 d->arch.shadow.free_pages,
2667 d->arch.shadow.p2m_pages);
2670 return 0;
2673 /* Enable/disable ops for the "test" and "log-dirty" modes */
2674 int shadow_test_enable(struct domain *d)
2676 int ret;
2678 domain_pause(d);
2679 shadow_lock(d);
2681 if ( shadow_mode_enabled(d) )
2683 SHADOW_ERROR("Don't support enabling test mode"
2684 " on already shadowed doms\n");
2685 ret = -EINVAL;
2686 goto out;
2689 ret = shadow_one_bit_enable(d, SHM2_enable);
2690 out:
2691 shadow_unlock(d);
2692 domain_unpause(d);
2694 return ret;
2697 int shadow_test_disable(struct domain *d)
2699 int ret;
2701 domain_pause(d);
2702 shadow_lock(d);
2703 ret = shadow_one_bit_disable(d, SHM2_enable);
2704 shadow_unlock(d);
2705 domain_unpause(d);
2707 return ret;
2710 static int
2711 sh_alloc_log_dirty_bitmap(struct domain *d)
2713 ASSERT(d->arch.shadow.dirty_bitmap == NULL);
2714 d->arch.shadow.dirty_bitmap_size =
2715 (d->shared_info->arch.max_pfn + (BITS_PER_LONG - 1)) &
2716 ~(BITS_PER_LONG - 1);
2717 d->arch.shadow.dirty_bitmap =
2718 xmalloc_array(unsigned long,
2719 d->arch.shadow.dirty_bitmap_size / BITS_PER_LONG);
2720 if ( d->arch.shadow.dirty_bitmap == NULL )
2722 d->arch.shadow.dirty_bitmap_size = 0;
2723 return -ENOMEM;
2725 memset(d->arch.shadow.dirty_bitmap, 0, d->arch.shadow.dirty_bitmap_size/8);
2727 return 0;
2730 static void
2731 sh_free_log_dirty_bitmap(struct domain *d)
2733 d->arch.shadow.dirty_bitmap_size = 0;
2734 if ( d->arch.shadow.dirty_bitmap )
2736 xfree(d->arch.shadow.dirty_bitmap);
2737 d->arch.shadow.dirty_bitmap = NULL;
2741 static int shadow_log_dirty_enable(struct domain *d)
2743 int ret;
2745 domain_pause(d);
2746 shadow_lock(d);
2748 if ( shadow_mode_log_dirty(d) )
2750 ret = -EINVAL;
2751 goto out;
2754 if ( shadow_mode_enabled(d) )
2756 SHADOW_ERROR("Don't (yet) support enabling log-dirty"
2757 " on already shadowed doms\n");
2758 ret = -EINVAL;
2759 goto out;
2762 ret = sh_alloc_log_dirty_bitmap(d);
2763 if ( ret != 0 )
2765 sh_free_log_dirty_bitmap(d);
2766 goto out;
2769 ret = shadow_one_bit_enable(d, SHM2_log_dirty);
2770 if ( ret != 0 )
2771 sh_free_log_dirty_bitmap(d);
2773 out:
2774 shadow_unlock(d);
2775 domain_unpause(d);
2776 return ret;
2779 static int shadow_log_dirty_disable(struct domain *d)
2781 int ret;
2783 domain_pause(d);
2784 shadow_lock(d);
2785 ret = shadow_one_bit_disable(d, SHM2_log_dirty);
2786 if ( !shadow_mode_log_dirty(d) )
2787 sh_free_log_dirty_bitmap(d);
2788 shadow_unlock(d);
2789 domain_unpause(d);
2791 return ret;
2794 /**************************************************************************/
2795 /* P2M map manipulations */
2797 static void
2798 sh_p2m_remove_page(struct domain *d, unsigned long gfn, unsigned long mfn)
2800 struct vcpu *v;
2802 if ( !shadow_mode_translate(d) )
2803 return;
2805 v = current;
2806 if ( v->domain != d )
2807 v = d->vcpu[0];
2810 SHADOW_DEBUG(P2M, "removing gfn=%#lx mfn=%#lx\n", gfn, mfn);
2812 ASSERT(mfn_x(sh_gfn_to_mfn(d, gfn)) == mfn);
2813 //ASSERT(sh_mfn_to_gfn(d, mfn) == gfn);
2815 shadow_remove_all_shadows_and_parents(v, _mfn(mfn));
2816 if ( shadow_remove_all_mappings(v, _mfn(mfn)) )
2817 flush_tlb_mask(d->domain_dirty_cpumask);
2818 shadow_set_p2m_entry(d, gfn, _mfn(INVALID_MFN));
2819 set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY);
2822 void
2823 shadow_guest_physmap_remove_page(struct domain *d, unsigned long gfn,
2824 unsigned long mfn)
2826 shadow_lock(d);
2827 shadow_audit_p2m(d);
2828 sh_p2m_remove_page(d, gfn, mfn);
2829 shadow_audit_p2m(d);
2830 shadow_unlock(d);
2833 void
2834 shadow_guest_physmap_add_page(struct domain *d, unsigned long gfn,
2835 unsigned long mfn)
2837 struct vcpu *v;
2838 unsigned long ogfn;
2839 mfn_t omfn;
2841 if ( !shadow_mode_translate(d) )
2842 return;
2844 v = current;
2845 if ( v->domain != d )
2846 v = d->vcpu[0];
2848 shadow_lock(d);
2849 shadow_audit_p2m(d);
2851 SHADOW_DEBUG(P2M, "adding gfn=%#lx mfn=%#lx\n", gfn, mfn);
2853 omfn = sh_gfn_to_mfn(d, gfn);
2854 if ( valid_mfn(omfn) )
2856 /* Get rid of the old mapping, especially any shadows */
2857 shadow_remove_all_shadows_and_parents(v, omfn);
2858 if ( shadow_remove_all_mappings(v, omfn) )
2859 flush_tlb_mask(d->domain_dirty_cpumask);
2860 set_gpfn_from_mfn(mfn_x(omfn), INVALID_M2P_ENTRY);
2863 ogfn = sh_mfn_to_gfn(d, _mfn(mfn));
2864 if (
2865 #ifdef __x86_64__
2866 (ogfn != 0x5555555555555555L)
2867 #else
2868 (ogfn != 0x55555555L)
2869 #endif
2870 && (ogfn != INVALID_M2P_ENTRY)
2871 && (ogfn != gfn) )
2873 /* This machine frame is already mapped at another physical address */
2874 SHADOW_DEBUG(P2M, "aliased! mfn=%#lx, old gfn=%#lx, new gfn=%#lx\n",
2875 mfn, ogfn, gfn);
2876 if ( valid_mfn(omfn = sh_gfn_to_mfn(d, ogfn)) )
2878 SHADOW_DEBUG(P2M, "old gfn=%#lx -> mfn %#lx\n",
2879 ogfn , mfn_x(omfn));
2880 if ( mfn_x(omfn) == mfn )
2881 sh_p2m_remove_page(d, ogfn, mfn);
2885 shadow_set_p2m_entry(d, gfn, _mfn(mfn));
2886 set_gpfn_from_mfn(mfn, gfn);
2887 shadow_audit_p2m(d);
2888 shadow_unlock(d);
2891 /**************************************************************************/
2892 /* Log-dirty mode support */
2894 /* Convert a shadow to log-dirty mode. */
2895 void shadow_convert_to_log_dirty(struct vcpu *v, mfn_t smfn)
2897 BUG();
2901 /* Read a domain's log-dirty bitmap and stats.
2902 * If the operation is a CLEAN, clear the bitmap and stats as well. */
2903 static int shadow_log_dirty_op(
2904 struct domain *d, struct xen_domctl_shadow_op *sc)
2906 int i, rv = 0, clean = 0;
2908 domain_pause(d);
2909 shadow_lock(d);
2911 clean = (sc->op == XEN_DOMCTL_SHADOW_OP_CLEAN);
2913 SHADOW_DEBUG(LOGDIRTY, "log-dirty %s: dom %u faults=%u dirty=%u\n",
2914 (clean) ? "clean" : "peek",
2915 d->domain_id,
2916 d->arch.shadow.fault_count,
2917 d->arch.shadow.dirty_count);
2919 sc->stats.fault_count = d->arch.shadow.fault_count;
2920 sc->stats.dirty_count = d->arch.shadow.dirty_count;
2922 if ( clean )
2924 struct list_head *l, *t;
2925 struct page_info *pg;
2927 /* Need to revoke write access to the domain's pages again.
2928 * In future, we'll have a less heavy-handed approach to this,
2929 * but for now, we just unshadow everything except Xen. */
2930 list_for_each_safe(l, t, &d->arch.shadow.toplevel_shadows)
2932 pg = list_entry(l, struct page_info, list);
2933 shadow_unhook_mappings(d->vcpu[0], page_to_mfn(pg));
2936 d->arch.shadow.fault_count = 0;
2937 d->arch.shadow.dirty_count = 0;
2940 if ( guest_handle_is_null(sc->dirty_bitmap) ||
2941 (d->arch.shadow.dirty_bitmap == NULL) )
2943 rv = -EINVAL;
2944 goto out;
2947 if ( sc->pages > d->arch.shadow.dirty_bitmap_size )
2948 sc->pages = d->arch.shadow.dirty_bitmap_size;
2950 #define CHUNK (8*1024) /* Transfer and clear in 1kB chunks for L1 cache. */
2951 for ( i = 0; i < sc->pages; i += CHUNK )
2953 int bytes = ((((sc->pages - i) > CHUNK)
2954 ? CHUNK
2955 : (sc->pages - i)) + 7) / 8;
2957 if ( copy_to_guest_offset(
2958 sc->dirty_bitmap,
2959 i/(8*sizeof(unsigned long)),
2960 d->arch.shadow.dirty_bitmap + (i/(8*sizeof(unsigned long))),
2961 (bytes + sizeof(unsigned long) - 1) / sizeof(unsigned long)) )
2963 rv = -EINVAL;
2964 goto out;
2967 if ( clean )
2968 memset(d->arch.shadow.dirty_bitmap + (i/(8*sizeof(unsigned long))),
2969 0, bytes);
2971 #undef CHUNK
2973 out:
2974 shadow_unlock(d);
2975 domain_unpause(d);
2976 return 0;
2980 /* Mark a page as dirty */
2981 void sh_do_mark_dirty(struct domain *d, mfn_t gmfn)
2983 unsigned long pfn;
2985 ASSERT(shadow_lock_is_acquired(d));
2986 ASSERT(shadow_mode_log_dirty(d));
2988 if ( !valid_mfn(gmfn) )
2989 return;
2991 ASSERT(d->arch.shadow.dirty_bitmap != NULL);
2993 /* We /really/ mean PFN here, even for non-translated guests. */
2994 pfn = get_gpfn_from_mfn(mfn_x(gmfn));
2996 /*
2997 * Values with the MSB set denote MFNs that aren't really part of the
2998 * domain's pseudo-physical memory map (e.g., the shared info frame).
2999 * Nothing to do here...
3000 */
3001 if ( unlikely(!VALID_M2P(pfn)) )
3002 return;
3004 /* N.B. Can use non-atomic TAS because protected by shadow_lock. */
3005 if ( likely(pfn < d->arch.shadow.dirty_bitmap_size) )
3007 if ( !__test_and_set_bit(pfn, d->arch.shadow.dirty_bitmap) )
3009 SHADOW_DEBUG(LOGDIRTY,
3010 "marked mfn %" SH_PRI_mfn " (pfn=%lx), dom %d\n",
3011 mfn_x(gmfn), pfn, d->domain_id);
3012 d->arch.shadow.dirty_count++;
3015 else
3017 SHADOW_PRINTK("mark_dirty OOR! "
3018 "mfn=%" SH_PRI_mfn " pfn=%lx max=%x (dom %d)\n"
3019 "owner=%d c=%08x t=%" PRtype_info "\n",
3020 mfn_x(gmfn),
3021 pfn,
3022 d->arch.shadow.dirty_bitmap_size,
3023 d->domain_id,
3024 (page_get_owner(mfn_to_page(gmfn))
3025 ? page_get_owner(mfn_to_page(gmfn))->domain_id
3026 : -1),
3027 mfn_to_page(gmfn)->count_info,
3028 mfn_to_page(gmfn)->u.inuse.type_info);
3033 /**************************************************************************/
3034 /* Shadow-control XEN_DOMCTL dispatcher */
3036 int shadow_domctl(struct domain *d,
3037 xen_domctl_shadow_op_t *sc,
3038 XEN_GUEST_HANDLE(xen_domctl_t) u_domctl)
3040 int rc, preempted = 0;
3042 if ( unlikely(d == current->domain) )
3044 DPRINTK("Don't try to do a shadow op on yourself!\n");
3045 return -EINVAL;
3048 switch ( sc->op )
3050 case XEN_DOMCTL_SHADOW_OP_OFF:
3051 if ( shadow_mode_log_dirty(d) )
3052 if ( (rc = shadow_log_dirty_disable(d)) != 0 )
3053 return rc;
3054 if ( d->arch.shadow.mode & SHM2_enable )
3055 if ( (rc = shadow_test_disable(d)) != 0 )
3056 return rc;
3057 return 0;
3059 case XEN_DOMCTL_SHADOW_OP_ENABLE_TEST:
3060 return shadow_test_enable(d);
3062 case XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY:
3063 return shadow_log_dirty_enable(d);
3065 case XEN_DOMCTL_SHADOW_OP_ENABLE_TRANSLATE:
3066 return shadow_enable(d, SHM2_refcounts|SHM2_translate);
3068 case XEN_DOMCTL_SHADOW_OP_CLEAN:
3069 case XEN_DOMCTL_SHADOW_OP_PEEK:
3070 return shadow_log_dirty_op(d, sc);
3072 case XEN_DOMCTL_SHADOW_OP_ENABLE:
3073 if ( sc->mode & XEN_DOMCTL_SHADOW_ENABLE_LOG_DIRTY )
3074 return shadow_log_dirty_enable(d);
3075 return shadow_enable(d, sc->mode << SHM2_shift);
3077 case XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION:
3078 sc->mb = shadow_get_allocation(d);
3079 return 0;
3081 case XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION:
3082 rc = shadow_set_allocation(d, sc->mb, &preempted);
3083 if ( preempted )
3084 /* Not finished. Set up to re-run the call. */
3085 rc = hypercall_create_continuation(
3086 __HYPERVISOR_domctl, "h", u_domctl);
3087 else
3088 /* Finished. Return the new allocation */
3089 sc->mb = shadow_get_allocation(d);
3090 return rc;
3092 default:
3093 SHADOW_ERROR("Bad shadow op %u\n", sc->op);
3094 return -EINVAL;
3099 /**************************************************************************/
3100 /* Auditing shadow tables */
3102 #if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_FULL
3104 void shadow_audit_tables(struct vcpu *v)
3106 /* Dispatch table for getting per-type functions */
3107 static hash_callback_t callbacks[16] = {
3108 NULL, /* none */
3109 #if CONFIG_PAGING_LEVELS == 2
3110 SHADOW_INTERNAL_NAME(sh_audit_l1_table,2,2), /* l1_32 */
3111 SHADOW_INTERNAL_NAME(sh_audit_fl1_table,2,2), /* fl1_32 */
3112 SHADOW_INTERNAL_NAME(sh_audit_l2_table,2,2), /* l2_32 */
3113 #else
3114 SHADOW_INTERNAL_NAME(sh_audit_l1_table,3,2), /* l1_32 */
3115 SHADOW_INTERNAL_NAME(sh_audit_fl1_table,3,2), /* fl1_32 */
3116 SHADOW_INTERNAL_NAME(sh_audit_l2_table,3,2), /* l2_32 */
3117 SHADOW_INTERNAL_NAME(sh_audit_l1_table,3,3), /* l1_pae */
3118 SHADOW_INTERNAL_NAME(sh_audit_fl1_table,3,3), /* fl1_pae */
3119 SHADOW_INTERNAL_NAME(sh_audit_l2_table,3,3), /* l2_pae */
3120 SHADOW_INTERNAL_NAME(sh_audit_l2_table,3,3), /* l2h_pae */
3121 SHADOW_INTERNAL_NAME(sh_audit_l3_table,3,3), /* l3_pae */
3122 #if CONFIG_PAGING_LEVELS >= 4
3123 SHADOW_INTERNAL_NAME(sh_audit_l1_table,4,4), /* l1_64 */
3124 SHADOW_INTERNAL_NAME(sh_audit_fl1_table,4,4), /* fl1_64 */
3125 SHADOW_INTERNAL_NAME(sh_audit_l2_table,4,4), /* l2_64 */
3126 SHADOW_INTERNAL_NAME(sh_audit_l3_table,4,4), /* l3_64 */
3127 SHADOW_INTERNAL_NAME(sh_audit_l4_table,4,4), /* l4_64 */
3128 #endif /* CONFIG_PAGING_LEVELS >= 4 */
3129 #endif /* CONFIG_PAGING_LEVELS > 2 */
3130 NULL /* All the rest */
3131 };
3132 unsigned int mask;
3134 if ( !(SHADOW_AUDIT_ENABLE) )
3135 return;
3137 if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_FULL )
3138 mask = ~1; /* Audit every table in the system */
3139 else
3141 /* Audit only the current mode's tables */
3142 switch ( v->arch.shadow.mode->guest_levels )
3144 case 2: mask = (SHF_L1_32|SHF_FL1_32|SHF_L2_32); break;
3145 case 3: mask = (SHF_L1_PAE|SHF_FL1_PAE|SHF_L2_PAE
3146 |SHF_L2H_PAE|SHF_L3_PAE); break;
3147 case 4: mask = (SHF_L1_64|SHF_FL1_64|SHF_L2_64
3148 |SHF_L3_64|SHF_L4_64); break;
3149 default: BUG();
3153 hash_foreach(v, ~1, callbacks, _mfn(INVALID_MFN));
3156 #endif /* Shadow audit */
3159 /**************************************************************************/
3160 /* Auditing p2m tables */
3162 #if SHADOW_AUDIT & SHADOW_AUDIT_P2M
3164 void shadow_audit_p2m(struct domain *d)
3166 struct list_head *entry;
3167 struct page_info *page;
3168 struct domain *od;
3169 unsigned long mfn, gfn, m2pfn, lp2mfn = 0;
3170 mfn_t p2mfn;
3171 unsigned long orphans_d = 0, orphans_i = 0, mpbad = 0, pmbad = 0;
3172 int test_linear;
3174 if ( !(SHADOW_AUDIT_ENABLE) || !shadow_mode_translate(d) )
3175 return;
3177 //SHADOW_PRINTK("p2m audit starts\n");
3179 test_linear = ( (d == current->domain) && current->arch.monitor_vtable );
3180 if ( test_linear )
3181 local_flush_tlb();
3183 /* Audit part one: walk the domain's page allocation list, checking
3184 * the m2p entries. */
3185 for ( entry = d->page_list.next;
3186 entry != &d->page_list;
3187 entry = entry->next )
3189 page = list_entry(entry, struct page_info, list);
3190 mfn = mfn_x(page_to_mfn(page));
3192 // SHADOW_PRINTK("auditing guest page, mfn=%#lx\n", mfn);
3194 od = page_get_owner(page);
3196 if ( od != d )
3198 SHADOW_PRINTK("wrong owner %#lx -> %p(%u) != %p(%u)\n",
3199 mfn, od, (od?od->domain_id:-1), d, d->domain_id);
3200 continue;
3203 gfn = get_gpfn_from_mfn(mfn);
3204 if ( gfn == INVALID_M2P_ENTRY )
3206 orphans_i++;
3207 //SHADOW_PRINTK("orphaned guest page: mfn=%#lx has invalid gfn\n",
3208 // mfn);
3209 continue;
3212 if ( gfn == 0x55555555 )
3214 orphans_d++;
3215 //SHADOW_PRINTK("orphaned guest page: mfn=%#lx has debug gfn\n",
3216 // mfn);
3217 continue;
3220 p2mfn = sh_gfn_to_mfn_foreign(d, gfn);
3221 if ( mfn_x(p2mfn) != mfn )
3223 mpbad++;
3224 SHADOW_PRINTK("map mismatch mfn %#lx -> gfn %#lx -> mfn %#lx"
3225 " (-> gfn %#lx)\n",
3226 mfn, gfn, mfn_x(p2mfn),
3227 (mfn_valid(p2mfn)
3228 ? get_gpfn_from_mfn(mfn_x(p2mfn))
3229 : -1u));
3230 /* This m2p entry is stale: the domain has another frame in
3231 * this physical slot. No great disaster, but for neatness,
3232 * blow away the m2p entry. */
3233 set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY);
3236 if ( test_linear )
3238 lp2mfn = get_mfn_from_gpfn(gfn);
3239 if ( lp2mfn != mfn_x(p2mfn) )
3241 SHADOW_PRINTK("linear mismatch gfn %#lx -> mfn %#lx "
3242 "(!= mfn %#lx)\n", gfn, lp2mfn, p2mfn);
3246 // SHADOW_PRINTK("OK: mfn=%#lx, gfn=%#lx, p2mfn=%#lx, lp2mfn=%#lx\n",
3247 // mfn, gfn, p2mfn, lp2mfn);
3250 /* Audit part two: walk the domain's p2m table, checking the entries. */
3251 if ( pagetable_get_pfn(d->arch.phys_table) != 0 )
3253 l2_pgentry_t *l2e;
3254 l1_pgentry_t *l1e;
3255 int i1, i2;
3257 #if CONFIG_PAGING_LEVELS == 4
3258 l4_pgentry_t *l4e;
3259 l3_pgentry_t *l3e;
3260 int i3, i4;
3261 l4e = sh_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
3262 #elif CONFIG_PAGING_LEVELS == 3
3263 l3_pgentry_t *l3e;
3264 int i3;
3265 l3e = sh_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
3266 #else /* CONFIG_PAGING_LEVELS == 2 */
3267 l2e = sh_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
3268 #endif
3270 gfn = 0;
3271 #if CONFIG_PAGING_LEVELS >= 3
3272 #if CONFIG_PAGING_LEVELS >= 4
3273 for ( i4 = 0; i4 < L4_PAGETABLE_ENTRIES; i4++ )
3275 if ( !(l4e_get_flags(l4e[i4]) & _PAGE_PRESENT) )
3277 gfn += 1 << (L4_PAGETABLE_SHIFT - PAGE_SHIFT);
3278 continue;
3280 l3e = sh_map_domain_page(_mfn(l4e_get_pfn(l4e[i4])));
3281 #endif /* now at levels 3 or 4... */
3282 for ( i3 = 0;
3283 i3 < ((CONFIG_PAGING_LEVELS==4) ? L3_PAGETABLE_ENTRIES : 8);
3284 i3++ )
3286 if ( !(l3e_get_flags(l3e[i3]) & _PAGE_PRESENT) )
3288 gfn += 1 << (L3_PAGETABLE_SHIFT - PAGE_SHIFT);
3289 continue;
3291 l2e = sh_map_domain_page(_mfn(l3e_get_pfn(l3e[i3])));
3292 #endif /* all levels... */
3293 for ( i2 = 0; i2 < L2_PAGETABLE_ENTRIES; i2++ )
3295 if ( !(l2e_get_flags(l2e[i2]) & _PAGE_PRESENT) )
3297 gfn += 1 << (L2_PAGETABLE_SHIFT - PAGE_SHIFT);
3298 continue;
3300 l1e = sh_map_domain_page(_mfn(l2e_get_pfn(l2e[i2])));
3302 for ( i1 = 0; i1 < L1_PAGETABLE_ENTRIES; i1++, gfn++ )
3304 if ( !(l1e_get_flags(l1e[i1]) & _PAGE_PRESENT) )
3305 continue;
3306 mfn = l1e_get_pfn(l1e[i1]);
3307 ASSERT(valid_mfn(_mfn(mfn)));
3308 m2pfn = get_gpfn_from_mfn(mfn);
3309 if ( m2pfn != gfn )
3311 pmbad++;
3312 SHADOW_PRINTK("mismatch: gfn %#lx -> mfn %#lx"
3313 " -> gfn %#lx\n", gfn, mfn, m2pfn);
3314 BUG();
3317 sh_unmap_domain_page(l1e);
3319 #if CONFIG_PAGING_LEVELS >= 3
3320 sh_unmap_domain_page(l2e);
3322 #if CONFIG_PAGING_LEVELS >= 4
3323 sh_unmap_domain_page(l3e);
3325 #endif
3326 #endif
3328 #if CONFIG_PAGING_LEVELS == 4
3329 sh_unmap_domain_page(l4e);
3330 #elif CONFIG_PAGING_LEVELS == 3
3331 sh_unmap_domain_page(l3e);
3332 #else /* CONFIG_PAGING_LEVELS == 2 */
3333 sh_unmap_domain_page(l2e);
3334 #endif
3338 //SHADOW_PRINTK("p2m audit complete\n");
3339 //if ( orphans_i | orphans_d | mpbad | pmbad )
3340 // SHADOW_PRINTK("p2m audit found %lu orphans (%lu inval %lu debug)\n",
3341 // orphans_i + orphans_d, orphans_i, orphans_d,
3342 if ( mpbad | pmbad )
3343 SHADOW_PRINTK("p2m audit found %lu odd p2m, %lu bad m2p entries\n",
3344 pmbad, mpbad);
3347 #endif /* p2m audit */
3349 /*
3350 * Local variables:
3351 * mode: C
3352 * c-set-style: "BSD"
3353 * c-basic-offset: 4
3354 * indent-tabs-mode: nil
3355 * End:
3356 */