ia64/xen-unstable: view of xen/arch/x86/shadow2-common.c @ 11212:ca9f3a7b1b03

[XEN] Remove shadow2 dependencies from common code.
Disable debugtrace functionality even in debug builds:
it's currently only used by shadow2 maintainers.
Signed-off-by: Keir Fraser <keir@xensource.com>

author   kfraser@localhost.localdomain
date     Fri Aug 18 17:59:26 2006 +0100 (2006-08-18)
parents  395bfcf84451
children 45a84091144e

line source
1 /******************************************************************************
2 * arch/x86/shadow2-common.c
3 *
4 * Shadow2 code that does not need to be multiply compiled.
5 * Parts of this code are Copyright (c) 2006 by XenSource Inc.
6 * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
7 * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 */
24 #define SHADOW2 1
26 #include <xen/config.h>
27 #include <xen/types.h>
28 #include <xen/mm.h>
29 #include <xen/trace.h>
30 #include <xen/sched.h>
31 #include <xen/perfc.h>
32 #include <xen/irq.h>
33 #include <xen/domain_page.h>
34 #include <xen/guest_access.h>
35 #include <xen/keyhandler.h>
36 #include <asm/event.h>
37 #include <asm/page.h>
38 #include <asm/current.h>
39 #include <asm/flushtlb.h>
40 #include <asm/shadow2.h>
41 #include <asm/shadow2-private.h>
43 #if SHADOW2_AUDIT
44 int shadow2_audit_enable = 0;
46 static void shadow2_audit_key(unsigned char key)
47 {
48 shadow2_audit_enable = !shadow2_audit_enable;
49 printk("%s shadow2_audit_enable=%d\n",
50 __func__, shadow2_audit_enable);
51 }
53 static int __init shadow2_audit_key_init(void)
54 {
55 register_keyhandler(
56 'O', shadow2_audit_key, "toggle shadow2 audits");
57 return 0;
58 }
59 __initcall(shadow2_audit_key_init);
60 #endif /* SHADOW2_AUDIT */
62 static void sh2_free_log_dirty_bitmap(struct domain *d);
64 int _shadow2_mode_refcounts(struct domain *d)
65 {
66 return shadow2_mode_refcounts(d);
67 }
70 /**************************************************************************/
71 /* x86 emulator support for the shadow2 code
72 */
74 static int
75 sh2_x86_emulate_read_std(unsigned long addr,
76 unsigned long *val,
77 unsigned int bytes,
78 struct x86_emulate_ctxt *ctxt)
79 {
80 struct vcpu *v = current;
81 if ( hvm_guest(v) )
82 {
83 *val = 0;
84 // XXX -- this is WRONG.
85 // It entirely ignores the permissions in the page tables.
86 // In this case, that is only a user vs supervisor access check.
87 //
88 if ( hvm_copy(val, addr, bytes, HVM_COPY_IN) )
89 {
90 #if 0
91 SHADOW2_PRINTK("d=%u v=%u a=%#lx v=%#lx bytes=%u\n",
92 v->domain->domain_id, v->vcpu_id,
93 addr, *val, bytes);
94 #endif
95 return X86EMUL_CONTINUE;
96 }
98 /* If we got here, there was nothing mapped here, or a bad GFN
99 * was mapped here. This should never happen: we're here because
100 * of a write fault at the end of the instruction we're emulating. */
101 SHADOW2_PRINTK("read failed to va %#lx\n", addr);
102 return X86EMUL_PROPAGATE_FAULT;
103 }
104 else
105 {
106 SHADOW2_PRINTK("this operation is not emulated yet\n");
107 return X86EMUL_UNHANDLEABLE;
108 }
109 }
111 static int
112 sh2_x86_emulate_write_std(unsigned long addr,
113 unsigned long val,
114 unsigned int bytes,
115 struct x86_emulate_ctxt *ctxt)
116 {
117 struct vcpu *v = current;
118 #if 0
119 SHADOW2_PRINTK("d=%u v=%u a=%#lx v=%#lx bytes=%u\n",
120 v->domain->domain_id, v->vcpu_id, addr, val, bytes);
121 #endif
122 if ( hvm_guest(v) )
123 {
124 // XXX -- this is WRONG.
125 // It entirely ignores the permissions in the page tables.
126 // In this case, that includes user vs supervisor, and
127 // write access.
128 //
129 if ( hvm_copy(&val, addr, bytes, HVM_COPY_OUT) )
130 return X86EMUL_CONTINUE;
132 /* If we got here, there was nothing mapped here, or a bad GFN
133 * was mapped here. This should never happen: we're here because
134 * of a write fault at the end of the instruction we're emulating,
135 * which should be handled by sh2_x86_emulate_write_emulated. */
136 SHADOW2_PRINTK("write failed to va %#lx\n", addr);
137 return X86EMUL_PROPAGATE_FAULT;
138 }
139 else
140 {
141 SHADOW2_PRINTK("this operation is not emulated yet\n");
142 return X86EMUL_UNHANDLEABLE;
143 }
144 }
146 static int
147 sh2_x86_emulate_write_emulated(unsigned long addr,
148 unsigned long val,
149 unsigned int bytes,
150 struct x86_emulate_ctxt *ctxt)
151 {
152 struct vcpu *v = current;
153 #if 0
154 SHADOW2_PRINTK("d=%u v=%u a=%#lx v=%#lx bytes=%u\n",
155 v->domain->domain_id, v->vcpu_id, addr, val, bytes);
156 #endif
157 if ( hvm_guest(v) )
158 {
159 return v->arch.shadow2->x86_emulate_write(v, addr, &val, bytes, ctxt);
160 }
161 else
162 {
163 SHADOW2_PRINTK("this operation is not emulated yet\n");
164 return X86EMUL_UNHANDLEABLE;
165 }
166 }
168 static int
169 sh2_x86_emulate_cmpxchg_emulated(unsigned long addr,
170 unsigned long old,
171 unsigned long new,
172 unsigned int bytes,
173 struct x86_emulate_ctxt *ctxt)
174 {
175 struct vcpu *v = current;
176 #if 0
177 SHADOW2_PRINTK("d=%u v=%u a=%#lx o?=%#lx n:=%#lx bytes=%u\n",
178 v->domain->domain_id, v->vcpu_id, addr, old, new, bytes);
179 #endif
180 if ( hvm_guest(v) )
181 {
182 return v->arch.shadow2->x86_emulate_cmpxchg(v, addr, old, new,
183 bytes, ctxt);
184 }
185 else
186 {
187 SHADOW2_PRINTK("this operation is not emulated yet\n");
188 return X86EMUL_UNHANDLEABLE;
189 }
190 }
192 static int
193 sh2_x86_emulate_cmpxchg8b_emulated(unsigned long addr,
194 unsigned long old_lo,
195 unsigned long old_hi,
196 unsigned long new_lo,
197 unsigned long new_hi,
198 struct x86_emulate_ctxt *ctxt)
199 {
200 struct vcpu *v = current;
201 #if 0
202 SHADOW2_PRINTK("d=%u v=%u a=%#lx o?=%#lx:%lx n:=%#lx:%lx\n",
203 v->domain->domain_id, v->vcpu_id, addr, old_hi, old_lo,
204 new_hi, new_lo, ctxt);
205 #endif
206 if ( hvm_guest(v) )
207 {
208 return v->arch.shadow2->x86_emulate_cmpxchg8b(v, addr, old_lo, old_hi,
209 new_lo, new_hi, ctxt);
210 }
211 else
212 {
213 SHADOW2_PRINTK("this operation is not emulated yet\n");
214 return X86EMUL_UNHANDLEABLE;
215 }
216 }
219 struct x86_emulate_ops shadow2_emulator_ops = {
220 .read_std = sh2_x86_emulate_read_std,
221 .write_std = sh2_x86_emulate_write_std,
222 .read_emulated = sh2_x86_emulate_read_std,
223 .write_emulated = sh2_x86_emulate_write_emulated,
224 .cmpxchg_emulated = sh2_x86_emulate_cmpxchg_emulated,
225 .cmpxchg8b_emulated = sh2_x86_emulate_cmpxchg8b_emulated,
226 };
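For orientation, a minimal sketch of how an ops table like the one above is consumed; toy_emulated_read is a hypothetical helper, not Xen's emulator entry point, and only illustrates that callers go through the function pointers rather than calling the sh2_* handlers directly.

static int toy_emulated_read(struct x86_emulate_ops *ops,
                             unsigned long addr, unsigned long *val,
                             unsigned int bytes, struct x86_emulate_ctxt *ctxt)
{
    /* Same call the emulator itself would make through the table. */
    return ops->read_emulated(addr, val, bytes, ctxt);
}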
229 /**************************************************************************/
230 /* Code for "promoting" a guest page to the point where the shadow code is
231 * willing to let it be treated as a guest page table. This generally
232 * involves making sure there are no writable mappings available to the guest
233 * for this page.
234 */
235 void shadow2_promote(struct vcpu *v, mfn_t gmfn, u32 type)
236 {
237 struct page_info *page = mfn_to_page(gmfn);
238 unsigned long type_info;
240 ASSERT(valid_mfn(gmfn));
242 /* We should never try to promote a gmfn that has writeable mappings */
243 ASSERT(shadow2_remove_write_access(v, gmfn, 0, 0) == 0);
245 // Is the page already shadowed?
246 if ( !test_and_set_bit(_PGC_page_table, &page->count_info) )
247 {
248 // No prior shadow exists...
250 // Grab a type-ref. We don't really care if we are racing with another
251 // vcpu or not, or even what kind of type we get; we just want the type
252 // count to be > 0.
253 //
254 do {
255 type_info =
256 page->u.inuse.type_info & (PGT_type_mask | PGT_va_mask);
257 } while ( !get_page_type(page, type_info) );
259 // Now that the type ref is non-zero, we can safely use the
260 // shadow2_flags.
261 //
262 page->shadow2_flags = 0;
263 }
265 ASSERT(!test_bit(type >> PGC_SH2_type_shift, &page->shadow2_flags));
266 set_bit(type >> PGC_SH2_type_shift, &page->shadow2_flags);
267 }
269 void shadow2_demote(struct vcpu *v, mfn_t gmfn, u32 type)
270 {
271 struct page_info *page = mfn_to_page(gmfn);
273 ASSERT(test_bit(_PGC_page_table, &page->count_info));
274 ASSERT(test_bit(type >> PGC_SH2_type_shift, &page->shadow2_flags));
276 clear_bit(type >> PGC_SH2_type_shift, &page->shadow2_flags);
278 if ( (page->shadow2_flags & SH2F_page_type_mask) == 0 )
279 {
280 // release the extra type ref
281 put_page_type(page);
283 // clear the is-a-page-table bit.
284 clear_bit(_PGC_page_table, &page->count_info);
285 }
286 }
288 /**************************************************************************/
289 /* Validate a pagetable change from the guest and update the shadows.
290 * Returns a bitmask of SHADOW2_SET_* flags. */
292 static int
293 __shadow2_validate_guest_entry(struct vcpu *v, mfn_t gmfn,
294 void *entry, u32 size)
295 {
296 int result = 0;
297 struct page_info *page = mfn_to_page(gmfn);
299 sh2_mark_dirty(v->domain, gmfn);
301 // Determine which types of shadows are affected, and update each.
302 //
303 // Always validate L1s before L2s to prevent another cpu with a linear
304 // mapping of this gmfn from seeing a walk that results from
305 // using the new L2 value and the old L1 value. (It is OK for such a
306 // guest to see a walk that uses the old L2 value with the new L1 value,
307 // as hardware could behave this way if one level of the pagewalk occurs
308 // before the store, and the next level of the pagewalk occurs after the
309 // store.)
310 //
311 // Ditto for L2s before L3s, etc.
312 //
314 if ( !(page->count_info & PGC_page_table) )
315 return 0; /* Not shadowed at all */
317 #if CONFIG_PAGING_LEVELS == 2
318 if ( page->shadow2_flags & SH2F_L1_32 )
319 result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl1e, 2, 2)
320 (v, gmfn, entry, size);
321 #else
322 if ( page->shadow2_flags & SH2F_L1_32 )
323 result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl1e, 3, 2)
324 (v, gmfn, entry, size);
325 #endif
327 #if CONFIG_PAGING_LEVELS == 2
328 if ( page->shadow2_flags & SH2F_L2_32 )
329 result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl2e, 2, 2)
330 (v, gmfn, entry, size);
331 #else
332 if ( page->shadow2_flags & SH2F_L2_32 )
333 result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl2e, 3, 2)
334 (v, gmfn, entry, size);
335 #endif
337 #if CONFIG_PAGING_LEVELS >= 3
338 if ( page->shadow2_flags & SH2F_L1_PAE )
339 result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl1e, 3, 3)
340 (v, gmfn, entry, size);
341 if ( page->shadow2_flags & SH2F_L2_PAE )
342 result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl2e, 3, 3)
343 (v, gmfn, entry, size);
344 if ( page->shadow2_flags & SH2F_L2H_PAE )
345 result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl2he, 3, 3)
346 (v, gmfn, entry, size);
347 if ( page->shadow2_flags & SH2F_L3_PAE )
348 result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl3e, 3, 3)
349 (v, gmfn, entry, size);
350 #else /* 32-bit non-PAE hypervisor does not support PAE guests */
351 ASSERT((page->shadow2_flags & (SH2F_L3_PAE|SH2F_L2_PAE|SH2F_L1_PAE)) == 0);
352 #endif
354 #if CONFIG_PAGING_LEVELS >= 4
355 if ( page->shadow2_flags & SH2F_L1_64 )
356 result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl1e, 4, 4)
357 (v, gmfn, entry, size);
358 if ( page->shadow2_flags & SH2F_L2_64 )
359 result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl2e, 4, 4)
360 (v, gmfn, entry, size);
361 if ( page->shadow2_flags & SH2F_L3_64 )
362 result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl3e, 4, 4)
363 (v, gmfn, entry, size);
364 if ( page->shadow2_flags & SH2F_L4_64 )
365 result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl4e, 4, 4)
366 (v, gmfn, entry, size);
367 #else /* 32-bit/PAE hypervisor does not support 64-bit guests */
368 ASSERT((page->shadow2_flags
369 & (SH2F_L4_64|SH2F_L3_64|SH2F_L2_64|SH2F_L1_64)) == 0);
370 #endif
372 return result;
373 }
376 int
377 shadow2_validate_guest_entry(struct vcpu *v, mfn_t gmfn, void *entry)
378 /* This is the entry point from hypercalls. It returns a bitmask of all the
379 * results of shadow_set_l*e() calls, so the caller knows to do TLB flushes. */
380 {
381 int rc;
383 ASSERT(shadow2_lock_is_acquired(v->domain));
384 rc = __shadow2_validate_guest_entry(v, gmfn, entry, sizeof(l1_pgentry_t));
385 shadow2_audit_tables(v);
386 return rc;
387 }
389 void
390 shadow2_validate_guest_pt_write(struct vcpu *v, mfn_t gmfn,
391 void *entry, u32 size)
392 /* This is the entry point for emulated writes to pagetables in HVM guests */
393 {
394 struct domain *d = v->domain;
395 int rc;
397 ASSERT(shadow2_lock_is_acquired(v->domain));
398 rc = __shadow2_validate_guest_entry(v, gmfn, entry, size);
399 if ( rc & SHADOW2_SET_FLUSH )
400 {
401 // Flush everyone except the local processor, which will flush when it
402 // re-enters the HVM guest.
403 //
404 cpumask_t mask = d->domain_dirty_cpumask;
405 cpu_clear(v->processor, mask);
406 flush_tlb_mask(mask);
407 }
408 if ( rc & SHADOW2_SET_ERROR )
409 {
410 /* This page is probably not a pagetable any more: tear it out of the
411 * shadows, along with any tables that reference it */
412 shadow2_remove_all_shadows_and_parents(v, gmfn);
413 }
414 /* We ignore the other bits: since we are about to change CR3 on
415 * VMENTER we don't need to do any extra TLB flushes. */
416 }
419 /**************************************************************************/
420 /* Memory management for shadow pages. */
422 /* Meaning of the count_info field in shadow pages
423 * ----------------------------------------------
424 *
425 * A count of all references to this page from other shadow pages and
426 * guest CR3s (a.k.a. v->arch.shadow_table).
427 *
428 * The top bits hold the shadow type and the pinned bit. Top-level
429 * shadows are pinned so that they don't disappear when not in a CR3
430 * somewhere.
431 *
432 * We don't need to use get|put_page for this as the updates are all
433 * protected by the shadow lock. We can't use get|put_page for this
434 * as the size of the count on shadow pages is different from that on
435 * normal guest pages.
436 */
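A minimal sketch of what the scheme above implies, assuming a hypothetical TOY_SH2_COUNT_MASK for the low-order count bits (the real PGC_SH2_* layout is defined in the shadow2 headers):

#define TOY_SH2_COUNT_MASK 0x0000ffffUL   /* hypothetical: low bits = count */

static inline void toy_get_shadow_ref(struct domain *d, struct page_info *pg)
{
    /* All updates happen under the shadow lock, so plain arithmetic is
     * enough; get_page()/put_page() are deliberately not used here. */
    ASSERT(shadow2_lock_is_acquired(d));
    ASSERT((pg->count_info & TOY_SH2_COUNT_MASK) != TOY_SH2_COUNT_MASK);
    pg->count_info++;
}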
438 /* Meaning of the type_info field in shadow pages
439 * ----------------------------------------------
440 *
441 * type_info use depends on the shadow type (from count_info)
442 *
443 * PGC_SH2_none : This page is in the shadow2 free pool. type_info holds
444 * the chunk order for our freelist allocator.
445 *
446 * PGC_SH2_l*_shadow : This page is in use as a shadow. type_info
447 * holds the mfn of the guest page being shadowed,
448 *
449 * PGC_SH2_fl1_*_shadow : This page is being used to shatter a superpage.
450 * type_info holds the gfn being shattered.
451 *
452 * PGC_SH2_monitor_table : This page is part of a monitor table.
453 * type_info is not used.
454 */
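A small sketch of using the backpointer described above; toy_shadowed_gmfn is illustrative, not a helper from this file:

/* 'pg' must be an in-use PGC_SH2_l*_shadow page; for a free-pool page the
 * same field holds the freelist chunk order instead. */
static inline mfn_t toy_shadowed_gmfn(struct page_info *pg)
{
    return _mfn(pg->u.inuse.type_info);
}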
456 /* Meaning of the _domain field in shadow pages
457 * --------------------------------------------
458 *
459 * In shadow pages, this field will always have its least significant bit
460 * set. This ensures that all attempts to get_page() will fail (as all
461 * valid pickled domain pointers have a zero for their least significant bit).
462 * Instead, the remaining upper bits are used to record the shadow generation
463 * counter when the shadow was created.
464 */
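A sketch of that encoding, with an illustrative field split (only the set low bit is essential):

static inline u32 toy_pickle_shadow_domain(u32 generation)
{
    return (generation << 1) | 1;   /* LSB set: get_page() will always fail */
}

static inline u32 toy_shadow_generation(u32 pickled)
{
    return pickled >> 1;
}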
466 /* Meaning of the shadow2_flags field
467 * ----------------------------------
468 *
469 * In guest pages that are shadowed, one bit for each kind of shadow they have.
470 *
471 * In shadow pages, will be used for holding a representation of the populated
472 * entries in this shadow (either a min/max, or a bitmap, or ...)
473 *
474 * In monitor-table pages, holds the level of the particular page (to save
475 * spilling the shadow types into an extra bit by having three types of monitor
476 * page).
477 */
479 /* Meaning of the list_head struct in shadow pages
480 * -----------------------------------------------
481 *
482 * In free shadow pages, this is used to hold the free-lists of chunks.
483 *
484 * In top-level shadow tables, this holds a linked-list of all top-level
485 * shadows (used for recovering memory and destroying shadows).
486 *
487 * In lower-level shadows, this holds the physical address of a higher-level
488 * shadow entry that holds a reference to this shadow (or zero).
489 */
491 /* Allocating shadow pages
492 * -----------------------
493 *
494 * Most shadow pages are allocated singly, but there are two cases where we
495 * need to allocate multiple pages together.
496 *
497 * 1: Shadowing 32-bit guest tables on PAE or 64-bit shadows.
498 * A 32-bit guest l1 table covers 4MB of virtual address space,
499 * and needs to be shadowed by two PAE/64-bit l1 tables (covering 2MB
500 * of virtual address space each). Similarly, a 32-bit guest l2 table
501 * (4GB va) needs to be shadowed by four PAE/64-bit l2 tables (1GB va
502 * each). These multi-page shadows are contiguous and aligned;
503 * functions for handling offsets into them are defined in shadow2.c
504 * (shadow_l1_index() etc.)
505 *
506 * 2: Shadowing PAE top-level pages. Each guest page that contains
507 * any PAE top-level pages requires two shadow pages to shadow it.
508 * They contain alternating l3 tables and pae_l3_bookkeeping structs.
509 *
510 * This table shows the allocation behaviour of the different modes:
511 *
512 * Xen paging 32b pae pae 64b 64b 64b
513 * Guest paging 32b 32b pae 32b pae 64b
514 * PV or HVM * HVM * HVM HVM *
515 * Shadow paging 32b pae pae pae pae 64b
516 *
517 * sl1 size 4k 8k 4k 8k 4k 4k
518 * sl2 size 4k 16k 4k 16k 4k 4k
519 * sl3 size - - 8k - 8k 4k
520 * sl4 size - - - - - 4k
521 *
522 * We allocate memory from xen in four-page units and break them down
523 * with a simple buddy allocator. Can't use the xen allocator to handle
524 * this as it only works for contiguous zones, and a domain's shadow
525 * pool is made of fragments.
526 *
527 * In HVM guests, the p2m table is built out of shadow pages, and we provide
528 * a function for the p2m management to steal pages, in max-order chunks, from
529 * the free pool. We don't provide for giving them back, yet.
530 */
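To make case 1 above concrete, an illustrative version of the offset handling that shadow_l1_index() and friends perform in the per-mode code:

/* A 32-bit guest l1 has 1024 entries; its shadow is a contiguous pair of
 * PAE/64-bit l1 pages of 512 entries each. */
static inline mfn_t toy_sl1_page(mfn_t sl1_base, unsigned int guest_index)
{
    return _mfn(mfn_x(sl1_base) + (guest_index / 512));
}

static inline unsigned int toy_sl1_slot(unsigned int guest_index)
{
    return guest_index % 512;
}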
532 /* Figure out the least acceptable quantity of shadow memory.
533 * The minimum memory requirement for always being able to free up a
534 * chunk of memory is very small -- only three max-order chunks per
535 * vcpu to hold the top level shadows and pages with Xen mappings in them.
536 *
537 * But for a guest to be guaranteed to successfully execute a single
538 * instruction, we must be able to map a large number (about thirty) VAs
539 * at the same time, which means that to guarantee progress, we must
540 * allow for more than ninety allocated pages per vcpu. We round that
541 * up to 128 pages, or half a megabyte per vcpu. */
542 unsigned int shadow2_min_acceptable_pages(struct domain *d)
543 {
544 u32 vcpu_count = 0;
545 struct vcpu *v;
547 for_each_vcpu(d, v)
548 vcpu_count++;
550 return (vcpu_count * 128);
551 }
553 /* Using the type_info field to store freelist order */
554 #define SH2_PFN_ORDER(_p) ((_p)->u.inuse.type_info)
555 #define SH2_SET_PFN_ORDER(_p, _o) \
556 do { (_p)->u.inuse.type_info = (_o); } while (0)
559 /* Figure out the order of allocation needed for a given shadow type */
560 static inline u32
561 shadow_order(u32 shadow_type)
562 {
563 #if CONFIG_PAGING_LEVELS > 2
564 static const u32 type_to_order[16] = {
565 0, /* PGC_SH2_none */
566 1, /* PGC_SH2_l1_32_shadow */
567 1, /* PGC_SH2_fl1_32_shadow */
568 2, /* PGC_SH2_l2_32_shadow */
569 0, /* PGC_SH2_l1_pae_shadow */
570 0, /* PGC_SH2_fl1_pae_shadow */
571 0, /* PGC_SH2_l2_pae_shadow */
572 0, /* PGC_SH2_l2h_pae_shadow */
573 1, /* PGC_SH2_l3_pae_shadow */
574 0, /* PGC_SH2_l1_64_shadow */
575 0, /* PGC_SH2_fl1_64_shadow */
576 0, /* PGC_SH2_l2_64_shadow */
577 0, /* PGC_SH2_l3_64_shadow */
578 0, /* PGC_SH2_l4_64_shadow */
579 2, /* PGC_SH2_p2m_table */
580 0 /* PGC_SH2_monitor_table */
581 };
582 u32 type = (shadow_type & PGC_SH2_type_mask) >> PGC_SH2_type_shift;
583 return type_to_order[type];
584 #else /* 32-bit Xen only ever shadows 32-bit guests on 32-bit shadows. */
585 return 0;
586 #endif
587 }
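A short usage sketch of the mapping above (values as they come out when CONFIG_PAGING_LEVELS > 2); toy_shadow_order_examples is illustrative only:

static inline void toy_shadow_order_examples(void)
{
    /* A 32-bit guest l2 shadow is a 1<<2 = 4-page chunk (see table above). */
    ASSERT(shadow_order(PGC_SH2_l2_32_shadow) == 2);
    /* A PAE l3 shadow is a 1<<1 = 2-page chunk (l3 plus bookkeeping page). */
    ASSERT(shadow_order(PGC_SH2_l3_pae_shadow) == 1);
}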
590 /* Do we have a free chunk of at least this order? */
591 static inline int chunk_is_available(struct domain *d, int order)
592 {
593 int i;
595 for ( i = order; i <= SHADOW2_MAX_ORDER; i++ )
596 if ( !list_empty(&d->arch.shadow2_freelists[i]) )
597 return 1;
598 return 0;
599 }
601 /* Dispatcher function: call the per-mode function that will unhook the
602 * non-Xen mappings in this top-level shadow mfn */
603 void shadow2_unhook_mappings(struct vcpu *v, mfn_t smfn)
604 {
605 struct page_info *pg = mfn_to_page(smfn);
606 switch ( (pg->count_info & PGC_SH2_type_mask) >> PGC_SH2_type_shift )
607 {
608 case PGC_SH2_l2_32_shadow >> PGC_SH2_type_shift:
609 #if CONFIG_PAGING_LEVELS == 2
610 SHADOW2_INTERNAL_NAME(sh2_unhook_32b_mappings,2,2)(v,smfn);
611 #else
612 SHADOW2_INTERNAL_NAME(sh2_unhook_32b_mappings,3,2)(v,smfn);
613 #endif
614 break;
615 #if CONFIG_PAGING_LEVELS >= 3
616 case PGC_SH2_l3_pae_shadow >> PGC_SH2_type_shift:
617 SHADOW2_INTERNAL_NAME(sh2_unhook_pae_mappings,3,3)(v,smfn);
618 break;
619 #endif
620 #if CONFIG_PAGING_LEVELS >= 4
621 case PGC_SH2_l4_64_shadow >> PGC_SH2_type_shift:
622 SHADOW2_INTERNAL_NAME(sh2_unhook_64b_mappings,4,4)(v,smfn);
623 break;
624 #endif
625 default:
626 SHADOW2_PRINTK("top-level shadow has bad type %08lx\n",
627 (unsigned long)((pg->count_info & PGC_SH2_type_mask)
628 >> PGC_SH2_type_shift));
629 BUG();
630 }
631 }
634 /* Make sure there is at least one chunk of the required order available
635 * in the shadow page pool. This must be called before any calls to
636 * shadow2_alloc(). Since this will free existing shadows to make room,
637 * it must be called early enough to avoid freeing shadows that the
638 * caller is currently working on. */
639 void shadow2_prealloc(struct domain *d, unsigned int order)
640 {
641 /* Need a vcpu for calling unpins; for now, since we don't have
642 * per-vcpu shadows, any will do */
643 struct vcpu *v = d->vcpu[0];
644 struct list_head *l, *t;
645 struct page_info *pg;
646 mfn_t smfn;
648 if ( chunk_is_available(d, order) ) return;
650 /* Stage one: walk the list of top-level pages, unpinning them */
651 perfc_incrc(shadow2_prealloc_1);
652 list_for_each_backwards_safe(l, t, &d->arch.shadow2_toplevel_shadows)
653 {
654 pg = list_entry(l, struct page_info, list);
655 smfn = page_to_mfn(pg);
657 #if CONFIG_PAGING_LEVELS >= 3
658 if ( (pg->count_info & PGC_SH2_type_mask) == PGC_SH2_l3_pae_shadow )
659 {
660 /* For PAE, we need to unpin each subshadow on this shadow */
661 SHADOW2_INTERNAL_NAME(sh2_unpin_all_l3_subshadows,3,3)(v, smfn);
662 }
663 else
664 #endif /* 32-bit code always takes this branch */
665 {
666 /* Unpin this top-level shadow */
667 sh2_unpin(v, smfn);
668 }
670 /* See if that freed up a chunk of appropriate size */
671 if ( chunk_is_available(d, order) ) return;
672 }
674 /* Stage two: all shadow pages are in use in hierarchies that are
675 * loaded in cr3 on some vcpu. Walk them, unhooking the non-Xen
676 * mappings. */
677 perfc_incrc(shadow2_prealloc_2);
678 v = current;
679 if ( v->domain != d )
680 v = d->vcpu[0];
681 /* Walk the list from the tail: recently used toplevels have been pulled
682 * to the head */
683 list_for_each_backwards_safe(l, t, &d->arch.shadow2_toplevel_shadows)
684 {
685 pg = list_entry(l, struct page_info, list);
686 smfn = page_to_mfn(pg);
687 shadow2_unhook_mappings(v, smfn);
689 /* Need to flush TLB if we've altered our own tables */
690 if ( !shadow2_mode_external(d)
691 && pagetable_get_pfn(current->arch.shadow_table) == mfn_x(smfn) )
692 local_flush_tlb();
694 /* See if that freed up a chunk of appropriate size */
695 if ( chunk_is_available(d, order) ) return;
696 }
698 /* Nothing more we can do: all remaining shadows are of pages that
699 * hold Xen mappings for some vcpu. This can never happen. */
700 SHADOW2_PRINTK("Can't pre-allocate %i shadow pages!\n"
701 " shadow pages total = %u, free = %u, p2m=%u\n",
702 1 << order,
703 d->arch.shadow2_total_pages,
704 d->arch.shadow2_free_pages,
705 d->arch.shadow2_p2m_pages);
706 BUG();
707 }
710 /* Allocate another shadow's worth of (contiguous, aligned) pages,
711 * and fill in the type and backpointer fields of their page_infos.
712 * Never fails to allocate. */
713 mfn_t shadow2_alloc(struct domain *d,
714 u32 shadow_type,
715 unsigned long backpointer)
716 {
717 struct page_info *pg = NULL;
718 unsigned int order = shadow_order(shadow_type);
719 cpumask_t mask;
720 void *p;
721 int i;
723 ASSERT(shadow2_lock_is_acquired(d));
724 ASSERT(order <= SHADOW2_MAX_ORDER);
725 ASSERT(shadow_type != PGC_SH2_none);
726 perfc_incrc(shadow2_alloc);
728 /* Find smallest order which can satisfy the request. */
729 for ( i = order; i <= SHADOW2_MAX_ORDER; i++ )
730 if ( !list_empty(&d->arch.shadow2_freelists[i]) )
731 {
732 pg = list_entry(d->arch.shadow2_freelists[i].next,
733 struct page_info, list);
734 list_del(&pg->list);
736 /* We may have to halve the chunk a number of times. */
737 while ( i != order )
738 {
739 i--;
740 SH2_SET_PFN_ORDER(pg, i);
741 list_add_tail(&pg->list, &d->arch.shadow2_freelists[i]);
742 pg += 1 << i;
743 }
744 d->arch.shadow2_free_pages -= 1 << order;
746 /* Init page info fields and clear the pages */
747 for ( i = 0; i < 1<<order ; i++ )
748 {
749 pg[i].u.inuse.type_info = backpointer;
750 pg[i].count_info = shadow_type;
751 pg[i].shadow2_flags = 0;
752 INIT_LIST_HEAD(&pg[i].list);
753 /* Before we overwrite the old contents of this page,
754 * we need to be sure that no TLB holds a pointer to it. */
755 mask = d->domain_dirty_cpumask;
756 tlbflush_filter(mask, pg[i].tlbflush_timestamp);
757 if ( unlikely(!cpus_empty(mask)) )
758 {
759 perfc_incrc(shadow2_alloc_tlbflush);
760 flush_tlb_mask(mask);
761 }
762 /* Now safe to clear the page for reuse */
763 p = sh2_map_domain_page(page_to_mfn(pg+i));
764 ASSERT(p != NULL);
765 clear_page(p);
766 sh2_unmap_domain_page(p);
767 perfc_incr(shadow2_alloc_count);
768 }
769 return page_to_mfn(pg);
770 }
772 /* If we get here, we failed to allocate. This should never happen.
773 * It means that we didn't call shadow2_prealloc() correctly before
774 * we allocated. We can't recover by calling prealloc here, because
775 * we might free up higher-level pages that the caller is working on. */
776 SHADOW2_PRINTK("Can't allocate %i shadow pages!\n", 1 << order);
777 BUG();
778 }
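A hypothetical caller, sketching the pairing the comments above require: prealloc the chunk first (which may evict other shadows), after which the allocation itself cannot fail:

static mfn_t toy_new_l1_32_shadow(struct vcpu *v, mfn_t gmfn)
{
    ASSERT(shadow2_lock_is_acquired(v->domain));
    shadow2_prealloc(v->domain, shadow_order(PGC_SH2_l1_32_shadow));
    return shadow2_alloc(v->domain, PGC_SH2_l1_32_shadow, mfn_x(gmfn));
}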
781 /* Return some shadow pages to the pool. */
782 void shadow2_free(struct domain *d, mfn_t smfn)
783 {
784 struct page_info *pg = mfn_to_page(smfn);
785 u32 shadow_type;
786 unsigned long order;
787 unsigned long mask;
788 int i;
790 ASSERT(shadow2_lock_is_acquired(d));
791 perfc_incrc(shadow2_free);
793 shadow_type = pg->count_info & PGC_SH2_type_mask;
794 ASSERT(shadow_type != PGC_SH2_none);
795 ASSERT(shadow_type != PGC_SH2_p2m_table);
796 order = shadow_order(shadow_type);
798 d->arch.shadow2_free_pages += 1 << order;
800 for ( i = 0; i < 1<<order; i++ )
801 {
802 /* Strip out the type: this is now a free shadow page */
803 pg[i].count_info = 0;
804 /* Remember the TLB timestamp so we will know whether to flush
805 * TLBs when we reuse the page. Because the destructors leave the
806 * contents of the pages in place, we can delay TLB flushes until
807 * just before the allocator hands the page out again. */
808 pg[i].tlbflush_timestamp = tlbflush_current_time();
809 perfc_decr(shadow2_alloc_count);
810 }
812 /* Merge chunks as far as possible. */
813 while ( order < SHADOW2_MAX_ORDER )
814 {
815 mask = 1 << order;
816 if ( (mfn_x(page_to_mfn(pg)) & mask) ) {
817 /* Merge with predecessor block? */
818 if ( (((pg-mask)->count_info & PGC_SH2_type_mask) != PGT_none)
819 || (SH2_PFN_ORDER(pg-mask) != order) )
820 break;
821 list_del(&(pg-mask)->list);
822 pg -= mask;
823 } else {
824 /* Merge with successor block? */
825 if ( (((pg+mask)->count_info & PGC_SH2_type_mask) != PGT_none)
826 || (SH2_PFN_ORDER(pg+mask) != order) )
827 break;
828 list_del(&(pg+mask)->list);
829 }
830 order++;
831 }
833 SH2_SET_PFN_ORDER(pg, order);
834 list_add_tail(&pg->list, &d->arch.shadow2_freelists[order]);
835 }
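The merge loop above is standard buddy arithmetic; an illustrative helper makes the relation explicit:

/* The buddy of a 2^order-page block is found by flipping bit 'order' of its
 * mfn; testing (mfn & (1 << order)) decides predecessor vs successor. */
static inline unsigned long toy_buddy_mfn(unsigned long mfn, unsigned int order)
{
    return mfn ^ (1UL << order);
}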
837 /* Divert some memory from the pool to be used by the p2m mapping.
838 * This action is irreversible: the p2m mapping only ever grows.
839 * That's OK because the p2m table only exists for external domains,
840 * and those domains can't ever turn off shadow mode.
841 * Also, we only ever allocate a max-order chunk, so as to preserve
842 * the invariant that shadow2_prealloc() always works.
843 * Returns 0 iff it can't get a chunk (the caller should then
844 * free up some pages in domheap and call set_sh2_allocation);
845 * returns non-zero on success.
846 */
847 static int
848 shadow2_alloc_p2m_pages(struct domain *d)
849 {
850 struct page_info *pg;
851 u32 i;
852 ASSERT(shadow2_lock_is_acquired(d));
854 if ( d->arch.shadow2_total_pages
855 < (shadow2_min_acceptable_pages(d) + (1<<SHADOW2_MAX_ORDER)) )
856 return 0; /* Not enough shadow memory: need to increase it first */
858 pg = mfn_to_page(shadow2_alloc(d, PGC_SH2_p2m_table, 0));
859 d->arch.shadow2_p2m_pages += (1<<SHADOW2_MAX_ORDER);
860 d->arch.shadow2_total_pages -= (1<<SHADOW2_MAX_ORDER);
861 for (i = 0; i < (1<<SHADOW2_MAX_ORDER); i++)
862 {
863 /* Unlike shadow pages, mark p2m pages as owned by the domain */
864 page_set_owner(&pg[i], d);
865 list_add_tail(&pg[i].list, &d->arch.shadow2_p2m_freelist);
866 }
867 return 1;
868 }
870 // Returns 0 if no memory is available...
871 mfn_t
872 shadow2_alloc_p2m_page(struct domain *d)
873 {
874 struct list_head *entry;
875 mfn_t mfn;
876 void *p;
878 if ( list_empty(&d->arch.shadow2_p2m_freelist) &&
879 !shadow2_alloc_p2m_pages(d) )
880 return _mfn(0);
881 entry = d->arch.shadow2_p2m_freelist.next;
882 list_del(entry);
883 list_add_tail(entry, &d->arch.shadow2_p2m_inuse);
884 mfn = page_to_mfn(list_entry(entry, struct page_info, list));
885 sh2_get_ref(mfn, 0);
886 p = sh2_map_domain_page(mfn);
887 clear_page(p);
888 sh2_unmap_domain_page(p);
890 return mfn;
891 }
893 #if CONFIG_PAGING_LEVELS == 3
894 static void p2m_install_entry_in_monitors(struct domain *d,
895 l3_pgentry_t *l3e)
896 /* Special case, only used for external-mode domains on PAE hosts:
897 * update the mapping of the p2m table. Once again, this is trivial in
898 * other paging modes (one top-level entry points to the top-level p2m,
899 * no maintenance needed), but PAE makes life difficult by needing to
900 * copy the eight l3es of the p2m table in eight l2h slots in the
901 * monitor table. This function makes fresh copies when a p2m l3e
902 * changes. */
903 {
904 l2_pgentry_t *ml2e;
905 struct vcpu *v;
906 unsigned int index;
908 index = ((unsigned long)l3e & ~PAGE_MASK) / sizeof(l3_pgentry_t);
909 ASSERT(index < MACHPHYS_MBYTES>>1);
911 for_each_vcpu(d, v)
912 {
913 if ( pagetable_get_pfn(v->arch.monitor_table) == 0 )
914 continue;
915 ASSERT(shadow2_mode_external(v->domain));
917 SHADOW2_DEBUG(P2M, "d=%u v=%u index=%u mfn=%#lx\n",
918 d->domain_id, v->vcpu_id, index, l3e_get_pfn(*l3e));
920 if ( v == current ) /* OK to use linear map of monitor_table */
921 ml2e = __linear_l2_table + l2_linear_offset(RO_MPT_VIRT_START);
922 else
923 {
924 l3_pgentry_t *ml3e;
925 ml3e = sh2_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
926 ASSERT(l3e_get_flags(ml3e[3]) & _PAGE_PRESENT);
927 ml2e = sh2_map_domain_page(_mfn(l3e_get_pfn(ml3e[3])));
928 ml2e += l2_table_offset(RO_MPT_VIRT_START);
929 sh2_unmap_domain_page(ml3e);
930 }
931 ml2e[index] = l2e_from_pfn(l3e_get_pfn(*l3e), __PAGE_HYPERVISOR);
932 if ( v != current )
933 sh2_unmap_domain_page(ml2e);
934 }
935 }
936 #endif
938 // Find the next level's P2M entry, checking for out-of-range gfn's...
939 // Returns NULL on error.
940 //
941 static l1_pgentry_t *
942 p2m_find_entry(void *table, unsigned long *gfn_remainder,
943 unsigned long gfn, u32 shift, u32 max)
944 {
945 u32 index;
947 index = *gfn_remainder >> shift;
948 if ( index >= max )
949 {
950 SHADOW2_DEBUG(P2M, "gfn=0x%lx out of range "
951 "(gfn_remainder=0x%lx shift=%d index=0x%x max=0x%x)\n",
952 gfn, *gfn_remainder, shift, index, max);
953 return NULL;
954 }
955 *gfn_remainder &= (1 << shift) - 1;
956 return (l1_pgentry_t *)table + index;
957 }
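A worked example of the decomposition, assuming the 2-level layout where the l2 step uses shift = L2_PAGETABLE_SHIFT - PAGE_SHIFT = 10; toy_l2_index_of_gfn is illustrative only:

/* For gfn 0x12345 this returns l2 index 0x48 and leaves *rem == 0x345 for
 * the subsequent l1 step. */
static inline u32 toy_l2_index_of_gfn(unsigned long gfn, unsigned long *rem)
{
    u32 index = gfn >> 10;
    *rem = gfn & ((1UL << 10) - 1);
    return index;
}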
959 // Walk one level of the P2M table, allocating a new table if required.
960 // Returns 0 on error.
961 //
962 static int
963 p2m_next_level(struct domain *d, mfn_t *table_mfn, void **table,
964 unsigned long *gfn_remainder, unsigned long gfn, u32 shift,
965 u32 max, unsigned long type)
966 {
967 l1_pgentry_t *p2m_entry;
968 void *next;
970 if ( !(p2m_entry = p2m_find_entry(*table, gfn_remainder, gfn,
971 shift, max)) )
972 return 0;
974 if ( !(l1e_get_flags(*p2m_entry) & _PAGE_PRESENT) )
975 {
976 mfn_t mfn = shadow2_alloc_p2m_page(d);
977 if ( mfn_x(mfn) == 0 )
978 return 0;
979 *p2m_entry = l1e_from_pfn(mfn_x(mfn), __PAGE_HYPERVISOR|_PAGE_USER);
980 mfn_to_page(mfn)->u.inuse.type_info = type | 1 | PGT_validated;
981 mfn_to_page(mfn)->count_info = 1;
982 #if CONFIG_PAGING_LEVELS == 3
983 if (type == PGT_l2_page_table)
984 {
985 /* We have written to the p2m l3: need to sync the per-vcpu
986 * copies of it in the monitor tables */
987 p2m_install_entry_in_monitors(d, (l3_pgentry_t *)p2m_entry);
988 }
989 #endif
990 /* The P2M can be shadowed: keep the shadows synced */
991 if ( d->vcpu[0] )
992 (void)__shadow2_validate_guest_entry(d->vcpu[0], *table_mfn,
993 p2m_entry, sizeof *p2m_entry);
994 }
995 *table_mfn = _mfn(l1e_get_pfn(*p2m_entry));
996 next = sh2_map_domain_page(*table_mfn);
997 sh2_unmap_domain_page(*table);
998 *table = next;
1000 return 1;
1001 }
1003 // Returns 0 on error (out of memory)
1004 int
1005 shadow2_set_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn)
1006 {
1007 // XXX -- this might be able to be faster iff current->domain == d
1008 mfn_t table_mfn = pagetable_get_mfn(d->arch.phys_table);
1009 void *table = sh2_map_domain_page(table_mfn);
1010 unsigned long gfn_remainder = gfn;
1011 l1_pgentry_t *p2m_entry;
1013 #if CONFIG_PAGING_LEVELS >= 4
1014 if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn,
1015 L4_PAGETABLE_SHIFT - PAGE_SHIFT,
1016 L4_PAGETABLE_ENTRIES, PGT_l3_page_table) )
1017 return 0;
1018 #endif
1019 #if CONFIG_PAGING_LEVELS >= 3
1020 // When using PAE Xen, we only allow 33 bits of pseudo-physical
1021 // address in translated guests (i.e. 8 GBytes). This restriction
1022 // comes from wanting to map the P2M table into the 16MB RO_MPT hole
1023 // in Xen's address space for translated PV guests.
1024 //
1025 if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn,
1026 L3_PAGETABLE_SHIFT - PAGE_SHIFT,
1027 (CONFIG_PAGING_LEVELS == 3
1028 ? 8
1029 : L3_PAGETABLE_ENTRIES),
1030 PGT_l2_page_table) )
1031 return 0;
1032 #endif
1033 if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn,
1034 L2_PAGETABLE_SHIFT - PAGE_SHIFT,
1035 L2_PAGETABLE_ENTRIES, PGT_l1_page_table) )
1036 return 0;
1038 p2m_entry = p2m_find_entry(table, &gfn_remainder, gfn,
1039 0, L1_PAGETABLE_ENTRIES);
1040 ASSERT(p2m_entry);
1041 if ( valid_mfn(mfn) )
1042 *p2m_entry = l1e_from_pfn(mfn_x(mfn), __PAGE_HYPERVISOR|_PAGE_USER);
1043 else
1044 *p2m_entry = l1e_empty();
1046 /* The P2M can be shadowed: keep the shadows synced */
1047 (void) __shadow2_validate_guest_entry(d->vcpu[0], table_mfn,
1048 p2m_entry, sizeof *p2m_entry);
1050 sh2_unmap_domain_page(table);
1052 return 1;
1053 }
1055 // Allocate a new p2m table for a domain.
1056 //
1057 // The structure of the p2m table is that of a pagetable for xen (i.e. it is
1058 // controlled by CONFIG_PAGING_LEVELS).
1059 //
1060 // Returns 0 if p2m table could not be initialized
1061 //
1062 static int
1063 shadow2_alloc_p2m_table(struct domain *d)
1065 mfn_t p2m_top;
1066 struct list_head *entry;
1067 unsigned int page_count = 0;
1069 SHADOW2_PRINTK("allocating p2m table\n");
1070 ASSERT(pagetable_get_pfn(d->arch.phys_table) == 0);
1072 p2m_top = shadow2_alloc_p2m_page(d);
1073 mfn_to_page(p2m_top)->count_info = 1;
1074 mfn_to_page(p2m_top)->u.inuse.type_info =
1075 #if CONFIG_PAGING_LEVELS == 4
1076 PGT_l4_page_table
1077 #elif CONFIG_PAGING_LEVELS == 3
1078 PGT_l3_page_table
1079 #elif CONFIG_PAGING_LEVELS == 2
1080 PGT_l2_page_table
1081 #endif
1082 | 1 | PGT_validated;
1084 if ( mfn_x(p2m_top) == 0 )
1085 return 0;
1087 d->arch.phys_table = pagetable_from_mfn(p2m_top);
1089 SHADOW2_PRINTK("populating p2m table\n");
1091 for ( entry = d->page_list.next;
1092 entry != &d->page_list;
1093 entry = entry->next )
1095 struct page_info *page = list_entry(entry, struct page_info, list);
1096 mfn_t mfn = page_to_mfn(page);
1097 unsigned long gfn = get_gpfn_from_mfn(mfn_x(mfn));
1098 page_count++;
1099 if (
1100 #ifdef __x86_64__
1101 (gfn != 0x5555555555555555L)
1102 #else
1103 (gfn != 0x55555555L)
1104 #endif
1105 && gfn != INVALID_M2P_ENTRY
1106 && !shadow2_set_p2m_entry(d, gfn, mfn) )
1108 SHADOW2_PRINTK("failed to initialize p2m table, gfn=%05lx, mfn=%" SH2_PRI_mfn "\n",
1109 gfn, mfn_x(mfn));
1110 return 0;
1114 SHADOW2_PRINTK("p2m table initialised (%u pages)\n", page_count);
1115 return 1;
1118 mfn_t
1119 sh2_gfn_to_mfn_foreign(struct domain *d, unsigned long gpfn)
1120 /* Read another domain's p2m entries */
1122 mfn_t mfn;
1123 unsigned long addr = gpfn << PAGE_SHIFT;
1124 l2_pgentry_t *l2e;
1125 l1_pgentry_t *l1e;
1127 ASSERT(shadow2_mode_translate(d));
1128 mfn = pagetable_get_mfn(d->arch.phys_table);
1131 #if CONFIG_PAGING_LEVELS > 2
1132 if ( gpfn > (RO_MPT_VIRT_END - RO_MPT_VIRT_START) / sizeof(l1_pgentry_t) )
1133 /* This pfn is higher than the p2m map can hold */
1134 return _mfn(INVALID_MFN);
1135 #endif
1138 #if CONFIG_PAGING_LEVELS >= 4
1140 l4_pgentry_t *l4e = sh2_map_domain_page(mfn);
1141 l4e += l4_table_offset(addr);
1142 if ( (l4e_get_flags(*l4e) & _PAGE_PRESENT) == 0 )
1144 sh2_unmap_domain_page(l4e);
1145 return _mfn(INVALID_MFN);
1147 mfn = _mfn(l4e_get_pfn(*l4e));
1148 sh2_unmap_domain_page(l4e);
1150 #endif
1151 #if CONFIG_PAGING_LEVELS >= 3
1153 l3_pgentry_t *l3e = sh2_map_domain_page(mfn);
1154 l3e += l3_table_offset(addr);
1155 if ( (l3e_get_flags(*l3e) & _PAGE_PRESENT) == 0 )
1157 sh2_unmap_domain_page(l3e);
1158 return _mfn(INVALID_MFN);
1160 mfn = _mfn(l3e_get_pfn(*l3e));
1161 sh2_unmap_domain_page(l3e);
1163 #endif
1165 l2e = sh2_map_domain_page(mfn);
1166 l2e += l2_table_offset(addr);
1167 if ( (l2e_get_flags(*l2e) & _PAGE_PRESENT) == 0 )
1169 sh2_unmap_domain_page(l2e);
1170 return _mfn(INVALID_MFN);
1172 mfn = _mfn(l2e_get_pfn(*l2e));
1173 sh2_unmap_domain_page(l2e);
1175 l1e = sh2_map_domain_page(mfn);
1176 l1e += l1_table_offset(addr);
1177 if ( (l1e_get_flags(*l1e) & _PAGE_PRESENT) == 0 )
1179 sh2_unmap_domain_page(l1e);
1180 return _mfn(INVALID_MFN);
1182 mfn = _mfn(l1e_get_pfn(*l1e));
1183 sh2_unmap_domain_page(l1e);
1185 return mfn;
1188 unsigned long
1189 shadow2_gfn_to_mfn_foreign(unsigned long gpfn)
1190 {
1191 return mfn_x(sh2_gfn_to_mfn_foreign(current->domain, gpfn));
1192 }
1195 static void shadow2_p2m_teardown(struct domain *d)
1196 /* Return all the p2m pages to Xen.
1197 * We know we don't have any extra mappings to these pages */
1199 struct list_head *entry, *n;
1200 struct page_info *pg;
1202 d->arch.phys_table = pagetable_null();
1204 list_for_each_safe(entry, n, &d->arch.shadow2_p2m_inuse)
1206 pg = list_entry(entry, struct page_info, list);
1207 list_del(entry);
1208 /* Should have just the one ref we gave it in alloc_p2m_page() */
1209 if ( (pg->count_info & PGC_SH2_count_mask) != 1 )
1211 SHADOW2_PRINTK("Odd p2m page count c=%#x t=%"PRtype_info"\n",
1212 pg->count_info, pg->u.inuse.type_info);
1214 ASSERT(page_get_owner(pg) == d);
1215 /* Free should not decrement domain's total allocation, since
1216 * these pages were allocated without an owner. */
1217 page_set_owner(pg, NULL);
1218 free_domheap_pages(pg, 0);
1219 d->arch.shadow2_p2m_pages--;
1220 perfc_decr(shadow2_alloc_count);
1222 list_for_each_safe(entry, n, &d->arch.shadow2_p2m_freelist)
1224 list_del(entry);
1225 pg = list_entry(entry, struct page_info, list);
1226 ASSERT(page_get_owner(pg) == d);
1227 /* Free should not decrement domain's total allocation. */
1228 page_set_owner(pg, NULL);
1229 free_domheap_pages(pg, 0);
1230 d->arch.shadow2_p2m_pages--;
1231 perfc_decr(shadow2_alloc_count);
1233 ASSERT(d->arch.shadow2_p2m_pages == 0);
1236 /* Set the pool of shadow pages to the required number of pages.
1237 * Input will be rounded up to at least shadow2_min_acceptable_pages(),
1238 * plus space for the p2m table.
1239 * Returns 0 for success, non-zero for failure. */
1240 static unsigned int set_sh2_allocation(struct domain *d,
1241 unsigned int pages,
1242 int *preempted)
1244 struct page_info *pg;
1245 unsigned int lower_bound;
1246 int j;
1248 ASSERT(shadow2_lock_is_acquired(d));
1250 /* Don't allocate less than the minimum acceptable, plus one page per
1251 * megabyte of RAM (for the p2m table) */
1252 lower_bound = shadow2_min_acceptable_pages(d) + (d->tot_pages / 256);
1253 if ( pages > 0 && pages < lower_bound )
1254 pages = lower_bound;
1255 /* Round up to largest block size */
1256 pages = (pages + ((1<<SHADOW2_MAX_ORDER)-1)) & ~((1<<SHADOW2_MAX_ORDER)-1);
1258 SHADOW2_PRINTK("current %i target %i\n",
1259 d->arch.shadow2_total_pages, pages);
1261 while ( d->arch.shadow2_total_pages != pages )
1263 if ( d->arch.shadow2_total_pages < pages )
1265 /* Need to allocate more memory from domheap */
1266 pg = alloc_domheap_pages(NULL, SHADOW2_MAX_ORDER, 0);
1267 if ( pg == NULL )
1269 SHADOW2_PRINTK("failed to allocate shadow pages.\n");
1270 return -ENOMEM;
1272 d->arch.shadow2_free_pages += 1<<SHADOW2_MAX_ORDER;
1273 d->arch.shadow2_total_pages += 1<<SHADOW2_MAX_ORDER;
1274 for ( j = 0; j < 1<<SHADOW2_MAX_ORDER; j++ )
1276 pg[j].u.inuse.type_info = 0; /* Free page */
1277 pg[j].tlbflush_timestamp = 0; /* Not in any TLB */
1279 SH2_SET_PFN_ORDER(pg, SHADOW2_MAX_ORDER);
1280 list_add_tail(&pg->list,
1281 &d->arch.shadow2_freelists[SHADOW2_MAX_ORDER]);
1283 else if ( d->arch.shadow2_total_pages > pages )
1285 /* Need to return memory to domheap */
1286 shadow2_prealloc(d, SHADOW2_MAX_ORDER);
1287 ASSERT(!list_empty(&d->arch.shadow2_freelists[SHADOW2_MAX_ORDER]));
1288 pg = list_entry(d->arch.shadow2_freelists[SHADOW2_MAX_ORDER].next,
1289 struct page_info, list);
1290 list_del(&pg->list);
1291 d->arch.shadow2_free_pages -= 1<<SHADOW2_MAX_ORDER;
1292 d->arch.shadow2_total_pages -= 1<<SHADOW2_MAX_ORDER;
1293 free_domheap_pages(pg, SHADOW2_MAX_ORDER);
1296 /* Check to see if we need to yield and try again */
1297 if ( preempted && hypercall_preempt_check() )
1299 *preempted = 1;
1300 return 0;
1304 return 0;
1307 unsigned int shadow2_set_allocation(struct domain *d,
1308 unsigned int megabytes,
1309 int *preempted)
1310 /* Hypercall interface to set the shadow memory allocation */
1312 unsigned int rv;
1313 shadow2_lock(d);
1314 rv = set_sh2_allocation(d, megabytes << (20 - PAGE_SHIFT), preempted);
1315 SHADOW2_PRINTK("dom %u allocation now %u pages (%u MB)\n",
1316 d->domain_id,
1317 d->arch.shadow2_total_pages,
1318 shadow2_get_allocation(d));
1319 shadow2_unlock(d);
1320 return rv;
1323 /**************************************************************************/
1324 /* Hash table for storing the guest->shadow mappings */
1326 /* Hash function that takes a gfn or mfn, plus another byte of type info */
1327 typedef u32 key_t;
1328 static inline key_t sh2_hash(unsigned long n, u8 t)
1329 {
1330 unsigned char *p = (unsigned char *)&n;
1331 key_t k = t;
1332 int i;
1333 for ( i = 0; i < sizeof(n) ; i++ ) k = (u32)p[i] + (k<<6) + (k<<16) - k;
1334 return k;
1335 }
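A sketch of how the hash is consumed; every lookup/insert/delete routine below reduces the key to a bucket index in exactly this way:

static inline u32 toy_hash_bucket(unsigned long n, u8 t)
{
    return sh2_hash(n, t) % SHADOW2_HASH_BUCKETS;
}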
1337 #if SHADOW2_AUDIT & (SHADOW2_AUDIT_HASH|SHADOW2_AUDIT_HASH_FULL)
1339 /* Before we get to the mechanism, define a pair of audit functions
1340 * that sanity-check the contents of the hash table. */
1341 static void sh2_hash_audit_bucket(struct domain *d, int bucket)
1342 /* Audit one bucket of the hash table */
1344 struct shadow2_hash_entry *e, *x;
1345 struct page_info *pg;
1347 if ( !(SHADOW2_AUDIT_ENABLE) )
1348 return;
1350 e = &d->arch.shadow2_hash_table[bucket];
1351 if ( e->t == 0 ) return; /* Bucket is empty */
1352 while ( e )
1354 /* Empty link? */
1355 BUG_ON( e->t == 0 );
1356 /* Bogus type? */
1357 BUG_ON( e->t > (PGC_SH2_max_shadow >> PGC_SH2_type_shift) );
1358 /* Wrong bucket? */
1359 BUG_ON( sh2_hash(e->n, e->t) % SHADOW2_HASH_BUCKETS != bucket );
1360 /* Duplicate entry? */
1361 for ( x = e->next; x; x = x->next )
1362 BUG_ON( x->n == e->n && x->t == e->t );
1363 /* Bogus MFN? */
1364 BUG_ON( !valid_mfn(e->smfn) );
1365 pg = mfn_to_page(e->smfn);
1366 /* Not a shadow? */
1367 BUG_ON( page_get_owner(pg) != 0 );
1368 /* Wrong kind of shadow? */
1369 BUG_ON( (pg->count_info & PGC_SH2_type_mask) >> PGC_SH2_type_shift
1370 != e->t );
1371 /* Bad backlink? */
1372 BUG_ON( pg->u.inuse.type_info != e->n );
1373 if ( e->t != (PGC_SH2_fl1_32_shadow >> PGC_SH2_type_shift)
1374 && e->t != (PGC_SH2_fl1_pae_shadow >> PGC_SH2_type_shift)
1375 && e->t != (PGC_SH2_fl1_64_shadow >> PGC_SH2_type_shift) )
1377 /* Bad shadow flags on guest page? */
1378 BUG_ON( !(mfn_to_page(_mfn(e->n))->shadow2_flags & (1<<e->t)) );
1380 /* That entry was OK; on we go */
1381 e = e->next;
1385 #else
1386 #define sh2_hash_audit_bucket(_d, _b)
1387 #endif /* Hashtable bucket audit */
1390 #if SHADOW2_AUDIT & SHADOW2_AUDIT_HASH_FULL
1392 static void sh2_hash_audit(struct domain *d)
1393 /* Full audit: audit every bucket in the table */
1395 int i;
1397 if ( !(SHADOW2_AUDIT_ENABLE) )
1398 return;
1400 for ( i = 0; i < SHADOW2_HASH_BUCKETS; i++ )
1402 sh2_hash_audit_bucket(d, i);
1406 #else
1407 #define sh2_hash_audit(_d)
1408 #endif /* Hashtable bucket audit */
1410 /* Memory management interface for bucket allocation.
1411 * These ought to come out of shadow memory, but at least on 32-bit
1412 * machines we are forced to allocate them from xenheap so that we can
1413 * address them. */
1414 static struct shadow2_hash_entry *sh2_alloc_hash_entry(struct domain *d)
1416 struct shadow2_hash_entry *extra, *x;
1417 int i;
1419 /* We need to allocate a new node. Ensure the free list is not empty.
1420 * Allocate new entries in units the same size as the original table. */
1421 if ( unlikely(d->arch.shadow2_hash_freelist == NULL) )
1423 size_t sz = sizeof(void *) + (SHADOW2_HASH_BUCKETS * sizeof(*x));
1424 extra = xmalloc_bytes(sz);
1426 if ( extra == NULL )
1428 /* No memory left! */
1429 SHADOW2_ERROR("xmalloc() failed when allocating hash buckets.\n");
1430 domain_crash_synchronous();
1432 memset(extra, 0, sz);
1434 /* Record the allocation block so it can be correctly freed later. */
1435 *((struct shadow2_hash_entry **)&extra[SHADOW2_HASH_BUCKETS]) =
1436 d->arch.shadow2_hash_allocations;
1437 d->arch.shadow2_hash_allocations = &extra[0];
1439 /* Thread a free chain through the newly-allocated nodes. */
1440 for ( i = 0; i < (SHADOW2_HASH_BUCKETS - 1); i++ )
1441 extra[i].next = &extra[i+1];
1442 extra[i].next = NULL;
1444 /* Add the new nodes to the free list. */
1445 d->arch.shadow2_hash_freelist = &extra[0];
1448 /* Allocate a new node from the free list. */
1449 x = d->arch.shadow2_hash_freelist;
1450 d->arch.shadow2_hash_freelist = x->next;
1451 return x;
1454 static void sh2_free_hash_entry(struct domain *d, struct shadow2_hash_entry *e)
1455 {
1456 /* Mark the bucket as empty and return it to the free list */
1457 e->t = 0;
1458 e->next = d->arch.shadow2_hash_freelist;
1459 d->arch.shadow2_hash_freelist = e;
1460 }
1463 /* Allocate and initialise the table itself.
1464 * Returns 0 for success, 1 for error. */
1465 static int shadow2_hash_alloc(struct domain *d)
1466 {
1467 struct shadow2_hash_entry *table;
1469 ASSERT(shadow2_lock_is_acquired(d));
1470 ASSERT(!d->arch.shadow2_hash_table);
1472 table = xmalloc_array(struct shadow2_hash_entry, SHADOW2_HASH_BUCKETS);
1473 if ( !table ) return 1;
1474 memset(table, 0,
1475 SHADOW2_HASH_BUCKETS * sizeof (struct shadow2_hash_entry));
1476 d->arch.shadow2_hash_table = table;
1477 return 0;
1478 }
1480 /* Tear down the hash table and return all memory to Xen.
1481 * This function does not care whether the table is populated. */
1482 static void shadow2_hash_teardown(struct domain *d)
1484 struct shadow2_hash_entry *a, *n;
1486 ASSERT(shadow2_lock_is_acquired(d));
1487 ASSERT(d->arch.shadow2_hash_table);
1489 /* Return the table itself */
1490 xfree(d->arch.shadow2_hash_table);
1491 d->arch.shadow2_hash_table = NULL;
1493 /* Return any extra allocations */
1494 a = d->arch.shadow2_hash_allocations;
1495 while ( a )
1497 /* We stored a linked-list pointer at the end of each allocation */
1498 n = *((struct shadow2_hash_entry **)(&a[SHADOW2_HASH_BUCKETS]));
1499 xfree(a);
1500 a = n;
1502 d->arch.shadow2_hash_allocations = NULL;
1503 d->arch.shadow2_hash_freelist = NULL;
1507 mfn_t shadow2_hash_lookup(struct vcpu *v, unsigned long n, u8 t)
1508 /* Find an entry in the hash table. Returns the MFN of the shadow,
1509 * or INVALID_MFN if it doesn't exist */
1511 struct domain *d = v->domain;
1512 struct shadow2_hash_entry *p, *x, *head;
1513 key_t key;
1515 ASSERT(shadow2_lock_is_acquired(d));
1516 ASSERT(d->arch.shadow2_hash_table);
1517 ASSERT(t);
1519 sh2_hash_audit(d);
1521 perfc_incrc(shadow2_hash_lookups);
1522 key = sh2_hash(n, t);
1524 x = head = &d->arch.shadow2_hash_table[key % SHADOW2_HASH_BUCKETS];
1525 p = NULL;
1527 sh2_hash_audit_bucket(d, key % SHADOW2_HASH_BUCKETS);
1529 do
1531 ASSERT(x->t || ((x == head) && (x->next == NULL)));
1533 if ( x->n == n && x->t == t )
1535 /* Pull-to-front if 'x' isn't already the head item */
1536 if ( unlikely(x != head) )
1538 if ( unlikely(d->arch.shadow2_hash_walking != 0) )
1539 /* Can't reorder: someone is walking the hash chains */
1540 return x->smfn;
1541 else
1543 /* Delete 'x' from list and reinsert after head. */
1544 p->next = x->next;
1545 x->next = head->next;
1546 head->next = x;
1548 /* Swap 'x' contents with head contents. */
1549 SWAP(head->n, x->n);
1550 SWAP(head->t, x->t);
1551 SWAP(head->smfn, x->smfn);
1554 else
1556 perfc_incrc(shadow2_hash_lookup_head);
1558 return head->smfn;
1561 p = x;
1562 x = x->next;
1564 while ( x != NULL );
1566 perfc_incrc(shadow2_hash_lookup_miss);
1567 return _mfn(INVALID_MFN);
1570 void shadow2_hash_insert(struct vcpu *v, unsigned long n, u8 t, mfn_t smfn)
1571 /* Put a mapping (n,t)->smfn into the hash table */
1573 struct domain *d = v->domain;
1574 struct shadow2_hash_entry *x, *head;
1575 key_t key;
1577 ASSERT(shadow2_lock_is_acquired(d));
1578 ASSERT(d->arch.shadow2_hash_table);
1579 ASSERT(t);
1581 sh2_hash_audit(d);
1583 perfc_incrc(shadow2_hash_inserts);
1584 key = sh2_hash(n, t);
1586 head = &d->arch.shadow2_hash_table[key % SHADOW2_HASH_BUCKETS];
1588 sh2_hash_audit_bucket(d, key % SHADOW2_HASH_BUCKETS);
1590 /* If the bucket is empty then insert the new page as the head item. */
1591 if ( head->t == 0 )
1593 head->n = n;
1594 head->t = t;
1595 head->smfn = smfn;
1596 ASSERT(head->next == NULL);
1598 else
1600 /* Insert a new entry directly after the head item. */
1601 x = sh2_alloc_hash_entry(d);
1602 x->n = n;
1603 x->t = t;
1604 x->smfn = smfn;
1605 x->next = head->next;
1606 head->next = x;
1609 sh2_hash_audit_bucket(d, key % SHADOW2_HASH_BUCKETS);
1612 void shadow2_hash_delete(struct vcpu *v, unsigned long n, u8 t, mfn_t smfn)
1613 /* Excise the mapping (n,t)->smfn from the hash table */
1615 struct domain *d = v->domain;
1616 struct shadow2_hash_entry *p, *x, *head;
1617 key_t key;
1619 ASSERT(shadow2_lock_is_acquired(d));
1620 ASSERT(d->arch.shadow2_hash_table);
1621 ASSERT(t);
1623 sh2_hash_audit(d);
1625 perfc_incrc(shadow2_hash_deletes);
1626 key = sh2_hash(n, t);
1628 head = &d->arch.shadow2_hash_table[key % SHADOW2_HASH_BUCKETS];
1630 sh2_hash_audit_bucket(d, key % SHADOW2_HASH_BUCKETS);
1632 /* Match on head item? */
1633 if ( head->n == n && head->t == t )
1635 if ( (x = head->next) != NULL )
1637 /* Overwrite head with contents of following node. */
1638 head->n = x->n;
1639 head->t = x->t;
1640 head->smfn = x->smfn;
1642 /* Delete following node. */
1643 head->next = x->next;
1644 sh2_free_hash_entry(d, x);
1646 else
1648 /* This bucket is now empty. Initialise the head node. */
1649 head->t = 0;
1652 else
1654 /* Not at the head; need to walk the chain */
1655 p = head;
1656 x = head->next;
1658 while(1)
1660 ASSERT(x); /* We can't have hit the end, since our target is
1661 * still in the chain somewhere... */
1662 if ( x->n == n && x->t == t )
1664 /* Delete matching node. */
1665 p->next = x->next;
1666 sh2_free_hash_entry(d, x);
1667 break;
1669 p = x;
1670 x = x->next;
1674 sh2_hash_audit_bucket(d, key % SHADOW2_HASH_BUCKETS);
1677 typedef int (*hash_callback_t)(struct vcpu *v, mfn_t smfn, mfn_t other_mfn);
1679 static void hash_foreach(struct vcpu *v,
1680 unsigned int callback_mask,
1681 hash_callback_t callbacks[],
1682 mfn_t callback_mfn)
1683 /* Walk the hash table looking at the types of the entries and
1684 * calling the appropriate callback function for each entry.
1685 * The mask determines which shadow types we call back for, and the array
1686 * of callbacks tells us which function to call.
1687 * Any callback may return non-zero to let us skip the rest of the scan.
1689 * WARNING: Callbacks MUST NOT add or remove hash entries unless they
1690 * then return non-zero to terminate the scan. */
1692 int i, done = 0;
1693 struct domain *d = v->domain;
1694 struct shadow2_hash_entry *x;
1696 /* Say we're here, to stop hash-lookups reordering the chains */
1697 ASSERT(shadow2_lock_is_acquired(d));
1698 ASSERT(d->arch.shadow2_hash_walking == 0);
1699 d->arch.shadow2_hash_walking = 1;
1701 callback_mask &= ~1; /* Never attempt to call back on empty buckets */
1702 for ( i = 0; i < SHADOW2_HASH_BUCKETS; i++ )
1704 /* WARNING: This is not safe against changes to the hash table.
1705 * The callback *must* return non-zero if it has inserted or
1706 * deleted anything from the hash (lookups are OK, though). */
1707 for ( x = &d->arch.shadow2_hash_table[i]; x; x = x->next )
1709 if ( callback_mask & (1 << x->t) )
1711 ASSERT(x->t <= 15);
1712 ASSERT(callbacks[x->t] != NULL);
1713 if ( (done = callbacks[x->t](v, x->smfn, callback_mfn)) != 0 )
1714 break;
1717 if ( done ) break;
1719 d->arch.shadow2_hash_walking = 0;
1723 /**************************************************************************/
1724 /* Destroy a shadow page: simple dispatcher to call the per-type destructor
1725 * which will decrement refcounts appropriately and return memory to the
1726 * free pool. */
1728 void sh2_destroy_shadow(struct vcpu *v, mfn_t smfn)
1730 struct page_info *pg = mfn_to_page(smfn);
1731 u32 t = pg->count_info & PGC_SH2_type_mask;
1734 SHADOW2_PRINTK("smfn=%#lx\n", mfn_x(smfn));
1736 /* Double-check, if we can, that the shadowed page belongs to this
1737 * domain, (by following the back-pointer). */
1738 ASSERT(t == PGC_SH2_fl1_32_shadow ||
1739 t == PGC_SH2_fl1_pae_shadow ||
1740 t == PGC_SH2_fl1_64_shadow ||
1741 t == PGC_SH2_monitor_table ||
1742 (page_get_owner(mfn_to_page(_mfn(pg->u.inuse.type_info)))
1743 == v->domain));
1745 /* The down-shifts here are so that the switch statement is on nice
1746 * small numbers that the compiler will enjoy */
1747 switch ( t >> PGC_SH2_type_shift )
1749 #if CONFIG_PAGING_LEVELS == 2
1750 case PGC_SH2_l1_32_shadow >> PGC_SH2_type_shift:
1751 case PGC_SH2_fl1_32_shadow >> PGC_SH2_type_shift:
1752 SHADOW2_INTERNAL_NAME(sh2_destroy_l1_shadow, 2, 2)(v, smfn);
1753 break;
1754 case PGC_SH2_l2_32_shadow >> PGC_SH2_type_shift:
1755 SHADOW2_INTERNAL_NAME(sh2_destroy_l2_shadow, 2, 2)(v, smfn);
1756 break;
1757 #else /* PAE or 64bit */
1758 case PGC_SH2_l1_32_shadow >> PGC_SH2_type_shift:
1759 case PGC_SH2_fl1_32_shadow >> PGC_SH2_type_shift:
1760 SHADOW2_INTERNAL_NAME(sh2_destroy_l1_shadow, 3, 2)(v, smfn);
1761 break;
1762 case PGC_SH2_l2_32_shadow >> PGC_SH2_type_shift:
1763 SHADOW2_INTERNAL_NAME(sh2_destroy_l2_shadow, 3, 2)(v, smfn);
1764 break;
1765 #endif
1767 #if CONFIG_PAGING_LEVELS >= 3
1768 case PGC_SH2_l1_pae_shadow >> PGC_SH2_type_shift:
1769 case PGC_SH2_fl1_pae_shadow >> PGC_SH2_type_shift:
1770 SHADOW2_INTERNAL_NAME(sh2_destroy_l1_shadow, 3, 3)(v, smfn);
1771 break;
1772 case PGC_SH2_l2_pae_shadow >> PGC_SH2_type_shift:
1773 case PGC_SH2_l2h_pae_shadow >> PGC_SH2_type_shift:
1774 SHADOW2_INTERNAL_NAME(sh2_destroy_l2_shadow, 3, 3)(v, smfn);
1775 break;
1776 case PGC_SH2_l3_pae_shadow >> PGC_SH2_type_shift:
1777 SHADOW2_INTERNAL_NAME(sh2_destroy_l3_shadow, 3, 3)(v, smfn);
1778 break;
1779 #endif
1781 #if CONFIG_PAGING_LEVELS >= 4
1782 case PGC_SH2_l1_64_shadow >> PGC_SH2_type_shift:
1783 case PGC_SH2_fl1_64_shadow >> PGC_SH2_type_shift:
1784 SHADOW2_INTERNAL_NAME(sh2_destroy_l1_shadow, 4, 4)(v, smfn);
1785 break;
1786 case PGC_SH2_l2_64_shadow >> PGC_SH2_type_shift:
1787 SHADOW2_INTERNAL_NAME(sh2_destroy_l2_shadow, 4, 4)(v, smfn);
1788 break;
1789 case PGC_SH2_l3_64_shadow >> PGC_SH2_type_shift:
1790 SHADOW2_INTERNAL_NAME(sh2_destroy_l3_shadow, 4, 4)(v, smfn);
1791 break;
1792 case PGC_SH2_l4_64_shadow >> PGC_SH2_type_shift:
1793 SHADOW2_INTERNAL_NAME(sh2_destroy_l4_shadow, 4, 4)(v, smfn);
1794 break;
1795 #endif
1796 default:
1797 SHADOW2_PRINTK("tried to destroy shadow of bad type %08lx\n",
1798 (unsigned long)t);
1799 BUG();
1803 /**************************************************************************/
1804 /* Remove all writeable mappings of a guest frame from the shadow tables
1805 * Returns non-zero if we need to flush TLBs.
1806 * level and fault_addr describe how we found this to be a pagetable;
1807 * level==0 means we have some other reason for revoking write access.*/
1809 int shadow2_remove_write_access(struct vcpu *v, mfn_t gmfn,
1810 unsigned int level,
1811 unsigned long fault_addr)
1813 /* Dispatch table for getting per-type functions */
1814 static hash_callback_t callbacks[16] = {
1815 NULL, /* none */
1816 #if CONFIG_PAGING_LEVELS == 2
1817 SHADOW2_INTERNAL_NAME(sh2_remove_write_access,2,2), /* l1_32 */
1818 SHADOW2_INTERNAL_NAME(sh2_remove_write_access,2,2), /* fl1_32 */
1819 #else
1820 SHADOW2_INTERNAL_NAME(sh2_remove_write_access,3,2), /* l1_32 */
1821 SHADOW2_INTERNAL_NAME(sh2_remove_write_access,3,2), /* fl1_32 */
1822 #endif
1823 NULL, /* l2_32 */
1824 #if CONFIG_PAGING_LEVELS >= 3
1825 SHADOW2_INTERNAL_NAME(sh2_remove_write_access,3,3), /* l1_pae */
1826 SHADOW2_INTERNAL_NAME(sh2_remove_write_access,3,3), /* fl1_pae */
1827 #else
1828 NULL, /* l1_pae */
1829 NULL, /* fl1_pae */
1830 #endif
1831 NULL, /* l2_pae */
1832 NULL, /* l2h_pae */
1833 NULL, /* l3_pae */
1834 #if CONFIG_PAGING_LEVELS >= 4
1835 SHADOW2_INTERNAL_NAME(sh2_remove_write_access,4,4), /* l1_64 */
1836 SHADOW2_INTERNAL_NAME(sh2_remove_write_access,4,4), /* fl1_64 */
1837 #else
1838 NULL, /* l1_64 */
1839 NULL, /* fl1_64 */
1840 #endif
1841 NULL, /* l2_64 */
1842 NULL, /* l3_64 */
1843 NULL, /* l4_64 */
1844 NULL, /* p2m */
1845 NULL /* unused */
1846 };
1848 static unsigned int callback_mask =
1849 1 << (PGC_SH2_l1_32_shadow >> PGC_SH2_type_shift)
1850 | 1 << (PGC_SH2_fl1_32_shadow >> PGC_SH2_type_shift)
1851 | 1 << (PGC_SH2_l1_pae_shadow >> PGC_SH2_type_shift)
1852 | 1 << (PGC_SH2_fl1_pae_shadow >> PGC_SH2_type_shift)
1853 | 1 << (PGC_SH2_l1_64_shadow >> PGC_SH2_type_shift)
1854 | 1 << (PGC_SH2_fl1_64_shadow >> PGC_SH2_type_shift)
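/* Each shadow type t occupies bit (1 << t) of the mask, following the
 * index order of callbacks[] above; only the six l1/fl1 bits are set
 * here, so the hash_foreach() call below skips every other kind of
 * shadow while hunting for writeable mappings. */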
1856 struct page_info *pg = mfn_to_page(gmfn);
1858 ASSERT(shadow2_lock_is_acquired(v->domain));
1860 /* Only remove writable mappings if we are doing shadow refcounts.
1861 * In guest refcounting, we trust Xen to already be restricting
1862 * all the writes to the guest page tables, so we do not need to
1863 * do more. */
1864 if ( !shadow2_mode_refcounts(v->domain) )
1865 return 0;
1867 /* Early exit if it's already a pagetable, or otherwise not writeable */
1868 if ( sh2_mfn_is_a_page_table(gmfn)
1869 || (pg->u.inuse.type_info & PGT_count_mask) == 0 )
1870 return 0;
1872 perfc_incrc(shadow2_writeable);
1874 /* If this isn't a "normal" writeable page, the domain is trying to
1875 * put pagetables in special memory of some kind. We can't allow that. */
1876 if ( (pg->u.inuse.type_info & PGT_type_mask) != PGT_writable_page )
1878 SHADOW2_ERROR("can't remove write access to mfn %lx, type_info is %"
1879 PRtype_info "\n",
1880 mfn_x(gmfn), mfn_to_page(gmfn)->u.inuse.type_info);
1881 domain_crash(v->domain);
1884 #if SHADOW2_OPTIMIZATIONS & SH2OPT_WRITABLE_HEURISTIC
1885 if ( v == current && level != 0 )
1887 unsigned long gfn;
1888 /* Heuristic: there is likely to be only one writeable mapping,
1889 * and that mapping is likely to be in the current pagetable,
1890 * either in the guest's linear map (linux, windows) or in a
1891 * magic slot used to map high memory regions (linux HIGHPTE) */
1893 #define GUESS(_a, _h) do { \
1894 if ( v->arch.shadow2->guess_wrmap(v, (_a), gmfn) ) \
1895 perfc_incrc(shadow2_writeable_h_ ## _h); \
1896 if ( (pg->u.inuse.type_info & PGT_count_mask) == 0 ) \
1897 return 1; \
1898 } while (0)
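/* The shifts in the GUESS() calls below follow from the layout of a
 * linear (self-mapped) pagetable: the entry for virtual address A lives
 * at linear_base + (A >> PAGE_SHIFT) * entry_size.  With 4-byte entries
 * (2-level paging) that is A >> 10; with 8-byte entries (PAE and 64-bit)
 * it is A >> 9.  Repeating the scaling one and two levels up gives the
 * >> 18 and >> 27 offsets used for the l2 and l3 guesses. */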
1901 /* Linux lowmem: first 1GB is mapped 1-to-1 above 0xC0000000 */
1902 if ( v == current
1903 && (gfn = sh2_mfn_to_gfn(v->domain, gmfn)) < 0x40000000 )
1904 GUESS(0xC0000000 + (gfn << PAGE_SHIFT), 4);
1906 if ( v->arch.shadow2->guest_levels == 2 )
1908 if ( level == 1 )
1909 /* 32bit non-PAE w2k3: linear map at 0xC0000000 */
1910 GUESS(0xC0000000UL + (fault_addr >> 10), 1);
1912 #if CONFIG_PAGING_LEVELS >= 3
1913 else if ( v->arch.shadow2->guest_levels == 3 )
1915 /* 32bit PAE w2k3: linear map at 0xC0000000 */
1916 switch ( level )
1918 case 1: GUESS(0xC0000000UL + (fault_addr >> 9), 2); break;
1919 case 2: GUESS(0xC0600000UL + (fault_addr >> 18), 2); break;
1922 #if CONFIG_PAGING_LEVELS >= 4
1923 else if ( v->arch.shadow2->guest_levels == 4 )
1925 /* 64bit w2k3: linear map at 0x0000070000000000 */
1926 switch ( level )
1928 case 1: GUESS(0x70000000000UL + (fault_addr >> 9), 3); break;
1929 case 2: GUESS(0x70380000000UL + (fault_addr >> 18), 3); break;
1930 case 3: GUESS(0x70381C00000UL + (fault_addr >> 27), 3); break;
1933 #endif /* CONFIG_PAGING_LEVELS >= 4 */
1934 #endif /* CONFIG_PAGING_LEVELS >= 3 */
1936 #undef GUESS
1939 #endif
1941 /* Brute-force search of all the shadows, by walking the hash */
1942 perfc_incrc(shadow2_writeable_bf);
1943 hash_foreach(v, callback_mask, callbacks, gmfn);
1945 /* If that didn't catch the mapping, something is very wrong */
1946 if ( (mfn_to_page(gmfn)->u.inuse.type_info & PGT_count_mask) != 0 )
1948 SHADOW2_ERROR("can't find all writeable mappings of mfn %lx: "
1949 "%lu left\n", mfn_x(gmfn),
1950 (mfn_to_page(gmfn)->u.inuse.type_info&PGT_count_mask));
1951 domain_crash(v->domain);
1954 /* We killed at least one writeable mapping, so must flush TLBs. */
1955 return 1;
1960 /**************************************************************************/
1961 /* Remove all mappings of a guest frame from the shadow tables.
1962 * Returns non-zero if we need to flush TLBs. */
1964 int shadow2_remove_all_mappings(struct vcpu *v, mfn_t gmfn)
1966 struct page_info *page = mfn_to_page(gmfn);
1967 int expected_count;
1969 /* Dispatch table for getting per-type functions */
1970 static hash_callback_t callbacks[16] = {
1971 NULL, /* none */
1972 #if CONFIG_PAGING_LEVELS == 2
1973 SHADOW2_INTERNAL_NAME(sh2_remove_all_mappings,2,2), /* l1_32 */
1974 SHADOW2_INTERNAL_NAME(sh2_remove_all_mappings,2,2), /* fl1_32 */
1975 #else
1976 SHADOW2_INTERNAL_NAME(sh2_remove_all_mappings,3,2), /* l1_32 */
1977 SHADOW2_INTERNAL_NAME(sh2_remove_all_mappings,3,2), /* fl1_32 */
1978 #endif
1979 NULL, /* l2_32 */
1980 #if CONFIG_PAGING_LEVELS >= 3
1981 SHADOW2_INTERNAL_NAME(sh2_remove_all_mappings,3,3), /* l1_pae */
1982 SHADOW2_INTERNAL_NAME(sh2_remove_all_mappings,3,3), /* fl1_pae */
1983 #else
1984 NULL, /* l1_pae */
1985 NULL, /* fl1_pae */
1986 #endif
1987 NULL, /* l2_pae */
1988 NULL, /* l2h_pae */
1989 NULL, /* l3_pae */
1990 #if CONFIG_PAGING_LEVELS >= 4
1991 SHADOW2_INTERNAL_NAME(sh2_remove_all_mappings,4,4), /* l1_64 */
1992 SHADOW2_INTERNAL_NAME(sh2_remove_all_mappings,4,4), /* fl1_64 */
1993 #else
1994 NULL, /* l1_64 */
1995 NULL, /* fl1_64 */
1996 #endif
1997 NULL, /* l2_64 */
1998 NULL, /* l3_64 */
1999 NULL, /* l4_64 */
2000 NULL, /* p2m */
2001 NULL /* unused */
2002 };
2004 static unsigned int callback_mask =
2005 1 << (PGC_SH2_l1_32_shadow >> PGC_SH2_type_shift)
2006 | 1 << (PGC_SH2_fl1_32_shadow >> PGC_SH2_type_shift)
2007 | 1 << (PGC_SH2_l1_pae_shadow >> PGC_SH2_type_shift)
2008 | 1 << (PGC_SH2_fl1_pae_shadow >> PGC_SH2_type_shift)
2009 | 1 << (PGC_SH2_l1_64_shadow >> PGC_SH2_type_shift)
2010 | 1 << (PGC_SH2_fl1_64_shadow >> PGC_SH2_type_shift)
2013 perfc_incrc(shadow2_mappings);
2014 if ( (page->count_info & PGC_count_mask) == 0 )
2015 return 0;
2017 ASSERT(shadow2_lock_is_acquired(v->domain));
2019 /* XXX TODO:
2020 * Heuristics for finding the (probably) single mapping of this gmfn */
2022 /* Brute-force search of all the shadows, by walking the hash */
2023 perfc_incrc(shadow2_mappings_bf);
2024 hash_foreach(v, callback_mask, callbacks, gmfn);
2026 /* If that didn't catch the mapping, something is very wrong */
2027 expected_count = (page->count_info & PGC_allocated) ? 1 : 0;
2028 if ( (page->count_info & PGC_count_mask) != expected_count )
2030 /* Don't complain if we're in HVM and there's one extra mapping:
2031 * The qemu helper process has an untyped mapping of this dom's RAM */
2032 if ( !(shadow2_mode_external(v->domain)
2033 && (page->count_info & PGC_count_mask) <= 2
2034 && (page->u.inuse.type_info & PGT_count_mask) == 0) )
2036 SHADOW2_ERROR("can't find all mappings of mfn %lx: "
2037 "c=%08x t=%08lx\n", mfn_x(gmfn),
2038 page->count_info, page->u.inuse.type_info);
2042 /* We killed at least one mapping, so must flush TLBs. */
2043 return 1;
2047 /**************************************************************************/
2048 /* Remove all shadows of a guest frame from the shadow tables */
2050 static int sh2_remove_shadow_via_pointer(struct vcpu *v, mfn_t smfn)
2051 /* Follow this shadow's up-pointer, if it has one, and remove the reference
2052 * found there. Returns 1 if that was the only reference to this shadow */
2054 struct page_info *pg = mfn_to_page(smfn);
2055 mfn_t pmfn;
2056 void *vaddr;
2057 int rc;
2059 ASSERT((pg->count_info & PGC_SH2_type_mask) > 0);
2060 ASSERT((pg->count_info & PGC_SH2_type_mask) < PGC_SH2_max_shadow);
2061 ASSERT((pg->count_info & PGC_SH2_type_mask) != PGC_SH2_l2_32_shadow);
2062 ASSERT((pg->count_info & PGC_SH2_type_mask) != PGC_SH2_l3_pae_shadow);
2063 ASSERT((pg->count_info & PGC_SH2_type_mask) != PGC_SH2_l4_64_shadow);
2065 if (pg->up == 0) return 0;
2066 pmfn = _mfn(pg->up >> PAGE_SHIFT);
2067 ASSERT(valid_mfn(pmfn));
2068 vaddr = sh2_map_domain_page(pmfn);
2069 ASSERT(vaddr);
2070 vaddr += pg->up & (PAGE_SIZE-1);
2071 ASSERT(l1e_get_pfn(*(l1_pgentry_t *)vaddr) == mfn_x(smfn));
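/* pg->up packs both halves of the back-pointer: the high bits
 * (pg->up >> PAGE_SHIFT) are the mfn of the parent shadow, and the low
 * bits (pg->up & (PAGE_SIZE-1)) are the byte offset of the referencing
 * entry within that page -- which is why the decode above yields a
 * mappable frame plus an offset into it. */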
2073 /* Is this the only reference to this shadow? */
2074 rc = ((pg->count_info & PGC_SH2_count_mask) == 1) ? 1 : 0;
2076 /* Blank the offending entry */
2077 switch ((pg->count_info & PGC_SH2_type_mask))
2079 case PGC_SH2_l1_32_shadow:
2080 case PGC_SH2_l2_32_shadow:
2081 #if CONFIG_PAGING_LEVELS == 2
2082 SHADOW2_INTERNAL_NAME(sh2_clear_shadow_entry,2,2)(v, vaddr, pmfn);
2083 #else
2084 SHADOW2_INTERNAL_NAME(sh2_clear_shadow_entry,3,2)(v, vaddr, pmfn);
2085 #endif
2086 break;
2087 #if CONFIG_PAGING_LEVELS >=3
2088 case PGC_SH2_l1_pae_shadow:
2089 case PGC_SH2_l2_pae_shadow:
2090 case PGC_SH2_l2h_pae_shadow:
2091 case PGC_SH2_l3_pae_shadow:
2092 SHADOW2_INTERNAL_NAME(sh2_clear_shadow_entry,3,3)(v, vaddr, pmfn);
2093 break;
2094 #if CONFIG_PAGING_LEVELS >= 4
2095 case PGC_SH2_l1_64_shadow:
2096 case PGC_SH2_l2_64_shadow:
2097 case PGC_SH2_l3_64_shadow:
2098 case PGC_SH2_l4_64_shadow:
2099 SHADOW2_INTERNAL_NAME(sh2_clear_shadow_entry,4,4)(v, vaddr, pmfn);
2100 break;
2101 #endif
2102 #endif
2103 default: BUG(); /* Some weird unknown shadow type */
2106 sh2_unmap_domain_page(vaddr);
2107 if ( rc )
2108 perfc_incrc(shadow2_up_pointer);
2109 else
2110 perfc_incrc(shadow2_unshadow_bf);
2112 return rc;
2115 void sh2_remove_shadows(struct vcpu *v, mfn_t gmfn, int all)
2116 /* Remove the shadows of this guest page.
2117 * If all != 0, find all shadows, if necessary by walking the tables.
2118 * Otherwise, just try the (much faster) heuristics, which will remove
2119 * at most one reference to each shadow of the page. */
2121 struct page_info *pg;
2122 mfn_t smfn;
2123 u32 sh_flags;
2124 unsigned char t;
2126 /* Dispatch table for getting per-type functions: each level must
2127 * be called with the function to remove a lower-level shadow. */
2128 static hash_callback_t callbacks[16] = {
2129 NULL, /* none */
2130 NULL, /* l1_32 */
2131 NULL, /* fl1_32 */
2132 #if CONFIG_PAGING_LEVELS == 2
2133 SHADOW2_INTERNAL_NAME(sh2_remove_l1_shadow,2,2), /* l2_32 */
2134 #else
2135 SHADOW2_INTERNAL_NAME(sh2_remove_l1_shadow,3,2), /* l2_32 */
2136 #endif
2137 NULL, /* l1_pae */
2138 NULL, /* fl1_pae */
2139 #if CONFIG_PAGING_LEVELS >= 3
2140 SHADOW2_INTERNAL_NAME(sh2_remove_l1_shadow,3,3), /* l2_pae */
2141 SHADOW2_INTERNAL_NAME(sh2_remove_l1_shadow,3,3), /* l2h_pae */
2142 SHADOW2_INTERNAL_NAME(sh2_remove_l2_shadow,3,3), /* l3_pae */
2143 #else
2144 NULL, /* l2_pae */
2145 NULL, /* l2h_pae */
2146 NULL, /* l3_pae */
2147 #endif
2148 NULL, /* l1_64 */
2149 NULL, /* fl1_64 */
2150 #if CONFIG_PAGING_LEVELS >= 4
2151 SHADOW2_INTERNAL_NAME(sh2_remove_l1_shadow,4,4), /* l2_64 */
2152 SHADOW2_INTERNAL_NAME(sh2_remove_l2_shadow,4,4), /* l3_64 */
2153 SHADOW2_INTERNAL_NAME(sh2_remove_l3_shadow,4,4), /* l4_64 */
2154 #else
2155 NULL, /* l2_64 */
2156 NULL, /* l3_64 */
2157 NULL, /* l4_64 */
2158 #endif
2159 NULL, /* p2m */
2160 NULL /* unused */
2161 };
2163 /* Another lookup table, for choosing which mask to use */
2164 static unsigned int masks[16] = {
2165 0, /* none */
2166 1 << (PGC_SH2_l2_32_shadow >> PGC_SH2_type_shift), /* l1_32 */
2167 0, /* fl1_32 */
2168 0, /* l2_32 */
2169 ((1 << (PGC_SH2_l2h_pae_shadow >> PGC_SH2_type_shift))
2170 | (1 << (PGC_SH2_l2_pae_shadow >> PGC_SH2_type_shift))), /* l1_pae */
2171 0, /* fl1_pae */
2172 1 << (PGC_SH2_l3_pae_shadow >> PGC_SH2_type_shift), /* l2_pae */
2173 1 << (PGC_SH2_l3_pae_shadow >> PGC_SH2_type_shift), /* l2h_pae */
2174 0, /* l3_pae */
2175 1 << (PGC_SH2_l2_64_shadow >> PGC_SH2_type_shift), /* l1_64 */
2176 0, /* fl1_64 */
2177 1 << (PGC_SH2_l3_64_shadow >> PGC_SH2_type_shift), /* l2_64 */
2178 1 << (PGC_SH2_l4_64_shadow >> PGC_SH2_type_shift), /* l3_64 */
2179 0, /* l4_64 */
2180 0, /* p2m */
2181 0 /* unused */
2182 };
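/* Example of how the two tables combine: masks[] for l1_pae sets the
 * l2_pae and l2h_pae bits because those are the only shadow types that
 * can hold a reference to a PAE l1 shadow, so the DO_UNSHADOW() walk
 * below only visits shadows that could contain the offending entry. */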
2184 ASSERT(shadow2_lock_is_acquired(v->domain));
2186 pg = mfn_to_page(gmfn);
2188 /* Bale out now if the page is not shadowed */
2189 if ( (pg->count_info & PGC_page_table) == 0 )
2190 return;
2192 SHADOW2_PRINTK("d=%d, v=%d, gmfn=%05lx\n",
2193 v->domain->domain_id, v->vcpu_id, mfn_x(gmfn));
2195 /* Search for this shadow in all appropriate shadows */
2196 perfc_incrc(shadow2_unshadow);
2197 sh_flags = pg->shadow2_flags;
2199 /* Lower-level shadows need to be excised from upper-level shadows.
2200 * This call to hash_foreach() looks dangerous but is in fact OK: each
2201 * call will remove at most one shadow, and terminate immediately when
2202 * it does remove it, so we never walk the hash after doing a deletion. */
2203 #define DO_UNSHADOW(_type) do { \
2204 t = (_type) >> PGC_SH2_type_shift; \
2205 smfn = shadow2_hash_lookup(v, mfn_x(gmfn), t); \
2206 if ( !sh2_remove_shadow_via_pointer(v, smfn) && all ) \
2207 hash_foreach(v, masks[t], callbacks, smfn); \
2208 } while (0)
2210 /* Top-level shadows need to be unpinned */
2211 #define DO_UNPIN(_type) do { \
2212 t = (_type) >> PGC_SH2_type_shift; \
2213 smfn = shadow2_hash_lookup(v, mfn_x(gmfn), t); \
2214 if ( mfn_to_page(smfn)->count_info & PGC_SH2_pinned ) \
2215 sh2_unpin(v, smfn); \
2216 if ( (_type) == PGC_SH2_l3_pae_shadow ) \
2217 SHADOW2_INTERNAL_NAME(sh2_unpin_all_l3_subshadows,3,3)(v, smfn); \
2218 } while (0)
2220 if ( sh_flags & SH2F_L1_32 ) DO_UNSHADOW(PGC_SH2_l1_32_shadow);
2221 if ( sh_flags & SH2F_L2_32 ) DO_UNPIN(PGC_SH2_l2_32_shadow);
2222 #if CONFIG_PAGING_LEVELS >= 3
2223 if ( sh_flags & SH2F_L1_PAE ) DO_UNSHADOW(PGC_SH2_l1_pae_shadow);
2224 if ( sh_flags & SH2F_L2_PAE ) DO_UNSHADOW(PGC_SH2_l2_pae_shadow);
2225 if ( sh_flags & SH2F_L2H_PAE ) DO_UNSHADOW(PGC_SH2_l2h_pae_shadow);
2226 if ( sh_flags & SH2F_L3_PAE ) DO_UNPIN(PGC_SH2_l3_pae_shadow);
2227 #if CONFIG_PAGING_LEVELS >= 4
2228 if ( sh_flags & SH2F_L1_64 ) DO_UNSHADOW(PGC_SH2_l1_64_shadow);
2229 if ( sh_flags & SH2F_L2_64 ) DO_UNSHADOW(PGC_SH2_l2_64_shadow);
2230 if ( sh_flags & SH2F_L3_64 ) DO_UNSHADOW(PGC_SH2_l3_64_shadow);
2231 if ( sh_flags & SH2F_L4_64 ) DO_UNPIN(PGC_SH2_l4_64_shadow);
2232 #endif
2233 #endif
2235 #undef DO_UNSHADOW
2236 #undef DO_UNPIN
2239 #if CONFIG_PAGING_LEVELS > 2
2240 /* We may have caused some PAE l3 entries to change: need to
2241 * fix up the copies of them in various places */
2242 if ( sh_flags & (SH2F_L2_PAE|SH2F_L2H_PAE) )
2243 sh2_pae_recopy(v->domain);
2244 #endif
2246 /* If that didn't catch the shadows, something is wrong */
2247 if ( all && (pg->count_info & PGC_page_table) )
2249 SHADOW2_ERROR("can't find all shadows of mfn %05lx (shadow2_flags=%08x)\n",
2250 mfn_x(gmfn), pg->shadow2_flags);
2251 domain_crash(v->domain);
2255 void
2256 shadow2_remove_all_shadows_and_parents(struct vcpu *v, mfn_t gmfn)
2257 /* Even harsher: this is an HVM page that we think is no longer a pagetable.
2258 * Unshadow it, and recursively unshadow pages that reference it. */
2260 shadow2_remove_all_shadows(v, gmfn);
2261 /* XXX TODO:
2262 * Rework this hashtable walker to return a linked-list of all
2263 * the shadows it modified, then do breadth-first recursion
2264 * to find the way up to higher-level tables and unshadow them too.
2266 * The current code (just tearing down each page's shadows as we
2267 * detect that it is not a pagetable) is correct, but very slow.
2268 * It means extra emulated writes and slows down removal of mappings. */
2271 /**************************************************************************/
2273 void sh2_update_paging_modes(struct vcpu *v)
2275 struct domain *d = v->domain;
2276 struct shadow2_entry_points *old_entries = v->arch.shadow2;
2277 mfn_t old_guest_table;
2279 ASSERT(shadow2_lock_is_acquired(d));
2281 // Valid transitions handled by this function:
2282 // - For PV guests:
2283 // - after a shadow mode has been changed
2284 // - For HVM guests:
2285 // - after a shadow mode has been changed
2286 // - changes in CR0.PG, CR4.PAE, CR4.PSE, or CR4.PGE
2287 //
2289 // Avoid determining the current shadow2 mode for uninitialized CPUs, as
2290 // we can not yet determine whether it is an HVM or PV domain.
2291 //
2292 if ( !test_bit(_VCPUF_initialised, &v->vcpu_flags) )
2294 printk("%s: postponing determination of shadow2 mode\n", __func__);
2295 return;
2298 // First, tear down any old shadow tables held by this vcpu.
2299 //
2300 if ( v->arch.shadow2 )
2301 shadow2_detach_old_tables(v);
2303 if ( !hvm_guest(v) )
2305 ///
2306 /// PV guest
2307 ///
2308 #if CONFIG_PAGING_LEVELS == 4
2309 if ( pv_32bit_guest(v) )
2310 v->arch.shadow2 = &SHADOW2_INTERNAL_NAME(shadow2_entry, 4, 3);
2311 else
2312 v->arch.shadow2 = &SHADOW2_INTERNAL_NAME(shadow2_entry, 4, 4);
2313 #elif CONFIG_PAGING_LEVELS == 3
2314 v->arch.shadow2 = &SHADOW2_INTERNAL_NAME(shadow2_entry, 3, 3);
2315 #elif CONFIG_PAGING_LEVELS == 2
2316 v->arch.shadow2 = &SHADOW2_INTERNAL_NAME(shadow2_entry, 2, 2);
2317 #else
2318 #error unexpected paging mode
2319 #endif
2321 else
2323 ///
2324 /// HVM guest
2325 ///
2326 ASSERT(shadow2_mode_translate(d));
2327 ASSERT(shadow2_mode_external(d));
2329 if ( !hvm_paging_enabled(v) )
2331 // paging disabled...
2332 clear_bit(_VCPUF_shadow2_translate, &v->vcpu_flags);
2334 /* Set v->arch.guest_table to use the p2m map, and choose
2335 * the appropriate shadow mode */
2336 old_guest_table = pagetable_get_mfn(v->arch.guest_table);
2337 #if CONFIG_PAGING_LEVELS == 2
2338 v->arch.guest_table =
2339 pagetable_from_pfn(pagetable_get_pfn(d->arch.phys_table));
2340 v->arch.shadow2 = &SHADOW2_INTERNAL_NAME(shadow2_entry,2,2);
2341 #elif CONFIG_PAGING_LEVELS == 3
2342 v->arch.guest_table =
2343 pagetable_from_pfn(pagetable_get_pfn(d->arch.phys_table));
2344 v->arch.shadow2 = &SHADOW2_INTERNAL_NAME(shadow2_entry,3,3);
2345 #else /* CONFIG_PAGING_LEVELS == 4 */
2347 l4_pgentry_t *l4e;
2348 /* Use the start of the first l3 table as a PAE l3 */
2349 ASSERT(pagetable_get_pfn(d->arch.phys_table) != 0);
2350 l4e = sh2_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
2351 ASSERT(l4e_get_flags(l4e[0]) & _PAGE_PRESENT);
2352 v->arch.guest_table =
2353 pagetable_from_pfn(l4e_get_pfn(l4e[0]));
2354 sh2_unmap_domain_page(l4e);
2356 v->arch.shadow2 = &SHADOW2_INTERNAL_NAME(shadow2_entry,3,3);
2357 #endif
2358 /* Fix up refcounts on guest_table */
2359 get_page(mfn_to_page(pagetable_get_mfn(v->arch.guest_table)), d);
2360 if ( mfn_x(old_guest_table) != 0 )
2361 put_page(mfn_to_page(old_guest_table));
2363 else
2365 set_bit(_VCPUF_shadow2_translate, &v->vcpu_flags);
2367 #ifdef __x86_64__
2368 if ( hvm_long_mode_enabled(v) )
2370 // long mode guest...
2371 v->arch.shadow2 = &SHADOW2_INTERNAL_NAME(shadow2_entry, 4, 4);
2373 else
2374 #endif
2375 if ( hvm_get_guest_ctrl_reg(v, 4) & X86_CR4_PAE )
2377 #if CONFIG_PAGING_LEVELS >= 3
2378 // 32-bit PAE mode guest...
2379 v->arch.shadow2 = &SHADOW2_INTERNAL_NAME(shadow2_entry, 3, 3);
2380 #else
2381 SHADOW2_ERROR("PAE not supported in 32-bit Xen\n");
2382 domain_crash(d);
2383 return;
2384 #endif
2386 else
2388 // 32-bit 2 level guest...
2389 #if CONFIG_PAGING_LEVELS >= 3
2390 v->arch.shadow2 = &SHADOW2_INTERNAL_NAME(shadow2_entry, 3, 2);
2391 #else
2392 v->arch.shadow2 = &SHADOW2_INTERNAL_NAME(shadow2_entry, 2, 2);
2393 #endif
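/* Summary of the (guest levels, shadow levels) pairs chosen above for an
 * HVM vcpu with paging enabled:
 *   long mode              -> shadow2_entry(4,4)
 *   32-bit PAE (CR4.PAE)   -> shadow2_entry(3,3)
 *   32-bit 2-level         -> shadow2_entry(3,2) on PAE/64-bit Xen,
 *                             shadow2_entry(2,2) on 2-level Xen
 * With paging disabled the vcpu runs on the p2m map instead (see the
 * !hvm_paging_enabled() branch earlier in this function). */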
2397 if ( pagetable_get_pfn(v->arch.monitor_table) == 0 )
2399 mfn_t mmfn = shadow2_make_monitor_table(v);
2400 v->arch.monitor_table = pagetable_from_mfn(mmfn);
2401 v->arch.monitor_vtable = sh2_map_domain_page(mmfn);
2404 if ( v->arch.shadow2 != old_entries )
2406 SHADOW2_PRINTK("new paging mode: d=%u v=%u g=%u s=%u "
2407 "(was g=%u s=%u)\n",
2408 d->domain_id, v->vcpu_id,
2409 v->arch.shadow2->guest_levels,
2410 v->arch.shadow2->shadow_levels,
2411 old_entries ? old_entries->guest_levels : 0,
2412 old_entries ? old_entries->shadow_levels : 0);
2413 if ( old_entries &&
2414 (v->arch.shadow2->shadow_levels !=
2415 old_entries->shadow_levels) )
2417 /* Need to make a new monitor table for the new mode */
2418 mfn_t new_mfn, old_mfn;
2420 if ( v != current )
2422 SHADOW2_ERROR("Some third party (d=%u v=%u) is changing "
2423 "this HVM vcpu's (d=%u v=%u) paging mode!\n",
2424 current->domain->domain_id, current->vcpu_id,
2425 v->domain->domain_id, v->vcpu_id);
2426 domain_crash(v->domain);
2427 return;
2430 sh2_unmap_domain_page(v->arch.monitor_vtable);
2431 old_mfn = pagetable_get_mfn(v->arch.monitor_table);
2432 v->arch.monitor_table = pagetable_null();
2433 new_mfn = v->arch.shadow2->make_monitor_table(v);
2434 v->arch.monitor_table = pagetable_from_mfn(new_mfn);
2435 v->arch.monitor_vtable = sh2_map_domain_page(new_mfn);
2436 SHADOW2_PRINTK("new monitor table %"SH2_PRI_mfn "\n",
2437 mfn_x(new_mfn));
2439 /* Don't be running on the old monitor table when we
2440 * pull it down! Switch CR3, and warn the HVM code that
2441 * its host cr3 has changed. */
2442 make_cr3(v, mfn_x(new_mfn));
2443 write_ptbase(v);
2444 hvm_update_host_cr3(v);
2445 old_entries->destroy_monitor_table(v, old_mfn);
2449 // XXX -- Need to deal with changes in CR4.PSE and CR4.PGE.
2450 // These are HARD: think about the case where two CPU's have
2451 // different values for CR4.PSE and CR4.PGE at the same time.
2452 // This *does* happen, at least for CR4.PGE...
2455 v->arch.shadow2->update_cr3(v);
2458 /**************************************************************************/
2459 /* Turning on and off shadow2 features */
2461 static void sh2_new_mode(struct domain *d, u32 new_mode)
2462 /* Inform all the vcpus that the shadow mode has been changed */
2464 struct vcpu *v;
2466 ASSERT(shadow2_lock_is_acquired(d));
2467 ASSERT(d != current->domain);
2468 d->arch.shadow2_mode = new_mode;
2469 if ( new_mode & SHM2_translate )
2470 shadow2_audit_p2m(d);
2471 for_each_vcpu(d, v)
2472 sh2_update_paging_modes(v);
2475 static int shadow2_enable(struct domain *d, u32 mode)
2476 /* Turn on "permanent" shadow features: external, translate, refcount.
2477 * Can only be called once on a domain, and these features cannot be
2478 * disabled.
2479 * Returns 0 for success, -errno for failure. */
2481 unsigned int old_pages;
2482 int rv = 0;
2484 domain_pause(d);
2485 shadow2_lock(d);
2487 /* Sanity check the arguments */
2488 if ( d == current->domain
2489 || shadow2_mode_enabled(d)
2490 || !(mode & SHM2_enable)
2491 || ((mode & SHM2_external) && !(mode & SHM2_translate)) )
2493 rv = -EINVAL;
2494 goto out;
2497 // XXX -- eventually would like to require that all memory be allocated
2498 // *after* shadow2_enable() is called... So here, we would test to make
2499 // sure that d->page_list is empty.
2500 #if 0
2501 spin_lock(&d->page_alloc_lock);
2502 if ( !list_empty(&d->page_list) )
2504 spin_unlock(&d->page_alloc_lock);
2505 rv = -EINVAL;
2506 goto out;
2508 spin_unlock(&d->page_alloc_lock);
2509 #endif
2511 /* Init the shadow memory allocation if the user hasn't done so */
2512 old_pages = d->arch.shadow2_total_pages;
2513 if ( old_pages == 0 )
2514 if ( set_sh2_allocation(d, 256, NULL) != 0 ) /* Use at least 1MB */
2516 set_sh2_allocation(d, 0, NULL);
2517 rv = -ENOMEM;
2518 goto out;
2521 /* Init the hash table */
2522 if ( shadow2_hash_alloc(d) != 0 )
2524 set_sh2_allocation(d, old_pages, NULL);
2525 rv = -ENOMEM;
2526 goto out;
2529 /* Init the P2M table */
2530 if ( mode & SHM2_translate )
2531 if ( !shadow2_alloc_p2m_table(d) )
2533 shadow2_hash_teardown(d);
2534 set_sh2_allocation(d, old_pages, NULL);
2535 shadow2_p2m_teardown(d);
2536 rv = -ENOMEM;
2537 goto out;
2540 /* Update the bits */
2541 sh2_new_mode(d, mode);
2542 shadow2_audit_p2m(d);
2543 out:
2544 shadow2_unlock(d);
2545 domain_unpause(d);
2546 return rv;
2549 void shadow2_teardown(struct domain *d)
2550 /* Destroy the shadow pagetables of this domain and free its shadow memory.
2551 * Should only be called for dying domains. */
2553 struct vcpu *v;
2554 mfn_t mfn;
2556 ASSERT(test_bit(_DOMF_dying, &d->domain_flags));
2557 ASSERT(d != current->domain);
2559 if ( !shadow2_lock_is_acquired(d) )
2560 shadow2_lock(d); /* Keep various asserts happy */
2562 if ( shadow2_mode_enabled(d) )
2564 /* Release the shadow and monitor tables held by each vcpu */
2565 for_each_vcpu(d, v)
2567 if ( v->arch.shadow2 )
2568 shadow2_detach_old_tables(v);
2569 if ( shadow2_mode_external(d) )
2571 mfn = pagetable_get_mfn(v->arch.monitor_table);
2572 if ( valid_mfn(mfn) && (mfn_x(mfn) != 0) )
2573 shadow2_destroy_monitor_table(v, mfn);
2574 v->arch.monitor_table = pagetable_null();
2579 if ( d->arch.shadow2_total_pages != 0 )
2581 SHADOW2_PRINTK("teardown of domain %u starts."
2582 " Shadow pages total = %u, free = %u, p2m=%u\n",
2583 d->domain_id,
2584 d->arch.shadow2_total_pages,
2585 d->arch.shadow2_free_pages,
2586 d->arch.shadow2_p2m_pages);
2587 /* Destroy all the shadows and release memory to domheap */
2588 set_sh2_allocation(d, 0, NULL);
2589 /* Release the hash table back to xenheap */
2590 if (d->arch.shadow2_hash_table)
2591 shadow2_hash_teardown(d);
2592 /* Release the log-dirty bitmap of dirtied pages */
2593 sh2_free_log_dirty_bitmap(d);
2594 /* Should not have any more memory held */
2595 SHADOW2_PRINTK("teardown done."
2596 " Shadow pages total = %u, free = %u, p2m=%u\n",
2597 d->arch.shadow2_total_pages,
2598 d->arch.shadow2_free_pages,
2599 d->arch.shadow2_p2m_pages);
2600 ASSERT(d->arch.shadow2_total_pages == 0);
2603 /* We leave the "permanent" shadow modes enabled, but clear the
2604 * log-dirty mode bit. We don't want any more mark_dirty()
2605 * calls now that we've torn down the bitmap */
2606 d->arch.shadow2_mode &= ~SHM2_log_dirty;
2608 shadow2_unlock(d);
2611 void shadow2_final_teardown(struct domain *d)
2612 /* Called by arch_domain_destroy(), when it's safe to pull down the p2m map. */
2615 SHADOW2_PRINTK("dom %u final teardown starts."
2616 " Shadow pages total = %u, free = %u, p2m=%u\n",
2617 d->domain_id,
2618 d->arch.shadow2_total_pages,
2619 d->arch.shadow2_free_pages,
2620 d->arch.shadow2_p2m_pages);
2622 /* Double-check that the domain didn't have any shadow memory.
2623 * It is possible for a domain that never got domain_kill()ed
2624 * to get here with its shadow allocation intact. */
2625 if ( d->arch.shadow2_total_pages != 0 )
2626 shadow2_teardown(d);
2628 /* It is now safe to pull down the p2m map. */
2629 if ( d->arch.shadow2_p2m_pages != 0 )
2630 shadow2_p2m_teardown(d);
2632 SHADOW2_PRINTK("dom %u final teardown done."
2633 " Shadow pages total = %u, free = %u, p2m=%u\n",
2634 d->domain_id,
2635 d->arch.shadow2_total_pages,
2636 d->arch.shadow2_free_pages,
2637 d->arch.shadow2_p2m_pages);
2640 static int shadow2_one_bit_enable(struct domain *d, u32 mode)
2641 /* Turn on a single shadow mode feature */
2643 ASSERT(shadow2_lock_is_acquired(d));
2645 /* Sanity check the call */
2646 if ( d == current->domain || (d->arch.shadow2_mode & mode) )
2648 return -EINVAL;
2651 if ( d->arch.shadow2_mode == 0 )
2653 /* Init the shadow memory allocation and the hash table */
2654 if ( set_sh2_allocation(d, 1, NULL) != 0
2655 || shadow2_hash_alloc(d) != 0 )
2657 set_sh2_allocation(d, 0, NULL);
2658 return -ENOMEM;
2662 /* Update the bits */
2663 sh2_new_mode(d, d->arch.shadow2_mode | mode);
2665 return 0;
2668 static int shadow2_one_bit_disable(struct domain *d, u32 mode)
2669 /* Turn off a single shadow mode feature */
2671 struct vcpu *v;
2672 ASSERT(shadow2_lock_is_acquired(d));
2674 /* Sanity check the call */
2675 if ( d == current->domain || !(d->arch.shadow2_mode & mode) )
2677 return -EINVAL;
2680 /* Update the bits */
2681 sh2_new_mode(d, d->arch.shadow2_mode & ~mode);
2682 if ( d->arch.shadow2_mode == 0 )
2684 /* Get this domain off shadows */
2685 SHADOW2_PRINTK("un-shadowing of domain %u starts."
2686 " Shadow pages total = %u, free = %u, p2m=%u\n",
2687 d->domain_id,
2688 d->arch.shadow2_total_pages,
2689 d->arch.shadow2_free_pages,
2690 d->arch.shadow2_p2m_pages);
2691 for_each_vcpu(d, v)
2693 if ( v->arch.shadow2 )
2694 shadow2_detach_old_tables(v);
2695 #if CONFIG_PAGING_LEVELS == 4
2696 if ( !(v->arch.flags & TF_kernel_mode) )
2697 make_cr3(v, pagetable_get_pfn(v->arch.guest_table_user));
2698 else
2699 #endif
2700 make_cr3(v, pagetable_get_pfn(v->arch.guest_table));
2704 /* Pull down the memory allocation */
2705 if ( set_sh2_allocation(d, 0, NULL) != 0 )
2707 // XXX - How can this occur?
2708 // Seems like a bug to return an error now that we've
2709 // disabled the relevant shadow mode.
2710 //
2711 return -ENOMEM;
2713 shadow2_hash_teardown(d);
2714 SHADOW2_PRINTK("un-shadowing of domain %u done."
2715 " Shadow pages total = %u, free = %u, p2m=%u\n",
2716 d->domain_id,
2717 d->arch.shadow2_total_pages,
2718 d->arch.shadow2_free_pages,
2719 d->arch.shadow2_p2m_pages);
2722 return 0;
2725 /* Enable/disable ops for the "test" and "log-dirty" modes */
2726 int shadow2_test_enable(struct domain *d)
2728 int ret;
2730 domain_pause(d);
2731 shadow2_lock(d);
2733 if ( shadow2_mode_enabled(d) )
2735 SHADOW2_ERROR("Don't support enabling test mode"
2736 "on already shadowed doms\n");
2737 ret = -EINVAL;
2738 goto out;
2741 ret = shadow2_one_bit_enable(d, SHM2_enable);
2742 out:
2743 shadow2_unlock(d);
2744 domain_unpause(d);
2746 return ret;
2749 int shadow2_test_disable(struct domain *d)
2751 int ret;
2753 domain_pause(d);
2754 shadow2_lock(d);
2755 ret = shadow2_one_bit_disable(d, SHM2_enable);
2756 shadow2_unlock(d);
2757 domain_unpause(d);
2759 return ret;
2762 static int
2763 sh2_alloc_log_dirty_bitmap(struct domain *d)
2765 ASSERT(d->arch.shadow_dirty_bitmap == NULL);
2766 d->arch.shadow_dirty_bitmap_size =
2767 (d->shared_info->arch.max_pfn + (BITS_PER_LONG - 1)) &
2768 ~(BITS_PER_LONG - 1);
2769 d->arch.shadow_dirty_bitmap =
2770 xmalloc_array(unsigned long,
2771 d->arch.shadow_dirty_bitmap_size / BITS_PER_LONG);
2772 if ( d->arch.shadow_dirty_bitmap == NULL )
2774 d->arch.shadow_dirty_bitmap_size = 0;
2775 return -ENOMEM;
2777 memset(d->arch.shadow_dirty_bitmap, 0, d->arch.shadow_dirty_bitmap_size/8);
2779 return 0;
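/* Sizing example for the allocation above: with max_pfn = 0x10000 (a
 * 256MB guest) and BITS_PER_LONG = 64, the size rounds to 0x10000 bits,
 * i.e. an xmalloc_array of 1024 unsigned longs, and the memset clears
 * 0x10000/8 = 8192 bytes. */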
2782 static void
2783 sh2_free_log_dirty_bitmap(struct domain *d)
2785 d->arch.shadow_dirty_bitmap_size = 0;
2786 if ( d->arch.shadow_dirty_bitmap )
2788 xfree(d->arch.shadow_dirty_bitmap);
2789 d->arch.shadow_dirty_bitmap = NULL;
2793 static int shadow2_log_dirty_enable(struct domain *d)
2795 int ret;
2797 domain_pause(d);
2798 shadow2_lock(d);
2800 if ( shadow2_mode_log_dirty(d) )
2802 ret = -EINVAL;
2803 goto out;
2806 if ( shadow2_mode_enabled(d) )
2808 SHADOW2_ERROR("Don't (yet) support enabling log-dirty"
2809 "on already shadowed doms\n");
2810 ret = -EINVAL;
2811 goto out;
2814 ret = sh2_alloc_log_dirty_bitmap(d);
2815 if ( ret != 0 )
2817 sh2_free_log_dirty_bitmap(d);
2818 goto out;
2821 ret = shadow2_one_bit_enable(d, SHM2_log_dirty);
2822 if ( ret != 0 )
2823 sh2_free_log_dirty_bitmap(d);
2825 out:
2826 shadow2_unlock(d);
2827 domain_unpause(d);
2828 return ret;
2831 static int shadow2_log_dirty_disable(struct domain *d)
2833 int ret;
2835 domain_pause(d);
2836 shadow2_lock(d);
2837 ret = shadow2_one_bit_disable(d, SHM2_log_dirty);
2838 if ( !shadow2_mode_log_dirty(d) )
2839 sh2_free_log_dirty_bitmap(d);
2840 shadow2_unlock(d);
2841 domain_unpause(d);
2843 return ret;
2846 /**************************************************************************/
2847 /* P2M map manipulations */
2849 static void
2850 sh2_p2m_remove_page(struct domain *d, unsigned long gfn, unsigned long mfn)
2852 struct vcpu *v;
2854 if ( !shadow2_mode_translate(d) )
2855 return;
2857 v = current;
2858 if ( v->domain != d )
2859 v = d->vcpu[0];
2862 SHADOW2_DEBUG(P2M, "removing gfn=%#lx mfn=%#lx\n", gfn, mfn);
2864 ASSERT(mfn_x(sh2_gfn_to_mfn(d, gfn)) == mfn);
2865 //ASSERT(sh2_mfn_to_gfn(d, mfn) == gfn);
2867 shadow2_remove_all_shadows_and_parents(v, _mfn(mfn));
2868 if ( shadow2_remove_all_mappings(v, _mfn(mfn)) )
2869 flush_tlb_mask(d->domain_dirty_cpumask);
2870 shadow2_set_p2m_entry(d, gfn, _mfn(INVALID_MFN));
2871 set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY);
2874 void
2875 shadow2_guest_physmap_remove_page(struct domain *d, unsigned long gfn,
2876 unsigned long mfn)
2878 shadow2_lock(d);
2879 shadow2_audit_p2m(d);
2880 sh2_p2m_remove_page(d, gfn, mfn);
2881 shadow2_audit_p2m(d);
2882 shadow2_unlock(d);
2885 void
2886 shadow2_guest_physmap_add_page(struct domain *d, unsigned long gfn,
2887 unsigned long mfn)
2889 struct vcpu *v;
2890 unsigned long ogfn;
2891 mfn_t omfn;
2893 if ( !shadow2_mode_translate(d) )
2894 return;
2896 v = current;
2897 if ( v->domain != d )
2898 v = d->vcpu[0];
2900 shadow2_lock(d);
2901 shadow2_audit_p2m(d);
2903 SHADOW2_DEBUG(P2M, "adding gfn=%#lx mfn=%#lx\n", gfn, mfn);
2905 omfn = sh2_gfn_to_mfn(d, gfn);
2906 if ( valid_mfn(omfn) )
2908 /* Get rid of the old mapping, especially any shadows */
2909 shadow2_remove_all_shadows_and_parents(v, omfn);
2910 if ( shadow2_remove_all_mappings(v, omfn) )
2911 flush_tlb_mask(d->domain_dirty_cpumask);
2912 set_gpfn_from_mfn(mfn_x(omfn), INVALID_M2P_ENTRY);
2915 ogfn = sh2_mfn_to_gfn(d, _mfn(mfn));
2916 if (
2917 #ifdef __x86_64__
2918 (ogfn != 0x5555555555555555L)
2919 #else
2920 (ogfn != 0x55555555L)
2921 #endif
2922 && (ogfn != INVALID_M2P_ENTRY)
2923 && (ogfn != gfn) )
2925 /* This machine frame is already mapped at another physical address */
2926 SHADOW2_DEBUG(P2M, "aliased! mfn=%#lx, old gfn=%#lx, new gfn=%#lx\n",
2927 mfn, ogfn, gfn);
2928 if ( valid_mfn(omfn = sh2_gfn_to_mfn(d, ogfn)) )
2930 SHADOW2_DEBUG(P2M, "old gfn=%#lx -> mfn %#lx\n",
2931 ogfn , mfn_x(omfn));
2932 if ( mfn_x(omfn) == mfn )
2933 sh2_p2m_remove_page(d, ogfn, mfn);
2937 shadow2_set_p2m_entry(d, gfn, _mfn(mfn));
2938 set_gpfn_from_mfn(mfn, gfn);
2939 shadow2_audit_p2m(d);
2940 shadow2_unlock(d);
2943 /**************************************************************************/
2944 /* Log-dirty mode support */
2946 /* Convert a shadow to log-dirty mode. */
2947 void shadow2_convert_to_log_dirty(struct vcpu *v, mfn_t smfn)
2949 BUG();
2953 /* Read a domain's log-dirty bitmap and stats.
2954 * If the operation is a CLEAN, clear the bitmap and stats as well. */
2955 static int shadow2_log_dirty_op(struct domain *d, dom0_shadow_control_t *sc)
2957 int i, rv = 0, clean = 0;
2959 domain_pause(d);
2960 shadow2_lock(d);
2962 if ( sc->op == DOM0_SHADOW_CONTROL_OP_CLEAN
2963 || sc->op == DOM0_SHADOW_CONTROL_OP_FLUSH )
2964 clean = 1;
2965 else
2966 ASSERT(sc->op == DOM0_SHADOW_CONTROL_OP_PEEK);
2968 SHADOW2_DEBUG(LOGDIRTY, "log-dirty %s: dom %u faults=%u dirty=%u\n",
2969 (clean) ? "clean" : "peek",
2970 d->domain_id,
2971 d->arch.shadow_fault_count,
2972 d->arch.shadow_dirty_count);
2974 sc->stats.fault_count = d->arch.shadow_fault_count;
2975 sc->stats.dirty_count = d->arch.shadow_dirty_count;
2977 if ( clean )
2979 struct list_head *l, *t;
2980 struct page_info *pg;
2982 /* Need to revoke write access to the domain's pages again.
2983 * In future, we'll have a less heavy-handed approach to this,
2984 * but for now, we just unshadow everything except Xen. */
2985 list_for_each_safe(l, t, &d->arch.shadow2_toplevel_shadows)
2987 pg = list_entry(l, struct page_info, list);
2988 shadow2_unhook_mappings(d->vcpu[0], page_to_mfn(pg));
2991 d->arch.shadow_fault_count = 0;
2992 d->arch.shadow_dirty_count = 0;
2995 if ( guest_handle_is_null(sc->dirty_bitmap) ||
2996 (d->arch.shadow_dirty_bitmap == NULL) )
2998 rv = -EINVAL;
2999 goto out;
3002 if ( sc->pages > d->arch.shadow_dirty_bitmap_size )
3003 sc->pages = d->arch.shadow_dirty_bitmap_size;
3005 #define CHUNK (8*1024) /* Transfer and clear in 1kB chunks for L1 cache. */
3006 for ( i = 0; i < sc->pages; i += CHUNK )
3008 int bytes = ((((sc->pages - i) > CHUNK)
3009 ? CHUNK
3010 : (sc->pages - i)) + 7) / 8;
3012 if ( copy_to_guest_offset(
3013 sc->dirty_bitmap,
3014 i/(8*sizeof(unsigned long)),
3015 d->arch.shadow_dirty_bitmap + (i/(8*sizeof(unsigned long))),
3016 (bytes + sizeof(unsigned long) - 1) / sizeof(unsigned long)) )
3018 rv = -EINVAL;
3019 goto out;
3022 if ( clean )
3023 memset(d->arch.shadow_dirty_bitmap + (i/(8*sizeof(unsigned long))),
3024 0, bytes);
3026 #undef CHUNK
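/* Worked example of the chunking above, assuming 64-bit unsigned longs
 * and sc->pages = 10000: the loop runs at i = 0 and i = 8192; the first
 * pass copies bytes = 1024 (128 longs at offset 0), the second copies
 * bytes = (1808 + 7) / 8 = 226 (29 longs at offset 128), and a CLEAN op
 * then memsets the same byte ranges back to zero. */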
3028 out:
3029 shadow2_unlock(d);
3030 domain_unpause(d);
3031 return rv;
3035 /* Mark a page as dirty */
3036 void sh2_do_mark_dirty(struct domain *d, mfn_t gmfn)
3038 unsigned long pfn;
3040 ASSERT(shadow2_lock_is_acquired(d));
3041 ASSERT(shadow2_mode_log_dirty(d));
3043 if ( !valid_mfn(gmfn) )
3044 return;
3046 ASSERT(d->arch.shadow_dirty_bitmap != NULL);
3048 /* We /really/ mean PFN here, even for non-translated guests. */
3049 pfn = get_gpfn_from_mfn(mfn_x(gmfn));
3051 /*
3052 * Values with the MSB set denote MFNs that aren't really part of the
3053 * domain's pseudo-physical memory map (e.g., the shared info frame).
3054 * Nothing to do here...
3055 */
3056 if ( unlikely(!VALID_M2P(pfn)) )
3057 return;
3059 /* N.B. Can use non-atomic TAS because protected by shadow2_lock. */
3060 if ( likely(pfn < d->arch.shadow_dirty_bitmap_size) )
3062 if ( !__test_and_set_bit(pfn, d->arch.shadow_dirty_bitmap) )
3064 SHADOW2_DEBUG(LOGDIRTY,
3065 "marked mfn %" SH2_PRI_mfn " (pfn=%lx), dom %d\n",
3066 mfn_x(gmfn), pfn, d->domain_id);
3067 d->arch.shadow_dirty_count++;
3070 else
3072 SHADOW2_PRINTK("mark_dirty OOR! "
3073 "mfn=%" SH2_PRI_mfn " pfn=%lx max=%x (dom %d)\n"
3074 "owner=%d c=%08x t=%" PRtype_info "\n",
3075 mfn_x(gmfn),
3076 pfn,
3077 d->arch.shadow_dirty_bitmap_size,
3078 d->domain_id,
3079 (page_get_owner(mfn_to_page(gmfn))
3080 ? page_get_owner(mfn_to_page(gmfn))->domain_id
3081 : -1),
3082 mfn_to_page(gmfn)->count_info,
3083 mfn_to_page(gmfn)->u.inuse.type_info);
3088 /**************************************************************************/
3089 /* Shadow-control DOM0_OP dispatcher */
3091 int shadow2_control_op(struct domain *d,
3092 dom0_shadow_control_t *sc,
3093 XEN_GUEST_HANDLE(dom0_op_t) u_dom0_op)
3095 int rc, preempted = 0;
3097 if ( unlikely(d == current->domain) )
3099 DPRINTK("Don't try to do a shadow op on yourself!\n");
3100 return -EINVAL;
3103 switch ( sc->op )
3105 case DOM0_SHADOW_CONTROL_OP_OFF:
3106 if ( shadow2_mode_log_dirty(d) )
3107 if ( (rc = shadow2_log_dirty_disable(d)) != 0 )
3108 return rc;
3109 if ( d->arch.shadow2_mode & SHM2_enable )
3110 if ( (rc = shadow2_test_disable(d)) != 0 )
3111 return rc;
3112 return 0;
3114 case DOM0_SHADOW_CONTROL_OP_ENABLE_TEST:
3115 return shadow2_test_enable(d);
3117 case DOM0_SHADOW_CONTROL_OP_ENABLE_LOGDIRTY:
3118 return shadow2_log_dirty_enable(d);
3120 case DOM0_SHADOW_CONTROL_OP_FLUSH:
3121 case DOM0_SHADOW_CONTROL_OP_CLEAN:
3122 case DOM0_SHADOW_CONTROL_OP_PEEK:
3123 return shadow2_log_dirty_op(d, sc);
3127 case DOM0_SHADOW2_CONTROL_OP_ENABLE:
3128 return shadow2_enable(d, sc->mode << SHM2_shift);
3130 case DOM0_SHADOW2_CONTROL_OP_GET_ALLOCATION:
3131 sc->mb = shadow2_get_allocation(d);
3132 return 0;
3134 case DOM0_SHADOW2_CONTROL_OP_SET_ALLOCATION:
3135 rc = shadow2_set_allocation(d, sc->mb, &preempted);
3136 if ( preempted )
3137 /* Not finished. Set up to re-run the call. */
3138 rc = hypercall_create_continuation(
3139 __HYPERVISOR_dom0_op, "h", u_dom0_op);
3140 else
3141 /* Finished. Return the new allocation */
3142 sc->mb = shadow2_get_allocation(d);
3143 return rc;
3146 default:
3147 SHADOW2_ERROR("Bad shadow op %u\n", sc->op);
3148 return -EINVAL;
3153 /**************************************************************************/
3154 /* Auditing shadow tables */
3156 #if SHADOW2_AUDIT & SHADOW2_AUDIT_ENTRIES_FULL
3158 void shadow2_audit_tables(struct vcpu *v)
3160 /* Dispatch table for getting per-type functions */
3161 static hash_callback_t callbacks[16] = {
3162 NULL, /* none */
3163 #if CONFIG_PAGING_LEVELS == 2
3164 SHADOW2_INTERNAL_NAME(sh2_audit_l1_table,2,2), /* l1_32 */
3165 SHADOW2_INTERNAL_NAME(sh2_audit_fl1_table,2,2), /* fl1_32 */
3166 SHADOW2_INTERNAL_NAME(sh2_audit_l2_table,2,2), /* l2_32 */
3167 #else
3168 SHADOW2_INTERNAL_NAME(sh2_audit_l1_table,3,2), /* l1_32 */
3169 SHADOW2_INTERNAL_NAME(sh2_audit_fl1_table,3,2), /* fl1_32 */
3170 SHADOW2_INTERNAL_NAME(sh2_audit_l2_table,3,2), /* l2_32 */
3171 SHADOW2_INTERNAL_NAME(sh2_audit_l1_table,3,3), /* l1_pae */
3172 SHADOW2_INTERNAL_NAME(sh2_audit_fl1_table,3,3), /* fl1_pae */
3173 SHADOW2_INTERNAL_NAME(sh2_audit_l2_table,3,3), /* l2_pae */
3174 SHADOW2_INTERNAL_NAME(sh2_audit_l2_table,3,3), /* l2h_pae */
3175 SHADOW2_INTERNAL_NAME(sh2_audit_l3_table,3,3), /* l3_pae */
3176 #if CONFIG_PAGING_LEVELS >= 4
3177 SHADOW2_INTERNAL_NAME(sh2_audit_l1_table,4,4), /* l1_64 */
3178 SHADOW2_INTERNAL_NAME(sh2_audit_fl1_table,4,4), /* fl1_64 */
3179 SHADOW2_INTERNAL_NAME(sh2_audit_l2_table,4,4), /* l2_64 */
3180 SHADOW2_INTERNAL_NAME(sh2_audit_l3_table,4,4), /* l3_64 */
3181 SHADOW2_INTERNAL_NAME(sh2_audit_l4_table,4,4), /* l4_64 */
3182 #endif /* CONFIG_PAGING_LEVELS >= 4 */
3183 #endif /* CONFIG_PAGING_LEVELS > 2 */
3184 NULL /* All the rest */
3185 };
3186 unsigned int mask;
3188 if ( !(SHADOW2_AUDIT_ENABLE) )
3189 return;
3191 if ( SHADOW2_AUDIT & SHADOW2_AUDIT_ENTRIES_FULL )
3192 mask = ~1; /* Audit every table in the system */
3193 else
3195 /* Audit only the current mode's tables */
3196 switch (v->arch.shadow2->guest_levels)
3198 case 2: mask = (SH2F_L1_32|SH2F_FL1_32|SH2F_L2_32); break;
3199 case 3: mask = (SH2F_L1_PAE|SH2F_FL1_PAE|SH2F_L2_PAE
3200 |SH2F_L2H_PAE|SH2F_L3_PAE); break;
3201 case 4: mask = (SH2F_L1_64|SH2F_FL1_64|SH2F_L2_64
3202 |SH2F_L3_64|SH2F_L4_64); break;
3203 default: BUG();
3207 hash_foreach(v, mask, callbacks, _mfn(INVALID_MFN));
3210 #endif /* Shadow audit */
3213 /**************************************************************************/
3214 /* Auditing p2m tables */
3216 #if SHADOW2_AUDIT & SHADOW2_AUDIT_P2M
3218 void shadow2_audit_p2m(struct domain *d)
3220 struct list_head *entry;
3221 struct page_info *page;
3222 struct domain *od;
3223 unsigned long mfn, gfn, m2pfn, lp2mfn = 0;
3224 mfn_t p2mfn;
3225 unsigned long orphans_d = 0, orphans_i = 0, mpbad = 0, pmbad = 0;
3226 int test_linear;
3228 if ( !(SHADOW2_AUDIT_ENABLE) || !shadow2_mode_translate(d) )
3229 return;
3231 //SHADOW2_PRINTK("p2m audit starts\n");
3233 test_linear = ( (d == current->domain) && current->arch.monitor_vtable );
3234 if ( test_linear )
3235 local_flush_tlb();
3237 /* Audit part one: walk the domain's page allocation list, checking
3238 * the m2p entries. */
3239 for ( entry = d->page_list.next;
3240 entry != &d->page_list;
3241 entry = entry->next )
3243 page = list_entry(entry, struct page_info, list);
3244 mfn = mfn_x(page_to_mfn(page));
3246 // SHADOW2_PRINTK("auditing guest page, mfn=%#lx\n", mfn);
3248 od = page_get_owner(page);
3250 if ( od != d )
3252 SHADOW2_PRINTK("wrong owner %#lx -> %p(%u) != %p(%u)\n",
3253 mfn, od, (od?od->domain_id:-1), d, d->domain_id);
3254 continue;
3257 gfn = get_gpfn_from_mfn(mfn);
3258 if ( gfn == INVALID_M2P_ENTRY )
3260 orphans_i++;
3261 //SHADOW2_PRINTK("orphaned guest page: mfn=%#lx has invalid gfn\n",
3262 // mfn);
3263 continue;
3266 if ( gfn == 0x55555555 )
3268 orphans_d++;
3269 //SHADOW2_PRINTK("orphaned guest page: mfn=%#lx has debug gfn\n",
3270 // mfn);
3271 continue;
3274 p2mfn = sh2_gfn_to_mfn_foreign(d, gfn);
3275 if ( mfn_x(p2mfn) != mfn )
3277 mpbad++;
3278 SHADOW2_PRINTK("map mismatch mfn %#lx -> gfn %#lx -> mfn %#lx"
3279 " (-> gfn %#lx)\n",
3280 mfn, gfn, mfn_x(p2mfn),
3281 (mfn_valid(p2mfn)
3282 ? get_gpfn_from_mfn(mfn_x(p2mfn))
3283 : -1u));
3284 /* This m2p entry is stale: the domain has another frame in
3285 * this physical slot. No great disaster, but for neatness,
3286 * blow away the m2p entry. */
3287 set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY);
3290 if ( test_linear )
3292 lp2mfn = get_mfn_from_gpfn(gfn);
3293 if ( lp2mfn != mfn_x(p2mfn) )
3295 SHADOW2_PRINTK("linear mismatch gfn %#lx -> mfn %#lx "
3296 "(!= mfn %#lx)\n", gfn, lp2mfn, p2mfn);
3300 // SHADOW2_PRINTK("OK: mfn=%#lx, gfn=%#lx, p2mfn=%#lx, lp2mfn=%#lx\n",
3301 // mfn, gfn, p2mfn, lp2mfn);
3304 /* Audit part two: walk the domain's p2m table, checking the entries. */
3305 if ( pagetable_get_pfn(d->arch.phys_table) != 0 )
3307 l2_pgentry_t *l2e;
3308 l1_pgentry_t *l1e;
3309 int i1, i2;
3311 #if CONFIG_PAGING_LEVELS == 4
3312 l4_pgentry_t *l4e;
3313 l3_pgentry_t *l3e;
3314 int i3, i4;
3315 l4e = sh2_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
3316 #elif CONFIG_PAGING_LEVELS == 3
3317 l3_pgentry_t *l3e;
3318 int i3;
3319 l3e = sh2_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
3320 #else /* CONFIG_PAGING_LEVELS == 2 */
3321 l2e = sh2_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
3322 #endif
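/* In the walk below, gfn advances by the number of frames a non-present
 * entry would have covered: 1 << (L2_PAGETABLE_SHIFT - PAGE_SHIFT) for
 * an l2 slot (512 with PAE/64-bit paging, 1024 with 2-level paging), and
 * correspondingly more for l3 and l4 slots, so each present l1 entry is
 * compared against the gfn it ought to map. */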
3324 gfn = 0;
3325 #if CONFIG_PAGING_LEVELS >= 3
3326 #if CONFIG_PAGING_LEVELS >= 4
3327 for ( i4 = 0; i4 < L4_PAGETABLE_ENTRIES; i4++ )
3329 if ( !(l4e_get_flags(l4e[i4]) & _PAGE_PRESENT) )
3331 gfn += 1 << (L4_PAGETABLE_SHIFT - PAGE_SHIFT);
3332 continue;
3334 l3e = sh2_map_domain_page(_mfn(l4e_get_pfn(l4e[i4])));
3335 #endif /* now at levels 3 or 4... */
3336 for ( i3 = 0;
3337 i3 < ((CONFIG_PAGING_LEVELS==4) ? L3_PAGETABLE_ENTRIES : 8);
3338 i3++ )
3340 if ( !(l3e_get_flags(l3e[i3]) & _PAGE_PRESENT) )
3342 gfn += 1 << (L3_PAGETABLE_SHIFT - PAGE_SHIFT);
3343 continue;
3345 l2e = sh2_map_domain_page(_mfn(l3e_get_pfn(l3e[i3])));
3346 #endif /* all levels... */
3347 for ( i2 = 0; i2 < L2_PAGETABLE_ENTRIES; i2++ )
3349 if ( !(l2e_get_flags(l2e[i2]) & _PAGE_PRESENT) )
3351 gfn += 1 << (L2_PAGETABLE_SHIFT - PAGE_SHIFT);
3352 continue;
3354 l1e = sh2_map_domain_page(_mfn(l2e_get_pfn(l2e[i2])));
3356 for ( i1 = 0; i1 < L1_PAGETABLE_ENTRIES; i1++, gfn++ )
3358 if ( !(l1e_get_flags(l1e[i1]) & _PAGE_PRESENT) )
3359 continue;
3360 mfn = l1e_get_pfn(l1e[i1]);
3361 ASSERT(valid_mfn(_mfn(mfn)));
3362 m2pfn = get_gpfn_from_mfn(mfn);
3363 if ( m2pfn != gfn )
3365 pmbad++;
3366 SHADOW2_PRINTK("mismatch: gfn %#lx -> mfn %#lx"
3367 " -> gfn %#lx\n", gfn, mfn, m2pfn);
3368 BUG();
3371 sh2_unmap_domain_page(l1e);
3373 #if CONFIG_PAGING_LEVELS >= 3
3374 sh2_unmap_domain_page(l2e);
3376 #if CONFIG_PAGING_LEVELS >= 4
3377 sh2_unmap_domain_page(l3e);
3379 #endif
3380 #endif
3382 #if CONFIG_PAGING_LEVELS == 4
3383 sh2_unmap_domain_page(l4e);
3384 #elif CONFIG_PAGING_LEVELS == 3
3385 sh2_unmap_domain_page(l3e);
3386 #else /* CONFIG_PAGING_LEVELS == 2 */
3387 sh2_unmap_domain_page(l2e);
3388 #endif
3392 //SHADOW2_PRINTK("p2m audit complete\n");
3393 //if ( orphans_i | orphans_d | mpbad | pmbad )
3394 // SHADOW2_PRINTK("p2m audit found %lu orphans (%lu inval %lu debug)\n",
3395 //                orphans_i + orphans_d, orphans_i, orphans_d);
3396 if ( mpbad | pmbad )
3397 SHADOW2_PRINTK("p2m audit found %lu odd p2m, %lu bad m2p entries\n",
3398 pmbad, mpbad);
3401 #endif /* p2m audit */
3403 /*
3404 * Local variables:
3405 * mode: C
3406 * c-set-style: "BSD"
3407 * c-basic-offset: 4
3408 * indent-tabs-mode: nil
3409 * End:
3410 */