direct-io.hg

view xen/arch/x86/shadow32.c @ 8736:8aeb417387ca

Fix some more pfn/mfn/gmfn/gpfn inconsistencies. Fix some direct
uses of max_page variable to use the mfn_valid() predicate.

Signed-off-by: Keir Fraser <keir@xensource.com>
author kaf24@firebug.cl.cam.ac.uk
date Thu Feb 02 12:18:28 2006 +0100 (2006-02-02)
parents 0c94043f5c5b
children 0e7bdd973e17
1 /******************************************************************************
2 * arch/x86/shadow.c
3 *
4 * Copyright (c) 2005 Michael A Fetterman
5 * Based on an earlier implementation by Ian Pratt et al
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
23 #include <xen/config.h>
24 #include <xen/types.h>
25 #include <xen/mm.h>
26 #include <xen/domain_page.h>
27 #include <asm/shadow.h>
28 #include <asm/page.h>
29 #include <xen/event.h>
30 #include <xen/sched.h>
31 #include <xen/trace.h>
33 #define MFN_PINNED(_x) (mfn_to_page(_x)->u.inuse.type_info & PGT_pinned)
34 #define va_to_l1mfn(_ed, _va) \
35 (l2e_get_pfn(linear_l2_table(_ed)[_va>>L2_PAGETABLE_SHIFT]))
37 static void shadow_free_snapshot(struct domain *d,
38 struct out_of_sync_entry *entry);
39 static void remove_out_of_sync_entries(struct domain *d, unsigned long smfn);
40 static void free_writable_pte_predictions(struct domain *d);
42 #if SHADOW_DEBUG
43 static void mark_shadows_as_reflecting_snapshot(struct domain *d, unsigned long gpfn);
44 #endif
46 /********
48 There's a per-domain shadow table spin lock which works fine for SMP
49 hosts. We don't have to worry about interrupts as no shadow operations
50 happen in an interrupt context. It's probably not quite ready for SMP
51 guest operation as we have to worry about synchonisation between gpte
52 and spte updates. Its possible that this might only happen in a
53 hypercall context, in which case we'll probably at have a per-domain
54 hypercall lock anyhow (at least initially).
56 ********/
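/*
 * shadow_promote() prepares guest frame gmfn (guest pfn gpfn) for use as a
 * page table of the given type: any out-of-sync state is synced first, and in
 * refcounted shadow modes all outstanding writable mappings of the frame are
 * removed before it is tagged with _PGC_page_table. Returns non-zero on
 * success.
 */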
58 static inline int
59 shadow_promote(struct domain *d, unsigned long gpfn, unsigned long gmfn,
60 unsigned long new_type)
61 {
62 struct page_info *page = mfn_to_page(gmfn);
63 int pinned = 0, okay = 1;
65 if ( page_out_of_sync(page) )
66 {
67 // Don't know how long ago this snapshot was taken.
68 // Can't trust it to be recent enough.
69 //
70 __shadow_sync_mfn(d, gmfn);
71 }
73 if ( !shadow_mode_refcounts(d) )
74 return 1;
76 if ( unlikely(page_is_page_table(page)) )
77 return 1;
79 FSH_LOG("%s: gpfn=%lx gmfn=%lx nt=%08lx", __func__, gpfn, gmfn, new_type);
81 if ( !shadow_remove_all_write_access(d, gpfn, gmfn) )
82 {
83 FSH_LOG("%s: couldn't find/remove all write accesses, gpfn=%lx gmfn=%lx",
84 __func__, gpfn, gmfn);
85 #if 1 || defined(LIVE_DANGEROUSLY)
86 set_bit(_PGC_page_table, &page->count_info);
87 return 1;
88 #endif
89 return 0;
91 }
93 // To convert this page for use as a page table, the writable count
94 // should now be zero. Test this by grabbing the page as a page table,
95 // and then immediately releasing. This will also deal with any
96 // necessary TLB flushing issues for us.
97 //
98 // The cruft here about pinning doesn't really work right. This
99 // needs rethinking/rewriting... Need to gracefully deal with the
100 // TLB flushes required when promoting a writable page, and also deal
101 // with any outstanding (external) writable refs to this page (by
102 // refusing to promote it). The pinning headache complicates this
103 // code -- it would all get much simpler if we stopped using
104 // shadow_lock() and moved the shadow code to BIGLOCK().
105 //
106 if ( unlikely(!get_page(page, d)) )
107 BUG(); // XXX -- needs more thought for a graceful failure
108 if ( unlikely(test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info)) )
109 {
110 pinned = 1;
111 put_page_and_type(page);
112 }
113 if ( get_page_type(page, PGT_base_page_table) )
114 {
115 set_bit(_PGC_page_table, &page->count_info);
116 put_page_type(page);
117 }
118 else
119 {
120 printk("shadow_promote: get_page_type failed "
121 "dom%d gpfn=%lx gmfn=%lx t=%08lx\n",
122 d->domain_id, gpfn, gmfn, new_type);
123 okay = 0;
124 }
126 // Now put the type back to writable...
127 if ( unlikely(!get_page_type(page, PGT_writable_page)) )
128 BUG(); // XXX -- needs more thought for a graceful failure
129 if ( unlikely(pinned) )
130 {
131 if ( unlikely(test_and_set_bit(_PGT_pinned,
132 &page->u.inuse.type_info)) )
133 BUG(); // hmm... someone pinned this again?
134 }
135 else
136 put_page_and_type(page);
138 return okay;
139 }
141 static inline void
142 shadow_demote(struct domain *d, unsigned long gpfn, unsigned long gmfn)
143 {
144 if ( !shadow_mode_refcounts(d) )
145 return;
147 ASSERT(mfn_to_page(gmfn)->count_info & PGC_page_table);
149 if ( shadow_max_pgtable_type(d, gpfn, NULL) == PGT_none )
150 {
151 clear_bit(_PGC_page_table, &mfn_to_page(gmfn)->count_info);
153 if ( page_out_of_sync(mfn_to_page(gmfn)) )
154 {
155 remove_out_of_sync_entries(d, gmfn);
156 }
157 }
158 }
160 /*
161 * Things in shadow mode that collect get_page() refs to the domain's
162 * pages are:
163 * - PGC_allocated takes a gen count, just like normal.
164 * - A writable page can be pinned (paravirtualized guests may consider
165 * these pages to be L1s or L2s, and don't know the difference).
166 * Pinning a page takes a gen count (but, for domains in shadow mode,
167 * it *doesn't* take a type count)
168 * - CR3 grabs a ref to whatever it points at, just like normal.
169 * - Shadow mode grabs an initial gen count for itself, as a placeholder
170 * for whatever references will exist.
171 * - Shadow PTEs that point to a page take a gen count, just like regular
172 * PTEs. However, they don't get a type count, as get_page_type() is
173 * hardwired to keep writable pages' counts at 1 for domains in shadow
174 * mode.
175 * - Whenever we shadow a page, the entry in the shadow hash grabs a
176 * general ref to the page.
177 * - Whenever a page goes out of sync, the out of sync entry grabs a
178 * general ref to the page.
179 */
180 /*
181 * page_info fields for pages allocated as shadow pages:
182 *
183 * All 32 bits of count_info are a simple count of refs to this shadow
184 * from a) other shadow pages, b) current CR3's (aka ed->arch.shadow_table),
185 * c) if it's a pinned shadow root pgtable, d) outstanding out-of-sync
186 * references.
187 *
188 * u.inuse._domain is left NULL, to prevent accidentally allowing some random
189 * domain from gaining permissions to map this page.
190 *
191 * u.inuse.type_info & PGT_type_mask remembers what kind of page is being
192 * shadowed.
193 * u.inuse.type_info & PGT_mfn_mask holds the mfn of the page being shadowed.
194 * u.inuse.type_info & PGT_pinned says that an extra reference to this shadow
195 * currently exists because this is a shadow of a root page, and we
196 * don't want to let those disappear just because no CR3 is currently pointing
197 * at it.
198 *
199 * tlbflush_timestamp holds a min & max index of valid page table entries
200 * within the shadow page.
201 */
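/*
 * alloc_shadow_page() allocates a frame for a new shadow of <gpfn,gmfn> of
 * shadow type psh_type. L1 shadows are taken from the pre-zeroed
 * free_shadow_frames list when possible; everything else comes straight from
 * the domheap. The new page's type_info encodes the shadow type plus a
 * backpointer to gmfn, the guest frame is promoted as needed, root shadows
 * are pinned, and the shadow status hash is updated before the smfn is
 * returned.
 */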
203 static inline unsigned long
204 alloc_shadow_page(struct domain *d,
205 unsigned long gpfn, unsigned long gmfn,
206 u32 psh_type)
207 {
208 struct page_info *page;
209 unsigned long smfn;
210 int pin = 0;
211 void *l1;
213 // Currently, we only keep pre-zero'ed pages around for use as L1's...
214 // This will change. Soon.
215 //
216 if ( psh_type == PGT_l1_shadow )
217 {
218 if ( !list_empty(&d->arch.free_shadow_frames) )
219 {
220 struct list_head *entry = d->arch.free_shadow_frames.next;
221 page = list_entry(entry, struct page_info, list);
222 list_del(entry);
223 perfc_decr(free_l1_pages);
224 }
225 else
226 {
227 page = alloc_domheap_page(NULL);
228 l1 = map_domain_page(page_to_mfn(page));
229 memset(l1, 0, PAGE_SIZE);
230 unmap_domain_page(l1);
231 }
232 }
233 else
234 page = alloc_domheap_page(NULL);
236 if ( unlikely(page == NULL) )
237 {
238 printk("Couldn't alloc shadow page! dom%d count=%d\n",
239 d->domain_id, d->arch.shadow_page_count);
240 printk("Shadow table counts: l1=%d l2=%d hl2=%d snapshot=%d\n",
241 perfc_value(shadow_l1_pages),
242 perfc_value(shadow_l2_pages),
243 perfc_value(hl2_table_pages),
244 perfc_value(snapshot_pages));
245 BUG(); /* XXX FIXME: try a shadow flush to free up some memory. */
246 }
248 smfn = page_to_mfn(page);
250 ASSERT( (gmfn & ~PGT_mfn_mask) == 0 );
251 page->u.inuse.type_info = psh_type | gmfn;
252 page->count_info = 0;
253 page->tlbflush_timestamp = 0;
255 switch ( psh_type )
256 {
257 case PGT_l1_shadow:
258 if ( !shadow_promote(d, gpfn, gmfn, psh_type) )
259 goto fail;
260 perfc_incr(shadow_l1_pages);
261 d->arch.shadow_page_count++;
262 break;
264 case PGT_l2_shadow:
265 if ( !shadow_promote(d, gpfn, gmfn, psh_type) )
266 goto fail;
267 perfc_incr(shadow_l2_pages);
268 d->arch.shadow_page_count++;
269 if ( PGT_l2_page_table == PGT_root_page_table )
270 pin = 1;
272 break;
274 case PGT_hl2_shadow:
275 // Treat an hl2 as an L1 for purposes of promotion.
276 // For external mode domains, treat them as an L2 for purposes of
277 // pinning.
278 //
279 if ( !shadow_promote(d, gpfn, gmfn, PGT_l1_shadow) )
280 goto fail;
281 perfc_incr(hl2_table_pages);
282 d->arch.hl2_page_count++;
283 if ( shadow_mode_external(d) &&
284 (PGT_l2_page_table == PGT_root_page_table) )
285 pin = 1;
287 break;
289 case PGT_snapshot:
290 perfc_incr(snapshot_pages);
291 d->arch.snapshot_page_count++;
292 break;
294 default:
295 printk("Alloc shadow weird page type type=%08x\n", psh_type);
296 BUG();
297 break;
298 }
300 // Don't add a new shadow of something that already has a snapshot.
301 //
302 ASSERT( (psh_type == PGT_snapshot) || !mfn_out_of_sync(gmfn) );
304 set_shadow_status(d, gpfn, gmfn, smfn, psh_type);
306 if ( pin )
307 shadow_pin(smfn);
309 return smfn;
311 fail:
312 FSH_LOG("promotion of pfn=%lx mfn=%lx failed! external gnttab refs?",
313 gpfn, gmfn);
314 free_domheap_page(page);
315 return 0;
316 }
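/*
 * Shadow L1 teardown only needs to walk the range of slots recorded in the
 * shadow page's tlbflush_timestamp field, which encodes the min/max indices
 * that may hold live entries (see SHADOW_MIN/SHADOW_MAX).
 */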
318 static void inline
319 free_shadow_l1_table(struct domain *d, unsigned long smfn)
320 {
321 l1_pgentry_t *pl1e = map_domain_page(smfn);
322 int i;
323 struct page_info *spage = mfn_to_page(smfn);
324 u32 min_max = spage->tlbflush_timestamp;
325 int min = SHADOW_MIN(min_max);
326 int max = SHADOW_MAX(min_max);
328 for ( i = min; i <= max; i++ )
329 {
330 shadow_put_page_from_l1e(pl1e[i], d);
331 pl1e[i] = l1e_empty();
332 }
334 unmap_domain_page(pl1e);
335 }
337 static void inline
338 free_shadow_hl2_table(struct domain *d, unsigned long smfn)
339 {
340 l1_pgentry_t *hl2 = map_domain_page(smfn);
341 int i, limit;
343 SH_VVLOG("%s: smfn=%lx freed", __func__, smfn);
345 if ( shadow_mode_external(d) )
346 limit = L2_PAGETABLE_ENTRIES;
347 else
348 limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE;
350 for ( i = 0; i < limit; i++ )
351 {
352 if ( l1e_get_flags(hl2[i]) & _PAGE_PRESENT )
353 put_page(mfn_to_page(l1e_get_pfn(hl2[i])));
354 }
356 unmap_domain_page(hl2);
357 }
359 static void inline
360 free_shadow_l2_table(struct domain *d, unsigned long smfn, unsigned int type)
361 {
362 l2_pgentry_t *pl2e = map_domain_page(smfn);
363 int i, external = shadow_mode_external(d);
365 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
366 if ( external || is_guest_l2_slot(type, i) )
367 if ( l2e_get_flags(pl2e[i]) & _PAGE_PRESENT )
368 put_shadow_ref(l2e_get_pfn(pl2e[i]));
370 if ( (PGT_base_page_table == PGT_l2_page_table) &&
371 shadow_mode_translate(d) && !external )
372 {
373 // free the ref to the hl2
374 //
375 put_shadow_ref(l2e_get_pfn(pl2e[l2_table_offset(LINEAR_PT_VIRT_START)]));
376 }
378 unmap_domain_page(pl2e);
379 }
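/*
 * free_shadow_page() releases one shadow frame: it recovers the shadowed
 * guest frame from the type_info backpointer, removes the entry from the
 * shadow status hash, frees the type-specific contents, and finally returns
 * the page either to the pre-zeroed L1 free list or to the domheap.
 */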
381 void free_shadow_page(unsigned long smfn)
382 {
383 struct page_info *page = mfn_to_page(smfn);
384 unsigned long gmfn = page->u.inuse.type_info & PGT_mfn_mask;
385 struct domain *d = page_get_owner(mfn_to_page(gmfn));
386 unsigned long gpfn = mfn_to_gmfn(d, gmfn);
387 unsigned long type = page->u.inuse.type_info & PGT_type_mask;
389 SH_VVLOG("%s: free'ing smfn=%lx", __func__, smfn);
391 ASSERT( ! IS_INVALID_M2P_ENTRY(gpfn) );
393 delete_shadow_status(d, gpfn, gmfn, type);
395 switch ( type )
396 {
397 case PGT_l1_shadow:
398 perfc_decr(shadow_l1_pages);
399 shadow_demote(d, gpfn, gmfn);
400 free_shadow_l1_table(d, smfn);
401 d->arch.shadow_page_count--;
402 break;
404 case PGT_l2_shadow:
405 perfc_decr(shadow_l2_pages);
406 shadow_demote(d, gpfn, gmfn);
407 free_shadow_l2_table(d, smfn, page->u.inuse.type_info);
408 d->arch.shadow_page_count--;
409 break;
411 case PGT_hl2_shadow:
412 perfc_decr(hl2_table_pages);
413 shadow_demote(d, gpfn, gmfn);
414 free_shadow_hl2_table(d, smfn);
415 d->arch.hl2_page_count--;
416 break;
418 case PGT_snapshot:
419 perfc_decr(snapshot_pages);
420 d->arch.snapshot_page_count--;
421 break;
423 default:
424 printk("Free shadow weird page type mfn=%lx type=%" PRtype_info "\n",
425 page_to_mfn(page), page->u.inuse.type_info);
426 break;
427 }
429 // No TLB flushes are needed the next time this page gets allocated.
430 //
431 page->tlbflush_timestamp = 0;
432 page->u.free.cpumask = CPU_MASK_NONE;
434 if ( type == PGT_l1_shadow )
435 {
436 list_add(&page->list, &d->arch.free_shadow_frames);
437 perfc_incr(free_l1_pages);
438 }
439 else
440 free_domheap_page(page);
441 }
443 void
444 remove_shadow(struct domain *d, unsigned long gpfn, u32 stype)
445 {
446 unsigned long smfn;
448 //printk("%s(gpfn=%lx, type=%x)\n", __func__, gpfn, stype);
450 shadow_lock(d);
452 while ( stype >= PGT_l1_shadow )
453 {
454 smfn = __shadow_status(d, gpfn, stype);
455 if ( smfn && MFN_PINNED(smfn) )
456 shadow_unpin(smfn);
457 stype -= PGT_l1_shadow;
458 }
460 shadow_unlock(d);
461 }
463 static void inline
464 release_out_of_sync_entry(struct domain *d, struct out_of_sync_entry *entry)
465 {
466 struct page_info *page;
468 page = mfn_to_page(entry->gmfn);
470 // Decrement ref count of guest & shadow pages
471 //
472 put_page(page);
474 // Only use entries that have low bits clear...
475 //
476 if ( !(entry->writable_pl1e & (sizeof(l1_pgentry_t)-1)) )
477 {
478 put_shadow_ref(entry->writable_pl1e >> PAGE_SHIFT);
479 entry->writable_pl1e = -2;
480 }
481 else
482 ASSERT( entry->writable_pl1e == -1 );
484 // Free the snapshot
485 //
486 shadow_free_snapshot(d, entry);
487 }
489 static void remove_out_of_sync_entries(struct domain *d, unsigned long gmfn)
490 {
491 struct out_of_sync_entry *entry = d->arch.out_of_sync;
492 struct out_of_sync_entry **prev = &d->arch.out_of_sync;
493 struct out_of_sync_entry *found = NULL;
495 // NB: Be careful not to call something that manipulates this list
496 // while walking it. Collect the results into a separate list
497 // first, then walk that list.
498 //
499 while ( entry )
500 {
501 if ( entry->gmfn == gmfn )
502 {
503 // remove from out of sync list
504 *prev = entry->next;
506 // add to found list
507 entry->next = found;
508 found = entry;
510 entry = *prev;
511 continue;
512 }
513 prev = &entry->next;
514 entry = entry->next;
515 }
517 prev = NULL;
518 entry = found;
519 while ( entry )
520 {
521 release_out_of_sync_entry(d, entry);
523 prev = &entry->next;
524 entry = entry->next;
525 }
527 // Add found list to free list
528 if ( prev )
529 {
530 *prev = d->arch.out_of_sync_free;
531 d->arch.out_of_sync_free = found;
532 }
533 }
535 static void free_out_of_sync_state(struct domain *d)
536 {
537 struct out_of_sync_entry *entry;
539 // NB: Be careful not to call something that manipulates this list
540 // while walking it. Remove one item at a time, and always
541 // restart from start of list.
542 //
543 while ( (entry = d->arch.out_of_sync) )
544 {
545 d->arch.out_of_sync = entry->next;
546 release_out_of_sync_entry(d, entry);
548 entry->next = d->arch.out_of_sync_free;
549 d->arch.out_of_sync_free = entry;
550 }
551 }
553 static void free_shadow_pages(struct domain *d)
554 {
555 int i;
556 struct shadow_status *x;
557 struct vcpu *v;
558 struct list_head *list_ent, *tmp;
560 /*
561 * WARNING! The shadow page table must not currently be in use!
562 * e.g., You are expected to have paused the domain and synchronized CR3.
563 */
565 if( !d->arch.shadow_ht ) return;
567 shadow_audit(d, 1);
569 // first, remove any outstanding refs from out_of_sync entries...
570 //
571 free_out_of_sync_state(d);
573 // second, remove any outstanding refs from v->arch.shadow_table
574 // and CR3.
575 //
576 for_each_vcpu(d, v)
577 {
578 if ( pagetable_get_paddr(v->arch.shadow_table) )
579 {
580 put_shadow_ref(pagetable_get_pfn(v->arch.shadow_table));
581 v->arch.shadow_table = mk_pagetable(0);
582 }
584 if ( v->arch.monitor_shadow_ref )
585 {
586 put_shadow_ref(v->arch.monitor_shadow_ref);
587 v->arch.monitor_shadow_ref = 0;
588 }
589 }
591 // For external shadows, remove the monitor table's refs
592 //
593 if ( shadow_mode_external(d) )
594 {
595 for_each_vcpu(d, v)
596 {
597 l2_pgentry_t *mpl2e = v->arch.monitor_vtable;
599 if ( mpl2e )
600 {
601 l2_pgentry_t hl2e = mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)];
602 l2_pgentry_t smfn = mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)];
604 if ( l2e_get_flags(hl2e) & _PAGE_PRESENT )
605 {
606 put_shadow_ref(l2e_get_pfn(hl2e));
607 mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)] = l2e_empty();
608 }
609 if ( l2e_get_flags(smfn) & _PAGE_PRESENT )
610 {
611 put_shadow_ref(l2e_get_pfn(smfn));
612 mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] = l2e_empty();
613 }
614 }
615 }
616 }
618 // Now, the only refs to shadow pages that are left are from the shadow
619 // pages themselves. We just unpin the pinned pages, and the rest
620 // should automatically disappear.
621 //
622 // NB: Beware: each explicitly or implicit call to free_shadow_page
623 // can/will result in the hash bucket getting rewritten out from
624 // under us... First, collect the list of pinned pages, then
625 // free them.
626 //
627 // FIXME: it would be good to just free all the pages referred to in
628 // the hash table without going through each of them to decrement their
629 // reference counts. In shadow_mode_refcounts(), we've gotta do the hard
630 // work, but only for L1 shadows. If we're not in refcount mode, then
631 // there's no real hard work to do at all. Need to be careful with the
632 // writable_pte_predictions and snapshot entries in the hash table, but
633 // that's about it.
634 //
635 for ( i = 0; i < shadow_ht_buckets; i++ )
636 {
637 u32 count;
638 unsigned long *mfn_list;
640 /* Skip empty buckets. */
641 if ( d->arch.shadow_ht[i].gpfn_and_flags == 0 )
642 continue;
644 count = 0;
646 for ( x = &d->arch.shadow_ht[i]; x != NULL; x = x->next ) {
647 /* Skip entries that are writable_pred. */
648 switch(x->gpfn_and_flags & PGT_type_mask){
649 case PGT_l1_shadow:
650 case PGT_l2_shadow:
651 case PGT_l3_shadow:
652 case PGT_l4_shadow:
653 case PGT_hl2_shadow:
654 if ( MFN_PINNED(x->smfn) )
655 count++;
656 break;
657 case PGT_snapshot:
658 case PGT_writable_pred:
659 break;
660 default:
661 BUG();
663 }
664 }
666 if ( !count )
667 continue;
669 mfn_list = xmalloc_array(unsigned long, count);
670 count = 0;
671 for ( x = &d->arch.shadow_ht[i]; x != NULL; x = x->next ) {
672 /* Skip entries that are writable_pred. */
673 switch(x->gpfn_and_flags & PGT_type_mask){
674 case PGT_l1_shadow:
675 case PGT_l2_shadow:
676 case PGT_l3_shadow:
677 case PGT_l4_shadow:
678 case PGT_hl2_shadow:
679 if ( MFN_PINNED(x->smfn) )
680 mfn_list[count++] = x->smfn;
681 break;
682 case PGT_snapshot:
683 case PGT_writable_pred:
684 break;
685 default:
686 BUG();
688 }
689 }
691 while ( count )
692 {
693 shadow_unpin(mfn_list[--count]);
694 }
695 xfree(mfn_list);
696 }
698 /* Now free the pre-zero'ed pages from the domain */
699 list_for_each_safe(list_ent, tmp, &d->arch.free_shadow_frames)
700 {
701 struct page_info *page = list_entry(list_ent, struct page_info, list);
703 list_del(list_ent);
704 perfc_decr(free_l1_pages);
706 free_domheap_page(page);
707 }
709 shadow_audit(d, 0);
711 SH_VLOG("Free shadow table.");
712 }
714 void shadow_mode_init(void)
715 {
716 }
718 int _shadow_mode_refcounts(struct domain *d)
719 {
720 return shadow_mode_refcounts(d);
721 }
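/*
 * The monitor pagetable is the L2 actually loaded into CR3 for external-mode
 * guests: it carries the hypervisor mappings, the per-domain mappings and a
 * read-only view of the P2M table, while the linear and shadow-linear slots
 * are left empty until shadows exist.
 */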
723 static void alloc_monitor_pagetable(struct vcpu *v)
724 {
725 unsigned long mmfn;
726 l2_pgentry_t *mpl2e;
727 struct page_info *mmfn_info;
728 struct domain *d = v->domain;
729 int i;
731 ASSERT(pagetable_get_paddr(v->arch.monitor_table) == 0);
733 mmfn_info = alloc_domheap_page(NULL);
734 ASSERT(mmfn_info != NULL);
736 mmfn = page_to_mfn(mmfn_info);
737 mpl2e = (l2_pgentry_t *)map_domain_page_global(mmfn);
738 memset(mpl2e, 0, PAGE_SIZE);
740 memcpy(&mpl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
741 &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
742 HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
744 for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
745 mpl2e[l2_table_offset(PERDOMAIN_VIRT_START) + i] =
746 l2e_from_page(virt_to_page(d->arch.mm_perdomain_pt) + i,
747 __PAGE_HYPERVISOR);
749 // map the phys_to_machine map into the Read-Only MPT space for this domain
750 mpl2e[l2_table_offset(RO_MPT_VIRT_START)] =
751 l2e_from_paddr(pagetable_get_paddr(d->arch.phys_table),
752 __PAGE_HYPERVISOR);
754 // Don't (yet) have mappings for these...
755 // Don't want to accidentally see the idle_pg_table's linear mapping.
756 //
757 mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)] = l2e_empty();
758 mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] = l2e_empty();
760 v->arch.monitor_table = mk_pagetable(mmfn << PAGE_SHIFT);
761 v->arch.monitor_vtable = mpl2e;
762 }
764 /*
765 * Free the pages for monitor_table and hl2_table
766 */
767 void free_monitor_pagetable(struct vcpu *v)
768 {
769 l2_pgentry_t *mpl2e, hl2e, sl2e;
770 unsigned long mfn;
772 ASSERT( pagetable_get_paddr(v->arch.monitor_table) );
774 mpl2e = v->arch.monitor_vtable;
776 /*
777 * First get the mfn for hl2_table by looking at monitor_table
778 */
779 hl2e = mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)];
780 if ( l2e_get_flags(hl2e) & _PAGE_PRESENT )
781 {
782 mfn = l2e_get_pfn(hl2e);
783 ASSERT(mfn);
784 put_shadow_ref(mfn);
785 }
787 sl2e = mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)];
788 if ( l2e_get_flags(sl2e) & _PAGE_PRESENT )
789 {
790 mfn = l2e_get_pfn(sl2e);
791 ASSERT(mfn);
792 put_shadow_ref(mfn);
793 }
795 /*
796 * Then free monitor_table.
797 */
798 mfn = pagetable_get_pfn(v->arch.monitor_table);
799 unmap_domain_page_global(v->arch.monitor_vtable);
800 free_domheap_page(mfn_to_page(mfn));
802 v->arch.monitor_table = mk_pagetable(0);
803 v->arch.monitor_vtable = 0;
804 }
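/*
 * set_p2m_entry() installs a single pfn -> mfn translation in the domain's
 * phys_table (a 2-level table indexed by guest pfn), allocating and zeroing
 * an L1 page on demand when the covering L2 slot is not yet present.
 */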
806 int
807 set_p2m_entry(struct domain *d, unsigned long pfn, unsigned long mfn,
808 struct domain_mmap_cache *l2cache,
809 struct domain_mmap_cache *l1cache)
810 {
811 unsigned long tabpfn = pagetable_get_pfn(d->arch.phys_table);
812 l2_pgentry_t *l2, l2e;
813 l1_pgentry_t *l1;
814 struct page_info *l1page;
815 unsigned long va = pfn << PAGE_SHIFT;
817 ASSERT(tabpfn != 0);
818 ASSERT(shadow_lock_is_acquired(d));
820 l2 = map_domain_page_with_cache(tabpfn, l2cache);
821 l2e = l2[l2_table_offset(va)];
822 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
823 {
824 l1page = alloc_domheap_page(NULL);
825 if ( !l1page )
826 {
827 unmap_domain_page_with_cache(l2, l2cache);
828 return 0;
829 }
831 l1 = map_domain_page_with_cache(page_to_mfn(l1page), l1cache);
832 memset(l1, 0, PAGE_SIZE);
833 unmap_domain_page_with_cache(l1, l1cache);
835 l2e = l2e_from_page(l1page, __PAGE_HYPERVISOR);
836 l2[l2_table_offset(va)] = l2e;
837 }
838 unmap_domain_page_with_cache(l2, l2cache);
840 l1 = map_domain_page_with_cache(l2e_get_pfn(l2e), l1cache);
841 l1[l1_table_offset(va)] = l1e_from_pfn(mfn, __PAGE_HYPERVISOR);
842 unmap_domain_page_with_cache(l1, l1cache);
844 return 1;
845 }
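/*
 * alloc_p2m_table() builds the initial P2M for a translated guest by walking
 * the domain's page_list and xenpage_list and entering each frame's
 * machine-to-phys mapping via set_p2m_entry().
 */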
847 static int
848 alloc_p2m_table(struct domain *d)
849 {
850 struct list_head *list_ent;
851 struct page_info *page, *l2page;
852 l2_pgentry_t *l2;
853 unsigned long mfn, pfn;
854 struct domain_mmap_cache l1cache, l2cache;
856 l2page = alloc_domheap_page(NULL);
857 if ( l2page == NULL )
858 return 0;
860 domain_mmap_cache_init(&l1cache);
861 domain_mmap_cache_init(&l2cache);
863 d->arch.phys_table = mk_pagetable(page_to_maddr(l2page));
864 l2 = map_domain_page_with_cache(page_to_mfn(l2page), &l2cache);
865 memset(l2, 0, PAGE_SIZE);
866 unmap_domain_page_with_cache(l2, &l2cache);
868 list_ent = d->page_list.next;
869 while ( list_ent != &d->page_list )
870 {
871 page = list_entry(list_ent, struct page_info, list);
872 mfn = page_to_mfn(page);
873 pfn = get_gpfn_from_mfn(mfn);
874 ASSERT(pfn != INVALID_M2P_ENTRY);
875 ASSERT(pfn < (1u<<20));
877 set_p2m_entry(d, pfn, mfn, &l2cache, &l1cache);
879 list_ent = page->list.next;
880 }
882 list_ent = d->xenpage_list.next;
883 while ( list_ent != &d->xenpage_list )
884 {
885 page = list_entry(list_ent, struct page_info, list);
886 mfn = page_to_mfn(page);
887 pfn = get_gpfn_from_mfn(mfn);
888 if ( (pfn != INVALID_M2P_ENTRY) &&
889 (pfn < (1u<<20)) )
890 {
891 set_p2m_entry(d, pfn, mfn, &l2cache, &l1cache);
892 }
894 list_ent = page->list.next;
895 }
897 domain_mmap_cache_destroy(&l2cache);
898 domain_mmap_cache_destroy(&l1cache);
900 return 1;
901 }
903 static void
904 free_p2m_table(struct domain *d)
905 {
906 // uh, this needs some work... :)
907 BUG();
908 }
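/*
 * __shadow_mode_enable() switches domain d into the shadow mode(s) named by
 * 'mode', which must be a superset of the current mode. It re-points each
 * vcpu's guest/shadow/hl2/monitor table hooks for the new mode, then
 * allocates the shadow hash table, the log-dirty bitmap and/or the P2M table
 * as required by the newly enabled bits.
 */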
910 int __shadow_mode_enable(struct domain *d, unsigned int mode)
911 {
912 struct vcpu *v;
913 int new_modes = (mode & ~d->arch.shadow_mode);
915 if(!new_modes) /* Nothing to do - return success */
916 return 0;
918 // can't take anything away by calling this function.
919 ASSERT(!(d->arch.shadow_mode & ~mode));
921 for_each_vcpu(d, v)
922 {
923 invalidate_shadow_ldt(v);
925 // We need to set these up for __update_pagetables().
926 // See the comment there.
928 /*
929 * arch.guest_vtable
930 */
931 if ( v->arch.guest_vtable &&
932 (v->arch.guest_vtable != __linear_l2_table) )
933 {
934 unmap_domain_page_global(v->arch.guest_vtable);
935 }
936 if ( (mode & (SHM_translate | SHM_external)) == SHM_translate )
937 v->arch.guest_vtable = __linear_l2_table;
938 else
939 v->arch.guest_vtable = NULL;
941 /*
942 * arch.shadow_vtable
943 */
944 if ( v->arch.shadow_vtable &&
945 (v->arch.shadow_vtable != __shadow_linear_l2_table) )
946 {
947 unmap_domain_page_global(v->arch.shadow_vtable);
948 }
949 if ( !(mode & SHM_external) )
950 v->arch.shadow_vtable = __shadow_linear_l2_table;
951 else
952 v->arch.shadow_vtable = NULL;
954 /*
955 * arch.hl2_vtable
956 */
957 if ( v->arch.hl2_vtable &&
958 (v->arch.hl2_vtable != __linear_hl2_table) )
959 {
960 unmap_domain_page_global(v->arch.hl2_vtable);
961 }
962 if ( (mode & (SHM_translate | SHM_external)) == SHM_translate )
963 v->arch.hl2_vtable = __linear_hl2_table;
964 else
965 v->arch.hl2_vtable = NULL;
967 /*
968 * arch.monitor_table & arch.monitor_vtable
969 */
970 if ( v->arch.monitor_vtable )
971 {
972 free_monitor_pagetable(v);
973 }
974 if ( mode & SHM_external )
975 {
976 alloc_monitor_pagetable(v);
977 }
978 }
980 if ( new_modes & SHM_enable )
981 {
982 ASSERT( !d->arch.shadow_ht );
983 d->arch.shadow_ht = xmalloc_array(struct shadow_status, shadow_ht_buckets);
984 if ( d->arch.shadow_ht == NULL )
985 goto nomem;
987 memset(d->arch.shadow_ht, 0,
988 shadow_ht_buckets * sizeof(struct shadow_status));
989 }
991 if ( new_modes & SHM_log_dirty )
992 {
993 ASSERT( !d->arch.shadow_dirty_bitmap );
994 d->arch.shadow_dirty_bitmap_size =
995 (d->shared_info->arch.max_pfn + 63) & ~63;
996 d->arch.shadow_dirty_bitmap =
997 xmalloc_array(unsigned long, d->arch.shadow_dirty_bitmap_size /
998 (8 * sizeof(unsigned long)));
999 if ( d->arch.shadow_dirty_bitmap == NULL )
1001 d->arch.shadow_dirty_bitmap_size = 0;
1002 goto nomem;
1004 memset(d->arch.shadow_dirty_bitmap, 0,
1005 d->arch.shadow_dirty_bitmap_size/8);
1008 if ( new_modes & SHM_translate )
1010 if ( !(new_modes & SHM_external) )
1012 ASSERT( !pagetable_get_paddr(d->arch.phys_table) );
1013 if ( !alloc_p2m_table(d) )
1015 printk("alloc_p2m_table failed (out-of-memory?)\n");
1016 goto nomem;
1019 else
1021 // external guests provide their own memory for their P2M maps.
1022 //
1023 ASSERT(d == page_get_owner(mfn_to_page(pagetable_get_pfn(
1024 d->arch.phys_table))));
1028 // Get rid of any shadow pages from any previous shadow mode.
1029 //
1030 free_shadow_pages(d);
1032 d->arch.shadow_mode = mode;
1034 if ( shadow_mode_refcounts(d) )
1036 struct list_head *list_ent;
1037 struct page_info *page;
1039 /*
1040 * Tear down its counts by disassembling its page-table-based refcounts
1041 * Also remove CR3's gcount/tcount.
1042 * That leaves things like GDTs and LDTs and external refs intact.
1044 * Most pages will be writable tcount=0.
1045 * Some will still be L1 tcount=0 or L2 tcount=0.
1046 * Maybe some pages will be type none tcount=0.
1047 * Pages granted external writable refs (via grant tables?) will
1048 * still have a non-zero tcount. That's OK.
1050 * gcounts will generally be 1 for PGC_allocated.
1051 * GDTs and LDTs will have additional gcounts.
1052 * Any grant-table based refs will still be in the gcount.
1054 * We attempt to grab writable refs to each page thus setting its type
1055 * Immediately put back those type refs.
1057 * Assert that no pages are left with L1/L2/L3/L4 type.
1058 */
1059 audit_adjust_pgtables(d, -1, 1);
1062 for (list_ent = d->page_list.next; list_ent != &d->page_list;
1063 list_ent = page->list.next) {
1065 page = list_entry(list_ent, struct page_info, list);
1067 if ( !get_page_type(page, PGT_writable_page) )
1068 BUG();
1069 put_page_type(page);
1070 /*
1071 * We use tlbflush_timestamp as a back pointer to the smfn, and need to
1072 * clean it up.
1073 */
1074 if (shadow_mode_external(d))
1075 page->tlbflush_timestamp = 0;
1078 audit_adjust_pgtables(d, 1, 1);
1082 return 0;
1084 nomem:
1085 if ( (new_modes & SHM_enable) )
1087 xfree(d->arch.shadow_ht);
1088 d->arch.shadow_ht = NULL;
1090 if ( (new_modes & SHM_log_dirty) )
1092 xfree(d->arch.shadow_dirty_bitmap);
1093 d->arch.shadow_dirty_bitmap = NULL;
1095 if ( (new_modes & SHM_translate) && !(new_modes & SHM_external) &&
1096 pagetable_get_paddr(d->arch.phys_table) )
1098 free_p2m_table(d);
1100 return -ENOMEM;
1103 int shadow_mode_enable(struct domain *d, unsigned int mode)
1104 {
1105 int rc;
1106 shadow_lock(d);
1107 rc = __shadow_mode_enable(d, mode);
1108 shadow_unlock(d);
1109 return rc;
1110 }
1112 static void
1113 translate_l1pgtable(struct domain *d, l1_pgentry_t *p2m, unsigned long l1mfn)
1114 {
1115 int i;
1116 l1_pgentry_t *l1;
1118 l1 = map_domain_page(l1mfn);
1119 for (i = 0; i < L1_PAGETABLE_ENTRIES; i++)
1120 {
1121 if ( is_guest_l1_slot(i) &&
1122 (l1e_get_flags(l1[i]) & _PAGE_PRESENT) )
1123 {
1124 unsigned long mfn = l1e_get_pfn(l1[i]);
1125 unsigned long gpfn = mfn_to_gmfn(d, mfn);
1126 ASSERT(l1e_get_pfn(p2m[gpfn]) == mfn);
1127 l1[i] = l1e_from_pfn(gpfn, l1e_get_flags(l1[i]));
1128 }
1129 }
1130 unmap_domain_page(l1);
1131 }
1133 // This is not general enough to handle arbitrary pagetables
1134 // with shared L1 pages, etc., but it is sufficient for bringing
1135 // up dom0.
1136 //
1137 void
1138 translate_l2pgtable(struct domain *d, l1_pgentry_t *p2m, unsigned long l2mfn,
1139 unsigned int type)
1141 int i;
1142 l2_pgentry_t *l2;
1144 ASSERT(shadow_mode_translate(d) && !shadow_mode_external(d));
1146 l2 = map_domain_page(l2mfn);
1147 for (i = 0; i < L2_PAGETABLE_ENTRIES; i++)
1149 if ( is_guest_l2_slot(type, i) &&
1150 (l2e_get_flags(l2[i]) & _PAGE_PRESENT) )
1152 unsigned long mfn = l2e_get_pfn(l2[i]);
1153 unsigned long gpfn = mfn_to_gmfn(d, mfn);
1154 ASSERT(l1e_get_pfn(p2m[gpfn]) == mfn);
1155 l2[i] = l2e_from_pfn(gpfn, l2e_get_flags(l2[i]));
1156 translate_l1pgtable(d, p2m, mfn);
1159 unmap_domain_page(l2);
1162 static void free_shadow_ht_entries(struct domain *d)
1164 struct shadow_status *x, *n;
1166 SH_VLOG("freed tables count=%d l1=%d l2=%d",
1167 d->arch.shadow_page_count, perfc_value(shadow_l1_pages),
1168 perfc_value(shadow_l2_pages));
1170 n = d->arch.shadow_ht_extras;
1171 while ( (x = n) != NULL )
1173 d->arch.shadow_extras_count--;
1174 n = *((struct shadow_status **)(&x[shadow_ht_extra_size]));
1175 xfree(x);
1178 d->arch.shadow_ht_extras = NULL;
1179 d->arch.shadow_ht_free = NULL;
1181 ASSERT(d->arch.shadow_extras_count == 0);
1182 SH_VLOG("freed extras, now %d", d->arch.shadow_extras_count);
1184 if ( d->arch.shadow_dirty_bitmap != NULL )
1186 xfree(d->arch.shadow_dirty_bitmap);
1187 d->arch.shadow_dirty_bitmap = 0;
1188 d->arch.shadow_dirty_bitmap_size = 0;
1191 xfree(d->arch.shadow_ht);
1192 d->arch.shadow_ht = NULL;
1195 static void free_out_of_sync_entries(struct domain *d)
1197 struct out_of_sync_entry *x, *n;
1199 n = d->arch.out_of_sync_extras;
1200 while ( (x = n) != NULL )
1202 d->arch.out_of_sync_extras_count--;
1203 n = *((struct out_of_sync_entry **)(&x[out_of_sync_extra_size]));
1204 xfree(x);
1207 d->arch.out_of_sync_extras = NULL;
1208 d->arch.out_of_sync_free = NULL;
1209 d->arch.out_of_sync = NULL;
1211 ASSERT(d->arch.out_of_sync_extras_count == 0);
1212 FSH_LOG("freed extra out_of_sync entries, now %d",
1213 d->arch.out_of_sync_extras_count);
1216 void __shadow_mode_disable(struct domain *d)
1218 struct vcpu *v;
1219 #ifndef NDEBUG
1220 int i;
1221 #endif
1223 if ( unlikely(!shadow_mode_enabled(d)) )
1224 return;
1226 free_shadow_pages(d);
1227 free_writable_pte_predictions(d);
1229 #ifndef NDEBUG
1230 for ( i = 0; i < shadow_ht_buckets; i++ )
1232 if ( d->arch.shadow_ht[i].gpfn_and_flags != 0 )
1234 printk("%s: d->arch.shadow_ht[%x].gpfn_and_flags=%lx\n",
1235 __FILE__, i, d->arch.shadow_ht[i].gpfn_and_flags);
1236 BUG();
1239 #endif
1241 d->arch.shadow_mode = 0;
1243 free_shadow_ht_entries(d);
1244 free_out_of_sync_entries(d);
1246 for_each_vcpu(d, v)
1247 update_pagetables(v);
1250 static int shadow_mode_table_op(
1251 struct domain *d, dom0_shadow_control_t *sc)
1253 unsigned int op = sc->op;
1254 int i, rc = 0;
1255 struct vcpu *v;
1257 ASSERT(shadow_lock_is_acquired(d));
1259 SH_VLOG("shadow mode table op %lx %lx count %d",
1260 (unsigned long)pagetable_get_pfn(d->vcpu[0]->arch.guest_table), /* XXX SMP */
1261 (unsigned long)pagetable_get_pfn(d->vcpu[0]->arch.shadow_table), /* XXX SMP */
1262 d->arch.shadow_page_count);
1264 shadow_audit(d, 1);
1266 switch ( op )
1268 case DOM0_SHADOW_CONTROL_OP_FLUSH:
1269 free_shadow_pages(d);
1271 d->arch.shadow_fault_count = 0;
1272 d->arch.shadow_dirty_count = 0;
1274 break;
1276 case DOM0_SHADOW_CONTROL_OP_CLEAN:
1277 free_shadow_pages(d);
1279 sc->stats.fault_count = d->arch.shadow_fault_count;
1280 sc->stats.dirty_count = d->arch.shadow_dirty_count;
1282 d->arch.shadow_fault_count = 0;
1283 d->arch.shadow_dirty_count = 0;
1285 if ( (sc->dirty_bitmap == NULL) ||
1286 (d->arch.shadow_dirty_bitmap == NULL) )
1288 rc = -EINVAL;
1289 break;
1292 if(sc->pages > d->arch.shadow_dirty_bitmap_size)
1293 sc->pages = d->arch.shadow_dirty_bitmap_size;
1295 #define chunk (8*1024) /* Transfer and clear in 1kB chunks for L1 cache. */
1296 for ( i = 0; i < sc->pages; i += chunk )
1298 int bytes = ((((sc->pages - i) > chunk) ?
1299 chunk : (sc->pages - i)) + 7) / 8;
1301 if (copy_to_user(
1302 sc->dirty_bitmap + (i/(8*sizeof(unsigned long))),
1303 d->arch.shadow_dirty_bitmap +(i/(8*sizeof(unsigned long))),
1304 bytes))
1306 rc = -EINVAL;
1307 break;
1310 memset(
1311 d->arch.shadow_dirty_bitmap + (i/(8*sizeof(unsigned long))),
1312 0, bytes);
1315 break;
1317 case DOM0_SHADOW_CONTROL_OP_PEEK:
1318 sc->stats.fault_count = d->arch.shadow_fault_count;
1319 sc->stats.dirty_count = d->arch.shadow_dirty_count;
1321 if ( (sc->dirty_bitmap == NULL) ||
1322 (d->arch.shadow_dirty_bitmap == NULL) )
1324 rc = -EINVAL;
1325 break;
1328 if(sc->pages > d->arch.shadow_dirty_bitmap_size)
1329 sc->pages = d->arch.shadow_dirty_bitmap_size;
1331 if (copy_to_user(sc->dirty_bitmap,
1332 d->arch.shadow_dirty_bitmap, (sc->pages+7)/8))
1334 rc = -EINVAL;
1335 break;
1338 break;
1340 default:
1341 rc = -EINVAL;
1342 break;
1345 SH_VLOG("shadow mode table op : page count %d", d->arch.shadow_page_count);
1346 shadow_audit(d, 1);
1348 for_each_vcpu(d,v)
1349 __update_pagetables(v);
1351 return rc;
1354 int shadow_mode_control(struct domain *d, dom0_shadow_control_t *sc)
1356 unsigned int op = sc->op;
1357 int rc = 0;
1358 struct vcpu *v;
1360 if ( unlikely(d == current->domain) )
1362 DPRINTK("Don't try to do a shadow op on yourself!\n");
1363 return -EINVAL;
1366 domain_pause(d);
1368 shadow_lock(d);
1370 switch ( op )
1372 case DOM0_SHADOW_CONTROL_OP_OFF:
1373 if ( shadow_mode_enabled(d) )
1375 __shadow_sync_all(d);
1376 __shadow_mode_disable(d);
1378 break;
1380 case DOM0_SHADOW_CONTROL_OP_ENABLE_TEST:
1381 free_shadow_pages(d);
1382 rc = __shadow_mode_enable(d, SHM_enable);
1383 break;
1385 case DOM0_SHADOW_CONTROL_OP_ENABLE_LOGDIRTY:
1386 free_shadow_pages(d);
1387 rc = __shadow_mode_enable(
1388 d, d->arch.shadow_mode|SHM_enable|SHM_log_dirty);
1389 break;
1391 case DOM0_SHADOW_CONTROL_OP_ENABLE_TRANSLATE:
1392 free_shadow_pages(d);
1393 rc = __shadow_mode_enable(
1394 d, d->arch.shadow_mode|SHM_enable|SHM_refcounts|SHM_translate|SHM_wr_pt_pte);
1395 break;
1397 default:
1398 rc = shadow_mode_enabled(d) ? shadow_mode_table_op(d, sc) : -EINVAL;
1399 break;
1402 shadow_unlock(d);
1404 for_each_vcpu(d,v)
1405 update_pagetables(v);
1407 domain_unpause(d);
1409 return rc;
1412 unsigned long
1413 get_mfn_from_gpfn_foreign(struct domain *d, unsigned long gpfn)
1415 unsigned long va, tabpfn;
1416 l1_pgentry_t *l1, l1e;
1417 l2_pgentry_t *l2, l2e;
1419 ASSERT(shadow_mode_translate(d));
1421 perfc_incrc(get_mfn_from_gpfn_foreign);
1423 va = gpfn << PAGE_SHIFT;
1424 tabpfn = pagetable_get_pfn(d->arch.phys_table);
1425 l2 = map_domain_page(tabpfn);
1426 l2e = l2[l2_table_offset(va)];
1427 unmap_domain_page(l2);
1428 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
1430 printk("%s(d->id=%d, gpfn=%lx) => 0 l2e=%" PRIpte "\n",
1431 __func__, d->domain_id, gpfn, l2e_get_intpte(l2e));
1432 return INVALID_MFN;
1434 l1 = map_domain_page(l2e_get_pfn(l2e));
1435 l1e = l1[l1_table_offset(va)];
1436 unmap_domain_page(l1);
1438 #if 0
1439 printk("%s(d->id=%d, gpfn=%lx) => %lx tabpfn=%lx l2e=%lx l1tab=%lx, l1e=%lx\n",
1440 __func__, d->domain_id, gpfn, l1_pgentry_val(l1e) >> PAGE_SHIFT, tabpfn, l2e, l1tab, l1e);
1441 #endif
1443 if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) )
1445 printk("%s(d->id=%d, gpfn=%lx) => 0 l1e=%" PRIpte "\n",
1446 __func__, d->domain_id, gpfn, l1e_get_intpte(l1e));
1447 return INVALID_MFN;
1450 return l1e_get_pfn(l1e);
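/*
 * An "hl2" table shadows a guest L2 one level down: for each guest L2 slot it
 * holds an L1 entry mapping the machine frame referenced by that slot, which
 * is what backs the guest-linear page table mapping in translate mode.
 * shadow_hl2_table() builds and initialises the hl2 for <gpfn, gmfn, smfn>.
 */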
1453 static unsigned long
1454 shadow_hl2_table(struct domain *d, unsigned long gpfn, unsigned long gmfn,
1455 unsigned long smfn)
1457 unsigned long hl2mfn;
1458 l1_pgentry_t *hl2;
1459 l2_pgentry_t *gpgd;
1460 int limit;
1461 int x;
1463 ASSERT(PGT_base_page_table == PGT_l2_page_table);
1465 if ( unlikely(!(hl2mfn = alloc_shadow_page(d, gpfn, gmfn, PGT_hl2_shadow))) )
1467 printk("Couldn't alloc an HL2 shadow for pfn=%lx mfn=%lx\n",
1468 gpfn, gmfn);
1469 BUG(); /* XXX Deal gracefully with failure. */
1472 SH_VVLOG("shadow_hl2_table(gpfn=%lx, gmfn=%lx, smfn=%lx) => %lx",
1473 gpfn, gmfn, smfn, hl2mfn);
1474 perfc_incrc(shadow_hl2_table_count);
1476 hl2 = map_domain_page(hl2mfn);
1478 if ( shadow_mode_external(d) )
1479 limit = L2_PAGETABLE_ENTRIES;
1480 else
1481 limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE;
1483 memset(hl2, 0, limit * sizeof(l1_pgentry_t));
1485 if ( !shadow_mode_external(d) )
1487 memset(&hl2[DOMAIN_ENTRIES_PER_L2_PAGETABLE], 0,
1488 HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
1490 // Set up easy access to the GL2, SL2, and HL2 frames.
1491 //
1492 hl2[l2_table_offset(LINEAR_PT_VIRT_START)] =
1493 l1e_from_pfn(gmfn, __PAGE_HYPERVISOR);
1494 hl2[l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
1495 l1e_from_pfn(smfn, __PAGE_HYPERVISOR);
1496 hl2[l2_table_offset(PERDOMAIN_VIRT_START)] =
1497 l1e_from_pfn(hl2mfn, __PAGE_HYPERVISOR);
1500 gpgd = map_domain_page(gmfn);
1501 for (x = 0; x < DOMAIN_ENTRIES_PER_L2_PAGETABLE; x++)
1502 validate_hl2e_change(d, gpgd[x], &hl2[x]);
1503 unmap_domain_page(gpgd);
1505 unmap_domain_page(hl2);
1507 return hl2mfn;
1510 /*
1511 * This could take and use a snapshot, and validate the entire page at
1512 * once, or it could continue to fault in entries one at a time...
1513 * Might be worth investigating...
1514 */
1515 static unsigned long shadow_l2_table(
1516 struct domain *d, unsigned long gpfn, unsigned long gmfn)
1518 unsigned long smfn;
1519 l2_pgentry_t *spl2e;
1520 int i;
1522 SH_VVLOG("shadow_l2_table(gpfn=%lx, gmfn=%lx)", gpfn, gmfn);
1524 perfc_incrc(shadow_l2_table_count);
1526 if ( unlikely(!(smfn = alloc_shadow_page(d, gpfn, gmfn, PGT_l2_shadow))) )
1528 printk("Couldn't alloc an L2 shadow for pfn=%lx mfn=%lx\n",
1529 gpfn, gmfn);
1530 BUG(); /* XXX Deal gracefully with failure. */
1533 spl2e = (l2_pgentry_t *)map_domain_page(smfn);
1535 /* Install hypervisor and 2x linear p.t. mappings. */
1536 if ( (PGT_base_page_table == PGT_l2_page_table) &&
1537 !shadow_mode_external(d) )
1539 /*
1540 * We could proactively fill in PDEs for pages that are already
1541 * shadowed *and* where the guest PDE has _PAGE_ACCESSED set
1542 * (restriction required for coherence of the accessed bit). However,
1543 * we tried it and it didn't help performance. This is simpler.
1544 */
1545 memset(spl2e, 0, DOMAIN_ENTRIES_PER_L2_PAGETABLE*sizeof(l2_pgentry_t));
1547 /* Install hypervisor and 2x linear p.t. mappings. */
1548 memcpy(&spl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
1549 &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
1550 HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
1552 spl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
1553 l2e_from_pfn(smfn, __PAGE_HYPERVISOR);
1555 for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
1556 spl2e[l2_table_offset(PERDOMAIN_VIRT_START) + i] =
1557 l2e_from_page(virt_to_page(page_get_owner(mfn_to_page(gmfn))->
1558 arch.mm_perdomain_pt) + i,
1559 __PAGE_HYPERVISOR);
1561 if ( shadow_mode_translate(d) ) // NB: not external
1563 unsigned long hl2mfn;
1565 spl2e[l2_table_offset(RO_MPT_VIRT_START)] =
1566 l2e_from_paddr(pagetable_get_paddr(d->arch.phys_table),
1567 __PAGE_HYPERVISOR);
1569 if ( unlikely(!(hl2mfn = __shadow_status(d, gpfn, PGT_hl2_shadow))) )
1570 hl2mfn = shadow_hl2_table(d, gpfn, gmfn, smfn);
1572 // shadow_mode_translate (but not external) sl2 tables hold a
1573 // ref to their hl2.
1574 //
1575 if ( !get_shadow_ref(hl2mfn) )
1576 BUG();
1578 spl2e[l2_table_offset(LINEAR_PT_VIRT_START)] =
1579 l2e_from_pfn(hl2mfn, __PAGE_HYPERVISOR);
1581 else
1582 spl2e[l2_table_offset(LINEAR_PT_VIRT_START)] =
1583 l2e_from_pfn(gmfn, __PAGE_HYPERVISOR);
1585 else
1587 memset(spl2e, 0, L2_PAGETABLE_ENTRIES*sizeof(l2_pgentry_t));
1590 unmap_domain_page(spl2e);
1592 SH_VLOG("shadow_l2_table(%lx -> %lx)", gmfn, smfn);
1593 return smfn;
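/*
 * shadow_map_l1_into_current_l2() handles the case where the guest L1
 * covering va has no shadow wired into the current shadow L2: it finds or
 * creates that L1 shadow, links it in, and for a freshly created shadow
 * pre-populates entries around the faulting index, recording the min/max
 * range touched in tlbflush_timestamp.
 */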
1596 void shadow_map_l1_into_current_l2(unsigned long va)
1598 struct vcpu *v = current;
1599 struct domain *d = v->domain;
1600 l1_pgentry_t *gpl1e, *spl1e;
1601 l2_pgentry_t gl2e, sl2e;
1602 unsigned long gl1pfn, gl1mfn, sl1mfn;
1603 int i, init_table = 0;
1605 __guest_get_l2e(v, va, &gl2e);
1606 ASSERT(l2e_get_flags(gl2e) & _PAGE_PRESENT);
1607 gl1pfn = l2e_get_pfn(gl2e);
1609 if ( !(sl1mfn = __shadow_status(d, gl1pfn, PGT_l1_shadow)) )
1611 /* This L1 is NOT already shadowed so we need to shadow it. */
1612 SH_VVLOG("4a: l1 not shadowed");
1614 gl1mfn = gmfn_to_mfn(d, gl1pfn);
1615 if ( unlikely(!VALID_MFN(gl1mfn)) )
1617 // Attempt to use an invalid pfn as an L1 page.
1618 // XXX this needs to be more graceful!
1619 BUG();
1622 if ( unlikely(!(sl1mfn =
1623 alloc_shadow_page(d, gl1pfn, gl1mfn, PGT_l1_shadow))) )
1625 printk("Couldn't alloc an L1 shadow for pfn=%lx mfn=%lx\n",
1626 gl1pfn, gl1mfn);
1627 BUG(); /* XXX Need to deal gracefully with failure. */
1630 perfc_incrc(shadow_l1_table_count);
1631 init_table = 1;
1633 else
1635 /* This L1 is shadowed already, but the L2 entry is missing. */
1636 SH_VVLOG("4b: was shadowed, l2 missing (%lx)", sl1mfn);
1639 #ifndef NDEBUG
1641 l2_pgentry_t old_sl2e;
1642 __shadow_get_l2e(v, va, &old_sl2e);
1643 ASSERT( !(l2e_get_flags(old_sl2e) & _PAGE_PRESENT) );
1645 #endif
1647 if ( !get_shadow_ref(sl1mfn) )
1648 BUG();
1649 l2pde_general(d, &gl2e, &sl2e, sl1mfn);
1650 __guest_set_l2e(v, va, gl2e);
1651 __shadow_set_l2e(v, va, sl2e);
1653 if ( init_table )
1655 l1_pgentry_t sl1e;
1656 int index = l1_table_offset(va);
1657 int min = 1, max = 0;
1659 gpl1e = &(linear_pg_table[l1_linear_offset(va) &
1660 ~(L1_PAGETABLE_ENTRIES-1)]);
1662 spl1e = &(shadow_linear_pg_table[l1_linear_offset(va) &
1663 ~(L1_PAGETABLE_ENTRIES-1)]);
1665 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
1667 l1pte_propagate_from_guest(d, gpl1e[i], &sl1e);
1668 if ( (l1e_get_flags(sl1e) & _PAGE_PRESENT) &&
1669 unlikely(!shadow_get_page_from_l1e(sl1e, d)) )
1670 sl1e = l1e_empty();
1671 if ( l1e_get_flags(sl1e) == 0 )
1673 // First copy entries from 0 until first invalid.
1674 // Then copy entries from index until first invalid.
1675 //
1676 if ( i < index ) {
1677 i = index - 1;
1678 continue;
1680 break;
1682 spl1e[i] = sl1e;
1683 if ( unlikely(i < min) )
1684 min = i;
1685 if ( likely(i > max) )
1686 max = i;
1687 set_guest_back_ptr(d, sl1e, sl1mfn, i);
1690 mfn_to_page(sl1mfn)->tlbflush_timestamp =
1691 SHADOW_ENCODE_MIN_MAX(min, max);
1695 void shadow_invlpg(struct vcpu *v, unsigned long va)
1697 struct domain *d = v->domain;
1698 l1_pgentry_t gpte, spte;
1700 ASSERT(shadow_mode_enabled(d));
1702 shadow_lock(d);
1704 __shadow_sync_va(v, va);
1706 // XXX mafetter: will need to think about 4MB pages...
1708 // It's not strictly necessary to update the shadow here,
1709 // but it might save a fault later.
1710 //
1711 if (__copy_from_user(&gpte, &linear_pg_table[va >> PAGE_SHIFT],
1712 sizeof(gpte))) {
1713 perfc_incrc(shadow_invlpg_faults);
1714 shadow_unlock(d);
1715 return;
1717 l1pte_propagate_from_guest(d, gpte, &spte);
1718 shadow_set_l1e(va, spte, 1);
1720 shadow_unlock(d);
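/*
 * Out-of-sync entries track guest page-table pages that the guest may be
 * writing directly; shadow_alloc_oos_entry() hands out one tracking node,
 * growing the free pool in blocks of out_of_sync_extra_size when it runs dry.
 */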
1723 struct out_of_sync_entry *
1724 shadow_alloc_oos_entry(struct domain *d)
1726 struct out_of_sync_entry *f, *extra;
1727 unsigned size, i;
1729 if ( unlikely(d->arch.out_of_sync_free == NULL) )
1731 FSH_LOG("Allocate more fullshadow tuple blocks.");
1733 size = sizeof(void *) + (out_of_sync_extra_size * sizeof(*f));
1734 extra = xmalloc_bytes(size);
1736 /* XXX Should be more graceful here. */
1737 if ( extra == NULL )
1738 BUG();
1740 memset(extra, 0, size);
1742 /* Record the allocation block so it can be correctly freed later. */
1743 d->arch.out_of_sync_extras_count++;
1744 *((struct out_of_sync_entry **)&extra[out_of_sync_extra_size]) =
1745 d->arch.out_of_sync_extras;
1746 d->arch.out_of_sync_extras = &extra[0];
1748 /* Thread a free chain through the newly-allocated nodes. */
1749 for ( i = 0; i < (out_of_sync_extra_size - 1); i++ )
1750 extra[i].next = &extra[i+1];
1751 extra[i].next = NULL;
1753 /* Add the new nodes to the free list. */
1754 d->arch.out_of_sync_free = &extra[0];
1757 /* Allocate a new node from the quicklist. */
1758 f = d->arch.out_of_sync_free;
1759 d->arch.out_of_sync_free = f->next;
1761 return f;
1764 static inline unsigned long
1765 shadow_make_snapshot(
1766 struct domain *d, unsigned long gpfn, unsigned long gmfn)
1768 unsigned long smfn, sl1mfn = 0;
1769 void *original, *snapshot;
1770 u32 min_max = 0;
1771 int min, max, length;
1773 if ( test_and_set_bit(_PGC_out_of_sync, &mfn_to_page(gmfn)->count_info) )
1775 ASSERT(__shadow_status(d, gpfn, PGT_snapshot));
1776 return SHADOW_SNAPSHOT_ELSEWHERE;
1779 perfc_incrc(shadow_make_snapshot);
1781 if ( unlikely(!(smfn = alloc_shadow_page(d, gpfn, gmfn, PGT_snapshot))) )
1783 printk("Couldn't alloc fullshadow snapshot for pfn=%lx mfn=%lx!\n"
1784 "Dom%d snapshot_count_count=%d\n",
1785 gpfn, gmfn, d->domain_id, d->arch.snapshot_page_count);
1786 BUG(); /* XXX FIXME: try a shadow flush to free up some memory. */
1789 if ( !get_shadow_ref(smfn) )
1790 BUG();
1792 if ( shadow_mode_refcounts(d) &&
1793 (shadow_max_pgtable_type(d, gpfn, &sl1mfn) == PGT_l1_shadow) )
1794 min_max = mfn_to_page(sl1mfn)->tlbflush_timestamp;
1795 mfn_to_page(smfn)->tlbflush_timestamp = min_max;
1797 min = SHADOW_MIN(min_max);
1798 max = SHADOW_MAX(min_max);
1799 length = max - min + 1;
1800 perfc_incr_histo(snapshot_copies, length, PT_UPDATES);
1802 min *= sizeof(l1_pgentry_t);
1803 length *= sizeof(l1_pgentry_t);
1805 original = map_domain_page(gmfn);
1806 snapshot = map_domain_page(smfn);
1807 memcpy(snapshot + min, original + min, length);
1808 unmap_domain_page(original);
1809 unmap_domain_page(snapshot);
1811 return smfn;
1814 static void
1815 shadow_free_snapshot(struct domain *d, struct out_of_sync_entry *entry)
1817 void *snapshot;
1819 if ( entry->snapshot_mfn == SHADOW_SNAPSHOT_ELSEWHERE )
1820 return;
1822 // Clear the out_of_sync bit.
1823 //
1824 clear_bit(_PGC_out_of_sync, &mfn_to_page(entry->gmfn)->count_info);
1826 // XXX Need to think about how to protect the domain's
1827 // information less expensively.
1828 //
1829 snapshot = map_domain_page(entry->snapshot_mfn);
1830 memset(snapshot, 0, PAGE_SIZE);
1831 unmap_domain_page(snapshot);
1833 put_shadow_ref(entry->snapshot_mfn);
1836 struct out_of_sync_entry *
1837 __shadow_mark_mfn_out_of_sync(struct vcpu *v, unsigned long gpfn,
1838 unsigned long mfn)
1840 struct domain *d = v->domain;
1841 struct page_info *page = mfn_to_page(mfn);
1842 struct out_of_sync_entry *entry = shadow_alloc_oos_entry(d);
1844 ASSERT(shadow_lock_is_acquired(d));
1845 ASSERT(mfn_valid(mfn));
1847 #ifndef NDEBUG
1849 u32 type = page->u.inuse.type_info & PGT_type_mask;
1850 if ( shadow_mode_refcounts(d) )
1852 ASSERT(type == PGT_writable_page);
1854 else
1856 ASSERT(type && (type < PGT_l4_page_table));
1859 #endif
1861 FSH_LOG("%s(gpfn=%lx, mfn=%lx) c=%08x t=%08lx", __func__,
1862 gpfn, mfn, page->count_info, page->u.inuse.type_info);
1864 // XXX this will require some more thought... Cross-domain sharing and
1865 // modification of page tables? Hmm...
1866 //
1867 if ( d != page_get_owner(page) )
1868 BUG();
1870 perfc_incrc(shadow_mark_mfn_out_of_sync_calls);
1872 entry->v = v;
1873 entry->gpfn = gpfn;
1874 entry->gmfn = mfn;
1875 entry->writable_pl1e = -1;
1877 #if SHADOW_DEBUG
1878 mark_shadows_as_reflecting_snapshot(d, gpfn);
1879 #endif
1881 // increment guest's ref count to represent the entry in the
1882 // full shadow out-of-sync list.
1883 //
1884 get_page(page, d);
1886 return entry;
1889 struct out_of_sync_entry *
1890 shadow_mark_mfn_out_of_sync(struct vcpu *v, unsigned long gpfn,
1891 unsigned long mfn)
1893 struct out_of_sync_entry *entry =
1894 __shadow_mark_mfn_out_of_sync(v, gpfn, mfn);
1895 struct domain *d = v->domain;
1897 entry->snapshot_mfn = shadow_make_snapshot(d, gpfn, mfn);
1898 // Add to the out-of-sync list
1899 //
1900 entry->next = d->arch.out_of_sync;
1901 d->arch.out_of_sync = entry;
1903 return entry;
1906 void shadow_mark_va_out_of_sync(
1907 struct vcpu *v, unsigned long gpfn, unsigned long mfn, unsigned long va)
1909 struct out_of_sync_entry *entry =
1910 __shadow_mark_mfn_out_of_sync(v, gpfn, mfn);
1911 l2_pgentry_t sl2e;
1912 struct domain *d = v->domain;
1914 // We need the address of shadow PTE that maps @va.
1915 // It might not exist yet. Make sure it's there.
1916 //
1917 __shadow_get_l2e(v, va, &sl2e);
1918 if ( !(l2e_get_flags(sl2e) & _PAGE_PRESENT) )
1920 // either this L1 isn't shadowed yet, or the shadow isn't linked into
1921 // the current L2.
1922 shadow_map_l1_into_current_l2(va);
1923 __shadow_get_l2e(v, va, &sl2e);
1925 ASSERT(l2e_get_flags(sl2e) & _PAGE_PRESENT);
1927 entry->snapshot_mfn = shadow_make_snapshot(d, gpfn, mfn);
1928 // NB: this is stored as a machine address.
1929 entry->writable_pl1e =
1930 l2e_get_paddr(sl2e) | (sizeof(l1_pgentry_t) * l1_table_offset(va));
1931 ASSERT( !(entry->writable_pl1e & (sizeof(l1_pgentry_t)-1)) );
1932 entry->va = va;
1934 // Increment shadow's page count to represent the reference
1935 // inherent in entry->writable_pl1e
1936 //
1937 if ( !get_shadow_ref(l2e_get_pfn(sl2e)) )
1938 BUG();
1940 // Add to the out-of-sync list
1941 //
1942 entry->next = d->arch.out_of_sync;
1943 d->arch.out_of_sync = entry;
1945 FSH_LOG("%s(va=%lx -> writable_pl1e=%lx)",
1946 __func__, va, entry->writable_pl1e);
1949 /*
1950 * Returns 1 if the snapshot for @gmfn exists and its @index'th entry matches.
1951 * Returns 0 otherwise.
1952 */
1953 static int snapshot_entry_matches(
1954 struct domain *d, l1_pgentry_t *guest_pt,
1955 unsigned long gpfn, unsigned index)
1957 unsigned long smfn = __shadow_status(d, gpfn, PGT_snapshot);
1958 l1_pgentry_t *snapshot, gpte; // could be L1s or L2s or ...
1959 int entries_match;
1961 perfc_incrc(snapshot_entry_matches_calls);
1963 if ( !smfn )
1964 return 0;
1966 snapshot = map_domain_page(smfn);
1968 if (__copy_from_user(&gpte, &guest_pt[index],
1969 sizeof(gpte))) {
1970 unmap_domain_page(snapshot);
1971 return 0;
1974 // This could probably be smarter, but this is sufficient for
1975 // our current needs.
1976 //
1977 entries_match = !l1e_has_changed(gpte, snapshot[index],
1978 PAGE_FLAG_MASK);
1980 unmap_domain_page(snapshot);
1982 #ifdef PERF_COUNTERS
1983 if ( entries_match )
1984 perfc_incrc(snapshot_entry_matches_true);
1985 #endif
1987 return entries_match;
1990 /*
1991 * Returns 1 if va's shadow mapping is out-of-sync.
1992 * Returns 0 otherwise.
1993 */
1994 int __shadow_out_of_sync(struct vcpu *v, unsigned long va)
1996 struct domain *d = v->domain;
1997 unsigned long l2mfn = pagetable_get_pfn(v->arch.guest_table);
1998 unsigned long l2pfn = mfn_to_gmfn(d, l2mfn);
1999 l2_pgentry_t l2e;
2000 unsigned long l1pfn, l1mfn;
2002 ASSERT(shadow_lock_is_acquired(d));
2003 ASSERT(VALID_M2P(l2pfn));
2005 perfc_incrc(shadow_out_of_sync_calls);
2007 if ( page_out_of_sync(mfn_to_page(l2mfn)) &&
2008 !snapshot_entry_matches(d, (l1_pgentry_t *)v->arch.guest_vtable,
2009 l2pfn, l2_table_offset(va)) )
2010 return 1;
2012 __guest_get_l2e(v, va, &l2e);
2013 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
2014 return 0;
2016 l1pfn = l2e_get_pfn(l2e);
2017 l1mfn = gmfn_to_mfn(d, l1pfn);
2019 // If the l1 pfn is invalid, it can't be out of sync...
2020 if ( !VALID_MFN(l1mfn) )
2021 return 0;
2023 if ( page_out_of_sync(mfn_to_page(l1mfn)) &&
2024 !snapshot_entry_matches(
2025 d, &linear_pg_table[l1_linear_offset(va) & ~(L1_PAGETABLE_ENTRIES-1)],
2026 l1pfn, l1_table_offset(va)) )
2027 return 1;
2029 return 0;
2032 #define GPFN_TO_GPTEPAGE(_gpfn) ((_gpfn) / (PAGE_SIZE / sizeof(l1_pgentry_t)))
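/*
 * Writable-PTE prediction: the shadow hash can also hold PGT_writable_pred
 * entries, keyed by the guest page containing a PTE (GPFN_TO_GPTEPAGE) and
 * carrying a saturating score in the PGT_score bits. The helpers below look
 * up, boost or decay a prediction so the write-access removal code can guess
 * which page-table page is likely to map a given frame writably.
 */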
2033 static inline unsigned long
2034 predict_writable_pte_page(struct domain *d, unsigned long gpfn)
2036 return __shadow_status(d, GPFN_TO_GPTEPAGE(gpfn), PGT_writable_pred);
2039 static inline void
2040 increase_writable_pte_prediction(struct domain *d, unsigned long gpfn, unsigned long prediction)
2042 unsigned long score = prediction & PGT_score_mask;
2043 int create = (score == 0);
2045 // saturating addition
2046 score = (score + (1u << PGT_score_shift)) & PGT_score_mask;
2047 score = score ? score : PGT_score_mask;
2049 prediction = (prediction & PGT_mfn_mask) | score;
2051 //printk("increase gpfn=%lx pred=%lx create=%d\n", gpfn, prediction, create);
2052 set_shadow_status(d, GPFN_TO_GPTEPAGE(gpfn), 0, prediction, PGT_writable_pred);
2054 if ( create )
2055 perfc_incr(writable_pte_predictions);
2058 static inline void
2059 decrease_writable_pte_prediction(struct domain *d, unsigned long gpfn, unsigned long prediction)
2061 unsigned long score = prediction & PGT_score_mask;
2062 ASSERT(score);
2064 // divide score by 2... We don't like bad predictions.
2065 //
2066 score = (score >> 1) & PGT_score_mask;
2068 prediction = (prediction & PGT_mfn_mask) | score;
2070 //printk("decrease gpfn=%lx pred=%lx score=%lx\n", gpfn, prediction, score);
2072 if ( score )
2073 set_shadow_status(d, GPFN_TO_GPTEPAGE(gpfn), 0, prediction, PGT_writable_pred);
2074 else
2076 delete_shadow_status(d, GPFN_TO_GPTEPAGE(gpfn), 0, PGT_writable_pred);
2077 perfc_decr(writable_pte_predictions);
2081 static void
2082 free_writable_pte_predictions(struct domain *d)
2084 int i;
2085 struct shadow_status *x;
2087 for ( i = 0; i < shadow_ht_buckets; i++ )
2089 u32 count;
2090 unsigned long *gpfn_list;
2092 /* Skip empty buckets. */
2093 if ( d->arch.shadow_ht[i].gpfn_and_flags == 0 )
2094 continue;
2096 count = 0;
2097 for ( x = &d->arch.shadow_ht[i]; x != NULL; x = x->next )
2098 if ( (x->gpfn_and_flags & PGT_type_mask) == PGT_writable_pred )
2099 count++;
2101 gpfn_list = xmalloc_array(unsigned long, count);
2102 count = 0;
2103 for ( x = &d->arch.shadow_ht[i]; x != NULL; x = x->next )
2104 if ( (x->gpfn_and_flags & PGT_type_mask) == PGT_writable_pred )
2105 gpfn_list[count++] = x->gpfn_and_flags & PGT_mfn_mask;
2107 while ( count )
2109 count--;
2110 /* delete_shadow_status() may do a shadow_audit(), so we need to
2111 * keep an accurate count of writable_pte_predictions to keep it
2112 * happy.
2113 */
2114 delete_shadow_status(d, gpfn_list[count], 0, PGT_writable_pred);
2115 perfc_decr(writable_pte_predictions);
2118 xfree(gpfn_list);
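
free_writable_pte_predictions() deliberately makes two passes over each bucket: it first copies the matching keys into a scratch array and only then calls delete_shadow_status(), since deleting while walking could invalidate the chain links being followed (and, as the comment notes, the audit code wants the perf counter kept in step). A generic sketch of that collect-then-delete pattern, with types local to the example:

#include <stdlib.h>

struct node {
    unsigned long key;
    int doomed;             /* marked for deletion */
    struct node *next;
};

/* Collect the keys first, delete afterwards, so the walk never follows
 * links that the delete routine may have rewritten. */
static void reap(struct node *head, void (*delete_by_key)(unsigned long key))
{
    size_t n = 0;
    for (struct node *p = head; p; p = p->next)
        if (p->doomed)
            n++;
    if (n == 0)
        return;

    unsigned long *keys = malloc(n * sizeof(*keys));
    if (!keys)
        return;             /* sketch only: bail out on allocation failure */

    size_t i = 0;
    for (struct node *p = head; p; p = p->next)
        if (p->doomed)
            keys[i++] = p->key;

    while (i--)
        delete_by_key(keys[i]);
    free(keys);
}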
2122 static int fix_entry(
2123 struct domain *d,
2124 l1_pgentry_t *pt, u32 *found, int is_l1_shadow, u32 max_refs_to_find)
2126 l1_pgentry_t old = *pt;
2127 l1_pgentry_t new = old;
2129 l1e_remove_flags(new,_PAGE_RW);
2130 if ( is_l1_shadow && !shadow_get_page_from_l1e(new, d) )
2131 BUG();
2132 (*found)++;
2133 *pt = new;
2134 if ( is_l1_shadow )
2135 shadow_put_page_from_l1e(old, d);
2137 return (*found == max_refs_to_find);
2140 static u32 remove_all_write_access_in_ptpage(
2141 struct domain *d, unsigned long pt_pfn, unsigned long pt_mfn,
2142 unsigned long readonly_gpfn, unsigned long readonly_gmfn,
2143 u32 max_refs_to_find, unsigned long prediction)
2145 l1_pgentry_t *pt = map_domain_page(pt_mfn);
2146 l1_pgentry_t match;
2147 unsigned long flags = _PAGE_RW | _PAGE_PRESENT;
2148 int i;
2149 u32 found = 0;
2150 int is_l1_shadow =
2151 ((mfn_to_page(pt_mfn)->u.inuse.type_info & PGT_type_mask) ==
2152 PGT_l1_shadow);
2154 match = l1e_from_pfn(readonly_gmfn, flags);
2156 if ( shadow_mode_external(d) ) {
2157 i = (mfn_to_page(readonly_gmfn)->u.inuse.type_info & PGT_va_mask)
2158 >> PGT_va_shift;
2160 if ( (i >= 0 && i < L1_PAGETABLE_ENTRIES) &&
2161 !l1e_has_changed(pt[i], match, flags) &&
2162 fix_entry(d, &pt[i], &found, is_l1_shadow, max_refs_to_find) &&
2163 !prediction )
2164 goto out;
2167 for (i = 0; i < L1_PAGETABLE_ENTRIES; i++)
2169 if ( unlikely(!l1e_has_changed(pt[i], match, flags)) &&
2170 fix_entry(d, &pt[i], &found, is_l1_shadow, max_refs_to_find) )
2171 break;
2174 out:
2175 unmap_domain_page(pt);
2177 return found;
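
remove_all_write_access_in_ptpage() first probes the single slot suggested by the PGT_va back-pointer (in external mode) and only then falls back to scanning all 1024 entries. The same two-step strategy in standalone form (names and sizes are local to this sketch):

#define NSLOTS 1024

/* Return the index of the first slot equal to 'match', trying a remembered
 * hint before the full linear scan; -1 if nothing matches. */
static int find_slot(const unsigned long table[NSLOTS], unsigned long match,
                     int hint /* -1 when no hint is available */)
{
    if (hint >= 0 && hint < NSLOTS && table[hint] == match)
        return hint;
    for (int i = 0; i < NSLOTS; i++)
        if (table[i] == match)
            return i;
    return -1;
}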
2180 int shadow_remove_all_write_access(
2181 struct domain *d, unsigned long readonly_gpfn, unsigned long readonly_gmfn)
2183 int i;
2184 struct shadow_status *a;
2185 u32 found = 0, write_refs;
2186 unsigned long predicted_smfn;
2188 ASSERT(shadow_lock_is_acquired(d));
2189 ASSERT(VALID_MFN(readonly_gmfn));
2191 perfc_incrc(remove_write_access);
2193 // If it's not a writable page, then no writable refs can be outstanding.
2194 //
2195 if ( (mfn_to_page(readonly_gmfn)->u.inuse.type_info & PGT_type_mask) !=
2196 PGT_writable_page )
2198 perfc_incrc(remove_write_not_writable);
2199 return 1;
2202 // How many outstanding writable PTEs for this page are there?
2203 //
2204 write_refs =
2205 (mfn_to_page(readonly_gmfn)->u.inuse.type_info & PGT_count_mask);
2206 if ( write_refs && MFN_PINNED(readonly_gmfn) )
2208 write_refs--;
2211 if ( write_refs == 0 )
2213 perfc_incrc(remove_write_no_work);
2214 return 1;
2217 if ( shadow_mode_external(d) ) {
2218 if (--write_refs == 0)
2219 return 0;
2221 // Use the back pointer to locate the shadow page that can contain
2222 // the PTE of interest
2223 if ( (predicted_smfn = mfn_to_page(readonly_gmfn)->tlbflush_timestamp) ) {
2224 found += remove_all_write_access_in_ptpage(
2225 d, predicted_smfn, predicted_smfn, readonly_gpfn, readonly_gmfn, write_refs, 0);
2226 if ( found == write_refs )
2227 return 0;
2231 // Search all the shadow L1 page tables...
2232 //
2233 for (i = 0; i < shadow_ht_buckets; i++)
2235 a = &d->arch.shadow_ht[i];
2236 while ( a && a->gpfn_and_flags )
2238 if ( (a->gpfn_and_flags & PGT_type_mask) == PGT_l1_shadow )
2240 found += remove_all_write_access_in_ptpage(d, a->gpfn_and_flags & PGT_mfn_mask, a->smfn, readonly_gpfn, readonly_gmfn, write_refs - found, a->gpfn_and_flags & PGT_mfn_mask);
2241 if ( found == write_refs )
2242 return 0;
2245 a = a->next;
2249 FSH_LOG("%s: looking for %d refs, found %d refs",
2250 __func__, write_refs, found);
2252 return 0;
2255 static u32 remove_all_access_in_page(
2256 struct domain *d, unsigned long l1mfn, unsigned long forbidden_gmfn)
2258 l1_pgentry_t *pl1e = map_domain_page(l1mfn);
2259 l1_pgentry_t match, ol2e;
2260 unsigned long flags = _PAGE_PRESENT;
2261 int i;
2262 u32 count = 0;
2263 int is_l1_shadow =
2264 ((mfn_to_page(l1mfn)->u.inuse.type_info & PGT_type_mask) ==
2265 PGT_l1_shadow);
2267 match = l1e_from_pfn(forbidden_gmfn, flags);
2269 for (i = 0; i < L1_PAGETABLE_ENTRIES; i++)
2271 if ( l1e_has_changed(pl1e[i], match, flags) )
2272 continue;
2274 ol2e = pl1e[i];
2275 pl1e[i] = l1e_empty();
2276 count++;
2278 if ( is_l1_shadow )
2279 shadow_put_page_from_l1e(ol2e, d);
2280 else /* must be an hl2 page */
2281 put_page(mfn_to_page(forbidden_gmfn));
2284 unmap_domain_page(pl1e);
2286 return count;
2289 u32 shadow_remove_all_access(struct domain *d, unsigned long forbidden_gmfn)
2291 int i;
2292 struct shadow_status *a;
2293 u32 count = 0;
2295 if ( unlikely(!shadow_mode_enabled(d)) )
2296 return 0;
2298 ASSERT(shadow_lock_is_acquired(d));
2299 perfc_incrc(remove_all_access);
2301 for (i = 0; i < shadow_ht_buckets; i++)
2303 a = &d->arch.shadow_ht[i];
2304 while ( a && a->gpfn_and_flags )
2306 switch (a->gpfn_and_flags & PGT_type_mask)
2308 case PGT_l1_shadow:
2309 case PGT_l2_shadow:
2310 case PGT_l3_shadow:
2311 case PGT_l4_shadow:
2312 case PGT_hl2_shadow:
2313 count += remove_all_access_in_page(d, a->smfn, forbidden_gmfn);
2314 break;
2315 case PGT_snapshot:
2316 case PGT_writable_pred:
2317 // these can't hold refs to the forbidden page
2318 break;
2319 default:
2320 BUG();
2323 a = a->next;
2327 return count;
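
Both shadow_remove_all_write_access() and shadow_remove_all_access() fall back to the same exhaustive structure: walk every bucket of the per-domain shadow hash, following each chain until an entry with a zero key is reached. A reduced sketch of that traversal (types are local to the example; as in the Xen table, the first entry of each bucket is embedded in the array itself):

struct entry {
    unsigned long key_and_flags;   /* zero key terminates the chain */
    struct entry *next;
};

#define NBUCKETS 64                /* stand-in for shadow_ht_buckets */

static unsigned visit_all(struct entry table[NBUCKETS],
                          unsigned (*fn)(struct entry *e))
{
    unsigned total = 0;
    for (int i = 0; i < NBUCKETS; i++)
        for (struct entry *e = &table[i]; e && e->key_and_flags; e = e->next)
            total += fn(e);        /* e.g. scrub one shadow page */
    return total;
}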
2330 static int resync_all(struct domain *d, u32 stype)
2332 struct out_of_sync_entry *entry;
2333 unsigned i;
2334 unsigned long smfn;
2335 void *guest, *shadow, *snapshot;
2336 int need_flush = 0, external = shadow_mode_external(d);
2337 int unshadow;
2338 int changed;
2339 u32 min_max_shadow, min_max_snapshot;
2340 int min_shadow, max_shadow, min_snapshot, max_snapshot;
2341 struct vcpu *v;
2343 ASSERT(shadow_lock_is_acquired(d));
2345 for ( entry = d->arch.out_of_sync; entry; entry = entry->next)
2347 if ( entry->snapshot_mfn == SHADOW_SNAPSHOT_ELSEWHERE )
2348 continue;
2350 smfn = __shadow_status(d, entry->gpfn, stype);
2352 if ( !smfn )
2354 // For heavy weight shadows: no need to update refcounts if
2355 // there's no shadow page.
2356 //
2357 if ( shadow_mode_refcounts(d) )
2358 continue;
2360 // For light weight shadows: we only need to resync the refcounts to
2361 // the new contents of the guest page if it has the right
2362 // page type.
2363 //
2364 if ( stype != ( mfn_to_page(entry->gmfn)->u.inuse.type_info & PGT_type_mask) )
2365 continue;
2368 FSH_LOG("resyncing t=%08x gpfn=%lx gmfn=%lx smfn=%lx snapshot_mfn=%lx",
2369 stype, entry->gpfn, entry->gmfn, smfn, entry->snapshot_mfn);
2371 // Compare guest's new contents to its snapshot, validating
2372 // and updating its shadow as appropriate.
2373 //
2374 guest = map_domain_page(entry->gmfn);
2375 snapshot = map_domain_page(entry->snapshot_mfn);
2377 if ( smfn )
2378 shadow = map_domain_page(smfn);
2379 else
2380 shadow = NULL;
2382 unshadow = 0;
2384 switch ( stype ) {
2385 case PGT_l1_shadow:
2387 l1_pgentry_t *guest1 = guest;
2388 l1_pgentry_t *shadow1 = shadow;
2389 l1_pgentry_t *snapshot1 = snapshot;
2390 int unshadow_l1 = 0;
2392 ASSERT(shadow_mode_write_l1(d) ||
2393 shadow_mode_write_all(d) || shadow_mode_wr_pt_pte(d));
2395 if ( !shadow_mode_refcounts(d) )
2396 revalidate_l1(d, guest1, snapshot1);
2398 if ( !smfn )
2399 break;
2401 min_max_shadow = mfn_to_page(smfn)->tlbflush_timestamp;
2402 min_shadow = SHADOW_MIN(min_max_shadow);
2403 max_shadow = SHADOW_MAX(min_max_shadow);
2405 min_max_snapshot =
2406 mfn_to_page(entry->snapshot_mfn)->tlbflush_timestamp;
2407 min_snapshot = SHADOW_MIN(min_max_snapshot);
2408 max_snapshot = SHADOW_MAX(min_max_snapshot);
2410 changed = 0;
2412 for ( i = min_shadow; i <= max_shadow; i++ )
2414 if ( (i < min_snapshot) || (i > max_snapshot) ||
2415 l1e_has_changed(guest1[i], snapshot1[i], PAGE_FLAG_MASK) )
2417 int error;
2419 error = validate_pte_change(d, guest1[i], &shadow1[i]);
2420 if ( error == -1 )
2421 unshadow_l1 = 1;
2422 else {
2423 need_flush |= error;
2424 set_guest_back_ptr(d, shadow1[i], smfn, i);
2427 // can't update snapshots of linear page tables -- they
2428 // are used multiple times...
2429 //
2430 // snapshot[i] = new_pte;
2431 changed++;
2434 perfc_incrc(resync_l1);
2435 perfc_incr_histo(wpt_updates, changed, PT_UPDATES);
2436 perfc_incr_histo(l1_entries_checked, max_shadow - min_shadow + 1, PT_UPDATES);
2437 if (unshadow_l1) {
2438 l2_pgentry_t l2e;
2440 __shadow_get_l2e(entry->v, entry->va, &l2e);
2441 if (l2e_get_flags(l2e) & _PAGE_PRESENT) {
2442 put_shadow_ref(l2e_get_pfn(l2e));
2443 l2e = l2e_empty();
2444 __shadow_set_l2e(entry->v, entry->va, l2e);
2446 if (entry->v == current)
2447 need_flush = 1;
2451 break;
2453 case PGT_l2_shadow:
2455 int max = -1;
2457 l2_pgentry_t *guest2 = guest;
2458 l2_pgentry_t *shadow2 = shadow;
2459 l2_pgentry_t *snapshot2 = snapshot;
2461 ASSERT(shadow_mode_write_all(d) || shadow_mode_wr_pt_pte(d));
2462 BUG_ON(!shadow_mode_refcounts(d)); // not yet implemented
2464 changed = 0;
2465 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
2467 l2_pgentry_t new_pde = guest2[i];
2469 if ( !is_guest_l2_slot(0,i) && !external )
2470 continue;
2472 if ( l2e_has_changed(new_pde, snapshot2[i], PAGE_FLAG_MASK))
2474 need_flush |= validate_pde_change(d, new_pde, &shadow2[i]);
2476 // can't update snapshots of linear page tables -- they
2477 // are used multiple times...
2478 //
2479 // snapshot[i] = new_pde;
2481 changed++;
2483 if ( l2e_get_intpte(new_pde) != 0 ) /* FIXME: check flags? */
2484 max = i;
2486 // XXX - This hack works for linux guests.
2487 // Need a better solution long term.
2488 if ( !(l2e_get_flags(new_pde) & _PAGE_PRESENT) &&
2489 unlikely(l2e_get_intpte(new_pde) != 0) &&
2490 !unshadow && MFN_PINNED(smfn) )
2491 unshadow = 1;
2493 if ( max == -1 )
2494 unshadow = 1;
2495 perfc_incrc(resync_l2);
2496 perfc_incr_histo(shm_l2_updates, changed, PT_UPDATES);
2497 break;
2499 case PGT_hl2_shadow:
2501 l2_pgentry_t *guest2 = guest;
2502 l2_pgentry_t *snapshot2 = snapshot;
2503 l1_pgentry_t *shadow2 = shadow;
2505 ASSERT(shadow_mode_write_all(d) || shadow_mode_wr_pt_pte(d));
2506 BUG_ON(!shadow_mode_refcounts(d)); // not yet implemented
2508 changed = 0;
2509 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
2511 l2_pgentry_t new_pde = guest2[i];
2513 if ( !is_guest_l2_slot(0, i) && !external )
2514 continue;
2516 if ( l2e_has_changed(new_pde, snapshot2[i], PAGE_FLAG_MASK) )
2518 need_flush |= validate_hl2e_change(d, new_pde, &shadow2[i]);
2520 // can't update snapshots of linear page tables -- they
2521 // are used multiple times...
2522 //
2523 // snapshot[i] = new_pde;
2525 changed++;
2528 perfc_incrc(resync_hl2);
2529 perfc_incr_histo(shm_hl2_updates, changed, PT_UPDATES);
2530 break;
2532 default:
2533 BUG();
2536 if ( smfn )
2537 unmap_domain_page(shadow);
2538 unmap_domain_page(snapshot);
2539 unmap_domain_page(guest);
2541 if ( unlikely(unshadow) )
2543 for_each_vcpu(d, v)
2544 if(smfn == pagetable_get_pfn(v->arch.shadow_table))
2545 return need_flush;
2546 perfc_incrc(unshadow_l2_count);
2547 shadow_unpin(smfn);
2548 if ( unlikely(shadow_mode_external(d)) )
2550 unsigned long hl2mfn;
2552 if ( (hl2mfn = __shadow_status(d, entry->gpfn, PGT_hl2_shadow)) &&
2553 MFN_PINNED(hl2mfn) )
2554 shadow_unpin(hl2mfn);
2559 return need_flush;
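
The PGT_l1_shadow case above restricts its scan to [min_shadow, max_shadow] and compares against [min_snapshot, max_snapshot], each (min, max) pair being packed into the 32-bit tlbflush_timestamp field of the relevant page. The exact encoding is whatever SHADOW_MIN()/SHADOW_MAX() define in the Xen headers; purely for illustration, a 16/16 split would look like this:

#include <stdint.h>

/* Hypothetical 16/16 packing of (min, max) L1 slot indices into one u32. */
static inline uint32_t pack_minmax(unsigned min, unsigned max)
{
    return ((uint32_t)min << 16) | (uint16_t)max;
}
static inline unsigned unpack_min(uint32_t mm) { return mm >> 16; }
static inline unsigned unpack_max(uint32_t mm) { return mm & 0xffffu; }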
2562 void __shadow_sync_all(struct domain *d)
2564 struct out_of_sync_entry *entry;
2565 int need_flush = 0;
2566 l1_pgentry_t *ppte, opte, npte;
2567 cpumask_t other_vcpus_mask;
2569 perfc_incrc(shadow_sync_all);
2571 ASSERT(shadow_lock_is_acquired(d));
2573 // First, remove all write permissions to the page tables
2574 //
2575 for ( entry = d->arch.out_of_sync; entry; entry = entry->next)
2577 // Skip entries that have low bits set... Those aren't
2578 // real PTEs.
2579 //
2580 if ( entry->writable_pl1e & (sizeof(l1_pgentry_t)-1) )
2581 continue;
2583 ppte = (l1_pgentry_t *)(
2584 (char *)map_domain_page(entry->writable_pl1e >> PAGE_SHIFT) +
2585 (entry->writable_pl1e & ~PAGE_MASK));
2586 opte = npte = *ppte;
2587 l1e_remove_flags(npte, _PAGE_RW);
2589 if ( (l1e_get_flags(npte) & _PAGE_PRESENT) &&
2590 !shadow_get_page_from_l1e(npte, d) )
2591 BUG();
2592 *ppte = npte;
2593 set_guest_back_ptr(d, npte, (entry->writable_pl1e) >> PAGE_SHIFT,
2594 (entry->writable_pl1e & ~PAGE_MASK)/sizeof(l1_pgentry_t));
2595 shadow_put_page_from_l1e(opte, d);
2597 unmap_domain_page(ppte);
2600 /* Other VCPUs mustn't use the revoked writable mappings. */
2601 other_vcpus_mask = d->domain_dirty_cpumask;
2602 cpu_clear(smp_processor_id(), other_vcpus_mask);
2603 flush_tlb_mask(other_vcpus_mask);
2605 /* Flush ourself later. */
2606 need_flush = 1;
2608 /* Second, resync all L1 pages, then L2 pages, etc... */
2609 need_flush |= resync_all(d, PGT_l1_shadow);
2610 if ( shadow_mode_translate(d) )
2611 need_flush |= resync_all(d, PGT_hl2_shadow);
2612 need_flush |= resync_all(d, PGT_l2_shadow);
2614 if ( need_flush && !unlikely(shadow_mode_external(d)) )
2615 local_flush_tlb();
2617 free_out_of_sync_state(d);
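
The flush sequence in __shadow_sync_all() excludes the local CPU: other dirty VCPUs are IPI-flushed immediately so they cannot keep using the just-revoked writable mappings, while the local TLB flush is deferred until after the resync (via need_flush). Modelling the mask arithmetic with a plain bitmask, purely for illustration:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint32_t dirty_cpus = 0x2d;      /* CPUs 0,2,3,5 have run this domain */
    int self = 2;                    /* the CPU doing the sync            */
    uint32_t others = dirty_cpus & ~(1u << self);
    printf("flush-now mask: %#x; CPU %d flushes after the resync\n",
           (unsigned)others, self);
    return 0;
}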
2620 int shadow_fault(unsigned long va, struct cpu_user_regs *regs)
2622 l1_pgentry_t gpte, spte, orig_gpte;
2623 struct vcpu *v = current;
2624 struct domain *d = v->domain;
2625 l2_pgentry_t gpde;
2627 spte = l1e_empty();
2629 SH_VVLOG("shadow_fault( va=%lx, code=%lu )",
2630 va, (unsigned long)regs->error_code);
2631 perfc_incrc(shadow_fault_calls);
2633 check_pagetable(v, "pre-sf");
2635 /*
2636 * Don't let someone else take the guest's table pages out-of-sync.
2637 */
2638 shadow_lock(d);
2640 /*
2641 * STEP 1. Resynchronise the guest's mapping of va, in case this
2642 * fault was caused by a page table entry that had been allowed
2643 * to go out-of-sync.
2644 */
2645 __shadow_sync_va(v, va);
2647 /*
2648 * STEP 2. Check the guest PTE.
2649 */
2650 __guest_get_l2e(v, va, &gpde);
2651 if ( unlikely(!(l2e_get_flags(gpde) & _PAGE_PRESENT)) )
2653 SH_VVLOG("shadow_fault - EXIT: L2 not present (%x)",
2654 l2e_get_intpte(gpde));
2655 perfc_incrc(shadow_fault_bail_pde_not_present);
2656 goto fail;
2659 // This can't fault because we hold the shadow lock and we've ensured that
2660 // the mapping is in-sync, so the check of the PDE's present bit, above,
2661 // covers this access.
2662 //
2663 orig_gpte = gpte = linear_pg_table[l1_linear_offset(va)];
2664 if ( unlikely(!(l1e_get_flags(gpte) & _PAGE_PRESENT)) )
2666 SH_VVLOG("shadow_fault - EXIT: gpte not present (%" PRIpte ") (gpde %" PRIpte ")",
2667 l1e_get_intpte(gpte),
2668 l2e_get_intpte(gpde));
2669 perfc_incrc(shadow_fault_bail_pte_not_present);
2670 goto fail;
2673 /* Write fault? */
2674 if ( regs->error_code & 2 )
2676 int allow_writes = 0;
2678 if ( unlikely(!(l1e_get_flags(gpte) & _PAGE_RW)) )
2680 if ( shadow_mode_page_writable(va, regs, l1e_get_pfn(gpte)) )
2682 allow_writes = 1;
2683 l1e_add_flags(gpte, _PAGE_RW);
2685 else
2687 /* Write fault on a read-only mapping. */
2688 SH_VVLOG("shadow_fault - EXIT: wr fault on RO page (%" PRIpte ")",
2689 l1e_get_intpte(gpte));
2690 perfc_incrc(shadow_fault_bail_ro_mapping);
2691 goto fail;
2694 else if ( unlikely(!shadow_mode_wr_pt_pte(d) && mfn_is_page_table(l1e_get_pfn(gpte))) )
2696 SH_LOG("l1pte_write_fault: no write access to page table page");
2697 domain_crash_synchronous();
2700 /* User access violation in guest? */
2701 if ( unlikely((regs->error_code & 4) &&
2702 !(l1e_get_flags(gpte) & _PAGE_USER)))
2704 SH_VVLOG("shadow_fault - EXIT: wr fault on super page (%" PRIpte ")",
2705 l1e_get_intpte(gpte));
2706 goto fail;
2710 if ( unlikely(!l1pte_write_fault(v, &gpte, &spte, va)) )
2712 SH_VVLOG("shadow_fault - EXIT: l1pte_write_fault failed");
2713 perfc_incrc(write_fault_bail);
2714 shadow_unlock(d);
2715 return 0;
2718 if ( allow_writes )
2719 l1e_remove_flags(gpte, _PAGE_RW);
2721 else
2723 /* Read-protection violation in guest? */
2724 if ( unlikely((regs->error_code & 1) ))
2726 SH_VVLOG("shadow_fault - EXIT: read fault on super page (%" PRIpte ")",
2727 l1e_get_intpte(gpte));
2728 goto fail;
2733 if ( !l1pte_read_fault(d, &gpte, &spte) )
2735 SH_VVLOG("shadow_fault - EXIT: l1pte_read_fault failed");
2736 perfc_incrc(read_fault_bail);
2737 shadow_unlock(d);
2738 return 0;
2742 /*
2743 * STEP 3. Write the modified shadow PTE and guest PTE back to the tables.
2744 */
2745 if ( l1e_has_changed(orig_gpte, gpte, PAGE_FLAG_MASK) )
2747 /* XXX Watch out for read-only L2 entries! (not used in Linux). */
2748 if ( unlikely(__copy_to_user(&linear_pg_table[l1_linear_offset(va)],
2749 &gpte, sizeof(gpte))) )
2751 printk("%s() failed, crashing domain %d "
2752 "due to a read-only L2 page table (gpde=%" PRIpte "), va=%lx\n",
2753 __func__,d->domain_id, l2e_get_intpte(gpde), va);
2754 domain_crash_synchronous();
2757 __mark_dirty(d, gmfn_to_mfn(d, l2e_get_pfn(gpde)));
2760 shadow_set_l1e(va, spte, 1);
2762 perfc_incrc(shadow_fault_fixed);
2763 d->arch.shadow_fault_count++;
2765 shadow_unlock(d);
2767 check_pagetable(v, "post-sf");
2768 return EXCRET_fault_fixed;
2770 fail:
2771 shadow_unlock(d);
2772 return 0;
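
shadow_fault() above branches on the hardware page-fault error code: bit 1 indicates a write access, bit 2 a fault taken in user mode, and bit 0 a protection fault on a present page. A standalone restatement of those tests with locally defined names (the numeric values are the architectural x86 error-code bits):

/* Architectural x86 page-fault error-code bits, as tested in shadow_fault().
 * The macro names are local to this sketch. */
#define PF_PRESENT  0x1   /* protection fault on a present page      */
#define PF_WRITE    0x2   /* the faulting access was a write         */
#define PF_USER     0x4   /* the fault was raised while in user mode */

static int is_write_fault(unsigned long error_code) { return !!(error_code & PF_WRITE); }
static int is_user_fault(unsigned long error_code)  { return !!(error_code & PF_USER); }
static int was_present(unsigned long error_code)    { return !!(error_code & PF_PRESENT); }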
2775 void shadow_l1_normal_pt_update(
2776 struct domain *d,
2777 unsigned long pa, l1_pgentry_t gpte,
2778 struct domain_mmap_cache *cache)
2780 unsigned long sl1mfn;
2781 l1_pgentry_t *spl1e, spte;
2783 shadow_lock(d);
2785 sl1mfn = __shadow_status(current->domain, pa >> PAGE_SHIFT, PGT_l1_shadow);
2786 if ( sl1mfn )
2788 SH_VVLOG("shadow_l1_normal_pt_update pa=%p, gpte=%" PRIpte,
2789 (void *)pa, l1e_get_intpte(gpte));
2790 l1pte_propagate_from_guest(current->domain, gpte, &spte);
2792 spl1e = map_domain_page_with_cache(sl1mfn, cache);
2793 spl1e[(pa & ~PAGE_MASK) / sizeof(l1_pgentry_t)] = spte;
2794 unmap_domain_page_with_cache(spl1e, cache);
2797 shadow_unlock(d);
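
The update hook above (and its L2 counterpart that follows) locates the shadow slot to rewrite purely from the physical address of the guest write: the page offset of pa divided by the entry size gives the index within the page-table page. A quick standalone illustration, assuming 4-byte non-PAE entries:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint32_t pa = 0x001ff7a4;              /* guest writes a PTE here */
    unsigned slot = (pa & 0xfffu) / 4;     /* 4-byte l1/l2 entries    */
    printf("pa %#x falls in slot %u of its page-table page\n",
           (unsigned)pa, slot);
    return 0;
}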
2800 void shadow_l2_normal_pt_update(
2801 struct domain *d,
2802 unsigned long pa, l2_pgentry_t gpde,
2803 struct domain_mmap_cache *cache)
2805 unsigned long sl2mfn, hl2mfn;
2806 l2_pgentry_t *spl2e;
2807 l1_pgentry_t *hl2e;
2809 shadow_lock(d);
2811 sl2mfn = __shadow_status(current->domain, pa >> PAGE_SHIFT, PGT_l2_shadow);
2812 if ( sl2mfn )
2814 SH_VVLOG("shadow_l2_normal_pt_update pa=%p, gpde=%" PRIpte,
2815 (void *)pa, l2e_get_intpte(gpde));
2816 spl2e = map_domain_page_with_cache(sl2mfn, cache);
2817 validate_pde_change(d, gpde,
2818 &spl2e[(pa & ~PAGE_MASK) / sizeof(l2_pgentry_t)]);
2819 unmap_domain_page_with_cache(spl2e, cache);
2821 hl2mfn = __shadow_status(current->domain, pa >> PAGE_SHIFT,
2822 PGT_hl2_shadow);
2823 if ( hl2mfn )
2825 hl2e = map_domain_page(hl2mfn);
2826 validate_hl2e_change(d, gpde,
2827 &hl2e[(pa & ~PAGE_MASK) / sizeof(l1_pgentry_t)]);
2828 unmap_domain_page(hl2e);
2831 shadow_unlock(d);
2834 #if CONFIG_PAGING_LEVELS >= 3
2835 void shadow_l3_normal_pt_update(
2836 struct domain *d,
2837 unsigned long pa, l3_pgentry_t gpde,
2838 struct domain_mmap_cache *cache)
2840 BUG(); // not yet implemented
2842 #endif
2844 #if CONFIG_PAGING_LEVELS >= 4
2845 void shadow_l4_normal_pt_update(
2846 struct domain *d,
2847 unsigned long pa, l4_pgentry_t gpde,
2848 struct domain_mmap_cache *cache)
2850 BUG(); // not yet implemented
2852 #endif
2854 int shadow_do_update_va_mapping(unsigned long va,
2855 l1_pgentry_t val,
2856 struct vcpu *v)
2858 struct domain *d = v->domain;
2859 l1_pgentry_t spte;
2860 int rc = 0;
2862 shadow_lock(d);
2864 // This is actually overkill - we don't need to sync the L1 itself,
2865 // just everything involved in getting to this L1 (i.e. we need
2866 // linear_pg_table[l1_linear_offset(va)] to be in sync)...
2867 //
2868 __shadow_sync_va(v, va);
2870 l1pte_propagate_from_guest(d, val, &spte);
2871 shadow_set_l1e(va, spte, 0);
2873 /*
2874 * If we're in log-dirty mode then we need to note that we've updated
2875 * the PTE in the PT-holding page. We need the machine frame number
2876 * for this.
2877 */
2878 __mark_dirty(d, va_to_l1mfn(v, va));
2880 shadow_unlock(d);
2882 return rc;
2886 /*
2887 * What lives where in the 32-bit address space in the various shadow modes,
2888 * and what it uses to get/maintain that mapping.
2889 *
2890 * SHADOW MODE:      none            enable           translate         external
2891 *
2892 * 4KB things:
2893 * guest_vtable      lin_l2          mapped per gl2   lin_l2 via hl2    mapped per gl2
2894 * shadow_vtable     n/a             sh_lin_l2        sh_lin_l2         mapped per gl2
2895 * hl2_vtable        n/a             n/a              lin_hl2 via hl2   mapped per gl2
2896 * monitor_vtable    n/a             n/a              n/a               mapped once
2897 *
2898 * 4MB things:
2899 * guest_linear      lin via gl2     lin via gl2      lin via hl2       lin via hl2
2900 * shadow_linear     n/a             sh_lin via sl2   sh_lin via sl2    sh_lin via sl2
2901 * monitor_linear    n/a             n/a              n/a               ???
2902 * perdomain         perdomain       perdomain        perdomain         perdomain
2903 * R/O M2P           R/O M2P         R/O M2P          n/a               n/a
2904 * R/W M2P           R/W M2P         R/W M2P          R/W M2P           R/W M2P
2905 * P2M               n/a             n/a              R/O M2P           R/O M2P
2906 *
2907 * NB:
2908 * update_pagetables(), __update_pagetables(), shadow_mode_enable(),
2909 * shadow_l2_table(), shadow_hl2_table(), and alloc_monitor_pagetable()
2910 * all play a part in maintaining these mappings.
2911 */
2912 void __update_pagetables(struct vcpu *v)
2914 struct domain *d = v->domain;
2915 unsigned long gmfn = pagetable_get_pfn(v->arch.guest_table);
2916 unsigned long gpfn = mfn_to_gmfn(d, gmfn);
2917 unsigned long smfn, hl2mfn, old_smfn;
2918 int need_sync = 0;
2920 int max_mode = ( shadow_mode_external(d) ? SHM_external
2921 : shadow_mode_translate(d) ? SHM_translate
2922 : shadow_mode_enabled(d) ? SHM_enable
2923 : 0 );
2925 ASSERT( ! IS_INVALID_M2P_ENTRY(gpfn) );
2926 ASSERT( max_mode );
2928 /*
2929 * arch.guest_vtable
2930 */
2931 if ( max_mode & (SHM_enable | SHM_external) )
2933 if ( likely(v->arch.guest_vtable != NULL) )
2934 unmap_domain_page_global(v->arch.guest_vtable);
2935 v->arch.guest_vtable = map_domain_page_global(gmfn);
2938 /*
2939 * arch.shadow_table
2940 */
2941 if ( unlikely(!(smfn = __shadow_status(d, gpfn, PGT_base_page_table))) )
2942 smfn = shadow_l2_table(d, gpfn, gmfn);
2943 else
2945 /*
2946 * Move the sync later in order to avoid this smfn being
2947 * unshadowed in the meantime.
2948 */
2949 need_sync = 1;
2951 if ( !get_shadow_ref(smfn) )
2952 BUG();
2953 old_smfn = pagetable_get_pfn(v->arch.shadow_table);
2954 v->arch.shadow_table = mk_pagetable(smfn << PAGE_SHIFT);
2955 if ( old_smfn )
2956 put_shadow_ref(old_smfn);
2958 SH_VVLOG("__update_pagetables(gmfn=%lx, smfn=%lx)", gmfn, smfn);
2960 /*
2961 * arch.shadow_vtable
2962 */
2963 if ( max_mode == SHM_external )
2965 if ( v->arch.shadow_vtable )
2966 unmap_domain_page_global(v->arch.shadow_vtable);
2967 v->arch.shadow_vtable = map_domain_page_global(smfn);
2970 /*
2971 * arch.hl2_vtable
2972 */
2974 // if max_mode == SHM_translate, then the hl2 is already installed
2975 // correctly in its smfn, and there's nothing to do.
2976 //
2977 if ( max_mode == SHM_external )
2979 if ( unlikely(!(hl2mfn = __shadow_status(d, gpfn, PGT_hl2_shadow))) )
2980 hl2mfn = shadow_hl2_table(d, gpfn, gmfn, smfn);
2981 if ( v->arch.hl2_vtable )
2982 unmap_domain_page_global(v->arch.hl2_vtable);
2983 v->arch.hl2_vtable = map_domain_page_global(hl2mfn);
2986 /*
2987 * fixup pointers in monitor table, as necessary
2988 */
2989 if ( max_mode == SHM_external )
2991 l2_pgentry_t *mpl2e = v->arch.monitor_vtable;
2992 l2_pgentry_t old_hl2e = mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)];
2993 l2_pgentry_t old_sl2e = mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)];
2995 ASSERT( shadow_mode_translate(d) );
2997 if ( !get_shadow_ref(hl2mfn) )
2998 BUG();
2999 mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)] =
3000 l2e_from_pfn(hl2mfn, __PAGE_HYPERVISOR);
3001 if ( l2e_get_flags(old_hl2e) & _PAGE_PRESENT )
3002 put_shadow_ref(l2e_get_pfn(old_hl2e));
3004 if ( !get_shadow_ref(smfn) )
3005 BUG();
3006 mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
3007 l2e_from_pfn(smfn, __PAGE_HYPERVISOR);
3008 if ( l2e_get_flags(old_sl2e) & _PAGE_PRESENT )
3009 put_shadow_ref(l2e_get_pfn(old_sl2e));
3011 // XXX - maybe this can be optimized somewhat??
3012 local_flush_tlb();
3015 if(likely(need_sync))
3016 shadow_sync_all(d);
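
Note the ordering when v->arch.shadow_table is switched above: the reference on the new smfn is taken before the reference on old_smfn is dropped, so a table that is being re-selected never sees its count transiently reach zero. The same pattern in miniature, with plain counters standing in for get_shadow_ref()/put_shadow_ref():

/* Swap which object a slot refers to without letting a still-live object's
 * reference count touch zero: acquire the new reference first. */
static void retarget(unsigned int *new_count, unsigned int *old_count)
{
    ++*new_count;              /* get_shadow_ref(new smfn)         */
    if (old_count)
        --*old_count;          /* put_shadow_ref(old smfn), if any */
}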
3019 void clear_all_shadow_status(struct domain *d)
3021 shadow_lock(d);
3022 free_shadow_pages(d);
3023 free_shadow_ht_entries(d);
3024 d->arch.shadow_ht =
3025 xmalloc_array(struct shadow_status, shadow_ht_buckets);
3026 if ( d->arch.shadow_ht == NULL ) {
3027 printk("clear all shadow status:xmalloc fail\n");
3028 domain_crash_synchronous();
3030 memset(d->arch.shadow_ht, 0,
3031 shadow_ht_buckets * sizeof(struct shadow_status));
3033 free_out_of_sync_entries(d);
3034 shadow_unlock(d);
3037 /************************************************************************/
3038 /************************************************************************/
3039 /************************************************************************/
3041 #if SHADOW_DEBUG
3043 // The following is entirely for _check_pagetable()'s benefit.
3044 // _check_pagetable() wants to know whether a given entry in a
3045 // shadow page table is supposed to be the shadow of the guest's
3046 // current entry, or the shadow of the entry held in the snapshot
3047 // taken above.
3048 //
3049 // Here, we mark all currently existing entries as reflecting
3050 // the snapshot, above. All other places in xen that update
3051 // the shadow will keep the shadow in sync with the guest's
3052 // entries (via l1pte_propagate_from_guest and friends), which clear
3053 // the SHADOW_REFLECTS_SNAPSHOT bit.
3054 //
3055 static void
3056 mark_shadows_as_reflecting_snapshot(struct domain *d, unsigned long gpfn)
3058 unsigned long smfn;
3059 l1_pgentry_t *l1e;
3060 l2_pgentry_t *l2e;
3061 unsigned i;
3063 if ( (smfn = __shadow_status(d, gpfn, PGT_l1_shadow)) )
3065 l1e = map_domain_page(smfn);
3066 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
3067 if ( is_guest_l1_slot(i) &&
3068 (l1e_get_flags(l1e[i]) & _PAGE_PRESENT) )
3069 l1e_add_flags(l1e[i], SHADOW_REFLECTS_SNAPSHOT);
3070 unmap_domain_page(l1e);
3073 if ( (smfn = __shadow_status(d, gpfn, PGT_l2_shadow)) )
3075 l2e = map_domain_page(smfn);
3076 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
3077 if ( is_guest_l2_slot(0, i) &&
3078 (l2e_get_flags(l2e[i]) & _PAGE_PRESENT) )
3079 l2e_add_flags(l2e[i], SHADOW_REFLECTS_SNAPSHOT);
3080 unmap_domain_page(l2e);
3084 // BUG: these are not SMP safe...
3085 static int sh_l2_present;
3086 static int sh_l1_present;
3087 static char *sh_check_name;
3088 int shadow_status_noswap;
3090 #define v2m(_v, _adr) ({ \
3091 unsigned long _a = (unsigned long)(_adr); \
3092 l2_pgentry_t _pde = shadow_linear_l2_table(_v)[l2_table_offset(_a)]; \
3093 unsigned long _pa = -1; \
3094 if ( l2e_get_flags(_pde) & _PAGE_PRESENT ) \
3095 { \
3096 l1_pgentry_t _pte; \
3097 _pte = shadow_linear_pg_table[l1_linear_offset(_a)]; \
3098 if ( l1e_get_flags(_pte) & _PAGE_PRESENT ) \
3099 _pa = l1e_get_paddr(_pte); \
3100 } \
3101 _pa | (_a & ~PAGE_MASK); \
3102 })
3104 #define FAIL(_f, _a...) \
3105 do { \
3106 printk("XXX %s-FAIL (%d,%d,%d) " _f " at %s(%d)\n", \
3107 sh_check_name, level, l2_idx, l1_idx, ## _a, \
3108 __FILE__, __LINE__); \
3109 printk("guest_pte=%" PRIpte " eff_guest_pte=%" PRIpte \
3110 " shadow_pte=%" PRIpte " snapshot_pte=%" PRIpte \
3111 " &guest=%p &shadow=%p &snap=%p v2m(&guest)=%p" \
3112 " v2m(&shadow)=%p v2m(&snap)=%p ea=%08x\n", \
3113 l1e_get_intpte(guest_pte), l1e_get_intpte(eff_guest_pte), \
3114 l1e_get_intpte(shadow_pte), l1e_get_intpte(snapshot_pte), \
3115 p_guest_pte, p_shadow_pte, p_snapshot_pte, \
3116 (void *)v2m(v, p_guest_pte), (void *)v2m(v, p_shadow_pte), \
3117 (void *)v2m(v, p_snapshot_pte), \
3118 (l2_idx << L2_PAGETABLE_SHIFT) | \
3119 (l1_idx << L1_PAGETABLE_SHIFT)); \
3120 errors++; \
3121 } while ( 0 )
3123 static int check_pte(
3124 struct vcpu *v,
3125 l1_pgentry_t *p_guest_pte,
3126 l1_pgentry_t *p_shadow_pte,
3127 l1_pgentry_t *p_snapshot_pte,
3128 int level, int l2_idx, int l1_idx)
3130 struct domain *d = v->domain;
3131 l1_pgentry_t guest_pte = *p_guest_pte;
3132 l1_pgentry_t shadow_pte = *p_shadow_pte;
3133 l1_pgentry_t snapshot_pte = p_snapshot_pte ? *p_snapshot_pte : l1e_empty();
3134 l1_pgentry_t eff_guest_pte = l1e_empty();
3135 unsigned long mask, eff_guest_pfn, eff_guest_mfn, shadow_mfn;
3136 int errors = 0, guest_writable;
3137 int page_table_page;
3139 if ( (l1e_get_intpte(shadow_pte) == 0) ||
3140 (l1e_get_intpte(shadow_pte) == 0xdeadface) ||
3141 (l1e_get_intpte(shadow_pte) == 0x00000E00) )
3142 return errors; /* always safe */
3144 if ( !(l1e_get_flags(shadow_pte) & _PAGE_PRESENT) )
3145 FAIL("Non zero not present shadow_pte");
3147 if ( level == 2 ) sh_l2_present++;
3148 if ( level == 1 ) sh_l1_present++;
3150 if ( (l1e_get_flags(shadow_pte) & SHADOW_REFLECTS_SNAPSHOT) && p_snapshot_pte )
3151 eff_guest_pte = snapshot_pte;
3152 else
3153 eff_guest_pte = guest_pte;
3155 if ( !(l1e_get_flags(eff_guest_pte) & _PAGE_PRESENT) )
3156 FAIL("Guest not present yet shadow is");
3158 mask = ~(_PAGE_GLOBAL|_PAGE_DIRTY|_PAGE_ACCESSED|_PAGE_RW|_PAGE_AVAIL|PAGE_MASK);
3160 if ( ((l1e_get_intpte(shadow_pte) & mask) != (l1e_get_intpte(eff_guest_pte) & mask)) )
3161 FAIL("Corrupt?");
3163 if ( (level == 1) &&
3164 (l1e_get_flags(shadow_pte) & _PAGE_DIRTY) &&
3165 !(l1e_get_flags(eff_guest_pte) & _PAGE_DIRTY) )
3166 FAIL("Dirty coherence");
3168 if ( (l1e_get_flags(shadow_pte) & _PAGE_ACCESSED) &&
3169 !(l1e_get_flags(eff_guest_pte) & _PAGE_ACCESSED) )
3170 FAIL("Accessed coherence");
3172 if ( l1e_get_flags(shadow_pte) & _PAGE_GLOBAL )
3173 FAIL("global bit set in shadow");
3175 eff_guest_pfn = l1e_get_pfn(eff_guest_pte);
3176 eff_guest_mfn = gmfn_to_mfn(d, eff_guest_pfn);
3177 shadow_mfn = l1e_get_pfn(shadow_pte);
3179 if ( !VALID_MFN(eff_guest_mfn) && !shadow_mode_refcounts(d) )
3180 FAIL("%s: invalid eff_guest_pfn=%lx eff_guest_pte=%" PRIpte "\n",
3181 __func__, eff_guest_pfn, l1e_get_intpte(eff_guest_pte));
3183 page_table_page = mfn_is_page_table(eff_guest_mfn);
3185 guest_writable =
3186 (l1e_get_flags(eff_guest_pte) & _PAGE_RW) ||
3187 (shadow_mode_write_l1(d) && (level == 1) && mfn_out_of_sync(eff_guest_mfn));
3189 if ( (l1e_get_flags(shadow_pte) & _PAGE_RW ) && !guest_writable )
3191 printk("eff_guest_pfn=%lx eff_guest_mfn=%lx shadow_mfn=%lx t=%lx page_table_page=%d\n",
3192 eff_guest_pfn, eff_guest_mfn, shadow_mfn,
3193 mfn_to_page(eff_guest_mfn)->u.inuse.type_info,
3194 page_table_page);
3195 FAIL("RW coherence");
3198 if ( (level == 1) &&
3199 (l1e_get_flags(shadow_pte) & _PAGE_RW ) &&
3200 !(guest_writable && (l1e_get_flags(eff_guest_pte) & _PAGE_DIRTY)) )
3202 printk("eff_guest_pfn=%lx eff_guest_mfn=%lx shadow_mfn=%lx t=%lx page_table_page=%d\n",
3203 eff_guest_pfn, eff_guest_mfn, shadow_mfn,
3204 mfn_to_page(eff_guest_mfn)->u.inuse.type_info,
3205 page_table_page);
3206 FAIL("RW2 coherence");
3209 if ( eff_guest_mfn == shadow_mfn )
3211 if ( level > 1 )
3212 FAIL("Linear map ???"); /* XXX this will fail on BSD */
3214 else
3216 if ( level < 2 )
3217 FAIL("Shadow in L1 entry?");
3219 if ( level == 2 )
3221 if ( __shadow_status(d, eff_guest_pfn, PGT_l1_shadow) != shadow_mfn )
3222 FAIL("shadow_mfn problem eff_guest_pfn=%lx shadow_mfn=%lx", eff_guest_pfn,
3223 __shadow_status(d, eff_guest_pfn, PGT_l1_shadow));
3225 else
3226 BUG(); // XXX -- not handled yet.
3229 return errors;
3231 #undef FAIL
3232 #undef v2m
3234 static int check_l1_table(
3235 struct vcpu *v, unsigned long gpfn,
3236 unsigned long gmfn, unsigned long smfn, unsigned l2_idx)
3238 struct domain *d = v->domain;
3239 int i;
3240 unsigned long snapshot_mfn;
3241 l1_pgentry_t *p_guest, *p_shadow, *p_snapshot = NULL;
3242 int errors = 0;
3244 if ( page_out_of_sync(mfn_to_page(gmfn)) )
3246 snapshot_mfn = __shadow_status(d, gpfn, PGT_snapshot);
3247 ASSERT(snapshot_mfn);
3248 p_snapshot = map_domain_page(snapshot_mfn);
3251 p_guest = map_domain_page(gmfn);
3252 p_shadow = map_domain_page(smfn);
3254 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
3255 errors += check_pte(v, p_guest+i, p_shadow+i,
3256 p_snapshot ? p_snapshot+i : NULL,
3257 1, l2_idx, i);
3259 unmap_domain_page(p_shadow);
3260 unmap_domain_page(p_guest);
3261 if ( p_snapshot )
3262 unmap_domain_page(p_snapshot);
3264 return errors;
3267 #define FAILPT(_f, _a...) \
3268 do { \
3269 printk("XXX FAIL %s-PT " _f "\n", sh_check_name, ## _a ); \
3270 errors++; \
3271 } while ( 0 )
3273 int check_l2_table(
3274 struct vcpu *v, unsigned long gmfn, unsigned long smfn, int oos_pdes)
3276 struct domain *d = v->domain;
3277 l2_pgentry_t *gpl2e = (l2_pgentry_t *)map_domain_page(gmfn);
3278 l2_pgentry_t *spl2e = (l2_pgentry_t *)map_domain_page(smfn);
3279 l2_pgentry_t match;
3280 int i;
3281 int errors = 0;
3282 int limit;
3284 if ( !oos_pdes && (page_get_owner(mfn_to_page(gmfn)) != d) )
3285 FAILPT("domain doesn't own page");
3286 if ( oos_pdes && (page_get_owner(mfn_to_page(gmfn)) != NULL) )
3287 FAILPT("bogus owner for snapshot page");
3288 if ( page_get_owner(mfn_to_page(smfn)) != NULL )
3289 FAILPT("shadow page mfn=0x%lx is owned by someone, domid=%d",
3290 smfn, page_get_owner(mfn_to_page(smfn))->domain_id);
3292 #if 0
3293 if ( memcmp(&spl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
3294 &gpl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
3295 ((SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT) -
3296 DOMAIN_ENTRIES_PER_L2_PAGETABLE) * sizeof(l2_pgentry_t)) )
3298 for ( i = DOMAIN_ENTRIES_PER_L2_PAGETABLE;
3299 i < (SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT);
3300 i++ )
3301 printk("+++ (%d) %lx %lx\n",i,
3302 l2_pgentry_val(gpl2e[i]), l2_pgentry_val(spl2e[i]));
3303 FAILPT("hypervisor entries inconsistent");
3306 if ( (l2_pgentry_val(spl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT]) !=
3307 l2_pgentry_val(gpl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT])) )
3308 FAILPT("hypervisor linear map inconsistent");
3309 #endif
3311 match = l2e_from_pfn(smfn, __PAGE_HYPERVISOR);
3312 if ( !shadow_mode_external(d) &&
3313 l2e_has_changed(spl2e[SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT],
3314 match, PAGE_FLAG_MASK))
3316 FAILPT("hypervisor shadow linear map inconsistent %" PRIpte " %" PRIpte,
3317 l2e_get_intpte(spl2e[SH_LINEAR_PT_VIRT_START >>
3318 L2_PAGETABLE_SHIFT]),
3319 l2e_get_intpte(match));
3322 match = l2e_from_paddr(__pa(d->arch.mm_perdomain_pt), __PAGE_HYPERVISOR);
3323 if ( !shadow_mode_external(d) &&
3324 l2e_has_changed(spl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT],
3325 match, PAGE_FLAG_MASK))
3327 FAILPT("hypervisor per-domain map inconsistent saw %" PRIpte ", expected (va=%p) %" PRIpte,
3328 l2e_get_intpte(spl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT]),
3329 d->arch.mm_perdomain_pt,
3330 l2e_get_intpte(match));
3333 if ( shadow_mode_external(d) )
3334 limit = L2_PAGETABLE_ENTRIES;
3335 else
3336 limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE;
3338 /* Check the whole L2. */
3339 for ( i = 0; i < limit; i++ )
3340 errors += check_pte(v,
3341 (l1_pgentry_t*)(&gpl2e[i]), /* Hmm, dirty ... */
3342 (l1_pgentry_t*)(&spl2e[i]),
3343 NULL,
3344 2, i, 0);
3346 unmap_domain_page(spl2e);
3347 unmap_domain_page(gpl2e);
3349 #if 1
3350 if ( errors )
3351 printk("check_l2_table returning %d errors\n", errors);
3352 #endif
3354 return errors;
3356 #undef FAILPT
3358 int _check_pagetable(struct vcpu *v, char *s)
3360 struct domain *d = v->domain;
3361 pagetable_t pt = v->arch.guest_table;
3362 unsigned long gptbase = pagetable_get_paddr(pt);
3363 unsigned long ptbase_pfn, smfn;
3364 unsigned long i;
3365 l2_pgentry_t *gpl2e, *spl2e;
3366 unsigned long ptbase_mfn = 0;
3367 int errors = 0, limit, oos_pdes = 0;
3369 //_audit_domain(d, AUDIT_QUIET);
3370 shadow_lock(d);
3372 sh_check_name = s;
3373 //SH_VVLOG("%s-PT Audit", s);
3374 sh_l2_present = sh_l1_present = 0;
3375 perfc_incrc(check_pagetable);
3377 ptbase_mfn = gptbase >> PAGE_SHIFT;
3378 ptbase_pfn = mfn_to_gmfn(d, ptbase_mfn);
3380 if ( !(smfn = __shadow_status(d, ptbase_pfn, PGT_base_page_table)) )
3382 printk("%s-PT %lx not shadowed\n", s, gptbase);
3383 goto out;
3385 if ( page_out_of_sync(mfn_to_page(ptbase_mfn)) )
3387 ptbase_mfn = __shadow_status(d, ptbase_pfn, PGT_snapshot);
3388 oos_pdes = 1;
3389 ASSERT(ptbase_mfn);
3392 errors += check_l2_table(v, ptbase_mfn, smfn, oos_pdes);
3394 gpl2e = (l2_pgentry_t *) map_domain_page(ptbase_mfn);
3395 spl2e = (l2_pgentry_t *) map_domain_page(smfn);
3397 /* Go back and recurse. */
3398 if ( shadow_mode_external(d) )
3399 limit = L2_PAGETABLE_ENTRIES;
3400 else
3401 limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE;
3403 for ( i = 0; i < limit; i++ )
3405 unsigned long gl1pfn = l2e_get_pfn(gpl2e[i]);
3406 unsigned long gl1mfn = gmfn_to_mfn(d, gl1pfn);
3407 unsigned long sl1mfn = l2e_get_pfn(spl2e[i]);
3409 if ( l2e_get_intpte(spl2e[i]) != 0 ) /* FIXME: check flags? */
3411 errors += check_l1_table(v, gl1pfn, gl1mfn, sl1mfn, i);
3415 unmap_domain_page(spl2e);
3416 unmap_domain_page(gpl2e);
3418 out:
3419 if ( errors )
3420 BUG();
3422 shadow_unlock(d);
3424 return errors;
3427 int _check_all_pagetables(struct vcpu *v, char *s)
3429 struct domain *d = v->domain;
3430 int i;
3431 struct shadow_status *a;
3432 unsigned long gmfn;
3433 int errors = 0;
3435 shadow_status_noswap = 1;
3437 sh_check_name = s;
3438 SH_VVLOG("%s-PT Audit domid=%d", s, d->domain_id);
3439 sh_l2_present = sh_l1_present = 0;
3440 perfc_incrc(check_all_pagetables);
3442 for (i = 0; i < shadow_ht_buckets; i++)
3444 a = &d->arch.shadow_ht[i];
3445 while ( a && a->gpfn_and_flags )
3447 gmfn = gmfn_to_mfn(d, a->gpfn_and_flags & PGT_mfn_mask);
3449 switch ( a->gpfn_and_flags & PGT_type_mask )
3451 case PGT_l1_shadow:
3452 errors += check_l1_table(v, a->gpfn_and_flags & PGT_mfn_mask,
3453 gmfn, a->smfn, 0);
3454 break;
3455 case PGT_l2_shadow:
3456 errors += check_l2_table(v, gmfn, a->smfn,
3457 page_out_of_sync(mfn_to_page(gmfn)));
3458 break;
3459 case PGT_l3_shadow:
3460 case PGT_l4_shadow:
3461 case PGT_hl2_shadow:
3462 BUG(); // XXX - ought to fix this...
3463 break;
3464 case PGT_snapshot:
3465 case PGT_writable_pred:
3466 break;
3467 default:
3468 errors++;
3469 printk("unexpected shadow type %lx, gpfn=%lx, "
3470 "gmfn=%lx smfn=%lx\n",
3471 a->gpfn_and_flags & PGT_type_mask,
3472 a->gpfn_and_flags & PGT_mfn_mask,
3473 gmfn, a->smfn);
3474 BUG();
3476 a = a->next;
3480 shadow_status_noswap = 0;
3482 if ( errors )
3483 BUG();
3485 return errors;
3488 #endif // SHADOW_DEBUG
3490 /*
3491 * Local variables:
3492 * mode: C
3493 * c-set-style: "BSD"
3494 * c-basic-offset: 4
3495 * tab-width: 4
3496 * indent-tabs-mode: nil
3497 * End:
3498 */