xen/arch/x86/shadow32.c @ 8974:0349fb4de335 (ia64/xen-unstable)

Clean up some vmx code.

Signed-off-by: Xin Li <xin.b.li@intel.com>
Author:   kaf24@firebug.cl.cam.ac.uk
Date:     Thu Feb 23 11:34:11 2006 +0100 (2006-02-23)
Parents:  8fb4392c1d87
Children: 71f2d19cd3a5
1 /******************************************************************************
2 * arch/x86/shadow32.c
3 *
4 * Copyright (c) 2005 Michael A Fetterman
5 * Based on an earlier implementation by Ian Pratt et al
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
23 #include <xen/config.h>
24 #include <xen/types.h>
25 #include <xen/mm.h>
26 #include <xen/domain_page.h>
27 #include <asm/shadow.h>
28 #include <asm/page.h>
29 #include <xen/event.h>
30 #include <xen/sched.h>
31 #include <xen/trace.h>
33 #define MFN_PINNED(_x) (mfn_to_page(_x)->u.inuse.type_info & PGT_pinned)
34 #define va_to_l1mfn(_ed, _va) \
35 (l2e_get_pfn(linear_l2_table(_ed)[_va>>L2_PAGETABLE_SHIFT]))
37 static void shadow_free_snapshot(struct domain *d,
38 struct out_of_sync_entry *entry);
39 static void remove_out_of_sync_entries(struct domain *d, unsigned long smfn);
40 static void free_writable_pte_predictions(struct domain *d);
42 #if SHADOW_DEBUG
43 static void mark_shadows_as_reflecting_snapshot(struct domain *d, unsigned long gpfn);
44 #endif
46 static void free_p2m_table(struct vcpu *v);
48 /********
50 There's a per-domain shadow table spin lock which works fine for SMP
51 hosts. We don't have to worry about interrupts as no shadow operations
52 happen in an interrupt context. It's probably not quite ready for SMP
53 guest operation as we have to worry about synchronisation between gpte
54 and spte updates. It's possible that this might only happen in a
55 hypercall context, in which case we'll probably have a per-domain
56 hypercall lock anyhow (at least initially).
58 ********/
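/*
 * shadow_promote():
 * Prepare guest frame gmfn (guest pfn gpfn) for use as a page table of
 * type new_type.  An out-of-sync frame is synced first.  For domains in
 * refcounted shadow modes we must also find and remove all writable
 * mappings of the frame and mark it with PGC_page_table; the transient
 * get_page_type(PGT_base_page_table) below only forces the writable
 * count to drop and triggers any required TLB flushes.
 */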
60 static inline int
61 shadow_promote(struct domain *d, unsigned long gpfn, unsigned long gmfn,
62 unsigned long new_type)
63 {
64 struct page_info *page = mfn_to_page(gmfn);
65 int pinned = 0, okay = 1;
67 if ( page_out_of_sync(page) )
68 {
69 // Don't know how long ago this snapshot was taken.
70 // Can't trust it to be recent enough.
71 //
72 __shadow_sync_mfn(d, gmfn);
73 }
75 if ( !shadow_mode_refcounts(d) )
76 return 1;
78 if ( unlikely(page_is_page_table(page)) )
79 return 1;
81 FSH_LOG("%s: gpfn=%lx gmfn=%lx nt=%08lx", __func__, gpfn, gmfn, new_type);
83 if ( !shadow_remove_all_write_access(d, gpfn, gmfn) )
84 {
85 FSH_LOG("%s: couldn't find/remove all write accesses, gpfn=%lx gmfn=%lx",
86 __func__, gpfn, gmfn);
87 #if 1 || defined(LIVE_DANGEROUSLY)
88 set_bit(_PGC_page_table, &page->count_info);
89 return 1;
90 #endif
91 return 0;
93 }
95 // To convert this page to use as a page table, the writable count
96 should now be zero. Test this by grabbing the page as a page table,
97 // and then immediately releasing. This will also deal with any
98 // necessary TLB flushing issues for us.
99 //
100 // The cruft here about pinning doesn't really work right. This
101 // needs rethinking/rewriting... Need to gracefully deal with the
102 // TLB flushes required when promoting a writable page, and also deal
103 // with any outstanding (external) writable refs to this page (by
104 // refusing to promote it). The pinning headache complicates this
105 // code -- it would all get much simpler if we stop using
106 // shadow_lock() and move the shadow code to BIGLOCK().
107 //
108 if ( unlikely(!get_page(page, d)) )
109 BUG(); // XXX -- needs more thought for a graceful failure
110 if ( unlikely(test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info)) )
111 {
112 pinned = 1;
113 put_page_and_type(page);
114 }
115 if ( get_page_type(page, PGT_base_page_table) )
116 {
117 set_bit(_PGC_page_table, &page->count_info);
118 put_page_type(page);
119 }
120 else
121 {
122 printk("shadow_promote: get_page_type failed "
123 "dom%d gpfn=%lx gmfn=%lx t=%08lx\n",
124 d->domain_id, gpfn, gmfn, new_type);
125 okay = 0;
126 }
128 // Now put the type back to writable...
129 if ( unlikely(!get_page_type(page, PGT_writable_page)) )
130 BUG(); // XXX -- needs more thought for a graceful failure
131 if ( unlikely(pinned) )
132 {
133 if ( unlikely(test_and_set_bit(_PGT_pinned,
134 &page->u.inuse.type_info)) )
135 BUG(); // hmm... someone pinned this again?
136 }
137 else
138 put_page_and_type(page);
140 return okay;
141 }
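/*
 * shadow_demote():
 * Called when a shadow of gmfn goes away.  Once no page-table shadows of
 * the frame remain, clear its PGC_page_table marker and discard any
 * out-of-sync entries that still reference it.
 */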
143 static inline void
144 shadow_demote(struct domain *d, unsigned long gpfn, unsigned long gmfn)
145 {
146 if ( !shadow_mode_refcounts(d) )
147 return;
149 ASSERT(mfn_to_page(gmfn)->count_info & PGC_page_table);
151 if ( shadow_max_pgtable_type(d, gpfn, NULL) == PGT_none )
152 {
153 clear_bit(_PGC_page_table, &mfn_to_page(gmfn)->count_info);
155 if ( page_out_of_sync(mfn_to_page(gmfn)) )
156 {
157 remove_out_of_sync_entries(d, gmfn);
158 }
159 }
160 }
162 /*
163 * Things in shadow mode that collect get_page() refs to the domain's
164 * pages are:
165 * - PGC_allocated takes a gen count, just like normal.
166 * - A writable page can be pinned (paravirtualized guests may consider
167 * these pages to be L1s or L2s, and don't know the difference).
168 * Pinning a page takes a gen count (but, for domains in shadow mode,
169 * it *doesn't* take a type count)
170 * - CR3 grabs a ref to whatever it points at, just like normal.
171 * - Shadow mode grabs an initial gen count for itself, as a placeholder
172 * for whatever references will exist.
173 * - Shadow PTEs that point to a page take a gen count, just like regular
174 * PTEs. However, they don't get a type count, as get_page_type() is
175 * hardwired to keep writable pages' counts at 1 for domains in shadow
176 * mode.
177 * - Whenever we shadow a page, the entry in the shadow hash grabs a
178 * general ref to the page.
179 * - Whenever a page goes out of sync, the out of sync entry grabs a
180 * general ref to the page.
181 */
182 /*
183 * page_info fields for pages allocated as shadow pages:
184 *
185 * All 32 bits of count_info are a simple count of refs to this shadow
186 * from a) other shadow pages, b) current CR3's (aka ed->arch.shadow_table),
187 * c) if it's a pinned shadow root pgtable, d) outstanding out-of-sync
188 * references.
189 *
190 * u.inuse._domain is left NULL, to prevent accidentally allowing some random
191 * domain from gaining permissions to map this page.
192 *
193 * u.inuse.type_info & PGT_type_mask remembers what kind of page is being
194 * shadowed.
195 * u.inuse.type_info & PGT_mfn_mask holds the mfn of the page being shadowed.
196 * u.inuse.type_info & PGT_pinned says that an extra reference to this shadow
197 * currently exists because this is a shadow of a root page, and we
198 * don't want to let those disappear just because no CR3 is currently pointing
199 * at it.
200 *
201 * tlbflush_timestamp holds a min & max index of valid page table entries
202 * within the shadow page.
203 */
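/*
 * alloc_shadow_page():
 * Allocate a frame for a new shadow of <gpfn,gmfn> of the given type.
 * Pre-zeroed frames on d->arch.free_shadow_frames are reused for L1
 * shadows.  The frame's type_info is stamped with psh_type | gmfn, the
 * guest frame is promoted (an hl2 promotes as an L1), shadows of root
 * tables may also be pinned, and the new shadow is entered into the
 * status hash via set_shadow_status().
 */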
205 static inline unsigned long
206 alloc_shadow_page(struct domain *d,
207 unsigned long gpfn, unsigned long gmfn,
208 u32 psh_type)
209 {
210 struct page_info *page;
211 unsigned long smfn;
212 int pin = 0;
213 void *l1;
215 // Currently, we only keep pre-zero'ed pages around for use as L1's...
216 // This will change. Soon.
217 //
218 if ( psh_type == PGT_l1_shadow )
219 {
220 if ( !list_empty(&d->arch.free_shadow_frames) )
221 {
222 struct list_head *entry = d->arch.free_shadow_frames.next;
223 page = list_entry(entry, struct page_info, list);
224 list_del(entry);
225 perfc_decr(free_l1_pages);
226 }
227 else
228 {
229 page = alloc_domheap_page(NULL);
230 l1 = map_domain_page(page_to_mfn(page));
231 memset(l1, 0, PAGE_SIZE);
232 unmap_domain_page(l1);
233 }
234 }
235 else
236 page = alloc_domheap_page(NULL);
238 if ( unlikely(page == NULL) )
239 {
240 printk("Couldn't alloc shadow page! dom%d count=%d\n",
241 d->domain_id, d->arch.shadow_page_count);
242 printk("Shadow table counts: l1=%d l2=%d hl2=%d snapshot=%d\n",
243 perfc_value(shadow_l1_pages),
244 perfc_value(shadow_l2_pages),
245 perfc_value(hl2_table_pages),
246 perfc_value(snapshot_pages));
247 BUG(); /* XXX FIXME: try a shadow flush to free up some memory. */
248 }
250 smfn = page_to_mfn(page);
252 ASSERT( (gmfn & ~PGT_mfn_mask) == 0 );
253 page->u.inuse.type_info = psh_type | gmfn;
254 page->count_info = 0;
255 page->tlbflush_timestamp = 0;
257 switch ( psh_type )
258 {
259 case PGT_l1_shadow:
260 if ( !shadow_promote(d, gpfn, gmfn, psh_type) )
261 goto fail;
262 perfc_incr(shadow_l1_pages);
263 d->arch.shadow_page_count++;
264 break;
266 case PGT_l2_shadow:
267 if ( !shadow_promote(d, gpfn, gmfn, psh_type) )
268 goto fail;
269 perfc_incr(shadow_l2_pages);
270 d->arch.shadow_page_count++;
271 if ( PGT_l2_page_table == PGT_root_page_table )
272 pin = 1;
274 break;
276 case PGT_hl2_shadow:
277 // Treat an hl2 as an L1 for purposes of promotion.
278 // For external mode domains, treat them as an L2 for purposes of
279 // pinning.
280 //
281 if ( !shadow_promote(d, gpfn, gmfn, PGT_l1_shadow) )
282 goto fail;
283 perfc_incr(hl2_table_pages);
284 d->arch.hl2_page_count++;
285 if ( shadow_mode_external(d) &&
286 (PGT_l2_page_table == PGT_root_page_table) )
287 pin = 1;
289 break;
291 case PGT_snapshot:
292 perfc_incr(snapshot_pages);
293 d->arch.snapshot_page_count++;
294 break;
296 default:
297 printk("Alloc shadow weird page type type=%08x\n", psh_type);
298 BUG();
299 break;
300 }
302 // Don't add a new shadow of something that already has a snapshot.
303 //
304 ASSERT( (psh_type == PGT_snapshot) || !mfn_out_of_sync(gmfn) );
306 set_shadow_status(d, gpfn, gmfn, smfn, psh_type);
308 if ( pin )
309 shadow_pin(smfn);
311 return smfn;
313 fail:
314 FSH_LOG("promotion of pfn=%lx mfn=%lx failed! external gnttab refs?",
315 gpfn, gmfn);
316 free_domheap_page(page);
317 return 0;
318 }
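/*
 * free_shadow_l1_table():
 * Release the references held by a shadow L1.  The min/max index range
 * packed into the shadow frame's tlbflush_timestamp bounds the entries
 * that were ever installed, so only that range is scanned and cleared.
 */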
320 static void inline
321 free_shadow_l1_table(struct domain *d, unsigned long smfn)
322 {
323 l1_pgentry_t *pl1e = map_domain_page(smfn);
324 int i;
325 struct page_info *spage = mfn_to_page(smfn);
326 u32 min_max = spage->tlbflush_timestamp;
327 int min = SHADOW_MIN(min_max);
328 int max = SHADOW_MAX(min_max);
330 for ( i = min; i <= max; i++ )
331 {
332 shadow_put_page_from_l1e(pl1e[i], d);
333 pl1e[i] = l1e_empty();
334 }
336 unmap_domain_page(pl1e);
337 }
339 static void inline
340 free_shadow_hl2_table(struct domain *d, unsigned long smfn)
341 {
342 l1_pgentry_t *hl2 = map_domain_page(smfn);
343 int i, limit;
345 SH_VVLOG("%s: smfn=%lx freed", __func__, smfn);
347 if ( shadow_mode_external(d) )
348 limit = L2_PAGETABLE_ENTRIES;
349 else
350 limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE;
352 for ( i = 0; i < limit; i++ )
353 {
354 if ( l1e_get_flags(hl2[i]) & _PAGE_PRESENT )
355 put_page(mfn_to_page(l1e_get_pfn(hl2[i])));
356 }
358 unmap_domain_page(hl2);
359 }
361 static void inline
362 free_shadow_l2_table(struct domain *d, unsigned long smfn, unsigned int type)
363 {
364 l2_pgentry_t *pl2e = map_domain_page(smfn);
365 int i, external = shadow_mode_external(d);
367 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
368 if ( external || is_guest_l2_slot(type, i) )
369 if ( l2e_get_flags(pl2e[i]) & _PAGE_PRESENT )
370 put_shadow_ref(l2e_get_pfn(pl2e[i]));
372 if ( (PGT_base_page_table == PGT_l2_page_table) &&
373 shadow_mode_translate(d) && !external )
374 {
375 // free the ref to the hl2
376 //
377 put_shadow_ref(l2e_get_pfn(pl2e[l2_table_offset(LINEAR_PT_VIRT_START)]));
378 }
380 unmap_domain_page(pl2e);
381 }
383 void free_shadow_page(unsigned long smfn)
384 {
385 struct page_info *page = mfn_to_page(smfn);
386 unsigned long gmfn = page->u.inuse.type_info & PGT_mfn_mask;
387 struct domain *d = page_get_owner(mfn_to_page(gmfn));
388 unsigned long gpfn = mfn_to_gmfn(d, gmfn);
389 unsigned long type = page->u.inuse.type_info & PGT_type_mask;
391 SH_VVLOG("%s: free'ing smfn=%lx", __func__, smfn);
393 ASSERT( ! IS_INVALID_M2P_ENTRY(gpfn) );
395 delete_shadow_status(d, gpfn, gmfn, type);
397 switch ( type )
398 {
399 case PGT_l1_shadow:
400 perfc_decr(shadow_l1_pages);
401 shadow_demote(d, gpfn, gmfn);
402 free_shadow_l1_table(d, smfn);
403 d->arch.shadow_page_count--;
404 break;
406 case PGT_l2_shadow:
407 perfc_decr(shadow_l2_pages);
408 shadow_demote(d, gpfn, gmfn);
409 free_shadow_l2_table(d, smfn, page->u.inuse.type_info);
410 d->arch.shadow_page_count--;
411 break;
413 case PGT_hl2_shadow:
414 perfc_decr(hl2_table_pages);
415 shadow_demote(d, gpfn, gmfn);
416 free_shadow_hl2_table(d, smfn);
417 d->arch.hl2_page_count--;
418 break;
420 case PGT_snapshot:
421 perfc_decr(snapshot_pages);
422 d->arch.snapshot_page_count--;
423 break;
425 default:
426 printk("Free shadow weird page type mfn=%lx type=%" PRtype_info "\n",
427 page_to_mfn(page), page->u.inuse.type_info);
428 break;
429 }
431 // No TLB flushes are needed the next time this page gets allocated.
432 //
433 page->tlbflush_timestamp = 0;
434 page->u.free.cpumask = CPU_MASK_NONE;
436 if ( type == PGT_l1_shadow )
437 {
438 list_add(&page->list, &d->arch.free_shadow_frames);
439 perfc_incr(free_l1_pages);
440 }
441 else
442 free_domheap_page(page);
443 }
445 void
446 remove_shadow(struct domain *d, unsigned long gpfn, u32 stype)
447 {
448 unsigned long smfn;
450 //printk("%s(gpfn=%lx, type=%x)\n", __func__, gpfn, stype);
452 shadow_lock(d);
454 while ( stype >= PGT_l1_shadow )
455 {
456 smfn = __shadow_status(d, gpfn, stype);
457 if ( smfn && MFN_PINNED(smfn) )
458 shadow_unpin(smfn);
459 stype -= PGT_l1_shadow;
460 }
462 shadow_unlock(d);
463 }
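/*
 * release_out_of_sync_entry():
 * Drop the references held by one out-of-sync entry: the general ref on
 * the guest frame, the shadow ref taken through writable_pl1e (only when
 * that field holds a properly aligned machine address), and finally the
 * snapshot itself.
 */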
465 static void inline
466 release_out_of_sync_entry(struct domain *d, struct out_of_sync_entry *entry)
467 {
468 struct page_info *page;
470 page = mfn_to_page(entry->gmfn);
472 // Decrement ref count of guest & shadow pages
473 //
474 put_page(page);
476 // Only use entries that have low bits clear...
477 //
478 if ( !(entry->writable_pl1e & (sizeof(l1_pgentry_t)-1)) )
479 {
480 put_shadow_ref(entry->writable_pl1e >> PAGE_SHIFT);
481 entry->writable_pl1e = -2;
482 }
483 else
484 ASSERT( entry->writable_pl1e == -1 );
486 // Free the snapshot
487 //
488 shadow_free_snapshot(d, entry);
489 }
491 static void remove_out_of_sync_entries(struct domain *d, unsigned long gmfn)
492 {
493 struct out_of_sync_entry *entry = d->arch.out_of_sync;
494 struct out_of_sync_entry **prev = &d->arch.out_of_sync;
495 struct out_of_sync_entry *found = NULL;
497 // NB: Be careful not to call something that manipulates this list
498 // while walking it. Collect the results into a separate list
499 // first, then walk that list.
500 //
501 while ( entry )
502 {
503 if ( entry->gmfn == gmfn )
504 {
505 // remove from out of sync list
506 *prev = entry->next;
508 // add to found list
509 entry->next = found;
510 found = entry;
512 entry = *prev;
513 continue;
514 }
515 prev = &entry->next;
516 entry = entry->next;
517 }
519 prev = NULL;
520 entry = found;
521 while ( entry )
522 {
523 release_out_of_sync_entry(d, entry);
525 prev = &entry->next;
526 entry = entry->next;
527 }
529 // Add found list to free list
530 if ( prev )
531 {
532 *prev = d->arch.out_of_sync_free;
533 d->arch.out_of_sync_free = found;
534 }
535 }
537 static void free_out_of_sync_state(struct domain *d)
538 {
539 struct out_of_sync_entry *entry;
541 // NB: Be careful not to call something that manipulates this list
542 // while walking it. Remove one item at a time, and always
543 // restart from start of list.
544 //
545 while ( (entry = d->arch.out_of_sync) )
546 {
547 d->arch.out_of_sync = entry->next;
548 release_out_of_sync_entry(d, entry);
550 entry->next = d->arch.out_of_sync_free;
551 d->arch.out_of_sync_free = entry;
552 }
553 }
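/*
 * free_shadow_pages():
 * Drop every reference the shadow code holds for this domain so that all
 * shadow frames can be released: out-of-sync state first, then each
 * VCPU's shadow_table/monitor references, and finally the pinned shadows
 * found in the hash table.  The caller is expected to have paused the
 * domain (see the warning below).
 */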
555 static void free_shadow_pages(struct domain *d)
556 {
557 int i;
558 struct shadow_status *x;
559 struct vcpu *v;
560 struct list_head *list_ent, *tmp;
562 /*
563 * WARNING! The shadow page table must not currently be in use!
564 * e.g., You are expected to have paused the domain and synchronized CR3.
565 */
567 if( !d->arch.shadow_ht ) return;
569 shadow_audit(d, 1);
571 // first, remove any outstanding refs from out_of_sync entries...
572 //
573 free_out_of_sync_state(d);
575 // second, remove any outstanding refs from v->arch.shadow_table
576 // and CR3.
577 //
578 for_each_vcpu(d, v)
579 {
580 if ( pagetable_get_paddr(v->arch.shadow_table) )
581 {
582 put_shadow_ref(pagetable_get_pfn(v->arch.shadow_table));
583 v->arch.shadow_table = mk_pagetable(0);
584 }
586 if ( v->arch.monitor_shadow_ref )
587 {
588 put_shadow_ref(v->arch.monitor_shadow_ref);
589 v->arch.monitor_shadow_ref = 0;
590 }
591 }
593 // For external shadows, remove the monitor table's refs
594 //
595 if ( shadow_mode_external(d) )
596 {
597 for_each_vcpu(d, v)
598 {
599 l2_pgentry_t *mpl2e = v->arch.monitor_vtable;
601 if ( mpl2e )
602 {
603 l2_pgentry_t hl2e = mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)];
604 l2_pgentry_t smfn = mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)];
606 if ( l2e_get_flags(hl2e) & _PAGE_PRESENT )
607 {
608 put_shadow_ref(l2e_get_pfn(hl2e));
609 mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)] = l2e_empty();
610 }
611 if ( l2e_get_flags(smfn) & _PAGE_PRESENT )
612 {
613 put_shadow_ref(l2e_get_pfn(smfn));
614 mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] = l2e_empty();
615 }
616 }
617 }
618 }
620 // Now, the only refs to shadow pages that are left are from the shadow
621 // pages themselves. We just unpin the pinned pages, and the rest
622 // should automatically disappear.
623 //
624 // NB: Beware: each explicitly or implicit call to free_shadow_page
625 // can/will result in the hash bucket getting rewritten out from
626 // under us... First, collect the list of pinned pages, then
627 // free them.
628 //
629 // FIXME: it would be good to just free all the pages referred to in
630 // the hash table without going through each of them to decrement their
631 // reference counts. In shadow_mode_refcount(), we've gotta do the hard
632 // work, but only for L1 shadows. If we're not in refcount mode, then
633 // there's no real hard work to do at all. Need to be careful with the
634 // writable_pte_predictions and snapshot entries in the hash table, but
635 // that's about it.
636 //
637 for ( i = 0; i < shadow_ht_buckets; i++ )
638 {
639 u32 count;
640 unsigned long *mfn_list;
642 /* Skip empty buckets. */
643 if ( d->arch.shadow_ht[i].gpfn_and_flags == 0 )
644 continue;
646 count = 0;
648 for ( x = &d->arch.shadow_ht[i]; x != NULL; x = x->next ) {
649 /* Skip entries that are writable_pred or snapshot. */
650 switch(x->gpfn_and_flags & PGT_type_mask){
651 case PGT_l1_shadow:
652 case PGT_l2_shadow:
653 case PGT_l3_shadow:
654 case PGT_l4_shadow:
655 case PGT_hl2_shadow:
656 if ( MFN_PINNED(x->smfn) )
657 count++;
658 break;
659 case PGT_snapshot:
660 case PGT_writable_pred:
661 break;
662 default:
663 BUG();
665 }
666 }
668 if ( !count )
669 continue;
671 mfn_list = xmalloc_array(unsigned long, count);
672 count = 0;
673 for ( x = &d->arch.shadow_ht[i]; x != NULL; x = x->next ) {
674 /* Skip entries that are writable_pred or snapshot. */
675 switch(x->gpfn_and_flags & PGT_type_mask){
676 case PGT_l1_shadow:
677 case PGT_l2_shadow:
678 case PGT_l3_shadow:
679 case PGT_l4_shadow:
680 case PGT_hl2_shadow:
681 if ( MFN_PINNED(x->smfn) )
682 mfn_list[count++] = x->smfn;
683 break;
684 case PGT_snapshot:
685 case PGT_writable_pred:
686 break;
687 default:
688 BUG();
690 }
691 }
693 while ( count )
694 {
695 shadow_unpin(mfn_list[--count]);
696 }
697 xfree(mfn_list);
698 }
700 /* Now free the pre-zero'ed pages from the domain */
701 list_for_each_safe(list_ent, tmp, &d->arch.free_shadow_frames)
702 {
703 struct page_info *page = list_entry(list_ent, struct page_info, list);
705 list_del(list_ent);
706 perfc_decr(free_l1_pages);
708 free_domheap_page(page);
709 }
711 shadow_audit(d, 0);
713 SH_VLOG("Free shadow table.");
714 }
716 void shadow_mode_init(void)
717 {
718 }
720 int _shadow_mode_refcounts(struct domain *d)
721 {
722 return shadow_mode_refcounts(d);
723 }
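/*
 * alloc_monitor_pagetable():
 * Build the per-VCPU monitor L2 used for external shadow modes (called
 * from __shadow_mode_enable() when SHM_external is set): copy the
 * hypervisor entries from idle_pg_table, map the per-domain page tables,
 * and leave the linear, shadow-linear and RO MPT slots empty until the
 * corresponding tables exist.  VCPU 0 also allocates the p2m table here.
 */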
725 static void alloc_monitor_pagetable(struct vcpu *v)
726 {
727 unsigned long mmfn;
728 l2_pgentry_t *mpl2e;
729 struct page_info *mmfn_info;
730 struct domain *d = v->domain;
731 int i;
733 ASSERT(pagetable_get_paddr(v->arch.monitor_table) == 0);
735 mmfn_info = alloc_domheap_page(NULL);
736 ASSERT(mmfn_info != NULL);
738 mmfn = page_to_mfn(mmfn_info);
739 mpl2e = (l2_pgentry_t *)map_domain_page_global(mmfn);
740 memset(mpl2e, 0, PAGE_SIZE);
742 memcpy(&mpl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
743 &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
744 HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
746 for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
747 mpl2e[l2_table_offset(PERDOMAIN_VIRT_START) + i] =
748 l2e_from_page(virt_to_page(d->arch.mm_perdomain_pt) + i,
749 __PAGE_HYPERVISOR);
751 // Don't (yet) have mappings for these...
752 // Don't want to accidentally see the idle_pg_table's linear mapping.
753 //
754 mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)] = l2e_empty();
755 mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] = l2e_empty();
756 mpl2e[l2_table_offset(RO_MPT_VIRT_START)] = l2e_empty();
758 v->arch.monitor_table = mk_pagetable(mmfn << PAGE_SHIFT);
759 v->arch.monitor_vtable = mpl2e;
761 if ( v->vcpu_id == 0 )
762 alloc_p2m_table(d);
763 }
765 /*
766 * Free the pages for monitor_table and hl2_table
767 */
768 void free_monitor_pagetable(struct vcpu *v)
769 {
770 l2_pgentry_t *mpl2e, hl2e, sl2e;
771 unsigned long mfn;
773 ASSERT( pagetable_get_paddr(v->arch.monitor_table) );
775 mpl2e = v->arch.monitor_vtable;
777 /*
778 * First get the mfn for hl2_table by looking at monitor_table
779 */
780 hl2e = mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)];
781 if ( l2e_get_flags(hl2e) & _PAGE_PRESENT )
782 {
783 mfn = l2e_get_pfn(hl2e);
784 ASSERT(mfn);
785 put_shadow_ref(mfn);
786 }
788 sl2e = mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)];
789 if ( l2e_get_flags(sl2e) & _PAGE_PRESENT )
790 {
791 mfn = l2e_get_pfn(sl2e);
792 ASSERT(mfn);
793 put_shadow_ref(mfn);
794 }
796 if ( v->vcpu_id == 0 )
797 free_p2m_table(v);
799 /*
800 * Then free monitor_table.
801 */
802 mfn = pagetable_get_pfn(v->arch.monitor_table);
803 unmap_domain_page_global(v->arch.monitor_vtable);
804 free_domheap_page(mfn_to_page(mfn));
806 v->arch.monitor_table = mk_pagetable(0);
807 v->arch.monitor_vtable = 0;
808 }
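/*
 * map_p2m_entry():
 * Record mfn as the translation for one guest frame in the leaf level of
 * the p2m table, given the L1 page (l1tab) that covers va.  The leaf page
 * is allocated and zeroed on first use.  Note that the 'gpa' argument is
 * really a gpfn at both call sites; it is only used to index into the
 * leaf page.
 */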
810 static int
811 map_p2m_entry(
812 l1_pgentry_t *l1tab, unsigned long va, unsigned long gpa, unsigned long mfn)
813 {
814 unsigned long *l0tab = NULL;
815 l1_pgentry_t l1e = { 0 };
816 struct page_info *page;
818 l1e = l1tab[l1_table_offset(va)];
819 if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) )
820 {
821 page = alloc_domheap_page(NULL);
822 if ( !page )
823 goto fail;
825 if ( l0tab )
826 unmap_domain_page(l0tab);
827 l0tab = map_domain_page(page_to_mfn(page));
828 memset(l0tab, 0, PAGE_SIZE );
829 l1e = l1tab[l1_table_offset(va)] =
830 l1e_from_page(page, __PAGE_HYPERVISOR);
831 }
832 else if ( l0tab == NULL)
833 l0tab = map_domain_page(l1e_get_pfn(l1e));
835 l0tab[gpa & ((PAGE_SIZE / sizeof (mfn)) - 1) ] = mfn;
837 if ( l0tab )
838 unmap_domain_page(l0tab);
840 return 1;
842 fail:
843 return 0;
844 }
846 int
847 set_p2m_entry(struct domain *d, unsigned long pfn, unsigned long mfn,
848 struct domain_mmap_cache *l2cache,
849 struct domain_mmap_cache *l1cache)
850 {
851 unsigned long tabpfn;
852 l2_pgentry_t *l2, l2e;
853 l1_pgentry_t *l1;
854 struct page_info *l1page;
855 unsigned long va = pfn << PAGE_SHIFT;
856 int error;
858 if ( shadow_mode_external(d) )
859 {
860 tabpfn = pagetable_get_pfn(d->vcpu[0]->arch.monitor_table);
861 va = RO_MPT_VIRT_START + (pfn * sizeof (unsigned long));
862 }
863 else
864 {
865 tabpfn = pagetable_get_pfn(d->arch.phys_table);
866 va = pfn << PAGE_SHIFT;
867 }
869 ASSERT(tabpfn != 0);
870 ASSERT(shadow_lock_is_acquired(d));
872 l2 = map_domain_page_with_cache(tabpfn, l2cache);
874 /*
875 * The following code covers (SHM_translate | SHM_external) mode.
876 */
878 if ( shadow_mode_external(d) )
879 {
880 l1_pgentry_t *l1tab = NULL;
881 l2_pgentry_t l2e;
883 l2e = l2[l2_table_offset(va)];
885 ASSERT( l2e_get_flags(l2e) & _PAGE_PRESENT );
887 l1tab = map_domain_page(l2e_get_pfn(l2e));
888 error = map_p2m_entry(l1tab, va, pfn, mfn);
889 if ( !error )
890 domain_crash_synchronous();
892 unmap_domain_page(l1tab);
893 unmap_domain_page_with_cache(l2, l2cache);
895 return 1;
896 }
898 /*
899 * The following code covers SHM_translate mode.
900 */
901 ASSERT(shadow_mode_translate(d));
903 l2e = l2[l2_table_offset(va)];
904 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
905 {
906 l1page = alloc_domheap_page(NULL);
907 if ( !l1page )
908 {
909 unmap_domain_page_with_cache(l2, l2cache);
910 return 0;
911 }
913 l1 = map_domain_page_with_cache(page_to_mfn(l1page), l1cache);
914 memset(l1, 0, PAGE_SIZE);
915 unmap_domain_page_with_cache(l1, l1cache);
917 l2e = l2e_from_page(l1page, __PAGE_HYPERVISOR);
918 l2[l2_table_offset(va)] = l2e;
919 }
920 unmap_domain_page_with_cache(l2, l2cache);
922 l1 = map_domain_page_with_cache(l2e_get_pfn(l2e), l1cache);
923 l1[l1_table_offset(va)] = (l1_pgentry_t) { mfn };
924 unmap_domain_page_with_cache(l1, l1cache);
926 return 1;
927 }
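/*
 * alloc_p2m_table():
 * Populate the phys-to-machine table with an entry for every page the
 * domain currently owns.  For external modes the table hangs off VCPU 0's
 * monitor table at RO_MPT_VIRT_START; otherwise a fresh page becomes
 * d->arch.phys_table and is filled in directly.
 */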
929 int
930 alloc_p2m_table(struct domain *d)
931 {
932 struct list_head *list_ent;
933 unsigned long va = RO_MPT_VIRT_START; /* phys_to_machine_mapping */
935 l2_pgentry_t *l2tab = NULL;
936 l1_pgentry_t *l1tab = NULL;
937 l2_pgentry_t l2e = { 0 };
938 struct page_info *page;
939 unsigned long gpfn, mfn;
940 int error;
942 if ( pagetable_get_pfn(d->vcpu[0]->arch.monitor_table) )
943 {
944 l2tab = map_domain_page(
945 pagetable_get_pfn(d->vcpu[0]->arch.monitor_table));
946 l2e = l2tab[l2_table_offset(va)];
947 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
948 {
949 page = alloc_domheap_page(NULL);
951 l1tab = map_domain_page(page_to_mfn(page));
952 memset(l1tab, 0, PAGE_SIZE);
953 l2e = l2tab[l2_table_offset(va)] =
954 l2e_from_page(page, __PAGE_HYPERVISOR);
955 }
956 else
957 l1tab = map_domain_page(l2e_get_pfn(l2e));
958 }
959 else
960 {
961 page = alloc_domheap_page(NULL);
963 l1tab = map_domain_page(page_to_mfn(page));
964 memset(l1tab, 0, PAGE_SIZE);
965 d->arch.phys_table = mk_pagetable(page_to_maddr(page));
966 }
968 list_ent = d->page_list.next;
970 for ( gpfn = 0; list_ent != &d->page_list; gpfn++ )
971 {
972 page = list_entry(list_ent, struct page_info, list);
973 mfn = page_to_mfn(page);
975 error = map_p2m_entry(l1tab, va, gpfn, mfn);
976 if ( !error )
977 domain_crash_synchronous();
979 list_ent = frame_table[mfn].list.next;
980 va += sizeof(mfn);
981 }
983 if (l2tab)
984 unmap_domain_page(l2tab);
985 unmap_domain_page(l1tab);
987 return 1;
988 }
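/*
 * free_p2m_table():
 * Tear down the p2m table mapped at RO_MPT_VIRT_START under this VCPU's
 * monitor table, freeing each present leaf page and then the L1 pages
 * that pointed at them.
 */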
990 static void
991 free_p2m_table(struct vcpu *v)
992 {
993 unsigned long va;
994 l2_pgentry_t *l2tab;
995 l1_pgentry_t *l1tab;
996 l2_pgentry_t l2e;
997 l1_pgentry_t l1e;
999 ASSERT ( pagetable_get_pfn(v->arch.monitor_table) );
1001 l2tab = map_domain_page(
1002 pagetable_get_pfn(v->arch.monitor_table));
1004 for ( va = RO_MPT_VIRT_START; va < RO_MPT_VIRT_END; )
1005 {
1006 int i;
1008 l2e = l2tab[l2_table_offset(va)];
1009 if ( l2e_get_flags(l2e) & _PAGE_PRESENT )
1010 {
1011 l1tab = map_domain_page(l2e_get_pfn(l2e));
1012 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++)
1013 {
1014 l1e = l1tab[l1_table_offset(va)];
1016 if ( l1e_get_flags(l1e) & _PAGE_PRESENT )
1017 free_domheap_page(mfn_to_page(l1e_get_pfn(l1e)));
1018 va += PAGE_SIZE;
1019 }
1020 unmap_domain_page(l1tab);
1021 free_domheap_page(mfn_to_page(l2e_get_pfn(l2e)));
1022 }
1023 }
1024 unmap_domain_page(l2tab);
1025 }
1027 int shadow_direct_map_fault(unsigned long vpa, struct cpu_user_regs *regs)
1028 {
1029 struct vcpu *v = current;
1030 struct domain *d = v->domain;
1031 l2_pgentry_t sl2e;
1032 l1_pgentry_t sl1e;
1033 l1_pgentry_t *sple = NULL;
1034 unsigned long mfn, smfn;
1035 struct page_info *page;
1037 /*
1038 * If the faulting address is within the MMIO range, we continue
1039 * on handling the #PF as such.
1040 */
1041 if ( (mfn = get_mfn_from_gpfn(vpa >> PAGE_SHIFT)) == INVALID_MFN )
1042 return 0;
1044 shadow_lock(d);
1046 __direct_get_l2e(v, vpa, &sl2e);
1048 if ( !(l2e_get_flags(sl2e) & _PAGE_PRESENT) )
1049 {
1050 page = alloc_domheap_page(NULL);
1051 if ( !page )
1052 goto nomem;
1054 smfn = page_to_mfn(page);
1055 sl2e = l2e_from_pfn(smfn, __PAGE_HYPERVISOR | _PAGE_USER);
1057 sple = (l1_pgentry_t *)map_domain_page(smfn);
1058 memset(sple, 0, PAGE_SIZE);
1059 __direct_set_l2e(v, vpa, sl2e);
1060 }
1062 if ( !sple )
1063 sple = (l1_pgentry_t *)map_domain_page(l2e_get_pfn(sl2e));
1065 sl1e = sple[l1_table_offset(vpa)];
1067 if ( !(l1e_get_flags(sl1e) & _PAGE_PRESENT) )
1068 {
1069 sl1e = l1e_from_pfn(mfn, __PAGE_HYPERVISOR | _PAGE_USER);
1070 sple[l1_table_offset(vpa)] = sl1e;
1071 }
1073 if (sple)
1074 unmap_domain_page(sple);
1076 shadow_unlock(d);
1077 return EXCRET_fault_fixed;
1079 nomem:
1080 shadow_direct_map_clean(d);
1081 domain_crash_synchronous();
1082 }
1085 int shadow_direct_map_init(struct domain *d)
1086 {
1087 struct page_info *page;
1088 l2_pgentry_t *root;
1090 if ( !(page = alloc_domheap_page(NULL)) )
1091 return 0;
1093 root = map_domain_page(page_to_mfn(page));
1094 memset(root, 0, PAGE_SIZE);
1095 unmap_domain_page(root);
1097 d->arch.phys_table = mk_pagetable(page_to_maddr(page));
1099 return 1;
1100 }
1102 void shadow_direct_map_clean(struct domain *d)
1103 {
1104 int i;
1105 unsigned long mfn;
1106 l2_pgentry_t *l2e;
1108 mfn = pagetable_get_pfn(d->arch.phys_table);
1110 /*
1111 * We may fail very early before direct map is built.
1112 */
1113 if ( !mfn )
1114 return;
1116 l2e = map_domain_page(mfn);
1118 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
1119 {
1120 if ( l2e_get_flags(l2e[i]) & _PAGE_PRESENT )
1121 free_domheap_page(mfn_to_page(l2e_get_pfn(l2e[i])));
1122 }
1123 free_domheap_page(mfn_to_page(mfn));
1125 unmap_domain_page(l2e);
1127 d->arch.phys_table = mk_pagetable(0);
1128 }
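/*
 * __shadow_mode_enable():
 * Switch the domain to the requested shadow mode.  Modes can only be
 * added here, never removed.  Per-VCPU guest/shadow/hl2 vtables and (for
 * external modes) monitor tables are set up, and the shadow hash table,
 * log-dirty bitmap and p2m table are allocated as the new mode requires.
 */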
1130 int __shadow_mode_enable(struct domain *d, unsigned int mode)
1132 struct vcpu *v;
1133 int new_modes = (mode & ~d->arch.shadow_mode);
1135 if(!new_modes) /* Nothing to do - return success */
1136 return 0;
1138 // can't take anything away by calling this function.
1139 ASSERT(!(d->arch.shadow_mode & ~mode));
1141 for_each_vcpu(d, v)
1143 invalidate_shadow_ldt(v);
1145 // We need to set these up for __update_pagetables().
1146 // See the comment there.
1148 /*
1149 * arch.guest_vtable
1150 */
1151 if ( v->arch.guest_vtable &&
1152 (v->arch.guest_vtable != __linear_l2_table) )
1154 unmap_domain_page_global(v->arch.guest_vtable);
1156 if ( (mode & (SHM_translate | SHM_external)) == SHM_translate )
1157 v->arch.guest_vtable = __linear_l2_table;
1158 else
1159 v->arch.guest_vtable = NULL;
1161 /*
1162 * arch.shadow_vtable
1163 */
1164 if ( v->arch.shadow_vtable &&
1165 (v->arch.shadow_vtable != __shadow_linear_l2_table) )
1167 unmap_domain_page_global(v->arch.shadow_vtable);
1169 if ( !(mode & SHM_external) )
1170 v->arch.shadow_vtable = __shadow_linear_l2_table;
1171 else
1172 v->arch.shadow_vtable = NULL;
1174 /*
1175 * arch.hl2_vtable
1176 */
1177 if ( v->arch.hl2_vtable &&
1178 (v->arch.hl2_vtable != __linear_hl2_table) )
1180 unmap_domain_page_global(v->arch.hl2_vtable);
1182 if ( (mode & (SHM_translate | SHM_external)) == SHM_translate )
1183 v->arch.hl2_vtable = __linear_hl2_table;
1184 else
1185 v->arch.hl2_vtable = NULL;
1187 /*
1188 * arch.monitor_table & arch.monitor_vtable
1189 */
1190 if ( v->arch.monitor_vtable )
1192 free_monitor_pagetable(v);
1194 if ( mode & SHM_external )
1196 alloc_monitor_pagetable(v);
1200 if ( new_modes & SHM_enable )
1202 ASSERT( !d->arch.shadow_ht );
1203 d->arch.shadow_ht = xmalloc_array(struct shadow_status, shadow_ht_buckets);
1204 if ( d->arch.shadow_ht == NULL )
1205 goto nomem;
1207 memset(d->arch.shadow_ht, 0,
1208 shadow_ht_buckets * sizeof(struct shadow_status));
1211 if ( new_modes & SHM_log_dirty )
1213 ASSERT( !d->arch.shadow_dirty_bitmap );
1214 d->arch.shadow_dirty_bitmap_size =
1215 (d->shared_info->arch.max_pfn + 63) & ~63;
1216 d->arch.shadow_dirty_bitmap =
1217 xmalloc_array(unsigned long, d->arch.shadow_dirty_bitmap_size /
1218 (8 * sizeof(unsigned long)));
1219 if ( d->arch.shadow_dirty_bitmap == NULL )
1221 d->arch.shadow_dirty_bitmap_size = 0;
1222 goto nomem;
1224 memset(d->arch.shadow_dirty_bitmap, 0,
1225 d->arch.shadow_dirty_bitmap_size/8);
1228 if ( new_modes & SHM_translate )
1230 if ( !(new_modes & SHM_external) )
1232 ASSERT( !pagetable_get_paddr(d->arch.phys_table) );
1233 if ( !alloc_p2m_table(d) )
1235 printk("alloc_p2m_table failed (out-of-memory?)\n");
1236 goto nomem;
1241 // Get rid of any shadow pages from any previous shadow mode.
1242 //
1243 free_shadow_pages(d);
1245 d->arch.shadow_mode = mode;
1247 if ( shadow_mode_refcounts(d) )
1249 struct list_head *list_ent;
1250 struct page_info *page;
1252 /*
1253 * Tear down its counts by disassembling its page-table-based refcounts
1254 * Also remove CR3's gcount/tcount.
1255 * That leaves things like GDTs and LDTs and external refs intact.
1257 * Most pages will be writable tcount=0.
1258 * Some will still be L1 tcount=0 or L2 tcount=0.
1259 * Maybe some pages will be type none tcount=0.
1260 * Pages granted external writable refs (via grant tables?) will
1261 * still have a non-zero tcount. That's OK.
1263 * gcounts will generally be 1 for PGC_allocated.
1264 * GDTs and LDTs will have additional gcounts.
1265 * Any grant-table based refs will still be in the gcount.
1267 * We attempt to grab writable refs to each page, thus setting its type.
1268 * We then immediately put back those type refs.
1270 * Assert that no pages are left with L1/L2/L3/L4 type.
1271 */
1272 audit_adjust_pgtables(d, -1, 1);
1275 for (list_ent = d->page_list.next; list_ent != &d->page_list;
1276 list_ent = page->list.next) {
1278 page = list_entry(list_ent, struct page_info, list);
1280 if ( !get_page_type(page, PGT_writable_page) )
1281 BUG();
1282 put_page_type(page);
1283 /*
1284 * We use tlbflush_timestamp as back pointer to smfn, and need to
1285 * clean up it.
1286 */
1287 if (shadow_mode_external(d))
1288 page->tlbflush_timestamp = 0;
1291 audit_adjust_pgtables(d, 1, 1);
1295 return 0;
1297 nomem:
1298 if ( (new_modes & SHM_enable) )
1300 xfree(d->arch.shadow_ht);
1301 d->arch.shadow_ht = NULL;
1303 if ( (new_modes & SHM_log_dirty) )
1305 xfree(d->arch.shadow_dirty_bitmap);
1306 d->arch.shadow_dirty_bitmap = NULL;
1309 return -ENOMEM;
1312 int shadow_mode_enable(struct domain *d, unsigned int mode)
1314 int rc;
1315 shadow_lock(d);
1316 rc = __shadow_mode_enable(d, mode);
1317 shadow_unlock(d);
1318 return rc;
1321 static void
1322 translate_l1pgtable(struct domain *d, l1_pgentry_t *p2m, unsigned long l1mfn)
1324 int i;
1325 l1_pgentry_t *l1;
1327 l1 = map_domain_page(l1mfn);
1328 for (i = 0; i < L1_PAGETABLE_ENTRIES; i++)
1330 if ( is_guest_l1_slot(i) &&
1331 (l1e_get_flags(l1[i]) & _PAGE_PRESENT) )
1333 unsigned long mfn = l1e_get_pfn(l1[i]);
1334 unsigned long gpfn = mfn_to_gmfn(d, mfn);
1335 ASSERT(l1e_get_pfn(p2m[gpfn]) == mfn);
1336 l1[i] = l1e_from_pfn(gpfn, l1e_get_flags(l1[i]));
1339 unmap_domain_page(l1);
1342 // This is not general enough to handle arbitrary pagetables
1343 // with shared L1 pages, etc., but it is sufficient for bringing
1344 // up dom0.
1345 //
1346 void
1347 translate_l2pgtable(struct domain *d, l1_pgentry_t *p2m, unsigned long l2mfn,
1348 unsigned int type)
1350 int i;
1351 l2_pgentry_t *l2;
1353 ASSERT(shadow_mode_translate(d) && !shadow_mode_external(d));
1355 l2 = map_domain_page(l2mfn);
1356 for (i = 0; i < L2_PAGETABLE_ENTRIES; i++)
1358 if ( is_guest_l2_slot(type, i) &&
1359 (l2e_get_flags(l2[i]) & _PAGE_PRESENT) )
1361 unsigned long mfn = l2e_get_pfn(l2[i]);
1362 unsigned long gpfn = mfn_to_gmfn(d, mfn);
1363 ASSERT(l1e_get_pfn(p2m[gpfn]) == mfn);
1364 l2[i] = l2e_from_pfn(gpfn, l2e_get_flags(l2[i]));
1365 translate_l1pgtable(d, p2m, mfn);
1368 unmap_domain_page(l2);
1371 static void free_shadow_ht_entries(struct domain *d)
1373 struct shadow_status *x, *n;
1375 SH_VLOG("freed tables count=%d l1=%d l2=%d",
1376 d->arch.shadow_page_count, perfc_value(shadow_l1_pages),
1377 perfc_value(shadow_l2_pages));
1379 n = d->arch.shadow_ht_extras;
1380 while ( (x = n) != NULL )
1382 d->arch.shadow_extras_count--;
1383 n = *((struct shadow_status **)(&x[shadow_ht_extra_size]));
1384 xfree(x);
1387 d->arch.shadow_ht_extras = NULL;
1388 d->arch.shadow_ht_free = NULL;
1390 ASSERT(d->arch.shadow_extras_count == 0);
1391 SH_VLOG("freed extras, now %d", d->arch.shadow_extras_count);
1393 if ( d->arch.shadow_dirty_bitmap != NULL )
1395 xfree(d->arch.shadow_dirty_bitmap);
1396 d->arch.shadow_dirty_bitmap = 0;
1397 d->arch.shadow_dirty_bitmap_size = 0;
1400 xfree(d->arch.shadow_ht);
1401 d->arch.shadow_ht = NULL;
1404 static void free_out_of_sync_entries(struct domain *d)
1406 struct out_of_sync_entry *x, *n;
1408 n = d->arch.out_of_sync_extras;
1409 while ( (x = n) != NULL )
1411 d->arch.out_of_sync_extras_count--;
1412 n = *((struct out_of_sync_entry **)(&x[out_of_sync_extra_size]));
1413 xfree(x);
1416 d->arch.out_of_sync_extras = NULL;
1417 d->arch.out_of_sync_free = NULL;
1418 d->arch.out_of_sync = NULL;
1420 ASSERT(d->arch.out_of_sync_extras_count == 0);
1421 FSH_LOG("freed extra out_of_sync entries, now %d",
1422 d->arch.out_of_sync_extras_count);
1425 void __shadow_mode_disable(struct domain *d)
1427 struct vcpu *v;
1428 #ifndef NDEBUG
1429 int i;
1430 #endif
1432 if ( unlikely(!shadow_mode_enabled(d)) )
1433 return;
1435 free_shadow_pages(d);
1436 free_writable_pte_predictions(d);
1438 #ifndef NDEBUG
1439 for ( i = 0; i < shadow_ht_buckets; i++ )
1441 if ( d->arch.shadow_ht[i].gpfn_and_flags != 0 )
1443 printk("%s: d->arch.shadow_ht[%x].gpfn_and_flags=%lx\n",
1444 __FILE__, i, d->arch.shadow_ht[i].gpfn_and_flags);
1445 BUG();
1448 #endif
1450 d->arch.shadow_mode = 0;
1452 free_shadow_ht_entries(d);
1453 free_out_of_sync_entries(d);
1455 for_each_vcpu(d, v)
1456 update_pagetables(v);
1459 static int shadow_mode_table_op(
1460 struct domain *d, dom0_shadow_control_t *sc)
1462 unsigned int op = sc->op;
1463 int i, rc = 0;
1464 struct vcpu *v;
1466 ASSERT(shadow_lock_is_acquired(d));
1468 SH_VLOG("shadow mode table op %lx %lx count %d",
1469 (unsigned long)pagetable_get_pfn(d->vcpu[0]->arch.guest_table), /* XXX SMP */
1470 (unsigned long)pagetable_get_pfn(d->vcpu[0]->arch.shadow_table), /* XXX SMP */
1471 d->arch.shadow_page_count);
1473 shadow_audit(d, 1);
1475 switch ( op )
1477 case DOM0_SHADOW_CONTROL_OP_FLUSH:
1478 free_shadow_pages(d);
1480 d->arch.shadow_fault_count = 0;
1481 d->arch.shadow_dirty_count = 0;
1483 break;
1485 case DOM0_SHADOW_CONTROL_OP_CLEAN:
1486 free_shadow_pages(d);
1488 sc->stats.fault_count = d->arch.shadow_fault_count;
1489 sc->stats.dirty_count = d->arch.shadow_dirty_count;
1491 d->arch.shadow_fault_count = 0;
1492 d->arch.shadow_dirty_count = 0;
1494 if ( (sc->dirty_bitmap == NULL) ||
1495 (d->arch.shadow_dirty_bitmap == NULL) )
1497 rc = -EINVAL;
1498 break;
1501 if(sc->pages > d->arch.shadow_dirty_bitmap_size)
1502 sc->pages = d->arch.shadow_dirty_bitmap_size;
1504 #define chunk (8*1024) /* Transfer and clear in 1kB chunks for L1 cache. */
1505 for ( i = 0; i < sc->pages; i += chunk )
1507 int bytes = ((((sc->pages - i) > chunk) ?
1508 chunk : (sc->pages - i)) + 7) / 8;
1510 if (copy_to_user(
1511 sc->dirty_bitmap + (i/(8*sizeof(unsigned long))),
1512 d->arch.shadow_dirty_bitmap +(i/(8*sizeof(unsigned long))),
1513 bytes))
1515 rc = -EINVAL;
1516 break;
1519 memset(
1520 d->arch.shadow_dirty_bitmap + (i/(8*sizeof(unsigned long))),
1521 0, bytes);
1524 break;
1526 case DOM0_SHADOW_CONTROL_OP_PEEK:
1527 sc->stats.fault_count = d->arch.shadow_fault_count;
1528 sc->stats.dirty_count = d->arch.shadow_dirty_count;
1530 if ( (sc->dirty_bitmap == NULL) ||
1531 (d->arch.shadow_dirty_bitmap == NULL) )
1533 rc = -EINVAL;
1534 break;
1537 if(sc->pages > d->arch.shadow_dirty_bitmap_size)
1538 sc->pages = d->arch.shadow_dirty_bitmap_size;
1540 if (copy_to_user(sc->dirty_bitmap,
1541 d->arch.shadow_dirty_bitmap, (sc->pages+7)/8))
1543 rc = -EINVAL;
1544 break;
1547 break;
1549 default:
1550 rc = -EINVAL;
1551 break;
1554 SH_VLOG("shadow mode table op : page count %d", d->arch.shadow_page_count);
1555 shadow_audit(d, 1);
1557 for_each_vcpu(d,v)
1558 __update_pagetables(v);
1560 return rc;
1563 int shadow_mode_control(struct domain *d, dom0_shadow_control_t *sc)
1565 unsigned int op = sc->op;
1566 int rc = 0;
1567 struct vcpu *v;
1569 if ( unlikely(d == current->domain) )
1571 DPRINTK("Don't try to do a shadow op on yourself!\n");
1572 return -EINVAL;
1575 domain_pause(d);
1577 shadow_lock(d);
1579 switch ( op )
1581 case DOM0_SHADOW_CONTROL_OP_OFF:
1582 if ( shadow_mode_enabled(d) )
1584 __shadow_sync_all(d);
1585 __shadow_mode_disable(d);
1587 break;
1589 case DOM0_SHADOW_CONTROL_OP_ENABLE_TEST:
1590 free_shadow_pages(d);
1591 rc = __shadow_mode_enable(d, SHM_enable);
1592 break;
1594 case DOM0_SHADOW_CONTROL_OP_ENABLE_LOGDIRTY:
1595 free_shadow_pages(d);
1596 rc = __shadow_mode_enable(
1597 d, d->arch.shadow_mode|SHM_enable|SHM_log_dirty);
1598 break;
1600 case DOM0_SHADOW_CONTROL_OP_ENABLE_TRANSLATE:
1601 free_shadow_pages(d);
1602 rc = __shadow_mode_enable(
1603 d, d->arch.shadow_mode|SHM_enable|SHM_refcounts|SHM_translate|SHM_wr_pt_pte);
1604 break;
1606 default:
1607 rc = shadow_mode_enabled(d) ? shadow_mode_table_op(d, sc) : -EINVAL;
1608 break;
1611 shadow_unlock(d);
1613 for_each_vcpu(d,v)
1614 update_pagetables(v);
1616 domain_unpause(d);
1618 return rc;
1621 unsigned long
1622 get_mfn_from_gpfn_foreign(struct domain *d, unsigned long gpfn)
1624 unsigned long va, tabpfn;
1625 l1_pgentry_t *l1, l1e;
1626 l2_pgentry_t *l2, l2e;
1628 ASSERT(shadow_mode_translate(d));
1630 perfc_incrc(get_mfn_from_gpfn_foreign);
1632 if ( shadow_mode_external(d) )
1634 unsigned long mfn;
1635 unsigned long *l0;
1637 va = RO_MPT_VIRT_START + (gpfn * sizeof(mfn));
1639 tabpfn = pagetable_get_pfn(d->vcpu[0]->arch.monitor_table);
1640 if ( !tabpfn )
1641 return INVALID_MFN;
1643 l2 = map_domain_page(tabpfn);
1644 l2e = l2[l2_table_offset(va)];
1645 unmap_domain_page(l2);
1646 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
1647 return INVALID_MFN;
1649 l1 = map_domain_page(l2e_get_pfn(l2e));
1650 l1e = l1[l1_table_offset(va)];
1651 unmap_domain_page(l1);
1652 if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) )
1653 return INVALID_MFN;
1655 l0 = map_domain_page(l1e_get_pfn(l1e));
1656 mfn = l0[gpfn & ((PAGE_SIZE / sizeof(mfn)) - 1)];
1657 unmap_domain_page(l0);
1658 return mfn;
1660 else
1662 va = gpfn << PAGE_SHIFT;
1663 tabpfn = pagetable_get_pfn(d->arch.phys_table);
1664 l2 = map_domain_page(tabpfn);
1665 l2e = l2[l2_table_offset(va)];
1666 unmap_domain_page(l2);
1667 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
1669 printk("%s(d->id=%d, gpfn=%lx) => 0 l2e=%" PRIpte "\n",
1670 __func__, d->domain_id, gpfn, l2e_get_intpte(l2e));
1671 return INVALID_MFN;
1673 l1 = map_domain_page(l2e_get_pfn(l2e));
1674 l1e = l1[l1_table_offset(va)];
1675 unmap_domain_page(l1);
1676 #if 0
1677 printk("%s(d->id=%d, gpfn=%lx) => %lx tabpfn=%lx l2e=%lx l1tab=%lx, l1e=%lx\n",
1678 __func__, d->domain_id, gpfn, l1_pgentry_val(l1e) >> PAGE_SHIFT, tabpfn, l2e, l1tab, l1e);
1679 #endif
1681 return l1e_get_intpte(l1e);
1686 static unsigned long
1687 shadow_hl2_table(struct domain *d, unsigned long gpfn, unsigned long gmfn,
1688 unsigned long smfn)
1690 unsigned long hl2mfn;
1691 l1_pgentry_t *hl2;
1692 l2_pgentry_t *gpgd;
1693 int limit;
1694 int x;
1696 ASSERT(PGT_base_page_table == PGT_l2_page_table);
1698 if ( unlikely(!(hl2mfn = alloc_shadow_page(d, gpfn, gmfn, PGT_hl2_shadow))) )
1700 printk("Couldn't alloc an HL2 shadow for pfn=%lx mfn=%lx\n",
1701 gpfn, gmfn);
1702 BUG(); /* XXX Deal gracefully with failure. */
1705 SH_VVLOG("shadow_hl2_table(gpfn=%lx, gmfn=%lx, smfn=%lx) => %lx",
1706 gpfn, gmfn, smfn, hl2mfn);
1707 perfc_incrc(shadow_hl2_table_count);
1709 hl2 = map_domain_page(hl2mfn);
1711 if ( shadow_mode_external(d) )
1712 limit = L2_PAGETABLE_ENTRIES;
1713 else
1714 limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE;
1716 memset(hl2, 0, limit * sizeof(l1_pgentry_t));
1718 if ( !shadow_mode_external(d) )
1720 memset(&hl2[DOMAIN_ENTRIES_PER_L2_PAGETABLE], 0,
1721 HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
1723 // Setup easy access to the GL2, SL2, and HL2 frames.
1724 //
1725 hl2[l2_table_offset(LINEAR_PT_VIRT_START)] =
1726 l1e_from_pfn(gmfn, __PAGE_HYPERVISOR);
1727 hl2[l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
1728 l1e_from_pfn(smfn, __PAGE_HYPERVISOR);
1729 hl2[l2_table_offset(PERDOMAIN_VIRT_START)] =
1730 l1e_from_pfn(hl2mfn, __PAGE_HYPERVISOR);
1733 gpgd = map_domain_page(gmfn);
1734 for (x = 0; x < DOMAIN_ENTRIES_PER_L2_PAGETABLE; x++)
1735 validate_hl2e_change(d, gpgd[x], &hl2[x]);
1736 unmap_domain_page(gpgd);
1738 unmap_domain_page(hl2);
1740 return hl2mfn;
1743 /*
1744 * This could take and use a snapshot, and validate the entire page at
1745 * once, or it could continue to fault in entries one at a time...
1746 * Might be worth investigating...
1747 */
1748 static unsigned long shadow_l2_table(
1749 struct domain *d, unsigned long gpfn, unsigned long gmfn)
1751 unsigned long smfn;
1752 l2_pgentry_t *spl2e;
1753 int i;
1755 SH_VVLOG("shadow_l2_table(gpfn=%lx, gmfn=%lx)", gpfn, gmfn);
1757 perfc_incrc(shadow_l2_table_count);
1759 if ( unlikely(!(smfn = alloc_shadow_page(d, gpfn, gmfn, PGT_l2_shadow))) )
1761 printk("Couldn't alloc an L2 shadow for pfn=%lx mfn=%lx\n",
1762 gpfn, gmfn);
1763 BUG(); /* XXX Deal gracefully with failure. */
1766 spl2e = (l2_pgentry_t *)map_domain_page(smfn);
1768 /* Install hypervisor and 2x linear p.t. mappings. */
1769 if ( (PGT_base_page_table == PGT_l2_page_table) &&
1770 !shadow_mode_external(d) )
1772 /*
1773 * We could proactively fill in PDEs for pages that are already
1774 * shadowed *and* where the guest PDE has _PAGE_ACCESSED set
1775 * (restriction required for coherence of the accessed bit). However,
1776 * we tried it and it didn't help performance. This is simpler.
1777 */
1778 memset(spl2e, 0, DOMAIN_ENTRIES_PER_L2_PAGETABLE*sizeof(l2_pgentry_t));
1780 /* Install hypervisor and 2x linear p.t. mappings. */
1781 memcpy(&spl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
1782 &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
1783 HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
1785 spl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
1786 l2e_from_pfn(smfn, __PAGE_HYPERVISOR);
1788 for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
1789 spl2e[l2_table_offset(PERDOMAIN_VIRT_START) + i] =
1790 l2e_from_page(virt_to_page(page_get_owner(mfn_to_page(gmfn))->
1791 arch.mm_perdomain_pt) + i,
1792 __PAGE_HYPERVISOR);
1794 if ( shadow_mode_translate(d) ) // NB: not external
1796 unsigned long hl2mfn;
1798 ASSERT(pagetable_get_paddr(d->arch.phys_table));
1799 spl2e[l2_table_offset(RO_MPT_VIRT_START)] =
1800 l2e_from_paddr(pagetable_get_paddr(d->arch.phys_table),
1801 __PAGE_HYPERVISOR);
1803 if ( unlikely(!(hl2mfn = __shadow_status(d, gpfn, PGT_hl2_shadow))) )
1804 hl2mfn = shadow_hl2_table(d, gpfn, gmfn, smfn);
1806 // shadow_mode_translate (but not external) sl2 tables hold a
1807 // ref to their hl2.
1808 //
1809 if ( !get_shadow_ref(hl2mfn) )
1810 BUG();
1812 spl2e[l2_table_offset(LINEAR_PT_VIRT_START)] =
1813 l2e_from_pfn(hl2mfn, __PAGE_HYPERVISOR);
1815 else
1816 spl2e[l2_table_offset(LINEAR_PT_VIRT_START)] =
1817 l2e_from_pfn(gmfn, __PAGE_HYPERVISOR);
1819 else
1821 memset(spl2e, 0, L2_PAGETABLE_ENTRIES*sizeof(l2_pgentry_t));
1824 unmap_domain_page(spl2e);
1826 SH_VLOG("shadow_l2_table(%lx -> %lx)", gmfn, smfn);
1827 return smfn;
1830 void shadow_map_l1_into_current_l2(unsigned long va)
1832 struct vcpu *v = current;
1833 struct domain *d = v->domain;
1834 l1_pgentry_t *gpl1e, *spl1e;
1835 l2_pgentry_t gl2e, sl2e;
1836 unsigned long gl1pfn, gl1mfn, sl1mfn;
1837 int i, init_table = 0;
1839 __guest_get_l2e(v, va, &gl2e);
1840 ASSERT(l2e_get_flags(gl2e) & _PAGE_PRESENT);
1841 gl1pfn = l2e_get_pfn(gl2e);
1843 if ( !(sl1mfn = __shadow_status(d, gl1pfn, PGT_l1_shadow)) )
1845 /* This L1 is NOT already shadowed so we need to shadow it. */
1846 SH_VVLOG("4a: l1 not shadowed");
1848 gl1mfn = gmfn_to_mfn(d, gl1pfn);
1849 if ( unlikely(!VALID_MFN(gl1mfn)) )
1851 // Attempt to use an invalid pfn as an L1 page.
1852 // XXX this needs to be more graceful!
1853 BUG();
1856 if ( unlikely(!(sl1mfn =
1857 alloc_shadow_page(d, gl1pfn, gl1mfn, PGT_l1_shadow))) )
1859 printk("Couldn't alloc an L1 shadow for pfn=%lx mfn=%lx\n",
1860 gl1pfn, gl1mfn);
1861 BUG(); /* XXX Need to deal gracefully with failure. */
1864 perfc_incrc(shadow_l1_table_count);
1865 init_table = 1;
1867 else
1869 /* This L1 is shadowed already, but the L2 entry is missing. */
1870 SH_VVLOG("4b: was shadowed, l2 missing (%lx)", sl1mfn);
1873 #ifndef NDEBUG
1875 l2_pgentry_t old_sl2e;
1876 __shadow_get_l2e(v, va, &old_sl2e);
1877 ASSERT( !(l2e_get_flags(old_sl2e) & _PAGE_PRESENT) );
1879 #endif
1881 if ( !get_shadow_ref(sl1mfn) )
1882 BUG();
1883 l2pde_general(d, &gl2e, &sl2e, sl1mfn);
1884 __guest_set_l2e(v, va, gl2e);
1885 __shadow_set_l2e(v, va, sl2e);
1887 if ( init_table )
1889 l1_pgentry_t sl1e;
1890 int index = l1_table_offset(va);
1891 int min = 1, max = 0;
1893 gpl1e = &(linear_pg_table[l1_linear_offset(va) &
1894 ~(L1_PAGETABLE_ENTRIES-1)]);
1896 spl1e = &(shadow_linear_pg_table[l1_linear_offset(va) &
1897 ~(L1_PAGETABLE_ENTRIES-1)]);
1899 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
1901 l1pte_propagate_from_guest(d, gpl1e[i], &sl1e);
1902 if ( (l1e_get_flags(sl1e) & _PAGE_PRESENT) &&
1903 unlikely(!shadow_get_page_from_l1e(sl1e, d)) )
1904 sl1e = l1e_empty();
1905 if ( l1e_get_flags(sl1e) == 0 )
1907 // First copy entries from 0 until first invalid.
1908 // Then copy entries from index until first invalid.
1909 //
1910 if ( i < index ) {
1911 i = index - 1;
1912 continue;
1914 break;
1916 spl1e[i] = sl1e;
1917 if ( unlikely(i < min) )
1918 min = i;
1919 if ( likely(i > max) )
1920 max = i;
1921 set_guest_back_ptr(d, sl1e, sl1mfn, i);
1924 mfn_to_page(sl1mfn)->tlbflush_timestamp =
1925 SHADOW_ENCODE_MIN_MAX(min, max);
1929 void shadow_invlpg(struct vcpu *v, unsigned long va)
1931 struct domain *d = v->domain;
1932 l1_pgentry_t gpte, spte;
1934 ASSERT(shadow_mode_enabled(d));
1936 shadow_lock(d);
1938 __shadow_sync_va(v, va);
1940 // XXX mafetter: will need to think about 4MB pages...
1942 // It's not strictly necessary to update the shadow here,
1943 // but it might save a fault later.
1944 //
1945 if (__copy_from_user(&gpte, &linear_pg_table[va >> PAGE_SHIFT],
1946 sizeof(gpte))) {
1947 perfc_incrc(shadow_invlpg_faults);
1948 shadow_unlock(d);
1949 return;
1951 l1pte_propagate_from_guest(d, gpte, &spte);
1952 shadow_set_l1e(va, spte, 1);
1954 shadow_unlock(d);
1957 struct out_of_sync_entry *
1958 shadow_alloc_oos_entry(struct domain *d)
1960 struct out_of_sync_entry *f, *extra;
1961 unsigned size, i;
1963 if ( unlikely(d->arch.out_of_sync_free == NULL) )
1965 FSH_LOG("Allocate more fullshadow tuple blocks.");
1967 size = sizeof(void *) + (out_of_sync_extra_size * sizeof(*f));
1968 extra = xmalloc_bytes(size);
1970 /* XXX Should be more graceful here. */
1971 if ( extra == NULL )
1972 BUG();
1974 memset(extra, 0, size);
1976 /* Record the allocation block so it can be correctly freed later. */
1977 d->arch.out_of_sync_extras_count++;
1978 *((struct out_of_sync_entry **)&extra[out_of_sync_extra_size]) =
1979 d->arch.out_of_sync_extras;
1980 d->arch.out_of_sync_extras = &extra[0];
1982 /* Thread a free chain through the newly-allocated nodes. */
1983 for ( i = 0; i < (out_of_sync_extra_size - 1); i++ )
1984 extra[i].next = &extra[i+1];
1985 extra[i].next = NULL;
1987 /* Add the new nodes to the free list. */
1988 d->arch.out_of_sync_free = &extra[0];
1991 /* Allocate a new node from the quicklist. */
1992 f = d->arch.out_of_sync_free;
1993 d->arch.out_of_sync_free = f->next;
1995 return f;
1998 static inline unsigned long
1999 shadow_make_snapshot(
2000 struct domain *d, unsigned long gpfn, unsigned long gmfn)
2002 unsigned long smfn, sl1mfn = 0;
2003 void *original, *snapshot;
2004 u32 min_max = 0;
2005 int min, max, length;
2007 if ( test_and_set_bit(_PGC_out_of_sync, &mfn_to_page(gmfn)->count_info) )
2009 ASSERT(__shadow_status(d, gpfn, PGT_snapshot));
2010 return SHADOW_SNAPSHOT_ELSEWHERE;
2013 perfc_incrc(shadow_make_snapshot);
2015 if ( unlikely(!(smfn = alloc_shadow_page(d, gpfn, gmfn, PGT_snapshot))) )
2017 printk("Couldn't alloc fullshadow snapshot for pfn=%lx mfn=%lx!\n"
2018 "Dom%d snapshot_count_count=%d\n",
2019 gpfn, gmfn, d->domain_id, d->arch.snapshot_page_count);
2020 BUG(); /* XXX FIXME: try a shadow flush to free up some memory. */
2023 if ( !get_shadow_ref(smfn) )
2024 BUG();
2026 if ( shadow_mode_refcounts(d) &&
2027 (shadow_max_pgtable_type(d, gpfn, &sl1mfn) == PGT_l1_shadow) )
2028 min_max = mfn_to_page(sl1mfn)->tlbflush_timestamp;
2029 mfn_to_page(smfn)->tlbflush_timestamp = min_max;
2031 min = SHADOW_MIN(min_max);
2032 max = SHADOW_MAX(min_max);
2033 length = max - min + 1;
2034 perfc_incr_histo(snapshot_copies, length, PT_UPDATES);
2036 min *= sizeof(l1_pgentry_t);
2037 length *= sizeof(l1_pgentry_t);
2039 original = map_domain_page(gmfn);
2040 snapshot = map_domain_page(smfn);
2041 memcpy(snapshot + min, original + min, length);
2042 unmap_domain_page(original);
2043 unmap_domain_page(snapshot);
2045 return smfn;
2048 static void
2049 shadow_free_snapshot(struct domain *d, struct out_of_sync_entry *entry)
2051 void *snapshot;
2053 if ( entry->snapshot_mfn == SHADOW_SNAPSHOT_ELSEWHERE )
2054 return;
2056 // Clear the out_of_sync bit.
2057 //
2058 clear_bit(_PGC_out_of_sync, &mfn_to_page(entry->gmfn)->count_info);
2060 // XXX Need to think about how to protect the domain's
2061 // information less expensively.
2062 //
2063 snapshot = map_domain_page(entry->snapshot_mfn);
2064 memset(snapshot, 0, PAGE_SIZE);
2065 unmap_domain_page(snapshot);
2067 put_shadow_ref(entry->snapshot_mfn);
2070 struct out_of_sync_entry *
2071 __shadow_mark_mfn_out_of_sync(struct vcpu *v, unsigned long gpfn,
2072 unsigned long mfn)
2074 struct domain *d = v->domain;
2075 struct page_info *page = mfn_to_page(mfn);
2076 struct out_of_sync_entry *entry = shadow_alloc_oos_entry(d);
2078 ASSERT(shadow_lock_is_acquired(d));
2079 ASSERT(mfn_valid(mfn));
2081 #ifndef NDEBUG
2083 u32 type = page->u.inuse.type_info & PGT_type_mask;
2084 if ( shadow_mode_refcounts(d) )
2086 ASSERT(type == PGT_writable_page);
2088 else
2090 ASSERT(type && (type < PGT_l4_page_table));
2093 #endif
2095 FSH_LOG("%s(gpfn=%lx, mfn=%lx) c=%08x t=%08lx", __func__,
2096 gpfn, mfn, page->count_info, page->u.inuse.type_info);
2098 // XXX this will require some more thought... Cross-domain sharing and
2099 // modification of page tables? Hmm...
2100 //
2101 if ( d != page_get_owner(page) )
2102 BUG();
2104 perfc_incrc(shadow_mark_mfn_out_of_sync_calls);
2106 entry->v = v;
2107 entry->gpfn = gpfn;
2108 entry->gmfn = mfn;
2109 entry->writable_pl1e = -1;
2111 #if SHADOW_DEBUG
2112 mark_shadows_as_reflecting_snapshot(d, gpfn);
2113 #endif
2115 // increment guest's ref count to represent the entry in the
2116 // full shadow out-of-sync list.
2117 //
2118 get_page(page, d);
2120 return entry;
2123 struct out_of_sync_entry *
2124 shadow_mark_mfn_out_of_sync(struct vcpu *v, unsigned long gpfn,
2125 unsigned long mfn)
2127 struct out_of_sync_entry *entry =
2128 __shadow_mark_mfn_out_of_sync(v, gpfn, mfn);
2129 struct domain *d = v->domain;
2131 entry->snapshot_mfn = shadow_make_snapshot(d, gpfn, mfn);
2132 // Add to the out-of-sync list
2133 //
2134 entry->next = d->arch.out_of_sync;
2135 d->arch.out_of_sync = entry;
2137 return entry;
2140 void shadow_mark_va_out_of_sync(
2141 struct vcpu *v, unsigned long gpfn, unsigned long mfn, unsigned long va)
2143 struct out_of_sync_entry *entry =
2144 __shadow_mark_mfn_out_of_sync(v, gpfn, mfn);
2145 l2_pgentry_t sl2e;
2146 struct domain *d = v->domain;
2148 // We need the address of shadow PTE that maps @va.
2149 // It might not exist yet. Make sure it's there.
2150 //
2151 __shadow_get_l2e(v, va, &sl2e);
2152 if ( !(l2e_get_flags(sl2e) & _PAGE_PRESENT) )
2154 // either this L1 isn't shadowed yet, or the shadow isn't linked into
2155 // the current L2.
2156 shadow_map_l1_into_current_l2(va);
2157 __shadow_get_l2e(v, va, &sl2e);
2159 ASSERT(l2e_get_flags(sl2e) & _PAGE_PRESENT);
2161 entry->snapshot_mfn = shadow_make_snapshot(d, gpfn, mfn);
2162 // NB: this is stored as a machine address.
2163 entry->writable_pl1e =
2164 l2e_get_paddr(sl2e) | (sizeof(l1_pgentry_t) * l1_table_offset(va));
2165 ASSERT( !(entry->writable_pl1e & (sizeof(l1_pgentry_t)-1)) );
2166 entry->va = va;
2168 // Increment shadow's page count to represent the reference
2169 // inherent in entry->writable_pl1e
2170 //
2171 if ( !get_shadow_ref(l2e_get_pfn(sl2e)) )
2172 BUG();
2174 // Add to the out-of-sync list
2175 //
2176 entry->next = d->arch.out_of_sync;
2177 d->arch.out_of_sync = entry;
2179 FSH_LOG("%s(va=%lx -> writable_pl1e=%lx)",
2180 __func__, va, entry->writable_pl1e);
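/*
 * Editorial note: this is the va-based variant of the marking path above.
 * shadow_mark_mfn_out_of_sync() only snapshots the page and chains the entry
 * onto d->arch.out_of_sync; this routine additionally records, in
 * entry->writable_pl1e, the machine address of the shadow L1 entry that
 * grants the writable mapping of @va, so __shadow_sync_all() can later
 * revoke write access without having to search for that entry.
 */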
2183 /*
2184 * Returns 1 if the snapshot for @gmfn exists and its @index'th entry matches.
2185 * Returns 0 otherwise.
2186 */
2187 static int snapshot_entry_matches(
2188 struct domain *d, l1_pgentry_t *guest_pt,
2189 unsigned long gpfn, unsigned index)
2191 unsigned long smfn = __shadow_status(d, gpfn, PGT_snapshot);
2192 l1_pgentry_t *snapshot, gpte; // could be L1s or L2s or ...
2193 int entries_match;
2195 perfc_incrc(snapshot_entry_matches_calls);
2197 if ( !smfn )
2198 return 0;
2200 snapshot = map_domain_page(smfn);
2202 if (__copy_from_user(&gpte, &guest_pt[index],
2203 sizeof(gpte))) {
2204 unmap_domain_page(snapshot);
2205 return 0;
2208     // This could probably be smarter, but this is sufficient for
2209     // our current needs.
2210 //
2211 entries_match = !l1e_has_changed(gpte, snapshot[index],
2212 PAGE_FLAG_MASK);
2214 unmap_domain_page(snapshot);
2216 #ifdef PERF_COUNTERS
2217 if ( entries_match )
2218 perfc_incrc(snapshot_entry_matches_true);
2219 #endif
2221 return entries_match;
2224 /*
2225 * Returns 1 if va's shadow mapping is out-of-sync.
2226 * Returns 0 otherwise.
2227 */
2228 int __shadow_out_of_sync(struct vcpu *v, unsigned long va)
2230 struct domain *d = v->domain;
2231 unsigned long l2mfn = pagetable_get_pfn(v->arch.guest_table);
2232 unsigned long l2pfn = mfn_to_gmfn(d, l2mfn);
2233 l2_pgentry_t l2e;
2234 unsigned long l1pfn, l1mfn;
2236 ASSERT(shadow_lock_is_acquired(d));
2237 ASSERT(VALID_M2P(l2pfn));
2239 perfc_incrc(shadow_out_of_sync_calls);
2241 if ( page_out_of_sync(mfn_to_page(l2mfn)) &&
2242 !snapshot_entry_matches(d, (l1_pgentry_t *)v->arch.guest_vtable,
2243 l2pfn, l2_table_offset(va)) )
2244 return 1;
2246 __guest_get_l2e(v, va, &l2e);
2247 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
2248 return 0;
2250 l1pfn = l2e_get_pfn(l2e);
2251 l1mfn = gmfn_to_mfn(d, l1pfn);
2253 // If the l1 pfn is invalid, it can't be out of sync...
2254 if ( !VALID_MFN(l1mfn) )
2255 return 0;
2257 if ( page_out_of_sync(mfn_to_page(l1mfn)) &&
2258 !snapshot_entry_matches(
2259 d, &linear_pg_table[l1_linear_offset(va) & ~(L1_PAGETABLE_ENTRIES-1)],
2260 l1pfn, l1_table_offset(va)) )
2261 return 1;
2263 return 0;
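/*
 * Editorial note: a va is reported out-of-sync if either its guest L2 page,
 * or the guest L1 page that the relevant L2 entry points at, is marked
 * out-of-sync *and* the corresponding entry now differs from the snapshot
 * taken when the page was marked (see snapshot_entry_matches() above).
 */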
2266 #define GPFN_TO_GPTEPAGE(_gpfn) ((_gpfn) / (PAGE_SIZE / sizeof(l1_pgentry_t)))
2267 static inline unsigned long
2268 predict_writable_pte_page(struct domain *d, unsigned long gpfn)
2270 return __shadow_status(d, GPFN_TO_GPTEPAGE(gpfn), PGT_writable_pred);
2273 static inline void
2274 increase_writable_pte_prediction(struct domain *d, unsigned long gpfn, unsigned long prediction)
2276 unsigned long score = prediction & PGT_score_mask;
2277 int create = (score == 0);
2279 // saturating addition
2280 score = (score + (1u << PGT_score_shift)) & PGT_score_mask;
2281 score = score ? score : PGT_score_mask;
2283 prediction = (prediction & PGT_mfn_mask) | score;
2285 //printk("increase gpfn=%lx pred=%lx create=%d\n", gpfn, prediction, create);
2286 set_shadow_status(d, GPFN_TO_GPTEPAGE(gpfn), 0, prediction, PGT_writable_pred);
2288 if ( create )
2289 perfc_incr(writable_pte_predictions);
2292 static inline void
2293 decrease_writable_pte_prediction(struct domain *d, unsigned long gpfn, unsigned long prediction)
2295 unsigned long score = prediction & PGT_score_mask;
2296 ASSERT(score);
2298 // divide score by 2... We don't like bad predictions.
2299 //
2300 score = (score >> 1) & PGT_score_mask;
2302 prediction = (prediction & PGT_mfn_mask) | score;
2304 //printk("decrease gpfn=%lx pred=%lx score=%lx\n", gpfn, prediction, score);
2306 if ( score )
2307 set_shadow_status(d, GPFN_TO_GPTEPAGE(gpfn), 0, prediction, PGT_writable_pred);
2308 else
2310 delete_shadow_status(d, GPFN_TO_GPTEPAGE(gpfn), 0, PGT_writable_pred);
2311 perfc_decr(writable_pte_predictions);
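/*
 * Editorial note: the helpers above keep, under PGT_writable_pred, a
 * prediction keyed by GPFN_TO_GPTEPAGE(gpfn) with a saturating score in the
 * PGT_score bits: increase_writable_pte_prediction() bumps the score
 * (saturating at PGT_score_mask), decrease_writable_pte_prediction() halves
 * it, and the entry is deleted once the score reaches zero.  The sketch
 * below is purely illustrative; its function name is made up, it is not
 * compiled, and the real callers live elsewhere in this file.
 */
#if 0 /* illustrative sketch only -- not compiled */
static void example_update_prediction(struct domain *d, unsigned long gpfn,
                                      int was_useful)
{
    /* Current prediction word for the PTE page that holds gpfn (0 if none). */
    unsigned long prediction = predict_writable_pte_page(d, gpfn);

    if ( was_useful )
        /* Strengthen (or create) the prediction. */
        increase_writable_pte_prediction(d, gpfn, prediction);
    else if ( prediction & PGT_score_mask )
        /* Weaken an existing prediction; it is removed when the score hits 0. */
        decrease_writable_pte_prediction(d, gpfn, prediction);
}
#endif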
2315 static void
2316 free_writable_pte_predictions(struct domain *d)
2318 int i;
2319 struct shadow_status *x;
2321 for ( i = 0; i < shadow_ht_buckets; i++ )
2323 u32 count;
2324 unsigned long *gpfn_list;
2326 /* Skip empty buckets. */
2327 if ( d->arch.shadow_ht[i].gpfn_and_flags == 0 )
2328 continue;
2330 count = 0;
2331 for ( x = &d->arch.shadow_ht[i]; x != NULL; x = x->next )
2332 if ( (x->gpfn_and_flags & PGT_type_mask) == PGT_writable_pred )
2333 count++;
2335 gpfn_list = xmalloc_array(unsigned long, count);
2336 count = 0;
2337 for ( x = &d->arch.shadow_ht[i]; x != NULL; x = x->next )
2338 if ( (x->gpfn_and_flags & PGT_type_mask) == PGT_writable_pred )
2339 gpfn_list[count++] = x->gpfn_and_flags & PGT_mfn_mask;
2341 while ( count )
2343 count--;
2344 /* delete_shadow_status() may do a shadow_audit(), so we need to
2345 * keep an accurate count of writable_pte_predictions to keep it
2346 * happy.
2347 */
2348 delete_shadow_status(d, gpfn_list[count], 0, PGT_writable_pred);
2349 perfc_decr(writable_pte_predictions);
2352 xfree(gpfn_list);
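/*
 * Editorial note: the matching gpfns are collected into a temporary array
 * first and only deleted afterwards, presumably because
 * delete_shadow_status() can modify the very hash chain that the first pass
 * is walking.
 */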
2356 static int fix_entry(
2357 struct domain *d,
2358 l1_pgentry_t *pt, u32 *found, int is_l1_shadow, u32 max_refs_to_find)
2360 l1_pgentry_t old = *pt;
2361 l1_pgentry_t new = old;
2363 l1e_remove_flags(new,_PAGE_RW);
2364 if ( is_l1_shadow && !shadow_get_page_from_l1e(new, d) )
2365 BUG();
2366 (*found)++;
2367 *pt = new;
2368 if ( is_l1_shadow )
2369 shadow_put_page_from_l1e(old, d);
2371 return (*found == max_refs_to_find);
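/*
 * Editorial note: fix_entry() downgrades one shadow L1 entry to read-only,
 * re-deriving the reference counts via shadow_get_page_from_l1e() /
 * shadow_put_page_from_l1e() when the containing page really is an L1
 * shadow.  It bumps *found and returns true once max_refs_to_find writable
 * references have been fixed, which lets the callers below stop scanning
 * early.
 */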
2374 static u32 remove_all_write_access_in_ptpage(
2375 struct domain *d, unsigned long pt_pfn, unsigned long pt_mfn,
2376 unsigned long readonly_gpfn, unsigned long readonly_gmfn,
2377 u32 max_refs_to_find, unsigned long prediction)
2379 l1_pgentry_t *pt = map_domain_page(pt_mfn);
2380 l1_pgentry_t match;
2381 unsigned long flags = _PAGE_RW | _PAGE_PRESENT;
2382 int i;
2383 u32 found = 0;
2384 int is_l1_shadow =
2385 ((mfn_to_page(pt_mfn)->u.inuse.type_info & PGT_type_mask) ==
2386 PGT_l1_shadow);
2388 match = l1e_from_pfn(readonly_gmfn, flags);
2390 if ( shadow_mode_external(d) ) {
2391 i = (mfn_to_page(readonly_gmfn)->u.inuse.type_info & PGT_va_mask)
2392 >> PGT_va_shift;
2394 if ( (i >= 0 && i < L1_PAGETABLE_ENTRIES) &&
2395 !l1e_has_changed(pt[i], match, flags) &&
2396 fix_entry(d, &pt[i], &found, is_l1_shadow, max_refs_to_find) &&
2397 !prediction )
2398 goto out;
2401 for (i = 0; i < L1_PAGETABLE_ENTRIES; i++)
2403 if ( unlikely(!l1e_has_changed(pt[i], match, flags)) &&
2404 fix_entry(d, &pt[i], &found, is_l1_shadow, max_refs_to_find) )
2405 break;
2408 out:
2409 unmap_domain_page(pt);
2411 return found;
2414 int shadow_remove_all_write_access(
2415 struct domain *d, unsigned long readonly_gpfn, unsigned long readonly_gmfn)
2417 int i;
2418 struct shadow_status *a;
2419 u32 found = 0, write_refs;
2420 unsigned long predicted_smfn;
2422 ASSERT(shadow_lock_is_acquired(d));
2423 ASSERT(VALID_MFN(readonly_gmfn));
2425 perfc_incrc(remove_write_access);
2427 // If it's not a writable page, then no writable refs can be outstanding.
2428 //
2429 if ( (mfn_to_page(readonly_gmfn)->u.inuse.type_info & PGT_type_mask) !=
2430 PGT_writable_page )
2432 perfc_incrc(remove_write_not_writable);
2433 return 1;
2436 // How many outstanding writable PTEs for this page are there?
2437 //
2438 write_refs =
2439 (mfn_to_page(readonly_gmfn)->u.inuse.type_info & PGT_count_mask);
2440 if ( write_refs && MFN_PINNED(readonly_gmfn) )
2442 write_refs--;
2445 if ( write_refs == 0 )
2447 perfc_incrc(remove_write_no_work);
2448 return 1;
2451 if ( shadow_mode_external(d) ) {
2452 if (--write_refs == 0)
2453 return 0;
2455 // Use the back pointer to locate the shadow page that can contain
2456 // the PTE of interest
2457 if ( (predicted_smfn = mfn_to_page(readonly_gmfn)->tlbflush_timestamp) ) {
2458 found += remove_all_write_access_in_ptpage(
2459 d, predicted_smfn, predicted_smfn, readonly_gpfn, readonly_gmfn, write_refs, 0);
2460 if ( found == write_refs )
2461 return 0;
2465 // Search all the shadow L1 page tables...
2466 //
2467 for (i = 0; i < shadow_ht_buckets; i++)
2469 a = &d->arch.shadow_ht[i];
2470 while ( a && a->gpfn_and_flags )
2472 if ( (a->gpfn_and_flags & PGT_type_mask) == PGT_l1_shadow )
2474 found += remove_all_write_access_in_ptpage(d, a->gpfn_and_flags & PGT_mfn_mask, a->smfn, readonly_gpfn, readonly_gmfn, write_refs - found, a->gpfn_and_flags & PGT_mfn_mask);
2475 if ( found == write_refs )
2476 return 0;
2479 a = a->next;
2483 FSH_LOG("%s: looking for %d refs, found %d refs",
2484 __func__, write_refs, found);
2486 return 0;
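/*
 * Editorial note: the search above is two-tier.  In external shadow mode the
 * guest page's tlbflush_timestamp is reused as a back pointer to the one
 * shadow L1 most likely to hold the writable mapping; only if that hint does
 * not account for all the outstanding writable references does the code fall
 * back to walking every L1 shadow in the hash table.
 */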
2489 static u32 remove_all_access_in_page(
2490 struct domain *d, unsigned long l1mfn, unsigned long forbidden_gmfn)
2492 l1_pgentry_t *pl1e = map_domain_page(l1mfn);
2493 l1_pgentry_t match, ol2e;
2494 unsigned long flags = _PAGE_PRESENT;
2495 int i;
2496 u32 count = 0;
2497 int is_l1_shadow =
2498 ((mfn_to_page(l1mfn)->u.inuse.type_info & PGT_type_mask) ==
2499 PGT_l1_shadow);
2501 match = l1e_from_pfn(forbidden_gmfn, flags);
2503 for (i = 0; i < L1_PAGETABLE_ENTRIES; i++)
2505 if ( l1e_has_changed(pl1e[i], match, flags) )
2506 continue;
2508 ol2e = pl1e[i];
2509 pl1e[i] = l1e_empty();
2510 count++;
2512 if ( is_l1_shadow )
2513 shadow_put_page_from_l1e(ol2e, d);
2514 else /* must be an hl2 page */
2515 put_page(mfn_to_page(forbidden_gmfn));
2518 unmap_domain_page(pl1e);
2520 return count;
2523 u32 shadow_remove_all_access(struct domain *d, unsigned long forbidden_gmfn)
2525 int i;
2526 struct shadow_status *a;
2527 u32 count = 0;
2529 if ( unlikely(!shadow_mode_enabled(d)) )
2530 return 0;
2532 ASSERT(shadow_lock_is_acquired(d));
2533 perfc_incrc(remove_all_access);
2535 for (i = 0; i < shadow_ht_buckets; i++)
2537 a = &d->arch.shadow_ht[i];
2538 while ( a && a->gpfn_and_flags )
2540 switch (a->gpfn_and_flags & PGT_type_mask)
2542 case PGT_l1_shadow:
2543 case PGT_l2_shadow:
2544 case PGT_l3_shadow:
2545 case PGT_l4_shadow:
2546 case PGT_hl2_shadow:
2547 count += remove_all_access_in_page(d, a->smfn, forbidden_gmfn);
2548 break;
2549 case PGT_snapshot:
2550 case PGT_writable_pred:
2551 // these can't hold refs to the forbidden page
2552 break;
2553 default:
2554 BUG();
2557 a = a->next;
2561 return count;
2564 static int resync_all(struct domain *d, u32 stype)
2566 struct out_of_sync_entry *entry;
2567 unsigned i;
2568 unsigned long smfn;
2569 void *guest, *shadow, *snapshot;
2570 int need_flush = 0, external = shadow_mode_external(d);
2571 int unshadow;
2572 int changed;
2573 u32 min_max_shadow, min_max_snapshot;
2574 int min_shadow, max_shadow, min_snapshot, max_snapshot;
2575 struct vcpu *v;
2577 ASSERT(shadow_lock_is_acquired(d));
2579 for ( entry = d->arch.out_of_sync; entry; entry = entry->next)
2581 if ( entry->snapshot_mfn == SHADOW_SNAPSHOT_ELSEWHERE )
2582 continue;
2584 smfn = __shadow_status(d, entry->gpfn, stype);
2586 if ( !smfn )
2588 // For heavy weight shadows: no need to update refcounts if
2589 // there's no shadow page.
2590 //
2591 if ( shadow_mode_refcounts(d) )
2592 continue;
2594             // For light weight shadows: we only need to resync the refcounts
2595             // to the new contents of the guest page iff it has the right
2596             // page type.
2597 //
2598 if ( stype != ( mfn_to_page(entry->gmfn)->u.inuse.type_info & PGT_type_mask) )
2599 continue;
2602 FSH_LOG("resyncing t=%08x gpfn=%lx gmfn=%lx smfn=%lx snapshot_mfn=%lx",
2603 stype, entry->gpfn, entry->gmfn, smfn, entry->snapshot_mfn);
2605 // Compare guest's new contents to its snapshot, validating
2606 // and updating its shadow as appropriate.
2607 //
2608 guest = map_domain_page(entry->gmfn);
2609 snapshot = map_domain_page(entry->snapshot_mfn);
2611 if ( smfn )
2612 shadow = map_domain_page(smfn);
2613 else
2614 shadow = NULL;
2616 unshadow = 0;
2618 switch ( stype ) {
2619 case PGT_l1_shadow:
2621 l1_pgentry_t *guest1 = guest;
2622 l1_pgentry_t *shadow1 = shadow;
2623 l1_pgentry_t *snapshot1 = snapshot;
2624 int unshadow_l1 = 0;
2626 ASSERT(shadow_mode_write_l1(d) ||
2627 shadow_mode_write_all(d) || shadow_mode_wr_pt_pte(d));
2629 if ( !shadow_mode_refcounts(d) )
2630 revalidate_l1(d, guest1, snapshot1);
2632 if ( !smfn )
2633 break;
2635 min_max_shadow = mfn_to_page(smfn)->tlbflush_timestamp;
2636 min_shadow = SHADOW_MIN(min_max_shadow);
2637 max_shadow = SHADOW_MAX(min_max_shadow);
2639 min_max_snapshot =
2640 mfn_to_page(entry->snapshot_mfn)->tlbflush_timestamp;
2641 min_snapshot = SHADOW_MIN(min_max_snapshot);
2642 max_snapshot = SHADOW_MAX(min_max_snapshot);
2644 changed = 0;
2646 for ( i = min_shadow; i <= max_shadow; i++ )
2648 if ( (i < min_snapshot) || (i > max_snapshot) ||
2649 l1e_has_changed(guest1[i], snapshot1[i], PAGE_FLAG_MASK) )
2651 int error;
2653 error = validate_pte_change(d, guest1[i], &shadow1[i]);
2654 if ( error == -1 )
2655 unshadow_l1 = 1;
2656 else {
2657 need_flush |= error;
2658 set_guest_back_ptr(d, shadow1[i], smfn, i);
2661 // can't update snapshots of linear page tables -- they
2662 // are used multiple times...
2663 //
2664 // snapshot[i] = new_pte;
2665 changed++;
2668 perfc_incrc(resync_l1);
2669 perfc_incr_histo(wpt_updates, changed, PT_UPDATES);
2670 perfc_incr_histo(l1_entries_checked, max_shadow - min_shadow + 1, PT_UPDATES);
2671 if (unshadow_l1) {
2672 l2_pgentry_t l2e;
2674 __shadow_get_l2e(entry->v, entry->va, &l2e);
2675 if (l2e_get_flags(l2e) & _PAGE_PRESENT) {
2676 put_shadow_ref(l2e_get_pfn(l2e));
2677 l2e = l2e_empty();
2678 __shadow_set_l2e(entry->v, entry->va, l2e);
2680 if (entry->v == current)
2681 need_flush = 1;
2685 break;
2687 case PGT_l2_shadow:
2689 int max = -1;
2691 l2_pgentry_t *guest2 = guest;
2692 l2_pgentry_t *shadow2 = shadow;
2693 l2_pgentry_t *snapshot2 = snapshot;
2695 ASSERT(shadow_mode_write_all(d) || shadow_mode_wr_pt_pte(d));
2696 BUG_ON(!shadow_mode_refcounts(d)); // not yet implemented
2698 changed = 0;
2699 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
2701 l2_pgentry_t new_pde = guest2[i];
2703 if ( !is_guest_l2_slot(0,i) && !external )
2704 continue;
2706 if ( l2e_has_changed(new_pde, snapshot2[i], PAGE_FLAG_MASK))
2708 need_flush |= validate_pde_change(d, new_pde, &shadow2[i]);
2710 // can't update snapshots of linear page tables -- they
2711 // are used multiple times...
2712 //
2713 // snapshot[i] = new_pde;
2715 changed++;
2717 if ( l2e_get_intpte(new_pde) != 0 ) /* FIXME: check flags? */
2718 max = i;
2720 // XXX - This hack works for linux guests.
2721 // Need a better solution long term.
2722 if ( !(l2e_get_flags(new_pde) & _PAGE_PRESENT) &&
2723 unlikely(l2e_get_intpte(new_pde) != 0) &&
2724 !unshadow && MFN_PINNED(smfn) )
2725 unshadow = 1;
2727 if ( max == -1 )
2728 unshadow = 1;
2729 perfc_incrc(resync_l2);
2730 perfc_incr_histo(shm_l2_updates, changed, PT_UPDATES);
2731 break;
2733 case PGT_hl2_shadow:
2735 l2_pgentry_t *guest2 = guest;
2736 l2_pgentry_t *snapshot2 = snapshot;
2737 l1_pgentry_t *shadow2 = shadow;
2739 ASSERT(shadow_mode_write_all(d) || shadow_mode_wr_pt_pte(d));
2740 BUG_ON(!shadow_mode_refcounts(d)); // not yet implemented
2742 changed = 0;
2743 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
2745 l2_pgentry_t new_pde = guest2[i];
2747 if ( !is_guest_l2_slot(0, i) && !external )
2748 continue;
2750 if ( l2e_has_changed(new_pde, snapshot2[i], PAGE_FLAG_MASK) )
2752 need_flush |= validate_hl2e_change(d, new_pde, &shadow2[i]);
2754 // can't update snapshots of linear page tables -- they
2755 // are used multiple times...
2756 //
2757 // snapshot[i] = new_pde;
2759 changed++;
2762 perfc_incrc(resync_hl2);
2763 perfc_incr_histo(shm_hl2_updates, changed, PT_UPDATES);
2764 break;
2766 default:
2767 BUG();
2770 if ( smfn )
2771 unmap_domain_page(shadow);
2772 unmap_domain_page(snapshot);
2773 unmap_domain_page(guest);
2775 if ( unlikely(unshadow) )
2777 for_each_vcpu(d, v)
2778 if(smfn == pagetable_get_pfn(v->arch.shadow_table))
2779 return need_flush;
2780 perfc_incrc(unshadow_l2_count);
2781 shadow_unpin(smfn);
2782 if ( unlikely(shadow_mode_external(d)) )
2784 unsigned long hl2mfn;
2786 if ( (hl2mfn = __shadow_status(d, entry->gpfn, PGT_hl2_shadow)) &&
2787 MFN_PINNED(hl2mfn) )
2788 shadow_unpin(hl2mfn);
2793 return need_flush;
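/*
 * Editorial note: resync_all() walks the domain's out-of-sync list and, for
 * each entry that has a shadow of type @stype, revalidates exactly those
 * shadow entries whose guest values have changed relative to the snapshot
 * (bounded by the packed min/max ranges for L1 shadows).  It can also
 * decide, via the heuristics above, to unpin and unshadow an L2 altogether,
 * and it returns whether the caller needs to flush the TLB.
 */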
2796 void __shadow_sync_all(struct domain *d)
2798 struct out_of_sync_entry *entry;
2799 int need_flush = 0;
2800 l1_pgentry_t *ppte, opte, npte;
2801 cpumask_t other_vcpus_mask;
2803 perfc_incrc(shadow_sync_all);
2805 ASSERT(shadow_lock_is_acquired(d));
2807 // First, remove all write permissions to the page tables
2808 //
2809 for ( entry = d->arch.out_of_sync; entry; entry = entry->next)
2811 // Skip entries that have low bits set... Those aren't
2812 // real PTEs.
2813 //
2814 if ( entry->writable_pl1e & (sizeof(l1_pgentry_t)-1) )
2815 continue;
2817 ppte = (l1_pgentry_t *)(
2818 (char *)map_domain_page(entry->writable_pl1e >> PAGE_SHIFT) +
2819 (entry->writable_pl1e & ~PAGE_MASK));
2820 opte = npte = *ppte;
2821 l1e_remove_flags(npte, _PAGE_RW);
2823 if ( (l1e_get_flags(npte) & _PAGE_PRESENT) &&
2824 !shadow_get_page_from_l1e(npte, d) )
2825 BUG();
2826 *ppte = npte;
2827 set_guest_back_ptr(d, npte, (entry->writable_pl1e) >> PAGE_SHIFT,
2828 (entry->writable_pl1e & ~PAGE_MASK)/sizeof(l1_pgentry_t));
2829 shadow_put_page_from_l1e(opte, d);
2831 unmap_domain_page(ppte);
2834 /* Other VCPUs mustn't use the revoked writable mappings. */
2835 other_vcpus_mask = d->domain_dirty_cpumask;
2836 cpu_clear(smp_processor_id(), other_vcpus_mask);
2837 flush_tlb_mask(other_vcpus_mask);
2839 /* Flush ourself later. */
2840 need_flush = 1;
2842 /* Second, resync all L1 pages, then L2 pages, etc... */
2843 need_flush |= resync_all(d, PGT_l1_shadow);
2844 if ( shadow_mode_translate(d) )
2845 need_flush |= resync_all(d, PGT_hl2_shadow);
2846 need_flush |= resync_all(d, PGT_l2_shadow);
2848 if ( need_flush && !unlikely(shadow_mode_external(d)) )
2849 local_flush_tlb();
2851 free_out_of_sync_state(d);
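/*
 * Editorial note: shadow_fault() below tests the raw x86 page-fault error
 * code: bit 0 (value 1) means the PTE was present, bit 1 (value 2) means the
 * access was a write, and bit 2 (value 4) means it came from user mode.  The
 * hypothetical helpers in the sketch below merely restate those tests; they
 * are not part of this file and are not compiled.
 */
#if 0 /* illustrative sketch only -- not compiled */
static inline int example_pf_is_write(const struct cpu_user_regs *regs)
{
    return (regs->error_code & 2) != 0;   /* bit 1: write access */
}

static inline int example_pf_from_user(const struct cpu_user_regs *regs)
{
    return (regs->error_code & 4) != 0;   /* bit 2: fault raised in user mode */
}
#endif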
2854 int shadow_fault(unsigned long va, struct cpu_user_regs *regs)
2856 l1_pgentry_t gpte, spte, orig_gpte;
2857 struct vcpu *v = current;
2858 struct domain *d = v->domain;
2859 l2_pgentry_t gpde;
2861 spte = l1e_empty();
2863 SH_VVLOG("shadow_fault( va=%lx, code=%lu )",
2864 va, (unsigned long)regs->error_code);
2865 perfc_incrc(shadow_fault_calls);
2867 check_pagetable(v, "pre-sf");
2869 /*
2870 * Don't let someone else take the guest's table pages out-of-sync.
2871 */
2872 shadow_lock(d);
2874     /*
2875      * STEP 1. Bring va's shadow mapping back in sync, in case this fault
2876      * was caused by an out-of-sync page table entry rather than being a
2877      * fault we should pass on to the guest.
2878      */
2879 __shadow_sync_va(v, va);
2881 /*
2882 * STEP 2. Check the guest PTE.
2883 */
2884 __guest_get_l2e(v, va, &gpde);
2885 if ( unlikely(!(l2e_get_flags(gpde) & _PAGE_PRESENT)) )
2887 SH_VVLOG("shadow_fault - EXIT: L2 not present (%x)",
2888 l2e_get_intpte(gpde));
2889 perfc_incrc(shadow_fault_bail_pde_not_present);
2890 goto fail;
2893 // This can't fault because we hold the shadow lock and we've ensured that
2894 // the mapping is in-sync, so the check of the PDE's present bit, above,
2895 // covers this access.
2896 //
2897 orig_gpte = gpte = linear_pg_table[l1_linear_offset(va)];
2898 if ( unlikely(!(l1e_get_flags(gpte) & _PAGE_PRESENT)) )
2900 SH_VVLOG("shadow_fault - EXIT: gpte not present (%" PRIpte ") (gpde %" PRIpte ")",
2901 l1e_get_intpte(gpte),
2902 l2e_get_intpte(gpde));
2903 perfc_incrc(shadow_fault_bail_pte_not_present);
2904 goto fail;
2907 /* Write fault? */
2908 if ( regs->error_code & 2 )
2910 int allow_writes = 0;
2912 if ( unlikely(!(l1e_get_flags(gpte) & _PAGE_RW)) )
2914 if ( shadow_mode_page_writable(va, regs, l1e_get_pfn(gpte)) )
2916 allow_writes = 1;
2917 l1e_add_flags(gpte, _PAGE_RW);
2919 else
2921 /* Write fault on a read-only mapping. */
2922 SH_VVLOG("shadow_fault - EXIT: wr fault on RO page (%" PRIpte ")",
2923 l1e_get_intpte(gpte));
2924 perfc_incrc(shadow_fault_bail_ro_mapping);
2925 goto fail;
2928 else if ( unlikely(!shadow_mode_wr_pt_pte(d) && mfn_is_page_table(l1e_get_pfn(gpte))) )
2930 SH_LOG("l1pte_write_fault: no write access to page table page");
2931 domain_crash_synchronous();
2934 /* User access violation in guest? */
2935 if ( unlikely((regs->error_code & 4) &&
2936 !(l1e_get_flags(gpte) & _PAGE_USER)))
2938 SH_VVLOG("shadow_fault - EXIT: wr fault on super page (%" PRIpte ")",
2939 l1e_get_intpte(gpte));
2940 goto fail;
2944 if ( unlikely(!l1pte_write_fault(v, &gpte, &spte, va)) )
2946 SH_VVLOG("shadow_fault - EXIT: l1pte_write_fault failed");
2947 perfc_incrc(write_fault_bail);
2948 shadow_unlock(d);
2949 return 0;
2952 if ( allow_writes )
2953 l1e_remove_flags(gpte, _PAGE_RW);
2955 else
2957 /* Read-protection violation in guest? */
2958 if ( unlikely((regs->error_code & 1) ))
2960 SH_VVLOG("shadow_fault - EXIT: read fault on super page (%" PRIpte ")",
2961 l1e_get_intpte(gpte));
2962 goto fail;
2967 if ( !l1pte_read_fault(d, &gpte, &spte) )
2969 SH_VVLOG("shadow_fault - EXIT: l1pte_read_fault failed");
2970 perfc_incrc(read_fault_bail);
2971 shadow_unlock(d);
2972 return 0;
2976 /*
2977 * STEP 3. Write the modified shadow PTE and guest PTE back to the tables.
2978 */
2979 if ( l1e_has_changed(orig_gpte, gpte, PAGE_FLAG_MASK) )
2981 /* XXX Watch out for read-only L2 entries! (not used in Linux). */
2982 if ( unlikely(__copy_to_user(&linear_pg_table[l1_linear_offset(va)],
2983 &gpte, sizeof(gpte))) )
2985 printk("%s() failed, crashing domain %d "
2986 "due to a read-only L2 page table (gpde=%" PRIpte "), va=%lx\n",
2987 __func__,d->domain_id, l2e_get_intpte(gpde), va);
2988 domain_crash_synchronous();
2991 __mark_dirty(d, gmfn_to_mfn(d, l2e_get_pfn(gpde)));
2994 shadow_set_l1e(va, spte, 1);
2996 perfc_incrc(shadow_fault_fixed);
2997 d->arch.shadow_fault_count++;
2999 shadow_unlock(d);
3001 check_pagetable(v, "post-sf");
3002 return EXCRET_fault_fixed;
3004 fail:
3005 shadow_unlock(d);
3006 return 0;
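/*
 * Editorial note: shadow_fault() returns EXCRET_fault_fixed when the fault
 * has been absorbed by the shadow code, and 0 when it must be reflected to
 * the guest (or when propagating the PTE failed); the
 * domain_crash_synchronous() paths do not return at all.
 */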
3009 void shadow_l1_normal_pt_update(
3010 struct domain *d,
3011 unsigned long pa, l1_pgentry_t gpte,
3012 struct domain_mmap_cache *cache)
3014 unsigned long sl1mfn;
3015 l1_pgentry_t *spl1e, spte;
3017 shadow_lock(d);
3019 sl1mfn = __shadow_status(current->domain, pa >> PAGE_SHIFT, PGT_l1_shadow);
3020 if ( sl1mfn )
3022 SH_VVLOG("shadow_l1_normal_pt_update pa=%p, gpte=%" PRIpte,
3023 (void *)pa, l1e_get_intpte(gpte));
3024 l1pte_propagate_from_guest(current->domain, gpte, &spte);
3026 spl1e = map_domain_page_with_cache(sl1mfn, cache);
3027 spl1e[(pa & ~PAGE_MASK) / sizeof(l1_pgentry_t)] = spte;
3028 unmap_domain_page_with_cache(spl1e, cache);
3031 shadow_unlock(d);
3034 void shadow_l2_normal_pt_update(
3035 struct domain *d,
3036 unsigned long pa, l2_pgentry_t gpde,
3037 struct domain_mmap_cache *cache)
3039 unsigned long sl2mfn, hl2mfn;
3040 l2_pgentry_t *spl2e;
3041 l1_pgentry_t *hl2e;
3043 shadow_lock(d);
3045 sl2mfn = __shadow_status(current->domain, pa >> PAGE_SHIFT, PGT_l2_shadow);
3046 if ( sl2mfn )
3048 SH_VVLOG("shadow_l2_normal_pt_update pa=%p, gpde=%" PRIpte,
3049 (void *)pa, l2e_get_intpte(gpde));
3050 spl2e = map_domain_page_with_cache(sl2mfn, cache);
3051 validate_pde_change(d, gpde,
3052 &spl2e[(pa & ~PAGE_MASK) / sizeof(l2_pgentry_t)]);
3053 unmap_domain_page_with_cache(spl2e, cache);
3055 hl2mfn = __shadow_status(current->domain, pa >> PAGE_SHIFT,
3056 PGT_hl2_shadow);
3057 if ( hl2mfn )
3059 hl2e = map_domain_page(hl2mfn);
3060 validate_hl2e_change(d, gpde,
3061 &hl2e[(pa & ~PAGE_MASK) / sizeof(l1_pgentry_t)]);
3062 unmap_domain_page(hl2e);
3065 shadow_unlock(d);
3068 #if CONFIG_PAGING_LEVELS >= 3
3069 void shadow_l3_normal_pt_update(
3070 struct domain *d,
3071 unsigned long pa, l3_pgentry_t gpde,
3072 struct domain_mmap_cache *cache)
3074 BUG(); // not yet implemented
3076 #endif
3078 #if CONFIG_PAGING_LEVELS >= 4
3079 void shadow_l4_normal_pt_update(
3080 struct domain *d,
3081 unsigned long pa, l4_pgentry_t gpde,
3082 struct domain_mmap_cache *cache)
3084 BUG(); // not yet implemented
3086 #endif
3088 int shadow_do_update_va_mapping(unsigned long va,
3089 l1_pgentry_t val,
3090 struct vcpu *v)
3092 struct domain *d = v->domain;
3093 l1_pgentry_t spte;
3094 int rc = 0;
3096 shadow_lock(d);
3098 // This is actually overkill - we don't need to sync the L1 itself,
3099 // just everything involved in getting to this L1 (i.e. we need
3100 // linear_pg_table[l1_linear_offset(va)] to be in sync)...
3101 //
3102 __shadow_sync_va(v, va);
3104 l1pte_propagate_from_guest(d, val, &spte);
3105 shadow_set_l1e(va, spte, 0);
3107 /*
3108 * If we're in log-dirty mode then we need to note that we've updated
3109 * the PTE in the PT-holding page. We need the machine frame number
3110 * for this.
3111 */
3112 __mark_dirty(d, va_to_l1mfn(v, va));
3114 shadow_unlock(d);
3116 return rc;
3120 /*
3121 * What lives where in the 32-bit address space in the various shadow modes,
3122 * and what it uses to get/maintain that mapping.
3124  * SHADOW MODE:      none           enable            translate          external
3126  * 4KB things:
3127  * guest_vtable      lin_l2         mapped per gl2    lin_l2 via hl2     mapped per gl2
3128  * shadow_vtable     n/a            sh_lin_l2         sh_lin_l2          mapped per gl2
3129  * hl2_vtable        n/a            n/a               lin_hl2 via hl2    mapped per gl2
3130  * monitor_vtable    n/a            n/a               n/a                mapped once
3132  * 4MB things:
3133  * guest_linear      lin via gl2    lin via gl2       lin via hl2        lin via hl2
3134  * shadow_linear     n/a            sh_lin via sl2    sh_lin via sl2     sh_lin via sl2
3135  * monitor_linear    n/a            n/a               n/a                ???
3136  * perdomain         perdomain      perdomain         perdomain          perdomain
3137  * R/O M2P           R/O M2P        R/O M2P           n/a                n/a
3138  * R/W M2P           R/W M2P        R/W M2P           R/W M2P            R/W M2P
3139  * P2M               n/a            n/a               R/O M2P            R/O M2P
3141 * NB:
3142 * update_pagetables(), __update_pagetables(), shadow_mode_enable(),
3143 * shadow_l2_table(), shadow_hl2_table(), and alloc_monitor_pagetable()
3144 * all play a part in maintaining these mappings.
3145 */
3146 void __update_pagetables(struct vcpu *v)
3148 struct domain *d = v->domain;
3149 unsigned long gmfn = pagetable_get_pfn(v->arch.guest_table);
3150 unsigned long gpfn = mfn_to_gmfn(d, gmfn);
3151 unsigned long smfn, hl2mfn, old_smfn;
3152 int need_sync = 0;
3154 int max_mode = ( shadow_mode_external(d) ? SHM_external
3155 : shadow_mode_translate(d) ? SHM_translate
3156 : shadow_mode_enabled(d) ? SHM_enable
3157 : 0 );
3159 ASSERT( ! IS_INVALID_M2P_ENTRY(gpfn) );
3160 ASSERT( max_mode );
3162 /*
3163 * arch.guest_vtable
3164 */
3165 if ( max_mode & (SHM_enable | SHM_external) )
3167 if ( likely(v->arch.guest_vtable != NULL) )
3168 unmap_domain_page_global(v->arch.guest_vtable);
3169 v->arch.guest_vtable = map_domain_page_global(gmfn);
3172 /*
3173 * arch.shadow_table
3174 */
3175 if ( unlikely(!(smfn = __shadow_status(d, gpfn, PGT_base_page_table))) )
3176 smfn = shadow_l2_table(d, gpfn, gmfn);
3177 else
3179         /*
3180          * Defer the sync until we hold a reference on smfn, so that the
3181          * sync cannot occasionally unshadow this smfn underneath us.
3182          */
3183 need_sync = 1;
3185 if ( !get_shadow_ref(smfn) )
3186 BUG();
3187 old_smfn = pagetable_get_pfn(v->arch.shadow_table);
3188 v->arch.shadow_table = mk_pagetable(smfn << PAGE_SHIFT);
3189 if ( old_smfn )
3190 put_shadow_ref(old_smfn);
3192 SH_VVLOG("__update_pagetables(gmfn=%lx, smfn=%lx)", gmfn, smfn);
3194 /*
3195 * arch.shadow_vtable
3196 */
3197 if ( max_mode == SHM_external )
3199 if ( v->arch.shadow_vtable )
3200 unmap_domain_page_global(v->arch.shadow_vtable);
3201 v->arch.shadow_vtable = map_domain_page_global(smfn);
3204 /*
3205 * arch.hl2_vtable
3206 */
3208 // if max_mode == SHM_translate, then the hl2 is already installed
3209 // correctly in its smfn, and there's nothing to do.
3210 //
3211 if ( max_mode == SHM_external )
3213 if ( unlikely(!(hl2mfn = __shadow_status(d, gpfn, PGT_hl2_shadow))) )
3214 hl2mfn = shadow_hl2_table(d, gpfn, gmfn, smfn);
3215 if ( v->arch.hl2_vtable )
3216 unmap_domain_page_global(v->arch.hl2_vtable);
3217 v->arch.hl2_vtable = map_domain_page_global(hl2mfn);
3220 /*
3221 * fixup pointers in monitor table, as necessary
3222 */
3223 if ( max_mode == SHM_external )
3225 l2_pgentry_t *mpl2e = v->arch.monitor_vtable;
3226 l2_pgentry_t old_hl2e = mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)];
3227 l2_pgentry_t old_sl2e = mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)];
3229 ASSERT( shadow_mode_translate(d) );
3231 if ( !get_shadow_ref(hl2mfn) )
3232 BUG();
3233 mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)] =
3234 l2e_from_pfn(hl2mfn, __PAGE_HYPERVISOR);
3235 if ( l2e_get_flags(old_hl2e) & _PAGE_PRESENT )
3236 put_shadow_ref(l2e_get_pfn(old_hl2e));
3238 if ( !get_shadow_ref(smfn) )
3239 BUG();
3240 mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
3241 l2e_from_pfn(smfn, __PAGE_HYPERVISOR);
3242 if ( l2e_get_flags(old_sl2e) & _PAGE_PRESENT )
3243 put_shadow_ref(l2e_get_pfn(old_sl2e));
3245 // XXX - maybe this can be optimized somewhat??
3246 local_flush_tlb();
3249 if(likely(need_sync))
3250 shadow_sync_all(d);
3253 void clear_all_shadow_status(struct domain *d)
3255 shadow_lock(d);
3256 free_shadow_pages(d);
3257 free_shadow_ht_entries(d);
3258 d->arch.shadow_ht =
3259 xmalloc_array(struct shadow_status, shadow_ht_buckets);
3260 if ( d->arch.shadow_ht == NULL ) {
3261         printk("clear_all_shadow_status: xmalloc failed\n");
3262 domain_crash_synchronous();
3264 memset(d->arch.shadow_ht, 0,
3265 shadow_ht_buckets * sizeof(struct shadow_status));
3267 free_out_of_sync_entries(d);
3268 shadow_unlock(d);
3271 /************************************************************************/
3272 /************************************************************************/
3273 /************************************************************************/
3275 #if SHADOW_DEBUG
3277 // The following is entirely for _check_pagetable()'s benefit.
3278 // _check_pagetable() wants to know whether a given entry in a
3279 // shadow page table is supposed to be the shadow of the guest's
3280 // current entry, or the shadow of the entry held in the snapshot
3281 // taken above.
3282 //
3283 // Here, we mark all currently existing entries as reflecting
3284 // the snapshot, above. All other places in xen that update
3285 // the shadow will keep the shadow in sync with the guest's
3286 // entries (via l1pte_propagate_from_guest and friends), which clear
3287 // the SHADOW_REFLECTS_SNAPSHOT bit.
3288 //
3289 static void
3290 mark_shadows_as_reflecting_snapshot(struct domain *d, unsigned long gpfn)
3292 unsigned long smfn;
3293 l1_pgentry_t *l1e;
3294 l2_pgentry_t *l2e;
3295 unsigned i;
3297 if ( (smfn = __shadow_status(d, gpfn, PGT_l1_shadow)) )
3299 l1e = map_domain_page(smfn);
3300 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
3301 if ( is_guest_l1_slot(i) &&
3302 (l1e_get_flags(l1e[i]) & _PAGE_PRESENT) )
3303 l1e_add_flags(l1e[i], SHADOW_REFLECTS_SNAPSHOT);
3304 unmap_domain_page(l1e);
3307 if ( (smfn = __shadow_status(d, gpfn, PGT_l2_shadow)) )
3309 l2e = map_domain_page(smfn);
3310 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
3311 if ( is_guest_l2_slot(0, i) &&
3312 (l2e_get_flags(l2e[i]) & _PAGE_PRESENT) )
3313 l2e_add_flags(l2e[i], SHADOW_REFLECTS_SNAPSHOT);
3314 unmap_domain_page(l2e);
3318 // BUG: these are not SMP safe...
3319 static int sh_l2_present;
3320 static int sh_l1_present;
3321 static char *sh_check_name;
3322 int shadow_status_noswap;
3324 #define v2m(_v, _adr) ({ \
3325 unsigned long _a = (unsigned long)(_adr); \
3326 l2_pgentry_t _pde = shadow_linear_l2_table(_v)[l2_table_offset(_a)]; \
3327 unsigned long _pa = -1; \
3328 if ( l2e_get_flags(_pde) & _PAGE_PRESENT ) \
3329 { \
3330 l1_pgentry_t _pte; \
3331 _pte = shadow_linear_pg_table[l1_linear_offset(_a)]; \
3332 if ( l1e_get_flags(_pte) & _PAGE_PRESENT ) \
3333 _pa = l1e_get_paddr(_pte); \
3334 } \
3335 _pa | (_a & ~PAGE_MASK); \
3336 })
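/*
 * Editorial note: v2m() is a debugging aid that resolves a virtual address
 * through the shadow linear page tables purely so that FAIL() can print
 * machine addresses; if either level is not present, _pa stays -1 and the
 * macro yields an all-ones value.
 */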
3338 #define FAIL(_f, _a...) \
3339 do { \
3340 printk("XXX %s-FAIL (%d,%d,%d) " _f " at %s(%d)\n", \
3341 sh_check_name, level, l2_idx, l1_idx, ## _a, \
3342 __FILE__, __LINE__); \
3343 printk("guest_pte=%" PRIpte " eff_guest_pte=%" PRIpte \
3344 " shadow_pte=%" PRIpte " snapshot_pte=%" PRIpte \
3345 " &guest=%p &shadow=%p &snap=%p v2m(&guest)=%p" \
3346 " v2m(&shadow)=%p v2m(&snap)=%p ea=%08x\n", \
3347 l1e_get_intpte(guest_pte), l1e_get_intpte(eff_guest_pte), \
3348 l1e_get_intpte(shadow_pte), l1e_get_intpte(snapshot_pte), \
3349 p_guest_pte, p_shadow_pte, p_snapshot_pte, \
3350 (void *)v2m(v, p_guest_pte), (void *)v2m(v, p_shadow_pte), \
3351 (void *)v2m(v, p_snapshot_pte), \
3352 (l2_idx << L2_PAGETABLE_SHIFT) | \
3353 (l1_idx << L1_PAGETABLE_SHIFT)); \
3354 errors++; \
3355 } while ( 0 )
3357 static int check_pte(
3358 struct vcpu *v,
3359 l1_pgentry_t *p_guest_pte,
3360 l1_pgentry_t *p_shadow_pte,
3361 l1_pgentry_t *p_snapshot_pte,
3362 int level, int l2_idx, int l1_idx)
3364 struct domain *d = v->domain;
3365 l1_pgentry_t guest_pte = *p_guest_pte;
3366 l1_pgentry_t shadow_pte = *p_shadow_pte;
3367 l1_pgentry_t snapshot_pte = p_snapshot_pte ? *p_snapshot_pte : l1e_empty();
3368 l1_pgentry_t eff_guest_pte = l1e_empty();
3369 unsigned long mask, eff_guest_pfn, eff_guest_mfn, shadow_mfn;
3370 int errors = 0, guest_writable;
3371 int page_table_page;
3373 if ( (l1e_get_intpte(shadow_pte) == 0) ||
3374 (l1e_get_intpte(shadow_pte) == 0xdeadface) ||
3375 (l1e_get_intpte(shadow_pte) == 0x00000E00) )
3376 return errors; /* always safe */
3378 if ( !(l1e_get_flags(shadow_pte) & _PAGE_PRESENT) )
3379 FAIL("Non zero not present shadow_pte");
3381 if ( level == 2 ) sh_l2_present++;
3382 if ( level == 1 ) sh_l1_present++;
3384 if ( (l1e_get_flags(shadow_pte) & SHADOW_REFLECTS_SNAPSHOT) && p_snapshot_pte )
3385 eff_guest_pte = snapshot_pte;
3386 else
3387 eff_guest_pte = guest_pte;
3389 if ( !(l1e_get_flags(eff_guest_pte) & _PAGE_PRESENT) )
3390 FAIL("Guest not present yet shadow is");
3392 mask = ~(_PAGE_GLOBAL|_PAGE_DIRTY|_PAGE_ACCESSED|_PAGE_RW|_PAGE_AVAIL|PAGE_MASK);
3394 if ( ((l1e_get_intpte(shadow_pte) & mask) != (l1e_get_intpte(eff_guest_pte) & mask)) )
3395 FAIL("Corrupt?");
3397 if ( (level == 1) &&
3398 (l1e_get_flags(shadow_pte) & _PAGE_DIRTY) &&
3399 !(l1e_get_flags(eff_guest_pte) & _PAGE_DIRTY) )
3400 FAIL("Dirty coherence");
3402 if ( (l1e_get_flags(shadow_pte) & _PAGE_ACCESSED) &&
3403 !(l1e_get_flags(eff_guest_pte) & _PAGE_ACCESSED) )
3404 FAIL("Accessed coherence");
3406 if ( l1e_get_flags(shadow_pte) & _PAGE_GLOBAL )
3407 FAIL("global bit set in shadow");
3409 eff_guest_pfn = l1e_get_pfn(eff_guest_pte);
3410 eff_guest_mfn = gmfn_to_mfn(d, eff_guest_pfn);
3411 shadow_mfn = l1e_get_pfn(shadow_pte);
3413 if ( !VALID_MFN(eff_guest_mfn) && !shadow_mode_refcounts(d) )
3414 FAIL("%s: invalid eff_guest_pfn=%lx eff_guest_pte=%" PRIpte "\n",
3415 __func__, eff_guest_pfn, l1e_get_intpte(eff_guest_pte));
3417 page_table_page = mfn_is_page_table(eff_guest_mfn);
3419 guest_writable =
3420 (l1e_get_flags(eff_guest_pte) & _PAGE_RW) ||
3421 (shadow_mode_write_l1(d) && (level == 1) && mfn_out_of_sync(eff_guest_mfn));
3423 if ( (l1e_get_flags(shadow_pte) & _PAGE_RW ) && !guest_writable )
3425 printk("eff_guest_pfn=%lx eff_guest_mfn=%lx shadow_mfn=%lx t=%lx page_table_page=%d\n",
3426 eff_guest_pfn, eff_guest_mfn, shadow_mfn,
3427 mfn_to_page(eff_guest_mfn)->u.inuse.type_info,
3428 page_table_page);
3429 FAIL("RW coherence");
3432 if ( (level == 1) &&
3433 (l1e_get_flags(shadow_pte) & _PAGE_RW ) &&
3434 !(guest_writable && (l1e_get_flags(eff_guest_pte) & _PAGE_DIRTY)) )
3436 printk("eff_guest_pfn=%lx eff_guest_mfn=%lx shadow_mfn=%lx t=%lx page_table_page=%d\n",
3437 eff_guest_pfn, eff_guest_mfn, shadow_mfn,
3438 mfn_to_page(eff_guest_mfn)->u.inuse.type_info,
3439 page_table_page);
3440 FAIL("RW2 coherence");
3443 if ( eff_guest_mfn == shadow_mfn )
3445 if ( level > 1 )
3446 FAIL("Linear map ???"); /* XXX this will fail on BSD */
3448 else
3450 if ( level < 2 )
3451 FAIL("Shadow in L1 entry?");
3453 if ( level == 2 )
3455 if ( __shadow_status(d, eff_guest_pfn, PGT_l1_shadow) != shadow_mfn )
3456 FAIL("shadow_mfn problem eff_guest_pfn=%lx shadow_mfn=%lx", eff_guest_pfn,
3457 __shadow_status(d, eff_guest_pfn, PGT_l1_shadow));
3459 else
3460 BUG(); // XXX -- not handled yet.
3463 return errors;
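/*
 * Editorial note: check_pte() audits a single shadow entry against the guest
 * entry it is supposed to reflect -- or against the snapshot entry when
 * SHADOW_REFLECTS_SNAPSHOT is set -- checking presence, flag agreement,
 * dirty/accessed coherence, writability, and that the shadow frame really is
 * the shadow recorded for the guest frame.
 */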
3465 #undef FAIL
3466 #undef v2m
3468 static int check_l1_table(
3469 struct vcpu *v, unsigned long gpfn,
3470 unsigned long gmfn, unsigned long smfn, unsigned l2_idx)
3472 struct domain *d = v->domain;
3473 int i;
3474 unsigned long snapshot_mfn;
3475 l1_pgentry_t *p_guest, *p_shadow, *p_snapshot = NULL;
3476 int errors = 0;
3478 if ( page_out_of_sync(mfn_to_page(gmfn)) )
3480 snapshot_mfn = __shadow_status(d, gpfn, PGT_snapshot);
3481 ASSERT(snapshot_mfn);
3482 p_snapshot = map_domain_page(snapshot_mfn);
3485 p_guest = map_domain_page(gmfn);
3486 p_shadow = map_domain_page(smfn);
3488 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
3489 errors += check_pte(v, p_guest+i, p_shadow+i,
3490 p_snapshot ? p_snapshot+i : NULL,
3491 1, l2_idx, i);
3493 unmap_domain_page(p_shadow);
3494 unmap_domain_page(p_guest);
3495 if ( p_snapshot )
3496 unmap_domain_page(p_snapshot);
3498 return errors;
3501 #define FAILPT(_f, _a...) \
3502 do { \
3503 printk("XXX FAIL %s-PT " _f "\n", sh_check_name, ## _a ); \
3504 errors++; \
3505 } while ( 0 )
3507 int check_l2_table(
3508 struct vcpu *v, unsigned long gmfn, unsigned long smfn, int oos_pdes)
3510 struct domain *d = v->domain;
3511 l2_pgentry_t *gpl2e = (l2_pgentry_t *)map_domain_page(gmfn);
3512 l2_pgentry_t *spl2e = (l2_pgentry_t *)map_domain_page(smfn);
3513 l2_pgentry_t match;
3514 int i;
3515 int errors = 0;
3516 int limit;
3518 if ( !oos_pdes && (page_get_owner(mfn_to_page(gmfn)) != d) )
3519 FAILPT("domain doesn't own page");
3520 if ( oos_pdes && (page_get_owner(mfn_to_page(gmfn)) != NULL) )
3521 FAILPT("bogus owner for snapshot page");
3522 if ( page_get_owner(mfn_to_page(smfn)) != NULL )
3523 FAILPT("shadow page mfn=0x%lx is owned by someone, domid=%d",
3524 smfn, page_get_owner(mfn_to_page(smfn))->domain_id);
3526 #if 0
3527 if ( memcmp(&spl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
3528 &gpl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
3529 ((SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT) -
3530 DOMAIN_ENTRIES_PER_L2_PAGETABLE) * sizeof(l2_pgentry_t)) )
3532 for ( i = DOMAIN_ENTRIES_PER_L2_PAGETABLE;
3533 i < (SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT);
3534 i++ )
3535 printk("+++ (%d) %lx %lx\n",i,
3536 l2_pgentry_val(gpl2e[i]), l2_pgentry_val(spl2e[i]));
3537 FAILPT("hypervisor entries inconsistent");
3540 if ( (l2_pgentry_val(spl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT]) !=
3541 l2_pgentry_val(gpl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT])) )
3542 FAILPT("hypervisor linear map inconsistent");
3543 #endif
3545 match = l2e_from_pfn(smfn, __PAGE_HYPERVISOR);
3546 if ( !shadow_mode_external(d) &&
3547 l2e_has_changed(spl2e[SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT],
3548 match, PAGE_FLAG_MASK))
3550 FAILPT("hypervisor shadow linear map inconsistent %" PRIpte " %" PRIpte,
3551 l2e_get_intpte(spl2e[SH_LINEAR_PT_VIRT_START >>
3552 L2_PAGETABLE_SHIFT]),
3553 l2e_get_intpte(match));
3556 match = l2e_from_paddr(__pa(d->arch.mm_perdomain_pt), __PAGE_HYPERVISOR);
3557 if ( !shadow_mode_external(d) &&
3558 l2e_has_changed(spl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT],
3559 match, PAGE_FLAG_MASK))
3561 FAILPT("hypervisor per-domain map inconsistent saw %" PRIpte ", expected (va=%p) %" PRIpte,
3562 l2e_get_intpte(spl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT]),
3563 d->arch.mm_perdomain_pt,
3564 l2e_get_intpte(match));
3567 if ( shadow_mode_external(d) )
3568 limit = L2_PAGETABLE_ENTRIES;
3569 else
3570 limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE;
3572 /* Check the whole L2. */
3573 for ( i = 0; i < limit; i++ )
3574 errors += check_pte(v,
3575 (l1_pgentry_t*)(&gpl2e[i]), /* Hmm, dirty ... */
3576 (l1_pgentry_t*)(&spl2e[i]),
3577 NULL,
3578 2, i, 0);
3580 unmap_domain_page(spl2e);
3581 unmap_domain_page(gpl2e);
3583 #if 1
3584 if ( errors )
3585 printk("check_l2_table returning %d errors\n", errors);
3586 #endif
3588 return errors;
3590 #undef FAILPT
3592 int _check_pagetable(struct vcpu *v, char *s)
3594 struct domain *d = v->domain;
3595 pagetable_t pt = v->arch.guest_table;
3596 unsigned long gptbase = pagetable_get_paddr(pt);
3597 unsigned long ptbase_pfn, smfn;
3598 unsigned long i;
3599 l2_pgentry_t *gpl2e, *spl2e;
3600 unsigned long ptbase_mfn = 0;
3601 int errors = 0, limit, oos_pdes = 0;
3603 //_audit_domain(d, AUDIT_QUIET);
3604 shadow_lock(d);
3606 sh_check_name = s;
3607 //SH_VVLOG("%s-PT Audit", s);
3608 sh_l2_present = sh_l1_present = 0;
3609 perfc_incrc(check_pagetable);
3611 ptbase_mfn = gptbase >> PAGE_SHIFT;
3612 ptbase_pfn = mfn_to_gmfn(d, ptbase_mfn);
3614 if ( !(smfn = __shadow_status(d, ptbase_pfn, PGT_base_page_table)) )
3616 printk("%s-PT %lx not shadowed\n", s, gptbase);
3617 goto out;
3619 if ( page_out_of_sync(mfn_to_page(ptbase_mfn)) )
3621 ptbase_mfn = __shadow_status(d, ptbase_pfn, PGT_snapshot);
3622 oos_pdes = 1;
3623 ASSERT(ptbase_mfn);
3626 errors += check_l2_table(v, ptbase_mfn, smfn, oos_pdes);
3628 gpl2e = (l2_pgentry_t *) map_domain_page(ptbase_mfn);
3629 spl2e = (l2_pgentry_t *) map_domain_page(smfn);
3631 /* Go back and recurse. */
3632 if ( shadow_mode_external(d) )
3633 limit = L2_PAGETABLE_ENTRIES;
3634 else
3635 limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE;
3637 for ( i = 0; i < limit; i++ )
3639 unsigned long gl1pfn = l2e_get_pfn(gpl2e[i]);
3640 unsigned long gl1mfn = gmfn_to_mfn(d, gl1pfn);
3641 unsigned long sl1mfn = l2e_get_pfn(spl2e[i]);
3643 if ( l2e_get_intpte(spl2e[i]) != 0 ) /* FIXME: check flags? */
3645 errors += check_l1_table(v, gl1pfn, gl1mfn, sl1mfn, i);
3649 unmap_domain_page(spl2e);
3650 unmap_domain_page(gpl2e);
3652 out:
3653 if ( errors )
3654 BUG();
3656 shadow_unlock(d);
3658 return errors;
3661 int _check_all_pagetables(struct vcpu *v, char *s)
3663 struct domain *d = v->domain;
3664 int i;
3665 struct shadow_status *a;
3666 unsigned long gmfn;
3667 int errors = 0;
3669 shadow_status_noswap = 1;
3671 sh_check_name = s;
3672 SH_VVLOG("%s-PT Audit domid=%d", s, d->domain_id);
3673 sh_l2_present = sh_l1_present = 0;
3674 perfc_incrc(check_all_pagetables);
3676 for (i = 0; i < shadow_ht_buckets; i++)
3678 a = &d->arch.shadow_ht[i];
3679 while ( a && a->gpfn_and_flags )
3681 gmfn = gmfn_to_mfn(d, a->gpfn_and_flags & PGT_mfn_mask);
3683 switch ( a->gpfn_and_flags & PGT_type_mask )
3685 case PGT_l1_shadow:
3686 errors += check_l1_table(v, a->gpfn_and_flags & PGT_mfn_mask,
3687 gmfn, a->smfn, 0);
3688 break;
3689 case PGT_l2_shadow:
3690 errors += check_l2_table(v, gmfn, a->smfn,
3691 page_out_of_sync(mfn_to_page(gmfn)));
3692 break;
3693 case PGT_l3_shadow:
3694 case PGT_l4_shadow:
3695 case PGT_hl2_shadow:
3696 BUG(); // XXX - ought to fix this...
3697 break;
3698 case PGT_snapshot:
3699 case PGT_writable_pred:
3700 break;
3701 default:
3702 errors++;
3703 printk("unexpected shadow type %lx, gpfn=%lx, "
3704 "gmfn=%lx smfn=%lx\n",
3705 a->gpfn_and_flags & PGT_type_mask,
3706 a->gpfn_and_flags & PGT_mfn_mask,
3707 gmfn, a->smfn);
3708 BUG();
3710 a = a->next;
3714 shadow_status_noswap = 0;
3716 if ( errors )
3717 BUG();
3719 return errors;
3722 #endif // SHADOW_DEBUG
3724 /*
3725 * Local variables:
3726 * mode: C
3727 * c-set-style: "BSD"
3728 * c-basic-offset: 4
3729 * tab-width: 4
3730 * indent-tabs-mode: nil
3731 * End:
3732 */