ia64/xen-unstable

view xen/arch/x86/shadow.c @ 4279:3fe0f99cb576

bitkeeper revision 1.1259 (4241a247utqWSkcNHjYx45_xeNZjqg)

Unshadow any L2 that, when updated, has no valid entries left...

Signed-off-by: michael.fetterman@cl.cam.ac.uk
author mafetter@fleming.research
date Wed Mar 23 17:07:19 2005 +0000 (2005-03-23)
parents 0c149b605692
children b40fc0992e25
line source
1 /******************************************************************************
2 * arch/x86/shadow.c
3 *
4 * Copyright (c) 2005 Michael A Fetterman
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 */
22 #include <xen/config.h>
23 #include <xen/types.h>
24 #include <xen/mm.h>
25 #include <asm/shadow.h>
26 #include <asm/domain_page.h>
27 #include <asm/page.h>
28 #include <xen/event.h>
29 #include <xen/trace.h>
31 static void shadow_free_snapshot(struct domain *d,
32 struct out_of_sync_entry *entry);
33 static void remove_out_of_sync_entries(struct domain *d, unsigned long smfn);
35 /********
37 There's a per-domain shadow table spin lock which works fine for SMP
38 hosts. We don't have to worry about interrupts as no shadow operations
39 happen in an interrupt context. It's probably not quite ready for SMP
40 guest operation as we have to worry about synchronisation between gpte
41 and spte updates. It's possible that this might only happen in a
42 hypercall context, in which case we'll probably have a per-domain
43 hypercall lock anyhow (at least initially).
45 ********/
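/*
 * Usage sketch (mirrors shadow_mode_enable() below): callers bracket
 * shadow operations with the per-domain lock, e.g.
 *
 *     shadow_lock(d);
 *     rc = __shadow_mode_enable(d, mode);
 *     shadow_unlock(d);
 */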
47 static inline int
48 shadow_promote(struct domain *d, unsigned long gpfn, unsigned long gmfn,
49 unsigned long new_type)
50 {
51 struct pfn_info *page = pfn_to_page(gmfn);
52 int pinned = 0, okay = 1;
54 if ( page_out_of_sync(page) )
55 {
56 // Don't know how long ago this snapshot was taken.
57 // Can't trust it to be recent enough.
58 //
59 __shadow_sync_mfn(d, gmfn);
60 }
62 if ( unlikely(page_is_page_table(page)) )
63 return 1;
65 FSH_LOG("%s: gpfn=%p gmfn=%p nt=%p", __func__, gpfn, gmfn, new_type);
67 if ( !shadow_remove_all_write_access(d, gpfn, gmfn) )
68 {
69 FSH_LOG("%s: couldn't find/remove all write accesses, gpfn=%p gmfn=%p\n",
70 __func__, gpfn, gmfn);
71 return 0;
72 }
74 // To convert this page for use as a page table, the writable count
75 // should now be zero. Test this by grabbing the page as a page table,
76 // and then immediately releasing. This will also deal with any
77 // necessary TLB flushing issues for us.
78 //
79 // The cruft here about pinning doesn't really work right. This
80 // needs rethinking/rewriting... Need to gracefully deal with the
81 // TLB flushes required when promoting a writable page, and also deal
82 // with any outstanding (external) writable refs to this page (by
83 // refusing to promote it). The pinning headache complicates this
84 // code -- it would all get much simpler if we stopped using
85 // shadow_lock() and moved the shadow code to BIGLOCK().
86 //
87 if ( unlikely(!get_page(page, d)) )
88 BUG(); // XXX -- needs more thought for a graceful failure
89 if ( unlikely(test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info)) )
90 {
91 pinned = 1;
92 put_page_and_type(page);
93 }
94 if ( get_page_type(page, PGT_base_page_table) )
95 {
96 set_bit(_PGC_page_table, &page->count_info);
97 put_page_type(page);
98 }
99 else
100 {
101 printk("shadow_promote: get_page_type failed "
102 "dom%d gpfn=%p gmfn=%p t=%x\n",
103 d->id, gpfn, gmfn, new_type);
104 okay = 0;
105 }
107 // Now put the type back to writable...
108 if ( unlikely(!get_page_type(page, PGT_writable_page)) )
109 BUG(); // XXX -- needs more thought for a graceful failure
110 if ( unlikely(pinned) )
111 {
112 if ( unlikely(test_and_set_bit(_PGT_pinned,
113 &page->u.inuse.type_info)) )
114 BUG(); // hmm... someone pinned this again?
115 }
116 else
117 put_page_and_type(page);
119 return okay;
120 }
122 static inline void
123 shadow_demote(struct domain *d, unsigned long gpfn, unsigned long gmfn)
124 {
125 ASSERT(frame_table[gmfn].count_info & PGC_page_table);
127 if ( shadow_max_pgtable_type(d, gpfn) == PGT_none )
128 {
129 clear_bit(_PGC_page_table, &frame_table[gmfn].count_info);
131 if ( page_out_of_sync(pfn_to_page(gmfn)) )
132 {
133 remove_out_of_sync_entries(d, gmfn);
134 }
135 }
136 }
138 /*
139 * Things in shadow mode that collect get_page() refs to the domain's
140 * pages are:
141 * - PGC_allocated takes a gen count, just like normal.
142 * - A writable page can be pinned (paravirtualized guests may consider
143 * these pages to be L1s or L2s, and don't know the difference).
144 * Pinning a page takes a gen count (but, for domains in shadow mode,
145 * it *doesn't* take a type count)
146 * - CR3 grabs a ref to whatever it points at, just like normal.
147 * - Shadow mode grabs an initial gen count for itself, as a placeholder
148 * for whatever references will exist.
149 * - Shadow PTEs that point to a page take a gen count, just like regular
150 * PTEs. However, they don't get a type count, as get_page_type() is
151 * hardwired to keep writable pages' counts at 1 for domains in shadow
152 * mode.
153 * - Whenever we shadow a page, the entry in the shadow hash grabs a
154 * general ref to the page.
155 * - Whenever a page goes out of sync, the out of sync entry grabs a
156 * general ref to the page.
157 */
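/*
 * Rough tally for a typical guest page-table page under these rules
 * (illustrative, not exhaustive): one gen count for PGC_allocated, one
 * if the guest has pinned it, one per shadow PTE currently mapping it,
 * one for its entry in the shadow hash once it is shadowed, one more
 * while it is out of sync, and one if a CR3 points at it.
 */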
158 /*
159 * pfn_info fields for pages allocated as shadow pages:
160 *
161 * All 32 bits of count_info are a simple count of refs to this shadow
162 * from a) other shadow pages, b) current CR3's (aka ed->arch.shadow_table),
163 * c) if it's a pinned shadow root pgtable, d) outstanding out-of-sync
164 * references.
165 *
166 * u.inuse._domain is left NULL, to prevent accidentally allowing some random
167 * domain from gaining permission to map this page.
168 *
169 * u.inuse.type_info & PGT_type_mask remembers what kind of page is being
170 * shadowed.
171 * u.inuse.type_info & PGT_mfn_mask holds the mfn of the page being shadowed.
172 * u.inuse.type_info & PGT_pinned says that an extra reference to this shadow
173 * currently exists because this is a shadow of a root page, and we
174 * don't want to let it disappear just because no CR3 is currently pointing
175 * at it.
176 *
177 * tlbflush_timestamp holds a min & max index of valid page table entries
178 * within the shadow page.
179 */
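/*
 * (The min/max encoding is produced by SHADOW_ENCODE_MIN_MAX() in
 * shadow_map_l1_into_current_l2() below, and consumed via SHADOW_MIN()
 * and SHADOW_MAX() in resync_all() to bound the range of L1 entries
 * that need to be re-checked.)
 */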
181 static inline unsigned long
182 alloc_shadow_page(struct domain *d,
183 unsigned long gpfn, unsigned long gmfn,
184 u32 psh_type)
185 {
186 struct pfn_info *page;
187 unsigned long smfn;
188 int pin = 0;
190 page = alloc_domheap_page(NULL);
191 if ( unlikely(page == NULL) )
192 {
193 printk("Couldn't alloc shadow page! dom%d count=%d\n",
194 d->id, d->arch.shadow_page_count);
195 printk("Shadow table counts: l1=%d l2=%d hl2=%d snapshot=%d\n",
196 perfc_value(shadow_l1_pages),
197 perfc_value(shadow_l2_pages),
198 perfc_value(hl2_table_pages),
199 perfc_value(snapshot_pages));
200 BUG(); /* XXX FIXME: try a shadow flush to free up some memory. */
201 }
203 smfn = page_to_pfn(page);
205 ASSERT( (gmfn & ~PGT_mfn_mask) == 0 );
206 page->u.inuse.type_info = psh_type | gmfn;
207 page->count_info = 0;
208 page->tlbflush_timestamp = 0;
210 switch ( psh_type )
211 {
212 case PGT_l1_shadow:
213 if ( !shadow_promote(d, gpfn, gmfn, psh_type) )
214 goto fail;
215 perfc_incr(shadow_l1_pages);
216 d->arch.shadow_page_count++;
217 break;
219 case PGT_l2_shadow:
220 if ( !shadow_promote(d, gpfn, gmfn, psh_type) )
221 goto fail;
222 perfc_incr(shadow_l2_pages);
223 d->arch.shadow_page_count++;
224 if ( PGT_l2_page_table == PGT_root_page_table )
225 pin = 1;
227 break;
229 case PGT_hl2_shadow:
230 // Treat an hl2 as an L1 for purposes of promotion.
231 // For external mode domains, treat them as an L2 for purposes of
232 // pinning.
233 //
234 if ( !shadow_promote(d, gpfn, gmfn, PGT_l1_shadow) )
235 goto fail;
236 perfc_incr(hl2_table_pages);
237 d->arch.hl2_page_count++;
238 if ( shadow_mode_external(d) &&
239 (PGT_l2_page_table == PGT_root_page_table) )
240 pin = 1;
242 break;
244 case PGT_snapshot:
245 perfc_incr(snapshot_pages);
246 d->arch.snapshot_page_count++;
247 break;
249 default:
250 printk("Alloc shadow weird page type type=%08x\n", psh_type);
251 BUG();
252 break;
253 }
255 set_shadow_status(d, gpfn, gmfn, smfn, psh_type);
257 if ( pin )
258 shadow_pin(smfn);
260 return smfn;
262 fail:
263 FSH_LOG("promotion of pfn=%p mfn=%p failed! external gnttab refs?\n",
264 gpfn, gmfn);
265 free_domheap_page(page);
266 return 0;
267 }
269 static void inline
270 free_shadow_l1_table(struct domain *d, unsigned long smfn)
271 {
272 l1_pgentry_t *pl1e = map_domain_mem(smfn << PAGE_SHIFT);
273 int i;
275 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
276 put_page_from_l1e(pl1e[i], d);
278 unmap_domain_mem(pl1e);
279 }
281 static void inline
282 free_shadow_hl2_table(struct domain *d, unsigned long smfn)
283 {
284 l1_pgentry_t *hl2 = map_domain_mem(smfn << PAGE_SHIFT);
285 int i, limit;
287 if ( shadow_mode_external(d) )
288 limit = L2_PAGETABLE_ENTRIES;
289 else
290 limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE;
292 for ( i = 0; i < limit; i++ )
293 {
294 unsigned long hl2e = l1_pgentry_val(hl2[i]);
295 if ( hl2e & _PAGE_PRESENT )
296 put_page(pfn_to_page(hl2e >> PAGE_SHIFT));
297 }
299 unmap_domain_mem(hl2);
300 }
302 static void inline
303 free_shadow_l2_table(struct domain *d, unsigned long smfn)
304 {
305 unsigned long *pl2e = map_domain_mem(smfn << PAGE_SHIFT);
306 int i, external = shadow_mode_external(d);
308 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
309 if ( external || is_guest_l2_slot(i) )
310 if ( pl2e[i] & _PAGE_PRESENT )
311 put_shadow_ref(pl2e[i] >> PAGE_SHIFT);
313 if ( (PGT_base_page_table == PGT_l2_page_table) &&
314 shadow_mode_translate(d) &&
315 !shadow_mode_external(d) )
316 {
317 // free the ref to the hl2
318 //
319 put_shadow_ref(pl2e[l2_table_offset(LINEAR_PT_VIRT_START)]
320 >> PAGE_SHIFT);
321 }
323 unmap_domain_mem(pl2e);
324 }
326 void free_shadow_page(unsigned long smfn)
327 {
328 struct pfn_info *page = &frame_table[smfn];
329 unsigned long gmfn = page->u.inuse.type_info & PGT_mfn_mask;
330 struct domain *d = page_get_owner(pfn_to_page(gmfn));
331 unsigned long gpfn = __mfn_to_gpfn(d, gmfn);
332 unsigned long type = page->u.inuse.type_info & PGT_type_mask;
334 ASSERT( ! IS_INVALID_M2P_ENTRY(gpfn) );
336 delete_shadow_status(d, gpfn, gmfn, type);
338 switch ( type )
339 {
340 case PGT_l1_shadow:
341 perfc_decr(shadow_l1_pages);
342 shadow_demote(d, gpfn, gmfn);
343 free_shadow_l1_table(d, smfn);
344 break;
346 case PGT_l2_shadow:
347 perfc_decr(shadow_l2_pages);
348 shadow_demote(d, gpfn, gmfn);
349 free_shadow_l2_table(d, smfn);
350 break;
352 case PGT_hl2_shadow:
353 perfc_decr(hl2_table_pages);
354 shadow_demote(d, gpfn, gmfn);
355 free_shadow_hl2_table(d, smfn);
356 break;
358 case PGT_snapshot:
359 perfc_decr(snapshot_pages);
360 break;
362 default:
363 printk("Free shadow weird page type mfn=%08x type=%08x\n",
364 page-frame_table, page->u.inuse.type_info);
365 break;
366 }
368 d->arch.shadow_page_count--;
370 // No TLB flushes are needed the next time this page gets allocated.
371 //
372 page->tlbflush_timestamp = 0;
373 page->u.free.cpu_mask = 0;
375 free_domheap_page(page);
376 }
378 static void inline
379 release_out_of_sync_entry(struct domain *d, struct out_of_sync_entry *entry)
380 {
381 struct pfn_info *page;
383 page = &frame_table[entry->gmfn];
385 // Decrement ref count of guest & shadow pages
386 //
387 put_page(page);
389 // Only use entries that have low bits clear...
390 //
391 if ( !(entry->writable_pl1e & (sizeof(l1_pgentry_t)-1)) )
392 {
393 put_shadow_ref(entry->writable_pl1e >> PAGE_SHIFT);
394 entry->writable_pl1e = -2;
395 }
396 else
397 ASSERT( entry->writable_pl1e == -1 );
399 // Free the snapshot
400 //
401 shadow_free_snapshot(d, entry);
402 }
404 static void remove_out_of_sync_entries(struct domain *d, unsigned long gmfn)
405 {
406 struct out_of_sync_entry *entry = d->arch.out_of_sync;
407 struct out_of_sync_entry **prev = &d->arch.out_of_sync;
408 struct out_of_sync_entry *found = NULL;
410 // NB: Be careful not to call something that manipulates this list
411 // while walking it. Collect the results into a separate list
412 // first, then walk that list.
413 //
414 while ( entry )
415 {
416 if ( entry->gmfn == gmfn )
417 {
418 // remove from out of sync list
419 *prev = entry->next;
421 // add to found list
422 entry->next = found;
423 found = entry;
425 entry = *prev;
426 continue;
427 }
428 prev = &entry->next;
429 entry = entry->next;
430 }
432 prev = NULL;
433 entry = found;
434 while ( entry )
435 {
436 release_out_of_sync_entry(d, entry);
438 prev = &entry->next;
439 entry = entry->next;
440 }
442 // Add found list to free list
443 if ( prev )
444 {
445 *prev = d->arch.out_of_sync_free;
446 d->arch.out_of_sync_free = found;
447 }
448 }
450 static void free_out_of_sync_state(struct domain *d)
451 {
452 struct out_of_sync_entry *entry;
454 // NB: Be careful not to call something that manipulates this list
455 // while walking it. Remove one item at a time, and always
456 // restart from start of list.
457 //
458 while ( (entry = d->arch.out_of_sync) )
459 {
460 d->arch.out_of_sync = entry->next;
461 release_out_of_sync_entry(d, entry);
463 entry->next = d->arch.out_of_sync_free;
464 d->arch.out_of_sync_free = entry;
465 }
466 }
468 static void free_shadow_pages(struct domain *d)
469 {
470 int i, free = 0;
471 struct shadow_status *x, *n;
472 struct exec_domain *e;
474 /*
475 * WARNING! The shadow page table must not currently be in use!
476 * e.g., You are expected to have paused the domain and synchronized CR3.
477 */
479 shadow_audit(d, 1);
481 if( !d->arch.shadow_ht ) return;
483 // first, remove any outstanding refs from out_of_sync entries...
484 //
485 free_out_of_sync_state(d);
487 // second, remove any outstanding refs from ed->arch.shadow_table...
488 //
489 for_each_exec_domain(d, e)
490 {
491 if ( pagetable_val(e->arch.shadow_table) )
492 {
493 put_shadow_ref(pagetable_val(e->arch.shadow_table) >> PAGE_SHIFT);
494 e->arch.shadow_table = mk_pagetable(0);
495 }
496 }
498 // Now, the only refs to shadow pages that are left are from the shadow
499 // pages themselves. We can just free them.
500 //
501 for ( i = 0; i < shadow_ht_buckets; i++ )
502 {
503 /* Skip empty buckets. */
504 x = &d->arch.shadow_ht[i];
505 if ( x->gpfn_and_flags == 0 )
506 continue;
508 /* Free the head page. */
509 free_shadow_page(x->smfn);
511 /* Reinitialise the head node. */
512 x->gpfn_and_flags = 0;
513 x->smfn = 0;
514 n = x->next;
515 x->next = NULL;
517 free++;
519 /* Iterate over non-head nodes. */
520 for ( x = n; x != NULL; x = n )
521 {
522 /* Free the shadow page. */
523 free_shadow_page(x->smfn);
525 /* Re-initialise the chain node. */
526 x->gpfn_and_flags = 0;
527 x->smfn = 0;
529 /* Add to the free list. */
530 n = x->next;
531 x->next = d->arch.shadow_ht_free;
532 d->arch.shadow_ht_free = x;
534 free++;
535 }
537 shadow_audit(d, 0);
538 }
540 SH_LOG("Free shadow table. Freed=%d.", free);
541 }
543 void shadow_mode_init(void)
544 {
545 }
547 static void alloc_monitor_pagetable(struct exec_domain *ed)
548 {
549 unsigned long mmfn;
550 l2_pgentry_t *mpl2e;
551 struct pfn_info *mmfn_info;
552 struct domain *d = ed->domain;
554 ASSERT(!pagetable_val(ed->arch.monitor_table)); /* we should only get called once */
556 mmfn_info = alloc_domheap_page(NULL);
557 ASSERT( mmfn_info );
559 mmfn = (unsigned long) (mmfn_info - frame_table);
560 mpl2e = (l2_pgentry_t *) map_domain_mem(mmfn << PAGE_SHIFT);
561 memset(mpl2e, 0, PAGE_SIZE);
563 memcpy(&mpl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
564 &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
565 HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
567 mpl2e[l2_table_offset(PERDOMAIN_VIRT_START)] =
568 mk_l2_pgentry((__pa(d->arch.mm_perdomain_pt) & PAGE_MASK)
569 | __PAGE_HYPERVISOR);
571 // map the phys_to_machine map into the Read-Only MPT space for this domain
572 mpl2e[l2_table_offset(RO_MPT_VIRT_START)] =
573 mk_l2_pgentry(pagetable_val(d->arch.phys_table) | __PAGE_HYPERVISOR);
575 ed->arch.monitor_table = mk_pagetable(mmfn << PAGE_SHIFT);
576 ed->arch.monitor_vtable = mpl2e;
577 }
579 /*
580 * Free the pages for monitor_table and hl2_table
581 */
582 void free_monitor_pagetable(struct exec_domain *ed)
583 {
584 l2_pgentry_t *mpl2e, hl2e;
585 unsigned long mfn;
587 ASSERT( pagetable_val(ed->arch.monitor_table) );
588 ASSERT( shadow_mode_external(ed->domain) );
590 mpl2e = ed->arch.monitor_vtable;
592 /*
593 * First get the mfn for hl2_table by looking at monitor_table
594 */
595 hl2e = mpl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT];
596 ASSERT(l2_pgentry_val(hl2e) & _PAGE_PRESENT);
597 mfn = l2_pgentry_val(hl2e) >> PAGE_SHIFT;
598 ASSERT(mfn);
600 put_shadow_ref(mfn);
601 unmap_domain_mem(mpl2e);
603 /*
604 * Then free monitor_table.
605 */
606 mfn = (pagetable_val(ed->arch.monitor_table)) >> PAGE_SHIFT;
607 free_domheap_page(&frame_table[mfn]);
609 ed->arch.monitor_table = mk_pagetable(0);
610 ed->arch.monitor_vtable = 0;
611 }
613 int
614 set_p2m_entry(struct domain *d, unsigned long pfn, unsigned long mfn)
615 {
616 unsigned long phystab = pagetable_val(d->arch.phys_table);
617 l2_pgentry_t *l2, l2e;
618 l1_pgentry_t *l1;
619 struct pfn_info *l1page;
620 unsigned long va = pfn << PAGE_SHIFT;
622 ASSERT( phystab );
624 #ifdef WATCH_MAP_DOMAIN_CALLERS
625 int old_map_domain_mem_noisy = map_domain_mem_noisy;
626 map_domain_mem_noisy = 0;
627 #endif
629 l2 = map_domain_mem(phystab);
630 if ( !l2_pgentry_val(l2e = l2[l2_table_offset(va)]) )
631 {
632 l1page = alloc_domheap_page(NULL);
633 if ( !l1page )
634 return 0;
636 l1 = map_domain_mem(page_to_pfn(l1page) << PAGE_SHIFT);
637 memset(l1, 0, PAGE_SIZE);
638 unmap_domain_mem(l1);
640 l2e = l2[l2_table_offset(va)] =
641 mk_l2_pgentry((page_to_pfn(l1page) << PAGE_SHIFT) |
642 __PAGE_HYPERVISOR);
643 }
644 unmap_domain_mem(l2);
646 l1 = map_domain_mem(l2_pgentry_val(l2e) & PAGE_MASK);
647 l1[l1_table_offset(va)] = mk_l1_pgentry((mfn << PAGE_SHIFT) |
648 __PAGE_HYPERVISOR);
649 unmap_domain_mem(l1);
651 #ifdef WATCH_MAP_DOMAIN_CALLERS
652 map_domain_mem_noisy = old_map_domain_mem_noisy;
653 #endif
655 return 1;
656 }
658 static int
659 alloc_p2m_table(struct domain *d)
660 {
661 struct list_head *list_ent;
662 struct pfn_info *page, *l2page;
663 l2_pgentry_t *l2;
664 unsigned long mfn, pfn;
666 l2page = alloc_domheap_page(NULL);
667 if ( !l2page )
668 return 0;
669 d->arch.phys_table = mk_pagetable(page_to_pfn(l2page) << PAGE_SHIFT);
670 l2 = map_domain_mem(page_to_pfn(l2page) << PAGE_SHIFT);
671 memset(l2, 0, PAGE_SIZE);
672 unmap_domain_mem(l2);
674 list_ent = d->page_list.next;
675 while ( list_ent != &d->page_list )
676 {
677 page = list_entry(list_ent, struct pfn_info, list);
678 mfn = page_to_pfn(page);
679 pfn = machine_to_phys_mapping[mfn];
680 ASSERT(pfn != INVALID_M2P_ENTRY);
681 ASSERT(pfn < (1u<<20));
683 set_p2m_entry(d, pfn, mfn);
685 list_ent = page->list.next;
686 }
688 return 1;
689 }
691 static void
692 free_p2m_table(struct domain *d)
693 {
694 // uh, this needs some work... :)
695 BUG();
696 }
698 int __shadow_mode_enable(struct domain *d, unsigned int mode)
699 {
700 struct exec_domain *ed;
701 int new_modes = (mode & ~d->arch.shadow_mode);
703 // Gotta be adding something to call this function.
704 ASSERT(new_modes);
706 // can't take anything away by calling this function.
707 ASSERT(!(d->arch.shadow_mode & ~mode));
709 for_each_exec_domain(d, ed)
710 {
711 invalidate_shadow_ldt(ed);
713 // We need to set these up for __update_pagetables().
714 // See the comment there.
716 /*
717 * arch.guest_vtable
718 */
719 if ( ed->arch.guest_vtable &&
720 (ed->arch.guest_vtable != __linear_l2_table) )
721 {
722 unmap_domain_mem(ed->arch.guest_vtable);
723 }
724 if ( (mode & (SHM_translate | SHM_external)) == SHM_translate )
725 ed->arch.guest_vtable = __linear_l2_table;
726 else
727 ed->arch.guest_vtable = NULL;
729 /*
730 * arch.shadow_vtable
731 */
732 if ( ed->arch.shadow_vtable &&
733 (ed->arch.shadow_vtable != __shadow_linear_l2_table) )
734 {
735 unmap_domain_mem(ed->arch.shadow_vtable);
736 }
737 if ( !(mode & SHM_external) )
738 ed->arch.shadow_vtable = __shadow_linear_l2_table;
739 else
740 ed->arch.shadow_vtable = NULL;
742 /*
743 * arch.hl2_vtable
744 */
745 if ( ed->arch.hl2_vtable &&
746 (ed->arch.hl2_vtable != __linear_hl2_table) )
747 {
748 unmap_domain_mem(ed->arch.hl2_vtable);
749 }
750 if ( (mode & (SHM_translate | SHM_external)) == SHM_translate )
751 ed->arch.hl2_vtable = __linear_hl2_table;
752 else
753 ed->arch.hl2_vtable = NULL;
755 /*
756 * arch.monitor_table & arch.monitor_vtable
757 */
758 if ( ed->arch.monitor_vtable )
759 {
760 free_monitor_pagetable(ed);
761 }
762 if ( mode & SHM_external )
763 {
764 alloc_monitor_pagetable(ed);
765 }
766 }
768 if ( new_modes & SHM_enable )
769 {
770 ASSERT( !d->arch.shadow_ht );
771 d->arch.shadow_ht = xmalloc_array(struct shadow_status, shadow_ht_buckets);
772 if ( d->arch.shadow_ht == NULL )
773 goto nomem;
775 memset(d->arch.shadow_ht, 0,
776 shadow_ht_buckets * sizeof(struct shadow_status));
777 }
779 if ( new_modes & SHM_log_dirty )
780 {
781 ASSERT( !d->arch.shadow_dirty_bitmap );
782 d->arch.shadow_dirty_bitmap_size = (d->max_pages + 63) & ~63;
783 d->arch.shadow_dirty_bitmap =
784 xmalloc_array(unsigned long, d->arch.shadow_dirty_bitmap_size /
785 (8 * sizeof(unsigned long)));
786 if ( d->arch.shadow_dirty_bitmap == NULL )
787 {
788 d->arch.shadow_dirty_bitmap_size = 0;
789 goto nomem;
790 }
791 memset(d->arch.shadow_dirty_bitmap, 0,
792 d->arch.shadow_dirty_bitmap_size/8);
793 }
795 if ( new_modes & SHM_translate )
796 {
797 if ( !(new_modes & SHM_external) )
798 {
799 ASSERT( !pagetable_val(d->arch.phys_table) );
800 if ( !alloc_p2m_table(d) )
801 {
802 printk("alloc_p2m_table failed (out-of-memory?)\n");
803 goto nomem;
804 }
805 }
806 else
807 {
808 // external guests provide their own memory for their P2M maps.
809 //
810 ASSERT( d == page_get_owner(&frame_table[pagetable_val(
811 d->arch.phys_table)>>PAGE_SHIFT]) );
812 }
813 }
815 printk("audit1\n");
816 _audit_domain(d, AUDIT_ALREADY_LOCKED | AUDIT_ERRORS_OK);
817 printk("audit1 done\n");
819 // Get rid of any shadow pages from any previous shadow mode.
820 //
821 free_shadow_pages(d);
823 printk("audit2\n");
824 _audit_domain(d, AUDIT_ALREADY_LOCKED | AUDIT_ERRORS_OK);
825 printk("audit2 done\n");
827 // Turn off writable page tables.
828 // It doesn't mix with shadow mode.
829 // And shadow mode offers a superset of functionality.
830 //
831 vm_assist(d, VMASST_CMD_disable, VMASST_TYPE_writable_pagetables);
833 /*
834 * Tear down its counts by disassembling its page-table-based ref counts.
835 * Also remove CR3's gcount/tcount.
836 * That leaves things like GDTs and LDTs and external refs intact.
837 *
838 * Most pages will be writable tcount=0.
839 * Some will still be L1 tcount=0 or L2 tcount=0.
840 * Maybe some pages will be type none tcount=0.
841 * Pages granted external writable refs (via grant tables?) will
842 * still have a non-zero tcount. That's OK.
843 *
844 * gcounts will generally be 1 for PGC_allocated.
845 * GDTs and LDTs will have additional gcounts.
846 * Any grant-table based refs will still be in the gcount.
847 *
848 * We attempt to grab writable refs to each page (thus setting its type).
849 * Immediately put back those type refs.
850 *
851 * Assert that no pages are left with L1/L2/L3/L4 type.
852 */
853 audit_adjust_pgtables(d, -1, 1);
854 d->arch.shadow_mode = mode;
856 struct list_head *list_ent = d->page_list.next;
857 while ( list_ent != &d->page_list )
858 {
859 struct pfn_info *page = list_entry(list_ent, struct pfn_info, list);
860 if ( !get_page_type(page, PGT_writable_page) )
861 BUG();
862 put_page_type(page);
864 list_ent = page->list.next;
865 }
867 audit_adjust_pgtables(d, 1, 1);
869 printk("audit3\n");
870 _audit_domain(d, AUDIT_ALREADY_LOCKED);
871 printk("audit3 done\n");
873 return 0;
875 nomem:
876 if ( (new_modes & SHM_enable) && (d->arch.shadow_ht != NULL) )
877 {
878 xfree(d->arch.shadow_ht);
879 d->arch.shadow_ht = NULL;
880 }
881 if ( (new_modes & SHM_log_dirty) && (d->arch.shadow_dirty_bitmap != NULL) )
882 {
883 xfree(d->arch.shadow_dirty_bitmap);
884 d->arch.shadow_dirty_bitmap = NULL;
885 }
886 if ( (new_modes & SHM_translate) && !(new_modes & SHM_external) &&
887 pagetable_val(d->arch.phys_table) )
888 {
889 free_p2m_table(d);
890 }
891 return -ENOMEM;
892 }
894 int shadow_mode_enable(struct domain *d, unsigned int mode)
895 {
896 int rc;
897 shadow_lock(d);
898 rc = __shadow_mode_enable(d, mode);
899 shadow_unlock(d);
900 return rc;
901 }
903 static void
904 translate_l1pgtable(struct domain *d, l1_pgentry_t *p2m, unsigned long l1mfn)
905 {
906 int i;
907 l1_pgentry_t *l1;
909 l1 = map_domain_mem(l1mfn << PAGE_SHIFT);
910 for (i = 0; i < L1_PAGETABLE_ENTRIES; i++)
911 {
912 if ( is_guest_l1_slot(i) &&
913 (l1_pgentry_val(l1[i]) & _PAGE_PRESENT) )
914 {
915 unsigned long mfn = l1_pgentry_val(l1[i]) >> PAGE_SHIFT;
916 unsigned long gpfn = __mfn_to_gpfn(d, mfn);
917 ASSERT((l1_pgentry_val(p2m[gpfn]) >> PAGE_SHIFT) == mfn);
918 l1[i] = mk_l1_pgentry((gpfn << PAGE_SHIFT) |
919 (l1_pgentry_val(l1[i]) & ~PAGE_MASK));
920 }
921 }
922 unmap_domain_mem(l1);
923 }
925 // This is not general enough to handle arbitrary pagetables
926 // with shared L1 pages, etc., but it is sufficient for bringing
927 // up dom0.
928 //
929 void
930 translate_l2pgtable(struct domain *d, l1_pgentry_t *p2m, unsigned long l2mfn)
931 {
932 int i;
933 l2_pgentry_t *l2;
935 ASSERT(shadow_mode_translate(d) && !shadow_mode_external(d));
937 l2 = map_domain_mem(l2mfn << PAGE_SHIFT);
938 for (i = 0; i < L2_PAGETABLE_ENTRIES; i++)
939 {
940 if ( is_guest_l2_slot(i) &&
941 (l2_pgentry_val(l2[i]) & _PAGE_PRESENT) )
942 {
943 unsigned long mfn = l2_pgentry_val(l2[i]) >> PAGE_SHIFT;
944 unsigned long gpfn = __mfn_to_gpfn(d, mfn);
945 ASSERT((l1_pgentry_val(p2m[gpfn]) >> PAGE_SHIFT) == mfn);
946 l2[i] = mk_l2_pgentry((gpfn << PAGE_SHIFT) |
947 (l2_pgentry_val(l2[i]) & ~PAGE_MASK));
948 translate_l1pgtable(d, p2m, mfn);
949 }
950 }
951 unmap_domain_mem(l2);
952 }
954 static void free_shadow_ht_entries(struct domain *d)
955 {
956 struct shadow_status *x, *n;
958 SH_VLOG("freed tables count=%d l1=%d l2=%d",
959 d->arch.shadow_page_count, perfc_value(shadow_l1_pages),
960 perfc_value(shadow_l2_pages));
962 n = d->arch.shadow_ht_extras;
963 while ( (x = n) != NULL )
964 {
965 d->arch.shadow_extras_count--;
966 n = *((struct shadow_status **)(&x[shadow_ht_extra_size]));
967 xfree(x);
968 }
970 d->arch.shadow_ht_extras = NULL;
971 d->arch.shadow_ht_free = NULL;
973 ASSERT(d->arch.shadow_extras_count == 0);
974 SH_LOG("freed extras, now %d", d->arch.shadow_extras_count);
976 if ( d->arch.shadow_dirty_bitmap != NULL )
977 {
978 xfree(d->arch.shadow_dirty_bitmap);
979 d->arch.shadow_dirty_bitmap = 0;
980 d->arch.shadow_dirty_bitmap_size = 0;
981 }
983 xfree(d->arch.shadow_ht);
984 d->arch.shadow_ht = NULL;
985 }
987 static void free_out_of_sync_entries(struct domain *d)
988 {
989 struct out_of_sync_entry *x, *n;
991 n = d->arch.out_of_sync_extras;
992 while ( (x = n) != NULL )
993 {
994 d->arch.out_of_sync_extras_count--;
995 n = *((struct out_of_sync_entry **)(&x[out_of_sync_extra_size]));
996 xfree(x);
997 }
999 d->arch.out_of_sync_extras = NULL;
1000 d->arch.out_of_sync_free = NULL;
1001 d->arch.out_of_sync = NULL;
1003 ASSERT(d->arch.out_of_sync_extras_count == 0);
1004 FSH_LOG("freed extra out_of_sync entries, now %d",
1005 d->arch.out_of_sync_extras_count);
1008 void __shadow_mode_disable(struct domain *d)
1010 // This needs rethinking for the full shadow mode stuff.
1011 //
1012 // Among other things, ref counts need to be restored to a sensible
1013 // state for a non-shadow-mode guest...
1014 // This is probably easiest to do by stealing code from audit_domain().
1015 //
1016 BUG();
1018 free_shadow_pages(d);
1020 d->arch.shadow_mode = 0;
1022 free_shadow_ht_entries(d);
1023 free_out_of_sync_entries(d);
1026 static int shadow_mode_table_op(
1027 struct domain *d, dom0_shadow_control_t *sc)
1029 unsigned int op = sc->op;
1030 int i, rc = 0;
1031 struct exec_domain *ed;
1033 ASSERT(spin_is_locked(&d->arch.shadow_lock));
1035 SH_VLOG("shadow mode table op %p %p count %d",
1036 pagetable_val(d->exec_domain[0]->arch.guest_table), /* XXX SMP */
1037 pagetable_val(d->exec_domain[0]->arch.shadow_table), /* XXX SMP */
1038 d->arch.shadow_page_count);
1040 shadow_audit(d, 1);
1042 switch ( op )
1044 case DOM0_SHADOW_CONTROL_OP_FLUSH:
1045 free_shadow_pages(d);
1047 d->arch.shadow_fault_count = 0;
1048 d->arch.shadow_dirty_count = 0;
1049 d->arch.shadow_dirty_net_count = 0;
1050 d->arch.shadow_dirty_block_count = 0;
1052 break;
1054 case DOM0_SHADOW_CONTROL_OP_CLEAN:
1055 free_shadow_pages(d);
1057 sc->stats.fault_count = d->arch.shadow_fault_count;
1058 sc->stats.dirty_count = d->arch.shadow_dirty_count;
1059 sc->stats.dirty_net_count = d->arch.shadow_dirty_net_count;
1060 sc->stats.dirty_block_count = d->arch.shadow_dirty_block_count;
1062 d->arch.shadow_fault_count = 0;
1063 d->arch.shadow_dirty_count = 0;
1064 d->arch.shadow_dirty_net_count = 0;
1065 d->arch.shadow_dirty_block_count = 0;
1067 if ( (d->max_pages > sc->pages) ||
1068 (sc->dirty_bitmap == NULL) ||
1069 (d->arch.shadow_dirty_bitmap == NULL) )
1071 rc = -EINVAL;
1072 break;
1075 sc->pages = d->max_pages;
1077 #define chunk (8*1024) /* Transfer and clear in 1kB chunks for L1 cache. */
1078 for ( i = 0; i < d->max_pages; i += chunk )
1080 int bytes = ((((d->max_pages - i) > chunk) ?
1081 chunk : (d->max_pages - i)) + 7) / 8;
1083 if (copy_to_user(
1084 sc->dirty_bitmap + (i/(8*sizeof(unsigned long))),
1085 d->arch.shadow_dirty_bitmap +(i/(8*sizeof(unsigned long))),
1086 bytes))
1088 // copy_to_user can fail when copying to guest app memory.
1089 // app should zero buffer after mallocing, and pin it
1090 rc = -EINVAL;
1091 memset(
1092 d->arch.shadow_dirty_bitmap +
1093 (i/(8*sizeof(unsigned long))),
1094 0, (d->max_pages/8) - (i/(8*sizeof(unsigned long))));
1095 break;
1098 memset(
1099 d->arch.shadow_dirty_bitmap + (i/(8*sizeof(unsigned long))),
1100 0, bytes);
1103 break;
1105 case DOM0_SHADOW_CONTROL_OP_PEEK:
1106 sc->stats.fault_count = d->arch.shadow_fault_count;
1107 sc->stats.dirty_count = d->arch.shadow_dirty_count;
1108 sc->stats.dirty_net_count = d->arch.shadow_dirty_net_count;
1109 sc->stats.dirty_block_count = d->arch.shadow_dirty_block_count;
1111 if ( (d->max_pages > sc->pages) ||
1112 (sc->dirty_bitmap == NULL) ||
1113 (d->arch.shadow_dirty_bitmap == NULL) )
1115 rc = -EINVAL;
1116 break;
1119 sc->pages = d->max_pages;
1120 if (copy_to_user(
1121 sc->dirty_bitmap, d->arch.shadow_dirty_bitmap, (d->max_pages+7)/8))
1123 rc = -EINVAL;
1124 break;
1127 break;
1129 default:
1130 rc = -EINVAL;
1131 break;
1134 SH_VLOG("shadow mode table op : page count %d", d->arch.shadow_page_count);
1135 shadow_audit(d, 1);
1137 for_each_exec_domain(d,ed)
1138 __update_pagetables(ed);
1140 return rc;
1143 int shadow_mode_control(struct domain *d, dom0_shadow_control_t *sc)
1145 unsigned int op = sc->op;
1146 int rc = 0;
1147 struct exec_domain *ed;
1149 if ( unlikely(d == current->domain) )
1151 DPRINTK("Don't try to do a shadow op on yourself!\n");
1152 return -EINVAL;
1155 domain_pause(d);
1156 synchronise_pagetables(~0UL);
1158 shadow_lock(d);
1160 switch ( op )
1162 case DOM0_SHADOW_CONTROL_OP_OFF:
1163 shadow_mode_disable(d);
1164 break;
1166 case DOM0_SHADOW_CONTROL_OP_ENABLE_TEST:
1167 free_shadow_pages(d);
1168 rc = __shadow_mode_enable(d, SHM_enable);
1169 break;
1171 case DOM0_SHADOW_CONTROL_OP_ENABLE_LOGDIRTY:
1172 free_shadow_pages(d);
1173 rc = __shadow_mode_enable(d, d->arch.shadow_mode|SHM_enable|SHM_log_dirty);
1174 break;
1176 default:
1177 rc = shadow_mode_enabled(d) ? shadow_mode_table_op(d, sc) : -EINVAL;
1178 break;
1181 shadow_unlock(d);
1183 for_each_exec_domain(d,ed)
1184 update_pagetables(ed);
1186 domain_unpause(d);
1188 return rc;
1191 /*
1192 * XXX KAF: Why is this VMX specific?
1193 */
1194 void vmx_shadow_clear_state(struct domain *d)
1196 SH_VVLOG("vmx_clear_shadow_state:");
1197 shadow_lock(d);
1198 free_shadow_pages(d);
1199 shadow_unlock(d);
1202 unsigned long
1203 gpfn_to_mfn_safe(struct domain *d, unsigned long gpfn)
1205 ASSERT( shadow_mode_translate(d) );
1207 perfc_incrc(gpfn_to_mfn_safe);
1209 unsigned long va = gpfn << PAGE_SHIFT;
1210 unsigned long phystab = pagetable_val(d->arch.phys_table);
1211 l2_pgentry_t *l2 = map_domain_mem(phystab);
1212 l2_pgentry_t l2e = l2[l2_table_offset(va)];
1213 unmap_domain_mem(l2);
1214 if ( !(l2_pgentry_val(l2e) & _PAGE_PRESENT) )
1216 printk("gpfn_to_mfn_safe(d->id=%d, gpfn=%p) => 0 l2e=%p\n",
1217 d->id, gpfn, l2_pgentry_val(l2e));
1218 return INVALID_MFN;
1220 unsigned long l1tab = l2_pgentry_val(l2e) & PAGE_MASK;
1221 l1_pgentry_t *l1 = map_domain_mem(l1tab);
1222 l1_pgentry_t l1e = l1[l1_table_offset(va)];
1223 unmap_domain_mem(l1);
1225 printk("gpfn_to_mfn_safe(d->id=%d, gpfn=%p) => %p phystab=%p l2e=%p l1tab=%p, l1e=%p\n",
1226 d->id, gpfn, l1_pgentry_val(l1e) >> PAGE_SHIFT, phystab, l2e, l1tab, l1e);
1228 if ( !(l1_pgentry_val(l1e) & _PAGE_PRESENT) )
1230 printk("gpfn_to_mfn_safe(d->id=%d, gpfn=%p) => 0 l1e=%p\n",
1231 d->id, gpfn, l1_pgentry_val(l1e));
1232 return INVALID_MFN;
1235 return l1_pgentry_val(l1e) >> PAGE_SHIFT;
1238 static unsigned long
1239 shadow_hl2_table(struct domain *d, unsigned long gpfn, unsigned long gmfn,
1240 unsigned long smfn)
1242 unsigned long hl2mfn;
1243 l1_pgentry_t *hl2;
1244 int limit;
1246 ASSERT(PGT_base_page_table == PGT_l2_page_table);
1248 if ( unlikely(!(hl2mfn = alloc_shadow_page(d, gpfn, gmfn, PGT_hl2_shadow))) )
1250 printk("Couldn't alloc an HL2 shadow for pfn=%p mfn=%p\n", gpfn, gmfn);
1251 BUG(); /* XXX Deal gracefully with failure. */
1254 perfc_incrc(shadow_hl2_table_count);
1256 hl2 = map_domain_mem(hl2mfn << PAGE_SHIFT);
1258 if ( shadow_mode_external(d) )
1259 limit = L2_PAGETABLE_ENTRIES;
1260 else
1261 limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE;
1263 memset(hl2, 0, limit * sizeof(l1_pgentry_t));
1265 if ( !shadow_mode_external(d) )
1267 memset(&hl2[DOMAIN_ENTRIES_PER_L2_PAGETABLE], 0,
1268 HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
1270 // Setup easy access to the GL2, SL2, and HL2 frames.
1271 //
1272 hl2[l2_table_offset(LINEAR_PT_VIRT_START)] =
1273 mk_l1_pgentry((gmfn << PAGE_SHIFT) | __PAGE_HYPERVISOR);
1274 hl2[l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
1275 mk_l1_pgentry((smfn << PAGE_SHIFT) | __PAGE_HYPERVISOR);
1276 hl2[l2_table_offset(PERDOMAIN_VIRT_START)] =
1277 mk_l1_pgentry((hl2mfn << PAGE_SHIFT) | __PAGE_HYPERVISOR);
1280 unmap_domain_mem(hl2);
1282 return hl2mfn;
1285 /*
1286 * This could take and use a snapshot, and validate the entire page at
1287 * once, or it could continue to fault in entries one at a time...
1288 * Might be worth investigating...
1289 */
1290 static unsigned long shadow_l2_table(
1291 struct domain *d, unsigned long gpfn, unsigned long gmfn)
1293 unsigned long smfn;
1294 l2_pgentry_t *spl2e;
1296 SH_VVLOG("shadow_l2_table(gpfn=%p, gmfn=%p)", gpfn, gmfn);
1298 perfc_incrc(shadow_l2_table_count);
1300 if ( unlikely(!(smfn = alloc_shadow_page(d, gpfn, gmfn, PGT_l2_shadow))) )
1302 printk("Couldn't alloc an L2 shadow for pfn=%p mfn=%p\n", gpfn, gmfn);
1303 BUG(); /* XXX Deal gracefully with failure. */
1306 spl2e = (l2_pgentry_t *)map_domain_mem(smfn << PAGE_SHIFT);
1308 /* Install hypervisor and 2x linear p.t. mappings. */
1309 if ( (PGT_base_page_table == PGT_l2_page_table) &&
1310 !shadow_mode_external(d) )
1312 /*
1313 * We could proactively fill in PDEs for pages that are already
1314 * shadowed *and* where the guest PDE has _PAGE_ACCESSED set
1315 * (restriction required for coherence of the accessed bit). However,
1316 * we tried it and it didn't help performance. This is simpler.
1317 */
1318 memset(spl2e, 0, DOMAIN_ENTRIES_PER_L2_PAGETABLE*sizeof(l2_pgentry_t));
1320 /* Install hypervisor and 2x linear p.t. mappings. */
1321 memcpy(&spl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
1322 &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
1323 HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
1325 spl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
1326 mk_l2_pgentry((smfn << PAGE_SHIFT) | __PAGE_HYPERVISOR);
1328 spl2e[l2_table_offset(PERDOMAIN_VIRT_START)] =
1329 mk_l2_pgentry(__pa(page_get_owner(
1330 &frame_table[gmfn])->arch.mm_perdomain_pt) |
1331 __PAGE_HYPERVISOR);
1333 if ( shadow_mode_translate(d) ) // NB: not external
1335 unsigned long hl2mfn;
1337 spl2e[l2_table_offset(RO_MPT_VIRT_START)] =
1338 mk_l2_pgentry(pagetable_val(d->arch.phys_table) |
1339 __PAGE_HYPERVISOR);
1341 if ( unlikely(!(hl2mfn = __shadow_status(d, gpfn, PGT_hl2_shadow))) )
1342 hl2mfn = shadow_hl2_table(d, gpfn, gmfn, smfn);
1344 // shadow_mode_translate (but not external) sl2 tables hold a
1345 // ref to their hl2.
1346 //
1347 if ( !get_shadow_ref(hl2mfn) )
1348 BUG();
1350 spl2e[l2_table_offset(LINEAR_PT_VIRT_START)] =
1351 mk_l2_pgentry((hl2mfn << PAGE_SHIFT) | __PAGE_HYPERVISOR);
1353 else
1354 spl2e[l2_table_offset(LINEAR_PT_VIRT_START)] =
1355 mk_l2_pgentry((gmfn << PAGE_SHIFT) | __PAGE_HYPERVISOR);
1357 else
1359 memset(spl2e, 0, L2_PAGETABLE_ENTRIES*sizeof(l2_pgentry_t));
1362 unmap_domain_mem(spl2e);
1364 SH_VLOG("shadow_l2_table(%p -> %p)", gmfn, smfn);
1365 return smfn;
1368 void shadow_map_l1_into_current_l2(unsigned long va)
1370 struct exec_domain *ed = current;
1371 struct domain *d = ed->domain;
1372 unsigned long *gpl1e, *spl1e, gl2e, sl2e, gl1pfn, gl1mfn, sl1mfn;
1373 int i, init_table = 0;
1375 __guest_get_l2e(ed, va, &gl2e);
1376 ASSERT(gl2e & _PAGE_PRESENT);
1377 gl1pfn = gl2e >> PAGE_SHIFT;
1379 if ( !(sl1mfn = __shadow_status(d, gl1pfn, PGT_l1_shadow)) )
1381 /* This L1 is NOT already shadowed so we need to shadow it. */
1382 SH_VVLOG("4a: l1 not shadowed");
1384 gl1mfn = __gpfn_to_mfn(d, gl1pfn);
1385 if ( unlikely(!VALID_MFN(gl1mfn)) )
1387 // Attempt to use an invalid pfn as an L1 page.
1388 // XXX this needs to be more graceful!
1389 BUG();
1392 if ( unlikely(!(sl1mfn =
1393 alloc_shadow_page(d, gl1pfn, gl1mfn, PGT_l1_shadow))) )
1395 printk("Couldn't alloc an L1 shadow for pfn=%p mfn=%p\n",
1396 gl1pfn, gl1mfn);
1397 BUG(); /* XXX Need to deal gracefully with failure. */
1400 perfc_incrc(shadow_l1_table_count);
1401 init_table = 1;
1403 else
1405 /* This L1 is shadowed already, but the L2 entry is missing. */
1406 SH_VVLOG("4b: was shadowed, l2 missing (%p)", sl1mfn);
1409 #ifndef NDEBUG
1410 unsigned long old_sl2e;
1411 __shadow_get_l2e(ed, va, &old_sl2e);
1412 ASSERT( !(old_sl2e & _PAGE_PRESENT) );
1413 #endif
1415 if ( !get_shadow_ref(sl1mfn) )
1416 BUG();
1417 l2pde_general(d, &gl2e, &sl2e, sl1mfn);
1418 __guest_set_l2e(ed, va, gl2e);
1419 __shadow_set_l2e(ed, va, sl2e);
1421 if ( init_table )
1423 gpl1e = (unsigned long *)
1424 &(linear_pg_table[l1_linear_offset(va) &
1425 ~(L1_PAGETABLE_ENTRIES-1)]);
1427 spl1e = (unsigned long *)
1428 &(shadow_linear_pg_table[l1_linear_offset(va) &
1429 ~(L1_PAGETABLE_ENTRIES-1)]);
1431 memset(spl1e, 0, PAGE_SIZE);
1433 unsigned long sl1e;
1434 int index = l1_table_offset(va);
1435 int min = 1, max = 0;
1437 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
1439 l1pte_propagate_from_guest(d, gpl1e[i], &sl1e);
1440 if ( (sl1e & _PAGE_PRESENT) &&
1441 !shadow_get_page_from_l1e(mk_l1_pgentry(sl1e), d) )
1442 sl1e = 0;
1443 if ( sl1e == 0 )
1445 // First copy entries from 0 until first invalid.
1446 // Then copy entries from index until first invalid.
1447 //
1448 if ( i < index ) {
1449 i = index - 1;
1450 continue;
1452 break;
1454 spl1e[i] = sl1e;
1455 if ( unlikely(i < min) )
1456 min = i;
1457 if ( likely(i > max) )
1458 max = i;
1461 frame_table[sl1mfn].tlbflush_timestamp =
1462 SHADOW_ENCODE_MIN_MAX(min, max);
1466 void shadow_invlpg(struct exec_domain *ed, unsigned long va)
1468 struct domain *d = ed->domain;
1469 unsigned long gpte, spte;
1471 ASSERT(shadow_mode_enabled(d));
1473 shadow_lock(d);
1475 __shadow_sync_va(ed, va);
1477 // XXX mafetter: will need to think about 4MB pages...
1479 // It's not strictly necessary to update the shadow here,
1480 // but it might save a fault later.
1481 //
1482 if (__get_user(gpte, (unsigned long *)
1483 &linear_pg_table[va >> PAGE_SHIFT])) {
1484 perfc_incrc(shadow_invlpg_faults);
1485 return;
1487 l1pte_propagate_from_guest(d, gpte, &spte);
1488 shadow_set_l1e(va, spte, 1);
1490 shadow_unlock(d);
1493 struct out_of_sync_entry *
1494 shadow_alloc_oos_entry(struct domain *d)
1496 struct out_of_sync_entry *f, *extra;
1497 unsigned size, i;
1499 if ( unlikely(d->arch.out_of_sync_free == NULL) )
1501 FSH_LOG("Allocate more fullshadow tuple blocks.");
1503 size = sizeof(void *) + (out_of_sync_extra_size * sizeof(*f));
1504 extra = xmalloc_bytes(size);
1506 /* XXX Should be more graceful here. */
1507 if ( extra == NULL )
1508 BUG();
1510 memset(extra, 0, size);
1512 /* Record the allocation block so it can be correctly freed later. */
1513 d->arch.out_of_sync_extras_count++;
1514 *((struct out_of_sync_entry **)&extra[out_of_sync_extra_size]) =
1515 d->arch.out_of_sync_extras;
1516 d->arch.out_of_sync_extras = &extra[0];
1518 /* Thread a free chain through the newly-allocated nodes. */
1519 for ( i = 0; i < (out_of_sync_extra_size - 1); i++ )
1520 extra[i].next = &extra[i+1];
1521 extra[i].next = NULL;
1523 /* Add the new nodes to the free list. */
1524 d->arch.out_of_sync_free = &extra[0];
1527 /* Allocate a new node from the quicklist. */
1528 f = d->arch.out_of_sync_free;
1529 d->arch.out_of_sync_free = f->next;
1531 return f;
1534 static inline unsigned long
1535 shadow_make_snapshot(
1536 struct domain *d, unsigned long gpfn, unsigned long gmfn)
1538 unsigned long smfn;
1539 void *original, *snapshot;
1541 if ( test_and_set_bit(_PGC_out_of_sync, &frame_table[gmfn].count_info) )
1543 ASSERT(__shadow_status(d, gpfn, PGT_snapshot));
1544 return SHADOW_SNAPSHOT_ELSEWHERE;
1547 perfc_incrc(shadow_make_snapshot);
1549 if ( unlikely(!(smfn = alloc_shadow_page(d, gpfn, gmfn, PGT_snapshot))) )
1551 printk("Couldn't alloc fullshadow snapshot for pfn=%p mfn=%p!\n"
1552 "Dom%d snapshot_count_count=%d\n",
1553 gpfn, gmfn, d->id, d->arch.snapshot_page_count);
1554 BUG(); /* XXX FIXME: try a shadow flush to free up some memory. */
1557 if ( !get_shadow_ref(smfn) )
1558 BUG();
1560 original = map_domain_mem(gmfn << PAGE_SHIFT);
1561 snapshot = map_domain_mem(smfn << PAGE_SHIFT);
1562 memcpy(snapshot, original, PAGE_SIZE);
1563 unmap_domain_mem(original);
1564 unmap_domain_mem(snapshot);
1566 return smfn;
1569 static void
1570 shadow_free_snapshot(struct domain *d, struct out_of_sync_entry *entry)
1572 void *snapshot;
1574 if ( entry->snapshot_mfn == SHADOW_SNAPSHOT_ELSEWHERE )
1575 return;
1577 // Clear the out_of_sync bit.
1578 //
1579 clear_bit(_PGC_out_of_sync, &frame_table[entry->gmfn].count_info);
1581 // XXX Need to think about how to protect the domain's
1582 // information less expensively.
1583 //
1584 snapshot = map_domain_mem(entry->snapshot_mfn << PAGE_SHIFT);
1585 memset(snapshot, 0, PAGE_SIZE);
1586 unmap_domain_mem(snapshot);
1588 put_shadow_ref(entry->snapshot_mfn);
1591 struct out_of_sync_entry *
1592 shadow_mark_mfn_out_of_sync(struct exec_domain *ed, unsigned long gpfn,
1593 unsigned long mfn)
1595 struct domain *d = ed->domain;
1596 struct pfn_info *page = &frame_table[mfn];
1597 struct out_of_sync_entry *entry = shadow_alloc_oos_entry(d);
1599 ASSERT(spin_is_locked(&d->arch.shadow_lock));
1600 ASSERT(pfn_is_ram(mfn));
1601 ASSERT((page->u.inuse.type_info & PGT_type_mask) == PGT_writable_page);
1603 FSH_LOG("mark_mfn_out_of_sync(gpfn=%p, mfn=%p) c=%p t=%p",
1604 gpfn, mfn, page->count_info, page->u.inuse.type_info);
1606 // XXX this will require some more thought... Cross-domain sharing and
1607 // modification of page tables? Hmm...
1608 //
1609 if ( d != page_get_owner(page) )
1610 BUG();
1612 perfc_incrc(shadow_mark_mfn_out_of_sync_calls);
1614 entry->gpfn = gpfn;
1615 entry->gmfn = mfn;
1616 entry->snapshot_mfn = shadow_make_snapshot(d, gpfn, mfn);
1617 entry->writable_pl1e = -1;
1619 // increment guest's ref count to represent the entry in the
1620 // full shadow out-of-sync list.
1621 //
1622 get_page(page, d);
1624 // Add to the out-of-sync list
1625 //
1626 entry->next = d->arch.out_of_sync;
1627 d->arch.out_of_sync = entry;
1629 return entry;
1632 void shadow_mark_va_out_of_sync(
1633 struct exec_domain *ed, unsigned long gpfn, unsigned long mfn, unsigned long va)
1635 struct out_of_sync_entry *entry =
1636 shadow_mark_mfn_out_of_sync(ed, gpfn, mfn);
1637 unsigned long sl2e;
1639 // We need the address of the shadow PTE that maps @va.
1640 // It might not exist yet. Make sure it's there.
1641 //
1642 __shadow_get_l2e(ed, va, &sl2e);
1643 if ( !(sl2e & _PAGE_PRESENT) )
1645 // either this L1 isn't shadowed yet, or the shadow isn't linked into
1646 // the current L2.
1647 shadow_map_l1_into_current_l2(va);
1648 __shadow_get_l2e(ed, va, &sl2e);
1650 ASSERT(sl2e & _PAGE_PRESENT);
1652 // NB: this is stored as a machine address.
1653 entry->writable_pl1e =
1654 ((sl2e & PAGE_MASK) |
1655 (sizeof(l1_pgentry_t) * l1_table_offset(va)));
1656 ASSERT( !(entry->writable_pl1e & (sizeof(l1_pgentry_t)-1)) );
1658 // Increment shadow's page count to represent the reference
1659 // inherent in entry->writable_pl1e
1660 //
1661 if ( !get_shadow_ref(sl2e >> PAGE_SHIFT) )
1662 BUG();
1664 FSH_LOG("mark_out_of_sync(va=%p -> writable_pl1e=%p)",
1665 va, entry->writable_pl1e);
1668 /*
1669 * Returns 1 if the snapshot for @gmfn exists and its @index'th entry matches.
1670 * Returns 0 otherwise.
1671 */
1672 static int snapshot_entry_matches(
1673 struct exec_domain *ed, unsigned long gmfn, unsigned index)
1675 unsigned long gpfn = __mfn_to_gpfn(ed->domain, gmfn);
1676 unsigned long smfn = __shadow_status(ed->domain, gpfn, PGT_snapshot);
1677 unsigned long *guest, *snapshot;
1678 int compare;
1680 ASSERT( ! IS_INVALID_M2P_ENTRY(gpfn) );
1682 perfc_incrc(snapshot_entry_matches_calls);
1684 if ( !smfn )
1685 return 0;
1687 guest = map_domain_mem(gmfn << PAGE_SHIFT);
1688 snapshot = map_domain_mem(smfn << PAGE_SHIFT);
1690 // This could probably be smarter, but this is sufficient for
1691 // our current needs.
1692 //
1693 compare = (guest[index] == snapshot[index]);
1695 unmap_domain_mem(guest);
1696 unmap_domain_mem(snapshot);
1698 #ifdef PERF_COUNTERS
1699 if ( compare )
1700 perfc_incrc(snapshot_entry_matches_true);
1701 #endif
1703 return compare;
1706 /*
1707 * Returns 1 if va's shadow mapping is out-of-sync.
1708 * Returns 0 otherwise.
1709 */
1710 int __shadow_out_of_sync(struct exec_domain *ed, unsigned long va)
1712 struct domain *d = ed->domain;
1713 unsigned long l2mfn = pagetable_val(ed->arch.guest_table) >> PAGE_SHIFT;
1714 unsigned long l2e;
1715 unsigned long l1mfn;
1717 ASSERT(spin_is_locked(&d->arch.shadow_lock));
1719 perfc_incrc(shadow_out_of_sync_calls);
1721 if ( page_out_of_sync(&frame_table[l2mfn]) &&
1722 !snapshot_entry_matches(ed, l2mfn, l2_table_offset(va)) )
1723 return 1;
1725 __guest_get_l2e(ed, va, &l2e);
1726 if ( !(l2e & _PAGE_PRESENT) )
1727 return 0;
1729 l1mfn = __gpfn_to_mfn(d, l2e >> PAGE_SHIFT);
1731 // If the l1 pfn is invalid, it can't be out of sync...
1732 if ( !VALID_MFN(l1mfn) )
1733 return 0;
1735 if ( page_out_of_sync(&frame_table[l1mfn]) &&
1736 !snapshot_entry_matches(ed, l1mfn, l1_table_offset(va)) )
1737 return 1;
1739 return 0;
1742 #define GPFN_TO_GPTEPAGE(_gpfn) ((_gpfn) / (PAGE_SIZE / sizeof(l1_pgentry_t)))
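// With 4KB pages and 4-byte PTEs, PAGE_SIZE/sizeof(l1_pgentry_t) is 1024,
// so GPFN_TO_GPTEPAGE(gpfn) is gpfn/1024. This groups gpfns into runs of
// 1024 -- the number of PTEs one L1 page holds -- so that pages likely to
// be mapped by the same guest L1 table share a single prediction slot.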
1743 static inline unsigned long
1744 predict_writable_pte_page(struct domain *d, unsigned long gpfn)
1746 return __shadow_status(d, GPFN_TO_GPTEPAGE(gpfn), PGT_writable_pred);
1749 static inline void
1750 increase_writable_pte_prediction(struct domain *d, unsigned long gpfn, unsigned long prediction)
1752 unsigned long score = prediction & PGT_score_mask;
1753 int create = (score == 0);
1755 // saturating addition
1756 score = (score + (1u << PGT_score_shift)) & PGT_score_mask;
1757 score = score ? score : PGT_score_mask;
1759 prediction = (prediction & PGT_mfn_mask) | score;
1761 //printk("increase gpfn=%p pred=%p create=%d\n", gpfn, prediction, create);
1762 set_shadow_status(d, GPFN_TO_GPTEPAGE(gpfn), 0, prediction, PGT_writable_pred);
1764 if ( create )
1765 perfc_incr(writable_pte_predictions);
1768 static inline void
1769 decrease_writable_pte_prediction(struct domain *d, unsigned long gpfn, unsigned long prediction)
1771 unsigned long score = prediction & PGT_score_mask;
1772 ASSERT(score);
1774 // divide score by 2... We don't like bad predictions.
1775 //
1776 score = (score >> 1) & PGT_score_mask;
1778 prediction = (prediction & PGT_mfn_mask) | score;
1780 //printk("decrease gpfn=%p pred=%p score=%p\n", gpfn, prediction, score);
1782 if ( score )
1783 set_shadow_status(d, GPFN_TO_GPTEPAGE(gpfn), 0, prediction, PGT_writable_pred);
1784 else
1786 delete_shadow_status(d, GPFN_TO_GPTEPAGE(gpfn), 0, PGT_writable_pred);
1787 perfc_decr(writable_pte_predictions);
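/*
 * Score dynamics, for example: each successful fast-path fixup bumps the
 * score by (1u << PGT_score_shift), saturating at PGT_score_mask; a
 * prediction that fails to find the writable mapping has its score halved,
 * and once the score reaches zero the prediction is deleted from the
 * shadow hash altogether.
 */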
1791 static u32 remove_all_write_access_in_ptpage(
1792 struct domain *d, unsigned long pt_pfn, unsigned long pt_mfn,
1793 unsigned long readonly_gpfn, unsigned long readonly_gmfn,
1794 u32 max_refs_to_find, unsigned long prediction)
1796 unsigned long *pt = map_domain_mem(pt_mfn << PAGE_SHIFT);
1797 unsigned long match =
1798 (readonly_gmfn << PAGE_SHIFT) | _PAGE_RW | _PAGE_PRESENT;
1799 unsigned long mask = PAGE_MASK | _PAGE_RW | _PAGE_PRESENT;
1800 int i;
1801 u32 found = 0;
1802 int is_l1_shadow =
1803 ((frame_table[pt_mfn].u.inuse.type_info & PGT_type_mask) ==
1804 PGT_l1_shadow);
1806 #define MATCH_ENTRY(_i) (((pt[_i] ^ match) & mask) == 0)
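// MATCH_ENTRY(i) is true iff pt[i] agrees with 'match' in every bit covered
// by 'mask', i.e. the entry is a present, writable mapping of readonly_gmfn;
// flag bits outside the mask (accessed, dirty, etc.) are ignored.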
1808 // returns true if all refs have been found and fixed.
1809 //
1810 int fix_entry(int i)
1812 unsigned long old = pt[i];
1813 unsigned long new = old & ~_PAGE_RW;
1815 if ( is_l1_shadow && !shadow_get_page_from_l1e(mk_l1_pgentry(new), d) )
1816 BUG();
1817 found++;
1818 pt[i] = new;
1819 if ( is_l1_shadow )
1820 put_page_from_l1e(mk_l1_pgentry(old), d);
1822 #if 0
1823 printk("removed write access to pfn=%p mfn=%p in smfn=%p entry %x "
1824 "is_l1_shadow=%d\n",
1825 readonly_gpfn, readonly_gmfn, pt_mfn, i, is_l1_shadow);
1826 #endif
1828 return (found == max_refs_to_find);
1831 if ( MATCH_ENTRY(readonly_gpfn & (L1_PAGETABLE_ENTRIES - 1)) &&
1832 fix_entry(readonly_gpfn & (L1_PAGETABLE_ENTRIES - 1)) )
1834 perfc_incrc(remove_write_fast_exit);
1835 increase_writable_pte_prediction(d, readonly_gpfn, prediction);
1836 unmap_domain_mem(pt);
1837 return found;
1840 for (i = 0; i < L1_PAGETABLE_ENTRIES; i++)
1842 if ( unlikely(MATCH_ENTRY(i)) && fix_entry(i) )
1843 break;
1846 unmap_domain_mem(pt);
1848 return found;
1849 #undef MATCH_ENTRY
1852 int shadow_remove_all_write_access(
1853 struct domain *d, unsigned long readonly_gpfn, unsigned long readonly_gmfn)
1855 int i;
1856 struct shadow_status *a;
1857 u32 found = 0, fixups, write_refs;
1858 unsigned long prediction, predicted_gpfn, predicted_smfn;
1860 ASSERT(spin_is_locked(&d->arch.shadow_lock));
1861 ASSERT(VALID_MFN(readonly_gmfn));
1863 perfc_incrc(remove_write_access);
1865 // If it's not a writable page, then no writable refs can be outstanding.
1866 //
1867 if ( (frame_table[readonly_gmfn].u.inuse.type_info & PGT_type_mask) !=
1868 PGT_writable_page )
1870 perfc_incrc(remove_write_not_writable);
1871 return 1;
1874 // How many outstanding writable PTEs for this page are there?
1875 //
1876 write_refs = (frame_table[readonly_gmfn].u.inuse.type_info & PGT_count_mask);
1877 if ( write_refs && (frame_table[readonly_gmfn].u.inuse.type_info & PGT_pinned) )
1878 write_refs--;
1880 if ( write_refs == 0 )
1882 perfc_incrc(remove_write_no_work);
1883 return 1;
1886 // Before searching all the L1 page tables, check the typical culprit first.
1887 //
1888 if ( (prediction = predict_writable_pte_page(d, readonly_gpfn)) )
1890 predicted_gpfn = prediction & PGT_mfn_mask;
1891 if ( (predicted_smfn = __shadow_status(d, predicted_gpfn, PGT_l1_shadow)) &&
1892 (fixups = remove_all_write_access_in_ptpage(d, predicted_gpfn, predicted_smfn, readonly_gpfn, readonly_gmfn, write_refs, prediction)) )
1894 found += fixups;
1895 if ( found == write_refs )
1897 perfc_incrc(remove_write_predicted);
1898 return 1;
1901 else
1903 perfc_incrc(remove_write_bad_prediction);
1904 decrease_writable_pte_prediction(d, readonly_gpfn, prediction);
1908 // Search all the shadow L1 page tables...
1909 //
1910 for (i = 0; i < shadow_ht_buckets; i++)
1912 a = &d->arch.shadow_ht[i];
1913 while ( a && a->gpfn_and_flags )
1915 if ( (a->gpfn_and_flags & PGT_type_mask) == PGT_l1_shadow )
1917 found += remove_all_write_access_in_ptpage(d, a->gpfn_and_flags & PGT_mfn_mask, a->smfn, readonly_gpfn, readonly_gmfn, write_refs - found, a->gpfn_and_flags & PGT_mfn_mask);
1918 if ( found == write_refs )
1919 return 1;
1922 a = a->next;
1926 FSH_LOG("%s: looking for %d refs, found %d refs\n",
1927 __func__, write_refs, found);
1929 return 0;
1932 static u32 remove_all_access_in_page(
1933 struct domain *d, unsigned long l1mfn, unsigned long forbidden_gmfn)
1935 unsigned long *pl1e = map_domain_mem(l1mfn << PAGE_SHIFT);
1936 unsigned long match = (forbidden_gmfn << PAGE_SHIFT) | _PAGE_PRESENT;
1937 unsigned long mask = PAGE_MASK | _PAGE_PRESENT;
1938 int i;
1939 u32 count = 0;
1940 int is_l1_shadow =
1941 ((frame_table[l1mfn].u.inuse.type_info & PGT_type_mask) ==
1942 PGT_l1_shadow);
1944 for (i = 0; i < L1_PAGETABLE_ENTRIES; i++)
1946 if ( unlikely(((pl1e[i] ^ match) & mask) == 0) )
1948 unsigned long ol2e = pl1e[i];
1949 pl1e[i] = 0;
1950 count++;
1952 if ( is_l1_shadow )
1953 put_page_from_l1e(mk_l1_pgentry(ol2e), d);
1954 else /* must be an hl2 page */
1955 put_page(&frame_table[forbidden_gmfn]);
1959 unmap_domain_mem(pl1e);
1961 return count;
1964 u32 shadow_remove_all_access(struct domain *d, unsigned long forbidden_gmfn)
1966 int i;
1967 struct shadow_status *a;
1968 u32 count = 0;
1970 ASSERT(spin_is_locked(&d->arch.shadow_lock));
1972 for (i = 0; i < shadow_ht_buckets; i++)
1974 a = &d->arch.shadow_ht[i];
1975 while ( a && a->gpfn_and_flags )
1977 switch (a->gpfn_and_flags & PGT_type_mask)
1979 case PGT_l1_shadow:
1980 case PGT_l2_shadow:
1981 case PGT_l3_shadow:
1982 case PGT_l4_shadow:
1983 case PGT_hl2_shadow:
1984 count += remove_all_access_in_page(d, a->smfn, forbidden_gmfn);
1985 break;
1986 case PGT_snapshot:
1987 case PGT_writable_pred:
1988 // these can't hold refs to the forbidden page
1989 break;
1990 default:
1991 BUG();
1994 a = a->next;
1998 return count;
2001 static int resync_all(struct domain *d, u32 stype)
2003 struct out_of_sync_entry *entry;
2004 unsigned i;
2005 unsigned long smfn;
2006 unsigned long *guest, *shadow, *snapshot;
2007 int need_flush = 0, external = shadow_mode_external(d);
2008 int unshadow;
2009 unsigned long min_max;
2010 int min, max;
2012 ASSERT(spin_is_locked(&d->arch.shadow_lock));
2014 for ( entry = d->arch.out_of_sync; entry; entry = entry->next)
2016 if ( entry->snapshot_mfn == SHADOW_SNAPSHOT_ELSEWHERE )
2017 continue;
2019 if ( !(smfn = __shadow_status(d, entry->gpfn, stype)) )
2020 continue;
2022 FSH_LOG("resyncing t=%p gpfn=%p gmfn=%p smfn=%p snapshot_mfn=%p",
2023 stype, entry->gpfn, entry->gmfn, smfn, entry->snapshot_mfn);
2025 // Compare guest's new contents to its snapshot, validating
2026 // and updating its shadow as appropriate.
2027 //
2028 guest = map_domain_mem(entry->gmfn << PAGE_SHIFT);
2029 snapshot = map_domain_mem(entry->snapshot_mfn << PAGE_SHIFT);
2030 shadow = map_domain_mem(smfn << PAGE_SHIFT);
2031 unshadow = 0;
2033 switch ( stype ) {
2034 case PGT_l1_shadow:
2035 min_max = pfn_to_page(smfn)->tlbflush_timestamp;
2036 min = SHADOW_MIN(min_max);
2037 max = SHADOW_MAX(min_max);
2038 for ( i = min; i <= max; i++ )
2040 unsigned new_pte = guest[i];
2041 if ( new_pte != snapshot[i] )
2043 need_flush |= validate_pte_change(d, new_pte, &shadow[i]);
2045 // can't update snapshots of linear page tables -- they
2046 // are used multiple times...
2047 //
2048 // snapshot[i] = new_pte;
2051 break;
2052 case PGT_l2_shadow:
2053 max = -1;
2054 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
2056 if ( !is_guest_l2_slot(i) && !external )
2057 continue;
2059 unsigned new_pde = guest[i];
2060 if ( new_pde != snapshot[i] )
2062 need_flush |= validate_pde_change(d, new_pde, &shadow[i]);
2064 // can't update snapshots of linear page tables -- they
2065 // are used multiple times...
2066 //
2067 // snapshot[i] = new_pde;
2069 if ( new_pde != 0 )
2070 max = i;
2072 // XXX - This hack works for linux guests.
2073 // Need a better solution long term.
2074 if ( !(new_pde & _PAGE_PRESENT) && unlikely(new_pde != 0) &&
2075 !unshadow &&
2076 (frame_table[smfn].u.inuse.type_info & PGT_pinned) )
2077 unshadow = 1;
2079 if ( max == -1 )
2080 unshadow = 1;
2081 break;
2082 default:
2083 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
2085 if ( !is_guest_l2_slot(i) && !external )
2086 continue;
2088 unsigned new_pde = guest[i];
2089 if ( new_pde != snapshot[i] )
2091 need_flush |= validate_hl2e_change(d, new_pde, &shadow[i]);
2093 // can't update snapshots of linear page tables -- they
2094 // are used multiple times...
2095 //
2096 // snapshot[i] = new_pde;
2099 break;
2102 unmap_domain_mem(shadow);
2103 unmap_domain_mem(snapshot);
2104 unmap_domain_mem(guest);
2106 if ( unlikely(unshadow) )
2108 perfc_incrc(unshadow_l2_count);
2109 shadow_unpin(smfn);
2113 return need_flush;
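//
// resync_all() handles one shadow type per call. For each out-of-sync entry
// it diffs the guest page against its snapshot and pushes only the changed
// slots through validate_*_change(), so untouched entries cost nothing. For
// L1 shadows the scan is narrowed further to the window that SHADOW_MIN()/
// SHADOW_MAX() unpack from the shadow page's tlbflush_timestamp field. For
// L2 shadows, a table that ends up with no present guest entries (max == -1),
// or whose pinned shadow picks up a non-zero but not-present entry, is
// unshadowed via the shadow_unpin(smfn) call above.
//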
2116 void __shadow_sync_all(struct domain *d)
2118 struct out_of_sync_entry *entry;
2119 int need_flush = 0;
2121 perfc_incrc(shadow_sync_all);
2123 ASSERT(spin_is_locked(&d->arch.shadow_lock));
2125 // First, remove all write permissions to the page tables
2126 //
2127 for ( entry = d->arch.out_of_sync; entry; entry = entry->next)
2129 // Skip entries that have low bits set... Those aren't
2130 // real PTEs.
2131 //
2132 if ( entry->writable_pl1e & (sizeof(l1_pgentry_t)-1) )
2133 continue;
2135 unsigned long *ppte = map_domain_mem(entry->writable_pl1e);
2136 unsigned long opte = *ppte;
2137 unsigned long npte = opte & ~_PAGE_RW;
2139 if ( (npte & _PAGE_PRESENT) &&
2140 !shadow_get_page_from_l1e(mk_l1_pgentry(npte), d) )
2141 BUG();
2142 *ppte = npte;
2143 put_page_from_l1e(mk_l1_pgentry(opte), d);
2145 unmap_domain_mem(ppte);
2148 // XXX mafetter: SMP perf bug.
2149 //
2150 // With the current algorithm, we've gotta flush all the TLBs
2151 // before we can safely continue. I don't think we want to
2152 // do it this way, so I think we should consider making
2153 // entirely private copies of the shadow for each vcpu, and/or
2154 // possibly having a mix of private and shared shadow state
2155 // (any path from a PTE that grants write access to an out-of-sync
2156 // page table page needs to be vcpu private).
2157 //
2158 flush_tlb_all();
2160 // Second, resync all L1 pages, then L2 pages, etc...
2161 //
2162 need_flush |= resync_all(d, PGT_l1_shadow);
2163 if ( shadow_mode_translate(d) )
2164 need_flush |= resync_all(d, PGT_hl2_shadow);
2165 need_flush |= resync_all(d, PGT_l2_shadow);
2167 if ( need_flush )
2168 local_flush_tlb();
2170 free_out_of_sync_state(d);
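//
// __shadow_sync_all() works in two phases. First, every writable_pl1e
// recorded for the out-of-sync pages has its _PAGE_RW bit stripped, taking a
// reference for the new read-only entry and dropping the one held by the old
// writable entry; entries with low bits set in writable_pl1e are markers, not
// real PTE pointers, and are skipped. Then, after the global TLB flush, the
// L1, hl2 and L2 shadows are resynced in that order, presumably so the
// lower-level shadows are already consistent by the time the L2 entries that
// point at them are revalidated.
//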
2173 int shadow_fault(unsigned long va, struct xen_regs *regs)
2175 unsigned long gpte, spte = 0, orig_gpte;
2176 struct exec_domain *ed = current;
2177 struct domain *d = ed->domain;
2178 unsigned long gpde;
2180 SH_VVLOG("shadow_fault( va=%p, code=%lu )", va, regs->error_code );
2181 perfc_incrc(shadow_fault_calls);
2183 check_pagetable(ed, "pre-sf");
2185 /*
2186 * Don't let someone else take the guest's table pages out-of-sync.
2187 */
2188 shadow_lock(d);
2190 /*
2191 * STEP 1. Make sure the shadow entries for this VA are in sync with the
2192 * guest: if the page table page involved has gone out of sync, resync it
2193 * now so the guest PTE can safely be read via linear_pg_table below.
2194 */
2195 __shadow_sync_va(ed, va);
2197 /*
2198 * STEP 2. Check the guest PTE.
2199 */
2200 __guest_get_l2e(ed, va, &gpde);
2201 if ( unlikely(!(gpde & _PAGE_PRESENT)) )
2203 SH_VVLOG("shadow_fault - EXIT: L1 not present" );
2204 perfc_incrc(shadow_fault_bail_pde_not_present);
2205 shadow_unlock(d);
2206 return 0;
2209 // This can't fault because we hold the shadow lock and we've ensured that
2210 // the mapping is in-sync, so the check of the PDE's present bit, above,
2211 // covers this access.
2212 //
2213 orig_gpte = gpte = l1_pgentry_val(linear_pg_table[l1_linear_offset(va)]);
2214 if ( unlikely(!(gpte & _PAGE_PRESENT)) )
2216 SH_VVLOG("shadow_fault - EXIT: gpte not present (%lx)",gpte );
2217 perfc_incrc(shadow_fault_bail_pte_not_present);
2218 shadow_unlock(d);
2219 return 0;
2222 /* Write fault? */
2223 if ( regs->error_code & 2 )
2225 if ( unlikely(!(gpte & _PAGE_RW)) )
2227 /* Write fault on a read-only mapping. */
2228 SH_VVLOG("shadow_fault - EXIT: wr fault on RO page (%lx)", gpte);
2229 perfc_incrc(shadow_fault_bail_ro_mapping);
2230 shadow_unlock(d);
2231 return 0;
2234 if ( !l1pte_write_fault(ed, &gpte, &spte, va) )
2236 SH_VVLOG("shadow_fault - EXIT: l1pte_write_fault failed");
2237 perfc_incrc(write_fault_bail);
2238 shadow_unlock(d);
2239 return 0;
2242 else
2244 if ( !l1pte_read_fault(d, &gpte, &spte) )
2246 SH_VVLOG("shadow_fault - EXIT: l1pte_read_fault failed");
2247 perfc_incrc(read_fault_bail);
2248 shadow_unlock(d);
2249 return 0;
2253 /*
2254 * STEP 3. Write the modified shadow PTE and guest PTE back to the tables.
2255 */
2257 /* XXX Watch out for read-only L2 entries! (not used in Linux). */
2258 if ( unlikely(__put_user(gpte, (unsigned long *)
2259 &linear_pg_table[l1_linear_offset(va)])) )
2261 printk("shadow_fault(): crashing domain %d "
2262 "due to a read-only L2 page table (gpde=%p), va=%p\n",
2263 d->id, gpde, va);
2264 domain_crash();
2267 // if necessary, record the page table page as dirty
2268 if ( unlikely(shadow_mode_log_dirty(d)) && (orig_gpte != gpte) )
2269 mark_dirty(d, __gpfn_to_mfn(d, gpde >> PAGE_SHIFT));
2271 shadow_set_l1e(va, spte, 1);
2273 perfc_incrc(shadow_fault_fixed);
2274 d->arch.shadow_fault_count++;
2276 shadow_unlock(d);
2278 check_pagetable(ed, "post-sf");
2279 return EXCRET_fault_fixed;
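//
// shadow_fault() returns 0 when the fault is the guest's own problem (the
// guest PDE or PTE is not present, or a write hit a guest-read-only mapping)
// so it can be reflected back to the guest, and EXCRET_fault_fixed once the
// shadow L1 entry has been repaired and the instruction can simply be
// retried. The only error-code bit consulted above is bit 1 of the standard
// x86 page-fault error code:
//
//     bit 0 (1) -- fault was a protection violation on a present page
//     bit 1 (2) -- faulting access was a write
//     bit 2 (4) -- fault occurred in user mode
//
// which is why 'regs->error_code & 2' selects the write-fault path.
//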
2282 /*
2283 * What lives where in the 32-bit address space in the various shadow modes,
2284 * and what it uses to get/maintain that mapping.
2286 * SHADOW MODE:       none            enable           translate         external
2288 * 4KB things:
2289 * guest_vtable       lin_l2          mapped per gpdt  lin_l2 via hl2    mapped per gpdt
2290 * shadow_vtable      n/a             sh_lin_l2        sh_lin_l2         mapped per gpdt
2291 * hl2_vtable         n/a             n/a              lin_hl2 via hl2   mapped per gpdt
2292 * monitor_vtable     n/a             n/a              n/a               mapped once
2294 * 4MB things:
2295 * guest_linear       lin via gpdt    lin via gpdt     lin via hl2       lin via hl2
2296 * shadow_linear      n/a             sh_lin via spdt  sh_lin via spdt   sh_lin via spdt
2297 * monitor_linear     n/a             n/a              n/a               ???
2298 * perdomain          perdomain       perdomain        perdomain         perdomain
2299 * R/O M2P            R/O M2P         R/O M2P          n/a               n/a
2300 * R/W M2P            R/W M2P         R/W M2P          R/W M2P           R/W M2P
2301 * P2M                n/a             n/a              R/O M2P           R/O M2P
2303 * NB:
2304 * update_pagetables(), __update_pagetables(), shadow_mode_enable(),
2305 * shadow_l2_table(), shadow_hl2_table(), and alloc_monitor_pagetable()
2306 * all play a part in maintaining these mappings.
2307 */
2308 void __update_pagetables(struct exec_domain *ed)
2310 struct domain *d = ed->domain;
2311 unsigned long gmfn = pagetable_val(ed->arch.guest_table) >> PAGE_SHIFT;
2312 unsigned long gpfn = __mfn_to_gpfn(d, gmfn);
2313 unsigned long smfn, hl2mfn, old_smfn;
2315 int max_mode = ( shadow_mode_external(d) ? SHM_external
2316 : shadow_mode_translate(d) ? SHM_translate
2317 : shadow_mode_enabled(d) ? SHM_enable
2318 : 0 );
2320 ASSERT( ! IS_INVALID_M2P_ENTRY(gpfn) );
2321 ASSERT( max_mode );
2323 /*
2324 * arch.guest_vtable
2325 */
2326 if ( max_mode & (SHM_enable | SHM_external) )
2328 if ( likely(ed->arch.guest_vtable != NULL) )
2329 unmap_domain_mem(ed->arch.guest_vtable);
2330 ed->arch.guest_vtable = map_domain_mem(gmfn << PAGE_SHIFT);
2333 /*
2334 * arch.shadow_table
2335 */
2336 if ( unlikely(!(smfn = __shadow_status(d, gpfn, PGT_base_page_table))) )
2337 smfn = shadow_l2_table(d, gpfn, gmfn);
2338 if ( !get_shadow_ref(smfn) )
2339 BUG();
2340 old_smfn = pagetable_val(ed->arch.shadow_table) >> PAGE_SHIFT;
2341 ed->arch.shadow_table = mk_pagetable(smfn << PAGE_SHIFT);
2342 if ( old_smfn )
2343 put_shadow_ref(old_smfn);
2345 SH_VVLOG("__update_pagetables(gmfn=%p, smfn=%p)", gmfn, smfn);
2347 /*
2348 * arch.shadow_vtable
2349 */
2350 if ( max_mode == SHM_external )
2352 if ( ed->arch.shadow_vtable )
2353 unmap_domain_mem(ed->arch.shadow_vtable);
2354 ed->arch.shadow_vtable = map_domain_mem(smfn << PAGE_SHIFT);
2357 /*
2358 * arch.hl2_vtable
2359 */
2361 // if max_mode == SHM_translate, then the hl2 is already installed
2362 // correctly in its smfn, and there's nothing to do.
2363 //
2364 if ( max_mode == SHM_external )
2366 if ( unlikely(!(hl2mfn = __shadow_status(d, gpfn, PGT_hl2_shadow))) )
2367 hl2mfn = shadow_hl2_table(d, gpfn, gmfn, smfn);
2368 if ( !get_shadow_ref(hl2mfn) )
2369 BUG();
2371 if ( ed->arch.hl2_vtable )
2372 unmap_domain_mem(ed->arch.hl2_vtable);
2373 ed->arch.hl2_vtable = map_domain_mem(hl2mfn << PAGE_SHIFT);
2376 /*
2377 * fixup pointers in monitor table, as necessary
2378 */
2379 if ( max_mode == SHM_external )
2381 l2_pgentry_t *mpl2e = ed->arch.monitor_vtable;
2383 ASSERT( shadow_mode_translate(d) );
2385 BUG(); // ref counts for hl2mfn and smfn need to be maintained!
2387 mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)] =
2388 mk_l2_pgentry((hl2mfn << PAGE_SHIFT) | __PAGE_HYPERVISOR);
2390 mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
2391 mk_l2_pgentry((smfn << PAGE_SHIFT) | __PAGE_HYPERVISOR);
2393 // XXX - maybe this can be optimized somewhat??
2394 local_flush_tlb();
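//
// __update_pagetables() re-establishes, for the current vcpu, the mappings
// summarised in the comment block above it: guest_vtable, shadow_table (with
// the shadow reference moved from the old smfn to the new one) and, in
// external mode, shadow_vtable and hl2_vtable. The final fixup of the
// linear-pagetable slots in the monitor table still BUG()s first, because the
// reference counting for hl2mfn and smfn in those slots is flagged above as
// not yet maintained.
//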
2399 /************************************************************************/
2400 /************************************************************************/
2401 /************************************************************************/
2403 #if SHADOW_DEBUG
2405 // BUG: these are not SMP safe...
2406 static int sh_l2_present;
2407 static int sh_l1_present;
2408 char * sh_check_name;
2409 int shadow_status_noswap;
2411 #define v2m(adr) ({ \
2412 unsigned long _a = (unsigned long)(adr); \
2413 unsigned long _pte = l1_pgentry_val( \
2414 shadow_linear_pg_table[_a >> PAGE_SHIFT]); \
2415 unsigned long _pa = _pte & PAGE_MASK; \
2416 _pa | (_a & ~PAGE_MASK); \
2417 })
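//
// v2m() converts a hypervisor virtual address into a machine address by
// looking it up in shadow_linear_pg_table and re-attaching the offset within
// the page. The audit code below uses it so FAIL() can report where a bad
// entry actually lives in machine memory, e.g. v2m(pgpte) for the guest PTE
// being checked and v2m(pspte) for its shadow.
//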
2419 #define FAIL(_f, _a...) \
2420 do { \
2421 printk("XXX %s-FAIL (%d,%d,%d)" _f "\n" \
2422 "g=%08lx s=%08lx &g=%08lx &s=%08lx" \
2423 " v2m(&g)=%08lx v2m(&s)=%08lx ea=%08lx\n", \
2424 sh_check_name, level, l2_idx, l1_idx, ## _a , \
2425 gpte, spte, pgpte, pspte, \
2426 v2m(pgpte), v2m(pspte), \
2427 (l2_idx << L2_PAGETABLE_SHIFT) | \
2428 (l1_idx << L1_PAGETABLE_SHIFT)); \
2429 errors++; \
2430 } while ( 0 )
2432 static int check_pte(
2433 struct domain *d, unsigned long *pgpte, unsigned long *pspte,
2434 int level, int l2_idx, int l1_idx, int oos_ptes)
2436 unsigned gpte = *pgpte;
2437 unsigned spte = *pspte;
2438 unsigned long mask, gpfn, smfn, gmfn;
2439 int errors = 0;
2440 int page_table_page;
2442 if ( (spte == 0) || (spte == 0xdeadface) || (spte == 0x00000E00) )
2443 return errors; /* always safe */
2445 if ( !(spte & _PAGE_PRESENT) )
2446 FAIL("Non zero not present spte");
2448 if ( level == 2 ) sh_l2_present++;
2449 if ( level == 1 ) sh_l1_present++;
2451 if ( !(gpte & _PAGE_PRESENT) )
2452 FAIL("Guest not present yet shadow is");
2454 mask = ~(_PAGE_DIRTY|_PAGE_ACCESSED|_PAGE_RW|PAGE_MASK);
2456 if ( (spte & mask) != (gpte & mask) )
2457 FAIL("Corrupt?");
2459 if ( (level == 1) &&
2460 (spte & _PAGE_DIRTY ) && !(gpte & _PAGE_DIRTY) && !oos_ptes )
2461 FAIL("Dirty coherence");
2463 if ( (spte & _PAGE_ACCESSED ) && !(gpte & _PAGE_ACCESSED) && !oos_ptes )
2464 FAIL("Accessed coherence");
2466 smfn = spte >> PAGE_SHIFT;
2467 gpfn = gpte >> PAGE_SHIFT;
2468 gmfn = __gpfn_to_mfn(d, gpfn);
2470 if ( !VALID_MFN(gmfn) )
2471 FAIL("invalid gpfn=%p gpte=%p\n", __func__, gpfn, gpte);
2473 page_table_page = mfn_is_page_table(gmfn);
2475 if ( (spte & _PAGE_RW ) && !(gpte & _PAGE_RW) && !oos_ptes )
2477 printk("gpfn=%p gmfn=%p smfn=%p t=0x%08x page_table_page=%d "
2478 "oos_ptes=%d\n",
2479 gpfn, gmfn, smfn,
2480 frame_table[gmfn].u.inuse.type_info,
2481 page_table_page, oos_ptes);
2482 FAIL("RW coherence");
2485 if ( (level == 1) &&
2486 (spte & _PAGE_RW ) &&
2487 !((gpte & _PAGE_RW) && (gpte & _PAGE_DIRTY)) &&
2488 !oos_ptes )
2490 printk("gpfn=%p gmfn=%p smfn=%p t=0x%08x page_table_page=%d "
2491 "oos_ptes=%d\n",
2492 gpfn, gmfn, smfn,
2493 frame_table[gmfn].u.inuse.type_info,
2494 page_table_page, oos_ptes);
2495 FAIL("RW2 coherence");
2498 if ( gmfn == smfn )
2500 if ( level > 1 )
2501 FAIL("Linear map ???"); /* XXX this will fail on BSD */
2503 else
2505 if ( level < 2 )
2506 FAIL("Shadow in L1 entry?");
2508 if ( level == 2 )
2510 if ( __shadow_status(d, gpfn, PGT_l1_shadow) != smfn )
2511 FAIL("smfn problem gpfn=%p smfn=%p", gpfn,
2512 __shadow_status(d, gpfn, PGT_l1_shadow));
2514 else
2515 BUG(); // XXX -- not handled yet.
2518 return errors;
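//
// check_pte() is the core consistency predicate of the audit: a non-trivial
// shadow entry must be present, must agree with the guest entry on every
// attribute bit other than DIRTY/ACCESSED/RW (the frame numbers differ by
// design, since the shadow holds an mfn and the guest a gpfn), and is checked
// for dirty, accessed and writable coherence against the guest entry (relaxed
// when the PTE is known to be out of sync). If the shadow's target frame
// equals the guest's (smfn == gmfn) the entry must be an L1 entry; otherwise
// it must be an L2 entry whose target is exactly the L1 shadow recorded for
// that gpfn in the shadow hash table.
//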
2521 static int check_l1_table(
2522 struct domain *d, unsigned long gpfn,
2523 unsigned long gmfn, unsigned long smfn, unsigned l2_idx)
2525 int i;
2526 unsigned long *gpl1e, *spl1e;
2527 int errors = 0, oos_ptes = 0;
2529 // First check to see if this guest page is currently the active
2530 // PTWR page. If so, then we compare the (old) cached copy of the
2531 // guest page to the shadow, and not the currently writable (and
2532 // thus potentially out-of-sync) guest page.
2533 //
2534 if ( VM_ASSIST(d, VMASST_TYPE_writable_pagetables) )
2536 int cpu = current->processor;
2538 for ( i = 0; i < ARRAY_SIZE(ptwr_info->ptinfo); i++)
2540 if ( ptwr_info[cpu].ptinfo[i].l1va &&
2541 ((v2m(ptwr_info[cpu].ptinfo[i].pl1e) >> PAGE_SHIFT) == gmfn) )
2543 unsigned long old = gmfn;
2544 gmfn = (v2m(ptwr_info[cpu].ptinfo[i].page) >> PAGE_SHIFT);
2545 printk("hit1 ptwr_info[%d].ptinfo[%d].l1va, mfn=0x%08x, snapshot=0x%08x\n",
2546 cpu, i, old, gmfn);
2551 if ( page_out_of_sync(pfn_to_page(gmfn)) )
2553 gmfn = __shadow_status(d, gpfn, PGT_snapshot);
2554 oos_ptes = 1;
2555 ASSERT(gmfn);
2558 gpl1e = map_domain_mem(gmfn << PAGE_SHIFT);
2559 spl1e = map_domain_mem(smfn << PAGE_SHIFT);
2561 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
2562 errors += check_pte(d, &gpl1e[i], &spl1e[i], 1, l2_idx, i, oos_ptes);
2564 unmap_domain_mem(spl1e);
2565 unmap_domain_mem(gpl1e);
2567 return errors;
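//
// Two cases make the guest frame unsuitable for direct comparison in
// check_l1_table(): if it is currently an active writable-pagetable (PTWR)
// page, the cached copy in ptwr_info is audited instead of the live,
// possibly half-updated page; and if it is out of sync, the PGT_snapshot page
// recorded for it is audited instead, with oos_ptes relaxing the coherence
// checks in check_pte().
//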
2570 #define FAILPT(_f, _a...) \
2571 do { \
2572 printk("XXX FAIL %s-PT " _f "\n", sh_check_name, ## _a ); \
2573 errors++; \
2574 } while ( 0 )
2576 int check_l2_table(
2577 struct domain *d, unsigned long gmfn, unsigned long smfn, int oos_pdes)
2579 l2_pgentry_t *gpl2e = (l2_pgentry_t *)map_domain_mem(gmfn << PAGE_SHIFT);
2580 l2_pgentry_t *spl2e = (l2_pgentry_t *)map_domain_mem(smfn << PAGE_SHIFT);
2581 int i;
2582 int errors = 0;
2583 int limit;
2585 if ( !oos_pdes && (page_get_owner(pfn_to_page(gmfn)) != d) )
2586 FAILPT("domain doesn't own page");
2587 if ( oos_pdes && (page_get_owner(pfn_to_page(gmfn)) != NULL) )
2588 FAILPT("bogus owner for snapshot page");
2589 if ( page_get_owner(pfn_to_page(smfn)) != NULL )
2590 FAILPT("shadow page mfn=0x%08x is owned by someone, domid=%d",
2591 smfn, page_get_owner(pfn_to_page(smfn))->id);
2593 #if 0
2594 if ( memcmp(&spl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
2595 &gpl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
2596 ((SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT) -
2597 DOMAIN_ENTRIES_PER_L2_PAGETABLE) * sizeof(l2_pgentry_t)) )
2599 for ( i = DOMAIN_ENTRIES_PER_L2_PAGETABLE;
2600 i < (SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT);
2601 i++ )
2602 printk("+++ (%d) %p %p\n",i,
2603 l2_pgentry_val(gpl2e[i]), l2_pgentry_val(spl2e[i]));
2604 FAILPT("hypervisor entries inconsistent");
2607 if ( (l2_pgentry_val(spl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT]) !=
2608 l2_pgentry_val(gpl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT])) )
2609 FAILPT("hypervisor linear map inconsistent");
2610 #endif
2612 if ( !shadow_mode_external(d) &&
2613 (l2_pgentry_val(spl2e[SH_LINEAR_PT_VIRT_START >>
2614 L2_PAGETABLE_SHIFT]) !=
2615 ((smfn << PAGE_SHIFT) | __PAGE_HYPERVISOR)) )
2617 FAILPT("hypervisor shadow linear map inconsistent %p %p",
2618 l2_pgentry_val(spl2e[SH_LINEAR_PT_VIRT_START >>
2619 L2_PAGETABLE_SHIFT]),
2620 (smfn << PAGE_SHIFT) | __PAGE_HYPERVISOR);
2623 if ( !shadow_mode_external(d) &&
2624 (l2_pgentry_val(spl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT]) !=
2625 ((__pa(d->arch.mm_perdomain_pt) | __PAGE_HYPERVISOR))) )
2627 FAILPT("hypervisor per-domain map inconsistent saw %p, expected (va=%p) %p",
2628 l2_pgentry_val(spl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT]),
2629 d->arch.mm_perdomain_pt,
2630 (__pa(d->arch.mm_perdomain_pt) | __PAGE_HYPERVISOR));
2633 if ( shadow_mode_external(d) )
2634 limit = L2_PAGETABLE_ENTRIES;
2635 else
2636 limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE;
2638 /* Check the whole L2. */
2639 for ( i = 0; i < limit; i++ )
2640 errors += check_pte(d, &l2_pgentry_val(gpl2e[i]), &l2_pgentry_val(spl2e[i]), 2, i, 0, 0);
2642 unmap_domain_mem(spl2e);
2643 unmap_domain_mem(gpl2e);
2645 #if 1
2646 if ( errors )
2647 printk("check_l2_table returning %d errors\n", errors);
2648 #endif
2650 return errors;
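//
// Besides running every guest-visible slot (all of them when
// shadow_mode_external(d) is true) through check_pte(), check_l2_table()
// audits the Xen-private parts of a non-external shadow L2: the
// shadow-linear slot must point back at the shadow itself and the per-domain
// slot at mm_perdomain_pt; in external mode those live in the monitor table
// instead, so the checks are skipped here.
//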
2653 int _check_pagetable(struct exec_domain *ed, char *s)
2655 struct domain *d = ed->domain;
2656 pagetable_t pt = ed->arch.guest_table;
2657 unsigned long gptbase = pagetable_val(pt);
2658 unsigned long ptbase_pfn, smfn;
2659 unsigned long i;
2660 l2_pgentry_t *gpl2e, *spl2e;
2661 unsigned long ptbase_mfn = 0;
2662 int errors = 0, limit, oos_pdes = 0;
2664 _audit_domain(d, AUDIT_QUIET);
2665 shadow_lock(d);
2667 sh_check_name = s;
2668 SH_VVLOG("%s-PT Audit", s);
2669 sh_l2_present = sh_l1_present = 0;
2670 perfc_incrc(check_pagetable);
2672 ptbase_mfn = gptbase >> PAGE_SHIFT;
2673 ptbase_pfn = __mfn_to_gpfn(d, ptbase_mfn);
2675 if ( !(smfn = __shadow_status(d, ptbase_pfn, PGT_base_page_table)) )
2677 printk("%s-PT %p not shadowed\n", s, gptbase);
2678 errors++;
2679 goto out;
2681 if ( page_out_of_sync(pfn_to_page(ptbase_mfn)) )
2683 ptbase_mfn = __shadow_status(d, ptbase_pfn, PGT_snapshot);
2684 oos_pdes = 1;
2685 ASSERT(ptbase_mfn);
2688 errors += check_l2_table(d, ptbase_mfn, smfn, oos_pdes);
2690 gpl2e = (l2_pgentry_t *) map_domain_mem( ptbase_mfn << PAGE_SHIFT );
2691 spl2e = (l2_pgentry_t *) map_domain_mem( smfn << PAGE_SHIFT );
2693 /* Go back and recurse. */
2694 if ( shadow_mode_external(d) )
2695 limit = L2_PAGETABLE_ENTRIES;
2696 else
2697 limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE;
2699 for ( i = 0; i < limit; i++ )
2701 unsigned long gl1pfn = l2_pgentry_val(gpl2e[i]) >> PAGE_SHIFT;
2702 unsigned long gl1mfn = __gpfn_to_mfn(d, gl1pfn);
2703 unsigned long sl1mfn = l2_pgentry_val(spl2e[i]) >> PAGE_SHIFT;
2705 if ( l2_pgentry_val(spl2e[i]) != 0 )
2707 errors += check_l1_table(d, gl1pfn, gl1mfn, sl1mfn, i);
2711 unmap_domain_mem(spl2e);
2712 unmap_domain_mem(gpl2e);
2714 SH_VVLOG("PT verified : l2_present = %d, l1_present = %d",
2715 sh_l2_present, sh_l1_present);
2717 out:
2718 if ( errors )
2719 BUG();
2721 shadow_unlock(d);
2723 return errors;
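//
// _check_pagetable() audits only the currently installed guest table: it
// checks the L2 shadow as a whole and then recurses into check_l1_table() for
// every non-zero shadow L2 slot. Any error is fatal (BUG()), so the
// "pre-sf"/"post-sf" check_pagetable() calls in shadow_fault() are presumably
// compiled away unless SHADOW_DEBUG is enabled.
//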
2726 int _check_all_pagetables(struct exec_domain *ed, char *s)
2728 struct domain *d = ed->domain;
2729 int i;
2730 struct shadow_status *a;
2731 unsigned long gmfn;
2732 int errors = 0;
2734 shadow_status_noswap = 1;
2736 sh_check_name = s;
2737 SH_VVLOG("%s-PT Audit domid=%d", s, d->id);
2738 sh_l2_present = sh_l1_present = 0;
2739 perfc_incrc(check_all_pagetables);
2741 for (i = 0; i < shadow_ht_buckets; i++)
2743 a = &d->arch.shadow_ht[i];
2744 while ( a && a->gpfn_and_flags )
2746 gmfn = __gpfn_to_mfn(d, a->gpfn_and_flags & PGT_mfn_mask);
2748 switch ( a->gpfn_and_flags & PGT_type_mask )
2750 case PGT_l1_shadow:
2751 errors += check_l1_table(d, a->gpfn_and_flags & PGT_mfn_mask,
2752 gmfn, a->smfn, 0);
2753 break;
2754 case PGT_l2_shadow:
2755 errors += check_l2_table(d, gmfn, a->smfn,
2756 page_out_of_sync(pfn_to_page(gmfn)));
2757 break;
2758 case PGT_l3_shadow:
2759 case PGT_l4_shadow:
2760 case PGT_hl2_shadow:
2761 BUG(); // XXX - ought to fix this...
2762 break;
2763 case PGT_snapshot:
2764 case PGT_writable_ref:
2765 break;
2766 default:
2767 errors++;
2768 printk("unexpected shadow type %p, gpfn=%p, "
2769 "gmfn=%p smfn=%p\n",
2770 a->gpfn_and_flags & PGT_type_mask,
2771 a->gpfn_and_flags & PGT_mfn_mask,
2772 gmfn, a->smfn);
2773 BUG();
2775 a = a->next;
2779 shadow_status_noswap = 0;
2781 if ( errors )
2782 BUG();
2784 return errors;
2787 #endif // SHADOW_DEBUG
2789 /*
2790 * Local variables:
2791 * mode: C
2792 * c-set-style: "BSD"
2793 * c-basic-offset: 4
2794 * tab-width: 4
2795 * indent-tabs-mode: nil
2796 */