ia64/xen-unstable

view xen/arch/x86/shadow.c @ 6537:f36aee6f8902

Drop shadow lock on exit

Signed-off-by: Yunhong Jiang <yunhong.jiang@intel.com>
Signed-off-by: Arun Sharma <arun.sharma@intel.com>
author adsharma@los-vmm.sc.intel.com
date Wed Aug 17 11:22:31 2005 -0800 (2005-08-17)
parents c1bcea912992
children 84ee014ebd41
line source
1 /******************************************************************************
2 * arch/x86/shadow_64.c
3 *
4 * Copyright (c) 2005 Michael A Fetterman
5 * Based on an earlier implementation by Ian Pratt et al
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
21 /*
22 * Jun Nakajima <jun.nakajima@intel.com>
23 * Chengyuan Li <chengyuan.li@intel.com>
24 *
25 * Extended to support 64-bit guests.
26 */
28 #include <xen/config.h>
29 #include <xen/types.h>
30 #include <xen/mm.h>
31 #include <xen/domain_page.h>
32 #include <asm/shadow.h>
33 #include <asm/page.h>
34 #include <xen/event.h>
35 #include <xen/sched.h>
36 #include <xen/trace.h>
38 extern void free_shadow_pages(struct domain *d);
40 #if SHADOW_DEBUG
41 static void mark_shadows_as_reflecting_snapshot(struct domain *d, unsigned long gpfn);
42 #endif
44 #if CONFIG_PAGING_LEVELS == 3
45 #include <asm/shadow_64.h>
46 static unsigned long shadow_l3_table(
47 struct domain *d, unsigned long gpfn, unsigned long gmfn);
48 #endif
50 #if CONFIG_PAGING_LEVELS == 4
51 #include <asm/shadow_64.h>
52 static unsigned long shadow_l4_table(
53 struct domain *d, unsigned long gpfn, unsigned long gmfn);
54 static void shadow_map_into_current(struct vcpu *v,
55 unsigned long va, unsigned int from, unsigned int to);
56 #endif
58 /********
60 There's a per-domain shadow table spin lock which works fine for SMP
61 hosts. We don't have to worry about interrupts as no shadow operations
62 happen in an interrupt context. It's probably not quite ready for SMP
63 guest operation as we have to worry about synchronisation between gpte
64 and spte updates. It's possible that this might only happen in a
65 hypercall context, in which case we'll probably have a per-domain
66 hypercall lock anyhow (at least initially).
68 ********/
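/*
 * A minimal sketch of the usual pattern around this lock, as used by
 * shadow_invlpg_32() and shadow_fault_32() later in this file (the
 * condition name below is just a placeholder); note that every exit
 * path, including early bail-outs, must drop the lock:
 *
 *     shadow_lock(d);
 *     if ( bail_out_early )        // placeholder for a real check
 *     {
 *         shadow_unlock(d);        // drop the lock on this exit path too
 *         return;
 *     }
 *     ... do the shadow work ...
 *     shadow_unlock(d);
 */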
70 static inline int
71 shadow_promote(struct domain *d, unsigned long gpfn, unsigned long gmfn,
72 unsigned long new_type)
73 {
74 struct pfn_info *page = pfn_to_page(gmfn);
75 int pinned = 0, okay = 1;
77 if ( page_out_of_sync(page) )
78 {
79 // Don't know how long ago this snapshot was taken.
80 // Can't trust it to be recent enough.
81 //
82 __shadow_sync_mfn(d, gmfn);
83 }
85 if ( !shadow_mode_refcounts(d) )
86 return 1;
88 if ( unlikely(page_is_page_table(page)) )
89 return 1;
91 FSH_LOG("%s: gpfn=%lx gmfn=%lx nt=%08lx", __func__, gpfn, gmfn, new_type);
93 if ( !shadow_remove_all_write_access(d, gpfn, gmfn) )
94 {
95 FSH_LOG("%s: couldn't find/remove all write accesses, gpfn=%lx gmfn=%lx",
96 __func__, gpfn, gmfn);
97 #if 1 || defined(LIVE_DANGEROUSLY)
98 set_bit(_PGC_page_table, &page->count_info);
99 return 1;
100 #endif
101 return 0;
103 }
105 // To convert this page to use as a page table, the writable count
106 // should now be zero. Test this by grabbing the page as a page table,
107 // and then immediately releasing. This will also deal with any
108 // necessary TLB flushing issues for us.
109 //
110 // The cruft here about pinning doesn't really work right. This
111 // needs rethinking/rewriting... Need to gracefully deal with the
112 // TLB flushes required when promoting a writable page, and also deal
113 // with any outstanding (external) writable refs to this page (by
114 // refusing to promote it). The pinning headache complicates this
115 // code -- it would all get much simpler if we stop using
116 // shadow_lock() and move the shadow code to BIGLOCK().
117 //
118 if ( unlikely(!get_page(page, d)) )
119 BUG(); // XXX -- needs more thought for a graceful failure
120 if ( unlikely(test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info)) )
121 {
122 pinned = 1;
123 put_page_and_type(page);
124 }
125 if ( get_page_type(page, PGT_base_page_table) )
126 {
127 set_bit(_PGC_page_table, &page->count_info);
128 put_page_type(page);
129 }
130 else
131 {
132 printk("shadow_promote: get_page_type failed "
133 "dom%d gpfn=%lx gmfn=%lx t=%08lx\n",
134 d->domain_id, gpfn, gmfn, new_type);
135 okay = 0;
136 }
138 // Now put the type back to writable...
139 if ( unlikely(!get_page_type(page, PGT_writable_page)) )
140 BUG(); // XXX -- needs more thought for a graceful failure
141 if ( unlikely(pinned) )
142 {
143 if ( unlikely(test_and_set_bit(_PGT_pinned,
144 &page->u.inuse.type_info)) )
145 BUG(); // hmm... someone pinned this again?
146 }
147 else
148 put_page_and_type(page);
150 return okay;
151 }
154 /*
155 * Things in shadow mode that collect get_page() refs to the domain's
156 * pages are:
157 * - PGC_allocated takes a gen count, just like normal.
158 * - A writable page can be pinned (paravirtualized guests may consider
159 * these pages to be L1s or L2s, and don't know the difference).
160 * Pinning a page takes a gen count (but, for domains in shadow mode,
161 * it *doesn't* take a type count)
162 * - CR3 grabs a ref to whatever it points at, just like normal.
163 * - Shadow mode grabs an initial gen count for itself, as a placeholder
164 * for whatever references will exist.
165 * - Shadow PTEs that point to a page take a gen count, just like regular
166 * PTEs. However, they don't get a type count, as get_page_type() is
167 * hardwired to keep writable pages' counts at 1 for domains in shadow
168 * mode.
169 * - Whenever we shadow a page, the entry in the shadow hash grabs a
170 * general ref to the page.
171 * - Whenever a page goes out of sync, the out of sync entry grabs a
172 * general ref to the page.
173 */
174 /*
175 * pfn_info fields for pages allocated as shadow pages:
176 *
177 * All 32 bits of count_info are a simple count of refs to this shadow
178 * from a) other shadow pages, b) current CR3's (aka ed->arch.shadow_table),
179 * c) if it's a pinned shadow root pgtable, d) outstanding out-of-sync
180 * references.
181 *
182 * u.inuse._domain is left NULL, to prevent accidentally allowing some random
183 * domain from gaining permissions to map this page.
184 *
185 * u.inuse.type_info & PGT_type_mask remembers what kind of page is being
186 * shadowed.
187 * u.inuse.type_info & PGT_mfn_mask holds the mfn of the page being shadowed.
188 * u.inuse.type_info & PGT_pinned says that an extra reference to this shadow
189 * currently exists because this is a shadow of a root page, and we
190 * don't want to let those disappear just because no CR3 is currently pointing
191 * at it.
192 *
193 * tlbflush_timestamp holds a min & max index of valid page table entries
194 * within the shadow page.
195 */
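/*
 * A concrete illustration of the encodings described above, pieced
 * together from alloc_shadow_page(), shadow_map_l1_into_current_l2()
 * and shadow_make_snapshot() below:
 *
 *     page->u.inuse.type_info  = psh_type | gmfn;    // shadow type + backpointer
 *     page->count_info         = 0;                  // plain count of refs
 *     page->tlbflush_timestamp = SHADOW_ENCODE_MIN_MAX(min, max);
 *     ...
 *     min = SHADOW_MIN(page->tlbflush_timestamp);    // first valid entry
 *     max = SHADOW_MAX(page->tlbflush_timestamp);    // last valid entry
 */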
197 static inline unsigned long
198 alloc_shadow_page(struct domain *d,
199 unsigned long gpfn, unsigned long gmfn,
200 u32 psh_type)
201 {
202 struct pfn_info *page;
203 unsigned long smfn;
204 int pin = 0;
206 // Currently, we only keep pre-zero'ed pages around for use as L1's...
207 // This will change. Soon.
208 //
209 if ( psh_type == PGT_l1_shadow )
210 {
211 if ( !list_empty(&d->arch.free_shadow_frames) )
212 {
213 struct list_head *entry = d->arch.free_shadow_frames.next;
214 page = list_entry(entry, struct pfn_info, list);
215 list_del(entry);
216 perfc_decr(free_l1_pages);
217 }
218 else
219 {
220 page = alloc_domheap_page(NULL);
221 void *l1 = page ? map_domain_page(page_to_pfn(page)) : NULL;
222 if ( l1 != NULL ) memset(l1, 0, PAGE_SIZE);
223 if ( l1 != NULL ) unmap_domain_page(l1);
224 }
225 }
226 else {
227 page = alloc_domheap_page(NULL);
228 void *lp = page ? map_domain_page(page_to_pfn(page)) : NULL;
229 if ( lp != NULL ) memset(lp, 0, PAGE_SIZE);
230 if ( lp != NULL ) unmap_domain_page(lp);
232 }
233 if ( unlikely(page == NULL) )
234 {
235 printk("Couldn't alloc shadow page! dom%d count=%d\n",
236 d->domain_id, d->arch.shadow_page_count);
237 printk("Shadow table counts: l1=%d l2=%d hl2=%d snapshot=%d\n",
238 perfc_value(shadow_l1_pages),
239 perfc_value(shadow_l2_pages),
240 perfc_value(hl2_table_pages),
241 perfc_value(snapshot_pages));
242 BUG(); /* XXX FIXME: try a shadow flush to free up some memory. */
243 }
245 smfn = page_to_pfn(page);
247 ASSERT( (gmfn & ~PGT_mfn_mask) == 0 );
248 page->u.inuse.type_info = psh_type | gmfn;
249 page->count_info = 0;
250 page->tlbflush_timestamp = 0;
252 switch ( psh_type )
253 {
254 case PGT_l1_shadow:
255 if ( !shadow_promote(d, gpfn, gmfn, psh_type) )
256 goto fail;
257 perfc_incr(shadow_l1_pages);
258 d->arch.shadow_page_count++;
259 break;
261 case PGT_l2_shadow:
262 if ( !shadow_promote(d, gpfn, gmfn, psh_type) )
263 goto fail;
264 perfc_incr(shadow_l2_pages);
265 d->arch.shadow_page_count++;
266 if ( PGT_l2_page_table == PGT_root_page_table )
267 pin = 1;
269 break;
271 case PGT_l3_shadow:
272 if ( !shadow_promote(d, gpfn, gmfn, psh_type) )
273 goto fail;
274 perfc_incr(shadow_l3_pages);
275 d->arch.shadow_page_count++;
276 break;
278 case PGT_l4_shadow:
279 if ( !shadow_promote(d, gpfn, gmfn, psh_type) )
280 goto fail;
281 perfc_incr(shadow_l4_pages);
282 d->arch.shadow_page_count++;
283 if ( PGT_l4_page_table == PGT_root_page_table )
284 pin = 1;
285 break;
287 #if CONFIG_PAGING_LEVELS >= 4
288 case PGT_fl1_shadow:
289 perfc_incr(shadow_l1_pages);
290 d->arch.shadow_page_count++;
291 break;
292 #else
294 case PGT_hl2_shadow:
295 // Treat an hl2 as an L1 for purposes of promotion.
296 // For external mode domains, treat them as an L2 for purposes of
297 // pinning.
298 //
299 if ( !shadow_promote(d, gpfn, gmfn, PGT_l1_shadow) )
300 goto fail;
301 perfc_incr(hl2_table_pages);
302 d->arch.hl2_page_count++;
303 if ( shadow_mode_external(d) &&
304 (PGT_l2_page_table == PGT_root_page_table) )
305 pin = 1;
307 break;
308 #endif
309 case PGT_snapshot:
310 perfc_incr(snapshot_pages);
311 d->arch.snapshot_page_count++;
312 break;
314 default:
315 printk("Alloc shadow weird page type type=%08x\n", psh_type);
316 BUG();
317 break;
318 }
320 // Don't add a new shadow of something that already has a snapshot.
321 //
322 ASSERT( (psh_type == PGT_snapshot) || !mfn_out_of_sync(gmfn) );
324 set_shadow_status(d, gpfn, gmfn, smfn, psh_type);
326 if ( pin )
327 shadow_pin(smfn);
329 return smfn;
331 fail:
332 FSH_LOG("promotion of pfn=%lx mfn=%lx failed! external gnttab refs?",
333 gpfn, gmfn);
334 free_domheap_page(page);
335 return 0;
336 }
338 #if CONFIG_PAGING_LEVELS == 2
339 static unsigned long
340 shadow_hl2_table(struct domain *d, unsigned long gpfn, unsigned long gmfn,
341 unsigned long smfn)
342 {
343 unsigned long hl2mfn;
344 l1_pgentry_t *hl2;
345 int limit;
347 ASSERT(PGT_base_page_table == PGT_l2_page_table);
349 if ( unlikely(!(hl2mfn = alloc_shadow_page(d, gpfn, gmfn, PGT_hl2_shadow))) )
350 {
351 printk("Couldn't alloc an HL2 shadow for pfn=%lx mfn=%lx\n",
352 gpfn, gmfn);
353 BUG(); /* XXX Deal gracefully with failure. */
354 }
356 SH_VVLOG("shadow_hl2_table(gpfn=%lx, gmfn=%lx, smfn=%lx) => %lx",
357 gpfn, gmfn, smfn, hl2mfn);
358 perfc_incrc(shadow_hl2_table_count);
360 hl2 = map_domain_page(hl2mfn);
362 #ifdef __i386__
363 if ( shadow_mode_external(d) )
364 limit = L2_PAGETABLE_ENTRIES;
365 else
366 limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE;
367 #else
368 limit = 0; /* XXX x86/64 XXX */
369 #endif
371 memset(hl2, 0, limit * sizeof(l1_pgentry_t));
373 if ( !shadow_mode_external(d) )
374 {
375 memset(&hl2[DOMAIN_ENTRIES_PER_L2_PAGETABLE], 0,
376 HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
378 // Setup easy access to the GL2, SL2, and HL2 frames.
379 //
380 hl2[l2_table_offset(LINEAR_PT_VIRT_START)] =
381 l1e_from_pfn(gmfn, __PAGE_HYPERVISOR);
382 hl2[l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
383 l1e_from_pfn(smfn, __PAGE_HYPERVISOR);
384 hl2[l2_table_offset(PERDOMAIN_VIRT_START)] =
385 l1e_from_pfn(hl2mfn, __PAGE_HYPERVISOR);
386 }
388 unmap_domain_page(hl2);
390 return hl2mfn;
391 }
393 /*
394 * This could take and use a snapshot, and validate the entire page at
395 * once, or it could continue to fault in entries one at a time...
396 * Might be worth investigating...
397 */
398 static unsigned long shadow_l2_table(
399 struct domain *d, unsigned long gpfn, unsigned long gmfn)
400 {
401 unsigned long smfn;
402 l2_pgentry_t *spl2e;
404 SH_VVLOG("shadow_l2_table(gpfn=%lx, gmfn=%lx)", gpfn, gmfn);
406 perfc_incrc(shadow_l2_table_count);
408 if ( unlikely(!(smfn = alloc_shadow_page(d, gpfn, gmfn, PGT_l2_shadow))) )
409 {
410 printk("Couldn't alloc an L2 shadow for pfn=%lx mfn=%lx\n",
411 gpfn, gmfn);
412 BUG(); /* XXX Deal gracefully with failure. */
413 }
415 spl2e = (l2_pgentry_t *)map_domain_page(smfn);
417 /* Install hypervisor and 2x linear p.t. mappings. */
418 if ( (PGT_base_page_table == PGT_l2_page_table) &&
419 !shadow_mode_external(d) )
420 {
421 /*
422 * We could proactively fill in PDEs for pages that are already
423 * shadowed *and* where the guest PDE has _PAGE_ACCESSED set
424 * (restriction required for coherence of the accessed bit). However,
425 * we tried it and it didn't help performance. This is simpler.
426 */
427 memset(spl2e, 0, DOMAIN_ENTRIES_PER_L2_PAGETABLE*sizeof(l2_pgentry_t));
429 /* Install hypervisor and 2x linear p.t. mappings. */
430 memcpy(&spl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
431 &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
432 HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
434 spl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
435 l2e_from_pfn(smfn, __PAGE_HYPERVISOR);
437 spl2e[l2_table_offset(PERDOMAIN_VIRT_START)] =
438 l2e_from_paddr(__pa(page_get_owner(&frame_table[gmfn])->arch.mm_perdomain_pt),
439 __PAGE_HYPERVISOR);
441 if ( shadow_mode_translate(d) ) // NB: not external
442 {
443 unsigned long hl2mfn;
445 spl2e[l2_table_offset(RO_MPT_VIRT_START)] =
446 l2e_from_paddr(pagetable_get_paddr(d->arch.phys_table),
447 __PAGE_HYPERVISOR);
449 if ( unlikely(!(hl2mfn = __shadow_status(d, gpfn, PGT_hl2_shadow))) )
450 hl2mfn = shadow_hl2_table(d, gpfn, gmfn, smfn);
452 // shadow_mode_translate (but not external) sl2 tables hold a
453 // ref to their hl2.
454 //
455 if ( !get_shadow_ref(hl2mfn) )
456 BUG();
458 spl2e[l2_table_offset(LINEAR_PT_VIRT_START)] =
459 l2e_from_pfn(hl2mfn, __PAGE_HYPERVISOR);
460 }
461 else
462 spl2e[l2_table_offset(LINEAR_PT_VIRT_START)] =
463 l2e_from_pfn(gmfn, __PAGE_HYPERVISOR);
464 }
465 else
466 {
467 memset(spl2e, 0, L2_PAGETABLE_ENTRIES*sizeof(l2_pgentry_t));
468 }
470 unmap_domain_page(spl2e);
472 SH_VLOG("shadow_l2_table(%lx -> %lx)", gmfn, smfn);
473 return smfn;
474 }
475 #endif
477 static void shadow_map_l1_into_current_l2(unsigned long va)
478 {
479 struct vcpu *v = current;
480 struct domain *d = v->domain;
481 l1_pgentry_t *gpl1e, *spl1e;
482 l2_pgentry_t gl2e, sl2e;
483 unsigned long gl1pfn, gl1mfn, sl1mfn;
484 int i, init_table = 0;
486 __guest_get_l2e(v, va, &gl2e);
487 ASSERT(l2e_get_flags(gl2e) & _PAGE_PRESENT);
488 gl1pfn = l2e_get_pfn(gl2e);
490 if ( !(sl1mfn = __shadow_status(d, gl1pfn, PGT_l1_shadow)) )
491 {
492 /* This L1 is NOT already shadowed so we need to shadow it. */
493 SH_VVLOG("4a: l1 not shadowed");
495 gl1mfn = __gpfn_to_mfn(d, gl1pfn);
496 if ( unlikely(!VALID_MFN(gl1mfn)) )
497 {
498 // Attempt to use an invalid pfn as an L1 page.
499 // XXX this needs to be more graceful!
500 BUG();
501 }
503 if ( unlikely(!(sl1mfn =
504 alloc_shadow_page(d, gl1pfn, gl1mfn, PGT_l1_shadow))) )
505 {
506 printk("Couldn't alloc an L1 shadow for pfn=%lx mfn=%lx\n",
507 gl1pfn, gl1mfn);
508 BUG(); /* XXX Need to deal gracefully with failure. */
509 }
511 perfc_incrc(shadow_l1_table_count);
512 init_table = 1;
513 }
514 else
515 {
516 /* This L1 is shadowed already, but the L2 entry is missing. */
517 SH_VVLOG("4b: was shadowed, l2 missing (%lx)", sl1mfn);
518 }
520 #ifndef NDEBUG
521 l2_pgentry_t old_sl2e;
522 __shadow_get_l2e(v, va, &old_sl2e);
523 ASSERT( !(l2e_get_flags(old_sl2e) & _PAGE_PRESENT) );
524 #endif
526 if ( !get_shadow_ref(sl1mfn) )
527 BUG();
528 l2pde_general(d, &gl2e, &sl2e, sl1mfn);
529 __guest_set_l2e(v, va, &gl2e);
530 __shadow_set_l2e(v, va, &sl2e);
532 if ( init_table )
533 {
534 l1_pgentry_t sl1e;
535 int index = l1_table_offset(va);
536 int min = 1, max = 0;
538 unsigned long entries, pt_va;
539 l1_pgentry_t tmp_sl1e;
540 l1_pgentry_t tmp_gl1e; // Prepare for double compile
543 entries = PAGE_SIZE / sizeof(l1_pgentry_t);
544 pt_va = ((va >> L1_PAGETABLE_SHIFT) & ~(entries - 1)) << L1_PAGETABLE_SHIFT;
545 gpl1e = (l1_pgentry_t *) __guest_get_l1e(v, pt_va, &tmp_gl1e);
547 entries = PAGE_SIZE / sizeof(l1_pgentry_t);
548 pt_va = ((va >> L1_PAGETABLE_SHIFT) & ~(entries - 1)) << L1_PAGETABLE_SHIFT;
549 spl1e = (l1_pgentry_t *) __shadow_get_l1e(v, pt_va, &tmp_sl1e);
551 /*
552 gpl1e = &(linear_pg_table[l1_linear_offset(va) &
553 ~(L1_PAGETABLE_ENTRIES-1)]);
555 spl1e = &(shadow_linear_pg_table[l1_linear_offset(va) &
556 ~(L1_PAGETABLE_ENTRIES-1)]);*/
558 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
559 {
560 l1pte_propagate_from_guest(d, gpl1e[i], &sl1e);
561 if ( (l1e_get_flags(sl1e) & _PAGE_PRESENT) &&
562 unlikely(!shadow_get_page_from_l1e(sl1e, d)) )
563 sl1e = l1e_empty();
564 if ( l1e_get_flags(sl1e) == 0 )
565 {
566 // First copy entries from 0 until first invalid.
567 // Then copy entries from index until first invalid.
568 //
569 if ( i < index ) {
570 i = index - 1;
571 continue;
572 }
573 break;
574 }
575 spl1e[i] = sl1e;
576 if ( unlikely(i < min) )
577 min = i;
578 if ( likely(i > max) )
579 max = i;
580 }
582 frame_table[sl1mfn].tlbflush_timestamp =
583 SHADOW_ENCODE_MIN_MAX(min, max);
584 }
585 }
587 static void
588 shadow_set_l1e(unsigned long va, l1_pgentry_t new_spte, int create_l1_shadow)
589 {
590 struct vcpu *v = current;
591 struct domain *d = v->domain;
592 l2_pgentry_t sl2e;
594 __shadow_get_l2e(v, va, &sl2e);
595 if ( !(l2e_get_flags(sl2e) & _PAGE_PRESENT) )
596 {
597 /*
598 * Either the L1 is not shadowed, or the shadow isn't linked into
599 * the current shadow L2.
600 */
601 if ( create_l1_shadow )
602 {
603 perfc_incrc(shadow_set_l1e_force_map);
604 shadow_map_l1_into_current_l2(va);
605 }
606 else /* check to see if it exists; if so, link it in */
607 {
608 l2_pgentry_t gpde = linear_l2_table(v)[l2_table_offset(va)];
609 unsigned long gl1pfn = l2e_get_pfn(gpde);
610 unsigned long sl1mfn = __shadow_status(d, gl1pfn, PGT_l1_shadow);
612 ASSERT( l2e_get_flags(gpde) & _PAGE_PRESENT );
614 if ( sl1mfn )
615 {
616 perfc_incrc(shadow_set_l1e_unlinked);
617 if ( !get_shadow_ref(sl1mfn) )
618 BUG();
619 l2pde_general(d, &gpde, &sl2e, sl1mfn);
620 __guest_set_l2e(v, va, &gpde);
621 __shadow_set_l2e(v, va, &sl2e);
622 }
623 else
624 {
625 // no shadow exists, so there's nothing to do.
626 perfc_incrc(shadow_set_l1e_fail);
627 return;
628 }
629 }
630 }
632 if ( shadow_mode_refcounts(d) )
633 {
634 l1_pgentry_t old_spte;
635 __shadow_get_l1e(v, va, &old_spte);
637 // only do the ref counting if something important changed.
638 //
639 if ( l1e_has_changed(old_spte, new_spte, _PAGE_RW | _PAGE_PRESENT) )
640 {
641 if ( (l1e_get_flags(new_spte) & _PAGE_PRESENT) &&
642 !shadow_get_page_from_l1e(new_spte, d) )
643 new_spte = l1e_empty();
644 if ( l1e_get_flags(old_spte) & _PAGE_PRESENT )
645 shadow_put_page_from_l1e(old_spte, d);
646 }
647 }
649 __shadow_set_l1e(v, va, &new_spte);
651 shadow_update_min_max(l2e_get_pfn(sl2e), l1_table_offset(va));
652 }
654 static void shadow_invlpg_32(struct vcpu *v, unsigned long va)
655 {
656 struct domain *d = v->domain;
657 l1_pgentry_t gpte, spte;
659 ASSERT(shadow_mode_enabled(d));
661 shadow_lock(d);
663 __shadow_sync_va(v, va);
665 // XXX mafetter: will need to think about 4MB pages...
667 // It's not strictly necessary to update the shadow here,
668 // but it might save a fault later.
669 //
670 /*if (__copy_from_user(&gpte, &linear_pg_table[va >> PAGE_SHIFT],
671 sizeof(gpte))) {*/
672 if (unlikely(!__guest_get_l1e(v, va, &gpte))) {
673 perfc_incrc(shadow_invlpg_faults);
674 shadow_unlock(d);
675 return;
676 }
677 l1pte_propagate_from_guest(d, gpte, &spte);
678 shadow_set_l1e(va, spte, 1);
680 shadow_unlock(d);
681 }
683 static struct out_of_sync_entry *
684 shadow_alloc_oos_entry(struct domain *d)
685 {
686 struct out_of_sync_entry *f, *extra;
687 unsigned size, i;
689 if ( unlikely(d->arch.out_of_sync_free == NULL) )
690 {
691 FSH_LOG("Allocate more fullshadow tuple blocks.");
693 size = sizeof(void *) + (out_of_sync_extra_size * sizeof(*f));
694 extra = xmalloc_bytes(size);
696 /* XXX Should be more graceful here. */
697 if ( extra == NULL )
698 BUG();
700 memset(extra, 0, size);
702 /* Record the allocation block so it can be correctly freed later. */
703 d->arch.out_of_sync_extras_count++;
704 *((struct out_of_sync_entry **)&extra[out_of_sync_extra_size]) =
705 d->arch.out_of_sync_extras;
706 d->arch.out_of_sync_extras = &extra[0];
708 /* Thread a free chain through the newly-allocated nodes. */
709 for ( i = 0; i < (out_of_sync_extra_size - 1); i++ )
710 extra[i].next = &extra[i+1];
711 extra[i].next = NULL;
713 /* Add the new nodes to the free list. */
714 d->arch.out_of_sync_free = &extra[0];
715 }
717 /* Allocate a new node from the quicklist. */
718 f = d->arch.out_of_sync_free;
719 d->arch.out_of_sync_free = f->next;
721 return f;
722 }
724 static inline unsigned long
725 shadow_make_snapshot(
726 struct domain *d, unsigned long gpfn, unsigned long gmfn)
727 {
728 unsigned long smfn, sl1mfn = 0;
729 void *original, *snapshot;
730 u32 min_max = 0;
731 int min, max, length;
733 if ( test_and_set_bit(_PGC_out_of_sync, &frame_table[gmfn].count_info) )
734 {
735 ASSERT(__shadow_status(d, gpfn, PGT_snapshot));
736 return SHADOW_SNAPSHOT_ELSEWHERE;
737 }
739 perfc_incrc(shadow_make_snapshot);
741 if ( unlikely(!(smfn = alloc_shadow_page(d, gpfn, gmfn, PGT_snapshot))) )
742 {
743 printk("Couldn't alloc fullshadow snapshot for pfn=%lx mfn=%lx!\n"
744 "Dom%d snapshot_count=%d\n",
745 gpfn, gmfn, d->domain_id, d->arch.snapshot_page_count);
746 BUG(); /* XXX FIXME: try a shadow flush to free up some memory. */
747 }
749 if ( !get_shadow_ref(smfn) )
750 BUG();
752 if ( shadow_mode_refcounts(d) &&
753 (shadow_max_pgtable_type(d, gpfn, &sl1mfn) == PGT_l1_shadow) )
754 min_max = pfn_to_page(sl1mfn)->tlbflush_timestamp;
755 pfn_to_page(smfn)->tlbflush_timestamp = min_max;
757 min = SHADOW_MIN(min_max);
758 max = SHADOW_MAX(min_max);
759 length = max - min + 1;
760 perfc_incr_histo(snapshot_copies, length, PT_UPDATES);
762 min *= sizeof(l1_pgentry_t);
763 length *= sizeof(l1_pgentry_t);
765 original = map_domain_page(gmfn);
766 snapshot = map_domain_page(smfn);
767 memcpy(snapshot + min, original + min, length);
768 unmap_domain_page(original);
769 unmap_domain_page(snapshot);
771 return smfn;
772 }
774 static struct out_of_sync_entry *
775 mark_mfn_out_of_sync(struct vcpu *v, unsigned long gpfn,
776 unsigned long mfn)
777 {
778 struct domain *d = v->domain;
779 struct pfn_info *page = &frame_table[mfn];
780 struct out_of_sync_entry *entry = shadow_alloc_oos_entry(d);
782 ASSERT(shadow_lock_is_acquired(d));
783 ASSERT(pfn_valid(mfn));
785 #ifndef NDEBUG
786 u32 type = page->u.inuse.type_info & PGT_type_mask;
787 if ( shadow_mode_refcounts(d) )
788 {
789 ASSERT(type == PGT_writable_page);
790 }
791 else
792 {
793 ASSERT(type && (type < PGT_l4_page_table));
794 }
795 #endif
797 FSH_LOG("%s(gpfn=%lx, mfn=%lx) c=%08x t=%08x", __func__,
798 gpfn, mfn, page->count_info, page->u.inuse.type_info);
800 // XXX this will require some more thought... Cross-domain sharing and
801 // modification of page tables? Hmm...
802 //
803 if ( d != page_get_owner(page) )
804 BUG();
806 perfc_incrc(shadow_mark_mfn_out_of_sync_calls);
808 entry->gpfn = gpfn;
809 entry->gmfn = mfn;
810 entry->snapshot_mfn = shadow_make_snapshot(d, gpfn, mfn);
811 entry->writable_pl1e = -1;
813 #if SHADOW_DEBUG
814 mark_shadows_as_reflecting_snapshot(d, gpfn);
815 #endif
817 // increment guest's ref count to represent the entry in the
818 // full shadow out-of-sync list.
819 //
820 get_page(page, d);
822 // Add to the out-of-sync list
823 //
824 entry->next = d->arch.out_of_sync;
825 d->arch.out_of_sync = entry;
827 return entry;
828 }
830 static void shadow_mark_va_out_of_sync(
831 struct vcpu *v, unsigned long gpfn, unsigned long mfn, unsigned long va)
832 {
833 struct out_of_sync_entry *entry =
834 shadow_mark_mfn_out_of_sync(v, gpfn, mfn);
835 l2_pgentry_t sl2e;
837 #if CONFIG_PAGING_LEVELS >= 4
838 {
839 l4_pgentry_t sl4e;
840 l3_pgentry_t sl3e;
842 __shadow_get_l4e(v, va, &sl4e);
843 if ( !(l4e_get_flags(sl4e) & _PAGE_PRESENT)) {
844 shadow_map_into_current(v, va, L3, L4);
845 }
847 if (!__shadow_get_l3e(v, va, &sl3e)) {
848 BUG();
849 }
851 if ( !(l3e_get_flags(sl3e) & _PAGE_PRESENT)) {
852 shadow_map_into_current(v, va, L2, L3);
853 }
854 }
855 #endif
857 // We need the address of the shadow PTE that maps @va.
858 // It might not exist yet. Make sure it's there.
859 //
860 __shadow_get_l2e(v, va, &sl2e);
861 if ( !(l2e_get_flags(sl2e) & _PAGE_PRESENT) )
862 {
863 // either this L1 isn't shadowed yet, or the shadow isn't linked into
864 // the current L2.
865 shadow_map_l1_into_current_l2(va);
866 __shadow_get_l2e(v, va, &sl2e);
867 }
868 ASSERT(l2e_get_flags(sl2e) & _PAGE_PRESENT);
870 // NB: this is stored as a machine address.
871 entry->writable_pl1e =
872 l2e_get_paddr(sl2e) | (sizeof(l1_pgentry_t) * l1_table_offset(va));
873 ASSERT( !(entry->writable_pl1e & (sizeof(l1_pgentry_t)-1)) );
875 // Increment shadow's page count to represent the reference
876 // inherent in entry->writable_pl1e
877 //
878 if ( !get_shadow_ref(l2e_get_pfn(sl2e)) )
879 BUG();
881 FSH_LOG("mark_out_of_sync(va=%lx -> writable_pl1e=%lx)",
882 va, entry->writable_pl1e);
883 }
885 /*
886 * Returns 1 if the snapshot for @gmfn exists and its @index'th entry matches.
887 * Returns 0 otherwise.
888 */
889 static int snapshot_entry_matches(
890 struct domain *d, l1_pgentry_t *guest_pt,
891 unsigned long gpfn, unsigned index)
892 {
893 unsigned long smfn = __shadow_status(d, gpfn, PGT_snapshot);
894 l1_pgentry_t *snapshot, gpte; // could be L1s or L2s or ...
895 int entries_match;
897 perfc_incrc(snapshot_entry_matches_calls);
899 if ( !smfn )
900 return 0;
902 snapshot = map_domain_page(smfn);
904 if (__copy_from_user(&gpte, &guest_pt[index],
905 sizeof(gpte)))
906 { unmap_domain_page(snapshot); return 0; }
908 // This could probably be smarter, but this is sufficient for
909 // our current needs.
910 //
911 entries_match = !l1e_has_changed(gpte, snapshot[index],
912 PAGE_FLAG_MASK);
914 unmap_domain_page(snapshot);
916 #ifdef PERF_COUNTERS
917 if ( entries_match )
918 perfc_incrc(snapshot_entry_matches_true);
919 #endif
921 return entries_match;
922 }
924 /*
925 * Returns 1 if va's shadow mapping is out-of-sync.
926 * Returns 0 otherwise.
927 */
928 static int is_out_of_sync(struct vcpu *v, unsigned long va) /* __shadow_out_of_sync */
929 {
930 struct domain *d = v->domain;
931 #if defined (__x86_64__)
932 unsigned long l2mfn = ((v->arch.flags & TF_kernel_mode)?
933 pagetable_get_pfn(v->arch.guest_table) :
934 pagetable_get_pfn(v->arch.guest_table_user));
935 #else
936 unsigned long l2mfn = pagetable_get_pfn(v->arch.guest_table);
937 #endif
938 unsigned long l2pfn = __mfn_to_gpfn(d, l2mfn);
939 l2_pgentry_t l2e;
940 unsigned long l1pfn, l1mfn;
941 l1_pgentry_t *guest_pt;
942 l1_pgentry_t tmp_gle;
943 unsigned long pt_va;
945 ASSERT(shadow_lock_is_acquired(d));
946 ASSERT(VALID_M2P(l2pfn));
948 perfc_incrc(shadow_out_of_sync_calls);
950 #if CONFIG_PAGING_LEVELS >= 4
951 if (d->arch.ops->guest_paging_levels == L4) { /* Mode F */
952 pgentry_64_t le;
953 unsigned long gmfn;
954 unsigned long gpfn;
955 int i;
957 gmfn = l2mfn;
958 gpfn = l2pfn;
959 guest_pt = (l1_pgentry_t *)v->arch.guest_vtable;
961 for (i = L4; i >= L3; i--) {
962 if ( page_out_of_sync(&frame_table[gmfn]) &&
963 !snapshot_entry_matches(
964 d, guest_pt, gpfn, table_offset_64(va, i)) )
965 return 1;
967 __rw_entry(v, va, &le, GUEST_ENTRY | GET_ENTRY | i);
968 if ( !(entry_get_flags(le) & _PAGE_PRESENT) )
969 return 0;
970 gpfn = entry_get_pfn(le);
971 gmfn = __gpfn_to_mfn(d, gpfn);
972 if ( !VALID_MFN(gmfn) )
973 return 0;
974 /* Todo: check!*/
975 guest_pt = (l1_pgentry_t *)map_domain_page(gmfn);
977 }
979 /* L2 */
980 if ( page_out_of_sync(&frame_table[gmfn]) &&
981 !snapshot_entry_matches(d, guest_pt, gpfn, l2_table_offset(va)) )
982 return 1;
985 } else
986 #endif
988 if ( page_out_of_sync(&frame_table[l2mfn]) &&
989 !snapshot_entry_matches(d, (l1_pgentry_t *)v->arch.guest_vtable,
990 l2pfn, l2_table_offset(va)) )
991 return 1;
993 __guest_get_l2e(v, va, &l2e);
994 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) ||
995 (l2e_get_flags(l2e) & _PAGE_PSE))
996 return 0;
998 l1pfn = l2e_get_pfn(l2e);
999 l1mfn = __gpfn_to_mfn(d, l1pfn);
1001 // If the l1 pfn is invalid, it can't be out of sync...
1002 if ( !VALID_MFN(l1mfn) )
1003 return 0;
1005 pt_va = ((va >> L1_PAGETABLE_SHIFT) & ~(L1_PAGETABLE_ENTRIES - 1))
1006 << L1_PAGETABLE_SHIFT;
1007 guest_pt = (l1_pgentry_t *) __guest_get_l1e(v, pt_va, &tmp_gle);
1009 if ( page_out_of_sync(&frame_table[l1mfn]) &&
1010 !snapshot_entry_matches(
1011 d, guest_pt, l1pfn, l1_table_offset(va)) )
1012 return 1;
1014 return 0;
1017 #define GPFN_TO_GPTEPAGE(_gpfn) ((_gpfn) / (PAGE_SIZE / sizeof(l1_pgentry_t)))
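/*
 * For example, with 4-byte l1_pgentry_t's a page holds
 * PAGE_SIZE/sizeof(l1_pgentry_t) == 1024 PTEs, so gpfns 0..1023 map to
 * gpte-page 0, gpfns 1024..2047 to gpte-page 1, and so on (512 per page
 * with 8-byte entries). The prediction helpers below use this value as
 * the key for their PGT_writable_pred hints.
 */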
1018 static inline unsigned long
1019 predict_writable_pte_page(struct domain *d, unsigned long gpfn)
1021 return __shadow_status(d, GPFN_TO_GPTEPAGE(gpfn), PGT_writable_pred);
1024 static inline void
1025 increase_writable_pte_prediction(struct domain *d, unsigned long gpfn, unsigned long prediction)
1027 unsigned long score = prediction & PGT_score_mask;
1028 int create = (score == 0);
1030 // saturating addition
1031 score = (score + (1u << PGT_score_shift)) & PGT_score_mask;
1032 score = score ? score : PGT_score_mask;
1034 prediction = (prediction & PGT_mfn_mask) | score;
1036 //printk("increase gpfn=%lx pred=%lx create=%d\n", gpfn, prediction, create);
1037 set_shadow_status(d, GPFN_TO_GPTEPAGE(gpfn), 0, prediction, PGT_writable_pred);
1039 if ( create )
1040 perfc_incr(writable_pte_predictions);
1043 static inline void
1044 decrease_writable_pte_prediction(struct domain *d, unsigned long gpfn, unsigned long prediction)
1046 unsigned long score = prediction & PGT_score_mask;
1047 ASSERT(score);
1049 // divide score by 2... We don't like bad predictions.
1050 //
1051 score = (score >> 1) & PGT_score_mask;
1053 prediction = (prediction & PGT_mfn_mask) | score;
1055 //printk("decrease gpfn=%lx pred=%lx score=%lx\n", gpfn, prediction, score);
1057 if ( score )
1058 set_shadow_status(d, GPFN_TO_GPTEPAGE(gpfn), 0, prediction, PGT_writable_pred);
1059 else
1061 delete_shadow_status(d, GPFN_TO_GPTEPAGE(gpfn), 0, PGT_writable_pred);
1062 perfc_decr(writable_pte_predictions);
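/*
 * A rough sketch of a prediction's lifecycle, as driven by
 * remove_all_write_access() and remove_all_write_access_in_ptpage()
 * below:
 *
 *     prediction = predict_writable_pte_page(d, gpfn);
 *     // predicted L1 fixes the ref  -> increase_writable_pte_prediction()
 *     //    (score saturates upwards, one notch per hit)
 *     // predicted L1 yields nothing -> decrease_writable_pte_prediction()
 *     //    (score is halved; once it hits zero the entry is deleted)
 */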
1066 static u32 remove_all_write_access_in_ptpage(
1067 struct domain *d, unsigned long pt_pfn, unsigned long pt_mfn,
1068 unsigned long readonly_gpfn, unsigned long readonly_gmfn,
1069 u32 max_refs_to_find, unsigned long prediction)
1071 l1_pgentry_t *pt = map_domain_page(pt_mfn);
1072 l1_pgentry_t match;
1073 unsigned long flags = _PAGE_RW | _PAGE_PRESENT;
1074 int i;
1075 u32 found = 0;
1076 int is_l1_shadow =
1077 ((frame_table[pt_mfn].u.inuse.type_info & PGT_type_mask) ==
1078 PGT_l1_shadow);
1079 #if CONFIG_PAGING_LEVELS == 4
1080 is_l1_shadow |=
1081 ((frame_table[pt_mfn].u.inuse.type_info & PGT_type_mask) ==
1082 PGT_fl1_shadow);
1083 #endif
1085 match = l1e_from_pfn(readonly_gmfn, flags);
1087 // returns true if all refs have been found and fixed.
1088 //
1089 int fix_entry(int i)
1091 l1_pgentry_t old = pt[i];
1092 l1_pgentry_t new = old;
1094 l1e_remove_flags(new,_PAGE_RW);
1095 if ( is_l1_shadow && !shadow_get_page_from_l1e(new, d) )
1096 BUG();
1097 found++;
1098 pt[i] = new;
1099 if ( is_l1_shadow )
1100 shadow_put_page_from_l1e(old, d);
1102 #if 0
1103 printk("removed write access to pfn=%lx mfn=%lx in smfn=%lx entry %x "
1104 "is_l1_shadow=%d\n",
1105 readonly_gpfn, readonly_gmfn, pt_mfn, i, is_l1_shadow);
1106 #endif
1108 return (found == max_refs_to_find);
1111 i = readonly_gpfn & (L1_PAGETABLE_ENTRIES - 1);
1112 if ( !l1e_has_changed(pt[i], match, flags) && fix_entry(i) )
1114 perfc_incrc(remove_write_fast_exit);
1115 increase_writable_pte_prediction(d, readonly_gpfn, prediction);
1116 unmap_domain_page(pt);
1117 return found;
1120 for (i = 0; i < L1_PAGETABLE_ENTRIES; i++)
1122 if ( unlikely(!l1e_has_changed(pt[i], match, flags)) && fix_entry(i) )
1123 break;
1126 unmap_domain_page(pt);
1128 return found;
1129 #undef MATCH_ENTRY
1132 static int remove_all_write_access(
1133 struct domain *d, unsigned long readonly_gpfn, unsigned long readonly_gmfn)
1135 int i;
1136 struct shadow_status *a;
1137 u32 found = 0, fixups, write_refs;
1138 unsigned long prediction, predicted_gpfn, predicted_smfn;
1140 ASSERT(shadow_lock_is_acquired(d));
1141 ASSERT(VALID_MFN(readonly_gmfn));
1143 perfc_incrc(remove_write_access);
1145 // If it's not a writable page, then no writable refs can be outstanding.
1146 //
1147 if ( (frame_table[readonly_gmfn].u.inuse.type_info & PGT_type_mask) !=
1148 PGT_writable_page )
1150 perfc_incrc(remove_write_not_writable);
1151 return 1;
1154 // How many outstanding writable PTEs for this page are there?
1155 //
1156 write_refs =
1157 (frame_table[readonly_gmfn].u.inuse.type_info & PGT_count_mask);
1158 if ( write_refs && MFN_PINNED(readonly_gmfn) )
1160 write_refs--;
1163 if ( write_refs == 0 )
1165 perfc_incrc(remove_write_no_work);
1166 return 1;
1169 // Before searching all the L1 page tables, check the typical culprit first
1170 //
1171 if ( (prediction = predict_writable_pte_page(d, readonly_gpfn)) )
1173 predicted_gpfn = prediction & PGT_mfn_mask;
1174 if ( (predicted_smfn = __shadow_status(d, predicted_gpfn, PGT_l1_shadow)) &&
1175 (fixups = remove_all_write_access_in_ptpage(d, predicted_gpfn, predicted_smfn, readonly_gpfn, readonly_gmfn, write_refs, prediction)) )
1177 found += fixups;
1178 if ( found == write_refs )
1180 perfc_incrc(remove_write_predicted);
1181 return 1;
1184 else
1186 perfc_incrc(remove_write_bad_prediction);
1187 decrease_writable_pte_prediction(d, readonly_gpfn, prediction);
1191 // Search all the shadow L1 page tables...
1192 //
1193 for (i = 0; i < shadow_ht_buckets; i++)
1195 a = &d->arch.shadow_ht[i];
1196 while ( a && a->gpfn_and_flags )
1198 if ( (a->gpfn_and_flags & PGT_type_mask) == PGT_l1_shadow
1199 #if CONFIG_PAGING_LEVELS >= 4
1200 || (a->gpfn_and_flags & PGT_type_mask) == PGT_fl1_shadow
1201 #endif
1205 found += remove_all_write_access_in_ptpage(d, a->gpfn_and_flags & PGT_mfn_mask, a->smfn, readonly_gpfn, readonly_gmfn, write_refs - found, a->gpfn_and_flags & PGT_mfn_mask);
1206 if ( found == write_refs )
1207 return 1;
1210 a = a->next;
1214 FSH_LOG("%s: looking for %d refs, found %d refs",
1215 __func__, write_refs, found);
1217 return 0;
1221 static int resync_all(struct domain *d, u32 stype)
1223 struct out_of_sync_entry *entry;
1224 unsigned i;
1225 unsigned long smfn;
1226 void *guest, *shadow, *snapshot;
1227 int need_flush = 0, external = shadow_mode_external(d);
1228 int unshadow;
1229 int changed;
1231 ASSERT(shadow_lock_is_acquired(d));
1233 for ( entry = d->arch.out_of_sync; entry; entry = entry->next)
1235 if ( entry->snapshot_mfn == SHADOW_SNAPSHOT_ELSEWHERE )
1236 continue;
1238 smfn = __shadow_status(d, entry->gpfn, stype);
1240 if ( !smfn )
1242 if ( shadow_mode_refcounts(d) )
1243 continue;
1245 // For light weight shadows, even when no shadow page exists,
1246 // we need to resync the refcounts to the new contents of the
1247 // guest page.
1248 // This only applies when we have writable page tables.
1249 //
1250 if ( !shadow_mode_write_all(d) &&
1251 !((stype == PGT_l1_shadow) &&
1252 VM_ASSIST(d, VMASST_TYPE_writable_pagetables)) )
1253 // Page is not writable -- no resync necessary
1254 continue;
1257 FSH_LOG("resyncing t=%08x gpfn=%lx gmfn=%lx smfn=%lx snapshot_mfn=%lx",
1258 stype, entry->gpfn, entry->gmfn, smfn, entry->snapshot_mfn);
1260 // Compare guest's new contents to its snapshot, validating
1261 // and updating its shadow as appropriate.
1262 //
1263 guest = map_domain_page(entry->gmfn);
1264 snapshot = map_domain_page(entry->snapshot_mfn);
1266 if ( smfn )
1267 shadow = map_domain_page(smfn);
1268 else
1269 shadow = NULL;
1271 unshadow = 0;
1273 u32 min_max_shadow = pfn_to_page(smfn)->tlbflush_timestamp;
1274 int min_shadow = SHADOW_MIN(min_max_shadow);
1275 int max_shadow = SHADOW_MAX(min_max_shadow);
1277 u32 min_max_snapshot =
1278 pfn_to_page(entry->snapshot_mfn)->tlbflush_timestamp;
1279 int min_snapshot = SHADOW_MIN(min_max_snapshot);
1280 int max_snapshot = SHADOW_MAX(min_max_snapshot);
1282 switch ( stype ) {
1283 case PGT_l1_shadow:
1285 l1_pgentry_t *guest1 = guest;
1286 l1_pgentry_t *shadow1 = shadow;
1287 l1_pgentry_t *snapshot1 = snapshot;
1289 ASSERT(VM_ASSIST(d, VMASST_TYPE_writable_pagetables) ||
1290 shadow_mode_write_all(d));
1292 if ( !shadow_mode_refcounts(d) )
1293 revalidate_l1(d, guest1, snapshot1);
1295 if ( !smfn )
1296 break;
1299 changed = 0;
1301 for ( i = min_shadow; i <= max_shadow; i++ )
1303 if ( (i < min_snapshot) || (i > max_snapshot) ||
1304 l1e_has_changed(guest1[i], snapshot1[i], PAGE_FLAG_MASK) )
1306 need_flush |= validate_pte_change(d, guest1[i], &shadow1[i]);
1308 // can't update snapshots of linear page tables -- they
1309 // are used multiple times...
1310 //
1311 // snapshot[i] = new_pte;
1313 changed++;
1316 perfc_incrc(resync_l1);
1317 perfc_incr_histo(wpt_updates, changed, PT_UPDATES);
1318 perfc_incr_histo(l1_entries_checked, max_shadow - min_shadow + 1, PT_UPDATES);
1319 break;
1321 #if defined (__i386__)
1322 case PGT_l2_shadow:
1324 int max = -1;
1326 l2_pgentry_t *guest2 = guest;
1327 l2_pgentry_t *shadow2 = shadow;
1328 l2_pgentry_t *snapshot2 = snapshot;
1330 ASSERT(shadow_mode_write_all(d));
1331 BUG_ON(!shadow_mode_refcounts(d)); // not yet implemented
1333 changed = 0;
1334 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
1336 #if CONFIG_X86_PAE
1337 BUG(); /* FIXME: need type_info */
1338 #endif
1339 if ( !is_guest_l2_slot(0,i) && !external )
1340 continue;
1342 l2_pgentry_t new_pde = guest2[i];
1343 if ( l2e_has_changed(new_pde, snapshot2[i], PAGE_FLAG_MASK))
1345 need_flush |= validate_pde_change(d, new_pde, &shadow2[i]);
1347 // can't update snapshots of linear page tables -- they
1348 // are used multiple times...
1349 //
1350 // snapshot[i] = new_pde;
1352 changed++;
1354 if ( l2e_get_intpte(new_pde) != 0 ) /* FIXME: check flags? */
1355 max = i;
1357 // XXX - This hack works for linux guests.
1358 // Need a better solution long term.
1359 if ( !(l2e_get_flags(new_pde) & _PAGE_PRESENT) &&
1360 unlikely(l2e_get_intpte(new_pde) != 0) &&
1361 !unshadow && MFN_PINNED(smfn) )
1362 unshadow = 1;
1364 if ( max == -1 )
1365 unshadow = 1;
1366 perfc_incrc(resync_l2);
1367 perfc_incr_histo(shm_l2_updates, changed, PT_UPDATES);
1368 break;
1370 case PGT_hl2_shadow:
1372 l2_pgentry_t *guest2 = guest;
1373 l2_pgentry_t *snapshot2 = snapshot;
1374 l1_pgentry_t *shadow2 = shadow;
1376 ASSERT(shadow_mode_write_all(d));
1377 BUG_ON(!shadow_mode_refcounts(d)); // not yet implemented
1379 changed = 0;
1380 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
1382 #if CONFIG_X86_PAE
1383 BUG(); /* FIXME: need type_info */
1384 #endif
1385 if ( !is_guest_l2_slot(0, i) && !external )
1386 continue;
1388 l2_pgentry_t new_pde = guest2[i];
1389 if ( l2e_has_changed(new_pde, snapshot2[i], PAGE_FLAG_MASK) )
1391 need_flush |= validate_hl2e_change(d, new_pde, &shadow2[i]);
1393 // can't update snapshots of linear page tables -- they
1394 // are used multiple times...
1395 //
1396 // snapshot[i] = new_pde;
1398 changed++;
1401 perfc_incrc(resync_hl2);
1402 perfc_incr_histo(shm_hl2_updates, changed, PT_UPDATES);
1403 break;
1405 #else
1406 case PGT_l2_shadow:
1407 case PGT_l3_shadow:
1409 pgentry_64_t *guest_pt = guest;
1410 pgentry_64_t *shadow_pt = shadow;
1411 pgentry_64_t *snapshot_pt = snapshot;
1413 changed = 0;
1414 for ( i = min_shadow; i <= max_shadow; i++ )
1416 if ( (i < min_snapshot) || (i > max_snapshot) ||
1417 entry_has_changed(
1418 guest_pt[i], snapshot_pt[i], PAGE_FLAG_MASK) )
1420 need_flush |= validate_entry_change(
1421 d, &guest_pt[i], &shadow_pt[i],
1422 shadow_type_to_level(stype));
1423 changed++;
1426 break;
1430 case PGT_l4_shadow:
1432 int max = -1;
1434 l4_pgentry_t *guest4 = guest;
1435 l4_pgentry_t *shadow4 = shadow;
1436 l4_pgentry_t *snapshot4 = snapshot;
1438 changed = 0;
1439 for ( i = 0; i < L4_PAGETABLE_ENTRIES; i++ )
1441 if ( !is_guest_l4_slot(i) && !external )
1442 continue;
1443 l4_pgentry_t new_l4e = guest4[i];
1444 if ( l4e_has_changed(new_l4e, snapshot4[i], PAGE_FLAG_MASK))
1446 need_flush |= validate_entry_change(
1447 d, (pgentry_64_t *)&new_l4e,
1448 (pgentry_64_t *)&shadow4[i], shadow_type_to_level(stype));
1450 changed++;
1451 ESH_LOG("%d: shadow4 mfn: %lx, shadow root: %lx\n", i,
1452 smfn, pagetable_get_paddr(current->arch.shadow_table));
1454 if ( l4e_get_intpte(new_l4e) != 0 ) /* FIXME: check flags? */
1455 max = i;
1457 // Need a better solution in the long term.
1458 if ( !(l4e_get_flags(new_l4e) & _PAGE_PRESENT) &&
1459 unlikely(l4e_get_intpte(new_l4e) != 0) &&
1460 !unshadow &&
1461 (frame_table[smfn].u.inuse.type_info & PGT_pinned) )
1462 unshadow = 1;
1464 if ( max == -1 )
1465 unshadow = 1;
1466 perfc_incrc(resync_l4);
1467 perfc_incr_histo(shm_l4_updates, changed, PT_UPDATES);
1468 break;
1471 #endif
1472 default:
1473 BUG();
1476 if ( smfn )
1477 unmap_domain_page(shadow);
1478 unmap_domain_page(snapshot);
1479 unmap_domain_page(guest);
1481 if ( unlikely(unshadow) )
1483 perfc_incrc(unshadow_l2_count);
1484 shadow_unpin(smfn);
1485 #if defined (__i386__)
1486 if ( unlikely(shadow_mode_external(d)) )
1488 unsigned long hl2mfn;
1490 if ( (hl2mfn = __shadow_status(d, entry->gpfn, PGT_hl2_shadow)) &&
1491 MFN_PINNED(hl2mfn) )
1492 shadow_unpin(hl2mfn);
1494 #endif
1498 return need_flush;
1501 static void sync_all(struct domain *d)
1503 struct out_of_sync_entry *entry;
1504 int need_flush = 0;
1506 perfc_incrc(shadow_sync_all);
1508 ASSERT(shadow_lock_is_acquired(d));
1510 // First, remove all write permissions to the page tables
1511 //
1512 for ( entry = d->arch.out_of_sync; entry; entry = entry->next)
1514 // Skip entries that have low bits set... Those aren't
1515 // real PTEs.
1516 //
1517 if ( entry->writable_pl1e & (sizeof(l1_pgentry_t)-1) )
1518 continue;
1520 l1_pgentry_t *ppte = (l1_pgentry_t *)(
1521 (char *)map_domain_page(entry->writable_pl1e >> PAGE_SHIFT) +
1522 (entry->writable_pl1e & ~PAGE_MASK));
1523 l1_pgentry_t opte = *ppte;
1524 l1_pgentry_t npte = opte;
1525 l1e_remove_flags(npte, _PAGE_RW);
1527 if ( (l1e_get_flags(npte) & _PAGE_PRESENT) &&
1528 !shadow_get_page_from_l1e(npte, d) )
1529 BUG();
1530 *ppte = npte;
1531 shadow_put_page_from_l1e(opte, d);
1533 unmap_domain_page(ppte);
1536 // XXX mafetter: SMP
1537 //
1538 // With the current algorithm, we've gotta flush all the TLBs
1539 // before we can safely continue. I don't think we want to
1540 // do it this way, so I think we should consider making
1541 // entirely private copies of the shadow for each vcpu, and/or
1542 // possibly having a mix of private and shared shadow state
1543 // (any path from a PTE that grants write access to an out-of-sync
1544 // page table page needs to be vcpu private).
1545 //
1546 #if 0 // this should be enabled for SMP guests...
1547 flush_tlb_mask(cpu_online_map);
1548 #endif
1549 need_flush = 1;
1551 // Second, resync all L1 pages, then L2 pages, etc...
1552 //
1553 need_flush |= resync_all(d, PGT_l1_shadow);
1554 #if defined (__i386__)
1555 if ( shadow_mode_translate(d) )
1556 need_flush |= resync_all(d, PGT_hl2_shadow);
1557 #endif
1558 need_flush |= resync_all(d, PGT_l2_shadow);
1559 need_flush |= resync_all(d, PGT_l3_shadow);
1560 need_flush |= resync_all(d, PGT_l4_shadow);
1562 if ( need_flush && !unlikely(shadow_mode_external(d)) )
1563 local_flush_tlb();
1565 free_out_of_sync_state(d);
1568 static inline int l1pte_write_fault(
1569 struct vcpu *v, l1_pgentry_t *gpte_p, l1_pgentry_t *spte_p,
1570 unsigned long va)
1572 struct domain *d = v->domain;
1573 l1_pgentry_t gpte = *gpte_p;
1574 l1_pgentry_t spte;
1575 unsigned long gpfn = l1e_get_pfn(gpte);
1576 unsigned long gmfn = __gpfn_to_mfn(d, gpfn);
1578 //printk("l1pte_write_fault gmfn=%lx\n", gmfn);
1580 if ( unlikely(!VALID_MFN(gmfn)) )
1582 SH_VLOG("l1pte_write_fault: invalid gpfn=%lx", gpfn);
1583 *spte_p = l1e_empty();
1584 return 0;
1587 ASSERT(l1e_get_flags(gpte) & _PAGE_RW);
1588 l1e_add_flags(gpte, _PAGE_DIRTY | _PAGE_ACCESSED);
1589 spte = l1e_from_pfn(gmfn, l1e_get_flags(gpte) & ~_PAGE_GLOBAL);
1591 SH_VVLOG("l1pte_write_fault: updating spte=0x%" PRIpte " gpte=0x%" PRIpte,
1592 l1e_get_intpte(spte), l1e_get_intpte(gpte));
1594 if ( shadow_mode_log_dirty(d) )
1595 __mark_dirty(d, gmfn);
1597 if ( mfn_is_page_table(gmfn) )
1598 shadow_mark_va_out_of_sync(v, gpfn, gmfn, va);
1600 *gpte_p = gpte;
1601 *spte_p = spte;
1603 return 1;
1606 static inline int l1pte_read_fault(
1607 struct domain *d, l1_pgentry_t *gpte_p, l1_pgentry_t *spte_p)
1609 l1_pgentry_t gpte = *gpte_p;
1610 l1_pgentry_t spte = *spte_p;
1611 unsigned long pfn = l1e_get_pfn(gpte);
1612 unsigned long mfn = __gpfn_to_mfn(d, pfn);
1614 if ( unlikely(!VALID_MFN(mfn)) )
1616 SH_VLOG("l1pte_read_fault: invalid gpfn=%lx", pfn);
1617 *spte_p = l1e_empty();
1618 return 0;
1621 l1e_add_flags(gpte, _PAGE_ACCESSED);
1622 spte = l1e_from_pfn(mfn, l1e_get_flags(gpte) & ~_PAGE_GLOBAL);
1624 if ( shadow_mode_log_dirty(d) || !(l1e_get_flags(gpte) & _PAGE_DIRTY) ||
1625 mfn_is_page_table(mfn) )
1627 l1e_remove_flags(spte, _PAGE_RW);
1630 SH_VVLOG("l1pte_read_fault: updating spte=0x%" PRIpte " gpte=0x%" PRIpte,
1631 l1e_get_intpte(spte), l1e_get_intpte(gpte));
1632 *gpte_p = gpte;
1633 *spte_p = spte;
1635 return 1;
1638 static int shadow_fault_32(unsigned long va, struct cpu_user_regs *regs)
1640 l1_pgentry_t gpte, spte, orig_gpte;
1641 struct vcpu *v = current;
1642 struct domain *d = v->domain;
1643 l2_pgentry_t gpde;
1645 spte = l1e_empty();
1647 SH_VVLOG("shadow_fault( va=%lx, code=%lu )",
1648 va, (unsigned long)regs->error_code);
1649 perfc_incrc(shadow_fault_calls);
1651 check_pagetable(v, "pre-sf");
1653 /*
1654 * Don't let someone else take the guest's table pages out-of-sync.
1655 */
1656 shadow_lock(d);
1658 /* XXX - FIX THIS COMMENT!!!
1659 * STEP 1. Check to see if this fault might have been caused by an
1660 * out-of-sync table page entry, or if we should pass this
1661 * fault onto the guest.
1662 */
1663 __shadow_sync_va(v, va);
1665 /*
1666 * STEP 2. Check the guest PTE.
1667 */
1668 __guest_get_l2e(v, va, &gpde);
1669 if ( unlikely(!(l2e_get_flags(gpde) & _PAGE_PRESENT)) )
1671 SH_VVLOG("shadow_fault - EXIT: L1 not present");
1672 perfc_incrc(shadow_fault_bail_pde_not_present);
1673 goto fail;
1676 // This can't fault because we hold the shadow lock and we've ensured that
1677 // the mapping is in-sync, so the check of the PDE's present bit, above,
1678 // covers this access.
1679 //
1680 //orig_gpte = gpte = linear_pg_table[l1_linear_offset(va)];
1681 __guest_get_l1e(v, va, &gpte);
1682 orig_gpte = gpte;
1684 if ( unlikely(!(l1e_get_flags(gpte) & _PAGE_PRESENT)) )
1686 SH_VVLOG("shadow_fault - EXIT: gpte not present (%" PRIpte ")",
1687 l1e_get_intpte(gpte));
1688 perfc_incrc(shadow_fault_bail_pte_not_present);
1689 goto fail;
1692 /* Write fault? */
1693 if ( regs->error_code & 2 )
1695 int allow_writes = 0;
1697 if ( unlikely(!(l1e_get_flags(gpte) & _PAGE_RW)) )
1699 if ( shadow_mode_page_writable(va, regs, l1e_get_pfn(gpte)) )
1701 allow_writes = 1;
1702 l1e_add_flags(gpte, _PAGE_RW);
1704 else
1706 /* Write fault on a read-only mapping. */
1707 SH_VVLOG("shadow_fault - EXIT: wr fault on RO page (%" PRIpte ")",
1708 l1e_get_intpte(gpte));
1709 perfc_incrc(shadow_fault_bail_ro_mapping);
1710 goto fail;
1714 if ( !l1pte_write_fault(v, &gpte, &spte, va) )
1716 SH_VVLOG("shadow_fault - EXIT: l1pte_write_fault failed");
1717 perfc_incrc(write_fault_bail);
1718 shadow_unlock(d);
1719 return 0;
1722 if ( allow_writes )
1723 l1e_remove_flags(gpte, _PAGE_RW);
1725 else
1727 if ( !l1pte_read_fault(d, &gpte, &spte) )
1729 SH_VVLOG("shadow_fault - EXIT: l1pte_read_fault failed");
1730 perfc_incrc(read_fault_bail);
1731 shadow_unlock(d);
1732 return 0;
1736 /*
1737 * STEP 3. Write the modified shadow PTE and guest PTE back to the tables.
1738 */
1739 if ( l1e_has_changed(orig_gpte, gpte, PAGE_FLAG_MASK) )
1741 /* XXX Watch out for read-only L2 entries! (not used in Linux). */
1742 /*if ( unlikely(__copy_to_user(&linear_pg_table[l1_linear_offset(va)],
1743 &gpte, sizeof(gpte))) )*/
1744 if ( unlikely(!__guest_set_l1e(v, va, &gpte)))
1746 printk("%s() failed, crashing domain %d "
1747 "due to a read-only L2 page table (gpde=%" PRIpte "), va=%lx\n",
1748 __func__,d->domain_id, l2e_get_intpte(gpde), va);
1749 domain_crash_synchronous();
1752 // if necessary, record the page table page as dirty
1753 if ( unlikely(shadow_mode_log_dirty(d)) )
1754 __mark_dirty(d, __gpfn_to_mfn(d, l2e_get_pfn(gpde)));
1757 shadow_set_l1e(va, spte, 1);
1759 perfc_incrc(shadow_fault_fixed);
1760 d->arch.shadow_fault_count++;
1762 shadow_unlock(d);
1764 check_pagetable(v, "post-sf");
1765 return EXCRET_fault_fixed;
1767 fail:
1768 shadow_unlock(d);
1769 return 0;
1772 static int do_update_va_mapping(unsigned long va,
1773 l1_pgentry_t val,
1774 struct vcpu *v)
1776 struct domain *d = v->domain;
1777 l1_pgentry_t spte;
1778 int rc = 0;
1780 shadow_lock(d);
1782 //printk("%s(va=%p, val=%p)\n", __func__, (void *)va, (void *)l1e_get_intpte(val));
1784 // This is actually overkill - we don't need to sync the L1 itself,
1785 // just everything involved in getting to this L1 (i.e. we need
1786 // linear_pg_table[l1_linear_offset(va)] to be in sync)...
1787 //
1788 __shadow_sync_va(v, va);
1790 l1pte_propagate_from_guest(d, val, &spte);
1791 shadow_set_l1e(va, spte, 0);
1793 /*
1794 * If we're in log-dirty mode then we need to note that we've updated
1795 * the PTE in the PT-holding page. We need the machine frame number
1796 * for this.
1797 */
1798 if ( shadow_mode_log_dirty(d) )
1799 __mark_dirty(d, va_to_l1mfn(v, va));
1801 // out:
1802 shadow_unlock(d);
1804 return rc;
1808 /*
1809 * What lives where in the 32-bit address space in the various shadow modes,
1810 * and what it uses to get/maintain that mapping.
1812 * SHADOW MODE: none enable translate external
1814 * 4KB things:
1815 * guest_vtable lin_l2 mapped per gl2 lin_l2 via hl2 mapped per gl2
1816 * shadow_vtable n/a sh_lin_l2 sh_lin_l2 mapped per gl2
1817 * hl2_vtable n/a n/a lin_hl2 via hl2 mapped per gl2
1818 * monitor_vtable n/a n/a n/a mapped once
1820 * 4MB things:
1821 * guest_linear lin via gl2 lin via gl2 lin via hl2 lin via hl2
1822 * shadow_linear n/a sh_lin via sl2 sh_lin via sl2 sh_lin via sl2
1823 * monitor_linear n/a n/a n/a ???
1824 * perdomain perdomain perdomain perdomain perdomain
1825 * R/O M2P R/O M2P R/O M2P n/a n/a
1826 * R/W M2P R/W M2P R/W M2P R/W M2P R/W M2P
1827 * P2M n/a n/a R/O M2P R/O M2P
1829 * NB:
1830 * update_pagetables(), shadow_update_pagetables(), shadow_mode_enable(),
1831 * shadow_l2_table(), shadow_hl2_table(), and alloc_monitor_pagetable()
1832 * all play a part in maintaining these mappings.
1833 */
1834 static void shadow_update_pagetables(struct vcpu *v)
1836 struct domain *d = v->domain;
1837 #if defined (__x86_64__)
1838 unsigned long gmfn = ((v->arch.flags & TF_kernel_mode)?
1839 pagetable_get_pfn(v->arch.guest_table) :
1840 pagetable_get_pfn(v->arch.guest_table_user));
1841 #else
1842 unsigned long gmfn = pagetable_get_pfn(v->arch.guest_table);
1843 #endif
1845 unsigned long gpfn = __mfn_to_gpfn(d, gmfn);
1846 unsigned long smfn, old_smfn;
1848 #if CONFIG_PAGING_LEVELS == 2
1849 unsigned long hl2mfn;
1850 #endif
1852 int max_mode = ( shadow_mode_external(d) ? SHM_external
1853 : shadow_mode_translate(d) ? SHM_translate
1854 : shadow_mode_enabled(d) ? SHM_enable
1855 : 0 );
1857 ASSERT( ! IS_INVALID_M2P_ENTRY(gpfn) );
1858 ASSERT( max_mode );
1860 /*
1861 * arch.guest_vtable
1862 */
1863 if ( max_mode & (SHM_enable | SHM_external) )
1865 if ( likely(v->arch.guest_vtable != NULL) )
1866 unmap_domain_page(v->arch.guest_vtable);
1867 v->arch.guest_vtable = map_domain_page(gmfn);
1870 /*
1871 * arch.shadow_table
1872 */
1873 if ( unlikely(!(smfn = __shadow_status(d, gpfn, PGT_base_page_table))) ) {
1874 #if CONFIG_PAGING_LEVELS == 2
1875 smfn = shadow_l2_table(d, gpfn, gmfn);
1876 #elif CONFIG_PAGING_LEVELS == 3
1877 smfn = shadow_l3_table(d, gpfn, gmfn);
1878 #elif CONFIG_PAGING_LEVELS == 4
1879 smfn = shadow_l4_table(d, gpfn, gmfn);
1880 #endif
1882 if ( !get_shadow_ref(smfn) )
1883 BUG();
1884 old_smfn = pagetable_get_pfn(v->arch.shadow_table);
1885 v->arch.shadow_table = mk_pagetable(smfn << PAGE_SHIFT);
1886 if ( old_smfn )
1887 put_shadow_ref(old_smfn);
1889 SH_VVLOG("shadow_update_pagetables(gmfn=%lx, smfn=%lx)", gmfn, smfn);
1891 /*
1892 * arch.shadow_vtable
1893 */
1894 if ( max_mode == SHM_external
1895 #if CONFIG_PAGING_LEVELS >=4
1896 || max_mode & SHM_enable
1897 #endif
1900 if ( v->arch.shadow_vtable )
1901 unmap_domain_page(v->arch.shadow_vtable);
1902 v->arch.shadow_vtable = map_domain_page(smfn);
1905 #if CONFIG_PAGING_LEVELS == 2
1906 /*
1907 * arch.hl2_vtable
1908 */
1910 // if max_mode == SHM_translate, then the hl2 is already installed
1911 // correctly in its smfn, and there's nothing to do.
1912 //
1913 if ( max_mode == SHM_external )
1915 if ( unlikely(!(hl2mfn = __shadow_status(d, gpfn, PGT_hl2_shadow))) )
1916 hl2mfn = shadow_hl2_table(d, gpfn, gmfn, smfn);
1917 if ( v->arch.hl2_vtable )
1918 unmap_domain_page(v->arch.hl2_vtable);
1919 v->arch.hl2_vtable = map_domain_page(hl2mfn);
1922 /*
1923 * fixup pointers in monitor table, as necessary
1924 */
1925 if ( max_mode == SHM_external )
1927 l2_pgentry_t *mpl2e = v->arch.monitor_vtable;
1928 l2_pgentry_t old_hl2e = mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)];
1929 l2_pgentry_t old_sl2e = mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)];
1931 ASSERT( shadow_mode_translate(d) );
1933 if ( !get_shadow_ref(hl2mfn) )
1934 BUG();
1935 mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)] =
1936 l2e_from_pfn(hl2mfn, __PAGE_HYPERVISOR);
1937 if ( l2e_get_flags(old_hl2e) & _PAGE_PRESENT )
1938 put_shadow_ref(l2e_get_pfn(old_hl2e));
1940 if ( !get_shadow_ref(smfn) )
1941 BUG();
1942 mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
1943 l2e_from_pfn(smfn, __PAGE_HYPERVISOR);
1944 if ( l2e_get_flags(old_sl2e) & _PAGE_PRESENT )
1945 put_shadow_ref(l2e_get_pfn(old_sl2e));
1947 // XXX - maybe this can be optimized somewhat??
1948 local_flush_tlb();
1950 #endif
1952 #if CONFIG_PAGING_LEVELS == 3
1953 /* FIXME: PAE code to be written */
1954 #endif
1957 struct shadow_ops MODE_A_HANDLER = {
1958 .guest_paging_levels = 2,
1959 .invlpg = shadow_invlpg_32,
1960 .fault = shadow_fault_32,
1961 .update_pagetables = shadow_update_pagetables,
1962 .sync_all = sync_all,
1963 .remove_all_write_access = remove_all_write_access,
1964 .do_update_va_mapping = do_update_va_mapping,
1965 .mark_mfn_out_of_sync = mark_mfn_out_of_sync,
1966 .is_out_of_sync = is_out_of_sync,
1967 };
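/*
 * A sketch of how this ops table is presumably consumed: it is installed
 * as d->arch.ops for 2-level guests (cf. the guest_paging_levels check in
 * is_out_of_sync() above), and callers then dispatch through it, e.g.
 *
 *     d->arch.ops->invlpg(v, va);
 *     d->arch.ops->fault(va, regs);
 */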
1969 /************************************************************************/
1970 /************************************************************************/
1971 /************************************************************************/
1973 #if SHADOW_DEBUG
1975 // The following is entirely for _check_pagetable()'s benefit.
1976 // _check_pagetable() wants to know whether a given entry in a
1977 // shadow page table is supposed to be the shadow of the guest's
1978 // current entry, or the shadow of the entry held in the snapshot
1979 // taken above.
1980 //
1981 // Here, we mark all currently existing entries as reflecting
1982 // the snapshot, above. All other places in xen that update
1983 // the shadow will keep the shadow in sync with the guest's
1984 // entries (via l1pte_propagate_from_guest and friends), which clear
1985 // the SHADOW_REFLECTS_SNAPSHOT bit.
1986 //
1987 static void
1988 mark_shadows_as_reflecting_snapshot(struct domain *d, unsigned long gpfn)
1990 unsigned long smfn;
1991 l1_pgentry_t *l1e;
1992 l2_pgentry_t *l2e;
1993 unsigned i;
1995 if ( (smfn = __shadow_status(d, gpfn, PGT_l1_shadow)) )
1997 l1e = map_domain_page(smfn);
1998 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
1999 if ( is_guest_l1_slot(i) &&
2000 (l1e_get_flags(l1e[i]) & _PAGE_PRESENT) )
2001 l1e_add_flags(l1e[i], SHADOW_REFLECTS_SNAPSHOT);
2002 unmap_domain_page(l1e);
2005 if ( (smfn = __shadow_status(d, gpfn, PGT_l2_shadow)) )
2007 l2e = map_domain_page(smfn);
2008 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
2009 if ( is_guest_l2_slot(0, i) &&
2010 (l2e_get_flags(l2e[i]) & _PAGE_PRESENT) )
2011 l2e_add_flags(l2e[i], SHADOW_REFLECTS_SNAPSHOT);
2012 unmap_domain_page(l2e);
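/*
 * [Editor's note: illustrative sketch, not part of the original shadow.c.]
 * mark_shadows_as_reflecting_snapshot() tags every present shadow entry with
 * a software-defined PTE bit so that check_pte() below knows to compare that
 * entry against the snapshot rather than against the live guest table.  The
 * self-contained fragment shows the tag-then-test idea on plain 64-bit PTE
 * values; DEMO_REFLECTS_SNAPSHOT is a hypothetical stand-in that uses one of
 * the software-available bits (9-11) of an x86 PTE, not necessarily the bit
 * shadow.c itself picks for SHADOW_REFLECTS_SNAPSHOT.
 */
#include <stdint.h>
#include <stdio.h>

#define DEMO_PAGE_PRESENT       (1ULL << 0)
#define DEMO_REFLECTS_SNAPSHOT  (1ULL << 9)    /* a software-available bit */

static void demo_mark(uint64_t *table, unsigned int nr)
{
    for (unsigned int i = 0; i < nr; i++)
        if (table[i] & DEMO_PAGE_PRESENT)
            table[i] |= DEMO_REFLECTS_SNAPSHOT;
}

int main(void)
{
    uint64_t l1[4] = { 0x1000 | DEMO_PAGE_PRESENT, 0,
                       0x2000 | DEMO_PAGE_PRESENT, 0 };

    demo_mark(l1, 4);
    for (unsigned int i = 0; i < 4; i++)
        printf("slot %u: %s\n", i,
               (l1[i] & DEMO_REFLECTS_SNAPSHOT) ? "compare against snapshot"
                                                : "compare against guest");
    return 0;
}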
2016 // BUG: these are not SMP safe...
2017 static int sh_l2_present;
2018 static int sh_l1_present;
2019 char * sh_check_name;
2020 int shadow_status_noswap;
2022 #define v2m(_v, _adr) ({ \
2023 unsigned long _a = (unsigned long)(_adr); \
2024 l2_pgentry_t _pde = shadow_linear_l2_table(_v)[l2_table_offset(_a)]; \
2025 unsigned long _pa = -1; \
2026 if ( l2e_get_flags(_pde) & _PAGE_PRESENT ) \
2027 { \
2028 l1_pgentry_t _pte; \
2029 _pte = shadow_linear_pg_table[l1_linear_offset(_a)]; \
2030 if ( l1e_get_flags(_pte) & _PAGE_PRESENT ) \
2031 _pa = l1e_get_paddr(_pte); \
2032 } \
2033 _pa | (_a & ~PAGE_MASK); \
2034 })
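/*
 * [Editor's note: illustrative sketch, not part of the original shadow.c.]
 * v2m() above walks the shadow linear page tables to turn a virtual address
 * into a machine address.  The index arithmetic behind such a walk for a
 * two-level (non-PAE, 4KB page) x86 table is reproduced below; it is purely
 * the address decomposition, with no real page tables involved.
 */
#include <stdio.h>

int main(void)
{
    unsigned long va = 0xC0ABC123UL;           /* arbitrary example address */
    unsigned int l2_idx = (va >> 22) & 0x3FF;  /* bits 31..22: L2 (PDE) index */
    unsigned int l1_idx = (va >> 12) & 0x3FF;  /* bits 21..12: L1 (PTE) index */
    unsigned int offset = va & 0xFFF;          /* bits 11..0 : byte within page */

    printf("va=%#lx -> l2=%u l1=%u offset=%#x\n", va, l2_idx, l1_idx, offset);
    /* A real walk would read pde = l2[l2_idx], then pte = l1[l1_idx], and
     * finally combine the frame from the pte with the page offset. */
    return 0;
}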
2036 #define FAIL(_f, _a...) \
2037 do { \
2038 printk("XXX %s-FAIL (%d,%d,%d) " _f " at %s(%d)\n", \
2039 sh_check_name, level, l2_idx, l1_idx, ## _a, \
2040 __FILE__, __LINE__); \
2041 printk("guest_pte=%" PRIpte " eff_guest_pte=%" PRIpte \
2042 " shadow_pte=%" PRIpte " snapshot_pte=%" PRIpte \
2043 " &guest=%p &shadow=%p &snap=%p v2m(&guest)=%p" \
2044 " v2m(&shadow)=%p v2m(&snap)=%p ea=%08x\n", \
2045 l1e_get_intpte(guest_pte), l1e_get_intpte(eff_guest_pte), \
2046 l1e_get_intpte(shadow_pte), l1e_get_intpte(snapshot_pte), \
2047 p_guest_pte, p_shadow_pte, p_snapshot_pte, \
2048 (void *)v2m(v, p_guest_pte), (void *)v2m(v, p_shadow_pte), \
2049 (void *)v2m(v, p_snapshot_pte), \
2050 (l2_idx << L2_PAGETABLE_SHIFT) | \
2051 (l1_idx << L1_PAGETABLE_SHIFT)); \
2052 errors++; \
2053 } while ( 0 )
2055 static int check_pte(
2056 struct vcpu *v,
2057 l1_pgentry_t *p_guest_pte,
2058 l1_pgentry_t *p_shadow_pte,
2059 l1_pgentry_t *p_snapshot_pte,
2060 int level, int l2_idx, int l1_idx)
2062 struct domain *d = v->domain;
2063 l1_pgentry_t guest_pte = *p_guest_pte;
2064 l1_pgentry_t shadow_pte = *p_shadow_pte;
2065 l1_pgentry_t snapshot_pte = p_snapshot_pte ? *p_snapshot_pte : l1e_empty();
2066 l1_pgentry_t eff_guest_pte;
2067 unsigned long mask, eff_guest_pfn, eff_guest_mfn, shadow_mfn;
2068 int errors = 0, guest_writable;
2069 int page_table_page;
2071 if ( (l1e_get_intpte(shadow_pte) == 0) ||
2072 (l1e_get_intpte(shadow_pte) == 0xdeadface) ||
2073 (l1e_get_intpte(shadow_pte) == 0x00000E00) )
2074 return errors; /* always safe */
2076 if ( !(l1e_get_flags(shadow_pte) & _PAGE_PRESENT) )
2077 FAIL("Non-zero but non-present shadow_pte");
2079 if ( level == 2 ) sh_l2_present++;
2080 if ( level == 1 ) sh_l1_present++;
2082 if ( (l1e_get_flags(shadow_pte) & SHADOW_REFLECTS_SNAPSHOT) && p_snapshot_pte )
2083 eff_guest_pte = snapshot_pte;
2084 else
2085 eff_guest_pte = guest_pte;
2087 if ( !(l1e_get_flags(eff_guest_pte) & _PAGE_PRESENT) )
2088 FAIL("Guest not present yet shadow is");
2090 mask = ~(_PAGE_GLOBAL|_PAGE_DIRTY|_PAGE_ACCESSED|_PAGE_RW|_PAGE_AVAIL|PAGE_MASK);
2092 if ( ((l1e_get_intpte(shadow_pte) & mask) != (l1e_get_intpte(eff_guest_pte) & mask)) )
2093 FAIL("Corrupt?");
2095 if ( (level == 1) &&
2096 (l1e_get_flags(shadow_pte) & _PAGE_DIRTY) &&
2097 !(l1e_get_flags(eff_guest_pte) & _PAGE_DIRTY) )
2098 FAIL("Dirty coherence");
2100 if ( (l1e_get_flags(shadow_pte) & _PAGE_ACCESSED) &&
2101 !(l1e_get_flags(eff_guest_pte) & _PAGE_ACCESSED) )
2102 FAIL("Accessed coherence");
2104 if ( l1e_get_flags(shadow_pte) & _PAGE_GLOBAL )
2105 FAIL("global bit set in shadow");
2107 eff_guest_pfn = l1e_get_pfn(eff_guest_pte);
2108 eff_guest_mfn = __gpfn_to_mfn(d, eff_guest_pfn);
2109 shadow_mfn = l1e_get_pfn(shadow_pte);
2111 if ( !VALID_MFN(eff_guest_mfn) && !shadow_mode_refcounts(d) )
2112 FAIL("%s: invalid eff_guest_pfn=%lx eff_guest_pte=%" PRIpte "\n",
2113 __func__, eff_guest_pfn, l1e_get_intpte(eff_guest_pte));
2115 page_table_page = mfn_is_page_table(eff_guest_mfn);
2117 guest_writable =
2118 (l1e_get_flags(eff_guest_pte) & _PAGE_RW) ||
2119 (VM_ASSIST(d, VMASST_TYPE_writable_pagetables) && (level == 1) && mfn_out_of_sync(eff_guest_mfn));
2121 if ( (l1e_get_flags(shadow_pte) & _PAGE_RW ) && !guest_writable )
2123 printk("eff_guest_pfn=%lx eff_guest_mfn=%lx shadow_mfn=%lx t=0x%08x page_table_page=%d\n",
2124 eff_guest_pfn, eff_guest_mfn, shadow_mfn,
2125 frame_table[eff_guest_mfn].u.inuse.type_info,
2126 page_table_page);
2127 FAIL("RW coherence");
2130 if ( (level == 1) &&
2131 (l1e_get_flags(shadow_pte) & _PAGE_RW ) &&
2132 !(guest_writable && (l1e_get_flags(eff_guest_pte) & _PAGE_DIRTY)) )
2134 printk("eff_guest_pfn=%lx eff_guest_mfn=%lx shadow_mfn=%lx t=0x%08x page_table_page=%d\n",
2135 eff_guest_pfn, eff_guest_mfn, shadow_mfn,
2136 frame_table[eff_guest_mfn].u.inuse.type_info,
2137 page_table_page);
2138 FAIL("RW2 coherence");
2141 if ( eff_guest_mfn == shadow_mfn )
2143 if ( level > 1 )
2144 FAIL("Linear map ???"); /* XXX this will fail on BSD */
2146 else
2148 if ( level < 2 )
2149 FAIL("Shadow in L1 entry?");
2151 if ( level == 2 )
2153 if ( __shadow_status(d, eff_guest_pfn, PGT_l1_shadow) != shadow_mfn )
2154 FAIL("shadow_mfn problem eff_guest_pfn=%lx shadow_mfn=%lx", eff_guest_pfn,
2155 __shadow_status(d, eff_guest_pfn, PGT_l1_shadow));
2157 else
2158 BUG(); // XXX -- not handled yet.
2161 return errors;
2163 #undef FAIL
2164 #undef v2m
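/*
 * [Editor's note: illustrative sketch, not part of the original shadow.c.]
 * check_pte() compares shadow and guest PTEs only on the flag bits that must
 * match, masking out the frame number plus the bits the shadow code is
 * allowed to manage differently (RW, ACCESSED, DIRTY, GLOBAL and the
 * software-available bits).  The standalone fragment below builds such a
 * mask from the standard x86 PTE bit positions and applies it to two
 * made-up PTE values.
 */
#include <stdint.h>
#include <stdio.h>

#define PTE_PRESENT   (1ULL << 0)
#define PTE_RW        (1ULL << 1)
#define PTE_USER      (1ULL << 2)
#define PTE_ACCESSED  (1ULL << 5)
#define PTE_DIRTY     (1ULL << 6)
#define PTE_GLOBAL    (1ULL << 8)
#define PTE_AVAIL     (7ULL << 9)            /* software bits 9-11 */
#define FRAME_MASK    (~0xFFFULL)            /* bits 12 and up: frame number */

int main(void)
{
    uint64_t ignore = PTE_RW | PTE_ACCESSED | PTE_DIRTY | PTE_GLOBAL |
                      PTE_AVAIL | FRAME_MASK;
    uint64_t must_match = ~ignore;           /* PRESENT, USER, PWT, PCD, ... */

    uint64_t guest  = 0x1000 | PTE_PRESENT | PTE_USER | PTE_RW | PTE_DIRTY;
    uint64_t shadow = 0x5000 | PTE_PRESENT | PTE_USER;  /* RW dropped, other frame */

    printf("coherent on must-match bits: %s\n",
           ((guest & must_match) == (shadow & must_match)) ? "yes" : "no");
    return 0;
}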
2166 static int check_l1_table(
2167 struct vcpu *v, unsigned long gpfn,
2168 unsigned long gmfn, unsigned long smfn, unsigned l2_idx)
2170 struct domain *d = v->domain;
2171 int i;
2172 unsigned long snapshot_mfn;
2173 l1_pgentry_t *p_guest, *p_shadow, *p_snapshot = NULL;
2174 int errors = 0;
2176 if ( page_out_of_sync(pfn_to_page(gmfn)) )
2178 snapshot_mfn = __shadow_status(d, gpfn, PGT_snapshot);
2179 ASSERT(snapshot_mfn);
2180 p_snapshot = map_domain_page(snapshot_mfn);
2183 p_guest = map_domain_page(gmfn);
2184 p_shadow = map_domain_page(smfn);
2186 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
2187 errors += check_pte(v, p_guest+i, p_shadow+i,
2188 p_snapshot ? p_snapshot+i : NULL,
2189 1, l2_idx, i);
2191 unmap_domain_page(p_shadow);
2192 unmap_domain_page(p_guest);
2193 if ( p_snapshot )
2194 unmap_domain_page(p_snapshot);
2196 return errors;
2199 #define FAILPT(_f, _a...) \
2200 do { \
2201 printk("XXX FAIL %s-PT " _f "\n", sh_check_name, ## _a ); \
2202 errors++; \
2203 } while ( 0 )
2205 static int check_l2_table(
2206 struct vcpu *v, unsigned long gmfn, unsigned long smfn, int oos_pdes)
2208 struct domain *d = v->domain;
2209 l2_pgentry_t *gpl2e = (l2_pgentry_t *)map_domain_page(gmfn);
2210 l2_pgentry_t *spl2e = (l2_pgentry_t *)map_domain_page(smfn);
2211 l2_pgentry_t match;
2212 int i;
2213 int errors = 0;
2214 int limit;
2216 if ( !oos_pdes && (page_get_owner(pfn_to_page(gmfn)) != d) )
2217 FAILPT("domain doesn't own page");
2218 if ( oos_pdes && (page_get_owner(pfn_to_page(gmfn)) != NULL) )
2219 FAILPT("bogus owner for snapshot page");
2220 if ( page_get_owner(pfn_to_page(smfn)) != NULL )
2221 FAILPT("shadow page mfn=0x%lx is owned by someone, domid=%d",
2222 smfn, page_get_owner(pfn_to_page(smfn))->domain_id);
2224 #if 0
2225 if ( memcmp(&spl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
2226 &gpl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
2227 ((SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT) -
2228 DOMAIN_ENTRIES_PER_L2_PAGETABLE) * sizeof(l2_pgentry_t)) )
2230 for ( i = DOMAIN_ENTRIES_PER_L2_PAGETABLE;
2231 i < (SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT);
2232 i++ )
2233 printk("+++ (%d) %lx %lx\n",i,
2234 l2_pgentry_val(gpl2e[i]), l2_pgentry_val(spl2e[i]));
2235 FAILPT("hypervisor entries inconsistent");
2238 if ( (l2_pgentry_val(spl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT]) !=
2239 l2_pgentry_val(gpl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT])) )
2240 FAILPT("hypervisor linear map inconsistent");
2241 #endif
2243 match = l2e_from_pfn(smfn, __PAGE_HYPERVISOR);
2244 if ( !shadow_mode_external(d) &&
2245 l2e_has_changed(spl2e[SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT],
2246 match, PAGE_FLAG_MASK))
2248 FAILPT("hypervisor shadow linear map inconsistent %" PRIpte " %" PRIpte,
2249 l2e_get_intpte(spl2e[SH_LINEAR_PT_VIRT_START >>
2250 L2_PAGETABLE_SHIFT]),
2251 l2e_get_intpte(match));
2254 match = l2e_from_paddr(__pa(d->arch.mm_perdomain_pt), __PAGE_HYPERVISOR);
2255 if ( !shadow_mode_external(d) &&
2256 l2e_has_changed(spl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT],
2257 match, PAGE_FLAG_MASK))
2259 FAILPT("hypervisor per-domain map inconsistent saw %" PRIpte ", expected (va=%p) %" PRIpte,
2260 l2e_get_intpte(spl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT]),
2261 d->arch.mm_perdomain_pt,
2262 l2e_get_intpte(match));
2265 #ifdef __i386__
2266 if ( shadow_mode_external(d) )
2267 limit = L2_PAGETABLE_ENTRIES;
2268 else
2269 limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE;
2270 #else
2271 limit = 0; /* XXX x86/64 XXX */
2272 #endif
2274 /* Check the whole L2. */
2275 for ( i = 0; i < limit; i++ )
2276 errors += check_pte(v,
2277 (l1_pgentry_t*)(&gpl2e[i]), /* Hmm, dirty ... */
2278 (l1_pgentry_t*)(&spl2e[i]),
2279 NULL,
2280 2, i, 0);
2282 unmap_domain_page(spl2e);
2283 unmap_domain_page(gpl2e);
2285 #if 1
2286 if ( errors )
2287 printk("check_l2_table returning %d errors\n", errors);
2288 #endif
2290 return errors;
2292 #undef FAILPT
2294 static int _check_pagetable(struct vcpu *v, char *s)
2296 struct domain *d = v->domain;
2297 #if defined (__x86_64__)
2298 pagetable_t pt = ((v->arch.flags & TF_kernel_mode)?
2299 pagetable_get_pfn(v->arch.guest_table) :
2300 pagetable_get_pfn(v->arch.guest_table_user));
2301 #else
2302 pagetable_t pt = v->arch.guest_table;
2303 #endif
2304 unsigned long gptbase = pagetable_get_paddr(pt);
2305 unsigned long ptbase_pfn, smfn;
2306 unsigned long i;
2307 l2_pgentry_t *gpl2e, *spl2e;
2308 unsigned long ptbase_mfn = 0;
2309 int errors = 0, limit, oos_pdes = 0;
2311 //_audit_domain(d, AUDIT_QUIET);
2312 shadow_lock(d);
2314 sh_check_name = s;
2315 //SH_VVLOG("%s-PT Audit", s);
2316 sh_l2_present = sh_l1_present = 0;
2317 perfc_incrc(check_pagetable);
2319 ptbase_mfn = gptbase >> PAGE_SHIFT;
2320 ptbase_pfn = __mfn_to_gpfn(d, ptbase_mfn);
2322 if ( !(smfn = __shadow_status(d, ptbase_pfn, PGT_base_page_table)) )
2324 printk("%s-PT %lx not shadowed\n", s, gptbase);
2325 goto out;
2327 if ( page_out_of_sync(pfn_to_page(ptbase_mfn)) )
2329 ptbase_mfn = __shadow_status(d, ptbase_pfn, PGT_snapshot);
2330 oos_pdes = 1;
2331 ASSERT(ptbase_mfn);
2334 errors += check_l2_table(v, ptbase_mfn, smfn, oos_pdes);
2336 gpl2e = (l2_pgentry_t *) map_domain_page(ptbase_mfn);
2337 spl2e = (l2_pgentry_t *) map_domain_page(smfn);
2339 /* Go back and recurse. */
2340 #ifdef __i386__
2341 if ( shadow_mode_external(d) )
2342 limit = L2_PAGETABLE_ENTRIES;
2343 else
2344 limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE;
2345 #else
2346 limit = 0; /* XXX x86/64 XXX */
2347 #endif
2349 for ( i = 0; i < limit; i++ )
2351 unsigned long gl1pfn = l2e_get_pfn(gpl2e[i]);
2352 unsigned long gl1mfn = __gpfn_to_mfn(d, gl1pfn);
2353 unsigned long sl1mfn = l2e_get_pfn(spl2e[i]);
2355 if ( l2e_get_intpte(spl2e[i]) != 0 ) /* FIXME: check flags? */
2357 errors += check_l1_table(v, gl1pfn, gl1mfn, sl1mfn, i);
2361 unmap_domain_page(spl2e);
2362 unmap_domain_page(gpl2e);
2364 #if 0
2365 SH_VVLOG("PT verified : l2_present = %d, l1_present = %d",
2366 sh_l2_present, sh_l1_present);
2367 #endif
2369 out:
2370 if ( errors )
2371 BUG();
2373 shadow_unlock(d);
2375 return errors;
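/*
 * [Editor's note: illustrative sketch, not part of the original shadow.c.]
 * _check_pagetable() keys its shadow lookups on guest page frame numbers, so
 * it first converts the page-table base machine frame back to a gpfn via
 * __mfn_to_gpfn.  In translated mode that amounts to a pair of lookup tables
 * maintained in both directions; the toy tables below (demo_p2m / demo_m2p,
 * hypothetical names) show the round trip for a four-page domain.
 */
#include <stdio.h>

#define DEMO_PAGES 4

static unsigned long demo_p2m[DEMO_PAGES] = { 100, 37, 251, 9 }; /* gpfn -> mfn */
static unsigned long demo_m2p[256];                              /* mfn -> gpfn */

int main(void)
{
    for (unsigned long gpfn = 0; gpfn < DEMO_PAGES; gpfn++)
        demo_m2p[demo_p2m[gpfn]] = gpfn;

    unsigned long cr3_mfn = 251;                /* machine frame of the PT base */
    unsigned long gpfn    = demo_m2p[cr3_mfn];  /* what the guest calls it */

    printf("mfn %lu is gpfn %lu; forward map agrees: %s\n",
           cr3_mfn, gpfn, demo_p2m[gpfn] == cr3_mfn ? "yes" : "no");
    return 0;
}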
2378 int _check_all_pagetables(struct vcpu *v, char *s)
2380 struct domain *d = v->domain;
2381 int i;
2382 struct shadow_status *a;
2383 unsigned long gmfn;
2384 int errors = 0;
2386 shadow_status_noswap = 1;
2388 sh_check_name = s;
2389 SH_VVLOG("%s-PT Audit domid=%d", s, d->domain_id);
2390 sh_l2_present = sh_l1_present = 0;
2391 perfc_incrc(check_all_pagetables);
2393 for (i = 0; i < shadow_ht_buckets; i++)
2395 a = &d->arch.shadow_ht[i];
2396 while ( a && a->gpfn_and_flags )
2398 gmfn = __gpfn_to_mfn(d, a->gpfn_and_flags & PGT_mfn_mask);
2400 switch ( a->gpfn_and_flags & PGT_type_mask )
2402 case PGT_l1_shadow:
2403 errors += check_l1_table(v, a->gpfn_and_flags & PGT_mfn_mask,
2404 gmfn, a->smfn, 0);
2405 break;
2406 case PGT_l2_shadow:
2407 errors += check_l2_table(v, gmfn, a->smfn,
2408 page_out_of_sync(pfn_to_page(gmfn)));
2409 break;
2410 case PGT_l3_shadow:
2411 case PGT_l4_shadow:
2412 case PGT_hl2_shadow:
2413 BUG(); // XXX - ought to fix this...
2414 break;
2415 case PGT_snapshot:
2416 case PGT_writable_pred:
2417 break;
2418 default:
2419 errors++;
2420 printk("unexpected shadow type %lx, gpfn=%lx, "
2421 "gmfn=%lx smfn=%lx\n",
2422 a->gpfn_and_flags & PGT_type_mask,
2423 a->gpfn_and_flags & PGT_mfn_mask,
2424 gmfn, a->smfn);
2425 BUG();
2427 a = a->next;
2431 shadow_status_noswap = 0;
2433 if ( errors )
2434 BUG();
2436 return errors;
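/*
 * [Editor's note: illustrative sketch, not part of the original shadow.c.]
 * _check_all_pagetables() visits every shadow by walking the per-domain
 * shadow status hash table: an array of buckets, each heading a chain of
 * (gpfn+type, smfn) entries linked through ->next.  The fragment below walks
 * the same kind of structure with hypothetical demo_* names.
 */
#include <stddef.h>
#include <stdio.h>

struct demo_entry {
    unsigned long key;                 /* gpfn plus type bits in shadow.c */
    unsigned long smfn;
    struct demo_entry *next;
};

#define DEMO_BUCKETS 4

int main(void)
{
    struct demo_entry e2 = { .key = 0x42, .smfn = 7, .next = NULL };
    struct demo_entry e1 = { .key = 0x12, .smfn = 3, .next = &e2 };
    struct demo_entry *demo_ht[DEMO_BUCKETS] = { NULL, &e1, NULL, NULL };

    for (int i = 0; i < DEMO_BUCKETS; i++)
        for (struct demo_entry *a = demo_ht[i]; a != NULL; a = a->next)
            printf("bucket %d: key=%#lx smfn=%lu\n", i, a->key, a->smfn);
    return 0;
}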
2439 #endif // SHADOW_DEBUG
2441 #if CONFIG_PAGING_LEVELS == 3
2442 static unsigned long shadow_l3_table(
2443 struct domain *d, unsigned long gpfn, unsigned long gmfn)
2445 BUG(); /* not implemented yet */
2446 return 42;
2448 #endif
2450 #if CONFIG_PAGING_LEVELS >= 4
2451 /****************************************************************************/
2452 /* 64-bit shadow-mode code testing */
2453 /****************************************************************************/
2455 static unsigned long shadow_l4_table(
2456 struct domain *d, unsigned long gpfn, unsigned long gmfn)
2458 unsigned long smfn;
2459 l4_pgentry_t *spl4e;
2461 SH_VVLOG("shadow_l4_table(gpfn=%lx, gmfn=%lx)", gpfn, gmfn);
2463 perfc_incrc(shadow_l4_table_count);
2465 if ( unlikely(!(smfn = alloc_shadow_page(d, gpfn, gmfn, PGT_l4_shadow))) )
2467 printk("Couldn't alloc an L4 shadow for pfn=%lx mfn=%lx\n", gpfn, gmfn);
2468 BUG(); /* XXX Deal gracefully with failure. */
2471 spl4e = (l4_pgentry_t *)map_domain_page(smfn);
2472 /* Install hypervisor and 4x linear p.t. mappings. */
2473 if ( (PGT_base_page_table == PGT_l4_page_table) &&
2474 !shadow_mode_external(d) )
2476 /*
2477 * We could proactively fill in PDEs for pages that are already
2478 * shadowed *and* where the guest PDE has _PAGE_ACCESSED set
2479 * (restriction required for coherence of the accessed bit). However,
2480 * we tried it and it didn't help performance. This is simpler.
2481 */
2482 memset(spl4e, 0, L4_PAGETABLE_ENTRIES*sizeof(l4_pgentry_t));
2484 /* Install hypervisor and 2x linear p.t. mappings. */
2485 memcpy(&spl4e[ROOT_PAGETABLE_FIRST_XEN_SLOT],
2486 &idle_pg_table[ROOT_PAGETABLE_FIRST_XEN_SLOT],
2487 ROOT_PAGETABLE_XEN_SLOTS * sizeof(l4_pgentry_t));
2489 spl4e[l4_table_offset(PERDOMAIN_VIRT_START)] =
2490 l4e_from_paddr(__pa(page_get_owner(&frame_table[gmfn])->arch.mm_perdomain_l3),
2491 __PAGE_HYPERVISOR);
2493 if ( shadow_mode_translate(d) ) // NB: not external
2495 spl4e[l4_table_offset(RO_MPT_VIRT_START)] =
2496 l4e_from_paddr(pagetable_get_paddr(d->arch.phys_table),
2497 __PAGE_HYPERVISOR);
2499 else
2500 spl4e[l4_table_offset(LINEAR_PT_VIRT_START)] =
2501 l4e_from_pfn(gmfn, __PAGE_HYPERVISOR);
2503 } else
2504 memset(spl4e, 0, L4_PAGETABLE_ENTRIES*sizeof(l4_pgentry_t));
2506 unmap_domain_page(spl4e);
2508 ESH_LOG("shadow_l4_table(%lx -> %lx)", gmfn, smfn);
2509 return smfn;
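/*
 * [Editor's note: illustrative sketch, not part of the original shadow.c.]
 * shadow_l4_table() builds the root of a 4-level shadow.  The slot macros it
 * relies on (l4_table_offset() and friends) decompose a 48-bit virtual
 * address into four 9-bit indices plus a 12-bit page offset; that arithmetic,
 * and nothing else, is reproduced below.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint64_t va = 0x00007F8012345678ULL;    /* arbitrary canonical address */

    unsigned int l4  = (va >> 39) & 0x1FF;  /* 512 L4 slots */
    unsigned int l3  = (va >> 30) & 0x1FF;
    unsigned int l2  = (va >> 21) & 0x1FF;
    unsigned int l1  = (va >> 12) & 0x1FF;
    unsigned int off = va & 0xFFF;

    printf("va=%#llx -> l4=%u l3=%u l2=%u l1=%u offset=%#x\n",
           (unsigned long long)va, l4, l3, l2, l1, off);
    return 0;
}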
2512 /*
2513 * This variant of shadow_mark_va_out_of_sync() is for 2M page shadows.
2514 */
2515 static void shadow_mark_va_out_of_sync_2mp(
2516 struct vcpu *v, unsigned long gpfn, unsigned long mfn, unsigned long writable_pl1e)
2518 struct out_of_sync_entry *entry =
2519 shadow_mark_mfn_out_of_sync(v, gpfn, mfn);
2521 entry->writable_pl1e = writable_pl1e;
2522 ESH_LOG("<shadow_mark_va_out_of_sync_2mp> gpfn = %lx\n", gpfn);
2523 if ( !get_shadow_ref(writable_pl1e >> L1_PAGETABLE_SHIFT) )
2524 BUG();
2528 static int get_shadow_mfn(struct domain *d, unsigned long gpfn, unsigned long *spmfn, u32 flag)
2530 unsigned long gmfn;
2531 if ( !(*spmfn = __shadow_status(d, gpfn, flag)) )
2533 /* This is NOT already shadowed so we need to shadow it. */
2534 SH_VVLOG("<get_shadow_mfn>: not shadowed");
2536 gmfn = __gpfn_to_mfn(d, gpfn);
2537 if ( unlikely(!VALID_MFN(gmfn)) )
2539 // Attempt to use an invalid pfn as a shadow page.
2540 // XXX this needs to be more graceful!
2541 BUG();
2544 if ( unlikely(!(*spmfn =
2545 alloc_shadow_page(d, gpfn, gmfn, flag))) )
2547 printk("<get_shadow_mfn> Couldn't alloc a shadow for pfn=%lx mfn=%lx\n", gpfn, gmfn);
2548 BUG(); /* XXX Need to deal gracefully with failure. */
2550 switch(flag) {
2551 case PGT_l1_shadow:
2552 perfc_incrc(shadow_l1_table_count);
2553 break;
2554 case PGT_l2_shadow:
2555 perfc_incrc(shadow_l2_table_count);
2556 break;
2557 case PGT_l3_shadow:
2558 perfc_incrc(shadow_l3_table_count);
2559 break;
2560 case PGT_hl2_shadow:
2561 perfc_incrc(shadow_hl2_table_count);
2562 break;
2565 return 1;
2566 } else {
2567 /* This L1 is shadowed already, but the L2 entry is missing. */
2568 SH_VVLOG("4b: was shadowed, l2 missing (%lx)", *spmfn);
2569 return 0;
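/*
 * [Editor's note: illustrative sketch, not part of the original shadow.c.]
 * get_shadow_mfn() is a lookup-or-allocate helper: it returns 1 when it had
 * to create a new shadow and 0 when the shadow already existed (so only the
 * missing upper-level entry needs fixing up).  The toy demo_get_or_create()
 * below follows the same return convention over a small array-backed cache.
 */
#include <stdio.h>

#define DEMO_SLOTS 8

static unsigned long demo_cache[DEMO_SLOTS];   /* 0 means "no shadow yet" */
static unsigned long demo_next_mfn = 100;

/* Returns 1 if a new entry was allocated, 0 if one was already present. */
static int demo_get_or_create(unsigned int gpfn, unsigned long *smfn)
{
    if (demo_cache[gpfn] != 0) {
        *smfn = demo_cache[gpfn];
        return 0;
    }
    demo_cache[gpfn] = demo_next_mfn++;
    *smfn = demo_cache[gpfn];
    return 1;
}

int main(void)
{
    unsigned long smfn;
    int created;

    created = demo_get_or_create(3, &smfn);
    printf("first lookup:  new=%d smfn=%lu\n", created, smfn);

    created = demo_get_or_create(3, &smfn);
    printf("second lookup: new=%d smfn=%lu\n", created, smfn);
    return 0;
}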
2573 static void shadow_map_into_current(struct vcpu *v,
2574 unsigned long va, unsigned int from, unsigned int to)
2576 pgentry_64_t gle, sle;
2577 unsigned long gpfn, smfn;
2579 if (from == L1 && to == L2) {
2580 shadow_map_l1_into_current_l2(va);
2581 return;
2584 __rw_entry(v, va, &gle, GUEST_ENTRY | GET_ENTRY | to);
2585 ASSERT(entry_get_flags(gle) & _PAGE_PRESENT);
2586 gpfn = entry_get_pfn(gle);
2588 get_shadow_mfn(v->domain, gpfn, &smfn, shadow_level_to_type(from));
2590 if ( !get_shadow_ref(smfn) )
2591 BUG();
2592 entry_general(v->domain, &gle, &sle, smfn, to);
2593 __rw_entry(v, va, &gle, GUEST_ENTRY | SET_ENTRY | to);
2594 __rw_entry(v, va, &sle, SHADOW_ENTRY | SET_ENTRY | to);
2597 /*
2598 * shadow_set_lxe should be put in shadow.h
2599 */
2600 static void shadow_set_l2e_64(unsigned long va, l2_pgentry_t sl2e,
2601 int create_l2_shadow, int put_ref_check)
2603 struct vcpu *v = current;
2604 l4_pgentry_t sl4e;
2605 l3_pgentry_t sl3e;
2607 __shadow_get_l4e(v, va, &sl4e);
2608 if (!(l4e_get_flags(sl4e) & _PAGE_PRESENT)) {
2609 if (create_l2_shadow) {
2610 perfc_incrc(shadow_set_l3e_force_map);
2611 shadow_map_into_current(v, va, L3, L4);
2612 __shadow_get_l4e(v, va, &sl4e);
2613 } else {
2614 printk("For non-VMX shadow, create_l2_shadow:%d\n", create_l2_shadow);
2618 __shadow_get_l3e(v, va, &sl3e);
2619 if (!(l3e_get_flags(sl3e) & _PAGE_PRESENT)) {
2620 if (create_l2_shadow) {
2621 perfc_incrc(shadow_set_l2e_force_map);
2622 shadow_map_into_current(v, va, L2, L3);
2623 __shadow_get_l3e(v, va, &sl3e);
2624 } else {
2625 printk("For non-VMX shadow, create_l2_shadow:%d\n", create_l2_shadow);
2627 shadow_update_min_max(l4e_get_pfn(sl4e), l3_table_offset(va));
2631 if ( put_ref_check ) {
2632 l2_pgentry_t tmp_sl2e;
2633 if ( __shadow_get_l2e(v, va, &tmp_sl2e) ) {
2634 if ( l2e_get_flags(tmp_sl2e) & _PAGE_PRESENT )
2635 if ( l2e_get_pfn(tmp_sl2e) == l2e_get_pfn(sl2e) ) {
2636 put_shadow_ref(l2e_get_pfn(sl2e));
2642 if (! __shadow_set_l2e(v, va, &sl2e))
2643 BUG();
2644 shadow_update_min_max(l3e_get_pfn(sl3e), l2_table_offset(va));
2648 static void shadow_set_l1e_64(unsigned long va, pgentry_64_t *sl1e_p,
2649 int create_l1_shadow)
2651 struct vcpu *v = current;
2652 struct domain *d = v->domain;
2653 pgentry_64_t sle;
2654 pgentry_64_t sle_up = {0};
2655 l1_pgentry_t old_spte;
2656 l1_pgentry_t sl1e = *(l1_pgentry_t *)sl1e_p;
2657 int i;
2659 for (i = L4; i >= L2; i--) {
2660 if (!__rw_entry(v, va, &sle, SHADOW_ENTRY | GET_ENTRY | i)) {
2661 printk("<%s> i = %d\n", __func__, i);
2662 BUG();
2664 if (!(entry_get_flags(sle) & _PAGE_PRESENT)) {
2665 if (create_l1_shadow) {
2666 perfc_incrc(shadow_set_l3e_force_map);
2667 shadow_map_into_current(v, va, i-1, i);
2668 __rw_entry(v, va, &sle, SHADOW_ENTRY | GET_ENTRY | i);
2669 } else {
2670 #if 0
2671 printk("For non VMX shadow, create_l1_shadow:%d\n", create_l1_shadow);
2672 #endif
2675 if(i < L4)
2676 shadow_update_min_max(entry_get_pfn(sle_up), table_offset_64(va, i));
2677 sle_up = sle;
2680 if ( shadow_mode_refcounts(d) )
2682 __shadow_get_l1e(v, va, &old_spte);
2683 ESH_LOG("old_sl1e: %lx, new_sl1e: %lx\n", l1e_get_intpte(old_spte), l1e_get_intpte(sl1e));
2684 if ( l1e_has_changed(old_spte, sl1e, _PAGE_RW | _PAGE_PRESENT) )
2686 if ( (l1e_get_flags(sl1e) & _PAGE_PRESENT) &&
2687 !shadow_get_page_from_l1e(sl1e, d) )
2688 sl1e = l1e_empty();
2689 if ( l1e_get_flags(old_spte) & _PAGE_PRESENT )
2690 put_page_from_l1e(old_spte, d);
2694 __shadow_set_l1e(v, va, &sl1e);
2695 shadow_update_min_max(entry_get_pfn(sle_up), table_offset_64(va, L1));
2698 static inline int l2e_rw_fault(
2699 struct vcpu *v, l2_pgentry_t *gl2e_p, unsigned long va, int rw)
2701 struct domain *d = v->domain;
2702 l2_pgentry_t gl2e = *gl2e_p;
2703 l2_pgentry_t tmp_l2e = gl2e;
2704 unsigned long start_gpfn = l2e_get_pfn(gl2e);
2705 unsigned long gpfn, mfn;
2706 unsigned long l1_mfn, gmfn;
2707 l1_pgentry_t *l1_p;
2708 l1_pgentry_t sl1e;
2709 l1_pgentry_t old_sl1e;
2710 l2_pgentry_t sl2e;
2711 unsigned long nx = 0;
2712 int put_ref_check = 0;
2713 /* Check if gpfn is 2M aligned */
2715 /* Update guest l2e */
2716 if (rw) {
2717 ASSERT(l2e_get_flags(gl2e) & _PAGE_RW);
2718 l2e_add_flags(gl2e, _PAGE_DIRTY | _PAGE_ACCESSED);
2719 } else {
2720 l2e_add_flags(gl2e, _PAGE_ACCESSED);
2723 l2e_remove_flags(tmp_l2e, _PAGE_PSE);
2724 if (l2e_get_flags(gl2e) & _PAGE_NX) {
2725 l2e_remove_flags(tmp_l2e, _PAGE_NX);
2726 nx = 1UL << 63;
2730 /* Get the shadow l2 first */
2731 if ( !__shadow_get_l2e(v, va, &sl2e) )
2732 sl2e = l2e_empty();
2734 l1_mfn = ___shadow_status(d, start_gpfn | nx, PGT_fl1_shadow);
2736 /* Check the corresponding l2e */
2737 if (l1_mfn) {
2738 /* Why is it PRESENT? */
2739 if ((l2e_get_flags(sl2e) & _PAGE_PRESENT) &&
2740 l2e_get_pfn(sl2e) == l1_mfn) {
2741 ESH_LOG("sl2e PRESENT bit is set: %lx, l1_mfn = %lx\n", l2e_get_pfn(sl2e), l1_mfn);
2742 } else {
2743 put_ref_check = 1;
2744 if (!get_shadow_ref(l1_mfn))
2745 BUG();
2747 l1_p = (l1_pgentry_t *)map_domain_page(l1_mfn);
2748 sl2e = l2e_from_pfn(l1_mfn, l2e_get_flags(tmp_l2e));
2749 } else {
2750 /* Allocate a new page as a shadow page table if needed */
2751 gmfn = __gpfn_to_mfn(d, start_gpfn);
2752 l1_mfn = alloc_shadow_page(d, start_gpfn | nx, gmfn, PGT_fl1_shadow);
2753 if (unlikely(!l1_mfn)) {
2754 BUG();
2757 if (!get_shadow_ref(l1_mfn))
2758 BUG();
2759 l1_p = (l1_pgentry_t *)map_domain_page(l1_mfn);
2760 sl2e = l2e_from_pfn(l1_mfn, l2e_get_flags(tmp_l2e));
2761 memset(l1_p, 0, PAGE_SIZE);
2762 ESH_LOG("Alloc a shadow page: %lx\n", l1_mfn);
2765 ESH_LOG("<%s>: sl2e = %lx\n", __func__, l2e_get_intpte(sl2e));
2766 /* Map the page into the L2 */
2767 shadow_set_l2e_64(va, sl2e, 1, put_ref_check);
2769 if (l2e_get_flags(gl2e) & _PAGE_NX)
2770 l2e_add_flags(tmp_l2e, _PAGE_NX);
2772 /* Propagate into the shadow page table, i.e. set each sl1e */
2773 for (gpfn = start_gpfn;
2774 gpfn < (start_gpfn + L1_PAGETABLE_ENTRIES); gpfn++) {
2776 mfn = __gpfn_to_mfn(d, gpfn);
2778 if ( unlikely(!VALID_MFN(mfn)) )
2780 continue;
2783 sl1e = l1e_from_pfn(mfn, l2e_get_flags(tmp_l2e));
2785 if (!rw) {
2786 if ( shadow_mode_log_dirty(d) ||
2787 !(l2e_get_flags(gl2e) & _PAGE_DIRTY) || mfn_is_page_table(mfn) )
2789 l1e_remove_flags(sl1e, _PAGE_RW);
2791 } else {
2792 /* log dirty*/
2793 /*
2794 if ( shadow_mode_log_dirty(d) )
2795 __mark_dirty(d, gmfn);
2796 */
2798 // printk("<%s> gpfn: %lx, mfn: %lx, sl1e: %lx\n", __func__, gpfn, mfn, l1e_get_intpte(sl1e));
2799 /* The shadow entries need to be set up before shadow_mark_va_out_of_sync() */
2800 old_sl1e = l1_p[gpfn - start_gpfn];
2802 if ( l1e_has_changed(old_sl1e, sl1e, _PAGE_RW | _PAGE_PRESENT) )
2804 if ( (l1e_get_flags(sl1e) & _PAGE_PRESENT) &&
2805 !shadow_get_page_from_l1e(sl1e, d) ) {
2806 ESH_LOG("dropping sl1e %lx (mfn: %lx), start_gpfn: %lx, gpfn: %lx\n", l1e_get_intpte(sl1e), mfn, start_gpfn, gpfn);
2807 sl1e = l1e_empty();
2809 if ( l1e_get_flags(old_sl1e) & _PAGE_PRESENT )
2810 put_page_from_l1e(old_sl1e, d);
2813 l1_p[gpfn - start_gpfn] = sl1e;
2815 if (rw) {
2816 /* shadow_mark_va_out_of_sync() needs modification for 2M pages */
2817 if ( mfn_is_page_table(mfn) )
2818 shadow_mark_va_out_of_sync_2mp(v, gpfn, mfn,
2819 l2e_get_paddr(sl2e) | (sizeof(l1_pgentry_t) * (gpfn - start_gpfn)));
2823 unmap_domain_page(l1_p);
2824 return 1;
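/*
 * [Editor's note: illustrative sketch, not part of the original shadow.c.]
 * l2e_rw_fault() shadows a guest 2MB superpage with an ordinary L1 table:
 * one 4KB entry per frame covered by the superpage, 512 in total, each
 * inheriting the superpage's flags (minus PSE).  The arithmetic of that
 * split is shown below on plain integers; the flags and frame numbers are
 * made up for the example.
 */
#include <stdint.h>
#include <stdio.h>

#define ENTRIES_PER_2M  512            /* 2MB / 4KB */
#define DEMO_FLAGS      0x067ULL       /* PRESENT|RW|USER|ACCESSED|DIRTY */

int main(void)
{
    uint64_t start_pfn = 0x40000;      /* 2MB aligned: low 9 bits are zero */
    uint64_t l1[ENTRIES_PER_2M];

    for (unsigned int i = 0; i < ENTRIES_PER_2M; i++)
        l1[i] = ((start_pfn + i) << 12) | DEMO_FLAGS;

    printf("first entry %#llx, last entry %#llx\n",
           (unsigned long long)l1[0],
           (unsigned long long)l1[ENTRIES_PER_2M - 1]);
    return 0;
}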
2828 static int shadow_fault_64(unsigned long va, struct cpu_user_regs *regs)
2830 struct vcpu *v = current;
2831 struct domain *d = v->domain;
2832 l2_pgentry_t gl2e;
2833 l1_pgentry_t sl1e, gl1e;
2835 perfc_incrc(shadow_fault_calls);
2837 ESH_LOG("<shadow_fault_64> va=%lx, rip = %lx, error code = %x\n",
2838 va, regs->eip, regs->error_code);
2840 /*
2841 * Don't let someone else take the guest's table pages out-of-sync.
2842 */
2843 shadow_lock(d);
2845 /* XXX - FIX THIS COMMENT!!!
2846 * STEP 1. Check to see if this fault might have been caused by an
2847 * out-of-sync table page entry, or if we should pass this
2848 * fault onto the guest.
2849 */
2850 __shadow_sync_va(v, va);
2852 /*
2853 * STEP 2. Check whether the fault belongs to the guest
2854 */
2855 if ( guest_page_fault(
2856 v, va, regs->error_code,
2857 (pgentry_64_t *)&gl2e, (pgentry_64_t *)&gl1e) ) {
2858 goto fail;
2861 if ( unlikely(!(l2e_get_flags(gl2e) & _PAGE_PSE)) ) {
2862 /*
2863 * Handle 4K pages here
2864 */
2866 /* Write fault? */
2867 if ( regs->error_code & 2 ) {
2868 if ( !l1pte_write_fault(v, &gl1e, &sl1e, va) ) {
2869 goto fail;
2871 } else {
2872 l1pte_read_fault(d, &gl1e, &sl1e);
2874 /*
2875 * STEP 3. Write the guest/shadow l1e back
2876 */
2877 if (unlikely(!__guest_set_l1e(v, va, &gl1e))) {
2878 domain_crash_synchronous();
2881 ESH_LOG("gl1e: %lx, sl1e: %lx\n", l1e_get_intpte(gl1e), l1e_get_intpte(sl1e));
2882 shadow_set_l1e_64(va, (pgentry_64_t *)&sl1e, 1);
2883 /*
2884 * if necessary, record the page table page as dirty
2885 */
2886 if ( unlikely(shadow_mode_log_dirty(d)) )
2887 __mark_dirty(d, __gpfn_to_mfn(d, l2e_get_pfn(gl2e)));
2889 } else {
2890 /*
2891 * Handle 2M pages here
2892 */
2893 /* Write fault? */
2894 if ( regs->error_code & 2 ) {
2895 if ( !l2e_rw_fault(v, &gl2e, va, WRITE_FAULT) ) {
2896 goto fail;
2898 } else {
2899 l2e_rw_fault(v, &gl2e, va, READ_FAULT);
2902 /*
2903 * STEP 3. Write guest/shadow l2e back
2904 */
2906 if ( unlikely(!__guest_set_l2e(v, va, &gl2e)) ) {
2907 domain_crash_synchronous();
2910 /*
2911 * Todo: if necessary, record the page table page as dirty
2912 */
2917 perfc_incrc(shadow_fault_fixed);
2918 d->arch.shadow_fault_count++;
2920 shadow_unlock(d);
2922 return EXCRET_fault_fixed;
2923 fail:
2924 shadow_unlock(d);
2925 ESH_LOG("Passing fault back to the guest\n");
2926 return 0;
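/*
 * [Editor's note: illustrative sketch, not part of the original shadow.c.]
 * shadow_fault_64() branches on regs->error_code & 2 to distinguish write
 * faults from read faults.  The hardware-defined layout of the x86 page
 * fault error code is decoded below so that test reads less magically.
 */
#include <stdio.h>

static void demo_decode_pf_error(unsigned int ec)
{
    printf("error_code=%#x: %s, %s access, %s mode%s%s\n", ec,
           (ec & 1)  ? "protection violation" : "page not present",
           (ec & 2)  ? "write" : "read",
           (ec & 4)  ? "user" : "supervisor",
           (ec & 8)  ? ", reserved bit set in a paging entry" : "",
           (ec & 16) ? ", instruction fetch" : "");
}

int main(void)
{
    demo_decode_pf_error(0x2);   /* write to a not-present page */
    demo_decode_pf_error(0x7);   /* user-mode write protection violation */
    return 0;
}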
2929 static void shadow_invlpg_64(struct vcpu *v, unsigned long va)
2931 struct domain *d = v->domain;
2932 l1_pgentry_t sl1e, old_sl1e;
2934 shadow_lock(d);
2936 if ( __shadow_get_l1e(v, va, &old_sl1e) )
2937 if ( l1e_get_flags(old_sl1e) & _PAGE_PRESENT )
2938 put_page_from_l1e(old_sl1e, d);
2941 sl1e = l1e_empty();
2942 __shadow_set_l1e(v, va, &sl1e);
2944 shadow_unlock(d);
2947 #ifndef PGENTRY_32
2948 struct shadow_ops MODE_F_HANDLER = {
2949 .guest_paging_levels = 4,
2950 .invlpg = shadow_invlpg_64,
2951 .fault = shadow_fault_64,
2952 .update_pagetables = shadow_update_pagetables,
2953 .sync_all = sync_all,
2954 .remove_all_write_access = remove_all_write_access,
2955 .do_update_va_mapping = do_update_va_mapping,
2956 .mark_mfn_out_of_sync = mark_mfn_out_of_sync,
2957 .is_out_of_sync = is_out_of_sync,
2958 };
2959 #endif
2961 #endif
2963 /*
2964 * Local variables:
2965 * mode: C
2966 * c-set-style: "BSD"
2967 * c-basic-offset: 4
2968 * tab-width: 4
2969 * indent-tabs-mode: nil
2970 * End:
2971 */