ia64/xen-unstable

xen/arch/x86/shadow.c @ 8974:0349fb4de335

Clean up some vmx code.

Signed-off-by: Xin Li <xin.b.li@intel.com>
author kaf24@firebug.cl.cam.ac.uk
date Thu Feb 23 11:34:11 2006 +0100 (2006-02-23)
parents 8fb4392c1d87
children 6734682d2fd0
1 /******************************************************************************
2 * arch/x86/shadow.c
3 *
4 * Copyright (c) 2005 Michael A Fetterman
5 * Based on an earlier implementation by Ian Pratt et al
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
21 /*
22 * Jun Nakajima <jun.nakajima@intel.com>
23 * Chengyuan Li <chengyuan.li@intel.com>
24 *
25 * Extended to support 32-bit PAE and 64-bit guests.
26 */
28 #include <xen/config.h>
29 #include <xen/types.h>
30 #include <xen/mm.h>
31 #include <xen/domain_page.h>
32 #include <asm/shadow.h>
33 #include <asm/page.h>
34 #include <xen/event.h>
35 #include <xen/sched.h>
36 #include <xen/trace.h>
37 #include <asm/shadow_64.h>
39 /* Use this to have the compiler remove unnecessary branches */
40 #define SH_L1_HAS_NEXT_PAGE (GUEST_L1_PAGETABLE_ENTRIES - L1_PAGETABLE_ENTRIES)
42 extern void free_shadow_pages(struct domain *d);
44 #if 0 // this code has not been updated for 32-bit PAE & 64-bit modes
45 #if SHADOW_DEBUG
46 static void mark_shadows_as_reflecting_snapshot(struct domain *d, unsigned long gpfn);
47 #endif
48 #endif
50 #if CONFIG_PAGING_LEVELS == 3
51 static unsigned long shadow_l3_table(
52 struct vcpu *v, unsigned long gpfn, unsigned long gmfn);
53 #endif
55 #if CONFIG_PAGING_LEVELS == 4
56 static unsigned long shadow_l4_table(
57 struct vcpu *v, unsigned long gpfn, unsigned long gmfn);
58 #endif
60 #if CONFIG_PAGING_LEVELS >= 3
61 static void shadow_map_into_current(struct vcpu *v,
62 unsigned long va, unsigned int from, unsigned int to);
63 static inline void validate_bl2e_change( struct domain *d,
64 guest_root_pgentry_t *new_gle_p, pgentry_64_t *shadow_l3, int index);
65 static void update_top_level_shadow(struct vcpu *v, unsigned long smfn);
66 #endif
68 /********
70 There's a per-domain shadow table spin lock which works fine for SMP
71 hosts. We don't have to worry about interrupts as no shadow operations
72 happen in an interrupt context. It's probably not quite ready for SMP
73 guest operation as we have to worry about synchronisation between gpte
74 and spte updates. It's possible that this might only happen in a
75 hypercall context, in which case we'll probably have a per-domain
76 hypercall lock anyhow (at least initially).
78 ********/
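/*
 * Illustrative sketch (assumed from the callers later in this file, e.g.
 * shadow_invlpg_32) of the locking pattern the note above describes: every
 * shadow update runs with the per-domain shadow lock held, and resyncs the
 * affected va before touching gpte/spte state. The function name here is
 * hypothetical.
 */
static inline void example_locked_shadow_update(struct vcpu *v, unsigned long va)
{
    struct domain *d = v->domain;

    shadow_lock(d);                /* serialise against other CPUs    */
    __shadow_sync_va(v, va);       /* bring this mapping back in sync */
    ASSERT(shadow_lock_is_acquired(d));
    /* ... examine the guest pte and update the shadow pte here ... */
    shadow_unlock(d);
}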
80 static inline int
81 shadow_promote(struct domain *d, unsigned long gpfn, unsigned long gmfn,
82 unsigned long new_type)
83 {
84 struct page_info *page = mfn_to_page(gmfn);
85 int pinned = 0, okay = 1;
87 if ( page_out_of_sync(page) )
88 {
89 // Don't know how long ago this snapshot was taken.
90 // Can't trust it to be recent enough.
91 //
92 __shadow_sync_mfn(d, gmfn);
93 }
95 if ( !shadow_mode_refcounts(d) )
96 return 1;
98 if ( unlikely(page_is_page_table(page)) )
99 return 1;
101 FSH_LOG("%s: gpfn=%lx gmfn=%lx nt=%08lx", __func__, gpfn, gmfn, new_type);
103 if ( !shadow_remove_all_write_access(d, gpfn, gmfn) )
104 {
105 FSH_LOG("%s: couldn't find/remove all write accesses, gpfn=%lx gmfn=%lx",
106 __func__, gpfn, gmfn);
107 #if 1 || defined(LIVE_DANGEROUSLY)
108 set_bit(_PGC_page_table, &page->count_info);
109 return 1;
110 #endif
111 return 0;
112 }
114 // To convert this page to use as a page table, the writable count
115 // should now be zero. Test this by grabbing the page as a page table,
116 // and then immediately releasing. This will also deal with any
117 // necessary TLB flushing issues for us.
118 //
119 // The cruft here about pinning doesn't really work right. This
120 // needs rethinking/rewriting... Need to gracefully deal with the
121 // TLB flushes required when promoting a writable page, and also deal
122 // with any outstanding (external) writable refs to this page (by
123 // refusing to promote it). The pinning headache complicates this
124 // code -- it would all get much simpler if we stopped using
125 // shadow_lock() and moved the shadow code to BIGLOCK().
126 //
127 if ( unlikely(!get_page(page, d)) )
128 BUG(); // XXX -- needs more thought for a graceful failure
129 if ( unlikely(test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info)) )
130 {
131 pinned = 1;
132 put_page_and_type(page);
133 }
134 if ( get_page_type(page, PGT_base_page_table) )
135 {
136 set_bit(_PGC_page_table, &page->count_info);
137 put_page_type(page);
138 }
139 else
140 {
141 printk("shadow_promote: get_page_type failed "
142 "dom%d gpfn=%lx gmfn=%lx t=%08lx\n",
143 d->domain_id, gpfn, gmfn, new_type);
144 okay = 0;
145 }
147 // Now put the type back to writable...
148 if ( unlikely(!get_page_type(page, PGT_writable_page)) )
149 BUG(); // XXX -- needs more thought for a graceful failure
150 if ( unlikely(pinned) )
151 {
152 if ( unlikely(test_and_set_bit(_PGT_pinned,
153 &page->u.inuse.type_info)) )
154 BUG(); // hmm... someone pinned this again?
155 }
156 else
157 put_page_and_type(page);
159 return okay;
160 }
163 /*
164 * Things in shadow mode that collect get_page() refs to the domain's
165 * pages are:
166 * - PGC_allocated takes a gen count, just like normal.
167 * - A writable page can be pinned (paravirtualized guests may consider
168 * these pages to be L1s or L2s, and don't know the difference).
169 * Pinning a page takes a gen count (but, for domains in shadow mode,
170 * it *doesn't* take a type count)
171 * - CR3 grabs a ref to whatever it points at, just like normal.
172 * - Shadow mode grabs an initial gen count for itself, as a placeholder
173 * for whatever references will exist.
174 * - Shadow PTEs that point to a page take a gen count, just like regular
175 * PTEs. However, they don't get a type count, as get_page_type() is
176 * hardwired to keep writable pages' counts at 1 for domains in shadow
177 * mode.
178 * - Whenever we shadow a page, the entry in the shadow hash grabs a
179 * general ref to the page.
180 * - Whenever a page goes out of sync, the out of sync entry grabs a
181 * general ref to the page.
182 */
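/*
 * A minimal sketch of the shadow-PTE part of the discipline above, mirroring
 * the pattern used by shadow_set_l1e() and fix_entry() later in this file
 * (the helper name is hypothetical): installing a present spte takes a
 * general ref on the target page, and overwriting a present spte drops the
 * old one.
 */
static inline void example_replace_spte(struct domain *d,
                                        l1_pgentry_t *spte_p,
                                        l1_pgentry_t new_spte)
{
    l1_pgentry_t old_spte = *spte_p;

    if ( (l1e_get_flags(new_spte) & _PAGE_PRESENT) &&
         !shadow_get_page_from_l1e(new_spte, d) )
        new_spte = l1e_empty();    /* couldn't take a ref: install empty */

    *spte_p = new_spte;

    if ( l1e_get_flags(old_spte) & _PAGE_PRESENT )
        shadow_put_page_from_l1e(old_spte, d);
}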
183 /*
184 * page_info fields for pages allocated as shadow pages:
185 *
186 * All 32 bits of count_info are a simple count of refs to this shadow
187 * from a) other shadow pages, b) current CR3's (aka ed->arch.shadow_table),
188 * c) if it's a pinned shadow root pgtable, d) outstanding out-of-sync
189 * references.
190 *
191 * u.inuse._domain is left NULL, to prevent accidentally allowing some random
192 * domain from gaining permission to map this page.
193 *
194 * u.inuse.type_info & PGT_type_mask remembers what kind of page is being
195 * shadowed.
196 * u.inuse.type_info & PGT_mfn_mask holds the mfn of the page being shadowed.
197 * u.inuse.type_info & PGT_pinned says that an extra reference to this shadow
198 * currently exists because this is a shadow of a root page, and we
199 * don't want to let it disappear just because no CR3 is currently pointing
200 * at it.
201 *
202 * tlbflush_timestamp holds a min & max index of valid page table entries
203 * within the shadow page.
204 */
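/*
 * A small sketch of the tlbflush_timestamp min/max packing mentioned above,
 * using the SHADOW_ENCODE_MIN_MAX/SHADOW_MIN/SHADOW_MAX macros exactly as
 * they are used later in this file (their definitions are assumed to live in
 * the shadow headers; the function names here are hypothetical).
 */
static inline void example_record_valid_range(unsigned long smfn, int min, int max)
{
    /* Remember the range of entries actually filled in on this shadow page. */
    mfn_to_page(smfn)->tlbflush_timestamp = SHADOW_ENCODE_MIN_MAX(min, max);
}

static inline int example_valid_range_length(unsigned long smfn)
{
    u32 min_max = mfn_to_page(smfn)->tlbflush_timestamp;
    /* Recover the range when snapshotting or resyncing the page. */
    return SHADOW_MAX(min_max) - SHADOW_MIN(min_max) + 1;
}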
206 static inline unsigned long
207 alloc_shadow_page(struct domain *d,
208 unsigned long gpfn, unsigned long gmfn,
209 u32 psh_type)
210 {
211 struct page_info *page;
212 unsigned long smfn, real_gpfn;
213 int pin = 0;
214 void *l1, *lp;
216 // Currently, we only keep pre-zero'ed pages around for use as L1's...
217 // This will change. Soon.
218 //
219 if ( psh_type == PGT_l1_shadow )
220 {
221 if ( !list_empty(&d->arch.free_shadow_frames) )
222 {
223 struct list_head *entry = d->arch.free_shadow_frames.next;
224 page = list_entry(entry, struct page_info, list);
225 list_del(entry);
226 perfc_decr(free_l1_pages);
227 }
228 else
229 {
230 if ( SH_L1_HAS_NEXT_PAGE &&
231 d->arch.ops->guest_paging_levels == PAGING_L2)
232 {
233 #if CONFIG_PAGING_LEVELS >= 3
234 /*
235 * For a 32-bit HVM guest, 2 shadow L1s are required to
236 * simulate 1 guest L1, so we need to allocate 2 shadow L1
237 * pages each time.
238 *
239 * --> Need to avoid alloc_domheap_pages.
240 */
241 page = alloc_domheap_pages(NULL, SL1_ORDER, 0);
242 if (!page)
243 goto no_shadow_page;
245 l1 = map_domain_page(page_to_mfn(page));
246 memset(l1, 0, PAGE_SIZE);
247 unmap_domain_page(l1);
249 l1 = map_domain_page(page_to_mfn(page + 1));
250 memset(l1, 0, PAGE_SIZE);
251 unmap_domain_page(l1);
252 #else
253 page = alloc_domheap_page(NULL);
254 if (!page)
255 goto no_shadow_page;
257 l1 = map_domain_page(page_to_mfn(page));
258 memset(l1, 0, PAGE_SIZE);
259 unmap_domain_page(l1);
260 #endif
261 }
262 else
263 {
264 page = alloc_domheap_page(NULL);
265 if (!page)
266 goto no_shadow_page;
268 l1 = map_domain_page(page_to_mfn(page));
269 memset(l1, 0, PAGE_SIZE);
270 unmap_domain_page(l1);
271 }
272 }
273 }
274 else {
275 #if CONFIG_PAGING_LEVELS == 2
276 page = alloc_domheap_page(NULL);
277 #elif CONFIG_PAGING_LEVELS >= 3
278 if ( d->arch.ops->guest_paging_levels == PAGING_L2 &&
279 psh_type == PGT_l4_shadow ) /* allocated for PAE PDP page */
280 page = alloc_domheap_pages(NULL, 0, ALLOC_DOM_DMA);
281 else if ( d->arch.ops->guest_paging_levels == PAGING_L3 &&
282 psh_type == PGT_l3_shadow ) /* allocated for PAE PDP page */
283 page = alloc_domheap_pages(NULL, 0, ALLOC_DOM_DMA);
284 else
285 page = alloc_domheap_page(NULL);
286 #endif
287 if (!page)
288 goto no_shadow_page;
290 lp = map_domain_page(page_to_mfn(page));
291 memset(lp, 0, PAGE_SIZE);
292 unmap_domain_page(lp);
293 }
295 smfn = page_to_mfn(page);
297 ASSERT( (gmfn & ~PGT_mfn_mask) == 0 );
298 page->u.inuse.type_info = psh_type | gmfn;
299 page->count_info = 0;
300 page->tlbflush_timestamp = 0;
302 switch ( psh_type )
303 {
304 case PGT_l1_shadow:
305 if ( !shadow_promote(d, gpfn, gmfn, psh_type) )
306 goto fail;
307 perfc_incr(shadow_l1_pages);
308 d->arch.shadow_page_count++;
309 break;
311 case PGT_l2_shadow:
312 if ( !shadow_promote(d, gpfn, gmfn, psh_type) )
313 goto fail;
314 perfc_incr(shadow_l2_pages);
315 d->arch.shadow_page_count++;
316 if ( PGT_l2_page_table == PGT_root_page_table )
317 pin = 1;
319 break;
321 case PGT_l3_shadow:
322 if ( !shadow_promote(d, gpfn, gmfn, psh_type) )
323 goto fail;
324 perfc_incr(shadow_l3_pages);
325 d->arch.shadow_page_count++;
326 if ( PGT_l3_page_table == PGT_root_page_table )
327 pin = 1;
328 break;
330 case PGT_l4_shadow:
331 real_gpfn = gpfn & PGT_mfn_mask;
332 if ( !shadow_promote(d, real_gpfn, gmfn, psh_type) )
333 goto fail;
334 perfc_incr(shadow_l4_pages);
335 d->arch.shadow_page_count++;
336 if ( PGT_l4_page_table == PGT_root_page_table )
337 pin = 1;
338 break;
340 #if CONFIG_PAGING_LEVELS >= 4
341 case PGT_fl1_shadow:
342 perfc_incr(shadow_l1_pages);
343 d->arch.shadow_page_count++;
344 break;
345 #else
347 case PGT_hl2_shadow:
348 // Treat an hl2 as an L1 for purposes of promotion.
349 // For external mode domains, treat them as an L2 for purposes of
350 // pinning.
351 //
352 if ( !shadow_promote(d, gpfn, gmfn, PGT_l1_shadow) )
353 goto fail;
354 perfc_incr(hl2_table_pages);
355 d->arch.hl2_page_count++;
356 if ( shadow_mode_external(d) &&
357 (PGT_l2_page_table == PGT_root_page_table) )
358 pin = 1;
360 break;
361 #endif
362 case PGT_snapshot:
363 perfc_incr(snapshot_pages);
364 d->arch.snapshot_page_count++;
365 break;
367 default:
368 printk("Alloc shadow weird page type type=%08x\n", psh_type);
369 BUG();
370 break;
371 }
373 // Don't add a new shadow of something that already has a snapshot.
374 //
375 ASSERT( (psh_type == PGT_snapshot) || !mfn_out_of_sync(gmfn) );
377 set_shadow_status(d, gpfn, gmfn, smfn, psh_type);
379 if ( pin )
380 shadow_pin(smfn);
382 return smfn;
384 fail:
385 FSH_LOG("promotion of pfn=%lx mfn=%lx failed! external gnttab refs?",
386 gpfn, gmfn);
387 if (psh_type == PGT_l1_shadow)
388 {
389 if (d->arch.ops->guest_paging_levels == PAGING_L2)
390 {
391 #if CONFIG_PAGING_LEVELS >=3
392 free_domheap_pages(page, SL1_ORDER);
393 #else
394 free_domheap_page(page);
395 #endif
396 }
397 else
398 free_domheap_page(page);
399 }
400 else
401 free_domheap_page(page);
403 return 0;
405 no_shadow_page:
406 ASSERT(page == NULL);
407 printk("Couldn't alloc shadow page! dom%d count=%d\n",
408 d->domain_id, d->arch.shadow_page_count);
409 printk("Shadow table counts: l1=%d l2=%d hl2=%d snapshot=%d\n",
410 perfc_value(shadow_l1_pages),
411 perfc_value(shadow_l2_pages),
412 perfc_value(hl2_table_pages),
413 perfc_value(snapshot_pages));
414 BUG(); /* XXX FIXME: try a shadow flush to free up some memory. */
416 return 0;
417 }
419 #if CONFIG_PAGING_LEVELS == 2
420 static unsigned long
421 shadow_hl2_table(struct domain *d, unsigned long gpfn, unsigned long gmfn,
422 unsigned long smfn)
423 {
424 unsigned long hl2mfn;
425 l1_pgentry_t *hl2;
426 int limit;
428 ASSERT(PGT_base_page_table == PGT_l2_page_table);
430 if ( unlikely(!(hl2mfn = alloc_shadow_page(d, gpfn, gmfn, PGT_hl2_shadow))) )
431 {
432 printk("Couldn't alloc an HL2 shadow for pfn=%lx mfn=%lx\n",
433 gpfn, gmfn);
434 BUG(); /* XXX Deal gracefully with failure. */
435 }
437 SH_VVLOG("shadow_hl2_table(gpfn=%lx, gmfn=%lx, smfn=%lx) => %lx",
438 gpfn, gmfn, smfn, hl2mfn);
439 perfc_incrc(shadow_hl2_table_count);
441 hl2 = map_domain_page(hl2mfn);
443 if ( shadow_mode_external(d) )
444 limit = L2_PAGETABLE_ENTRIES;
445 else
446 limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE;
448 memset(hl2, 0, limit * sizeof(l1_pgentry_t));
450 if ( !shadow_mode_external(d) )
451 {
452 memset(&hl2[DOMAIN_ENTRIES_PER_L2_PAGETABLE], 0,
453 HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
455 // Setup easy access to the GL2, SL2, and HL2 frames.
456 //
457 hl2[l2_table_offset(LINEAR_PT_VIRT_START)] =
458 l1e_from_pfn(gmfn, __PAGE_HYPERVISOR);
459 hl2[l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
460 l1e_from_pfn(smfn, __PAGE_HYPERVISOR);
461 hl2[l2_table_offset(PERDOMAIN_VIRT_START)] =
462 l1e_from_pfn(hl2mfn, __PAGE_HYPERVISOR);
463 }
465 unmap_domain_page(hl2);
467 return hl2mfn;
468 }
470 /*
471 * This could take and use a snapshot, and validate the entire page at
472 * once, or it could continue to fault in entries one at a time...
473 * Might be worth investigating...
474 */
475 static unsigned long shadow_l2_table(
476 struct vcpu *v, unsigned long gpfn, unsigned long gmfn)
477 {
478 unsigned long smfn;
479 l2_pgentry_t *spl2e;
480 struct domain *d = v->domain;
481 int i;
483 SH_VVLOG("shadow_l2_table(gpfn=%lx, gmfn=%lx)", gpfn, gmfn);
485 perfc_incrc(shadow_l2_table_count);
487 if ( unlikely(!(smfn = alloc_shadow_page(d, gpfn, gmfn, PGT_l2_shadow))) )
488 {
489 printk("Couldn't alloc an L2 shadow for pfn=%lx mfn=%lx\n",
490 gpfn, gmfn);
491 BUG(); /* XXX Deal gracefully with failure. */
492 }
494 spl2e = (l2_pgentry_t *)map_domain_page(smfn);
496 /* Install hypervisor and 2x linear p.t. mappings. */
497 if ( (PGT_base_page_table == PGT_l2_page_table) &&
498 !shadow_mode_external(d) )
499 {
500 /*
501 * We could proactively fill in PDEs for pages that are already
502 * shadowed *and* where the guest PDE has _PAGE_ACCESSED set
503 * (restriction required for coherence of the accessed bit). However,
504 * we tried it and it didn't help performance. This is simpler.
505 */
506 memset(spl2e, 0, DOMAIN_ENTRIES_PER_L2_PAGETABLE*sizeof(l2_pgentry_t));
508 /* Install hypervisor and 2x linear p.t. mappings. */
509 memcpy(&spl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
510 &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
511 HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
513 spl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
514 l2e_from_pfn(smfn, __PAGE_HYPERVISOR);
516 for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
517 spl2e[l2_table_offset(PERDOMAIN_VIRT_START) + i] =
518 l2e_from_page(virt_to_page(page_get_owner(mfn_to_page(gmfn))->
519 arch.mm_perdomain_pt) + i,
520 __PAGE_HYPERVISOR);
522 if ( shadow_mode_translate(d) ) // NB: not external
523 {
524 unsigned long hl2mfn;
526 spl2e[l2_table_offset(RO_MPT_VIRT_START)] =
527 l2e_from_paddr(pagetable_get_paddr(d->arch.phys_table),
528 __PAGE_HYPERVISOR);
530 if ( unlikely(!(hl2mfn = __shadow_status(d, gpfn, PGT_hl2_shadow))) )
531 hl2mfn = shadow_hl2_table(d, gpfn, gmfn, smfn);
533 // shadow_mode_translate (but not external) sl2 tables hold a
534 // ref to their hl2.
535 //
536 if ( !get_shadow_ref(hl2mfn) )
537 BUG();
539 spl2e[l2_table_offset(LINEAR_PT_VIRT_START)] =
540 l2e_from_pfn(hl2mfn, __PAGE_HYPERVISOR);
541 }
542 else
543 spl2e[l2_table_offset(LINEAR_PT_VIRT_START)] =
544 l2e_from_pfn(gmfn, __PAGE_HYPERVISOR);
545 }
546 else
547 {
548 memset(spl2e, 0, L2_PAGETABLE_ENTRIES*sizeof(l2_pgentry_t));
549 }
551 unmap_domain_page(spl2e);
553 SH_VLOG("shadow_l2_table(%lx -> %lx)", gmfn, smfn);
554 return smfn;
555 }
556 #endif /* CONFIG_PAGING_LEVELS == 2 */
558 static void shadow_map_l1_into_current_l2(unsigned long va)
559 {
560 struct vcpu *v = current;
561 struct domain *d = v->domain;
562 l1_pgentry_t *spl1e, *spl1e_next = 0;
563 l2_pgentry_t sl2e;
564 guest_l1_pgentry_t *gpl1e;
565 guest_l2_pgentry_t gl2e = {0};
566 unsigned long gl1pfn, gl1mfn, sl1mfn;
567 int i, init_table = 0;
569 __guest_get_l2e(v, va, &gl2e);
570 ASSERT(guest_l2e_get_flags(gl2e) & _PAGE_PRESENT);
571 gl1pfn = l2e_get_pfn(gl2e);
573 if ( !(sl1mfn = __shadow_status(d, gl1pfn, PGT_l1_shadow)) )
574 {
575 /* This L1 is NOT already shadowed so we need to shadow it. */
576 SH_VVLOG("4a: l1 not shadowed");
578 gl1mfn = gmfn_to_mfn(d, gl1pfn);
579 if ( unlikely(!VALID_MFN(gl1mfn)) )
580 {
581 // Attempt to use an invalid pfn as an L1 page.
582 // XXX this needs to be more graceful!
583 BUG();
584 }
586 if ( unlikely(!(sl1mfn =
587 alloc_shadow_page(d, gl1pfn, gl1mfn, PGT_l1_shadow))) )
588 {
589 printk("Couldn't alloc an L1 shadow for pfn=%lx mfn=%lx\n",
590 gl1pfn, gl1mfn);
591 BUG(); /* XXX Need to deal gracefully with failure. */
592 }
594 perfc_incrc(shadow_l1_table_count);
595 init_table = 1;
596 }
597 else
598 {
599 /* This L1 is shadowed already, but the L2 entry is missing. */
600 SH_VVLOG("4b: was shadowed, l2 missing (%lx)", sl1mfn);
601 }
603 #ifndef NDEBUG
604 {
605 l2_pgentry_t old_sl2e;
606 __shadow_get_l2e(v, va, &old_sl2e);
607 ASSERT(!(l2e_get_flags(old_sl2e) & _PAGE_PRESENT));
608 }
609 #endif
611 #if CONFIG_PAGING_LEVELS >= 3
612 if ( SH_L1_HAS_NEXT_PAGE &&
613 d->arch.ops->guest_paging_levels == PAGING_L2 )
614 {
615 /* for a 32-bit HVM guest on a 64-bit or PAE host,
616 * we need to update two L2 entries each time
617 */
618 if ( !get_shadow_ref(sl1mfn))
619 BUG();
620 l2pde_general(d, &gl2e, &sl2e, sl1mfn);
621 __guest_set_l2e(v, va, &gl2e);
622 __shadow_set_l2e(v, va & ~((1<<L2_PAGETABLE_SHIFT_32) - 1), &sl2e);
623 if ( !get_shadow_ref(sl1mfn+1))
624 BUG();
625 sl2e = l2e_empty();
626 l2pde_general(d, &gl2e, &sl2e, sl1mfn+1);
627 __shadow_set_l2e(v,((va & ~((1<<L2_PAGETABLE_SHIFT_32) - 1)) + (1 << L2_PAGETABLE_SHIFT)) , &sl2e);
628 } else
629 #endif
630 {
631 if ( !get_shadow_ref(sl1mfn) )
632 BUG();
633 l2pde_general(d, &gl2e, &sl2e, sl1mfn);
634 __guest_set_l2e(v, va, &gl2e);
635 __shadow_set_l2e(v, va , &sl2e);
636 }
638 if ( init_table )
639 {
640 l1_pgentry_t sl1e;
641 int index = guest_l1_table_offset(va);
642 int min = 1, max = 0;
644 unsigned long tmp_gmfn;
645 l2_pgentry_t tmp_sl2e = {0};
646 guest_l2_pgentry_t tmp_gl2e = {0};
648 __guest_get_l2e(v, va, &tmp_gl2e);
649 tmp_gmfn = gmfn_to_mfn(d, l2e_get_pfn(tmp_gl2e));
650 gpl1e = (guest_l1_pgentry_t *) map_domain_page(tmp_gmfn);
652 /* If the PGT_l1_shadow has two contiguous pages */
653 #if CONFIG_PAGING_LEVELS >= 3
654 if ( SH_L1_HAS_NEXT_PAGE &&
655 d->arch.ops->guest_paging_levels == PAGING_L2 )
656 __shadow_get_l2e(v, va & ~((1UL << L2_PAGETABLE_SHIFT_32) - 1), &tmp_sl2e);
657 else
658 #endif
659 __shadow_get_l2e(v, va, &tmp_sl2e);
661 spl1e = (l1_pgentry_t *) map_domain_page(l2e_get_pfn(tmp_sl2e));
663 if ( SH_L1_HAS_NEXT_PAGE )
664 spl1e_next = (l1_pgentry_t *) map_domain_page(
665 (l2e_get_pfn(tmp_sl2e) + 1UL));
667 for ( i = 0; i < GUEST_L1_PAGETABLE_ENTRIES; i++ )
668 {
669 l1pte_propagate_from_guest(d, gpl1e[i], &sl1e);
670 if ( (l1e_get_flags(sl1e) & _PAGE_PRESENT) &&
671 unlikely(!shadow_get_page_from_l1e(sl1e, d)) )
672 sl1e = l1e_empty();
673 if ( l1e_get_flags(sl1e) == 0 )
674 {
675 // First copy entries from 0 until first invalid.
676 // Then copy entries from index until first invalid.
677 //
678 if ( i < index ) {
679 i = index - 1;
680 continue;
681 }
682 break;
683 }
685 if ( SH_L1_HAS_NEXT_PAGE && i >= L1_PAGETABLE_ENTRIES )
686 spl1e_next[i - L1_PAGETABLE_ENTRIES] = sl1e;
687 else
688 spl1e[i] = sl1e;
690 if ( unlikely(i < min) )
691 min = i;
692 if ( likely(i > max) )
693 max = i;
694 set_guest_back_ptr(d, sl1e, sl1mfn, i);
695 }
697 mfn_to_page(sl1mfn)->tlbflush_timestamp =
698 SHADOW_ENCODE_MIN_MAX(min, max);
700 unmap_domain_page(gpl1e);
701 unmap_domain_page(spl1e);
703 if ( SH_L1_HAS_NEXT_PAGE )
704 unmap_domain_page(spl1e_next);
705 }
706 }
708 #if CONFIG_PAGING_LEVELS == 2
709 static void
710 shadow_set_l1e(unsigned long va, l1_pgentry_t new_spte, int create_l1_shadow)
711 {
712 struct vcpu *v = current;
713 struct domain *d = v->domain;
714 l2_pgentry_t sl2e = {0};
716 __shadow_get_l2e(v, va, &sl2e);
717 if ( !(l2e_get_flags(sl2e) & _PAGE_PRESENT) )
718 {
719 /*
720 * Either the L1 is not shadowed, or the shadow isn't linked into
721 * the current shadow L2.
722 */
723 if ( create_l1_shadow )
724 {
725 perfc_incrc(shadow_set_l1e_force_map);
726 shadow_map_l1_into_current_l2(va);
727 }
728 else /* check to see if it exists; if so, link it in */
729 {
730 l2_pgentry_t gpde = {0};
731 unsigned long gl1pfn;
732 unsigned long sl1mfn;
734 __guest_get_l2e(v, va, &gpde);
736 if ( l2e_get_flags(gpde) & _PAGE_PRESENT )
737 {
738 gl1pfn = l2e_get_pfn(gpde);
739 sl1mfn = __shadow_status(d, gl1pfn, PGT_l1_shadow);
740 }
741 else
742 {
743 // no shadow exists, so there's nothing to do.
744 perfc_incrc(shadow_set_l1e_fail);
745 return;
746 }
748 if ( sl1mfn )
749 {
750 perfc_incrc(shadow_set_l1e_unlinked);
751 if ( !get_shadow_ref(sl1mfn) )
752 BUG();
753 l2pde_general(d, (guest_l2_pgentry_t *)&gpde, &sl2e, sl1mfn);
754 __guest_set_l2e(v, va, &gpde);
755 __shadow_set_l2e(v, va, &sl2e);
756 }
757 else
758 {
759 // no shadow exists, so there's nothing to do.
760 perfc_incrc(shadow_set_l1e_fail);
761 return;
762 }
763 }
764 }
766 __shadow_get_l2e(v, va, &sl2e);
768 if ( shadow_mode_refcounts(d) )
769 {
770 l1_pgentry_t old_spte;
771 __shadow_get_l1e(v, va, &old_spte);
773 // only do the ref counting if something important changed.
774 //
775 if ( l1e_has_changed(old_spte, new_spte, _PAGE_RW | _PAGE_PRESENT) )
776 {
777 if ( (l1e_get_flags(new_spte) & _PAGE_PRESENT) &&
778 !shadow_get_page_from_l1e(new_spte, d) )
779 new_spte = l1e_empty();
780 if ( l1e_get_flags(old_spte) & _PAGE_PRESENT )
781 shadow_put_page_from_l1e(old_spte, d);
782 }
783 }
785 set_guest_back_ptr(d, new_spte, l2e_get_pfn(sl2e), l1_table_offset(va));
786 __shadow_set_l1e(v, va, &new_spte);
787 shadow_update_min_max(l2e_get_pfn(sl2e), l1_table_offset(va));
788 }
790 static void shadow_invlpg_32(struct vcpu *v, unsigned long va)
791 {
792 struct domain *d = v->domain;
793 l1_pgentry_t gpte, spte;
795 ASSERT(shadow_mode_enabled(d));
797 shadow_lock(d);
799 __shadow_sync_va(v, va);
801 // XXX mafetter: will need to think about 4MB pages...
803 // It's not strictly necessary to update the shadow here,
804 // but it might save a fault later.
805 //
806 /*if (__copy_from_user(&gpte, &linear_pg_table[va >> PAGE_SHIFT],
807 sizeof(gpte))) {*/
808 if (unlikely(!__guest_get_l1e(v, va, &gpte))) {
809 perfc_incrc(shadow_invlpg_faults);
810 shadow_unlock(d);
811 return;
812 }
813 l1pte_propagate_from_guest(d, gpte, &spte);
814 shadow_set_l1e(va, spte, 1);
816 shadow_unlock(d);
817 }
818 #endif /* CONFIG_PAGING_LEVELS == 2 */
820 #if CONFIG_PAGING_LEVELS >= 3
821 static void shadow_set_l1e_64(
822 unsigned long va, pgentry_64_t *sl1e_p,
823 int create_l1_shadow)
824 {
825 struct vcpu *v = current;
826 struct domain *d = v->domain;
827 pgentry_64_t sle = { 0 };
828 pgentry_64_t sle_up = {0};
829 l1_pgentry_t old_spte;
830 l1_pgentry_t sl1e = *(l1_pgentry_t *)sl1e_p;
831 int i;
832 unsigned long orig_va = 0;
834 if ( d->arch.ops->guest_paging_levels == PAGING_L2 )
835 {
837 /* This is for a 32-bit VMX guest on a 64-bit host */
837 orig_va = va;
838 va = va & (~((1<<L2_PAGETABLE_SHIFT_32)-1));
839 }
841 for ( i = PAGING_L4; i >= PAGING_L2; i-- )
842 {
843 if ( !__rw_entry(v, va, &sle, SHADOW_ENTRY | GET_ENTRY | i) )
844 {
845 sl1e = l1e_empty();
846 goto out;
847 }
848 if ( !(entry_get_flags(sle) & _PAGE_PRESENT) )
849 {
850 if ( create_l1_shadow )
851 {
852 perfc_incrc(shadow_set_l3e_force_map);
853 shadow_map_into_current(v, va, i-1, i);
854 __rw_entry(v, va, &sle, SHADOW_ENTRY | GET_ENTRY | i);
855 }
856 }
857 if ( d->arch.ops->guest_paging_levels == PAGING_L3 )
858 {
859 if ( i < PAGING_L3 )
860 shadow_update_min_max(entry_get_pfn(sle_up), table_offset_64(va, i));
861 }
862 else
863 {
864 if ( i < PAGING_L4 )
865 shadow_update_min_max(entry_get_pfn(sle_up), table_offset_64(va, i));
866 }
868 sle_up = sle;
869 }
871 if ( d->arch.ops->guest_paging_levels == PAGING_L2 )
872 {
873 va = orig_va;
874 }
876 if ( shadow_mode_refcounts(d) )
877 {
878 __shadow_get_l1e(v, va, &old_spte);
879 if ( l1e_has_changed(old_spte, sl1e, _PAGE_RW | _PAGE_PRESENT) )
880 {
881 if ( (l1e_get_flags(sl1e) & _PAGE_PRESENT) &&
882 !shadow_get_page_from_l1e(sl1e, d) )
883 sl1e = l1e_empty();
884 if ( l1e_get_flags(old_spte) & _PAGE_PRESENT )
885 put_page_from_l1e(old_spte, d);
886 }
887 }
889 out:
890 __shadow_set_l1e(v, va, &sl1e);
892 shadow_update_min_max(entry_get_pfn(sle_up), guest_l1_table_offset(va));
893 }
894 #endif /* CONFIG_PAGING_LEVELS >= 3 */
896 static struct out_of_sync_entry *
897 shadow_alloc_oos_entry(struct domain *d)
898 {
899 struct out_of_sync_entry *f, *extra;
900 unsigned size, i;
902 if ( unlikely(d->arch.out_of_sync_free == NULL) )
903 {
904 FSH_LOG("Allocate more fullshadow tuple blocks.");
906 size = sizeof(void *) + (out_of_sync_extra_size * sizeof(*f));
907 extra = xmalloc_bytes(size);
909 /* XXX Should be more graceful here. */
910 if ( extra == NULL )
911 BUG();
913 memset(extra, 0, size);
915 /* Record the allocation block so it can be correctly freed later. */
916 d->arch.out_of_sync_extras_count++;
917 *((struct out_of_sync_entry **)&extra[out_of_sync_extra_size]) =
918 d->arch.out_of_sync_extras;
919 d->arch.out_of_sync_extras = &extra[0];
921 /* Thread a free chain through the newly-allocated nodes. */
922 for ( i = 0; i < (out_of_sync_extra_size - 1); i++ )
923 extra[i].next = &extra[i+1];
924 extra[i].next = NULL;
926 /* Add the new nodes to the free list. */
927 d->arch.out_of_sync_free = &extra[0];
928 }
930 /* Allocate a new node from the quicklist. */
931 f = d->arch.out_of_sync_free;
932 d->arch.out_of_sync_free = f->next;
934 return f;
935 }
937 static inline unsigned long
938 shadow_make_snapshot(
939 struct domain *d, unsigned long gpfn, unsigned long gmfn)
940 {
941 unsigned long smfn, sl1mfn = 0;
942 void *original, *snapshot;
943 u32 min_max = 0;
944 int min, max, length;
946 if ( test_and_set_bit(_PGC_out_of_sync, &mfn_to_page(gmfn)->count_info) )
947 {
948 ASSERT(__shadow_status(d, gpfn, PGT_snapshot));
949 return SHADOW_SNAPSHOT_ELSEWHERE;
950 }
952 perfc_incrc(shadow_make_snapshot);
954 if ( unlikely(!(smfn = alloc_shadow_page(d, gpfn, gmfn, PGT_snapshot))) )
955 {
956 printk("Couldn't alloc fullshadow snapshot for pfn=%lx mfn=%lx!\n"
957 "Dom%d snapshot_count_count=%d\n",
958 gpfn, gmfn, d->domain_id, d->arch.snapshot_page_count);
959 BUG(); /* XXX FIXME: try a shadow flush to free up some memory. */
960 }
962 if ( !get_shadow_ref(smfn) )
963 BUG();
965 if ( shadow_mode_refcounts(d) &&
966 (shadow_max_pgtable_type(d, gpfn, &sl1mfn) == PGT_l1_shadow) )
967 min_max = mfn_to_page(sl1mfn)->tlbflush_timestamp;
968 mfn_to_page(smfn)->tlbflush_timestamp = min_max;
970 min = SHADOW_MIN(min_max);
971 max = SHADOW_MAX(min_max);
972 length = max - min + 1;
973 perfc_incr_histo(snapshot_copies, length, PT_UPDATES);
975 min *= sizeof(guest_l1_pgentry_t);
976 length *= sizeof(guest_l1_pgentry_t);
978 original = map_domain_page(gmfn);
979 snapshot = map_domain_page(smfn);
980 memcpy(snapshot + min, original + min, length);
981 unmap_domain_page(original);
982 unmap_domain_page(snapshot);
984 return smfn;
985 }
987 static struct out_of_sync_entry *
988 __mark_mfn_out_of_sync(struct vcpu *v, unsigned long gpfn,
989 unsigned long mfn)
990 {
991 struct domain *d = v->domain;
992 struct page_info *page = mfn_to_page(mfn);
993 struct out_of_sync_entry *entry = shadow_alloc_oos_entry(d);
995 ASSERT(shadow_lock_is_acquired(d));
996 ASSERT(mfn_valid(mfn));
998 #ifndef NDEBUG
999 {
1000 u32 type = page->u.inuse.type_info & PGT_type_mask;
1001 if ( shadow_mode_refcounts(d) )
1003 ASSERT(type == PGT_writable_page);
1005 else
1007 ASSERT(type && (type < PGT_l4_page_table));
1010 #endif
1012 FSH_LOG("%s(gpfn=%lx, mfn=%lx) c=%08x t=%08x", __func__,
1013 gpfn, mfn, page->count_info, page->u.inuse.type_info);
1015 // XXX this will require some more thought... Cross-domain sharing and
1016 // modification of page tables? Hmm...
1017 //
1018 if ( d != page_get_owner(page) )
1019 BUG();
1021 perfc_incrc(shadow_mark_mfn_out_of_sync_calls);
1023 entry->v = v;
1024 entry->gpfn = gpfn;
1025 entry->gmfn = mfn;
1026 entry->writable_pl1e = -1;
1028 #if 0 // this code has not been updated for 32-bit PAE & 64-bit modes
1029 #if SHADOW_DEBUG
1030 mark_shadows_as_reflecting_snapshot(d, gpfn);
1031 #endif
1032 #endif
1034 // increment guest's ref count to represent the entry in the
1035 // full shadow out-of-sync list.
1036 //
1037 get_page(page, d);
1039 return entry;
1042 static struct out_of_sync_entry *
1043 mark_mfn_out_of_sync(struct vcpu *v, unsigned long gpfn,
1044 unsigned long mfn)
1046 struct out_of_sync_entry *entry =
1047 __mark_mfn_out_of_sync(v, gpfn, mfn);
1048 struct domain *d = v->domain;
1050 entry->snapshot_mfn = shadow_make_snapshot(d, gpfn, mfn);
1051 // Add to the out-of-sync list
1052 //
1053 entry->next = d->arch.out_of_sync;
1054 d->arch.out_of_sync = entry;
1056 return entry;
1060 static void shadow_mark_va_out_of_sync(
1061 struct vcpu *v, unsigned long gpfn, unsigned long mfn, unsigned long va)
1063 struct out_of_sync_entry *entry =
1064 __mark_mfn_out_of_sync(v, gpfn, mfn);
1065 l2_pgentry_t sl2e;
1066 struct domain *d = v->domain;
1068 #if CONFIG_PAGING_LEVELS >= 3
1070 l4_pgentry_t sl4e;
1071 l3_pgentry_t sl3e;
1073 __shadow_get_l4e(v, va, &sl4e);
1074 if ( !(l4e_get_flags(sl4e) & _PAGE_PRESENT)) {
1075 shadow_map_into_current(v, va, PAGING_L3, PAGING_L4);
1078 if (!__shadow_get_l3e(v, va, &sl3e)) {
1079 BUG();
1082 if ( !(l3e_get_flags(sl3e) & _PAGE_PRESENT)) {
1083 shadow_map_into_current(v, va, PAGING_L2, PAGING_L3);
1086 #endif
1088 // We need the address of the shadow PTE that maps @va.
1089 // It might not exist yet. Make sure it's there.
1090 //
1091 __shadow_get_l2e(v, va, &sl2e);
1092 if ( !(l2e_get_flags(sl2e) & _PAGE_PRESENT) )
1094 // either this L1 isn't shadowed yet, or the shadow isn't linked into
1095 // the current L2.
1096 shadow_map_l1_into_current_l2(va);
1097 __shadow_get_l2e(v, va, &sl2e);
1099 ASSERT(l2e_get_flags(sl2e) & _PAGE_PRESENT);
1101 entry->snapshot_mfn = shadow_make_snapshot(d, gpfn, mfn);
1102 // NB: this is stored as a machine address.
1103 entry->writable_pl1e =
1104 l2e_get_paddr(sl2e) | (sizeof(l1_pgentry_t) * l1_table_offset(va));
1105 ASSERT( !(entry->writable_pl1e & (sizeof(l1_pgentry_t)-1)) );
1106 entry->va = va;
1108 // Increment shadow's page count to represent the reference
1109 // inherent in entry->writable_pl1e
1110 //
1111 if ( !get_shadow_ref(l2e_get_pfn(sl2e)) )
1112 BUG();
1114 // Add to the out-of-sync list
1115 //
1116 entry->next = d->arch.out_of_sync;
1117 d->arch.out_of_sync = entry;
1119 FSH_LOG("%s(va=%lx -> writable_pl1e=%lx)",
1120 __func__, va, entry->writable_pl1e);
1123 /*
1124 * Returns 1 if the snapshot for @gpfn exists and its @index'th entry matches.
1125 * Returns 0 otherwise.
1126 */
1127 static int snapshot_entry_matches(
1128 struct domain *d, guest_l1_pgentry_t *guest_pt,
1129 unsigned long gpfn, unsigned index)
1131 unsigned long smfn = __shadow_status(d, gpfn, PGT_snapshot);
1132 guest_l1_pgentry_t *snapshot, gpte; // could be L1s or L2s or ...
1133 int entries_match;
1135 perfc_incrc(snapshot_entry_matches_calls);
1137 if ( !smfn )
1138 return 0;
1140 snapshot = map_domain_page(smfn);
1142 if (__copy_from_user(&gpte, &guest_pt[index],
1143 sizeof(gpte)))
1145 unmap_domain_page(snapshot);
1146 return 0;
1149 // This could probably be smarter, but this is sufficient for
1150 // our current needs.
1151 //
1152 entries_match = !guest_l1e_has_changed(gpte, snapshot[index],
1153 PAGE_FLAG_MASK);
1155 unmap_domain_page(snapshot);
1157 #ifdef PERF_COUNTERS
1158 if ( entries_match )
1159 perfc_incrc(snapshot_entry_matches_true);
1160 #endif
1162 return entries_match;
1165 /*
1166 * Returns 1 if va's shadow mapping is out-of-sync.
1167 * Returns 0 otherwise.
1168 */
1169 static int is_out_of_sync(struct vcpu *v, unsigned long va) /* __shadow_out_of_sync */
1171 struct domain *d = v->domain;
1172 #if CONFIG_PAGING_LEVELS == 4
1173 unsigned long l2mfn = ((v->arch.flags & TF_kernel_mode)?
1174 pagetable_get_pfn(v->arch.guest_table) :
1175 pagetable_get_pfn(v->arch.guest_table_user));
1176 #else
1177 unsigned long l2mfn = pagetable_get_pfn(v->arch.guest_table);
1178 #endif
1179 unsigned long l2pfn = mfn_to_gmfn(d, l2mfn);
1180 guest_l2_pgentry_t l2e;
1181 unsigned long l1pfn, l1mfn;
1182 guest_l1_pgentry_t *guest_pt;
1184 ASSERT(shadow_lock_is_acquired(d));
1185 ASSERT(VALID_M2P(l2pfn));
1187 perfc_incrc(shadow_out_of_sync_calls);
1189 #if CONFIG_PAGING_LEVELS >= 3
1191 #define unmap_and_return(x) \
1192 if ( guest_pt != (guest_l1_pgentry_t *) v->arch.guest_vtable ) \
1193 unmap_domain_page(guest_pt); \
1194 return (x);
1196 if (d->arch.ops->guest_paging_levels >= PAGING_L3)
1198 pgentry_64_t le;
1199 unsigned long gmfn;
1200 unsigned long gpfn;
1201 int i;
1202 unsigned int base_idx = 0;
1203 base_idx = get_cr3_idxval(v);
1205 gmfn = l2mfn;
1206 gpfn = l2pfn;
1207 guest_pt = (guest_l1_pgentry_t *)v->arch.guest_vtable;
1209 for ( i = PAGING_L4; i >= PAGING_L3; i-- )
1211 if (d->arch.ops->guest_paging_levels == PAGING_L3
1212 && i == PAGING_L4)
1213 continue; /* skip the top-level for 3-level */
1215 if ( page_out_of_sync(mfn_to_page(gmfn)) &&
1216 !snapshot_entry_matches(
1217 d, guest_pt, gpfn, guest_table_offset_64(va, i, base_idx)) )
1219 unmap_and_return (1);
1222 le = entry_empty();
1223 __rw_entry(v, va, &le, GUEST_ENTRY | GET_ENTRY | i);
1225 if ( !(entry_get_flags(le) & _PAGE_PRESENT) )
1227 unmap_and_return (0);
1229 gpfn = entry_get_pfn(le);
1230 gmfn = gmfn_to_mfn(d, gpfn);
1231 if ( !VALID_MFN(gmfn) )
1233 unmap_and_return (0);
1235 if ( guest_pt != (guest_l1_pgentry_t *)v->arch.guest_vtable )
1236 unmap_domain_page(guest_pt);
1237 guest_pt = (guest_l1_pgentry_t *)map_domain_page(gmfn);
1240 /* L2 */
1241 if ( page_out_of_sync(mfn_to_page(gmfn)) &&
1242 !snapshot_entry_matches(d, guest_pt, gpfn, l2_table_offset(va)) )
1244 unmap_and_return (1);
1247 if ( guest_pt != (guest_l1_pgentry_t *)v->arch.guest_vtable )
1248 unmap_domain_page(guest_pt);
1251 else
1252 #undef unmap_and_return
1253 #endif /* CONFIG_PAGING_LEVELS >= 3 */
1255 if ( page_out_of_sync(mfn_to_page(l2mfn)) &&
1256 !snapshot_entry_matches(d, (guest_l1_pgentry_t *)v->arch.guest_vtable,
1257 l2pfn, guest_l2_table_offset(va)) )
1258 return 1;
1261 __guest_get_l2e(v, va, &l2e);
1262 if ( !(guest_l2e_get_flags(l2e) & _PAGE_PRESENT) ||
1263 (guest_l2e_get_flags(l2e) & _PAGE_PSE))
1264 return 0;
1266 l1pfn = l2e_get_pfn(l2e);
1267 l1mfn = gmfn_to_mfn(d, l1pfn);
1269 // If the l1 pfn is invalid, it can't be out of sync...
1270 if ( !VALID_MFN(l1mfn) )
1271 return 0;
1273 guest_pt = (guest_l1_pgentry_t *) map_domain_page(l1mfn);
1275 if ( page_out_of_sync(mfn_to_page(l1mfn)) &&
1276 !snapshot_entry_matches(
1277 d, guest_pt, l1pfn, guest_l1_table_offset(va)) )
1279 unmap_domain_page(guest_pt);
1280 return 1;
1283 unmap_domain_page(guest_pt);
1284 return 0;
1287 #define GPFN_TO_GPTEPAGE(_gpfn) ((_gpfn) / (PAGE_SIZE / sizeof(guest_l1_pgentry_t)))
1288 static inline unsigned long
1289 predict_writable_pte_page(struct domain *d, unsigned long gpfn)
1291 return __shadow_status(d, GPFN_TO_GPTEPAGE(gpfn), PGT_writable_pred);
1294 static inline void
1295 increase_writable_pte_prediction(struct domain *d, unsigned long gpfn, unsigned long prediction)
1297 unsigned long score = prediction & PGT_score_mask;
1298 int create = (score == 0);
1300 // saturating addition
1301 score = (score + (1u << PGT_score_shift)) & PGT_score_mask;
1302 score = score ? score : PGT_score_mask;
1304 prediction = (prediction & PGT_mfn_mask) | score;
1306 //printk("increase gpfn=%lx pred=%lx create=%d\n", gpfn, prediction, create);
1307 set_shadow_status(d, GPFN_TO_GPTEPAGE(gpfn), 0, prediction, PGT_writable_pred);
1309 if ( create )
1310 perfc_incr(writable_pte_predictions);
1313 static inline void
1314 decrease_writable_pte_prediction(struct domain *d, unsigned long gpfn, unsigned long prediction)
1316 unsigned long score = prediction & PGT_score_mask;
1317 ASSERT(score);
1319 // divide score by 2... We don't like bad predictions.
1320 //
1321 score = (score >> 1) & PGT_score_mask;
1323 prediction = (prediction & PGT_mfn_mask) | score;
1325 //printk("decrease gpfn=%lx pred=%lx score=%lx\n", gpfn, prediction, score);
1327 if ( score )
1328 set_shadow_status(d, GPFN_TO_GPTEPAGE(gpfn), 0, prediction, PGT_writable_pred);
1329 else
1331 delete_shadow_status(d, GPFN_TO_GPTEPAGE(gpfn), 0, PGT_writable_pred);
1332 perfc_decr(writable_pte_predictions);
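/*
 * Hypothetical usage sketch for the prediction helpers above (the real
 * consumers are the write-access removal paths below): look up the recorded
 * prediction for a gpfn's page-table page and reward or penalise it
 * depending on whether it actually held the writable PTE.
 */
static inline void example_update_prediction(struct domain *d,
                                             unsigned long gpfn, int hit)
{
    unsigned long prediction = predict_writable_pte_page(d, gpfn);

    if ( prediction == 0 )
        return;                    /* nothing recorded for this gpfn yet */

    if ( hit )
        increase_writable_pte_prediction(d, gpfn, prediction);
    else
        decrease_writable_pte_prediction(d, gpfn, prediction);
}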
1336 static int fix_entry(
1337 struct domain *d,
1338 l1_pgentry_t *pt, u32 *found, int is_l1_shadow, u32 max_refs_to_find)
1340 l1_pgentry_t old = *pt;
1341 l1_pgentry_t new = old;
1343 l1e_remove_flags(new,_PAGE_RW);
1344 if ( is_l1_shadow && !shadow_get_page_from_l1e(new, d) )
1345 BUG();
1346 (*found)++;
1347 *pt = new;
1348 if ( is_l1_shadow )
1349 shadow_put_page_from_l1e(old, d);
1351 return (*found == max_refs_to_find);
1354 static u32 remove_all_write_access_in_ptpage(
1355 struct domain *d, unsigned long pt_pfn, unsigned long pt_mfn,
1356 unsigned long readonly_gpfn, unsigned long readonly_gmfn,
1357 u32 max_refs_to_find, unsigned long prediction)
1359 l1_pgentry_t *pt = map_domain_page(pt_mfn);
1360 l1_pgentry_t *pt_next = 0, *sl1e_p;
1361 l1_pgentry_t match;
1362 unsigned long flags = _PAGE_RW | _PAGE_PRESENT;
1363 int i;
1364 u32 found = 0;
1365 int is_l1_shadow =
1366 ((mfn_to_page(pt_mfn)->u.inuse.type_info & PGT_type_mask) ==
1367 PGT_l1_shadow);
1368 #if CONFIG_PAGING_LEVELS == 4
1369 is_l1_shadow |=
1370 ((mfn_to_page(pt_mfn)->u.inuse.type_info & PGT_type_mask) ==
1371 PGT_fl1_shadow);
1372 #endif
1374 if ( SH_L1_HAS_NEXT_PAGE )
1375 pt_next = map_domain_page(pt_mfn + 1);
1377 match = l1e_from_pfn(readonly_gmfn, flags);
1379 if ( shadow_mode_external(d) )
1381 i = (mfn_to_page(readonly_gmfn)->u.inuse.type_info & PGT_va_mask)
1382 >> PGT_va_shift;
1384 if ( SH_L1_HAS_NEXT_PAGE &&
1385 i >= L1_PAGETABLE_ENTRIES )
1386 sl1e_p = &pt_next[i - L1_PAGETABLE_ENTRIES];
1387 else
1388 sl1e_p = &pt[i];
1390 if ( (i >= 0 && i < GUEST_L1_PAGETABLE_ENTRIES) &&
1391 !l1e_has_changed(*sl1e_p, match, flags) &&
1392 fix_entry(d, sl1e_p, &found, is_l1_shadow, max_refs_to_find) &&
1393 !prediction )
1394 goto out;
1397 for ( i = 0; i < GUEST_L1_PAGETABLE_ENTRIES; i++ )
1399 if ( SH_L1_HAS_NEXT_PAGE &&
1400 i >= L1_PAGETABLE_ENTRIES )
1401 sl1e_p = &pt_next[i - L1_PAGETABLE_ENTRIES];
1402 else
1403 sl1e_p = &pt[i];
1405 if ( unlikely(!l1e_has_changed(*sl1e_p, match, flags)) &&
1406 fix_entry(d, sl1e_p, &found, is_l1_shadow, max_refs_to_find) )
1407 break;
1410 out:
1411 unmap_domain_page(pt);
1412 if ( SH_L1_HAS_NEXT_PAGE )
1413 unmap_domain_page(pt_next);
1415 return found;
1418 static int remove_all_write_access(
1419 struct domain *d, unsigned long readonly_gpfn, unsigned long readonly_gmfn)
1421 int i;
1422 struct shadow_status *a;
1423 u32 found = 0, write_refs;
1424 unsigned long predicted_smfn;
1426 ASSERT(shadow_lock_is_acquired(d));
1427 ASSERT(VALID_MFN(readonly_gmfn));
1429 perfc_incrc(remove_write_access);
1431 // If it's not a writable page, then no writable refs can be outstanding.
1432 //
1433 if ( (mfn_to_page(readonly_gmfn)->u.inuse.type_info & PGT_type_mask) !=
1434 PGT_writable_page )
1436 perfc_incrc(remove_write_not_writable);
1437 return 1;
1440 // How many outstanding writable PTEs for this page are there?
1441 //
1442 write_refs =
1443 (mfn_to_page(readonly_gmfn)->u.inuse.type_info & PGT_count_mask);
1444 if ( write_refs && MFN_PINNED(readonly_gmfn) )
1446 write_refs--;
1449 if ( write_refs == 0 )
1451 perfc_incrc(remove_write_no_work);
1452 return 1;
1455 if ( shadow_mode_external(d) ) {
1456 if (--write_refs == 0)
1457 return 0;
1459 // Use the back pointer to locate the shadow page that can contain
1460 // the PTE of interest
1461 if ( (predicted_smfn = mfn_to_page(readonly_gmfn)->tlbflush_timestamp) ) {
1462 found += remove_all_write_access_in_ptpage(
1463 d, predicted_smfn, predicted_smfn, readonly_gpfn, readonly_gmfn, write_refs, 0);
1464 if ( found == write_refs )
1465 return 0;
1469 // Search all the shadow L1 page tables...
1470 //
1471 for (i = 0; i < shadow_ht_buckets; i++)
1473 a = &d->arch.shadow_ht[i];
1474 while ( a && a->gpfn_and_flags )
1476 if ( (a->gpfn_and_flags & PGT_type_mask) == PGT_l1_shadow
1477 #if CONFIG_PAGING_LEVELS >= 4
1478 || (a->gpfn_and_flags & PGT_type_mask) == PGT_fl1_shadow
1479 #endif
1483 found += remove_all_write_access_in_ptpage(d, a->gpfn_and_flags & PGT_mfn_mask, a->smfn, readonly_gpfn, readonly_gmfn, write_refs - found, a->gpfn_and_flags & PGT_mfn_mask);
1484 if ( found == write_refs )
1485 return 0;
1488 a = a->next;
1492 FSH_LOG("%s: looking for %d refs, found %d refs",
1493 __func__, write_refs, found);
1495 return 0;
1498 static void resync_pae_guest_l3(struct domain *d)
1500 struct out_of_sync_entry *entry;
1501 unsigned long i, idx;
1502 unsigned long smfn, gmfn;
1503 pgentry_64_t *guest, *shadow_l3, *snapshot;
1504 struct vcpu *v = current;
1505 int max = -1;
1506 int unshadow = 0;
1509 ASSERT( shadow_mode_external(d) );
1511 gmfn = pagetable_get_pfn(v->arch.guest_table);
1513 for ( entry = d->arch.out_of_sync; entry; entry = entry->next )
1515 if ( entry->snapshot_mfn == SHADOW_SNAPSHOT_ELSEWHERE )
1516 continue;
1517 if ( entry->gmfn != gmfn )
1518 continue;
1520 idx = get_cr3_idxval(v);
1521 smfn = __shadow_status(
1522 d, ((unsigned long)(idx << PGT_score_shift) | entry->gpfn), PGT_l4_shadow);
1524 #ifndef NDEBUG
1525 if ( !smfn )
1527 BUG();
1529 #endif
1531 guest = (pgentry_64_t *)map_domain_page(entry->gmfn);
1532 snapshot = (pgentry_64_t *)map_domain_page(entry->snapshot_mfn);
1533 shadow_l3 = (pgentry_64_t *)map_domain_page(smfn);
1535 for ( i = 0; i < PAE_L3_PAGETABLE_ENTRIES; i++ )
1537 int index = i + idx * PAE_L3_PAGETABLE_ENTRIES;
1538 if ( entry_has_changed(
1539 guest[index], snapshot[index], PAGE_FLAG_MASK) )
1541 validate_entry_change(d, &guest[index],
1542 &shadow_l3[i], PAGING_L3);
1544 if ( entry_get_value(guest[index]) != 0 )
1545 max = i;
1547 if ( !(entry_get_flags(guest[index]) & _PAGE_PRESENT) &&
1548 unlikely(entry_get_value(guest[index]) != 0) &&
1549 !unshadow &&
1550 (frame_table[smfn].u.inuse.type_info & PGT_pinned) )
1551 unshadow = 1;
1554 if ( max == -1 )
1555 unshadow = 1;
1557 unmap_domain_page(guest);
1558 unmap_domain_page(snapshot);
1559 unmap_domain_page(shadow_l3);
1561 if ( unlikely(unshadow) )
1562 shadow_unpin(smfn);
1563 break;
1567 static int resync_all(struct domain *d, u32 stype)
1569 struct out_of_sync_entry *entry;
1570 unsigned i;
1571 unsigned long smfn;
1572 void *guest, *shadow, *snapshot;
1573 int need_flush = 0, external = shadow_mode_external(d);
1574 int unshadow;
1575 int changed;
1576 u32 min_max_shadow, min_max_snapshot;
1577 int min_shadow, max_shadow, min_snapshot, max_snapshot;
1578 struct vcpu *v;
1580 ASSERT(shadow_lock_is_acquired(d));
1582 for ( entry = d->arch.out_of_sync; entry; entry = entry->next)
1584 int max = -1;
1586 if ( entry->snapshot_mfn == SHADOW_SNAPSHOT_ELSEWHERE )
1587 continue;
1589 smfn = __shadow_status(d, entry->gpfn, stype);
1591 if ( !smfn )
1593 // For heavy weight shadows: no need to update refcounts if
1594 // there's no shadow page.
1595 //
1596 if ( shadow_mode_refcounts(d) )
1597 continue;
1599 // For light weight shadows: we only need to resync the refcounts to
1600 // the new contents of the guest page iff it has the right
1601 // page type.
1602 //
1603 if ( stype != ( mfn_to_page(entry->gmfn)->u.inuse.type_info & PGT_type_mask) )
1604 continue;
1607 FSH_LOG("resyncing t=%08x gpfn=%lx gmfn=%lx smfn=%lx snapshot_mfn=%lx",
1608 stype, entry->gpfn, entry->gmfn, smfn, entry->snapshot_mfn);
1610 // Compare guest's new contents to its snapshot, validating
1611 // and updating its shadow as appropriate.
1612 //
1613 guest = map_domain_page(entry->gmfn);
1614 snapshot = map_domain_page(entry->snapshot_mfn);
1616 if ( smfn )
1617 shadow = map_domain_page(smfn);
1618 else
1619 shadow = NULL;
1621 unshadow = 0;
1623 min_max_shadow = mfn_to_page(smfn)->tlbflush_timestamp;
1624 min_shadow = SHADOW_MIN(min_max_shadow);
1625 max_shadow = SHADOW_MAX(min_max_shadow);
1627 min_max_snapshot= mfn_to_page(entry->snapshot_mfn)->tlbflush_timestamp;
1628 min_snapshot = SHADOW_MIN(min_max_snapshot);
1629 max_snapshot = SHADOW_MAX(min_max_snapshot);
1631 switch ( stype )
1633 case PGT_l1_shadow:
1635 guest_l1_pgentry_t *guest1 = guest;
1636 l1_pgentry_t *shadow1 = shadow;
1637 l1_pgentry_t *shadow1_next = 0, *sl1e_p;
1638 guest_l1_pgentry_t *snapshot1 = snapshot;
1639 int unshadow_l1 = 0;
1641 ASSERT(shadow_mode_write_l1(d) ||
1642 shadow_mode_write_all(d) || shadow_mode_wr_pt_pte(d));
1644 if ( !shadow_mode_refcounts(d) )
1645 revalidate_l1(d, (l1_pgentry_t *)guest1, (l1_pgentry_t *)snapshot1);
1646 if ( !smfn )
1647 break;
1649 changed = 0;
1651 if ( SH_L1_HAS_NEXT_PAGE && shadow1 )
1652 shadow1_next = map_domain_page(smfn + 1);
1654 for ( i = min_shadow; i <= max_shadow; i++ )
1657 if ( SH_L1_HAS_NEXT_PAGE && i >= L1_PAGETABLE_ENTRIES )
1658 sl1e_p = &shadow1_next[i - L1_PAGETABLE_ENTRIES];
1659 else
1660 sl1e_p = &shadow1[i];
1662 if ( (i < min_snapshot) || (i > max_snapshot) ||
1663 guest_l1e_has_changed(guest1[i], snapshot1[i], PAGE_FLAG_MASK) )
1665 int error;
1667 error = validate_pte_change(d, guest1[i], sl1e_p);
1668 if ( error == -1 )
1669 unshadow_l1 = 1;
1670 else {
1671 need_flush |= error;
1672 set_guest_back_ptr(d, *sl1e_p, smfn, i);
1674 // can't update snapshots of linear page tables -- they
1675 // are used multiple times...
1676 //
1677 // snapshot[i] = new_pte;
1679 changed++;
1683 if ( shadow1_next )
1684 unmap_domain_page(shadow1_next);
1686 perfc_incrc(resync_l1);
1687 perfc_incr_histo(wpt_updates, changed, PT_UPDATES);
1688 perfc_incr_histo(l1_entries_checked, max_shadow - min_shadow + 1, PT_UPDATES);
1689 if ( d->arch.ops->guest_paging_levels >= PAGING_L3 &&
1690 unshadow_l1 ) {
1691 pgentry_64_t l2e = { 0 };
1693 __shadow_get_l2e(entry->v, entry->va, &l2e);
1695 if ( entry_get_flags(l2e) & _PAGE_PRESENT ) {
1696 put_shadow_ref(entry_get_pfn(l2e));
1697 l2e = entry_empty();
1698 __shadow_set_l2e(entry->v, entry->va, &l2e);
1700 if (entry->v == current)
1701 need_flush = 1;
1705 break;
1707 #if CONFIG_PAGING_LEVELS == 2
1708 case PGT_l2_shadow:
1710 l2_pgentry_t *guest2 = guest;
1711 l2_pgentry_t *shadow2 = shadow;
1712 l2_pgentry_t *snapshot2 = snapshot;
1714 ASSERT(shadow_mode_write_all(d) || shadow_mode_wr_pt_pte(d));
1715 BUG_ON(!shadow_mode_refcounts(d)); // not yet implemented
1717 changed = 0;
1718 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
1720 if ( !is_guest_l2_slot(0,i) && !external )
1721 continue;
1723 l2_pgentry_t new_pde = guest2[i];
1724 if ( l2e_has_changed(new_pde, snapshot2[i], PAGE_FLAG_MASK))
1726 need_flush |= validate_pde_change(d, new_pde, &shadow2[i]);
1728 // can't update snapshots of linear page tables -- they
1729 // are used multiple times...
1730 //
1731 // snapshot[i] = new_pde;
1733 changed++;
1735 if ( l2e_get_intpte(new_pde) != 0 ) /* FIXME: check flags? */
1736 max = i;
1738 // XXX - This hack works for Linux guests.
1739 // Need a better solution long term.
1740 if ( !(l2e_get_flags(new_pde) & _PAGE_PRESENT) &&
1741 unlikely(l2e_get_intpte(new_pde) != 0) &&
1742 !unshadow && MFN_PINNED(smfn) )
1743 unshadow = 1;
1745 if ( max == -1 )
1746 unshadow = 1;
1747 perfc_incrc(resync_l2);
1748 perfc_incr_histo(shm_l2_updates, changed, PT_UPDATES);
1749 break;
1751 case PGT_hl2_shadow:
1753 l2_pgentry_t *guest2 = guest;
1754 l2_pgentry_t *snapshot2 = snapshot;
1755 l1_pgentry_t *shadow2 = shadow;
1757 ASSERT(shadow_mode_write_all(d) || shadow_mode_wr_pt_pte(d));
1758 BUG_ON(!shadow_mode_refcounts(d)); // not yet implemented
1760 changed = 0;
1761 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
1763 if ( !is_guest_l2_slot(0, i) && !external )
1764 continue;
1766 l2_pgentry_t new_pde = guest2[i];
1767 if ( l2e_has_changed(new_pde, snapshot2[i], PAGE_FLAG_MASK) )
1769 need_flush |= validate_hl2e_change(d, new_pde, &shadow2[i]);
1771 // can't update snapshots of linear page tables -- they
1772 // are used multiple times...
1773 //
1774 // snapshot[i] = new_pde;
1776 changed++;
1779 perfc_incrc(resync_hl2);
1780 perfc_incr_histo(shm_hl2_updates, changed, PT_UPDATES);
1781 break;
1783 #elif CONFIG_PAGING_LEVELS >= 3
1784 case PGT_l2_shadow:
1785 case PGT_l3_shadow:
1787 pgentry_64_t *guest_pt = guest;
1788 pgentry_64_t *shadow_pt = shadow;
1789 pgentry_64_t *snapshot_pt = snapshot;
1791 changed = 0;
1792 for ( i = min_shadow; i <= max_shadow; i++ )
1794 if ( (i < min_snapshot) || (i > max_snapshot) ||
1795 entry_has_changed(
1796 guest_pt[i], snapshot_pt[i], PAGE_FLAG_MASK) )
1798 need_flush |= validate_entry_change(
1799 d, &guest_pt[i], &shadow_pt[i],
1800 shadow_type_to_level(stype));
1801 changed++;
1803 #if CONFIG_PAGING_LEVELS == 3
1804 if ( stype == PGT_l3_shadow )
1806 if ( entry_get_value(guest_pt[i]) != 0 )
1807 max = i;
1809 if ( !(entry_get_flags(guest_pt[i]) & _PAGE_PRESENT) &&
1810 unlikely(entry_get_value(guest_pt[i]) != 0) &&
1811 !unshadow &&
1812 (mfn_to_page(smfn)->u.inuse.type_info & PGT_pinned) )
1813 unshadow = 1;
1815 #endif
1818 if ( d->arch.ops->guest_paging_levels == PAGING_L3
1819 && max == -1 && stype == PGT_l3_shadow )
1820 unshadow = 1;
1822 perfc_incrc(resync_l3);
1823 perfc_incr_histo(shm_l3_updates, changed, PT_UPDATES);
1824 break;
1826 case PGT_l4_shadow:
1828 guest_root_pgentry_t *guest_root = guest;
1829 guest_root_pgentry_t *snapshot_root = snapshot;
1831 changed = 0;
1832 for ( i = 0; i < GUEST_ROOT_PAGETABLE_ENTRIES; i++ )
1834 guest_root_pgentry_t new_root_e = guest_root[i];
1835 if ( !is_guest_l4_slot(i) && !external )
1836 continue;
1837 if ( root_entry_has_changed(
1838 new_root_e, snapshot_root[i], PAGE_FLAG_MASK))
1840 #ifndef GUEST_PGENTRY_32
1841 l4_pgentry_t *shadow4 = shadow;
1843 if ( d->arch.ops->guest_paging_levels == PAGING_L4 )
1845 need_flush |= validate_entry_change(
1846 d, (pgentry_64_t *)&new_root_e,
1847 (pgentry_64_t *)&shadow4[i], shadow_type_to_level(stype));
1849 else
1850 #endif
1852 validate_bl2e_change(d, &new_root_e, shadow, i);
1854 changed++;
1855 ESH_LOG("%d: shadow4 mfn: %lx, shadow root: %lx\n", i,
1856 smfn, pagetable_get_paddr(current->arch.shadow_table));
1858 if ( guest_root_get_intpte(new_root_e) != 0 ) /* FIXME: check flags? */
1859 max = i;
1861 // Need a better solution in the long term.
1862 if ( !(guest_root_get_flags(new_root_e) & _PAGE_PRESENT) &&
1863 unlikely(guest_root_get_intpte(new_root_e) != 0) &&
1864 !unshadow &&
1865 (mfn_to_page(smfn)->u.inuse.type_info & PGT_pinned) )
1866 unshadow = 1;
1868 if ( max == -1 )
1869 unshadow = 1;
1870 perfc_incrc(resync_l4);
1871 perfc_incr_histo(shm_l4_updates, changed, PT_UPDATES);
1872 break;
1875 #endif /* CONFIG_PAGING_LEVELS >= 3 */
1876 default:
1877 BUG();
1880 if ( smfn )
1881 unmap_domain_page(shadow);
1882 unmap_domain_page(snapshot);
1883 unmap_domain_page(guest);
1885 if ( unlikely(unshadow) )
1887 for_each_vcpu(d, v)
1888 if(smfn == pagetable_get_pfn(v->arch.shadow_table))
1889 return need_flush;
1890 perfc_incrc(unshadow_l2_count);
1891 shadow_unpin(smfn);
1892 #if CONFIG_PAGING_LEVELS == 2
1893 if ( unlikely(shadow_mode_external(d)) )
1895 unsigned long hl2mfn;
1897 if ( (hl2mfn = __shadow_status(d, entry->gpfn, PGT_hl2_shadow)) &&
1898 MFN_PINNED(hl2mfn) )
1899 shadow_unpin(hl2mfn);
1901 #endif
1905 return need_flush;
1908 #if CONFIG_PAGING_LEVELS == 2
1909 static int resync_all_levels_guest_page(struct domain *d)
1911 int need_flush = 0;
1913 need_flush |= resync_all(d, PGT_l1_shadow);
1914 if ( d->arch.ops->guest_paging_levels == PAGING_L2 &&
1915 shadow_mode_translate(d) )
1917 need_flush |= resync_all(d, PGT_hl2_shadow);
1919 return need_flush;
1921 #elif CONFIG_PAGING_LEVELS == 3
1922 static int resync_all_levels_guest_page(struct domain *d)
1924 int need_flush = 0;
1926 need_flush |= resync_all(d, PGT_l1_shadow);
1927 if ( d->arch.ops->guest_paging_levels == PAGING_L2 )
1928 need_flush |= resync_all(d, PGT_l4_shadow);
1929 else
1931 need_flush |= resync_all(d, PGT_l2_shadow);
1932 if ( shadow_mode_log_dirty(d) )
1934 need_flush |= resync_all(d, PGT_l3_shadow);
1935 need_flush |= resync_all(d, PGT_l4_shadow);
1937 else
1938 resync_pae_guest_l3(d);
1941 return need_flush;
1943 #elif CONFIG_PAGING_LEVELS == 4
1944 static int resync_all_levels_guest_page(struct domain *d)
1946 int need_flush = 0;
1948 need_flush |= resync_all(d, PGT_l1_shadow);
1949 if ( d->arch.ops->guest_paging_levels == PAGING_L2 )
1950 need_flush |= resync_all(d, PGT_l4_shadow);
1951 else
1953 need_flush |= resync_all(d, PGT_l2_shadow);
1954 if ( d->arch.ops->guest_paging_levels == PAGING_L3 )
1955 resync_pae_guest_l3(d);
1956 else
1958 need_flush |= resync_all(d, PGT_l3_shadow);
1959 need_flush |= resync_all(d, PGT_l4_shadow);
1962 return need_flush;
1964 #endif
1966 static void sync_all(struct domain *d)
1968 struct out_of_sync_entry *entry;
1969 int need_flush = 0;
1970 l1_pgentry_t *ppte, opte, npte;
1971 cpumask_t other_vcpus_mask;
1973 perfc_incrc(shadow_sync_all);
1975 ASSERT(shadow_lock_is_acquired(d));
1977 // First, remove all write permissions to the page tables
1978 //
1979 for ( entry = d->arch.out_of_sync; entry; entry = entry->next)
1981 // Skip entries that have low bits set... Those aren't
1982 // real PTEs.
1983 //
1984 if ( entry->writable_pl1e & (sizeof(l1_pgentry_t)-1) )
1985 continue;
1987 ppte = (l1_pgentry_t *)(
1988 (char *)map_domain_page(entry->writable_pl1e >> PAGE_SHIFT) +
1989 (entry->writable_pl1e & ~PAGE_MASK));
1990 opte = npte = *ppte;
1991 l1e_remove_flags(npte, _PAGE_RW);
1993 if ( (l1e_get_flags(npte) & _PAGE_PRESENT) &&
1994 !shadow_get_page_from_l1e(npte, d) )
1995 BUG();
1996 *ppte = npte;
1997 set_guest_back_ptr(d, npte, (entry->writable_pl1e) >> PAGE_SHIFT,
1998 (entry->writable_pl1e & ~PAGE_MASK)/sizeof(l1_pgentry_t));
1999 shadow_put_page_from_l1e(opte, d);
2001 unmap_domain_page(ppte);
2004 /* Other VCPUs mustn't use the revoked writable mappings. */
2005 other_vcpus_mask = d->domain_dirty_cpumask;
2006 cpu_clear(smp_processor_id(), other_vcpus_mask);
2007 flush_tlb_mask(other_vcpus_mask);
2009 /* Flush ourself later. */
2010 need_flush = 1;
2012 need_flush |= resync_all_levels_guest_page(d);
2014 if ( need_flush && !unlikely(shadow_mode_external(d)) )
2015 local_flush_tlb();
2017 free_out_of_sync_state(d);
2020 static inline int l1pte_write_fault(
2021 struct vcpu *v, guest_l1_pgentry_t *gpte_p, l1_pgentry_t *spte_p,
2022 unsigned long va)
2024 struct domain *d = v->domain;
2025 guest_l1_pgentry_t gpte = *gpte_p;
2026 l1_pgentry_t spte;
2027 unsigned long gpfn = l1e_get_pfn(gpte);
2028 unsigned long gmfn = gmfn_to_mfn(d, gpfn);
2030 //printk("l1pte_write_fault gmfn=%lx\n", gmfn);
2032 if ( unlikely(!VALID_MFN(gmfn)) )
2034 SH_VLOG("l1pte_write_fault: invalid gpfn=%lx", gpfn);
2035 *spte_p = l1e_empty();
2036 return 0;
2039 ASSERT(guest_l1e_get_flags(gpte) & _PAGE_RW);
2040 guest_l1e_add_flags(gpte, _PAGE_DIRTY | _PAGE_ACCESSED);
2041 spte = l1e_from_pfn(gmfn, guest_l1e_get_flags(gpte) & ~_PAGE_GLOBAL);
2043 SH_VVLOG("l1pte_write_fault: updating spte=0x%" PRIpte " gpte=0x%" PRIpte,
2044 l1e_get_intpte(spte), l1e_get_intpte(gpte));
2046 __mark_dirty(d, gmfn);
2048 if ( mfn_is_page_table(gmfn) )
2049 shadow_mark_va_out_of_sync(v, gpfn, gmfn, va);
2051 *gpte_p = gpte;
2052 *spte_p = spte;
2054 return 1;
2057 static inline int l1pte_read_fault(
2058 struct domain *d, guest_l1_pgentry_t *gpte_p, l1_pgentry_t *spte_p)
2060 guest_l1_pgentry_t gpte = *gpte_p;
2061 l1_pgentry_t spte = *spte_p;
2062 unsigned long pfn = l1e_get_pfn(gpte);
2063 unsigned long mfn = gmfn_to_mfn(d, pfn);
2065 if ( unlikely(!VALID_MFN(mfn)) )
2067 SH_VLOG("l1pte_read_fault: invalid gpfn=%lx", pfn);
2068 *spte_p = l1e_empty();
2069 return 0;
2072 guest_l1e_add_flags(gpte, _PAGE_ACCESSED);
2073 spte = l1e_from_pfn(mfn, guest_l1e_get_flags(gpte) & ~_PAGE_GLOBAL);
2075 if ( shadow_mode_log_dirty(d) || !(guest_l1e_get_flags(gpte) & _PAGE_DIRTY) ||
2076 mfn_is_page_table(mfn) )
2078 l1e_remove_flags(spte, _PAGE_RW);
2081 SH_VVLOG("l1pte_read_fault: updating spte=0x%" PRIpte " gpte=0x%" PRIpte,
2082 l1e_get_intpte(spte), l1e_get_intpte(gpte));
2083 *gpte_p = gpte;
2084 *spte_p = spte;
2086 return 1;
2088 #if CONFIG_PAGING_LEVELS == 2
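/*
 * shadow_fault_32() handles a #PF for a 2-level guest.  Bit 1 of the
 * hardware error code (tested as "error_code & 2" below) distinguishes
 * write faults from read faults.
 */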
2089 static int shadow_fault_32(unsigned long va, struct cpu_user_regs *regs)
2091 l1_pgentry_t gpte, spte, orig_gpte;
2092 struct vcpu *v = current;
2093 struct domain *d = v->domain;
2094 l2_pgentry_t gpde;
2096 spte = l1e_empty();
2098 SH_VVLOG("shadow_fault( va=%lx, code=%lu )",
2099 va, (unsigned long)regs->error_code);
2100 perfc_incrc(shadow_fault_calls);
2102 check_pagetable(v, "pre-sf");
2104 /*
2105 * Don't let someone else take the guest's table pages out-of-sync.
2106 */
2107 shadow_lock(d);
2109 /* XXX - FIX THIS COMMENT!!!
2110 * STEP 1. Check to see if this fault might have been caused by an
2111 * out-of-sync table page entry, or if we should pass this
2112 * fault onto the guest.
2113 */
2114 __shadow_sync_va(v, va);
2116 /*
2117 * STEP 2. Check the guest PTE.
2118 */
2119 __guest_get_l2e(v, va, &gpde);
2120 if ( unlikely(!(l2e_get_flags(gpde) & _PAGE_PRESENT)) )
2122 SH_VVLOG("shadow_fault - EXIT: L1 not present");
2123 perfc_incrc(shadow_fault_bail_pde_not_present);
2124 goto fail;
2127 // This can't fault because we hold the shadow lock and we've ensured that
2128 // the mapping is in-sync, so the check of the PDE's present bit, above,
2129 // covers this access.
2130 //
2131 //orig_gpte = gpte = linear_pg_table[l1_linear_offset(va)];
2132 __guest_get_l1e(v, va, &gpte);
2133 orig_gpte = gpte;
2135 if ( unlikely(!(l1e_get_flags(gpte) & _PAGE_PRESENT)) )
2137 SH_VVLOG("shadow_fault - EXIT: gpte not present (%" PRIpte ")",
2138 l1e_get_intpte(gpte));
2139 perfc_incrc(shadow_fault_bail_pte_not_present);
2140 goto fail;
2143 /* Write fault? */
2144 if ( regs->error_code & 2 )
2146 int allow_writes = 0;
2148 if ( unlikely(!(l1e_get_flags(gpte) & _PAGE_RW)) )
2150 if ( shadow_mode_page_writable(va, regs, l1e_get_pfn(gpte)) )
2152 allow_writes = 1;
2153 l1e_add_flags(gpte, _PAGE_RW);
2155 else
2157 /* Write fault on a read-only mapping. */
2158 SH_VVLOG("shadow_fault - EXIT: wr fault on RO page (%" PRIpte ")",
2159 l1e_get_intpte(gpte));
2160 perfc_incrc(shadow_fault_bail_ro_mapping);
2161 goto fail;
2164 else if ( unlikely(!shadow_mode_wr_pt_pte(d) && mfn_is_page_table(l1e_get_pfn(gpte))) )
2166 SH_LOG("l1pte_write_fault: no write access to page table page");
2167 domain_crash_synchronous();
2170 if ( unlikely(!l1pte_write_fault(v, &gpte, &spte, va)) )
2172 SH_VVLOG("shadow_fault - EXIT: l1pte_write_fault failed");
2173 perfc_incrc(write_fault_bail);
2174 shadow_unlock(d);
2175 return 0;
2178 if ( allow_writes )
2179 l1e_remove_flags(gpte, _PAGE_RW);
2181 else
2183 if ( !l1pte_read_fault(d, &gpte, &spte) )
2185 SH_VVLOG("shadow_fault - EXIT: l1pte_read_fault failed");
2186 perfc_incrc(read_fault_bail);
2187 shadow_unlock(d);
2188 return 0;
2192 /*
2193 * STEP 3. Write the modified shadow PTE and guest PTE back to the tables.
2194 */
2195 if ( l1e_has_changed(orig_gpte, gpte, PAGE_FLAG_MASK) )
2197 /* XXX Watch out for read-only L2 entries! (not used in Linux). */
2198 /*if ( unlikely(__copy_to_user(&linear_pg_table[l1_linear_offset(va)],
2199 &gpte, sizeof(gpte))) )*/
2200 if ( unlikely(!__guest_set_l1e(v, va, &gpte)))
2202 printk("%s() failed, crashing domain %d "
2203 "due to a read-only L2 page table (gpde=%" PRIpte "), va=%lx\n",
2204 __func__,d->domain_id, l2e_get_intpte(gpde), va);
2205 domain_crash_synchronous();
2208 __mark_dirty(d, gmfn_to_mfn(d, l2e_get_pfn(gpde)));
2211 shadow_set_l1e(va, spte, 1);
2213 perfc_incrc(shadow_fault_fixed);
2214 d->arch.shadow_fault_count++;
2216 shadow_unlock(d);
2218 check_pagetable(v, "post-sf");
2219 return EXCRET_fault_fixed;
2221 fail:
2222 shadow_unlock(d);
2223 return 0;
2225 #endif /* CONFIG_PAGING_LEVELS == 2 */
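/*
 * va_to_l1mfn() returns the machine frame of the guest L1 table that
 * maps va, or INVALID_MFN if the guest L2 entry for va is not present.
 */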
2227 static inline unsigned long va_to_l1mfn(struct vcpu *v, unsigned long va)
2229 struct domain *d = v->domain;
2230 guest_l2_pgentry_t gl2e = {0};
2232 __guest_get_l2e(v, va, &gl2e);
2234 if ( unlikely(!(guest_l2e_get_flags(gl2e) & _PAGE_PRESENT)) )
2235 return INVALID_MFN;
2237 return gmfn_to_mfn(d, l2e_get_pfn(gl2e));
2240 static int do_update_va_mapping(unsigned long va,
2241 l1_pgentry_t val,
2242 struct vcpu *v)
2244 struct domain *d = v->domain;
2245 l1_pgentry_t spte;
2246 int rc = 0;
2248 shadow_lock(d);
2250 // This is actually overkill - we don't need to sync the L1 itself,
2251 // just everything involved in getting to this L1 (i.e. we need
2252 // linear_pg_table[l1_linear_offset(va)] to be in sync)...
2253 //
2254 __shadow_sync_va(v, va);
2256 l1pte_propagate_from_guest(d, *(guest_l1_pgentry_t *)&val, &spte);
2257 #if CONFIG_PAGING_LEVELS == 2
2258 shadow_set_l1e(va, spte, 0);
2259 #elif CONFIG_PAGING_LEVELS >= 3
2260 shadow_set_l1e_64(va, (pgentry_64_t *) &spte, 0);
2261 #endif
2262 /*
2263 * If we're in log-dirty mode then we need to note that we've updated
2264 * the PTE in the PT-holding page. We need the machine frame number
2265 * for this.
2266 */
2267 __mark_dirty(d, va_to_l1mfn(v, va));
2269 shadow_unlock(d);
2271 return rc;
2275 /*
2276 * What lives where in the 32-bit address space in the various shadow modes,
2277 * and what it uses to get/maintain that mapping.
2279 * SHADOW MODE: none enable translate external
2281 * 4KB things:
2282 * guest_vtable lin_l2 mapped per gl2 lin_l2 via hl2 mapped per gl2
2283 * shadow_vtable n/a sh_lin_l2 sh_lin_l2 mapped per gl2
2284 * hl2_vtable n/a n/a lin_hl2 via hl2 mapped per gl2
2285 * monitor_vtable n/a n/a n/a mapped once
2287 * 4MB things:
2288 * guest_linear lin via gl2 lin via gl2 lin via hl2 lin via hl2
2289 * shadow_linear n/a sh_lin via sl2 sh_lin via sl2 sh_lin via sl2
2290 * monitor_linear n/a n/a n/a ???
2291 * perdomain perdomain perdomain perdomain perdomain
2292 * R/O M2P R/O M2P R/O M2P n/a n/a
2293 * R/W M2P R/W M2P R/W M2P R/W M2P R/W M2P
2294 * P2M n/a n/a R/O M2P R/O M2P
2296 * NB:
2297 * update_pagetables(), shadow_update_pagetables(), shadow_mode_enable(),
2298 * shadow_l2_table(), shadow_hl2_table(), and alloc_monitor_pagetable()
2299 * all play a part in maintaining these mappings.
2300 */
2301 static void shadow_update_pagetables(struct vcpu *v)
2303 struct domain *d = v->domain;
2304 #if CONFIG_PAGING_LEVELS == 4
2305 unsigned long gmfn = ((v->arch.flags & TF_kernel_mode)?
2306 pagetable_get_pfn(v->arch.guest_table) :
2307 pagetable_get_pfn(v->arch.guest_table_user));
2308 #else
2309 unsigned long gmfn = pagetable_get_pfn(v->arch.guest_table);
2310 #endif
2312 unsigned long gpfn = mfn_to_gmfn(d, gmfn);
2313 unsigned long smfn, old_smfn;
2315 #if CONFIG_PAGING_LEVELS == 2
2316 unsigned long hl2mfn;
2317 #endif
2318 int need_sync = 0;
2320 int max_mode = ( shadow_mode_external(d) ? SHM_external
2321 : shadow_mode_translate(d) ? SHM_translate
2322 : shadow_mode_enabled(d) ? SHM_enable
2323 : 0 );
2325 ASSERT( ! IS_INVALID_M2P_ENTRY(gpfn) );
2326 ASSERT( max_mode );
2328 /*
2329 * arch.guest_vtable
2330 */
2331 if ( max_mode & (SHM_enable | SHM_external) )
2333 if ( likely(v->arch.guest_vtable != NULL) )
2334 unmap_domain_page_global(v->arch.guest_vtable);
2335 v->arch.guest_vtable = map_domain_page_global(gmfn);
2338 #if CONFIG_PAGING_LEVELS >= 3
2339 /*
2340 * Handle 32-bit PAE enabled guest
2341 */
2342 if ( SH_GUEST_32PAE && d->arch.ops->guest_paging_levels == PAGING_L3 )
2344 u32 index = get_cr3_idxval(v);
2345 gpfn = (index << PGT_score_shift) | gpfn;
2347 #endif
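/*
 * For a 32-bit PAE guest the CR3 index value (get_cr3_idxval()) has just
 * been folded into gpfn via PGT_score_shift, so different index values
 * look up different entries in the shadow status table below.
 */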
2349 /*
2350 * arch.shadow_table
2351 */
2352 if ( unlikely(!(smfn = __shadow_status(d, gpfn, PGT_base_page_table))) )
2354 #if CONFIG_PAGING_LEVELS == 2
2355 smfn = shadow_l2_table(v, gpfn, gmfn);
2356 #elif CONFIG_PAGING_LEVELS == 3
2357 smfn = shadow_l3_table(v, gpfn, gmfn);
2358 #elif CONFIG_PAGING_LEVELS == 4
2359 smfn = shadow_l4_table(v, gpfn, gmfn);
2360 #endif
2362 else
2364 #if CONFIG_PAGING_LEVELS >= 3
2365 if ( SH_GUEST_32PAE && d->arch.ops->guest_paging_levels == PAGING_L3 )
2366 update_top_level_shadow(v, smfn);
2367 #endif
2368 /*
2369 * Defer the sync until later, so that this smfn is not
2370 * occasionally unshadowed in the meantime.
2371 */
2372 need_sync = 1;
2376 if ( !get_shadow_ref(smfn) )
2377 BUG();
2378 old_smfn = pagetable_get_pfn(v->arch.shadow_table);
2379 v->arch.shadow_table = mk_pagetable((u64)smfn << PAGE_SHIFT);
2380 if ( old_smfn )
2381 put_shadow_ref(old_smfn);
2383 SH_VVLOG("shadow_update_pagetables(gmfn=%lx, smfn=%lx)", gmfn, smfn);
2385 /*
2386 * arch.shadow_vtable
2387 */
2388 if ( max_mode == SHM_external
2389 #if CONFIG_PAGING_LEVELS >=3
2390 || max_mode & SHM_enable
2391 #endif
2394 if ( v->arch.shadow_vtable )
2395 unmap_domain_page_global(v->arch.shadow_vtable);
2396 v->arch.shadow_vtable = map_domain_page_global(smfn);
2399 #if CONFIG_PAGING_LEVELS == 2
2400 /*
2401 * arch.hl2_vtable
2402 */
2404 // if max_mode == SHM_translate, then the hl2 is already installed
2405 // correctly in its smfn, and there's nothing to do.
2406 //
2407 if ( max_mode == SHM_external )
2409 if ( unlikely(!(hl2mfn = __shadow_status(d, gpfn, PGT_hl2_shadow))) )
2410 hl2mfn = shadow_hl2_table(d, gpfn, gmfn, smfn);
2411 if ( v->arch.hl2_vtable )
2412 unmap_domain_page_global(v->arch.hl2_vtable);
2413 v->arch.hl2_vtable = map_domain_page_global(hl2mfn);
2416 /*
2417 * fixup pointers in monitor table, as necessary
2418 */
2419 if ( max_mode == SHM_external )
2421 l2_pgentry_t *mpl2e = v->arch.monitor_vtable;
2422 l2_pgentry_t old_hl2e = mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)];
2423 l2_pgentry_t old_sl2e = mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)];
2425 ASSERT( shadow_mode_translate(d) );
2427 if ( !get_shadow_ref(hl2mfn) )
2428 BUG();
2429 mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)] =
2430 l2e_from_pfn(hl2mfn, __PAGE_HYPERVISOR);
2431 if ( l2e_get_flags(old_hl2e) & _PAGE_PRESENT )
2432 put_shadow_ref(l2e_get_pfn(old_hl2e));
2434 if ( !get_shadow_ref(smfn) )
2435 BUG();
2436 mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
2437 l2e_from_pfn(smfn, __PAGE_HYPERVISOR);
2438 if ( l2e_get_flags(old_sl2e) & _PAGE_PRESENT )
2439 put_shadow_ref(l2e_get_pfn(old_sl2e));
2441 // XXX - maybe this can be optimized somewhat??
2442 local_flush_tlb();
2444 #endif /* CONFIG_PAGING_LEVELS == 2 */
2446 #if CONFIG_PAGING_LEVELS == 3
2447 /*
2448 * fixup pointers in monitor table, as necessary
2449 */
2450 if ( max_mode == SHM_external )
2452 l3_pgentry_t *mpl3e = (l3_pgentry_t *) v->arch.monitor_vtable;
2453 l2_pgentry_t *spl2e;
2454 unsigned long s2mfn;
2455 int i;
2457 ASSERT( shadow_mode_translate(d) );
2458 s2mfn = l3e_get_pfn(mpl3e[L3_PAGETABLE_ENTRIES - 1]);
2460 ASSERT( s2mfn);
2461 spl2e = map_domain_page(s2mfn);
2463 for ( i = 0; i < (LINEARPT_MBYTES >> (L2_PAGETABLE_SHIFT - 20)); i++ )
2464 spl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START) + i] =
2465 (l3e_get_flags(mpl3e[i]) & _PAGE_PRESENT) ?
2466 l2e_from_pfn(l3e_get_pfn(mpl3e[i]), __PAGE_HYPERVISOR) :
2467 l2e_empty();
2469 unmap_domain_page(spl2e);
2470 local_flush_tlb();
2472 #endif
2474 if ( likely(need_sync) )
2475 shadow_sync_all(d);
2479 /************************************************************************/
2480 /************************************************************************/
2481 /************************************************************************/
2483 #if 0 // this code has not been updated for 32pae & 64 bit modes
2484 #if SHADOW_DEBUG
2486 // The following is entirely for _check_pagetable()'s benefit.
2487 // _check_pagetable() wants to know whether a given entry in a
2488 // shadow page table is supposed to be the shadow of the guest's
2489 // current entry, or the shadow of the entry held in the snapshot
2490 // taken above.
2491 //
2492 // Here, we mark all currently existing entries as reflecting
2493 // the snapshot, above. All other places in xen that update
2494 // the shadow will keep the shadow in sync with the guest's
2495 // entries (via l1pte_propagate_from_guest and friends), which clear
2496 // the SHADOW_REFLECTS_SNAPSHOT bit.
2497 //
2498 static void
2499 mark_shadows_as_reflecting_snapshot(struct domain *d, unsigned long gpfn)
2501 unsigned long smfn;
2502 l1_pgentry_t *l1e;
2503 l2_pgentry_t *l2e;
2504 unsigned i;
2506 if ( (smfn = __shadow_status(d, gpfn, PGT_l1_shadow)) )
2508 l1e = map_domain_page(smfn);
2509 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
2510 if ( is_guest_l1_slot(i) &&
2511 (l1e_get_flags(l1e[i]) & _PAGE_PRESENT) )
2512 l1e_add_flags(l1e[i], SHADOW_REFLECTS_SNAPSHOT);
2513 unmap_domain_page(l1e);
2516 if ( (smfn = __shadow_status(d, gpfn, PGT_l2_shadow)) )
2518 l2e = map_domain_page(smfn);
2519 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
2520 if ( is_guest_l2_slot(0, i) &&
2521 (l2e_get_flags(l2e[i]) & _PAGE_PRESENT) )
2522 l2e_add_flags(l2e[i], SHADOW_REFLECTS_SNAPSHOT);
2523 unmap_domain_page(l2e);
2527 // BUG: these are not SMP safe...
2528 static int sh_l2_present;
2529 static int sh_l1_present;
2530 static char *sh_check_name;
2531 // int shadow_status_noswap; // declared in shadow32.c
2533 #define v2m(_v, _adr) ({ \
2534 unsigned long _a = (unsigned long)(_adr); \
2535 l2_pgentry_t _pde = shadow_linear_l2_table(_v)[l2_table_offset(_a)]; \
2536 unsigned long _pa = -1; \
2537 if ( l2e_get_flags(_pde) & _PAGE_PRESENT ) \
2538 { \
2539 l1_pgentry_t _pte; \
2540 _pte = shadow_linear_pg_table[l1_linear_offset(_a)]; \
2541 if ( l1e_get_flags(_pte) & _PAGE_PRESENT ) \
2542 _pa = l1e_get_paddr(_pte); \
2543 } \
2544 _pa | (_a & ~PAGE_MASK); \
2545 })
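/*
 * v2m() translates a hypervisor virtual address to a machine-physical
 * address by walking the shadow linear mappings; it is used by the
 * FAIL() diagnostics below.
 */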
2547 #define FAIL(_f, _a...) \
2548 do { \
2549 printk("XXX %s-FAIL (%d,%d,%d) " _f " at %s(%d)\n", \
2550 sh_check_name, level, l2_idx, l1_idx, ## _a, \
2551 __FILE__, __LINE__); \
2552 printk("guest_pte=%" PRIpte " eff_guest_pte=%" PRIpte \
2553 " shadow_pte=%" PRIpte " snapshot_pte=%" PRIpte \
2554 " &guest=%p &shadow=%p &snap=%p v2m(&guest)=%p" \
2555 " v2m(&shadow)=%p v2m(&snap)=%p ea=%08x\n", \
2556 l1e_get_intpte(guest_pte), l1e_get_intpte(eff_guest_pte), \
2557 l1e_get_intpte(shadow_pte), l1e_get_intpte(snapshot_pte), \
2558 p_guest_pte, p_shadow_pte, p_snapshot_pte, \
2559 (void *)v2m(v, p_guest_pte), (void *)v2m(v, p_shadow_pte), \
2560 (void *)v2m(v, p_snapshot_pte), \
2561 (l2_idx << L2_PAGETABLE_SHIFT) | \
2562 (l1_idx << L1_PAGETABLE_SHIFT)); \
2563 errors++; \
2564 } while ( 0 )
2566 static int check_pte(
2567 struct vcpu *v,
2568 l1_pgentry_t *p_guest_pte,
2569 l1_pgentry_t *p_shadow_pte,
2570 l1_pgentry_t *p_snapshot_pte,
2571 int level, int l2_idx, int l1_idx)
2573 struct domain *d = v->domain;
2574 l1_pgentry_t guest_pte = *p_guest_pte;
2575 l1_pgentry_t shadow_pte = *p_shadow_pte;
2576 l1_pgentry_t snapshot_pte = p_snapshot_pte ? *p_snapshot_pte : l1e_empty();
2577 l1_pgentry_t eff_guest_pte;
2578 unsigned long mask, eff_guest_pfn, eff_guest_mfn, shadow_mfn;
2579 int errors = 0, guest_writable;
2580 int page_table_page;
2582 if ( (l1e_get_intpte(shadow_pte) == 0) ||
2583 (l1e_get_intpte(shadow_pte) == 0xdeadface) ||
2584 (l1e_get_intpte(shadow_pte) == 0x00000E00) )
2585 return errors; /* always safe */
2587 if ( !(l1e_get_flags(shadow_pte) & _PAGE_PRESENT) )
2588 FAIL("Non zero not present shadow_pte");
2590 if ( level == 2 ) sh_l2_present++;
2591 if ( level == 1 ) sh_l1_present++;
2593 if ( (l1e_get_flags(shadow_pte) & SHADOW_REFLECTS_SNAPSHOT) && p_snapshot_pte )
2594 eff_guest_pte = snapshot_pte;
2595 else
2596 eff_guest_pte = guest_pte;
2598 if ( !(l1e_get_flags(eff_guest_pte) & _PAGE_PRESENT) )
2599 FAIL("Guest not present yet shadow is");
2601 mask = ~(_PAGE_GLOBAL|_PAGE_DIRTY|_PAGE_ACCESSED|_PAGE_RW|_PAGE_AVAIL|PAGE_MASK);
2603 if ( ((l1e_get_intpte(shadow_pte) & mask) != (l1e_get_intpte(eff_guest_pte) & mask)) )
2604 FAIL("Corrupt?");
2606 if ( (level == 1) &&
2607 (l1e_get_flags(shadow_pte) & _PAGE_DIRTY) &&
2608 !(l1e_get_flags(eff_guest_pte) & _PAGE_DIRTY) )
2609 FAIL("Dirty coherence");
2611 if ( (l1e_get_flags(shadow_pte) & _PAGE_ACCESSED) &&
2612 !(l1e_get_flags(eff_guest_pte) & _PAGE_ACCESSED) )
2613 FAIL("Accessed coherence");
2615 if ( l1e_get_flags(shadow_pte) & _PAGE_GLOBAL )
2616 FAIL("global bit set in shadow");
2618 eff_guest_pfn = l1e_get_pfn(eff_guest_pte);
2619 eff_guest_mfn = gmfn_to_mfn(d, eff_guest_pfn);
2620 shadow_mfn = l1e_get_pfn(shadow_pte);
2622 if ( !VALID_MFN(eff_guest_mfn) && !shadow_mode_refcounts(d) )
2623 FAIL("%s: invalid eff_guest_pfn=%lx eff_guest_pte=%" PRIpte "\n",
2624 __func__, eff_guest_pfn, l1e_get_intpte(eff_guest_pte));
2626 page_table_page = mfn_is_page_table(eff_guest_mfn);
2628 guest_writable =
2629 (l1e_get_flags(eff_guest_pte) & _PAGE_RW) ||
2630 (shadow_mode_write_l1(d) && (level == 1) && mfn_out_of_sync(eff_guest_mfn));
2632 if ( (l1e_get_flags(shadow_pte) & _PAGE_RW ) && !guest_writable )
2634 printk("eff_guest_pfn=%lx eff_guest_mfn=%lx shadow_mfn=%lx t=0x%08lx page_table_page=%d\n",
2635 eff_guest_pfn, eff_guest_mfn, shadow_mfn,
2636 mfn_to_page(eff_guest_mfn)->u.inuse.type_info,
2637 page_table_page);
2638 FAIL("RW coherence");
2641 if ( (level == 1) &&
2642 (l1e_get_flags(shadow_pte) & _PAGE_RW ) &&
2643 !(guest_writable && (l1e_get_flags(eff_guest_pte) & _PAGE_DIRTY)) )
2645 printk("eff_guest_pfn=%lx eff_guest_mfn=%lx shadow_mfn=%lx t=0x%08lx page_table_page=%d\n",
2646 eff_guest_pfn, eff_guest_mfn, shadow_mfn,
2647 mfn_to_page(eff_guest_mfn)->u.inuse.type_info,
2648 page_table_page);
2649 FAIL("RW2 coherence");
2652 if ( eff_guest_mfn == shadow_mfn )
2654 if ( level > 1 )
2655 FAIL("Linear map ???"); /* XXX this will fail on BSD */
2657 else
2659 if ( level < 2 )
2660 FAIL("Shadow in L1 entry?");
2662 if ( level == 2 )
2664 if ( __shadow_status(d, eff_guest_pfn, PGT_l1_shadow) != shadow_mfn )
2665 FAIL("shadow_mfn problem eff_guest_pfn=%lx shadow_mfn=%lx", eff_guest_pfn,
2666 __shadow_status(d, eff_guest_pfn, PGT_l1_shadow));
2668 else
2669 BUG(); // XXX -- not handled yet.
2672 return errors;
2674 #undef FAIL
2675 #undef v2m
2677 static int check_l1_table(
2678 struct vcpu *v, unsigned long gpfn,
2679 unsigned long gmfn, unsigned long smfn, unsigned l2_idx)
2681 struct domain *d = v->domain;
2682 int i;
2683 unsigned long snapshot_mfn;
2684 l1_pgentry_t *p_guest, *p_shadow, *p_snapshot = NULL;
2685 int errors = 0;
2687 if ( page_out_of_sync(mfn_to_page(gmfn)) )
2689 snapshot_mfn = __shadow_status(d, gpfn, PGT_snapshot);
2690 ASSERT(snapshot_mfn);
2691 p_snapshot = map_domain_page(snapshot_mfn);
2694 p_guest = map_domain_page(gmfn);
2695 p_shadow = map_domain_page(smfn);
2697 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
2698 errors += check_pte(v, p_guest+i, p_shadow+i,
2699 p_snapshot ? p_snapshot+i : NULL,
2700 1, l2_idx, i);
2702 unmap_domain_page(p_shadow);
2703 unmap_domain_page(p_guest);
2704 if ( p_snapshot )
2705 unmap_domain_page(p_snapshot);
2707 return errors;
2710 #define FAILPT(_f, _a...) \
2711 do { \
2712 printk("XXX FAIL %s-PT " _f "\n", sh_check_name, ## _a ); \
2713 errors++; \
2714 } while ( 0 )
2716 static int check_l2_table(
2717 struct vcpu *v, unsigned long gmfn, unsigned long smfn, int oos_pdes)
2719 struct domain *d = v->domain;
2720 l2_pgentry_t *gpl2e = (l2_pgentry_t *)map_domain_page(gmfn);
2721 l2_pgentry_t *spl2e = (l2_pgentry_t *)map_domain_page(smfn);
2722 l2_pgentry_t match;
2723 int i;
2724 int errors = 0;
2725 int limit;
2727 if ( !oos_pdes && (page_get_owner(mfn_to_page(gmfn)) != d) )
2728 FAILPT("domain doesn't own page");
2729 if ( oos_pdes && (page_get_owner(mfn_to_page(gmfn)) != NULL) )
2730 FAILPT("bogus owner for snapshot page");
2731 if ( page_get_owner(mfn_to_page(smfn)) != NULL )
2732 FAILPT("shadow page mfn=0x%lx is owned by someone, domid=%d",
2733 smfn, page_get_owner(mfn_to_page(smfn))->domain_id);
2735 #if 0
2736 if ( memcmp(&spl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
2737 &gpl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
2738 ((SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT) -
2739 DOMAIN_ENTRIES_PER_L2_PAGETABLE) * sizeof(l2_pgentry_t)) )
2741 for ( i = DOMAIN_ENTRIES_PER_L2_PAGETABLE;
2742 i < (SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT);
2743 i++ )
2744 printk("+++ (%d) %lx %lx\n",i,
2745 l2_pgentry_val(gpl2e[i]), l2_pgentry_val(spl2e[i]));
2746 FAILPT("hypervisor entries inconsistent");
2749 if ( (l2_pgentry_val(spl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT]) !=
2750 l2_pgentry_val(gpl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT])) )
2751 FAILPT("hypervisor linear map inconsistent");
2752 #endif
2754 match = l2e_from_pfn(smfn, __PAGE_HYPERVISOR);
2755 if ( !shadow_mode_external(d) &&
2756 l2e_has_changed(spl2e[SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT],
2757 match, PAGE_FLAG_MASK))
2759 FAILPT("hypervisor shadow linear map inconsistent %" PRIpte " %" PRIpte,
2760 l2e_get_intpte(spl2e[SH_LINEAR_PT_VIRT_START >>
2761 L2_PAGETABLE_SHIFT]),
2762 l2e_get_intpte(match));
2765 match = l2e_from_paddr(__pa(d->arch.mm_perdomain_pt), __PAGE_HYPERVISOR);
2766 if ( !shadow_mode_external(d) &&
2767 l2e_has_changed(spl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT],
2768 match, PAGE_FLAG_MASK))
2770 FAILPT("hypervisor per-domain map inconsistent saw %" PRIpte ", expected (va=%p) %" PRIpte,
2771 l2e_get_intpte(spl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT]),
2772 d->arch.mm_perdomain_pt,
2773 l2e_get_intpte(match));
2776 #if CONFIG_PAGING_LEVELS == 2
2777 if ( shadow_mode_external(d) )
2778 limit = L2_PAGETABLE_ENTRIES;
2779 else
2780 limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE;
2781 #else
2782 limit = 0; /* XXX x86/64 XXX */
2783 #endif
2785 /* Check the whole L2. */
2786 for ( i = 0; i < limit; i++ )
2787 errors += check_pte(v,
2788 (l1_pgentry_t*)(&gpl2e[i]), /* Hmm, dirty ... */
2789 (l1_pgentry_t*)(&spl2e[i]),
2790 NULL,
2791 2, i, 0);
2793 unmap_domain_page(spl2e);
2794 unmap_domain_page(gpl2e);
2796 #if 1
2797 if ( errors )
2798 printk("check_l2_table returning %d errors\n", errors);
2799 #endif
2801 return errors;
2803 #undef FAILPT
2805 int _check_pagetable(struct vcpu *v, char *s)
2807 struct domain *d = v->domain;
2808 #if CONFIG_PAGING_LEVELS == 4
2809 pagetable_t pt = ((v->arch.flags & TF_kernel_mode)?
2810 v->arch.guest_table : v->arch.guest_table_user);
2811 #else
2812 pagetable_t pt = v->arch.guest_table;
2813 #endif
2814 unsigned long gptbase = pagetable_get_paddr(pt);
2815 unsigned long ptbase_pfn, smfn;
2816 unsigned long i;
2817 l2_pgentry_t *gpl2e, *spl2e;
2818 unsigned long ptbase_mfn = 0;
2819 int errors = 0, limit, oos_pdes = 0;
2821 //_audit_domain(d, AUDIT_QUIET);
2822 shadow_lock(d);
2824 sh_check_name = s;
2825 //SH_VVLOG("%s-PT Audit", s);
2826 sh_l2_present = sh_l1_present = 0;
2827 perfc_incrc(check_pagetable);
2829 ptbase_mfn = gptbase >> PAGE_SHIFT;
2830 ptbase_pfn = mfn_to_gmfn(d, ptbase_mfn);
2832 if ( !(smfn = __shadow_status(d, ptbase_pfn, PGT_base_page_table)) )
2834 printk("%s-PT %lx not shadowed\n", s, gptbase);
2835 goto out;
2837 if ( page_out_of_sync(mfn_to_page(ptbase_mfn)) )
2839 ptbase_mfn = __shadow_status(d, ptbase_pfn, PGT_snapshot);
2840 oos_pdes = 1;
2841 ASSERT(ptbase_mfn);
2844 errors += check_l2_table(v, ptbase_mfn, smfn, oos_pdes);
2846 gpl2e = (l2_pgentry_t *) map_domain_page(ptbase_mfn);
2847 spl2e = (l2_pgentry_t *) map_domain_page(smfn);
2849 /* Go back and recurse. */
2850 #if CONFIG_PAGING_LEVELS == 2
2851 if ( shadow_mode_external(d) )
2852 limit = L2_PAGETABLE_ENTRIES;
2853 else
2854 limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE;
2855 #else
2856 limit = 0; /* XXX x86/64 XXX */
2857 #endif
2859 for ( i = 0; i < limit; i++ )
2861 unsigned long gl1pfn = l2e_get_pfn(gpl2e[i]);
2862 unsigned long gl1mfn = gmfn_to_mfn(d, gl1pfn);
2863 unsigned long sl1mfn = l2e_get_pfn(spl2e[i]);
2865 if ( l2e_get_intpte(spl2e[i]) != 0 ) /* FIXME: check flags? */
2867 errors += check_l1_table(v, gl1pfn, gl1mfn, sl1mfn, i);
2871 unmap_domain_page(spl2e);
2872 unmap_domain_page(gpl2e);
2874 #if 0
2875 SH_VVLOG("PT verified : l2_present = %d, l1_present = %d",
2876 sh_l2_present, sh_l1_present);
2877 #endif
2879 out:
2880 if ( errors )
2881 BUG();
2883 shadow_unlock(d);
2885 return errors;
2888 int _check_all_pagetables(struct vcpu *v, char *s)
2890 struct domain *d = v->domain;
2891 int i;
2892 struct shadow_status *a;
2893 unsigned long gmfn;
2894 int errors = 0;
2896 shadow_status_noswap = 1;
2898 sh_check_name = s;
2899 SH_VVLOG("%s-PT Audit domid=%d", s, d->domain_id);
2900 sh_l2_present = sh_l1_present = 0;
2901 perfc_incrc(check_all_pagetables);
2903 for (i = 0; i < shadow_ht_buckets; i++)
2905 a = &d->arch.shadow_ht[i];
2906 while ( a && a->gpfn_and_flags )
2908 gmfn = gmfn_to_mfn(d, a->gpfn_and_flags & PGT_mfn_mask);
2910 switch ( a->gpfn_and_flags & PGT_type_mask )
2912 case PGT_l1_shadow:
2913 errors += check_l1_table(v, a->gpfn_and_flags & PGT_mfn_mask,
2914 gmfn, a->smfn, 0);
2915 break;
2916 case PGT_l2_shadow:
2917 errors += check_l2_table(v, gmfn, a->smfn,
2918 page_out_of_sync(mfn_to_page(gmfn)));
2919 break;
2920 case PGT_l3_shadow:
2921 case PGT_l4_shadow:
2922 case PGT_hl2_shadow:
2923 BUG(); // XXX - ought to fix this...
2924 break;
2925 case PGT_snapshot:
2926 case PGT_writable_pred:
2927 break;
2928 default:
2929 errors++;
2930 printk("unexpected shadow type %lx, gpfn=%lx, "
2931 "gmfn=%lx smfn=%lx\n",
2932 a->gpfn_and_flags & PGT_type_mask,
2933 a->gpfn_and_flags & PGT_mfn_mask,
2934 gmfn, a->smfn);
2935 BUG();
2937 a = a->next;
2941 shadow_status_noswap = 0;
2943 if ( errors )
2944 BUG();
2946 return errors;
2949 #endif // SHADOW_DEBUG
2950 #endif // this code has not been updated for 32pae & 64 bit modes
2952 #if CONFIG_PAGING_LEVELS >= 3
2953 /****************************************************************************/
2954 /* 64-bit shadow-mode code testing */
2955 /****************************************************************************/
2956 /*
2957 * init_bl2() is for a 32-bit VMX guest on a 64-bit host.
2958 * It uses one shadow L4 (doubling as the L3) and four shadow L2s to simulate the guest L2.
2959 */
2960 static inline unsigned long init_bl2(
2961 struct domain *d, unsigned long gpfn, unsigned long gmfn)
2963 unsigned int count;
2964 unsigned long sl2mfn;
2965 unsigned long smfn;
2966 struct page_info *page;
2967 l4_pgentry_t *spl4e;
2968 void *l2;
2970 if ( unlikely(!(smfn = alloc_shadow_page(d, gpfn, gmfn, PGT_l4_shadow))) )
2972 printk("Couldn't alloc an L4 shadow for pfn=%lx mfn=%lx\n", gpfn, gmfn);
2973 BUG(); /* XXX Deal gracefully with failure. */
2976 spl4e = (l4_pgentry_t *)map_domain_page(smfn);
2978 /* Map the self entry, L4&L3 share the same page */
2979 spl4e[PAE_SHADOW_SELF_ENTRY] = l4e_from_pfn(smfn, __PAGE_HYPERVISOR);
2981 /* Allocate 4 shadow L2s */
2982 page = alloc_domheap_pages(NULL, SL2_ORDER, 0);
2983 if ( !page )
2984 domain_crash_synchronous();
2986 for ( count = 0; count < PAE_L3_PAGETABLE_ENTRIES; count++ )
2988 sl2mfn = page_to_mfn(page+count);
2989 l2 = map_domain_page(sl2mfn);
2990 memset(l2, 0, PAGE_SIZE);
2991 unmap_domain_page(l2);
2992 spl4e[count] = l4e_from_pfn(sl2mfn, _PAGE_PRESENT);
2995 unmap_domain_page(spl4e);
2997 return smfn;
2999 #endif
3001 #if CONFIG_PAGING_LEVELS == 3
3002 static unsigned long shadow_l3_table(
3003 struct vcpu *v, unsigned long gpfn, unsigned long gmfn)
3005 unsigned long smfn;
3006 l3_pgentry_t *spl3e;
3007 struct domain *d = v->domain;
3009 perfc_incrc(shadow_l3_table_count);
3011 SH_VVLOG("shadow_l3_table(gpfn=%lx, gmfn=%lx)", gpfn, gmfn);
3013 if ( SH_L1_HAS_NEXT_PAGE &&
3014 d->arch.ops->guest_paging_levels == PAGING_L2 )
3016 return init_bl2(d, gpfn, gmfn);
3019 if ( unlikely(!(smfn = alloc_shadow_page(d, gpfn, gmfn, PGT_l3_shadow))) )
3021 printk("Couldn't alloc an L3 shadow for pfn=%lx mfn=%lx\n", gpfn, gmfn);
3022 BUG(); /* XXX Deal gracefully with failure. */
3025 spl3e = (l3_pgentry_t *)map_domain_page(smfn);
3027 /* Make the self entry */
3028 spl3e[PAE_SHADOW_SELF_ENTRY] = l3e_from_pfn(smfn, __PAGE_HYPERVISOR);
3030 if ( (PGT_base_page_table == PGT_l3_page_table) &&
3031 !shadow_mode_external(d) ) {
3032 int i;
3033 unsigned long g2mfn, s2mfn;
3034 l2_pgentry_t *spl2e;
3035 l3_pgentry_t *gpl3e;
3037 /* Get the top entry */
3038 gpl3e = (l3_pgentry_t *)map_domain_page(gmfn);
3040 if ( !(l3e_get_flags(gpl3e[L3_PAGETABLE_ENTRIES - 1]) & _PAGE_PRESENT) )
3042 BUG();
3045 g2mfn = l3e_get_pfn(gpl3e[L3_PAGETABLE_ENTRIES - 1]);
3047 /* NB. g2mfn should be the same as g2pfn */
3048 if (!(s2mfn = __shadow_status(d, g2mfn, PGT_l2_shadow))) {
3049 if ( unlikely(!(s2mfn =
3050 alloc_shadow_page(d, g2mfn, g2mfn, PGT_l2_shadow))) ) {
3051 printk("Couldn't alloc an L2 shadow for pfn=%lx mfn=%lx\n",
3052 g2mfn, g2mfn);
3053 BUG(); /* XXX Deal gracefully with failure. */
3057 if (!get_shadow_ref(s2mfn))
3058 BUG();
3060 /* Map shadow L2 into shadow L3 */
3061 spl3e[L3_PAGETABLE_ENTRIES - 1] = l3e_from_pfn(s2mfn, _PAGE_PRESENT);
3062 shadow_update_min_max(smfn, L3_PAGETABLE_ENTRIES -1);
3064 /*
3065 * Xen private mappings: do the same things as
3066 * create_pae_xen_mappings().
3067 */
3068 spl2e = (l2_pgentry_t *)map_domain_page(s2mfn);
3070 /*
3071 * When we free L2 pages, we need to tell if the page contains
3072 * Xen private mappings. Use the va_mask part.
3073 */
3074 mfn_to_page(s2mfn)->u.inuse.type_info |=
3075 (unsigned long) 3 << PGT_score_shift;
3077 memset(spl2e, 0,
3078 (L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1)) * sizeof(l2_pgentry_t));
3080 memcpy(&spl2e[L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1)],
3081 &idle_pg_table_l2[L2_PAGETABLE_FIRST_XEN_SLOT],
3082 L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
3084 for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
3085 spl2e[l2_table_offset(PERDOMAIN_VIRT_START) + i] =
3086 l2e_from_page(
3087 virt_to_page(page_get_owner(mfn_to_page(gmfn))->arch.mm_perdomain_pt) + i,
3088 __PAGE_HYPERVISOR);
3089 for ( i = 0; i < (LINEARPT_MBYTES >> (L2_PAGETABLE_SHIFT - 20)); i++ )
3090 spl2e[l2_table_offset(LINEAR_PT_VIRT_START) + i] =
3091 (l3e_get_flags(gpl3e[i]) & _PAGE_PRESENT) ?
3092 l2e_from_pfn(l3e_get_pfn(gpl3e[i]), __PAGE_HYPERVISOR) :
3093 l2e_empty();
3095 unmap_domain_page(spl2e);
3096 unmap_domain_page(gpl3e);
3098 unmap_domain_page(spl3e);
3100 return smfn;
3102 #endif /* CONFIG_PAGING_LEVELS == 3 */
3104 #if (!defined(GUEST_PGENTRY_32) && !defined(GUEST_32PAE))
3105 static unsigned long gva_to_gpa_pae(unsigned long gva)
3107 BUG();
3108 return 43;
3110 #endif
3112 #if CONFIG_PAGING_LEVELS == 4
3113 static unsigned long shadow_l4_table(
3114 struct vcpu *v, unsigned long gpfn, unsigned long gmfn)
3116 unsigned long smfn;
3117 l4_pgentry_t *spl4e;
3118 struct domain *d = v->domain;
3120 SH_VVLOG("shadow_l4_table(gpfn=%lx, gmfn=%lx)", gpfn, gmfn);
3122 perfc_incrc(shadow_l4_table_count);
3124 if ( d->arch.ops->guest_paging_levels == PAGING_L2 )
3126 return init_bl2(d, gpfn, gmfn);
3129 if ( unlikely(!(smfn = alloc_shadow_page(d, gpfn, gmfn, PGT_l4_shadow))) )
3131 printk("Couldn't alloc an L4 shadow for pfn=%lx mfn=%lx\n", gpfn, gmfn);
3132 BUG(); /* XXX Deal gracefully with failure. */
3135 spl4e = (l4_pgentry_t *)map_domain_page(smfn);
3137 /* For 32-bit PAE guest on 64-bit host */
3138 if ( SH_GUEST_32PAE && d->arch.ops->guest_paging_levels == PAGING_L3 )
3140 unsigned long index;
3141 /*
3142 * Shadow L4's pfn_info->tlbflush_timestamp
3143 * should also save its own index.
3144 */
3145 index = get_cr3_idxval(v);
3146 frame_table[smfn].tlbflush_timestamp = index;
3148 memset(spl4e, 0, L4_PAGETABLE_ENTRIES*sizeof(l4_pgentry_t));
3149 /* Map the self entry */
3150 spl4e[PAE_SHADOW_SELF_ENTRY] = l4e_from_pfn(smfn, __PAGE_HYPERVISOR);
3151 unmap_domain_page(spl4e);
3152 return smfn;
3155 /* Install hypervisor and 4x linear p.t. mappings. */
3156 if ( (PGT_base_page_table == PGT_l4_page_table) &&
3157 !shadow_mode_external(d) )
3159 /*
3160 * We could proactively fill in PDEs for pages that are already
3161 * shadowed *and* where the guest PDE has _PAGE_ACCESSED set
3162 * (restriction required for coherence of the accessed bit). However,
3163 * we tried it and it didn't help performance. This is simpler.
3164 */
3165 memset(spl4e, 0, L4_PAGETABLE_ENTRIES*sizeof(l4_pgentry_t));
3167 /* Install hypervisor and 2x linear p.t. mappings. */
3168 memcpy(&spl4e[ROOT_PAGETABLE_FIRST_XEN_SLOT],
3169 &idle_pg_table[ROOT_PAGETABLE_FIRST_XEN_SLOT],
3170 ROOT_PAGETABLE_XEN_SLOTS * sizeof(l4_pgentry_t));
3172 spl4e[l4_table_offset(PERDOMAIN_VIRT_START)] =
3173 l4e_from_paddr(__pa(page_get_owner(mfn_to_page(gmfn))->arch.mm_perdomain_l3),
3174 __PAGE_HYPERVISOR);
3176 if ( shadow_mode_translate(d) ) // NB: not external
3178 spl4e[l4_table_offset(RO_MPT_VIRT_START)] =
3179 l4e_from_paddr(pagetable_get_paddr(d->arch.phys_table),
3180 __PAGE_HYPERVISOR);
3182 else
3183 spl4e[l4_table_offset(LINEAR_PT_VIRT_START)] =
3184 l4e_from_pfn(gmfn, __PAGE_HYPERVISOR);
3186 } else
3187 memset(spl4e, 0, L4_PAGETABLE_ENTRIES*sizeof(l4_pgentry_t));
3189 unmap_domain_page(spl4e);
3191 ESH_LOG("shadow_l4_table(%lx -> %lx)", gmfn, smfn);
3192 return smfn;
3194 #endif /* CONFIG_PAGING_LEVELS == 4 */
3196 #if CONFIG_PAGING_LEVELS >= 3
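/*
 * update_top_level_shadow() re-validates the four guest PAE L3 entries
 * selected by get_cr3_idxval() into the top-level shadow page at smfn.
 */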
3197 static void
3198 update_top_level_shadow(struct vcpu *v, unsigned long smfn)
3200 unsigned long index = get_cr3_idxval(v);
3201 pgentry_64_t *sple = (pgentry_64_t *)map_domain_page(smfn);
3202 pgentry_64_t *gple = (pgentry_64_t *)&v->arch.guest_vtable;
3203 int i;
3205 for ( i = 0; i < PAE_L3_PAGETABLE_ENTRIES; i++ )
3206 validate_entry_change(
3207 v->domain, &gple[index*4+i], &sple[i], PAGING_L3);
3209 unmap_domain_page(sple);
3212 /*
3213 * validate_bl2e_change()
3214 * This code is for a 32-bit HVM guest on a 64-bit host;
3215 * it propagates a changed guest L2 entry into the shadow.
3216 */
3218 static inline void
3219 validate_bl2e_change(
3220 struct domain *d,
3221 guest_root_pgentry_t *new_gle_p,
3222 pgentry_64_t *shadow_l3,
3223 int index)
3225 int sl3_idx, sl2_idx;
3226 unsigned long sl2mfn, sl1mfn;
3227 pgentry_64_t *sl2_p;
3229 /* Use the guest L2 PTE index to derive the shadow L3 and L2 indices.
3230 * The guest index ranges over 0..1023, while PAGETABLE_ENTRIES is 512.
3231 */
3232 sl3_idx = index / (PAGETABLE_ENTRIES / 2);
3233 sl2_idx = (index % (PAGETABLE_ENTRIES / 2)) * 2;
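    /*
     * For example, with PAGETABLE_ENTRIES == 512, guest index 700 gives
     * sl3_idx = 700 / 256 = 2 and sl2_idx = (700 % 256) * 2 = 376.  A
     * guest L2 entry covers 4MB of address space while a shadow L2 entry
     * covers 2MB, hence the factor of two (see sl2_p[sl2_idx + 1] below).
     */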
3235 sl2mfn = entry_get_pfn(shadow_l3[sl3_idx]);
3236 sl2_p = (pgentry_64_t *)map_domain_page(sl2mfn);
3238 validate_pde_change(
3239 d, *(guest_l2_pgentry_t *)new_gle_p, (l2_pgentry_t *)&sl2_p[sl2_idx]);
3241 /* Mapping the second l1 shadow page */
3242 if (entry_get_flags(sl2_p[sl2_idx]) & _PAGE_PRESENT) {
3243 sl1mfn = entry_get_pfn(sl2_p[sl2_idx]);
3244 sl2_p[sl2_idx + 1] =
3245 entry_from_pfn(sl1mfn + 1, entry_get_flags(sl2_p[sl2_idx]));
3247 else
3248 sl2_p[sl2_idx + 1] = (pgentry_64_t){0};
3249 unmap_domain_page(sl2_p);
3253 /*
3254 * This variant of shadow_mark_va_out_of_sync() is for 2MB page shadows.
3255 */
3256 static void shadow_mark_va_out_of_sync_2mp(
3257 struct vcpu *v, unsigned long gpfn, unsigned long mfn, unsigned long writable_pl1e)
3259 struct out_of_sync_entry *entry =
3260 shadow_mark_mfn_out_of_sync(v, gpfn, mfn);
3262 entry->writable_pl1e = writable_pl1e;
3263 ESH_LOG("<shadow_mark_va_out_of_sync_2mp> gpfn = %lx\n", gpfn);
3264 if ( !get_shadow_ref(writable_pl1e >> L1_PAGETABLE_SHIFT) )
3265 BUG();
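/*
 * get_shadow_mfn() looks up (or, if necessary, allocates) the shadow
 * page of type 'flag' for gpfn.  It returns 1 if a new shadow page was
 * allocated and 0 if the shadow already existed; *spmfn receives the
 * shadow mfn in either case.
 */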
3268 static int get_shadow_mfn(struct domain *d, unsigned long gpfn, unsigned long *spmfn, u32 flag)
3270 unsigned long gmfn;
3271 if ( !(*spmfn = __shadow_status(d, gpfn, flag)) )
3273 /* This is NOT already shadowed so we need to shadow it. */
3274 SH_VVLOG("<get_shadow_mfn>: not shadowed");
3276 gmfn = gmfn_to_mfn(d, gpfn);
3277 if ( unlikely(!VALID_MFN(gmfn)) )
3279 // Attempt to use an invalid pfn as a shadow page.
3280 // XXX this needs to be more graceful!
3281 BUG();
3284 if ( unlikely(!(*spmfn =
3285 alloc_shadow_page(d, gpfn, gmfn, flag))) )
3287 printk("<get_shadow_mfn>Couldn't alloc a shadow for pfn=%lx mfn=%lx\n", gpfn, gmfn);
3288 BUG(); /* XXX Need to deal gracefully with failure. */
3290 switch(flag) {
3291 case PGT_l1_shadow:
3292 perfc_incrc(shadow_l1_table_count);
3293 break;
3294 case PGT_l2_shadow:
3295 perfc_incrc(shadow_l2_table_count);
3296 break;
3297 case PGT_l3_shadow:
3298 perfc_incrc(shadow_l3_table_count);
3299 break;
3300 case PGT_hl2_shadow:
3301 perfc_incrc(shadow_hl2_table_count);
3302 break;
3305 return 1;
3306 } else {
3307 /* This L1 is shadowed already, but the L2 entry is missing. */
3308 SH_VVLOG("4b: was shadowed, l2 missing (%lx)", *spmfn);
3309 return 0;
3313 static void shadow_map_into_current(struct vcpu *v,
3314 unsigned long va, unsigned int from, unsigned int to)
3316 pgentry_64_t gle = {0}, sle;
3317 unsigned long gpfn, smfn;
3319 if (from == PAGING_L1 && to == PAGING_L2) {
3320 shadow_map_l1_into_current_l2(va);
3321 return;
3324 __rw_entry(v, va, &gle, GUEST_ENTRY | GET_ENTRY | to);
3325 ASSERT(entry_get_flags(gle) & _PAGE_PRESENT);
3326 gpfn = entry_get_pfn(gle);
3328 get_shadow_mfn(v->domain, gpfn, &smfn, shadow_level_to_type(from));
3330 if ( !get_shadow_ref(smfn) )
3331 BUG();
3332 entry_general(v->domain, &gle, &sle, smfn, to);
3333 __rw_entry(v, va, &gle, GUEST_ENTRY | SET_ENTRY | to);
3334 __rw_entry(v, va, &sle, SHADOW_ENTRY | SET_ENTRY | to);
3337 /*
3338 * shadow_set_lxe should be put in shadow.h
3339 */
3340 static void shadow_set_l2e_64(unsigned long va, l2_pgentry_t sl2e,
3341 int create_l2_shadow, int put_ref_check)
3343 struct vcpu *v = current;
3344 l4_pgentry_t sl4e;
3345 l3_pgentry_t sl3e;
3347 __shadow_get_l4e(v, va, &sl4e);
3348 if (!(l4e_get_flags(sl4e) & _PAGE_PRESENT)) {
3349 if (create_l2_shadow) {
3350 perfc_incrc(shadow_set_l3e_force_map);
3351 shadow_map_into_current(v, va, PAGING_L3, PAGING_L4);
3352 __shadow_get_l4e(v, va, &sl4e);
3353 } else {
3354 printk("For non-HVM shadow, create_l2_shadow:%d\n", create_l2_shadow);
3358 __shadow_get_l3e(v, va, &sl3e);
3359 if (!(l3e_get_flags(sl3e) & _PAGE_PRESENT)) {
3360 if (create_l2_shadow) {
3361 perfc_incrc(shadow_set_l2e_force_map);
3362 shadow_map_into_current(v, va, PAGING_L2, PAGING_L3);
3363 __shadow_get_l3e(v, va, &sl3e);
3364 } else {
3365 printk("For non-HVM shadow, create_l2_shadow:%d\n", create_l2_shadow);
3367 shadow_update_min_max(l4e_get_pfn(sl4e), l3_table_offset(va));
3371 if ( put_ref_check ) {
3372 l2_pgentry_t tmp_sl2e;
3373 if ( __shadow_get_l2e(v, va, &tmp_sl2e) ) {
3374 if ( l2e_get_flags(tmp_sl2e) & _PAGE_PRESENT )
3375 if ( l2e_get_pfn(tmp_sl2e) == l2e_get_pfn(sl2e) ) {
3376 put_shadow_ref(l2e_get_pfn(sl2e));
3382 if (! __shadow_set_l2e(v, va, &sl2e))
3383 BUG();
3384 shadow_update_min_max(l3e_get_pfn(sl3e), l2_table_offset(va));
3388 /* As 32-bit guests don't support 4MB pages yet,
3389 * we don't need to worry about compiling this function twice.
3390 */
3391 static inline int l2e_rw_fault(
3392 struct vcpu *v, l2_pgentry_t *gl2e_p, unsigned long va, int rw)
3394 struct domain *d = v->domain;
3395 l2_pgentry_t gl2e = *gl2e_p;
3396 l2_pgentry_t tmp_l2e = gl2e;
3397 unsigned long start_gpfn = l2e_get_pfn(gl2e);
3398 unsigned long gpfn, mfn;
3399 unsigned long l1_mfn, gmfn;
3400 l1_pgentry_t *l1_p;
3401 l1_pgentry_t sl1e;
3402 l1_pgentry_t old_sl1e;
3403 l2_pgentry_t sl2e;
3404 u64 nx = 0;
3405 int put_ref_check = 0;
3406 /* Check if gpfn is 2M aligned */
3408 /* Update guest l2e */
3409 if (rw) {
3410 ASSERT(l2e_get_flags(gl2e) & _PAGE_RW);
3411 l2e_add_flags(gl2e, _PAGE_DIRTY | _PAGE_ACCESSED);
3412 } else {
3413 l2e_add_flags(gl2e, _PAGE_ACCESSED);
3416 l2e_remove_flags(tmp_l2e, _PAGE_PSE);
3417 if (l2e_get_flags(gl2e) & _PAGE_NX) {
3418 l2e_remove_flags(tmp_l2e, _PAGE_NX);
3419 nx = 1ULL << 63;
3423 /* Get the shadow l2 first */
3424 if ( !__shadow_get_l2e(v, va, &sl2e) )
3425 sl2e = l2e_empty();
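    /*
     * The guest NX bit, if set, is carried in bit 63 of the status key
     * (start_gpfn | nx), so NX and non-NX 2MB mappings are tracked as
     * separate PGT_fl1_shadow pages.
     */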
3427 l1_mfn = __shadow_status(d, start_gpfn | nx, PGT_fl1_shadow);
3429 /* Check the corresponding l2e */
3430 if (l1_mfn) {
3431 /* Why is it PRESENT? */
3432 if ((l2e_get_flags(sl2e) & _PAGE_PRESENT) &&
3433 l2e_get_pfn(sl2e) == l1_mfn) {
3434 ESH_LOG("sl2e PRESENT bit is set: %lx, l1_mfn = %lx\n", l2e_get_pfn(sl2e), l1_mfn);
3435 } else {
3436 put_ref_check = 1;
3437 if (!get_shadow_ref(l1_mfn))
3438 BUG();
3440 l1_p = (l1_pgentry_t *)map_domain_page(l1_mfn);
3441 sl2e = l2e_from_pfn(l1_mfn, l2e_get_flags(tmp_l2e));
3442 } else {
3443 /* Allocate a new page as the shadow page table if needed */
3444 gmfn = gmfn_to_mfn(d, start_gpfn);
3445 l1_mfn = alloc_shadow_page(d, start_gpfn | nx, gmfn, PGT_fl1_shadow);
3446 if (unlikely(!l1_mfn)) {
3447 BUG();
3450 if (!get_shadow_ref(l1_mfn))
3451 BUG();
3452 l1_p = (l1_pgentry_t *)map_domain_page(l1_mfn);
3453 sl2e = l2e_from_pfn(l1_mfn, l2e_get_flags(tmp_l2e));
3454 memset(l1_p, 0, PAGE_SIZE);
3455 ESH_LOG("Alloc a shadow page: %lx\n", l1_mfn);
3458 ESH_LOG("<%s>: sl2e = %lx\n", __func__, l2e_get_intpte(sl2e));
3459 /* Map the page to l2*/
3460 shadow_set_l2e_64(va, sl2e, 1, put_ref_check);
3462 if (l2e_get_flags(gl2e) & _PAGE_NX)
3463 l2e_add_flags(tmp_l2e, _PAGE_NX);
3465 /* Propagate the shadow page table, i.e. setting sl1e */
3466 for (gpfn = start_gpfn;
3467 gpfn < (start_gpfn + L1_PAGETABLE_ENTRIES); gpfn++) {
3469 mfn = gmfn_to_mfn(d, gpfn);
3471 if ( unlikely(!VALID_MFN(mfn)) )
3473 continue;
3476 sl1e = l1e_from_pfn(mfn, l2e_get_flags(tmp_l2e));
3478 if (!rw) {
3479 if ( shadow_mode_log_dirty(d) ||
3480 !(l2e_get_flags(gl2e) & _PAGE_DIRTY) || mfn_is_page_table(mfn) )
3482 l1e_remove_flags(sl1e, _PAGE_RW);
3484 } else {
3485 /* __mark_dirty(d, gmfn); */
3487 // printk("<%s> gpfn: %lx, mfn: %lx, sl1e: %lx\n", __func__, gpfn, mfn, l1e_get_intpte(sl1e));
3488 /* The shadow entries need to be set up before shadow_mark_va_out_of_sync() */
3489 old_sl1e = l1_p[gpfn - start_gpfn];
3491 if ( l1e_has_changed(old_sl1e, sl1e, _PAGE_RW | _PAGE_PRESENT) )
3493 if ( (l1e_get_flags(sl1e) & _PAGE_PRESENT) &&
3494 !shadow_get_page_from_l1e(sl1e, d) ) {
3495 ESH_LOG("%lx, mfn: %lx why make me empty, start_pfn: %lx, gpfn: %lx\n", l1e_get_intpte(sl1e),mfn, start_gpfn, gpfn);
3496 sl1e = l1e_empty();
3498 if ( l1e_get_flags(old_sl1e) & _PAGE_PRESENT )
3499 put_page_from_l1e(old_sl1e, d);
3502 l1_p[gpfn - start_gpfn] = sl1e;
3504 if (rw) {
3505 /* shadow_mark_va_out_of_sync() needs modification for 2MB pages */
3506 if ( mfn_is_page_table(mfn) )
3507 shadow_mark_va_out_of_sync_2mp(v, gpfn, mfn,
3508 l2e_get_paddr(sl2e) | (sizeof(l1_pgentry_t) * (gpfn - start_gpfn)));
3512 unmap_domain_page(l1_p);
3513 return 1;
3517 /*
3518 * Check P, R/W, U/S bits in the guest page table.
3519 * If the fault belongs to the guest, return 1;
3520 * otherwise return 0.
3521 */
3522 #if defined( GUEST_PGENTRY_32 )
3523 static inline int guest_page_fault(
3524 struct vcpu *v,
3525 unsigned long va, unsigned int error_code,
3526 guest_l2_pgentry_t *gpl2e, guest_l1_pgentry_t *gpl1e)
3528 /* The following checks are for a 32-bit guest on a 64-bit host */
3530 __guest_get_l2e(v, va, gpl2e);
3532 /* Check the guest L2 page-table entry first*/
3533 if ( unlikely(!(guest_l2e_get_flags(*gpl2e) & _PAGE_PRESENT)) )
3534 return 1;
3536 if ( error_code & ERROR_W )
3538 if ( unlikely(!(guest_l2e_get_flags(*gpl2e) & _PAGE_RW)) )
3539 return 1;
3542 if ( error_code & ERROR_U )
3544 if ( unlikely(!(guest_l2e_get_flags(*gpl2e) & _PAGE_USER)) )
3545 return 1;
3548 if ( guest_l2e_get_flags(*gpl2e) & _PAGE_PSE )
3549 return 0;
3551 __guest_get_l1e(v, va, gpl1e);
3553 /* Then check the guest L1 page-table entry */
3554 if ( unlikely(!(guest_l1e_get_flags(*gpl1e) & _PAGE_PRESENT)) )
3555 return 1;
3557 if ( error_code & ERROR_W )
3559 if ( unlikely(!(guest_l1e_get_flags(*gpl1e) & _PAGE_RW)) )
3560 return 1;
3563 if ( error_code & ERROR_U )
3565 if ( unlikely(!(guest_l1e_get_flags(*gpl1e) & _PAGE_USER)) )
3566 return 1;
3569 return 0;
3571 #else
3572 static inline int guest_page_fault(
3573 struct vcpu *v,
3574 unsigned long va, unsigned int error_code,
3575 guest_l2_pgentry_t *gpl2e, guest_l1_pgentry_t *gpl1e)
3577 struct domain *d = v->domain;
3578 pgentry_64_t gle = { 0 };
3579 unsigned long gpfn = 0, mfn;
3580 int i;
3581 unsigned int base_idx = 0;
3582 base_idx = get_cr3_idxval(v);
3584 ASSERT( d->arch.ops->guest_paging_levels >= PAGING_L3 );
3586 #if CONFIG_PAGING_LEVELS == 4
3587 if ( d->arch.ops->guest_paging_levels == PAGING_L4 )
3589 __rw_entry(v, va, &gle, GUEST_ENTRY | GET_ENTRY | PAGING_L4);
3590 if ( unlikely(!(entry_get_flags(gle) & _PAGE_PRESENT)) )
3591 return 1;
3593 if ( error_code & ERROR_W )
3595 if ( unlikely(!(entry_get_flags(gle) & _PAGE_RW)) )
3596 return 1;
3599 if ( error_code & ERROR_U )
3601 if ( unlikely(!(entry_get_flags(gle) & _PAGE_USER)) )
3602 return 1;
3604 gpfn = entry_get_pfn(gle);
3606 #endif
3608 #if CONFIG_PAGING_LEVELS >= 3
3609 if ( d->arch.ops->guest_paging_levels == PAGING_L3 )
3611 if ( SH_GUEST_32PAE )
3612 gpfn = (hvm_get_guest_ctrl_reg(v, 3)) >> PAGE_SHIFT;
3613 else
3614 gpfn = pagetable_get_pfn(v->arch.guest_table);
3616 #endif
3618 for ( i = PAGING_L3; i >= PAGING_L1; i-- )
3620 pgentry_64_t *lva;
3621 /*
3622 * If it's not external mode, then mfn should be machine physical.
3623 */
3624 mfn = gmfn_to_mfn(d, gpfn);
3626 lva = (pgentry_64_t *) map_domain_page(mfn);
3627 gle = lva[guest_table_offset_64(va, i, base_idx)];
3629 unmap_domain_page(lva);
3631 gpfn = entry_get_pfn(gle);
3633 if ( unlikely(!(entry_get_flags(gle) & _PAGE_PRESENT)) )
3634 return 1;
3636 if ( i < PAGING_L3 )
3638 if ( error_code & ERROR_W )
3640 if ( unlikely(!(entry_get_flags(gle) & _PAGE_RW)) )
3642 if ( i == PAGING_L1 )
3643 if ( gpl1e )
3644 gpl1e->l1 = gle.lo;
3645 return 1;
3648 if ( error_code & ERROR_U )
3650 if ( unlikely(!(entry_get_flags(gle) & _PAGE_USER)) )
3651 return 1;
3655 if ( i == PAGING_L2 )
3657 if ( gpl2e )
3658 gpl2e->l2 = gle.lo;
3659 if ( likely(entry_get_flags(gle) & _PAGE_PSE) )
3660 return 0;
3663 if ( i == PAGING_L1 )
3664 if ( gpl1e )
3665 gpl1e->l1 = gle.lo;
3668 return 0;
3671 #endif
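/*
 * shadow_fault_64() handles both 4KB guest mappings and 2MB (PSE) guest
 * mappings; the latter are dealt with on the 'pse' path below via
 * l2e_rw_fault().
 */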
3673 static int shadow_fault_64(unsigned long va, struct cpu_user_regs *regs)
3675 struct vcpu *v = current;
3676 struct domain *d = v->domain;
3677 guest_l2_pgentry_t gl2e;
3678 guest_l1_pgentry_t gl1e, orig_gl1e;
3679 l1_pgentry_t sl1e;
3681 gl1e = guest_l1e_empty(); gl2e = guest_l2e_empty();
3683 sl1e = l1e_empty();
3685 perfc_incrc(shadow_fault_calls);
3687 ESH_LOG("<shadow_fault_64> va=%lx, rip = %lx, error code = %x\n",
3688 va, regs->eip, regs->error_code);
3690 /*
3691 * Don't let someone else take the guest's table pages out-of-sync.
3692 */
3693 shadow_lock(d);
3695 /*
3696 * STEP 1. Check to see if this fault might have been caused by an
3697 * out-of-sync table page entry, or if we should pass this
3698 * fault onto the guest.
3699 */
3700 __shadow_sync_va(v, va);
3702 /*
3703 * STEP 2. Check if the fault belongs to guest
3704 */
3705 if ( guest_page_fault(v, va, regs->error_code, &gl2e, &gl1e) )
3707 if ( unlikely(shadow_mode_log_dirty(d)) && l1e_get_intpte(gl1e) != 0 )
3708 goto check_writeable;
3710 goto fail;
3713 if ( unlikely((guest_l2e_get_flags(gl2e) & _PAGE_PSE)) )
3714 goto pse;
3716 /*
3717 * Handle 4K pages here
3718 */
3719 check_writeable:
3720 orig_gl1e = gl1e;
3722 /* Write fault? */
3723 if ( regs->error_code & 2 )
3725 int allow_writes = 0;
3727 if ( unlikely(!(guest_l1e_get_flags(gl1e) & _PAGE_RW)) )
3729 if ( shadow_mode_page_writable(va, regs, l1e_get_pfn(gl1e)) )
3731 allow_writes = 1;
3732 l1e_add_flags(gl1e, _PAGE_RW);
3734 else
3736 /* Write fault on a read-only mapping. */
3737 SH_VVLOG("shadow_fault - EXIT: wr fault on RO page (%" PRIpte ")",
3738 l1e_get_intpte(gl1e));
3739 perfc_incrc(shadow_fault_bail_ro_mapping);
3740 goto fail;
3744 if ( !l1pte_write_fault(v, &gl1e, &sl1e, va) )
3746 SH_VVLOG("shadow_fault - EXIT: l1pte_write_fault failed");
3747 perfc_incrc(write_fault_bail);
3748 shadow_unlock(d);
3749 return 0;
3752 if (allow_writes)
3753 l1e_remove_flags(gl1e, _PAGE_RW);
3755 else
3757 if ( !l1pte_read_fault(d, &gl1e, &sl1e) )
3759 SH_VVLOG("shadow_fault - EXIT: l1pte_read_fault failed");
3760 perfc_incrc(read_fault_bail);
3761 shadow_unlock(d);
3762 return 0;
3766 /*
3767 * STEP 3. Write the modified shadow PTE and guest PTE back to the tables
3768 */
3769 if ( l1e_has_changed(orig_gl1e, gl1e, PAGE_FLAG_MASK) )
3771 if (unlikely(!__guest_set_l1e(v, va, &gl1e)))
3772 domain_crash_synchronous();
3774 __mark_dirty(d, gmfn_to_mfn(d, l2e_get_pfn(gl2e)));
3777 shadow_set_l1e_64(va, (pgentry_64_t *)&sl1e, 1);
3779 perfc_incrc(shadow_fault_fixed);
3780 d->arch.shadow_fault_count++;
3782 shadow_unlock(d);
3784 return EXCRET_fault_fixed;
3786 pse:
3787 /*
3788 * Handle 2M pages here
3789 */
3790 if ( unlikely(!shadow_mode_external(d)) )
3791 BUG();
3793 /* Write fault? */
3794 if ( regs->error_code & 2 )
3796 if ( !l2e_rw_fault(v, (l2_pgentry_t *)&gl2e, va, WRITE_FAULT) )
3798 goto fail;
3801 else
3803 l2e_rw_fault(v, (l2_pgentry_t *)&gl2e, va, READ_FAULT);
3806 /*
3807 * STEP 3. Write guest/shadow l2e back
3808 */
3810 if ( unlikely(!__guest_set_l2e(v, va, &gl2e)) )
3812 domain_crash_synchronous();
3815 /*
3816 * Todo: if necessary, record the page table page as dirty
3817 */
3819 perfc_incrc(shadow_fault_fixed);
3820 d->arch.shadow_fault_count++;
3822 shadow_unlock(d);
3824 return EXCRET_fault_fixed;
3825 fail:
3826 shadow_unlock(d);
3827 ESH_LOG("Guest fault~~~\n");
3828 return 0;
3831 static void shadow_invlpg_64(struct vcpu *v, unsigned long va)
3833 struct domain *d = v->domain;
3834 l1_pgentry_t sl1e, old_sl1e;
3836 shadow_lock(d);
3838 __shadow_sync_va(v, va);
3840 if ( shadow_mode_external(d) && __shadow_get_l1e(v, va, &old_sl1e) )
3841 if ( l1e_get_flags(old_sl1e) & _PAGE_PRESENT )
3842 put_page_from_l1e(old_sl1e, d);
3844 sl1e = l1e_empty();
3845 __shadow_set_l1e(v, va, &sl1e);
3847 shadow_unlock(d);
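/*
 * gva_to_gpa_64() walks the guest page tables (via guest_page_fault()
 * with a zero error code) and returns the guest physical address for
 * gva, handling 2MB PSE mappings; it returns 0 if the walk fails.
 */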
3850 static unsigned long gva_to_gpa_64(unsigned long gva)
3852 struct vcpu *v = current;
3853 guest_l1_pgentry_t gl1e = {0};
3854 guest_l2_pgentry_t gl2e = {0};
3855 unsigned long gpa;
3857 if (guest_page_fault(v, gva, 0, &gl2e, &gl1e))
3858 return 0;
3860 if (guest_l2e_get_flags(gl2e) & _PAGE_PSE)
3861 gpa = guest_l2e_get_paddr(gl2e) + (gva & ((1 << GUEST_L2_PAGETABLE_SHIFT) - 1));
3862 else
3863 gpa = guest_l1e_get_paddr(gl1e) + (gva & ~PAGE_MASK);
3865 return gpa;
3868 /*
3869 * The naming convention of the shadow_ops:
3870 * MODE_<pgentry size>_<guest paging levels>_HANDLER
3871 */
3872 #if (!defined(GUEST_PGENTRY_32) && !defined(GUEST_32PAE))
3873 struct shadow_ops MODE_64_3_HANDLER = {
3874 .guest_paging_levels = 3,
3875 .invlpg = shadow_invlpg_64,
3876 .fault = shadow_fault_64,
3877 .update_pagetables = shadow_update_pagetables,
3878 .sync_all = sync_all,
3879 .remove_all_write_access = remove_all_write_access,
3880 .do_update_va_mapping = do_update_va_mapping,
3881 .mark_mfn_out_of_sync = mark_mfn_out_of_sync,
3882 .is_out_of_sync = is_out_of_sync,
3883 .gva_to_gpa = gva_to_gpa_pae,
3884 };
3886 struct shadow_ops MODE_64_4_HANDLER = {
3887 .guest_paging_levels = 4,
3888 .invlpg = shadow_invlpg_64,
3889 .fault = shadow_fault_64,
3890 .update_pagetables = shadow_update_pagetables,
3891 .sync_all = sync_all,
3892 .remove_all_write_access = remove_all_write_access,
3893 .do_update_va_mapping = do_update_va_mapping,
3894 .mark_mfn_out_of_sync = mark_mfn_out_of_sync,
3895 .is_out_of_sync = is_out_of_sync,
3896 .gva_to_gpa = gva_to_gpa_64,
3897 };
3898 #endif /* GUEST_PGENTRY_32 */
3899 #endif /* CONFIG_PAGING_LEVELS >= 3 */
3902 #if CONFIG_PAGING_LEVELS == 2
3903 struct shadow_ops MODE_32_2_HANDLER = {
3904 .guest_paging_levels = 2,
3905 .invlpg = shadow_invlpg_32,
3906 .fault = shadow_fault_32,
3907 .update_pagetables = shadow_update_pagetables,
3908 .sync_all = sync_all,
3909 .remove_all_write_access = remove_all_write_access,
3910 .do_update_va_mapping = do_update_va_mapping,
3911 .mark_mfn_out_of_sync = mark_mfn_out_of_sync,
3912 .is_out_of_sync = is_out_of_sync,
3913 .gva_to_gpa = gva_to_gpa_64,
3914 };
3915 #endif
3917 #if ( CONFIG_PAGING_LEVELS == 3 && !defined (GUEST_PGENTRY_32) ) || \
3918 ( CONFIG_PAGING_LEVELS == 4 && defined (GUEST_PGENTRY_32) )
3921 /*
3922 * Use GUEST_PGENTRY_32 to force PAE_SHADOW_SELF_ENTRY for L4.
3924 * Very simple shadow code to handle 1:1 direct mapping for guest
3925 * non-paging code, which actually runs in PAE/vm86 mode with
3926 * paging enabled.
3928 * We expect that the top level (L3) page has been allocated and initialized.
3929 */
3930 int shadow_direct_map_fault(unsigned long vpa, struct cpu_user_regs *regs)
3932 struct vcpu *v = current;
3933 struct domain *d = v->domain;
3934 l3_pgentry_t sl3e, *sl3e_p;
3935 l2_pgentry_t sl2e, *sl2e_p;
3936 l1_pgentry_t sl1e;
3937 unsigned long mfn, smfn;
3938 struct page_info *page;
3940 /*
3941 * If the faulting address is within the MMIO range, we continue
3942 * on handling the #PF as such.
3943 */
3944 if ( (mfn = get_mfn_from_gpfn(vpa >> PAGE_SHIFT)) == INVALID_MFN )
3945 return 0;
3947 shadow_lock(d);
3949 __direct_get_l3e(v, vpa, &sl3e);
3951 if ( !(l3e_get_flags(sl3e) & _PAGE_PRESENT) )
3953 page = alloc_domheap_page(NULL);
3954 if ( !page )
3955 goto nomem;
3957 smfn = page_to_mfn(page);
3958 sl3e = l3e_from_pfn(smfn, _PAGE_PRESENT);
3960 sl3e_p = (l3_pgentry_t *)map_domain_page(smfn);
3961 memset(sl3e_p, 0, PAGE_SIZE);
3962 unmap_domain_page(sl3e_p);
3964 __direct_set_l3e(v, vpa, &sl3e);
3967 __direct_get_l2e(v, vpa, &sl2e);
3969 if ( !(l2e_get_flags(sl2e) & _PAGE_PRESENT) )
3971 page = alloc_domheap_page(NULL);
3972 if ( !page )
3973 goto nomem;
3975 smfn = page_to_mfn(page);
3976 sl2e = l2e_from_pfn(smfn, __PAGE_HYPERVISOR | _PAGE_USER);
3977 sl2e_p = (l2_pgentry_t *)map_domain_page(smfn);
3978 memset(sl2e_p, 0, PAGE_SIZE);
3979 unmap_domain_page(sl2e_p);
3981 __direct_set_l2e(v, vpa, &sl2e);
3984 __direct_get_l1e(v, vpa, &sl1e);
3986 if ( !(l1e_get_flags(sl1e) & _PAGE_PRESENT) )
3988 sl1e = l1e_from_pfn(mfn, __PAGE_HYPERVISOR | _PAGE_USER);
3989 __direct_set_l1e(v, vpa, &sl1e);
3992 shadow_unlock(d);
3993 return EXCRET_fault_fixed;
3995 nomem:
3996 shadow_direct_map_clean(d);
3997 domain_crash_synchronous();
3999 #endif
4001 /*
4002 * Local variables:
4003 * mode: C
4004 * c-set-style: "BSD"
4005 * c-basic-offset: 4
4006 * tab-width: 4
4007 * indent-tabs-mode: nil
4008 * End:
4009 */