ia64/xen-unstable

xen/arch/x86/shadow.c @ 9458:4840c3da2521

Allow 64-bit Xen to run 64-bit hvm SMP guests.

Signed-off-by: Jun Nakajima <jun.nakajima@intel.com>
author kaf24@firebug.cl.cam.ac.uk
date Sun Mar 26 11:45:35 2006 +0100 (2006-03-26)
parents f4cef1aa2521
children 0267063e050c
1 /******************************************************************************
2 * arch/x86/shadow.c
3 *
4 * Copyright (c) 2005 Michael A Fetterman
5 * Based on an earlier implementation by Ian Pratt et al
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
21 /*
22 * Jun Nakajima <jun.nakajima@intel.com>
23 * Chengyuan Li <chengyuan.li@intel.com>
24 *
25 * Extended to support 32-bit PAE and 64-bit guests.
26 */
28 #include <xen/config.h>
29 #include <xen/types.h>
30 #include <xen/mm.h>
31 #include <xen/domain_page.h>
32 #include <asm/shadow.h>
33 #include <asm/page.h>
34 #include <xen/event.h>
35 #include <xen/sched.h>
36 #include <xen/trace.h>
37 #include <asm/shadow_64.h>
39 /* Use this to have the compiler remove unnecessary branches */
40 #define SH_L1_HAS_NEXT_PAGE (GUEST_L1_PAGETABLE_ENTRIES - L1_PAGETABLE_ENTRIES)
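/*
 * NB: SH_L1_HAS_NEXT_PAGE is non-zero only when a guest L1 spans more
 * entries than a host L1 (a 32-bit, 2-level HVM guest on a PAE or 64-bit
 * host), in which case each guest L1 is shadowed by two contiguous shadow
 * pages.  Where the sizes match it evaluates to 0, so tests of the form
 * "if ( SH_L1_HAS_NEXT_PAGE && ... )" are compiled away entirely.
 */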
42 extern void free_shadow_pages(struct domain *d);
44 #if 0 // this code has not been updated for 32pae & 64 bit modes
45 #if SHADOW_DEBUG
46 static void mark_shadows_as_reflecting_snapshot(struct domain *d, unsigned long gpfn);
47 #endif
48 #endif
50 #if CONFIG_PAGING_LEVELS == 3
51 static unsigned long shadow_l3_table(
52 struct vcpu *v, unsigned long gpfn, unsigned long gmfn);
53 #endif
55 #if CONFIG_PAGING_LEVELS == 4
56 static unsigned long shadow_l4_table(
57 struct vcpu *v, unsigned long gpfn, unsigned long gmfn);
58 #endif
60 #if CONFIG_PAGING_LEVELS >= 3
61 static void shadow_map_into_current(struct vcpu *v,
62 unsigned long va, unsigned int from, unsigned int to);
63 static inline void validate_bl2e_change( struct domain *d,
64 guest_root_pgentry_t *new_gle_p, pgentry_64_t *shadow_l3, int index);
65 static void update_top_level_shadow(struct vcpu *v, unsigned long smfn);
66 #endif
68 /********
70 There's a per-domain shadow table spin lock which works fine for SMP
71 hosts. We don't have to worry about interrupts as no shadow operations
72 happen in an interrupt context. It's probably not quite ready for SMP
73 guest operation as we have to worry about synchonisation between gpte
74 and spte updates. Its possible that this might only happen in a
75 hypercall context, in which case we'll probably at have a per-domain
76 hypercall lock anyhow (at least initially).
78 ********/
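/*
 * Typical locking pattern for the handlers below (see shadow_invlpg_32()
 * and shadow_fault_32()): take the per-domain lock, bring the relevant
 * mappings back in sync, update the shadow, then release the lock:
 *
 *     shadow_lock(d);
 *     __shadow_sync_va(v, va);
 *     ... update shadow entries ...
 *     shadow_unlock(d);
 */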
80 static inline int
81 shadow_promote(struct domain *d, unsigned long gpfn, unsigned long gmfn,
82 unsigned long new_type)
83 {
84 struct page_info *page = mfn_to_page(gmfn);
85 int pinned = 0, okay = 1;
87 if ( page_out_of_sync(page) )
88 {
89 // Don't know how long ago this snapshot was taken.
90 // Can't trust it to be recent enough.
91 //
92 __shadow_sync_mfn(d, gmfn);
93 }
95 if ( !shadow_mode_refcounts(d) )
96 return 1;
98 if ( unlikely(page_is_page_table(page)) )
99 return 1;
101 FSH_LOG("%s: gpfn=%lx gmfn=%lx nt=%08lx", __func__, gpfn, gmfn, new_type);
103 if ( !shadow_remove_all_write_access(d, gpfn, gmfn) )
104 {
105 FSH_LOG("%s: couldn't find/remove all write accesses, gpfn=%lx gmfn=%lx",
106 __func__, gpfn, gmfn);
107 #if 1 || defined(LIVE_DANGEROUSLY)
108 set_bit(_PGC_page_table, &page->count_info);
109 return 1;
110 #endif
111 return 0;
112 }
114 // To convert this page for use as a page table, the writable count
115 // should now be zero. Test this by grabbing the page as a page table,
116 // and then immediately releasing. This will also deal with any
117 // necessary TLB flushing issues for us.
118 //
119 // The cruft here about pinning doesn't really work right. This
120 // needs rethinking/rewriting... Need to gracefully deal with the
121 // TLB flushes required when promoting a writable page, and also deal
122 // with any outstanding (external) writable refs to this page (by
123 // refusing to promote it). The pinning headache complicates this
124 // code -- it would all get much simpler if we stop using
125 // shadow_lock() and move the shadow code to BIGLOCK().
126 //
127 if ( unlikely(!get_page(page, d)) )
128 BUG(); // XXX -- needs more thought for a graceful failure
129 if ( unlikely(test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info)) )
130 {
131 pinned = 1;
132 put_page_and_type(page);
133 }
134 if ( get_page_type(page, PGT_base_page_table) )
135 {
136 set_bit(_PGC_page_table, &page->count_info);
137 put_page_type(page);
138 }
139 else
140 {
141 printk("shadow_promote: get_page_type failed "
142 "dom%d gpfn=%lx gmfn=%lx t=%08lx\n",
143 d->domain_id, gpfn, gmfn, new_type);
144 okay = 0;
145 }
147 // Now put the type back to writable...
148 if ( unlikely(!get_page_type(page, PGT_writable_page)) )
149 BUG(); // XXX -- needs more thought for a graceful failure
150 if ( unlikely(pinned) )
151 {
152 if ( unlikely(test_and_set_bit(_PGT_pinned,
153 &page->u.inuse.type_info)) )
154 BUG(); // hmm... someone pinned this again?
155 }
156 else
157 put_page_and_type(page);
159 return okay;
160 }
163 /*
164 * Things in shadow mode that collect get_page() refs to the domain's
165 * pages are:
166 * - PGC_allocated takes a gen count, just like normal.
167 * - A writable page can be pinned (paravirtualized guests may consider
168 * these pages to be L1s or L2s, and don't know the difference).
169 * Pinning a page takes a gen count (but, for domains in shadow mode,
170 * it *doesn't* take a type count)
171 * - CR3 grabs a ref to whatever it points at, just like normal.
172 * - Shadow mode grabs an initial gen count for itself, as a placeholder
173 * for whatever references will exist.
174 * - Shadow PTEs that point to a page take a gen count, just like regular
175 * PTEs. However, they don't get a type count, as get_page_type() is
176 * hardwired to keep writable pages' counts at 1 for domains in shadow
177 * mode.
178 * - Whenever we shadow a page, the entry in the shadow hash grabs a
179 * general ref to the page.
180 * - Whenever a page goes out of sync, the out of sync entry grabs a
181 * general ref to the page.
182 */
183 /*
184 * page_info fields for pages allocated as shadow pages:
185 *
186 * All 32 bits of count_info are a simple count of refs to this shadow
187 * from a) other shadow pages, b) current CR3's (aka ed->arch.shadow_table),
188 * c) the pin reference, if it's a pinned shadow root pgtable, and d) outstanding out-of-sync
189 * references.
190 *
191 * u.inuse._domain is left NULL, to prevent accidentally allowing some random
192 * domain from gaining permissions to map this page.
193 *
194 * u.inuse.type_info & PGT_type_mask remembers what kind of page is being
195 * shadowed.
196 * u.inuse.type_info & PGT_mfn_mask holds the mfn of the page being shadowed.
197 * u.inuse.type_info & PGT_pinned says that an extra reference to this shadow
198 * currently exists because this is a shadow of a root page, and we
199 * don't want to let those disappear just because no CR3 is currently pointing
200 * at it.
201 *
202 * tlbflush_timestamp holds a min & max index of valid page table entries
203 * within the shadow page.
204 */
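/*
 * Example of the encoding described above: an L1 shadow of guest frame
 * gmfn is initialised by shadow_page_info_init() below with
 * type_info = PGT_l1_shadow | gmfn, so the shadowed mfn can later be
 * recovered as (type_info & PGT_mfn_mask).
 */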
205 static inline void
206 shadow_page_info_init(struct page_info *page,
207 unsigned long gmfn,
208 u32 psh_type)
209 {
210 ASSERT( (gmfn & ~PGT_mfn_mask) == 0 );
211 page->u.inuse.type_info = psh_type | gmfn;
212 page->count_info = 0;
213 page->tlbflush_timestamp = 0;
214 }
216 static inline unsigned long
217 alloc_shadow_page(struct domain *d,
218 unsigned long gpfn, unsigned long gmfn,
219 u32 psh_type)
220 {
221 struct page_info *page;
222 unsigned long smfn, real_gpfn;
223 int pin = 0;
224 void *l1, *lp;
226 // Currently, we only keep pre-zero'ed pages around for use as L1's...
227 // This will change. Soon.
228 //
229 if ( psh_type == PGT_l1_shadow )
230 {
231 if ( !list_empty(&d->arch.free_shadow_frames) )
232 {
233 struct list_head *entry = d->arch.free_shadow_frames.next;
234 page = list_entry(entry, struct page_info, list);
235 list_del(entry);
236 perfc_decr(free_l1_pages);
237 }
238 else
239 {
240 if ( SH_L1_HAS_NEXT_PAGE &&
241 d->arch.ops->guest_paging_levels == PAGING_L2)
242 {
243 #if CONFIG_PAGING_LEVELS >= 3
244 /*
245 * For a 32-bit HVM guest, 2 shadow L1s are required to
246 * simulate 1 guest L1, so we need to allocate 2 shadow L1
247 * pages each time.
248 *
249 * --> Need to avoid alloc_domheap_pages.
250 */
251 page = alloc_domheap_pages(NULL, SL1_ORDER, 0);
252 if (!page)
253 goto no_shadow_page;
255 l1 = map_domain_page(page_to_mfn(page));
256 memset(l1, 0, PAGE_SIZE);
257 unmap_domain_page(l1);
259 l1 = map_domain_page(page_to_mfn(page + 1));
260 memset(l1, 0, PAGE_SIZE);
261 unmap_domain_page(l1);
263 /* We'd like to initialize the second contiguous page here
264 * and leave the first page's initialization until later. */
266 shadow_page_info_init(page+1, gmfn, psh_type);
267 #else
268 page = alloc_domheap_page(NULL);
269 if (!page)
270 goto no_shadow_page;
272 l1 = map_domain_page(page_to_mfn(page));
273 memset(l1, 0, PAGE_SIZE);
274 unmap_domain_page(l1);
275 #endif
276 }
277 else
278 {
279 page = alloc_domheap_page(NULL);
280 if (!page)
281 goto no_shadow_page;
283 l1 = map_domain_page(page_to_mfn(page));
284 memset(l1, 0, PAGE_SIZE);
285 unmap_domain_page(l1);
286 }
287 }
288 }
289 else {
290 #if CONFIG_PAGING_LEVELS == 2
291 page = alloc_domheap_page(NULL);
292 #elif CONFIG_PAGING_LEVELS >= 3
293 if ( d->arch.ops->guest_paging_levels == PAGING_L2 &&
294 psh_type == PGT_l4_shadow ) /* allocated for PAE PDP page */
295 page = alloc_domheap_pages(NULL, 0, ALLOC_DOM_DMA);
296 else if ( d->arch.ops->guest_paging_levels == PAGING_L3 &&
297 (psh_type == PGT_l3_shadow || psh_type == PGT_l4_shadow) )
298 page = alloc_domheap_pages(NULL, 0, ALLOC_DOM_DMA); /* allocated for PAE PDP page */
299 else
300 page = alloc_domheap_page(NULL);
301 #endif
302 if (!page)
303 goto no_shadow_page;
305 lp = map_domain_page(page_to_mfn(page));
306 memset(lp, 0, PAGE_SIZE);
307 unmap_domain_page(lp);
308 }
310 smfn = page_to_mfn(page);
312 shadow_page_info_init(page, gmfn, psh_type);
314 switch ( psh_type )
315 {
316 case PGT_l1_shadow:
317 if ( !shadow_promote(d, gpfn, gmfn, psh_type) )
318 goto fail;
319 perfc_incr(shadow_l1_pages);
320 d->arch.shadow_page_count++;
321 break;
323 case PGT_l2_shadow:
324 if ( !shadow_promote(d, gpfn, gmfn, psh_type) )
325 goto fail;
326 perfc_incr(shadow_l2_pages);
327 d->arch.shadow_page_count++;
328 if ( PGT_l2_page_table == PGT_root_page_table )
329 pin = 1;
331 break;
333 case PGT_l3_shadow:
334 if ( !shadow_promote(d, gpfn, gmfn, psh_type) )
335 goto fail;
336 perfc_incr(shadow_l3_pages);
337 d->arch.shadow_page_count++;
338 if ( PGT_l3_page_table == PGT_root_page_table )
339 pin = 1;
340 break;
342 case PGT_l4_shadow:
343 real_gpfn = gpfn & PGT_mfn_mask;
344 if ( !shadow_promote(d, real_gpfn, gmfn, psh_type) )
345 goto fail;
346 perfc_incr(shadow_l4_pages);
347 d->arch.shadow_page_count++;
348 if ( PGT_l4_page_table == PGT_root_page_table )
349 pin = 1;
350 break;
352 #if CONFIG_PAGING_LEVELS >= 4
353 case PGT_fl1_shadow:
354 perfc_incr(shadow_l1_pages);
355 d->arch.shadow_page_count++;
356 break;
357 #else
359 case PGT_hl2_shadow:
360 // Treat an hl2 as an L1 for purposes of promotion.
361 // For external mode domains, treat them as an L2 for purposes of
362 // pinning.
363 //
364 if ( !shadow_promote(d, gpfn, gmfn, PGT_l1_shadow) )
365 goto fail;
366 perfc_incr(hl2_table_pages);
367 d->arch.hl2_page_count++;
368 if ( shadow_mode_external(d) &&
369 (PGT_l2_page_table == PGT_root_page_table) )
370 pin = 1;
372 break;
373 #endif
374 case PGT_snapshot:
375 perfc_incr(snapshot_pages);
376 d->arch.snapshot_page_count++;
377 break;
379 default:
380 printk("Alloc shadow weird page type type=%08x\n", psh_type);
381 BUG();
382 break;
383 }
385 // Don't add a new shadow of something that already has a snapshot.
386 //
387 ASSERT( (psh_type == PGT_snapshot) || !mfn_out_of_sync(gmfn) );
389 set_shadow_status(d, gpfn, gmfn, smfn, psh_type);
391 if ( pin )
392 shadow_pin(smfn);
394 return smfn;
396 fail:
397 FSH_LOG("promotion of pfn=%lx mfn=%lx failed! external gnttab refs?",
398 gpfn, gmfn);
399 if (psh_type == PGT_l1_shadow)
400 {
401 if (d->arch.ops->guest_paging_levels == PAGING_L2)
402 {
403 #if CONFIG_PAGING_LEVELS >=3
404 free_domheap_pages(page, SL1_ORDER);
405 #else
406 free_domheap_page(page);
407 #endif
408 }
409 else
410 free_domheap_page(page);
411 }
412 else
413 free_domheap_page(page);
415 return 0;
417 no_shadow_page:
418 ASSERT(page == NULL);
419 printk("Couldn't alloc shadow page! dom%d count=%d\n",
420 d->domain_id, d->arch.shadow_page_count);
421 printk("Shadow table counts: l1=%d l2=%d hl2=%d snapshot=%d\n",
422 perfc_value(shadow_l1_pages),
423 perfc_value(shadow_l2_pages),
424 perfc_value(hl2_table_pages),
425 perfc_value(snapshot_pages));
426 BUG(); /* XXX FIXME: try a shadow flush to free up some memory. */
428 return 0;
429 }
431 #if CONFIG_PAGING_LEVELS == 2
432 static unsigned long
433 shadow_hl2_table(struct domain *d, unsigned long gpfn, unsigned long gmfn,
434 unsigned long smfn)
435 {
436 unsigned long hl2mfn;
437 l1_pgentry_t *hl2;
438 int limit;
440 ASSERT(PGT_base_page_table == PGT_l2_page_table);
442 if ( unlikely(!(hl2mfn = alloc_shadow_page(d, gpfn, gmfn, PGT_hl2_shadow))) )
443 {
444 printk("Couldn't alloc an HL2 shadow for pfn=%lx mfn=%lx\n",
445 gpfn, gmfn);
446 BUG(); /* XXX Deal gracefully with failure. */
447 }
449 SH_VVLOG("shadow_hl2_table(gpfn=%lx, gmfn=%lx, smfn=%lx) => %lx",
450 gpfn, gmfn, smfn, hl2mfn);
451 perfc_incrc(shadow_hl2_table_count);
453 hl2 = map_domain_page(hl2mfn);
455 if ( shadow_mode_external(d) )
456 limit = L2_PAGETABLE_ENTRIES;
457 else
458 limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE;
460 memset(hl2, 0, limit * sizeof(l1_pgentry_t));
462 if ( !shadow_mode_external(d) )
463 {
464 memset(&hl2[DOMAIN_ENTRIES_PER_L2_PAGETABLE], 0,
465 HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
467 // Setup easy access to the GL2, SL2, and HL2 frames.
468 //
469 hl2[l2_table_offset(LINEAR_PT_VIRT_START)] =
470 l1e_from_pfn(gmfn, __PAGE_HYPERVISOR);
471 hl2[l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
472 l1e_from_pfn(smfn, __PAGE_HYPERVISOR);
473 hl2[l2_table_offset(PERDOMAIN_VIRT_START)] =
474 l1e_from_pfn(hl2mfn, __PAGE_HYPERVISOR);
475 }
477 unmap_domain_page(hl2);
479 return hl2mfn;
480 }
482 /*
483 * This could take and use a snapshot, and validate the entire page at
484 * once, or it could continue to fault in entries one at a time...
485 * Might be worth investigating...
486 */
487 static unsigned long shadow_l2_table(
488 struct vcpu *v, unsigned long gpfn, unsigned long gmfn)
489 {
490 unsigned long smfn;
491 l2_pgentry_t *spl2e;
492 struct domain *d = v->domain;
493 int i;
495 SH_VVLOG("shadow_l2_table(gpfn=%lx, gmfn=%lx)", gpfn, gmfn);
497 perfc_incrc(shadow_l2_table_count);
499 if ( unlikely(!(smfn = alloc_shadow_page(d, gpfn, gmfn, PGT_l2_shadow))) )
500 {
501 printk("Couldn't alloc an L2 shadow for pfn=%lx mfn=%lx\n",
502 gpfn, gmfn);
503 BUG(); /* XXX Deal gracefully with failure. */
504 }
506 spl2e = (l2_pgentry_t *)map_domain_page(smfn);
508 /* Install hypervisor and 2x linear p.t. mappings. */
509 if ( (PGT_base_page_table == PGT_l2_page_table) &&
510 !shadow_mode_external(d) )
511 {
512 /*
513 * We could proactively fill in PDEs for pages that are already
514 * shadowed *and* where the guest PDE has _PAGE_ACCESSED set
515 * (restriction required for coherence of the accessed bit). However,
516 * we tried it and it didn't help performance. This is simpler.
517 */
518 memset(spl2e, 0, DOMAIN_ENTRIES_PER_L2_PAGETABLE*sizeof(l2_pgentry_t));
520 /* Install hypervisor and 2x linear p.t. mappings. */
521 memcpy(&spl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
522 &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
523 HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
525 spl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
526 l2e_from_pfn(smfn, __PAGE_HYPERVISOR);
528 for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
529 spl2e[l2_table_offset(PERDOMAIN_VIRT_START) + i] =
530 l2e_from_page(virt_to_page(page_get_owner(mfn_to_page(gmfn))->
531 arch.mm_perdomain_pt) + i,
532 __PAGE_HYPERVISOR);
534 if ( shadow_mode_translate(d) ) // NB: not external
535 {
536 unsigned long hl2mfn;
538 spl2e[l2_table_offset(RO_MPT_VIRT_START)] =
539 l2e_from_paddr(pagetable_get_paddr(d->arch.phys_table),
540 __PAGE_HYPERVISOR);
542 if ( unlikely(!(hl2mfn = __shadow_status(d, gpfn, PGT_hl2_shadow))) )
543 hl2mfn = shadow_hl2_table(d, gpfn, gmfn, smfn);
545 // shadow_mode_translate (but not external) sl2 tables hold a
546 // ref to their hl2.
547 //
548 if ( !get_shadow_ref(hl2mfn) )
549 BUG();
551 spl2e[l2_table_offset(LINEAR_PT_VIRT_START)] =
552 l2e_from_pfn(hl2mfn, __PAGE_HYPERVISOR);
553 }
554 else
555 spl2e[l2_table_offset(LINEAR_PT_VIRT_START)] =
556 l2e_from_pfn(gmfn, __PAGE_HYPERVISOR);
557 }
558 else
559 {
560 memset(spl2e, 0, L2_PAGETABLE_ENTRIES*sizeof(l2_pgentry_t));
561 }
563 unmap_domain_page(spl2e);
565 SH_VLOG("shadow_l2_table(%lx -> %lx)", gmfn, smfn);
566 return smfn;
567 }
568 #endif /* CONFIG_PAGING_LEVELS == 2 */
570 static void shadow_map_l1_into_current_l2(unsigned long va)
571 {
572 struct vcpu *v = current;
573 struct domain *d = v->domain;
574 l1_pgentry_t *spl1e, *spl1e_next = 0;
575 l2_pgentry_t sl2e;
576 guest_l1_pgentry_t *gpl1e;
577 guest_l2_pgentry_t gl2e = {0};
578 unsigned long gl1pfn, gl1mfn, sl1mfn;
579 int i, init_table = 0;
581 __guest_get_l2e(v, va, &gl2e);
582 ASSERT(guest_l2e_get_flags(gl2e) & _PAGE_PRESENT);
583 gl1pfn = l2e_get_pfn(gl2e);
585 if ( !(sl1mfn = __shadow_status(d, gl1pfn, PGT_l1_shadow)) )
586 {
587 /* This L1 is NOT already shadowed so we need to shadow it. */
588 SH_VVLOG("4a: l1 not shadowed");
590 gl1mfn = gmfn_to_mfn(d, gl1pfn);
591 if ( unlikely(!VALID_MFN(gl1mfn)) )
592 {
593 // Attempt to use an invalid pfn as an L1 page.
594 // XXX this needs to be more graceful!
595 BUG();
596 }
598 if ( unlikely(!(sl1mfn =
599 alloc_shadow_page(d, gl1pfn, gl1mfn, PGT_l1_shadow))) )
600 {
601 printk("Couldn't alloc an L1 shadow for pfn=%lx mfn=%lx\n",
602 gl1pfn, gl1mfn);
603 BUG(); /* XXX Need to deal gracefully with failure. */
604 }
606 perfc_incrc(shadow_l1_table_count);
607 init_table = 1;
608 }
609 else
610 {
611 /* This L1 is shadowed already, but the L2 entry is missing. */
612 SH_VVLOG("4b: was shadowed, l2 missing (%lx)", sl1mfn);
613 }
615 #ifndef NDEBUG
616 {
617 l2_pgentry_t old_sl2e;
618 __shadow_get_l2e(v, va, &old_sl2e);
619 ASSERT(!(l2e_get_flags(old_sl2e) & _PAGE_PRESENT));
620 }
621 #endif
623 #if CONFIG_PAGING_LEVELS >= 3
624 if ( SH_L1_HAS_NEXT_PAGE &&
625 d->arch.ops->guest_paging_levels == PAGING_L2 )
626 {
627 /* for 32-bit HVM guest on 64-bit or PAE host,
628 * we need to update two L2 entries each time
629 */
630 if ( !get_shadow_ref(sl1mfn))
631 BUG();
632 l2pde_general(d, &gl2e, &sl2e, sl1mfn);
633 __guest_set_l2e(v, va, &gl2e);
634 __shadow_set_l2e(v, va & ~((1<<L2_PAGETABLE_SHIFT_32) - 1), &sl2e);
635 if ( !get_shadow_ref(sl1mfn+1))
636 BUG();
637 sl2e = l2e_empty();
638 l2pde_general(d, &gl2e, &sl2e, sl1mfn+1);
639 __shadow_set_l2e(v,((va & ~((1<<L2_PAGETABLE_SHIFT_32) - 1)) + (1 << L2_PAGETABLE_SHIFT)) , &sl2e);
640 } else
641 #endif
642 {
643 if ( !get_shadow_ref(sl1mfn) )
644 BUG();
645 l2pde_general(d, &gl2e, &sl2e, sl1mfn);
646 __guest_set_l2e(v, va, &gl2e);
647 __shadow_set_l2e(v, va , &sl2e);
648 }
650 if ( init_table )
651 {
652 l1_pgentry_t sl1e;
653 int index = guest_l1_table_offset(va);
654 int min = 1, max = 0;
656 unsigned long tmp_gmfn;
657 l2_pgentry_t tmp_sl2e = {0};
658 guest_l2_pgentry_t tmp_gl2e = {0};
660 __guest_get_l2e(v, va, &tmp_gl2e);
661 tmp_gmfn = gmfn_to_mfn(d, l2e_get_pfn(tmp_gl2e));
662 gpl1e = (guest_l1_pgentry_t *) map_domain_page(tmp_gmfn);
664 /* If the PGT_l1_shadow has two contiguous pages */
665 #if CONFIG_PAGING_LEVELS >= 3
666 if ( SH_L1_HAS_NEXT_PAGE &&
667 d->arch.ops->guest_paging_levels == PAGING_L2 )
668 __shadow_get_l2e(v, va & ~((1UL << L2_PAGETABLE_SHIFT_32) - 1), &tmp_sl2e);
669 else
670 #endif
671 __shadow_get_l2e(v, va, &tmp_sl2e);
673 spl1e = (l1_pgentry_t *) map_domain_page(l2e_get_pfn(tmp_sl2e));
675 if ( SH_L1_HAS_NEXT_PAGE )
676 spl1e_next = (l1_pgentry_t *) map_domain_page(
677 (l2e_get_pfn(tmp_sl2e) + 1UL));
679 for ( i = 0; i < GUEST_L1_PAGETABLE_ENTRIES; i++ )
680 {
681 l1pte_propagate_from_guest(d, gpl1e[i], &sl1e);
682 if ( (l1e_get_flags(sl1e) & _PAGE_PRESENT) &&
683 unlikely(!shadow_get_page_from_l1e(sl1e, d)) )
684 sl1e = l1e_empty();
685 if ( l1e_get_flags(sl1e) == 0 )
686 {
687 // First copy entries from 0 until first invalid.
688 // Then copy entries from index until first invalid.
689 //
690 if ( i < index ) {
691 i = index - 1;
692 continue;
693 }
694 break;
695 }
697 if ( SH_L1_HAS_NEXT_PAGE && i >= L1_PAGETABLE_ENTRIES )
698 spl1e_next[i - L1_PAGETABLE_ENTRIES] = sl1e;
699 else
700 spl1e[i] = sl1e;
702 if ( unlikely(i < min) )
703 min = i;
704 if ( likely(i > max) )
705 max = i;
706 set_guest_back_ptr(d, sl1e, sl1mfn, i);
707 }
709 mfn_to_page(sl1mfn)->tlbflush_timestamp =
710 SHADOW_ENCODE_MIN_MAX(min, max);
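/*
 * The [min, max] window of entries actually installed is packed into
 * tlbflush_timestamp; shadow_make_snapshot() and resync_all() decode it
 * with SHADOW_MIN()/SHADOW_MAX() so they only copy/scan that range
 * instead of the whole page.
 */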
712 unmap_domain_page(gpl1e);
713 unmap_domain_page(spl1e);
715 if ( SH_L1_HAS_NEXT_PAGE )
716 unmap_domain_page(spl1e_next);
717 }
718 }
720 #if CONFIG_PAGING_LEVELS == 2
721 static void
722 shadow_set_l1e(unsigned long va, l1_pgentry_t new_spte, int create_l1_shadow)
723 {
724 struct vcpu *v = current;
725 struct domain *d = v->domain;
726 l2_pgentry_t sl2e = {0};
728 __shadow_get_l2e(v, va, &sl2e);
729 if ( !(l2e_get_flags(sl2e) & _PAGE_PRESENT) )
730 {
731 /*
732 * Either the L1 is not shadowed, or the shadow isn't linked into
733 * the current shadow L2.
734 */
735 if ( create_l1_shadow )
736 {
737 perfc_incrc(shadow_set_l1e_force_map);
738 shadow_map_l1_into_current_l2(va);
739 }
740 else /* check to see if it exists; if so, link it in */
741 {
742 l2_pgentry_t gpde = {0};
743 unsigned long gl1pfn;
744 unsigned long sl1mfn;
746 __guest_get_l2e(v, va, &gpde);
748 if ( l2e_get_flags(gpde) & _PAGE_PRESENT )
749 {
750 gl1pfn = l2e_get_pfn(gpde);
751 sl1mfn = __shadow_status(d, gl1pfn, PGT_l1_shadow);
752 }
753 else
754 {
755 // no shadow exists, so there's nothing to do.
756 perfc_incrc(shadow_set_l1e_fail);
757 return;
758 }
760 if ( sl1mfn )
761 {
762 perfc_incrc(shadow_set_l1e_unlinked);
763 if ( !get_shadow_ref(sl1mfn) )
764 BUG();
765 l2pde_general(d, (guest_l2_pgentry_t *)&gpde, &sl2e, sl1mfn);
766 __guest_set_l2e(v, va, &gpde);
767 __shadow_set_l2e(v, va, &sl2e);
768 }
769 else
770 {
771 // no shadow exists, so there's nothing to do.
772 perfc_incrc(shadow_set_l1e_fail);
773 return;
774 }
775 }
776 }
778 __shadow_get_l2e(v, va, &sl2e);
780 if ( shadow_mode_refcounts(d) )
781 {
782 l1_pgentry_t old_spte;
783 __shadow_get_l1e(v, va, &old_spte);
785 // only do the ref counting if something important changed.
786 //
787 if ( l1e_has_changed(old_spte, new_spte, _PAGE_RW | _PAGE_PRESENT) )
788 {
789 if ( (l1e_get_flags(new_spte) & _PAGE_PRESENT) &&
790 !shadow_get_page_from_l1e(new_spte, d) )
791 new_spte = l1e_empty();
792 if ( l1e_get_flags(old_spte) & _PAGE_PRESENT )
793 shadow_put_page_from_l1e(old_spte, d);
794 }
795 }
797 set_guest_back_ptr(d, new_spte, l2e_get_pfn(sl2e), l1_table_offset(va));
798 __shadow_set_l1e(v, va, &new_spte);
799 shadow_update_min_max(l2e_get_pfn(sl2e), l1_table_offset(va));
800 }
802 static void shadow_invlpg_32(struct vcpu *v, unsigned long va)
803 {
804 struct domain *d = v->domain;
805 l1_pgentry_t gpte, spte;
807 ASSERT(shadow_mode_enabled(d));
809 shadow_lock(d);
811 __shadow_sync_va(v, va);
813 // XXX mafetter: will need to think about 4MB pages...
815 // It's not strictly necessary to update the shadow here,
816 // but it might save a fault later.
817 //
818 /*if (__copy_from_user(&gpte, &linear_pg_table[va >> PAGE_SHIFT],
819 sizeof(gpte))) {*/
820 if (unlikely(!__guest_get_l1e(v, va, &gpte))) {
821 perfc_incrc(shadow_invlpg_faults);
822 shadow_unlock(d);
823 return;
824 }
825 l1pte_propagate_from_guest(d, gpte, &spte);
826 shadow_set_l1e(va, spte, 1);
828 shadow_unlock(d);
829 }
830 #endif /* CONFIG_PAGING_LEVELS == 2 */
832 #if CONFIG_PAGING_LEVELS >= 3
833 static void shadow_set_l1e_64(
834 unsigned long va, pgentry_64_t *sl1e_p,
835 int create_l1_shadow)
836 {
837 struct vcpu *v = current;
838 struct domain *d = v->domain;
839 pgentry_64_t sle = { 0 };
840 pgentry_64_t sle_up = {0};
841 l1_pgentry_t old_spte;
842 l1_pgentry_t sl1e = *(l1_pgentry_t *)sl1e_p;
843 int i;
844 unsigned long orig_va = 0;
846 if ( d->arch.ops->guest_paging_levels == PAGING_L2 )
847 {
848 /* This is for 32-bit VMX guest on 64-bit host */
849 orig_va = va;
850 va = va & (~((1<<L2_PAGETABLE_SHIFT_32)-1));
851 }
853 for ( i = PAGING_L4; i >= PAGING_L2; i-- )
854 {
855 if ( !__rw_entry(v, va, &sle, SHADOW_ENTRY | GET_ENTRY | i) )
856 {
857 sl1e = l1e_empty();
858 goto out;
859 }
860 if ( !(entry_get_flags(sle) & _PAGE_PRESENT) )
861 {
862 if ( create_l1_shadow )
863 {
864 perfc_incrc(shadow_set_l3e_force_map);
865 shadow_map_into_current(v, va, i-1, i);
866 __rw_entry(v, va, &sle, SHADOW_ENTRY | GET_ENTRY | i);
867 }
868 }
869 if ( d->arch.ops->guest_paging_levels == PAGING_L3 )
870 {
871 if ( i < PAGING_L3 )
872 shadow_update_min_max(entry_get_pfn(sle_up), table_offset_64(va, i));
873 }
874 else
875 {
876 if ( i < PAGING_L4 )
877 shadow_update_min_max(entry_get_pfn(sle_up), table_offset_64(va, i));
878 }
880 sle_up = sle;
881 }
883 if ( d->arch.ops->guest_paging_levels == PAGING_L2 )
884 {
885 va = orig_va;
886 }
888 if ( shadow_mode_refcounts(d) )
889 {
890 __shadow_get_l1e(v, va, &old_spte);
891 if ( l1e_has_changed(old_spte, sl1e, _PAGE_RW | _PAGE_PRESENT) )
892 {
893 if ( (l1e_get_flags(sl1e) & _PAGE_PRESENT) &&
894 !shadow_get_page_from_l1e(sl1e, d) )
895 sl1e = l1e_empty();
896 if ( l1e_get_flags(old_spte) & _PAGE_PRESENT )
897 put_page_from_l1e(old_spte, d);
898 }
899 }
901 out:
902 __shadow_set_l1e(v, va, &sl1e);
904 shadow_update_min_max(entry_get_pfn(sle_up), guest_l1_table_offset(va));
905 }
906 #endif /* CONFIG_PAGING_LEVELS >= 3 */
908 static struct out_of_sync_entry *
909 shadow_alloc_oos_entry(struct domain *d)
910 {
911 struct out_of_sync_entry *f, *extra;
912 unsigned size, i;
914 if ( unlikely(d->arch.out_of_sync_free == NULL) )
915 {
916 FSH_LOG("Allocate more fullshadow tuple blocks.");
918 size = sizeof(void *) + (out_of_sync_extra_size * sizeof(*f));
919 extra = xmalloc_bytes(size);
921 /* XXX Should be more graceful here. */
922 if ( extra == NULL )
923 BUG();
925 memset(extra, 0, size);
927 /* Record the allocation block so it can be correctly freed later. */
928 d->arch.out_of_sync_extras_count++;
929 *((struct out_of_sync_entry **)&extra[out_of_sync_extra_size]) =
930 d->arch.out_of_sync_extras;
931 d->arch.out_of_sync_extras = &extra[0];
933 /* Thread a free chain through the newly-allocated nodes. */
934 for ( i = 0; i < (out_of_sync_extra_size - 1); i++ )
935 extra[i].next = &extra[i+1];
936 extra[i].next = NULL;
938 /* Add the new nodes to the free list. */
939 d->arch.out_of_sync_free = &extra[0];
940 }
942 /* Allocate a new node from the quicklist. */
943 f = d->arch.out_of_sync_free;
944 d->arch.out_of_sync_free = f->next;
946 return f;
947 }
949 static inline unsigned long
950 shadow_make_snapshot(
951 struct domain *d, unsigned long gpfn, unsigned long gmfn)
952 {
953 unsigned long smfn, sl1mfn = 0;
954 void *original, *snapshot;
955 u32 min_max = 0;
956 int min, max, length;
958 if ( test_and_set_bit(_PGC_out_of_sync, &mfn_to_page(gmfn)->count_info) )
959 {
960 ASSERT(__shadow_status(d, gpfn, PGT_snapshot));
961 return SHADOW_SNAPSHOT_ELSEWHERE;
962 }
964 perfc_incrc(shadow_make_snapshot);
966 if ( unlikely(!(smfn = alloc_shadow_page(d, gpfn, gmfn, PGT_snapshot))) )
967 {
968 printk("Couldn't alloc fullshadow snapshot for pfn=%lx mfn=%lx!\n"
969 "Dom%d snapshot_count_count=%d\n",
970 gpfn, gmfn, d->domain_id, d->arch.snapshot_page_count);
971 BUG(); /* XXX FIXME: try a shadow flush to free up some memory. */
972 }
974 if ( !get_shadow_ref(smfn) )
975 BUG();
977 if ( shadow_mode_refcounts(d) &&
978 (shadow_max_pgtable_type(d, gpfn, &sl1mfn) == PGT_l1_shadow) )
979 min_max = mfn_to_page(sl1mfn)->tlbflush_timestamp;
980 mfn_to_page(smfn)->tlbflush_timestamp = min_max;
982 min = SHADOW_MIN(min_max);
983 max = SHADOW_MAX(min_max);
984 length = max - min + 1;
985 perfc_incr_histo(snapshot_copies, length, PT_UPDATES);
987 min *= sizeof(guest_l1_pgentry_t);
988 length *= sizeof(guest_l1_pgentry_t);
990 original = map_domain_page(gmfn);
991 snapshot = map_domain_page(smfn);
992 memcpy(snapshot + min, original + min, length);
993 unmap_domain_page(original);
994 unmap_domain_page(snapshot);
996 return smfn;
997 }
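/*
 * NB on SHADOW_SNAPSHOT_ELSEWHERE: if the guest frame was already marked
 * out-of-sync, a snapshot for it already hangs off some other
 * out_of_sync_entry, so shadow_make_snapshot() returns this marker rather
 * than a fresh smfn, and the resync code skips such entries.
 */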
999 static struct out_of_sync_entry *
1000 __mark_mfn_out_of_sync(struct vcpu *v, unsigned long gpfn,
1001 unsigned long mfn)
1003 struct domain *d = v->domain;
1004 struct page_info *page = mfn_to_page(mfn);
1005 struct out_of_sync_entry *entry = shadow_alloc_oos_entry(d);
1007 ASSERT(shadow_lock_is_acquired(d));
1008 ASSERT(mfn_valid(mfn));
1010 #ifndef NDEBUG
1012 u32 type = page->u.inuse.type_info & PGT_type_mask;
1013 if ( shadow_mode_refcounts(d) )
1015 ASSERT(type == PGT_writable_page);
1017 else
1019 ASSERT(type && (type < PGT_l4_page_table));
1022 #endif
1024 FSH_LOG("%s(gpfn=%lx, mfn=%lx) c=%08x t=%08x", __func__,
1025 gpfn, mfn, page->count_info, page->u.inuse.type_info);
1027 // XXX this will require some more thought... Cross-domain sharing and
1028 // modification of page tables? Hmm...
1029 //
1030 if ( d != page_get_owner(page) )
1031 BUG();
1033 perfc_incrc(shadow_mark_mfn_out_of_sync_calls);
1035 entry->v = v;
1036 entry->gpfn = gpfn;
1037 entry->gmfn = mfn;
1038 entry->writable_pl1e = -1;
1040 #if 0 // this code has not been updated for 32pae & 64 bit modes
1041 #if SHADOW_DEBUG
1042 mark_shadows_as_reflecting_snapshot(d, gpfn);
1043 #endif
1044 #endif
1046 // Increment the guest page's ref count to represent the entry in the
1047 // full shadow out-of-sync list.
1048 //
1049 get_page(page, d);
1051 return entry;
1054 static struct out_of_sync_entry *
1055 mark_mfn_out_of_sync(struct vcpu *v, unsigned long gpfn,
1056 unsigned long mfn)
1058 struct out_of_sync_entry *entry =
1059 __mark_mfn_out_of_sync(v, gpfn, mfn);
1060 struct domain *d = v->domain;
1062 entry->snapshot_mfn = shadow_make_snapshot(d, gpfn, mfn);
1063 // Add to the out-of-sync list
1064 //
1065 entry->next = d->arch.out_of_sync;
1066 d->arch.out_of_sync = entry;
1068 return entry;
1072 static void shadow_mark_va_out_of_sync(
1073 struct vcpu *v, unsigned long gpfn, unsigned long mfn, unsigned long va)
1075 struct out_of_sync_entry *entry =
1076 __mark_mfn_out_of_sync(v, gpfn, mfn);
1077 l2_pgentry_t sl2e;
1078 struct domain *d = v->domain;
1080 #if CONFIG_PAGING_LEVELS >= 3
1082 l4_pgentry_t sl4e;
1083 l3_pgentry_t sl3e;
1085 __shadow_get_l4e(v, va, &sl4e);
1086 if ( !(l4e_get_flags(sl4e) & _PAGE_PRESENT)) {
1087 shadow_map_into_current(v, va, PAGING_L3, PAGING_L4);
1090 if (!__shadow_get_l3e(v, va, &sl3e)) {
1091 BUG();
1094 if ( !(l3e_get_flags(sl3e) & _PAGE_PRESENT)) {
1095 shadow_map_into_current(v, va, PAGING_L2, PAGING_L3);
1098 #endif
1100 // We need the address of the shadow PTE that maps @va.
1101 // It might not exist yet. Make sure it's there.
1102 //
1103 __shadow_get_l2e(v, va, &sl2e);
1104 if ( !(l2e_get_flags(sl2e) & _PAGE_PRESENT) )
1106 // either this L1 isn't shadowed yet, or the shadow isn't linked into
1107 // the current L2.
1108 shadow_map_l1_into_current_l2(va);
1109 __shadow_get_l2e(v, va, &sl2e);
1111 ASSERT(l2e_get_flags(sl2e) & _PAGE_PRESENT);
1113 entry->snapshot_mfn = shadow_make_snapshot(d, gpfn, mfn);
1114 // NB: this is stored as a machine address.
1115 entry->writable_pl1e =
1116 l2e_get_paddr(sl2e) | (sizeof(l1_pgentry_t) * l1_table_offset(va));
1117 ASSERT( !(entry->writable_pl1e & (sizeof(l1_pgentry_t)-1)) );
1118 entry->va = va;
1120 // Increment shadow's page count to represent the reference
1121 // inherent in entry->writable_pl1e
1122 //
1123 if ( !get_shadow_ref(l2e_get_pfn(sl2e)) )
1124 BUG();
1126 // Add to the out-of-sync list
1127 //
1128 entry->next = d->arch.out_of_sync;
1129 d->arch.out_of_sync = entry;
1131 FSH_LOG("%s(va=%lx -> writable_pl1e=%lx)",
1132 __func__, va, entry->writable_pl1e);
1135 /*
1136 * Returns 1 if the snapshot for @gpfn exists and its @index'th entry matches.
1137 * Returns 0 otherwise.
1138 */
1139 static int snapshot_entry_matches(
1140 struct domain *d, guest_l1_pgentry_t *guest_pt,
1141 unsigned long gpfn, unsigned index)
1143 unsigned long smfn = __shadow_status(d, gpfn, PGT_snapshot);
1144 guest_l1_pgentry_t *snapshot, gpte; // could be L1s or L2s or ...
1145 int entries_match;
1147 perfc_incrc(snapshot_entry_matches_calls);
1149 if ( !smfn )
1150 return 0;
1152 snapshot = map_domain_page(smfn);
1154 if (__copy_from_user(&gpte, &guest_pt[index],
1155 sizeof(gpte)))
1157 unmap_domain_page(snapshot);
1158 return 0;
1161 // This could probably be smarter, but this is sufficient for
1162 // our current needs.
1163 //
1164 entries_match = !guest_l1e_has_changed(gpte, snapshot[index],
1165 PAGE_FLAG_MASK);
1167 unmap_domain_page(snapshot);
1169 #ifdef PERF_COUNTERS
1170 if ( entries_match )
1171 perfc_incrc(snapshot_entry_matches_true);
1172 #endif
1174 return entries_match;
1177 /*
1178 * Returns 1 if va's shadow mapping is out-of-sync.
1179 * Returns 0 otherwise.
1180 */
1181 static int is_out_of_sync(struct vcpu *v, unsigned long va) /* __shadow_out_of_sync */
1183 struct domain *d = v->domain;
1184 #if CONFIG_PAGING_LEVELS == 4
1185 unsigned long l2mfn = ((v->arch.flags & TF_kernel_mode)?
1186 pagetable_get_pfn(v->arch.guest_table) :
1187 pagetable_get_pfn(v->arch.guest_table_user));
1188 #else
1189 unsigned long l2mfn = pagetable_get_pfn(v->arch.guest_table);
1190 #endif
1191 unsigned long l2pfn = mfn_to_gmfn(d, l2mfn);
1192 guest_l2_pgentry_t l2e;
1193 unsigned long l1pfn, l1mfn;
1194 guest_l1_pgentry_t *guest_pt;
1196 ASSERT(shadow_lock_is_acquired(d));
1197 ASSERT(VALID_M2P(l2pfn));
1199 perfc_incrc(shadow_out_of_sync_calls);
1201 #if CONFIG_PAGING_LEVELS >= 3
1203 #define unmap_and_return(x) \
1204 if ( guest_pt != (guest_l1_pgentry_t *) v->arch.guest_vtable ) \
1205 unmap_domain_page(guest_pt); \
1206 return (x);
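/*
 * unmap_and_return() drops the temporary map_domain_page() mapping taken
 * during the table walk below before returning, unless guest_pt still
 * points at the cached v->arch.guest_vtable, which must stay mapped.
 */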
1208 if (d->arch.ops->guest_paging_levels >= PAGING_L3)
1210 pgentry_64_t le;
1211 unsigned long gmfn;
1212 unsigned long gpfn;
1213 int i;
1214 unsigned int base_idx = 0;
1215 base_idx = get_cr3_idxval(v);
1217 gmfn = l2mfn;
1218 gpfn = l2pfn;
1219 guest_pt = (guest_l1_pgentry_t *)v->arch.guest_vtable;
1221 for ( i = PAGING_L4; i >= PAGING_L3; i-- )
1223 if (d->arch.ops->guest_paging_levels == PAGING_L3
1224 && i == PAGING_L4)
1225 continue; /* skip the top-level for 3-level */
1227 if ( page_out_of_sync(mfn_to_page(gmfn)) &&
1228 !snapshot_entry_matches(
1229 d, guest_pt, gpfn, guest_table_offset_64(va, i, base_idx)) )
1231 unmap_and_return (1);
1234 le = entry_empty();
1235 __rw_entry(v, va, &le, GUEST_ENTRY | GET_ENTRY | i);
1237 if ( !(entry_get_flags(le) & _PAGE_PRESENT) )
1239 unmap_and_return (0);
1241 gpfn = entry_get_pfn(le);
1242 gmfn = gmfn_to_mfn(d, gpfn);
1243 if ( !VALID_MFN(gmfn) )
1245 unmap_and_return (0);
1247 if ( guest_pt != (guest_l1_pgentry_t *)v->arch.guest_vtable )
1248 unmap_domain_page(guest_pt);
1249 guest_pt = (guest_l1_pgentry_t *)map_domain_page(gmfn);
1252 /* L2 */
1253 if ( page_out_of_sync(mfn_to_page(gmfn)) &&
1254 !snapshot_entry_matches(d, guest_pt, gpfn, l2_table_offset(va)) )
1256 unmap_and_return (1);
1259 if ( guest_pt != (guest_l1_pgentry_t *)v->arch.guest_vtable )
1260 unmap_domain_page(guest_pt);
1263 else
1264 #undef unmap_and_return
1265 #endif /* CONFIG_PAGING_LEVELS >= 3 */
1267 if ( page_out_of_sync(mfn_to_page(l2mfn)) &&
1268 !snapshot_entry_matches(d, (guest_l1_pgentry_t *)v->arch.guest_vtable,
1269 l2pfn, guest_l2_table_offset(va)) )
1270 return 1;
1273 __guest_get_l2e(v, va, &l2e);
1274 if ( !(guest_l2e_get_flags(l2e) & _PAGE_PRESENT) ||
1275 (guest_l2e_get_flags(l2e) & _PAGE_PSE))
1276 return 0;
1278 l1pfn = l2e_get_pfn(l2e);
1279 l1mfn = gmfn_to_mfn(d, l1pfn);
1281 // If the l1 pfn is invalid, it can't be out of sync...
1282 if ( !VALID_MFN(l1mfn) )
1283 return 0;
1285 guest_pt = (guest_l1_pgentry_t *) map_domain_page(l1mfn);
1287 if ( page_out_of_sync(mfn_to_page(l1mfn)) &&
1288 !snapshot_entry_matches(
1289 d, guest_pt, l1pfn, guest_l1_table_offset(va)) )
1291 unmap_domain_page(guest_pt);
1292 return 1;
1295 unmap_domain_page(guest_pt);
1296 return 0;
1299 #define GPFN_TO_GPTEPAGE(_gpfn) ((_gpfn) / (PAGE_SIZE / sizeof(guest_l1_pgentry_t)))
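/*
 * GPFN_TO_GPTEPAGE() groups gpfns into buckets of
 * (PAGE_SIZE / sizeof(guest_l1_pgentry_t)) consecutive frames -- roughly,
 * the set of frames a single guest PTE page would map under a linear
 * layout.  The writable-PTE prediction code below keys its
 * PGT_writable_pred shadow-status entries on that bucket.
 */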
1300 static inline unsigned long
1301 predict_writable_pte_page(struct domain *d, unsigned long gpfn)
1303 return __shadow_status(d, GPFN_TO_GPTEPAGE(gpfn), PGT_writable_pred);
1306 static inline void
1307 increase_writable_pte_prediction(struct domain *d, unsigned long gpfn, unsigned long prediction)
1309 unsigned long score = prediction & PGT_score_mask;
1310 int create = (score == 0);
1312 // saturating addition
1313 score = (score + (1u << PGT_score_shift)) & PGT_score_mask;
1314 score = score ? score : PGT_score_mask;
1316 prediction = (prediction & PGT_mfn_mask) | score;
1318 //printk("increase gpfn=%lx pred=%lx create=%d\n", gpfn, prediction, create);
1319 set_shadow_status(d, GPFN_TO_GPTEPAGE(gpfn), 0, prediction, PGT_writable_pred);
1321 if ( create )
1322 perfc_incr(writable_pte_predictions);
1325 static inline void
1326 decrease_writable_pte_prediction(struct domain *d, unsigned long gpfn, unsigned long prediction)
1328 unsigned long score = prediction & PGT_score_mask;
1329 ASSERT(score);
1331 // divide score by 2... We don't like bad predictions.
1332 //
1333 score = (score >> 1) & PGT_score_mask;
1335 prediction = (prediction & PGT_mfn_mask) | score;
1337 //printk("decrease gpfn=%lx pred=%lx score=%lx\n", gpfn, prediction, score);
1339 if ( score )
1340 set_shadow_status(d, GPFN_TO_GPTEPAGE(gpfn), 0, prediction, PGT_writable_pred);
1341 else
1343 delete_shadow_status(d, GPFN_TO_GPTEPAGE(gpfn), 0, PGT_writable_pred);
1344 perfc_decr(writable_pte_predictions);
1348 static int fix_entry(
1349 struct domain *d,
1350 l1_pgentry_t *pt, u32 *found, int is_l1_shadow, u32 max_refs_to_find)
1352 l1_pgentry_t old = *pt;
1353 l1_pgentry_t new = old;
1355 l1e_remove_flags(new,_PAGE_RW);
1356 if ( is_l1_shadow && !shadow_get_page_from_l1e(new, d) )
1357 BUG();
1358 (*found)++;
1359 *pt = new;
1360 if ( is_l1_shadow )
1361 shadow_put_page_from_l1e(old, d);
1363 return (*found == max_refs_to_find);
1366 static u32 remove_all_write_access_in_ptpage(
1367 struct domain *d, unsigned long pt_pfn, unsigned long pt_mfn,
1368 unsigned long readonly_gpfn, unsigned long readonly_gmfn,
1369 u32 max_refs_to_find, unsigned long prediction)
1371 l1_pgentry_t *pt = map_domain_page(pt_mfn);
1372 l1_pgentry_t *pt_next = 0, *sl1e_p;
1373 l1_pgentry_t match;
1374 unsigned long flags = _PAGE_RW | _PAGE_PRESENT;
1375 int i;
1376 u32 found = 0;
1377 int is_l1_shadow =
1378 ((mfn_to_page(pt_mfn)->u.inuse.type_info & PGT_type_mask) ==
1379 PGT_l1_shadow);
1380 #if CONFIG_PAGING_LEVELS == 4
1381 is_l1_shadow |=
1382 ((mfn_to_page(pt_mfn)->u.inuse.type_info & PGT_type_mask) ==
1383 PGT_fl1_shadow);
1384 #endif
1386 if ( SH_L1_HAS_NEXT_PAGE )
1387 pt_next = map_domain_page(pt_mfn + 1);
1389 match = l1e_from_pfn(readonly_gmfn, flags);
1391 if ( shadow_mode_external(d) )
1393 i = (mfn_to_page(readonly_gmfn)->u.inuse.type_info & PGT_va_mask)
1394 >> PGT_va_shift;
1396 if ( SH_L1_HAS_NEXT_PAGE &&
1397 i >= L1_PAGETABLE_ENTRIES )
1398 sl1e_p = &pt_next[i - L1_PAGETABLE_ENTRIES];
1399 else
1400 sl1e_p = &pt[i];
1402 if ( (i >= 0 && i < GUEST_L1_PAGETABLE_ENTRIES) &&
1403 !l1e_has_changed(*sl1e_p, match, flags) &&
1404 fix_entry(d, sl1e_p, &found, is_l1_shadow, max_refs_to_find) &&
1405 !prediction )
1406 goto out;
1409 for ( i = 0; i < GUEST_L1_PAGETABLE_ENTRIES; i++ )
1411 if ( SH_L1_HAS_NEXT_PAGE &&
1412 i >= L1_PAGETABLE_ENTRIES )
1413 sl1e_p = &pt_next[i - L1_PAGETABLE_ENTRIES];
1414 else
1415 sl1e_p = &pt[i];
1417 if ( unlikely(!l1e_has_changed(*sl1e_p, match, flags)) &&
1418 fix_entry(d, sl1e_p, &found, is_l1_shadow, max_refs_to_find) )
1419 break;
1422 out:
1423 unmap_domain_page(pt);
1424 if ( SH_L1_HAS_NEXT_PAGE )
1425 unmap_domain_page(pt_next);
1427 return found;
1430 static int remove_all_write_access(
1431 struct domain *d, unsigned long readonly_gpfn, unsigned long readonly_gmfn)
1433 int i;
1434 struct shadow_status *a;
1435 u32 found = 0, write_refs;
1436 unsigned long predicted_smfn;
1438 ASSERT(shadow_lock_is_acquired(d));
1439 ASSERT(VALID_MFN(readonly_gmfn));
1441 perfc_incrc(remove_write_access);
1443 // If it's not a writable page, then no writable refs can be outstanding.
1444 //
1445 if ( (mfn_to_page(readonly_gmfn)->u.inuse.type_info & PGT_type_mask) !=
1446 PGT_writable_page )
1448 perfc_incrc(remove_write_not_writable);
1449 return 1;
1452 // How many outstanding writable PTEs for this page are there?
1453 //
1454 write_refs =
1455 (mfn_to_page(readonly_gmfn)->u.inuse.type_info & PGT_count_mask);
1456 if ( write_refs && MFN_PINNED(readonly_gmfn) )
1458 write_refs--;
1461 if ( write_refs == 0 )
1463 perfc_incrc(remove_write_no_work);
1464 return 1;
1467 if ( shadow_mode_external(d) ) {
1468 if (--write_refs == 0)
1469 return 0;
1471 // Use the back pointer to locate the shadow page that can contain
1472 // the PTE of interest
1473 if ( (predicted_smfn = mfn_to_page(readonly_gmfn)->tlbflush_timestamp) ) {
1474 found += remove_all_write_access_in_ptpage(
1475 d, predicted_smfn, predicted_smfn, readonly_gpfn, readonly_gmfn, write_refs, 0);
1476 if ( found == write_refs )
1477 return 0;
1481 // Search all the shadow L1 page tables...
1482 //
1483 for (i = 0; i < shadow_ht_buckets; i++)
1485 a = &d->arch.shadow_ht[i];
1486 while ( a && a->gpfn_and_flags )
1488 if ( (a->gpfn_and_flags & PGT_type_mask) == PGT_l1_shadow
1489 #if CONFIG_PAGING_LEVELS >= 4
1490 || (a->gpfn_and_flags & PGT_type_mask) == PGT_fl1_shadow
1491 #endif
1495 found += remove_all_write_access_in_ptpage(d, a->gpfn_and_flags & PGT_mfn_mask, a->smfn, readonly_gpfn, readonly_gmfn, write_refs - found, a->gpfn_and_flags & PGT_mfn_mask);
1496 if ( found == write_refs )
1497 return 0;
1500 a = a->next;
1504 FSH_LOG("%s: looking for %d refs, found %d refs",
1505 __func__, write_refs, found);
1507 return 0;
1510 static void resync_pae_guest_l3(struct domain *d)
1512 struct out_of_sync_entry *entry;
1513 unsigned long i, idx;
1514 unsigned long smfn, gmfn;
1515 pgentry_64_t *guest, *shadow_l3, *snapshot;
1516 struct vcpu *v = current;
1517 int max = -1;
1518 int unshadow = 0;
1521 ASSERT( shadow_mode_external(d) );
1523 gmfn = pagetable_get_pfn(v->arch.guest_table);
1525 for ( entry = d->arch.out_of_sync; entry; entry = entry->next )
1527 if ( entry->snapshot_mfn == SHADOW_SNAPSHOT_ELSEWHERE )
1528 continue;
1529 if ( entry->gmfn != gmfn )
1530 continue;
1532 idx = get_cr3_idxval(v);
1533 smfn = __shadow_status(
1534 d, ((unsigned long)(idx << PGT_score_shift) | entry->gpfn), PGT_l4_shadow);
1536 #ifndef NDEBUG
1537 if ( !smfn )
1539 BUG();
1541 #endif
1543 guest = (pgentry_64_t *)map_domain_page(entry->gmfn);
1544 snapshot = (pgentry_64_t *)map_domain_page(entry->snapshot_mfn);
1545 shadow_l3 = (pgentry_64_t *)map_domain_page(smfn);
1547 for ( i = 0; i < PAE_L3_PAGETABLE_ENTRIES; i++ )
1549 int index = i + idx * PAE_L3_PAGETABLE_ENTRIES;
1550 if ( entry_has_changed(
1551 guest[index], snapshot[index], PAGE_FLAG_MASK) )
1553 validate_entry_change(d, &guest[index],
1554 &shadow_l3[i], PAGING_L3);
1556 if ( entry_get_value(guest[index]) != 0 )
1557 max = i;
1559 if ( !(entry_get_flags(guest[index]) & _PAGE_PRESENT) &&
1560 unlikely(entry_get_value(guest[index]) != 0) &&
1561 !unshadow &&
1562 (frame_table[smfn].u.inuse.type_info & PGT_pinned) )
1563 unshadow = 1;
1566 if ( max == -1 )
1567 unshadow = 1;
1569 unmap_domain_page(guest);
1570 unmap_domain_page(snapshot);
1571 unmap_domain_page(shadow_l3);
1573 if ( unlikely(unshadow) )
1574 shadow_unpin(smfn);
1575 break;
1579 static int resync_all(struct domain *d, u32 stype)
1581 struct out_of_sync_entry *entry;
1582 unsigned i;
1583 unsigned long smfn;
1584 void *guest, *shadow, *snapshot;
1585 int need_flush = 0, external = shadow_mode_external(d);
1586 int unshadow;
1587 int changed;
1588 u32 min_max_shadow, min_max_snapshot;
1589 int min_shadow, max_shadow, min_snapshot, max_snapshot;
1590 struct vcpu *v;
1592 ASSERT(shadow_lock_is_acquired(d));
1594 for ( entry = d->arch.out_of_sync; entry; entry = entry->next)
1596 int max = -1;
1598 if ( entry->snapshot_mfn == SHADOW_SNAPSHOT_ELSEWHERE )
1599 continue;
1601 smfn = __shadow_status(d, entry->gpfn, stype);
1603 if ( !smfn )
1605 // For heavy weight shadows: no need to update refcounts if
1606 // there's no shadow page.
1607 //
1608 if ( shadow_mode_refcounts(d) )
1609 continue;
1611 // For light weight shadows: only need to resync the refcounts to
1612 // the new contents of the guest page iff it has the right
1613 // page type.
1614 //
1615 if ( stype != ( mfn_to_page(entry->gmfn)->u.inuse.type_info & PGT_type_mask) )
1616 continue;
1619 FSH_LOG("resyncing t=%08x gpfn=%lx gmfn=%lx smfn=%lx snapshot_mfn=%lx",
1620 stype, entry->gpfn, entry->gmfn, smfn, entry->snapshot_mfn);
1622 // Compare guest's new contents to its snapshot, validating
1623 // and updating its shadow as appropriate.
1624 //
1625 guest = map_domain_page(entry->gmfn);
1626 snapshot = map_domain_page(entry->snapshot_mfn);
1628 if ( smfn )
1629 shadow = map_domain_page(smfn);
1630 else
1631 shadow = NULL;
1633 unshadow = 0;
1635 min_max_shadow = mfn_to_page(smfn)->tlbflush_timestamp;
1636 min_shadow = SHADOW_MIN(min_max_shadow);
1637 max_shadow = SHADOW_MAX(min_max_shadow);
1639 min_max_snapshot= mfn_to_page(entry->snapshot_mfn)->tlbflush_timestamp;
1640 min_snapshot = SHADOW_MIN(min_max_snapshot);
1641 max_snapshot = SHADOW_MAX(min_max_snapshot);
1643 switch ( stype )
1645 case PGT_l1_shadow:
1647 guest_l1_pgentry_t *guest1 = guest;
1648 l1_pgentry_t *shadow1 = shadow;
1649 l1_pgentry_t *shadow1_next = 0, *sl1e_p;
1650 guest_l1_pgentry_t *snapshot1 = snapshot;
1651 int unshadow_l1 = 0;
1653 ASSERT(shadow_mode_write_l1(d) ||
1654 shadow_mode_write_all(d) || shadow_mode_wr_pt_pte(d));
1656 if ( !shadow_mode_refcounts(d) )
1657 revalidate_l1(d, (l1_pgentry_t *)guest1, (l1_pgentry_t *)snapshot1);
1658 if ( !smfn )
1659 break;
1661 changed = 0;
1663 if ( SH_L1_HAS_NEXT_PAGE && shadow1 )
1664 shadow1_next = map_domain_page(smfn + 1);
1666 for ( i = min_shadow; i <= max_shadow; i++ )
1669 if ( SH_L1_HAS_NEXT_PAGE && i >= L1_PAGETABLE_ENTRIES )
1670 sl1e_p = &shadow1_next[i - L1_PAGETABLE_ENTRIES];
1671 else
1672 sl1e_p = &shadow1[i];
1674 if ( (i < min_snapshot) || (i > max_snapshot) ||
1675 guest_l1e_has_changed(guest1[i], snapshot1[i], PAGE_FLAG_MASK) )
1677 int error;
1679 error = validate_pte_change(d, guest1[i], sl1e_p);
1680 if ( error == -1 )
1681 unshadow_l1 = 1;
1682 else {
1683 need_flush |= error;
1684 set_guest_back_ptr(d, *sl1e_p, smfn, i);
1686 // can't update snapshots of linear page tables -- they
1687 // are used multiple times...
1688 //
1689 // snapshot[i] = new_pte;
1691 changed++;
1695 if ( shadow1_next )
1696 unmap_domain_page(shadow1_next);
1698 perfc_incrc(resync_l1);
1699 perfc_incr_histo(wpt_updates, changed, PT_UPDATES);
1700 perfc_incr_histo(l1_entries_checked, max_shadow - min_shadow + 1, PT_UPDATES);
1701 if ( d->arch.ops->guest_paging_levels >= PAGING_L3 &&
1702 unshadow_l1 ) {
1703 pgentry_64_t l2e = { 0 };
1705 __shadow_get_l2e(entry->v, entry->va, &l2e);
1707 if ( entry_get_flags(l2e) & _PAGE_PRESENT ) {
1708 put_shadow_ref(entry_get_pfn(l2e));
1709 l2e = entry_empty();
1710 __shadow_set_l2e(entry->v, entry->va, &l2e);
1712 if (entry->v == current)
1713 need_flush = 1;
1717 break;
1719 #if CONFIG_PAGING_LEVELS == 2
1720 case PGT_l2_shadow:
1722 l2_pgentry_t *guest2 = guest;
1723 l2_pgentry_t *shadow2 = shadow;
1724 l2_pgentry_t *snapshot2 = snapshot;
1726 ASSERT(shadow_mode_write_all(d) || shadow_mode_wr_pt_pte(d));
1727 BUG_ON(!shadow_mode_refcounts(d)); // not yet implemented
1729 changed = 0;
1730 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
1732 if ( !is_guest_l2_slot(0,i) && !external )
1733 continue;
1735 l2_pgentry_t new_pde = guest2[i];
1736 if ( l2e_has_changed(new_pde, snapshot2[i], PAGE_FLAG_MASK))
1738 need_flush |= validate_pde_change(d, new_pde, &shadow2[i]);
1740 // can't update snapshots of linear page tables -- they
1741 // are used multiple times...
1742 //
1743 // snapshot[i] = new_pde;
1745 changed++;
1747 if ( l2e_get_intpte(new_pde) != 0 ) /* FIXME: check flags? */
1748 max = i;
1750 // XXX - This hack works for linux guests.
1751 // Need a better solution long term.
1752 if ( !(l2e_get_flags(new_pde) & _PAGE_PRESENT) &&
1753 unlikely(l2e_get_intpte(new_pde) != 0) &&
1754 !unshadow && MFN_PINNED(smfn) )
1755 unshadow = 1;
1757 if ( max == -1 )
1758 unshadow = 1;
1759 perfc_incrc(resync_l2);
1760 perfc_incr_histo(shm_l2_updates, changed, PT_UPDATES);
1761 break;
1763 case PGT_hl2_shadow:
1765 l2_pgentry_t *guest2 = guest;
1766 l2_pgentry_t *snapshot2 = snapshot;
1767 l1_pgentry_t *shadow2 = shadow;
1769 ASSERT(shadow_mode_write_all(d) || shadow_mode_wr_pt_pte(d));
1770 BUG_ON(!shadow_mode_refcounts(d)); // not yet implemented
1772 changed = 0;
1773 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
1775 if ( !is_guest_l2_slot(0, i) && !external )
1776 continue;
1778 l2_pgentry_t new_pde = guest2[i];
1779 if ( l2e_has_changed(new_pde, snapshot2[i], PAGE_FLAG_MASK) )
1781 need_flush |= validate_hl2e_change(d, new_pde, &shadow2[i]);
1783 // can't update snapshots of linear page tables -- they
1784 // are used multiple times...
1785 //
1786 // snapshot[i] = new_pde;
1788 changed++;
1791 perfc_incrc(resync_hl2);
1792 perfc_incr_histo(shm_hl2_updates, changed, PT_UPDATES);
1793 break;
1795 #elif CONFIG_PAGING_LEVELS >= 3
1796 case PGT_l2_shadow:
1797 case PGT_l3_shadow:
1799 pgentry_64_t *guest_pt = guest;
1800 pgentry_64_t *shadow_pt = shadow;
1801 pgentry_64_t *snapshot_pt = snapshot;
1803 changed = 0;
1804 for ( i = min_shadow; i <= max_shadow; i++ )
1806 if ( (i < min_snapshot) || (i > max_snapshot) ||
1807 entry_has_changed(
1808 guest_pt[i], snapshot_pt[i], PAGE_FLAG_MASK) )
1811 unsigned long gpfn;
1813 gpfn = entry_get_pfn(guest_pt[i]);
1814 /*
1815 * Looks like it's no longer a page table.
1816 */
1817 if ( unlikely(gpfn != (gpfn & PGT_mfn_mask)) )
1818 continue;
1820 need_flush |= validate_entry_change(
1821 d, &guest_pt[i], &shadow_pt[i],
1822 shadow_type_to_level(stype));
1823 changed++;
1825 #if CONFIG_PAGING_LEVELS == 3
1826 if ( stype == PGT_l3_shadow )
1828 if ( entry_get_value(guest_pt[i]) != 0 )
1829 max = i;
1831 if ( !(entry_get_flags(guest_pt[i]) & _PAGE_PRESENT) &&
1832 unlikely(entry_get_value(guest_pt[i]) != 0) &&
1833 !unshadow &&
1834 (mfn_to_page(smfn)->u.inuse.type_info & PGT_pinned) )
1835 unshadow = 1;
1837 #endif
1840 if ( d->arch.ops->guest_paging_levels == PAGING_L3
1841 && max == -1 && stype == PGT_l3_shadow )
1842 unshadow = 1;
1844 perfc_incrc(resync_l3);
1845 perfc_incr_histo(shm_l3_updates, changed, PT_UPDATES);
1846 break;
1848 case PGT_l4_shadow:
1850 guest_root_pgentry_t *guest_root = guest;
1851 guest_root_pgentry_t *snapshot_root = snapshot;
1853 changed = 0;
1854 for ( i = 0; i < GUEST_ROOT_PAGETABLE_ENTRIES; i++ )
1856 guest_root_pgentry_t new_root_e = guest_root[i];
1857 if ( !is_guest_l4_slot(i) && !external )
1858 continue;
1859 if ( root_entry_has_changed(
1860 new_root_e, snapshot_root[i], PAGE_FLAG_MASK))
1862 #ifndef GUEST_PGENTRY_32
1863 l4_pgentry_t *shadow4 = shadow;
1864 unsigned long gpfn;
1866 gpfn = l4e_get_pfn(new_root_e);
1867 /*
1868 * Looks like it's no longer a page table.
1869 */
1870 if ( unlikely(gpfn != (gpfn & PGT_mfn_mask)) )
1871 continue;
1873 if ( d->arch.ops->guest_paging_levels == PAGING_L4 )
1875 need_flush |= validate_entry_change(
1876 d, (pgentry_64_t *)&new_root_e,
1877 (pgentry_64_t *)&shadow4[i], shadow_type_to_level(stype));
1879 else
1880 #endif
1882 validate_bl2e_change(d, &new_root_e, shadow, i);
1884 changed++;
1885 ESH_LOG("%d: shadow4 mfn: %lx, shadow root: %lx\n", i,
1886 smfn, pagetable_get_paddr(current->arch.shadow_table));
1888 if ( guest_root_get_intpte(new_root_e) != 0 ) /* FIXME: check flags? */
1889 max = i;
1891 // Need a better solution in the long term.
1892 if ( !(guest_root_get_flags(new_root_e) & _PAGE_PRESENT) &&
1893 unlikely(guest_root_get_intpte(new_root_e) != 0) &&
1894 !unshadow &&
1895 (mfn_to_page(smfn)->u.inuse.type_info & PGT_pinned) )
1896 unshadow = 1;
1898 if ( max == -1 )
1899 unshadow = 1;
1900 perfc_incrc(resync_l4);
1901 perfc_incr_histo(shm_l4_updates, changed, PT_UPDATES);
1902 break;
1905 #endif /* CONFIG_PAGING_LEVELS >= 3 */
1906 default:
1907 BUG();
1910 if ( smfn )
1911 unmap_domain_page(shadow);
1912 unmap_domain_page(snapshot);
1913 unmap_domain_page(guest);
1915 if ( unlikely(unshadow && stype == PGT_root_page_table) )
1917 for_each_vcpu(d, v)
1918 if(smfn == pagetable_get_pfn(v->arch.shadow_table))
1919 return need_flush;
1920 perfc_incrc(unshadow_l2_count);
1921 shadow_unpin(smfn);
1922 #if CONFIG_PAGING_LEVELS == 2
1923 if ( unlikely(shadow_mode_external(d)) )
1925 unsigned long hl2mfn;
1927 if ( (hl2mfn = __shadow_status(d, entry->gpfn, PGT_hl2_shadow)) &&
1928 MFN_PINNED(hl2mfn) )
1929 shadow_unpin(hl2mfn);
1931 #endif
1935 return need_flush;
1938 #if CONFIG_PAGING_LEVELS == 2
1939 static int resync_all_levels_guest_page(struct domain *d)
1941 int need_flush = 0;
1943 need_flush |= resync_all(d, PGT_l1_shadow);
1944 if ( d->arch.ops->guest_paging_levels == PAGING_L2 &&
1945 shadow_mode_translate(d) )
1947 need_flush |= resync_all(d, PGT_hl2_shadow);
1949 return need_flush;
1951 #elif CONFIG_PAGING_LEVELS == 3
1952 static int resync_all_levels_guest_page(struct domain *d)
1954 int need_flush = 0;
1956 need_flush |= resync_all(d, PGT_l1_shadow);
1957 if ( d->arch.ops->guest_paging_levels == PAGING_L2 )
1958 need_flush |= resync_all(d, PGT_l4_shadow);
1959 else
1961 need_flush |= resync_all(d, PGT_l2_shadow);
1962 if ( shadow_mode_log_dirty(d) )
1964 need_flush |= resync_all(d, PGT_l3_shadow);
1965 need_flush |= resync_all(d, PGT_l4_shadow);
1967 else
1968 resync_pae_guest_l3(d);
1971 return need_flush;
1973 #elif CONFIG_PAGING_LEVELS == 4
1974 static int resync_all_levels_guest_page(struct domain *d)
1976 int need_flush = 0;
1978 need_flush |= resync_all(d, PGT_l1_shadow);
1979 if ( d->arch.ops->guest_paging_levels == PAGING_L2 )
1980 need_flush |= resync_all(d, PGT_l4_shadow);
1981 else
1983 need_flush |= resync_all(d, PGT_l2_shadow);
1984 if ( d->arch.ops->guest_paging_levels == PAGING_L3 )
1985 resync_pae_guest_l3(d);
1986 else
1988 need_flush |= resync_all(d, PGT_l3_shadow);
1989 need_flush |= resync_all(d, PGT_l4_shadow);
1992 return need_flush;
1994 #endif
1996 static void sync_all(struct domain *d)
1998 struct out_of_sync_entry *entry;
1999 int need_flush = 0;
2000 l1_pgentry_t *ppte, opte, npte;
2001 cpumask_t other_vcpus_mask;
2003 perfc_incrc(shadow_sync_all);
2005 ASSERT(shadow_lock_is_acquired(d));
2007 // First, remove all write permissions to the page tables
2008 //
2009 for ( entry = d->arch.out_of_sync; entry; entry = entry->next)
2011 // Skip entries that have low bits set... Those aren't
2012 // real PTEs.
2013 //
2014 if ( entry->writable_pl1e & (sizeof(l1_pgentry_t)-1) )
2015 continue;
2017 ppte = (l1_pgentry_t *)(
2018 (char *)map_domain_page(entry->writable_pl1e >> PAGE_SHIFT) +
2019 (entry->writable_pl1e & ~PAGE_MASK));
2020 opte = npte = *ppte;
2021 l1e_remove_flags(npte, _PAGE_RW);
2023 if ( (l1e_get_flags(npte) & _PAGE_PRESENT) &&
2024 !shadow_get_page_from_l1e(npte, d) )
2025 BUG();
2026 *ppte = npte;
2027 set_guest_back_ptr(d, npte, (entry->writable_pl1e) >> PAGE_SHIFT,
2028 (entry->writable_pl1e & ~PAGE_MASK)/sizeof(l1_pgentry_t));
2029 shadow_put_page_from_l1e(opte, d);
2031 unmap_domain_page(ppte);
2034 /* Other VCPUs mustn't use the revoked writable mappings. */
2035 other_vcpus_mask = d->domain_dirty_cpumask;
2036 cpu_clear(smp_processor_id(), other_vcpus_mask);
2037 flush_tlb_mask(other_vcpus_mask);
2039 /* Flush our own TLB later. */
2040 need_flush = 1;
2042 need_flush |= resync_all_levels_guest_page(d);
2044 if ( need_flush && !unlikely(shadow_mode_external(d)) )
2045 local_flush_tlb();
2047 free_out_of_sync_state(d);
2050 static inline int l1pte_write_fault(
2051 struct vcpu *v, guest_l1_pgentry_t *gpte_p, l1_pgentry_t *spte_p,
2052 unsigned long va)
2054 struct domain *d = v->domain;
2055 guest_l1_pgentry_t gpte = *gpte_p;
2056 l1_pgentry_t spte;
2057 unsigned long gpfn = l1e_get_pfn(gpte);
2058 unsigned long gmfn = gmfn_to_mfn(d, gpfn);
2060 //printk("l1pte_write_fault gmfn=%lx\n", gmfn);
2062 if ( unlikely(!VALID_MFN(gmfn)) )
2064 SH_VLOG("l1pte_write_fault: invalid gpfn=%lx", gpfn);
2065 *spte_p = l1e_empty();
2066 return 0;
2069 ASSERT(guest_l1e_get_flags(gpte) & _PAGE_RW);
2070 guest_l1e_add_flags(gpte, _PAGE_DIRTY | _PAGE_ACCESSED);
2071 spte = l1e_from_pfn(gmfn, guest_l1e_get_flags(gpte) & ~_PAGE_GLOBAL);
2073 SH_VVLOG("l1pte_write_fault: updating spte=0x%" PRIpte " gpte=0x%" PRIpte,
2074 l1e_get_intpte(spte), l1e_get_intpte(gpte));
2076 __mark_dirty(d, gmfn);
2078 if ( mfn_is_page_table(gmfn) )
2079 shadow_mark_va_out_of_sync(v, gpfn, gmfn, va);
2081 *gpte_p = gpte;
2082 *spte_p = spte;
2084 return 1;
2087 static inline int l1pte_read_fault(
2088 struct domain *d, guest_l1_pgentry_t *gpte_p, l1_pgentry_t *spte_p)
2090 guest_l1_pgentry_t gpte = *gpte_p;
2091 l1_pgentry_t spte = *spte_p;
2092 unsigned long pfn = l1e_get_pfn(gpte);
2093 unsigned long mfn = gmfn_to_mfn(d, pfn);
2095 if ( unlikely(!VALID_MFN(mfn)) )
2097 SH_VLOG("l1pte_read_fault: invalid gpfn=%lx", pfn);
2098 *spte_p = l1e_empty();
2099 return 0;
2102 guest_l1e_add_flags(gpte, _PAGE_ACCESSED);
2103 spte = l1e_from_pfn(mfn, guest_l1e_get_flags(gpte) & ~_PAGE_GLOBAL);
2105 if ( shadow_mode_log_dirty(d) || !(guest_l1e_get_flags(gpte) & _PAGE_DIRTY) ||
2106 mfn_is_page_table(mfn) )
2108 l1e_remove_flags(spte, _PAGE_RW);
2111 SH_VVLOG("l1pte_read_fault: updating spte=0x%" PRIpte " gpte=0x%" PRIpte,
2112 l1e_get_intpte(spte), l1e_get_intpte(gpte));
2113 *gpte_p = gpte;
2114 *spte_p = spte;
2116 return 1;
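/*
 * Propagation rules applied by the two helpers above, for quick reference
 * (summarised from the code itself):
 *   l1pte_write_fault(): sets _PAGE_DIRTY|_PAGE_ACCESSED in the guest PTE,
 *       builds the shadow PTE with _PAGE_GLOBAL stripped, calls
 *       __mark_dirty() on the frame, and marks page-table frames out of
 *       sync for the faulting va.
 *   l1pte_read_fault():  sets only _PAGE_ACCESSED, and removes _PAGE_RW
 *       from the shadow PTE when log-dirty mode is on, the guest PTE is
 *       not yet dirty, or the frame is a page-table page, so that a later
 *       write will fault and be handled here.
 */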
2118 #if CONFIG_PAGING_LEVELS == 2
2119 static int shadow_fault_32(unsigned long va, struct cpu_user_regs *regs)
2121 l1_pgentry_t gpte, spte, orig_gpte;
2122 struct vcpu *v = current;
2123 struct domain *d = v->domain;
2124 l2_pgentry_t gpde;
2126 spte = l1e_empty();
2128 SH_VVLOG("shadow_fault( va=%lx, code=%lu )",
2129 va, (unsigned long)regs->error_code);
2130 perfc_incrc(shadow_fault_calls);
2132 check_pagetable(v, "pre-sf");
2134 /*
2135 * Don't let someone else take the guest's table pages out-of-sync.
2136 */
2137 shadow_lock(d);
2139 /* XXX - FIX THIS COMMENT!!!
2140 * STEP 1. Check to see if this fault might have been caused by an
2141 * out-of-sync table page entry, or if we should pass this
2142 * fault onto the guest.
2143 */
2144 __shadow_sync_va(v, va);
2146 /*
2147 * STEP 2. Check the guest PTE.
2148 */
2149 __guest_get_l2e(v, va, &gpde);
2150 if ( unlikely(!(l2e_get_flags(gpde) & _PAGE_PRESENT)) )
2152 SH_VVLOG("shadow_fault - EXIT: L1 not present");
2153 perfc_incrc(shadow_fault_bail_pde_not_present);
2154 goto fail;
2157 // This can't fault because we hold the shadow lock and we've ensured that
2158 // the mapping is in-sync, so the check of the PDE's present bit, above,
2159 // covers this access.
2160 //
2161 //orig_gpte = gpte = linear_pg_table[l1_linear_offset(va)];
2162 __guest_get_l1e(v, va, &gpte);
2163 orig_gpte = gpte;
2165 if ( unlikely(!(l1e_get_flags(gpte) & _PAGE_PRESENT)) )
2167 SH_VVLOG("shadow_fault - EXIT: gpte not present (%" PRIpte ")",
2168 l1e_get_intpte(gpte));
2169 perfc_incrc(shadow_fault_bail_pte_not_present);
2170 goto fail;
2173 /* Write fault? */
2174 if ( regs->error_code & 2 )
2176 int allow_writes = 0;
2178 if ( unlikely(!(l1e_get_flags(gpte) & _PAGE_RW)) )
2180 if ( shadow_mode_page_writable(va, regs, l1e_get_pfn(gpte)) )
2182 allow_writes = 1;
2183 l1e_add_flags(gpte, _PAGE_RW);
2185 else
2187 /* Write fault on a read-only mapping. */
2188 SH_VVLOG("shadow_fault - EXIT: wr fault on RO page (%" PRIpte ")",
2189 l1e_get_intpte(gpte));
2190 perfc_incrc(shadow_fault_bail_ro_mapping);
2191 goto fail;
2194 else if ( unlikely(!shadow_mode_wr_pt_pte(d) && mfn_is_page_table(l1e_get_pfn(gpte))) )
2196 SH_LOG("l1pte_write_fault: no write access to page table page");
2197 domain_crash_synchronous();
2200 if ( unlikely(!l1pte_write_fault(v, &gpte, &spte, va)) )
2202 SH_VVLOG("shadow_fault - EXIT: l1pte_write_fault failed");
2203 perfc_incrc(write_fault_bail);
2204 shadow_unlock(d);
2205 return 0;
2208 if ( allow_writes )
2209 l1e_remove_flags(gpte, _PAGE_RW);
2211 else
2213 if ( !l1pte_read_fault(d, &gpte, &spte) )
2215 SH_VVLOG("shadow_fault - EXIT: l1pte_read_fault failed");
2216 perfc_incrc(read_fault_bail);
2217 shadow_unlock(d);
2218 return 0;
2222 /*
2223 * STEP 3. Write the modified shadow PTE and guest PTE back to the tables.
2224 */
2225 if ( l1e_has_changed(orig_gpte, gpte, PAGE_FLAG_MASK) )
2227 /* XXX Watch out for read-only L2 entries! (not used in Linux). */
2228 /*if ( unlikely(__copy_to_user(&linear_pg_table[l1_linear_offset(va)],
2229 &gpte, sizeof(gpte))) )*/
2230 if ( unlikely(!__guest_set_l1e(v, va, &gpte)))
2232 printk("%s() failed, crashing domain %d "
2233 "due to a read-only L2 page table (gpde=%" PRIpte "), va=%lx\n",
2234 __func__,d->domain_id, l2e_get_intpte(gpde), va);
2235 domain_crash_synchronous();
2238 __mark_dirty(d, gmfn_to_mfn(d, l2e_get_pfn(gpde)));
2241 shadow_set_l1e(va, spte, 1);
2243 perfc_incrc(shadow_fault_fixed);
2244 d->arch.shadow_fault_count++;
2246 shadow_unlock(d);
2248 check_pagetable(v, "post-sf");
2249 return EXCRET_fault_fixed;
2251 fail:
2252 shadow_unlock(d);
2253 return 0;
2255 #endif /* CONFIG_PAGING_LEVELS == 2 */
2257 static inline unsigned long va_to_l1mfn(struct vcpu *v, unsigned long va)
2259 struct domain *d = v->domain;
2260 guest_l2_pgentry_t gl2e = {0};
2262 __guest_get_l2e(v, va, &gl2e);
2264 if ( unlikely(!(guest_l2e_get_flags(gl2e) & _PAGE_PRESENT)) )
2265 return INVALID_MFN;
2267 return gmfn_to_mfn(d, l2e_get_pfn(gl2e));
2270 static int do_update_va_mapping(unsigned long va,
2271 l1_pgentry_t val,
2272 struct vcpu *v)
2274 struct domain *d = v->domain;
2275 l1_pgentry_t spte;
2276 int rc = 0;
2278 shadow_lock(d);
2280 // This is actually overkill - we don't need to sync the L1 itself,
2281 // just everything involved in getting to this L1 (i.e. we need
2282 // linear_pg_table[l1_linear_offset(va)] to be in sync)...
2283 //
2284 __shadow_sync_va(v, va);
2286 l1pte_propagate_from_guest(d, *(guest_l1_pgentry_t *)&val, &spte);
2287 #if CONFIG_PAGING_LEVELS == 2
2288 shadow_set_l1e(va, spte, 0);
2289 #elif CONFIG_PAGING_LEVELS >= 3
2290 shadow_set_l1e_64(va, (pgentry_64_t *) &spte, 0);
2291 #endif
2292 /*
2293 * If we're in log-dirty mode then we need to note that we've updated
2294 * the PTE in the PT-holding page. We need the machine frame number
2295 * for this.
2296 */
2297 __mark_dirty(d, va_to_l1mfn(v, va));
2299 shadow_unlock(d);
2301 return rc;
2305 /*
2306 * What lives where in the 32-bit address space in the various shadow modes,
2307 * and what it uses to get/maintain that mapping.
2309 * SHADOW MODE:      none          enable          translate        external
2311 * 4KB things:
2312 * guest_vtable      lin_l2        mapped per gl2  lin_l2 via hl2   mapped per gl2
2313 * shadow_vtable     n/a           sh_lin_l2       sh_lin_l2        mapped per gl2
2314 * hl2_vtable        n/a           n/a             lin_hl2 via hl2  mapped per gl2
2315 * monitor_vtable    n/a           n/a             n/a              mapped once
2317 * 4MB things:
2318 * guest_linear      lin via gl2   lin via gl2     lin via hl2      lin via hl2
2319 * shadow_linear     n/a           sh_lin via sl2  sh_lin via sl2   sh_lin via sl2
2320 * monitor_linear    n/a           n/a             n/a              ???
2321 * perdomain         perdomain     perdomain       perdomain        perdomain
2322 * R/O M2P           R/O M2P       R/O M2P         n/a              n/a
2323 * R/W M2P           R/W M2P       R/W M2P         R/W M2P          R/W M2P
2324 * P2M               n/a           n/a             R/O M2P          R/O M2P
2326 * NB:
2327 * update_pagetables(), shadow_update_pagetables(), shadow_mode_enable(),
2328 * shadow_l2_table(), shadow_hl2_table(), and alloc_monitor_pagetable()
2329 * all play a part in maintaining these mappings.
2330 */
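/*
 * Rough flow of shadow_update_pagetables() below, as implemented:
 *   - remap arch.guest_vtable to the (possibly new) guest root gmfn;
 *   - look up the shadow root for gpfn, building it with
 *     shadow_l2/l3/l4_table() if it does not exist yet, and swap it into
 *     arch.shadow_table (dropping the reference to the old one);
 *   - remap arch.shadow_vtable and, in external mode, refresh the hl2
 *     table and the linear-map slots in the monitor table;
 *   - finally, run the deferred shadow_sync_all() when an existing shadow
 *     root was reused (need_sync).
 */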
2331 static void shadow_update_pagetables(struct vcpu *v)
2333 struct domain *d = v->domain;
2334 #if CONFIG_PAGING_LEVELS == 4
2335 unsigned long gmfn = ((v->arch.flags & TF_kernel_mode)?
2336 pagetable_get_pfn(v->arch.guest_table) :
2337 pagetable_get_pfn(v->arch.guest_table_user));
2338 #else
2339 unsigned long gmfn = pagetable_get_pfn(v->arch.guest_table);
2340 #endif
2342 unsigned long gpfn = mfn_to_gmfn(d, gmfn);
2343 unsigned long smfn, old_smfn;
2345 #if CONFIG_PAGING_LEVELS == 2
2346 unsigned long hl2mfn;
2347 #endif
2348 int need_sync = 0;
2350 int max_mode = ( shadow_mode_external(d) ? SHM_external
2351 : shadow_mode_translate(d) ? SHM_translate
2352 : shadow_mode_enabled(d) ? SHM_enable
2353 : 0 );
2355 ASSERT( ! IS_INVALID_M2P_ENTRY(gpfn) );
2356 ASSERT( max_mode );
2358 /*
2359 * arch.guest_vtable
2360 */
2361 if ( max_mode & (SHM_enable | SHM_external) )
2363 if ( likely(v->arch.guest_vtable != NULL) )
2364 unmap_domain_page_global(v->arch.guest_vtable);
2365 v->arch.guest_vtable = map_domain_page_global(gmfn);
2368 #if CONFIG_PAGING_LEVELS >= 3
2369 /*
2370 * Handle 32-bit PAE enabled guest
2371 */
2372 if ( SH_GUEST_32PAE && d->arch.ops->guest_paging_levels == PAGING_L3 )
2374 u32 index = get_cr3_idxval(v);
2375 gpfn = (index << PGT_score_shift) | gpfn;
2377 #endif
2379 /*
2380 * arch.shadow_table
2381 */
2382 if ( unlikely(!(smfn = __shadow_status(d, gpfn, PGT_base_page_table))) )
2384 #if CONFIG_PAGING_LEVELS == 2
2385 smfn = shadow_l2_table(v, gpfn, gmfn);
2386 #elif CONFIG_PAGING_LEVELS == 3
2387 smfn = shadow_l3_table(v, gpfn, gmfn);
2388 #elif CONFIG_PAGING_LEVELS == 4
2389 smfn = shadow_l4_table(v, gpfn, gmfn);
2390 #endif
2392 else
2394 #if CONFIG_PAGING_LEVELS >= 3
2395 if ( SH_GUEST_32PAE && d->arch.ops->guest_paging_levels == PAGING_L3 )
2396 update_top_level_shadow(v, smfn);
2397 #endif
2398 /*
2399 * Defer the sync until later so that this smfn is not
2400 * occasionally unshadowed in the meantime.
2401 */
2402 need_sync = 1;
2406 if ( !get_shadow_ref(smfn) )
2407 BUG();
2408 old_smfn = pagetable_get_pfn(v->arch.shadow_table);
2409 v->arch.shadow_table = mk_pagetable((u64)smfn << PAGE_SHIFT);
2410 if ( old_smfn )
2411 put_shadow_ref(old_smfn);
2413 SH_VVLOG("shadow_update_pagetables(gmfn=%lx, smfn=%lx)", gmfn, smfn);
2415 /*
2416 * arch.shadow_vtable
2417 */
2418 if ( max_mode == SHM_external
2419 #if CONFIG_PAGING_LEVELS >=3
2420 || max_mode & SHM_enable
2421 #endif
2424 if ( v->arch.shadow_vtable )
2425 unmap_domain_page_global(v->arch.shadow_vtable);
2426 v->arch.shadow_vtable = map_domain_page_global(smfn);
2429 #if CONFIG_PAGING_LEVELS == 2
2430 /*
2431 * arch.hl2_vtable
2432 */
2434 // if max_mode == SHM_translate, then the hl2 is already installed
2435 // correctly in its smfn, and there's nothing to do.
2436 //
2437 if ( max_mode == SHM_external )
2439 if ( unlikely(!(hl2mfn = __shadow_status(d, gpfn, PGT_hl2_shadow))) )
2440 hl2mfn = shadow_hl2_table(d, gpfn, gmfn, smfn);
2441 if ( v->arch.hl2_vtable )
2442 unmap_domain_page_global(v->arch.hl2_vtable);
2443 v->arch.hl2_vtable = map_domain_page_global(hl2mfn);
2446 /*
2447 * fixup pointers in monitor table, as necessary
2448 */
2449 if ( max_mode == SHM_external )
2451 l2_pgentry_t *mpl2e = v->arch.monitor_vtable;
2452 l2_pgentry_t old_hl2e = mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)];
2453 l2_pgentry_t old_sl2e = mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)];
2455 ASSERT( shadow_mode_translate(d) );
2457 if ( !get_shadow_ref(hl2mfn) )
2458 BUG();
2459 mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)] =
2460 l2e_from_pfn(hl2mfn, __PAGE_HYPERVISOR);
2461 if ( l2e_get_flags(old_hl2e) & _PAGE_PRESENT )
2462 put_shadow_ref(l2e_get_pfn(old_hl2e));
2464 if ( !get_shadow_ref(smfn) )
2465 BUG();
2466 mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
2467 l2e_from_pfn(smfn, __PAGE_HYPERVISOR);
2468 if ( l2e_get_flags(old_sl2e) & _PAGE_PRESENT )
2469 put_shadow_ref(l2e_get_pfn(old_sl2e));
2471 // XXX - maybe this can be optimized somewhat??
2472 local_flush_tlb();
2474 #endif /* CONFIG_PAGING_LEVELS == 2 */
2476 #if CONFIG_PAGING_LEVELS == 3
2477 /*
2478 * fixup pointers in monitor table, as necessary
2479 */
2480 if ( max_mode == SHM_external )
2482 l3_pgentry_t *mpl3e = (l3_pgentry_t *) v->arch.monitor_vtable;
2483 l2_pgentry_t *spl2e;
2484 unsigned long s2mfn;
2485 int i;
2487 ASSERT( shadow_mode_translate(d) );
2488 s2mfn = l3e_get_pfn(mpl3e[L3_PAGETABLE_ENTRIES - 1]);
2490 ASSERT( s2mfn);
2491 spl2e = map_domain_page(s2mfn);
2493 for ( i = 0; i < (LINEARPT_MBYTES >> (L2_PAGETABLE_SHIFT - 20)); i++ )
2494 spl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START) + i] =
2495 (l3e_get_flags(mpl3e[i]) & _PAGE_PRESENT) ?
2496 l2e_from_pfn(l3e_get_pfn(mpl3e[i]), __PAGE_HYPERVISOR) :
2497 l2e_empty();
2499 unmap_domain_page(spl2e);
2500 local_flush_tlb();
2502 #endif
2504 if(likely(need_sync))
2505 shadow_sync_all(d);
2509 /************************************************************************/
2510 /************************************************************************/
2511 /************************************************************************/
2513 #if 0 // this code has not been updated for 32pae & 64 bit modes
2514 #if SHADOW_DEBUG
2516 // The following is entirely for _check_pagetable()'s benefit.
2517 // _check_pagetable() wants to know whether a given entry in a
2518 // shadow page table is supposed to be the shadow of the guest's
2519 // current entry, or the shadow of the entry held in the snapshot
2520 // taken above.
2521 //
2522 // Here, we mark all currently existing entries as reflecting
2523 // the snapshot, above. All other places in xen that update
2524 // the shadow will keep the shadow in sync with the guest's
2525 // entries (via l1pte_propagate_from_guest and friends), which clear
2526 // the SHADOW_REFLECTS_SNAPSHOT bit.
2527 //
2528 static void
2529 mark_shadows_as_reflecting_snapshot(struct domain *d, unsigned long gpfn)
2531 unsigned long smfn;
2532 l1_pgentry_t *l1e;
2533 l2_pgentry_t *l2e;
2534 unsigned i;
2536 if ( (smfn = __shadow_status(d, gpfn, PGT_l1_shadow)) )
2538 l1e = map_domain_page(smfn);
2539 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
2540 if ( is_guest_l1_slot(i) &&
2541 (l1e_get_flags(l1e[i]) & _PAGE_PRESENT) )
2542 l1e_add_flags(l1e[i], SHADOW_REFLECTS_SNAPSHOT);
2543 unmap_domain_page(l1e);
2546 if ( (smfn = __shadow_status(d, gpfn, PGT_l2_shadow)) )
2548 l2e = map_domain_page(smfn);
2549 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
2550 if ( is_guest_l2_slot(0, i) &&
2551 (l2e_get_flags(l2e[i]) & _PAGE_PRESENT) )
2552 l2e_add_flags(l2e[i], SHADOW_REFLECTS_SNAPSHOT);
2553 unmap_domain_page(l2e);
2557 // BUG: these are not SMP safe...
2558 static int sh_l2_present;
2559 static int sh_l1_present;
2560 static char *sh_check_name;
2561 // int shadow_status_noswap; // declared in shadow32.c
2563 #define v2m(_v, _adr) ({ \
2564 unsigned long _a = (unsigned long)(_adr); \
2565 l2_pgentry_t _pde = shadow_linear_l2_table(_v)[l2_table_offset(_a)]; \
2566 unsigned long _pa = -1; \
2567 if ( l2e_get_flags(_pde) & _PAGE_PRESENT ) \
2568 { \
2569 l1_pgentry_t _pte; \
2570 _pte = shadow_linear_pg_table[l1_linear_offset(_a)]; \
2571 if ( l1e_get_flags(_pte) & _PAGE_PRESENT ) \
2572 _pa = l1e_get_paddr(_pte); \
2573 } \
2574 _pa | (_a & ~PAGE_MASK); \
2575 })
2577 #define FAIL(_f, _a...) \
2578 do { \
2579 printk("XXX %s-FAIL (%d,%d,%d) " _f " at %s(%d)\n", \
2580 sh_check_name, level, l2_idx, l1_idx, ## _a, \
2581 __FILE__, __LINE__); \
2582 printk("guest_pte=%" PRIpte " eff_guest_pte=%" PRIpte \
2583 " shadow_pte=%" PRIpte " snapshot_pte=%" PRIpte \
2584 " &guest=%p &shadow=%p &snap=%p v2m(&guest)=%p" \
2585 " v2m(&shadow)=%p v2m(&snap)=%p ea=%08x\n", \
2586 l1e_get_intpte(guest_pte), l1e_get_intpte(eff_guest_pte), \
2587 l1e_get_intpte(shadow_pte), l1e_get_intpte(snapshot_pte), \
2588 p_guest_pte, p_shadow_pte, p_snapshot_pte, \
2589 (void *)v2m(v, p_guest_pte), (void *)v2m(v, p_shadow_pte), \
2590 (void *)v2m(v, p_snapshot_pte), \
2591 (l2_idx << L2_PAGETABLE_SHIFT) | \
2592 (l1_idx << L1_PAGETABLE_SHIFT)); \
2593 errors++; \
2594 } while ( 0 )
2596 static int check_pte(
2597 struct vcpu *v,
2598 l1_pgentry_t *p_guest_pte,
2599 l1_pgentry_t *p_shadow_pte,
2600 l1_pgentry_t *p_snapshot_pte,
2601 int level, int l2_idx, int l1_idx)
2603 struct domain *d = v->domain;
2604 l1_pgentry_t guest_pte = *p_guest_pte;
2605 l1_pgentry_t shadow_pte = *p_shadow_pte;
2606 l1_pgentry_t snapshot_pte = p_snapshot_pte ? *p_snapshot_pte : l1e_empty();
2607 l1_pgentry_t eff_guest_pte;
2608 unsigned long mask, eff_guest_pfn, eff_guest_mfn, shadow_mfn;
2609 int errors = 0, guest_writable;
2610 int page_table_page;
2612 if ( (l1e_get_intpte(shadow_pte) == 0) ||
2613 (l1e_get_intpte(shadow_pte) == 0xdeadface) ||
2614 (l1e_get_intpte(shadow_pte) == 0x00000E00) )
2615 return errors; /* always safe */
2617 if ( !(l1e_get_flags(shadow_pte) & _PAGE_PRESENT) )
2618 FAIL("Non zero not present shadow_pte");
2620 if ( level == 2 ) sh_l2_present++;
2621 if ( level == 1 ) sh_l1_present++;
2623 if ( (l1e_get_flags(shadow_pte) & SHADOW_REFLECTS_SNAPSHOT) && p_snapshot_pte )
2624 eff_guest_pte = snapshot_pte;
2625 else
2626 eff_guest_pte = guest_pte;
2628 if ( !(l1e_get_flags(eff_guest_pte) & _PAGE_PRESENT) )
2629 FAIL("Guest not present yet shadow is");
2631 mask = ~(_PAGE_GLOBAL|_PAGE_DIRTY|_PAGE_ACCESSED|_PAGE_RW|_PAGE_AVAIL|PAGE_MASK);
2633 if ( ((l1e_get_intpte(shadow_pte) & mask) != (l1e_get_intpte(eff_guest_pte) & mask)) )
2634 FAIL("Corrupt?");
2636 if ( (level == 1) &&
2637 (l1e_get_flags(shadow_pte) & _PAGE_DIRTY) &&
2638 !(l1e_get_flags(eff_guest_pte) & _PAGE_DIRTY) )
2639 FAIL("Dirty coherence");
2641 if ( (l1e_get_flags(shadow_pte) & _PAGE_ACCESSED) &&
2642 !(l1e_get_flags(eff_guest_pte) & _PAGE_ACCESSED) )
2643 FAIL("Accessed coherence");
2645 if ( l1e_get_flags(shadow_pte) & _PAGE_GLOBAL )
2646 FAIL("global bit set in shadow");
2648 eff_guest_pfn = l1e_get_pfn(eff_guest_pte);
2649 eff_guest_mfn = gmfn_to_mfn(d, eff_guest_pfn);
2650 shadow_mfn = l1e_get_pfn(shadow_pte);
2652 if ( !VALID_MFN(eff_guest_mfn) && !shadow_mode_refcounts(d) )
2653 FAIL("%s: invalid eff_guest_pfn=%lx eff_guest_pte=%" PRIpte "\n",
2654 __func__, eff_guest_pfn, l1e_get_intpte(eff_guest_pte));
2656 page_table_page = mfn_is_page_table(eff_guest_mfn);
2658 guest_writable =
2659 (l1e_get_flags(eff_guest_pte) & _PAGE_RW) ||
2660 (shadow_mode_write_l1(d) && (level == 1) && mfn_out_of_sync(eff_guest_mfn));
2662 if ( (l1e_get_flags(shadow_pte) & _PAGE_RW ) && !guest_writable )
2664 printk("eff_guest_pfn=%lx eff_guest_mfn=%lx shadow_mfn=%lx t=0x%08lx page_table_page=%d\n",
2665 eff_guest_pfn, eff_guest_mfn, shadow_mfn,
2666 mfn_to_page(eff_guest_mfn)->u.inuse.type_info,
2667 page_table_page);
2668 FAIL("RW coherence");
2671 if ( (level == 1) &&
2672 (l1e_get_flags(shadow_pte) & _PAGE_RW ) &&
2673 !(guest_writable && (l1e_get_flags(eff_guest_pte) & _PAGE_DIRTY)) )
2675 printk("eff_guest_pfn=%lx eff_guest_mfn=%lx shadow_mfn=%lx t=0x%08lx page_table_page=%d\n",
2676 eff_guest_pfn, eff_guest_mfn, shadow_mfn,
2677 mfn_to_page(eff_guest_mfn)->u.inuse.type_info,
2678 page_table_page);
2679 FAIL("RW2 coherence");
2682 if ( eff_guest_mfn == shadow_mfn )
2684 if ( level > 1 )
2685 FAIL("Linear map ???"); /* XXX this will fail on BSD */
2687 else
2689 if ( level < 2 )
2690 FAIL("Shadow in L1 entry?");
2692 if ( level == 2 )
2694 if ( __shadow_status(d, eff_guest_pfn, PGT_l1_shadow) != shadow_mfn )
2695 FAIL("shadow_mfn problem eff_guest_pfn=%lx shadow_mfn=%lx", eff_guest_pfn,
2696 __shadow_status(d, eff_guest_pfn, PGT_l1_shadow));
2698 else
2699 BUG(); // XXX -- not handled yet.
2702 return errors;
2704 #undef FAIL
2705 #undef v2m
2707 static int check_l1_table(
2708 struct vcpu *v, unsigned long gpfn,
2709 unsigned long gmfn, unsigned long smfn, unsigned l2_idx)
2711 struct domain *d = v->domain;
2712 int i;
2713 unsigned long snapshot_mfn;
2714 l1_pgentry_t *p_guest, *p_shadow, *p_snapshot = NULL;
2715 int errors = 0;
2717 if ( page_out_of_sync(mfn_to_page(gmfn)) )
2719 snapshot_mfn = __shadow_status(d, gpfn, PGT_snapshot);
2720 ASSERT(snapshot_mfn);
2721 p_snapshot = map_domain_page(snapshot_mfn);
2724 p_guest = map_domain_page(gmfn);
2725 p_shadow = map_domain_page(smfn);
2727 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
2728 errors += check_pte(v, p_guest+i, p_shadow+i,
2729 p_snapshot ? p_snapshot+i : NULL,
2730 1, l2_idx, i);
2732 unmap_domain_page(p_shadow);
2733 unmap_domain_page(p_guest);
2734 if ( p_snapshot )
2735 unmap_domain_page(p_snapshot);
2737 return errors;
2740 #define FAILPT(_f, _a...) \
2741 do { \
2742 printk("XXX FAIL %s-PT " _f "\n", sh_check_name, ## _a ); \
2743 errors++; \
2744 } while ( 0 )
2746 static int check_l2_table(
2747 struct vcpu *v, unsigned long gmfn, unsigned long smfn, int oos_pdes)
2749 struct domain *d = v->domain;
2750 l2_pgentry_t *gpl2e = (l2_pgentry_t *)map_domain_page(gmfn);
2751 l2_pgentry_t *spl2e = (l2_pgentry_t *)map_domain_page(smfn);
2752 l2_pgentry_t match;
2753 int i;
2754 int errors = 0;
2755 int limit;
2757 if ( !oos_pdes && (page_get_owner(mfn_to_page(gmfn)) != d) )
2758 FAILPT("domain doesn't own page");
2759 if ( oos_pdes && (page_get_owner(mfn_to_page(gmfn)) != NULL) )
2760 FAILPT("bogus owner for snapshot page");
2761 if ( page_get_owner(mfn_to_page(smfn)) != NULL )
2762 FAILPT("shadow page mfn=0x%lx is owned by someone, domid=%d",
2763 smfn, page_get_owner(mfn_to_page(smfn))->domain_id);
2765 #if 0
2766 if ( memcmp(&spl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
2767 &gpl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
2768 ((SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT) -
2769 DOMAIN_ENTRIES_PER_L2_PAGETABLE) * sizeof(l2_pgentry_t)) )
2771 for ( i = DOMAIN_ENTRIES_PER_L2_PAGETABLE;
2772 i < (SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT);
2773 i++ )
2774 printk("+++ (%d) %lx %lx\n",i,
2775 l2_pgentry_val(gpl2e[i]), l2_pgentry_val(spl2e[i]));
2776 FAILPT("hypervisor entries inconsistent");
2779 if ( (l2_pgentry_val(spl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT]) !=
2780 l2_pgentry_val(gpl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT])) )
2781 FAILPT("hypervisor linear map inconsistent");
2782 #endif
2784 match = l2e_from_pfn(smfn, __PAGE_HYPERVISOR);
2785 if ( !shadow_mode_external(d) &&
2786 l2e_has_changed(spl2e[SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT],
2787 match, PAGE_FLAG_MASK))
2789 FAILPT("hypervisor shadow linear map inconsistent %" PRIpte " %" PRIpte,
2790 l2e_get_intpte(spl2e[SH_LINEAR_PT_VIRT_START >>
2791 L2_PAGETABLE_SHIFT]),
2792 l2e_get_intpte(match));
2795 match = l2e_from_paddr(__pa(d->arch.mm_perdomain_pt), __PAGE_HYPERVISOR);
2796 if ( !shadow_mode_external(d) &&
2797 l2e_has_changed(spl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT],
2798 match, PAGE_FLAG_MASK))
2800 FAILPT("hypervisor per-domain map inconsistent saw %" PRIpte ", expected (va=%p) %" PRIpte,
2801 l2e_get_intpte(spl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT]),
2802 d->arch.mm_perdomain_pt,
2803 l2e_get_intpte(match));
2806 #if CONFIG_PAGING_LEVELS == 2
2807 if ( shadow_mode_external(d) )
2808 limit = L2_PAGETABLE_ENTRIES;
2809 else
2810 limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE;
2811 #else
2812 limit = 0; /* XXX x86/64 XXX */
2813 #endif
2815 /* Check the whole L2. */
2816 for ( i = 0; i < limit; i++ )
2817 errors += check_pte(v,
2818 (l1_pgentry_t*)(&gpl2e[i]), /* Hmm, dirty ... */
2819 (l1_pgentry_t*)(&spl2e[i]),
2820 NULL,
2821 2, i, 0);
2823 unmap_domain_page(spl2e);
2824 unmap_domain_page(gpl2e);
2826 #if 1
2827 if ( errors )
2828 printk("check_l2_table returning %d errors\n", errors);
2829 #endif
2831 return errors;
2833 #undef FAILPT
2835 int _check_pagetable(struct vcpu *v, char *s)
2837 struct domain *d = v->domain;
2838 #if CONFIG_PAGING_LEVELS == 4
2839 pagetable_t pt = ((v->arch.flags & TF_kernel_mode)?
2840 v->arch.guest_table : v->arch.guest_table_user);
2841 #else
2842 pagetable_t pt = v->arch.guest_table;
2843 #endif
2844 unsigned long gptbase = pagetable_get_paddr(pt);
2845 unsigned long ptbase_pfn, smfn;
2846 unsigned long i;
2847 l2_pgentry_t *gpl2e, *spl2e;
2848 unsigned long ptbase_mfn = 0;
2849 int errors = 0, limit, oos_pdes = 0;
2851 //_audit_domain(d, AUDIT_QUIET);
2852 shadow_lock(d);
2854 sh_check_name = s;
2855 //SH_VVLOG("%s-PT Audit", s);
2856 sh_l2_present = sh_l1_present = 0;
2857 perfc_incrc(check_pagetable);
2859 ptbase_mfn = gptbase >> PAGE_SHIFT;
2860 ptbase_pfn = mfn_to_gmfn(d, ptbase_mfn);
2862 if ( !(smfn = __shadow_status(d, ptbase_pfn, PGT_base_page_table)) )
2864 printk("%s-PT %lx not shadowed\n", s, gptbase);
2865 goto out;
2867 if ( page_out_of_sync(mfn_to_page(ptbase_mfn)) )
2869 ptbase_mfn = __shadow_status(d, ptbase_pfn, PGT_snapshot);
2870 oos_pdes = 1;
2871 ASSERT(ptbase_mfn);
2874 errors += check_l2_table(v, ptbase_mfn, smfn, oos_pdes);
2876 gpl2e = (l2_pgentry_t *) map_domain_page(ptbase_mfn);
2877 spl2e = (l2_pgentry_t *) map_domain_page(smfn);
2879 /* Go back and recurse. */
2880 #if CONFIG_PAGING_LEVELS == 2
2881 if ( shadow_mode_external(d) )
2882 limit = L2_PAGETABLE_ENTRIES;
2883 else
2884 limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE;
2885 #else
2886 limit = 0; /* XXX x86/64 XXX */
2887 #endif
2889 for ( i = 0; i < limit; i++ )
2891 unsigned long gl1pfn = l2e_get_pfn(gpl2e[i]);
2892 unsigned long gl1mfn = gmfn_to_mfn(d, gl1pfn);
2893 unsigned long sl1mfn = l2e_get_pfn(spl2e[i]);
2895 if ( l2e_get_intpte(spl2e[i]) != 0 ) /* FIXME: check flags? */
2897 errors += check_l1_table(v, gl1pfn, gl1mfn, sl1mfn, i);
2901 unmap_domain_page(spl2e);
2902 unmap_domain_page(gpl2e);
2904 #if 0
2905 SH_VVLOG("PT verified : l2_present = %d, l1_present = %d",
2906 sh_l2_present, sh_l1_present);
2907 #endif
2909 out:
2910 if ( errors )
2911 BUG();
2913 shadow_unlock(d);
2915 return errors;
2918 int _check_all_pagetables(struct vcpu *v, char *s)
2920 struct domain *d = v->domain;
2921 int i;
2922 struct shadow_status *a;
2923 unsigned long gmfn;
2924 int errors = 0;
2926 shadow_status_noswap = 1;
2928 sh_check_name = s;
2929 SH_VVLOG("%s-PT Audit domid=%d", s, d->domain_id);
2930 sh_l2_present = sh_l1_present = 0;
2931 perfc_incrc(check_all_pagetables);
2933 for (i = 0; i < shadow_ht_buckets; i++)
2935 a = &d->arch.shadow_ht[i];
2936 while ( a && a->gpfn_and_flags )
2938 gmfn = gmfn_to_mfn(d, a->gpfn_and_flags & PGT_mfn_mask);
2940 switch ( a->gpfn_and_flags & PGT_type_mask )
2942 case PGT_l1_shadow:
2943 errors += check_l1_table(v, a->gpfn_and_flags & PGT_mfn_mask,
2944 gmfn, a->smfn, 0);
2945 break;
2946 case PGT_l2_shadow:
2947 errors += check_l2_table(v, gmfn, a->smfn,
2948 page_out_of_sync(mfn_to_page(gmfn)));
2949 break;
2950 case PGT_l3_shadow:
2951 case PGT_l4_shadow:
2952 case PGT_hl2_shadow:
2953 BUG(); // XXX - ought to fix this...
2954 break;
2955 case PGT_snapshot:
2956 case PGT_writable_pred:
2957 break;
2958 default:
2959 errors++;
2960 printk("unexpected shadow type %lx, gpfn=%lx, "
2961 "gmfn=%lx smfn=%lx\n",
2962 a->gpfn_and_flags & PGT_type_mask,
2963 a->gpfn_and_flags & PGT_mfn_mask,
2964 gmfn, a->smfn);
2965 BUG();
2967 a = a->next;
2971 shadow_status_noswap = 0;
2973 if ( errors )
2974 BUG();
2976 return errors;
2979 #endif // SHADOW_DEBUG
2980 #endif // this code has not been updated for 32pae & 64 bit modes
2982 #if CONFIG_PAGING_LEVELS >= 3
2983 /****************************************************************************/
2984 /* 64-bit shadow-mode code testing */
2985 /****************************************************************************/
2986 /*
2987 * init_bl2() is for a 32-bit VMX guest on a 64-bit host.
2988 * It uses one shadow L4 (acting as the L3) and four shadow L2s to emulate the guest L2.
2989 */
2990 static inline unsigned long init_bl2(
2991 struct domain *d, unsigned long gpfn, unsigned long gmfn)
2993 unsigned int count;
2994 unsigned long sl2mfn;
2995 unsigned long smfn;
2996 struct page_info *page;
2997 l4_pgentry_t *spl4e;
2998 void *l2;
3000 if ( unlikely(!(smfn = alloc_shadow_page(d, gpfn, gmfn, PGT_l4_shadow))) )
3002 printk("Couldn't alloc an L4 shadow for pfn=%lx mfn=%lx\n", gpfn, gmfn);
3003 BUG(); /* XXX Deal gracefully with failure. */
3006 spl4e = (l4_pgentry_t *)map_domain_page(smfn);
3008 /* Map the self entry, L4&L3 share the same page */
3009 spl4e[PAE_SHADOW_SELF_ENTRY] = l4e_from_pfn(smfn, __PAGE_HYPERVISOR);
3011 /* Allocate 4 shadow L2s */
3012 page = alloc_domheap_pages(NULL, SL2_ORDER, 0);
3013 if ( !page )
3014 domain_crash_synchronous();
3016 for ( count = 0; count < PAE_L3_PAGETABLE_ENTRIES; count++ )
3018 sl2mfn = page_to_mfn(page+count);
3019 l2 = map_domain_page(sl2mfn);
3020 memset(l2, 0, PAGE_SIZE);
3021 unmap_domain_page(l2);
3022 spl4e[count] = l4e_from_pfn(sl2mfn, _PAGE_PRESENT);
3025 unmap_domain_page(spl4e);
3027 return smfn;
3029 #endif
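/*
 * Resulting bl2 layout (from the code above): spl4e[0..3] point at the
 * four zeroed shadow L2 pages and spl4e[PAE_SHADOW_SELF_ENTRY] points back
 * at the combined L4/L3 page itself, so the 1024-entry guest L2 can be
 * mirrored by the 4 x 512 shadow L2 entries (two shadow entries per guest
 * entry; see validate_bl2e_change() below).
 */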
3031 #if CONFIG_PAGING_LEVELS == 3
3032 static unsigned long shadow_l3_table(
3033 struct vcpu *v, unsigned long gpfn, unsigned long gmfn)
3035 unsigned long smfn;
3036 l3_pgentry_t *spl3e;
3037 struct domain *d = v->domain;
3039 perfc_incrc(shadow_l3_table_count);
3041 SH_VVLOG("shadow_l3_table(gpfn=%lx, gmfn=%lx)", gpfn, gmfn);
3043 if ( SH_L1_HAS_NEXT_PAGE &&
3044 d->arch.ops->guest_paging_levels == PAGING_L2 )
3046 return init_bl2(d, gpfn, gmfn);
3049 if ( unlikely(!(smfn = alloc_shadow_page(d, gpfn, gmfn, PGT_l3_shadow))) )
3051 printk("Couldn't alloc an L3 shadow for pfn=%lx mfn=%lx\n", gpfn, gmfn);
3052 BUG(); /* XXX Deal gracefully with failure. */
3055 spl3e = (l3_pgentry_t *)map_domain_page(smfn);
3057 /* Make the self entry */
3058 spl3e[PAE_SHADOW_SELF_ENTRY] = l3e_from_pfn(smfn, __PAGE_HYPERVISOR);
3060 if ( (PGT_base_page_table == PGT_l3_page_table) &&
3061 !shadow_mode_external(d) ) {
3062 int i;
3063 unsigned long g2mfn, s2mfn;
3064 l2_pgentry_t *spl2e;
3065 l3_pgentry_t *gpl3e;
3067 /* Get the top entry */
3068 gpl3e = (l3_pgentry_t *)map_domain_page(gmfn);
3070 if ( !(l3e_get_flags(gpl3e[L3_PAGETABLE_ENTRIES - 1]) & _PAGE_PRESENT) )
3072 BUG();
3075 g2mfn = l3e_get_pfn(gpl3e[L3_PAGETABLE_ENTRIES - 1]);
3077 /* NB. g2mfn should be the same as g2pfn. */
3078 if (!(s2mfn = __shadow_status(d, g2mfn, PGT_l2_shadow))) {
3079 if ( unlikely(!(s2mfn =
3080 alloc_shadow_page(d, g2mfn, g2mfn, PGT_l2_shadow))) ) {
3081 printk("Couldn't alloc an L2 shadow for pfn=%lx mfn=%lx\n",
3082 g2mfn, g2mfn);
3083 BUG(); /* XXX Deal gracefully with failure. */
3087 if (!get_shadow_ref(s2mfn))
3088 BUG();
3090 /* Map shadow L2 into shadow L3 */
3091 spl3e[L3_PAGETABLE_ENTRIES - 1] = l3e_from_pfn(s2mfn, _PAGE_PRESENT);
3092 shadow_update_min_max(smfn, L3_PAGETABLE_ENTRIES -1);
3094 /*
3095 * Xen private mappings. Do the same things here as
3096 * create_pae_xen_mappings() does.
3097 */
3098 spl2e = (l2_pgentry_t *)map_domain_page(s2mfn);
3100 /*
3101 * When we free L2 pages, we need to tell whether the page contains
3102 * Xen private mappings; the va_mask part is used for that.
3103 */
3104 mfn_to_page(s2mfn)->u.inuse.type_info |=
3105 (unsigned long) 3 << PGT_score_shift;
3107 memset(spl2e, 0,
3108 (L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1)) * sizeof(l2_pgentry_t));
3110 memcpy(&spl2e[L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1)],
3111 &idle_pg_table_l2[L2_PAGETABLE_FIRST_XEN_SLOT],
3112 L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
3114 for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
3115 spl2e[l2_table_offset(PERDOMAIN_VIRT_START) + i] =
3116 l2e_from_page(
3117 virt_to_page(page_get_owner(mfn_to_page(gmfn))->arch.mm_perdomain_pt) + i,
3118 __PAGE_HYPERVISOR);
3119 for ( i = 0; i < (LINEARPT_MBYTES >> (L2_PAGETABLE_SHIFT - 20)); i++ )
3120 spl2e[l2_table_offset(LINEAR_PT_VIRT_START) + i] =
3121 (l3e_get_flags(gpl3e[i]) & _PAGE_PRESENT) ?
3122 l2e_from_pfn(l3e_get_pfn(gpl3e[i]), __PAGE_HYPERVISOR) :
3123 l2e_empty();
3125 unmap_domain_page(spl2e);
3126 unmap_domain_page(gpl3e);
3128 unmap_domain_page(spl3e);
3130 return smfn;
3132 #endif /* CONFIG_PAGING_LEVELS == 3 */
3134 #if (!defined(GUEST_PGENTRY_32) && !defined(GUEST_32PAE))
3135 static unsigned long gva_to_gpa_pae(unsigned long gva)
3137 BUG();
3138 return 43;
3140 #endif
3142 #if CONFIG_PAGING_LEVELS == 4
3143 static unsigned long shadow_l4_table(
3144 struct vcpu *v, unsigned long gpfn, unsigned long gmfn)
3146 unsigned long smfn;
3147 l4_pgentry_t *spl4e;
3148 struct domain *d = v->domain;
3150 SH_VVLOG("shadow_l4_table(gpfn=%lx, gmfn=%lx)", gpfn, gmfn);
3152 perfc_incrc(shadow_l4_table_count);
3154 if ( d->arch.ops->guest_paging_levels == PAGING_L2 )
3156 return init_bl2(d, gpfn, gmfn);
3159 if ( unlikely(!(smfn = alloc_shadow_page(d, gpfn, gmfn, PGT_l4_shadow))) )
3161 printk("Couldn't alloc an L4 shadow for pfn=%lx mfn=%lx\n", gpfn, gmfn);
3162 BUG(); /* XXX Deal gracefully with failure. */
3165 spl4e = (l4_pgentry_t *)map_domain_page(smfn);
3167 /* For 32-bit PAE guest on 64-bit host */
3168 if ( SH_GUEST_32PAE && d->arch.ops->guest_paging_levels == PAGING_L3 )
3170 unsigned long index;
3171 /*
3172 * The shadow L4's page_info->tlbflush_timestamp field
3173 * should also record its own CR3 index value.
3174 */
3175 index = get_cr3_idxval(v);
3176 frame_table[smfn].tlbflush_timestamp = index;
3178 memset(spl4e, 0, L4_PAGETABLE_ENTRIES*sizeof(l4_pgentry_t));
3179 /* Map the self entry */
3180 spl4e[PAE_SHADOW_SELF_ENTRY] = l4e_from_pfn(smfn, __PAGE_HYPERVISOR);
3181 unmap_domain_page(spl4e);
3182 return smfn;
3185 /* Install hypervisor and 4x linear p.t. mappings. */
3186 if ( (PGT_base_page_table == PGT_l4_page_table) &&
3187 !shadow_mode_external(d) )
3189 /*
3190 * We could proactively fill in PDEs for pages that are already
3191 * shadowed *and* where the guest PDE has _PAGE_ACCESSED set
3192 * (restriction required for coherence of the accessed bit). However,
3193 * we tried it and it didn't help performance. This is simpler.
3194 */
3195 memset(spl4e, 0, L4_PAGETABLE_ENTRIES*sizeof(l4_pgentry_t));
3197 /* Install hypervisor and 2x linear p.t. mappings. */
3198 memcpy(&spl4e[ROOT_PAGETABLE_FIRST_XEN_SLOT],
3199 &idle_pg_table[ROOT_PAGETABLE_FIRST_XEN_SLOT],
3200 ROOT_PAGETABLE_XEN_SLOTS * sizeof(l4_pgentry_t));
3202 spl4e[l4_table_offset(PERDOMAIN_VIRT_START)] =
3203 l4e_from_paddr(__pa(page_get_owner(mfn_to_page(gmfn))->arch.mm_perdomain_l3),
3204 __PAGE_HYPERVISOR);
3206 if ( shadow_mode_translate(d) ) // NB: not external
3208 spl4e[l4_table_offset(RO_MPT_VIRT_START)] =
3209 l4e_from_paddr(pagetable_get_paddr(d->arch.phys_table),
3210 __PAGE_HYPERVISOR);
3212 else
3213 spl4e[l4_table_offset(LINEAR_PT_VIRT_START)] =
3214 l4e_from_pfn(gmfn, __PAGE_HYPERVISOR);
3216 } else
3217 memset(spl4e, 0, L4_PAGETABLE_ENTRIES*sizeof(l4_pgentry_t));
3219 unmap_domain_page(spl4e);
3221 ESH_LOG("shadow_l4_table(%lx -> %lx)", gmfn, smfn);
3222 return smfn;
3224 #endif /* CONFIG_PAGING_LEVELS == 4 */
3226 #if CONFIG_PAGING_LEVELS >= 3
3227 static void
3228 update_top_level_shadow(struct vcpu *v, unsigned long smfn)
3230 unsigned long index = get_cr3_idxval(v);
3231 pgentry_64_t *sple = (pgentry_64_t *)map_domain_page(smfn);
3232 pgentry_64_t *gple = (pgentry_64_t *)&v->arch.guest_vtable;
3233 int i;
3235 for ( i = 0; i < PAE_L3_PAGETABLE_ENTRIES; i++ )
3236 validate_entry_change(
3237 v->domain, &gple[index*4+i], &sple[i], PAGING_L3);
3239 unmap_domain_page(sple);
3242 /*
3243 * validate_bl2e_change()
3244 * This code is for a 32-bit HVM guest on a 64-bit host.
3245 * It syncs a guest L2 entry into the backing shadow L2 pages.
3246 */
3248 static inline void
3249 validate_bl2e_change(
3250 struct domain *d,
3251 guest_root_pgentry_t *new_gle_p,
3252 pgentry_64_t *shadow_l3,
3253 int index)
3255 int sl3_idx, sl2_idx;
3256 unsigned long sl2mfn, sl1mfn;
3257 pgentry_64_t *sl2_p;
3259 /* Use the guest L2 PTE index to derive the shadow L3 and L2 indices.
3260 * index: 0 ~ 1023, PAGETABLE_ENTRIES: 512
3261 */
3262 sl3_idx = index / (PAGETABLE_ENTRIES / 2);
3263 sl2_idx = (index % (PAGETABLE_ENTRIES / 2)) * 2;
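/* Worked example: a guest L2 index of 515 gives sl3_idx = 515/256 = 2 and
 * sl2_idx = (515%256)*2 = 6, so shadow slots 6 and 7 of the third shadow
 * L2 page cover that single guest slot; the second slot is filled in below
 * from sl1mfn + 1, presumably the second page of the two-page L1 shadow
 * (see SH_L1_HAS_NEXT_PAGE). */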
3265 sl2mfn = entry_get_pfn(shadow_l3[sl3_idx]);
3266 sl2_p = (pgentry_64_t *)map_domain_page(sl2mfn);
3268 validate_pde_change(
3269 d, *(guest_l2_pgentry_t *)new_gle_p, (l2_pgentry_t *)&sl2_p[sl2_idx]);
3271 /* Mapping the second l1 shadow page */
3272 if (entry_get_flags(sl2_p[sl2_idx]) & _PAGE_PRESENT) {
3273 sl1mfn = entry_get_pfn(sl2_p[sl2_idx]);
3274 sl2_p[sl2_idx + 1] =
3275 entry_from_pfn(sl1mfn + 1, entry_get_flags(sl2_p[sl2_idx]));
3277 else
3278 sl2_p[sl2_idx + 1] = (pgentry_64_t){0};
3279 unmap_domain_page(sl2_p);
3283 /*
3284 * This variant of shadow_mark_va_out_of_sync() is for 2M-page shadows.
3285 */
3286 static void shadow_mark_va_out_of_sync_2mp(
3287 struct vcpu *v, unsigned long gpfn, unsigned long mfn, unsigned long writable_pl1e)
3289 struct out_of_sync_entry *entry =
3290 shadow_mark_mfn_out_of_sync(v, gpfn, mfn);
3292 entry->writable_pl1e = writable_pl1e;
3293 ESH_LOG("<shadow_mark_va_out_of_sync_2mp> gpfn = %lx\n", gpfn);
3294 if ( !get_shadow_ref(writable_pl1e >> L1_PAGETABLE_SHIFT) )
3295 BUG();
3298 static int get_shadow_mfn(struct domain *d, unsigned long gpfn, unsigned long *spmfn, u32 flag)
3300 unsigned long gmfn;
3301 if ( !(*spmfn = __shadow_status(d, gpfn, flag)) )
3303 /* This is NOT already shadowed so we need to shadow it. */
3304 SH_VVLOG("<get_shadow_mfn>: not shadowed");
3306 gmfn = gmfn_to_mfn(d, gpfn);
3307 if ( unlikely(!VALID_MFN(gmfn)) )
3309 // Attempt to use an invalid pfn as a shadow page.
3310 // XXX this needs to be more graceful!
3311 BUG();
3314 if ( unlikely(!(*spmfn =
3315 alloc_shadow_page(d, gpfn, gmfn, flag))) )
3317 printk("<get_shadow_mfn> Couldn't alloc a shadow for pfn=%lx mfn=%lx\n", gpfn, gmfn);
3318 BUG(); /* XXX Need to deal gracefully with failure. */
3320 switch(flag) {
3321 case PGT_l1_shadow:
3322 perfc_incrc(shadow_l1_table_count);
3323 break;
3324 case PGT_l2_shadow:
3325 perfc_incrc(shadow_l2_table_count);
3326 break;
3327 case PGT_l3_shadow:
3328 perfc_incrc(shadow_l3_table_count);
3329 break;
3330 case PGT_hl2_shadow:
3331 perfc_incrc(shadow_hl2_table_count);
3332 break;
3335 return 1;
3336 } else {
3337 /* This L1 is shadowed already, but the L2 entry is missing. */
3338 SH_VVLOG("4b: was shadowed, l2 missing (%lx)", *spmfn);
3339 return 0;
3343 static void shadow_map_into_current(struct vcpu *v,
3344 unsigned long va, unsigned int from, unsigned int to)
3346 pgentry_64_t gle = {0}, sle;
3347 unsigned long gpfn, smfn;
3349 if (from == PAGING_L1 && to == PAGING_L2) {
3350 shadow_map_l1_into_current_l2(va);
3351 return;
3354 __rw_entry(v, va, &gle, GUEST_ENTRY | GET_ENTRY | to);
3355 ASSERT(entry_get_flags(gle) & _PAGE_PRESENT);
3356 gpfn = entry_get_pfn(gle);
3358 get_shadow_mfn(v->domain, gpfn, &smfn, shadow_level_to_type(from));
3360 if ( !get_shadow_ref(smfn) )
3361 BUG();
3362 entry_general(v->domain, &gle, &sle, smfn, to);
3363 __rw_entry(v, va, &gle, GUEST_ENTRY | SET_ENTRY | to);
3364 __rw_entry(v, va, &sle, SHADOW_ENTRY | SET_ENTRY | to);
3367 /*
3368 * shadow_set_lxe should be put in shadow.h
3369 */
3370 static void shadow_set_l2e_64(unsigned long va, l2_pgentry_t sl2e,
3371 int create_l2_shadow, int put_ref_check)
3373 struct vcpu *v = current;
3374 l4_pgentry_t sl4e;
3375 l3_pgentry_t sl3e;
3377 __shadow_get_l4e(v, va, &sl4e);
3378 if (!(l4e_get_flags(sl4e) & _PAGE_PRESENT)) {
3379 if (create_l2_shadow) {
3380 perfc_incrc(shadow_set_l3e_force_map);
3381 shadow_map_into_current(v, va, PAGING_L3, PAGING_L4);
3382 __shadow_get_l4e(v, va, &sl4e);
3383 } else {
3384 printk("For non-HVM shadow, create_l2_shadow:%d\n", create_l2_shadow);
3388 __shadow_get_l3e(v, va, &sl3e);
3389 if (!(l3e_get_flags(sl3e) & _PAGE_PRESENT)) {
3390 if (create_l2_shadow) {
3391 perfc_incrc(shadow_set_l2e_force_map);
3392 shadow_map_into_current(v, va, PAGING_L2, PAGING_L3);
3393 __shadow_get_l3e(v, va, &sl3e);
3394 } else {
3395 printk("For non-HVM shadow, create_l2_shadow:%d\n", create_l2_shadow);
3397 shadow_update_min_max(l4e_get_pfn(sl4e), l3_table_offset(va));
3401 if ( put_ref_check ) {
3402 l2_pgentry_t tmp_sl2e;
3403 if ( __shadow_get_l2e(v, va, &tmp_sl2e) ) {
3404 if ( l2e_get_flags(tmp_sl2e) & _PAGE_PRESENT )
3405 if ( l2e_get_pfn(tmp_sl2e) == l2e_get_pfn(sl2e) ) {
3406 put_shadow_ref(l2e_get_pfn(sl2e));
3412 if (! __shadow_set_l2e(v, va, &sl2e))
3413 BUG();
3414 shadow_update_min_max(l3e_get_pfn(sl3e), l2_table_offset(va));
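/*
 * NB: the put_ref_check path above appears to exist to keep the shadow
 * refcount balanced: when the L2 slot turns out to already point at the
 * same fl1 shadow mfn, the extra reference the caller took before calling
 * shadow_set_l2e_64() is dropped again via put_shadow_ref().
 */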
3418 /* As 32-bit guests don't support 4M pages yet,
3419 * we don't bother compiling this function twice.
3420 */
3421 static inline int l2e_rw_fault(
3422 struct vcpu *v, l2_pgentry_t *gl2e_p, unsigned long va, int rw)
3424 struct domain *d = v->domain;
3425 l2_pgentry_t gl2e = *gl2e_p;
3426 l2_pgentry_t tmp_l2e = gl2e;
3427 unsigned long start_gpfn = l2e_get_pfn(gl2e);
3428 unsigned long gpfn, mfn;
3429 unsigned long l1_mfn, gmfn;
3430 l1_pgentry_t *l1_p;
3431 l1_pgentry_t sl1e;
3432 l1_pgentry_t old_sl1e;
3433 l2_pgentry_t sl2e;
3434 #ifdef __x86_64__
3435 u64 nx = 0;
3436 #endif
3437 int put_ref_check = 0;
3438 /* Check if gpfn is 2M aligned */
3440 /* Update guest l2e */
3441 if (rw) {
3442 ASSERT(l2e_get_flags(gl2e) & _PAGE_RW);
3443 l2e_add_flags(gl2e, _PAGE_DIRTY | _PAGE_ACCESSED);
3444 } else {
3445 l2e_add_flags(gl2e, _PAGE_ACCESSED);
3448 l2e_remove_flags(tmp_l2e, _PAGE_PSE);
3449 if (l2e_get_flags(gl2e) & _PAGE_NX) {
3450 l2e_remove_flags(tmp_l2e, _PAGE_NX);
3451 #ifdef __x86_64__
3452 nx = PGT_high_mfn_nx;
3453 #endif
3457 /* Get the shadow l2 first */
3458 if ( !__shadow_get_l2e(v, va, &sl2e) )
3459 sl2e = l2e_empty();
3461 #ifdef __x86_64__
3462 l1_mfn = __shadow_status(d, start_gpfn | nx, PGT_fl1_shadow);
3463 #else
3464 l1_mfn = __shadow_status(d, start_gpfn, PGT_fl1_shadow);
3465 #endif
3467 /* Check the corresponding l2e */
3468 if (l1_mfn) {
3469 /* Why is it PRESENT? */
3470 if ((l2e_get_flags(sl2e) & _PAGE_PRESENT) &&
3471 l2e_get_pfn(sl2e) == l1_mfn) {
3472 ESH_LOG("sl2e PRESENT bit is set: %lx, l1_mfn = %lx\n", l2e_get_pfn(sl2e), l1_mfn);
3473 } else {
3474 put_ref_check = 1;
3475 if (!get_shadow_ref(l1_mfn))
3476 BUG();
3478 l1_p = (l1_pgentry_t *)map_domain_page(l1_mfn);
3479 sl2e = l2e_from_pfn(l1_mfn, l2e_get_flags(tmp_l2e));
3480 } else {
3481 /* Allocate a new page as the shadow page table if needed */
3482 gmfn = gmfn_to_mfn(d, start_gpfn);
3483 #ifdef __x86_64__
3484 l1_mfn = alloc_shadow_page(d, start_gpfn | nx, gmfn, PGT_fl1_shadow);
3485 #else
3486 l1_mfn = alloc_shadow_page(d, start_gpfn, gmfn, PGT_fl1_shadow);
3487 #endif
3488 if (unlikely(!l1_mfn)) {
3489 BUG();
3492 if (!get_shadow_ref(l1_mfn))
3493 BUG();
3494 l1_p = (l1_pgentry_t *)map_domain_page(l1_mfn );
3495 sl2e = l2e_from_pfn(l1_mfn, l2e_get_flags(tmp_l2e));
3496 memset(l1_p, 0, PAGE_SIZE);
3497 ESH_LOG("Alloc a shadow page: %lx\n", l1_mfn);
3500 ESH_LOG("<%s>: sl2e = %lx\n", __func__, l2e_get_intpte(sl2e));
3501 /* Map the page into the L2 */
3502 shadow_set_l2e_64(va, sl2e, 1, put_ref_check);
3504 if (l2e_get_flags(gl2e) & _PAGE_NX)
3505 l2e_add_flags(tmp_l2e, _PAGE_NX);
3507 /* Propagate into the shadow page table, i.e. set each sl1e */
3508 for (gpfn = start_gpfn;
3509 gpfn < (start_gpfn + L1_PAGETABLE_ENTRIES); gpfn++) {
3511 mfn = gmfn_to_mfn(d, gpfn);
3513 if ( unlikely(!VALID_MFN(mfn)) )
3515 continue;
3518 sl1e = l1e_from_pfn(mfn, l2e_get_flags(tmp_l2e));
3520 if (!rw) {
3521 if ( shadow_mode_log_dirty(d) ||
3522 !(l2e_get_flags(gl2e) & _PAGE_DIRTY) || mfn_is_page_table(mfn) )
3524 l1e_remove_flags(sl1e, _PAGE_RW);
3526 } else {
3527 /* __mark_dirty(d, gmfn); */
3529 // printk("<%s> gpfn: %lx, mfn: %lx, sl1e: %lx\n", __func__, gpfn, mfn, l1e_get_intpte(sl1e));
3530 /* The shadow entries need to be set up before shadow_mark_va_out_of_sync() */
3531 old_sl1e = l1_p[gpfn - start_gpfn];
3533 if ( l1e_has_changed(old_sl1e, sl1e, _PAGE_RW | _PAGE_PRESENT) )
3535 if ( (l1e_get_flags(sl1e) & _PAGE_PRESENT) &&
3536 !shadow_get_page_from_l1e(sl1e, d) ) {
3537 ESH_LOG("%lx, mfn: %lx why make me empty, start_pfn: %lx, gpfn: %lx\n", l1e_get_intpte(sl1e),mfn, start_gpfn, gpfn);
3538 sl1e = l1e_empty();
3540 if ( l1e_get_flags(old_sl1e) & _PAGE_PRESENT )
3541 put_page_from_l1e(old_sl1e, d);
3544 l1_p[gpfn - start_gpfn] = sl1e;
3546 if (rw) {
3547 /* shadow_mark_va_out_of_sync() needs modification for 2M pages */
3548 if ( mfn_is_page_table(mfn) )
3549 shadow_mark_va_out_of_sync_2mp(v, gpfn, mfn,
3550 l2e_get_paddr(sl2e) | (sizeof(l1_pgentry_t) * (gpfn - start_gpfn)));
3554 unmap_domain_page(l1_p);
3555 return 1;
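/*
 * In short, l2e_rw_fault() above handles a fault in a guest 2M (PSE)
 * mapping by finding or allocating a PGT_fl1_shadow page covering the
 * underlying 4K frames, wiring it into the shadow L2 via
 * shadow_set_l2e_64(), and then filling each sl1e; read faults get
 * _PAGE_RW stripped under the usual log-dirty / not-dirty / page-table
 * rules, and on write faults page-table frames are marked out of sync via
 * shadow_mark_va_out_of_sync_2mp().
 */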
3559 /*
3560 * Check P, R/W, U/S bits in the guest page table.
3561 * If the fault belongs to the guest, return 1;
3562 * otherwise return 0.
3563 */
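/*
 * Illustrative sketch only (not compiled): the per-level permission test
 * that both guest_page_fault() variants below apply, factored out here for
 * clarity. It assumes the ERROR_* flags mirror the architectural #PF
 * error-code bits (bit 0 = P, bit 1 = W/R, bit 2 = U/S).
 */
#if 0
static inline int pte_denies_access(unsigned long flags, unsigned int error_code)
{
    if ( !(flags & _PAGE_PRESENT) )
        return 1;                     /* not present: fault is the guest's */
    if ( (error_code & ERROR_W) && !(flags & _PAGE_RW) )
        return 1;                     /* write to a read-only mapping */
    if ( (error_code & ERROR_U) && !(flags & _PAGE_USER) )
        return 1;                     /* user access to a supervisor page */
    return 0;
}
#endif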
3564 #if defined( GUEST_PGENTRY_32 )
3565 static inline int guest_page_fault(
3566 struct vcpu *v,
3567 unsigned long va, unsigned int error_code,
3568 guest_l2_pgentry_t *gpl2e, guest_l1_pgentry_t *gpl1e)
3570 /* The following checks are for a 32-bit guest on a 64-bit host */
3572 __guest_get_l2e(v, va, gpl2e);
3574 /* Check the guest L2 page-table entry first*/
3575 if ( unlikely(!(guest_l2e_get_flags(*gpl2e) & _PAGE_PRESENT)) )
3576 return 1;
3578 if ( error_code & ERROR_W )
3580 if ( unlikely(!(guest_l2e_get_flags(*gpl2e) & _PAGE_RW)) )
3581 return 1;
3584 if ( error_code & ERROR_U )
3586 if ( unlikely(!(guest_l2e_get_flags(*gpl2e) & _PAGE_USER)) )
3587 return 1;
3590 if ( guest_l2e_get_flags(*gpl2e) & _PAGE_PSE )
3591 return 0;
3593 __guest_get_l1e(v, va, gpl1e);
3595 /* Then check the guest L1 page-table entry */
3596 if ( unlikely(!(guest_l1e_get_flags(*gpl1e) & _PAGE_PRESENT)) )
3597 return 1;
3599 if ( error_code & ERROR_W )
3601 if ( unlikely(!(guest_l1e_get_flags(*gpl1e) & _PAGE_RW)) )
3602 return 1;
3605 if ( error_code & ERROR_U )
3607 if ( unlikely(!(guest_l1e_get_flags(*gpl1e) & _PAGE_USER)) )
3608 return 1;
3611 return 0;
3613 #else
3614 static inline int guest_page_fault(
3615 struct vcpu *v,
3616 unsigned long va, unsigned int error_code,
3617 guest_l2_pgentry_t *gpl2e, guest_l1_pgentry_t *gpl1e)
3619 struct domain *d = v->domain;
3620 pgentry_64_t gle = { 0 };
3621 unsigned long gpfn = 0, mfn;
3622 int i;
3623 unsigned int base_idx = 0;
3624 base_idx = get_cr3_idxval(v);
3626 ASSERT( d->arch.ops->guest_paging_levels >= PAGING_L3 );
3628 #if CONFIG_PAGING_LEVELS >= 4
3629 if ( (error_code & (ERROR_I | ERROR_P)) == (ERROR_I | ERROR_P) )
3630 return 1;
3631 #endif
3633 #if CONFIG_PAGING_LEVELS == 4
3634 if ( d->arch.ops->guest_paging_levels == PAGING_L4 )
3636 __rw_entry(v, va, &gle, GUEST_ENTRY | GET_ENTRY | PAGING_L4);
3637 if ( unlikely(!(entry_get_flags(gle) & _PAGE_PRESENT)) )
3638 return 1;
3640 if ( error_code & ERROR_W )
3642 if ( unlikely(!(entry_get_flags(gle) & _PAGE_RW)) )
3643 return 1;
3646 if ( error_code & ERROR_U )
3648 if ( unlikely(!(entry_get_flags(gle) & _PAGE_USER)) )
3649 return 1;
3651 gpfn = entry_get_pfn(gle);
3653 #endif
3655 #if CONFIG_PAGING_LEVELS >= 3
3656 if ( d->arch.ops->guest_paging_levels == PAGING_L3 )
3658 if ( SH_GUEST_32PAE )
3659 gpfn = (hvm_get_guest_ctrl_reg(v, 3)) >> PAGE_SHIFT;
3660 else
3661 gpfn = pagetable_get_pfn(v->arch.guest_table);
3663 #endif
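/* The loop below walks the remaining guest levels top-down (L3 -> L2 ->
 * L1), mapping each table by mfn, applying the present check at every
 * level and the W/U checks below L3, and stopping early at L2 when the
 * entry has _PAGE_PSE set (a 2M mapping). */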
3665 for ( i = PAGING_L3; i >= PAGING_L1; i-- )
3667 pgentry_64_t *lva;
3668 /*
3669 * If it's not external mode, then mfn should be machine physical.
3670 */
3671 mfn = gmfn_to_mfn(d, gpfn);
3673 lva = (pgentry_64_t *) map_domain_page(mfn);
3674 gle = lva[guest_table_offset_64(va, i, base_idx)];
3676 unmap_domain_page(lva);
3678 gpfn = entry_get_pfn(gle);
3680 if ( unlikely(!(entry_get_flags(gle) & _PAGE_PRESENT)) )
3681 return 1;
3683 if ( i < PAGING_L3 )
3685 if ( error_code & ERROR_W )
3687 if ( unlikely(!(entry_get_flags(gle) & _PAGE_RW)) )
3689 if ( i == PAGING_L1 )
3690 if ( gpl1e )
3691 gpl1e->l1 = gle.lo;
3692 return 1;
3695 if ( error_code & ERROR_U )
3697 if ( unlikely(!(entry_get_flags(gle) & _PAGE_USER)) )
3698 return 1;
3702 if ( i == PAGING_L2 )
3704 if ( gpl2e )
3705 gpl2e->l2 = gle.lo;
3706 if ( likely(entry_get_flags(gle) & _PAGE_PSE) )
3707 return 0;
3710 if ( i == PAGING_L1 )
3711 if ( gpl1e )
3712 gpl1e->l1 = gle.lo;
3715 return 0;
3718 #endif
3720 static int shadow_fault_64(unsigned long va, struct cpu_user_regs *regs)
3722 struct vcpu *v = current;
3723 struct domain *d = v->domain;
3724 guest_l2_pgentry_t gl2e;
3725 guest_l1_pgentry_t gl1e, orig_gl1e;
3726 l1_pgentry_t sl1e;
3728 gl1e = guest_l1e_empty(); gl2e = guest_l2e_empty();
3730 sl1e = l1e_empty();
3732 perfc_incrc(shadow_fault_calls);
3734 ESH_LOG("<shadow_fault_64> va=%lx, rip = %lx, error code = %x\n",
3735 va, regs->eip, regs->error_code);
3737 /*
3738 * Don't let someone else take the guest's table pages out-of-sync.
3739 */
3740 shadow_lock(d);
3742 /*
3743 * STEP 1. Check to see if this fault might have been caused by an
3744 * out-of-sync table page entry, or if we should pass this
3745 * fault onto the guest.
3746 */
3747 __shadow_sync_va(v, va);
3749 /*
3750 * STEP 2. Check if the fault belongs to guest
3751 */
3752 if ( guest_page_fault(v, va, regs->error_code, &gl2e, &gl1e) )
3754 if ( unlikely(shadow_mode_log_dirty(d)) && l1e_get_intpte(gl1e) != 0 )
3755 goto check_writeable;
3757 goto fail;
3760 if ( unlikely((guest_l2e_get_flags(gl2e) & _PAGE_PSE)) )
3761 goto pse;
3763 /*
3764 * Handle 4K pages here
3765 */
3766 check_writeable:
3767 orig_gl1e = gl1e;
3769 /* Write fault? */
3770 if ( regs->error_code & 2 )
3772 int allow_writes = 0;
3774 if ( unlikely(!(guest_l1e_get_flags(gl1e) & _PAGE_RW)) )
3776 if ( shadow_mode_page_writable(va, regs, l1e_get_pfn(gl1e)) )
3778 allow_writes = 1;
3779 l1e_add_flags(gl1e, _PAGE_RW);
3781 else
3783 /* Write fault on a read-only mapping. */
3784 SH_VVLOG("shadow_fault - EXIT: wr fault on RO page (%" PRIpte ")",
3785 l1e_get_intpte(gl1e));
3786 perfc_incrc(shadow_fault_bail_ro_mapping);
3787 goto fail;
3791 if ( !l1pte_write_fault(v, &gl1e, &sl1e, va) )
3793 SH_VVLOG("shadow_fault - EXIT: l1pte_write_fault failed");
3794 perfc_incrc(write_fault_bail);
3795 shadow_unlock(d);
3796 return 0;
3799 if (allow_writes)
3800 l1e_remove_flags(gl1e, _PAGE_RW);
3802 else
3804 if ( !l1pte_read_fault(d, &gl1e, &sl1e) )
3806 SH_VVLOG("shadow_fault - EXIT: l1pte_read_fault failed");
3807 perfc_incrc(read_fault_bail);
3808 shadow_unlock(d);
3809 return 0;
3813 /*
3814 * STEP 3. Write the modified shadow PTE and guest PTE back to the tables
3815 */
3816 if ( l1e_has_changed(orig_gl1e, gl1e, PAGE_FLAG_MASK) )
3818 if (unlikely(!__guest_set_l1e(v, va, &gl1e)))
3819 domain_crash_synchronous();
3821 __mark_dirty(d, gmfn_to_mfn(d, l2e_get_pfn(gl2e)));
3824 shadow_set_l1e_64(va, (pgentry_64_t *)&sl1e, 1);
3826 perfc_incrc(shadow_fault_fixed);
3827 d->arch.shadow_fault_count++;
3829 shadow_unlock(d);
3831 return EXCRET_fault_fixed;
3833 pse:
3834 /*
3835 * Handle 2M pages here
3836 */
3837 if ( unlikely(!shadow_mode_external(d)) )
3838 BUG();
3840 /* Write fault? */
3841 if ( regs->error_code & 2 )
3843 if ( !l2e_rw_fault(v, (l2_pgentry_t *)&gl2e, va, WRITE_FAULT) )
3845 goto fail;
3848 else
3850 l2e_rw_fault(v, (l2_pgentry_t *)&gl2e, va, READ_FAULT);
3853 /*
3854 * STEP 3. Write guest/shadow l2e back
3855 */
3857 if ( unlikely(!__guest_set_l2e(v, va, &gl2e)) )
3859 domain_crash_synchronous();
3862 /*
3863 * Todo: if necessary, record the page table page as dirty
3864 */
3866 perfc_incrc(shadow_fault_fixed);
3867 d->arch.shadow_fault_count++;
3869 shadow_unlock(d);
3871 return EXCRET_fault_fixed;
3872 fail:
3873 shadow_unlock(d);
3874 ESH_LOG("Guest fault~~~\n");
3875 return 0;
3878 static void shadow_invlpg_64(struct vcpu *v, unsigned long va)
3880 struct domain *d = v->domain;
3881 l1_pgentry_t sl1e, old_sl1e;
3883 shadow_lock(d);
3885 __shadow_sync_va(v, va);
3887 if ( shadow_mode_external(d) && __shadow_get_l1e(v, va, &old_sl1e) )
3888 if ( l1e_get_flags(old_sl1e) & _PAGE_PRESENT )
3889 put_page_from_l1e(old_sl1e, d);
3891 sl1e = l1e_empty();
3892 __shadow_set_l1e(v, va, &sl1e);
3894 shadow_unlock(d);
3897 static unsigned long gva_to_gpa_64(unsigned long gva)
3899 struct vcpu *v = current;
3900 guest_l1_pgentry_t gl1e = {0};
3901 guest_l2_pgentry_t gl2e = {0};
3902 unsigned long gpa;
3904 if (guest_page_fault(v, gva, 0, &gl2e, &gl1e))
3905 return 0;
3907 if (guest_l2e_get_flags(gl2e) & _PAGE_PSE)
3908 gpa = guest_l2e_get_paddr(gl2e) + (gva & ((1 << GUEST_L2_PAGETABLE_SHIFT) - 1));
3909 else
3910 gpa = guest_l1e_get_paddr(gl1e) + (gva & ~PAGE_MASK);
3912 return gpa;
3915 /*
3916 * The naming convention of the shadow_ops:
3917 * MODE_<pgentry size>_<guest paging levels>_HANDLER
3918 */
3919 #if (!defined(GUEST_PGENTRY_32) && !defined(GUEST_32PAE))
3920 struct shadow_ops MODE_64_3_HANDLER = {
3921 .guest_paging_levels = 3,
3922 .invlpg = shadow_invlpg_64,
3923 .fault = shadow_fault_64,
3924 .update_pagetables = shadow_update_pagetables,
3925 .sync_all = sync_all,
3926 .remove_all_write_access = remove_all_write_access,
3927 .do_update_va_mapping = do_update_va_mapping,
3928 .mark_mfn_out_of_sync = mark_mfn_out_of_sync,
3929 .is_out_of_sync = is_out_of_sync,
3930 .gva_to_gpa = gva_to_gpa_pae,
3931 };
3933 struct shadow_ops MODE_64_4_HANDLER = {
3934 .guest_paging_levels = 4,
3935 .invlpg = shadow_invlpg_64,
3936 .fault = shadow_fault_64,
3937 .update_pagetables = shadow_update_pagetables,
3938 .sync_all = sync_all,
3939 .remove_all_write_access = remove_all_write_access,
3940 .do_update_va_mapping = do_update_va_mapping,
3941 .mark_mfn_out_of_sync = mark_mfn_out_of_sync,
3942 .is_out_of_sync = is_out_of_sync,
3943 .gva_to_gpa = gva_to_gpa_64,
3944 };
3945 #endif /* GUEST_PGENTRY_32 */
3946 #endif /* CONFIG_PAGING_LEVELS >= 3 */
3949 #if CONFIG_PAGING_LEVELS == 2
3950 struct shadow_ops MODE_32_2_HANDLER = {
3951 .guest_paging_levels = 2,
3952 .invlpg = shadow_invlpg_32,
3953 .fault = shadow_fault_32,
3954 .update_pagetables = shadow_update_pagetables,
3955 .sync_all = sync_all,
3956 .remove_all_write_access = remove_all_write_access,
3957 .do_update_va_mapping = do_update_va_mapping,
3958 .mark_mfn_out_of_sync = mark_mfn_out_of_sync,
3959 .is_out_of_sync = is_out_of_sync,
3960 .gva_to_gpa = gva_to_gpa_64,
3961 };
3962 #endif
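/*
 * Presumably d->arch.ops is pointed at one of the MODE_*_HANDLER tables
 * above once the guest's paging level is known, so callers elsewhere can
 * dispatch without knowing the mode, e.g.:
 *     d->arch.ops->fault(va, regs);
 *     d->arch.ops->sync_all(d);
 * (The exact selection site is outside this file.)
 */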
3964 #if ( CONFIG_PAGING_LEVELS == 3 && !defined (GUEST_PGENTRY_32) ) || \
3965 ( CONFIG_PAGING_LEVELS == 4 && defined (GUEST_PGENTRY_32) )
3968 /*
3969 * Use GUEST_PGENTRY_32 to force PAE_SHADOW_SELF_ENTRY for L4.
3971 * Very simple shadow code to handle the 1:1 direct mapping for guest
3972 * non-paging code, which actually runs in PAE/vm86 mode with
3973 * paging enabled.
3975 * We expect that the top level (L3) page has been allocated and initialized.
3976 */
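/*
 * Sketch of the path below: for a faulting physical address vpa with a
 * valid mfn, missing shadow L3/L2 pages are allocated and zeroed on
 * demand, and finally an L1 entry mapping vpa 1:1 to that mfn is written,
 * so the fault is fixed without any guest page tables being involved.
 */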
3977 int shadow_direct_map_fault(unsigned long vpa, struct cpu_user_regs *regs)
3979 struct vcpu *v = current;
3980 struct domain *d = v->domain;
3981 l3_pgentry_t sl3e, *sl3e_p;
3982 l2_pgentry_t sl2e, *sl2e_p;
3983 l1_pgentry_t sl1e;
3984 unsigned long mfn, smfn;
3985 struct page_info *page;
3987 /*
3988 * If the faulting address is within the MMIO range, we continue
3989 * on handling the #PF as such.
3990 */
3991 if ( (mfn = get_mfn_from_gpfn(vpa >> PAGE_SHIFT)) == INVALID_MFN )
3992 return 0;
3994 shadow_lock(d);
3996 __direct_get_l3e(v, vpa, &sl3e);
3998 if ( !(l3e_get_flags(sl3e) & _PAGE_PRESENT) )
4000 page = alloc_domheap_page(NULL);
4001 if ( !page )
4002 goto nomem;
4004 smfn = page_to_mfn(page);
4005 sl3e = l3e_from_pfn(smfn, _PAGE_PRESENT);
4007 sl3e_p = (l3_pgentry_t *)map_domain_page(smfn);
4008 memset(sl3e_p, 0, PAGE_SIZE);
4009 unmap_domain_page(sl3e_p);
4011 __direct_set_l3e(v, vpa, &sl3e);
4014 __direct_get_l2e(v, vpa, &sl2e);
4016 if ( !(l2e_get_flags(sl2e) & _PAGE_PRESENT) )
4018 page = alloc_domheap_page(NULL);
4019 if ( !page )
4020 goto nomem;
4022 smfn = page_to_mfn(page);
4023 sl2e = l2e_from_pfn(smfn, __PAGE_HYPERVISOR | _PAGE_USER);
4024 sl2e_p = (l2_pgentry_t *)map_domain_page(smfn);
4025 memset(sl2e_p, 0, PAGE_SIZE);
4026 unmap_domain_page(sl2e_p);
4028 __direct_set_l2e(v, vpa, &sl2e);
4031 __direct_get_l1e(v, vpa, &sl1e);
4033 if ( !(l1e_get_flags(sl1e) & _PAGE_PRESENT) )
4035 sl1e = l1e_from_pfn(mfn, __PAGE_HYPERVISOR | _PAGE_USER);
4036 __direct_set_l1e(v, vpa, &sl1e);
4039 shadow_unlock(d);
4040 return EXCRET_fault_fixed;
4042 nomem:
4043 shadow_direct_map_clean(d);
4044 domain_crash_synchronous();
4046 #endif
4048 /*
4049 * Local variables:
4050 * mode: C
4051 * c-set-style: "BSD"
4052 * c-basic-offset: 4
4053 * tab-width: 4
4054 * indent-tabs-mode: nil
4055 * End:
4056 */