direct-io.hg

xen/arch/x86/shadow.c @ 11135:88e6bd5e2b54

Whitespace clean-ups.

Signed-off-by: Steven Hand <steven@xensource.com>
author shand@kneesaa.uk.xensource.com
date Wed Aug 16 11:36:13 2006 +0100 (2006-08-16)
parents bc2f68334e96
children 716ef8e8bddc
1 /******************************************************************************
2 * arch/x86/shadow.c
3 *
4 * Copyright (c) 2005 Michael A Fetterman
5 * Based on an earlier implementation by Ian Pratt et al
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
21 /*
22 * Jun Nakajima <jun.nakajima@intel.com>
23 * Chengyuan Li <chengyuan.li@intel.com>
24 *
25 * Extended to support 32-bit PAE and 64-bit guests.
26 */
28 #include <xen/config.h>
29 #include <xen/types.h>
30 #include <xen/mm.h>
31 #include <xen/domain_page.h>
32 #include <asm/shadow.h>
33 #include <asm/page.h>
34 #include <xen/event.h>
35 #include <xen/sched.h>
36 #include <xen/trace.h>
37 #include <asm/shadow_64.h>
39 /* Use this to have the compiler remove unnecessary branches */
40 #define SH_L1_HAS_NEXT_PAGE (GUEST_L1_PAGETABLE_ENTRIES - L1_PAGETABLE_ENTRIES)
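/*
 * E.g. a 2-level (32-bit non-PAE) guest has 1024 L1 entries per page,
 * while a PAE/64-bit host L1 holds only 512, so the difference above is
 * non-zero and each guest L1 must be shadowed by two contiguous pages.
 * When the guest and host entry counts match, this evaluates to 0 and the
 * compiler can drop the SH_L1_HAS_NEXT_PAGE code paths entirely.
 */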
42 extern void free_shadow_pages(struct domain *d);
44 #if 0 // this code has not been updated for 32pae & 64 bit modes
45 #if SHADOW_DEBUG
46 static void mark_shadows_as_reflecting_snapshot(struct domain *d, unsigned long gpfn);
47 #endif
48 #endif
50 #if CONFIG_PAGING_LEVELS == 3
51 static unsigned long shadow_l3_table(
52 struct vcpu *v, unsigned long gpfn, unsigned long gmfn);
53 #endif
55 #if CONFIG_PAGING_LEVELS == 4
56 static unsigned long shadow_l4_table(
57 struct vcpu *v, unsigned long gpfn, unsigned long gmfn);
58 #endif
60 #if CONFIG_PAGING_LEVELS >= 3
61 static void shadow_map_into_current(struct vcpu *v,
62 unsigned long va, unsigned int from, unsigned int to);
63 static inline void validate_bl2e_change( struct domain *d,
64 guest_root_pgentry_t *new_gle_p, pgentry_64_t *shadow_l3, int index);
65 static void update_top_level_shadow(struct vcpu *v, unsigned long smfn);
66 #endif
68 /********
70 There's a per-domain shadow table spin lock which works fine for SMP
71 hosts. We don't have to worry about interrupts as no shadow operations
72 happen in an interrupt context. It's probably not quite ready for SMP
73 guest operation as we have to worry about synchonisation between gpte
74 and spte updates. Its possible that this might only happen in a
75 hypercall context, in which case we'll probably at have a per-domain
76 hypercall lock anyhow (at least initially).
78 ********/
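/*
 * The usual pattern (see e.g. shadow_invlpg_32() below) is simply:
 *
 *     shadow_lock(d);
 *     ... examine/update shadow state, e.g. __shadow_sync_va(v, va) ...
 *     shadow_unlock(d);
 *
 * with the lock held across the whole read-modify-write of shadow state.
 */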
80 static inline int
81 shadow_promote(struct domain *d, unsigned long gpfn, unsigned long gmfn,
82 unsigned long new_type)
83 {
84 struct page_info *page = mfn_to_page(gmfn);
85 int pinned = 0, okay = 1;
87 if ( page_out_of_sync(page) )
88 {
89 // Don't know how long ago this snapshot was taken.
90 // Can't trust it to be recent enough.
91 //
92 __shadow_sync_mfn(d, gmfn);
93 }
95 if ( !shadow_mode_refcounts(d) )
96 return 1;
98 if ( unlikely(page_is_page_table(page)) )
99 return 1;
101 FSH_LOG("%s: gpfn=%lx gmfn=%lx nt=%08lx", __func__, gpfn, gmfn, new_type);
103 if ( !shadow_remove_all_write_access(d, gpfn, gmfn) )
104 {
105 FSH_LOG("%s: couldn't find/remove all write accesses, gpfn=%lx gmfn=%lx",
106 __func__, gpfn, gmfn);
107 #if 1 || defined(LIVE_DANGEROUSLY)
108 set_bit(_PGC_page_table, &page->count_info);
109 return 1;
110 #endif
111 return 0;
112 }
114 // To convert this page for use as a page table, the writable count
115 // should now be zero. Test this by grabbing the page as a page table,
116 // and then immediately releasing. This will also deal with any
117 // necessary TLB flushing issues for us.
118 //
119 // The cruft here about pinning doesn't really work right. This
120 // needs rethinking/rewriting... Need to gracefully deal with the
121 // TLB flushes required when promoting a writable page, and also deal
122 // with any outstanding (external) writable refs to this page (by
123 // refusing to promote it). The pinning headache complicates this
124 // code -- it would all get much simpler if we stop using
125 // shadow_lock() and move the shadow code to BIGLOCK().
126 //
127 if ( unlikely(!get_page(page, d)) )
128 BUG(); // XXX -- needs more thought for a graceful failure
129 if ( unlikely(test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info)) )
130 {
131 pinned = 1;
132 put_page_and_type(page);
133 }
134 if ( get_page_type(page, PGT_base_page_table) )
135 {
136 set_bit(_PGC_page_table, &page->count_info);
137 put_page_type(page);
138 }
139 else
140 {
141 printk("shadow_promote: get_page_type failed "
142 "dom%d gpfn=%lx gmfn=%lx t=%08lx\n",
143 d->domain_id, gpfn, gmfn, new_type);
144 okay = 0;
145 }
147 // Now put the type back to writable...
148 if ( unlikely(!get_page_type(page, PGT_writable_page)) )
149 BUG(); // XXX -- needs more thought for a graceful failure
150 if ( unlikely(pinned) )
151 {
152 if ( unlikely(test_and_set_bit(_PGT_pinned,
153 &page->u.inuse.type_info)) )
154 BUG(); // hmm... someone pinned this again?
155 }
156 else
157 put_page_and_type(page);
159 return okay;
160 }
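/*
 * Within this file, shadow_promote() is used by alloc_shadow_page() below,
 * just before a guest frame becomes the target of an L1/L2/L3/L4 (or, on
 * 2-level builds, hl2) shadow; if promotion fails, alloc_shadow_page()
 * unwinds through its 'fail' path and frees the freshly allocated frame.
 */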
163 /*
164 * Things in shadow mode that collect get_page() refs to the domain's
165 * pages are:
166 * - PGC_allocated takes a gen count, just like normal.
167 * - A writable page can be pinned (paravirtualized guests may consider
168 * these pages to be L1s or L2s, and don't know the difference).
169 * Pinning a page takes a gen count (but, for domains in shadow mode,
170 * it *doesn't* take a type count)
171 * - CR3 grabs a ref to whatever it points at, just like normal.
172 * - Shadow mode grabs an initial gen count for itself, as a placeholder
173 * for whatever references will exist.
174 * - Shadow PTEs that point to a page take a gen count, just like regular
175 * PTEs. However, they don't get a type count, as get_page_type() is
176 * hardwired to keep writable pages' counts at 1 for domains in shadow
177 * mode.
178 * - Whenever we shadow a page, the entry in the shadow hash grabs a
179 * general ref to the page.
180 * - Whenever a page goes out of sync, the out of sync entry grabs a
181 * general ref to the page.
182 */
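/*
 * Worked example (derived from the rules above): a guest L1 page that is
 * PGC_allocated, mapped by one shadow PTE, present in the shadow hash and
 * on the out-of-sync list picks up one general reference from each of
 * those sources (on top of whatever the other bullets add), while its
 * writable type count stays at 1, since get_page_type() pins writable
 * counts there for domains in shadow mode.
 */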
183 /*
184 * page_info fields for pages allocated as shadow pages:
185 *
186 * All 32 bits of count_info are a simple count of refs to this shadow
187 * from a) other shadow pages, b) current CR3's (aka ed->arch.shadow_table),
188 * c) if it's a pinned shadow root pgtable, d) outstanding out-of-sync
189 * references.
190 *
191 * u.inuse._domain is left NULL, to prevent accidentally allowing some random
192 * domain from gaining permissions to map this page.
193 *
194 * u.inuse.type_info & PGT_type_mask remembers what kind of page is being
195 * shadowed.
196 * u.inuse.type_info & PGT_mfn_mask holds the mfn of the page being shadowed.
197 * u.inuse.type_info & PGT_pinned says that an extra reference to this shadow
198 * currently exists because this is a shadow of a root page, and we
199 * don't want to let those disappear just because no CR3 is currently pointing
200 * at it.
201 *
202 * tlbflush_timestamp holds a min & max index of valid page table entries
203 * within the shadow page.
204 */
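/*
 * For instance, if only entries 5..9 of a shadowed L1 have ever been
 * populated, tlbflush_timestamp records min=5, max=9 (packed with
 * SHADOW_ENCODE_MIN_MAX() and read back with SHADOW_MIN()/SHADOW_MAX(),
 * as in shadow_map_l1_into_current_l2() and resync_all() below), so a
 * resync or snapshot copy only has to touch those five entries.
 */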
205 static inline void
206 shadow_page_info_init(struct page_info *page,
207 unsigned long gmfn,
208 u32 psh_type)
209 {
210 ASSERT( (gmfn & ~PGT_mfn_mask) == 0 );
211 page->u.inuse.type_info = psh_type | gmfn;
212 page->count_info = 0;
213 page->tlbflush_timestamp = 0;
214 }
216 static inline unsigned long
217 alloc_shadow_page(struct domain *d,
218 unsigned long gpfn, unsigned long gmfn,
219 u32 psh_type)
220 {
221 struct page_info *page;
222 unsigned long smfn, real_gpfn;
223 int pin = 0;
224 void *l1, *lp;
225 u64 index = 0;
227 // Currently, we only keep pre-zero'ed pages around for use as L1's...
228 // This will change. Soon.
229 //
230 if ( psh_type == PGT_l1_shadow )
231 {
232 if ( !list_empty(&d->arch.free_shadow_frames) )
233 {
234 struct list_head *entry = d->arch.free_shadow_frames.next;
235 page = list_entry(entry, struct page_info, list);
236 list_del(entry);
237 perfc_decr(free_l1_pages);
238 }
239 else
240 {
241 if ( SH_L1_HAS_NEXT_PAGE &&
242 d->arch.ops->guest_paging_levels == PAGING_L2)
243 {
244 #if CONFIG_PAGING_LEVELS >= 3
245 /*
246 * For a 32-bit HVM guest, 2 shadow L1s are required to
247 * simulate 1 guest L1, so we need to allocate 2 shadow L1
248 * pages each time.
249 *
250 * --> Need to avoid alloc_domheap_pages.
251 */
252 page = alloc_domheap_pages(NULL, SL1_ORDER, 0);
253 if (!page)
254 goto no_shadow_page;
256 l1 = map_domain_page(page_to_mfn(page));
257 memset(l1, 0, PAGE_SIZE);
258 unmap_domain_page(l1);
260 l1 = map_domain_page(page_to_mfn(page + 1));
261 memset(l1, 0, PAGE_SIZE);
262 unmap_domain_page(l1);
264 /* We'd like to initialize the second contiguous page here
265 * and leave the first page's initialization until later. */
267 shadow_page_info_init(page+1, gmfn, psh_type);
268 #else
269 page = alloc_domheap_page(NULL);
270 if (!page)
271 goto no_shadow_page;
273 l1 = map_domain_page(page_to_mfn(page));
274 memset(l1, 0, PAGE_SIZE);
275 unmap_domain_page(l1);
276 #endif
277 }
278 else
279 {
280 page = alloc_domheap_page(NULL);
281 if (!page)
282 goto no_shadow_page;
284 l1 = map_domain_page(page_to_mfn(page));
285 memset(l1, 0, PAGE_SIZE);
286 unmap_domain_page(l1);
287 }
288 }
289 }
290 else {
291 #if CONFIG_PAGING_LEVELS == 2
292 page = alloc_domheap_page(NULL);
293 #elif CONFIG_PAGING_LEVELS >= 3
294 if ( d->arch.ops->guest_paging_levels == PAGING_L2 &&
295 psh_type == PGT_l4_shadow ) /* allocated for PAE PDP page */
296 page = alloc_domheap_pages(NULL, 0, MEMF_dma);
297 else if ( d->arch.ops->guest_paging_levels == PAGING_L3 &&
298 (psh_type == PGT_l3_shadow || psh_type == PGT_l4_shadow) )
299 page = alloc_domheap_pages(NULL, 0, MEMF_dma); /* allocated for PAE PDP page */
300 else
301 page = alloc_domheap_page(NULL);
302 #endif
303 if (!page)
304 goto no_shadow_page;
306 lp = map_domain_page(page_to_mfn(page));
307 memset(lp, 0, PAGE_SIZE);
308 unmap_domain_page(lp);
309 }
311 smfn = page_to_mfn(page);
313 shadow_page_info_init(page, gmfn, psh_type);
315 switch ( psh_type )
316 {
317 case PGT_l1_shadow:
318 if ( !shadow_promote(d, gpfn, gmfn, psh_type) )
319 goto fail;
320 perfc_incr(shadow_l1_pages);
321 d->arch.shadow_page_count++;
322 break;
324 case PGT_l2_shadow:
325 if ( !shadow_promote(d, gpfn, gmfn, psh_type) )
326 goto fail;
327 perfc_incr(shadow_l2_pages);
328 d->arch.shadow_page_count++;
329 if ( PGT_l2_page_table == PGT_root_page_table )
330 pin = 1;
332 break;
334 case PGT_l3_shadow:
335 if ( !shadow_promote(d, gpfn, gmfn, psh_type) )
336 goto fail;
337 perfc_incr(shadow_l3_pages);
338 d->arch.shadow_page_count++;
339 if ( PGT_l3_page_table == PGT_root_page_table )
340 pin = 1;
341 break;
343 case PGT_l4_shadow:
344 real_gpfn = gpfn & PGT_mfn_mask;
345 if ( !shadow_promote(d, real_gpfn, gmfn, psh_type) )
346 goto fail;
347 perfc_incr(shadow_l4_pages);
348 d->arch.shadow_page_count++;
349 if ( PGT_l4_page_table == PGT_root_page_table )
350 pin = 1;
351 #if CONFIG_PAGING_LEVELS == 3 && defined(GUEST_PGENTRY_32)
352 /*
353 * We use PGT_l4_shadow for 2-level paging guests on PAE
354 */
355 if ( d->arch.ops->guest_paging_levels == PAGING_L2 )
356 pin = 1;
357 #endif
359 #if CONFIG_PAGING_LEVELS == 3 && defined(GUEST_32PAE)
360 /*
361 * We use PGT_l4_shadow for 3-level (PAE) paging guests as well
362 */
363 if ( d->arch.ops->guest_paging_levels == PAGING_L3 )
364 pin = 1;
365 #endif
366 if ( d->arch.ops->guest_paging_levels == PAGING_L3 )
367 index = get_cr3_idxval(current);
368 break;
370 #if CONFIG_PAGING_LEVELS >= 3
371 case PGT_fl1_shadow:
372 perfc_incr(shadow_l1_pages);
373 d->arch.shadow_page_count++;
374 break;
375 #else
377 case PGT_hl2_shadow:
378 // Treat an hl2 as an L1 for purposes of promotion.
379 // For external mode domains, treat them as an L2 for purposes of
380 // pinning.
381 //
382 if ( !shadow_promote(d, gpfn, gmfn, PGT_l1_shadow) )
383 goto fail;
384 perfc_incr(hl2_table_pages);
385 d->arch.hl2_page_count++;
386 if ( shadow_mode_external(d) &&
387 (PGT_l2_page_table == PGT_root_page_table) )
388 pin = 1;
390 break;
391 #endif
392 case PGT_snapshot:
393 perfc_incr(snapshot_pages);
394 d->arch.snapshot_page_count++;
395 break;
397 default:
398 printk("Alloc shadow weird page type type=%08x\n", psh_type);
399 BUG();
400 break;
401 }
403 // Don't add a new shadow of something that already has a snapshot.
404 //
405 ASSERT( (psh_type == PGT_snapshot) || !mfn_out_of_sync(gmfn) );
407 set_shadow_status(d, gpfn, gmfn, smfn, psh_type, index);
409 if ( pin )
410 shadow_pin(smfn);
412 return smfn;
414 fail:
415 FSH_LOG("promotion of pfn=%lx mfn=%lx failed! external gnttab refs?",
416 gpfn, gmfn);
417 if (psh_type == PGT_l1_shadow)
418 {
419 if (d->arch.ops->guest_paging_levels == PAGING_L2)
420 {
421 #if CONFIG_PAGING_LEVELS >=3
422 free_domheap_pages(page, SL1_ORDER);
423 #else
424 free_domheap_page(page);
425 #endif
426 }
427 else
428 free_domheap_page(page);
429 }
430 else
431 free_domheap_page(page);
433 return 0;
435 no_shadow_page:
436 ASSERT(page == NULL);
437 printk("Couldn't alloc shadow page! dom%d count=%d\n",
438 d->domain_id, d->arch.shadow_page_count);
439 printk("Shadow table counts: l1=%d l2=%d hl2=%d snapshot=%d\n",
440 perfc_value(shadow_l1_pages),
441 perfc_value(shadow_l2_pages),
442 perfc_value(hl2_table_pages),
443 perfc_value(snapshot_pages));
444 /* XXX FIXME: try a shadow flush to free up some memory. */
445 domain_crash_synchronous();
447 return 0;
448 }
450 #if CONFIG_PAGING_LEVELS == 2
451 static unsigned long
452 shadow_hl2_table(struct domain *d, unsigned long gpfn, unsigned long gmfn,
453 unsigned long smfn)
454 {
455 unsigned long hl2mfn;
456 l1_pgentry_t *hl2;
457 int limit;
459 ASSERT(PGT_base_page_table == PGT_l2_page_table);
461 if ( unlikely(!(hl2mfn = alloc_shadow_page(d, gpfn, gmfn, PGT_hl2_shadow))) )
462 {
463 printk("Couldn't alloc an HL2 shadow for pfn=%lx mfn=%lx\n",
464 gpfn, gmfn);
465 BUG(); /* XXX Deal gracefully with failure. */
466 }
468 SH_VVLOG("shadow_hl2_table(gpfn=%lx, gmfn=%lx, smfn=%lx) => %lx",
469 gpfn, gmfn, smfn, hl2mfn);
470 perfc_incrc(shadow_hl2_table_count);
472 hl2 = map_domain_page(hl2mfn);
474 if ( shadow_mode_external(d) )
475 limit = L2_PAGETABLE_ENTRIES;
476 else
477 limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE;
479 memset(hl2, 0, limit * sizeof(l1_pgentry_t));
481 if ( !shadow_mode_external(d) )
482 {
483 memset(&hl2[DOMAIN_ENTRIES_PER_L2_PAGETABLE], 0,
484 HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
486 // Setup easy access to the GL2, SL2, and HL2 frames.
487 //
488 hl2[l2_table_offset(LINEAR_PT_VIRT_START)] =
489 l1e_from_pfn(gmfn, __PAGE_HYPERVISOR);
490 hl2[l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
491 l1e_from_pfn(smfn, __PAGE_HYPERVISOR);
492 hl2[l2_table_offset(PERDOMAIN_VIRT_START)] =
493 l1e_from_pfn(hl2mfn, __PAGE_HYPERVISOR);
494 }
496 unmap_domain_page(hl2);
498 return hl2mfn;
499 }
501 /*
502 * This could take and use a snapshot, and validate the entire page at
503 * once, or it could continue to fault in entries one at a time...
504 * Might be worth investigating...
505 */
506 static unsigned long shadow_l2_table(
507 struct vcpu *v, unsigned long gpfn, unsigned long gmfn)
508 {
509 unsigned long smfn;
510 l2_pgentry_t *spl2e;
511 struct domain *d = v->domain;
512 int i;
514 SH_VVLOG("shadow_l2_table(gpfn=%lx, gmfn=%lx)", gpfn, gmfn);
516 perfc_incrc(shadow_l2_table_count);
518 if ( unlikely(!(smfn = alloc_shadow_page(d, gpfn, gmfn, PGT_l2_shadow))) )
519 {
520 printk("Couldn't alloc an L2 shadow for pfn=%lx mfn=%lx\n",
521 gpfn, gmfn);
522 BUG(); /* XXX Deal gracefully with failure. */
523 }
525 spl2e = (l2_pgentry_t *)map_domain_page(smfn);
527 /* Install hypervisor and 2x linear p.t. mappings. */
528 if ( (PGT_base_page_table == PGT_l2_page_table) &&
529 !shadow_mode_external(d) )
530 {
531 /*
532 * We could proactively fill in PDEs for pages that are already
533 * shadowed *and* where the guest PDE has _PAGE_ACCESSED set
534 * (restriction required for coherence of the accessed bit). However,
535 * we tried it and it didn't help performance. This is simpler.
536 */
537 memset(spl2e, 0, DOMAIN_ENTRIES_PER_L2_PAGETABLE*sizeof(l2_pgentry_t));
539 /* Install hypervisor and 2x linear p.t. mappings. */
540 memcpy(&spl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
541 &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
542 HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
544 spl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
545 l2e_from_pfn(smfn, __PAGE_HYPERVISOR);
547 for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
548 spl2e[l2_table_offset(PERDOMAIN_VIRT_START) + i] =
549 l2e_from_page(virt_to_page(page_get_owner(mfn_to_page(gmfn))->
550 arch.mm_perdomain_pt) + i,
551 __PAGE_HYPERVISOR);
553 if ( shadow_mode_translate(d) ) // NB: not external
554 {
555 unsigned long hl2mfn;
557 spl2e[l2_table_offset(RO_MPT_VIRT_START)] =
558 l2e_from_paddr(pagetable_get_paddr(d->arch.phys_table),
559 __PAGE_HYPERVISOR);
561 if ( unlikely(!(hl2mfn = __shadow_status(d, gpfn, PGT_hl2_shadow))) )
562 hl2mfn = shadow_hl2_table(d, gpfn, gmfn, smfn);
564 // shadow_mode_translate (but not external) sl2 tables hold a
565 // ref to their hl2.
566 //
567 if ( !get_shadow_ref(hl2mfn) )
568 BUG();
570 spl2e[l2_table_offset(LINEAR_PT_VIRT_START)] =
571 l2e_from_pfn(hl2mfn, __PAGE_HYPERVISOR);
572 }
573 else
574 spl2e[l2_table_offset(LINEAR_PT_VIRT_START)] =
575 l2e_from_pfn(gmfn, __PAGE_HYPERVISOR);
576 }
577 else
578 {
579 memset(spl2e, 0, L2_PAGETABLE_ENTRIES*sizeof(l2_pgentry_t));
580 }
582 unmap_domain_page(spl2e);
584 SH_VLOG("shadow_l2_table(%lx -> %lx)", gmfn, smfn);
585 return smfn;
586 }
587 #endif /* CONFIG_PAGING_LEVELS == 2 */
589 static void shadow_map_l1_into_current_l2(unsigned long va)
590 {
591 struct vcpu *v = current;
592 struct domain *d = v->domain;
593 l1_pgentry_t *spl1e, *spl1e_next = 0;
594 l2_pgentry_t sl2e;
595 guest_l1_pgentry_t *gpl1e;
596 guest_l2_pgentry_t gl2e = {0};
597 unsigned long gl1pfn, gl1mfn, sl1mfn;
598 int i, init_table = 0;
600 __guest_get_l2e(v, va, &gl2e);
601 ASSERT(guest_l2e_get_flags(gl2e) & _PAGE_PRESENT);
602 gl1pfn = l2e_get_pfn(gl2e);
604 if ( !(sl1mfn = __shadow_status(d, gl1pfn, PGT_l1_shadow)) )
605 {
606 /* This L1 is NOT already shadowed so we need to shadow it. */
607 SH_VVLOG("4a: l1 not shadowed");
609 gl1mfn = gmfn_to_mfn(d, gl1pfn);
610 if ( unlikely(!VALID_MFN(gl1mfn)) )
611 {
612 // Attempt to use an invalid pfn as an L1 page.
613 // XXX this needs to be more graceful!
614 BUG();
615 }
617 if ( unlikely(!(sl1mfn =
618 alloc_shadow_page(d, gl1pfn, gl1mfn, PGT_l1_shadow))) )
619 {
620 printk("Couldn't alloc an L1 shadow for pfn=%lx mfn=%lx\n",
621 gl1pfn, gl1mfn);
622 BUG(); /* XXX Need to deal gracefully with failure. */
623 }
625 perfc_incrc(shadow_l1_table_count);
626 init_table = 1;
627 }
628 else
629 {
630 /* This L1 is shadowed already, but the L2 entry is missing. */
631 SH_VVLOG("4b: was shadowed, l2 missing (%lx)", sl1mfn);
632 }
634 #ifndef NDEBUG
635 {
636 l2_pgentry_t old_sl2e;
637 __shadow_get_l2e(v, va, &old_sl2e);
638 ASSERT(!(l2e_get_flags(old_sl2e) & _PAGE_PRESENT));
639 }
640 #endif
642 #if CONFIG_PAGING_LEVELS >= 3
643 if ( SH_L1_HAS_NEXT_PAGE &&
644 d->arch.ops->guest_paging_levels == PAGING_L2 )
645 {
646 /* For a 32-bit HVM guest on a 64-bit or PAE host, we
647 * need to update two L2 entries each time
648 */
649 if ( !get_shadow_ref(sl1mfn))
650 BUG();
651 l2pde_general(d, &gl2e, &sl2e, sl1mfn);
652 __guest_set_l2e(v, va, &gl2e);
653 __shadow_set_l2e(v, va & ~((1<<L2_PAGETABLE_SHIFT_32) - 1), &sl2e);
654 if ( !get_shadow_ref(sl1mfn+1))
655 BUG();
656 sl2e = l2e_empty();
657 l2pde_general(d, &gl2e, &sl2e, sl1mfn+1);
658 __shadow_set_l2e(v,((va & ~((1<<L2_PAGETABLE_SHIFT_32) - 1)) + (1 << L2_PAGETABLE_SHIFT)) , &sl2e);
659 } else
660 #endif
661 {
662 if ( !get_shadow_ref(sl1mfn) )
663 BUG();
664 l2pde_general(d, &gl2e, &sl2e, sl1mfn);
665 __guest_set_l2e(v, va, &gl2e);
666 __shadow_set_l2e(v, va , &sl2e);
667 }
669 if ( init_table )
670 {
671 l1_pgentry_t sl1e;
672 int index = guest_l1_table_offset(va);
673 int min = 1, max = 0;
675 unsigned long tmp_gmfn;
676 l2_pgentry_t tmp_sl2e = {0};
677 guest_l2_pgentry_t tmp_gl2e = {0};
679 __guest_get_l2e(v, va, &tmp_gl2e);
680 tmp_gmfn = gmfn_to_mfn(d, l2e_get_pfn(tmp_gl2e));
681 gpl1e = (guest_l1_pgentry_t *) map_domain_page(tmp_gmfn);
683 /* If the PGT_l1_shadow has two contiguous pages */
684 #if CONFIG_PAGING_LEVELS >= 3
685 if ( SH_L1_HAS_NEXT_PAGE &&
686 d->arch.ops->guest_paging_levels == PAGING_L2 )
687 __shadow_get_l2e(v, va & ~((1UL << L2_PAGETABLE_SHIFT_32) - 1), &tmp_sl2e);
688 else
689 #endif
690 __shadow_get_l2e(v, va, &tmp_sl2e);
692 spl1e = (l1_pgentry_t *) map_domain_page(l2e_get_pfn(tmp_sl2e));
694 if ( SH_L1_HAS_NEXT_PAGE )
695 spl1e_next = (l1_pgentry_t *) map_domain_page(
696 (l2e_get_pfn(tmp_sl2e) + 1UL));
698 for ( i = 0; i < GUEST_L1_PAGETABLE_ENTRIES; i++ )
699 {
700 l1pte_propagate_from_guest(d, gpl1e[i], &sl1e);
701 if ( (l1e_get_flags(sl1e) & _PAGE_PRESENT) &&
702 unlikely(!shadow_get_page_from_l1e(sl1e, d)) )
703 sl1e = l1e_empty();
704 if ( l1e_get_flags(sl1e) == 0 )
705 {
706 // First copy entries from 0 until first invalid.
707 // Then copy entries from index until first invalid.
708 //
709 if ( i < index ) {
710 i = index - 1;
711 continue;
712 }
713 break;
714 }
716 if ( SH_L1_HAS_NEXT_PAGE && i >= L1_PAGETABLE_ENTRIES )
717 spl1e_next[i - L1_PAGETABLE_ENTRIES] = sl1e;
718 else
719 spl1e[i] = sl1e;
721 if ( unlikely(i < min) )
722 min = i;
723 if ( likely(i > max) )
724 max = i;
725 set_guest_back_ptr(d, sl1e, sl1mfn, i);
726 }
728 mfn_to_page(sl1mfn)->tlbflush_timestamp =
729 SHADOW_ENCODE_MIN_MAX(min, max);
731 unmap_domain_page(gpl1e);
732 unmap_domain_page(spl1e);
734 if ( SH_L1_HAS_NEXT_PAGE )
735 unmap_domain_page(spl1e_next);
736 }
737 }
739 #if CONFIG_PAGING_LEVELS == 2
740 static void
741 shadow_set_l1e(unsigned long va, l1_pgentry_t new_spte, int create_l1_shadow)
742 {
743 struct vcpu *v = current;
744 struct domain *d = v->domain;
745 l2_pgentry_t sl2e = {0};
747 __shadow_get_l2e(v, va, &sl2e);
748 if ( !(l2e_get_flags(sl2e) & _PAGE_PRESENT) )
749 {
750 /*
751 * Either the L1 is not shadowed, or the shadow isn't linked into
752 * the current shadow L2.
753 */
754 if ( create_l1_shadow )
755 {
756 perfc_incrc(shadow_set_l1e_force_map);
757 shadow_map_l1_into_current_l2(va);
758 }
759 else /* check to see if it exists; if so, link it in */
760 {
761 l2_pgentry_t gpde = {0};
762 unsigned long gl1pfn;
763 unsigned long sl1mfn;
765 __guest_get_l2e(v, va, &gpde);
767 if ( l2e_get_flags(gpde) & _PAGE_PRESENT )
768 {
769 gl1pfn = l2e_get_pfn(gpde);
770 sl1mfn = __shadow_status(d, gl1pfn, PGT_l1_shadow);
771 }
772 else
773 {
774 // no shadow exists, so there's nothing to do.
775 perfc_incrc(shadow_set_l1e_fail);
776 return;
777 }
779 if ( sl1mfn )
780 {
781 perfc_incrc(shadow_set_l1e_unlinked);
782 if ( !get_shadow_ref(sl1mfn) )
783 BUG();
784 l2pde_general(d, (guest_l2_pgentry_t *)&gpde, &sl2e, sl1mfn);
785 __guest_set_l2e(v, va, &gpde);
786 __shadow_set_l2e(v, va, &sl2e);
787 }
788 else
789 {
790 // no shadow exists, so there's nothing to do.
791 perfc_incrc(shadow_set_l1e_fail);
792 return;
793 }
794 }
795 }
797 __shadow_get_l2e(v, va, &sl2e);
799 if ( shadow_mode_refcounts(d) )
800 {
801 l1_pgentry_t old_spte;
802 __shadow_get_l1e(v, va, &old_spte);
804 // only do the ref counting if something important changed.
805 //
806 if ( l1e_has_changed(old_spte, new_spte, _PAGE_RW | _PAGE_PRESENT) )
807 {
808 if ( (l1e_get_flags(new_spte) & _PAGE_PRESENT) &&
809 !shadow_get_page_from_l1e(new_spte, d) )
810 new_spte = l1e_empty();
811 if ( l1e_get_flags(old_spte) & _PAGE_PRESENT )
812 shadow_put_page_from_l1e(old_spte, d);
813 }
814 }
816 set_guest_back_ptr(d, new_spte, l2e_get_pfn(sl2e), l1_table_offset(va));
817 __shadow_set_l1e(v, va, &new_spte);
818 shadow_update_min_max(l2e_get_pfn(sl2e), l1_table_offset(va));
819 }
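/*
 * Note on create_l1_shadow above: callers pass 1 when they are prepared to
 * have a missing L1 shadow allocated and mapped on the spot (the
 * shadow_map_l1_into_current_l2() path), and 0 when they only want an
 * already-existing shadow linked in, giving up otherwise (the
 * shadow_set_l1e_fail path).
 */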
821 static void shadow_invlpg_32(struct vcpu *v, unsigned long va)
822 {
823 struct domain *d = v->domain;
824 l1_pgentry_t gpte, spte;
826 ASSERT(shadow_mode_enabled(d));
828 shadow_lock(d);
830 __shadow_sync_va(v, va);
832 // XXX mafetter: will need to think about 4MB pages...
834 // It's not strictly necessary to update the shadow here,
835 // but it might save a fault later.
836 //
837 /*if (__copy_from_user(&gpte, &linear_pg_table[va >> PAGE_SHIFT],
838 sizeof(gpte))) {*/
839 if (unlikely(!__guest_get_l1e(v, va, &gpte))) {
840 perfc_incrc(shadow_invlpg_faults);
841 shadow_unlock(d);
842 return;
843 }
844 l1pte_propagate_from_guest(d, gpte, &spte);
845 shadow_set_l1e(va, spte, 1);
847 shadow_unlock(d);
848 }
849 #endif /* CONFIG_PAGING_LEVELS == 2 */
851 #if CONFIG_PAGING_LEVELS >= 3
852 static void shadow_set_l1e_64(
853 unsigned long va, pgentry_64_t *sl1e_p,
854 int create_l1_shadow)
855 {
856 struct vcpu *v = current;
857 struct domain *d = v->domain;
858 pgentry_64_t sle = { 0 };
859 pgentry_64_t sle_up = {0};
860 l1_pgentry_t old_spte;
861 l1_pgentry_t sl1e = *(l1_pgentry_t *)sl1e_p;
862 int i;
863 unsigned long orig_va = 0;
865 if ( d->arch.ops->guest_paging_levels == PAGING_L2 )
866 {
867 /* This is for 32-bit VMX guest on 64-bit host */
868 orig_va = va;
869 va = va & (~((1<<L2_PAGETABLE_SHIFT_32)-1));
870 }
872 for ( i = PAGING_L4; i >= PAGING_L2; i-- )
873 {
874 if ( !__rw_entry(v, va, &sle, SHADOW_ENTRY | GET_ENTRY | i) )
875 {
876 sl1e = l1e_empty();
877 goto out;
878 }
879 if ( !(entry_get_flags(sle) & _PAGE_PRESENT) )
880 {
881 if ( create_l1_shadow )
882 {
883 perfc_incrc(shadow_set_l3e_force_map);
884 shadow_map_into_current(v, va, i-1, i);
885 __rw_entry(v, va, &sle, SHADOW_ENTRY | GET_ENTRY | i);
886 }
887 }
888 if ( d->arch.ops->guest_paging_levels == PAGING_L3 )
889 {
890 if ( i < PAGING_L3 )
891 shadow_update_min_max(entry_get_pfn(sle_up), table_offset_64(va, i));
892 }
893 else
894 {
895 if ( i < PAGING_L4 )
896 shadow_update_min_max(entry_get_pfn(sle_up), table_offset_64(va, i));
897 }
899 sle_up = sle;
900 }
902 if ( d->arch.ops->guest_paging_levels == PAGING_L2 )
903 {
904 va = orig_va;
905 }
907 if ( shadow_mode_refcounts(d) )
908 {
909 __shadow_get_l1e(v, va, &old_spte);
910 if ( l1e_has_changed(old_spte, sl1e, _PAGE_RW | _PAGE_PRESENT) )
911 {
912 if ( (l1e_get_flags(sl1e) & _PAGE_PRESENT) &&
913 !shadow_get_page_from_l1e(sl1e, d) )
914 sl1e = l1e_empty();
915 if ( l1e_get_flags(old_spte) & _PAGE_PRESENT )
916 put_page_from_l1e(old_spte, d);
917 }
918 }
920 out:
921 __shadow_set_l1e(v, va, &sl1e);
923 shadow_update_min_max(entry_get_pfn(sle_up), guest_l1_table_offset(va));
924 }
925 #endif /* CONFIG_PAGING_LEVELS >= 3 */
927 static struct out_of_sync_entry *
928 shadow_alloc_oos_entry(struct domain *d)
929 {
930 struct out_of_sync_entry *f, *extra;
931 unsigned size, i;
933 if ( unlikely(d->arch.out_of_sync_free == NULL) )
934 {
935 FSH_LOG("Allocate more fullshadow tuple blocks.");
937 size = sizeof(void *) + (out_of_sync_extra_size * sizeof(*f));
938 extra = xmalloc_bytes(size);
940 /* XXX Should be more graceful here. */
941 if ( extra == NULL )
942 BUG();
944 memset(extra, 0, size);
946 /* Record the allocation block so it can be correctly freed later. */
947 d->arch.out_of_sync_extras_count++;
948 *((struct out_of_sync_entry **)&extra[out_of_sync_extra_size]) =
949 d->arch.out_of_sync_extras;
950 d->arch.out_of_sync_extras = &extra[0];
952 /* Thread a free chain through the newly-allocated nodes. */
953 for ( i = 0; i < (out_of_sync_extra_size - 1); i++ )
954 extra[i].next = &extra[i+1];
955 extra[i].next = NULL;
957 /* Add the new nodes to the free list. */
958 d->arch.out_of_sync_free = &extra[0];
959 }
961 /* Allocate a new node from the quicklist. */
962 f = d->arch.out_of_sync_free;
963 d->arch.out_of_sync_free = f->next;
965 return f;
966 }
968 static inline unsigned long
969 shadow_make_snapshot(
970 struct domain *d, unsigned long gpfn, unsigned long gmfn)
971 {
972 unsigned long smfn, sl1mfn = 0;
973 void *original, *snapshot;
974 u32 min_max = 0;
975 int min, max, length;
977 if ( test_and_set_bit(_PGC_out_of_sync, &mfn_to_page(gmfn)->count_info) )
978 {
979 ASSERT(__shadow_status(d, gpfn, PGT_snapshot));
980 return SHADOW_SNAPSHOT_ELSEWHERE;
981 }
983 perfc_incrc(shadow_make_snapshot);
985 if ( unlikely(!(smfn = alloc_shadow_page(d, gpfn, gmfn, PGT_snapshot))) )
986 {
987 printk("Couldn't alloc fullshadow snapshot for pfn=%lx mfn=%lx!\n"
988 "Dom%d snapshot_count_count=%d\n",
989 gpfn, gmfn, d->domain_id, d->arch.snapshot_page_count);
990 BUG(); /* XXX FIXME: try a shadow flush to free up some memory. */
991 }
993 if ( !get_shadow_ref(smfn) )
994 BUG();
996 if ( shadow_mode_refcounts(d) &&
997 (shadow_max_pgtable_type(d, gpfn, &sl1mfn) == PGT_l1_shadow) )
998 min_max = mfn_to_page(sl1mfn)->tlbflush_timestamp;
999 mfn_to_page(smfn)->tlbflush_timestamp = min_max;
1001 min = SHADOW_MIN(min_max);
1002 max = SHADOW_MAX(min_max);
1003 length = max - min + 1;
1004 perfc_incr_histo(snapshot_copies, length, PT_UPDATES);
1006 min *= sizeof(guest_l1_pgentry_t);
1007 length *= sizeof(guest_l1_pgentry_t);
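/*
 * Example: a min_max encoding of min=3, max=7 means guest PTEs 3..7 were
 * ever valid, so the copy below moves 5 * sizeof(guest_l1_pgentry_t)
 * bytes starting at byte offset 3 * sizeof(guest_l1_pgentry_t).
 */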
1009 original = map_domain_page(gmfn);
1010 snapshot = map_domain_page(smfn);
1011 memcpy(snapshot + min, original + min, length);
1012 unmap_domain_page(original);
1013 unmap_domain_page(snapshot);
1015 return smfn;
1016 }
1018 static struct out_of_sync_entry *
1019 __mark_mfn_out_of_sync(struct vcpu *v, unsigned long gpfn,
1020 unsigned long mfn)
1022 struct domain *d = v->domain;
1023 struct page_info *page = mfn_to_page(mfn);
1024 struct out_of_sync_entry *entry = shadow_alloc_oos_entry(d);
1026 ASSERT(shadow_lock_is_acquired(d));
1027 ASSERT(mfn_valid(mfn));
1029 #ifndef NDEBUG
1031 u32 type = page->u.inuse.type_info & PGT_type_mask;
1032 if ( shadow_mode_refcounts(d) )
1034 ASSERT(type == PGT_writable_page);
1036 else
1038 ASSERT(type && (type < PGT_l4_page_table));
1041 #endif
1043 FSH_LOG("%s(gpfn=%lx, mfn=%lx) c=%08x t=%08x", __func__,
1044 gpfn, mfn, page->count_info, page->u.inuse.type_info);
1046 // XXX this will require some more thought... Cross-domain sharing and
1047 // modification of page tables? Hmm...
1048 //
1049 if ( d != page_get_owner(page) )
1050 BUG();
1052 perfc_incrc(shadow_mark_mfn_out_of_sync_calls);
1054 entry->v = v;
1055 entry->gpfn = gpfn;
1056 entry->gmfn = mfn;
1057 entry->writable_pl1e = -1;
1059 #if 0 // this code has not been updated for 32pae & 64 bit modes
1060 #if SHADOW_DEBUG
1061 mark_shadows_as_reflecting_snapshot(d, gpfn);
1062 #endif
1063 #endif
1065 // increment guest's ref count to represent the entry in the
1066 // full shadow out-of-sync list.
1067 //
1068 get_page(page, d);
1070 return entry;
1073 static struct out_of_sync_entry *
1074 mark_mfn_out_of_sync(struct vcpu *v, unsigned long gpfn,
1075 unsigned long mfn)
1077 struct out_of_sync_entry *entry =
1078 __mark_mfn_out_of_sync(v, gpfn, mfn);
1079 struct domain *d = v->domain;
1081 entry->snapshot_mfn = shadow_make_snapshot(d, gpfn, mfn);
1082 // Add to the out-of-sync list
1083 //
1084 entry->next = d->arch.out_of_sync;
1085 d->arch.out_of_sync = entry;
1087 return entry;
1091 static void shadow_mark_va_out_of_sync(
1092 struct vcpu *v, unsigned long gpfn, unsigned long mfn, unsigned long va)
1094 struct out_of_sync_entry *entry =
1095 __mark_mfn_out_of_sync(v, gpfn, mfn);
1096 l2_pgentry_t sl2e;
1097 struct domain *d = v->domain;
1099 #if CONFIG_PAGING_LEVELS >= 3
1101 l4_pgentry_t sl4e;
1102 l3_pgentry_t sl3e;
1104 __shadow_get_l4e(v, va, &sl4e);
1105 if ( !(l4e_get_flags(sl4e) & _PAGE_PRESENT)) {
1106 shadow_map_into_current(v, va, PAGING_L3, PAGING_L4);
1109 if (!__shadow_get_l3e(v, va, &sl3e)) {
1110 BUG();
1113 if ( !(l3e_get_flags(sl3e) & _PAGE_PRESENT)) {
1114 shadow_map_into_current(v, va, PAGING_L2, PAGING_L3);
1117 #endif
1119 // We need the address of the shadow PTE that maps @va.
1120 // It might not exist yet. Make sure it's there.
1121 //
1122 __shadow_get_l2e(v, va, &sl2e);
1123 if ( !(l2e_get_flags(sl2e) & _PAGE_PRESENT) )
1125 // either this L1 isn't shadowed yet, or the shadow isn't linked into
1126 // the current L2.
1127 shadow_map_l1_into_current_l2(va);
1128 __shadow_get_l2e(v, va, &sl2e);
1130 ASSERT(l2e_get_flags(sl2e) & _PAGE_PRESENT);
1132 entry->snapshot_mfn = shadow_make_snapshot(d, gpfn, mfn);
1133 // NB: this is stored as a machine address.
1134 entry->writable_pl1e =
1135 l2e_get_paddr(sl2e) | (sizeof(l1_pgentry_t) * l1_table_offset(va));
1136 ASSERT( !(entry->writable_pl1e & (sizeof(l1_pgentry_t)-1)) );
1137 entry->va = va;
1139 // Increment shadow's page count to represent the reference
1140 // inherent in entry->writable_pl1e
1141 //
1142 if ( !get_shadow_ref(l2e_get_pfn(sl2e)) )
1143 BUG();
1145 // Add to the out-of-sync list
1146 //
1147 entry->next = d->arch.out_of_sync;
1148 d->arch.out_of_sync = entry;
1150 FSH_LOG("%s(va=%lx -> writable_pl1e=%lx)",
1151 __func__, va, entry->writable_pl1e);
1154 /*
1155 * Returns 1 if the snapshot for @gmfn exists and its @index'th entry matches.
1156 * Returns 0 otherwise.
1157 */
1158 static int snapshot_entry_matches(
1159 struct domain *d, guest_l1_pgentry_t *guest_pt,
1160 unsigned long gpfn, unsigned index)
1162 unsigned long smfn = __shadow_status(d, gpfn, PGT_snapshot);
1163 guest_l1_pgentry_t *snapshot, gpte; // could be L1s or L2s or ...
1164 int entries_match;
1166 perfc_incrc(snapshot_entry_matches_calls);
1168 if ( !smfn )
1169 return 0;
1171 snapshot = map_domain_page(smfn);
1173 if (__copy_from_user(&gpte, &guest_pt[index],
1174 sizeof(gpte)))
1176 unmap_domain_page(snapshot);
1177 return 0;
1180 // This could probably be smarter, but this is sufficient for
1181 // our current needs.
1182 //
1183 entries_match = !guest_l1e_has_changed(gpte, snapshot[index],
1184 PAGE_FLAG_MASK);
1186 unmap_domain_page(snapshot);
1188 #ifdef PERF_COUNTERS
1189 if ( entries_match )
1190 perfc_incrc(snapshot_entry_matches_true);
1191 #endif
1193 return entries_match;
1196 /*
1197 * Returns 1 if va's shadow mapping is out-of-sync.
1198 * Returns 0 otherwise.
1199 */
1200 static int is_out_of_sync(struct vcpu *v, unsigned long va) /* __shadow_out_of_sync */
1202 struct domain *d = v->domain;
1203 #if CONFIG_PAGING_LEVELS == 4
1204 unsigned long l2mfn = ((v->arch.flags & TF_kernel_mode)?
1205 pagetable_get_pfn(v->arch.guest_table) :
1206 pagetable_get_pfn(v->arch.guest_table_user));
1207 #else
1208 unsigned long l2mfn = pagetable_get_pfn(v->arch.guest_table);
1209 #endif
1210 unsigned long l2pfn = mfn_to_gmfn(d, l2mfn);
1211 guest_l2_pgentry_t l2e;
1212 unsigned long l1pfn, l1mfn;
1213 guest_l1_pgentry_t *guest_pt;
1215 ASSERT(shadow_lock_is_acquired(d));
1216 ASSERT(VALID_M2P(l2pfn));
1218 perfc_incrc(shadow_out_of_sync_calls);
1220 #if CONFIG_PAGING_LEVELS >= 3
1222 #define unmap_and_return(x) \
1223 if ( guest_pt != (guest_l1_pgentry_t *) v->arch.guest_vtable ) \
1224 unmap_domain_page(guest_pt); \
1225 return (x);
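/*
 * unmap_and_return() exists because guest_pt below may be either the
 * long-lived v->arch.guest_vtable mapping or a temporary
 * map_domain_page() mapping; the macro drops the temporary mapping (and
 * only that) before any early return out of the table walk that follows.
 */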
1227 if (d->arch.ops->guest_paging_levels >= PAGING_L3)
1229 pgentry_64_t le;
1230 unsigned long gmfn;
1231 unsigned long gpfn;
1232 int i;
1233 unsigned int base_idx = 0;
1234 base_idx = get_cr3_idxval(v);
1236 gmfn = l2mfn;
1237 gpfn = l2pfn;
1238 guest_pt = (guest_l1_pgentry_t *)v->arch.guest_vtable;
1240 for ( i = PAGING_L4; i >= PAGING_L3; i-- )
1242 if (d->arch.ops->guest_paging_levels == PAGING_L3
1243 && i == PAGING_L4)
1244 continue; /* skip the top-level for 3-level */
1246 if ( page_out_of_sync(mfn_to_page(gmfn)) &&
1247 !snapshot_entry_matches(
1248 d, guest_pt, gpfn, guest_table_offset_64(va, i, base_idx)) )
1250 unmap_and_return (1);
1253 le = entry_empty();
1254 __rw_entry(v, va, &le, GUEST_ENTRY | GET_ENTRY | i);
1256 if ( !(entry_get_flags(le) & _PAGE_PRESENT) )
1258 unmap_and_return (0);
1260 gpfn = entry_get_pfn(le);
1261 gmfn = gmfn_to_mfn(d, gpfn);
1262 if ( !VALID_MFN(gmfn) )
1264 unmap_and_return (0);
1266 if ( guest_pt != (guest_l1_pgentry_t *)v->arch.guest_vtable )
1267 unmap_domain_page(guest_pt);
1268 guest_pt = (guest_l1_pgentry_t *)map_domain_page(gmfn);
1271 /* L2 */
1272 if ( page_out_of_sync(mfn_to_page(gmfn)) &&
1273 !snapshot_entry_matches(d, guest_pt, gpfn, l2_table_offset(va)) )
1275 unmap_and_return (1);
1278 if ( guest_pt != (guest_l1_pgentry_t *)v->arch.guest_vtable )
1279 unmap_domain_page(guest_pt);
1282 else
1283 #undef unmap_and_return
1284 #endif /* CONFIG_PAGING_LEVELS >= 3 */
1286 if ( page_out_of_sync(mfn_to_page(l2mfn)) &&
1287 !snapshot_entry_matches(d, (guest_l1_pgentry_t *)v->arch.guest_vtable,
1288 l2pfn, guest_l2_table_offset(va)) )
1289 return 1;
1292 __guest_get_l2e(v, va, &l2e);
1293 if ( !(guest_l2e_get_flags(l2e) & _PAGE_PRESENT) ||
1294 (guest_l2e_get_flags(l2e) & _PAGE_PSE))
1295 return 0;
1297 l1pfn = l2e_get_pfn(l2e);
1298 l1mfn = gmfn_to_mfn(d, l1pfn);
1300 // If the l1 pfn is invalid, it can't be out of sync...
1301 if ( !VALID_MFN(l1mfn) )
1302 return 0;
1304 guest_pt = (guest_l1_pgentry_t *) map_domain_page(l1mfn);
1306 if ( page_out_of_sync(mfn_to_page(l1mfn)) &&
1307 !snapshot_entry_matches(
1308 d, guest_pt, l1pfn, guest_l1_table_offset(va)) )
1310 unmap_domain_page(guest_pt);
1311 return 1;
1314 unmap_domain_page(guest_pt);
1315 return 0;
1318 #define GPFN_TO_GPTEPAGE(_gpfn) ((_gpfn) / (PAGE_SIZE / sizeof(guest_l1_pgentry_t)))
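/*
 * This divides gpfn by the number of guest PTEs that fit in one page --
 * 1024 for 4-byte (2-level) entries, 512 for 8-byte (PAE/64-bit) entries
 * -- and the result is used as the key for the PGT_writable_pred entries
 * in the shadow status table, as in the prediction helpers below.
 */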
1319 static inline unsigned long
1320 predict_writable_pte_page(struct domain *d, unsigned long gpfn)
1322 return __shadow_status(d, GPFN_TO_GPTEPAGE(gpfn), PGT_writable_pred);
1325 static inline void
1326 increase_writable_pte_prediction(struct domain *d, unsigned long gpfn, unsigned long prediction)
1328 unsigned long score = prediction & PGT_score_mask;
1329 int create = (score == 0);
1331 // saturating addition
1332 score = (score + (1u << PGT_score_shift)) & PGT_score_mask;
1333 score = score ? score : PGT_score_mask;
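/*
 * E.g. when the score field is already at PGT_score_mask, the addition
 * above carries out of the field and the mask leaves 0; the line above
 * then saturates it back to PGT_score_mask instead of letting a hot
 * prediction reset to zero.
 */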
1335 prediction = (prediction & PGT_mfn_mask) | score;
1337 //printk("increase gpfn=%lx pred=%lx create=%d\n", gpfn, prediction, create);
1338 set_shadow_status(d, GPFN_TO_GPTEPAGE(gpfn), 0, prediction, PGT_writable_pred, 0);
1340 if ( create )
1341 perfc_incr(writable_pte_predictions);
1344 static inline void
1345 decrease_writable_pte_prediction(struct domain *d, unsigned long gpfn, unsigned long prediction)
1347 unsigned long score = prediction & PGT_score_mask;
1348 ASSERT(score);
1350 // divide score by 2... We don't like bad predictions.
1351 //
1352 score = (score >> 1) & PGT_score_mask;
1354 prediction = (prediction & PGT_mfn_mask) | score;
1356 //printk("decrease gpfn=%lx pred=%lx score=%lx\n", gpfn, prediction, score);
1358 if ( score )
1359 set_shadow_status(d, GPFN_TO_GPTEPAGE(gpfn), 0, prediction, PGT_writable_pred, 0);
1360 else
1362 delete_shadow_status(d, GPFN_TO_GPTEPAGE(gpfn), 0, PGT_writable_pred, 0);
1363 perfc_decr(writable_pte_predictions);
1367 static int fix_entry(
1368 struct domain *d,
1369 l1_pgentry_t *pt, u32 *found, int is_l1_shadow, u32 max_refs_to_find)
1371 l1_pgentry_t old = *pt;
1372 l1_pgentry_t new = old;
1374 l1e_remove_flags(new,_PAGE_RW);
1375 if ( is_l1_shadow && !shadow_get_page_from_l1e(new, d) )
1376 BUG();
1377 (*found)++;
1378 *pt = new;
1379 if ( is_l1_shadow )
1380 shadow_put_page_from_l1e(old, d);
1382 return (*found == max_refs_to_find);
1385 static u32 remove_all_write_access_in_ptpage(
1386 struct domain *d, unsigned long pt_pfn, unsigned long pt_mfn,
1387 unsigned long readonly_gpfn, unsigned long readonly_gmfn,
1388 u32 max_refs_to_find, unsigned long prediction)
1390 l1_pgentry_t *pt = map_domain_page(pt_mfn);
1391 l1_pgentry_t *pt_next = 0, *sl1e_p;
1392 l1_pgentry_t match;
1393 unsigned long flags = _PAGE_RW | _PAGE_PRESENT;
1394 int i;
1395 u32 found = 0;
1396 int is_l1_shadow =
1397 ((mfn_to_page(pt_mfn)->u.inuse.type_info & PGT_type_mask) ==
1398 PGT_l1_shadow);
1399 #if CONFIG_PAGING_LEVELS >= 3
1400 is_l1_shadow |=
1401 ((mfn_to_page(pt_mfn)->u.inuse.type_info & PGT_type_mask) ==
1402 PGT_fl1_shadow);
1403 #endif
1405 if ( SH_L1_HAS_NEXT_PAGE )
1406 pt_next = map_domain_page(pt_mfn + 1);
1408 match = l1e_from_pfn(readonly_gmfn, flags);
1410 if ( shadow_mode_external(d) )
1412 i = (mfn_to_page(readonly_gmfn)->u.inuse.type_info & PGT_va_mask)
1413 >> PGT_va_shift;
1415 if ( SH_L1_HAS_NEXT_PAGE &&
1416 i >= L1_PAGETABLE_ENTRIES )
1417 sl1e_p = &pt_next[i - L1_PAGETABLE_ENTRIES];
1418 else
1419 sl1e_p = &pt[i];
1421 if ( (i >= 0 && i < GUEST_L1_PAGETABLE_ENTRIES) &&
1422 !l1e_has_changed(*sl1e_p, match, flags) &&
1423 fix_entry(d, sl1e_p, &found, is_l1_shadow, max_refs_to_find) &&
1424 !prediction )
1425 goto out;
1428 for ( i = 0; i < GUEST_L1_PAGETABLE_ENTRIES; i++ )
1430 if ( SH_L1_HAS_NEXT_PAGE &&
1431 i >= L1_PAGETABLE_ENTRIES )
1432 sl1e_p = &pt_next[i - L1_PAGETABLE_ENTRIES];
1433 else
1434 sl1e_p = &pt[i];
1436 if ( unlikely(!l1e_has_changed(*sl1e_p, match, flags)) &&
1437 fix_entry(d, sl1e_p, &found, is_l1_shadow, max_refs_to_find) )
1438 break;
1441 out:
1442 unmap_domain_page(pt);
1443 if ( SH_L1_HAS_NEXT_PAGE )
1444 unmap_domain_page(pt_next);
1446 return found;
1449 static int remove_all_write_access(
1450 struct domain *d, unsigned long readonly_gpfn, unsigned long readonly_gmfn)
1452 int i;
1453 struct shadow_status *a;
1454 u32 found = 0, write_refs;
1455 unsigned long predicted_smfn;
1457 ASSERT(shadow_lock_is_acquired(d));
1458 ASSERT(VALID_MFN(readonly_gmfn));
1460 perfc_incrc(remove_write_access);
1462 // If it's not a writable page, then no writable refs can be outstanding.
1463 //
1464 if ( (mfn_to_page(readonly_gmfn)->u.inuse.type_info & PGT_type_mask) !=
1465 PGT_writable_page )
1467 perfc_incrc(remove_write_not_writable);
1468 return 1;
1471 // How many outstanding writable PTEs for this page are there?
1472 //
1473 write_refs =
1474 (mfn_to_page(readonly_gmfn)->u.inuse.type_info & PGT_count_mask);
1475 if ( write_refs && MFN_PINNED(readonly_gmfn) )
1477 write_refs--;
1480 if ( write_refs == 0 )
1482 perfc_incrc(remove_write_no_work);
1483 return 1;
1486 if ( shadow_mode_external(d) ) {
1487 if (--write_refs == 0)
1488 return 0;
1490 // Use the back pointer to locate the shadow page that can contain
1491 // the PTE of interest
1492 if ( (predicted_smfn = mfn_to_page(readonly_gmfn)->tlbflush_timestamp) ) {
1493 found += remove_all_write_access_in_ptpage(
1494 d, predicted_smfn, predicted_smfn, readonly_gpfn, readonly_gmfn, write_refs, 0);
1495 if ( found == write_refs )
1496 return 0;
1500 // Search all the shadow L1 page tables...
1501 //
1502 for (i = 0; i < shadow_ht_buckets; i++)
1504 a = &d->arch.shadow_ht[i];
1505 while ( a && a->gpfn_and_flags )
1507 if ( (a->gpfn_and_flags & PGT_type_mask) == PGT_l1_shadow
1508 #if CONFIG_PAGING_LEVELS >= 3
1509 || (a->gpfn_and_flags & PGT_type_mask) == PGT_fl1_shadow
1510 #endif
1514 found += remove_all_write_access_in_ptpage(d, a->gpfn_and_flags & PGT_mfn_mask, a->smfn, readonly_gpfn, readonly_gmfn, write_refs - found, a->gpfn_and_flags & PGT_mfn_mask);
1515 if ( found == write_refs )
1516 return 0;
1519 a = a->next;
1523 FSH_LOG("%s: looking for %d refs, found %d refs",
1524 __func__, write_refs, found);
1526 return 0;
1529 static void resync_pae_guest_l3(struct domain *d)
1531 struct out_of_sync_entry *entry;
1532 unsigned long i, idx;
1533 unsigned long smfn, gmfn;
1534 pgentry_64_t *guest, *shadow_l3, *snapshot;
1535 struct vcpu *v = current;
1536 int max = -1;
1537 int unshadow = 0;
1540 ASSERT( shadow_mode_external(d) );
1542 gmfn = pagetable_get_pfn(v->arch.guest_table);
1544 for ( entry = d->arch.out_of_sync; entry; entry = entry->next )
1546 if ( entry->snapshot_mfn == SHADOW_SNAPSHOT_ELSEWHERE )
1547 continue;
1548 if ( entry->gmfn != gmfn )
1549 continue;
1551 idx = get_cr3_idxval(v);
1553 smfn = __shadow_status(d, entry->gpfn, PGT_l4_shadow);
1555 if ( !smfn )
1556 continue;
1558 guest = (pgentry_64_t *)map_domain_page(entry->gmfn);
1559 snapshot = (pgentry_64_t *)map_domain_page(entry->snapshot_mfn);
1560 shadow_l3 = (pgentry_64_t *)map_domain_page(smfn);
1562 for ( i = 0; i < PAE_L3_PAGETABLE_ENTRIES; i++ )
1564 int index = i + idx * PAE_L3_PAGETABLE_ENTRIES;
1565 if ( entry_has_changed(
1566 guest[index], snapshot[index], PAGE_FLAG_MASK) )
1568 unsigned long gpfn;
1570 /*
1571 * Looks like it's no longer a page table.
1572 */
1573 if ( unlikely(entry_get_value(guest[index]) & PAE_PDPT_RESERVED) )
1575 if ( entry_get_flags(shadow_l3[i]) & _PAGE_PRESENT )
1576 put_shadow_ref(entry_get_pfn(shadow_l3[i]));
1578 shadow_l3[i] = entry_empty();
1579 continue;
1582 gpfn = entry_get_pfn(guest[index]);
1584 if ( unlikely(gpfn != (gpfn & PGT_mfn_mask)) )
1586 if ( entry_get_flags(shadow_l3[i]) & _PAGE_PRESENT )
1587 put_shadow_ref(entry_get_pfn(shadow_l3[i]));
1589 shadow_l3[i] = entry_empty();
1590 continue;
1593 validate_entry_change(d, &guest[index],
1594 &shadow_l3[i], PAGING_L3);
1597 if ( entry_get_value(guest[index]) != 0 )
1598 max = i;
1600 if ( !(entry_get_flags(guest[index]) & _PAGE_PRESENT) &&
1601 unlikely(entry_get_value(guest[index]) != 0) &&
1602 !unshadow &&
1603 (frame_table[smfn].u.inuse.type_info & PGT_pinned) )
1604 unshadow = 1;
1607 if ( max == -1 )
1608 unshadow = 1;
1610 unmap_domain_page(guest);
1611 unmap_domain_page(snapshot);
1612 unmap_domain_page(shadow_l3);
1614 if ( unlikely(unshadow) )
1615 shadow_unpin(smfn);
1616 break;
1620 static int resync_all(struct domain *d, u32 stype)
1622 struct out_of_sync_entry *entry;
1623 unsigned i;
1624 unsigned long smfn;
1625 void *guest, *shadow, *snapshot;
1626 int need_flush = 0, external = shadow_mode_external(d);
1627 int unshadow;
1628 int changed;
1629 u32 min_max_shadow, min_max_snapshot;
1630 int min_shadow, max_shadow, min_snapshot, max_snapshot;
1631 struct vcpu *v;
1633 ASSERT(shadow_lock_is_acquired(d));
1635 for ( entry = d->arch.out_of_sync; entry; entry = entry->next)
1637 int max = -1;
1639 if ( entry->snapshot_mfn == SHADOW_SNAPSHOT_ELSEWHERE )
1640 continue;
1642 smfn = __shadow_status(d, entry->gpfn, stype);
1644 if ( !smfn )
1646 // For heavy weight shadows: no need to update refcounts if
1647 // there's no shadow page.
1648 //
1649 if ( shadow_mode_refcounts(d) )
1650 continue;
1652 // For light weight shadows: we only need to resync the refcounts to
1653 // the new contents of the guest page iff it has the right
1654 // page type.
1655 //
1656 if ( stype != ( mfn_to_page(entry->gmfn)->u.inuse.type_info & PGT_type_mask) )
1657 continue;
1660 FSH_LOG("resyncing t=%08x gpfn=%lx gmfn=%lx smfn=%lx snapshot_mfn=%lx",
1661 stype, entry->gpfn, entry->gmfn, smfn, entry->snapshot_mfn);
1663 // Compare guest's new contents to its snapshot, validating
1664 // and updating its shadow as appropriate.
1665 //
1666 guest = map_domain_page(entry->gmfn);
1667 snapshot = map_domain_page(entry->snapshot_mfn);
1669 if ( smfn )
1670 shadow = map_domain_page(smfn);
1671 else
1672 shadow = NULL;
1674 unshadow = 0;
1676 min_max_shadow = mfn_to_page(smfn)->tlbflush_timestamp;
1677 min_shadow = SHADOW_MIN(min_max_shadow);
1678 max_shadow = SHADOW_MAX(min_max_shadow);
1680 min_max_snapshot= mfn_to_page(entry->snapshot_mfn)->tlbflush_timestamp;
1681 min_snapshot = SHADOW_MIN(min_max_snapshot);
1682 max_snapshot = SHADOW_MAX(min_max_snapshot);
1684 switch ( stype )
1686 case PGT_l1_shadow:
1688 guest_l1_pgentry_t *guest1 = guest;
1689 l1_pgentry_t *shadow1 = shadow;
1690 l1_pgentry_t *shadow1_next = 0, *sl1e_p;
1691 guest_l1_pgentry_t *snapshot1 = snapshot;
1692 int unshadow_l1 = 0;
1694 ASSERT(shadow_mode_write_l1(d) ||
1695 shadow_mode_write_all(d) || shadow_mode_wr_pt_pte(d));
1697 if ( !shadow_mode_refcounts(d) )
1698 revalidate_l1(d, (l1_pgentry_t *)guest1, (l1_pgentry_t *)snapshot1);
1699 if ( !smfn )
1700 break;
1702 changed = 0;
1704 if ( SH_L1_HAS_NEXT_PAGE && shadow1 )
1705 shadow1_next = map_domain_page(smfn + 1);
1707 for ( i = min_shadow; i <= max_shadow; i++ )
1710 if ( SH_L1_HAS_NEXT_PAGE && i >= L1_PAGETABLE_ENTRIES )
1711 sl1e_p = &shadow1_next[i - L1_PAGETABLE_ENTRIES];
1712 else
1713 sl1e_p = &shadow1[i];
1715 if ( (i < min_snapshot) || (i > max_snapshot) ||
1716 guest_l1e_has_changed(guest1[i], snapshot1[i], PAGE_FLAG_MASK) )
1718 int error;
1720 #if CONFIG_PAGING_LEVELS >= 3
1721 unsigned long gpfn;
1723 gpfn = guest_l1e_get_paddr(guest1[i]) >> PAGE_SHIFT;
1725 if ( unlikely(gpfn != (gpfn & PGT_mfn_mask)) )
1727 guest_l1_pgentry_t tmp_gl1e = guest_l1e_empty();
1728 validate_pte_change(d, tmp_gl1e, sl1e_p);
1729 unshadow_l1 = 1;
1730 continue;
1732 #endif
1734 error = validate_pte_change(d, guest1[i], sl1e_p);
1735 if ( error == -1 )
1736 unshadow_l1 = 1;
1737 else {
1738 need_flush |= error;
1739 if ( l1e_get_flags(*sl1e_p) & _PAGE_PRESENT )
1740 set_guest_back_ptr(d, *sl1e_p, smfn, i);
1742 // can't update snapshots of linear page tables -- they
1743 // are used multiple times...
1744 //
1745 // snapshot[i] = new_pte;
1747 changed++;
1751 if ( shadow1_next )
1752 unmap_domain_page(shadow1_next);
1754 perfc_incrc(resync_l1);
1755 perfc_incr_histo(wpt_updates, changed, PT_UPDATES);
1756 perfc_incr_histo(l1_entries_checked, max_shadow - min_shadow + 1, PT_UPDATES);
1758 if ( d->arch.ops->guest_paging_levels >= PAGING_L3 &&
1759 unshadow_l1 ) {
1760 pgentry_64_t l2e = { 0 };
1762 __shadow_get_l2e(entry->v, entry->va, &l2e);
1764 if ( entry_get_flags(l2e) & _PAGE_PRESENT ) {
1765 put_shadow_ref(entry_get_pfn(l2e));
1766 l2e = entry_empty();
1767 __shadow_set_l2e(entry->v, entry->va, &l2e);
1769 if (entry->v == current)
1770 need_flush = 1;
1774 break;
1776 #if CONFIG_PAGING_LEVELS == 2
1777 case PGT_l2_shadow:
1779 l2_pgentry_t *guest2 = guest;
1780 l2_pgentry_t *shadow2 = shadow;
1781 l2_pgentry_t *snapshot2 = snapshot;
1783 ASSERT(shadow_mode_write_all(d) || shadow_mode_wr_pt_pte(d));
1784 BUG_ON(!shadow_mode_refcounts(d)); // not yet implemented
1786 changed = 0;
1787 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
1789 if ( !is_guest_l2_slot(0,i) && !external )
1790 continue;
1792 l2_pgentry_t new_pde = guest2[i];
1793 if ( l2e_has_changed(new_pde, snapshot2[i], PAGE_FLAG_MASK))
1795 need_flush |= validate_pde_change(d, new_pde, &shadow2[i]);
1797 // can't update snapshots of linear page tables -- they
1798 // are used multiple times...
1799 //
1800 // snapshot[i] = new_pde;
1802 changed++;
1804 if ( l2e_get_intpte(new_pde) != 0 ) /* FIXME: check flags? */
1805 max = i;
1807 // XXX - This hack works for linux guests.
1808 // Need a better solution long term.
1809 if ( !(l2e_get_flags(new_pde) & _PAGE_PRESENT) &&
1810 unlikely(l2e_get_intpte(new_pde) != 0) &&
1811 !unshadow && MFN_PINNED(smfn) )
1812 unshadow = 1;
1814 if ( max == -1 )
1815 unshadow = 1;
1816 perfc_incrc(resync_l2);
1817 perfc_incr_histo(shm_l2_updates, changed, PT_UPDATES);
1818 break;
1820 case PGT_hl2_shadow:
1822 l2_pgentry_t *guest2 = guest;
1823 l2_pgentry_t *snapshot2 = snapshot;
1824 l1_pgentry_t *shadow2 = shadow;
1826 ASSERT(shadow_mode_write_all(d) || shadow_mode_wr_pt_pte(d));
1827 BUG_ON(!shadow_mode_refcounts(d)); // not yet implemented
1829 changed = 0;
1830 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
1832 if ( !is_guest_l2_slot(0, i) && !external )
1833 continue;
1835 l2_pgentry_t new_pde = guest2[i];
1836 if ( l2e_has_changed(new_pde, snapshot2[i], PAGE_FLAG_MASK) )
1838 need_flush |= validate_hl2e_change(d, new_pde, &shadow2[i]);
1840 // can't update snapshots of linear page tables -- they
1841 // are used multiple times...
1842 //
1843 // snapshot[i] = new_pde;
1845 changed++;
1848 perfc_incrc(resync_hl2);
1849 perfc_incr_histo(shm_hl2_updates, changed, PT_UPDATES);
1850 break;
1852 #elif CONFIG_PAGING_LEVELS >= 3
1853 case PGT_l2_shadow:
1854 case PGT_l3_shadow:
1856 pgentry_64_t *guest_pt = guest;
1857 pgentry_64_t *shadow_pt = shadow;
1858 pgentry_64_t *snapshot_pt = snapshot;
1860 changed = 0;
1861 for ( i = min_shadow; i <= max_shadow; i++ )
1863 if ( (i < min_snapshot) || (i > max_snapshot) ||
1864 entry_has_changed(
1865 guest_pt[i], snapshot_pt[i], PAGE_FLAG_MASK) )
1867 unsigned long gpfn;
1869 gpfn = entry_get_pfn(guest_pt[i]);
1870 /*
1871 * Looks like it's no longer a page table.
1872 */
1873 if ( unlikely(gpfn != (gpfn & PGT_mfn_mask)) )
1875 if ( entry_get_flags(shadow_pt[i]) & _PAGE_PRESENT )
1876 put_shadow_ref(entry_get_pfn(shadow_pt[i]));
1877 shadow_pt[i] = entry_empty();
1878 continue;
1881 need_flush |= validate_entry_change(
1882 d, &guest_pt[i], &shadow_pt[i],
1883 shadow_type_to_level(stype));
1884 changed++;
1886 #if CONFIG_PAGING_LEVELS == 3
1887 if ( stype == PGT_l3_shadow )
1889 if ( entry_get_value(guest_pt[i]) != 0 )
1890 max = i;
1892 if ( !(entry_get_flags(guest_pt[i]) & _PAGE_PRESENT) &&
1893 unlikely(entry_get_value(guest_pt[i]) != 0) &&
1894 !unshadow &&
1895 (mfn_to_page(smfn)->u.inuse.type_info & PGT_pinned) )
1896 unshadow = 1;
1898 #endif
1901 if ( d->arch.ops->guest_paging_levels == PAGING_L3
1902 && max == -1 && stype == PGT_l3_shadow )
1903 unshadow = 1;
1905 perfc_incrc(resync_l3);
1906 perfc_incr_histo(shm_l3_updates, changed, PT_UPDATES);
1907 break;
1909 case PGT_l4_shadow:
1911 guest_root_pgentry_t *guest_root = guest;
1912 guest_root_pgentry_t *snapshot_root = snapshot;
1914 changed = 0;
1915 for ( i = 0; i < GUEST_ROOT_PAGETABLE_ENTRIES; i++ )
1917 guest_root_pgentry_t new_root_e = guest_root[i];
1918 if ( !is_guest_l4_slot(i) && !external )
1919 continue;
1920 if ( root_entry_has_changed(
1921 new_root_e, snapshot_root[i], PAGE_FLAG_MASK))
1923 #ifndef GUEST_PGENTRY_32
1924 l4_pgentry_t *shadow4 = shadow;
1925 unsigned long gpfn;
1927 gpfn = l4e_get_pfn(new_root_e);
1929 /*
1930 * Looks like it's no longer a page table.
1931 */
1932 if ( unlikely(gpfn != (gpfn & PGT_mfn_mask)) )
1934 if ( l4e_get_flags(shadow4[i]) & _PAGE_PRESENT )
1935 put_shadow_ref(l4e_get_pfn(shadow4[i]));
1936 shadow4[i] = l4e_empty();
1937 continue;
1940 if ( d->arch.ops->guest_paging_levels == PAGING_L4 )
1942 need_flush |= validate_entry_change(
1943 d, (pgentry_64_t *)&new_root_e,
1944 (pgentry_64_t *)&shadow4[i], shadow_type_to_level(stype));
1946 else
1947 #endif
1949 validate_bl2e_change(d, &new_root_e, shadow, i);
1951 changed++;
1952 ESH_LOG("%d: shadow4 mfn: %lx, shadow root: %lx\n", i,
1953 smfn, pagetable_get_paddr(current->arch.shadow_table));
1955 if ( guest_root_get_intpte(new_root_e) != 0 ) /* FIXME: check flags? */
1956 max = i;
1958 // Need a better solution in the long term.
1959 if ( !(guest_root_get_flags(new_root_e) & _PAGE_PRESENT) &&
1960 unlikely(guest_root_get_intpte(new_root_e) != 0) &&
1961 !unshadow &&
1962 (mfn_to_page(smfn)->u.inuse.type_info & PGT_pinned) )
1963 unshadow = 1;
1965 if ( max == -1 )
1966 unshadow = 1;
1967 perfc_incrc(resync_l4);
1968 perfc_incr_histo(shm_l4_updates, changed, PT_UPDATES);
1969 break;
1972 #endif /* CONFIG_PAGING_LEVELS >= 3 */
1973 default:
1974 BUG();
1977 if ( smfn )
1978 unmap_domain_page(shadow);
1979 unmap_domain_page(snapshot);
1980 unmap_domain_page(guest);
1982 if ( unlikely(unshadow && stype == PGT_root_page_table) )
1984 for_each_vcpu(d, v)
1985 if(smfn == pagetable_get_pfn(v->arch.shadow_table))
1986 return need_flush;
1987 perfc_incrc(unshadow_l2_count);
1988 shadow_unpin(smfn);
1989 #if CONFIG_PAGING_LEVELS == 2
1990 if ( unlikely(shadow_mode_external(d)) )
1992 unsigned long hl2mfn;
1994 if ( (hl2mfn = __shadow_status(d, entry->gpfn, PGT_hl2_shadow)) &&
1995 MFN_PINNED(hl2mfn) )
1996 shadow_unpin(hl2mfn);
1998 #endif
2002 return need_flush;
2005 #if CONFIG_PAGING_LEVELS == 2
2006 static int resync_all_levels_guest_page(struct domain *d)
2008 int need_flush = 0;
2010 need_flush |= resync_all(d, PGT_l1_shadow);
2011 if ( d->arch.ops->guest_paging_levels == PAGING_L2 &&
2012 shadow_mode_translate(d) )
2014 need_flush |= resync_all(d, PGT_hl2_shadow);
2016 return need_flush;
2018 #elif CONFIG_PAGING_LEVELS == 3
2019 static int resync_all_levels_guest_page(struct domain *d)
2021 int need_flush = 0;
2023 need_flush |= resync_all(d, PGT_l1_shadow);
2024 if ( d->arch.ops->guest_paging_levels == PAGING_L2 )
2025 need_flush |= resync_all(d, PGT_l4_shadow);
2026 else
2028 need_flush |= resync_all(d, PGT_l2_shadow);
2029 if ( shadow_mode_log_dirty(d) )
2031 need_flush |= resync_all(d, PGT_l3_shadow);
2032 need_flush |= resync_all(d, PGT_l4_shadow);
2034 else
2035 resync_pae_guest_l3(d);
2038 return need_flush;
2040 #elif CONFIG_PAGING_LEVELS == 4
2041 static int resync_all_levels_guest_page(struct domain *d)
2043 int need_flush = 0;
2045 need_flush |= resync_all(d, PGT_l1_shadow);
2046 if ( d->arch.ops->guest_paging_levels == PAGING_L2 )
2047 need_flush |= resync_all(d, PGT_l4_shadow);
2048 else
2050 need_flush |= resync_all(d, PGT_l2_shadow);
2051 if ( d->arch.ops->guest_paging_levels == PAGING_L3 )
2052 resync_pae_guest_l3(d);
2053 else
2055 need_flush |= resync_all(d, PGT_l3_shadow);
2056 need_flush |= resync_all(d, PGT_l4_shadow);
2059 return need_flush;
2061 #endif
2063 static void sync_all(struct domain *d)
2065 struct out_of_sync_entry *entry;
2066 int need_flush = 0;
2067 l1_pgentry_t *ppte, opte, npte;
2068 cpumask_t other_vcpus_mask;
2070 perfc_incrc(shadow_sync_all);
2072 ASSERT(shadow_lock_is_acquired(d));
2074 // First, remove all write permissions to the page tables
2075 //
2076 for ( entry = d->arch.out_of_sync; entry; entry = entry->next)
2078 // Skip entries that have low bits set... Those aren't
2079 // real PTEs.
2080 //
2081 if ( entry->writable_pl1e & (sizeof(l1_pgentry_t)-1) )
2082 continue;
2084 ppte = (l1_pgentry_t *)(
2085 (char *)map_domain_page(entry->writable_pl1e >> PAGE_SHIFT) +
2086 (entry->writable_pl1e & ~PAGE_MASK));
2087 opte = npte = *ppte;
2088 l1e_remove_flags(npte, _PAGE_RW);
2090 if ( (l1e_get_flags(npte) & _PAGE_PRESENT) &&
2091 !shadow_get_page_from_l1e(npte, d) )
2092 BUG();
2093 *ppte = npte;
2094 set_guest_back_ptr(d, npte, (entry->writable_pl1e) >> PAGE_SHIFT,
2095 (entry->writable_pl1e & ~PAGE_MASK)/sizeof(l1_pgentry_t));
2096 shadow_put_page_from_l1e(opte, d);
2098 unmap_domain_page(ppte);
2101 /* Other VCPUs mustn't use the revoked writable mappings. */
2102 other_vcpus_mask = d->domain_dirty_cpumask;
2103 cpu_clear(smp_processor_id(), other_vcpus_mask);
2104 flush_tlb_mask(other_vcpus_mask);
2106 /* Flush ourself later. */
2107 need_flush = 1;
2109 need_flush |= resync_all_levels_guest_page(d);
2111 if ( need_flush && !unlikely(shadow_mode_external(d)) )
2112 local_flush_tlb();
2114 free_out_of_sync_state(d);
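/*
 * A minimal standalone sketch of the write-revocation step performed by
 * sync_all() above, assuming the usual x86 PTE layout (bit 1 = R/W); the
 * real code uses l1e_remove_flags(), set_guest_back_ptr() and
 * flush_tlb_mask().  pte_revoke_write() is a hypothetical stand-in that only
 * shows the flag arithmetic and the ordering the surrounding code relies on.
 */
#include <stdint.h>

#define SK_PAGE_RW (UINT64_C(1) << 1)   /* assumed R/W bit position */

static uint64_t pte_revoke_write(uint64_t pte)
{
    /*
     * Drop write permission first; the real code then flushes the TLBs of
     * all *other* VCPUs (so they cannot keep a stale writable translation),
     * resyncs the shadows, and only then flushes the local TLB.
     */
    return pte & ~SK_PAGE_RW;
}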
2117 static inline int l1pte_write_fault(
2118 struct vcpu *v, guest_l1_pgentry_t *gpte_p, l1_pgentry_t *spte_p,
2119 unsigned long va)
2121 struct domain *d = v->domain;
2122 guest_l1_pgentry_t gpte = *gpte_p;
2123 l1_pgentry_t spte;
2124 unsigned long gpfn = l1e_get_pfn(gpte);
2125 unsigned long gmfn = gmfn_to_mfn(d, gpfn);
2127 //printk("l1pte_write_fault gmfn=%lx\n", gmfn);
2129 if ( unlikely(!VALID_MFN(gmfn)) )
2131 SH_VLOG("l1pte_write_fault: invalid gpfn=%lx", gpfn);
2132 *spte_p = l1e_empty();
2133 return 0;
2136 ASSERT(guest_l1e_get_flags(gpte) & _PAGE_RW);
2137 guest_l1e_add_flags(gpte, _PAGE_DIRTY | _PAGE_ACCESSED);
2138 spte = l1e_from_pfn(gmfn, guest_l1e_get_flags(gpte) & ~_PAGE_GLOBAL);
2140 SH_VVLOG("l1pte_write_fault: updating spte=0x%" PRIpte " gpte=0x%" PRIpte,
2141 l1e_get_intpte(spte), l1e_get_intpte(gpte));
2143 __mark_dirty(d, gmfn);
2145 if ( mfn_is_page_table(gmfn) )
2146 shadow_mark_va_out_of_sync(v, gpfn, gmfn, va);
2148 *gpte_p = gpte;
2149 *spte_p = spte;
2151 return 1;
2154 static inline int l1pte_read_fault(
2155 struct domain *d, guest_l1_pgentry_t *gpte_p, l1_pgentry_t *spte_p)
2157 guest_l1_pgentry_t gpte = *gpte_p;
2158 l1_pgentry_t spte = *spte_p;
2159 unsigned long pfn = l1e_get_pfn(gpte);
2160 unsigned long mfn = gmfn_to_mfn(d, pfn);
2162 if ( unlikely(!VALID_MFN(mfn)) )
2164 SH_VLOG("l1pte_read_fault: invalid gpfn=%lx", pfn);
2165 *spte_p = l1e_empty();
2166 return 0;
2169 guest_l1e_add_flags(gpte, _PAGE_ACCESSED);
2170 spte = l1e_from_pfn(mfn, guest_l1e_get_flags(gpte) & ~_PAGE_GLOBAL);
2172 if ( shadow_mode_log_dirty(d) || !(guest_l1e_get_flags(gpte) & _PAGE_DIRTY) ||
2173 mfn_is_page_table(mfn) )
2175 l1e_remove_flags(spte, _PAGE_RW);
2178 SH_VVLOG("l1pte_read_fault: updating spte=0x%" PRIpte " gpte=0x%" PRIpte,
2179 l1e_get_intpte(spte), l1e_get_intpte(gpte));
2180 *gpte_p = gpte;
2181 *spte_p = spte;
2183 return 1;
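/*
 * A minimal standalone sketch of the read-fault propagation above, assuming
 * flat 64-bit PTE values with the usual x86 flag bits (R/W = bit 1,
 * ACCESSED = bit 5, DIRTY = bit 6, GLOBAL = bit 8) and a page-aligned
 * machine frame address.  The real code works on
 * guest_l1_pgentry_t/l1_pgentry_t; log_dirty and is_page_table here stand in
 * for shadow_mode_log_dirty() and mfn_is_page_table().
 */
#include <stdint.h>

#define SK_PAGE_RW       (UINT64_C(1) << 1)
#define SK_PAGE_ACCESSED (UINT64_C(1) << 5)
#define SK_PAGE_DIRTY    (UINT64_C(1) << 6)
#define SK_PAGE_GLOBAL   (UINT64_C(1) << 8)

static uint64_t propagate_read_fault(uint64_t *gpte, uint64_t frame,
                                     int log_dirty, int is_page_table)
{
    uint64_t spte;

    *gpte |= SK_PAGE_ACCESSED;                             /* guest sees the access */
    spte = frame | ((*gpte & ~SK_PAGE_GLOBAL) & 0xfffULL); /* copy flags, no GLOBAL */

    /* Demote to read-only if the first write must still be trapped. */
    if ( log_dirty || !(*gpte & SK_PAGE_DIRTY) || is_page_table )
        spte &= ~SK_PAGE_RW;

    return spte;
}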
2185 #if CONFIG_PAGING_LEVELS == 2
2186 static int shadow_fault_32(unsigned long va, struct cpu_user_regs *regs)
2188 l1_pgentry_t gpte, spte, orig_gpte;
2189 struct vcpu *v = current;
2190 struct domain *d = v->domain;
2191 l2_pgentry_t gpde;
2193 spte = l1e_empty();
2195 SH_VVLOG("shadow_fault( va=%lx, code=%lu )",
2196 va, (unsigned long)regs->error_code);
2197 perfc_incrc(shadow_fault_calls);
2199 check_pagetable(v, "pre-sf");
2201 /*
2202 * Don't let someone else take the guest's table pages out-of-sync.
2203 */
2204 shadow_lock(d);
2206 /* XXX - FIX THIS COMMENT!!!
2207 * STEP 1. Check to see if this fault might have been caused by an
2208 * out-of-sync table page entry, or if we should pass this
2209 * fault onto the guest.
2210 */
2211 __shadow_sync_va(v, va);
2213 /*
2214 * STEP 2. Check the guest PTE.
2215 */
2216 __guest_get_l2e(v, va, &gpde);
2217 if ( unlikely(!(l2e_get_flags(gpde) & _PAGE_PRESENT)) )
2219 SH_VVLOG("shadow_fault - EXIT: L1 not present");
2220 perfc_incrc(shadow_fault_bail_pde_not_present);
2221 goto fail;
2224 // This can't fault because we hold the shadow lock and we've ensured that
2225 // the mapping is in-sync, so the check of the PDE's present bit, above,
2226 // covers this access.
2227 //
2228 //orig_gpte = gpte = linear_pg_table[l1_linear_offset(va)];
2229 __guest_get_l1e(v, va, &gpte);
2230 orig_gpte = gpte;
2232 if ( unlikely(!(l1e_get_flags(gpte) & _PAGE_PRESENT)) )
2234 SH_VVLOG("shadow_fault - EXIT: gpte not present (%" PRIpte ")",
2235 l1e_get_intpte(gpte));
2236 perfc_incrc(shadow_fault_bail_pte_not_present);
2237 goto fail;
2240 /* Write fault? */
2241 if ( regs->error_code & 2 )
2243 int allow_writes = 0;
2245 if ( unlikely(!(l1e_get_flags(gpte) & _PAGE_RW)) )
2247 if ( shadow_mode_page_writable(va, regs, l1e_get_pfn(gpte)) )
2249 allow_writes = 1;
2250 l1e_add_flags(gpte, _PAGE_RW);
2252 else
2254 /* Write fault on a read-only mapping. */
2255 SH_VVLOG("shadow_fault - EXIT: wr fault on RO page (%" PRIpte ")",
2256 l1e_get_intpte(gpte));
2257 perfc_incrc(shadow_fault_bail_ro_mapping);
2258 goto fail;
2261 else if ( unlikely(!shadow_mode_wr_pt_pte(d) && mfn_is_page_table(l1e_get_pfn(gpte))) )
2263 SH_LOG("l1pte_write_fault: no write access to page table page");
2264 domain_crash_synchronous();
2267 if ( unlikely(!l1pte_write_fault(v, &gpte, &spte, va)) )
2269 SH_VVLOG("shadow_fault - EXIT: l1pte_write_fault failed");
2270 perfc_incrc(write_fault_bail);
2271 shadow_unlock(d);
2272 return 0;
2275 if ( allow_writes )
2276 l1e_remove_flags(gpte, _PAGE_RW);
2278 else
2280 if ( !l1pte_read_fault(d, &gpte, &spte) )
2282 SH_VVLOG("shadow_fault - EXIT: l1pte_read_fault failed");
2283 perfc_incrc(read_fault_bail);
2284 shadow_unlock(d);
2285 return 0;
2289 /*
2290 * STEP 3. Write the modified shadow PTE and guest PTE back to the tables.
2291 */
2292 if ( l1e_has_changed(orig_gpte, gpte, PAGE_FLAG_MASK) )
2294 /* XXX Watch out for read-only L2 entries! (not used in Linux). */
2295 /*if ( unlikely(__copy_to_user(&linear_pg_table[l1_linear_offset(va)],
2296 &gpte, sizeof(gpte))) )*/
2297 if ( unlikely(!__guest_set_l1e(v, va, &gpte)))
2299 printk("%s() failed, crashing domain %d "
2300 "due to a read-only L2 page table (gpde=%" PRIpte "), va=%lx\n",
2301 __func__,d->domain_id, l2e_get_intpte(gpde), va);
2302 domain_crash_synchronous();
2305 __mark_dirty(d, gmfn_to_mfn(d, l2e_get_pfn(gpde)));
2308 shadow_set_l1e(va, spte, 1);
2310 perfc_incrc(shadow_fault_fixed);
2311 d->arch.shadow_fault_count++;
2313 shadow_unlock(d);
2315 check_pagetable(v, "post-sf");
2316 return EXCRET_fault_fixed;
2318 fail:
2319 shadow_unlock(d);
2320 return 0;
2322 #endif /* CONFIG_PAGING_LEVELS == 2 */
2324 static inline unsigned long va_to_l1mfn(struct vcpu *v, unsigned long va)
2326 struct domain *d = v->domain;
2327 guest_l2_pgentry_t gl2e = {0};
2329 __guest_get_l2e(v, va, &gl2e);
2331 if ( unlikely(!(guest_l2e_get_flags(gl2e) & _PAGE_PRESENT)) )
2332 return INVALID_MFN;
2334 return gmfn_to_mfn(d, l2e_get_pfn(gl2e));
2337 static int do_update_va_mapping(unsigned long va,
2338 l1_pgentry_t val,
2339 struct vcpu *v)
2341 struct domain *d = v->domain;
2342 l1_pgentry_t spte;
2343 int rc = 0;
2345 shadow_lock(d);
2347 // This is actually overkill - we don't need to sync the L1 itself,
2348 // just everything involved in getting to this L1 (i.e. we need
2349 // linear_pg_table[l1_linear_offset(va)] to be in sync)...
2350 //
2351 __shadow_sync_va(v, va);
2353 l1pte_propagate_from_guest(d, *(guest_l1_pgentry_t *)&val, &spte);
2354 #if CONFIG_PAGING_LEVELS == 2
2355 shadow_set_l1e(va, spte, 0);
2356 #elif CONFIG_PAGING_LEVELS >= 3
2357 shadow_set_l1e_64(va, (pgentry_64_t *) &spte, 0);
2358 #endif
2359 /*
2360 * If we're in log-dirty mode then we need to note that we've updated
2361 * the PTE in the PT-holding page. We need the machine frame number
2362 * for this.
2363 */
2364 __mark_dirty(d, va_to_l1mfn(v, va));
2366 shadow_unlock(d);
2368 return rc;
2372 /*
2373 * What lives where in the 32-bit address space in the various shadow modes,
2374 * and what it uses to get/maintain that mapping.
2376 * SHADOW MODE:      none            enable           translate         external
2378 * 4KB things:
2379 * guest_vtable      lin_l2          mapped per gl2   lin_l2 via hl2    mapped per gl2
2380 * shadow_vtable     n/a             sh_lin_l2        sh_lin_l2         mapped per gl2
2381 * hl2_vtable        n/a             n/a              lin_hl2 via hl2   mapped per gl2
2382 * monitor_vtable    n/a             n/a              n/a               mapped once
2384 * 4MB things:
2385 * guest_linear      lin via gl2     lin via gl2      lin via hl2       lin via hl2
2386 * shadow_linear     n/a             sh_lin via sl2   sh_lin via sl2    sh_lin via sl2
2387 * monitor_linear    n/a             n/a              n/a               ???
2388 * perdomain         perdomain       perdomain        perdomain         perdomain
2389 * R/O M2P           R/O M2P         R/O M2P          n/a               n/a
2390 * R/W M2P           R/W M2P         R/W M2P          R/W M2P           R/W M2P
2391 * P2M               n/a             n/a              R/O M2P           R/O M2P
2393 * NB:
2394 * update_pagetables(), shadow_update_pagetables(), shadow_mode_enable(),
2395 * shadow_l2_table(), shadow_hl2_table(), and alloc_monitor_pagetable()
2396 * all play a part in maintaining these mappings.
2397 */
2398 static void shadow_update_pagetables(struct vcpu *v)
2400 struct domain *d = v->domain;
2401 #if CONFIG_PAGING_LEVELS == 4
2402 unsigned long gmfn = ((v->arch.flags & TF_kernel_mode)?
2403 pagetable_get_pfn(v->arch.guest_table) :
2404 pagetable_get_pfn(v->arch.guest_table_user));
2405 #else
2406 unsigned long gmfn = pagetable_get_pfn(v->arch.guest_table);
2407 #endif
2409 unsigned long gpfn = mfn_to_gmfn(d, gmfn);
2410 unsigned long smfn, old_smfn;
2412 #if CONFIG_PAGING_LEVELS == 2
2413 unsigned long hl2mfn;
2414 #endif
2415 int need_sync = 0;
2417 int max_mode = ( shadow_mode_external(d) ? SHM_external
2418 : shadow_mode_translate(d) ? SHM_translate
2419 : shadow_mode_enabled(d) ? SHM_enable
2420 : 0 );
2422 ASSERT( ! IS_INVALID_M2P_ENTRY(gpfn) );
2423 ASSERT( max_mode );
2425 /*
2426 * arch.guest_vtable
2427 */
2428 if ( max_mode & (SHM_enable | SHM_external) )
2430 if ( likely(v->arch.guest_vtable != NULL) )
2431 unmap_domain_page_global(v->arch.guest_vtable);
2432 v->arch.guest_vtable = map_domain_page_global(gmfn);
2435 /*
2436 * arch.shadow_table
2437 */
2438 #if CONFIG_PAGING_LEVELS == 3 && defined (GUEST_PGENTRY_32)
2439 /*
2440 * We use PGT_l4_shadow for 2-level paging guests on PAE
2441 */
2442 if ( d->arch.ops->guest_paging_levels == PAGING_L2 )
2444 if ( unlikely(!(smfn = __shadow_status(d, gpfn, PGT_l4_shadow))) )
2445 smfn = shadow_l3_table(v, gpfn, gmfn);
2447 else
2448 #endif
2450 #if CONFIG_PAGING_LEVELS == 3 && defined ( GUEST_32PAE )
2451 /*
2452 * We use PGT_l4_shadow for 3-level (PAE) paging guests on PAE
2453 */
2454 if ( d->arch.ops->guest_paging_levels == PAGING_L3 )
2456 if ( unlikely(!(smfn = __shadow_status(d, gpfn, PGT_l4_shadow))) )
2457 smfn = shadow_l3_table(v, gpfn, gmfn);
2458 else
2460 update_top_level_shadow(v, smfn);
2461 need_sync = 1;
2464 else
2465 #endif
2466 if ( unlikely(!(smfn = __shadow_status(d, gpfn, PGT_base_page_table))) )
2468 #if CONFIG_PAGING_LEVELS == 2
2469 smfn = shadow_l2_table(v, gpfn, gmfn);
2470 #elif CONFIG_PAGING_LEVELS == 3
2471 smfn = shadow_l3_table(v, gpfn, gmfn);
2472 #elif CONFIG_PAGING_LEVELS == 4
2473 smfn = shadow_l4_table(v, gpfn, gmfn);
2474 #endif
2476 else
2478 #if CONFIG_PAGING_LEVELS >= 3
2479 if ( SH_GUEST_32PAE && d->arch.ops->guest_paging_levels == PAGING_L3 )
2480 update_top_level_shadow(v, smfn);
2481 #endif
2482 /*
2483 * Move the sync later in order to avoid this smfn being
2484 * unshadowed occasionally.
2485 */
2486 need_sync = 1;
2490 if ( !get_shadow_ref(smfn) )
2491 BUG();
2492 old_smfn = pagetable_get_pfn(v->arch.shadow_table);
2493 v->arch.shadow_table = pagetable_from_pfn(smfn);
2494 if ( old_smfn )
2495 put_shadow_ref(old_smfn);
2497 SH_VVLOG("shadow_update_pagetables(gmfn=%lx, smfn=%lx)", gmfn, smfn);
2499 /*
2500 * arch.shadow_vtable
2501 */
2502 if ( max_mode == SHM_external
2503 #if CONFIG_PAGING_LEVELS >=3
2504 || max_mode & SHM_enable
2505 #endif
2508 if ( v->arch.shadow_vtable )
2509 unmap_domain_page_global(v->arch.shadow_vtable);
2510 v->arch.shadow_vtable = map_domain_page_global(smfn);
2513 #if CONFIG_PAGING_LEVELS == 2
2514 /*
2515 * arch.hl2_vtable
2516 */
2518 // if max_mode == SHM_translate, then the hl2 is already installed
2519 // correctly in its smfn, and there's nothing to do.
2520 //
2521 if ( max_mode == SHM_external )
2523 if ( unlikely(!(hl2mfn = __shadow_status(d, gpfn, PGT_hl2_shadow))) )
2524 hl2mfn = shadow_hl2_table(d, gpfn, gmfn, smfn);
2525 if ( v->arch.hl2_vtable )
2526 unmap_domain_page_global(v->arch.hl2_vtable);
2527 v->arch.hl2_vtable = map_domain_page_global(hl2mfn);
2530 /*
2531 * fixup pointers in monitor table, as necessary
2532 */
2533 if ( max_mode == SHM_external )
2535 l2_pgentry_t *mpl2e = v->arch.monitor_vtable;
2536 l2_pgentry_t old_hl2e = mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)];
2537 l2_pgentry_t old_sl2e = mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)];
2539 ASSERT( shadow_mode_translate(d) );
2541 if ( !get_shadow_ref(hl2mfn) )
2542 BUG();
2543 mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)] =
2544 l2e_from_pfn(hl2mfn, __PAGE_HYPERVISOR);
2545 if ( l2e_get_flags(old_hl2e) & _PAGE_PRESENT )
2546 put_shadow_ref(l2e_get_pfn(old_hl2e));
2548 if ( !get_shadow_ref(smfn) )
2549 BUG();
2550 mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
2551 l2e_from_pfn(smfn, __PAGE_HYPERVISOR);
2552 if ( l2e_get_flags(old_sl2e) & _PAGE_PRESENT )
2553 put_shadow_ref(l2e_get_pfn(old_sl2e));
2555 // XXX - maybe this can be optimized somewhat??
2556 local_flush_tlb();
2558 #endif /* CONFIG_PAGING_LEVELS == 2 */
2560 #if CONFIG_PAGING_LEVELS == 3
2561 /*
2562 * fixup pointers in monitor table, as necessary
2563 */
2564 if ( max_mode == SHM_external )
2566 l3_pgentry_t *mpl3e = (l3_pgentry_t *) v->arch.monitor_vtable;
2567 l2_pgentry_t *spl2e;
2568 unsigned long s2mfn;
2569 int i;
2571 ASSERT( shadow_mode_translate(d) );
2572 s2mfn = l3e_get_pfn(mpl3e[L3_PAGETABLE_ENTRIES - 1]);
2574 ASSERT( s2mfn);
2575 spl2e = map_domain_page(s2mfn);
2577 for ( i = 0; i < (LINEARPT_MBYTES >> (L2_PAGETABLE_SHIFT - 20)); i++ )
2578 spl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START) + i] =
2579 (l3e_get_flags(mpl3e[i]) & _PAGE_PRESENT) ?
2580 l2e_from_pfn(l3e_get_pfn(mpl3e[i]), __PAGE_HYPERVISOR) :
2581 l2e_empty();
2583 unmap_domain_page(spl2e);
2584 local_flush_tlb();
2586 #endif
2588 if ( likely(need_sync) )
2589 shadow_sync_all(d);
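/*
 * A minimal sketch of the reference-swap pattern used above when switching
 * v->arch.shadow_table: take a reference on the new shadow root *before*
 * dropping the one on the old root, so a table that may still be in use can
 * never reach a zero refcount.  sk_refcount_t and the increments below are
 * simplified stand-ins for get_shadow_ref()/put_shadow_ref().
 */
typedef struct { unsigned long count; } sk_refcount_t;

static void sk_swap_root(sk_refcount_t **current_root, sk_refcount_t *new_root)
{
    sk_refcount_t *old_root = *current_root;

    new_root->count++;           /* get_shadow_ref(smfn): acquire first      */
    *current_root = new_root;    /* install the new root                     */
    if ( old_root )
        old_root->count--;       /* put_shadow_ref(old_smfn): release last   */
}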
2593 /************************************************************************/
2594 /************************************************************************/
2595 /************************************************************************/
2597 #if 0 // this code has not been updated for 32pae & 64 bit modes
2598 #if SHADOW_DEBUG
2600 // The following is entirely for _check_pagetable()'s benefit.
2601 // _check_pagetable() wants to know whether a given entry in a
2602 // shadow page table is supposed to be the shadow of the guest's
2603 // current entry, or the shadow of the entry held in the snapshot
2604 // taken above.
2605 //
2606 // Here, we mark all currently existing entries as reflecting
2607 // the snapshot, above. All other places in xen that update
2608 // the shadow will keep the shadow in sync with the guest's
2609 // entries (via l1pte_propagate_from_guest and friends), which clear
2610 // the SHADOW_REFLECTS_SNAPSHOT bit.
2611 //
2612 static void
2613 mark_shadows_as_reflecting_snapshot(struct domain *d, unsigned long gpfn)
2615 unsigned long smfn;
2616 l1_pgentry_t *l1e;
2617 l2_pgentry_t *l2e;
2618 unsigned i;
2620 if ( (smfn = __shadow_status(d, gpfn, PGT_l1_shadow)) )
2622 l1e = map_domain_page(smfn);
2623 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
2624 if ( is_guest_l1_slot(i) &&
2625 (l1e_get_flags(l1e[i]) & _PAGE_PRESENT) )
2626 l1e_add_flags(l1e[i], SHADOW_REFLECTS_SNAPSHOT);
2627 unmap_domain_page(l1e);
2630 if ( (smfn = __shadow_status(d, gpfn, PGT_l2_shadow)) )
2632 l2e = map_domain_page(smfn);
2633 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
2634 if ( is_guest_l2_slot(0, i) &&
2635 (l2e_get_flags(l2e[i]) & _PAGE_PRESENT) )
2636 l2e_add_flags(l2e[i], SHADOW_REFLECTS_SNAPSHOT);
2637 unmap_domain_page(l2e);
2641 // BUG: these are not SMP safe...
2642 static int sh_l2_present;
2643 static int sh_l1_present;
2644 static char *sh_check_name;
2645 // int shadow_status_noswap; // declared in shadow32.c
2647 #define v2m(_v, _adr) ({ \
2648 unsigned long _a = (unsigned long)(_adr); \
2649 l2_pgentry_t _pde = shadow_linear_l2_table(_v)[l2_table_offset(_a)]; \
2650 unsigned long _pa = -1; \
2651 if ( l2e_get_flags(_pde) & _PAGE_PRESENT ) \
2652 { \
2653 l1_pgentry_t _pte; \
2654 _pte = shadow_linear_pg_table[l1_linear_offset(_a)]; \
2655 if ( l1e_get_flags(_pte) & _PAGE_PRESENT ) \
2656 _pa = l1e_get_paddr(_pte); \
2657 } \
2658 _pa | (_a & ~PAGE_MASK); \
2659 })
2661 #define FAIL(_f, _a...) \
2662 do { \
2663 printk("XXX %s-FAIL (%d,%d,%d) " _f " at %s(%d)\n", \
2664 sh_check_name, level, l2_idx, l1_idx, ## _a, \
2665 __FILE__, __LINE__); \
2666 printk("guest_pte=%" PRIpte " eff_guest_pte=%" PRIpte \
2667 " shadow_pte=%" PRIpte " snapshot_pte=%" PRIpte \
2668 " &guest=%p &shadow=%p &snap=%p v2m(&guest)=%p" \
2669 " v2m(&shadow)=%p v2m(&snap)=%p ea=%08x\n", \
2670 l1e_get_intpte(guest_pte), l1e_get_intpte(eff_guest_pte), \
2671 l1e_get_intpte(shadow_pte), l1e_get_intpte(snapshot_pte), \
2672 p_guest_pte, p_shadow_pte, p_snapshot_pte, \
2673 (void *)v2m(v, p_guest_pte), (void *)v2m(v, p_shadow_pte), \
2674 (void *)v2m(v, p_snapshot_pte), \
2675 (l2_idx << L2_PAGETABLE_SHIFT) | \
2676 (l1_idx << L1_PAGETABLE_SHIFT)); \
2677 errors++; \
2678 } while ( 0 )
2680 static int check_pte(
2681 struct vcpu *v,
2682 l1_pgentry_t *p_guest_pte,
2683 l1_pgentry_t *p_shadow_pte,
2684 l1_pgentry_t *p_snapshot_pte,
2685 int level, int l2_idx, int l1_idx)
2687 struct domain *d = v->domain;
2688 l1_pgentry_t guest_pte = *p_guest_pte;
2689 l1_pgentry_t shadow_pte = *p_shadow_pte;
2690 l1_pgentry_t snapshot_pte = p_snapshot_pte ? *p_snapshot_pte : l1e_empty();
2691 l1_pgentry_t eff_guest_pte;
2692 unsigned long mask, eff_guest_pfn, eff_guest_mfn, shadow_mfn;
2693 int errors = 0, guest_writable;
2694 int page_table_page;
2696 if ( (l1e_get_intpte(shadow_pte) == 0) ||
2697 (l1e_get_intpte(shadow_pte) == 0xdeadface) ||
2698 (l1e_get_intpte(shadow_pte) == 0x00000E00) )
2699 return errors; /* always safe */
2701 if ( !(l1e_get_flags(shadow_pte) & _PAGE_PRESENT) )
2702 FAIL("Non zero not present shadow_pte");
2704 if ( level == 2 ) sh_l2_present++;
2705 if ( level == 1 ) sh_l1_present++;
2707 if ( (l1e_get_flags(shadow_pte) & SHADOW_REFLECTS_SNAPSHOT) && p_snapshot_pte )
2708 eff_guest_pte = snapshot_pte;
2709 else
2710 eff_guest_pte = guest_pte;
2712 if ( !(l1e_get_flags(eff_guest_pte) & _PAGE_PRESENT) )
2713 FAIL("Guest not present yet shadow is");
2715 mask = ~(_PAGE_GLOBAL|_PAGE_DIRTY|_PAGE_ACCESSED|_PAGE_RW|_PAGE_AVAIL|PAGE_MASK);
2717 if ( ((l1e_get_intpte(shadow_pte) & mask) != (l1e_get_intpte(eff_guest_pte) & mask)) )
2718 FAIL("Corrupt?");
2720 if ( (level == 1) &&
2721 (l1e_get_flags(shadow_pte) & _PAGE_DIRTY) &&
2722 !(l1e_get_flags(eff_guest_pte) & _PAGE_DIRTY) )
2723 FAIL("Dirty coherence");
2725 if ( (l1e_get_flags(shadow_pte) & _PAGE_ACCESSED) &&
2726 !(l1e_get_flags(eff_guest_pte) & _PAGE_ACCESSED) )
2727 FAIL("Accessed coherence");
2729 if ( l1e_get_flags(shadow_pte) & _PAGE_GLOBAL )
2730 FAIL("global bit set in shadow");
2732 eff_guest_pfn = l1e_get_pfn(eff_guest_pte);
2733 eff_guest_mfn = gmfn_to_mfn(d, eff_guest_pfn);
2734 shadow_mfn = l1e_get_pfn(shadow_pte);
2736 if ( !VALID_MFN(eff_guest_mfn) && !shadow_mode_refcounts(d) )
2737 FAIL("%s: invalid eff_guest_pfn=%lx eff_guest_pte=%" PRIpte "\n",
2738 __func__, eff_guest_pfn, l1e_get_intpte(eff_guest_pte));
2740 page_table_page = mfn_is_page_table(eff_guest_mfn);
2742 guest_writable =
2743 (l1e_get_flags(eff_guest_pte) & _PAGE_RW) ||
2744 (shadow_mode_write_l1(d) && (level == 1) && mfn_out_of_sync(eff_guest_mfn));
2746 if ( (l1e_get_flags(shadow_pte) & _PAGE_RW ) && !guest_writable )
2748 printk("eff_guest_pfn=%lx eff_guest_mfn=%lx shadow_mfn=%lx t=0x%08lx page_table_page=%d\n",
2749 eff_guest_pfn, eff_guest_mfn, shadow_mfn,
2750 mfn_to_page(eff_guest_mfn)->u.inuse.type_info,
2751 page_table_page);
2752 FAIL("RW coherence");
2755 if ( (level == 1) &&
2756 (l1e_get_flags(shadow_pte) & _PAGE_RW ) &&
2757 !(guest_writable && (l1e_get_flags(eff_guest_pte) & _PAGE_DIRTY)) )
2759 printk("eff_guest_pfn=%lx eff_guest_mfn=%lx shadow_mfn=%lx t=0x%08lx page_table_page=%d\n",
2760 eff_guest_pfn, eff_guest_mfn, shadow_mfn,
2761 mfn_to_page(eff_guest_mfn)->u.inuse.type_info,
2762 page_table_page);
2763 FAIL("RW2 coherence");
2766 if ( eff_guest_mfn == shadow_mfn )
2768 if ( level > 1 )
2769 FAIL("Linear map ???"); /* XXX this will fail on BSD */
2771 else
2773 if ( level < 2 )
2774 FAIL("Shadow in L1 entry?");
2776 if ( level == 2 )
2778 if ( __shadow_status(d, eff_guest_pfn, PGT_l1_shadow) != shadow_mfn )
2779 FAIL("shadow_mfn problem eff_guest_pfn=%lx shadow_mfn=%lx", eff_guest_pfn,
2780 __shadow_status(d, eff_guest_pfn, PGT_l1_shadow));
2782 else
2783 BUG(); // XXX -- not handled yet.
2786 return errors;
2788 #undef FAIL
2789 #undef v2m
2791 static int check_l1_table(
2792 struct vcpu *v, unsigned long gpfn,
2793 unsigned long gmfn, unsigned long smfn, unsigned l2_idx)
2795 struct domain *d = v->domain;
2796 int i;
2797 unsigned long snapshot_mfn;
2798 l1_pgentry_t *p_guest, *p_shadow, *p_snapshot = NULL;
2799 int errors = 0;
2801 if ( page_out_of_sync(mfn_to_page(gmfn)) )
2803 snapshot_mfn = __shadow_status(d, gpfn, PGT_snapshot);
2804 ASSERT(snapshot_mfn);
2805 p_snapshot = map_domain_page(snapshot_mfn);
2808 p_guest = map_domain_page(gmfn);
2809 p_shadow = map_domain_page(smfn);
2811 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
2812 errors += check_pte(v, p_guest+i, p_shadow+i,
2813 p_snapshot ? p_snapshot+i : NULL,
2814 1, l2_idx, i);
2816 unmap_domain_page(p_shadow);
2817 unmap_domain_page(p_guest);
2818 if ( p_snapshot )
2819 unmap_domain_page(p_snapshot);
2821 return errors;
2824 #define FAILPT(_f, _a...) \
2825 do { \
2826 printk("XXX FAIL %s-PT " _f "\n", sh_check_name, ## _a ); \
2827 errors++; \
2828 } while ( 0 )
2830 static int check_l2_table(
2831 struct vcpu *v, unsigned long gmfn, unsigned long smfn, int oos_pdes)
2833 struct domain *d = v->domain;
2834 l2_pgentry_t *gpl2e = (l2_pgentry_t *)map_domain_page(gmfn);
2835 l2_pgentry_t *spl2e = (l2_pgentry_t *)map_domain_page(smfn);
2836 l2_pgentry_t match;
2837 int i;
2838 int errors = 0;
2839 int limit;
2841 if ( !oos_pdes && (page_get_owner(mfn_to_page(gmfn)) != d) )
2842 FAILPT("domain doesn't own page");
2843 if ( oos_pdes && (page_get_owner(mfn_to_page(gmfn)) != NULL) )
2844 FAILPT("bogus owner for snapshot page");
2845 if ( page_get_owner(mfn_to_page(smfn)) != NULL )
2846 FAILPT("shadow page mfn=0x%lx is owned by someone, domid=%d",
2847 smfn, page_get_owner(mfn_to_page(smfn))->domain_id);
2849 #if 0
2850 if ( memcmp(&spl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
2851 &gpl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
2852 ((SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT) -
2853 DOMAIN_ENTRIES_PER_L2_PAGETABLE) * sizeof(l2_pgentry_t)) )
2855 for ( i = DOMAIN_ENTRIES_PER_L2_PAGETABLE;
2856 i < (SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT);
2857 i++ )
2858 printk("+++ (%d) %lx %lx\n",i,
2859 l2_pgentry_val(gpl2e[i]), l2_pgentry_val(spl2e[i]));
2860 FAILPT("hypervisor entries inconsistent");
2863 if ( (l2_pgentry_val(spl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT]) !=
2864 l2_pgentry_val(gpl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT])) )
2865 FAILPT("hypervisor linear map inconsistent");
2866 #endif
2868 match = l2e_from_pfn(smfn, __PAGE_HYPERVISOR);
2869 if ( !shadow_mode_external(d) &&
2870 l2e_has_changed(spl2e[SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT],
2871 match, PAGE_FLAG_MASK))
2873 FAILPT("hypervisor shadow linear map inconsistent %" PRIpte " %" PRIpte,
2874 l2e_get_intpte(spl2e[SH_LINEAR_PT_VIRT_START >>
2875 L2_PAGETABLE_SHIFT]),
2876 l2e_get_intpte(match));
2879 match = l2e_from_paddr(__pa(d->arch.mm_perdomain_pt), __PAGE_HYPERVISOR);
2880 if ( !shadow_mode_external(d) &&
2881 l2e_has_changed(spl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT],
2882 match, PAGE_FLAG_MASK))
2884 FAILPT("hypervisor per-domain map inconsistent saw %" PRIpte ", expected (va=%p) %" PRIpte,
2885 l2e_get_intpte(spl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT]),
2886 d->arch.mm_perdomain_pt,
2887 l2e_get_intpte(match));
2890 #if CONFIG_PAGING_LEVELS == 2
2891 if ( shadow_mode_external(d) )
2892 limit = L2_PAGETABLE_ENTRIES;
2893 else
2894 limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE;
2895 #else
2896 limit = 0; /* XXX x86/64 XXX */
2897 #endif
2899 /* Check the whole L2. */
2900 for ( i = 0; i < limit; i++ )
2901 errors += check_pte(v,
2902 (l1_pgentry_t*)(&gpl2e[i]), /* Hmm, dirty ... */
2903 (l1_pgentry_t*)(&spl2e[i]),
2904 NULL,
2905 2, i, 0);
2907 unmap_domain_page(spl2e);
2908 unmap_domain_page(gpl2e);
2910 #if 1
2911 if ( errors )
2912 printk("check_l2_table returning %d errors\n", errors);
2913 #endif
2915 return errors;
2917 #undef FAILPT
2919 int _check_pagetable(struct vcpu *v, char *s)
2921 struct domain *d = v->domain;
2922 #if CONFIG_PAGING_LEVELS == 4
2923 pagetable_t pt = ((v->arch.flags & TF_kernel_mode)?
2924 v->arch.guest_table : v->arch.guest_table_user);
2925 #else
2926 pagetable_t pt = v->arch.guest_table;
2927 #endif
2928 unsigned long gptbase = pagetable_get_paddr(pt);
2929 unsigned long ptbase_pfn, smfn;
2930 unsigned long i;
2931 l2_pgentry_t *gpl2e, *spl2e;
2932 unsigned long ptbase_mfn = 0;
2933 int errors = 0, limit, oos_pdes = 0;
2935 //_audit_domain(d, AUDIT_QUIET);
2936 shadow_lock(d);
2938 sh_check_name = s;
2939 //SH_VVLOG("%s-PT Audit", s);
2940 sh_l2_present = sh_l1_present = 0;
2941 perfc_incrc(check_pagetable);
2943 ptbase_mfn = gptbase >> PAGE_SHIFT;
2944 ptbase_pfn = mfn_to_gmfn(d, ptbase_mfn);
2946 if ( !(smfn = __shadow_status(d, ptbase_pfn, PGT_base_page_table)) )
2948 printk("%s-PT %lx not shadowed\n", s, gptbase);
2949 goto out;
2951 if ( page_out_of_sync(mfn_to_page(ptbase_mfn)) )
2953 ptbase_mfn = __shadow_status(d, ptbase_pfn, PGT_snapshot);
2954 oos_pdes = 1;
2955 ASSERT(ptbase_mfn);
2958 errors += check_l2_table(v, ptbase_mfn, smfn, oos_pdes);
2960 gpl2e = (l2_pgentry_t *) map_domain_page(ptbase_mfn);
2961 spl2e = (l2_pgentry_t *) map_domain_page(smfn);
2963 /* Go back and recurse. */
2964 #if CONFIG_PAGING_LEVELS == 2
2965 if ( shadow_mode_external(d) )
2966 limit = L2_PAGETABLE_ENTRIES;
2967 else
2968 limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE;
2969 #else
2970 limit = 0; /* XXX x86/64 XXX */
2971 #endif
2973 for ( i = 0; i < limit; i++ )
2975 unsigned long gl1pfn = l2e_get_pfn(gpl2e[i]);
2976 unsigned long gl1mfn = gmfn_to_mfn(d, gl1pfn);
2977 unsigned long sl1mfn = l2e_get_pfn(spl2e[i]);
2979 if ( l2e_get_intpte(spl2e[i]) != 0 ) /* FIXME: check flags? */
2981 errors += check_l1_table(v, gl1pfn, gl1mfn, sl1mfn, i);
2985 unmap_domain_page(spl2e);
2986 unmap_domain_page(gpl2e);
2988 #if 0
2989 SH_VVLOG("PT verified : l2_present = %d, l1_present = %d",
2990 sh_l2_present, sh_l1_present);
2991 #endif
2993 out:
2994 if ( errors )
2995 BUG();
2997 shadow_unlock(d);
2999 return errors;
3002 int _check_all_pagetables(struct vcpu *v, char *s)
3004 struct domain *d = v->domain;
3005 int i;
3006 struct shadow_status *a;
3007 unsigned long gmfn;
3008 int errors = 0;
3010 shadow_status_noswap = 1;
3012 sh_check_name = s;
3013 SH_VVLOG("%s-PT Audit domid=%d", s, d->domain_id);
3014 sh_l2_present = sh_l1_present = 0;
3015 perfc_incrc(check_all_pagetables);
3017 for (i = 0; i < shadow_ht_buckets; i++)
3019 a = &d->arch.shadow_ht[i];
3020 while ( a && a->gpfn_and_flags )
3022 gmfn = gmfn_to_mfn(d, a->gpfn_and_flags & PGT_mfn_mask);
3024 switch ( a->gpfn_and_flags & PGT_type_mask )
3026 case PGT_l1_shadow:
3027 errors += check_l1_table(v, a->gpfn_and_flags & PGT_mfn_mask,
3028 gmfn, a->smfn, 0);
3029 break;
3030 case PGT_l2_shadow:
3031 errors += check_l2_table(v, gmfn, a->smfn,
3032 page_out_of_sync(mfn_to_page(gmfn)));
3033 break;
3034 case PGT_l3_shadow:
3035 case PGT_l4_shadow:
3036 case PGT_hl2_shadow:
3037 BUG(); // XXX - ought to fix this...
3038 break;
3039 case PGT_snapshot:
3040 case PGT_writable_pred:
3041 break;
3042 default:
3043 errors++;
3044 printk("unexpected shadow type %lx, gpfn=%lx, "
3045 "gmfn=%lx smfn=%lx\n",
3046 a->gpfn_and_flags & PGT_type_mask,
3047 a->gpfn_and_flags & PGT_mfn_mask,
3048 gmfn, a->smfn);
3049 BUG();
3051 a = a->next;
3055 shadow_status_noswap = 0;
3057 if ( errors )
3058 BUG();
3060 return errors;
3063 #endif // SHADOW_DEBUG
3064 #endif // this code has not been updated for 32pae & 64 bit modes
3066 #if CONFIG_PAGING_LEVELS >= 3
3067 /****************************************************************************/
3068 /* 64-bit shadow-mode code testing */
3069 /****************************************************************************/
3070 /*
3071 * init_bl2() is for 32-bit VMX guest on 64-bit host
3072 * Using 1 shadow L4(l3) and 4 shadow L2s to simulate guest L2
3073 */
3074 static inline unsigned long init_bl2(
3075 struct domain *d, unsigned long gpfn, unsigned long gmfn)
3077 unsigned int count;
3078 unsigned long sl2mfn;
3079 unsigned long smfn;
3080 struct page_info *page;
3081 l4_pgentry_t *spl4e;
3082 void *l2;
3084 if ( unlikely(!(smfn = alloc_shadow_page(d, gpfn, gmfn, PGT_l4_shadow))) )
3086 printk("Couldn't alloc an L4 shadow for pfn=%lx mfn=%lx\n", gpfn, gmfn);
3087 /* XXX Deal gracefully with failure. */
3088 domain_crash_synchronous();
3091 spl4e = (l4_pgentry_t *)map_domain_page(smfn);
3093 /* Map the self entry, L4&L3 share the same page */
3094 spl4e[PAE_SHADOW_SELF_ENTRY] = l4e_from_pfn(smfn, __PAGE_HYPERVISOR);
3096 /* Allocate 4 shadow L2s */
3097 page = alloc_domheap_pages(NULL, SL2_ORDER, 0);
3098 if ( !page )
3099 domain_crash_synchronous();
3101 for ( count = 0; count < PAE_L3_PAGETABLE_ENTRIES; count++ )
3103 sl2mfn = page_to_mfn(page+count);
3104 l2 = map_domain_page(sl2mfn);
3105 memset(l2, 0, PAGE_SIZE);
3106 unmap_domain_page(l2);
3107 spl4e[count] = l4e_from_pfn(sl2mfn, _PAGE_PRESENT);
3110 unmap_domain_page(spl4e);
3112 return smfn;
3115 static inline unsigned long init_l3(
3116 struct vcpu *v, unsigned long gpfn, unsigned long gmfn)
3118 unsigned long smfn;
3119 l4_pgentry_t *spl4e;
3120 unsigned long index;
3122 if ( unlikely(!(smfn = alloc_shadow_page(v->domain, gpfn, gmfn, PGT_l4_shadow))) )
3124 printk("Couldn't alloc an L4 shadow for pfn= %lx mfn= %lx\n", gpfn, gmfn);
3125 BUG(); /* XXX Deal gracefully with failure. */
3128 /* Map the self entry, L4&L3 share the same page */
3129 spl4e = (l4_pgentry_t *)map_domain_page(smfn);
3131 /*
3132 * The shadow L4's page_info->tlbflush_timestamp
3133 * should also save its own index.
3134 */
3136 index = get_cr3_idxval(v);
3137 frame_table[smfn].tlbflush_timestamp = index;
3139 memset(spl4e, 0, L4_PAGETABLE_ENTRIES*sizeof(l4_pgentry_t));
3140 spl4e[PAE_SHADOW_SELF_ENTRY] = l4e_from_pfn(smfn, __PAGE_HYPERVISOR);
3141 unmap_domain_page(spl4e);
3142 return smfn;
3144 #endif
3146 #if CONFIG_PAGING_LEVELS == 3
3147 static unsigned long shadow_l3_table(
3148 struct vcpu *v, unsigned long gpfn, unsigned long gmfn)
3150 unsigned long smfn;
3151 l3_pgentry_t *spl3e;
3152 struct domain *d = v->domain;
3154 perfc_incrc(shadow_l3_table_count);
3156 SH_VVLOG("shadow_l3_table(gpfn=%lx, gmfn=%lx)", gpfn, gmfn);
3158 if ( SH_L1_HAS_NEXT_PAGE &&
3159 d->arch.ops->guest_paging_levels == PAGING_L2 )
3161 return init_bl2(d, gpfn, gmfn);
3164 if ( SH_GUEST_32PAE &&
3165 d->arch.ops->guest_paging_levels == PAGING_L3 )
3167 return init_l3(v, gpfn, gmfn);
3170 if ( unlikely(!(smfn = alloc_shadow_page(d, gpfn, gmfn, PGT_l3_shadow))) )
3172 printk("Couldn't alloc an L3 shadow for pfn=%lx mfn=%lx\n", gpfn, gmfn);
3173 BUG(); /* XXX Deal gracefully with failure. */
3176 spl3e = (l3_pgentry_t *)map_domain_page(smfn);
3178 /* Make the self entry */
3179 spl3e[PAE_SHADOW_SELF_ENTRY] = l3e_from_pfn(smfn, __PAGE_HYPERVISOR);
3181 if ( (PGT_base_page_table == PGT_l3_page_table) &&
3182 !shadow_mode_external(d) ) {
3183 int i;
3184 unsigned long g2mfn, s2mfn;
3185 l2_pgentry_t *spl2e;
3186 l3_pgentry_t *gpl3e;
3188 /* Get the top entry */
3189 gpl3e = (l3_pgentry_t *)map_domain_page(gmfn);
3191 if ( !(l3e_get_flags(gpl3e[L3_PAGETABLE_ENTRIES - 1]) & _PAGE_PRESENT) )
3193 BUG();
3196 g2mfn = l3e_get_pfn(gpl3e[L3_PAGETABLE_ENTRIES - 1]);
3198 /* NB. g2mfn should be the same as g2pfn */
3199 if (!(s2mfn = __shadow_status(d, g2mfn, PGT_l2_shadow))) {
3200 if ( unlikely(!(s2mfn =
3201 alloc_shadow_page(d, g2mfn, g2mfn, PGT_l2_shadow))) ) {
3202 printk("Couldn't alloc an L2 shadow for pfn=%lx mfn=%lx\n",
3203 g2mfn, g2mfn);
3204 BUG(); /* XXX Deal gracefully with failure. */
3208 if (!get_shadow_ref(s2mfn))
3209 BUG();
3211 /* Map shadow L2 into shadow L3 */
3212 spl3e[L3_PAGETABLE_ENTRIES - 1] = l3e_from_pfn(s2mfn, _PAGE_PRESENT);
3213 shadow_update_min_max(smfn, L3_PAGETABLE_ENTRIES -1);
3215 /*
3216 * Xen private mappings. Do the similar things as
3217 * create_pae_xen_mappings().
3218 */
3219 spl2e = (l2_pgentry_t *)map_domain_page(s2mfn);
3221 /*
3222 * When we free L2 pages, we need to tell if the page contains
3223 * Xen private mappings. Use the va_mask part.
3224 */
3225 mfn_to_page(s2mfn)->u.inuse.type_info |=
3226 (unsigned long) 3 << PGT_score_shift;
3228 memset(spl2e, 0,
3229 (L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1)) * sizeof(l2_pgentry_t));
3231 memcpy(&spl2e[L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1)],
3232 &idle_pg_table_l2[L2_PAGETABLE_FIRST_XEN_SLOT],
3233 L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
3235 for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
3236 spl2e[l2_table_offset(PERDOMAIN_VIRT_START) + i] =
3237 l2e_from_page(
3238 virt_to_page(page_get_owner(mfn_to_page(gmfn))->arch.mm_perdomain_pt) + i,
3239 __PAGE_HYPERVISOR);
3240 for ( i = 0; i < (LINEARPT_MBYTES >> (L2_PAGETABLE_SHIFT - 20)); i++ )
3241 spl2e[l2_table_offset(LINEAR_PT_VIRT_START) + i] =
3242 (l3e_get_flags(gpl3e[i]) & _PAGE_PRESENT) ?
3243 l2e_from_pfn(l3e_get_pfn(gpl3e[i]), __PAGE_HYPERVISOR) :
3244 l2e_empty();
3246 unmap_domain_page(spl2e);
3247 unmap_domain_page(gpl3e);
3249 unmap_domain_page(spl3e);
3251 return smfn;
3253 #endif /* CONFIG_PAGING_LEVELS == 3 */
3255 #if (!defined(GUEST_PGENTRY_32) && !defined(GUEST_32PAE))
3256 static unsigned long gva_to_gpa_pae(unsigned long gva)
3258 BUG();
3259 return 43;
3261 #endif
3263 #if CONFIG_PAGING_LEVELS == 4
3264 static unsigned long shadow_l4_table(
3265 struct vcpu *v, unsigned long gpfn, unsigned long gmfn)
3267 unsigned long smfn;
3268 l4_pgentry_t *spl4e;
3269 struct domain *d = v->domain;
3271 SH_VVLOG("shadow_l4_table(gpfn=%lx, gmfn=%lx)", gpfn, gmfn);
3273 perfc_incrc(shadow_l4_table_count);
3275 if ( d->arch.ops->guest_paging_levels == PAGING_L2 )
3277 return init_bl2(d, gpfn, gmfn);
3280 if ( SH_GUEST_32PAE && d->arch.ops->guest_paging_levels == PAGING_L3 )
3282 return init_l3(v, gpfn, gmfn);
3285 if ( unlikely(!(smfn = alloc_shadow_page(d, gpfn, gmfn, PGT_l4_shadow))) )
3287 printk("Couldn't alloc an L4 shadow for pfn=%lx mfn=%lx\n", gpfn, gmfn);
3288 BUG(); /* XXX Deal gracefully with failure. */
3291 spl4e = (l4_pgentry_t *)map_domain_page(smfn);
3293 /* Install hypervisor and 4x linear p.t. mappings. */
3294 if ( (PGT_base_page_table == PGT_l4_page_table) &&
3295 !shadow_mode_external(d) )
3297 /*
3298 * We could proactively fill in PDEs for pages that are already
3299 * shadowed *and* where the guest PDE has _PAGE_ACCESSED set
3300 * (restriction required for coherence of the accessed bit). However,
3301 * we tried it and it didn't help performance. This is simpler.
3302 */
3303 memset(spl4e, 0, L4_PAGETABLE_ENTRIES*sizeof(l4_pgentry_t));
3305 /* Install hypervisor and 2x linear p.t. mappings. */
3306 memcpy(&spl4e[ROOT_PAGETABLE_FIRST_XEN_SLOT],
3307 &idle_pg_table[ROOT_PAGETABLE_FIRST_XEN_SLOT],
3308 ROOT_PAGETABLE_XEN_SLOTS * sizeof(l4_pgentry_t));
3310 spl4e[l4_table_offset(PERDOMAIN_VIRT_START)] =
3311 l4e_from_paddr(__pa(page_get_owner(mfn_to_page(gmfn))->arch.mm_perdomain_l3),
3312 __PAGE_HYPERVISOR);
3314 if ( shadow_mode_translate(d) ) // NB: not external
3316 spl4e[l4_table_offset(RO_MPT_VIRT_START)] =
3317 l4e_from_paddr(pagetable_get_paddr(d->arch.phys_table),
3318 __PAGE_HYPERVISOR);
3320 else
3321 spl4e[l4_table_offset(LINEAR_PT_VIRT_START)] =
3322 l4e_from_pfn(gmfn, __PAGE_HYPERVISOR);
3324 } else
3325 memset(spl4e, 0, L4_PAGETABLE_ENTRIES*sizeof(l4_pgentry_t));
3327 unmap_domain_page(spl4e);
3329 ESH_LOG("shadow_l4_table(%lx -> %lx)", gmfn, smfn);
3330 return smfn;
3332 #endif /* CONFIG_PAGING_LEVELS == 4 */
3334 #if CONFIG_PAGING_LEVELS >= 3
3335 static void
3336 update_top_level_shadow(struct vcpu *v, unsigned long smfn)
3338 unsigned long index = get_cr3_idxval(v);
3339 pgentry_64_t *sple = (pgentry_64_t *)map_domain_page(smfn);
3340 pgentry_64_t *gple = (pgentry_64_t *)&v->arch.guest_vtable;
3341 int i;
3343 for ( i = 0; i < PAE_L3_PAGETABLE_ENTRIES; i++ )
3345 unsigned long gpfn;
3347 /*
3348 * Looks like it's no longer a page table.
3349 */
3350 if ( unlikely(entry_get_value(gple[index*4+i]) & PAE_PDPT_RESERVED) )
3352 if ( entry_get_flags(sple[i]) & _PAGE_PRESENT )
3353 put_shadow_ref(entry_get_pfn(sple[i]));
3355 sple[i] = entry_empty();
3356 continue;
3359 gpfn = entry_get_pfn(gple[index*4+i]);
3361 if ( unlikely(gpfn != (gpfn & PGT_mfn_mask)) )
3363 if ( entry_get_flags(sple[i]) & _PAGE_PRESENT )
3364 put_shadow_ref(entry_get_pfn(sple[i]));
3366 sple[i] = entry_empty();
3367 continue;
3370 validate_entry_change(
3371 v->domain, &gple[index*4+i], &sple[i], PAGING_L3);
3374 unmap_domain_page(sple);
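/*
 * A minimal sketch of the slot arithmetic above: a PAE guest has four PDPT
 * entries per CR3 value, and several CR3 values can share one guest top
 * level, so entry i of the top level selected by cr3_idx lives at slot
 * cr3_idx * 4 + i, matching gple[index*4+i].  SK_PAE_L3_ENTRIES mirrors
 * PAE_L3_PAGETABLE_ENTRIES and is assumed to be 4.
 */
#define SK_PAE_L3_ENTRIES 4

static unsigned int sk_guest_pdpt_slot(unsigned int cr3_idx, unsigned int i)
{
    return cr3_idx * SK_PAE_L3_ENTRIES + i;   /* i in [0, 3] */
}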
3377 /*
3378 * validate_bl2e_change()
3379 * This code is for a 32-bit HVM guest on a 64-bit host;
3380 * it syncs the guest L2.
3381 */
3383 static inline void
3384 validate_bl2e_change(
3385 struct domain *d,
3386 guest_root_pgentry_t *new_gle_p,
3387 pgentry_64_t *shadow_l3,
3388 int index)
3390 int sl3_idx, sl2_idx;
3391 unsigned long sl2mfn, sl1mfn;
3392 pgentry_64_t *sl2_p;
3394 /* Use the guest L2 PTE index to derive the shadow L3 and L2 indices.
3395 * index: 0 ~ 1023, PAGETABLE_ENTRIES: 512; see the sketch after this
3396 * function. */
3397 sl3_idx = index / (PAGETABLE_ENTRIES / 2);
3398 sl2_idx = (index % (PAGETABLE_ENTRIES / 2)) * 2;
3400 sl2mfn = entry_get_pfn(shadow_l3[sl3_idx]);
3401 sl2_p = (pgentry_64_t *)map_domain_page(sl2mfn);
3403 validate_pde_change(
3404 d, *(guest_l2_pgentry_t *)new_gle_p, (l2_pgentry_t *)&sl2_p[sl2_idx]);
3406 /* Mapping the second l1 shadow page */
3407 if (entry_get_flags(sl2_p[sl2_idx]) & _PAGE_PRESENT) {
3408 sl1mfn = entry_get_pfn(sl2_p[sl2_idx]);
3409 sl2_p[sl2_idx + 1] =
3410 entry_from_pfn(sl1mfn + 1, entry_get_flags(sl2_p[sl2_idx]));
3412 else
3413 sl2_p[sl2_idx + 1] = (pgentry_64_t){0};
3414 unmap_domain_page(sl2_p);
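/*
 * A minimal standalone sketch of the index arithmetic above, assuming
 * PAGETABLE_ENTRIES == 512: a 32-bit guest L2 has 1024 entries covering 4MB
 * each, while the shadow uses 512-entry L2 pages of 2MB entries, so guest
 * index 'index' (0..1023) selects shadow L3 slot index/256 and, within that
 * shadow L2 page, the even slot (index%256)*2 plus the slot right after it
 * (one guest 4MB entry becomes two 2MB shadow entries).
 */
#define SK_PAGETABLE_ENTRIES 512

static void sk_bl2_index(int index, int *sl3_idx, int *sl2_idx)
{
    *sl3_idx = index / (SK_PAGETABLE_ENTRIES / 2);        /* 0..3           */
    *sl2_idx = (index % (SK_PAGETABLE_ENTRIES / 2)) * 2;  /* 0, 2, ..., 510 */
}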
3418 /*
3419 * This variant of shadow_mark_va_out_of_sync() is for 2M page shadows.
3420 */
3421 static void shadow_mark_va_out_of_sync_2mp(
3422 struct vcpu *v, unsigned long gpfn, unsigned long mfn, paddr_t writable_pl1e)
3424 struct out_of_sync_entry *entry =
3425 shadow_mark_mfn_out_of_sync(v, gpfn, mfn);
3427 entry->writable_pl1e = writable_pl1e;
3428 ESH_LOG("<shadow_mark_va_out_of_sync_2mp> gpfn = %lx\n", gpfn);
3429 if ( !get_shadow_ref(writable_pl1e >> L1_PAGETABLE_SHIFT) )
3430 BUG();
3433 static int get_shadow_mfn(struct domain *d, unsigned long gpfn, unsigned long *spmfn, u32 flag)
3435 unsigned long gmfn;
3436 if ( !(*spmfn = __shadow_status(d, gpfn, flag)) )
3438 /* This is NOT already shadowed so we need to shadow it. */
3439 SH_VVLOG("<get_shadow_mfn>: not shadowed");
3441 gmfn = gmfn_to_mfn(d, gpfn);
3442 if ( unlikely(!VALID_MFN(gmfn)) )
3444 // Attempt to use an invalid pfn as a shadow page.
3445 // XXX this needs to be more graceful!
3446 BUG();
3449 if ( unlikely(!(*spmfn =
3450 alloc_shadow_page(d, gpfn, gmfn, flag))) )
3452 printk("<get_shadow_mfn>Couldn't alloc a shadow for pfn=%lx mfn=%lx\n", gpfn, gmfn);
3453 BUG(); /* XXX Need to deal gracefully with failure. */
3455 switch(flag) {
3456 case PGT_l1_shadow:
3457 perfc_incrc(shadow_l1_table_count);
3458 break;
3459 case PGT_l2_shadow:
3460 perfc_incrc(shadow_l2_table_count);
3461 break;
3462 case PGT_l3_shadow:
3463 perfc_incrc(shadow_l3_table_count);
3464 break;
3465 case PGT_hl2_shadow:
3466 perfc_incrc(shadow_hl2_table_count);
3467 break;
3470 return 1;
3471 } else {
3472 /* This L1 is shadowed already, but the L2 entry is missing. */
3473 SH_VVLOG("4b: was shadowed, l2 missing (%lx)", *spmfn);
3474 return 0;
3478 static void shadow_map_into_current(struct vcpu *v,
3479 unsigned long va, unsigned int from, unsigned int to)
3481 pgentry_64_t gle = {0}, sle;
3482 unsigned long gpfn, smfn;
3484 if (from == PAGING_L1 && to == PAGING_L2) {
3485 shadow_map_l1_into_current_l2(va);
3486 return;
3489 __rw_entry(v, va, &gle, GUEST_ENTRY | GET_ENTRY | to);
3490 ASSERT(entry_get_flags(gle) & _PAGE_PRESENT);
3491 gpfn = entry_get_pfn(gle);
3493 get_shadow_mfn(v->domain, gpfn, &smfn, shadow_level_to_type(from));
3495 if ( !get_shadow_ref(smfn) )
3496 BUG();
3497 entry_general(v->domain, &gle, &sle, smfn, to);
3498 __rw_entry(v, va, &gle, GUEST_ENTRY | SET_ENTRY | to);
3499 __rw_entry(v, va, &sle, SHADOW_ENTRY | SET_ENTRY | to);
3502 /*
3503 * shadow_set_lxe should be put in shadow.h
3504 */
3505 static void shadow_set_l2e_64(unsigned long va, l2_pgentry_t sl2e,
3506 int create_l2_shadow, int put_ref_check)
3508 struct vcpu *v = current;
3509 l4_pgentry_t sl4e;
3510 l3_pgentry_t sl3e;
3512 __shadow_get_l4e(v, va, &sl4e);
3513 if (!(l4e_get_flags(sl4e) & _PAGE_PRESENT)) {
3514 if (create_l2_shadow) {
3515 perfc_incrc(shadow_set_l3e_force_map);
3516 shadow_map_into_current(v, va, PAGING_L3, PAGING_L4);
3517 __shadow_get_l4e(v, va, &sl4e);
3518 } else {
3519 printk("For non-HVM shadow, create_l2_shadow:%d\n", create_l2_shadow);
3523 __shadow_get_l3e(v, va, &sl3e);
3524 if (!(l3e_get_flags(sl3e) & _PAGE_PRESENT)) {
3525 if (create_l2_shadow) {
3526 perfc_incrc(shadow_set_l2e_force_map);
3527 shadow_map_into_current(v, va, PAGING_L2, PAGING_L3);
3528 __shadow_get_l3e(v, va, &sl3e);
3529 } else {
3530 printk("For non-HVM shadow, create_l2_shadow:%d\n", create_l2_shadow);
3533 if ( v->domain->arch.ops->guest_paging_levels == PAGING_L4 )
3534 shadow_update_min_max(l4e_get_pfn(sl4e), l3_table_offset(va));
3537 if ( put_ref_check ) {
3538 l2_pgentry_t tmp_sl2e;
3539 if ( __shadow_get_l2e(v, va, &tmp_sl2e) ) {
3540 if ( l2e_get_flags(tmp_sl2e) & _PAGE_PRESENT )
3541 if ( l2e_get_pfn(tmp_sl2e) == l2e_get_pfn(sl2e) ) {
3542 put_shadow_ref(l2e_get_pfn(sl2e));
3548 if (! __shadow_set_l2e(v, va, &sl2e))
3549 BUG();
3550 shadow_update_min_max(l3e_get_pfn(sl3e), l2_table_offset(va));
3554 /* As 32-bit guests don't support 4MB pages yet,
3555 * we don't bother compiling this function twice
3556 */
3557 static inline int l2e_rw_fault(
3558 struct vcpu *v, l2_pgentry_t *gl2e_p, unsigned long va, int rw)
3560 struct domain *d = v->domain;
3561 l2_pgentry_t gl2e = *gl2e_p;
3562 l2_pgentry_t tmp_l2e = gl2e;
3563 unsigned long start_gpfn = l2e_get_pfn(gl2e);
3564 unsigned long gpfn, mfn;
3565 unsigned long l1_mfn, gmfn;
3566 l1_pgentry_t *l1_p;
3567 l1_pgentry_t sl1e;
3568 l1_pgentry_t old_sl1e;
3569 l2_pgentry_t sl2e;
3570 #ifdef __x86_64__
3571 u64 nx = 0;
3572 #endif
3573 int put_ref_check = 0;
3574 /* Check if gpfn is 2M aligned */
3576 /* Update guest l2e */
3577 if (rw) {
3578 ASSERT(l2e_get_flags(gl2e) & _PAGE_RW);
3579 l2e_add_flags(gl2e, _PAGE_DIRTY | _PAGE_ACCESSED);
3580 } else {
3581 l2e_add_flags(gl2e, _PAGE_ACCESSED);
3584 l2e_remove_flags(tmp_l2e, _PAGE_PSE);
3585 if (l2e_get_flags(gl2e) & _PAGE_NX) {
3586 l2e_remove_flags(tmp_l2e, _PAGE_NX);
3587 #ifdef __x86_64__
3588 nx = PGT_high_mfn_nx;
3589 #endif
3593 /* Get the shadow l2 first */
3594 if ( !__shadow_get_l2e(v, va, &sl2e) )
3595 sl2e = l2e_empty();
3597 #ifdef __x86_64__
3598 l1_mfn = __shadow_status(d, start_gpfn | nx, PGT_fl1_shadow);
3599 #else
3600 l1_mfn = __shadow_status(d, start_gpfn, PGT_fl1_shadow);
3601 #endif
3603 /* Check the corresponding l2e */
3604 if (l1_mfn) {
3605 /* Why is it PRESENT? */
3606 if ((l2e_get_flags(sl2e) & _PAGE_PRESENT) &&
3607 l2e_get_pfn(sl2e) == l1_mfn) {
3608 ESH_LOG("sl2e PRESENT bit is set: %lx, l1_mfn = %lx\n", l2e_get_pfn(sl2e), l1_mfn);
3609 } else {
3610 put_ref_check = 1;
3611 if (!get_shadow_ref(l1_mfn))
3612 BUG();
3614 l1_p = (l1_pgentry_t *)map_domain_page(l1_mfn);
3615 sl2e = l2e_from_pfn(l1_mfn, l2e_get_flags(tmp_l2e));
3616 } else {
3617 /* Allocate a new page as a shadow page table if needed */
3618 gmfn = gmfn_to_mfn(d, start_gpfn);
3619 #ifdef __x86_64__
3620 l1_mfn = alloc_shadow_page(d, start_gpfn | nx, gmfn, PGT_fl1_shadow);
3621 #else
3622 l1_mfn = alloc_shadow_page(d, start_gpfn, gmfn, PGT_fl1_shadow);
3623 #endif
3624 if (unlikely(!l1_mfn)) {
3625 BUG();
3628 if (!get_shadow_ref(l1_mfn))
3629 BUG();
3630 l1_p = (l1_pgentry_t *)map_domain_page(l1_mfn );
3631 sl2e = l2e_from_pfn(l1_mfn, l2e_get_flags(tmp_l2e));
3632 memset(l1_p, 0, PAGE_SIZE);
3633 ESH_LOG("Alloc a shadow page: %lx\n", l1_mfn);
3636 ESH_LOG("<%s>: sl2e = %lx\n", __func__, l2e_get_intpte(sl2e));
3637 /* Map the page to l2*/
3638 shadow_set_l2e_64(va, sl2e, 1, put_ref_check);
3640 if (l2e_get_flags(gl2e) & _PAGE_NX)
3641 l2e_add_flags(tmp_l2e, _PAGE_NX);
3643 /* Propagate the shadow page table, i.e. setting sl1e */
3644 for (gpfn = start_gpfn;
3645 gpfn < (start_gpfn + L1_PAGETABLE_ENTRIES); gpfn++) {
3647 mfn = gmfn_to_mfn(d, gpfn);
3649 if ( unlikely(!VALID_MFN(mfn)) )
3651 continue;
3654 sl1e = l1e_from_pfn(mfn, l2e_get_flags(tmp_l2e));
3656 if (!rw) {
3657 if ( shadow_mode_log_dirty(d) ||
3658 !(l2e_get_flags(gl2e) & _PAGE_DIRTY) || mfn_is_page_table(mfn) )
3660 l1e_remove_flags(sl1e, _PAGE_RW);
3662 } else {
3663 /* __mark_dirty(d, gmfn); */
3665 // printk("<%s> gpfn: %lx, mfn: %lx, sl1e: %lx\n", __func__, gpfn, mfn, l1e_get_intpte(sl1e));
3666 /* The shadow entries need to be set up before shadow_mark_va_out_of_sync(). */
3667 old_sl1e = l1_p[gpfn - start_gpfn];
3669 if ( l1e_has_changed(old_sl1e, sl1e, _PAGE_RW | _PAGE_PRESENT) )
3671 if ( (l1e_get_flags(sl1e) & _PAGE_PRESENT) &&
3672 !shadow_get_page_from_l1e(sl1e, d) ) {
3673 ESH_LOG("%lx, mfn: %lx why make me empty, start_pfn: %lx, gpfn: %lx\n", l1e_get_intpte(sl1e),mfn, start_gpfn, gpfn);
3674 sl1e = l1e_empty();
3676 if ( l1e_get_flags(old_sl1e) & _PAGE_PRESENT )
3677 put_page_from_l1e(old_sl1e, d);
3680 if (rw) {
3681 /* shadow_mark_va_out_of_sync() needs modification for 2M pages. */
3682 if ( mfn_is_page_table(mfn) )
3683 shadow_mark_va_out_of_sync_2mp(v, gpfn, mfn,
3684 l2e_get_paddr(sl2e) | (sizeof(l1_pgentry_t) * (gpfn - start_gpfn)));
3687 l1_p[gpfn - start_gpfn] = sl1e;
3690 unmap_domain_page(l1_p);
3691 *gl2e_p = gl2e;
3692 return 1;
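/*
 * A minimal standalone sketch of the superpage splintering done by
 * l2e_rw_fault() above: one guest large mapping starting at start_gpfn is
 * expanded into 512 contiguous 4KB shadow L1 entries that inherit the guest
 * L2 flags (with PSE already stripped).  pfn_to_mfn is a hypothetical
 * callback standing in for gmfn_to_mfn(); sl1 is assumed to be a 512-slot
 * shadow L1 page.
 */
#include <stdint.h>

#define SK_L1_ENTRIES 512

static void sk_splinter_superpage(uint64_t *sl1, unsigned long start_gpfn,
                                  uint64_t flags,
                                  unsigned long (*pfn_to_mfn)(unsigned long))
{
    unsigned long gpfn;

    for ( gpfn = start_gpfn; gpfn < start_gpfn + SK_L1_ENTRIES; gpfn++ )
    {
        unsigned long mfn = pfn_to_mfn(gpfn);
        /* machine frame in the high bits, inherited flags in the low bits */
        sl1[gpfn - start_gpfn] = ((uint64_t)mfn << 12) | flags;
    }
}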
3695 /*
3696 * Check P, R/W, U/S bits in the guest page table.
3697 * If the fault belongs to guest return 1,
3698 * else return 0.
3699 */
3700 #if defined( GUEST_PGENTRY_32 )
3701 static inline int guest_page_fault(
3702 struct vcpu *v,
3703 unsigned long va, unsigned int error_code,
3704 guest_l2_pgentry_t *gpl2e, guest_l1_pgentry_t *gpl1e)
3706 /* The following checks are for a 32-bit guest on a 64-bit host */
3708 __guest_get_l2e(v, va, gpl2e);
3710 /* Check the guest L2 page-table entry first*/
3711 if ( unlikely(!(guest_l2e_get_flags(*gpl2e) & _PAGE_PRESENT)) )
3712 return 1;
3714 if ( error_code & ERROR_W )
3716 if ( unlikely(!(guest_l2e_get_flags(*gpl2e) & _PAGE_RW)) )
3717 return 1;
3720 if ( error_code & ERROR_U )
3722 if ( unlikely(!(guest_l2e_get_flags(*gpl2e) & _PAGE_USER)) )
3723 return 1;
3726 if ( guest_l2e_get_flags(*gpl2e) & _PAGE_PSE )
3728 printk("Non-PAE HVM guests can NOT use PSE, "
3729 "because we don't support 4MB PSE pages.\n");
3730 printk("remove pae=1 from your config file.\n");
3731 domain_crash_synchronous();
3732 return 0;
3735 __guest_get_l1e(v, va, gpl1e);
3737 /* Then check the guest L1 page-table entry */
3738 if ( unlikely(!(guest_l1e_get_flags(*gpl1e) & _PAGE_PRESENT)) )
3739 return 1;
3741 if ( error_code & ERROR_W )
3743 if ( unlikely(!(guest_l1e_get_flags(*gpl1e) & _PAGE_RW)) )
3744 return 1;
3747 if ( error_code & ERROR_U )
3749 if ( unlikely(!(guest_l1e_get_flags(*gpl1e) & _PAGE_USER)) )
3750 return 1;
3753 return 0;
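/*
 * A minimal standalone sketch of the error-code checks above, assuming the
 * usual x86 #PF error-code encoding (bit 0 = present, bit 1 = write,
 * bit 2 = user) and flat PTE flag bits (bit 0 = P, bit 1 = R/W,
 * bit 2 = U/S).  It mirrors the per-level test: return 1 if the fault
 * belongs to the guest, 0 if the shadow code should handle it.
 */
#include <stdint.h>

#define SK_ERR_W   (1u << 1)
#define SK_ERR_U   (1u << 2)
#define SK_PTE_P   (UINT64_C(1) << 0)
#define SK_PTE_RW  (UINT64_C(1) << 1)
#define SK_PTE_US  (UINT64_C(1) << 2)

static int sk_fault_belongs_to_guest(uint64_t gpte, unsigned int error_code)
{
    if ( !(gpte & SK_PTE_P) )                              /* not present    */
        return 1;
    if ( (error_code & SK_ERR_W) && !(gpte & SK_PTE_RW) )  /* write to RO    */
        return 1;
    if ( (error_code & SK_ERR_U) && !(gpte & SK_PTE_US) )  /* user to kernel */
        return 1;
    return 0;
}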
3755 #else
3756 static inline int guest_page_fault(
3757 struct vcpu *v,
3758 unsigned long va, unsigned int error_code,
3759 guest_l2_pgentry_t *gpl2e, guest_l1_pgentry_t *gpl1e)
3761 struct domain *d = v->domain;
3762 pgentry_64_t gle = { 0 };
3763 unsigned long gpfn = 0, mfn;
3764 int i;
3765 unsigned int base_idx = 0;
3766 base_idx = get_cr3_idxval(v);
3768 ASSERT( d->arch.ops->guest_paging_levels >= PAGING_L3 );
3770 #if CONFIG_PAGING_LEVELS >= 3
3771 if ( (error_code & (ERROR_I | ERROR_P)) == (ERROR_I | ERROR_P) )
3772 return 1;
3773 #endif
3775 #if CONFIG_PAGING_LEVELS == 4
3776 if ( d->arch.ops->guest_paging_levels == PAGING_L4 )
3778 __rw_entry(v, va, &gle, GUEST_ENTRY | GET_ENTRY | PAGING_L4);
3779 if ( unlikely(!(entry_get_flags(gle) & _PAGE_PRESENT)) )
3780 return 1;
3782 if ( error_code & ERROR_W )
3784 if ( unlikely(!(entry_get_flags(gle) & _PAGE_RW)) )
3785 return 1;
3788 if ( error_code & ERROR_U )
3790 if ( unlikely(!(entry_get_flags(gle) & _PAGE_USER)) )
3791 return 1;
3793 gpfn = entry_get_pfn(gle);
3795 #endif
3797 #if CONFIG_PAGING_LEVELS >= 3
3798 if ( d->arch.ops->guest_paging_levels == PAGING_L3 )
3800 if ( SH_GUEST_32PAE )
3801 gpfn = (hvm_get_guest_ctrl_reg(v, 3)) >> PAGE_SHIFT;
3802 else
3803 gpfn = pagetable_get_pfn(v->arch.guest_table);
3805 #endif
3807 for ( i = PAGING_L3; i >= PAGING_L1; i-- )
3809 pgentry_64_t *lva;
3810 /*
3811 * If it's not external mode, then mfn should be machine physical.
3812 */
3813 mfn = gmfn_to_mfn(d, gpfn);
3815 lva = (pgentry_64_t *) map_domain_page(mfn);
3816 gle = lva[guest_table_offset_64(va, i, base_idx)];
3818 unmap_domain_page(lva);
3820 gpfn = entry_get_pfn(gle);
3822 if ( unlikely(!(entry_get_flags(gle) & _PAGE_PRESENT)) )
3823 return 1;
3825 if ( i < PAGING_L3 ||
3826 d->arch.ops->guest_paging_levels == PAGING_L4 )
3828 if ( error_code & ERROR_W )
3830 if ( unlikely(!(entry_get_flags(gle) & _PAGE_RW)) )
3832 if ( i == PAGING_L1 )
3833 if ( gpl1e )
3834 gpl1e->l1 = gle.lo;
3835 return 1;
3838 if ( error_code & ERROR_U )
3840 if ( unlikely(!(entry_get_flags(gle) & _PAGE_USER)) )
3841 return 1;
3845 if ( i == PAGING_L2 )
3847 if ( gpl2e )
3848 gpl2e->l2 = gle.lo;
3849 if ( likely(entry_get_flags(gle) & _PAGE_PSE) )
3850 return 0;
3853 if ( i == PAGING_L1 )
3854 if ( gpl1e )
3855 gpl1e->l1 = gle.lo;
3858 return 0;
3861 #endif
3863 static int shadow_fault_64(unsigned long va, struct cpu_user_regs *regs)
3865 struct vcpu *v = current;
3866 struct domain *d = v->domain;
3867 guest_l2_pgentry_t gl2e;
3868 guest_l1_pgentry_t gl1e, orig_gl1e;
3869 l1_pgentry_t sl1e;
3871 gl1e = guest_l1e_empty(); gl2e = guest_l2e_empty();
3873 sl1e = l1e_empty();
3875 perfc_incrc(shadow_fault_calls);
3877 ESH_LOG("<shadow_fault_64> va=%lx, rip = %lx, error code = %x\n",
3878 va, regs->eip, regs->error_code);
3880 /*
3881 * Don't let someone else take the guest's table pages out-of-sync.
3882 */
3883 shadow_lock(d);
3885 /*
3886 * STEP 1. Check to see if this fault might have been caused by an
3887 * out-of-sync table page entry, or if we should pass this
3888 * fault onto the guest.
3889 */
3890 __shadow_sync_va(v, va);
3892 /*
3893 * STEP 2. Check if the fault belongs to guest
3894 */
3895 if ( guest_page_fault(v, va, regs->error_code, &gl2e, &gl1e) )
3897 if ( unlikely(shadow_mode_log_dirty(d)) && l1e_get_intpte(gl1e) != 0 )
3898 goto check_writeable;
3900 goto fail;
3903 if ( unlikely((guest_l2e_get_flags(gl2e) & _PAGE_PSE)) )
3904 goto pse;
3906 /*
3907 * Handle 4K pages here
3908 */
3909 check_writeable:
3910 orig_gl1e = gl1e;
3912 /* Write fault? */
3913 if ( regs->error_code & 2 )
3915 int allow_writes = 0;
3917 if ( unlikely(!(guest_l1e_get_flags(gl1e) & _PAGE_RW)) )
3919 if ( shadow_mode_page_writable(va, regs, l1e_get_pfn(gl1e)) )
3921 allow_writes = 1;
3922 l1e_add_flags(gl1e, _PAGE_RW);
3924 else
3926 /* Write fault on a read-only mapping. */
3927 SH_VVLOG("shadow_fault - EXIT: wr fault on RO page (%" PRIpte ")",
3928 l1e_get_intpte(gl1e));
3929 perfc_incrc(shadow_fault_bail_ro_mapping);
3930 goto fail;
3934 if ( !l1pte_write_fault(v, &gl1e, &sl1e, va) )
3936 SH_VVLOG("shadow_fault - EXIT: l1pte_write_fault failed");
3937 perfc_incrc(write_fault_bail);
3938 shadow_unlock(d);
3939 return 0;
3942 if (allow_writes)
3943 l1e_remove_flags(gl1e, _PAGE_RW);
3945 else
3947 if ( !l1pte_read_fault(d, &gl1e, &sl1e) )
3949 SH_VVLOG("shadow_fault - EXIT: l1pte_read_fault failed");
3950 perfc_incrc(read_fault_bail);
3951 shadow_unlock(d);
3952 return 0;
3956 /*
3957 * STEP 3. Write the modified shadow PTE and guest PTE back to the tables
3958 */
3959 if ( l1e_has_changed(orig_gl1e, gl1e, PAGE_FLAG_MASK) )
3961 if (unlikely(!__guest_set_l1e(v, va, &gl1e)))
3962 domain_crash_synchronous();
3964 __mark_dirty(d, gmfn_to_mfn(d, l2e_get_pfn(gl2e)));
3967 shadow_set_l1e_64(va, (pgentry_64_t *)&sl1e, 1);
3969 perfc_incrc(shadow_fault_fixed);
3970 d->arch.shadow_fault_count++;
3972 shadow_unlock(d);
3974 return EXCRET_fault_fixed;
3976 pse:
3977 /*
3978 * Handle 2M pages here
3979 */
3980 if ( unlikely(!shadow_mode_external(d)) )
3981 BUG();
3983 /* Write fault? */
3984 if ( regs->error_code & 2 )
3986 if ( !l2e_rw_fault(v, (l2_pgentry_t *)&gl2e, va, WRITE_FAULT) )
3988 goto fail;
3991 else
3993 l2e_rw_fault(v, (l2_pgentry_t *)&gl2e, va, READ_FAULT);
3996 /*
3997 * STEP 3. Write guest/shadow l2e back
3998 */
4000 if ( unlikely(!__guest_set_l2e(v, va, &gl2e)) )
4002 domain_crash_synchronous();
4005 /*
4006 * Todo: if necessary, record the page table page as dirty
4007 */
4009 perfc_incrc(shadow_fault_fixed);
4010 d->arch.shadow_fault_count++;
4012 shadow_unlock(d);
4014 return EXCRET_fault_fixed;
4015 fail:
4016 shadow_unlock(d);
4017 ESH_LOG("Guest fault~~~\n");
4018 return 0;
4021 static void shadow_invlpg_64(struct vcpu *v, unsigned long va)
4023 struct domain *d = v->domain;
4024 l1_pgentry_t sl1e, old_sl1e;
4026 shadow_lock(d);
4028 __shadow_sync_va(v, va);
4030 if ( shadow_mode_external(d) && __shadow_get_l1e(v, va, &old_sl1e) )
4031 if ( l1e_get_flags(old_sl1e) & _PAGE_PRESENT )
4032 put_page_from_l1e(old_sl1e, d);
4034 sl1e = l1e_empty();
4035 __shadow_set_l1e(v, va, &sl1e);
4037 shadow_unlock(d);
4040 static unsigned long gva_to_gpa_64(unsigned long gva)
4042 struct vcpu *v = current;
4043 guest_l1_pgentry_t gl1e = {0};
4044 guest_l2_pgentry_t gl2e = {0};
4045 unsigned long gpa;
4047 if (guest_page_fault(v, gva, 0, &gl2e, &gl1e))
4048 return 0;
4050 if (guest_l2e_get_flags(gl2e) & _PAGE_PSE)
4051 gpa = guest_l2e_get_paddr(gl2e) + (gva & ((1 << GUEST_L2_PAGETABLE_SHIFT) - 1));
4052 else
4053 gpa = guest_l1e_get_paddr(gl1e) + (gva & ~PAGE_MASK);
4055 return gpa;
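/*
 * A minimal standalone sketch of the address composition above, assuming the
 * usual shifts for this path (2MB L2 pages, 4KB L1 pages): for a PSE mapping
 * the guest physical address is the large-page frame base plus the low 21
 * bits of the virtual address, otherwise the 4KB frame base plus the low 12
 * bits.
 */
#include <stdint.h>

#define SK_L2_SHIFT 21   /* 2MB pages */
#define SK_L1_SHIFT 12   /* 4KB pages */

static uint64_t sk_compose_gpa(uint64_t frame_base, uint64_t gva, int is_pse)
{
    if ( is_pse )
        return frame_base + (gva & ((UINT64_C(1) << SK_L2_SHIFT) - 1));
    return frame_base + (gva & ((UINT64_C(1) << SK_L1_SHIFT) - 1));
}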
4058 /*
4059 * The naming convention of the shadow_ops:
4060 * MODE_<pgentry size>_<guest paging levels>_HANDLER
4061 */
4062 #if (!defined(GUEST_PGENTRY_32) && !defined(GUEST_32PAE))
4063 struct shadow_ops MODE_64_3_HANDLER = {
4064 .guest_paging_levels = 3,
4065 .invlpg = shadow_invlpg_64,
4066 .fault = shadow_fault_64,
4067 .update_pagetables = shadow_update_pagetables,
4068 .sync_all = sync_all,
4069 .remove_all_write_access = remove_all_write_access,
4070 .do_update_va_mapping = do_update_va_mapping,
4071 .mark_mfn_out_of_sync = mark_mfn_out_of_sync,
4072 .is_out_of_sync = is_out_of_sync,
4073 .gva_to_gpa = gva_to_gpa_pae,
4074 };
4076 struct shadow_ops MODE_64_4_HANDLER = {
4077 .guest_paging_levels = 4,
4078 .invlpg = shadow_invlpg_64,
4079 .fault = shadow_fault_64,
4080 .update_pagetables = shadow_update_pagetables,
4081 .sync_all = sync_all,
4082 .remove_all_write_access = remove_all_write_access,
4083 .do_update_va_mapping = do_update_va_mapping,
4084 .mark_mfn_out_of_sync = mark_mfn_out_of_sync,
4085 .is_out_of_sync = is_out_of_sync,
4086 .gva_to_gpa = gva_to_gpa_64,
4087 };
4088 #endif /* !GUEST_PGENTRY_32 && !GUEST_32PAE */
4089 #endif /* CONFIG_PAGING_LEVELS >= 3 */
4092 #if CONFIG_PAGING_LEVELS == 2
4093 struct shadow_ops MODE_32_2_HANDLER = {
4094 .guest_paging_levels = 2,
4095 .invlpg = shadow_invlpg_32,
4096 .fault = shadow_fault_32,
4097 .update_pagetables = shadow_update_pagetables,
4098 .sync_all = sync_all,
4099 .remove_all_write_access = remove_all_write_access,
4100 .do_update_va_mapping = do_update_va_mapping,
4101 .mark_mfn_out_of_sync = mark_mfn_out_of_sync,
4102 .is_out_of_sync = is_out_of_sync,
4103 .gva_to_gpa = gva_to_gpa_64,
4104 };
4105 #endif
4107 #if ( CONFIG_PAGING_LEVELS == 3 && !defined (GUEST_PGENTRY_32) && !defined (GUEST_32PAE) ) || \
4108 ( CONFIG_PAGING_LEVELS == 4 && defined (GUEST_PGENTRY_32) )
4111 /*
4112 * Use GUEST_PGENTRY_32 to force PAE_SHADOW_SELF_ENTRY for L4.
4114 * Very simple shadow code to handle the 1:1 direct mapping for guest
4115 * non-paging code, which is actually running in PAE/vm86 mode with
4116 * paging enabled.
4118 * We expect that the top level (L3) page has been allocated and initialized.
4119 */
4120 int shadow_direct_map_fault(unsigned long vpa, struct cpu_user_regs *regs)
4122 struct vcpu *v = current;
4123 struct domain *d = v->domain;
4124 l3_pgentry_t sl3e, *sl3e_p;
4125 l2_pgentry_t sl2e, *sl2e_p;
4126 l1_pgentry_t sl1e;
4127 unsigned long mfn, smfn;
4128 struct page_info *page;
4130 /*
4131 * If the faulting address is within the MMIO range, we continue
4132 * on handling the #PF as such.
4133 */
4134 if ( (mfn = get_mfn_from_gpfn(vpa >> PAGE_SHIFT)) == INVALID_MFN )
4135 return 0;
4137 shadow_lock(d);
4139 __direct_get_l3e(v, vpa, &sl3e);
4141 if ( !(l3e_get_flags(sl3e) & _PAGE_PRESENT) )
4143 page = alloc_domheap_page(NULL);
4144 if ( !page )
4145 goto nomem;
4147 smfn = page_to_mfn(page);
4148 sl3e = l3e_from_pfn(smfn, _PAGE_PRESENT);
4150 sl3e_p = (l3_pgentry_t *)map_domain_page(smfn);
4151 memset(sl3e_p, 0, PAGE_SIZE);
4152 unmap_domain_page(sl3e_p);
4154 __direct_set_l3e(v, vpa, &sl3e);
4157 __direct_get_l2e(v, vpa, &sl2e);
4159 if ( !(l2e_get_flags(sl2e) & _PAGE_PRESENT) )
4161 page = alloc_domheap_page(NULL);
4162 if ( !page )
4163 goto nomem;
4165 smfn = page_to_mfn(page);
4166 sl2e = l2e_from_pfn(smfn, __PAGE_HYPERVISOR | _PAGE_USER);
4167 sl2e_p = (l2_pgentry_t *)map_domain_page(smfn);
4168 memset(sl2e_p, 0, PAGE_SIZE);
4169 unmap_domain_page(sl2e_p);
4171 __direct_set_l2e(v, vpa, &sl2e);
4174 __direct_get_l1e(v, vpa, &sl1e);
4176 if ( !(l1e_get_flags(sl1e) & _PAGE_PRESENT) )
4178 sl1e = l1e_from_pfn(mfn, __PAGE_HYPERVISOR | _PAGE_USER);
4179 __direct_set_l1e(v, vpa, &sl1e);
4182 shadow_unlock(d);
4183 return EXCRET_fault_fixed;
4185 nomem:
4186 shadow_direct_map_clean(d);
4187 domain_crash_synchronous();
4189 #endif
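/*
 * A minimal standalone sketch of the demand-built 1:1 map above: walk a
 * three-level table for the faulting frame number, allocate any missing
 * intermediate table, and install an identity leaf entry.  Plain calloc'd
 * pointer tables stand in for the real shadow pages, and 512-entry levels
 * (9 bits each) are assumed as in PAE/long mode.  Repeated faults reuse the
 * already-built levels, matching the __direct_get/__direct_set pattern above.
 */
#include <stdlib.h>
#include <stdint.h>

#define SK_ENTRIES 512

static int sk_direct_map_fault(void ***l3, unsigned long pfn)
{
    unsigned long l3_idx = (pfn >> 18) % SK_ENTRIES;
    unsigned long l2_idx = (pfn >> 9)  % SK_ENTRIES;
    unsigned long l1_idx =  pfn        % SK_ENTRIES;
    void **l2;
    uintptr_t *l1;

    if ( l3[l3_idx] == NULL &&
         (l3[l3_idx] = calloc(SK_ENTRIES, sizeof(void *))) == NULL )
        return -1;                               /* the "nomem" path        */
    l2 = l3[l3_idx];

    if ( l2[l2_idx] == NULL &&
         (l2[l2_idx] = calloc(SK_ENTRIES, sizeof(uintptr_t))) == NULL )
        return -1;
    l1 = l2[l2_idx];

    l1[l1_idx] = ((uintptr_t)pfn << 12) | 1;     /* identity leaf, present  */
    return 0;
}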
4191 /*
4192 * Local variables:
4193 * mode: C
4194 * c-set-style: "BSD"
4195 * c-basic-offset: 4
4196 * tab-width: 4
4197 * indent-tabs-mode: nil
4198 * End:
4199 */