ia64/xen-unstable

view xen/arch/x86/shadow_public.c @ 9842:57e7b96139e7

This is a small fix for SMP PAE guests on x86-64.
Part of the VMX guest initialization runs in the context of dom0, so
when we want to guarantee that shadow ops are always performed by
vcpu0 of the guest VMX domain, we must also check that the current
vcpu is actually running in the guest domain and not in dom0.
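
In effect, the check at the top of shadow_set_guest_paging_levels() (see
the listing below) reads as follows; the added comment spells out the
intent:

    struct vcpu *v = current;

    /*
     * Only defer to VCPU0 when 'current' is a non-boot vcpu of the guest
     * domain 'd' itself.  If the call arrives in dom0's context (as it can
     * during VMX guest setup), fall through and perform the shadow op.
     */
    if ( v->domain == d && v->vcpu_id )
        return 1;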

Signed-off-by: Xiaohui Xin <xiaohui.xin@intel.com>
Signed-off-by: Xin Li <xin.b.li@intel.com>
author kaf24@firebug.cl.cam.ac.uk
date Tue Apr 25 08:56:01 2006 +0100 (2006-04-25)
parents 72f9c751d3ea
children 3c2e7925bb93
line source
1 /******************************************************************************
2 * arch/x86/shadow_public.c
3 *
4 * Copyright (c) 2005 Michael A Fetterman
5 * Based on an earlier implementation by Ian Pratt et al
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
23 #include <xen/config.h>
24 #include <xen/types.h>
25 #include <xen/mm.h>
26 #include <xen/domain_page.h>
27 #include <asm/shadow.h>
28 #include <asm/page.h>
29 #include <xen/event.h>
30 #include <xen/sched.h>
31 #include <xen/trace.h>
32 #include <xen/guest_access.h>
33 #include <asm/shadow_64.h>
35 static int alloc_p2m_table(struct domain *d);
36 static void free_p2m_table(struct domain *d);
38 #define SHADOW_MAX_GUEST32(_encoded) ((L1_PAGETABLE_ENTRIES_32 - 1) - ((_encoded) >> 16))
41 int shadow_direct_map_init(struct domain *d)
42 {
43 struct page_info *page;
44 l3_pgentry_t *root;
46 if ( !(page = alloc_domheap_pages(NULL, 0, ALLOC_DOM_DMA)) )
47 return 0;
49 root = map_domain_page(page_to_mfn(page));
50 memset(root, 0, PAGE_SIZE);
51 root[PAE_SHADOW_SELF_ENTRY] = l3e_from_page(page, __PAGE_HYPERVISOR);
53 d->arch.phys_table = mk_pagetable(page_to_maddr(page));
55 unmap_domain_page(root);
56 return 1;
57 }
59 void shadow_direct_map_clean(struct domain *d)
60 {
61 unsigned long mfn;
62 l2_pgentry_t *l2e;
63 l3_pgentry_t *l3e;
64 int i, j;
66 mfn = pagetable_get_pfn(d->arch.phys_table);
68 /*
69 * We may fail very early before direct map is built.
70 */
71 if ( !mfn )
72 return;
74 l3e = (l3_pgentry_t *)map_domain_page(mfn);
76 for ( i = 0; i < PAE_L3_PAGETABLE_ENTRIES; i++ )
77 {
78 if ( l3e_get_flags(l3e[i]) & _PAGE_PRESENT )
79 {
80 l2e = map_domain_page(l3e_get_pfn(l3e[i]));
82 for ( j = 0; j < L2_PAGETABLE_ENTRIES; j++ )
83 {
84 if ( l2e_get_flags(l2e[j]) & _PAGE_PRESENT )
85 free_domheap_page(mfn_to_page(l2e_get_pfn(l2e[j])));
86 }
87 unmap_domain_page(l2e);
88 free_domheap_page(mfn_to_page(l3e_get_pfn(l3e[i])));
89 }
90 }
91 free_domheap_page(mfn_to_page(mfn));
93 unmap_domain_page(l3e);
95 d->arch.phys_table = mk_pagetable(0);
96 }
98 /****************************************************************************/
99 /************* export interface functions ***********************************/
100 /****************************************************************************/
101 void free_shadow_pages(struct domain *d);
103 int shadow_set_guest_paging_levels(struct domain *d, int levels)
104 {
105 struct vcpu *v = current;
107 /*
108 * Need to wait for VCPU0 to complete the on-going shadow ops.
109 */
111 if ( v->domain == d && v->vcpu_id )
112 return 1;
114 shadow_lock(d);
116 switch(levels) {
117 #if CONFIG_PAGING_LEVELS == 4
118 case 4:
119 if ( d->arch.ops != &MODE_64_4_HANDLER )
120 d->arch.ops = &MODE_64_4_HANDLER;
121 shadow_unlock(d);
122 return 1;
123 #endif
124 #if CONFIG_PAGING_LEVELS == 3
125 case 3:
126 if ( d->arch.ops != &MODE_64_3_HANDLER )
127 d->arch.ops = &MODE_64_3_HANDLER;
128 shadow_unlock(d);
129 return 1;
130 #endif
131 #if CONFIG_PAGING_LEVELS == 4
132 case 3:
133 if ( d->arch.ops == &MODE_64_2_HANDLER )
134 free_shadow_pages(d);
135 if ( d->arch.ops != &MODE_64_PAE_HANDLER )
136 d->arch.ops = &MODE_64_PAE_HANDLER;
137 shadow_unlock(d);
138 return 1;
139 #endif
140 case 2:
141 #if CONFIG_PAGING_LEVELS == 2
142 if ( d->arch.ops != &MODE_32_2_HANDLER )
143 d->arch.ops = &MODE_32_2_HANDLER;
144 #elif CONFIG_PAGING_LEVELS >= 3
145 if ( d->arch.ops != &MODE_64_2_HANDLER )
146 d->arch.ops = &MODE_64_2_HANDLER;
147 #endif
148 shadow_unlock(d);
149 return 1;
150 default:
151 shadow_unlock(d);
152 return 0;
153 }
154 }
156 void shadow_invlpg(struct vcpu *v, unsigned long va)
157 {
158 struct domain *d = current->domain;
159 d->arch.ops->invlpg(v, va);
160 }
162 int shadow_fault(unsigned long va, struct cpu_user_regs *regs)
163 {
164 struct domain *d = current->domain;
165 return d->arch.ops->fault(va, regs);
166 }
168 void __update_pagetables(struct vcpu *v)
169 {
170 struct domain *d = v->domain;
171 d->arch.ops->update_pagetables(v);
172 }
174 void __shadow_sync_all(struct domain *d)
175 {
176 d->arch.ops->sync_all(d);
177 }
179 int shadow_remove_all_write_access(
180 struct domain *d, unsigned long readonly_gpfn, unsigned long readonly_gmfn)
181 {
182 return d->arch.ops->remove_all_write_access(d, readonly_gpfn, readonly_gmfn);
183 }
185 int shadow_do_update_va_mapping(unsigned long va,
186 l1_pgentry_t val,
187 struct vcpu *v)
188 {
189 struct domain *d = v->domain;
190 return d->arch.ops->do_update_va_mapping(va, val, v);
191 }
193 struct out_of_sync_entry *
194 shadow_mark_mfn_out_of_sync(struct vcpu *v, unsigned long gpfn,
195 unsigned long mfn)
196 {
197 struct domain *d = v->domain;
198 return d->arch.ops->mark_mfn_out_of_sync(v, gpfn, mfn);
199 }
201 /*
202 * Returns 1 if va's shadow mapping is out-of-sync.
203 * Returns 0 otherwise.
204 */
205 int __shadow_out_of_sync(struct vcpu *v, unsigned long va)
206 {
207 struct domain *d = v->domain;
208 return d->arch.ops->is_out_of_sync(v, va);
209 }
211 unsigned long gva_to_gpa(unsigned long gva)
212 {
213 struct domain *d = current->domain;
214 return d->arch.ops->gva_to_gpa(gva);
215 }
216 /****************************************************************************/
217 /****************************************************************************/
218 #if CONFIG_PAGING_LEVELS >= 3
220 static void inline
221 free_shadow_fl1_table(struct domain *d, unsigned long smfn)
222 {
223 l1_pgentry_t *pl1e = map_domain_page(smfn);
224 int i;
226 for (i = 0; i < L1_PAGETABLE_ENTRIES; i++)
227 put_page_from_l1e(pl1e[i], d);
229 unmap_domain_page(pl1e);
230 }
232 /*
233 * Free l2, l3, l4 shadow tables
234 */
236 void free_fake_shadow_l2(struct domain *d,unsigned long smfn);
238 static void inline
239 free_shadow_tables(struct domain *d, unsigned long smfn, u32 level)
240 {
241 pgentry_64_t *ple = map_domain_page(smfn);
242 int i, external = shadow_mode_external(d);
244 #if CONFIG_PAGING_LEVELS >= 3
245 if ( d->arch.ops->guest_paging_levels == PAGING_L2 )
246 {
247 struct page_info *page = mfn_to_page(smfn);
248 for ( i = 0; i < PAE_L3_PAGETABLE_ENTRIES; i++ )
249 {
250 if ( entry_get_flags(ple[i]) & _PAGE_PRESENT )
251 free_fake_shadow_l2(d, entry_get_pfn(ple[i]));
252 }
254 page = mfn_to_page(entry_get_pfn(ple[0]));
255 free_domheap_pages(page, SL2_ORDER);
256 unmap_domain_page(ple);
257 }
258 else
259 #endif
260 {
261 /*
262 * No Xen mappings in external pages
263 */
264 if ( external )
265 {
266 for ( i = 0; i < PAGETABLE_ENTRIES; i++ ) {
267 if ( entry_get_flags(ple[i]) & _PAGE_PRESENT )
268 put_shadow_ref(entry_get_pfn(ple[i]));
269 if (d->arch.ops->guest_paging_levels == PAGING_L3)
270 {
271 #if CONFIG_PAGING_LEVELS == 4
272 if ( i == PAE_L3_PAGETABLE_ENTRIES && level == PAGING_L4 )
273 #elif CONFIG_PAGING_LEVELS == 3
274 if ( i == PAE_L3_PAGETABLE_ENTRIES && level == PAGING_L3 )
275 #endif
276 break;
277 }
278 }
279 }
280 else
281 {
282 for ( i = 0; i < PAGETABLE_ENTRIES; i++ )
283 {
284 /*
285 * List the skip/break conditions to avoid freeing
286 * Xen private mappings.
287 */
288 #if CONFIG_PAGING_LEVELS == 2
289 if ( level == PAGING_L2 && !is_guest_l2_slot(0, i) )
290 continue;
291 #endif
292 #if CONFIG_PAGING_LEVELS == 3
293 if ( level == PAGING_L3 && i == L3_PAGETABLE_ENTRIES )
294 break;
295 if ( level == PAGING_L2 )
296 {
297 struct page_info *page = mfn_to_page(smfn);
298 if ( is_xen_l2_slot(page->u.inuse.type_info, i) )
299 continue;
300 }
301 #endif
302 #if CONFIG_PAGING_LEVELS == 4
303 if ( level == PAGING_L4 && !is_guest_l4_slot(i))
304 continue;
305 #endif
306 if ( entry_get_flags(ple[i]) & _PAGE_PRESENT )
307 put_shadow_ref(entry_get_pfn(ple[i]));
308 }
309 }
310 unmap_domain_page(ple);
311 }
312 }
313 #endif
315 #if CONFIG_PAGING_LEVELS == 4
316 static void alloc_monitor_pagetable(struct vcpu *v)
317 {
318 unsigned long mmfn;
319 l4_pgentry_t *mpl4e;
320 struct page_info *mmfn_info;
321 struct domain *d = v->domain;
323 ASSERT(!pagetable_get_paddr(v->arch.monitor_table)); /* we should only get called once */
325 mmfn_info = alloc_domheap_page(NULL);
326 ASSERT( mmfn_info );
328 mmfn = page_to_mfn(mmfn_info);
329 mpl4e = (l4_pgentry_t *) map_domain_page_global(mmfn);
330 memcpy(mpl4e, idle_pg_table, PAGE_SIZE);
331 mpl4e[l4_table_offset(PERDOMAIN_VIRT_START)] =
332 l4e_from_paddr(__pa(d->arch.mm_perdomain_l3), __PAGE_HYPERVISOR);
334 /* map the phys_to_machine map into the per domain Read-Only MPT space */
336 v->arch.monitor_table = mk_pagetable(mmfn << PAGE_SHIFT);
337 v->arch.monitor_vtable = (l2_pgentry_t *) mpl4e;
338 mpl4e[l4_table_offset(RO_MPT_VIRT_START)] = l4e_empty();
340 if ( v->vcpu_id == 0 )
341 alloc_p2m_table(d);
342 else
343 {
344 unsigned long mfn;
346 mfn = pagetable_get_pfn(d->vcpu[0]->arch.monitor_table);
347 if ( mfn )
348 {
349 l4_pgentry_t *l4tab;
351 l4tab = map_domain_page(mfn);
353 mpl4e[l4_table_offset(RO_MPT_VIRT_START)] =
354 l4tab[l4_table_offset(RO_MPT_VIRT_START)];
356 unmap_domain_page(l4tab);
357 }
358 }
359 }
361 void free_monitor_pagetable(struct vcpu *v)
362 {
363 unsigned long mfn;
365 /*
366 * First free the p2m table (only VCPU0 allocated it).
367 */
368 if ( v->vcpu_id == 0 )
369 free_p2m_table(v->domain);
371 /*
372 * Then free monitor_table.
373 */
374 mfn = pagetable_get_pfn(v->arch.monitor_table);
375 unmap_domain_page_global(v->arch.monitor_vtable);
376 free_domheap_page(mfn_to_page(mfn));
378 v->arch.monitor_table = mk_pagetable(0);
379 v->arch.monitor_vtable = 0;
380 }
381 #elif CONFIG_PAGING_LEVELS == 3
382 static void alloc_monitor_pagetable(struct vcpu *v)
383 {
384 unsigned long m2mfn, m3mfn;
385 l2_pgentry_t *mpl2e;
386 l3_pgentry_t *mpl3e;
387 struct page_info *m2mfn_info, *m3mfn_info, *page;
388 struct domain *d = v->domain;
389 int i;
391 ASSERT(!pagetable_get_paddr(v->arch.monitor_table)); /* we should only get called once */
393 m3mfn_info = alloc_domheap_pages(NULL, 0, ALLOC_DOM_DMA);
394 ASSERT( m3mfn_info );
396 m3mfn = page_to_mfn(m3mfn_info);
397 mpl3e = (l3_pgentry_t *) map_domain_page_global(m3mfn);
398 memset(mpl3e, 0, L3_PAGETABLE_ENTRIES * sizeof(l3_pgentry_t));
400 m2mfn_info = alloc_domheap_page(NULL);
401 ASSERT( m2mfn_info );
403 m2mfn = page_to_mfn(m2mfn_info);
404 mpl2e = (l2_pgentry_t *) map_domain_page(m2mfn);
405 memset(mpl2e, 0, L2_PAGETABLE_ENTRIES * sizeof(l2_pgentry_t));
407 memcpy(&mpl2e[L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1)],
408 &idle_pg_table_l2[L2_PAGETABLE_FIRST_XEN_SLOT],
409 L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
410 /*
411 * Map L2 page into L3
412 */
413 mpl3e[L3_PAGETABLE_ENTRIES - 1] = l3e_from_pfn(m2mfn, _PAGE_PRESENT);
414 page = l3e_get_page(mpl3e[L3_PAGETABLE_ENTRIES - 1]);
416 for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
417 mpl2e[l2_table_offset(PERDOMAIN_VIRT_START) + i] =
418 l2e_from_page(
419 virt_to_page(d->arch.mm_perdomain_pt) + i,
420 __PAGE_HYPERVISOR);
421 for ( i = 0; i < (LINEARPT_MBYTES >> (L2_PAGETABLE_SHIFT - 20)); i++ )
422 mpl2e[l2_table_offset(LINEAR_PT_VIRT_START) + i] =
423 (l3e_get_flags(mpl3e[i]) & _PAGE_PRESENT) ?
424 l2e_from_pfn(l3e_get_pfn(mpl3e[i]), __PAGE_HYPERVISOR) :
425 l2e_empty();
426 for ( i = 0; i < (MACHPHYS_MBYTES >> (L2_PAGETABLE_SHIFT - 20)); i++ )
427 mpl2e[l2_table_offset(RO_MPT_VIRT_START) + i] = l2e_empty();
429 v->arch.monitor_table = mk_pagetable(m3mfn << PAGE_SHIFT); /* < 4GB */
430 v->arch.monitor_vtable = (l2_pgentry_t *) mpl3e;
432 if ( v->vcpu_id == 0 )
433 alloc_p2m_table(d);
434 else
435 {
436 unsigned long mfn;
438 mfn = pagetable_get_pfn(d->vcpu[0]->arch.monitor_table);
439 if ( mfn )
440 {
441 l3_pgentry_t *l3tab, l3e;
442 l2_pgentry_t *l2tab;
444 l3tab = map_domain_page(mfn);
445 l3e = l3tab[l3_table_offset(RO_MPT_VIRT_START)];
447 /*
448 * NB: when CONFIG_PAGING_LEVELS == 3,
449 * (l3e_get_flags(l3e) & _PAGE_PRESENT) is always true here.
450 * alloc_monitor_pagetable should guarantee this.
451 */
452 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
453 BUG();
455 l2tab = map_domain_page(l3e_get_pfn(l3e));
457 /*
458 * Just one l2 slot is used here, so at most 2M for p2m table:
459 * ((4K * 512)/sizeof(unsigned long)) * 4K = 2G
460 * should be OK on PAE xen, since Qemu DM can only map 1.5G VMX
461 * guest memory.
462 */
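/*
 * Spelled out (assuming a 4-byte unsigned long on PAE Xen): one L2 slot
 * covers 512 L1 entries x 4KB = 2MB of virtual space for the p2m array;
 * at 4 bytes per entry that is 512K entries, each naming one 4KB guest
 * page, i.e. 2GB of guest memory reachable through this single slot.
 */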
463 mpl2e[l2_table_offset(RO_MPT_VIRT_START)] =
464 l2tab[l2_table_offset(RO_MPT_VIRT_START)];
466 unmap_domain_page(l2tab);
467 unmap_domain_page(l3tab);
468 }
469 }
471 unmap_domain_page(mpl2e);
472 }
474 void free_monitor_pagetable(struct vcpu *v)
475 {
476 unsigned long m2mfn, m3mfn;
477 /*
478 * First free the p2m table (only VCPU0 allocated it).
479 */
480 if ( v->vcpu_id == 0 )
481 free_p2m_table(v->domain);
483 m3mfn = pagetable_get_pfn(v->arch.monitor_table);
484 m2mfn = l2e_get_pfn(v->arch.monitor_vtable[L3_PAGETABLE_ENTRIES - 1]);
486 free_domheap_page(mfn_to_page(m2mfn));
487 unmap_domain_page_global(v->arch.monitor_vtable);
488 free_domheap_page(mfn_to_page(m3mfn));
490 v->arch.monitor_table = mk_pagetable(0);
491 v->arch.monitor_vtable = 0;
492 }
493 #endif
495 static void
496 shadow_free_snapshot(struct domain *d, struct out_of_sync_entry *entry)
497 {
498 void *snapshot;
500 if ( entry->snapshot_mfn == SHADOW_SNAPSHOT_ELSEWHERE )
501 return;
503 // Clear the out_of_sync bit.
504 //
505 clear_bit(_PGC_out_of_sync, &mfn_to_page(entry->gmfn)->count_info);
507 // XXX Need to think about how to protect the domain's
508 // information less expensively.
509 //
510 snapshot = map_domain_page(entry->snapshot_mfn);
511 memset(snapshot, 0, PAGE_SIZE);
512 unmap_domain_page(snapshot);
514 put_shadow_ref(entry->snapshot_mfn);
515 }
517 void
518 release_out_of_sync_entry(struct domain *d, struct out_of_sync_entry *entry)
519 {
520 struct page_info *page;
522 page = mfn_to_page(entry->gmfn);
524 // Decrement ref count of guest & shadow pages
525 //
526 put_page(page);
528 // Only use entries that have low bits clear...
529 //
530 if ( !(entry->writable_pl1e & (sizeof(l1_pgentry_t)-1)) )
531 {
532 put_shadow_ref(entry->writable_pl1e >> PAGE_SHIFT);
533 entry->writable_pl1e = -2;
534 }
535 else
536 ASSERT( entry->writable_pl1e == -1 );
538 // Free the snapshot
539 //
540 shadow_free_snapshot(d, entry);
541 }
543 static void remove_out_of_sync_entries(struct domain *d, unsigned long gmfn)
544 {
545 struct out_of_sync_entry *entry = d->arch.out_of_sync;
546 struct out_of_sync_entry **prev = &d->arch.out_of_sync;
547 struct out_of_sync_entry *found = NULL;
549 // NB: Be careful not to call something that manipulates this list
550 // while walking it. Collect the results into a separate list
551 // first, then walk that list.
552 //
553 while ( entry )
554 {
555 if ( entry->gmfn == gmfn )
556 {
557 // remove from out of sync list
558 *prev = entry->next;
560 // add to found list
561 entry->next = found;
562 found = entry;
564 entry = *prev;
565 continue;
566 }
567 prev = &entry->next;
568 entry = entry->next;
569 }
571 prev = NULL;
572 entry = found;
573 while ( entry )
574 {
575 release_out_of_sync_entry(d, entry);
577 prev = &entry->next;
578 entry = entry->next;
579 }
581 // Add found list to free list
582 if ( prev )
583 {
584 *prev = d->arch.out_of_sync_free;
585 d->arch.out_of_sync_free = found;
586 }
587 }
589 static inline void
590 shadow_demote(struct domain *d, unsigned long gpfn, unsigned long gmfn)
591 {
592 if ( !shadow_mode_refcounts(d) )
593 return;
595 ASSERT(mfn_to_page(gmfn)->count_info & PGC_page_table);
597 if ( shadow_max_pgtable_type(d, gpfn, NULL) == PGT_none )
598 {
599 clear_bit(_PGC_page_table, &mfn_to_page(gmfn)->count_info);
601 if ( page_out_of_sync(mfn_to_page(gmfn)) )
602 {
603 remove_out_of_sync_entries(d, gmfn);
604 }
605 }
606 }
608 static void inline
609 free_shadow_l1_table(struct domain *d, unsigned long smfn)
610 {
611 l1_pgentry_t *pl1e = map_domain_page(smfn);
612 l1_pgentry_t *pl1e_next = 0, *sl1e_p;
613 int i;
614 struct page_info *spage = mfn_to_page(smfn);
615 u32 min_max = spage->tlbflush_timestamp;
616 int min = SHADOW_MIN(min_max);
617 int max;
619 if ( d->arch.ops->guest_paging_levels == PAGING_L2 )
620 {
621 max = SHADOW_MAX_GUEST32(min_max);
622 pl1e_next = map_domain_page(smfn + 1);
623 }
624 else
625 max = SHADOW_MAX(min_max);
627 for ( i = min; i <= max; i++ )
628 {
629 if ( pl1e_next && i >= L1_PAGETABLE_ENTRIES )
630 sl1e_p = &pl1e_next[i - L1_PAGETABLE_ENTRIES];
631 else
632 sl1e_p = &pl1e[i];
634 shadow_put_page_from_l1e(*sl1e_p, d);
635 *sl1e_p = l1e_empty();
636 }
638 unmap_domain_page(pl1e);
639 if ( pl1e_next )
640 unmap_domain_page(pl1e_next);
641 }
643 static void inline
644 free_shadow_hl2_table(struct domain *d, unsigned long smfn)
645 {
646 l1_pgentry_t *hl2 = map_domain_page(smfn);
647 int i, limit;
649 SH_VVLOG("%s: smfn=%lx freed", __func__, smfn);
651 #if CONFIG_PAGING_LEVELS == 2
652 if ( shadow_mode_external(d) )
653 limit = L2_PAGETABLE_ENTRIES;
654 else
655 limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE;
656 #endif
658 for ( i = 0; i < limit; i++ )
659 {
660 if ( l1e_get_flags(hl2[i]) & _PAGE_PRESENT )
661 put_page(mfn_to_page(l1e_get_pfn(hl2[i])));
662 }
664 unmap_domain_page(hl2);
665 }
667 static void inline
668 free_shadow_l2_table(struct domain *d, unsigned long smfn, unsigned int type)
669 {
670 l2_pgentry_t *pl2e = map_domain_page(smfn);
671 int i, external = shadow_mode_external(d);
673 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
674 if ( external || is_guest_l2_slot(type, i) )
675 if ( l2e_get_flags(pl2e[i]) & _PAGE_PRESENT )
676 put_shadow_ref(l2e_get_pfn(pl2e[i]));
678 if ( (PGT_base_page_table == PGT_l2_page_table) &&
679 shadow_mode_translate(d) && !external )
680 {
681 // free the ref to the hl2
682 //
683 put_shadow_ref(l2e_get_pfn(pl2e[l2_table_offset(LINEAR_PT_VIRT_START)]));
684 }
686 unmap_domain_page(pl2e);
687 }
689 void free_fake_shadow_l2(struct domain *d, unsigned long smfn)
690 {
691 pgentry_64_t *ple = map_domain_page(smfn);
692 int i;
694 for ( i = 0; i < PAGETABLE_ENTRIES; i = i + 2 )
695 if ( entry_get_flags(ple[i]) & _PAGE_PRESENT )
696 put_shadow_ref(entry_get_pfn(ple[i]));
698 unmap_domain_page(ple);
699 }
701 void free_shadow_page(unsigned long smfn)
702 {
703 struct page_info *page = mfn_to_page(smfn);
704 unsigned long gmfn = page->u.inuse.type_info & PGT_mfn_mask;
705 struct domain *d = page_get_owner(mfn_to_page(gmfn));
706 unsigned long gpfn = mfn_to_gmfn(d, gmfn);
707 unsigned long type = page->u.inuse.type_info & PGT_type_mask;
709 SH_VVLOG("%s: free'ing smfn=%lx", __func__, smfn);
711 ASSERT( ! IS_INVALID_M2P_ENTRY(gpfn) );
712 #if CONFIG_PAGING_LEVELS >= 4
713 if ( type == PGT_fl1_shadow )
714 {
715 unsigned long mfn;
716 mfn = __shadow_status(d, gpfn, PGT_fl1_shadow);
717 if ( !mfn )
718 gpfn |= (1UL << 63);
719 }
720 if ( d->arch.ops->guest_paging_levels == PAGING_L3 )
721 if ( type == PGT_l4_shadow )
722 gpfn = ((unsigned long)page->tlbflush_timestamp << PGT_pae_idx_shift) | gpfn;
723 #endif
725 delete_shadow_status(d, gpfn, gmfn, type);
727 switch ( type )
728 {
729 case PGT_l1_shadow:
730 perfc_decr(shadow_l1_pages);
731 shadow_demote(d, gpfn, gmfn);
732 free_shadow_l1_table(d, smfn);
733 d->arch.shadow_page_count--;
734 break;
735 #if CONFIG_PAGING_LEVELS == 2
736 case PGT_l2_shadow:
737 perfc_decr(shadow_l2_pages);
738 shadow_demote(d, gpfn, gmfn);
739 free_shadow_l2_table(d, smfn, page->u.inuse.type_info);
740 d->arch.shadow_page_count--;
741 break;
743 case PGT_hl2_shadow:
744 perfc_decr(hl2_table_pages);
745 shadow_demote(d, gpfn, gmfn);
746 free_shadow_hl2_table(d, smfn);
747 d->arch.hl2_page_count--;
748 break;
749 #endif
750 #if CONFIG_PAGING_LEVELS >= 3
751 case PGT_l2_shadow:
752 case PGT_l3_shadow:
753 shadow_demote(d, gpfn, gmfn);
754 free_shadow_tables(d, smfn, shadow_type_to_level(type));
755 d->arch.shadow_page_count--;
756 break;
758 case PGT_l4_shadow:
759 gpfn = gpfn & PGT_mfn_mask;
760 if ( d->arch.ops->guest_paging_levels == PAGING_L3 )
761 {
762 /*
763 * Since a single PDPT page can have multiple PDPs, it's possible
764 * that shadow_demote() has been already called for gmfn.
765 */
766 if ( mfn_is_page_table(gmfn) )
767 shadow_demote(d, gpfn, gmfn);
768 } else
769 shadow_demote(d, gpfn, gmfn);
771 free_shadow_tables(d, smfn, shadow_type_to_level(type));
772 d->arch.shadow_page_count--;
773 break;
775 case PGT_fl1_shadow:
776 free_shadow_fl1_table(d, smfn);
777 d->arch.shadow_page_count--;
778 break;
779 #endif
780 case PGT_snapshot:
781 perfc_decr(snapshot_pages);
782 break;
784 default:
785 printk("Free shadow weird page type mfn=%lx type=%" PRtype_info "\n",
786 page_to_mfn(page), page->u.inuse.type_info);
787 break;
788 }
790 // No TLB flushes are needed the next time this page gets allocated.
791 //
792 page->tlbflush_timestamp = 0;
793 page->u.free.cpumask = CPU_MASK_NONE;
795 if ( type == PGT_l1_shadow )
796 {
797 list_add(&page->list, &d->arch.free_shadow_frames);
798 perfc_incr(free_l1_pages);
799 }
800 else
801 free_domheap_page(page);
802 }
804 static void
805 free_writable_pte_predictions(struct domain *d)
806 {
807 int i;
808 struct shadow_status *x;
810 for ( i = 0; i < shadow_ht_buckets; i++ )
811 {
812 u32 count;
813 unsigned long *gpfn_list;
815 /* Skip empty buckets. */
816 if ( d->arch.shadow_ht[i].gpfn_and_flags == 0 )
817 continue;
819 count = 0;
820 for ( x = &d->arch.shadow_ht[i]; x != NULL; x = x->next )
821 if ( (x->gpfn_and_flags & PGT_type_mask) == PGT_writable_pred )
822 count++;
824 gpfn_list = xmalloc_array(unsigned long, count);
825 count = 0;
826 for ( x = &d->arch.shadow_ht[i]; x != NULL; x = x->next )
827 if ( (x->gpfn_and_flags & PGT_type_mask) == PGT_writable_pred )
828 gpfn_list[count++] = x->gpfn_and_flags & PGT_mfn_mask;
830 while ( count )
831 {
832 count--;
833 delete_shadow_status(d, gpfn_list[count], 0, PGT_writable_pred);
834 }
836 xfree(gpfn_list);
837 }
838 }
840 static void free_shadow_ht_entries(struct domain *d)
841 {
842 struct shadow_status *x, *n;
844 SH_VLOG("freed tables count=%d l1=%d l2=%d",
845 d->arch.shadow_page_count, perfc_value(shadow_l1_pages),
846 perfc_value(shadow_l2_pages));
848 n = d->arch.shadow_ht_extras;
849 while ( (x = n) != NULL )
850 {
851 d->arch.shadow_extras_count--;
852 n = *((struct shadow_status **)(&x[shadow_ht_extra_size]));
853 xfree(x);
854 }
856 d->arch.shadow_ht_extras = NULL;
857 d->arch.shadow_ht_free = NULL;
859 ASSERT(d->arch.shadow_extras_count == 0);
860 SH_LOG("freed extras, now %d", d->arch.shadow_extras_count);
862 if ( d->arch.shadow_dirty_bitmap != NULL )
863 {
864 xfree(d->arch.shadow_dirty_bitmap);
865 d->arch.shadow_dirty_bitmap = 0;
866 d->arch.shadow_dirty_bitmap_size = 0;
867 }
869 xfree(d->arch.shadow_ht);
870 d->arch.shadow_ht = NULL;
871 }
873 static void free_out_of_sync_entries(struct domain *d)
874 {
875 struct out_of_sync_entry *x, *n;
877 n = d->arch.out_of_sync_extras;
878 while ( (x = n) != NULL )
879 {
880 d->arch.out_of_sync_extras_count--;
881 n = *((struct out_of_sync_entry **)(&x[out_of_sync_extra_size]));
882 xfree(x);
883 }
885 d->arch.out_of_sync_extras = NULL;
886 d->arch.out_of_sync_free = NULL;
887 d->arch.out_of_sync = NULL;
889 ASSERT(d->arch.out_of_sync_extras_count == 0);
890 FSH_LOG("freed extra out_of_sync entries, now %d",
891 d->arch.out_of_sync_extras_count);
892 }
894 void free_shadow_pages(struct domain *d)
895 {
896 int i;
897 struct shadow_status *x;
898 struct vcpu *v;
899 struct list_head *list_ent, *tmp;
901 /*
902 * WARNING! The shadow page table must not currently be in use!
903 * e.g., You are expected to have paused the domain and synchronized CR3.
904 */
906 if( !d->arch.shadow_ht ) return;
908 shadow_audit(d, 1);
910 // first, remove any outstanding refs from out_of_sync entries...
911 //
912 free_out_of_sync_state(d);
914 // second, remove any outstanding refs from v->arch.shadow_table
915 // and CR3.
916 //
917 for_each_vcpu(d, v)
918 {
919 if ( pagetable_get_paddr(v->arch.shadow_table) )
920 {
921 put_shadow_ref(pagetable_get_pfn(v->arch.shadow_table));
922 v->arch.shadow_table = mk_pagetable(0);
924 if ( shadow_mode_external(d) )
925 {
926 if ( v->arch.shadow_vtable )
927 unmap_domain_page_global(v->arch.shadow_vtable);
928 v->arch.shadow_vtable = NULL;
929 }
930 }
932 if ( v->arch.monitor_shadow_ref )
933 {
934 put_shadow_ref(v->arch.monitor_shadow_ref);
935 v->arch.monitor_shadow_ref = 0;
936 }
937 }
939 #if CONFIG_PAGING_LEVELS == 2
940 // For external shadows, remove the monitor table's refs
941 //
942 if ( shadow_mode_external(d) )
943 {
944 for_each_vcpu(d, v)
945 {
946 l2_pgentry_t *mpl2e = v->arch.monitor_vtable;
948 if ( mpl2e )
949 {
950 l2_pgentry_t hl2e = mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)];
951 l2_pgentry_t smfn = mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)];
953 if ( l2e_get_flags(hl2e) & _PAGE_PRESENT )
954 {
955 put_shadow_ref(l2e_get_pfn(hl2e));
956 mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)] = l2e_empty();
957 }
958 if ( l2e_get_flags(smfn) & _PAGE_PRESENT )
959 {
960 put_shadow_ref(l2e_get_pfn(smfn));
961 mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] = l2e_empty();
962 }
963 }
964 }
965 }
966 #endif
967 // Now, the only refs to shadow pages that are left are from the shadow
968 // pages themselves. We just unpin the pinned pages, and the rest
969 // should automatically disappear.
970 //
971 // NB: Beware: each explicitly or implicit call to free_shadow_page
972 // can/will result in the hash bucket getting rewritten out from
973 // under us... First, collect the list of pinned pages, then
974 // free them.
975 //
976 for ( i = 0; i < shadow_ht_buckets; i++ )
977 {
978 u32 count;
979 unsigned long *mfn_list;
981 /* Skip empty buckets. */
982 if ( d->arch.shadow_ht[i].gpfn_and_flags == 0 )
983 continue;
985 count = 0;
986 for ( x = &d->arch.shadow_ht[i]; x != NULL; x = x->next )
987 if ( MFN_PINNED(x->smfn) )
988 count++;
989 if ( !count )
990 continue;
992 mfn_list = xmalloc_array(unsigned long, count);
993 count = 0;
994 for ( x = &d->arch.shadow_ht[i]; x != NULL; x = x->next )
995 if ( MFN_PINNED(x->smfn) )
996 mfn_list[count++] = x->smfn;
998 while ( count )
999 {
1000 shadow_unpin(mfn_list[--count]);
1002 xfree(mfn_list);
1005 /* Now free the pre-zero'ed pages from the domain. */
1006 list_for_each_safe(list_ent, tmp, &d->arch.free_shadow_frames)
1008 struct page_info *page = list_entry(list_ent, struct page_info, list);
1010 list_del(list_ent);
1011 perfc_decr(free_l1_pages);
1013 if (d->arch.ops->guest_paging_levels == PAGING_L2)
1015 #if CONFIG_PAGING_LEVELS >=3
1016 free_domheap_pages(page, SL1_ORDER);
1017 #else
1018 free_domheap_page(page);
1019 #endif
1021 else
1022 free_domheap_page(page);
1025 shadow_audit(d, 0);
1027 SH_LOG("Free shadow table.");
1030 void __shadow_mode_disable(struct domain *d)
1032 struct vcpu *v;
1033 #ifndef NDEBUG
1034 int i;
1035 #endif
1037 if ( unlikely(!shadow_mode_enabled(d)) )
1038 return;
1040 free_shadow_pages(d);
1041 free_writable_pte_predictions(d);
1043 #ifndef NDEBUG
1044 for ( i = 0; i < shadow_ht_buckets; i++ )
1046 if ( d->arch.shadow_ht[i].gpfn_and_flags != 0 )
1048 printk("%s: d->arch.shadow_ht[%x].gpfn_and_flags=%lx\n",
1049 __FILE__, i, d->arch.shadow_ht[i].gpfn_and_flags);
1050 BUG();
1053 #endif
1055 d->arch.shadow_mode = 0;
1057 free_shadow_ht_entries(d);
1058 free_out_of_sync_entries(d);
1060 for_each_vcpu(d, v)
1061 update_pagetables(v);
1065 int __shadow_mode_enable(struct domain *d, unsigned int mode)
1067 struct vcpu *v;
1068 int new_modes = (mode & ~d->arch.shadow_mode);
1070 // Gotta be adding something to call this function.
1071 ASSERT(new_modes);
1073 // can't take anything away by calling this function.
1074 ASSERT(!(d->arch.shadow_mode & ~mode));
1076 #if defined(CONFIG_PAGING_LEVELS)
1077 if(!shadow_set_guest_paging_levels(d,
1078 CONFIG_PAGING_LEVELS)) {
1079 printk("Unsupported guest paging levels\n");
1080 domain_crash_synchronous(); /* need to take a clean path */
1082 #endif
1084 for_each_vcpu(d, v)
1086 invalidate_shadow_ldt(v);
1088 // We need to set these up for __update_pagetables().
1089 // See the comment there.
1091 /*
1092 * arch.guest_vtable
1093 */
1094 if ( v->arch.guest_vtable &&
1095 (v->arch.guest_vtable != __linear_l2_table) )
1097 unmap_domain_page_global(v->arch.guest_vtable);
1099 if ( (mode & (SHM_translate | SHM_external)) == SHM_translate )
1100 v->arch.guest_vtable = __linear_l2_table;
1101 else
1102 v->arch.guest_vtable = NULL;
1104 /*
1105 * arch.shadow_vtable
1106 */
1107 if ( v->arch.shadow_vtable &&
1108 (v->arch.shadow_vtable != __shadow_linear_l2_table) )
1110 unmap_domain_page_global(v->arch.shadow_vtable);
1112 if ( !(mode & SHM_external) && d->arch.ops->guest_paging_levels == 2)
1113 v->arch.shadow_vtable = __shadow_linear_l2_table;
1114 else
1115 v->arch.shadow_vtable = NULL;
1117 #if CONFIG_PAGING_LEVELS == 2
1118 /*
1119 * arch.hl2_vtable
1120 */
1121 if ( v->arch.hl2_vtable &&
1122 (v->arch.hl2_vtable != __linear_hl2_table) )
1124 unmap_domain_page_global(v->arch.hl2_vtable);
1126 if ( (mode & (SHM_translate | SHM_external)) == SHM_translate )
1127 v->arch.hl2_vtable = __linear_hl2_table;
1128 else
1129 v->arch.hl2_vtable = NULL;
1130 #endif
1131 /*
1132 * arch.monitor_table & arch.monitor_vtable
1133 */
1134 if ( v->arch.monitor_vtable )
1136 free_monitor_pagetable(v);
1138 if ( mode & SHM_external )
1140 alloc_monitor_pagetable(v);
1144 if ( new_modes & SHM_enable )
1146 ASSERT( !d->arch.shadow_ht );
1147 d->arch.shadow_ht = xmalloc_array(struct shadow_status, shadow_ht_buckets);
1148 if ( d->arch.shadow_ht == NULL )
1149 goto nomem;
1151 memset(d->arch.shadow_ht, 0,
1152 shadow_ht_buckets * sizeof(struct shadow_status));
1155 if ( new_modes & SHM_log_dirty )
1157 ASSERT( !d->arch.shadow_dirty_bitmap );
1158 d->arch.shadow_dirty_bitmap_size =
1159 (d->shared_info->arch.max_pfn + 63) & ~63;
1160 d->arch.shadow_dirty_bitmap =
1161 xmalloc_array(unsigned long, d->arch.shadow_dirty_bitmap_size /
1162 (8 * sizeof(unsigned long)));
1163 if ( d->arch.shadow_dirty_bitmap == NULL )
1165 d->arch.shadow_dirty_bitmap_size = 0;
1166 goto nomem;
1168 memset(d->arch.shadow_dirty_bitmap, 0,
1169 d->arch.shadow_dirty_bitmap_size/8);
1172 if ( new_modes & SHM_translate )
1174 if ( !(new_modes & SHM_external) )
1176 ASSERT( !pagetable_get_paddr(d->arch.phys_table) );
1177 if ( !alloc_p2m_table(d) )
1179 printk("alloc_p2m_table failed (out-of-memory?)\n");
1180 goto nomem;
1185 // Get rid of any shadow pages from any previous shadow mode.
1186 //
1187 free_shadow_pages(d);
1189 d->arch.shadow_mode = mode;
1191 if ( shadow_mode_refcounts(d) )
1193 struct list_head *list_ent;
1194 struct page_info *page;
1196 /*
1197 * Tear down its counts by disassembling its page-table-based refcounts
1198 * Also remove CR3's gcount/tcount.
1199 * That leaves things like GDTs and LDTs and external refs intact.
1201 * Most pages will be writable tcount=0.
1202 * Some will still be L1 tcount=0 or L2 tcount=0.
1203 * Maybe some pages will be type none tcount=0.
1204 * Pages granted external writable refs (via grant tables?) will
1205 * still have a non-zero tcount. That's OK.
1207 * gcounts will generally be 1 for PGC_allocated.
1208 * GDTs and LDTs will have additional gcounts.
1209 * Any grant-table based refs will still be in the gcount.
1211 * We attempt to grab writable refs to each page thus setting its type
1212 * Immediately put back those type refs.
1214 * Assert that no pages are left with L1/L2/L3/L4 type.
1215 */
1216 audit_adjust_pgtables(d, -1, 1);
1219 for (list_ent = d->page_list.next; list_ent != &d->page_list;
1220 list_ent = page->list.next) {
1222 page = list_entry(list_ent, struct page_info, list);
1223 if ( !get_page_type(page, PGT_writable_page) )
1224 BUG();
1225 put_page_type(page);
1226 /*
1227 * We use tlbflush_timestamp as back pointer to smfn, and need to
1228 * clean it up.
1229 */
1230 if (shadow_mode_external(d))
1231 page->tlbflush_timestamp = 0;
1234 audit_adjust_pgtables(d, 1, 1);
1238 return 0;
1240 nomem:
1241 if ( (new_modes & SHM_enable) )
1243 xfree(d->arch.shadow_ht);
1244 d->arch.shadow_ht = NULL;
1246 if ( (new_modes & SHM_log_dirty) )
1248 xfree(d->arch.shadow_dirty_bitmap);
1249 d->arch.shadow_dirty_bitmap = NULL;
1252 return -ENOMEM;
1256 int shadow_mode_enable(struct domain *d, unsigned int mode)
1258 int rc;
1259 shadow_lock(d);
1260 rc = __shadow_mode_enable(d, mode);
1261 shadow_unlock(d);
1262 return rc;
1265 static int shadow_mode_table_op(
1266 struct domain *d, dom0_shadow_control_t *sc)
1268 unsigned int op = sc->op;
1269 int i, rc = 0;
1270 struct vcpu *v;
1272 ASSERT(shadow_lock_is_acquired(d));
1274 SH_VLOG("shadow mode table op %lx %lx count %d",
1275 (unsigned long)pagetable_get_pfn(d->vcpu[0]->arch.guest_table), /* XXX SMP */
1276 (unsigned long)pagetable_get_pfn(d->vcpu[0]->arch.shadow_table), /* XXX SMP */
1277 d->arch.shadow_page_count);
1279 shadow_audit(d, 1);
1281 switch ( op )
1283 case DOM0_SHADOW_CONTROL_OP_FLUSH:
1284 free_shadow_pages(d);
1286 d->arch.shadow_fault_count = 0;
1287 d->arch.shadow_dirty_count = 0;
1289 break;
1291 case DOM0_SHADOW_CONTROL_OP_CLEAN:
1292 free_shadow_pages(d);
1294 sc->stats.fault_count = d->arch.shadow_fault_count;
1295 sc->stats.dirty_count = d->arch.shadow_dirty_count;
1297 d->arch.shadow_fault_count = 0;
1298 d->arch.shadow_dirty_count = 0;
1300 if ( guest_handle_is_null(sc->dirty_bitmap) ||
1301 (d->arch.shadow_dirty_bitmap == NULL) )
1303 rc = -EINVAL;
1304 break;
1307 if ( sc->pages > d->arch.shadow_dirty_bitmap_size )
1308 sc->pages = d->arch.shadow_dirty_bitmap_size;
1310 #define chunk (8*1024) /* Transfer and clear in 1kB chunks for L1 cache. */
1311 for ( i = 0; i < sc->pages; i += chunk )
1313 int bytes = ((((sc->pages - i) > chunk) ?
1314 chunk : (sc->pages - i)) + 7) / 8;
1316 if ( copy_to_guest_offset(
1317 sc->dirty_bitmap, i/(8*sizeof(unsigned long)),
1318 d->arch.shadow_dirty_bitmap +(i/(8*sizeof(unsigned long))),
1319 (bytes+sizeof(unsigned long)-1) / sizeof(unsigned long)) )
1321 rc = -EINVAL;
1322 break;
1324 memset(
1325 d->arch.shadow_dirty_bitmap + (i/(8*sizeof(unsigned long))),
1326 0, bytes);
1329 break;
1331 case DOM0_SHADOW_CONTROL_OP_PEEK:
1332 sc->stats.fault_count = d->arch.shadow_fault_count;
1333 sc->stats.dirty_count = d->arch.shadow_dirty_count;
1335 if ( guest_handle_is_null(sc->dirty_bitmap) ||
1336 (d->arch.shadow_dirty_bitmap == NULL) )
1338 rc = -EINVAL;
1339 break;
1342 if ( sc->pages > d->arch.shadow_dirty_bitmap_size )
1343 sc->pages = d->arch.shadow_dirty_bitmap_size;
1345 if ( copy_to_guest(sc->dirty_bitmap,
1346 d->arch.shadow_dirty_bitmap,
1347 (((sc->pages+7)/8)+sizeof(unsigned long)-1) /
1348 sizeof(unsigned long)) )
1350 rc = -EINVAL;
1351 break;
1354 break;
1356 default:
1357 rc = -EINVAL;
1358 break;
1361 SH_VLOG("shadow mode table op : page count %d", d->arch.shadow_page_count);
1362 shadow_audit(d, 1);
1364 for_each_vcpu(d,v)
1365 __update_pagetables(v);
1367 return rc;
1370 int shadow_mode_control(struct domain *d, dom0_shadow_control_t *sc)
1372 unsigned int op = sc->op;
1373 int rc = 0;
1374 struct vcpu *v;
1376 if ( unlikely(d == current->domain) )
1378 DPRINTK("Don't try to do a shadow op on yourself!\n");
1379 return -EINVAL;
1382 domain_pause(d);
1384 shadow_lock(d);
1386 switch ( op )
1388 case DOM0_SHADOW_CONTROL_OP_OFF:
1389 if ( shadow_mode_enabled(d) )
1391 __shadow_sync_all(d);
1392 __shadow_mode_disable(d);
1394 break;
1396 case DOM0_SHADOW_CONTROL_OP_ENABLE_TEST:
1397 free_shadow_pages(d);
1398 rc = __shadow_mode_enable(d, SHM_enable);
1399 break;
1401 case DOM0_SHADOW_CONTROL_OP_ENABLE_LOGDIRTY:
1402 free_shadow_pages(d);
1403 rc = __shadow_mode_enable(
1404 d, d->arch.shadow_mode|SHM_enable|SHM_log_dirty);
1405 break;
1407 case DOM0_SHADOW_CONTROL_OP_ENABLE_TRANSLATE:
1408 free_shadow_pages(d);
1409 rc = __shadow_mode_enable(
1410 d, d->arch.shadow_mode|SHM_enable|SHM_refcounts|SHM_translate);
1411 break;
1413 default:
1414 rc = shadow_mode_enabled(d) ? shadow_mode_table_op(d, sc) : -EINVAL;
1415 break;
1418 shadow_unlock(d);
1420 for_each_vcpu(d,v)
1421 update_pagetables(v);
1423 domain_unpause(d);
1425 return rc;
1428 void shadow_mode_init(void)
1432 int _shadow_mode_refcounts(struct domain *d)
1434 return shadow_mode_refcounts(d);
1437 static int
1438 map_p2m_entry(pgentry_64_t *top_tab, unsigned long va,
1439 unsigned long gpfn, unsigned long mfn)
1441 #if CONFIG_PAGING_LEVELS >= 4
1442 pgentry_64_t l4e = { 0 };
1443 pgentry_64_t *l3tab = NULL;
1444 #endif
1445 #if CONFIG_PAGING_LEVELS >= 3
1446 pgentry_64_t l3e = { 0 };
1447 #endif
1448 l2_pgentry_t *l2tab = NULL;
1449 l1_pgentry_t *l1tab = NULL;
1450 unsigned long *l0tab = NULL;
1451 l2_pgentry_t l2e = { 0 };
1452 l1_pgentry_t l1e = { 0 };
1453 struct page_info *page;
1455 #if CONFIG_PAGING_LEVELS >= 4
1456 l4e = top_tab[l4_table_offset(va)];
1457 if ( !(entry_get_flags(l4e) & _PAGE_PRESENT) )
1459 page = alloc_domheap_page(NULL);
1460 if ( !page )
1461 goto nomem;
1463 l3tab = map_domain_page(page_to_mfn(page));
1464 memset(l3tab, 0, PAGE_SIZE);
1465 l4e = top_tab[l4_table_offset(va)] =
1466 entry_from_page(page, __PAGE_HYPERVISOR);
1468 else
1469 l3tab = map_domain_page(entry_get_pfn(l4e));
1471 l3e = l3tab[l3_table_offset(va)];
1472 if ( !(entry_get_flags(l3e) & _PAGE_PRESENT) )
1474 page = alloc_domheap_page(NULL);
1475 if ( !page )
1476 goto nomem;
1478 l2tab = map_domain_page(page_to_mfn(page));
1479 memset(l2tab, 0, PAGE_SIZE);
1480 l3e = l3tab[l3_table_offset(va)] =
1481 entry_from_page(page, __PAGE_HYPERVISOR);
1483 else
1484 l2tab = map_domain_page(entry_get_pfn(l3e));
1486 unmap_domain_page(l3tab);
1487 #else
1488 l3e = top_tab[l3_table_offset(va)];
1490 /*
1491 * NB: when CONFIG_PAGING_LEVELS == 3,
1492 * (entry_get_flags(l3e) & _PAGE_PRESENT) is always true here.
1493 * alloc_monitor_pagetable should guarantee this.
1494 */
1495 if ( !(entry_get_flags(l3e) & _PAGE_PRESENT) )
1496 BUG();
1498 l2tab = map_domain_page(entry_get_pfn(l3e));
1499 #endif
1501 l2e = l2tab[l2_table_offset(va)];
1502 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
1504 page = alloc_domheap_page(NULL);
1505 if ( !page )
1506 goto nomem;
1508 l1tab = map_domain_page(page_to_mfn(page));
1509 memset(l1tab, 0, PAGE_SIZE);
1510 l2e = l2tab[l2_table_offset(va)] =
1511 l2e_from_page(page, __PAGE_HYPERVISOR);
1513 else
1514 l1tab = map_domain_page(l2e_get_pfn(l2e));
1516 unmap_domain_page(l2tab);
1518 l1e = l1tab[l1_table_offset(va)];
1519 if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) )
1521 page = alloc_domheap_page(NULL);
1522 if ( !page )
1523 goto nomem;
1525 l0tab = map_domain_page(page_to_mfn(page));
1526 memset(l0tab, 0, PAGE_SIZE);
1527 l1e = l1tab[l1_table_offset(va)] =
1528 l1e_from_page(page, __PAGE_HYPERVISOR);
1530 else
1531 l0tab = map_domain_page(l1e_get_pfn(l1e));
1533 unmap_domain_page(l1tab);
1535 l0tab[gpfn & ((PAGE_SIZE / sizeof (mfn)) - 1) ] = mfn;
1537 unmap_domain_page(l0tab);
1539 return 1;
1541 nomem:
1542 return 0;
1545 int
1546 set_p2m_entry(struct domain *d, unsigned long gpfn, unsigned long mfn,
1547 struct domain_mmap_cache *l2cache,
1548 struct domain_mmap_cache *l1cache)
1550 unsigned long tabmfn = pagetable_get_pfn(d->vcpu[0]->arch.monitor_table);
1551 unsigned long va = RO_MPT_VIRT_START + (gpfn * sizeof(unsigned long));
1552 pgentry_64_t *top_tab;
1553 int error;
1555 ASSERT(tabmfn != 0);
1556 ASSERT(shadow_lock_is_acquired(d));
1558 top_tab = map_domain_page_with_cache(tabmfn, l2cache);
1560 if ( !(error = map_p2m_entry(top_tab, va, gpfn, mfn)) )
1561 domain_crash(d);
1563 unmap_domain_page_with_cache(top_tab, l2cache);
1565 return error;
1568 static int
1569 alloc_p2m_table(struct domain *d)
1571 struct list_head *list_ent;
1572 unsigned long va = RO_MPT_VIRT_START; /* phys_to_machine_mapping */
1573 pgentry_64_t *top_tab = NULL;
1574 unsigned long mfn;
1575 int gpfn, error = 0;
1577 ASSERT( pagetable_get_pfn(d->vcpu[0]->arch.monitor_table) );
1579 top_tab = map_domain_page(
1580 pagetable_get_pfn(d->vcpu[0]->arch.monitor_table));
1582 list_ent = d->page_list.next;
1584 for ( gpfn = 0; list_ent != &d->page_list; gpfn++ )
1586 struct page_info *page;
1588 page = list_entry(list_ent, struct page_info, list);
1589 mfn = page_to_mfn(page);
1591 if ( !(error = map_p2m_entry(top_tab, va, gpfn, mfn)) )
1593 domain_crash(d);
1594 break;
1597 list_ent = frame_table[mfn].list.next;
1598 va += sizeof(mfn);
1601 unmap_domain_page(top_tab);
1603 return error;
1606 #if CONFIG_PAGING_LEVELS >= 3
1607 static void
1608 free_p2m_table(struct domain *d)
1610 unsigned long va;
1611 l1_pgentry_t *l1tab;
1612 l1_pgentry_t l1e;
1613 l2_pgentry_t *l2tab;
1614 l2_pgentry_t l2e;
1615 #if CONFIG_PAGING_LEVELS >= 3
1616 l3_pgentry_t *l3tab;
1617 l3_pgentry_t l3e;
1618 #endif
1619 #if CONFIG_PAGING_LEVELS == 4
1620 int i3;
1621 l4_pgentry_t *l4tab;
1622 l4_pgentry_t l4e;
1623 #endif
1625 ASSERT( pagetable_get_pfn(d->vcpu[0]->arch.monitor_table) );
1627 #if CONFIG_PAGING_LEVELS == 4
1628 l4tab = map_domain_page(
1629 pagetable_get_pfn(d->vcpu[0]->arch.monitor_table));
1630 #endif
1631 #if CONFIG_PAGING_LEVELS == 3
1632 l3tab = map_domain_page(
1633 pagetable_get_pfn(d->vcpu[0]->arch.monitor_table));
1635 l3e = l3tab[l3_table_offset(RO_MPT_VIRT_START)];
1637 /*
1638 * NB: when CONFIG_PAGING_LEVELS == 3,
1639 * (l3e_get_flags(l3e) & _PAGE_PRESENT) is always true here.
1640 * alloc_monitor_pagetable should guarantee this.
1641 */
1642 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
1643 BUG();
1645 l2tab = map_domain_page(l3e_get_pfn(l3e));
1646 #endif
1648 for ( va = RO_MPT_VIRT_START; va < RO_MPT_VIRT_END; )
1650 #if CONFIG_PAGING_LEVELS == 4
1651 l4e = l4tab[l4_table_offset(va)];
1653 if ( l4e_get_flags(l4e) & _PAGE_PRESENT )
1655 l3tab = map_domain_page(l4e_get_pfn(l4e));
1657 for ( i3 = 0; i3 < L3_PAGETABLE_ENTRIES; i3++ )
1659 l3e = l3tab[l3_table_offset(va)];
1661 if ( l3e_get_flags(l3e) & _PAGE_PRESENT )
1663 int i2;
1665 l2tab = map_domain_page(l3e_get_pfn(l3e));
1667 for ( i2 = 0; i2 < L2_PAGETABLE_ENTRIES; i2++ )
1669 #endif
1670 l2e = l2tab[l2_table_offset(va)];
1672 if ( l2e_get_flags(l2e) & _PAGE_PRESENT )
1674 int i1;
1676 l1tab = map_domain_page(l2e_get_pfn(l2e));
1678 /*
1679 * unsigned long phys_to_machine_mapping[]
1680 */
1681 for ( i1 = 0; i1 < L1_PAGETABLE_ENTRIES; i1++ )
1683 l1e = l1tab[l1_table_offset(va)];
1685 if ( l1e_get_flags(l1e) & _PAGE_PRESENT )
1686 free_domheap_page(mfn_to_page(l1e_get_pfn(l1e)));
1688 va += PAGE_SIZE;
1690 unmap_domain_page(l1tab);
1691 free_domheap_page(mfn_to_page(l2e_get_pfn(l2e)));
1693 else
1694 va += PAGE_SIZE * L1_PAGETABLE_ENTRIES;
1696 #if CONFIG_PAGING_LEVELS == 4
1698 unmap_domain_page(l2tab);
1699 free_domheap_page(mfn_to_page(l3e_get_pfn(l3e)));
1701 else
1702 va += PAGE_SIZE * L1_PAGETABLE_ENTRIES * L2_PAGETABLE_ENTRIES;
1704 unmap_domain_page(l3tab);
1705 free_domheap_page(mfn_to_page(l4e_get_pfn(l4e)));
1707 else
1708 va += PAGE_SIZE *
1709 L1_PAGETABLE_ENTRIES * L2_PAGETABLE_ENTRIES * L3_PAGETABLE_ENTRIES;
1710 #endif
1713 #if CONFIG_PAGING_LEVELS == 4
1714 unmap_domain_page(l4tab);
1715 #endif
1716 #if CONFIG_PAGING_LEVELS == 3
1717 unmap_domain_page(l3tab);
1718 #endif
1720 #endif
1722 void shadow_l1_normal_pt_update(
1723 struct domain *d,
1724 paddr_t pa, l1_pgentry_t gpte,
1725 struct domain_mmap_cache *cache)
1727 unsigned long sl1mfn;
1728 l1_pgentry_t *spl1e, spte;
1730 shadow_lock(d);
1732 sl1mfn = __shadow_status(current->domain, pa >> PAGE_SHIFT, PGT_l1_shadow);
1733 if ( sl1mfn )
1735 SH_VVLOG("shadow_l1_normal_pt_update pa=%p, gpde=%" PRIpte,
1736 (void *)pa, l1e_get_intpte(gpte));
1737 l1pte_propagate_from_guest(current->domain, gpte, &spte);
1739 spl1e = map_domain_page_with_cache(sl1mfn, cache);
1740 spl1e[(pa & ~PAGE_MASK) / sizeof(l1_pgentry_t)] = spte;
1741 unmap_domain_page_with_cache(spl1e, cache);
1744 shadow_unlock(d);
1747 void shadow_l2_normal_pt_update(
1748 struct domain *d,
1749 paddr_t pa, l2_pgentry_t gpde,
1750 struct domain_mmap_cache *cache)
1752 unsigned long sl2mfn;
1753 l2_pgentry_t *spl2e;
1755 shadow_lock(d);
1757 sl2mfn = __shadow_status(current->domain, pa >> PAGE_SHIFT, PGT_l2_shadow);
1758 if ( sl2mfn )
1760 SH_VVLOG("shadow_l2_normal_pt_update pa=%p, gpde=%" PRIpte,
1761 (void *)pa, l2e_get_intpte(gpde));
1762 spl2e = map_domain_page_with_cache(sl2mfn, cache);
1763 validate_pde_change(d, gpde,
1764 &spl2e[(pa & ~PAGE_MASK) / sizeof(l2_pgentry_t)]);
1765 unmap_domain_page_with_cache(spl2e, cache);
1768 shadow_unlock(d);
1771 #if CONFIG_PAGING_LEVELS >= 3
1772 void shadow_l3_normal_pt_update(
1773 struct domain *d,
1774 paddr_t pa, l3_pgentry_t l3e,
1775 struct domain_mmap_cache *cache)
1777 unsigned long sl3mfn;
1778 pgentry_64_t *spl3e;
1780 shadow_lock(d);
1782 sl3mfn = __shadow_status(current->domain, pa >> PAGE_SHIFT, PGT_l3_shadow);
1783 if ( sl3mfn )
1785 SH_VVLOG("shadow_l3_normal_pt_update pa=%p, l3e=%" PRIpte,
1786 (void *)pa, l3e_get_intpte(l3e));
1787 spl3e = (pgentry_64_t *) map_domain_page_with_cache(sl3mfn, cache);
1788 validate_entry_change(d, (pgentry_64_t *) &l3e,
1789 &spl3e[(pa & ~PAGE_MASK) / sizeof(l3_pgentry_t)],
1790 shadow_type_to_level(PGT_l3_shadow));
1791 unmap_domain_page_with_cache(spl3e, cache);
1794 shadow_unlock(d);
1796 #endif
1798 #if CONFIG_PAGING_LEVELS >= 4
1799 void shadow_l4_normal_pt_update(
1800 struct domain *d,
1801 paddr_t pa, l4_pgentry_t l4e,
1802 struct domain_mmap_cache *cache)
1804 unsigned long sl4mfn;
1805 pgentry_64_t *spl4e;
1807 shadow_lock(d);
1809 sl4mfn = __shadow_status(current->domain, pa >> PAGE_SHIFT, PGT_l4_shadow);
1810 if ( sl4mfn )
1812 SH_VVLOG("shadow_l4_normal_pt_update pa=%p, l4e=%" PRIpte,
1813 (void *)pa, l4e_get_intpte(l4e));
1814 spl4e = (pgentry_64_t *)map_domain_page_with_cache(sl4mfn, cache);
1815 validate_entry_change(d, (pgentry_64_t *)&l4e,
1816 &spl4e[(pa & ~PAGE_MASK) / sizeof(l4_pgentry_t)],
1817 shadow_type_to_level(PGT_l4_shadow));
1818 unmap_domain_page_with_cache(spl4e, cache);
1821 shadow_unlock(d);
1823 #endif
1825 static void
1826 translate_l1pgtable(struct domain *d, l1_pgentry_t *p2m, unsigned long l1mfn)
1828 int i;
1829 l1_pgentry_t *l1;
1831 l1 = map_domain_page(l1mfn);
1832 for (i = 0; i < L1_PAGETABLE_ENTRIES; i++)
1834 if ( is_guest_l1_slot(i) &&
1835 (l1e_get_flags(l1[i]) & _PAGE_PRESENT) )
1837 unsigned long mfn = l1e_get_pfn(l1[i]);
1838 unsigned long gpfn = mfn_to_gmfn(d, mfn);
1839 ASSERT(l1e_get_pfn(p2m[gpfn]) == mfn);
1840 l1[i] = l1e_from_pfn(gpfn, l1e_get_flags(l1[i]));
1843 unmap_domain_page(l1);
1846 // This is not general enough to handle arbitrary pagetables
1847 // with shared L1 pages, etc., but it is sufficient for bringing
1848 // up dom0.
1849 //
1850 void
1851 translate_l2pgtable(struct domain *d, l1_pgentry_t *p2m, unsigned long l2mfn,
1852 unsigned int type)
1854 int i;
1855 l2_pgentry_t *l2;
1857 ASSERT(shadow_mode_translate(d) && !shadow_mode_external(d));
1859 l2 = map_domain_page(l2mfn);
1860 for (i = 0; i < L2_PAGETABLE_ENTRIES; i++)
1862 if ( is_guest_l2_slot(type, i) &&
1863 (l2e_get_flags(l2[i]) & _PAGE_PRESENT) )
1865 unsigned long mfn = l2e_get_pfn(l2[i]);
1866 unsigned long gpfn = mfn_to_gmfn(d, mfn);
1867 ASSERT(l1e_get_pfn(p2m[gpfn]) == mfn);
1868 l2[i] = l2e_from_pfn(gpfn, l2e_get_flags(l2[i]));
1869 translate_l1pgtable(d, p2m, mfn);
1872 unmap_domain_page(l2);
1875 void
1876 remove_shadow(struct domain *d, unsigned long gpfn, u32 stype)
1878 unsigned long smfn;
1880 shadow_lock(d);
1882 while ( stype >= PGT_l1_shadow )
1884 smfn = __shadow_status(d, gpfn, stype);
1885 if ( smfn && MFN_PINNED(smfn) )
1886 shadow_unpin(smfn);
1887 stype -= PGT_l1_shadow;
1890 shadow_unlock(d);
1893 unsigned long
1894 get_mfn_from_gpfn_foreign(struct domain *d, unsigned long gpfn)
1896 unsigned long va, tabpfn;
1897 l1_pgentry_t *l1, l1e;
1898 l2_pgentry_t *l2, l2e;
1899 #if CONFIG_PAGING_LEVELS >= 4
1900 pgentry_64_t *l4 = NULL;
1901 pgentry_64_t l4e = { 0 };
1902 #endif
1903 pgentry_64_t *l3 = NULL;
1904 pgentry_64_t l3e = { 0 };
1905 unsigned long *l0tab = NULL;
1906 unsigned long mfn;
1908 ASSERT(shadow_mode_translate(d));
1910 perfc_incrc(get_mfn_from_gpfn_foreign);
1912 va = RO_MPT_VIRT_START + (gpfn * sizeof(mfn));
1914 tabpfn = pagetable_get_pfn(d->vcpu[0]->arch.monitor_table);
1915 if ( !tabpfn )
1916 return INVALID_MFN;
1918 #if CONFIG_PAGING_LEVELS >= 4
1919 l4 = map_domain_page(tabpfn);
1920 l4e = l4[l4_table_offset(va)];
1921 unmap_domain_page(l4);
1922 if ( !(entry_get_flags(l4e) & _PAGE_PRESENT) )
1923 return INVALID_MFN;
1925 l3 = map_domain_page(entry_get_pfn(l4e));
1926 #else
1927 l3 = map_domain_page(tabpfn);
1928 #endif
1929 l3e = l3[l3_table_offset(va)];
1930 unmap_domain_page(l3);
1931 if ( !(entry_get_flags(l3e) & _PAGE_PRESENT) )
1932 return INVALID_MFN;
1933 l2 = map_domain_page(entry_get_pfn(l3e));
1934 l2e = l2[l2_table_offset(va)];
1935 unmap_domain_page(l2);
1936 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
1937 return INVALID_MFN;
1939 l1 = map_domain_page(l2e_get_pfn(l2e));
1940 l1e = l1[l1_table_offset(va)];
1941 unmap_domain_page(l1);
1942 if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) )
1943 return INVALID_MFN;
1945 l0tab = map_domain_page(l1e_get_pfn(l1e));
1946 mfn = l0tab[gpfn & ((PAGE_SIZE / sizeof (mfn)) - 1)];
1947 unmap_domain_page(l0tab);
1948 return mfn;
1951 static u32 remove_all_access_in_page(
1952 struct domain *d, unsigned long l1mfn, unsigned long forbidden_gmfn)
1954 l1_pgentry_t *pl1e = map_domain_page(l1mfn);
1955 l1_pgentry_t match, ol2e;
1956 unsigned long flags = _PAGE_PRESENT;
1957 int i;
1958 u32 count = 0;
1959 int is_l1_shadow =
1960 ((mfn_to_page(l1mfn)->u.inuse.type_info & PGT_type_mask) ==
1961 PGT_l1_shadow);
1963 match = l1e_from_pfn(forbidden_gmfn, flags);
1965 for (i = 0; i < L1_PAGETABLE_ENTRIES; i++)
1967 if ( l1e_has_changed(pl1e[i], match, flags) )
1968 continue;
1970 ol2e = pl1e[i];
1971 pl1e[i] = l1e_empty();
1972 count++;
1974 if ( is_l1_shadow )
1975 shadow_put_page_from_l1e(ol2e, d);
1976 else /* must be an hl2 page */
1977 put_page(mfn_to_page(forbidden_gmfn));
1980 unmap_domain_page(pl1e);
1982 return count;
1985 static u32 __shadow_remove_all_access(struct domain *d, unsigned long forbidden_gmfn)
1987 int i;
1988 struct shadow_status *a;
1989 u32 count = 0;
1991 if ( unlikely(!shadow_mode_enabled(d)) )
1992 return 0;
1994 ASSERT(shadow_lock_is_acquired(d));
1995 perfc_incrc(remove_all_access);
1997 for (i = 0; i < shadow_ht_buckets; i++)
1999 a = &d->arch.shadow_ht[i];
2000 while ( a && a->gpfn_and_flags )
2002 switch (a->gpfn_and_flags & PGT_type_mask)
2004 case PGT_l1_shadow:
2005 case PGT_l2_shadow:
2006 case PGT_l3_shadow:
2007 case PGT_l4_shadow:
2008 case PGT_hl2_shadow:
2009 count += remove_all_access_in_page(d, a->smfn, forbidden_gmfn);
2010 break;
2011 case PGT_snapshot:
2012 case PGT_writable_pred:
2013 // these can't hold refs to the forbidden page
2014 break;
2015 default:
2016 BUG();
2019 a = a->next;
2023 return count;
2026 void shadow_drop_references(
2027 struct domain *d, struct page_info *page)
2029 if ( likely(!shadow_mode_refcounts(d)) ||
2030 ((page->u.inuse.type_info & PGT_count_mask) == 0) )
2031 return;
2033 /* XXX This needs more thought... */
2034 printk("%s: needing to call __shadow_remove_all_access for mfn=%lx\n",
2035 __func__, page_to_mfn(page));
2036 printk("Before: mfn=%lx c=%08x t=%" PRtype_info "\n", page_to_mfn(page),
2037 page->count_info, page->u.inuse.type_info);
2039 shadow_lock(d);
2040 __shadow_remove_all_access(d, page_to_mfn(page));
2041 shadow_unlock(d);
2043 printk("After: mfn=%lx c=%08x t=%" PRtype_info "\n", page_to_mfn(page),
2044 page->count_info, page->u.inuse.type_info);
2047 /* XXX Needs more thought. Neither pretty nor fast: a place holder. */
2048 void shadow_sync_and_drop_references(
2049 struct domain *d, struct page_info *page)
2051 if ( likely(!shadow_mode_refcounts(d)) )
2052 return;
2054 shadow_lock(d);
2056 if ( page_out_of_sync(page) )
2057 __shadow_sync_mfn(d, page_to_mfn(page));
2059 __shadow_remove_all_access(d, page_to_mfn(page));
2061 shadow_unlock(d);
2064 void clear_all_shadow_status(struct domain *d)
2066 struct vcpu *v = current;
2068 /*
2069 * Don't clean up while other vcpus are working.
2070 */
2071 if ( v->vcpu_id )
2072 return;
2074 shadow_lock(d);
2076 free_shadow_pages(d);
2077 free_shadow_ht_entries(d);
2078 d->arch.shadow_ht =
2079 xmalloc_array(struct shadow_status, shadow_ht_buckets);
2080 if ( d->arch.shadow_ht == NULL ) {
2081 printk("clear all shadow status:xmalloc fail\n");
2082 domain_crash_synchronous();
2084 memset(d->arch.shadow_ht, 0,
2085 shadow_ht_buckets * sizeof(struct shadow_status));
2087 free_out_of_sync_entries(d);
2089 shadow_unlock(d);
2093 /*
2094 * Local variables:
2095 * mode: C
2096 * c-set-style: "BSD"
2097 * c-basic-offset: 4
2098 * tab-width: 4
2099 * indent-tabs-mode: nil
2100 * End:
2101 */