ia64/xen-unstable: xen/arch/x86/shadow_public.c @ 9776:72f9c751d3ea

Replace &foo[0] with foo where the latter seems cleaner
(which is usually, and particularly when it's an argument
to one of the bitops functions).

Signed-off-by: Keir Fraser <keir@xensource.com>
author kaf24@firebug.cl.cam.ac.uk
date Wed Apr 19 18:32:20 2006 +0100 (2006-04-19)
parents 0267063e050c
children 57e7b96139e7
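The cleanup this changeset describes relies on C's array-to-pointer decay: an array name used as a function argument evaluates to the address of its first element, so &foo[0] and foo pass the same pointer to a bitops-style call. A minimal standalone sketch of that equivalence (set_bit_demo is a stand-in helper invented here for illustration, not Xen's actual set_bit):

    #include <stdio.h>
    #include <string.h>

    /* Stand-in for a bitops-style helper such as Xen's set_bit(): all it
     * needs is a pointer to the first word of the bitmap. */
    static void set_bit_demo(int nr, unsigned long *addr)
    {
        addr[nr / (8 * sizeof(unsigned long))] |=
            1UL << (nr % (8 * sizeof(unsigned long)));
    }

    int main(void)
    {
        unsigned long foo[4];
        memset(foo, 0, sizeof(foo));

        set_bit_demo(3, &foo[0]);  /* old spelling: explicit address of element 0 */
        set_bit_demo(5, foo);      /* new spelling: the array name decays to the same pointer */

        printf("foo[0] = %#lx\n", foo[0]);  /* bits 3 and 5 set: prints 0x28 */
        return 0;
    }

Both calls hand the helper the same address; the second form simply drops the redundant &...[0].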
line source
1 /******************************************************************************
2 * arch/x86/shadow_public.c
3 *
4 * Copyright (c) 2005 Michael A Fetterman
5 * Based on an earlier implementation by Ian Pratt et al
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
23 #include <xen/config.h>
24 #include <xen/types.h>
25 #include <xen/mm.h>
26 #include <xen/domain_page.h>
27 #include <asm/shadow.h>
28 #include <asm/page.h>
29 #include <xen/event.h>
30 #include <xen/sched.h>
31 #include <xen/trace.h>
32 #include <xen/guest_access.h>
33 #include <asm/shadow_64.h>
35 static int alloc_p2m_table(struct domain *d);
36 static void free_p2m_table(struct domain *d);
38 #define SHADOW_MAX_GUEST32(_encoded) ((L1_PAGETABLE_ENTRIES_32 - 1) - ((_encoded) >> 16))
41 int shadow_direct_map_init(struct domain *d)
42 {
43 struct page_info *page;
44 l3_pgentry_t *root;
46 if ( !(page = alloc_domheap_pages(NULL, 0, ALLOC_DOM_DMA)) )
47 return 0;
49 root = map_domain_page(page_to_mfn(page));
50 memset(root, 0, PAGE_SIZE);
51 root[PAE_SHADOW_SELF_ENTRY] = l3e_from_page(page, __PAGE_HYPERVISOR);
53 d->arch.phys_table = mk_pagetable(page_to_maddr(page));
55 unmap_domain_page(root);
56 return 1;
57 }
59 void shadow_direct_map_clean(struct domain *d)
60 {
61 unsigned long mfn;
62 l2_pgentry_t *l2e;
63 l3_pgentry_t *l3e;
64 int i, j;
66 mfn = pagetable_get_pfn(d->arch.phys_table);
68 /*
69 * We may fail very early, before the direct map has been built.
70 */
71 if ( !mfn )
72 return;
74 l3e = (l3_pgentry_t *)map_domain_page(mfn);
76 for ( i = 0; i < PAE_L3_PAGETABLE_ENTRIES; i++ )
77 {
78 if ( l3e_get_flags(l3e[i]) & _PAGE_PRESENT )
79 {
80 l2e = map_domain_page(l3e_get_pfn(l3e[i]));
82 for ( j = 0; j < L2_PAGETABLE_ENTRIES; j++ )
83 {
84 if ( l2e_get_flags(l2e[j]) & _PAGE_PRESENT )
85 free_domheap_page(mfn_to_page(l2e_get_pfn(l2e[j])));
86 }
87 unmap_domain_page(l2e);
88 free_domheap_page(mfn_to_page(l3e_get_pfn(l3e[i])));
89 }
90 }
91 free_domheap_page(mfn_to_page(mfn));
93 unmap_domain_page(l3e);
95 d->arch.phys_table = mk_pagetable(0);
96 }
98 /****************************************************************************/
99 /************* export interface functions ***********************************/
100 /****************************************************************************/
101 void free_shadow_pages(struct domain *d);
103 int shadow_set_guest_paging_levels(struct domain *d, int levels)
104 {
105 struct vcpu *v = current;
107 /*
108 * Need to wait for VCPU0 to complete the on-going shadow ops.
109 */
111 if ( v->vcpu_id )
112 return 1;
114 shadow_lock(d);
116 switch(levels) {
117 #if CONFIG_PAGING_LEVELS == 4
118 case 4:
119 if ( d->arch.ops != &MODE_64_4_HANDLER )
120 d->arch.ops = &MODE_64_4_HANDLER;
121 shadow_unlock(d);
122 return 1;
123 #endif
124 #if CONFIG_PAGING_LEVELS == 3
125 case 3:
126 if ( d->arch.ops != &MODE_64_3_HANDLER )
127 d->arch.ops = &MODE_64_3_HANDLER;
128 shadow_unlock(d);
129 return 1;
130 #endif
131 #if CONFIG_PAGING_LEVELS == 4
132 case 3:
133 if ( d->arch.ops == &MODE_64_2_HANDLER )
134 free_shadow_pages(d);
135 if ( d->arch.ops != &MODE_64_PAE_HANDLER )
136 d->arch.ops = &MODE_64_PAE_HANDLER;
137 shadow_unlock(d);
138 return 1;
139 #endif
140 case 2:
141 #if CONFIG_PAGING_LEVELS == 2
142 if ( d->arch.ops != &MODE_32_2_HANDLER )
143 d->arch.ops = &MODE_32_2_HANDLER;
144 #elif CONFIG_PAGING_LEVELS >= 3
145 if ( d->arch.ops != &MODE_64_2_HANDLER )
146 d->arch.ops = &MODE_64_2_HANDLER;
147 #endif
148 shadow_unlock(d);
149 return 1;
150 default:
151 shadow_unlock(d);
152 return 0;
153 }
154 }
156 void shadow_invlpg(struct vcpu *v, unsigned long va)
157 {
158 struct domain *d = current->domain;
159 d->arch.ops->invlpg(v, va);
160 }
162 int shadow_fault(unsigned long va, struct cpu_user_regs *regs)
163 {
164 struct domain *d = current->domain;
165 return d->arch.ops->fault(va, regs);
166 }
168 void __update_pagetables(struct vcpu *v)
169 {
170 struct domain *d = v->domain;
171 d->arch.ops->update_pagetables(v);
172 }
174 void __shadow_sync_all(struct domain *d)
175 {
176 d->arch.ops->sync_all(d);
177 }
179 int shadow_remove_all_write_access(
180 struct domain *d, unsigned long readonly_gpfn, unsigned long readonly_gmfn)
181 {
182 return d->arch.ops->remove_all_write_access(d, readonly_gpfn, readonly_gmfn);
183 }
185 int shadow_do_update_va_mapping(unsigned long va,
186 l1_pgentry_t val,
187 struct vcpu *v)
188 {
189 struct domain *d = v->domain;
190 return d->arch.ops->do_update_va_mapping(va, val, v);
191 }
193 struct out_of_sync_entry *
194 shadow_mark_mfn_out_of_sync(struct vcpu *v, unsigned long gpfn,
195 unsigned long mfn)
196 {
197 struct domain *d = v->domain;
198 return d->arch.ops->mark_mfn_out_of_sync(v, gpfn, mfn);
199 }
201 /*
202 * Returns 1 if va's shadow mapping is out-of-sync.
203 * Returns 0 otherwise.
204 */
205 int __shadow_out_of_sync(struct vcpu *v, unsigned long va)
206 {
207 struct domain *d = v->domain;
208 return d->arch.ops->is_out_of_sync(v, va);
209 }
211 unsigned long gva_to_gpa(unsigned long gva)
212 {
213 struct domain *d = current->domain;
214 return d->arch.ops->gva_to_gpa(gva);
215 }
216 /****************************************************************************/
217 /****************************************************************************/
218 #if CONFIG_PAGING_LEVELS >= 3
220 static void inline
221 free_shadow_fl1_table(struct domain *d, unsigned long smfn)
222 {
223 l1_pgentry_t *pl1e = map_domain_page(smfn);
224 int i;
226 for (i = 0; i < L1_PAGETABLE_ENTRIES; i++)
227 put_page_from_l1e(pl1e[i], d);
229 unmap_domain_page(pl1e);
230 }
232 /*
233 * Free l2, l3, l4 shadow tables
234 */
236 void free_fake_shadow_l2(struct domain *d,unsigned long smfn);
238 static void inline
239 free_shadow_tables(struct domain *d, unsigned long smfn, u32 level)
240 {
241 pgentry_64_t *ple = map_domain_page(smfn);
242 int i, external = shadow_mode_external(d);
244 #if CONFIG_PAGING_LEVELS >= 3
245 if ( d->arch.ops->guest_paging_levels == PAGING_L2 )
246 {
247 struct page_info *page = mfn_to_page(smfn);
248 for ( i = 0; i < PAE_L3_PAGETABLE_ENTRIES; i++ )
249 {
250 if ( entry_get_flags(ple[i]) & _PAGE_PRESENT )
251 free_fake_shadow_l2(d, entry_get_pfn(ple[i]));
252 }
254 page = mfn_to_page(entry_get_pfn(ple[0]));
255 free_domheap_pages(page, SL2_ORDER);
256 unmap_domain_page(ple);
257 }
258 else
259 #endif
260 {
261 /*
262 * No Xen mappings in external pages
263 */
264 if ( external )
265 {
266 for ( i = 0; i < PAGETABLE_ENTRIES; i++ ) {
267 if ( entry_get_flags(ple[i]) & _PAGE_PRESENT )
268 put_shadow_ref(entry_get_pfn(ple[i]));
269 if (d->arch.ops->guest_paging_levels == PAGING_L3)
270 {
271 #if CONFIG_PAGING_LEVELS == 4
272 if ( i == PAE_L3_PAGETABLE_ENTRIES && level == PAGING_L4 )
273 #elif CONFIG_PAGING_LEVELS == 3
274 if ( i == PAE_L3_PAGETABLE_ENTRIES && level == PAGING_L3 )
275 #endif
276 break;
277 }
278 }
279 }
280 else
281 {
282 for ( i = 0; i < PAGETABLE_ENTRIES; i++ )
283 {
284 /*
285 * List the skip/break conditions to avoid freeing
286 * Xen private mappings.
287 */
288 #if CONFIG_PAGING_LEVELS == 2
289 if ( level == PAGING_L2 && !is_guest_l2_slot(0, i) )
290 continue;
291 #endif
292 #if CONFIG_PAGING_LEVELS == 3
293 if ( level == PAGING_L3 && i == L3_PAGETABLE_ENTRIES )
294 break;
295 if ( level == PAGING_L2 )
296 {
297 struct page_info *page = mfn_to_page(smfn);
298 if ( is_xen_l2_slot(page->u.inuse.type_info, i) )
299 continue;
300 }
301 #endif
302 #if CONFIG_PAGING_LEVELS == 4
303 if ( level == PAGING_L4 && !is_guest_l4_slot(i))
304 continue;
305 #endif
306 if ( entry_get_flags(ple[i]) & _PAGE_PRESENT )
307 put_shadow_ref(entry_get_pfn(ple[i]));
308 }
309 }
310 unmap_domain_page(ple);
311 }
312 }
313 #endif
315 #if CONFIG_PAGING_LEVELS == 4
316 static void alloc_monitor_pagetable(struct vcpu *v)
317 {
318 unsigned long mmfn;
319 l4_pgentry_t *mpl4e;
320 struct page_info *mmfn_info;
321 struct domain *d = v->domain;
323 ASSERT(!pagetable_get_paddr(v->arch.monitor_table)); /* we should only get called once */
325 mmfn_info = alloc_domheap_page(NULL);
326 ASSERT( mmfn_info );
328 mmfn = page_to_mfn(mmfn_info);
329 mpl4e = (l4_pgentry_t *) map_domain_page_global(mmfn);
330 memcpy(mpl4e, idle_pg_table, PAGE_SIZE);
331 mpl4e[l4_table_offset(PERDOMAIN_VIRT_START)] =
332 l4e_from_paddr(__pa(d->arch.mm_perdomain_l3), __PAGE_HYPERVISOR);
334 /* map the phys_to_machine map into the per domain Read-Only MPT space */
336 v->arch.monitor_table = mk_pagetable(mmfn << PAGE_SHIFT);
337 v->arch.monitor_vtable = (l2_pgentry_t *) mpl4e;
338 mpl4e[l4_table_offset(RO_MPT_VIRT_START)] = l4e_empty();
340 if ( v->vcpu_id == 0 )
341 alloc_p2m_table(d);
342 else
343 {
344 unsigned long mfn;
346 mfn = pagetable_get_pfn(d->vcpu[0]->arch.monitor_table);
347 if ( mfn )
348 {
349 l4_pgentry_t *l4tab;
351 l4tab = map_domain_page(mfn);
353 mpl4e[l4_table_offset(RO_MPT_VIRT_START)] =
354 l4tab[l4_table_offset(RO_MPT_VIRT_START)];
356 unmap_domain_page(l4tab);
357 }
358 }
359 }
361 void free_monitor_pagetable(struct vcpu *v)
362 {
363 unsigned long mfn;
365 /*
366 * First free the p2m table, which is allocated by vcpu0.
367 */
368 if ( v->vcpu_id == 0 )
369 free_p2m_table(v->domain);
371 /*
372 * Then free monitor_table.
373 */
374 mfn = pagetable_get_pfn(v->arch.monitor_table);
375 unmap_domain_page_global(v->arch.monitor_vtable);
376 free_domheap_page(mfn_to_page(mfn));
378 v->arch.monitor_table = mk_pagetable(0);
379 v->arch.monitor_vtable = 0;
380 }
381 #elif CONFIG_PAGING_LEVELS == 3
382 static void alloc_monitor_pagetable(struct vcpu *v)
383 {
384 unsigned long m2mfn, m3mfn;
385 l2_pgentry_t *mpl2e;
386 l3_pgentry_t *mpl3e;
387 struct page_info *m2mfn_info, *m3mfn_info, *page;
388 struct domain *d = v->domain;
389 int i;
391 ASSERT(!pagetable_get_paddr(v->arch.monitor_table)); /* we should only get called once */
393 m3mfn_info = alloc_domheap_pages(NULL, 0, ALLOC_DOM_DMA);
394 ASSERT( m3mfn_info );
396 m3mfn = page_to_mfn(m3mfn_info);
397 mpl3e = (l3_pgentry_t *) map_domain_page_global(m3mfn);
398 memset(mpl3e, 0, L3_PAGETABLE_ENTRIES * sizeof(l3_pgentry_t));
400 m2mfn_info = alloc_domheap_page(NULL);
401 ASSERT( m2mfn_info );
403 m2mfn = page_to_mfn(m2mfn_info);
404 mpl2e = (l2_pgentry_t *) map_domain_page(m2mfn);
405 memset(mpl2e, 0, L2_PAGETABLE_ENTRIES * sizeof(l2_pgentry_t));
407 memcpy(&mpl2e[L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1)],
408 &idle_pg_table_l2[L2_PAGETABLE_FIRST_XEN_SLOT],
409 L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
410 /*
411 * Map L2 page into L3
412 */
413 mpl3e[L3_PAGETABLE_ENTRIES - 1] = l3e_from_pfn(m2mfn, _PAGE_PRESENT);
414 page = l3e_get_page(mpl3e[L3_PAGETABLE_ENTRIES - 1]);
416 for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
417 mpl2e[l2_table_offset(PERDOMAIN_VIRT_START) + i] =
418 l2e_from_page(
419 virt_to_page(d->arch.mm_perdomain_pt) + i,
420 __PAGE_HYPERVISOR);
421 for ( i = 0; i < (LINEARPT_MBYTES >> (L2_PAGETABLE_SHIFT - 20)); i++ )
422 mpl2e[l2_table_offset(LINEAR_PT_VIRT_START) + i] =
423 (l3e_get_flags(mpl3e[i]) & _PAGE_PRESENT) ?
424 l2e_from_pfn(l3e_get_pfn(mpl3e[i]), __PAGE_HYPERVISOR) :
425 l2e_empty();
426 for ( i = 0; i < (MACHPHYS_MBYTES >> (L2_PAGETABLE_SHIFT - 20)); i++ )
427 mpl2e[l2_table_offset(RO_MPT_VIRT_START) + i] = l2e_empty();
429 v->arch.monitor_table = mk_pagetable(m3mfn << PAGE_SHIFT); /* < 4GB */
430 v->arch.monitor_vtable = (l2_pgentry_t *) mpl3e;
432 if ( v->vcpu_id == 0 )
433 alloc_p2m_table(d);
434 else
435 {
436 unsigned long mfn;
438 mfn = pagetable_get_pfn(d->vcpu[0]->arch.monitor_table);
439 if ( mfn )
440 {
441 l3_pgentry_t *l3tab, l3e;
442 l2_pgentry_t *l2tab;
444 l3tab = map_domain_page(mfn);
445 l3e = l3tab[l3_table_offset(RO_MPT_VIRT_START)];
447 /*
448 * NB: when CONFIG_PAGING_LEVELS == 3,
449 * (entry_get_flags(l3e) & _PAGE_PRESENT) is always true here.
450 * alloc_monitor_pagetable should guarantee this.
451 */
452 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
453 BUG();
455 l2tab = map_domain_page(l3e_get_pfn(l3e));
457 /*
458 * Just one l2 slot is used here, so the p2m table is at most 2M,
459 * which covers ((4K * 512)/sizeof(unsigned long)) * 4K = 2G of
460 * guest memory. That should be OK on PAE xen, since Qemu DM can
461 * only map 1.5G of VMX guest memory.
462 */
463 mpl2e[l2_table_offset(RO_MPT_VIRT_START)] =
464 l2tab[l2_table_offset(RO_MPT_VIRT_START)];
466 unmap_domain_page(l2tab);
467 unmap_domain_page(l3tab);
468 }
469 }
471 unmap_domain_page(mpl2e);
472 }
474 void free_monitor_pagetable(struct vcpu *v)
475 {
476 unsigned long m2mfn, m3mfn;
477 /*
478 * First free the p2m table (allocated by vcpu0), then the monitor table.
479 */
480 if ( v->vcpu_id == 0 )
481 free_p2m_table(v->domain);
483 m3mfn = pagetable_get_pfn(v->arch.monitor_table);
484 m2mfn = l2e_get_pfn(v->arch.monitor_vtable[L3_PAGETABLE_ENTRIES - 1]);
486 free_domheap_page(mfn_to_page(m2mfn));
487 unmap_domain_page_global(v->arch.monitor_vtable);
488 free_domheap_page(mfn_to_page(m3mfn));
490 v->arch.monitor_table = mk_pagetable(0);
491 v->arch.monitor_vtable = 0;
492 }
493 #endif
495 static void
496 shadow_free_snapshot(struct domain *d, struct out_of_sync_entry *entry)
497 {
498 void *snapshot;
500 if ( entry->snapshot_mfn == SHADOW_SNAPSHOT_ELSEWHERE )
501 return;
503 // Clear the out_of_sync bit.
504 //
505 clear_bit(_PGC_out_of_sync, &mfn_to_page(entry->gmfn)->count_info);
507 // XXX Need to think about how to protect the domain's
508 // information less expensively.
509 //
510 snapshot = map_domain_page(entry->snapshot_mfn);
511 memset(snapshot, 0, PAGE_SIZE);
512 unmap_domain_page(snapshot);
514 put_shadow_ref(entry->snapshot_mfn);
515 }
517 void
518 release_out_of_sync_entry(struct domain *d, struct out_of_sync_entry *entry)
519 {
520 struct page_info *page;
522 page = mfn_to_page(entry->gmfn);
524 // Decrement ref count of guest & shadow pages
525 //
526 put_page(page);
528 // Only use entries that have low bits clear...
529 //
530 if ( !(entry->writable_pl1e & (sizeof(l1_pgentry_t)-1)) )
531 {
532 put_shadow_ref(entry->writable_pl1e >> PAGE_SHIFT);
533 entry->writable_pl1e = -2;
534 }
535 else
536 ASSERT( entry->writable_pl1e == -1 );
538 // Free the snapshot
539 //
540 shadow_free_snapshot(d, entry);
541 }
543 static void remove_out_of_sync_entries(struct domain *d, unsigned long gmfn)
544 {
545 struct out_of_sync_entry *entry = d->arch.out_of_sync;
546 struct out_of_sync_entry **prev = &d->arch.out_of_sync;
547 struct out_of_sync_entry *found = NULL;
549 // NB: Be careful not to call something that manipulates this list
550 // while walking it. Collect the results into a separate list
551 // first, then walk that list.
552 //
553 while ( entry )
554 {
555 if ( entry->gmfn == gmfn )
556 {
557 // remove from out of sync list
558 *prev = entry->next;
560 // add to found list
561 entry->next = found;
562 found = entry;
564 entry = *prev;
565 continue;
566 }
567 prev = &entry->next;
568 entry = entry->next;
569 }
571 prev = NULL;
572 entry = found;
573 while ( entry )
574 {
575 release_out_of_sync_entry(d, entry);
577 prev = &entry->next;
578 entry = entry->next;
579 }
581 // Add found list to free list
582 if ( prev )
583 {
584 *prev = d->arch.out_of_sync_free;
585 d->arch.out_of_sync_free = found;
586 }
587 }
589 static inline void
590 shadow_demote(struct domain *d, unsigned long gpfn, unsigned long gmfn)
591 {
592 if ( !shadow_mode_refcounts(d) )
593 return;
595 ASSERT(mfn_to_page(gmfn)->count_info & PGC_page_table);
597 if ( shadow_max_pgtable_type(d, gpfn, NULL) == PGT_none )
598 {
599 clear_bit(_PGC_page_table, &mfn_to_page(gmfn)->count_info);
601 if ( page_out_of_sync(mfn_to_page(gmfn)) )
602 {
603 remove_out_of_sync_entries(d, gmfn);
604 }
605 }
606 }
608 static void inline
609 free_shadow_l1_table(struct domain *d, unsigned long smfn)
610 {
611 l1_pgentry_t *pl1e = map_domain_page(smfn);
612 l1_pgentry_t *pl1e_next = 0, *sl1e_p;
613 int i;
614 struct page_info *spage = mfn_to_page(smfn);
615 u32 min_max = spage->tlbflush_timestamp;
616 int min = SHADOW_MIN(min_max);
617 int max;
619 if ( d->arch.ops->guest_paging_levels == PAGING_L2 )
620 {
621 max = SHADOW_MAX_GUEST32(min_max);
622 pl1e_next = map_domain_page(smfn + 1);
623 }
624 else
625 max = SHADOW_MAX(min_max);
627 for ( i = min; i <= max; i++ )
628 {
629 if ( pl1e_next && i >= L1_PAGETABLE_ENTRIES )
630 sl1e_p = &pl1e_next[i - L1_PAGETABLE_ENTRIES];
631 else
632 sl1e_p = &pl1e[i];
634 shadow_put_page_from_l1e(*sl1e_p, d);
635 *sl1e_p = l1e_empty();
636 }
638 unmap_domain_page(pl1e);
639 if ( pl1e_next )
640 unmap_domain_page(pl1e_next);
641 }
643 static void inline
644 free_shadow_hl2_table(struct domain *d, unsigned long smfn)
645 {
646 l1_pgentry_t *hl2 = map_domain_page(smfn);
647 int i, limit;
649 SH_VVLOG("%s: smfn=%lx freed", __func__, smfn);
651 #if CONFIG_PAGING_LEVELS == 2
652 if ( shadow_mode_external(d) )
653 limit = L2_PAGETABLE_ENTRIES;
654 else
655 limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE;
656 #endif
658 for ( i = 0; i < limit; i++ )
659 {
660 if ( l1e_get_flags(hl2[i]) & _PAGE_PRESENT )
661 put_page(mfn_to_page(l1e_get_pfn(hl2[i])));
662 }
664 unmap_domain_page(hl2);
665 }
667 static void inline
668 free_shadow_l2_table(struct domain *d, unsigned long smfn, unsigned int type)
669 {
670 l2_pgentry_t *pl2e = map_domain_page(smfn);
671 int i, external = shadow_mode_external(d);
673 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
674 if ( external || is_guest_l2_slot(type, i) )
675 if ( l2e_get_flags(pl2e[i]) & _PAGE_PRESENT )
676 put_shadow_ref(l2e_get_pfn(pl2e[i]));
678 if ( (PGT_base_page_table == PGT_l2_page_table) &&
679 shadow_mode_translate(d) && !external )
680 {
681 // free the ref to the hl2
682 //
683 put_shadow_ref(l2e_get_pfn(pl2e[l2_table_offset(LINEAR_PT_VIRT_START)]));
684 }
686 unmap_domain_page(pl2e);
687 }
689 void free_fake_shadow_l2(struct domain *d, unsigned long smfn)
690 {
691 pgentry_64_t *ple = map_domain_page(smfn);
692 int i;
694 for ( i = 0; i < PAGETABLE_ENTRIES; i = i + 2 )
695 if ( entry_get_flags(ple[i]) & _PAGE_PRESENT )
696 put_shadow_ref(entry_get_pfn(ple[i]));
698 unmap_domain_page(ple);
699 }
701 void free_shadow_page(unsigned long smfn)
702 {
703 struct page_info *page = mfn_to_page(smfn);
704 unsigned long gmfn = page->u.inuse.type_info & PGT_mfn_mask;
705 struct domain *d = page_get_owner(mfn_to_page(gmfn));
706 unsigned long gpfn = mfn_to_gmfn(d, gmfn);
707 unsigned long type = page->u.inuse.type_info & PGT_type_mask;
709 SH_VVLOG("%s: free'ing smfn=%lx", __func__, smfn);
711 ASSERT( ! IS_INVALID_M2P_ENTRY(gpfn) );
712 #if CONFIG_PAGING_LEVELS >= 4
713 if ( type == PGT_fl1_shadow )
714 {
715 unsigned long mfn;
716 mfn = __shadow_status(d, gpfn, PGT_fl1_shadow);
717 if ( !mfn )
718 gpfn |= (1UL << 63);
719 }
720 if ( d->arch.ops->guest_paging_levels == PAGING_L3 )
721 if ( type == PGT_l4_shadow )
722 gpfn = ((unsigned long)page->tlbflush_timestamp << PGT_pae_idx_shift) | gpfn;
723 #endif
725 delete_shadow_status(d, gpfn, gmfn, type);
727 switch ( type )
728 {
729 case PGT_l1_shadow:
730 perfc_decr(shadow_l1_pages);
731 shadow_demote(d, gpfn, gmfn);
732 free_shadow_l1_table(d, smfn);
733 d->arch.shadow_page_count--;
734 break;
735 #if CONFIG_PAGING_LEVELS == 2
736 case PGT_l2_shadow:
737 perfc_decr(shadow_l2_pages);
738 shadow_demote(d, gpfn, gmfn);
739 free_shadow_l2_table(d, smfn, page->u.inuse.type_info);
740 d->arch.shadow_page_count--;
741 break;
743 case PGT_hl2_shadow:
744 perfc_decr(hl2_table_pages);
745 shadow_demote(d, gpfn, gmfn);
746 free_shadow_hl2_table(d, smfn);
747 d->arch.hl2_page_count--;
748 break;
749 #endif
750 #if CONFIG_PAGING_LEVELS >= 3
751 case PGT_l2_shadow:
752 case PGT_l3_shadow:
753 shadow_demote(d, gpfn, gmfn);
754 free_shadow_tables(d, smfn, shadow_type_to_level(type));
755 d->arch.shadow_page_count--;
756 break;
758 case PGT_l4_shadow:
759 gpfn = gpfn & PGT_mfn_mask;
760 if ( d->arch.ops->guest_paging_levels == PAGING_L3 )
761 {
762 /*
763 * Since a single PDPT page can have multiple PDPs, it's possible
764 * that shadow_demote() has been already called for gmfn.
765 */
766 if ( mfn_is_page_table(gmfn) )
767 shadow_demote(d, gpfn, gmfn);
768 } else
769 shadow_demote(d, gpfn, gmfn);
771 free_shadow_tables(d, smfn, shadow_type_to_level(type));
772 d->arch.shadow_page_count--;
773 break;
775 case PGT_fl1_shadow:
776 free_shadow_fl1_table(d, smfn);
777 d->arch.shadow_page_count--;
778 break;
779 #endif
780 case PGT_snapshot:
781 perfc_decr(snapshot_pages);
782 break;
784 default:
785 printk("Free shadow weird page type mfn=%lx type=%" PRtype_info "\n",
786 page_to_mfn(page), page->u.inuse.type_info);
787 break;
788 }
790 // No TLB flushes are needed the next time this page gets allocated.
791 //
792 page->tlbflush_timestamp = 0;
793 page->u.free.cpumask = CPU_MASK_NONE;
795 if ( type == PGT_l1_shadow )
796 {
797 list_add(&page->list, &d->arch.free_shadow_frames);
798 perfc_incr(free_l1_pages);
799 }
800 else
801 free_domheap_page(page);
802 }
804 static void
805 free_writable_pte_predictions(struct domain *d)
806 {
807 int i;
808 struct shadow_status *x;
810 for ( i = 0; i < shadow_ht_buckets; i++ )
811 {
812 u32 count;
813 unsigned long *gpfn_list;
815 /* Skip empty buckets. */
816 if ( d->arch.shadow_ht[i].gpfn_and_flags == 0 )
817 continue;
819 count = 0;
820 for ( x = &d->arch.shadow_ht[i]; x != NULL; x = x->next )
821 if ( (x->gpfn_and_flags & PGT_type_mask) == PGT_writable_pred )
822 count++;
824 gpfn_list = xmalloc_array(unsigned long, count);
825 count = 0;
826 for ( x = &d->arch.shadow_ht[i]; x != NULL; x = x->next )
827 if ( (x->gpfn_and_flags & PGT_type_mask) == PGT_writable_pred )
828 gpfn_list[count++] = x->gpfn_and_flags & PGT_mfn_mask;
830 while ( count )
831 {
832 count--;
833 delete_shadow_status(d, gpfn_list[count], 0, PGT_writable_pred);
834 }
836 xfree(gpfn_list);
837 }
838 }
840 static void free_shadow_ht_entries(struct domain *d)
841 {
842 struct shadow_status *x, *n;
844 SH_VLOG("freed tables count=%d l1=%d l2=%d",
845 d->arch.shadow_page_count, perfc_value(shadow_l1_pages),
846 perfc_value(shadow_l2_pages));
848 n = d->arch.shadow_ht_extras;
849 while ( (x = n) != NULL )
850 {
851 d->arch.shadow_extras_count--;
852 n = *((struct shadow_status **)(&x[shadow_ht_extra_size]));
853 xfree(x);
854 }
856 d->arch.shadow_ht_extras = NULL;
857 d->arch.shadow_ht_free = NULL;
859 ASSERT(d->arch.shadow_extras_count == 0);
860 SH_LOG("freed extras, now %d", d->arch.shadow_extras_count);
862 if ( d->arch.shadow_dirty_bitmap != NULL )
863 {
864 xfree(d->arch.shadow_dirty_bitmap);
865 d->arch.shadow_dirty_bitmap = 0;
866 d->arch.shadow_dirty_bitmap_size = 0;
867 }
869 xfree(d->arch.shadow_ht);
870 d->arch.shadow_ht = NULL;
871 }
873 static void free_out_of_sync_entries(struct domain *d)
874 {
875 struct out_of_sync_entry *x, *n;
877 n = d->arch.out_of_sync_extras;
878 while ( (x = n) != NULL )
879 {
880 d->arch.out_of_sync_extras_count--;
881 n = *((struct out_of_sync_entry **)(&x[out_of_sync_extra_size]));
882 xfree(x);
883 }
885 d->arch.out_of_sync_extras = NULL;
886 d->arch.out_of_sync_free = NULL;
887 d->arch.out_of_sync = NULL;
889 ASSERT(d->arch.out_of_sync_extras_count == 0);
890 FSH_LOG("freed extra out_of_sync entries, now %d",
891 d->arch.out_of_sync_extras_count);
892 }
894 void free_shadow_pages(struct domain *d)
895 {
896 int i;
897 struct shadow_status *x;
898 struct vcpu *v;
899 struct list_head *list_ent, *tmp;
901 /*
902 * WARNING! The shadow page table must not currently be in use!
903 * e.g., You are expected to have paused the domain and synchronized CR3.
904 */
906 if( !d->arch.shadow_ht ) return;
908 shadow_audit(d, 1);
910 // first, remove any outstanding refs from out_of_sync entries...
911 //
912 free_out_of_sync_state(d);
914 // second, remove any outstanding refs from v->arch.shadow_table
915 // and CR3.
916 //
917 for_each_vcpu(d, v)
918 {
919 if ( pagetable_get_paddr(v->arch.shadow_table) )
920 {
921 put_shadow_ref(pagetable_get_pfn(v->arch.shadow_table));
922 v->arch.shadow_table = mk_pagetable(0);
924 if ( shadow_mode_external(d) )
925 {
926 if ( v->arch.shadow_vtable )
927 unmap_domain_page_global(v->arch.shadow_vtable);
928 v->arch.shadow_vtable = NULL;
929 }
930 }
932 if ( v->arch.monitor_shadow_ref )
933 {
934 put_shadow_ref(v->arch.monitor_shadow_ref);
935 v->arch.monitor_shadow_ref = 0;
936 }
937 }
939 #if CONFIG_PAGING_LEVELS == 2
940 // For external shadows, remove the monitor table's refs
941 //
942 if ( shadow_mode_external(d) )
943 {
944 for_each_vcpu(d, v)
945 {
946 l2_pgentry_t *mpl2e = v->arch.monitor_vtable;
948 if ( mpl2e )
949 {
950 l2_pgentry_t hl2e = mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)];
951 l2_pgentry_t smfn = mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)];
953 if ( l2e_get_flags(hl2e) & _PAGE_PRESENT )
954 {
955 put_shadow_ref(l2e_get_pfn(hl2e));
956 mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)] = l2e_empty();
957 }
958 if ( l2e_get_flags(smfn) & _PAGE_PRESENT )
959 {
960 put_shadow_ref(l2e_get_pfn(smfn));
961 mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] = l2e_empty();
962 }
963 }
964 }
965 }
966 #endif
967 // Now, the only refs to shadow pages that are left are from the shadow
968 // pages themselves. We just unpin the pinned pages, and the rest
969 // should automatically disappear.
970 //
971 // NB: Beware: each explicit or implicit call to free_shadow_page
972 // can/will result in the hash bucket getting rewritten out from
973 // under us... First, collect the list of pinned pages, then
974 // free them.
975 //
976 for ( i = 0; i < shadow_ht_buckets; i++ )
977 {
978 u32 count;
979 unsigned long *mfn_list;
981 /* Skip empty buckets. */
982 if ( d->arch.shadow_ht[i].gpfn_and_flags == 0 )
983 continue;
985 count = 0;
986 for ( x = &d->arch.shadow_ht[i]; x != NULL; x = x->next )
987 if ( MFN_PINNED(x->smfn) )
988 count++;
989 if ( !count )
990 continue;
992 mfn_list = xmalloc_array(unsigned long, count);
993 count = 0;
994 for ( x = &d->arch.shadow_ht[i]; x != NULL; x = x->next )
995 if ( MFN_PINNED(x->smfn) )
996 mfn_list[count++] = x->smfn;
998 while ( count )
999 {
1000 shadow_unpin(mfn_list[--count]);
1002 xfree(mfn_list);
1005 /* Now free the pre-zero'ed pages from the domain. */
1006 list_for_each_safe(list_ent, tmp, &d->arch.free_shadow_frames)
1008 struct page_info *page = list_entry(list_ent, struct page_info, list);
1010 list_del(list_ent);
1011 perfc_decr(free_l1_pages);
1013 if (d->arch.ops->guest_paging_levels == PAGING_L2)
1015 #if CONFIG_PAGING_LEVELS >=3
1016 free_domheap_pages(page, SL1_ORDER);
1017 #else
1018 free_domheap_page(page);
1019 #endif
1021 else
1022 free_domheap_page(page);
1025 shadow_audit(d, 0);
1027 SH_LOG("Free shadow table.");
1030 void __shadow_mode_disable(struct domain *d)
1032 struct vcpu *v;
1033 #ifndef NDEBUG
1034 int i;
1035 #endif
1037 if ( unlikely(!shadow_mode_enabled(d)) )
1038 return;
1040 free_shadow_pages(d);
1041 free_writable_pte_predictions(d);
1043 #ifndef NDEBUG
1044 for ( i = 0; i < shadow_ht_buckets; i++ )
1046 if ( d->arch.shadow_ht[i].gpfn_and_flags != 0 )
1048 printk("%s: d->arch.shadow_ht[%x].gpfn_and_flags=%lx\n",
1049 __FILE__, i, d->arch.shadow_ht[i].gpfn_and_flags);
1050 BUG();
1053 #endif
1055 d->arch.shadow_mode = 0;
1057 free_shadow_ht_entries(d);
1058 free_out_of_sync_entries(d);
1060 for_each_vcpu(d, v)
1061 update_pagetables(v);
1065 int __shadow_mode_enable(struct domain *d, unsigned int mode)
1067 struct vcpu *v;
1068 int new_modes = (mode & ~d->arch.shadow_mode);
1070 // Gotta be adding something to call this function.
1071 ASSERT(new_modes);
1073 // can't take anything away by calling this function.
1074 ASSERT(!(d->arch.shadow_mode & ~mode));
1076 #if defined(CONFIG_PAGING_LEVELS)
1077 if(!shadow_set_guest_paging_levels(d,
1078 CONFIG_PAGING_LEVELS)) {
1079 printk("Unsupported guest paging levels\n");
1080 domain_crash_synchronous(); /* need to take a clean path */
1082 #endif
1084 for_each_vcpu(d, v)
1086 invalidate_shadow_ldt(v);
1088 // We need to set these up for __update_pagetables().
1089 // See the comment there.
1091 /*
1092 * arch.guest_vtable
1093 */
1094 if ( v->arch.guest_vtable &&
1095 (v->arch.guest_vtable != __linear_l2_table) )
1097 unmap_domain_page_global(v->arch.guest_vtable);
1099 if ( (mode & (SHM_translate | SHM_external)) == SHM_translate )
1100 v->arch.guest_vtable = __linear_l2_table;
1101 else
1102 v->arch.guest_vtable = NULL;
1104 /*
1105 * arch.shadow_vtable
1106 */
1107 if ( v->arch.shadow_vtable &&
1108 (v->arch.shadow_vtable != __shadow_linear_l2_table) )
1110 unmap_domain_page_global(v->arch.shadow_vtable);
1112 if ( !(mode & SHM_external) && d->arch.ops->guest_paging_levels == 2)
1113 v->arch.shadow_vtable = __shadow_linear_l2_table;
1114 else
1115 v->arch.shadow_vtable = NULL;
1117 #if CONFIG_PAGING_LEVELS == 2
1118 /*
1119 * arch.hl2_vtable
1120 */
1121 if ( v->arch.hl2_vtable &&
1122 (v->arch.hl2_vtable != __linear_hl2_table) )
1124 unmap_domain_page_global(v->arch.hl2_vtable);
1126 if ( (mode & (SHM_translate | SHM_external)) == SHM_translate )
1127 v->arch.hl2_vtable = __linear_hl2_table;
1128 else
1129 v->arch.hl2_vtable = NULL;
1130 #endif
1131 /*
1132 * arch.monitor_table & arch.monitor_vtable
1133 */
1134 if ( v->arch.monitor_vtable )
1136 free_monitor_pagetable(v);
1138 if ( mode & SHM_external )
1140 alloc_monitor_pagetable(v);
1144 if ( new_modes & SHM_enable )
1146 ASSERT( !d->arch.shadow_ht );
1147 d->arch.shadow_ht = xmalloc_array(struct shadow_status, shadow_ht_buckets);
1148 if ( d->arch.shadow_ht == NULL )
1149 goto nomem;
1151 memset(d->arch.shadow_ht, 0,
1152 shadow_ht_buckets * sizeof(struct shadow_status));
1155 if ( new_modes & SHM_log_dirty )
1157 ASSERT( !d->arch.shadow_dirty_bitmap );
1158 d->arch.shadow_dirty_bitmap_size =
1159 (d->shared_info->arch.max_pfn + 63) & ~63;
1160 d->arch.shadow_dirty_bitmap =
1161 xmalloc_array(unsigned long, d->arch.shadow_dirty_bitmap_size /
1162 (8 * sizeof(unsigned long)));
1163 if ( d->arch.shadow_dirty_bitmap == NULL )
1165 d->arch.shadow_dirty_bitmap_size = 0;
1166 goto nomem;
1168 memset(d->arch.shadow_dirty_bitmap, 0,
1169 d->arch.shadow_dirty_bitmap_size/8);
1172 if ( new_modes & SHM_translate )
1174 if ( !(new_modes & SHM_external) )
1176 ASSERT( !pagetable_get_paddr(d->arch.phys_table) );
1177 if ( !alloc_p2m_table(d) )
1179 printk("alloc_p2m_table failed (out-of-memory?)\n");
1180 goto nomem;
1185 // Get rid of any shadow pages from any previous shadow mode.
1186 //
1187 free_shadow_pages(d);
1189 d->arch.shadow_mode = mode;
1191 if ( shadow_mode_refcounts(d) )
1193 struct list_head *list_ent;
1194 struct page_info *page;
1196 /*
1197 * Tear down its counts by disassembling its page-table-based refcounts
1198 * Also remove CR3's gcount/tcount.
1199 * That leaves things like GDTs and LDTs and external refs intact.
1201 * Most pages will be writable tcount=0.
1202 * Some will still be L1 tcount=0 or L2 tcount=0.
1203 * Maybe some pages will be type none tcount=0.
1204 * Pages granted external writable refs (via grant tables?) will
1205 * still have a non-zero tcount. That's OK.
1207 * gcounts will generally be 1 for PGC_allocated.
1208 * GDTs and LDTs will have additional gcounts.
1209 * Any grant-table based refs will still be in the gcount.
1211 * We attempt to grab writable refs to each page, thus setting its type.
1212 * Immediately put back those type refs.
1214 * Assert that no pages are left with L1/L2/L3/L4 type.
1215 */
1216 audit_adjust_pgtables(d, -1, 1);
1219 for (list_ent = d->page_list.next; list_ent != &d->page_list;
1220 list_ent = page->list.next) {
1222 page = list_entry(list_ent, struct page_info, list);
1223 if ( !get_page_type(page, PGT_writable_page) )
1224 BUG();
1225 put_page_type(page);
1226 /*
1227 * We use tlbflush_timestamp as a back pointer to the smfn, and need to
1228 * clean it up.
1229 */
1230 if (shadow_mode_external(d))
1231 page->tlbflush_timestamp = 0;
1234 audit_adjust_pgtables(d, 1, 1);
1238 return 0;
1240 nomem:
1241 if ( (new_modes & SHM_enable) )
1243 xfree(d->arch.shadow_ht);
1244 d->arch.shadow_ht = NULL;
1246 if ( (new_modes & SHM_log_dirty) )
1248 xfree(d->arch.shadow_dirty_bitmap);
1249 d->arch.shadow_dirty_bitmap = NULL;
1252 return -ENOMEM;
1256 int shadow_mode_enable(struct domain *d, unsigned int mode)
1257 {
1258 int rc;
1259 shadow_lock(d);
1260 rc = __shadow_mode_enable(d, mode);
1261 shadow_unlock(d);
1262 return rc;
1263 }
1265 static int shadow_mode_table_op(
1266 struct domain *d, dom0_shadow_control_t *sc)
1268 unsigned int op = sc->op;
1269 int i, rc = 0;
1270 struct vcpu *v;
1272 ASSERT(shadow_lock_is_acquired(d));
1274 SH_VLOG("shadow mode table op %lx %lx count %d",
1275 (unsigned long)pagetable_get_pfn(d->vcpu[0]->arch.guest_table), /* XXX SMP */
1276 (unsigned long)pagetable_get_pfn(d->vcpu[0]->arch.shadow_table), /* XXX SMP */
1277 d->arch.shadow_page_count);
1279 shadow_audit(d, 1);
1281 switch ( op )
1283 case DOM0_SHADOW_CONTROL_OP_FLUSH:
1284 free_shadow_pages(d);
1286 d->arch.shadow_fault_count = 0;
1287 d->arch.shadow_dirty_count = 0;
1289 break;
1291 case DOM0_SHADOW_CONTROL_OP_CLEAN:
1292 free_shadow_pages(d);
1294 sc->stats.fault_count = d->arch.shadow_fault_count;
1295 sc->stats.dirty_count = d->arch.shadow_dirty_count;
1297 d->arch.shadow_fault_count = 0;
1298 d->arch.shadow_dirty_count = 0;
1300 if ( guest_handle_is_null(sc->dirty_bitmap) ||
1301 (d->arch.shadow_dirty_bitmap == NULL) )
1303 rc = -EINVAL;
1304 break;
1307 if ( sc->pages > d->arch.shadow_dirty_bitmap_size )
1308 sc->pages = d->arch.shadow_dirty_bitmap_size;
1310 #define chunk (8*1024) /* Transfer and clear in 1kB chunks for L1 cache. */
1311 for ( i = 0; i < sc->pages; i += chunk )
1313 int bytes = ((((sc->pages - i) > chunk) ?
1314 chunk : (sc->pages - i)) + 7) / 8;
1316 if ( copy_to_guest_offset(
1317 sc->dirty_bitmap, i/(8*sizeof(unsigned long)),
1318 d->arch.shadow_dirty_bitmap +(i/(8*sizeof(unsigned long))),
1319 (bytes+sizeof(unsigned long)-1) / sizeof(unsigned long)) )
1321 rc = -EINVAL;
1322 break;
1324 memset(
1325 d->arch.shadow_dirty_bitmap + (i/(8*sizeof(unsigned long))),
1326 0, bytes);
1329 break;
1331 case DOM0_SHADOW_CONTROL_OP_PEEK:
1332 sc->stats.fault_count = d->arch.shadow_fault_count;
1333 sc->stats.dirty_count = d->arch.shadow_dirty_count;
1335 if ( guest_handle_is_null(sc->dirty_bitmap) ||
1336 (d->arch.shadow_dirty_bitmap == NULL) )
1338 rc = -EINVAL;
1339 break;
1342 if ( sc->pages > d->arch.shadow_dirty_bitmap_size )
1343 sc->pages = d->arch.shadow_dirty_bitmap_size;
1345 if ( copy_to_guest(sc->dirty_bitmap,
1346 d->arch.shadow_dirty_bitmap,
1347 (((sc->pages+7)/8)+sizeof(unsigned long)-1) /
1348 sizeof(unsigned long)) )
1350 rc = -EINVAL;
1351 break;
1354 break;
1356 default:
1357 rc = -EINVAL;
1358 break;
1361 SH_VLOG("shadow mode table op : page count %d", d->arch.shadow_page_count);
1362 shadow_audit(d, 1);
1364 for_each_vcpu(d,v)
1365 __update_pagetables(v);
1367 return rc;
1370 int shadow_mode_control(struct domain *d, dom0_shadow_control_t *sc)
1372 unsigned int op = sc->op;
1373 int rc = 0;
1374 struct vcpu *v;
1376 if ( unlikely(d == current->domain) )
1378 DPRINTK("Don't try to do a shadow op on yourself!\n");
1379 return -EINVAL;
1382 domain_pause(d);
1384 shadow_lock(d);
1386 switch ( op )
1388 case DOM0_SHADOW_CONTROL_OP_OFF:
1389 if ( shadow_mode_enabled(d) )
1391 __shadow_sync_all(d);
1392 __shadow_mode_disable(d);
1394 break;
1396 case DOM0_SHADOW_CONTROL_OP_ENABLE_TEST:
1397 free_shadow_pages(d);
1398 rc = __shadow_mode_enable(d, SHM_enable);
1399 break;
1401 case DOM0_SHADOW_CONTROL_OP_ENABLE_LOGDIRTY:
1402 free_shadow_pages(d);
1403 rc = __shadow_mode_enable(
1404 d, d->arch.shadow_mode|SHM_enable|SHM_log_dirty);
1405 break;
1407 case DOM0_SHADOW_CONTROL_OP_ENABLE_TRANSLATE:
1408 free_shadow_pages(d);
1409 rc = __shadow_mode_enable(
1410 d, d->arch.shadow_mode|SHM_enable|SHM_refcounts|SHM_translate);
1411 break;
1413 default:
1414 rc = shadow_mode_enabled(d) ? shadow_mode_table_op(d, sc) : -EINVAL;
1415 break;
1418 shadow_unlock(d);
1420 for_each_vcpu(d,v)
1421 update_pagetables(v);
1423 domain_unpause(d);
1425 return rc;
1428 void shadow_mode_init(void)
1429 {
1430 }
1432 int _shadow_mode_refcounts(struct domain *d)
1433 {
1434 return shadow_mode_refcounts(d);
1435 }
1437 static int
1438 map_p2m_entry(pgentry_64_t *top_tab, unsigned long va,
1439 unsigned long gpfn, unsigned long mfn)
1441 #if CONFIG_PAGING_LEVELS >= 4
1442 pgentry_64_t l4e = { 0 };
1443 pgentry_64_t *l3tab = NULL;
1444 #endif
1445 #if CONFIG_PAGING_LEVELS >= 3
1446 pgentry_64_t l3e = { 0 };
1447 #endif
1448 l2_pgentry_t *l2tab = NULL;
1449 l1_pgentry_t *l1tab = NULL;
1450 unsigned long *l0tab = NULL;
1451 l2_pgentry_t l2e = { 0 };
1452 l1_pgentry_t l1e = { 0 };
1453 struct page_info *page;
1455 #if CONFIG_PAGING_LEVELS >= 4
1456 l4e = top_tab[l4_table_offset(va)];
1457 if ( !(entry_get_flags(l4e) & _PAGE_PRESENT) )
1459 page = alloc_domheap_page(NULL);
1460 if ( !page )
1461 goto nomem;
1463 l3tab = map_domain_page(page_to_mfn(page));
1464 memset(l3tab, 0, PAGE_SIZE);
1465 l4e = top_tab[l4_table_offset(va)] =
1466 entry_from_page(page, __PAGE_HYPERVISOR);
1468 else
1469 l3tab = map_domain_page(entry_get_pfn(l4e));
1471 l3e = l3tab[l3_table_offset(va)];
1472 if ( !(entry_get_flags(l3e) & _PAGE_PRESENT) )
1474 page = alloc_domheap_page(NULL);
1475 if ( !page )
1476 goto nomem;
1478 l2tab = map_domain_page(page_to_mfn(page));
1479 memset(l2tab, 0, PAGE_SIZE);
1480 l3e = l3tab[l3_table_offset(va)] =
1481 entry_from_page(page, __PAGE_HYPERVISOR);
1483 else
1484 l2tab = map_domain_page(entry_get_pfn(l3e));
1486 unmap_domain_page(l3tab);
1487 #else
1488 l3e = top_tab[l3_table_offset(va)];
1490 /*
1491 * NB: when CONFIG_PAGING_LEVELS == 3,
1492 * (entry_get_flags(l3e) & _PAGE_PRESENT) is always true here.
1493 * alloc_monitor_pagetable should guarantee this.
1494 */
1495 if ( !(entry_get_flags(l3e) & _PAGE_PRESENT) )
1496 BUG();
1498 l2tab = map_domain_page(entry_get_pfn(l3e));
1499 #endif
1501 l2e = l2tab[l2_table_offset(va)];
1502 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
1504 page = alloc_domheap_page(NULL);
1505 if ( !page )
1506 goto nomem;
1508 l1tab = map_domain_page(page_to_mfn(page));
1509 memset(l1tab, 0, PAGE_SIZE);
1510 l2e = l2tab[l2_table_offset(va)] =
1511 l2e_from_page(page, __PAGE_HYPERVISOR);
1513 else
1514 l1tab = map_domain_page(l2e_get_pfn(l2e));
1516 unmap_domain_page(l2tab);
1518 l1e = l1tab[l1_table_offset(va)];
1519 if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) )
1521 page = alloc_domheap_page(NULL);
1522 if ( !page )
1523 goto nomem;
1525 l0tab = map_domain_page(page_to_mfn(page));
1526 memset(l0tab, 0, PAGE_SIZE);
1527 l1e = l1tab[l1_table_offset(va)] =
1528 l1e_from_page(page, __PAGE_HYPERVISOR);
1530 else
1531 l0tab = map_domain_page(l1e_get_pfn(l1e));
1533 unmap_domain_page(l1tab);
1535 l0tab[gpfn & ((PAGE_SIZE / sizeof (mfn)) - 1) ] = mfn;
1537 unmap_domain_page(l0tab);
1539 return 1;
1541 nomem:
1542 return 0;
1545 int
1546 set_p2m_entry(struct domain *d, unsigned long gpfn, unsigned long mfn,
1547 struct domain_mmap_cache *l2cache,
1548 struct domain_mmap_cache *l1cache)
1550 unsigned long tabmfn = pagetable_get_pfn(d->vcpu[0]->arch.monitor_table);
1551 unsigned long va = RO_MPT_VIRT_START + (gpfn * sizeof(unsigned long));
1552 pgentry_64_t *top_tab;
1553 int error;
1555 ASSERT(tabmfn != 0);
1556 ASSERT(shadow_lock_is_acquired(d));
1558 top_tab = map_domain_page_with_cache(tabmfn, l2cache);
1560 if ( !(error = map_p2m_entry(top_tab, va, gpfn, mfn)) )
1561 domain_crash(d);
1563 unmap_domain_page_with_cache(top_tab, l2cache);
1565 return error;
1568 static int
1569 alloc_p2m_table(struct domain *d)
1571 struct list_head *list_ent;
1572 unsigned long va = RO_MPT_VIRT_START; /* phys_to_machine_mapping */
1573 pgentry_64_t *top_tab = NULL;
1574 unsigned long mfn;
1575 int gpfn, error = 0;
1577 ASSERT( pagetable_get_pfn(d->vcpu[0]->arch.monitor_table) );
1579 top_tab = map_domain_page(
1580 pagetable_get_pfn(d->vcpu[0]->arch.monitor_table));
1582 list_ent = d->page_list.next;
1584 for ( gpfn = 0; list_ent != &d->page_list; gpfn++ )
1586 struct page_info *page;
1588 page = list_entry(list_ent, struct page_info, list);
1589 mfn = page_to_mfn(page);
1591 if ( !(error = map_p2m_entry(top_tab, va, gpfn, mfn)) )
1593 domain_crash(d);
1594 break;
1597 list_ent = frame_table[mfn].list.next;
1598 va += sizeof(mfn);
1601 unmap_domain_page(top_tab);
1603 return error;
1606 #if CONFIG_PAGING_LEVELS >= 3
1607 static void
1608 free_p2m_table(struct domain *d)
1610 unsigned long va;
1611 l1_pgentry_t *l1tab;
1612 l1_pgentry_t l1e;
1613 l2_pgentry_t *l2tab;
1614 l2_pgentry_t l2e;
1615 #if CONFIG_PAGING_LEVELS >= 3
1616 l3_pgentry_t *l3tab;
1617 l3_pgentry_t l3e;
1618 #endif
1619 #if CONFIG_PAGING_LEVELS == 4
1620 int i3;
1621 l4_pgentry_t *l4tab;
1622 l4_pgentry_t l4e;
1623 #endif
1625 ASSERT( pagetable_get_pfn(d->vcpu[0]->arch.monitor_table) );
1627 #if CONFIG_PAGING_LEVELS == 4
1628 l4tab = map_domain_page(
1629 pagetable_get_pfn(d->vcpu[0]->arch.monitor_table));
1630 #endif
1631 #if CONFIG_PAGING_LEVELS == 3
1632 l3tab = map_domain_page(
1633 pagetable_get_pfn(d->vcpu[0]->arch.monitor_table));
1635 l3e = l3tab[l3_table_offset(RO_MPT_VIRT_START)];
1637 /*
1638 * NB: when CONFIG_PAGING_LEVELS == 3,
1639 * (entry_get_flags(l3e) & _PAGE_PRESENT) is always true here.
1640 * alloc_monitor_pagetable should guarantee this.
1641 */
1642 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
1643 BUG();
1645 l2tab = map_domain_page(l3e_get_pfn(l3e));
1646 #endif
1648 for ( va = RO_MPT_VIRT_START; va < RO_MPT_VIRT_END; )
1650 #if CONFIG_PAGING_LEVELS == 4
1651 l4e = l4tab[l4_table_offset(va)];
1653 if ( l4e_get_flags(l4e) & _PAGE_PRESENT )
1655 l3tab = map_domain_page(l4e_get_pfn(l4e));
1657 for ( i3 = 0; i3 < L3_PAGETABLE_ENTRIES; i3++ )
1659 l3e = l3tab[l3_table_offset(va)];
1661 if ( l3e_get_flags(l3e) & _PAGE_PRESENT )
1663 int i2;
1665 l2tab = map_domain_page(l3e_get_pfn(l3e));
1667 for ( i2 = 0; i2 < L2_PAGETABLE_ENTRIES; i2++ )
1669 #endif
1670 l2e = l2tab[l2_table_offset(va)];
1672 if ( l2e_get_flags(l2e) & _PAGE_PRESENT )
1674 int i1;
1676 l1tab = map_domain_page(l2e_get_pfn(l2e));
1678 /*
1679 * unsigned long phys_to_machine_mapping[]
1680 */
1681 for ( i1 = 0; i1 < L1_PAGETABLE_ENTRIES; i1++ )
1683 l1e = l1tab[l1_table_offset(va)];
1685 if ( l1e_get_flags(l1e) & _PAGE_PRESENT )
1686 free_domheap_page(mfn_to_page(l1e_get_pfn(l1e)));
1688 va += PAGE_SIZE;
1690 unmap_domain_page(l1tab);
1691 free_domheap_page(mfn_to_page(l2e_get_pfn(l2e)));
1693 else
1694 va += PAGE_SIZE * L1_PAGETABLE_ENTRIES;
1696 #if CONFIG_PAGING_LEVELS == 4
1698 unmap_domain_page(l2tab);
1699 free_domheap_page(mfn_to_page(l3e_get_pfn(l3e)));
1701 else
1702 va += PAGE_SIZE * L1_PAGETABLE_ENTRIES * L2_PAGETABLE_ENTRIES;
1704 unmap_domain_page(l3tab);
1705 free_domheap_page(mfn_to_page(l4e_get_pfn(l4e)));
1707 else
1708 va += PAGE_SIZE *
1709 L1_PAGETABLE_ENTRIES * L2_PAGETABLE_ENTRIES * L3_PAGETABLE_ENTRIES;
1710 #endif
1713 #if CONFIG_PAGING_LEVELS == 4
1714 unmap_domain_page(l4tab);
1715 #endif
1716 #if CONFIG_PAGING_LEVELS == 3
1717 unmap_domain_page(l3tab);
1718 #endif
1720 #endif
1722 void shadow_l1_normal_pt_update(
1723 struct domain *d,
1724 paddr_t pa, l1_pgentry_t gpte,
1725 struct domain_mmap_cache *cache)
1727 unsigned long sl1mfn;
1728 l1_pgentry_t *spl1e, spte;
1730 shadow_lock(d);
1732 sl1mfn = __shadow_status(current->domain, pa >> PAGE_SHIFT, PGT_l1_shadow);
1733 if ( sl1mfn )
1735 SH_VVLOG("shadow_l1_normal_pt_update pa=%p, gpde=%" PRIpte,
1736 (void *)pa, l1e_get_intpte(gpte));
1737 l1pte_propagate_from_guest(current->domain, gpte, &spte);
1739 spl1e = map_domain_page_with_cache(sl1mfn, cache);
1740 spl1e[(pa & ~PAGE_MASK) / sizeof(l1_pgentry_t)] = spte;
1741 unmap_domain_page_with_cache(spl1e, cache);
1744 shadow_unlock(d);
1747 void shadow_l2_normal_pt_update(
1748 struct domain *d,
1749 paddr_t pa, l2_pgentry_t gpde,
1750 struct domain_mmap_cache *cache)
1752 unsigned long sl2mfn;
1753 l2_pgentry_t *spl2e;
1755 shadow_lock(d);
1757 sl2mfn = __shadow_status(current->domain, pa >> PAGE_SHIFT, PGT_l2_shadow);
1758 if ( sl2mfn )
1760 SH_VVLOG("shadow_l2_normal_pt_update pa=%p, gpde=%" PRIpte,
1761 (void *)pa, l2e_get_intpte(gpde));
1762 spl2e = map_domain_page_with_cache(sl2mfn, cache);
1763 validate_pde_change(d, gpde,
1764 &spl2e[(pa & ~PAGE_MASK) / sizeof(l2_pgentry_t)]);
1765 unmap_domain_page_with_cache(spl2e, cache);
1768 shadow_unlock(d);
1771 #if CONFIG_PAGING_LEVELS >= 3
1772 void shadow_l3_normal_pt_update(
1773 struct domain *d,
1774 paddr_t pa, l3_pgentry_t l3e,
1775 struct domain_mmap_cache *cache)
1777 unsigned long sl3mfn;
1778 pgentry_64_t *spl3e;
1780 shadow_lock(d);
1782 sl3mfn = __shadow_status(current->domain, pa >> PAGE_SHIFT, PGT_l3_shadow);
1783 if ( sl3mfn )
1785 SH_VVLOG("shadow_l3_normal_pt_update pa=%p, l3e=%" PRIpte,
1786 (void *)pa, l3e_get_intpte(l3e));
1787 spl3e = (pgentry_64_t *) map_domain_page_with_cache(sl3mfn, cache);
1788 validate_entry_change(d, (pgentry_64_t *) &l3e,
1789 &spl3e[(pa & ~PAGE_MASK) / sizeof(l3_pgentry_t)],
1790 shadow_type_to_level(PGT_l3_shadow));
1791 unmap_domain_page_with_cache(spl3e, cache);
1794 shadow_unlock(d);
1796 #endif
1798 #if CONFIG_PAGING_LEVELS >= 4
1799 void shadow_l4_normal_pt_update(
1800 struct domain *d,
1801 paddr_t pa, l4_pgentry_t l4e,
1802 struct domain_mmap_cache *cache)
1804 unsigned long sl4mfn;
1805 pgentry_64_t *spl4e;
1807 shadow_lock(d);
1809 sl4mfn = __shadow_status(current->domain, pa >> PAGE_SHIFT, PGT_l4_shadow);
1810 if ( sl4mfn )
1812 SH_VVLOG("shadow_l4_normal_pt_update pa=%p, l4e=%" PRIpte,
1813 (void *)pa, l4e_get_intpte(l4e));
1814 spl4e = (pgentry_64_t *)map_domain_page_with_cache(sl4mfn, cache);
1815 validate_entry_change(d, (pgentry_64_t *)&l4e,
1816 &spl4e[(pa & ~PAGE_MASK) / sizeof(l4_pgentry_t)],
1817 shadow_type_to_level(PGT_l4_shadow));
1818 unmap_domain_page_with_cache(spl4e, cache);
1821 shadow_unlock(d);
1823 #endif
1825 static void
1826 translate_l1pgtable(struct domain *d, l1_pgentry_t *p2m, unsigned long l1mfn)
1828 int i;
1829 l1_pgentry_t *l1;
1831 l1 = map_domain_page(l1mfn);
1832 for (i = 0; i < L1_PAGETABLE_ENTRIES; i++)
1834 if ( is_guest_l1_slot(i) &&
1835 (l1e_get_flags(l1[i]) & _PAGE_PRESENT) )
1837 unsigned long mfn = l1e_get_pfn(l1[i]);
1838 unsigned long gpfn = mfn_to_gmfn(d, mfn);
1839 ASSERT(l1e_get_pfn(p2m[gpfn]) == mfn);
1840 l1[i] = l1e_from_pfn(gpfn, l1e_get_flags(l1[i]));
1843 unmap_domain_page(l1);
1846 // This is not general enough to handle arbitrary pagetables
1847 // with shared L1 pages, etc., but it is sufficient for bringing
1848 // up dom0.
1849 //
1850 void
1851 translate_l2pgtable(struct domain *d, l1_pgentry_t *p2m, unsigned long l2mfn,
1852 unsigned int type)
1854 int i;
1855 l2_pgentry_t *l2;
1857 ASSERT(shadow_mode_translate(d) && !shadow_mode_external(d));
1859 l2 = map_domain_page(l2mfn);
1860 for (i = 0; i < L2_PAGETABLE_ENTRIES; i++)
1862 if ( is_guest_l2_slot(type, i) &&
1863 (l2e_get_flags(l2[i]) & _PAGE_PRESENT) )
1865 unsigned long mfn = l2e_get_pfn(l2[i]);
1866 unsigned long gpfn = mfn_to_gmfn(d, mfn);
1867 ASSERT(l1e_get_pfn(p2m[gpfn]) == mfn);
1868 l2[i] = l2e_from_pfn(gpfn, l2e_get_flags(l2[i]));
1869 translate_l1pgtable(d, p2m, mfn);
1872 unmap_domain_page(l2);
1875 void
1876 remove_shadow(struct domain *d, unsigned long gpfn, u32 stype)
1877 {
1878 unsigned long smfn;
1880 shadow_lock(d);
1882 while ( stype >= PGT_l1_shadow )
1883 {
1884 smfn = __shadow_status(d, gpfn, stype);
1885 if ( smfn && MFN_PINNED(smfn) )
1886 shadow_unpin(smfn);
1887 stype -= PGT_l1_shadow;
1888 }
1890 shadow_unlock(d);
1891 }
1893 unsigned long
1894 get_mfn_from_gpfn_foreign(struct domain *d, unsigned long gpfn)
1896 unsigned long va, tabpfn;
1897 l1_pgentry_t *l1, l1e;
1898 l2_pgentry_t *l2, l2e;
1899 #if CONFIG_PAGING_LEVELS >= 4
1900 pgentry_64_t *l4 = NULL;
1901 pgentry_64_t l4e = { 0 };
1902 #endif
1903 pgentry_64_t *l3 = NULL;
1904 pgentry_64_t l3e = { 0 };
1905 unsigned long *l0tab = NULL;
1906 unsigned long mfn;
1908 ASSERT(shadow_mode_translate(d));
1910 perfc_incrc(get_mfn_from_gpfn_foreign);
1912 va = RO_MPT_VIRT_START + (gpfn * sizeof(mfn));
1914 tabpfn = pagetable_get_pfn(d->vcpu[0]->arch.monitor_table);
1915 if ( !tabpfn )
1916 return INVALID_MFN;
1918 #if CONFIG_PAGING_LEVELS >= 4
1919 l4 = map_domain_page(tabpfn);
1920 l4e = l4[l4_table_offset(va)];
1921 unmap_domain_page(l4);
1922 if ( !(entry_get_flags(l4e) & _PAGE_PRESENT) )
1923 return INVALID_MFN;
1925 l3 = map_domain_page(entry_get_pfn(l4e));
1926 #else
1927 l3 = map_domain_page(tabpfn);
1928 #endif
1929 l3e = l3[l3_table_offset(va)];
1930 unmap_domain_page(l3);
1931 if ( !(entry_get_flags(l3e) & _PAGE_PRESENT) )
1932 return INVALID_MFN;
1933 l2 = map_domain_page(entry_get_pfn(l3e));
1934 l2e = l2[l2_table_offset(va)];
1935 unmap_domain_page(l2);
1936 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
1937 return INVALID_MFN;
1939 l1 = map_domain_page(l2e_get_pfn(l2e));
1940 l1e = l1[l1_table_offset(va)];
1941 unmap_domain_page(l1);
1942 if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) )
1943 return INVALID_MFN;
1945 l0tab = map_domain_page(l1e_get_pfn(l1e));
1946 mfn = l0tab[gpfn & ((PAGE_SIZE / sizeof (mfn)) - 1)];
1947 unmap_domain_page(l0tab);
1948 return mfn;
1951 static u32 remove_all_access_in_page(
1952 struct domain *d, unsigned long l1mfn, unsigned long forbidden_gmfn)
1954 l1_pgentry_t *pl1e = map_domain_page(l1mfn);
1955 l1_pgentry_t match, ol2e;
1956 unsigned long flags = _PAGE_PRESENT;
1957 int i;
1958 u32 count = 0;
1959 int is_l1_shadow =
1960 ((mfn_to_page(l1mfn)->u.inuse.type_info & PGT_type_mask) ==
1961 PGT_l1_shadow);
1963 match = l1e_from_pfn(forbidden_gmfn, flags);
1965 for (i = 0; i < L1_PAGETABLE_ENTRIES; i++)
1967 if ( l1e_has_changed(pl1e[i], match, flags) )
1968 continue;
1970 ol2e = pl1e[i];
1971 pl1e[i] = l1e_empty();
1972 count++;
1974 if ( is_l1_shadow )
1975 shadow_put_page_from_l1e(ol2e, d);
1976 else /* must be an hl2 page */
1977 put_page(mfn_to_page(forbidden_gmfn));
1980 unmap_domain_page(pl1e);
1982 return count;
1985 static u32 __shadow_remove_all_access(struct domain *d, unsigned long forbidden_gmfn)
1987 int i;
1988 struct shadow_status *a;
1989 u32 count = 0;
1991 if ( unlikely(!shadow_mode_enabled(d)) )
1992 return 0;
1994 ASSERT(shadow_lock_is_acquired(d));
1995 perfc_incrc(remove_all_access);
1997 for (i = 0; i < shadow_ht_buckets; i++)
1999 a = &d->arch.shadow_ht[i];
2000 while ( a && a->gpfn_and_flags )
2002 switch (a->gpfn_and_flags & PGT_type_mask)
2004 case PGT_l1_shadow:
2005 case PGT_l2_shadow:
2006 case PGT_l3_shadow:
2007 case PGT_l4_shadow:
2008 case PGT_hl2_shadow:
2009 count += remove_all_access_in_page(d, a->smfn, forbidden_gmfn);
2010 break;
2011 case PGT_snapshot:
2012 case PGT_writable_pred:
2013 // these can't hold refs to the forbidden page
2014 break;
2015 default:
2016 BUG();
2019 a = a->next;
2023 return count;
2026 void shadow_drop_references(
2027 struct domain *d, struct page_info *page)
2029 if ( likely(!shadow_mode_refcounts(d)) ||
2030 ((page->u.inuse.type_info & PGT_count_mask) == 0) )
2031 return;
2033 /* XXX This needs more thought... */
2034 printk("%s: needing to call __shadow_remove_all_access for mfn=%lx\n",
2035 __func__, page_to_mfn(page));
2036 printk("Before: mfn=%lx c=%08x t=%" PRtype_info "\n", page_to_mfn(page),
2037 page->count_info, page->u.inuse.type_info);
2039 shadow_lock(d);
2040 __shadow_remove_all_access(d, page_to_mfn(page));
2041 shadow_unlock(d);
2043 printk("After: mfn=%lx c=%08x t=%" PRtype_info "\n", page_to_mfn(page),
2044 page->count_info, page->u.inuse.type_info);
2047 /* XXX Needs more thought. Neither pretty nor fast: a place holder. */
2048 void shadow_sync_and_drop_references(
2049 struct domain *d, struct page_info *page)
2051 if ( likely(!shadow_mode_refcounts(d)) )
2052 return;
2054 shadow_lock(d);
2056 if ( page_out_of_sync(page) )
2057 __shadow_sync_mfn(d, page_to_mfn(page));
2059 __shadow_remove_all_access(d, page_to_mfn(page));
2061 shadow_unlock(d);
2064 void clear_all_shadow_status(struct domain *d)
2065 {
2066 struct vcpu *v = current;
2068 /*
2069 * Don't clean up while other vcpus are working.
2070 */
2071 if ( v->vcpu_id )
2072 return;
2074 shadow_lock(d);
2076 free_shadow_pages(d);
2077 free_shadow_ht_entries(d);
2078 d->arch.shadow_ht =
2079 xmalloc_array(struct shadow_status, shadow_ht_buckets);
2080 if ( d->arch.shadow_ht == NULL ) {
2081 printk("clear all shadow status:xmalloc fail\n");
2082 domain_crash_synchronous();
2083 }
2084 memset(d->arch.shadow_ht, 0,
2085 shadow_ht_buckets * sizeof(struct shadow_status));
2087 free_out_of_sync_entries(d);
2089 shadow_unlock(d);
2090 }
2093 /*
2094 * Local variables:
2095 * mode: C
2096 * c-set-style: "BSD"
2097 * c-basic-offset: 4
2098 * tab-width: 4
2099 * indent-tabs-mode: nil
2100 * End:
2101 */