Repository: ia64/xen-unstable
File:       xen/arch/x86/mm.c @ 10499:69f7e0ea2985

[XEN] Fix PAE PDPT shadowing coherence.
Signed-off-by: Keir Fraser <keir@xensource.com>

Author:   kaf24@firebug.cl.cam.ac.uk
Date:     Tue Jun 20 17:04:13 2006 +0100 (2006-06-20)
Parents:  46e853c34a2e
Children: af9809f51f81
1 /******************************************************************************
2 * arch/x86/mm.c
3 *
4 * Copyright (c) 2002-2005 K A Fraser
5 * Copyright (c) 2004 Christian Limpach
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
22 /*
23 * A description of the x86 page table API:
24 *
25 * Domains trap to do_mmu_update with a list of update requests.
26 * This is a list of (ptr, val) pairs, where the requested operation
27 * is *ptr = val.
28 *
29 * Reference counting of pages:
30 * ----------------------------
31 * Each page has two refcounts: tot_count and type_count.
32 *
33 * TOT_COUNT is the obvious reference count. It counts all uses of a
34 * physical page frame by a domain, including uses as a page directory,
35 * a page table, or simple mappings via a PTE. This count prevents a
36 * domain from releasing a frame back to the free pool when it still holds
37 * a reference to it.
38 *
39 * TYPE_COUNT is more subtle. A frame can be put to one of three
40 * mutually-exclusive uses: it might be used as a page directory, or a
41 * page table, or it may be mapped writable by the domain [of course, a
42 * frame may not be used in any of these three ways!].
43 * So, type_count is a count of the number of times a frame is being
44 * referred to in its current incarnation. Therefore, a page can only
45 * change its type when its type count is zero.
46 *
47 * Pinning the page type:
48 * ----------------------
49 * The type of a page can be pinned/unpinned with the commands
50 * MMUEXT_[UN]PIN_L?_TABLE. Each page can be pinned exactly once (that is,
51 * pinning is not reference counted, so it can't be nested).
52 * This is useful to prevent a page's type count falling to zero, at which
53 * point safety checks would need to be carried out next time the count
54 * is increased again.
55 *
56 * A further note on writable page mappings:
57 * -----------------------------------------
58 * For simplicity, the count of writable mappings for a page may not
59 * correspond to reality. The 'writable count' is incremented for every
60 * PTE which maps the page with the _PAGE_RW flag set. However, for
61 * write access to be possible the page directory entry must also have
62 * its _PAGE_RW bit set. We do not check this as it complicates the
63 * reference counting considerably [consider the case of multiple
64 * directory entries referencing a single page table, some with the RW
65 * bit set, others not -- it starts getting a bit messy].
66 * In normal use, this simplification shouldn't be a problem.
67 * However, the logic can be added if required.
68 *
69 * One more note on read-only page mappings:
70 * -----------------------------------------
71 * We want domains to be able to map pages for read-only access. The
72 * main reason is that page tables and directories should be readable
73 * by a domain, but it would not be safe for them to be writable.
74 * However, domains have free access to rings 1 & 2 of the Intel
75 * privilege model. In terms of page protection, these are considered
76 * to be part of 'supervisor mode'. The WP bit in CR0 controls whether
77 * read-only restrictions are respected in supervisor mode -- if the
78 * bit is clear then any mapped page is writable.
79 *
80 * We get round this by always setting the WP bit and disallowing
81 * updates to it. This is very unlikely to cause a problem for guest
82 * OS's, which will generally use the WP bit to simplify copy-on-write
83 * implementation (in that case, OS wants a fault when it writes to
84 * an application-supplied buffer).
85 */
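/*
 * Illustrative sketch of the (ptr, val) interface described above: one
 * batched mmu_update request plus a pin of a new top-level directory.
 * The HYPERVISOR_mmu_update()/HYPERVISOR_mmuext_op() wrapper names are
 * assumed guest-side bindings of the hypercalls handled in this file; the
 * request layouts come from the public interface headers.
 */
#if 0
static void example_update_and_pin(uint64_t pte_maddr, uint64_t new_pte_val,
                                   unsigned long l2_mfn)
{
    struct mmu_update req = {
        .ptr = pte_maddr | MMU_NORMAL_PT_UPDATE, /* machine addr of the PTE */
        .val = new_pte_val,                      /* requested *ptr = val    */
    };
    struct mmuext_op pin = {
        .cmd      = MMUEXT_PIN_L2_TABLE,         /* keep type count nonzero */
        .arg1.mfn = l2_mfn,
    };

    /* Xen validates the target frame's type and refcounts before writing. */
    HYPERVISOR_mmu_update(&req, 1, NULL, DOMID_SELF);
    HYPERVISOR_mmuext_op(&pin, 1, NULL, DOMID_SELF);
}
#endif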
87 #include <xen/config.h>
88 #include <xen/init.h>
89 #include <xen/kernel.h>
90 #include <xen/lib.h>
91 #include <xen/mm.h>
92 #include <xen/domain.h>
93 #include <xen/sched.h>
94 #include <xen/errno.h>
95 #include <xen/perfc.h>
96 #include <xen/irq.h>
97 #include <xen/softirq.h>
98 #include <xen/domain_page.h>
99 #include <xen/event.h>
100 #include <xen/iocap.h>
101 #include <xen/guest_access.h>
102 #include <asm/shadow.h>
103 #include <asm/page.h>
104 #include <asm/flushtlb.h>
105 #include <asm/io.h>
106 #include <asm/ldt.h>
107 #include <asm/x86_emulate.h>
108 #include <public/memory.h>
110 #ifdef VERBOSE
111 #define MEM_LOG(_f, _a...) \
112 printk("DOM%u: (file=mm.c, line=%d) " _f "\n", \
113 current->domain->domain_id , __LINE__ , ## _a )
114 #else
115 #define MEM_LOG(_f, _a...) ((void)0)
116 #endif
118 /*
119 * PTE updates can be done with ordinary writes except:
120 * 1. Debug builds get extra checking by using CMPXCHG[8B].
121 * 2. PAE builds perform an atomic 8-byte store with CMPXCHG8B.
122 */
123 #if !defined(NDEBUG) || defined(CONFIG_X86_PAE)
124 #define PTE_UPDATE_WITH_CMPXCHG
125 #endif
127 /*
128 * Both do_mmuext_op() and do_mmu_update():
129 * We steal the m.s.b. of the @count parameter to indicate whether this
130 * invocation of do_mmu_update() is resuming a previously preempted call.
131 */
132 #define MMU_UPDATE_PREEMPTED (~(~0U>>1))
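/*
 * For illustration only (variable names are not from this file; the real
 * uses are in do_mmuext_op() and do_mmu_update() below): a preempted call
 * re-issues the hypercall with the m.s.b. of @count set, and the resumed
 * call strips it again to recover the true count.
 */
#if 0
    resumed_count = remaining_count | MMU_UPDATE_PREEMPTED;  /* on preemption */
    true_count    = resumed_count & ~MMU_UPDATE_PREEMPTED;   /* on resumption */
#endif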
134 static void free_l2_table(struct page_info *page);
135 static void free_l1_table(struct page_info *page);
137 static int mod_l2_entry(l2_pgentry_t *, l2_pgentry_t, unsigned long,
138 unsigned long type);
139 static int mod_l1_entry(l1_pgentry_t *, l1_pgentry_t);
141 /* Used to defer flushing of memory structures. */
142 static struct {
143 #define DOP_FLUSH_TLB (1<<0) /* Flush the local TLB. */
144 #define DOP_FLUSH_ALL_TLBS (1<<1) /* Flush TLBs of all VCPUs of current dom. */
145 #define DOP_RELOAD_LDT (1<<2) /* Reload the LDT shadow mapping. */
146 unsigned int deferred_ops;
147 /* If non-NULL, specifies a foreign subject domain for some operations. */
148 struct domain *foreign;
149 } __cacheline_aligned percpu_info[NR_CPUS];
151 /*
152 * Returns the current foreign domain; defaults to the currently-executing
153 * domain if a foreign override hasn't been specified.
154 */
155 #define FOREIGNDOM (percpu_info[smp_processor_id()].foreign ?: current->domain)
157 /* Private domain structs for DOMID_XEN and DOMID_IO. */
158 static struct domain *dom_xen, *dom_io;
160 /* Frame table and its size in pages. */
161 struct page_info *frame_table;
162 unsigned long max_page;
163 unsigned long total_pages;
165 void __init init_frametable(void)
166 {
167 unsigned long nr_pages, page_step, i, mfn;
169 frame_table = (struct page_info *)FRAMETABLE_VIRT_START;
171 nr_pages = PFN_UP(max_page * sizeof(*frame_table));
172 page_step = (1 << L2_PAGETABLE_SHIFT) >> PAGE_SHIFT;
174 for ( i = 0; i < nr_pages; i += page_step )
175 {
176 mfn = alloc_boot_pages(min(nr_pages - i, page_step), page_step);
177 if ( mfn == 0 )
178 panic("Not enough memory for frame table\n");
179 map_pages_to_xen(
180 FRAMETABLE_VIRT_START + (i << PAGE_SHIFT),
181 mfn, page_step, PAGE_HYPERVISOR);
182 }
184 memset(frame_table, 0, nr_pages << PAGE_SHIFT);
185 }
187 void arch_init_memory(void)
188 {
189 extern void subarch_init_memory(void);
191 unsigned long i, pfn, rstart_pfn, rend_pfn;
193 memset(percpu_info, 0, sizeof(percpu_info));
195 /*
196 * Initialise our DOMID_XEN domain.
197 * Any Xen-heap pages that we will allow to be mapped will have
198 * their domain field set to dom_xen.
199 */
200 dom_xen = alloc_domain(DOMID_XEN);
201 BUG_ON(dom_xen == NULL);
203 /*
204 * Initialise our DOMID_IO domain.
205 * This domain owns I/O pages that are within the range of the page_info
206 * array. Mappings occur at the privilege level of the caller.
207 */
208 dom_io = alloc_domain(DOMID_IO);
209 BUG_ON(dom_io == NULL);
211 /* First 1MB of RAM is historically marked as I/O. */
212 for ( i = 0; i < 0x100; i++ )
213 share_xen_page_with_guest(mfn_to_page(i), dom_io, XENSHARE_writable);
215 /* Any areas not specified as RAM by the e820 map are considered I/O. */
216 for ( i = 0, pfn = 0; i < e820.nr_map; i++ )
217 {
218 if ( e820.map[i].type != E820_RAM )
219 continue;
220 /* Every page from cursor to start of next RAM region is I/O. */
221 rstart_pfn = PFN_UP(e820.map[i].addr);
222 rend_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
223 for ( ; pfn < rstart_pfn; pfn++ )
224 {
225 BUG_ON(!mfn_valid(pfn));
226 share_xen_page_with_guest(
227 mfn_to_page(pfn), dom_io, XENSHARE_writable);
228 }
229 /* Skip the RAM region. */
230 pfn = rend_pfn;
231 }
232 BUG_ON(pfn != max_page);
234 subarch_init_memory();
235 }
237 void share_xen_page_with_guest(
238 struct page_info *page, struct domain *d, int readonly)
239 {
240 if ( page_get_owner(page) == d )
241 return;
243 set_gpfn_from_mfn(page_to_mfn(page), INVALID_M2P_ENTRY);
245 spin_lock(&d->page_alloc_lock);
247 /* The incremented type count pins as writable or read-only. */
248 page->u.inuse.type_info = (readonly ? PGT_none : PGT_writable_page);
249 page->u.inuse.type_info |= PGT_validated | 1;
251 page_set_owner(page, d);
252 wmb(); /* install valid domain ptr before updating refcnt. */
253 ASSERT(page->count_info == 0);
254 page->count_info |= PGC_allocated | 1;
256 if ( unlikely(d->xenheap_pages++ == 0) )
257 get_knownalive_domain(d);
258 list_add_tail(&page->list, &d->xenpage_list);
260 spin_unlock(&d->page_alloc_lock);
261 }
263 void share_xen_page_with_privileged_guests(
264 struct page_info *page, int readonly)
265 {
266 share_xen_page_with_guest(page, dom_xen, readonly);
267 }
269 #if defined(CONFIG_X86_PAE)
271 #ifdef NDEBUG
272 /* Only PDPTs above 4GB boundary need to be shadowed in low memory. */
273 #define l3tab_needs_shadow(mfn) ((mfn) >= 0x100000)
274 #else
275 /*
276 * In debug builds we shadow a selection of <4GB PDPTs to exercise code paths.
277 * We cannot safely shadow the idle page table, nor shadow-mode page tables
278 * (detected by lack of an owning domain). As required for correctness, we
279 * always shadow PDPTs above 4GB.
280 */
281 #define l3tab_needs_shadow(mfn) \
282 (((((mfn) << PAGE_SHIFT) != __pa(idle_pg_table)) && \
283 (page_get_owner(mfn_to_page(mfn)) != NULL) && \
284 ((mfn) & 1)) || /* odd MFNs are shadowed */ \
285 ((mfn) >= 0x100000))
286 #endif
288 static l1_pgentry_t *fix_pae_highmem_pl1e;
290 /* Cache the address of PAE high-memory fixmap page tables. */
291 static int __init cache_pae_fixmap_address(void)
292 {
293 unsigned long fixmap_base = fix_to_virt(FIX_PAE_HIGHMEM_0);
294 l2_pgentry_t *pl2e = virt_to_xen_l2e(fixmap_base);
295 fix_pae_highmem_pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(fixmap_base);
296 return 0;
297 }
298 __initcall(cache_pae_fixmap_address);
300 static void __write_ptbase(unsigned long mfn)
301 {
302 l3_pgentry_t *highmem_l3tab, *lowmem_l3tab;
303 struct pae_l3_cache *cache = &current->arch.pae_l3_cache;
304 unsigned int cpu = smp_processor_id();
306 /* Fast path 1: does this mfn need a shadow at all? */
307 if ( !l3tab_needs_shadow(mfn) )
308 {
309 write_cr3(mfn << PAGE_SHIFT);
310 /* Cache is no longer in use or valid (/after/ write to %cr3). */
311 cache->high_mfn = 0;
312 return;
313 }
315 /* Caching logic is not interrupt safe. */
316 ASSERT(!in_irq());
318 /* Fast path 2: is this mfn already cached? */
319 if ( cache->high_mfn == mfn )
320 {
321 write_cr3(__pa(cache->table[cache->inuse_idx]));
322 return;
323 }
325 /* Protects against pae_flush_pgd(). */
326 spin_lock(&cache->lock);
328 cache->inuse_idx ^= 1;
329 cache->high_mfn = mfn;
331 /* Map the guest L3 table and copy to the chosen low-memory cache. */
332 *(fix_pae_highmem_pl1e - cpu) = l1e_from_pfn(mfn, __PAGE_HYPERVISOR);
333 highmem_l3tab = (l3_pgentry_t *)fix_to_virt(FIX_PAE_HIGHMEM_0 + cpu);
334 lowmem_l3tab = cache->table[cache->inuse_idx];
335 memcpy(lowmem_l3tab, highmem_l3tab, sizeof(cache->table[0]));
336 *(fix_pae_highmem_pl1e - cpu) = l1e_empty();
338 /* Install the low-memory L3 table in CR3. */
339 write_cr3(__pa(lowmem_l3tab));
341 spin_unlock(&cache->lock);
342 }
344 #else /* !CONFIG_X86_PAE */
346 static void __write_ptbase(unsigned long mfn)
347 {
348 write_cr3(mfn << PAGE_SHIFT);
349 }
351 #endif /* !CONFIG_X86_PAE */
353 void write_ptbase(struct vcpu *v)
354 {
355 __write_ptbase(pagetable_get_pfn(v->arch.monitor_table));
356 }
358 void invalidate_shadow_ldt(struct vcpu *v)
359 {
360 int i;
361 unsigned long pfn;
362 struct page_info *page;
364 if ( v->arch.shadow_ldt_mapcnt == 0 )
365 return;
367 v->arch.shadow_ldt_mapcnt = 0;
369 for ( i = 16; i < 32; i++ )
370 {
371 pfn = l1e_get_pfn(v->arch.perdomain_ptes[i]);
372 if ( pfn == 0 ) continue;
373 v->arch.perdomain_ptes[i] = l1e_empty();
374 page = mfn_to_page(pfn);
375 ASSERT_PAGE_IS_TYPE(page, PGT_ldt_page);
376 ASSERT_PAGE_IS_DOMAIN(page, v->domain);
377 put_page_and_type(page);
378 }
380 /* Dispose of the (now possibly invalid) mappings from the TLB. */
381 percpu_info[v->processor].deferred_ops |= DOP_FLUSH_TLB | DOP_RELOAD_LDT;
382 }
385 static int alloc_segdesc_page(struct page_info *page)
386 {
387 struct desc_struct *descs;
388 int i;
390 descs = map_domain_page(page_to_mfn(page));
392 for ( i = 0; i < 512; i++ )
393 if ( unlikely(!check_descriptor(&descs[i])) )
394 goto fail;
396 unmap_domain_page(descs);
397 return 1;
399 fail:
400 unmap_domain_page(descs);
401 return 0;
402 }
405 /* Map shadow page at offset @off. */
406 int map_ldt_shadow_page(unsigned int off)
407 {
408 struct vcpu *v = current;
409 struct domain *d = v->domain;
410 unsigned long gmfn, mfn;
411 l1_pgentry_t l1e, nl1e;
412 unsigned long gva = v->arch.guest_context.ldt_base + (off << PAGE_SHIFT);
413 int res;
415 #if defined(__x86_64__)
416 /* If in user mode, switch to kernel mode just to read LDT mapping. */
417 int user_mode = !(v->arch.flags & TF_kernel_mode);
418 #define TOGGLE_MODE() if ( user_mode ) toggle_guest_mode(v)
419 #elif defined(__i386__)
420 #define TOGGLE_MODE() ((void)0)
421 #endif
423 BUG_ON(unlikely(in_irq()));
425 shadow_sync_va(v, gva);
427 TOGGLE_MODE();
428 __copy_from_user(&l1e, &linear_pg_table[l1_linear_offset(gva)],
429 sizeof(l1e));
430 TOGGLE_MODE();
432 if ( unlikely(!(l1e_get_flags(l1e) & _PAGE_PRESENT)) )
433 return 0;
435 gmfn = l1e_get_pfn(l1e);
436 mfn = gmfn_to_mfn(d, gmfn);
437 if ( unlikely(!VALID_MFN(mfn)) )
438 return 0;
440 res = get_page_and_type(mfn_to_page(mfn), d, PGT_ldt_page);
442 if ( !res && unlikely(shadow_mode_refcounts(d)) )
443 {
444 shadow_lock(d);
445 shadow_remove_all_write_access(d, gmfn, mfn);
446 res = get_page_and_type(mfn_to_page(mfn), d, PGT_ldt_page);
447 shadow_unlock(d);
448 }
450 if ( unlikely(!res) )
451 return 0;
453 nl1e = l1e_from_pfn(mfn, l1e_get_flags(l1e) | _PAGE_RW);
455 v->arch.perdomain_ptes[off + 16] = nl1e;
456 v->arch.shadow_ldt_mapcnt++;
458 return 1;
459 }
462 static int get_page_from_pagenr(unsigned long page_nr, struct domain *d)
463 {
464 struct page_info *page = mfn_to_page(page_nr);
466 if ( unlikely(!mfn_valid(page_nr)) || unlikely(!get_page(page, d)) )
467 {
468 MEM_LOG("Could not get page ref for pfn %lx", page_nr);
469 return 0;
470 }
472 return 1;
473 }
476 static int get_page_and_type_from_pagenr(unsigned long page_nr,
477 unsigned long type,
478 struct domain *d)
479 {
480 struct page_info *page = mfn_to_page(page_nr);
482 if ( unlikely(!get_page_from_pagenr(page_nr, d)) )
483 return 0;
485 if ( unlikely(!get_page_type(page, type)) )
486 {
487 put_page(page);
488 return 0;
489 }
491 return 1;
492 }
494 #ifndef CONFIG_X86_PAE /* We do not support guest linear mappings on PAE. */
495 /*
496 * We allow root tables to map each other (a.k.a. linear page tables). It
497 * needs some special care with reference counts and access permissions:
498 * 1. The mapping entry must be read-only, or the guest may get write access
499 * to its own PTEs.
500 * 2. We must only bump the reference counts for an *already validated*
501 * L2 table, or we can end up in a deadlock in get_page_type() by waiting
502 * on a validation that is required to complete that validation.
503 * 3. We only need to increment the reference counts for the mapped page
504 * frame if it is mapped by a different root table. This is sufficient and
505 * also necessary to allow validation of a root table mapping itself.
506 */
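/*
 * Illustrative guest-side sketch of rule 1 above (HYPERVISOR_mmu_update is
 * an assumed guest wrapper; the request layout is from the public
 * interface): install a read-only self-reference so the root table becomes
 * visible through a linear mapping.
 */
#if 0
static void example_install_linear_mapping(unsigned long root_mfn,
                                           unsigned int slot)
{
    struct mmu_update req = {
        /* Machine address of the root entry to rewrite. */
        .ptr = ((uint64_t)root_mfn << PAGE_SHIFT) +
               (slot * sizeof(root_pgentry_t)) + MMU_NORMAL_PT_UPDATE,
        /* Points back at the root frame itself; _PAGE_RW deliberately clear. */
        .val = ((uint64_t)root_mfn << PAGE_SHIFT) | _PAGE_PRESENT,
    };
    HYPERVISOR_mmu_update(&req, 1, NULL, DOMID_SELF);
}
#endif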
507 static int
508 get_linear_pagetable(
509 root_pgentry_t re, unsigned long re_pfn, struct domain *d)
510 {
511 unsigned long x, y;
512 struct page_info *page;
513 unsigned long pfn;
515 ASSERT( !shadow_mode_refcounts(d) );
517 if ( (root_get_flags(re) & _PAGE_RW) )
518 {
519 MEM_LOG("Attempt to create linear p.t. with write perms");
520 return 0;
521 }
523 if ( (pfn = root_get_pfn(re)) != re_pfn )
524 {
525 /* Make sure the mapped frame belongs to the correct domain. */
526 if ( unlikely(!get_page_from_pagenr(pfn, d)) )
527 return 0;
529 /*
530 * Make sure that the mapped frame is an already-validated L2 table.
531 * If so, atomically increment the count (checking for overflow).
532 */
533 page = mfn_to_page(pfn);
534 y = page->u.inuse.type_info;
535 do {
536 x = y;
537 if ( unlikely((x & PGT_count_mask) == PGT_count_mask) ||
538 unlikely((x & (PGT_type_mask|PGT_validated)) !=
539 (PGT_root_page_table|PGT_validated)) )
540 {
541 put_page(page);
542 return 0;
543 }
544 }
545 while ( (y = cmpxchg(&page->u.inuse.type_info, x, x + 1)) != x );
546 }
548 return 1;
549 }
550 #endif /* !CONFIG_X86_PAE */
552 int
553 get_page_from_l1e(
554 l1_pgentry_t l1e, struct domain *d)
555 {
556 unsigned long mfn = l1e_get_pfn(l1e);
557 struct page_info *page = mfn_to_page(mfn);
558 int okay;
560 if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) )
561 return 1;
563 if ( unlikely(l1e_get_flags(l1e) & L1_DISALLOW_MASK) )
564 {
565 MEM_LOG("Bad L1 flags %x", l1e_get_flags(l1e) & L1_DISALLOW_MASK);
566 return 0;
567 }
569 if ( unlikely(!mfn_valid(mfn)) ||
570 unlikely(page_get_owner(page) == dom_io) )
571 {
572 /* DOMID_IO reverts to caller for privilege checks. */
573 if ( d == dom_io )
574 d = current->domain;
576 if ( !iomem_access_permitted(d, mfn, mfn) )
577 {
578 MEM_LOG("Non-privileged attempt to map I/O space %08lx", mfn);
579 return 0;
580 }
582 /* No reference counting for out-of-range I/O pages. */
583 if ( !mfn_valid(mfn) )
584 return 1;
586 d = dom_io;
587 }
589 okay = ((l1e_get_flags(l1e) & _PAGE_RW) ?
590 get_page_and_type(page, d, PGT_writable_page) :
591 get_page(page, d));
592 if ( !okay )
593 {
594 MEM_LOG("Error getting mfn %lx (pfn %lx) from L1 entry %" PRIpte
595 " for dom%d",
596 mfn, get_gpfn_from_mfn(mfn),
597 l1e_get_intpte(l1e), d->domain_id);
598 }
600 return okay;
601 }
604 /* NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. */
605 static int
606 get_page_from_l2e(
607 l2_pgentry_t l2e, unsigned long pfn,
608 struct domain *d, unsigned long vaddr)
609 {
610 int rc;
612 ASSERT(!shadow_mode_refcounts(d));
614 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
615 return 1;
617 if ( unlikely((l2e_get_flags(l2e) & L2_DISALLOW_MASK)) )
618 {
619 MEM_LOG("Bad L2 flags %x", l2e_get_flags(l2e) & L2_DISALLOW_MASK);
620 return 0;
621 }
623 vaddr >>= L2_PAGETABLE_SHIFT;
624 vaddr <<= PGT_va_shift;
625 rc = get_page_and_type_from_pagenr(
626 l2e_get_pfn(l2e), PGT_l1_page_table | vaddr, d);
627 #if CONFIG_PAGING_LEVELS == 2
628 if ( unlikely(!rc) )
629 rc = get_linear_pagetable(l2e, pfn, d);
630 #endif
631 return rc;
632 }
635 #if CONFIG_PAGING_LEVELS >= 3
636 static int
637 get_page_from_l3e(
638 l3_pgentry_t l3e, unsigned long pfn,
639 struct domain *d, unsigned long vaddr)
640 {
641 int rc;
643 ASSERT(!shadow_mode_refcounts(d));
645 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
646 return 1;
648 if ( unlikely((l3e_get_flags(l3e) & L3_DISALLOW_MASK)) )
649 {
650 MEM_LOG("Bad L3 flags %x", l3e_get_flags(l3e) & L3_DISALLOW_MASK);
651 return 0;
652 }
654 vaddr >>= L3_PAGETABLE_SHIFT;
655 vaddr <<= PGT_va_shift;
656 rc = get_page_and_type_from_pagenr(
657 l3e_get_pfn(l3e),
658 PGT_l2_page_table | vaddr, d);
659 return rc;
660 }
661 #endif /* 3 level */
663 #if CONFIG_PAGING_LEVELS >= 4
664 static int
665 get_page_from_l4e(
666 l4_pgentry_t l4e, unsigned long pfn,
667 struct domain *d, unsigned long vaddr)
668 {
669 int rc;
671 ASSERT( !shadow_mode_refcounts(d) );
673 if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
674 return 1;
676 if ( unlikely((l4e_get_flags(l4e) & L4_DISALLOW_MASK)) )
677 {
678 MEM_LOG("Bad L4 flags %x", l4e_get_flags(l4e) & L4_DISALLOW_MASK);
679 return 0;
680 }
682 vaddr >>= L4_PAGETABLE_SHIFT;
683 vaddr <<= PGT_va_shift;
684 rc = get_page_and_type_from_pagenr(
685 l4e_get_pfn(l4e),
686 PGT_l3_page_table | vaddr, d);
688 if ( unlikely(!rc) )
689 rc = get_linear_pagetable(l4e, pfn, d);
691 return rc;
692 }
693 #endif /* 4 level */
696 void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d)
697 {
698 unsigned long pfn = l1e_get_pfn(l1e);
699 struct page_info *page = mfn_to_page(pfn);
700 struct domain *e;
701 struct vcpu *v;
703 if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) || !mfn_valid(pfn) )
704 return;
706 e = page_get_owner(page);
708 /*
709 * Check if this is a mapping that was established via a grant reference.
710 * If it was then we should not be here: we require that such mappings are
711 * explicitly destroyed via the grant-table interface.
712 *
713 * The upshot of this is that the guest can end up with active grants that
714 * it cannot destroy (because it no longer has a PTE to present to the
715 * grant-table interface). This can lead to subtle hard-to-catch bugs,
716 * hence a special grant PTE flag can be enabled to catch the bug early.
717 *
718 * (Note that the undestroyable active grants are not a security hole in
719 * Xen. All active grants can safely be cleaned up when the domain dies.)
720 */
721 if ( (l1e_get_flags(l1e) & _PAGE_GNTTAB) &&
722 !(d->domain_flags & (DOMF_shutdown|DOMF_dying)) )
723 {
724 MEM_LOG("Attempt to implicitly unmap a granted PTE %" PRIpte,
725 l1e_get_intpte(l1e));
726 domain_crash(d);
727 }
729 if ( l1e_get_flags(l1e) & _PAGE_RW )
730 {
731 put_page_and_type(page);
732 }
733 else
734 {
735 /* We expect this is rare so we blow the entire shadow LDT. */
736 if ( unlikely(((page->u.inuse.type_info & PGT_type_mask) ==
737 PGT_ldt_page)) &&
738 unlikely(((page->u.inuse.type_info & PGT_count_mask) != 0)) &&
739 (d == e) )
740 {
741 for_each_vcpu ( d, v )
742 invalidate_shadow_ldt(v);
743 }
744 put_page(page);
745 }
746 }
749 /*
750 * NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'.
751 * Note also that this automatically deals correctly with linear p.t.'s.
752 */
753 static void put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
754 {
755 if ( (l2e_get_flags(l2e) & _PAGE_PRESENT) &&
756 (l2e_get_pfn(l2e) != pfn) )
757 put_page_and_type(mfn_to_page(l2e_get_pfn(l2e)));
758 }
761 #if CONFIG_PAGING_LEVELS >= 3
762 static void put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn)
763 {
764 if ( (l3e_get_flags(l3e) & _PAGE_PRESENT) &&
765 (l3e_get_pfn(l3e) != pfn) )
766 put_page_and_type(mfn_to_page(l3e_get_pfn(l3e)));
767 }
768 #endif
770 #if CONFIG_PAGING_LEVELS >= 4
771 static void put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn)
772 {
773 if ( (l4e_get_flags(l4e) & _PAGE_PRESENT) &&
774 (l4e_get_pfn(l4e) != pfn) )
775 put_page_and_type(mfn_to_page(l4e_get_pfn(l4e)));
776 }
777 #endif
779 static int alloc_l1_table(struct page_info *page)
780 {
781 struct domain *d = page_get_owner(page);
782 unsigned long pfn = page_to_mfn(page);
783 l1_pgentry_t *pl1e;
784 int i;
786 ASSERT(!shadow_mode_refcounts(d));
788 pl1e = map_domain_page(pfn);
790 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
791 if ( is_guest_l1_slot(i) &&
792 unlikely(!get_page_from_l1e(pl1e[i], d)) )
793 goto fail;
795 unmap_domain_page(pl1e);
796 return 1;
798 fail:
799 MEM_LOG("Failure in alloc_l1_table: entry %d", i);
800 while ( i-- > 0 )
801 if ( is_guest_l1_slot(i) )
802 put_page_from_l1e(pl1e[i], d);
804 unmap_domain_page(pl1e);
805 return 0;
806 }
808 #ifdef CONFIG_X86_PAE
809 static int create_pae_xen_mappings(l3_pgentry_t *pl3e)
810 {
811 struct page_info *page;
812 l2_pgentry_t *pl2e;
813 l3_pgentry_t l3e3;
814 int i;
816 pl3e = (l3_pgentry_t *)((unsigned long)pl3e & PAGE_MASK);
818 /* 3rd L3 slot contains L2 with Xen-private mappings. It *must* exist. */
819 l3e3 = pl3e[3];
820 if ( !(l3e_get_flags(l3e3) & _PAGE_PRESENT) )
821 {
822 MEM_LOG("PAE L3 3rd slot is empty");
823 return 0;
824 }
826 /*
827 * The Xen-private mappings include linear mappings. The L2 thus cannot
828 * be shared by multiple L3 tables. The test here is adequate because:
829 * 1. Cannot appear in slots != 3 because the page would then have
830 * unknown va backpointer, which get_page_type() explicitly disallows.
831 * 2. Cannot appear in another page table's L3:
832 * a. alloc_l3_table() calls this function and this check will fail
833 * b. mod_l3_entry() disallows updates to slot 3 in an existing table
834 */
835 page = l3e_get_page(l3e3);
836 BUG_ON(page->u.inuse.type_info & PGT_pinned);
837 BUG_ON((page->u.inuse.type_info & PGT_count_mask) == 0);
838 if ( (page->u.inuse.type_info & PGT_count_mask) != 1 )
839 {
840 MEM_LOG("PAE L3 3rd slot is shared");
841 return 0;
842 }
844 /* Xen private mappings. */
845 pl2e = map_domain_page(l3e_get_pfn(l3e3));
846 memcpy(&pl2e[L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1)],
847 &idle_pg_table_l2[L2_PAGETABLE_FIRST_XEN_SLOT],
848 L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
849 for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
850 pl2e[l2_table_offset(PERDOMAIN_VIRT_START) + i] =
851 l2e_from_page(
852 virt_to_page(page_get_owner(page)->arch.mm_perdomain_pt) + i,
853 __PAGE_HYPERVISOR);
854 for ( i = 0; i < (LINEARPT_MBYTES >> (L2_PAGETABLE_SHIFT - 20)); i++ )
855 pl2e[l2_table_offset(LINEAR_PT_VIRT_START) + i] =
856 (l3e_get_flags(pl3e[i]) & _PAGE_PRESENT) ?
857 l2e_from_pfn(l3e_get_pfn(pl3e[i]), __PAGE_HYPERVISOR) :
858 l2e_empty();
859 unmap_domain_page(pl2e);
861 return 1;
862 }
864 /* Flush a pgdir update into low-memory caches. */
865 static void pae_flush_pgd(
866 unsigned long mfn, unsigned int idx, l3_pgentry_t nl3e)
867 {
868 struct domain *d = page_get_owner(mfn_to_page(mfn));
869 struct vcpu *v;
870 intpte_t _ol3e, _nl3e, _pl3e;
871 l3_pgentry_t *l3tab_ptr;
872 struct pae_l3_cache *cache;
874 /* If below 4GB then the pgdir is not shadowed in low memory. */
875 if ( !l3tab_needs_shadow(mfn) )
876 return;
878 for_each_vcpu ( d, v )
879 {
880 cache = &v->arch.pae_l3_cache;
882 spin_lock(&cache->lock);
884 if ( cache->high_mfn == mfn )
885 {
886 l3tab_ptr = &cache->table[cache->inuse_idx][idx];
887 _ol3e = l3e_get_intpte(*l3tab_ptr);
888 _nl3e = l3e_get_intpte(nl3e);
889 _pl3e = cmpxchg((intpte_t *)l3tab_ptr, _ol3e, _nl3e);
890 BUG_ON(_pl3e != _ol3e);
891 }
893 spin_unlock(&cache->lock);
894 }
896 flush_tlb_mask(d->domain_dirty_cpumask);
897 }
899 static inline int l1_backptr(
900 unsigned long *backptr, unsigned long offset_in_l2, unsigned long l2_type)
901 {
902 unsigned long l2_backptr = l2_type & PGT_va_mask;
903 ASSERT(l2_backptr != PGT_va_unknown);
904 ASSERT(l2_backptr != PGT_va_mutable);
905 *backptr =
906 ((l2_backptr >> PGT_va_shift) << L3_PAGETABLE_SHIFT) |
907 (offset_in_l2 << L2_PAGETABLE_SHIFT);
908 return 1;
909 }
911 #elif CONFIG_X86_64
912 # define create_pae_xen_mappings(pl3e) (1)
913 # define pae_flush_pgd(mfn, idx, nl3e) ((void)0)
915 static inline int l1_backptr(
916 unsigned long *backptr, unsigned long offset_in_l2, unsigned long l2_type)
917 {
918 unsigned long l2_backptr = l2_type & PGT_va_mask;
919 ASSERT(l2_backptr != PGT_va_unknown);
920 ASSERT(l2_backptr != PGT_va_mutable);
921 *backptr = ((l2_backptr >> PGT_va_shift) << L3_PAGETABLE_SHIFT) |
922 (offset_in_l2 << L2_PAGETABLE_SHIFT);
923 return 1;
924 }
926 static inline int l2_backptr(
927 unsigned long *backptr, unsigned long offset_in_l3, unsigned long l3_type)
928 {
929 unsigned long l3_backptr = l3_type & PGT_va_mask;
930 ASSERT(l3_backptr != PGT_va_unknown);
931 ASSERT(l3_backptr != PGT_va_mutable);
932 *backptr = ((l3_backptr >> PGT_va_shift) << L4_PAGETABLE_SHIFT) |
933 (offset_in_l3 << L3_PAGETABLE_SHIFT);
934 return 1;
935 }
937 static inline int l3_backptr(
938 unsigned long *backptr, unsigned long offset_in_l4, unsigned long l4_type)
939 {
940 *backptr = (offset_in_l4 << L4_PAGETABLE_SHIFT);
941 return 1;
942 }
943 #else
944 # define create_pae_xen_mappings(pl3e) (1)
945 # define l1_backptr(bp,l2o,l2t) \
946 ({ *(bp) = (unsigned long)(l2o) << L2_PAGETABLE_SHIFT; 1; })
947 #endif
949 static int alloc_l2_table(struct page_info *page, unsigned long type)
950 {
951 struct domain *d = page_get_owner(page);
952 unsigned long pfn = page_to_mfn(page);
953 unsigned long vaddr;
954 l2_pgentry_t *pl2e;
955 int i;
957 /* See the code in shadow_promote() to understand why this is here. */
958 if ( (PGT_base_page_table == PGT_l2_page_table) &&
959 unlikely(shadow_mode_refcounts(d)) )
960 return 1;
961 ASSERT(!shadow_mode_refcounts(d));
963 pl2e = map_domain_page(pfn);
965 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
966 {
967 if ( !l1_backptr(&vaddr, i, type) )
968 goto fail;
969 if ( is_guest_l2_slot(type, i) &&
970 unlikely(!get_page_from_l2e(pl2e[i], pfn, d, vaddr)) )
971 goto fail;
972 }
974 #if CONFIG_PAGING_LEVELS == 2
975 /* Xen private mappings. */
976 memcpy(&pl2e[L2_PAGETABLE_FIRST_XEN_SLOT],
977 &idle_pg_table[L2_PAGETABLE_FIRST_XEN_SLOT],
978 L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
979 pl2e[l2_table_offset(LINEAR_PT_VIRT_START)] =
980 l2e_from_pfn(pfn, __PAGE_HYPERVISOR);
981 for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
982 pl2e[l2_table_offset(PERDOMAIN_VIRT_START) + i] =
983 l2e_from_page(
984 virt_to_page(page_get_owner(page)->arch.mm_perdomain_pt) + i,
985 __PAGE_HYPERVISOR);
986 #endif
988 unmap_domain_page(pl2e);
989 return 1;
991 fail:
992 MEM_LOG("Failure in alloc_l2_table: entry %d", i);
993 while ( i-- > 0 )
994 if ( is_guest_l2_slot(type, i) )
995 put_page_from_l2e(pl2e[i], pfn);
997 unmap_domain_page(pl2e);
998 return 0;
999 }
1002 #if CONFIG_PAGING_LEVELS >= 3
1003 static int alloc_l3_table(struct page_info *page, unsigned long type)
1004 {
1005 struct domain *d = page_get_owner(page);
1006 unsigned long pfn = page_to_mfn(page);
1007 unsigned long vaddr;
1008 l3_pgentry_t *pl3e;
1009 int i;
1011 ASSERT(!shadow_mode_refcounts(d));
1013 #ifdef CONFIG_X86_PAE
1014 /*
1015 * PAE pgdirs above 4GB are unacceptable if the guest does not understand
1016 * the weird 'extended cr3' format for dealing with high-order address
1017 * bits. We cut some slack for control tools (before vcpu0 is initialised).
1018 */
1019 if ( (pfn >= 0x100000) &&
1020 unlikely(!VM_ASSIST(d, VMASST_TYPE_pae_extended_cr3)) &&
1021 d->vcpu[0] && test_bit(_VCPUF_initialised, &d->vcpu[0]->vcpu_flags) )
1022 {
1023 MEM_LOG("PAE pgd must be below 4GB (0x%lx >= 0x100000)", pfn);
1024 return 0;
1025 }
1026 #endif
1028 pl3e = map_domain_page(pfn);
1029 for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
1030 {
1031 #if CONFIG_PAGING_LEVELS >= 4
1032 if ( !l2_backptr(&vaddr, i, type) )
1033 goto fail;
1034 #else
1035 vaddr = (unsigned long)i << L3_PAGETABLE_SHIFT;
1036 #endif
1037 if ( is_guest_l3_slot(i) &&
1038 unlikely(!get_page_from_l3e(pl3e[i], pfn, d, vaddr)) )
1039 goto fail;
1040 }
1042 if ( !create_pae_xen_mappings(pl3e) )
1043 goto fail;
1045 unmap_domain_page(pl3e);
1046 return 1;
1048 fail:
1049 MEM_LOG("Failure in alloc_l3_table: entry %d", i);
1050 while ( i-- > 0 )
1051 if ( is_guest_l3_slot(i) )
1052 put_page_from_l3e(pl3e[i], pfn);
1054 unmap_domain_page(pl3e);
1055 return 0;
1056 }
1057 #else
1058 #define alloc_l3_table(page, type) (0)
1059 #endif
1061 #if CONFIG_PAGING_LEVELS >= 4
1062 static int alloc_l4_table(struct page_info *page, unsigned long type)
1063 {
1064 struct domain *d = page_get_owner(page);
1065 unsigned long pfn = page_to_mfn(page);
1066 l4_pgentry_t *pl4e = page_to_virt(page);
1067 unsigned long vaddr;
1068 int i;
1070 /* See the code in shadow_promote() to understand why this is here. */
1071 if ( (PGT_base_page_table == PGT_l4_page_table) &&
1072 shadow_mode_refcounts(d) )
1073 return 1;
1074 ASSERT(!shadow_mode_refcounts(d));
1076 for ( i = 0; i < L4_PAGETABLE_ENTRIES; i++ )
1077 {
1078 if ( !l3_backptr(&vaddr, i, type) )
1079 goto fail;
1081 if ( is_guest_l4_slot(i) &&
1082 unlikely(!get_page_from_l4e(pl4e[i], pfn, d, vaddr)) )
1083 goto fail;
1084 }
1086 /* Xen private mappings. */
1087 memcpy(&pl4e[ROOT_PAGETABLE_FIRST_XEN_SLOT],
1088 &idle_pg_table[ROOT_PAGETABLE_FIRST_XEN_SLOT],
1089 ROOT_PAGETABLE_XEN_SLOTS * sizeof(l4_pgentry_t));
1090 pl4e[l4_table_offset(LINEAR_PT_VIRT_START)] =
1091 l4e_from_pfn(pfn, __PAGE_HYPERVISOR);
1092 pl4e[l4_table_offset(PERDOMAIN_VIRT_START)] =
1093 l4e_from_page(
1094 virt_to_page(page_get_owner(page)->arch.mm_perdomain_l3),
1095 __PAGE_HYPERVISOR);
1097 return 1;
1099 fail:
1100 MEM_LOG("Failure in alloc_l4_table: entry %d", i);
1101 while ( i-- > 0 )
1102 if ( is_guest_l4_slot(i) )
1103 put_page_from_l4e(pl4e[i], pfn);
1105 return 0;
1106 }
1107 #else
1108 #define alloc_l4_table(page, type) (0)
1109 #endif
1112 static void free_l1_table(struct page_info *page)
1113 {
1114 struct domain *d = page_get_owner(page);
1115 unsigned long pfn = page_to_mfn(page);
1116 l1_pgentry_t *pl1e;
1117 int i;
1119 pl1e = map_domain_page(pfn);
1121 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
1122 if ( is_guest_l1_slot(i) )
1123 put_page_from_l1e(pl1e[i], d);
1125 unmap_domain_page(pl1e);
1126 }
1129 static void free_l2_table(struct page_info *page)
1130 {
1131 unsigned long pfn = page_to_mfn(page);
1132 l2_pgentry_t *pl2e;
1133 int i;
1135 pl2e = map_domain_page(pfn);
1137 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
1138 if ( is_guest_l2_slot(page->u.inuse.type_info, i) )
1139 put_page_from_l2e(pl2e[i], pfn);
1141 unmap_domain_page(pl2e);
1142 }
1145 #if CONFIG_PAGING_LEVELS >= 3
1147 static void free_l3_table(struct page_info *page)
1148 {
1149 unsigned long pfn = page_to_mfn(page);
1150 l3_pgentry_t *pl3e;
1151 int i;
1153 pl3e = map_domain_page(pfn);
1155 for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
1156 if ( is_guest_l3_slot(i) )
1157 put_page_from_l3e(pl3e[i], pfn);
1159 unmap_domain_page(pl3e);
1160 }
1162 #endif
1164 #if CONFIG_PAGING_LEVELS >= 4
1166 static void free_l4_table(struct page_info *page)
1167 {
1168 unsigned long pfn = page_to_mfn(page);
1169 l4_pgentry_t *pl4e = page_to_virt(page);
1170 int i;
1172 for ( i = 0; i < L4_PAGETABLE_ENTRIES; i++ )
1173 if ( is_guest_l4_slot(i) )
1174 put_page_from_l4e(pl4e[i], pfn);
1175 }
1177 #endif
1179 static inline int update_l1e(l1_pgentry_t *pl1e,
1180 l1_pgentry_t ol1e,
1181 l1_pgentry_t nl1e)
1182 {
1183 #ifndef PTE_UPDATE_WITH_CMPXCHG
1184 return !__copy_to_user(pl1e, &nl1e, sizeof(nl1e));
1185 #else
1186 intpte_t o = l1e_get_intpte(ol1e);
1187 intpte_t n = l1e_get_intpte(nl1e);
1189 for ( ; ; )
1190 {
1191 if ( unlikely(cmpxchg_user(pl1e, o, n) != 0) )
1192 {
1193 MEM_LOG("Failed to update %" PRIpte " -> %" PRIpte
1194 ": saw %" PRIpte,
1195 l1e_get_intpte(ol1e),
1196 l1e_get_intpte(nl1e),
1197 o);
1198 return 0;
1199 }
1201 if ( o == l1e_get_intpte(ol1e) )
1202 break;
1204 /* Allowed to change in Accessed/Dirty flags only. */
1205 BUG_ON((o ^ l1e_get_intpte(ol1e)) &
1206 ~(int)(_PAGE_ACCESSED|_PAGE_DIRTY));
1207 ol1e = l1e_from_intpte(o);
1208 }
1210 return 1;
1211 #endif
1212 }
1215 /* Update the L1 entry at pl1e to new value nl1e. */
1216 static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e)
1217 {
1218 l1_pgentry_t ol1e;
1219 struct domain *d = current->domain;
1221 if ( unlikely(__copy_from_user(&ol1e, pl1e, sizeof(ol1e)) != 0) )
1222 return 0;
1224 if ( unlikely(shadow_mode_refcounts(d)) )
1225 return update_l1e(pl1e, ol1e, nl1e);
1227 if ( l1e_get_flags(nl1e) & _PAGE_PRESENT )
1228 {
1229 if ( unlikely(l1e_get_flags(nl1e) & L1_DISALLOW_MASK) )
1230 {
1231 MEM_LOG("Bad L1 flags %x",
1232 l1e_get_flags(nl1e) & L1_DISALLOW_MASK);
1233 return 0;
1234 }
1236 /* Fast path for identical mapping, r/w and presence. */
1237 if ( !l1e_has_changed(ol1e, nl1e, _PAGE_RW | _PAGE_PRESENT))
1238 return update_l1e(pl1e, ol1e, nl1e);
1240 if ( unlikely(!get_page_from_l1e(nl1e, FOREIGNDOM)) )
1241 return 0;
1243 if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
1244 {
1245 put_page_from_l1e(nl1e, d);
1246 return 0;
1247 }
1248 }
1249 else
1250 {
1251 if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
1252 return 0;
1253 }
1255 put_page_from_l1e(ol1e, d);
1256 return 1;
1257 }
1259 #ifndef PTE_UPDATE_WITH_CMPXCHG
1260 #define UPDATE_ENTRY(_t,_p,_o,_n) ({ (*(_p) = (_n)); 1; })
1261 #else
1262 #define UPDATE_ENTRY(_t,_p,_o,_n) ({ \
1263 for ( ; ; ) \
1264 { \
1265 intpte_t __o = cmpxchg((intpte_t *)(_p), \
1266 _t ## e_get_intpte(_o), \
1267 _t ## e_get_intpte(_n)); \
1268 if ( __o == _t ## e_get_intpte(_o) ) \
1269 break; \
1270 /* Allowed to change in Accessed/Dirty flags only. */ \
1271 BUG_ON((__o ^ _t ## e_get_intpte(_o)) & \
1272 ~(int)(_PAGE_ACCESSED|_PAGE_DIRTY)); \
1273 _o = _t ## e_from_intpte(__o); \
1274 } \
1275 1; })
1276 #endif
1278 /* Update the L2 entry at pl2e to new value nl2e. pl2e is within frame pfn. */
1279 static int mod_l2_entry(l2_pgentry_t *pl2e,
1280 l2_pgentry_t nl2e,
1281 unsigned long pfn,
1282 unsigned long type)
1284 l2_pgentry_t ol2e;
1285 unsigned long vaddr = 0;
1287 if ( unlikely(!is_guest_l2_slot(type,pgentry_ptr_to_slot(pl2e))) )
1289 MEM_LOG("Illegal L2 update attempt in Xen-private area %p", pl2e);
1290 return 0;
1293 if ( unlikely(__copy_from_user(&ol2e, pl2e, sizeof(ol2e)) != 0) )
1294 return 0;
1296 if ( l2e_get_flags(nl2e) & _PAGE_PRESENT )
1298 if ( unlikely(l2e_get_flags(nl2e) & L2_DISALLOW_MASK) )
1300 MEM_LOG("Bad L2 flags %x",
1301 l2e_get_flags(nl2e) & L2_DISALLOW_MASK);
1302 return 0;
1305 /* Fast path for identical mapping and presence. */
1306 if ( !l2e_has_changed(ol2e, nl2e, _PAGE_PRESENT))
1307 return UPDATE_ENTRY(l2, pl2e, ol2e, nl2e);
1309 if ( unlikely(!l1_backptr(&vaddr, pgentry_ptr_to_slot(pl2e), type)) ||
1310 unlikely(!get_page_from_l2e(nl2e, pfn, current->domain, vaddr)) )
1311 return 0;
1313 if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e)) )
1315 put_page_from_l2e(nl2e, pfn);
1316 return 0;
1319 else if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e)) )
1321 return 0;
1324 put_page_from_l2e(ol2e, pfn);
1325 return 1;
1329 #if CONFIG_PAGING_LEVELS >= 3
1331 /* Update the L3 entry at pl3e to new value nl3e. pl3e is within frame pfn. */
1332 static int mod_l3_entry(l3_pgentry_t *pl3e,
1333 l3_pgentry_t nl3e,
1334 unsigned long pfn,
1335 unsigned long type)
1337 l3_pgentry_t ol3e;
1338 unsigned long vaddr;
1339 int okay;
1341 if ( unlikely(!is_guest_l3_slot(pgentry_ptr_to_slot(pl3e))) )
1343 MEM_LOG("Illegal L3 update attempt in Xen-private area %p", pl3e);
1344 return 0;
1347 #ifdef CONFIG_X86_PAE
1348 /*
1349 * Disallow updates to final L3 slot. It contains Xen mappings, and it
1350 * would be a pain to ensure they remain continuously valid throughout.
1351 */
1352 if ( pgentry_ptr_to_slot(pl3e) >= 3 )
1353 return 0;
1354 #endif
1356 if ( unlikely(__copy_from_user(&ol3e, pl3e, sizeof(ol3e)) != 0) )
1357 return 0;
1359 if ( l3e_get_flags(nl3e) & _PAGE_PRESENT )
1361 if ( unlikely(l3e_get_flags(nl3e) & L3_DISALLOW_MASK) )
1363 MEM_LOG("Bad L3 flags %x",
1364 l3e_get_flags(nl3e) & L3_DISALLOW_MASK);
1365 return 0;
1368 /* Fast path for identical mapping and presence. */
1369 if (!l3e_has_changed(ol3e, nl3e, _PAGE_PRESENT))
1370 return UPDATE_ENTRY(l3, pl3e, ol3e, nl3e);
1372 #if CONFIG_PAGING_LEVELS >= 4
1373 if ( unlikely(!l2_backptr(&vaddr, pgentry_ptr_to_slot(pl3e), type)) ||
1374 unlikely(!get_page_from_l3e(nl3e, pfn, current->domain, vaddr)) )
1375 return 0;
1376 #else
1377 vaddr = (((unsigned long)pl3e & ~PAGE_MASK) / sizeof(l3_pgentry_t))
1378 << L3_PAGETABLE_SHIFT;
1379 if ( unlikely(!get_page_from_l3e(nl3e, pfn, current->domain, vaddr)) )
1380 return 0;
1381 #endif
1383 if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e)) )
1385 put_page_from_l3e(nl3e, pfn);
1386 return 0;
1389 else if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e)) )
1391 return 0;
1394 okay = create_pae_xen_mappings(pl3e);
1395 BUG_ON(!okay);
1397 pae_flush_pgd(pfn, pgentry_ptr_to_slot(pl3e), nl3e);
1399 put_page_from_l3e(ol3e, pfn);
1400 return 1;
1403 #endif
1405 #if CONFIG_PAGING_LEVELS >= 4
1407 /* Update the L4 entry at pl4e to new value nl4e. pl4e is within frame pfn. */
1408 static int mod_l4_entry(l4_pgentry_t *pl4e,
1409 l4_pgentry_t nl4e,
1410 unsigned long pfn,
1411 unsigned long type)
1413 l4_pgentry_t ol4e;
1414 unsigned long vaddr;
1416 if ( unlikely(!is_guest_l4_slot(pgentry_ptr_to_slot(pl4e))) )
1418 MEM_LOG("Illegal L4 update attempt in Xen-private area %p", pl4e);
1419 return 0;
1422 if ( unlikely(__copy_from_user(&ol4e, pl4e, sizeof(ol4e)) != 0) )
1423 return 0;
1425 if ( l4e_get_flags(nl4e) & _PAGE_PRESENT )
1427 if ( unlikely(l4e_get_flags(nl4e) & L4_DISALLOW_MASK) )
1429 MEM_LOG("Bad L4 flags %x",
1430 l4e_get_flags(nl4e) & L4_DISALLOW_MASK);
1431 return 0;
1434 /* Fast path for identical mapping and presence. */
1435 if (!l4e_has_changed(ol4e, nl4e, _PAGE_PRESENT))
1436 return UPDATE_ENTRY(l4, pl4e, ol4e, nl4e);
1438 if ( unlikely(!l3_backptr(&vaddr, pgentry_ptr_to_slot(pl4e), type)) ||
1439 unlikely(!get_page_from_l4e(nl4e, pfn, current->domain, vaddr)) )
1440 return 0;
1442 if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e)) )
1444 put_page_from_l4e(nl4e, pfn);
1445 return 0;
1448 else if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e)) )
1450 return 0;
1453 put_page_from_l4e(ol4e, pfn);
1454 return 1;
1457 #endif
1459 int alloc_page_type(struct page_info *page, unsigned long type)
1461 struct domain *owner = page_get_owner(page);
1463 if ( owner != NULL )
1464 mark_dirty(owner, page_to_mfn(page));
1466 switch ( type & PGT_type_mask )
1468 case PGT_l1_page_table:
1469 return alloc_l1_table(page);
1470 case PGT_l2_page_table:
1471 return alloc_l2_table(page, type);
1472 case PGT_l3_page_table:
1473 return alloc_l3_table(page, type);
1474 case PGT_l4_page_table:
1475 return alloc_l4_table(page, type);
1476 case PGT_gdt_page:
1477 case PGT_ldt_page:
1478 return alloc_segdesc_page(page);
1479 default:
1480 printk("Bad type in alloc_page_type %lx t=%" PRtype_info " c=%x\n",
1481 type, page->u.inuse.type_info,
1482 page->count_info);
1483 BUG();
1486 return 0;
1490 void free_page_type(struct page_info *page, unsigned long type)
1492 struct domain *owner = page_get_owner(page);
1493 unsigned long gmfn;
1495 if ( likely(owner != NULL) )
1497 /*
1498 * We have to flush before the next use of the linear mapping
1499 * (e.g., update_va_mapping()) or we could end up modifying a page
1500 * that is no longer a page table (and hence screw up ref counts).
1501 */
1502 percpu_info[smp_processor_id()].deferred_ops |= DOP_FLUSH_ALL_TLBS;
1504 if ( unlikely(shadow_mode_enabled(owner)) )
1506 /* Raw page tables are rewritten during save/restore. */
1507 if ( !shadow_mode_translate(owner) )
1508 mark_dirty(owner, page_to_mfn(page));
1510 if ( shadow_mode_refcounts(owner) )
1511 return;
1513 gmfn = mfn_to_gmfn(owner, page_to_mfn(page));
1514 ASSERT(VALID_M2P(gmfn));
1515 remove_shadow(owner, gmfn, type & PGT_type_mask);
1519 switch ( type & PGT_type_mask )
1521 case PGT_l1_page_table:
1522 free_l1_table(page);
1523 break;
1525 case PGT_l2_page_table:
1526 free_l2_table(page);
1527 break;
1529 #if CONFIG_PAGING_LEVELS >= 3
1530 case PGT_l3_page_table:
1531 free_l3_table(page);
1532 break;
1533 #endif
1535 #if CONFIG_PAGING_LEVELS >= 4
1536 case PGT_l4_page_table:
1537 free_l4_table(page);
1538 break;
1539 #endif
1541 default:
1542 printk("%s: type %lx pfn %lx\n",__FUNCTION__,
1543 type, page_to_mfn(page));
1544 BUG();
1549 void put_page_type(struct page_info *page)
1551 unsigned long nx, x, y = page->u.inuse.type_info;
1553 again:
1554 do {
1555 x = y;
1556 nx = x - 1;
1558 ASSERT((x & PGT_count_mask) != 0);
1560 /*
1561 * The page should always be validated while a reference is held. The
1562 * exception is during domain destruction, when we forcibly invalidate
1563 * page-table pages if we detect a referential loop.
1564 * See domain.c:relinquish_list().
1565 */
1566 ASSERT((x & PGT_validated) ||
1567 test_bit(_DOMF_dying, &page_get_owner(page)->domain_flags));
1569 if ( unlikely((nx & PGT_count_mask) == 0) )
1571 /* Record TLB information for flush later. Races are harmless. */
1572 page->tlbflush_timestamp = tlbflush_current_time();
1574 if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
1575 likely(nx & PGT_validated) )
1577 /*
1578 * Page-table pages must be unvalidated when count is zero. The
1579 * 'free' is safe because the refcnt is non-zero and validated
1580 * bit is clear => other ops will spin or fail.
1581 */
1582 if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x,
1583 x & ~PGT_validated)) != x) )
1584 goto again;
1585 /* We cleared the 'valid bit' so we do the clean up. */
1586 free_page_type(page, x);
1587 /* Carry on, but with the 'valid bit' now clear. */
1588 x &= ~PGT_validated;
1589 nx &= ~PGT_validated;
1592 else if ( unlikely((nx & (PGT_pinned|PGT_type_mask|PGT_count_mask)) ==
1593 (PGT_pinned|PGT_l1_page_table|1)) )
1595 /* Page is now only pinned. Make the back pointer mutable again. */
1596 nx |= PGT_va_mutable;
1599 while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
1603 int get_page_type(struct page_info *page, unsigned long type)
1605 unsigned long nx, x, y = page->u.inuse.type_info;
1607 again:
1608 do {
1609 x = y;
1610 nx = x + 1;
1611 if ( unlikely((nx & PGT_count_mask) == 0) )
1613 MEM_LOG("Type count overflow on pfn %lx", page_to_mfn(page));
1614 return 0;
1616 else if ( unlikely((x & PGT_count_mask) == 0) )
1618 if ( (x & (PGT_type_mask|PGT_va_mask)) != type )
1620 if ( (x & PGT_type_mask) != (type & PGT_type_mask) )
1622 /*
1623 * On type change we check to flush stale TLB
1624 * entries. This may be unnecessary (e.g., page
1625 * was GDT/LDT) but those circumstances should be
1626 * very rare.
1627 */
1628 cpumask_t mask =
1629 page_get_owner(page)->domain_dirty_cpumask;
1630 tlbflush_filter(mask, page->tlbflush_timestamp);
1632 if ( unlikely(!cpus_empty(mask)) )
1634 perfc_incrc(need_flush_tlb_flush);
1635 flush_tlb_mask(mask);
1639 /* We lose existing type, back pointer, and validity. */
1640 nx &= ~(PGT_type_mask | PGT_va_mask | PGT_validated);
1641 nx |= type;
1643 /* No special validation needed for writable pages. */
1644 /* Page tables and GDT/LDT need to be scanned for validity. */
1645 if ( type == PGT_writable_page )
1646 nx |= PGT_validated;
1649 else
1651 if ( unlikely((x & (PGT_type_mask|PGT_va_mask)) != type) )
1653 if ( unlikely((x & PGT_type_mask) != (type & PGT_type_mask) ) )
1655 if ( (current->domain == page_get_owner(page)) &&
1656 ((x & PGT_type_mask) == PGT_writable_page) )
1658 /*
1659 * This ensures functions like set_gdt() see up-to-date
1660 * type info without needing to clean up writable p.t.
1661 * state on the fast path. We take this path only
1662 * when the current type is writable because:
1663 * 1. It's the only type that this path can decrement.
1664 * 2. If we take this path more liberally then we can
1665 * enter a recursive loop via get_page_from_l1e()
1666 * during pagetable revalidation.
1667 */
1668 LOCK_BIGLOCK(current->domain);
1669 cleanup_writable_pagetable(current->domain);
1670 y = page->u.inuse.type_info;
1671 UNLOCK_BIGLOCK(current->domain);
1672 /* Can we make progress now? */
1673 if ( ((y & PGT_type_mask) == (type & PGT_type_mask)) ||
1674 ((y & PGT_count_mask) == 0) )
1675 goto again;
1677 if ( ((x & PGT_type_mask) != PGT_l2_page_table) ||
1678 ((type & PGT_type_mask) != PGT_l1_page_table) )
1679 MEM_LOG("Bad type (saw %" PRtype_info
1680 " != exp %" PRtype_info ") "
1681 "for mfn %lx (pfn %lx)",
1682 x, type, page_to_mfn(page),
1683 get_gpfn_from_mfn(page_to_mfn(page)));
1684 return 0;
1686 else if ( (x & PGT_va_mask) == PGT_va_mutable )
1688 /* The va backpointer is mutable, hence we update it. */
1689 nx &= ~PGT_va_mask;
1690 nx |= type; /* we know the actual type is correct */
1692 else if ( (type & PGT_va_mask) != PGT_va_mutable )
1694 ASSERT((type & PGT_va_mask) != (x & PGT_va_mask));
1695 #ifdef CONFIG_X86_PAE
1696 /* We use backptr as extra typing. Cannot be unknown. */
1697 if ( (type & PGT_type_mask) == PGT_l2_page_table )
1698 return 0;
1699 #endif
1700 /* Fixme: add code to propagate va_unknown to subtables. */
1701 if ( ((type & PGT_type_mask) >= PGT_l2_page_table) &&
1702 !shadow_mode_refcounts(page_get_owner(page)) )
1703 return 0;
1704 /* This table is possibly mapped at multiple locations. */
1705 nx &= ~PGT_va_mask;
1706 nx |= PGT_va_unknown;
1709 if ( unlikely(!(x & PGT_validated)) )
1711 /* Someone else is updating validation of this page. Wait... */
1712 while ( (y = page->u.inuse.type_info) == x )
1713 cpu_relax();
1714 goto again;
1718 while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
1720 if ( unlikely(!(nx & PGT_validated)) )
1722 /* Try to validate page type; drop the new reference on failure. */
1723 if ( unlikely(!alloc_page_type(page, type)) )
1725 MEM_LOG("Error while validating mfn %lx (pfn %lx) for type %"
1726 PRtype_info ": caf=%08x taf=%" PRtype_info,
1727 page_to_mfn(page), get_gpfn_from_mfn(page_to_mfn(page)),
1728 type, page->count_info, page->u.inuse.type_info);
1729 /* No one else can get a reference. We hold the only ref. */
1730 page->u.inuse.type_info = 0;
1731 return 0;
1734 /* No one else is updating simultaneously. */
1735 __set_bit(_PGT_validated, &page->u.inuse.type_info);
1738 return 1;
1742 int new_guest_cr3(unsigned long mfn)
1744 struct vcpu *v = current;
1745 struct domain *d = v->domain;
1746 int okay;
1747 unsigned long old_base_mfn;
1749 ASSERT(writable_pagetable_in_sync(d));
1751 if ( shadow_mode_refcounts(d) )
1753 okay = get_page_from_pagenr(mfn, d);
1754 if ( unlikely(!okay) )
1756 MEM_LOG("Error while installing new baseptr %lx", mfn);
1757 return 0;
1760 else
1762 okay = get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d);
1763 if ( unlikely(!okay) )
1765 /* Switch to idle pagetable: this VCPU has no active p.t. now. */
1766 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
1767 v->arch.guest_table = pagetable_null();
1768 update_pagetables(v);
1769 write_cr3(__pa(idle_pg_table));
1770 if ( old_base_mfn != 0 )
1771 put_page_and_type(mfn_to_page(old_base_mfn));
1773 /* Retry the validation with no active p.t. for this VCPU. */
1774 okay = get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d);
1775 if ( !okay )
1777 /* Failure here is unrecoverable: the VCPU has no pagetable! */
1778 MEM_LOG("Fatal error while installing new baseptr %lx", mfn);
1779 domain_crash(d);
1780 percpu_info[v->processor].deferred_ops = 0;
1781 return 0;
1786 invalidate_shadow_ldt(v);
1788 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
1789 v->arch.guest_table = pagetable_from_pfn(mfn);
1790 update_pagetables(v); /* update shadow_table and monitor_table */
1792 write_ptbase(v);
1794 if ( likely(old_base_mfn != 0) )
1796 if ( shadow_mode_refcounts(d) )
1797 put_page(mfn_to_page(old_base_mfn));
1798 else
1799 put_page_and_type(mfn_to_page(old_base_mfn));
1802 /* CR3 also holds a ref to its shadow... */
1803 if ( shadow_mode_enabled(d) )
1805 if ( v->arch.monitor_shadow_ref )
1806 put_shadow_ref(v->arch.monitor_shadow_ref);
1807 v->arch.monitor_shadow_ref =
1808 pagetable_get_pfn(v->arch.monitor_table);
1809 ASSERT(!page_get_owner(mfn_to_page(v->arch.monitor_shadow_ref)));
1810 get_shadow_ref(v->arch.monitor_shadow_ref);
1813 return 1;
1816 static void process_deferred_ops(unsigned int cpu)
1817 {
1818 unsigned int deferred_ops;
1819 struct domain *d = current->domain;
1821 deferred_ops = percpu_info[cpu].deferred_ops;
1822 percpu_info[cpu].deferred_ops = 0;
1824 if ( deferred_ops & (DOP_FLUSH_ALL_TLBS|DOP_FLUSH_TLB) )
1825 {
1826 if ( shadow_mode_enabled(d) )
1827 shadow_sync_all(d);
1828 if ( deferred_ops & DOP_FLUSH_ALL_TLBS )
1829 flush_tlb_mask(d->domain_dirty_cpumask);
1830 else
1831 local_flush_tlb();
1832 }
1834 if ( deferred_ops & DOP_RELOAD_LDT )
1835 (void)map_ldt_shadow_page(0);
1837 if ( unlikely(percpu_info[cpu].foreign != NULL) )
1838 {
1839 put_domain(percpu_info[cpu].foreign);
1840 percpu_info[cpu].foreign = NULL;
1841 }
1842 }
1844 static int set_foreigndom(unsigned int cpu, domid_t domid)
1846 struct domain *e, *d = current->domain;
1847 int okay = 1;
1849 ASSERT(percpu_info[cpu].foreign == NULL);
1851 if ( likely(domid == DOMID_SELF) )
1852 goto out;
1854 if ( domid == d->domain_id )
1856 MEM_LOG("Dom %u tried to specify itself as foreign domain",
1857 d->domain_id);
1858 okay = 0;
1860 else if ( !IS_PRIV(d) )
1862 switch ( domid )
1864 case DOMID_IO:
1865 get_knownalive_domain(dom_io);
1866 percpu_info[cpu].foreign = dom_io;
1867 break;
1868 default:
1869 MEM_LOG("Dom %u cannot set foreign dom", d->domain_id);
1870 okay = 0;
1871 break;
1874 else
1876 percpu_info[cpu].foreign = e = find_domain_by_id(domid);
1877 if ( e == NULL )
1879 switch ( domid )
1881 case DOMID_XEN:
1882 get_knownalive_domain(dom_xen);
1883 percpu_info[cpu].foreign = dom_xen;
1884 break;
1885 case DOMID_IO:
1886 get_knownalive_domain(dom_io);
1887 percpu_info[cpu].foreign = dom_io;
1888 break;
1889 default:
1890 MEM_LOG("Unknown domain '%u'", domid);
1891 okay = 0;
1892 break;
1897 out:
1898 return okay;
1901 static inline cpumask_t vcpumask_to_pcpumask(
1902 struct domain *d, unsigned long vmask)
1903 {
1904 unsigned int vcpu_id;
1905 cpumask_t pmask = CPU_MASK_NONE;
1906 struct vcpu *v;
1908 while ( vmask != 0 )
1909 {
1910 vcpu_id = find_first_set_bit(vmask);
1911 vmask &= ~(1UL << vcpu_id);
1912 if ( (vcpu_id < MAX_VIRT_CPUS) &&
1913 ((v = d->vcpu[vcpu_id]) != NULL) )
1914 cpus_or(pmask, pmask, v->vcpu_dirty_cpumask);
1915 }
1917 return pmask;
1918 }
1920 int do_mmuext_op(
1921 XEN_GUEST_HANDLE(mmuext_op_t) uops,
1922 unsigned int count,
1923 XEN_GUEST_HANDLE(uint) pdone,
1924 unsigned int foreigndom)
1926 struct mmuext_op op;
1927 int rc = 0, i = 0, okay, cpu = smp_processor_id();
1928 unsigned long mfn, type;
1929 unsigned int done = 0;
1930 struct page_info *page;
1931 struct vcpu *v = current;
1932 struct domain *d = v->domain;
1934 LOCK_BIGLOCK(d);
1936 cleanup_writable_pagetable(d);
1938 if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
1940 count &= ~MMU_UPDATE_PREEMPTED;
1941 if ( unlikely(!guest_handle_is_null(pdone)) )
1942 (void)copy_from_guest(&done, pdone, 1);
1945 if ( !set_foreigndom(cpu, foreigndom) )
1947 rc = -ESRCH;
1948 goto out;
1951 if ( unlikely(!guest_handle_okay(uops, count)) )
1953 rc = -EFAULT;
1954 goto out;
1957 for ( i = 0; i < count; i++ )
1959 if ( hypercall_preempt_check() )
1961 rc = hypercall_create_continuation(
1962 __HYPERVISOR_mmuext_op, "hihi",
1963 uops, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
1964 break;
1967 if ( unlikely(__copy_from_guest(&op, uops, 1) != 0) )
1969 MEM_LOG("Bad __copy_from_guest");
1970 rc = -EFAULT;
1971 break;
1974 okay = 1;
1975 mfn = op.arg1.mfn;
1976 page = mfn_to_page(mfn);
1978 switch ( op.cmd )
1980 case MMUEXT_PIN_L1_TABLE:
1981 type = PGT_l1_page_table | PGT_va_mutable;
1982 goto pin_page;
1984 case MMUEXT_PIN_L2_TABLE:
1985 case MMUEXT_PIN_L3_TABLE:
1986 case MMUEXT_PIN_L4_TABLE:
1987 /* Ignore pinning of subdirectories. */
1988 if ( (op.cmd - MMUEXT_PIN_L1_TABLE) != (CONFIG_PAGING_LEVELS - 1) )
1989 break;
1991 type = PGT_root_page_table;
1993 pin_page:
1994 if ( shadow_mode_refcounts(FOREIGNDOM) )
1995 break;
1997 okay = get_page_and_type_from_pagenr(mfn, type, FOREIGNDOM);
1998 if ( unlikely(!okay) )
2000 MEM_LOG("Error while pinning mfn %lx", mfn);
2001 break;
2004 if ( unlikely(test_and_set_bit(_PGT_pinned,
2005 &page->u.inuse.type_info)) )
2007 MEM_LOG("Mfn %lx already pinned", mfn);
2008 put_page_and_type(page);
2009 okay = 0;
2010 break;
2013 break;
2015 case MMUEXT_UNPIN_TABLE:
2016 if ( shadow_mode_refcounts(d) )
2017 break;
2019 if ( unlikely(!(okay = get_page_from_pagenr(mfn, d))) )
2021 MEM_LOG("Mfn %lx bad domain (dom=%p)",
2022 mfn, page_get_owner(page));
2024 else if ( likely(test_and_clear_bit(_PGT_pinned,
2025 &page->u.inuse.type_info)) )
2027 put_page_and_type(page);
2028 put_page(page);
2030 else
2032 okay = 0;
2033 put_page(page);
2034 MEM_LOG("Mfn %lx not pinned", mfn);
2036 break;
2038 case MMUEXT_NEW_BASEPTR:
2039 mfn = gmfn_to_mfn(current->domain, mfn);
2040 okay = new_guest_cr3(mfn);
2041 percpu_info[cpu].deferred_ops &= ~DOP_FLUSH_TLB;
2042 break;
2044 #ifdef __x86_64__
2045 case MMUEXT_NEW_USER_BASEPTR:
2046 okay = get_page_and_type_from_pagenr(
2047 mfn, PGT_root_page_table, d);
2048 if ( unlikely(!okay) )
2050 MEM_LOG("Error while installing new mfn %lx", mfn);
2052 else
2054 unsigned long old_mfn =
2055 pagetable_get_pfn(v->arch.guest_table_user);
2056 v->arch.guest_table_user = pagetable_from_pfn(mfn);
2057 if ( old_mfn != 0 )
2058 put_page_and_type(mfn_to_page(old_mfn));
2060 break;
2061 #endif
2063 case MMUEXT_TLB_FLUSH_LOCAL:
2064 percpu_info[cpu].deferred_ops |= DOP_FLUSH_TLB;
2065 break;
2067 case MMUEXT_INVLPG_LOCAL:
2068 if ( shadow_mode_enabled(d) )
2069 shadow_invlpg(v, op.arg1.linear_addr);
2070 local_flush_tlb_one(op.arg1.linear_addr);
2071 break;
2073 case MMUEXT_TLB_FLUSH_MULTI:
2074 case MMUEXT_INVLPG_MULTI:
2076 unsigned long vmask;
2077 cpumask_t pmask;
2078 if ( unlikely(get_user(vmask, (unsigned long *)op.arg2.vcpumask)) )
2080 okay = 0;
2081 break;
2083 pmask = vcpumask_to_pcpumask(d, vmask);
2084 if ( op.cmd == MMUEXT_TLB_FLUSH_MULTI )
2085 flush_tlb_mask(pmask);
2086 else
2087 flush_tlb_one_mask(pmask, op.arg1.linear_addr);
2088 break;
2091 case MMUEXT_TLB_FLUSH_ALL:
2092 flush_tlb_mask(d->domain_dirty_cpumask);
2093 break;
2095 case MMUEXT_INVLPG_ALL:
2096 flush_tlb_one_mask(d->domain_dirty_cpumask, op.arg1.linear_addr);
2097 break;
2099 case MMUEXT_FLUSH_CACHE:
2100 if ( unlikely(!cache_flush_permitted(d)) )
2102 MEM_LOG("Non-physdev domain tried to FLUSH_CACHE.");
2103 okay = 0;
2105 else
2107 wbinvd();
2109 break;
2111 case MMUEXT_SET_LDT:
2113 unsigned long ptr = op.arg1.linear_addr;
2114 unsigned long ents = op.arg2.nr_ents;
2116 if ( shadow_mode_external(d) )
2118 MEM_LOG("ignoring SET_LDT hypercall from external "
2119 "domain %u", d->domain_id);
2120 okay = 0;
2122 else if ( ((ptr & (PAGE_SIZE-1)) != 0) ||
2123 (ents > 8192) ||
2124 !array_access_ok(ptr, ents, LDT_ENTRY_SIZE) )
2126 okay = 0;
2127 MEM_LOG("Bad args to SET_LDT: ptr=%lx, ents=%lx", ptr, ents);
2129 else if ( (v->arch.guest_context.ldt_ents != ents) ||
2130 (v->arch.guest_context.ldt_base != ptr) )
2132 invalidate_shadow_ldt(v);
2133 v->arch.guest_context.ldt_base = ptr;
2134 v->arch.guest_context.ldt_ents = ents;
2135 load_LDT(v);
2136 percpu_info[cpu].deferred_ops &= ~DOP_RELOAD_LDT;
2137 if ( ents != 0 )
2138 percpu_info[cpu].deferred_ops |= DOP_RELOAD_LDT;
2140 break;
2143 default:
2144 MEM_LOG("Invalid extended pt command 0x%x", op.cmd);
2145 okay = 0;
2146 break;
2149 if ( unlikely(!okay) )
2151 rc = -EINVAL;
2152 break;
2155 guest_handle_add_offset(uops, 1);
2158 out:
2159 process_deferred_ops(cpu);
2161 /* Add incremental work we have done to the @done output parameter. */
2162 done += i;
2163 if ( unlikely(!guest_handle_is_null(pdone)) )
2164 copy_to_guest(pdone, &done, 1);
2166 UNLOCK_BIGLOCK(d);
2167 return rc;
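/*
 * Guest-side usage sketch (not part of this file; assumes the usual
 * HYPERVISOR_mmuext_op() hypercall wrapper and the public mmuext_op
 * structure from the guest's Xen interface headers).  Pinning a frame as a
 * root page table and later unpinning it looks roughly like this:
 */
static int example_pin_root_pt(unsigned long mfn)
{
    struct mmuext_op op;

    op.cmd      = MMUEXT_PIN_L2_TABLE;   /* L2 is the root on 32-bit non-PAE */
    op.arg1.mfn = mfn;
    return HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF);
}

static int example_unpin(unsigned long mfn)
{
    struct mmuext_op op;

    op.cmd      = MMUEXT_UNPIN_TABLE;
    op.arg1.mfn = mfn;
    return HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF);
}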
2170 int do_mmu_update(
2171 XEN_GUEST_HANDLE(mmu_update_t) ureqs,
2172 unsigned int count,
2173 XEN_GUEST_HANDLE(uint) pdone,
2174 unsigned int foreigndom)
2176 struct mmu_update req;
2177 void *va;
2178 unsigned long gpfn, gmfn, mfn;
2179 struct page_info *page;
2180 int rc = 0, okay = 1, i = 0, cpu = smp_processor_id();
2181 unsigned int cmd, done = 0;
2182 struct vcpu *v = current;
2183 struct domain *d = v->domain;
2184 unsigned long type_info;
2185 struct domain_mmap_cache mapcache, sh_mapcache;
2187 LOCK_BIGLOCK(d);
2189 cleanup_writable_pagetable(d);
2191 if ( unlikely(shadow_mode_enabled(d)) )
2192 check_pagetable(v, "pre-mmu"); /* debug */
2194 if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
2196 count &= ~MMU_UPDATE_PREEMPTED;
2197 if ( unlikely(!guest_handle_is_null(pdone)) )
2198 (void)copy_from_guest(&done, pdone, 1);
2201 domain_mmap_cache_init(&mapcache);
2202 domain_mmap_cache_init(&sh_mapcache);
2204 if ( !set_foreigndom(cpu, foreigndom) )
2206 rc = -ESRCH;
2207 goto out;
2210 perfc_incrc(calls_to_mmu_update);
2211 perfc_addc(num_page_updates, count);
2212 perfc_incr_histo(bpt_updates, count, PT_UPDATES);
2214 if ( unlikely(!guest_handle_okay(ureqs, count)) )
2216 rc = -EFAULT;
2217 goto out;
2220 for ( i = 0; i < count; i++ )
2222 if ( hypercall_preempt_check() )
2224 rc = hypercall_create_continuation(
2225 __HYPERVISOR_mmu_update, "hihi",
2226 ureqs, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
2227 break;
2230 if ( unlikely(__copy_from_guest(&req, ureqs, 1) != 0) )
2232 MEM_LOG("Bad __copy_from_guest");
2233 rc = -EFAULT;
2234 break;
2237 cmd = req.ptr & (sizeof(l1_pgentry_t)-1);
2238 okay = 0;
2240 switch ( cmd )
2242 /*
2243 * MMU_NORMAL_PT_UPDATE: Normal update to any level of page table.
2244 */
2245 case MMU_NORMAL_PT_UPDATE:
2247 gmfn = req.ptr >> PAGE_SHIFT;
2248 mfn = gmfn_to_mfn(d, gmfn);
2250 if ( unlikely(!get_page_from_pagenr(mfn, current->domain)) )
2252 MEM_LOG("Could not get page for normal update");
2253 break;
2256 va = map_domain_page_with_cache(mfn, &mapcache);
2257 va = (void *)((unsigned long)va +
2258 (unsigned long)(req.ptr & ~PAGE_MASK));
2259 page = mfn_to_page(mfn);
2261 switch ( (type_info = page->u.inuse.type_info) & PGT_type_mask )
2263 case PGT_l1_page_table:
2264 case PGT_l2_page_table:
2265 case PGT_l3_page_table:
2266 case PGT_l4_page_table:
2268 ASSERT(!shadow_mode_refcounts(d));
2269 if ( unlikely(!get_page_type(
2270 page, type_info & (PGT_type_mask|PGT_va_mask))) )
2271 goto not_a_pt;
2273 switch ( type_info & PGT_type_mask )
2275 case PGT_l1_page_table:
2277 l1_pgentry_t l1e = l1e_from_intpte(req.val);
2278 okay = mod_l1_entry(va, l1e);
2279 if ( okay && unlikely(shadow_mode_enabled(d)) )
2280 shadow_l1_normal_pt_update(
2281 d, req.ptr, l1e, &sh_mapcache);
2283 break;
2284 case PGT_l2_page_table:
2286 l2_pgentry_t l2e = l2e_from_intpte(req.val);
2287 okay = mod_l2_entry(
2288 (l2_pgentry_t *)va, l2e, mfn, type_info);
2289 if ( okay && unlikely(shadow_mode_enabled(d)) )
2290 shadow_l2_normal_pt_update(
2291 d, req.ptr, l2e, &sh_mapcache);
2293 break;
2294 #if CONFIG_PAGING_LEVELS >= 3
2295 case PGT_l3_page_table:
2297 l3_pgentry_t l3e = l3e_from_intpte(req.val);
2298 okay = mod_l3_entry(va, l3e, mfn, type_info);
2299 if ( okay && unlikely(shadow_mode_enabled(d)) )
2300 shadow_l3_normal_pt_update(
2301 d, req.ptr, l3e, &sh_mapcache);
2303 break;
2304 #endif
2305 #if CONFIG_PAGING_LEVELS >= 4
2306 case PGT_l4_page_table:
2308 l4_pgentry_t l4e = l4e_from_intpte(req.val);
2309 okay = mod_l4_entry(va, l4e, mfn, type_info);
2310 if ( okay && unlikely(shadow_mode_enabled(d)) )
2311 shadow_l4_normal_pt_update(
2312 d, req.ptr, l4e, &sh_mapcache);
2314 break;
2315 #endif
2318 put_page_type(page);
2320 break;
2322 default:
2323 not_a_pt:
2325 if ( unlikely(!get_page_type(page, PGT_writable_page)) )
2326 break;
2328 if ( shadow_mode_enabled(d) )
2330 shadow_lock(d);
2331 __mark_dirty(d, mfn);
2332 if ( page_is_page_table(page) && !page_out_of_sync(page) )
2333 shadow_mark_mfn_out_of_sync(v, gmfn, mfn);
2336 *(intpte_t *)va = req.val;
2337 okay = 1;
2339 if ( shadow_mode_enabled(d) )
2340 shadow_unlock(d);
2342 put_page_type(page);
2344 break;
2347 unmap_domain_page_with_cache(va, &mapcache);
2349 put_page(page);
2350 break;
2352 case MMU_MACHPHYS_UPDATE:
2354 if ( shadow_mode_translate(FOREIGNDOM) )
2356 MEM_LOG("can't mutate m2p table of translate mode guest");
2357 break;
2360 mfn = req.ptr >> PAGE_SHIFT;
2361 gpfn = req.val;
2363 if ( unlikely(!get_page_from_pagenr(mfn, FOREIGNDOM)) )
2365 MEM_LOG("Could not get page for mach->phys update");
2366 break;
2369 set_gpfn_from_mfn(mfn, gpfn);
2370 okay = 1;
2372 mark_dirty(FOREIGNDOM, mfn);
2374 put_page(mfn_to_page(mfn));
2375 break;
2377 default:
2378 MEM_LOG("Invalid page update command %x", cmd);
2379 break;
2382 if ( unlikely(!okay) )
2384 rc = -EINVAL;
2385 break;
2388 guest_handle_add_offset(ureqs, 1);
2391 out:
2392 domain_mmap_cache_destroy(&mapcache);
2393 domain_mmap_cache_destroy(&sh_mapcache);
2395 process_deferred_ops(cpu);
2397 /* Add incremental work we have done to the @done output parameter. */
2398 done += i;
2399 if ( unlikely(!guest_handle_is_null(pdone)) )
2400 copy_to_guest(pdone, &done, 1);
2402 if ( unlikely(shadow_mode_enabled(d)) )
2403 check_pagetable(v, "post-mmu"); /* debug */
2405 UNLOCK_BIGLOCK(d);
2406 return rc;
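/*
 * Guest-side usage sketch (not part of this file; assumes the usual
 * HYPERVISOR_mmu_update() wrapper).  The low bits of 'ptr' carry the
 * command, as decoded above: MMU_NORMAL_PT_UPDATE rewrites one page-table
 * entry, MMU_MACHPHYS_UPDATE sets one machine-to-phys entry.
 */
static int example_write_pte(unsigned long pte_maddr, u64 new_pte)
{
    struct mmu_update u;

    u.ptr = pte_maddr | MMU_NORMAL_PT_UPDATE;  /* machine address of the PTE */
    u.val = new_pte;
    return HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF);
}

static int example_set_m2p(unsigned long mfn, unsigned long pfn)
{
    struct mmu_update u;

    u.ptr = ((u64)mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE;
    u.val = pfn;
    return HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF);
}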
2410 static int create_grant_pte_mapping(
2411 unsigned long pte_addr, l1_pgentry_t _nl1e, struct vcpu *v)
2413 int rc = GNTST_okay;
2414 void *va;
2415 unsigned long gmfn, mfn;
2416 struct page_info *page;
2417 u32 type_info;
2418 l1_pgentry_t ol1e;
2419 struct domain *d = v->domain;
2421 ASSERT(spin_is_locked(&d->big_lock));
2422 ASSERT(!shadow_mode_refcounts(d));
2424 gmfn = pte_addr >> PAGE_SHIFT;
2425 mfn = gmfn_to_mfn(d, gmfn);
2427 if ( unlikely(!get_page_from_pagenr(mfn, current->domain)) )
2429 MEM_LOG("Could not get page for normal update");
2430 return GNTST_general_error;
2433 va = map_domain_page(mfn);
2434 va = (void *)((unsigned long)va + (pte_addr & ~PAGE_MASK));
2435 page = mfn_to_page(mfn);
2437 type_info = page->u.inuse.type_info;
2438 if ( ((type_info & PGT_type_mask) != PGT_l1_page_table) ||
2439 !get_page_type(page, type_info & (PGT_type_mask|PGT_va_mask)) )
2441 MEM_LOG("Grant map attempted to update a non-L1 page");
2442 rc = GNTST_general_error;
2443 goto failed;
2446 ol1e = *(l1_pgentry_t *)va;
2447 if ( !update_l1e(va, ol1e, _nl1e) )
2449 put_page_type(page);
2450 rc = GNTST_general_error;
2451 goto failed;
2454 put_page_from_l1e(ol1e, d);
2456 if ( unlikely(shadow_mode_enabled(d)) )
2458 struct domain_mmap_cache sh_mapcache;
2459 domain_mmap_cache_init(&sh_mapcache);
2460 shadow_l1_normal_pt_update(d, pte_addr, _nl1e, &sh_mapcache);
2461 domain_mmap_cache_destroy(&sh_mapcache);
2464 put_page_type(page);
2466 failed:
2467 unmap_domain_page(va);
2468 put_page(page);
2469 return rc;
2472 static int destroy_grant_pte_mapping(
2473 unsigned long addr, unsigned long frame, struct domain *d)
2475 int rc = GNTST_okay;
2476 void *va;
2477 unsigned long gmfn, mfn;
2478 struct page_info *page;
2479 u32 type_info;
2480 l1_pgentry_t ol1e;
2482 ASSERT(!shadow_mode_refcounts(d));
2484 gmfn = addr >> PAGE_SHIFT;
2485 mfn = gmfn_to_mfn(d, gmfn);
2487 if ( unlikely(!get_page_from_pagenr(mfn, current->domain)) )
2489 MEM_LOG("Could not get page for normal update");
2490 return GNTST_general_error;
2493 va = map_domain_page(mfn);
2494 va = (void *)((unsigned long)va + (addr & ~PAGE_MASK));
2495 page = mfn_to_page(mfn);
2497 type_info = page->u.inuse.type_info;
2498 if ( ((type_info & PGT_type_mask) != PGT_l1_page_table) ||
2499 !get_page_type(page, type_info & (PGT_type_mask|PGT_va_mask)) )
2501 MEM_LOG("Grant map attempted to update a non-L1 page");
2502 rc = GNTST_general_error;
2503 goto failed;
2506 if ( __copy_from_user(&ol1e, (l1_pgentry_t *)va, sizeof(ol1e)) )
2508 put_page_type(page);
2509 rc = GNTST_general_error;
2510 goto failed;
2513 /* Check that the virtual address supplied is actually mapped to frame. */
2514 if ( unlikely((l1e_get_intpte(ol1e) >> PAGE_SHIFT) != frame) )
2516 MEM_LOG("PTE entry %lx for address %lx doesn't match frame %lx",
2517 (unsigned long)l1e_get_intpte(ol1e), addr, frame);
2518 put_page_type(page);
2519 rc = GNTST_general_error;
2520 goto failed;
2523 /* Delete pagetable entry. */
2524 if ( unlikely(!update_l1e((l1_pgentry_t *)va, ol1e, l1e_empty())) )
2526 MEM_LOG("Cannot delete PTE entry at %p", va);
2527 put_page_type(page);
2528 rc = GNTST_general_error;
2529 goto failed;
2532 if ( unlikely(shadow_mode_enabled(d)) )
2534 struct domain_mmap_cache sh_mapcache;
2535 domain_mmap_cache_init(&sh_mapcache);
2536 shadow_l1_normal_pt_update(d, addr, l1e_empty(), &sh_mapcache);
2537 domain_mmap_cache_destroy(&sh_mapcache);
2540 put_page_type(page);
2542 failed:
2543 unmap_domain_page(va);
2544 put_page(page);
2545 return rc;
2549 static int create_grant_va_mapping(
2550 unsigned long va, l1_pgentry_t _nl1e, struct vcpu *v)
2552 l1_pgentry_t *pl1e, ol1e;
2553 struct domain *d = v->domain;
2555 ASSERT(spin_is_locked(&d->big_lock));
2556 ASSERT(!shadow_mode_refcounts(d));
2558 /*
2559 * This is actually overkill - we don't need to sync the L1 itself,
2560 * just everything involved in getting to this L1 (i.e. we need
2561 * linear_pg_table[l1_linear_offset(va)] to be in sync)...
2562 */
2563 __shadow_sync_va(v, va);
2565 pl1e = &linear_pg_table[l1_linear_offset(va)];
2567 if ( unlikely(__copy_from_user(&ol1e, pl1e, sizeof(ol1e)) != 0) ||
2568 !update_l1e(pl1e, ol1e, _nl1e) )
2569 return GNTST_general_error;
2571 put_page_from_l1e(ol1e, d);
2573 if ( unlikely(shadow_mode_enabled(d)) )
2574 shadow_do_update_va_mapping(va, _nl1e, v);
2576 return GNTST_okay;
2579 static int destroy_grant_va_mapping(
2580 unsigned long addr, unsigned long frame)
2582 l1_pgentry_t *pl1e, ol1e;
2584 pl1e = &linear_pg_table[l1_linear_offset(addr)];
2586 if ( unlikely(__get_user(ol1e.l1, &pl1e->l1) != 0) )
2588 MEM_LOG("Could not find PTE entry for address %lx", addr);
2589 return GNTST_general_error;
2592 /*
2593 * Check that the virtual address supplied is actually mapped to
2594 * frame.
2595 */
2596 if ( unlikely(l1e_get_pfn(ol1e) != frame) )
2598 MEM_LOG("PTE entry %lx for address %lx doesn't match frame %lx",
2599 l1e_get_pfn(ol1e), addr, frame);
2600 return GNTST_general_error;
2603 /* Delete pagetable entry. */
2604 if ( unlikely(!update_l1e(pl1e, ol1e, l1e_empty())) )
2606 MEM_LOG("Cannot delete PTE entry at %p", (unsigned long *)pl1e);
2607 return GNTST_general_error;
2610 return GNTST_okay;
2613 int create_grant_host_mapping(
2614 unsigned long addr, unsigned long frame, unsigned int flags)
2616 l1_pgentry_t pte = l1e_from_pfn(frame, GRANT_PTE_FLAGS);
2618 if ( (flags & GNTMAP_application_map) )
2619 l1e_add_flags(pte,_PAGE_USER);
2620 if ( !(flags & GNTMAP_readonly) )
2621 l1e_add_flags(pte,_PAGE_RW);
2623 if ( flags & GNTMAP_contains_pte )
2624 return create_grant_pte_mapping(addr, pte, current);
2625 return create_grant_va_mapping(addr, pte, current);
2628 int destroy_grant_host_mapping(
2629 unsigned long addr, unsigned long frame, unsigned int flags)
2631 if ( flags & GNTMAP_contains_pte )
2632 return destroy_grant_pte_mapping(addr, frame, current->domain);
2633 return destroy_grant_va_mapping(addr, frame);
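/*
 * Illustrative note (an assumption about the caller, which lives in the
 * common grant-table code, not in this file): when a grant is mapped with
 * GNTMAP_contains_pte, 'addr' is the guest-physical address of the PTE to
 * rewrite; otherwise it is a linear address in the mapping domain.  A
 * writable, user-visible mapping of a granted frame at linear address 'va'
 * would be set up roughly as
 *
 *     rc = create_grant_host_mapping(va, frame, GNTMAP_application_map);
 *
 * and torn down later with destroy_grant_host_mapping(va, frame, flags).
 */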
2636 int steal_page(
2637 struct domain *d, struct page_info *page, unsigned int memflags)
2639 u32 _d, _nd, x, y;
2641 spin_lock(&d->page_alloc_lock);
2643 /*
2644 * The tricky bit: atomically release ownership while there is just one
2645 * benign reference to the page (PGC_allocated). If that reference
2646 * disappears then the deallocation routine will safely spin.
2647 */
2648 _d = pickle_domptr(d);
2649 _nd = page->u.inuse._domain;
2650 y = page->count_info;
2651 do {
2652 x = y;
2653 if (unlikely((x & (PGC_count_mask|PGC_allocated)) !=
2654 (1 | PGC_allocated)) || unlikely(_nd != _d)) {
2655 DPRINTK("gnttab_transfer: Bad page %p: ed=%p(%u), sd=%p,"
2656 " caf=%08x, taf=%" PRtype_info "\n",
2657 (void *) page_to_mfn(page),
2658 d, d->domain_id, unpickle_domptr(_nd), x,
2659 page->u.inuse.type_info);
2660 spin_unlock(&d->page_alloc_lock);
2661 return -1;
2663 __asm__ __volatile__(
2664 LOCK_PREFIX "cmpxchg8b %2"
2665 : "=d" (_nd), "=a" (y),
2666 "=m" (*(volatile u64 *)(&page->count_info))
2667 : "0" (_d), "1" (x), "c" (NULL), "b" (x) );
2668 } while (unlikely(_nd != _d) || unlikely(y != x));
2670 /*
2671 * Unlink from 'd'. At least one reference remains (now anonymous), so
2672 * no one else is spinning to try to delete this page from 'd'.
2673 */
2674 if ( !(memflags & MEMF_no_refcount) )
2675 d->tot_pages--;
2676 list_del(&page->list);
2678 spin_unlock(&d->page_alloc_lock);
2680 return 0;
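/*
 * Note on the cmpxchg8b above (layout assumption, required by the cast of
 * &page->count_info to u64: count_info is immediately followed in memory by
 * u.inuse._domain).  The instruction compares EDX:EAX, i.e. the snapshot
 * (_domain : count_info), against the pair in memory and, only if both still
 * match, installs ECX:EBX -- a zero (anonymous) owner together with the
 * unchanged count.  Ownership is therefore dropped in a single atomic step,
 * and only while exactly one benign PGC_allocated reference is held.
 */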
2683 int do_update_va_mapping(unsigned long va, u64 val64,
2684 unsigned long flags)
2686 l1_pgentry_t val = l1e_from_intpte(val64);
2687 struct vcpu *v = current;
2688 struct domain *d = v->domain;
2689 unsigned int cpu = smp_processor_id();
2690 unsigned long vmask, bmap_ptr;
2691 cpumask_t pmask;
2692 int rc = 0;
2694 perfc_incrc(calls_to_update_va);
2696 if ( unlikely(!__addr_ok(va) && !shadow_mode_external(d)) )
2697 return -EINVAL;
2699 LOCK_BIGLOCK(d);
2701 cleanup_writable_pagetable(d);
2703 if ( unlikely(shadow_mode_enabled(d)) )
2704 check_pagetable(v, "pre-va"); /* debug */
2706 if ( unlikely(!mod_l1_entry(&linear_pg_table[l1_linear_offset(va)],
2707 val)) )
2708 rc = -EINVAL;
2710 if ( likely(rc == 0) && unlikely(shadow_mode_enabled(d)) )
2712 if ( unlikely(percpu_info[cpu].foreign &&
2713 (shadow_mode_translate(d) ||
2714 shadow_mode_translate(percpu_info[cpu].foreign))) )
2716 /*
2717 * The foreign domain's pfns are in a different namespace. There's
2718 * not enough information in just a gpte to figure out how to
2719 * (re-)shadow this entry.
2720 */
2721 domain_crash(d);
2724 rc = shadow_do_update_va_mapping(va, val, v);
2726 check_pagetable(v, "post-va"); /* debug */
2729 switch ( flags & UVMF_FLUSHTYPE_MASK )
2731 case UVMF_TLB_FLUSH:
2732 switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
2734 case UVMF_LOCAL:
2735 if ( unlikely(shadow_mode_enabled(d)) )
2736 shadow_sync_all(d);
2737 local_flush_tlb();
2738 break;
2739 case UVMF_ALL:
2740 flush_tlb_mask(d->domain_dirty_cpumask);
2741 break;
2742 default:
2743 if ( unlikely(get_user(vmask, (unsigned long *)bmap_ptr)) )
2744 rc = -EFAULT;
2745 pmask = vcpumask_to_pcpumask(d, vmask);
2746 flush_tlb_mask(pmask);
2747 break;
2749 break;
2751 case UVMF_INVLPG:
2752 switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
2754 case UVMF_LOCAL:
2755 if ( unlikely(shadow_mode_enabled(d)) )
2756 shadow_invlpg(current, va);
2757 local_flush_tlb_one(va);
2758 break;
2759 case UVMF_ALL:
2760 flush_tlb_one_mask(d->domain_dirty_cpumask, va);
2761 break;
2762 default:
2763 if ( unlikely(get_user(vmask, (unsigned long *)bmap_ptr)) )
2764 rc = -EFAULT;
2765 pmask = vcpumask_to_pcpumask(d, vmask);
2766 flush_tlb_one_mask(pmask, va);
2767 break;
2769 break;
2772 process_deferred_ops(cpu);
2774 UNLOCK_BIGLOCK(d);
2776 return rc;
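/*
 * Guest-side usage sketch (not part of this file; assumes an
 * HYPERVISOR_update_va_mapping() wrapper that, like the hypercall above,
 * takes the new entry as a 64-bit value).  Remapping one linear address and
 * flushing only that entry on the local CPU:
 */
static int example_remap_va(unsigned long va, unsigned long mfn)
{
    u64 pte = ((u64)mfn << PAGE_SHIFT) | _PAGE_PRESENT | _PAGE_RW;
    return HYPERVISOR_update_va_mapping(va, pte, UVMF_INVLPG | UVMF_LOCAL);
}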
2779 int do_update_va_mapping_otherdomain(unsigned long va, u64 val64,
2780 unsigned long flags,
2781 domid_t domid)
2783 unsigned int cpu = smp_processor_id();
2784 int rc;
2786 if ( unlikely(!IS_PRIV(current->domain)) )
2787 return -EPERM;
2789 if ( !set_foreigndom(cpu, domid) )
2790 return -ESRCH;
2792 rc = do_update_va_mapping(va, val64, flags);
2794 return rc;
2799 /*************************
2800 * Descriptor Tables
2801 */
2803 void destroy_gdt(struct vcpu *v)
2805 int i;
2806 unsigned long pfn;
2808 v->arch.guest_context.gdt_ents = 0;
2809 for ( i = 0; i < FIRST_RESERVED_GDT_PAGE; i++ )
2811 if ( (pfn = l1e_get_pfn(v->arch.perdomain_ptes[i])) != 0 )
2812 put_page_and_type(mfn_to_page(pfn));
2813 v->arch.perdomain_ptes[i] = l1e_empty();
2814 v->arch.guest_context.gdt_frames[i] = 0;
2819 long set_gdt(struct vcpu *v,
2820 unsigned long *frames,
2821 unsigned int entries)
2823 struct domain *d = v->domain;
2824 /* NB. There are 512 8-byte entries per GDT page. */
2825 int i, nr_pages = (entries + 511) / 512;
2826 unsigned long mfn;
2828 if ( entries > FIRST_RESERVED_GDT_ENTRY )
2829 return -EINVAL;
2831 shadow_sync_all(d);
2833 /* Check the pages in the new GDT. */
2834 for ( i = 0; i < nr_pages; i++ ) {
2835 mfn = frames[i] = gmfn_to_mfn(d, frames[i]);
2836 if ( !mfn_valid(mfn) ||
2837 !get_page_and_type(mfn_to_page(mfn), d, PGT_gdt_page) )
2838 goto fail;
2841 /* Tear down the old GDT. */
2842 destroy_gdt(v);
2844 /* Install the new GDT. */
2845 v->arch.guest_context.gdt_ents = entries;
2846 for ( i = 0; i < nr_pages; i++ )
2848 v->arch.guest_context.gdt_frames[i] = frames[i];
2849 v->arch.perdomain_ptes[i] =
2850 l1e_from_pfn(frames[i], __PAGE_HYPERVISOR);
2853 return 0;
2855 fail:
2856 while ( i-- > 0 )
2857 put_page_and_type(mfn_to_page(frames[i]));
2858 return -EINVAL;
2862 long do_set_gdt(XEN_GUEST_HANDLE(ulong) frame_list, unsigned int entries)
2864 int nr_pages = (entries + 511) / 512;
2865 unsigned long frames[16];
2866 long ret;
2868 /* Rechecked in set_gdt, but ensures a sane limit for copy_from_guest(). */
2869 if ( entries > FIRST_RESERVED_GDT_ENTRY )
2870 return -EINVAL;
2872 if ( copy_from_guest((unsigned long *)frames, frame_list, nr_pages) )
2873 return -EFAULT;
2875 LOCK_BIGLOCK(current->domain);
2877 if ( (ret = set_gdt(current, frames, entries)) == 0 )
2878 local_flush_tlb();
2880 UNLOCK_BIGLOCK(current->domain);
2882 return ret;
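/*
 * Guest-side usage sketch (not part of this file; assumes the usual
 * HYPERVISOR_set_gdt() wrapper).  frames[] holds the guest frame numbers of
 * the GDT pages, 512 descriptors per page:
 */
static void example_load_gdt(unsigned long gdt_gmfn, unsigned int nr_descs)
{
    unsigned long frames[1] = { gdt_gmfn };   /* nr_descs <= 512 here */

    if ( HYPERVISOR_set_gdt(frames, nr_descs) != 0 )
        BUG();
}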
2886 long do_update_descriptor(u64 pa, u64 desc)
2888 struct domain *dom = current->domain;
2889 unsigned long gmfn = pa >> PAGE_SHIFT;
2890 unsigned long mfn;
2891 unsigned int offset;
2892 struct desc_struct *gdt_pent, d;
2893 struct page_info *page;
2894 long ret = -EINVAL;
2896 offset = ((unsigned int)pa & ~PAGE_MASK) / sizeof(struct desc_struct);
2898 *(u64 *)&d = desc;
2900 LOCK_BIGLOCK(dom);
2902 if ( !VALID_MFN(mfn = gmfn_to_mfn(dom, gmfn)) ||
2903 (((unsigned int)pa % sizeof(struct desc_struct)) != 0) ||
2904 !mfn_valid(mfn) ||
2905 !check_descriptor(&d) )
2907 UNLOCK_BIGLOCK(dom);
2908 return -EINVAL;
2911 page = mfn_to_page(mfn);
2912 if ( unlikely(!get_page(page, dom)) )
2914 UNLOCK_BIGLOCK(dom);
2915 return -EINVAL;
2918 /* Check if the given frame is in use in an unsafe context. */
2919 switch ( page->u.inuse.type_info & PGT_type_mask )
2921 case PGT_gdt_page:
2922 if ( unlikely(!get_page_type(page, PGT_gdt_page)) )
2923 goto out;
2924 break;
2925 case PGT_ldt_page:
2926 if ( unlikely(!get_page_type(page, PGT_ldt_page)) )
2927 goto out;
2928 break;
2929 default:
2930 if ( unlikely(!get_page_type(page, PGT_writable_page)) )
2931 goto out;
2932 break;
2935 if ( shadow_mode_enabled(dom) )
2937 shadow_lock(dom);
2939 __mark_dirty(dom, mfn);
2941 if ( page_is_page_table(page) && !page_out_of_sync(page) )
2942 shadow_mark_mfn_out_of_sync(current, gmfn, mfn);
2945 /* All is good so make the update. */
2946 gdt_pent = map_domain_page(mfn);
2947 memcpy(&gdt_pent[offset], &d, 8);
2948 unmap_domain_page(gdt_pent);
2950 if ( shadow_mode_enabled(dom) )
2951 shadow_unlock(dom);
2953 put_page_type(page);
2955 ret = 0; /* success */
2957 out:
2958 put_page(page);
2960 UNLOCK_BIGLOCK(dom);
2962 return ret;
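/*
 * Guest-side usage sketch (illustrative comment only): 'pa' is the machine
 * address of the 8-byte descriptor slot and 'desc' the new descriptor,
 * passed via the usual HYPERVISOR_update_descriptor() wrapper:
 *
 *     u64 pa = ((u64)gdt_mfn << PAGE_SHIFT) + slot * 8;
 *     if ( HYPERVISOR_update_descriptor(pa, desc) != 0 )
 *         BUG();
 */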
2965 typedef struct e820entry e820entry_t;
2966 DEFINE_XEN_GUEST_HANDLE(e820entry_t);
2968 long arch_memory_op(int op, XEN_GUEST_HANDLE(void) arg)
2970 switch ( op )
2972 case XENMEM_add_to_physmap:
2974 struct xen_add_to_physmap xatp;
2975 unsigned long mfn = 0, gpfn;
2976 struct domain *d;
2978 if ( copy_from_guest(&xatp, arg, 1) )
2979 return -EFAULT;
2981 if ( (d = find_domain_by_id(xatp.domid)) == NULL )
2982 return -ESRCH;
2984 switch ( xatp.space )
2986 case XENMAPSPACE_shared_info:
2987 if ( xatp.idx == 0 )
2988 mfn = virt_to_mfn(d->shared_info);
2989 break;
2990 case XENMAPSPACE_grant_table:
2991 if ( xatp.idx < NR_GRANT_FRAMES )
2992 mfn = virt_to_mfn(d->grant_table->shared) + xatp.idx;
2993 break;
2994 default:
2995 break;
2998 if ( !shadow_mode_translate(d) || (mfn == 0) )
3000 put_domain(d);
3001 return -EINVAL;
3004 LOCK_BIGLOCK(d);
3006 /* Remove previously mapped page if it was present. */
3007 if ( mfn_valid(gmfn_to_mfn(d, xatp.gpfn)) )
3008 guest_remove_page(d, xatp.gpfn);
3010 /* Unmap from old location, if any. */
3011 gpfn = get_gpfn_from_mfn(mfn);
3012 if ( gpfn != INVALID_M2P_ENTRY )
3013 guest_physmap_remove_page(d, gpfn, mfn);
3015 /* Map at new location. */
3016 guest_physmap_add_page(d, xatp.gpfn, mfn);
3018 UNLOCK_BIGLOCK(d);
3020 put_domain(d);
3022 break;
3025 case XENMEM_memory_map:
3027 return -ENOSYS;
3030 case XENMEM_machine_memory_map:
3032 struct xen_memory_map memmap;
3033 XEN_GUEST_HANDLE(e820entry_t) buffer;
3034 int count;
3036 if ( !IS_PRIV(current->domain) )
3037 return -EINVAL;
3039 if ( copy_from_guest(&memmap, arg, 1) )
3040 return -EFAULT;
3041 if ( memmap.nr_entries < e820.nr_map + 1 )
3042 return -EINVAL;
3044 buffer = guest_handle_cast(memmap.buffer, e820entry_t);
3046 count = min((unsigned int)e820.nr_map, memmap.nr_entries);
3047 if ( copy_to_guest(buffer, &e820.map[0], count) < 0 )
3048 return -EFAULT;
3050 memmap.nr_entries = count;
3052 if ( copy_to_guest(arg, &memmap, 1) )
3053 return -EFAULT;
3055 return 0;
3058 case XENMEM_machphys_mapping:
3060 struct xen_machphys_mapping mapping = {
3061 .v_start = MACH2PHYS_VIRT_START,
3062 .v_end = MACH2PHYS_VIRT_END,
3063 .max_mfn = MACH2PHYS_NR_ENTRIES - 1
3064 };
3066 if ( copy_to_guest(arg, &mapping, 1) )
3067 return -EFAULT;
3069 return 0;
3072 default:
3073 return subarch_memory_op(op, arg);
3076 return 0;
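/*
 * Guest-side usage sketch (not part of this file; assumes the usual
 * HYPERVISOR_memory_op() wrapper).  A translated guest maps its shared-info
 * frame into its own pseudophysical space like this:
 */
static void example_map_shared_info(unsigned long gpfn)
{
    struct xen_add_to_physmap xatp;

    xatp.domid = DOMID_SELF;
    xatp.space = XENMAPSPACE_shared_info;
    xatp.idx   = 0;
    xatp.gpfn  = gpfn;
    if ( HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp) != 0 )
        BUG();
}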
3080 /*************************
3081 * Writable Pagetables
3082 */
3084 #ifdef VVERBOSE
3085 int ptwr_debug = 0x0;
3086 #define PTWR_PRINTK(_f, _a...) \
3087 do { if ( unlikely(ptwr_debug) ) printk( _f , ## _a ); } while ( 0 )
3088 #define PTWR_PRINT_WHICH (which ? 'I' : 'A')
3089 #else
3090 #define PTWR_PRINTK(_f, _a...) ((void)0)
3091 #endif
3094 #ifdef PERF_ARRAYS
3096 /**************** writeable pagetables profiling functions *****************/
3098 #define ptwr_eip_buckets 256
3100 int ptwr_eip_stat_threshold[] = {1, 10, 50, 100, L1_PAGETABLE_ENTRIES};
3102 #define ptwr_eip_stat_thresholdN (sizeof(ptwr_eip_stat_threshold)/sizeof(int))
3104 typedef struct {
3105 unsigned long eip;
3106 domid_t id;
3107 u32 val[ptwr_eip_stat_thresholdN];
3108 } ptwr_eip_stat_t;
3110 ptwr_eip_stat_t ptwr_eip_stats[ptwr_eip_buckets];
3112 static inline unsigned int ptwr_eip_stat_hash( unsigned long eip, domid_t id )
3114 return (((unsigned long) id) ^ eip ^ (eip>>8) ^ (eip>>16) ^ (eip>>24)) %
3115 ptwr_eip_buckets;
3118 static void ptwr_eip_stat_inc(u32 *n)
3120 unsigned int i, j;
3122 if ( ++(*n) != 0 )
3123 return;
3125 *n = ~0;
3127 /* Re-scale all buckets. */
3128 for ( i = 0; i < ptwr_eip_buckets; i++ )
3129 for ( j = 0; j < ptwr_eip_stat_thresholdN; j++ )
3130 ptwr_eip_stats[i].val[j] >>= 1;
3133 static void ptwr_eip_stat_update(unsigned long eip, domid_t id, int modified)
3135 unsigned int i, j, b;
3137 i = b = ptwr_eip_stat_hash(eip, id);
3139 do
3141 if ( !ptwr_eip_stats[i].eip )
3143 /* doesn't exist */
3144 ptwr_eip_stats[i].eip = eip;
3145 ptwr_eip_stats[i].id = id;
3146 memset(ptwr_eip_stats[i].val,0, sizeof(ptwr_eip_stats[i].val));
3149 if ( ptwr_eip_stats[i].eip == eip && ptwr_eip_stats[i].id == id)
3151 for ( j = 0; j < ptwr_eip_stat_thresholdN; j++ )
3152 if ( modified <= ptwr_eip_stat_threshold[j] )
3153 break;
3154 BUG_ON(j >= ptwr_eip_stat_thresholdN);
3155 ptwr_eip_stat_inc(&ptwr_eip_stats[i].val[j]);
3156 return;
3159 i = (i+1) % ptwr_eip_buckets;
3161 while ( i != b );
3163 printk("ptwr_eip_stat: too many EIPs in use!\n");
3165 ptwr_eip_stat_print();
3166 ptwr_eip_stat_reset();
3169 void ptwr_eip_stat_reset(void)
3171 memset(ptwr_eip_stats, 0, sizeof(ptwr_eip_stats));
3174 void ptwr_eip_stat_print(void)
3176 struct domain *e;
3177 domid_t d;
3178 unsigned int i, j;
3180 for_each_domain( e )
3182 d = e->domain_id;
3184 for ( i = 0; i < ptwr_eip_buckets; i++ )
3186 if ( !ptwr_eip_stats[i].eip || ptwr_eip_stats[i].id != d )
3187 continue;
3189 printk("D %5d eip %p ",
3190 ptwr_eip_stats[i].id, (void *)ptwr_eip_stats[i].eip);
3192 for ( j = 0; j < ptwr_eip_stat_thresholdN; j++ )
3193 printk("<=%u %4u \t",
3194 ptwr_eip_stat_threshold[j],
3195 ptwr_eip_stats[i].val[j]);
3196 printk("\n");
3201 #else /* PERF_ARRAYS */
3203 #define ptwr_eip_stat_update(eip, id, modified) ((void)0)
3205 #endif
3207 /*******************************************************************/
3209 /* Re-validate a given p.t. page, given its prior snapshot */
3210 int revalidate_l1(
3211 struct domain *d, l1_pgentry_t *l1page, l1_pgentry_t *snapshot)
3213 l1_pgentry_t ol1e, nl1e;
3214 int modified = 0, i;
3216 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
3218 ol1e = snapshot[i];
3219 nl1e = l1page[i];
3221 if ( likely(l1e_get_intpte(ol1e) == l1e_get_intpte(nl1e)) )
3222 continue;
3224 /* Update number of entries modified. */
3225 modified++;
3227 /*
3228 * Fast path for PTEs that have merely been write-protected
3229 * (e.g., during a Unix fork()). A strict reduction in privilege.
3230 */
3231 if ( likely(l1e_get_intpte(ol1e) == (l1e_get_intpte(nl1e)|_PAGE_RW)) )
3233 if ( likely(l1e_get_flags(nl1e) & _PAGE_PRESENT) )
3234 put_page_type(mfn_to_page(l1e_get_pfn(nl1e)));
3235 continue;
3238 if ( unlikely(!get_page_from_l1e(nl1e, d)) )
3240 /*
3241 * Make the remaining p.t's consistent before crashing, so the
3242 * reference counts are correct.
3243 */
3244 memcpy(&l1page[i], &snapshot[i],
3245 (L1_PAGETABLE_ENTRIES - i) * sizeof(l1_pgentry_t));
3247 /* Crash the offending domain. */
3248 MEM_LOG("ptwr: Could not revalidate l1 page");
3249 domain_crash(d);
3250 break;
3253 put_page_from_l1e(ol1e, d);
3256 return modified;
3260 /* Flush the given writable p.t. page and write-protect it again. */
3261 void ptwr_flush(struct domain *d, const int which)
3263 unsigned long l1va;
3264 l1_pgentry_t *pl1e, pte, *ptep;
3265 l2_pgentry_t *pl2e;
3266 unsigned int modified;
3268 #ifdef CONFIG_X86_64
3269 struct vcpu *v = current;
3270 int user_mode = !(v->arch.flags & TF_kernel_mode);
3271 #endif
3273 ASSERT(!shadow_mode_enabled(d));
3275 if ( unlikely(d->arch.ptwr[which].vcpu != current) )
3276 /* Don't use write_ptbase: it may switch to guest_user on x86/64! */
3277 __write_ptbase(pagetable_get_pfn(
3278 d->arch.ptwr[which].vcpu->arch.guest_table));
3279 else
3280 TOGGLE_MODE();
3282 l1va = d->arch.ptwr[which].l1va;
3283 ptep = (l1_pgentry_t *)&linear_pg_table[l1_linear_offset(l1va)];
3285 /*
3286 * STEP 1. Write-protect the p.t. page so no more updates can occur.
3287 */
3289 if ( unlikely(__get_user(pte.l1, &ptep->l1)) )
3291 MEM_LOG("ptwr: Could not read pte at %p", ptep);
3292 /*
3293 * Really a bug. We could read this PTE during the initial fault,
3294 * and pagetables can't have changed meantime.
3295 */
3296 BUG();
3298 PTWR_PRINTK("[%c] disconnected_l1va at %p is %"PRIpte"\n",
3299 PTWR_PRINT_WHICH, ptep, l1e_get_intpte(pte));
3300 l1e_remove_flags(pte, _PAGE_RW);
3302 /* Write-protect the p.t. page in the guest page table. */
3303 if ( unlikely(__put_user(pte, ptep)) )
3305 MEM_LOG("ptwr: Could not update pte at %p", ptep);
3306 /*
3307 * Really a bug. We could write this PTE during the initial fault,
3308 * and pagetables can't have changed meantime.
3309 */
3310 BUG();
3313 /* Ensure that there are no stale writable mappings in any TLB. */
3314 /* NB. INVLPG is a serialising instruction: flushes pending updates. */
3315 flush_tlb_one_mask(d->domain_dirty_cpumask, l1va);
3316 PTWR_PRINTK("[%c] disconnected_l1va at %p now %"PRIpte"\n",
3317 PTWR_PRINT_WHICH, ptep, l1e_get_intpte(pte));
3319 /*
3320 * STEP 2. Validate any modified PTEs.
3321 */
3323 if ( likely(d == current->domain) )
3325 pl1e = map_domain_page(l1e_get_pfn(pte));
3326 modified = revalidate_l1(d, pl1e, d->arch.ptwr[which].page);
3327 unmap_domain_page(pl1e);
3328 perfc_incr_histo(wpt_updates, modified, PT_UPDATES);
3329 ptwr_eip_stat_update(d->arch.ptwr[which].eip, d->domain_id, modified);
3330 d->arch.ptwr[which].prev_nr_updates = modified;
3332 else
3334 /*
3335 * Must make a temporary global mapping: we are running in the wrong
3336 * address space, so we have no access to our own mapcache.
3337 */
3338 pl1e = map_domain_page_global(l1e_get_pfn(pte));
3339 modified = revalidate_l1(d, pl1e, d->arch.ptwr[which].page);
3340 unmap_domain_page_global(pl1e);
3343 /*
3344 * STEP 3. Reattach the L1 p.t. page into the current address space.
3345 */
3347 if ( which == PTWR_PT_ACTIVE )
3349 pl2e = &__linear_l2_table[d->arch.ptwr[which].l2_idx];
3350 l2e_add_flags(*pl2e, _PAGE_PRESENT);
3353 /*
3354 * STEP 4. Final tidy-up.
3355 */
3357 d->arch.ptwr[which].l1va = 0;
3359 if ( unlikely(d->arch.ptwr[which].vcpu != current) )
3360 write_ptbase(current);
3361 else
3362 TOGGLE_MODE();
3365 static int ptwr_emulated_update(
3366 unsigned long addr,
3367 paddr_t old,
3368 paddr_t val,
3369 unsigned int bytes,
3370 unsigned int do_cmpxchg)
3372 unsigned long pfn, l1va;
3373 struct page_info *page;
3374 l1_pgentry_t pte, ol1e, nl1e, *pl1e;
3375 struct domain *d = current->domain;
3377 /* Aligned access only, thank you. */
3378 if ( !access_ok(addr, bytes) || ((addr & (bytes-1)) != 0) )
3380 MEM_LOG("ptwr_emulate: Unaligned or bad size ptwr access (%d, %lx)",
3381 bytes, addr);
3382 return X86EMUL_UNHANDLEABLE;
3385 /* Turn a sub-word access into a full-word access. */
3386 if ( bytes != sizeof(paddr_t) )
3388 paddr_t full;
3389 unsigned int offset = addr & (sizeof(paddr_t)-1);
3391 /* Align address; read full word. */
3392 addr &= ~(sizeof(paddr_t)-1);
3393 if ( copy_from_user(&full, (void *)addr, sizeof(paddr_t)) )
3395 propagate_page_fault(addr, 0); /* read fault */
3396 return X86EMUL_PROPAGATE_FAULT;
3398 /* Mask out bits provided by caller. */
3399 full &= ~((((paddr_t)1 << (bytes*8)) - 1) << (offset*8));
3400 /* Shift the caller value and OR in the missing bits. */
3401 val &= (((paddr_t)1 << (bytes*8)) - 1);
3402 val <<= (offset)*8;
3403 val |= full;
3404 /* Also fill in missing parts of the cmpxchg old value. */
3405 old &= (((paddr_t)1 << (bytes*8)) - 1);
3406 old <<= (offset)*8;
3407 old |= full;
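        /*
         * Worked example (illustrative only): a 2-byte write of 0xbeef at
         * byte offset 2 of a PTE currently holding 0x00000000c0de0063:
         *   full  = 0x00000000c0de0063
         *   full &= ~(0xffffULL << 16)        ->  0x0000000000000063
         *   val   = (0xbeefULL << 16) | full  ->  0x00000000beef0063
         * Only bytes 2-3 change; 'old' is widened the same way so that the
         * cmpxchg path below compares full words.
         */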
3410 #if 0 /* XXX KAF: I don't think this can happen. */
3411 /*
3412 * We must not emulate an update to a PTE that is temporarily marked
3413 * writable by the batched ptwr logic, else we can corrupt page refcnts!
3414 */
3415 if ( ((l1va = d->arch.ptwr[PTWR_PT_ACTIVE].l1va) != 0) &&
3416 (l1_linear_offset(l1va) == l1_linear_offset(addr)) )
3417 ptwr_flush(d, PTWR_PT_ACTIVE);
3418 if ( ((l1va = d->arch.ptwr[PTWR_PT_INACTIVE].l1va) != 0) &&
3419 (l1_linear_offset(l1va) == l1_linear_offset(addr)) )
3420 ptwr_flush(d, PTWR_PT_INACTIVE);
3421 #else
3422 BUG_ON(((l1va = d->arch.ptwr[PTWR_PT_ACTIVE].l1va) != 0) &&
3423 (l1_linear_offset(l1va) == l1_linear_offset(addr)));
3424 BUG_ON(((l1va = d->arch.ptwr[PTWR_PT_INACTIVE].l1va) != 0) &&
3425 (l1_linear_offset(l1va) == l1_linear_offset(addr)));
3426 #endif
3428 /* Read the PTE that maps the page being updated. */
3429 if ( __copy_from_user(&pte, &linear_pg_table[l1_linear_offset(addr)],
3430 sizeof(pte)) )
3432 MEM_LOG("ptwr_emulate: Cannot read thru linear_pg_table");
3433 return X86EMUL_UNHANDLEABLE;
3436 pfn = l1e_get_pfn(pte);
3437 page = mfn_to_page(pfn);
3439 /* We are looking only for read-only mappings of p.t. pages. */
3440 ASSERT((l1e_get_flags(pte) & (_PAGE_RW|_PAGE_PRESENT)) == _PAGE_PRESENT);
3441 ASSERT((page->u.inuse.type_info & PGT_type_mask) == PGT_l1_page_table);
3442 ASSERT((page->u.inuse.type_info & PGT_count_mask) != 0);
3443 ASSERT(page_get_owner(page) == d);
3445 /* Check the new PTE. */
3446 nl1e = l1e_from_intpte(val);
3447 if ( unlikely(!get_page_from_l1e(nl1e, d)) )
3449 MEM_LOG("ptwr_emulate: could not get_page_from_l1e()");
3450 return X86EMUL_UNHANDLEABLE;
3453 /* Checked successfully: do the update (write or cmpxchg). */
3454 pl1e = map_domain_page(page_to_mfn(page));
3455 pl1e = (l1_pgentry_t *)((unsigned long)pl1e + (addr & ~PAGE_MASK));
3456 if ( do_cmpxchg )
3458 ol1e = l1e_from_intpte(old);
3459 if ( cmpxchg((intpte_t *)pl1e, old, val) != old )
3461 unmap_domain_page(pl1e);
3462 put_page_from_l1e(nl1e, d);
3463 return X86EMUL_CMPXCHG_FAILED;
3466 else
3468 ol1e = *pl1e;
3469 if ( !update_l1e(pl1e, ol1e, nl1e) )
3470 BUG();
3472 unmap_domain_page(pl1e);
3474 /* Finally, drop the old PTE. */
3475 put_page_from_l1e(ol1e, d);
3477 return X86EMUL_CONTINUE;
3480 static int ptwr_emulated_write(
3481 unsigned long addr,
3482 unsigned long val,
3483 unsigned int bytes,
3484 struct x86_emulate_ctxt *ctxt)
3486 return ptwr_emulated_update(addr, 0, val, bytes, 0);
3489 static int ptwr_emulated_cmpxchg(
3490 unsigned long addr,
3491 unsigned long old,
3492 unsigned long new,
3493 unsigned int bytes,
3494 struct x86_emulate_ctxt *ctxt)
3496 return ptwr_emulated_update(addr, old, new, bytes, 1);
3499 static int ptwr_emulated_cmpxchg8b(
3500 unsigned long addr,
3501 unsigned long old,
3502 unsigned long old_hi,
3503 unsigned long new,
3504 unsigned long new_hi,
3505 struct x86_emulate_ctxt *ctxt)
3507 if ( CONFIG_PAGING_LEVELS == 2 )
3508 return X86EMUL_UNHANDLEABLE;
3509 else
3510 return ptwr_emulated_update(
3511 addr, ((u64)old_hi << 32) | old, ((u64)new_hi << 32) | new, 8, 1);
3514 static struct x86_emulate_ops ptwr_emulate_ops = {
3515 .read_std = x86_emulate_read_std,
3516 .write_std = x86_emulate_write_std,
3517 .read_emulated = x86_emulate_read_std,
3518 .write_emulated = ptwr_emulated_write,
3519 .cmpxchg_emulated = ptwr_emulated_cmpxchg,
3520 .cmpxchg8b_emulated = ptwr_emulated_cmpxchg8b
3521 };
3523 /* Write page fault handler: check if guest is trying to modify a PTE. */
3524 int ptwr_do_page_fault(struct domain *d, unsigned long addr,
3525 struct cpu_user_regs *regs)
3527 unsigned long pfn;
3528 struct page_info *page;
3529 l1_pgentry_t *pl1e, pte;
3530 l2_pgentry_t *pl2e, l2e;
3531 int which, flags;
3532 unsigned long l2_idx;
3533 struct x86_emulate_ctxt emul_ctxt;
3535 ASSERT(!shadow_mode_enabled(d));
3537 /*
3538 * Attempt to read the PTE that maps the VA being accessed. By checking for
3539 * PDE validity in the L2 we avoid many expensive fixups in __get_user().
3540 * NB. The L2 entry cannot be detached due to existing ptwr work: the
3541 * caller already checked that.
3542 */
3543 pl2e = &__linear_l2_table[l2_linear_offset(addr)];
3544 if ( __copy_from_user(&l2e, pl2e, sizeof(l2e)) ||
3545 !(l2e_get_flags(l2e) & _PAGE_PRESENT) ||
3546 __copy_from_user(&pte, &linear_pg_table[l1_linear_offset(addr)],
3547 sizeof(pte)) )
3549 return 0;
3552 pfn = l1e_get_pfn(pte);
3553 page = mfn_to_page(pfn);
3555 #ifdef CONFIG_X86_64
3556 #define WRPT_PTE_FLAGS (_PAGE_RW | _PAGE_PRESENT | _PAGE_USER)
3557 #else
3558 #define WRPT_PTE_FLAGS (_PAGE_RW | _PAGE_PRESENT)
3559 #endif
3561 /*
3562 * Check the required flags for a valid wrpt mapping. If the page is
3563 * already writable then we can return straight to the guest (SMP race).
3564 * We decide whether or not to propagate the fault by testing for write
3565 * permissions in page directories by writing back to the linear mapping.
3566 */
3567 if ( (flags = l1e_get_flags(pte) & WRPT_PTE_FLAGS) == WRPT_PTE_FLAGS )
3568 return __put_user(
3569 pte.l1, &linear_pg_table[l1_linear_offset(addr)].l1) ?
3570 0 : EXCRET_not_a_fault;
3572 /* We are looking only for read-only mappings of p.t. pages. */
3573 if ( ((flags | _PAGE_RW) != WRPT_PTE_FLAGS) ||
3574 ((page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table) ||
3575 ((page->u.inuse.type_info & PGT_count_mask) == 0) ||
3576 (page_get_owner(page) != d) )
3578 return 0;
3581 #if 0 /* Leave this in as useful for debugging */
3582 goto emulate;
3583 #endif
3585 PTWR_PRINTK("ptwr_page_fault on l1 pt at va %lx, pfn %lx, eip %lx\n",
3586 addr, pfn, (unsigned long)regs->eip);
3588 /* Get the L2 index at which this L1 p.t. is always mapped. */
3589 l2_idx = page->u.inuse.type_info & PGT_va_mask;
3590 if ( unlikely(l2_idx >= PGT_va_unknown) )
3591 goto emulate; /* Urk! This L1 is mapped in multiple L2 slots! */
3592 l2_idx >>= PGT_va_shift;
3594 if ( unlikely(l2_idx == l2_linear_offset(addr)) )
3595 goto emulate; /* Urk! Pagetable maps itself! */
3597 /*
3598 * Is the L1 p.t. mapped into the current address space? If so we call it
3599 * an ACTIVE p.t., otherwise it is INACTIVE.
3600 */
3601 pl2e = &__linear_l2_table[l2_idx];
3602 which = PTWR_PT_INACTIVE;
3604 if ( (__get_user(l2e.l2, &pl2e->l2) == 0) && (l2e_get_pfn(l2e) == pfn) )
3606 /*
3607 * Check the PRESENT bit to set ACTIVE mode.
3608 * If the PRESENT bit is clear, we may be conflicting with the current
3609 * ACTIVE p.t. (it may be the same p.t. mapped at another virt addr).
3610 * The ptwr_flush call below will restore the PRESENT bit.
3611 */
3612 if ( likely(l2e_get_flags(l2e) & _PAGE_PRESENT) ||
3613 (d->arch.ptwr[PTWR_PT_ACTIVE].l1va &&
3614 (l2_idx == d->arch.ptwr[PTWR_PT_ACTIVE].l2_idx)) )
3615 which = PTWR_PT_ACTIVE;
3618 /*
3619 * Multi-processor guest? Then ensure that the page table is hooked into
3620 * at most one L2, and also ensure that there is only one mapping of the
3621 * page table itself (or there can be conflicting writable mappings from
3622 * other VCPUs).
3623 */
3624 if ( d->vcpu[0]->next_in_list != NULL )
3626 if ( /* Hooked into at most one L2 table (which this VCPU maps)? */
3627 ((page->u.inuse.type_info & PGT_count_mask) !=
3628 (!!(page->u.inuse.type_info & PGT_pinned) +
3629 (which == PTWR_PT_ACTIVE))) ||
3630 /* PTEs are mapped read-only in only one place? */
3631 ((page->count_info & PGC_count_mask) !=
3632 (!!(page->count_info & PGC_allocated) + /* alloc count */
3633 (page->u.inuse.type_info & PGT_count_mask) + /* type count */
3634 1)) ) /* map count */
3636 /* Could be conflicting writable mappings from other VCPUs. */
3637 cleanup_writable_pagetable(d);
3638 goto emulate;
3642 /*
3643 * We only allow one ACTIVE and one INACTIVE p.t. to be updated at a
3644 * time. If there is already one, we must flush it out.
3645 */
3646 if ( d->arch.ptwr[which].l1va )
3647 ptwr_flush(d, which);
3649 /*
3650 * If last batch made no updates then we are probably stuck. Emulate this
3651 * update to ensure we make progress.
3652 */
3653 if ( d->arch.ptwr[which].prev_nr_updates == 0 )
3655 /* Ensure that we don't get stuck in an emulation-only rut. */
3656 d->arch.ptwr[which].prev_nr_updates = 1;
3657 goto emulate;
3660 PTWR_PRINTK("[%c] batched ptwr_page_fault at va %lx, pt for %08lx, "
3661 "pfn %lx\n", PTWR_PRINT_WHICH, addr,
3662 l2_idx << L2_PAGETABLE_SHIFT, pfn);
3664 /* For safety, disconnect the L1 p.t. page from current space. */
3665 if ( which == PTWR_PT_ACTIVE )
3667 l2e_remove_flags(l2e, _PAGE_PRESENT);
3668 if ( unlikely(__copy_to_user(pl2e, &l2e, sizeof(l2e))) )
3670 MEM_LOG("ptwr: Could not unhook l2e at %p", pl2e);
3671 domain_crash(d);
3672 return 0;
3674 flush_tlb_mask(d->domain_dirty_cpumask);
3677 /* Temporarily map the L1 page, and make a copy of it. */
3678 pl1e = map_domain_page(pfn);
3679 memcpy(d->arch.ptwr[which].page, pl1e, PAGE_SIZE);
3680 unmap_domain_page(pl1e);
3682 /* Finally, make the p.t. page writable by the guest OS. */
3683 l1e_add_flags(pte, _PAGE_RW);
3684 if ( unlikely(__put_user(pte.l1,
3685 &linear_pg_table[l1_linear_offset(addr)].l1)) )
3687 MEM_LOG("ptwr: Could not update pte at %p",
3688 &linear_pg_table[l1_linear_offset(addr)]);
3689 domain_crash(d);
3690 return 0;
3693 /*
3694 * Now record the writable pagetable state *after* any accesses that can
3695 * cause a recursive page fault (i.e., those via the *_user() accessors).
3696 * Otherwise we can enter ptwr_flush() with half-done ptwr state.
3697 */
3698 d->arch.ptwr[which].l1va = addr | 1;
3699 d->arch.ptwr[which].l2_idx = l2_idx;
3700 d->arch.ptwr[which].vcpu = current;
3701 #ifdef PERF_ARRAYS
3702 d->arch.ptwr[which].eip = regs->eip;
3703 #endif
3705 return EXCRET_fault_fixed;
3707 emulate:
3708 emul_ctxt.regs = guest_cpu_user_regs();
3709 emul_ctxt.cr2 = addr;
3710 emul_ctxt.mode = X86EMUL_MODE_HOST;
3711 if ( x86_emulate_memop(&emul_ctxt, &ptwr_emulate_ops) )
3712 return 0;
3713 perfc_incrc(ptwr_emulations);
3714 return EXCRET_fault_fixed;
3717 int ptwr_init(struct domain *d)
3719 void *x = alloc_xenheap_page();
3720 void *y = alloc_xenheap_page();
3722 if ( (x == NULL) || (y == NULL) )
3724 free_xenheap_page(x);
3725 free_xenheap_page(y);
3726 return -ENOMEM;
3729 d->arch.ptwr[PTWR_PT_ACTIVE].page = x;
3730 d->arch.ptwr[PTWR_PT_INACTIVE].page = y;
3732 return 0;
3735 void ptwr_destroy(struct domain *d)
3737 LOCK_BIGLOCK(d);
3738 cleanup_writable_pagetable(d);
3739 UNLOCK_BIGLOCK(d);
3740 free_xenheap_page(d->arch.ptwr[PTWR_PT_ACTIVE].page);
3741 free_xenheap_page(d->arch.ptwr[PTWR_PT_INACTIVE].page);
3744 void cleanup_writable_pagetable(struct domain *d)
3746 if ( unlikely(!VM_ASSIST(d, VMASST_TYPE_writable_pagetables)) )
3747 return;
3749 if ( unlikely(shadow_mode_enabled(d)) )
3751 shadow_sync_all(d);
3753 else
3755 if ( d->arch.ptwr[PTWR_PT_ACTIVE].l1va )
3756 ptwr_flush(d, PTWR_PT_ACTIVE);
3757 if ( d->arch.ptwr[PTWR_PT_INACTIVE].l1va )
3758 ptwr_flush(d, PTWR_PT_INACTIVE);
3762 int map_pages_to_xen(
3763 unsigned long virt,
3764 unsigned long mfn,
3765 unsigned long nr_mfns,
3766 unsigned long flags)
3768 l2_pgentry_t *pl2e, ol2e;
3769 l1_pgentry_t *pl1e, ol1e;
3770 unsigned int i;
3772 unsigned int map_small_pages = !!(flags & MAP_SMALL_PAGES);
3773 flags &= ~MAP_SMALL_PAGES;
3775 while ( nr_mfns != 0 )
3777 pl2e = virt_to_xen_l2e(virt);
3779 if ( ((((virt>>PAGE_SHIFT) | mfn) & ((1<<PAGETABLE_ORDER)-1)) == 0) &&
3780 (nr_mfns >= (1<<PAGETABLE_ORDER)) &&
3781 !map_small_pages )
3783 /* Super-page mapping. */
3784 ol2e = *pl2e;
3785 *pl2e = l2e_from_pfn(mfn, flags|_PAGE_PSE);
3787 if ( (l2e_get_flags(ol2e) & _PAGE_PRESENT) )
3789 local_flush_tlb_pge();
3790 if ( !(l2e_get_flags(ol2e) & _PAGE_PSE) )
3791 free_xen_pagetable(l2e_get_page(ol2e));
3794 virt += 1UL << L2_PAGETABLE_SHIFT;
3795 mfn += 1UL << PAGETABLE_ORDER;
3796 nr_mfns -= 1UL << PAGETABLE_ORDER;
3798 else
3800 /* Normal page mapping. */
3801 if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) )
3803 pl1e = page_to_virt(alloc_xen_pagetable());
3804 clear_page(pl1e);
3805 *pl2e = l2e_from_page(virt_to_page(pl1e), __PAGE_HYPERVISOR);
3807 else if ( l2e_get_flags(*pl2e) & _PAGE_PSE )
3809 pl1e = page_to_virt(alloc_xen_pagetable());
3810 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
3811 pl1e[i] = l1e_from_pfn(
3812 l2e_get_pfn(*pl2e) + i,
3813 l2e_get_flags(*pl2e) & ~_PAGE_PSE);
3814 *pl2e = l2e_from_page(virt_to_page(pl1e), __PAGE_HYPERVISOR);
3815 local_flush_tlb_pge();
3818 pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(virt);
3819 ol1e = *pl1e;
3820 *pl1e = l1e_from_pfn(mfn, flags);
3821 if ( (l1e_get_flags(ol1e) & _PAGE_PRESENT) )
3822 local_flush_tlb_one(virt);
3824 virt += 1UL << L1_PAGETABLE_SHIFT;
3825 mfn += 1UL;
3826 nr_mfns -= 1UL;
3830 return 0;
3833 void __set_fixmap(
3834 enum fixed_addresses idx, unsigned long mfn, unsigned long flags)
3836 BUG_ON(idx >= __end_of_fixed_addresses);
3837 map_pages_to_xen(fix_to_virt(idx), mfn, 1, flags);
3840 #ifdef MEMORY_GUARD
3842 void memguard_init(void)
3844 map_pages_to_xen(
3845 PAGE_OFFSET, 0, xenheap_phys_end >> PAGE_SHIFT,
3846 __PAGE_HYPERVISOR|MAP_SMALL_PAGES);
3849 static void __memguard_change_range(void *p, unsigned long l, int guard)
3851 unsigned long _p = (unsigned long)p;
3852 unsigned long _l = (unsigned long)l;
3853 unsigned long flags = __PAGE_HYPERVISOR | MAP_SMALL_PAGES;
3855 /* Ensure we are dealing with a page-aligned whole number of pages. */
3856 ASSERT((_p&PAGE_MASK) != 0);
3857 ASSERT((_l&PAGE_MASK) != 0);
3858 ASSERT((_p&~PAGE_MASK) == 0);
3859 ASSERT((_l&~PAGE_MASK) == 0);
3861 if ( guard )
3862 flags &= ~_PAGE_PRESENT;
3864 map_pages_to_xen(
3865 _p, virt_to_maddr(p) >> PAGE_SHIFT, _l >> PAGE_SHIFT, flags);
3868 void memguard_guard_range(void *p, unsigned long l)
3870 __memguard_change_range(p, l, 1);
3873 void memguard_unguard_range(void *p, unsigned long l)
3875 __memguard_change_range(p, l, 0);
3878 #endif
3880 /*
3881 * Local variables:
3882 * mode: C
3883 * c-set-style: "BSD"
3884 * c-basic-offset: 4
3885 * tab-width: 4
3886 * indent-tabs-mode: nil
3887 * End:
3888 */