
view xen/arch/x86/mm.c @ 10216:0b98fcb98889

Fix PAE debug builds to avoid shadowing shadow-mode pgdirs below 4GB.
The current shadow mode is not aware of this extra level of shadowing
and gets rather confused. Furthermore, it *always* ensures that its
PAE pgdirs are below 4GB so there is no need for the extra shadowing
logic to be invoked.

This fixes a bug introduced in changeset 10177:d5f98d23427a0d256b896fc63ccfd2c1f79e55ba

Signed-off-by: Keir Fraser <keir@xensource.com>
author kaf24@firebug.cl.cam.ac.uk
date Tue May 30 18:47:37 2006 +0100 (2006-05-30)
parents c43080e63545
children 09d9d6e7b985
line source
1 /******************************************************************************
2 * arch/x86/mm.c
3 *
4 * Copyright (c) 2002-2005 K A Fraser
5 * Copyright (c) 2004 Christian Limpach
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
22 /*
23 * A description of the x86 page table API:
24 *
25 * Domains trap to do_mmu_update with a list of update requests.
26 * This is a list of (ptr, val) pairs, where the requested operation
27 * is *ptr = val.
28 *
29 * Reference counting of pages:
30 * ----------------------------
31 * Each page has two refcounts: tot_count and type_count.
32 *
33 * TOT_COUNT is the obvious reference count. It counts all uses of a
34 * physical page frame by a domain, including uses as a page directory,
35 * a page table, or simple mappings via a PTE. This count prevents a
36 * domain from releasing a frame back to the free pool when it still holds
37 * a reference to it.
38 *
39 * TYPE_COUNT is more subtle. A frame can be put to one of three
40 * mutually-exclusive uses: it might be used as a page directory, or a
41 * page table, or it may be mapped writable by the domain [of course, a
42 * frame may not be used in any of these three ways!].
43 * So, type_count is a count of the number of times a frame is being
44 * referred to in its current incarnation. Therefore, a page can only
45 * change its type when its type count is zero.
46 *
47 * Pinning the page type:
48 * ----------------------
49 * The type of a page can be pinned/unpinned with the commands
50 * MMUEXT_[UN]PIN_L?_TABLE. Each page can be pinned exactly once (that is,
51 * pinning is not reference counted, so it can't be nested).
52 * This is useful to prevent a page's type count falling to zero, at which
53 * point safety checks would need to be carried out next time the count
54 * is increased again.
55 *
56 * A further note on writable page mappings:
57 * -----------------------------------------
58 * For simplicity, the count of writable mappings for a page may not
59 * correspond to reality. The 'writable count' is incremented for every
60 * PTE which maps the page with the _PAGE_RW flag set. However, for
61 * write access to be possible the page directory entry must also have
62 * its _PAGE_RW bit set. We do not check this as it complicates the
63 * reference counting considerably [consider the case of multiple
64 * directory entries referencing a single page table, some with the RW
65 * bit set, others not -- it starts getting a bit messy].
66 * In normal use, this simplification shouldn't be a problem.
67 * However, the logic can be added if required.
68 *
69 * One more note on read-only page mappings:
70 * -----------------------------------------
71 * We want domains to be able to map pages for read-only access. The
72 * main reason is that page tables and directories should be readable
73 * by a domain, but it would not be safe for them to be writable.
74 * However, domains have free access to rings 1 & 2 of the Intel
75 * privilege model. In terms of page protection, these are considered
76 * to be part of 'supervisor mode'. The WP bit in CR0 controls whether
77 * read-only restrictions are respected in supervisor mode -- if the
78 * bit is clear then any mapped page is writable.
79 *
80 * We get round this by always setting the WP bit and disallowing
81 * updates to it. This is very unlikely to cause a problem for guest
82 * OS's, which will generally use the WP bit to simplify their copy-on-write
83 * implementation (in that case, the OS wants a fault when it writes to
84 * an application-supplied buffer).
85 */
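/*
 * Illustrative guest-side sketch (not part of this file): batching a single
 * PTE write through the (ptr,val) interface described above. It assumes the
 * public struct mmu_update, MMU_NORMAL_PT_UPDATE and DOMID_SELF from
 * public/xen.h, plus the HYPERVISOR_mmu_update() hypercall wrapper that PV
 * guest ports provide; guest_set_pte() itself is a hypothetical helper.
 */
#if 0 /* example only -- never compiled as part of Xen */
#include <public/xen.h> /* struct mmu_update, MMU_NORMAL_PT_UPDATE, DOMID_SELF */

static int guest_set_pte(uint64_t pte_maddr, uint64_t new_val)
{
    struct mmu_update req;
    int done = 0;

    /* Low bits of .ptr encode the command; the rest is the PTE's machine address. */
    req.ptr = pte_maddr | MMU_NORMAL_PT_UPDATE;
    req.val = new_val;

    /* One (ptr,val) pair; Xen reports how many updates it completed via 'done'. */
    return HYPERVISOR_mmu_update(&req, 1, &done, DOMID_SELF);
}
#endif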
87 #include <xen/config.h>
88 #include <xen/init.h>
89 #include <xen/kernel.h>
90 #include <xen/lib.h>
91 #include <xen/mm.h>
92 #include <xen/sched.h>
93 #include <xen/errno.h>
94 #include <xen/perfc.h>
95 #include <xen/irq.h>
96 #include <xen/softirq.h>
97 #include <xen/domain_page.h>
98 #include <xen/event.h>
99 #include <xen/iocap.h>
100 #include <xen/guest_access.h>
101 #include <asm/shadow.h>
102 #include <asm/page.h>
103 #include <asm/flushtlb.h>
104 #include <asm/io.h>
105 #include <asm/ldt.h>
106 #include <asm/x86_emulate.h>
107 #include <public/memory.h>
109 #ifdef VERBOSE
110 #define MEM_LOG(_f, _a...) \
111 printk("DOM%u: (file=mm.c, line=%d) " _f "\n", \
112 current->domain->domain_id , __LINE__ , ## _a )
113 #else
114 #define MEM_LOG(_f, _a...) ((void)0)
115 #endif
117 /*
118 * Both do_mmuext_op() and do_mmu_update():
119 * We steal the m.s.b. of the @count parameter to indicate whether this
120 * invocation of do_mmu_update() is resuming a previously preempted call.
121 */
122 #define MMU_UPDATE_PREEMPTED (~(~0U>>1))
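/*
 * Illustrative sketch (not part of this file): how the flag above is folded
 * into @count when a continuation is created and stripped again on re-entry.
 * These helpers are hypothetical; the real logic is open-coded in
 * do_mmuext_op() and do_mmu_update() below.
 */
static inline unsigned int pack_resume_count(unsigned int remaining)
{
    /* Mark a partially completed batch so the next invocation knows to resume. */
    return remaining | MMU_UPDATE_PREEMPTED;
}

static inline unsigned int unpack_resume_count(unsigned int count, int *resumed)
{
    /* Recover the true operation count and whether this is a resumed call. */
    *resumed = (count & MMU_UPDATE_PREEMPTED) != 0;
    return count & ~MMU_UPDATE_PREEMPTED;
}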
124 static void free_l2_table(struct page_info *page);
125 static void free_l1_table(struct page_info *page);
127 static int mod_l2_entry(l2_pgentry_t *, l2_pgentry_t, unsigned long,
128 unsigned long type);
129 static int mod_l1_entry(l1_pgentry_t *, l1_pgentry_t);
131 /* Used to defer flushing of memory structures. */
132 static struct {
133 #define DOP_FLUSH_TLB (1<<0) /* Flush the local TLB. */
134 #define DOP_FLUSH_ALL_TLBS (1<<1) /* Flush TLBs of all VCPUs of current dom. */
135 #define DOP_RELOAD_LDT (1<<2) /* Reload the LDT shadow mapping. */
136 unsigned int deferred_ops;
137 /* If non-NULL, specifies a foreign subject domain for some operations. */
138 struct domain *foreign;
139 } __cacheline_aligned percpu_info[NR_CPUS];
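/*
 * Illustrative sketch (not part of this file): how a hypercall handler defers
 * a TLB flush via the per-CPU structure above and relies on it being drained
 * once, in process_deferred_ops(), at the end of do_mmu_update()/do_mmuext_op().
 * The helper name is hypothetical.
 */
static inline void example_defer_local_flush(void)
{
    /* Record that this CPU must flush its TLB before returning to the guest. */
    percpu_info[smp_processor_id()].deferred_ops |= DOP_FLUSH_TLB;
}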
141 /*
142 * Returns the current foreign domain; defaults to the currently-executing
143 * domain if a foreign override hasn't been specified.
144 */
145 #define FOREIGNDOM (percpu_info[smp_processor_id()].foreign ?: current->domain)
147 /* Private domain structs for DOMID_XEN and DOMID_IO. */
148 static struct domain *dom_xen, *dom_io;
150 /* Frame table and its size in pages. */
151 struct page_info *frame_table;
152 unsigned long max_page;
153 unsigned long total_pages;
155 void __init init_frametable(void)
156 {
157 unsigned long nr_pages, page_step, i, mfn;
159 frame_table = (struct page_info *)FRAMETABLE_VIRT_START;
161 nr_pages = PFN_UP(max_page * sizeof(*frame_table));
162 page_step = (1 << L2_PAGETABLE_SHIFT) >> PAGE_SHIFT;
164 for ( i = 0; i < nr_pages; i += page_step )
165 {
166 mfn = alloc_boot_pages(min(nr_pages - i, page_step), page_step);
167 if ( mfn == 0 )
168 panic("Not enough memory for frame table\n");
169 map_pages_to_xen(
170 FRAMETABLE_VIRT_START + (i << PAGE_SHIFT),
171 mfn, page_step, PAGE_HYPERVISOR);
172 }
174 memset(frame_table, 0, nr_pages << PAGE_SHIFT);
175 }
177 void arch_init_memory(void)
178 {
179 extern void subarch_init_memory(void);
181 unsigned long i, pfn, rstart_pfn, rend_pfn;
183 memset(percpu_info, 0, sizeof(percpu_info));
185 /*
186 * Initialise our DOMID_XEN domain.
187 * Any Xen-heap pages that we will allow to be mapped will have
188 * their domain field set to dom_xen.
189 */
190 dom_xen = alloc_domain();
191 spin_lock_init(&dom_xen->page_alloc_lock);
192 atomic_set(&dom_xen->refcnt, 1);
193 dom_xen->domain_id = DOMID_XEN;
195 /*
196 * Initialise our DOMID_IO domain.
197 * This domain owns I/O pages that are within the range of the page_info
198 * array. Mappings occur at the privilege level of the caller.
199 */
200 dom_io = alloc_domain();
201 spin_lock_init(&dom_io->page_alloc_lock);
202 atomic_set(&dom_io->refcnt, 1);
203 dom_io->domain_id = DOMID_IO;
205 /* First 1MB of RAM is historically marked as I/O. */
206 for ( i = 0; i < 0x100; i++ )
207 share_xen_page_with_guest(mfn_to_page(i), dom_io, XENSHARE_writable);
209 /* Any areas not specified as RAM by the e820 map are considered I/O. */
210 for ( i = 0, pfn = 0; i < e820.nr_map; i++ )
211 {
212 if ( e820.map[i].type != E820_RAM )
213 continue;
214 /* Every page from cursor to start of next RAM region is I/O. */
215 rstart_pfn = PFN_UP(e820.map[i].addr);
216 rend_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
217 for ( ; pfn < rstart_pfn; pfn++ )
218 {
219 BUG_ON(!mfn_valid(pfn));
220 share_xen_page_with_guest(
221 mfn_to_page(pfn), dom_io, XENSHARE_writable);
222 }
223 /* Skip the RAM region. */
224 pfn = rend_pfn;
225 }
226 BUG_ON(pfn != max_page);
228 subarch_init_memory();
229 }
231 void share_xen_page_with_guest(
232 struct page_info *page, struct domain *d, int readonly)
233 {
234 if ( page_get_owner(page) == d )
235 return;
237 set_gpfn_from_mfn(page_to_mfn(page), INVALID_M2P_ENTRY);
239 spin_lock(&d->page_alloc_lock);
241 /* The incremented type count pins as writable or read-only. */
242 page->u.inuse.type_info = (readonly ? PGT_none : PGT_writable_page);
243 page->u.inuse.type_info |= PGT_validated | 1;
245 page_set_owner(page, d);
246 wmb(); /* install valid domain ptr before updating refcnt. */
247 ASSERT(page->count_info == 0);
248 page->count_info |= PGC_allocated | 1;
250 if ( unlikely(d->xenheap_pages++ == 0) )
251 get_knownalive_domain(d);
252 list_add_tail(&page->list, &d->xenpage_list);
254 spin_unlock(&d->page_alloc_lock);
255 }
257 void share_xen_page_with_privileged_guests(
258 struct page_info *page, int readonly)
259 {
260 share_xen_page_with_guest(page, dom_xen, readonly);
261 }
263 #if defined(CONFIG_X86_PAE)
265 #ifdef NDEBUG
266 /* Only PDPTs above the 4GB boundary need to be shadowed in low memory. */
267 #define l3tab_needs_shadow(mfn) (mfn >= 0x100000)
268 #else
269 /*
270 * In debug builds we aggressively shadow PDPTs to exercise code paths.
271 * We cannot safely shadow the idle page table, nor shadow-mode page tables
272 * (detected by lack of an owning domain). Always shadow PDPTs above 4GB.
273 */
274 #define l3tab_needs_shadow(mfn) \
275 ((((mfn << PAGE_SHIFT) != __pa(idle_pg_table)) && \
276 (page_get_owner(mfn_to_page(mfn)) != NULL)) || \
277 (mfn >= 0x100000))
278 #endif
280 static l1_pgentry_t *fix_pae_highmem_pl1e;
282 /* Cache the address of PAE high-memory fixmap page tables. */
283 static int __init cache_pae_fixmap_address(void)
284 {
285 unsigned long fixmap_base = fix_to_virt(FIX_PAE_HIGHMEM_0);
286 l2_pgentry_t *pl2e = virt_to_xen_l2e(fixmap_base);
287 fix_pae_highmem_pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(fixmap_base);
288 return 0;
289 }
290 __initcall(cache_pae_fixmap_address);
292 static void __write_ptbase(unsigned long mfn)
293 {
294 l3_pgentry_t *highmem_l3tab, *lowmem_l3tab;
295 struct pae_l3_cache *cache = &current->arch.pae_l3_cache;
296 unsigned int cpu = smp_processor_id();
298 /* Fast path 1: does this mfn need a shadow at all? */
299 if ( !l3tab_needs_shadow(mfn) )
300 {
301 write_cr3(mfn << PAGE_SHIFT);
302 return;
303 }
305 /* Caching logic is not interrupt safe. */
306 ASSERT(!in_irq());
308 /* Fast path 2: is this mfn already cached? */
309 if ( cache->high_mfn == mfn )
310 {
311 write_cr3(__pa(cache->table[cache->inuse_idx]));
312 return;
313 }
315 /* Protects against pae_flush_pgd(). */
316 spin_lock(&cache->lock);
318 cache->inuse_idx ^= 1;
319 cache->high_mfn = mfn;
321 /* Map the guest L3 table and copy to the chosen low-memory cache. */
322 *(fix_pae_highmem_pl1e - cpu) = l1e_from_pfn(mfn, __PAGE_HYPERVISOR);
323 highmem_l3tab = (l3_pgentry_t *)fix_to_virt(FIX_PAE_HIGHMEM_0 + cpu);
324 lowmem_l3tab = cache->table[cache->inuse_idx];
325 memcpy(lowmem_l3tab, highmem_l3tab, sizeof(cache->table[0]));
326 *(fix_pae_highmem_pl1e - cpu) = l1e_empty();
328 /* Install the low-memory L3 table in CR3. */
329 write_cr3(__pa(lowmem_l3tab));
331 spin_unlock(&cache->lock);
332 }
334 #else /* !CONFIG_X86_PAE */
336 static void __write_ptbase(unsigned long mfn)
337 {
338 write_cr3(mfn << PAGE_SHIFT);
339 }
341 #endif /* !CONFIG_X86_PAE */
343 void write_ptbase(struct vcpu *v)
344 {
345 __write_ptbase(pagetable_get_pfn(v->arch.monitor_table));
346 }
348 void invalidate_shadow_ldt(struct vcpu *v)
349 {
350 int i;
351 unsigned long pfn;
352 struct page_info *page;
354 if ( v->arch.shadow_ldt_mapcnt == 0 )
355 return;
357 v->arch.shadow_ldt_mapcnt = 0;
359 for ( i = 16; i < 32; i++ )
360 {
361 pfn = l1e_get_pfn(v->arch.perdomain_ptes[i]);
362 if ( pfn == 0 ) continue;
363 v->arch.perdomain_ptes[i] = l1e_empty();
364 page = mfn_to_page(pfn);
365 ASSERT_PAGE_IS_TYPE(page, PGT_ldt_page);
366 ASSERT_PAGE_IS_DOMAIN(page, v->domain);
367 put_page_and_type(page);
368 }
370 /* Dispose of the (now possibly invalid) mappings from the TLB. */
371 percpu_info[v->processor].deferred_ops |= DOP_FLUSH_TLB | DOP_RELOAD_LDT;
372 }
375 static int alloc_segdesc_page(struct page_info *page)
376 {
377 struct desc_struct *descs;
378 int i;
380 descs = map_domain_page(page_to_mfn(page));
382 for ( i = 0; i < 512; i++ )
383 if ( unlikely(!check_descriptor(&descs[i])) )
384 goto fail;
386 unmap_domain_page(descs);
387 return 1;
389 fail:
390 unmap_domain_page(descs);
391 return 0;
392 }
395 /* Map shadow page at offset @off. */
396 int map_ldt_shadow_page(unsigned int off)
397 {
398 struct vcpu *v = current;
399 struct domain *d = v->domain;
400 unsigned long gmfn, mfn;
401 l1_pgentry_t l1e, nl1e;
402 unsigned long gva = v->arch.guest_context.ldt_base + (off << PAGE_SHIFT);
403 int res;
405 #if defined(__x86_64__)
406 /* If in user mode, switch to kernel mode just to read LDT mapping. */
407 int user_mode = !(v->arch.flags & TF_kernel_mode);
408 #define TOGGLE_MODE() if ( user_mode ) toggle_guest_mode(v)
409 #elif defined(__i386__)
410 #define TOGGLE_MODE() ((void)0)
411 #endif
413 BUG_ON(unlikely(in_irq()));
415 shadow_sync_va(v, gva);
417 TOGGLE_MODE();
418 __copy_from_user(&l1e, &linear_pg_table[l1_linear_offset(gva)],
419 sizeof(l1e));
420 TOGGLE_MODE();
422 if ( unlikely(!(l1e_get_flags(l1e) & _PAGE_PRESENT)) )
423 return 0;
425 gmfn = l1e_get_pfn(l1e);
426 mfn = gmfn_to_mfn(d, gmfn);
427 if ( unlikely(!VALID_MFN(mfn)) )
428 return 0;
430 res = get_page_and_type(mfn_to_page(mfn), d, PGT_ldt_page);
432 if ( !res && unlikely(shadow_mode_refcounts(d)) )
433 {
434 shadow_lock(d);
435 shadow_remove_all_write_access(d, gmfn, mfn);
436 res = get_page_and_type(mfn_to_page(mfn), d, PGT_ldt_page);
437 shadow_unlock(d);
438 }
440 if ( unlikely(!res) )
441 return 0;
443 nl1e = l1e_from_pfn(mfn, l1e_get_flags(l1e) | _PAGE_RW);
445 v->arch.perdomain_ptes[off + 16] = nl1e;
446 v->arch.shadow_ldt_mapcnt++;
448 return 1;
449 }
452 static int get_page_from_pagenr(unsigned long page_nr, struct domain *d)
453 {
454 struct page_info *page = mfn_to_page(page_nr);
456 if ( unlikely(!mfn_valid(page_nr)) || unlikely(!get_page(page, d)) )
457 {
458 MEM_LOG("Could not get page ref for pfn %lx", page_nr);
459 return 0;
460 }
462 return 1;
463 }
466 static int get_page_and_type_from_pagenr(unsigned long page_nr,
467 unsigned long type,
468 struct domain *d)
469 {
470 struct page_info *page = mfn_to_page(page_nr);
472 if ( unlikely(!get_page_from_pagenr(page_nr, d)) )
473 return 0;
475 if ( unlikely(!get_page_type(page, type)) )
476 {
477 put_page(page);
478 return 0;
479 }
481 return 1;
482 }
484 #ifndef CONFIG_X86_PAE /* We do not support guest linear mappings on PAE. */
485 /*
486 * We allow root tables to map each other (a.k.a. linear page tables). It
487 * needs some special care with reference counts and access permissions:
488 * 1. The mapping entry must be read-only, or the guest may get write access
489 * to its own PTEs.
490 * 2. We must only bump the reference counts for an *already validated*
491 * L2 table, or we can end up deadlocking in get_page_type(), waiting on
492 * a validation that in turn cannot complete until this one does.
493 * 3. We only need to increment the reference counts for the mapped page
494 * frame if it is mapped by a different root table. This is sufficient and
495 * also necessary to allow validation of a root table mapping itself.
496 */
497 static int
498 get_linear_pagetable(
499 root_pgentry_t re, unsigned long re_pfn, struct domain *d)
500 {
501 unsigned long x, y;
502 struct page_info *page;
503 unsigned long pfn;
505 ASSERT( !shadow_mode_refcounts(d) );
507 if ( (root_get_flags(re) & _PAGE_RW) )
508 {
509 MEM_LOG("Attempt to create linear p.t. with write perms");
510 return 0;
511 }
513 if ( (pfn = root_get_pfn(re)) != re_pfn )
514 {
515 /* Make sure the mapped frame belongs to the correct domain. */
516 if ( unlikely(!get_page_from_pagenr(pfn, d)) )
517 return 0;
519 /*
520 * Make sure that the mapped frame is an already-validated L2 table.
521 * If so, atomically increment the count (checking for overflow).
522 */
523 page = mfn_to_page(pfn);
524 y = page->u.inuse.type_info;
525 do {
526 x = y;
527 if ( unlikely((x & PGT_count_mask) == PGT_count_mask) ||
528 unlikely((x & (PGT_type_mask|PGT_validated)) !=
529 (PGT_root_page_table|PGT_validated)) )
530 {
531 put_page(page);
532 return 0;
533 }
534 }
535 while ( (y = cmpxchg(&page->u.inuse.type_info, x, x + 1)) != x );
536 }
538 return 1;
539 }
540 #endif /* !CONFIG_X86_PAE */
542 int
543 get_page_from_l1e(
544 l1_pgentry_t l1e, struct domain *d)
545 {
546 unsigned long mfn = l1e_get_pfn(l1e);
547 struct page_info *page = mfn_to_page(mfn);
548 int okay;
550 if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) )
551 return 1;
553 if ( unlikely(l1e_get_flags(l1e) & L1_DISALLOW_MASK) )
554 {
555 MEM_LOG("Bad L1 flags %x", l1e_get_flags(l1e) & L1_DISALLOW_MASK);
556 return 0;
557 }
559 if ( unlikely(!mfn_valid(mfn)) ||
560 unlikely(page_get_owner(page) == dom_io) )
561 {
562 /* DOMID_IO reverts to caller for privilege checks. */
563 if ( d == dom_io )
564 d = current->domain;
566 if ( !iomem_access_permitted(d, mfn, mfn) )
567 {
568 MEM_LOG("Non-privileged attempt to map I/O space %08lx", mfn);
569 return 0;
570 }
572 /* No reference counting for out-of-range I/O pages. */
573 if ( !mfn_valid(mfn) )
574 return 1;
576 d = dom_io;
577 }
579 okay = ((l1e_get_flags(l1e) & _PAGE_RW) ?
580 get_page_and_type(page, d, PGT_writable_page) :
581 get_page(page, d));
582 if ( !okay )
583 {
584 MEM_LOG("Error getting mfn %lx (pfn %lx) from L1 entry %" PRIpte
585 " for dom%d",
586 mfn, get_gpfn_from_mfn(mfn),
587 l1e_get_intpte(l1e), d->domain_id);
588 }
590 return okay;
591 }
594 /* NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. */
595 static int
596 get_page_from_l2e(
597 l2_pgentry_t l2e, unsigned long pfn,
598 struct domain *d, unsigned long vaddr)
599 {
600 int rc;
602 ASSERT(!shadow_mode_refcounts(d));
604 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
605 return 1;
607 if ( unlikely((l2e_get_flags(l2e) & L2_DISALLOW_MASK)) )
608 {
609 MEM_LOG("Bad L2 flags %x", l2e_get_flags(l2e) & L2_DISALLOW_MASK);
610 return 0;
611 }
613 vaddr >>= L2_PAGETABLE_SHIFT;
614 vaddr <<= PGT_va_shift;
615 rc = get_page_and_type_from_pagenr(
616 l2e_get_pfn(l2e), PGT_l1_page_table | vaddr, d);
617 #if CONFIG_PAGING_LEVELS == 2
618 if ( unlikely(!rc) )
619 rc = get_linear_pagetable(l2e, pfn, d);
620 #endif
621 return rc;
622 }
625 #if CONFIG_PAGING_LEVELS >= 3
626 static int
627 get_page_from_l3e(
628 l3_pgentry_t l3e, unsigned long pfn,
629 struct domain *d, unsigned long vaddr)
630 {
631 int rc;
633 ASSERT(!shadow_mode_refcounts(d));
635 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
636 return 1;
638 if ( unlikely((l3e_get_flags(l3e) & L3_DISALLOW_MASK)) )
639 {
640 MEM_LOG("Bad L3 flags %x", l3e_get_flags(l3e) & L3_DISALLOW_MASK);
641 return 0;
642 }
644 vaddr >>= L3_PAGETABLE_SHIFT;
645 vaddr <<= PGT_va_shift;
646 rc = get_page_and_type_from_pagenr(
647 l3e_get_pfn(l3e),
648 PGT_l2_page_table | vaddr, d);
649 return rc;
650 }
651 #endif /* 3 level */
653 #if CONFIG_PAGING_LEVELS >= 4
654 static int
655 get_page_from_l4e(
656 l4_pgentry_t l4e, unsigned long pfn,
657 struct domain *d, unsigned long vaddr)
658 {
659 int rc;
661 ASSERT( !shadow_mode_refcounts(d) );
663 if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
664 return 1;
666 if ( unlikely((l4e_get_flags(l4e) & L4_DISALLOW_MASK)) )
667 {
668 MEM_LOG("Bad L4 flags %x", l4e_get_flags(l4e) & L4_DISALLOW_MASK);
669 return 0;
670 }
672 vaddr >>= L4_PAGETABLE_SHIFT;
673 vaddr <<= PGT_va_shift;
674 rc = get_page_and_type_from_pagenr(
675 l4e_get_pfn(l4e),
676 PGT_l3_page_table | vaddr, d);
678 if ( unlikely(!rc) )
679 rc = get_linear_pagetable(l4e, pfn, d);
681 return rc;
682 }
683 #endif /* 4 level */
686 void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d)
687 {
688 unsigned long pfn = l1e_get_pfn(l1e);
689 struct page_info *page = mfn_to_page(pfn);
690 struct domain *e;
691 struct vcpu *v;
693 if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) || !mfn_valid(pfn) )
694 return;
696 e = page_get_owner(page);
698 /*
699 * Check if this is a mapping that was established via a grant reference.
700 * If it was then we should not be here: we require that such mappings are
701 * explicitly destroyed via the grant-table interface.
702 *
703 * The upshot of this is that the guest can end up with active grants that
704 * it cannot destroy (because it no longer has a PTE to present to the
705 * grant-table interface). This can lead to subtle hard-to-catch bugs,
706 * hence a special grant PTE flag can be enabled to catch the bug early.
707 *
708 * (Note that the undestroyable active grants are not a security hole in
709 * Xen. All active grants can safely be cleaned up when the domain dies.)
710 */
711 if ( (l1e_get_flags(l1e) & _PAGE_GNTTAB) &&
712 !(d->domain_flags & (DOMF_shutdown|DOMF_dying)) )
713 {
714 MEM_LOG("Attempt to implicitly unmap a granted PTE %" PRIpte,
715 l1e_get_intpte(l1e));
716 domain_crash(d);
717 }
719 if ( l1e_get_flags(l1e) & _PAGE_RW )
720 {
721 put_page_and_type(page);
722 }
723 else
724 {
725 /* We expect this is rare so we blow the entire shadow LDT. */
726 if ( unlikely(((page->u.inuse.type_info & PGT_type_mask) ==
727 PGT_ldt_page)) &&
728 unlikely(((page->u.inuse.type_info & PGT_count_mask) != 0)) &&
729 (d == e) )
730 {
731 for_each_vcpu ( d, v )
732 invalidate_shadow_ldt(v);
733 }
734 put_page(page);
735 }
736 }
739 /*
740 * NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'.
741 * Note also that this automatically deals correctly with linear p.t.'s.
742 */
743 static void put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
744 {
745 if ( (l2e_get_flags(l2e) & _PAGE_PRESENT) &&
746 (l2e_get_pfn(l2e) != pfn) )
747 put_page_and_type(mfn_to_page(l2e_get_pfn(l2e)));
748 }
751 #if CONFIG_PAGING_LEVELS >= 3
752 static void put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn)
753 {
754 if ( (l3e_get_flags(l3e) & _PAGE_PRESENT) &&
755 (l3e_get_pfn(l3e) != pfn) )
756 put_page_and_type(mfn_to_page(l3e_get_pfn(l3e)));
757 }
758 #endif
760 #if CONFIG_PAGING_LEVELS >= 4
761 static void put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn)
762 {
763 if ( (l4e_get_flags(l4e) & _PAGE_PRESENT) &&
764 (l4e_get_pfn(l4e) != pfn) )
765 put_page_and_type(mfn_to_page(l4e_get_pfn(l4e)));
766 }
767 #endif
769 static int alloc_l1_table(struct page_info *page)
770 {
771 struct domain *d = page_get_owner(page);
772 unsigned long pfn = page_to_mfn(page);
773 l1_pgentry_t *pl1e;
774 int i;
776 ASSERT(!shadow_mode_refcounts(d));
778 pl1e = map_domain_page(pfn);
780 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
781 if ( is_guest_l1_slot(i) &&
782 unlikely(!get_page_from_l1e(pl1e[i], d)) )
783 goto fail;
785 unmap_domain_page(pl1e);
786 return 1;
788 fail:
789 MEM_LOG("Failure in alloc_l1_table: entry %d", i);
790 while ( i-- > 0 )
791 if ( is_guest_l1_slot(i) )
792 put_page_from_l1e(pl1e[i], d);
794 unmap_domain_page(pl1e);
795 return 0;
796 }
798 #ifdef CONFIG_X86_PAE
799 static int create_pae_xen_mappings(l3_pgentry_t *pl3e)
800 {
801 struct page_info *page;
802 l2_pgentry_t *pl2e;
803 l3_pgentry_t l3e3;
804 int i;
806 pl3e = (l3_pgentry_t *)((unsigned long)pl3e & PAGE_MASK);
808 /* 3rd L3 slot contains L2 with Xen-private mappings. It *must* exist. */
809 l3e3 = pl3e[3];
810 if ( !(l3e_get_flags(l3e3) & _PAGE_PRESENT) )
811 {
812 MEM_LOG("PAE L3 3rd slot is empty");
813 return 0;
814 }
816 /*
817 * The Xen-private mappings include linear mappings. The L2 thus cannot
818 * be shared by multiple L3 tables. The test here is adequate because:
819 * 1. Cannot appear in slots != 3 because the page would then have an
820 * unknown va backpointer, which get_page_type() explicitly disallows.
821 * 2. Cannot appear in another page table's L3:
822 * a. alloc_l3_table() calls this function and this check will fail
823 * b. mod_l3_entry() disallows updates to slot 3 in an existing table
824 */
825 page = l3e_get_page(l3e3);
826 BUG_ON(page->u.inuse.type_info & PGT_pinned);
827 BUG_ON((page->u.inuse.type_info & PGT_count_mask) == 0);
828 if ( (page->u.inuse.type_info & PGT_count_mask) != 1 )
829 {
830 MEM_LOG("PAE L3 3rd slot is shared");
831 return 0;
832 }
834 /* Xen private mappings. */
835 pl2e = map_domain_page(l3e_get_pfn(l3e3));
836 memcpy(&pl2e[L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1)],
837 &idle_pg_table_l2[L2_PAGETABLE_FIRST_XEN_SLOT],
838 L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
839 for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
840 pl2e[l2_table_offset(PERDOMAIN_VIRT_START) + i] =
841 l2e_from_page(
842 virt_to_page(page_get_owner(page)->arch.mm_perdomain_pt) + i,
843 __PAGE_HYPERVISOR);
844 for ( i = 0; i < (LINEARPT_MBYTES >> (L2_PAGETABLE_SHIFT - 20)); i++ )
845 pl2e[l2_table_offset(LINEAR_PT_VIRT_START) + i] =
846 (l3e_get_flags(pl3e[i]) & _PAGE_PRESENT) ?
847 l2e_from_pfn(l3e_get_pfn(pl3e[i]), __PAGE_HYPERVISOR) :
848 l2e_empty();
849 unmap_domain_page(pl2e);
851 return 1;
852 }
854 /* Flush a pgdir update into low-memory caches. */
855 static void pae_flush_pgd(
856 unsigned long mfn, unsigned int idx, l3_pgentry_t nl3e)
857 {
858 struct domain *d = page_get_owner(mfn_to_page(mfn));
859 struct vcpu *v;
860 intpte_t _ol3e, _nl3e, _pl3e;
861 l3_pgentry_t *l3tab_ptr;
862 struct pae_l3_cache *cache;
864 /* If below 4GB then the pgdir is not shadowed in low memory. */
865 if ( !l3tab_needs_shadow(mfn) )
866 return;
868 for_each_vcpu ( d, v )
869 {
870 cache = &v->arch.pae_l3_cache;
872 spin_lock(&cache->lock);
874 if ( cache->high_mfn == mfn )
875 {
876 l3tab_ptr = &cache->table[cache->inuse_idx][idx];
877 _ol3e = l3e_get_intpte(*l3tab_ptr);
878 _nl3e = l3e_get_intpte(nl3e);
879 _pl3e = cmpxchg((intpte_t *)l3tab_ptr, _ol3e, _nl3e);
880 BUG_ON(_pl3e != _ol3e);
881 }
883 spin_unlock(&cache->lock);
884 }
886 flush_tlb_mask(d->domain_dirty_cpumask);
887 }
889 static inline int l1_backptr(
890 unsigned long *backptr, unsigned long offset_in_l2, unsigned long l2_type)
891 {
892 unsigned long l2_backptr = l2_type & PGT_va_mask;
893 ASSERT(l2_backptr != PGT_va_unknown);
894 ASSERT(l2_backptr != PGT_va_mutable);
895 *backptr =
896 ((l2_backptr >> PGT_va_shift) << L3_PAGETABLE_SHIFT) |
897 (offset_in_l2 << L2_PAGETABLE_SHIFT);
898 return 1;
899 }
901 #elif CONFIG_X86_64
902 # define create_pae_xen_mappings(pl3e) (1)
903 # define pae_flush_pgd(mfn, idx, nl3e) ((void)0)
905 static inline int l1_backptr(
906 unsigned long *backptr, unsigned long offset_in_l2, unsigned long l2_type)
907 {
908 unsigned long l2_backptr = l2_type & PGT_va_mask;
909 ASSERT(l2_backptr != PGT_va_unknown);
910 ASSERT(l2_backptr != PGT_va_mutable);
911 *backptr = ((l2_backptr >> PGT_va_shift) << L3_PAGETABLE_SHIFT) |
912 (offset_in_l2 << L2_PAGETABLE_SHIFT);
913 return 1;
914 }
916 static inline int l2_backptr(
917 unsigned long *backptr, unsigned long offset_in_l3, unsigned long l3_type)
918 {
919 unsigned long l3_backptr = l3_type & PGT_va_mask;
920 ASSERT(l3_backptr != PGT_va_unknown);
921 ASSERT(l3_backptr != PGT_va_mutable);
922 *backptr = ((l3_backptr >> PGT_va_shift) << L4_PAGETABLE_SHIFT) |
923 (offset_in_l3 << L3_PAGETABLE_SHIFT);
924 return 1;
925 }
927 static inline int l3_backptr(
928 unsigned long *backptr, unsigned long offset_in_l4, unsigned long l4_type)
929 {
930 *backptr = (offset_in_l4 << L4_PAGETABLE_SHIFT);
931 return 1;
932 }
933 #else
934 # define create_pae_xen_mappings(pl3e) (1)
935 # define l1_backptr(bp,l2o,l2t) \
936 ({ *(bp) = (unsigned long)(l2o) << L2_PAGETABLE_SHIFT; 1; })
937 #endif
939 static int alloc_l2_table(struct page_info *page, unsigned long type)
940 {
941 struct domain *d = page_get_owner(page);
942 unsigned long pfn = page_to_mfn(page);
943 unsigned long vaddr;
944 l2_pgentry_t *pl2e;
945 int i;
947 /* See the code in shadow_promote() to understand why this is here. */
948 if ( (PGT_base_page_table == PGT_l2_page_table) &&
949 unlikely(shadow_mode_refcounts(d)) )
950 return 1;
951 ASSERT(!shadow_mode_refcounts(d));
953 pl2e = map_domain_page(pfn);
955 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
956 {
957 if ( !l1_backptr(&vaddr, i, type) )
958 goto fail;
959 if ( is_guest_l2_slot(type, i) &&
960 unlikely(!get_page_from_l2e(pl2e[i], pfn, d, vaddr)) )
961 goto fail;
962 }
964 #if CONFIG_PAGING_LEVELS == 2
965 /* Xen private mappings. */
966 memcpy(&pl2e[L2_PAGETABLE_FIRST_XEN_SLOT],
967 &idle_pg_table[L2_PAGETABLE_FIRST_XEN_SLOT],
968 L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
969 pl2e[l2_table_offset(LINEAR_PT_VIRT_START)] =
970 l2e_from_pfn(pfn, __PAGE_HYPERVISOR);
971 for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
972 pl2e[l2_table_offset(PERDOMAIN_VIRT_START) + i] =
973 l2e_from_page(
974 virt_to_page(page_get_owner(page)->arch.mm_perdomain_pt) + i,
975 __PAGE_HYPERVISOR);
976 #endif
978 unmap_domain_page(pl2e);
979 return 1;
981 fail:
982 MEM_LOG("Failure in alloc_l2_table: entry %d", i);
983 while ( i-- > 0 )
984 if ( is_guest_l2_slot(type, i) )
985 put_page_from_l2e(pl2e[i], pfn);
987 unmap_domain_page(pl2e);
988 return 0;
989 }
992 #if CONFIG_PAGING_LEVELS >= 3
993 static int alloc_l3_table(struct page_info *page, unsigned long type)
994 {
995 struct domain *d = page_get_owner(page);
996 unsigned long pfn = page_to_mfn(page);
997 unsigned long vaddr;
998 l3_pgentry_t *pl3e;
999 int i;
1001 ASSERT(!shadow_mode_refcounts(d));
1003 pl3e = map_domain_page(pfn);
1004 for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
1006 #if CONFIG_PAGING_LEVELS >= 4
1007 if ( !l2_backptr(&vaddr, i, type) )
1008 goto fail;
1009 #else
1010 vaddr = (unsigned long)i << L3_PAGETABLE_SHIFT;
1011 #endif
1012 if ( is_guest_l3_slot(i) &&
1013 unlikely(!get_page_from_l3e(pl3e[i], pfn, d, vaddr)) )
1014 goto fail;
1017 if ( !create_pae_xen_mappings(pl3e) )
1018 goto fail;
1020 unmap_domain_page(pl3e);
1021 return 1;
1023 fail:
1024 MEM_LOG("Failure in alloc_l3_table: entry %d", i);
1025 while ( i-- > 0 )
1026 if ( is_guest_l3_slot(i) )
1027 put_page_from_l3e(pl3e[i], pfn);
1029 unmap_domain_page(pl3e);
1030 return 0;
1032 #else
1033 #define alloc_l3_table(page, type) (0)
1034 #endif
1036 #if CONFIG_PAGING_LEVELS >= 4
1037 static int alloc_l4_table(struct page_info *page, unsigned long type)
1039 struct domain *d = page_get_owner(page);
1040 unsigned long pfn = page_to_mfn(page);
1041 l4_pgentry_t *pl4e = page_to_virt(page);
1042 unsigned long vaddr;
1043 int i;
1045 /* See the code in shadow_promote() to understand why this is here. */
1046 if ( (PGT_base_page_table == PGT_l4_page_table) &&
1047 shadow_mode_refcounts(d) )
1048 return 1;
1049 ASSERT(!shadow_mode_refcounts(d));
1051 for ( i = 0; i < L4_PAGETABLE_ENTRIES; i++ )
1053 if ( !l3_backptr(&vaddr, i, type) )
1054 goto fail;
1056 if ( is_guest_l4_slot(i) &&
1057 unlikely(!get_page_from_l4e(pl4e[i], pfn, d, vaddr)) )
1058 goto fail;
1061 /* Xen private mappings. */
1062 memcpy(&pl4e[ROOT_PAGETABLE_FIRST_XEN_SLOT],
1063 &idle_pg_table[ROOT_PAGETABLE_FIRST_XEN_SLOT],
1064 ROOT_PAGETABLE_XEN_SLOTS * sizeof(l4_pgentry_t));
1065 pl4e[l4_table_offset(LINEAR_PT_VIRT_START)] =
1066 l4e_from_pfn(pfn, __PAGE_HYPERVISOR);
1067 pl4e[l4_table_offset(PERDOMAIN_VIRT_START)] =
1068 l4e_from_page(
1069 virt_to_page(page_get_owner(page)->arch.mm_perdomain_l3),
1070 __PAGE_HYPERVISOR);
1072 return 1;
1074 fail:
1075 MEM_LOG("Failure in alloc_l4_table: entry %d", i);
1076 while ( i-- > 0 )
1077 if ( is_guest_l4_slot(i) )
1078 put_page_from_l4e(pl4e[i], pfn);
1080 return 0;
1082 #else
1083 #define alloc_l4_table(page, type) (0)
1084 #endif
1087 static void free_l1_table(struct page_info *page)
1089 struct domain *d = page_get_owner(page);
1090 unsigned long pfn = page_to_mfn(page);
1091 l1_pgentry_t *pl1e;
1092 int i;
1094 pl1e = map_domain_page(pfn);
1096 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
1097 if ( is_guest_l1_slot(i) )
1098 put_page_from_l1e(pl1e[i], d);
1100 unmap_domain_page(pl1e);
1104 static void free_l2_table(struct page_info *page)
1106 unsigned long pfn = page_to_mfn(page);
1107 l2_pgentry_t *pl2e;
1108 int i;
1110 pl2e = map_domain_page(pfn);
1112 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
1113 if ( is_guest_l2_slot(page->u.inuse.type_info, i) )
1114 put_page_from_l2e(pl2e[i], pfn);
1116 unmap_domain_page(pl2e);
1120 #if CONFIG_PAGING_LEVELS >= 3
1122 static void free_l3_table(struct page_info *page)
1124 unsigned long pfn = page_to_mfn(page);
1125 l3_pgentry_t *pl3e;
1126 int i;
1128 pl3e = map_domain_page(pfn);
1130 for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
1131 if ( is_guest_l3_slot(i) )
1132 put_page_from_l3e(pl3e[i], pfn);
1134 unmap_domain_page(pl3e);
1137 #endif
1139 #if CONFIG_PAGING_LEVELS >= 4
1141 static void free_l4_table(struct page_info *page)
1143 unsigned long pfn = page_to_mfn(page);
1144 l4_pgentry_t *pl4e = page_to_virt(page);
1145 int i;
1147 for ( i = 0; i < L4_PAGETABLE_ENTRIES; i++ )
1148 if ( is_guest_l4_slot(i) )
1149 put_page_from_l4e(pl4e[i], pfn);
1152 #endif
1154 static inline int update_l1e(l1_pgentry_t *pl1e,
1155 l1_pgentry_t ol1e,
1156 l1_pgentry_t nl1e)
1158 intpte_t o = l1e_get_intpte(ol1e);
1159 intpte_t n = l1e_get_intpte(nl1e);
1161 if ( unlikely(cmpxchg_user(pl1e, o, n) != 0) ||
1162 unlikely(o != l1e_get_intpte(ol1e)) )
1164 MEM_LOG("Failed to update %" PRIpte " -> %" PRIpte
1165 ": saw %" PRIpte,
1166 l1e_get_intpte(ol1e),
1167 l1e_get_intpte(nl1e),
1168 o);
1169 return 0;
1171 return 1;
1175 /* Update the L1 entry at pl1e to new value nl1e. */
1176 static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e)
1178 l1_pgentry_t ol1e;
1179 struct domain *d = current->domain;
1181 if ( unlikely(__copy_from_user(&ol1e, pl1e, sizeof(ol1e)) != 0) )
1182 return 0;
1184 if ( unlikely(shadow_mode_refcounts(d)) )
1185 return update_l1e(pl1e, ol1e, nl1e);
1187 if ( l1e_get_flags(nl1e) & _PAGE_PRESENT )
1189 if ( unlikely(l1e_get_flags(nl1e) & L1_DISALLOW_MASK) )
1191 MEM_LOG("Bad L1 flags %x",
1192 l1e_get_flags(nl1e) & L1_DISALLOW_MASK);
1193 return 0;
1196 /* Fast path for identical mapping, r/w and presence. */
1197 if ( !l1e_has_changed(ol1e, nl1e, _PAGE_RW | _PAGE_PRESENT))
1198 return update_l1e(pl1e, ol1e, nl1e);
1200 if ( unlikely(!get_page_from_l1e(nl1e, FOREIGNDOM)) )
1201 return 0;
1203 if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
1205 put_page_from_l1e(nl1e, d);
1206 return 0;
1209 else
1211 if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
1212 return 0;
1215 put_page_from_l1e(ol1e, d);
1216 return 1;
1219 #define UPDATE_ENTRY(_t,_p,_o,_n) ({ \
1220 intpte_t __o = cmpxchg((intpte_t *)(_p), \
1221 _t ## e_get_intpte(_o), \
1222 _t ## e_get_intpte(_n)); \
1223 if ( __o != _t ## e_get_intpte(_o) ) \
1224 MEM_LOG("Failed to update %" PRIpte " -> %" PRIpte \
1225 ": saw %" PRIpte "", \
1226 (_t ## e_get_intpte(_o)), \
1227 (_t ## e_get_intpte(_n)), \
1228 (__o)); \
1229 (__o == _t ## e_get_intpte(_o)); })
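/*
 * Illustrative sketch (not part of this file): the lock-free pattern that
 * UPDATE_ENTRY() expands to, written out long-hand for the L2 case.
 * example_update_l2e() is a hypothetical helper, not used below.
 */
static inline int example_update_l2e(l2_pgentry_t *pl2e,
                                     l2_pgentry_t ol2e, l2_pgentry_t nl2e)
{
    /* Install the new entry only if the slot still holds the value we read. */
    intpte_t seen = cmpxchg((intpte_t *)pl2e,
                            l2e_get_intpte(ol2e), l2e_get_intpte(nl2e));

    /* A mismatch means another CPU changed the entry under us; the caller
     * (cf. mod_l2_entry() below) treats that as failure and bails out. */
    return seen == l2e_get_intpte(ol2e);
}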
1231 /* Update the L2 entry at pl2e to new value nl2e. pl2e is within frame pfn. */
1232 static int mod_l2_entry(l2_pgentry_t *pl2e,
1233 l2_pgentry_t nl2e,
1234 unsigned long pfn,
1235 unsigned long type)
1237 l2_pgentry_t ol2e;
1238 unsigned long vaddr = 0;
1240 if ( unlikely(!is_guest_l2_slot(type,pgentry_ptr_to_slot(pl2e))) )
1242 MEM_LOG("Illegal L2 update attempt in Xen-private area %p", pl2e);
1243 return 0;
1246 if ( unlikely(__copy_from_user(&ol2e, pl2e, sizeof(ol2e)) != 0) )
1247 return 0;
1249 if ( l2e_get_flags(nl2e) & _PAGE_PRESENT )
1251 if ( unlikely(l2e_get_flags(nl2e) & L2_DISALLOW_MASK) )
1253 MEM_LOG("Bad L2 flags %x",
1254 l2e_get_flags(nl2e) & L2_DISALLOW_MASK);
1255 return 0;
1258 /* Fast path for identical mapping and presence. */
1259 if ( !l2e_has_changed(ol2e, nl2e, _PAGE_PRESENT))
1260 return UPDATE_ENTRY(l2, pl2e, ol2e, nl2e);
1262 if ( unlikely(!l1_backptr(&vaddr, pgentry_ptr_to_slot(pl2e), type)) ||
1263 unlikely(!get_page_from_l2e(nl2e, pfn, current->domain, vaddr)) )
1264 return 0;
1266 if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e)) )
1268 put_page_from_l2e(nl2e, pfn);
1269 return 0;
1272 else if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e)) )
1274 return 0;
1277 put_page_from_l2e(ol2e, pfn);
1278 return 1;
1282 #if CONFIG_PAGING_LEVELS >= 3
1284 /* Update the L3 entry at pl3e to new value nl3e. pl3e is within frame pfn. */
1285 static int mod_l3_entry(l3_pgentry_t *pl3e,
1286 l3_pgentry_t nl3e,
1287 unsigned long pfn,
1288 unsigned long type)
1290 l3_pgentry_t ol3e;
1291 unsigned long vaddr;
1292 int okay;
1294 if ( unlikely(!is_guest_l3_slot(pgentry_ptr_to_slot(pl3e))) )
1296 MEM_LOG("Illegal L3 update attempt in Xen-private area %p", pl3e);
1297 return 0;
1300 #ifdef CONFIG_X86_PAE
1301 /*
1302 * Disallow updates to final L3 slot. It contains Xen mappings, and it
1303 * would be a pain to ensure they remain continuously valid throughout.
1304 */
1305 if ( pgentry_ptr_to_slot(pl3e) >= 3 )
1306 return 0;
1307 #endif
1309 if ( unlikely(__copy_from_user(&ol3e, pl3e, sizeof(ol3e)) != 0) )
1310 return 0;
1312 if ( l3e_get_flags(nl3e) & _PAGE_PRESENT )
1314 if ( unlikely(l3e_get_flags(nl3e) & L3_DISALLOW_MASK) )
1316 MEM_LOG("Bad L3 flags %x",
1317 l3e_get_flags(nl3e) & L3_DISALLOW_MASK);
1318 return 0;
1321 /* Fast path for identical mapping and presence. */
1322 if (!l3e_has_changed(ol3e, nl3e, _PAGE_PRESENT))
1323 return UPDATE_ENTRY(l3, pl3e, ol3e, nl3e);
1325 #if CONFIG_PAGING_LEVELS >= 4
1326 if ( unlikely(!l2_backptr(&vaddr, pgentry_ptr_to_slot(pl3e), type)) ||
1327 unlikely(!get_page_from_l3e(nl3e, pfn, current->domain, vaddr)) )
1328 return 0;
1329 #else
1330 vaddr = (((unsigned long)pl3e & ~PAGE_MASK) / sizeof(l3_pgentry_t))
1331 << L3_PAGETABLE_SHIFT;
1332 if ( unlikely(!get_page_from_l3e(nl3e, pfn, current->domain, vaddr)) )
1333 return 0;
1334 #endif
1336 if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e)) )
1338 put_page_from_l3e(nl3e, pfn);
1339 return 0;
1342 else if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e)) )
1344 return 0;
1347 okay = create_pae_xen_mappings(pl3e);
1348 BUG_ON(!okay);
1350 pae_flush_pgd(pfn, pgentry_ptr_to_slot(pl3e), nl3e);
1352 put_page_from_l3e(ol3e, pfn);
1353 return 1;
1356 #endif
1358 #if CONFIG_PAGING_LEVELS >= 4
1360 /* Update the L4 entry at pl4e to new value nl4e. pl4e is within frame pfn. */
1361 static int mod_l4_entry(l4_pgentry_t *pl4e,
1362 l4_pgentry_t nl4e,
1363 unsigned long pfn,
1364 unsigned long type)
1366 l4_pgentry_t ol4e;
1367 unsigned long vaddr;
1369 if ( unlikely(!is_guest_l4_slot(pgentry_ptr_to_slot(pl4e))) )
1371 MEM_LOG("Illegal L4 update attempt in Xen-private area %p", pl4e);
1372 return 0;
1375 if ( unlikely(__copy_from_user(&ol4e, pl4e, sizeof(ol4e)) != 0) )
1376 return 0;
1378 if ( l4e_get_flags(nl4e) & _PAGE_PRESENT )
1380 if ( unlikely(l4e_get_flags(nl4e) & L4_DISALLOW_MASK) )
1382 MEM_LOG("Bad L4 flags %x",
1383 l4e_get_flags(nl4e) & L4_DISALLOW_MASK);
1384 return 0;
1387 /* Fast path for identical mapping and presence. */
1388 if (!l4e_has_changed(ol4e, nl4e, _PAGE_PRESENT))
1389 return UPDATE_ENTRY(l4, pl4e, ol4e, nl4e);
1391 if ( unlikely(!l3_backptr(&vaddr, pgentry_ptr_to_slot(pl4e), type)) ||
1392 unlikely(!get_page_from_l4e(nl4e, pfn, current->domain, vaddr)) )
1393 return 0;
1395 if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e)) )
1397 put_page_from_l4e(nl4e, pfn);
1398 return 0;
1401 else if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e)) )
1403 return 0;
1406 put_page_from_l4e(ol4e, pfn);
1407 return 1;
1410 #endif
1412 int alloc_page_type(struct page_info *page, unsigned long type)
1414 struct domain *owner = page_get_owner(page);
1416 if ( owner != NULL )
1417 mark_dirty(owner, page_to_mfn(page));
1419 switch ( type & PGT_type_mask )
1421 case PGT_l1_page_table:
1422 return alloc_l1_table(page);
1423 case PGT_l2_page_table:
1424 return alloc_l2_table(page, type);
1425 case PGT_l3_page_table:
1426 return alloc_l3_table(page, type);
1427 case PGT_l4_page_table:
1428 return alloc_l4_table(page, type);
1429 case PGT_gdt_page:
1430 case PGT_ldt_page:
1431 return alloc_segdesc_page(page);
1432 default:
1433 printk("Bad type in alloc_page_type %lx t=%" PRtype_info " c=%x\n",
1434 type, page->u.inuse.type_info,
1435 page->count_info);
1436 BUG();
1439 return 0;
1443 void free_page_type(struct page_info *page, unsigned long type)
1445 struct domain *owner = page_get_owner(page);
1446 unsigned long gmfn;
1448 if ( likely(owner != NULL) )
1450 /*
1451 * We have to flush before the next use of the linear mapping
1452 * (e.g., update_va_mapping()) or we could end up modifying a page
1453 * that is no longer a page table (and hence screw up ref counts).
1454 */
1455 percpu_info[smp_processor_id()].deferred_ops |= DOP_FLUSH_ALL_TLBS;
1457 if ( unlikely(shadow_mode_enabled(owner)) )
1459 /* Raw page tables are rewritten during save/restore. */
1460 if ( !shadow_mode_translate(owner) )
1461 mark_dirty(owner, page_to_mfn(page));
1463 if ( shadow_mode_refcounts(owner) )
1464 return;
1466 gmfn = mfn_to_gmfn(owner, page_to_mfn(page));
1467 ASSERT(VALID_M2P(gmfn));
1468 remove_shadow(owner, gmfn, type & PGT_type_mask);
1472 switch ( type & PGT_type_mask )
1474 case PGT_l1_page_table:
1475 free_l1_table(page);
1476 break;
1478 case PGT_l2_page_table:
1479 free_l2_table(page);
1480 break;
1482 #if CONFIG_PAGING_LEVELS >= 3
1483 case PGT_l3_page_table:
1484 free_l3_table(page);
1485 break;
1486 #endif
1488 #if CONFIG_PAGING_LEVELS >= 4
1489 case PGT_l4_page_table:
1490 free_l4_table(page);
1491 break;
1492 #endif
1494 default:
1495 printk("%s: type %lx pfn %lx\n",__FUNCTION__,
1496 type, page_to_mfn(page));
1497 BUG();
1502 void put_page_type(struct page_info *page)
1504 unsigned long nx, x, y = page->u.inuse.type_info;
1506 again:
1507 do {
1508 x = y;
1509 nx = x - 1;
1511 ASSERT((x & PGT_count_mask) != 0);
1513 /*
1514 * The page should always be validated while a reference is held. The
1515 * exception is during domain destruction, when we forcibly invalidate
1516 * page-table pages if we detect a referential loop.
1517 * See domain.c:relinquish_list().
1518 */
1519 ASSERT((x & PGT_validated) ||
1520 test_bit(_DOMF_dying, &page_get_owner(page)->domain_flags));
1522 if ( unlikely((nx & PGT_count_mask) == 0) )
1524 /* Record TLB information for flush later. Races are harmless. */
1525 page->tlbflush_timestamp = tlbflush_current_time();
1527 if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
1528 likely(nx & PGT_validated) )
1530 /*
1531 * Page-table pages must be unvalidated when count is zero. The
1532 * 'free' is safe because the refcnt is non-zero and validated
1533 * bit is clear => other ops will spin or fail.
1534 */
1535 if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x,
1536 x & ~PGT_validated)) != x) )
1537 goto again;
1538 /* We cleared the 'valid bit' so we do the clean up. */
1539 free_page_type(page, x);
1540 /* Carry on, but with the 'valid bit' now clear. */
1541 x &= ~PGT_validated;
1542 nx &= ~PGT_validated;
1545 else if ( unlikely((nx & (PGT_pinned|PGT_type_mask|PGT_count_mask)) ==
1546 (PGT_pinned|PGT_l1_page_table|1)) )
1548 /* Page is now only pinned. Make the back pointer mutable again. */
1549 nx |= PGT_va_mutable;
1552 while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
1556 int get_page_type(struct page_info *page, unsigned long type)
1558 unsigned long nx, x, y = page->u.inuse.type_info;
1560 again:
1561 do {
1562 x = y;
1563 nx = x + 1;
1564 if ( unlikely((nx & PGT_count_mask) == 0) )
1566 MEM_LOG("Type count overflow on pfn %lx", page_to_mfn(page));
1567 return 0;
1569 else if ( unlikely((x & PGT_count_mask) == 0) )
1571 if ( (x & (PGT_type_mask|PGT_va_mask)) != type )
1573 if ( (x & PGT_type_mask) != (type & PGT_type_mask) )
1575 /*
1576 * On type change we check to flush stale TLB
1577 * entries. This may be unnecessary (e.g., page
1578 * was GDT/LDT) but those circumstances should be
1579 * very rare.
1580 */
1581 cpumask_t mask =
1582 page_get_owner(page)->domain_dirty_cpumask;
1583 tlbflush_filter(mask, page->tlbflush_timestamp);
1585 if ( unlikely(!cpus_empty(mask)) )
1587 perfc_incrc(need_flush_tlb_flush);
1588 flush_tlb_mask(mask);
1592 /* We lose existing type, back pointer, and validity. */
1593 nx &= ~(PGT_type_mask | PGT_va_mask | PGT_validated);
1594 nx |= type;
1596 /* No special validation needed for writable pages. */
1597 /* Page tables and GDT/LDT need to be scanned for validity. */
1598 if ( type == PGT_writable_page )
1599 nx |= PGT_validated;
1602 else
1604 if ( unlikely((x & (PGT_type_mask|PGT_va_mask)) != type) )
1606 if ( unlikely((x & PGT_type_mask) != (type & PGT_type_mask) ) )
1608 if ( (current->domain == page_get_owner(page)) &&
1609 ((x & PGT_type_mask) == PGT_writable_page) )
1611 /*
1612 * This ensures functions like set_gdt() see up-to-date
1613 * type info without needing to clean up writable p.t.
1614 * state on the fast path. We take this path only
1615 * when the current type is writable because:
1616 * 1. It's the only type that this path can decrement.
1617 * 2. If we take this path more liberally then we can
1618 * enter a recursive loop via get_page_from_l1e()
1619 * during pagetable revalidation.
1620 */
1621 LOCK_BIGLOCK(current->domain);
1622 cleanup_writable_pagetable(current->domain);
1623 y = page->u.inuse.type_info;
1624 UNLOCK_BIGLOCK(current->domain);
1625 /* Can we make progress now? */
1626 if ( ((y & PGT_type_mask) == (type & PGT_type_mask)) ||
1627 ((y & PGT_count_mask) == 0) )
1628 goto again;
1630 if ( ((x & PGT_type_mask) != PGT_l2_page_table) ||
1631 ((type & PGT_type_mask) != PGT_l1_page_table) )
1632 MEM_LOG("Bad type (saw %" PRtype_info
1633 " != exp %" PRtype_info ") "
1634 "for mfn %lx (pfn %lx)",
1635 x, type, page_to_mfn(page),
1636 get_gpfn_from_mfn(page_to_mfn(page)));
1637 return 0;
1639 else if ( (x & PGT_va_mask) == PGT_va_mutable )
1641 /* The va backpointer is mutable, hence we update it. */
1642 nx &= ~PGT_va_mask;
1643 nx |= type; /* we know the actual type is correct */
1645 else if ( (type & PGT_va_mask) != PGT_va_mutable )
1647 ASSERT((type & PGT_va_mask) != (x & PGT_va_mask));
1648 #ifdef CONFIG_X86_PAE
1649 /* We use backptr as extra typing. Cannot be unknown. */
1650 if ( (type & PGT_type_mask) == PGT_l2_page_table )
1651 return 0;
1652 #endif
1653 /* Fixme: add code to propagate va_unknown to subtables. */
1654 if ( ((type & PGT_type_mask) >= PGT_l2_page_table) &&
1655 !shadow_mode_refcounts(page_get_owner(page)) )
1656 return 0;
1657 /* This table is possibly mapped at multiple locations. */
1658 nx &= ~PGT_va_mask;
1659 nx |= PGT_va_unknown;
1662 if ( unlikely(!(x & PGT_validated)) )
1664 /* Someone else is updating validation of this page. Wait... */
1665 while ( (y = page->u.inuse.type_info) == x )
1666 cpu_relax();
1667 goto again;
1671 while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
1673 if ( unlikely(!(nx & PGT_validated)) )
1675 /* Try to validate page type; drop the new reference on failure. */
1676 if ( unlikely(!alloc_page_type(page, type)) )
1678 MEM_LOG("Error while validating mfn %lx (pfn %lx) for type %"
1679 PRtype_info ": caf=%08x taf=%" PRtype_info,
1680 page_to_mfn(page), get_gpfn_from_mfn(page_to_mfn(page)),
1681 type, page->count_info, page->u.inuse.type_info);
1682 /* No one else can get a reference. We hold the only ref. */
1683 page->u.inuse.type_info = 0;
1684 return 0;
1687 /* No one else is updating simultaneously. */
1688 __set_bit(_PGT_validated, &page->u.inuse.type_info);
1691 return 1;
1695 int new_guest_cr3(unsigned long mfn)
1697 struct vcpu *v = current;
1698 struct domain *d = v->domain;
1699 int okay;
1700 unsigned long old_base_mfn;
1702 ASSERT(writable_pagetable_in_sync(d));
1704 if ( shadow_mode_refcounts(d) )
1706 okay = get_page_from_pagenr(mfn, d);
1707 if ( unlikely(!okay) )
1709 MEM_LOG("Error while installing new baseptr %lx", mfn);
1710 return 0;
1713 else
1715 okay = get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d);
1716 if ( unlikely(!okay) )
1718 /* Switch to idle pagetable: this VCPU has no active p.t. now. */
1719 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
1720 v->arch.guest_table = mk_pagetable(0);
1721 update_pagetables(v);
1722 write_cr3(__pa(idle_pg_table));
1723 if ( old_base_mfn != 0 )
1724 put_page_and_type(mfn_to_page(old_base_mfn));
1726 /* Retry the validation with no active p.t. for this VCPU. */
1727 okay = get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d);
1728 if ( !okay )
1730 /* Failure here is unrecoverable: the VCPU has no pagetable! */
1731 MEM_LOG("Fatal error while installing new baseptr %lx", mfn);
1732 domain_crash(d);
1733 percpu_info[v->processor].deferred_ops = 0;
1734 return 0;
1739 invalidate_shadow_ldt(v);
1741 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
1742 v->arch.guest_table = mk_pagetable(mfn << PAGE_SHIFT);
1743 update_pagetables(v); /* update shadow_table and monitor_table */
1745 write_ptbase(v);
1747 if ( likely(old_base_mfn != 0) )
1749 if ( shadow_mode_refcounts(d) )
1750 put_page(mfn_to_page(old_base_mfn));
1751 else
1752 put_page_and_type(mfn_to_page(old_base_mfn));
1755 /* CR3 also holds a ref to its shadow... */
1756 if ( shadow_mode_enabled(d) )
1758 if ( v->arch.monitor_shadow_ref )
1759 put_shadow_ref(v->arch.monitor_shadow_ref);
1760 v->arch.monitor_shadow_ref =
1761 pagetable_get_pfn(v->arch.monitor_table);
1762 ASSERT(!page_get_owner(mfn_to_page(v->arch.monitor_shadow_ref)));
1763 get_shadow_ref(v->arch.monitor_shadow_ref);
1766 return 1;
1769 static void process_deferred_ops(unsigned int cpu)
1771 unsigned int deferred_ops;
1772 struct domain *d = current->domain;
1774 deferred_ops = percpu_info[cpu].deferred_ops;
1775 percpu_info[cpu].deferred_ops = 0;
1777 if ( deferred_ops & (DOP_FLUSH_ALL_TLBS|DOP_FLUSH_TLB) )
1779 if ( shadow_mode_enabled(d) )
1780 shadow_sync_all(d);
1781 if ( deferred_ops & DOP_FLUSH_ALL_TLBS )
1782 flush_tlb_mask(d->domain_dirty_cpumask);
1783 else
1784 local_flush_tlb();
1787 if ( deferred_ops & DOP_RELOAD_LDT )
1788 (void)map_ldt_shadow_page(0);
1790 if ( unlikely(percpu_info[cpu].foreign != NULL) )
1792 put_domain(percpu_info[cpu].foreign);
1793 percpu_info[cpu].foreign = NULL;
1797 static int set_foreigndom(unsigned int cpu, domid_t domid)
1799 struct domain *e, *d = current->domain;
1800 int okay = 1;
1802 ASSERT(percpu_info[cpu].foreign == NULL);
1804 if ( likely(domid == DOMID_SELF) )
1805 goto out;
1807 if ( domid == d->domain_id )
1809 MEM_LOG("Dom %u tried to specify itself as foreign domain",
1810 d->domain_id);
1811 okay = 0;
1813 else if ( !IS_PRIV(d) )
1815 switch ( domid )
1817 case DOMID_IO:
1818 get_knownalive_domain(dom_io);
1819 percpu_info[cpu].foreign = dom_io;
1820 break;
1821 default:
1822 MEM_LOG("Dom %u cannot set foreign dom", d->domain_id);
1823 okay = 0;
1824 break;
1827 else
1829 percpu_info[cpu].foreign = e = find_domain_by_id(domid);
1830 if ( e == NULL )
1832 switch ( domid )
1834 case DOMID_XEN:
1835 get_knownalive_domain(dom_xen);
1836 percpu_info[cpu].foreign = dom_xen;
1837 break;
1838 case DOMID_IO:
1839 get_knownalive_domain(dom_io);
1840 percpu_info[cpu].foreign = dom_io;
1841 break;
1842 default:
1843 MEM_LOG("Unknown domain '%u'", domid);
1844 okay = 0;
1845 break;
1850 out:
1851 return okay;
1854 static inline cpumask_t vcpumask_to_pcpumask(
1855 struct domain *d, unsigned long vmask)
1857 unsigned int vcpu_id;
1858 cpumask_t pmask = CPU_MASK_NONE;
1859 struct vcpu *v;
1861 while ( vmask != 0 )
1863 vcpu_id = find_first_set_bit(vmask);
1864 vmask &= ~(1UL << vcpu_id);
1865 if ( (vcpu_id < MAX_VIRT_CPUS) &&
1866 ((v = d->vcpu[vcpu_id]) != NULL) )
1867 cpus_or(pmask, pmask, v->vcpu_dirty_cpumask);
1870 return pmask;
1873 int do_mmuext_op(
1874 XEN_GUEST_HANDLE(mmuext_op_t) uops,
1875 unsigned int count,
1876 XEN_GUEST_HANDLE(uint) pdone,
1877 unsigned int foreigndom)
1879 struct mmuext_op op;
1880 int rc = 0, i = 0, okay, cpu = smp_processor_id();
1881 unsigned long mfn, type;
1882 unsigned int done = 0;
1883 struct page_info *page;
1884 struct vcpu *v = current;
1885 struct domain *d = v->domain;
1887 LOCK_BIGLOCK(d);
1889 cleanup_writable_pagetable(d);
1891 if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
1893 count &= ~MMU_UPDATE_PREEMPTED;
1894 if ( unlikely(!guest_handle_is_null(pdone)) )
1895 (void)copy_from_guest(&done, pdone, 1);
1898 if ( !set_foreigndom(cpu, foreigndom) )
1900 rc = -ESRCH;
1901 goto out;
1904 if ( unlikely(!guest_handle_okay(uops, count)) )
1906 rc = -EFAULT;
1907 goto out;
1910 for ( i = 0; i < count; i++ )
1912 if ( hypercall_preempt_check() )
1914 rc = hypercall_create_continuation(
1915 __HYPERVISOR_mmuext_op, "hihi",
1916 uops, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
1917 break;
1920 if ( unlikely(__copy_from_guest(&op, uops, 1) != 0) )
1922 MEM_LOG("Bad __copy_from_guest");
1923 rc = -EFAULT;
1924 break;
1927 okay = 1;
1928 mfn = op.arg1.mfn;
1929 page = mfn_to_page(mfn);
1931 switch ( op.cmd )
1933 case MMUEXT_PIN_L1_TABLE:
1934 type = PGT_l1_page_table | PGT_va_mutable;
1935 goto pin_page;
1937 case MMUEXT_PIN_L2_TABLE:
1938 case MMUEXT_PIN_L3_TABLE:
1939 case MMUEXT_PIN_L4_TABLE:
1940 /* Ignore pinning of subdirectories. */
1941 if ( (op.cmd - MMUEXT_PIN_L1_TABLE) != (CONFIG_PAGING_LEVELS - 1) )
1942 break;
1944 type = PGT_root_page_table;
1946 pin_page:
1947 if ( shadow_mode_refcounts(FOREIGNDOM) )
1948 break;
1950 okay = get_page_and_type_from_pagenr(mfn, type, FOREIGNDOM);
1951 if ( unlikely(!okay) )
1953 MEM_LOG("Error while pinning mfn %lx", mfn);
1954 break;
1957 if ( unlikely(test_and_set_bit(_PGT_pinned,
1958 &page->u.inuse.type_info)) )
1960 MEM_LOG("Mfn %lx already pinned", mfn);
1961 put_page_and_type(page);
1962 okay = 0;
1963 break;
1966 break;
1968 case MMUEXT_UNPIN_TABLE:
1969 if ( shadow_mode_refcounts(d) )
1970 break;
1972 if ( unlikely(!(okay = get_page_from_pagenr(mfn, d))) )
1974 MEM_LOG("Mfn %lx bad domain (dom=%p)",
1975 mfn, page_get_owner(page));
1977 else if ( likely(test_and_clear_bit(_PGT_pinned,
1978 &page->u.inuse.type_info)) )
1980 put_page_and_type(page);
1981 put_page(page);
1983 else
1985 okay = 0;
1986 put_page(page);
1987 MEM_LOG("Mfn %lx not pinned", mfn);
1989 break;
1991 case MMUEXT_NEW_BASEPTR:
1992 mfn = gmfn_to_mfn(current->domain, mfn);
1993 okay = new_guest_cr3(mfn);
1994 percpu_info[cpu].deferred_ops &= ~DOP_FLUSH_TLB;
1995 break;
1997 #ifdef __x86_64__
1998 case MMUEXT_NEW_USER_BASEPTR:
1999 okay = get_page_and_type_from_pagenr(
2000 mfn, PGT_root_page_table, d);
2001 if ( unlikely(!okay) )
2003 MEM_LOG("Error while installing new mfn %lx", mfn);
2005 else
2007 unsigned long old_mfn =
2008 pagetable_get_pfn(v->arch.guest_table_user);
2009 v->arch.guest_table_user = mk_pagetable(mfn << PAGE_SHIFT);
2010 if ( old_mfn != 0 )
2011 put_page_and_type(mfn_to_page(old_mfn));
2013 break;
2014 #endif
2016 case MMUEXT_TLB_FLUSH_LOCAL:
2017 percpu_info[cpu].deferred_ops |= DOP_FLUSH_TLB;
2018 break;
2020 case MMUEXT_INVLPG_LOCAL:
2021 if ( shadow_mode_enabled(d) )
2022 shadow_invlpg(v, op.arg1.linear_addr);
2023 local_flush_tlb_one(op.arg1.linear_addr);
2024 break;
2026 case MMUEXT_TLB_FLUSH_MULTI:
2027 case MMUEXT_INVLPG_MULTI:
2029 unsigned long vmask;
2030 cpumask_t pmask;
2031 if ( unlikely(get_user(vmask, (unsigned long *)op.arg2.vcpumask)) )
2033 okay = 0;
2034 break;
2036 pmask = vcpumask_to_pcpumask(d, vmask);
2037 if ( op.cmd == MMUEXT_TLB_FLUSH_MULTI )
2038 flush_tlb_mask(pmask);
2039 else
2040 flush_tlb_one_mask(pmask, op.arg1.linear_addr);
2041 break;
2044 case MMUEXT_TLB_FLUSH_ALL:
2045 flush_tlb_mask(d->domain_dirty_cpumask);
2046 break;
2048 case MMUEXT_INVLPG_ALL:
2049 flush_tlb_one_mask(d->domain_dirty_cpumask, op.arg1.linear_addr);
2050 break;
2052 case MMUEXT_FLUSH_CACHE:
2053 if ( unlikely(!cache_flush_permitted(d)) )
2055 MEM_LOG("Non-physdev domain tried to FLUSH_CACHE.");
2056 okay = 0;
2058 else
2060 wbinvd();
2062 break;
2064 case MMUEXT_SET_LDT:
2066 unsigned long ptr = op.arg1.linear_addr;
2067 unsigned long ents = op.arg2.nr_ents;
2069 if ( shadow_mode_external(d) )
2071 MEM_LOG("ignoring SET_LDT hypercall from external "
2072 "domain %u", d->domain_id);
2073 okay = 0;
2075 else if ( ((ptr & (PAGE_SIZE-1)) != 0) ||
2076 (ents > 8192) ||
2077 !array_access_ok(ptr, ents, LDT_ENTRY_SIZE) )
2079 okay = 0;
2080 MEM_LOG("Bad args to SET_LDT: ptr=%lx, ents=%lx", ptr, ents);
2082 else if ( (v->arch.guest_context.ldt_ents != ents) ||
2083 (v->arch.guest_context.ldt_base != ptr) )
2085 invalidate_shadow_ldt(v);
2086 v->arch.guest_context.ldt_base = ptr;
2087 v->arch.guest_context.ldt_ents = ents;
2088 load_LDT(v);
2089 percpu_info[cpu].deferred_ops &= ~DOP_RELOAD_LDT;
2090 if ( ents != 0 )
2091 percpu_info[cpu].deferred_ops |= DOP_RELOAD_LDT;
2093 break;
2096 default:
2097 MEM_LOG("Invalid extended pt command 0x%x", op.cmd);
2098 okay = 0;
2099 break;
2102 if ( unlikely(!okay) )
2104 rc = -EINVAL;
2105 break;
2108 guest_handle_add_offset(uops, 1);
2111 out:
2112 process_deferred_ops(cpu);
2114 /* Add incremental work we have done to the @done output parameter. */
2115 done += i;
2116 if ( unlikely(!guest_handle_is_null(pdone)) )
2117 copy_to_guest(pdone, &done, 1);
2119 UNLOCK_BIGLOCK(d);
2120 return rc;
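/*
 * Illustrative guest-side sketch (not part of mm.c): the handler above is
 * driven by batching struct mmuext_op requests.  HYPERVISOR_mmuext_op() is
 * assumed to be the guest's hypercall wrapper (as provided by Linux/mini-os);
 * its exact prototype may differ.
 */
#if 0 /* example only, not compiled */
static int example_pin_and_switch(unsigned long new_root_mfn)
{
    struct mmuext_op op[2];

    /* Pin the new root page table so its type cannot change under us. */
    op[0].cmd      = MMUEXT_PIN_L2_TABLE;  /* root table on 2-level x86 */
    op[0].arg1.mfn = new_root_mfn;

    /* Then install it as the new base pointer (CR3). */
    op[1].cmd      = MMUEXT_NEW_BASEPTR;
    op[1].arg1.mfn = new_root_mfn;

    return HYPERVISOR_mmuext_op(op, 2, NULL, DOMID_SELF);
}
#endif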
2123 int do_mmu_update(
2124 XEN_GUEST_HANDLE(mmu_update_t) ureqs,
2125 unsigned int count,
2126 XEN_GUEST_HANDLE(uint) pdone,
2127 unsigned int foreigndom)
2129 struct mmu_update req;
2130 void *va;
2131 unsigned long gpfn, gmfn, mfn;
2132 struct page_info *page;
2133 int rc = 0, okay = 1, i = 0, cpu = smp_processor_id();
2134 unsigned int cmd, done = 0;
2135 struct vcpu *v = current;
2136 struct domain *d = v->domain;
2137 unsigned long type_info;
2138 struct domain_mmap_cache mapcache, sh_mapcache;
2140 LOCK_BIGLOCK(d);
2142 cleanup_writable_pagetable(d);
2144 if ( unlikely(shadow_mode_enabled(d)) )
2145 check_pagetable(v, "pre-mmu"); /* debug */
2147 if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
2149 count &= ~MMU_UPDATE_PREEMPTED;
2150 if ( unlikely(!guest_handle_is_null(pdone)) )
2151 (void)copy_from_guest(&done, pdone, 1);
2154 domain_mmap_cache_init(&mapcache);
2155 domain_mmap_cache_init(&sh_mapcache);
2157 if ( !set_foreigndom(cpu, foreigndom) )
2159 rc = -ESRCH;
2160 goto out;
2163 perfc_incrc(calls_to_mmu_update);
2164 perfc_addc(num_page_updates, count);
2165 perfc_incr_histo(bpt_updates, count, PT_UPDATES);
2167 if ( unlikely(!guest_handle_okay(ureqs, count)) )
2169 rc = -EFAULT;
2170 goto out;
2173 for ( i = 0; i < count; i++ )
2175 if ( hypercall_preempt_check() )
2177 rc = hypercall_create_continuation(
2178 __HYPERVISOR_mmu_update, "hihi",
2179 ureqs, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
2180 break;
2183 if ( unlikely(__copy_from_guest(&req, ureqs, 1) != 0) )
2185 MEM_LOG("Bad __copy_from_guest");
2186 rc = -EFAULT;
2187 break;
2190 cmd = req.ptr & (sizeof(l1_pgentry_t)-1);
2191 okay = 0;
2193 switch ( cmd )
2195 /*
2196 * MMU_NORMAL_PT_UPDATE: Normal update to any level of page table.
2197 */
2198 case MMU_NORMAL_PT_UPDATE:
2200 gmfn = req.ptr >> PAGE_SHIFT;
2201 mfn = gmfn_to_mfn(d, gmfn);
2203 if ( unlikely(!get_page_from_pagenr(mfn, current->domain)) )
2205 MEM_LOG("Could not get page for normal update");
2206 break;
2209 va = map_domain_page_with_cache(mfn, &mapcache);
2210 va = (void *)((unsigned long)va +
2211 (unsigned long)(req.ptr & ~PAGE_MASK));
2212 page = mfn_to_page(mfn);
2214 switch ( (type_info = page->u.inuse.type_info) & PGT_type_mask )
2216 case PGT_l1_page_table:
2217 ASSERT( !shadow_mode_refcounts(d) );
2218 if ( likely(get_page_type(
2219 page, type_info & (PGT_type_mask|PGT_va_mask))) )
2221 l1_pgentry_t l1e;
2223 /* FIXME: doesn't work with PAE */
2224 l1e = l1e_from_intpte(req.val);
2225 okay = mod_l1_entry(va, l1e);
2226 if ( okay && unlikely(shadow_mode_enabled(d)) )
2227 shadow_l1_normal_pt_update(
2228 d, req.ptr, l1e, &sh_mapcache);
2229 put_page_type(page);
2231 break;
2232 case PGT_l2_page_table:
2233 ASSERT( !shadow_mode_refcounts(d) );
2234 if ( likely(get_page_type(
2235 page, type_info & (PGT_type_mask|PGT_va_mask))) )
2237 l2_pgentry_t l2e;
2239 /* FIXME: doesn't work with PAE */
2240 l2e = l2e_from_intpte(req.val);
2241 okay = mod_l2_entry(
2242 (l2_pgentry_t *)va, l2e, mfn, type_info);
2243 if ( okay && unlikely(shadow_mode_enabled(d)) )
2244 shadow_l2_normal_pt_update(
2245 d, req.ptr, l2e, &sh_mapcache);
2246 put_page_type(page);
2248 break;
2249 #if CONFIG_PAGING_LEVELS >= 3
2250 case PGT_l3_page_table:
2251 ASSERT( !shadow_mode_refcounts(d) );
2252 if ( likely(get_page_type(
2253 page, type_info & (PGT_type_mask|PGT_va_mask))) )
2255 l3_pgentry_t l3e;
2257 /* FIXME: doesn't work with PAE */
2258 l3e = l3e_from_intpte(req.val);
2259 okay = mod_l3_entry(va, l3e, mfn, type_info);
2260 if ( okay && unlikely(shadow_mode_enabled(d)) )
2261 shadow_l3_normal_pt_update(
2262 d, req.ptr, l3e, &sh_mapcache);
2263 put_page_type(page);
2265 break;
2266 #endif
2267 #if CONFIG_PAGING_LEVELS >= 4
2268 case PGT_l4_page_table:
2269 ASSERT( !shadow_mode_refcounts(d) );
2270 if ( likely(get_page_type(
2271 page, type_info & (PGT_type_mask|PGT_va_mask))) )
2273 l4_pgentry_t l4e;
2275 l4e = l4e_from_intpte(req.val);
2276 okay = mod_l4_entry(va, l4e, mfn, type_info);
2277 if ( okay && unlikely(shadow_mode_enabled(d)) )
2278 shadow_l4_normal_pt_update(
2279 d, req.ptr, l4e, &sh_mapcache);
2280 put_page_type(page);
2282 break;
2283 #endif
2284 default:
2285 if ( likely(get_page_type(page, PGT_writable_page)) )
2287 if ( shadow_mode_enabled(d) )
2289 shadow_lock(d);
2291 __mark_dirty(d, mfn);
2293 if ( page_is_page_table(page) &&
2294 !page_out_of_sync(page) )
2296 shadow_mark_mfn_out_of_sync(v, gmfn, mfn);
2300 *(intpte_t *)va = req.val;
2301 okay = 1;
2303 if ( shadow_mode_enabled(d) )
2304 shadow_unlock(d);
2306 put_page_type(page);
2308 break;
2311 unmap_domain_page_with_cache(va, &mapcache);
2313 put_page(page);
2314 break;
2316 case MMU_MACHPHYS_UPDATE:
2318 if ( shadow_mode_translate(FOREIGNDOM) )
2320 MEM_LOG("can't mutate m2p table of translate mode guest");
2321 break;
2324 mfn = req.ptr >> PAGE_SHIFT;
2325 gpfn = req.val;
2327 if ( unlikely(!get_page_from_pagenr(mfn, FOREIGNDOM)) )
2329 MEM_LOG("Could not get page for mach->phys update");
2330 break;
2333 set_gpfn_from_mfn(mfn, gpfn);
2334 okay = 1;
2336 mark_dirty(FOREIGNDOM, mfn);
2338 put_page(mfn_to_page(mfn));
2339 break;
2341 default:
2342 MEM_LOG("Invalid page update command %x", cmd);
2343 break;
2346 if ( unlikely(!okay) )
2348 rc = -EINVAL;
2349 break;
2352 guest_handle_add_offset(ureqs, 1);
2355 out:
2356 domain_mmap_cache_destroy(&mapcache);
2357 domain_mmap_cache_destroy(&sh_mapcache);
2359 process_deferred_ops(cpu);
2361 /* Add incremental work we have done to the @done output parameter. */
2362 done += i;
2363 if ( unlikely(!guest_handle_is_null(pdone)) )
2364 copy_to_guest(pdone, &done, 1);
2366 if ( unlikely(shadow_mode_enabled(d)) )
2367 check_pagetable(v, "post-mmu"); /* debug */
2369 UNLOCK_BIGLOCK(d);
2370 return rc;
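/*
 * Illustrative guest-side sketch (not part of mm.c): a single batched
 * MMU_NORMAL_PT_UPDATE request, with the command encoded in the low bits of
 * req.ptr exactly as decoded above.  HYPERVISOR_mmu_update() is the assumed
 * guest wrapper (as in Linux/mini-os); its exact prototype may differ.
 */
#if 0 /* example only, not compiled */
static int example_write_pte(u64 pte_machine_addr, u64 new_pte_val)
{
    struct mmu_update req;

    req.ptr = pte_machine_addr | MMU_NORMAL_PT_UPDATE; /* low 2 bits = cmd */
    req.val = new_pte_val;

    return HYPERVISOR_mmu_update(&req, 1, NULL, DOMID_SELF);
}
#endif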
2374 static int create_grant_pte_mapping(
2375 unsigned long pte_addr, l1_pgentry_t _nl1e, struct vcpu *v)
2377 int rc = GNTST_okay;
2378 void *va;
2379 unsigned long gmfn, mfn;
2380 struct page_info *page;
2381 u32 type_info;
2382 l1_pgentry_t ol1e;
2383 struct domain *d = v->domain;
2385 ASSERT(spin_is_locked(&d->big_lock));
2386 ASSERT(!shadow_mode_refcounts(d));
2388 gmfn = pte_addr >> PAGE_SHIFT;
2389 mfn = gmfn_to_mfn(d, gmfn);
2391 if ( unlikely(!get_page_from_pagenr(mfn, current->domain)) )
2393 MEM_LOG("Could not get page for normal update");
2394 return GNTST_general_error;
2397 va = map_domain_page(mfn);
2398 va = (void *)((unsigned long)va + (pte_addr & ~PAGE_MASK));
2399 page = mfn_to_page(mfn);
2401 type_info = page->u.inuse.type_info;
2402 if ( ((type_info & PGT_type_mask) != PGT_l1_page_table) ||
2403 !get_page_type(page, type_info & (PGT_type_mask|PGT_va_mask)) )
2405 MEM_LOG("Grant map attempted to update a non-L1 page");
2406 rc = GNTST_general_error;
2407 goto failed;
2410 if ( __copy_from_user(&ol1e, (l1_pgentry_t *)va, sizeof(ol1e)) ||
2411 !update_l1e(va, ol1e, _nl1e) )
2413 put_page_type(page);
2414 rc = GNTST_general_error;
2415 goto failed;
2418 put_page_from_l1e(ol1e, d);
2420 if ( unlikely(shadow_mode_enabled(d)) )
2422 struct domain_mmap_cache sh_mapcache;
2423 domain_mmap_cache_init(&sh_mapcache);
2424 shadow_l1_normal_pt_update(d, pte_addr, _nl1e, &sh_mapcache);
2425 domain_mmap_cache_destroy(&sh_mapcache);
2428 put_page_type(page);
2430 failed:
2431 unmap_domain_page(va);
2432 put_page(page);
2433 return rc;
2436 static int destroy_grant_pte_mapping(
2437 unsigned long addr, unsigned long frame, struct domain *d)
2439 int rc = GNTST_okay;
2440 void *va;
2441 unsigned long gmfn, mfn;
2442 struct page_info *page;
2443 u32 type_info;
2444 l1_pgentry_t ol1e;
2446 ASSERT(!shadow_mode_refcounts(d));
2448 gmfn = addr >> PAGE_SHIFT;
2449 mfn = gmfn_to_mfn(d, gmfn);
2451 if ( unlikely(!get_page_from_pagenr(mfn, current->domain)) )
2453 MEM_LOG("Could not get page for normal update");
2454 return GNTST_general_error;
2457 va = map_domain_page(mfn);
2458 va = (void *)((unsigned long)va + (addr & ~PAGE_MASK));
2459 page = mfn_to_page(mfn);
2461 type_info = page->u.inuse.type_info;
2462 if ( ((type_info & PGT_type_mask) != PGT_l1_page_table) ||
2463 !get_page_type(page, type_info & (PGT_type_mask|PGT_va_mask)) )
2465 MEM_LOG("Grant unmap attempted to update a non-L1 page");
2466 rc = GNTST_general_error;
2467 goto failed;
2470 if ( __copy_from_user(&ol1e, (l1_pgentry_t *)va, sizeof(ol1e)) )
2472 put_page_type(page);
2473 rc = GNTST_general_error;
2474 goto failed;
2477 /* Check that the virtual address supplied is actually mapped to frame. */
2478 if ( unlikely((l1e_get_intpte(ol1e) >> PAGE_SHIFT) != frame) )
2480 MEM_LOG("PTE entry %lx for address %lx doesn't match frame %lx",
2481 (unsigned long)l1e_get_intpte(ol1e), addr, frame);
2482 put_page_type(page);
2483 rc = GNTST_general_error;
2484 goto failed;
2487 /* Delete pagetable entry. */
2488 if ( unlikely(__put_user(0, (intpte_t *)va)))
2490 MEM_LOG("Cannot delete PTE entry at %p", va);
2491 put_page_type(page);
2492 rc = GNTST_general_error;
2493 goto failed;
2496 if ( unlikely(shadow_mode_enabled(d)) )
2498 struct domain_mmap_cache sh_mapcache;
2499 domain_mmap_cache_init(&sh_mapcache);
2500 shadow_l1_normal_pt_update(d, addr, l1e_empty(), &sh_mapcache);
2501 domain_mmap_cache_destroy(&sh_mapcache);
2504 put_page_type(page);
2506 failed:
2507 unmap_domain_page(va);
2508 put_page(page);
2509 return rc;
2513 static int create_grant_va_mapping(
2514 unsigned long va, l1_pgentry_t _nl1e, struct vcpu *v)
2516 l1_pgentry_t *pl1e, ol1e;
2517 struct domain *d = v->domain;
2519 ASSERT(spin_is_locked(&d->big_lock));
2520 ASSERT(!shadow_mode_refcounts(d));
2522 /*
2523 * This is actually overkill - we don't need to sync the L1 itself,
2524 * just everything involved in getting to this L1 (i.e. we need
2525 * linear_pg_table[l1_linear_offset(va)] to be in sync)...
2526 */
2527 __shadow_sync_va(v, va);
2529 pl1e = &linear_pg_table[l1_linear_offset(va)];
2531 if ( unlikely(__copy_from_user(&ol1e, pl1e, sizeof(ol1e)) != 0) ||
2532 !update_l1e(pl1e, ol1e, _nl1e) )
2533 return GNTST_general_error;
2535 put_page_from_l1e(ol1e, d);
2537 if ( unlikely(shadow_mode_enabled(d)) )
2538 shadow_do_update_va_mapping(va, _nl1e, v);
2540 return GNTST_okay;
2543 static int destroy_grant_va_mapping(
2544 unsigned long addr, unsigned long frame)
2546 l1_pgentry_t *pl1e, ol1e;
2548 pl1e = &linear_pg_table[l1_linear_offset(addr)];
2550 if ( unlikely(__get_user(ol1e.l1, &pl1e->l1) != 0) )
2552 MEM_LOG("Could not find PTE entry for address %lx", addr);
2553 return GNTST_general_error;
2556 /*
2557 * Check that the virtual address supplied is actually mapped to
2558 * frame.
2559 */
2560 if ( unlikely(l1e_get_pfn(ol1e) != frame) )
2562 MEM_LOG("PTE entry %lx for address %lx doesn't match frame %lx",
2563 l1e_get_pfn(ol1e), addr, frame);
2564 return GNTST_general_error;
2567 /* Delete pagetable entry. */
2568 if ( unlikely(__put_user(0, &pl1e->l1)) )
2570 MEM_LOG("Cannot delete PTE entry at %p", (unsigned long *)pl1e);
2571 return GNTST_general_error;
2574 return GNTST_okay;
2577 int create_grant_host_mapping(
2578 unsigned long addr, unsigned long frame, unsigned int flags)
2580 l1_pgentry_t pte = l1e_from_pfn(frame, GRANT_PTE_FLAGS);
2582 if ( (flags & GNTMAP_application_map) )
2583 l1e_add_flags(pte,_PAGE_USER);
2584 if ( !(flags & GNTMAP_readonly) )
2585 l1e_add_flags(pte,_PAGE_RW);
2587 if ( flags & GNTMAP_contains_pte )
2588 return create_grant_pte_mapping(addr, pte, current);
2589 return create_grant_va_mapping(addr, pte, current);
2592 int destroy_grant_host_mapping(
2593 unsigned long addr, unsigned long frame, unsigned int flags)
2595 if ( flags & GNTMAP_contains_pte )
2596 return destroy_grant_pte_mapping(addr, frame, current->domain);
2597 return destroy_grant_va_mapping(addr, frame);
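/*
 * Illustrative guest-side sketch (not part of mm.c): how the flags handled
 * above are driven from a guest mapping a remote grant into its own address
 * space.  HYPERVISOR_grant_table_op() and struct gnttab_map_grant_ref come
 * from the public grant-table interface and are assumed here; field names
 * may differ slightly between versions.
 */
#if 0 /* example only, not compiled */
static int example_map_grant(domid_t remote, grant_ref_t ref,
                             unsigned long host_va, int readonly)
{
    struct gnttab_map_grant_ref map;

    map.host_addr = host_va;           /* a VA: GNTMAP_contains_pte unset */
    map.dom       = remote;
    map.ref       = ref;
    map.flags     = GNTMAP_host_map | (readonly ? GNTMAP_readonly : 0);

    if ( HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &map, 1) != 0 )
        return -1;

    return (map.status == GNTST_okay) ? 0 : -1;
}
#endif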
2600 int steal_page_for_grant_transfer(
2601 struct domain *d, struct page_info *page)
2603 u32 _d, _nd, x, y;
2605 spin_lock(&d->page_alloc_lock);
2607 /*
2608 * The tricky bit: atomically release ownership while there is just one
2609 * benign reference to the page (PGC_allocated). If that reference
2610 * disappears then the deallocation routine will safely spin.
2611 */
2612 _d = pickle_domptr(d);
2613 _nd = page->u.inuse._domain;
2614 y = page->count_info;
2615 do {
2616 x = y;
2617 if (unlikely((x & (PGC_count_mask|PGC_allocated)) !=
2618 (1 | PGC_allocated)) || unlikely(_nd != _d)) {
2619 DPRINTK("gnttab_transfer: Bad page %p: ed=%p(%u), sd=%p,"
2620 " caf=%08x, taf=%" PRtype_info "\n",
2621 (void *) page_to_mfn(page),
2622 d, d->domain_id, unpickle_domptr(_nd), x,
2623 page->u.inuse.type_info);
2624 spin_unlock(&d->page_alloc_lock);
2625 return -1;
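/*
 * The cmpxchg8b below treats the adjacent 32-bit count_info and
 * u.inuse._domain fields as one 64-bit quantity: EDX:EAX carries the
 * expected (owner, count) pair and ECX:EBX the replacement.  The swap
 * succeeds only if the page still holds exactly the (1 | PGC_allocated)
 * reference and is still owned by 'd'; on success the owner is cleared
 * (set to NULL) while count_info is left unchanged.
 */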
2627 __asm__ __volatile__(
2628 LOCK_PREFIX "cmpxchg8b %2"
2629 : "=d" (_nd), "=a" (y),
2630 "=m" (*(volatile u64 *)(&page->count_info))
2631 : "0" (_d), "1" (x), "c" (NULL), "b" (x) );
2632 } while (unlikely(_nd != _d) || unlikely(y != x));
2634 /*
2635 * Unlink from 'd'. At least one reference remains (now anonymous), so
2636 * no one else is spinning to try to delete this page from 'd'.
2637 */
2638 d->tot_pages--;
2639 list_del(&page->list);
2641 spin_unlock(&d->page_alloc_lock);
2643 return 0;
2646 int do_update_va_mapping(unsigned long va, u64 val64,
2647 unsigned long flags)
2649 l1_pgentry_t val = l1e_from_intpte(val64);
2650 struct vcpu *v = current;
2651 struct domain *d = v->domain;
2652 unsigned int cpu = smp_processor_id();
2653 unsigned long vmask, bmap_ptr;
2654 cpumask_t pmask;
2655 int rc = 0;
2657 perfc_incrc(calls_to_update_va);
2659 if ( unlikely(!__addr_ok(va) && !shadow_mode_external(d)) )
2660 return -EINVAL;
2662 LOCK_BIGLOCK(d);
2664 cleanup_writable_pagetable(d);
2666 if ( unlikely(shadow_mode_enabled(d)) )
2667 check_pagetable(v, "pre-va"); /* debug */
2669 if ( unlikely(!mod_l1_entry(&linear_pg_table[l1_linear_offset(va)],
2670 val)) )
2671 rc = -EINVAL;
2673 if ( likely(rc == 0) && unlikely(shadow_mode_enabled(d)) )
2675 if ( unlikely(percpu_info[cpu].foreign &&
2676 (shadow_mode_translate(d) ||
2677 shadow_mode_translate(percpu_info[cpu].foreign))) )
2679 /*
2680 * The foreign domain's pfns are in a different namespace. There's
2681 * not enough information in just a gpte to figure out how to
2682 * (re-)shadow this entry.
2683 */
2684 domain_crash(d);
2687 rc = shadow_do_update_va_mapping(va, val, v);
2689 check_pagetable(v, "post-va"); /* debug */
2692 switch ( flags & UVMF_FLUSHTYPE_MASK )
2694 case UVMF_TLB_FLUSH:
2695 switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
2697 case UVMF_LOCAL:
2698 if ( unlikely(shadow_mode_enabled(d)) )
2699 shadow_sync_all(d);
2700 local_flush_tlb();
2701 break;
2702 case UVMF_ALL:
2703 flush_tlb_mask(d->domain_dirty_cpumask);
2704 break;
2705 default:
2706 if ( unlikely(get_user(vmask, (unsigned long *)bmap_ptr)) )
2707 rc = -EFAULT;
2708 pmask = vcpumask_to_pcpumask(d, vmask);
2709 flush_tlb_mask(pmask);
2710 break;
2712 break;
2714 case UVMF_INVLPG:
2715 switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
2717 case UVMF_LOCAL:
2718 if ( unlikely(shadow_mode_enabled(d)) )
2719 shadow_invlpg(current, va);
2720 local_flush_tlb_one(va);
2721 break;
2722 case UVMF_ALL:
2723 flush_tlb_one_mask(d->domain_dirty_cpumask, va);
2724 break;
2725 default:
2726 if ( unlikely(get_user(vmask, (unsigned long *)bmap_ptr)) )
2727 rc = -EFAULT;
2728 pmask = vcpumask_to_pcpumask(d, vmask);
2729 flush_tlb_one_mask(pmask, va);
2730 break;
2732 break;
2735 process_deferred_ops(cpu);
2737 UNLOCK_BIGLOCK(d);
2739 return rc;
2742 int do_update_va_mapping_otherdomain(unsigned long va, u64 val64,
2743 unsigned long flags,
2744 domid_t domid)
2746 unsigned int cpu = smp_processor_id();
2747 int rc;
2749 if ( unlikely(!IS_PRIV(current->domain)) )
2750 return -EPERM;
2752 if ( !set_foreigndom(cpu, domid) )
2753 return -ESRCH;
2755 rc = do_update_va_mapping(va, val64, flags);
2757 return rc;
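/*
 * Illustrative guest-side sketch (not part of mm.c): rewrite the PTE for a
 * single virtual address and request a local INVLPG, assuming the
 * HYPERVISOR_update_va_mapping() wrapper used by PV guests; its exact
 * prototype may differ.
 */
#if 0 /* example only, not compiled */
static int example_map_page(unsigned long va, unsigned long mfn)
{
    u64 pte = ((u64)mfn << PAGE_SHIFT) | _PAGE_PRESENT | _PAGE_RW;

    return HYPERVISOR_update_va_mapping(va, pte, UVMF_INVLPG | UVMF_LOCAL);
}
#endif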
2762 /*************************
2763 * Descriptor Tables
2764 */
2766 void destroy_gdt(struct vcpu *v)
2768 int i;
2769 unsigned long pfn;
2771 v->arch.guest_context.gdt_ents = 0;
2772 for ( i = 0; i < FIRST_RESERVED_GDT_PAGE; i++ )
2774 if ( (pfn = l1e_get_pfn(v->arch.perdomain_ptes[i])) != 0 )
2775 put_page_and_type(mfn_to_page(pfn));
2776 v->arch.perdomain_ptes[i] = l1e_empty();
2777 v->arch.guest_context.gdt_frames[i] = 0;
2782 long set_gdt(struct vcpu *v,
2783 unsigned long *frames,
2784 unsigned int entries)
2786 struct domain *d = v->domain;
2787 /* NB. There are 512 8-byte entries per GDT page. */
2788 int i, nr_pages = (entries + 511) / 512;
2789 unsigned long mfn;
2791 if ( entries > FIRST_RESERVED_GDT_ENTRY )
2792 return -EINVAL;
2794 shadow_sync_all(d);
2796 /* Check the pages in the new GDT. */
2797 for ( i = 0; i < nr_pages; i++ ) {
2798 mfn = frames[i] = gmfn_to_mfn(d, frames[i]);
2799 if ( !mfn_valid(mfn) ||
2800 !get_page_and_type(mfn_to_page(mfn), d, PGT_gdt_page) )
2801 goto fail;
2804 /* Tear down the old GDT. */
2805 destroy_gdt(v);
2807 /* Install the new GDT. */
2808 v->arch.guest_context.gdt_ents = entries;
2809 for ( i = 0; i < nr_pages; i++ )
2811 v->arch.guest_context.gdt_frames[i] = frames[i];
2812 v->arch.perdomain_ptes[i] =
2813 l1e_from_pfn(frames[i], __PAGE_HYPERVISOR);
2816 return 0;
2818 fail:
2819 while ( i-- > 0 )
2820 put_page_and_type(mfn_to_page(frames[i]));
2821 return -EINVAL;
2825 long do_set_gdt(XEN_GUEST_HANDLE(ulong) frame_list, unsigned int entries)
2827 int nr_pages = (entries + 511) / 512;
2828 unsigned long frames[16];
2829 long ret;
2831 /* Rechecked in set_gdt, but ensures a sane limit for copy_from_guest(). */
2832 if ( entries > FIRST_RESERVED_GDT_ENTRY )
2833 return -EINVAL;
2835 if ( copy_from_guest((unsigned long *)frames, frame_list, nr_pages) )
2836 return -EFAULT;
2838 LOCK_BIGLOCK(current->domain);
2840 if ( (ret = set_gdt(current, frames, entries)) == 0 )
2841 local_flush_tlb();
2843 UNLOCK_BIGLOCK(current->domain);
2845 return ret;
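/*
 * Illustrative guest-side sketch (not part of mm.c): installing a one-page
 * GDT via the hypercall handled above.  HYPERVISOR_set_gdt() is the assumed
 * guest wrapper; the frame list holds machine frame numbers.
 */
#if 0 /* example only, not compiled */
static int example_load_gdt(unsigned long gdt_mfn, unsigned int nr_entries)
{
    unsigned long frames[1] = { gdt_mfn };

    /* nr_entries must stay below FIRST_RESERVED_GDT_ENTRY (checked above). */
    return HYPERVISOR_set_gdt(frames, nr_entries);
}
#endif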
2849 long do_update_descriptor(u64 pa, u64 desc)
2851 struct domain *dom = current->domain;
2852 unsigned long gmfn = pa >> PAGE_SHIFT;
2853 unsigned long mfn;
2854 unsigned int offset;
2855 struct desc_struct *gdt_pent, d;
2856 struct page_info *page;
2857 long ret = -EINVAL;
2859 offset = ((unsigned int)pa & ~PAGE_MASK) / sizeof(struct desc_struct);
2861 *(u64 *)&d = desc;
2863 LOCK_BIGLOCK(dom);
2865 if ( !VALID_MFN(mfn = gmfn_to_mfn(dom, gmfn)) ||
2866 (((unsigned int)pa % sizeof(struct desc_struct)) != 0) ||
2867 !mfn_valid(mfn) ||
2868 !check_descriptor(&d) )
2870 UNLOCK_BIGLOCK(dom);
2871 return -EINVAL;
2874 page = mfn_to_page(mfn);
2875 if ( unlikely(!get_page(page, dom)) )
2877 UNLOCK_BIGLOCK(dom);
2878 return -EINVAL;
2881 /* Check if the given frame is in use in an unsafe context. */
2882 switch ( page->u.inuse.type_info & PGT_type_mask )
2884 case PGT_gdt_page:
2885 if ( unlikely(!get_page_type(page, PGT_gdt_page)) )
2886 goto out;
2887 break;
2888 case PGT_ldt_page:
2889 if ( unlikely(!get_page_type(page, PGT_ldt_page)) )
2890 goto out;
2891 break;
2892 default:
2893 if ( unlikely(!get_page_type(page, PGT_writable_page)) )
2894 goto out;
2895 break;
2898 if ( shadow_mode_enabled(dom) )
2900 shadow_lock(dom);
2902 __mark_dirty(dom, mfn);
2904 if ( page_is_page_table(page) && !page_out_of_sync(page) )
2905 shadow_mark_mfn_out_of_sync(current, gmfn, mfn);
2908 /* All is good so make the update. */
2909 gdt_pent = map_domain_page(mfn);
2910 memcpy(&gdt_pent[offset], &d, 8);
2911 unmap_domain_page(gdt_pent);
2913 if ( shadow_mode_enabled(dom) )
2914 shadow_unlock(dom);
2916 put_page_type(page);
2918 ret = 0; /* success */
2920 out:
2921 put_page(page);
2923 UNLOCK_BIGLOCK(dom);
2925 return ret;
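/*
 * Illustrative guest-side sketch (not part of mm.c): updating one descriptor
 * in a GDT/LDT frame through do_update_descriptor() above.  The first
 * argument is the machine address of the 8-byte descriptor and must be
 * 8-byte aligned; HYPERVISOR_update_descriptor() is the assumed guest
 * wrapper.
 */
#if 0 /* example only, not compiled */
static int example_update_descriptor(unsigned long frame_mfn,
                                     unsigned int idx, u64 new_desc)
{
    u64 ma = ((u64)frame_mfn << PAGE_SHIFT) +
             idx * sizeof(struct desc_struct);

    return HYPERVISOR_update_descriptor(ma, new_desc);
}
#endif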
2928 typedef struct e820entry e820entry_t;
2929 DEFINE_XEN_GUEST_HANDLE(e820entry_t);
2931 long arch_memory_op(int op, XEN_GUEST_HANDLE(void) arg)
2933 switch ( op )
2935 case XENMEM_add_to_physmap:
2937 struct xen_add_to_physmap xatp;
2938 unsigned long mfn = 0, gpfn;
2939 struct domain *d;
2941 if ( copy_from_guest(&xatp, arg, 1) )
2942 return -EFAULT;
2944 if ( (d = find_domain_by_id(xatp.domid)) == NULL )
2945 return -ESRCH;
2947 switch ( xatp.space )
2949 case XENMAPSPACE_shared_info:
2950 if ( xatp.idx == 0 )
2951 mfn = virt_to_mfn(d->shared_info);
2952 break;
2953 case XENMAPSPACE_grant_table:
2954 if ( xatp.idx < NR_GRANT_FRAMES )
2955 mfn = virt_to_mfn(d->grant_table->shared) + xatp.idx;
2956 break;
2957 default:
2958 break;
2961 if ( !shadow_mode_translate(d) || (mfn == 0) )
2963 put_domain(d);
2964 return -EINVAL;
2967 LOCK_BIGLOCK(d);
2969 /* Remove previously mapped page if it was present. */
2970 if ( mfn_valid(gmfn_to_mfn(d, xatp.gpfn)) )
2971 guest_remove_page(d, xatp.gpfn);
2973 /* Unmap from old location, if any. */
2974 gpfn = get_gpfn_from_mfn(mfn);
2975 if ( gpfn != INVALID_M2P_ENTRY )
2976 guest_physmap_remove_page(d, gpfn, mfn);
2978 /* Map at new location. */
2979 guest_physmap_add_page(d, xatp.gpfn, mfn);
2981 UNLOCK_BIGLOCK(d);
2983 put_domain(d);
2985 break;
2988 case XENMEM_memory_map:
2990 return -ENOSYS;
2993 case XENMEM_machine_memory_map:
2995 struct xen_memory_map memmap;
2996 XEN_GUEST_HANDLE(e820entry_t) buffer;
2997 int count;
2999 if ( !IS_PRIV(current->domain) )
3000 return -EINVAL;
3002 if ( copy_from_guest(&memmap, arg, 1) )
3003 return -EFAULT;
3004 if ( memmap.nr_entries < e820.nr_map + 1 )
3005 return -EINVAL;
3007 buffer = guest_handle_cast(memmap.buffer, e820entry_t);
3009 count = min((unsigned int)e820.nr_map, memmap.nr_entries);
3010 if ( copy_to_guest(buffer, &e820.map[0], count) < 0 )
3011 return -EFAULT;
3013 memmap.nr_entries = count;
3015 if ( copy_to_guest(arg, &memmap, 1) )
3016 return -EFAULT;
3018 return 0;
3021 default:
3022 return subarch_memory_op(op, arg);
3025 return 0;
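/*
 * Illustrative guest-side sketch (not part of mm.c): a translated guest asks
 * for its shared_info page to appear at guest pfn 'gpfn'.  Field names match
 * struct xen_add_to_physmap as used above; HYPERVISOR_memory_op() is the
 * assumed guest wrapper.
 */
#if 0 /* example only, not compiled */
static int example_map_shared_info(unsigned long gpfn)
{
    struct xen_add_to_physmap xatp;

    xatp.domid = DOMID_SELF;
    xatp.space = XENMAPSPACE_shared_info;
    xatp.idx   = 0;
    xatp.gpfn  = gpfn;

    return HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp);
}
#endif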
3029 /*************************
3030 * Writable Pagetables
3031 */
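/*
 * Overview: when a guest write-faults on a page that is mapped read-only
 * because it is an in-use L1 page table, ptwr_do_page_fault() (below)
 * snapshots the L1, disconnects it from the L2 if it is part of the current
 * address space (the ACTIVE slot), and temporarily gives the guest a
 * writable mapping.  A later fault or cleanup calls ptwr_flush(), which
 * write-protects the page again and has revalidate_l1() re-check only the
 * entries that differ from the snapshot, fixing up reference counts.
 * Updates that cannot be batched this way fall back to single-instruction
 * emulation via ptwr_emulated_update().
 */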
3033 #ifdef VVERBOSE
3034 int ptwr_debug = 0x0;
3035 #define PTWR_PRINTK(_f, _a...) \
3036 do { if ( unlikely(ptwr_debug) ) printk( _f , ## _a ); } while ( 0 )
3037 #define PTWR_PRINT_WHICH (which ? 'I' : 'A')
3038 #else
3039 #define PTWR_PRINTK(_f, _a...) ((void)0)
3040 #endif
3043 #ifdef PERF_ARRAYS
3045 /**************** writeable pagetables profiling functions *****************/
3047 #define ptwr_eip_buckets 256
3049 int ptwr_eip_stat_threshold[] = {1, 10, 50, 100, L1_PAGETABLE_ENTRIES};
3051 #define ptwr_eip_stat_thresholdN (sizeof(ptwr_eip_stat_threshold)/sizeof(int))
3053 typedef struct {
3054 unsigned long eip;
3055 domid_t id;
3056 u32 val[ptwr_eip_stat_thresholdN];
3057 } ptwr_eip_stat_t;
3059 ptwr_eip_stat_t ptwr_eip_stats[ptwr_eip_buckets];
3061 static inline unsigned int ptwr_eip_stat_hash( unsigned long eip, domid_t id )
3063 return (((unsigned long) id) ^ eip ^ (eip>>8) ^ (eip>>16) ^ (eip>>24)) %
3064 ptwr_eip_buckets;
3067 static void ptwr_eip_stat_inc(u32 *n)
3069 unsigned int i, j;
3071 if ( ++(*n) != 0 )
3072 return;
3074 *n = ~0;
3076 /* Re-scale all buckets. */
3077 for ( i = 0; i < ptwr_eip_buckets; i++ )
3078 for ( j = 0; j < ptwr_eip_stat_thresholdN; j++ )
3079 ptwr_eip_stats[i].val[j] >>= 1;
3082 static void ptwr_eip_stat_update(unsigned long eip, domid_t id, int modified)
3084 unsigned int i, j, b;
3086 i = b = ptwr_eip_stat_hash(eip, id);
3088 do
3090 if ( !ptwr_eip_stats[i].eip )
3092 /* Empty bucket: claim it for this (eip, domain) pair. */
3093 ptwr_eip_stats[i].eip = eip;
3094 ptwr_eip_stats[i].id = id;
3095 memset(ptwr_eip_stats[i].val,0, sizeof(ptwr_eip_stats[i].val));
3098 if ( ptwr_eip_stats[i].eip == eip && ptwr_eip_stats[i].id == id)
3100 for ( j = 0; j < ptwr_eip_stat_thresholdN; j++ )
3101 if ( modified <= ptwr_eip_stat_threshold[j] )
3102 break;
3103 BUG_ON(j >= ptwr_eip_stat_thresholdN);
3104 ptwr_eip_stat_inc(&ptwr_eip_stats[i].val[j]);
3105 return;
3108 i = (i+1) % ptwr_eip_buckets;
3110 while ( i != b );
3112 printk("ptwr_eip_stat: too many EIPs in use!\n");
3114 ptwr_eip_stat_print();
3115 ptwr_eip_stat_reset();
3118 void ptwr_eip_stat_reset(void)
3120 memset(ptwr_eip_stats, 0, sizeof(ptwr_eip_stats));
3123 void ptwr_eip_stat_print(void)
3125 struct domain *e;
3126 domid_t d;
3127 unsigned int i, j;
3129 for_each_domain( e )
3131 d = e->domain_id;
3133 for ( i = 0; i < ptwr_eip_buckets; i++ )
3135 if ( !ptwr_eip_stats[i].eip || ptwr_eip_stats[i].id != d )
3136 continue;
3138 printk("D %5d eip %p ",
3139 ptwr_eip_stats[i].id, (void *)ptwr_eip_stats[i].eip);
3141 for ( j = 0; j < ptwr_eip_stat_thresholdN; j++ )
3142 printk("<=%u %4u \t",
3143 ptwr_eip_stat_threshold[j],
3144 ptwr_eip_stats[i].val[j]);
3145 printk("\n");
3150 #else /* PERF_ARRAYS */
3152 #define ptwr_eip_stat_update(eip, id, modified) ((void)0)
3154 #endif
3156 /*******************************************************************/
3158 /* Re-validate a given p.t. page, given its prior snapshot */
3159 int revalidate_l1(
3160 struct domain *d, l1_pgentry_t *l1page, l1_pgentry_t *snapshot)
3162 l1_pgentry_t ol1e, nl1e;
3163 int modified = 0, i;
3165 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
3167 ol1e = snapshot[i];
3168 nl1e = l1page[i];
3170 if ( likely(l1e_get_intpte(ol1e) == l1e_get_intpte(nl1e)) )
3171 continue;
3173 /* Update number of entries modified. */
3174 modified++;
3176 /*
3177 * Fast path for PTEs that have merely been write-protected
3178 * (e.g., during a Unix fork()). A strict reduction in privilege.
3179 */
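/*
 * (Only _PAGE_RW has been cleared, so the full get_page_from_l1e()
 *  revalidation is unnecessary: the old RW mapping held a writable type
 *  reference that the new RO mapping does not, and dropping it with
 *  put_page_type() below is the only adjustment required.)
 */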
3180 if ( likely(l1e_get_intpte(ol1e) == (l1e_get_intpte(nl1e)|_PAGE_RW)) )
3182 if ( likely(l1e_get_flags(nl1e) & _PAGE_PRESENT) )
3183 put_page_type(mfn_to_page(l1e_get_pfn(nl1e)));
3184 continue;
3187 if ( unlikely(!get_page_from_l1e(nl1e, d)) )
3189 /*
3190 * Make the remaining p.t.s consistent before crashing, so the
3191 * reference counts are correct.
3192 */
3193 memcpy(&l1page[i], &snapshot[i],
3194 (L1_PAGETABLE_ENTRIES - i) * sizeof(l1_pgentry_t));
3196 /* Crash the offending domain. */
3197 MEM_LOG("ptwr: Could not revalidate l1 page");
3198 domain_crash(d);
3199 break;
3202 put_page_from_l1e(ol1e, d);
3205 return modified;
3209 /* Flush the given writable p.t. page and write-protect it again. */
3210 void ptwr_flush(struct domain *d, const int which)
3212 unsigned long l1va;
3213 l1_pgentry_t *pl1e, pte, *ptep;
3214 l2_pgentry_t *pl2e;
3215 unsigned int modified;
3217 #ifdef CONFIG_X86_64
3218 struct vcpu *v = current;
3219 int user_mode = !(v->arch.flags & TF_kernel_mode);
3220 #endif
3222 ASSERT(!shadow_mode_enabled(d));
3224 if ( unlikely(d->arch.ptwr[which].vcpu != current) )
3225 /* Don't use write_ptbase: it may switch to guest_user on x86/64! */
3226 __write_ptbase(pagetable_get_pfn(
3227 d->arch.ptwr[which].vcpu->arch.guest_table));
3228 else
3229 TOGGLE_MODE();
3231 l1va = d->arch.ptwr[which].l1va;
3232 ptep = (l1_pgentry_t *)&linear_pg_table[l1_linear_offset(l1va)];
3234 /*
3235 * STEP 1. Write-protect the p.t. page so no more updates can occur.
3236 */
3238 if ( unlikely(__get_user(pte.l1, &ptep->l1)) )
3240 MEM_LOG("ptwr: Could not read pte at %p", ptep);
3241 /*
3242 * Really a bug: we were able to read this PTE when handling the initial
3243 * fault, and the page tables cannot have changed in the meantime.
3244 */
3245 BUG();
3247 PTWR_PRINTK("[%c] disconnected_l1va at %p is %"PRIpte"\n",
3248 PTWR_PRINT_WHICH, ptep, l1e_get_intpte(pte));
3249 l1e_remove_flags(pte, _PAGE_RW);
3251 /* Write-protect the p.t. page in the guest page table. */
3252 if ( unlikely(__put_user(pte, ptep)) )
3254 MEM_LOG("ptwr: Could not update pte at %p", ptep);
3255 /*
3256 * Really a bug: we were able to write this PTE when handling the initial
3257 * fault, and the page tables cannot have changed in the meantime.
3258 */
3259 BUG();
3262 /* Ensure that there are no stale writable mappings in any TLB. */
3263 /* NB. INVLPG is a serialising instruction: flushes pending updates. */
3264 flush_tlb_one_mask(d->domain_dirty_cpumask, l1va);
3265 PTWR_PRINTK("[%c] disconnected_l1va at %p now %"PRIpte"\n",
3266 PTWR_PRINT_WHICH, ptep, l1e_get_intpte(pte));
3268 /*
3269 * STEP 2. Validate any modified PTEs.
3270 */
3272 if ( likely(d == current->domain) )
3274 pl1e = map_domain_page(l1e_get_pfn(pte));
3275 modified = revalidate_l1(d, pl1e, d->arch.ptwr[which].page);
3276 unmap_domain_page(pl1e);
3277 perfc_incr_histo(wpt_updates, modified, PT_UPDATES);
3278 ptwr_eip_stat_update(d->arch.ptwr[which].eip, d->domain_id, modified);
3279 d->arch.ptwr[which].prev_nr_updates = modified;
3281 else
3283 /*
3284 * Must make a temporary global mapping, since we are running in the
3285 * wrong address space, so no access to our own mapcache.
3286 */
3287 pl1e = map_domain_page_global(l1e_get_pfn(pte));
3288 modified = revalidate_l1(d, pl1e, d->arch.ptwr[which].page);
3289 unmap_domain_page_global(pl1e);
3292 /*
3293 * STEP 3. Reattach the L1 p.t. page into the current address space.
3294 */
3296 if ( which == PTWR_PT_ACTIVE )
3298 pl2e = &__linear_l2_table[d->arch.ptwr[which].l2_idx];
3299 l2e_add_flags(*pl2e, _PAGE_PRESENT);
3302 /*
3303 * STEP 4. Final tidy-up.
3304 */
3306 d->arch.ptwr[which].l1va = 0;
3308 if ( unlikely(d->arch.ptwr[which].vcpu != current) )
3309 write_ptbase(current);
3310 else
3311 TOGGLE_MODE();
3314 static int ptwr_emulated_update(
3315 unsigned long addr,
3316 paddr_t old,
3317 paddr_t val,
3318 unsigned int bytes,
3319 unsigned int do_cmpxchg)
3321 unsigned long pfn, l1va;
3322 struct page_info *page;
3323 l1_pgentry_t pte, ol1e, nl1e, *pl1e;
3324 struct domain *d = current->domain;
3326 /* Aligned access only, thank you. */
3327 if ( !access_ok(addr, bytes) || ((addr & (bytes-1)) != 0) )
3329 MEM_LOG("ptwr_emulate: Unaligned or bad size ptwr access (%d, %lx)",
3330 bytes, addr);
3331 return X86EMUL_UNHANDLEABLE;
3334 /* Turn a sub-word access into a full-word access. */
3335 if ( bytes != sizeof(paddr_t) )
3337 paddr_t full;
3338 unsigned int offset = addr & (sizeof(paddr_t)-1);
3340 /* Align address; read full word. */
3341 addr &= ~(sizeof(paddr_t)-1);
3342 if ( copy_from_user(&full, (void *)addr, sizeof(paddr_t)) )
3344 propagate_page_fault(addr, 4); /* user mode, read fault */
3345 return X86EMUL_PROPAGATE_FAULT;
3347 /* Mask out bits provided by caller. */
3348 full &= ~((((paddr_t)1 << (bytes*8)) - 1) << (offset*8));
3349 /* Shift the caller value and OR in the missing bits. */
3350 val &= (((paddr_t)1 << (bytes*8)) - 1);
3351 val <<= (offset)*8;
3352 val |= full;
3353 /* Also fill in missing parts of the cmpxchg old value. */
3354 old &= (((paddr_t)1 << (bytes*8)) - 1);
3355 old <<= (offset)*8;
3356 old |= full;
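/*
 * Worked example (illustrative, assuming a 4-byte paddr_t): a 2-byte write
 * of 0xBEEF at offset 2 of a PTE currently holding 0x11223344 reads
 * full = 0x11223344, clears the target bytes to give 0x00003344, and ORs in
 * (0xBEEF << 16), producing val = 0xBEEF3344 -- the bytes the guest did not
 * write are preserved.
 */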
3359 /*
3360 * We must not emulate an update to a PTE that is temporarily marked
3361 * writable by the batched ptwr logic, else we can corrupt page refcnts!
3362 */
3363 if ( ((l1va = d->arch.ptwr[PTWR_PT_ACTIVE].l1va) != 0) &&
3364 (l1_linear_offset(l1va) == l1_linear_offset(addr)) )
3365 ptwr_flush(d, PTWR_PT_ACTIVE);
3366 if ( ((l1va = d->arch.ptwr[PTWR_PT_INACTIVE].l1va) != 0) &&
3367 (l1_linear_offset(l1va) == l1_linear_offset(addr)) )
3368 ptwr_flush(d, PTWR_PT_INACTIVE);
3370 /* Read the PTE that maps the page being updated. */
3371 if ( __copy_from_user(&pte, &linear_pg_table[l1_linear_offset(addr)],
3372 sizeof(pte)) )
3374 MEM_LOG("ptwr_emulate: Cannot read thru linear_pg_table");
3375 return X86EMUL_UNHANDLEABLE;
3378 pfn = l1e_get_pfn(pte);
3379 page = mfn_to_page(pfn);
3381 /* We are looking only for read-only mappings of p.t. pages. */
3382 ASSERT((l1e_get_flags(pte) & (_PAGE_RW|_PAGE_PRESENT)) == _PAGE_PRESENT);
3383 ASSERT((page->u.inuse.type_info & PGT_type_mask) == PGT_l1_page_table);
3384 ASSERT((page->u.inuse.type_info & PGT_count_mask) != 0);
3385 ASSERT(page_get_owner(page) == d);
3387 /* Check the new PTE. */
3388 nl1e = l1e_from_intpte(val);
3389 if ( unlikely(!get_page_from_l1e(nl1e, d)) )
3391 MEM_LOG("ptwr_emulate: could not get_page_from_l1e()");
3392 return X86EMUL_UNHANDLEABLE;
3395 /* Checked successfully: do the update (write or cmpxchg). */
3396 pl1e = map_domain_page(page_to_mfn(page));
3397 pl1e = (l1_pgentry_t *)((unsigned long)pl1e + (addr & ~PAGE_MASK));
3398 if ( do_cmpxchg )
3400 ol1e = l1e_from_intpte(old);
3401 if ( cmpxchg((intpte_t *)pl1e, old, val) != old )
3403 unmap_domain_page(pl1e);
3404 put_page_from_l1e(nl1e, d);
3405 return X86EMUL_CMPXCHG_FAILED;
3408 else
3410 ol1e = *pl1e;
3411 *pl1e = nl1e;
3413 unmap_domain_page(pl1e);
3415 /* Finally, drop the old PTE. */
3416 put_page_from_l1e(ol1e, d);
3418 return X86EMUL_CONTINUE;
3421 static int ptwr_emulated_write(
3422 unsigned long addr,
3423 unsigned long val,
3424 unsigned int bytes,
3425 struct x86_emulate_ctxt *ctxt)
3427 return ptwr_emulated_update(addr, 0, val, bytes, 0);
3430 static int ptwr_emulated_cmpxchg(
3431 unsigned long addr,
3432 unsigned long old,
3433 unsigned long new,
3434 unsigned int bytes,
3435 struct x86_emulate_ctxt *ctxt)
3437 return ptwr_emulated_update(addr, old, new, bytes, 1);
3440 static int ptwr_emulated_cmpxchg8b(
3441 unsigned long addr,
3442 unsigned long old,
3443 unsigned long old_hi,
3444 unsigned long new,
3445 unsigned long new_hi,
3446 struct x86_emulate_ctxt *ctxt)
3448 if ( CONFIG_PAGING_LEVELS == 2 )
3449 return X86EMUL_UNHANDLEABLE;
3450 else
3451 return ptwr_emulated_update(
3452 addr, ((u64)old_hi << 32) | old, ((u64)new_hi << 32) | new, 8, 1);
3455 static struct x86_emulate_ops ptwr_emulate_ops = {
3456 .read_std = x86_emulate_read_std,
3457 .write_std = x86_emulate_write_std,
3458 .read_emulated = x86_emulate_read_std,
3459 .write_emulated = ptwr_emulated_write,
3460 .cmpxchg_emulated = ptwr_emulated_cmpxchg,
3461 .cmpxchg8b_emulated = ptwr_emulated_cmpxchg8b
3462 };
3464 /* Write page fault handler: check if guest is trying to modify a PTE. */
3465 int ptwr_do_page_fault(struct domain *d, unsigned long addr,
3466 struct cpu_user_regs *regs)
3468 unsigned long pfn;
3469 struct page_info *page;
3470 l1_pgentry_t *pl1e, pte;
3471 l2_pgentry_t *pl2e, l2e;
3472 int which, flags;
3473 unsigned long l2_idx;
3474 struct x86_emulate_ctxt emul_ctxt;
3476 if ( unlikely(shadow_mode_enabled(d)) )
3477 return 0;
3479 /*
3480 * Attempt to read the PTE that maps the VA being accessed. By checking for
3481 * PDE validity in the L2 we avoid many expensive fixups in __get_user().
3482 */
3483 if ( !(l2e_get_flags(__linear_l2_table[l2_linear_offset(addr)]) &
3484 _PAGE_PRESENT) ||
3485 __copy_from_user(&pte,&linear_pg_table[l1_linear_offset(addr)],
3486 sizeof(pte)) )
3488 return 0;
3491 pfn = l1e_get_pfn(pte);
3492 page = mfn_to_page(pfn);
3494 #ifdef CONFIG_X86_64
3495 #define WRPT_PTE_FLAGS (_PAGE_RW | _PAGE_PRESENT | _PAGE_USER)
3496 #else
3497 #define WRPT_PTE_FLAGS (_PAGE_RW | _PAGE_PRESENT)
3498 #endif
3500 /*
3501 * Check the required flags for a valid wrpt mapping. If the page is
3502 * already writable then we can return straight to the guest (SMP race).
3503 * We decide whether to propagate the fault by writing the PTE back through
3504 * the linear mapping, which checks write permission in the page directories.
3505 */
3506 if ( (flags = l1e_get_flags(pte) & WRPT_PTE_FLAGS) == WRPT_PTE_FLAGS )
3507 return __put_user(
3508 pte.l1, &linear_pg_table[l1_linear_offset(addr)].l1) ?
3509 0 : EXCRET_not_a_fault;
3511 /* We are looking only for read-only mappings of p.t. pages. */
3512 if ( ((flags | _PAGE_RW) != WRPT_PTE_FLAGS) ||
3513 ((page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table) ||
3514 ((page->u.inuse.type_info & PGT_count_mask) == 0) ||
3515 (page_get_owner(page) != d) )
3517 return 0;
3520 #if 0 /* Leave this in as useful for debugging */
3521 goto emulate;
3522 #endif
3524 PTWR_PRINTK("ptwr_page_fault on l1 pt at va %lx, pfn %lx, eip %lx\n",
3525 addr, pfn, (unsigned long)regs->eip);
3527 /* Get the L2 index at which this L1 p.t. is always mapped. */
3528 l2_idx = page->u.inuse.type_info & PGT_va_mask;
3529 if ( unlikely(l2_idx >= PGT_va_unknown) )
3530 goto emulate; /* Urk! This L1 is mapped in multiple L2 slots! */
3531 l2_idx >>= PGT_va_shift;
3533 if ( unlikely(l2_idx == l2_linear_offset(addr)) )
3534 goto emulate; /* Urk! Pagetable maps itself! */
3536 /*
3537 * Is the L1 p.t. mapped into the current address space? If so we call it
3538 * an ACTIVE p.t., otherwise it is INACTIVE.
3539 */
3540 pl2e = &__linear_l2_table[l2_idx];
3541 which = PTWR_PT_INACTIVE;
3543 if ( (__get_user(l2e.l2, &pl2e->l2) == 0) && (l2e_get_pfn(l2e) == pfn) )
3545 /*
3546 * Check the PRESENT bit to set ACTIVE mode.
3547 * If the PRESENT bit is clear, we may be conflicting with the current
3548 * ACTIVE p.t. (it may be the same p.t. mapped at another virt addr).
3549 * The ptwr_flush call below will restore the PRESENT bit.
3550 */
3551 if ( likely(l2e_get_flags(l2e) & _PAGE_PRESENT) ||
3552 (d->arch.ptwr[PTWR_PT_ACTIVE].l1va &&
3553 (l2_idx == d->arch.ptwr[PTWR_PT_ACTIVE].l2_idx)) )
3554 which = PTWR_PT_ACTIVE;
3557 /*
3558 * If this is a multi-processor guest then ensure that the page is hooked
3559 * into at most one L2 table, which must be the one running on this VCPU.
3560 */
3561 if ( (d->vcpu[0]->next_in_list != NULL) &&
3562 ((page->u.inuse.type_info & PGT_count_mask) !=
3563 (!!(page->u.inuse.type_info & PGT_pinned) +
3564 (which == PTWR_PT_ACTIVE))) )
3566 /* Could be conflicting writable mappings from other VCPUs. */
3567 cleanup_writable_pagetable(d);
3568 goto emulate;
3571 /*
3572 * We only allow one ACTIVE and one INACTIVE p.t. to be updated at a
3573 * time. If there is already one, we must flush it out.
3574 */
3575 if ( d->arch.ptwr[which].l1va )
3576 ptwr_flush(d, which);
3578 /*
3579 * If last batch made no updates then we are probably stuck. Emulate this
3580 * update to ensure we make progress.
3581 */
3582 if ( d->arch.ptwr[which].prev_nr_updates == 0 )
3584 /* Ensure that we don't get stuck in an emulation-only rut. */
3585 d->arch.ptwr[which].prev_nr_updates = 1;
3586 goto emulate;
3589 PTWR_PRINTK("[%c] batched ptwr_page_fault at va %lx, pt for %08lx, "
3590 "pfn %lx\n", PTWR_PRINT_WHICH, addr,
3591 l2_idx << L2_PAGETABLE_SHIFT, pfn);
3593 d->arch.ptwr[which].l1va = addr | 1;
3594 d->arch.ptwr[which].l2_idx = l2_idx;
3595 d->arch.ptwr[which].vcpu = current;
3597 #ifdef PERF_ARRAYS
3598 d->arch.ptwr[which].eip = regs->eip;
3599 #endif
3601 /* For safety, disconnect the L1 p.t. page from current space. */
3602 if ( which == PTWR_PT_ACTIVE )
3604 l2e_remove_flags(*pl2e, _PAGE_PRESENT);
3605 flush_tlb_mask(d->domain_dirty_cpumask);
3608 /* Temporarily map the L1 page, and make a copy of it. */
3609 pl1e = map_domain_page(pfn);
3610 memcpy(d->arch.ptwr[which].page, pl1e, PAGE_SIZE);
3611 unmap_domain_page(pl1e);
3613 /* Finally, make the p.t. page writable by the guest OS. */
3614 l1e_add_flags(pte, _PAGE_RW);
3615 if ( unlikely(__put_user(pte.l1,
3616 &linear_pg_table[l1_linear_offset(addr)].l1)) )
3618 MEM_LOG("ptwr: Could not update pte at %p", (unsigned long *)
3619 &linear_pg_table[l1_linear_offset(addr)]);
3620 /* Toss the writable pagetable state and crash. */
3621 d->arch.ptwr[which].l1va = 0;
3622 domain_crash(d);
3623 return 0;
3626 return EXCRET_fault_fixed;
3628 emulate:
3629 emul_ctxt.regs = guest_cpu_user_regs();
3630 emul_ctxt.cr2 = addr;
3631 emul_ctxt.mode = X86EMUL_MODE_HOST;
3632 if ( x86_emulate_memop(&emul_ctxt, &ptwr_emulate_ops) )
3633 return 0;
3634 perfc_incrc(ptwr_emulations);
3635 return EXCRET_fault_fixed;
3638 int ptwr_init(struct domain *d)
3640 void *x = alloc_xenheap_page();
3641 void *y = alloc_xenheap_page();
3643 if ( (x == NULL) || (y == NULL) )
3645 free_xenheap_page(x);
3646 free_xenheap_page(y);
3647 return -ENOMEM;
3650 d->arch.ptwr[PTWR_PT_ACTIVE].page = x;
3651 d->arch.ptwr[PTWR_PT_INACTIVE].page = y;
3653 return 0;
3656 void ptwr_destroy(struct domain *d)
3658 LOCK_BIGLOCK(d);
3659 cleanup_writable_pagetable(d);
3660 UNLOCK_BIGLOCK(d);
3661 free_xenheap_page(d->arch.ptwr[PTWR_PT_ACTIVE].page);
3662 free_xenheap_page(d->arch.ptwr[PTWR_PT_INACTIVE].page);
3665 void cleanup_writable_pagetable(struct domain *d)
3667 if ( unlikely(!VM_ASSIST(d, VMASST_TYPE_writable_pagetables)) )
3668 return;
3670 if ( unlikely(shadow_mode_enabled(d)) )
3672 shadow_sync_all(d);
3674 else
3676 if ( d->arch.ptwr[PTWR_PT_ACTIVE].l1va )
3677 ptwr_flush(d, PTWR_PT_ACTIVE);
3678 if ( d->arch.ptwr[PTWR_PT_INACTIVE].l1va )
3679 ptwr_flush(d, PTWR_PT_INACTIVE);
3683 int map_pages_to_xen(
3684 unsigned long virt,
3685 unsigned long mfn,
3686 unsigned long nr_mfns,
3687 unsigned long flags)
3689 l2_pgentry_t *pl2e, ol2e;
3690 l1_pgentry_t *pl1e, ol1e;
3691 unsigned int i;
3693 unsigned int map_small_pages = !!(flags & MAP_SMALL_PAGES);
3694 flags &= ~MAP_SMALL_PAGES;
3696 while ( nr_mfns != 0 )
3698 pl2e = virt_to_xen_l2e(virt);
3700 if ( ((((virt>>PAGE_SHIFT) | mfn) & ((1<<PAGETABLE_ORDER)-1)) == 0) &&
3701 (nr_mfns >= (1<<PAGETABLE_ORDER)) &&
3702 !map_small_pages )
3704 /* Super-page mapping. */
3705 ol2e = *pl2e;
3706 *pl2e = l2e_from_pfn(mfn, flags|_PAGE_PSE);
3708 if ( (l2e_get_flags(ol2e) & _PAGE_PRESENT) )
3710 local_flush_tlb_pge();
3711 if ( !(l2e_get_flags(ol2e) & _PAGE_PSE) )
3712 free_xen_pagetable(l2e_get_page(*pl2e));
3715 virt += 1UL << L2_PAGETABLE_SHIFT;
3716 mfn += 1UL << PAGETABLE_ORDER;
3717 nr_mfns -= 1UL << PAGETABLE_ORDER;
3719 else
3721 /* Normal page mapping. */
3722 if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) )
3724 pl1e = page_to_virt(alloc_xen_pagetable());
3725 clear_page(pl1e);
3726 *pl2e = l2e_from_page(virt_to_page(pl1e), __PAGE_HYPERVISOR);
3728 else if ( l2e_get_flags(*pl2e) & _PAGE_PSE )
3730 pl1e = page_to_virt(alloc_xen_pagetable());
3731 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
3732 pl1e[i] = l1e_from_pfn(
3733 l2e_get_pfn(*pl2e) + i,
3734 l2e_get_flags(*pl2e) & ~_PAGE_PSE);
3735 *pl2e = l2e_from_page(virt_to_page(pl1e), __PAGE_HYPERVISOR);
3736 local_flush_tlb_pge();
3739 pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(virt);
3740 ol1e = *pl1e;
3741 *pl1e = l1e_from_pfn(mfn, flags);
3742 if ( (l1e_get_flags(ol1e) & _PAGE_PRESENT) )
3743 local_flush_tlb_one(virt);
3745 virt += 1UL << L1_PAGETABLE_SHIFT;
3746 mfn += 1UL;
3747 nr_mfns -= 1UL;
3751 return 0;
3754 void __set_fixmap(
3755 enum fixed_addresses idx, unsigned long mfn, unsigned long flags)
3757 BUG_ON(idx >= __end_of_fixed_addresses);
3758 map_pages_to_xen(fix_to_virt(idx), mfn, 1, flags);
3761 #ifdef MEMORY_GUARD
3763 void memguard_init(void)
3765 map_pages_to_xen(
3766 PAGE_OFFSET, 0, xenheap_phys_end >> PAGE_SHIFT,
3767 __PAGE_HYPERVISOR|MAP_SMALL_PAGES);
3770 static void __memguard_change_range(void *p, unsigned long l, int guard)
3772 unsigned long _p = (unsigned long)p;
3773 unsigned long _l = (unsigned long)l;
3774 unsigned long flags = __PAGE_HYPERVISOR | MAP_SMALL_PAGES;
3776 /* Ensure we are dealing with a page-aligned whole number of pages. */
3777 ASSERT((_p&PAGE_MASK) != 0);
3778 ASSERT((_l&PAGE_MASK) != 0);
3779 ASSERT((_p&~PAGE_MASK) == 0);
3780 ASSERT((_l&~PAGE_MASK) == 0);
3782 if ( guard )
3783 flags &= ~_PAGE_PRESENT;
3785 map_pages_to_xen(
3786 _p, virt_to_maddr(p) >> PAGE_SHIFT, _l >> PAGE_SHIFT, flags);
3789 void memguard_guard_range(void *p, unsigned long l)
3791 __memguard_change_range(p, l, 1);
3794 void memguard_unguard_range(void *p, unsigned long l)
3796 __memguard_change_range(p, l, 0);
3799 #endif
3801 /*
3802 * Local variables:
3803 * mode: C
3804 * c-set-style: "BSD"
3805 * c-basic-offset: 4
3806 * tab-width: 4
3807 * indent-tabs-mode: nil
3808 * End:
3809 */