xen/arch/x86/mm.c @ 10442:8eab9f3cdb1a (direct-io.hg)

[XEN] Remove code from writable-pagetable emulation path which
does not appear to be necessary. Replace with assertions for now,
pending total removal or me being proved wrong.
Signed-off-by: Keir Fraser <keir@xensource.com>
author kaf24@firebug.cl.cam.ac.uk
date Sun Jun 18 19:49:31 2006 +0100 (2006-06-18)
parents 7713276d159e
children 5d44f3ab9950
1 /******************************************************************************
2 * arch/x86/mm.c
3 *
4 * Copyright (c) 2002-2005 K A Fraser
5 * Copyright (c) 2004 Christian Limpach
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
22 /*
23 * A description of the x86 page table API:
24 *
25 * Domains trap to do_mmu_update with a list of update requests.
26 * This is a list of (ptr, val) pairs, where the requested operation
27 * is *ptr = val.
28 *
29 * Reference counting of pages:
30 * ----------------------------
31 * Each page has two refcounts: tot_count and type_count.
32 *
33 * TOT_COUNT is the obvious reference count. It counts all uses of a
34 * physical page frame by a domain, including uses as a page directory,
35 * a page table, or simple mappings via a PTE. This count prevents a
36 * domain from releasing a frame back to the free pool when it still holds
37 * a reference to it.
38 *
39 * TYPE_COUNT is more subtle. A frame can be put to one of three
40 * mutually-exclusive uses: it might be used as a page directory, or a
41 * page table, or it may be mapped writable by the domain [of course, a
42 * frame may not be used in any of these three ways!].
43 * So, type_count is a count of the number of times a frame is being
44 * referred to in its current incarnation. Therefore, a page can only
45 * change its type when its type count is zero.
46 *
47 * Pinning the page type:
48 * ----------------------
49 * The type of a page can be pinned/unpinned with the commands
50 * MMUEXT_[UN]PIN_L?_TABLE. Each page can be pinned exactly once (that is,
51 * pinning is not reference counted, so it can't be nested).
52 * This is useful to prevent a page's type count falling to zero, at which
53 * point safety checks would need to be carried out next time the count
54 * is increased again.
55 *
56 * A further note on writable page mappings:
57 * -----------------------------------------
58 * For simplicity, the count of writable mappings for a page may not
59 * correspond to reality. The 'writable count' is incremented for every
60 * PTE which maps the page with the _PAGE_RW flag set. However, for
61 * write access to be possible the page directory entry must also have
62 * its _PAGE_RW bit set. We do not check this as it complicates the
63 * reference counting considerably [consider the case of multiple
64 * directory entries referencing a single page table, some with the RW
65 * bit set, others not -- it starts getting a bit messy].
66 * In normal use, this simplification shouldn't be a problem.
67 * However, the logic can be added if required.
68 *
69 * One more note on read-only page mappings:
70 * -----------------------------------------
71 * We want domains to be able to map pages for read-only access. The
72 * main reason is that page tables and directories should be readable
73 * by a domain, but it would not be safe for them to be writable.
74 * However, domains have free access to rings 1 & 2 of the Intel
75 * privilege model. In terms of page protection, these are considered
76 * to be part of 'supervisor mode'. The WP bit in CR0 controls whether
77 * read-only restrictions are respected in supervisor mode -- if the
78 * bit is clear then any mapped page is writable.
79 *
80 * We get round this by always setting the WP bit and disallowing
81 * updates to it. This is very unlikely to cause a problem for guest
82 * OSes, which will generally use the WP bit to simplify their copy-on-write
83 * implementation (in that case, the OS wants a fault when it writes to
84 * an application-supplied buffer).
85 */
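/*
 * A minimal guest-side sketch of the interface described above, assuming
 * the conventional guest hypercall bindings (HYPERVISOR_mmuext_op and
 * HYPERVISOR_mmu_update) and the public mmuext_op/mmu_update layouts;
 * those wrapper names belong to guest code and are not defined here.
 */
#if 0 /* illustrative only -- not part of the hypervisor build */
static int guest_pin_and_update(unsigned long l2_mfn, uint64_t pte_maddr,
                                uint64_t new_val)
{
    struct mmuext_op pin;
    struct mmu_update upd;
    int done;

    /* Pin the frame as an L2 table so its type count cannot fall to zero. */
    pin.cmd = MMUEXT_PIN_L2_TABLE;
    pin.arg1.mfn = l2_mfn;
    if ( HYPERVISOR_mmuext_op(&pin, 1, &done, DOMID_SELF) != 0 )
        return -1;

    /* One (ptr,val) pair meaning "*ptr = val", validated by do_mmu_update(). */
    upd.ptr = pte_maddr | MMU_NORMAL_PT_UPDATE;
    upd.val = new_val;
    return HYPERVISOR_mmu_update(&upd, 1, &done, DOMID_SELF);
}
#endif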
87 #include <xen/config.h>
88 #include <xen/init.h>
89 #include <xen/kernel.h>
90 #include <xen/lib.h>
91 #include <xen/mm.h>
92 #include <xen/domain.h>
93 #include <xen/sched.h>
94 #include <xen/errno.h>
95 #include <xen/perfc.h>
96 #include <xen/irq.h>
97 #include <xen/softirq.h>
98 #include <xen/domain_page.h>
99 #include <xen/event.h>
100 #include <xen/iocap.h>
101 #include <xen/guest_access.h>
102 #include <asm/shadow.h>
103 #include <asm/page.h>
104 #include <asm/flushtlb.h>
105 #include <asm/io.h>
106 #include <asm/ldt.h>
107 #include <asm/x86_emulate.h>
108 #include <public/memory.h>
110 #ifdef VERBOSE
111 #define MEM_LOG(_f, _a...) \
112 printk("DOM%u: (file=mm.c, line=%d) " _f "\n", \
113 current->domain->domain_id , __LINE__ , ## _a )
114 #else
115 #define MEM_LOG(_f, _a...) ((void)0)
116 #endif
118 /*
119 * Both do_mmuext_op() and do_mmu_update():
120 * We steal the m.s.b. of the @count parameter to indicate whether this
121 * invocation of do_mmu_update() is resuming a previously preempted call.
122 */
123 #define MMU_UPDATE_PREEMPTED (~(~0U>>1))
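/*
 * A short sketch of how this flag is used, mirroring the pattern in
 * do_mmu_update()/do_mmuext_op() below: a preempted call packs the
 * remaining count together with the flag into its continuation, and the
 * resumed call strips the flag before processing.
 */
#if 0 /* illustrative only */
    /* When preempted after i of count requests: */
    rc = hypercall_create_continuation(
        __HYPERVISOR_mmu_update, "hihi",
        ureqs, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);

    /* On re-entry, detect the resumed call and clear the flag: */
    if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
        count &= ~MMU_UPDATE_PREEMPTED;
#endif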
125 static void free_l2_table(struct page_info *page);
126 static void free_l1_table(struct page_info *page);
128 static int mod_l2_entry(l2_pgentry_t *, l2_pgentry_t, unsigned long,
129 unsigned long type);
130 static int mod_l1_entry(l1_pgentry_t *, l1_pgentry_t);
132 /* Used to defer flushing of memory structures. */
133 static struct {
134 #define DOP_FLUSH_TLB (1<<0) /* Flush the local TLB. */
135 #define DOP_FLUSH_ALL_TLBS (1<<1) /* Flush TLBs of all VCPUs of current dom. */
136 #define DOP_RELOAD_LDT (1<<2) /* Reload the LDT shadow mapping. */
137 unsigned int deferred_ops;
138 /* If non-NULL, specifies a foreign subject domain for some operations. */
139 struct domain *foreign;
140 } __cacheline_aligned percpu_info[NR_CPUS];
142 /*
143 * Returns the current foreign domain; defaults to the currently-executing
144 * domain if a foreign override hasn't been specified.
145 */
146 #define FOREIGNDOM (percpu_info[smp_processor_id()].foreign ?: current->domain)
148 /* Private domain structs for DOMID_XEN and DOMID_IO. */
149 static struct domain *dom_xen, *dom_io;
151 /* Frame table and its size in pages. */
152 struct page_info *frame_table;
153 unsigned long max_page;
154 unsigned long total_pages;
156 void __init init_frametable(void)
157 {
158 unsigned long nr_pages, page_step, i, mfn;
160 frame_table = (struct page_info *)FRAMETABLE_VIRT_START;
162 nr_pages = PFN_UP(max_page * sizeof(*frame_table));
163 page_step = (1 << L2_PAGETABLE_SHIFT) >> PAGE_SHIFT;
165 for ( i = 0; i < nr_pages; i += page_step )
166 {
167 mfn = alloc_boot_pages(min(nr_pages - i, page_step), page_step);
168 if ( mfn == 0 )
169 panic("Not enough memory for frame table\n");
170 map_pages_to_xen(
171 FRAMETABLE_VIRT_START + (i << PAGE_SHIFT),
172 mfn, page_step, PAGE_HYPERVISOR);
173 }
175 memset(frame_table, 0, nr_pages << PAGE_SHIFT);
176 }
178 void arch_init_memory(void)
179 {
180 extern void subarch_init_memory(void);
182 unsigned long i, pfn, rstart_pfn, rend_pfn;
184 memset(percpu_info, 0, sizeof(percpu_info));
186 /*
187 * Initialise our DOMID_XEN domain.
188 * Any Xen-heap pages that we will allow to be mapped will have
189 * their domain field set to dom_xen.
190 */
191 dom_xen = alloc_domain(DOMID_XEN);
192 BUG_ON(dom_xen == NULL);
194 /*
195 * Initialise our DOMID_IO domain.
196 * This domain owns I/O pages that are within the range of the page_info
197 * array. Mappings occur at the privilege level of the caller.
198 */
199 dom_io = alloc_domain(DOMID_IO);
200 BUG_ON(dom_io == NULL);
202 /* First 1MB of RAM is historically marked as I/O. */
203 for ( i = 0; i < 0x100; i++ )
204 share_xen_page_with_guest(mfn_to_page(i), dom_io, XENSHARE_writable);
206 /* Any areas not specified as RAM by the e820 map are considered I/O. */
207 for ( i = 0, pfn = 0; i < e820.nr_map; i++ )
208 {
209 if ( e820.map[i].type != E820_RAM )
210 continue;
211 /* Every page from cursor to start of next RAM region is I/O. */
212 rstart_pfn = PFN_UP(e820.map[i].addr);
213 rend_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
214 for ( ; pfn < rstart_pfn; pfn++ )
215 {
216 BUG_ON(!mfn_valid(pfn));
217 share_xen_page_with_guest(
218 mfn_to_page(pfn), dom_io, XENSHARE_writable);
219 }
220 /* Skip the RAM region. */
221 pfn = rend_pfn;
222 }
223 BUG_ON(pfn != max_page);
225 subarch_init_memory();
226 }
228 void share_xen_page_with_guest(
229 struct page_info *page, struct domain *d, int readonly)
230 {
231 if ( page_get_owner(page) == d )
232 return;
234 set_gpfn_from_mfn(page_to_mfn(page), INVALID_M2P_ENTRY);
236 spin_lock(&d->page_alloc_lock);
238 /* The incremented type count pins as writable or read-only. */
239 page->u.inuse.type_info = (readonly ? PGT_none : PGT_writable_page);
240 page->u.inuse.type_info |= PGT_validated | 1;
242 page_set_owner(page, d);
243 wmb(); /* install valid domain ptr before updating refcnt. */
244 ASSERT(page->count_info == 0);
245 page->count_info |= PGC_allocated | 1;
247 if ( unlikely(d->xenheap_pages++ == 0) )
248 get_knownalive_domain(d);
249 list_add_tail(&page->list, &d->xenpage_list);
251 spin_unlock(&d->page_alloc_lock);
252 }
254 void share_xen_page_with_privileged_guests(
255 struct page_info *page, int readonly)
256 {
257 share_xen_page_with_guest(page, dom_xen, readonly);
258 }
260 #if defined(CONFIG_X86_PAE)
262 #ifdef NDEBUG
263 /* Only PDPTs above 4GB boundary need to be shadowed in low memory. */
264 #define l3tab_needs_shadow(mfn) (mfn >= 0x100000)
265 #else
266 /*
267 * In debug builds we aggressively shadow PDPTs to exercise code paths.
268 * We cannot safely shadow the idle page table, nor shadow-mode page tables
269 * (detected by lack of an owning domain). Always shadow PDPTs above 4GB.
270 */
271 #define l3tab_needs_shadow(mfn) \
272 ((((mfn << PAGE_SHIFT) != __pa(idle_pg_table)) && \
273 (page_get_owner(mfn_to_page(mfn)) != NULL)) || \
274 (mfn >= 0x100000))
275 #endif
277 static l1_pgentry_t *fix_pae_highmem_pl1e;
279 /* Cache the address of PAE high-memory fixmap page tables. */
280 static int __init cache_pae_fixmap_address(void)
281 {
282 unsigned long fixmap_base = fix_to_virt(FIX_PAE_HIGHMEM_0);
283 l2_pgentry_t *pl2e = virt_to_xen_l2e(fixmap_base);
284 fix_pae_highmem_pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(fixmap_base);
285 return 0;
286 }
287 __initcall(cache_pae_fixmap_address);
289 static void __write_ptbase(unsigned long mfn)
290 {
291 l3_pgentry_t *highmem_l3tab, *lowmem_l3tab;
292 struct pae_l3_cache *cache = &current->arch.pae_l3_cache;
293 unsigned int cpu = smp_processor_id();
295 /* Fast path 1: does this mfn need a shadow at all? */
296 if ( !l3tab_needs_shadow(mfn) )
297 {
298 write_cr3(mfn << PAGE_SHIFT);
299 return;
300 }
302 /* Caching logic is not interrupt safe. */
303 ASSERT(!in_irq());
305 /* Fast path 2: is this mfn already cached? */
306 if ( cache->high_mfn == mfn )
307 {
308 write_cr3(__pa(cache->table[cache->inuse_idx]));
309 return;
310 }
312 /* Protects against pae_flush_pgd(). */
313 spin_lock(&cache->lock);
315 cache->inuse_idx ^= 1;
316 cache->high_mfn = mfn;
318 /* Map the guest L3 table and copy to the chosen low-memory cache. */
319 *(fix_pae_highmem_pl1e - cpu) = l1e_from_pfn(mfn, __PAGE_HYPERVISOR);
320 highmem_l3tab = (l3_pgentry_t *)fix_to_virt(FIX_PAE_HIGHMEM_0 + cpu);
321 lowmem_l3tab = cache->table[cache->inuse_idx];
322 memcpy(lowmem_l3tab, highmem_l3tab, sizeof(cache->table[0]));
323 *(fix_pae_highmem_pl1e - cpu) = l1e_empty();
325 /* Install the low-memory L3 table in CR3. */
326 write_cr3(__pa(lowmem_l3tab));
328 spin_unlock(&cache->lock);
329 }
331 #else /* !CONFIG_X86_PAE */
333 static void __write_ptbase(unsigned long mfn)
334 {
335 write_cr3(mfn << PAGE_SHIFT);
336 }
338 #endif /* !CONFIG_X86_PAE */
340 void write_ptbase(struct vcpu *v)
341 {
342 __write_ptbase(pagetable_get_pfn(v->arch.monitor_table));
343 }
345 void invalidate_shadow_ldt(struct vcpu *v)
346 {
347 int i;
348 unsigned long pfn;
349 struct page_info *page;
351 if ( v->arch.shadow_ldt_mapcnt == 0 )
352 return;
354 v->arch.shadow_ldt_mapcnt = 0;
356 for ( i = 16; i < 32; i++ )
357 {
358 pfn = l1e_get_pfn(v->arch.perdomain_ptes[i]);
359 if ( pfn == 0 ) continue;
360 v->arch.perdomain_ptes[i] = l1e_empty();
361 page = mfn_to_page(pfn);
362 ASSERT_PAGE_IS_TYPE(page, PGT_ldt_page);
363 ASSERT_PAGE_IS_DOMAIN(page, v->domain);
364 put_page_and_type(page);
365 }
367 /* Dispose of the (now possibly invalid) mappings from the TLB. */
368 percpu_info[v->processor].deferred_ops |= DOP_FLUSH_TLB | DOP_RELOAD_LDT;
369 }
372 static int alloc_segdesc_page(struct page_info *page)
373 {
374 struct desc_struct *descs;
375 int i;
377 descs = map_domain_page(page_to_mfn(page));
379 for ( i = 0; i < 512; i++ )
380 if ( unlikely(!check_descriptor(&descs[i])) )
381 goto fail;
383 unmap_domain_page(descs);
384 return 1;
386 fail:
387 unmap_domain_page(descs);
388 return 0;
389 }
392 /* Map shadow page at offset @off. */
393 int map_ldt_shadow_page(unsigned int off)
394 {
395 struct vcpu *v = current;
396 struct domain *d = v->domain;
397 unsigned long gmfn, mfn;
398 l1_pgentry_t l1e, nl1e;
399 unsigned long gva = v->arch.guest_context.ldt_base + (off << PAGE_SHIFT);
400 int res;
402 #if defined(__x86_64__)
403 /* If in user mode, switch to kernel mode just to read LDT mapping. */
404 int user_mode = !(v->arch.flags & TF_kernel_mode);
405 #define TOGGLE_MODE() if ( user_mode ) toggle_guest_mode(v)
406 #elif defined(__i386__)
407 #define TOGGLE_MODE() ((void)0)
408 #endif
410 BUG_ON(unlikely(in_irq()));
412 shadow_sync_va(v, gva);
414 TOGGLE_MODE();
415 __copy_from_user(&l1e, &linear_pg_table[l1_linear_offset(gva)],
416 sizeof(l1e));
417 TOGGLE_MODE();
419 if ( unlikely(!(l1e_get_flags(l1e) & _PAGE_PRESENT)) )
420 return 0;
422 gmfn = l1e_get_pfn(l1e);
423 mfn = gmfn_to_mfn(d, gmfn);
424 if ( unlikely(!VALID_MFN(mfn)) )
425 return 0;
427 res = get_page_and_type(mfn_to_page(mfn), d, PGT_ldt_page);
429 if ( !res && unlikely(shadow_mode_refcounts(d)) )
430 {
431 shadow_lock(d);
432 shadow_remove_all_write_access(d, gmfn, mfn);
433 res = get_page_and_type(mfn_to_page(mfn), d, PGT_ldt_page);
434 shadow_unlock(d);
435 }
437 if ( unlikely(!res) )
438 return 0;
440 nl1e = l1e_from_pfn(mfn, l1e_get_flags(l1e) | _PAGE_RW);
442 v->arch.perdomain_ptes[off + 16] = nl1e;
443 v->arch.shadow_ldt_mapcnt++;
445 return 1;
446 }
449 static int get_page_from_pagenr(unsigned long page_nr, struct domain *d)
450 {
451 struct page_info *page = mfn_to_page(page_nr);
453 if ( unlikely(!mfn_valid(page_nr)) || unlikely(!get_page(page, d)) )
454 {
455 MEM_LOG("Could not get page ref for pfn %lx", page_nr);
456 return 0;
457 }
459 return 1;
460 }
463 static int get_page_and_type_from_pagenr(unsigned long page_nr,
464 unsigned long type,
465 struct domain *d)
466 {
467 struct page_info *page = mfn_to_page(page_nr);
469 if ( unlikely(!get_page_from_pagenr(page_nr, d)) )
470 return 0;
472 if ( unlikely(!get_page_type(page, type)) )
473 {
474 put_page(page);
475 return 0;
476 }
478 return 1;
479 }
481 #ifndef CONFIG_X86_PAE /* We do not support guest linear mappings on PAE. */
482 /*
483 * We allow root tables to map each other (a.k.a. linear page tables). It
484 * needs some special care with reference counts and access permissions:
485 * 1. The mapping entry must be read-only, or the guest may get write access
486 * to its own PTEs.
487 * 2. We must only bump the reference counts for an *already validated*
488 * L2 table, or we can end up in a deadlock in get_page_type() by waiting
489 * for a validation that cannot complete until our own validation finishes.
490 * 3. We only need to increment the reference counts for the mapped page
491 * frame if it is mapped by a different root table. This is sufficient and
492 * also necessary to allow validation of a root table mapping itself.
493 */
494 static int
495 get_linear_pagetable(
496 root_pgentry_t re, unsigned long re_pfn, struct domain *d)
497 {
498 unsigned long x, y;
499 struct page_info *page;
500 unsigned long pfn;
502 ASSERT( !shadow_mode_refcounts(d) );
504 if ( (root_get_flags(re) & _PAGE_RW) )
505 {
506 MEM_LOG("Attempt to create linear p.t. with write perms");
507 return 0;
508 }
510 if ( (pfn = root_get_pfn(re)) != re_pfn )
511 {
512 /* Make sure the mapped frame belongs to the correct domain. */
513 if ( unlikely(!get_page_from_pagenr(pfn, d)) )
514 return 0;
516 /*
517 * Make sure that the mapped frame is an already-validated L2 table.
518 * If so, atomically increment the count (checking for overflow).
519 */
520 page = mfn_to_page(pfn);
521 y = page->u.inuse.type_info;
522 do {
523 x = y;
524 if ( unlikely((x & PGT_count_mask) == PGT_count_mask) ||
525 unlikely((x & (PGT_type_mask|PGT_validated)) !=
526 (PGT_root_page_table|PGT_validated)) )
527 {
528 put_page(page);
529 return 0;
530 }
531 }
532 while ( (y = cmpxchg(&page->u.inuse.type_info, x, x + 1)) != x );
533 }
535 return 1;
536 }
537 #endif /* !CONFIG_X86_PAE */
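/*
 * A guest-side sketch of rule 1 above, under the same assumptions as the
 * earlier sketch (hypothetical HYPERVISOR_mmu_update binding): a linear
 * mapping is created by pointing a root-table slot back at the root table
 * itself, and it must be installed without _PAGE_RW or
 * get_linear_pagetable() will refuse it.
 */
#if 0 /* illustrative only -- not part of the hypervisor build */
static int guest_install_linear_mapping(uint64_t root_maddr,
                                        uint64_t slot_maddr)
{
    struct mmu_update req;
    int done;

    /* slot_maddr is the machine address of the root-table slot to rewrite. */
    req.ptr = slot_maddr | MMU_NORMAL_PT_UPDATE;

    /* The new entry points back at the root table: present, but read-only. */
    req.val = root_maddr | _PAGE_PRESENT;

    return HYPERVISOR_mmu_update(&req, 1, &done, DOMID_SELF);
}
#endif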
539 int
540 get_page_from_l1e(
541 l1_pgentry_t l1e, struct domain *d)
542 {
543 unsigned long mfn = l1e_get_pfn(l1e);
544 struct page_info *page = mfn_to_page(mfn);
545 int okay;
547 if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) )
548 return 1;
550 if ( unlikely(l1e_get_flags(l1e) & L1_DISALLOW_MASK) )
551 {
552 MEM_LOG("Bad L1 flags %x", l1e_get_flags(l1e) & L1_DISALLOW_MASK);
553 return 0;
554 }
556 if ( unlikely(!mfn_valid(mfn)) ||
557 unlikely(page_get_owner(page) == dom_io) )
558 {
559 /* DOMID_IO reverts to caller for privilege checks. */
560 if ( d == dom_io )
561 d = current->domain;
563 if ( !iomem_access_permitted(d, mfn, mfn) )
564 {
565 MEM_LOG("Non-privileged attempt to map I/O space %08lx", mfn);
566 return 0;
567 }
569 /* No reference counting for out-of-range I/O pages. */
570 if ( !mfn_valid(mfn) )
571 return 1;
573 d = dom_io;
574 }
576 okay = ((l1e_get_flags(l1e) & _PAGE_RW) ?
577 get_page_and_type(page, d, PGT_writable_page) :
578 get_page(page, d));
579 if ( !okay )
580 {
581 MEM_LOG("Error getting mfn %lx (pfn %lx) from L1 entry %" PRIpte
582 " for dom%d",
583 mfn, get_gpfn_from_mfn(mfn),
584 l1e_get_intpte(l1e), d->domain_id);
585 }
587 return okay;
588 }
591 /* NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. */
592 static int
593 get_page_from_l2e(
594 l2_pgentry_t l2e, unsigned long pfn,
595 struct domain *d, unsigned long vaddr)
596 {
597 int rc;
599 ASSERT(!shadow_mode_refcounts(d));
601 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
602 return 1;
604 if ( unlikely((l2e_get_flags(l2e) & L2_DISALLOW_MASK)) )
605 {
606 MEM_LOG("Bad L2 flags %x", l2e_get_flags(l2e) & L2_DISALLOW_MASK);
607 return 0;
608 }
610 vaddr >>= L2_PAGETABLE_SHIFT;
611 vaddr <<= PGT_va_shift;
612 rc = get_page_and_type_from_pagenr(
613 l2e_get_pfn(l2e), PGT_l1_page_table | vaddr, d);
614 #if CONFIG_PAGING_LEVELS == 2
615 if ( unlikely(!rc) )
616 rc = get_linear_pagetable(l2e, pfn, d);
617 #endif
618 return rc;
619 }
622 #if CONFIG_PAGING_LEVELS >= 3
623 static int
624 get_page_from_l3e(
625 l3_pgentry_t l3e, unsigned long pfn,
626 struct domain *d, unsigned long vaddr)
627 {
628 int rc;
630 ASSERT(!shadow_mode_refcounts(d));
632 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
633 return 1;
635 if ( unlikely((l3e_get_flags(l3e) & L3_DISALLOW_MASK)) )
636 {
637 MEM_LOG("Bad L3 flags %x", l3e_get_flags(l3e) & L3_DISALLOW_MASK);
638 return 0;
639 }
641 vaddr >>= L3_PAGETABLE_SHIFT;
642 vaddr <<= PGT_va_shift;
643 rc = get_page_and_type_from_pagenr(
644 l3e_get_pfn(l3e),
645 PGT_l2_page_table | vaddr, d);
646 return rc;
647 }
648 #endif /* 3 level */
650 #if CONFIG_PAGING_LEVELS >= 4
651 static int
652 get_page_from_l4e(
653 l4_pgentry_t l4e, unsigned long pfn,
654 struct domain *d, unsigned long vaddr)
655 {
656 int rc;
658 ASSERT( !shadow_mode_refcounts(d) );
660 if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
661 return 1;
663 if ( unlikely((l4e_get_flags(l4e) & L4_DISALLOW_MASK)) )
664 {
665 MEM_LOG("Bad L4 flags %x", l4e_get_flags(l4e) & L4_DISALLOW_MASK);
666 return 0;
667 }
669 vaddr >>= L4_PAGETABLE_SHIFT;
670 vaddr <<= PGT_va_shift;
671 rc = get_page_and_type_from_pagenr(
672 l4e_get_pfn(l4e),
673 PGT_l3_page_table | vaddr, d);
675 if ( unlikely(!rc) )
676 rc = get_linear_pagetable(l4e, pfn, d);
678 return rc;
679 }
680 #endif /* 4 level */
683 void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d)
684 {
685 unsigned long pfn = l1e_get_pfn(l1e);
686 struct page_info *page = mfn_to_page(pfn);
687 struct domain *e;
688 struct vcpu *v;
690 if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) || !mfn_valid(pfn) )
691 return;
693 e = page_get_owner(page);
695 /*
696 * Check if this is a mapping that was established via a grant reference.
697 * If it was then we should not be here: we require that such mappings are
698 * explicitly destroyed via the grant-table interface.
699 *
700 * The upshot of this is that the guest can end up with active grants that
701 * it cannot destroy (because it no longer has a PTE to present to the
702 * grant-table interface). This can lead to subtle hard-to-catch bugs,
703 * hence a special grant PTE flag can be enabled to catch the bug early.
704 *
705 * (Note that the undestroyable active grants are not a security hole in
706 * Xen. All active grants can safely be cleaned up when the domain dies.)
707 */
708 if ( (l1e_get_flags(l1e) & _PAGE_GNTTAB) &&
709 !(d->domain_flags & (DOMF_shutdown|DOMF_dying)) )
710 {
711 MEM_LOG("Attempt to implicitly unmap a granted PTE %" PRIpte,
712 l1e_get_intpte(l1e));
713 domain_crash(d);
714 }
716 if ( l1e_get_flags(l1e) & _PAGE_RW )
717 {
718 put_page_and_type(page);
719 }
720 else
721 {
722 /* We expect this is rare so we blow the entire shadow LDT. */
723 if ( unlikely(((page->u.inuse.type_info & PGT_type_mask) ==
724 PGT_ldt_page)) &&
725 unlikely(((page->u.inuse.type_info & PGT_count_mask) != 0)) &&
726 (d == e) )
727 {
728 for_each_vcpu ( d, v )
729 invalidate_shadow_ldt(v);
730 }
731 put_page(page);
732 }
733 }
736 /*
737 * NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'.
738 * Note also that this automatically deals correctly with linear p.t.'s.
739 */
740 static void put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
741 {
742 if ( (l2e_get_flags(l2e) & _PAGE_PRESENT) &&
743 (l2e_get_pfn(l2e) != pfn) )
744 put_page_and_type(mfn_to_page(l2e_get_pfn(l2e)));
745 }
748 #if CONFIG_PAGING_LEVELS >= 3
749 static void put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn)
750 {
751 if ( (l3e_get_flags(l3e) & _PAGE_PRESENT) &&
752 (l3e_get_pfn(l3e) != pfn) )
753 put_page_and_type(mfn_to_page(l3e_get_pfn(l3e)));
754 }
755 #endif
757 #if CONFIG_PAGING_LEVELS >= 4
758 static void put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn)
759 {
760 if ( (l4e_get_flags(l4e) & _PAGE_PRESENT) &&
761 (l4e_get_pfn(l4e) != pfn) )
762 put_page_and_type(mfn_to_page(l4e_get_pfn(l4e)));
763 }
764 #endif
766 static int alloc_l1_table(struct page_info *page)
767 {
768 struct domain *d = page_get_owner(page);
769 unsigned long pfn = page_to_mfn(page);
770 l1_pgentry_t *pl1e;
771 int i;
773 ASSERT(!shadow_mode_refcounts(d));
775 pl1e = map_domain_page(pfn);
777 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
778 if ( is_guest_l1_slot(i) &&
779 unlikely(!get_page_from_l1e(pl1e[i], d)) )
780 goto fail;
782 unmap_domain_page(pl1e);
783 return 1;
785 fail:
786 MEM_LOG("Failure in alloc_l1_table: entry %d", i);
787 while ( i-- > 0 )
788 if ( is_guest_l1_slot(i) )
789 put_page_from_l1e(pl1e[i], d);
791 unmap_domain_page(pl1e);
792 return 0;
793 }
795 #ifdef CONFIG_X86_PAE
796 static int create_pae_xen_mappings(l3_pgentry_t *pl3e)
797 {
798 struct page_info *page;
799 l2_pgentry_t *pl2e;
800 l3_pgentry_t l3e3;
801 int i;
803 pl3e = (l3_pgentry_t *)((unsigned long)pl3e & PAGE_MASK);
805 /* 3rd L3 slot contains L2 with Xen-private mappings. It *must* exist. */
806 l3e3 = pl3e[3];
807 if ( !(l3e_get_flags(l3e3) & _PAGE_PRESENT) )
808 {
809 MEM_LOG("PAE L3 3rd slot is empty");
810 return 0;
811 }
813 /*
814 * The Xen-private mappings include linear mappings. The L2 thus cannot
815 * be shared by multiple L3 tables. The test here is adequate because:
816 * 1. Cannot appear in slots != 3 because the page would then have an
817 * unknown va backpointer, which get_page_type() explicitly disallows.
818 * 2. Cannot appear in another page table's L3:
819 * a. alloc_l3_table() calls this function and this check will fail
820 * b. mod_l3_entry() disallows updates to slot 3 in an existing table
821 */
822 page = l3e_get_page(l3e3);
823 BUG_ON(page->u.inuse.type_info & PGT_pinned);
824 BUG_ON((page->u.inuse.type_info & PGT_count_mask) == 0);
825 if ( (page->u.inuse.type_info & PGT_count_mask) != 1 )
826 {
827 MEM_LOG("PAE L3 3rd slot is shared");
828 return 0;
829 }
831 /* Xen private mappings. */
832 pl2e = map_domain_page(l3e_get_pfn(l3e3));
833 memcpy(&pl2e[L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1)],
834 &idle_pg_table_l2[L2_PAGETABLE_FIRST_XEN_SLOT],
835 L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
836 for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
837 pl2e[l2_table_offset(PERDOMAIN_VIRT_START) + i] =
838 l2e_from_page(
839 virt_to_page(page_get_owner(page)->arch.mm_perdomain_pt) + i,
840 __PAGE_HYPERVISOR);
841 for ( i = 0; i < (LINEARPT_MBYTES >> (L2_PAGETABLE_SHIFT - 20)); i++ )
842 pl2e[l2_table_offset(LINEAR_PT_VIRT_START) + i] =
843 (l3e_get_flags(pl3e[i]) & _PAGE_PRESENT) ?
844 l2e_from_pfn(l3e_get_pfn(pl3e[i]), __PAGE_HYPERVISOR) :
845 l2e_empty();
846 unmap_domain_page(pl2e);
848 return 1;
849 }
851 /* Flush a pgdir update into low-memory caches. */
852 static void pae_flush_pgd(
853 unsigned long mfn, unsigned int idx, l3_pgentry_t nl3e)
854 {
855 struct domain *d = page_get_owner(mfn_to_page(mfn));
856 struct vcpu *v;
857 intpte_t _ol3e, _nl3e, _pl3e;
858 l3_pgentry_t *l3tab_ptr;
859 struct pae_l3_cache *cache;
861 /* If below 4GB then the pgdir is not shadowed in low memory. */
862 if ( !l3tab_needs_shadow(mfn) )
863 return;
865 for_each_vcpu ( d, v )
866 {
867 cache = &v->arch.pae_l3_cache;
869 spin_lock(&cache->lock);
871 if ( cache->high_mfn == mfn )
872 {
873 l3tab_ptr = &cache->table[cache->inuse_idx][idx];
874 _ol3e = l3e_get_intpte(*l3tab_ptr);
875 _nl3e = l3e_get_intpte(nl3e);
876 _pl3e = cmpxchg((intpte_t *)l3tab_ptr, _ol3e, _nl3e);
877 BUG_ON(_pl3e != _ol3e);
878 }
880 spin_unlock(&cache->lock);
881 }
883 flush_tlb_mask(d->domain_dirty_cpumask);
884 }
886 static inline int l1_backptr(
887 unsigned long *backptr, unsigned long offset_in_l2, unsigned long l2_type)
888 {
889 unsigned long l2_backptr = l2_type & PGT_va_mask;
890 ASSERT(l2_backptr != PGT_va_unknown);
891 ASSERT(l2_backptr != PGT_va_mutable);
892 *backptr =
893 ((l2_backptr >> PGT_va_shift) << L3_PAGETABLE_SHIFT) |
894 (offset_in_l2 << L2_PAGETABLE_SHIFT);
895 return 1;
896 }
898 #elif CONFIG_X86_64
899 # define create_pae_xen_mappings(pl3e) (1)
900 # define pae_flush_pgd(mfn, idx, nl3e) ((void)0)
902 static inline int l1_backptr(
903 unsigned long *backptr, unsigned long offset_in_l2, unsigned long l2_type)
904 {
905 unsigned long l2_backptr = l2_type & PGT_va_mask;
906 ASSERT(l2_backptr != PGT_va_unknown);
907 ASSERT(l2_backptr != PGT_va_mutable);
908 *backptr = ((l2_backptr >> PGT_va_shift) << L3_PAGETABLE_SHIFT) |
909 (offset_in_l2 << L2_PAGETABLE_SHIFT);
910 return 1;
911 }
913 static inline int l2_backptr(
914 unsigned long *backptr, unsigned long offset_in_l3, unsigned long l3_type)
915 {
916 unsigned long l3_backptr = l3_type & PGT_va_mask;
917 ASSERT(l3_backptr != PGT_va_unknown);
918 ASSERT(l3_backptr != PGT_va_mutable);
919 *backptr = ((l3_backptr >> PGT_va_shift) << L4_PAGETABLE_SHIFT) |
920 (offset_in_l3 << L3_PAGETABLE_SHIFT);
921 return 1;
922 }
924 static inline int l3_backptr(
925 unsigned long *backptr, unsigned long offset_in_l4, unsigned long l4_type)
926 {
927 *backptr = (offset_in_l4 << L4_PAGETABLE_SHIFT);
928 return 1;
929 }
930 #else
931 # define create_pae_xen_mappings(pl3e) (1)
932 # define l1_backptr(bp,l2o,l2t) \
933 ({ *(bp) = (unsigned long)(l2o) << L2_PAGETABLE_SHIFT; 1; })
934 #endif
936 static int alloc_l2_table(struct page_info *page, unsigned long type)
937 {
938 struct domain *d = page_get_owner(page);
939 unsigned long pfn = page_to_mfn(page);
940 unsigned long vaddr;
941 l2_pgentry_t *pl2e;
942 int i;
944 /* See the code in shadow_promote() to understand why this is here. */
945 if ( (PGT_base_page_table == PGT_l2_page_table) &&
946 unlikely(shadow_mode_refcounts(d)) )
947 return 1;
948 ASSERT(!shadow_mode_refcounts(d));
950 pl2e = map_domain_page(pfn);
952 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
953 {
954 if ( !l1_backptr(&vaddr, i, type) )
955 goto fail;
956 if ( is_guest_l2_slot(type, i) &&
957 unlikely(!get_page_from_l2e(pl2e[i], pfn, d, vaddr)) )
958 goto fail;
959 }
961 #if CONFIG_PAGING_LEVELS == 2
962 /* Xen private mappings. */
963 memcpy(&pl2e[L2_PAGETABLE_FIRST_XEN_SLOT],
964 &idle_pg_table[L2_PAGETABLE_FIRST_XEN_SLOT],
965 L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
966 pl2e[l2_table_offset(LINEAR_PT_VIRT_START)] =
967 l2e_from_pfn(pfn, __PAGE_HYPERVISOR);
968 for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
969 pl2e[l2_table_offset(PERDOMAIN_VIRT_START) + i] =
970 l2e_from_page(
971 virt_to_page(page_get_owner(page)->arch.mm_perdomain_pt) + i,
972 __PAGE_HYPERVISOR);
973 #endif
975 unmap_domain_page(pl2e);
976 return 1;
978 fail:
979 MEM_LOG("Failure in alloc_l2_table: entry %d", i);
980 while ( i-- > 0 )
981 if ( is_guest_l2_slot(type, i) )
982 put_page_from_l2e(pl2e[i], pfn);
984 unmap_domain_page(pl2e);
985 return 0;
986 }
989 #if CONFIG_PAGING_LEVELS >= 3
990 static int alloc_l3_table(struct page_info *page, unsigned long type)
991 {
992 struct domain *d = page_get_owner(page);
993 unsigned long pfn = page_to_mfn(page);
994 unsigned long vaddr;
995 l3_pgentry_t *pl3e;
996 int i;
998 ASSERT(!shadow_mode_refcounts(d));
1000 #ifdef CONFIG_X86_PAE
1001 /*
1002 * PAE pgdirs above 4GB are unacceptable if the guest does not understand
1003 * the weird 'extended cr3' format for dealing with high-order address
1004 * bits. We cut some slack for control tools (before vcpu0 is initialised).
1005 */
1006 if ( (pfn >= 0x100000) &&
1007 unlikely(!VM_ASSIST(d, VMASST_TYPE_pae_extended_cr3)) &&
1008 d->vcpu[0] && test_bit(_VCPUF_initialised, &d->vcpu[0]->vcpu_flags) )
1010 MEM_LOG("PAE pgd must be below 4GB (0x%lx >= 0x100000)", pfn);
1011 return 0;
1013 #endif
1015 pl3e = map_domain_page(pfn);
1016 for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
1018 #if CONFIG_PAGING_LEVELS >= 4
1019 if ( !l2_backptr(&vaddr, i, type) )
1020 goto fail;
1021 #else
1022 vaddr = (unsigned long)i << L3_PAGETABLE_SHIFT;
1023 #endif
1024 if ( is_guest_l3_slot(i) &&
1025 unlikely(!get_page_from_l3e(pl3e[i], pfn, d, vaddr)) )
1026 goto fail;
1029 if ( !create_pae_xen_mappings(pl3e) )
1030 goto fail;
1032 unmap_domain_page(pl3e);
1033 return 1;
1035 fail:
1036 MEM_LOG("Failure in alloc_l3_table: entry %d", i);
1037 while ( i-- > 0 )
1038 if ( is_guest_l3_slot(i) )
1039 put_page_from_l3e(pl3e[i], pfn);
1041 unmap_domain_page(pl3e);
1042 return 0;
1044 #else
1045 #define alloc_l3_table(page, type) (0)
1046 #endif
1048 #if CONFIG_PAGING_LEVELS >= 4
1049 static int alloc_l4_table(struct page_info *page, unsigned long type)
1051 struct domain *d = page_get_owner(page);
1052 unsigned long pfn = page_to_mfn(page);
1053 l4_pgentry_t *pl4e = page_to_virt(page);
1054 unsigned long vaddr;
1055 int i;
1057 /* See the code in shadow_promote() to understand why this is here. */
1058 if ( (PGT_base_page_table == PGT_l4_page_table) &&
1059 shadow_mode_refcounts(d) )
1060 return 1;
1061 ASSERT(!shadow_mode_refcounts(d));
1063 for ( i = 0; i < L4_PAGETABLE_ENTRIES; i++ )
1065 if ( !l3_backptr(&vaddr, i, type) )
1066 goto fail;
1068 if ( is_guest_l4_slot(i) &&
1069 unlikely(!get_page_from_l4e(pl4e[i], pfn, d, vaddr)) )
1070 goto fail;
1073 /* Xen private mappings. */
1074 memcpy(&pl4e[ROOT_PAGETABLE_FIRST_XEN_SLOT],
1075 &idle_pg_table[ROOT_PAGETABLE_FIRST_XEN_SLOT],
1076 ROOT_PAGETABLE_XEN_SLOTS * sizeof(l4_pgentry_t));
1077 pl4e[l4_table_offset(LINEAR_PT_VIRT_START)] =
1078 l4e_from_pfn(pfn, __PAGE_HYPERVISOR);
1079 pl4e[l4_table_offset(PERDOMAIN_VIRT_START)] =
1080 l4e_from_page(
1081 virt_to_page(page_get_owner(page)->arch.mm_perdomain_l3),
1082 __PAGE_HYPERVISOR);
1084 return 1;
1086 fail:
1087 MEM_LOG("Failure in alloc_l4_table: entry %d", i);
1088 while ( i-- > 0 )
1089 if ( is_guest_l4_slot(i) )
1090 put_page_from_l4e(pl4e[i], pfn);
1092 return 0;
1094 #else
1095 #define alloc_l4_table(page, type) (0)
1096 #endif
1099 static void free_l1_table(struct page_info *page)
1101 struct domain *d = page_get_owner(page);
1102 unsigned long pfn = page_to_mfn(page);
1103 l1_pgentry_t *pl1e;
1104 int i;
1106 pl1e = map_domain_page(pfn);
1108 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
1109 if ( is_guest_l1_slot(i) )
1110 put_page_from_l1e(pl1e[i], d);
1112 unmap_domain_page(pl1e);
1116 static void free_l2_table(struct page_info *page)
1118 unsigned long pfn = page_to_mfn(page);
1119 l2_pgentry_t *pl2e;
1120 int i;
1122 pl2e = map_domain_page(pfn);
1124 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
1125 if ( is_guest_l2_slot(page->u.inuse.type_info, i) )
1126 put_page_from_l2e(pl2e[i], pfn);
1128 unmap_domain_page(pl2e);
1132 #if CONFIG_PAGING_LEVELS >= 3
1134 static void free_l3_table(struct page_info *page)
1136 unsigned long pfn = page_to_mfn(page);
1137 l3_pgentry_t *pl3e;
1138 int i;
1140 pl3e = map_domain_page(pfn);
1142 for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
1143 if ( is_guest_l3_slot(i) )
1144 put_page_from_l3e(pl3e[i], pfn);
1146 unmap_domain_page(pl3e);
1149 #endif
1151 #if CONFIG_PAGING_LEVELS >= 4
1153 static void free_l4_table(struct page_info *page)
1155 unsigned long pfn = page_to_mfn(page);
1156 l4_pgentry_t *pl4e = page_to_virt(page);
1157 int i;
1159 for ( i = 0; i < L4_PAGETABLE_ENTRIES; i++ )
1160 if ( is_guest_l4_slot(i) )
1161 put_page_from_l4e(pl4e[i], pfn);
1164 #endif
1166 static inline int update_l1e(l1_pgentry_t *pl1e,
1167 l1_pgentry_t ol1e,
1168 l1_pgentry_t nl1e)
1170 #ifndef PTE_UPDATE_WITH_CMPXCHG
1171 return !__copy_to_user(pl1e, &nl1e, sizeof(nl1e));
1172 #else
1173 intpte_t o = l1e_get_intpte(ol1e);
1174 intpte_t n = l1e_get_intpte(nl1e);
1176 if ( unlikely(cmpxchg_user(pl1e, o, n) != 0) ||
1177 unlikely(o != l1e_get_intpte(ol1e)) )
1179 MEM_LOG("Failed to update %" PRIpte " -> %" PRIpte
1180 ": saw %" PRIpte,
1181 l1e_get_intpte(ol1e),
1182 l1e_get_intpte(nl1e),
1183 o);
1184 return 0;
1186 return 1;
1187 #endif
1191 /* Update the L1 entry at pl1e to new value nl1e. */
1192 static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e)
1194 l1_pgentry_t ol1e;
1195 struct domain *d = current->domain;
1197 if ( unlikely(__copy_from_user(&ol1e, pl1e, sizeof(ol1e)) != 0) )
1198 return 0;
1200 if ( unlikely(shadow_mode_refcounts(d)) )
1201 return update_l1e(pl1e, ol1e, nl1e);
1203 if ( l1e_get_flags(nl1e) & _PAGE_PRESENT )
1205 if ( unlikely(l1e_get_flags(nl1e) & L1_DISALLOW_MASK) )
1207 MEM_LOG("Bad L1 flags %x",
1208 l1e_get_flags(nl1e) & L1_DISALLOW_MASK);
1209 return 0;
1212 /* Fast path for identical mapping, r/w and presence. */
1213 if ( !l1e_has_changed(ol1e, nl1e, _PAGE_RW | _PAGE_PRESENT))
1214 return update_l1e(pl1e, ol1e, nl1e);
1216 if ( unlikely(!get_page_from_l1e(nl1e, FOREIGNDOM)) )
1217 return 0;
1219 if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
1221 put_page_from_l1e(nl1e, d);
1222 return 0;
1225 else
1227 if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
1228 return 0;
1231 put_page_from_l1e(ol1e, d);
1232 return 1;
1235 #ifndef PTE_UPDATE_WITH_CMPXCHG
1236 #define UPDATE_ENTRY(_t,_p,_o,_n) ({ (*(_p) = (_n)); 1; })
1237 #else
1238 #define UPDATE_ENTRY(_t,_p,_o,_n) ({ \
1239 intpte_t __o = cmpxchg((intpte_t *)(_p), \
1240 _t ## e_get_intpte(_o), \
1241 _t ## e_get_intpte(_n)); \
1242 if ( __o != _t ## e_get_intpte(_o) ) \
1243 MEM_LOG("Failed to update %" PRIpte " -> %" PRIpte \
1244 ": saw %" PRIpte "", \
1245 (_t ## e_get_intpte(_o)), \
1246 (_t ## e_get_intpte(_n)), \
1247 (__o)); \
1248 (__o == _t ## e_get_intpte(_o)); })
1249 #endif
1251 /* Update the L2 entry at pl2e to new value nl2e. pl2e is within frame pfn. */
1252 static int mod_l2_entry(l2_pgentry_t *pl2e,
1253 l2_pgentry_t nl2e,
1254 unsigned long pfn,
1255 unsigned long type)
1257 l2_pgentry_t ol2e;
1258 unsigned long vaddr = 0;
1260 if ( unlikely(!is_guest_l2_slot(type,pgentry_ptr_to_slot(pl2e))) )
1262 MEM_LOG("Illegal L2 update attempt in Xen-private area %p", pl2e);
1263 return 0;
1266 if ( unlikely(__copy_from_user(&ol2e, pl2e, sizeof(ol2e)) != 0) )
1267 return 0;
1269 if ( l2e_get_flags(nl2e) & _PAGE_PRESENT )
1271 if ( unlikely(l2e_get_flags(nl2e) & L2_DISALLOW_MASK) )
1273 MEM_LOG("Bad L2 flags %x",
1274 l2e_get_flags(nl2e) & L2_DISALLOW_MASK);
1275 return 0;
1278 /* Fast path for identical mapping and presence. */
1279 if ( !l2e_has_changed(ol2e, nl2e, _PAGE_PRESENT))
1280 return UPDATE_ENTRY(l2, pl2e, ol2e, nl2e);
1282 if ( unlikely(!l1_backptr(&vaddr, pgentry_ptr_to_slot(pl2e), type)) ||
1283 unlikely(!get_page_from_l2e(nl2e, pfn, current->domain, vaddr)) )
1284 return 0;
1286 if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e)) )
1288 put_page_from_l2e(nl2e, pfn);
1289 return 0;
1292 else if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e)) )
1294 return 0;
1297 put_page_from_l2e(ol2e, pfn);
1298 return 1;
1302 #if CONFIG_PAGING_LEVELS >= 3
1304 /* Update the L3 entry at pl3e to new value nl3e. pl3e is within frame pfn. */
1305 static int mod_l3_entry(l3_pgentry_t *pl3e,
1306 l3_pgentry_t nl3e,
1307 unsigned long pfn,
1308 unsigned long type)
1310 l3_pgentry_t ol3e;
1311 unsigned long vaddr;
1312 int okay;
1314 if ( unlikely(!is_guest_l3_slot(pgentry_ptr_to_slot(pl3e))) )
1316 MEM_LOG("Illegal L3 update attempt in Xen-private area %p", pl3e);
1317 return 0;
1320 #ifdef CONFIG_X86_PAE
1321 /*
1322 * Disallow updates to final L3 slot. It contains Xen mappings, and it
1323 * would be a pain to ensure they remain continuously valid throughout.
1324 */
1325 if ( pgentry_ptr_to_slot(pl3e) >= 3 )
1326 return 0;
1327 #endif
1329 if ( unlikely(__copy_from_user(&ol3e, pl3e, sizeof(ol3e)) != 0) )
1330 return 0;
1332 if ( l3e_get_flags(nl3e) & _PAGE_PRESENT )
1334 if ( unlikely(l3e_get_flags(nl3e) & L3_DISALLOW_MASK) )
1336 MEM_LOG("Bad L3 flags %x",
1337 l3e_get_flags(nl3e) & L3_DISALLOW_MASK);
1338 return 0;
1341 /* Fast path for identical mapping and presence. */
1342 if (!l3e_has_changed(ol3e, nl3e, _PAGE_PRESENT))
1343 return UPDATE_ENTRY(l3, pl3e, ol3e, nl3e);
1345 #if CONFIG_PAGING_LEVELS >= 4
1346 if ( unlikely(!l2_backptr(&vaddr, pgentry_ptr_to_slot(pl3e), type)) ||
1347 unlikely(!get_page_from_l3e(nl3e, pfn, current->domain, vaddr)) )
1348 return 0;
1349 #else
1350 vaddr = (((unsigned long)pl3e & ~PAGE_MASK) / sizeof(l3_pgentry_t))
1351 << L3_PAGETABLE_SHIFT;
1352 if ( unlikely(!get_page_from_l3e(nl3e, pfn, current->domain, vaddr)) )
1353 return 0;
1354 #endif
1356 if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e)) )
1358 put_page_from_l3e(nl3e, pfn);
1359 return 0;
1362 else if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e)) )
1364 return 0;
1367 okay = create_pae_xen_mappings(pl3e);
1368 BUG_ON(!okay);
1370 pae_flush_pgd(pfn, pgentry_ptr_to_slot(pl3e), nl3e);
1372 put_page_from_l3e(ol3e, pfn);
1373 return 1;
1376 #endif
1378 #if CONFIG_PAGING_LEVELS >= 4
1380 /* Update the L4 entry at pl4e to new value nl4e. pl4e is within frame pfn. */
1381 static int mod_l4_entry(l4_pgentry_t *pl4e,
1382 l4_pgentry_t nl4e,
1383 unsigned long pfn,
1384 unsigned long type)
1386 l4_pgentry_t ol4e;
1387 unsigned long vaddr;
1389 if ( unlikely(!is_guest_l4_slot(pgentry_ptr_to_slot(pl4e))) )
1391 MEM_LOG("Illegal L4 update attempt in Xen-private area %p", pl4e);
1392 return 0;
1395 if ( unlikely(__copy_from_user(&ol4e, pl4e, sizeof(ol4e)) != 0) )
1396 return 0;
1398 if ( l4e_get_flags(nl4e) & _PAGE_PRESENT )
1400 if ( unlikely(l4e_get_flags(nl4e) & L4_DISALLOW_MASK) )
1402 MEM_LOG("Bad L4 flags %x",
1403 l4e_get_flags(nl4e) & L4_DISALLOW_MASK);
1404 return 0;
1407 /* Fast path for identical mapping and presence. */
1408 if (!l4e_has_changed(ol4e, nl4e, _PAGE_PRESENT))
1409 return UPDATE_ENTRY(l4, pl4e, ol4e, nl4e);
1411 if ( unlikely(!l3_backptr(&vaddr, pgentry_ptr_to_slot(pl4e), type)) ||
1412 unlikely(!get_page_from_l4e(nl4e, pfn, current->domain, vaddr)) )
1413 return 0;
1415 if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e)) )
1417 put_page_from_l4e(nl4e, pfn);
1418 return 0;
1421 else if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e)) )
1423 return 0;
1426 put_page_from_l4e(ol4e, pfn);
1427 return 1;
1430 #endif
1432 int alloc_page_type(struct page_info *page, unsigned long type)
1434 struct domain *owner = page_get_owner(page);
1436 if ( owner != NULL )
1437 mark_dirty(owner, page_to_mfn(page));
1439 switch ( type & PGT_type_mask )
1441 case PGT_l1_page_table:
1442 return alloc_l1_table(page);
1443 case PGT_l2_page_table:
1444 return alloc_l2_table(page, type);
1445 case PGT_l3_page_table:
1446 return alloc_l3_table(page, type);
1447 case PGT_l4_page_table:
1448 return alloc_l4_table(page, type);
1449 case PGT_gdt_page:
1450 case PGT_ldt_page:
1451 return alloc_segdesc_page(page);
1452 default:
1453 printk("Bad type in alloc_page_type %lx t=%" PRtype_info " c=%x\n",
1454 type, page->u.inuse.type_info,
1455 page->count_info);
1456 BUG();
1459 return 0;
1463 void free_page_type(struct page_info *page, unsigned long type)
1465 struct domain *owner = page_get_owner(page);
1466 unsigned long gmfn;
1468 if ( likely(owner != NULL) )
1470 /*
1471 * We have to flush before the next use of the linear mapping
1472 * (e.g., update_va_mapping()) or we could end up modifying a page
1473 * that is no longer a page table (and hence screw up ref counts).
1474 */
1475 percpu_info[smp_processor_id()].deferred_ops |= DOP_FLUSH_ALL_TLBS;
1477 if ( unlikely(shadow_mode_enabled(owner)) )
1479 /* Raw page tables are rewritten during save/restore. */
1480 if ( !shadow_mode_translate(owner) )
1481 mark_dirty(owner, page_to_mfn(page));
1483 if ( shadow_mode_refcounts(owner) )
1484 return;
1486 gmfn = mfn_to_gmfn(owner, page_to_mfn(page));
1487 ASSERT(VALID_M2P(gmfn));
1488 remove_shadow(owner, gmfn, type & PGT_type_mask);
1492 switch ( type & PGT_type_mask )
1494 case PGT_l1_page_table:
1495 free_l1_table(page);
1496 break;
1498 case PGT_l2_page_table:
1499 free_l2_table(page);
1500 break;
1502 #if CONFIG_PAGING_LEVELS >= 3
1503 case PGT_l3_page_table:
1504 free_l3_table(page);
1505 break;
1506 #endif
1508 #if CONFIG_PAGING_LEVELS >= 4
1509 case PGT_l4_page_table:
1510 free_l4_table(page);
1511 break;
1512 #endif
1514 default:
1515 printk("%s: type %lx pfn %lx\n",__FUNCTION__,
1516 type, page_to_mfn(page));
1517 BUG();
1522 void put_page_type(struct page_info *page)
1524 unsigned long nx, x, y = page->u.inuse.type_info;
1526 again:
1527 do {
1528 x = y;
1529 nx = x - 1;
1531 ASSERT((x & PGT_count_mask) != 0);
1533 /*
1534 * The page should always be validated while a reference is held. The
1535 * exception is during domain destruction, when we forcibly invalidate
1536 * page-table pages if we detect a referential loop.
1537 * See domain.c:relinquish_list().
1538 */
1539 ASSERT((x & PGT_validated) ||
1540 test_bit(_DOMF_dying, &page_get_owner(page)->domain_flags));
1542 if ( unlikely((nx & PGT_count_mask) == 0) )
1544 /* Record TLB information for flush later. Races are harmless. */
1545 page->tlbflush_timestamp = tlbflush_current_time();
1547 if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
1548 likely(nx & PGT_validated) )
1550 /*
1551 * Page-table pages must be unvalidated when count is zero. The
1552 * 'free' is safe because the refcnt is non-zero and validated
1553 * bit is clear => other ops will spin or fail.
1554 */
1555 if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x,
1556 x & ~PGT_validated)) != x) )
1557 goto again;
1558 /* We cleared the 'valid bit' so we do the clean up. */
1559 free_page_type(page, x);
1560 /* Carry on, but with the 'valid bit' now clear. */
1561 x &= ~PGT_validated;
1562 nx &= ~PGT_validated;
1565 else if ( unlikely((nx & (PGT_pinned|PGT_type_mask|PGT_count_mask)) ==
1566 (PGT_pinned|PGT_l1_page_table|1)) )
1568 /* Page is now only pinned. Make the back pointer mutable again. */
1569 nx |= PGT_va_mutable;
1572 while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
1576 int get_page_type(struct page_info *page, unsigned long type)
1578 unsigned long nx, x, y = page->u.inuse.type_info;
1580 again:
1581 do {
1582 x = y;
1583 nx = x + 1;
1584 if ( unlikely((nx & PGT_count_mask) == 0) )
1586 MEM_LOG("Type count overflow on pfn %lx", page_to_mfn(page));
1587 return 0;
1589 else if ( unlikely((x & PGT_count_mask) == 0) )
1591 if ( (x & (PGT_type_mask|PGT_va_mask)) != type )
1593 if ( (x & PGT_type_mask) != (type & PGT_type_mask) )
1595 /*
1596 * On type change we check whether we need to flush stale TLB
1597 * entries. This may be unnecessary (e.g., page
1598 * was GDT/LDT) but those circumstances should be
1599 * very rare.
1600 */
1601 cpumask_t mask =
1602 page_get_owner(page)->domain_dirty_cpumask;
1603 tlbflush_filter(mask, page->tlbflush_timestamp);
1605 if ( unlikely(!cpus_empty(mask)) )
1607 perfc_incrc(need_flush_tlb_flush);
1608 flush_tlb_mask(mask);
1612 /* We lose existing type, back pointer, and validity. */
1613 nx &= ~(PGT_type_mask | PGT_va_mask | PGT_validated);
1614 nx |= type;
1616 /* No special validation needed for writable pages. */
1617 /* Page tables and GDT/LDT need to be scanned for validity. */
1618 if ( type == PGT_writable_page )
1619 nx |= PGT_validated;
1622 else
1624 if ( unlikely((x & (PGT_type_mask|PGT_va_mask)) != type) )
1626 if ( unlikely((x & PGT_type_mask) != (type & PGT_type_mask) ) )
1628 if ( (current->domain == page_get_owner(page)) &&
1629 ((x & PGT_type_mask) == PGT_writable_page) )
1631 /*
1632 * This ensures functions like set_gdt() see up-to-date
1633 * type info without needing to clean up writable p.t.
1634 * state on the fast path. We take this path only
1635 * when the current type is writable because:
1636 * 1. It's the only type that this path can decrement.
1637 * 2. If we take this path more liberally then we can
1638 * enter a recursive loop via get_page_from_l1e()
1639 * during pagetable revalidation.
1640 */
1641 LOCK_BIGLOCK(current->domain);
1642 cleanup_writable_pagetable(current->domain);
1643 y = page->u.inuse.type_info;
1644 UNLOCK_BIGLOCK(current->domain);
1645 /* Can we make progress now? */
1646 if ( ((y & PGT_type_mask) == (type & PGT_type_mask)) ||
1647 ((y & PGT_count_mask) == 0) )
1648 goto again;
1650 if ( ((x & PGT_type_mask) != PGT_l2_page_table) ||
1651 ((type & PGT_type_mask) != PGT_l1_page_table) )
1652 MEM_LOG("Bad type (saw %" PRtype_info
1653 " != exp %" PRtype_info ") "
1654 "for mfn %lx (pfn %lx)",
1655 x, type, page_to_mfn(page),
1656 get_gpfn_from_mfn(page_to_mfn(page)));
1657 return 0;
1659 else if ( (x & PGT_va_mask) == PGT_va_mutable )
1661 /* The va backpointer is mutable, hence we update it. */
1662 nx &= ~PGT_va_mask;
1663 nx |= type; /* we know the actual type is correct */
1665 else if ( (type & PGT_va_mask) != PGT_va_mutable )
1667 ASSERT((type & PGT_va_mask) != (x & PGT_va_mask));
1668 #ifdef CONFIG_X86_PAE
1669 /* We use backptr as extra typing. Cannot be unknown. */
1670 if ( (type & PGT_type_mask) == PGT_l2_page_table )
1671 return 0;
1672 #endif
1673 /* Fixme: add code to propagate va_unknown to subtables. */
1674 if ( ((type & PGT_type_mask) >= PGT_l2_page_table) &&
1675 !shadow_mode_refcounts(page_get_owner(page)) )
1676 return 0;
1677 /* This table is possibly mapped at multiple locations. */
1678 nx &= ~PGT_va_mask;
1679 nx |= PGT_va_unknown;
1682 if ( unlikely(!(x & PGT_validated)) )
1684 /* Someone else is updating validation of this page. Wait... */
1685 while ( (y = page->u.inuse.type_info) == x )
1686 cpu_relax();
1687 goto again;
1691 while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
1693 if ( unlikely(!(nx & PGT_validated)) )
1695 /* Try to validate page type; drop the new reference on failure. */
1696 if ( unlikely(!alloc_page_type(page, type)) )
1698 MEM_LOG("Error while validating mfn %lx (pfn %lx) for type %"
1699 PRtype_info ": caf=%08x taf=%" PRtype_info,
1700 page_to_mfn(page), get_gpfn_from_mfn(page_to_mfn(page)),
1701 type, page->count_info, page->u.inuse.type_info);
1702 /* No one else can get a reference. We hold the only ref. */
1703 page->u.inuse.type_info = 0;
1704 return 0;
1707 /* No one else is updating simultaneously. */
1708 __set_bit(_PGT_validated, &page->u.inuse.type_info);
1711 return 1;
1715 int new_guest_cr3(unsigned long mfn)
1717 struct vcpu *v = current;
1718 struct domain *d = v->domain;
1719 int okay;
1720 unsigned long old_base_mfn;
1722 ASSERT(writable_pagetable_in_sync(d));
1724 if ( shadow_mode_refcounts(d) )
1726 okay = get_page_from_pagenr(mfn, d);
1727 if ( unlikely(!okay) )
1729 MEM_LOG("Error while installing new baseptr %lx", mfn);
1730 return 0;
1733 else
1735 okay = get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d);
1736 if ( unlikely(!okay) )
1738 /* Switch to idle pagetable: this VCPU has no active p.t. now. */
1739 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
1740 v->arch.guest_table = pagetable_null();
1741 update_pagetables(v);
1742 write_cr3(__pa(idle_pg_table));
1743 if ( old_base_mfn != 0 )
1744 put_page_and_type(mfn_to_page(old_base_mfn));
1746 /* Retry the validation with no active p.t. for this VCPU. */
1747 okay = get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d);
1748 if ( !okay )
1750 /* Failure here is unrecoverable: the VCPU has no pagetable! */
1751 MEM_LOG("Fatal error while installing new baseptr %lx", mfn);
1752 domain_crash(d);
1753 percpu_info[v->processor].deferred_ops = 0;
1754 return 0;
1759 invalidate_shadow_ldt(v);
1761 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
1762 v->arch.guest_table = pagetable_from_pfn(mfn);
1763 update_pagetables(v); /* update shadow_table and monitor_table */
1765 write_ptbase(v);
1767 if ( likely(old_base_mfn != 0) )
1769 if ( shadow_mode_refcounts(d) )
1770 put_page(mfn_to_page(old_base_mfn));
1771 else
1772 put_page_and_type(mfn_to_page(old_base_mfn));
1775 /* CR3 also holds a ref to its shadow... */
1776 if ( shadow_mode_enabled(d) )
1778 if ( v->arch.monitor_shadow_ref )
1779 put_shadow_ref(v->arch.monitor_shadow_ref);
1780 v->arch.monitor_shadow_ref =
1781 pagetable_get_pfn(v->arch.monitor_table);
1782 ASSERT(!page_get_owner(mfn_to_page(v->arch.monitor_shadow_ref)));
1783 get_shadow_ref(v->arch.monitor_shadow_ref);
1786 return 1;
1789 static void process_deferred_ops(unsigned int cpu)
1791 unsigned int deferred_ops;
1792 struct domain *d = current->domain;
1794 deferred_ops = percpu_info[cpu].deferred_ops;
1795 percpu_info[cpu].deferred_ops = 0;
1797 if ( deferred_ops & (DOP_FLUSH_ALL_TLBS|DOP_FLUSH_TLB) )
1799 if ( shadow_mode_enabled(d) )
1800 shadow_sync_all(d);
1801 if ( deferred_ops & DOP_FLUSH_ALL_TLBS )
1802 flush_tlb_mask(d->domain_dirty_cpumask);
1803 else
1804 local_flush_tlb();
1807 if ( deferred_ops & DOP_RELOAD_LDT )
1808 (void)map_ldt_shadow_page(0);
1810 if ( unlikely(percpu_info[cpu].foreign != NULL) )
1812 put_domain(percpu_info[cpu].foreign);
1813 percpu_info[cpu].foreign = NULL;
1817 static int set_foreigndom(unsigned int cpu, domid_t domid)
1819 struct domain *e, *d = current->domain;
1820 int okay = 1;
1822 ASSERT(percpu_info[cpu].foreign == NULL);
1824 if ( likely(domid == DOMID_SELF) )
1825 goto out;
1827 if ( domid == d->domain_id )
1829 MEM_LOG("Dom %u tried to specify itself as foreign domain",
1830 d->domain_id);
1831 okay = 0;
1833 else if ( !IS_PRIV(d) )
1835 switch ( domid )
1837 case DOMID_IO:
1838 get_knownalive_domain(dom_io);
1839 percpu_info[cpu].foreign = dom_io;
1840 break;
1841 default:
1842 MEM_LOG("Dom %u cannot set foreign dom", d->domain_id);
1843 okay = 0;
1844 break;
1847 else
1849 percpu_info[cpu].foreign = e = find_domain_by_id(domid);
1850 if ( e == NULL )
1852 switch ( domid )
1854 case DOMID_XEN:
1855 get_knownalive_domain(dom_xen);
1856 percpu_info[cpu].foreign = dom_xen;
1857 break;
1858 case DOMID_IO:
1859 get_knownalive_domain(dom_io);
1860 percpu_info[cpu].foreign = dom_io;
1861 break;
1862 default:
1863 MEM_LOG("Unknown domain '%u'", domid);
1864 okay = 0;
1865 break;
1870 out:
1871 return okay;
1874 static inline cpumask_t vcpumask_to_pcpumask(
1875 struct domain *d, unsigned long vmask)
1877 unsigned int vcpu_id;
1878 cpumask_t pmask = CPU_MASK_NONE;
1879 struct vcpu *v;
1881 while ( vmask != 0 )
1883 vcpu_id = find_first_set_bit(vmask);
1884 vmask &= ~(1UL << vcpu_id);
1885 if ( (vcpu_id < MAX_VIRT_CPUS) &&
1886 ((v = d->vcpu[vcpu_id]) != NULL) )
1887 cpus_or(pmask, pmask, v->vcpu_dirty_cpumask);
1890 return pmask;
1893 int do_mmuext_op(
1894 XEN_GUEST_HANDLE(mmuext_op_t) uops,
1895 unsigned int count,
1896 XEN_GUEST_HANDLE(uint) pdone,
1897 unsigned int foreigndom)
1899 struct mmuext_op op;
1900 int rc = 0, i = 0, okay, cpu = smp_processor_id();
1901 unsigned long mfn, type;
1902 unsigned int done = 0;
1903 struct page_info *page;
1904 struct vcpu *v = current;
1905 struct domain *d = v->domain;
1907 LOCK_BIGLOCK(d);
1909 cleanup_writable_pagetable(d);
1911 if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
1913 count &= ~MMU_UPDATE_PREEMPTED;
1914 if ( unlikely(!guest_handle_is_null(pdone)) )
1915 (void)copy_from_guest(&done, pdone, 1);
1918 if ( !set_foreigndom(cpu, foreigndom) )
1920 rc = -ESRCH;
1921 goto out;
1924 if ( unlikely(!guest_handle_okay(uops, count)) )
1926 rc = -EFAULT;
1927 goto out;
1930 for ( i = 0; i < count; i++ )
1932 if ( hypercall_preempt_check() )
1934 rc = hypercall_create_continuation(
1935 __HYPERVISOR_mmuext_op, "hihi",
1936 uops, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
1937 break;
1940 if ( unlikely(__copy_from_guest(&op, uops, 1) != 0) )
1942 MEM_LOG("Bad __copy_from_guest");
1943 rc = -EFAULT;
1944 break;
1947 okay = 1;
1948 mfn = op.arg1.mfn;
1949 page = mfn_to_page(mfn);
1951 switch ( op.cmd )
1953 case MMUEXT_PIN_L1_TABLE:
1954 type = PGT_l1_page_table | PGT_va_mutable;
1955 goto pin_page;
1957 case MMUEXT_PIN_L2_TABLE:
1958 case MMUEXT_PIN_L3_TABLE:
1959 case MMUEXT_PIN_L4_TABLE:
1960 /* Ignore pinning of subdirectories. */
1961 if ( (op.cmd - MMUEXT_PIN_L1_TABLE) != (CONFIG_PAGING_LEVELS - 1) )
1962 break;
1964 type = PGT_root_page_table;
1966 pin_page:
1967 if ( shadow_mode_refcounts(FOREIGNDOM) )
1968 break;
1970 okay = get_page_and_type_from_pagenr(mfn, type, FOREIGNDOM);
1971 if ( unlikely(!okay) )
1973 MEM_LOG("Error while pinning mfn %lx", mfn);
1974 break;
1977 if ( unlikely(test_and_set_bit(_PGT_pinned,
1978 &page->u.inuse.type_info)) )
1980 MEM_LOG("Mfn %lx already pinned", mfn);
1981 put_page_and_type(page);
1982 okay = 0;
1983 break;
1986 break;
1988 case MMUEXT_UNPIN_TABLE:
1989 if ( shadow_mode_refcounts(d) )
1990 break;
1992 if ( unlikely(!(okay = get_page_from_pagenr(mfn, d))) )
1994 MEM_LOG("Mfn %lx bad domain (dom=%p)",
1995 mfn, page_get_owner(page));
1997 else if ( likely(test_and_clear_bit(_PGT_pinned,
1998 &page->u.inuse.type_info)) )
2000 put_page_and_type(page);
2001 put_page(page);
2003 else
2005 okay = 0;
2006 put_page(page);
2007 MEM_LOG("Mfn %lx not pinned", mfn);
2009 break;
2011 case MMUEXT_NEW_BASEPTR:
2012 mfn = gmfn_to_mfn(current->domain, mfn);
2013 okay = new_guest_cr3(mfn);
2014 percpu_info[cpu].deferred_ops &= ~DOP_FLUSH_TLB;
2015 break;
2017 #ifdef __x86_64__
2018 case MMUEXT_NEW_USER_BASEPTR:
2019 okay = get_page_and_type_from_pagenr(
2020 mfn, PGT_root_page_table, d);
2021 if ( unlikely(!okay) )
2023 MEM_LOG("Error while installing new mfn %lx", mfn);
2025 else
2027 unsigned long old_mfn =
2028 pagetable_get_pfn(v->arch.guest_table_user);
2029 v->arch.guest_table_user = pagetable_from_pfn(mfn);
2030 if ( old_mfn != 0 )
2031 put_page_and_type(mfn_to_page(old_mfn));
2033 break;
2034 #endif
2036 case MMUEXT_TLB_FLUSH_LOCAL:
2037 percpu_info[cpu].deferred_ops |= DOP_FLUSH_TLB;
2038 break;
2040 case MMUEXT_INVLPG_LOCAL:
2041 if ( shadow_mode_enabled(d) )
2042 shadow_invlpg(v, op.arg1.linear_addr);
2043 local_flush_tlb_one(op.arg1.linear_addr);
2044 break;
2046 case MMUEXT_TLB_FLUSH_MULTI:
2047 case MMUEXT_INVLPG_MULTI:
2049 unsigned long vmask;
2050 cpumask_t pmask;
2051 if ( unlikely(get_user(vmask, (unsigned long *)op.arg2.vcpumask)) )
2053 okay = 0;
2054 break;
2056 pmask = vcpumask_to_pcpumask(d, vmask);
2057 if ( op.cmd == MMUEXT_TLB_FLUSH_MULTI )
2058 flush_tlb_mask(pmask);
2059 else
2060 flush_tlb_one_mask(pmask, op.arg1.linear_addr);
2061 break;
2064 case MMUEXT_TLB_FLUSH_ALL:
2065 flush_tlb_mask(d->domain_dirty_cpumask);
2066 break;
2068 case MMUEXT_INVLPG_ALL:
2069 flush_tlb_one_mask(d->domain_dirty_cpumask, op.arg1.linear_addr);
2070 break;
2072 case MMUEXT_FLUSH_CACHE:
2073 if ( unlikely(!cache_flush_permitted(d)) )
2075 MEM_LOG("Non-physdev domain tried to FLUSH_CACHE.");
2076 okay = 0;
2078 else
2080 wbinvd();
2082 break;
2084 case MMUEXT_SET_LDT:
2086 unsigned long ptr = op.arg1.linear_addr;
2087 unsigned long ents = op.arg2.nr_ents;
2089 if ( shadow_mode_external(d) )
2091 MEM_LOG("ignoring SET_LDT hypercall from external "
2092 "domain %u", d->domain_id);
2093 okay = 0;
2095 else if ( ((ptr & (PAGE_SIZE-1)) != 0) ||
2096 (ents > 8192) ||
2097 !array_access_ok(ptr, ents, LDT_ENTRY_SIZE) )
2099 okay = 0;
2100 MEM_LOG("Bad args to SET_LDT: ptr=%lx, ents=%lx", ptr, ents);
2102 else if ( (v->arch.guest_context.ldt_ents != ents) ||
2103 (v->arch.guest_context.ldt_base != ptr) )
2105 invalidate_shadow_ldt(v);
2106 v->arch.guest_context.ldt_base = ptr;
2107 v->arch.guest_context.ldt_ents = ents;
2108 load_LDT(v);
2109 percpu_info[cpu].deferred_ops &= ~DOP_RELOAD_LDT;
2110 if ( ents != 0 )
2111 percpu_info[cpu].deferred_ops |= DOP_RELOAD_LDT;
2113 break;
2116 default:
2117 MEM_LOG("Invalid extended pt command 0x%x", op.cmd);
2118 okay = 0;
2119 break;
2122 if ( unlikely(!okay) )
2124 rc = -EINVAL;
2125 break;
2128 guest_handle_add_offset(uops, 1);
2131 out:
2132 process_deferred_ops(cpu);
2134 /* Add incremental work we have done to the @done output parameter. */
2135 done += i;
2136 if ( unlikely(!guest_handle_is_null(pdone)) )
2137 copy_to_guest(pdone, &done, 1);
2139 UNLOCK_BIGLOCK(d);
2140 return rc;
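/*
 * do_mmu_update processes a batch of (ptr, val) requests. The bits of
 * req.ptr below sizeof(l1_pgentry_t) encode the command:
 *   MMU_NORMAL_PT_UPDATE  - req.ptr addresses a page-table entry (its frame
 *                           number is translated via gmfn_to_mfn()) and
 *                           req.val is the new entry;
 *   MMU_MACHPHYS_UPDATE   - req.ptr >> PAGE_SHIFT names an mfn and req.val
 *                           is the gpfn to record in the M2P table.
 * Illustrative request construction only (pte_addr and new_pte are
 * hypothetical names, not defined in this file):
 *
 *     struct mmu_update req;
 *     req.ptr = pte_addr | MMU_NORMAL_PT_UPDATE;
 *     req.val = new_pte;
 */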
2143 int do_mmu_update(
2144 XEN_GUEST_HANDLE(mmu_update_t) ureqs,
2145 unsigned int count,
2146 XEN_GUEST_HANDLE(uint) pdone,
2147 unsigned int foreigndom)
2149 struct mmu_update req;
2150 void *va;
2151 unsigned long gpfn, gmfn, mfn;
2152 struct page_info *page;
2153 int rc = 0, okay = 1, i = 0, cpu = smp_processor_id();
2154 unsigned int cmd, done = 0;
2155 struct vcpu *v = current;
2156 struct domain *d = v->domain;
2157 unsigned long type_info;
2158 struct domain_mmap_cache mapcache, sh_mapcache;
2160 LOCK_BIGLOCK(d);
2162 cleanup_writable_pagetable(d);
2164 if ( unlikely(shadow_mode_enabled(d)) )
2165 check_pagetable(v, "pre-mmu"); /* debug */
2167 if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
2169 count &= ~MMU_UPDATE_PREEMPTED;
2170 if ( unlikely(!guest_handle_is_null(pdone)) )
2171 (void)copy_from_guest(&done, pdone, 1);
2174 domain_mmap_cache_init(&mapcache);
2175 domain_mmap_cache_init(&sh_mapcache);
2177 if ( !set_foreigndom(cpu, foreigndom) )
2179 rc = -ESRCH;
2180 goto out;
2183 perfc_incrc(calls_to_mmu_update);
2184 perfc_addc(num_page_updates, count);
2185 perfc_incr_histo(bpt_updates, count, PT_UPDATES);
2187 if ( unlikely(!guest_handle_okay(ureqs, count)) )
2189 rc = -EFAULT;
2190 goto out;
2193 for ( i = 0; i < count; i++ )
2195 if ( hypercall_preempt_check() )
2197 rc = hypercall_create_continuation(
2198 __HYPERVISOR_mmu_update, "hihi",
2199 ureqs, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
2200 break;
2203 if ( unlikely(__copy_from_guest(&req, ureqs, 1) != 0) )
2205 MEM_LOG("Bad __copy_from_guest");
2206 rc = -EFAULT;
2207 break;
2210 cmd = req.ptr & (sizeof(l1_pgentry_t)-1);
2211 okay = 0;
2213 switch ( cmd )
2215 /*
2216 * MMU_NORMAL_PT_UPDATE: Normal update to any level of page table.
2217 */
2218 case MMU_NORMAL_PT_UPDATE:
2220 gmfn = req.ptr >> PAGE_SHIFT;
2221 mfn = gmfn_to_mfn(d, gmfn);
2223 if ( unlikely(!get_page_from_pagenr(mfn, current->domain)) )
2225 MEM_LOG("Could not get page for normal update");
2226 break;
2229 va = map_domain_page_with_cache(mfn, &mapcache);
2230 va = (void *)((unsigned long)va +
2231 (unsigned long)(req.ptr & ~PAGE_MASK));
2232 page = mfn_to_page(mfn);
2234 switch ( (type_info = page->u.inuse.type_info) & PGT_type_mask )
2236 case PGT_l1_page_table:
2237 case PGT_l2_page_table:
2238 case PGT_l3_page_table:
2239 case PGT_l4_page_table:
2241 ASSERT(!shadow_mode_refcounts(d));
2242 if ( unlikely(!get_page_type(
2243 page, type_info & (PGT_type_mask|PGT_va_mask))) )
2244 goto not_a_pt;
2246 switch ( type_info & PGT_type_mask )
2248 case PGT_l1_page_table:
2250 l1_pgentry_t l1e = l1e_from_intpte(req.val);
2251 okay = mod_l1_entry(va, l1e);
2252 if ( okay && unlikely(shadow_mode_enabled(d)) )
2253 shadow_l1_normal_pt_update(
2254 d, req.ptr, l1e, &sh_mapcache);
2256 break;
2257 case PGT_l2_page_table:
2259 l2_pgentry_t l2e = l2e_from_intpte(req.val);
2260 okay = mod_l2_entry(
2261 (l2_pgentry_t *)va, l2e, mfn, type_info);
2262 if ( okay && unlikely(shadow_mode_enabled(d)) )
2263 shadow_l2_normal_pt_update(
2264 d, req.ptr, l2e, &sh_mapcache);
2266 break;
2267 #if CONFIG_PAGING_LEVELS >= 3
2268 case PGT_l3_page_table:
2270 l3_pgentry_t l3e = l3e_from_intpte(req.val);
2271 okay = mod_l3_entry(va, l3e, mfn, type_info);
2272 if ( okay && unlikely(shadow_mode_enabled(d)) )
2273 shadow_l3_normal_pt_update(
2274 d, req.ptr, l3e, &sh_mapcache);
2276 break;
2277 #endif
2278 #if CONFIG_PAGING_LEVELS >= 4
2279 case PGT_l4_page_table:
2281 l4_pgentry_t l4e = l4e_from_intpte(req.val);
2282 okay = mod_l4_entry(va, l4e, mfn, type_info);
2283 if ( okay && unlikely(shadow_mode_enabled(d)) )
2284 shadow_l4_normal_pt_update(
2285 d, req.ptr, l4e, &sh_mapcache);
2287 break;
2288 #endif
2291 put_page_type(page);
2293 break;
2295 default:
2296 not_a_pt:
2298 if ( unlikely(!get_page_type(page, PGT_writable_page)) )
2299 break;
2301 if ( shadow_mode_enabled(d) )
2303 shadow_lock(d);
2304 __mark_dirty(d, mfn);
2305 if ( page_is_page_table(page) && !page_out_of_sync(page) )
2306 shadow_mark_mfn_out_of_sync(v, gmfn, mfn);
2309 *(intpte_t *)va = req.val;
2310 okay = 1;
2312 if ( shadow_mode_enabled(d) )
2313 shadow_unlock(d);
2315 put_page_type(page);
2317 break;
2320 unmap_domain_page_with_cache(va, &mapcache);
2322 put_page(page);
2323 break;
2325 case MMU_MACHPHYS_UPDATE:
2327 if ( shadow_mode_translate(FOREIGNDOM) )
2329 MEM_LOG("can't mutate m2p table of translate mode guest");
2330 break;
2333 mfn = req.ptr >> PAGE_SHIFT;
2334 gpfn = req.val;
2336 if ( unlikely(!get_page_from_pagenr(mfn, FOREIGNDOM)) )
2338 MEM_LOG("Could not get page for mach->phys update");
2339 break;
2342 set_gpfn_from_mfn(mfn, gpfn);
2343 okay = 1;
2345 mark_dirty(FOREIGNDOM, mfn);
2347 put_page(mfn_to_page(mfn));
2348 break;
2350 default:
2351 MEM_LOG("Invalid page update command %x", cmd);
2352 break;
2355 if ( unlikely(!okay) )
2357 rc = -EINVAL;
2358 break;
2361 guest_handle_add_offset(ureqs, 1);
2364 out:
2365 domain_mmap_cache_destroy(&mapcache);
2366 domain_mmap_cache_destroy(&sh_mapcache);
2368 process_deferred_ops(cpu);
2370 /* Add incremental work we have done to the @done output parameter. */
2371 done += i;
2372 if ( unlikely(!guest_handle_is_null(pdone)) )
2373 copy_to_guest(pdone, &done, 1);
2375 if ( unlikely(shadow_mode_enabled(d)) )
2376 check_pagetable(v, "post-mmu"); /* debug */
2378 UNLOCK_BIGLOCK(d);
2379 return rc;
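/*
 * Grant-table mapping helpers. The *_grant_pte_mapping functions install or
 * tear down a grant mapping by writing a guest-nominated PTE directly (the
 * GNTMAP_contains_pte case), after checking that the frame being written
 * really has type PGT_l1_page_table. The *_grant_va_mapping variants below
 * instead edit the PTE reached through linear_pg_table for a guest virtual
 * address. Both destroy paths verify that the existing PTE still maps the
 * expected grant frame before clearing it.
 */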
2383 static int create_grant_pte_mapping(
2384 unsigned long pte_addr, l1_pgentry_t _nl1e, struct vcpu *v)
2386 int rc = GNTST_okay;
2387 void *va;
2388 unsigned long gmfn, mfn;
2389 struct page_info *page;
2390 u32 type_info;
2391 l1_pgentry_t ol1e;
2392 struct domain *d = v->domain;
2394 ASSERT(spin_is_locked(&d->big_lock));
2395 ASSERT(!shadow_mode_refcounts(d));
2397 gmfn = pte_addr >> PAGE_SHIFT;
2398 mfn = gmfn_to_mfn(d, gmfn);
2400 if ( unlikely(!get_page_from_pagenr(mfn, current->domain)) )
2402 MEM_LOG("Could not get page for normal update");
2403 return GNTST_general_error;
2406 va = map_domain_page(mfn);
2407 va = (void *)((unsigned long)va + (pte_addr & ~PAGE_MASK));
2408 page = mfn_to_page(mfn);
2410 type_info = page->u.inuse.type_info;
2411 if ( ((type_info & PGT_type_mask) != PGT_l1_page_table) ||
2412 !get_page_type(page, type_info & (PGT_type_mask|PGT_va_mask)) )
2414 MEM_LOG("Grant map attempted to update a non-L1 page");
2415 rc = GNTST_general_error;
2416 goto failed;
2419 ol1e = *(l1_pgentry_t *)va;
2420 if ( !update_l1e(va, ol1e, _nl1e) )
2422 put_page_type(page);
2423 rc = GNTST_general_error;
2424 goto failed;
2427 put_page_from_l1e(ol1e, d);
2429 if ( unlikely(shadow_mode_enabled(d)) )
2431 struct domain_mmap_cache sh_mapcache;
2432 domain_mmap_cache_init(&sh_mapcache);
2433 shadow_l1_normal_pt_update(d, pte_addr, _nl1e, &sh_mapcache);
2434 domain_mmap_cache_destroy(&sh_mapcache);
2437 put_page_type(page);
2439 failed:
2440 unmap_domain_page(va);
2441 put_page(page);
2442 return rc;
2445 static int destroy_grant_pte_mapping(
2446 unsigned long addr, unsigned long frame, struct domain *d)
2448 int rc = GNTST_okay;
2449 void *va;
2450 unsigned long gmfn, mfn;
2451 struct page_info *page;
2452 u32 type_info;
2453 l1_pgentry_t ol1e;
2455 ASSERT(!shadow_mode_refcounts(d));
2457 gmfn = addr >> PAGE_SHIFT;
2458 mfn = gmfn_to_mfn(d, gmfn);
2460 if ( unlikely(!get_page_from_pagenr(mfn, current->domain)) )
2462 MEM_LOG("Could not get page for normal update");
2463 return GNTST_general_error;
2466 va = map_domain_page(mfn);
2467 va = (void *)((unsigned long)va + (addr & ~PAGE_MASK));
2468 page = mfn_to_page(mfn);
2470 type_info = page->u.inuse.type_info;
2471 if ( ((type_info & PGT_type_mask) != PGT_l1_page_table) ||
2472 !get_page_type(page, type_info & (PGT_type_mask|PGT_va_mask)) )
2474 MEM_LOG("Grant map attempted to update a non-L1 page");
2475 rc = GNTST_general_error;
2476 goto failed;
2479 if ( __copy_from_user(&ol1e, (l1_pgentry_t *)va, sizeof(ol1e)) )
2481 put_page_type(page);
2482 rc = GNTST_general_error;
2483 goto failed;
2486 /* Check that the virtual address supplied is actually mapped to frame. */
2487 if ( unlikely((l1e_get_intpte(ol1e) >> PAGE_SHIFT) != frame) )
2489 MEM_LOG("PTE entry %lx for address %lx doesn't match frame %lx",
2490 (unsigned long)l1e_get_intpte(ol1e), addr, frame);
2491 put_page_type(page);
2492 rc = GNTST_general_error;
2493 goto failed;
2496 /* Delete pagetable entry. */
2497 if ( unlikely(__put_user(0, (intpte_t *)va)))
2499 MEM_LOG("Cannot delete PTE entry at %p", va);
2500 put_page_type(page);
2501 rc = GNTST_general_error;
2502 goto failed;
2505 if ( unlikely(shadow_mode_enabled(d)) )
2507 struct domain_mmap_cache sh_mapcache;
2508 domain_mmap_cache_init(&sh_mapcache);
2509 shadow_l1_normal_pt_update(d, addr, l1e_empty(), &sh_mapcache);
2510 domain_mmap_cache_destroy(&sh_mapcache);
2513 put_page_type(page);
2515 failed:
2516 unmap_domain_page(va);
2517 put_page(page);
2518 return rc;
2522 static int create_grant_va_mapping(
2523 unsigned long va, l1_pgentry_t _nl1e, struct vcpu *v)
2525 l1_pgentry_t *pl1e, ol1e;
2526 struct domain *d = v->domain;
2528 ASSERT(spin_is_locked(&d->big_lock));
2529 ASSERT(!shadow_mode_refcounts(d));
2531 /*
2532 * This is actually overkill - we don't need to sync the L1 itself,
2533 * just everything involved in getting to this L1 (i.e. we need
2534 * linear_pg_table[l1_linear_offset(va)] to be in sync)...
2535 */
2536 __shadow_sync_va(v, va);
2538 pl1e = &linear_pg_table[l1_linear_offset(va)];
2540 if ( unlikely(__copy_from_user(&ol1e, pl1e, sizeof(ol1e)) != 0) ||
2541 !update_l1e(pl1e, ol1e, _nl1e) )
2542 return GNTST_general_error;
2544 put_page_from_l1e(ol1e, d);
2546 if ( unlikely(shadow_mode_enabled(d)) )
2547 shadow_do_update_va_mapping(va, _nl1e, v);
2549 return GNTST_okay;
2552 static int destroy_grant_va_mapping(
2553 unsigned long addr, unsigned long frame)
2555 l1_pgentry_t *pl1e, ol1e;
2557 pl1e = &linear_pg_table[l1_linear_offset(addr)];
2559 if ( unlikely(__get_user(ol1e.l1, &pl1e->l1) != 0) )
2561 MEM_LOG("Could not find PTE entry for address %lx", addr);
2562 return GNTST_general_error;
2565 /*
2566 * Check that the virtual address supplied is actually mapped to
2567 * frame.
2568 */
2569 if ( unlikely(l1e_get_pfn(ol1e) != frame) )
2571 MEM_LOG("PTE entry %lx for address %lx doesn't match frame %lx",
2572 l1e_get_pfn(ol1e), addr, frame);
2573 return GNTST_general_error;
2576 /* Delete pagetable entry. */
2577 if ( unlikely(__put_user(0, &pl1e->l1)) )
2579 MEM_LOG("Cannot delete PTE entry at %p", (unsigned long *)pl1e);
2580 return GNTST_general_error;
2583 return 0;
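/*
 * Entry points used by the grant-table code. 'flags' selects both the
 * mapping attributes (GNTMAP_application_map adds _PAGE_USER; omitting
 * GNTMAP_readonly adds _PAGE_RW) and the addressing mode: with
 * GNTMAP_contains_pte, 'addr' is the address of the PTE to rewrite,
 * otherwise it is a guest virtual address handled via linear_pg_table.
 */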
2586 int create_grant_host_mapping(
2587 unsigned long addr, unsigned long frame, unsigned int flags)
2589 l1_pgentry_t pte = l1e_from_pfn(frame, GRANT_PTE_FLAGS);
2591 if ( (flags & GNTMAP_application_map) )
2592 l1e_add_flags(pte,_PAGE_USER);
2593 if ( !(flags & GNTMAP_readonly) )
2594 l1e_add_flags(pte,_PAGE_RW);
2596 if ( flags & GNTMAP_contains_pte )
2597 return create_grant_pte_mapping(addr, pte, current);
2598 return create_grant_va_mapping(addr, pte, current);
2601 int destroy_grant_host_mapping(
2602 unsigned long addr, unsigned long frame, unsigned int flags)
2604 if ( flags & GNTMAP_contains_pte )
2605 return destroy_grant_pte_mapping(addr, frame, current->domain);
2606 return destroy_grant_va_mapping(addr, frame);
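/*
 * steal_page atomically strips a page from its current owner. Because
 * count_info and u.inuse._domain are adjacent in struct page_info, a single
 * 64-bit cmpxchg can check that the page has exactly one (PGC_allocated)
 * reference and is still owned by 'd' while simultaneously clearing the
 * owner field. A rough sketch of the equivalent logic, assuming that layout
 * (the inline asm below is the real operation):
 *
 *     u64 expected = ((u64)pickle_domptr(d) << 32) | (1 | PGC_allocated);
 *     u64 desired  = 1 | PGC_allocated;    (owner cleared, count unchanged)
 *     ... then cmpxchg the 8 bytes at &page->count_info from expected to
 *     desired, retrying while only the count/owner snapshot was stale.
 */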
2609 int steal_page(
2610 struct domain *d, struct page_info *page, unsigned int memflags)
2612 u32 _d, _nd, x, y;
2614 spin_lock(&d->page_alloc_lock);
2616 /*
2617 * The tricky bit: atomically release ownership while there is just one
2618 * benign reference to the page (PGC_allocated). If that reference
2619 * disappears then the deallocation routine will safely spin.
2620 */
2621 _d = pickle_domptr(d);
2622 _nd = page->u.inuse._domain;
2623 y = page->count_info;
2624 do {
2625 x = y;
2626 if (unlikely((x & (PGC_count_mask|PGC_allocated)) !=
2627 (1 | PGC_allocated)) || unlikely(_nd != _d)) {
2628 DPRINTK("gnttab_transfer: Bad page %p: ed=%p(%u), sd=%p,"
2629 " caf=%08x, taf=%" PRtype_info "\n",
2630 (void *) page_to_mfn(page),
2631 d, d->domain_id, unpickle_domptr(_nd), x,
2632 page->u.inuse.type_info);
2633 spin_unlock(&d->page_alloc_lock);
2634 return -1;
2636 __asm__ __volatile__(
2637 LOCK_PREFIX "cmpxchg8b %2"
2638 : "=d" (_nd), "=a" (y),
2639 "=m" (*(volatile u64 *)(&page->count_info))
2640 : "0" (_d), "1" (x), "c" (NULL), "b" (x) );
2641 } while (unlikely(_nd != _d) || unlikely(y != x));
2643 /*
2644 * Unlink from 'd'. At least one reference remains (now anonymous), so
2645 * no one else is spinning to try to delete this page from 'd'.
2646 */
2647 if ( !(memflags & MEMF_no_refcount) )
2648 d->tot_pages--;
2649 list_del(&page->list);
2651 spin_unlock(&d->page_alloc_lock);
2653 return 0;
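/*
 * do_update_va_mapping rewrites the single PTE that maps 'va' and then
 * performs the flush requested in 'flags': UVMF_NONE, UVMF_TLB_FLUSH or
 * UVMF_INVLPG, qualified by UVMF_LOCAL, UVMF_ALL, or a guest pointer to a
 * vcpu bitmap for the multi-vcpu case. Illustrative guest-side call (the
 * wrapper name comes from the public interface and is an assumption here):
 *
 *     HYPERVISOR_update_va_mapping(va, new_pte, UVMF_INVLPG | UVMF_LOCAL);
 */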
2656 int do_update_va_mapping(unsigned long va, u64 val64,
2657 unsigned long flags)
2659 l1_pgentry_t val = l1e_from_intpte(val64);
2660 struct vcpu *v = current;
2661 struct domain *d = v->domain;
2662 unsigned int cpu = smp_processor_id();
2663 unsigned long vmask, bmap_ptr;
2664 cpumask_t pmask;
2665 int rc = 0;
2667 perfc_incrc(calls_to_update_va);
2669 if ( unlikely(!__addr_ok(va) && !shadow_mode_external(d)) )
2670 return -EINVAL;
2672 LOCK_BIGLOCK(d);
2674 cleanup_writable_pagetable(d);
2676 if ( unlikely(shadow_mode_enabled(d)) )
2677 check_pagetable(v, "pre-va"); /* debug */
2679 if ( unlikely(!mod_l1_entry(&linear_pg_table[l1_linear_offset(va)],
2680 val)) )
2681 rc = -EINVAL;
2683 if ( likely(rc == 0) && unlikely(shadow_mode_enabled(d)) )
2685 if ( unlikely(percpu_info[cpu].foreign &&
2686 (shadow_mode_translate(d) ||
2687 shadow_mode_translate(percpu_info[cpu].foreign))) )
2689 /*
2690 * The foreign domain's pfns are in a different namespace. There's
2691 * not enough information in just a gpte to figure out how to
2692 * (re-)shadow this entry.
2693 */
2694 domain_crash(d);
2697 rc = shadow_do_update_va_mapping(va, val, v);
2699 check_pagetable(v, "post-va"); /* debug */
2702 switch ( flags & UVMF_FLUSHTYPE_MASK )
2704 case UVMF_TLB_FLUSH:
2705 switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
2707 case UVMF_LOCAL:
2708 if ( unlikely(shadow_mode_enabled(d)) )
2709 shadow_sync_all(d);
2710 local_flush_tlb();
2711 break;
2712 case UVMF_ALL:
2713 flush_tlb_mask(d->domain_dirty_cpumask);
2714 break;
2715 default:
2716 if ( unlikely(get_user(vmask, (unsigned long *)bmap_ptr)) )
2717 rc = -EFAULT;
2718 pmask = vcpumask_to_pcpumask(d, vmask);
2719 flush_tlb_mask(pmask);
2720 break;
2722 break;
2724 case UVMF_INVLPG:
2725 switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
2727 case UVMF_LOCAL:
2728 if ( unlikely(shadow_mode_enabled(d)) )
2729 shadow_invlpg(current, va);
2730 local_flush_tlb_one(va);
2731 break;
2732 case UVMF_ALL:
2733 flush_tlb_one_mask(d->domain_dirty_cpumask, va);
2734 break;
2735 default:
2736 if ( unlikely(get_user(vmask, (unsigned long *)bmap_ptr)) )
2737 rc = -EFAULT;
2738 pmask = vcpumask_to_pcpumask(d, vmask);
2739 flush_tlb_one_mask(pmask, va);
2740 break;
2742 break;
2745 process_deferred_ops(cpu);
2747 UNLOCK_BIGLOCK(d);
2749 return rc;
2752 int do_update_va_mapping_otherdomain(unsigned long va, u64 val64,
2753 unsigned long flags,
2754 domid_t domid)
2756 unsigned int cpu = smp_processor_id();
2757 int rc;
2759 if ( unlikely(!IS_PRIV(current->domain)) )
2760 return -EPERM;
2762 if ( !set_foreigndom(cpu, domid) )
2763 return -ESRCH;
2765 rc = do_update_va_mapping(va, val64, flags);
2767 return rc;
2772 /*************************
2773 * Descriptor Tables
2774 */
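/*
 * GDT handling: each vcpu's GDT occupies up to FIRST_RESERVED_GDT_PAGE
 * frames, mapped through its perdomain_ptes, with 512 eight-byte
 * descriptors per page. set_gdt() takes a PGT_gdt_page type reference on
 * every new frame before the old GDT is torn down, so a validation failure
 * leaves the previous GDT untouched.
 */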
2776 void destroy_gdt(struct vcpu *v)
2778 int i;
2779 unsigned long pfn;
2781 v->arch.guest_context.gdt_ents = 0;
2782 for ( i = 0; i < FIRST_RESERVED_GDT_PAGE; i++ )
2784 if ( (pfn = l1e_get_pfn(v->arch.perdomain_ptes[i])) != 0 )
2785 put_page_and_type(mfn_to_page(pfn));
2786 v->arch.perdomain_ptes[i] = l1e_empty();
2787 v->arch.guest_context.gdt_frames[i] = 0;
2792 long set_gdt(struct vcpu *v,
2793 unsigned long *frames,
2794 unsigned int entries)
2796 struct domain *d = v->domain;
2797 /* NB. There are 512 8-byte entries per GDT page. */
2798 int i, nr_pages = (entries + 511) / 512;
2799 unsigned long mfn;
2801 if ( entries > FIRST_RESERVED_GDT_ENTRY )
2802 return -EINVAL;
2804 shadow_sync_all(d);
2806 /* Check the pages in the new GDT. */
2807 for ( i = 0; i < nr_pages; i++ ) {
2808 mfn = frames[i] = gmfn_to_mfn(d, frames[i]);
2809 if ( !mfn_valid(mfn) ||
2810 !get_page_and_type(mfn_to_page(mfn), d, PGT_gdt_page) )
2811 goto fail;
2814 /* Tear down the old GDT. */
2815 destroy_gdt(v);
2817 /* Install the new GDT. */
2818 v->arch.guest_context.gdt_ents = entries;
2819 for ( i = 0; i < nr_pages; i++ )
2821 v->arch.guest_context.gdt_frames[i] = frames[i];
2822 v->arch.perdomain_ptes[i] =
2823 l1e_from_pfn(frames[i], __PAGE_HYPERVISOR);
2826 return 0;
2828 fail:
2829 while ( i-- > 0 )
2830 put_page_and_type(mfn_to_page(frames[i]));
2831 return -EINVAL;
2835 long do_set_gdt(XEN_GUEST_HANDLE(ulong) frame_list, unsigned int entries)
2837 int nr_pages = (entries + 511) / 512;
2838 unsigned long frames[16];
2839 long ret;
2841 /* Rechecked in set_gdt, but ensures a sane limit for copy_from_guest(). */
2842 if ( entries > FIRST_RESERVED_GDT_ENTRY )
2843 return -EINVAL;
2845 if ( copy_from_guest((unsigned long *)frames, frame_list, nr_pages) )
2846 return -EFAULT;
2848 LOCK_BIGLOCK(current->domain);
2850 if ( (ret = set_gdt(current, frames, entries)) == 0 )
2851 local_flush_tlb();
2853 UNLOCK_BIGLOCK(current->domain);
2855 return ret;
2859 long do_update_descriptor(u64 pa, u64 desc)
2861 struct domain *dom = current->domain;
2862 unsigned long gmfn = pa >> PAGE_SHIFT;
2863 unsigned long mfn;
2864 unsigned int offset;
2865 struct desc_struct *gdt_pent, d;
2866 struct page_info *page;
2867 long ret = -EINVAL;
2869 offset = ((unsigned int)pa & ~PAGE_MASK) / sizeof(struct desc_struct);
2871 *(u64 *)&d = desc;
2873 LOCK_BIGLOCK(dom);
2875 if ( !VALID_MFN(mfn = gmfn_to_mfn(dom, gmfn)) ||
2876 (((unsigned int)pa % sizeof(struct desc_struct)) != 0) ||
2877 !mfn_valid(mfn) ||
2878 !check_descriptor(&d) )
2880 UNLOCK_BIGLOCK(dom);
2881 return -EINVAL;
2884 page = mfn_to_page(mfn);
2885 if ( unlikely(!get_page(page, dom)) )
2887 UNLOCK_BIGLOCK(dom);
2888 return -EINVAL;
2891 /* Check if the given frame is in use in an unsafe context. */
2892 switch ( page->u.inuse.type_info & PGT_type_mask )
2894 case PGT_gdt_page:
2895 if ( unlikely(!get_page_type(page, PGT_gdt_page)) )
2896 goto out;
2897 break;
2898 case PGT_ldt_page:
2899 if ( unlikely(!get_page_type(page, PGT_ldt_page)) )
2900 goto out;
2901 break;
2902 default:
2903 if ( unlikely(!get_page_type(page, PGT_writable_page)) )
2904 goto out;
2905 break;
2908 if ( shadow_mode_enabled(dom) )
2910 shadow_lock(dom);
2912 __mark_dirty(dom, mfn);
2914 if ( page_is_page_table(page) && !page_out_of_sync(page) )
2915 shadow_mark_mfn_out_of_sync(current, gmfn, mfn);
2918 /* All is good so make the update. */
2919 gdt_pent = map_domain_page(mfn);
2920 memcpy(&gdt_pent[offset], &d, 8);
2921 unmap_domain_page(gdt_pent);
2923 if ( shadow_mode_enabled(dom) )
2924 shadow_unlock(dom);
2926 put_page_type(page);
2928 ret = 0; /* success */
2930 out:
2931 put_page(page);
2933 UNLOCK_BIGLOCK(dom);
2935 return ret;
2938 typedef struct e820entry e820entry_t;
2939 DEFINE_XEN_GUEST_HANDLE(e820entry_t);
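/*
 * arch_memory_op handles the x86-specific memory_op subcommands.
 * XENMEM_add_to_physmap lets a translated guest place a Xen-provided frame
 * (its shared_info page or a grant-table frame) at a chosen gpfn in its
 * pseudo-physical map; XENMEM_machine_memory_map returns the host E820
 * table to a privileged domain; XENMEM_memory_map is not implemented here;
 * anything else is forwarded to subarch_memory_op().
 */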
2941 long arch_memory_op(int op, XEN_GUEST_HANDLE(void) arg)
2943 switch ( op )
2945 case XENMEM_add_to_physmap:
2947 struct xen_add_to_physmap xatp;
2948 unsigned long mfn = 0, gpfn;
2949 struct domain *d;
2951 if ( copy_from_guest(&xatp, arg, 1) )
2952 return -EFAULT;
2954 if ( (d = find_domain_by_id(xatp.domid)) == NULL )
2955 return -ESRCH;
2957 switch ( xatp.space )
2959 case XENMAPSPACE_shared_info:
2960 if ( xatp.idx == 0 )
2961 mfn = virt_to_mfn(d->shared_info);
2962 break;
2963 case XENMAPSPACE_grant_table:
2964 if ( xatp.idx < NR_GRANT_FRAMES )
2965 mfn = virt_to_mfn(d->grant_table->shared) + xatp.idx;
2966 break;
2967 default:
2968 break;
2971 if ( !shadow_mode_translate(d) || (mfn == 0) )
2973 put_domain(d);
2974 return -EINVAL;
2977 LOCK_BIGLOCK(d);
2979 /* Remove previously mapped page if it was present. */
2980 if ( mfn_valid(gmfn_to_mfn(d, xatp.gpfn)) )
2981 guest_remove_page(d, xatp.gpfn);
2983 /* Unmap from old location, if any. */
2984 gpfn = get_gpfn_from_mfn(mfn);
2985 if ( gpfn != INVALID_M2P_ENTRY )
2986 guest_physmap_remove_page(d, gpfn, mfn);
2988 /* Map at new location. */
2989 guest_physmap_add_page(d, xatp.gpfn, mfn);
2991 UNLOCK_BIGLOCK(d);
2993 put_domain(d);
2995 break;
2998 case XENMEM_memory_map:
3000 return -ENOSYS;
3003 case XENMEM_machine_memory_map:
3005 struct xen_memory_map memmap;
3006 XEN_GUEST_HANDLE(e820entry_t) buffer;
3007 int count;
3009 if ( !IS_PRIV(current->domain) )
3010 return -EINVAL;
3012 if ( copy_from_guest(&memmap, arg, 1) )
3013 return -EFAULT;
3014 if ( memmap.nr_entries < e820.nr_map + 1 )
3015 return -EINVAL;
3017 buffer = guest_handle_cast(memmap.buffer, e820entry_t);
3019 count = min((unsigned int)e820.nr_map, memmap.nr_entries);
3020 if ( copy_to_guest(buffer, &e820.map[0], count) < 0 )
3021 return -EFAULT;
3023 memmap.nr_entries = count;
3025 if ( copy_to_guest(arg, &memmap, 1) )
3026 return -EFAULT;
3028 return 0;
3031 default:
3032 return subarch_memory_op(op, arg);
3035 return 0;
3039 /*************************
3040 * Writable Pagetables
3041 */
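/*
 * Overview of the writable-pagetable machinery below: when a guest using
 * the writable-pagetable VM assist writes to one of its (read-only) L1
 * page-table pages, ptwr_do_page_fault() either emulates that single write
 * (the ptwr_emulated_* handlers driven by x86_emulate_memop()), or
 * snapshots the page, makes it temporarily writable (detaching it from the
 * current address space if it is the ACTIVE table) and lets the guest batch
 * further writes. ptwr_flush() later write-protects the page again and
 * revalidate_l1() adjusts reference counts for every entry that changed
 * relative to the snapshot.
 */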
3043 #ifdef VVERBOSE
3044 int ptwr_debug = 0x0;
3045 #define PTWR_PRINTK(_f, _a...) \
3046 do { if ( unlikely(ptwr_debug) ) printk( _f , ## _a ); } while ( 0 )
3047 #define PTWR_PRINT_WHICH (which ? 'I' : 'A')
3048 #else
3049 #define PTWR_PRINTK(_f, _a...) ((void)0)
3050 #endif
3053 #ifdef PERF_ARRAYS
3055 /**************** writable pagetables profiling functions *****************/
3057 #define ptwr_eip_buckets 256
3059 int ptwr_eip_stat_threshold[] = {1, 10, 50, 100, L1_PAGETABLE_ENTRIES};
3061 #define ptwr_eip_stat_thresholdN (sizeof(ptwr_eip_stat_threshold)/sizeof(int))
3063 struct {
3064 unsigned long eip;
3065 domid_t id;
3066 u32 val[ptwr_eip_stat_thresholdN];
3067 } typedef ptwr_eip_stat_t;
3069 ptwr_eip_stat_t ptwr_eip_stats[ptwr_eip_buckets];
3071 static inline unsigned int ptwr_eip_stat_hash( unsigned long eip, domid_t id )
3073 return (((unsigned long) id) ^ eip ^ (eip>>8) ^ (eip>>16) ^ (eip>>24)) %
3074 ptwr_eip_buckets;
3077 static void ptwr_eip_stat_inc(u32 *n)
3079 unsigned int i, j;
3081 if ( ++(*n) != 0 )
3082 return;
3084 *n = ~0;
3086 /* Re-scale all buckets. */
3087 for ( i = 0; i < ptwr_eip_buckets; i++ )
3088 for ( j = 0; j < ptwr_eip_stat_thresholdN; j++ )
3089 ptwr_eip_stats[i].val[j] >>= 1;
3092 static void ptwr_eip_stat_update(unsigned long eip, domid_t id, int modified)
3094 unsigned int i, j, b;
3096 i = b = ptwr_eip_stat_hash(eip, id);
3098 do
3100 if ( !ptwr_eip_stats[i].eip )
3102 /* doesn't exist */
3103 ptwr_eip_stats[i].eip = eip;
3104 ptwr_eip_stats[i].id = id;
3105 memset(ptwr_eip_stats[i].val,0, sizeof(ptwr_eip_stats[i].val));
3108 if ( ptwr_eip_stats[i].eip == eip && ptwr_eip_stats[i].id == id)
3110 for ( j = 0; j < ptwr_eip_stat_thresholdN; j++ )
3111 if ( modified <= ptwr_eip_stat_threshold[j] )
3112 break;
3113 BUG_ON(j >= ptwr_eip_stat_thresholdN);
3114 ptwr_eip_stat_inc(&ptwr_eip_stats[i].val[j]);
3115 return;
3118 i = (i+1) % ptwr_eip_buckets;
3120 while ( i != b );
3122 printk("ptwr_eip_stat: too many EIPs in use!\n");
3124 ptwr_eip_stat_print();
3125 ptwr_eip_stat_reset();
3128 void ptwr_eip_stat_reset(void)
3130 memset(ptwr_eip_stats, 0, sizeof(ptwr_eip_stats));
3133 void ptwr_eip_stat_print(void)
3135 struct domain *e;
3136 domid_t d;
3137 unsigned int i, j;
3139 for_each_domain( e )
3141 d = e->domain_id;
3143 for ( i = 0; i < ptwr_eip_buckets; i++ )
3145 if ( !ptwr_eip_stats[i].eip || ptwr_eip_stats[i].id != d )
3146 continue;
3148 printk("D %5d eip %p ",
3149 ptwr_eip_stats[i].id, (void *)ptwr_eip_stats[i].eip);
3151 for ( j = 0; j < ptwr_eip_stat_thresholdN; j++ )
3152 printk("<=%u %4u \t",
3153 ptwr_eip_stat_threshold[j],
3154 ptwr_eip_stats[i].val[j]);
3155 printk("\n");
3160 #else /* PERF_ARRAYS */
3162 #define ptwr_eip_stat_update(eip, id, modified) ((void)0)
3164 #endif
3166 /*******************************************************************/
3168 /* Re-validate a given p.t. page, given its prior snapshot */
3169 int revalidate_l1(
3170 struct domain *d, l1_pgentry_t *l1page, l1_pgentry_t *snapshot)
3172 l1_pgentry_t ol1e, nl1e;
3173 int modified = 0, i;
3175 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
3177 ol1e = snapshot[i];
3178 nl1e = l1page[i];
3180 if ( likely(l1e_get_intpte(ol1e) == l1e_get_intpte(nl1e)) )
3181 continue;
3183 /* Update number of entries modified. */
3184 modified++;
3186 /*
3187 * Fast path for PTEs that have merely been write-protected
3188 * (e.g., during a Unix fork()). A strict reduction in privilege.
3189 */
3190 if ( likely(l1e_get_intpte(ol1e) == (l1e_get_intpte(nl1e)|_PAGE_RW)) )
3192 if ( likely(l1e_get_flags(nl1e) & _PAGE_PRESENT) )
3193 put_page_type(mfn_to_page(l1e_get_pfn(nl1e)));
3194 continue;
3197 if ( unlikely(!get_page_from_l1e(nl1e, d)) )
3199 /*
3200 * Make the remaining p.t's consistent before crashing, so the
3201 * reference counts are correct.
3202 */
3203 memcpy(&l1page[i], &snapshot[i],
3204 (L1_PAGETABLE_ENTRIES - i) * sizeof(l1_pgentry_t));
3206 /* Crash the offending domain. */
3207 MEM_LOG("ptwr: Could not revalidate l1 page");
3208 domain_crash(d);
3209 break;
3212 put_page_from_l1e(ol1e, d);
3215 return modified;
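/*
 * Note that ptwr_flush() may run when the vcpu that set up the writable
 * mapping is not 'current': it then temporarily switches to that vcpu's
 * page tables (or toggles kernel/user mode on x86/64) so that the linear
 * page table below refers to the right address space.
 */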
3219 /* Flush the given writable p.t. page and write-protect it again. */
3220 void ptwr_flush(struct domain *d, const int which)
3222 unsigned long l1va;
3223 l1_pgentry_t *pl1e, pte, *ptep;
3224 l2_pgentry_t *pl2e;
3225 unsigned int modified;
3227 #ifdef CONFIG_X86_64
3228 struct vcpu *v = current;
3229 int user_mode = !(v->arch.flags & TF_kernel_mode);
3230 #endif
3232 ASSERT(!shadow_mode_enabled(d));
3234 if ( unlikely(d->arch.ptwr[which].vcpu != current) )
3235 /* Don't use write_ptbase: it may switch to guest_user on x86/64! */
3236 __write_ptbase(pagetable_get_pfn(
3237 d->arch.ptwr[which].vcpu->arch.guest_table));
3238 else
3239 TOGGLE_MODE();
3241 l1va = d->arch.ptwr[which].l1va;
3242 ptep = (l1_pgentry_t *)&linear_pg_table[l1_linear_offset(l1va)];
3244 /*
3245 * STEP 1. Write-protect the p.t. page so no more updates can occur.
3246 */
3248 if ( unlikely(__get_user(pte.l1, &ptep->l1)) )
3250 MEM_LOG("ptwr: Could not read pte at %p", ptep);
3251 /*
3252 * Really a bug. We could read this PTE during the initial fault,
3253 * and pagetables can't have changed in the meantime.
3254 */
3255 BUG();
3257 PTWR_PRINTK("[%c] disconnected_l1va at %p is %"PRIpte"\n",
3258 PTWR_PRINT_WHICH, ptep, l1e_get_intpte(pte));
3259 l1e_remove_flags(pte, _PAGE_RW);
3261 /* Write-protect the p.t. page in the guest page table. */
3262 if ( unlikely(__put_user(pte, ptep)) )
3264 MEM_LOG("ptwr: Could not update pte at %p", ptep);
3265 /*
3266 * Really a bug. We could write this PTE during the initial fault,
3267 * and pagetables can't have changed in the meantime.
3268 */
3269 BUG();
3272 /* Ensure that there are no stale writable mappings in any TLB. */
3273 /* NB. INVLPG is a serialising instruction: flushes pending updates. */
3274 flush_tlb_one_mask(d->domain_dirty_cpumask, l1va);
3275 PTWR_PRINTK("[%c] disconnected_l1va at %p now %"PRIpte"\n",
3276 PTWR_PRINT_WHICH, ptep, l1e_get_intpte(pte));
3278 /*
3279 * STEP 2. Validate any modified PTEs.
3280 */
3282 if ( likely(d == current->domain) )
3284 pl1e = map_domain_page(l1e_get_pfn(pte));
3285 modified = revalidate_l1(d, pl1e, d->arch.ptwr[which].page);
3286 unmap_domain_page(pl1e);
3287 perfc_incr_histo(wpt_updates, modified, PT_UPDATES);
3288 ptwr_eip_stat_update(d->arch.ptwr[which].eip, d->domain_id, modified);
3289 d->arch.ptwr[which].prev_nr_updates = modified;
3291 else
3293 /*
3294 * Must make a temporary global mapping, since we are running in the
3295 * wrong address space, so no access to our own mapcache.
3296 */
3297 pl1e = map_domain_page_global(l1e_get_pfn(pte));
3298 modified = revalidate_l1(d, pl1e, d->arch.ptwr[which].page);
3299 unmap_domain_page_global(pl1e);
3302 /*
3303 * STEP 3. Reattach the L1 p.t. page into the current address space.
3304 */
3306 if ( which == PTWR_PT_ACTIVE )
3308 pl2e = &__linear_l2_table[d->arch.ptwr[which].l2_idx];
3309 l2e_add_flags(*pl2e, _PAGE_PRESENT);
3312 /*
3313 * STEP 4. Final tidy-up.
3314 */
3316 d->arch.ptwr[which].l1va = 0;
3318 if ( unlikely(d->arch.ptwr[which].vcpu != current) )
3319 write_ptbase(current);
3320 else
3321 TOGGLE_MODE();
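/*
 * ptwr_emulated_update() widens sub-word writes to a full PTE before
 * validating them. Worked example with 8-byte PTEs (PAE/x86-64): a 4-byte
 * store of 0xDEADBEEF at offset 4 within a PTE is handled by reading the
 * whole 8-byte entry, clearing its top 32 bits, and OR-ing in
 * ((paddr_t)0xDEADBEEF << 32), so the low half keeps its current contents.
 * The same widening is applied to 'old' so that an emulated cmpxchg
 * compares full entries.
 */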
3324 static int ptwr_emulated_update(
3325 unsigned long addr,
3326 paddr_t old,
3327 paddr_t val,
3328 unsigned int bytes,
3329 unsigned int do_cmpxchg)
3331 unsigned long pfn, l1va;
3332 struct page_info *page;
3333 l1_pgentry_t pte, ol1e, nl1e, *pl1e;
3334 struct domain *d = current->domain;
3336 /* Aligned access only, thank you. */
3337 if ( !access_ok(addr, bytes) || ((addr & (bytes-1)) != 0) )
3339 MEM_LOG("ptwr_emulate: Unaligned or bad size ptwr access (%d, %lx)",
3340 bytes, addr);
3341 return X86EMUL_UNHANDLEABLE;
3344 /* Turn a sub-word access into a full-word access. */
3345 if ( bytes != sizeof(paddr_t) )
3347 paddr_t full;
3348 unsigned int offset = addr & (sizeof(paddr_t)-1);
3350 /* Align address; read full word. */
3351 addr &= ~(sizeof(paddr_t)-1);
3352 if ( copy_from_user(&full, (void *)addr, sizeof(paddr_t)) )
3354 propagate_page_fault(addr, 0); /* read fault */
3355 return X86EMUL_PROPAGATE_FAULT;
3357 /* Mask out bits provided by caller. */
3358 full &= ~((((paddr_t)1 << (bytes*8)) - 1) << (offset*8));
3359 /* Shift the caller value and OR in the missing bits. */
3360 val &= (((paddr_t)1 << (bytes*8)) - 1);
3361 val <<= (offset)*8;
3362 val |= full;
3363 /* Also fill in missing parts of the cmpxchg old value. */
3364 old &= (((paddr_t)1 << (bytes*8)) - 1);
3365 old <<= (offset)*8;
3366 old |= full;
3369 #if 0 /* XXX KAF: I don't think this can happen. */
3370 /*
3371 * We must not emulate an update to a PTE that is temporarily marked
3372 * writable by the batched ptwr logic, else we can corrupt page refcnts!
3373 */
3374 if ( ((l1va = d->arch.ptwr[PTWR_PT_ACTIVE].l1va) != 0) &&
3375 (l1_linear_offset(l1va) == l1_linear_offset(addr)) )
3376 ptwr_flush(d, PTWR_PT_ACTIVE);
3377 if ( ((l1va = d->arch.ptwr[PTWR_PT_INACTIVE].l1va) != 0) &&
3378 (l1_linear_offset(l1va) == l1_linear_offset(addr)) )
3379 ptwr_flush(d, PTWR_PT_INACTIVE);
3380 #else
3381 ASSERT(((l1va = d->arch.ptwr[PTWR_PT_ACTIVE].l1va) == 0) ||
3382 (l1_linear_offset(l1va) != l1_linear_offset(addr)));
3383 ASSERT(((l1va = d->arch.ptwr[PTWR_PT_INACTIVE].l1va) == 0) ||
3384 (l1_linear_offset(l1va) != l1_linear_offset(addr)));
3385 #endif
3387 /* Read the PTE that maps the page being updated. */
3388 if ( __copy_from_user(&pte, &linear_pg_table[l1_linear_offset(addr)],
3389 sizeof(pte)) )
3391 MEM_LOG("ptwr_emulate: Cannot read thru linear_pg_table");
3392 return X86EMUL_UNHANDLEABLE;
3395 pfn = l1e_get_pfn(pte);
3396 page = mfn_to_page(pfn);
3398 /* We are looking only for read-only mappings of p.t. pages. */
3399 ASSERT((l1e_get_flags(pte) & (_PAGE_RW|_PAGE_PRESENT)) == _PAGE_PRESENT);
3400 ASSERT((page->u.inuse.type_info & PGT_type_mask) == PGT_l1_page_table);
3401 ASSERT((page->u.inuse.type_info & PGT_count_mask) != 0);
3402 ASSERT(page_get_owner(page) == d);
3404 /* Check the new PTE. */
3405 nl1e = l1e_from_intpte(val);
3406 if ( unlikely(!get_page_from_l1e(nl1e, d)) )
3408 MEM_LOG("ptwr_emulate: could not get_page_from_l1e()");
3409 return X86EMUL_UNHANDLEABLE;
3412 /* Checked successfully: do the update (write or cmpxchg). */
3413 pl1e = map_domain_page(page_to_mfn(page));
3414 pl1e = (l1_pgentry_t *)((unsigned long)pl1e + (addr & ~PAGE_MASK));
3415 if ( do_cmpxchg )
3417 ol1e = l1e_from_intpte(old);
3418 if ( cmpxchg((intpte_t *)pl1e, old, val) != old )
3420 unmap_domain_page(pl1e);
3421 put_page_from_l1e(nl1e, d);
3422 return X86EMUL_CMPXCHG_FAILED;
3425 else
3427 ol1e = *pl1e;
3428 *pl1e = nl1e;
3430 unmap_domain_page(pl1e);
3432 /* Finally, drop the old PTE. */
3433 put_page_from_l1e(ol1e, d);
3435 return X86EMUL_CONTINUE;
3438 static int ptwr_emulated_write(
3439 unsigned long addr,
3440 unsigned long val,
3441 unsigned int bytes,
3442 struct x86_emulate_ctxt *ctxt)
3444 return ptwr_emulated_update(addr, 0, val, bytes, 0);
3447 static int ptwr_emulated_cmpxchg(
3448 unsigned long addr,
3449 unsigned long old,
3450 unsigned long new,
3451 unsigned int bytes,
3452 struct x86_emulate_ctxt *ctxt)
3454 return ptwr_emulated_update(addr, old, new, bytes, 1);
3457 static int ptwr_emulated_cmpxchg8b(
3458 unsigned long addr,
3459 unsigned long old,
3460 unsigned long old_hi,
3461 unsigned long new,
3462 unsigned long new_hi,
3463 struct x86_emulate_ctxt *ctxt)
3465 if ( CONFIG_PAGING_LEVELS == 2 )
3466 return X86EMUL_UNHANDLEABLE;
3467 else
3468 return ptwr_emulated_update(
3469 addr, ((u64)old_hi << 32) | old, ((u64)new_hi << 32) | new, 8, 1);
3472 static struct x86_emulate_ops ptwr_emulate_ops = {
3473 .read_std = x86_emulate_read_std,
3474 .write_std = x86_emulate_write_std,
3475 .read_emulated = x86_emulate_read_std,
3476 .write_emulated = ptwr_emulated_write,
3477 .cmpxchg_emulated = ptwr_emulated_cmpxchg,
3478 .cmpxchg8b_emulated = ptwr_emulated_cmpxchg8b
3479 };
3481 /* Write page fault handler: check if guest is trying to modify a PTE. */
3482 int ptwr_do_page_fault(struct domain *d, unsigned long addr,
3483 struct cpu_user_regs *regs)
3485 unsigned long pfn;
3486 struct page_info *page;
3487 l1_pgentry_t *pl1e, pte;
3488 l2_pgentry_t *pl2e, l2e;
3489 int which, flags;
3490 unsigned long l2_idx;
3491 struct x86_emulate_ctxt emul_ctxt;
3493 ASSERT(!shadow_mode_enabled(d));
3495 /*
3496 * Attempt to read the PTE that maps the VA being accessed. By checking for
3497 * PDE validity in the L2 we avoid many expensive fixups in __get_user().
3498 * NB. The L2 entry cannot be detached as the caller already checked that.
3499 */
3500 if ( !(l2e_get_flags(__linear_l2_table[l2_linear_offset(addr)]) &
3501 _PAGE_PRESENT) ||
3502 __copy_from_user(&pte,&linear_pg_table[l1_linear_offset(addr)],
3503 sizeof(pte)) )
3505 return 0;
3508 pfn = l1e_get_pfn(pte);
3509 page = mfn_to_page(pfn);
3511 #ifdef CONFIG_X86_64
3512 #define WRPT_PTE_FLAGS (_PAGE_RW | _PAGE_PRESENT | _PAGE_USER)
3513 #else
3514 #define WRPT_PTE_FLAGS (_PAGE_RW | _PAGE_PRESENT)
3515 #endif
3517 /*
3518 * Check the required flags for a valid wrpt mapping. If the page is
3519 * already writable then we can return straight to the guest (SMP race).
3520 * We decide whether or not to propagate the fault by testing for write
3521 * permissions in page directories by writing back to the linear mapping.
3522 */
3523 if ( (flags = l1e_get_flags(pte) & WRPT_PTE_FLAGS) == WRPT_PTE_FLAGS )
3524 return __put_user(
3525 pte.l1, &linear_pg_table[l1_linear_offset(addr)].l1) ?
3526 0 : EXCRET_not_a_fault;
3528 /* We are looking only for read-only mappings of p.t. pages. */
3529 if ( ((flags | _PAGE_RW) != WRPT_PTE_FLAGS) ||
3530 ((page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table) ||
3531 ((page->u.inuse.type_info & PGT_count_mask) == 0) ||
3532 (page_get_owner(page) != d) )
3534 return 0;
3537 #if 0 /* Leave this in; it is useful for debugging. */
3538 goto emulate;
3539 #endif
3541 PTWR_PRINTK("ptwr_page_fault on l1 pt at va %lx, pfn %lx, eip %lx\n",
3542 addr, pfn, (unsigned long)regs->eip);
3544 /* Get the L2 index at which this L1 p.t. is always mapped. */
3545 l2_idx = page->u.inuse.type_info & PGT_va_mask;
3546 if ( unlikely(l2_idx >= PGT_va_unknown) )
3547 goto emulate; /* Urk! This L1 is mapped in multiple L2 slots! */
3548 l2_idx >>= PGT_va_shift;
3550 if ( unlikely(l2_idx == l2_linear_offset(addr)) )
3551 goto emulate; /* Urk! Pagetable maps itself! */
3553 /*
3554 * Is the L1 p.t. mapped into the current address space? If so we call it
3555 * an ACTIVE p.t., otherwise it is INACTIVE.
3556 */
3557 pl2e = &__linear_l2_table[l2_idx];
3558 which = PTWR_PT_INACTIVE;
3560 if ( (__get_user(l2e.l2, &pl2e->l2) == 0) && (l2e_get_pfn(l2e) == pfn) )
3562 /*
3563 * Check the PRESENT bit to set ACTIVE mode.
3564 * If the PRESENT bit is clear, we may be conflicting with the current
3565 * ACTIVE p.t. (it may be the same p.t. mapped at another virt addr).
3566 * The ptwr_flush call below will restore the PRESENT bit.
3567 */
3568 if ( likely(l2e_get_flags(l2e) & _PAGE_PRESENT) ||
3569 (d->arch.ptwr[PTWR_PT_ACTIVE].l1va &&
3570 (l2_idx == d->arch.ptwr[PTWR_PT_ACTIVE].l2_idx)) )
3571 which = PTWR_PT_ACTIVE;
3574 /*
3575 * Multi-processor guest? Then ensure that the page table is hooked into
3576 * at most one L2, and also ensure that there is only one mapping of the
3577 * page table itself (or there can be conflicting writable mappings from
3578 * other VCPUs).
3579 */
3580 if ( d->vcpu[0]->next_in_list != NULL )
3582 if ( /* Hooked into at most one L2 table (which this VCPU maps)? */
3583 ((page->u.inuse.type_info & PGT_count_mask) !=
3584 (!!(page->u.inuse.type_info & PGT_pinned) +
3585 (which == PTWR_PT_ACTIVE))) ||
3586 /* PTEs are mapped read-only in only one place? */
3587 ((page->count_info & PGC_count_mask) !=
3588 (!!(page->count_info & PGC_allocated) + /* alloc count */
3589 (page->u.inuse.type_info & PGT_count_mask) + /* type count */
3590 1)) ) /* map count */
3592 /* Could be conflicting writable mappings from other VCPUs. */
3593 cleanup_writable_pagetable(d);
3594 goto emulate;
3598 /*
3599 * We only allow one ACTIVE and one INACTIVE p.t. to be updated at a
3600 * time. If there is already one, we must flush it out.
3601 */
3602 if ( d->arch.ptwr[which].l1va )
3603 ptwr_flush(d, which);
3605 /*
3606 * If last batch made no updates then we are probably stuck. Emulate this
3607 * update to ensure we make progress.
3608 */
3609 if ( d->arch.ptwr[which].prev_nr_updates == 0 )
3611 /* Ensure that we don't get stuck in an emulation-only rut. */
3612 d->arch.ptwr[which].prev_nr_updates = 1;
3613 goto emulate;
3616 PTWR_PRINTK("[%c] batched ptwr_page_fault at va %lx, pt for %08lx, "
3617 "pfn %lx\n", PTWR_PRINT_WHICH, addr,
3618 l2_idx << L2_PAGETABLE_SHIFT, pfn);
3620 d->arch.ptwr[which].l1va = addr | 1;
3621 d->arch.ptwr[which].l2_idx = l2_idx;
3622 d->arch.ptwr[which].vcpu = current;
3624 #ifdef PERF_ARRAYS
3625 d->arch.ptwr[which].eip = regs->eip;
3626 #endif
3628 /* For safety, disconnect the L1 p.t. page from current space. */
3629 if ( which == PTWR_PT_ACTIVE )
3631 l2e_remove_flags(*pl2e, _PAGE_PRESENT);
3632 flush_tlb_mask(d->domain_dirty_cpumask);
3635 /* Temporarily map the L1 page, and make a copy of it. */
3636 pl1e = map_domain_page(pfn);
3637 memcpy(d->arch.ptwr[which].page, pl1e, PAGE_SIZE);
3638 unmap_domain_page(pl1e);
3640 /* Finally, make the p.t. page writable by the guest OS. */
3641 l1e_add_flags(pte, _PAGE_RW);
3642 if ( unlikely(__put_user(pte.l1,
3643 &linear_pg_table[l1_linear_offset(addr)].l1)) )
3645 MEM_LOG("ptwr: Could not update pte at %p", (unsigned long *)
3646 &linear_pg_table[l1_linear_offset(addr)]);
3647 /* Toss the writable pagetable state and crash. */
3648 d->arch.ptwr[which].l1va = 0;
3649 domain_crash(d);
3650 return 0;
3653 return EXCRET_fault_fixed;
3655 emulate:
3656 emul_ctxt.regs = guest_cpu_user_regs();
3657 emul_ctxt.cr2 = addr;
3658 emul_ctxt.mode = X86EMUL_MODE_HOST;
3659 if ( x86_emulate_memop(&emul_ctxt, &ptwr_emulate_ops) )
3660 return 0;
3661 perfc_incrc(ptwr_emulations);
3662 return EXCRET_fault_fixed;
3665 int ptwr_init(struct domain *d)
3667 void *x = alloc_xenheap_page();
3668 void *y = alloc_xenheap_page();
3670 if ( (x == NULL) || (y == NULL) )
3672 free_xenheap_page(x);
3673 free_xenheap_page(y);
3674 return -ENOMEM;
3677 d->arch.ptwr[PTWR_PT_ACTIVE].page = x;
3678 d->arch.ptwr[PTWR_PT_INACTIVE].page = y;
3680 return 0;
3683 void ptwr_destroy(struct domain *d)
3685 LOCK_BIGLOCK(d);
3686 cleanup_writable_pagetable(d);
3687 UNLOCK_BIGLOCK(d);
3688 free_xenheap_page(d->arch.ptwr[PTWR_PT_ACTIVE].page);
3689 free_xenheap_page(d->arch.ptwr[PTWR_PT_INACTIVE].page);
3692 void cleanup_writable_pagetable(struct domain *d)
3694 if ( unlikely(!VM_ASSIST(d, VMASST_TYPE_writable_pagetables)) )
3695 return;
3697 if ( unlikely(shadow_mode_enabled(d)) )
3699 shadow_sync_all(d);
3701 else
3703 if ( d->arch.ptwr[PTWR_PT_ACTIVE].l1va )
3704 ptwr_flush(d, PTWR_PT_ACTIVE);
3705 if ( d->arch.ptwr[PTWR_PT_INACTIVE].l1va )
3706 ptwr_flush(d, PTWR_PT_INACTIVE);
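/*
 * map_pages_to_xen() creates mappings in Xen's own address space: nr_mfns
 * machine frames starting at 'mfn' are mapped at 'virt' with the given
 * flags, using superpages whenever virt and mfn are suitably aligned and
 * MAP_SMALL_PAGES is not requested. Illustrative call, mirroring
 * __set_fixmap() below:
 *
 *     map_pages_to_xen(fix_to_virt(idx), mfn, 1, __PAGE_HYPERVISOR);
 */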
3710 int map_pages_to_xen(
3711 unsigned long virt,
3712 unsigned long mfn,
3713 unsigned long nr_mfns,
3714 unsigned long flags)
3716 l2_pgentry_t *pl2e, ol2e;
3717 l1_pgentry_t *pl1e, ol1e;
3718 unsigned int i;
3720 unsigned int map_small_pages = !!(flags & MAP_SMALL_PAGES);
3721 flags &= ~MAP_SMALL_PAGES;
3723 while ( nr_mfns != 0 )
3725 pl2e = virt_to_xen_l2e(virt);
3727 if ( ((((virt>>PAGE_SHIFT) | mfn) & ((1<<PAGETABLE_ORDER)-1)) == 0) &&
3728 (nr_mfns >= (1<<PAGETABLE_ORDER)) &&
3729 !map_small_pages )
3731 /* Super-page mapping. */
3732 ol2e = *pl2e;
3733 *pl2e = l2e_from_pfn(mfn, flags|_PAGE_PSE);
3735 if ( (l2e_get_flags(ol2e) & _PAGE_PRESENT) )
3737 local_flush_tlb_pge();
3738 if ( !(l2e_get_flags(ol2e) & _PAGE_PSE) )
3739 free_xen_pagetable(l2e_get_page(ol2e));
3742 virt += 1UL << L2_PAGETABLE_SHIFT;
3743 mfn += 1UL << PAGETABLE_ORDER;
3744 nr_mfns -= 1UL << PAGETABLE_ORDER;
3746 else
3748 /* Normal page mapping. */
3749 if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) )
3751 pl1e = page_to_virt(alloc_xen_pagetable());
3752 clear_page(pl1e);
3753 *pl2e = l2e_from_page(virt_to_page(pl1e), __PAGE_HYPERVISOR);
3755 else if ( l2e_get_flags(*pl2e) & _PAGE_PSE )
3757 pl1e = page_to_virt(alloc_xen_pagetable());
3758 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
3759 pl1e[i] = l1e_from_pfn(
3760 l2e_get_pfn(*pl2e) + i,
3761 l2e_get_flags(*pl2e) & ~_PAGE_PSE);
3762 *pl2e = l2e_from_page(virt_to_page(pl1e), __PAGE_HYPERVISOR);
3763 local_flush_tlb_pge();
3766 pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(virt);
3767 ol1e = *pl1e;
3768 *pl1e = l1e_from_pfn(mfn, flags);
3769 if ( (l1e_get_flags(ol1e) & _PAGE_PRESENT) )
3770 local_flush_tlb_one(virt);
3772 virt += 1UL << L1_PAGETABLE_SHIFT;
3773 mfn += 1UL;
3774 nr_mfns -= 1UL;
3778 return 0;
3781 void __set_fixmap(
3782 enum fixed_addresses idx, unsigned long mfn, unsigned long flags)
3784 BUG_ON(idx >= __end_of_fixed_addresses);
3785 map_pages_to_xen(fix_to_virt(idx), mfn, 1, flags);
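/*
 * MEMORY_GUARD: guard pages are implemented by remapping the range with
 * _PAGE_PRESENT cleared (and MAP_SMALL_PAGES so that any covering superpage
 * is split), turning stray accesses into fatal page faults.
 * memguard_unguard_range() simply restores a present mapping.
 */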
3788 #ifdef MEMORY_GUARD
3790 void memguard_init(void)
3792 map_pages_to_xen(
3793 PAGE_OFFSET, 0, xenheap_phys_end >> PAGE_SHIFT,
3794 __PAGE_HYPERVISOR|MAP_SMALL_PAGES);
3797 static void __memguard_change_range(void *p, unsigned long l, int guard)
3799 unsigned long _p = (unsigned long)p;
3800 unsigned long _l = (unsigned long)l;
3801 unsigned long flags = __PAGE_HYPERVISOR | MAP_SMALL_PAGES;
3803 /* Ensure we are dealing with a page-aligned whole number of pages. */
3804 ASSERT((_p&PAGE_MASK) != 0);
3805 ASSERT((_l&PAGE_MASK) != 0);
3806 ASSERT((_p&~PAGE_MASK) == 0);
3807 ASSERT((_l&~PAGE_MASK) == 0);
3809 if ( guard )
3810 flags &= ~_PAGE_PRESENT;
3812 map_pages_to_xen(
3813 _p, virt_to_maddr(p) >> PAGE_SHIFT, _l >> PAGE_SHIFT, flags);
3816 void memguard_guard_range(void *p, unsigned long l)
3818 __memguard_change_range(p, l, 1);
3821 void memguard_unguard_range(void *p, unsigned long l)
3823 __memguard_change_range(p, l, 0);
3826 #endif
3828 /*
3829 * Local variables:
3830 * mode: C
3831 * c-set-style: "BSD"
3832 * c-basic-offset: 4
3833 * tab-width: 4
3834 * indent-tabs-mode: nil
3835 * End:
3836 */