ia64/xen-unstable

view xen/arch/x86/mm.c @ 9917:42c73f3d7ac1

This patch changes the format in which EIP is printed so that it is consistent on 64-bit regardless of whether the
address's leading bits are set or clear. It additionally changes the formatting so that trailing spaces are avoided,
and it suppresses the printing of empty records. It also prevents combining records with identical EIP but from
different domains.

It further changes the type of some variables from plain int to unsigned int, as that yields more efficient code
on x86-64 (signed 32-bit array indices require explicit sign extension, whereas in most cases an extra copy can be
avoided when the index type is unsigned, since all 32-bit operations already zero-extend their results).
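
A minimal sketch of the code-generation argument (not code from the patch): with a signed 32-bit index the compiler
must sign-extend it (e.g. movslq) before adding it to a 64-bit pointer, whereas a 32-bit unsigned index has already
been zero-extended by whatever 32-bit operation produced it, so that extra instruction or register copy can usually
be avoided.

    /* Signed index: typically compiles to "movslq %esi,%rsi" plus the load. */
    unsigned long load_signed(unsigned long *tab, int i)
    {
        return tab[i];
    }

    /* Unsigned index: the 32-bit value is already zero-extended, so the
     * register can usually be used in the address calculation as-is. */
    unsigned long load_unsigned(unsigned long *tab, unsigned int i)
    {
        return tab[i];
    }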

Signed-off-by: Jan Beulich <jbeulich@novell.com>
author kaf24@firebug.cl.cam.ac.uk
date Tue May 02 15:36:07 2006 +0100 (2006-05-02)
parents 4e1b8be54311
children 810ad61870e8
line source
1 /******************************************************************************
2 * arch/x86/mm.c
3 *
4 * Copyright (c) 2002-2005 K A Fraser
5 * Copyright (c) 2004 Christian Limpach
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
22 /*
23 * A description of the x86 page table API:
24 *
25 * Domains trap to do_mmu_update with a list of update requests.
26 * This is a list of (ptr, val) pairs, where the requested operation
27 * is *ptr = val.
28 *
29 * Reference counting of pages:
30 * ----------------------------
31 * Each page has two refcounts: tot_count and type_count.
32 *
33 * TOT_COUNT is the obvious reference count. It counts all uses of a
34 * physical page frame by a domain, including uses as a page directory,
35 * a page table, or simple mappings via a PTE. This count prevents a
36 * domain from releasing a frame back to the free pool when it still holds
37 * a reference to it.
38 *
39 * TYPE_COUNT is more subtle. A frame can be put to one of three
40 * mutually-exclusive uses: it might be used as a page directory, or a
41 * page table, or it may be mapped writable by the domain [of course, a
42 * frame may not be used in any of these three ways!].
43 * So, type_count is a count of the number of times a frame is being
44 * referred to in its current incarnation. Therefore, a page can only
45 * change its type when its type count is zero.
46 *
47 * Pinning the page type:
48 * ----------------------
49 * The type of a page can be pinned/unpinned with the commands
50 * MMUEXT_[UN]PIN_L?_TABLE. Each page can be pinned exactly once (that is,
51 * pinning is not reference counted, so it can't be nested).
52 * This is useful to prevent a page's type count falling to zero, at which
53 * point safety checks would need to be carried out next time the count
54 * is increased again.
55 *
56 * A further note on writable page mappings:
57 * -----------------------------------------
58 * For simplicity, the count of writable mappings for a page may not
59 * correspond to reality. The 'writable count' is incremented for every
60 * PTE which maps the page with the _PAGE_RW flag set. However, for
61 * write access to be possible the page directory entry must also have
62 * its _PAGE_RW bit set. We do not check this as it complicates the
63 * reference counting considerably [consider the case of multiple
64 * directory entries referencing a single page table, some with the RW
65 * bit set, others not -- it starts getting a bit messy].
66 * In normal use, this simplification shouldn't be a problem.
67 * However, the logic can be added if required.
68 *
69 * One more note on read-only page mappings:
70 * -----------------------------------------
71 * We want domains to be able to map pages for read-only access. The
72 * main reason is that page tables and directories should be readable
73 * by a domain, but it would not be safe for them to be writable.
74 * However, domains have free access to rings 1 & 2 of the Intel
75 * privilege model. In terms of page protection, these are considered
76 * to be part of 'supervisor mode'. The WP bit in CR0 controls whether
77 * read-only restrictions are respected in supervisor mode -- if the
78 * bit is clear then any mapped page is writable.
79 *
80 * We get round this by always setting the WP bit and disallowing
81 * updates to it. This is very unlikely to cause a problem for guest
82 * OS's, which will generally use the WP bit to simplify copy-on-write
83 * implementation (in that case, OS wants a fault when it writes to
84 * an application-supplied buffer).
85 */
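
To make the (ptr, val) interface described in the comment above concrete, here is a guest-side sketch of issuing a
single MMU_NORMAL_PT_UPDATE request, i.e. asking Xen to perform the checked equivalent of *ptr = val. It is not part
of mm.c; it assumes the public struct mmu_update, MMU_NORMAL_PT_UPDATE and DOMID_SELF definitions and a guest-side
HYPERVISOR_mmu_update() hypercall wrapper.

    /* Illustrative guest code: ask Xen to update one of our own PTEs. */
    static int example_set_pte(uint64_t pte_machine_addr, uint64_t new_pte_val)
    {
        struct mmu_update req;

        /* The low bits of ptr encode the command; the remainder is the
         * machine address of the PTE to be written. */
        req.ptr = pte_machine_addr | MMU_NORMAL_PT_UPDATE;
        req.val = new_pte_val;

        /* One request, no success-count pointer, acting on our own pages. */
        return HYPERVISOR_mmu_update(&req, 1, NULL, DOMID_SELF);
    }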
87 #include <xen/config.h>
88 #include <xen/init.h>
89 #include <xen/kernel.h>
90 #include <xen/lib.h>
91 #include <xen/mm.h>
92 #include <xen/sched.h>
93 #include <xen/errno.h>
94 #include <xen/perfc.h>
95 #include <xen/irq.h>
96 #include <xen/softirq.h>
97 #include <xen/domain_page.h>
98 #include <xen/event.h>
99 #include <xen/iocap.h>
100 #include <xen/guest_access.h>
101 #include <asm/shadow.h>
102 #include <asm/page.h>
103 #include <asm/flushtlb.h>
104 #include <asm/io.h>
105 #include <asm/ldt.h>
106 #include <asm/x86_emulate.h>
107 #include <public/memory.h>
109 #ifdef VERBOSE
110 #define MEM_LOG(_f, _a...) \
111 printk("DOM%u: (file=mm.c, line=%d) " _f "\n", \
112 current->domain->domain_id , __LINE__ , ## _a )
113 #else
114 #define MEM_LOG(_f, _a...) ((void)0)
115 #endif
117 /*
118 * Both do_mmuext_op() and do_mmu_update():
119 * We steal the m.s.b. of the @count parameter to indicate whether this
120 * invocation of do_mmu_update() is resuming a previously preempted call.
121 */
122 #define MMU_UPDATE_PREEMPTED (~(~0U>>1))
124 static void free_l2_table(struct page_info *page);
125 static void free_l1_table(struct page_info *page);
127 static int mod_l2_entry(l2_pgentry_t *, l2_pgentry_t, unsigned long,
128 unsigned long type);
129 static int mod_l1_entry(l1_pgentry_t *, l1_pgentry_t);
131 /* Used to defer flushing of memory structures. */
132 static struct {
133 #define DOP_FLUSH_TLB (1<<0) /* Flush the local TLB. */
134 #define DOP_FLUSH_ALL_TLBS (1<<1) /* Flush TLBs of all VCPUs of current dom. */
135 #define DOP_RELOAD_LDT (1<<2) /* Reload the LDT shadow mapping. */
136 unsigned int deferred_ops;
137 /* If non-NULL, specifies a foreign subject domain for some operations. */
138 struct domain *foreign;
139 } __cacheline_aligned percpu_info[NR_CPUS];
141 /*
142 * Returns the current foreign domain; defaults to the currently-executing
143 * domain if a foreign override hasn't been specified.
144 */
145 #define FOREIGNDOM (percpu_info[smp_processor_id()].foreign ?: current->domain)
147 /* Private domain structs for DOMID_XEN and DOMID_IO. */
148 static struct domain *dom_xen, *dom_io;
150 /* Frame table and its size in pages. */
151 struct page_info *frame_table;
152 unsigned long max_page;
153 unsigned long total_pages;
155 void __init init_frametable(void)
156 {
157 unsigned long nr_pages, page_step, i, mfn;
159 frame_table = (struct page_info *)FRAMETABLE_VIRT_START;
161 nr_pages = PFN_UP(max_page * sizeof(*frame_table));
162 page_step = (1 << L2_PAGETABLE_SHIFT) >> PAGE_SHIFT;
164 for ( i = 0; i < nr_pages; i += page_step )
165 {
166 mfn = alloc_boot_pages(min(nr_pages - i, page_step), page_step);
167 if ( mfn == 0 )
168 panic("Not enough memory for frame table\n");
169 map_pages_to_xen(
170 FRAMETABLE_VIRT_START + (i << PAGE_SHIFT),
171 mfn, page_step, PAGE_HYPERVISOR);
172 }
174 memset(frame_table, 0, nr_pages << PAGE_SHIFT);
175 }
177 void arch_init_memory(void)
178 {
179 extern void subarch_init_memory(void);
181 unsigned long i, pfn, rstart_pfn, rend_pfn;
183 memset(percpu_info, 0, sizeof(percpu_info));
185 /*
186 * Initialise our DOMID_XEN domain.
187 * Any Xen-heap pages that we will allow to be mapped will have
188 * their domain field set to dom_xen.
189 */
190 dom_xen = alloc_domain();
191 spin_lock_init(&dom_xen->page_alloc_lock);
192 atomic_set(&dom_xen->refcnt, 1);
193 dom_xen->domain_id = DOMID_XEN;
195 /*
196 * Initialise our DOMID_IO domain.
197 * This domain owns I/O pages that are within the range of the page_info
198 * array. Mappings occur at the priv of the caller.
199 */
200 dom_io = alloc_domain();
201 spin_lock_init(&dom_io->page_alloc_lock);
202 atomic_set(&dom_io->refcnt, 1);
203 dom_io->domain_id = DOMID_IO;
205 /* First 1MB of RAM is historically marked as I/O. */
206 for ( i = 0; i < 0x100; i++ )
207 share_xen_page_with_guest(mfn_to_page(i), dom_io, XENSHARE_writable);
209 /* Any areas not specified as RAM by the e820 map are considered I/O. */
210 for ( i = 0, pfn = 0; i < e820.nr_map; i++ )
211 {
212 if ( e820.map[i].type != E820_RAM )
213 continue;
214 /* Every page from cursor to start of next RAM region is I/O. */
215 rstart_pfn = PFN_UP(e820.map[i].addr);
216 rend_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
217 for ( ; pfn < rstart_pfn; pfn++ )
218 {
219 BUG_ON(!mfn_valid(pfn));
220 share_xen_page_with_guest(
221 mfn_to_page(pfn), dom_io, XENSHARE_writable);
222 }
223 /* Skip the RAM region. */
224 pfn = rend_pfn;
225 }
226 BUG_ON(pfn != max_page);
228 subarch_init_memory();
229 }
231 void share_xen_page_with_guest(
232 struct page_info *page, struct domain *d, int readonly)
233 {
234 if ( page_get_owner(page) == d )
235 return;
237 set_gpfn_from_mfn(page_to_mfn(page), INVALID_M2P_ENTRY);
239 spin_lock(&d->page_alloc_lock);
241 /* The incremented type count pins as writable or read-only. */
242 page->u.inuse.type_info = (readonly ? PGT_none : PGT_writable_page);
243 page->u.inuse.type_info |= PGT_validated | 1;
245 page_set_owner(page, d);
246 wmb(); /* install valid domain ptr before updating refcnt. */
247 ASSERT(page->count_info == 0);
248 page->count_info |= PGC_allocated | 1;
250 if ( unlikely(d->xenheap_pages++ == 0) )
251 get_knownalive_domain(d);
252 list_add_tail(&page->list, &d->xenpage_list);
254 spin_unlock(&d->page_alloc_lock);
255 }
257 void share_xen_page_with_privileged_guests(
258 struct page_info *page, int readonly)
259 {
260 share_xen_page_with_guest(page, dom_xen, readonly);
261 }
263 void write_ptbase(struct vcpu *v)
264 {
265 write_cr3(pagetable_get_paddr(v->arch.monitor_table));
266 }
268 void invalidate_shadow_ldt(struct vcpu *v)
269 {
270 int i;
271 unsigned long pfn;
272 struct page_info *page;
274 if ( v->arch.shadow_ldt_mapcnt == 0 )
275 return;
277 v->arch.shadow_ldt_mapcnt = 0;
279 for ( i = 16; i < 32; i++ )
280 {
281 pfn = l1e_get_pfn(v->arch.perdomain_ptes[i]);
282 if ( pfn == 0 ) continue;
283 v->arch.perdomain_ptes[i] = l1e_empty();
284 page = mfn_to_page(pfn);
285 ASSERT_PAGE_IS_TYPE(page, PGT_ldt_page);
286 ASSERT_PAGE_IS_DOMAIN(page, v->domain);
287 put_page_and_type(page);
288 }
290 /* Dispose of the (now possibly invalid) mappings from the TLB. */
291 percpu_info[v->processor].deferred_ops |= DOP_FLUSH_TLB | DOP_RELOAD_LDT;
292 }
295 static int alloc_segdesc_page(struct page_info *page)
296 {
297 struct desc_struct *descs;
298 int i;
300 descs = map_domain_page(page_to_mfn(page));
302 for ( i = 0; i < 512; i++ )
303 if ( unlikely(!check_descriptor(&descs[i])) )
304 goto fail;
306 unmap_domain_page(descs);
307 return 1;
309 fail:
310 unmap_domain_page(descs);
311 return 0;
312 }
315 /* Map shadow page at offset @off. */
316 int map_ldt_shadow_page(unsigned int off)
317 {
318 struct vcpu *v = current;
319 struct domain *d = v->domain;
320 unsigned long gmfn, mfn;
321 l1_pgentry_t l1e, nl1e;
322 unsigned long gva = v->arch.guest_context.ldt_base + (off << PAGE_SHIFT);
323 int res;
325 #if defined(__x86_64__)
326 /* If in user mode, switch to kernel mode just to read LDT mapping. */
327 int user_mode = !(v->arch.flags & TF_kernel_mode);
328 #define TOGGLE_MODE() if ( user_mode ) toggle_guest_mode(v)
329 #elif defined(__i386__)
330 #define TOGGLE_MODE() ((void)0)
331 #endif
333 BUG_ON(unlikely(in_irq()));
335 shadow_sync_va(v, gva);
337 TOGGLE_MODE();
338 __copy_from_user(&l1e, &linear_pg_table[l1_linear_offset(gva)],
339 sizeof(l1e));
340 TOGGLE_MODE();
342 if ( unlikely(!(l1e_get_flags(l1e) & _PAGE_PRESENT)) )
343 return 0;
345 gmfn = l1e_get_pfn(l1e);
346 mfn = gmfn_to_mfn(d, gmfn);
347 if ( unlikely(!VALID_MFN(mfn)) )
348 return 0;
350 res = get_page_and_type(mfn_to_page(mfn), d, PGT_ldt_page);
352 if ( !res && unlikely(shadow_mode_refcounts(d)) )
353 {
354 shadow_lock(d);
355 shadow_remove_all_write_access(d, gmfn, mfn);
356 res = get_page_and_type(mfn_to_page(mfn), d, PGT_ldt_page);
357 shadow_unlock(d);
358 }
360 if ( unlikely(!res) )
361 return 0;
363 nl1e = l1e_from_pfn(mfn, l1e_get_flags(l1e) | _PAGE_RW);
365 v->arch.perdomain_ptes[off + 16] = nl1e;
366 v->arch.shadow_ldt_mapcnt++;
368 return 1;
369 }
372 static int get_page_from_pagenr(unsigned long page_nr, struct domain *d)
373 {
374 struct page_info *page = mfn_to_page(page_nr);
376 if ( unlikely(!mfn_valid(page_nr)) || unlikely(!get_page(page, d)) )
377 {
378 MEM_LOG("Could not get page ref for pfn %lx", page_nr);
379 return 0;
380 }
382 return 1;
383 }
386 static int get_page_and_type_from_pagenr(unsigned long page_nr,
387 unsigned long type,
388 struct domain *d)
389 {
390 struct page_info *page = mfn_to_page(page_nr);
392 if ( unlikely(!get_page_from_pagenr(page_nr, d)) )
393 return 0;
395 if ( unlikely(!get_page_type(page, type)) )
396 {
397 put_page(page);
398 return 0;
399 }
401 return 1;
402 }
404 /*
405 * We allow root tables to map each other (a.k.a. linear page tables). It
406 * needs some special care with reference counts and access permissions:
407 * 1. The mapping entry must be read-only, or the guest may get write access
408 * to its own PTEs.
409 * 2. We must only bump the reference counts for an *already validated*
410 * L2 table, or we can end up in a deadlock in get_page_type() by waiting
411 * on a validation that is required to complete that validation.
412 * 3. We only need to increment the reference counts for the mapped page
413 * frame if it is mapped by a different root table. This is sufficient and
414 * also necessary to allow validation of a root table mapping itself.
415 */
416 static int
417 get_linear_pagetable(
418 root_pgentry_t re, unsigned long re_pfn, struct domain *d)
419 {
420 unsigned long x, y;
421 struct page_info *page;
422 unsigned long pfn;
424 ASSERT( !shadow_mode_refcounts(d) );
426 if ( (root_get_flags(re) & _PAGE_RW) )
427 {
428 MEM_LOG("Attempt to create linear p.t. with write perms");
429 return 0;
430 }
432 if ( (pfn = root_get_pfn(re)) != re_pfn )
433 {
434 /* Make sure the mapped frame belongs to the correct domain. */
435 if ( unlikely(!get_page_from_pagenr(pfn, d)) )
436 return 0;
438 /*
439 * Make sure that the mapped frame is an already-validated L2 table.
440 * If so, atomically increment the count (checking for overflow).
441 */
442 page = mfn_to_page(pfn);
443 y = page->u.inuse.type_info;
444 do {
445 x = y;
446 if ( unlikely((x & PGT_count_mask) == PGT_count_mask) ||
447 unlikely((x & (PGT_type_mask|PGT_validated)) !=
448 (PGT_root_page_table|PGT_validated)) )
449 {
450 put_page(page);
451 return 0;
452 }
453 }
454 while ( (y = cmpxchg(&page->u.inuse.type_info, x, x + 1)) != x );
455 }
457 return 1;
458 }
460 int
461 get_page_from_l1e(
462 l1_pgentry_t l1e, struct domain *d)
463 {
464 unsigned long mfn = l1e_get_pfn(l1e);
465 struct page_info *page = mfn_to_page(mfn);
466 int okay;
468 if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) )
469 return 1;
471 if ( unlikely(l1e_get_flags(l1e) & L1_DISALLOW_MASK) )
472 {
473 MEM_LOG("Bad L1 flags %x", l1e_get_flags(l1e) & L1_DISALLOW_MASK);
474 return 0;
475 }
477 if ( unlikely(!mfn_valid(mfn)) ||
478 unlikely(page_get_owner(page) == dom_io) )
479 {
480 /* DOMID_IO reverts to caller for privilege checks. */
481 if ( d == dom_io )
482 d = current->domain;
484 if ( !iomem_access_permitted(d, mfn, mfn) )
485 {
486 MEM_LOG("Non-privileged attempt to map I/O space %08lx", mfn);
487 return 0;
488 }
490 /* No reference counting for out-of-range I/O pages. */
491 if ( !mfn_valid(mfn) )
492 return 1;
494 d = dom_io;
495 }
497 okay = ((l1e_get_flags(l1e) & _PAGE_RW) ?
498 get_page_and_type(page, d, PGT_writable_page) :
499 get_page(page, d));
500 if ( !okay )
501 {
502 MEM_LOG("Error getting mfn %lx (pfn %lx) from L1 entry %" PRIpte
503 " for dom%d",
504 mfn, get_gpfn_from_mfn(mfn),
505 l1e_get_intpte(l1e), d->domain_id);
506 }
508 return okay;
509 }
512 /* NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. */
513 static int
514 get_page_from_l2e(
515 l2_pgentry_t l2e, unsigned long pfn,
516 struct domain *d, unsigned long vaddr)
517 {
518 int rc;
520 ASSERT(!shadow_mode_refcounts(d));
522 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
523 return 1;
525 if ( unlikely((l2e_get_flags(l2e) & L2_DISALLOW_MASK)) )
526 {
527 MEM_LOG("Bad L2 flags %x", l2e_get_flags(l2e) & L2_DISALLOW_MASK);
528 return 0;
529 }
531 vaddr >>= L2_PAGETABLE_SHIFT;
532 vaddr <<= PGT_va_shift;
533 rc = get_page_and_type_from_pagenr(
534 l2e_get_pfn(l2e), PGT_l1_page_table | vaddr, d);
535 #if CONFIG_PAGING_LEVELS == 2
536 if ( unlikely(!rc) )
537 rc = get_linear_pagetable(l2e, pfn, d);
538 #endif
539 return rc;
540 }
543 #if CONFIG_PAGING_LEVELS >= 3
544 static int
545 get_page_from_l3e(
546 l3_pgentry_t l3e, unsigned long pfn,
547 struct domain *d, unsigned long vaddr)
548 {
549 int rc;
551 ASSERT(!shadow_mode_refcounts(d));
553 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
554 return 1;
556 if ( unlikely((l3e_get_flags(l3e) & L3_DISALLOW_MASK)) )
557 {
558 MEM_LOG("Bad L3 flags %x", l3e_get_flags(l3e) & L3_DISALLOW_MASK);
559 return 0;
560 }
562 vaddr >>= L3_PAGETABLE_SHIFT;
563 vaddr <<= PGT_va_shift;
564 rc = get_page_and_type_from_pagenr(
565 l3e_get_pfn(l3e),
566 PGT_l2_page_table | vaddr, d);
567 #if CONFIG_PAGING_LEVELS == 3
568 if ( unlikely(!rc) )
569 rc = get_linear_pagetable(l3e, pfn, d);
570 #endif
571 return rc;
572 }
573 #endif /* 3 level */
575 #if CONFIG_PAGING_LEVELS >= 4
576 static int
577 get_page_from_l4e(
578 l4_pgentry_t l4e, unsigned long pfn,
579 struct domain *d, unsigned long vaddr)
580 {
581 int rc;
583 ASSERT( !shadow_mode_refcounts(d) );
585 if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
586 return 1;
588 if ( unlikely((l4e_get_flags(l4e) & L4_DISALLOW_MASK)) )
589 {
590 MEM_LOG("Bad L4 flags %x", l4e_get_flags(l4e) & L4_DISALLOW_MASK);
591 return 0;
592 }
594 vaddr >>= L4_PAGETABLE_SHIFT;
595 vaddr <<= PGT_va_shift;
596 rc = get_page_and_type_from_pagenr(
597 l4e_get_pfn(l4e),
598 PGT_l3_page_table | vaddr, d);
600 if ( unlikely(!rc) )
601 rc = get_linear_pagetable(l4e, pfn, d);
603 return rc;
604 }
605 #endif /* 4 level */
608 void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d)
609 {
610 unsigned long pfn = l1e_get_pfn(l1e);
611 struct page_info *page = mfn_to_page(pfn);
612 struct domain *e;
613 struct vcpu *v;
615 if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) || !mfn_valid(pfn) )
616 return;
618 e = page_get_owner(page);
620 /*
621 * Check if this is a mapping that was established via a grant reference.
622 * If it was then we should not be here: we require that such mappings are
623 * explicitly destroyed via the grant-table interface.
624 *
625 * The upshot of this is that the guest can end up with active grants that
626 * it cannot destroy (because it no longer has a PTE to present to the
627 * grant-table interface). This can lead to subtle hard-to-catch bugs,
628 * hence a special grant PTE flag can be enabled to catch the bug early.
629 *
630 * (Note that the undestroyable active grants are not a security hole in
631 * Xen. All active grants can safely be cleaned up when the domain dies.)
632 */
633 if ( (l1e_get_flags(l1e) & _PAGE_GNTTAB) &&
634 !(d->domain_flags & (DOMF_shutdown|DOMF_dying)) )
635 {
636 MEM_LOG("Attempt to implicitly unmap a granted PTE %" PRIpte,
637 l1e_get_intpte(l1e));
638 domain_crash(d);
639 }
641 if ( l1e_get_flags(l1e) & _PAGE_RW )
642 {
643 put_page_and_type(page);
644 }
645 else
646 {
647 /* We expect this is rare so we blow the entire shadow LDT. */
648 if ( unlikely(((page->u.inuse.type_info & PGT_type_mask) ==
649 PGT_ldt_page)) &&
650 unlikely(((page->u.inuse.type_info & PGT_count_mask) != 0)) &&
651 (d == e) )
652 {
653 for_each_vcpu ( d, v )
654 invalidate_shadow_ldt(v);
655 }
656 put_page(page);
657 }
658 }
661 /*
662 * NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'.
663 * Note also that this automatically deals correctly with linear p.t.'s.
664 */
665 static void put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
666 {
667 if ( (l2e_get_flags(l2e) & _PAGE_PRESENT) &&
668 (l2e_get_pfn(l2e) != pfn) )
669 put_page_and_type(mfn_to_page(l2e_get_pfn(l2e)));
670 }
673 #if CONFIG_PAGING_LEVELS >= 3
674 static void put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn)
675 {
676 if ( (l3e_get_flags(l3e) & _PAGE_PRESENT) &&
677 (l3e_get_pfn(l3e) != pfn) )
678 put_page_and_type(mfn_to_page(l3e_get_pfn(l3e)));
679 }
680 #endif
682 #if CONFIG_PAGING_LEVELS >= 4
683 static void put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn)
684 {
685 if ( (l4e_get_flags(l4e) & _PAGE_PRESENT) &&
686 (l4e_get_pfn(l4e) != pfn) )
687 put_page_and_type(mfn_to_page(l4e_get_pfn(l4e)));
688 }
689 #endif
691 static int alloc_l1_table(struct page_info *page)
692 {
693 struct domain *d = page_get_owner(page);
694 unsigned long pfn = page_to_mfn(page);
695 l1_pgentry_t *pl1e;
696 int i;
698 ASSERT(!shadow_mode_refcounts(d));
700 pl1e = map_domain_page(pfn);
702 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
703 if ( is_guest_l1_slot(i) &&
704 unlikely(!get_page_from_l1e(pl1e[i], d)) )
705 goto fail;
707 unmap_domain_page(pl1e);
708 return 1;
710 fail:
711 MEM_LOG("Failure in alloc_l1_table: entry %d", i);
712 while ( i-- > 0 )
713 if ( is_guest_l1_slot(i) )
714 put_page_from_l1e(pl1e[i], d);
716 unmap_domain_page(pl1e);
717 return 0;
718 }
720 #ifdef CONFIG_X86_PAE
721 static int create_pae_xen_mappings(l3_pgentry_t *pl3e)
722 {
723 struct page_info *page;
724 l2_pgentry_t *pl2e;
725 l3_pgentry_t l3e3;
726 int i;
728 pl3e = (l3_pgentry_t *)((unsigned long)pl3e & PAGE_MASK);
730 /* 3rd L3 slot contains L2 with Xen-private mappings. It *must* exist. */
731 l3e3 = pl3e[3];
732 if ( !(l3e_get_flags(l3e3) & _PAGE_PRESENT) )
733 {
734 MEM_LOG("PAE L3 3rd slot is empty");
735 return 0;
736 }
738 /*
739 * The Xen-private mappings include linear mappings. The L2 thus cannot
740 * be shared by multiple L3 tables. The test here is adequate because:
741 * 1. Cannot appear in slots != 3 because the page would then have
742 * unknown va backpointer, which get_page_type() explicitly disallows.
743 * 2. Cannot appear in another page table's L3:
744 * a. alloc_l3_table() calls this function and this check will fail
745 * b. mod_l3_entry() disallows updates to slot 3 in an existing table
746 */
747 page = l3e_get_page(l3e3);
748 BUG_ON(page->u.inuse.type_info & PGT_pinned);
749 BUG_ON((page->u.inuse.type_info & PGT_count_mask) == 0);
750 if ( (page->u.inuse.type_info & PGT_count_mask) != 1 )
751 {
752 MEM_LOG("PAE L3 3rd slot is shared");
753 return 0;
754 }
756 /* Xen private mappings. */
757 pl2e = map_domain_page(l3e_get_pfn(l3e3));
758 memcpy(&pl2e[L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1)],
759 &idle_pg_table_l2[L2_PAGETABLE_FIRST_XEN_SLOT],
760 L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
761 for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
762 pl2e[l2_table_offset(PERDOMAIN_VIRT_START) + i] =
763 l2e_from_page(
764 virt_to_page(page_get_owner(page)->arch.mm_perdomain_pt) + i,
765 __PAGE_HYPERVISOR);
766 for ( i = 0; i < (LINEARPT_MBYTES >> (L2_PAGETABLE_SHIFT - 20)); i++ )
767 pl2e[l2_table_offset(LINEAR_PT_VIRT_START) + i] =
768 (l3e_get_flags(pl3e[i]) & _PAGE_PRESENT) ?
769 l2e_from_pfn(l3e_get_pfn(pl3e[i]), __PAGE_HYPERVISOR) :
770 l2e_empty();
771 unmap_domain_page(pl2e);
773 return 1;
774 }
776 static inline int l1_backptr(
777 unsigned long *backptr, unsigned long offset_in_l2, unsigned long l2_type)
778 {
779 unsigned long l2_backptr = l2_type & PGT_va_mask;
780 ASSERT(l2_backptr != PGT_va_unknown);
781 ASSERT(l2_backptr != PGT_va_mutable);
782 *backptr =
783 ((l2_backptr >> PGT_va_shift) << L3_PAGETABLE_SHIFT) |
784 (offset_in_l2 << L2_PAGETABLE_SHIFT);
785 return 1;
786 }
788 #elif CONFIG_X86_64
789 # define create_pae_xen_mappings(pl3e) (1)
791 static inline int l1_backptr(
792 unsigned long *backptr, unsigned long offset_in_l2, unsigned long l2_type)
793 {
794 unsigned long l2_backptr = l2_type & PGT_va_mask;
795 ASSERT(l2_backptr != PGT_va_unknown);
796 ASSERT(l2_backptr != PGT_va_mutable);
797 *backptr = ((l2_backptr >> PGT_va_shift) << L3_PAGETABLE_SHIFT) |
798 (offset_in_l2 << L2_PAGETABLE_SHIFT);
799 return 1;
800 }
802 static inline int l2_backptr(
803 unsigned long *backptr, unsigned long offset_in_l3, unsigned long l3_type)
804 {
805 unsigned long l3_backptr = l3_type & PGT_va_mask;
806 ASSERT(l3_backptr != PGT_va_unknown);
807 ASSERT(l3_backptr != PGT_va_mutable);
808 *backptr = ((l3_backptr >> PGT_va_shift) << L4_PAGETABLE_SHIFT) |
809 (offset_in_l3 << L3_PAGETABLE_SHIFT);
810 return 1;
811 }
813 static inline int l3_backptr(
814 unsigned long *backptr, unsigned long offset_in_l4, unsigned long l4_type)
815 {
816 *backptr = (offset_in_l4 << L4_PAGETABLE_SHIFT);
817 return 1;
818 }
819 #else
820 # define create_pae_xen_mappings(pl3e) (1)
821 # define l1_backptr(bp,l2o,l2t) \
822 ({ *(bp) = (unsigned long)(l2o) << L2_PAGETABLE_SHIFT; 1; })
823 #endif
825 static int alloc_l2_table(struct page_info *page, unsigned long type)
826 {
827 struct domain *d = page_get_owner(page);
828 unsigned long pfn = page_to_mfn(page);
829 unsigned long vaddr;
830 l2_pgentry_t *pl2e;
831 int i;
833 /* See the code in shadow_promote() to understand why this is here. */
834 if ( (PGT_base_page_table == PGT_l2_page_table) &&
835 unlikely(shadow_mode_refcounts(d)) )
836 return 1;
837 ASSERT(!shadow_mode_refcounts(d));
839 pl2e = map_domain_page(pfn);
841 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
842 {
843 if ( !l1_backptr(&vaddr, i, type) )
844 goto fail;
845 if ( is_guest_l2_slot(type, i) &&
846 unlikely(!get_page_from_l2e(pl2e[i], pfn, d, vaddr)) )
847 goto fail;
848 }
850 #if CONFIG_PAGING_LEVELS == 2
851 /* Xen private mappings. */
852 memcpy(&pl2e[L2_PAGETABLE_FIRST_XEN_SLOT],
853 &idle_pg_table[L2_PAGETABLE_FIRST_XEN_SLOT],
854 L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
855 pl2e[l2_table_offset(LINEAR_PT_VIRT_START)] =
856 l2e_from_pfn(pfn, __PAGE_HYPERVISOR);
857 for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
858 pl2e[l2_table_offset(PERDOMAIN_VIRT_START) + i] =
859 l2e_from_page(
860 virt_to_page(page_get_owner(page)->arch.mm_perdomain_pt) + i,
861 __PAGE_HYPERVISOR);
862 #endif
864 unmap_domain_page(pl2e);
865 return 1;
867 fail:
868 MEM_LOG("Failure in alloc_l2_table: entry %d", i);
869 while ( i-- > 0 )
870 if ( is_guest_l2_slot(type, i) )
871 put_page_from_l2e(pl2e[i], pfn);
873 unmap_domain_page(pl2e);
874 return 0;
875 }
878 #if CONFIG_PAGING_LEVELS >= 3
879 static int alloc_l3_table(struct page_info *page, unsigned long type)
880 {
881 struct domain *d = page_get_owner(page);
882 unsigned long pfn = page_to_mfn(page);
883 unsigned long vaddr;
884 l3_pgentry_t *pl3e;
885 int i;
887 ASSERT(!shadow_mode_refcounts(d));
889 #ifdef CONFIG_X86_PAE
890 if ( pfn >= 0x100000 )
891 {
892 MEM_LOG("PAE pgd must be below 4GB (0x%lx >= 0x100000)", pfn);
893 return 0;
894 }
895 #endif
897 pl3e = map_domain_page(pfn);
898 for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
899 {
900 #if CONFIG_PAGING_LEVELS >= 4
901 if ( !l2_backptr(&vaddr, i, type) )
902 goto fail;
903 #else
904 vaddr = (unsigned long)i << L3_PAGETABLE_SHIFT;
905 #endif
906 if ( is_guest_l3_slot(i) &&
907 unlikely(!get_page_from_l3e(pl3e[i], pfn, d, vaddr)) )
908 goto fail;
909 }
911 if ( !create_pae_xen_mappings(pl3e) )
912 goto fail;
914 unmap_domain_page(pl3e);
915 return 1;
917 fail:
918 MEM_LOG("Failure in alloc_l3_table: entry %d", i);
919 while ( i-- > 0 )
920 if ( is_guest_l3_slot(i) )
921 put_page_from_l3e(pl3e[i], pfn);
923 unmap_domain_page(pl3e);
924 return 0;
925 }
926 #else
927 #define alloc_l3_table(page, type) (0)
928 #endif
930 #if CONFIG_PAGING_LEVELS >= 4
931 static int alloc_l4_table(struct page_info *page, unsigned long type)
932 {
933 struct domain *d = page_get_owner(page);
934 unsigned long pfn = page_to_mfn(page);
935 l4_pgentry_t *pl4e = page_to_virt(page);
936 unsigned long vaddr;
937 int i;
939 /* See the code in shadow_promote() to understand why this is here. */
940 if ( (PGT_base_page_table == PGT_l4_page_table) &&
941 shadow_mode_refcounts(d) )
942 return 1;
943 ASSERT(!shadow_mode_refcounts(d));
945 for ( i = 0; i < L4_PAGETABLE_ENTRIES; i++ )
946 {
947 if ( !l3_backptr(&vaddr, i, type) )
948 goto fail;
950 if ( is_guest_l4_slot(i) &&
951 unlikely(!get_page_from_l4e(pl4e[i], pfn, d, vaddr)) )
952 goto fail;
953 }
955 /* Xen private mappings. */
956 memcpy(&pl4e[ROOT_PAGETABLE_FIRST_XEN_SLOT],
957 &idle_pg_table[ROOT_PAGETABLE_FIRST_XEN_SLOT],
958 ROOT_PAGETABLE_XEN_SLOTS * sizeof(l4_pgentry_t));
959 pl4e[l4_table_offset(LINEAR_PT_VIRT_START)] =
960 l4e_from_pfn(pfn, __PAGE_HYPERVISOR);
961 pl4e[l4_table_offset(PERDOMAIN_VIRT_START)] =
962 l4e_from_page(
963 virt_to_page(page_get_owner(page)->arch.mm_perdomain_l3),
964 __PAGE_HYPERVISOR);
966 return 1;
968 fail:
969 MEM_LOG("Failure in alloc_l4_table: entry %d", i);
970 while ( i-- > 0 )
971 if ( is_guest_l4_slot(i) )
972 put_page_from_l4e(pl4e[i], pfn);
974 return 0;
975 }
976 #else
977 #define alloc_l4_table(page, type) (0)
978 #endif
981 static void free_l1_table(struct page_info *page)
982 {
983 struct domain *d = page_get_owner(page);
984 unsigned long pfn = page_to_mfn(page);
985 l1_pgentry_t *pl1e;
986 int i;
988 pl1e = map_domain_page(pfn);
990 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
991 if ( is_guest_l1_slot(i) )
992 put_page_from_l1e(pl1e[i], d);
994 unmap_domain_page(pl1e);
995 }
998 static void free_l2_table(struct page_info *page)
999 {
1000 unsigned long pfn = page_to_mfn(page);
1001 l2_pgentry_t *pl2e;
1002 int i;
1004 pl2e = map_domain_page(pfn);
1006 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
1007 if ( is_guest_l2_slot(page->u.inuse.type_info, i) )
1008 put_page_from_l2e(pl2e[i], pfn);
1010 unmap_domain_page(pl2e);
1014 #if CONFIG_PAGING_LEVELS >= 3
1016 static void free_l3_table(struct page_info *page)
1018 unsigned long pfn = page_to_mfn(page);
1019 l3_pgentry_t *pl3e;
1020 int i;
1022 pl3e = map_domain_page(pfn);
1024 for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
1025 if ( is_guest_l3_slot(i) )
1026 put_page_from_l3e(pl3e[i], pfn);
1028 unmap_domain_page(pl3e);
1031 #endif
1033 #if CONFIG_PAGING_LEVELS >= 4
1035 static void free_l4_table(struct page_info *page)
1037 unsigned long pfn = page_to_mfn(page);
1038 l4_pgentry_t *pl4e = page_to_virt(page);
1039 int i;
1041 for ( i = 0; i < L4_PAGETABLE_ENTRIES; i++ )
1042 if ( is_guest_l4_slot(i) )
1043 put_page_from_l4e(pl4e[i], pfn);
1046 #endif
1048 static inline int update_l1e(l1_pgentry_t *pl1e,
1049 l1_pgentry_t ol1e,
1050 l1_pgentry_t nl1e)
1052 intpte_t o = l1e_get_intpte(ol1e);
1053 intpte_t n = l1e_get_intpte(nl1e);
1055 if ( unlikely(cmpxchg_user(pl1e, o, n) != 0) ||
1056 unlikely(o != l1e_get_intpte(ol1e)) )
1058 MEM_LOG("Failed to update %" PRIpte " -> %" PRIpte
1059 ": saw %" PRIpte,
1060 l1e_get_intpte(ol1e),
1061 l1e_get_intpte(nl1e),
1062 o);
1063 return 0;
1065 return 1;
1069 /* Update the L1 entry at pl1e to new value nl1e. */
1070 static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e)
1072 l1_pgentry_t ol1e;
1073 struct domain *d = current->domain;
1075 if ( unlikely(__copy_from_user(&ol1e, pl1e, sizeof(ol1e)) != 0) )
1076 return 0;
1078 if ( unlikely(shadow_mode_refcounts(d)) )
1079 return update_l1e(pl1e, ol1e, nl1e);
1081 if ( l1e_get_flags(nl1e) & _PAGE_PRESENT )
1083 if ( unlikely(l1e_get_flags(nl1e) & L1_DISALLOW_MASK) )
1085 MEM_LOG("Bad L1 flags %x",
1086 l1e_get_flags(nl1e) & L1_DISALLOW_MASK);
1087 return 0;
1090 /* Fast path for identical mapping, r/w and presence. */
1091 if ( !l1e_has_changed(ol1e, nl1e, _PAGE_RW | _PAGE_PRESENT))
1092 return update_l1e(pl1e, ol1e, nl1e);
1094 if ( unlikely(!get_page_from_l1e(nl1e, FOREIGNDOM)) )
1095 return 0;
1097 if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
1099 put_page_from_l1e(nl1e, d);
1100 return 0;
1103 else
1105 if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
1106 return 0;
1109 put_page_from_l1e(ol1e, d);
1110 return 1;
1113 #define UPDATE_ENTRY(_t,_p,_o,_n) ({ \
1114 intpte_t __o = cmpxchg((intpte_t *)(_p), \
1115 _t ## e_get_intpte(_o), \
1116 _t ## e_get_intpte(_n)); \
1117 if ( __o != _t ## e_get_intpte(_o) ) \
1118 MEM_LOG("Failed to update %" PRIpte " -> %" PRIpte \
1119 ": saw %" PRIpte "", \
1120 (_t ## e_get_intpte(_o)), \
1121 (_t ## e_get_intpte(_n)), \
1122 (__o)); \
1123 (__o == _t ## e_get_intpte(_o)); })
1125 /* Update the L2 entry at pl2e to new value nl2e. pl2e is within frame pfn. */
1126 static int mod_l2_entry(l2_pgentry_t *pl2e,
1127 l2_pgentry_t nl2e,
1128 unsigned long pfn,
1129 unsigned long type)
1131 l2_pgentry_t ol2e;
1132 unsigned long vaddr = 0;
1134 if ( unlikely(!is_guest_l2_slot(type,pgentry_ptr_to_slot(pl2e))) )
1136 MEM_LOG("Illegal L2 update attempt in Xen-private area %p", pl2e);
1137 return 0;
1140 if ( unlikely(__copy_from_user(&ol2e, pl2e, sizeof(ol2e)) != 0) )
1141 return 0;
1143 if ( l2e_get_flags(nl2e) & _PAGE_PRESENT )
1145 if ( unlikely(l2e_get_flags(nl2e) & L2_DISALLOW_MASK) )
1147 MEM_LOG("Bad L2 flags %x",
1148 l2e_get_flags(nl2e) & L2_DISALLOW_MASK);
1149 return 0;
1152 /* Fast path for identical mapping and presence. */
1153 if ( !l2e_has_changed(ol2e, nl2e, _PAGE_PRESENT))
1154 return UPDATE_ENTRY(l2, pl2e, ol2e, nl2e);
1156 if ( unlikely(!l1_backptr(&vaddr, pgentry_ptr_to_slot(pl2e), type)) ||
1157 unlikely(!get_page_from_l2e(nl2e, pfn, current->domain, vaddr)) )
1158 return 0;
1160 if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e)) )
1162 put_page_from_l2e(nl2e, pfn);
1163 return 0;
1166 else if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e)) )
1168 return 0;
1171 put_page_from_l2e(ol2e, pfn);
1172 return 1;
1176 #if CONFIG_PAGING_LEVELS >= 3
1178 /* Update the L3 entry at pl3e to new value nl3e. pl3e is within frame pfn. */
1179 static int mod_l3_entry(l3_pgentry_t *pl3e,
1180 l3_pgentry_t nl3e,
1181 unsigned long pfn,
1182 unsigned long type)
1184 l3_pgentry_t ol3e;
1185 unsigned long vaddr;
1186 int okay;
1188 if ( unlikely(!is_guest_l3_slot(pgentry_ptr_to_slot(pl3e))) )
1190 MEM_LOG("Illegal L3 update attempt in Xen-private area %p", pl3e);
1191 return 0;
1194 #ifdef CONFIG_X86_PAE
1195 /*
1196 * Disallow updates to final L3 slot. It contains Xen mappings, and it
1197 * would be a pain to ensure they remain continuously valid throughout.
1198 */
1199 if ( pgentry_ptr_to_slot(pl3e) >= 3 )
1200 return 0;
1201 #endif
1203 if ( unlikely(__copy_from_user(&ol3e, pl3e, sizeof(ol3e)) != 0) )
1204 return 0;
1206 if ( l3e_get_flags(nl3e) & _PAGE_PRESENT )
1208 if ( unlikely(l3e_get_flags(nl3e) & L3_DISALLOW_MASK) )
1210 MEM_LOG("Bad L3 flags %x",
1211 l3e_get_flags(nl3e) & L3_DISALLOW_MASK);
1212 return 0;
1215 /* Fast path for identical mapping and presence. */
1216 if (!l3e_has_changed(ol3e, nl3e, _PAGE_PRESENT))
1217 return UPDATE_ENTRY(l3, pl3e, ol3e, nl3e);
1219 #if CONFIG_PAGING_LEVELS >= 4
1220 if ( unlikely(!l2_backptr(&vaddr, pgentry_ptr_to_slot(pl3e), type)) ||
1221 unlikely(!get_page_from_l3e(nl3e, pfn, current->domain, vaddr)) )
1222 return 0;
1223 #else
1224 vaddr = (((unsigned long)pl3e & ~PAGE_MASK) / sizeof(l3_pgentry_t))
1225 << L3_PAGETABLE_SHIFT;
1226 if ( unlikely(!get_page_from_l3e(nl3e, pfn, current->domain, vaddr)) )
1227 return 0;
1228 #endif
1230 if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e)) )
1232 put_page_from_l3e(nl3e, pfn);
1233 return 0;
1236 else if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e)) )
1238 return 0;
1241 okay = create_pae_xen_mappings(pl3e);
1242 BUG_ON(!okay);
1244 put_page_from_l3e(ol3e, pfn);
1245 return 1;
1248 #endif
1250 #if CONFIG_PAGING_LEVELS >= 4
1252 /* Update the L4 entry at pl4e to new value nl4e. pl4e is within frame pfn. */
1253 static int mod_l4_entry(l4_pgentry_t *pl4e,
1254 l4_pgentry_t nl4e,
1255 unsigned long pfn,
1256 unsigned long type)
1258 l4_pgentry_t ol4e;
1259 unsigned long vaddr;
1261 if ( unlikely(!is_guest_l4_slot(pgentry_ptr_to_slot(pl4e))) )
1263 MEM_LOG("Illegal L4 update attempt in Xen-private area %p", pl4e);
1264 return 0;
1267 if ( unlikely(__copy_from_user(&ol4e, pl4e, sizeof(ol4e)) != 0) )
1268 return 0;
1270 if ( l4e_get_flags(nl4e) & _PAGE_PRESENT )
1272 if ( unlikely(l4e_get_flags(nl4e) & L4_DISALLOW_MASK) )
1274 MEM_LOG("Bad L4 flags %x",
1275 l4e_get_flags(nl4e) & L4_DISALLOW_MASK);
1276 return 0;
1279 /* Fast path for identical mapping and presence. */
1280 if (!l4e_has_changed(ol4e, nl4e, _PAGE_PRESENT))
1281 return UPDATE_ENTRY(l4, pl4e, ol4e, nl4e);
1283 if ( unlikely(!l3_backptr(&vaddr, pgentry_ptr_to_slot(pl4e), type)) ||
1284 unlikely(!get_page_from_l4e(nl4e, pfn, current->domain, vaddr)) )
1285 return 0;
1287 if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e)) )
1289 put_page_from_l4e(nl4e, pfn);
1290 return 0;
1293 else if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e)) )
1295 return 0;
1298 put_page_from_l4e(ol4e, pfn);
1299 return 1;
1302 #endif
1304 int alloc_page_type(struct page_info *page, unsigned long type)
1306 struct domain *owner = page_get_owner(page);
1308 if ( owner != NULL )
1309 mark_dirty(owner, page_to_mfn(page));
1311 switch ( type & PGT_type_mask )
1313 case PGT_l1_page_table:
1314 return alloc_l1_table(page);
1315 case PGT_l2_page_table:
1316 return alloc_l2_table(page, type);
1317 case PGT_l3_page_table:
1318 return alloc_l3_table(page, type);
1319 case PGT_l4_page_table:
1320 return alloc_l4_table(page, type);
1321 case PGT_gdt_page:
1322 case PGT_ldt_page:
1323 return alloc_segdesc_page(page);
1324 default:
1325 printk("Bad type in alloc_page_type %lx t=%" PRtype_info " c=%x\n",
1326 type, page->u.inuse.type_info,
1327 page->count_info);
1328 BUG();
1331 return 0;
1335 void free_page_type(struct page_info *page, unsigned long type)
1337 struct domain *owner = page_get_owner(page);
1338 unsigned long gmfn;
1340 if ( likely(owner != NULL) )
1342 /*
1343 * We have to flush before the next use of the linear mapping
1344 * (e.g., update_va_mapping()) or we could end up modifying a page
1345 * that is no longer a page table (and hence screw up ref counts).
1346 */
1347 percpu_info[smp_processor_id()].deferred_ops |= DOP_FLUSH_ALL_TLBS;
1349 if ( unlikely(shadow_mode_enabled(owner)) )
1351 /* Raw page tables are rewritten during save/restore. */
1352 if ( !shadow_mode_translate(owner) )
1353 mark_dirty(owner, page_to_mfn(page));
1355 if ( shadow_mode_refcounts(owner) )
1356 return;
1358 gmfn = mfn_to_gmfn(owner, page_to_mfn(page));
1359 ASSERT(VALID_M2P(gmfn));
1360 remove_shadow(owner, gmfn, type & PGT_type_mask);
1364 switch ( type & PGT_type_mask )
1366 case PGT_l1_page_table:
1367 free_l1_table(page);
1368 break;
1370 case PGT_l2_page_table:
1371 free_l2_table(page);
1372 break;
1374 #if CONFIG_PAGING_LEVELS >= 3
1375 case PGT_l3_page_table:
1376 free_l3_table(page);
1377 break;
1378 #endif
1380 #if CONFIG_PAGING_LEVELS >= 4
1381 case PGT_l4_page_table:
1382 free_l4_table(page);
1383 break;
1384 #endif
1386 default:
1387 printk("%s: type %lx pfn %lx\n",__FUNCTION__,
1388 type, page_to_mfn(page));
1389 BUG();
1394 void put_page_type(struct page_info *page)
1396 unsigned long nx, x, y = page->u.inuse.type_info;
1398 again:
1399 do {
1400 x = y;
1401 nx = x - 1;
1403 ASSERT((x & PGT_count_mask) != 0);
1405 /*
1406 * The page should always be validated while a reference is held. The
1407 * exception is during domain destruction, when we forcibly invalidate
1408 * page-table pages if we detect a referential loop.
1409 * See domain.c:relinquish_list().
1410 */
1411 ASSERT((x & PGT_validated) ||
1412 test_bit(_DOMF_dying, &page_get_owner(page)->domain_flags));
1414 if ( unlikely((nx & PGT_count_mask) == 0) )
1416 /* Record TLB information for flush later. Races are harmless. */
1417 page->tlbflush_timestamp = tlbflush_current_time();
1419 if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
1420 likely(nx & PGT_validated) )
1422 /*
1423 * Page-table pages must be unvalidated when count is zero. The
1424 * 'free' is safe because the refcnt is non-zero and validated
1425 * bit is clear => other ops will spin or fail.
1426 */
1427 if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x,
1428 x & ~PGT_validated)) != x) )
1429 goto again;
1430 /* We cleared the 'valid bit' so we do the clean up. */
1431 free_page_type(page, x);
1432 /* Carry on, but with the 'valid bit' now clear. */
1433 x &= ~PGT_validated;
1434 nx &= ~PGT_validated;
1437 else if ( unlikely((nx & (PGT_pinned|PGT_type_mask|PGT_count_mask)) ==
1438 (PGT_pinned|PGT_l1_page_table|1)) )
1440 /* Page is now only pinned. Make the back pointer mutable again. */
1441 nx |= PGT_va_mutable;
1444 while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
1448 int get_page_type(struct page_info *page, unsigned long type)
1450 unsigned long nx, x, y = page->u.inuse.type_info;
1452 again:
1453 do {
1454 x = y;
1455 nx = x + 1;
1456 if ( unlikely((nx & PGT_count_mask) == 0) )
1458 MEM_LOG("Type count overflow on pfn %lx", page_to_mfn(page));
1459 return 0;
1461 else if ( unlikely((x & PGT_count_mask) == 0) )
1463 if ( (x & (PGT_type_mask|PGT_va_mask)) != type )
1465 if ( (x & PGT_type_mask) != (type & PGT_type_mask) )
1467 /*
1468 * On type change we check to flush stale TLB
1469 * entries. This may be unnecessary (e.g., page
1470 * was GDT/LDT) but those circumstances should be
1471 * very rare.
1472 */
1473 cpumask_t mask =
1474 page_get_owner(page)->domain_dirty_cpumask;
1475 tlbflush_filter(mask, page->tlbflush_timestamp);
1477 if ( unlikely(!cpus_empty(mask)) )
1479 perfc_incrc(need_flush_tlb_flush);
1480 flush_tlb_mask(mask);
1484 /* We lose existing type, back pointer, and validity. */
1485 nx &= ~(PGT_type_mask | PGT_va_mask | PGT_validated);
1486 nx |= type;
1488 /* No special validation needed for writable pages. */
1489 /* Page tables and GDT/LDT need to be scanned for validity. */
1490 if ( type == PGT_writable_page )
1491 nx |= PGT_validated;
1494 else
1496 if ( unlikely((x & (PGT_type_mask|PGT_va_mask)) != type) )
1498 if ( unlikely((x & PGT_type_mask) != (type & PGT_type_mask) ) )
1500 if ( current->domain == page_get_owner(page) )
1502 /*
1503 * This ensures functions like set_gdt() see up-to-date
1504 * type info without needing to clean up writable p.t.
1505 * state on the fast path.
1506 */
1507 LOCK_BIGLOCK(current->domain);
1508 cleanup_writable_pagetable(current->domain);
1509 y = page->u.inuse.type_info;
1510 UNLOCK_BIGLOCK(current->domain);
1511 /* Can we make progress now? */
1512 if ( ((y & PGT_type_mask) == (type & PGT_type_mask)) ||
1513 ((y & PGT_count_mask) == 0) )
1514 goto again;
1516 if ( ((x & PGT_type_mask) != PGT_l2_page_table) ||
1517 ((type & PGT_type_mask) != PGT_l1_page_table) )
1518 MEM_LOG("Bad type (saw %" PRtype_info
1519 " != exp %" PRtype_info ") "
1520 "for mfn %lx (pfn %lx)",
1521 x, type, page_to_mfn(page),
1522 get_gpfn_from_mfn(page_to_mfn(page)));
1523 return 0;
1525 else if ( (x & PGT_va_mask) == PGT_va_mutable )
1527 /* The va backpointer is mutable, hence we update it. */
1528 nx &= ~PGT_va_mask;
1529 nx |= type; /* we know the actual type is correct */
1531 else if ( (type & PGT_va_mask) != PGT_va_mutable )
1533 ASSERT((type & PGT_va_mask) != (x & PGT_va_mask));
1534 #ifdef CONFIG_X86_PAE
1535 /* We use backptr as extra typing. Cannot be unknown. */
1536 if ( (type & PGT_type_mask) == PGT_l2_page_table )
1537 return 0;
1538 #endif
1539 /* Fixme: add code to propagate va_unknown to subtables. */
1540 if ( ((type & PGT_type_mask) >= PGT_l2_page_table) &&
1541 !shadow_mode_refcounts(page_get_owner(page)) )
1542 return 0;
1543 /* This table is possibly mapped at multiple locations. */
1544 nx &= ~PGT_va_mask;
1545 nx |= PGT_va_unknown;
1548 if ( unlikely(!(x & PGT_validated)) )
1550 /* Someone else is updating validation of this page. Wait... */
1551 while ( (y = page->u.inuse.type_info) == x )
1552 cpu_relax();
1553 goto again;
1557 while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
1559 if ( unlikely(!(nx & PGT_validated)) )
1561 /* Try to validate page type; drop the new reference on failure. */
1562 if ( unlikely(!alloc_page_type(page, type)) )
1564 MEM_LOG("Error while validating mfn %lx (pfn %lx) for type %"
1565 PRtype_info ": caf=%08x taf=%" PRtype_info,
1566 page_to_mfn(page), get_gpfn_from_mfn(page_to_mfn(page)),
1567 type, page->count_info, page->u.inuse.type_info);
1568 /* Noone else can get a reference. We hold the only ref. */
1569 page->u.inuse.type_info = 0;
1570 return 0;
1573 /* Noone else is updating simultaneously. */
1574 __set_bit(_PGT_validated, &page->u.inuse.type_info);
1577 return 1;
1581 int new_guest_cr3(unsigned long mfn)
1583 struct vcpu *v = current;
1584 struct domain *d = v->domain;
1585 int okay;
1586 unsigned long old_base_mfn;
1588 ASSERT(writable_pagetable_in_sync(d));
1590 if ( shadow_mode_refcounts(d) )
1592 okay = get_page_from_pagenr(mfn, d);
1593 if ( unlikely(!okay) )
1595 MEM_LOG("Error while installing new baseptr %lx", mfn);
1596 return 0;
1599 else
1601 okay = get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d);
1602 if ( unlikely(!okay) )
1604 /* Switch to idle pagetable: this VCPU has no active p.t. now. */
1605 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
1606 v->arch.guest_table = mk_pagetable(0);
1607 update_pagetables(v);
1608 write_cr3(__pa(idle_pg_table));
1609 if ( old_base_mfn != 0 )
1610 put_page_and_type(mfn_to_page(old_base_mfn));
1612 /* Retry the validation with no active p.t. for this VCPU. */
1613 okay = get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d);
1614 if ( !okay )
1616 /* Failure here is unrecoverable: the VCPU has no pagetable! */
1617 MEM_LOG("Fatal error while installing new baseptr %lx", mfn);
1618 domain_crash(d);
1619 percpu_info[v->processor].deferred_ops = 0;
1620 return 0;
1625 invalidate_shadow_ldt(v);
1627 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
1628 v->arch.guest_table = mk_pagetable(mfn << PAGE_SHIFT);
1629 update_pagetables(v); /* update shadow_table and monitor_table */
1631 write_ptbase(v);
1633 if ( likely(old_base_mfn != 0) )
1635 if ( shadow_mode_refcounts(d) )
1636 put_page(mfn_to_page(old_base_mfn));
1637 else
1638 put_page_and_type(mfn_to_page(old_base_mfn));
1641 /* CR3 also holds a ref to its shadow... */
1642 if ( shadow_mode_enabled(d) )
1644 if ( v->arch.monitor_shadow_ref )
1645 put_shadow_ref(v->arch.monitor_shadow_ref);
1646 v->arch.monitor_shadow_ref =
1647 pagetable_get_pfn(v->arch.monitor_table);
1648 ASSERT(!page_get_owner(mfn_to_page(v->arch.monitor_shadow_ref)));
1649 get_shadow_ref(v->arch.monitor_shadow_ref);
1652 return 1;
1655 static void process_deferred_ops(unsigned int cpu)
1657 unsigned int deferred_ops;
1658 struct domain *d = current->domain;
1660 deferred_ops = percpu_info[cpu].deferred_ops;
1661 percpu_info[cpu].deferred_ops = 0;
1663 if ( deferred_ops & (DOP_FLUSH_ALL_TLBS|DOP_FLUSH_TLB) )
1665 if ( shadow_mode_enabled(d) )
1666 shadow_sync_all(d);
1667 if ( deferred_ops & DOP_FLUSH_ALL_TLBS )
1668 flush_tlb_mask(d->domain_dirty_cpumask);
1669 else
1670 local_flush_tlb();
1673 if ( deferred_ops & DOP_RELOAD_LDT )
1674 (void)map_ldt_shadow_page(0);
1676 if ( unlikely(percpu_info[cpu].foreign != NULL) )
1678 put_domain(percpu_info[cpu].foreign);
1679 percpu_info[cpu].foreign = NULL;
1683 static int set_foreigndom(unsigned int cpu, domid_t domid)
1685 struct domain *e, *d = current->domain;
1686 int okay = 1;
1688 ASSERT(percpu_info[cpu].foreign == NULL);
1690 if ( likely(domid == DOMID_SELF) )
1691 goto out;
1693 if ( domid == d->domain_id )
1695 MEM_LOG("Dom %u tried to specify itself as foreign domain",
1696 d->domain_id);
1697 okay = 0;
1699 else if ( !IS_PRIV(d) )
1701 switch ( domid )
1703 case DOMID_IO:
1704 get_knownalive_domain(dom_io);
1705 percpu_info[cpu].foreign = dom_io;
1706 break;
1707 default:
1708 MEM_LOG("Dom %u cannot set foreign dom", d->domain_id);
1709 okay = 0;
1710 break;
1713 else
1715 percpu_info[cpu].foreign = e = find_domain_by_id(domid);
1716 if ( e == NULL )
1718 switch ( domid )
1720 case DOMID_XEN:
1721 get_knownalive_domain(dom_xen);
1722 percpu_info[cpu].foreign = dom_xen;
1723 break;
1724 case DOMID_IO:
1725 get_knownalive_domain(dom_io);
1726 percpu_info[cpu].foreign = dom_io;
1727 break;
1728 default:
1729 MEM_LOG("Unknown domain '%u'", domid);
1730 okay = 0;
1731 break;
1736 out:
1737 return okay;
1740 static inline cpumask_t vcpumask_to_pcpumask(
1741 struct domain *d, unsigned long vmask)
1743 unsigned int vcpu_id;
1744 cpumask_t pmask = CPU_MASK_NONE;
1745 struct vcpu *v;
1747 while ( vmask != 0 )
1749 vcpu_id = find_first_set_bit(vmask);
1750 vmask &= ~(1UL << vcpu_id);
1751 if ( (vcpu_id < MAX_VIRT_CPUS) &&
1752 ((v = d->vcpu[vcpu_id]) != NULL) )
1753 cpus_or(pmask, pmask, v->vcpu_dirty_cpumask);
1756 return pmask;
1759 int do_mmuext_op(
1760 XEN_GUEST_HANDLE(mmuext_op_t) uops,
1761 unsigned int count,
1762 XEN_GUEST_HANDLE(uint) pdone,
1763 unsigned int foreigndom)
1765 struct mmuext_op op;
1766 int rc = 0, i = 0, okay, cpu = smp_processor_id();
1767 unsigned long mfn, type;
1768 unsigned int done = 0;
1769 struct page_info *page;
1770 struct vcpu *v = current;
1771 struct domain *d = v->domain;
1773 LOCK_BIGLOCK(d);
1775 cleanup_writable_pagetable(d);
1777 if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
1779 count &= ~MMU_UPDATE_PREEMPTED;
1780 if ( unlikely(!guest_handle_is_null(pdone)) )
1781 (void)copy_from_guest(&done, pdone, 1);
1784 if ( !set_foreigndom(cpu, foreigndom) )
1786 rc = -ESRCH;
1787 goto out;
1790 if ( unlikely(!guest_handle_okay(uops, count)) )
1792 rc = -EFAULT;
1793 goto out;
1796 for ( i = 0; i < count; i++ )
1798 if ( hypercall_preempt_check() )
1800 rc = hypercall_create_continuation(
1801 __HYPERVISOR_mmuext_op, "hihi",
1802 uops, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
1803 break;
1806 if ( unlikely(__copy_from_guest(&op, uops, 1) != 0) )
1808 MEM_LOG("Bad __copy_from_guest");
1809 rc = -EFAULT;
1810 break;
1813 okay = 1;
1814 mfn = op.arg1.mfn;
1815 page = mfn_to_page(mfn);
1817 switch ( op.cmd )
1819 case MMUEXT_PIN_L1_TABLE:
1820 type = PGT_l1_page_table | PGT_va_mutable;
1821 goto pin_page;
1823 case MMUEXT_PIN_L2_TABLE:
1824 case MMUEXT_PIN_L3_TABLE:
1825 case MMUEXT_PIN_L4_TABLE:
1826 /* Ignore pinning of subdirectories. */
1827 if ( (op.cmd - MMUEXT_PIN_L1_TABLE) != (CONFIG_PAGING_LEVELS - 1) )
1828 break;
1830 type = PGT_root_page_table;
1832 pin_page:
1833 if ( shadow_mode_refcounts(FOREIGNDOM) )
1834 break;
1836 okay = get_page_and_type_from_pagenr(mfn, type, FOREIGNDOM);
1837 if ( unlikely(!okay) )
1839 MEM_LOG("Error while pinning mfn %lx", mfn);
1840 break;
1843 if ( unlikely(test_and_set_bit(_PGT_pinned,
1844 &page->u.inuse.type_info)) )
1846 MEM_LOG("Mfn %lx already pinned", mfn);
1847 put_page_and_type(page);
1848 okay = 0;
1849 break;
1852 break;
1854 case MMUEXT_UNPIN_TABLE:
1855 if ( shadow_mode_refcounts(d) )
1856 break;
1858 if ( unlikely(!(okay = get_page_from_pagenr(mfn, d))) )
1860 MEM_LOG("Mfn %lx bad domain (dom=%p)",
1861 mfn, page_get_owner(page));
1863 else if ( likely(test_and_clear_bit(_PGT_pinned,
1864 &page->u.inuse.type_info)) )
1866 put_page_and_type(page);
1867 put_page(page);
1869 else
1871 okay = 0;
1872 put_page(page);
1873 MEM_LOG("Mfn %lx not pinned", mfn);
1875 break;
1877 case MMUEXT_NEW_BASEPTR:
1878 mfn = gmfn_to_mfn(current->domain, mfn);
1879 okay = new_guest_cr3(mfn);
1880 percpu_info[cpu].deferred_ops &= ~DOP_FLUSH_TLB;
1881 break;
1883 #ifdef __x86_64__
1884 case MMUEXT_NEW_USER_BASEPTR:
1885 okay = get_page_and_type_from_pagenr(
1886 mfn, PGT_root_page_table, d);
1887 if ( unlikely(!okay) )
1889 MEM_LOG("Error while installing new mfn %lx", mfn);
1891 else
1893 unsigned long old_mfn =
1894 pagetable_get_pfn(v->arch.guest_table_user);
1895 v->arch.guest_table_user = mk_pagetable(mfn << PAGE_SHIFT);
1896 if ( old_mfn != 0 )
1897 put_page_and_type(mfn_to_page(old_mfn));
1899 break;
1900 #endif
1902 case MMUEXT_TLB_FLUSH_LOCAL:
1903 percpu_info[cpu].deferred_ops |= DOP_FLUSH_TLB;
1904 break;
1906 case MMUEXT_INVLPG_LOCAL:
1907 if ( shadow_mode_enabled(d) )
1908 shadow_invlpg(v, op.arg1.linear_addr);
1909 local_flush_tlb_one(op.arg1.linear_addr);
1910 break;
1912 case MMUEXT_TLB_FLUSH_MULTI:
1913 case MMUEXT_INVLPG_MULTI:
1915 unsigned long vmask;
1916 cpumask_t pmask;
1917 if ( unlikely(get_user(vmask, (unsigned long *)op.arg2.vcpumask)) )
1919 okay = 0;
1920 break;
1922 pmask = vcpumask_to_pcpumask(d, vmask);
1923 if ( op.cmd == MMUEXT_TLB_FLUSH_MULTI )
1924 flush_tlb_mask(pmask);
1925 else
1926 flush_tlb_one_mask(pmask, op.arg1.linear_addr);
1927 break;
1930 case MMUEXT_TLB_FLUSH_ALL:
1931 flush_tlb_mask(d->domain_dirty_cpumask);
1932 break;
1934 case MMUEXT_INVLPG_ALL:
1935 flush_tlb_one_mask(d->domain_dirty_cpumask, op.arg1.linear_addr);
1936 break;
1938 case MMUEXT_FLUSH_CACHE:
1939 if ( unlikely(!cache_flush_permitted(d)) )
1941 MEM_LOG("Non-physdev domain tried to FLUSH_CACHE.");
1942 okay = 0;
1944 else
1946 wbinvd();
1948 break;
1950 case MMUEXT_SET_LDT:
1952 unsigned long ptr = op.arg1.linear_addr;
1953 unsigned long ents = op.arg2.nr_ents;
1955 if ( shadow_mode_external(d) )
1957 MEM_LOG("ignoring SET_LDT hypercall from external "
1958 "domain %u", d->domain_id);
1959 okay = 0;
1961 else if ( ((ptr & (PAGE_SIZE-1)) != 0) ||
1962 (ents > 8192) ||
1963 !array_access_ok(ptr, ents, LDT_ENTRY_SIZE) )
1965 okay = 0;
1966 MEM_LOG("Bad args to SET_LDT: ptr=%lx, ents=%lx", ptr, ents);
1968 else if ( (v->arch.guest_context.ldt_ents != ents) ||
1969 (v->arch.guest_context.ldt_base != ptr) )
1971 invalidate_shadow_ldt(v);
1972 v->arch.guest_context.ldt_base = ptr;
1973 v->arch.guest_context.ldt_ents = ents;
1974 load_LDT(v);
1975 percpu_info[cpu].deferred_ops &= ~DOP_RELOAD_LDT;
1976 if ( ents != 0 )
1977 percpu_info[cpu].deferred_ops |= DOP_RELOAD_LDT;
1979 break;
1982 default:
1983 MEM_LOG("Invalid extended pt command 0x%x", op.cmd);
1984 okay = 0;
1985 break;
1988 if ( unlikely(!okay) )
1990 rc = -EINVAL;
1991 break;
1994 guest_handle_add_offset(uops, 1);
1997 out:
1998 process_deferred_ops(cpu);
2000 /* Add incremental work we have done to the @done output parameter. */
2001 done += i;
2002 if ( unlikely(!guest_handle_is_null(pdone)) )
2003 copy_to_guest(pdone, &done, 1);
2005 UNLOCK_BIGLOCK(d);
2006 return rc;
2009 int do_mmu_update(
2010 XEN_GUEST_HANDLE(mmu_update_t) ureqs,
2011 unsigned int count,
2012 XEN_GUEST_HANDLE(uint) pdone,
2013 unsigned int foreigndom)
2015 struct mmu_update req;
2016 void *va;
2017 unsigned long gpfn, gmfn, mfn;
2018 struct page_info *page;
2019 int rc = 0, okay = 1, i = 0, cpu = smp_processor_id();
2020 unsigned int cmd, done = 0;
2021 struct vcpu *v = current;
2022 struct domain *d = v->domain;
2023 unsigned long type_info;
2024 struct domain_mmap_cache mapcache, sh_mapcache;
2026 LOCK_BIGLOCK(d);
2028 cleanup_writable_pagetable(d);
2030 if ( unlikely(shadow_mode_enabled(d)) )
2031 check_pagetable(v, "pre-mmu"); /* debug */
2033 if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
2035 count &= ~MMU_UPDATE_PREEMPTED;
2036 if ( unlikely(!guest_handle_is_null(pdone)) )
2037 (void)copy_from_guest(&done, pdone, 1);
2040 domain_mmap_cache_init(&mapcache);
2041 domain_mmap_cache_init(&sh_mapcache);
2043 if ( !set_foreigndom(cpu, foreigndom) )
2045 rc = -ESRCH;
2046 goto out;
2049 perfc_incrc(calls_to_mmu_update);
2050 perfc_addc(num_page_updates, count);
2051 perfc_incr_histo(bpt_updates, count, PT_UPDATES);
2053 if ( unlikely(!guest_handle_okay(ureqs, count)) )
2055 rc = -EFAULT;
2056 goto out;
2059 for ( i = 0; i < count; i++ )
2061 if ( hypercall_preempt_check() )
2063 rc = hypercall_create_continuation(
2064 __HYPERVISOR_mmu_update, "hihi",
2065 ureqs, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
2066 break;
2069 if ( unlikely(__copy_from_guest(&req, ureqs, 1) != 0) )
2071 MEM_LOG("Bad __copy_from_guest");
2072 rc = -EFAULT;
2073 break;
2076 cmd = req.ptr & (sizeof(l1_pgentry_t)-1);
2077 okay = 0;
2079 switch ( cmd )
2081 /*
2082 * MMU_NORMAL_PT_UPDATE: Normal update to any level of page table.
2083 */
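/*
 * The target frame is mapped, its current type is examined, and the
 * new entry is validated by the matching mod_lN_entry() routine; a
 * frame with no page-table type is treated as a plain writable-page
 * write. Shadow-mode domains additionally propagate the update into
 * the shadow tables.
 */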
2084 case MMU_NORMAL_PT_UPDATE:
2086 gmfn = req.ptr >> PAGE_SHIFT;
2087 mfn = gmfn_to_mfn(d, gmfn);
2089 if ( unlikely(!get_page_from_pagenr(mfn, current->domain)) )
2091 MEM_LOG("Could not get page for normal update");
2092 break;
2095 va = map_domain_page_with_cache(mfn, &mapcache);
2096 va = (void *)((unsigned long)va +
2097 (unsigned long)(req.ptr & ~PAGE_MASK));
2098 page = mfn_to_page(mfn);
2100 switch ( (type_info = page->u.inuse.type_info) & PGT_type_mask )
2102 case PGT_l1_page_table:
2103 ASSERT( !shadow_mode_refcounts(d) );
2104 if ( likely(get_page_type(
2105 page, type_info & (PGT_type_mask|PGT_va_mask))) )
2107 l1_pgentry_t l1e;
2109 /* FIXME: doesn't work with PAE */
2110 l1e = l1e_from_intpte(req.val);
2111 okay = mod_l1_entry(va, l1e);
2112 if ( okay && unlikely(shadow_mode_enabled(d)) )
2113 shadow_l1_normal_pt_update(
2114 d, req.ptr, l1e, &sh_mapcache);
2115 put_page_type(page);
2117 break;
2118 case PGT_l2_page_table:
2119 ASSERT( !shadow_mode_refcounts(d) );
2120 if ( likely(get_page_type(
2121 page, type_info & (PGT_type_mask|PGT_va_mask))) )
2123 l2_pgentry_t l2e;
2125 /* FIXME: doesn't work with PAE */
2126 l2e = l2e_from_intpte(req.val);
2127 okay = mod_l2_entry(
2128 (l2_pgentry_t *)va, l2e, mfn, type_info);
2129 if ( okay && unlikely(shadow_mode_enabled(d)) )
2130 shadow_l2_normal_pt_update(
2131 d, req.ptr, l2e, &sh_mapcache);
2132 put_page_type(page);
2134 break;
2135 #if CONFIG_PAGING_LEVELS >= 3
2136 case PGT_l3_page_table:
2137 ASSERT( !shadow_mode_refcounts(d) );
2138 if ( likely(get_page_type(
2139 page, type_info & (PGT_type_mask|PGT_va_mask))) )
2141 l3_pgentry_t l3e;
2143 /* FIXME: doesn't work with PAE */
2144 l3e = l3e_from_intpte(req.val);
2145 okay = mod_l3_entry(va, l3e, mfn, type_info);
2146 if ( okay && unlikely(shadow_mode_enabled(d)) )
2147 shadow_l3_normal_pt_update(
2148 d, req.ptr, l3e, &sh_mapcache);
2149 put_page_type(page);
2151 break;
2152 #endif
2153 #if CONFIG_PAGING_LEVELS >= 4
2154 case PGT_l4_page_table:
2155 ASSERT( !shadow_mode_refcounts(d) );
2156 if ( likely(get_page_type(
2157 page, type_info & (PGT_type_mask|PGT_va_mask))) )
2159 l4_pgentry_t l4e;
2161 l4e = l4e_from_intpte(req.val);
2162 okay = mod_l4_entry(va, l4e, mfn, type_info);
2163 if ( okay && unlikely(shadow_mode_enabled(d)) )
2164 shadow_l4_normal_pt_update(
2165 d, req.ptr, l4e, &sh_mapcache);
2166 put_page_type(page);
2168 break;
2169 #endif
2170 default:
2171 if ( likely(get_page_type(page, PGT_writable_page)) )
2173 if ( shadow_mode_enabled(d) )
2175 shadow_lock(d);
2177 __mark_dirty(d, mfn);
2179 if ( page_is_page_table(page) &&
2180 !page_out_of_sync(page) )
2182 shadow_mark_mfn_out_of_sync(v, gmfn, mfn);
2186 *(intpte_t *)va = req.val;
2187 okay = 1;
2189 if ( shadow_mode_enabled(d) )
2190 shadow_unlock(d);
2192 put_page_type(page);
2194 break;
2197 unmap_domain_page_with_cache(va, &mapcache);
2199 put_page(page);
2200 break;
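/*
 * MMU_MACHPHYS_UPDATE: Update an entry in the machine->physical table.
 * 'ptr' names the machine frame and 'val' supplies the guest pfn to
 * record for it. Translated (shadow_mode_translate) guests may not
 * modify the M2P table directly.
 */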
2202 case MMU_MACHPHYS_UPDATE:
2204 if ( shadow_mode_translate(FOREIGNDOM) )
2206 MEM_LOG("can't mutate m2p table of translate mode guest");
2207 break;
2210 mfn = req.ptr >> PAGE_SHIFT;
2211 gpfn = req.val;
2213 if ( unlikely(!get_page_from_pagenr(mfn, FOREIGNDOM)) )
2215 MEM_LOG("Could not get page for mach->phys update");
2216 break;
2219 set_gpfn_from_mfn(mfn, gpfn);
2220 okay = 1;
2222 mark_dirty(FOREIGNDOM, mfn);
2224 put_page(mfn_to_page(mfn));
2225 break;
2227 default:
2228 MEM_LOG("Invalid page update command %x", cmd);
2229 break;
2232 if ( unlikely(!okay) )
2234 rc = -EINVAL;
2235 break;
2238 guest_handle_add_offset(ureqs, 1);
2241 out:
2242 domain_mmap_cache_destroy(&mapcache);
2243 domain_mmap_cache_destroy(&sh_mapcache);
2245 process_deferred_ops(cpu);
2247 /* Add incremental work we have done to the @done output parameter. */
2248 done += i;
2249 if ( unlikely(!guest_handle_is_null(pdone)) )
2250 copy_to_guest(pdone, &done, 1);
2252 if ( unlikely(shadow_mode_enabled(d)) )
2253 check_pagetable(v, "post-mmu"); /* debug */
2255 UNLOCK_BIGLOCK(d);
2256 return rc;
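/*
 * Grant-table mapping helpers. create_grant_host_mapping() and
 * destroy_grant_host_mapping() below install or remove the PTE backing
 * a granted frame, either through a guest-supplied PTE machine address
 * (GNTMAP_contains_pte) or through a linear address in the current
 * page tables.
 */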
2260 static int create_grant_pte_mapping(
2261 unsigned long pte_addr, l1_pgentry_t _nl1e, struct vcpu *v)
2263 int rc = GNTST_okay;
2264 void *va;
2265 unsigned long gmfn, mfn;
2266 struct page_info *page;
2267 u32 type_info;
2268 l1_pgentry_t ol1e;
2269 struct domain *d = v->domain;
2271 ASSERT(spin_is_locked(&d->big_lock));
2272 ASSERT(!shadow_mode_refcounts(d));
2274 gmfn = pte_addr >> PAGE_SHIFT;
2275 mfn = gmfn_to_mfn(d, gmfn);
2277 if ( unlikely(!get_page_from_pagenr(mfn, current->domain)) )
2279 MEM_LOG("Could not get page for normal update");
2280 return GNTST_general_error;
2283 va = map_domain_page(mfn);
2284 va = (void *)((unsigned long)va + (pte_addr & ~PAGE_MASK));
2285 page = mfn_to_page(mfn);
2287 type_info = page->u.inuse.type_info;
2288 if ( ((type_info & PGT_type_mask) != PGT_l1_page_table) ||
2289 !get_page_type(page, type_info & (PGT_type_mask|PGT_va_mask)) )
2291 MEM_LOG("Grant map attempted to update a non-L1 page");
2292 rc = GNTST_general_error;
2293 goto failed;
2296 if ( __copy_from_user(&ol1e, (l1_pgentry_t *)va, sizeof(ol1e)) ||
2297 !update_l1e(va, ol1e, _nl1e) )
2299 put_page_type(page);
2300 rc = GNTST_general_error;
2301 goto failed;
2304 put_page_from_l1e(ol1e, d);
2306 if ( unlikely(shadow_mode_enabled(d)) )
2308 struct domain_mmap_cache sh_mapcache;
2309 domain_mmap_cache_init(&sh_mapcache);
2310 shadow_l1_normal_pt_update(d, pte_addr, _nl1e, &sh_mapcache);
2311 domain_mmap_cache_destroy(&sh_mapcache);
2314 put_page_type(page);
2316 failed:
2317 unmap_domain_page(va);
2318 put_page(page);
2319 return rc;
2322 static int destroy_grant_pte_mapping(
2323 unsigned long addr, unsigned long frame, struct domain *d)
2325 int rc = GNTST_okay;
2326 void *va;
2327 unsigned long gmfn, mfn;
2328 struct page_info *page;
2329 u32 type_info;
2330 l1_pgentry_t ol1e;
2332 ASSERT(!shadow_mode_refcounts(d));
2334 gmfn = addr >> PAGE_SHIFT;
2335 mfn = gmfn_to_mfn(d, gmfn);
2337 if ( unlikely(!get_page_from_pagenr(mfn, current->domain)) )
2339 MEM_LOG("Could not get page for normal update");
2340 return GNTST_general_error;
2343 va = map_domain_page(mfn);
2344 va = (void *)((unsigned long)va + (addr & ~PAGE_MASK));
2345 page = mfn_to_page(mfn);
2347 type_info = page->u.inuse.type_info;
2348 if ( ((type_info & PGT_type_mask) != PGT_l1_page_table) ||
2349 !get_page_type(page, type_info & (PGT_type_mask|PGT_va_mask)) )
2351 MEM_LOG("Grant map attempted to update a non-L1 page");
2352 rc = GNTST_general_error;
2353 goto failed;
2356 if ( __copy_from_user(&ol1e, (l1_pgentry_t *)va, sizeof(ol1e)) )
2358 put_page_type(page);
2359 rc = GNTST_general_error;
2360 goto failed;
2363 /* Check that the virtual address supplied is actually mapped to frame. */
2364 if ( unlikely((l1e_get_intpte(ol1e) >> PAGE_SHIFT) != frame) )
2366 MEM_LOG("PTE entry %lx for address %lx doesn't match frame %lx",
2367 (unsigned long)l1e_get_intpte(ol1e), addr, frame);
2368 put_page_type(page);
2369 rc = GNTST_general_error;
2370 goto failed;
2373 /* Delete pagetable entry. */
2374 if ( unlikely(__put_user(0, (intpte_t *)va)))
2376 MEM_LOG("Cannot delete PTE entry at %p", va);
2377 put_page_type(page);
2378 rc = GNTST_general_error;
2379 goto failed;
2382 if ( unlikely(shadow_mode_enabled(d)) )
2384 struct domain_mmap_cache sh_mapcache;
2385 domain_mmap_cache_init(&sh_mapcache);
2386 shadow_l1_normal_pt_update(d, addr, l1e_empty(), &sh_mapcache);
2387 domain_mmap_cache_destroy(&sh_mapcache);
2390 put_page_type(page);
2392 failed:
2393 unmap_domain_page(va);
2394 put_page(page);
2395 return rc;
2399 static int create_grant_va_mapping(
2400 unsigned long va, l1_pgentry_t _nl1e, struct vcpu *v)
2402 l1_pgentry_t *pl1e, ol1e;
2403 struct domain *d = v->domain;
2405 ASSERT(spin_is_locked(&d->big_lock));
2406 ASSERT(!shadow_mode_refcounts(d));
2408 /*
2409 * This is actually overkill - we don't need to sync the L1 itself,
2410 * just everything involved in getting to this L1 (i.e. we need
2411 * linear_pg_table[l1_linear_offset(va)] to be in sync)...
2412 */
2413 __shadow_sync_va(v, va);
2415 pl1e = &linear_pg_table[l1_linear_offset(va)];
2417 if ( unlikely(__copy_from_user(&ol1e, pl1e, sizeof(ol1e)) != 0) ||
2418 !update_l1e(pl1e, ol1e, _nl1e) )
2419 return GNTST_general_error;
2421 put_page_from_l1e(ol1e, d);
2423 if ( unlikely(shadow_mode_enabled(d)) )
2424 shadow_do_update_va_mapping(va, _nl1e, v);
2426 return GNTST_okay;
2429 static int destroy_grant_va_mapping(
2430 unsigned long addr, unsigned long frame)
2432 l1_pgentry_t *pl1e, ol1e;
2434 pl1e = &linear_pg_table[l1_linear_offset(addr)];
2436 if ( unlikely(__get_user(ol1e.l1, &pl1e->l1) != 0) )
2438 MEM_LOG("Could not find PTE entry for address %lx", addr);
2439 return GNTST_general_error;
2442 /*
2443 * Check that the virtual address supplied is actually mapped to
2444 * frame.
2445 */
2446 if ( unlikely(l1e_get_pfn(ol1e) != frame) )
2448 MEM_LOG("PTE entry %lx for address %lx doesn't match frame %lx",
2449 l1e_get_pfn(ol1e), addr, frame);
2450 return GNTST_general_error;
2453 /* Delete pagetable entry. */
2454 if ( unlikely(__put_user(0, &pl1e->l1)) )
2456 MEM_LOG("Cannot delete PTE entry at %p", (unsigned long *)pl1e);
2457 return GNTST_general_error;
2460 return GNTST_okay;
2463 int create_grant_host_mapping(
2464 unsigned long addr, unsigned long frame, unsigned int flags)
2466 l1_pgentry_t pte = l1e_from_pfn(frame, GRANT_PTE_FLAGS);
2468 if ( (flags & GNTMAP_application_map) )
2469 l1e_add_flags(pte,_PAGE_USER);
2470 if ( !(flags & GNTMAP_readonly) )
2471 l1e_add_flags(pte,_PAGE_RW);
2473 if ( flags & GNTMAP_contains_pte )
2474 return create_grant_pte_mapping(addr, pte, current);
2475 return create_grant_va_mapping(addr, pte, current);
2478 int destroy_grant_host_mapping(
2479 unsigned long addr, unsigned long frame, unsigned int flags)
2481 if ( flags & GNTMAP_contains_pte )
2482 return destroy_grant_pte_mapping(addr, frame, current->domain);
2483 return destroy_grant_va_mapping(addr, frame);
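/*
 * steal_page_for_grant_transfer() detaches a page from its current
 * owner so that it can be handed to another domain by a grant-table
 * transfer. The page must be an ordinary allocated page with exactly
 * one reference (PGC_allocated); ownership is then cleared atomically
 * below.
 */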
2486 int steal_page_for_grant_transfer(
2487 struct domain *d, struct page_info *page)
2489 u32 _d, _nd, x, y;
2491 spin_lock(&d->page_alloc_lock);
2493 /*
2494 * The tricky bit: atomically release ownership while there is just one
2495 * benign reference to the page (PGC_allocated). If that reference
2496 * disappears then the deallocation routine will safely spin.
2497 */
2498 _d = pickle_domptr(d);
2499 _nd = page->u.inuse._domain;
2500 y = page->count_info;
2501 do {
2502 x = y;
2503 if (unlikely((x & (PGC_count_mask|PGC_allocated)) !=
2504 (1 | PGC_allocated)) || unlikely(_nd != _d)) {
2505 DPRINTK("gnttab_transfer: Bad page %p: ed=%p(%u), sd=%p,"
2506 " caf=%08x, taf=%" PRtype_info "\n",
2507 (void *) page_to_mfn(page),
2508 d, d->domain_id, unpickle_domptr(_nd), x,
2509 page->u.inuse.type_info);
2510 spin_unlock(&d->page_alloc_lock);
2511 return -1;
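/*
 * The cmpxchg8b below treats count_info and the pickled owner as one
 * 64-bit quantity (they are adjacent 32-bit fields here): the expected
 * old value is {count_info = x, _domain = _d}, while the new value
 * keeps the count (ebx = x) but clears the owner (ecx = NULL).
 */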
2513 __asm__ __volatile__(
2514 LOCK_PREFIX "cmpxchg8b %2"
2515 : "=d" (_nd), "=a" (y),
2516 "=m" (*(volatile u64 *)(&page->count_info))
2517 : "0" (_d), "1" (x), "c" (NULL), "b" (x) );
2518 } while (unlikely(_nd != _d) || unlikely(y != x));
2520 /*
2521 * Unlink from 'd'. At least one reference remains (now anonymous), so
2522 * no one else is spinning to try to delete this page from 'd'.
2523 */
2524 d->tot_pages--;
2525 list_del(&page->list);
2527 spin_unlock(&d->page_alloc_lock);
2529 return 0;
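/*
 * do_update_va_mapping() updates the single PTE mapping 'va' in the
 * current page tables and then performs the TLB maintenance requested
 * by 'flags': UVMF_TLB_FLUSH or UVMF_INVLPG, applied locally
 * (UVMF_LOCAL), to all dirty CPUs (UVMF_ALL), or to a caller-supplied
 * vcpu bitmap.
 */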
2532 int do_update_va_mapping(unsigned long va, u64 val64,
2533 unsigned long flags)
2535 l1_pgentry_t val = l1e_from_intpte(val64);
2536 struct vcpu *v = current;
2537 struct domain *d = v->domain;
2538 unsigned int cpu = smp_processor_id();
2539 unsigned long vmask, bmap_ptr;
2540 cpumask_t pmask;
2541 int rc = 0;
2543 perfc_incrc(calls_to_update_va);
2545 if ( unlikely(!__addr_ok(va) && !shadow_mode_external(d)) )
2546 return -EINVAL;
2548 LOCK_BIGLOCK(d);
2550 cleanup_writable_pagetable(d);
2552 if ( unlikely(shadow_mode_enabled(d)) )
2553 check_pagetable(v, "pre-va"); /* debug */
2555 if ( unlikely(!mod_l1_entry(&linear_pg_table[l1_linear_offset(va)],
2556 val)) )
2557 rc = -EINVAL;
2559 if ( likely(rc == 0) && unlikely(shadow_mode_enabled(d)) )
2561 if ( unlikely(percpu_info[cpu].foreign &&
2562 (shadow_mode_translate(d) ||
2563 shadow_mode_translate(percpu_info[cpu].foreign))) )
2565 /*
2566 * The foreign domain's pfns are in a different namespace. There's
2567 * not enough information in just a gpte to figure out how to
2568 * (re-)shadow this entry.
2569 */
2570 domain_crash(d);
2573 rc = shadow_do_update_va_mapping(va, val, v);
2575 check_pagetable(v, "post-va"); /* debug */
2578 switch ( flags & UVMF_FLUSHTYPE_MASK )
2580 case UVMF_TLB_FLUSH:
2581 switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
2583 case UVMF_LOCAL:
2584 if ( unlikely(shadow_mode_enabled(d)) )
2585 shadow_sync_all(d);
2586 local_flush_tlb();
2587 break;
2588 case UVMF_ALL:
2589 flush_tlb_mask(d->domain_dirty_cpumask);
2590 break;
2591 default:
2592 if ( unlikely(get_user(vmask, (unsigned long *)bmap_ptr)) )
2593 rc = -EFAULT;
2594 pmask = vcpumask_to_pcpumask(d, vmask);
2595 flush_tlb_mask(pmask);
2596 break;
2598 break;
2600 case UVMF_INVLPG:
2601 switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
2603 case UVMF_LOCAL:
2604 if ( unlikely(shadow_mode_enabled(d)) )
2605 shadow_invlpg(current, va);
2606 local_flush_tlb_one(va);
2607 break;
2608 case UVMF_ALL:
2609 flush_tlb_one_mask(d->domain_dirty_cpumask, va);
2610 break;
2611 default:
2612 if ( unlikely(get_user(vmask, (unsigned long *)bmap_ptr)) )
2613 rc = -EFAULT;
2614 pmask = vcpumask_to_pcpumask(d, vmask);
2615 flush_tlb_one_mask(pmask, va);
2616 break;
2618 break;
2621 process_deferred_ops(cpu);
2623 UNLOCK_BIGLOCK(d);
2625 return rc;
2628 int do_update_va_mapping_otherdomain(unsigned long va, u64 val64,
2629 unsigned long flags,
2630 domid_t domid)
2632 unsigned int cpu = smp_processor_id();
2633 int rc;
2635 if ( unlikely(!IS_PRIV(current->domain)) )
2636 return -EPERM;
2638 if ( !set_foreigndom(cpu, domid) )
2639 return -ESRCH;
2641 rc = do_update_va_mapping(va, val64, flags);
2643 return rc;
2648 /*************************
2649 * Descriptor Tables
2650 */
2652 void destroy_gdt(struct vcpu *v)
2654 int i;
2655 unsigned long pfn;
2657 v->arch.guest_context.gdt_ents = 0;
2658 for ( i = 0; i < FIRST_RESERVED_GDT_PAGE; i++ )
2660 if ( (pfn = l1e_get_pfn(v->arch.perdomain_ptes[i])) != 0 )
2661 put_page_and_type(mfn_to_page(pfn));
2662 v->arch.perdomain_ptes[i] = l1e_empty();
2663 v->arch.guest_context.gdt_frames[i] = 0;
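/*
 * set_gdt() validates and installs a new guest GDT: every frame is
 * checked and typed as PGT_gdt_page, the old GDT is torn down, and the
 * new frames are mapped through the per-domain PTEs. A failure part
 * way through drops the references already taken.
 */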
2668 long set_gdt(struct vcpu *v,
2669 unsigned long *frames,
2670 unsigned int entries)
2672 struct domain *d = v->domain;
2673 /* NB. There are 512 8-byte entries per GDT page. */
2674 int i, nr_pages = (entries + 511) / 512;
2675 unsigned long mfn;
2677 if ( entries > FIRST_RESERVED_GDT_ENTRY )
2678 return -EINVAL;
2680 shadow_sync_all(d);
2682 /* Check the pages in the new GDT. */
2683 for ( i = 0; i < nr_pages; i++ ) {
2684 mfn = frames[i] = gmfn_to_mfn(d, frames[i]);
2685 if ( !mfn_valid(mfn) ||
2686 !get_page_and_type(mfn_to_page(mfn), d, PGT_gdt_page) )
2687 goto fail;
2690 /* Tear down the old GDT. */
2691 destroy_gdt(v);
2693 /* Install the new GDT. */
2694 v->arch.guest_context.gdt_ents = entries;
2695 for ( i = 0; i < nr_pages; i++ )
2697 v->arch.guest_context.gdt_frames[i] = frames[i];
2698 v->arch.perdomain_ptes[i] =
2699 l1e_from_pfn(frames[i], __PAGE_HYPERVISOR);
2702 return 0;
2704 fail:
2705 while ( i-- > 0 )
2706 put_page_and_type(mfn_to_page(frames[i]));
2707 return -EINVAL;
2711 long do_set_gdt(XEN_GUEST_HANDLE(ulong) frame_list, unsigned int entries)
2713 int nr_pages = (entries + 511) / 512;
2714 unsigned long frames[16];
2715 long ret;
2717 /* Rechecked in set_gdt, but ensures a sane limit for copy_from_guest(). */
2718 if ( entries > FIRST_RESERVED_GDT_ENTRY )
2719 return -EINVAL;
2721 if ( copy_from_guest((unsigned long *)frames, frame_list, nr_pages) )
2722 return -EFAULT;
2724 LOCK_BIGLOCK(current->domain);
2726 if ( (ret = set_gdt(current, frames, entries)) == 0 )
2727 local_flush_tlb();
2729 UNLOCK_BIGLOCK(current->domain);
2731 return ret;
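/*
 * do_update_descriptor() writes a single 8-byte GDT/LDT descriptor at
 * machine address 'pa'. The descriptor must pass check_descriptor(),
 * and the target frame must be usable as a GDT page, an LDT page, or
 * an ordinary writable page.
 */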
2735 long do_update_descriptor(u64 pa, u64 desc)
2737 struct domain *dom = current->domain;
2738 unsigned long gmfn = pa >> PAGE_SHIFT;
2739 unsigned long mfn;
2740 unsigned int offset;
2741 struct desc_struct *gdt_pent, d;
2742 struct page_info *page;
2743 long ret = -EINVAL;
2745 offset = ((unsigned int)pa & ~PAGE_MASK) / sizeof(struct desc_struct);
2747 *(u64 *)&d = desc;
2749 LOCK_BIGLOCK(dom);
2751 if ( !VALID_MFN(mfn = gmfn_to_mfn(dom, gmfn)) ||
2752 (((unsigned int)pa % sizeof(struct desc_struct)) != 0) ||
2753 !mfn_valid(mfn) ||
2754 !check_descriptor(&d) )
2756 UNLOCK_BIGLOCK(dom);
2757 return -EINVAL;
2760 page = mfn_to_page(mfn);
2761 if ( unlikely(!get_page(page, dom)) )
2763 UNLOCK_BIGLOCK(dom);
2764 return -EINVAL;
2767 /* Check if the given frame is in use in an unsafe context. */
2768 switch ( page->u.inuse.type_info & PGT_type_mask )
2770 case PGT_gdt_page:
2771 if ( unlikely(!get_page_type(page, PGT_gdt_page)) )
2772 goto out;
2773 break;
2774 case PGT_ldt_page:
2775 if ( unlikely(!get_page_type(page, PGT_ldt_page)) )
2776 goto out;
2777 break;
2778 default:
2779 if ( unlikely(!get_page_type(page, PGT_writable_page)) )
2780 goto out;
2781 break;
2784 if ( shadow_mode_enabled(dom) )
2786 shadow_lock(dom);
2788 __mark_dirty(dom, mfn);
2790 if ( page_is_page_table(page) && !page_out_of_sync(page) )
2791 shadow_mark_mfn_out_of_sync(current, gmfn, mfn);
2794 /* All is good so make the update. */
2795 gdt_pent = map_domain_page(mfn);
2796 memcpy(&gdt_pent[offset], &d, 8);
2797 unmap_domain_page(gdt_pent);
2799 if ( shadow_mode_enabled(dom) )
2800 shadow_unlock(dom);
2802 put_page_type(page);
2804 ret = 0; /* success */
2806 out:
2807 put_page(page);
2809 UNLOCK_BIGLOCK(dom);
2811 return ret;
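/*
 * XENMEM_add_to_physmap lets a translated guest place one of a small
 * set of Xen-provided frames (currently the shared-info page or a
 * grant-table frame) at a guest pseudophysical address of its
 * choosing, removing any previous mapping of that gpfn first.
 */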
2815 long arch_memory_op(int op, XEN_GUEST_HANDLE(void) arg)
2817 switch ( op )
2819 case XENMEM_add_to_physmap:
2821 struct xen_add_to_physmap xatp;
2822 unsigned long mfn = 0, gpfn;
2823 struct domain *d;
2825 if ( copy_from_guest(&xatp, arg, 1) )
2826 return -EFAULT;
2828 if ( (d = find_domain_by_id(xatp.domid)) == NULL )
2829 return -ESRCH;
2831 switch ( xatp.space )
2833 case XENMAPSPACE_shared_info:
2834 if ( xatp.idx == 0 )
2835 mfn = virt_to_mfn(d->shared_info);
2836 break;
2837 case XENMAPSPACE_grant_table:
2838 if ( xatp.idx < NR_GRANT_FRAMES )
2839 mfn = virt_to_mfn(d->grant_table->shared) + xatp.idx;
2840 break;
2841 default:
2842 break;
2845 if ( !shadow_mode_translate(d) || (mfn == 0) )
2847 put_domain(d);
2848 return -EINVAL;
2851 LOCK_BIGLOCK(d);
2853 /* Remove previously mapped page if it was present. */
2854 if ( mfn_valid(gmfn_to_mfn(d, xatp.gpfn)) )
2855 guest_remove_page(d, xatp.gpfn);
2857 /* Unmap from old location, if any. */
2858 gpfn = get_gpfn_from_mfn(mfn);
2859 if ( gpfn != INVALID_M2P_ENTRY )
2860 guest_physmap_remove_page(d, gpfn, mfn);
2862 /* Map at new location. */
2863 guest_physmap_add_page(d, xatp.gpfn, mfn);
2865 UNLOCK_BIGLOCK(d);
2867 put_domain(d);
2869 break;
2872 default:
2873 return subarch_memory_op(op, arg);
2876 return 0;
2880 /*************************
2881 * Writable Pagetables
2882 */
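/*
 * Overview: when a guest writes to one of its own L1 page-table pages,
 * ptwr_do_page_fault() snapshots the L1, disconnects it from the
 * current address space in the ACTIVE case, and grants the guest a
 * writable mapping so that a batch of updates can proceed without
 * faulting. ptwr_flush() later re-protects the page, and
 * revalidate_l1() re-checks every modified entry against the snapshot,
 * fixing up reference counts. Updates that cannot safely be batched
 * fall back to single-instruction emulation via ptwr_emulated_update().
 */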
2884 #ifdef VVERBOSE
2885 int ptwr_debug = 0x0;
2886 #define PTWR_PRINTK(_f, _a...) \
2887 do { if ( unlikely(ptwr_debug) ) printk( _f , ## _a ); } while ( 0 )
2888 #define PTWR_PRINT_WHICH (which ? 'I' : 'A')
2889 #else
2890 #define PTWR_PRINTK(_f, _a...) ((void)0)
2891 #endif
2894 #ifdef PERF_ARRAYS
2896 /**************** writeable pagetables profiling functions *****************/
2898 #define ptwr_eip_buckets 256
2900 int ptwr_eip_stat_threshold[] = {1, 10, 50, 100, L1_PAGETABLE_ENTRIES};
2902 #define ptwr_eip_stat_thresholdN (sizeof(ptwr_eip_stat_threshold)/sizeof(int))
2904 typedef struct {
2905 unsigned long eip;
2906 domid_t id;
2907 u32 val[ptwr_eip_stat_thresholdN];
2908 } ptwr_eip_stat_t;
2910 ptwr_eip_stat_t ptwr_eip_stats[ptwr_eip_buckets];
2912 static inline unsigned int ptwr_eip_stat_hash( unsigned long eip, domid_t id )
2914 return (((unsigned long) id) ^ eip ^ (eip>>8) ^ (eip>>16) ^ (eip>>24)) %
2915 ptwr_eip_buckets;
2918 static void ptwr_eip_stat_inc(u32 *n)
2920 unsigned int i, j;
2922 if ( ++(*n) != 0 )
2923 return;
2925 *n = ~0;
2927 /* Re-scale all buckets. */
2928 for ( i = 0; i < ptwr_eip_buckets; i++ )
2929 for ( j = 0; j < ptwr_eip_stat_thresholdN; j++ )
2930 ptwr_eip_stats[i].val[j] >>= 1;
2933 static void ptwr_eip_stat_update(unsigned long eip, domid_t id, int modified)
2935 unsigned int i, j, b;
2937 i = b = ptwr_eip_stat_hash(eip, id);
2939 do
2941 if ( !ptwr_eip_stats[i].eip )
2943 /* doesn't exist */
2944 ptwr_eip_stats[i].eip = eip;
2945 ptwr_eip_stats[i].id = id;
2946 memset(ptwr_eip_stats[i].val, 0, sizeof(ptwr_eip_stats[i].val));
2949 if ( ptwr_eip_stats[i].eip == eip && ptwr_eip_stats[i].id == id)
2951 for ( j = 0; j < ptwr_eip_stat_thresholdN; j++ )
2952 if ( modified <= ptwr_eip_stat_threshold[j] )
2953 break;
2954 BUG_ON(j >= ptwr_eip_stat_thresholdN);
2955 ptwr_eip_stat_inc(&ptwr_eip_stats[i].val[j]);
2956 return;
2959 i = (i+1) % ptwr_eip_buckets;
2961 while ( i != b );
2963 printk("ptwr_eip_stat: too many EIPs in use!\n");
2965 ptwr_eip_stat_print();
2966 ptwr_eip_stat_reset();
2969 void ptwr_eip_stat_reset(void)
2971 memset(ptwr_eip_stats, 0, sizeof(ptwr_eip_stats));
2974 void ptwr_eip_stat_print(void)
2976 struct domain *e;
2977 domid_t d;
2978 unsigned int i, j;
2980 for_each_domain( e )
2982 d = e->domain_id;
2984 for ( i = 0; i < ptwr_eip_buckets; i++ )
2986 if ( !ptwr_eip_stats[i].eip || ptwr_eip_stats[i].id != d )
2987 continue;
2989 printk("D %5d eip %p ",
2990 ptwr_eip_stats[i].id, (void *)ptwr_eip_stats[i].eip);
2992 for ( j = 0; j < ptwr_eip_stat_thresholdN; j++ )
2993 printk("<=%u %4u \t",
2994 ptwr_eip_stat_threshold[j],
2995 ptwr_eip_stats[i].val[j]);
2996 printk("\n");
3001 #else /* PERF_ARRAYS */
3003 #define ptwr_eip_stat_update(eip, id, modified) ((void)0)
3005 #endif
3007 /*******************************************************************/
3009 /* Re-validate a given p.t. page, given its prior snapshot */
3010 int revalidate_l1(
3011 struct domain *d, l1_pgentry_t *l1page, l1_pgentry_t *snapshot)
3013 l1_pgentry_t ol1e, nl1e;
3014 int modified = 0, i;
3016 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
3018 ol1e = snapshot[i];
3019 nl1e = l1page[i];
3021 if ( likely(l1e_get_intpte(ol1e) == l1e_get_intpte(nl1e)) )
3022 continue;
3024 /* Update number of entries modified. */
3025 modified++;
3027 /*
3028 * Fast path for PTEs that have merely been write-protected
3029 * (e.g., during a Unix fork()). A strict reduction in privilege.
3030 */
3031 if ( likely(l1e_get_intpte(ol1e) == (l1e_get_intpte(nl1e)|_PAGE_RW)) )
3033 if ( likely(l1e_get_flags(nl1e) & _PAGE_PRESENT) )
3034 put_page_type(mfn_to_page(l1e_get_pfn(nl1e)));
3035 continue;
3038 if ( unlikely(!get_page_from_l1e(nl1e, d)) )
3040 /*
3041 * Make the remaining p.t's consistent before crashing, so the
3042 * reference counts are correct.
3043 */
3044 memcpy(&l1page[i], &snapshot[i],
3045 (L1_PAGETABLE_ENTRIES - i) * sizeof(l1_pgentry_t));
3047 /* Crash the offending domain. */
3048 MEM_LOG("ptwr: Could not revalidate l1 page");
3049 domain_crash(d);
3050 break;
3053 put_page_from_l1e(ol1e, d);
3056 return modified;
3060 /* Flush the given writable p.t. page and write-protect it again. */
3061 void ptwr_flush(struct domain *d, const int which)
3063 unsigned long l1va;
3064 l1_pgentry_t *pl1e, pte, *ptep;
3065 l2_pgentry_t *pl2e;
3066 unsigned int modified;
3068 #ifdef CONFIG_X86_64
3069 struct vcpu *v = current;
3070 int user_mode = !(v->arch.flags & TF_kernel_mode);
3071 #endif
3073 ASSERT(!shadow_mode_enabled(d));
3075 if ( unlikely(d->arch.ptwr[which].vcpu != current) )
3076 /* Don't use write_ptbase: it may switch to guest_user on x86/64! */
3077 write_cr3(pagetable_get_paddr(
3078 d->arch.ptwr[which].vcpu->arch.guest_table));
3079 else
3080 TOGGLE_MODE();
3082 l1va = d->arch.ptwr[which].l1va;
3083 ptep = (l1_pgentry_t *)&linear_pg_table[l1_linear_offset(l1va)];
3085 /*
3086 * STEP 1. Write-protect the p.t. page so no more updates can occur.
3087 */
3089 if ( unlikely(__get_user(pte.l1, &ptep->l1)) )
3091 MEM_LOG("ptwr: Could not read pte at %p", ptep);
3092 /*
3093 * Really a bug. We could read this PTE during the initial fault,
3094 * and pagetables can't have changed in the meantime.
3095 */
3096 BUG();
3098 PTWR_PRINTK("[%c] disconnected_l1va at %p is %"PRIpte"\n",
3099 PTWR_PRINT_WHICH, ptep, l1e_get_intpte(pte));
3100 l1e_remove_flags(pte, _PAGE_RW);
3102 /* Write-protect the p.t. page in the guest page table. */
3103 if ( unlikely(__put_user(pte, ptep)) )
3105 MEM_LOG("ptwr: Could not update pte at %p", ptep);
3106 /*
3107 * Really a bug. We could write this PTE during the initial fault,
3108 * and pagetables can't have changed in the meantime.
3109 */
3110 BUG();
3113 /* Ensure that there are no stale writable mappings in any TLB. */
3114 /* NB. INVLPG is a serialising instruction: flushes pending updates. */
3115 flush_tlb_one_mask(d->domain_dirty_cpumask, l1va);
3116 PTWR_PRINTK("[%c] disconnected_l1va at %p now %"PRIpte"\n",
3117 PTWR_PRINT_WHICH, ptep, l1e_get_intpte(pte));
3119 /*
3120 * STEP 2. Validate any modified PTEs.
3121 */
3123 if ( likely(d == current->domain) )
3125 pl1e = map_domain_page(l1e_get_pfn(pte));
3126 modified = revalidate_l1(d, pl1e, d->arch.ptwr[which].page);
3127 unmap_domain_page(pl1e);
3128 perfc_incr_histo(wpt_updates, modified, PT_UPDATES);
3129 ptwr_eip_stat_update(d->arch.ptwr[which].eip, d->domain_id, modified);
3130 d->arch.ptwr[which].prev_nr_updates = modified;
3132 else
3134 /*
3135 * Must make a temporary global mapping, since we are running in the
3136 * wrong address space, so no access to our own mapcache.
3137 */
3138 pl1e = map_domain_page_global(l1e_get_pfn(pte));
3139 modified = revalidate_l1(d, pl1e, d->arch.ptwr[which].page);
3140 unmap_domain_page_global(pl1e);
3143 /*
3144 * STEP 3. Reattach the L1 p.t. page into the current address space.
3145 */
3147 if ( which == PTWR_PT_ACTIVE )
3149 pl2e = &__linear_l2_table[d->arch.ptwr[which].l2_idx];
3150 l2e_add_flags(*pl2e, _PAGE_PRESENT);
3153 /*
3154 * STEP 4. Final tidy-up.
3155 */
3157 d->arch.ptwr[which].l1va = 0;
3159 if ( unlikely(d->arch.ptwr[which].vcpu != current) )
3160 write_ptbase(current);
3161 else
3162 TOGGLE_MODE();
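/*
 * Emulation path: decode and apply a single guest write to a read-only
 * L1 page with full get_page_from_l1e() validation, rather than using
 * the batched snapshot/revalidate scheme above.
 */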
3165 static int ptwr_emulated_update(
3166 unsigned long addr,
3167 paddr_t old,
3168 paddr_t val,
3169 unsigned int bytes,
3170 unsigned int do_cmpxchg)
3172 unsigned long pfn, l1va;
3173 struct page_info *page;
3174 l1_pgentry_t pte, ol1e, nl1e, *pl1e;
3175 struct domain *d = current->domain;
3177 /* Aligned access only, thank you. */
3178 if ( !access_ok(addr, bytes) || ((addr & (bytes-1)) != 0) )
3180 MEM_LOG("ptwr_emulate: Unaligned or bad size ptwr access (%d, %lx)",
3181 bytes, addr);
3182 return X86EMUL_UNHANDLEABLE;
3185 /* Turn a sub-word access into a full-word access. */
3186 if ( bytes != sizeof(paddr_t) )
3188 int rc;
3189 paddr_t full;
3190 unsigned int offset = addr & (sizeof(paddr_t)-1);
3192 /* Align address; read full word. */
3193 addr &= ~(sizeof(paddr_t)-1);
3194 if ( (rc = x86_emulate_read_std(addr, (unsigned long *)&full,
3195 sizeof(paddr_t))) )
3196 return rc;
3197 /* Mask out bits provided by caller. */
3198 full &= ~((((paddr_t)1 << (bytes*8)) - 1) << (offset*8));
3199 /* Shift the caller value and OR in the missing bits. */
3200 val &= (((paddr_t)1 << (bytes*8)) - 1);
3201 val <<= (offset)*8;
3202 val |= full;
3203 /* Also fill in missing parts of the cmpxchg old value. */
3204 old &= (((paddr_t)1 << (bytes*8)) - 1);
3205 old <<= (offset)*8;
3206 old |= full;
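/*
 * Worked example (illustrative values): with 8-byte PTEs, a 4-byte
 * write of 0x12345678 at offset 4 proceeds as follows: addr is aligned
 * down to an 8-byte boundary, 'full' holds the current 8-byte PTE, the
 * high dword of 'full' is masked off, and val becomes
 * (0x12345678ULL << 32) | (low dword of full) -- an aligned 8-byte
 * write that changes only the bytes the guest actually wrote.
 */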
3209 /*
3210 * We must not emulate an update to a PTE that is temporarily marked
3211 * writable by the batched ptwr logic, else we can corrupt page refcnts!
3212 */
3213 if ( ((l1va = d->arch.ptwr[PTWR_PT_ACTIVE].l1va) != 0) &&
3214 (l1_linear_offset(l1va) == l1_linear_offset(addr)) )
3215 ptwr_flush(d, PTWR_PT_ACTIVE);
3216 if ( ((l1va = d->arch.ptwr[PTWR_PT_INACTIVE].l1va) != 0) &&
3217 (l1_linear_offset(l1va) == l1_linear_offset(addr)) )
3218 ptwr_flush(d, PTWR_PT_INACTIVE);
3220 /* Read the PTE that maps the page being updated. */
3221 if ( __copy_from_user(&pte, &linear_pg_table[l1_linear_offset(addr)],
3222 sizeof(pte)) )
3224 MEM_LOG("ptwr_emulate: Cannot read thru linear_pg_table");
3225 return X86EMUL_UNHANDLEABLE;
3228 pfn = l1e_get_pfn(pte);
3229 page = mfn_to_page(pfn);
3231 /* We are looking only for read-only mappings of p.t. pages. */
3232 ASSERT((l1e_get_flags(pte) & (_PAGE_RW|_PAGE_PRESENT)) == _PAGE_PRESENT);
3233 ASSERT((page->u.inuse.type_info & PGT_type_mask) == PGT_l1_page_table);
3234 ASSERT((page->u.inuse.type_info & PGT_count_mask) != 0);
3235 ASSERT(page_get_owner(page) == d);
3237 /* Check the new PTE. */
3238 nl1e = l1e_from_intpte(val);
3239 if ( unlikely(!get_page_from_l1e(nl1e, d)) )
3241 MEM_LOG("ptwr_emulate: could not get_page_from_l1e()");
3242 return X86EMUL_UNHANDLEABLE;
3245 /* Checked successfully: do the update (write or cmpxchg). */
3246 pl1e = map_domain_page(page_to_mfn(page));
3247 pl1e = (l1_pgentry_t *)((unsigned long)pl1e + (addr & ~PAGE_MASK));
3248 if ( do_cmpxchg )
3250 ol1e = l1e_from_intpte(old);
3251 if ( cmpxchg((intpte_t *)pl1e, old, val) != old )
3253 unmap_domain_page(pl1e);
3254 put_page_from_l1e(nl1e, d);
3255 return X86EMUL_CMPXCHG_FAILED;
3258 else
3260 ol1e = *pl1e;
3261 *pl1e = nl1e;
3263 unmap_domain_page(pl1e);
3265 /* Finally, drop the old PTE. */
3266 put_page_from_l1e(ol1e, d);
3268 return X86EMUL_CONTINUE;
3271 static int ptwr_emulated_write(
3272 unsigned long addr,
3273 unsigned long val,
3274 unsigned int bytes)
3276 return ptwr_emulated_update(addr, 0, val, bytes, 0);
3279 static int ptwr_emulated_cmpxchg(
3280 unsigned long addr,
3281 unsigned long old,
3282 unsigned long new,
3283 unsigned int bytes)
3285 return ptwr_emulated_update(addr, old, new, bytes, 1);
3288 static int ptwr_emulated_cmpxchg8b(
3289 unsigned long addr,
3290 unsigned long old,
3291 unsigned long old_hi,
3292 unsigned long new,
3293 unsigned long new_hi)
3295 if ( CONFIG_PAGING_LEVELS == 2 )
3296 return X86EMUL_UNHANDLEABLE;
3297 else
3298 return ptwr_emulated_update(
3299 addr, ((u64)old_hi << 32) | old, ((u64)new_hi << 32) | new, 8, 1);
3302 static struct x86_mem_emulator ptwr_mem_emulator = {
3303 .read_std = x86_emulate_read_std,
3304 .write_std = x86_emulate_write_std,
3305 .read_emulated = x86_emulate_read_std,
3306 .write_emulated = ptwr_emulated_write,
3307 .cmpxchg_emulated = ptwr_emulated_cmpxchg,
3308 .cmpxchg8b_emulated = ptwr_emulated_cmpxchg8b
3309 };
3311 /* Write page fault handler: check if guest is trying to modify a PTE. */
3312 int ptwr_do_page_fault(struct domain *d, unsigned long addr,
3313 struct cpu_user_regs *regs)
3315 unsigned long pfn;
3316 struct page_info *page;
3317 l1_pgentry_t *pl1e, pte;
3318 l2_pgentry_t *pl2e, l2e;
3319 int which, flags;
3320 unsigned long l2_idx;
3322 if ( unlikely(shadow_mode_enabled(d)) )
3323 return 0;
3325 /*
3326 * Attempt to read the PTE that maps the VA being accessed. By checking for
3327 * PDE validity in the L2 we avoid many expensive fixups in __get_user().
3328 */
3329 if ( !(l2e_get_flags(__linear_l2_table[l2_linear_offset(addr)]) &
3330 _PAGE_PRESENT) ||
3331 __copy_from_user(&pte,&linear_pg_table[l1_linear_offset(addr)],
3332 sizeof(pte)) )
3334 return 0;
3337 pfn = l1e_get_pfn(pte);
3338 page = mfn_to_page(pfn);
3340 #ifdef CONFIG_X86_64
3341 #define WRPT_PTE_FLAGS (_PAGE_RW | _PAGE_PRESENT | _PAGE_USER)
3342 #else
3343 #define WRPT_PTE_FLAGS (_PAGE_RW | _PAGE_PRESENT)
3344 #endif
3346 /*
3347 * Check the required flags for a valid wrpt mapping. If the page is
3348 * already writable then we can return straight to the guest (SMP race).
3349 * We decide whether or not to propagate the fault by testing for write
3350 * permissions in page directories by writing back to the linear mapping.
3351 */
3352 if ( (flags = l1e_get_flags(pte) & WRPT_PTE_FLAGS) == WRPT_PTE_FLAGS )
3353 return __put_user(
3354 pte.l1, &linear_pg_table[l1_linear_offset(addr)].l1) ?
3355 0 : EXCRET_not_a_fault;
3357 /* We are looking only for read-only mappings of p.t. pages. */
3358 if ( ((flags | _PAGE_RW) != WRPT_PTE_FLAGS) ||
3359 ((page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table) ||
3360 ((page->u.inuse.type_info & PGT_count_mask) == 0) ||
3361 (page_get_owner(page) != d) )
3363 return 0;
3366 #if 0 /* Leave this in as useful for debugging */
3367 goto emulate;
3368 #endif
3370 PTWR_PRINTK("ptwr_page_fault on l1 pt at va %lx, pfn %lx, eip %lx\n",
3371 addr, pfn, (unsigned long)regs->eip);
3373 /* Get the L2 index at which this L1 p.t. is always mapped. */
3374 l2_idx = page->u.inuse.type_info & PGT_va_mask;
3375 if ( unlikely(l2_idx >= PGT_va_unknown) )
3376 goto emulate; /* Urk! This L1 is mapped in multiple L2 slots! */
3377 l2_idx >>= PGT_va_shift;
3379 if ( unlikely(l2_idx == l2_linear_offset(addr)) )
3380 goto emulate; /* Urk! Pagetable maps itself! */
3382 /*
3383 * Is the L1 p.t. mapped into the current address space? If so we call it
3384 * an ACTIVE p.t., otherwise it is INACTIVE.
3385 */
3386 pl2e = &__linear_l2_table[l2_idx];
3387 which = PTWR_PT_INACTIVE;
3389 if ( (__get_user(l2e.l2, &pl2e->l2) == 0) && (l2e_get_pfn(l2e) == pfn) )
3391 /*
3392 * Check the PRESENT bit to set ACTIVE mode.
3393 * If the PRESENT bit is clear, we may be conflicting with the current
3394 * ACTIVE p.t. (it may be the same p.t. mapped at another virt addr).
3395 * The ptwr_flush call below will restore the PRESENT bit.
3396 */
3397 if ( likely(l2e_get_flags(l2e) & _PAGE_PRESENT) ||
3398 (d->arch.ptwr[PTWR_PT_ACTIVE].l1va &&
3399 (l2_idx == d->arch.ptwr[PTWR_PT_ACTIVE].l2_idx)) )
3400 which = PTWR_PT_ACTIVE;
3403 /*
3404 * If this is a multi-processor guest then ensure that the page is hooked
3405 * into at most one L2 table, which must be the one running on this VCPU.
3406 */
3407 if ( (d->vcpu[0]->next_in_list != NULL) &&
3408 ((page->u.inuse.type_info & PGT_count_mask) !=
3409 (!!(page->u.inuse.type_info & PGT_pinned) +
3410 (which == PTWR_PT_ACTIVE))) )
3412 /* Could be conflicting writable mappings from other VCPUs. */
3413 cleanup_writable_pagetable(d);
3414 goto emulate;
3417 /*
3418 * We only allow one ACTIVE and one INACTIVE p.t. to be updated at a
3419 * time. If there is already one, we must flush it out.
3420 */
3421 if ( d->arch.ptwr[which].l1va )
3422 ptwr_flush(d, which);
3424 /*
3425 * If last batch made no updates then we are probably stuck. Emulate this
3426 * update to ensure we make progress.
3427 */
3428 if ( d->arch.ptwr[which].prev_nr_updates == 0 )
3430 /* Ensure that we don't get stuck in an emulation-only rut. */
3431 d->arch.ptwr[which].prev_nr_updates = 1;
3432 goto emulate;
3435 PTWR_PRINTK("[%c] batched ptwr_page_fault at va %lx, pt for %08lx, "
3436 "pfn %lx\n", PTWR_PRINT_WHICH, addr,
3437 l2_idx << L2_PAGETABLE_SHIFT, pfn);
3439 d->arch.ptwr[which].l1va = addr | 1;
3440 d->arch.ptwr[which].l2_idx = l2_idx;
3441 d->arch.ptwr[which].vcpu = current;
3443 #ifdef PERF_ARRAYS
3444 d->arch.ptwr[which].eip = regs->eip;
3445 #endif
3447 /* For safety, disconnect the L1 p.t. page from current space. */
3448 if ( which == PTWR_PT_ACTIVE )
3450 l2e_remove_flags(*pl2e, _PAGE_PRESENT);
3451 flush_tlb_mask(d->domain_dirty_cpumask);
3454 /* Temporarily map the L1 page, and make a copy of it. */
3455 pl1e = map_domain_page(pfn);
3456 memcpy(d->arch.ptwr[which].page, pl1e, PAGE_SIZE);
3457 unmap_domain_page(pl1e);
3459 /* Finally, make the p.t. page writable by the guest OS. */
3460 l1e_add_flags(pte, _PAGE_RW);
3461 if ( unlikely(__put_user(pte.l1,
3462 &linear_pg_table[l1_linear_offset(addr)].l1)) )
3464 MEM_LOG("ptwr: Could not update pte at %p", (unsigned long *)
3465 &linear_pg_table[l1_linear_offset(addr)]);
3466 /* Toss the writable pagetable state and crash. */
3467 d->arch.ptwr[which].l1va = 0;
3468 domain_crash(d);
3469 return 0;
3472 return EXCRET_fault_fixed;
3474 emulate:
3475 if ( x86_emulate_memop(guest_cpu_user_regs(), addr,
3476 &ptwr_mem_emulator, X86EMUL_MODE_HOST) )
3477 return 0;
3478 perfc_incrc(ptwr_emulations);
3479 return EXCRET_fault_fixed;
3482 int ptwr_init(struct domain *d)
3484 void *x = alloc_xenheap_page();
3485 void *y = alloc_xenheap_page();
3487 if ( (x == NULL) || (y == NULL) )
3489 free_xenheap_page(x);
3490 free_xenheap_page(y);
3491 return -ENOMEM;
3494 d->arch.ptwr[PTWR_PT_ACTIVE].page = x;
3495 d->arch.ptwr[PTWR_PT_INACTIVE].page = y;
3497 return 0;
3500 void ptwr_destroy(struct domain *d)
3502 LOCK_BIGLOCK(d);
3503 cleanup_writable_pagetable(d);
3504 UNLOCK_BIGLOCK(d);
3505 free_xenheap_page(d->arch.ptwr[PTWR_PT_ACTIVE].page);
3506 free_xenheap_page(d->arch.ptwr[PTWR_PT_INACTIVE].page);
3509 void cleanup_writable_pagetable(struct domain *d)
3511 if ( unlikely(!VM_ASSIST(d, VMASST_TYPE_writable_pagetables)) )
3512 return;
3514 if ( unlikely(shadow_mode_enabled(d)) )
3516 shadow_sync_all(d);
3518 else
3520 if ( d->arch.ptwr[PTWR_PT_ACTIVE].l1va )
3521 ptwr_flush(d, PTWR_PT_ACTIVE);
3522 if ( d->arch.ptwr[PTWR_PT_INACTIVE].l1va )
3523 ptwr_flush(d, PTWR_PT_INACTIVE);
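/*
 * map_pages_to_xen() creates mappings in Xen's own address space:
 * nr_mfns machine frames are mapped at 'virt', using superpage (PSE)
 * L2 entries whenever the range is suitably aligned and MAP_SMALL_PAGES
 * is not requested, and splitting an existing superpage into an L1
 * table when a partial update requires it.
 */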
3527 int map_pages_to_xen(
3528 unsigned long virt,
3529 unsigned long mfn,
3530 unsigned long nr_mfns,
3531 unsigned long flags)
3533 l2_pgentry_t *pl2e, ol2e;
3534 l1_pgentry_t *pl1e, ol1e;
3535 unsigned int i;
3537 unsigned int map_small_pages = !!(flags & MAP_SMALL_PAGES);
3538 flags &= ~MAP_SMALL_PAGES;
3540 while ( nr_mfns != 0 )
3542 pl2e = virt_to_xen_l2e(virt);
3544 if ( ((((virt>>PAGE_SHIFT) | mfn) & ((1<<PAGETABLE_ORDER)-1)) == 0) &&
3545 (nr_mfns >= (1<<PAGETABLE_ORDER)) &&
3546 !map_small_pages )
3548 /* Super-page mapping. */
3549 ol2e = *pl2e;
3550 *pl2e = l2e_from_pfn(mfn, flags|_PAGE_PSE);
3552 if ( (l2e_get_flags(ol2e) & _PAGE_PRESENT) )
3554 local_flush_tlb_pge();
3555 if ( !(l2e_get_flags(ol2e) & _PAGE_PSE) )
3556 free_xen_pagetable(l2e_get_page(ol2e));
3559 virt += 1UL << L2_PAGETABLE_SHIFT;
3560 mfn += 1UL << PAGETABLE_ORDER;
3561 nr_mfns -= 1UL << PAGETABLE_ORDER;
3563 else
3565 /* Normal page mapping. */
3566 if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) )
3568 pl1e = page_to_virt(alloc_xen_pagetable());
3569 clear_page(pl1e);
3570 *pl2e = l2e_from_page(virt_to_page(pl1e), __PAGE_HYPERVISOR);
3572 else if ( l2e_get_flags(*pl2e) & _PAGE_PSE )
3574 pl1e = page_to_virt(alloc_xen_pagetable());
3575 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
3576 pl1e[i] = l1e_from_pfn(
3577 l2e_get_pfn(*pl2e) + i,
3578 l2e_get_flags(*pl2e) & ~_PAGE_PSE);
3579 *pl2e = l2e_from_page(virt_to_page(pl1e), __PAGE_HYPERVISOR);
3580 local_flush_tlb_pge();
3583 pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(virt);
3584 ol1e = *pl1e;
3585 *pl1e = l1e_from_pfn(mfn, flags);
3586 if ( (l1e_get_flags(ol1e) & _PAGE_PRESENT) )
3587 local_flush_tlb_one(virt);
3589 virt += 1UL << L1_PAGETABLE_SHIFT;
3590 mfn += 1UL;
3591 nr_mfns -= 1UL;
3595 return 0;
3598 void __set_fixmap(
3599 enum fixed_addresses idx, unsigned long p, unsigned long flags)
3601 if ( unlikely(idx >= __end_of_fixed_addresses) )
3602 BUG();
3603 map_pages_to_xen(fix_to_virt(idx), p >> PAGE_SHIFT, 1, flags);
3606 #ifdef MEMORY_GUARD
3608 void memguard_init(void)
3610 map_pages_to_xen(
3611 PAGE_OFFSET, 0, xenheap_phys_end >> PAGE_SHIFT,
3612 __PAGE_HYPERVISOR|MAP_SMALL_PAGES);
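/*
 * Guarding works by remapping the range with _PAGE_PRESENT cleared, so
 * a stray access to guarded memory takes a page fault; unguarding
 * restores the normal hypervisor mapping. The xenheap is mapped with
 * MAP_SMALL_PAGES above so that later per-page permission changes do
 * not have to split superpages.
 */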
3615 static void __memguard_change_range(void *p, unsigned long l, int guard)
3617 unsigned long _p = (unsigned long)p;
3618 unsigned long _l = (unsigned long)l;
3619 unsigned long flags = __PAGE_HYPERVISOR | MAP_SMALL_PAGES;
3621 /* Ensure we are dealing with a page-aligned whole number of pages. */
3622 ASSERT((_p&PAGE_MASK) != 0);
3623 ASSERT((_l&PAGE_MASK) != 0);
3624 ASSERT((_p&~PAGE_MASK) == 0);
3625 ASSERT((_l&~PAGE_MASK) == 0);
3627 if ( guard )
3628 flags &= ~_PAGE_PRESENT;
3630 map_pages_to_xen(
3631 _p, virt_to_maddr(p) >> PAGE_SHIFT, _l >> PAGE_SHIFT, flags);
3634 void memguard_guard_range(void *p, unsigned long l)
3636 __memguard_change_range(p, l, 1);
3639 void memguard_unguard_range(void *p, unsigned long l)
3641 __memguard_change_range(p, l, 0);
3644 #endif
3646 /*
3647 * Local variables:
3648 * mode: C
3649 * c-set-style: "BSD"
3650 * c-basic-offset: 4
3651 * tab-width: 4
3652 * indent-tabs-mode: nil
3653 * End:
3654 */