ia64/xen-unstable

view xen/arch/x86/mm.c @ 8978:066ac36725f3

Disable pagetable pinning for shadow-mode-refcount guests.

Signed-off-by: Steven Smith <sos22@cam.ac.uk>
Signed-off-by: Keir Fraser <keir@xensource.com>
author kaf24@firebug.cl.cam.ac.uk
date Thu Feb 23 14:50:00 2006 +0100 (2006-02-23)
parents b246f429f683
children 5adaa6908727
1 /******************************************************************************
2 * arch/x86/mm.c
3 *
4 * Copyright (c) 2002-2005 K A Fraser
5 * Copyright (c) 2004 Christian Limpach
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
22 /*
23 * A description of the x86 page table API:
24 *
25 * Domains trap to do_mmu_update with a list of update requests.
26 * This is a list of (ptr, val) pairs, where the requested operation
27 * is *ptr = val.
28 *
29 * Reference counting of pages:
30 * ----------------------------
31 * Each page has two refcounts: tot_count and type_count.
32 *
33 * TOT_COUNT is the obvious reference count. It counts all uses of a
34 * physical page frame by a domain, including uses as a page directory,
35 * a page table, or simple mappings via a PTE. This count prevents a
36 * domain from releasing a frame back to the free pool when it still holds
37 * a reference to it.
38 *
39 * TYPE_COUNT is more subtle. A frame can be put to one of three
40 * mutually-exclusive uses: it might be used as a page directory, or a
41 * page table, or it may be mapped writable by the domain [of course, a
42 * frame may be used in none of these three ways!].
43 * So, type_count is a count of the number of times a frame is being
44 * referred to in its current incarnation. Therefore, a page can only
45 * change its type when its type count is zero.
46 *
47 * Pinning the page type:
48 * ----------------------
49 * The type of a page can be pinned/unpinned with the commands
50 * MMUEXT_[UN]PIN_L?_TABLE. Each page can be pinned exactly once (that is,
51 * pinning is not reference counted, so it can't be nested).
52 * This is useful to prevent a page's type count falling to zero, at which
53 * point safety checks would need to be carried out next time the count
54 * is increased again.
55 *
56 * A further note on writable page mappings:
57 * -----------------------------------------
58 * For simplicity, the count of writable mappings for a page may not
59 * correspond to reality. The 'writable count' is incremented for every
60 * PTE which maps the page with the _PAGE_RW flag set. However, for
61 * write access to be possible the page directory entry must also have
62 * its _PAGE_RW bit set. We do not check this as it complicates the
63 * reference counting considerably [consider the case of multiple
64 * directory entries referencing a single page table, some with the RW
65 * bit set, others not -- it starts getting a bit messy].
66 * In normal use, this simplification shouldn't be a problem.
67 * However, the logic can be added if required.
68 *
69 * One more note on read-only page mappings:
70 * -----------------------------------------
71 * We want domains to be able to map pages for read-only access. The
72 * main reason is that page tables and directories should be readable
73 * by a domain, but it would not be safe for them to be writable.
74 * However, domains have free access to rings 1 & 2 of the Intel
75 * privilege model. In terms of page protection, these are considered
76 * to be part of 'supervisor mode'. The WP bit in CR0 controls whether
77 * read-only restrictions are respected in supervisor mode -- if the
78 * bit is clear then any mapped page is writable.
79 *
80 * We get round this by always setting the WP bit and disallowing
81 * updates to it. This is very unlikely to cause a problem for guest
82 * OSes, which will generally use the WP bit to simplify copy-on-write
83 * implementation (in that case, the OS wants a fault when it writes to
84 * an application-supplied buffer).
85 */
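/*
 * Illustrative guest-side sketch (an addition for clarity, not part of the
 * original file): how a paravirtual guest might submit one update through
 * this interface. The wrapper name HYPERVISOR_mmu_update and the locals
 * pte_maddr/new_pte are assumptions about the guest environment; the
 * (ptr, val) encoding and the MMU_NORMAL_PT_UPDATE sub-command in the low
 * bits of ptr match the handler in do_mmu_update() below.
 *
 *     struct mmu_update req;
 *     req.ptr = pte_maddr | MMU_NORMAL_PT_UPDATE;   (machine address of PTE)
 *     req.val = new_pte;                            (desired PTE contents)
 *     if ( HYPERVISOR_mmu_update(&req, 1, NULL, DOMID_SELF) < 0 )
 *         BUG();
 */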
87 #include <xen/config.h>
88 #include <xen/init.h>
89 #include <xen/kernel.h>
90 #include <xen/lib.h>
91 #include <xen/mm.h>
92 #include <xen/sched.h>
93 #include <xen/errno.h>
94 #include <xen/perfc.h>
95 #include <xen/irq.h>
96 #include <xen/softirq.h>
97 #include <xen/domain_page.h>
98 #include <xen/event.h>
99 #include <xen/iocap.h>
100 #include <asm/shadow.h>
101 #include <asm/page.h>
102 #include <asm/flushtlb.h>
103 #include <asm/io.h>
104 #include <asm/uaccess.h>
105 #include <asm/ldt.h>
106 #include <asm/x86_emulate.h>
107 #include <public/memory.h>
109 #ifdef VERBOSE
110 #define MEM_LOG(_f, _a...) \
111 printk("DOM%u: (file=mm.c, line=%d) " _f "\n", \
112 current->domain->domain_id , __LINE__ , ## _a )
113 #else
114 #define MEM_LOG(_f, _a...) ((void)0)
115 #endif
117 /*
118 * Both do_mmuext_op() and do_mmu_update():
119 * We steal the m.s.b. of the @count parameter to indicate whether this
120 * invocation of do_mmu_update() is resuming a previously preempted call.
121 */
122 #define MMU_UPDATE_PREEMPTED (~(~0U>>1))
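/*
 * For example, the preemption handling below re-issues an invocation that
 * has completed i of its entries with a count of
 * ((count - i) | MMU_UPDATE_PREEMPTED); the resumed call strips the flag
 * with count &= ~MMU_UPDATE_PREEMPTED and accumulates the number of
 * completed entries in *pdone.
 */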
124 static void free_l2_table(struct page_info *page);
125 static void free_l1_table(struct page_info *page);
127 static int mod_l2_entry(l2_pgentry_t *, l2_pgentry_t, unsigned long,
128 unsigned long type);
129 static int mod_l1_entry(l1_pgentry_t *, l1_pgentry_t);
131 /* Used to defer flushing of memory structures. */
132 static struct {
133 #define DOP_FLUSH_TLB (1<<0) /* Flush the local TLB. */
134 #define DOP_FLUSH_ALL_TLBS (1<<1) /* Flush TLBs of all VCPUs of current dom. */
135 #define DOP_RELOAD_LDT (1<<2) /* Reload the LDT shadow mapping. */
136 unsigned int deferred_ops;
137 /* If non-NULL, specifies a foreign subject domain for some operations. */
138 struct domain *foreign;
139 } __cacheline_aligned percpu_info[NR_CPUS];
141 /*
142 * Returns the current foreign domain; defaults to the currently-executing
143 * domain if a foreign override hasn't been specified.
144 */
145 #define FOREIGNDOM (percpu_info[smp_processor_id()].foreign ?: current->domain)
147 /* Private domain structs for DOMID_XEN and DOMID_IO. */
148 static struct domain *dom_xen, *dom_io;
150 /* Frame table and its size in pages. */
151 struct page_info *frame_table;
152 unsigned long max_page;
153 unsigned long total_pages;
155 void __init init_frametable(void)
156 {
157 unsigned long nr_pages, page_step, i, mfn;
159 frame_table = (struct page_info *)FRAMETABLE_VIRT_START;
161 nr_pages = PFN_UP(max_page * sizeof(*frame_table));
162 page_step = (1 << L2_PAGETABLE_SHIFT) >> PAGE_SHIFT;
164 for ( i = 0; i < nr_pages; i += page_step )
165 {
166 mfn = alloc_boot_pages(min(nr_pages - i, page_step), page_step);
167 if ( mfn == 0 )
168 panic("Not enough memory for frame table\n");
169 map_pages_to_xen(
170 FRAMETABLE_VIRT_START + (i << PAGE_SHIFT),
171 mfn, page_step, PAGE_HYPERVISOR);
172 }
174 memset(frame_table, 0, nr_pages << PAGE_SHIFT);
175 }
177 void arch_init_memory(void)
178 {
179 extern void subarch_init_memory(struct domain *);
181 unsigned long i, pfn, rstart_pfn, rend_pfn;
182 struct page_info *page;
184 memset(percpu_info, 0, sizeof(percpu_info));
186 /*
187 * Initialise our DOMID_XEN domain.
188 * Any Xen-heap pages that we will allow to be mapped will have
189 * their domain field set to dom_xen.
190 */
191 dom_xen = alloc_domain();
192 atomic_set(&dom_xen->refcnt, 1);
193 dom_xen->domain_id = DOMID_XEN;
195 /*
196 * Initialise our DOMID_IO domain.
197 * This domain owns I/O pages that are within the range of the page_info
198 * array. Mappings occur at the privilege level of the caller.
199 */
200 dom_io = alloc_domain();
201 atomic_set(&dom_io->refcnt, 1);
202 dom_io->domain_id = DOMID_IO;
204 /* First 1MB of RAM is historically marked as I/O. */
205 for ( i = 0; i < 0x100; i++ )
206 {
207 page = mfn_to_page(i);
208 page->count_info = PGC_allocated | 1;
209 page->u.inuse.type_info = PGT_writable_page | PGT_validated | 1;
210 page_set_owner(page, dom_io);
211 }
213 /* Any areas not specified as RAM by the e820 map are considered I/O. */
214 for ( i = 0, pfn = 0; i < e820.nr_map; i++ )
215 {
216 if ( e820.map[i].type != E820_RAM )
217 continue;
218 /* Every page from cursor to start of next RAM region is I/O. */
219 rstart_pfn = PFN_UP(e820.map[i].addr);
220 rend_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
221 for ( ; pfn < rstart_pfn; pfn++ )
222 {
223 BUG_ON(!mfn_valid(pfn));
224 page = mfn_to_page(pfn);
225 page->count_info = PGC_allocated | 1;
226 page->u.inuse.type_info = PGT_writable_page | PGT_validated | 1;
227 page_set_owner(page, dom_io);
228 }
229 /* Skip the RAM region. */
230 pfn = rend_pfn;
231 }
232 BUG_ON(pfn != max_page);
234 subarch_init_memory(dom_xen);
235 }
237 void write_ptbase(struct vcpu *v)
238 {
239 write_cr3(pagetable_get_paddr(v->arch.monitor_table));
240 }
242 void invalidate_shadow_ldt(struct vcpu *v)
243 {
244 int i;
245 unsigned long pfn;
246 struct page_info *page;
248 if ( v->arch.shadow_ldt_mapcnt == 0 )
249 return;
251 v->arch.shadow_ldt_mapcnt = 0;
253 for ( i = 16; i < 32; i++ )
254 {
255 pfn = l1e_get_pfn(v->arch.perdomain_ptes[i]);
256 if ( pfn == 0 ) continue;
257 v->arch.perdomain_ptes[i] = l1e_empty();
258 page = mfn_to_page(pfn);
259 ASSERT_PAGE_IS_TYPE(page, PGT_ldt_page);
260 ASSERT_PAGE_IS_DOMAIN(page, v->domain);
261 put_page_and_type(page);
262 }
264 /* Dispose of the (now possibly invalid) mappings from the TLB. */
265 percpu_info[v->processor].deferred_ops |= DOP_FLUSH_TLB | DOP_RELOAD_LDT;
266 }
269 static int alloc_segdesc_page(struct page_info *page)
270 {
271 struct desc_struct *descs;
272 int i;
274 descs = map_domain_page(page_to_mfn(page));
276 for ( i = 0; i < 512; i++ )
277 if ( unlikely(!check_descriptor(&descs[i])) )
278 goto fail;
280 unmap_domain_page(descs);
281 return 1;
283 fail:
284 unmap_domain_page(descs);
285 return 0;
286 }
289 /* Map shadow page at offset @off. */
290 int map_ldt_shadow_page(unsigned int off)
291 {
292 struct vcpu *v = current;
293 struct domain *d = v->domain;
294 unsigned long gmfn, mfn;
295 l1_pgentry_t l1e, nl1e;
296 unsigned long gva = v->arch.guest_context.ldt_base + (off << PAGE_SHIFT);
297 int res;
299 #if defined(__x86_64__)
300 /* If in user mode, switch to kernel mode just to read LDT mapping. */
301 int user_mode = !(v->arch.flags & TF_kernel_mode);
302 #define TOGGLE_MODE() if ( user_mode ) toggle_guest_mode(v)
303 #elif defined(__i386__)
304 #define TOGGLE_MODE() ((void)0)
305 #endif
307 BUG_ON(unlikely(in_irq()));
309 shadow_sync_va(v, gva);
311 TOGGLE_MODE();
312 __copy_from_user(&l1e, &linear_pg_table[l1_linear_offset(gva)],
313 sizeof(l1e));
314 TOGGLE_MODE();
316 if ( unlikely(!(l1e_get_flags(l1e) & _PAGE_PRESENT)) )
317 return 0;
319 gmfn = l1e_get_pfn(l1e);
320 mfn = gmfn_to_mfn(d, gmfn);
321 if ( unlikely(!VALID_MFN(mfn)) )
322 return 0;
324 res = get_page_and_type(mfn_to_page(mfn), d, PGT_ldt_page);
326 if ( !res && unlikely(shadow_mode_refcounts(d)) )
327 {
328 shadow_lock(d);
329 shadow_remove_all_write_access(d, gmfn, mfn);
330 res = get_page_and_type(mfn_to_page(mfn), d, PGT_ldt_page);
331 shadow_unlock(d);
332 }
334 if ( unlikely(!res) )
335 return 0;
337 nl1e = l1e_from_pfn(mfn, l1e_get_flags(l1e) | _PAGE_RW);
339 v->arch.perdomain_ptes[off + 16] = nl1e;
340 v->arch.shadow_ldt_mapcnt++;
342 return 1;
343 }
346 static int get_page_from_pagenr(unsigned long page_nr, struct domain *d)
347 {
348 struct page_info *page = mfn_to_page(page_nr);
350 if ( unlikely(!mfn_valid(page_nr)) || unlikely(!get_page(page, d)) )
351 {
352 MEM_LOG("Could not get page ref for pfn %lx", page_nr);
353 return 0;
354 }
356 return 1;
357 }
360 static int get_page_and_type_from_pagenr(unsigned long page_nr,
361 unsigned long type,
362 struct domain *d)
363 {
364 struct page_info *page = mfn_to_page(page_nr);
366 if ( unlikely(!get_page_from_pagenr(page_nr, d)) )
367 return 0;
369 if ( unlikely(!get_page_type(page, type)) )
370 {
371 put_page(page);
372 return 0;
373 }
375 return 1;
376 }
378 /*
379 * We allow root tables to map each other (a.k.a. linear page tables). It
380 * needs some special care with reference counts and access permissions:
381 * 1. The mapping entry must be read-only, or the guest may get write access
382 * to its own PTEs.
383 * 2. We must only bump the reference counts for an *already validated*
384 * L2 table, or we can end up in a deadlock in get_page_type() by waiting
385 * on a validation that is required to complete that validation.
386 * 3. We only need to increment the reference counts for the mapped page
387 * frame if it is mapped by a different root table. This is sufficient and
388 * also necessary to allow validation of a root table mapping itself.
389 */
390 static int
391 get_linear_pagetable(
392 root_pgentry_t re, unsigned long re_pfn, struct domain *d)
393 {
394 unsigned long x, y;
395 struct page_info *page;
396 unsigned long pfn;
398 ASSERT( !shadow_mode_refcounts(d) );
400 if ( (root_get_flags(re) & _PAGE_RW) )
401 {
402 MEM_LOG("Attempt to create linear p.t. with write perms");
403 return 0;
404 }
406 if ( (pfn = root_get_pfn(re)) != re_pfn )
407 {
408 /* Make sure the mapped frame belongs to the correct domain. */
409 if ( unlikely(!get_page_from_pagenr(pfn, d)) )
410 return 0;
412 /*
413 * Make sure that the mapped frame is an already-validated L2 table.
414 * If so, atomically increment the count (checking for overflow).
415 */
416 page = mfn_to_page(pfn);
417 y = page->u.inuse.type_info;
418 do {
419 x = y;
420 if ( unlikely((x & PGT_count_mask) == PGT_count_mask) ||
421 unlikely((x & (PGT_type_mask|PGT_validated)) !=
422 (PGT_root_page_table|PGT_validated)) )
423 {
424 put_page(page);
425 return 0;
426 }
427 }
428 while ( (y = cmpxchg(&page->u.inuse.type_info, x, x + 1)) != x );
429 }
431 return 1;
432 }
434 int
435 get_page_from_l1e(
436 l1_pgentry_t l1e, struct domain *d)
437 {
438 unsigned long mfn = l1e_get_pfn(l1e);
439 struct page_info *page = mfn_to_page(mfn);
440 int okay;
442 if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) )
443 return 1;
445 if ( unlikely(l1e_get_flags(l1e) & L1_DISALLOW_MASK) )
446 {
447 MEM_LOG("Bad L1 flags %x", l1e_get_flags(l1e) & L1_DISALLOW_MASK);
448 return 0;
449 }
451 if ( unlikely(!mfn_valid(mfn)) ||
452 unlikely(page_get_owner(page) == dom_io) )
453 {
454 /* DOMID_IO reverts to caller for privilege checks. */
455 if ( d == dom_io )
456 d = current->domain;
458 if ( !iomem_access_permitted(d, mfn, mfn) )
459 {
460 MEM_LOG("Non-privileged attempt to map I/O space %08lx", mfn);
461 return 0;
462 }
464 /* No reference counting for out-of-range I/O pages. */
465 if ( !mfn_valid(mfn) )
466 return 1;
468 d = dom_io;
469 }
471 okay = ((l1e_get_flags(l1e) & _PAGE_RW) ?
472 get_page_and_type(page, d, PGT_writable_page) :
473 get_page(page, d));
474 if ( !okay )
475 {
476 MEM_LOG("Error getting mfn %lx (pfn %lx) from L1 entry %" PRIpte
477 " for dom%d",
478 mfn, get_gpfn_from_mfn(mfn), l1e_get_intpte(l1e), d->domain_id);
479 }
481 return okay;
482 }
485 /* NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. */
486 static int
487 get_page_from_l2e(
488 l2_pgentry_t l2e, unsigned long pfn,
489 struct domain *d, unsigned long vaddr)
490 {
491 int rc;
493 ASSERT(!shadow_mode_refcounts(d));
495 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
496 return 1;
498 if ( unlikely((l2e_get_flags(l2e) & L2_DISALLOW_MASK)) )
499 {
500 MEM_LOG("Bad L2 flags %x", l2e_get_flags(l2e) & L2_DISALLOW_MASK);
501 return 0;
502 }
504 vaddr >>= L2_PAGETABLE_SHIFT;
505 vaddr <<= PGT_va_shift;
506 rc = get_page_and_type_from_pagenr(
507 l2e_get_pfn(l2e), PGT_l1_page_table | vaddr, d);
509 #if CONFIG_PAGING_LEVELS == 2
510 if ( unlikely(!rc) )
511 rc = get_linear_pagetable(l2e, pfn, d);
512 #endif
513 return rc;
514 }
517 #if CONFIG_PAGING_LEVELS >= 3
519 static int
520 get_page_from_l3e(
521 l3_pgentry_t l3e, unsigned long pfn,
522 struct domain *d, unsigned long vaddr)
523 {
524 int rc;
526 ASSERT(!shadow_mode_refcounts(d));
528 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
529 return 1;
531 if ( unlikely((l3e_get_flags(l3e) & L3_DISALLOW_MASK)) )
532 {
533 MEM_LOG("Bad L3 flags %x", l3e_get_flags(l3e) & L3_DISALLOW_MASK);
534 return 0;
535 }
537 vaddr >>= L3_PAGETABLE_SHIFT;
538 vaddr <<= PGT_va_shift;
539 rc = get_page_and_type_from_pagenr(
540 l3e_get_pfn(l3e),
541 PGT_l2_page_table | vaddr, d);
542 #if CONFIG_PAGING_LEVELS == 3
543 if ( unlikely(!rc) )
544 rc = get_linear_pagetable(l3e, pfn, d);
545 #endif
546 return rc;
547 }
549 #endif /* 3 level */
551 #if CONFIG_PAGING_LEVELS >= 4
553 static int
554 get_page_from_l4e(
555 l4_pgentry_t l4e, unsigned long pfn,
556 struct domain *d, unsigned long vaddr)
557 {
558 int rc;
560 ASSERT( !shadow_mode_refcounts(d) );
562 if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
563 return 1;
565 if ( unlikely((l4e_get_flags(l4e) & L4_DISALLOW_MASK)) )
566 {
567 MEM_LOG("Bad L4 flags %x", l4e_get_flags(l4e) & L4_DISALLOW_MASK);
568 return 0;
569 }
571 vaddr >>= L4_PAGETABLE_SHIFT;
572 vaddr <<= PGT_va_shift;
573 rc = get_page_and_type_from_pagenr(
574 l4e_get_pfn(l4e),
575 PGT_l3_page_table | vaddr, d);
577 if ( unlikely(!rc) )
578 rc = get_linear_pagetable(l4e, pfn, d);
580 return rc;
581 }
583 #endif /* 4 level */
586 void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d)
587 {
588 unsigned long pfn = l1e_get_pfn(l1e);
589 struct page_info *page = mfn_to_page(pfn);
590 struct domain *e;
591 struct vcpu *v;
593 if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) || !mfn_valid(pfn) )
594 return;
596 e = page_get_owner(page);
598 /*
599 * Check if this is a mapping that was established via a grant reference.
600 * If it was then we should not be here: we require that such mappings are
601 * explicitly destroyed via the grant-table interface.
602 *
603 * The upshot of this is that the guest can end up with active grants that
604 * it cannot destroy (because it no longer has a PTE to present to the
605 * grant-table interface). This can lead to subtle hard-to-catch bugs,
606 * hence a special grant PTE flag can be enabled to catch the bug early.
607 *
608 * (Note that the undestroyable active grants are not a security hole in
609 * Xen. All active grants can safely be cleaned up when the domain dies.)
610 */
611 if ( (l1e_get_flags(l1e) & _PAGE_GNTTAB) &&
612 !(d->domain_flags & (DOMF_shutdown|DOMF_dying)) )
613 {
614 MEM_LOG("Attempt to implicitly unmap a granted PTE %" PRIpte,
615 l1e_get_intpte(l1e));
616 domain_crash(d);
617 }
619 if ( l1e_get_flags(l1e) & _PAGE_RW )
620 {
621 put_page_and_type(page);
622 }
623 else
624 {
625 /* We expect this is rare so we blow the entire shadow LDT. */
626 if ( unlikely(((page->u.inuse.type_info & PGT_type_mask) ==
627 PGT_ldt_page)) &&
628 unlikely(((page->u.inuse.type_info & PGT_count_mask) != 0)) &&
629 (d == e) )
630 {
631 for_each_vcpu ( d, v )
632 invalidate_shadow_ldt(v);
633 }
634 put_page(page);
635 }
636 }
639 /*
640 * NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'.
641 * Note also that this automatically deals correctly with linear p.t.'s.
642 */
643 static void put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
644 {
645 if ( (l2e_get_flags(l2e) & _PAGE_PRESENT) &&
646 (l2e_get_pfn(l2e) != pfn) )
647 put_page_and_type(mfn_to_page(l2e_get_pfn(l2e)));
648 }
651 #if CONFIG_PAGING_LEVELS >= 3
653 static void put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn)
654 {
655 if ( (l3e_get_flags(l3e) & _PAGE_PRESENT) &&
656 (l3e_get_pfn(l3e) != pfn) )
657 put_page_and_type(mfn_to_page(l3e_get_pfn(l3e)));
658 }
660 #endif
662 #if CONFIG_PAGING_LEVELS >= 4
664 static void put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn)
665 {
666 if ( (l4e_get_flags(l4e) & _PAGE_PRESENT) &&
667 (l4e_get_pfn(l4e) != pfn) )
668 put_page_and_type(mfn_to_page(l4e_get_pfn(l4e)));
669 }
671 #endif
674 static int alloc_l1_table(struct page_info *page)
675 {
676 struct domain *d = page_get_owner(page);
677 unsigned long pfn = page_to_mfn(page);
678 l1_pgentry_t *pl1e;
679 int i;
681 ASSERT(!shadow_mode_refcounts(d));
683 pl1e = map_domain_page(pfn);
685 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
686 if ( is_guest_l1_slot(i) &&
687 unlikely(!get_page_from_l1e(pl1e[i], d)) )
688 goto fail;
690 unmap_domain_page(pl1e);
691 return 1;
693 fail:
694 MEM_LOG("Failure in alloc_l1_table: entry %d", i);
695 while ( i-- > 0 )
696 if ( is_guest_l1_slot(i) )
697 put_page_from_l1e(pl1e[i], d);
699 unmap_domain_page(pl1e);
700 return 0;
701 }
703 #ifdef CONFIG_X86_PAE
704 static int create_pae_xen_mappings(l3_pgentry_t *pl3e)
705 {
706 struct page_info *page;
707 l2_pgentry_t *pl2e;
708 l3_pgentry_t l3e3;
709 int i;
711 pl3e = (l3_pgentry_t *)((unsigned long)pl3e & PAGE_MASK);
713 /* 3rd L3 slot contains L2 with Xen-private mappings. It *must* exist. */
714 l3e3 = pl3e[3];
715 if ( !(l3e_get_flags(l3e3) & _PAGE_PRESENT) )
716 {
717 MEM_LOG("PAE L3 3rd slot is empty");
718 return 0;
719 }
721 /*
722 * The Xen-private mappings include linear mappings. The L2 thus cannot
723 * be shared by multiple L3 tables. The test here is adequate because:
724 * 1. Cannot appear in slots != 3 because the page would then have an
725 * unknown va backpointer, which get_page_type() explicitly disallows.
726 * 2. Cannot appear in another page table's L3:
727 * a. alloc_l3_table() calls this function and this check will fail
728 * b. mod_l3_entry() disallows updates to slot 3 in an existing table
729 */
730 page = l3e_get_page(l3e3);
731 BUG_ON(page->u.inuse.type_info & PGT_pinned);
732 BUG_ON((page->u.inuse.type_info & PGT_count_mask) == 0);
733 if ( (page->u.inuse.type_info & PGT_count_mask) != 1 )
734 {
735 MEM_LOG("PAE L3 3rd slot is shared");
736 return 0;
737 }
739 /* Xen private mappings. */
740 pl2e = map_domain_page(l3e_get_pfn(l3e3));
741 memcpy(&pl2e[L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1)],
742 &idle_pg_table_l2[L2_PAGETABLE_FIRST_XEN_SLOT],
743 L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
744 for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
745 pl2e[l2_table_offset(PERDOMAIN_VIRT_START) + i] =
746 l2e_from_page(
747 virt_to_page(page_get_owner(page)->arch.mm_perdomain_pt) + i,
748 __PAGE_HYPERVISOR);
749 for ( i = 0; i < (LINEARPT_MBYTES >> (L2_PAGETABLE_SHIFT - 20)); i++ )
750 pl2e[l2_table_offset(LINEAR_PT_VIRT_START) + i] =
751 (l3e_get_flags(pl3e[i]) & _PAGE_PRESENT) ?
752 l2e_from_pfn(l3e_get_pfn(pl3e[i]), __PAGE_HYPERVISOR) :
753 l2e_empty();
754 unmap_domain_page(pl2e);
756 return 1;
757 }
759 static inline int l1_backptr(
760 unsigned long *backptr, unsigned long offset_in_l2, unsigned long l2_type)
761 {
762 unsigned long l2_backptr = l2_type & PGT_va_mask;
763 BUG_ON(l2_backptr == PGT_va_unknown);
764 if ( l2_backptr == PGT_va_mutable )
765 return 0;
766 *backptr =
767 ((l2_backptr >> PGT_va_shift) << L3_PAGETABLE_SHIFT) |
768 (offset_in_l2 << L2_PAGETABLE_SHIFT);
769 return 1;
770 }
772 #elif CONFIG_X86_64
773 # define create_pae_xen_mappings(pl3e) (1)
775 static inline int l1_backptr(
776 unsigned long *backptr, unsigned long offset_in_l2, unsigned long l2_type)
777 {
778 unsigned long l2_backptr = l2_type & PGT_va_mask;
779 BUG_ON(l2_backptr == PGT_va_unknown);
781 *backptr = ((l2_backptr >> PGT_va_shift) << L3_PAGETABLE_SHIFT) |
782 (offset_in_l2 << L2_PAGETABLE_SHIFT);
783 return 1;
784 }
786 static inline int l2_backptr(
787 unsigned long *backptr, unsigned long offset_in_l3, unsigned long l3_type)
788 {
789 unsigned long l3_backptr = l3_type & PGT_va_mask;
790 BUG_ON(l3_backptr == PGT_va_unknown);
792 *backptr = ((l3_backptr >> PGT_va_shift) << L4_PAGETABLE_SHIFT) |
793 (offset_in_l3 << L3_PAGETABLE_SHIFT);
794 return 1;
795 }
797 static inline int l3_backptr(
798 unsigned long *backptr, unsigned long offset_in_l4, unsigned long l4_type)
799 {
800 unsigned long l4_backptr = l4_type & PGT_va_mask;
801 BUG_ON(l4_backptr == PGT_va_unknown);
803 *backptr = (offset_in_l4 << L4_PAGETABLE_SHIFT);
804 return 1;
805 }
806 #else
807 # define create_pae_xen_mappings(pl3e) (1)
808 # define l1_backptr(bp,l2o,l2t) \
809 ({ *(bp) = (unsigned long)(l2o) << L2_PAGETABLE_SHIFT; 1; })
810 #endif
812 static int alloc_l2_table(struct page_info *page, unsigned long type)
813 {
814 struct domain *d = page_get_owner(page);
815 unsigned long pfn = page_to_mfn(page);
816 unsigned long vaddr;
817 l2_pgentry_t *pl2e;
818 int i;
820 /* See the code in shadow_promote() to understand why this is here. */
821 if ( (PGT_base_page_table == PGT_l2_page_table) &&
822 unlikely(shadow_mode_refcounts(d)) )
823 return 1;
824 ASSERT(!shadow_mode_refcounts(d));
826 pl2e = map_domain_page(pfn);
828 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
829 {
830 if ( !l1_backptr(&vaddr, i, type) )
831 goto fail;
832 if ( is_guest_l2_slot(type, i) &&
833 unlikely(!get_page_from_l2e(pl2e[i], pfn, d, vaddr)) )
834 goto fail;
835 }
837 #if CONFIG_PAGING_LEVELS == 2
838 /* Xen private mappings. */
839 memcpy(&pl2e[L2_PAGETABLE_FIRST_XEN_SLOT],
840 &idle_pg_table[L2_PAGETABLE_FIRST_XEN_SLOT],
841 L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
842 pl2e[l2_table_offset(LINEAR_PT_VIRT_START)] =
843 l2e_from_pfn(pfn, __PAGE_HYPERVISOR);
844 for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
845 pl2e[l2_table_offset(PERDOMAIN_VIRT_START) + i] =
846 l2e_from_page(
847 virt_to_page(page_get_owner(page)->arch.mm_perdomain_pt) + i,
848 __PAGE_HYPERVISOR);
849 #endif
851 unmap_domain_page(pl2e);
852 return 1;
854 fail:
855 MEM_LOG("Failure in alloc_l2_table: entry %d", i);
856 while ( i-- > 0 )
857 if ( is_guest_l2_slot(type, i) )
858 put_page_from_l2e(pl2e[i], pfn);
860 unmap_domain_page(pl2e);
861 return 0;
862 }
865 #if CONFIG_PAGING_LEVELS >= 3
866 static int alloc_l3_table(struct page_info *page, unsigned long type)
867 {
868 struct domain *d = page_get_owner(page);
869 unsigned long pfn = page_to_mfn(page);
870 unsigned long vaddr;
871 l3_pgentry_t *pl3e;
872 int i;
874 ASSERT(!shadow_mode_refcounts(d));
876 #ifdef CONFIG_X86_PAE
877 if ( pfn >= 0x100000 )
878 {
879 MEM_LOG("PAE pgd must be below 4GB (0x%lx >= 0x100000)", pfn);
880 return 0;
881 }
882 #endif
884 pl3e = map_domain_page(pfn);
885 for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
886 {
887 #if CONFIG_PAGING_LEVELS >= 4
888 if ( !l2_backptr(&vaddr, i, type) )
889 goto fail;
890 #else
891 vaddr = (unsigned long)i << L3_PAGETABLE_SHIFT;
892 #endif
893 if ( is_guest_l3_slot(i) &&
894 unlikely(!get_page_from_l3e(pl3e[i], pfn, d, vaddr)) )
895 goto fail;
896 }
898 if ( !create_pae_xen_mappings(pl3e) )
899 goto fail;
901 unmap_domain_page(pl3e);
902 return 1;
904 fail:
905 MEM_LOG("Failure in alloc_l3_table: entry %d", i);
906 while ( i-- > 0 )
907 if ( is_guest_l3_slot(i) )
908 put_page_from_l3e(pl3e[i], pfn);
910 unmap_domain_page(pl3e);
911 return 0;
912 }
913 #else
914 #define alloc_l3_table(page, type) (0)
915 #endif
917 #if CONFIG_PAGING_LEVELS >= 4
918 static int alloc_l4_table(struct page_info *page, unsigned long type)
919 {
920 struct domain *d = page_get_owner(page);
921 unsigned long pfn = page_to_mfn(page);
922 l4_pgentry_t *pl4e = page_to_virt(page);
923 unsigned long vaddr;
924 int i;
926 /* See the code in shadow_promote() to understand why this is here. */
927 if ( (PGT_base_page_table == PGT_l4_page_table) &&
928 shadow_mode_refcounts(d) )
929 return 1;
930 ASSERT(!shadow_mode_refcounts(d));
932 for ( i = 0; i < L4_PAGETABLE_ENTRIES; i++ )
933 {
934 if ( !l3_backptr(&vaddr, i, type) )
935 goto fail;
937 if ( is_guest_l4_slot(i) &&
938 unlikely(!get_page_from_l4e(pl4e[i], pfn, d, vaddr)) )
939 goto fail;
940 }
942 /* Xen private mappings. */
943 memcpy(&pl4e[ROOT_PAGETABLE_FIRST_XEN_SLOT],
944 &idle_pg_table[ROOT_PAGETABLE_FIRST_XEN_SLOT],
945 ROOT_PAGETABLE_XEN_SLOTS * sizeof(l4_pgentry_t));
946 pl4e[l4_table_offset(LINEAR_PT_VIRT_START)] =
947 l4e_from_pfn(pfn, __PAGE_HYPERVISOR);
948 pl4e[l4_table_offset(PERDOMAIN_VIRT_START)] =
949 l4e_from_page(
950 virt_to_page(page_get_owner(page)->arch.mm_perdomain_l3),
951 __PAGE_HYPERVISOR);
953 return 1;
955 fail:
956 MEM_LOG("Failure in alloc_l4_table: entry %d", i);
957 while ( i-- > 0 )
958 if ( is_guest_l4_slot(i) )
959 put_page_from_l4e(pl4e[i], pfn);
961 return 0;
962 }
963 #else
964 #define alloc_l4_table(page, type) (0)
965 #endif
968 static void free_l1_table(struct page_info *page)
969 {
970 struct domain *d = page_get_owner(page);
971 unsigned long pfn = page_to_mfn(page);
972 l1_pgentry_t *pl1e;
973 int i;
975 pl1e = map_domain_page(pfn);
977 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
978 if ( is_guest_l1_slot(i) )
979 put_page_from_l1e(pl1e[i], d);
981 unmap_domain_page(pl1e);
982 }
985 static void free_l2_table(struct page_info *page)
986 {
987 unsigned long pfn = page_to_mfn(page);
988 l2_pgentry_t *pl2e;
989 int i;
991 pl2e = map_domain_page(pfn);
993 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
994 if ( is_guest_l2_slot(page->u.inuse.type_info, i) )
995 put_page_from_l2e(pl2e[i], pfn);
997 unmap_domain_page(pl2e);
998 }
1001 #if CONFIG_PAGING_LEVELS >= 3
1003 static void free_l3_table(struct page_info *page)
1004 {
1005 unsigned long pfn = page_to_mfn(page);
1006 l3_pgentry_t *pl3e;
1007 int i;
1009 pl3e = map_domain_page(pfn);
1011 for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
1012 if ( is_guest_l3_slot(i) )
1013 put_page_from_l3e(pl3e[i], pfn);
1015 unmap_domain_page(pl3e);
1016 }
1018 #endif
1020 #if CONFIG_PAGING_LEVELS >= 4
1022 static void free_l4_table(struct page_info *page)
1023 {
1024 unsigned long pfn = page_to_mfn(page);
1025 l4_pgentry_t *pl4e = page_to_virt(page);
1026 int i;
1028 for ( i = 0; i < L4_PAGETABLE_ENTRIES; i++ )
1029 if ( is_guest_l4_slot(i) )
1030 put_page_from_l4e(pl4e[i], pfn);
1031 }
1033 #endif
1035 static inline int update_l1e(l1_pgentry_t *pl1e,
1036 l1_pgentry_t ol1e,
1037 l1_pgentry_t nl1e)
1038 {
1039 intpte_t o = l1e_get_intpte(ol1e);
1040 intpte_t n = l1e_get_intpte(nl1e);
1042 if ( unlikely(cmpxchg_user(pl1e, o, n) != 0) ||
1043 unlikely(o != l1e_get_intpte(ol1e)) )
1044 {
1045 MEM_LOG("Failed to update %" PRIpte " -> %" PRIpte
1046 ": saw %" PRIpte,
1047 l1e_get_intpte(ol1e),
1048 l1e_get_intpte(nl1e),
1049 o);
1050 return 0;
1051 }
1052 return 1;
1053 }
1056 /* Update the L1 entry at pl1e to new value nl1e. */
1057 static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e)
1058 {
1059 l1_pgentry_t ol1e;
1060 struct domain *d = current->domain;
1062 if ( unlikely(__copy_from_user(&ol1e, pl1e, sizeof(ol1e)) != 0) )
1063 return 0;
1065 if ( unlikely(shadow_mode_refcounts(d)) )
1066 return update_l1e(pl1e, ol1e, nl1e);
1068 if ( l1e_get_flags(nl1e) & _PAGE_PRESENT )
1069 {
1070 if ( unlikely(l1e_get_flags(nl1e) & L1_DISALLOW_MASK) )
1071 {
1072 MEM_LOG("Bad L1 flags %x",
1073 l1e_get_flags(nl1e) & L1_DISALLOW_MASK);
1074 return 0;
1075 }
1077 /* Fast path for identical mapping, r/w and presence. */
1078 if ( !l1e_has_changed(ol1e, nl1e, _PAGE_RW | _PAGE_PRESENT))
1079 return update_l1e(pl1e, ol1e, nl1e);
1081 if ( unlikely(!get_page_from_l1e(nl1e, FOREIGNDOM)) )
1082 return 0;
1084 if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
1085 {
1086 put_page_from_l1e(nl1e, d);
1087 return 0;
1088 }
1089 }
1090 else
1091 {
1092 if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
1093 return 0;
1094 }
1096 put_page_from_l1e(ol1e, d);
1097 return 1;
1098 }
1100 #define UPDATE_ENTRY(_t,_p,_o,_n) ({ \
1101 intpte_t __o = cmpxchg((intpte_t *)(_p), \
1102 _t ## e_get_intpte(_o), \
1103 _t ## e_get_intpte(_n)); \
1104 if ( __o != _t ## e_get_intpte(_o) ) \
1105 MEM_LOG("Failed to update %" PRIpte " -> %" PRIpte \
1106 ": saw %" PRIpte "", \
1107 (_t ## e_get_intpte(_o)), \
1108 (_t ## e_get_intpte(_n)), \
1109 (__o)); \
1110 (__o == _t ## e_get_intpte(_o)); })
1112 /* Update the L2 entry at pl2e to new value nl2e. pl2e is within frame pfn. */
1113 static int mod_l2_entry(l2_pgentry_t *pl2e,
1114 l2_pgentry_t nl2e,
1115 unsigned long pfn,
1116 unsigned long type)
1117 {
1118 l2_pgentry_t ol2e;
1119 unsigned long vaddr = 0;
1121 if ( unlikely(!is_guest_l2_slot(type,pgentry_ptr_to_slot(pl2e))) )
1122 {
1123 MEM_LOG("Illegal L2 update attempt in Xen-private area %p", pl2e);
1124 return 0;
1125 }
1127 if ( unlikely(__copy_from_user(&ol2e, pl2e, sizeof(ol2e)) != 0) )
1128 return 0;
1130 if ( l2e_get_flags(nl2e) & _PAGE_PRESENT )
1131 {
1132 if ( unlikely(l2e_get_flags(nl2e) & L2_DISALLOW_MASK) )
1133 {
1134 MEM_LOG("Bad L2 flags %x",
1135 l2e_get_flags(nl2e) & L2_DISALLOW_MASK);
1136 return 0;
1137 }
1139 /* Fast path for identical mapping and presence. */
1140 if ( !l2e_has_changed(ol2e, nl2e, _PAGE_PRESENT))
1141 return UPDATE_ENTRY(l2, pl2e, ol2e, nl2e);
1143 if ( unlikely(!l1_backptr(&vaddr, pgentry_ptr_to_slot(pl2e), type)) ||
1144 unlikely(!get_page_from_l2e(nl2e, pfn, current->domain, vaddr)) )
1145 return 0;
1147 if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e)) )
1148 {
1149 put_page_from_l2e(nl2e, pfn);
1150 return 0;
1151 }
1152 }
1153 else if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e)) )
1154 {
1155 return 0;
1156 }
1158 put_page_from_l2e(ol2e, pfn);
1159 return 1;
1160 }
1163 #if CONFIG_PAGING_LEVELS >= 3
1165 /* Update the L3 entry at pl3e to new value nl3e. pl3e is within frame pfn. */
1166 static int mod_l3_entry(l3_pgentry_t *pl3e,
1167 l3_pgentry_t nl3e,
1168 unsigned long pfn,
1169 unsigned long type)
1171 l3_pgentry_t ol3e;
1172 unsigned long vaddr;
1173 int okay;
1175 if ( unlikely(!is_guest_l3_slot(pgentry_ptr_to_slot(pl3e))) )
1177 MEM_LOG("Illegal L3 update attempt in Xen-private area %p", pl3e);
1178 return 0;
1181 #ifdef CONFIG_X86_PAE
1182 /*
1183 * Disallow updates to final L3 slot. It contains Xen mappings, and it
1184 * would be a pain to ensure they remain continuously valid throughout.
1185 */
1186 if ( pgentry_ptr_to_slot(pl3e) >= 3 )
1187 return 0;
1188 #endif
1190 if ( unlikely(__copy_from_user(&ol3e, pl3e, sizeof(ol3e)) != 0) )
1191 return 0;
1193 if ( l3e_get_flags(nl3e) & _PAGE_PRESENT )
1195 if ( unlikely(l3e_get_flags(nl3e) & L3_DISALLOW_MASK) )
1197 MEM_LOG("Bad L3 flags %x",
1198 l3e_get_flags(nl3e) & L3_DISALLOW_MASK);
1199 return 0;
1202 /* Fast path for identical mapping and presence. */
1203 if (!l3e_has_changed(ol3e, nl3e, _PAGE_PRESENT))
1204 return UPDATE_ENTRY(l3, pl3e, ol3e, nl3e);
1206 #if CONFIG_PAGING_LEVELS >= 4
1207 if ( unlikely(!l2_backptr(&vaddr, pgentry_ptr_to_slot(pl3e), type)) ||
1208 unlikely(!get_page_from_l3e(nl3e, pfn, current->domain, vaddr)) )
1209 return 0;
1210 #else
1211 vaddr = (((unsigned long)pl3e & ~PAGE_MASK) / sizeof(l3_pgentry_t))
1212 << L3_PAGETABLE_SHIFT;
1213 if ( unlikely(!get_page_from_l3e(nl3e, pfn, current->domain, vaddr)) )
1214 return 0;
1215 #endif
1217 if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e)) )
1219 put_page_from_l3e(nl3e, pfn);
1220 return 0;
1223 else if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e)) )
1225 return 0;
1228 okay = create_pae_xen_mappings(pl3e);
1229 BUG_ON(!okay);
1231 put_page_from_l3e(ol3e, pfn);
1232 return 1;
1235 #endif
1237 #if CONFIG_PAGING_LEVELS >= 4
1239 /* Update the L4 entry at pl4e to new value nl4e. pl4e is within frame pfn. */
1240 static int mod_l4_entry(l4_pgentry_t *pl4e,
1241 l4_pgentry_t nl4e,
1242 unsigned long pfn,
1243 unsigned long type)
1245 l4_pgentry_t ol4e;
1246 unsigned long vaddr;
1248 if ( unlikely(!is_guest_l4_slot(pgentry_ptr_to_slot(pl4e))) )
1250 MEM_LOG("Illegal L4 update attempt in Xen-private area %p", pl4e);
1251 return 0;
1254 if ( unlikely(__copy_from_user(&ol4e, pl4e, sizeof(ol4e)) != 0) )
1255 return 0;
1257 if ( l4e_get_flags(nl4e) & _PAGE_PRESENT )
1259 if ( unlikely(l4e_get_flags(nl4e) & L4_DISALLOW_MASK) )
1261 MEM_LOG("Bad L4 flags %x",
1262 l4e_get_flags(nl4e) & L4_DISALLOW_MASK);
1263 return 0;
1266 /* Fast path for identical mapping and presence. */
1267 if (!l4e_has_changed(ol4e, nl4e, _PAGE_PRESENT))
1268 return UPDATE_ENTRY(l4, pl4e, ol4e, nl4e);
1270 if ( unlikely(!l3_backptr(&vaddr, pgentry_ptr_to_slot(pl4e), type)) ||
1271 unlikely(!get_page_from_l4e(nl4e, pfn, current->domain, vaddr)) )
1272 return 0;
1274 if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e)) )
1276 put_page_from_l4e(nl4e, pfn);
1277 return 0;
1280 else if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e)) )
1282 return 0;
1285 put_page_from_l4e(ol4e, pfn);
1286 return 1;
1289 #endif
1291 int alloc_page_type(struct page_info *page, unsigned long type)
1293 struct domain *owner = page_get_owner(page);
1295 if ( owner != NULL )
1296 mark_dirty(owner, page_to_mfn(page));
1298 switch ( type & PGT_type_mask )
1300 case PGT_l1_page_table:
1301 return alloc_l1_table(page);
1302 case PGT_l2_page_table:
1303 return alloc_l2_table(page, type);
1304 case PGT_l3_page_table:
1305 return alloc_l3_table(page, type);
1306 case PGT_l4_page_table:
1307 return alloc_l4_table(page, type);
1308 case PGT_gdt_page:
1309 case PGT_ldt_page:
1310 return alloc_segdesc_page(page);
1311 default:
1312 printk("Bad type in alloc_page_type %lx t=%" PRtype_info " c=%x\n",
1313 type, page->u.inuse.type_info,
1314 page->count_info);
1315 BUG();
1318 return 0;
1322 void free_page_type(struct page_info *page, unsigned long type)
1324 struct domain *owner = page_get_owner(page);
1325 unsigned long gmfn;
1327 if ( likely(owner != NULL) )
1329 /*
1330 * We have to flush before the next use of the linear mapping
1331 * (e.g., update_va_mapping()) or we could end up modifying a page
1332 * that is no longer a page table (and hence screw up ref counts).
1333 */
1334 percpu_info[smp_processor_id()].deferred_ops |= DOP_FLUSH_ALL_TLBS;
1336 if ( unlikely(shadow_mode_enabled(owner)) )
1338 /* Raw page tables are rewritten during save/restore. */
1339 if ( !shadow_mode_translate(owner) )
1340 mark_dirty(owner, page_to_mfn(page));
1342 if ( shadow_mode_refcounts(owner) )
1343 return;
1345 gmfn = mfn_to_gmfn(owner, page_to_mfn(page));
1346 ASSERT(VALID_M2P(gmfn));
1347 remove_shadow(owner, gmfn, type & PGT_type_mask);
1351 switch ( type & PGT_type_mask )
1353 case PGT_l1_page_table:
1354 free_l1_table(page);
1355 break;
1357 case PGT_l2_page_table:
1358 free_l2_table(page);
1359 break;
1361 #if CONFIG_PAGING_LEVELS >= 3
1362 case PGT_l3_page_table:
1363 free_l3_table(page);
1364 break;
1365 #endif
1367 #if CONFIG_PAGING_LEVELS >= 4
1368 case PGT_l4_page_table:
1369 free_l4_table(page);
1370 break;
1371 #endif
1373 default:
1374 printk("%s: type %lx pfn %lx\n",__FUNCTION__,
1375 type, page_to_mfn(page));
1376 BUG();
1381 void put_page_type(struct page_info *page)
1383 unsigned long nx, x, y = page->u.inuse.type_info;
1385 again:
1386 do {
1387 x = y;
1388 nx = x - 1;
1390 ASSERT((x & PGT_count_mask) != 0);
1392 /*
1393 * The page should always be validated while a reference is held. The
1394 * exception is during domain destruction, when we forcibly invalidate
1395 * page-table pages if we detect a referential loop.
1396 * See domain.c:relinquish_list().
1397 */
1398 ASSERT((x & PGT_validated) ||
1399 test_bit(_DOMF_dying, &page_get_owner(page)->domain_flags));
1401 if ( unlikely((nx & PGT_count_mask) == 0) )
1403 /* Record TLB information for flush later. Races are harmless. */
1404 page->tlbflush_timestamp = tlbflush_current_time();
1406 if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
1407 likely(nx & PGT_validated) )
1409 /*
1410 * Page-table pages must be unvalidated when count is zero. The
1411 * 'free' is safe because the refcnt is non-zero and validated
1412 * bit is clear => other ops will spin or fail.
1413 */
1414 if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x,
1415 x & ~PGT_validated)) != x) )
1416 goto again;
1417 /* We cleared the 'valid bit' so we do the clean up. */
1418 free_page_type(page, x);
1419 /* Carry on, but with the 'valid bit' now clear. */
1420 x &= ~PGT_validated;
1421 nx &= ~PGT_validated;
1424 else if ( unlikely(((nx & (PGT_pinned | PGT_count_mask)) ==
1425 (PGT_pinned | 1)) &&
1426 ((nx & PGT_type_mask) != PGT_writable_page)) )
1428 /* Page is now only pinned. Make the back pointer mutable again. */
1429 nx |= PGT_va_mutable;
1432 while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
1436 int get_page_type(struct page_info *page, unsigned long type)
1438 unsigned long nx, x, y = page->u.inuse.type_info;
1440 again:
1441 do {
1442 x = y;
1443 nx = x + 1;
1444 if ( unlikely((nx & PGT_count_mask) == 0) )
1446 MEM_LOG("Type count overflow on pfn %lx", page_to_mfn(page));
1447 return 0;
1449 else if ( unlikely((x & PGT_count_mask) == 0) )
1451 if ( (x & (PGT_type_mask|PGT_va_mask)) != type )
1453 if ( (x & PGT_type_mask) != (type & PGT_type_mask) )
1455 /*
1456 * On type change we check to flush stale TLB
1457 * entries. This may be unnecessary (e.g., page
1458 * was GDT/LDT) but those circumstances should be
1459 * very rare.
1460 */
1461 cpumask_t mask =
1462 page_get_owner(page)->domain_dirty_cpumask;
1463 tlbflush_filter(mask, page->tlbflush_timestamp);
1465 if ( unlikely(!cpus_empty(mask)) )
1467 perfc_incrc(need_flush_tlb_flush);
1468 flush_tlb_mask(mask);
1472 /* We lose existing type, back pointer, and validity. */
1473 nx &= ~(PGT_type_mask | PGT_va_mask | PGT_validated);
1474 nx |= type;
1476 /* No special validation needed for writable pages. */
1477 /* Page tables and GDT/LDT need to be scanned for validity. */
1478 if ( type == PGT_writable_page )
1479 nx |= PGT_validated;
1482 else
1484 if ( unlikely((x & (PGT_type_mask|PGT_va_mask)) != type) )
1486 if ( unlikely((x & PGT_type_mask) != (type & PGT_type_mask) ) )
1488 if ( current->domain == page_get_owner(page) )
1490 /*
1491 * This ensures functions like set_gdt() see up-to-date
1492 * type info without needing to clean up writable p.t.
1493 * state on the fast path.
1494 */
1495 LOCK_BIGLOCK(current->domain);
1496 cleanup_writable_pagetable(current->domain);
1497 y = page->u.inuse.type_info;
1498 UNLOCK_BIGLOCK(current->domain);
1499 /* Can we make progress now? */
1500 if ( ((y & PGT_type_mask) == (type & PGT_type_mask)) ||
1501 ((y & PGT_count_mask) == 0) )
1502 goto again;
1504 if ( ((x & PGT_type_mask) != PGT_l2_page_table) ||
1505 ((type & PGT_type_mask) != PGT_l1_page_table) )
1506 MEM_LOG("Bad type (saw %" PRtype_info
1507 " != exp %" PRtype_info ") "
1508 "for mfn %lx (pfn %lx)",
1509 x, type, page_to_mfn(page),
1510 get_gpfn_from_mfn(page_to_mfn(page)));
1511 return 0;
1513 else if ( (x & PGT_va_mask) == PGT_va_mutable )
1515 /* The va backpointer is mutable, hence we update it. */
1516 nx &= ~PGT_va_mask;
1517 nx |= type; /* we know the actual type is correct */
1519 else if ( ((type & PGT_va_mask) != PGT_va_mutable) &&
1520 ((type & PGT_va_mask) != (x & PGT_va_mask)) )
1522 #ifdef CONFIG_X86_PAE
1523 /* We use backptr as extra typing. Cannot be unknown. */
1524 if ( (type & PGT_type_mask) == PGT_l2_page_table )
1525 return 0;
1526 #endif
1527 /* This table is possibly mapped at multiple locations. */
1528 nx &= ~PGT_va_mask;
1529 nx |= PGT_va_unknown;
1532 if ( unlikely(!(x & PGT_validated)) )
1534 /* Someone else is updating validation of this page. Wait... */
1535 while ( (y = page->u.inuse.type_info) == x )
1536 cpu_relax();
1537 goto again;
1541 while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
1543 if ( unlikely(!(nx & PGT_validated)) )
1545 /* Try to validate page type; drop the new reference on failure. */
1546 if ( unlikely(!alloc_page_type(page, type)) )
1548 MEM_LOG("Error while validating mfn %lx (pfn %lx) for type %"
1549 PRtype_info ": caf=%08x taf=%" PRtype_info,
1550 page_to_mfn(page), get_gpfn_from_mfn(page_to_mfn(page)),
1551 type, page->count_info, page->u.inuse.type_info);
1552 /* No one else can get a reference. We hold the only ref. */
1553 page->u.inuse.type_info = 0;
1554 return 0;
1557 /* No one else is updating simultaneously. */
1558 __set_bit(_PGT_validated, &page->u.inuse.type_info);
1561 return 1;
1565 int new_guest_cr3(unsigned long mfn)
1567 struct vcpu *v = current;
1568 struct domain *d = v->domain;
1569 int okay;
1570 unsigned long old_base_mfn;
1572 if ( shadow_mode_refcounts(d) )
1573 okay = get_page_from_pagenr(mfn, d);
1574 else
1575 okay = get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d);
1577 if ( likely(okay) )
1579 invalidate_shadow_ldt(v);
1581 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
1582 v->arch.guest_table = mk_pagetable(mfn << PAGE_SHIFT);
1583 update_pagetables(v); /* update shadow_table and monitor_table */
1585 write_ptbase(v);
1587 if ( shadow_mode_refcounts(d) )
1588 put_page(mfn_to_page(old_base_mfn));
1589 else
1590 put_page_and_type(mfn_to_page(old_base_mfn));
1592 /* CR3 also holds a ref to its shadow... */
1593 if ( shadow_mode_enabled(d) )
1595 if ( v->arch.monitor_shadow_ref )
1596 put_shadow_ref(v->arch.monitor_shadow_ref);
1597 v->arch.monitor_shadow_ref =
1598 pagetable_get_pfn(v->arch.monitor_table);
1599 ASSERT(!page_get_owner(mfn_to_page(v->arch.monitor_shadow_ref)));
1600 get_shadow_ref(v->arch.monitor_shadow_ref);
1603 else
1605 MEM_LOG("Error while installing new baseptr %lx", mfn);
1608 return okay;
1611 static void process_deferred_ops(unsigned int cpu)
1613 unsigned int deferred_ops;
1614 struct domain *d = current->domain;
1616 deferred_ops = percpu_info[cpu].deferred_ops;
1617 percpu_info[cpu].deferred_ops = 0;
1619 if ( deferred_ops & (DOP_FLUSH_ALL_TLBS|DOP_FLUSH_TLB) )
1621 if ( shadow_mode_enabled(d) )
1622 shadow_sync_all(d);
1623 if ( deferred_ops & DOP_FLUSH_ALL_TLBS )
1624 flush_tlb_mask(d->domain_dirty_cpumask);
1625 else
1626 local_flush_tlb();
1629 if ( deferred_ops & DOP_RELOAD_LDT )
1630 (void)map_ldt_shadow_page(0);
1632 if ( unlikely(percpu_info[cpu].foreign != NULL) )
1634 put_domain(percpu_info[cpu].foreign);
1635 percpu_info[cpu].foreign = NULL;
1639 static int set_foreigndom(unsigned int cpu, domid_t domid)
1641 struct domain *e, *d = current->domain;
1642 int okay = 1;
1644 ASSERT(percpu_info[cpu].foreign == NULL);
1646 if ( likely(domid == DOMID_SELF) )
1647 goto out;
1649 if ( domid == d->domain_id )
1651 MEM_LOG("Dom %u tried to specify itself as foreign domain",
1652 d->domain_id);
1653 okay = 0;
1655 else if ( !IS_PRIV(d) )
1657 switch ( domid )
1659 case DOMID_IO:
1660 get_knownalive_domain(dom_io);
1661 percpu_info[cpu].foreign = dom_io;
1662 break;
1663 default:
1664 MEM_LOG("Dom %u cannot set foreign dom", d->domain_id);
1665 okay = 0;
1666 break;
1669 else
1671 percpu_info[cpu].foreign = e = find_domain_by_id(domid);
1672 if ( e == NULL )
1674 switch ( domid )
1676 case DOMID_XEN:
1677 get_knownalive_domain(dom_xen);
1678 percpu_info[cpu].foreign = dom_xen;
1679 break;
1680 case DOMID_IO:
1681 get_knownalive_domain(dom_io);
1682 percpu_info[cpu].foreign = dom_io;
1683 break;
1684 default:
1685 MEM_LOG("Unknown domain '%u'", domid);
1686 okay = 0;
1687 break;
1692 out:
1693 return okay;
1696 static inline cpumask_t vcpumask_to_pcpumask(
1697 struct domain *d, unsigned long vmask)
1699 unsigned int vcpu_id;
1700 cpumask_t pmask = CPU_MASK_NONE;
1701 struct vcpu *v;
1703 while ( vmask != 0 )
1705 vcpu_id = find_first_set_bit(vmask);
1706 vmask &= ~(1UL << vcpu_id);
1707 if ( (vcpu_id < MAX_VIRT_CPUS) &&
1708 ((v = d->vcpu[vcpu_id]) != NULL) )
1709 cpus_or(pmask, pmask, v->vcpu_dirty_cpumask);
1712 return pmask;
1715 int do_mmuext_op(
1716 struct mmuext_op *uops,
1717 unsigned int count,
1718 unsigned int *pdone,
1719 unsigned int foreigndom)
1721 struct mmuext_op op;
1722 int rc = 0, i = 0, okay, cpu = smp_processor_id();
1723 unsigned long mfn, type, done = 0;
1724 struct page_info *page;
1725 struct vcpu *v = current;
1726 struct domain *d = v->domain;
1728 LOCK_BIGLOCK(d);
1730 cleanup_writable_pagetable(d);
1732 if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
1734 count &= ~MMU_UPDATE_PREEMPTED;
1735 if ( unlikely(pdone != NULL) )
1736 (void)get_user(done, pdone);
1739 if ( !set_foreigndom(cpu, foreigndom) )
1741 rc = -ESRCH;
1742 goto out;
1745 if ( unlikely(!array_access_ok(uops, count, sizeof(op))) )
1747 rc = -EFAULT;
1748 goto out;
1751 for ( i = 0; i < count; i++ )
1753 if ( hypercall_preempt_check() )
1755 rc = hypercall4_create_continuation(
1756 __HYPERVISOR_mmuext_op, uops,
1757 (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
1758 break;
1761 if ( unlikely(__copy_from_user(&op, uops, sizeof(op)) != 0) )
1763 MEM_LOG("Bad __copy_from_user");
1764 rc = -EFAULT;
1765 break;
1768 okay = 1;
1769 mfn = op.arg1.mfn;
1770 page = mfn_to_page(mfn);
1772 switch ( op.cmd )
1774 case MMUEXT_PIN_L1_TABLE:
1775 type = PGT_l1_page_table | PGT_va_mutable;
1777 pin_page:
1778 if ( shadow_mode_refcounts(FOREIGNDOM) )
1779 break;
1781 okay = get_page_and_type_from_pagenr(mfn, type, FOREIGNDOM);
1782 if ( unlikely(!okay) )
1784 MEM_LOG("Error while pinning mfn %lx", mfn);
1785 break;
1788 if ( unlikely(test_and_set_bit(_PGT_pinned,
1789 &page->u.inuse.type_info)) )
1791 MEM_LOG("Mfn %lx already pinned", mfn);
1792 put_page_and_type(page);
1793 okay = 0;
1794 break;
1797 break;
1799 #ifndef CONFIG_X86_PAE /* Unsafe on PAE because of Xen-private mappings. */
1800 case MMUEXT_PIN_L2_TABLE:
1801 type = PGT_l2_page_table | PGT_va_mutable;
1802 goto pin_page;
1803 #endif
1805 case MMUEXT_PIN_L3_TABLE:
1806 type = PGT_l3_page_table | PGT_va_mutable;
1807 goto pin_page;
1809 case MMUEXT_PIN_L4_TABLE:
1810 type = PGT_l4_page_table | PGT_va_mutable;
1811 goto pin_page;
1813 case MMUEXT_UNPIN_TABLE:
1814 if ( shadow_mode_refcounts(d) )
1815 break;
1817 if ( unlikely(!(okay = get_page_from_pagenr(mfn, d))) )
1819 MEM_LOG("Mfn %lx bad domain (dom=%p)",
1820 mfn, page_get_owner(page));
1822 else if ( likely(test_and_clear_bit(_PGT_pinned,
1823 &page->u.inuse.type_info)) )
1825 put_page_and_type(page);
1826 put_page(page);
1828 else
1830 okay = 0;
1831 put_page(page);
1832 MEM_LOG("Mfn %lx not pinned", mfn);
1834 break;
1836 case MMUEXT_NEW_BASEPTR:
1837 mfn = gmfn_to_mfn(current->domain, mfn);
1838 okay = new_guest_cr3(mfn);
1839 percpu_info[cpu].deferred_ops &= ~DOP_FLUSH_TLB;
1840 break;
1842 #ifdef __x86_64__
1843 case MMUEXT_NEW_USER_BASEPTR:
1844 okay = get_page_and_type_from_pagenr(
1845 mfn, PGT_root_page_table, d);
1846 if ( unlikely(!okay) )
1848 MEM_LOG("Error while installing new mfn %lx", mfn);
1850 else
1852 unsigned long old_mfn =
1853 pagetable_get_pfn(v->arch.guest_table_user);
1854 v->arch.guest_table_user = mk_pagetable(mfn << PAGE_SHIFT);
1855 if ( old_mfn != 0 )
1856 put_page_and_type(mfn_to_page(old_mfn));
1858 break;
1859 #endif
1861 case MMUEXT_TLB_FLUSH_LOCAL:
1862 percpu_info[cpu].deferred_ops |= DOP_FLUSH_TLB;
1863 break;
1865 case MMUEXT_INVLPG_LOCAL:
1866 if ( shadow_mode_enabled(d) )
1867 shadow_invlpg(v, op.arg1.linear_addr);
1868 local_flush_tlb_one(op.arg1.linear_addr);
1869 break;
1871 case MMUEXT_TLB_FLUSH_MULTI:
1872 case MMUEXT_INVLPG_MULTI:
1874 unsigned long vmask;
1875 cpumask_t pmask;
1876 if ( unlikely(get_user(vmask, (unsigned long *)op.arg2.vcpumask)) )
1878 okay = 0;
1879 break;
1881 pmask = vcpumask_to_pcpumask(d, vmask);
1882 if ( op.cmd == MMUEXT_TLB_FLUSH_MULTI )
1883 flush_tlb_mask(pmask);
1884 else
1885 flush_tlb_one_mask(pmask, op.arg1.linear_addr);
1886 break;
1889 case MMUEXT_TLB_FLUSH_ALL:
1890 flush_tlb_mask(d->domain_dirty_cpumask);
1891 break;
1893 case MMUEXT_INVLPG_ALL:
1894 flush_tlb_one_mask(d->domain_dirty_cpumask, op.arg1.linear_addr);
1895 break;
1897 case MMUEXT_FLUSH_CACHE:
1898 if ( unlikely(!cache_flush_permitted(d)) )
1900 MEM_LOG("Non-physdev domain tried to FLUSH_CACHE.");
1901 okay = 0;
1903 else
1905 wbinvd();
1907 break;
1909 case MMUEXT_SET_LDT:
1911 unsigned long ptr = op.arg1.linear_addr;
1912 unsigned long ents = op.arg2.nr_ents;
1914 if ( shadow_mode_external(d) )
1916 MEM_LOG("ignoring SET_LDT hypercall from external "
1917 "domain %u", d->domain_id);
1918 okay = 0;
1920 else if ( ((ptr & (PAGE_SIZE-1)) != 0) ||
1921 (ents > 8192) ||
1922 !array_access_ok(ptr, ents, LDT_ENTRY_SIZE) )
1924 okay = 0;
1925 MEM_LOG("Bad args to SET_LDT: ptr=%lx, ents=%lx", ptr, ents);
1927 else if ( (v->arch.guest_context.ldt_ents != ents) ||
1928 (v->arch.guest_context.ldt_base != ptr) )
1930 invalidate_shadow_ldt(v);
1931 v->arch.guest_context.ldt_base = ptr;
1932 v->arch.guest_context.ldt_ents = ents;
1933 load_LDT(v);
1934 percpu_info[cpu].deferred_ops &= ~DOP_RELOAD_LDT;
1935 if ( ents != 0 )
1936 percpu_info[cpu].deferred_ops |= DOP_RELOAD_LDT;
1938 break;
1941 default:
1942 MEM_LOG("Invalid extended pt command 0x%x", op.cmd);
1943 okay = 0;
1944 break;
1947 if ( unlikely(!okay) )
1949 rc = -EINVAL;
1950 break;
1953 uops++;
1956 out:
1957 process_deferred_ops(cpu);
1959 /* Add incremental work we have done to the @done output parameter. */
1960 if ( unlikely(pdone != NULL) )
1961 __put_user(done + i, pdone);
1963 UNLOCK_BIGLOCK(d);
1964 return rc;
1967 int do_mmu_update(
1968 struct mmu_update *ureqs,
1969 unsigned int count,
1970 unsigned int *pdone,
1971 unsigned int foreigndom)
1973 struct mmu_update req;
1974 void *va;
1975 unsigned long gpfn, gmfn, mfn;
1976 struct page_info *page;
1977 int rc = 0, okay = 1, i = 0, cpu = smp_processor_id();
1978 unsigned int cmd, done = 0;
1979 struct vcpu *v = current;
1980 struct domain *d = v->domain;
1981 unsigned long type_info;
1982 struct domain_mmap_cache mapcache, sh_mapcache;
1984 LOCK_BIGLOCK(d);
1986 cleanup_writable_pagetable(d);
1988 if ( unlikely(shadow_mode_enabled(d)) )
1989 check_pagetable(v, "pre-mmu"); /* debug */
1991 if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
1993 count &= ~MMU_UPDATE_PREEMPTED;
1994 if ( unlikely(pdone != NULL) )
1995 (void)get_user(done, pdone);
1998 domain_mmap_cache_init(&mapcache);
1999 domain_mmap_cache_init(&sh_mapcache);
2001 if ( !set_foreigndom(cpu, foreigndom) )
2003 rc = -ESRCH;
2004 goto out;
2007 perfc_incrc(calls_to_mmu_update);
2008 perfc_addc(num_page_updates, count);
2009 perfc_incr_histo(bpt_updates, count, PT_UPDATES);
2011 if ( unlikely(!array_access_ok(ureqs, count, sizeof(req))) )
2013 rc = -EFAULT;
2014 goto out;
2017 for ( i = 0; i < count; i++ )
2019 if ( hypercall_preempt_check() )
2021 rc = hypercall4_create_continuation(
2022 __HYPERVISOR_mmu_update, ureqs,
2023 (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
2024 break;
2027 if ( unlikely(__copy_from_user(&req, ureqs, sizeof(req)) != 0) )
2029 MEM_LOG("Bad __copy_from_user");
2030 rc = -EFAULT;
2031 break;
2034 cmd = req.ptr & (sizeof(l1_pgentry_t)-1);
2035 okay = 0;
2037 switch ( cmd )
2039 /*
2040 * MMU_NORMAL_PT_UPDATE: Normal update to any level of page table.
2041 */
2042 case MMU_NORMAL_PT_UPDATE:
2044 gmfn = req.ptr >> PAGE_SHIFT;
2045 mfn = gmfn_to_mfn(d, gmfn);
2047 if ( unlikely(!get_page_from_pagenr(mfn, current->domain)) )
2049 MEM_LOG("Could not get page for normal update");
2050 break;
2053 va = map_domain_page_with_cache(mfn, &mapcache);
2054 va = (void *)((unsigned long)va +
2055 (unsigned long)(req.ptr & ~PAGE_MASK));
2056 page = mfn_to_page(mfn);
2058 switch ( (type_info = page->u.inuse.type_info) & PGT_type_mask )
2060 case PGT_l1_page_table:
2061 ASSERT( !shadow_mode_refcounts(d) );
2062 if ( likely(get_page_type(
2063 page, type_info & (PGT_type_mask|PGT_va_mask))) )
2065 l1_pgentry_t l1e;
2067 /* FIXME: doesn't work with PAE */
2068 l1e = l1e_from_intpte(req.val);
2069 okay = mod_l1_entry(va, l1e);
2070 if ( okay && unlikely(shadow_mode_enabled(d)) )
2071 shadow_l1_normal_pt_update(
2072 d, req.ptr, l1e, &sh_mapcache);
2073 put_page_type(page);
2075 break;
2076 case PGT_l2_page_table:
2077 ASSERT( !shadow_mode_refcounts(d) );
2078 if ( likely(get_page_type(
2079 page, type_info & (PGT_type_mask|PGT_va_mask))) )
2081 l2_pgentry_t l2e;
2083 /* FIXME: doesn't work with PAE */
2084 l2e = l2e_from_intpte(req.val);
2085 okay = mod_l2_entry(
2086 (l2_pgentry_t *)va, l2e, mfn, type_info);
2087 if ( okay && unlikely(shadow_mode_enabled(d)) )
2088 shadow_l2_normal_pt_update(
2089 d, req.ptr, l2e, &sh_mapcache);
2090 put_page_type(page);
2092 break;
2093 #if CONFIG_PAGING_LEVELS >= 3
2094 case PGT_l3_page_table:
2095 ASSERT( !shadow_mode_refcounts(d) );
2096 if ( likely(get_page_type(
2097 page, type_info & (PGT_type_mask|PGT_va_mask))) )
2099 l3_pgentry_t l3e;
2101 /* FIXME: doesn't work with PAE */
2102 l3e = l3e_from_intpte(req.val);
2103 okay = mod_l3_entry(va, l3e, mfn, type_info);
2104 if ( okay && unlikely(shadow_mode_enabled(d)) )
2105 shadow_l3_normal_pt_update(
2106 d, req.ptr, l3e, &sh_mapcache);
2107 put_page_type(page);
2109 break;
2110 #endif
2111 #if CONFIG_PAGING_LEVELS >= 4
2112 case PGT_l4_page_table:
2113 ASSERT( !shadow_mode_refcounts(d) );
2114 if ( likely(get_page_type(
2115 page, type_info & (PGT_type_mask|PGT_va_mask))) )
2117 l4_pgentry_t l4e;
2119 l4e = l4e_from_intpte(req.val);
2120 okay = mod_l4_entry(va, l4e, mfn, type_info);
2121 if ( okay && unlikely(shadow_mode_enabled(d)) )
2122 shadow_l4_normal_pt_update(
2123 d, req.ptr, l4e, &sh_mapcache);
2124 put_page_type(page);
2126 break;
2127 #endif
2128 default:
2129 if ( likely(get_page_type(page, PGT_writable_page)) )
2131 if ( shadow_mode_enabled(d) )
2133 shadow_lock(d);
2135 __mark_dirty(d, mfn);
2137 if ( page_is_page_table(page) &&
2138 !page_out_of_sync(page) )
2140 shadow_mark_mfn_out_of_sync(v, gmfn, mfn);
2144 *(intpte_t *)va = req.val;
2145 okay = 1;
2147 if ( shadow_mode_enabled(d) )
2148 shadow_unlock(d);
2150 put_page_type(page);
2152 break;
2155 unmap_domain_page_with_cache(va, &mapcache);
2157 put_page(page);
2158 break;
2160 case MMU_MACHPHYS_UPDATE:
2162 if ( shadow_mode_translate(FOREIGNDOM) )
2164 MEM_LOG("can't mutate m2p table of translate mode guest");
2165 break;
2168 mfn = req.ptr >> PAGE_SHIFT;
2169 gpfn = req.val;
2171 if ( unlikely(!get_page_from_pagenr(mfn, FOREIGNDOM)) )
2173 MEM_LOG("Could not get page for mach->phys update");
2174 break;
2177 set_gpfn_from_mfn(mfn, gpfn);
2178 okay = 1;
2180 mark_dirty(FOREIGNDOM, mfn);
2182 put_page(mfn_to_page(mfn));
2183 break;
2185 default:
2186 MEM_LOG("Invalid page update command %x", cmd);
2187 break;
2190 if ( unlikely(!okay) )
2192 rc = -EINVAL;
2193 break;
2196 ureqs++;
2199 out:
2200 domain_mmap_cache_destroy(&mapcache);
2201 domain_mmap_cache_destroy(&sh_mapcache);
2203 process_deferred_ops(cpu);
2205 /* Add incremental work we have done to the @done output parameter. */
2206 if ( unlikely(pdone != NULL) )
2207 __put_user(done + i, pdone);
2209 if ( unlikely(shadow_mode_enabled(d)) )
2210 check_pagetable(v, "post-mmu"); /* debug */
2212 UNLOCK_BIGLOCK(d);
2213 return rc;
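/*
 * Illustrative sketch of the request encoding handled above: the command
 * travels in the low bits of 'ptr' because a PTE's machine address is
 * always entry-size aligned, so those bits are otherwise zero.  The
 * demo_* names and command values are assumptions for illustration
 * (8-byte entries assumed), not the public Xen definitions.
 */
#include <stdint.h>

#define DEMO_NORMAL_PT_UPDATE 0   /* assumed encoding */
#define DEMO_MACHPHYS_UPDATE  1   /* assumed encoding */

typedef struct {
    uint64_t ptr;   /* machine address of the entry, low bits = command */
    uint64_t val;   /* new entry contents (or gpfn for an M2P update)   */
} demo_mmu_update_t;

/* Caller side: combine an entry-aligned machine address with a command. */
static inline demo_mmu_update_t
demo_pack_request(uint64_t pte_maddr, unsigned int cmd, uint64_t new_val)
{
    demo_mmu_update_t req;
    req.ptr = (pte_maddr & ~(uint64_t)(sizeof(uint64_t) - 1)) | cmd;
    req.val = new_val;
    return req;
}

/* Handler side: the mirror of 'cmd = req.ptr & (sizeof(l1_pgentry_t)-1)'. */
static inline unsigned int demo_request_cmd(demo_mmu_update_t req)
{
    return (unsigned int)(req.ptr & (sizeof(uint64_t) - 1));
}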
2217 static int create_grant_pte_mapping(
2218 unsigned long pte_addr, l1_pgentry_t _nl1e, struct vcpu *v)
2220 int rc = GNTST_okay;
2221 void *va;
2222 unsigned long gmfn, mfn;
2223 struct page_info *page;
2224 u32 type_info;
2225 l1_pgentry_t ol1e;
2226 struct domain *d = v->domain;
2228 ASSERT(spin_is_locked(&d->big_lock));
2229 ASSERT(!shadow_mode_refcounts(d));
2231 gmfn = pte_addr >> PAGE_SHIFT;
2232 mfn = gmfn_to_mfn(d, gmfn);
2234 if ( unlikely(!get_page_from_pagenr(mfn, current->domain)) )
2236 MEM_LOG("Could not get page for normal update");
2237 return GNTST_general_error;
2240 va = map_domain_page(mfn);
2241 va = (void *)((unsigned long)va + (pte_addr & ~PAGE_MASK));
2242 page = mfn_to_page(mfn);
2244 type_info = page->u.inuse.type_info;
2245 if ( ((type_info & PGT_type_mask) != PGT_l1_page_table) ||
2246 !get_page_type(page, type_info & (PGT_type_mask|PGT_va_mask)) )
2248 MEM_LOG("Grant map attempted to update a non-L1 page");
2249 rc = GNTST_general_error;
2250 goto failed;
2253 if ( __copy_from_user(&ol1e, (l1_pgentry_t *)va, sizeof(ol1e)) ||
2254 !update_l1e(va, ol1e, _nl1e) )
2256 put_page_type(page);
2257 rc = GNTST_general_error;
2258 goto failed;
2261 put_page_from_l1e(ol1e, d);
2263 if ( unlikely(shadow_mode_enabled(d)) )
2265 struct domain_mmap_cache sh_mapcache;
2266 domain_mmap_cache_init(&sh_mapcache);
2267 shadow_l1_normal_pt_update(d, pte_addr, _nl1e, &sh_mapcache);
2268 domain_mmap_cache_destroy(&sh_mapcache);
2271 put_page_type(page);
2273 failed:
2274 unmap_domain_page(va);
2275 put_page(page);
2276 return rc;
2279 static int destroy_grant_pte_mapping(
2280 unsigned long addr, unsigned long frame, struct domain *d)
2282 int rc = GNTST_okay;
2283 void *va;
2284 unsigned long gmfn, mfn;
2285 struct page_info *page;
2286 u32 type_info;
2287 l1_pgentry_t ol1e;
2289 ASSERT(!shadow_mode_refcounts(d));
2291 gmfn = addr >> PAGE_SHIFT;
2292 mfn = gmfn_to_mfn(d, gmfn);
2294 if ( unlikely(!get_page_from_pagenr(mfn, current->domain)) )
2296 MEM_LOG("Could not get page for normal update");
2297 return GNTST_general_error;
2300 va = map_domain_page(mfn);
2301 va = (void *)((unsigned long)va + (addr & ~PAGE_MASK));
2302 page = mfn_to_page(mfn);
2304 type_info = page->u.inuse.type_info;
2305 if ( ((type_info & PGT_type_mask) != PGT_l1_page_table) ||
2306 !get_page_type(page, type_info & (PGT_type_mask|PGT_va_mask)) )
2308 MEM_LOG("Grant map attempted to update a non-L1 page");
2309 rc = GNTST_general_error;
2310 goto failed;
2313 if ( __copy_from_user(&ol1e, (l1_pgentry_t *)va, sizeof(ol1e)) )
2315 put_page_type(page);
2316 rc = GNTST_general_error;
2317 goto failed;
2320 /* Check that the virtual address supplied is actually mapped to frame. */
2321 if ( unlikely((l1e_get_intpte(ol1e) >> PAGE_SHIFT) != frame) )
2323 MEM_LOG("PTE entry %lx for address %lx doesn't match frame %lx",
2324 (unsigned long)l1e_get_intpte(ol1e), addr, frame);
2325 put_page_type(page);
2326 rc = GNTST_general_error;
2327 goto failed;
2330 /* Delete pagetable entry. */
2331 if ( unlikely(__put_user(0, (intpte_t *)va)))
2333 MEM_LOG("Cannot delete PTE entry at %p", va);
2334 put_page_type(page);
2335 rc = GNTST_general_error;
2336 goto failed;
2339 if ( unlikely(shadow_mode_enabled(d)) )
2341 struct domain_mmap_cache sh_mapcache;
2342 domain_mmap_cache_init(&sh_mapcache);
2343 shadow_l1_normal_pt_update(d, addr, l1e_empty(), &sh_mapcache);
2344 domain_mmap_cache_destroy(&sh_mapcache);
2347 put_page_type(page);
2349 failed:
2350 unmap_domain_page(va);
2351 put_page(page);
2352 return rc;
2356 static int create_grant_va_mapping(
2357 unsigned long va, l1_pgentry_t _nl1e, struct vcpu *v)
2359 l1_pgentry_t *pl1e, ol1e;
2360 struct domain *d = v->domain;
2362 ASSERT(spin_is_locked(&d->big_lock));
2363 ASSERT(!shadow_mode_refcounts(d));
2365 /*
2366 * This is actually overkill - we don't need to sync the L1 itself,
2367 * just everything involved in getting to this L1 (i.e. we need
2368 * linear_pg_table[l1_linear_offset(va)] to be in sync)...
2369 */
2370 __shadow_sync_va(v, va);
2372 pl1e = &linear_pg_table[l1_linear_offset(va)];
2374 if ( unlikely(__copy_from_user(&ol1e, pl1e, sizeof(ol1e)) != 0) ||
2375 !update_l1e(pl1e, ol1e, _nl1e) )
2376 return GNTST_general_error;
2378 put_page_from_l1e(ol1e, d);
2380 if ( unlikely(shadow_mode_enabled(d)) )
2381 shadow_do_update_va_mapping(va, _nl1e, v);
2383 return GNTST_okay;
2386 static int destroy_grant_va_mapping(
2387 unsigned long addr, unsigned long frame)
2389 l1_pgentry_t *pl1e, ol1e;
2391 pl1e = &linear_pg_table[l1_linear_offset(addr)];
2393 if ( unlikely(__get_user(ol1e.l1, &pl1e->l1) != 0) )
2395 MEM_LOG("Could not find PTE entry for address %lx", addr);
2396 return GNTST_general_error;
2399 /*
2400 * Check that the virtual address supplied is actually mapped to
2401 * frame.
2402 */
2403 if ( unlikely(l1e_get_pfn(ol1e) != frame) )
2405 MEM_LOG("PTE entry %lx for address %lx doesn't match frame %lx",
2406 l1e_get_pfn(ol1e), addr, frame);
2407 return GNTST_general_error;
2410 /* Delete pagetable entry. */
2411 if ( unlikely(__put_user(0, &pl1e->l1)) )
2413 MEM_LOG("Cannot delete PTE entry at %p", (unsigned long *)pl1e);
2414 return GNTST_general_error;
2417 return 0;
2420 int create_grant_host_mapping(
2421 unsigned long addr, unsigned long frame, unsigned int flags)
2423 l1_pgentry_t pte = l1e_from_pfn(frame, GRANT_PTE_FLAGS);
2425 if ( (flags & GNTMAP_application_map) )
2426 l1e_add_flags(pte,_PAGE_USER);
2427 if ( !(flags & GNTMAP_readonly) )
2428 l1e_add_flags(pte,_PAGE_RW);
2430 if ( flags & GNTMAP_contains_pte )
2431 return create_grant_pte_mapping(addr, pte, current);
2432 return create_grant_va_mapping(addr, pte, current);
2435 int destroy_grant_host_mapping(
2436 unsigned long addr, unsigned long frame, unsigned int flags)
2438 if ( flags & GNTMAP_contains_pte )
2439 return destroy_grant_pte_mapping(addr, frame, current->domain);
2440 return destroy_grant_va_mapping(addr, frame);
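/*
 * Illustrative sketch of how the grant PTE built above comes together: an
 * x86 L1 entry is just (frame << PAGE_SHIFT) | flag bits, so the GNTMAP_*
 * request flags merely toggle individual PTE bits.  The demo_* names are
 * assumptions; DEMO_GRANT_BASE_FLAGS stands in for GRANT_PTE_FLAGS, whose
 * exact value is not shown here.
 */
#include <stdint.h>

#define DEMO_PAGE_SHIFT       12
#define DEMO_PAGE_PRESENT     0x001ULL   /* x86 _PAGE_PRESENT */
#define DEMO_PAGE_RW          0x002ULL   /* x86 _PAGE_RW      */
#define DEMO_PAGE_USER        0x004ULL   /* x86 _PAGE_USER    */
#define DEMO_GRANT_BASE_FLAGS DEMO_PAGE_PRESENT   /* placeholder */

static inline uint64_t demo_grant_pte(unsigned long frame,
                                      int application_map, int readonly)
{
    uint64_t pte = ((uint64_t)frame << DEMO_PAGE_SHIFT) | DEMO_GRANT_BASE_FLAGS;
    if ( application_map )
        pte |= DEMO_PAGE_USER;   /* mapping usable from guest user space */
    if ( !readonly )
        pte |= DEMO_PAGE_RW;     /* grant permits writes                 */
    return pte;
}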
2443 int steal_page_for_grant_transfer(
2444 struct domain *d, struct page_info *page)
2446 u32 _d, _nd, x, y;
2448 spin_lock(&d->page_alloc_lock);
2450 /*
2451 * The tricky bit: atomically release ownership while there is just one
2452 * benign reference to the page (PGC_allocated). If that reference
2453 * disappears then the deallocation routine will safely spin.
2454 */
2455 _d = pickle_domptr(d);
2456 _nd = page->u.inuse._domain;
2457 y = page->count_info;
2458 do {
2459 x = y;
2460 if (unlikely((x & (PGC_count_mask|PGC_allocated)) !=
2461 (1 | PGC_allocated)) || unlikely(_nd != _d)) {
2462 DPRINTK("gnttab_transfer: Bad page %p: ed=%p(%u), sd=%p,"
2463 " caf=%08x, taf=%" PRtype_info "\n",
2464 (void *) page_to_mfn(page),
2465 d, d->domain_id, unpickle_domptr(_nd), x,
2466 page->u.inuse.type_info);
2467 spin_unlock(&d->page_alloc_lock);
2468 return -1;
2470 __asm__ __volatile__(
2471 LOCK_PREFIX "cmpxchg8b %2"
2472 : "=d" (_nd), "=a" (y),
2473 "=m" (*(volatile u64 *)(&page->count_info))
2474 : "0" (_d), "1" (x), "c" (NULL), "b" (x) );
2475 } while (unlikely(_nd != _d) || unlikely(y != x));
2477 /*
2478 * Unlink from 'd'. At least one reference remains (now anonymous), so
2479 * no one else is spinning to try to delete this page from 'd'.
2480 */
2481 d->tot_pages--;
2482 list_del(&page->list);
2484 spin_unlock(&d->page_alloc_lock);
2486 return 0;
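/*
 * Illustrative sketch of the ownership steal above: the owner and the
 * reference count are compare-exchanged as ONE unit, so the page cannot
 * gain a reference or change owner between the check and the update.  A
 * portable C11 rendering of the same pattern, with simplified fields
 * (the demo_* names and layout are assumptions):
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <string.h>

typedef struct {
    uint32_t count;   /* stands in for count_info                 */
    uint32_t owner;   /* stands in for the pickled domain pointer */
} demo_page_state_t;

static bool demo_steal(_Atomic uint64_t *state, uint32_t expect_count,
                       uint32_t expect_owner, uint32_t anon_owner)
{
    uint64_t old = atomic_load(state);

    for ( ; ; )
    {
        demo_page_state_t cur;
        uint64_t new;

        memcpy(&cur, &old, sizeof(cur));
        if ( cur.count != expect_count || cur.owner != expect_owner )
            return false;            /* extra refs or new owner: bail out */
        cur.owner = anon_owner;      /* release ownership, keep the count */
        memcpy(&new, &cur, sizeof(new));
        if ( atomic_compare_exchange_weak(state, &old, new) )
            return true;
        /* Failure refreshes 'old' with the current value; retry. */
    }
}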
2489 int do_update_va_mapping(unsigned long va, u64 val64,
2490 unsigned long flags)
2492 l1_pgentry_t val = l1e_from_intpte(val64);
2493 struct vcpu *v = current;
2494 struct domain *d = v->domain;
2495 unsigned int cpu = smp_processor_id();
2496 unsigned long vmask, bmap_ptr;
2497 cpumask_t pmask;
2498 int rc = 0;
2500 perfc_incrc(calls_to_update_va);
2502 if ( unlikely(!__addr_ok(va) && !shadow_mode_external(d)) )
2503 return -EINVAL;
2505 LOCK_BIGLOCK(d);
2507 cleanup_writable_pagetable(d);
2509 if ( unlikely(shadow_mode_enabled(d)) )
2510 check_pagetable(v, "pre-va"); /* debug */
2512 if ( unlikely(!mod_l1_entry(&linear_pg_table[l1_linear_offset(va)],
2513 val)) )
2514 rc = -EINVAL;
2516 if ( likely(rc == 0) && unlikely(shadow_mode_enabled(d)) )
2518 if ( unlikely(percpu_info[cpu].foreign &&
2519 (shadow_mode_translate(d) ||
2520 shadow_mode_translate(percpu_info[cpu].foreign))) )
2522 /*
2523 * The foreign domain's pfns are in a different namespace. There's
2524 * not enough information in just a gpte to figure out how to
2525 * (re-)shadow this entry.
2526 */
2527 domain_crash(d);
2530 rc = shadow_do_update_va_mapping(va, val, v);
2532 check_pagetable(v, "post-va"); /* debug */
2535 switch ( flags & UVMF_FLUSHTYPE_MASK )
2537 case UVMF_TLB_FLUSH:
2538 switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
2540 case UVMF_LOCAL:
2541 if ( unlikely(shadow_mode_enabled(d)) )
2542 shadow_sync_all(d);
2543 local_flush_tlb();
2544 break;
2545 case UVMF_ALL:
2546 flush_tlb_mask(d->domain_dirty_cpumask);
2547 break;
2548 default:
2549 if ( unlikely(get_user(vmask, (unsigned long *)bmap_ptr)) )
2550 rc = -EFAULT;
2551 pmask = vcpumask_to_pcpumask(d, vmask);
2552 flush_tlb_mask(pmask);
2553 break;
2555 break;
2557 case UVMF_INVLPG:
2558 switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
2560 case UVMF_LOCAL:
2561 if ( unlikely(shadow_mode_enabled(d)) )
2562 shadow_invlpg(current, va);
2563 local_flush_tlb_one(va);
2564 break;
2565 case UVMF_ALL:
2566 flush_tlb_one_mask(d->domain_dirty_cpumask, va);
2567 break;
2568 default:
2569 if ( unlikely(get_user(vmask, (unsigned long *)bmap_ptr)) )
2570 rc = -EFAULT;
2571 pmask = vcpumask_to_pcpumask(d, vmask);
2572 flush_tlb_one_mask(pmask, va);
2573 break;
2575 break;
2578 process_deferred_ops(cpu);
2580 UNLOCK_BIGLOCK(d);
2582 return rc;
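/*
 * Illustrative sketch of the 'flags' decoding used above: the low bits
 * select the flush type, and whatever remains either selects local/all or
 * is a guest pointer to a vcpu bitmap.  The DEMO_* values are assumptions
 * for illustration, not the real UVMF_* constants.
 */
#define DEMO_FLUSHTYPE_MASK 0x3UL   /* assumed width of the type field */
#define DEMO_NO_FLUSH       0x0UL
#define DEMO_TLB_FLUSH      0x1UL   /* flush the whole TLB(s)          */
#define DEMO_INVLPG         0x2UL   /* flush only the one VA           */

static inline void demo_decode_flush_flags(unsigned long flags,
                                           unsigned long *flush_type,
                                           unsigned long *target_or_bitmap)
{
    *flush_type       = flags & DEMO_FLUSHTYPE_MASK;
    /* Remaining bits: local/all selector, or a pointer to a vcpu bitmap. */
    *target_or_bitmap = flags & ~DEMO_FLUSHTYPE_MASK;
}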
2585 int do_update_va_mapping_otherdomain(unsigned long va, u64 val64,
2586 unsigned long flags,
2587 domid_t domid)
2589 unsigned int cpu = smp_processor_id();
2590 int rc;
2592 if ( unlikely(!IS_PRIV(current->domain)) )
2593 return -EPERM;
2595 if ( !set_foreigndom(cpu, domid) )
2596 return -ESRCH;
2598 rc = do_update_va_mapping(va, val64, flags);
2600 return rc;
2605 /*************************
2606 * Descriptor Tables
2607 */
2609 void destroy_gdt(struct vcpu *v)
2611 int i;
2612 unsigned long pfn;
2614 v->arch.guest_context.gdt_ents = 0;
2615 for ( i = 0; i < FIRST_RESERVED_GDT_PAGE; i++ )
2617 if ( (pfn = l1e_get_pfn(v->arch.perdomain_ptes[i])) != 0 )
2618 put_page_and_type(mfn_to_page(pfn));
2619 v->arch.perdomain_ptes[i] = l1e_empty();
2620 v->arch.guest_context.gdt_frames[i] = 0;
2625 long set_gdt(struct vcpu *v,
2626 unsigned long *frames,
2627 unsigned int entries)
2629 struct domain *d = v->domain;
2630 /* NB. There are 512 8-byte entries per GDT page. */
2631 int i, nr_pages = (entries + 511) / 512;
2632 unsigned long mfn;
2634 if ( entries > FIRST_RESERVED_GDT_ENTRY )
2635 return -EINVAL;
2637 shadow_sync_all(d);
2639 /* Check the pages in the new GDT. */
2640 for ( i = 0; i < nr_pages; i++ ) {
2641 mfn = frames[i] = gmfn_to_mfn(d, frames[i]);
2642 if ( !mfn_valid(mfn) ||
2643 !get_page_and_type(mfn_to_page(mfn), d, PGT_gdt_page) )
2644 goto fail;
2647 /* Tear down the old GDT. */
2648 destroy_gdt(v);
2650 /* Install the new GDT. */
2651 v->arch.guest_context.gdt_ents = entries;
2652 for ( i = 0; i < nr_pages; i++ )
2654 v->arch.guest_context.gdt_frames[i] = frames[i];
2655 v->arch.perdomain_ptes[i] =
2656 l1e_from_pfn(frames[i], __PAGE_HYPERVISOR);
2659 return 0;
2661 fail:
2662 while ( i-- > 0 )
2663 put_page_and_type(mfn_to_page(frames[i]));
2664 return -EINVAL;
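/*
 * Illustrative sketch of the page arithmetic used above: descriptors are
 * 8 bytes and pages are 4kB, so each GDT page holds 4096/8 = 512 entries
 * and the page count is a round-up by 511.
 */
static inline unsigned int demo_gdt_pages(unsigned int entries)
{
    return (entries + 511) / 512;   /* 1..512 -> 1 page, 513 -> 2 pages */
}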
2668 long do_set_gdt(unsigned long *frame_list, unsigned int entries)
2670 int nr_pages = (entries + 511) / 512;
2671 unsigned long frames[16];
2672 long ret;
2674 /* Rechecked in set_gdt, but ensures a sane limit for copy_from_user(). */
2675 if ( entries > FIRST_RESERVED_GDT_ENTRY )
2676 return -EINVAL;
2678 if ( copy_from_user(frames, frame_list, nr_pages * sizeof(unsigned long)) )
2679 return -EFAULT;
2681 LOCK_BIGLOCK(current->domain);
2683 if ( (ret = set_gdt(current, frames, entries)) == 0 )
2684 local_flush_tlb();
2686 UNLOCK_BIGLOCK(current->domain);
2688 return ret;
2692 long do_update_descriptor(u64 pa, u64 desc)
2694 struct domain *dom = current->domain;
2695 unsigned long gmfn = pa >> PAGE_SHIFT;
2696 unsigned long mfn;
2697 unsigned int offset;
2698 struct desc_struct *gdt_pent, d;
2699 struct page_info *page;
2700 long ret = -EINVAL;
2702 offset = ((unsigned int)pa & ~PAGE_MASK) / sizeof(struct desc_struct);
2704 *(u64 *)&d = desc;
2706 LOCK_BIGLOCK(dom);
2708 if ( !VALID_MFN(mfn = gmfn_to_mfn(dom, gmfn)) ||
2709 (((unsigned int)pa % sizeof(struct desc_struct)) != 0) ||
2710 !mfn_valid(mfn) ||
2711 !check_descriptor(&d) )
2713 UNLOCK_BIGLOCK(dom);
2714 return -EINVAL;
2717 page = mfn_to_page(mfn);
2718 if ( unlikely(!get_page(page, dom)) )
2720 UNLOCK_BIGLOCK(dom);
2721 return -EINVAL;
2724 /* Check if the given frame is in use in an unsafe context. */
2725 switch ( page->u.inuse.type_info & PGT_type_mask )
2727 case PGT_gdt_page:
2728 if ( unlikely(!get_page_type(page, PGT_gdt_page)) )
2729 goto out;
2730 break;
2731 case PGT_ldt_page:
2732 if ( unlikely(!get_page_type(page, PGT_ldt_page)) )
2733 goto out;
2734 break;
2735 default:
2736 if ( unlikely(!get_page_type(page, PGT_writable_page)) )
2737 goto out;
2738 break;
2741 if ( shadow_mode_enabled(dom) )
2743 shadow_lock(dom);
2745 __mark_dirty(dom, mfn);
2747 if ( page_is_page_table(page) && !page_out_of_sync(page) )
2748 shadow_mark_mfn_out_of_sync(current, gmfn, mfn);
2751 /* All is good so make the update. */
2752 gdt_pent = map_domain_page(mfn);
2753 memcpy(&gdt_pent[offset], &d, 8);
2754 unmap_domain_page(gdt_pent);
2756 if ( shadow_mode_enabled(dom) )
2757 shadow_unlock(dom);
2759 put_page_type(page);
2761 ret = 0; /* success */
2763 out:
2764 put_page(page);
2766 UNLOCK_BIGLOCK(dom);
2768 return ret;
2772 long arch_memory_op(int op, void *arg)
2774 struct xen_reserved_phys_area xrpa;
2775 unsigned long pfn;
2776 struct domain *d;
2777 unsigned int i;
2779 switch ( op )
2781 case XENMEM_reserved_phys_area:
2782 if ( copy_from_user(&xrpa, arg, sizeof(xrpa)) )
2783 return -EFAULT;
2785 /* No guest has more than one reserved area. */
2786 if ( xrpa.idx != 0 )
2787 return -ESRCH;
2789 if ( (d = find_domain_by_id(xrpa.domid)) == NULL )
2790 return -ESRCH;
2792 /* Only initialised translated guests have a reserved area. */
2793 if ( !shadow_mode_translate(d) || (d->max_pages == 0) )
2795 put_domain(d);
2796 return -ESRCH;
2799 LOCK_BIGLOCK(d);
2800 if ( d->arch.first_reserved_pfn == 0 )
2802 d->arch.first_reserved_pfn = pfn = d->max_pages;
2803 guest_physmap_add_page(
2804 d, pfn + 0, virt_to_maddr(d->shared_info) >> PAGE_SHIFT);
2805 for ( i = 0; i < NR_GRANT_FRAMES; i++ )
2806 guest_physmap_add_page(
2807 d, pfn + 1 + i, gnttab_shared_mfn(d, d->grant_table, i));
2809 UNLOCK_BIGLOCK(d);
2811 xrpa.first_gpfn = d->arch.first_reserved_pfn;
2812 xrpa.nr_gpfns = 32;
2814 put_domain(d);
2816 if ( copy_to_user(arg, &xrpa, sizeof(xrpa)) )
2817 return -EFAULT;
2819 break;
2821 default:
2822 return subarch_memory_op(op, arg);
2825 return 0;
2829 /*************************
2830 * Writable Pagetables
2831 */
2833 #ifdef VVERBOSE
2834 int ptwr_debug = 0x0;
2835 #define PTWR_PRINTK(_f, _a...) \
2836 do { if ( unlikely(ptwr_debug) ) printk( _f , ## _a ); } while ( 0 )
2837 #define PTWR_PRINT_WHICH (which ? 'I' : 'A')
2838 #else
2839 #define PTWR_PRINTK(_f, _a...) ((void)0)
2840 #endif
2843 #ifdef PERF_ARRAYS
2845 /**************** writeable pagetables profiling functions *****************/
2847 #define ptwr_eip_buckets 256
2849 int ptwr_eip_stat_threshold[] = {1, 10, 50, 100, L1_PAGETABLE_ENTRIES};
2851 #define ptwr_eip_stat_thresholdN (sizeof(ptwr_eip_stat_threshold)/sizeof(int))
2853 struct {
2854 unsigned long eip;
2855 domid_t id;
2856 u32 val[ptwr_eip_stat_thresholdN];
2857 } typedef ptwr_eip_stat_t;
2859 ptwr_eip_stat_t ptwr_eip_stats[ptwr_eip_buckets];
2861 static inline unsigned int ptwr_eip_stat_hash( unsigned long eip, domid_t id )
2863 return (((unsigned long) id) ^ eip ^ (eip>>8) ^ (eip>>16) ^ (eip>>24)) %
2864 ptwr_eip_buckets;
2867 static void ptwr_eip_stat_inc(u32 *n)
2869 int i, j;
2871 if ( ++(*n) != 0 )
2872 return;
2874 *n = ~0;
2876 /* Re-scale all buckets. */
2877 for ( i = 0; i < ptwr_eip_buckets; i++ )
2878 for ( j = 0; j < ptwr_eip_stat_thresholdN; j++ )
2879 ptwr_eip_stats[i].val[j] >>= 1;
2882 static void ptwr_eip_stat_update(unsigned long eip, domid_t id, int modified)
2884 int i, j, b;
2886 i = b = ptwr_eip_stat_hash(eip, id);
2888 do
2890 if ( !ptwr_eip_stats[i].eip )
2892 /* doesn't exist */
2893 ptwr_eip_stats[i].eip = eip;
2894 ptwr_eip_stats[i].id = id;
2895 memset(ptwr_eip_stats[i].val, 0, sizeof(ptwr_eip_stats[i].val));
2898 if ( ptwr_eip_stats[i].eip == eip )
2900 for ( j = 0; j < ptwr_eip_stat_thresholdN; j++ )
2901 if ( modified <= ptwr_eip_stat_threshold[j] )
2902 break;
2903 BUG_ON(j >= ptwr_eip_stat_thresholdN);
2904 ptwr_eip_stat_inc(&ptwr_eip_stats[i].val[j]);
2905 return;
2908 i = (i+1) % ptwr_eip_buckets;
2910 while ( i != b );
2912 printk("ptwr_eip_stat: too many EIPs in use!\n");
2914 ptwr_eip_stat_print();
2915 ptwr_eip_stat_reset();
2918 void ptwr_eip_stat_reset(void)
2920 memset(ptwr_eip_stats, 0, sizeof(ptwr_eip_stats));
2923 void ptwr_eip_stat_print(void)
2925 struct domain *e;
2926 domid_t d;
2927 int i, j;
2929 for_each_domain( e )
2931 d = e->domain_id;
2933 for ( i = 0; i < ptwr_eip_buckets; i++ )
2935 if ( ptwr_eip_stats[i].eip && ptwr_eip_stats[i].id != d )
2936 continue;
2938 printk("D %d eip %08lx ",
2939 ptwr_eip_stats[i].id, ptwr_eip_stats[i].eip);
2941 for ( j = 0; j < ptwr_eip_stat_thresholdN; j++ )
2942 printk("<=%u %4u \t",
2943 ptwr_eip_stat_threshold[j],
2944 ptwr_eip_stats[i].val[j]);
2945 printk("\n");
2950 #else /* PERF_ARRAYS */
2952 #define ptwr_eip_stat_update(eip, id, modified) ((void)0)
2954 #endif
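/*
 * Illustrative sketch of the profiling table above: a fixed-size, linearly
 * probed hash keyed on (eip, domain) -- start at the hash bucket, walk
 * forward until an empty or matching slot turns up, and bump a counter.
 * The demo_* names and table size are assumptions for illustration.
 */
#define DEMO_BUCKETS 8

struct demo_stat {
    unsigned long eip;   /* 0 means "slot unused" */
    unsigned int  hits;
};

static struct demo_stat demo_stats[DEMO_BUCKETS];

static int demo_stat_update(unsigned long eip)
{
    unsigned int i, start = (unsigned int)(eip % DEMO_BUCKETS);

    for ( i = 0; i < DEMO_BUCKETS; i++ )
    {
        struct demo_stat *s = &demo_stats[(start + i) % DEMO_BUCKETS];
        if ( s->eip == 0 )
            s->eip = eip;        /* claim the first empty slot we meet */
        if ( s->eip == eip )
        {
            s->hits++;
            return 0;
        }
    }
    return -1;                   /* table full: caller reports and resets */
}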
2956 /*******************************************************************/
2958 /* Re-validate a given p.t. page, given its prior snapshot */
2959 int revalidate_l1(
2960 struct domain *d, l1_pgentry_t *l1page, l1_pgentry_t *snapshot)
2962 l1_pgentry_t ol1e, nl1e;
2963 int modified = 0, i;
2965 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
2967 ol1e = snapshot[i];
2968 nl1e = l1page[i];
2970 if ( likely(l1e_get_intpte(ol1e) == l1e_get_intpte(nl1e)) )
2971 continue;
2973 /* Update number of entries modified. */
2974 modified++;
2976 /*
2977 * Fast path for PTEs that have merely been write-protected
2978 * (e.g., during a Unix fork()). A strict reduction in privilege.
2979 */
2980 if ( likely(l1e_get_intpte(ol1e) == (l1e_get_intpte(nl1e)|_PAGE_RW)) )
2982 if ( likely(l1e_get_flags(nl1e) & _PAGE_PRESENT) )
2983 put_page_type(mfn_to_page(l1e_get_pfn(nl1e)));
2984 continue;
2987 if ( unlikely(!get_page_from_l1e(nl1e, d)) )
2989 /*
2990 * Make the remaining p.t's consistent before crashing, so the
2991 * reference counts are correct.
2992 */
2993 memcpy(&l1page[i], &snapshot[i],
2994 (L1_PAGETABLE_ENTRIES - i) * sizeof(l1_pgentry_t));
2996 /* Crash the offending domain. */
2997 MEM_LOG("ptwr: Could not revalidate l1 page");
2998 domain_crash(d);
2999 break;
3002 put_page_from_l1e(ol1e, d);
3005 return modified;
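/*
 * Illustrative sketch of the fast path above: an entry whose only change
 * was losing its RW bit equals the new value with RW put back, so no
 * re-validation of the frame or other flags is needed.
 */
#include <stdbool.h>
#include <stdint.h>

#define DEMO_PAGE_RW 0x002ULL   /* x86 _PAGE_RW */

static inline bool demo_only_write_protected(uint64_t old_pte, uint64_t new_pte)
{
    return old_pte == (new_pte | DEMO_PAGE_RW);
}
/*
 * Example: old = 0x12345067 and new = 0x12345065 (RW cleared) -> true;
 * new = 0x99999065 (different frame as well) -> false.
 */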
3009 /* Flush the given writable p.t. page and write-protect it again. */
3010 void ptwr_flush(struct domain *d, const int which)
3012 unsigned long l1va;
3013 l1_pgentry_t *pl1e, pte, *ptep;
3014 l2_pgentry_t *pl2e;
3015 unsigned int modified;
3017 #ifdef CONFIG_X86_64
3018 struct vcpu *v = current;
3019 int user_mode = !(v->arch.flags & TF_kernel_mode);
3020 #endif
3022 ASSERT(!shadow_mode_enabled(d));
3024 if ( unlikely(d->arch.ptwr[which].vcpu != current) )
3025 /* Don't use write_ptbase: it may switch to guest_user on x86/64! */
3026 write_cr3(pagetable_get_paddr(
3027 d->arch.ptwr[which].vcpu->arch.guest_table));
3028 else
3029 TOGGLE_MODE();
3031 l1va = d->arch.ptwr[which].l1va;
3032 ptep = (l1_pgentry_t *)&linear_pg_table[l1_linear_offset(l1va)];
3034 /*
3035 * STEP 1. Write-protect the p.t. page so no more updates can occur.
3036 */
3038 if ( unlikely(__get_user(pte.l1, &ptep->l1)) )
3040 MEM_LOG("ptwr: Could not read pte at %p", ptep);
3041 /*
3042 * Really a bug. We could read this PTE during the initial fault,
3043 * and the pagetables can't have changed in the meantime.
3044 */
3045 BUG();
3047 PTWR_PRINTK("[%c] disconnected_l1va at %p is %"PRIpte"\n",
3048 PTWR_PRINT_WHICH, ptep, l1e_get_intpte(pte));
3049 l1e_remove_flags(pte, _PAGE_RW);
3051 /* Write-protect the p.t. page in the guest page table. */
3052 if ( unlikely(__put_user(pte, ptep)) )
3054 MEM_LOG("ptwr: Could not update pte at %p", ptep);
3055 /*
3056 * Really a bug. We could write this PTE during the initial fault,
3057 * and the pagetables can't have changed in the meantime.
3058 */
3059 BUG();
3062 /* Ensure that there are no stale writable mappings in any TLB. */
3063 /* NB. INVLPG is a serialising instruction: flushes pending updates. */
3064 flush_tlb_one_mask(d->domain_dirty_cpumask, l1va);
3065 PTWR_PRINTK("[%c] disconnected_l1va at %p now %"PRIpte"\n",
3066 PTWR_PRINT_WHICH, ptep, l1e_get_intpte(pte));
3068 /*
3069 * STEP 2. Validate any modified PTEs.
3070 */
3072 if ( likely(d == current->domain) )
3074 pl1e = map_domain_page(l1e_get_pfn(pte));
3075 modified = revalidate_l1(d, pl1e, d->arch.ptwr[which].page);
3076 unmap_domain_page(pl1e);
3077 perfc_incr_histo(wpt_updates, modified, PT_UPDATES);
3078 ptwr_eip_stat_update(d->arch.ptwr[which].eip, d->domain_id, modified);
3079 d->arch.ptwr[which].prev_nr_updates = modified;
3081 else
3083 /*
3084 * Must make a temporary global mapping: we are running in the wrong
3085 * address space, so we have no access to our own mapcache.
3086 */
3087 pl1e = map_domain_page_global(l1e_get_pfn(pte));
3088 modified = revalidate_l1(d, pl1e, d->arch.ptwr[which].page);
3089 unmap_domain_page_global(pl1e);
3092 /*
3093 * STEP 3. Reattach the L1 p.t. page into the current address space.
3094 */
3096 if ( which == PTWR_PT_ACTIVE )
3098 pl2e = &__linear_l2_table[d->arch.ptwr[which].l2_idx];
3099 l2e_add_flags(*pl2e, _PAGE_PRESENT);
3102 /*
3103 * STEP 4. Final tidy-up.
3104 */
3106 d->arch.ptwr[which].l1va = 0;
3108 if ( unlikely(d->arch.ptwr[which].vcpu != current) )
3109 write_ptbase(current);
3110 else
3111 TOGGLE_MODE();
3114 static int ptwr_emulated_update(
3115 unsigned long addr,
3116 paddr_t old,
3117 paddr_t val,
3118 unsigned int bytes,
3119 unsigned int do_cmpxchg)
3121 unsigned long pfn, l1va;
3122 struct page_info *page;
3123 l1_pgentry_t pte, ol1e, nl1e, *pl1e;
3124 struct domain *d = current->domain;
3126 /* Aligned access only, thank you. */
3127 if ( !access_ok(addr, bytes) || ((addr & (bytes-1)) != 0) )
3129 MEM_LOG("ptwr_emulate: Unaligned or bad size ptwr access (%d, %lx)",
3130 bytes, addr);
3131 return X86EMUL_UNHANDLEABLE;
3134 /* Turn a sub-word access into a full-word access. */
3135 if ( bytes != sizeof(paddr_t) )
3137 int rc;
3138 paddr_t full;
3139 unsigned int offset = addr & (sizeof(paddr_t)-1);
3141 /* Align address; read full word. */
3142 addr &= ~(sizeof(paddr_t)-1);
3143 if ( (rc = x86_emulate_read_std(addr, (unsigned long *)&full,
3144 sizeof(paddr_t))) )
3145 return rc;
3146 /* Mask out bits provided by caller. */
3147 full &= ~((((paddr_t)1 << (bytes*8)) - 1) << (offset*8));
3148 /* Shift the caller value and OR in the missing bits. */
3149 val &= (((paddr_t)1 << (bytes*8)) - 1);
3150 val <<= (offset)*8;
3151 val |= full;
3152 /* Also fill in missing parts of the cmpxchg old value. */
3153 old &= (((paddr_t)1 << (bytes*8)) - 1);
3154 old <<= (offset)*8;
3155 old |= full;
3158 /*
3159 * We must not emulate an update to a PTE that is temporarily marked
3160 * writable by the batched ptwr logic, else we can corrupt page refcnts!
3161 */
3162 if ( ((l1va = d->arch.ptwr[PTWR_PT_ACTIVE].l1va) != 0) &&
3163 (l1_linear_offset(l1va) == l1_linear_offset(addr)) )
3164 ptwr_flush(d, PTWR_PT_ACTIVE);
3165 if ( ((l1va = d->arch.ptwr[PTWR_PT_INACTIVE].l1va) != 0) &&
3166 (l1_linear_offset(l1va) == l1_linear_offset(addr)) )
3167 ptwr_flush(d, PTWR_PT_INACTIVE);
3169 /* Read the PTE that maps the page being updated. */
3170 if (__copy_from_user(&pte, &linear_pg_table[l1_linear_offset(addr)],
3171 sizeof(pte)))
3173 MEM_LOG("ptwr_emulate: Cannot read thru linear_pg_table");
3174 return X86EMUL_UNHANDLEABLE;
3177 pfn = l1e_get_pfn(pte);
3178 page = mfn_to_page(pfn);
3180 /* We are looking only for read-only mappings of p.t. pages. */
3181 if ( ((l1e_get_flags(pte) & (_PAGE_RW|_PAGE_PRESENT)) != _PAGE_PRESENT) ||
3182 ((page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table) ||
3183 (page_get_owner(page) != d) )
3185 MEM_LOG("ptwr_emulate: Page is mistyped or bad pte "
3186 "(%lx, %" PRtype_info ")",
3187 l1e_get_pfn(pte), page->u.inuse.type_info);
3188 return X86EMUL_UNHANDLEABLE;
3191 /* Check the new PTE. */
3192 nl1e = l1e_from_intpte(val);
3193 if ( unlikely(!get_page_from_l1e(nl1e, d)) )
3195 MEM_LOG("ptwr_emulate: could not get_page_from_l1e()");
3196 return X86EMUL_UNHANDLEABLE;
3199 /* Checked successfully: do the update (write or cmpxchg). */
3200 pl1e = map_domain_page(page_to_mfn(page));
3201 pl1e = (l1_pgentry_t *)((unsigned long)pl1e + (addr & ~PAGE_MASK));
3202 if ( do_cmpxchg )
3204 ol1e = l1e_from_intpte(old);
3205 if ( cmpxchg((intpte_t *)pl1e, old, val) != old )
3207 unmap_domain_page(pl1e);
3208 put_page_from_l1e(nl1e, d);
3209 return X86EMUL_CMPXCHG_FAILED;
3212 else
3214 ol1e = *pl1e;
3215 *pl1e = nl1e;
3217 unmap_domain_page(pl1e);
3219 /* Finally, drop the old PTE. */
3220 put_page_from_l1e(ol1e, d);
3222 return X86EMUL_CONTINUE;
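/*
 * Illustrative sketch of the sub-word widening performed above: read the
 * current full word, clear the bytes the caller is writing, and OR in the
 * caller's value shifted to its byte offset.  'bytes' is 1, 2 or 4 here;
 * full-word writes skip this path.
 */
#include <stdint.h>

static uint64_t demo_widen_write(uint64_t current_word, uint64_t val,
                                 unsigned int bytes, unsigned int offset)
{
    uint64_t low_mask  = ((uint64_t)1 << (bytes * 8)) - 1;
    uint64_t keep_mask = ~(low_mask << (offset * 8));

    return (current_word & keep_mask) | ((val & low_mask) << (offset * 8));
}
/*
 * Example: current = 0x1111222233334444, writing the 2-byte value 0xabcd
 * at byte offset 2 yields 0x11112222abcd4444.
 */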
3225 static int ptwr_emulated_write(
3226 unsigned long addr,
3227 unsigned long val,
3228 unsigned int bytes)
3230 return ptwr_emulated_update(addr, 0, val, bytes, 0);
3233 static int ptwr_emulated_cmpxchg(
3234 unsigned long addr,
3235 unsigned long old,
3236 unsigned long new,
3237 unsigned int bytes)
3239 return ptwr_emulated_update(addr, old, new, bytes, 1);
3242 static int ptwr_emulated_cmpxchg8b(
3243 unsigned long addr,
3244 unsigned long old,
3245 unsigned long old_hi,
3246 unsigned long new,
3247 unsigned long new_hi)
3249 return ptwr_emulated_update(
3250 addr, ((u64)old_hi << 32) | old, ((u64)new_hi << 32) | new, 8, 1);
3253 static struct x86_mem_emulator ptwr_mem_emulator = {
3254 .read_std = x86_emulate_read_std,
3255 .write_std = x86_emulate_write_std,
3256 .read_emulated = x86_emulate_read_std,
3257 .write_emulated = ptwr_emulated_write,
3258 .cmpxchg_emulated = ptwr_emulated_cmpxchg,
3259 .cmpxchg8b_emulated = ptwr_emulated_cmpxchg8b
3260 };
3262 /* Write page fault handler: check if guest is trying to modify a PTE. */
3263 int ptwr_do_page_fault(struct domain *d, unsigned long addr,
3264 struct cpu_user_regs *regs)
3266 unsigned long pfn;
3267 struct page_info *page;
3268 l1_pgentry_t *pl1e, pte;
3269 l2_pgentry_t *pl2e, l2e;
3270 int which, flags;
3271 unsigned long l2_idx;
3273 if ( unlikely(shadow_mode_enabled(d)) )
3274 return 0;
3276 /*
3277 * Attempt to read the PTE that maps the VA being accessed. By checking for
3278 * PDE validity in the L2 we avoid many expensive fixups in __get_user().
3279 */
3280 if ( !(l2e_get_flags(__linear_l2_table[l2_linear_offset(addr)]) &
3281 _PAGE_PRESENT) ||
3282 __copy_from_user(&pte,&linear_pg_table[l1_linear_offset(addr)],
3283 sizeof(pte)) )
3285 return 0;
3288 pfn = l1e_get_pfn(pte);
3289 page = mfn_to_page(pfn);
3291 #ifdef CONFIG_X86_64
3292 #define WRPT_PTE_FLAGS (_PAGE_RW | _PAGE_PRESENT | _PAGE_USER)
3293 #else
3294 #define WRPT_PTE_FLAGS (_PAGE_RW | _PAGE_PRESENT)
3295 #endif
3297 /*
3298 * Check the required flags for a valid wrpt mapping. If the page is
3299 * already writable then we can return straight to the guest (SMP race).
3300 * We decide whether to propagate the fault by testing the page directories'
3301 * write permissions, which we check by writing back to the linear mapping.
3302 */
3303 if ( (flags = l1e_get_flags(pte) & WRPT_PTE_FLAGS) == WRPT_PTE_FLAGS )
3304 return !__put_user(
3305 pte.l1, &linear_pg_table[l1_linear_offset(addr)].l1);
3307 /* We are looking only for read-only mappings of p.t. pages. */
3308 if ( ((flags | _PAGE_RW) != WRPT_PTE_FLAGS) ||
3309 ((page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table) ||
3310 ((page->u.inuse.type_info & PGT_count_mask) == 0) ||
3311 (page_get_owner(page) != d) )
3313 return 0;
3316 #if 0 /* Leave this in as useful for debugging */
3317 goto emulate;
3318 #endif
3320 PTWR_PRINTK("ptwr_page_fault on l1 pt at va %lx, pfn %lx, eip %lx\n",
3321 addr, pfn, (unsigned long)regs->eip);
3323 /* Get the L2 index at which this L1 p.t. is always mapped. */
3324 l2_idx = page->u.inuse.type_info & PGT_va_mask;
3325 if ( unlikely(l2_idx >= PGT_va_unknown) )
3326 goto emulate; /* Urk! This L1 is mapped in multiple L2 slots! */
3327 l2_idx >>= PGT_va_shift;
3329 if ( unlikely(l2_idx == l2_linear_offset(addr)) )
3330 goto emulate; /* Urk! Pagetable maps itself! */
3332 /*
3333 * Is the L1 p.t. mapped into the current address space? If so we call it
3334 * an ACTIVE p.t., otherwise it is INACTIVE.
3335 */
3336 pl2e = &__linear_l2_table[l2_idx];
3337 which = PTWR_PT_INACTIVE;
3339 if ( (__get_user(l2e.l2, &pl2e->l2) == 0) && (l2e_get_pfn(l2e) == pfn) )
3341 /*
3342 * Check the PRESENT bit to set ACTIVE mode.
3343 * If the PRESENT bit is clear, we may be conflicting with the current
3344 * ACTIVE p.t. (it may be the same p.t. mapped at another virt addr).
3345 * The ptwr_flush call below will restore the PRESENT bit.
3346 */
3347 if ( likely(l2e_get_flags(l2e) & _PAGE_PRESENT) ||
3348 (d->arch.ptwr[PTWR_PT_ACTIVE].l1va &&
3349 (l2_idx == d->arch.ptwr[PTWR_PT_ACTIVE].l2_idx)) )
3350 which = PTWR_PT_ACTIVE;
3353 /*
3354 * If this is a multi-processor guest then ensure that the page is hooked
3355 * into at most one L2 table, which must be the one running on this VCPU.
3356 */
3357 if ( (d->vcpu[0]->next_in_list != NULL) &&
3358 ((page->u.inuse.type_info & PGT_count_mask) !=
3359 (!!(page->u.inuse.type_info & PGT_pinned) +
3360 (which == PTWR_PT_ACTIVE))) )
3362 /* Could be conflicting writable mappings from other VCPUs. */
3363 cleanup_writable_pagetable(d);
3364 goto emulate;
3367 /*
3368 * We only allow one ACTIVE and one INACTIVE p.t. to be updated at a
3369 * time. If there is already one, we must flush it out.
3370 */
3371 if ( d->arch.ptwr[which].l1va )
3372 ptwr_flush(d, which);
3374 /*
3375 * If the last batch made no updates then we are probably stuck. Emulate this
3376 * update to ensure we make progress.
3377 */
3378 if ( d->arch.ptwr[which].prev_nr_updates == 0 )
3380 /* Ensure that we don't get stuck in an emulation-only rut. */
3381 d->arch.ptwr[which].prev_nr_updates = 1;
3382 goto emulate;
3385 PTWR_PRINTK("[%c] batched ptwr_page_fault at va %lx, pt for %08lx, "
3386 "pfn %lx\n", PTWR_PRINT_WHICH, addr,
3387 l2_idx << L2_PAGETABLE_SHIFT, pfn);
3389 d->arch.ptwr[which].l1va = addr | 1;
3390 d->arch.ptwr[which].l2_idx = l2_idx;
3391 d->arch.ptwr[which].vcpu = current;
3393 #ifdef PERF_ARRAYS
3394 d->arch.ptwr[which].eip = regs->eip;
3395 #endif
3397 /* For safety, disconnect the L1 p.t. page from current space. */
3398 if ( which == PTWR_PT_ACTIVE )
3400 l2e_remove_flags(*pl2e, _PAGE_PRESENT);
3401 flush_tlb_mask(d->domain_dirty_cpumask);
3404 /* Temporarily map the L1 page, and make a copy of it. */
3405 pl1e = map_domain_page(pfn);
3406 memcpy(d->arch.ptwr[which].page, pl1e, PAGE_SIZE);
3407 unmap_domain_page(pl1e);
3409 /* Finally, make the p.t. page writable by the guest OS. */
3410 l1e_add_flags(pte, _PAGE_RW);
3411 if ( unlikely(__put_user(pte.l1,
3412 &linear_pg_table[l1_linear_offset(addr)].l1)) )
3414 MEM_LOG("ptwr: Could not update pte at %p", (unsigned long *)
3415 &linear_pg_table[l1_linear_offset(addr)]);
3416 /* Toss the writable pagetable state and crash. */
3417 d->arch.ptwr[which].l1va = 0;
3418 domain_crash(d);
3419 return 0;
3422 return EXCRET_fault_fixed;
3424 emulate:
3425 if ( x86_emulate_memop(guest_cpu_user_regs(), addr,
3426 &ptwr_mem_emulator, X86EMUL_MODE_HOST) )
3427 return 0;
3428 perfc_incrc(ptwr_emulations);
3429 return EXCRET_fault_fixed;
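/*
 * Illustrative sketch of the backpointer lookup used above: type_info
 * packs a "which L2 slot maps this L1" field, extracted with a mask and
 * shift, with a sentinel meaning "mapped in more than one slot".  The
 * DEMO_* field layout is an assumption, not the real PGT_va_* encoding.
 */
#define DEMO_VA_SHIFT   20                          /* assumed position */
#define DEMO_VA_MASK    (0x3ffUL << DEMO_VA_SHIFT)
#define DEMO_VA_UNKNOWN DEMO_VA_MASK                /* sentinel value   */

/* Returns the L2 slot index, or -1 if the backpointer is ambiguous. */
static int demo_l2_slot(unsigned long type_info)
{
    unsigned long field = type_info & DEMO_VA_MASK;
    if ( field >= DEMO_VA_UNKNOWN )
        return -1;
    return (int)(field >> DEMO_VA_SHIFT);
}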
3432 int ptwr_init(struct domain *d)
3434 void *x = alloc_xenheap_page();
3435 void *y = alloc_xenheap_page();
3437 if ( (x == NULL) || (y == NULL) )
3439 free_xenheap_page(x);
3440 free_xenheap_page(y);
3441 return -ENOMEM;
3444 d->arch.ptwr[PTWR_PT_ACTIVE].page = x;
3445 d->arch.ptwr[PTWR_PT_INACTIVE].page = y;
3447 return 0;
3450 void ptwr_destroy(struct domain *d)
3452 LOCK_BIGLOCK(d);
3453 cleanup_writable_pagetable(d);
3454 UNLOCK_BIGLOCK(d);
3455 free_xenheap_page(d->arch.ptwr[PTWR_PT_ACTIVE].page);
3456 free_xenheap_page(d->arch.ptwr[PTWR_PT_INACTIVE].page);
3459 void cleanup_writable_pagetable(struct domain *d)
3461 if ( unlikely(!VM_ASSIST(d, VMASST_TYPE_writable_pagetables)) )
3462 return;
3464 if ( unlikely(shadow_mode_enabled(d)) )
3466 shadow_sync_all(d);
3468 else
3470 if ( d->arch.ptwr[PTWR_PT_ACTIVE].l1va )
3471 ptwr_flush(d, PTWR_PT_ACTIVE);
3472 if ( d->arch.ptwr[PTWR_PT_INACTIVE].l1va )
3473 ptwr_flush(d, PTWR_PT_INACTIVE);
3477 int map_pages_to_xen(
3478 unsigned long virt,
3479 unsigned long mfn,
3480 unsigned long nr_mfns,
3481 unsigned long flags)
3483 l2_pgentry_t *pl2e, ol2e;
3484 l1_pgentry_t *pl1e, ol1e;
3485 unsigned int i;
3487 unsigned int map_small_pages = !!(flags & MAP_SMALL_PAGES);
3488 flags &= ~MAP_SMALL_PAGES;
3490 while ( nr_mfns != 0 )
3492 pl2e = virt_to_xen_l2e(virt);
3494 if ( ((((virt>>PAGE_SHIFT) | mfn) & ((1<<PAGETABLE_ORDER)-1)) == 0) &&
3495 (nr_mfns >= (1<<PAGETABLE_ORDER)) &&
3496 !map_small_pages )
3498 /* Super-page mapping. */
3499 ol2e = *pl2e;
3500 *pl2e = l2e_from_pfn(mfn, flags|_PAGE_PSE);
3502 if ( (l2e_get_flags(ol2e) & _PAGE_PRESENT) )
3504 local_flush_tlb_pge();
3505 if ( !(l2e_get_flags(ol2e) & _PAGE_PSE) )
3506 free_xen_pagetable(l2e_get_page(*pl2e));
3509 virt += 1UL << L2_PAGETABLE_SHIFT;
3510 mfn += 1UL << PAGETABLE_ORDER;
3511 nr_mfns -= 1UL << PAGETABLE_ORDER;
3513 else
3515 /* Normal page mapping. */
3516 if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) )
3518 pl1e = page_to_virt(alloc_xen_pagetable());
3519 clear_page(pl1e);
3520 *pl2e = l2e_from_page(virt_to_page(pl1e), __PAGE_HYPERVISOR);
3522 else if ( l2e_get_flags(*pl2e) & _PAGE_PSE )
3524 pl1e = page_to_virt(alloc_xen_pagetable());
3525 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
3526 pl1e[i] = l1e_from_pfn(
3527 l2e_get_pfn(*pl2e) + i,
3528 l2e_get_flags(*pl2e) & ~_PAGE_PSE);
3529 *pl2e = l2e_from_page(virt_to_page(pl1e), __PAGE_HYPERVISOR);
3530 local_flush_tlb_pge();
3533 pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(virt);
3534 ol1e = *pl1e;
3535 *pl1e = l1e_from_pfn(mfn, flags);
3536 if ( (l1e_get_flags(ol1e) & _PAGE_PRESENT) )
3537 local_flush_tlb_one(virt);
3539 virt += 1UL << L1_PAGETABLE_SHIFT;
3540 mfn += 1UL;
3541 nr_mfns -= 1UL;
3545 return 0;
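/*
 * Illustrative sketch of the super-page test above: both the virtual page
 * number and the mfn must be aligned to the super-page size, and at least
 * a full super-page of frames must remain, so one L2 entry can cover the
 * whole run.  A 9-bit order (512 x 4kB = 2MB, as with PAE/x86-64) is
 * assumed here; non-PAE x86-32 uses a 10-bit order.
 */
#include <stdbool.h>

#define DEMO_PAGE_SHIFT      12
#define DEMO_PAGETABLE_ORDER 9

static inline bool demo_can_use_superpage(unsigned long virt,
                                          unsigned long mfn,
                                          unsigned long nr_mfns)
{
    unsigned long align = (1UL << DEMO_PAGETABLE_ORDER) - 1;
    return ( (((virt >> DEMO_PAGE_SHIFT) | mfn) & align) == 0 ) &&
           ( nr_mfns >= (1UL << DEMO_PAGETABLE_ORDER) );
}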
3548 void __set_fixmap(
3549 enum fixed_addresses idx, unsigned long p, unsigned long flags)
3551 if ( unlikely(idx >= __end_of_fixed_addresses) )
3552 BUG();
3553 map_pages_to_xen(fix_to_virt(idx), p >> PAGE_SHIFT, 1, flags);
3556 #ifdef MEMORY_GUARD
3558 void memguard_init(void)
3560 map_pages_to_xen(
3561 PAGE_OFFSET, 0, xenheap_phys_end >> PAGE_SHIFT,
3562 __PAGE_HYPERVISOR|MAP_SMALL_PAGES);
3565 static void __memguard_change_range(void *p, unsigned long l, int guard)
3567 unsigned long _p = (unsigned long)p;
3568 unsigned long _l = (unsigned long)l;
3569 unsigned long flags = __PAGE_HYPERVISOR | MAP_SMALL_PAGES;
3571 /* Ensure we are dealing with a page-aligned whole number of pages. */
3572 ASSERT((_p&PAGE_MASK) != 0);
3573 ASSERT((_l&PAGE_MASK) != 0);
3574 ASSERT((_p&~PAGE_MASK) == 0);
3575 ASSERT((_l&~PAGE_MASK) == 0);
3577 if ( guard )
3578 flags &= ~_PAGE_PRESENT;
3580 map_pages_to_xen(
3581 _p, virt_to_maddr(p) >> PAGE_SHIFT, _l >> PAGE_SHIFT, flags);
3584 void memguard_guard_range(void *p, unsigned long l)
3586 __memguard_change_range(p, l, 1);
3589 void memguard_unguard_range(void *p, unsigned long l)
3591 __memguard_change_range(p, l, 0);
3594 #endif
3596 /*
3597 * Local variables:
3598 * mode: C
3599 * c-set-style: "BSD"
3600 * c-basic-offset: 4
3601 * tab-width: 4
3602 * indent-tabs-mode: nil
3603 * End:
3604 */