direct-io.hg

view xen/arch/x86/mm.c @ 8686:c0a0f4db5ab1

Create a block of reserved PFNs in shadow translate mode guests, and
move the shared info and grant table pfns into that block. This
allows us to remove the get_gnttablist dom0 op, and simplifies the
domain creation code slightly. Having the reserved block managed by
Xen may also make it slightly easier to handle the case where the
grant table needs to be extended at run time.
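As an illustration (a hypothetical guest-side sketch, not part of this changeset: the HYPERVISOR_mmuext_op wrapper and the absence of error handling are assumptions), a translate-mode guest could locate the reserved block with the two new commands:

    struct mmuext_op op;
    long base, size;

    op.cmd = MMUEXT_PFN_HOLE_BASE;              /* first reserved PFN   */
    base = HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF);

    op.cmd = MMUEXT_PFN_HOLE_SIZE;              /* number of PFNs in it */
    size = HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF);

The shared info page sits at the start of the hole and the grant table frames follow it, as set up in do_mmuext_op() below.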

Suggested-by: kaf24
Signed-off-by: Steven Smith, sos22@cam.ac.uk
author sos22@douglas.cl.cam.ac.uk
date Thu Jan 26 19:40:13 2006 +0100 (2006-01-26)
parents edf1fab86618
children 990c009015e8
line source
1 /******************************************************************************
2 * arch/x86/mm.c
3 *
4 * Copyright (c) 2002-2005 K A Fraser
5 * Copyright (c) 2004 Christian Limpach
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
22 /*
23 * A description of the x86 page table API:
24 *
25 * Domains trap to do_mmu_update with a list of update requests.
26 * This is a list of (ptr, val) pairs, where the requested operation
27 * is *ptr = val.
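*
* For example (an illustrative sketch only; the guest-side
* HYPERVISOR_mmu_update wrapper and the mmu_update_t layout are taken
* from the public interface headers, not defined in this file), a guest
* batches two PTE writes as:
*
*   mmu_update_t u[2];
*   u[0].ptr = machine_addr_of_pte0; u[0].val = new_pte0;
*   u[1].ptr = machine_addr_of_pte1; u[1].val = new_pte1;
*   (void)HYPERVISOR_mmu_update(u, 2, NULL, DOMID_SELF);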
28 *
29 * Reference counting of pages:
30 * ----------------------------
31 * Each page has two refcounts: tot_count and type_count.
32 *
33 * TOT_COUNT is the obvious reference count. It counts all uses of a
34 * physical page frame by a domain, including uses as a page directory,
35 * a page table, or simple mappings via a PTE. This count prevents a
36 * domain from releasing a frame back to the free pool when it still holds
37 * a reference to it.
38 *
39 * TYPE_COUNT is more subtle. A frame can be put to one of three
40 * mutually-exclusive uses: it might be used as a page directory, or a
41 * page table, or it may be mapped writable by the domain [of course, a
42 * frame may also be used in none of these three ways].
43 * So, type_count is a count of the number of times a frame is being
44 * referred to in its current incarnation. Therefore, a page can only
45 * change its type when its type count is zero.
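*
* A minimal sketch of how the two counts are usually taken and dropped
* together inside Xen (using helpers defined later in this file):
*
*   if ( get_page_and_type(page, d, PGT_writable_page) )
*   {
*       ... the frame stays allocated and writable-typed here ...
*       put_page_and_type(page);
*   }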
46 *
47 * Pinning the page type:
48 * ----------------------
49 * The type of a page can be pinned/unpinned with the commands
50 * MMUEXT_[UN]PIN_L?_TABLE. Each page can be pinned exactly once (that is,
51 * pinning is not reference counted, so it can't be nested).
52 * This is useful to prevent a page's type count falling to zero, at which
53 * point safety checks would need to be carried out next time the count
54 * is increased again.
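*
* Guest-side sketch (illustrative only; the HYPERVISOR_mmuext_op
* wrapper name is an assumption): pinning a freshly built L1 table
* looks roughly like
*
*   struct mmuext_op op;
*   op.cmd      = MMUEXT_PIN_L1_TABLE;
*   op.arg1.mfn = mfn_of_new_l1;
*   (void)HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF);
*
* and MMUEXT_UNPIN_TABLE drops the pin again.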
55 *
56 * A further note on writable page mappings:
57 * -----------------------------------------
58 * For simplicity, the count of writable mappings for a page may not
59 * correspond to reality. The 'writable count' is incremented for every
60 * PTE which maps the page with the _PAGE_RW flag set. However, for
61 * write access to be possible the page directory entry must also have
62 * its _PAGE_RW bit set. We do not check this as it complicates the
63 * reference counting considerably [consider the case of multiple
64 * directory entries referencing a single page table, some with the RW
65 * bit set, others not -- it starts getting a bit messy].
66 * In normal use, this simplification shouldn't be a problem.
67 * However, the logic can be added if required.
68 *
69 * One more note on read-only page mappings:
70 * -----------------------------------------
71 * We want domains to be able to map pages for read-only access. The
72 * main reason is that page tables and directories should be readable
73 * by a domain, but it would not be safe for them to be writable.
74 * However, domains have free access to rings 1 & 2 of the Intel
75 * privilege model. In terms of page protection, these are considered
76 * to be part of 'supervisor mode'. The WP bit in CR0 controls whether
77 * read-only restrictions are respected in supervisor mode -- if the
78 * bit is clear then any mapped page is writable.
79 *
80 * We get round this by always setting the WP bit and disallowing
81 * updates to it. This is very unlikely to cause a problem for guest
82 * OSes, which will generally use the WP bit to simplify copy-on-write
83 * implementation (in that case, the OS wants a fault when it writes to
84 * an application-supplied buffer).
85 */
87 #include <xen/config.h>
88 #include <xen/init.h>
89 #include <xen/kernel.h>
90 #include <xen/lib.h>
91 #include <xen/mm.h>
92 #include <xen/sched.h>
93 #include <xen/errno.h>
94 #include <xen/perfc.h>
95 #include <xen/irq.h>
96 #include <xen/softirq.h>
97 #include <xen/domain_page.h>
98 #include <xen/event.h>
99 #include <xen/iocap.h>
100 #include <asm/shadow.h>
101 #include <asm/page.h>
102 #include <asm/flushtlb.h>
103 #include <asm/io.h>
104 #include <asm/uaccess.h>
105 #include <asm/ldt.h>
106 #include <asm/x86_emulate.h>
108 #ifdef VERBOSE
109 #define MEM_LOG(_f, _a...) \
110 printk("DOM%u: (file=mm.c, line=%d) " _f "\n", \
111 current->domain->domain_id , __LINE__ , ## _a )
112 #else
113 #define MEM_LOG(_f, _a...) ((void)0)
114 #endif
116 /*
117 * Both do_mmuext_op() and do_mmu_update():
118 * We steal the m.s.b. of the @count parameter to indicate whether this
119 * invocation of do_mmu_update() is resuming a previously preempted call.
120 */
121 #define MMU_UPDATE_PREEMPTED (~(~0U>>1))
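/*
 * For example, if a call is preempted after i of count updates, the
 * continuation is created with a count argument of
 *     (count - i) | MMU_UPDATE_PREEMPTED
 * (see the hypercall4_create_continuation() calls below); on re-entry
 * the flag is stripped and @pdone is re-read.
 */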
123 static void free_l2_table(struct pfn_info *page);
124 static void free_l1_table(struct pfn_info *page);
126 static int mod_l2_entry(l2_pgentry_t *, l2_pgentry_t, unsigned long,
127 unsigned long type);
128 static int mod_l1_entry(l1_pgentry_t *, l1_pgentry_t);
130 /* Used to defer flushing of memory structures. */
131 static struct {
132 #define DOP_FLUSH_TLB (1<<0) /* Flush the local TLB. */
133 #define DOP_FLUSH_ALL_TLBS (1<<1) /* Flush TLBs of all VCPUs of current dom. */
134 #define DOP_RELOAD_LDT (1<<2) /* Reload the LDT shadow mapping. */
135 unsigned int deferred_ops;
136 /* If non-NULL, specifies a foreign subject domain for some operations. */
137 struct domain *foreign;
138 } __cacheline_aligned percpu_info[NR_CPUS];
140 /*
141 * Returns the current foreign domain; defaults to the currently-executing
142 * domain if a foreign override hasn't been specified.
143 */
144 #define FOREIGNDOM (percpu_info[smp_processor_id()].foreign ?: current->domain)
146 /* Private domain structs for DOMID_XEN and DOMID_IO. */
147 static struct domain *dom_xen, *dom_io;
149 /* Frame table and its size in pages. */
150 struct pfn_info *frame_table;
151 unsigned long max_page;
152 unsigned long total_pages;
154 void __init init_frametable(void)
155 {
156 unsigned long nr_pages, page_step, i, pfn;
158 frame_table = (struct pfn_info *)FRAMETABLE_VIRT_START;
160 nr_pages = PFN_UP(max_page * sizeof(*frame_table));
161 page_step = (1 << L2_PAGETABLE_SHIFT) >> PAGE_SHIFT;
163 for ( i = 0; i < nr_pages; i += page_step )
164 {
165 pfn = alloc_boot_pages(min(nr_pages - i, page_step), page_step);
166 if ( pfn == 0 )
167 panic("Not enough memory for frame table\n");
168 map_pages_to_xen(
169 FRAMETABLE_VIRT_START + (i << PAGE_SHIFT),
170 pfn, page_step, PAGE_HYPERVISOR);
171 }
173 memset(frame_table, 0, nr_pages << PAGE_SHIFT);
174 }
176 void arch_init_memory(void)
177 {
178 extern void subarch_init_memory(struct domain *);
180 unsigned long i, pfn, rstart_pfn, rend_pfn;
181 struct pfn_info *page;
183 memset(percpu_info, 0, sizeof(percpu_info));
185 /*
186 * Initialise our DOMID_XEN domain.
187 * Any Xen-heap pages that we will allow to be mapped will have
188 * their domain field set to dom_xen.
189 */
190 dom_xen = alloc_domain();
191 atomic_set(&dom_xen->refcnt, 1);
192 dom_xen->domain_id = DOMID_XEN;
194 /*
195 * Initialise our DOMID_IO domain.
196 * This domain owns I/O pages that are within the range of the pfn_info
197 * array. Mappings occur at the privilege level of the caller.
198 */
199 dom_io = alloc_domain();
200 atomic_set(&dom_io->refcnt, 1);
201 dom_io->domain_id = DOMID_IO;
203 /* First 1MB of RAM is historically marked as I/O. */
204 for ( i = 0; i < 0x100; i++ )
205 {
206 page = pfn_to_page(i);
207 page->count_info = PGC_allocated | 1;
208 page->u.inuse.type_info = PGT_writable_page | PGT_validated | 1;
209 page_set_owner(page, dom_io);
210 }
212 /* Any areas not specified as RAM by the e820 map are considered I/O. */
213 for ( i = 0, pfn = 0; i < e820.nr_map; i++ )
214 {
215 if ( e820.map[i].type != E820_RAM )
216 continue;
217 /* Every page from cursor to start of next RAM region is I/O. */
218 rstart_pfn = PFN_UP(e820.map[i].addr);
219 rend_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
220 for ( ; pfn < rstart_pfn; pfn++ )
221 {
222 BUG_ON(!pfn_valid(pfn));
223 page = pfn_to_page(pfn);
224 page->count_info = PGC_allocated | 1;
225 page->u.inuse.type_info = PGT_writable_page | PGT_validated | 1;
226 page_set_owner(page, dom_io);
227 }
228 /* Skip the RAM region. */
229 pfn = rend_pfn;
230 }
231 BUG_ON(pfn != max_page);
233 subarch_init_memory(dom_xen);
234 }
236 void write_ptbase(struct vcpu *v)
237 {
238 write_cr3(pagetable_get_paddr(v->arch.monitor_table));
239 }
241 void invalidate_shadow_ldt(struct vcpu *v)
242 {
243 int i;
244 unsigned long pfn;
245 struct pfn_info *page;
247 if ( v->arch.shadow_ldt_mapcnt == 0 )
248 return;
250 v->arch.shadow_ldt_mapcnt = 0;
252 for ( i = 16; i < 32; i++ )
253 {
254 pfn = l1e_get_pfn(v->arch.perdomain_ptes[i]);
255 if ( pfn == 0 ) continue;
256 v->arch.perdomain_ptes[i] = l1e_empty();
257 page = pfn_to_page(pfn);
258 ASSERT_PAGE_IS_TYPE(page, PGT_ldt_page);
259 ASSERT_PAGE_IS_DOMAIN(page, v->domain);
260 put_page_and_type(page);
261 }
263 /* Dispose of the (now possibly invalid) mappings from the TLB. */
264 percpu_info[v->processor].deferred_ops |= DOP_FLUSH_TLB | DOP_RELOAD_LDT;
265 }
268 static int alloc_segdesc_page(struct pfn_info *page)
269 {
270 struct desc_struct *descs;
271 int i;
273 descs = map_domain_page(page_to_pfn(page));
275 for ( i = 0; i < 512; i++ )
276 if ( unlikely(!check_descriptor(&descs[i])) )
277 goto fail;
279 unmap_domain_page(descs);
280 return 1;
282 fail:
283 unmap_domain_page(descs);
284 return 0;
285 }
288 /* Map shadow page at offset @off. */
289 int map_ldt_shadow_page(unsigned int off)
290 {
291 struct vcpu *v = current;
292 struct domain *d = v->domain;
293 unsigned long gpfn, gmfn;
294 l1_pgentry_t l1e, nl1e;
295 unsigned long gva = v->arch.guest_context.ldt_base + (off << PAGE_SHIFT);
296 int res;
298 #if defined(__x86_64__)
299 /* If in user mode, switch to kernel mode just to read LDT mapping. */
300 int user_mode = !(v->arch.flags & TF_kernel_mode);
301 #define TOGGLE_MODE() if ( user_mode ) toggle_guest_mode(v)
302 #elif defined(__i386__)
303 #define TOGGLE_MODE() ((void)0)
304 #endif
306 BUG_ON(unlikely(in_irq()));
308 shadow_sync_va(v, gva);
310 TOGGLE_MODE();
311 __copy_from_user(&l1e, &linear_pg_table[l1_linear_offset(gva)],
312 sizeof(l1e));
313 TOGGLE_MODE();
315 if ( unlikely(!(l1e_get_flags(l1e) & _PAGE_PRESENT)) )
316 return 0;
318 gpfn = l1e_get_pfn(l1e);
319 gmfn = __gpfn_to_mfn(d, gpfn);
320 if ( unlikely(!VALID_MFN(gmfn)) )
321 return 0;
323 res = get_page_and_type(pfn_to_page(gmfn), d, PGT_ldt_page);
325 if ( !res && unlikely(shadow_mode_refcounts(d)) )
326 {
327 shadow_lock(d);
328 shadow_remove_all_write_access(d, gpfn, gmfn);
329 res = get_page_and_type(pfn_to_page(gmfn), d, PGT_ldt_page);
330 shadow_unlock(d);
331 }
333 if ( unlikely(!res) )
334 return 0;
336 nl1e = l1e_from_pfn(gmfn, l1e_get_flags(l1e) | _PAGE_RW);
338 v->arch.perdomain_ptes[off + 16] = nl1e;
339 v->arch.shadow_ldt_mapcnt++;
341 return 1;
342 }
345 static int get_page_from_pagenr(unsigned long page_nr, struct domain *d)
346 {
347 struct pfn_info *page = pfn_to_page(page_nr);
349 if ( unlikely(!pfn_valid(page_nr)) || unlikely(!get_page(page, d)) )
350 {
351 MEM_LOG("Could not get page ref for pfn %lx", page_nr);
352 return 0;
353 }
355 return 1;
356 }
359 static int get_page_and_type_from_pagenr(unsigned long page_nr,
360 unsigned long type,
361 struct domain *d)
362 {
363 struct pfn_info *page = pfn_to_page(page_nr);
365 if ( unlikely(!get_page_from_pagenr(page_nr, d)) )
366 return 0;
368 if ( unlikely(!get_page_type(page, type)) )
369 {
370 put_page(page);
371 return 0;
372 }
374 return 1;
375 }
377 /*
378 * We allow root tables to map each other (a.k.a. linear page tables). It
379 * needs some special care with reference counts and access permissions:
380 * 1. The mapping entry must be read-only, or the guest may get write access
381 * to its own PTEs.
382 * 2. We must only bump the reference counts for an *already validated*
383 * L2 table, or we can end up in a deadlock in get_page_type() by waiting
384 * on a validation that cannot complete until this very validation finishes.
385 * 3. We only need to increment the reference counts for the mapped page
386 * frame if it is mapped by a different root table. This is sufficient and
387 * also necessary to allow validation of a root table mapping itself.
388 */
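/*
 * Illustrative example (the slot index is the guest's own choice; this
 * is not an additional rule): a guest creates a linear mapping by
 * issuing a normal mmu_update against one slot of its root table with
 *     ptr = machine address of that root-table slot
 *     val = (root_mfn << PAGE_SHIFT) | _PAGE_PRESENT   (no _PAGE_RW)
 * Rule 1 above is what rejects the same update if _PAGE_RW is set.
 */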
389 static int
390 get_linear_pagetable(
391 root_pgentry_t re, unsigned long re_pfn, struct domain *d)
392 {
393 unsigned long x, y;
394 struct pfn_info *page;
395 unsigned long pfn;
397 ASSERT( !shadow_mode_refcounts(d) );
399 if ( (root_get_flags(re) & _PAGE_RW) )
400 {
401 MEM_LOG("Attempt to create linear p.t. with write perms");
402 return 0;
403 }
405 if ( (pfn = root_get_pfn(re)) != re_pfn )
406 {
407 /* Make sure the mapped frame belongs to the correct domain. */
408 if ( unlikely(!get_page_from_pagenr(pfn, d)) )
409 return 0;
411 /*
412 * Make sure that the mapped frame is an already-validated L2 table.
413 * If so, atomically increment the count (checking for overflow).
414 */
415 page = pfn_to_page(pfn);
416 y = page->u.inuse.type_info;
417 do {
418 x = y;
419 if ( unlikely((x & PGT_count_mask) == PGT_count_mask) ||
420 unlikely((x & (PGT_type_mask|PGT_validated)) !=
421 (PGT_root_page_table|PGT_validated)) )
422 {
423 put_page(page);
424 return 0;
425 }
426 }
427 while ( (y = cmpxchg(&page->u.inuse.type_info, x, x + 1)) != x );
428 }
430 return 1;
431 }
433 int
434 get_page_from_l1e(
435 l1_pgentry_t l1e, struct domain *d)
436 {
437 unsigned long mfn = l1e_get_pfn(l1e);
438 struct pfn_info *page = pfn_to_page(mfn);
439 int okay;
441 if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) )
442 return 1;
444 if ( unlikely(l1e_get_flags(l1e) & L1_DISALLOW_MASK) )
445 {
446 MEM_LOG("Bad L1 flags %x", l1e_get_flags(l1e) & L1_DISALLOW_MASK);
447 return 0;
448 }
450 if ( unlikely(!pfn_valid(mfn)) ||
451 unlikely(page_get_owner(page) == dom_io) )
452 {
453 /* DOMID_IO reverts to caller for privilege checks. */
454 if ( d == dom_io )
455 d = current->domain;
457 if ( !iomem_access_permitted(d, mfn, mfn) )
458 {
459 MEM_LOG("Non-privileged attempt to map I/O space %08lx", mfn);
460 return 0;
461 }
463 /* No reference counting for out-of-range I/O pages. */
464 if ( !pfn_valid(mfn) )
465 return 1;
467 d = dom_io;
468 }
470 okay = ((l1e_get_flags(l1e) & _PAGE_RW) ?
471 get_page_and_type(page, d, PGT_writable_page) :
472 get_page(page, d));
473 if ( !okay )
474 {
475 MEM_LOG("Error getting mfn %lx (pfn %lx) from L1 entry %" PRIpte
476 " for dom%d",
477 mfn, get_pfn_from_mfn(mfn), l1e_get_intpte(l1e), d->domain_id);
478 }
480 return okay;
481 }
484 /* NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. */
485 static int
486 get_page_from_l2e(
487 l2_pgentry_t l2e, unsigned long pfn,
488 struct domain *d, unsigned long vaddr)
489 {
490 int rc;
492 ASSERT(!shadow_mode_refcounts(d));
494 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
495 return 1;
497 if ( unlikely((l2e_get_flags(l2e) & L2_DISALLOW_MASK)) )
498 {
499 MEM_LOG("Bad L2 flags %x", l2e_get_flags(l2e) & L2_DISALLOW_MASK);
500 return 0;
501 }
503 vaddr >>= L2_PAGETABLE_SHIFT;
504 vaddr <<= PGT_va_shift;
505 rc = get_page_and_type_from_pagenr(
506 l2e_get_pfn(l2e), PGT_l1_page_table | vaddr, d);
508 #if CONFIG_PAGING_LEVELS == 2
509 if ( unlikely(!rc) )
510 rc = get_linear_pagetable(l2e, pfn, d);
511 #endif
512 return rc;
513 }
516 #if CONFIG_PAGING_LEVELS >= 3
518 static int
519 get_page_from_l3e(
520 l3_pgentry_t l3e, unsigned long pfn,
521 struct domain *d, unsigned long vaddr)
522 {
523 int rc;
525 ASSERT(!shadow_mode_refcounts(d));
527 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
528 return 1;
530 if ( unlikely((l3e_get_flags(l3e) & L3_DISALLOW_MASK)) )
531 {
532 MEM_LOG("Bad L3 flags %x", l3e_get_flags(l3e) & L3_DISALLOW_MASK);
533 return 0;
534 }
536 vaddr >>= L3_PAGETABLE_SHIFT;
537 vaddr <<= PGT_va_shift;
538 rc = get_page_and_type_from_pagenr(
539 l3e_get_pfn(l3e),
540 PGT_l2_page_table | vaddr, d);
541 #if CONFIG_PAGING_LEVELS == 3
542 if ( unlikely(!rc) )
543 rc = get_linear_pagetable(l3e, pfn, d);
544 #endif
545 return rc;
546 }
548 #endif /* 3 level */
550 #if CONFIG_PAGING_LEVELS >= 4
552 static int
553 get_page_from_l4e(
554 l4_pgentry_t l4e, unsigned long pfn,
555 struct domain *d, unsigned long vaddr)
556 {
557 int rc;
559 ASSERT( !shadow_mode_refcounts(d) );
561 if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
562 return 1;
564 if ( unlikely((l4e_get_flags(l4e) & L4_DISALLOW_MASK)) )
565 {
566 MEM_LOG("Bad L4 flags %x", l4e_get_flags(l4e) & L4_DISALLOW_MASK);
567 return 0;
568 }
570 vaddr >>= L4_PAGETABLE_SHIFT;
571 vaddr <<= PGT_va_shift;
572 rc = get_page_and_type_from_pagenr(
573 l4e_get_pfn(l4e),
574 PGT_l3_page_table | vaddr, d);
576 if ( unlikely(!rc) )
577 rc = get_linear_pagetable(l4e, pfn, d);
579 return rc;
580 }
582 #endif /* 4 level */
585 void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d)
586 {
587 unsigned long pfn = l1e_get_pfn(l1e);
588 struct pfn_info *page = pfn_to_page(pfn);
589 struct domain *e;
590 struct vcpu *v;
592 if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) || !pfn_valid(pfn) )
593 return;
595 e = page_get_owner(page);
597 /*
598 * Check if this is a mapping that was established via a grant reference.
599 * If it was then we should not be here: we require that such mappings are
600 * explicitly destroyed via the grant-table interface.
601 *
602 * The upshot of this is that the guest can end up with active grants that
603 * it cannot destroy (because it no longer has a PTE to present to the
604 * grant-table interface). This can lead to subtle hard-to-catch bugs,
605 * hence a special grant PTE flag can be enabled to catch the bug early.
606 *
607 * (Note that the undestroyable active grants are not a security hole in
608 * Xen. All active grants can safely be cleaned up when the domain dies.)
609 */
610 if ( (l1e_get_flags(l1e) & _PAGE_GNTTAB) &&
611 !(d->domain_flags & (DOMF_shutdown|DOMF_dying)) )
612 {
613 MEM_LOG("Attempt to implicitly unmap a granted PTE %" PRIpte,
614 l1e_get_intpte(l1e));
615 domain_crash(d);
616 }
618 if ( l1e_get_flags(l1e) & _PAGE_RW )
619 {
620 put_page_and_type(page);
621 }
622 else
623 {
624 /* We expect this is rare so we blow the entire shadow LDT. */
625 if ( unlikely(((page->u.inuse.type_info & PGT_type_mask) ==
626 PGT_ldt_page)) &&
627 unlikely(((page->u.inuse.type_info & PGT_count_mask) != 0)) &&
628 (d == e) )
629 {
630 for_each_vcpu ( d, v )
631 invalidate_shadow_ldt(v);
632 }
633 put_page(page);
634 }
635 }
638 /*
639 * NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'.
640 * Note also that this automatically deals correctly with linear p.t.'s.
641 */
642 static void put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
643 {
644 if ( (l2e_get_flags(l2e) & _PAGE_PRESENT) &&
645 (l2e_get_pfn(l2e) != pfn) )
646 put_page_and_type(pfn_to_page(l2e_get_pfn(l2e)));
647 }
650 #if CONFIG_PAGING_LEVELS >= 3
652 static void put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn)
653 {
654 if ( (l3e_get_flags(l3e) & _PAGE_PRESENT) &&
655 (l3e_get_pfn(l3e) != pfn) )
656 put_page_and_type(pfn_to_page(l3e_get_pfn(l3e)));
657 }
659 #endif
661 #if CONFIG_PAGING_LEVELS >= 4
663 static void put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn)
664 {
665 if ( (l4e_get_flags(l4e) & _PAGE_PRESENT) &&
666 (l4e_get_pfn(l4e) != pfn) )
667 put_page_and_type(pfn_to_page(l4e_get_pfn(l4e)));
668 }
670 #endif
673 static int alloc_l1_table(struct pfn_info *page)
674 {
675 struct domain *d = page_get_owner(page);
676 unsigned long pfn = page_to_pfn(page);
677 l1_pgentry_t *pl1e;
678 int i;
680 ASSERT(!shadow_mode_refcounts(d));
682 pl1e = map_domain_page(pfn);
684 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
685 if ( is_guest_l1_slot(i) &&
686 unlikely(!get_page_from_l1e(pl1e[i], d)) )
687 goto fail;
689 unmap_domain_page(pl1e);
690 return 1;
692 fail:
693 MEM_LOG("Failure in alloc_l1_table: entry %d", i);
694 while ( i-- > 0 )
695 if ( is_guest_l1_slot(i) )
696 put_page_from_l1e(pl1e[i], d);
698 unmap_domain_page(pl1e);
699 return 0;
700 }
702 #ifdef CONFIG_X86_PAE
703 static int create_pae_xen_mappings(l3_pgentry_t *pl3e)
704 {
705 struct pfn_info *page;
706 l2_pgentry_t *pl2e;
707 l3_pgentry_t l3e3;
708 int i;
710 pl3e = (l3_pgentry_t *)((unsigned long)pl3e & PAGE_MASK);
712 /* 3rd L3 slot contains L2 with Xen-private mappings. It *must* exist. */
713 l3e3 = pl3e[3];
714 if ( !(l3e_get_flags(l3e3) & _PAGE_PRESENT) )
715 {
716 MEM_LOG("PAE L3 3rd slot is empty");
717 return 0;
718 }
720 /*
721 * The Xen-private mappings include linear mappings. The L2 thus cannot
722 * be shared by multiple L3 tables. The test here is adequate because:
723 * 1. Cannot appear in slots != 3 because the page would then have an
724 * unknown va backpointer, which get_page_type() explicitly disallows.
725 * 2. Cannot appear in another page table's L3:
726 * a. alloc_l3_table() calls this function and this check will fail
727 * b. mod_l3_entry() disallows updates to slot 3 in an existing table
728 */
729 page = l3e_get_page(l3e3);
730 BUG_ON(page->u.inuse.type_info & PGT_pinned);
731 BUG_ON((page->u.inuse.type_info & PGT_count_mask) == 0);
732 if ( (page->u.inuse.type_info & PGT_count_mask) != 1 )
733 {
734 MEM_LOG("PAE L3 3rd slot is shared");
735 return 0;
736 }
738 /* Xen private mappings. */
739 pl2e = map_domain_page(l3e_get_pfn(l3e3));
740 memcpy(&pl2e[L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1)],
741 &idle_pg_table_l2[L2_PAGETABLE_FIRST_XEN_SLOT],
742 L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
743 for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
744 pl2e[l2_table_offset(PERDOMAIN_VIRT_START) + i] =
745 l2e_from_page(
746 virt_to_page(page_get_owner(page)->arch.mm_perdomain_pt) + i,
747 __PAGE_HYPERVISOR);
748 for ( i = 0; i < (LINEARPT_MBYTES >> (L2_PAGETABLE_SHIFT - 20)); i++ )
749 pl2e[l2_table_offset(LINEAR_PT_VIRT_START) + i] =
750 (l3e_get_flags(pl3e[i]) & _PAGE_PRESENT) ?
751 l2e_from_pfn(l3e_get_pfn(pl3e[i]), __PAGE_HYPERVISOR) :
752 l2e_empty();
753 unmap_domain_page(pl2e);
755 return 1;
756 }
758 static inline int l1_backptr(
759 unsigned long *backptr, unsigned long offset_in_l2, unsigned long l2_type)
760 {
761 unsigned long l2_backptr = l2_type & PGT_va_mask;
762 BUG_ON(l2_backptr == PGT_va_unknown);
763 if ( l2_backptr == PGT_va_mutable )
764 return 0;
765 *backptr =
766 ((l2_backptr >> PGT_va_shift) << L3_PAGETABLE_SHIFT) |
767 (offset_in_l2 << L2_PAGETABLE_SHIFT);
768 return 1;
769 }
771 #elif CONFIG_X86_64
772 # define create_pae_xen_mappings(pl3e) (1)
774 static inline int l1_backptr(
775 unsigned long *backptr, unsigned long offset_in_l2, unsigned long l2_type)
776 {
777 unsigned long l2_backptr = l2_type & PGT_va_mask;
778 BUG_ON(l2_backptr == PGT_va_unknown);
780 *backptr = ((l2_backptr >> PGT_va_shift) << L3_PAGETABLE_SHIFT) |
781 (offset_in_l2 << L2_PAGETABLE_SHIFT);
782 return 1;
783 }
785 static inline int l2_backptr(
786 unsigned long *backptr, unsigned long offset_in_l3, unsigned long l3_type)
787 {
788 unsigned long l3_backptr = l3_type & PGT_va_mask;
789 BUG_ON(l3_backptr == PGT_va_unknown);
791 *backptr = ((l3_backptr >> PGT_va_shift) << L4_PAGETABLE_SHIFT) |
792 (offset_in_l3 << L3_PAGETABLE_SHIFT);
793 return 1;
794 }
796 static inline int l3_backptr(
797 unsigned long *backptr, unsigned long offset_in_l4, unsigned long l4_type)
798 {
799 unsigned long l4_backptr = l4_type & PGT_va_mask;
800 BUG_ON(l4_backptr == PGT_va_unknown);
802 *backptr = (offset_in_l4 << L4_PAGETABLE_SHIFT);
803 return 1;
804 }
805 #else
806 # define create_pae_xen_mappings(pl3e) (1)
807 # define l1_backptr(bp,l2o,l2t) \
808 ({ *(bp) = (unsigned long)(l2o) << L2_PAGETABLE_SHIFT; 1; })
809 #endif
811 static int alloc_l2_table(struct pfn_info *page, unsigned long type)
812 {
813 struct domain *d = page_get_owner(page);
814 unsigned long pfn = page_to_pfn(page);
815 unsigned long vaddr;
816 l2_pgentry_t *pl2e;
817 int i;
819 /* See the code in shadow_promote() to understand why this is here. */
820 if ( (PGT_base_page_table == PGT_l2_page_table) &&
821 unlikely(shadow_mode_refcounts(d)) )
822 return 1;
823 ASSERT(!shadow_mode_refcounts(d));
825 pl2e = map_domain_page(pfn);
827 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
828 {
829 if ( !l1_backptr(&vaddr, i, type) )
830 goto fail;
831 if ( is_guest_l2_slot(type, i) &&
832 unlikely(!get_page_from_l2e(pl2e[i], pfn, d, vaddr)) )
833 goto fail;
834 }
836 #if CONFIG_PAGING_LEVELS == 2
837 /* Xen private mappings. */
838 memcpy(&pl2e[L2_PAGETABLE_FIRST_XEN_SLOT],
839 &idle_pg_table[L2_PAGETABLE_FIRST_XEN_SLOT],
840 L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
841 pl2e[l2_table_offset(LINEAR_PT_VIRT_START)] =
842 l2e_from_pfn(pfn, __PAGE_HYPERVISOR);
843 for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
844 pl2e[l2_table_offset(PERDOMAIN_VIRT_START) + i] =
845 l2e_from_page(
846 virt_to_page(page_get_owner(page)->arch.mm_perdomain_pt) + i,
847 __PAGE_HYPERVISOR);
848 #endif
850 unmap_domain_page(pl2e);
851 return 1;
853 fail:
854 MEM_LOG("Failure in alloc_l2_table: entry %d", i);
855 while ( i-- > 0 )
856 if ( is_guest_l2_slot(type, i) )
857 put_page_from_l2e(pl2e[i], pfn);
859 unmap_domain_page(pl2e);
860 return 0;
861 }
864 #if CONFIG_PAGING_LEVELS >= 3
865 static int alloc_l3_table(struct pfn_info *page, unsigned long type)
866 {
867 struct domain *d = page_get_owner(page);
868 unsigned long pfn = page_to_pfn(page);
869 unsigned long vaddr;
870 l3_pgentry_t *pl3e;
871 int i;
873 ASSERT(!shadow_mode_refcounts(d));
875 #ifdef CONFIG_X86_PAE
876 if ( pfn >= 0x100000 )
877 {
878 MEM_LOG("PAE pgd must be below 4GB (0x%lx >= 0x100000)", pfn);
879 return 0;
880 }
881 #endif
883 pl3e = map_domain_page(pfn);
884 for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
885 {
886 #if CONFIG_PAGING_LEVELS >= 4
887 if ( !l2_backptr(&vaddr, i, type) )
888 goto fail;
889 #else
890 vaddr = (unsigned long)i << L3_PAGETABLE_SHIFT;
891 #endif
892 if ( is_guest_l3_slot(i) &&
893 unlikely(!get_page_from_l3e(pl3e[i], pfn, d, vaddr)) )
894 goto fail;
895 }
897 if ( !create_pae_xen_mappings(pl3e) )
898 goto fail;
900 unmap_domain_page(pl3e);
901 return 1;
903 fail:
904 MEM_LOG("Failure in alloc_l3_table: entry %d", i);
905 while ( i-- > 0 )
906 if ( is_guest_l3_slot(i) )
907 put_page_from_l3e(pl3e[i], pfn);
909 unmap_domain_page(pl3e);
910 return 0;
911 }
912 #else
913 #define alloc_l3_table(page, type) (0)
914 #endif
916 #if CONFIG_PAGING_LEVELS >= 4
917 static int alloc_l4_table(struct pfn_info *page, unsigned long type)
918 {
919 struct domain *d = page_get_owner(page);
920 unsigned long pfn = page_to_pfn(page);
921 l4_pgentry_t *pl4e = page_to_virt(page);
922 unsigned long vaddr;
923 int i;
925 /* See the code in shadow_promote() to understand why this is here. */
926 if ( (PGT_base_page_table == PGT_l4_page_table) &&
927 shadow_mode_refcounts(d) )
928 return 1;
929 ASSERT(!shadow_mode_refcounts(d));
931 for ( i = 0; i < L4_PAGETABLE_ENTRIES; i++ )
932 {
933 if ( !l3_backptr(&vaddr, i, type) )
934 goto fail;
936 if ( is_guest_l4_slot(i) &&
937 unlikely(!get_page_from_l4e(pl4e[i], pfn, d, vaddr)) )
938 goto fail;
939 }
941 /* Xen private mappings. */
942 memcpy(&pl4e[ROOT_PAGETABLE_FIRST_XEN_SLOT],
943 &idle_pg_table[ROOT_PAGETABLE_FIRST_XEN_SLOT],
944 ROOT_PAGETABLE_XEN_SLOTS * sizeof(l4_pgentry_t));
945 pl4e[l4_table_offset(LINEAR_PT_VIRT_START)] =
946 l4e_from_pfn(pfn, __PAGE_HYPERVISOR);
947 pl4e[l4_table_offset(PERDOMAIN_VIRT_START)] =
948 l4e_from_page(
949 virt_to_page(page_get_owner(page)->arch.mm_perdomain_l3),
950 __PAGE_HYPERVISOR);
952 return 1;
954 fail:
955 MEM_LOG("Failure in alloc_l4_table: entry %d", i);
956 while ( i-- > 0 )
957 if ( is_guest_l4_slot(i) )
958 put_page_from_l4e(pl4e[i], pfn);
960 return 0;
961 }
962 #else
963 #define alloc_l4_table(page, type) (0)
964 #endif
967 static void free_l1_table(struct pfn_info *page)
968 {
969 struct domain *d = page_get_owner(page);
970 unsigned long pfn = page_to_pfn(page);
971 l1_pgentry_t *pl1e;
972 int i;
974 pl1e = map_domain_page(pfn);
976 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
977 if ( is_guest_l1_slot(i) )
978 put_page_from_l1e(pl1e[i], d);
980 unmap_domain_page(pl1e);
981 }
984 static void free_l2_table(struct pfn_info *page)
985 {
986 unsigned long pfn = page_to_pfn(page);
987 l2_pgentry_t *pl2e;
988 int i;
990 pl2e = map_domain_page(pfn);
992 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
993 if ( is_guest_l2_slot(page->u.inuse.type_info, i) )
994 put_page_from_l2e(pl2e[i], pfn);
996 unmap_domain_page(pl2e);
997 }
1000 #if CONFIG_PAGING_LEVELS >= 3
1002 static void free_l3_table(struct pfn_info *page)
1003 {
1004 unsigned long pfn = page_to_pfn(page);
1005 l3_pgentry_t *pl3e;
1006 int i;
1008 pl3e = map_domain_page(pfn);
1010 for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
1011 if ( is_guest_l3_slot(i) )
1012 put_page_from_l3e(pl3e[i], pfn);
1014 unmap_domain_page(pl3e);
1015 }
1017 #endif
1019 #if CONFIG_PAGING_LEVELS >= 4
1021 static void free_l4_table(struct pfn_info *page)
1022 {
1023 unsigned long pfn = page_to_pfn(page);
1024 l4_pgentry_t *pl4e = page_to_virt(page);
1025 int i;
1027 for ( i = 0; i < L4_PAGETABLE_ENTRIES; i++ )
1028 if ( is_guest_l4_slot(i) )
1029 put_page_from_l4e(pl4e[i], pfn);
1030 }
1032 #endif
1034 static inline int update_l1e(l1_pgentry_t *pl1e,
1035 l1_pgentry_t ol1e,
1036 l1_pgentry_t nl1e)
1037 {
1038 intpte_t o = l1e_get_intpte(ol1e);
1039 intpte_t n = l1e_get_intpte(nl1e);
1041 if ( unlikely(cmpxchg_user(pl1e, o, n) != 0) ||
1042 unlikely(o != l1e_get_intpte(ol1e)) )
1043 {
1044 MEM_LOG("Failed to update %" PRIpte " -> %" PRIpte
1045 ": saw %" PRIpte,
1046 l1e_get_intpte(ol1e),
1047 l1e_get_intpte(nl1e),
1048 o);
1049 return 0;
1050 }
1051 return 1;
1052 }
1055 /* Update the L1 entry at pl1e to new value nl1e. */
1056 static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e)
1057 {
1058 l1_pgentry_t ol1e;
1059 struct domain *d = current->domain;
1061 if ( unlikely(__copy_from_user(&ol1e, pl1e, sizeof(ol1e)) != 0) )
1062 return 0;
1064 if ( unlikely(shadow_mode_refcounts(d)) )
1065 return update_l1e(pl1e, ol1e, nl1e);
1067 if ( l1e_get_flags(nl1e) & _PAGE_PRESENT )
1068 {
1069 if ( unlikely(l1e_get_flags(nl1e) & L1_DISALLOW_MASK) )
1070 {
1071 MEM_LOG("Bad L1 flags %x",
1072 l1e_get_flags(nl1e) & L1_DISALLOW_MASK);
1073 return 0;
1074 }
1076 /* Fast path for identical mapping, r/w and presence. */
1077 if ( !l1e_has_changed(ol1e, nl1e, _PAGE_RW | _PAGE_PRESENT))
1078 return update_l1e(pl1e, ol1e, nl1e);
1080 if ( unlikely(!get_page_from_l1e(nl1e, FOREIGNDOM)) )
1081 return 0;
1083 if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
1084 {
1085 put_page_from_l1e(nl1e, d);
1086 return 0;
1087 }
1088 }
1089 else
1090 {
1091 if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
1092 return 0;
1093 }
1095 put_page_from_l1e(ol1e, d);
1096 return 1;
1097 }
1099 #define UPDATE_ENTRY(_t,_p,_o,_n) ({ \
1100 intpte_t __o = cmpxchg((intpte_t *)(_p), \
1101 _t ## e_get_intpte(_o), \
1102 _t ## e_get_intpte(_n)); \
1103 if ( __o != _t ## e_get_intpte(_o) ) \
1104 MEM_LOG("Failed to update %" PRIpte " -> %" PRIpte \
1105 ": saw %" PRIpte "", \
1106 (_t ## e_get_intpte(_o)), \
1107 (_t ## e_get_intpte(_n)), \
1108 (__o)); \
1109 (__o == _t ## e_get_intpte(_o)); })
1111 /* Update the L2 entry at pl2e to new value nl2e. pl2e is within frame pfn. */
1112 static int mod_l2_entry(l2_pgentry_t *pl2e,
1113 l2_pgentry_t nl2e,
1114 unsigned long pfn,
1115 unsigned long type)
1116 {
1117 l2_pgentry_t ol2e;
1118 unsigned long vaddr = 0;
1120 if ( unlikely(!is_guest_l2_slot(type,pgentry_ptr_to_slot(pl2e))) )
1121 {
1122 MEM_LOG("Illegal L2 update attempt in Xen-private area %p", pl2e);
1123 return 0;
1124 }
1126 if ( unlikely(__copy_from_user(&ol2e, pl2e, sizeof(ol2e)) != 0) )
1127 return 0;
1129 if ( l2e_get_flags(nl2e) & _PAGE_PRESENT )
1130 {
1131 if ( unlikely(l2e_get_flags(nl2e) & L2_DISALLOW_MASK) )
1132 {
1133 MEM_LOG("Bad L2 flags %x",
1134 l2e_get_flags(nl2e) & L2_DISALLOW_MASK);
1135 return 0;
1136 }
1138 /* Fast path for identical mapping and presence. */
1139 if ( !l2e_has_changed(ol2e, nl2e, _PAGE_PRESENT))
1140 return UPDATE_ENTRY(l2, pl2e, ol2e, nl2e);
1142 if ( unlikely(!l1_backptr(&vaddr, pgentry_ptr_to_slot(pl2e), type)) ||
1143 unlikely(!get_page_from_l2e(nl2e, pfn, current->domain, vaddr)) )
1144 return 0;
1146 if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e)) )
1147 {
1148 put_page_from_l2e(nl2e, pfn);
1149 return 0;
1150 }
1151 }
1152 else if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e)) )
1153 {
1154 return 0;
1155 }
1157 put_page_from_l2e(ol2e, pfn);
1158 return 1;
1159 }
1162 #if CONFIG_PAGING_LEVELS >= 3
1164 /* Update the L3 entry at pl3e to new value nl3e. pl3e is within frame pfn. */
1165 static int mod_l3_entry(l3_pgentry_t *pl3e,
1166 l3_pgentry_t nl3e,
1167 unsigned long pfn,
1168 unsigned long type)
1170 l3_pgentry_t ol3e;
1171 unsigned long vaddr;
1172 int okay;
1174 if ( unlikely(!is_guest_l3_slot(pgentry_ptr_to_slot(pl3e))) )
1176 MEM_LOG("Illegal L3 update attempt in Xen-private area %p", pl3e);
1177 return 0;
1180 #ifdef CONFIG_X86_PAE
1181 /*
1182 * Disallow updates to final L3 slot. It contains Xen mappings, and it
1183 * would be a pain to ensure they remain continuously valid throughout.
1184 */
1185 if ( pgentry_ptr_to_slot(pl3e) >= 3 )
1186 return 0;
1187 #endif
1189 if ( unlikely(__copy_from_user(&ol3e, pl3e, sizeof(ol3e)) != 0) )
1190 return 0;
1192 if ( l3e_get_flags(nl3e) & _PAGE_PRESENT )
1194 if ( unlikely(l3e_get_flags(nl3e) & L3_DISALLOW_MASK) )
1196 MEM_LOG("Bad L3 flags %x",
1197 l3e_get_flags(nl3e) & L3_DISALLOW_MASK);
1198 return 0;
1201 /* Fast path for identical mapping and presence. */
1202 if (!l3e_has_changed(ol3e, nl3e, _PAGE_PRESENT))
1203 return UPDATE_ENTRY(l3, pl3e, ol3e, nl3e);
1205 #if CONFIG_PAGING_LEVELS >= 4
1206 if ( unlikely(!l2_backptr(&vaddr, pgentry_ptr_to_slot(pl3e), type)) ||
1207 unlikely(!get_page_from_l3e(nl3e, pfn, current->domain, vaddr)) )
1208 return 0;
1209 #else
1210 vaddr = (((unsigned long)pl3e & ~PAGE_MASK) / sizeof(l3_pgentry_t))
1211 << L3_PAGETABLE_SHIFT;
1212 if ( unlikely(!get_page_from_l3e(nl3e, pfn, current->domain, vaddr)) )
1213 return 0;
1214 #endif
1216 if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e)) )
1218 put_page_from_l3e(nl3e, pfn);
1219 return 0;
1222 else if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e)) )
1224 return 0;
1227 okay = create_pae_xen_mappings(pl3e);
1228 BUG_ON(!okay);
1230 put_page_from_l3e(ol3e, pfn);
1231 return 1;
1234 #endif
1236 #if CONFIG_PAGING_LEVELS >= 4
1238 /* Update the L4 entry at pl4e to new value nl4e. pl4e is within frame pfn. */
1239 static int mod_l4_entry(l4_pgentry_t *pl4e,
1240 l4_pgentry_t nl4e,
1241 unsigned long pfn,
1242 unsigned long type)
1244 l4_pgentry_t ol4e;
1245 unsigned long vaddr;
1247 if ( unlikely(!is_guest_l4_slot(pgentry_ptr_to_slot(pl4e))) )
1249 MEM_LOG("Illegal L4 update attempt in Xen-private area %p", pl4e);
1250 return 0;
1253 if ( unlikely(__copy_from_user(&ol4e, pl4e, sizeof(ol4e)) != 0) )
1254 return 0;
1256 if ( l4e_get_flags(nl4e) & _PAGE_PRESENT )
1258 if ( unlikely(l4e_get_flags(nl4e) & L4_DISALLOW_MASK) )
1260 MEM_LOG("Bad L4 flags %x",
1261 l4e_get_flags(nl4e) & L4_DISALLOW_MASK);
1262 return 0;
1265 /* Fast path for identical mapping and presence. */
1266 if (!l4e_has_changed(ol4e, nl4e, _PAGE_PRESENT))
1267 return UPDATE_ENTRY(l4, pl4e, ol4e, nl4e);
1269 if ( unlikely(!l3_backptr(&vaddr, pgentry_ptr_to_slot(pl4e), type)) ||
1270 unlikely(!get_page_from_l4e(nl4e, pfn, current->domain, vaddr)) )
1271 return 0;
1273 if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e)) )
1275 put_page_from_l4e(nl4e, pfn);
1276 return 0;
1279 else if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e)) )
1281 return 0;
1284 put_page_from_l4e(ol4e, pfn);
1285 return 1;
1288 #endif
1290 int alloc_page_type(struct pfn_info *page, unsigned long type)
1291 {
1292 struct domain *owner = page_get_owner(page);
1294 if ( owner != NULL )
1295 mark_dirty(owner, page_to_pfn(page));
1297 switch ( type & PGT_type_mask )
1298 {
1299 case PGT_l1_page_table:
1300 return alloc_l1_table(page);
1301 case PGT_l2_page_table:
1302 return alloc_l2_table(page, type);
1303 case PGT_l3_page_table:
1304 return alloc_l3_table(page, type);
1305 case PGT_l4_page_table:
1306 return alloc_l4_table(page, type);
1307 case PGT_gdt_page:
1308 case PGT_ldt_page:
1309 return alloc_segdesc_page(page);
1310 default:
1311 printk("Bad type in alloc_page_type %lx t=%" PRtype_info " c=%x\n",
1312 type, page->u.inuse.type_info,
1313 page->count_info);
1314 BUG();
1315 }
1317 return 0;
1318 }
1321 void free_page_type(struct pfn_info *page, unsigned long type)
1323 struct domain *owner = page_get_owner(page);
1324 unsigned long gpfn;
1326 if ( likely(owner != NULL) )
1328 /*
1329 * We have to flush before the next use of the linear mapping
1330 * (e.g., update_va_mapping()) or we could end up modifying a page
1331 * that is no longer a page table (and hence screw up ref counts).
1332 */
1333 percpu_info[smp_processor_id()].deferred_ops |= DOP_FLUSH_ALL_TLBS;
1335 if ( unlikely(shadow_mode_enabled(owner)) )
1337 /* Raw page tables are rewritten during save/restore. */
1338 if ( !shadow_mode_translate(owner) )
1339 mark_dirty(owner, page_to_pfn(page));
1341 if ( shadow_mode_refcounts(owner) )
1342 return;
1344 gpfn = __mfn_to_gpfn(owner, page_to_pfn(page));
1345 ASSERT(VALID_M2P(gpfn));
1346 remove_shadow(owner, gpfn, type & PGT_type_mask);
1350 switch ( type & PGT_type_mask )
1352 case PGT_l1_page_table:
1353 free_l1_table(page);
1354 break;
1356 case PGT_l2_page_table:
1357 free_l2_table(page);
1358 break;
1360 #if CONFIG_PAGING_LEVELS >= 3
1361 case PGT_l3_page_table:
1362 free_l3_table(page);
1363 break;
1364 #endif
1366 #if CONFIG_PAGING_LEVELS >= 4
1367 case PGT_l4_page_table:
1368 free_l4_table(page);
1369 break;
1370 #endif
1372 default:
1373 printk("%s: type %lx pfn %lx\n",__FUNCTION__,
1374 type, page_to_pfn(page));
1375 BUG();
1380 void put_page_type(struct pfn_info *page)
1382 unsigned long nx, x, y = page->u.inuse.type_info;
1384 again:
1385 do {
1386 x = y;
1387 nx = x - 1;
1389 ASSERT((x & PGT_count_mask) != 0);
1391 /*
1392 * The page should always be validated while a reference is held. The
1393 * exception is during domain destruction, when we forcibly invalidate
1394 * page-table pages if we detect a referential loop.
1395 * See domain.c:relinquish_list().
1396 */
1397 ASSERT((x & PGT_validated) ||
1398 test_bit(_DOMF_dying, &page_get_owner(page)->domain_flags));
1400 if ( unlikely((nx & PGT_count_mask) == 0) )
1402 /* Record TLB information for flush later. Races are harmless. */
1403 page->tlbflush_timestamp = tlbflush_current_time();
1405 if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
1406 likely(nx & PGT_validated) )
1408 /*
1409 * Page-table pages must be unvalidated when count is zero. The
1410 * 'free' is safe because the refcnt is non-zero and validated
1411 * bit is clear => other ops will spin or fail.
1412 */
1413 if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x,
1414 x & ~PGT_validated)) != x) )
1415 goto again;
1416 /* We cleared the 'valid bit' so we do the clean up. */
1417 free_page_type(page, x);
1418 /* Carry on, but with the 'valid bit' now clear. */
1419 x &= ~PGT_validated;
1420 nx &= ~PGT_validated;
1423 else if ( unlikely(((nx & (PGT_pinned | PGT_count_mask)) ==
1424 (PGT_pinned | 1)) &&
1425 ((nx & PGT_type_mask) != PGT_writable_page)) )
1427 /* Page is now only pinned. Make the back pointer mutable again. */
1428 nx |= PGT_va_mutable;
1431 while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
1435 int get_page_type(struct pfn_info *page, unsigned long type)
1437 unsigned long nx, x, y = page->u.inuse.type_info;
1439 again:
1440 do {
1441 x = y;
1442 nx = x + 1;
1443 if ( unlikely((nx & PGT_count_mask) == 0) )
1445 MEM_LOG("Type count overflow on pfn %lx", page_to_pfn(page));
1446 return 0;
1448 else if ( unlikely((x & PGT_count_mask) == 0) )
1450 if ( (x & (PGT_type_mask|PGT_va_mask)) != type )
1452 if ( (x & PGT_type_mask) != (type & PGT_type_mask) )
1454 /*
1455 * On type change we check whether stale TLB
1456 * entries need flushing. This may be unnecessary (e.g., page
1457 * was GDT/LDT) but those circumstances should be
1458 * very rare.
1459 */
1460 cpumask_t mask =
1461 page_get_owner(page)->domain_dirty_cpumask;
1462 tlbflush_filter(mask, page->tlbflush_timestamp);
1464 if ( unlikely(!cpus_empty(mask)) )
1466 perfc_incrc(need_flush_tlb_flush);
1467 flush_tlb_mask(mask);
1471 /* We lose existing type, back pointer, and validity. */
1472 nx &= ~(PGT_type_mask | PGT_va_mask | PGT_validated);
1473 nx |= type;
1475 /* No special validation needed for writable pages. */
1476 /* Page tables and GDT/LDT need to be scanned for validity. */
1477 if ( type == PGT_writable_page )
1478 nx |= PGT_validated;
1481 else
1483 if ( unlikely((x & (PGT_type_mask|PGT_va_mask)) != type) )
1485 if ( unlikely((x & PGT_type_mask) != (type & PGT_type_mask) ) )
1487 if ( current->domain == page_get_owner(page) )
1489 /*
1490 * This ensures functions like set_gdt() see up-to-date
1491 * type info without needing to clean up writable p.t.
1492 * state on the fast path.
1493 */
1494 LOCK_BIGLOCK(current->domain);
1495 cleanup_writable_pagetable(current->domain);
1496 y = page->u.inuse.type_info;
1497 UNLOCK_BIGLOCK(current->domain);
1498 /* Can we make progress now? */
1499 if ( ((y & PGT_type_mask) == (type & PGT_type_mask)) ||
1500 ((y & PGT_count_mask) == 0) )
1501 goto again;
1503 if ( ((x & PGT_type_mask) != PGT_l2_page_table) ||
1504 ((type & PGT_type_mask) != PGT_l1_page_table) )
1505 MEM_LOG("Bad type (saw %" PRtype_info
1506 " != exp %" PRtype_info ") "
1507 "for mfn %lx (pfn %lx)",
1508 x, type, page_to_pfn(page),
1509 get_pfn_from_mfn(page_to_pfn(page)));
1510 return 0;
1512 else if ( (x & PGT_va_mask) == PGT_va_mutable )
1514 /* The va backpointer is mutable, hence we update it. */
1515 nx &= ~PGT_va_mask;
1516 nx |= type; /* we know the actual type is correct */
1518 else if ( ((type & PGT_va_mask) != PGT_va_mutable) &&
1519 ((type & PGT_va_mask) != (x & PGT_va_mask)) )
1521 #ifdef CONFIG_X86_PAE
1522 /* We use backptr as extra typing. Cannot be unknown. */
1523 if ( (type & PGT_type_mask) == PGT_l2_page_table )
1524 return 0;
1525 #endif
1526 /* This table is possibly mapped at multiple locations. */
1527 nx &= ~PGT_va_mask;
1528 nx |= PGT_va_unknown;
1531 if ( unlikely(!(x & PGT_validated)) )
1533 /* Someone else is updating validation of this page. Wait... */
1534 while ( (y = page->u.inuse.type_info) == x )
1535 cpu_relax();
1536 goto again;
1540 while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
1542 if ( unlikely(!(nx & PGT_validated)) )
1544 /* Try to validate page type; drop the new reference on failure. */
1545 if ( unlikely(!alloc_page_type(page, type)) )
1547 MEM_LOG("Error while validating mfn %lx (pfn %lx) for type %"
1548 PRtype_info ": caf=%08x taf=%" PRtype_info,
1549 page_to_pfn(page), get_pfn_from_mfn(page_to_pfn(page)),
1550 type, page->count_info, page->u.inuse.type_info);
1551 /* No one else can get a reference. We hold the only ref. */
1552 page->u.inuse.type_info = 0;
1553 return 0;
1556 /* No one else is updating simultaneously. */
1557 __set_bit(_PGT_validated, &page->u.inuse.type_info);
1560 return 1;
1564 int new_guest_cr3(unsigned long mfn)
1565 {
1566 struct vcpu *v = current;
1567 struct domain *d = v->domain;
1568 int okay;
1569 unsigned long old_base_mfn;
1571 if ( shadow_mode_refcounts(d) )
1572 okay = get_page_from_pagenr(mfn, d);
1573 else
1574 okay = get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d);
1576 if ( likely(okay) )
1577 {
1578 invalidate_shadow_ldt(v);
1580 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
1581 v->arch.guest_table = mk_pagetable(mfn << PAGE_SHIFT);
1582 update_pagetables(v); /* update shadow_table and monitor_table */
1584 write_ptbase(v);
1586 if ( shadow_mode_refcounts(d) )
1587 put_page(pfn_to_page(old_base_mfn));
1588 else
1589 put_page_and_type(pfn_to_page(old_base_mfn));
1591 /* CR3 also holds a ref to its shadow... */
1592 if ( shadow_mode_enabled(d) )
1593 {
1594 if ( v->arch.monitor_shadow_ref )
1595 put_shadow_ref(v->arch.monitor_shadow_ref);
1596 v->arch.monitor_shadow_ref =
1597 pagetable_get_pfn(v->arch.monitor_table);
1598 ASSERT(!page_get_owner(pfn_to_page(v->arch.monitor_shadow_ref)));
1599 get_shadow_ref(v->arch.monitor_shadow_ref);
1600 }
1601 }
1602 else
1603 {
1604 MEM_LOG("Error while installing new baseptr %lx", mfn);
1605 }
1607 return okay;
1608 }
1610 static void process_deferred_ops(unsigned int cpu)
1611 {
1612 unsigned int deferred_ops;
1613 struct domain *d = current->domain;
1615 deferred_ops = percpu_info[cpu].deferred_ops;
1616 percpu_info[cpu].deferred_ops = 0;
1618 if ( deferred_ops & (DOP_FLUSH_ALL_TLBS|DOP_FLUSH_TLB) )
1619 {
1620 if ( shadow_mode_enabled(d) )
1621 shadow_sync_all(d);
1622 if ( deferred_ops & DOP_FLUSH_ALL_TLBS )
1623 flush_tlb_mask(d->domain_dirty_cpumask);
1624 else
1625 local_flush_tlb();
1626 }
1628 if ( deferred_ops & DOP_RELOAD_LDT )
1629 (void)map_ldt_shadow_page(0);
1631 if ( unlikely(percpu_info[cpu].foreign != NULL) )
1632 {
1633 put_domain(percpu_info[cpu].foreign);
1634 percpu_info[cpu].foreign = NULL;
1635 }
1636 }
1638 static int set_foreigndom(unsigned int cpu, domid_t domid)
1640 struct domain *e, *d = current->domain;
1641 int okay = 1;
1643 if ( (e = percpu_info[cpu].foreign) != NULL )
1644 put_domain(e);
1645 percpu_info[cpu].foreign = NULL;
1647 if ( domid == DOMID_SELF )
1648 goto out;
1650 if ( !IS_PRIV(d) )
1652 switch ( domid )
1654 case DOMID_IO:
1655 get_knownalive_domain(dom_io);
1656 percpu_info[cpu].foreign = dom_io;
1657 break;
1658 default:
1659 MEM_LOG("Dom %u cannot set foreign dom", d->domain_id);
1660 okay = 0;
1661 break;
1664 else
1666 percpu_info[cpu].foreign = e = find_domain_by_id(domid);
1667 if ( e == NULL )
1669 switch ( domid )
1671 case DOMID_XEN:
1672 get_knownalive_domain(dom_xen);
1673 percpu_info[cpu].foreign = dom_xen;
1674 break;
1675 case DOMID_IO:
1676 get_knownalive_domain(dom_io);
1677 percpu_info[cpu].foreign = dom_io;
1678 break;
1679 default:
1680 MEM_LOG("Unknown domain '%u'", domid);
1681 okay = 0;
1682 break;
1687 out:
1688 return okay;
1691 static inline cpumask_t vcpumask_to_pcpumask(
1692 struct domain *d, unsigned long vmask)
1693 {
1694 unsigned int vcpu_id;
1695 cpumask_t pmask = CPU_MASK_NONE;
1696 struct vcpu *v;
1698 while ( vmask != 0 )
1699 {
1700 vcpu_id = find_first_set_bit(vmask);
1701 vmask &= ~(1UL << vcpu_id);
1702 if ( (vcpu_id < MAX_VIRT_CPUS) &&
1703 ((v = d->vcpu[vcpu_id]) != NULL) )
1704 cpus_or(pmask, pmask, v->vcpu_dirty_cpumask);
1705 }
1707 return pmask;
1708 }
1710 int do_mmuext_op(
1711 struct mmuext_op *uops,
1712 unsigned int count,
1713 unsigned int *pdone,
1714 unsigned int foreigndom)
1716 struct mmuext_op op;
1717 int rc = 0, i = 0, okay, cpu = smp_processor_id();
1718 unsigned long mfn, type, done = 0;
1719 struct pfn_info *page;
1720 struct vcpu *v = current;
1721 struct domain *d = v->domain;
1723 LOCK_BIGLOCK(d);
1725 cleanup_writable_pagetable(d);
1727 if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
1729 count &= ~MMU_UPDATE_PREEMPTED;
1730 if ( unlikely(pdone != NULL) )
1731 (void)get_user(done, pdone);
1734 if ( !set_foreigndom(cpu, foreigndom) )
1736 rc = -EINVAL;
1737 goto out;
1740 if ( unlikely(!array_access_ok(uops, count, sizeof(op))) )
1742 rc = -EFAULT;
1743 goto out;
1746 for ( i = 0; i < count; i++ )
1748 if ( hypercall_preempt_check() )
1750 rc = hypercall4_create_continuation(
1751 __HYPERVISOR_mmuext_op, uops,
1752 (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
1753 break;
1756 if ( unlikely(__copy_from_user(&op, uops, sizeof(op)) != 0) )
1758 MEM_LOG("Bad __copy_from_user");
1759 rc = -EFAULT;
1760 break;
1763 okay = 1;
1764 mfn = op.arg1.mfn;
1765 page = pfn_to_page(mfn);
1767 switch ( op.cmd )
1769 case MMUEXT_PIN_L1_TABLE:
1770 type = PGT_l1_page_table | PGT_va_mutable;
1772 pin_page:
1773 if ( shadow_mode_refcounts(FOREIGNDOM) )
1774 type = PGT_writable_page;
1776 okay = get_page_and_type_from_pagenr(mfn, type, FOREIGNDOM);
1777 if ( unlikely(!okay) )
1779 MEM_LOG("Error while pinning mfn %lx", mfn);
1780 break;
1783 if ( unlikely(test_and_set_bit(_PGT_pinned,
1784 &page->u.inuse.type_info)) )
1786 MEM_LOG("Mfn %lx already pinned", mfn);
1787 put_page_and_type(page);
1788 okay = 0;
1789 break;
1792 break;
1794 #ifndef CONFIG_X86_PAE /* Unsafe on PAE because of Xen-private mappings. */
1795 case MMUEXT_PIN_L2_TABLE:
1796 type = PGT_l2_page_table | PGT_va_mutable;
1797 goto pin_page;
1798 #endif
1800 case MMUEXT_PIN_L3_TABLE:
1801 type = PGT_l3_page_table | PGT_va_mutable;
1802 goto pin_page;
1804 case MMUEXT_PIN_L4_TABLE:
1805 type = PGT_l4_page_table | PGT_va_mutable;
1806 goto pin_page;
1808 case MMUEXT_UNPIN_TABLE:
1809 if ( unlikely(!(okay = get_page_from_pagenr(mfn, d))) )
1811 MEM_LOG("Mfn %lx bad domain (dom=%p)",
1812 mfn, page_get_owner(page));
1814 else if ( likely(test_and_clear_bit(_PGT_pinned,
1815 &page->u.inuse.type_info)) )
1817 put_page_and_type(page);
1818 put_page(page);
1820 else
1822 okay = 0;
1823 put_page(page);
1824 MEM_LOG("Mfn %lx not pinned", mfn);
1826 break;
1828 case MMUEXT_NEW_BASEPTR:
1829 mfn = __gpfn_to_mfn(current->domain, mfn);
1830 okay = new_guest_cr3(mfn);
1831 percpu_info[cpu].deferred_ops &= ~DOP_FLUSH_TLB;
1832 break;
1834 #ifdef __x86_64__
1835 case MMUEXT_NEW_USER_BASEPTR:
1836 okay = get_page_and_type_from_pagenr(
1837 mfn, PGT_root_page_table, d);
1838 if ( unlikely(!okay) )
1840 MEM_LOG("Error while installing new mfn %lx", mfn);
1842 else
1844 unsigned long old_mfn =
1845 pagetable_get_pfn(v->arch.guest_table_user);
1846 v->arch.guest_table_user = mk_pagetable(mfn << PAGE_SHIFT);
1847 if ( old_mfn != 0 )
1848 put_page_and_type(pfn_to_page(old_mfn));
1850 break;
1851 #endif
1853 case MMUEXT_TLB_FLUSH_LOCAL:
1854 percpu_info[cpu].deferred_ops |= DOP_FLUSH_TLB;
1855 break;
1857 case MMUEXT_INVLPG_LOCAL:
1858 if ( shadow_mode_enabled(d) )
1859 shadow_invlpg(v, op.arg1.linear_addr);
1860 local_flush_tlb_one(op.arg1.linear_addr);
1861 break;
1863 case MMUEXT_TLB_FLUSH_MULTI:
1864 case MMUEXT_INVLPG_MULTI:
1866 unsigned long vmask;
1867 cpumask_t pmask;
1868 if ( unlikely(get_user(vmask, (unsigned long *)op.arg2.vcpumask)) )
1870 okay = 0;
1871 break;
1873 pmask = vcpumask_to_pcpumask(d, vmask);
1874 if ( op.cmd == MMUEXT_TLB_FLUSH_MULTI )
1875 flush_tlb_mask(pmask);
1876 else
1877 flush_tlb_one_mask(pmask, op.arg1.linear_addr);
1878 break;
1881 case MMUEXT_TLB_FLUSH_ALL:
1882 flush_tlb_mask(d->domain_dirty_cpumask);
1883 break;
1885 case MMUEXT_INVLPG_ALL:
1886 flush_tlb_one_mask(d->domain_dirty_cpumask, op.arg1.linear_addr);
1887 break;
1889 case MMUEXT_FLUSH_CACHE:
1890 if ( unlikely(!cache_flush_permitted(d)) )
1892 MEM_LOG("Non-physdev domain tried to FLUSH_CACHE.");
1893 okay = 0;
1895 else
1897 wbinvd();
1899 break;
1901 case MMUEXT_SET_LDT:
1903 unsigned long ptr = op.arg1.linear_addr;
1904 unsigned long ents = op.arg2.nr_ents;
1906 if ( shadow_mode_external(d) )
1908 MEM_LOG("ignoring SET_LDT hypercall from external "
1909 "domain %u", d->domain_id);
1910 okay = 0;
1912 else if ( ((ptr & (PAGE_SIZE-1)) != 0) ||
1913 (ents > 8192) ||
1914 !array_access_ok(ptr, ents, LDT_ENTRY_SIZE) )
1916 okay = 0;
1917 MEM_LOG("Bad args to SET_LDT: ptr=%lx, ents=%lx", ptr, ents);
1919 else if ( (v->arch.guest_context.ldt_ents != ents) ||
1920 (v->arch.guest_context.ldt_base != ptr) )
1922 invalidate_shadow_ldt(v);
1923 v->arch.guest_context.ldt_base = ptr;
1924 v->arch.guest_context.ldt_ents = ents;
1925 load_LDT(v);
1926 percpu_info[cpu].deferred_ops &= ~DOP_RELOAD_LDT;
1927 if ( ents != 0 )
1928 percpu_info[cpu].deferred_ops |= DOP_RELOAD_LDT;
1930 break;
1933 case MMUEXT_PFN_HOLE_BASE:
1935 if (FOREIGNDOM->start_pfn_hole) {
1936 rc = FOREIGNDOM->start_pfn_hole;
1937 okay = 1;
1938 } else {
1939 rc = FOREIGNDOM->start_pfn_hole =
1940 FOREIGNDOM->max_pages;
1941 okay = 1;
1942 if (shadow_mode_translate(FOREIGNDOM)) {
1943 /* Fill in a few entries in the hole. At the
1944 moment, this means the shared info page and the
1945 grant table pages. */
1946 struct domain_mmap_cache c1, c2;
1947 unsigned long pfn, mfn, x;
1948 domain_mmap_cache_init(&c1);
1949 domain_mmap_cache_init(&c2);
1950 shadow_lock(FOREIGNDOM);
1951 pfn = FOREIGNDOM->start_pfn_hole;
1952 mfn = virt_to_phys(FOREIGNDOM->shared_info) >> PAGE_SHIFT;
1953 set_p2m_entry(FOREIGNDOM, pfn, mfn, &c1, &c2);
1954 set_pfn_from_mfn(mfn, pfn);
1955 pfn++;
1956 for (x = 0; x < NR_GRANT_FRAMES; x++) {
1957 mfn = gnttab_shared_mfn(FOREIGNDOM,
1958 FOREIGNDOM->grant_table,
1959 x);
1960 set_p2m_entry(FOREIGNDOM, pfn, mfn, &c1, &c2);
1961 set_pfn_from_mfn(mfn, pfn);
1962 pfn++;
1964 shadow_unlock(FOREIGNDOM);
1965 domain_mmap_cache_destroy(&c1);
1966 domain_mmap_cache_destroy(&c2);
1969 break;
1972 case MMUEXT_PFN_HOLE_SIZE:
1974 if (shadow_mode_translate(FOREIGNDOM)) {
1975 rc = PFN_HOLE_SIZE;
1976 } else {
1977 rc = 0;
1979 okay = 1;
1980 break;
1983 default:
1984 MEM_LOG("Invalid extended pt command 0x%x", op.cmd);
1985 okay = 0;
1986 break;
1989 if ( unlikely(!okay) )
1991 rc = -EINVAL;
1992 break;
1995 uops++;
1998 out:
1999 process_deferred_ops(cpu);
2001 /* Add incremental work we have done to the @done output parameter. */
2002 if ( unlikely(pdone != NULL) )
2003 __put_user(done + i, pdone);
2005 UNLOCK_BIGLOCK(d);
2006 return rc;
2009 int do_mmu_update(
2010 mmu_update_t *ureqs,
2011 unsigned int count,
2012 unsigned int *pdone,
2013 unsigned int foreigndom)
2015 mmu_update_t req;
2016 void *va;
2017 unsigned long gpfn, mfn;
2018 struct pfn_info *page;
2019 int rc = 0, okay = 1, i = 0, cpu = smp_processor_id();
2020 unsigned int cmd, done = 0;
2021 struct vcpu *v = current;
2022 struct domain *d = v->domain;
2023 unsigned long type_info;
2024 struct domain_mmap_cache mapcache, sh_mapcache;
2026 LOCK_BIGLOCK(d);
2028 cleanup_writable_pagetable(d);
2030 if ( unlikely(shadow_mode_enabled(d)) )
2031 check_pagetable(v, "pre-mmu"); /* debug */
2033 if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
2035 count &= ~MMU_UPDATE_PREEMPTED;
2036 if ( unlikely(pdone != NULL) )
2037 (void)get_user(done, pdone);
2040 domain_mmap_cache_init(&mapcache);
2041 domain_mmap_cache_init(&sh_mapcache);
2043 if ( !set_foreigndom(cpu, foreigndom) )
2045 rc = -EINVAL;
2046 goto out;
2049 perfc_incrc(calls_to_mmu_update);
2050 perfc_addc(num_page_updates, count);
2051 perfc_incr_histo(bpt_updates, count, PT_UPDATES);
2053 if ( unlikely(!array_access_ok(ureqs, count, sizeof(req))) )
2055 rc = -EFAULT;
2056 goto out;
2059 for ( i = 0; i < count; i++ )
2061 if ( hypercall_preempt_check() )
2063 rc = hypercall4_create_continuation(
2064 __HYPERVISOR_mmu_update, ureqs,
2065 (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
2066 break;
2069 if ( unlikely(__copy_from_user(&req, ureqs, sizeof(req)) != 0) )
2071 MEM_LOG("Bad __copy_from_user");
2072 rc = -EFAULT;
2073 break;
2076 cmd = req.ptr & (sizeof(l1_pgentry_t)-1);
2077 okay = 0;
2079 switch ( cmd )
2081 /*
2082 * MMU_NORMAL_PT_UPDATE: Normal update to any level of page table.
2083 */
2084 case MMU_NORMAL_PT_UPDATE:
2086 gpfn = req.ptr >> PAGE_SHIFT;
2087 mfn = __gpfn_to_mfn(d, gpfn);
2089 if ( unlikely(!get_page_from_pagenr(mfn, current->domain)) )
2091 MEM_LOG("Could not get page for normal update");
2092 break;
2095 va = map_domain_page_with_cache(mfn, &mapcache);
2096 va = (void *)((unsigned long)va +
2097 (unsigned long)(req.ptr & ~PAGE_MASK));
2098 page = pfn_to_page(mfn);
2100 switch ( (type_info = page->u.inuse.type_info) & PGT_type_mask )
2102 case PGT_l1_page_table:
2103 ASSERT( !shadow_mode_refcounts(d) );
2104 if ( likely(get_page_type(
2105 page, type_info & (PGT_type_mask|PGT_va_mask))) )
2107 l1_pgentry_t l1e;
2109 /* FIXME: doesn't work with PAE */
2110 l1e = l1e_from_intpte(req.val);
2111 okay = mod_l1_entry(va, l1e);
2112 if ( okay && unlikely(shadow_mode_enabled(d)) )
2113 shadow_l1_normal_pt_update(
2114 d, req.ptr, l1e, &sh_mapcache);
2115 put_page_type(page);
2117 break;
2118 case PGT_l2_page_table:
2119 ASSERT( !shadow_mode_refcounts(d) );
2120 if ( likely(get_page_type(
2121 page, type_info & (PGT_type_mask|PGT_va_mask))) )
2123 l2_pgentry_t l2e;
2125 /* FIXME: doesn't work with PAE */
2126 l2e = l2e_from_intpte(req.val);
2127 okay = mod_l2_entry(
2128 (l2_pgentry_t *)va, l2e, mfn, type_info);
2129 if ( okay && unlikely(shadow_mode_enabled(d)) )
2130 shadow_l2_normal_pt_update(
2131 d, req.ptr, l2e, &sh_mapcache);
2132 put_page_type(page);
2134 break;
2135 #if CONFIG_PAGING_LEVELS >= 3
2136 case PGT_l3_page_table:
2137 ASSERT( !shadow_mode_refcounts(d) );
2138 if ( likely(get_page_type(
2139 page, type_info & (PGT_type_mask|PGT_va_mask))) )
2141 l3_pgentry_t l3e;
2143 /* FIXME: doesn't work with PAE */
2144 l3e = l3e_from_intpte(req.val);
2145 okay = mod_l3_entry(va, l3e, mfn, type_info);
2146 if ( okay && unlikely(shadow_mode_enabled(d)) )
2147 shadow_l3_normal_pt_update(
2148 d, req.ptr, l3e, &sh_mapcache);
2149 put_page_type(page);
2151 break;
2152 #endif
2153 #if CONFIG_PAGING_LEVELS >= 4
2154 case PGT_l4_page_table:
2155 ASSERT( !shadow_mode_refcounts(d) );
2156 if ( likely(get_page_type(
2157 page, type_info & (PGT_type_mask|PGT_va_mask))) )
2159 l4_pgentry_t l4e;
2161 l4e = l4e_from_intpte(req.val);
2162 okay = mod_l4_entry(va, l4e, mfn, type_info);
2163 if ( okay && unlikely(shadow_mode_enabled(d)) )
2164 shadow_l4_normal_pt_update(
2165 d, req.ptr, l4e, &sh_mapcache);
2166 put_page_type(page);
2168 break;
2169 #endif
2170 default:
2171 if ( likely(get_page_type(page, PGT_writable_page)) )
2173 if ( shadow_mode_enabled(d) )
2175 shadow_lock(d);
2177 __mark_dirty(d, mfn);
2179 if ( page_is_page_table(page) &&
2180 !page_out_of_sync(page) )
2182 shadow_mark_mfn_out_of_sync(v, gpfn, mfn);
2186 *(intpte_t *)va = req.val;
2187 okay = 1;
2189 if ( shadow_mode_enabled(d) )
2190 shadow_unlock(d);
2192 put_page_type(page);
2194 break;
2197 unmap_domain_page_with_cache(va, &mapcache);
2199 put_page(page);
2200 break;
2202 case MMU_MACHPHYS_UPDATE:
2204 if (shadow_mode_translate(FOREIGNDOM)) {
2205 MEM_LOG("can't mutate m2p table of translate mode guest");
2206 break;
2209 mfn = req.ptr >> PAGE_SHIFT;
2210 gpfn = req.val;
2212 if ( unlikely(!get_page_from_pagenr(mfn, FOREIGNDOM)) )
2214 MEM_LOG("Could not get page for mach->phys update");
2215 break;
2218 set_pfn_from_mfn(mfn, gpfn);
2219 okay = 1;
2221 mark_dirty(FOREIGNDOM, mfn);
2223 put_page(pfn_to_page(mfn));
2224 break;
2226 default:
2227 MEM_LOG("Invalid page update command %x", cmd);
2228 break;
2231 if ( unlikely(!okay) )
2233 rc = -EINVAL;
2234 break;
2237 ureqs++;
2240 out:
2241 domain_mmap_cache_destroy(&mapcache);
2242 domain_mmap_cache_destroy(&sh_mapcache);
2244 process_deferred_ops(cpu);
2246 /* Add incremental work we have done to the @done output parameter. */
2247 if ( unlikely(pdone != NULL) )
2248 __put_user(done + i, pdone);
2250 if ( unlikely(shadow_mode_enabled(d)) )
2251 check_pagetable(v, "post-mmu"); /* debug */
2253 UNLOCK_BIGLOCK(d);
2254 return rc;
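
/*
 * A minimal standalone sketch of how a caller packs one request for the
 * hypercall above, assuming mmu_update_t and the MMU_* command values
 * mirror the public interface (xen/include/public/xen.h). pack_req() and
 * its sample inputs are illustrative only and are not part of this file.
 */

#include <stdint.h>
#include <stdio.h>

typedef struct { uint64_t ptr; uint64_t val; } mmu_update_t;
#define MMU_NORMAL_PT_UPDATE 0 /* ptr names a PTE; val is the new entry */
#define MMU_MACHPHYS_UPDATE  1 /* ptr names a frame; val is its new pfn */

static mmu_update_t pack_req(uint64_t pte_maddr, uint64_t new_val)
{
    mmu_update_t req;
    /* The target entry is word-aligned, so the command fits in ptr's low bits. */
    req.ptr = pte_maddr | MMU_NORMAL_PT_UPDATE;
    req.val = new_val;
    return req;
}

int main(void)
{
    mmu_update_t r = pack_req(0x1234000ULL + 0x80, 0xabc063ULL);
    /* The handler recovers the command exactly as the switch above does. */
    printf("cmd=%llu ptr=%#llx\n",
           (unsigned long long)(r.ptr & (sizeof(uint64_t) - 1)),
           (unsigned long long)(r.ptr & ~(uint64_t)(sizeof(uint64_t) - 1)));
    return 0;
}
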
2258 static int create_grant_pte_mapping(
2259 unsigned long pte_addr, l1_pgentry_t _nl1e, struct vcpu *v)
2261 int rc = GNTST_okay;
2262 void *va;
2263 unsigned long gpfn, mfn;
2264 struct pfn_info *page;
2265 u32 type_info;
2266 l1_pgentry_t ol1e;
2267 struct domain *d = v->domain;
2269 ASSERT(spin_is_locked(&d->big_lock));
2270 ASSERT(!shadow_mode_refcounts(d));
2272 gpfn = pte_addr >> PAGE_SHIFT;
2273 mfn = __gpfn_to_mfn(d, gpfn);
2275 if ( unlikely(!get_page_from_pagenr(mfn, current->domain)) )
2277 MEM_LOG("Could not get page for normal update");
2278 return GNTST_general_error;
2281 va = map_domain_page(mfn);
2282 va = (void *)((unsigned long)va + (pte_addr & ~PAGE_MASK));
2283 page = pfn_to_page(mfn);
2285 type_info = page->u.inuse.type_info;
2286 if ( ((type_info & PGT_type_mask) != PGT_l1_page_table) ||
2287 !get_page_type(page, type_info & (PGT_type_mask|PGT_va_mask)) )
2289 MEM_LOG("Grant map attempted to update a non-L1 page");
2290 rc = GNTST_general_error;
2291 goto failed;
2294 if ( __copy_from_user(&ol1e, (l1_pgentry_t *)va, sizeof(ol1e)) ||
2295 !update_l1e(va, ol1e, _nl1e) )
2297 put_page_type(page);
2298 rc = GNTST_general_error;
2299 goto failed;
2302 put_page_from_l1e(ol1e, d);
2304 if ( unlikely(shadow_mode_enabled(d)) )
2306 struct domain_mmap_cache sh_mapcache;
2307 domain_mmap_cache_init(&sh_mapcache);
2308 shadow_l1_normal_pt_update(d, pte_addr, _nl1e, &sh_mapcache);
2309 domain_mmap_cache_destroy(&sh_mapcache);
2312 put_page_type(page);
2314 failed:
2315 unmap_domain_page(va);
2316 put_page(page);
2317 return rc;
2320 static int destroy_grant_pte_mapping(
2321 unsigned long addr, unsigned long frame, struct domain *d)
2323 int rc = GNTST_okay;
2324 void *va;
2325 unsigned long gpfn, mfn;
2326 struct pfn_info *page;
2327 u32 type_info;
2328 l1_pgentry_t ol1e;
2330 ASSERT(!shadow_mode_refcounts(d));
2332 gpfn = addr >> PAGE_SHIFT;
2333 mfn = __gpfn_to_mfn(d, gpfn);
2335 if ( unlikely(!get_page_from_pagenr(mfn, current->domain)) )
2337 MEM_LOG("Could not get page for normal update");
2338 return GNTST_general_error;
2341 va = map_domain_page(mfn);
2342 va = (void *)((unsigned long)va + (addr & ~PAGE_MASK));
2343 page = pfn_to_page(mfn);
2345 type_info = page->u.inuse.type_info;
2346 if ( ((type_info & PGT_type_mask) != PGT_l1_page_table) ||
2347 !get_page_type(page, type_info & (PGT_type_mask|PGT_va_mask)) )
2349 MEM_LOG("Grant map attempted to update a non-L1 page");
2350 rc = GNTST_general_error;
2351 goto failed;
2354 if ( __copy_from_user(&ol1e, (l1_pgentry_t *)va, sizeof(ol1e)) )
2356 put_page_type(page);
2357 rc = GNTST_general_error;
2358 goto failed;
2361 /* Check that the virtual address supplied is actually mapped to frame. */
2362 if ( unlikely((l1e_get_intpte(ol1e) >> PAGE_SHIFT) != frame) )
2364 MEM_LOG("PTE entry %lx for address %lx doesn't match frame %lx",
2365 (unsigned long)l1e_get_intpte(ol1e), addr, frame);
2366 put_page_type(page);
2367 rc = GNTST_general_error;
2368 goto failed;
2371 /* Delete pagetable entry. */
2372 if ( unlikely(__put_user(0, (intpte_t *)va)))
2374 MEM_LOG("Cannot delete PTE entry at %p", va);
2375 put_page_type(page);
2376 rc = GNTST_general_error;
2377 goto failed;
2380 if ( unlikely(shadow_mode_enabled(d)) )
2382 struct domain_mmap_cache sh_mapcache;
2383 domain_mmap_cache_init(&sh_mapcache);
2384 shadow_l1_normal_pt_update(d, addr, l1e_empty(), &sh_mapcache);
2385 domain_mmap_cache_destroy(&sh_mapcache);
2388 put_page_type(page);
2390 failed:
2391 unmap_domain_page(va);
2392 put_page(page);
2393 return rc;
2397 static int create_grant_va_mapping(
2398 unsigned long va, l1_pgentry_t _nl1e, struct vcpu *v)
2400 l1_pgentry_t *pl1e, ol1e;
2401 struct domain *d = v->domain;
2403 ASSERT(spin_is_locked(&d->big_lock));
2404 ASSERT(!shadow_mode_refcounts(d));
2406 /*
2407 * This is actually overkill - we don't need to sync the L1 itself,
2408 * just everything involved in getting to this L1 (i.e. we need
2409 * linear_pg_table[l1_linear_offset(va)] to be in sync)...
2410 */
2411 __shadow_sync_va(v, va);
2413 pl1e = &linear_pg_table[l1_linear_offset(va)];
2415 if ( unlikely(__copy_from_user(&ol1e, pl1e, sizeof(ol1e)) != 0) ||
2416 !update_l1e(pl1e, ol1e, _nl1e) )
2417 return GNTST_general_error;
2419 put_page_from_l1e(ol1e, d);
2421 if ( unlikely(shadow_mode_enabled(d)) )
2422 shadow_do_update_va_mapping(va, _nl1e, v);
2424 return GNTST_okay;
2427 static int destroy_grant_va_mapping(
2428 unsigned long addr, unsigned long frame)
2430 l1_pgentry_t *pl1e, ol1e;
2432 pl1e = &linear_pg_table[l1_linear_offset(addr)];
2434 if ( unlikely(__get_user(ol1e.l1, &pl1e->l1) != 0) )
2436 MEM_LOG("Could not find PTE entry for address %lx", addr);
2437 return GNTST_general_error;
2440 /*
2441 * Check that the virtual address supplied is actually mapped to
2442 * frame.
2443 */
2444 if ( unlikely(l1e_get_pfn(ol1e) != frame) )
2446 MEM_LOG("PTE entry %lx for address %lx doesn't match frame %lx",
2447 l1e_get_pfn(ol1e), addr, frame);
2448 return GNTST_general_error;
2451 /* Delete pagetable entry. */
2452 if ( unlikely(__put_user(0, &pl1e->l1)) )
2454 MEM_LOG("Cannot delete PTE entry at %p", (unsigned long *)pl1e);
2455 return GNTST_general_error;
2458 return 0;
2461 int create_grant_host_mapping(
2462 unsigned long addr, unsigned long frame, unsigned int flags)
2464 l1_pgentry_t pte = l1e_from_pfn(frame, GRANT_PTE_FLAGS);
2466 if ( (flags & GNTMAP_application_map) )
2467 l1e_add_flags(pte,_PAGE_USER);
2468 if ( !(flags & GNTMAP_readonly) )
2469 l1e_add_flags(pte,_PAGE_RW);
2471 if ( flags & GNTMAP_contains_pte )
2472 return create_grant_pte_mapping(addr, pte, current);
2473 return create_grant_va_mapping(addr, pte, current);
2476 int destroy_grant_host_mapping(
2477 unsigned long addr, unsigned long frame, unsigned int flags)
2479 if ( flags & GNTMAP_contains_pte )
2480 return destroy_grant_pte_mapping(addr, frame, current->domain);
2481 return destroy_grant_va_mapping(addr, frame);
2484 int steal_page_for_grant_transfer(
2485 struct domain *d, struct pfn_info *page)
2487 u32 _d, _nd, x, y;
2489 spin_lock(&d->page_alloc_lock);
2491 /*
2492 * The tricky bit: atomically release ownership while there is just one
2493 * benign reference to the page (PGC_allocated). If that reference
2494 * disappears then the deallocation routine will safely spin.
2495 */
2496 _d = pickle_domptr(d);
2497 _nd = page->u.inuse._domain;
2498 y = page->count_info;
2499 do {
2500 x = y;
2501 if (unlikely((x & (PGC_count_mask|PGC_allocated)) !=
2502 (1 | PGC_allocated)) || unlikely(_nd != _d)) {
2503 DPRINTK("gnttab_transfer: Bad page %p: ed=%p(%u), sd=%p,"
2504 " caf=%08x, taf=%" PRtype_info "\n",
2505 (void *) page_to_pfn(page),
2506 d, d->domain_id, unpickle_domptr(_nd), x,
2507 page->u.inuse.type_info);
2508 spin_unlock(&d->page_alloc_lock);
2509 return -1;
2511 __asm__ __volatile__(
2512 LOCK_PREFIX "cmpxchg8b %2"
2513 : "=d" (_nd), "=a" (y),
2514 "=m" (*(volatile u64 *)(&page->count_info))
2515 : "0" (_d), "1" (x), "c" (NULL), "b" (x) );
2516 } while (unlikely(_nd != _d) || unlikely(y != x));
2518 /*
2519 * Unlink from 'd'. At least one reference remains (now anonymous), so
2520 * no one else is spinning to try to delete this page from 'd'.
2521 */
2522 d->tot_pages--;
2523 list_del(&page->list);
2525 spin_unlock(&d->page_alloc_lock);
2527 return 0;
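
/*
 * The loop above gives up ownership only while exactly one benign
 * reference remains, relying on a single wide compare-and-exchange so the
 * owner and the count change together. A rough standalone sketch of the
 * same pattern using C11 atomics; struct fake_page and its packed
 * owner/count word are invented for illustration and do not match the
 * real struct pfn_info layout or flag bits.
 */

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

struct fake_page {
    _Atomic uint64_t owner_and_count; /* owner in high 32 bits, count in low */
};

static bool steal(struct fake_page *pg, uint32_t expected_owner)
{
    uint64_t old = atomic_load(&pg->owner_and_count);
    for ( ; ; )
    {
        /* Bail out unless the expected owner holds exactly one reference. */
        if ( ((uint32_t)old != 1) || ((uint32_t)(old >> 32) != expected_owner) )
            return false;
        /* New value: anonymous owner (0), the single reference kept. */
        if ( atomic_compare_exchange_weak(&pg->owner_and_count, &old, 1ULL) )
            return true;
        /* Lost a race (or spurious failure): 'old' was refreshed, retry. */
    }
}
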
2530 int do_update_va_mapping(unsigned long va, u64 val64,
2531 unsigned long flags)
2533 l1_pgentry_t val = l1e_from_intpte(val64);
2534 struct vcpu *v = current;
2535 struct domain *d = v->domain;
2536 unsigned int cpu = smp_processor_id();
2537 unsigned long vmask, bmap_ptr;
2538 cpumask_t pmask;
2539 int rc = 0;
2541 perfc_incrc(calls_to_update_va);
2543 if ( unlikely(!__addr_ok(va) && !shadow_mode_external(d)) )
2544 return -EINVAL;
2546 LOCK_BIGLOCK(d);
2548 cleanup_writable_pagetable(d);
2550 if ( unlikely(shadow_mode_enabled(d)) )
2551 check_pagetable(v, "pre-va"); /* debug */
2553 if ( unlikely(!mod_l1_entry(&linear_pg_table[l1_linear_offset(va)],
2554 val)) )
2555 rc = -EINVAL;
2557 if ( likely(rc == 0) && unlikely(shadow_mode_enabled(d)) )
2559 if ( unlikely(percpu_info[cpu].foreign &&
2560 (shadow_mode_translate(d) ||
2561 shadow_mode_translate(percpu_info[cpu].foreign))) )
2563 /*
2564 * The foreign domain's pfn's are in a different namespace. There's
2565 * not enough information in just a gpte to figure out how to
2566 * (re-)shadow this entry.
2567 */
2568 domain_crash(d);
2571 rc = shadow_do_update_va_mapping(va, val, v);
2573 check_pagetable(v, "post-va"); /* debug */
2576 switch ( flags & UVMF_FLUSHTYPE_MASK )
2578 case UVMF_TLB_FLUSH:
2579 switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
2581 case UVMF_LOCAL:
2582 if ( unlikely(shadow_mode_enabled(d)) )
2583 shadow_sync_all(d);
2584 local_flush_tlb();
2585 break;
2586 case UVMF_ALL:
2587 flush_tlb_mask(d->domain_dirty_cpumask);
2588 break;
2589 default:
2590 if ( unlikely(get_user(vmask, (unsigned long *)bmap_ptr)) )
2591 rc = -EFAULT;
2592 pmask = vcpumask_to_pcpumask(d, vmask);
2593 flush_tlb_mask(pmask);
2594 break;
2596 break;
2598 case UVMF_INVLPG:
2599 switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
2601 case UVMF_LOCAL:
2602 if ( unlikely(shadow_mode_enabled(d)) )
2603 shadow_invlpg(current, va);
2604 local_flush_tlb_one(va);
2605 break;
2606 case UVMF_ALL:
2607 flush_tlb_one_mask(d->domain_dirty_cpumask, va);
2608 break;
2609 default:
2610 if ( unlikely(get_user(vmask, (unsigned long *)bmap_ptr)) )
2611 rc = -EFAULT;
2612 pmask = vcpumask_to_pcpumask(d, vmask);
2613 flush_tlb_one_mask(pmask, va);
2614 break;
2616 break;
2619 process_deferred_ops(cpu);
2621 UNLOCK_BIGLOCK(d);
2623 return rc;
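
/*
 * The flags word above carries a flush type in its low two bits and the
 * flush scope in the rest; a scope other than LOCAL or ALL is interpreted
 * as a guest pointer to a vcpu bitmap, which is why bmap_ptr is simply the
 * masked value. A standalone sketch of that decoding; the constants are
 * assumed to mirror xen/include/public/xen.h and decode() is illustrative
 * only.
 */

#include <stdio.h>

#define UVMF_NONE           (0UL << 0)
#define UVMF_TLB_FLUSH      (1UL << 0)
#define UVMF_INVLPG         (2UL << 0)
#define UVMF_FLUSHTYPE_MASK (3UL << 0)
#define UVMF_LOCAL          (0UL << 2)
#define UVMF_ALL            (1UL << 2)
/* Any other non-zero scope is treated as a pointer to a vcpu bitmap. */

static void decode(unsigned long flags)
{
    unsigned long type  = flags & UVMF_FLUSHTYPE_MASK;
    unsigned long scope = flags & ~UVMF_FLUSHTYPE_MASK;
    printf("type=%lu scope_or_bitmap_ptr=%#lx\n", type, scope);
}

int main(void)
{
    decode(UVMF_INVLPG | UVMF_LOCAL);  /* flush one entry on this cpu  */
    decode(UVMF_TLB_FLUSH | UVMF_ALL); /* full flush on all dirty cpus */
    return 0;
}
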
2626 int do_update_va_mapping_otherdomain(unsigned long va, u64 val64,
2627 unsigned long flags,
2628 domid_t domid)
2630 unsigned int cpu = smp_processor_id();
2631 struct domain *d;
2632 int rc;
2634 if ( unlikely(!IS_PRIV(current->domain)) )
2635 return -EPERM;
2637 percpu_info[cpu].foreign = d = find_domain_by_id(domid);
2638 if ( unlikely(d == NULL) )
2640 MEM_LOG("Unknown domain '%u'", domid);
2641 return -ESRCH;
2644 rc = do_update_va_mapping(va, val64, flags);
2646 return rc;
2651 /*************************
2652 * Descriptor Tables
2653 */
2655 void destroy_gdt(struct vcpu *v)
2657 int i;
2658 unsigned long pfn;
2660 v->arch.guest_context.gdt_ents = 0;
2661 for ( i = 0; i < FIRST_RESERVED_GDT_PAGE; i++ )
2663 if ( (pfn = l1e_get_pfn(v->arch.perdomain_ptes[i])) != 0 )
2664 put_page_and_type(pfn_to_page(pfn));
2665 v->arch.perdomain_ptes[i] = l1e_empty();
2666 v->arch.guest_context.gdt_frames[i] = 0;
2671 long set_gdt(struct vcpu *v,
2672 unsigned long *frames,
2673 unsigned int entries)
2675 struct domain *d = v->domain;
2676 /* NB. There are 512 8-byte entries per GDT page. */
2677 int i, nr_pages = (entries + 511) / 512;
2678 unsigned long pfn;
2680 if ( entries > FIRST_RESERVED_GDT_ENTRY )
2681 return -EINVAL;
2683 shadow_sync_all(d);
2685 /* Check the pages in the new GDT. */
2686 for ( i = 0; i < nr_pages; i++ ) {
2687 pfn = frames[i] = __gpfn_to_mfn(d, frames[i]);
2688 if ((pfn >= max_page) ||
2689 !get_page_and_type(pfn_to_page(pfn), d, PGT_gdt_page) )
2690 goto fail;
2693 /* Tear down the old GDT. */
2694 destroy_gdt(v);
2696 /* Install the new GDT. */
2697 v->arch.guest_context.gdt_ents = entries;
2698 for ( i = 0; i < nr_pages; i++ )
2700 v->arch.guest_context.gdt_frames[i] = frames[i];
2701 v->arch.perdomain_ptes[i] =
2702 l1e_from_pfn(frames[i], __PAGE_HYPERVISOR);
2705 return 0;
2707 fail:
2708 while ( i-- > 0 )
2709 put_page_and_type(pfn_to_page(frames[i]));
2710 return -EINVAL;
2714 long do_set_gdt(unsigned long *frame_list, unsigned int entries)
2716 int nr_pages = (entries + 511) / 512;
2717 unsigned long frames[16];
2718 long ret;
2720 /* Rechecked in set_gdt, but ensures a sane limit for copy_from_user(). */
2721 if ( entries > FIRST_RESERVED_GDT_ENTRY )
2722 return -EINVAL;
2724 if ( copy_from_user(frames, frame_list, nr_pages * sizeof(unsigned long)) )
2725 return -EFAULT;
2727 LOCK_BIGLOCK(current->domain);
2729 if ( (ret = set_gdt(current, frames, entries)) == 0 )
2730 local_flush_tlb();
2732 UNLOCK_BIGLOCK(current->domain);
2734 return ret;
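
/*
 * A small worked example of the frame arithmetic used above: with 8-byte
 * descriptors on 4KiB pages, each GDT frame holds 512 entries, hence
 * nr_pages = (entries + 511) / 512. gdt_pages_needed() is illustrative
 * only.
 */

#include <stdio.h>

static int gdt_pages_needed(unsigned int entries)
{
    return (entries + 511) / 512;
}

int main(void)
{
    printf("%d %d %d\n",
           gdt_pages_needed(1),    /* 1 */
           gdt_pages_needed(512),  /* 1 */
           gdt_pages_needed(513)); /* 2 */
    return 0;
}
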
2738 long do_update_descriptor(u64 pa, u64 desc)
2740 struct domain *dom = current->domain;
2741 unsigned long gpfn = pa >> PAGE_SHIFT;
2742 unsigned long mfn;
2743 unsigned int offset;
2744 struct desc_struct *gdt_pent, d;
2745 struct pfn_info *page;
2746 long ret = -EINVAL;
2748 offset = ((unsigned int)pa & ~PAGE_MASK) / sizeof(struct desc_struct);
2750 *(u64 *)&d = desc;
2752 LOCK_BIGLOCK(dom);
2754 if ( !VALID_MFN(mfn = __gpfn_to_mfn(dom, gpfn)) ||
2755 (((unsigned int)pa % sizeof(struct desc_struct)) != 0) ||
2756 (mfn >= max_page) ||
2757 !check_descriptor(&d) )
2759 UNLOCK_BIGLOCK(dom);
2760 return -EINVAL;
2763 page = pfn_to_page(mfn);
2764 if ( unlikely(!get_page(page, dom)) )
2766 UNLOCK_BIGLOCK(dom);
2767 return -EINVAL;
2770 /* Check if the given frame is in use in an unsafe context. */
2771 switch ( page->u.inuse.type_info & PGT_type_mask )
2773 case PGT_gdt_page:
2774 if ( unlikely(!get_page_type(page, PGT_gdt_page)) )
2775 goto out;
2776 break;
2777 case PGT_ldt_page:
2778 if ( unlikely(!get_page_type(page, PGT_ldt_page)) )
2779 goto out;
2780 break;
2781 default:
2782 if ( unlikely(!get_page_type(page, PGT_writable_page)) )
2783 goto out;
2784 break;
2787 if ( shadow_mode_enabled(dom) )
2789 shadow_lock(dom);
2791 __mark_dirty(dom, mfn);
2793 if ( page_is_page_table(page) && !page_out_of_sync(page) )
2794 shadow_mark_mfn_out_of_sync(current, gpfn, mfn);
2797 /* All is good so make the update. */
2798 gdt_pent = map_domain_page(mfn);
2799 memcpy(&gdt_pent[offset], &d, 8);
2800 unmap_domain_page(gdt_pent);
2802 if ( shadow_mode_enabled(dom) )
2803 shadow_unlock(dom);
2805 put_page_type(page);
2807 ret = 0; /* success */
2809 out:
2810 put_page(page);
2812 UNLOCK_BIGLOCK(dom);
2814 return ret;
2819 /*************************
2820 * Writable Pagetables
2821 */
2823 #ifdef VVERBOSE
2824 int ptwr_debug = 0x0;
2825 #define PTWR_PRINTK(_f, _a...) \
2826 do { if ( unlikely(ptwr_debug) ) printk( _f , ## _a ); } while ( 0 )
2827 #define PTWR_PRINT_WHICH (which ? 'I' : 'A')
2828 #else
2829 #define PTWR_PRINTK(_f, _a...) ((void)0)
2830 #endif
2833 #ifdef PERF_ARRAYS
2835 /**************** writeable pagetables profiling functions *****************/
2837 #define ptwr_eip_buckets 256
2839 int ptwr_eip_stat_threshold[] = {1, 10, 50, 100, L1_PAGETABLE_ENTRIES};
2841 #define ptwr_eip_stat_thresholdN (sizeof(ptwr_eip_stat_threshold)/sizeof(int))
2843 struct {
2844 unsigned long eip;
2845 domid_t id;
2846 u32 val[ptwr_eip_stat_thresholdN];
2847 } typedef ptwr_eip_stat_t;
2849 ptwr_eip_stat_t ptwr_eip_stats[ptwr_eip_buckets];
2851 static inline unsigned int ptwr_eip_stat_hash( unsigned long eip, domid_t id )
2853 return (((unsigned long) id) ^ eip ^ (eip>>8) ^ (eip>>16) ^ (eip>>24)) %
2854 ptwr_eip_buckets;
2857 static void ptwr_eip_stat_inc(u32 *n)
2859 int i, j;
2861 if ( ++(*n) != 0 )
2862 return;
2864 *n = ~0;
2866 /* Re-scale all buckets. */
2867 for ( i = 0; i <ptwr_eip_buckets; i++ )
2868 for ( j = 0; j < ptwr_eip_stat_thresholdN; j++ )
2869 ptwr_eip_stats[i].val[j] >>= 1;
2872 static void ptwr_eip_stat_update(unsigned long eip, domid_t id, int modified)
2874 int i, j, b;
2876 i = b = ptwr_eip_stat_hash(eip, id);
2878 do
2880 if ( !ptwr_eip_stats[i].eip )
2882 /* doesn't exist */
2883 ptwr_eip_stats[i].eip = eip;
2884 ptwr_eip_stats[i].id = id;
2885 memset(ptwr_eip_stats[i].val,0, sizeof(ptwr_eip_stats[i].val));
2888 if ( ptwr_eip_stats[i].eip == eip )
2890 for ( j = 0; j < ptwr_eip_stat_thresholdN; j++ )
2891 if ( modified <= ptwr_eip_stat_threshold[j] )
2892 break;
2893 BUG_ON(j >= ptwr_eip_stat_thresholdN);
2894 ptwr_eip_stat_inc(&ptwr_eip_stats[i].val[j]);
2895 return;
2898 i = (i+1) % ptwr_eip_buckets;
2900 while ( i != b );
2902 printk("ptwr_eip_stat: too many EIPs in use!\n");
2904 ptwr_eip_stat_print();
2905 ptwr_eip_stat_reset();
2908 void ptwr_eip_stat_reset(void)
2910 memset(ptwr_eip_stats, 0, sizeof(ptwr_eip_stats));
2913 void ptwr_eip_stat_print(void)
2915 struct domain *e;
2916 domid_t d;
2917 int i, j;
2919 for_each_domain( e )
2921 d = e->domain_id;
2923 for ( i = 0; i < ptwr_eip_buckets; i++ )
2925 if ( ptwr_eip_stats[i].eip && ptwr_eip_stats[i].id != d )
2926 continue;
2928 printk("D %d eip %08lx ",
2929 ptwr_eip_stats[i].id, ptwr_eip_stats[i].eip);
2931 for ( j = 0; j < ptwr_eip_stat_thresholdN; j++ )
2932 printk("<=%u %4u \t",
2933 ptwr_eip_stat_threshold[j],
2934 ptwr_eip_stats[i].val[j]);
2935 printk("\n");
2940 #else /* PERF_ARRAYS */
2942 #define ptwr_eip_stat_update(eip, id, modified) ((void)0)
2944 #endif
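
/*
 * A standalone sketch of how ptwr_eip_stat_update() above picks a
 * histogram slot: the first threshold no smaller than the number of
 * modified entries wins (the real code asserts the count never exceeds
 * the last threshold). The 512 stands in for L1_PAGETABLE_ENTRIES and is
 * assumed; slot_for() is illustrative only.
 */

#include <stdio.h>

static const int threshold[] = { 1, 10, 50, 100, 512 };

static int slot_for(int modified)
{
    int j;
    for ( j = 0; j < (int)(sizeof(threshold) / sizeof(threshold[0])); j++ )
        if ( modified <= threshold[j] )
            break;
    return j;
}

int main(void)
{
    printf("%d %d %d\n", slot_for(1), slot_for(7), slot_for(300)); /* 0 1 4 */
    return 0;
}
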
2946 /*******************************************************************/
2948 /* Re-validate a given p.t. page, given its prior snapshot */
2949 int revalidate_l1(
2950 struct domain *d, l1_pgentry_t *l1page, l1_pgentry_t *snapshot)
2952 l1_pgentry_t ol1e, nl1e;
2953 int modified = 0, i;
2955 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
2957 ol1e = snapshot[i];
2958 nl1e = l1page[i];
2960 if ( likely(l1e_get_intpte(ol1e) == l1e_get_intpte(nl1e)) )
2961 continue;
2963 /* Update number of entries modified. */
2964 modified++;
2966 /*
2967 * Fast path for PTEs that have merely been write-protected
2968 * (e.g., during a Unix fork()). A strict reduction in privilege.
2969 */
2970 if ( likely(l1e_get_intpte(ol1e) == (l1e_get_intpte(nl1e)|_PAGE_RW)) )
2972 if ( likely(l1e_get_flags(nl1e) & _PAGE_PRESENT) )
2973 put_page_type(pfn_to_page(l1e_get_pfn(nl1e)));
2974 continue;
2977 if ( unlikely(!get_page_from_l1e(nl1e, d)) )
2979 /*
2980 * Make the remaining p.t's consistent before crashing, so the
2981 * reference counts are correct.
2982 */
2983 memcpy(&l1page[i], &snapshot[i],
2984 (L1_PAGETABLE_ENTRIES - i) * sizeof(l1_pgentry_t));
2986 /* Crash the offending domain. */
2987 MEM_LOG("ptwr: Could not revalidate l1 page");
2988 domain_crash(d);
2989 break;
2992 put_page_from_l1e(ol1e, d);
2995 return modified;
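
/*
 * A standalone illustration of the write-protect fast path above: the old
 * entry equals the new entry with _PAGE_RW or'ed back in exactly when the
 * only change was clearing the writable bit, so only the writable type
 * reference needs dropping. The PTE value below is made up; _PAGE_RW is
 * the usual x86 bit 1.
 */

#include <stdint.h>
#include <stdio.h>

#define _PAGE_RW 0x002ULL

int main(void)
{
    uint64_t ol1e = 0x5063;           /* frame 0x5: present, rw, accessed, dirty */
    uint64_t nl1e = ol1e & ~_PAGE_RW; /* same entry, merely write-protected      */
    printf("%d\n", ol1e == (nl1e | _PAGE_RW)); /* prints 1: take the fast path */
    return 0;
}
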
2999 /* Flush the given writable p.t. page and write-protect it again. */
3000 void ptwr_flush(struct domain *d, const int which)
3002 unsigned long l1va;
3003 l1_pgentry_t *pl1e, pte, *ptep;
3004 l2_pgentry_t *pl2e;
3005 unsigned int modified;
3007 #ifdef CONFIG_X86_64
3008 struct vcpu *v = current;
3009 int user_mode = !(v->arch.flags & TF_kernel_mode);
3010 #endif
3012 ASSERT(!shadow_mode_enabled(d));
3014 if ( unlikely(d->arch.ptwr[which].vcpu != current) )
3015 /* Don't use write_ptbase: it may switch to guest_user on x86/64! */
3016 write_cr3(pagetable_get_paddr(
3017 d->arch.ptwr[which].vcpu->arch.guest_table));
3018 else
3019 TOGGLE_MODE();
3021 l1va = d->arch.ptwr[which].l1va;
3022 ptep = (l1_pgentry_t *)&linear_pg_table[l1_linear_offset(l1va)];
3024 /*
3025 * STEP 1. Write-protect the p.t. page so no more updates can occur.
3026 */
3028 if ( unlikely(__get_user(pte.l1, &ptep->l1)) )
3030 MEM_LOG("ptwr: Could not read pte at %p", ptep);
3031 /*
3032 * Really a bug. We could read this PTE during the initial fault,
3033 * and pagetables can't have changed meantime.
3034 */
3035 BUG();
3037 PTWR_PRINTK("[%c] disconnected_l1va at %p is %"PRIpte"\n",
3038 PTWR_PRINT_WHICH, ptep, l1e_get_intpte(pte));
3039 l1e_remove_flags(pte, _PAGE_RW);
3041 /* Write-protect the p.t. page in the guest page table. */
3042 if ( unlikely(__put_user(pte, ptep)) )
3044 MEM_LOG("ptwr: Could not update pte at %p", ptep);
3045 /*
3046 * Really a bug. We could write this PTE during the initial fault,
3047 * and pagetables can't have changed meantime.
3048 */
3049 BUG();
3052 /* Ensure that there are no stale writable mappings in any TLB. */
3053 /* NB. INVLPG is a serialising instruction: flushes pending updates. */
3054 flush_tlb_one_mask(d->domain_dirty_cpumask, l1va);
3055 PTWR_PRINTK("[%c] disconnected_l1va at %p now %"PRIpte"\n",
3056 PTWR_PRINT_WHICH, ptep, l1e_get_intpte(pte));
3058 /*
3059 * STEP 2. Validate any modified PTEs.
3060 */
3062 if ( likely(d == current->domain) )
3064 pl1e = map_domain_page(l1e_get_pfn(pte));
3065 modified = revalidate_l1(d, pl1e, d->arch.ptwr[which].page);
3066 unmap_domain_page(pl1e);
3067 perfc_incr_histo(wpt_updates, modified, PT_UPDATES);
3068 ptwr_eip_stat_update(d->arch.ptwr[which].eip, d->domain_id, modified);
3069 d->arch.ptwr[which].prev_nr_updates = modified;
3071 else
3073 /*
3074 * Must make a temporary global mapping, since we are running in the
3075 * wrong address space, so no access to our own mapcache.
3076 */
3077 pl1e = map_domain_page_global(l1e_get_pfn(pte));
3078 modified = revalidate_l1(d, pl1e, d->arch.ptwr[which].page);
3079 unmap_domain_page_global(pl1e);
3082 /*
3083 * STEP 3. Reattach the L1 p.t. page into the current address space.
3084 */
3086 if ( which == PTWR_PT_ACTIVE )
3088 pl2e = &__linear_l2_table[d->arch.ptwr[which].l2_idx];
3089 l2e_add_flags(*pl2e, _PAGE_PRESENT);
3092 /*
3093 * STEP 4. Final tidy-up.
3094 */
3096 d->arch.ptwr[which].l1va = 0;
3098 if ( unlikely(d->arch.ptwr[which].vcpu != current) )
3099 write_ptbase(current);
3100 else
3101 TOGGLE_MODE();
3104 static int ptwr_emulated_update(
3105 unsigned long addr,
3106 physaddr_t old,
3107 physaddr_t val,
3108 unsigned int bytes,
3109 unsigned int do_cmpxchg)
3111 unsigned long pfn, l1va;
3112 struct pfn_info *page;
3113 l1_pgentry_t pte, ol1e, nl1e, *pl1e;
3114 struct domain *d = current->domain;
3116 /* Aligned access only, thank you. */
3117 if ( !access_ok(addr, bytes) || ((addr & (bytes-1)) != 0) )
3119 MEM_LOG("ptwr_emulate: Unaligned or bad size ptwr access (%d, %lx)",
3120 bytes, addr);
3121 return X86EMUL_UNHANDLEABLE;
3124 /* Turn a sub-word access into a full-word access. */
3125 if ( bytes != sizeof(physaddr_t) )
3127 int rc;
3128 physaddr_t full;
3129 unsigned int offset = addr & (sizeof(physaddr_t)-1);
3131 /* Align address; read full word. */
3132 addr &= ~(sizeof(physaddr_t)-1);
3133 if ( (rc = x86_emulate_read_std(addr, (unsigned long *)&full,
3134 sizeof(physaddr_t))) )
3135 return rc;
3136 /* Mask out bits provided by caller. */
3137 full &= ~((((physaddr_t)1 << (bytes*8)) - 1) << (offset*8));
3138 /* Shift the caller value and OR in the missing bits. */
3139 val &= (((physaddr_t)1 << (bytes*8)) - 1);
3140 val <<= (offset)*8;
3141 val |= full;
3142 /* Also fill in missing parts of the cmpxchg old value. */
3143 old &= (((physaddr_t)1 << (bytes*8)) - 1);
3144 old <<= (offset)*8;
3145 old |= full;
3148 /*
3149 * We must not emulate an update to a PTE that is temporarily marked
3150 * writable by the batched ptwr logic, else we can corrupt page refcnts!
3151 */
3152 if ( ((l1va = d->arch.ptwr[PTWR_PT_ACTIVE].l1va) != 0) &&
3153 (l1_linear_offset(l1va) == l1_linear_offset(addr)) )
3154 ptwr_flush(d, PTWR_PT_ACTIVE);
3155 if ( ((l1va = d->arch.ptwr[PTWR_PT_INACTIVE].l1va) != 0) &&
3156 (l1_linear_offset(l1va) == l1_linear_offset(addr)) )
3157 ptwr_flush(d, PTWR_PT_INACTIVE);
3159 /* Read the PTE that maps the page being updated. */
3160 if (__copy_from_user(&pte, &linear_pg_table[l1_linear_offset(addr)],
3161 sizeof(pte)))
3163 MEM_LOG("ptwr_emulate: Cannot read thru linear_pg_table");
3164 return X86EMUL_UNHANDLEABLE;
3167 pfn = l1e_get_pfn(pte);
3168 page = pfn_to_page(pfn);
3170 /* We are looking only for read-only mappings of p.t. pages. */
3171 if ( ((l1e_get_flags(pte) & (_PAGE_RW|_PAGE_PRESENT)) != _PAGE_PRESENT) ||
3172 ((page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table) ||
3173 (page_get_owner(page) != d) )
3175 MEM_LOG("ptwr_emulate: Page is mistyped or bad pte "
3176 "(%lx, %" PRtype_info ")",
3177 l1e_get_pfn(pte), page->u.inuse.type_info);
3178 return X86EMUL_UNHANDLEABLE;
3181 /* Check the new PTE. */
3182 nl1e = l1e_from_intpte(val);
3183 if ( unlikely(!get_page_from_l1e(nl1e, d)) )
3185 MEM_LOG("ptwr_emulate: could not get_page_from_l1e()");
3186 return X86EMUL_UNHANDLEABLE;
3189 /* Checked successfully: do the update (write or cmpxchg). */
3190 pl1e = map_domain_page(page_to_pfn(page));
3191 pl1e = (l1_pgentry_t *)((unsigned long)pl1e + (addr & ~PAGE_MASK));
3192 if ( do_cmpxchg )
3194 ol1e = l1e_from_intpte(old);
3195 if ( cmpxchg((intpte_t *)pl1e, old, val) != old )
3197 unmap_domain_page(pl1e);
3198 put_page_from_l1e(nl1e, d);
3199 return X86EMUL_CMPXCHG_FAILED;
3202 else
3204 ol1e = *pl1e;
3205 *pl1e = nl1e;
3207 unmap_domain_page(pl1e);
3209 /* Finally, drop the old PTE. */
3210 put_page_from_l1e(ol1e, d);
3212 return X86EMUL_CONTINUE;
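
/*
 * A standalone sketch of the sub-word widening performed above: fetch the
 * full word, clear the bytes the caller supplies, then or in the shifted
 * caller value (the cmpxchg "old" value is widened the same way). widen()
 * and its sample inputs are illustrative; 'full' stands in for the value
 * read via x86_emulate_read_std().
 */

#include <stdint.h>
#include <stdio.h>

/* Only meaningful for bytes < sizeof(uint64_t), as in the code above. */
static uint64_t widen(uint64_t full, uint64_t val,
                      unsigned int offset, unsigned int bytes)
{
    uint64_t keep = ~((((uint64_t)1 << (bytes * 8)) - 1) << (offset * 8));
    val &= ((uint64_t)1 << (bytes * 8)) - 1;
    return (full & keep) | (val << (offset * 8));
}

int main(void)
{
    /* A 2-byte write of 0xbeef at byte offset 2 of an 8-byte entry. */
    printf("%#llx\n",
           (unsigned long long)widen(0x1122334455667788ULL, 0xbeef, 2, 2));
    /* prints 0x11223344beef7788 */
    return 0;
}
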
3215 static int ptwr_emulated_write(
3216 unsigned long addr,
3217 unsigned long val,
3218 unsigned int bytes)
3220 return ptwr_emulated_update(addr, 0, val, bytes, 0);
3223 static int ptwr_emulated_cmpxchg(
3224 unsigned long addr,
3225 unsigned long old,
3226 unsigned long new,
3227 unsigned int bytes)
3229 return ptwr_emulated_update(addr, old, new, bytes, 1);
3232 static int ptwr_emulated_cmpxchg8b(
3233 unsigned long addr,
3234 unsigned long old,
3235 unsigned long old_hi,
3236 unsigned long new,
3237 unsigned long new_hi)
3239 return ptwr_emulated_update(
3240 addr, ((u64)old_hi << 32) | old, ((u64)new_hi << 32) | new, 8, 1);
3243 static struct x86_mem_emulator ptwr_mem_emulator = {
3244 .read_std = x86_emulate_read_std,
3245 .write_std = x86_emulate_write_std,
3246 .read_emulated = x86_emulate_read_std,
3247 .write_emulated = ptwr_emulated_write,
3248 .cmpxchg_emulated = ptwr_emulated_cmpxchg,
3249 .cmpxchg8b_emulated = ptwr_emulated_cmpxchg8b
3250 };
3252 /* Write page fault handler: check if guest is trying to modify a PTE. */
3253 int ptwr_do_page_fault(struct domain *d, unsigned long addr,
3254 struct cpu_user_regs *regs)
3256 unsigned long pfn;
3257 struct pfn_info *page;
3258 l1_pgentry_t *pl1e, pte;
3259 l2_pgentry_t *pl2e, l2e;
3260 int which, flags;
3261 unsigned long l2_idx;
3263 if ( unlikely(shadow_mode_enabled(d)) )
3264 return 0;
3266 /*
3267 * Attempt to read the PTE that maps the VA being accessed. By checking for
3268 * PDE validity in the L2 we avoid many expensive fixups in __get_user().
3269 */
3270 if ( !(l2e_get_flags(__linear_l2_table[l2_linear_offset(addr)]) &
3271 _PAGE_PRESENT) ||
3272 __copy_from_user(&pte,&linear_pg_table[l1_linear_offset(addr)],
3273 sizeof(pte)) )
3275 return 0;
3278 pfn = l1e_get_pfn(pte);
3279 page = pfn_to_page(pfn);
3281 #ifdef CONFIG_X86_64
3282 #define WRPT_PTE_FLAGS (_PAGE_RW | _PAGE_PRESENT | _PAGE_USER)
3283 #else
3284 #define WRPT_PTE_FLAGS (_PAGE_RW | _PAGE_PRESENT)
3285 #endif
3287 /*
3288 * Check the required flags for a valid wrpt mapping. If the page is
3289 * already writable then we can return straight to the guest (SMP race).
3290 * We decide whether or not to propagate the fault by testing for write
3291 * permissions in page directories by writing back to the linear mapping.
3292 */
3293 if ( (flags = l1e_get_flags(pte) & WRPT_PTE_FLAGS) == WRPT_PTE_FLAGS )
3294 return !__put_user(
3295 pte.l1, &linear_pg_table[l1_linear_offset(addr)].l1);
3297 /* We are looking only for read-only mappings of p.t. pages. */
3298 if ( ((flags | _PAGE_RW) != WRPT_PTE_FLAGS) ||
3299 ((page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table) ||
3300 ((page->u.inuse.type_info & PGT_count_mask) == 0) ||
3301 (page_get_owner(page) != d) )
3303 return 0;
3306 #if 0 /* Leave this in as useful for debugging */
3307 goto emulate;
3308 #endif
3310 PTWR_PRINTK("ptwr_page_fault on l1 pt at va %lx, pfn %lx, eip %lx\n",
3311 addr, pfn, (unsigned long)regs->eip);
3313 /* Get the L2 index at which this L1 p.t. is always mapped. */
3314 l2_idx = page->u.inuse.type_info & PGT_va_mask;
3315 if ( unlikely(l2_idx >= PGT_va_unknown) )
3316 goto emulate; /* Urk! This L1 is mapped in multiple L2 slots! */
3317 l2_idx >>= PGT_va_shift;
3319 if ( unlikely(l2_idx == l2_linear_offset(addr)) )
3320 goto emulate; /* Urk! Pagetable maps itself! */
3322 /*
3323 * Is the L1 p.t. mapped into the current address space? If so we call it
3324 * an ACTIVE p.t., otherwise it is INACTIVE.
3325 */
3326 pl2e = &__linear_l2_table[l2_idx];
3327 which = PTWR_PT_INACTIVE;
3329 if ( (__get_user(l2e.l2, &pl2e->l2) == 0) && (l2e_get_pfn(l2e) == pfn) )
3331 /*
3332 * Check the PRESENT bit to set ACTIVE mode.
3333 * If the PRESENT bit is clear, we may be conflicting with the current
3334 * ACTIVE p.t. (it may be the same p.t. mapped at another virt addr).
3335 * The ptwr_flush call below will restore the PRESENT bit.
3336 */
3337 if ( likely(l2e_get_flags(l2e) & _PAGE_PRESENT) ||
3338 (d->arch.ptwr[PTWR_PT_ACTIVE].l1va &&
3339 (l2_idx == d->arch.ptwr[PTWR_PT_ACTIVE].l2_idx)) )
3340 which = PTWR_PT_ACTIVE;
3343 /*
3344 * If this is a multi-processor guest then ensure that the page is hooked
3345 * into at most one L2 table, which must be the one running on this VCPU.
3346 */
3347 if ( (d->vcpu[0]->next_in_list != NULL) &&
3348 ((page->u.inuse.type_info & PGT_count_mask) !=
3349 (!!(page->u.inuse.type_info & PGT_pinned) +
3350 (which == PTWR_PT_ACTIVE))) )
3352 /* Could be conflicting writable mappings from other VCPUs. */
3353 cleanup_writable_pagetable(d);
3354 goto emulate;
3357 /*
3358 * We only allow one ACTIVE and one INACTIVE p.t. to be updated at a
3359 * time. If there is already one, we must flush it out.
3360 */
3361 if ( d->arch.ptwr[which].l1va )
3362 ptwr_flush(d, which);
3364 /*
3365 * If last batch made no updates then we are probably stuck. Emulate this
3366 * update to ensure we make progress.
3367 */
3368 if ( d->arch.ptwr[which].prev_nr_updates == 0 )
3370 /* Ensure that we don't get stuck in an emulation-only rut. */
3371 d->arch.ptwr[which].prev_nr_updates = 1;
3372 goto emulate;
3375 PTWR_PRINTK("[%c] batched ptwr_page_fault at va %lx, pt for %08lx, "
3376 "pfn %lx\n", PTWR_PRINT_WHICH, addr,
3377 l2_idx << L2_PAGETABLE_SHIFT, pfn);
3379 d->arch.ptwr[which].l1va = addr | 1;
3380 d->arch.ptwr[which].l2_idx = l2_idx;
3381 d->arch.ptwr[which].vcpu = current;
3383 #ifdef PERF_ARRAYS
3384 d->arch.ptwr[which].eip = regs->eip;
3385 #endif
3387 /* For safety, disconnect the L1 p.t. page from current space. */
3388 if ( which == PTWR_PT_ACTIVE )
3390 l2e_remove_flags(*pl2e, _PAGE_PRESENT);
3391 flush_tlb_mask(d->domain_dirty_cpumask);
3394 /* Temporarily map the L1 page, and make a copy of it. */
3395 pl1e = map_domain_page(pfn);
3396 memcpy(d->arch.ptwr[which].page, pl1e, PAGE_SIZE);
3397 unmap_domain_page(pl1e);
3399 /* Finally, make the p.t. page writable by the guest OS. */
3400 l1e_add_flags(pte, _PAGE_RW);
3401 if ( unlikely(__put_user(pte.l1,
3402 &linear_pg_table[l1_linear_offset(addr)].l1)) )
3404 MEM_LOG("ptwr: Could not update pte at %p", (unsigned long *)
3405 &linear_pg_table[l1_linear_offset(addr)]);
3406 /* Toss the writable pagetable state and crash. */
3407 d->arch.ptwr[which].l1va = 0;
3408 domain_crash(d);
3409 return 0;
3412 return EXCRET_fault_fixed;
3414 emulate:
3415 if ( x86_emulate_memop(guest_cpu_user_regs(), addr,
3416 &ptwr_mem_emulator, X86EMUL_MODE_HOST) )
3417 return 0;
3418 perfc_incrc(ptwr_emulations);
3419 return EXCRET_fault_fixed;
3422 int ptwr_init(struct domain *d)
3424 void *x = alloc_xenheap_page();
3425 void *y = alloc_xenheap_page();
3427 if ( (x == NULL) || (y == NULL) )
3429 free_xenheap_page(x);
3430 free_xenheap_page(y);
3431 return -ENOMEM;
3434 d->arch.ptwr[PTWR_PT_ACTIVE].page = x;
3435 d->arch.ptwr[PTWR_PT_INACTIVE].page = y;
3437 return 0;
3440 void ptwr_destroy(struct domain *d)
3442 LOCK_BIGLOCK(d);
3443 cleanup_writable_pagetable(d);
3444 UNLOCK_BIGLOCK(d);
3445 free_xenheap_page(d->arch.ptwr[PTWR_PT_ACTIVE].page);
3446 free_xenheap_page(d->arch.ptwr[PTWR_PT_INACTIVE].page);
3449 void cleanup_writable_pagetable(struct domain *d)
3451 if ( unlikely(!VM_ASSIST(d, VMASST_TYPE_writable_pagetables)) )
3452 return;
3454 if ( unlikely(shadow_mode_enabled(d)) )
3456 shadow_sync_all(d);
3458 else
3460 if ( d->arch.ptwr[PTWR_PT_ACTIVE].l1va )
3461 ptwr_flush(d, PTWR_PT_ACTIVE);
3462 if ( d->arch.ptwr[PTWR_PT_INACTIVE].l1va )
3463 ptwr_flush(d, PTWR_PT_INACTIVE);
3467 int map_pages_to_xen(
3468 unsigned long virt,
3469 unsigned long pfn,
3470 unsigned long nr_pfns,
3471 unsigned long flags)
3473 l2_pgentry_t *pl2e, ol2e;
3474 l1_pgentry_t *pl1e, ol1e;
3475 unsigned int i;
3477 unsigned int map_small_pages = !!(flags & MAP_SMALL_PAGES);
3478 flags &= ~MAP_SMALL_PAGES;
3480 while ( nr_pfns != 0 )
3482 pl2e = virt_to_xen_l2e(virt);
3484 if ( ((((virt>>PAGE_SHIFT) | pfn) & ((1<<PAGETABLE_ORDER)-1)) == 0) &&
3485 (nr_pfns >= (1<<PAGETABLE_ORDER)) &&
3486 !map_small_pages )
3488 /* Super-page mapping. */
3489 ol2e = *pl2e;
3490 *pl2e = l2e_from_pfn(pfn, flags|_PAGE_PSE);
3492 if ( (l2e_get_flags(ol2e) & _PAGE_PRESENT) )
3494 local_flush_tlb_pge();
3495 if ( !(l2e_get_flags(ol2e) & _PAGE_PSE) )
3496 free_xen_pagetable(l2e_get_page(*pl2e));
3499 virt += 1UL << L2_PAGETABLE_SHIFT;
3500 pfn += 1UL << PAGETABLE_ORDER;
3501 nr_pfns -= 1UL << PAGETABLE_ORDER;
3503 else
3505 /* Normal page mapping. */
3506 if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) )
3508 pl1e = page_to_virt(alloc_xen_pagetable());
3509 clear_page(pl1e);
3510 *pl2e = l2e_from_page(virt_to_page(pl1e), __PAGE_HYPERVISOR);
3512 else if ( l2e_get_flags(*pl2e) & _PAGE_PSE )
3514 pl1e = page_to_virt(alloc_xen_pagetable());
3515 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
3516 pl1e[i] = l1e_from_pfn(
3517 l2e_get_pfn(*pl2e) + i,
3518 l2e_get_flags(*pl2e) & ~_PAGE_PSE);
3519 *pl2e = l2e_from_page(virt_to_page(pl1e), __PAGE_HYPERVISOR);
3520 local_flush_tlb_pge();
3523 pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(virt);
3524 ol1e = *pl1e;
3525 *pl1e = l1e_from_pfn(pfn, flags);
3526 if ( (l1e_get_flags(ol1e) & _PAGE_PRESENT) )
3527 local_flush_tlb_one(virt);
3529 virt += 1UL << L1_PAGETABLE_SHIFT;
3530 pfn += 1UL;
3531 nr_pfns -= 1UL;
3535 return 0;
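
/*
 * A standalone sketch of the superpage eligibility test above: the low
 * PAGETABLE_ORDER bits of both the virtual frame number and the physical
 * frame number must be clear, and at least a superpage's worth of frames
 * must remain. PAGETABLE_ORDER is assumed to be 9 here (the PAE/x86_64
 * value); the non-PAE 32-bit build uses 10.
 */

#include <stdio.h>

#define PAGE_SHIFT      12
#define PAGETABLE_ORDER 9

static int can_use_superpage(unsigned long virt, unsigned long pfn,
                             unsigned long nr_pfns)
{
    return ((((virt >> PAGE_SHIFT) | pfn) & ((1 << PAGETABLE_ORDER) - 1)) == 0)
           && (nr_pfns >= (1 << PAGETABLE_ORDER));
}

int main(void)
{
    printf("%d\n", can_use_superpage(0x80200000UL, 0x200, 512)); /* 1 */
    printf("%d\n", can_use_superpage(0x80201000UL, 0x201, 512)); /* 0 */
    return 0;
}
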
3538 void __set_fixmap(
3539 enum fixed_addresses idx, unsigned long p, unsigned long flags)
3541 if ( unlikely(idx >= __end_of_fixed_addresses) )
3542 BUG();
3543 map_pages_to_xen(fix_to_virt(idx), p >> PAGE_SHIFT, 1, flags);
3546 #ifdef MEMORY_GUARD
3548 void memguard_init(void)
3550 map_pages_to_xen(
3551 PAGE_OFFSET, 0, xenheap_phys_end >> PAGE_SHIFT,
3552 __PAGE_HYPERVISOR|MAP_SMALL_PAGES);
3555 static void __memguard_change_range(void *p, unsigned long l, int guard)
3557 unsigned long _p = (unsigned long)p;
3558 unsigned long _l = (unsigned long)l;
3559 unsigned long flags = __PAGE_HYPERVISOR | MAP_SMALL_PAGES;
3561 /* Ensure we are dealing with a page-aligned whole number of pages. */
3562 ASSERT((_p&PAGE_MASK) != 0);
3563 ASSERT((_l&PAGE_MASK) != 0);
3564 ASSERT((_p&~PAGE_MASK) == 0);
3565 ASSERT((_l&~PAGE_MASK) == 0);
3567 if ( guard )
3568 flags &= ~_PAGE_PRESENT;
3570 map_pages_to_xen(
3571 _p, virt_to_phys(p) >> PAGE_SHIFT, _l >> PAGE_SHIFT, flags);
3574 void memguard_guard_range(void *p, unsigned long l)
3576 __memguard_change_range(p, l, 1);
3579 void memguard_unguard_range(void *p, unsigned long l)
3581 __memguard_change_range(p, l, 0);
3584 #endif
3586 /*
3587 * Local variables:
3588 * mode: C
3589 * c-set-style: "BSD"
3590 * c-basic-offset: 4
3591 * tab-width: 4
3592 * indent-tabs-mode: nil
3593 * End:
3594 */