ia64/xen-unstable

view xen/arch/x86/mm.c @ 6294:1a0723cd37f1

Fix many uses of machine addresses in XenLinux. Primarily
this fixes users of virt_to_machine/machine_to_virt to
use virt_to_mfn/mfn_to_virt where that is more appropriate.

This should be a big step to improved PAE stability.

Signed-off-by: Keir Fraser <keir@xensource.com>
author kaf24@firebug.cl.cam.ac.uk
date Fri Aug 19 16:06:43 2005 +0000 (2005-08-19)
parents 47d49e8b8042
children bd5533956fb0 6721abf6b16d 81576d3d1ca8
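For context, the substitution the message describes looks roughly like the fragment below. This is an illustrative XenLinux-style sketch, not code from this changeset, and 'ptep' is a hypothetical pointer: virt_to_machine() yields a full machine address, which under PAE can exceed what a 32-bit unsigned long holds, whereas virt_to_mfn() yields a machine frame number, which always fits.

/* Before: a full machine address; truncated on PAE if it needs more than
 * 32 bits. */
unsigned long ma = virt_to_machine(ptep);

/* After: keep a frame number, which always fits in an unsigned long, and
 * widen to a 64-bit machine address only at the point of use. */
unsigned long mfn = virt_to_mfn(ptep);
u64 maddr = (u64)mfn << PAGE_SHIFT;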
line source
1 /******************************************************************************
2 * arch/x86/mm.c
3 *
4 * Copyright (c) 2002-2005 K A Fraser
5 * Copyright (c) 2004 Christian Limpach
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
22 /*
23 * A description of the x86 page table API:
24 *
25 * Domains trap to do_mmu_update with a list of update requests.
26 * This is a list of (ptr, val) pairs, where the requested operation
27 * is *ptr = val.
28 *
29 * Reference counting of pages:
30 * ----------------------------
31 * Each page has two refcounts: tot_count and type_count.
32 *
33 * TOT_COUNT is the obvious reference count. It counts all uses of a
34 * physical page frame by a domain, including uses as a page directory,
35 * a page table, or simple mappings via a PTE. This count prevents a
36 * domain from releasing a frame back to the free pool when it still holds
37 * a reference to it.
38 *
39 * TYPE_COUNT is more subtle. A frame can be put to one of three
40 * mutually-exclusive uses: as a page directory, as a page table, or
41 * mapped writable by the domain [of course, a frame need not be used
42 * in any of these three ways at all!].
43 * So, type_count is a count of the number of times a frame is being
44 * referred to in its current incarnation. Therefore, a page can only
45 * change its type when its type count is zero.
46 *
47 * Pinning the page type:
48 * ----------------------
49 * The type of a page can be pinned/unpinned with the commands
50 * MMUEXT_[UN]PIN_L?_TABLE. Each page can be pinned exactly once (that is,
51 * pinning is not reference counted, so it can't be nested).
52 * This is useful to prevent a page's type count falling to zero, at which
53 * point safety checks would need to be carried out next time the count
54 * is increased again.
55 *
56 * A further note on writable page mappings:
57 * -----------------------------------------
58 * For simplicity, the count of writable mappings for a page may not
59 * correspond to reality. The 'writable count' is incremented for every
60 * PTE which maps the page with the _PAGE_RW flag set. However, for
61 * write access to be possible the page directory entry must also have
62 * its _PAGE_RW bit set. We do not check this as it complicates the
63 * reference counting considerably [consider the case of multiple
64 * directory entries referencing a single page table, some with the RW
65 * bit set, others not -- it starts getting a bit messy].
66 * In normal use, this simplification shouldn't be a problem.
67 * However, the logic can be added if required.
68 *
69 * One more note on read-only page mappings:
70 * -----------------------------------------
71 * We want domains to be able to map pages for read-only access. The
72 * main reason is that page tables and directories should be readable
73 * by a domain, but it would not be safe for them to be writable.
74 * However, domains have free access to rings 1 & 2 of the Intel
75 * privilege model. In terms of page protection, these are considered
76 * to be part of 'supervisor mode'. The WP bit in CR0 controls whether
77 * read-only restrictions are respected in supervisor mode -- if the
78 * bit is clear then any mapped page is writable.
79 *
80 * We get round this by always setting the WP bit and disallowing
81 * updates to it. This is very unlikely to cause a problem for guest
82 * OS's, which will generally use the WP bit to simplify copy-on-write
83 * implementation (in that case, OS wants a fault when it writes to
84 * an application-supplied buffer).
85 */
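As a concrete illustration of the (ptr, val) interface and of pinning described above, a paravirtualised guest might issue requests roughly as follows. This is a hedged sketch: HYPERVISOR_mmu_update() and HYPERVISOR_mmuext_op() are the XenLinux-style guest wrappers for the hypercalls implemented below, and pte_maddr, new_pte and l1_mfn are illustrative placeholders.

mmu_update_t req;
struct mmuext_op op;

/* "*ptr = val": ptr is the machine address of the PTE to write (its low
 * bits select MMU_NORMAL_PT_UPDATE); val is the new PTE contents. */
req.ptr = pte_maddr | MMU_NORMAL_PT_UPDATE;
req.val = new_pte;
(void)HYPERVISOR_mmu_update(&req, 1, NULL, DOMID_SELF);

/* Pin a frame as an L1 page table so its type count cannot fall to zero
 * and force revalidation the next time it is used. */
op.cmd = MMUEXT_PIN_L1_TABLE;
op.mfn = l1_mfn;
(void)HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF);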
87 #include <xen/config.h>
88 #include <xen/init.h>
89 #include <xen/kernel.h>
90 #include <xen/lib.h>
91 #include <xen/mm.h>
92 #include <xen/sched.h>
93 #include <xen/errno.h>
94 #include <xen/perfc.h>
95 #include <xen/irq.h>
96 #include <xen/softirq.h>
97 #include <xen/domain_page.h>
98 #include <xen/event.h>
99 #include <asm/shadow.h>
100 #include <asm/page.h>
101 #include <asm/flushtlb.h>
102 #include <asm/io.h>
103 #include <asm/uaccess.h>
104 #include <asm/ldt.h>
105 #include <asm/x86_emulate.h>
107 #ifdef VERBOSE
108 #define MEM_LOG(_f, _a...) \
109 printk("DOM%u: (file=mm.c, line=%d) " _f "\n", \
110 current->domain->domain_id , __LINE__ , ## _a )
111 #else
112 #define MEM_LOG(_f, _a...) ((void)0)
113 #endif
115 /*
116 * Both do_mmuext_op() and do_mmu_update():
117 * We steal the m.s.b. of the @count parameter to indicate whether this
118 * invocation is resuming a previously preempted call.
119 */
120 #define MMU_UPDATE_PREEMPTED (~(~0U>>1))
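Decoding that convention at the top of a possibly-resumed hypercall looks like the snippet below; it mirrors the pattern both do_mmuext_op() and do_mmu_update() use later in this file.

if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
{
    /* Resuming: recover the true count and the work completed so far. */
    count &= ~MMU_UPDATE_PREEMPTED;
    if ( unlikely(pdone != NULL) )
        (void)get_user(done, pdone);
}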
122 static void free_l2_table(struct pfn_info *page);
123 static void free_l1_table(struct pfn_info *page);
125 static int mod_l2_entry(l2_pgentry_t *, l2_pgentry_t, unsigned long,
126 unsigned long type);
127 static int mod_l1_entry(l1_pgentry_t *, l1_pgentry_t);
129 /* Used to defer flushing of memory structures. */
130 static struct {
131 #define DOP_FLUSH_TLB (1<<0) /* Flush the TLB. */
132 #define DOP_RELOAD_LDT (1<<1) /* Reload the LDT shadow mapping. */
133 unsigned int deferred_ops;
134 /* If non-NULL, specifies a foreign subject domain for some operations. */
135 struct domain *foreign;
136 } __cacheline_aligned percpu_info[NR_CPUS];
138 /*
139 * Returns the current foreign domain; defaults to the currently-executing
140 * domain if a foreign override hasn't been specified.
141 */
142 #define FOREIGNDOM (percpu_info[smp_processor_id()].foreign ?: current->domain)
144 /* Private domain structs for DOMID_XEN and DOMID_IO. */
145 static struct domain *dom_xen, *dom_io;
147 /* Frame table and its size in pages. */
148 struct pfn_info *frame_table;
149 unsigned long max_page;
151 void __init init_frametable(void)
152 {
153 unsigned long nr_pages, page_step, i, pfn;
155 frame_table = (struct pfn_info *)FRAMETABLE_VIRT_START;
157 nr_pages = PFN_UP(max_page * sizeof(*frame_table));
158 page_step = (1 << L2_PAGETABLE_SHIFT) >> PAGE_SHIFT;
160 for ( i = 0; i < nr_pages; i += page_step )
161 {
162 pfn = alloc_boot_pages(min(nr_pages - i, page_step), page_step);
163 if ( pfn == 0 )
164 panic("Not enough memory for frame table\n");
165 map_pages_to_xen(
166 FRAMETABLE_VIRT_START + (i << PAGE_SHIFT),
167 pfn, page_step, PAGE_HYPERVISOR);
168 }
170 memset(frame_table, 0, nr_pages << PAGE_SHIFT);
171 }
173 void arch_init_memory(void)
174 {
175 extern void subarch_init_memory(struct domain *);
177 unsigned long i, pfn, rstart_pfn, rend_pfn;
178 struct pfn_info *page;
180 memset(percpu_info, 0, sizeof(percpu_info));
182 /*
183 * Initialise our DOMID_XEN domain.
184 * Any Xen-heap pages that we will allow to be mapped will have
185 * their domain field set to dom_xen.
186 */
187 dom_xen = alloc_domain_struct();
188 atomic_set(&dom_xen->refcnt, 1);
189 dom_xen->domain_id = DOMID_XEN;
191 /*
192 * Initialise our DOMID_IO domain.
193 * This domain owns I/O pages that are within the range of the pfn_info
194 * array. Mappings occur at the privilege level of the caller.
195 */
196 dom_io = alloc_domain_struct();
197 atomic_set(&dom_io->refcnt, 1);
198 dom_io->domain_id = DOMID_IO;
200 /* First 1MB of RAM is historically marked as I/O. */
201 for ( i = 0; i < 0x100; i++ )
202 {
203 page = &frame_table[i];
204 page->count_info = PGC_allocated | 1;
205 page->u.inuse.type_info = PGT_writable_page | PGT_validated | 1;
206 page_set_owner(page, dom_io);
207 }
209 /* Any areas not specified as RAM by the e820 map are considered I/O. */
210 for ( i = 0, pfn = 0; i < e820.nr_map; i++ )
211 {
212 if ( e820.map[i].type != E820_RAM )
213 continue;
214 /* Every page from cursor to start of next RAM region is I/O. */
215 rstart_pfn = PFN_UP(e820.map[i].addr);
216 rend_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
217 while ( pfn < rstart_pfn )
218 {
219 BUG_ON(!pfn_valid(pfn));
220 page = &frame_table[pfn++];
221 page->count_info = PGC_allocated | 1;
222 page->u.inuse.type_info = PGT_writable_page | PGT_validated | 1;
223 page_set_owner(page, dom_io);
224 }
225 /* Skip the RAM region. */
226 pfn = rend_pfn;
227 }
228 BUG_ON(pfn != max_page);
230 subarch_init_memory(dom_xen);
231 }
233 void write_ptbase(struct vcpu *v)
234 {
235 write_cr3(pagetable_get_paddr(v->arch.monitor_table));
236 }
238 void invalidate_shadow_ldt(struct vcpu *v)
239 {
240 int i;
241 unsigned long pfn;
242 struct pfn_info *page;
244 if ( v->arch.shadow_ldt_mapcnt == 0 )
245 return;
247 v->arch.shadow_ldt_mapcnt = 0;
249 for ( i = 16; i < 32; i++ )
250 {
251 pfn = l1e_get_pfn(v->arch.perdomain_ptes[i]);
252 if ( pfn == 0 ) continue;
253 v->arch.perdomain_ptes[i] = l1e_empty();
254 page = &frame_table[pfn];
255 ASSERT_PAGE_IS_TYPE(page, PGT_ldt_page);
256 ASSERT_PAGE_IS_DOMAIN(page, v->domain);
257 put_page_and_type(page);
258 }
260 /* Dispose of the (now possibly invalid) mappings from the TLB. */
261 percpu_info[v->processor].deferred_ops |= DOP_FLUSH_TLB | DOP_RELOAD_LDT;
262 }
265 static int alloc_segdesc_page(struct pfn_info *page)
266 {
267 struct desc_struct *descs;
268 int i;
270 descs = map_domain_page(page_to_pfn(page));
272 for ( i = 0; i < 512; i++ )
273 if ( unlikely(!check_descriptor(&descs[i])) )
274 goto fail;
276 unmap_domain_page(descs);
277 return 1;
279 fail:
280 unmap_domain_page(descs);
281 return 0;
282 }
285 /* Map shadow page at offset @off. */
286 int map_ldt_shadow_page(unsigned int off)
287 {
288 struct vcpu *v = current;
289 struct domain *d = v->domain;
290 unsigned long gpfn, gmfn;
291 l1_pgentry_t l1e, nl1e;
292 unsigned long gva = v->arch.guest_context.ldt_base + (off << PAGE_SHIFT);
293 int res;
295 #if defined(__x86_64__)
296 /* If in user mode, switch to kernel mode just to read LDT mapping. */
297 extern void toggle_guest_mode(struct vcpu *);
298 int user_mode = !(v->arch.flags & TF_kernel_mode);
299 #define TOGGLE_MODE() if ( user_mode ) toggle_guest_mode(v)
300 #elif defined(__i386__)
301 #define TOGGLE_MODE() ((void)0)
302 #endif
304 BUG_ON(unlikely(in_irq()));
306 shadow_sync_va(v, gva);
308 TOGGLE_MODE();
309 __copy_from_user(&l1e, &linear_pg_table[l1_linear_offset(gva)],
310 sizeof(l1e));
311 TOGGLE_MODE();
313 if ( unlikely(!(l1e_get_flags(l1e) & _PAGE_PRESENT)) )
314 return 0;
316 gpfn = l1e_get_pfn(l1e);
317 gmfn = __gpfn_to_mfn(d, gpfn);
318 if ( unlikely(!VALID_MFN(gmfn)) )
319 return 0;
321 res = get_page_and_type(&frame_table[gmfn], d, PGT_ldt_page);
323 if ( !res && unlikely(shadow_mode_refcounts(d)) )
324 {
325 shadow_lock(d);
326 shadow_remove_all_write_access(d, gpfn, gmfn);
327 res = get_page_and_type(&frame_table[gmfn], d, PGT_ldt_page);
328 shadow_unlock(d);
329 }
331 if ( unlikely(!res) )
332 return 0;
334 nl1e = l1e_from_pfn(gmfn, l1e_get_flags(l1e) | _PAGE_RW);
336 v->arch.perdomain_ptes[off + 16] = nl1e;
337 v->arch.shadow_ldt_mapcnt++;
339 return 1;
340 }
343 static int get_page_from_pagenr(unsigned long page_nr, struct domain *d)
344 {
345 struct pfn_info *page = &frame_table[page_nr];
347 if ( unlikely(!pfn_valid(page_nr)) || unlikely(!get_page(page, d)) )
348 {
349 MEM_LOG("Could not get page ref for pfn %lx", page_nr);
350 return 0;
351 }
353 return 1;
354 }
357 static int get_page_and_type_from_pagenr(unsigned long page_nr,
358 unsigned long type,
359 struct domain *d)
360 {
361 struct pfn_info *page = &frame_table[page_nr];
363 if ( unlikely(!get_page_from_pagenr(page_nr, d)) )
364 return 0;
366 if ( unlikely(!get_page_type(page, type)) )
367 {
368 if ( (type & PGT_type_mask) != PGT_l1_page_table )
369 MEM_LOG("Bad page type for pfn %lx (%" PRtype_info ")",
370 page_nr, page->u.inuse.type_info);
371 put_page(page);
372 return 0;
373 }
375 return 1;
376 }
378 /*
379 * We allow root tables to map each other (a.k.a. linear page tables). This
380 * needs some special care with reference counts and access permissions:
381 * 1. The mapping entry must be read-only, or the guest may get write access
382 * to its own PTEs.
383 * 2. We must only bump the reference counts for an *already validated*
384 * root table, or we can end up deadlocking in get_page_type(), waiting
385 * on a validation that itself depends on the one we are performing.
386 * 3. We only need to increment the reference counts for the mapped page
387 * frame if it is mapped by a different root table. This is sufficient and
388 * also necessary to allow validation of a root table mapping itself.
389 */
390 static int
391 get_linear_pagetable(
392 root_pgentry_t re, unsigned long re_pfn, struct domain *d)
393 {
394 unsigned long x, y;
395 struct pfn_info *page;
396 unsigned long pfn;
398 ASSERT( !shadow_mode_refcounts(d) );
400 if ( (root_get_flags(re) & _PAGE_RW) )
401 {
402 MEM_LOG("Attempt to create linear p.t. with write perms");
403 return 0;
404 }
406 if ( (pfn = root_get_pfn(re)) != re_pfn )
407 {
408 /* Make sure the mapped frame belongs to the correct domain. */
409 if ( unlikely(!get_page_from_pagenr(pfn, d)) )
410 return 0;
412 /*
413 * Make sure that the mapped frame is an already-validated L2 table.
414 * If so, atomically increment the count (checking for overflow).
415 */
416 page = &frame_table[pfn];
417 y = page->u.inuse.type_info;
418 do {
419 x = y;
420 if ( unlikely((x & PGT_count_mask) == PGT_count_mask) ||
421 unlikely((x & (PGT_type_mask|PGT_validated)) !=
422 (PGT_root_page_table|PGT_validated)) )
423 {
424 put_page(page);
425 return 0;
426 }
427 }
428 while ( (y = cmpxchg(&page->u.inuse.type_info, x, x + 1)) != x );
429 }
431 return 1;
432 }
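To make the rules in the comment above get_linear_pagetable() concrete, here is a hedged guest-side sketch of installing a linear mapping on a two-level guest: slot N of the L2 is pointed back at the L2 itself, and the entry must be read-only (rule 1). The hypercall wrapper and the names l2_mfn and N are illustrative assumptions.

mmu_update_t req;

/* Machine address of slot N within the L2 table itself. */
req.ptr = (((u64)l2_mfn << PAGE_SHIFT) + N * sizeof(l2_pgentry_t))
          | MMU_NORMAL_PT_UPDATE;
/* Point the slot back at the same frame; _PAGE_RW must NOT be set. */
req.val = ((u64)l2_mfn << PAGE_SHIFT) | _PAGE_PRESENT;
(void)HYPERVISOR_mmu_update(&req, 1, NULL, DOMID_SELF);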
434 int
435 get_page_from_l1e(
436 l1_pgentry_t l1e, struct domain *d)
437 {
438 unsigned long mfn = l1e_get_pfn(l1e);
439 struct pfn_info *page = &frame_table[mfn];
440 extern int domain_iomem_in_pfn(struct domain *d, unsigned long pfn);
442 if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) )
443 return 1;
445 if ( unlikely(l1e_get_flags(l1e) & L1_DISALLOW_MASK) )
446 {
447 MEM_LOG("Bad L1 flags %x\n", l1e_get_flags(l1e) & L1_DISALLOW_MASK);
448 return 0;
449 }
451 if ( unlikely(!pfn_valid(mfn)) ||
452 unlikely(page_get_owner(page) == dom_io) )
453 {
454 /* DOMID_IO reverts to caller for privilege checks. */
455 if ( d == dom_io )
456 d = current->domain;
458 if ( (!IS_PRIV(d)) &&
459 (!IS_CAPABLE_PHYSDEV(d) || !domain_iomem_in_pfn(d, mfn)) )
460 {
461 MEM_LOG("Non-privileged attempt to map I/O space %08lx", mfn);
462 return 0;
463 }
465 /* No reference counting for out-of-range I/O pages. */
466 if ( !pfn_valid(mfn) )
467 return 1;
469 d = dom_io;
470 }
472 return ((l1e_get_flags(l1e) & _PAGE_RW) ?
473 get_page_and_type(page, d, PGT_writable_page) :
474 get_page(page, d));
475 }
478 /* NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. */
479 static int
480 get_page_from_l2e(
481 l2_pgentry_t l2e, unsigned long pfn,
482 struct domain *d, unsigned long vaddr)
483 {
484 int rc;
486 ASSERT(!shadow_mode_refcounts(d));
488 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
489 return 1;
491 if ( unlikely((l2e_get_flags(l2e) & L2_DISALLOW_MASK)) )
492 {
493 MEM_LOG("Bad L2 flags %x\n", l2e_get_flags(l2e) & L2_DISALLOW_MASK);
494 return 0;
495 }
497 vaddr >>= L2_PAGETABLE_SHIFT;
498 vaddr <<= PGT_va_shift;
499 rc = get_page_and_type_from_pagenr(
500 l2e_get_pfn(l2e), PGT_l1_page_table | vaddr, d);
502 #if CONFIG_PAGING_LEVELS == 2
503 if (!rc)
504 rc = get_linear_pagetable(l2e, pfn, d);
505 #endif
506 return rc;
507 }
510 #if CONFIG_PAGING_LEVELS >= 3
512 static int
513 get_page_from_l3e(
514 l3_pgentry_t l3e, unsigned long pfn,
515 struct domain *d, unsigned long vaddr)
516 {
517 ASSERT( !shadow_mode_refcounts(d) );
519 int rc;
521 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
522 return 1;
524 if ( unlikely((l3e_get_flags(l3e) & L3_DISALLOW_MASK)) )
525 {
526 MEM_LOG("Bad L3 flags %x\n", l3e_get_flags(l3e) & L3_DISALLOW_MASK);
527 return 0;
528 }
530 vaddr >>= L3_PAGETABLE_SHIFT;
531 vaddr <<= PGT_va_shift;
532 rc = get_page_and_type_from_pagenr(
533 l3e_get_pfn(l3e),
534 PGT_l2_page_table | vaddr, d);
535 #if CONFIG_PAGING_LEVELS == 3
536 if (!rc)
537 rc = get_linear_pagetable(l3e, pfn, d);
538 #endif
539 return rc;
540 }
542 #endif /* 3 level */
544 #if CONFIG_PAGING_LEVELS >= 4
546 static int
547 get_page_from_l4e(
548 l4_pgentry_t l4e, unsigned long pfn,
549 struct domain *d, unsigned long vaddr)
550 {
551 int rc;
553 ASSERT( !shadow_mode_refcounts(d) );
555 if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
556 return 1;
558 if ( unlikely((l4e_get_flags(l4e) & L4_DISALLOW_MASK)) )
559 {
560 MEM_LOG("Bad L4 flags %x\n", l4e_get_flags(l4e) & L4_DISALLOW_MASK);
561 return 0;
562 }
564 vaddr >>= L4_PAGETABLE_SHIFT;
565 vaddr <<= PGT_va_shift;
566 rc = get_page_and_type_from_pagenr(
567 l4e_get_pfn(l4e),
568 PGT_l3_page_table | vaddr, d);
570 if ( unlikely(!rc) )
571 return get_linear_pagetable(l4e, pfn, d);
573 return 1;
574 }
576 #endif /* 4 level */
579 void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d)
580 {
581 unsigned long pfn = l1e_get_pfn(l1e);
582 struct pfn_info *page = &frame_table[pfn];
583 struct domain *e;
585 if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) || !pfn_valid(pfn) )
586 return;
588 e = page_get_owner(page);
589 if ( unlikely(e != d) )
590 {
591 /*
592 * Unmap a foreign page that may have been mapped via a grant table.
593 * Note that this can fail for a privileged domain that can map foreign
594 * pages via MMUEXT_SET_FOREIGNDOM. Such domains can have some mappings
595 * counted via a grant entry and some counted directly in the page
596 * structure's reference count. Note that reference counts won't get
597 * dangerously confused as long as we always try to decrement the
598 * grant entry first. We may end up with a mismatch between which
599 * mappings and which unmappings are counted via the grant entry, but
600 * really it doesn't matter as privileged domains have carte blanche.
601 */
602 if (likely(gnttab_check_unmap(e, d, pfn,
603 !(l1e_get_flags(l1e) & _PAGE_RW))))
604 return;
605 /* Assume this mapping was made via MMUEXT_SET_FOREIGNDOM... */
606 }
608 if ( l1e_get_flags(l1e) & _PAGE_RW )
609 {
610 put_page_and_type(page);
611 }
612 else
613 {
614 /* We expect this is rare so we blow the entire shadow LDT. */
615 if ( unlikely(((page->u.inuse.type_info & PGT_type_mask) ==
616 PGT_ldt_page)) &&
617 unlikely(((page->u.inuse.type_info & PGT_count_mask) != 0)) )
619 // XXX SMP BUG?
620 invalidate_shadow_ldt(e->vcpu[0]);
621 put_page(page);
622 }
623 }
626 /*
627 * NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'.
628 * Note also that this automatically deals correctly with linear p.t.'s.
629 */
630 static void put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
631 {
632 if ( (l2e_get_flags(l2e) & _PAGE_PRESENT) &&
633 (l2e_get_pfn(l2e) != pfn) )
634 put_page_and_type(&frame_table[l2e_get_pfn(l2e)]);
635 }
638 #if CONFIG_PAGING_LEVELS >= 3
640 static void put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn)
641 {
642 if ( (l3e_get_flags(l3e) & _PAGE_PRESENT) &&
643 (l3e_get_pfn(l3e) != pfn) )
644 put_page_and_type(&frame_table[l3e_get_pfn(l3e)]);
645 }
647 #endif
649 #if CONFIG_PAGING_LEVELS >= 4
651 static void put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn)
652 {
653 if ( (l4e_get_flags(l4e) & _PAGE_PRESENT) &&
654 (l4e_get_pfn(l4e) != pfn) )
655 put_page_and_type(&frame_table[l4e_get_pfn(l4e)]);
656 }
658 #endif
661 static int alloc_l1_table(struct pfn_info *page)
662 {
663 struct domain *d = page_get_owner(page);
664 unsigned long pfn = page_to_pfn(page);
665 l1_pgentry_t *pl1e;
666 int i;
668 ASSERT(!shadow_mode_refcounts(d));
670 pl1e = map_domain_page(pfn);
672 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
673 if ( is_guest_l1_slot(i) &&
674 unlikely(!get_page_from_l1e(pl1e[i], d)) )
675 goto fail;
677 unmap_domain_page(pl1e);
678 return 1;
680 fail:
681 while ( i-- > 0 )
682 if ( is_guest_l1_slot(i) )
683 put_page_from_l1e(pl1e[i], d);
685 unmap_domain_page(pl1e);
686 return 0;
687 }
689 #ifdef CONFIG_X86_PAE
690 static int create_pae_xen_mappings(l3_pgentry_t *pl3e)
691 {
692 struct pfn_info *page;
693 l2_pgentry_t *pl2e;
694 l3_pgentry_t l3e3;
695 int i;
697 pl3e = (l3_pgentry_t *)((unsigned long)pl3e & PAGE_MASK);
699 /* 3rd L3 slot contains L2 with Xen-private mappings. It *must* exist. */
700 l3e3 = pl3e[3];
701 if ( !(l3e_get_flags(l3e3) & _PAGE_PRESENT) )
702 {
703 MEM_LOG("PAE L3 3rd slot is empty");
704 return 0;
705 }
707 /*
708 * The Xen-private mappings include linear mappings. The L2 thus cannot
709 * be shared by multiple L3 tables. The test here is adequate because:
710 * 1. Cannot appear in slots != 3 because the page would then have an
711 * unknown va backpointer, which get_page_type() explicitly disallows.
712 * 2. Cannot appear in another page table's L3:
713 * a. alloc_l3_table() calls this function and this check will fail
714 * b. mod_l3_entry() disallows updates to slot 3 in an existing table
715 */
716 page = l3e_get_page(l3e3);
717 BUG_ON(page->u.inuse.type_info & PGT_pinned);
718 BUG_ON((page->u.inuse.type_info & PGT_count_mask) == 0);
719 if ( (page->u.inuse.type_info & PGT_count_mask) != 1 )
720 {
721 MEM_LOG("PAE L3 3rd slot is shared");
722 return 0;
723 }
725 /* Xen private mappings. */
726 pl2e = map_domain_page(l3e_get_pfn(l3e3));
727 memcpy(&pl2e[L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1)],
728 &idle_pg_table_l2[L2_PAGETABLE_FIRST_XEN_SLOT],
729 L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
730 for ( i = 0; i < (PERDOMAIN_MBYTES >> (L2_PAGETABLE_SHIFT - 20)); i++ )
731 pl2e[l2_table_offset(PERDOMAIN_VIRT_START) + i] =
732 l2e_from_page(
733 virt_to_page(page_get_owner(page)->arch.mm_perdomain_pt) + i,
734 __PAGE_HYPERVISOR);
735 for ( i = 0; i < (LINEARPT_MBYTES >> (L2_PAGETABLE_SHIFT - 20)); i++ )
736 pl2e[l2_table_offset(LINEAR_PT_VIRT_START) + i] =
737 (l3e_get_flags(pl3e[i]) & _PAGE_PRESENT) ?
738 l2e_from_pfn(l3e_get_pfn(pl3e[i]), __PAGE_HYPERVISOR) :
739 l2e_empty();
740 unmap_domain_page(pl2e);
742 return 1;
743 }
745 static inline int l1_backptr(
746 unsigned long *backptr, unsigned long offset_in_l2, unsigned long l2_type)
747 {
748 unsigned long l2_backptr = l2_type & PGT_va_mask;
749 BUG_ON(l2_backptr == PGT_va_unknown);
750 if ( l2_backptr == PGT_va_mutable )
751 return 0;
752 *backptr =
753 ((l2_backptr >> PGT_va_shift) << L3_PAGETABLE_SHIFT) |
754 (offset_in_l2 << L2_PAGETABLE_SHIFT);
755 return 1;
756 }
758 #elif CONFIG_X86_64
759 # define create_pae_xen_mappings(pl3e) (1)
761 static inline int l1_backptr(
762 unsigned long *backptr, unsigned long offset_in_l2, unsigned long l2_type)
763 {
764 unsigned long l2_backptr = l2_type & PGT_va_mask;
765 BUG_ON(l2_backptr == PGT_va_unknown);
767 *backptr = ((l2_backptr >> PGT_va_shift) << L3_PAGETABLE_SHIFT) |
768 (offset_in_l2 << L2_PAGETABLE_SHIFT);
769 return 1;
770 }
772 static inline int l2_backptr(
773 unsigned long *backptr, unsigned long offset_in_l3, unsigned long l3_type)
774 {
775 unsigned long l3_backptr = l3_type & PGT_va_mask;
776 BUG_ON(l3_backptr == PGT_va_unknown);
778 *backptr = ((l3_backptr >> PGT_va_shift) << L4_PAGETABLE_SHIFT) |
779 (offset_in_l3 << L3_PAGETABLE_SHIFT);
780 return 1;
781 }
783 static inline int l3_backptr(
784 unsigned long *backptr, unsigned long offset_in_l4, unsigned long l4_type)
785 {
786 unsigned long l4_backptr = l4_type & PGT_va_mask;
787 BUG_ON(l4_backptr == PGT_va_unknown);
789 *backptr = (offset_in_l4 << L4_PAGETABLE_SHIFT);
790 return 1;
791 }
792 #else
793 # define create_pae_xen_mappings(pl3e) (1)
794 # define l1_backptr(bp,l2o,l2t) \
795 ({ *(bp) = (unsigned long)(l2o) << L2_PAGETABLE_SHIFT; 1; })
796 #endif
798 static int alloc_l2_table(struct pfn_info *page, unsigned long type)
799 {
800 struct domain *d = page_get_owner(page);
801 unsigned long pfn = page_to_pfn(page);
802 unsigned long vaddr;
803 l2_pgentry_t *pl2e;
804 int i;
806 /* See the code in shadow_promote() to understand why this is here. */
807 if ( (PGT_base_page_table == PGT_l2_page_table) &&
808 unlikely(shadow_mode_refcounts(d)) )
809 return 1;
810 ASSERT(!shadow_mode_refcounts(d));
812 pl2e = map_domain_page(pfn);
814 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
815 {
816 if ( !l1_backptr(&vaddr, i, type) )
817 goto fail;
818 if ( is_guest_l2_slot(type, i) &&
819 unlikely(!get_page_from_l2e(pl2e[i], pfn, d, vaddr)) )
820 goto fail;
821 }
823 #if CONFIG_PAGING_LEVELS == 2
824 /* Xen private mappings. */
825 memcpy(&pl2e[L2_PAGETABLE_FIRST_XEN_SLOT],
826 &idle_pg_table[L2_PAGETABLE_FIRST_XEN_SLOT],
827 L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
828 pl2e[l2_table_offset(LINEAR_PT_VIRT_START)] =
829 l2e_from_pfn(pfn, __PAGE_HYPERVISOR);
830 pl2e[l2_table_offset(PERDOMAIN_VIRT_START)] =
831 l2e_from_page(
832 virt_to_page(page_get_owner(page)->arch.mm_perdomain_pt),
833 __PAGE_HYPERVISOR);
834 #endif
836 unmap_domain_page(pl2e);
837 return 1;
839 fail:
840 while ( i-- > 0 )
841 if ( is_guest_l2_slot(type, i) )
842 put_page_from_l2e(pl2e[i], pfn);
844 unmap_domain_page(pl2e);
845 return 0;
846 }
849 #if CONFIG_PAGING_LEVELS >= 3
850 static int alloc_l3_table(struct pfn_info *page, unsigned long type)
851 {
852 struct domain *d = page_get_owner(page);
853 unsigned long pfn = page_to_pfn(page);
854 unsigned long vaddr;
855 l3_pgentry_t *pl3e;
856 int i;
858 ASSERT(!shadow_mode_refcounts(d));
860 #ifdef CONFIG_X86_PAE
861 if ( pfn >= 0x100000 )
862 {
863 MEM_LOG("PAE pgd must be below 4GB (0x%lx >= 0x100000)", pfn);
864 return 0;
865 }
866 #endif
868 pl3e = map_domain_page(pfn);
869 for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
870 {
871 #if CONFIG_PAGING_LEVELS >= 4
872 if ( !l2_backptr(&vaddr, i, type) )
873 goto fail;
874 #else
875 vaddr = (unsigned long)i << L3_PAGETABLE_SHIFT;
876 #endif
877 if ( is_guest_l3_slot(i) &&
878 unlikely(!get_page_from_l3e(pl3e[i], pfn, d, vaddr)) )
879 goto fail;
880 }
882 if ( !create_pae_xen_mappings(pl3e) )
883 goto fail;
885 unmap_domain_page(pl3e);
886 return 1;
888 fail:
889 while ( i-- > 0 )
890 if ( is_guest_l3_slot(i) )
891 put_page_from_l3e(pl3e[i], pfn);
893 unmap_domain_page(pl3e);
894 return 0;
895 }
896 #else
897 #define alloc_l3_table(page, type) (0)
898 #endif
900 #if CONFIG_PAGING_LEVELS >= 4
901 static int alloc_l4_table(struct pfn_info *page, unsigned long type)
902 {
903 struct domain *d = page_get_owner(page);
904 unsigned long pfn = page_to_pfn(page);
905 l4_pgentry_t *pl4e = page_to_virt(page);
906 unsigned long vaddr;
907 int i;
909 /* See the code in shadow_promote() to understand why this is here. */
910 if ( (PGT_base_page_table == PGT_l4_page_table) &&
911 shadow_mode_refcounts(d) )
912 return 1;
913 ASSERT(!shadow_mode_refcounts(d));
915 for ( i = 0; i < L4_PAGETABLE_ENTRIES; i++ )
916 {
917 if ( !l3_backptr(&vaddr, i, type) )
918 goto fail;
920 if ( is_guest_l4_slot(i) &&
921 unlikely(!get_page_from_l4e(pl4e[i], pfn, d, vaddr)) )
922 goto fail;
923 }
925 /* Xen private mappings. */
926 memcpy(&pl4e[ROOT_PAGETABLE_FIRST_XEN_SLOT],
927 &idle_pg_table[ROOT_PAGETABLE_FIRST_XEN_SLOT],
928 ROOT_PAGETABLE_XEN_SLOTS * sizeof(l4_pgentry_t));
929 pl4e[l4_table_offset(LINEAR_PT_VIRT_START)] =
930 l4e_from_pfn(pfn, __PAGE_HYPERVISOR);
931 pl4e[l4_table_offset(PERDOMAIN_VIRT_START)] =
932 l4e_from_page(
933 virt_to_page(page_get_owner(page)->arch.mm_perdomain_l3),
934 __PAGE_HYPERVISOR);
936 return 1;
938 fail:
939 while ( i-- > 0 )
940 if ( is_guest_l4_slot(i) )
941 put_page_from_l4e(pl4e[i], pfn);
943 return 0;
944 }
945 #else
946 #define alloc_l4_table(page, type) (0)
947 #endif
950 static void free_l1_table(struct pfn_info *page)
951 {
952 struct domain *d = page_get_owner(page);
953 unsigned long pfn = page_to_pfn(page);
954 l1_pgentry_t *pl1e;
955 int i;
957 pl1e = map_domain_page(pfn);
959 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
960 if ( is_guest_l1_slot(i) )
961 put_page_from_l1e(pl1e[i], d);
963 unmap_domain_page(pl1e);
964 }
967 static void free_l2_table(struct pfn_info *page)
968 {
969 unsigned long pfn = page_to_pfn(page);
970 l2_pgentry_t *pl2e;
971 int i;
973 pl2e = map_domain_page(pfn);
975 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
976 if ( is_guest_l2_slot(page->u.inuse.type_info, i) )
977 put_page_from_l2e(pl2e[i], pfn);
979 unmap_domain_page(pl2e);
980 }
983 #if CONFIG_PAGING_LEVELS >= 3
985 static void free_l3_table(struct pfn_info *page)
986 {
987 unsigned long pfn = page_to_pfn(page);
988 l3_pgentry_t *pl3e;
989 int i;
991 pl3e = map_domain_page(pfn);
993 for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
994 if ( is_guest_l3_slot(i) )
995 put_page_from_l3e(pl3e[i], pfn);
997 unmap_domain_page(pl3e);
998 }
1000 #endif
1002 #if CONFIG_PAGING_LEVELS >= 4
1004 static void free_l4_table(struct pfn_info *page)
1006 unsigned long pfn = page_to_pfn(page);
1007 l4_pgentry_t *pl4e = page_to_virt(page);
1008 int i;
1010 for ( i = 0; i < L4_PAGETABLE_ENTRIES; i++ )
1011 if ( is_guest_l4_slot(i) )
1012 put_page_from_l4e(pl4e[i], pfn);
1015 #endif
1017 static inline int update_l1e(l1_pgentry_t *pl1e,
1018 l1_pgentry_t ol1e,
1019 l1_pgentry_t nl1e)
1021 intpte_t o = l1e_get_intpte(ol1e);
1022 intpte_t n = l1e_get_intpte(nl1e);
1024 if ( unlikely(cmpxchg_user(pl1e, o, n) != 0) ||
1025 unlikely(o != l1e_get_intpte(ol1e)) )
1027 MEM_LOG("Failed to update %" PRIpte " -> %" PRIpte
1028 ": saw %" PRIpte "\n",
1029 l1e_get_intpte(ol1e),
1030 l1e_get_intpte(nl1e),
1031 o);
1032 return 0;
1034 return 1;
1038 /* Update the L1 entry at pl1e to new value nl1e. */
1039 static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e)
1041 l1_pgentry_t ol1e;
1042 struct domain *d = current->domain;
1044 if ( unlikely(__copy_from_user(&ol1e, pl1e, sizeof(ol1e)) != 0) )
1045 return 0;
1047 if ( unlikely(shadow_mode_refcounts(d)) )
1048 return update_l1e(pl1e, ol1e, nl1e);
1050 if ( l1e_get_flags(nl1e) & _PAGE_PRESENT )
1052 if ( unlikely(l1e_get_flags(nl1e) & L1_DISALLOW_MASK) )
1054 MEM_LOG("Bad L1 flags %x\n",
1055 l1e_get_flags(nl1e) & L1_DISALLOW_MASK);
1056 return 0;
1059 /* Fast path for identical mapping, r/w and presence. */
1060 if ( !l1e_has_changed(ol1e, nl1e, _PAGE_RW | _PAGE_PRESENT))
1061 return update_l1e(pl1e, ol1e, nl1e);
1063 if ( unlikely(!get_page_from_l1e(nl1e, FOREIGNDOM)) )
1064 return 0;
1066 if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
1068 put_page_from_l1e(nl1e, d);
1069 return 0;
1072 else
1074 if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
1075 return 0;
1078 put_page_from_l1e(ol1e, d);
1079 return 1;
1082 #define UPDATE_ENTRY(_t,_p,_o,_n) ({ \
1083 intpte_t __o = cmpxchg((intpte_t *)(_p), \
1084 _t ## e_get_intpte(_o), \
1085 _t ## e_get_intpte(_n)); \
1086 if ( __o != _t ## e_get_intpte(_o) ) \
1087 MEM_LOG("Failed to update %" PRIpte " -> %" PRIpte \
1088 ": saw %" PRIpte "", \
1089 (_t ## e_get_intpte(_o)), \
1090 (_t ## e_get_intpte(_n)), \
1091 (__o)); \
1092 (__o == _t ## e_get_intpte(_o)); })
1094 /* Update the L2 entry at pl2e to new value nl2e. pl2e is within frame pfn. */
1095 static int mod_l2_entry(l2_pgentry_t *pl2e,
1096 l2_pgentry_t nl2e,
1097 unsigned long pfn,
1098 unsigned long type)
1100 l2_pgentry_t ol2e;
1101 unsigned long vaddr = 0;
1103 if ( unlikely(!is_guest_l2_slot(type,pgentry_ptr_to_slot(pl2e))) )
1105 MEM_LOG("Illegal L2 update attempt in Xen-private area %p", pl2e);
1106 return 0;
1109 if ( unlikely(__copy_from_user(&ol2e, pl2e, sizeof(ol2e)) != 0) )
1110 return 0;
1112 if ( l2e_get_flags(nl2e) & _PAGE_PRESENT )
1114 if ( unlikely(l2e_get_flags(nl2e) & L2_DISALLOW_MASK) )
1116 MEM_LOG("Bad L2 flags %x\n",
1117 l2e_get_flags(nl2e) & L2_DISALLOW_MASK);
1118 return 0;
1121 /* Fast path for identical mapping and presence. */
1122 if ( !l2e_has_changed(ol2e, nl2e, _PAGE_PRESENT))
1123 return UPDATE_ENTRY(l2, pl2e, ol2e, nl2e);
1125 if ( unlikely(!l1_backptr(&vaddr, pgentry_ptr_to_slot(pl2e), type)) ||
1126 unlikely(!get_page_from_l2e(nl2e, pfn, current->domain, vaddr)) )
1127 return 0;
1129 if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e)) )
1131 put_page_from_l2e(nl2e, pfn);
1132 return 0;
1135 else if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e)) )
1137 return 0;
1140 put_page_from_l2e(ol2e, pfn);
1141 return 1;
1145 #if CONFIG_PAGING_LEVELS >= 3
1147 /* Update the L3 entry at pl3e to new value nl3e. pl3e is within frame pfn. */
1148 static int mod_l3_entry(l3_pgentry_t *pl3e,
1149 l3_pgentry_t nl3e,
1150 unsigned long pfn,
1151 unsigned long type)
1153 l3_pgentry_t ol3e;
1154 unsigned long vaddr;
1156 if ( unlikely(!is_guest_l3_slot(pgentry_ptr_to_slot(pl3e))) )
1158 MEM_LOG("Illegal L3 update attempt in Xen-private area %p", pl3e);
1159 return 0;
1162 #ifdef CONFIG_X86_PAE
1163 /*
1164 * Disallow updates to final L3 slot. It contains Xen mappings, and it
1165 * would be a pain to ensure they remain continuously valid throughout.
1166 */
1167 if ( pgentry_ptr_to_slot(pl3e) >= 3 )
1168 return 0;
1169 #endif
1171 if ( unlikely(__copy_from_user(&ol3e, pl3e, sizeof(ol3e)) != 0) )
1172 return 0;
1174 if ( l3e_get_flags(nl3e) & _PAGE_PRESENT )
1176 if ( unlikely(l3e_get_flags(nl3e) & L3_DISALLOW_MASK) )
1178 MEM_LOG("Bad L3 flags %x\n",
1179 l3e_get_flags(nl3e) & L3_DISALLOW_MASK);
1180 return 0;
1183 /* Fast path for identical mapping and presence. */
1184 if (!l3e_has_changed(ol3e, nl3e, _PAGE_PRESENT))
1185 return UPDATE_ENTRY(l3, pl3e, ol3e, nl3e);
1187 #if CONFIG_PAGING_LEVELS >= 4
1188 if ( unlikely(!l2_backptr(&vaddr, pgentry_ptr_to_slot(pl3e), type)) ||
1189 unlikely(!get_page_from_l3e(nl3e, pfn, current->domain, vaddr)) )
1190 return 0;
1191 #else
1192 vaddr = (((unsigned long)pl3e & ~PAGE_MASK) / sizeof(l3_pgentry_t))
1193 << L3_PAGETABLE_SHIFT;
1194 if ( unlikely(!get_page_from_l3e(nl3e, pfn, current->domain, vaddr)) )
1195 return 0;
1196 #endif
1198 if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e)) )
1200 put_page_from_l3e(nl3e, pfn);
1201 return 0;
1204 else if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e)) )
1206 return 0;
1209 BUG_ON(!create_pae_xen_mappings(pl3e));
1210 put_page_from_l3e(ol3e, pfn);
1211 return 1;
1214 #endif
1216 #if CONFIG_PAGING_LEVELS >= 4
1218 /* Update the L4 entry at pl4e to new value nl4e. pl4e is within frame pfn. */
1219 static int mod_l4_entry(l4_pgentry_t *pl4e,
1220 l4_pgentry_t nl4e,
1221 unsigned long pfn,
1222 unsigned long type)
1224 l4_pgentry_t ol4e;
1225 unsigned long vaddr;
1227 if ( unlikely(!is_guest_l4_slot(pgentry_ptr_to_slot(pl4e))) )
1229 MEM_LOG("Illegal L4 update attempt in Xen-private area %p", pl4e);
1230 return 0;
1233 if ( unlikely(__copy_from_user(&ol4e, pl4e, sizeof(ol4e)) != 0) )
1234 return 0;
1236 if ( l4e_get_flags(nl4e) & _PAGE_PRESENT )
1238 if ( unlikely(l4e_get_flags(nl4e) & L4_DISALLOW_MASK) )
1240 MEM_LOG("Bad L4 flags %x\n",
1241 l4e_get_flags(nl4e) & L4_DISALLOW_MASK);
1242 return 0;
1245 /* Fast path for identical mapping and presence. */
1246 if (!l4e_has_changed(ol4e, nl4e, _PAGE_PRESENT))
1247 return UPDATE_ENTRY(l4, pl4e, ol4e, nl4e);
1249 if ( unlikely(!l3_backptr(&vaddr, pgentry_ptr_to_slot(pl4e), type)) ||
1250 unlikely(!get_page_from_l4e(nl4e, pfn, current->domain, vaddr)) )
1251 return 0;
1253 if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e)) )
1255 put_page_from_l4e(nl4e, pfn);
1256 return 0;
1259 else if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e)) )
1261 return 0;
1264 put_page_from_l4e(ol4e, pfn);
1265 return 1;
1268 #endif
1270 int alloc_page_type(struct pfn_info *page, unsigned long type)
1272 switch ( type & PGT_type_mask )
1274 case PGT_l1_page_table:
1275 return alloc_l1_table(page);
1276 case PGT_l2_page_table:
1277 return alloc_l2_table(page, type);
1278 case PGT_l3_page_table:
1279 return alloc_l3_table(page, type);
1280 case PGT_l4_page_table:
1281 return alloc_l4_table(page, type);
1282 case PGT_gdt_page:
1283 case PGT_ldt_page:
1284 return alloc_segdesc_page(page);
1285 default:
1286 printk("Bad type in alloc_page_type %lx t=%" PRtype_info " c=%x\n",
1287 type, page->u.inuse.type_info,
1288 page->count_info);
1289 BUG();
1292 return 0;
1296 void free_page_type(struct pfn_info *page, unsigned long type)
1298 struct domain *owner = page_get_owner(page);
1299 unsigned long gpfn;
1301 if ( owner != NULL )
1303 if ( unlikely(shadow_mode_refcounts(owner)) )
1304 return;
1305 if ( unlikely(shadow_mode_enabled(owner)) )
1307 gpfn = __mfn_to_gpfn(owner, page_to_pfn(page));
1308 ASSERT(VALID_M2P(gpfn));
1309 remove_shadow(owner, gpfn, type & PGT_type_mask);
1313 switch (type & PGT_type_mask)
1315 case PGT_l1_page_table:
1316 free_l1_table(page);
1317 break;
1319 case PGT_l2_page_table:
1320 free_l2_table(page);
1321 break;
1323 #if CONFIG_PAGING_LEVELS >= 3
1324 case PGT_l3_page_table:
1325 free_l3_table(page);
1326 break;
1327 #endif
1329 #if CONFIG_PAGING_LEVELS >= 4
1330 case PGT_l4_page_table:
1331 free_l4_table(page);
1332 break;
1333 #endif
1335 default:
1336 printk("%s: type %lx pfn %lx\n",__FUNCTION__,
1337 type, page_to_pfn(page));
1338 BUG();
1343 void put_page_type(struct pfn_info *page)
1345 unsigned long nx, x, y = page->u.inuse.type_info;
1347 again:
1348 do {
1349 x = y;
1350 nx = x - 1;
1352 ASSERT((x & PGT_count_mask) != 0);
1354 /*
1355 * The page should always be validated while a reference is held. The
1356 * exception is during domain destruction, when we forcibly invalidate
1357 * page-table pages if we detect a referential loop.
1358 * See domain.c:relinquish_list().
1359 */
1360 ASSERT((x & PGT_validated) ||
1361 test_bit(_DOMF_dying, &page_get_owner(page)->domain_flags));
1363 if ( unlikely((nx & PGT_count_mask) == 0) )
1365 /* Record TLB information for flush later. Races are harmless. */
1366 page->tlbflush_timestamp = tlbflush_current_time();
1368 if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
1369 likely(nx & PGT_validated) )
1371 /*
1372 * Page-table pages must be unvalidated when count is zero. The
1373 * 'free' is safe because the refcnt is non-zero and validated
1374 * bit is clear => other ops will spin or fail.
1375 */
1376 if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x,
1377 x & ~PGT_validated)) != x) )
1378 goto again;
1379 /* We cleared the 'valid bit' so we do the clean up. */
1380 free_page_type(page, x);
1381 /* Carry on, but with the 'valid bit' now clear. */
1382 x &= ~PGT_validated;
1383 nx &= ~PGT_validated;
1386 else if ( unlikely(((nx & (PGT_pinned | PGT_count_mask)) ==
1387 (PGT_pinned | 1)) &&
1388 ((nx & PGT_type_mask) != PGT_writable_page)) )
1390 /* Page is now only pinned. Make the back pointer mutable again. */
1391 nx |= PGT_va_mutable;
1394 while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
1398 int get_page_type(struct pfn_info *page, unsigned long type)
1400 unsigned long nx, x, y = page->u.inuse.type_info;
1402 again:
1403 do {
1404 x = y;
1405 nx = x + 1;
1406 if ( unlikely((nx & PGT_count_mask) == 0) )
1408 MEM_LOG("Type count overflow on pfn %lx", page_to_pfn(page));
1409 return 0;
1411 else if ( unlikely((x & PGT_count_mask) == 0) )
1413 if ( (x & (PGT_type_mask|PGT_va_mask)) != type )
1415 if ( (x & PGT_type_mask) != (type & PGT_type_mask) )
1417 /*
1418 * On a type change we check whether stale TLB
1419 * entries need to be flushed. This may be unnecessary
1420 * (e.g., the page was a GDT/LDT) but such circumstances
1421 * should be very rare.
1422 */
1423 cpumask_t mask = page_get_owner(page)->cpumask;
1424 tlbflush_filter(mask, page->tlbflush_timestamp);
1426 if ( unlikely(!cpus_empty(mask)) )
1428 perfc_incrc(need_flush_tlb_flush);
1429 flush_tlb_mask(mask);
1433 /* We lose existing type, back pointer, and validity. */
1434 nx &= ~(PGT_type_mask | PGT_va_mask | PGT_validated);
1435 nx |= type;
1437 /* No special validation needed for writable pages. */
1438 /* Page tables and GDT/LDT need to be scanned for validity. */
1439 if ( type == PGT_writable_page )
1440 nx |= PGT_validated;
1443 else
1445 if ( unlikely((x & (PGT_type_mask|PGT_va_mask)) != type) )
1447 if ( unlikely((x & PGT_type_mask) != (type & PGT_type_mask) ) )
1449 if ( ((x & PGT_type_mask) != PGT_l2_page_table) ||
1450 ((type & PGT_type_mask) != PGT_l1_page_table) )
1451 MEM_LOG("Bad type (saw %" PRtype_info
1452 "!= exp %" PRtype_info ") for pfn %lx",
1453 x, type, page_to_pfn(page));
1454 return 0;
1456 else if ( (x & PGT_va_mask) == PGT_va_mutable )
1458 /* The va backpointer is mutable, hence we update it. */
1459 nx &= ~PGT_va_mask;
1460 nx |= type; /* we know the actual type is correct */
1462 else if ( ((type & PGT_va_mask) != PGT_va_mutable) &&
1463 ((type & PGT_va_mask) != (x & PGT_va_mask)) )
1465 #ifdef CONFIG_X86_PAE
1466 /* We use backptr as extra typing. Cannot be unknown. */
1467 if ( (type & PGT_type_mask) == PGT_l2_page_table )
1468 return 0;
1469 #endif
1470 /* This table is possibly mapped at multiple locations. */
1471 nx &= ~PGT_va_mask;
1472 nx |= PGT_va_unknown;
1475 if ( unlikely(!(x & PGT_validated)) )
1477 /* Someone else is updating validation of this page. Wait... */
1478 while ( (y = page->u.inuse.type_info) == x )
1479 cpu_relax();
1480 goto again;
1484 while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
1486 if ( unlikely(!(nx & PGT_validated)) )
1488 /* Try to validate page type; drop the new reference on failure. */
1489 if ( unlikely(!alloc_page_type(page, type)) )
1491 MEM_LOG("Error while validating pfn %lx for type %" PRtype_info "."
1492 " caf=%08x taf=%" PRtype_info,
1493 page_to_pfn(page), type,
1494 page->count_info,
1495 page->u.inuse.type_info);
1496 /* No one else can get a reference. We hold the only ref. */
1497 page->u.inuse.type_info = 0;
1498 return 0;
1501 /* No one else is updating simultaneously. */
1502 __set_bit(_PGT_validated, &page->u.inuse.type_info);
1505 return 1;
1509 int new_guest_cr3(unsigned long mfn)
1511 struct vcpu *v = current;
1512 struct domain *d = v->domain;
1513 int okay;
1514 unsigned long old_base_mfn;
1516 if ( shadow_mode_refcounts(d) )
1517 okay = get_page_from_pagenr(mfn, d);
1518 else
1519 okay = get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d);
1521 if ( likely(okay) )
1523 invalidate_shadow_ldt(v);
1525 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
1526 v->arch.guest_table = mk_pagetable(mfn << PAGE_SHIFT);
1527 update_pagetables(v); /* update shadow_table and monitor_table */
1529 write_ptbase(v);
1531 if ( shadow_mode_refcounts(d) )
1532 put_page(&frame_table[old_base_mfn]);
1533 else
1534 put_page_and_type(&frame_table[old_base_mfn]);
1536 /* CR3 also holds a ref to its shadow... */
1537 if ( shadow_mode_enabled(d) )
1539 if ( v->arch.monitor_shadow_ref )
1540 put_shadow_ref(v->arch.monitor_shadow_ref);
1541 v->arch.monitor_shadow_ref =
1542 pagetable_get_pfn(v->arch.monitor_table);
1543 ASSERT(!page_get_owner(&frame_table[v->arch.monitor_shadow_ref]));
1544 get_shadow_ref(v->arch.monitor_shadow_ref);
1547 else
1549 MEM_LOG("Error while installing new baseptr %lx", mfn);
1552 return okay;
1555 static void process_deferred_ops(unsigned int cpu)
1557 unsigned int deferred_ops;
1558 struct domain *d = current->domain;
1560 deferred_ops = percpu_info[cpu].deferred_ops;
1561 percpu_info[cpu].deferred_ops = 0;
1563 if ( deferred_ops & DOP_FLUSH_TLB )
1565 if ( shadow_mode_enabled(d) )
1566 shadow_sync_all(d);
1567 local_flush_tlb();
1570 if ( deferred_ops & DOP_RELOAD_LDT )
1571 (void)map_ldt_shadow_page(0);
1573 if ( unlikely(percpu_info[cpu].foreign != NULL) )
1575 put_domain(percpu_info[cpu].foreign);
1576 percpu_info[cpu].foreign = NULL;
1580 static int set_foreigndom(unsigned int cpu, domid_t domid)
1582 struct domain *e, *d = current->domain;
1583 int okay = 1;
1585 if ( (e = percpu_info[cpu].foreign) != NULL )
1586 put_domain(e);
1587 percpu_info[cpu].foreign = NULL;
1589 if ( domid == DOMID_SELF )
1590 goto out;
1592 if ( !IS_PRIV(d) )
1594 switch ( domid )
1596 case DOMID_IO:
1597 get_knownalive_domain(dom_io);
1598 percpu_info[cpu].foreign = dom_io;
1599 break;
1600 default:
1601 MEM_LOG("Dom %u cannot set foreign dom\n", d->domain_id);
1602 okay = 0;
1603 break;
1606 else
1608 percpu_info[cpu].foreign = e = find_domain_by_id(domid);
1609 if ( e == NULL )
1611 switch ( domid )
1613 case DOMID_XEN:
1614 get_knownalive_domain(dom_xen);
1615 percpu_info[cpu].foreign = dom_xen;
1616 break;
1617 case DOMID_IO:
1618 get_knownalive_domain(dom_io);
1619 percpu_info[cpu].foreign = dom_io;
1620 break;
1621 default:
1622 MEM_LOG("Unknown domain '%u'", domid);
1623 okay = 0;
1624 break;
1629 out:
1630 return okay;
1633 static inline cpumask_t vcpumask_to_pcpumask(
1634 struct domain *d, unsigned long vmask)
1636 unsigned int vcpu_id;
1637 cpumask_t pmask;
1638 struct vcpu *v;
1640 while ( vmask != 0 )
1642 vcpu_id = find_first_set_bit(vmask);
1643 vmask &= ~(1UL << vcpu_id);
1644 if ( (vcpu_id < MAX_VIRT_CPUS) &&
1645 ((v = d->vcpu[vcpu_id]) != NULL) )
1646 cpu_set(v->processor, pmask);
1649 return pmask;
1652 int do_mmuext_op(
1653 struct mmuext_op *uops,
1654 unsigned int count,
1655 unsigned int *pdone,
1656 unsigned int foreigndom)
1658 struct mmuext_op op;
1659 int rc = 0, i = 0, okay, cpu = smp_processor_id();
1660 unsigned long type, done = 0;
1661 struct pfn_info *page;
1662 struct vcpu *v = current;
1663 struct domain *d = v->domain, *e;
1664 u32 x, y, _d, _nd;
1666 LOCK_BIGLOCK(d);
1668 cleanup_writable_pagetable(d);
1670 if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
1672 count &= ~MMU_UPDATE_PREEMPTED;
1673 if ( unlikely(pdone != NULL) )
1674 (void)get_user(done, pdone);
1677 if ( !set_foreigndom(cpu, foreigndom) )
1679 rc = -EINVAL;
1680 goto out;
1683 if ( unlikely(!array_access_ok(uops, count, sizeof(op))) )
1685 rc = -EFAULT;
1686 goto out;
1689 for ( i = 0; i < count; i++ )
1691 if ( hypercall_preempt_check() )
1693 rc = hypercall4_create_continuation(
1694 __HYPERVISOR_mmuext_op, uops,
1695 (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
1696 break;
1699 if ( unlikely(__copy_from_user(&op, uops, sizeof(op)) != 0) )
1701 MEM_LOG("Bad __copy_from_user");
1702 rc = -EFAULT;
1703 break;
1706 okay = 1;
1707 page = &frame_table[op.mfn];
1709 switch ( op.cmd )
1711 case MMUEXT_PIN_L1_TABLE:
1712 type = PGT_l1_page_table | PGT_va_mutable;
1714 pin_page:
1715 if ( shadow_mode_refcounts(FOREIGNDOM) )
1716 type = PGT_writable_page;
1718 okay = get_page_and_type_from_pagenr(op.mfn, type, FOREIGNDOM);
1719 if ( unlikely(!okay) )
1721 MEM_LOG("Error while pinning mfn %lx", op.mfn);
1722 break;
1725 if ( unlikely(test_and_set_bit(_PGT_pinned,
1726 &page->u.inuse.type_info)) )
1728 MEM_LOG("Mfn %lx already pinned", op.mfn);
1729 put_page_and_type(page);
1730 okay = 0;
1731 break;
1734 break;
1736 #ifndef CONFIG_X86_PAE /* Unsafe on PAE because of Xen-private mappings. */
1737 case MMUEXT_PIN_L2_TABLE:
1738 type = PGT_l2_page_table | PGT_va_mutable;
1739 goto pin_page;
1740 #endif
1742 case MMUEXT_PIN_L3_TABLE:
1743 type = PGT_l3_page_table | PGT_va_mutable;
1744 goto pin_page;
1746 case MMUEXT_PIN_L4_TABLE:
1747 type = PGT_l4_page_table | PGT_va_mutable;
1748 goto pin_page;
1750 case MMUEXT_UNPIN_TABLE:
1751 if ( unlikely(!(okay = get_page_from_pagenr(op.mfn, FOREIGNDOM))) )
1753 MEM_LOG("Mfn %lx bad domain (dom=%p)",
1754 op.mfn, page_get_owner(page));
1756 else if ( likely(test_and_clear_bit(_PGT_pinned,
1757 &page->u.inuse.type_info)) )
1759 put_page_and_type(page);
1760 put_page(page);
1762 else
1764 okay = 0;
1765 put_page(page);
1766 MEM_LOG("Mfn %lx not pinned", op.mfn);
1768 break;
1770 case MMUEXT_NEW_BASEPTR:
1771 okay = new_guest_cr3(op.mfn);
1772 percpu_info[cpu].deferred_ops &= ~DOP_FLUSH_TLB;
1773 break;
1775 #ifdef __x86_64__
1776 case MMUEXT_NEW_USER_BASEPTR:
1777 okay = get_page_and_type_from_pagenr(
1778 op.mfn, PGT_root_page_table, d);
1779 if ( unlikely(!okay) )
1781 MEM_LOG("Error while installing new mfn %lx", op.mfn);
1783 else
1785 unsigned long old_mfn =
1786 pagetable_get_pfn(v->arch.guest_table_user);
1787 v->arch.guest_table_user = mk_pagetable(op.mfn << PAGE_SHIFT);
1788 if ( old_mfn != 0 )
1789 put_page_and_type(&frame_table[old_mfn]);
1791 break;
1792 #endif
1794 case MMUEXT_TLB_FLUSH_LOCAL:
1795 percpu_info[cpu].deferred_ops |= DOP_FLUSH_TLB;
1796 break;
1798 case MMUEXT_INVLPG_LOCAL:
1799 if ( shadow_mode_enabled(d) )
1800 shadow_invlpg(v, op.linear_addr);
1801 local_flush_tlb_one(op.linear_addr);
1802 break;
1804 case MMUEXT_TLB_FLUSH_MULTI:
1805 case MMUEXT_INVLPG_MULTI:
1807 unsigned long vmask;
1808 cpumask_t pmask;
1809 if ( unlikely(get_user(vmask, (unsigned long *)op.vcpumask)) )
1811 okay = 0;
1812 break;
1814 pmask = vcpumask_to_pcpumask(d, vmask);
1815 cpus_and(pmask, pmask, d->cpumask);
1816 if ( op.cmd == MMUEXT_TLB_FLUSH_MULTI )
1817 flush_tlb_mask(pmask);
1818 else
1819 flush_tlb_one_mask(pmask, op.linear_addr);
1820 break;
1823 case MMUEXT_TLB_FLUSH_ALL:
1824 flush_tlb_mask(d->cpumask);
1825 break;
1827 case MMUEXT_INVLPG_ALL:
1828 flush_tlb_one_mask(d->cpumask, op.linear_addr);
1829 break;
1831 case MMUEXT_FLUSH_CACHE:
1832 if ( unlikely(!IS_CAPABLE_PHYSDEV(d)) )
1834 MEM_LOG("Non-physdev domain tried to FLUSH_CACHE.\n");
1835 okay = 0;
1837 else
1839 wbinvd();
1841 break;
1843 case MMUEXT_SET_LDT:
1845 if ( shadow_mode_external(d) )
1847 MEM_LOG("ignoring SET_LDT hypercall from external "
1848 "domain %u\n", d->domain_id);
1849 okay = 0;
1850 break;
1853 unsigned long ptr = op.linear_addr;
1854 unsigned long ents = op.nr_ents;
1855 if ( ((ptr & (PAGE_SIZE-1)) != 0) ||
1856 (ents > 8192) ||
1857 !array_access_ok(ptr, ents, LDT_ENTRY_SIZE) )
1859 okay = 0;
1860 MEM_LOG("Bad args to SET_LDT: ptr=%lx, ents=%lx", ptr, ents);
1862 else if ( (v->arch.guest_context.ldt_ents != ents) ||
1863 (v->arch.guest_context.ldt_base != ptr) )
1865 invalidate_shadow_ldt(v);
1866 v->arch.guest_context.ldt_base = ptr;
1867 v->arch.guest_context.ldt_ents = ents;
1868 load_LDT(v);
1869 percpu_info[cpu].deferred_ops &= ~DOP_RELOAD_LDT;
1870 if ( ents != 0 )
1871 percpu_info[cpu].deferred_ops |= DOP_RELOAD_LDT;
1873 break;
1876 case MMUEXT_REASSIGN_PAGE:
1877 if ( unlikely(!IS_PRIV(d)) )
1879 MEM_LOG("Dom %u has no reassignment priv", d->domain_id);
1880 okay = 0;
1881 break;
1884 e = percpu_info[cpu].foreign;
1885 if ( unlikely(e == NULL) )
1887 MEM_LOG("No FOREIGNDOM to reassign mfn %lx to", op.mfn);
1888 okay = 0;
1889 break;
1892 /*
1893 * Grab both page_list locks, in order. This prevents the page from
1894 * disappearing elsewhere while we modify the owner, and we'll need
1895 * both locks if we're successful so that we can change lists.
1896 */
1897 if ( d < e )
1899 spin_lock(&d->page_alloc_lock);
1900 spin_lock(&e->page_alloc_lock);
1902 else
1904 spin_lock(&e->page_alloc_lock);
1905 spin_lock(&d->page_alloc_lock);
1908 /*
1909 * Check that 'e' will accept the page and has reservation
1910 * headroom. Also, a domain mustn't have PGC_allocated pages when
1911 * it is dying.
1912 */
1913 ASSERT(e->tot_pages <= e->max_pages);
1914 if ( unlikely(test_bit(_DOMF_dying, &e->domain_flags)) ||
1915 unlikely(e->tot_pages == e->max_pages) ||
1916 unlikely(IS_XEN_HEAP_FRAME(page)) )
1918 MEM_LOG("Transferee has no reservation headroom (%d,%d), or "
1919 "page is in Xen heap (%lx), or dom is dying (%ld).\n",
1920 e->tot_pages, e->max_pages, op.mfn, e->domain_flags);
1921 okay = 0;
1922 goto reassign_fail;
1925 /*
1926 * The tricky bit: atomically change owner while there is just one
1927 * benign reference to the page (PGC_allocated). If that reference
1928 * disappears then the deallocation routine will safely spin.
1929 */
1930 _d = pickle_domptr(d);
1931 _nd = page->u.inuse._domain;
1932 y = page->count_info;
1933 do {
1934 x = y;
1935 if ( unlikely((x & (PGC_count_mask|PGC_allocated)) !=
1936 (1|PGC_allocated)) ||
1937 unlikely(_nd != _d) )
1939 MEM_LOG("Bad page values %lx: ed=%p(%u), sd=%p,"
1940 " caf=%08x, taf=%" PRtype_info "\n",
1941 page_to_pfn(page), d, d->domain_id,
1942 unpickle_domptr(_nd), x, page->u.inuse.type_info);
1943 okay = 0;
1944 goto reassign_fail;
1946 __asm__ __volatile__(
1947 LOCK_PREFIX "cmpxchg8b %3"
1948 : "=d" (_nd), "=a" (y), "=c" (e),
1949 "=m" (*(volatile u64 *)(&page->count_info))
1950 : "0" (_d), "1" (x), "c" (e), "b" (x) );
1952 while ( unlikely(_nd != _d) || unlikely(y != x) );
1954 /*
1955 * Unlink from 'd'. We transferred at least one reference to 'e',
1956 * so no one else is spinning to try to delete this page from 'd'.
1957 */
1958 d->tot_pages--;
1959 list_del(&page->list);
1961 /*
1962 * Add the page to 'e'. Someone may already have removed the last
1963 * reference and want to remove the page from 'e'. However, we have
1964 * the lock so they'll spin waiting for us.
1965 */
1966 if ( unlikely(e->tot_pages++ == 0) )
1967 get_knownalive_domain(e);
1968 list_add_tail(&page->list, &e->page_list);
1970 reassign_fail:
1971 spin_unlock(&d->page_alloc_lock);
1972 spin_unlock(&e->page_alloc_lock);
1973 break;
1975 default:
1976 MEM_LOG("Invalid extended pt command 0x%x", op.cmd);
1977 okay = 0;
1978 break;
1981 if ( unlikely(!okay) )
1983 rc = -EINVAL;
1984 break;
1987 uops++;
1990 out:
1991 process_deferred_ops(cpu);
1993 /* Add incremental work we have done to the @done output parameter. */
1994 if ( unlikely(pdone != NULL) )
1995 __put_user(done + i, pdone);
1997 UNLOCK_BIGLOCK(d);
1998 return rc;
2001 int do_mmu_update(
2002 mmu_update_t *ureqs,
2003 unsigned int count,
2004 unsigned int *pdone,
2005 unsigned int foreigndom)
2007 mmu_update_t req;
2008 void *va;
2009 unsigned long gpfn, mfn;
2010 struct pfn_info *page;
2011 int rc = 0, okay = 1, i = 0, cpu = smp_processor_id();
2012 unsigned int cmd, done = 0;
2013 struct vcpu *v = current;
2014 struct domain *d = v->domain;
2015 unsigned long type_info;
2016 struct domain_mmap_cache mapcache, sh_mapcache;
2018 LOCK_BIGLOCK(d);
2020 cleanup_writable_pagetable(d);
2022 if ( unlikely(shadow_mode_enabled(d)) )
2023 check_pagetable(v, "pre-mmu"); /* debug */
2025 if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
2027 count &= ~MMU_UPDATE_PREEMPTED;
2028 if ( unlikely(pdone != NULL) )
2029 (void)get_user(done, pdone);
2032 domain_mmap_cache_init(&mapcache);
2033 domain_mmap_cache_init(&sh_mapcache);
2035 if ( !set_foreigndom(cpu, foreigndom) )
2037 rc = -EINVAL;
2038 goto out;
2041 perfc_incrc(calls_to_mmu_update);
2042 perfc_addc(num_page_updates, count);
2043 perfc_incr_histo(bpt_updates, count, PT_UPDATES);
2045 if ( unlikely(!array_access_ok(ureqs, count, sizeof(req))) )
2047 rc = -EFAULT;
2048 goto out;
2051 for ( i = 0; i < count; i++ )
2053 if ( hypercall_preempt_check() )
2055 rc = hypercall4_create_continuation(
2056 __HYPERVISOR_mmu_update, ureqs,
2057 (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
2058 break;
2061 if ( unlikely(__copy_from_user(&req, ureqs, sizeof(req)) != 0) )
2063 MEM_LOG("Bad __copy_from_user");
2064 rc = -EFAULT;
2065 break;
2068 cmd = req.ptr & (sizeof(l1_pgentry_t)-1);
2069 okay = 0;
2071 switch ( cmd )
2073 /*
2074 * MMU_NORMAL_PT_UPDATE: Normal update to any level of page table.
2075 */
2076 case MMU_NORMAL_PT_UPDATE:
2078 gpfn = req.ptr >> PAGE_SHIFT;
2079 mfn = __gpfn_to_mfn(d, gpfn);
2081 if ( unlikely(!get_page_from_pagenr(mfn, current->domain)) )
2083 MEM_LOG("Could not get page for normal update");
2084 break;
2087 va = map_domain_page_with_cache(mfn, &mapcache);
2088 va = (void *)((unsigned long)va +
2089 (unsigned long)(req.ptr & ~PAGE_MASK));
2090 page = &frame_table[mfn];
2092 switch ( (type_info = page->u.inuse.type_info) & PGT_type_mask )
2094 case PGT_l1_page_table:
2095 ASSERT( !shadow_mode_refcounts(d) );
2096 if ( likely(get_page_type(
2097 page, type_info & (PGT_type_mask|PGT_va_mask))) )
2099 l1_pgentry_t l1e;
2101 /* FIXME: doesn't work with PAE */
2102 l1e = l1e_from_intpte(req.val);
2103 okay = mod_l1_entry(va, l1e);
2104 if ( okay && unlikely(shadow_mode_enabled(d)) )
2105 shadow_l1_normal_pt_update(
2106 d, req.ptr, l1e, &sh_mapcache);
2107 put_page_type(page);
2109 break;
2110 case PGT_l2_page_table:
2111 ASSERT( !shadow_mode_refcounts(d) );
2112 if ( likely(get_page_type(
2113 page, type_info & (PGT_type_mask|PGT_va_mask))) )
2115 l2_pgentry_t l2e;
2117 /* FIXME: doesn't work with PAE */
2118 l2e = l2e_from_intpte(req.val);
2119 okay = mod_l2_entry(
2120 (l2_pgentry_t *)va, l2e, mfn, type_info);
2121 if ( okay && unlikely(shadow_mode_enabled(d)) )
2122 shadow_l2_normal_pt_update(
2123 d, req.ptr, l2e, &sh_mapcache);
2124 put_page_type(page);
2126 break;
2127 #if CONFIG_PAGING_LEVELS >= 3
2128 case PGT_l3_page_table:
2129 ASSERT( !shadow_mode_refcounts(d) );
2130 if ( likely(get_page_type(
2131 page, type_info & (PGT_type_mask|PGT_va_mask))) )
2133 l3_pgentry_t l3e;
2135 /* FIXME: doesn't work with PAE */
2136 l3e = l3e_from_intpte(req.val);
2137 okay = mod_l3_entry(va, l3e, mfn, type_info);
2138 if ( okay && unlikely(shadow_mode_enabled(d)) )
2139 shadow_l3_normal_pt_update(
2140 d, req.ptr, l3e, &sh_mapcache);
2141 put_page_type(page);
2143 break;
2144 #endif
2145 #if CONFIG_PAGING_LEVELS >= 4
2146 case PGT_l4_page_table:
2147 ASSERT( !shadow_mode_refcounts(d) );
2148 if ( likely(get_page_type(
2149 page, type_info & (PGT_type_mask|PGT_va_mask))) )
2151 l4_pgentry_t l4e;
2153 l4e = l4e_from_intpte(req.val);
2154 okay = mod_l4_entry(va, l4e, mfn, type_info);
2155 if ( okay && unlikely(shadow_mode_enabled(d)) )
2156 shadow_l4_normal_pt_update(
2157 d, req.ptr, l4e, &sh_mapcache);
2158 put_page_type(page);
2160 break;
2161 #endif
2162 default:
2163 if ( likely(get_page_type(page, PGT_writable_page)) )
2165 if ( shadow_mode_enabled(d) )
2167 shadow_lock(d);
2169 if ( shadow_mode_log_dirty(d) )
2170 __mark_dirty(d, mfn);
2172 if ( page_is_page_table(page) &&
2173 !page_out_of_sync(page) )
2175 shadow_mark_mfn_out_of_sync(v, gpfn, mfn);
2179 *(unsigned long *)va = req.val;
2180 okay = 1;
2182 if ( shadow_mode_enabled(d) )
2183 shadow_unlock(d);
2185 put_page_type(page);
2187 break;
2190 unmap_domain_page_with_cache(va, &mapcache);
2192 put_page(page);
2193 break;
2195 case MMU_MACHPHYS_UPDATE:
2197 mfn = req.ptr >> PAGE_SHIFT;
2198 gpfn = req.val;
2200 /* HACK ALERT... Need to think about this some more... */
2201 if ( unlikely(shadow_mode_translate(FOREIGNDOM) && IS_PRIV(d)) )
2203 shadow_lock(FOREIGNDOM);
2204 printk("privileged guest dom%d requests pfn=%lx to "
2205 "map mfn=%lx for dom%d\n",
2206 d->domain_id, gpfn, mfn, FOREIGNDOM->domain_id);
2207 set_machinetophys(mfn, gpfn);
2208 set_p2m_entry(FOREIGNDOM, gpfn, mfn, &sh_mapcache, &mapcache);
2209 okay = 1;
2210 shadow_unlock(FOREIGNDOM);
2211 break;
2214 if ( unlikely(!get_page_from_pagenr(mfn, FOREIGNDOM)) )
2216 MEM_LOG("Could not get page for mach->phys update");
2217 break;
2220 if ( unlikely(shadow_mode_translate(FOREIGNDOM) && !IS_PRIV(d)) )
2222 MEM_LOG("can't mutate the m2p of translated guests");
2223 break;
2226 set_machinetophys(mfn, gpfn);
2227 okay = 1;
2229 /*
2230 * If in log-dirty mode, mark the corresponding
2231 * page as dirty.
2232 */
2233 if ( unlikely(shadow_mode_log_dirty(FOREIGNDOM)) &&
2234 mark_dirty(FOREIGNDOM, mfn) )
2235 FOREIGNDOM->arch.shadow_dirty_block_count++;
2237 put_page(&frame_table[mfn]);
2238 break;
2240 default:
2241 MEM_LOG("Invalid page update command %x", cmd);
2242 break;
2245 if ( unlikely(!okay) )
2247 rc = -EINVAL;
2248 break;
2251 ureqs++;
2254 out:
2255 domain_mmap_cache_destroy(&mapcache);
2256 domain_mmap_cache_destroy(&sh_mapcache);
2258 process_deferred_ops(cpu);
2260 /* Add incremental work we have done to the @done output parameter. */
2261 if ( unlikely(pdone != NULL) )
2262 __put_user(done + i, pdone);
2264 if ( unlikely(shadow_mode_enabled(d)) )
2265 check_pagetable(v, "post-mmu"); /* debug */
2267 UNLOCK_BIGLOCK(d);
2268 return rc;
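/*
 * Guest-side view of the interface handled above: each request is a
 * (ptr, val) pair, with the command multiplexed into the low bits of the
 * machine address in ptr, and a whole batch is validated and applied in one
 * trap.  A minimal sketch, assuming the public mmu_update_t layout, the
 * MMU_NORMAL_PT_UPDATE command and a HYPERVISOR_mmu_update()/DOMID_SELF
 * wrapper; pte_ma[] and new_pte[] are placeholder inputs.
 */
#if 0 /* illustrative sketch, not hypervisor code */
static int sketch_batch_pte_updates(mmu_update_t *req,
                                    const unsigned long *pte_ma,
                                    const unsigned long *new_pte,
                                    unsigned int nr)
{
    unsigned int i, done = 0;

    for ( i = 0; i < nr; i++ )
    {
        /* Machine address of the PTE to update; low bits carry the command. */
        req[i].ptr = pte_ma[i] | MMU_NORMAL_PT_UPDATE;
        /* New contents, in the format mod_l1_entry() etc. expect. */
        req[i].val = new_pte[i];
    }

    /* One hypercall applies the whole batch; @done reports progress. */
    return HYPERVISOR_mmu_update(req, nr, &done, DOMID_SELF);
}
#endif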
2272 int update_grant_pte_mapping(
2273 unsigned long pte_addr, l1_pgentry_t _nl1e,
2274 struct domain *d, struct vcpu *v)
2276 int rc = GNTST_okay;
2277 void *va;
2278 unsigned long gpfn, mfn;
2279 struct pfn_info *page;
2280 u32 type_info;
2281 l1_pgentry_t ol1e;
2283 ASSERT(spin_is_locked(&d->big_lock));
2284 ASSERT(!shadow_mode_refcounts(d));
2285 ASSERT((l1e_get_flags(_nl1e) & L1_DISALLOW_MASK) == 0);
2287 gpfn = pte_addr >> PAGE_SHIFT;
2288 mfn = __gpfn_to_mfn(d, gpfn);
2290 if ( unlikely(!get_page_from_pagenr(mfn, current->domain)) )
2292 MEM_LOG("Could not get page for normal update");
2293 return GNTST_general_error;
2296 va = map_domain_page(mfn);
2297 va = (void *)((unsigned long)va + (pte_addr & ~PAGE_MASK));
2298 page = pfn_to_page(mfn);
2300 type_info = page->u.inuse.type_info;
2301 if ( ((type_info & PGT_type_mask) != PGT_l1_page_table) ||
2302 !get_page_type(page, type_info & (PGT_type_mask|PGT_va_mask)) )
2304 DPRINTK("Grant map attempted to update a non-L1 page\n");
2305 rc = GNTST_general_error;
2306 goto failed;
2309 if ( __copy_from_user(&ol1e, (l1_pgentry_t *)va, sizeof(ol1e)) ||
2310 !update_l1e(va, ol1e, _nl1e) )
2312 put_page_type(page);
2313 rc = GNTST_general_error;
2314 goto failed;
2317 put_page_from_l1e(ol1e, d);
2319 rc = (l1e_get_flags(ol1e) & _PAGE_PRESENT) ? GNTST_flush_all : GNTST_okay;
2321 if ( unlikely(shadow_mode_enabled(d)) )
2323 struct domain_mmap_cache sh_mapcache;
2324 domain_mmap_cache_init(&sh_mapcache);
2325 shadow_l1_normal_pt_update(d, pte_addr, _nl1e, &sh_mapcache);
2326 domain_mmap_cache_destroy(&sh_mapcache);
2329 put_page_type(page);
2331 failed:
2332 unmap_domain_page(va);
2333 put_page(page);
2334 return rc;
2337 int clear_grant_pte_mapping(
2338 unsigned long addr, unsigned long frame, struct domain *d)
2340 int rc = GNTST_okay;
2341 void *va;
2342 unsigned long gpfn, mfn;
2343 struct pfn_info *page;
2344 u32 type_info;
2345 l1_pgentry_t ol1e;
2347 ASSERT(!shadow_mode_refcounts(d));
2349 gpfn = addr >> PAGE_SHIFT;
2350 mfn = __gpfn_to_mfn(d, gpfn);
2352 if ( unlikely(!get_page_from_pagenr(mfn, current->domain)) )
2354 MEM_LOG("Could not get page for normal update");
2355 return GNTST_general_error;
2358 va = map_domain_page(mfn);
2359 va = (void *)((unsigned long)va + (addr & ~PAGE_MASK));
2360 page = pfn_to_page(mfn);
2362 type_info = page->u.inuse.type_info;
2363 if ( ((type_info & PGT_type_mask) != PGT_l1_page_table) ||
2364 !get_page_type(page, type_info & (PGT_type_mask|PGT_va_mask)) )
2366 DPRINTK("Grant map attempted to update a non-L1 page\n");
2367 rc = GNTST_general_error;
2368 goto failed;
2371 if ( __copy_from_user(&ol1e, (l1_pgentry_t *)va, sizeof(ol1e)) )
2373 put_page_type(page);
2374 rc = GNTST_general_error;
2375 goto failed;
2378 /* Check that the virtual address supplied is actually mapped to frame. */
2379 if ( unlikely((l1e_get_intpte(ol1e) >> PAGE_SHIFT) != frame) )
2381 DPRINTK("PTE entry %lx for address %lx doesn't match frame %lx\n",
2382 (unsigned long)l1e_get_intpte(ol1e), addr, frame);
2383 put_page_type(page);
2384 rc = GNTST_general_error;
2385 goto failed;
2388 /* Delete pagetable entry. */
2389 if ( unlikely(__put_user(0, (unsigned long *)va)))
2391 DPRINTK("Cannot delete PTE entry at %p.\n", va);
2392 put_page_type(page);
2393 rc = GNTST_general_error;
2394 goto failed;
2397 if ( unlikely(shadow_mode_enabled(d)) )
2399 struct domain_mmap_cache sh_mapcache;
2400 domain_mmap_cache_init(&sh_mapcache);
2401 shadow_l1_normal_pt_update(d, addr, l1e_empty(), &sh_mapcache);
2402 domain_mmap_cache_destroy(&sh_mapcache);
2405 put_page_type(page);
2407 failed:
2408 unmap_domain_page(va);
2409 put_page(page);
2410 return rc;
2414 int update_grant_va_mapping(
2415 unsigned long va, l1_pgentry_t _nl1e, struct domain *d, struct vcpu *v)
2417 int rc = GNTST_okay;
2418 l1_pgentry_t *pl1e, ol1e;
2420 ASSERT(spin_is_locked(&d->big_lock));
2421 ASSERT(!shadow_mode_refcounts(d));
2422 ASSERT((l1e_get_flags(_nl1e) & L1_DISALLOW_MASK) == 0);
2424 /*
2425 * This is actually overkill - we don't need to sync the L1 itself,
2426 * just everything involved in getting to this L1 (i.e. we need
2427 * linear_pg_table[l1_linear_offset(va)] to be in sync)...
2428 */
2429 __shadow_sync_va(v, va);
2431 pl1e = &linear_pg_table[l1_linear_offset(va)];
2433 if ( unlikely(__copy_from_user(&ol1e, pl1e, sizeof(ol1e)) != 0) ||
2434 !update_l1e(pl1e, ol1e, _nl1e) )
2435 return GNTST_general_error;
2437 put_page_from_l1e(ol1e, d);
2439 rc = (l1e_get_flags(ol1e) & _PAGE_PRESENT) ? GNTST_flush_one : GNTST_okay;
2441 if ( unlikely(shadow_mode_enabled(d)) )
2442 shadow_do_update_va_mapping(va, _nl1e, v);
2444 return rc;
2447 int clear_grant_va_mapping(unsigned long addr, unsigned long frame)
2449 l1_pgentry_t *pl1e;
2450 unsigned long _ol1e;
2452 pl1e = &linear_pg_table[l1_linear_offset(addr)];
2454 if ( unlikely(__get_user(_ol1e, (unsigned long *)pl1e) != 0) )
2456 DPRINTK("Could not find PTE entry for address %lx\n", addr);
2457 return GNTST_general_error;
2460 /*
2461 * Check that the virtual address supplied is actually mapped to
2462 * frame.
2463 */
2464 if ( unlikely((_ol1e >> PAGE_SHIFT) != frame ))
2466 DPRINTK("PTE entry %lx for address %lx doesn't match frame %lx\n",
2467 _ol1e, addr, frame);
2468 return GNTST_general_error;
2471 /* Delete pagetable entry. */
2472 if ( unlikely(__put_user(0, (unsigned long *)pl1e)))
2474 DPRINTK("Cannot delete PTE entry at %p.\n", (unsigned long *)pl1e);
2475 return GNTST_general_error;
2478 return 0;
2482 int do_update_va_mapping(unsigned long va, u64 val64,
2483 unsigned long flags)
2485 l1_pgentry_t val = l1e_from_intpte(val64);
2486 struct vcpu *v = current;
2487 struct domain *d = v->domain;
2488 unsigned int cpu = v->processor;
2489 unsigned long vmask, bmap_ptr;
2490 cpumask_t pmask;
2491 int rc = 0;
2493 perfc_incrc(calls_to_update_va);
2495 if ( unlikely(!__addr_ok(va) && !shadow_mode_external(d)) )
2496 return -EINVAL;
2498 LOCK_BIGLOCK(d);
2500 cleanup_writable_pagetable(d);
2502 if ( unlikely(shadow_mode_enabled(d)) )
2503 check_pagetable(v, "pre-va"); /* debug */
2505 if ( unlikely(!mod_l1_entry(&linear_pg_table[l1_linear_offset(va)],
2506 val)) )
2507 rc = -EINVAL;
2509 if ( likely(rc == 0) && unlikely(shadow_mode_enabled(d)) )
2511 if ( unlikely(percpu_info[cpu].foreign &&
2512 (shadow_mode_translate(d) ||
2513 shadow_mode_translate(percpu_info[cpu].foreign))) )
2515 /*
2516 * The foreign domain's pfns are in a different namespace. There's
2517 * not enough information in just a gpte to figure out how to
2518 * (re-)shadow this entry.
2519 */
2520 domain_crash();
2523 rc = shadow_do_update_va_mapping(va, val, v);
2525 check_pagetable(v, "post-va"); /* debug */
2528 switch ( flags & UVMF_FLUSHTYPE_MASK )
2530 case UVMF_TLB_FLUSH:
2531 switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
2533 case UVMF_LOCAL:
2534 if ( unlikely(shadow_mode_enabled(d)) )
2535 shadow_sync_all(d);
2536 local_flush_tlb();
2537 break;
2538 case UVMF_ALL:
2539 flush_tlb_mask(d->cpumask);
2540 break;
2541 default:
2542 if ( unlikely(get_user(vmask, (unsigned long *)bmap_ptr)) )
2543 rc = -EFAULT;
2544 pmask = vcpumask_to_pcpumask(d, vmask);
2545 cpus_and(pmask, pmask, d->cpumask);
2546 flush_tlb_mask(pmask);
2547 break;
2549 break;
2551 case UVMF_INVLPG:
2552 switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
2554 case UVMF_LOCAL:
2555 if ( unlikely(shadow_mode_enabled(d)) )
2556 shadow_invlpg(current, va);
2557 local_flush_tlb_one(va);
2558 break;
2559 case UVMF_ALL:
2560 flush_tlb_one_mask(d->cpumask, va);
2561 break;
2562 default:
2563 if ( unlikely(get_user(vmask, (unsigned long *)bmap_ptr)) )
2564 rc = -EFAULT;
2565 pmask = vcpumask_to_pcpumask(d, vmask);
2566 cpus_and(pmask, pmask, d->cpumask);
2567 flush_tlb_one_mask(pmask, va);
2568 break;
2570 break;
2573 process_deferred_ops(cpu);
2575 UNLOCK_BIGLOCK(d);
2577 return rc;
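/*
 * The flags argument decoded above packs two things: the flush type in the
 * UVMF_FLUSHTYPE_MASK bits and the flush target (local CPU, all CPUs, or a
 * guest pointer to a vcpu bitmap) in the remaining bits.  A guest-side
 * sketch, assuming the usual HYPERVISOR_update_va_mapping() wrapper and the
 * UVMF_* definitions from the public headers.
 */
#if 0 /* illustrative sketch, not hypervisor code */
static int sketch_remap_one_page(unsigned long va, unsigned long new_pte_val)
{
    /* Install the new PTE and invalidate just this VA on the local CPU. */
    return HYPERVISOR_update_va_mapping(va, new_pte_val,
                                        UVMF_INVLPG | UVMF_LOCAL);
}

static int sketch_unmap_everywhere(unsigned long va)
{
    /* Clear the PTE and flush every TLB the domain might be using. */
    return HYPERVISOR_update_va_mapping(va, 0, UVMF_TLB_FLUSH | UVMF_ALL);
}
#endif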
2580 int do_update_va_mapping_otherdomain(unsigned long va, u64 val64,
2581 unsigned long flags,
2582 domid_t domid)
2584 unsigned int cpu = smp_processor_id();
2585 struct domain *d;
2586 int rc;
2588 if ( unlikely(!IS_PRIV(current->domain)) )
2589 return -EPERM;
2591 percpu_info[cpu].foreign = d = find_domain_by_id(domid);
2592 if ( unlikely(d == NULL) )
2594 MEM_LOG("Unknown domain '%u'", domid);
2595 return -ESRCH;
2598 rc = do_update_va_mapping(va, val64, flags);
2600 return rc;
2605 /*************************
2606 * Descriptor Tables
2607 */
2609 void destroy_gdt(struct vcpu *v)
2611 int i;
2612 unsigned long pfn;
2614 v->arch.guest_context.gdt_ents = 0;
2615 for ( i = 0; i < FIRST_RESERVED_GDT_PAGE; i++ )
2617 if ( (pfn = l1e_get_pfn(v->arch.perdomain_ptes[i])) != 0 )
2618 put_page_and_type(&frame_table[pfn]);
2619 v->arch.perdomain_ptes[i] = l1e_empty();
2620 v->arch.guest_context.gdt_frames[i] = 0;
2625 long set_gdt(struct vcpu *v,
2626 unsigned long *frames,
2627 unsigned int entries)
2629 struct domain *d = v->domain;
2630 /* NB. There are 512 8-byte entries per GDT page. */
2631 int i, nr_pages = (entries + 511) / 512;
2632 unsigned long pfn;
2634 if ( entries > FIRST_RESERVED_GDT_ENTRY )
2635 return -EINVAL;
2637 shadow_sync_all(d);
2639 /* Check the pages in the new GDT. */
2640 for ( i = 0; i < nr_pages; i++ ) {
2641 pfn = frames[i];
2642 if ((pfn >= max_page) ||
2643 !get_page_and_type(&frame_table[pfn], d, PGT_gdt_page) )
2644 goto fail;
2647 /* Tear down the old GDT. */
2648 destroy_gdt(v);
2650 /* Install the new GDT. */
2651 v->arch.guest_context.gdt_ents = entries;
2652 for ( i = 0; i < nr_pages; i++ )
2654 v->arch.guest_context.gdt_frames[i] = frames[i];
2655 v->arch.perdomain_ptes[i] =
2656 l1e_from_pfn(frames[i], __PAGE_HYPERVISOR);
2659 return 0;
2661 fail:
2662 while ( i-- > 0 )
2663 put_page_and_type(&frame_table[frames[i]]);
2664 return -EINVAL;
2668 long do_set_gdt(unsigned long *frame_list, unsigned int entries)
2670 int nr_pages = (entries + 511) / 512;
2671 unsigned long frames[16];
2672 long ret;
2674 /* Rechecked in set_gdt, but ensures a sane limit for copy_from_user(). */
2675 if ( entries > FIRST_RESERVED_GDT_ENTRY )
2676 return -EINVAL;
2678 if ( copy_from_user(frames, frame_list, nr_pages * sizeof(unsigned long)) )
2679 return -EFAULT;
2681 LOCK_BIGLOCK(current->domain);
2683 if ( (ret = set_gdt(current, frames, entries)) == 0 )
2684 local_flush_tlb();
2686 UNLOCK_BIGLOCK(current->domain);
2688 return ret;
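/*
 * Sizing arithmetic for the GDT interface above, in isolation: a 4K GDT page
 * holds 512 eight-byte descriptors, and the guest may supply at most the
 * entries below the hypervisor-reserved range.  Standalone sketch; the
 * reserved-entry constant (14 pages' worth) is an assumption for
 * illustration, not taken from the headers.
 */
#if 0 /* standalone illustration */
#include <stdio.h>

#define SKETCH_ENTRIES_PER_PAGE      512        /* 4096 / 8 */
#define SKETCH_FIRST_RESERVED_ENTRY  (14 * 512) /* assumed reserved boundary */

/* Number of guest frames needed, or -1 if the request would be rejected. */
static int sketch_gdt_pages(unsigned int entries)
{
    if ( entries > SKETCH_FIRST_RESERVED_ENTRY )
        return -1;                               /* set_gdt returns -EINVAL */
    return (entries + SKETCH_ENTRIES_PER_PAGE - 1) / SKETCH_ENTRIES_PER_PAGE;
}

int main(void)
{
    printf("%d\n", sketch_gdt_pages(1));     /* 1 page  */
    printf("%d\n", sketch_gdt_pages(513));   /* 2 pages */
    printf("%d\n", sketch_gdt_pages(7168));  /* 14 pages (the assumed max) */
    return 0;
}
#endif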
2692 long do_update_descriptor(u64 pa, u64 desc)
2694 struct domain *dom = current->domain;
2695 unsigned long gpfn = pa >> PAGE_SHIFT;
2696 unsigned long mfn;
2697 unsigned int offset;
2698 struct desc_struct *gdt_pent, d;
2699 struct pfn_info *page;
2700 long ret = -EINVAL;
2702 offset = ((unsigned int)pa & ~PAGE_MASK) / sizeof(struct desc_struct);
2704 *(u64 *)&d = desc;
2706 LOCK_BIGLOCK(dom);
2708 if ( !VALID_MFN(mfn = __gpfn_to_mfn(dom, gpfn)) ||
2709 (((unsigned int)pa % sizeof(struct desc_struct)) != 0) ||
2710 (mfn >= max_page) ||
2711 !check_descriptor(&d) )
2713 UNLOCK_BIGLOCK(dom);
2714 return -EINVAL;
2717 page = &frame_table[mfn];
2718 if ( unlikely(!get_page(page, dom)) )
2720 UNLOCK_BIGLOCK(dom);
2721 return -EINVAL;
2724 /* Check if the given frame is in use in an unsafe context. */
2725 switch ( page->u.inuse.type_info & PGT_type_mask )
2727 case PGT_gdt_page:
2728 if ( unlikely(!get_page_type(page, PGT_gdt_page)) )
2729 goto out;
2730 break;
2731 case PGT_ldt_page:
2732 if ( unlikely(!get_page_type(page, PGT_ldt_page)) )
2733 goto out;
2734 break;
2735 default:
2736 if ( unlikely(!get_page_type(page, PGT_writable_page)) )
2737 goto out;
2738 break;
2741 if ( shadow_mode_enabled(dom) )
2743 shadow_lock(dom);
2745 if ( shadow_mode_log_dirty(dom) )
2746 __mark_dirty(dom, mfn);
2748 if ( page_is_page_table(page) && !page_out_of_sync(page) )
2749 shadow_mark_mfn_out_of_sync(current, gpfn, mfn);
2752 /* All is good so make the update. */
2753 gdt_pent = map_domain_page(mfn);
2754 memcpy(&gdt_pent[offset], &d, 8);
2755 unmap_domain_page(gdt_pent);
2757 if ( shadow_mode_enabled(dom) )
2758 shadow_unlock(dom);
2760 put_page_type(page);
2762 ret = 0; /* success */
2764 out:
2765 put_page(page);
2767 UNLOCK_BIGLOCK(dom);
2769 return ret;
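/*
 * The address arithmetic used above, in isolation: the physical address must
 * be aligned to an 8-byte descriptor, and its page offset selects which of
 * the 512 slots in the frame receives the new descriptor.  Standalone sketch
 * assuming 4K pages.
 */
#if 0 /* standalone illustration */
#include <stdint.h>
#include <stdio.h>

#define SKETCH_PAGE_SIZE 4096u

/* Descriptor slot within its page, or -1 if the address is misaligned. */
static int sketch_descriptor_slot(uint64_t pa)
{
    if ( (pa % 8) != 0 )
        return -1;
    return (int)((pa & (SKETCH_PAGE_SIZE - 1)) / 8);
}

int main(void)
{
    printf("%d\n", sketch_descriptor_slot(0x1000));  /* slot 0          */
    printf("%d\n", sketch_descriptor_slot(0x1008));  /* slot 1          */
    printf("%d\n", sketch_descriptor_slot(0x1ff8));  /* slot 511 (last) */
    printf("%d\n", sketch_descriptor_slot(0x1004));  /* misaligned: -1  */
    return 0;
}
#endif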
2774 /*************************
2775 * Writable Pagetables
2776 */
2778 #ifdef VVERBOSE
2779 int ptwr_debug = 0x0;
2780 #define PTWR_PRINTK(_f, _a...) \
2781 do { if ( unlikely(ptwr_debug) ) printk( _f , ## _a ); } while ( 0 )
2782 #define PTWR_PRINT_WHICH (which ? 'I' : 'A')
2783 #else
2784 #define PTWR_PRINTK(_f, _a...) ((void)0)
2785 #endif
2788 #ifdef PERF_ARRAYS
2790 /**************** writable pagetables profiling functions *****************/
2792 #define ptwr_eip_buckets 256
2794 int ptwr_eip_stat_threshold[] = {1, 10, 50, 100, L1_PAGETABLE_ENTRIES};
2796 #define ptwr_eip_stat_thresholdN (sizeof(ptwr_eip_stat_threshold)/sizeof(int))
2798 struct {
2799 unsigned long eip;
2800 domid_t id;
2801 u32 val[ptwr_eip_stat_thresholdN];
2802 } typedef ptwr_eip_stat_t;
2804 ptwr_eip_stat_t ptwr_eip_stats[ptwr_eip_buckets];
2806 static inline unsigned int ptwr_eip_stat_hash( unsigned long eip, domid_t id )
2808 return (((unsigned long) id) ^ eip ^ (eip>>8) ^ (eip>>16) ^ (eip>>24)) %
2809 ptwr_eip_buckets;
2812 static void ptwr_eip_stat_inc(u32 *n)
2814 int i, j;
2816 if ( ++(*n) != 0 )
2817 return;
2819 *n = ~0;
2821 /* Re-scale all buckets. */
2822 for ( i = 0; i <ptwr_eip_buckets; i++ )
2823 for ( j = 0; j < ptwr_eip_stat_thresholdN; j++ )
2824 ptwr_eip_stats[i].val[j] >>= 1;
2827 static void ptwr_eip_stat_update(unsigned long eip, domid_t id, int modified)
2829 int i, j, b;
2831 i = b = ptwr_eip_stat_hash(eip, id);
2833 do
2835 if ( !ptwr_eip_stats[i].eip )
2837 /* doesn't exist */
2838 ptwr_eip_stats[i].eip = eip;
2839 ptwr_eip_stats[i].id = id;
2840 memset(ptwr_eip_stats[i].val,0, sizeof(ptwr_eip_stats[i].val));
2843 if ( ptwr_eip_stats[i].eip == eip )
2845 for ( j = 0; j < ptwr_eip_stat_thresholdN; j++ )
2846 if ( modified <= ptwr_eip_stat_threshold[j] )
2847 break;
2848 BUG_ON(j >= ptwr_eip_stat_thresholdN);
2849 ptwr_eip_stat_inc(&ptwr_eip_stats[i].val[j]);
2850 return;
2853 i = (i+1) % ptwr_eip_buckets;
2855 while ( i != b );
2857 printk("ptwr_eip_stat: too many EIPs in use!\n");
2859 ptwr_eip_stat_print();
2860 ptwr_eip_stat_reset();
2863 void ptwr_eip_stat_reset(void)
2865 memset(ptwr_eip_stats, 0, sizeof(ptwr_eip_stats));
2868 void ptwr_eip_stat_print(void)
2870 struct domain *e;
2871 domid_t d;
2872 int i, j;
2874 for_each_domain( e )
2876 d = e->domain_id;
2878 for ( i = 0; i < ptwr_eip_buckets; i++ )
2880 if ( ptwr_eip_stats[i].eip && ptwr_eip_stats[i].id != d )
2881 continue;
2883 printk("D %d eip %08lx ",
2884 ptwr_eip_stats[i].id, ptwr_eip_stats[i].eip);
2886 for ( j = 0; j < ptwr_eip_stat_thresholdN; j++ )
2887 printk("<=%u %4u \t",
2888 ptwr_eip_stat_threshold[j],
2889 ptwr_eip_stats[i].val[j]);
2890 printk("\n");
2895 #else /* PERF_ARRAYS */
2897 #define ptwr_eip_stat_update(eip, id, modified) ((void)0)
2899 #endif
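/*
 * The profiling table above is a fixed-size, open-addressed hash keyed on
 * (eip, domain id), probed linearly, with a small per-bucket histogram of
 * "entries revalidated per flush".  A standalone sketch of that structure,
 * with made-up sizes and a 1024-entry L1 assumed for the final threshold.
 */
#if 0 /* standalone illustration */
#include <stdint.h>
#include <string.h>

#define SKETCH_BUCKETS     256
#define SKETCH_THRESHOLDS  4

static const int sketch_threshold[SKETCH_THRESHOLDS] = { 1, 10, 50, 1024 };

struct sketch_bucket {
    unsigned long eip;
    uint16_t      id;
    uint32_t      val[SKETCH_THRESHOLDS];
};

static struct sketch_bucket sketch_stats[SKETCH_BUCKETS];

static unsigned int sketch_hash(unsigned long eip, uint16_t id)
{
    return (unsigned int)
        ((id ^ eip ^ (eip >> 8) ^ (eip >> 16) ^ (eip >> 24)) % SKETCH_BUCKETS);
}

/* Record that a flush at @eip for domain @id revalidated @modified entries. */
static int sketch_update(unsigned long eip, uint16_t id, int modified)
{
    unsigned int b = sketch_hash(eip, id), i = b;
    int j;

    do {
        if ( sketch_stats[i].eip == 0 )              /* empty slot: claim it */
        {
            sketch_stats[i].eip = eip;
            sketch_stats[i].id  = id;
            memset(sketch_stats[i].val, 0, sizeof(sketch_stats[i].val));
        }
        if ( (sketch_stats[i].eip == eip) && (sketch_stats[i].id == id) )
        {
            for ( j = 0; j < SKETCH_THRESHOLDS; j++ )
                if ( modified <= sketch_threshold[j] )
                    break;
            if ( j == SKETCH_THRESHOLDS )            /* clamp, just in case */
                j = SKETCH_THRESHOLDS - 1;
            sketch_stats[i].val[j]++;
            return 0;
        }
        i = (i + 1) % SKETCH_BUCKETS;                /* linear probe */
    } while ( i != b );

    return -1;                                       /* every bucket in use */
}
#endif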
2901 /*******************************************************************/
2903 /* Re-validate a given p.t. page, given its prior snapshot */
2904 int revalidate_l1(
2905 struct domain *d, l1_pgentry_t *l1page, l1_pgentry_t *snapshot)
2907 l1_pgentry_t ol1e, nl1e;
2908 int modified = 0, i;
2910 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
2912 ol1e = snapshot[i];
2913 nl1e = l1page[i];
2915 if ( likely(l1e_get_intpte(ol1e) == l1e_get_intpte(nl1e)) )
2916 continue;
2918 /* Update number of entries modified. */
2919 modified++;
2921 /*
2922 * Fast path for PTEs that have merely been write-protected
2923 * (e.g., during a Unix fork()). A strict reduction in privilege.
2924 */
2925 if ( likely(l1e_get_intpte(ol1e) == (l1e_get_intpte(nl1e)|_PAGE_RW)) )
2927 if ( likely(l1e_get_flags(nl1e) & _PAGE_PRESENT) )
2928 put_page_type(&frame_table[l1e_get_pfn(nl1e)]);
2929 continue;
2932 if ( unlikely(!get_page_from_l1e(nl1e, d)) )
2934 MEM_LOG("ptwr: Could not re-validate l1 page\n");
2935 /*
2936 * Make the remaining p.t's consistent before crashing, so the
2937 * reference counts are correct.
2938 */
2939 memcpy(&l1page[i], &snapshot[i],
2940 (L1_PAGETABLE_ENTRIES - i) * sizeof(l1_pgentry_t));
2941 domain_crash();
2942 break;
2945 put_page_from_l1e(ol1e, d);
2948 return modified;
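/*
 * The fast path above rests on a single identity: if the only change to an
 * entry since the snapshot is that _PAGE_RW was cleared, then
 * old == (new | _PAGE_RW), the change is a strict reduction in privilege, and
 * only the writable type reference needs dropping.  Standalone check of that
 * predicate, treating PTEs as plain integers with the usual x86 RW bit.
 */
#if 0 /* standalone illustration */
#include <assert.h>
#include <stdint.h>

#define SKETCH_PAGE_RW 0x2u                /* x86 PTE bit 1 */

static int sketch_is_write_protect_only(uint32_t old_pte, uint32_t new_pte)
{
    return old_pte == (new_pte | SKETCH_PAGE_RW);
}

int main(void)
{
    uint32_t old_pte = 0x12345067;         /* present + RW + user + A + D */

    /* Clearing only RW matches the fast path... */
    assert(sketch_is_write_protect_only(old_pte, old_pte & ~SKETCH_PAGE_RW));
    /* ...any other change does not, and must be fully revalidated. */
    assert(!sketch_is_write_protect_only(old_pte, old_pte | 0x100));
    return 0;
}
#endif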
2952 /* Flush the given writable p.t. page and write-protect it again. */
2953 void ptwr_flush(struct domain *d, const int which)
2955 unsigned long l1va;
2956 l1_pgentry_t *pl1e, pte, *ptep;
2957 l2_pgentry_t *pl2e;
2958 unsigned int modified;
2960 #ifdef CONFIG_X86_64
2961 struct vcpu *v = current;
2962 extern void toggle_guest_mode(struct vcpu *);
2963 int user_mode = !(v->arch.flags & TF_kernel_mode);
2964 #endif
2966 ASSERT(!shadow_mode_enabled(d));
2968 if ( unlikely(d->arch.ptwr[which].vcpu != current) )
2969 /* Don't use write_ptbase: it may switch to guest_user on x86/64! */
2970 write_cr3(pagetable_get_paddr(
2971 d->arch.ptwr[which].vcpu->arch.guest_table));
2972 else
2973 TOGGLE_MODE();
2975 l1va = d->arch.ptwr[which].l1va;
2976 ptep = (l1_pgentry_t *)&linear_pg_table[l1_linear_offset(l1va)];
2978 /*
2979 * STEP 1. Write-protect the p.t. page so no more updates can occur.
2980 */
2982 if ( unlikely(__get_user(pte.l1, &ptep->l1)) )
2984 MEM_LOG("ptwr: Could not read pte at %p", ptep);
2985 /*
2986 * Really a bug. We could read this PTE during the initial fault,
2987 * and pagetables can't have changed in the meantime.
2988 */
2989 BUG();
2991 PTWR_PRINTK("[%c] disconnected_l1va at %p is %"PRIpte"\n",
2992 PTWR_PRINT_WHICH, ptep, pte.l1);
2993 l1e_remove_flags(pte, _PAGE_RW);
2995 /* Write-protect the p.t. page in the guest page table. */
2996 if ( unlikely(__put_user(pte, ptep)) )
2998 MEM_LOG("ptwr: Could not update pte at %p", ptep);
2999 /*
3000 * Really a bug. We could write this PTE during the initial fault,
3001 * and pagetables can't have changed in the meantime.
3002 */
3003 BUG();
3006 /* Ensure that there are no stale writable mappings in any TLB. */
3007 /* NB. INVLPG is a serialising instruction: flushes pending updates. */
3008 flush_tlb_one_mask(d->cpumask, l1va);
3009 PTWR_PRINTK("[%c] disconnected_l1va at %p now %"PRIpte"\n",
3010 PTWR_PRINT_WHICH, ptep, pte.l1);
3012 /*
3013 * STEP 2. Validate any modified PTEs.
3014 */
3016 pl1e = d->arch.ptwr[which].pl1e;
3017 modified = revalidate_l1(d, pl1e, d->arch.ptwr[which].page);
3018 unmap_domain_page(pl1e);
3019 perfc_incr_histo(wpt_updates, modified, PT_UPDATES);
3020 ptwr_eip_stat_update( d->arch.ptwr[which].eip, d->domain_id, modified);
3021 d->arch.ptwr[which].prev_nr_updates = modified;
3023 /*
3024 * STEP 3. Reattach the L1 p.t. page into the current address space.
3025 */
3027 if ( which == PTWR_PT_ACTIVE )
3029 pl2e = &__linear_l2_table[d->arch.ptwr[which].l2_idx];
3030 l2e_add_flags(*pl2e, _PAGE_PRESENT);
3033 /*
3034 * STEP 4. Final tidy-up.
3035 */
3037 d->arch.ptwr[which].l1va = 0;
3039 if ( unlikely(d->arch.ptwr[which].vcpu != current) )
3040 write_ptbase(current);
3041 else
3042 TOGGLE_MODE();
3045 static int ptwr_emulated_update(
3046 unsigned long addr,
3047 physaddr_t old,
3048 physaddr_t val,
3049 unsigned int bytes,
3050 unsigned int do_cmpxchg)
3052 unsigned long pfn;
3053 struct pfn_info *page;
3054 l1_pgentry_t pte, ol1e, nl1e, *pl1e;
3055 struct domain *d = current->domain;
3057 /* Aligned access only, thank you. */
3058 if ( !access_ok(addr, bytes) || ((addr & (bytes-1)) != 0) )
3060 MEM_LOG("ptwr_emulate: Unaligned or bad size ptwr access (%d, %lx)\n",
3061 bytes, addr);
3062 return X86EMUL_UNHANDLEABLE;
3065 /* Turn a sub-word access into a full-word access. */
3066 if ( bytes != sizeof(physaddr_t) )
3068 int rc;
3069 physaddr_t full;
3070 unsigned int offset = addr & (sizeof(physaddr_t)-1);
3072 /* Align address; read full word. */
3073 addr &= ~(sizeof(physaddr_t)-1);
3074 if ( (rc = x86_emulate_read_std(addr, (unsigned long *)&full,
3075 sizeof(physaddr_t))) )
3076 return rc;
3077 /* Mask out bits provided by caller. */
3078 full &= ~((((physaddr_t)1 << (bytes*8)) - 1) << (offset*8));
3079 /* Shift the caller value and OR in the missing bits. */
3080 val &= (((physaddr_t)1 << (bytes*8)) - 1);
3081 val <<= (offset)*8;
3082 val |= full;
3083 /* Also fill in missing parts of the cmpxchg old value. */
3084 old &= (((physaddr_t)1 << (bytes*8)) - 1);
3085 old <<= (offset)*8;
3086 old |= full;
3089 /* Read the PTE that maps the page being updated. */
3090 if (__copy_from_user(&pte, &linear_pg_table[l1_linear_offset(addr)],
3091 sizeof(pte)))
3093 MEM_LOG("ptwr_emulate: Cannot read thru linear_pg_table\n");
3094 return X86EMUL_UNHANDLEABLE;
3097 pfn = l1e_get_pfn(pte);
3098 page = &frame_table[pfn];
3100 /* We are looking only for read-only mappings of p.t. pages. */
3101 if ( ((l1e_get_flags(pte) & (_PAGE_RW|_PAGE_PRESENT)) != _PAGE_PRESENT) ||
3102 ((page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table) ||
3103 (page_get_owner(page) != d) )
3105 MEM_LOG("ptwr_emulate: Page is mistyped or bad pte "
3106 "(%lx, %" PRtype_info ")\n",
3107 l1e_get_pfn(pte), page->u.inuse.type_info);
3108 return X86EMUL_UNHANDLEABLE;
3111 /* Check the new PTE. */
3112 nl1e = l1e_from_intpte(val);
3113 if ( unlikely(!get_page_from_l1e(nl1e, d)) )
3114 return X86EMUL_UNHANDLEABLE;
3116 /* Checked successfully: do the update (write or cmpxchg). */
3117 pl1e = map_domain_page(page_to_pfn(page));
3118 pl1e = (l1_pgentry_t *)((unsigned long)pl1e + (addr & ~PAGE_MASK));
3119 if ( do_cmpxchg )
3121 ol1e = l1e_from_intpte(old);
3122 if ( cmpxchg((intpte_t *)pl1e, old, val) != old )
3124 unmap_domain_page(pl1e);
3125 put_page_from_l1e(nl1e, d);
3126 return X86EMUL_CMPXCHG_FAILED;
3129 else
3131 ol1e = *pl1e;
3132 *pl1e = nl1e;
3134 unmap_domain_page(pl1e);
3136 /* Finally, drop the old PTE. */
3137 put_page_from_l1e(ol1e, d);
3139 return X86EMUL_CONTINUE;
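/*
 * The sub-word handling above widens a 1- or 2-byte guest write into a full
 * PTE-sized write by merging it with the bytes already in memory, so the
 * whole entry can be validated at once.  The same arithmetic in isolation,
 * on a 32-bit word (x86 is little-endian, so byte N sits at bit N*8):
 */
#if 0 /* standalone illustration */
#include <assert.h>
#include <stdint.h>

static uint32_t sketch_merge(uint32_t full, uint32_t val,
                             unsigned int offset, unsigned int bytes)
{
    uint32_t mask = (bytes < 4) ? ((1u << (bytes * 8)) - 1) : ~0u;

    full &= ~(mask << (offset * 8));       /* clear the bytes being written */
    val  &= mask;                          /* keep only the bytes supplied  */
    return full | (val << (offset * 8));
}

int main(void)
{
    /* Writing 0xAB to byte 2 of 0x11223344 yields 0x11AB3344. */
    assert(sketch_merge(0x11223344u, 0xAB, 2, 1) == 0x11AB3344u);
    /* Writing 0xBEEF to bytes 0-1 yields 0x1122BEEF. */
    assert(sketch_merge(0x11223344u, 0xBEEF, 0, 2) == 0x1122BEEFu);
    return 0;
}
#endif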
3142 static int ptwr_emulated_write(
3143 unsigned long addr,
3144 unsigned long val,
3145 unsigned int bytes)
3147 return ptwr_emulated_update(addr, 0, val, bytes, 0);
3150 static int ptwr_emulated_cmpxchg(
3151 unsigned long addr,
3152 unsigned long old,
3153 unsigned long new,
3154 unsigned int bytes)
3156 return ptwr_emulated_update(addr, old, new, bytes, 1);
3159 static int ptwr_emulated_cmpxchg8b(
3160 unsigned long addr,
3161 unsigned long old,
3162 unsigned long old_hi,
3163 unsigned long new,
3164 unsigned long new_hi)
3166 return ptwr_emulated_update(
3167 addr, ((u64)old_hi << 32) | old, ((u64)new_hi << 32) | new, 8, 1);
3170 static struct x86_mem_emulator ptwr_mem_emulator = {
3171 .read_std = x86_emulate_read_std,
3172 .write_std = x86_emulate_write_std,
3173 .read_emulated = x86_emulate_read_std,
3174 .write_emulated = ptwr_emulated_write,
3175 .cmpxchg_emulated = ptwr_emulated_cmpxchg,
3176 .cmpxchg8b_emulated = ptwr_emulated_cmpxchg8b
3177 };
3179 /* Write page fault handler: check if guest is trying to modify a PTE. */
3180 int ptwr_do_page_fault(struct domain *d, unsigned long addr,
3181 struct cpu_user_regs *regs)
3183 unsigned long pfn;
3184 struct pfn_info *page;
3185 l1_pgentry_t pte;
3186 l2_pgentry_t *pl2e, l2e;
3187 int which;
3188 unsigned long l2_idx;
3190 if ( unlikely(shadow_mode_enabled(d)) )
3191 return 0;
3193 /*
3194 * Attempt to read the PTE that maps the VA being accessed. By checking for
3195 * PDE validity in the L2 we avoid many expensive fixups in __get_user().
3196 */
3197 if ( !(l2e_get_flags(__linear_l2_table[l2_linear_offset(addr)]) &
3198 _PAGE_PRESENT) ||
3199 __copy_from_user(&pte,&linear_pg_table[l1_linear_offset(addr)],
3200 sizeof(pte)) )
3202 return 0;
3205 pfn = l1e_get_pfn(pte);
3206 page = &frame_table[pfn];
3208 /* We are looking only for read-only mappings of p.t. pages. */
3209 if ( ((l1e_get_flags(pte) & (_PAGE_RW|_PAGE_PRESENT)) != _PAGE_PRESENT) ||
3210 ((page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table) ||
3211 ((page->u.inuse.type_info & PGT_count_mask) == 0) ||
3212 (page_get_owner(page) != d) )
3214 return 0;
3217 #if 0 /* Leave this in as useful for debugging */
3218 goto emulate;
3219 #endif
3221 /* Get the L2 index at which this L1 p.t. is always mapped. */
3222 l2_idx = page->u.inuse.type_info & PGT_va_mask;
3223 if ( unlikely(l2_idx >= PGT_va_unknown) )
3224 goto emulate; /* Urk! This L1 is mapped in multiple L2 slots! */
3225 l2_idx >>= PGT_va_shift;
3227 if ( unlikely(l2_idx == l2_linear_offset(addr)) )
3228 goto emulate; /* Urk! Pagetable maps itself! */
3230 /*
3231 * Is the L1 p.t. mapped into the current address space? If so we call it
3232 * an ACTIVE p.t., otherwise it is INACTIVE.
3233 */
3234 pl2e = &__linear_l2_table[l2_idx];
3235 which = PTWR_PT_INACTIVE;
3237 if ( (__get_user(l2e.l2, &pl2e->l2) == 0) && (l2e_get_pfn(l2e) == pfn) )
3239 /*
3240 * Check the PRESENT bit to set ACTIVE mode.
3241 * If the PRESENT bit is clear, we may be conflicting with the current
3242 * ACTIVE p.t. (it may be the same p.t. mapped at another virt addr).
3243 * The ptwr_flush call below will restore the PRESENT bit.
3244 */
3245 if ( likely(l2e_get_flags(l2e) & _PAGE_PRESENT) ||
3246 (d->arch.ptwr[PTWR_PT_ACTIVE].l1va &&
3247 (l2_idx == d->arch.ptwr[PTWR_PT_ACTIVE].l2_idx)) )
3248 which = PTWR_PT_ACTIVE;
3251 /*
3252 * If this is a multi-processor guest then ensure that the page is hooked
3253 * into at most one L2 table, which must be the one running on this VCPU.
3254 */
3255 if ( (d->vcpu[0]->next_in_list != NULL) &&
3256 ((page->u.inuse.type_info & PGT_count_mask) !=
3257 (!!(page->u.inuse.type_info & PGT_pinned) +
3258 (which == PTWR_PT_ACTIVE))) )
3260 /* Could be conflicting writable mappings from other VCPUs. */
3261 cleanup_writable_pagetable(d);
3262 goto emulate;
3265 PTWR_PRINTK("[%c] page_fault on l1 pt at va %lx, pt for %08lx, "
3266 "pfn %lx\n", PTWR_PRINT_WHICH,
3267 addr, l2_idx << L2_PAGETABLE_SHIFT, pfn);
3269 /*
2270 * We only allow one ACTIVE and one INACTIVE p.t. to be updated at a
3271 * time. If there is already one, we must flush it out.
3272 */
3273 if ( d->arch.ptwr[which].l1va )
3274 ptwr_flush(d, which);
3276 /*
3277 * If last batch made no updates then we are probably stuck. Emulate this
3278 * update to ensure we make progress.
3279 */
3280 if ( d->arch.ptwr[which].prev_nr_updates == 0 )
3282 /* Ensure that we don't get stuck in an emulation-only rut. */
3283 d->arch.ptwr[which].prev_nr_updates = 1;
3284 goto emulate;
3287 d->arch.ptwr[which].l1va = addr | 1;
3288 d->arch.ptwr[which].l2_idx = l2_idx;
3289 d->arch.ptwr[which].vcpu = current;
3291 #ifdef PERF_ARRAYS
3292 d->arch.ptwr[which].eip = regs->eip;
3293 #endif
3295 /* For safety, disconnect the L1 p.t. page from current space. */
3296 if ( which == PTWR_PT_ACTIVE )
3298 l2e_remove_flags(*pl2e, _PAGE_PRESENT);
3299 flush_tlb_mask(d->cpumask);
3302 /* Temporarily map the L1 page, and make a copy of it. */
3303 d->arch.ptwr[which].pl1e = map_domain_page(pfn);
3304 memcpy(d->arch.ptwr[which].page,
3305 d->arch.ptwr[which].pl1e,
3306 L1_PAGETABLE_ENTRIES * sizeof(l1_pgentry_t));
3308 /* Finally, make the p.t. page writable by the guest OS. */
3309 l1e_add_flags(pte, _PAGE_RW);
3310 if ( unlikely(__put_user(pte.l1,
3311 &linear_pg_table[l1_linear_offset(addr)].l1)) )
3313 MEM_LOG("ptwr: Could not update pte at %p", (unsigned long *)
3314 &linear_pg_table[l1_linear_offset(addr)]);
3315 /* Toss the writable pagetable state and crash. */
3316 unmap_domain_page(d->arch.ptwr[which].pl1e);
3317 d->arch.ptwr[which].l1va = 0;
3318 domain_crash();
3319 return 0;
3322 return EXCRET_fault_fixed;
3324 emulate:
3325 if ( x86_emulate_memop(guest_cpu_user_regs(), addr,
3326 &ptwr_mem_emulator, BITS_PER_LONG/8) )
3327 return 0;
3328 perfc_incrc(ptwr_emulations);
3329 return EXCRET_fault_fixed;
3332 int ptwr_init(struct domain *d)
3334 void *x = alloc_xenheap_page();
3335 void *y = alloc_xenheap_page();
3337 if ( (x == NULL) || (y == NULL) )
3339 if ( x != NULL )
3340 free_xenheap_page(x);
3341 if ( y != NULL )
3342 free_xenheap_page(y);
3343 return -ENOMEM;
3346 d->arch.ptwr[PTWR_PT_ACTIVE].page = x;
3347 d->arch.ptwr[PTWR_PT_INACTIVE].page = y;
3349 return 0;
3352 void ptwr_destroy(struct domain *d)
3354 cleanup_writable_pagetable(d);
3355 free_xenheap_page(d->arch.ptwr[PTWR_PT_ACTIVE].page);
3356 free_xenheap_page(d->arch.ptwr[PTWR_PT_INACTIVE].page);
3359 void cleanup_writable_pagetable(struct domain *d)
3361 if ( unlikely(!VM_ASSIST(d, VMASST_TYPE_writable_pagetables)) )
3362 return;
3364 if ( unlikely(shadow_mode_enabled(d)) )
3366 shadow_sync_all(d);
3368 else
3370 if ( d->arch.ptwr[PTWR_PT_ACTIVE].l1va )
3371 ptwr_flush(d, PTWR_PT_ACTIVE);
3372 if ( d->arch.ptwr[PTWR_PT_INACTIVE].l1va )
3373 ptwr_flush(d, PTWR_PT_INACTIVE);
3377 int map_pages_to_xen(
3378 unsigned long virt,
3379 unsigned long pfn,
3380 unsigned long nr_pfns,
3381 unsigned long flags)
3383 l2_pgentry_t *pl2e, ol2e;
3384 l1_pgentry_t *pl1e, ol1e;
3385 unsigned int i;
3387 unsigned int map_small_pages = !!(flags & MAP_SMALL_PAGES);
3388 flags &= ~MAP_SMALL_PAGES;
3390 while ( nr_pfns != 0 )
3392 pl2e = virt_to_xen_l2e(virt);
3394 if ( ((((virt>>PAGE_SHIFT) | pfn) & ((1<<PAGETABLE_ORDER)-1)) == 0) &&
3395 (nr_pfns >= (1<<PAGETABLE_ORDER)) &&
3396 !map_small_pages )
3398 /* Super-page mapping. */
3399 ol2e = *pl2e;
3400 *pl2e = l2e_from_pfn(pfn, flags|_PAGE_PSE);
3402 if ( (l2e_get_flags(ol2e) & _PAGE_PRESENT) )
3404 local_flush_tlb_pge();
3405 if ( !(l2e_get_flags(ol2e) & _PAGE_PSE) )
3406 free_xen_pagetable(l2e_get_page(ol2e));
3409 virt += 1UL << L2_PAGETABLE_SHIFT;
3410 pfn += 1UL << PAGETABLE_ORDER;
3411 nr_pfns -= 1UL << PAGETABLE_ORDER;
3413 else
3415 /* Normal page mapping. */
3416 if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) )
3418 pl1e = page_to_virt(alloc_xen_pagetable());
3419 clear_page(pl1e);
3420 *pl2e = l2e_from_page(virt_to_page(pl1e), __PAGE_HYPERVISOR);
3422 else if ( l2e_get_flags(*pl2e) & _PAGE_PSE )
3424 pl1e = page_to_virt(alloc_xen_pagetable());
3425 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
3426 pl1e[i] = l1e_from_pfn(
3427 l2e_get_pfn(*pl2e) + i,
3428 l2e_get_flags(*pl2e) & ~_PAGE_PSE);
3429 *pl2e = l2e_from_page(virt_to_page(pl1e), __PAGE_HYPERVISOR);
3430 local_flush_tlb_pge();
3433 pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(virt);
3434 ol1e = *pl1e;
3435 *pl1e = l1e_from_pfn(pfn, flags);
3436 if ( (l1e_get_flags(ol1e) & _PAGE_PRESENT) )
3437 local_flush_tlb_one(virt);
3439 virt += 1UL << L1_PAGETABLE_SHIFT;
3440 pfn += 1UL;
3441 nr_pfns -= 1UL;
3445 return 0;
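/*
 * The super-page test above requires both the virtual frame number and the
 * physical frame number to be aligned to a full L2 mapping, with enough
 * frames remaining to fill it.  Standalone sketch; PAGETABLE_ORDER is assumed
 * to be 9 here (2MB super-pages on 4K base pages), whereas the 32-bit
 * non-PAE build uses 10.
 */
#if 0 /* standalone illustration */
#include <stdio.h>

#define SKETCH_PAGETABLE_ORDER 9u
#define SKETCH_SUPERPAGE_PFNS  (1u << SKETCH_PAGETABLE_ORDER)

static int sketch_can_use_superpage(unsigned long virt_pfn, unsigned long pfn,
                                    unsigned long nr_pfns)
{
    return (((virt_pfn | pfn) & (SKETCH_SUPERPAGE_PFNS - 1)) == 0) &&
           (nr_pfns >= SKETCH_SUPERPAGE_PFNS);
}

int main(void)
{
    printf("%d\n", sketch_can_use_superpage(0x200, 0x400, 0x200)); /* 1: eligible       */
    printf("%d\n", sketch_can_use_superpage(0x200, 0x401, 0x200)); /* 0: pfn unaligned  */
    printf("%d\n", sketch_can_use_superpage(0x200, 0x400, 0x100)); /* 0: too few frames */
    return 0;
}
#endif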
3448 void __set_fixmap(
3449 enum fixed_addresses idx, unsigned long p, unsigned long flags)
3451 if ( unlikely(idx >= __end_of_fixed_addresses) )
3452 BUG();
3453 map_pages_to_xen(fix_to_virt(idx), p >> PAGE_SHIFT, 1, flags);
3456 #ifdef MEMORY_GUARD
3458 void memguard_init(void)
3460 map_pages_to_xen(
3461 PAGE_OFFSET, 0, xenheap_phys_end >> PAGE_SHIFT,
3462 __PAGE_HYPERVISOR|MAP_SMALL_PAGES);
3465 static void __memguard_change_range(void *p, unsigned long l, int guard)
3467 unsigned long _p = (unsigned long)p;
3468 unsigned long _l = (unsigned long)l;
3469 unsigned long flags = __PAGE_HYPERVISOR | MAP_SMALL_PAGES;
3471 /* Ensure we are dealing with a page-aligned whole number of pages. */
3472 ASSERT((_p&PAGE_MASK) != 0);
3473 ASSERT((_l&PAGE_MASK) != 0);
3474 ASSERT((_p&~PAGE_MASK) == 0);
3475 ASSERT((_l&~PAGE_MASK) == 0);
3477 if ( guard )
3478 flags &= ~_PAGE_PRESENT;
3480 map_pages_to_xen(
3481 _p, virt_to_phys(p) >> PAGE_SHIFT, _l >> PAGE_SHIFT, flags);
3484 void memguard_guard_range(void *p, unsigned long l)
3486 __memguard_change_range(p, l, 1);
3489 void memguard_unguard_range(void *p, unsigned long l)
3491 __memguard_change_range(p, l, 0);
3494 #endif
3496 /*
3497 * Local variables:
3498 * mode: C
3499 * c-set-style: "BSD"
3500 * c-basic-offset: 4
3501 * tab-width: 4
3502 * indent-tabs-mode: nil
3503 * End:
3504 */