xen/arch/x86/mm.c @ 5585:b08cd3331fdb (ia64/xen-unstable)

bitkeeper revision 1.1760 (42c05ebeLIfrneiw1jaZMwle-z9usw)

Check set_gdt() bounds before copy_from_user.
Signed-off-by: Chris Wright <chrisw@osdl.org>
author kaf24@firebug.cl.cam.ac.uk
date Mon Jun 27 20:17:02 2005 +0000 (2005-06-27)
parents 30082c72ed69
children ec3b7c87b577
line source
1 /******************************************************************************
2 * arch/x86/mm.c
3 *
4 * Copyright (c) 2002-2005 K A Fraser
5 * Copyright (c) 2004 Christian Limpach
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
22 /*
23 * A description of the x86 page table API:
24 *
25 * Domains trap to do_mmu_update with a list of update requests.
26 * This is a list of (ptr, val) pairs, where the requested operation
27 * is *ptr = val.
28 *
29 * Reference counting of pages:
30 * ----------------------------
31 * Each page has two refcounts: tot_count and type_count.
32 *
33 * TOT_COUNT is the obvious reference count. It counts all uses of a
34 * physical page frame by a domain, including uses as a page directory,
35 * a page table, or simple mappings via a PTE. This count prevents a
36 * domain from releasing a frame back to the free pool when it still holds
37 * a reference to it.
38 *
39 * TYPE_COUNT is more subtle. A frame can be put to one of three
40 * mutually-exclusive uses: it might be used as a page directory, or a
41 * page table, or it may be mapped writable by the domain [of course, a
42 * frame might be used in none of these three ways!].
43 * So, type_count is a count of the number of times a frame is being
44 * referred to in its current incarnation. Therefore, a page can only
45 * change its type when its type count is zero.
46 *
47 * Pinning the page type:
48 * ----------------------
49 * The type of a page can be pinned/unpinned with the commands
50 * MMUEXT_[UN]PIN_L?_TABLE. Each page can be pinned exactly once (that is,
51 * pinning is not reference counted, so it can't be nested).
52 * This is useful to prevent a page's type count falling to zero, at which
53 * point safety checks would need to be carried out next time the count
54 * is increased again.
55 *
56 * A further note on writable page mappings:
57 * -----------------------------------------
58 * For simplicity, the count of writable mappings for a page may not
59 * correspond to reality. The 'writable count' is incremented for every
60 * PTE which maps the page with the _PAGE_RW flag set. However, for
61 * write access to be possible the page directory entry must also have
62 * its _PAGE_RW bit set. We do not check this as it complicates the
63 * reference counting considerably [consider the case of multiple
64 * directory entries referencing a single page table, some with the RW
65 * bit set, others not -- it starts getting a bit messy].
66 * In normal use, this simplification shouldn't be a problem.
67 * However, the logic can be added if required.
68 *
69 * One more note on read-only page mappings:
70 * -----------------------------------------
71 * We want domains to be able to map pages for read-only access. The
72 * main reason is that page tables and directories should be readable
73 * by a domain, but it would not be safe for them to be writable.
74 * However, domains have free access to rings 1 & 2 of the Intel
75 * privilege model. In terms of page protection, these are considered
76 * to be part of 'supervisor mode'. The WP bit in CR0 controls whether
77 * read-only restrictions are respected in supervisor mode -- if the
78 * bit is clear then any mapped page is writable.
79 *
80 * We get round this by always setting the WP bit and disallowing
81 * updates to it. This is very unlikely to cause a problem for guest
82 * OS's, which will generally use the WP bit to simplify copy-on-write
83 * implementation (in that case, OS wants a fault when it writes to
84 * an application-supplied buffer).
85 */
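/*
 * A minimal guest-side sketch of one batched request to do_mmu_update(),
 * to make the (ptr, val) interface above concrete. It assumes the public
 * mmu_update_t layout and the HYPERVISOR_mmu_update() wrapper exposed to
 * paravirtualised guests of this era; 'pte_maddr', 'new_val', 'mfn' and
 * 'gpfn' are illustrative placeholders.
 *
 *     mmu_update_t req[2];
 *     unsigned int done = 0;
 *
 *     // Normal PT update: the command lives in the low bits of 'ptr';
 *     // the rest is the machine address of the PTE to write.
 *     req[0].ptr = pte_maddr | MMU_NORMAL_PT_UPDATE;
 *     req[0].val = new_val;
 *
 *     // Machine->physical table update for frame 'mfn'.
 *     req[1].ptr = (mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE;
 *     req[1].val = gpfn;
 *
 *     if ( HYPERVISOR_mmu_update(req, 2, &done, DOMID_SELF) != 0 )
 *         BUG();  // 'done' reports how many requests were applied
 */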
87 #include <xen/config.h>
88 #include <xen/init.h>
89 #include <xen/kernel.h>
90 #include <xen/lib.h>
91 #include <xen/mm.h>
92 #include <xen/sched.h>
93 #include <xen/errno.h>
94 #include <xen/perfc.h>
95 #include <xen/irq.h>
96 #include <xen/softirq.h>
97 #include <xen/domain_page.h>
98 #include <asm/shadow.h>
99 #include <asm/page.h>
100 #include <asm/flushtlb.h>
101 #include <asm/io.h>
102 #include <asm/uaccess.h>
103 #include <asm/ldt.h>
104 #include <asm/x86_emulate.h>
106 #ifdef VERBOSE
107 #define MEM_LOG(_f, _a...) \
108 printk("DOM%u: (file=mm.c, line=%d) " _f "\n", \
109 current->domain->domain_id , __LINE__ , ## _a )
110 #else
111 #define MEM_LOG(_f, _a...) ((void)0)
112 #endif
114 /*
115 * Both do_mmuext_op() and do_mmu_update():
116 * We steal the m.s.b. of the @count parameter to indicate whether this
117 * invocation of do_mmu_update() is resuming a previously preempted call.
118 */
119 #define MMU_UPDATE_PREEMPTED (~(~0U>>1))
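/*
 * A sketch of the preemption pattern, mirroring the code in
 * do_mmuext_op()/do_mmu_update() below:
 *
 *     // When yielding after i of count requests, re-enter with the
 *     // remaining count tagged as a continuation:
 *     rc = hypercall4_create_continuation(
 *         __HYPERVISOR_mmu_update, ureqs,
 *         (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
 *
 *     // On re-entry, strip the flag before processing the remainder:
 *     if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
 *         count &= ~MMU_UPDATE_PREEMPTED;
 */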
121 static void free_l2_table(struct pfn_info *page);
122 static void free_l1_table(struct pfn_info *page);
124 static int mod_l2_entry(l2_pgentry_t *, l2_pgentry_t, unsigned long,
125 unsigned int type);
126 static int mod_l1_entry(l1_pgentry_t *, l1_pgentry_t);
128 /* Used to defer flushing of memory structures. */
129 static struct {
130 #define DOP_FLUSH_TLB (1<<0) /* Flush the TLB. */
131 #define DOP_RELOAD_LDT (1<<1) /* Reload the LDT shadow mapping. */
132 unsigned int deferred_ops;
133 /* If non-NULL, specifies a foreign subject domain for some operations. */
134 struct domain *foreign;
135 } __cacheline_aligned percpu_info[NR_CPUS];
137 /*
138 * Returns the current foreign domain; defaults to the currently-executing
139 * domain if a foreign override hasn't been specified.
140 */
141 #define FOREIGNDOM (percpu_info[smp_processor_id()].foreign ? : current->domain)
143 /* Private domain structs for DOMID_XEN and DOMID_IO. */
144 static struct domain *dom_xen, *dom_io;
146 /* Frame table and its size in pages. */
147 struct pfn_info *frame_table;
148 unsigned long max_page;
150 void __init init_frametable(void)
151 {
152 unsigned long nr_pages, page_step, i, pfn;
154 frame_table = (struct pfn_info *)FRAMETABLE_VIRT_START;
156 nr_pages = PFN_UP(max_page * sizeof(*frame_table));
157 page_step = (1 << L2_PAGETABLE_SHIFT) >> PAGE_SHIFT;
159 for ( i = 0; i < nr_pages; i += page_step )
160 {
161 pfn = alloc_boot_pages(min(nr_pages - i, page_step), page_step);
162 if ( pfn == 0 )
163 panic("Not enough memory for frame table\n");
164 map_pages_to_xen(
165 FRAMETABLE_VIRT_START + (i << PAGE_SHIFT),
166 pfn, page_step, PAGE_HYPERVISOR);
167 }
169 memset(frame_table, 0, nr_pages << PAGE_SHIFT);
170 }
172 void arch_init_memory(void)
173 {
174 extern void subarch_init_memory(struct domain *);
176 unsigned long i, pfn, rstart_pfn, rend_pfn;
177 struct pfn_info *page;
179 memset(percpu_info, 0, sizeof(percpu_info));
181 /*
182 * Initialise our DOMID_XEN domain.
183 * Any Xen-heap pages that we will allow to be mapped will have
184 * their domain field set to dom_xen.
185 */
186 dom_xen = alloc_domain_struct();
187 atomic_set(&dom_xen->refcnt, 1);
188 dom_xen->domain_id = DOMID_XEN;
190 /*
191 * Initialise our DOMID_IO domain.
192 * This domain owns I/O pages that are within the range of the pfn_info
193 * array. Mappings occur at the privilege level of the caller.
194 */
195 dom_io = alloc_domain_struct();
196 atomic_set(&dom_io->refcnt, 1);
197 dom_io->domain_id = DOMID_IO;
199 /* First 1MB of RAM is historically marked as I/O. */
200 for ( i = 0; i < 0x100; i++ )
201 {
202 page = &frame_table[i];
203 page->count_info = PGC_allocated | 1;
204 page->u.inuse.type_info = PGT_writable_page | PGT_validated | 1;
205 page_set_owner(page, dom_io);
206 }
208 /* Any areas not specified as RAM by the e820 map are considered I/O. */
209 for ( i = 0, pfn = 0; i < e820.nr_map; i++ )
210 {
211 if ( e820.map[i].type != E820_RAM )
212 continue;
213 /* Every page from cursor to start of next RAM region is I/O. */
214 rstart_pfn = PFN_UP(e820.map[i].addr);
215 rend_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
216 while ( pfn < rstart_pfn )
217 {
218 BUG_ON(!pfn_valid(pfn));
219 page = &frame_table[pfn++];
220 page->count_info = PGC_allocated | 1;
221 page->u.inuse.type_info = PGT_writable_page | PGT_validated | 1;
222 page_set_owner(page, dom_io);
223 }
224 /* Skip the RAM region. */
225 pfn = rend_pfn;
226 }
227 BUG_ON(pfn != max_page);
229 subarch_init_memory(dom_xen);
230 }
232 void write_ptbase(struct vcpu *v)
233 {
234 write_cr3(pagetable_get_paddr(v->arch.monitor_table));
235 }
237 void invalidate_shadow_ldt(struct vcpu *v)
238 {
239 int i;
240 unsigned long pfn;
241 struct pfn_info *page;
243 if ( v->arch.shadow_ldt_mapcnt == 0 )
244 return;
246 v->arch.shadow_ldt_mapcnt = 0;
248 for ( i = 16; i < 32; i++ )
249 {
250 pfn = l1e_get_pfn(v->arch.perdomain_ptes[i]);
251 if ( pfn == 0 ) continue;
252 v->arch.perdomain_ptes[i] = l1e_empty();
253 page = &frame_table[pfn];
254 ASSERT_PAGE_IS_TYPE(page, PGT_ldt_page);
255 ASSERT_PAGE_IS_DOMAIN(page, v->domain);
256 put_page_and_type(page);
257 }
259 /* Dispose of the (now possibly invalid) mappings from the TLB. */
260 percpu_info[v->processor].deferred_ops |= DOP_FLUSH_TLB | DOP_RELOAD_LDT;
261 }
264 static int alloc_segdesc_page(struct pfn_info *page)
265 {
266 struct desc_struct *descs;
267 int i;
269 descs = map_domain_page(page_to_pfn(page));
271 for ( i = 0; i < 512; i++ )
272 if ( unlikely(!check_descriptor(&descs[i])) )
273 goto fail;
275 unmap_domain_page(descs);
276 return 1;
278 fail:
279 unmap_domain_page(descs);
280 return 0;
281 }
284 /* Map shadow page at offset @off. */
285 int map_ldt_shadow_page(unsigned int off)
286 {
287 struct vcpu *v = current;
288 struct domain *d = v->domain;
289 unsigned long gpfn, gmfn;
290 l1_pgentry_t l1e, nl1e;
291 unsigned long gva = v->arch.guest_context.ldt_base + (off << PAGE_SHIFT);
292 int res;
294 #if defined(__x86_64__)
295 /* If in user mode, switch to kernel mode just to read LDT mapping. */
296 extern void toggle_guest_mode(struct vcpu *);
297 int user_mode = !(v->arch.flags & TF_kernel_mode);
298 #define TOGGLE_MODE() if ( user_mode ) toggle_guest_mode(v)
299 #elif defined(__i386__)
300 #define TOGGLE_MODE() ((void)0)
301 #endif
303 BUG_ON(unlikely(in_irq()));
305 shadow_sync_va(v, gva);
307 TOGGLE_MODE();
308 __copy_from_user(&l1e, &linear_pg_table[l1_linear_offset(gva)],
309 sizeof(l1e));
310 TOGGLE_MODE();
312 if ( unlikely(!(l1e_get_flags(l1e) & _PAGE_PRESENT)) )
313 return 0;
315 gpfn = l1e_get_pfn(l1e);
316 gmfn = __gpfn_to_mfn(d, gpfn);
317 if ( unlikely(!VALID_MFN(gmfn)) )
318 return 0;
320 res = get_page_and_type(&frame_table[gmfn], d, PGT_ldt_page);
322 if ( !res && unlikely(shadow_mode_refcounts(d)) )
323 {
324 shadow_lock(d);
325 shadow_remove_all_write_access(d, gpfn, gmfn);
326 res = get_page_and_type(&frame_table[gmfn], d, PGT_ldt_page);
327 shadow_unlock(d);
328 }
330 if ( unlikely(!res) )
331 return 0;
333 nl1e = l1e_from_pfn(gmfn, l1e_get_flags(l1e) | _PAGE_RW);
335 v->arch.perdomain_ptes[off + 16] = nl1e;
336 v->arch.shadow_ldt_mapcnt++;
338 return 1;
339 }
342 static int get_page_from_pagenr(unsigned long page_nr, struct domain *d)
343 {
344 struct pfn_info *page = &frame_table[page_nr];
346 if ( unlikely(!pfn_valid(page_nr)) || unlikely(!get_page(page, d)) )
347 {
348 MEM_LOG("Could not get page ref for pfn %lx", page_nr);
349 return 0;
350 }
352 return 1;
353 }
356 static int get_page_and_type_from_pagenr(unsigned long page_nr,
357 u32 type,
358 struct domain *d)
359 {
360 struct pfn_info *page = &frame_table[page_nr];
362 if ( unlikely(!get_page_from_pagenr(page_nr, d)) )
363 return 0;
365 if ( unlikely(!get_page_type(page, type)) )
366 {
367 if ( (type & PGT_type_mask) != PGT_l1_page_table )
368 MEM_LOG("Bad page type for pfn %lx (%08x)",
369 page_nr, page->u.inuse.type_info);
370 put_page(page);
371 return 0;
372 }
374 return 1;
375 }
377 /*
378 * We allow root tables to map each other (a.k.a. linear page tables). It
379 * needs some special care with reference counts and access permissions:
380 * 1. The mapping entry must be read-only, or the guest may get write access
381 * to its own PTEs.
382 * 2. We must only bump the reference counts for an *already validated*
383 * L2 table, or we can end up in a deadlock in get_page_type() by waiting
384 * on a validation that is required to complete that validation.
385 * 3. We only need to increment the reference counts for the mapped page
386 * frame if it is mapped by a different root table. This is sufficient and
387 * also necessary to allow validation of a root table mapping itself.
388 */
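/*
 * For the two-level case, the construct validated below looks like this
 * from the guest's side (a sketch using this file's l2e_from_pfn()
 * constructor; 'l2_mfn' names the validated root table's machine frame):
 * a root-table slot, submitted via do_mmu_update(), that points back at
 * a root table with _PAGE_RW clear, per rule 1 above:
 *
 *     l2_pgentry_t linear_e = l2e_from_pfn(l2_mfn, _PAGE_PRESENT);
 */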
389 static int
390 get_linear_pagetable(
391 root_pgentry_t re, unsigned long re_pfn, struct domain *d)
392 {
393 u32 x, y;
394 struct pfn_info *page;
395 unsigned long pfn;
397 ASSERT( !shadow_mode_refcounts(d) );
399 if ( (root_get_flags(re) & _PAGE_RW) )
400 {
401 MEM_LOG("Attempt to create linear p.t. with write perms");
402 return 0;
403 }
405 if ( (pfn = root_get_pfn(re)) != re_pfn )
406 {
407 /* Make sure the mapped frame belongs to the correct domain. */
408 if ( unlikely(!get_page_from_pagenr(pfn, d)) )
409 return 0;
411 /*
412 * Make sure that the mapped frame is an already-validated L2 table.
413 * If so, atomically increment the count (checking for overflow).
414 */
415 page = &frame_table[pfn];
416 y = page->u.inuse.type_info;
417 do {
418 x = y;
419 if ( unlikely((x & PGT_count_mask) == PGT_count_mask) ||
420 unlikely((x & (PGT_type_mask|PGT_validated)) !=
421 (PGT_root_page_table|PGT_validated)) )
422 {
423 put_page(page);
424 return 0;
425 }
426 }
427 while ( (y = cmpxchg(&page->u.inuse.type_info, x, x + 1)) != x );
428 }
430 return 1;
431 }
433 int
434 get_page_from_l1e(
435 l1_pgentry_t l1e, struct domain *d)
436 {
437 unsigned long mfn = l1e_get_pfn(l1e);
438 struct pfn_info *page = &frame_table[mfn];
439 extern int domain_iomem_in_pfn(struct domain *d, unsigned long pfn);
441 if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) )
442 return 1;
444 if ( unlikely(l1e_get_flags(l1e) & L1_DISALLOW_MASK) )
445 {
446 MEM_LOG("Bad L1 flags %x\n", l1e_get_flags(l1e) & L1_DISALLOW_MASK);
447 return 0;
448 }
450 if ( unlikely(!pfn_valid(mfn)) ||
451 unlikely(page_get_owner(page) == dom_io) )
452 {
453 /* DOMID_IO reverts to caller for privilege checks. */
454 if ( d == dom_io )
455 d = current->domain;
457 if ( (!IS_PRIV(d)) &&
458 (!IS_CAPABLE_PHYSDEV(d) || !domain_iomem_in_pfn(d, mfn)) )
459 {
460 MEM_LOG("Non-privileged attempt to map I/O space %08lx", mfn);
461 return 0;
462 }
464 /* No reference counting for out-of-range I/O pages. */
465 if ( !pfn_valid(mfn) )
466 return 1;
468 d = dom_io;
469 }
471 return ((l1e_get_flags(l1e) & _PAGE_RW) ?
472 get_page_and_type(page, d, PGT_writable_page) :
473 get_page(page, d));
474 }
477 /* NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. */
478 static int
479 get_page_from_l2e(
480 l2_pgentry_t l2e, unsigned long pfn,
481 struct domain *d, unsigned long vaddr)
482 {
483 int rc;
485 ASSERT(!shadow_mode_refcounts(d));
487 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
488 return 1;
490 if ( unlikely((l2e_get_flags(l2e) & L2_DISALLOW_MASK)) )
491 {
492 MEM_LOG("Bad L2 flags %x\n", l2e_get_flags(l2e) & L2_DISALLOW_MASK);
493 return 0;
494 }
496 vaddr >>= L2_PAGETABLE_SHIFT;
497 vaddr <<= PGT_va_shift;
498 rc = get_page_and_type_from_pagenr(
499 l2e_get_pfn(l2e), PGT_l1_page_table | vaddr, d);
501 #if CONFIG_PAGING_LEVELS == 2
502 if (!rc)
503 rc = get_linear_pagetable(l2e, pfn, d);
504 #endif
505 return rc;
506 }
509 #if CONFIG_PAGING_LEVELS >= 3
511 static int
512 get_page_from_l3e(
513 l3_pgentry_t l3e, unsigned long pfn,
514 struct domain *d, unsigned long vaddr)
515 {
516 ASSERT( !shadow_mode_refcounts(d) );
518 int rc;
520 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
521 return 1;
523 if ( unlikely((l3e_get_flags(l3e) & L3_DISALLOW_MASK)) )
524 {
525 MEM_LOG("Bad L3 flags %x\n", l3e_get_flags(l3e) & L3_DISALLOW_MASK);
526 return 0;
527 }
529 vaddr >>= L3_PAGETABLE_SHIFT;
530 vaddr <<= PGT_va_shift;
531 rc = get_page_and_type_from_pagenr(
532 l3e_get_pfn(l3e),
533 PGT_l2_page_table | vaddr, d);
534 #if CONFIG_PAGING_LEVELS == 3
535 if (!rc)
536 rc = get_linear_pagetable(l3e, pfn, d);
537 #endif
538 return rc;
539 }
541 #endif /* 3 level */
543 #if CONFIG_PAGING_LEVELS >= 4
545 static int
546 get_page_from_l4e(
547 l4_pgentry_t l4e, unsigned long pfn, struct domain *d)
548 {
549 int rc;
551 ASSERT( !shadow_mode_refcounts(d) );
553 if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
554 return 1;
556 if ( unlikely((l4e_get_flags(l4e) & L4_DISALLOW_MASK)) )
557 {
558 MEM_LOG("Bad L4 flags %x\n", l4e_get_flags(l4e) & L4_DISALLOW_MASK);
559 return 0;
560 }
562 rc = get_page_and_type_from_pagenr(
563 l4e_get_pfn(l4e), PGT_l3_page_table, d);
565 if ( unlikely(!rc) )
566 return get_linear_pagetable(l4e, pfn, d);
568 return 1;
569 }
571 #endif /* 4 level */
574 void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d)
575 {
576 unsigned long pfn = l1e_get_pfn(l1e);
577 struct pfn_info *page = &frame_table[pfn];
578 struct domain *e;
580 if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) || !pfn_valid(pfn) )
581 return;
583 e = page_get_owner(page);
584 if ( unlikely(e != d) )
585 {
586 /*
587 * Unmap a foreign page that may have been mapped via a grant table.
588 * Note that this can fail for a privileged domain that can map foreign
589 * pages via MMUEXT_SET_FOREIGNDOM. Such domains can have some mappings
590 * counted via a grant entry and some counted directly in the page
591 * structure's reference count. Note that reference counts won't get
592 * dangerously confused as long as we always try to decrement the
593 * grant entry first. We may end up with a mismatch between which
594 * mappings and which unmappings are counted via the grant entry, but
595 * really it doesn't matter as privileged domains have carte blanche.
596 */
597 if (likely(gnttab_check_unmap(e, d, pfn,
598 !(l1e_get_flags(l1e) & _PAGE_RW))))
599 return;
600 /* Assume this mapping was made via MMUEXT_SET_FOREIGNDOM... */
601 }
603 if ( l1e_get_flags(l1e) & _PAGE_RW )
604 {
605 put_page_and_type(page);
606 }
607 else
608 {
609 /* We expect this is rare so we blow the entire shadow LDT. */
610 if ( unlikely(((page->u.inuse.type_info & PGT_type_mask) ==
611 PGT_ldt_page)) &&
612 unlikely(((page->u.inuse.type_info & PGT_count_mask) != 0)) )
614 // XXX SMP BUG?
615 invalidate_shadow_ldt(e->vcpu[0]);
616 put_page(page);
617 }
618 }
621 /*
622 * NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'.
623 * Note also that this automatically deals correctly with linear p.t.'s.
624 */
625 static void put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
626 {
627 if ( (l2e_get_flags(l2e) & _PAGE_PRESENT) &&
628 (l2e_get_pfn(l2e) != pfn) )
629 put_page_and_type(&frame_table[l2e_get_pfn(l2e)]);
630 }
633 #if CONFIG_PAGING_LEVELS >= 3
635 static void put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn)
636 {
637 if ( (l3e_get_flags(l3e) & _PAGE_PRESENT) &&
638 (l3e_get_pfn(l3e) != pfn) )
639 put_page_and_type(&frame_table[l3e_get_pfn(l3e)]);
640 }
642 #endif
644 #if CONFIG_PAGING_LEVELS >= 4
646 static void put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn)
647 {
648 if ( (l4e_get_flags(l4e) & _PAGE_PRESENT) &&
649 (l4e_get_pfn(l4e) != pfn) )
650 put_page_and_type(&frame_table[l4e_get_pfn(l4e)]);
651 }
653 #endif
656 static int alloc_l1_table(struct pfn_info *page)
657 {
658 struct domain *d = page_get_owner(page);
659 unsigned long pfn = page_to_pfn(page);
660 l1_pgentry_t *pl1e;
661 int i;
663 ASSERT(!shadow_mode_refcounts(d));
665 pl1e = map_domain_page(pfn);
667 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
668 if ( is_guest_l1_slot(i) &&
669 unlikely(!get_page_from_l1e(pl1e[i], d)) )
670 goto fail;
672 unmap_domain_page(pl1e);
673 return 1;
675 fail:
676 while ( i-- > 0 )
677 if ( is_guest_l1_slot(i) )
678 put_page_from_l1e(pl1e[i], d);
680 unmap_domain_page(pl1e);
681 return 0;
682 }
684 #ifdef CONFIG_X86_PAE
685 static int create_pae_xen_mappings(l3_pgentry_t *pl3e)
686 {
687 struct pfn_info *page;
688 l2_pgentry_t *pl2e;
689 l3_pgentry_t l3e3;
690 int i;
692 pl3e = (l3_pgentry_t *)((unsigned long)pl3e & PAGE_MASK);
694 /* 3rd L3 slot contains L2 with Xen-private mappings. It *must* exist. */
695 l3e3 = pl3e[3];
696 if ( !(l3e_get_flags(l3e3) & _PAGE_PRESENT) )
697 {
698 MEM_LOG("PAE L3 3rd slot is empty");
699 return 0;
700 }
702 /*
703 * The Xen-private mappings include linear mappings. The L2 thus cannot
704 * be shared by multiple L3 tables. The test here is adequate because:
705 * 1. Cannot appear in slots != 3 because the page would then have an
706 * unknown va backpointer, which get_page_type() explicitly disallows.
707 * 2. Cannot appear in another page table's L3:
708 * a. alloc_l3_table() calls this function and this check will fail
709 * b. mod_l3_entry() disallows updates to slot 3 in an existing table
710 */
711 page = l3e_get_page(l3e3);
712 BUG_ON(page->u.inuse.type_info & PGT_pinned);
713 BUG_ON((page->u.inuse.type_info & PGT_count_mask) == 0);
714 if ( (page->u.inuse.type_info & PGT_count_mask) != 1 )
715 {
716 MEM_LOG("PAE L3 3rd slot is shared");
717 return 0;
718 }
720 /* Xen private mappings. */
721 pl2e = map_domain_page(l3e_get_pfn(l3e3));
722 memcpy(&pl2e[L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1)],
723 &idle_pg_table_l2[L2_PAGETABLE_FIRST_XEN_SLOT],
724 L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
725 for ( i = 0; i < (PERDOMAIN_MBYTES >> (L2_PAGETABLE_SHIFT - 20)); i++ )
726 pl2e[l2_table_offset(PERDOMAIN_VIRT_START) + i] =
727 l2e_from_page(
728 virt_to_page(page_get_owner(page)->arch.mm_perdomain_pt) + i,
729 __PAGE_HYPERVISOR);
730 for ( i = 0; i < (LINEARPT_MBYTES >> (L2_PAGETABLE_SHIFT - 20)); i++ )
731 pl2e[l2_table_offset(LINEAR_PT_VIRT_START) + i] =
732 (l3e_get_flags(pl3e[i]) & _PAGE_PRESENT) ?
733 l2e_from_pfn(l3e_get_pfn(pl3e[i]), __PAGE_HYPERVISOR) :
734 l2e_empty();
735 unmap_domain_page(pl2e);
737 return 1;
738 }
740 static inline int l1_backptr(
741 unsigned long *backptr, unsigned long offset_in_l2, unsigned long l2_type)
742 {
743 unsigned long l2_backptr = l2_type & PGT_va_mask;
744 BUG_ON(l2_backptr == PGT_va_unknown);
745 if ( l2_backptr == PGT_va_mutable )
746 return 0;
747 *backptr =
748 ((l2_backptr >> PGT_va_shift) << L3_PAGETABLE_SHIFT) |
749 (offset_in_l2 << L2_PAGETABLE_SHIFT);
750 return 1;
751 }
753 #else
754 # define create_pae_xen_mappings(pl3e) (1)
755 # define l1_backptr(bp,l2o,l2t) \
756 ({ *(bp) = (unsigned long)(l2o) << L2_PAGETABLE_SHIFT; 1; })
757 #endif
759 static int alloc_l2_table(struct pfn_info *page, unsigned int type)
760 {
761 struct domain *d = page_get_owner(page);
762 unsigned long pfn = page_to_pfn(page);
763 unsigned long vaddr;
764 l2_pgentry_t *pl2e;
765 int i;
767 /* See the code in shadow_promote() to understand why this is here. */
768 if ( (PGT_base_page_table == PGT_l2_page_table) &&
769 unlikely(shadow_mode_refcounts(d)) )
770 return 1;
771 ASSERT(!shadow_mode_refcounts(d));
773 pl2e = map_domain_page(pfn);
775 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
776 {
777 if ( !l1_backptr(&vaddr, i, type) )
778 goto fail;
779 if ( is_guest_l2_slot(type, i) &&
780 unlikely(!get_page_from_l2e(pl2e[i], pfn, d, vaddr)) )
781 goto fail;
782 }
784 #if CONFIG_PAGING_LEVELS == 2
785 /* Xen private mappings. */
786 memcpy(&pl2e[L2_PAGETABLE_FIRST_XEN_SLOT],
787 &idle_pg_table[L2_PAGETABLE_FIRST_XEN_SLOT],
788 L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
789 pl2e[l2_table_offset(LINEAR_PT_VIRT_START)] =
790 l2e_from_pfn(pfn, __PAGE_HYPERVISOR);
791 pl2e[l2_table_offset(PERDOMAIN_VIRT_START)] =
792 l2e_from_page(
793 virt_to_page(page_get_owner(page)->arch.mm_perdomain_pt),
794 __PAGE_HYPERVISOR);
795 #endif
797 unmap_domain_page(pl2e);
798 return 1;
800 fail:
801 while ( i-- > 0 )
802 if ( is_guest_l2_slot(type, i) )
803 put_page_from_l2e(pl2e[i], pfn);
805 unmap_domain_page(pl2e);
806 return 0;
807 }
810 #if CONFIG_PAGING_LEVELS >= 3
811 static int alloc_l3_table(struct pfn_info *page)
812 {
813 struct domain *d = page_get_owner(page);
814 unsigned long pfn = page_to_pfn(page);
815 unsigned long vaddr;
816 l3_pgentry_t *pl3e;
817 int i;
819 ASSERT(!shadow_mode_refcounts(d));
821 pl3e = map_domain_page(pfn);
822 for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
823 {
824 vaddr = (unsigned long)i << L3_PAGETABLE_SHIFT;
825 if ( is_guest_l3_slot(i) &&
826 unlikely(!get_page_from_l3e(pl3e[i], pfn, d, vaddr)) )
827 goto fail;
828 }
830 if ( !create_pae_xen_mappings(pl3e) )
831 goto fail;
833 unmap_domain_page(pl3e);
834 return 1;
836 fail:
837 while ( i-- > 0 )
838 if ( is_guest_l3_slot(i) )
839 put_page_from_l3e(pl3e[i], pfn);
841 unmap_domain_page(pl3e);
842 return 0;
843 }
844 #else
845 #define alloc_l3_table(page) (0)
846 #endif
848 #if CONFIG_PAGING_LEVELS >= 4
849 static int alloc_l4_table(struct pfn_info *page)
850 {
851 struct domain *d = page_get_owner(page);
852 unsigned long pfn = page_to_pfn(page);
853 l4_pgentry_t *pl4e = page_to_virt(page);
854 int i;
856 /* See the code in shadow_promote() to understand why this is here. */
857 if ( (PGT_base_page_table == PGT_l4_page_table) &&
858 shadow_mode_refcounts(d) )
859 return 1;
860 ASSERT(!shadow_mode_refcounts(d));
862 for ( i = 0; i < L4_PAGETABLE_ENTRIES; i++ )
863 if ( is_guest_l4_slot(i) &&
864 unlikely(!get_page_from_l4e(pl4e[i], pfn, d)) )
865 goto fail;
867 /* Xen private mappings. */
868 memcpy(&pl4e[ROOT_PAGETABLE_FIRST_XEN_SLOT],
869 &idle_pg_table[ROOT_PAGETABLE_FIRST_XEN_SLOT],
870 ROOT_PAGETABLE_XEN_SLOTS * sizeof(l4_pgentry_t));
871 pl4e[l4_table_offset(LINEAR_PT_VIRT_START)] =
872 l4e_from_pfn(pfn, __PAGE_HYPERVISOR);
873 pl4e[l4_table_offset(PERDOMAIN_VIRT_START)] =
874 l4e_from_page(
875 virt_to_page(page_get_owner(page)->arch.mm_perdomain_l3),
876 __PAGE_HYPERVISOR);
878 return 1;
880 fail:
881 while ( i-- > 0 )
882 if ( is_guest_l4_slot(i) )
883 put_page_from_l4e(pl4e[i], pfn);
885 return 0;
886 }
887 #else
888 #define alloc_l4_table(page) (0)
889 #endif
892 static void free_l1_table(struct pfn_info *page)
893 {
894 struct domain *d = page_get_owner(page);
895 unsigned long pfn = page_to_pfn(page);
896 l1_pgentry_t *pl1e;
897 int i;
899 pl1e = map_domain_page(pfn);
901 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
902 if ( is_guest_l1_slot(i) )
903 put_page_from_l1e(pl1e[i], d);
905 unmap_domain_page(pl1e);
906 }
909 static void free_l2_table(struct pfn_info *page)
910 {
911 unsigned long pfn = page_to_pfn(page);
912 l2_pgentry_t *pl2e;
913 int i;
915 pl2e = map_domain_page(pfn);
917 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
918 if ( is_guest_l2_slot(page->u.inuse.type_info, i) )
919 put_page_from_l2e(pl2e[i], pfn);
921 unmap_domain_page(pl2e);
922 }
925 #if CONFIG_PAGING_LEVELS >= 3
927 static void free_l3_table(struct pfn_info *page)
928 {
929 unsigned long pfn = page_to_pfn(page);
930 l3_pgentry_t *pl3e;
931 int i;
933 pl3e = map_domain_page(pfn);
935 for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
936 if ( is_guest_l3_slot(i) )
937 put_page_from_l3e(pl3e[i], pfn);
939 unmap_domain_page(pl3e);
940 }
942 #endif
944 #if CONFIG_PAGING_LEVELS >= 4
946 static void free_l4_table(struct pfn_info *page)
947 {
948 unsigned long pfn = page_to_pfn(page);
949 l4_pgentry_t *pl4e = page_to_virt(page);
950 int i;
952 for ( i = 0; i < L4_PAGETABLE_ENTRIES; i++ )
953 if ( is_guest_l4_slot(i) )
954 put_page_from_l4e(pl4e[i], pfn);
955 }
957 #endif
959 static inline int update_l1e(l1_pgentry_t *pl1e,
960 l1_pgentry_t ol1e,
961 l1_pgentry_t nl1e)
962 {
963 intpte_t o = l1e_get_intpte(ol1e);
964 intpte_t n = l1e_get_intpte(nl1e);
966 if ( unlikely(cmpxchg_user(pl1e, o, n) != 0) ||
967 unlikely(o != l1e_get_intpte(ol1e)) )
968 {
969 MEM_LOG("Failed to update %" PRIpte " -> %" PRIpte
970 ": saw %" PRIpte "\n",
971 l1e_get_intpte(ol1e),
972 l1e_get_intpte(nl1e),
973 o);
974 return 0;
975 }
976 return 1;
977 }
980 /* Update the L1 entry at pl1e to new value nl1e. */
981 static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e)
982 {
983 l1_pgentry_t ol1e;
984 struct domain *d = current->domain;
986 if ( unlikely(__copy_from_user(&ol1e, pl1e, sizeof(ol1e)) != 0) )
987 return 0;
989 if ( unlikely(shadow_mode_refcounts(d)) )
990 return update_l1e(pl1e, ol1e, nl1e);
992 if ( l1e_get_flags(nl1e) & _PAGE_PRESENT )
993 {
994 if ( unlikely(l1e_get_flags(nl1e) & L1_DISALLOW_MASK) )
995 {
996 MEM_LOG("Bad L1 flags %x\n",
997 l1e_get_flags(nl1e) & L1_DISALLOW_MASK);
998 return 0;
999 }
1001 /* Fast path for identical mapping, r/w and presence. */
1002 if ( !l1e_has_changed(ol1e, nl1e, _PAGE_RW | _PAGE_PRESENT))
1003 return update_l1e(pl1e, ol1e, nl1e);
1005 if ( unlikely(!get_page_from_l1e(nl1e, FOREIGNDOM)) )
1006 return 0;
1008 if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
1009 {
1010 put_page_from_l1e(nl1e, d);
1011 return 0;
1012 }
1013 }
1014 else
1015 {
1016 if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
1017 return 0;
1018 }
1020 put_page_from_l1e(ol1e, d);
1021 return 1;
1023 }
1024 #define UPDATE_ENTRY(_t,_p,_o,_n) ({ \
1025 intpte_t __o = cmpxchg((intpte_t *)(_p), \
1026 _t ## e_get_intpte(_o), \
1027 _t ## e_get_intpte(_n)); \
1028 if ( __o != _t ## e_get_intpte(_o) ) \
1029 MEM_LOG("Failed to update %" PRIpte " -> %" PRIpte \
1030 ": saw %" PRIpte "", \
1031 (_t ## e_get_intpte(_o)), \
1032 (_t ## e_get_intpte(_n)), \
1033 (__o)); \
1034 (__o == _t ## e_get_intpte(_o)); })
1036 /* Update the L2 entry at pl2e to new value nl2e. pl2e is within frame pfn. */
1037 static int mod_l2_entry(l2_pgentry_t *pl2e,
1038 l2_pgentry_t nl2e,
1039 unsigned long pfn,
1040 unsigned int type)
1041 {
1042 l2_pgentry_t ol2e;
1043 unsigned long vaddr;
1045 if ( unlikely(!is_guest_l2_slot(type,pgentry_ptr_to_slot(pl2e))) )
1046 {
1047 MEM_LOG("Illegal L2 update attempt in Xen-private area %p", pl2e);
1048 return 0;
1049 }
1051 if ( unlikely(__copy_from_user(&ol2e, pl2e, sizeof(ol2e)) != 0) )
1052 return 0;
1054 if ( l2e_get_flags(nl2e) & _PAGE_PRESENT )
1055 {
1056 if ( unlikely(l2e_get_flags(nl2e) & L2_DISALLOW_MASK) )
1057 {
1058 MEM_LOG("Bad L2 flags %x\n",
1059 l2e_get_flags(nl2e) & L2_DISALLOW_MASK);
1060 return 0;
1061 }
1063 /* Fast path for identical mapping and presence. */
1064 if ( !l2e_has_changed(ol2e, nl2e, _PAGE_PRESENT))
1065 return UPDATE_ENTRY(l2, pl2e, ol2e, nl2e);
1067 if ( unlikely(!l1_backptr(&vaddr, pgentry_ptr_to_slot(pl2e), type)) ||
1068 unlikely(!get_page_from_l2e(nl2e, pfn, current->domain, vaddr)) )
1069 return 0;
1071 if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e)) )
1072 {
1073 put_page_from_l2e(nl2e, pfn);
1074 return 0;
1075 }
1076 }
1077 else
1078 {
1079 if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e)) )
1080 return 0;
1081 }
1083 put_page_from_l2e(ol2e, pfn);
1084 return 1;
1085 }
1088 #if CONFIG_PAGING_LEVELS >= 3
1090 /* Update the L3 entry at pl3e to new value nl3e. pl3e is within frame pfn. */
1091 static int mod_l3_entry(l3_pgentry_t *pl3e,
1092 l3_pgentry_t nl3e,
1093 unsigned long pfn)
1094 {
1095 l3_pgentry_t ol3e;
1096 unsigned long vaddr;
1098 if ( unlikely(!is_guest_l3_slot(pgentry_ptr_to_slot(pl3e))) )
1099 {
1100 MEM_LOG("Illegal L3 update attempt in Xen-private area %p", pl3e);
1101 return 0;
1102 }
1104 #ifdef CONFIG_X86_PAE
1105 /*
1106 * Disallow updates to final L3 slot. It contains Xen mappings, and it
1107 * would be a pain to ensure they remain continuously valid throughout.
1108 */
1109 if ( pgentry_ptr_to_slot(pl3e) >= 3 )
1110 return 0;
1111 #endif
1113 if ( unlikely(__copy_from_user(&ol3e, pl3e, sizeof(ol3e)) != 0) )
1114 return 0;
1116 if ( l3e_get_flags(nl3e) & _PAGE_PRESENT )
1117 {
1118 if ( unlikely(l3e_get_flags(nl3e) & L3_DISALLOW_MASK) )
1119 {
1120 MEM_LOG("Bad L3 flags %x\n",
1121 l3e_get_flags(nl3e) & L3_DISALLOW_MASK);
1122 return 0;
1123 }
1125 /* Fast path for identical mapping and presence. */
1126 if (!l3e_has_changed(ol3e, nl3e, _PAGE_PRESENT))
1127 return UPDATE_ENTRY(l3, pl3e, ol3e, nl3e);
1129 vaddr = (((unsigned long)pl3e & ~PAGE_MASK) / sizeof(l3_pgentry_t))
1130 << L3_PAGETABLE_SHIFT;
1131 if ( unlikely(!get_page_from_l3e(nl3e, pfn, current->domain, vaddr)) )
1132 return 0;
1134 if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e)) )
1135 {
1136 BUG_ON(!create_pae_xen_mappings(pl3e));
1137 put_page_from_l3e(nl3e, pfn);
1138 return 0;
1139 }
1141 put_page_from_l3e(ol3e, pfn);
1142 return 1;
1143 }
1145 if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e)) )
1146 {
1147 BUG_ON(!create_pae_xen_mappings(pl3e));
1148 return 0;
1149 }
1151 put_page_from_l3e(ol3e, pfn);
1152 return 1;
1153 }
1155 #endif
1157 #if CONFIG_PAGING_LEVELS >= 4
1159 /* Update the L4 entry at pl4e to new value nl4e. pl4e is within frame pfn. */
1160 static int mod_l4_entry(l4_pgentry_t *pl4e,
1161 l4_pgentry_t nl4e,
1162 unsigned long pfn)
1163 {
1164 l4_pgentry_t ol4e;
1166 if ( unlikely(!is_guest_l4_slot(pgentry_ptr_to_slot(pl4e))) )
1167 {
1168 MEM_LOG("Illegal L4 update attempt in Xen-private area %p", pl4e);
1169 return 0;
1170 }
1172 if ( unlikely(__copy_from_user(&ol4e, pl4e, sizeof(ol4e)) != 0) )
1173 return 0;
1175 if ( l4e_get_flags(nl4e) & _PAGE_PRESENT )
1176 {
1177 if ( unlikely(l4e_get_flags(nl4e) & L4_DISALLOW_MASK) )
1178 {
1179 MEM_LOG("Bad L4 flags %x\n",
1180 l4e_get_flags(nl4e) & L4_DISALLOW_MASK);
1181 return 0;
1182 }
1184 /* Fast path for identical mapping and presence. */
1185 if (!l4e_has_changed(ol4e, nl4e, _PAGE_PRESENT))
1186 return UPDATE_ENTRY(l4, pl4e, ol4e, nl4e);
1188 if ( unlikely(!get_page_from_l4e(nl4e, pfn, current->domain)) )
1189 return 0;
1191 if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e)) )
1192 {
1193 put_page_from_l4e(nl4e, pfn);
1194 return 0;
1195 }
1197 put_page_from_l4e(ol4e, pfn);
1198 return 1;
1199 }
1201 if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e)) )
1202 return 0;
1204 put_page_from_l4e(ol4e, pfn);
1205 return 1;
1206 }
1208 #endif
1210 int alloc_page_type(struct pfn_info *page, unsigned int type)
1211 {
1212 switch ( type & PGT_type_mask )
1213 {
1214 case PGT_l1_page_table:
1215 return alloc_l1_table(page);
1216 case PGT_l2_page_table:
1217 return alloc_l2_table(page, type);
1218 case PGT_l3_page_table:
1219 return alloc_l3_table(page);
1220 case PGT_l4_page_table:
1221 return alloc_l4_table(page);
1222 case PGT_gdt_page:
1223 case PGT_ldt_page:
1224 return alloc_segdesc_page(page);
1225 default:
1226 printk("Bad type in alloc_page_type %x t=%x c=%x\n",
1227 type, page->u.inuse.type_info,
1228 page->count_info);
1229 BUG();
1230 }
1232 return 0;
1233 }
1236 void free_page_type(struct pfn_info *page, unsigned int type)
1237 {
1238 struct domain *owner = page_get_owner(page);
1239 unsigned long gpfn;
1241 if ( owner != NULL )
1242 {
1243 if ( unlikely(shadow_mode_refcounts(owner)) )
1244 return;
1245 if ( unlikely(shadow_mode_enabled(owner)) )
1246 {
1247 gpfn = __mfn_to_gpfn(owner, page_to_pfn(page));
1248 ASSERT(VALID_M2P(gpfn));
1249 remove_shadow(owner, gpfn, type & PGT_type_mask);
1250 }
1251 }
1253 switch (type & PGT_type_mask)
1254 {
1255 case PGT_l1_page_table:
1256 free_l1_table(page);
1257 break;
1259 case PGT_l2_page_table:
1260 free_l2_table(page);
1261 break;
1263 #if CONFIG_PAGING_LEVELS >= 3
1264 case PGT_l3_page_table:
1265 free_l3_table(page);
1266 break;
1267 #endif
1269 #if CONFIG_PAGING_LEVELS >= 4
1270 case PGT_l4_page_table:
1271 free_l4_table(page);
1272 break;
1273 #endif
1275 default:
1276 printk("%s: type %x pfn %lx\n",__FUNCTION__,
1277 type, page_to_pfn(page));
1278 BUG();
1279 }
1280 }
1283 void put_page_type(struct pfn_info *page)
1285 u32 nx, x, y = page->u.inuse.type_info;
1287 again:
1288 do {
1289 x = y;
1290 nx = x - 1;
1292 ASSERT((x & PGT_count_mask) != 0);
1294 /*
1295 * The page should always be validated while a reference is held. The
1296 * exception is during domain destruction, when we forcibly invalidate
1297 * page-table pages if we detect a referential loop.
1298 * See domain.c:relinquish_list().
1299 */
1300 ASSERT((x & PGT_validated) ||
1301 test_bit(_DOMF_dying, &page_get_owner(page)->domain_flags));
1303 if ( unlikely((nx & PGT_count_mask) == 0) )
1305 /* Record TLB information for flush later. Races are harmless. */
1306 page->tlbflush_timestamp = tlbflush_current_time();
1308 if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
1309 likely(nx & PGT_validated) )
1311 /*
1312 * Page-table pages must be unvalidated when count is zero. The
1313 * 'free' is safe because the refcnt is non-zero and validated
1314 * bit is clear => other ops will spin or fail.
1315 */
1316 if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x,
1317 x & ~PGT_validated)) != x) )
1318 goto again;
1319 /* We cleared the 'valid bit' so we do the clean up. */
1320 free_page_type(page, x);
1321 /* Carry on, but with the 'valid bit' now clear. */
1322 x &= ~PGT_validated;
1323 nx &= ~PGT_validated;
1326 else if ( unlikely(((nx & (PGT_pinned | PGT_count_mask)) ==
1327 (PGT_pinned | 1)) &&
1328 ((nx & PGT_type_mask) != PGT_writable_page)) )
1330 /* Page is now only pinned. Make the back pointer mutable again. */
1331 nx |= PGT_va_mutable;
1334 while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
1338 int get_page_type(struct pfn_info *page, u32 type)
1340 u32 nx, x, y = page->u.inuse.type_info;
1342 again:
1343 do {
1344 x = y;
1345 nx = x + 1;
1346 if ( unlikely((nx & PGT_count_mask) == 0) )
1348 MEM_LOG("Type count overflow on pfn %lx", page_to_pfn(page));
1349 return 0;
1351 else if ( unlikely((x & PGT_count_mask) == 0) )
1353 if ( (x & (PGT_type_mask|PGT_va_mask)) != type )
1355 /*
1356 * On type change we check to flush stale TLB entries. This
1357 * may be unnecessary (e.g., page was GDT/LDT) but those
1358 * circumstances should be very rare.
1359 */
1360 cpumask_t mask = page_get_owner(page)->cpumask;
1361 tlbflush_filter(mask, page->tlbflush_timestamp);
1363 if ( unlikely(!cpus_empty(mask)) )
1365 perfc_incrc(need_flush_tlb_flush);
1366 flush_tlb_mask(mask);
1369 /* We lose existing type, back pointer, and validity. */
1370 nx &= ~(PGT_type_mask | PGT_va_mask | PGT_validated);
1371 nx |= type;
1373 /* No special validation needed for writable pages. */
1374 /* Page tables and GDT/LDT need to be scanned for validity. */
1375 if ( type == PGT_writable_page )
1376 nx |= PGT_validated;
1379 else
1381 if ( unlikely((x & (PGT_type_mask|PGT_va_mask)) != type) )
1383 if ( unlikely((x & PGT_type_mask) != (type & PGT_type_mask) ) )
1385 if ( ((x & PGT_type_mask) != PGT_l2_page_table) ||
1386 ((type & PGT_type_mask) != PGT_l1_page_table) )
1387 MEM_LOG("Bad type (saw %08x != exp %08x) for pfn %lx",
1388 x, type, page_to_pfn(page));
1389 return 0;
1391 else if ( (x & PGT_va_mask) == PGT_va_mutable )
1393 /* The va backpointer is mutable, hence we update it. */
1394 nx &= ~PGT_va_mask;
1395 nx |= type; /* we know the actual type is correct */
1397 else if ( ((type & PGT_va_mask) != PGT_va_mutable) &&
1398 ((type & PGT_va_mask) != (x & PGT_va_mask)) )
1400 #ifdef CONFIG_X86_PAE
1401 /* We use backptr as extra typing. Cannot be unknown. */
1402 if ( (type & PGT_type_mask) == PGT_l2_page_table )
1403 return 0;
1404 #endif
1405 /* This table is possibly mapped at multiple locations. */
1406 nx &= ~PGT_va_mask;
1407 nx |= PGT_va_unknown;
1410 if ( unlikely(!(x & PGT_validated)) )
1412 /* Someone else is updating validation of this page. Wait... */
1413 while ( (y = page->u.inuse.type_info) == x )
1414 cpu_relax();
1415 goto again;
1419 while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
1421 if ( unlikely(!(nx & PGT_validated)) )
1423 /* Try to validate page type; drop the new reference on failure. */
1424 if ( unlikely(!alloc_page_type(page, type)) )
1426 MEM_LOG("Error while validating pfn %lx for type %08x."
1427 " caf=%08x taf=%08x",
1428 page_to_pfn(page), type,
1429 page->count_info,
1430 page->u.inuse.type_info);
1431 /* No one else can get a reference. We hold the only ref. */
1432 page->u.inuse.type_info = 0;
1433 return 0;
1436 /* No one else is updating simultaneously. */
1437 __set_bit(_PGT_validated, &page->u.inuse.type_info);
1440 return 1;
1444 int new_guest_cr3(unsigned long mfn)
1445 {
1446 struct vcpu *v = current;
1447 struct domain *d = v->domain;
1448 int okay;
1449 unsigned long old_base_mfn;
1451 if ( shadow_mode_refcounts(d) )
1452 okay = get_page_from_pagenr(mfn, d);
1453 else
1454 okay = get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d);
1456 if ( likely(okay) )
1457 {
1458 invalidate_shadow_ldt(v);
1460 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
1461 v->arch.guest_table = mk_pagetable(mfn << PAGE_SHIFT);
1462 update_pagetables(v); /* update shadow_table and monitor_table */
1464 write_ptbase(v);
1466 if ( shadow_mode_refcounts(d) )
1467 put_page(&frame_table[old_base_mfn]);
1468 else
1469 put_page_and_type(&frame_table[old_base_mfn]);
1471 /* CR3 also holds a ref to its shadow... */
1472 if ( shadow_mode_enabled(d) )
1473 {
1474 if ( v->arch.monitor_shadow_ref )
1475 put_shadow_ref(v->arch.monitor_shadow_ref);
1476 v->arch.monitor_shadow_ref =
1477 pagetable_get_pfn(v->arch.monitor_table);
1478 ASSERT(!page_get_owner(&frame_table[v->arch.monitor_shadow_ref]));
1479 get_shadow_ref(v->arch.monitor_shadow_ref);
1480 }
1481 }
1482 else
1483 {
1484 MEM_LOG("Error while installing new baseptr %lx", mfn);
1485 }
1487 return okay;
1488 }
1490 static void process_deferred_ops(unsigned int cpu)
1491 {
1492 unsigned int deferred_ops;
1493 struct domain *d = current->domain;
1495 deferred_ops = percpu_info[cpu].deferred_ops;
1496 percpu_info[cpu].deferred_ops = 0;
1498 if ( deferred_ops & DOP_FLUSH_TLB )
1499 {
1500 if ( shadow_mode_enabled(d) )
1501 shadow_sync_all(d);
1502 local_flush_tlb();
1503 }
1505 if ( deferred_ops & DOP_RELOAD_LDT )
1506 (void)map_ldt_shadow_page(0);
1508 if ( unlikely(percpu_info[cpu].foreign != NULL) )
1509 {
1510 put_domain(percpu_info[cpu].foreign);
1511 percpu_info[cpu].foreign = NULL;
1512 }
1513 }
1515 static int set_foreigndom(unsigned int cpu, domid_t domid)
1517 struct domain *e, *d = current->domain;
1518 int okay = 1;
1520 if ( (e = percpu_info[cpu].foreign) != NULL )
1521 put_domain(e);
1522 percpu_info[cpu].foreign = NULL;
1524 if ( domid == DOMID_SELF )
1525 goto out;
1527 if ( !IS_PRIV(d) )
1529 switch ( domid )
1531 case DOMID_IO:
1532 get_knownalive_domain(dom_io);
1533 percpu_info[cpu].foreign = dom_io;
1534 break;
1535 default:
1536 MEM_LOG("Dom %u cannot set foreign dom\n", d->domain_id);
1537 okay = 0;
1538 break;
1541 else
1543 percpu_info[cpu].foreign = e = find_domain_by_id(domid);
1544 if ( e == NULL )
1546 switch ( domid )
1548 case DOMID_XEN:
1549 get_knownalive_domain(dom_xen);
1550 percpu_info[cpu].foreign = dom_xen;
1551 break;
1552 case DOMID_IO:
1553 get_knownalive_domain(dom_io);
1554 percpu_info[cpu].foreign = dom_io;
1555 break;
1556 default:
1557 MEM_LOG("Unknown domain '%u'", domid);
1558 okay = 0;
1559 break;
1564 out:
1565 return okay;
1568 static inline cpumask_t vcpumask_to_pcpumask(
1569 struct domain *d, unsigned long vmask)
1570 {
1571 unsigned int vcpu_id;
1572 cpumask_t pmask;
1573 struct vcpu *v;
1575 while ( vmask != 0 )
1576 {
1577 vcpu_id = find_first_set_bit(vmask);
1578 vmask &= ~(1UL << vcpu_id);
1579 if ( (vcpu_id < MAX_VIRT_CPUS) &&
1580 ((v = d->vcpu[vcpu_id]) != NULL) )
1581 cpu_set(v->processor, pmask);
1582 }
1584 return pmask;
1585 }
1587 int do_mmuext_op(
1588 struct mmuext_op *uops,
1589 unsigned int count,
1590 unsigned int *pdone,
1591 unsigned int foreigndom)
1593 struct mmuext_op op;
1594 int rc = 0, i = 0, okay, cpu = smp_processor_id();
1595 unsigned int type, done = 0;
1596 struct pfn_info *page;
1597 struct vcpu *v = current;
1598 struct domain *d = v->domain, *e;
1599 u32 x, y, _d, _nd;
1601 LOCK_BIGLOCK(d);
1603 cleanup_writable_pagetable(d);
1605 if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
1607 count &= ~MMU_UPDATE_PREEMPTED;
1608 if ( unlikely(pdone != NULL) )
1609 (void)get_user(done, pdone);
1612 if ( !set_foreigndom(cpu, foreigndom) )
1614 rc = -EINVAL;
1615 goto out;
1618 if ( unlikely(!array_access_ok(uops, count, sizeof(op))) )
1620 rc = -EFAULT;
1621 goto out;
1624 for ( i = 0; i < count; i++ )
1626 if ( hypercall_preempt_check() )
1628 rc = hypercall4_create_continuation(
1629 __HYPERVISOR_mmuext_op, uops,
1630 (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
1631 break;
1634 if ( unlikely(__copy_from_user(&op, uops, sizeof(op)) != 0) )
1636 MEM_LOG("Bad __copy_from_user");
1637 rc = -EFAULT;
1638 break;
1641 okay = 1;
1642 page = &frame_table[op.mfn];
1644 switch ( op.cmd )
1646 case MMUEXT_PIN_L1_TABLE:
1647 type = PGT_l1_page_table | PGT_va_mutable;
1649 pin_page:
1650 if ( shadow_mode_refcounts(FOREIGNDOM) )
1651 type = PGT_writable_page;
1653 okay = get_page_and_type_from_pagenr(op.mfn, type, FOREIGNDOM);
1654 if ( unlikely(!okay) )
1656 MEM_LOG("Error while pinning mfn %lx", op.mfn);
1657 break;
1660 if ( unlikely(test_and_set_bit(_PGT_pinned,
1661 &page->u.inuse.type_info)) )
1663 MEM_LOG("Mfn %lx already pinned", op.mfn);
1664 put_page_and_type(page);
1665 okay = 0;
1666 break;
1669 break;
1671 #ifndef CONFIG_X86_PAE /* Unsafe on PAE because of Xen-private mappings. */
1672 case MMUEXT_PIN_L2_TABLE:
1673 type = PGT_l2_page_table;
1674 goto pin_page;
1675 #endif
1677 case MMUEXT_PIN_L3_TABLE:
1678 type = PGT_l3_page_table;
1679 goto pin_page;
1681 case MMUEXT_PIN_L4_TABLE:
1682 type = PGT_l4_page_table;
1683 goto pin_page;
1685 case MMUEXT_UNPIN_TABLE:
1686 if ( unlikely(!(okay = get_page_from_pagenr(op.mfn, FOREIGNDOM))) )
1688 MEM_LOG("Mfn %lx bad domain (dom=%p)",
1689 op.mfn, page_get_owner(page));
1691 else if ( likely(test_and_clear_bit(_PGT_pinned,
1692 &page->u.inuse.type_info)) )
1694 put_page_and_type(page);
1695 put_page(page);
1697 else
1699 okay = 0;
1700 put_page(page);
1701 MEM_LOG("Mfn %lx not pinned", op.mfn);
1703 break;
1705 case MMUEXT_NEW_BASEPTR:
1706 okay = new_guest_cr3(op.mfn);
1707 percpu_info[cpu].deferred_ops &= ~DOP_FLUSH_TLB;
1708 break;
1710 #ifdef __x86_64__
1711 case MMUEXT_NEW_USER_BASEPTR:
1712 okay = get_page_and_type_from_pagenr(
1713 op.mfn, PGT_root_page_table, d);
1714 if ( unlikely(!okay) )
1716 MEM_LOG("Error while installing new mfn %lx", op.mfn);
1718 else
1720 unsigned long old_mfn =
1721 pagetable_get_pfn(v->arch.guest_table_user);
1722 v->arch.guest_table_user = mk_pagetable(op.mfn << PAGE_SHIFT);
1723 if ( old_mfn != 0 )
1724 put_page_and_type(&frame_table[old_mfn]);
1726 break;
1727 #endif
1729 case MMUEXT_TLB_FLUSH_LOCAL:
1730 percpu_info[cpu].deferred_ops |= DOP_FLUSH_TLB;
1731 break;
1733 case MMUEXT_INVLPG_LOCAL:
1734 if ( shadow_mode_enabled(d) )
1735 shadow_invlpg(v, op.linear_addr);
1736 local_flush_tlb_one(op.linear_addr);
1737 break;
1739 case MMUEXT_TLB_FLUSH_MULTI:
1740 case MMUEXT_INVLPG_MULTI:
1742 unsigned long vmask;
1743 cpumask_t pmask;
1744 if ( unlikely(get_user(vmask, (unsigned long *)op.vcpumask)) )
1746 okay = 0;
1747 break;
1749 pmask = vcpumask_to_pcpumask(d, vmask);
1750 cpus_and(pmask, pmask, d->cpumask);
1751 if ( op.cmd == MMUEXT_TLB_FLUSH_MULTI )
1752 flush_tlb_mask(pmask);
1753 else
1754 flush_tlb_one_mask(pmask, op.linear_addr);
1755 break;
1758 case MMUEXT_TLB_FLUSH_ALL:
1759 flush_tlb_mask(d->cpumask);
1760 break;
1762 case MMUEXT_INVLPG_ALL:
1763 flush_tlb_one_mask(d->cpumask, op.linear_addr);
1764 break;
1766 case MMUEXT_FLUSH_CACHE:
1767 if ( unlikely(!IS_CAPABLE_PHYSDEV(d)) )
1769 MEM_LOG("Non-physdev domain tried to FLUSH_CACHE.\n");
1770 okay = 0;
1772 else
1774 wbinvd();
1776 break;
1778 case MMUEXT_SET_LDT:
1780 if ( shadow_mode_external(d) )
1782 MEM_LOG("ignoring SET_LDT hypercall from external "
1783 "domain %u\n", d->domain_id);
1784 okay = 0;
1785 break;
1788 unsigned long ptr = op.linear_addr;
1789 unsigned long ents = op.nr_ents;
1790 if ( ((ptr & (PAGE_SIZE-1)) != 0) ||
1791 (ents > 8192) ||
1792 !array_access_ok(ptr, ents, LDT_ENTRY_SIZE) )
1794 okay = 0;
1795 MEM_LOG("Bad args to SET_LDT: ptr=%lx, ents=%lx", ptr, ents);
1797 else if ( (v->arch.guest_context.ldt_ents != ents) ||
1798 (v->arch.guest_context.ldt_base != ptr) )
1800 invalidate_shadow_ldt(v);
1801 v->arch.guest_context.ldt_base = ptr;
1802 v->arch.guest_context.ldt_ents = ents;
1803 load_LDT(v);
1804 percpu_info[cpu].deferred_ops &= ~DOP_RELOAD_LDT;
1805 if ( ents != 0 )
1806 percpu_info[cpu].deferred_ops |= DOP_RELOAD_LDT;
1808 break;
1811 case MMUEXT_REASSIGN_PAGE:
1812 if ( unlikely(!IS_PRIV(d)) )
1814 MEM_LOG("Dom %u has no reassignment priv", d->domain_id);
1815 okay = 0;
1816 break;
1819 e = percpu_info[cpu].foreign;
1820 if ( unlikely(e == NULL) )
1822 MEM_LOG("No FOREIGNDOM to reassign mfn %lx to", op.mfn);
1823 okay = 0;
1824 break;
1827 /*
1828 * Grab both page_list locks, in order. This prevents the page from
1829 * disappearing elsewhere while we modify the owner, and we'll need
1830 * both locks if we're successful so that we can change lists.
1831 */
1832 if ( d < e )
1834 spin_lock(&d->page_alloc_lock);
1835 spin_lock(&e->page_alloc_lock);
1837 else
1839 spin_lock(&e->page_alloc_lock);
1840 spin_lock(&d->page_alloc_lock);
1843 /*
1844 * Check that 'e' will accept the page and has reservation
1845 * headroom. Also, a domain mustn't have PGC_allocated pages when
1846 * it is dying.
1847 */
1848 ASSERT(e->tot_pages <= e->max_pages);
1849 if ( unlikely(test_bit(_DOMF_dying, &e->domain_flags)) ||
1850 unlikely(e->tot_pages == e->max_pages) ||
1851 unlikely(IS_XEN_HEAP_FRAME(page)) )
1853 MEM_LOG("Transferee has no reservation headroom (%d,%d), or "
1854 "page is in Xen heap (%lx), or dom is dying (%ld).\n",
1855 e->tot_pages, e->max_pages, op.mfn, e->domain_flags);
1856 okay = 0;
1857 goto reassign_fail;
1860 /*
1861 * The tricky bit: atomically change owner while there is just one
1862 * benign reference to the page (PGC_allocated). If that reference
1863 * disappears then the deallocation routine will safely spin.
1864 */
1865 _d = pickle_domptr(d);
1866 _nd = page->u.inuse._domain;
1867 y = page->count_info;
1868 do {
1869 x = y;
1870 if ( unlikely((x & (PGC_count_mask|PGC_allocated)) !=
1871 (1|PGC_allocated)) ||
1872 unlikely(_nd != _d) )
1874 MEM_LOG("Bad page values %lx: ed=%p(%u), sd=%p,"
1875 " caf=%08x, taf=%08x\n", page_to_pfn(page),
1876 d, d->domain_id, unpickle_domptr(_nd), x,
1877 page->u.inuse.type_info);
1878 okay = 0;
1879 goto reassign_fail;
1881 __asm__ __volatile__(
1882 LOCK_PREFIX "cmpxchg8b %3"
1883 : "=d" (_nd), "=a" (y), "=c" (e),
1884 "=m" (*(volatile u64 *)(&page->count_info))
1885 : "0" (_d), "1" (x), "c" (e), "b" (x) );
1887 while ( unlikely(_nd != _d) || unlikely(y != x) );
1889 /*
1890 * Unlink from 'd'. We transferred at least one reference to 'e',
1891 * so no one else is spinning to try to delete this page from 'd'.
1892 */
1893 d->tot_pages--;
1894 list_del(&page->list);
1896 /*
1897 * Add the page to 'e'. Someone may already have removed the last
1898 * reference and want to remove the page from 'e'. However, we have
1899 * the lock so they'll spin waiting for us.
1900 */
1901 if ( unlikely(e->tot_pages++ == 0) )
1902 get_knownalive_domain(e);
1903 list_add_tail(&page->list, &e->page_list);
1905 reassign_fail:
1906 spin_unlock(&d->page_alloc_lock);
1907 spin_unlock(&e->page_alloc_lock);
1908 break;
1910 default:
1911 MEM_LOG("Invalid extended pt command 0x%x", op.cmd);
1912 okay = 0;
1913 break;
1916 if ( unlikely(!okay) )
1918 rc = -EINVAL;
1919 break;
1922 uops++;
1925 out:
1926 process_deferred_ops(cpu);
1928 /* Add incremental work we have done to the @done output parameter. */
1929 if ( unlikely(pdone != NULL) )
1930 __put_user(done + i, pdone);
1932 UNLOCK_BIGLOCK(d);
1933 return rc;
1936 int do_mmu_update(
1937 mmu_update_t *ureqs,
1938 unsigned int count,
1939 unsigned int *pdone,
1940 unsigned int foreigndom)
1942 mmu_update_t req;
1943 void *va;
1944 unsigned long gpfn, mfn;
1945 struct pfn_info *page;
1946 int rc = 0, okay = 1, i = 0, cpu = smp_processor_id();
1947 unsigned int cmd, done = 0;
1948 struct vcpu *v = current;
1949 struct domain *d = v->domain;
1950 u32 type_info;
1951 struct domain_mmap_cache mapcache, sh_mapcache;
1953 LOCK_BIGLOCK(d);
1955 cleanup_writable_pagetable(d);
1957 if ( unlikely(shadow_mode_enabled(d)) )
1958 check_pagetable(v, "pre-mmu"); /* debug */
1960 if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
1962 count &= ~MMU_UPDATE_PREEMPTED;
1963 if ( unlikely(pdone != NULL) )
1964 (void)get_user(done, pdone);
1967 domain_mmap_cache_init(&mapcache);
1968 domain_mmap_cache_init(&sh_mapcache);
1970 if ( !set_foreigndom(cpu, foreigndom) )
1972 rc = -EINVAL;
1973 goto out;
1976 perfc_incrc(calls_to_mmu_update);
1977 perfc_addc(num_page_updates, count);
1978 perfc_incr_histo(bpt_updates, count, PT_UPDATES);
1980 if ( unlikely(!array_access_ok(ureqs, count, sizeof(req))) )
1982 rc = -EFAULT;
1983 goto out;
1986 for ( i = 0; i < count; i++ )
1988 if ( hypercall_preempt_check() )
1990 rc = hypercall4_create_continuation(
1991 __HYPERVISOR_mmu_update, ureqs,
1992 (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
1993 break;
1996 if ( unlikely(__copy_from_user(&req, ureqs, sizeof(req)) != 0) )
1998 MEM_LOG("Bad __copy_from_user");
1999 rc = -EFAULT;
2000 break;
2003 cmd = req.ptr & (sizeof(l1_pgentry_t)-1);
2004 okay = 0;
2006 switch ( cmd )
2008 /*
2009 * MMU_NORMAL_PT_UPDATE: Normal update to any level of page table.
2010 */
2011 case MMU_NORMAL_PT_UPDATE:
2013 gpfn = req.ptr >> PAGE_SHIFT;
2014 mfn = __gpfn_to_mfn(d, gpfn);
2016 if ( unlikely(!get_page_from_pagenr(mfn, current->domain)) )
2018 MEM_LOG("Could not get page for normal update");
2019 break;
2022 va = map_domain_page_with_cache(mfn, &mapcache);
2023 va = (void *)((unsigned long)va + (req.ptr & ~PAGE_MASK));
2024 page = &frame_table[mfn];
2026 switch ( (type_info = page->u.inuse.type_info) & PGT_type_mask )
2028 case PGT_l1_page_table:
2029 ASSERT( !shadow_mode_refcounts(d) );
2030 if ( likely(get_page_type(
2031 page, type_info & (PGT_type_mask|PGT_va_mask))) )
2033 l1_pgentry_t l1e;
2035 /* FIXME: doesn't work with PAE */
2036 l1e = l1e_from_intpte(req.val);
2037 okay = mod_l1_entry(va, l1e);
2038 if ( okay && unlikely(shadow_mode_enabled(d)) )
2039 shadow_l1_normal_pt_update(d, req.ptr, l1e, &sh_mapcache);
2040 put_page_type(page);
2042 break;
2043 case PGT_l2_page_table:
2044 ASSERT( !shadow_mode_refcounts(d) );
2045 if ( likely(get_page_type(
2046 page, type_info & (PGT_type_mask|PGT_va_mask))) )
2048 l2_pgentry_t l2e;
2050 /* FIXME: doesn't work with PAE */
2051 l2e = l2e_from_intpte(req.val);
2052 okay = mod_l2_entry((l2_pgentry_t *)va, l2e, mfn, type_info);
2053 if ( okay && unlikely(shadow_mode_enabled(d)) )
2054 shadow_l2_normal_pt_update(d, req.ptr, l2e, &sh_mapcache);
2055 put_page_type(page);
2057 break;
2058 #if CONFIG_PAGING_LEVELS >= 3
2059 case PGT_l3_page_table:
2060 ASSERT( !shadow_mode_refcounts(d) );
2061 if ( likely(get_page_type(page, PGT_l3_page_table)) )
2063 l3_pgentry_t l3e;
2065 /* FIXME: doesn't work with PAE */
2066 l3e = l3e_from_intpte(req.val);
2067 okay = mod_l3_entry(va, l3e, mfn);
2068 if ( okay && unlikely(shadow_mode_enabled(d)) )
2069 shadow_l3_normal_pt_update(d, req.ptr, l3e, &sh_mapcache);
2070 put_page_type(page);
2072 break;
2073 #endif
2074 #if CONFIG_PAGING_LEVELS >= 4
2075 case PGT_l4_page_table:
2076 ASSERT( !shadow_mode_refcounts(d) );
2077 if ( likely(get_page_type(page, PGT_l4_page_table)) )
2079 l4_pgentry_t l4e;
2081 l4e = l4e_from_intpte(req.val);
2082 okay = mod_l4_entry(va, l4e, mfn);
2083 if ( okay && unlikely(shadow_mode_enabled(d)) )
2084 shadow_l4_normal_pt_update(d, req.ptr, l4e, &sh_mapcache);
2085 put_page_type(page);
2087 break;
2088 #endif
2089 default:
2090 if ( likely(get_page_type(page, PGT_writable_page)) )
2092 if ( shadow_mode_enabled(d) )
2094 shadow_lock(d);
2096 if ( shadow_mode_log_dirty(d) )
2097 __mark_dirty(d, mfn);
2099 if ( page_is_page_table(page) &&
2100 !page_out_of_sync(page) )
2102 shadow_mark_mfn_out_of_sync(v, gpfn, mfn);
2106 *(unsigned long *)va = req.val;
2107 okay = 1;
2109 if ( shadow_mode_enabled(d) )
2110 shadow_unlock(d);
2112 put_page_type(page);
2114 break;
2117 unmap_domain_page_with_cache(va, &mapcache);
2119 put_page(page);
2120 break;
2122 case MMU_MACHPHYS_UPDATE:
2124 mfn = req.ptr >> PAGE_SHIFT;
2125 gpfn = req.val;
2127 /* HACK ALERT... Need to think about this some more... */
2128 if ( unlikely(shadow_mode_translate(FOREIGNDOM) && IS_PRIV(d)) )
2130 shadow_lock(FOREIGNDOM);
2131 printk("privileged guest dom%d requests pfn=%lx to map mfn=%lx for dom%d\n",
2132 d->domain_id, gpfn, mfn, FOREIGNDOM->domain_id);
2133 set_machinetophys(mfn, gpfn);
2134 set_p2m_entry(FOREIGNDOM, gpfn, mfn, &sh_mapcache, &mapcache);
2135 okay = 1;
2136 shadow_unlock(FOREIGNDOM);
2137 break;
2140 if ( unlikely(!get_page_from_pagenr(mfn, FOREIGNDOM)) )
2142 MEM_LOG("Could not get page for mach->phys update");
2143 break;
2146 if ( unlikely(shadow_mode_translate(FOREIGNDOM) && !IS_PRIV(d)) )
2148 MEM_LOG("can't mutate the m2p of translated guests");
2149 break;
2152 set_machinetophys(mfn, gpfn);
2153 okay = 1;
2155 /*
2156 * If in log-dirty mode, mark the corresponding
2157 * page as dirty.
2158 */
2159 if ( unlikely(shadow_mode_log_dirty(FOREIGNDOM)) &&
2160 mark_dirty(FOREIGNDOM, mfn) )
2161 FOREIGNDOM->arch.shadow_dirty_block_count++;
2163 put_page(&frame_table[mfn]);
2164 break;
2166 default:
2167 MEM_LOG("Invalid page update command %lx", req.ptr);
2168 break;
2171 if ( unlikely(!okay) )
2173 rc = -EINVAL;
2174 break;
2177 ureqs++;
2180 out:
2181 domain_mmap_cache_destroy(&mapcache);
2182 domain_mmap_cache_destroy(&sh_mapcache);
2184 process_deferred_ops(cpu);
2186 /* Add incremental work we have done to the @done output parameter. */
2187 if ( unlikely(pdone != NULL) )
2188 __put_user(done + i, pdone);
2190 if ( unlikely(shadow_mode_enabled(d)) )
2191 check_pagetable(v, "post-mmu"); /* debug */
2193 UNLOCK_BIGLOCK(d);
2194 return rc;
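/*
 * Illustrative guest-side sketch (an assumption, not code from this file):
 * how a paravirtualised guest might queue the two request types handled
 * above, assuming the public mmu_update_t / HYPERVISOR_mmu_update()
 * interface with the command encoded in the low bits of 'ptr'.
 *
 *   mmu_update_t req[2];
 *   int done;
 *
 *   // MMU_NORMAL_PT_UPDATE: write 'new_pte' into the PTE at machine
 *   // address 'pte_ma'.
 *   req[0].ptr = pte_ma | MMU_NORMAL_PT_UPDATE;
 *   req[0].val = new_pte;
 *
 *   // MMU_MACHPHYS_UPDATE: record that machine frame 'mfn' now backs
 *   // guest frame 'pfn' (the set_machinetophys() path above).
 *   req[1].ptr = (mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE;
 *   req[1].val = pfn;
 *
 *   (void)HYPERVISOR_mmu_update(req, 2, &done, DOMID_SELF);
 */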
2197 /* This function assumes the caller is holding the domain's BIGLOCK
2198 * and is running in a shadow mode
2199 */
2200 int update_grant_va_mapping(unsigned long va,
2201 l1_pgentry_t _nl1e,
2202 struct domain *d,
2203 struct vcpu *v)
2205 /* Caller must:
2206 * . own d's BIGLOCK
2207 * . have already done a 'get_page' on the to-be-installed nl1e
2208 * . be responsible for flushing the TLB
2209 * . check PTE being installed isn't DISALLOWED
2210 */
2212 int rc = 0;
2213 l1_pgentry_t *pl1e;
2214 l1_pgentry_t ol1e;
2216 cleanup_writable_pagetable(d);
2218 // This is actually overkill - we don't need to sync the L1 itself,
2219 // just everything involved in getting to this L1 (i.e. we need
2220 // linear_pg_table[l1_linear_offset(va)] to be in sync)...
2221 //
2222 __shadow_sync_va(v, va);
2224 pl1e = &linear_pg_table[l1_linear_offset(va)];
2226 if ( unlikely(__copy_from_user(&ol1e, pl1e, sizeof(ol1e)) != 0) )
2227 rc = -EINVAL;
2228 else if ( !shadow_mode_refcounts(d) )
2230 if ( update_l1e(pl1e, ol1e, _nl1e) )
2232 put_page_from_l1e(ol1e, d);
2233 if ( l1e_get_flags(ol1e) & _PAGE_PRESENT )
2234 rc = 0; /* Caller needs to invalidate TLB entry */
2235 else
2236 rc = 1; /* Caller need not invalidate TLB entry */
2238 else
2239 rc = -EINVAL;
2241 else
2243 printk("grant tables and shadow mode currently don't work together\n");
2244 BUG();
2247 if ( unlikely(shadow_mode_enabled(d)) )
2248 shadow_do_update_va_mapping(va, _nl1e, v);
2250 return rc;
2254 int do_update_va_mapping(unsigned long va,
2255 unsigned long val32,
2256 unsigned long flags)
2258 l1_pgentry_t val = l1e_from_intpte(val32);
2259 struct vcpu *v = current;
2260 struct domain *d = v->domain;
2261 unsigned int cpu = v->processor;
2262 unsigned long vmask, bmap_ptr;
2263 cpumask_t pmask;
2264 int rc = 0;
2266 perfc_incrc(calls_to_update_va);
2268 if ( unlikely(!__addr_ok(va) && !shadow_mode_external(d)) )
2269 return -EINVAL;
2271 LOCK_BIGLOCK(d);
2273 cleanup_writable_pagetable(d);
2275 if ( unlikely(shadow_mode_enabled(d)) )
2276 check_pagetable(v, "pre-va"); /* debug */
2278 if ( unlikely(!mod_l1_entry(&linear_pg_table[l1_linear_offset(va)],
2279 val)) )
2280 rc = -EINVAL;
2282 if ( likely(rc == 0) && unlikely(shadow_mode_enabled(d)) )
2284 if ( unlikely(percpu_info[cpu].foreign &&
2285 (shadow_mode_translate(d) ||
2286 shadow_mode_translate(percpu_info[cpu].foreign))) )
2289 // The foreign domain's pfns are in a different namespace.
2289 // There's not enough information in just a gpte to figure out
2290 // how to (re-)shadow this entry.
2291 //
2292 domain_crash();
2295 rc = shadow_do_update_va_mapping(va, val, v);
2297 check_pagetable(v, "post-va"); /* debug */
2300 switch ( flags & UVMF_FLUSHTYPE_MASK )
2302 case UVMF_TLB_FLUSH:
2303 switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
2305 case UVMF_LOCAL:
2306 if ( unlikely(shadow_mode_enabled(d)) )
2307 shadow_sync_all(d);
2308 local_flush_tlb();
2309 break;
2310 case UVMF_ALL:
2311 flush_tlb_mask(d->cpumask);
2312 break;
2313 default:
2314 if ( unlikely(get_user(vmask, (unsigned long *)bmap_ptr)) )
2315 rc = -EFAULT;
2316 pmask = vcpumask_to_pcpumask(d, vmask);
2317 cpus_and(pmask, pmask, d->cpumask);
2318 flush_tlb_mask(pmask);
2319 break;
2321 break;
2323 case UVMF_INVLPG:
2324 switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
2326 case UVMF_LOCAL:
2327 if ( unlikely(shadow_mode_enabled(d)) )
2328 shadow_invlpg(current, va);
2329 local_flush_tlb_one(va);
2330 break;
2331 case UVMF_ALL:
2332 flush_tlb_one_mask(d->cpumask, va);
2333 break;
2334 default:
2335 if ( unlikely(get_user(vmask, (unsigned long *)bmap_ptr)) )
2336 rc = -EFAULT;
2337 pmask = vcpumask_to_pcpumask(d, vmask);
2338 cpus_and(pmask, pmask, d->cpumask);
2339 flush_tlb_one_mask(pmask, va);
2340 break;
2342 break;
2345 process_deferred_ops(cpu);
2347 UNLOCK_BIGLOCK(d);
2349 return rc;
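/*
 * Illustrative flag usage for the switch above (the guest-side wrapper name
 * is an assumption; the UVMF_* values are the ones tested in this function).
 * UVMF_LOCAL/UVMF_ALL pick the CPUs to flush; any other value in the
 * non-flushtype bits is interpreted as a guest pointer to a vcpu bitmap,
 * read via get_user() into 'vmask' above.
 *
 *   // Install new_pte for 'va' and invalidate that VA on this CPU only:
 *   HYPERVISOR_update_va_mapping(va, new_pte, UVMF_INVLPG | UVMF_LOCAL);
 *
 *   // Install new_pte and flush the whole TLB on every CPU the domain uses:
 *   HYPERVISOR_update_va_mapping(va, new_pte, UVMF_TLB_FLUSH | UVMF_ALL);
 */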
2352 int do_update_va_mapping_otherdomain(unsigned long va,
2353 unsigned long val32,
2354 unsigned long flags,
2355 domid_t domid)
2357 unsigned int cpu = smp_processor_id();
2358 struct domain *d;
2359 int rc;
2361 if ( unlikely(!IS_PRIV(current->domain)) )
2362 return -EPERM;
2364 percpu_info[cpu].foreign = d = find_domain_by_id(domid);
2365 if ( unlikely(d == NULL) )
2367 MEM_LOG("Unknown domain '%u'", domid);
2368 return -ESRCH;
2371 rc = do_update_va_mapping(va, val32, flags);
2373 return rc;
2378 /*************************
2379 * Descriptor Tables
2380 */
2382 void destroy_gdt(struct vcpu *v)
2384 int i;
2385 unsigned long pfn;
2387 v->arch.guest_context.gdt_ents = 0;
2388 for ( i = 0; i < FIRST_RESERVED_GDT_PAGE; i++ )
2390 if ( (pfn = l1e_get_pfn(v->arch.perdomain_ptes[i])) != 0 )
2391 put_page_and_type(&frame_table[pfn]);
2392 v->arch.perdomain_ptes[i] = l1e_empty();
2393 v->arch.guest_context.gdt_frames[i] = 0;
2398 long set_gdt(struct vcpu *v,
2399 unsigned long *frames,
2400 unsigned int entries)
2402 struct domain *d = v->domain;
2403 /* NB. There are 512 8-byte entries per GDT page. */
2404 int i, nr_pages = (entries + 511) / 512;
2405 unsigned long pfn;
2407 if ( entries > FIRST_RESERVED_GDT_ENTRY )
2408 return -EINVAL;
2410 shadow_sync_all(d);
2412 /* Check the pages in the new GDT. */
2413 for ( i = 0; i < nr_pages; i++ )
2414 if ( ((pfn = frames[i]) >= max_page) ||
2415 !get_page_and_type(&frame_table[pfn], d, PGT_gdt_page) )
2416 goto fail;
2418 /* Tear down the old GDT. */
2419 destroy_gdt(v);
2421 /* Install the new GDT. */
2422 v->arch.guest_context.gdt_ents = entries;
2423 for ( i = 0; i < nr_pages; i++ )
2425 v->arch.guest_context.gdt_frames[i] = frames[i];
2426 v->arch.perdomain_ptes[i] =
2427 l1e_from_pfn(frames[i], __PAGE_HYPERVISOR);
2430 return 0;
2432 fail:
2433 while ( i-- > 0 )
2434 put_page_and_type(&frame_table[frames[i]]);
2435 return -EINVAL;
2439 long do_set_gdt(unsigned long *frame_list, unsigned int entries)
2441 int nr_pages = (entries + 511) / 512;
2442 unsigned long frames[16];
2443 long ret;
2445 /* Rechecked in set_gdt, but ensures a sane limit for copy_from_user(). */
2446 if ( entries > FIRST_RESERVED_GDT_ENTRY )
2447 return -EINVAL;
2449 if ( copy_from_user(frames, frame_list, nr_pages * sizeof(unsigned long)) )
2450 return -EFAULT;
2452 LOCK_BIGLOCK(current->domain);
2454 if ( (ret = set_gdt(current, frames, entries)) == 0 )
2455 local_flush_tlb();
2457 UNLOCK_BIGLOCK(current->domain);
2459 return ret;
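/*
 * Worked example for the sizing in set_gdt()/do_set_gdt(): with 512 8-byte
 * descriptors per page, nr_pages = (entries + 511) / 512, so 1..512 entries
 * need one frame, 513..1024 need two, and so on, subject to the
 * FIRST_RESERVED_GDT_ENTRY limit checked above.  A guest call might look
 * like this (wrapper name and frame numbers are assumptions):
 *
 *   unsigned long frames[2] = { gdt_mfn0, gdt_mfn1 };
 *   HYPERVISOR_set_gdt(frames, 1024);   // 1024 entries -> nr_pages == 2
 */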
2463 long do_update_descriptor(unsigned long pa, u64 desc)
2465 struct domain *dom = current->domain;
2466 unsigned long gpfn = pa >> PAGE_SHIFT;
2467 unsigned long mfn;
2468 unsigned int offset = (pa & ~PAGE_MASK) / sizeof(struct desc_struct);
2469 struct desc_struct *gdt_pent, d;
2470 struct pfn_info *page;
2471 long ret = -EINVAL;
2473 *(u64 *)&d = desc;
2475 LOCK_BIGLOCK(dom);
2477 if ( !VALID_MFN(mfn = __gpfn_to_mfn(dom, gpfn)) ||
2478 ((pa % sizeof(struct desc_struct)) != 0) ||
2479 (mfn >= max_page) ||
2480 !check_descriptor(&d) )
2482 UNLOCK_BIGLOCK(dom);
2483 return -EINVAL;
2486 page = &frame_table[mfn];
2487 if ( unlikely(!get_page(page, dom)) )
2489 UNLOCK_BIGLOCK(dom);
2490 return -EINVAL;
2493 /* Check if the given frame is in use in an unsafe context. */
2494 switch ( page->u.inuse.type_info & PGT_type_mask )
2496 case PGT_gdt_page:
2497 if ( unlikely(!get_page_type(page, PGT_gdt_page)) )
2498 goto out;
2499 break;
2500 case PGT_ldt_page:
2501 if ( unlikely(!get_page_type(page, PGT_ldt_page)) )
2502 goto out;
2503 break;
2504 default:
2505 if ( unlikely(!get_page_type(page, PGT_writable_page)) )
2506 goto out;
2507 break;
2510 if ( shadow_mode_enabled(dom) )
2512 shadow_lock(dom);
2514 if ( shadow_mode_log_dirty(dom) )
2515 __mark_dirty(dom, mfn);
2517 if ( page_is_page_table(page) && !page_out_of_sync(page) )
2518 shadow_mark_mfn_out_of_sync(current, gpfn, mfn);
2521 /* All is good so make the update. */
2522 gdt_pent = map_domain_page(mfn);
2523 memcpy(&gdt_pent[offset], &d, 8);
2524 unmap_domain_page(gdt_pent);
2526 if ( shadow_mode_enabled(dom) )
2527 shadow_unlock(dom);
2529 put_page_type(page);
2531 ret = 0; /* success */
2533 out:
2534 put_page(page);
2536 UNLOCK_BIGLOCK(dom);
2538 return ret;
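/*
 * Worked example for the address decoding above: 'pa' is the guest's
 * physical byte address of the descriptor (its frame number is turned into
 * an mfn by __gpfn_to_mfn() above).  It must be 8-byte aligned (the
 * pa % sizeof(struct desc_struct) check); gpfn = pa >> PAGE_SHIFT selects
 * the frame and offset = (pa & ~PAGE_MASK) / 8 the slot within it.
 * E.g. pa = 0x12345674 is rejected (not 8-byte aligned), while
 * pa = 0x12345670 gives gpfn = 0x12345 and offset = 0x670 / 8 = 0xce.
 */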
2543 /*************************
2544 * Writable Pagetables
2545 */
2547 #ifdef VERBOSE
2548 int ptwr_debug = 0x0;
2549 #define PTWR_PRINTK(_f, _a...) \
2550 do { if ( unlikely(ptwr_debug) ) printk( _f , ## _a ); } while ( 0 )
2551 #define PTWR_PRINT_WHICH (which ? 'I' : 'A')
2552 #else
2553 #define PTWR_PRINTK(_f, _a...) ((void)0)
2554 #endif
2556 /* Re-validate a given p.t. page, given its prior snapshot */
2557 int revalidate_l1(struct domain *d, l1_pgentry_t *l1page, l1_pgentry_t *snapshot)
2559 l1_pgentry_t ol1e, nl1e;
2560 int modified = 0, i;
2562 #if 0
2563 if ( d->domain_id )
2564 printk("%s: l1page mfn=%lx snapshot mfn=%lx\n", __func__,
2565 l1e_get_pfn(linear_pg_table[l1_linear_offset((unsigned long)l1page)]),
2566 l1e_get_pfn(linear_pg_table[l1_linear_offset((unsigned long)snapshot)]));
2567 #endif
2569 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
2571 ol1e = snapshot[i];
2572 nl1e = l1page[i];
2574 if ( likely(l1e_get_intpte(ol1e) == l1e_get_intpte(nl1e)) )
2575 continue;
2577 /* Update number of entries modified. */
2578 modified++;
2580 /*
2581 * Fast path for PTEs that have merely been write-protected
2582 * (e.g., during a Unix fork()). A strict reduction in privilege.
2583 */
2584 if ( likely(l1e_get_intpte(ol1e) == (l1e_get_intpte(nl1e)|_PAGE_RW)) )
2586 if ( likely(l1e_get_flags(nl1e) & _PAGE_PRESENT) )
2587 put_page_type(&frame_table[l1e_get_pfn(nl1e)]);
2588 continue;
2591 if ( unlikely(!get_page_from_l1e(nl1e, d)) )
2593 MEM_LOG("ptwr: Could not re-validate l1 page\n");
2594 /*
2595 * Make the remaining p.t's consistent before crashing, so the
2596 * reference counts are correct.
2597 */
2598 memcpy(&l1page[i], &snapshot[i],
2599 (L1_PAGETABLE_ENTRIES - i) * sizeof(l1_pgentry_t));
2600 domain_crash();
2601 break;
2604 put_page_from_l1e(ol1e, d);
2607 return modified;
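/*
 * Worked example for the write-protect fast path above, assuming 32-bit
 * non-PAE PTEs with the usual flag values (_PAGE_PRESENT 0x001, _PAGE_RW
 * 0x002): if the snapshot held ol1e = 0x12345067 and the guest wrote
 * nl1e = 0x12345065 (same frame 0x12345, RW cleared), then
 * ol1e == (nl1e | _PAGE_RW) and the change is a pure reduction in
 * privilege, so no re-validation is needed; only the writable type
 * reference taken for the old RW mapping is dropped via put_page_type().
 */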
2611 /* Flush the given writable p.t. page and write-protect it again. */
2612 void ptwr_flush(struct domain *d, const int which)
2614 unsigned long pte, *ptep, l1va;
2615 l1_pgentry_t *pl1e;
2616 l2_pgentry_t *pl2e;
2617 unsigned int modified;
2619 ASSERT(!shadow_mode_enabled(d));
2621 if ( unlikely(d->arch.ptwr[which].vcpu != current) )
2622 write_ptbase(d->arch.ptwr[which].vcpu);
2624 l1va = d->arch.ptwr[which].l1va;
2625 ptep = (unsigned long *)&linear_pg_table[l1_linear_offset(l1va)];
2627 /*
2628 * STEP 1. Write-protect the p.t. page so no more updates can occur.
2629 */
2631 if ( unlikely(__get_user(pte, ptep)) )
2633 MEM_LOG("ptwr: Could not read pte at %p", ptep);
2634 /*
2635 * Really a bug. We could read this PTE during the initial fault,
2636 * and the pagetables can't have changed in the meantime.
2637 */
2638 BUG();
2640 PTWR_PRINTK("[%c] disconnected_l1va at %p is %lx\n",
2641 PTWR_PRINT_WHICH, ptep, pte);
2642 pte &= ~_PAGE_RW;
2644 /* Write-protect the p.t. page in the guest page table. */
2645 if ( unlikely(__put_user(pte, ptep)) )
2647 MEM_LOG("ptwr: Could not update pte at %p", ptep);
2648 /*
2649 * Really a bug. We could write this PTE during the initial fault,
2650 * and the pagetables can't have changed in the meantime.
2651 */
2652 BUG();
2655 /* Ensure that there are no stale writable mappings in any TLB. */
2656 /* NB. INVLPG is a serialising instruction: flushes pending updates. */
2657 flush_tlb_one_mask(d->cpumask, l1va);
2658 PTWR_PRINTK("[%c] disconnected_l1va at %p now %lx\n",
2659 PTWR_PRINT_WHICH, ptep, pte);
2661 /*
2662 * STEP 2. Validate any modified PTEs.
2663 */
2665 pl1e = d->arch.ptwr[which].pl1e;
2666 modified = revalidate_l1(d, pl1e, d->arch.ptwr[which].page);
2667 unmap_domain_page(pl1e);
2668 perfc_incr_histo(wpt_updates, modified, PT_UPDATES);
2669 d->arch.ptwr[which].prev_nr_updates = modified;
2671 /*
2672 * STEP 3. Reattach the L1 p.t. page into the current address space.
2673 */
2675 if ( which == PTWR_PT_ACTIVE )
2677 pl2e = &__linear_l2_table[d->arch.ptwr[which].l2_idx];
2678 l2e_add_flags(*pl2e, _PAGE_PRESENT);
2681 /*
2682 * STEP 4. Final tidy-up.
2683 */
2685 d->arch.ptwr[which].l1va = 0;
2687 if ( unlikely(d->arch.ptwr[which].vcpu != current) )
2688 write_ptbase(current);
2691 static int ptwr_emulated_update(
2692 unsigned long addr,
2693 physaddr_t old,
2694 physaddr_t val,
2695 unsigned int bytes,
2696 unsigned int do_cmpxchg)
2698 unsigned long pfn;
2699 struct pfn_info *page;
2700 l1_pgentry_t pte, ol1e, nl1e, *pl1e;
2701 struct domain *d = current->domain;
2703 /* Aligned access only, thank you. */
2704 if ( !access_ok(addr, bytes) || ((addr & (bytes-1)) != 0) )
2706 MEM_LOG("ptwr_emulate: Unaligned or bad size ptwr access (%d, %lx)\n",
2707 bytes, addr);
2708 return X86EMUL_UNHANDLEABLE;
2711 /* Turn a sub-word access into a full-word access. */
2712 if (bytes != sizeof(physaddr_t))
2714 int rc;
2715 physaddr_t full;
2716 unsigned int offset = addr & (sizeof(physaddr_t)-1);
2718 /* Align address; read full word. */
2719 addr &= ~(sizeof(physaddr_t)-1);
2720 if ( (rc = x86_emulate_read_std(addr, (unsigned long *)&full,
2721 sizeof(physaddr_t))) )
2722 return rc;
2723 /* Mask out bits provided by caller. */
2724 full &= ~((((physaddr_t)1 << (bytes*8)) - 1) << (offset*8));
2725 /* Shift the caller value and OR in the missing bits. */
2726 val &= (((physaddr_t)1 << (bytes*8)) - 1);
2727 val <<= (offset)*8;
2728 val |= full;
2731 /* Read the PTE that maps the page being updated. */
2732 if (__copy_from_user(&pte, &linear_pg_table[l1_linear_offset(addr)],
2733 sizeof(pte)))
2735 MEM_LOG("ptwr_emulate: Cannot read thru linear_pg_table\n");
2736 return X86EMUL_UNHANDLEABLE;
2739 pfn = l1e_get_pfn(pte);
2740 page = &frame_table[pfn];
2742 /* We are looking only for read-only mappings of p.t. pages. */
2743 if ( ((l1e_get_flags(pte) & (_PAGE_RW|_PAGE_PRESENT)) != _PAGE_PRESENT) ||
2744 ((page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table) ||
2745 (page_get_owner(page) != d) )
2747 MEM_LOG("ptwr_emulate: Page is mistyped or bad pte (%lx, %08x)\n",
2748 l1e_get_pfn(pte), page->u.inuse.type_info);
2749 return X86EMUL_UNHANDLEABLE;
2752 /* Check the new PTE. */
2753 nl1e = l1e_from_intpte(val);
2754 if ( unlikely(!get_page_from_l1e(nl1e, d)) )
2755 return X86EMUL_UNHANDLEABLE;
2757 /* Checked successfully: do the update (write or cmpxchg). */
2758 pl1e = map_domain_page(page_to_pfn(page));
2759 pl1e = (l1_pgentry_t *)((unsigned long)pl1e + (addr & ~PAGE_MASK));
2760 if ( do_cmpxchg )
2762 ol1e = l1e_from_intpte(old);
2763 if ( cmpxchg((unsigned long *)pl1e, old, val) != old )
2765 unmap_domain_page(pl1e);
2766 put_page_from_l1e(nl1e, d);
2767 return X86EMUL_CMPXCHG_FAILED;
2770 else
2772 ol1e = *pl1e;
2773 *pl1e = nl1e;
2775 unmap_domain_page(pl1e);
2777 /* Finally, drop the old PTE. */
2778 put_page_from_l1e(ol1e, d);
2780 return X86EMUL_CONTINUE;
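/*
 * Worked example for the sub-word widening above, assuming a 32-bit
 * physaddr_t: a 2-byte write of val = 0xBEEF at offset 2 within a PTE that
 * currently reads 0x11223344 proceeds as
 *
 *   full  = 0x11223344;                  // x86_emulate_read_std() of the word
 *   full &= ~(0xFFFFUL << 16);           // drop the caller-supplied bytes
 *   val   = (0xBEEF & 0xFFFF) << 16;     // shift caller value into position
 *   val  |= full;                        // val == 0xBEEF3344
 *
 * so get_page_from_l1e() and the write/cmpxchg below always operate on a
 * whole PTE, never a partial one.
 */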
2783 static int ptwr_emulated_write(
2784 unsigned long addr,
2785 unsigned long val,
2786 unsigned int bytes)
2788 return ptwr_emulated_update(addr, 0, val, bytes, 0);
2791 static int ptwr_emulated_cmpxchg(
2792 unsigned long addr,
2793 unsigned long old,
2794 unsigned long new,
2795 unsigned int bytes)
2797 return ptwr_emulated_update(addr, old, new, bytes, 1);
2800 static int ptwr_emulated_cmpxchg8b(
2801 unsigned long addr,
2802 unsigned long old,
2803 unsigned long old_hi,
2804 unsigned long new,
2805 unsigned long new_hi)
2807 return ptwr_emulated_update(
2808 addr, ((u64)old_hi << 32) | old, ((u64)new_hi << 32) | new, 8, 1);
2811 static struct x86_mem_emulator ptwr_mem_emulator = {
2812 .read_std = x86_emulate_read_std,
2813 .write_std = x86_emulate_write_std,
2814 .read_emulated = x86_emulate_read_std,
2815 .write_emulated = ptwr_emulated_write,
2816 .cmpxchg_emulated = ptwr_emulated_cmpxchg,
2817 .cmpxchg8b_emulated = ptwr_emulated_cmpxchg8b
2818 };
2820 /* Write page fault handler: check if guest is trying to modify a PTE. */
2821 int ptwr_do_page_fault(struct domain *d, unsigned long addr)
2823 unsigned long pfn;
2824 struct pfn_info *page;
2825 l1_pgentry_t pte;
2826 l2_pgentry_t *pl2e;
2827 int which;
2828 u32 l2_idx;
2830 if ( unlikely(shadow_mode_enabled(d)) )
2831 return 0;
2833 /*
2834 * Attempt to read the PTE that maps the VA being accessed. By checking for
2835 * PDE validity in the L2 we avoid many expensive fixups in __get_user().
2836 */
2837 if ( !(l2e_get_flags(__linear_l2_table[addr>>L2_PAGETABLE_SHIFT]) &
2838 _PAGE_PRESENT) ||
2839 __copy_from_user(&pte,&linear_pg_table[l1_linear_offset(addr)],
2840 sizeof(pte)) )
2842 return 0;
2845 pfn = l1e_get_pfn(pte);
2846 page = &frame_table[pfn];
2848 /* We are looking only for read-only mappings of p.t. pages. */
2849 if ( ((l1e_get_flags(pte) & (_PAGE_RW|_PAGE_PRESENT)) != _PAGE_PRESENT) ||
2850 ((page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table) ||
2851 ((page->u.inuse.type_info & PGT_count_mask) == 0) ||
2852 (page_get_owner(page) != d) )
2854 return 0;
2857 /* x86/64: Writable pagetable code needs auditing. Use emulator for now. */
2858 #if defined(__x86_64__)
2859 goto emulate;
2860 #endif
2862 /* Get the L2 index at which this L1 p.t. is always mapped. */
2863 l2_idx = page->u.inuse.type_info & PGT_va_mask;
2864 if ( unlikely(l2_idx >= PGT_va_unknown) )
2865 goto emulate; /* Urk! This L1 is mapped in multiple L2 slots! */
2866 l2_idx >>= PGT_va_shift;
2868 if ( unlikely(l2_idx == (addr >> L2_PAGETABLE_SHIFT)) )
2869 goto emulate; /* Urk! Pagetable maps itself! */
2871 /*
2872 * Is the L1 p.t. mapped into the current address space? If so we call it
2873 * an ACTIVE p.t., otherwise it is INACTIVE.
2874 */
2875 pl2e = &__linear_l2_table[l2_idx];
2876 which = PTWR_PT_INACTIVE;
2877 if ( (l2e_get_pfn(*pl2e)) == pfn )
2879 /*
2880 * Check the PRESENT bit to set ACTIVE mode.
2881 * If the PRESENT bit is clear, we may be conflicting with the current
2882 * ACTIVE p.t. (it may be the same p.t. mapped at another virt addr).
2883 * The ptwr_flush call below will restore the PRESENT bit.
2884 */
2885 if ( likely(l2e_get_flags(*pl2e) & _PAGE_PRESENT) ||
2886 (d->arch.ptwr[PTWR_PT_ACTIVE].l1va &&
2887 (l2_idx == d->arch.ptwr[PTWR_PT_ACTIVE].l2_idx)) )
2888 which = PTWR_PT_ACTIVE;
2891 /*
2892 * If this is a multi-processor guest then ensure that the page is hooked
2893 * into at most one L2 table, which must be the one running on this VCPU.
2894 */
2895 if ( (d->vcpu[0]->next_in_list != NULL) &&
2896 ((page->u.inuse.type_info & PGT_count_mask) !=
2897 (!!(page->u.inuse.type_info & PGT_pinned) +
2898 (which == PTWR_PT_ACTIVE))) )
2900 /* Could be conflicting writable mappings from other VCPUs. */
2901 cleanup_writable_pagetable(d);
2902 goto emulate;
2905 PTWR_PRINTK("[%c] page_fault on l1 pt at va %lx, pt for %08x, "
2906 "pfn %lx\n", PTWR_PRINT_WHICH,
2907 addr, l2_idx << L2_PAGETABLE_SHIFT, pfn);
2909 /*
2910 * We only allow one ACTIVE and one INACTIVE p.t. to be updated at a
2911 * time. If there is already one, we must flush it out.
2912 */
2913 if ( d->arch.ptwr[which].l1va )
2914 ptwr_flush(d, which);
2916 /*
2917 * If the last batch made no updates then we are probably stuck. Emulate this
2918 * update to ensure we make progress.
2919 */
2920 if ( d->arch.ptwr[which].prev_nr_updates == 0 )
2922 /* Ensure that we don't get stuck in an emulation-only rut. */
2923 d->arch.ptwr[which].prev_nr_updates = 1;
2924 goto emulate;
2927 d->arch.ptwr[which].l1va = addr | 1;
2928 d->arch.ptwr[which].l2_idx = l2_idx;
2929 d->arch.ptwr[which].vcpu = current;
2931 /* For safety, disconnect the L1 p.t. page from current space. */
2932 if ( which == PTWR_PT_ACTIVE )
2934 l2e_remove_flags(*pl2e, _PAGE_PRESENT);
2935 flush_tlb_mask(d->cpumask);
2938 /* Temporarily map the L1 page, and make a copy of it. */
2939 d->arch.ptwr[which].pl1e = map_domain_page(pfn);
2940 memcpy(d->arch.ptwr[which].page,
2941 d->arch.ptwr[which].pl1e,
2942 L1_PAGETABLE_ENTRIES * sizeof(l1_pgentry_t));
2944 /* Finally, make the p.t. page writable by the guest OS. */
2945 l1e_add_flags(pte, _PAGE_RW);
2946 if ( unlikely(__copy_to_user(&linear_pg_table[addr>>PAGE_SHIFT],
2947 &pte, sizeof(pte))) )
2949 MEM_LOG("ptwr: Could not update pte at %p", (unsigned long *)
2950 &linear_pg_table[addr>>PAGE_SHIFT]);
2951 /* Toss the writable pagetable state and crash. */
2952 unmap_domain_page(d->arch.ptwr[which].pl1e);
2953 d->arch.ptwr[which].l1va = 0;
2954 domain_crash();
2955 return 0;
2958 return EXCRET_fault_fixed;
2960 emulate:
2961 if ( x86_emulate_memop(guest_cpu_user_regs(), addr,
2962 &ptwr_mem_emulator, BITS_PER_LONG/8) )
2963 return 0;
2964 perfc_incrc(ptwr_emulations);
2965 return EXCRET_fault_fixed;
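/*
 * Worked example for the multi-VCPU check above (the concrete counts are
 * for illustration): an L1 page that is pinned and hooked into exactly one
 * L2 slot of the faulting VCPU's page table carries two type references
 * (one for PGT_pinned, one for the ACTIVE hook), which matches the expected
 * count, so the fast writable-pagetable path is safe.  Any extra type
 * reference means another VCPU may have the page hooked into its own page
 * tables, and the fault is punted to the emulator instead.
 */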
2968 int ptwr_init(struct domain *d)
2970 void *x = alloc_xenheap_page();
2971 void *y = alloc_xenheap_page();
2973 if ( (x == NULL) || (y == NULL) )
2975 if ( x != NULL )
2976 free_xenheap_page(x);
2977 if ( y != NULL )
2978 free_xenheap_page(y);
2979 return -ENOMEM;
2982 d->arch.ptwr[PTWR_PT_ACTIVE].page = x;
2983 d->arch.ptwr[PTWR_PT_INACTIVE].page = y;
2985 return 0;
2988 void ptwr_destroy(struct domain *d)
2990 cleanup_writable_pagetable(d);
2991 free_xenheap_page(d->arch.ptwr[PTWR_PT_ACTIVE].page);
2992 free_xenheap_page(d->arch.ptwr[PTWR_PT_INACTIVE].page);
2995 void cleanup_writable_pagetable(struct domain *d)
2997 if ( unlikely(!VM_ASSIST(d, VMASST_TYPE_writable_pagetables)) )
2998 return;
3000 if ( unlikely(shadow_mode_enabled(d)) )
3002 shadow_sync_all(d);
3004 else
3006 if ( d->arch.ptwr[PTWR_PT_ACTIVE].l1va )
3007 ptwr_flush(d, PTWR_PT_ACTIVE);
3008 if ( d->arch.ptwr[PTWR_PT_INACTIVE].l1va )
3009 ptwr_flush(d, PTWR_PT_INACTIVE);
3013 int map_pages_to_xen(
3014 unsigned long virt,
3015 unsigned long pfn,
3016 unsigned long nr_pfns,
3017 unsigned long flags)
3019 l2_pgentry_t *pl2e, ol2e;
3020 l1_pgentry_t *pl1e, ol1e;
3021 unsigned int i;
3023 unsigned int map_small_pages = !!(flags & MAP_SMALL_PAGES);
3024 flags &= ~MAP_SMALL_PAGES;
3026 while ( nr_pfns != 0 )
3028 pl2e = virt_to_xen_l2e(virt);
3030 if ( ((((virt>>PAGE_SHIFT) | pfn) & ((1<<PAGETABLE_ORDER)-1)) == 0) &&
3031 (nr_pfns >= (1<<PAGETABLE_ORDER)) &&
3032 !map_small_pages )
3034 /* Super-page mapping. */
3035 ol2e = *pl2e;
3036 *pl2e = l2e_from_pfn(pfn, flags|_PAGE_PSE);
3038 if ( (l2e_get_flags(ol2e) & _PAGE_PRESENT) )
3040 local_flush_tlb_pge();
3041 if ( !(l2e_get_flags(ol2e) & _PAGE_PSE) )
3042 free_xen_pagetable(l2e_get_page(*pl2e));
3045 virt += 1UL << L2_PAGETABLE_SHIFT;
3046 pfn += 1UL << PAGETABLE_ORDER;
3047 nr_pfns -= 1UL << PAGETABLE_ORDER;
3049 else
3051 /* Normal page mapping. */
3052 if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) )
3054 pl1e = page_to_virt(alloc_xen_pagetable());
3055 clear_page(pl1e);
3056 *pl2e = l2e_from_page(virt_to_page(pl1e), __PAGE_HYPERVISOR);
3058 else if ( l2e_get_flags(*pl2e) & _PAGE_PSE )
3060 pl1e = page_to_virt(alloc_xen_pagetable());
3061 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
3062 pl1e[i] = l1e_from_pfn(
3063 l2e_get_pfn(*pl2e) + i,
3064 l2e_get_flags(*pl2e) & ~_PAGE_PSE);
3065 *pl2e = l2e_from_page(virt_to_page(pl1e), __PAGE_HYPERVISOR);
3066 local_flush_tlb_pge();
3069 pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(virt);
3070 ol1e = *pl1e;
3071 *pl1e = l1e_from_pfn(pfn, flags);
3072 if ( (l1e_get_flags(ol1e) & _PAGE_PRESENT) )
3073 local_flush_tlb_one(virt);
3075 virt += 1UL << L1_PAGETABLE_SHIFT;
3076 pfn += 1UL;
3077 nr_pfns -= 1UL;
3081 return 0;
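/*
 * Usage note (hedged example): the superpage path above is taken only when
 * both virt >> PAGE_SHIFT and pfn are multiples of 1 << PAGETABLE_ORDER,
 * at least that many pfns remain, and MAP_SMALL_PAGES is clear.  For
 * instance, with 512-entry tables (PAGETABLE_ORDER == 9) a 2MB-aligned,
 * 512-pfn request costs a single PSE L2 entry.  memguard_init() below passes
 * MAP_SMALL_PAGES so the Xen heap is mapped with individual L1 entries from
 * the start, which __memguard_change_range() then remaps per page with or
 * without _PAGE_PRESENT.
 */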
3084 void __set_fixmap(
3085 enum fixed_addresses idx, unsigned long p, unsigned long flags)
3087 if ( unlikely(idx >= __end_of_fixed_addresses) )
3088 BUG();
3089 map_pages_to_xen(fix_to_virt(idx), p >> PAGE_SHIFT, 1, flags);
3092 #ifdef MEMORY_GUARD
3094 void memguard_init(void)
3096 map_pages_to_xen(
3097 PAGE_OFFSET, 0, xenheap_phys_end >> PAGE_SHIFT,
3098 __PAGE_HYPERVISOR|MAP_SMALL_PAGES);
3101 static void __memguard_change_range(void *p, unsigned long l, int guard)
3103 unsigned long _p = (unsigned long)p;
3104 unsigned long _l = (unsigned long)l;
3105 unsigned long flags = __PAGE_HYPERVISOR | MAP_SMALL_PAGES;
3107 /* Ensure we are dealing with a page-aligned whole number of pages. */
3108 ASSERT((_p&PAGE_MASK) != 0);
3109 ASSERT((_l&PAGE_MASK) != 0);
3110 ASSERT((_p&~PAGE_MASK) == 0);
3111 ASSERT((_l&~PAGE_MASK) == 0);
3113 if ( guard )
3114 flags &= ~_PAGE_PRESENT;
3116 map_pages_to_xen(
3117 _p, virt_to_phys(p) >> PAGE_SHIFT, _l >> PAGE_SHIFT, flags);
3120 void memguard_guard_range(void *p, unsigned long l)
3122 __memguard_change_range(p, l, 1);
3125 void memguard_unguard_range(void *p, unsigned long l)
3127 __memguard_change_range(p, l, 0);
3130 #endif
3132 /*
3133 * Local variables:
3134 * mode: C
3135 * c-set-style: "BSD"
3136 * c-basic-offset: 4
3137 * tab-width: 4
3138 * indent-tabs-mode: nil
3139 * End:
3140 */