xen/arch/x86/mm.c @ 5790:ebfde26a769a (ia64/xen-unstable)

Avoid some unnecessary TLB flushes. This will probably make no real
difference on any sensible guest operating system.

Signed-off-by: Steven Smith, sos22@cl.cam.ac.uk.
author:   sos22@douglas.cl.cam.ac.uk
date:     Fri Jul 15 09:24:29 2005 +0000 (2005-07-15)
parents:  ec3b7c87b577
children: a83ac0806d6b
1 /******************************************************************************
2 * arch/x86/mm.c
3 *
4 * Copyright (c) 2002-2005 K A Fraser
5 * Copyright (c) 2004 Christian Limpach
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
22 /*
23 * A description of the x86 page table API:
24 *
25 * Domains trap to do_mmu_update with a list of update requests.
26 * This is a list of (ptr, val) pairs, where the requested operation
27 * is *ptr = val.
28 *
29 * Reference counting of pages:
30 * ----------------------------
31 * Each page has two refcounts: tot_count and type_count.
32 *
33 * TOT_COUNT is the obvious reference count. It counts all uses of a
34 * physical page frame by a domain, including uses as a page directory,
35 * a page table, or simple mappings via a PTE. This count prevents a
36 * domain from releasing a frame back to the free pool when it still holds
37 * a reference to it.
38 *
39 * TYPE_COUNT is more subtle. A frame can be put to one of three
40 * mutually-exclusive uses: it might be used as a page directory, or a
41 * page table, or it may be mapped writable by the domain [of course, a
42 * frame might be used in none of these three ways!].
43 * So, type_count is a count of the number of times a frame is being
44 * referred to in its current incarnation. Therefore, a page can only
45 * change its type when its type count is zero.
46 *
47 * Pinning the page type:
48 * ----------------------
49 * The type of a page can be pinned/unpinned with the commands
50 * MMUEXT_[UN]PIN_L?_TABLE. Each page can be pinned exactly once (that is,
51 * pinning is not reference counted, so it can't be nested).
52 * This is useful to prevent a page's type count falling to zero, at which
53 * point safety checks would need to be carried out next time the count
54 * is increased again.
55 *
56 * A further note on writable page mappings:
57 * -----------------------------------------
58 * For simplicity, the count of writable mappings for a page may not
59 * correspond to reality. The 'writable count' is incremented for every
60 * PTE which maps the page with the _PAGE_RW flag set. However, for
61 * write access to be possible the page directory entry must also have
62 * its _PAGE_RW bit set. We do not check this as it complicates the
63 * reference counting considerably [consider the case of multiple
64 * directory entries referencing a single page table, some with the RW
65 * bit set, others not -- it starts getting a bit messy].
66 * In normal use, this simplification shouldn't be a problem.
67 * However, the logic can be added if required.
68 *
69 * One more note on read-only page mappings:
70 * -----------------------------------------
71 * We want domains to be able to map pages for read-only access. The
72 * main reason is that page tables and directories should be readable
73 * by a domain, but it would not be safe for them to be writable.
74 * However, domains have free access to rings 1 & 2 of the Intel
75 * privilege model. In terms of page protection, these are considered
76 * to be part of 'supervisor mode'. The WP bit in CR0 controls whether
77 * read-only restrictions are respected in supervisor mode -- if the
78 * bit is clear then any mapped page is writable.
79 *
80 * We get round this by always setting the WP bit and disallowing
81 * updates to it. This is very unlikely to cause a problem for guest
82 * OS's, which will generally use the WP bit to simplify copy-on-write
83 * implementation (in that case, OS wants a fault when it writes to
84 * an application-supplied buffer).
85 */
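/*
 * Illustrative sketch (not part of this changeset): the usual way the two
 * reference counts described above are taken and dropped together.  The
 * helper name here is made up; get_page_and_type()/put_page_and_type() are
 * the real primitives used throughout this file.
 */
static int sketch_use_frame_as_writable(struct pfn_info *page, struct domain *d)
{
    /* Bumps tot_count and takes a PGT_writable_page type reference. */
    if ( !get_page_and_type(page, d, PGT_writable_page) )
        return 0;

    /* ... the frame may now be treated as plain writable memory ... */

    /* Drop both references; once type_count reaches zero the frame may be
     * revalidated as, e.g., a page table. */
    put_page_and_type(page);
    return 1;
}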
87 #include <xen/config.h>
88 #include <xen/init.h>
89 #include <xen/kernel.h>
90 #include <xen/lib.h>
91 #include <xen/mm.h>
92 #include <xen/sched.h>
93 #include <xen/errno.h>
94 #include <xen/perfc.h>
95 #include <xen/irq.h>
96 #include <xen/softirq.h>
97 #include <xen/domain_page.h>
98 #include <asm/shadow.h>
99 #include <asm/page.h>
100 #include <asm/flushtlb.h>
101 #include <asm/io.h>
102 #include <asm/uaccess.h>
103 #include <asm/ldt.h>
104 #include <asm/x86_emulate.h>
106 #ifdef VERBOSE
107 #define MEM_LOG(_f, _a...) \
108 printk("DOM%u: (file=mm.c, line=%d) " _f "\n", \
109 current->domain->domain_id , __LINE__ , ## _a )
110 #else
111 #define MEM_LOG(_f, _a...) ((void)0)
112 #endif
114 /*
115 * Both do_mmuext_op() and do_mmu_update():
116 * We steal the m.s.b. of the @count parameter to indicate whether this
117 * invocation of do_mmu_update() is resuming a previously preempted call.
118 */
119 #define MMU_UPDATE_PREEMPTED (~(~0U>>1))
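/*
 * Illustrative sketch (not part of this changeset): MMU_UPDATE_PREEMPTED is
 * simply the top bit of an unsigned int, so a preempted hypercall re-issues
 * itself with (remaining | flag) and the resumed invocation strips the flag
 * before looping, exactly as do_mmu_update()/do_mmuext_op() do below.
 */
static inline unsigned int sketch_encode_resume_count(unsigned int remaining)
{
    return remaining | MMU_UPDATE_PREEMPTED;    /* set the m.s.b. */
}
static inline unsigned int sketch_decode_resume_count(unsigned int count)
{
    return count & ~MMU_UPDATE_PREEMPTED;       /* clear the m.s.b. */
}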
121 static void free_l2_table(struct pfn_info *page);
122 static void free_l1_table(struct pfn_info *page);
124 static int mod_l2_entry(l2_pgentry_t *, l2_pgentry_t, unsigned long,
125 unsigned int type);
126 static int mod_l1_entry(l1_pgentry_t *, l1_pgentry_t);
128 /* Used to defer flushing of memory structures. */
129 static struct {
130 #define DOP_FLUSH_TLB (1<<0) /* Flush the TLB. */
131 #define DOP_RELOAD_LDT (1<<1) /* Reload the LDT shadow mapping. */
132 unsigned int deferred_ops;
133 /* If non-NULL, specifies a foreign subject domain for some operations. */
134 struct domain *foreign;
135 } __cacheline_aligned percpu_info[NR_CPUS];
137 /*
138 * Returns the current foreign domain; defaults to the currently-executing
139 * domain if a foreign override hasn't been specified.
140 */
141 #define FOREIGNDOM (percpu_info[smp_processor_id()].foreign ? : current->domain)
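/*
 * Illustrative note (not part of this changeset): the GNU "x ? : y" form in
 * FOREIGNDOM yields 'x' when it is non-NULL and 'y' otherwise, evaluating
 * 'x' only once.  Ignoring that double evaluation, it expands to roughly:
 */
#define SKETCH_FOREIGNDOM                                               \
    (percpu_info[smp_processor_id()].foreign != NULL ?                  \
     percpu_info[smp_processor_id()].foreign : current->domain)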
143 /* Private domain structs for DOMID_XEN and DOMID_IO. */
144 static struct domain *dom_xen, *dom_io;
146 /* Frame table and its size in pages. */
147 struct pfn_info *frame_table;
148 unsigned long max_page;
150 void __init init_frametable(void)
151 {
152 unsigned long nr_pages, page_step, i, pfn;
154 frame_table = (struct pfn_info *)FRAMETABLE_VIRT_START;
156 nr_pages = PFN_UP(max_page * sizeof(*frame_table));
157 page_step = (1 << L2_PAGETABLE_SHIFT) >> PAGE_SHIFT;
159 for ( i = 0; i < nr_pages; i += page_step )
160 {
161 pfn = alloc_boot_pages(min(nr_pages - i, page_step), page_step);
162 if ( pfn == 0 )
163 panic("Not enough memory for frame table\n");
164 map_pages_to_xen(
165 FRAMETABLE_VIRT_START + (i << PAGE_SHIFT),
166 pfn, page_step, PAGE_HYPERVISOR);
167 }
169 memset(frame_table, 0, nr_pages << PAGE_SHIFT);
170 }
172 void arch_init_memory(void)
173 {
174 extern void subarch_init_memory(struct domain *);
176 unsigned long i, pfn, rstart_pfn, rend_pfn;
177 struct pfn_info *page;
179 memset(percpu_info, 0, sizeof(percpu_info));
181 /*
182 * Initialise our DOMID_XEN domain.
183 * Any Xen-heap pages that we will allow to be mapped will have
184 * their domain field set to dom_xen.
185 */
186 dom_xen = alloc_domain_struct();
187 atomic_set(&dom_xen->refcnt, 1);
188 dom_xen->domain_id = DOMID_XEN;
190 /*
191 * Initialise our DOMID_IO domain.
192 * This domain owns I/O pages that are within the range of the pfn_info
193 * array. Mappings occur at the privilege level of the caller.
194 */
195 dom_io = alloc_domain_struct();
196 atomic_set(&dom_io->refcnt, 1);
197 dom_io->domain_id = DOMID_IO;
199 /* First 1MB of RAM is historically marked as I/O. */
200 for ( i = 0; i < 0x100; i++ )
201 {
202 page = &frame_table[i];
203 page->count_info = PGC_allocated | 1;
204 page->u.inuse.type_info = PGT_writable_page | PGT_validated | 1;
205 page_set_owner(page, dom_io);
206 }
208 /* Any areas not specified as RAM by the e820 map are considered I/O. */
209 for ( i = 0, pfn = 0; i < e820.nr_map; i++ )
210 {
211 if ( e820.map[i].type != E820_RAM )
212 continue;
213 /* Every page from cursor to start of next RAM region is I/O. */
214 rstart_pfn = PFN_UP(e820.map[i].addr);
215 rend_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
216 while ( pfn < rstart_pfn )
217 {
218 BUG_ON(!pfn_valid(pfn));
219 page = &frame_table[pfn++];
220 page->count_info = PGC_allocated | 1;
221 page->u.inuse.type_info = PGT_writable_page | PGT_validated | 1;
222 page_set_owner(page, dom_io);
223 }
224 /* Skip the RAM region. */
225 pfn = rend_pfn;
226 }
227 BUG_ON(pfn != max_page);
229 subarch_init_memory(dom_xen);
230 }
232 void write_ptbase(struct vcpu *v)
233 {
234 write_cr3(pagetable_get_paddr(v->arch.monitor_table));
235 }
237 void invalidate_shadow_ldt(struct vcpu *v)
238 {
239 int i;
240 unsigned long pfn;
241 struct pfn_info *page;
243 if ( v->arch.shadow_ldt_mapcnt == 0 )
244 return;
246 v->arch.shadow_ldt_mapcnt = 0;
248 for ( i = 16; i < 32; i++ )
249 {
250 pfn = l1e_get_pfn(v->arch.perdomain_ptes[i]);
251 if ( pfn == 0 ) continue;
252 v->arch.perdomain_ptes[i] = l1e_empty();
253 page = &frame_table[pfn];
254 ASSERT_PAGE_IS_TYPE(page, PGT_ldt_page);
255 ASSERT_PAGE_IS_DOMAIN(page, v->domain);
256 put_page_and_type(page);
257 }
259 /* Dispose of the (now possibly invalid) mappings from the TLB. */
260 percpu_info[v->processor].deferred_ops |= DOP_FLUSH_TLB | DOP_RELOAD_LDT;
261 }
264 static int alloc_segdesc_page(struct pfn_info *page)
265 {
266 struct desc_struct *descs;
267 int i;
269 descs = map_domain_page(page_to_pfn(page));
271 for ( i = 0; i < 512; i++ )
272 if ( unlikely(!check_descriptor(&descs[i])) )
273 goto fail;
275 unmap_domain_page(descs);
276 return 1;
278 fail:
279 unmap_domain_page(descs);
280 return 0;
281 }
284 /* Map shadow page at offset @off. */
285 int map_ldt_shadow_page(unsigned int off)
286 {
287 struct vcpu *v = current;
288 struct domain *d = v->domain;
289 unsigned long gpfn, gmfn;
290 l1_pgentry_t l1e, nl1e;
291 unsigned long gva = v->arch.guest_context.ldt_base + (off << PAGE_SHIFT);
292 int res;
294 #if defined(__x86_64__)
295 /* If in user mode, switch to kernel mode just to read LDT mapping. */
296 extern void toggle_guest_mode(struct vcpu *);
297 int user_mode = !(v->arch.flags & TF_kernel_mode);
298 #define TOGGLE_MODE() if ( user_mode ) toggle_guest_mode(v)
299 #elif defined(__i386__)
300 #define TOGGLE_MODE() ((void)0)
301 #endif
303 BUG_ON(unlikely(in_irq()));
305 shadow_sync_va(v, gva);
307 TOGGLE_MODE();
308 __copy_from_user(&l1e, &linear_pg_table[l1_linear_offset(gva)],
309 sizeof(l1e));
310 TOGGLE_MODE();
312 if ( unlikely(!(l1e_get_flags(l1e) & _PAGE_PRESENT)) )
313 return 0;
315 gpfn = l1e_get_pfn(l1e);
316 gmfn = __gpfn_to_mfn(d, gpfn);
317 if ( unlikely(!VALID_MFN(gmfn)) )
318 return 0;
320 res = get_page_and_type(&frame_table[gmfn], d, PGT_ldt_page);
322 if ( !res && unlikely(shadow_mode_refcounts(d)) )
323 {
324 shadow_lock(d);
325 shadow_remove_all_write_access(d, gpfn, gmfn);
326 res = get_page_and_type(&frame_table[gmfn], d, PGT_ldt_page);
327 shadow_unlock(d);
328 }
330 if ( unlikely(!res) )
331 return 0;
333 nl1e = l1e_from_pfn(gmfn, l1e_get_flags(l1e) | _PAGE_RW);
335 v->arch.perdomain_ptes[off + 16] = nl1e;
336 v->arch.shadow_ldt_mapcnt++;
338 return 1;
339 }
342 static int get_page_from_pagenr(unsigned long page_nr, struct domain *d)
343 {
344 struct pfn_info *page = &frame_table[page_nr];
346 if ( unlikely(!pfn_valid(page_nr)) || unlikely(!get_page(page, d)) )
347 {
348 MEM_LOG("Could not get page ref for pfn %lx", page_nr);
349 return 0;
350 }
352 return 1;
353 }
356 static int get_page_and_type_from_pagenr(unsigned long page_nr,
357 u32 type,
358 struct domain *d)
359 {
360 struct pfn_info *page = &frame_table[page_nr];
362 if ( unlikely(!get_page_from_pagenr(page_nr, d)) )
363 return 0;
365 if ( unlikely(!get_page_type(page, type)) )
366 {
367 if ( (type & PGT_type_mask) != PGT_l1_page_table )
368 MEM_LOG("Bad page type for pfn %lx (%08x)",
369 page_nr, page->u.inuse.type_info);
370 put_page(page);
371 return 0;
372 }
374 return 1;
375 }
377 /*
378 * We allow root tables to map each other (a.k.a. linear page tables). It
379 * needs some special care with reference counts and access permissions:
380 * 1. The mapping entry must be read-only, or the guest may get write access
381 * to its own PTEs.
382 * 2. We must only bump the reference counts for an *already validated*
383 * L2 table, or we can end up in a deadlock in get_page_type() by waiting
384 * on a validation that is required to complete that validation.
385 * 3. We only need to increment the reference counts for the mapped page
386 * frame if it is mapped by a different root table. This is sufficient and
387 * also necessary to allow validation of a root table mapping itself.
388 */
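/*
 * Illustrative guest-side sketch (HYPERVISOR_mmu_update is the usual guest
 * wrapper, assumed here; it is not defined in this file): creating a linear
 * page-table mapping via do_mmu_update.  Per rule 1 above the
 * self-referencing entry must be read-only, so _PAGE_RW is deliberately
 * left clear.
 */
static int sketch_install_linear_mapping(unsigned long root_mfn,
                                         unsigned int slot)
{
    mmu_update_t req;

    /* Low bits of ptr are zero => MMU_NORMAL_PT_UPDATE. */
    req.ptr = (root_mfn << PAGE_SHIFT) + (slot * sizeof(root_pgentry_t));
    /* Point the slot back at the root table itself: present, not writable. */
    req.val = (root_mfn << PAGE_SHIFT) | _PAGE_PRESENT | _PAGE_ACCESSED;

    return HYPERVISOR_mmu_update(&req, 1, NULL, DOMID_SELF);
}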
389 static int
390 get_linear_pagetable(
391 root_pgentry_t re, unsigned long re_pfn, struct domain *d)
392 {
393 u32 x, y;
394 struct pfn_info *page;
395 unsigned long pfn;
397 ASSERT( !shadow_mode_refcounts(d) );
399 if ( (root_get_flags(re) & _PAGE_RW) )
400 {
401 MEM_LOG("Attempt to create linear p.t. with write perms");
402 return 0;
403 }
405 if ( (pfn = root_get_pfn(re)) != re_pfn )
406 {
407 /* Make sure the mapped frame belongs to the correct domain. */
408 if ( unlikely(!get_page_from_pagenr(pfn, d)) )
409 return 0;
411 /*
412 * Make sure that the mapped frame is an already-validated L2 table.
413 * If so, atomically increment the count (checking for overflow).
414 */
415 page = &frame_table[pfn];
416 y = page->u.inuse.type_info;
417 do {
418 x = y;
419 if ( unlikely((x & PGT_count_mask) == PGT_count_mask) ||
420 unlikely((x & (PGT_type_mask|PGT_validated)) !=
421 (PGT_root_page_table|PGT_validated)) )
422 {
423 put_page(page);
424 return 0;
425 }
426 }
427 while ( (y = cmpxchg(&page->u.inuse.type_info, x, x + 1)) != x );
428 }
430 return 1;
431 }
433 int
434 get_page_from_l1e(
435 l1_pgentry_t l1e, struct domain *d)
436 {
437 unsigned long mfn = l1e_get_pfn(l1e);
438 struct pfn_info *page = &frame_table[mfn];
439 extern int domain_iomem_in_pfn(struct domain *d, unsigned long pfn);
441 if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) )
442 return 1;
444 if ( unlikely(l1e_get_flags(l1e) & L1_DISALLOW_MASK) )
445 {
446 MEM_LOG("Bad L1 flags %x\n", l1e_get_flags(l1e) & L1_DISALLOW_MASK);
447 return 0;
448 }
450 if ( unlikely(!pfn_valid(mfn)) ||
451 unlikely(page_get_owner(page) == dom_io) )
452 {
453 /* DOMID_IO reverts to caller for privilege checks. */
454 if ( d == dom_io )
455 d = current->domain;
457 if ( (!IS_PRIV(d)) &&
458 (!IS_CAPABLE_PHYSDEV(d) || !domain_iomem_in_pfn(d, mfn)) )
459 {
460 MEM_LOG("Non-privileged attempt to map I/O space %08lx", mfn);
461 return 0;
462 }
464 /* No reference counting for out-of-range I/O pages. */
465 if ( !pfn_valid(mfn) )
466 return 1;
468 d = dom_io;
469 }
471 return ((l1e_get_flags(l1e) & _PAGE_RW) ?
472 get_page_and_type(page, d, PGT_writable_page) :
473 get_page(page, d));
474 }
477 /* NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. */
478 static int
479 get_page_from_l2e(
480 l2_pgentry_t l2e, unsigned long pfn,
481 struct domain *d, unsigned long vaddr)
482 {
483 int rc;
485 ASSERT(!shadow_mode_refcounts(d));
487 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
488 return 1;
490 if ( unlikely((l2e_get_flags(l2e) & L2_DISALLOW_MASK)) )
491 {
492 MEM_LOG("Bad L2 flags %x\n", l2e_get_flags(l2e) & L2_DISALLOW_MASK);
493 return 0;
494 }
496 vaddr >>= L2_PAGETABLE_SHIFT;
497 vaddr <<= PGT_va_shift;
498 rc = get_page_and_type_from_pagenr(
499 l2e_get_pfn(l2e), PGT_l1_page_table | vaddr, d);
501 #if CONFIG_PAGING_LEVELS == 2
502 if (!rc)
503 rc = get_linear_pagetable(l2e, pfn, d);
504 #endif
505 return rc;
506 }
509 #if CONFIG_PAGING_LEVELS >= 3
511 static int
512 get_page_from_l3e(
513 l3_pgentry_t l3e, unsigned long pfn,
514 struct domain *d, unsigned long vaddr)
515 {
516 ASSERT( !shadow_mode_refcounts(d) );
518 int rc;
520 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
521 return 1;
523 if ( unlikely((l3e_get_flags(l3e) & L3_DISALLOW_MASK)) )
524 {
525 MEM_LOG("Bad L3 flags %x\n", l3e_get_flags(l3e) & L3_DISALLOW_MASK);
526 return 0;
527 }
529 vaddr >>= L3_PAGETABLE_SHIFT;
530 vaddr <<= PGT_va_shift;
531 rc = get_page_and_type_from_pagenr(
532 l3e_get_pfn(l3e),
533 PGT_l2_page_table | vaddr, d);
534 #if CONFIG_PAGING_LEVELS == 3
535 if (!rc)
536 rc = get_linear_pagetable(l3e, pfn, d);
537 #endif
538 return rc;
539 }
541 #endif /* 3 level */
543 #if CONFIG_PAGING_LEVELS >= 4
545 static int
546 get_page_from_l4e(
547 l4_pgentry_t l4e, unsigned long pfn, struct domain *d)
548 {
549 int rc;
551 ASSERT( !shadow_mode_refcounts(d) );
553 if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
554 return 1;
556 if ( unlikely((l4e_get_flags(l4e) & L4_DISALLOW_MASK)) )
557 {
558 MEM_LOG("Bad L4 flags %x\n", l4e_get_flags(l4e) & L4_DISALLOW_MASK);
559 return 0;
560 }
562 rc = get_page_and_type_from_pagenr(
563 l4e_get_pfn(l4e), PGT_l3_page_table, d);
565 if ( unlikely(!rc) )
566 return get_linear_pagetable(l4e, pfn, d);
568 return 1;
569 }
571 #endif /* 4 level */
574 void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d)
575 {
576 unsigned long pfn = l1e_get_pfn(l1e);
577 struct pfn_info *page = &frame_table[pfn];
578 struct domain *e;
580 if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) || !pfn_valid(pfn) )
581 return;
583 e = page_get_owner(page);
584 if ( unlikely(e != d) )
585 {
586 /*
587 * Unmap a foreign page that may have been mapped via a grant table.
588 * Note that this can fail for a privileged domain that can map foreign
589 * pages via MMUEXT_SET_FOREIGNDOM. Such domains can have some mappings
590 * counted via a grant entry and some counted directly in the page
591 * structure's reference count. Note that reference counts won't get
592 * dangerously confused as long as we always try to decrement the
593 * grant entry first. We may end up with a mismatch between which
594 * mappings and which unmappings are counted via the grant entry, but
595 * really it doesn't matter as privileged domains have carte blanche.
596 */
597 if (likely(gnttab_check_unmap(e, d, pfn,
598 !(l1e_get_flags(l1e) & _PAGE_RW))))
599 return;
600 /* Assume this mapping was made via MMUEXT_SET_FOREIGNDOM... */
601 }
603 if ( l1e_get_flags(l1e) & _PAGE_RW )
604 {
605 put_page_and_type(page);
606 }
607 else
608 {
609 /* We expect this is rare so we blow the entire shadow LDT. */
610 if ( unlikely(((page->u.inuse.type_info & PGT_type_mask) ==
611 PGT_ldt_page)) &&
612 unlikely(((page->u.inuse.type_info & PGT_count_mask) != 0)) )
614 // XXX SMP BUG?
615 invalidate_shadow_ldt(e->vcpu[0]);
616 put_page(page);
617 }
618 }
621 /*
622 * NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'.
623 * Note also that this automatically deals correctly with linear p.t.'s.
624 */
625 static void put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
626 {
627 if ( (l2e_get_flags(l2e) & _PAGE_PRESENT) &&
628 (l2e_get_pfn(l2e) != pfn) )
629 put_page_and_type(&frame_table[l2e_get_pfn(l2e)]);
630 }
633 #if CONFIG_PAGING_LEVELS >= 3
635 static void put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn)
636 {
637 if ( (l3e_get_flags(l3e) & _PAGE_PRESENT) &&
638 (l3e_get_pfn(l3e) != pfn) )
639 put_page_and_type(&frame_table[l3e_get_pfn(l3e)]);
640 }
642 #endif
644 #if CONFIG_PAGING_LEVELS >= 4
646 static void put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn)
647 {
648 if ( (l4e_get_flags(l4e) & _PAGE_PRESENT) &&
649 (l4e_get_pfn(l4e) != pfn) )
650 put_page_and_type(&frame_table[l4e_get_pfn(l4e)]);
651 }
653 #endif
656 static int alloc_l1_table(struct pfn_info *page)
657 {
658 struct domain *d = page_get_owner(page);
659 unsigned long pfn = page_to_pfn(page);
660 l1_pgentry_t *pl1e;
661 int i;
663 ASSERT(!shadow_mode_refcounts(d));
665 pl1e = map_domain_page(pfn);
667 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
668 if ( is_guest_l1_slot(i) &&
669 unlikely(!get_page_from_l1e(pl1e[i], d)) )
670 goto fail;
672 unmap_domain_page(pl1e);
673 return 1;
675 fail:
676 while ( i-- > 0 )
677 if ( is_guest_l1_slot(i) )
678 put_page_from_l1e(pl1e[i], d);
680 unmap_domain_page(pl1e);
681 return 0;
682 }
684 #ifdef CONFIG_X86_PAE
685 static int create_pae_xen_mappings(l3_pgentry_t *pl3e)
686 {
687 struct pfn_info *page;
688 l2_pgentry_t *pl2e;
689 l3_pgentry_t l3e3;
690 int i;
692 pl3e = (l3_pgentry_t *)((unsigned long)pl3e & PAGE_MASK);
694 /* 3rd L3 slot contains L2 with Xen-private mappings. It *must* exist. */
695 l3e3 = pl3e[3];
696 if ( !(l3e_get_flags(l3e3) & _PAGE_PRESENT) )
697 {
698 MEM_LOG("PAE L3 3rd slot is empty");
699 return 0;
700 }
702 /*
703 * The Xen-private mappings include linear mappings. The L2 thus cannot
704 * be shared by multiple L3 tables. The test here is adequate because:
705 * 1. Cannot appear in slots != 3 because the page would then have an
706 * unknown va backpointer, which get_page_type() explicitly disallows.
707 * 2. Cannot appear in another page table's L3:
708 * a. alloc_l3_table() calls this function and this check will fail
709 * b. mod_l3_entry() disallows updates to slot 3 in an existing table
710 */
711 page = l3e_get_page(l3e3);
712 BUG_ON(page->u.inuse.type_info & PGT_pinned);
713 BUG_ON((page->u.inuse.type_info & PGT_count_mask) == 0);
714 if ( (page->u.inuse.type_info & PGT_count_mask) != 1 )
715 {
716 MEM_LOG("PAE L3 3rd slot is shared");
717 return 0;
718 }
720 /* Xen private mappings. */
721 pl2e = map_domain_page(l3e_get_pfn(l3e3));
722 memcpy(&pl2e[L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1)],
723 &idle_pg_table_l2[L2_PAGETABLE_FIRST_XEN_SLOT],
724 L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
725 for ( i = 0; i < (PERDOMAIN_MBYTES >> (L2_PAGETABLE_SHIFT - 20)); i++ )
726 pl2e[l2_table_offset(PERDOMAIN_VIRT_START) + i] =
727 l2e_from_page(
728 virt_to_page(page_get_owner(page)->arch.mm_perdomain_pt) + i,
729 __PAGE_HYPERVISOR);
730 for ( i = 0; i < (LINEARPT_MBYTES >> (L2_PAGETABLE_SHIFT - 20)); i++ )
731 pl2e[l2_table_offset(LINEAR_PT_VIRT_START) + i] =
732 (l3e_get_flags(pl3e[i]) & _PAGE_PRESENT) ?
733 l2e_from_pfn(l3e_get_pfn(pl3e[i]), __PAGE_HYPERVISOR) :
734 l2e_empty();
735 unmap_domain_page(pl2e);
737 return 1;
738 }
740 static inline int l1_backptr(
741 unsigned long *backptr, unsigned long offset_in_l2, unsigned long l2_type)
742 {
743 unsigned long l2_backptr = l2_type & PGT_va_mask;
744 BUG_ON(l2_backptr == PGT_va_unknown);
745 if ( l2_backptr == PGT_va_mutable )
746 return 0;
747 *backptr =
748 ((l2_backptr >> PGT_va_shift) << L3_PAGETABLE_SHIFT) |
749 (offset_in_l2 << L2_PAGETABLE_SHIFT);
750 return 1;
751 }
753 #else
754 # define create_pae_xen_mappings(pl3e) (1)
755 # define l1_backptr(bp,l2o,l2t) \
756 ({ *(bp) = (unsigned long)(l2o) << L2_PAGETABLE_SHIFT; 1; })
757 #endif
759 static int alloc_l2_table(struct pfn_info *page, unsigned int type)
760 {
761 struct domain *d = page_get_owner(page);
762 unsigned long pfn = page_to_pfn(page);
763 unsigned long vaddr;
764 l2_pgentry_t *pl2e;
765 int i;
767 /* See the code in shadow_promote() to understand why this is here. */
768 if ( (PGT_base_page_table == PGT_l2_page_table) &&
769 unlikely(shadow_mode_refcounts(d)) )
770 return 1;
771 ASSERT(!shadow_mode_refcounts(d));
773 pl2e = map_domain_page(pfn);
775 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
776 {
777 if ( !l1_backptr(&vaddr, i, type) )
778 goto fail;
779 if ( is_guest_l2_slot(type, i) &&
780 unlikely(!get_page_from_l2e(pl2e[i], pfn, d, vaddr)) )
781 goto fail;
782 }
784 #if CONFIG_PAGING_LEVELS == 2
785 /* Xen private mappings. */
786 memcpy(&pl2e[L2_PAGETABLE_FIRST_XEN_SLOT],
787 &idle_pg_table[L2_PAGETABLE_FIRST_XEN_SLOT],
788 L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
789 pl2e[l2_table_offset(LINEAR_PT_VIRT_START)] =
790 l2e_from_pfn(pfn, __PAGE_HYPERVISOR);
791 pl2e[l2_table_offset(PERDOMAIN_VIRT_START)] =
792 l2e_from_page(
793 virt_to_page(page_get_owner(page)->arch.mm_perdomain_pt),
794 __PAGE_HYPERVISOR);
795 #endif
797 unmap_domain_page(pl2e);
798 return 1;
800 fail:
801 while ( i-- > 0 )
802 if ( is_guest_l2_slot(type, i) )
803 put_page_from_l2e(pl2e[i], pfn);
805 unmap_domain_page(pl2e);
806 return 0;
807 }
810 #if CONFIG_PAGING_LEVELS >= 3
811 static int alloc_l3_table(struct pfn_info *page)
812 {
813 struct domain *d = page_get_owner(page);
814 unsigned long pfn = page_to_pfn(page);
815 unsigned long vaddr;
816 l3_pgentry_t *pl3e;
817 int i;
819 ASSERT(!shadow_mode_refcounts(d));
821 pl3e = map_domain_page(pfn);
822 for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
823 {
824 vaddr = (unsigned long)i << L3_PAGETABLE_SHIFT;
825 if ( is_guest_l3_slot(i) &&
826 unlikely(!get_page_from_l3e(pl3e[i], pfn, d, vaddr)) )
827 goto fail;
828 }
830 if ( !create_pae_xen_mappings(pl3e) )
831 goto fail;
833 unmap_domain_page(pl3e);
834 return 1;
836 fail:
837 while ( i-- > 0 )
838 if ( is_guest_l3_slot(i) )
839 put_page_from_l3e(pl3e[i], pfn);
841 unmap_domain_page(pl3e);
842 return 0;
843 }
844 #else
845 #define alloc_l3_table(page) (0)
846 #endif
848 #if CONFIG_PAGING_LEVELS >= 4
849 static int alloc_l4_table(struct pfn_info *page)
850 {
851 struct domain *d = page_get_owner(page);
852 unsigned long pfn = page_to_pfn(page);
853 l4_pgentry_t *pl4e = page_to_virt(page);
854 int i;
856 /* See the code in shadow_promote() to understand why this is here. */
857 if ( (PGT_base_page_table == PGT_l4_page_table) &&
858 shadow_mode_refcounts(d) )
859 return 1;
860 ASSERT(!shadow_mode_refcounts(d));
862 for ( i = 0; i < L4_PAGETABLE_ENTRIES; i++ )
863 if ( is_guest_l4_slot(i) &&
864 unlikely(!get_page_from_l4e(pl4e[i], pfn, d)) )
865 goto fail;
867 /* Xen private mappings. */
868 memcpy(&pl4e[ROOT_PAGETABLE_FIRST_XEN_SLOT],
869 &idle_pg_table[ROOT_PAGETABLE_FIRST_XEN_SLOT],
870 ROOT_PAGETABLE_XEN_SLOTS * sizeof(l4_pgentry_t));
871 pl4e[l4_table_offset(LINEAR_PT_VIRT_START)] =
872 l4e_from_pfn(pfn, __PAGE_HYPERVISOR);
873 pl4e[l4_table_offset(PERDOMAIN_VIRT_START)] =
874 l4e_from_page(
875 virt_to_page(page_get_owner(page)->arch.mm_perdomain_l3),
876 __PAGE_HYPERVISOR);
878 return 1;
880 fail:
881 while ( i-- > 0 )
882 if ( is_guest_l4_slot(i) )
883 put_page_from_l4e(pl4e[i], pfn);
885 return 0;
886 }
887 #else
888 #define alloc_l4_table(page) (0)
889 #endif
892 static void free_l1_table(struct pfn_info *page)
893 {
894 struct domain *d = page_get_owner(page);
895 unsigned long pfn = page_to_pfn(page);
896 l1_pgentry_t *pl1e;
897 int i;
899 pl1e = map_domain_page(pfn);
901 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
902 if ( is_guest_l1_slot(i) )
903 put_page_from_l1e(pl1e[i], d);
905 unmap_domain_page(pl1e);
906 }
909 static void free_l2_table(struct pfn_info *page)
910 {
911 unsigned long pfn = page_to_pfn(page);
912 l2_pgentry_t *pl2e;
913 int i;
915 pl2e = map_domain_page(pfn);
917 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
918 if ( is_guest_l2_slot(page->u.inuse.type_info, i) )
919 put_page_from_l2e(pl2e[i], pfn);
921 unmap_domain_page(pl2e);
922 }
925 #if CONFIG_PAGING_LEVELS >= 3
927 static void free_l3_table(struct pfn_info *page)
928 {
929 unsigned long pfn = page_to_pfn(page);
930 l3_pgentry_t *pl3e;
931 int i;
933 pl3e = map_domain_page(pfn);
935 for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
936 if ( is_guest_l3_slot(i) )
937 put_page_from_l3e(pl3e[i], pfn);
939 unmap_domain_page(pl3e);
940 }
942 #endif
944 #if CONFIG_PAGING_LEVELS >= 4
946 static void free_l4_table(struct pfn_info *page)
947 {
948 unsigned long pfn = page_to_pfn(page);
949 l4_pgentry_t *pl4e = page_to_virt(page);
950 int i;
952 for ( i = 0; i < L4_PAGETABLE_ENTRIES; i++ )
953 if ( is_guest_l4_slot(i) )
954 put_page_from_l4e(pl4e[i], pfn);
955 }
957 #endif
959 static inline int update_l1e(l1_pgentry_t *pl1e,
960 l1_pgentry_t ol1e,
961 l1_pgentry_t nl1e)
962 {
963 intpte_t o = l1e_get_intpte(ol1e);
964 intpte_t n = l1e_get_intpte(nl1e);
966 if ( unlikely(cmpxchg_user(pl1e, o, n) != 0) ||
967 unlikely(o != l1e_get_intpte(ol1e)) )
968 {
969 MEM_LOG("Failed to update %" PRIpte " -> %" PRIpte
970 ": saw %" PRIpte "\n",
971 l1e_get_intpte(ol1e),
972 l1e_get_intpte(nl1e),
973 o);
974 return 0;
975 }
976 return 1;
977 }
980 /* Update the L1 entry at pl1e to new value nl1e. */
981 static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e)
982 {
983 l1_pgentry_t ol1e;
984 struct domain *d = current->domain;
986 if ( unlikely(__copy_from_user(&ol1e, pl1e, sizeof(ol1e)) != 0) )
987 return 0;
989 if ( unlikely(shadow_mode_refcounts(d)) )
990 return update_l1e(pl1e, ol1e, nl1e);
992 if ( l1e_get_flags(nl1e) & _PAGE_PRESENT )
993 {
994 if ( unlikely(l1e_get_flags(nl1e) & L1_DISALLOW_MASK) )
995 {
996 MEM_LOG("Bad L1 flags %x\n",
997 l1e_get_flags(nl1e) & L1_DISALLOW_MASK);
998 return 0;
999 }
1001 /* Fast path for identical mapping, r/w and presence. */
1002 if ( !l1e_has_changed(ol1e, nl1e, _PAGE_RW | _PAGE_PRESENT))
1003 return update_l1e(pl1e, ol1e, nl1e);
1005 if ( unlikely(!get_page_from_l1e(nl1e, FOREIGNDOM)) )
1006 return 0;
1008 if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
1009 {
1010 put_page_from_l1e(nl1e, d);
1011 return 0;
1012 }
1013 }
1014 else
1015 {
1016 if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
1017 return 0;
1018 }
1020 put_page_from_l1e(ol1e, d);
1021 return 1;
1022 }
1024 #define UPDATE_ENTRY(_t,_p,_o,_n) ({ \
1025 intpte_t __o = cmpxchg((intpte_t *)(_p), \
1026 _t ## e_get_intpte(_o), \
1027 _t ## e_get_intpte(_n)); \
1028 if ( __o != _t ## e_get_intpte(_o) ) \
1029 MEM_LOG("Failed to update %" PRIpte " -> %" PRIpte \
1030 ": saw %" PRIpte "", \
1031 (_t ## e_get_intpte(_o)), \
1032 (_t ## e_get_intpte(_n)), \
1033 (__o)); \
1034 (__o == _t ## e_get_intpte(_o)); })
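/*
 * Illustrative sketch (not part of this changeset): stripped of the token
 * pasting, UPDATE_ENTRY() is one compare-and-swap per entry -- the store
 * only lands if the entry still holds the value previously read, so a
 * racing update makes the caller fail and retry rather than clobber.
 */
static inline int sketch_cas_entry(intpte_t *p, intpte_t old, intpte_t new)
{
    return cmpxchg(p, old, new) == old;   /* 1 = written, 0 = lost the race */
}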
1036 /* Update the L2 entry at pl2e to new value nl2e. pl2e is within frame pfn. */
1037 static int mod_l2_entry(l2_pgentry_t *pl2e,
1038 l2_pgentry_t nl2e,
1039 unsigned long pfn,
1040 unsigned int type)
1041 {
1042 l2_pgentry_t ol2e;
1043 unsigned long vaddr;
1045 if ( unlikely(!is_guest_l2_slot(type,pgentry_ptr_to_slot(pl2e))) )
1046 {
1047 MEM_LOG("Illegal L2 update attempt in Xen-private area %p", pl2e);
1048 return 0;
1049 }
1051 if ( unlikely(__copy_from_user(&ol2e, pl2e, sizeof(ol2e)) != 0) )
1052 return 0;
1054 if ( l2e_get_flags(nl2e) & _PAGE_PRESENT )
1055 {
1056 if ( unlikely(l2e_get_flags(nl2e) & L2_DISALLOW_MASK) )
1057 {
1058 MEM_LOG("Bad L2 flags %x\n",
1059 l2e_get_flags(nl2e) & L2_DISALLOW_MASK);
1060 return 0;
1061 }
1063 /* Fast path for identical mapping and presence. */
1064 if ( !l2e_has_changed(ol2e, nl2e, _PAGE_PRESENT))
1065 return UPDATE_ENTRY(l2, pl2e, ol2e, nl2e);
1067 if ( unlikely(!l1_backptr(&vaddr, pgentry_ptr_to_slot(pl2e), type)) ||
1068 unlikely(!get_page_from_l2e(nl2e, pfn, current->domain, vaddr)) )
1069 return 0;
1071 if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e)) )
1072 {
1073 put_page_from_l2e(nl2e, pfn);
1074 return 0;
1075 }
1076 }
1077 else
1078 {
1079 if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e)) )
1080 return 0;
1081 }
1083 put_page_from_l2e(ol2e, pfn);
1084 return 1;
1085 }
1088 #if CONFIG_PAGING_LEVELS >= 3
1090 /* Update the L3 entry at pl3e to new value nl3e. pl3e is within frame pfn. */
1091 static int mod_l3_entry(l3_pgentry_t *pl3e,
1092 l3_pgentry_t nl3e,
1093 unsigned long pfn)
1095 l3_pgentry_t ol3e;
1096 unsigned long vaddr;
1098 if ( unlikely(!is_guest_l3_slot(pgentry_ptr_to_slot(pl3e))) )
1100 MEM_LOG("Illegal L3 update attempt in Xen-private area %p", pl3e);
1101 return 0;
1104 #ifdef CONFIG_X86_PAE
1105 /*
1106 * Disallow updates to final L3 slot. It contains Xen mappings, and it
1107 * would be a pain to ensure they remain continuously valid throughout.
1108 */
1109 if ( pgentry_ptr_to_slot(pl3e) >= 3 )
1110 return 0;
1111 #endif
1113 if ( unlikely(__copy_from_user(&ol3e, pl3e, sizeof(ol3e)) != 0) )
1114 return 0;
1116 if ( l3e_get_flags(nl3e) & _PAGE_PRESENT )
1118 if ( unlikely(l3e_get_flags(nl3e) & L3_DISALLOW_MASK) )
1120 MEM_LOG("Bad L3 flags %x\n",
1121 l3e_get_flags(nl3e) & L3_DISALLOW_MASK);
1122 return 0;
1125 /* Fast path for identical mapping and presence. */
1126 if (!l3e_has_changed(ol3e, nl3e, _PAGE_PRESENT))
1127 return UPDATE_ENTRY(l3, pl3e, ol3e, nl3e);
1129 vaddr = (((unsigned long)pl3e & ~PAGE_MASK) / sizeof(l3_pgentry_t))
1130 << L3_PAGETABLE_SHIFT;
1131 if ( unlikely(!get_page_from_l3e(nl3e, pfn, current->domain, vaddr)) )
1132 return 0;
1134 if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e)) )
1136 BUG_ON(!create_pae_xen_mappings(pl3e));
1137 put_page_from_l3e(nl3e, pfn);
1138 return 0;
1141 put_page_from_l3e(ol3e, pfn);
1142 return 1;
1145 if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e)) )
1147 BUG_ON(!create_pae_xen_mappings(pl3e));
1148 return 0;
1151 put_page_from_l3e(ol3e, pfn);
1152 return 1;
1155 #endif
1157 #if CONFIG_PAGING_LEVELS >= 4
1159 /* Update the L4 entry at pl4e to new value nl4e. pl4e is within frame pfn. */
1160 static int mod_l4_entry(l4_pgentry_t *pl4e,
1161 l4_pgentry_t nl4e,
1162 unsigned long pfn)
1164 l4_pgentry_t ol4e;
1166 if ( unlikely(!is_guest_l4_slot(pgentry_ptr_to_slot(pl4e))) )
1168 MEM_LOG("Illegal L4 update attempt in Xen-private area %p", pl4e);
1169 return 0;
1172 if ( unlikely(__copy_from_user(&ol4e, pl4e, sizeof(ol4e)) != 0) )
1173 return 0;
1175 if ( l4e_get_flags(nl4e) & _PAGE_PRESENT )
1177 if ( unlikely(l4e_get_flags(nl4e) & L4_DISALLOW_MASK) )
1179 MEM_LOG("Bad L4 flags %x\n",
1180 l4e_get_flags(nl4e) & L4_DISALLOW_MASK);
1181 return 0;
1184 /* Fast path for identical mapping and presence. */
1185 if (!l4e_has_changed(ol4e, nl4e, _PAGE_PRESENT))
1186 return UPDATE_ENTRY(l4, pl4e, ol4e, nl4e);
1188 if ( unlikely(!get_page_from_l4e(nl4e, pfn, current->domain)) )
1189 return 0;
1191 if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e)) )
1193 put_page_from_l4e(nl4e, pfn);
1194 return 0;
1197 put_page_from_l4e(ol4e, pfn);
1198 return 1;
1201 if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e)) )
1202 return 0;
1204 put_page_from_l4e(ol4e, pfn);
1205 return 1;
1208 #endif
1210 int alloc_page_type(struct pfn_info *page, unsigned int type)
1212 switch ( type & PGT_type_mask )
1214 case PGT_l1_page_table:
1215 return alloc_l1_table(page);
1216 case PGT_l2_page_table:
1217 return alloc_l2_table(page, type);
1218 case PGT_l3_page_table:
1219 return alloc_l3_table(page);
1220 case PGT_l4_page_table:
1221 return alloc_l4_table(page);
1222 case PGT_gdt_page:
1223 case PGT_ldt_page:
1224 return alloc_segdesc_page(page);
1225 default:
1226 printk("Bad type in alloc_page_type %x t=%x c=%x\n",
1227 type, page->u.inuse.type_info,
1228 page->count_info);
1229 BUG();
1232 return 0;
1236 void free_page_type(struct pfn_info *page, unsigned int type)
1238 struct domain *owner = page_get_owner(page);
1239 unsigned long gpfn;
1241 if ( owner != NULL )
1243 if ( unlikely(shadow_mode_refcounts(owner)) )
1244 return;
1245 if ( unlikely(shadow_mode_enabled(owner)) )
1247 gpfn = __mfn_to_gpfn(owner, page_to_pfn(page));
1248 ASSERT(VALID_M2P(gpfn));
1249 remove_shadow(owner, gpfn, type & PGT_type_mask);
1253 switch (type & PGT_type_mask)
1255 case PGT_l1_page_table:
1256 free_l1_table(page);
1257 break;
1259 case PGT_l2_page_table:
1260 free_l2_table(page);
1261 break;
1263 #if CONFIG_PAGING_LEVELS >= 3
1264 case PGT_l3_page_table:
1265 free_l3_table(page);
1266 break;
1267 #endif
1269 #if CONFIG_PAGING_LEVELS >= 4
1270 case PGT_l4_page_table:
1271 free_l4_table(page);
1272 break;
1273 #endif
1275 default:
1276 printk("%s: type %x pfn %lx\n",__FUNCTION__,
1277 type, page_to_pfn(page));
1278 BUG();
1283 void put_page_type(struct pfn_info *page)
1285 u32 nx, x, y = page->u.inuse.type_info;
1287 again:
1288 do {
1289 x = y;
1290 nx = x - 1;
1292 ASSERT((x & PGT_count_mask) != 0);
1294 /*
1295 * The page should always be validated while a reference is held. The
1296 * exception is during domain destruction, when we forcibly invalidate
1297 * page-table pages if we detect a referential loop.
1298 * See domain.c:relinquish_list().
1299 */
1300 ASSERT((x & PGT_validated) ||
1301 test_bit(_DOMF_dying, &page_get_owner(page)->domain_flags));
1303 if ( unlikely((nx & PGT_count_mask) == 0) )
1305 /* Record TLB information for flush later. Races are harmless. */
1306 page->tlbflush_timestamp = tlbflush_current_time();
1308 if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
1309 likely(nx & PGT_validated) )
1311 /*
1312 * Page-table pages must be unvalidated when count is zero. The
1313 * 'free' is safe because the refcnt is non-zero and validated
1314 * bit is clear => other ops will spin or fail.
1315 */
1316 if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x,
1317 x & ~PGT_validated)) != x) )
1318 goto again;
1319 /* We cleared the 'valid bit' so we do the clean up. */
1320 free_page_type(page, x);
1321 /* Carry on, but with the 'valid bit' now clear. */
1322 x &= ~PGT_validated;
1323 nx &= ~PGT_validated;
1326 else if ( unlikely(((nx & (PGT_pinned | PGT_count_mask)) ==
1327 (PGT_pinned | 1)) &&
1328 ((nx & PGT_type_mask) != PGT_writable_page)) )
1330 /* Page is now only pinned. Make the back pointer mutable again. */
1331 nx |= PGT_va_mutable;
1334 while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
1338 int get_page_type(struct pfn_info *page, u32 type)
1340 u32 nx, x, y = page->u.inuse.type_info;
1342 again:
1343 do {
1344 x = y;
1345 nx = x + 1;
1346 if ( unlikely((nx & PGT_count_mask) == 0) )
1348 MEM_LOG("Type count overflow on pfn %lx", page_to_pfn(page));
1349 return 0;
1351 else if ( unlikely((x & PGT_count_mask) == 0) )
1353 if ( (x & (PGT_type_mask|PGT_va_mask)) != type )
1355 if ( (x & PGT_type_mask) != (type & PGT_type_mask) )
1357 /*
1358 * On type change we check to flush stale TLB
1359 * entries. This may be unnecessary (e.g., page
1360 * was GDT/LDT) but those circumstances should be
1361 * very rare.
1362 */
1363 cpumask_t mask = page_get_owner(page)->cpumask;
1364 tlbflush_filter(mask, page->tlbflush_timestamp);
1366 if ( unlikely(!cpus_empty(mask)) )
1368 perfc_incrc(need_flush_tlb_flush);
1369 flush_tlb_mask(mask);
1373 /* We lose existing type, back pointer, and validity. */
1374 nx &= ~(PGT_type_mask | PGT_va_mask | PGT_validated);
1375 nx |= type;
1377 /* No special validation needed for writable pages. */
1378 /* Page tables and GDT/LDT need to be scanned for validity. */
1379 if ( type == PGT_writable_page )
1380 nx |= PGT_validated;
1383 else
1385 if ( unlikely((x & (PGT_type_mask|PGT_va_mask)) != type) )
1387 if ( unlikely((x & PGT_type_mask) != (type & PGT_type_mask) ) )
1389 if ( ((x & PGT_type_mask) != PGT_l2_page_table) ||
1390 ((type & PGT_type_mask) != PGT_l1_page_table) )
1391 MEM_LOG("Bad type (saw %08x != exp %08x) for pfn %lx",
1392 x, type, page_to_pfn(page));
1393 return 0;
1395 else if ( (x & PGT_va_mask) == PGT_va_mutable )
1397 /* The va backpointer is mutable, hence we update it. */
1398 nx &= ~PGT_va_mask;
1399 nx |= type; /* we know the actual type is correct */
1401 else if ( ((type & PGT_va_mask) != PGT_va_mutable) &&
1402 ((type & PGT_va_mask) != (x & PGT_va_mask)) )
1404 #ifdef CONFIG_X86_PAE
1405 /* We use backptr as extra typing. Cannot be unknown. */
1406 if ( (type & PGT_type_mask) == PGT_l2_page_table )
1407 return 0;
1408 #endif
1409 /* This table is possibly mapped at multiple locations. */
1410 nx &= ~PGT_va_mask;
1411 nx |= PGT_va_unknown;
1414 if ( unlikely(!(x & PGT_validated)) )
1416 /* Someone else is updating validation of this page. Wait... */
1417 while ( (y = page->u.inuse.type_info) == x )
1418 cpu_relax();
1419 goto again;
1423 while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
1425 if ( unlikely(!(nx & PGT_validated)) )
1427 /* Try to validate page type; drop the new reference on failure. */
1428 if ( unlikely(!alloc_page_type(page, type)) )
1430 MEM_LOG("Error while validating pfn %lx for type %08x."
1431 " caf=%08x taf=%08x",
1432 page_to_pfn(page), type,
1433 page->count_info,
1434 page->u.inuse.type_info);
1435 /* No one else can get a reference. We hold the only ref. */
1436 page->u.inuse.type_info = 0;
1437 return 0;
1440 /* No one else is updating simultaneously. */
1441 __set_bit(_PGT_validated, &page->u.inuse.type_info);
1444 return 1;
1448 int new_guest_cr3(unsigned long mfn)
1450 struct vcpu *v = current;
1451 struct domain *d = v->domain;
1452 int okay;
1453 unsigned long old_base_mfn;
1455 if ( shadow_mode_refcounts(d) )
1456 okay = get_page_from_pagenr(mfn, d);
1457 else
1458 okay = get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d);
1460 if ( likely(okay) )
1462 invalidate_shadow_ldt(v);
1464 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
1465 v->arch.guest_table = mk_pagetable(mfn << PAGE_SHIFT);
1466 update_pagetables(v); /* update shadow_table and monitor_table */
1468 write_ptbase(v);
1470 if ( shadow_mode_refcounts(d) )
1471 put_page(&frame_table[old_base_mfn]);
1472 else
1473 put_page_and_type(&frame_table[old_base_mfn]);
1475 /* CR3 also holds a ref to its shadow... */
1476 if ( shadow_mode_enabled(d) )
1478 if ( v->arch.monitor_shadow_ref )
1479 put_shadow_ref(v->arch.monitor_shadow_ref);
1480 v->arch.monitor_shadow_ref =
1481 pagetable_get_pfn(v->arch.monitor_table);
1482 ASSERT(!page_get_owner(&frame_table[v->arch.monitor_shadow_ref]));
1483 get_shadow_ref(v->arch.monitor_shadow_ref);
1486 else
1488 MEM_LOG("Error while installing new baseptr %lx", mfn);
1491 return okay;
1494 static void process_deferred_ops(unsigned int cpu)
1496 unsigned int deferred_ops;
1497 struct domain *d = current->domain;
1499 deferred_ops = percpu_info[cpu].deferred_ops;
1500 percpu_info[cpu].deferred_ops = 0;
1502 if ( deferred_ops & DOP_FLUSH_TLB )
1504 if ( shadow_mode_enabled(d) )
1505 shadow_sync_all(d);
1506 local_flush_tlb();
1509 if ( deferred_ops & DOP_RELOAD_LDT )
1510 (void)map_ldt_shadow_page(0);
1512 if ( unlikely(percpu_info[cpu].foreign != NULL) )
1514 put_domain(percpu_info[cpu].foreign);
1515 percpu_info[cpu].foreign = NULL;
1519 static int set_foreigndom(unsigned int cpu, domid_t domid)
1521 struct domain *e, *d = current->domain;
1522 int okay = 1;
1524 if ( (e = percpu_info[cpu].foreign) != NULL )
1525 put_domain(e);
1526 percpu_info[cpu].foreign = NULL;
1528 if ( domid == DOMID_SELF )
1529 goto out;
1531 if ( !IS_PRIV(d) )
1533 switch ( domid )
1535 case DOMID_IO:
1536 get_knownalive_domain(dom_io);
1537 percpu_info[cpu].foreign = dom_io;
1538 break;
1539 default:
1540 MEM_LOG("Dom %u cannot set foreign dom\n", d->domain_id);
1541 okay = 0;
1542 break;
1545 else
1547 percpu_info[cpu].foreign = e = find_domain_by_id(domid);
1548 if ( e == NULL )
1550 switch ( domid )
1552 case DOMID_XEN:
1553 get_knownalive_domain(dom_xen);
1554 percpu_info[cpu].foreign = dom_xen;
1555 break;
1556 case DOMID_IO:
1557 get_knownalive_domain(dom_io);
1558 percpu_info[cpu].foreign = dom_io;
1559 break;
1560 default:
1561 MEM_LOG("Unknown domain '%u'", domid);
1562 okay = 0;
1563 break;
1568 out:
1569 return okay;
1572 static inline cpumask_t vcpumask_to_pcpumask(
1573 struct domain *d, unsigned long vmask)
1575 unsigned int vcpu_id;
1576 cpumask_t pmask;
1577 struct vcpu *v;
1579 while ( vmask != 0 )
1581 vcpu_id = find_first_set_bit(vmask);
1582 vmask &= ~(1UL << vcpu_id);
1583 if ( (vcpu_id < MAX_VIRT_CPUS) &&
1584 ((v = d->vcpu[vcpu_id]) != NULL) )
1585 cpu_set(v->processor, pmask);
1588 return pmask;
1591 int do_mmuext_op(
1592 struct mmuext_op *uops,
1593 unsigned int count,
1594 unsigned int *pdone,
1595 unsigned int foreigndom)
1597 struct mmuext_op op;
1598 int rc = 0, i = 0, okay, cpu = smp_processor_id();
1599 unsigned int type, done = 0;
1600 struct pfn_info *page;
1601 struct vcpu *v = current;
1602 struct domain *d = v->domain, *e;
1603 u32 x, y, _d, _nd;
1605 LOCK_BIGLOCK(d);
1607 cleanup_writable_pagetable(d);
1609 if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
1611 count &= ~MMU_UPDATE_PREEMPTED;
1612 if ( unlikely(pdone != NULL) )
1613 (void)get_user(done, pdone);
1616 if ( !set_foreigndom(cpu, foreigndom) )
1618 rc = -EINVAL;
1619 goto out;
1622 if ( unlikely(!array_access_ok(uops, count, sizeof(op))) )
1624 rc = -EFAULT;
1625 goto out;
1628 for ( i = 0; i < count; i++ )
1630 if ( hypercall_preempt_check() )
1632 rc = hypercall4_create_continuation(
1633 __HYPERVISOR_mmuext_op, uops,
1634 (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
1635 break;
1638 if ( unlikely(__copy_from_user(&op, uops, sizeof(op)) != 0) )
1640 MEM_LOG("Bad __copy_from_user");
1641 rc = -EFAULT;
1642 break;
1645 okay = 1;
1646 page = &frame_table[op.mfn];
1648 switch ( op.cmd )
1650 case MMUEXT_PIN_L1_TABLE:
1651 type = PGT_l1_page_table | PGT_va_mutable;
1653 pin_page:
1654 if ( shadow_mode_refcounts(FOREIGNDOM) )
1655 type = PGT_writable_page;
1657 okay = get_page_and_type_from_pagenr(op.mfn, type, FOREIGNDOM);
1658 if ( unlikely(!okay) )
1660 MEM_LOG("Error while pinning mfn %lx", op.mfn);
1661 break;
1664 if ( unlikely(test_and_set_bit(_PGT_pinned,
1665 &page->u.inuse.type_info)) )
1667 MEM_LOG("Mfn %lx already pinned", op.mfn);
1668 put_page_and_type(page);
1669 okay = 0;
1670 break;
1673 break;
1675 #ifndef CONFIG_X86_PAE /* Unsafe on PAE because of Xen-private mappings. */
1676 case MMUEXT_PIN_L2_TABLE:
1677 type = PGT_l2_page_table;
1678 goto pin_page;
1679 #endif
1681 case MMUEXT_PIN_L3_TABLE:
1682 type = PGT_l3_page_table;
1683 goto pin_page;
1685 case MMUEXT_PIN_L4_TABLE:
1686 type = PGT_l4_page_table;
1687 goto pin_page;
1689 case MMUEXT_UNPIN_TABLE:
1690 if ( unlikely(!(okay = get_page_from_pagenr(op.mfn, FOREIGNDOM))) )
1692 MEM_LOG("Mfn %lx bad domain (dom=%p)",
1693 op.mfn, page_get_owner(page));
1695 else if ( likely(test_and_clear_bit(_PGT_pinned,
1696 &page->u.inuse.type_info)) )
1698 put_page_and_type(page);
1699 put_page(page);
1701 else
1703 okay = 0;
1704 put_page(page);
1705 MEM_LOG("Mfn %lx not pinned", op.mfn);
1707 break;
1709 case MMUEXT_NEW_BASEPTR:
1710 okay = new_guest_cr3(op.mfn);
1711 percpu_info[cpu].deferred_ops &= ~DOP_FLUSH_TLB;
1712 break;
1714 #ifdef __x86_64__
1715 case MMUEXT_NEW_USER_BASEPTR:
1716 okay = get_page_and_type_from_pagenr(
1717 op.mfn, PGT_root_page_table, d);
1718 if ( unlikely(!okay) )
1720 MEM_LOG("Error while installing new mfn %lx", op.mfn);
1722 else
1724 unsigned long old_mfn =
1725 pagetable_get_pfn(v->arch.guest_table_user);
1726 v->arch.guest_table_user = mk_pagetable(op.mfn << PAGE_SHIFT);
1727 if ( old_mfn != 0 )
1728 put_page_and_type(&frame_table[old_mfn]);
1730 break;
1731 #endif
1733 case MMUEXT_TLB_FLUSH_LOCAL:
1734 percpu_info[cpu].deferred_ops |= DOP_FLUSH_TLB;
1735 break;
1737 case MMUEXT_INVLPG_LOCAL:
1738 if ( shadow_mode_enabled(d) )
1739 shadow_invlpg(v, op.linear_addr);
1740 local_flush_tlb_one(op.linear_addr);
1741 break;
1743 case MMUEXT_TLB_FLUSH_MULTI:
1744 case MMUEXT_INVLPG_MULTI:
1746 unsigned long vmask;
1747 cpumask_t pmask;
1748 if ( unlikely(get_user(vmask, (unsigned long *)op.vcpumask)) )
1750 okay = 0;
1751 break;
1753 pmask = vcpumask_to_pcpumask(d, vmask);
1754 cpus_and(pmask, pmask, d->cpumask);
1755 if ( op.cmd == MMUEXT_TLB_FLUSH_MULTI )
1756 flush_tlb_mask(pmask);
1757 else
1758 flush_tlb_one_mask(pmask, op.linear_addr);
1759 break;
1762 case MMUEXT_TLB_FLUSH_ALL:
1763 flush_tlb_mask(d->cpumask);
1764 break;
1766 case MMUEXT_INVLPG_ALL:
1767 flush_tlb_one_mask(d->cpumask, op.linear_addr);
1768 break;
1770 case MMUEXT_FLUSH_CACHE:
1771 if ( unlikely(!IS_CAPABLE_PHYSDEV(d)) )
1773 MEM_LOG("Non-physdev domain tried to FLUSH_CACHE.\n");
1774 okay = 0;
1776 else
1778 wbinvd();
1780 break;
1782 case MMUEXT_SET_LDT:
1784 if ( shadow_mode_external(d) )
1786 MEM_LOG("ignoring SET_LDT hypercall from external "
1787 "domain %u\n", d->domain_id);
1788 okay = 0;
1789 break;
1792 unsigned long ptr = op.linear_addr;
1793 unsigned long ents = op.nr_ents;
1794 if ( ((ptr & (PAGE_SIZE-1)) != 0) ||
1795 (ents > 8192) ||
1796 !array_access_ok(ptr, ents, LDT_ENTRY_SIZE) )
1798 okay = 0;
1799 MEM_LOG("Bad args to SET_LDT: ptr=%lx, ents=%lx", ptr, ents);
1801 else if ( (v->arch.guest_context.ldt_ents != ents) ||
1802 (v->arch.guest_context.ldt_base != ptr) )
1804 invalidate_shadow_ldt(v);
1805 v->arch.guest_context.ldt_base = ptr;
1806 v->arch.guest_context.ldt_ents = ents;
1807 load_LDT(v);
1808 percpu_info[cpu].deferred_ops &= ~DOP_RELOAD_LDT;
1809 if ( ents != 0 )
1810 percpu_info[cpu].deferred_ops |= DOP_RELOAD_LDT;
1812 break;
1815 case MMUEXT_REASSIGN_PAGE:
1816 if ( unlikely(!IS_PRIV(d)) )
1818 MEM_LOG("Dom %u has no reassignment priv", d->domain_id);
1819 okay = 0;
1820 break;
1823 e = percpu_info[cpu].foreign;
1824 if ( unlikely(e == NULL) )
1826 MEM_LOG("No FOREIGNDOM to reassign mfn %lx to", op.mfn);
1827 okay = 0;
1828 break;
1831 /*
1832 * Grab both page_list locks, in order. This prevents the page from
1833 * disappearing elsewhere while we modify the owner, and we'll need
1834 * both locks if we're successful so that we can change lists.
1835 */
1836 if ( d < e )
1838 spin_lock(&d->page_alloc_lock);
1839 spin_lock(&e->page_alloc_lock);
1841 else
1843 spin_lock(&e->page_alloc_lock);
1844 spin_lock(&d->page_alloc_lock);
1847 /*
1848 * Check that 'e' will accept the page and has reservation
1849 * headroom. Also, a domain mustn't have PGC_allocated pages when
1850 * it is dying.
1851 */
1852 ASSERT(e->tot_pages <= e->max_pages);
1853 if ( unlikely(test_bit(_DOMF_dying, &e->domain_flags)) ||
1854 unlikely(e->tot_pages == e->max_pages) ||
1855 unlikely(IS_XEN_HEAP_FRAME(page)) )
1857 MEM_LOG("Transferee has no reservation headroom (%d,%d), or "
1858 "page is in Xen heap (%lx), or dom is dying (%ld).\n",
1859 e->tot_pages, e->max_pages, op.mfn, e->domain_flags);
1860 okay = 0;
1861 goto reassign_fail;
1864 /*
1865 * The tricky bit: atomically change owner while there is just one
1866 * benign reference to the page (PGC_allocated). If that reference
1867 * disappears then the deallocation routine will safely spin.
1868 */
1869 _d = pickle_domptr(d);
1870 _nd = page->u.inuse._domain;
1871 y = page->count_info;
1872 do {
1873 x = y;
1874 if ( unlikely((x & (PGC_count_mask|PGC_allocated)) !=
1875 (1|PGC_allocated)) ||
1876 unlikely(_nd != _d) )
1878 MEM_LOG("Bad page values %lx: ed=%p(%u), sd=%p,"
1879 " caf=%08x, taf=%08x\n", page_to_pfn(page),
1880 d, d->domain_id, unpickle_domptr(_nd), x,
1881 page->u.inuse.type_info);
1882 okay = 0;
1883 goto reassign_fail;
1885 __asm__ __volatile__(
1886 LOCK_PREFIX "cmpxchg8b %3"
1887 : "=d" (_nd), "=a" (y), "=c" (e),
1888 "=m" (*(volatile u64 *)(&page->count_info))
1889 : "0" (_d), "1" (x), "c" (e), "b" (x) );
1891 while ( unlikely(_nd != _d) || unlikely(y != x) );
1893 /*
1894 * Unlink from 'd'. We transferred at least one reference to 'e',
1895 * so no one else is spinning to try to delete this page from 'd'.
1896 */
1897 d->tot_pages--;
1898 list_del(&page->list);
1900 /*
1901 * Add the page to 'e'. Someone may already have removed the last
1902 * reference and want to remove the page from 'e'. However, we have
1903 * the lock so they'll spin waiting for us.
1904 */
1905 if ( unlikely(e->tot_pages++ == 0) )
1906 get_knownalive_domain(e);
1907 list_add_tail(&page->list, &e->page_list);
1909 reassign_fail:
1910 spin_unlock(&d->page_alloc_lock);
1911 spin_unlock(&e->page_alloc_lock);
1912 break;
1914 default:
1915 MEM_LOG("Invalid extended pt command 0x%x", op.cmd);
1916 okay = 0;
1917 break;
1920 if ( unlikely(!okay) )
1922 rc = -EINVAL;
1923 break;
1926 uops++;
1929 out:
1930 process_deferred_ops(cpu);
1932 /* Add incremental work we have done to the @done output parameter. */
1933 if ( unlikely(pdone != NULL) )
1934 __put_user(done + i, pdone);
1936 UNLOCK_BIGLOCK(d);
1937 return rc;
1938 }
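/*
 * Illustrative guest-side sketch (HYPERVISOR_mmuext_op is the usual guest
 * wrapper, assumed here; it is not defined in this file): pinning a newly
 * built L1 table so its type count cannot fall to zero while the kernel
 * keeps the page cached for later reuse.
 */
static int sketch_pin_l1_table(unsigned long l1_mfn)
{
    struct mmuext_op op;

    op.cmd = MMUEXT_PIN_L1_TABLE;
    op.mfn = l1_mfn;

    /* One op, no progress counter, operate on our own pages. */
    return HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF);
}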
1940 int do_mmu_update(
1941 mmu_update_t *ureqs,
1942 unsigned int count,
1943 unsigned int *pdone,
1944 unsigned int foreigndom)
1946 mmu_update_t req;
1947 void *va;
1948 unsigned long gpfn, mfn;
1949 struct pfn_info *page;
1950 int rc = 0, okay = 1, i = 0, cpu = smp_processor_id();
1951 unsigned int cmd, done = 0;
1952 struct vcpu *v = current;
1953 struct domain *d = v->domain;
1954 u32 type_info;
1955 struct domain_mmap_cache mapcache, sh_mapcache;
1957 LOCK_BIGLOCK(d);
1959 cleanup_writable_pagetable(d);
1961 if ( unlikely(shadow_mode_enabled(d)) )
1962 check_pagetable(v, "pre-mmu"); /* debug */
1964 if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
1966 count &= ~MMU_UPDATE_PREEMPTED;
1967 if ( unlikely(pdone != NULL) )
1968 (void)get_user(done, pdone);
1971 domain_mmap_cache_init(&mapcache);
1972 domain_mmap_cache_init(&sh_mapcache);
1974 if ( !set_foreigndom(cpu, foreigndom) )
1976 rc = -EINVAL;
1977 goto out;
1980 perfc_incrc(calls_to_mmu_update);
1981 perfc_addc(num_page_updates, count);
1982 perfc_incr_histo(bpt_updates, count, PT_UPDATES);
1984 if ( unlikely(!array_access_ok(ureqs, count, sizeof(req))) )
1986 rc = -EFAULT;
1987 goto out;
1990 for ( i = 0; i < count; i++ )
1992 if ( hypercall_preempt_check() )
1994 rc = hypercall4_create_continuation(
1995 __HYPERVISOR_mmu_update, ureqs,
1996 (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
1997 break;
2000 if ( unlikely(__copy_from_user(&req, ureqs, sizeof(req)) != 0) )
2002 MEM_LOG("Bad __copy_from_user");
2003 rc = -EFAULT;
2004 break;
2007 cmd = req.ptr & (sizeof(l1_pgentry_t)-1);
2008 okay = 0;
2010 switch ( cmd )
2012 /*
2013 * MMU_NORMAL_PT_UPDATE: Normal update to any level of page table.
2014 */
2015 case MMU_NORMAL_PT_UPDATE:
2017 gpfn = req.ptr >> PAGE_SHIFT;
2018 mfn = __gpfn_to_mfn(d, gpfn);
2020 if ( unlikely(!get_page_from_pagenr(mfn, current->domain)) )
2022 MEM_LOG("Could not get page for normal update");
2023 break;
2026 va = map_domain_page_with_cache(mfn, &mapcache);
2027 va = (void *)((unsigned long)va +
2028 (unsigned long)(req.ptr & ~PAGE_MASK));
2029 page = &frame_table[mfn];
2031 switch ( (type_info = page->u.inuse.type_info) & PGT_type_mask )
2033 case PGT_l1_page_table:
2034 ASSERT( !shadow_mode_refcounts(d) );
2035 if ( likely(get_page_type(
2036 page, type_info & (PGT_type_mask|PGT_va_mask))) )
2038 l1_pgentry_t l1e;
2040 /* FIXME: doesn't work with PAE */
2041 l1e = l1e_from_intpte(req.val);
2042 okay = mod_l1_entry(va, l1e);
2043 if ( okay && unlikely(shadow_mode_enabled(d)) )
2044 shadow_l1_normal_pt_update(d, req.ptr, l1e, &sh_mapcache);
2045 put_page_type(page);
2047 break;
2048 case PGT_l2_page_table:
2049 ASSERT( !shadow_mode_refcounts(d) );
2050 if ( likely(get_page_type(
2051 page, type_info & (PGT_type_mask|PGT_va_mask))) )
2053 l2_pgentry_t l2e;
2055 /* FIXME: doesn't work with PAE */
2056 l2e = l2e_from_intpte(req.val);
2057 okay = mod_l2_entry((l2_pgentry_t *)va, l2e, mfn, type_info);
2058 if ( okay && unlikely(shadow_mode_enabled(d)) )
2059 shadow_l2_normal_pt_update(d, req.ptr, l2e, &sh_mapcache);
2060 put_page_type(page);
2062 break;
2063 #if CONFIG_PAGING_LEVELS >= 3
2064 case PGT_l3_page_table:
2065 ASSERT( !shadow_mode_refcounts(d) );
2066 if ( likely(get_page_type(page, PGT_l3_page_table)) )
2068 l3_pgentry_t l3e;
2070 /* FIXME: doesn't work with PAE */
2071 l3e = l3e_from_intpte(req.val);
2072 okay = mod_l3_entry(va, l3e, mfn);
2073 if ( okay && unlikely(shadow_mode_enabled(d)) )
2074 shadow_l3_normal_pt_update(d, req.ptr, l3e, &sh_mapcache);
2075 put_page_type(page);
2077 break;
2078 #endif
2079 #if CONFIG_PAGING_LEVELS >= 4
2080 case PGT_l4_page_table:
2081 ASSERT( !shadow_mode_refcounts(d) );
2082 if ( likely(get_page_type(page, PGT_l4_page_table)) )
2084 l4_pgentry_t l4e;
2086 l4e = l4e_from_intpte(req.val);
2087 okay = mod_l4_entry(va, l4e, mfn);
2088 if ( okay && unlikely(shadow_mode_enabled(d)) )
2089 shadow_l4_normal_pt_update(d, req.ptr, l4e, &sh_mapcache);
2090 put_page_type(page);
2092 break;
2093 #endif
2094 default:
2095 if ( likely(get_page_type(page, PGT_writable_page)) )
2097 if ( shadow_mode_enabled(d) )
2099 shadow_lock(d);
2101 if ( shadow_mode_log_dirty(d) )
2102 __mark_dirty(d, mfn);
2104 if ( page_is_page_table(page) &&
2105 !page_out_of_sync(page) )
2107 shadow_mark_mfn_out_of_sync(v, gpfn, mfn);
2111 *(unsigned long *)va = req.val;
2112 okay = 1;
2114 if ( shadow_mode_enabled(d) )
2115 shadow_unlock(d);
2117 put_page_type(page);
2119 break;
2122 unmap_domain_page_with_cache(va, &mapcache);
2124 put_page(page);
2125 break;
2127 case MMU_MACHPHYS_UPDATE:
2129 mfn = req.ptr >> PAGE_SHIFT;
2130 gpfn = req.val;
2132 /* HACK ALERT... Need to think about this some more... */
2133 if ( unlikely(shadow_mode_translate(FOREIGNDOM) && IS_PRIV(d)) )
2135 shadow_lock(FOREIGNDOM);
2136 printk("privileged guest dom%d requests pfn=%lx to map mfn=%lx for dom%d\n",
2137 d->domain_id, gpfn, mfn, FOREIGNDOM->domain_id);
2138 set_machinetophys(mfn, gpfn);
2139 set_p2m_entry(FOREIGNDOM, gpfn, mfn, &sh_mapcache, &mapcache);
2140 okay = 1;
2141 shadow_unlock(FOREIGNDOM);
2142 break;
2145 if ( unlikely(!get_page_from_pagenr(mfn, FOREIGNDOM)) )
2147 MEM_LOG("Could not get page for mach->phys update");
2148 break;
2151 if ( unlikely(shadow_mode_translate(FOREIGNDOM) && !IS_PRIV(d)) )
2153 MEM_LOG("can't mutate the m2p of translated guests");
2154 break;
2157 set_machinetophys(mfn, gpfn);
2158 okay = 1;
2160 /*
2161 * If in log-dirty mode, mark the corresponding
2162 * page as dirty.
2163 */
2164 if ( unlikely(shadow_mode_log_dirty(FOREIGNDOM)) &&
2165 mark_dirty(FOREIGNDOM, mfn) )
2166 FOREIGNDOM->arch.shadow_dirty_block_count++;
2168 put_page(&frame_table[mfn]);
2169 break;
2171 default:
2172 MEM_LOG("Invalid page update command %x", cmd);
2173 break;
2176 if ( unlikely(!okay) )
2178 rc = -EINVAL;
2179 break;
2182 ureqs++;
2185 out:
2186 domain_mmap_cache_destroy(&mapcache);
2187 domain_mmap_cache_destroy(&sh_mapcache);
2189 process_deferred_ops(cpu);
2191 /* Add incremental work we have done to the @done output parameter. */
2192 if ( unlikely(pdone != NULL) )
2193 __put_user(done + i, pdone);
2195 if ( unlikely(shadow_mode_enabled(d)) )
2196 check_pagetable(v, "post-mmu"); /* debug */
2198 UNLOCK_BIGLOCK(d);
2199 return rc;
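/*
 * Illustrative sketch, not part of this file: how a paravirtualised
 * guest might drive do_mmu_update() above.  Each request is a
 * (ptr, val) pair, with the command carried in the low bits of ptr.
 * mmu_update_t, MMU_NORMAL_PT_UPDATE and DOMID_SELF come from the
 * public interface; the wrapper hypercall_mmu_update() and both helper
 * names are hypothetical stand-ins for a guest's own hypercall glue.
 */
#if 0 /* example only -- not built as part of mm.c */
extern int hypercall_mmu_update(mmu_update_t *req, unsigned int count,
                                unsigned int *done, domid_t domid);

/* Queue one PTE write: the update command rides in the low bits of ptr. */
static void guest_queue_pte_write(mmu_update_t *req,
                                  u64 pte_maddr, u64 new_pte)
{
    req->ptr = pte_maddr | MMU_NORMAL_PT_UPDATE;
    req->val = new_pte;
}

/* Submit a whole batch with a single trap into do_mmu_update(). */
static int guest_flush_pte_writes(mmu_update_t *reqs, unsigned int nr)
{
    unsigned int done = 0;
    return hypercall_mmu_update(reqs, nr, &done, DOMID_SELF);
}
#endif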
2202 /* This function assumes the caller is holding the domain's BIGLOCK
2203 * and is running in a shadow mode
2204 */
2205 int update_grant_va_mapping(unsigned long va,
2206 l1_pgentry_t _nl1e,
2207 struct domain *d,
2208 struct vcpu *v)
2210 /* Caller must:
2211 * . own d's BIGLOCK
2212 * . already have 'get_page' correctly on the to-be-installed nl1e
2213 * . be responsible for flushing the TLB
2214 * . check PTE being installed isn't DISALLOWED
2215 */
2217 int rc = 0;
2218 l1_pgentry_t *pl1e;
2219 l1_pgentry_t ol1e;
2221 cleanup_writable_pagetable(d);
2223 // This is actually overkill - we don't need to sync the L1 itself,
2224 // just everything involved in getting to this L1 (i.e. we need
2225 // linear_pg_table[l1_linear_offset(va)] to be in sync)...
2226 //
2227 __shadow_sync_va(v, va);
2229 pl1e = &linear_pg_table[l1_linear_offset(va)];
2231 if ( unlikely(__copy_from_user(&ol1e, pl1e, sizeof(ol1e)) != 0) )
2232 rc = -EINVAL;
2233 else if ( !shadow_mode_refcounts(d) )
2235 if ( update_l1e(pl1e, ol1e, _nl1e) )
2237 put_page_from_l1e(ol1e, d);
2238 if ( l1e_get_flags(ol1e) & _PAGE_PRESENT )
2239 rc = 0; /* Caller needs to invalidate TLB entry */
2240 else
2241 rc = 1; /* Caller need not invalidate TLB entry */
2243 else
2244 rc = -EINVAL;
2246 else
2248 printk("grant tables and shadow mode currently don't work together\n");
2249 BUG();
2252 if ( unlikely(shadow_mode_enabled(d)) )
2253 shadow_do_update_va_mapping(va, _nl1e, v);
2255 return rc;
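/*
 * Illustrative sketch, not part of this file: the return-value
 * convention of update_grant_va_mapping() as seen from a caller.
 * gnttab_map_example() is a hypothetical name; the real grant-table
 * code decides when and how to flush.
 */
#if 0 /* example only */
static int gnttab_map_example(struct domain *d, struct vcpu *v,
                              unsigned long va, l1_pgentry_t nl1e)
{
    int rc = update_grant_va_mapping(va, nl1e, d, v);
    if ( rc < 0 )
        return rc;                           /* copy fault or bad PTE */
    if ( rc == 0 )
        flush_tlb_one_mask(d->cpumask, va);  /* old PTE was present */
    return 0;                                /* rc == 1: nothing stale */
}
#endif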
2259 int do_update_va_mapping(unsigned long va, u64 val64,
2260 unsigned long flags)
2262 l1_pgentry_t val = l1e_from_intpte(val64);
2263 struct vcpu *v = current;
2264 struct domain *d = v->domain;
2265 unsigned int cpu = v->processor;
2266 unsigned long vmask, bmap_ptr;
2267 cpumask_t pmask;
2268 int rc = 0;
2270 perfc_incrc(calls_to_update_va);
2272 if ( unlikely(!__addr_ok(va) && !shadow_mode_external(d)) )
2273 return -EINVAL;
2275 LOCK_BIGLOCK(d);
2277 cleanup_writable_pagetable(d);
2279 if ( unlikely(shadow_mode_enabled(d)) )
2280 check_pagetable(v, "pre-va"); /* debug */
2282 if ( unlikely(!mod_l1_entry(&linear_pg_table[l1_linear_offset(va)],
2283 val)) )
2284 rc = -EINVAL;
2286 if ( likely(rc == 0) && unlikely(shadow_mode_enabled(d)) )
2288 if ( unlikely(percpu_info[cpu].foreign &&
2289 (shadow_mode_translate(d) ||
2290 shadow_mode_translate(percpu_info[cpu].foreign))) )
2292 // The foreign domain's pfns are in a different namespace.
2293 // There's not enough information in just a gpte to figure out
2294 // how to (re-)shadow this entry.
2295 //
2296 domain_crash();
2299 rc = shadow_do_update_va_mapping(va, val, v);
2301 check_pagetable(v, "post-va"); /* debug */
2304 switch ( flags & UVMF_FLUSHTYPE_MASK )
2306 case UVMF_TLB_FLUSH:
2307 switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
2309 case UVMF_LOCAL:
2310 if ( unlikely(shadow_mode_enabled(d)) )
2311 shadow_sync_all(d);
2312 local_flush_tlb();
2313 break;
2314 case UVMF_ALL:
2315 flush_tlb_mask(d->cpumask);
2316 break;
2317 default:
2318 if ( unlikely(get_user(vmask, (unsigned long *)bmap_ptr)) )
2319 rc = -EFAULT;
2320 pmask = vcpumask_to_pcpumask(d, vmask);
2321 cpus_and(pmask, pmask, d->cpumask);
2322 flush_tlb_mask(pmask);
2323 break;
2325 break;
2327 case UVMF_INVLPG:
2328 switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
2330 case UVMF_LOCAL:
2331 if ( unlikely(shadow_mode_enabled(d)) )
2332 shadow_invlpg(current, va);
2333 local_flush_tlb_one(va);
2334 break;
2335 case UVMF_ALL:
2336 flush_tlb_one_mask(d->cpumask, va);
2337 break;
2338 default:
2339 if ( unlikely(get_user(vmask, (unsigned long *)bmap_ptr)) )
2340 rc = -EFAULT;
2341 pmask = vcpumask_to_pcpumask(d, vmask);
2342 cpus_and(pmask, pmask, d->cpumask);
2343 flush_tlb_one_mask(pmask, va);
2344 break;
2346 break;
2349 process_deferred_ops(cpu);
2351 UNLOCK_BIGLOCK(d);
2353 return rc;
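/*
 * Illustrative sketch, not part of this file: composing the 'flags'
 * argument that do_update_va_mapping() decodes above.  A flush type
 * (UVMF_TLB_FLUSH or UVMF_INVLPG) is OR'd with a target (UVMF_LOCAL,
 * UVMF_ALL, or a guest pointer to a vcpu bitmap).  The wrapper name
 * hypercall_update_va_mapping() is hypothetical.
 */
#if 0 /* example only */
extern int hypercall_update_va_mapping(unsigned long va, u64 val,
                                       unsigned long flags);

static int guest_remap_one_page(unsigned long va, u64 new_pte)
{
    /* Rewrite the PTE for va and INVLPG it on the local CPU only. */
    return hypercall_update_va_mapping(va, new_pte,
                                       UVMF_INVLPG | UVMF_LOCAL);
}
#endif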
2356 int do_update_va_mapping_otherdomain(unsigned long va, u64 val64,
2357 unsigned long flags,
2358 domid_t domid)
2360 unsigned int cpu = smp_processor_id();
2361 struct domain *d;
2362 int rc;
2364 if ( unlikely(!IS_PRIV(current->domain)) )
2365 return -EPERM;
2367 percpu_info[cpu].foreign = d = find_domain_by_id(domid);
2368 if ( unlikely(d == NULL) )
2370 MEM_LOG("Unknown domain '%u'", domid);
2371 return -ESRCH;
2374 rc = do_update_va_mapping(va, val64, flags);
2376 return rc;
2381 /*************************
2382 * Descriptor Tables
2383 */
2385 void destroy_gdt(struct vcpu *v)
2387 int i;
2388 unsigned long pfn;
2390 v->arch.guest_context.gdt_ents = 0;
2391 for ( i = 0; i < FIRST_RESERVED_GDT_PAGE; i++ )
2393 if ( (pfn = l1e_get_pfn(v->arch.perdomain_ptes[i])) != 0 )
2394 put_page_and_type(&frame_table[pfn]);
2395 v->arch.perdomain_ptes[i] = l1e_empty();
2396 v->arch.guest_context.gdt_frames[i] = 0;
2401 long set_gdt(struct vcpu *v,
2402 unsigned long *frames,
2403 unsigned int entries)
2405 struct domain *d = v->domain;
2406 /* NB. There are 512 8-byte entries per GDT page. */
2407 int i, nr_pages = (entries + 511) / 512;
2408 unsigned long pfn;
2410 if ( entries > FIRST_RESERVED_GDT_ENTRY )
2411 return -EINVAL;
2413 shadow_sync_all(d);
2415 /* Check the pages in the new GDT. */
2416 for ( i = 0; i < nr_pages; i++ )
2417 if ( ((pfn = frames[i]) >= max_page) ||
2418 !get_page_and_type(&frame_table[pfn], d, PGT_gdt_page) )
2419 goto fail;
2421 /* Tear down the old GDT. */
2422 destroy_gdt(v);
2424 /* Install the new GDT. */
2425 v->arch.guest_context.gdt_ents = entries;
2426 for ( i = 0; i < nr_pages; i++ )
2428 v->arch.guest_context.gdt_frames[i] = frames[i];
2429 v->arch.perdomain_ptes[i] =
2430 l1e_from_pfn(frames[i], __PAGE_HYPERVISOR);
2433 return 0;
2435 fail:
2436 while ( i-- > 0 )
2437 put_page_and_type(&frame_table[frames[i]]);
2438 return -EINVAL;
2442 long do_set_gdt(unsigned long *frame_list, unsigned int entries)
2444 int nr_pages = (entries + 511) / 512;
2445 unsigned long frames[16];
2446 long ret;
2448 /* Rechecked in set_gdt, but ensures a sane limit for copy_from_user(). */
2449 if ( entries > FIRST_RESERVED_GDT_ENTRY )
2450 return -EINVAL;
2452 if ( copy_from_user(frames, frame_list, nr_pages * sizeof(unsigned long)) )
2453 return -EFAULT;
2455 LOCK_BIGLOCK(current->domain);
2457 if ( (ret = set_gdt(current, frames, entries)) == 0 )
2458 local_flush_tlb();
2460 UNLOCK_BIGLOCK(current->domain);
2462 return ret;
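/*
 * Worked example for the sizing above: with 512 8-byte descriptors per
 * 4kB page, a guest GDT of, say, 1000 entries needs
 * (1000 + 511) / 512 = 2 frames -- well inside the 16-entry frames[]
 * array -- and 'entries' may not exceed FIRST_RESERVED_GDT_ENTRY, above
 * which Xen's own descriptors live.
 */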
2466 long do_update_descriptor(unsigned long pa, u64 desc)
2468 struct domain *dom = current->domain;
2469 unsigned long gpfn = pa >> PAGE_SHIFT;
2470 unsigned long mfn;
2471 unsigned int offset = (pa & ~PAGE_MASK) / sizeof(struct desc_struct);
2472 struct desc_struct *gdt_pent, d;
2473 struct pfn_info *page;
2474 long ret = -EINVAL;
2476 *(u64 *)&d = desc;
2478 LOCK_BIGLOCK(dom);
2480 if ( !VALID_MFN(mfn = __gpfn_to_mfn(dom, gpfn)) ||
2481 ((pa % sizeof(struct desc_struct)) != 0) ||
2482 (mfn >= max_page) ||
2483 !check_descriptor(&d) )
2485 UNLOCK_BIGLOCK(dom);
2486 return -EINVAL;
2489 page = &frame_table[mfn];
2490 if ( unlikely(!get_page(page, dom)) )
2492 UNLOCK_BIGLOCK(dom);
2493 return -EINVAL;
2496 /* Check if the given frame is in use in an unsafe context. */
2497 switch ( page->u.inuse.type_info & PGT_type_mask )
2499 case PGT_gdt_page:
2500 if ( unlikely(!get_page_type(page, PGT_gdt_page)) )
2501 goto out;
2502 break;
2503 case PGT_ldt_page:
2504 if ( unlikely(!get_page_type(page, PGT_ldt_page)) )
2505 goto out;
2506 break;
2507 default:
2508 if ( unlikely(!get_page_type(page, PGT_writable_page)) )
2509 goto out;
2510 break;
2513 if ( shadow_mode_enabled(dom) )
2515 shadow_lock(dom);
2517 if ( shadow_mode_log_dirty(dom) )
2518 __mark_dirty(dom, mfn);
2520 if ( page_is_page_table(page) && !page_out_of_sync(page) )
2521 shadow_mark_mfn_out_of_sync(current, gpfn, mfn);
2524 /* All is good so make the update. */
2525 gdt_pent = map_domain_page(mfn);
2526 memcpy(&gdt_pent[offset], &d, 8);
2527 unmap_domain_page(gdt_pent);
2529 if ( shadow_mode_enabled(dom) )
2530 shadow_unlock(dom);
2532 put_page_type(page);
2534 ret = 0; /* success */
2536 out:
2537 put_page(page);
2539 UNLOCK_BIGLOCK(dom);
2541 return ret;
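/*
 * Worked example for the checks above: a guest physical address
 * pa = 0x12345678 decomposes into gpfn = pa >> PAGE_SHIFT = 0x12345 and
 * offset = (pa & ~PAGE_MASK) / 8 = 0xcf, i.e. descriptor slot 207 of
 * that frame.  pa must be 8-byte aligned and the descriptor must pass
 * check_descriptor(), otherwise the call fails with -EINVAL.
 */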
2546 /*************************
2547 * Writable Pagetables
2548 */
2550 #ifdef VERBOSE
2551 int ptwr_debug = 0x0;
2552 #define PTWR_PRINTK(_f, _a...) \
2553 do { if ( unlikely(ptwr_debug) ) printk( _f , ## _a ); } while ( 0 )
2554 #define PTWR_PRINT_WHICH (which ? 'I' : 'A')
2555 #else
2556 #define PTWR_PRINTK(_f, _a...) ((void)0)
2557 #endif
2559 /* Re-validate a given p.t. page, given its prior snapshot */
2560 int revalidate_l1(struct domain *d, l1_pgentry_t *l1page, l1_pgentry_t *snapshot)
2562 l1_pgentry_t ol1e, nl1e;
2563 int modified = 0, i;
2565 #if 0
2566 if ( d->domain_id )
2567 printk("%s: l1page mfn=%lx snapshot mfn=%lx\n", __func__,
2568 l1e_get_pfn(linear_pg_table[l1_linear_offset((unsigned long)l1page)]),
2569 l1e_get_pfn(linear_pg_table[l1_linear_offset((unsigned long)snapshot)]));
2570 #endif
2572 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
2574 ol1e = snapshot[i];
2575 nl1e = l1page[i];
2577 if ( likely(l1e_get_intpte(ol1e) == l1e_get_intpte(nl1e)) )
2578 continue;
2580 /* Update number of entries modified. */
2581 modified++;
2583 /*
2584 * Fast path for PTEs that have merely been write-protected
2585 * (e.g., during a Unix fork()). A strict reduction in privilege.
2586 */
2587 if ( likely(l1e_get_intpte(ol1e) == (l1e_get_intpte(nl1e)|_PAGE_RW)) )
2589 if ( likely(l1e_get_flags(nl1e) & _PAGE_PRESENT) )
2590 put_page_type(&frame_table[l1e_get_pfn(nl1e)]);
2591 continue;
2594 if ( unlikely(!get_page_from_l1e(nl1e, d)) )
2596 MEM_LOG("ptwr: Could not re-validate l1 page");
2597 /*
2598 * Make the remaining p.t's consistent before crashing, so the
2599 * reference counts are correct.
2600 */
2601 memcpy(&l1page[i], &snapshot[i],
2602 (L1_PAGETABLE_ENTRIES - i) * sizeof(l1_pgentry_t));
2603 domain_crash();
2604 break;
2607 put_page_from_l1e(ol1e, d);
2610 return modified;
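/*
 * Worked example for the fast path above: if the snapshot held
 * ol1e = 0x1067 (frame 1, PRESENT|RW|USER|ACCESSED|DIRTY) and the guest
 * has merely cleared _PAGE_RW so that nl1e = 0x1065, then
 * l1e_get_intpte(ol1e) == (l1e_get_intpte(nl1e) | _PAGE_RW) holds and
 * only the writable type reference on frame 1 is dropped; no full
 * get_page_from_l1e() revalidation of the new entry is needed.
 */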
2614 /* Flush the given writable p.t. page and write-protect it again. */
2615 void ptwr_flush(struct domain *d, const int which)
2617 unsigned long pte, *ptep, l1va;
2618 l1_pgentry_t *pl1e;
2619 l2_pgentry_t *pl2e;
2620 unsigned int modified;
2622 ASSERT(!shadow_mode_enabled(d));
2624 if ( unlikely(d->arch.ptwr[which].vcpu != current) )
2625 write_ptbase(d->arch.ptwr[which].vcpu);
2627 l1va = d->arch.ptwr[which].l1va;
2628 ptep = (unsigned long *)&linear_pg_table[l1_linear_offset(l1va)];
2630 /*
2631 * STEP 1. Write-protect the p.t. page so no more updates can occur.
2632 */
2634 if ( unlikely(__get_user(pte, ptep)) )
2636 MEM_LOG("ptwr: Could not read pte at %p", ptep);
2637 /*
2638 * Really a bug. We could read this PTE during the initial fault,
2639 * and pagetables can't have changed meantime.
2640 */
2641 BUG();
2643 PTWR_PRINTK("[%c] disconnected_l1va at %p is %lx\n",
2644 PTWR_PRINT_WHICH, ptep, pte);
2645 pte &= ~_PAGE_RW;
2647 /* Write-protect the p.t. page in the guest page table. */
2648 if ( unlikely(__put_user(pte, ptep)) )
2650 MEM_LOG("ptwr: Could not update pte at %p", ptep);
2651 /*
2652 * Really a bug. We could write this PTE during the initial fault,
2653 * and pagetables can't have changed meantime.
2654 */
2655 BUG();
2658 /* Ensure that there are no stale writable mappings in any TLB. */
2659 /* NB. INVLPG is a serialising instruction: flushes pending updates. */
2660 flush_tlb_one_mask(d->cpumask, l1va);
2661 PTWR_PRINTK("[%c] disconnected_l1va at %p now %lx\n",
2662 PTWR_PRINT_WHICH, ptep, pte);
2664 /*
2665 * STEP 2. Validate any modified PTEs.
2666 */
2668 pl1e = d->arch.ptwr[which].pl1e;
2669 modified = revalidate_l1(d, pl1e, d->arch.ptwr[which].page);
2670 unmap_domain_page(pl1e);
2671 perfc_incr_histo(wpt_updates, modified, PT_UPDATES);
2672 d->arch.ptwr[which].prev_nr_updates = modified;
2674 /*
2675 * STEP 3. Reattach the L1 p.t. page into the current address space.
2676 */
2678 if ( which == PTWR_PT_ACTIVE )
2680 pl2e = &__linear_l2_table[d->arch.ptwr[which].l2_idx];
2681 l2e_add_flags(*pl2e, _PAGE_PRESENT);
2684 /*
2685 * STEP 4. Final tidy-up.
2686 */
2688 d->arch.ptwr[which].l1va = 0;
2690 if ( unlikely(d->arch.ptwr[which].vcpu != current) )
2691 write_ptbase(current);
2694 static int ptwr_emulated_update(
2695 unsigned long addr,
2696 physaddr_t old,
2697 physaddr_t val,
2698 unsigned int bytes,
2699 unsigned int do_cmpxchg)
2701 unsigned long pfn;
2702 struct pfn_info *page;
2703 l1_pgentry_t pte, ol1e, nl1e, *pl1e;
2704 struct domain *d = current->domain;
2706 /* Aligned access only, thank you. */
2707 if ( !access_ok(addr, bytes) || ((addr & (bytes-1)) != 0) )
2709 MEM_LOG("ptwr_emulate: Unaligned or bad size ptwr access (%d, %lx)",
2710 bytes, addr);
2711 return X86EMUL_UNHANDLEABLE;
2714 /* Turn a sub-word access into a full-word access. */
2715 if (bytes != sizeof(physaddr_t))
2717 int rc;
2718 physaddr_t full;
2719 unsigned int offset = addr & (sizeof(physaddr_t)-1);
2721 /* Align address; read full word. */
2722 addr &= ~(sizeof(physaddr_t)-1);
2723 if ( (rc = x86_emulate_read_std(addr, (unsigned long *)&full,
2724 sizeof(physaddr_t))) )
2725 return rc;
2726 /* Mask out bits provided by caller. */
2727 full &= ~((((physaddr_t)1 << (bytes*8)) - 1) << (offset*8));
2728 /* Shift the caller value and OR in the missing bits. */
2729 val &= (((physaddr_t)1 << (bytes*8)) - 1);
2730 val <<= (offset)*8;
2731 val |= full;
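/*
 * Worked example (32-bit, sizeof(physaddr_t) == 4): a 2-byte write of
 * 0xbeef to an address ending in ...6 gives offset = 2; 'full' is read
 * from the aligned word at ...4, the mask clears its top 16 bits (the
 * bytes the caller is supplying), and val becomes
 * (0xbeef << 16) | (full & 0xffff) -- a full-word value carrying both
 * the caller's bytes and the untouched ones.
 */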
2734 /* Read the PTE that maps the page being updated. */
2735 if (__copy_from_user(&pte, &linear_pg_table[l1_linear_offset(addr)],
2736 sizeof(pte)))
2738 MEM_LOG("ptwr_emulate: Cannot read thru linear_pg_table");
2739 return X86EMUL_UNHANDLEABLE;
2742 pfn = l1e_get_pfn(pte);
2743 page = &frame_table[pfn];
2745 /* We are looking only for read-only mappings of p.t. pages. */
2746 if ( ((l1e_get_flags(pte) & (_PAGE_RW|_PAGE_PRESENT)) != _PAGE_PRESENT) ||
2747 ((page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table) ||
2748 (page_get_owner(page) != d) )
2750 MEM_LOG("ptwr_emulate: Page is mistyped or bad pte (%lx, %08x)",
2751 l1e_get_pfn(pte), page->u.inuse.type_info);
2752 return X86EMUL_UNHANDLEABLE;
2755 /* Check the new PTE. */
2756 nl1e = l1e_from_intpte(val);
2757 if ( unlikely(!get_page_from_l1e(nl1e, d)) )
2758 return X86EMUL_UNHANDLEABLE;
2760 /* Checked successfully: do the update (write or cmpxchg). */
2761 pl1e = map_domain_page(page_to_pfn(page));
2762 pl1e = (l1_pgentry_t *)((unsigned long)pl1e + (addr & ~PAGE_MASK));
2763 if ( do_cmpxchg )
2765 ol1e = l1e_from_intpte(old);
2766 if ( cmpxchg((unsigned long *)pl1e, old, val) != old )
2768 unmap_domain_page(pl1e);
2769 put_page_from_l1e(nl1e, d);
2770 return X86EMUL_CMPXCHG_FAILED;
2773 else
2775 ol1e = *pl1e;
2776 *pl1e = nl1e;
2778 unmap_domain_page(pl1e);
2780 /* Finally, drop the old PTE. */
2781 put_page_from_l1e(ol1e, d);
2783 return X86EMUL_CONTINUE;
2786 static int ptwr_emulated_write(
2787 unsigned long addr,
2788 unsigned long val,
2789 unsigned int bytes)
2791 return ptwr_emulated_update(addr, 0, val, bytes, 0);
2794 static int ptwr_emulated_cmpxchg(
2795 unsigned long addr,
2796 unsigned long old,
2797 unsigned long new,
2798 unsigned int bytes)
2800 return ptwr_emulated_update(addr, old, new, bytes, 1);
2803 static int ptwr_emulated_cmpxchg8b(
2804 unsigned long addr,
2805 unsigned long old,
2806 unsigned long old_hi,
2807 unsigned long new,
2808 unsigned long new_hi)
2810 return ptwr_emulated_update(
2811 addr, ((u64)old_hi << 32) | old, ((u64)new_hi << 32) | new, 8, 1);
2814 static struct x86_mem_emulator ptwr_mem_emulator = {
2815 .read_std = x86_emulate_read_std,
2816 .write_std = x86_emulate_write_std,
2817 .read_emulated = x86_emulate_read_std,
2818 .write_emulated = ptwr_emulated_write,
2819 .cmpxchg_emulated = ptwr_emulated_cmpxchg,
2820 .cmpxchg8b_emulated = ptwr_emulated_cmpxchg8b
2821 };
2823 /* Write page fault handler: check if guest is trying to modify a PTE. */
2824 int ptwr_do_page_fault(struct domain *d, unsigned long addr)
2826 unsigned long pfn;
2827 struct pfn_info *page;
2828 l1_pgentry_t pte;
2829 l2_pgentry_t *pl2e;
2830 int which;
2831 u32 l2_idx;
2833 if ( unlikely(shadow_mode_enabled(d)) )
2834 return 0;
2836 /*
2837 * Attempt to read the PTE that maps the VA being accessed. By checking for
2838 * PDE validity in the L2 we avoid many expensive fixups in __get_user().
2839 */
2840 if ( !(l2e_get_flags(__linear_l2_table[addr>>L2_PAGETABLE_SHIFT]) &
2841 _PAGE_PRESENT) ||
2842 __copy_from_user(&pte,&linear_pg_table[l1_linear_offset(addr)],
2843 sizeof(pte)) )
2845 return 0;
2848 pfn = l1e_get_pfn(pte);
2849 page = &frame_table[pfn];
2851 /* We are looking only for read-only mappings of p.t. pages. */
2852 if ( ((l1e_get_flags(pte) & (_PAGE_RW|_PAGE_PRESENT)) != _PAGE_PRESENT) ||
2853 ((page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table) ||
2854 ((page->u.inuse.type_info & PGT_count_mask) == 0) ||
2855 (page_get_owner(page) != d) )
2857 return 0;
2860 /* x86/64: Writable pagetable code needs auditing. Use emulator for now. */
2861 #if defined(__x86_64__)
2862 goto emulate;
2863 #endif
2865 /* Get the L2 index at which this L1 p.t. is always mapped. */
2866 l2_idx = page->u.inuse.type_info & PGT_va_mask;
2867 if ( unlikely(l2_idx >= PGT_va_unknown) )
2868 goto emulate; /* Urk! This L1 is mapped in multiple L2 slots! */
2869 l2_idx >>= PGT_va_shift;
2871 if ( unlikely(l2_idx == (addr >> L2_PAGETABLE_SHIFT)) )
2872 goto emulate; /* Urk! Pagetable maps itself! */
2874 /*
2875 * Is the L1 p.t. mapped into the current address space? If so we call it
2876 * an ACTIVE p.t., otherwise it is INACTIVE.
2877 */
2878 pl2e = &__linear_l2_table[l2_idx];
2879 which = PTWR_PT_INACTIVE;
2880 if ( (l2e_get_pfn(*pl2e)) == pfn )
2882 /*
2883 * Check the PRESENT bit to set ACTIVE mode.
2884 * If the PRESENT bit is clear, we may be conflicting with the current
2885 * ACTIVE p.t. (it may be the same p.t. mapped at another virt addr).
2886 * The ptwr_flush call below will restore the PRESENT bit.
2887 */
2888 if ( likely(l2e_get_flags(*pl2e) & _PAGE_PRESENT) ||
2889 (d->arch.ptwr[PTWR_PT_ACTIVE].l1va &&
2890 (l2_idx == d->arch.ptwr[PTWR_PT_ACTIVE].l2_idx)) )
2891 which = PTWR_PT_ACTIVE;
2894 /*
2895 * If this is a multi-processor guest then ensure that the page is hooked
2896 * into at most one L2 table, which must be the one running on this VCPU.
2897 */
2898 if ( (d->vcpu[0]->next_in_list != NULL) &&
2899 ((page->u.inuse.type_info & PGT_count_mask) !=
2900 (!!(page->u.inuse.type_info & PGT_pinned) +
2901 (which == PTWR_PT_ACTIVE))) )
2903 /* Could be conflicting writable mappings from other VCPUs. */
2904 cleanup_writable_pagetable(d);
2905 goto emulate;
2908 PTWR_PRINTK("[%c] page_fault on l1 pt at va %lx, pt for %08x, "
2909 "pfn %lx\n", PTWR_PRINT_WHICH,
2910 addr, l2_idx << L2_PAGETABLE_SHIFT, pfn);
2912 /*
2913 * We only allow one ACTIVE and one INACTIVE p.t. to be updated at a
2914 * time. If there is already one, we must flush it out.
2915 */
2916 if ( d->arch.ptwr[which].l1va )
2917 ptwr_flush(d, which);
2919 /*
2920 * If last batch made no updates then we are probably stuck. Emulate this
2921 * update to ensure we make progress.
2922 */
2923 if ( d->arch.ptwr[which].prev_nr_updates == 0 )
2925 /* Ensure that we don't get stuck in an emulation-only rut. */
2926 d->arch.ptwr[which].prev_nr_updates = 1;
2927 goto emulate;
2930 d->arch.ptwr[which].l1va = addr | 1;
2931 d->arch.ptwr[which].l2_idx = l2_idx;
2932 d->arch.ptwr[which].vcpu = current;
2934 /* For safety, disconnect the L1 p.t. page from current space. */
2935 if ( which == PTWR_PT_ACTIVE )
2937 l2e_remove_flags(*pl2e, _PAGE_PRESENT);
2938 flush_tlb_mask(d->cpumask);
2941 /* Temporarily map the L1 page, and make a copy of it. */
2942 d->arch.ptwr[which].pl1e = map_domain_page(pfn);
2943 memcpy(d->arch.ptwr[which].page,
2944 d->arch.ptwr[which].pl1e,
2945 L1_PAGETABLE_ENTRIES * sizeof(l1_pgentry_t));
2947 /* Finally, make the p.t. page writable by the guest OS. */
2948 l1e_add_flags(pte, _PAGE_RW);
2949 if ( unlikely(__copy_to_user(&linear_pg_table[addr>>PAGE_SHIFT],
2950 &pte, sizeof(pte))) )
2952 MEM_LOG("ptwr: Could not update pte at %p", (unsigned long *)
2953 &linear_pg_table[addr>>PAGE_SHIFT]);
2954 /* Toss the writable pagetable state and crash. */
2955 unmap_domain_page(d->arch.ptwr[which].pl1e);
2956 d->arch.ptwr[which].l1va = 0;
2957 domain_crash();
2958 return 0;
2961 return EXCRET_fault_fixed;
2963 emulate:
2964 if ( x86_emulate_memop(guest_cpu_user_regs(), addr,
2965 &ptwr_mem_emulator, BITS_PER_LONG/8) )
2966 return 0;
2967 perfc_incrc(ptwr_emulations);
2968 return EXCRET_fault_fixed;
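/*
 * Summary of the path above: a write fault on a read-only mapping of one
 * of the guest's own L1 pagetable pages either falls back to the
 * emulator (x86/64, multiply-mapped or self-mapping L1s, SMP conflicts,
 * or a previous batch that made no updates) or grants direct access:
 * the L1 is unhooked from the L2 if ACTIVE, snapshotted into
 * d->arch.ptwr[which].page, and its mapping made writable.  The next
 * fault, flush or cleanup_writable_pagetable() call triggers
 * ptwr_flush(), which re-protects the page and pushes every modified
 * entry through revalidate_l1() so reference counts stay correct.
 */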
2971 int ptwr_init(struct domain *d)
2973 void *x = alloc_xenheap_page();
2974 void *y = alloc_xenheap_page();
2976 if ( (x == NULL) || (y == NULL) )
2978 if ( x != NULL )
2979 free_xenheap_page(x);
2980 if ( y != NULL )
2981 free_xenheap_page(y);
2982 return -ENOMEM;
2985 d->arch.ptwr[PTWR_PT_ACTIVE].page = x;
2986 d->arch.ptwr[PTWR_PT_INACTIVE].page = y;
2988 return 0;
2991 void ptwr_destroy(struct domain *d)
2993 cleanup_writable_pagetable(d);
2994 free_xenheap_page(d->arch.ptwr[PTWR_PT_ACTIVE].page);
2995 free_xenheap_page(d->arch.ptwr[PTWR_PT_INACTIVE].page);
2998 void cleanup_writable_pagetable(struct domain *d)
3000 if ( unlikely(!VM_ASSIST(d, VMASST_TYPE_writable_pagetables)) )
3001 return;
3003 if ( unlikely(shadow_mode_enabled(d)) )
3005 shadow_sync_all(d);
3007 else
3009 if ( d->arch.ptwr[PTWR_PT_ACTIVE].l1va )
3010 ptwr_flush(d, PTWR_PT_ACTIVE);
3011 if ( d->arch.ptwr[PTWR_PT_INACTIVE].l1va )
3012 ptwr_flush(d, PTWR_PT_INACTIVE);
3016 int map_pages_to_xen(
3017 unsigned long virt,
3018 unsigned long pfn,
3019 unsigned long nr_pfns,
3020 unsigned long flags)
3022 l2_pgentry_t *pl2e, ol2e;
3023 l1_pgentry_t *pl1e, ol1e;
3024 unsigned int i;
3026 unsigned int map_small_pages = !!(flags & MAP_SMALL_PAGES);
3027 flags &= ~MAP_SMALL_PAGES;
3029 while ( nr_pfns != 0 )
3031 pl2e = virt_to_xen_l2e(virt);
3033 if ( ((((virt>>PAGE_SHIFT) | pfn) & ((1<<PAGETABLE_ORDER)-1)) == 0) &&
3034 (nr_pfns >= (1<<PAGETABLE_ORDER)) &&
3035 !map_small_pages )
3037 /* Super-page mapping. */
3038 ol2e = *pl2e;
3039 *pl2e = l2e_from_pfn(pfn, flags|_PAGE_PSE);
3041 if ( (l2e_get_flags(ol2e) & _PAGE_PRESENT) )
3043 local_flush_tlb_pge();
3044 if ( !(l2e_get_flags(ol2e) & _PAGE_PSE) )
3045 free_xen_pagetable(l2e_get_page(ol2e));
3048 virt += 1UL << L2_PAGETABLE_SHIFT;
3049 pfn += 1UL << PAGETABLE_ORDER;
3050 nr_pfns -= 1UL << PAGETABLE_ORDER;
3052 else
3054 /* Normal page mapping. */
3055 if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) )
3057 pl1e = page_to_virt(alloc_xen_pagetable());
3058 clear_page(pl1e);
3059 *pl2e = l2e_from_page(virt_to_page(pl1e), __PAGE_HYPERVISOR);
3061 else if ( l2e_get_flags(*pl2e) & _PAGE_PSE )
3063 pl1e = page_to_virt(alloc_xen_pagetable());
3064 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
3065 pl1e[i] = l1e_from_pfn(
3066 l2e_get_pfn(*pl2e) + i,
3067 l2e_get_flags(*pl2e) & ~_PAGE_PSE);
3068 *pl2e = l2e_from_page(virt_to_page(pl1e), __PAGE_HYPERVISOR);
3069 local_flush_tlb_pge();
3072 pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(virt);
3073 ol1e = *pl1e;
3074 *pl1e = l1e_from_pfn(pfn, flags);
3075 if ( (l1e_get_flags(ol1e) & _PAGE_PRESENT) )
3076 local_flush_tlb_one(virt);
3078 virt += 1UL << L1_PAGETABLE_SHIFT;
3079 pfn += 1UL;
3080 nr_pfns -= 1UL;
3084 return 0;
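/*
 * Illustrative note for the two branches above: when both virt and pfn
 * are superpage-aligned (their low PAGETABLE_ORDER frame-number bits are
 * zero) and at least 1 << PAGETABLE_ORDER frames remain, each iteration
 * installs a single _PAGE_PSE L2 entry covering a whole 2MB/4MB
 * superpage; passing MAP_SMALL_PAGES, as memguard_init() does below,
 * forces individual 4kB L1 mappings so single pages can later be
 * guarded or unguarded on their own.
 */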
3087 void __set_fixmap(
3088 enum fixed_addresses idx, unsigned long p, unsigned long flags)
3090 if ( unlikely(idx >= __end_of_fixed_addresses) )
3091 BUG();
3092 map_pages_to_xen(fix_to_virt(idx), p >> PAGE_SHIFT, 1, flags);
3095 #ifdef MEMORY_GUARD
3097 void memguard_init(void)
3099 map_pages_to_xen(
3100 PAGE_OFFSET, 0, xenheap_phys_end >> PAGE_SHIFT,
3101 __PAGE_HYPERVISOR|MAP_SMALL_PAGES);
3104 static void __memguard_change_range(void *p, unsigned long l, int guard)
3106 unsigned long _p = (unsigned long)p;
3107 unsigned long _l = (unsigned long)l;
3108 unsigned long flags = __PAGE_HYPERVISOR | MAP_SMALL_PAGES;
3110 /* Ensure we are dealing with a page-aligned whole number of pages. */
3111 ASSERT((_p&PAGE_MASK) != 0);
3112 ASSERT((_l&PAGE_MASK) != 0);
3113 ASSERT((_p&~PAGE_MASK) == 0);
3114 ASSERT((_l&~PAGE_MASK) == 0);
3116 if ( guard )
3117 flags &= ~_PAGE_PRESENT;
3119 map_pages_to_xen(
3120 _p, virt_to_phys(p) >> PAGE_SHIFT, _l >> PAGE_SHIFT, flags);
3123 void memguard_guard_range(void *p, unsigned long l)
3125 __memguard_change_range(p, l, 1);
3128 void memguard_unguard_range(void *p, unsigned long l)
3130 __memguard_change_range(p, l, 0);
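/*
 * Illustrative usage, not part of this file: a hypothetical debugging
 * helper that guards a page so stray accesses fault, then restores it
 * before reuse.
 */
#if 0 /* example only */
static void debug_poison_page(void *p)
{
    memguard_guard_range(p, PAGE_SIZE);    /* mappings become non-present */
    /* ... any access to p now faults inside Xen ... */
    memguard_unguard_range(p, PAGE_SIZE);  /* restore before reuse */
}
#endif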
3133 #endif
3135 /*
3136 * Local variables:
3137 * mode: C
3138 * c-set-style: "BSD"
3139 * c-basic-offset: 4
3140 * tab-width: 4
3141 * indent-tabs-mode: nil
3142 * End:
3143 */