direct-io.hg: xen/arch/x86/mm.c @ changeset 10173:954f4dea9da6

[PAE] Allow pgdirs above 4GB for paravirt guests.
**NOTE**: This obviates the need for lowmem_emergency_pool.
Unpriv guests no longer need to be able to allocate memory
below 4GB for PAE PDPTs.
Signed-off-by: Keir Fraser <keir@xensource.com>

author   kaf24@firebug.cl.cam.ac.uk
date     Fri May 26 17:22:30 2006 +0100 (2006-05-26)
parents  41de9cd7971b
children d5f98d23427a
1 /******************************************************************************
2 * arch/x86/mm.c
3 *
4 * Copyright (c) 2002-2005 K A Fraser
5 * Copyright (c) 2004 Christian Limpach
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
22 /*
23 * A description of the x86 page table API:
24 *
25 * Domains trap to do_mmu_update with a list of update requests.
26 * This is a list of (ptr, val) pairs, where the requested operation
27 * is *ptr = val.
28 *
29 * Reference counting of pages:
30 * ----------------------------
31 * Each page has two refcounts: tot_count and type_count.
32 *
33 * TOT_COUNT is the obvious reference count. It counts all uses of a
34 * physical page frame by a domain, including uses as a page directory,
35 * a page table, or simple mappings via a PTE. This count prevents a
36 * domain from releasing a frame back to the free pool when it still holds
37 * a reference to it.
38 *
39 * TYPE_COUNT is more subtle. A frame can be put to one of three
40 * mutually-exclusive uses: it might be used as a page directory, or a
41 * page table, or it may be mapped writable by the domain [of course, a
42 * frame may not be used in any of these three ways!].
43 * So, type_count is a count of the number of times a frame is being
44 * referred to in its current incarnation. Therefore, a page can only
45 * change its type when its type count is zero.
46 *
47 * Pinning the page type:
48 * ----------------------
49 * The type of a page can be pinned/unpinned with the commands
50 * MMUEXT_[UN]PIN_L?_TABLE. Each page can be pinned exactly once (that is,
51 * pinning is not reference counted, so it can't be nested).
52 * This is useful to prevent a page's type count falling to zero, at which
53 * point safety checks would need to be carried out next time the count
54 * is increased again.
55 *
56 * A further note on writable page mappings:
57 * -----------------------------------------
58 * For simplicity, the count of writable mappings for a page may not
59 * correspond to reality. The 'writable count' is incremented for every
60 * PTE which maps the page with the _PAGE_RW flag set. However, for
61 * write access to be possible the page directory entry must also have
62 * its _PAGE_RW bit set. We do not check this as it complicates the
63 * reference counting considerably [consider the case of multiple
64 * directory entries referencing a single page table, some with the RW
65 * bit set, others not -- it starts getting a bit messy].
66 * In normal use, this simplification shouldn't be a problem.
67 * However, the logic can be added if required.
68 *
69 * One more note on read-only page mappings:
70 * -----------------------------------------
71 * We want domains to be able to map pages for read-only access. The
72 * main reason is that page tables and directories should be readable
73 * by a domain, but it would not be safe for them to be writable.
74 * However, domains have free access to rings 1 & 2 of the Intel
75 * privilege model. In terms of page protection, these are considered
76 * to be part of 'supervisor mode'. The WP bit in CR0 controls whether
77 * read-only restrictions are respected in supervisor mode -- if the
78 * bit is clear then any mapped page is writable.
79 *
80 * We get round this by always setting the WP bit and disallowing
81 * updates to it. This is very unlikely to cause a problem for guest
82 * OS's, which will generally use the WP bit to simplify copy-on-write
83 * implementation (in that case, the OS wants a fault when it writes to
84 * an application-supplied buffer).
85 */
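/*
 * Illustrative sketch of how a paravirtual guest drives the interface
 * described above: it batches (ptr, val) pairs into struct mmu_update
 * requests and issues a single mmu_update hypercall.  struct mmu_update
 * comes from the public Xen interface headers; the HYPERVISOR_mmu_update()
 * wrapper and the pte_maddr/new_val values are guest-side assumptions of
 * this sketch, not something defined in mm.c.
 */
static void guest_pte_update_sketch(uint64_t pte_maddr, uint64_t new_val)
{
    struct mmu_update req;
    unsigned int done = 0;

    /*
     * The low bits of ptr encode the command; MMU_NORMAL_PT_UPDATE is 0,
     * so a PTE-aligned machine address requests a normal "*ptr = val".
     */
    req.ptr = pte_maddr;
    req.val = new_val;

    if ( HYPERVISOR_mmu_update(&req, 1, &done, DOMID_SELF) < 0 )
        BUG();   /* do_mmu_update() rejected the request */
}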
87 #include <xen/config.h>
88 #include <xen/init.h>
89 #include <xen/kernel.h>
90 #include <xen/lib.h>
91 #include <xen/mm.h>
92 #include <xen/sched.h>
93 #include <xen/errno.h>
94 #include <xen/perfc.h>
95 #include <xen/irq.h>
96 #include <xen/softirq.h>
97 #include <xen/domain_page.h>
98 #include <xen/event.h>
99 #include <xen/iocap.h>
100 #include <xen/guest_access.h>
101 #include <asm/shadow.h>
102 #include <asm/page.h>
103 #include <asm/flushtlb.h>
104 #include <asm/io.h>
105 #include <asm/ldt.h>
106 #include <asm/x86_emulate.h>
107 #include <public/memory.h>
109 #ifdef VERBOSE
110 #define MEM_LOG(_f, _a...) \
111 printk("DOM%u: (file=mm.c, line=%d) " _f "\n", \
112 current->domain->domain_id , __LINE__ , ## _a )
113 #else
114 #define MEM_LOG(_f, _a...) ((void)0)
115 #endif
117 /*
118 * Both do_mmuext_op() and do_mmu_update():
119 * We steal the m.s.b. of the @count parameter to indicate whether this
120 * invocation of do_mmu_update() is resuming a previously preempted call.
121 */
122 #define MMU_UPDATE_PREEMPTED (~(~0U>>1))
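/*
 * Usage sketch for the flag above: a preempted hypercall is resumed with
 * the flag OR'd into the remaining count, as the continuation code further
 * down does:
 *
 *   rc = hypercall_create_continuation(
 *       __HYPERVISOR_mmu_update, "hihi",
 *       ureqs, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
 *
 * and on re-entry the flag is tested and stripped before use:
 *
 *   if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
 *       count &= ~MMU_UPDATE_PREEMPTED;
 *
 * so the m.s.b. is never available for encoding a genuine request count.
 */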
124 static void free_l2_table(struct page_info *page);
125 static void free_l1_table(struct page_info *page);
127 static int mod_l2_entry(l2_pgentry_t *, l2_pgentry_t, unsigned long,
128 unsigned long type);
129 static int mod_l1_entry(l1_pgentry_t *, l1_pgentry_t);
131 /* Used to defer flushing of memory structures. */
132 static struct {
133 #define DOP_FLUSH_TLB (1<<0) /* Flush the local TLB. */
134 #define DOP_FLUSH_ALL_TLBS (1<<1) /* Flush TLBs of all VCPUs of current dom. */
135 #define DOP_RELOAD_LDT (1<<2) /* Reload the LDT shadow mapping. */
136 unsigned int deferred_ops;
137 /* If non-NULL, specifies a foreign subject domain for some operations. */
138 struct domain *foreign;
139 } __cacheline_aligned percpu_info[NR_CPUS];
141 /*
142 * Returns the current foreign domain; defaults to the currently-executing
143 * domain if a foreign override hasn't been specified.
144 */
145 #define FOREIGNDOM (percpu_info[smp_processor_id()].foreign ?: current->domain)
147 /* Private domain structs for DOMID_XEN and DOMID_IO. */
148 static struct domain *dom_xen, *dom_io;
150 /* Frame table and its size in pages. */
151 struct page_info *frame_table;
152 unsigned long max_page;
153 unsigned long total_pages;
155 void __init init_frametable(void)
156 {
157 unsigned long nr_pages, page_step, i, mfn;
159 frame_table = (struct page_info *)FRAMETABLE_VIRT_START;
161 nr_pages = PFN_UP(max_page * sizeof(*frame_table));
162 page_step = (1 << L2_PAGETABLE_SHIFT) >> PAGE_SHIFT;
164 for ( i = 0; i < nr_pages; i += page_step )
165 {
166 mfn = alloc_boot_pages(min(nr_pages - i, page_step), page_step);
167 if ( mfn == 0 )
168 panic("Not enough memory for frame table\n");
169 map_pages_to_xen(
170 FRAMETABLE_VIRT_START + (i << PAGE_SHIFT),
171 mfn, page_step, PAGE_HYPERVISOR);
172 }
174 memset(frame_table, 0, nr_pages << PAGE_SHIFT);
175 }
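/*
 * Sizing sketch for init_frametable() above: nr_pages =
 * PFN_UP(max_page * sizeof(struct page_info)).  With 4GB of RAM
 * (max_page = 0x100000) and, say, a 32-byte page_info, that is 32MB of
 * frame table, i.e. 8192 4kB pages, mapped in L2-superpage-sized chunks
 * (page_step = 512 pages for a 2MB L2 shift, 1024 for a 4MB one).  The
 * exact sizeof(struct page_info) is build-dependent; the figures here are
 * purely illustrative.
 */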
177 void arch_init_memory(void)
178 {
179 extern void subarch_init_memory(void);
181 unsigned long i, pfn, rstart_pfn, rend_pfn;
183 memset(percpu_info, 0, sizeof(percpu_info));
185 /*
186 * Initialise our DOMID_XEN domain.
187 * Any Xen-heap pages that we will allow to be mapped will have
188 * their domain field set to dom_xen.
189 */
190 dom_xen = alloc_domain();
191 spin_lock_init(&dom_xen->page_alloc_lock);
192 atomic_set(&dom_xen->refcnt, 1);
193 dom_xen->domain_id = DOMID_XEN;
195 /*
196 * Initialise our DOMID_IO domain.
197 * This domain owns I/O pages that are within the range of the page_info
198 * array. Mappings occur at the privilege level of the caller.
199 */
200 dom_io = alloc_domain();
201 spin_lock_init(&dom_io->page_alloc_lock);
202 atomic_set(&dom_io->refcnt, 1);
203 dom_io->domain_id = DOMID_IO;
205 /* First 1MB of RAM is historically marked as I/O. */
206 for ( i = 0; i < 0x100; i++ )
207 share_xen_page_with_guest(mfn_to_page(i), dom_io, XENSHARE_writable);
209 /* Any areas not specified as RAM by the e820 map are considered I/O. */
210 for ( i = 0, pfn = 0; i < e820.nr_map; i++ )
211 {
212 if ( e820.map[i].type != E820_RAM )
213 continue;
214 /* Every page from cursor to start of next RAM region is I/O. */
215 rstart_pfn = PFN_UP(e820.map[i].addr);
216 rend_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
217 for ( ; pfn < rstart_pfn; pfn++ )
218 {
219 BUG_ON(!mfn_valid(pfn));
220 share_xen_page_with_guest(
221 mfn_to_page(pfn), dom_io, XENSHARE_writable);
222 }
223 /* Skip the RAM region. */
224 pfn = rend_pfn;
225 }
226 BUG_ON(pfn != max_page);
228 subarch_init_memory();
229 }
231 void share_xen_page_with_guest(
232 struct page_info *page, struct domain *d, int readonly)
233 {
234 if ( page_get_owner(page) == d )
235 return;
237 set_gpfn_from_mfn(page_to_mfn(page), INVALID_M2P_ENTRY);
239 spin_lock(&d->page_alloc_lock);
241 /* The incremented type count pins as writable or read-only. */
242 page->u.inuse.type_info = (readonly ? PGT_none : PGT_writable_page);
243 page->u.inuse.type_info |= PGT_validated | 1;
245 page_set_owner(page, d);
246 wmb(); /* install valid domain ptr before updating refcnt. */
247 ASSERT(page->count_info == 0);
248 page->count_info |= PGC_allocated | 1;
250 if ( unlikely(d->xenheap_pages++ == 0) )
251 get_knownalive_domain(d);
252 list_add_tail(&page->list, &d->xenpage_list);
254 spin_unlock(&d->page_alloc_lock);
255 }
257 void share_xen_page_with_privileged_guests(
258 struct page_info *page, int readonly)
259 {
260 share_xen_page_with_guest(page, dom_xen, readonly);
261 }
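/*
 * Background for __write_ptbase() below: with 32-bit PAE paging the value
 * loaded into CR3 is a 32-bit physical pointer to the 4-entry PDPT, so the
 * table CR3 points at must itself live below 4GB even though the guest's
 * pgdir frame may now sit above 4GB.  For a high pgdir the function
 * therefore copies the guest's L3 entries into one of two per-VCPU
 * low-memory caches (v->arch.lowmem_l3tab[]) and points CR3 at the copy;
 * pae_flush_pgd() further down keeps those cached copies coherent when the
 * guest later rewrites an L3 slot.
 */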
263 static void __write_ptbase(unsigned long mfn)
264 {
265 #ifdef CONFIG_X86_PAE
266 if ( mfn >= 0x100000 )
267 {
268 l3_pgentry_t *highmem_l3tab, *lowmem_l3tab;
269 struct vcpu *v = current;
270 unsigned long flags;
272 /* Protects against re-entry and against __pae_flush_pgd(). */
273 local_irq_save(flags);
275 /* Pick an unused low-memory L3 cache slot. */
276 v->arch.lowmem_l3tab_inuse ^= 1;
277 lowmem_l3tab = v->arch.lowmem_l3tab[v->arch.lowmem_l3tab_inuse];
278 v->arch.lowmem_l3tab_high_mfn[v->arch.lowmem_l3tab_inuse] = mfn;
280 /* Map the guest L3 table and copy to the chosen low-memory cache. */
281 highmem_l3tab = map_domain_page(mfn);
282 memcpy(lowmem_l3tab, highmem_l3tab, sizeof(v->arch.lowmem_l3tab));
283 unmap_domain_page(highmem_l3tab);
285 /* Install the low-memory L3 table in CR3. */
286 write_cr3(__pa(lowmem_l3tab));
288 local_irq_restore(flags);
289 return;
290 }
291 #endif
293 write_cr3(mfn << PAGE_SHIFT);
294 }
296 void write_ptbase(struct vcpu *v)
297 {
298 __write_ptbase(pagetable_get_pfn(v->arch.monitor_table));
299 }
301 void invalidate_shadow_ldt(struct vcpu *v)
302 {
303 int i;
304 unsigned long pfn;
305 struct page_info *page;
307 if ( v->arch.shadow_ldt_mapcnt == 0 )
308 return;
310 v->arch.shadow_ldt_mapcnt = 0;
312 for ( i = 16; i < 32; i++ )
313 {
314 pfn = l1e_get_pfn(v->arch.perdomain_ptes[i]);
315 if ( pfn == 0 ) continue;
316 v->arch.perdomain_ptes[i] = l1e_empty();
317 page = mfn_to_page(pfn);
318 ASSERT_PAGE_IS_TYPE(page, PGT_ldt_page);
319 ASSERT_PAGE_IS_DOMAIN(page, v->domain);
320 put_page_and_type(page);
321 }
323 /* Dispose of the (now possibly invalid) mappings from the TLB. */
324 percpu_info[v->processor].deferred_ops |= DOP_FLUSH_TLB | DOP_RELOAD_LDT;
325 }
328 static int alloc_segdesc_page(struct page_info *page)
329 {
330 struct desc_struct *descs;
331 int i;
333 descs = map_domain_page(page_to_mfn(page));
335 for ( i = 0; i < 512; i++ )
336 if ( unlikely(!check_descriptor(&descs[i])) )
337 goto fail;
339 unmap_domain_page(descs);
340 return 1;
342 fail:
343 unmap_domain_page(descs);
344 return 0;
345 }
348 /* Map shadow page at offset @off. */
349 int map_ldt_shadow_page(unsigned int off)
350 {
351 struct vcpu *v = current;
352 struct domain *d = v->domain;
353 unsigned long gmfn, mfn;
354 l1_pgentry_t l1e, nl1e;
355 unsigned long gva = v->arch.guest_context.ldt_base + (off << PAGE_SHIFT);
356 int res;
358 #if defined(__x86_64__)
359 /* If in user mode, switch to kernel mode just to read LDT mapping. */
360 int user_mode = !(v->arch.flags & TF_kernel_mode);
361 #define TOGGLE_MODE() if ( user_mode ) toggle_guest_mode(v)
362 #elif defined(__i386__)
363 #define TOGGLE_MODE() ((void)0)
364 #endif
366 BUG_ON(unlikely(in_irq()));
368 shadow_sync_va(v, gva);
370 TOGGLE_MODE();
371 __copy_from_user(&l1e, &linear_pg_table[l1_linear_offset(gva)],
372 sizeof(l1e));
373 TOGGLE_MODE();
375 if ( unlikely(!(l1e_get_flags(l1e) & _PAGE_PRESENT)) )
376 return 0;
378 gmfn = l1e_get_pfn(l1e);
379 mfn = gmfn_to_mfn(d, gmfn);
380 if ( unlikely(!VALID_MFN(mfn)) )
381 return 0;
383 res = get_page_and_type(mfn_to_page(mfn), d, PGT_ldt_page);
385 if ( !res && unlikely(shadow_mode_refcounts(d)) )
386 {
387 shadow_lock(d);
388 shadow_remove_all_write_access(d, gmfn, mfn);
389 res = get_page_and_type(mfn_to_page(mfn), d, PGT_ldt_page);
390 shadow_unlock(d);
391 }
393 if ( unlikely(!res) )
394 return 0;
396 nl1e = l1e_from_pfn(mfn, l1e_get_flags(l1e) | _PAGE_RW);
398 v->arch.perdomain_ptes[off + 16] = nl1e;
399 v->arch.shadow_ldt_mapcnt++;
401 return 1;
402 }
405 static int get_page_from_pagenr(unsigned long page_nr, struct domain *d)
406 {
407 struct page_info *page = mfn_to_page(page_nr);
409 if ( unlikely(!mfn_valid(page_nr)) || unlikely(!get_page(page, d)) )
410 {
411 MEM_LOG("Could not get page ref for pfn %lx", page_nr);
412 return 0;
413 }
415 return 1;
416 }
419 static int get_page_and_type_from_pagenr(unsigned long page_nr,
420 unsigned long type,
421 struct domain *d)
422 {
423 struct page_info *page = mfn_to_page(page_nr);
425 if ( unlikely(!get_page_from_pagenr(page_nr, d)) )
426 return 0;
428 if ( unlikely(!get_page_type(page, type)) )
429 {
430 put_page(page);
431 return 0;
432 }
434 return 1;
435 }
437 #ifndef CONFIG_X86_PAE /* We do not support guest linear mappings on PAE. */
438 /*
439 * We allow root tables to map each other (a.k.a. linear page tables). It
440 * needs some special care with reference counts and access permissions:
441 * 1. The mapping entry must be read-only, or the guest may get write access
442 * to its own PTEs.
443 * 2. We must only bump the reference counts for an *already validated*
444 * L2 table, or we can end up in a deadlock in get_page_type() by waiting
445 * on a validation that is required to complete that validation.
446 * 3. We only need to increment the reference counts for the mapped page
447 * frame if it is mapped by a different root table. This is sufficient and
448 * also necessary to allow validation of a root table mapping itself.
449 */
450 static int
451 get_linear_pagetable(
452 root_pgentry_t re, unsigned long re_pfn, struct domain *d)
453 {
454 unsigned long x, y;
455 struct page_info *page;
456 unsigned long pfn;
458 ASSERT( !shadow_mode_refcounts(d) );
460 if ( (root_get_flags(re) & _PAGE_RW) )
461 {
462 MEM_LOG("Attempt to create linear p.t. with write perms");
463 return 0;
464 }
466 if ( (pfn = root_get_pfn(re)) != re_pfn )
467 {
468 /* Make sure the mapped frame belongs to the correct domain. */
469 if ( unlikely(!get_page_from_pagenr(pfn, d)) )
470 return 0;
472 /*
473 * Make sure that the mapped frame is an already-validated L2 table.
474 * If so, atomically increment the count (checking for overflow).
475 */
476 page = mfn_to_page(pfn);
477 y = page->u.inuse.type_info;
478 do {
479 x = y;
480 if ( unlikely((x & PGT_count_mask) == PGT_count_mask) ||
481 unlikely((x & (PGT_type_mask|PGT_validated)) !=
482 (PGT_root_page_table|PGT_validated)) )
483 {
484 put_page(page);
485 return 0;
486 }
487 }
488 while ( (y = cmpxchg(&page->u.inuse.type_info, x, x + 1)) != x );
489 }
491 return 1;
492 }
493 #endif /* !CONFIG_X86_PAE */
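/*
 * Illustrative sketch of the rules above on a 2-level build: the simplest
 * linear page table is a page directory that maps itself, read-only so
 * that rule 1 holds.  Validation then takes the (pfn == re_pfn) branch in
 * get_linear_pagetable() and no extra reference is needed (rule 3).  The
 * helper below only sketches the entry a guest would install; the slot it
 * goes into is the guest's choice.
 */
static l2_pgentry_t linear_slot_sketch(unsigned long pgdir_mfn)
{
    /* _PAGE_RW deliberately absent: a writable linear slot is rejected. */
    return l2e_from_pfn(pgdir_mfn, _PAGE_PRESENT | _PAGE_ACCESSED);
}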
495 int
496 get_page_from_l1e(
497 l1_pgentry_t l1e, struct domain *d)
498 {
499 unsigned long mfn = l1e_get_pfn(l1e);
500 struct page_info *page = mfn_to_page(mfn);
501 int okay;
503 if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) )
504 return 1;
506 if ( unlikely(l1e_get_flags(l1e) & L1_DISALLOW_MASK) )
507 {
508 MEM_LOG("Bad L1 flags %x", l1e_get_flags(l1e) & L1_DISALLOW_MASK);
509 return 0;
510 }
512 if ( unlikely(!mfn_valid(mfn)) ||
513 unlikely(page_get_owner(page) == dom_io) )
514 {
515 /* DOMID_IO reverts to caller for privilege checks. */
516 if ( d == dom_io )
517 d = current->domain;
519 if ( !iomem_access_permitted(d, mfn, mfn) )
520 {
521 MEM_LOG("Non-privileged attempt to map I/O space %08lx", mfn);
522 return 0;
523 }
525 /* No reference counting for out-of-range I/O pages. */
526 if ( !mfn_valid(mfn) )
527 return 1;
529 d = dom_io;
530 }
532 okay = ((l1e_get_flags(l1e) & _PAGE_RW) ?
533 get_page_and_type(page, d, PGT_writable_page) :
534 get_page(page, d));
535 if ( !okay )
536 {
537 MEM_LOG("Error getting mfn %lx (pfn %lx) from L1 entry %" PRIpte
538 " for dom%d",
539 mfn, get_gpfn_from_mfn(mfn),
540 l1e_get_intpte(l1e), d->domain_id);
541 }
543 return okay;
544 }
547 /* NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. */
548 static int
549 get_page_from_l2e(
550 l2_pgentry_t l2e, unsigned long pfn,
551 struct domain *d, unsigned long vaddr)
552 {
553 int rc;
555 ASSERT(!shadow_mode_refcounts(d));
557 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
558 return 1;
560 if ( unlikely((l2e_get_flags(l2e) & L2_DISALLOW_MASK)) )
561 {
562 MEM_LOG("Bad L2 flags %x", l2e_get_flags(l2e) & L2_DISALLOW_MASK);
563 return 0;
564 }
566 vaddr >>= L2_PAGETABLE_SHIFT;
567 vaddr <<= PGT_va_shift;
568 rc = get_page_and_type_from_pagenr(
569 l2e_get_pfn(l2e), PGT_l1_page_table | vaddr, d);
570 #if CONFIG_PAGING_LEVELS == 2
571 if ( unlikely(!rc) )
572 rc = get_linear_pagetable(l2e, pfn, d);
573 #endif
574 return rc;
575 }
578 #if CONFIG_PAGING_LEVELS >= 3
579 static int
580 get_page_from_l3e(
581 l3_pgentry_t l3e, unsigned long pfn,
582 struct domain *d, unsigned long vaddr)
583 {
584 int rc;
586 ASSERT(!shadow_mode_refcounts(d));
588 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
589 return 1;
591 if ( unlikely((l3e_get_flags(l3e) & L3_DISALLOW_MASK)) )
592 {
593 MEM_LOG("Bad L3 flags %x", l3e_get_flags(l3e) & L3_DISALLOW_MASK);
594 return 0;
595 }
597 vaddr >>= L3_PAGETABLE_SHIFT;
598 vaddr <<= PGT_va_shift;
599 rc = get_page_and_type_from_pagenr(
600 l3e_get_pfn(l3e),
601 PGT_l2_page_table | vaddr, d);
602 return rc;
603 }
604 #endif /* 3 level */
606 #if CONFIG_PAGING_LEVELS >= 4
607 static int
608 get_page_from_l4e(
609 l4_pgentry_t l4e, unsigned long pfn,
610 struct domain *d, unsigned long vaddr)
611 {
612 int rc;
614 ASSERT( !shadow_mode_refcounts(d) );
616 if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
617 return 1;
619 if ( unlikely((l4e_get_flags(l4e) & L4_DISALLOW_MASK)) )
620 {
621 MEM_LOG("Bad L4 flags %x", l4e_get_flags(l4e) & L4_DISALLOW_MASK);
622 return 0;
623 }
625 vaddr >>= L4_PAGETABLE_SHIFT;
626 vaddr <<= PGT_va_shift;
627 rc = get_page_and_type_from_pagenr(
628 l4e_get_pfn(l4e),
629 PGT_l3_page_table | vaddr, d);
631 if ( unlikely(!rc) )
632 rc = get_linear_pagetable(l4e, pfn, d);
634 return rc;
635 }
636 #endif /* 4 level */
639 void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d)
640 {
641 unsigned long pfn = l1e_get_pfn(l1e);
642 struct page_info *page = mfn_to_page(pfn);
643 struct domain *e;
644 struct vcpu *v;
646 if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) || !mfn_valid(pfn) )
647 return;
649 e = page_get_owner(page);
651 /*
652 * Check if this is a mapping that was established via a grant reference.
653 * If it was then we should not be here: we require that such mappings are
654 * explicitly destroyed via the grant-table interface.
655 *
656 * The upshot of this is that the guest can end up with active grants that
657 * it cannot destroy (because it no longer has a PTE to present to the
658 * grant-table interface). This can lead to subtle hard-to-catch bugs,
659 * hence a special grant PTE flag can be enabled to catch the bug early.
660 *
661 * (Note that the undestroyable active grants are not a security hole in
662 * Xen. All active grants can safely be cleaned up when the domain dies.)
663 */
664 if ( (l1e_get_flags(l1e) & _PAGE_GNTTAB) &&
665 !(d->domain_flags & (DOMF_shutdown|DOMF_dying)) )
666 {
667 MEM_LOG("Attempt to implicitly unmap a granted PTE %" PRIpte,
668 l1e_get_intpte(l1e));
669 domain_crash(d);
670 }
672 if ( l1e_get_flags(l1e) & _PAGE_RW )
673 {
674 put_page_and_type(page);
675 }
676 else
677 {
678 /* We expect this is rare so we blow the entire shadow LDT. */
679 if ( unlikely(((page->u.inuse.type_info & PGT_type_mask) ==
680 PGT_ldt_page)) &&
681 unlikely(((page->u.inuse.type_info & PGT_count_mask) != 0)) &&
682 (d == e) )
683 {
684 for_each_vcpu ( d, v )
685 invalidate_shadow_ldt(v);
686 }
687 put_page(page);
688 }
689 }
692 /*
693 * NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'.
694 * Note also that this automatically deals correctly with linear p.t.'s.
695 */
696 static void put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
697 {
698 if ( (l2e_get_flags(l2e) & _PAGE_PRESENT) &&
699 (l2e_get_pfn(l2e) != pfn) )
700 put_page_and_type(mfn_to_page(l2e_get_pfn(l2e)));
701 }
704 #if CONFIG_PAGING_LEVELS >= 3
705 static void put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn)
706 {
707 if ( (l3e_get_flags(l3e) & _PAGE_PRESENT) &&
708 (l3e_get_pfn(l3e) != pfn) )
709 put_page_and_type(mfn_to_page(l3e_get_pfn(l3e)));
710 }
711 #endif
713 #if CONFIG_PAGING_LEVELS >= 4
714 static void put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn)
715 {
716 if ( (l4e_get_flags(l4e) & _PAGE_PRESENT) &&
717 (l4e_get_pfn(l4e) != pfn) )
718 put_page_and_type(mfn_to_page(l4e_get_pfn(l4e)));
719 }
720 #endif
722 static int alloc_l1_table(struct page_info *page)
723 {
724 struct domain *d = page_get_owner(page);
725 unsigned long pfn = page_to_mfn(page);
726 l1_pgentry_t *pl1e;
727 int i;
729 ASSERT(!shadow_mode_refcounts(d));
731 pl1e = map_domain_page(pfn);
733 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
734 if ( is_guest_l1_slot(i) &&
735 unlikely(!get_page_from_l1e(pl1e[i], d)) )
736 goto fail;
738 unmap_domain_page(pl1e);
739 return 1;
741 fail:
742 MEM_LOG("Failure in alloc_l1_table: entry %d", i);
743 while ( i-- > 0 )
744 if ( is_guest_l1_slot(i) )
745 put_page_from_l1e(pl1e[i], d);
747 unmap_domain_page(pl1e);
748 return 0;
749 }
751 #ifdef CONFIG_X86_PAE
752 static int create_pae_xen_mappings(l3_pgentry_t *pl3e)
753 {
754 struct page_info *page;
755 l2_pgentry_t *pl2e;
756 l3_pgentry_t l3e3;
757 int i;
759 pl3e = (l3_pgentry_t *)((unsigned long)pl3e & PAGE_MASK);
761 /* 3rd L3 slot contains L2 with Xen-private mappings. It *must* exist. */
762 l3e3 = pl3e[3];
763 if ( !(l3e_get_flags(l3e3) & _PAGE_PRESENT) )
764 {
765 MEM_LOG("PAE L3 3rd slot is empty");
766 return 0;
767 }
769 /*
770 * The Xen-private mappings include linear mappings. The L2 thus cannot
771 * be shared by multiple L3 tables. The test here is adequate because:
772 * 1. Cannot appear in slots != 3 because the page would then have an
773 * unknown va backpointer, which get_page_type() explicitly disallows.
774 * 2. Cannot appear in another page table's L3:
775 * a. alloc_l3_table() calls this function and this check will fail
776 * b. mod_l3_entry() disallows updates to slot 3 in an existing table
777 */
778 page = l3e_get_page(l3e3);
779 BUG_ON(page->u.inuse.type_info & PGT_pinned);
780 BUG_ON((page->u.inuse.type_info & PGT_count_mask) == 0);
781 if ( (page->u.inuse.type_info & PGT_count_mask) != 1 )
782 {
783 MEM_LOG("PAE L3 3rd slot is shared");
784 return 0;
785 }
787 /* Xen private mappings. */
788 pl2e = map_domain_page(l3e_get_pfn(l3e3));
789 memcpy(&pl2e[L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1)],
790 &idle_pg_table_l2[L2_PAGETABLE_FIRST_XEN_SLOT],
791 L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
792 for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
793 pl2e[l2_table_offset(PERDOMAIN_VIRT_START) + i] =
794 l2e_from_page(
795 virt_to_page(page_get_owner(page)->arch.mm_perdomain_pt) + i,
796 __PAGE_HYPERVISOR);
797 for ( i = 0; i < (LINEARPT_MBYTES >> (L2_PAGETABLE_SHIFT - 20)); i++ )
798 pl2e[l2_table_offset(LINEAR_PT_VIRT_START) + i] =
799 (l3e_get_flags(pl3e[i]) & _PAGE_PRESENT) ?
800 l2e_from_pfn(l3e_get_pfn(pl3e[i]), __PAGE_HYPERVISOR) :
801 l2e_empty();
802 unmap_domain_page(pl2e);
804 return 1;
805 }
807 struct pae_flush_pgd {
808 unsigned long l3tab_mfn;
809 unsigned int l3tab_idx;
810 l3_pgentry_t nl3e;
811 };
813 static void __pae_flush_pgd(void *data)
814 {
815 struct pae_flush_pgd *args = data;
816 struct vcpu *v = this_cpu(curr_vcpu);
817 int i = v->arch.lowmem_l3tab_inuse;
818 intpte_t _ol3e, _nl3e, _pl3e;
819 l3_pgentry_t *l3tab_ptr;
821 ASSERT(!local_irq_is_enabled());
823 if ( v->arch.lowmem_l3tab_high_mfn[i] != args->l3tab_mfn )
824 return;
826 l3tab_ptr = &v->arch.lowmem_l3tab[i][args->l3tab_idx];
828 _ol3e = l3e_get_intpte(*l3tab_ptr);
829 _nl3e = l3e_get_intpte(args->nl3e);
830 _pl3e = cmpxchg((intpte_t *)l3tab_ptr, _ol3e, _nl3e);
831 BUG_ON(_pl3e != _ol3e);
832 }
834 /* Flush a pgdir update into low-memory caches. */
835 static void pae_flush_pgd(
836 unsigned long mfn, unsigned int idx, l3_pgentry_t nl3e)
837 {
838 struct domain *d = page_get_owner(mfn_to_page(mfn));
839 struct pae_flush_pgd args = {
840 .l3tab_mfn = mfn,
841 .l3tab_idx = idx,
842 .nl3e = nl3e };
844 /* If below 4GB then the pgdir is not shadowed in low memory. */
845 if ( mfn < 0x100000 )
846 return;
848 on_selected_cpus(d->domain_dirty_cpumask, __pae_flush_pgd, &args, 1, 1);
849 }
851 static inline int l1_backptr(
852 unsigned long *backptr, unsigned long offset_in_l2, unsigned long l2_type)
853 {
854 unsigned long l2_backptr = l2_type & PGT_va_mask;
855 ASSERT(l2_backptr != PGT_va_unknown);
856 ASSERT(l2_backptr != PGT_va_mutable);
857 *backptr =
858 ((l2_backptr >> PGT_va_shift) << L3_PAGETABLE_SHIFT) |
859 (offset_in_l2 << L2_PAGETABLE_SHIFT);
860 return 1;
861 }
863 #elif CONFIG_X86_64
864 # define create_pae_xen_mappings(pl3e) (1)
865 # define pae_flush_pgd(mfn, idx, nl3e) ((void)0)
867 static inline int l1_backptr(
868 unsigned long *backptr, unsigned long offset_in_l2, unsigned long l2_type)
869 {
870 unsigned long l2_backptr = l2_type & PGT_va_mask;
871 ASSERT(l2_backptr != PGT_va_unknown);
872 ASSERT(l2_backptr != PGT_va_mutable);
873 *backptr = ((l2_backptr >> PGT_va_shift) << L3_PAGETABLE_SHIFT) |
874 (offset_in_l2 << L2_PAGETABLE_SHIFT);
875 return 1;
876 }
878 static inline int l2_backptr(
879 unsigned long *backptr, unsigned long offset_in_l3, unsigned long l3_type)
880 {
881 unsigned long l3_backptr = l3_type & PGT_va_mask;
882 ASSERT(l3_backptr != PGT_va_unknown);
883 ASSERT(l3_backptr != PGT_va_mutable);
884 *backptr = ((l3_backptr >> PGT_va_shift) << L4_PAGETABLE_SHIFT) |
885 (offset_in_l3 << L3_PAGETABLE_SHIFT);
886 return 1;
887 }
889 static inline int l3_backptr(
890 unsigned long *backptr, unsigned long offset_in_l4, unsigned long l4_type)
891 {
892 *backptr = (offset_in_l4 << L4_PAGETABLE_SHIFT);
893 return 1;
894 }
895 #else
896 # define create_pae_xen_mappings(pl3e) (1)
897 # define l1_backptr(bp,l2o,l2t) \
898 ({ *(bp) = (unsigned long)(l2o) << L2_PAGETABLE_SHIFT; 1; })
899 #endif
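/*
 * Worked example for the backptr helpers above: the va backpointer kept in
 * a page's type word (under PGT_va_mask) records the base virtual address
 * of the region that page table maps.  For instance, an L1 table installed
 * in slot 5 of an L2 whose own backpointer decodes to "L3 slot 2" gets
 *
 *   backptr = (2 << L3_PAGETABLE_SHIFT) | (5 << L2_PAGETABLE_SHIFT)
 *
 * A table reachable from two different slots cannot keep a single exact
 * backpointer, which is why get_page_type() demotes such pages to
 * PGT_va_unknown.
 */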
901 static int alloc_l2_table(struct page_info *page, unsigned long type)
902 {
903 struct domain *d = page_get_owner(page);
904 unsigned long pfn = page_to_mfn(page);
905 unsigned long vaddr;
906 l2_pgentry_t *pl2e;
907 int i;
909 /* See the code in shadow_promote() to understand why this is here. */
910 if ( (PGT_base_page_table == PGT_l2_page_table) &&
911 unlikely(shadow_mode_refcounts(d)) )
912 return 1;
913 ASSERT(!shadow_mode_refcounts(d));
915 pl2e = map_domain_page(pfn);
917 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
918 {
919 if ( !l1_backptr(&vaddr, i, type) )
920 goto fail;
921 if ( is_guest_l2_slot(type, i) &&
922 unlikely(!get_page_from_l2e(pl2e[i], pfn, d, vaddr)) )
923 goto fail;
924 }
926 #if CONFIG_PAGING_LEVELS == 2
927 /* Xen private mappings. */
928 memcpy(&pl2e[L2_PAGETABLE_FIRST_XEN_SLOT],
929 &idle_pg_table[L2_PAGETABLE_FIRST_XEN_SLOT],
930 L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
931 pl2e[l2_table_offset(LINEAR_PT_VIRT_START)] =
932 l2e_from_pfn(pfn, __PAGE_HYPERVISOR);
933 for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
934 pl2e[l2_table_offset(PERDOMAIN_VIRT_START) + i] =
935 l2e_from_page(
936 virt_to_page(page_get_owner(page)->arch.mm_perdomain_pt) + i,
937 __PAGE_HYPERVISOR);
938 #endif
940 unmap_domain_page(pl2e);
941 return 1;
943 fail:
944 MEM_LOG("Failure in alloc_l2_table: entry %d", i);
945 while ( i-- > 0 )
946 if ( is_guest_l2_slot(type, i) )
947 put_page_from_l2e(pl2e[i], pfn);
949 unmap_domain_page(pl2e);
950 return 0;
951 }
954 #if CONFIG_PAGING_LEVELS >= 3
955 static int alloc_l3_table(struct page_info *page, unsigned long type)
956 {
957 struct domain *d = page_get_owner(page);
958 unsigned long pfn = page_to_mfn(page);
959 unsigned long vaddr;
960 l3_pgentry_t *pl3e;
961 int i;
963 ASSERT(!shadow_mode_refcounts(d));
965 pl3e = map_domain_page(pfn);
966 for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
967 {
968 #if CONFIG_PAGING_LEVELS >= 4
969 if ( !l2_backptr(&vaddr, i, type) )
970 goto fail;
971 #else
972 vaddr = (unsigned long)i << L3_PAGETABLE_SHIFT;
973 #endif
974 if ( is_guest_l3_slot(i) &&
975 unlikely(!get_page_from_l3e(pl3e[i], pfn, d, vaddr)) )
976 goto fail;
977 }
979 if ( !create_pae_xen_mappings(pl3e) )
980 goto fail;
982 unmap_domain_page(pl3e);
983 return 1;
985 fail:
986 MEM_LOG("Failure in alloc_l3_table: entry %d", i);
987 while ( i-- > 0 )
988 if ( is_guest_l3_slot(i) )
989 put_page_from_l3e(pl3e[i], pfn);
991 unmap_domain_page(pl3e);
992 return 0;
993 }
994 #else
995 #define alloc_l3_table(page, type) (0)
996 #endif
998 #if CONFIG_PAGING_LEVELS >= 4
999 static int alloc_l4_table(struct page_info *page, unsigned long type)
1000 {
1001 struct domain *d = page_get_owner(page);
1002 unsigned long pfn = page_to_mfn(page);
1003 l4_pgentry_t *pl4e = page_to_virt(page);
1004 unsigned long vaddr;
1005 int i;
1007 /* See the code in shadow_promote() to understand why this is here. */
1008 if ( (PGT_base_page_table == PGT_l4_page_table) &&
1009 shadow_mode_refcounts(d) )
1010 return 1;
1011 ASSERT(!shadow_mode_refcounts(d));
1013 for ( i = 0; i < L4_PAGETABLE_ENTRIES; i++ )
1014 {
1015 if ( !l3_backptr(&vaddr, i, type) )
1016 goto fail;
1018 if ( is_guest_l4_slot(i) &&
1019 unlikely(!get_page_from_l4e(pl4e[i], pfn, d, vaddr)) )
1020 goto fail;
1021 }
1023 /* Xen private mappings. */
1024 memcpy(&pl4e[ROOT_PAGETABLE_FIRST_XEN_SLOT],
1025 &idle_pg_table[ROOT_PAGETABLE_FIRST_XEN_SLOT],
1026 ROOT_PAGETABLE_XEN_SLOTS * sizeof(l4_pgentry_t));
1027 pl4e[l4_table_offset(LINEAR_PT_VIRT_START)] =
1028 l4e_from_pfn(pfn, __PAGE_HYPERVISOR);
1029 pl4e[l4_table_offset(PERDOMAIN_VIRT_START)] =
1030 l4e_from_page(
1031 virt_to_page(page_get_owner(page)->arch.mm_perdomain_l3),
1032 __PAGE_HYPERVISOR);
1034 return 1;
1036 fail:
1037 MEM_LOG("Failure in alloc_l4_table: entry %d", i);
1038 while ( i-- > 0 )
1039 if ( is_guest_l4_slot(i) )
1040 put_page_from_l4e(pl4e[i], pfn);
1042 return 0;
1043 }
1044 #else
1045 #define alloc_l4_table(page, type) (0)
1046 #endif
1049 static void free_l1_table(struct page_info *page)
1050 {
1051 struct domain *d = page_get_owner(page);
1052 unsigned long pfn = page_to_mfn(page);
1053 l1_pgentry_t *pl1e;
1054 int i;
1056 pl1e = map_domain_page(pfn);
1058 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
1059 if ( is_guest_l1_slot(i) )
1060 put_page_from_l1e(pl1e[i], d);
1062 unmap_domain_page(pl1e);
1063 }
1066 static void free_l2_table(struct page_info *page)
1067 {
1068 unsigned long pfn = page_to_mfn(page);
1069 l2_pgentry_t *pl2e;
1070 int i;
1072 pl2e = map_domain_page(pfn);
1074 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
1075 if ( is_guest_l2_slot(page->u.inuse.type_info, i) )
1076 put_page_from_l2e(pl2e[i], pfn);
1078 unmap_domain_page(pl2e);
1079 }
1082 #if CONFIG_PAGING_LEVELS >= 3
1084 static void free_l3_table(struct page_info *page)
1085 {
1086 unsigned long pfn = page_to_mfn(page);
1087 l3_pgentry_t *pl3e;
1088 int i;
1090 pl3e = map_domain_page(pfn);
1092 for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
1093 if ( is_guest_l3_slot(i) )
1094 put_page_from_l3e(pl3e[i], pfn);
1096 unmap_domain_page(pl3e);
1097 }
1099 #endif
1101 #if CONFIG_PAGING_LEVELS >= 4
1103 static void free_l4_table(struct page_info *page)
1104 {
1105 unsigned long pfn = page_to_mfn(page);
1106 l4_pgentry_t *pl4e = page_to_virt(page);
1107 int i;
1109 for ( i = 0; i < L4_PAGETABLE_ENTRIES; i++ )
1110 if ( is_guest_l4_slot(i) )
1111 put_page_from_l4e(pl4e[i], pfn);
1112 }
1114 #endif
1116 static inline int update_l1e(l1_pgentry_t *pl1e,
1117 l1_pgentry_t ol1e,
1118 l1_pgentry_t nl1e)
1119 {
1120 intpte_t o = l1e_get_intpte(ol1e);
1121 intpte_t n = l1e_get_intpte(nl1e);
1123 if ( unlikely(cmpxchg_user(pl1e, o, n) != 0) ||
1124 unlikely(o != l1e_get_intpte(ol1e)) )
1125 {
1126 MEM_LOG("Failed to update %" PRIpte " -> %" PRIpte
1127 ": saw %" PRIpte,
1128 l1e_get_intpte(ol1e),
1129 l1e_get_intpte(nl1e),
1130 o);
1131 return 0;
1132 }
1133 return 1;
1134 }
1137 /* Update the L1 entry at pl1e to new value nl1e. */
1138 static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e)
1140 l1_pgentry_t ol1e;
1141 struct domain *d = current->domain;
1143 if ( unlikely(__copy_from_user(&ol1e, pl1e, sizeof(ol1e)) != 0) )
1144 return 0;
1146 if ( unlikely(shadow_mode_refcounts(d)) )
1147 return update_l1e(pl1e, ol1e, nl1e);
1149 if ( l1e_get_flags(nl1e) & _PAGE_PRESENT )
1151 if ( unlikely(l1e_get_flags(nl1e) & L1_DISALLOW_MASK) )
1153 MEM_LOG("Bad L1 flags %x",
1154 l1e_get_flags(nl1e) & L1_DISALLOW_MASK);
1155 return 0;
1158 /* Fast path for identical mapping, r/w and presence. */
1159 if ( !l1e_has_changed(ol1e, nl1e, _PAGE_RW | _PAGE_PRESENT))
1160 return update_l1e(pl1e, ol1e, nl1e);
1162 if ( unlikely(!get_page_from_l1e(nl1e, FOREIGNDOM)) )
1163 return 0;
1165 if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
1167 put_page_from_l1e(nl1e, d);
1168 return 0;
1171 else
1173 if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
1174 return 0;
1177 put_page_from_l1e(ol1e, d);
1178 return 1;
1181 #define UPDATE_ENTRY(_t,_p,_o,_n) ({ \
1182 intpte_t __o = cmpxchg((intpte_t *)(_p), \
1183 _t ## e_get_intpte(_o), \
1184 _t ## e_get_intpte(_n)); \
1185 if ( __o != _t ## e_get_intpte(_o) ) \
1186 MEM_LOG("Failed to update %" PRIpte " -> %" PRIpte \
1187 ": saw %" PRIpte "", \
1188 (_t ## e_get_intpte(_o)), \
1189 (_t ## e_get_intpte(_n)), \
1190 (__o)); \
1191 (__o == _t ## e_get_intpte(_o)); })
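/*
 * Usage sketch: UPDATE_ENTRY(l2, pl2e, ol2e, nl2e) expands, roughly, to
 *
 *   intpte_t __o = cmpxchg((intpte_t *)pl2e,
 *                          l2e_get_intpte(ol2e), l2e_get_intpte(nl2e));
 *   success = (__o == l2e_get_intpte(ol2e));
 *
 * so if another CPU changed *pl2e after ol2e was sampled, the update fails
 * and the caller (e.g. mod_l2_entry() below) drops the reference it took
 * on the new entry.
 */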
1193 /* Update the L2 entry at pl2e to new value nl2e. pl2e is within frame pfn. */
1194 static int mod_l2_entry(l2_pgentry_t *pl2e,
1195 l2_pgentry_t nl2e,
1196 unsigned long pfn,
1197 unsigned long type)
1199 l2_pgentry_t ol2e;
1200 unsigned long vaddr = 0;
1202 if ( unlikely(!is_guest_l2_slot(type,pgentry_ptr_to_slot(pl2e))) )
1204 MEM_LOG("Illegal L2 update attempt in Xen-private area %p", pl2e);
1205 return 0;
1208 if ( unlikely(__copy_from_user(&ol2e, pl2e, sizeof(ol2e)) != 0) )
1209 return 0;
1211 if ( l2e_get_flags(nl2e) & _PAGE_PRESENT )
1213 if ( unlikely(l2e_get_flags(nl2e) & L2_DISALLOW_MASK) )
1215 MEM_LOG("Bad L2 flags %x",
1216 l2e_get_flags(nl2e) & L2_DISALLOW_MASK);
1217 return 0;
1220 /* Fast path for identical mapping and presence. */
1221 if ( !l2e_has_changed(ol2e, nl2e, _PAGE_PRESENT))
1222 return UPDATE_ENTRY(l2, pl2e, ol2e, nl2e);
1224 if ( unlikely(!l1_backptr(&vaddr, pgentry_ptr_to_slot(pl2e), type)) ||
1225 unlikely(!get_page_from_l2e(nl2e, pfn, current->domain, vaddr)) )
1226 return 0;
1228 if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e)) )
1230 put_page_from_l2e(nl2e, pfn);
1231 return 0;
1234 else if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e)) )
1236 return 0;
1239 put_page_from_l2e(ol2e, pfn);
1240 return 1;
1244 #if CONFIG_PAGING_LEVELS >= 3
1246 /* Update the L3 entry at pl3e to new value nl3e. pl3e is within frame pfn. */
1247 static int mod_l3_entry(l3_pgentry_t *pl3e,
1248 l3_pgentry_t nl3e,
1249 unsigned long pfn,
1250 unsigned long type)
1252 l3_pgentry_t ol3e;
1253 unsigned long vaddr;
1254 int okay;
1256 if ( unlikely(!is_guest_l3_slot(pgentry_ptr_to_slot(pl3e))) )
1258 MEM_LOG("Illegal L3 update attempt in Xen-private area %p", pl3e);
1259 return 0;
1262 #ifdef CONFIG_X86_PAE
1263 /*
1264 * Disallow updates to final L3 slot. It contains Xen mappings, and it
1265 * would be a pain to ensure they remain continuously valid throughout.
1266 */
1267 if ( pgentry_ptr_to_slot(pl3e) >= 3 )
1268 return 0;
1269 #endif
1271 if ( unlikely(__copy_from_user(&ol3e, pl3e, sizeof(ol3e)) != 0) )
1272 return 0;
1274 if ( l3e_get_flags(nl3e) & _PAGE_PRESENT )
1276 if ( unlikely(l3e_get_flags(nl3e) & L3_DISALLOW_MASK) )
1278 MEM_LOG("Bad L3 flags %x",
1279 l3e_get_flags(nl3e) & L3_DISALLOW_MASK);
1280 return 0;
1283 /* Fast path for identical mapping and presence. */
1284 if (!l3e_has_changed(ol3e, nl3e, _PAGE_PRESENT))
1285 return UPDATE_ENTRY(l3, pl3e, ol3e, nl3e);
1287 #if CONFIG_PAGING_LEVELS >= 4
1288 if ( unlikely(!l2_backptr(&vaddr, pgentry_ptr_to_slot(pl3e), type)) ||
1289 unlikely(!get_page_from_l3e(nl3e, pfn, current->domain, vaddr)) )
1290 return 0;
1291 #else
1292 vaddr = (((unsigned long)pl3e & ~PAGE_MASK) / sizeof(l3_pgentry_t))
1293 << L3_PAGETABLE_SHIFT;
1294 if ( unlikely(!get_page_from_l3e(nl3e, pfn, current->domain, vaddr)) )
1295 return 0;
1296 #endif
1298 if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e)) )
1300 put_page_from_l3e(nl3e, pfn);
1301 return 0;
1304 else if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e)) )
1306 return 0;
1309 okay = create_pae_xen_mappings(pl3e);
1310 BUG_ON(!okay);
1312 pae_flush_pgd(pfn, pgentry_ptr_to_slot(pl3e), nl3e);
1314 put_page_from_l3e(ol3e, pfn);
1315 return 1;
1318 #endif
1320 #if CONFIG_PAGING_LEVELS >= 4
1322 /* Update the L4 entry at pl4e to new value nl4e. pl4e is within frame pfn. */
1323 static int mod_l4_entry(l4_pgentry_t *pl4e,
1324 l4_pgentry_t nl4e,
1325 unsigned long pfn,
1326 unsigned long type)
1328 l4_pgentry_t ol4e;
1329 unsigned long vaddr;
1331 if ( unlikely(!is_guest_l4_slot(pgentry_ptr_to_slot(pl4e))) )
1333 MEM_LOG("Illegal L4 update attempt in Xen-private area %p", pl4e);
1334 return 0;
1337 if ( unlikely(__copy_from_user(&ol4e, pl4e, sizeof(ol4e)) != 0) )
1338 return 0;
1340 if ( l4e_get_flags(nl4e) & _PAGE_PRESENT )
1342 if ( unlikely(l4e_get_flags(nl4e) & L4_DISALLOW_MASK) )
1344 MEM_LOG("Bad L4 flags %x",
1345 l4e_get_flags(nl4e) & L4_DISALLOW_MASK);
1346 return 0;
1349 /* Fast path for identical mapping and presence. */
1350 if (!l4e_has_changed(ol4e, nl4e, _PAGE_PRESENT))
1351 return UPDATE_ENTRY(l4, pl4e, ol4e, nl4e);
1353 if ( unlikely(!l3_backptr(&vaddr, pgentry_ptr_to_slot(pl4e), type)) ||
1354 unlikely(!get_page_from_l4e(nl4e, pfn, current->domain, vaddr)) )
1355 return 0;
1357 if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e)) )
1359 put_page_from_l4e(nl4e, pfn);
1360 return 0;
1363 else if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e)) )
1365 return 0;
1368 put_page_from_l4e(ol4e, pfn);
1369 return 1;
1372 #endif
1374 int alloc_page_type(struct page_info *page, unsigned long type)
1376 struct domain *owner = page_get_owner(page);
1378 if ( owner != NULL )
1379 mark_dirty(owner, page_to_mfn(page));
1381 switch ( type & PGT_type_mask )
1383 case PGT_l1_page_table:
1384 return alloc_l1_table(page);
1385 case PGT_l2_page_table:
1386 return alloc_l2_table(page, type);
1387 case PGT_l3_page_table:
1388 return alloc_l3_table(page, type);
1389 case PGT_l4_page_table:
1390 return alloc_l4_table(page, type);
1391 case PGT_gdt_page:
1392 case PGT_ldt_page:
1393 return alloc_segdesc_page(page);
1394 default:
1395 printk("Bad type in alloc_page_type %lx t=%" PRtype_info " c=%x\n",
1396 type, page->u.inuse.type_info,
1397 page->count_info);
1398 BUG();
1401 return 0;
1405 void free_page_type(struct page_info *page, unsigned long type)
1407 struct domain *owner = page_get_owner(page);
1408 unsigned long gmfn;
1410 if ( likely(owner != NULL) )
1412 /*
1413 * We have to flush before the next use of the linear mapping
1414 * (e.g., update_va_mapping()) or we could end up modifying a page
1415 * that is no longer a page table (and hence screw up ref counts).
1416 */
1417 percpu_info[smp_processor_id()].deferred_ops |= DOP_FLUSH_ALL_TLBS;
1419 if ( unlikely(shadow_mode_enabled(owner)) )
1421 /* Raw page tables are rewritten during save/restore. */
1422 if ( !shadow_mode_translate(owner) )
1423 mark_dirty(owner, page_to_mfn(page));
1425 if ( shadow_mode_refcounts(owner) )
1426 return;
1428 gmfn = mfn_to_gmfn(owner, page_to_mfn(page));
1429 ASSERT(VALID_M2P(gmfn));
1430 remove_shadow(owner, gmfn, type & PGT_type_mask);
1434 switch ( type & PGT_type_mask )
1436 case PGT_l1_page_table:
1437 free_l1_table(page);
1438 break;
1440 case PGT_l2_page_table:
1441 free_l2_table(page);
1442 break;
1444 #if CONFIG_PAGING_LEVELS >= 3
1445 case PGT_l3_page_table:
1446 free_l3_table(page);
1447 break;
1448 #endif
1450 #if CONFIG_PAGING_LEVELS >= 4
1451 case PGT_l4_page_table:
1452 free_l4_table(page);
1453 break;
1454 #endif
1456 default:
1457 printk("%s: type %lx pfn %lx\n",__FUNCTION__,
1458 type, page_to_mfn(page));
1459 BUG();
1464 void put_page_type(struct page_info *page)
1466 unsigned long nx, x, y = page->u.inuse.type_info;
1468 again:
1469 do {
1470 x = y;
1471 nx = x - 1;
1473 ASSERT((x & PGT_count_mask) != 0);
1475 /*
1476 * The page should always be validated while a reference is held. The
1477 * exception is during domain destruction, when we forcibly invalidate
1478 * page-table pages if we detect a referential loop.
1479 * See domain.c:relinquish_list().
1480 */
1481 ASSERT((x & PGT_validated) ||
1482 test_bit(_DOMF_dying, &page_get_owner(page)->domain_flags));
1484 if ( unlikely((nx & PGT_count_mask) == 0) )
1486 /* Record TLB information for flush later. Races are harmless. */
1487 page->tlbflush_timestamp = tlbflush_current_time();
1489 if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
1490 likely(nx & PGT_validated) )
1492 /*
1493 * Page-table pages must be unvalidated when count is zero. The
1494 * 'free' is safe because the refcnt is non-zero and validated
1495 * bit is clear => other ops will spin or fail.
1496 */
1497 if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x,
1498 x & ~PGT_validated)) != x) )
1499 goto again;
1500 /* We cleared the 'valid bit' so we do the clean up. */
1501 free_page_type(page, x);
1502 /* Carry on, but with the 'valid bit' now clear. */
1503 x &= ~PGT_validated;
1504 nx &= ~PGT_validated;
1507 else if ( unlikely((nx & (PGT_pinned|PGT_type_mask|PGT_count_mask)) ==
1508 (PGT_pinned|PGT_l1_page_table|1)) )
1510 /* Page is now only pinned. Make the back pointer mutable again. */
1511 nx |= PGT_va_mutable;
1514 while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
1518 int get_page_type(struct page_info *page, unsigned long type)
1520 unsigned long nx, x, y = page->u.inuse.type_info;
1522 again:
1523 do {
1524 x = y;
1525 nx = x + 1;
1526 if ( unlikely((nx & PGT_count_mask) == 0) )
1528 MEM_LOG("Type count overflow on pfn %lx", page_to_mfn(page));
1529 return 0;
1531 else if ( unlikely((x & PGT_count_mask) == 0) )
1533 if ( (x & (PGT_type_mask|PGT_va_mask)) != type )
1535 if ( (x & PGT_type_mask) != (type & PGT_type_mask) )
1537 /*
1538 * On type change we check to flush stale TLB
1539 * entries. This may be unnecessary (e.g., page
1540 * was GDT/LDT) but those circumstances should be
1541 * very rare.
1542 */
1543 cpumask_t mask =
1544 page_get_owner(page)->domain_dirty_cpumask;
1545 tlbflush_filter(mask, page->tlbflush_timestamp);
1547 if ( unlikely(!cpus_empty(mask)) )
1549 perfc_incrc(need_flush_tlb_flush);
1550 flush_tlb_mask(mask);
1554 /* We lose existing type, back pointer, and validity. */
1555 nx &= ~(PGT_type_mask | PGT_va_mask | PGT_validated);
1556 nx |= type;
1558 /* No special validation needed for writable pages. */
1559 /* Page tables and GDT/LDT need to be scanned for validity. */
1560 if ( type == PGT_writable_page )
1561 nx |= PGT_validated;
1564 else
1566 if ( unlikely((x & (PGT_type_mask|PGT_va_mask)) != type) )
1568 if ( unlikely((x & PGT_type_mask) != (type & PGT_type_mask) ) )
1570 if ( current->domain == page_get_owner(page) )
1572 /*
1573 * This ensures functions like set_gdt() see up-to-date
1574 * type info without needing to clean up writable p.t.
1575 * state on the fast path.
1576 */
1577 LOCK_BIGLOCK(current->domain);
1578 cleanup_writable_pagetable(current->domain);
1579 y = page->u.inuse.type_info;
1580 UNLOCK_BIGLOCK(current->domain);
1581 /* Can we make progress now? */
1582 if ( ((y & PGT_type_mask) == (type & PGT_type_mask)) ||
1583 ((y & PGT_count_mask) == 0) )
1584 goto again;
1586 if ( ((x & PGT_type_mask) != PGT_l2_page_table) ||
1587 ((type & PGT_type_mask) != PGT_l1_page_table) )
1588 MEM_LOG("Bad type (saw %" PRtype_info
1589 " != exp %" PRtype_info ") "
1590 "for mfn %lx (pfn %lx)",
1591 x, type, page_to_mfn(page),
1592 get_gpfn_from_mfn(page_to_mfn(page)));
1593 return 0;
1595 else if ( (x & PGT_va_mask) == PGT_va_mutable )
1597 /* The va backpointer is mutable, hence we update it. */
1598 nx &= ~PGT_va_mask;
1599 nx |= type; /* we know the actual type is correct */
1601 else if ( (type & PGT_va_mask) != PGT_va_mutable )
1603 ASSERT((type & PGT_va_mask) != (x & PGT_va_mask));
1604 #ifdef CONFIG_X86_PAE
1605 /* We use backptr as extra typing. Cannot be unknown. */
1606 if ( (type & PGT_type_mask) == PGT_l2_page_table )
1607 return 0;
1608 #endif
1609 /* Fixme: add code to propagate va_unknown to subtables. */
1610 if ( ((type & PGT_type_mask) >= PGT_l2_page_table) &&
1611 !shadow_mode_refcounts(page_get_owner(page)) )
1612 return 0;
1613 /* This table is possibly mapped at multiple locations. */
1614 nx &= ~PGT_va_mask;
1615 nx |= PGT_va_unknown;
1618 if ( unlikely(!(x & PGT_validated)) )
1620 /* Someone else is updating validation of this page. Wait... */
1621 while ( (y = page->u.inuse.type_info) == x )
1622 cpu_relax();
1623 goto again;
1627 while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
1629 if ( unlikely(!(nx & PGT_validated)) )
1631 /* Try to validate page type; drop the new reference on failure. */
1632 if ( unlikely(!alloc_page_type(page, type)) )
1634 MEM_LOG("Error while validating mfn %lx (pfn %lx) for type %"
1635 PRtype_info ": caf=%08x taf=%" PRtype_info,
1636 page_to_mfn(page), get_gpfn_from_mfn(page_to_mfn(page)),
1637 type, page->count_info, page->u.inuse.type_info);
1638 /* No one else can get a reference. We hold the only ref. */
1639 page->u.inuse.type_info = 0;
1640 return 0;
1643 /* No one else is updating simultaneously. */
1644 __set_bit(_PGT_validated, &page->u.inuse.type_info);
1647 return 1;
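/*
 * Usage sketch of the type-count discipline implemented above, in the
 * style of other callers in this file: to write a page as ordinary data,
 * take a writable type reference (validated by get_page_type()), do the
 * access, then drop it.  The helper below is illustrative only; the word
 * offset is assumed to be below PAGE_SIZE/sizeof(unsigned long).
 */
static int write_word_sketch(struct page_info *page, struct domain *d,
                             unsigned int offset, unsigned long val)
{
    unsigned long *p;

    if ( !get_page_and_type(page, d, PGT_writable_page) )
        return 0;            /* wrong owner, or page has a conflicting type */

    p = map_domain_page(page_to_mfn(page));
    p[offset] = val;
    unmap_domain_page(p);

    put_page_and_type(page); /* drop the writable type ref and the page ref */
    return 1;
}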
1651 int new_guest_cr3(unsigned long mfn)
1653 struct vcpu *v = current;
1654 struct domain *d = v->domain;
1655 int okay;
1656 unsigned long old_base_mfn;
1658 ASSERT(writable_pagetable_in_sync(d));
1660 if ( shadow_mode_refcounts(d) )
1662 okay = get_page_from_pagenr(mfn, d);
1663 if ( unlikely(!okay) )
1665 MEM_LOG("Error while installing new baseptr %lx", mfn);
1666 return 0;
1669 else
1671 okay = get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d);
1672 if ( unlikely(!okay) )
1674 /* Switch to idle pagetable: this VCPU has no active p.t. now. */
1675 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
1676 v->arch.guest_table = mk_pagetable(0);
1677 update_pagetables(v);
1678 write_cr3(__pa(idle_pg_table));
1679 if ( old_base_mfn != 0 )
1680 put_page_and_type(mfn_to_page(old_base_mfn));
1682 /* Retry the validation with no active p.t. for this VCPU. */
1683 okay = get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d);
1684 if ( !okay )
1686 /* Failure here is unrecoverable: the VCPU has no pagetable! */
1687 MEM_LOG("Fatal error while installing new baseptr %lx", mfn);
1688 domain_crash(d);
1689 percpu_info[v->processor].deferred_ops = 0;
1690 return 0;
1695 invalidate_shadow_ldt(v);
1697 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
1698 v->arch.guest_table = mk_pagetable(mfn << PAGE_SHIFT);
1699 update_pagetables(v); /* update shadow_table and monitor_table */
1701 write_ptbase(v);
1703 if ( likely(old_base_mfn != 0) )
1705 if ( shadow_mode_refcounts(d) )
1706 put_page(mfn_to_page(old_base_mfn));
1707 else
1708 put_page_and_type(mfn_to_page(old_base_mfn));
1711 /* CR3 also holds a ref to its shadow... */
1712 if ( shadow_mode_enabled(d) )
1714 if ( v->arch.monitor_shadow_ref )
1715 put_shadow_ref(v->arch.monitor_shadow_ref);
1716 v->arch.monitor_shadow_ref =
1717 pagetable_get_pfn(v->arch.monitor_table);
1718 ASSERT(!page_get_owner(mfn_to_page(v->arch.monitor_shadow_ref)));
1719 get_shadow_ref(v->arch.monitor_shadow_ref);
1722 return 1;
1725 static void process_deferred_ops(unsigned int cpu)
1726 {
1727 unsigned int deferred_ops;
1728 struct domain *d = current->domain;
1730 deferred_ops = percpu_info[cpu].deferred_ops;
1731 percpu_info[cpu].deferred_ops = 0;
1733 if ( deferred_ops & (DOP_FLUSH_ALL_TLBS|DOP_FLUSH_TLB) )
1734 {
1735 if ( shadow_mode_enabled(d) )
1736 shadow_sync_all(d);
1737 if ( deferred_ops & DOP_FLUSH_ALL_TLBS )
1738 flush_tlb_mask(d->domain_dirty_cpumask);
1739 else
1740 local_flush_tlb();
1741 }
1743 if ( deferred_ops & DOP_RELOAD_LDT )
1744 (void)map_ldt_shadow_page(0);
1746 if ( unlikely(percpu_info[cpu].foreign != NULL) )
1747 {
1748 put_domain(percpu_info[cpu].foreign);
1749 percpu_info[cpu].foreign = NULL;
1750 }
1751 }
1753 static int set_foreigndom(unsigned int cpu, domid_t domid)
1755 struct domain *e, *d = current->domain;
1756 int okay = 1;
1758 ASSERT(percpu_info[cpu].foreign == NULL);
1760 if ( likely(domid == DOMID_SELF) )
1761 goto out;
1763 if ( domid == d->domain_id )
1765 MEM_LOG("Dom %u tried to specify itself as foreign domain",
1766 d->domain_id);
1767 okay = 0;
1769 else if ( !IS_PRIV(d) )
1771 switch ( domid )
1773 case DOMID_IO:
1774 get_knownalive_domain(dom_io);
1775 percpu_info[cpu].foreign = dom_io;
1776 break;
1777 default:
1778 MEM_LOG("Dom %u cannot set foreign dom", d->domain_id);
1779 okay = 0;
1780 break;
1783 else
1785 percpu_info[cpu].foreign = e = find_domain_by_id(domid);
1786 if ( e == NULL )
1788 switch ( domid )
1790 case DOMID_XEN:
1791 get_knownalive_domain(dom_xen);
1792 percpu_info[cpu].foreign = dom_xen;
1793 break;
1794 case DOMID_IO:
1795 get_knownalive_domain(dom_io);
1796 percpu_info[cpu].foreign = dom_io;
1797 break;
1798 default:
1799 MEM_LOG("Unknown domain '%u'", domid);
1800 okay = 0;
1801 break;
1806 out:
1807 return okay;
1810 static inline cpumask_t vcpumask_to_pcpumask(
1811 struct domain *d, unsigned long vmask)
1812 {
1813 unsigned int vcpu_id;
1814 cpumask_t pmask = CPU_MASK_NONE;
1815 struct vcpu *v;
1817 while ( vmask != 0 )
1818 {
1819 vcpu_id = find_first_set_bit(vmask);
1820 vmask &= ~(1UL << vcpu_id);
1821 if ( (vcpu_id < MAX_VIRT_CPUS) &&
1822 ((v = d->vcpu[vcpu_id]) != NULL) )
1823 cpus_or(pmask, pmask, v->vcpu_dirty_cpumask);
1824 }
1826 return pmask;
1827 }
1829 int do_mmuext_op(
1830 XEN_GUEST_HANDLE(mmuext_op_t) uops,
1831 unsigned int count,
1832 XEN_GUEST_HANDLE(uint) pdone,
1833 unsigned int foreigndom)
1835 struct mmuext_op op;
1836 int rc = 0, i = 0, okay, cpu = smp_processor_id();
1837 unsigned long mfn, type;
1838 unsigned int done = 0;
1839 struct page_info *page;
1840 struct vcpu *v = current;
1841 struct domain *d = v->domain;
1843 LOCK_BIGLOCK(d);
1845 cleanup_writable_pagetable(d);
1847 if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
1849 count &= ~MMU_UPDATE_PREEMPTED;
1850 if ( unlikely(!guest_handle_is_null(pdone)) )
1851 (void)copy_from_guest(&done, pdone, 1);
1854 if ( !set_foreigndom(cpu, foreigndom) )
1856 rc = -ESRCH;
1857 goto out;
1860 if ( unlikely(!guest_handle_okay(uops, count)) )
1862 rc = -EFAULT;
1863 goto out;
1866 for ( i = 0; i < count; i++ )
1868 if ( hypercall_preempt_check() )
1870 rc = hypercall_create_continuation(
1871 __HYPERVISOR_mmuext_op, "hihi",
1872 uops, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
1873 break;
1876 if ( unlikely(__copy_from_guest(&op, uops, 1) != 0) )
1878 MEM_LOG("Bad __copy_from_guest");
1879 rc = -EFAULT;
1880 break;
1883 okay = 1;
1884 mfn = op.arg1.mfn;
1885 page = mfn_to_page(mfn);
1887 switch ( op.cmd )
1889 case MMUEXT_PIN_L1_TABLE:
1890 type = PGT_l1_page_table | PGT_va_mutable;
1891 goto pin_page;
1893 case MMUEXT_PIN_L2_TABLE:
1894 case MMUEXT_PIN_L3_TABLE:
1895 case MMUEXT_PIN_L4_TABLE:
1896 /* Ignore pinning of subdirectories. */
1897 if ( (op.cmd - MMUEXT_PIN_L1_TABLE) != (CONFIG_PAGING_LEVELS - 1) )
1898 break;
1900 type = PGT_root_page_table;
1902 pin_page:
1903 if ( shadow_mode_refcounts(FOREIGNDOM) )
1904 break;
1906 okay = get_page_and_type_from_pagenr(mfn, type, FOREIGNDOM);
1907 if ( unlikely(!okay) )
1909 MEM_LOG("Error while pinning mfn %lx", mfn);
1910 break;
1913 if ( unlikely(test_and_set_bit(_PGT_pinned,
1914 &page->u.inuse.type_info)) )
1916 MEM_LOG("Mfn %lx already pinned", mfn);
1917 put_page_and_type(page);
1918 okay = 0;
1919 break;
1922 break;
1924 case MMUEXT_UNPIN_TABLE:
1925 if ( shadow_mode_refcounts(d) )
1926 break;
1928 if ( unlikely(!(okay = get_page_from_pagenr(mfn, d))) )
1930 MEM_LOG("Mfn %lx bad domain (dom=%p)",
1931 mfn, page_get_owner(page));
1933 else if ( likely(test_and_clear_bit(_PGT_pinned,
1934 &page->u.inuse.type_info)) )
1936 put_page_and_type(page);
1937 put_page(page);
1939 else
1941 okay = 0;
1942 put_page(page);
1943 MEM_LOG("Mfn %lx not pinned", mfn);
1945 break;
1947 case MMUEXT_NEW_BASEPTR:
1948 mfn = gmfn_to_mfn(current->domain, mfn);
1949 okay = new_guest_cr3(mfn);
1950 percpu_info[cpu].deferred_ops &= ~DOP_FLUSH_TLB;
1951 break;
1953 #ifdef __x86_64__
1954 case MMUEXT_NEW_USER_BASEPTR:
1955 okay = get_page_and_type_from_pagenr(
1956 mfn, PGT_root_page_table, d);
1957 if ( unlikely(!okay) )
1959 MEM_LOG("Error while installing new mfn %lx", mfn);
1961 else
1963 unsigned long old_mfn =
1964 pagetable_get_pfn(v->arch.guest_table_user);
1965 v->arch.guest_table_user = mk_pagetable(mfn << PAGE_SHIFT);
1966 if ( old_mfn != 0 )
1967 put_page_and_type(mfn_to_page(old_mfn));
1969 break;
1970 #endif
1972 case MMUEXT_TLB_FLUSH_LOCAL:
1973 percpu_info[cpu].deferred_ops |= DOP_FLUSH_TLB;
1974 break;
1976 case MMUEXT_INVLPG_LOCAL:
1977 if ( shadow_mode_enabled(d) )
1978 shadow_invlpg(v, op.arg1.linear_addr);
1979 local_flush_tlb_one(op.arg1.linear_addr);
1980 break;
1982 case MMUEXT_TLB_FLUSH_MULTI:
1983 case MMUEXT_INVLPG_MULTI:
1985 unsigned long vmask;
1986 cpumask_t pmask;
1987 if ( unlikely(get_user(vmask, (unsigned long *)op.arg2.vcpumask)) )
1989 okay = 0;
1990 break;
1992 pmask = vcpumask_to_pcpumask(d, vmask);
1993 if ( op.cmd == MMUEXT_TLB_FLUSH_MULTI )
1994 flush_tlb_mask(pmask);
1995 else
1996 flush_tlb_one_mask(pmask, op.arg1.linear_addr);
1997 break;
2000 case MMUEXT_TLB_FLUSH_ALL:
2001 flush_tlb_mask(d->domain_dirty_cpumask);
2002 break;
2004 case MMUEXT_INVLPG_ALL:
2005 flush_tlb_one_mask(d->domain_dirty_cpumask, op.arg1.linear_addr);
2006 break;
2008 case MMUEXT_FLUSH_CACHE:
2009 if ( unlikely(!cache_flush_permitted(d)) )
2011 MEM_LOG("Non-physdev domain tried to FLUSH_CACHE.");
2012 okay = 0;
2014 else
2016 wbinvd();
2018 break;
2020 case MMUEXT_SET_LDT:
2022 unsigned long ptr = op.arg1.linear_addr;
2023 unsigned long ents = op.arg2.nr_ents;
2025 if ( shadow_mode_external(d) )
2027 MEM_LOG("ignoring SET_LDT hypercall from external "
2028 "domain %u", d->domain_id);
2029 okay = 0;
2031 else if ( ((ptr & (PAGE_SIZE-1)) != 0) ||
2032 (ents > 8192) ||
2033 !array_access_ok(ptr, ents, LDT_ENTRY_SIZE) )
2035 okay = 0;
2036 MEM_LOG("Bad args to SET_LDT: ptr=%lx, ents=%lx", ptr, ents);
2038 else if ( (v->arch.guest_context.ldt_ents != ents) ||
2039 (v->arch.guest_context.ldt_base != ptr) )
2041 invalidate_shadow_ldt(v);
2042 v->arch.guest_context.ldt_base = ptr;
2043 v->arch.guest_context.ldt_ents = ents;
2044 load_LDT(v);
2045 percpu_info[cpu].deferred_ops &= ~DOP_RELOAD_LDT;
2046 if ( ents != 0 )
2047 percpu_info[cpu].deferred_ops |= DOP_RELOAD_LDT;
2049 break;
2052 default:
2053 MEM_LOG("Invalid extended pt command 0x%x", op.cmd);
2054 okay = 0;
2055 break;
2058 if ( unlikely(!okay) )
2060 rc = -EINVAL;
2061 break;
2064 guest_handle_add_offset(uops, 1);
2067 out:
2068 process_deferred_ops(cpu);
2070 /* Add incremental work we have done to the @done output parameter. */
2071 done += i;
2072 if ( unlikely(!guest_handle_is_null(pdone)) )
2073 copy_to_guest(pdone, &done, 1);
2075 UNLOCK_BIGLOCK(d);
2076 return rc;
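/*
 * Guest-side sketch of the operations handled above: pin a freshly built
 * top-level page table and switch to it.  struct mmuext_op and the command
 * numbers come from the public interface; HYPERVISOR_mmuext_op() is the
 * guest hypercall wrapper and new_pt_mfn the frame of the new table, both
 * assumptions of this sketch.
 */
static void guest_switch_pagetable_sketch(unsigned long new_pt_mfn)
{
    struct mmuext_op op[2];

    /* Pin at the root level in use: L2 here, L3 on PAE, L4 on x86-64. */
    op[0].cmd = MMUEXT_PIN_L2_TABLE;
    op[0].arg1.mfn = new_pt_mfn;

    /* Install it as the new base pointer (CR3). */
    op[1].cmd = MMUEXT_NEW_BASEPTR;
    op[1].arg1.mfn = new_pt_mfn;

    if ( HYPERVISOR_mmuext_op(op, 2, NULL, DOMID_SELF) < 0 )
        BUG();
}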
2079 int do_mmu_update(
2080 XEN_GUEST_HANDLE(mmu_update_t) ureqs,
2081 unsigned int count,
2082 XEN_GUEST_HANDLE(uint) pdone,
2083 unsigned int foreigndom)
2085 struct mmu_update req;
2086 void *va;
2087 unsigned long gpfn, gmfn, mfn;
2088 struct page_info *page;
2089 int rc = 0, okay = 1, i = 0, cpu = smp_processor_id();
2090 unsigned int cmd, done = 0;
2091 struct vcpu *v = current;
2092 struct domain *d = v->domain;
2093 unsigned long type_info;
2094 struct domain_mmap_cache mapcache, sh_mapcache;
2096 LOCK_BIGLOCK(d);
2098 cleanup_writable_pagetable(d);
2100 if ( unlikely(shadow_mode_enabled(d)) )
2101 check_pagetable(v, "pre-mmu"); /* debug */
2103 if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
2105 count &= ~MMU_UPDATE_PREEMPTED;
2106 if ( unlikely(!guest_handle_is_null(pdone)) )
2107 (void)copy_from_guest(&done, pdone, 1);
2110 domain_mmap_cache_init(&mapcache);
2111 domain_mmap_cache_init(&sh_mapcache);
2113 if ( !set_foreigndom(cpu, foreigndom) )
2115 rc = -ESRCH;
2116 goto out;
2119 perfc_incrc(calls_to_mmu_update);
2120 perfc_addc(num_page_updates, count);
2121 perfc_incr_histo(bpt_updates, count, PT_UPDATES);
2123 if ( unlikely(!guest_handle_okay(ureqs, count)) )
2125 rc = -EFAULT;
2126 goto out;
2129 for ( i = 0; i < count; i++ )
2131 if ( hypercall_preempt_check() )
2133 rc = hypercall_create_continuation(
2134 __HYPERVISOR_mmu_update, "hihi",
2135 ureqs, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
2136 break;
2139 if ( unlikely(__copy_from_guest(&req, ureqs, 1) != 0) )
2141 MEM_LOG("Bad __copy_from_guest");
2142 rc = -EFAULT;
2143 break;
2146 cmd = req.ptr & (sizeof(l1_pgentry_t)-1);
2147 okay = 0;
2149 switch ( cmd )
2151 /*
2152 * MMU_NORMAL_PT_UPDATE: Normal update to any level of page table.
2153 */
2154 case MMU_NORMAL_PT_UPDATE:
2156 gmfn = req.ptr >> PAGE_SHIFT;
2157 mfn = gmfn_to_mfn(d, gmfn);
2159 if ( unlikely(!get_page_from_pagenr(mfn, current->domain)) )
2161 MEM_LOG("Could not get page for normal update");
2162 break;
2165 va = map_domain_page_with_cache(mfn, &mapcache);
2166 va = (void *)((unsigned long)va +
2167 (unsigned long)(req.ptr & ~PAGE_MASK));
2168 page = mfn_to_page(mfn);
2170 switch ( (type_info = page->u.inuse.type_info) & PGT_type_mask )
2172 case PGT_l1_page_table:
2173 ASSERT( !shadow_mode_refcounts(d) );
2174 if ( likely(get_page_type(
2175 page, type_info & (PGT_type_mask|PGT_va_mask))) )
2177 l1_pgentry_t l1e;
2179 /* FIXME: doesn't work with PAE */
2180 l1e = l1e_from_intpte(req.val);
2181 okay = mod_l1_entry(va, l1e);
2182 if ( okay && unlikely(shadow_mode_enabled(d)) )
2183 shadow_l1_normal_pt_update(
2184 d, req.ptr, l1e, &sh_mapcache);
2185 put_page_type(page);
2187 break;
2188 case PGT_l2_page_table:
2189 ASSERT( !shadow_mode_refcounts(d) );
2190 if ( likely(get_page_type(
2191 page, type_info & (PGT_type_mask|PGT_va_mask))) )
2193 l2_pgentry_t l2e;
2195 /* FIXME: doesn't work with PAE */
2196 l2e = l2e_from_intpte(req.val);
2197 okay = mod_l2_entry(
2198 (l2_pgentry_t *)va, l2e, mfn, type_info);
2199 if ( okay && unlikely(shadow_mode_enabled(d)) )
2200 shadow_l2_normal_pt_update(
2201 d, req.ptr, l2e, &sh_mapcache);
2202 put_page_type(page);
2204 break;
2205 #if CONFIG_PAGING_LEVELS >= 3
2206 case PGT_l3_page_table:
2207 ASSERT( !shadow_mode_refcounts(d) );
2208 if ( likely(get_page_type(
2209 page, type_info & (PGT_type_mask|PGT_va_mask))) )
2211 l3_pgentry_t l3e;
2213 /* FIXME: doesn't work with PAE */
2214 l3e = l3e_from_intpte(req.val);
2215 okay = mod_l3_entry(va, l3e, mfn, type_info);
2216 if ( okay && unlikely(shadow_mode_enabled(d)) )
2217 shadow_l3_normal_pt_update(
2218 d, req.ptr, l3e, &sh_mapcache);
2219 put_page_type(page);
2221 break;
2222 #endif
2223 #if CONFIG_PAGING_LEVELS >= 4
2224 case PGT_l4_page_table:
2225 ASSERT( !shadow_mode_refcounts(d) );
2226 if ( likely(get_page_type(
2227 page, type_info & (PGT_type_mask|PGT_va_mask))) )
2229 l4_pgentry_t l4e;
2231 l4e = l4e_from_intpte(req.val);
2232 okay = mod_l4_entry(va, l4e, mfn, type_info);
2233 if ( okay && unlikely(shadow_mode_enabled(d)) )
2234 shadow_l4_normal_pt_update(
2235 d, req.ptr, l4e, &sh_mapcache);
2236 put_page_type(page);
2238 break;
2239 #endif
2240 default:
2241 if ( likely(get_page_type(page, PGT_writable_page)) )
2243 if ( shadow_mode_enabled(d) )
2245 shadow_lock(d);
2247 __mark_dirty(d, mfn);
2249 if ( page_is_page_table(page) &&
2250 !page_out_of_sync(page) )
2252 shadow_mark_mfn_out_of_sync(v, gmfn, mfn);
2256 *(intpte_t *)va = req.val;
2257 okay = 1;
2259 if ( shadow_mode_enabled(d) )
2260 shadow_unlock(d);
2262 put_page_type(page);
2264 break;
2267 unmap_domain_page_with_cache(va, &mapcache);
2269 put_page(page);
2270 break;
2272 case MMU_MACHPHYS_UPDATE:
2274 if ( shadow_mode_translate(FOREIGNDOM) )
2276 MEM_LOG("can't mutate m2p table of translate mode guest");
2277 break;
2280 mfn = req.ptr >> PAGE_SHIFT;
2281 gpfn = req.val;
2283 if ( unlikely(!get_page_from_pagenr(mfn, FOREIGNDOM)) )
2285 MEM_LOG("Could not get page for mach->phys update");
2286 break;
2289 set_gpfn_from_mfn(mfn, gpfn);
2290 okay = 1;
2292 mark_dirty(FOREIGNDOM, mfn);
2294 put_page(mfn_to_page(mfn));
2295 break;
2297 default:
2298 MEM_LOG("Invalid page update command %x", cmd);
2299 break;
2302 if ( unlikely(!okay) )
2304 rc = -EINVAL;
2305 break;
2308 guest_handle_add_offset(ureqs, 1);
2311 out:
2312 domain_mmap_cache_destroy(&mapcache);
2313 domain_mmap_cache_destroy(&sh_mapcache);
2315 process_deferred_ops(cpu);
2317 /* Add incremental work we have done to the @done output parameter. */
2318 done += i;
2319 if ( unlikely(!guest_handle_is_null(pdone)) )
2320 copy_to_guest(pdone, &done, 1);
2322 if ( unlikely(shadow_mode_enabled(d)) )
2323 check_pagetable(v, "post-mmu"); /* debug */
2325 UNLOCK_BIGLOCK(d);
2326 return rc;
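/*
 * Illustrative sketch (not part of mm.c): the guest side of do_mmu_update()
 * above.  Each request carries the machine address of the entry to write in
 * 'ptr', with the command encoded in the low bits left unused by PTE
 * alignment, and the new entry value in 'val'.  The wrapper
 * HYPERVISOR_mmu_update() is the conventional guest-side name and is
 * assumed here, as is pte_maddr being the machine address of a PTE the
 * guest owns.
 */
#include <stdint.h>
#include <xen/interface/xen.h>   /* struct mmu_update, MMU_*, DOMID_SELF */

static int example_write_pte(uint64_t pte_maddr, uint64_t new_pte)
{
    struct mmu_update req;
    int done = 0;

    req.ptr = pte_maddr | MMU_NORMAL_PT_UPDATE;  /* normal page-table write */
    req.val = new_pte;

    /* Xen checks the target page's type and reference counts before writing. */
    return HYPERVISOR_mmu_update(&req, 1, &done, DOMID_SELF);
}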
2330 static int create_grant_pte_mapping(
2331 unsigned long pte_addr, l1_pgentry_t _nl1e, struct vcpu *v)
2333 int rc = GNTST_okay;
2334 void *va;
2335 unsigned long gmfn, mfn;
2336 struct page_info *page;
2337 u32 type_info;
2338 l1_pgentry_t ol1e;
2339 struct domain *d = v->domain;
2341 ASSERT(spin_is_locked(&d->big_lock));
2342 ASSERT(!shadow_mode_refcounts(d));
2344 gmfn = pte_addr >> PAGE_SHIFT;
2345 mfn = gmfn_to_mfn(d, gmfn);
2347 if ( unlikely(!get_page_from_pagenr(mfn, current->domain)) )
2349 MEM_LOG("Could not get page for normal update");
2350 return GNTST_general_error;
2353 va = map_domain_page(mfn);
2354 va = (void *)((unsigned long)va + (pte_addr & ~PAGE_MASK));
2355 page = mfn_to_page(mfn);
2357 type_info = page->u.inuse.type_info;
2358 if ( ((type_info & PGT_type_mask) != PGT_l1_page_table) ||
2359 !get_page_type(page, type_info & (PGT_type_mask|PGT_va_mask)) )
2361 MEM_LOG("Grant map attempted to update a non-L1 page");
2362 rc = GNTST_general_error;
2363 goto failed;
2366 if ( __copy_from_user(&ol1e, (l1_pgentry_t *)va, sizeof(ol1e)) ||
2367 !update_l1e(va, ol1e, _nl1e) )
2369 put_page_type(page);
2370 rc = GNTST_general_error;
2371 goto failed;
2374 put_page_from_l1e(ol1e, d);
2376 if ( unlikely(shadow_mode_enabled(d)) )
2378 struct domain_mmap_cache sh_mapcache;
2379 domain_mmap_cache_init(&sh_mapcache);
2380 shadow_l1_normal_pt_update(d, pte_addr, _nl1e, &sh_mapcache);
2381 domain_mmap_cache_destroy(&sh_mapcache);
2384 put_page_type(page);
2386 failed:
2387 unmap_domain_page(va);
2388 put_page(page);
2389 return rc;
2392 static int destroy_grant_pte_mapping(
2393 unsigned long addr, unsigned long frame, struct domain *d)
2395 int rc = GNTST_okay;
2396 void *va;
2397 unsigned long gmfn, mfn;
2398 struct page_info *page;
2399 u32 type_info;
2400 l1_pgentry_t ol1e;
2402 ASSERT(!shadow_mode_refcounts(d));
2404 gmfn = addr >> PAGE_SHIFT;
2405 mfn = gmfn_to_mfn(d, gmfn);
2407 if ( unlikely(!get_page_from_pagenr(mfn, current->domain)) )
2409 MEM_LOG("Could not get page for normal update");
2410 return GNTST_general_error;
2413 va = map_domain_page(mfn);
2414 va = (void *)((unsigned long)va + (addr & ~PAGE_MASK));
2415 page = mfn_to_page(mfn);
2417 type_info = page->u.inuse.type_info;
2418 if ( ((type_info & PGT_type_mask) != PGT_l1_page_table) ||
2419 !get_page_type(page, type_info & (PGT_type_mask|PGT_va_mask)) )
2421 MEM_LOG("Grant map attempted to update a non-L1 page");
2422 rc = GNTST_general_error;
2423 goto failed;
2426 if ( __copy_from_user(&ol1e, (l1_pgentry_t *)va, sizeof(ol1e)) )
2428 put_page_type(page);
2429 rc = GNTST_general_error;
2430 goto failed;
2433 /* Check that the virtual address supplied is actually mapped to frame. */
2434 if ( unlikely((l1e_get_intpte(ol1e) >> PAGE_SHIFT) != frame) )
2436 MEM_LOG("PTE entry %lx for address %lx doesn't match frame %lx",
2437 (unsigned long)l1e_get_intpte(ol1e), addr, frame);
2438 put_page_type(page);
2439 rc = GNTST_general_error;
2440 goto failed;
2443 /* Delete pagetable entry. */
2444 if ( unlikely(__put_user(0, (intpte_t *)va)))
2446 MEM_LOG("Cannot delete PTE entry at %p", va);
2447 put_page_type(page);
2448 rc = GNTST_general_error;
2449 goto failed;
2452 if ( unlikely(shadow_mode_enabled(d)) )
2454 struct domain_mmap_cache sh_mapcache;
2455 domain_mmap_cache_init(&sh_mapcache);
2456 shadow_l1_normal_pt_update(d, addr, l1e_empty(), &sh_mapcache);
2457 domain_mmap_cache_destroy(&sh_mapcache);
2460 put_page_type(page);
2462 failed:
2463 unmap_domain_page(va);
2464 put_page(page);
2465 return rc;
2469 static int create_grant_va_mapping(
2470 unsigned long va, l1_pgentry_t _nl1e, struct vcpu *v)
2472 l1_pgentry_t *pl1e, ol1e;
2473 struct domain *d = v->domain;
2475 ASSERT(spin_is_locked(&d->big_lock));
2476 ASSERT(!shadow_mode_refcounts(d));
2478 /*
2479 * This is actually overkill - we don't need to sync the L1 itself,
2480 * just everything involved in getting to this L1 (i.e. we need
2481 * linear_pg_table[l1_linear_offset(va)] to be in sync)...
2482 */
2483 __shadow_sync_va(v, va);
2485 pl1e = &linear_pg_table[l1_linear_offset(va)];
2487 if ( unlikely(__copy_from_user(&ol1e, pl1e, sizeof(ol1e)) != 0) ||
2488 !update_l1e(pl1e, ol1e, _nl1e) )
2489 return GNTST_general_error;
2491 put_page_from_l1e(ol1e, d);
2493 if ( unlikely(shadow_mode_enabled(d)) )
2494 shadow_do_update_va_mapping(va, _nl1e, v);
2496 return GNTST_okay;
2499 static int destroy_grant_va_mapping(
2500 unsigned long addr, unsigned long frame)
2502 l1_pgentry_t *pl1e, ol1e;
2504 pl1e = &linear_pg_table[l1_linear_offset(addr)];
2506 if ( unlikely(__get_user(ol1e.l1, &pl1e->l1) != 0) )
2508 MEM_LOG("Could not find PTE entry for address %lx", addr);
2509 return GNTST_general_error;
2512 /*
2513 * Check that the virtual address supplied is actually mapped to
2514 * frame.
2515 */
2516 if ( unlikely(l1e_get_pfn(ol1e) != frame) )
2518 MEM_LOG("PTE entry %lx for address %lx doesn't match frame %lx",
2519 l1e_get_pfn(ol1e), addr, frame);
2520 return GNTST_general_error;
2523 /* Delete pagetable entry. */
2524 if ( unlikely(__put_user(0, &pl1e->l1)) )
2526 MEM_LOG("Cannot delete PTE entry at %p", (unsigned long *)pl1e);
2527 return GNTST_general_error;
2530 return 0;
2533 int create_grant_host_mapping(
2534 unsigned long addr, unsigned long frame, unsigned int flags)
2536 l1_pgentry_t pte = l1e_from_pfn(frame, GRANT_PTE_FLAGS);
2538 if ( (flags & GNTMAP_application_map) )
2539 l1e_add_flags(pte,_PAGE_USER);
2540 if ( !(flags & GNTMAP_readonly) )
2541 l1e_add_flags(pte,_PAGE_RW);
2543 if ( flags & GNTMAP_contains_pte )
2544 return create_grant_pte_mapping(addr, pte, current);
2545 return create_grant_va_mapping(addr, pte, current);
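/*
 * Illustrative sketch (not part of mm.c): the guest-side request that is
 * eventually served by create_grant_host_mapping() above.  Without
 * GNTMAP_contains_pte the 'host_addr' field is a linear address and the
 * create_grant_va_mapping() path is taken; with it, host_addr is the
 * machine address of the PTE to rewrite.  Field names follow the public
 * grant-table interface; the wrapper HYPERVISOR_grant_table_op() is the
 * conventional guest-side name and is assumed here.
 */
#include <xen/interface/grant_table.h>

static int example_map_grant(domid_t granting_dom, grant_ref_t ref,
                             unsigned long linear_addr, int readonly)
{
    struct gnttab_map_grant_ref map;

    map.host_addr = linear_addr;
    map.flags     = GNTMAP_host_map | (readonly ? GNTMAP_readonly : 0);
    map.ref       = ref;
    map.dom       = granting_dom;

    if ( HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &map, 1) )
        return -1;                       /* the hypercall itself failed */
    return (map.status == GNTST_okay) ? 0 : map.status;
}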
2548 int destroy_grant_host_mapping(
2549 unsigned long addr, unsigned long frame, unsigned int flags)
2551 if ( flags & GNTMAP_contains_pte )
2552 return destroy_grant_pte_mapping(addr, frame, current->domain);
2553 return destroy_grant_va_mapping(addr, frame);
2556 int steal_page_for_grant_transfer(
2557 struct domain *d, struct page_info *page)
2559 u32 _d, _nd, x, y;
2561 spin_lock(&d->page_alloc_lock);
2563 /*
2564 * The tricky bit: atomically release ownership while there is just one
2565 * benign reference to the page (PGC_allocated). If that reference
2566 * disappears then the deallocation routine will safely spin.
2567 */
2568 _d = pickle_domptr(d);
2569 _nd = page->u.inuse._domain;
2570 y = page->count_info;
2571 do {
2572 x = y;
2573 if (unlikely((x & (PGC_count_mask|PGC_allocated)) !=
2574 (1 | PGC_allocated)) || unlikely(_nd != _d)) {
2575 DPRINTK("gnttab_transfer: Bad page %p: ed=%p(%u), sd=%p,"
2576 " caf=%08x, taf=%" PRtype_info "\n",
2577 (void *) page_to_mfn(page),
2578 d, d->domain_id, unpickle_domptr(_nd), x,
2579 page->u.inuse.type_info);
2580 spin_unlock(&d->page_alloc_lock);
2581 return -1;
2583 __asm__ __volatile__(
2584 LOCK_PREFIX "cmpxchg8b %2"
2585 : "=d" (_nd), "=a" (y),
2586 "=m" (*(volatile u64 *)(&page->count_info))
2587 : "0" (_d), "1" (x), "c" (NULL), "b" (x) );
2588 } while (unlikely(_nd != _d) || unlikely(y != x));
2590 /*
2591 * Unlink from 'd'. At least one reference remains (now anonymous), so
2592 * no one else is spinning to try to delete this page from 'd'.
2593 */
2594 d->tot_pages--;
2595 list_del(&page->list);
2597 spin_unlock(&d->page_alloc_lock);
2599 return 0;
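/*
 * Illustrative sketch (not part of mm.c): the essence of the cmpxchg8b loop
 * in steal_page_for_grant_transfer() above.  The reference count and the
 * owner are packed into a single 64-bit quantity and swapped atomically, so
 * there is no window in which a new owner is visible together with a stale
 * count.  The packing used below (count in the low word, owner in the high
 * word) is an assumption for illustration, not the real page_info layout.
 */
#include <stdint.h>
#include <stdbool.h>

static bool release_ownership_atomically(uint64_t *owner_and_count,
                                         uint32_t cur_owner, uint32_t cur_count)
{
    /* Expected current value, and the new value with the owner cleared. */
    uint64_t oldval = ((uint64_t)cur_owner << 32) | cur_count;
    uint64_t newval = cur_count;

    /* GCC builtin; compiles to a locked cmpxchg8b/cmpxchg on x86. */
    return __sync_bool_compare_and_swap(owner_and_count, oldval, newval);
}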
2602 int do_update_va_mapping(unsigned long va, u64 val64,
2603 unsigned long flags)
2605 l1_pgentry_t val = l1e_from_intpte(val64);
2606 struct vcpu *v = current;
2607 struct domain *d = v->domain;
2608 unsigned int cpu = smp_processor_id();
2609 unsigned long vmask, bmap_ptr;
2610 cpumask_t pmask;
2611 int rc = 0;
2613 perfc_incrc(calls_to_update_va);
2615 if ( unlikely(!__addr_ok(va) && !shadow_mode_external(d)) )
2616 return -EINVAL;
2618 LOCK_BIGLOCK(d);
2620 cleanup_writable_pagetable(d);
2622 if ( unlikely(shadow_mode_enabled(d)) )
2623 check_pagetable(v, "pre-va"); /* debug */
2625 if ( unlikely(!mod_l1_entry(&linear_pg_table[l1_linear_offset(va)],
2626 val)) )
2627 rc = -EINVAL;
2629 if ( likely(rc == 0) && unlikely(shadow_mode_enabled(d)) )
2631 if ( unlikely(percpu_info[cpu].foreign &&
2632 (shadow_mode_translate(d) ||
2633 shadow_mode_translate(percpu_info[cpu].foreign))) )
2635 /*
2636 * The foreign domain's pfns are in a different namespace. There's
2637 * not enough information in just a gpte to figure out how to
2638 * (re-)shadow this entry.
2639 */
2640 domain_crash(d);
2643 rc = shadow_do_update_va_mapping(va, val, v);
2645 check_pagetable(v, "post-va"); /* debug */
2648 switch ( flags & UVMF_FLUSHTYPE_MASK )
2650 case UVMF_TLB_FLUSH:
2651 switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
2653 case UVMF_LOCAL:
2654 if ( unlikely(shadow_mode_enabled(d)) )
2655 shadow_sync_all(d);
2656 local_flush_tlb();
2657 break;
2658 case UVMF_ALL:
2659 flush_tlb_mask(d->domain_dirty_cpumask);
2660 break;
2661 default:
2662 if ( unlikely(get_user(vmask, (unsigned long *)bmap_ptr)) )
2663 rc = -EFAULT;
2664 pmask = vcpumask_to_pcpumask(d, vmask);
2665 flush_tlb_mask(pmask);
2666 break;
2668 break;
2670 case UVMF_INVLPG:
2671 switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
2673 case UVMF_LOCAL:
2674 if ( unlikely(shadow_mode_enabled(d)) )
2675 shadow_invlpg(current, va);
2676 local_flush_tlb_one(va);
2677 break;
2678 case UVMF_ALL:
2679 flush_tlb_one_mask(d->domain_dirty_cpumask, va);
2680 break;
2681 default:
2682 if ( unlikely(get_user(vmask, (unsigned long *)bmap_ptr)) )
2683 rc = -EFAULT;
2684 pmask = vcpumask_to_pcpumask(d, vmask);
2685 flush_tlb_one_mask(pmask, va);
2686 break;
2688 break;
2691 process_deferred_ops(cpu);
2693 UNLOCK_BIGLOCK(d);
2695 return rc;
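/*
 * Illustrative sketch (not part of mm.c): the guest side of
 * do_update_va_mapping() above.  A single PTE is replaced and the UVMF_*
 * flags select the TLB maintenance; here Xen is asked for an INVLPG on the
 * local CPU only.  The wrapper HYPERVISOR_update_va_mapping() is the
 * conventional guest-side name and is assumed here; new_pte is passed as
 * the raw 64-bit PTE value for simplicity.
 */
#include <stdint.h>
#include <xen/interface/xen.h>   /* UVMF_* flush flags */

static int example_remap_one_page(unsigned long va, uint64_t new_pte)
{
    return HYPERVISOR_update_va_mapping(va, new_pte, UVMF_INVLPG | UVMF_LOCAL);
}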
2698 int do_update_va_mapping_otherdomain(unsigned long va, u64 val64,
2699 unsigned long flags,
2700 domid_t domid)
2702 unsigned int cpu = smp_processor_id();
2703 int rc;
2705 if ( unlikely(!IS_PRIV(current->domain)) )
2706 return -EPERM;
2708 if ( !set_foreigndom(cpu, domid) )
2709 return -ESRCH;
2711 rc = do_update_va_mapping(va, val64, flags);
2713 return rc;
2718 /*************************
2719 * Descriptor Tables
2720 */
2722 void destroy_gdt(struct vcpu *v)
2724 int i;
2725 unsigned long pfn;
2727 v->arch.guest_context.gdt_ents = 0;
2728 for ( i = 0; i < FIRST_RESERVED_GDT_PAGE; i++ )
2730 if ( (pfn = l1e_get_pfn(v->arch.perdomain_ptes[i])) != 0 )
2731 put_page_and_type(mfn_to_page(pfn));
2732 v->arch.perdomain_ptes[i] = l1e_empty();
2733 v->arch.guest_context.gdt_frames[i] = 0;
2738 long set_gdt(struct vcpu *v,
2739 unsigned long *frames,
2740 unsigned int entries)
2742 struct domain *d = v->domain;
2743 /* NB. There are 512 8-byte entries per GDT page. */
2744 int i, nr_pages = (entries + 511) / 512;
2745 unsigned long mfn;
2747 if ( entries > FIRST_RESERVED_GDT_ENTRY )
2748 return -EINVAL;
2750 shadow_sync_all(d);
2752 /* Check the pages in the new GDT. */
2753 for ( i = 0; i < nr_pages; i++ ) {
2754 mfn = frames[i] = gmfn_to_mfn(d, frames[i]);
2755 if ( !mfn_valid(mfn) ||
2756 !get_page_and_type(mfn_to_page(mfn), d, PGT_gdt_page) )
2757 goto fail;
2760 /* Tear down the old GDT. */
2761 destroy_gdt(v);
2763 /* Install the new GDT. */
2764 v->arch.guest_context.gdt_ents = entries;
2765 for ( i = 0; i < nr_pages; i++ )
2767 v->arch.guest_context.gdt_frames[i] = frames[i];
2768 v->arch.perdomain_ptes[i] =
2769 l1e_from_pfn(frames[i], __PAGE_HYPERVISOR);
2772 return 0;
2774 fail:
2775 while ( i-- > 0 )
2776 put_page_and_type(mfn_to_page(frames[i]));
2777 return -EINVAL;
2781 long do_set_gdt(XEN_GUEST_HANDLE(ulong) frame_list, unsigned int entries)
2783 int nr_pages = (entries + 511) / 512;
2784 unsigned long frames[16];
2785 long ret;
2787 /* Rechecked in set_gdt, but ensures a sane limit for copy_from_user(). */
2788 if ( entries > FIRST_RESERVED_GDT_ENTRY )
2789 return -EINVAL;
2791 if ( copy_from_guest((unsigned long *)frames, frame_list, nr_pages) )
2792 return -EFAULT;
2794 LOCK_BIGLOCK(current->domain);
2796 if ( (ret = set_gdt(current, frames, entries)) == 0 )
2797 local_flush_tlb();
2799 UNLOCK_BIGLOCK(current->domain);
2801 return ret;
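/*
 * Illustrative sketch (not part of mm.c): the guest side of do_set_gdt()
 * above.  The guest supplies the frame numbers backing its new GDT
 * (512 descriptors per frame, at most 16 frames) and Xen re-types each
 * frame as a GDT page before installing it.  The wrapper
 * HYPERVISOR_set_gdt() is the conventional guest-side name and is assumed
 * here.
 */
static int example_install_gdt(unsigned long *frame_mfns, unsigned int entries)
{
    /* (entries + 511) / 512 frames are consumed; entries reaching into
     * Xen's reserved descriptor range are rejected with -EINVAL. */
    return HYPERVISOR_set_gdt(frame_mfns, entries);
}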
2805 long do_update_descriptor(u64 pa, u64 desc)
2807 struct domain *dom = current->domain;
2808 unsigned long gmfn = pa >> PAGE_SHIFT;
2809 unsigned long mfn;
2810 unsigned int offset;
2811 struct desc_struct *gdt_pent, d;
2812 struct page_info *page;
2813 long ret = -EINVAL;
2815 offset = ((unsigned int)pa & ~PAGE_MASK) / sizeof(struct desc_struct);
2817 *(u64 *)&d = desc;
2819 LOCK_BIGLOCK(dom);
2821 if ( !VALID_MFN(mfn = gmfn_to_mfn(dom, gmfn)) ||
2822 (((unsigned int)pa % sizeof(struct desc_struct)) != 0) ||
2823 !mfn_valid(mfn) ||
2824 !check_descriptor(&d) )
2826 UNLOCK_BIGLOCK(dom);
2827 return -EINVAL;
2830 page = mfn_to_page(mfn);
2831 if ( unlikely(!get_page(page, dom)) )
2833 UNLOCK_BIGLOCK(dom);
2834 return -EINVAL;
2837 /* Check if the given frame is in use in an unsafe context. */
2838 switch ( page->u.inuse.type_info & PGT_type_mask )
2840 case PGT_gdt_page:
2841 if ( unlikely(!get_page_type(page, PGT_gdt_page)) )
2842 goto out;
2843 break;
2844 case PGT_ldt_page:
2845 if ( unlikely(!get_page_type(page, PGT_ldt_page)) )
2846 goto out;
2847 break;
2848 default:
2849 if ( unlikely(!get_page_type(page, PGT_writable_page)) )
2850 goto out;
2851 break;
2854 if ( shadow_mode_enabled(dom) )
2856 shadow_lock(dom);
2858 __mark_dirty(dom, mfn);
2860 if ( page_is_page_table(page) && !page_out_of_sync(page) )
2861 shadow_mark_mfn_out_of_sync(current, gmfn, mfn);
2864 /* All is good so make the update. */
2865 gdt_pent = map_domain_page(mfn);
2866 memcpy(&gdt_pent[offset], &d, 8);
2867 unmap_domain_page(gdt_pent);
2869 if ( shadow_mode_enabled(dom) )
2870 shadow_unlock(dom);
2872 put_page_type(page);
2874 ret = 0; /* success */
2876 out:
2877 put_page(page);
2879 UNLOCK_BIGLOCK(dom);
2881 return ret;
2884 typedef struct e820entry e820entry_t;
2885 DEFINE_XEN_GUEST_HANDLE(e820entry_t);
2887 long arch_memory_op(int op, XEN_GUEST_HANDLE(void) arg)
2889 switch ( op )
2891 case XENMEM_add_to_physmap:
2893 struct xen_add_to_physmap xatp;
2894 unsigned long mfn = 0, gpfn;
2895 struct domain *d;
2897 if ( copy_from_guest(&xatp, arg, 1) )
2898 return -EFAULT;
2900 if ( (d = find_domain_by_id(xatp.domid)) == NULL )
2901 return -ESRCH;
2903 switch ( xatp.space )
2905 case XENMAPSPACE_shared_info:
2906 if ( xatp.idx == 0 )
2907 mfn = virt_to_mfn(d->shared_info);
2908 break;
2909 case XENMAPSPACE_grant_table:
2910 if ( xatp.idx < NR_GRANT_FRAMES )
2911 mfn = virt_to_mfn(d->grant_table->shared) + xatp.idx;
2912 break;
2913 default:
2914 break;
2917 if ( !shadow_mode_translate(d) || (mfn == 0) )
2919 put_domain(d);
2920 return -EINVAL;
2923 LOCK_BIGLOCK(d);
2925 /* Remove previously mapped page if it was present. */
2926 if ( mfn_valid(gmfn_to_mfn(d, xatp.gpfn)) )
2927 guest_remove_page(d, xatp.gpfn);
2929 /* Unmap from old location, if any. */
2930 gpfn = get_gpfn_from_mfn(mfn);
2931 if ( gpfn != INVALID_M2P_ENTRY )
2932 guest_physmap_remove_page(d, gpfn, mfn);
2934 /* Map at new location. */
2935 guest_physmap_add_page(d, xatp.gpfn, mfn);
2937 UNLOCK_BIGLOCK(d);
2939 put_domain(d);
2941 break;
2944 case XENMEM_memory_map:
2946 return -ENOSYS;
2949 case XENMEM_machine_memory_map:
2951 struct xen_memory_map memmap;
2952 XEN_GUEST_HANDLE(e820entry_t) buffer;
2953 int count;
2955 if ( !IS_PRIV(current->domain) )
2956 return -EINVAL;
2958 if ( copy_from_guest(&memmap, arg, 1) )
2959 return -EFAULT;
2960 if ( memmap.nr_entries < e820.nr_map + 1 )
2961 return -EINVAL;
2963 buffer = guest_handle_cast(memmap.buffer, e820entry_t);
2965 count = min((unsigned int)e820.nr_map, memmap.nr_entries);
2966 if ( copy_to_guest(buffer, &e820.map[0], count) < 0 )
2967 return -EFAULT;
2969 memmap.nr_entries = count;
2971 if ( copy_to_guest(arg, &memmap, 1) )
2972 return -EFAULT;
2974 return 0;
2977 default:
2978 return subarch_memory_op(op, arg);
2981 return 0;
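/*
 * Illustrative sketch (not part of mm.c): a guest requesting the
 * XENMEM_add_to_physmap operation handled above, placing its shared-info
 * frame at a chosen guest pfn.  The wrapper HYPERVISOR_memory_op() is the
 * conventional guest-side name and is assumed here; the handler looks the
 * target domain up by the id supplied in xatp.domid.
 */
#include <xen/interface/xen.h>
#include <xen/interface/memory.h>   /* struct xen_add_to_physmap, XENMEM_*, XENMAPSPACE_* */

static int example_map_shared_info(domid_t domid, unsigned long gpfn)
{
    struct xen_add_to_physmap xatp;

    xatp.domid = domid;
    xatp.space = XENMAPSPACE_shared_info;
    xatp.idx   = 0;                  /* the only index accepted for this space */
    xatp.gpfn  = gpfn;

    return HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp);
}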
2985 /*************************
2986 * Writable Pagetables
2987 */
2989 #ifdef VVERBOSE
2990 int ptwr_debug = 0x0;
2991 #define PTWR_PRINTK(_f, _a...) \
2992 do { if ( unlikely(ptwr_debug) ) printk( _f , ## _a ); } while ( 0 )
2993 #define PTWR_PRINT_WHICH (which ? 'I' : 'A')
2994 #else
2995 #define PTWR_PRINTK(_f, _a...) ((void)0)
2996 #endif
2999 #ifdef PERF_ARRAYS
3001 /**************** writable pagetables profiling functions *****************/
3003 #define ptwr_eip_buckets 256
3005 int ptwr_eip_stat_threshold[] = {1, 10, 50, 100, L1_PAGETABLE_ENTRIES};
3007 #define ptwr_eip_stat_thresholdN (sizeof(ptwr_eip_stat_threshold)/sizeof(int))
3009 typedef struct {
3010 unsigned long eip;
3011 domid_t id;
3012 u32 val[ptwr_eip_stat_thresholdN];
3013 } ptwr_eip_stat_t;
3015 ptwr_eip_stat_t ptwr_eip_stats[ptwr_eip_buckets];
3017 static inline unsigned int ptwr_eip_stat_hash( unsigned long eip, domid_t id )
3019 return (((unsigned long) id) ^ eip ^ (eip>>8) ^ (eip>>16) ^ (eip>>24)) %
3020 ptwr_eip_buckets;
3023 static void ptwr_eip_stat_inc(u32 *n)
3025 unsigned int i, j;
3027 if ( ++(*n) != 0 )
3028 return;
3030 *n = ~0;
3032 /* Re-scale all buckets. */
3033 for ( i = 0; i < ptwr_eip_buckets; i++ )
3034 for ( j = 0; j < ptwr_eip_stat_thresholdN; j++ )
3035 ptwr_eip_stats[i].val[j] >>= 1;
3038 static void ptwr_eip_stat_update(unsigned long eip, domid_t id, int modified)
3040 unsigned int i, j, b;
3042 i = b = ptwr_eip_stat_hash(eip, id);
3044 do
3046 if ( !ptwr_eip_stats[i].eip )
3048 /* doesn't exist */
3049 ptwr_eip_stats[i].eip = eip;
3050 ptwr_eip_stats[i].id = id;
3051 memset(ptwr_eip_stats[i].val,0, sizeof(ptwr_eip_stats[i].val));
3054 if ( ptwr_eip_stats[i].eip == eip && ptwr_eip_stats[i].id == id)
3056 for ( j = 0; j < ptwr_eip_stat_thresholdN; j++ )
3057 if ( modified <= ptwr_eip_stat_threshold[j] )
3058 break;
3059 BUG_ON(j >= ptwr_eip_stat_thresholdN);
3060 ptwr_eip_stat_inc(&ptwr_eip_stats[i].val[j]);
3061 return;
3064 i = (i+1) % ptwr_eip_buckets;
3066 while ( i != b );
3068 printk("ptwr_eip_stat: too many EIPs in use!\n");
3070 ptwr_eip_stat_print();
3071 ptwr_eip_stat_reset();
3074 void ptwr_eip_stat_reset(void)
3076 memset(ptwr_eip_stats, 0, sizeof(ptwr_eip_stats));
3079 void ptwr_eip_stat_print(void)
3081 struct domain *e;
3082 domid_t d;
3083 unsigned int i, j;
3085 for_each_domain( e )
3087 d = e->domain_id;
3089 for ( i = 0; i < ptwr_eip_buckets; i++ )
3091 if ( !ptwr_eip_stats[i].eip || ptwr_eip_stats[i].id != d )
3092 continue;
3094 printk("D %5d eip %p ",
3095 ptwr_eip_stats[i].id, (void *)ptwr_eip_stats[i].eip);
3097 for ( j = 0; j < ptwr_eip_stat_thresholdN; j++ )
3098 printk("<=%u %4u \t",
3099 ptwr_eip_stat_threshold[j],
3100 ptwr_eip_stats[i].val[j]);
3101 printk("\n");
3106 #else /* PERF_ARRAYS */
3108 #define ptwr_eip_stat_update(eip, id, modified) ((void)0)
3110 #endif
3112 /*******************************************************************/
3114 /* Re-validate a given p.t. page, given its prior snapshot */
3115 int revalidate_l1(
3116 struct domain *d, l1_pgentry_t *l1page, l1_pgentry_t *snapshot)
3118 l1_pgentry_t ol1e, nl1e;
3119 int modified = 0, i;
3121 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
3123 ol1e = snapshot[i];
3124 nl1e = l1page[i];
3126 if ( likely(l1e_get_intpte(ol1e) == l1e_get_intpte(nl1e)) )
3127 continue;
3129 /* Update number of entries modified. */
3130 modified++;
3132 /*
3133 * Fast path for PTEs that have merely been write-protected
3134 * (e.g., during a Unix fork()). A strict reduction in privilege.
3135 */
3136 if ( likely(l1e_get_intpte(ol1e) == (l1e_get_intpte(nl1e)|_PAGE_RW)) )
3138 if ( likely(l1e_get_flags(nl1e) & _PAGE_PRESENT) )
3139 put_page_type(mfn_to_page(l1e_get_pfn(nl1e)));
3140 continue;
3143 if ( unlikely(!get_page_from_l1e(nl1e, d)) )
3145 /*
3146 * Make the remaining p.t. entries consistent before crashing, so the
3147 * reference counts are correct.
3148 */
3149 memcpy(&l1page[i], &snapshot[i],
3150 (L1_PAGETABLE_ENTRIES - i) * sizeof(l1_pgentry_t));
3152 /* Crash the offending domain. */
3153 MEM_LOG("ptwr: Could not revalidate l1 page");
3154 domain_crash(d);
3155 break;
3158 put_page_from_l1e(ol1e, d);
3161 return modified;
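/*
 * Illustrative sketch (not part of mm.c): the fast-path test in
 * revalidate_l1() above.  Once identical entries have been skipped, a new
 * PTE that differs from the snapshot only by a cleared _PAGE_RW bit is a
 * pure privilege reduction, so no new reference needs to be taken; only the
 * writable-type reference on the mapped frame is dropped.  Plain integers
 * stand in for l1_pgentry_t here; EX_PAGE_RW mirrors the x86 R/W bit.
 */
#include <stdint.h>

#define EX_PAGE_RW 0x2ULL     /* bit 1 of an x86 PTE: writable */

static int is_write_protect_only(uint64_t snapshot_pte, uint64_t new_pte)
{
    /* Caller has already handled the snapshot_pte == new_pte case. */
    return snapshot_pte == (new_pte | EX_PAGE_RW);
}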
3165 /* Flush the given writable p.t. page and write-protect it again. */
3166 void ptwr_flush(struct domain *d, const int which)
3168 unsigned long l1va;
3169 l1_pgentry_t *pl1e, pte, *ptep;
3170 l2_pgentry_t *pl2e;
3171 unsigned int modified;
3173 #ifdef CONFIG_X86_64
3174 struct vcpu *v = current;
3175 int user_mode = !(v->arch.flags & TF_kernel_mode);
3176 #endif
3178 ASSERT(!shadow_mode_enabled(d));
3180 if ( unlikely(d->arch.ptwr[which].vcpu != current) )
3181 /* Don't use write_ptbase: it may switch to guest_user on x86/64! */
3182 __write_ptbase(pagetable_get_pfn(
3183 d->arch.ptwr[which].vcpu->arch.guest_table));
3184 else
3185 TOGGLE_MODE();
3187 l1va = d->arch.ptwr[which].l1va;
3188 ptep = (l1_pgentry_t *)&linear_pg_table[l1_linear_offset(l1va)];
3190 /*
3191 * STEP 1. Write-protect the p.t. page so no more updates can occur.
3192 */
3194 if ( unlikely(__get_user(pte.l1, &ptep->l1)) )
3196 MEM_LOG("ptwr: Could not read pte at %p", ptep);
3197 /*
3198 * Really a bug. We could read this PTE during the initial fault,
3199 * and the pagetables can't have changed in the meantime.
3200 */
3201 BUG();
3203 PTWR_PRINTK("[%c] disconnected_l1va at %p is %"PRIpte"\n",
3204 PTWR_PRINT_WHICH, ptep, l1e_get_intpte(pte));
3205 l1e_remove_flags(pte, _PAGE_RW);
3207 /* Write-protect the p.t. page in the guest page table. */
3208 if ( unlikely(__put_user(pte, ptep)) )
3210 MEM_LOG("ptwr: Could not update pte at %p", ptep);
3211 /*
3212 * Really a bug. We could write this PTE during the initial fault,
3213 * and the pagetables can't have changed in the meantime.
3214 */
3215 BUG();
3218 /* Ensure that there are no stale writable mappings in any TLB. */
3219 /* NB. INVLPG is a serialising instruction: flushes pending updates. */
3220 flush_tlb_one_mask(d->domain_dirty_cpumask, l1va);
3221 PTWR_PRINTK("[%c] disconnected_l1va at %p now %"PRIpte"\n",
3222 PTWR_PRINT_WHICH, ptep, l1e_get_intpte(pte));
3224 /*
3225 * STEP 2. Validate any modified PTEs.
3226 */
3228 if ( likely(d == current->domain) )
3230 pl1e = map_domain_page(l1e_get_pfn(pte));
3231 modified = revalidate_l1(d, pl1e, d->arch.ptwr[which].page);
3232 unmap_domain_page(pl1e);
3233 perfc_incr_histo(wpt_updates, modified, PT_UPDATES);
3234 ptwr_eip_stat_update(d->arch.ptwr[which].eip, d->domain_id, modified);
3235 d->arch.ptwr[which].prev_nr_updates = modified;
3237 else
3239 /*
3240 * Must make a temporary global mapping: we are running in the wrong
3241 * address space, so we have no access to our own mapcache.
3242 */
3243 pl1e = map_domain_page_global(l1e_get_pfn(pte));
3244 modified = revalidate_l1(d, pl1e, d->arch.ptwr[which].page);
3245 unmap_domain_page_global(pl1e);
3248 /*
3249 * STEP 3. Reattach the L1 p.t. page into the current address space.
3250 */
3252 if ( which == PTWR_PT_ACTIVE )
3254 pl2e = &__linear_l2_table[d->arch.ptwr[which].l2_idx];
3255 l2e_add_flags(*pl2e, _PAGE_PRESENT);
3258 /*
3259 * STEP 4. Final tidy-up.
3260 */
3262 d->arch.ptwr[which].l1va = 0;
3264 if ( unlikely(d->arch.ptwr[which].vcpu != current) )
3265 write_ptbase(current);
3266 else
3267 TOGGLE_MODE();
3270 static int ptwr_emulated_update(
3271 unsigned long addr,
3272 paddr_t old,
3273 paddr_t val,
3274 unsigned int bytes,
3275 unsigned int do_cmpxchg)
3277 unsigned long pfn, l1va;
3278 struct page_info *page;
3279 l1_pgentry_t pte, ol1e, nl1e, *pl1e;
3280 struct domain *d = current->domain;
3282 /* Aligned access only, thank you. */
3283 if ( !access_ok(addr, bytes) || ((addr & (bytes-1)) != 0) )
3285 MEM_LOG("ptwr_emulate: Unaligned or bad size ptwr access (%d, %lx)",
3286 bytes, addr);
3287 return X86EMUL_UNHANDLEABLE;
3290 /* Turn a sub-word access into a full-word access. */
3291 if ( bytes != sizeof(paddr_t) )
3293 paddr_t full;
3294 unsigned int offset = addr & (sizeof(paddr_t)-1);
3296 /* Align address; read full word. */
3297 addr &= ~(sizeof(paddr_t)-1);
3298 if ( copy_from_user(&full, (void *)addr, sizeof(paddr_t)) )
3300 propagate_page_fault(addr, 4); /* user mode, read fault */
3301 return X86EMUL_PROPAGATE_FAULT;
3303 /* Mask out bits provided by caller. */
3304 full &= ~((((paddr_t)1 << (bytes*8)) - 1) << (offset*8));
3305 /* Shift the caller value and OR in the missing bits. */
3306 val &= (((paddr_t)1 << (bytes*8)) - 1);
3307 val <<= (offset)*8;
3308 val |= full;
3309 /* Also fill in missing parts of the cmpxchg old value. */
3310 old &= (((paddr_t)1 << (bytes*8)) - 1);
3311 old <<= (offset)*8;
3312 old |= full;
3315 /*
3316 * We must not emulate an update to a PTE that is temporarily marked
3317 * writable by the batched ptwr logic, else we can corrupt page refcnts!
3318 */
3319 if ( ((l1va = d->arch.ptwr[PTWR_PT_ACTIVE].l1va) != 0) &&
3320 (l1_linear_offset(l1va) == l1_linear_offset(addr)) )
3321 ptwr_flush(d, PTWR_PT_ACTIVE);
3322 if ( ((l1va = d->arch.ptwr[PTWR_PT_INACTIVE].l1va) != 0) &&
3323 (l1_linear_offset(l1va) == l1_linear_offset(addr)) )
3324 ptwr_flush(d, PTWR_PT_INACTIVE);
3326 /* Read the PTE that maps the page being updated. */
3327 if ( __copy_from_user(&pte, &linear_pg_table[l1_linear_offset(addr)],
3328 sizeof(pte)) )
3330 MEM_LOG("ptwr_emulate: Cannot read thru linear_pg_table");
3331 return X86EMUL_UNHANDLEABLE;
3334 pfn = l1e_get_pfn(pte);
3335 page = mfn_to_page(pfn);
3337 /* We are looking only for read-only mappings of p.t. pages. */
3338 ASSERT((l1e_get_flags(pte) & (_PAGE_RW|_PAGE_PRESENT)) == _PAGE_PRESENT);
3339 ASSERT((page->u.inuse.type_info & PGT_type_mask) == PGT_l1_page_table);
3340 ASSERT((page->u.inuse.type_info & PGT_count_mask) != 0);
3341 ASSERT(page_get_owner(page) == d);
3343 /* Check the new PTE. */
3344 nl1e = l1e_from_intpte(val);
3345 if ( unlikely(!get_page_from_l1e(nl1e, d)) )
3347 MEM_LOG("ptwr_emulate: could not get_page_from_l1e()");
3348 return X86EMUL_UNHANDLEABLE;
3351 /* Checked successfully: do the update (write or cmpxchg). */
3352 pl1e = map_domain_page(page_to_mfn(page));
3353 pl1e = (l1_pgentry_t *)((unsigned long)pl1e + (addr & ~PAGE_MASK));
3354 if ( do_cmpxchg )
3356 ol1e = l1e_from_intpte(old);
3357 if ( cmpxchg((intpte_t *)pl1e, old, val) != old )
3359 unmap_domain_page(pl1e);
3360 put_page_from_l1e(nl1e, d);
3361 return X86EMUL_CMPXCHG_FAILED;
3364 else
3366 ol1e = *pl1e;
3367 *pl1e = nl1e;
3369 unmap_domain_page(pl1e);
3371 /* Finally, drop the old PTE. */
3372 put_page_from_l1e(ol1e, d);
3374 return X86EMUL_CONTINUE;
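/*
 * Illustrative sketch (not part of mm.c): the sub-word merge done near the
 * top of ptwr_emulated_update() above.  A 'bytes'-wide guest write is
 * widened to a full word by masking the affected bytes out of the value
 * read back from the aligned address ('full') and OR-ing in the shifted
 * caller value; the cmpxchg 'old' value is rebuilt the same way.  This
 * standalone version uses a 64-bit word for clarity.
 */
#include <stdint.h>

static uint64_t widen_write(uint64_t full, uint64_t val,
                            unsigned int bytes, unsigned int offset)
{
    uint64_t mask = ((bytes < 8) ? ((uint64_t)1 << (bytes * 8)) : 0) - 1;

    full &= ~(mask << (offset * 8));        /* clear the bytes being written */
    val  &= mask;                           /* keep only 'bytes' of new data */
    return full | (val << (offset * 8));    /* merge into the full word      */
}

/*
 * Example: a 2-byte write of 0xBEEF at byte offset 2 into the existing word
 * 0x1111222233334444 produces 0x11112222BEEF4444.
 */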
3377 static int ptwr_emulated_write(
3378 unsigned long addr,
3379 unsigned long val,
3380 unsigned int bytes,
3381 struct x86_emulate_ctxt *ctxt)
3383 return ptwr_emulated_update(addr, 0, val, bytes, 0);
3386 static int ptwr_emulated_cmpxchg(
3387 unsigned long addr,
3388 unsigned long old,
3389 unsigned long new,
3390 unsigned int bytes,
3391 struct x86_emulate_ctxt *ctxt)
3393 return ptwr_emulated_update(addr, old, new, bytes, 1);
3396 static int ptwr_emulated_cmpxchg8b(
3397 unsigned long addr,
3398 unsigned long old,
3399 unsigned long old_hi,
3400 unsigned long new,
3401 unsigned long new_hi,
3402 struct x86_emulate_ctxt *ctxt)
3404 if ( CONFIG_PAGING_LEVELS == 2 )
3405 return X86EMUL_UNHANDLEABLE;
3406 else
3407 return ptwr_emulated_update(
3408 addr, ((u64)old_hi << 32) | old, ((u64)new_hi << 32) | new, 8, 1);
3411 static struct x86_emulate_ops ptwr_emulate_ops = {
3412 .read_std = x86_emulate_read_std,
3413 .write_std = x86_emulate_write_std,
3414 .read_emulated = x86_emulate_read_std,
3415 .write_emulated = ptwr_emulated_write,
3416 .cmpxchg_emulated = ptwr_emulated_cmpxchg,
3417 .cmpxchg8b_emulated = ptwr_emulated_cmpxchg8b
3418 };
3420 /* Write page fault handler: check if guest is trying to modify a PTE. */
3421 int ptwr_do_page_fault(struct domain *d, unsigned long addr,
3422 struct cpu_user_regs *regs)
3424 unsigned long pfn;
3425 struct page_info *page;
3426 l1_pgentry_t *pl1e, pte;
3427 l2_pgentry_t *pl2e, l2e;
3428 int which, flags;
3429 unsigned long l2_idx;
3430 struct x86_emulate_ctxt emul_ctxt;
3432 if ( unlikely(shadow_mode_enabled(d)) )
3433 return 0;
3435 /*
3436 * Attempt to read the PTE that maps the VA being accessed. By checking for
3437 * PDE validity in the L2 we avoid many expensive fixups in __get_user().
3438 */
3439 if ( !(l2e_get_flags(__linear_l2_table[l2_linear_offset(addr)]) &
3440 _PAGE_PRESENT) ||
3441 __copy_from_user(&pte,&linear_pg_table[l1_linear_offset(addr)],
3442 sizeof(pte)) )
3444 return 0;
3447 pfn = l1e_get_pfn(pte);
3448 page = mfn_to_page(pfn);
3450 #ifdef CONFIG_X86_64
3451 #define WRPT_PTE_FLAGS (_PAGE_RW | _PAGE_PRESENT | _PAGE_USER)
3452 #else
3453 #define WRPT_PTE_FLAGS (_PAGE_RW | _PAGE_PRESENT)
3454 #endif
3456 /*
3457 * Check the required flags for a valid wrpt mapping. If the page is
3458 * already writable then we can return straight to the guest (SMP race).
3459 * We decide whether or not to propagate the fault by testing for write
3460 * permission in the page directories: we write back through the linear mapping.
3461 */
3462 if ( (flags = l1e_get_flags(pte) & WRPT_PTE_FLAGS) == WRPT_PTE_FLAGS )
3463 return __put_user(
3464 pte.l1, &linear_pg_table[l1_linear_offset(addr)].l1) ?
3465 0 : EXCRET_not_a_fault;
3467 /* We are looking only for read-only mappings of p.t. pages. */
3468 if ( ((flags | _PAGE_RW) != WRPT_PTE_FLAGS) ||
3469 ((page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table) ||
3470 ((page->u.inuse.type_info & PGT_count_mask) == 0) ||
3471 (page_get_owner(page) != d) )
3473 return 0;
3476 #if 0 /* Leave this in; it is useful for debugging. */
3477 goto emulate;
3478 #endif
3480 PTWR_PRINTK("ptwr_page_fault on l1 pt at va %lx, pfn %lx, eip %lx\n",
3481 addr, pfn, (unsigned long)regs->eip);
3483 /* Get the L2 index at which this L1 p.t. is always mapped. */
3484 l2_idx = page->u.inuse.type_info & PGT_va_mask;
3485 if ( unlikely(l2_idx >= PGT_va_unknown) )
3486 goto emulate; /* Urk! This L1 is mapped in multiple L2 slots! */
3487 l2_idx >>= PGT_va_shift;
3489 if ( unlikely(l2_idx == l2_linear_offset(addr)) )
3490 goto emulate; /* Urk! Pagetable maps itself! */
3492 /*
3493 * Is the L1 p.t. mapped into the current address space? If so we call it
3494 * an ACTIVE p.t., otherwise it is INACTIVE.
3495 */
3496 pl2e = &__linear_l2_table[l2_idx];
3497 which = PTWR_PT_INACTIVE;
3499 if ( (__get_user(l2e.l2, &pl2e->l2) == 0) && (l2e_get_pfn(l2e) == pfn) )
3501 /*
3502 * Check the PRESENT bit to set ACTIVE mode.
3503 * If the PRESENT bit is clear, we may be conflicting with the current
3504 * ACTIVE p.t. (it may be the same p.t. mapped at another virt addr).
3505 * The ptwr_flush call below will restore the PRESENT bit.
3506 */
3507 if ( likely(l2e_get_flags(l2e) & _PAGE_PRESENT) ||
3508 (d->arch.ptwr[PTWR_PT_ACTIVE].l1va &&
3509 (l2_idx == d->arch.ptwr[PTWR_PT_ACTIVE].l2_idx)) )
3510 which = PTWR_PT_ACTIVE;
3513 /*
3514 * If this is a multi-processor guest then ensure that the page is hooked
3515 * into at most one L2 table, which must be the one in use on this VCPU.
3516 */
3517 if ( (d->vcpu[0]->next_in_list != NULL) &&
3518 ((page->u.inuse.type_info & PGT_count_mask) !=
3519 (!!(page->u.inuse.type_info & PGT_pinned) +
3520 (which == PTWR_PT_ACTIVE))) )
3522 /* Could be conflicting writable mappings from other VCPUs. */
3523 cleanup_writable_pagetable(d);
3524 goto emulate;
3527 /*
3528 * We only allow one ACTIVE and one INACTIVE p.t. to be updated at a
3529 * time. If there is already one, we must flush it out.
3530 */
3531 if ( d->arch.ptwr[which].l1va )
3532 ptwr_flush(d, which);
3534 /*
3535 * If the last batch made no updates then we are probably stuck. Emulate this
3536 * update to ensure we make progress.
3537 */
3538 if ( d->arch.ptwr[which].prev_nr_updates == 0 )
3540 /* Ensure that we don't get stuck in an emulation-only rut. */
3541 d->arch.ptwr[which].prev_nr_updates = 1;
3542 goto emulate;
3545 PTWR_PRINTK("[%c] batched ptwr_page_fault at va %lx, pt for %08lx, "
3546 "pfn %lx\n", PTWR_PRINT_WHICH, addr,
3547 l2_idx << L2_PAGETABLE_SHIFT, pfn);
3549 d->arch.ptwr[which].l1va = addr | 1;
3550 d->arch.ptwr[which].l2_idx = l2_idx;
3551 d->arch.ptwr[which].vcpu = current;
3553 #ifdef PERF_ARRAYS
3554 d->arch.ptwr[which].eip = regs->eip;
3555 #endif
3557 /* For safety, disconnect the L1 p.t. page from current space. */
3558 if ( which == PTWR_PT_ACTIVE )
3560 l2e_remove_flags(*pl2e, _PAGE_PRESENT);
3561 flush_tlb_mask(d->domain_dirty_cpumask);
3564 /* Temporarily map the L1 page, and make a copy of it. */
3565 pl1e = map_domain_page(pfn);
3566 memcpy(d->arch.ptwr[which].page, pl1e, PAGE_SIZE);
3567 unmap_domain_page(pl1e);
3569 /* Finally, make the p.t. page writable by the guest OS. */
3570 l1e_add_flags(pte, _PAGE_RW);
3571 if ( unlikely(__put_user(pte.l1,
3572 &linear_pg_table[l1_linear_offset(addr)].l1)) )
3574 MEM_LOG("ptwr: Could not update pte at %p", (unsigned long *)
3575 &linear_pg_table[l1_linear_offset(addr)]);
3576 /* Toss the writable pagetable state and crash. */
3577 d->arch.ptwr[which].l1va = 0;
3578 domain_crash(d);
3579 return 0;
3582 return EXCRET_fault_fixed;
3584 emulate:
3585 emul_ctxt.regs = guest_cpu_user_regs();
3586 emul_ctxt.cr2 = addr;
3587 emul_ctxt.mode = X86EMUL_MODE_HOST;
3588 if ( x86_emulate_memop(&emul_ctxt, &ptwr_emulate_ops) )
3589 return 0;
3590 perfc_incrc(ptwr_emulations);
3591 return EXCRET_fault_fixed;
3594 int ptwr_init(struct domain *d)
3596 void *x = alloc_xenheap_page();
3597 void *y = alloc_xenheap_page();
3599 if ( (x == NULL) || (y == NULL) )
3601 free_xenheap_page(x);
3602 free_xenheap_page(y);
3603 return -ENOMEM;
3606 d->arch.ptwr[PTWR_PT_ACTIVE].page = x;
3607 d->arch.ptwr[PTWR_PT_INACTIVE].page = y;
3609 return 0;
3612 void ptwr_destroy(struct domain *d)
3614 LOCK_BIGLOCK(d);
3615 cleanup_writable_pagetable(d);
3616 UNLOCK_BIGLOCK(d);
3617 free_xenheap_page(d->arch.ptwr[PTWR_PT_ACTIVE].page);
3618 free_xenheap_page(d->arch.ptwr[PTWR_PT_INACTIVE].page);
3621 void cleanup_writable_pagetable(struct domain *d)
3623 if ( unlikely(!VM_ASSIST(d, VMASST_TYPE_writable_pagetables)) )
3624 return;
3626 if ( unlikely(shadow_mode_enabled(d)) )
3628 shadow_sync_all(d);
3630 else
3632 if ( d->arch.ptwr[PTWR_PT_ACTIVE].l1va )
3633 ptwr_flush(d, PTWR_PT_ACTIVE);
3634 if ( d->arch.ptwr[PTWR_PT_INACTIVE].l1va )
3635 ptwr_flush(d, PTWR_PT_INACTIVE);
3639 int map_pages_to_xen(
3640 unsigned long virt,
3641 unsigned long mfn,
3642 unsigned long nr_mfns,
3643 unsigned long flags)
3645 l2_pgentry_t *pl2e, ol2e;
3646 l1_pgentry_t *pl1e, ol1e;
3647 unsigned int i;
3649 unsigned int map_small_pages = !!(flags & MAP_SMALL_PAGES);
3650 flags &= ~MAP_SMALL_PAGES;
3652 while ( nr_mfns != 0 )
3654 pl2e = virt_to_xen_l2e(virt);
3656 if ( ((((virt>>PAGE_SHIFT) | mfn) & ((1<<PAGETABLE_ORDER)-1)) == 0) &&
3657 (nr_mfns >= (1<<PAGETABLE_ORDER)) &&
3658 !map_small_pages )
3660 /* Super-page mapping. */
3661 ol2e = *pl2e;
3662 *pl2e = l2e_from_pfn(mfn, flags|_PAGE_PSE);
3664 if ( (l2e_get_flags(ol2e) & _PAGE_PRESENT) )
3666 local_flush_tlb_pge();
3667 if ( !(l2e_get_flags(ol2e) & _PAGE_PSE) )
3668 free_xen_pagetable(l2e_get_page(*pl2e));
3671 virt += 1UL << L2_PAGETABLE_SHIFT;
3672 mfn += 1UL << PAGETABLE_ORDER;
3673 nr_mfns -= 1UL << PAGETABLE_ORDER;
3675 else
3677 /* Normal page mapping. */
3678 if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) )
3680 pl1e = page_to_virt(alloc_xen_pagetable());
3681 clear_page(pl1e);
3682 *pl2e = l2e_from_page(virt_to_page(pl1e), __PAGE_HYPERVISOR);
3684 else if ( l2e_get_flags(*pl2e) & _PAGE_PSE )
3686 pl1e = page_to_virt(alloc_xen_pagetable());
3687 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
3688 pl1e[i] = l1e_from_pfn(
3689 l2e_get_pfn(*pl2e) + i,
3690 l2e_get_flags(*pl2e) & ~_PAGE_PSE);
3691 *pl2e = l2e_from_page(virt_to_page(pl1e), __PAGE_HYPERVISOR);
3692 local_flush_tlb_pge();
3695 pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(virt);
3696 ol1e = *pl1e;
3697 *pl1e = l1e_from_pfn(mfn, flags);
3698 if ( (l1e_get_flags(ol1e) & _PAGE_PRESENT) )
3699 local_flush_tlb_one(virt);
3701 virt += 1UL << L1_PAGETABLE_SHIFT;
3702 mfn += 1UL;
3703 nr_mfns -= 1UL;
3707 return 0;
3710 void __set_fixmap(
3711 enum fixed_addresses idx, unsigned long p, unsigned long flags)
3713 if ( unlikely(idx >= __end_of_fixed_addresses) )
3714 BUG();
3715 map_pages_to_xen(fix_to_virt(idx), p >> PAGE_SHIFT, 1, flags);
3718 #ifdef MEMORY_GUARD
3720 void memguard_init(void)
3722 map_pages_to_xen(
3723 PAGE_OFFSET, 0, xenheap_phys_end >> PAGE_SHIFT,
3724 __PAGE_HYPERVISOR|MAP_SMALL_PAGES);
3727 static void __memguard_change_range(void *p, unsigned long l, int guard)
3729 unsigned long _p = (unsigned long)p;
3730 unsigned long _l = (unsigned long)l;
3731 unsigned long flags = __PAGE_HYPERVISOR | MAP_SMALL_PAGES;
3733 /* Ensure we are dealing with a page-aligned whole number of pages. */
3734 ASSERT((_p&PAGE_MASK) != 0);
3735 ASSERT((_l&PAGE_MASK) != 0);
3736 ASSERT((_p&~PAGE_MASK) == 0);
3737 ASSERT((_l&~PAGE_MASK) == 0);
3739 if ( guard )
3740 flags &= ~_PAGE_PRESENT;
3742 map_pages_to_xen(
3743 _p, virt_to_maddr(p) >> PAGE_SHIFT, _l >> PAGE_SHIFT, flags);
3746 void memguard_guard_range(void *p, unsigned long l)
3748 __memguard_change_range(p, l, 1);
3751 void memguard_unguard_range(void *p, unsigned long l)
3753 __memguard_change_range(p, l, 0);
3756 #endif
3758 /*
3759 * Local variables:
3760 * mode: C
3761 * c-set-style: "BSD"
3762 * c-basic-offset: 4
3763 * tab-width: 4
3764 * indent-tabs-mode: nil
3765 * End:
3766 */