ia64/xen-unstable: xen/arch/x86/mm.c @ 16407:2e5d922b7ee3

xen: Allow granting of foreign access to iomem pages, and with
arbitrary cache attributes.

Signed-off-by: Kieran Mansley <kmansley@solarflare.com>
Signed-off-by: Keir Fraser <keir.fraser@eu.citrix.com>

author   Keir Fraser <keir.fraser@citrix.com>
date     Tue Nov 20 17:26:48 2007 +0000 (2007-11-20)
parents  5b8730c78454
children 8c305873f2b8
1 /******************************************************************************
2 * arch/x86/mm.c
3 *
4 * Copyright (c) 2002-2005 K A Fraser
5 * Copyright (c) 2004 Christian Limpach
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
22 /*
23 * A description of the x86 page table API:
24 *
25 * Domains trap to do_mmu_update with a list of update requests.
26 * This is a list of (ptr, val) pairs, where the requested operation
27 * is *ptr = val.
28 *
29 * Reference counting of pages:
30 * ----------------------------
31 * Each page has two refcounts: tot_count and type_count.
32 *
33 * TOT_COUNT is the obvious reference count. It counts all uses of a
34 * physical page frame by a domain, including uses as a page directory,
35 * a page table, or simple mappings via a PTE. This count prevents a
36 * domain from releasing a frame back to the free pool when it still holds
37 * a reference to it.
38 *
39 * TYPE_COUNT is more subtle. A frame can be put to one of three
40 * mutually-exclusive uses: it might be used as a page directory, or a
41 * page table, or it may be mapped writable by the domain [of course, a
42 * frame may not be used in any of these three ways!].
43 * So, type_count is a count of the number of times a frame is being
44 * referred to in its current incarnation. Therefore, a page can only
45 * change its type when its type count is zero.
46 *
47 * Pinning the page type:
48 * ----------------------
49 * The type of a page can be pinned/unpinned with the commands
50 * MMUEXT_[UN]PIN_L?_TABLE. Each page can be pinned exactly once (that is,
51 * pinning is not reference counted, so it can't be nested).
52 * This is useful to prevent a page's type count falling to zero, at which
53 * point safety checks would need to be carried out next time the count
54 * is increased again.
55 *
56 * A further note on writable page mappings:
57 * -----------------------------------------
58 * For simplicity, the count of writable mappings for a page may not
59 * correspond to reality. The 'writable count' is incremented for every
60 * PTE which maps the page with the _PAGE_RW flag set. However, for
61 * write access to be possible the page directory entry must also have
62 * its _PAGE_RW bit set. We do not check this as it complicates the
63 * reference counting considerably [consider the case of multiple
64 * directory entries referencing a single page table, some with the RW
65 * bit set, others not -- it starts getting a bit messy].
66 * In normal use, this simplification shouldn't be a problem.
67 * However, the logic can be added if required.
68 *
69 * One more note on read-only page mappings:
70 * -----------------------------------------
71 * We want domains to be able to map pages for read-only access. The
72 * main reason is that page tables and directories should be readable
73 * by a domain, but it would not be safe for them to be writable.
74 * However, domains have free access to rings 1 & 2 of the Intel
75 * privilege model. In terms of page protection, these are considered
76 * to be part of 'supervisor mode'. The WP bit in CR0 controls whether
77 * read-only restrictions are respected in supervisor mode -- if the
78 * bit is clear then any mapped page is writable.
79 *
80 * We get round this by always setting the WP bit and disallowing
81 * updates to it. This is very unlikely to cause a problem for guest
82 * OSes, which will generally use the WP bit to simplify copy-on-write
83 * implementation (in that case, the OS wants a fault when it writes to
84 * an application-supplied buffer).
85 */
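/*
 * Editorial sketch (not part of the original file): how a PV guest feeds the
 * (ptr,val) interface described above. It assumes the public mmu_update_t
 * layout from public/xen.h and a HYPERVISOR_mmu_update() hypercall wrapper as
 * provided by guest kernels; names prefixed 'example_' are hypothetical.
 */
#if 0
static void example_set_pte(uint64_t pte_maddr, uint64_t new_val)
{
    struct mmu_update req;
    int done = 0;

    /* Low bits of 'ptr' select the command; the rest is the PTE's machine address. */
    req.ptr = pte_maddr | MMU_NORMAL_PT_UPDATE;
    req.val = new_val;                 /* requested operation: *ptr = val */

    /* Single-element batch, validated by do_mmu_update() in the hypervisor. */
    (void)HYPERVISOR_mmu_update(&req, 1, &done, DOMID_SELF);
}
#endif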
87 #include <xen/config.h>
88 #include <xen/init.h>
89 #include <xen/kernel.h>
90 #include <xen/lib.h>
91 #include <xen/mm.h>
92 #include <xen/domain.h>
93 #include <xen/sched.h>
94 #include <xen/errno.h>
95 #include <xen/perfc.h>
96 #include <xen/irq.h>
97 #include <xen/softirq.h>
98 #include <xen/domain_page.h>
99 #include <xen/event.h>
100 #include <xen/iocap.h>
101 #include <xen/guest_access.h>
102 #include <asm/paging.h>
103 #include <asm/shadow.h>
104 #include <asm/page.h>
105 #include <asm/flushtlb.h>
106 #include <asm/io.h>
107 #include <asm/ldt.h>
108 #include <asm/x86_emulate.h>
109 #include <asm/e820.h>
110 #include <asm/hypercall.h>
111 #include <asm/shared.h>
112 #include <public/memory.h>
113 #include <xsm/xsm.h>
114 #include <xen/trace.h>
116 #define MEM_LOG(_f, _a...) gdprintk(XENLOG_WARNING , _f "\n" , ## _a)
118 /*
119 * PTE updates can be done with ordinary writes except:
120 * 1. Debug builds get extra checking by using CMPXCHG[8B].
121 * 2. PAE builds perform an atomic 8-byte store with CMPXCHG8B.
122 */
123 #if !defined(NDEBUG) || defined(CONFIG_X86_PAE)
124 #define PTE_UPDATE_WITH_CMPXCHG
125 #endif
127 /* Used to defer flushing of memory structures. */
128 struct percpu_mm_info {
129 #define DOP_FLUSH_TLB (1<<0) /* Flush the local TLB. */
130 #define DOP_FLUSH_ALL_TLBS (1<<1) /* Flush TLBs of all VCPUs of current dom. */
131 #define DOP_RELOAD_LDT (1<<2) /* Reload the LDT shadow mapping. */
132 unsigned int deferred_ops;
133 /* If non-NULL, specifies a foreign subject domain for some operations. */
134 struct domain *foreign;
135 };
136 static DEFINE_PER_CPU(struct percpu_mm_info, percpu_mm_info);
138 /*
139 * Returns the current foreign domain; defaults to the currently-executing
140 * domain if a foreign override hasn't been specified.
141 */
142 #define FOREIGNDOM (this_cpu(percpu_mm_info).foreign ?: current->domain)
144 /* Private domain structs for DOMID_XEN and DOMID_IO. */
145 static struct domain *dom_xen, *dom_io;
147 /* Frame table and its size in pages. */
148 struct page_info *frame_table;
149 unsigned long max_page;
150 unsigned long total_pages;
152 #define PAGE_CACHE_ATTRS (_PAGE_PAT|_PAGE_PCD|_PAGE_PWT)
154 #define l1_disallow_mask(d) \
155 ((d != dom_io) && \
156 (rangeset_is_empty((d)->iomem_caps) && \
157 rangeset_is_empty((d)->arch.ioport_caps)) ? \
158 L1_DISALLOW_MASK : (L1_DISALLOW_MASK & ~PAGE_CACHE_ATTRS))
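/*
 * Editorial illustration (hypothetical MFN 'io_mfn'): for a domain with
 * non-empty iomem_caps/ioport_caps, l1_disallow_mask() leaves the
 * PAT/PCD/PWT bits available, so an uncacheable I/O mapping like the one
 * below passes the flag check in get_page_from_l1e().
 */
#if 0
l1_pgentry_t uc_e = l1e_from_pfn(io_mfn, _PAGE_PRESENT | _PAGE_RW | _PAGE_PCD);
#endif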
160 #ifdef CONFIG_COMPAT
161 l2_pgentry_t *compat_idle_pg_table_l2 = NULL;
162 #define l3_disallow_mask(d) (!is_pv_32on64_domain(d) ? \
163 L3_DISALLOW_MASK : \
164 COMPAT_L3_DISALLOW_MASK)
165 #else
166 #define l3_disallow_mask(d) L3_DISALLOW_MASK
167 #endif
169 static void queue_deferred_ops(struct domain *d, unsigned int ops)
170 {
171 ASSERT(d == current->domain);
172 this_cpu(percpu_mm_info).deferred_ops |= ops;
173 }
175 void __init init_frametable(void)
176 {
177 unsigned long nr_pages, page_step, i, mfn;
179 frame_table = (struct page_info *)FRAMETABLE_VIRT_START;
181 nr_pages = PFN_UP(max_page * sizeof(*frame_table));
182 page_step = (1 << L2_PAGETABLE_SHIFT) >> PAGE_SHIFT;
184 for ( i = 0; i < nr_pages; i += page_step )
185 {
186 mfn = alloc_boot_pages(min(nr_pages - i, page_step), page_step);
187 if ( mfn == 0 )
188 panic("Not enough memory for frame table\n");
189 map_pages_to_xen(
190 FRAMETABLE_VIRT_START + (i << PAGE_SHIFT),
191 mfn, page_step, PAGE_HYPERVISOR);
192 }
194 memset(frame_table, 0, nr_pages << PAGE_SHIFT);
195 }
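/*
 * Editorial note on the sizing above: nr_pages is ceil(max_page *
 * sizeof(struct page_info) / PAGE_SIZE), and page_step is one superpage's
 * worth of 4kB frames (512 on 64-bit/PAE builds, where L2_PAGETABLE_SHIFT
 * is 21), so each alloc_boot_pages() call fills an aligned chunk that
 * map_pages_to_xen() can map with large pages.
 */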
197 void __init arch_init_memory(void)
198 {
199 extern void subarch_init_memory(void);
201 unsigned long i, pfn, rstart_pfn, rend_pfn;
203 /*
204 * Initialise our DOMID_XEN domain.
205 * Any Xen-heap pages that we will allow to be mapped will have
206 * their domain field set to dom_xen.
207 */
208 dom_xen = alloc_domain(DOMID_XEN);
209 BUG_ON(dom_xen == NULL);
211 /*
212 * Initialise our DOMID_IO domain.
213 * This domain owns I/O pages that are within the range of the page_info
214 * array. Mappings occur at the privilege level of the caller.
215 */
216 dom_io = alloc_domain(DOMID_IO);
217 BUG_ON(dom_io == NULL);
219 /* First 1MB of RAM is historically marked as I/O. */
220 for ( i = 0; i < 0x100; i++ )
221 share_xen_page_with_guest(mfn_to_page(i), dom_io, XENSHARE_writable);
223 /* Any areas not specified as RAM by the e820 map are considered I/O. */
224 for ( i = 0, pfn = 0; pfn < max_page; i++ )
225 {
226 while ( (i < e820.nr_map) &&
227 (e820.map[i].type != E820_RAM) &&
228 (e820.map[i].type != E820_UNUSABLE) )
229 i++;
231 if ( i >= e820.nr_map )
232 {
233 /* No more RAM regions: mark as I/O right to end of memory map. */
234 rstart_pfn = rend_pfn = max_page;
235 }
236 else
237 {
238 /* Mark as I/O just up to the start of the next RAM region. */
239 rstart_pfn = min_t(unsigned long, max_page,
240 PFN_UP(e820.map[i].addr));
241 rend_pfn = max_t(unsigned long, rstart_pfn,
242 PFN_DOWN(e820.map[i].addr + e820.map[i].size));
243 }
245 /* Mark as I/O up to next RAM region. */
246 for ( ; pfn < rstart_pfn; pfn++ )
247 {
248 BUG_ON(!mfn_valid(pfn));
249 share_xen_page_with_guest(
250 mfn_to_page(pfn), dom_io, XENSHARE_writable);
251 }
253 /* Skip the RAM region. */
254 pfn = rend_pfn;
255 }
257 subarch_init_memory();
258 }
260 int memory_is_conventional_ram(paddr_t p)
261 {
262 int i;
264 for ( i = 0; i < e820.nr_map; i++ )
265 {
266 if ( (e820.map[i].type == E820_RAM) &&
267 (e820.map[i].addr <= p) &&
268 (e820.map[i].size > p) )
269 return 1;
270 }
272 return 0;
273 }
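/*
 * Editorial example: on a typical BIOS-provided e820 map the legacy VGA
 * window at 0xA0000 is not reported as E820_RAM, so
 * memory_is_conventional_ram(0xA0000) returns 0, while an address inside
 * the first usable RAM entry returns 1.
 */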
275 unsigned long domain_get_maximum_gpfn(struct domain *d)
276 {
277 if ( is_hvm_domain(d) )
278 return d->arch.p2m.max_mapped_pfn;
279 /* NB. PV guests specify nr_pfns rather than max_pfn so we adjust here. */
280 return arch_get_max_pfn(d) - 1;
281 }
283 void share_xen_page_with_guest(
284 struct page_info *page, struct domain *d, int readonly)
285 {
286 if ( page_get_owner(page) == d )
287 return;
289 set_gpfn_from_mfn(page_to_mfn(page), INVALID_M2P_ENTRY);
291 spin_lock(&d->page_alloc_lock);
293 /* The incremented type count pins as writable or read-only. */
294 page->u.inuse.type_info = (readonly ? PGT_none : PGT_writable_page);
295 page->u.inuse.type_info |= PGT_validated | 1;
297 page_set_owner(page, d);
298 wmb(); /* install valid domain ptr before updating refcnt. */
299 ASSERT(page->count_info == 0);
301 /* Only add to the allocation list if the domain isn't dying. */
302 if ( !d->is_dying )
303 {
304 page->count_info |= PGC_allocated | 1;
305 if ( unlikely(d->xenheap_pages++ == 0) )
306 get_knownalive_domain(d);
307 list_add_tail(&page->list, &d->xenpage_list);
308 }
310 spin_unlock(&d->page_alloc_lock);
311 }
313 void share_xen_page_with_privileged_guests(
314 struct page_info *page, int readonly)
315 {
316 share_xen_page_with_guest(page, dom_xen, readonly);
317 }
319 #if defined(CONFIG_X86_PAE)
321 #ifdef NDEBUG
322 /* Only PDPTs above 4GB boundary need to be shadowed in low memory. */
323 #define l3tab_needs_shadow(mfn) ((mfn) >= 0x100000)
324 #else
325 /*
326 * In debug builds we shadow a selection of <4GB PDPTs to exercise code paths.
327 * We cannot safely shadow the idle page table, nor shadow (v1) page tables
328 * (detected by lack of an owning domain). As required for correctness, we
329 * always shadow PDPTs above 4GB.
330 */
331 #define l3tab_needs_shadow(mfn) \
332 (((((mfn) << PAGE_SHIFT) != __pa(idle_pg_table)) && \
333 (page_get_owner(mfn_to_page(mfn)) != NULL) && \
334 ((mfn) & 1)) || /* odd MFNs are shadowed */ \
335 ((mfn) >= 0x100000))
336 #endif
338 static l1_pgentry_t *fix_pae_highmem_pl1e;
340 /* Cache the address of PAE high-memory fixmap page tables. */
341 static int __init cache_pae_fixmap_address(void)
342 {
343 unsigned long fixmap_base = fix_to_virt(FIX_PAE_HIGHMEM_0);
344 l2_pgentry_t *pl2e = virt_to_xen_l2e(fixmap_base);
345 fix_pae_highmem_pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(fixmap_base);
346 return 0;
347 }
348 __initcall(cache_pae_fixmap_address);
350 static DEFINE_PER_CPU(u32, make_cr3_timestamp);
352 void make_cr3(struct vcpu *v, unsigned long mfn)
353 /* Takes the MFN of a PAE l3 table, copies the contents to below 4GB if
354 * necessary, and sets v->arch.cr3 to the value to load in CR3. */
355 {
356 l3_pgentry_t *highmem_l3tab, *lowmem_l3tab;
357 struct pae_l3_cache *cache = &v->arch.pae_l3_cache;
358 unsigned int cpu = smp_processor_id();
360 /* Fast path: does this mfn need a shadow at all? */
361 if ( !l3tab_needs_shadow(mfn) )
362 {
363 v->arch.cr3 = mfn << PAGE_SHIFT;
364 /* Cache is no longer in use or valid */
365 cache->high_mfn = 0;
366 return;
367 }
369 /* Caching logic is not interrupt safe. */
370 ASSERT(!in_irq());
372 /* Protects against pae_flush_pgd(). */
373 spin_lock(&cache->lock);
375 cache->inuse_idx ^= 1;
376 cache->high_mfn = mfn;
378 /* Map the guest L3 table and copy to the chosen low-memory cache. */
379 l1e_write(fix_pae_highmem_pl1e-cpu, l1e_from_pfn(mfn, __PAGE_HYPERVISOR));
380 /* First check the previous high mapping can't be in the TLB.
381 * (i.e. have we loaded CR3 since we last did this?) */
382 if ( unlikely(this_cpu(make_cr3_timestamp) == this_cpu(tlbflush_time)) )
383 flush_tlb_one_local(fix_to_virt(FIX_PAE_HIGHMEM_0 + cpu));
384 highmem_l3tab = (l3_pgentry_t *)fix_to_virt(FIX_PAE_HIGHMEM_0 + cpu);
385 lowmem_l3tab = cache->table[cache->inuse_idx];
386 memcpy(lowmem_l3tab, highmem_l3tab, sizeof(cache->table[0]));
387 l1e_write(fix_pae_highmem_pl1e-cpu, l1e_empty());
388 this_cpu(make_cr3_timestamp) = this_cpu(tlbflush_time);
390 v->arch.cr3 = __pa(lowmem_l3tab);
392 spin_unlock(&cache->lock);
393 }
395 #else /* !CONFIG_X86_PAE */
397 void make_cr3(struct vcpu *v, unsigned long mfn)
398 {
399 v->arch.cr3 = mfn << PAGE_SHIFT;
400 }
402 #endif /* !CONFIG_X86_PAE */
404 void write_ptbase(struct vcpu *v)
405 {
406 write_cr3(v->arch.cr3);
407 }
409 /*
410 * Should be called after CR3 is updated.
411 *
412 * Uses values found in vcpu->arch.(guest_table and guest_table_user), and
413 * for HVM guests, arch.monitor_table and hvm's guest CR3.
414 *
415 * Update ref counts to shadow tables appropriately.
416 */
417 void update_cr3(struct vcpu *v)
418 {
419 unsigned long cr3_mfn=0;
421 if ( paging_mode_enabled(v->domain) )
422 {
423 paging_update_cr3(v);
424 return;
425 }
427 #if CONFIG_PAGING_LEVELS == 4
428 if ( !(v->arch.flags & TF_kernel_mode) )
429 cr3_mfn = pagetable_get_pfn(v->arch.guest_table_user);
430 else
431 #endif
432 cr3_mfn = pagetable_get_pfn(v->arch.guest_table);
434 make_cr3(v, cr3_mfn);
435 }
438 static void invalidate_shadow_ldt(struct vcpu *v)
439 {
440 int i;
441 unsigned long pfn;
442 struct page_info *page;
444 if ( v->arch.shadow_ldt_mapcnt == 0 )
445 return;
447 v->arch.shadow_ldt_mapcnt = 0;
449 for ( i = 16; i < 32; i++ )
450 {
451 pfn = l1e_get_pfn(v->arch.perdomain_ptes[i]);
452 if ( pfn == 0 ) continue;
453 l1e_write(&v->arch.perdomain_ptes[i], l1e_empty());
454 page = mfn_to_page(pfn);
455 ASSERT_PAGE_IS_TYPE(page, PGT_ldt_page);
456 ASSERT_PAGE_IS_DOMAIN(page, v->domain);
457 put_page_and_type(page);
458 }
460 /* Dispose of the (now possibly invalid) mappings from the TLB. */
461 if ( v == current )
462 queue_deferred_ops(v->domain, DOP_FLUSH_TLB | DOP_RELOAD_LDT);
463 else
464 flush_tlb_mask(v->domain->domain_dirty_cpumask);
465 }
468 static int alloc_segdesc_page(struct page_info *page)
469 {
470 struct desc_struct *descs;
471 int i;
473 descs = map_domain_page(page_to_mfn(page));
475 for ( i = 0; i < 512; i++ )
476 if ( unlikely(!check_descriptor(page_get_owner(page), &descs[i])) )
477 goto fail;
479 unmap_domain_page(descs);
480 return 1;
482 fail:
483 unmap_domain_page(descs);
484 return 0;
485 }
488 /* Map shadow page at offset @off. */
489 int map_ldt_shadow_page(unsigned int off)
490 {
491 struct vcpu *v = current;
492 struct domain *d = v->domain;
493 unsigned long gmfn, mfn;
494 l1_pgentry_t l1e, nl1e;
495 unsigned long gva = v->arch.guest_context.ldt_base + (off << PAGE_SHIFT);
496 int okay;
498 BUG_ON(unlikely(in_irq()));
500 guest_get_eff_kern_l1e(v, gva, &l1e);
501 if ( unlikely(!(l1e_get_flags(l1e) & _PAGE_PRESENT)) )
502 return 0;
504 gmfn = l1e_get_pfn(l1e);
505 mfn = gmfn_to_mfn(d, gmfn);
506 if ( unlikely(!mfn_valid(mfn)) )
507 return 0;
509 okay = get_page_and_type(mfn_to_page(mfn), d, PGT_ldt_page);
510 if ( unlikely(!okay) )
511 return 0;
513 nl1e = l1e_from_pfn(mfn, l1e_get_flags(l1e) | _PAGE_RW);
515 l1e_write(&v->arch.perdomain_ptes[off + 16], nl1e);
516 v->arch.shadow_ldt_mapcnt++;
518 return 1;
519 }
522 static int get_page_from_pagenr(unsigned long page_nr, struct domain *d)
523 {
524 struct page_info *page = mfn_to_page(page_nr);
526 if ( unlikely(!mfn_valid(page_nr)) || unlikely(!get_page(page, d)) )
527 {
528 MEM_LOG("Could not get page ref for pfn %lx", page_nr);
529 return 0;
530 }
532 return 1;
533 }
536 static int get_page_and_type_from_pagenr(unsigned long page_nr,
537 unsigned long type,
538 struct domain *d)
539 {
540 struct page_info *page = mfn_to_page(page_nr);
542 if ( unlikely(!get_page_from_pagenr(page_nr, d)) )
543 return 0;
545 if ( unlikely(!get_page_type(page, type)) )
546 {
547 put_page(page);
548 return 0;
549 }
551 return 1;
552 }
554 /*
555 * We allow root tables to map each other (a.k.a. linear page tables). It
556 * needs some special care with reference counts and access permissions:
557 * 1. The mapping entry must be read-only, or the guest may get write access
558 * to its own PTEs.
559 * 2. We must only bump the reference counts for an *already validated*
560 * L2 table, or we can end up in a deadlock in get_page_type() by waiting
561 * on a validation that is required to complete that validation.
562 * 3. We only need to increment the reference counts for the mapped page
563 * frame if it is mapped by a different root table. This is sufficient and
564 * also necessary to allow validation of a root table mapping itself.
565 */
566 #define define_get_linear_pagetable(level) \
567 static int \
568 get_##level##_linear_pagetable( \
569 level##_pgentry_t pde, unsigned long pde_pfn, struct domain *d) \
570 { \
571 unsigned long x, y; \
572 struct page_info *page; \
573 unsigned long pfn; \
574 \
575 if ( (level##e_get_flags(pde) & _PAGE_RW) ) \
576 { \
577 MEM_LOG("Attempt to create linear p.t. with write perms"); \
578 return 0; \
579 } \
580 \
581 if ( (pfn = level##e_get_pfn(pde)) != pde_pfn ) \
582 { \
583 /* Make sure the mapped frame belongs to the correct domain. */ \
584 if ( unlikely(!get_page_from_pagenr(pfn, d)) ) \
585 return 0; \
586 \
587 /* \
588 * Ensure that the mapped frame is an already-validated page table. \
589 * If so, atomically increment the count (checking for overflow). \
590 */ \
591 page = mfn_to_page(pfn); \
592 y = page->u.inuse.type_info; \
593 do { \
594 x = y; \
595 if ( unlikely((x & PGT_count_mask) == PGT_count_mask) || \
596 unlikely((x & (PGT_type_mask|PGT_validated)) != \
597 (PGT_##level##_page_table|PGT_validated)) ) \
598 { \
599 put_page(page); \
600 return 0; \
601 } \
602 } \
603 while ( (y = cmpxchg(&page->u.inuse.type_info, x, x + 1)) != x ); \
604 } \
605 \
606 return 1; \
607 }
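/*
 * Editorial illustration (hypothetical MFN 'l2_mfn'): per rule 1 above, a
 * guest L2 entry that maps its own L2 table must be read-only. When the
 * ordinary PGT_l1_page_table type grab fails for such an entry,
 * get_page_from_l2e() falls back to get_l2_linear_pagetable(), which
 * enforces the rules listed in the comment above.
 */
#if 0
l2_pgentry_t linear_e = l2e_from_pfn(l2_mfn, _PAGE_PRESENT | _PAGE_ACCESSED);
/* Adding _PAGE_RW here would trip the "linear p.t. with write perms" check. */
#endif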
610 int is_iomem_page(unsigned long mfn)
611 {
612 return (!mfn_valid(mfn) || (page_get_owner(mfn_to_page(mfn)) == dom_io));
613 }
616 int
617 get_page_from_l1e(
618 l1_pgentry_t l1e, struct domain *d)
619 {
620 unsigned long mfn = l1e_get_pfn(l1e);
621 struct page_info *page = mfn_to_page(mfn);
622 uint32_t l1f = l1e_get_flags(l1e);
623 int okay;
625 if ( !(l1f & _PAGE_PRESENT) )
626 return 1;
628 if ( unlikely(l1f & l1_disallow_mask(d)) )
629 {
630 MEM_LOG("Bad L1 flags %x", l1f & l1_disallow_mask(d));
631 return 0;
632 }
634 if ( is_iomem_page(mfn) )
635 {
636 /* DOMID_IO reverts to caller for privilege checks. */
637 if ( d == dom_io )
638 d = current->domain;
640 if ( !iomem_access_permitted(d, mfn, mfn) )
641 {
642 if ( mfn != (PADDR_MASK >> PAGE_SHIFT) ) /* INVALID_MFN? */
643 MEM_LOG("Non-privileged (%u) attempt to map I/O space %08lx",
644 d->domain_id, mfn);
645 return 0;
646 }
648 return 1;
649 }
651 /* Foreign mappings into guests in shadow external mode don't
652 * contribute to writeable mapping refcounts. (This allows the
653 * qemu-dm helper process in dom0 to map the domain's memory without
654 * messing up the count of "real" writable mappings.) */
655 okay = (((l1f & _PAGE_RW) &&
656 !(unlikely(paging_mode_external(d) && (d != current->domain))))
657 ? get_page_and_type(page, d, PGT_writable_page)
658 : get_page(page, d));
659 if ( !okay )
660 {
661 MEM_LOG("Error getting mfn %lx (pfn %lx) from L1 entry %" PRIpte
662 " for dom%d",
663 mfn, get_gpfn_from_mfn(mfn),
664 l1e_get_intpte(l1e), d->domain_id);
665 }
666 else if ( pte_flags_to_cacheattr(l1f) !=
667 ((page->count_info >> PGC_cacheattr_base) & 7) )
668 {
669 uint32_t x, nx, y = page->count_info;
670 uint32_t cacheattr = pte_flags_to_cacheattr(l1f);
672 if ( is_xen_heap_page(page) )
673 {
674 if ( (l1f & _PAGE_RW) &&
675 !(unlikely(paging_mode_external(d) &&
676 (d != current->domain))) )
677 put_page_type(page);
678 put_page(page);
679 MEM_LOG("Attempt to change cache attributes of Xen heap page");
680 return 0;
681 }
683 while ( ((y >> PGC_cacheattr_base) & 7) != cacheattr )
684 {
685 x = y;
686 nx = (x & ~PGC_cacheattr_mask) | (cacheattr << PGC_cacheattr_base);
687 y = cmpxchg(&page->count_info, x, nx);
688 }
690 #ifdef __x86_64__
691 map_pages_to_xen((unsigned long)mfn_to_virt(mfn), mfn, 1,
692 PAGE_HYPERVISOR | cacheattr_to_pte_flags(cacheattr));
693 #endif
694 }
696 return okay;
697 }
700 /* NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. */
701 define_get_linear_pagetable(l2);
702 static int
703 get_page_from_l2e(
704 l2_pgentry_t l2e, unsigned long pfn, struct domain *d)
705 {
706 int rc;
708 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
709 return 1;
711 if ( unlikely((l2e_get_flags(l2e) & L2_DISALLOW_MASK)) )
712 {
713 MEM_LOG("Bad L2 flags %x", l2e_get_flags(l2e) & L2_DISALLOW_MASK);
714 return 0;
715 }
717 rc = get_page_and_type_from_pagenr(l2e_get_pfn(l2e), PGT_l1_page_table, d);
718 if ( unlikely(!rc) )
719 rc = get_l2_linear_pagetable(l2e, pfn, d);
721 return rc;
722 }
725 #if CONFIG_PAGING_LEVELS >= 3
726 define_get_linear_pagetable(l3);
727 static int
728 get_page_from_l3e(
729 l3_pgentry_t l3e, unsigned long pfn, struct domain *d)
730 {
731 int rc;
733 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
734 return 1;
736 if ( unlikely((l3e_get_flags(l3e) & l3_disallow_mask(d))) )
737 {
738 MEM_LOG("Bad L3 flags %x", l3e_get_flags(l3e) & l3_disallow_mask(d));
739 return 0;
740 }
742 rc = get_page_and_type_from_pagenr(l3e_get_pfn(l3e), PGT_l2_page_table, d);
743 if ( unlikely(!rc) )
744 rc = get_l3_linear_pagetable(l3e, pfn, d);
746 return rc;
747 }
748 #endif /* 3 level */
750 #if CONFIG_PAGING_LEVELS >= 4
751 define_get_linear_pagetable(l4);
752 static int
753 get_page_from_l4e(
754 l4_pgentry_t l4e, unsigned long pfn, struct domain *d)
755 {
756 int rc;
758 if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
759 return 1;
761 if ( unlikely((l4e_get_flags(l4e) & L4_DISALLOW_MASK)) )
762 {
763 MEM_LOG("Bad L4 flags %x", l4e_get_flags(l4e) & L4_DISALLOW_MASK);
764 return 0;
765 }
767 rc = get_page_and_type_from_pagenr(l4e_get_pfn(l4e), PGT_l3_page_table, d);
768 if ( unlikely(!rc) )
769 rc = get_l4_linear_pagetable(l4e, pfn, d);
771 return rc;
772 }
773 #endif /* 4 level */
775 #ifdef __x86_64__
777 #ifdef USER_MAPPINGS_ARE_GLOBAL
778 #define adjust_guest_l1e(pl1e, d) \
779 do { \
780 if ( likely(l1e_get_flags((pl1e)) & _PAGE_PRESENT) && \
781 likely(!is_pv_32on64_domain(d)) ) \
782 { \
783 /* _PAGE_GUEST_KERNEL page cannot have the Global bit set. */ \
784 if ( (l1e_get_flags((pl1e)) & (_PAGE_GUEST_KERNEL|_PAGE_GLOBAL)) \
785 == (_PAGE_GUEST_KERNEL|_PAGE_GLOBAL) ) \
786 MEM_LOG("Global bit is set to kernel page %lx", \
787 l1e_get_pfn((pl1e))); \
788 if ( !(l1e_get_flags((pl1e)) & _PAGE_USER) ) \
789 l1e_add_flags((pl1e), (_PAGE_GUEST_KERNEL|_PAGE_USER)); \
790 if ( !(l1e_get_flags((pl1e)) & _PAGE_GUEST_KERNEL) ) \
791 l1e_add_flags((pl1e), (_PAGE_GLOBAL|_PAGE_USER)); \
792 } \
793 } while ( 0 )
794 #else
795 #define adjust_guest_l1e(pl1e, d) \
796 do { \
797 if ( likely(l1e_get_flags((pl1e)) & _PAGE_PRESENT) && \
798 likely(!is_pv_32on64_domain(d)) ) \
799 l1e_add_flags((pl1e), _PAGE_USER); \
800 } while ( 0 )
801 #endif
803 #define adjust_guest_l2e(pl2e, d) \
804 do { \
805 if ( likely(l2e_get_flags((pl2e)) & _PAGE_PRESENT) && \
806 likely(!is_pv_32on64_domain(d)) ) \
807 l2e_add_flags((pl2e), _PAGE_USER); \
808 } while ( 0 )
810 #define adjust_guest_l3e(pl3e, d) \
811 do { \
812 if ( likely(l3e_get_flags((pl3e)) & _PAGE_PRESENT) ) \
813 l3e_add_flags((pl3e), likely(!is_pv_32on64_domain(d)) ? \
814 _PAGE_USER : \
815 _PAGE_USER|_PAGE_RW); \
816 } while ( 0 )
818 #define adjust_guest_l4e(pl4e, d) \
819 do { \
820 if ( likely(l4e_get_flags((pl4e)) & _PAGE_PRESENT) && \
821 likely(!is_pv_32on64_domain(d)) ) \
822 l4e_add_flags((pl4e), _PAGE_USER); \
823 } while ( 0 )
825 #else /* !defined(__x86_64__) */
827 #define adjust_guest_l1e(_p, _d) ((void)(_d))
828 #define adjust_guest_l2e(_p, _d) ((void)(_d))
829 #define adjust_guest_l3e(_p, _d) ((void)(_d))
831 #endif
833 #ifdef CONFIG_COMPAT
834 #define unadjust_guest_l3e(pl3e, d) \
835 do { \
836 if ( unlikely(is_pv_32on64_domain(d)) && \
837 likely(l3e_get_flags((pl3e)) & _PAGE_PRESENT) ) \
838 l3e_remove_flags((pl3e), _PAGE_USER|_PAGE_RW|_PAGE_ACCESSED); \
839 } while ( 0 )
840 #else
841 #define unadjust_guest_l3e(_p, _d) ((void)(_d))
842 #endif
844 void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d)
845 {
846 unsigned long pfn = l1e_get_pfn(l1e);
847 struct page_info *page;
848 struct domain *e;
849 struct vcpu *v;
851 if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) || is_iomem_page(pfn) )
852 return;
854 page = mfn_to_page(pfn);
856 e = page_get_owner(page);
858 /*
859 * Check if this is a mapping that was established via a grant reference.
860 * If it was then we should not be here: we require that such mappings are
861 * explicitly destroyed via the grant-table interface.
862 *
863 * The upshot of this is that the guest can end up with active grants that
864 * it cannot destroy (because it no longer has a PTE to present to the
865 * grant-table interface). This can lead to subtle hard-to-catch bugs,
866 * hence a special grant PTE flag can be enabled to catch the bug early.
867 *
868 * (Note that the undestroyable active grants are not a security hole in
869 * Xen. All active grants can safely be cleaned up when the domain dies.)
870 */
871 if ( (l1e_get_flags(l1e) & _PAGE_GNTTAB) &&
872 !d->is_shutting_down && !d->is_dying )
873 {
874 MEM_LOG("Attempt to implicitly unmap a granted PTE %" PRIpte,
875 l1e_get_intpte(l1e));
876 domain_crash(d);
877 }
879 /* Remember we didn't take a type-count of foreign writable mappings
880 * to paging-external domains */
881 if ( (l1e_get_flags(l1e) & _PAGE_RW) &&
882 !(unlikely((e != d) && paging_mode_external(e))) )
883 {
884 put_page_and_type(page);
885 }
886 else
887 {
888 /* We expect this is rare so we blow the entire shadow LDT. */
889 if ( unlikely(((page->u.inuse.type_info & PGT_type_mask) ==
890 PGT_ldt_page)) &&
891 unlikely(((page->u.inuse.type_info & PGT_count_mask) != 0)) &&
892 (d == e) )
893 {
894 for_each_vcpu ( d, v )
895 invalidate_shadow_ldt(v);
896 }
897 put_page(page);
898 }
899 }
902 /*
903 * NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'.
904 * Note also that this automatically deals correctly with linear p.t.'s.
905 */
906 static void put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
907 {
908 if ( (l2e_get_flags(l2e) & _PAGE_PRESENT) &&
909 (l2e_get_pfn(l2e) != pfn) )
910 put_page_and_type(l2e_get_page(l2e));
911 }
914 #if CONFIG_PAGING_LEVELS >= 3
915 static void put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn)
916 {
917 if ( (l3e_get_flags(l3e) & _PAGE_PRESENT) &&
918 (l3e_get_pfn(l3e) != pfn) )
919 put_page_and_type(l3e_get_page(l3e));
920 }
921 #endif
923 #if CONFIG_PAGING_LEVELS >= 4
924 static void put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn)
925 {
926 if ( (l4e_get_flags(l4e) & _PAGE_PRESENT) &&
927 (l4e_get_pfn(l4e) != pfn) )
928 put_page_and_type(l4e_get_page(l4e));
929 }
930 #endif
932 static int alloc_l1_table(struct page_info *page)
933 {
934 struct domain *d = page_get_owner(page);
935 unsigned long pfn = page_to_mfn(page);
936 l1_pgentry_t *pl1e;
937 int i;
939 pl1e = map_domain_page(pfn);
941 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
942 {
943 if ( is_guest_l1_slot(i) &&
944 unlikely(!get_page_from_l1e(pl1e[i], d)) )
945 goto fail;
947 adjust_guest_l1e(pl1e[i], d);
948 }
950 unmap_domain_page(pl1e);
951 return 1;
953 fail:
954 MEM_LOG("Failure in alloc_l1_table: entry %d", i);
955 while ( i-- > 0 )
956 if ( is_guest_l1_slot(i) )
957 put_page_from_l1e(pl1e[i], d);
959 unmap_domain_page(pl1e);
960 return 0;
961 }
963 #if defined(CONFIG_X86_PAE) || defined(CONFIG_COMPAT)
964 static int create_pae_xen_mappings(struct domain *d, l3_pgentry_t *pl3e)
965 {
966 struct page_info *page;
967 l2_pgentry_t *pl2e;
968 l3_pgentry_t l3e3;
969 #ifndef CONFIG_COMPAT
970 l2_pgentry_t l2e;
971 int i;
972 #endif
974 if ( !is_pv_32bit_domain(d) )
975 return 1;
977 pl3e = (l3_pgentry_t *)((unsigned long)pl3e & PAGE_MASK);
979 /* 3rd L3 slot contains L2 with Xen-private mappings. It *must* exist. */
980 l3e3 = pl3e[3];
981 if ( !(l3e_get_flags(l3e3) & _PAGE_PRESENT) )
982 {
983 MEM_LOG("PAE L3 3rd slot is empty");
984 return 0;
985 }
987 /*
988 * The Xen-private mappings include linear mappings. The L2 thus cannot
989 * be shared by multiple L3 tables. The test here is adequate because:
990 * 1. Cannot appear in slots != 3 because get_page_type() checks the
991 * PGT_pae_xen_l2 flag, which is asserted iff the L2 appears in slot 3
992 * 2. Cannot appear in another page table's L3:
993 * a. alloc_l3_table() calls this function and this check will fail
994 * b. mod_l3_entry() disallows updates to slot 3 in an existing table
995 */
996 page = l3e_get_page(l3e3);
997 BUG_ON(page->u.inuse.type_info & PGT_pinned);
998 BUG_ON((page->u.inuse.type_info & PGT_count_mask) == 0);
999 BUG_ON(!(page->u.inuse.type_info & PGT_pae_xen_l2));
1000 if ( (page->u.inuse.type_info & PGT_count_mask) != 1 )
1001 {
1002 MEM_LOG("PAE L3 3rd slot is shared");
1003 return 0;
1004 }
1006 /* Xen private mappings. */
1007 pl2e = map_domain_page(l3e_get_pfn(l3e3));
1008 #ifndef CONFIG_COMPAT
1009 memcpy(&pl2e[L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1)],
1010 &idle_pg_table_l2[L2_PAGETABLE_FIRST_XEN_SLOT],
1011 L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
1012 for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
1014 l2e = l2e_from_page(
1015 virt_to_page(page_get_owner(page)->arch.mm_perdomain_pt) + i,
1016 __PAGE_HYPERVISOR);
1017 l2e_write(&pl2e[l2_table_offset(PERDOMAIN_VIRT_START) + i], l2e);
1019 for ( i = 0; i < (LINEARPT_MBYTES >> (L2_PAGETABLE_SHIFT - 20)); i++ )
1021 l2e = l2e_empty();
1022 if ( l3e_get_flags(pl3e[i]) & _PAGE_PRESENT )
1023 l2e = l2e_from_pfn(l3e_get_pfn(pl3e[i]), __PAGE_HYPERVISOR);
1024 l2e_write(&pl2e[l2_table_offset(LINEAR_PT_VIRT_START) + i], l2e);
1026 #else
1027 memcpy(&pl2e[COMPAT_L2_PAGETABLE_FIRST_XEN_SLOT(d)],
1028 &compat_idle_pg_table_l2[
1029 l2_table_offset(HIRO_COMPAT_MPT_VIRT_START)],
1030 COMPAT_L2_PAGETABLE_XEN_SLOTS(d) * sizeof(*pl2e));
1031 #endif
1032 unmap_domain_page(pl2e);
1034 return 1;
1036 #else
1037 # define create_pae_xen_mappings(d, pl3e) (1)
1038 #endif
1040 #ifdef CONFIG_X86_PAE
1041 /* Flush a pgdir update into low-memory caches. */
1042 static void pae_flush_pgd(
1043 unsigned long mfn, unsigned int idx, l3_pgentry_t nl3e)
1045 struct domain *d = page_get_owner(mfn_to_page(mfn));
1046 struct vcpu *v;
1047 intpte_t _ol3e, _nl3e, _pl3e;
1048 l3_pgentry_t *l3tab_ptr;
1049 struct pae_l3_cache *cache;
1051 if ( unlikely(shadow_mode_enabled(d)) )
1052 {
1053 cpumask_t m = CPU_MASK_NONE;
1054 /* Re-shadow this l3 table on any vcpus that are using it */
1055 for_each_vcpu ( d, v )
1056 if ( pagetable_get_pfn(v->arch.guest_table) == mfn )
1057 {
1058 paging_update_cr3(v);
1059 cpus_or(m, m, v->vcpu_dirty_cpumask);
1060 }
1061 flush_tlb_mask(m);
1062 }
1064 /* If below 4GB then the pgdir is not shadowed in low memory. */
1065 if ( !l3tab_needs_shadow(mfn) )
1066 return;
1068 for_each_vcpu ( d, v )
1070 cache = &v->arch.pae_l3_cache;
1072 spin_lock(&cache->lock);
1074 if ( cache->high_mfn == mfn )
1075 {
1076 l3tab_ptr = &cache->table[cache->inuse_idx][idx];
1077 _ol3e = l3e_get_intpte(*l3tab_ptr);
1078 _nl3e = l3e_get_intpte(nl3e);
1079 _pl3e = cmpxchg(&l3e_get_intpte(*l3tab_ptr), _ol3e, _nl3e);
1080 BUG_ON(_pl3e != _ol3e);
1081 }
1083 spin_unlock(&cache->lock);
1086 flush_tlb_mask(d->domain_dirty_cpumask);
1088 #else
1089 # define pae_flush_pgd(mfn, idx, nl3e) ((void)0)
1090 #endif
1092 static int alloc_l2_table(struct page_info *page, unsigned long type)
1094 struct domain *d = page_get_owner(page);
1095 unsigned long pfn = page_to_mfn(page);
1096 l2_pgentry_t *pl2e;
1097 int i;
1099 pl2e = map_domain_page(pfn);
1101 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
1102 {
1103 if ( is_guest_l2_slot(d, type, i) &&
1104 unlikely(!get_page_from_l2e(pl2e[i], pfn, d)) )
1105 goto fail;
1107 adjust_guest_l2e(pl2e[i], d);
1108 }
1110 #if CONFIG_PAGING_LEVELS == 2
1111 /* Xen private mappings. */
1112 memcpy(&pl2e[L2_PAGETABLE_FIRST_XEN_SLOT],
1113 &idle_pg_table[L2_PAGETABLE_FIRST_XEN_SLOT],
1114 L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
1115 pl2e[l2_table_offset(LINEAR_PT_VIRT_START)] =
1116 l2e_from_pfn(pfn, __PAGE_HYPERVISOR);
1117 for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
1118 pl2e[l2_table_offset(PERDOMAIN_VIRT_START) + i] =
1119 l2e_from_page(
1120 virt_to_page(page_get_owner(page)->arch.mm_perdomain_pt) + i,
1121 __PAGE_HYPERVISOR);
1122 #endif
1124 unmap_domain_page(pl2e);
1125 return 1;
1127 fail:
1128 MEM_LOG("Failure in alloc_l2_table: entry %d", i);
1129 while ( i-- > 0 )
1130 if ( is_guest_l2_slot(d, type, i) )
1131 put_page_from_l2e(pl2e[i], pfn);
1133 unmap_domain_page(pl2e);
1134 return 0;
1138 #if CONFIG_PAGING_LEVELS >= 3
1139 static int alloc_l3_table(struct page_info *page)
1141 struct domain *d = page_get_owner(page);
1142 unsigned long pfn = page_to_mfn(page);
1143 l3_pgentry_t *pl3e;
1144 int i;
1146 #ifdef CONFIG_X86_PAE
1147 /*
1148 * PAE pgdirs above 4GB are unacceptable if the guest does not understand
1149 * the weird 'extended cr3' format for dealing with high-order address
1150 * bits. We cut some slack for control tools (before vcpu0 is initialised).
1151 */
1152 if ( (pfn >= 0x100000) &&
1153 unlikely(!VM_ASSIST(d, VMASST_TYPE_pae_extended_cr3)) &&
1154 d->vcpu[0] && d->vcpu[0]->is_initialised )
1156 MEM_LOG("PAE pgd must be below 4GB (0x%lx >= 0x100000)", pfn);
1157 return 0;
1159 #endif
1161 pl3e = map_domain_page(pfn);
1163 /*
1164 * PAE guests allocate full pages, but aren't required to initialize
1165 * more than the first four entries; when running in compatibility
1166 * mode, however, the full page is visible to the MMU, and hence all
1167 * 512 entries must be valid/verified, which is most easily achieved
1168 * by clearing them out.
1169 */
1170 if ( is_pv_32on64_domain(d) )
1171 memset(pl3e + 4, 0, (L3_PAGETABLE_ENTRIES - 4) * sizeof(*pl3e));
1173 for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
1174 {
1175 #if defined(CONFIG_X86_PAE) || defined(CONFIG_COMPAT)
1176 if ( is_pv_32bit_domain(d) && (i == 3) )
1177 {
1178 if ( !(l3e_get_flags(pl3e[i]) & _PAGE_PRESENT) ||
1179 (l3e_get_flags(pl3e[i]) & l3_disallow_mask(d)) ||
1180 !get_page_and_type_from_pagenr(l3e_get_pfn(pl3e[i]),
1181 PGT_l2_page_table |
1182 PGT_pae_xen_l2,
1183 d) )
1184 goto fail;
1185 }
1186 else
1187 #endif
1188 if ( is_guest_l3_slot(i) &&
1189 unlikely(!get_page_from_l3e(pl3e[i], pfn, d)) )
1190 goto fail;
1192 adjust_guest_l3e(pl3e[i], d);
1193 }
1195 if ( !create_pae_xen_mappings(d, pl3e) )
1196 goto fail;
1198 unmap_domain_page(pl3e);
1199 return 1;
1201 fail:
1202 MEM_LOG("Failure in alloc_l3_table: entry %d", i);
1203 while ( i-- > 0 )
1204 if ( is_guest_l3_slot(i) )
1205 put_page_from_l3e(pl3e[i], pfn);
1207 unmap_domain_page(pl3e);
1208 return 0;
1210 #else
1211 #define alloc_l3_table(page) (0)
1212 #endif
1214 #if CONFIG_PAGING_LEVELS >= 4
1215 static int alloc_l4_table(struct page_info *page)
1217 struct domain *d = page_get_owner(page);
1218 unsigned long pfn = page_to_mfn(page);
1219 l4_pgentry_t *pl4e = page_to_virt(page);
1220 int i;
1222 for ( i = 0; i < L4_PAGETABLE_ENTRIES; i++ )
1223 {
1224 if ( is_guest_l4_slot(d, i) &&
1225 unlikely(!get_page_from_l4e(pl4e[i], pfn, d)) )
1226 goto fail;
1228 adjust_guest_l4e(pl4e[i], d);
1229 }
1231 /* Xen private mappings. */
1232 memcpy(&pl4e[ROOT_PAGETABLE_FIRST_XEN_SLOT],
1233 &idle_pg_table[ROOT_PAGETABLE_FIRST_XEN_SLOT],
1234 ROOT_PAGETABLE_XEN_SLOTS * sizeof(l4_pgentry_t));
1235 pl4e[l4_table_offset(LINEAR_PT_VIRT_START)] =
1236 l4e_from_pfn(pfn, __PAGE_HYPERVISOR);
1237 pl4e[l4_table_offset(PERDOMAIN_VIRT_START)] =
1238 l4e_from_page(virt_to_page(d->arch.mm_perdomain_l3),
1239 __PAGE_HYPERVISOR);
1240 if ( is_pv_32on64_domain(d) )
1241 pl4e[l4_table_offset(COMPAT_ARG_XLAT_VIRT_BASE)] =
1242 l4e_from_page(virt_to_page(d->arch.mm_arg_xlat_l3),
1243 __PAGE_HYPERVISOR);
1245 return 1;
1247 fail:
1248 MEM_LOG("Failure in alloc_l4_table: entry %d", i);
1249 while ( i-- > 0 )
1250 if ( is_guest_l4_slot(d, i) )
1251 put_page_from_l4e(pl4e[i], pfn);
1253 return 0;
1255 #else
1256 #define alloc_l4_table(page) (0)
1257 #endif
1260 static void free_l1_table(struct page_info *page)
1262 struct domain *d = page_get_owner(page);
1263 unsigned long pfn = page_to_mfn(page);
1264 l1_pgentry_t *pl1e;
1265 int i;
1267 pl1e = map_domain_page(pfn);
1269 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
1270 if ( is_guest_l1_slot(i) )
1271 put_page_from_l1e(pl1e[i], d);
1273 unmap_domain_page(pl1e);
1277 static void free_l2_table(struct page_info *page)
1279 #ifdef CONFIG_COMPAT
1280 struct domain *d = page_get_owner(page);
1281 #endif
1282 unsigned long pfn = page_to_mfn(page);
1283 l2_pgentry_t *pl2e;
1284 int i;
1286 pl2e = map_domain_page(pfn);
1288 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
1289 if ( is_guest_l2_slot(d, page->u.inuse.type_info, i) )
1290 put_page_from_l2e(pl2e[i], pfn);
1292 unmap_domain_page(pl2e);
1294 page->u.inuse.type_info &= ~PGT_pae_xen_l2;
1298 #if CONFIG_PAGING_LEVELS >= 3
1300 static void free_l3_table(struct page_info *page)
1302 struct domain *d = page_get_owner(page);
1303 unsigned long pfn = page_to_mfn(page);
1304 l3_pgentry_t *pl3e;
1305 int i;
1307 pl3e = map_domain_page(pfn);
1309 for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
1310 if ( is_guest_l3_slot(i) )
1311 {
1312 put_page_from_l3e(pl3e[i], pfn);
1313 unadjust_guest_l3e(pl3e[i], d);
1314 }
1316 unmap_domain_page(pl3e);
1319 #endif
1321 #if CONFIG_PAGING_LEVELS >= 4
1323 static void free_l4_table(struct page_info *page)
1325 struct domain *d = page_get_owner(page);
1326 unsigned long pfn = page_to_mfn(page);
1327 l4_pgentry_t *pl4e = page_to_virt(page);
1328 int i;
1330 for ( i = 0; i < L4_PAGETABLE_ENTRIES; i++ )
1331 if ( is_guest_l4_slot(d, i) )
1332 put_page_from_l4e(pl4e[i], pfn);
1335 #endif
1338 /* How to write an entry to the guest pagetables.
1339 * Returns 0 for failure (pointer not valid), 1 for success. */
1340 static inline int update_intpte(intpte_t *p,
1341 intpte_t old,
1342 intpte_t new,
1343 unsigned long mfn,
1344 struct vcpu *v)
1346 int rv = 1;
1347 #ifndef PTE_UPDATE_WITH_CMPXCHG
1348 rv = paging_write_guest_entry(v, p, new, _mfn(mfn));
1349 #else
1350 {
1351 intpte_t t = old;
1352 for ( ; ; )
1353 {
1354 rv = paging_cmpxchg_guest_entry(v, p, &t, new, _mfn(mfn));
1355 if ( unlikely(rv == 0) )
1356 {
1357 MEM_LOG("Failed to update %" PRIpte " -> %" PRIpte
1358 ": saw %" PRIpte, old, new, t);
1359 break;
1360 }
1362 if ( t == old )
1363 break;
1365 /* Allowed to change in Accessed/Dirty flags only. */
1366 BUG_ON((t ^ old) & ~(intpte_t)(_PAGE_ACCESSED|_PAGE_DIRTY));
1368 old = t;
1369 }
1370 }
1371 #endif
1372 return rv;
1375 /* Macro that wraps the appropriate type-changes around update_intpte().
1376 * Arguments are: type, ptr, old, new, mfn, vcpu */
1377 #define UPDATE_ENTRY(_t,_p,_o,_n,_m,_v) \
1378 update_intpte(&_t ## e_get_intpte(*(_p)), \
1379 _t ## e_get_intpte(_o), _t ## e_get_intpte(_n), \
1380 (_m), (_v))
1382 /* Update the L1 entry at pl1e to new value nl1e. */
1383 static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e,
1384 unsigned long gl1mfn)
1386 l1_pgentry_t ol1e;
1387 struct domain *d = current->domain;
1388 unsigned long mfn;
1390 if ( unlikely(__copy_from_user(&ol1e, pl1e, sizeof(ol1e)) != 0) )
1391 return 0;
1393 if ( unlikely(paging_mode_refcounts(d)) )
1394 return UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, current);
1396 if ( l1e_get_flags(nl1e) & _PAGE_PRESENT )
1398 /* Translate foreign guest addresses. */
1399 mfn = gmfn_to_mfn(FOREIGNDOM, l1e_get_pfn(nl1e));
1400 if ( unlikely(mfn == INVALID_MFN) )
1401 return 0;
1402 ASSERT((mfn & ~(PADDR_MASK >> PAGE_SHIFT)) == 0);
1403 nl1e = l1e_from_pfn(mfn, l1e_get_flags(nl1e));
1405 if ( unlikely(l1e_get_flags(nl1e) & l1_disallow_mask(d)) )
1407 MEM_LOG("Bad L1 flags %x",
1408 l1e_get_flags(nl1e) & l1_disallow_mask(d));
1409 return 0;
1412 adjust_guest_l1e(nl1e, d);
1414 /* Fast path for identical mapping, r/w and presence. */
1415 if ( !l1e_has_changed(ol1e, nl1e, _PAGE_RW | _PAGE_PRESENT) )
1416 return UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, current);
1418 if ( unlikely(!get_page_from_l1e(nl1e, FOREIGNDOM)) )
1419 return 0;
1421 if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, current)) )
1423 put_page_from_l1e(nl1e, d);
1424 return 0;
1427 else
1429 if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, current)) )
1430 return 0;
1433 put_page_from_l1e(ol1e, d);
1434 return 1;
1438 /* Update the L2 entry at pl2e to new value nl2e. pl2e is within frame pfn. */
1439 static int mod_l2_entry(l2_pgentry_t *pl2e,
1440 l2_pgentry_t nl2e,
1441 unsigned long pfn,
1442 unsigned long type)
1444 l2_pgentry_t ol2e;
1445 struct domain *d = current->domain;
1447 if ( unlikely(!is_guest_l2_slot(d, type, pgentry_ptr_to_slot(pl2e))) )
1449 MEM_LOG("Illegal L2 update attempt in Xen-private area %p", pl2e);
1450 return 0;
1453 if ( unlikely(__copy_from_user(&ol2e, pl2e, sizeof(ol2e)) != 0) )
1454 return 0;
1456 if ( l2e_get_flags(nl2e) & _PAGE_PRESENT )
1458 if ( unlikely(l2e_get_flags(nl2e) & L2_DISALLOW_MASK) )
1460 MEM_LOG("Bad L2 flags %x",
1461 l2e_get_flags(nl2e) & L2_DISALLOW_MASK);
1462 return 0;
1465 adjust_guest_l2e(nl2e, d);
1467 /* Fast path for identical mapping and presence. */
1468 if ( !l2e_has_changed(ol2e, nl2e, _PAGE_PRESENT))
1469 return UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, current);
1471 if ( unlikely(!get_page_from_l2e(nl2e, pfn, d)) )
1472 return 0;
1474 if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, current)) )
1476 put_page_from_l2e(nl2e, pfn);
1477 return 0;
1480 else if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, current)) )
1482 return 0;
1485 put_page_from_l2e(ol2e, pfn);
1486 return 1;
1489 #if CONFIG_PAGING_LEVELS >= 3
1491 /* Update the L3 entry at pl3e to new value nl3e. pl3e is within frame pfn. */
1492 static int mod_l3_entry(l3_pgentry_t *pl3e,
1493 l3_pgentry_t nl3e,
1494 unsigned long pfn)
1496 l3_pgentry_t ol3e;
1497 struct domain *d = current->domain;
1498 int okay;
1500 if ( unlikely(!is_guest_l3_slot(pgentry_ptr_to_slot(pl3e))) )
1502 MEM_LOG("Illegal L3 update attempt in Xen-private area %p", pl3e);
1503 return 0;
1506 #if defined(CONFIG_X86_PAE) || defined(CONFIG_COMPAT)
1507 /*
1508 * Disallow updates to final L3 slot. It contains Xen mappings, and it
1509 * would be a pain to ensure they remain continuously valid throughout.
1510 */
1511 if ( is_pv_32bit_domain(d) && (pgentry_ptr_to_slot(pl3e) >= 3) )
1512 return 0;
1513 #endif
1515 if ( unlikely(__copy_from_user(&ol3e, pl3e, sizeof(ol3e)) != 0) )
1516 return 0;
1518 if ( l3e_get_flags(nl3e) & _PAGE_PRESENT )
1520 if ( unlikely(l3e_get_flags(nl3e) & l3_disallow_mask(d)) )
1522 MEM_LOG("Bad L3 flags %x",
1523 l3e_get_flags(nl3e) & l3_disallow_mask(d));
1524 return 0;
1527 adjust_guest_l3e(nl3e, d);
1529 /* Fast path for identical mapping and presence. */
1530 if (!l3e_has_changed(ol3e, nl3e, _PAGE_PRESENT))
1531 return UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, current);
1533 if ( unlikely(!get_page_from_l3e(nl3e, pfn, d)) )
1534 return 0;
1536 if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, current)) )
1538 put_page_from_l3e(nl3e, pfn);
1539 return 0;
1542 else if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, current)) )
1544 return 0;
1547 okay = create_pae_xen_mappings(d, pl3e);
1548 BUG_ON(!okay);
1550 pae_flush_pgd(pfn, pgentry_ptr_to_slot(pl3e), nl3e);
1552 put_page_from_l3e(ol3e, pfn);
1553 return 1;
1556 #endif
1558 #if CONFIG_PAGING_LEVELS >= 4
1560 /* Update the L4 entry at pl4e to new value nl4e. pl4e is within frame pfn. */
1561 static int mod_l4_entry(struct domain *d,
1562 l4_pgentry_t *pl4e,
1563 l4_pgentry_t nl4e,
1564 unsigned long pfn)
1566 l4_pgentry_t ol4e;
1568 if ( unlikely(!is_guest_l4_slot(d, pgentry_ptr_to_slot(pl4e))) )
1570 MEM_LOG("Illegal L4 update attempt in Xen-private area %p", pl4e);
1571 return 0;
1574 if ( unlikely(__copy_from_user(&ol4e, pl4e, sizeof(ol4e)) != 0) )
1575 return 0;
1577 if ( l4e_get_flags(nl4e) & _PAGE_PRESENT )
1579 if ( unlikely(l4e_get_flags(nl4e) & L4_DISALLOW_MASK) )
1581 MEM_LOG("Bad L4 flags %x",
1582 l4e_get_flags(nl4e) & L4_DISALLOW_MASK);
1583 return 0;
1586 adjust_guest_l4e(nl4e, current->domain);
1588 /* Fast path for identical mapping and presence. */
1589 if (!l4e_has_changed(ol4e, nl4e, _PAGE_PRESENT))
1590 return UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, current);
1592 if ( unlikely(!get_page_from_l4e(nl4e, pfn, current->domain)) )
1593 return 0;
1595 if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, current)) )
1597 put_page_from_l4e(nl4e, pfn);
1598 return 0;
1601 else if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, current)) )
1603 return 0;
1606 put_page_from_l4e(ol4e, pfn);
1607 return 1;
1610 #endif
1612 static int alloc_page_type(struct page_info *page, unsigned long type)
1614 struct domain *owner = page_get_owner(page);
1616 /* A page table is dirtied when its type count becomes non-zero. */
1617 if ( likely(owner != NULL) )
1618 paging_mark_dirty(owner, page_to_mfn(page));
1620 switch ( type & PGT_type_mask )
1622 case PGT_l1_page_table:
1623 return alloc_l1_table(page);
1624 case PGT_l2_page_table:
1625 return alloc_l2_table(page, type);
1626 case PGT_l3_page_table:
1627 return alloc_l3_table(page);
1628 case PGT_l4_page_table:
1629 return alloc_l4_table(page);
1630 case PGT_gdt_page:
1631 case PGT_ldt_page:
1632 return alloc_segdesc_page(page);
1633 default:
1634 printk("Bad type in alloc_page_type %lx t=%" PRtype_info " c=%x\n",
1635 type, page->u.inuse.type_info,
1636 page->count_info);
1637 BUG();
1640 return 0;
1644 void free_page_type(struct page_info *page, unsigned long type)
1646 struct domain *owner = page_get_owner(page);
1647 unsigned long gmfn;
1649 if ( likely(owner != NULL) )
1651 /*
1652 * We have to flush before the next use of the linear mapping
1653 * (e.g., update_va_mapping()) or we could end up modifying a page
1654 * that is no longer a page table (and hence screw up ref counts).
1655 */
1656 if ( current->domain == owner )
1657 queue_deferred_ops(owner, DOP_FLUSH_ALL_TLBS);
1658 else
1659 flush_tlb_mask(owner->domain_dirty_cpumask);
1661 if ( unlikely(paging_mode_enabled(owner)) )
1663 /* A page table is dirtied when its type count becomes zero. */
1664 paging_mark_dirty(owner, page_to_mfn(page));
1666 if ( shadow_mode_refcounts(owner) )
1667 return;
1669 gmfn = mfn_to_gmfn(owner, page_to_mfn(page));
1670 ASSERT(VALID_M2P(gmfn));
1671 shadow_remove_all_shadows(owner->vcpu[0], _mfn(gmfn));
1675 switch ( type & PGT_type_mask )
1677 case PGT_l1_page_table:
1678 free_l1_table(page);
1679 break;
1681 case PGT_l2_page_table:
1682 free_l2_table(page);
1683 break;
1685 #if CONFIG_PAGING_LEVELS >= 3
1686 case PGT_l3_page_table:
1687 free_l3_table(page);
1688 break;
1689 #endif
1691 #if CONFIG_PAGING_LEVELS >= 4
1692 case PGT_l4_page_table:
1693 free_l4_table(page);
1694 break;
1695 #endif
1697 default:
1698 printk("%s: type %lx pfn %lx\n",__FUNCTION__,
1699 type, page_to_mfn(page));
1700 BUG();
1705 void put_page_type(struct page_info *page)
1707 unsigned long nx, x, y = page->u.inuse.type_info;
1709 again:
1710 do {
1711 x = y;
1712 nx = x - 1;
1714 ASSERT((x & PGT_count_mask) != 0);
1716 if ( unlikely((nx & PGT_count_mask) == 0) )
1718 if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
1719 likely(nx & PGT_validated) )
1721 /*
1722 * Page-table pages must be unvalidated when count is zero. The
1723 * 'free' is safe because the refcnt is non-zero and validated
1724 * bit is clear => other ops will spin or fail.
1725 */
1726 if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x,
1727 x & ~PGT_validated)) != x) )
1728 goto again;
1729 /* We cleared the 'valid bit' so we do the clean up. */
1730 free_page_type(page, x);
1731 /* Carry on, but with the 'valid bit' now clear. */
1732 x &= ~PGT_validated;
1733 nx &= ~PGT_validated;
1736 /*
1737 * Record TLB information for flush later. We do not stamp page
1738 * tables when running in shadow mode:
1739 * 1. Pointless, since it's the shadow pt's which must be tracked.
1740 * 2. Shadow mode reuses this field for shadowed page tables to
1741 * store flags info -- we don't want to conflict with that.
1742 */
1743 if ( !(shadow_mode_enabled(page_get_owner(page)) &&
1744 (page->count_info & PGC_page_table)) )
1745 page->tlbflush_timestamp = tlbflush_current_time();
1748 while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
1752 int get_page_type(struct page_info *page, unsigned long type)
1754 unsigned long nx, x, y = page->u.inuse.type_info;
1756 ASSERT(!(type & ~(PGT_type_mask | PGT_pae_xen_l2)));
1758 again:
1759 do {
1760 x = y;
1761 nx = x + 1;
1762 if ( unlikely((nx & PGT_count_mask) == 0) )
1764 MEM_LOG("Type count overflow on pfn %lx", page_to_mfn(page));
1765 return 0;
1767 else if ( unlikely((x & PGT_count_mask) == 0) )
1769 struct domain *d = page_get_owner(page);
1771 /* Never allow a shadowed frame to go from type count 0 to 1 */
1772 if ( d && shadow_mode_enabled(d) )
1773 shadow_remove_all_shadows(d->vcpu[0], _mfn(page_to_mfn(page)));
1775 ASSERT(!(x & PGT_pae_xen_l2));
1776 if ( (x & PGT_type_mask) != type )
1778 /*
1779 * On type change we check to flush stale TLB entries. This
1780 * may be unnecessary (e.g., page was GDT/LDT) but those
1781 * circumstances should be very rare.
1782 */
1783 cpumask_t mask = d->domain_dirty_cpumask;
1785 /* Don't flush if the timestamp is old enough */
1786 tlbflush_filter(mask, page->tlbflush_timestamp);
1788 if ( unlikely(!cpus_empty(mask)) &&
1789 /* Shadow mode: track only writable pages. */
1790 (!shadow_mode_enabled(page_get_owner(page)) ||
1791 ((nx & PGT_type_mask) == PGT_writable_page)) )
1793 perfc_incr(need_flush_tlb_flush);
1794 flush_tlb_mask(mask);
1797 /* We lose existing type, back pointer, and validity. */
1798 nx &= ~(PGT_type_mask | PGT_validated);
1799 nx |= type;
1801 /* No special validation needed for writable pages. */
1802 /* Page tables and GDT/LDT need to be scanned for validity. */
1803 if ( type == PGT_writable_page )
1804 nx |= PGT_validated;
1807 else if ( unlikely((x & (PGT_type_mask|PGT_pae_xen_l2)) != type) )
1809 /* Don't log failure if it could be a recursive-mapping attempt. */
1810 if ( ((x & PGT_type_mask) == PGT_l2_page_table) &&
1811 (type == PGT_l1_page_table) )
1812 return 0;
1813 if ( ((x & PGT_type_mask) == PGT_l3_page_table) &&
1814 (type == PGT_l2_page_table) )
1815 return 0;
1816 if ( ((x & PGT_type_mask) == PGT_l4_page_table) &&
1817 (type == PGT_l3_page_table) )
1818 return 0;
1819 MEM_LOG("Bad type (saw %" PRtype_info " != exp %" PRtype_info ") "
1820 "for mfn %lx (pfn %lx)",
1821 x, type, page_to_mfn(page),
1822 get_gpfn_from_mfn(page_to_mfn(page)));
1823 return 0;
1825 else if ( unlikely(!(x & PGT_validated)) )
1827 /* Someone else is updating validation of this page. Wait... */
1828 while ( (y = page->u.inuse.type_info) == x )
1829 cpu_relax();
1830 goto again;
1833 while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
1835 if ( unlikely(!(nx & PGT_validated)) )
1837 /* Try to validate page type; drop the new reference on failure. */
1838 if ( unlikely(!alloc_page_type(page, type)) )
1840 MEM_LOG("Error while validating mfn %lx (pfn %lx) for type %"
1841 PRtype_info ": caf=%08x taf=%" PRtype_info,
1842 page_to_mfn(page), get_gpfn_from_mfn(page_to_mfn(page)),
1843 type, page->count_info, page->u.inuse.type_info);
1844 /* No one else can get a reference. We hold the only ref. */
1845 page->u.inuse.type_info = 0;
1846 return 0;
1849 /* No one else is updating simultaneously. */
1850 __set_bit(_PGT_validated, &page->u.inuse.type_info);
1853 return 1;
1854 }
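/*
 * Editorial sketch (hypothetical caller, not part of the original file):
 * the get/put pairing that the type machinery above expects, using helpers
 * already used elsewhere in this file.
 */
#if 0
static int example_writable_ref(struct page_info *pg, struct domain *d)
{
    /* Fails on wrong owner, type-count overflow, or a conflicting type. */
    if ( !get_page_and_type(pg, d, PGT_writable_page) )
        return 0;

    /* ... the frame may now be mapped writable ... */

    put_page_and_type(pg);   /* drop the type ref, then the general ref */
    return 1;
}
#endif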
1857 void cleanup_page_cacheattr(struct page_info *page)
1859 uint32_t cacheattr = (page->count_info >> PGC_cacheattr_base) & 7;
1861 if ( likely(cacheattr == 0) )
1862 return;
1864 page->count_info &= ~PGC_cacheattr_mask;
1866 BUG_ON(is_xen_heap_page(page));
1868 #ifdef __x86_64__
1869 map_pages_to_xen((unsigned long)page_to_virt(page), page_to_mfn(page),
1870 1, PAGE_HYPERVISOR);
1871 #endif
1875 int new_guest_cr3(unsigned long mfn)
1877 struct vcpu *v = current;
1878 struct domain *d = v->domain;
1879 int okay;
1880 unsigned long old_base_mfn;
1882 #ifdef CONFIG_COMPAT
1883 if ( is_pv_32on64_domain(d) )
1885 okay = paging_mode_refcounts(d)
1886 ? 0 /* Old code was broken, but what should it be? */
1887 : mod_l4_entry(
1888 d,
1889 __va(pagetable_get_paddr(v->arch.guest_table)),
1890 l4e_from_pfn(
1891 mfn,
1892 (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_ACCESSED)),
1893 pagetable_get_pfn(v->arch.guest_table));
1894 if ( unlikely(!okay) )
1896 MEM_LOG("Error while installing new compat baseptr %lx", mfn);
1897 return 0;
1900 invalidate_shadow_ldt(v);
1901 write_ptbase(v);
1903 return 1;
1905 #endif
1906 okay = paging_mode_refcounts(d)
1907 ? get_page_from_pagenr(mfn, d)
1908 : get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d);
1909 if ( unlikely(!okay) )
1911 MEM_LOG("Error while installing new baseptr %lx", mfn);
1912 return 0;
1915 invalidate_shadow_ldt(v);
1917 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
1919 v->arch.guest_table = pagetable_from_pfn(mfn);
1920 update_cr3(v);
1922 write_ptbase(v);
1924 if ( likely(old_base_mfn != 0) )
1926 if ( paging_mode_refcounts(d) )
1927 put_page(mfn_to_page(old_base_mfn));
1928 else
1929 put_page_and_type(mfn_to_page(old_base_mfn));
1932 return 1;
1935 static void process_deferred_ops(void)
1937 unsigned int deferred_ops;
1938 struct domain *d = current->domain;
1939 struct percpu_mm_info *info = &this_cpu(percpu_mm_info);
1941 deferred_ops = info->deferred_ops;
1942 info->deferred_ops = 0;
1944 if ( deferred_ops & (DOP_FLUSH_ALL_TLBS|DOP_FLUSH_TLB) )
1946 if ( deferred_ops & DOP_FLUSH_ALL_TLBS )
1947 flush_tlb_mask(d->domain_dirty_cpumask);
1948 else
1949 flush_tlb_local();
1952 if ( deferred_ops & DOP_RELOAD_LDT )
1953 (void)map_ldt_shadow_page(0);
1955 if ( unlikely(info->foreign != NULL) )
1957 rcu_unlock_domain(info->foreign);
1958 info->foreign = NULL;
1962 static int set_foreigndom(domid_t domid)
1964 struct domain *e, *d = current->domain;
1965 struct percpu_mm_info *info = &this_cpu(percpu_mm_info);
1966 int okay = 1;
1968 ASSERT(info->foreign == NULL);
1970 if ( likely(domid == DOMID_SELF) )
1971 goto out;
1973 if ( unlikely(domid == d->domain_id) )
1975 MEM_LOG("Dom %u tried to specify itself as foreign domain",
1976 d->domain_id);
1977 okay = 0;
1979 else if ( unlikely(paging_mode_translate(d)) )
1981 MEM_LOG("Cannot mix foreign mappings with translated domains");
1982 okay = 0;
1984 else if ( !IS_PRIV(d) )
1986 switch ( domid )
1988 case DOMID_IO:
1989 info->foreign = rcu_lock_domain(dom_io);
1990 break;
1991 default:
1992 MEM_LOG("Dom %u cannot set foreign dom", d->domain_id);
1993 okay = 0;
1994 break;
1997 else
1999 info->foreign = e = rcu_lock_domain_by_id(domid);
2000 if ( e == NULL )
2002 switch ( domid )
2004 case DOMID_XEN:
2005 info->foreign = rcu_lock_domain(dom_xen);
2006 break;
2007 case DOMID_IO:
2008 info->foreign = rcu_lock_domain(dom_io);
2009 break;
2010 default:
2011 MEM_LOG("Unknown domain '%u'", domid);
2012 okay = 0;
2013 break;
2018 out:
2019 return okay;
2022 static inline cpumask_t vcpumask_to_pcpumask(
2023 struct domain *d, unsigned long vmask)
2025 unsigned int vcpu_id;
2026 cpumask_t pmask = CPU_MASK_NONE;
2027 struct vcpu *v;
2029 while ( vmask != 0 )
2031 vcpu_id = find_first_set_bit(vmask);
2032 vmask &= ~(1UL << vcpu_id);
2033 if ( (vcpu_id < MAX_VIRT_CPUS) &&
2034 ((v = d->vcpu[vcpu_id]) != NULL) )
2035 cpus_or(pmask, pmask, v->vcpu_dirty_cpumask);
2038 return pmask;
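
vcpumask_to_pcpumask() consumes the guest-supplied VCPU bitmask one set bit at a time, ORing in each valid VCPU's dirty physical-CPU mask. Here is a self-contained sketch of the same find-first-set loop; it substitutes the GCC/Clang __builtin_ctzl() intrinsic for find_first_set_bit() and a plain array for the per-VCPU cpumasks, so the names and values are illustrative only.

#include <stdint.h>
#include <stdio.h>

#define MAX_VCPUS 8

/* Hypothetical per-VCPU "dirty physical CPU" masks standing in for
 * v->vcpu_dirty_cpumask; the index is the VCPU id. */
static const uint32_t vcpu_dirty_pcpus[MAX_VCPUS] = {
    0x1, 0x2, 0x0, 0x8, 0, 0, 0, 0x80
};

/* Accumulate the physical-CPU mask for every VCPU named in vmask,
 * consuming one set bit per iteration as the loop above does. */
static uint32_t vmask_to_pmask(unsigned long vmask)
{
    uint32_t pmask = 0;

    while (vmask != 0)
    {
        unsigned int vcpu_id = __builtin_ctzl(vmask);   /* first set bit */
        vmask &= ~(1UL << vcpu_id);
        if (vcpu_id < MAX_VCPUS)
            pmask |= vcpu_dirty_pcpus[vcpu_id];
    }
    return pmask;
}

int main(void)
{
    /* VCPUs 0, 1 and 3 selected: physical CPUs 0, 1 and 3 result. */
    printf("pmask=%#x\n", (unsigned)vmask_to_pmask(0x0b));   /* prints 0xb */
    return 0;
}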
2041 int do_mmuext_op(
2042 XEN_GUEST_HANDLE(mmuext_op_t) uops,
2043 unsigned int count,
2044 XEN_GUEST_HANDLE(uint) pdone,
2045 unsigned int foreigndom)
2047 struct mmuext_op op;
2048 int rc = 0, i = 0, okay;
2049 unsigned long mfn = 0, gmfn = 0, type;
2050 unsigned int done = 0;
2051 struct page_info *page;
2052 struct vcpu *v = current;
2053 struct domain *d = v->domain;
2055 if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
2057 count &= ~MMU_UPDATE_PREEMPTED;
2058 if ( unlikely(!guest_handle_is_null(pdone)) )
2059 (void)copy_from_guest(&done, pdone, 1);
2061 else
2062 perfc_incr(calls_to_mmuext_op);
2064 if ( unlikely(!guest_handle_okay(uops, count)) )
2066 rc = -EFAULT;
2067 goto out;
2070 if ( !set_foreigndom(foreigndom) )
2072 rc = -ESRCH;
2073 goto out;
2076 LOCK_BIGLOCK(d);
2078 for ( i = 0; i < count; i++ )
2080 if ( hypercall_preempt_check() )
2082 rc = hypercall_create_continuation(
2083 __HYPERVISOR_mmuext_op, "hihi",
2084 uops, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
2085 break;
2088 if ( unlikely(__copy_from_guest(&op, uops, 1) != 0) )
2090 MEM_LOG("Bad __copy_from_guest");
2091 rc = -EFAULT;
2092 break;
2095 okay = 1;
2096 gmfn = op.arg1.mfn;
2097 mfn = gmfn_to_mfn(FOREIGNDOM, gmfn);
2098 page = mfn_to_page(mfn);
2100 switch ( op.cmd )
2102 case MMUEXT_PIN_L1_TABLE:
2103 type = PGT_l1_page_table;
2104 goto pin_page;
2106 case MMUEXT_PIN_L2_TABLE:
2107 type = PGT_l2_page_table;
2108 goto pin_page;
2110 case MMUEXT_PIN_L3_TABLE:
2111 type = PGT_l3_page_table;
2112 goto pin_page;
2114 case MMUEXT_PIN_L4_TABLE:
2115 if ( is_pv_32bit_domain(FOREIGNDOM) )
2116 break;
2117 type = PGT_l4_page_table;
2119 pin_page:
2120 rc = xsm_memory_pin_page(current->domain, page);
2121 if ( rc )
2122 break;
2124 /* Ignore pinning of invalid paging levels. */
2125 if ( (op.cmd - MMUEXT_PIN_L1_TABLE) > (CONFIG_PAGING_LEVELS - 1) )
2126 break;
2128 if ( paging_mode_refcounts(FOREIGNDOM) )
2129 break;
2131 okay = get_page_and_type_from_pagenr(mfn, type, FOREIGNDOM);
2132 if ( unlikely(!okay) )
2134 MEM_LOG("Error while pinning mfn %lx", mfn);
2135 break;
2138 if ( unlikely(test_and_set_bit(_PGT_pinned,
2139 &page->u.inuse.type_info)) )
2141 MEM_LOG("Mfn %lx already pinned", mfn);
2142 put_page_and_type(page);
2143 okay = 0;
2144 break;
2147 /* A page is dirtied when its pin status is set. */
2148 paging_mark_dirty(d, mfn);
2150 /* We can race domain destruction (domain_relinquish_resources). */
2151 if ( unlikely(this_cpu(percpu_mm_info).foreign != NULL) )
2153 int drop_ref;
2154 spin_lock(&FOREIGNDOM->page_alloc_lock);
2155 drop_ref = (FOREIGNDOM->is_dying &&
2156 test_and_clear_bit(_PGT_pinned,
2157 &page->u.inuse.type_info));
2158 spin_unlock(&FOREIGNDOM->page_alloc_lock);
2159 if ( drop_ref )
2160 put_page_and_type(page);
2163 break;
2165 case MMUEXT_UNPIN_TABLE:
2166 if ( paging_mode_refcounts(d) )
2167 break;
2169 if ( unlikely(!(okay = get_page_from_pagenr(mfn, d))) )
2171 MEM_LOG("Mfn %lx bad domain (dom=%p)",
2172 mfn, page_get_owner(page));
2174 else if ( likely(test_and_clear_bit(_PGT_pinned,
2175 &page->u.inuse.type_info)) )
2177 put_page_and_type(page);
2178 put_page(page);
2179 /* A page is dirtied when its pin status is cleared. */
2180 paging_mark_dirty(d, mfn);
2182 else
2184 okay = 0;
2185 put_page(page);
2186 MEM_LOG("Mfn %lx not pinned", mfn);
2188 break;
2190 case MMUEXT_NEW_BASEPTR:
2191 okay = new_guest_cr3(mfn);
2192 this_cpu(percpu_mm_info).deferred_ops &= ~DOP_FLUSH_TLB;
2193 break;
2195 #ifdef __x86_64__
2196 case MMUEXT_NEW_USER_BASEPTR: {
2197 unsigned long old_mfn;
2199 if ( mfn != 0 )
2201 if ( paging_mode_refcounts(d) )
2202 okay = get_page_from_pagenr(mfn, d);
2203 else
2204 okay = get_page_and_type_from_pagenr(
2205 mfn, PGT_root_page_table, d);
2206 if ( unlikely(!okay) )
2208 MEM_LOG("Error while installing new mfn %lx", mfn);
2209 break;
2213 old_mfn = pagetable_get_pfn(v->arch.guest_table_user);
2214 v->arch.guest_table_user = pagetable_from_pfn(mfn);
2216 if ( old_mfn != 0 )
2218 if ( paging_mode_refcounts(d) )
2219 put_page(mfn_to_page(old_mfn));
2220 else
2221 put_page_and_type(mfn_to_page(old_mfn));
2224 break;
2226 #endif
2228 case MMUEXT_TLB_FLUSH_LOCAL:
2229 this_cpu(percpu_mm_info).deferred_ops |= DOP_FLUSH_TLB;
2230 break;
2232 case MMUEXT_INVLPG_LOCAL:
2233 if ( !paging_mode_enabled(d)
2234 || paging_invlpg(v, op.arg1.linear_addr) != 0 )
2235 flush_tlb_one_local(op.arg1.linear_addr);
2236 break;
2238 case MMUEXT_TLB_FLUSH_MULTI:
2239 case MMUEXT_INVLPG_MULTI:
2241 unsigned long vmask;
2242 cpumask_t pmask;
2243 if ( unlikely(copy_from_guest(&vmask, op.arg2.vcpumask, 1)) )
2245 okay = 0;
2246 break;
2248 pmask = vcpumask_to_pcpumask(d, vmask);
2249 if ( op.cmd == MMUEXT_TLB_FLUSH_MULTI )
2250 flush_tlb_mask(pmask);
2251 else
2252 flush_tlb_one_mask(pmask, op.arg1.linear_addr);
2253 break;
2256 case MMUEXT_TLB_FLUSH_ALL:
2257 flush_tlb_mask(d->domain_dirty_cpumask);
2258 break;
2260 case MMUEXT_INVLPG_ALL:
2261 flush_tlb_one_mask(d->domain_dirty_cpumask, op.arg1.linear_addr);
2262 break;
2264 case MMUEXT_FLUSH_CACHE:
2265 if ( unlikely(!cache_flush_permitted(d)) )
2267 MEM_LOG("Non-physdev domain tried to FLUSH_CACHE.");
2268 okay = 0;
2270 else
2272 wbinvd();
2274 break;
2276 case MMUEXT_SET_LDT:
2278 unsigned long ptr = op.arg1.linear_addr;
2279 unsigned long ents = op.arg2.nr_ents;
2281 if ( paging_mode_external(d) )
2283 MEM_LOG("ignoring SET_LDT hypercall from external "
2284 "domain %u", d->domain_id);
2285 okay = 0;
2287 else if ( ((ptr & (PAGE_SIZE-1)) != 0) ||
2288 (ents > 8192) ||
2289 !array_access_ok(ptr, ents, LDT_ENTRY_SIZE) )
2291 okay = 0;
2292 MEM_LOG("Bad args to SET_LDT: ptr=%lx, ents=%lx", ptr, ents);
2294 else if ( (v->arch.guest_context.ldt_ents != ents) ||
2295 (v->arch.guest_context.ldt_base != ptr) )
2297 invalidate_shadow_ldt(v);
2298 v->arch.guest_context.ldt_base = ptr;
2299 v->arch.guest_context.ldt_ents = ents;
2300 load_LDT(v);
2301 this_cpu(percpu_mm_info).deferred_ops &= ~DOP_RELOAD_LDT;
2302 if ( ents != 0 )
2303 this_cpu(percpu_mm_info).deferred_ops |= DOP_RELOAD_LDT;
2305 break;
2308 default:
2309 MEM_LOG("Invalid extended pt command 0x%x", op.cmd);
2310 rc = -ENOSYS;
2311 okay = 0;
2312 break;
2315 if ( unlikely(!okay) )
2317 rc = rc ? rc : -EINVAL;
2318 break;
2321 guest_handle_add_offset(uops, 1);
2324 process_deferred_ops();
2326 UNLOCK_BIGLOCK(d);
2328 perfc_add(num_mmuext_ops, i);
2330 out:
2331 /* Add incremental work we have done to the @done output parameter. */
2332 if ( unlikely(!guest_handle_is_null(pdone)) )
2334 done += i;
2335 copy_to_guest(pdone, &done, 1);
2338 return rc;
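
do_mmuext_op() (and do_mmu_update() below) is preemptible: when hypercall_preempt_check() fires, the remaining request count is ORed with MMU_UPDATE_PREEMPTED and handed to a continuation, while the running total of completed requests is written back through pdone. The sketch below reproduces that encode/resume pattern in isolation; the flag value, the need_preempt() stand-in and the driver loop are assumptions made for illustration.

#include <stdint.h>
#include <stdio.h>

/* Hypothetical flag bit marking a re-issued, partially completed batch,
 * in the spirit of MMU_UPDATE_PREEMPTED above. */
#define BATCH_PREEMPTED (1u << 31)

/* Pretend "should we yield?" check: here, yield every third item. */
static int need_preempt(unsigned int processed)
{
    return processed != 0 && (processed % 3) == 0;
}

/*
 * Process up to 'count' items.  If preempted, return the remaining count
 * with the flag set so the caller can re-invoke us; '*done' accumulates
 * progress across invocations, like *pdone above.
 */
static unsigned int process_batch(unsigned int count, unsigned int *done)
{
    unsigned int i;

    if (count & BATCH_PREEMPTED)
        count &= ~BATCH_PREEMPTED;        /* resuming: strip the marker */

    for (i = 0; i < count; i++)
    {
        if (need_preempt(i))
            return (count - i) | BATCH_PREEMPTED;   /* continuation */
        /* ... do one unit of work ... */
        (*done)++;
    }
    return 0;                             /* batch fully consumed */
}

int main(void)
{
    unsigned int done = 0, arg = 8;

    while ((arg = process_batch(arg, &done)) != 0)
        printf("preempted, %u remaining, %u done\n",
               arg & ~BATCH_PREEMPTED, done);
    printf("finished, done=%u\n", done);
    return 0;
}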
2341 int do_mmu_update(
2342 XEN_GUEST_HANDLE(mmu_update_t) ureqs,
2343 unsigned int count,
2344 XEN_GUEST_HANDLE(uint) pdone,
2345 unsigned int foreigndom)
2347 struct mmu_update req;
2348 void *va;
2349 unsigned long gpfn, gmfn, mfn;
2350 struct page_info *page;
2351 int rc = 0, okay = 1, i = 0;
2352 unsigned int cmd, done = 0;
2353 struct vcpu *v = current;
2354 struct domain *d = v->domain;
2355 unsigned long type_info;
2356 struct domain_mmap_cache mapcache;
2358 if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
2360 count &= ~MMU_UPDATE_PREEMPTED;
2361 if ( unlikely(!guest_handle_is_null(pdone)) )
2362 (void)copy_from_guest(&done, pdone, 1);
2364 else
2365 perfc_incr(calls_to_mmu_update);
2367 if ( unlikely(!guest_handle_okay(ureqs, count)) )
2369 rc = -EFAULT;
2370 goto out;
2373 if ( !set_foreigndom(foreigndom) )
2375 rc = -ESRCH;
2376 goto out;
2379 domain_mmap_cache_init(&mapcache);
2381 LOCK_BIGLOCK(d);
2383 for ( i = 0; i < count; i++ )
2385 if ( hypercall_preempt_check() )
2387 rc = hypercall_create_continuation(
2388 __HYPERVISOR_mmu_update, "hihi",
2389 ureqs, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
2390 break;
2393 if ( unlikely(__copy_from_guest(&req, ureqs, 1) != 0) )
2395 MEM_LOG("Bad __copy_from_guest");
2396 rc = -EFAULT;
2397 break;
2400 cmd = req.ptr & (sizeof(l1_pgentry_t)-1);
2401 okay = 0;
2403 switch ( cmd )
2405 /*
2406 * MMU_NORMAL_PT_UPDATE: Normal update to any level of page table.
2407 */
2408 case MMU_NORMAL_PT_UPDATE:
2410 rc = xsm_mmu_normal_update(current->domain, req.val);
2411 if ( rc )
2412 break;
2414 gmfn = req.ptr >> PAGE_SHIFT;
2415 mfn = gmfn_to_mfn(d, gmfn);
2417 if ( unlikely(!get_page_from_pagenr(mfn, current->domain)) )
2419 MEM_LOG("Could not get page for normal update");
2420 break;
2423 va = map_domain_page_with_cache(mfn, &mapcache);
2424 va = (void *)((unsigned long)va +
2425 (unsigned long)(req.ptr & ~PAGE_MASK));
2426 page = mfn_to_page(mfn);
2428 switch ( (type_info = page->u.inuse.type_info) & PGT_type_mask )
2430 case PGT_l1_page_table:
2431 case PGT_l2_page_table:
2432 case PGT_l3_page_table:
2433 case PGT_l4_page_table:
2435 if ( paging_mode_refcounts(d) )
2437 MEM_LOG("mmu update on auto-refcounted domain!");
2438 break;
2441 if ( unlikely(!get_page_type(
2442 page, type_info & (PGT_type_mask|PGT_pae_xen_l2))) )
2443 goto not_a_pt;
2445 switch ( type_info & PGT_type_mask )
2447 case PGT_l1_page_table:
2449 l1_pgentry_t l1e = l1e_from_intpte(req.val);
2450 okay = mod_l1_entry(va, l1e, mfn);
2452 break;
2453 case PGT_l2_page_table:
2455 l2_pgentry_t l2e = l2e_from_intpte(req.val);
2456 okay = mod_l2_entry(va, l2e, mfn, type_info);
2458 break;
2459 #if CONFIG_PAGING_LEVELS >= 3
2460 case PGT_l3_page_table:
2462 l3_pgentry_t l3e = l3e_from_intpte(req.val);
2463 okay = mod_l3_entry(va, l3e, mfn);
2465 break;
2466 #endif
2467 #if CONFIG_PAGING_LEVELS >= 4
2468 case PGT_l4_page_table:
2470 l4_pgentry_t l4e = l4e_from_intpte(req.val);
2471 okay = mod_l4_entry(d, va, l4e, mfn);
2473 break;
2474 #endif
2477 put_page_type(page);
2479 break;
2481 default:
2482 not_a_pt:
2484 if ( unlikely(!get_page_type(page, PGT_writable_page)) )
2485 break;
2487 okay = paging_write_guest_entry(v, va, req.val, _mfn(mfn));
2489 put_page_type(page);
2491 break;
2494 unmap_domain_page_with_cache(va, &mapcache);
2496 put_page(page);
2497 break;
2499 case MMU_MACHPHYS_UPDATE:
2501 mfn = req.ptr >> PAGE_SHIFT;
2502 gpfn = req.val;
2504 rc = xsm_mmu_machphys_update(current->domain, mfn);
2505 if ( rc )
2506 break;
2508 if ( unlikely(!get_page_from_pagenr(mfn, FOREIGNDOM)) )
2510 MEM_LOG("Could not get page for mach->phys update");
2511 break;
2514 if ( unlikely(paging_mode_translate(FOREIGNDOM)) )
2516 MEM_LOG("Mach-phys update on auto-translate guest");
2517 break;
2520 set_gpfn_from_mfn(mfn, gpfn);
2521 okay = 1;
2523 paging_mark_dirty(FOREIGNDOM, mfn);
2525 put_page(mfn_to_page(mfn));
2526 break;
2528 default:
2529 MEM_LOG("Invalid page update command %x", cmd);
2530 rc = -ENOSYS;
2531 okay = 0;
2532 break;
2535 if ( unlikely(!okay) )
2537 rc = rc ? rc : -EINVAL;
2538 break;
2541 guest_handle_add_offset(ureqs, 1);
2544 process_deferred_ops();
2546 UNLOCK_BIGLOCK(d);
2548 domain_mmap_cache_destroy(&mapcache);
2550 perfc_add(num_page_updates, i);
2552 out:
2553 /* Add incremental work we have done to the @done output parameter. */
2554 if ( unlikely(!guest_handle_is_null(pdone)) )
2556 done += i;
2557 copy_to_guest(pdone, &done, 1);
2560 return rc;
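
Because an l1_pgentry_t is naturally aligned, do_mmu_update() can carry the request type in the low bits of req.ptr and recover it with req.ptr & (sizeof(l1_pgentry_t)-1). The sketch below packs a command into the spare alignment bits of an 8-byte-aligned address; the command names and the 8-byte width are assumptions of the sketch (the entry is 4 bytes wide on non-PAE 32-bit builds).

#include <stdint.h>
#include <stdio.h>

/* An 8-byte-aligned PTE address has three spare low bits; they can carry
 * the request type.  Hypothetical command values for the sketch. */
#define CMD_MASK              (sizeof(uint64_t) - 1)   /* 0x7 */
#define CMD_NORMAL_PT_UPDATE  0u
#define CMD_MACHPHYS_UPDATE   1u

static uint64_t encode_req(uint64_t pte_addr, unsigned int cmd)
{
    return (pte_addr & ~(uint64_t)CMD_MASK) | (cmd & CMD_MASK);
}

static void decode_req(uint64_t ptr, uint64_t *pte_addr, unsigned int *cmd)
{
    *cmd = (unsigned int)(ptr & CMD_MASK);
    *pte_addr = ptr & ~(uint64_t)CMD_MASK;
}

int main(void)
{
    uint64_t req = encode_req(0x12345678ULL, CMD_MACHPHYS_UPDATE);
    uint64_t addr;
    unsigned int cmd;

    decode_req(req, &addr, &cmd);
    printf("addr=%#llx cmd=%u\n", (unsigned long long)addr, cmd);
    return 0;
}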
2564 static int create_grant_pte_mapping(
2565 uint64_t pte_addr, l1_pgentry_t nl1e, struct vcpu *v)
2567 int rc = GNTST_okay;
2568 void *va;
2569 unsigned long gmfn, mfn;
2570 struct page_info *page;
2571 u32 type;
2572 l1_pgentry_t ol1e;
2573 struct domain *d = v->domain;
2575 ASSERT(spin_is_locked(&d->big_lock));
2577 adjust_guest_l1e(nl1e, d);
2579 gmfn = pte_addr >> PAGE_SHIFT;
2580 mfn = gmfn_to_mfn(d, gmfn);
2582 if ( unlikely(!get_page_from_pagenr(mfn, current->domain)) )
2584 MEM_LOG("Could not get page for normal update");
2585 return GNTST_general_error;
2588 va = map_domain_page(mfn);
2589 va = (void *)((unsigned long)va + ((unsigned long)pte_addr & ~PAGE_MASK));
2590 page = mfn_to_page(mfn);
2592 type = page->u.inuse.type_info & PGT_type_mask;
2593 if ( (type != PGT_l1_page_table) || !get_page_type(page, type) )
2595 MEM_LOG("Grant map attempted to update a non-L1 page");
2596 rc = GNTST_general_error;
2597 goto failed;
2600 ol1e = *(l1_pgentry_t *)va;
2601 if ( !UPDATE_ENTRY(l1, (l1_pgentry_t *)va, ol1e, nl1e, mfn, v) )
2603 put_page_type(page);
2604 rc = GNTST_general_error;
2605 goto failed;
2608 if ( !paging_mode_refcounts(d) )
2609 put_page_from_l1e(ol1e, d);
2611 put_page_type(page);
2613 failed:
2614 unmap_domain_page(va);
2615 put_page(page);
2617 return rc;
2620 static int destroy_grant_pte_mapping(
2621 uint64_t addr, unsigned long frame, struct domain *d)
2623 int rc = GNTST_okay;
2624 void *va;
2625 unsigned long gmfn, mfn;
2626 struct page_info *page;
2627 u32 type;
2628 l1_pgentry_t ol1e;
2630 gmfn = addr >> PAGE_SHIFT;
2631 mfn = gmfn_to_mfn(d, gmfn);
2633 if ( unlikely(!get_page_from_pagenr(mfn, current->domain)) )
2635 MEM_LOG("Could not get page for normal update");
2636 return GNTST_general_error;
2639 va = map_domain_page(mfn);
2640 va = (void *)((unsigned long)va + ((unsigned long)addr & ~PAGE_MASK));
2641 page = mfn_to_page(mfn);
2643 type = page->u.inuse.type_info & PGT_type_mask;
2644 if ( (type != PGT_l1_page_table) || !get_page_type(page, type) )
2646 MEM_LOG("Grant map attempted to update a non-L1 page");
2647 rc = GNTST_general_error;
2648 goto failed;
2651 if ( __copy_from_user(&ol1e, (l1_pgentry_t *)va, sizeof(ol1e)) )
2653 put_page_type(page);
2654 rc = GNTST_general_error;
2655 goto failed;
2658 /* Check that the virtual address supplied is actually mapped to frame. */
2659 if ( unlikely((l1e_get_intpte(ol1e) >> PAGE_SHIFT) != frame) )
2661 MEM_LOG("PTE entry %lx for address %"PRIx64" doesn't match frame %lx",
2662 (unsigned long)l1e_get_intpte(ol1e), addr, frame);
2663 put_page_type(page);
2664 rc = GNTST_general_error;
2665 goto failed;
2668 /* Delete pagetable entry. */
2669 if ( unlikely(!UPDATE_ENTRY(l1,
2670 (l1_pgentry_t *)va, ol1e, l1e_empty(), mfn,
2671 d->vcpu[0] /* Change if we go to per-vcpu shadows. */)) )
2673 MEM_LOG("Cannot delete PTE entry at %p", va);
2674 put_page_type(page);
2675 rc = GNTST_general_error;
2676 goto failed;
2679 put_page_type(page);
2681 failed:
2682 unmap_domain_page(va);
2683 put_page(page);
2684 return rc;
2688 static int create_grant_va_mapping(
2689 unsigned long va, l1_pgentry_t nl1e, struct vcpu *v)
2691 l1_pgentry_t *pl1e, ol1e;
2692 struct domain *d = v->domain;
2693 unsigned long gl1mfn;
2694 int okay;
2696 ASSERT(spin_is_locked(&d->big_lock));
2698 adjust_guest_l1e(nl1e, d);
2700 pl1e = guest_map_l1e(v, va, &gl1mfn);
2701 if ( !pl1e )
2703 MEM_LOG("Could not find L1 PTE for address %lx", va);
2704 return GNTST_general_error;
2706 ol1e = *pl1e;
2707 okay = UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, v);
2708 guest_unmap_l1e(v, pl1e);
2709 pl1e = NULL;
2711 if ( !okay )
2712 return GNTST_general_error;
2714 if ( !paging_mode_refcounts(d) )
2715 put_page_from_l1e(ol1e, d);
2717 return GNTST_okay;
2720 static int replace_grant_va_mapping(
2721 unsigned long addr, unsigned long frame, l1_pgentry_t nl1e, struct vcpu *v)
2723 l1_pgentry_t *pl1e, ol1e;
2724 unsigned long gl1mfn;
2725 int rc = 0;
2727 pl1e = guest_map_l1e(v, addr, &gl1mfn);
2728 if ( !pl1e )
2730 MEM_LOG("Could not find L1 PTE for address %lx", addr);
2731 return GNTST_general_error;
2733 ol1e = *pl1e;
2735 /* Check that the virtual address supplied is actually mapped to frame. */
2736 if ( unlikely(l1e_get_pfn(ol1e) != frame) )
2738 MEM_LOG("PTE entry %lx for address %lx doesn't match frame %lx",
2739 l1e_get_pfn(ol1e), addr, frame);
2740 rc = GNTST_general_error;
2741 goto out;
2744 /* Delete pagetable entry. */
2745 if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, v)) )
2747 MEM_LOG("Cannot delete PTE entry at %p", (unsigned long *)pl1e);
2748 rc = GNTST_general_error;
2749 goto out;
2752 out:
2753 guest_unmap_l1e(v, pl1e);
2754 return rc;
2757 static int destroy_grant_va_mapping(
2758 unsigned long addr, unsigned long frame, struct vcpu *v)
2760 return replace_grant_va_mapping(addr, frame, l1e_empty(), v);
2763 int create_grant_host_mapping(uint64_t addr, unsigned long frame,
2764 unsigned int flags, unsigned int cache_flags)
2766 l1_pgentry_t pte = l1e_from_pfn(frame, GRANT_PTE_FLAGS);
2768 if ( (flags & GNTMAP_application_map) )
2769 l1e_add_flags(pte,_PAGE_USER);
2770 if ( !(flags & GNTMAP_readonly) )
2771 l1e_add_flags(pte,_PAGE_RW);
2773 l1e_add_flags(pte, cacheattr_to_pte_flags(cache_flags >> 5));
2775 if ( flags & GNTMAP_contains_pte )
2776 return create_grant_pte_mapping(addr, pte, current);
2777 return create_grant_va_mapping(addr, pte, current);
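
create_grant_host_mapping() composes the mapping PTE from the granted frame plus a base flag set, then conditionally adds _PAGE_USER for application mappings and _PAGE_RW unless the grant is read-only. The sketch below shows that flag composition using the architectural P/RW/US bit positions; the macro names and the reduced base flag set are local to the sketch (the real GRANT_PTE_FLAGS carries more bits, and the cache-attribute bits handled above are omitted).

#include <stdint.h>
#include <stdio.h>

/* Architectural x86 PTE flag bits; the names are local to this sketch,
 * not Xen's l1e_* helpers. */
#define PTE_PRESENT   (1ULL << 0)
#define PTE_RW        (1ULL << 1)
#define PTE_USER      (1ULL << 2)
#define PAGE_SHIFT_   12

/* Mirror the logic above: start from a base PTE for the granted frame,
 * then add USER for application mappings and RW unless read-only. */
static uint64_t build_grant_pte(uint64_t frame, int app_map, int readonly)
{
    uint64_t pte = (frame << PAGE_SHIFT_) | PTE_PRESENT;

    if (app_map)
        pte |= PTE_USER;
    if (!readonly)
        pte |= PTE_RW;
    return pte;
}

int main(void)
{
    uint64_t pte = build_grant_pte(0xabcdeULL, /*app_map=*/1, /*readonly=*/0);

    printf("pte=%#llx\n", (unsigned long long)pte);   /* 0xabcde007 */
    return 0;
}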
2780 int replace_grant_host_mapping(
2781 uint64_t addr, unsigned long frame, uint64_t new_addr, unsigned int flags)
2783 l1_pgentry_t *pl1e, ol1e;
2784 unsigned long gl1mfn;
2785 int rc;
2787 if ( flags & GNTMAP_contains_pte )
2789 if ( !new_addr )
2790 return destroy_grant_pte_mapping(addr, frame, current->domain);
2792 MEM_LOG("Unsupported grant table operation");
2793 return GNTST_general_error;
2796 if ( !new_addr )
2797 return destroy_grant_va_mapping(addr, frame, current);
2799 pl1e = guest_map_l1e(current, new_addr, &gl1mfn);
2800 if ( !pl1e )
2802 MEM_LOG("Could not find L1 PTE for address %lx",
2803 (unsigned long)new_addr);
2804 return GNTST_general_error;
2806 ol1e = *pl1e;
2808 if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, l1e_empty(),
2809 gl1mfn, current)) )
2811 MEM_LOG("Cannot delete PTE entry at %p", (unsigned long *)pl1e);
2812 guest_unmap_l1e(current, pl1e);
2813 return GNTST_general_error;
2816 guest_unmap_l1e(current, pl1e);
2818 rc = replace_grant_va_mapping(addr, frame, ol1e, current);
2819 if ( rc && !paging_mode_refcounts(current->domain) )
2820 put_page_from_l1e(ol1e, current->domain);
2822 return rc;
2825 int steal_page(
2826 struct domain *d, struct page_info *page, unsigned int memflags)
2828 u32 _d, _nd, x, y;
2830 spin_lock(&d->page_alloc_lock);
2832 /*
2833 * The tricky bit: atomically release ownership while there is just one
2834 * benign reference to the page (PGC_allocated). If that reference
2835 * disappears then the deallocation routine will safely spin.
2836 */
2837 _d = pickle_domptr(d);
2838 _nd = page->u.inuse._domain;
2839 y = page->count_info;
2840 do {
2841 x = y;
2842 if (unlikely((x & (PGC_count_mask|PGC_allocated)) !=
2843 (1 | PGC_allocated)) || unlikely(_nd != _d)) {
2844 MEM_LOG("gnttab_transfer: Bad page %p: ed=%p(%u), sd=%p,"
2845 " caf=%08x, taf=%" PRtype_info "\n",
2846 (void *) page_to_mfn(page),
2847 d, d->domain_id, unpickle_domptr(_nd), x,
2848 page->u.inuse.type_info);
2849 spin_unlock(&d->page_alloc_lock);
2850 return -1;
2852 __asm__ __volatile__(
2853 LOCK_PREFIX "cmpxchg8b %2"
2854 : "=d" (_nd), "=a" (y),
2855 "=m" (*(volatile u64 *)(&page->count_info))
2856 : "0" (_d), "1" (x), "c" (NULL), "b" (x) );
2857 } while (unlikely(_nd != _d) || unlikely(y != x));
2859 /*
2860 * Unlink from 'd'. At least one reference remains (now anonymous), so
2861 * no one else is spinning to try to delete this page from 'd'.
2862 */
2863 if ( !(memflags & MEMF_no_refcount) )
2864 d->tot_pages--;
2865 list_del(&page->list);
2867 spin_unlock(&d->page_alloc_lock);
2869 return 0;
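
steal_page() releases ownership with a single cmpxchg8b covering the adjacent count_info and _domain fields, so the "owned by d with exactly one PGC_allocated reference" condition is re-checked atomically with the update. A rough equivalent using C11 atomics on a packed (owner, count) word is sketched below; the field layout, the owner encoding and the steal() helper are assumptions, not Xen's representation.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Pack a page's (owner id, refcount) pair into one 64-bit word so both can
 * be checked and updated in a single compare-and-swap, much as the
 * cmpxchg8b above operates on count_info/_domain together. */
#define MAKE_STATE(owner, count)  (((uint64_t)(owner) << 32) | (uint32_t)(count))
#define STATE_OWNER(s)            ((uint32_t)((s) >> 32))
#define STATE_COUNT(s)            ((uint32_t)(s))

static bool steal(_Atomic uint64_t *state, uint32_t expected_owner,
                  uint32_t new_owner)
{
    uint64_t old = atomic_load(state);

    do {
        /* Only steal when the expected owner holds it and exactly one
         * (allocation) reference remains; otherwise bail out. */
        if (STATE_OWNER(old) != expected_owner || STATE_COUNT(old) != 1)
            return false;
    } while (!atomic_compare_exchange_weak(state, &old,
                                           MAKE_STATE(new_owner, 1)));
    return true;
}

int main(void)
{
    _Atomic uint64_t page_state = MAKE_STATE(/*owner=*/3, /*count=*/1);

    printf("stolen=%d\n", steal(&page_state, 3, 0));
    printf("owner now %u\n", (unsigned)STATE_OWNER(atomic_load(&page_state)));
    return 0;
}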
2872 int do_update_va_mapping(unsigned long va, u64 val64,
2873 unsigned long flags)
2875 l1_pgentry_t val = l1e_from_intpte(val64);
2876 struct vcpu *v = current;
2877 struct domain *d = v->domain;
2878 l1_pgentry_t *pl1e;
2879 unsigned long vmask, bmap_ptr, gl1mfn;
2880 cpumask_t pmask;
2881 int rc = 0;
2883 perfc_incr(calls_to_update_va);
2885 if ( unlikely(!__addr_ok(va) && !paging_mode_external(d)) )
2886 return -EINVAL;
2888 rc = xsm_update_va_mapping(current->domain, val);
2889 if ( rc )
2890 return rc;
2892 LOCK_BIGLOCK(d);
2894 pl1e = guest_map_l1e(v, va, &gl1mfn);
2896 if ( unlikely(!pl1e || !mod_l1_entry(pl1e, val, gl1mfn)) )
2897 rc = -EINVAL;
2899 if ( pl1e )
2900 guest_unmap_l1e(v, pl1e);
2901 pl1e = NULL;
2903 process_deferred_ops();
2905 UNLOCK_BIGLOCK(d);
2907 switch ( flags & UVMF_FLUSHTYPE_MASK )
2909 case UVMF_TLB_FLUSH:
2910 switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
2912 case UVMF_LOCAL:
2913 flush_tlb_local();
2914 break;
2915 case UVMF_ALL:
2916 flush_tlb_mask(d->domain_dirty_cpumask);
2917 break;
2918 default:
2919 if ( unlikely(!is_pv_32on64_domain(d) ?
2920 get_user(vmask, (unsigned long *)bmap_ptr) :
2921 get_user(vmask, (unsigned int *)bmap_ptr)) )
2922 rc = -EFAULT;
2923 pmask = vcpumask_to_pcpumask(d, vmask);
2924 flush_tlb_mask(pmask);
2925 break;
2927 break;
2929 case UVMF_INVLPG:
2930 switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
2932 case UVMF_LOCAL:
2933 if ( !paging_mode_enabled(d)
2934 || (paging_invlpg(current, va) != 0) )
2935 flush_tlb_one_local(va);
2936 break;
2937 case UVMF_ALL:
2938 flush_tlb_one_mask(d->domain_dirty_cpumask, va);
2939 break;
2940 default:
2941 if ( unlikely(!is_pv_32on64_domain(d) ?
2942 get_user(vmask, (unsigned long *)bmap_ptr) :
2943 get_user(vmask, (unsigned int *)bmap_ptr)) )
2944 rc = -EFAULT;
2945 pmask = vcpumask_to_pcpumask(d, vmask);
2946 flush_tlb_one_mask(pmask, va);
2947 break;
2949 break;
2952 return rc;
2955 int do_update_va_mapping_otherdomain(unsigned long va, u64 val64,
2956 unsigned long flags,
2957 domid_t domid)
2959 int rc;
2961 if ( unlikely(!IS_PRIV(current->domain)) )
2962 return -EPERM;
2964 if ( !set_foreigndom(domid) )
2965 return -ESRCH;
2967 rc = do_update_va_mapping(va, val64, flags);
2969 BUG_ON(this_cpu(percpu_mm_info).deferred_ops);
2970 process_deferred_ops(); /* only to clear foreigndom */
2972 return rc;
2977 /*************************
2978 * Descriptor Tables
2979 */
2981 void destroy_gdt(struct vcpu *v)
2983 int i;
2984 unsigned long pfn;
2986 v->arch.guest_context.gdt_ents = 0;
2987 for ( i = 0; i < FIRST_RESERVED_GDT_PAGE; i++ )
2989 if ( (pfn = l1e_get_pfn(v->arch.perdomain_ptes[i])) != 0 )
2990 put_page_and_type(mfn_to_page(pfn));
2991 l1e_write(&v->arch.perdomain_ptes[i], l1e_empty());
2992 v->arch.guest_context.gdt_frames[i] = 0;
2997 long set_gdt(struct vcpu *v,
2998 unsigned long *frames,
2999 unsigned int entries)
3001 struct domain *d = v->domain;
3002 /* NB. There are 512 8-byte entries per GDT page. */
3003 int i, nr_pages = (entries + 511) / 512;
3004 unsigned long mfn;
3006 if ( entries > FIRST_RESERVED_GDT_ENTRY )
3007 return -EINVAL;
3009 /* Check the pages in the new GDT. */
3010 for ( i = 0; i < nr_pages; i++ ) {
3011 mfn = frames[i] = gmfn_to_mfn(d, frames[i]);
3012 if ( !mfn_valid(mfn) ||
3013 !get_page_and_type(mfn_to_page(mfn), d, PGT_gdt_page) )
3014 goto fail;
3017 /* Tear down the old GDT. */
3018 destroy_gdt(v);
3020 /* Install the new GDT. */
3021 v->arch.guest_context.gdt_ents = entries;
3022 for ( i = 0; i < nr_pages; i++ )
3024 v->arch.guest_context.gdt_frames[i] = frames[i];
3025 l1e_write(&v->arch.perdomain_ptes[i],
3026 l1e_from_pfn(frames[i], __PAGE_HYPERVISOR));
3029 return 0;
3031 fail:
3032 while ( i-- > 0 )
3033 put_page_and_type(mfn_to_page(frames[i]));
3034 return -EINVAL;
3038 long do_set_gdt(XEN_GUEST_HANDLE(ulong) frame_list, unsigned int entries)
3040 int nr_pages = (entries + 511) / 512;
3041 unsigned long frames[16];
3042 long ret;
3044 /* Rechecked in set_gdt, but ensures a sane limit for copy_from_user(). */
3045 if ( entries > FIRST_RESERVED_GDT_ENTRY )
3046 return -EINVAL;
3048 if ( copy_from_guest(frames, frame_list, nr_pages) )
3049 return -EFAULT;
3051 LOCK_BIGLOCK(current->domain);
3053 if ( (ret = set_gdt(current, frames, entries)) == 0 )
3054 flush_tlb_local();
3056 UNLOCK_BIGLOCK(current->domain);
3058 return ret;
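
With 8-byte descriptors in 4kB frames, a GDT occupies ceil(entries/512) pages, which is what the (entries + 511) / 512 expression in set_gdt() and do_set_gdt() computes. A trivial standalone version of that arithmetic:

#include <stdio.h>

/* 4096-byte page / 8-byte descriptor = 512 entries per GDT page, so the
 * number of frames needed is a ceiling division, as in set_gdt() above. */
#define ENTRIES_PER_PAGE  (4096 / 8)

static unsigned int gdt_pages_needed(unsigned int entries)
{
    return (entries + ENTRIES_PER_PAGE - 1) / ENTRIES_PER_PAGE;
}

int main(void)
{
    printf("%u %u %u\n",
           gdt_pages_needed(1),      /* 1 */
           gdt_pages_needed(512),    /* 1 */
           gdt_pages_needed(513));   /* 2 */
    return 0;
}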
3062 long do_update_descriptor(u64 pa, u64 desc)
3064 struct domain *dom = current->domain;
3065 unsigned long gmfn = pa >> PAGE_SHIFT;
3066 unsigned long mfn;
3067 unsigned int offset;
3068 struct desc_struct *gdt_pent, d;
3069 struct page_info *page;
3070 long ret = -EINVAL;
3072 offset = ((unsigned int)pa & ~PAGE_MASK) / sizeof(struct desc_struct);
3074 *(u64 *)&d = desc;
3076 LOCK_BIGLOCK(dom);
3078 mfn = gmfn_to_mfn(dom, gmfn);
3079 if ( (((unsigned int)pa % sizeof(struct desc_struct)) != 0) ||
3080 !mfn_valid(mfn) ||
3081 !check_descriptor(dom, &d) )
3083 UNLOCK_BIGLOCK(dom);
3084 return -EINVAL;
3087 page = mfn_to_page(mfn);
3088 if ( unlikely(!get_page(page, dom)) )
3090 UNLOCK_BIGLOCK(dom);
3091 return -EINVAL;
3094 /* Check if the given frame is in use in an unsafe context. */
3095 switch ( page->u.inuse.type_info & PGT_type_mask )
3097 case PGT_gdt_page:
3098 if ( unlikely(!get_page_type(page, PGT_gdt_page)) )
3099 goto out;
3100 break;
3101 case PGT_ldt_page:
3102 if ( unlikely(!get_page_type(page, PGT_ldt_page)) )
3103 goto out;
3104 break;
3105 default:
3106 if ( unlikely(!get_page_type(page, PGT_writable_page)) )
3107 goto out;
3108 break;
3111 paging_mark_dirty(dom, mfn);
3113 /* All is good so make the update. */
3114 gdt_pent = map_domain_page(mfn);
3115 memcpy(&gdt_pent[offset], &d, 8);
3116 unmap_domain_page(gdt_pent);
3118 put_page_type(page);
3120 ret = 0; /* success */
3122 out:
3123 put_page(page);
3125 UNLOCK_BIGLOCK(dom);
3127 return ret;
3130 typedef struct e820entry e820entry_t;
3131 DEFINE_XEN_GUEST_HANDLE(e820entry_t);
3133 long arch_memory_op(int op, XEN_GUEST_HANDLE(void) arg)
3135 switch ( op )
3137 case XENMEM_add_to_physmap:
3139 struct xen_add_to_physmap xatp;
3140 unsigned long prev_mfn, mfn = 0, gpfn;
3141 struct domain *d;
3143 if ( copy_from_guest(&xatp, arg, 1) )
3144 return -EFAULT;
3146 if ( xatp.domid == DOMID_SELF )
3147 d = rcu_lock_current_domain();
3148 else if ( !IS_PRIV(current->domain) )
3149 return -EPERM;
3150 else if ( (d = rcu_lock_domain_by_id(xatp.domid)) == NULL )
3151 return -ESRCH;
3153 if ( xsm_add_to_physmap(current->domain, d) )
3155 rcu_unlock_domain(d);
3156 return -EPERM;
3159 switch ( xatp.space )
3161 case XENMAPSPACE_shared_info:
3162 if ( xatp.idx == 0 )
3163 mfn = virt_to_mfn(d->shared_info);
 3164 /* XXX: assumption here: this is called after the E820 table is built,
 3165 * as we need the E820 to initialize the MTRRs.
3166 */
3167 if ( is_hvm_domain(d) ) {
3168 extern void init_mtrr_in_hyper(struct vcpu *);
3169 struct vcpu *vs;
3170 for_each_vcpu(d, vs)
3171 init_mtrr_in_hyper(vs);
3173 break;
3174 case XENMAPSPACE_grant_table:
3175 spin_lock(&d->grant_table->lock);
3177 if ( (xatp.idx >= nr_grant_frames(d->grant_table)) &&
3178 (xatp.idx < max_nr_grant_frames) )
3179 gnttab_grow_table(d, xatp.idx + 1);
3181 if ( xatp.idx < nr_grant_frames(d->grant_table) )
3182 mfn = virt_to_mfn(d->grant_table->shared[xatp.idx]);
3184 spin_unlock(&d->grant_table->lock);
3185 break;
3186 default:
3187 break;
3190 if ( !paging_mode_translate(d) || (mfn == 0) )
3192 rcu_unlock_domain(d);
3193 return -EINVAL;
3196 LOCK_BIGLOCK(d);
3198 /* Remove previously mapped page if it was present. */
3199 prev_mfn = gmfn_to_mfn(d, xatp.gpfn);
3200 if ( mfn_valid(prev_mfn) )
3202 if ( is_xen_heap_mfn(prev_mfn) )
3203 /* Xen heap frames are simply unhooked from this phys slot. */
3204 guest_physmap_remove_page(d, xatp.gpfn, prev_mfn);
3205 else
3206 /* Normal domain memory is freed, to avoid leaking memory. */
3207 guest_remove_page(d, xatp.gpfn);
3210 /* Unmap from old location, if any. */
3211 gpfn = get_gpfn_from_mfn(mfn);
3212 if ( gpfn != INVALID_M2P_ENTRY )
3213 guest_physmap_remove_page(d, gpfn, mfn);
3215 /* Map at new location. */
3216 guest_physmap_add_page(d, xatp.gpfn, mfn);
3218 UNLOCK_BIGLOCK(d);
3220 rcu_unlock_domain(d);
3222 break;
3225 case XENMEM_set_memory_map:
3227 struct xen_foreign_memory_map fmap;
3228 struct domain *d;
3229 int rc;
3231 if ( copy_from_guest(&fmap, arg, 1) )
3232 return -EFAULT;
3234 if ( fmap.map.nr_entries > ARRAY_SIZE(d->arch.e820) )
3235 return -EINVAL;
3237 if ( fmap.domid == DOMID_SELF )
3238 d = rcu_lock_current_domain();
3239 else if ( !IS_PRIV(current->domain) )
3240 return -EPERM;
3241 else if ( (d = rcu_lock_domain_by_id(fmap.domid)) == NULL )
3242 return -ESRCH;
3244 rc = xsm_domain_memory_map(d);
3245 if ( rc )
3247 rcu_unlock_domain(d);
3248 return rc;
3251 rc = copy_from_guest(d->arch.e820, fmap.map.buffer,
3252 fmap.map.nr_entries) ? -EFAULT : 0;
3253 d->arch.nr_e820 = fmap.map.nr_entries;
3255 rcu_unlock_domain(d);
3256 return rc;
3259 case XENMEM_memory_map:
3261 struct xen_memory_map map;
3262 struct domain *d = current->domain;
3264 /* Backwards compatibility. */
3265 if ( d->arch.nr_e820 == 0 )
3266 return -ENOSYS;
3268 if ( copy_from_guest(&map, arg, 1) )
3269 return -EFAULT;
3271 map.nr_entries = min(map.nr_entries, d->arch.nr_e820);
3272 if ( copy_to_guest(map.buffer, d->arch.e820, map.nr_entries) ||
3273 copy_to_guest(arg, &map, 1) )
3274 return -EFAULT;
3276 return 0;
3279 case XENMEM_machine_memory_map:
3281 struct xen_memory_map memmap;
3282 XEN_GUEST_HANDLE(e820entry_t) buffer;
3283 int count;
3284 int rc;
3286 if ( !IS_PRIV(current->domain) )
3287 return -EINVAL;
3289 rc = xsm_machine_memory_map();
3290 if ( rc )
3291 return rc;
3293 if ( copy_from_guest(&memmap, arg, 1) )
3294 return -EFAULT;
3295 if ( memmap.nr_entries < e820.nr_map + 1 )
3296 return -EINVAL;
3298 buffer = guest_handle_cast(memmap.buffer, e820entry_t);
3300 count = min((unsigned int)e820.nr_map, memmap.nr_entries);
3301 if ( copy_to_guest(buffer, e820.map, count) < 0 )
3302 return -EFAULT;
3304 memmap.nr_entries = count;
3306 if ( copy_to_guest(arg, &memmap, 1) )
3307 return -EFAULT;
3309 return 0;
3312 case XENMEM_machphys_mapping:
3314 static const struct xen_machphys_mapping mapping = {
3315 .v_start = MACH2PHYS_VIRT_START,
3316 .v_end = MACH2PHYS_VIRT_END,
3317 .max_mfn = MACH2PHYS_NR_ENTRIES - 1
3318 };
3320 if ( copy_to_guest(arg, &mapping, 1) )
3321 return -EFAULT;
3323 return 0;
3326 default:
3327 return subarch_memory_op(op, arg);
3330 return 0;
3334 /*************************
3335 * Writable Pagetables
3336 */
3338 struct ptwr_emulate_ctxt {
3339 struct x86_emulate_ctxt ctxt;
3340 unsigned long cr2;
3341 l1_pgentry_t pte;
3342 };
3344 static int ptwr_emulated_read(
3345 enum x86_segment seg,
3346 unsigned long offset,
3347 unsigned long *val,
3348 unsigned int bytes,
3349 struct x86_emulate_ctxt *ctxt)
3351 unsigned int rc;
3352 unsigned long addr = offset;
3354 *val = 0;
3355 if ( (rc = copy_from_user((void *)val, (void *)addr, bytes)) != 0 )
3357 propagate_page_fault(addr + bytes - rc, 0); /* read fault */
3358 return X86EMUL_EXCEPTION;
3361 return X86EMUL_OKAY;
3364 static int ptwr_emulated_update(
3365 unsigned long addr,
3366 paddr_t old,
3367 paddr_t val,
3368 unsigned int bytes,
3369 unsigned int do_cmpxchg,
3370 struct ptwr_emulate_ctxt *ptwr_ctxt)
3372 unsigned long mfn;
3373 unsigned long unaligned_addr = addr;
3374 struct page_info *page;
3375 l1_pgentry_t pte, ol1e, nl1e, *pl1e;
3376 struct vcpu *v = current;
3377 struct domain *d = v->domain;
3379 /* Only allow naturally-aligned stores within the original %cr2 page. */
3380 if ( unlikely(((addr^ptwr_ctxt->cr2) & PAGE_MASK) || (addr & (bytes-1))) )
3382 MEM_LOG("Bad ptwr access (cr2=%lx, addr=%lx, bytes=%u)",
3383 ptwr_ctxt->cr2, addr, bytes);
3384 return X86EMUL_UNHANDLEABLE;
3387 /* Turn a sub-word access into a full-word access. */
3388 if ( bytes != sizeof(paddr_t) )
3390 paddr_t full;
3391 unsigned int rc, offset = addr & (sizeof(paddr_t)-1);
3393 /* Align address; read full word. */
3394 addr &= ~(sizeof(paddr_t)-1);
3395 if ( (rc = copy_from_user(&full, (void *)addr, sizeof(paddr_t))) != 0 )
3397 propagate_page_fault(addr+sizeof(paddr_t)-rc, 0); /* read fault */
3398 return X86EMUL_EXCEPTION;
3400 /* Mask out bits provided by caller. */
3401 full &= ~((((paddr_t)1 << (bytes*8)) - 1) << (offset*8));
3402 /* Shift the caller value and OR in the missing bits. */
3403 val &= (((paddr_t)1 << (bytes*8)) - 1);
3404 val <<= (offset)*8;
3405 val |= full;
3406 /* Also fill in missing parts of the cmpxchg old value. */
3407 old &= (((paddr_t)1 << (bytes*8)) - 1);
3408 old <<= (offset)*8;
3409 old |= full;
3412 pte = ptwr_ctxt->pte;
3413 mfn = l1e_get_pfn(pte);
3414 page = mfn_to_page(mfn);
3416 /* We are looking only for read-only mappings of p.t. pages. */
3417 ASSERT((l1e_get_flags(pte) & (_PAGE_RW|_PAGE_PRESENT)) == _PAGE_PRESENT);
3418 ASSERT(mfn_valid(mfn));
3419 ASSERT((page->u.inuse.type_info & PGT_type_mask) == PGT_l1_page_table);
3420 ASSERT((page->u.inuse.type_info & PGT_count_mask) != 0);
3421 ASSERT(page_get_owner(page) == d);
3423 /* Check the new PTE. */
3424 nl1e = l1e_from_intpte(val);
3425 if ( unlikely(!get_page_from_l1e(nl1e, d)) )
3427 if ( (CONFIG_PAGING_LEVELS >= 3) && is_pv_32bit_domain(d) &&
3428 (bytes == 4) && (unaligned_addr & 4) && !do_cmpxchg &&
3429 (l1e_get_flags(nl1e) & _PAGE_PRESENT) )
3431 /*
3432 * If this is an upper-half write to a PAE PTE then we assume that
3433 * the guest has simply got the two writes the wrong way round. We
3434 * zap the PRESENT bit on the assumption that the bottom half will
3435 * be written immediately after we return to the guest.
3436 */
3437 MEM_LOG("ptwr_emulate: fixing up invalid PAE PTE %"PRIpte,
3438 l1e_get_intpte(nl1e));
3439 l1e_remove_flags(nl1e, _PAGE_PRESENT);
3441 else
3443 MEM_LOG("ptwr_emulate: could not get_page_from_l1e()");
3444 return X86EMUL_UNHANDLEABLE;
3448 adjust_guest_l1e(nl1e, d);
3450 /* Checked successfully: do the update (write or cmpxchg). */
3451 pl1e = map_domain_page(mfn);
3452 pl1e = (l1_pgentry_t *)((unsigned long)pl1e + (addr & ~PAGE_MASK));
3453 if ( do_cmpxchg )
3455 int okay;
3456 intpte_t t = old;
3457 ol1e = l1e_from_intpte(old);
3459 okay = paging_cmpxchg_guest_entry(v, &l1e_get_intpte(*pl1e),
3460 &t, l1e_get_intpte(nl1e), _mfn(mfn));
3461 okay = (okay && t == old);
3463 if ( !okay )
3465 unmap_domain_page(pl1e);
3466 put_page_from_l1e(nl1e, d);
3467 return X86EMUL_CMPXCHG_FAILED;
3470 else
3472 ol1e = *pl1e;
3473 if ( !UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, mfn, v) )
3474 BUG();
3477 trace_ptwr_emulation(addr, nl1e);
3479 unmap_domain_page(pl1e);
3481 /* Finally, drop the old PTE. */
3482 put_page_from_l1e(ol1e, d);
3484 return X86EMUL_OKAY;
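
ptwr_emulated_update() widens a sub-word guest write into a full PTE-sized update by reading the existing word, masking out the bytes being written, then shifting and ORing in the caller's value (and likewise the cmpxchg "old" value). That arithmetic is isolated in the sketch below; it assumes little-endian byte order and a write narrower than the full word, as in the path above.

#include <stdint.h>
#include <stdio.h>

/*
 * Merge a 'bytes'-wide write of 'val' at byte 'offset' into the existing
 * full word 'full', mirroring the mask/shift/OR sequence above.
 * Assumes bytes < sizeof(uint64_t).
 */
static uint64_t merge_subword(uint64_t full, uint64_t val,
                              unsigned int offset, unsigned int bytes)
{
    uint64_t field_mask = (((uint64_t)1 << (bytes * 8)) - 1) << (offset * 8);

    full &= ~field_mask;                         /* clear the target bytes */
    full |= (val << (offset * 8)) & field_mask;  /* drop in the new bytes  */
    return full;
}

int main(void)
{
    /* Write the 2-byte value 0xBEEF at byte offset 2 of an 8-byte word. */
    uint64_t word = 0x1122334455667788ULL;

    printf("%#llx\n",
           (unsigned long long)merge_subword(word, 0xBEEF, 2, 2));
    /* prints 0x11223344beef7788 */
    return 0;
}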
3487 static int ptwr_emulated_write(
3488 enum x86_segment seg,
3489 unsigned long offset,
3490 unsigned long val,
3491 unsigned int bytes,
3492 struct x86_emulate_ctxt *ctxt)
3494 return ptwr_emulated_update(
3495 offset, 0, val, bytes, 0,
3496 container_of(ctxt, struct ptwr_emulate_ctxt, ctxt));
3499 static int ptwr_emulated_cmpxchg(
3500 enum x86_segment seg,
3501 unsigned long offset,
3502 unsigned long old,
3503 unsigned long new,
3504 unsigned int bytes,
3505 struct x86_emulate_ctxt *ctxt)
3507 return ptwr_emulated_update(
3508 offset, old, new, bytes, 1,
3509 container_of(ctxt, struct ptwr_emulate_ctxt, ctxt));
3512 static int ptwr_emulated_cmpxchg8b(
3513 enum x86_segment seg,
3514 unsigned long offset,
3515 unsigned long old,
3516 unsigned long old_hi,
3517 unsigned long new,
3518 unsigned long new_hi,
3519 struct x86_emulate_ctxt *ctxt)
3521 if ( CONFIG_PAGING_LEVELS == 2 )
3522 return X86EMUL_UNHANDLEABLE;
3523 return ptwr_emulated_update(
3524 offset, ((u64)old_hi << 32) | old, ((u64)new_hi << 32) | new, 8, 1,
3525 container_of(ctxt, struct ptwr_emulate_ctxt, ctxt));
3528 static struct x86_emulate_ops ptwr_emulate_ops = {
3529 .read = ptwr_emulated_read,
3530 .insn_fetch = ptwr_emulated_read,
3531 .write = ptwr_emulated_write,
3532 .cmpxchg = ptwr_emulated_cmpxchg,
3533 .cmpxchg8b = ptwr_emulated_cmpxchg8b
3534 };
3536 /* Write page fault handler: check if guest is trying to modify a PTE. */
3537 int ptwr_do_page_fault(struct vcpu *v, unsigned long addr,
3538 struct cpu_user_regs *regs)
3540 struct domain *d = v->domain;
3541 struct page_info *page;
3542 l1_pgentry_t pte;
3543 struct ptwr_emulate_ctxt ptwr_ctxt;
3544 int rc;
3546 LOCK_BIGLOCK(d);
3548 /* Attempt to read the PTE that maps the VA being accessed. */
3549 guest_get_eff_l1e(v, addr, &pte);
3550 page = l1e_get_page(pte);
3552 /* We are looking only for read-only mappings of p.t. pages. */
3553 if ( ((l1e_get_flags(pte) & (_PAGE_PRESENT|_PAGE_RW)) != _PAGE_PRESENT) ||
3554 !mfn_valid(l1e_get_pfn(pte)) ||
3555 ((page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table) ||
3556 ((page->u.inuse.type_info & PGT_count_mask) == 0) ||
3557 (page_get_owner(page) != d) )
3558 goto bail;
3560 ptwr_ctxt.ctxt.regs = regs;
3561 ptwr_ctxt.ctxt.addr_size = ptwr_ctxt.ctxt.sp_size =
3562 is_pv_32on64_domain(d) ? 32 : BITS_PER_LONG;
3563 ptwr_ctxt.cr2 = addr;
3564 ptwr_ctxt.pte = pte;
3566 rc = x86_emulate(&ptwr_ctxt.ctxt, &ptwr_emulate_ops);
3567 if ( rc == X86EMUL_UNHANDLEABLE )
3568 goto bail;
3570 UNLOCK_BIGLOCK(d);
3571 perfc_incr(ptwr_emulations);
3572 return EXCRET_fault_fixed;
3574 bail:
3575 UNLOCK_BIGLOCK(d);
3576 return 0;
3579 void free_xen_pagetable(void *v)
3581 extern int early_boot;
3583 BUG_ON(early_boot);
3585 if ( is_xen_heap_page(virt_to_page(v)) )
3586 free_xenheap_page(v);
3587 else
3588 free_domheap_page(virt_to_page(v));
 3591 /* Convert to/from superpage-mapping flags for map_pages_to_xen(). */
3592 #define l1f_to_l2f(f) ((f) | _PAGE_PSE)
3593 #define l2f_to_l1f(f) ((f) & ~_PAGE_PSE)
3595 /*
3596 * map_pages_to_xen() can be called with interrupts disabled:
3597 * * During early bootstrap; or
3598 * * alloc_xenheap_pages() via memguard_guard_range
3599 * In these cases it is safe to use flush_area_local():
3600 * * Because only the local CPU is online; or
3601 * * Because stale TLB entries do not matter for memguard_[un]guard_range().
3602 */
3603 #define flush_area(v,f) (!local_irq_is_enabled() ? \
3604 flush_area_local((const void *)v, f) : \
3605 flush_area_all((const void *)v, f))
3607 int map_pages_to_xen(
3608 unsigned long virt,
3609 unsigned long mfn,
3610 unsigned long nr_mfns,
3611 unsigned int flags)
3613 l2_pgentry_t *pl2e, ol2e;
3614 l1_pgentry_t *pl1e, ol1e;
3615 unsigned int i;
3617 while ( nr_mfns != 0 )
3619 pl2e = virt_to_xen_l2e(virt);
3621 if ( ((((virt>>PAGE_SHIFT) | mfn) & ((1<<PAGETABLE_ORDER)-1)) == 0) &&
3622 (nr_mfns >= (1<<PAGETABLE_ORDER)) &&
3623 !(flags & (_PAGE_PAT|MAP_SMALL_PAGES)) )
3625 /* Super-page mapping. */
3626 ol2e = *pl2e;
3627 l2e_write_atomic(pl2e, l2e_from_pfn(mfn, l1f_to_l2f(flags)));
3629 if ( (l2e_get_flags(ol2e) & _PAGE_PRESENT) )
3631 unsigned int flush_flags =
3632 FLUSH_TLB | FLUSH_ORDER(PAGETABLE_ORDER);
3634 if ( l2e_get_flags(ol2e) & _PAGE_PSE )
3636 if ( l2e_get_flags(ol2e) & _PAGE_GLOBAL )
3637 flush_flags |= FLUSH_TLB_GLOBAL;
3638 if ( (l2e_get_flags(ol2e) ^ l1f_to_l2f(flags)) &
3639 l1f_to_l2f(PAGE_CACHE_ATTRS) )
3640 flush_flags |= FLUSH_CACHE;
3641 flush_area(virt, flush_flags);
3643 else
3645 pl1e = l2e_to_l1e(ol2e);
3646 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
3648 if ( l1e_get_flags(pl1e[i]) & _PAGE_GLOBAL )
3649 flush_flags |= FLUSH_TLB_GLOBAL;
3650 if ( (l1e_get_flags(pl1e[i]) ^ flags) &
3651 PAGE_CACHE_ATTRS )
3652 flush_flags |= FLUSH_CACHE;
3654 flush_area(virt, flush_flags);
3655 free_xen_pagetable(pl1e);
3659 virt += 1UL << L2_PAGETABLE_SHIFT;
3660 mfn += 1UL << PAGETABLE_ORDER;
3661 nr_mfns -= 1UL << PAGETABLE_ORDER;
3663 else
3665 /* Normal page mapping. */
3666 if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) )
3668 pl1e = alloc_xen_pagetable();
3669 if ( pl1e == NULL )
3670 return -ENOMEM;
3671 clear_page(pl1e);
3672 l2e_write(pl2e, l2e_from_pfn(virt_to_mfn(pl1e),
3673 __PAGE_HYPERVISOR));
3675 else if ( l2e_get_flags(*pl2e) & _PAGE_PSE )
3677 unsigned int flush_flags =
3678 FLUSH_TLB | FLUSH_ORDER(PAGETABLE_ORDER);
3680 /* Skip this PTE if there is no change. */
3681 if ( (((l2e_get_pfn(*pl2e) & ~(L1_PAGETABLE_ENTRIES - 1)) +
3682 l1_table_offset(virt)) == mfn) &&
3683 (((l2f_to_l1f(l2e_get_flags(*pl2e)) ^ flags) &
3684 ~(_PAGE_ACCESSED|_PAGE_DIRTY)) == 0) )
3686 virt += 1UL << L1_PAGETABLE_SHIFT;
3687 mfn += 1UL;
3688 nr_mfns -= 1UL;
3689 continue;
3692 pl1e = alloc_xen_pagetable();
3693 if ( pl1e == NULL )
3694 return -ENOMEM;
3696 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
3697 l1e_write(&pl1e[i],
3698 l1e_from_pfn(l2e_get_pfn(*pl2e) + i,
3699 l2f_to_l1f(l2e_get_flags(*pl2e))));
3701 if ( l2e_get_flags(*pl2e) & _PAGE_GLOBAL )
3702 flush_flags |= FLUSH_TLB_GLOBAL;
3704 l2e_write_atomic(pl2e, l2e_from_pfn(virt_to_mfn(pl1e),
3705 __PAGE_HYPERVISOR));
3706 flush_area(virt, flush_flags);
3709 pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(virt);
3710 ol1e = *pl1e;
3711 l1e_write_atomic(pl1e, l1e_from_pfn(mfn, flags));
3712 if ( (l1e_get_flags(ol1e) & _PAGE_PRESENT) )
3714 unsigned int flush_flags = FLUSH_TLB | FLUSH_ORDER(0);
3715 if ( l1e_get_flags(ol1e) & _PAGE_GLOBAL )
3716 flush_flags |= FLUSH_TLB_GLOBAL;
3717 if ( (l1e_get_flags(ol1e) ^ flags) & PAGE_CACHE_ATTRS )
3718 flush_flags |= FLUSH_CACHE;
3719 flush_area(virt, flush_flags);
3722 virt += 1UL << L1_PAGETABLE_SHIFT;
3723 mfn += 1UL;
3724 nr_mfns -= 1UL;
3726 if ( (flags == PAGE_HYPERVISOR) &&
3727 ((nr_mfns == 0) ||
3728 ((((virt >> PAGE_SHIFT) | mfn) &
3729 ((1 << PAGETABLE_ORDER) - 1)) == 0)) )
3731 unsigned long base_mfn;
3732 pl1e = l2e_to_l1e(*pl2e);
3733 base_mfn = l1e_get_pfn(*pl1e) & ~(L1_PAGETABLE_ENTRIES - 1);
3734 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++, pl1e++ )
3735 if ( (l1e_get_pfn(*pl1e) != (base_mfn + i)) ||
3736 (l1e_get_flags(*pl1e) != flags) )
3737 break;
3738 if ( i == L1_PAGETABLE_ENTRIES )
3740 ol2e = *pl2e;
3741 l2e_write_atomic(pl2e, l2e_from_pfn(base_mfn,
3742 l1f_to_l2f(flags)));
3743 flush_area(virt, (FLUSH_TLB_GLOBAL |
3744 FLUSH_ORDER(PAGETABLE_ORDER)));
3745 free_xen_pagetable(l2e_to_l1e(ol2e));
3751 return 0;
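
map_pages_to_xen() takes the superpage path only when the virtual page number and the machine frame number are both aligned to a 2MB boundary, at least a superpage's worth of frames remains, and neither _PAGE_PAT nor MAP_SMALL_PAGES is requested. The alignment-and-count part of that test is sketched below with illustrative constants.

#include <stdbool.h>
#include <stdio.h>

#define PAGETABLE_ORDER_  9              /* 512 4kB pages per 2MB superpage */
#define SUPERPAGE_PAGES   (1UL << PAGETABLE_ORDER_)

/* A superpage can be used only if the virtual page number and the machine
 * frame number are each aligned to a superpage boundary (their OR has no
 * low bits set) and at least a full superpage's worth of frames is left;
 * this is the test map_pages_to_xen() applies above. */
static bool can_use_superpage(unsigned long vpfn, unsigned long mfn,
                              unsigned long nr_mfns)
{
    return (((vpfn | mfn) & (SUPERPAGE_PAGES - 1)) == 0) &&
           (nr_mfns >= SUPERPAGE_PAGES);
}

int main(void)
{
    printf("%d %d %d\n",
           can_use_superpage(0x200, 0x400, 1024),  /* 1: both aligned    */
           can_use_superpage(0x200, 0x401, 1024),  /* 0: mfn misaligned  */
           can_use_superpage(0x200, 0x400, 100));  /* 0: too few frames  */
    return 0;
}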
3754 void destroy_xen_mappings(unsigned long s, unsigned long e)
3756 l2_pgentry_t *pl2e;
3757 l1_pgentry_t *pl1e;
3758 unsigned int i;
3759 unsigned long v = s;
3761 ASSERT((s & ~PAGE_MASK) == 0);
3762 ASSERT((e & ~PAGE_MASK) == 0);
3764 while ( v < e )
3766 pl2e = virt_to_xen_l2e(v);
3768 if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) )
3770 v += 1UL << L2_PAGETABLE_SHIFT;
3771 v &= ~((1UL << L2_PAGETABLE_SHIFT) - 1);
3772 continue;
3775 if ( l2e_get_flags(*pl2e) & _PAGE_PSE )
3777 if ( (l1_table_offset(v) == 0) &&
3778 ((e-v) >= (1UL << L2_PAGETABLE_SHIFT)) )
3780 /* PSE: whole superpage is destroyed. */
3781 l2e_write_atomic(pl2e, l2e_empty());
3782 v += 1UL << L2_PAGETABLE_SHIFT;
3784 else
3786 /* PSE: shatter the superpage and try again. */
3787 pl1e = alloc_xen_pagetable();
3788 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
3789 l1e_write(&pl1e[i],
3790 l1e_from_pfn(l2e_get_pfn(*pl2e) + i,
3791 l2e_get_flags(*pl2e) & ~_PAGE_PSE));
3792 l2e_write_atomic(pl2e, l2e_from_pfn(virt_to_mfn(pl1e),
3793 __PAGE_HYPERVISOR));
3796 else
3798 /* Ordinary 4kB mapping. */
3799 pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(v);
3800 l1e_write_atomic(pl1e, l1e_empty());
3801 v += PAGE_SIZE;
3803 /* If we are done with the L2E, check if it is now empty. */
3804 if ( (v != e) && (l1_table_offset(v) != 0) )
3805 continue;
3806 pl1e = l2e_to_l1e(*pl2e);
3807 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
3808 if ( l1e_get_intpte(pl1e[i]) != 0 )
3809 break;
3810 if ( i == L1_PAGETABLE_ENTRIES )
3812 /* Empty: zap the L2E and free the L1 page. */
3813 l2e_write_atomic(pl2e, l2e_empty());
3814 flush_all(FLUSH_TLB_GLOBAL); /* flush before free */
3815 free_xen_pagetable(pl1e);
3820 flush_all(FLUSH_TLB_GLOBAL);
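
When only part of a 2MB PSE mapping is being torn down, destroy_xen_mappings() shatters it: a fresh L1 table is allocated and each of its 512 entries is written to cover one 4kB page of the old superpage, with the PSE bit dropped. A simplified standalone sketch of that shattering step follows; the entry encoding and helper names are assumptions of the sketch.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define ENTRIES_PER_TABLE 512
#define FLAG_PSE          (1ULL << 7)   /* architectural PSE bit */
#define PAGE_SHIFT_       12

/* "Shatter" a 2MB mapping: build a fresh table of 512 4kB entries that
 * together cover the same range, copying the superpage's flags minus PSE,
 * as the destroy path above does before zapping individual entries.
 * The (pfn << 12 | flags) encoding is a simplified stand-in for l1e_*. */
static uint64_t *shatter_superpage(uint64_t base_pfn, uint64_t flags)
{
    uint64_t *l1 = calloc(ENTRIES_PER_TABLE, sizeof(*l1));
    unsigned int i;

    if (l1 == NULL)
        return NULL;
    for (i = 0; i < ENTRIES_PER_TABLE; i++)
        l1[i] = ((base_pfn + i) << PAGE_SHIFT_) | (flags & ~FLAG_PSE);
    return l1;
}

int main(void)
{
    uint64_t *l1 = shatter_superpage(0x40000, /*flags=*/0x83 /* P|RW|PSE */);

    if (l1 != NULL)
    {
        printf("entry[0]=%#llx entry[511]=%#llx\n",
               (unsigned long long)l1[0], (unsigned long long)l1[511]);
        free(l1);
    }
    return 0;
}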
3823 void __set_fixmap(
3824 enum fixed_addresses idx, unsigned long mfn, unsigned long flags)
3826 BUG_ON(idx >= __end_of_fixed_addresses);
3827 map_pages_to_xen(fix_to_virt(idx), mfn, 1, flags);
3830 #ifdef MEMORY_GUARD
3832 void memguard_init(void)
3834 map_pages_to_xen(
3835 (unsigned long)__va(xen_phys_start),
3836 xen_phys_start >> PAGE_SHIFT,
3837 (xenheap_phys_end - xen_phys_start) >> PAGE_SHIFT,
3838 __PAGE_HYPERVISOR|MAP_SMALL_PAGES);
3839 #ifdef __x86_64__
3840 map_pages_to_xen(
3841 XEN_VIRT_START,
3842 xen_phys_start >> PAGE_SHIFT,
3843 (__pa(&_end) + PAGE_SIZE - 1 - xen_phys_start) >> PAGE_SHIFT,
3844 __PAGE_HYPERVISOR|MAP_SMALL_PAGES);
3845 #endif
3848 static void __memguard_change_range(void *p, unsigned long l, int guard)
3850 unsigned long _p = (unsigned long)p;
3851 unsigned long _l = (unsigned long)l;
3852 unsigned int flags = __PAGE_HYPERVISOR | MAP_SMALL_PAGES;
3854 /* Ensure we are dealing with a page-aligned whole number of pages. */
3855 ASSERT((_p&~PAGE_MASK) == 0);
3856 ASSERT((_l&~PAGE_MASK) == 0);
3858 if ( guard )
3859 flags &= ~_PAGE_PRESENT;
3861 map_pages_to_xen(
3862 _p, virt_to_maddr(p) >> PAGE_SHIFT, _l >> PAGE_SHIFT, flags);
3865 void memguard_guard_range(void *p, unsigned long l)
3867 __memguard_change_range(p, l, 1);
3870 void memguard_unguard_range(void *p, unsigned long l)
3872 __memguard_change_range(p, l, 0);
3875 #endif
3877 void memguard_guard_stack(void *p)
3879 BUILD_BUG_ON((PRIMARY_STACK_SIZE + PAGE_SIZE) > STACK_SIZE);
3880 p = (void *)((unsigned long)p + STACK_SIZE -
3881 PRIMARY_STACK_SIZE - PAGE_SIZE);
3882 memguard_guard_range(p, PAGE_SIZE);
3885 /*
3886 * Local variables:
3887 * mode: C
3888 * c-set-style: "BSD"
3889 * c-basic-offset: 4
3890 * tab-width: 4
3891 * indent-tabs-mode: nil
3892 * End:
3893 */