ia64/xen-unstable

view xen/arch/x86/mm.c @ 19161:398291c661b3

x86: No need to flush TLBs on free_page_type() as we no longer trust
the linear pagetable mapping (we use it but we double check it).

Signed-off-by: Keir Fraser <keir.fraser@citrix.com>
author Keir Fraser <keir.fraser@citrix.com>
date Wed Feb 04 14:28:13 2009 +0000 (2009-02-04)
parents c3b5e36248c9
children 271697e6d9b2
1 /******************************************************************************
2 * arch/x86/mm.c
3 *
4 * Copyright (c) 2002-2005 K A Fraser
5 * Copyright (c) 2004 Christian Limpach
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
22 /*
23 * A description of the x86 page table API:
24 *
25 * Domains trap to do_mmu_update with a list of update requests.
26 * This is a list of (ptr, val) pairs, where the requested operation
27 * is *ptr = val.
28 *
29 * Reference counting of pages:
30 * ----------------------------
31 * Each page has two refcounts: tot_count and type_count.
32 *
33 * TOT_COUNT is the obvious reference count. It counts all uses of a
34 * physical page frame by a domain, including uses as a page directory,
35 * a page table, or simple mappings via a PTE. This count prevents a
36 * domain from releasing a frame back to the free pool when it still holds
37 * a reference to it.
38 *
39 * TYPE_COUNT is more subtle. A frame can be put to one of three
40 * mutually-exclusive uses: it might be used as a page directory, or a
41 * page table, or it may be mapped writable by the domain [of course, a
42 * frame may be used in none of these three ways!].
43 * So, type_count is a count of the number of times a frame is being
44 * referred to in its current incarnation. Therefore, a page can only
45 * change its type when its type count is zero.
46 *
47 * Pinning the page type:
48 * ----------------------
49 * The type of a page can be pinned/unpinned with the commands
50 * MMUEXT_[UN]PIN_L?_TABLE. Each page can be pinned exactly once (that is,
51 * pinning is not reference counted, so it can't be nested).
52 * This is useful to prevent a page's type count falling to zero, at which
53 * point safety checks would need to be carried out next time the count
54 * is increased again.
55 *
56 * A further note on writable page mappings:
57 * -----------------------------------------
58 * For simplicity, the count of writable mappings for a page may not
59 * correspond to reality. The 'writable count' is incremented for every
60 * PTE which maps the page with the _PAGE_RW flag set. However, for
61 * write access to be possible the page directory entry must also have
62 * its _PAGE_RW bit set. We do not check this as it complicates the
63 * reference counting considerably [consider the case of multiple
64 * directory entries referencing a single page table, some with the RW
65 * bit set, others not -- it starts getting a bit messy].
66 * In normal use, this simplification shouldn't be a problem.
67 * However, the logic can be added if required.
68 *
69 * One more note on read-only page mappings:
70 * -----------------------------------------
71 * We want domains to be able to map pages for read-only access. The
72 * main reason is that page tables and directories should be readable
73 * by a domain, but it would not be safe for them to be writable.
74 * However, domains have free access to rings 1 & 2 of the Intel
75 * privilege model. In terms of page protection, these are considered
76 * to be part of 'supervisor mode'. The WP bit in CR0 controls whether
77 * read-only restrictions are respected in supervisor mode -- if the
78 * bit is clear then any mapped page is writable.
79 *
80 * We get round this by always setting the WP bit and disallowing
81 * updates to it. This is very unlikely to cause a problem for guest
82 * OS's, which will generally use the WP bit to simplify copy-on-write
83 * implementation (in that case, the OS wants a fault when it writes to
84 * an application-supplied buffer).
85 */
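/*
 * [Editorial illustration -- not part of mm.c] A minimal guest-side sketch of
 * the (ptr, val) update interface and the pin command described above. It
 * assumes a PV guest environment providing the public interface definitions
 * (struct mmu_update, struct mmuext_op, MMU_NORMAL_PT_UPDATE,
 * MMUEXT_PIN_L1_TABLE, DOMID_SELF) and the usual HYPERVISOR_mmu_update() /
 * HYPERVISOR_mmuext_op() hypercall wrappers; the helper names are invented
 * for illustration only.
 */
#if 0
/* Request "*ptr = val" for one PTE whose machine address is pte_maddr.
 * Xen validates the new entry (type counts, disallowed flags) before
 * committing it; on failure the PTE is left unchanged. */
static int example_update_pte(uint64_t pte_maddr, uint64_t new_val)
{
    struct mmu_update req = {
        .ptr = pte_maddr | MMU_NORMAL_PT_UPDATE,
        .val = new_val,
    };
    return HYPERVISOR_mmu_update(&req, 1, NULL, DOMID_SELF);
}

/* Pin a frame as an L1 page table so that its type count cannot fall to
 * zero and the revalidation described above is avoided. */
static int example_pin_l1_table(unsigned long mfn)
{
    struct mmuext_op op = {
        .cmd = MMUEXT_PIN_L1_TABLE,
        .arg1.mfn = mfn,
    };
    return HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF);
}
#endif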
87 #include <xen/config.h>
88 #include <xen/init.h>
89 #include <xen/kernel.h>
90 #include <xen/lib.h>
91 #include <xen/mm.h>
92 #include <xen/domain.h>
93 #include <xen/sched.h>
94 #include <xen/errno.h>
95 #include <xen/perfc.h>
96 #include <xen/irq.h>
97 #include <xen/softirq.h>
98 #include <xen/domain_page.h>
99 #include <xen/event.h>
100 #include <xen/iocap.h>
101 #include <xen/guest_access.h>
102 #include <asm/paging.h>
103 #include <asm/shadow.h>
104 #include <asm/page.h>
105 #include <asm/flushtlb.h>
106 #include <asm/io.h>
107 #include <asm/ldt.h>
108 #include <asm/x86_emulate.h>
109 #include <asm/e820.h>
110 #include <asm/hypercall.h>
111 #include <asm/shared.h>
112 #include <public/memory.h>
113 #include <xsm/xsm.h>
114 #include <xen/trace.h>
116 /*
117 * Mapping of first 2 or 4 megabytes of memory. This is mapped with 4kB
118 * mappings to avoid type conflicts with fixed-range MTRRs covering the
119 * lowest megabyte of physical memory. In any case the VGA hole should be
120 * mapped with type UC.
121 */
122 l1_pgentry_t __attribute__ ((__section__ (".bss.page_aligned")))
123 l1_identmap[L1_PAGETABLE_ENTRIES];
125 #define MEM_LOG(_f, _a...) gdprintk(XENLOG_WARNING , _f "\n" , ## _a)
127 /*
128 * PTE updates can be done with ordinary writes except:
129 * 1. Debug builds get extra checking by using CMPXCHG[8B].
130 * 2. PAE builds perform an atomic 8-byte store with CMPXCHG8B.
131 */
132 #if !defined(NDEBUG) || defined(__i386__)
133 #define PTE_UPDATE_WITH_CMPXCHG
134 #endif
136 /* Used to defer flushing of memory structures. */
137 struct percpu_mm_info {
138 #define DOP_FLUSH_TLB (1<<0) /* Flush the local TLB. */
139 #define DOP_FLUSH_ALL_TLBS (1<<1) /* Flush TLBs of all VCPUs of current dom. */
140 #define DOP_RELOAD_LDT (1<<2) /* Reload the LDT shadow mapping. */
141 unsigned int deferred_ops;
142 /* If non-NULL, specifies a foreign subject domain for some operations. */
143 struct domain *foreign;
144 };
145 static DEFINE_PER_CPU(struct percpu_mm_info, percpu_mm_info);
147 /*
148 * Returns the current foreign domain; defaults to the currently-executing
149 * domain if a foreign override hasn't been specified.
150 */
151 #define FOREIGNDOM (this_cpu(percpu_mm_info).foreign ?: current->domain)
153 /* Private domain structs for DOMID_XEN and DOMID_IO. */
154 struct domain *dom_xen, *dom_io;
156 /* Frame table and its size in pages. */
157 struct page_info *frame_table;
158 unsigned long max_page;
159 unsigned long total_pages;
161 #define PAGE_CACHE_ATTRS (_PAGE_PAT|_PAGE_PCD|_PAGE_PWT)
163 int opt_allow_hugepage;
164 boolean_param("allowhugepage", opt_allow_hugepage);
166 #define l1_disallow_mask(d) \
167 ((d != dom_io) && \
168 (rangeset_is_empty((d)->iomem_caps) && \
169 rangeset_is_empty((d)->arch.ioport_caps) && \
170 !has_arch_pdevs(d)) ? \
171 L1_DISALLOW_MASK : (L1_DISALLOW_MASK & ~PAGE_CACHE_ATTRS))
173 #ifdef CONFIG_COMPAT
174 l2_pgentry_t *compat_idle_pg_table_l2 = NULL;
175 #define l3_disallow_mask(d) (!is_pv_32on64_domain(d) ? \
176 L3_DISALLOW_MASK : \
177 COMPAT_L3_DISALLOW_MASK)
178 #else
179 #define l3_disallow_mask(d) L3_DISALLOW_MASK
180 #endif
182 static void queue_deferred_ops(struct domain *d, unsigned int ops)
183 {
184 ASSERT(d == current->domain);
185 this_cpu(percpu_mm_info).deferred_ops |= ops;
186 }
188 void __init init_frametable(void)
189 {
190 unsigned long nr_pages, page_step, i, mfn;
192 frame_table = (struct page_info *)FRAMETABLE_VIRT_START;
194 nr_pages = PFN_UP(max_page * sizeof(*frame_table));
195 page_step = (1 << L2_PAGETABLE_SHIFT) >> PAGE_SHIFT;
197 for ( i = 0; i < nr_pages; i += page_step )
198 {
199 mfn = alloc_boot_pages(min(nr_pages - i, page_step), page_step);
200 if ( mfn == 0 )
201 panic("Not enough memory for frame table\n");
202 map_pages_to_xen(
203 FRAMETABLE_VIRT_START + (i << PAGE_SHIFT),
204 mfn, page_step, PAGE_HYPERVISOR);
205 }
207 memset(frame_table, 0, nr_pages << PAGE_SHIFT);
208 }
210 void __init arch_init_memory(void)
211 {
212 extern void subarch_init_memory(void);
214 unsigned long i, pfn, rstart_pfn, rend_pfn, iostart_pfn, ioend_pfn;
216 /*
217 * Initialise our DOMID_XEN domain.
218 * Any Xen-heap pages that we will allow to be mapped will have
219 * their domain field set to dom_xen.
220 */
221 dom_xen = domain_create(DOMID_XEN, DOMCRF_dummy, 0);
222 BUG_ON(dom_xen == NULL);
224 /*
225 * Initialise our DOMID_IO domain.
226 * This domain owns I/O pages that are within the range of the page_info
227 * array. Mappings occur at the privilege level of the caller.
228 */
229 dom_io = domain_create(DOMID_IO, DOMCRF_dummy, 0);
230 BUG_ON(dom_io == NULL);
232 /* First 1MB of RAM is historically marked as I/O. */
233 for ( i = 0; i < 0x100; i++ )
234 share_xen_page_with_guest(mfn_to_page(i), dom_io, XENSHARE_writable);
236 /* Any areas not specified as RAM by the e820 map are considered I/O. */
237 for ( i = 0, pfn = 0; pfn < max_page; i++ )
238 {
239 while ( (i < e820.nr_map) &&
240 (e820.map[i].type != E820_RAM) &&
241 (e820.map[i].type != E820_UNUSABLE) )
242 i++;
244 if ( i >= e820.nr_map )
245 {
246 /* No more RAM regions: mark as I/O right to end of memory map. */
247 rstart_pfn = rend_pfn = max_page;
248 }
249 else
250 {
251 /* Mark as I/O just up as far as next RAM region. */
252 rstart_pfn = min_t(unsigned long, max_page,
253 PFN_UP(e820.map[i].addr));
254 rend_pfn = max_t(unsigned long, rstart_pfn,
255 PFN_DOWN(e820.map[i].addr + e820.map[i].size));
256 }
258 /*
259 * Make sure any Xen mappings of RAM holes above 1MB are blown away.
260 * In particular this ensures that RAM holes are respected even in
261 * the statically-initialised 1-16MB mapping area.
262 */
263 iostart_pfn = max_t(unsigned long, pfn, 1UL << (20 - PAGE_SHIFT));
264 ioend_pfn = rstart_pfn;
265 #if defined(CONFIG_X86_32)
266 ioend_pfn = min_t(unsigned long, ioend_pfn,
267 DIRECTMAP_MBYTES << (20 - PAGE_SHIFT));
268 #endif
269 if ( iostart_pfn < ioend_pfn )
270 destroy_xen_mappings((unsigned long)mfn_to_virt(iostart_pfn),
271 (unsigned long)mfn_to_virt(ioend_pfn));
273 /* Mark as I/O up to next RAM region. */
274 for ( ; pfn < rstart_pfn; pfn++ )
275 {
276 BUG_ON(!mfn_valid(pfn));
277 share_xen_page_with_guest(
278 mfn_to_page(pfn), dom_io, XENSHARE_writable);
279 }
281 /* Skip the RAM region. */
282 pfn = rend_pfn;
283 }
285 subarch_init_memory();
286 }
288 int page_is_conventional_ram(unsigned long mfn)
289 {
290 uint64_t maddr = pfn_to_paddr(mfn);
291 int i;
293 for ( i = 0; i < e820.nr_map; i++ )
294 {
295 if ( (e820.map[i].type == E820_RAM) &&
296 (e820.map[i].addr <= maddr) &&
297 ((e820.map[i].addr + e820.map[i].size) >= (maddr + PAGE_SIZE)) )
298 return 1;
299 }
301 return 0;
302 }
304 unsigned long domain_get_maximum_gpfn(struct domain *d)
305 {
306 if ( is_hvm_domain(d) )
307 return d->arch.p2m->max_mapped_pfn;
308 /* NB. PV guests specify nr_pfns rather than max_pfn so we adjust here. */
309 return arch_get_max_pfn(d) - 1;
310 }
312 void share_xen_page_with_guest(
313 struct page_info *page, struct domain *d, int readonly)
314 {
315 if ( page_get_owner(page) == d )
316 return;
318 set_gpfn_from_mfn(page_to_mfn(page), INVALID_M2P_ENTRY);
320 spin_lock(&d->page_alloc_lock);
322 /* The incremented type count pins as writable or read-only. */
323 page->u.inuse.type_info = (readonly ? PGT_none : PGT_writable_page);
324 page->u.inuse.type_info |= PGT_validated | 1;
326 page_set_owner(page, d);
327 wmb(); /* install valid domain ptr before updating refcnt. */
328 ASSERT((page->count_info & ~PGC_xen_heap) == 0);
330 /* Only add to the allocation list if the domain isn't dying. */
331 if ( !d->is_dying )
332 {
333 page->count_info |= PGC_allocated | 1;
334 if ( unlikely(d->xenheap_pages++ == 0) )
335 get_knownalive_domain(d);
336 page_list_add_tail(page, &d->xenpage_list);
337 }
339 spin_unlock(&d->page_alloc_lock);
340 }
342 void share_xen_page_with_privileged_guests(
343 struct page_info *page, int readonly)
344 {
345 share_xen_page_with_guest(page, dom_xen, readonly);
346 }
348 #if defined(__i386__)
350 #ifdef NDEBUG
351 /* Only PDPTs above 4GB boundary need to be shadowed in low memory. */
352 #define l3tab_needs_shadow(mfn) ((mfn) >= 0x100000)
353 #else
354 /*
355 * In debug builds we shadow a selection of <4GB PDPTs to exercise code paths.
356 * We cannot safely shadow the idle page table, nor shadow (v1) page tables
357 * (detected by lack of an owning domain). As required for correctness, we
358 * always shadow PDPTs above 4GB.
359 */
360 #define l3tab_needs_shadow(mfn) \
361 (((((mfn) << PAGE_SHIFT) != __pa(idle_pg_table)) && \
362 (page_get_owner(mfn_to_page(mfn)) != NULL) && \
363 ((mfn) & 1)) || /* odd MFNs are shadowed */ \
364 ((mfn) >= 0x100000))
365 #endif
367 static l1_pgentry_t *fix_pae_highmem_pl1e;
369 /* Cache the address of PAE high-memory fixmap page tables. */
370 static int __init cache_pae_fixmap_address(void)
371 {
372 unsigned long fixmap_base = fix_to_virt(FIX_PAE_HIGHMEM_0);
373 l2_pgentry_t *pl2e = virt_to_xen_l2e(fixmap_base);
374 fix_pae_highmem_pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(fixmap_base);
375 return 0;
376 }
377 __initcall(cache_pae_fixmap_address);
379 static DEFINE_PER_CPU(u32, make_cr3_timestamp);
381 void make_cr3(struct vcpu *v, unsigned long mfn)
382 /* Takes the MFN of a PAE l3 table, copies the contents to below 4GB if
383 * necessary, and sets v->arch.cr3 to the value to load in CR3. */
384 {
385 l3_pgentry_t *highmem_l3tab, *lowmem_l3tab;
386 struct pae_l3_cache *cache = &v->arch.pae_l3_cache;
387 unsigned int cpu = smp_processor_id();
389 /* Fast path: does this mfn need a shadow at all? */
390 if ( !l3tab_needs_shadow(mfn) )
391 {
392 v->arch.cr3 = mfn << PAGE_SHIFT;
393 /* Cache is no longer in use or valid */
394 cache->high_mfn = 0;
395 return;
396 }
398 /* Caching logic is not interrupt safe. */
399 ASSERT(!in_irq());
401 /* Protects against pae_flush_pgd(). */
402 spin_lock(&cache->lock);
404 cache->inuse_idx ^= 1;
405 cache->high_mfn = mfn;
407 /* Map the guest L3 table and copy to the chosen low-memory cache. */
408 l1e_write(fix_pae_highmem_pl1e-cpu, l1e_from_pfn(mfn, __PAGE_HYPERVISOR));
409 /* First check the previous high mapping can't be in the TLB.
410 * (i.e. have we loaded CR3 since we last did this?) */
411 if ( unlikely(this_cpu(make_cr3_timestamp) == this_cpu(tlbflush_time)) )
412 flush_tlb_one_local(fix_to_virt(FIX_PAE_HIGHMEM_0 + cpu));
413 highmem_l3tab = (l3_pgentry_t *)fix_to_virt(FIX_PAE_HIGHMEM_0 + cpu);
414 lowmem_l3tab = cache->table[cache->inuse_idx];
415 memcpy(lowmem_l3tab, highmem_l3tab, sizeof(cache->table[0]));
416 l1e_write(fix_pae_highmem_pl1e-cpu, l1e_empty());
417 this_cpu(make_cr3_timestamp) = this_cpu(tlbflush_time);
419 v->arch.cr3 = __pa(lowmem_l3tab);
421 spin_unlock(&cache->lock);
422 }
424 #else /* !defined(__i386__) */
426 void make_cr3(struct vcpu *v, unsigned long mfn)
427 {
428 v->arch.cr3 = mfn << PAGE_SHIFT;
429 }
431 #endif /* !defined(__i386__) */
433 void write_ptbase(struct vcpu *v)
434 {
435 write_cr3(v->arch.cr3);
436 }
438 /*
439 * Should be called after CR3 is updated.
440 *
441 * Uses values found in vcpu->arch.(guest_table and guest_table_user), and
442 * for HVM guests, arch.monitor_table and hvm's guest CR3.
443 *
444 * Update ref counts to shadow tables appropriately.
445 */
446 void update_cr3(struct vcpu *v)
447 {
448 unsigned long cr3_mfn=0;
450 if ( paging_mode_enabled(v->domain) )
451 {
452 paging_update_cr3(v);
453 return;
454 }
456 #if CONFIG_PAGING_LEVELS == 4
457 if ( !(v->arch.flags & TF_kernel_mode) )
458 cr3_mfn = pagetable_get_pfn(v->arch.guest_table_user);
459 else
460 #endif
461 cr3_mfn = pagetable_get_pfn(v->arch.guest_table);
463 make_cr3(v, cr3_mfn);
464 }
467 static void invalidate_shadow_ldt(struct vcpu *v)
468 {
469 int i;
470 unsigned long pfn;
471 struct page_info *page;
473 if ( v->arch.shadow_ldt_mapcnt == 0 )
474 return;
476 v->arch.shadow_ldt_mapcnt = 0;
478 for ( i = 16; i < 32; i++ )
479 {
480 pfn = l1e_get_pfn(v->arch.perdomain_ptes[i]);
481 if ( pfn == 0 ) continue;
482 l1e_write(&v->arch.perdomain_ptes[i], l1e_empty());
483 page = mfn_to_page(pfn);
484 ASSERT_PAGE_IS_TYPE(page, PGT_seg_desc_page);
485 ASSERT_PAGE_IS_DOMAIN(page, v->domain);
486 put_page_and_type(page);
487 }
489 /* Dispose of the (now possibly invalid) mappings from the TLB. */
490 if ( v == current )
491 queue_deferred_ops(v->domain, DOP_FLUSH_TLB | DOP_RELOAD_LDT);
492 else
493 flush_tlb_mask(v->domain->domain_dirty_cpumask);
494 }
497 static int alloc_segdesc_page(struct page_info *page)
498 {
499 struct desc_struct *descs;
500 int i;
502 descs = map_domain_page(page_to_mfn(page));
504 for ( i = 0; i < 512; i++ )
505 if ( unlikely(!check_descriptor(page_get_owner(page), &descs[i])) )
506 goto fail;
508 unmap_domain_page(descs);
509 return 0;
511 fail:
512 unmap_domain_page(descs);
513 return -EINVAL;
514 }
517 /* Map shadow page at offset @off. */
518 int map_ldt_shadow_page(unsigned int off)
519 {
520 struct vcpu *v = current;
521 struct domain *d = v->domain;
522 unsigned long gmfn, mfn;
523 l1_pgentry_t l1e, nl1e;
524 unsigned long gva = v->arch.guest_context.ldt_base + (off << PAGE_SHIFT);
525 int okay;
527 BUG_ON(unlikely(in_irq()));
529 guest_get_eff_kern_l1e(v, gva, &l1e);
530 if ( unlikely(!(l1e_get_flags(l1e) & _PAGE_PRESENT)) )
531 return 0;
533 gmfn = l1e_get_pfn(l1e);
534 mfn = gmfn_to_mfn(d, gmfn);
535 if ( unlikely(!mfn_valid(mfn)) )
536 return 0;
538 okay = get_page_and_type(mfn_to_page(mfn), d, PGT_seg_desc_page);
539 if ( unlikely(!okay) )
540 return 0;
542 nl1e = l1e_from_pfn(mfn, l1e_get_flags(l1e) | _PAGE_RW);
544 l1e_write(&v->arch.perdomain_ptes[off + 16], nl1e);
545 v->arch.shadow_ldt_mapcnt++;
547 return 1;
548 }
551 static int get_page_from_pagenr(unsigned long page_nr, struct domain *d)
552 {
553 struct page_info *page = mfn_to_page(page_nr);
555 if ( unlikely(!mfn_valid(page_nr)) || unlikely(!get_page(page, d)) )
556 {
557 MEM_LOG("Could not get page ref for pfn %lx", page_nr);
558 return 0;
559 }
561 return 1;
562 }
565 static int get_page_and_type_from_pagenr(unsigned long page_nr,
566 unsigned long type,
567 struct domain *d,
568 int partial,
569 int preemptible)
570 {
571 struct page_info *page = mfn_to_page(page_nr);
572 int rc;
574 if ( likely(partial >= 0) &&
575 unlikely(!get_page_from_pagenr(page_nr, d)) )
576 return -EINVAL;
578 rc = (preemptible ?
579 get_page_type_preemptible(page, type) :
580 (get_page_type(page, type) ? 0 : -EINVAL));
582 if ( unlikely(rc) && partial >= 0 )
583 put_page(page);
585 return rc;
586 }
588 static int get_data_page(
589 struct page_info *page, struct domain *d, int writeable)
590 {
591 int rc;
593 if ( writeable )
594 rc = get_page_and_type(page, d, PGT_writable_page);
595 else
596 rc = get_page(page, d);
598 return rc;
599 }
601 static void put_data_page(
602 struct page_info *page, int writeable)
603 {
604 if ( writeable )
605 put_page_and_type(page);
606 else
607 put_page(page);
608 }
610 /*
611 * We allow root tables to map each other (a.k.a. linear page tables). It
612 * needs some special care with reference counts and access permissions:
613 * 1. The mapping entry must be read-only, or the guest may get write access
614 * to its own PTEs.
615 * 2. We must only bump the reference counts for an *already validated*
616 * L2 table, or we can end up in a deadlock in get_page_type() by waiting
617 * on a validation that is required to complete that validation.
618 * 3. We only need to increment the reference counts for the mapped page
619 * frame if it is mapped by a different root table. This is sufficient and
620 * also necessary to allow validation of a root table mapping itself.
621 */
622 #define define_get_linear_pagetable(level) \
623 static int \
624 get_##level##_linear_pagetable( \
625 level##_pgentry_t pde, unsigned long pde_pfn, struct domain *d) \
626 { \
627 unsigned long x, y; \
628 struct page_info *page; \
629 unsigned long pfn; \
630 \
631 if ( (level##e_get_flags(pde) & _PAGE_RW) ) \
632 { \
633 MEM_LOG("Attempt to create linear p.t. with write perms"); \
634 return 0; \
635 } \
636 \
637 if ( (pfn = level##e_get_pfn(pde)) != pde_pfn ) \
638 { \
639 /* Make sure the mapped frame belongs to the correct domain. */ \
640 if ( unlikely(!get_page_from_pagenr(pfn, d)) ) \
641 return 0; \
642 \
643 /* \
644 * Ensure that the mapped frame is an already-validated page table. \
645 * If so, atomically increment the count (checking for overflow). \
646 */ \
647 page = mfn_to_page(pfn); \
648 y = page->u.inuse.type_info; \
649 do { \
650 x = y; \
651 if ( unlikely((x & PGT_count_mask) == PGT_count_mask) || \
652 unlikely((x & (PGT_type_mask|PGT_validated)) != \
653 (PGT_##level##_page_table|PGT_validated)) ) \
654 { \
655 put_page(page); \
656 return 0; \
657 } \
658 } \
659 while ( (y = cmpxchg(&page->u.inuse.type_info, x, x + 1)) != x ); \
660 } \
661 \
662 return 1; \
663 }
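/*
 * [Editorial illustration -- not part of mm.c] Rule 1 of the linear-pagetable
 * comment above, seen from the guest side: a 64-bit PV guest installing a
 * linear ("self") mapping must submit the top-level entry without _PAGE_RW,
 * or get_l4_linear_pagetable() rejects the update. The slot choice, flag set
 * and hypercall wrapper are assumptions for illustration only.
 */
#if 0
static int example_install_linear_mapping(uint64_t l4_mfn, unsigned int slot)
{
    struct mmu_update req = {
        /* Machine address of the chosen slot within the L4 frame itself. */
        .ptr = (l4_mfn << PAGE_SHIFT) + slot * sizeof(uint64_t),
        /* The entry points back at the same L4 frame; note: no _PAGE_RW. */
        .val = (l4_mfn << PAGE_SHIFT) | _PAGE_PRESENT | _PAGE_ACCESSED,
    };
    return HYPERVISOR_mmu_update(&req, 1, NULL, DOMID_SELF);
}
#endif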
666 int is_iomem_page(unsigned long mfn)
667 {
668 return (!mfn_valid(mfn) || (page_get_owner(mfn_to_page(mfn)) == dom_io));
669 }
672 int
673 get_page_from_l1e(
674 l1_pgentry_t l1e, struct domain *d)
675 {
676 unsigned long mfn = l1e_get_pfn(l1e);
677 struct page_info *page = mfn_to_page(mfn);
678 uint32_t l1f = l1e_get_flags(l1e);
679 struct vcpu *curr = current;
680 struct domain *owner;
681 int okay;
683 if ( !(l1f & _PAGE_PRESENT) )
684 return 1;
686 if ( unlikely(l1f & l1_disallow_mask(d)) )
687 {
688 MEM_LOG("Bad L1 flags %x", l1f & l1_disallow_mask(d));
689 return 0;
690 }
692 if ( is_iomem_page(mfn) )
693 {
694 /* DOMID_IO reverts to caller for privilege checks. */
695 if ( d == dom_io )
696 d = curr->domain;
698 if ( !iomem_access_permitted(d, mfn, mfn) )
699 {
700 if ( mfn != (PADDR_MASK >> PAGE_SHIFT) ) /* INVALID_MFN? */
701 MEM_LOG("Non-privileged (%u) attempt to map I/O space %08lx",
702 d->domain_id, mfn);
703 return 0;
704 }
706 return 1;
707 }
709 /*
710 * Let privileged domains transfer the right to map their target
711 * domain's pages. This is used to allow stub-domain pvfb export to dom0,
712 * until pvfb supports granted mappings. At that time this minor hack
713 * can go away.
714 */
715 owner = page_get_owner(page);
716 if ( unlikely(d != owner) && (owner != NULL) &&
717 (d != curr->domain) && IS_PRIV_FOR(d, owner) )
718 d = owner;
720 /* Foreign mappings into guests in shadow external mode don't
721 * contribute to writeable mapping refcounts. (This allows the
722 * qemu-dm helper process in dom0 to map the domain's memory without
723 * messing up the count of "real" writable mappings.) */
724 okay = get_data_page(
725 page, d,
726 (l1f & _PAGE_RW) && !(paging_mode_external(d) && (d != curr->domain)));
727 if ( !okay )
728 {
729 MEM_LOG("Error getting mfn %lx (pfn %lx) from L1 entry %" PRIpte
730 " for dom%d",
731 mfn, get_gpfn_from_mfn(mfn),
732 l1e_get_intpte(l1e), d->domain_id);
733 }
734 else if ( pte_flags_to_cacheattr(l1f) !=
735 ((page->count_info >> PGC_cacheattr_base) & 7) )
736 {
737 unsigned long x, nx, y = page->count_info;
738 unsigned long cacheattr = pte_flags_to_cacheattr(l1f);
740 if ( is_xen_heap_page(page) )
741 {
742 if ( (l1f & _PAGE_RW) &&
743 !(unlikely(paging_mode_external(d) &&
744 (d != curr->domain))) )
745 put_page_type(page);
746 put_page(page);
747 MEM_LOG("Attempt to change cache attributes of Xen heap page");
748 return 0;
749 }
751 while ( ((y >> PGC_cacheattr_base) & 7) != cacheattr )
752 {
753 x = y;
754 nx = (x & ~PGC_cacheattr_mask) | (cacheattr << PGC_cacheattr_base);
755 y = cmpxchg(&page->count_info, x, nx);
756 }
758 #ifdef __x86_64__
759 map_pages_to_xen((unsigned long)mfn_to_virt(mfn), mfn, 1,
760 PAGE_HYPERVISOR | cacheattr_to_pte_flags(cacheattr));
761 #endif
762 }
764 return okay;
765 }
768 /* NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. */
769 define_get_linear_pagetable(l2);
770 static int
771 get_page_from_l2e(
772 l2_pgentry_t l2e, unsigned long pfn, struct domain *d)
773 {
774 unsigned long mfn = l2e_get_pfn(l2e);
775 int rc;
777 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
778 return 1;
780 if ( unlikely((l2e_get_flags(l2e) & L2_DISALLOW_MASK)) )
781 {
782 MEM_LOG("Bad L2 flags %x", l2e_get_flags(l2e) & L2_DISALLOW_MASK);
783 return -EINVAL;
784 }
786 if ( !(l2e_get_flags(l2e) & _PAGE_PSE) )
787 {
788 rc = get_page_and_type_from_pagenr(mfn, PGT_l1_page_table, d, 0, 0);
789 if ( unlikely(rc == -EINVAL) && get_l2_linear_pagetable(l2e, pfn, d) )
790 rc = 0;
791 }
792 else if ( !opt_allow_hugepage || (mfn & (L1_PAGETABLE_ENTRIES-1)) )
793 {
794 rc = -EINVAL;
795 }
796 else
797 {
798 unsigned long m = mfn;
799 int writeable = !!(l2e_get_flags(l2e) & _PAGE_RW);
801 do {
802 rc = get_data_page(mfn_to_page(m), d, writeable);
803 if ( unlikely(!rc) )
804 {
805 while ( m-- > mfn )
806 put_data_page(mfn_to_page(m), writeable);
807 return -EINVAL;
808 }
809 } while ( m++ < (mfn + (L1_PAGETABLE_ENTRIES-1)) );
811 #ifdef __x86_64__
812 map_pages_to_xen(
813 (unsigned long)mfn_to_virt(mfn), mfn, L1_PAGETABLE_ENTRIES,
814 PAGE_HYPERVISOR | l2e_get_flags(l2e));
815 #endif
816 }
818 return rc;
819 }
822 define_get_linear_pagetable(l3);
823 static int
824 get_page_from_l3e(
825 l3_pgentry_t l3e, unsigned long pfn, struct domain *d, int partial, int preemptible)
826 {
827 int rc;
829 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
830 return 1;
832 if ( unlikely((l3e_get_flags(l3e) & l3_disallow_mask(d))) )
833 {
834 MEM_LOG("Bad L3 flags %x", l3e_get_flags(l3e) & l3_disallow_mask(d));
835 return -EINVAL;
836 }
838 rc = get_page_and_type_from_pagenr(
839 l3e_get_pfn(l3e), PGT_l2_page_table, d, partial, preemptible);
840 if ( unlikely(rc == -EINVAL) && get_l3_linear_pagetable(l3e, pfn, d) )
841 rc = 0;
843 return rc;
844 }
846 #if CONFIG_PAGING_LEVELS >= 4
847 define_get_linear_pagetable(l4);
848 static int
849 get_page_from_l4e(
850 l4_pgentry_t l4e, unsigned long pfn, struct domain *d, int partial, int preemptible)
851 {
852 int rc;
854 if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
855 return 1;
857 if ( unlikely((l4e_get_flags(l4e) & L4_DISALLOW_MASK)) )
858 {
859 MEM_LOG("Bad L4 flags %x", l4e_get_flags(l4e) & L4_DISALLOW_MASK);
860 return -EINVAL;
861 }
863 rc = get_page_and_type_from_pagenr(
864 l4e_get_pfn(l4e), PGT_l3_page_table, d, partial, preemptible);
865 if ( unlikely(rc == -EINVAL) && get_l4_linear_pagetable(l4e, pfn, d) )
866 rc = 0;
868 return rc;
869 }
870 #endif /* 4 level */
872 #ifdef __x86_64__
874 #ifdef USER_MAPPINGS_ARE_GLOBAL
875 #define adjust_guest_l1e(pl1e, d) \
876 do { \
877 if ( likely(l1e_get_flags((pl1e)) & _PAGE_PRESENT) && \
878 likely(!is_pv_32on64_domain(d)) ) \
879 { \
880 /* _PAGE_GUEST_KERNEL page cannot have the Global bit set. */ \
881 if ( (l1e_get_flags((pl1e)) & (_PAGE_GUEST_KERNEL|_PAGE_GLOBAL)) \
882 == (_PAGE_GUEST_KERNEL|_PAGE_GLOBAL) ) \
883 MEM_LOG("Global bit is set to kernel page %lx", \
884 l1e_get_pfn((pl1e))); \
885 if ( !(l1e_get_flags((pl1e)) & _PAGE_USER) ) \
886 l1e_add_flags((pl1e), (_PAGE_GUEST_KERNEL|_PAGE_USER)); \
887 if ( !(l1e_get_flags((pl1e)) & _PAGE_GUEST_KERNEL) ) \
888 l1e_add_flags((pl1e), (_PAGE_GLOBAL|_PAGE_USER)); \
889 } \
890 } while ( 0 )
891 #else
892 #define adjust_guest_l1e(pl1e, d) \
893 do { \
894 if ( likely(l1e_get_flags((pl1e)) & _PAGE_PRESENT) && \
895 likely(!is_pv_32on64_domain(d)) ) \
896 l1e_add_flags((pl1e), _PAGE_USER); \
897 } while ( 0 )
898 #endif
900 #define adjust_guest_l2e(pl2e, d) \
901 do { \
902 if ( likely(l2e_get_flags((pl2e)) & _PAGE_PRESENT) && \
903 likely(!is_pv_32on64_domain(d)) ) \
904 l2e_add_flags((pl2e), _PAGE_USER); \
905 } while ( 0 )
907 #define adjust_guest_l3e(pl3e, d) \
908 do { \
909 if ( likely(l3e_get_flags((pl3e)) & _PAGE_PRESENT) ) \
910 l3e_add_flags((pl3e), likely(!is_pv_32on64_domain(d)) ? \
911 _PAGE_USER : \
912 _PAGE_USER|_PAGE_RW); \
913 } while ( 0 )
915 #define adjust_guest_l4e(pl4e, d) \
916 do { \
917 if ( likely(l4e_get_flags((pl4e)) & _PAGE_PRESENT) && \
918 likely(!is_pv_32on64_domain(d)) ) \
919 l4e_add_flags((pl4e), _PAGE_USER); \
920 } while ( 0 )
922 #else /* !defined(__x86_64__) */
924 #define adjust_guest_l1e(_p, _d) ((void)(_d))
925 #define adjust_guest_l2e(_p, _d) ((void)(_d))
926 #define adjust_guest_l3e(_p, _d) ((void)(_d))
928 #endif
930 #ifdef CONFIG_COMPAT
931 #define unadjust_guest_l3e(pl3e, d) \
932 do { \
933 if ( unlikely(is_pv_32on64_domain(d)) && \
934 likely(l3e_get_flags((pl3e)) & _PAGE_PRESENT) ) \
935 l3e_remove_flags((pl3e), _PAGE_USER|_PAGE_RW|_PAGE_ACCESSED); \
936 } while ( 0 )
937 #else
938 #define unadjust_guest_l3e(_p, _d) ((void)(_d))
939 #endif
941 void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d)
942 {
943 unsigned long pfn = l1e_get_pfn(l1e);
944 struct page_info *page;
945 struct domain *e;
946 struct vcpu *v;
948 if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) || is_iomem_page(pfn) )
949 return;
951 page = mfn_to_page(pfn);
953 e = page_get_owner(page);
955 /*
956 * Check if this is a mapping that was established via a grant reference.
957 * If it was then we should not be here: we require that such mappings are
958 * explicitly destroyed via the grant-table interface.
959 *
960 * The upshot of this is that the guest can end up with active grants that
961 * it cannot destroy (because it no longer has a PTE to present to the
962 * grant-table interface). This can lead to subtle hard-to-catch bugs,
963 * hence a special grant PTE flag can be enabled to catch the bug early.
964 *
965 * (Note that the undestroyable active grants are not a security hole in
966 * Xen. All active grants can safely be cleaned up when the domain dies.)
967 */
968 if ( (l1e_get_flags(l1e) & _PAGE_GNTTAB) &&
969 !d->is_shutting_down && !d->is_dying )
970 {
971 MEM_LOG("Attempt to implicitly unmap a granted PTE %" PRIpte,
972 l1e_get_intpte(l1e));
973 domain_crash(d);
974 }
976 /* Remember we didn't take a type-count of foreign writable mappings
977 * to paging-external domains */
978 if ( (l1e_get_flags(l1e) & _PAGE_RW) &&
979 !(unlikely((e != d) && paging_mode_external(e))) )
980 {
981 put_page_and_type(page);
982 }
983 else
984 {
985 /* We expect this is rare so we blow the entire shadow LDT. */
986 if ( unlikely(((page->u.inuse.type_info & PGT_type_mask) ==
987 PGT_seg_desc_page)) &&
988 unlikely(((page->u.inuse.type_info & PGT_count_mask) != 0)) &&
989 (d == e) )
990 {
991 for_each_vcpu ( d, v )
992 invalidate_shadow_ldt(v);
993 }
994 put_page(page);
995 }
996 }
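/*
 * [Editorial illustration -- not part of mm.c] As the comment in
 * put_page_from_l1e() explains, a mapping created through the grant-table
 * interface must also be torn down through it, not by overwriting the PTE.
 * A minimal guest-side sketch of the correct teardown, assuming the standard
 * grant-table definitions and the HYPERVISOR_grant_table_op() wrapper are
 * available; the variable names are illustrative only.
 */
#if 0
static int example_unmap_granted_page(uint64_t host_vaddr, grant_handle_t handle)
{
    struct gnttab_unmap_grant_ref unmap = {
        .host_addr    = host_vaddr, /* where the foreign frame was mapped */
        .dev_bus_addr = 0,          /* no device (IOMMU) mapping to undo */
        .handle       = handle,     /* returned by GNTTABOP_map_grant_ref */
    };
    int rc = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &unmap, 1);
    return rc ? rc : unmap.status;  /* GNTST_okay (0) on success */
}
#endif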
999 /*
1000 * NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'.
1001 * Note also that this automatically deals correctly with linear p.t.'s.
1002 */
1003 static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
1004 {
1005 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) || (l2e_get_pfn(l2e) == pfn) )
1006 return 1;
1008 if ( l2e_get_flags(l2e) & _PAGE_PSE )
1009 {
1010 unsigned long mfn = l2e_get_pfn(l2e), m = mfn;
1011 int writeable = l2e_get_flags(l2e) & _PAGE_RW;
1013 ASSERT(!(mfn & (L1_PAGETABLE_ENTRIES-1)));
1014 do {
1015 put_data_page(mfn_to_page(m), writeable);
1016 } while ( m++ < (mfn + (L1_PAGETABLE_ENTRIES-1)) );
1017 }
1018 else
1019 {
1020 put_page_and_type(l2e_get_page(l2e));
1021 }
1023 return 0;
1024 }
1026 static int __put_page_type(struct page_info *, int preemptible);
1028 static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn,
1029 int partial, int preemptible)
1030 {
1031 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || (l3e_get_pfn(l3e) == pfn) )
1032 return 1;
1034 #ifdef __x86_64__
1035 if ( unlikely(l3e_get_flags(l3e) & _PAGE_PSE) )
1036 {
1037 unsigned long mfn = l3e_get_pfn(l3e);
1038 int writeable = l3e_get_flags(l3e) & _PAGE_RW;
1040 ASSERT(!(mfn & ((1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1)));
1041 do {
1042 put_data_page(mfn_to_page(mfn), writeable);
1043 } while ( ++mfn & ((1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1) );
1045 return 0;
1046 }
1047 #endif
1049 if ( unlikely(partial > 0) )
1050 return __put_page_type(l3e_get_page(l3e), preemptible);
1052 return put_page_and_type_preemptible(l3e_get_page(l3e), preemptible);
1053 }
1055 #if CONFIG_PAGING_LEVELS >= 4
1056 static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn,
1057 int partial, int preemptible)
1058 {
1059 if ( (l4e_get_flags(l4e) & _PAGE_PRESENT) &&
1060 (l4e_get_pfn(l4e) != pfn) )
1061 {
1062 if ( unlikely(partial > 0) )
1063 return __put_page_type(l4e_get_page(l4e), preemptible);
1064 return put_page_and_type_preemptible(l4e_get_page(l4e), preemptible);
1065 }
1066 return 1;
1067 }
1068 #endif
1070 static int alloc_l1_table(struct page_info *page)
1071 {
1072 struct domain *d = page_get_owner(page);
1073 unsigned long pfn = page_to_mfn(page);
1074 l1_pgentry_t *pl1e;
1075 unsigned int i;
1077 pl1e = map_domain_page(pfn);
1079 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
1080 {
1081 if ( is_guest_l1_slot(i) &&
1082 unlikely(!get_page_from_l1e(pl1e[i], d)) )
1083 goto fail;
1085 adjust_guest_l1e(pl1e[i], d);
1086 }
1088 unmap_domain_page(pl1e);
1089 return 0;
1091 fail:
1092 MEM_LOG("Failure in alloc_l1_table: entry %d", i);
1093 while ( i-- > 0 )
1094 if ( is_guest_l1_slot(i) )
1095 put_page_from_l1e(pl1e[i], d);
1097 unmap_domain_page(pl1e);
1098 return -EINVAL;
1099 }
1101 static int create_pae_xen_mappings(struct domain *d, l3_pgentry_t *pl3e)
1103 struct page_info *page;
1104 l2_pgentry_t *pl2e;
1105 l3_pgentry_t l3e3;
1106 #ifndef CONFIG_COMPAT
1107 l2_pgentry_t l2e;
1108 int i;
1109 #endif
1111 if ( !is_pv_32bit_domain(d) )
1112 return 1;
1114 pl3e = (l3_pgentry_t *)((unsigned long)pl3e & PAGE_MASK);
1116 /* 3rd L3 slot contains L2 with Xen-private mappings. It *must* exist. */
1117 l3e3 = pl3e[3];
1118 if ( !(l3e_get_flags(l3e3) & _PAGE_PRESENT) )
1120 MEM_LOG("PAE L3 3rd slot is empty");
1121 return 0;
1124 /*
1125 * The Xen-private mappings include linear mappings. The L2 thus cannot
1126 * be shared by multiple L3 tables. The test here is adequate because:
1127 * 1. Cannot appear in slots != 3 because get_page_type() checks the
1128 * PGT_pae_xen_l2 flag, which is asserted iff the L2 appears in slot 3
1129 * 2. Cannot appear in another page table's L3:
1130 * a. alloc_l3_table() calls this function and this check will fail
1131 * b. mod_l3_entry() disallows updates to slot 3 in an existing table
1132 */
1133 page = l3e_get_page(l3e3);
1134 BUG_ON(page->u.inuse.type_info & PGT_pinned);
1135 BUG_ON((page->u.inuse.type_info & PGT_count_mask) == 0);
1136 BUG_ON(!(page->u.inuse.type_info & PGT_pae_xen_l2));
1137 if ( (page->u.inuse.type_info & PGT_count_mask) != 1 )
1139 MEM_LOG("PAE L3 3rd slot is shared");
1140 return 0;
1143 /* Xen private mappings. */
1144 pl2e = map_domain_page(l3e_get_pfn(l3e3));
1145 #ifndef CONFIG_COMPAT
1146 memcpy(&pl2e[L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1)],
1147 &idle_pg_table_l2[L2_PAGETABLE_FIRST_XEN_SLOT],
1148 L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
1149 for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
1151 l2e = l2e_from_page(
1152 virt_to_page(page_get_owner(page)->arch.mm_perdomain_pt) + i,
1153 __PAGE_HYPERVISOR);
1154 l2e_write(&pl2e[l2_table_offset(PERDOMAIN_VIRT_START) + i], l2e);
1156 for ( i = 0; i < (LINEARPT_MBYTES >> (L2_PAGETABLE_SHIFT - 20)); i++ )
1158 l2e = l2e_empty();
1159 if ( l3e_get_flags(pl3e[i]) & _PAGE_PRESENT )
1160 l2e = l2e_from_pfn(l3e_get_pfn(pl3e[i]), __PAGE_HYPERVISOR);
1161 l2e_write(&pl2e[l2_table_offset(LINEAR_PT_VIRT_START) + i], l2e);
1163 #else
1164 memcpy(&pl2e[COMPAT_L2_PAGETABLE_FIRST_XEN_SLOT(d)],
1165 &compat_idle_pg_table_l2[
1166 l2_table_offset(HIRO_COMPAT_MPT_VIRT_START)],
1167 COMPAT_L2_PAGETABLE_XEN_SLOTS(d) * sizeof(*pl2e));
1168 #endif
1169 unmap_domain_page(pl2e);
1171 return 1;
1174 #ifdef __i386__
1175 /* Flush a pgdir update into low-memory caches. */
1176 static void pae_flush_pgd(
1177 unsigned long mfn, unsigned int idx, l3_pgentry_t nl3e)
1179 struct domain *d = page_get_owner(mfn_to_page(mfn));
1180 struct vcpu *v;
1181 intpte_t _ol3e, _nl3e, _pl3e;
1182 l3_pgentry_t *l3tab_ptr;
1183 struct pae_l3_cache *cache;
1185 if ( unlikely(shadow_mode_enabled(d)) )
1187 cpumask_t m = CPU_MASK_NONE;
1188 /* Re-shadow this l3 table on any vcpus that are using it */
1189 for_each_vcpu ( d, v )
1190 if ( pagetable_get_pfn(v->arch.guest_table) == mfn )
1192 paging_update_cr3(v);
1193 cpus_or(m, m, v->vcpu_dirty_cpumask);
1195 flush_tlb_mask(m);
1198 /* If below 4GB then the pgdir is not shadowed in low memory. */
1199 if ( !l3tab_needs_shadow(mfn) )
1200 return;
1202 for_each_vcpu ( d, v )
1204 cache = &v->arch.pae_l3_cache;
1206 spin_lock(&cache->lock);
1208 if ( cache->high_mfn == mfn )
1210 l3tab_ptr = &cache->table[cache->inuse_idx][idx];
1211 _ol3e = l3e_get_intpte(*l3tab_ptr);
1212 _nl3e = l3e_get_intpte(nl3e);
1213 _pl3e = cmpxchg(&l3e_get_intpte(*l3tab_ptr), _ol3e, _nl3e);
1214 BUG_ON(_pl3e != _ol3e);
1217 spin_unlock(&cache->lock);
1220 flush_tlb_mask(d->domain_dirty_cpumask);
1222 #else
1223 # define pae_flush_pgd(mfn, idx, nl3e) ((void)0)
1224 #endif
1226 static int alloc_l2_table(struct page_info *page, unsigned long type,
1227 int preemptible)
1228 {
1229 struct domain *d = page_get_owner(page);
1230 unsigned long pfn = page_to_mfn(page);
1231 l2_pgentry_t *pl2e;
1232 unsigned int i;
1233 int rc = 0;
1235 pl2e = map_domain_page(pfn);
1237 for ( i = page->nr_validated_ptes; i < L2_PAGETABLE_ENTRIES; i++ )
1238 {
1239 if ( preemptible && i && hypercall_preempt_check() )
1240 {
1241 page->nr_validated_ptes = i;
1242 rc = -EAGAIN;
1243 break;
1244 }
1246 if ( !is_guest_l2_slot(d, type, i) ||
1247 (rc = get_page_from_l2e(pl2e[i], pfn, d)) > 0 )
1248 continue;
1250 if ( rc < 0 )
1251 {
1252 MEM_LOG("Failure in alloc_l2_table: entry %d", i);
1253 while ( i-- > 0 )
1254 if ( is_guest_l2_slot(d, type, i) )
1255 put_page_from_l2e(pl2e[i], pfn);
1256 break;
1257 }
1259 adjust_guest_l2e(pl2e[i], d);
1260 }
1262 unmap_domain_page(pl2e);
1263 return rc > 0 ? 0 : rc;
1264 }
1266 static int alloc_l3_table(struct page_info *page, int preemptible)
1268 struct domain *d = page_get_owner(page);
1269 unsigned long pfn = page_to_mfn(page);
1270 l3_pgentry_t *pl3e;
1271 unsigned int i;
1272 int rc = 0, partial = page->partial_pte;
1274 #if CONFIG_PAGING_LEVELS == 3
1275 /*
1276 * PAE pgdirs above 4GB are unacceptable if the guest does not understand
1277 * the weird 'extended cr3' format for dealing with high-order address
1278 * bits. We cut some slack for control tools (before vcpu0 is initialised).
1279 */
1280 if ( (pfn >= 0x100000) &&
1281 unlikely(!VM_ASSIST(d, VMASST_TYPE_pae_extended_cr3)) &&
1282 d->vcpu[0] && d->vcpu[0]->is_initialised )
1284 MEM_LOG("PAE pgd must be below 4GB (0x%lx >= 0x100000)", pfn);
1285 return -EINVAL;
1287 #endif
1289 pl3e = map_domain_page(pfn);
1291 /*
1292 * PAE guests allocate full pages, but aren't required to initialize
1293 * more than the first four entries; when running in compatibility
1294 * mode, however, the full page is visible to the MMU, and hence all
1295 * 512 entries must be valid/verified, which is most easily achieved
1296 * by clearing them out.
1297 */
1298 if ( is_pv_32on64_domain(d) )
1299 memset(pl3e + 4, 0, (L3_PAGETABLE_ENTRIES - 4) * sizeof(*pl3e));
1301 for ( i = page->nr_validated_ptes; i < L3_PAGETABLE_ENTRIES;
1302 i++, partial = 0 )
1304 if ( is_pv_32bit_domain(d) && (i == 3) )
1306 if ( !(l3e_get_flags(pl3e[i]) & _PAGE_PRESENT) ||
1307 (l3e_get_flags(pl3e[i]) & l3_disallow_mask(d)) )
1308 rc = -EINVAL;
1309 else
1310 rc = get_page_and_type_from_pagenr(l3e_get_pfn(pl3e[i]),
1311 PGT_l2_page_table |
1312 PGT_pae_xen_l2,
1313 d, partial, preemptible);
1315 else if ( !is_guest_l3_slot(i) ||
1316 (rc = get_page_from_l3e(pl3e[i], pfn, d,
1317 partial, preemptible)) > 0 )
1318 continue;
1320 if ( rc == -EAGAIN )
1322 page->nr_validated_ptes = i;
1323 page->partial_pte = partial ?: 1;
1325 else if ( rc == -EINTR && i )
1327 page->nr_validated_ptes = i;
1328 page->partial_pte = 0;
1329 rc = -EAGAIN;
1331 if ( rc < 0 )
1332 break;
1334 adjust_guest_l3e(pl3e[i], d);
1337 if ( rc >= 0 && !create_pae_xen_mappings(d, pl3e) )
1338 rc = -EINVAL;
1339 if ( rc < 0 && rc != -EAGAIN && rc != -EINTR )
1341 MEM_LOG("Failure in alloc_l3_table: entry %d", i);
1342 while ( i-- > 0 )
1344 if ( !is_guest_l3_slot(i) )
1345 continue;
1346 unadjust_guest_l3e(pl3e[i], d);
1347 put_page_from_l3e(pl3e[i], pfn, 0, 0);
1351 unmap_domain_page(pl3e);
1352 return rc > 0 ? 0 : rc;
1355 #if CONFIG_PAGING_LEVELS >= 4
1356 static int alloc_l4_table(struct page_info *page, int preemptible)
1358 struct domain *d = page_get_owner(page);
1359 unsigned long pfn = page_to_mfn(page);
1360 l4_pgentry_t *pl4e = page_to_virt(page);
1361 unsigned int i;
1362 int rc = 0, partial = page->partial_pte;
1364 for ( i = page->nr_validated_ptes; i < L4_PAGETABLE_ENTRIES;
1365 i++, partial = 0 )
1367 if ( !is_guest_l4_slot(d, i) ||
1368 (rc = get_page_from_l4e(pl4e[i], pfn, d,
1369 partial, preemptible)) > 0 )
1370 continue;
1372 if ( rc == -EAGAIN )
1374 page->nr_validated_ptes = i;
1375 page->partial_pte = partial ?: 1;
1377 else if ( rc == -EINTR )
1379 if ( i )
1381 page->nr_validated_ptes = i;
1382 page->partial_pte = 0;
1383 rc = -EAGAIN;
1386 else if ( rc < 0 )
1388 MEM_LOG("Failure in alloc_l4_table: entry %d", i);
1389 while ( i-- > 0 )
1390 if ( is_guest_l4_slot(d, i) )
1391 put_page_from_l4e(pl4e[i], pfn, 0, 0);
1393 if ( rc < 0 )
1394 return rc;
1396 adjust_guest_l4e(pl4e[i], d);
1399 /* Xen private mappings. */
1400 memcpy(&pl4e[ROOT_PAGETABLE_FIRST_XEN_SLOT],
1401 &idle_pg_table[ROOT_PAGETABLE_FIRST_XEN_SLOT],
1402 ROOT_PAGETABLE_XEN_SLOTS * sizeof(l4_pgentry_t));
1403 pl4e[l4_table_offset(LINEAR_PT_VIRT_START)] =
1404 l4e_from_pfn(pfn, __PAGE_HYPERVISOR);
1405 pl4e[l4_table_offset(PERDOMAIN_VIRT_START)] =
1406 l4e_from_page(virt_to_page(d->arch.mm_perdomain_l3),
1407 __PAGE_HYPERVISOR);
1409 return rc > 0 ? 0 : rc;
1411 #else
1412 #define alloc_l4_table(page, preemptible) (-EINVAL)
1413 #endif
1416 static void free_l1_table(struct page_info *page)
1417 {
1418 struct domain *d = page_get_owner(page);
1419 unsigned long pfn = page_to_mfn(page);
1420 l1_pgentry_t *pl1e;
1421 unsigned int i;
1423 pl1e = map_domain_page(pfn);
1425 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
1426 if ( is_guest_l1_slot(i) )
1427 put_page_from_l1e(pl1e[i], d);
1429 unmap_domain_page(pl1e);
1430 }
1433 static int free_l2_table(struct page_info *page, int preemptible)
1434 {
1435 #ifdef CONFIG_COMPAT
1436 struct domain *d = page_get_owner(page);
1437 #endif
1438 unsigned long pfn = page_to_mfn(page);
1439 l2_pgentry_t *pl2e;
1440 unsigned int i = page->nr_validated_ptes - 1;
1441 int err = 0;
1443 pl2e = map_domain_page(pfn);
1445 ASSERT(page->nr_validated_ptes);
1446 do {
1447 if ( is_guest_l2_slot(d, page->u.inuse.type_info, i) &&
1448 put_page_from_l2e(pl2e[i], pfn) == 0 &&
1449 preemptible && i && hypercall_preempt_check() )
1450 {
1451 page->nr_validated_ptes = i;
1452 err = -EAGAIN;
1453 }
1454 } while ( !err && i-- );
1456 unmap_domain_page(pl2e);
1458 if ( !err )
1459 page->u.inuse.type_info &= ~PGT_pae_xen_l2;
1461 return err;
1462 }
1464 static int free_l3_table(struct page_info *page, int preemptible)
1465 {
1466 struct domain *d = page_get_owner(page);
1467 unsigned long pfn = page_to_mfn(page);
1468 l3_pgentry_t *pl3e;
1469 int rc = 0, partial = page->partial_pte;
1470 unsigned int i = page->nr_validated_ptes - !partial;
1472 pl3e = map_domain_page(pfn);
1474 do {
1475 if ( is_guest_l3_slot(i) )
1476 {
1477 rc = put_page_from_l3e(pl3e[i], pfn, partial, preemptible);
1478 if ( rc < 0 )
1479 break;
1480 partial = 0;
1481 if ( rc > 0 )
1482 continue;
1483 unadjust_guest_l3e(pl3e[i], d);
1484 }
1485 } while ( i-- );
1487 unmap_domain_page(pl3e);
1489 if ( rc == -EAGAIN )
1490 {
1491 page->nr_validated_ptes = i;
1492 page->partial_pte = partial ?: -1;
1493 }
1494 else if ( rc == -EINTR && i < L3_PAGETABLE_ENTRIES - 1 )
1495 {
1496 page->nr_validated_ptes = i + 1;
1497 page->partial_pte = 0;
1498 rc = -EAGAIN;
1499 }
1500 return rc > 0 ? 0 : rc;
1501 }
1503 #if CONFIG_PAGING_LEVELS >= 4
1504 static int free_l4_table(struct page_info *page, int preemptible)
1505 {
1506 struct domain *d = page_get_owner(page);
1507 unsigned long pfn = page_to_mfn(page);
1508 l4_pgentry_t *pl4e = page_to_virt(page);
1509 int rc = 0, partial = page->partial_pte;
1510 unsigned int i = page->nr_validated_ptes - !partial;
1512 do {
1513 if ( is_guest_l4_slot(d, i) )
1514 rc = put_page_from_l4e(pl4e[i], pfn, partial, preemptible);
1515 if ( rc < 0 )
1516 break;
1517 partial = 0;
1518 } while ( i-- );
1520 if ( rc == -EAGAIN )
1521 {
1522 page->nr_validated_ptes = i;
1523 page->partial_pte = partial ?: -1;
1524 }
1525 else if ( rc == -EINTR && i < L4_PAGETABLE_ENTRIES - 1 )
1526 {
1527 page->nr_validated_ptes = i + 1;
1528 page->partial_pte = 0;
1529 rc = -EAGAIN;
1530 }
1531 return rc > 0 ? 0 : rc;
1532 }
1533 #else
1534 #define free_l4_table(page, preemptible) (-EINVAL)
1535 #endif
1537 static int page_lock(struct page_info *page)
1538 {
1539 unsigned long x, nx;
1541 do {
1542 while ( (x = page->u.inuse.type_info) & PGT_locked )
1543 cpu_relax();
1544 nx = x + (1 | PGT_locked);
1545 if ( !(x & PGT_validated) ||
1546 !(x & PGT_count_mask) ||
1547 !(nx & PGT_count_mask) )
1548 return 0;
1549 } while ( cmpxchg(&page->u.inuse.type_info, x, nx) != x );
1551 return 1;
1552 }
1554 static void page_unlock(struct page_info *page)
1555 {
1556 unsigned long x, nx, y = page->u.inuse.type_info;
1558 do {
1559 x = y;
1560 nx = x - (1 | PGT_locked);
1561 } while ( (y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x );
1562 }
1564 /* How to write an entry to the guest pagetables.
1565 * Returns 0 for failure (pointer not valid), 1 for success. */
1566 static inline int update_intpte(intpte_t *p,
1567 intpte_t old,
1568 intpte_t new,
1569 unsigned long mfn,
1570 struct vcpu *v,
1571 int preserve_ad)
1573 int rv = 1;
1574 #ifndef PTE_UPDATE_WITH_CMPXCHG
1575 if ( !preserve_ad )
1577 rv = paging_write_guest_entry(v, p, new, _mfn(mfn));
1579 else
1580 #endif
1582 intpte_t t = old;
1583 for ( ; ; )
1585 intpte_t _new = new;
1586 if ( preserve_ad )
1587 _new |= old & (_PAGE_ACCESSED | _PAGE_DIRTY);
1589 rv = paging_cmpxchg_guest_entry(v, p, &t, _new, _mfn(mfn));
1590 if ( unlikely(rv == 0) )
1592 MEM_LOG("Failed to update %" PRIpte " -> %" PRIpte
1593 ": saw %" PRIpte, old, _new, t);
1594 break;
1597 if ( t == old )
1598 break;
1600 /* Allowed to change in Accessed/Dirty flags only. */
1601 BUG_ON((t ^ old) & ~(intpte_t)(_PAGE_ACCESSED|_PAGE_DIRTY));
1603 old = t;
1606 return rv;
1609 /* Macro that wraps the appropriate type-changes around update_intpte().
1610 * Arguments are: type, ptr, old, new, mfn, vcpu */
1611 #define UPDATE_ENTRY(_t,_p,_o,_n,_m,_v,_ad) \
1612 update_intpte(&_t ## e_get_intpte(*(_p)), \
1613 _t ## e_get_intpte(_o), _t ## e_get_intpte(_n), \
1614 (_m), (_v), (_ad))
1616 /* Update the L1 entry at pl1e to new value nl1e. */
1617 static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e,
1618 unsigned long gl1mfn, int preserve_ad)
1620 l1_pgentry_t ol1e;
1621 struct vcpu *curr = current;
1622 struct domain *d = curr->domain;
1623 unsigned long mfn;
1624 p2m_type_t p2mt;
1625 int rc = 1;
1627 if ( unlikely(__copy_from_user(&ol1e, pl1e, sizeof(ol1e)) != 0) )
1628 return 0;
1630 if ( unlikely(paging_mode_refcounts(d)) )
1632 rc = UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, curr, preserve_ad);
1633 return rc;
1636 if ( l1e_get_flags(nl1e) & _PAGE_PRESENT )
1638 /* Translate foreign guest addresses. */
1639 mfn = mfn_x(gfn_to_mfn(FOREIGNDOM, l1e_get_pfn(nl1e), &p2mt));
1640 if ( !p2m_is_ram(p2mt) || unlikely(mfn == INVALID_MFN) )
1641 return 0;
1642 ASSERT((mfn & ~(PADDR_MASK >> PAGE_SHIFT)) == 0);
1643 nl1e = l1e_from_pfn(mfn, l1e_get_flags(nl1e));
1645 if ( unlikely(l1e_get_flags(nl1e) & l1_disallow_mask(d)) )
1647 MEM_LOG("Bad L1 flags %x",
1648 l1e_get_flags(nl1e) & l1_disallow_mask(d));
1649 return 0;
1652 /* Fast path for identical mapping, r/w and presence. */
1653 if ( !l1e_has_changed(ol1e, nl1e, _PAGE_RW | _PAGE_PRESENT) )
1655 adjust_guest_l1e(nl1e, d);
1656 rc = UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, curr,
1657 preserve_ad);
1658 return rc;
1661 if ( unlikely(!get_page_from_l1e(nl1e, FOREIGNDOM)) )
1662 return 0;
1664 adjust_guest_l1e(nl1e, d);
1665 if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, curr,
1666 preserve_ad)) )
1668 ol1e = nl1e;
1669 rc = 0;
1672 else if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, curr,
1673 preserve_ad)) )
1675 return 0;
1678 put_page_from_l1e(ol1e, d);
1679 return rc;
1683 /* Update the L2 entry at pl2e to new value nl2e. pl2e is within frame pfn. */
1684 static int mod_l2_entry(l2_pgentry_t *pl2e,
1685 l2_pgentry_t nl2e,
1686 unsigned long pfn,
1687 int preserve_ad)
1689 l2_pgentry_t ol2e;
1690 struct vcpu *curr = current;
1691 struct domain *d = curr->domain;
1692 struct page_info *l2pg = mfn_to_page(pfn);
1693 unsigned long type = l2pg->u.inuse.type_info;
1694 int rc = 1;
1696 if ( unlikely(!is_guest_l2_slot(d, type, pgentry_ptr_to_slot(pl2e))) )
1698 MEM_LOG("Illegal L2 update attempt in Xen-private area %p", pl2e);
1699 return 0;
1702 if ( unlikely(__copy_from_user(&ol2e, pl2e, sizeof(ol2e)) != 0) )
1703 return 0;
1705 if ( l2e_get_flags(nl2e) & _PAGE_PRESENT )
1707 if ( unlikely(l2e_get_flags(nl2e) & L2_DISALLOW_MASK) )
1709 MEM_LOG("Bad L2 flags %x",
1710 l2e_get_flags(nl2e) & L2_DISALLOW_MASK);
1711 return 0;
1714 /* Fast path for identical mapping and presence. */
1715 if ( !l2e_has_changed(ol2e, nl2e, _PAGE_PRESENT) )
1717 adjust_guest_l2e(nl2e, d);
1718 rc = UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, curr, preserve_ad);
1719 return rc;
1722 if ( unlikely(get_page_from_l2e(nl2e, pfn, d) < 0) )
1723 return 0;
1725 adjust_guest_l2e(nl2e, d);
1726 if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, curr,
1727 preserve_ad)) )
1729 ol2e = nl2e;
1730 rc = 0;
1733 else if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, curr,
1734 preserve_ad)) )
1736 return 0;
1739 put_page_from_l2e(ol2e, pfn);
1740 return rc;
1743 /* Update the L3 entry at pl3e to new value nl3e. pl3e is within frame pfn. */
1744 static int mod_l3_entry(l3_pgentry_t *pl3e,
1745 l3_pgentry_t nl3e,
1746 unsigned long pfn,
1747 int preserve_ad,
1748 int preemptible)
1750 l3_pgentry_t ol3e;
1751 struct vcpu *curr = current;
1752 struct domain *d = curr->domain;
1753 int rc = 0;
1755 if ( unlikely(!is_guest_l3_slot(pgentry_ptr_to_slot(pl3e))) )
1757 MEM_LOG("Illegal L3 update attempt in Xen-private area %p", pl3e);
1758 return -EINVAL;
1761 /*
1762 * Disallow updates to final L3 slot. It contains Xen mappings, and it
1763 * would be a pain to ensure they remain continuously valid throughout.
1764 */
1765 if ( is_pv_32bit_domain(d) && (pgentry_ptr_to_slot(pl3e) >= 3) )
1766 return -EINVAL;
1768 if ( unlikely(__copy_from_user(&ol3e, pl3e, sizeof(ol3e)) != 0) )
1769 return -EFAULT;
1771 if ( l3e_get_flags(nl3e) & _PAGE_PRESENT )
1773 if ( unlikely(l3e_get_flags(nl3e) & l3_disallow_mask(d)) )
1775 MEM_LOG("Bad L3 flags %x",
1776 l3e_get_flags(nl3e) & l3_disallow_mask(d));
1777 return -EINVAL;
1780 /* Fast path for identical mapping and presence. */
1781 if ( !l3e_has_changed(ol3e, nl3e, _PAGE_PRESENT) )
1783 adjust_guest_l3e(nl3e, d);
1784 rc = UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, curr, preserve_ad);
1785 return rc ? 0 : -EFAULT;
1788 rc = get_page_from_l3e(nl3e, pfn, d, 0, preemptible);
1789 if ( unlikely(rc < 0) )
1790 return rc;
1791 rc = 0;
1793 adjust_guest_l3e(nl3e, d);
1794 if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, curr,
1795 preserve_ad)) )
1797 ol3e = nl3e;
1798 rc = -EFAULT;
1801 else if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, curr,
1802 preserve_ad)) )
1804 return -EFAULT;
1807 if ( likely(rc == 0) )
1809 if ( !create_pae_xen_mappings(d, pl3e) )
1810 BUG();
1812 pae_flush_pgd(pfn, pgentry_ptr_to_slot(pl3e), nl3e);
1815 put_page_from_l3e(ol3e, pfn, 0, 0);
1816 return rc;
1819 #if CONFIG_PAGING_LEVELS >= 4
1821 /* Update the L4 entry at pl4e to new value nl4e. pl4e is within frame pfn. */
1822 static int mod_l4_entry(l4_pgentry_t *pl4e,
1823 l4_pgentry_t nl4e,
1824 unsigned long pfn,
1825 int preserve_ad,
1826 int preemptible)
1828 struct vcpu *curr = current;
1829 struct domain *d = curr->domain;
1830 l4_pgentry_t ol4e;
1831 int rc = 0;
1833 if ( unlikely(!is_guest_l4_slot(d, pgentry_ptr_to_slot(pl4e))) )
1835 MEM_LOG("Illegal L4 update attempt in Xen-private area %p", pl4e);
1836 return -EINVAL;
1839 if ( unlikely(__copy_from_user(&ol4e, pl4e, sizeof(ol4e)) != 0) )
1840 return -EFAULT;
1842 if ( l4e_get_flags(nl4e) & _PAGE_PRESENT )
1844 if ( unlikely(l4e_get_flags(nl4e) & L4_DISALLOW_MASK) )
1846 MEM_LOG("Bad L4 flags %x",
1847 l4e_get_flags(nl4e) & L4_DISALLOW_MASK);
1848 return -EINVAL;
1851 /* Fast path for identical mapping and presence. */
1852 if ( !l4e_has_changed(ol4e, nl4e, _PAGE_PRESENT) )
1854 adjust_guest_l4e(nl4e, d);
1855 rc = UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, curr, preserve_ad);
1856 return rc ? 0 : -EFAULT;
1859 rc = get_page_from_l4e(nl4e, pfn, d, 0, preemptible);
1860 if ( unlikely(rc < 0) )
1861 return rc;
1862 rc = 0;
1864 adjust_guest_l4e(nl4e, d);
1865 if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, curr,
1866 preserve_ad)) )
1868 ol4e = nl4e;
1869 rc = -EFAULT;
1872 else if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, curr,
1873 preserve_ad)) )
1875 return -EFAULT;
1878 put_page_from_l4e(ol4e, pfn, 0, 0);
1879 return rc;
1882 #endif
1884 void put_page(struct page_info *page)
1885 {
1886 unsigned long nx, x, y = page->count_info;
1888 do {
1889 x = y;
1890 nx = x - 1;
1891 }
1892 while ( unlikely((y = cmpxchg(&page->count_info, x, nx)) != x) );
1894 if ( unlikely((nx & PGC_count_mask) == 0) )
1895 {
1896 cleanup_page_cacheattr(page);
1897 free_domheap_page(page);
1898 }
1899 }
1902 int get_page(struct page_info *page, struct domain *domain)
1903 {
1904 unsigned long x, y = page->count_info;
1906 do {
1907 x = y;
1908 if ( unlikely((x & PGC_count_mask) == 0) || /* Not allocated? */
1909 /* Keep one spare reference to be acquired by get_page_light(). */
1910 unlikely(((x + 2) & PGC_count_mask) <= 1) ) /* Overflow? */
1911 goto fail;
1912 }
1913 while ( (y = cmpxchg(&page->count_info, x, x + 1)) != x );
1915 if ( likely(page_get_owner(page) == domain) )
1916 return 1;
1918 put_page(page);
1920 fail:
1921 if ( !_shadow_mode_refcounts(domain) && !domain->is_dying )
1922 gdprintk(XENLOG_INFO,
1923 "Error pfn %lx: rd=%p, od=%p, caf=%08lx, taf=%"
1924 PRtype_info "\n",
1925 page_to_mfn(page), domain, page_get_owner(page),
1926 y, page->u.inuse.type_info);
1927 return 0;
1928 }
1930 /*
1931 * Special version of get_page() to be used exclusively when
1932 * - a page is known to already have a non-zero reference count
1933 * - the page does not need its owner to be checked
1934 * - it will not be called more than once without dropping the thus
1935 * acquired reference again.
1936 * Due to get_page() reserving one reference, this call cannot fail.
1937 */
1938 static void get_page_light(struct page_info *page)
1939 {
1940 unsigned long x, nx, y = page->count_info;
1942 do {
1943 x = y;
1944 nx = x + 1;
1945 BUG_ON(!(x & PGC_count_mask)); /* Not allocated? */
1946 BUG_ON(!(nx & PGC_count_mask)); /* Overflow? */
1947 y = cmpxchg(&page->count_info, x, nx);
1948 }
1949 while ( unlikely(y != x) );
1950 }
1953 static int alloc_page_type(struct page_info *page, unsigned long type,
1954 int preemptible)
1956 struct domain *owner = page_get_owner(page);
1957 int rc;
1959 /* A page table is dirtied when its type count becomes non-zero. */
1960 if ( likely(owner != NULL) )
1961 paging_mark_dirty(owner, page_to_mfn(page));
1963 switch ( type & PGT_type_mask )
1965 case PGT_l1_page_table:
1966 rc = alloc_l1_table(page);
1967 break;
1968 case PGT_l2_page_table:
1969 rc = alloc_l2_table(page, type, preemptible);
1970 break;
1971 case PGT_l3_page_table:
1972 rc = alloc_l3_table(page, preemptible);
1973 break;
1974 case PGT_l4_page_table:
1975 rc = alloc_l4_table(page, preemptible);
1976 break;
1977 case PGT_seg_desc_page:
1978 rc = alloc_segdesc_page(page);
1979 break;
1980 default:
1981 printk("Bad type in alloc_page_type %lx t=%" PRtype_info " c=%lx\n",
1982 type, page->u.inuse.type_info,
1983 page->count_info);
1984 rc = -EINVAL;
1985 BUG();
1988 /* No need for atomic update of type_info here: noone else updates it. */
1989 wmb();
1990 if ( rc == -EAGAIN )
1992 get_page_light(page);
1993 page->u.inuse.type_info |= PGT_partial;
1995 else if ( rc == -EINTR )
1997 ASSERT((page->u.inuse.type_info &
1998 (PGT_count_mask|PGT_validated|PGT_partial)) == 1);
1999 page->u.inuse.type_info &= ~PGT_count_mask;
2001 else if ( rc )
2003 ASSERT(rc < 0);
2004 MEM_LOG("Error while validating mfn %lx (pfn %lx) for type %"
2005 PRtype_info ": caf=%08lx taf=%" PRtype_info,
2006 page_to_mfn(page), get_gpfn_from_mfn(page_to_mfn(page)),
2007 type, page->count_info, page->u.inuse.type_info);
2008 page->u.inuse.type_info = 0;
2010 else
2012 page->u.inuse.type_info |= PGT_validated;
2015 return rc;
2019 int free_page_type(struct page_info *page, unsigned long type,
2020 int preemptible)
2022 struct domain *owner = page_get_owner(page);
2023 unsigned long gmfn;
2024 int rc;
2026 if ( likely(owner != NULL) && unlikely(paging_mode_enabled(owner)) )
2028 /* A page table is dirtied when its type count becomes zero. */
2029 paging_mark_dirty(owner, page_to_mfn(page));
2031 if ( shadow_mode_refcounts(owner) )
2032 return 0;
2034 gmfn = mfn_to_gmfn(owner, page_to_mfn(page));
2035 ASSERT(VALID_M2P(gmfn));
2036 shadow_remove_all_shadows(owner->vcpu[0], _mfn(gmfn));
2039 if ( !(type & PGT_partial) )
2041 page->nr_validated_ptes = 1U << PAGETABLE_ORDER;
2042 page->partial_pte = 0;
2045 switch ( type & PGT_type_mask )
2047 case PGT_l1_page_table:
2048 free_l1_table(page);
2049 rc = 0;
2050 break;
2051 case PGT_l2_page_table:
2052 rc = free_l2_table(page, preemptible);
2053 break;
2054 case PGT_l3_page_table:
2055 #if CONFIG_PAGING_LEVELS == 3
2056 if ( !(type & PGT_partial) )
2057 page->nr_validated_ptes = L3_PAGETABLE_ENTRIES;
2058 #endif
2059 rc = free_l3_table(page, preemptible);
2060 break;
2061 case PGT_l4_page_table:
2062 rc = free_l4_table(page, preemptible);
2063 break;
2064 default:
2065 MEM_LOG("type %lx pfn %lx", type, page_to_mfn(page));
2066 rc = -EINVAL;
2067 BUG();
2070 return rc;
2074 static int __put_final_page_type(
2075 struct page_info *page, unsigned long type, int preemptible)
2077 int rc = free_page_type(page, type, preemptible);
2079 /* No need for atomic update of type_info here: no one else updates it. */
2080 if ( rc == 0 )
2082 /*
2083 * Record TLB information for flush later. We do not stamp page tables
2084 * when running in shadow mode:
2085 * 1. Pointless, since it's the shadow pt's which must be tracked.
2086 * 2. Shadow mode reuses this field for shadowed page tables to
2087 * store flags info -- we don't want to conflict with that.
2088 */
2089 if ( !(shadow_mode_enabled(page_get_owner(page)) &&
2090 (page->count_info & PGC_page_table)) )
2091 page->tlbflush_timestamp = tlbflush_current_time();
2092 wmb();
2093 page->u.inuse.type_info--;
2095 else if ( rc == -EINTR )
2097 ASSERT((page->u.inuse.type_info &
2098 (PGT_count_mask|PGT_validated|PGT_partial)) == 1);
2099 if ( !(shadow_mode_enabled(page_get_owner(page)) &&
2100 (page->count_info & PGC_page_table)) )
2101 page->tlbflush_timestamp = tlbflush_current_time();
2102 wmb();
2103 page->u.inuse.type_info |= PGT_validated;
2105 else
2107 BUG_ON(rc != -EAGAIN);
2108 wmb();
2109 get_page_light(page);
2110 page->u.inuse.type_info |= PGT_partial;
2113 return rc;
2117 static int __put_page_type(struct page_info *page,
2118 int preemptible)
2120 unsigned long nx, x, y = page->u.inuse.type_info;
2121 int rc = 0;
2123 for ( ; ; )
2125 x = y;
2126 nx = x - 1;
2128 ASSERT((x & PGT_count_mask) != 0);
2130 if ( unlikely((nx & PGT_count_mask) == 0) )
2132 if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
2133 likely(nx & (PGT_validated|PGT_partial)) )
2135 /*
2136 * Page-table pages must be unvalidated when their count drops to
2137 * zero. The 'free' is safe because the refcnt is still non-zero and
2138 * the validated bit is clear, so other operations will spin or fail.
2139 */
2140 nx = x & ~(PGT_validated|PGT_partial);
2141 if ( unlikely((y = cmpxchg(&page->u.inuse.type_info,
2142 x, nx)) != x) )
2143 continue;
2144 /* We cleared the 'valid bit' so we do the clean up. */
2145 rc = __put_final_page_type(page, x, preemptible);
2146 if ( x & PGT_partial )
2147 put_page(page);
2148 break;
2151 /*
2152 * Record TLB information for flush later. We do not stamp page
2153 * tables when running in shadow mode:
2154 * 1. Pointless, since it's the shadow pt's which must be tracked.
2155 * 2. Shadow mode reuses this field for shadowed page tables to
2156 * store flags info -- we don't want to conflict with that.
2157 */
2158 if ( !(shadow_mode_enabled(page_get_owner(page)) &&
2159 (page->count_info & PGC_page_table)) )
2160 page->tlbflush_timestamp = tlbflush_current_time();
2163 if ( likely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) == x) )
2164 break;
2166 if ( preemptible && hypercall_preempt_check() )
2167 return -EINTR;
2170 return rc;
2174 static int __get_page_type(struct page_info *page, unsigned long type,
2175 int preemptible)
2177 unsigned long nx, x, y = page->u.inuse.type_info;
2178 int rc = 0;
2180 ASSERT(!(type & ~(PGT_type_mask | PGT_pae_xen_l2)));
2182 for ( ; ; )
2184 x = y;
2185 nx = x + 1;
2186 if ( unlikely((nx & PGT_count_mask) == 0) )
2188 MEM_LOG("Type count overflow on pfn %lx", page_to_mfn(page));
2189 return -EINVAL;
2191 else if ( unlikely((x & PGT_count_mask) == 0) )
2193 struct domain *d = page_get_owner(page);
2195 /* Normally we should never let a page go from type count 0
2196 * to type count 1 when it is shadowed. One exception:
2197 * out-of-sync shadowed pages are allowed to become
2198 * writable. */
2199 if ( d && shadow_mode_enabled(d)
2200 && (page->count_info & PGC_page_table)
2201 && !((page->shadow_flags & (1u<<29))
2202 && type == PGT_writable_page) )
2203 shadow_remove_all_shadows(d->vcpu[0], _mfn(page_to_mfn(page)));
2205 ASSERT(!(x & PGT_pae_xen_l2));
2206 if ( (x & PGT_type_mask) != type )
2208 /*
2209 * On a type change we check whether any stale TLB entries need
2210 * flushing. This may be unnecessary (e.g., the page was a GDT/LDT)
2211 * but such circumstances should be very rare.
2212 */
2213 cpumask_t mask = d->domain_dirty_cpumask;
2215 /* Don't flush if the timestamp is old enough */
2216 tlbflush_filter(mask, page->tlbflush_timestamp);
2218 if ( unlikely(!cpus_empty(mask)) &&
2219 /* Shadow mode: track only writable pages. */
2220 (!shadow_mode_enabled(page_get_owner(page)) ||
2221 ((nx & PGT_type_mask) == PGT_writable_page)) )
2223 perfc_incr(need_flush_tlb_flush);
2224 flush_tlb_mask(mask);
2227 /* We lose existing type and validity. */
2228 nx &= ~(PGT_type_mask | PGT_validated);
2229 nx |= type;
2231 /* No special validation needed for writable pages. */
2232 /* Page tables and GDT/LDT need to be scanned for validity. */
2233 if ( type == PGT_writable_page )
2234 nx |= PGT_validated;
2237 else if ( unlikely((x & (PGT_type_mask|PGT_pae_xen_l2)) != type) )
2239 /* Don't log failure if it could be a recursive-mapping attempt. */
2240 if ( ((x & PGT_type_mask) == PGT_l2_page_table) &&
2241 (type == PGT_l1_page_table) )
2242 return -EINVAL;
2243 if ( ((x & PGT_type_mask) == PGT_l3_page_table) &&
2244 (type == PGT_l2_page_table) )
2245 return -EINVAL;
2246 if ( ((x & PGT_type_mask) == PGT_l4_page_table) &&
2247 (type == PGT_l3_page_table) )
2248 return -EINVAL;
2249 MEM_LOG("Bad type (saw %" PRtype_info " != exp %" PRtype_info ") "
2250 "for mfn %lx (pfn %lx)",
2251 x, type, page_to_mfn(page),
2252 get_gpfn_from_mfn(page_to_mfn(page)));
2253 return -EINVAL;
2255 else if ( unlikely(!(x & PGT_validated)) )
2257 if ( !(x & PGT_partial) )
2259 /* Someone else is updating validation of this page. Wait... */
2260 while ( (y = page->u.inuse.type_info) == x )
2262 if ( preemptible && hypercall_preempt_check() )
2263 return -EINTR;
2264 cpu_relax();
2266 continue;
2268 /* Type ref count was left at 1 when PGT_partial got set. */
2269 ASSERT((x & PGT_count_mask) == 1);
2270 nx = x & ~PGT_partial;
2273 if ( likely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) == x) )
2274 break;
2276 if ( preemptible && hypercall_preempt_check() )
2277 return -EINTR;
2280 if ( unlikely((x & PGT_type_mask) != type) )
2282 /* Special pages should not be accessible from devices. */
2283 struct domain *d = page_get_owner(page);
2284 if ( d && unlikely(need_iommu(d)) )
2286 if ( (x & PGT_type_mask) == PGT_writable_page )
2287 iommu_unmap_page(d, mfn_to_gmfn(d, page_to_mfn(page)));
2288 else if ( type == PGT_writable_page )
2289 iommu_map_page(d, mfn_to_gmfn(d, page_to_mfn(page)),
2290 page_to_mfn(page));
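/*
 * For domains with device passthrough this keeps the IOMMU in sync: a frame
 * is visible to devices only while it holds the PGT_writable_page type, so
 * gaining a page-table or descriptor-table type unmaps it and reverting to
 * plain writable data maps it back in.
 */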
2294 if ( unlikely(!(nx & PGT_validated)) )
2296 if ( !(x & PGT_partial) )
2298 page->nr_validated_ptes = 0;
2299 page->partial_pte = 0;
2301 rc = alloc_page_type(page, type, preemptible);
2304 if ( (x & PGT_partial) && !(nx & PGT_partial) )
2305 put_page(page);
2307 return rc;
2310 void put_page_type(struct page_info *page)
2312 int rc = __put_page_type(page, 0);
2313 ASSERT(rc == 0);
2314 (void)rc;
2317 int get_page_type(struct page_info *page, unsigned long type)
2319 int rc = __get_page_type(page, type, 0);
2320 if ( likely(rc == 0) )
2321 return 1;
2322 ASSERT(rc == -EINVAL);
2323 return 0;
2326 int put_page_type_preemptible(struct page_info *page)
2328 return __put_page_type(page, 1);
2331 int get_page_type_preemptible(struct page_info *page, unsigned long type)
2333 return __get_page_type(page, type, 1);
2336 void cleanup_page_cacheattr(struct page_info *page)
2338 uint32_t cacheattr = (page->count_info >> PGC_cacheattr_base) & 7;
2340 if ( likely(cacheattr == 0) )
2341 return;
2343 page->count_info &= ~PGC_cacheattr_mask;
2345 BUG_ON(is_xen_heap_page(page));
2347 #ifdef __x86_64__
2348 map_pages_to_xen((unsigned long)page_to_virt(page), page_to_mfn(page),
2349 1, PAGE_HYPERVISOR);
2350 #endif
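/*
 * Install a new guest CR3 (top-level page table).  A reference is taken on
 * the new base frame -- a full PGT_root_page_table validation, unless the
 * domain's pages are refcounted by the paging (shadow) code, in which case
 * only a plain reference is needed -- before the old base's references are
 * dropped, so the switch never leaves CR3 unreferenced.
 */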
2354 int new_guest_cr3(unsigned long mfn)
2356 struct vcpu *v = current;
2357 struct domain *d = v->domain;
2358 int okay;
2359 unsigned long old_base_mfn;
2361 #ifdef CONFIG_COMPAT
2362 if ( is_pv_32on64_domain(d) )
2364 okay = paging_mode_refcounts(d)
2365 ? 0 /* Old code was broken, but what should it be? */
2366 : mod_l4_entry(
2367 __va(pagetable_get_paddr(v->arch.guest_table)),
2368 l4e_from_pfn(
2369 mfn,
2370 (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_ACCESSED)),
2371 pagetable_get_pfn(v->arch.guest_table), 0, 0) == 0;
2372 if ( unlikely(!okay) )
2374 MEM_LOG("Error while installing new compat baseptr %lx", mfn);
2375 return 0;
2378 invalidate_shadow_ldt(v);
2379 write_ptbase(v);
2381 return 1;
2383 #endif
2384 okay = paging_mode_refcounts(d)
2385 ? get_page_from_pagenr(mfn, d)
2386 : !get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d, 0, 0);
2387 if ( unlikely(!okay) )
2389 MEM_LOG("Error while installing new baseptr %lx", mfn);
2390 return 0;
2393 invalidate_shadow_ldt(v);
2395 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
2397 v->arch.guest_table = pagetable_from_pfn(mfn);
2398 update_cr3(v);
2400 write_ptbase(v);
2402 if ( likely(old_base_mfn != 0) )
2404 if ( paging_mode_refcounts(d) )
2405 put_page(mfn_to_page(old_base_mfn));
2406 else
2407 put_page_and_type(mfn_to_page(old_base_mfn));
2410 return 1;
2413 static void process_deferred_ops(void)
2415 unsigned int deferred_ops;
2416 struct domain *d = current->domain;
2417 struct percpu_mm_info *info = &this_cpu(percpu_mm_info);
2419 deferred_ops = info->deferred_ops;
2420 info->deferred_ops = 0;
2422 if ( deferred_ops & (DOP_FLUSH_ALL_TLBS|DOP_FLUSH_TLB) )
2424 if ( deferred_ops & DOP_FLUSH_ALL_TLBS )
2425 flush_tlb_mask(d->domain_dirty_cpumask);
2426 else
2427 flush_tlb_local();
2430 if ( deferred_ops & DOP_RELOAD_LDT )
2431 (void)map_ldt_shadow_page(0);
2433 if ( unlikely(info->foreign != NULL) )
2435 rcu_unlock_domain(info->foreign);
2436 info->foreign = NULL;
2440 static int set_foreigndom(domid_t domid)
2442 struct domain *e, *d = current->domain;
2443 struct percpu_mm_info *info = &this_cpu(percpu_mm_info);
2444 int okay = 1;
2446 ASSERT(info->foreign == NULL);
2448 if ( likely(domid == DOMID_SELF) )
2449 goto out;
2451 if ( unlikely(domid == d->domain_id) )
2453 MEM_LOG("Cannot specify itself as foreign domain");
2454 okay = 0;
2456 else if ( unlikely(paging_mode_translate(d)) )
2458 MEM_LOG("Cannot mix foreign mappings with translated domains");
2459 okay = 0;
2461 else switch ( domid )
2463 case DOMID_IO:
2464 info->foreign = rcu_lock_domain(dom_io);
2465 break;
2466 case DOMID_XEN:
2467 if (!IS_PRIV(d)) {
2468 MEM_LOG("Cannot set foreign dom");
2469 okay = 0;
2470 break;
2472 info->foreign = rcu_lock_domain(dom_xen);
2473 break;
2474 default:
2475 if ( (e = rcu_lock_domain_by_id(domid)) == NULL )
2477 MEM_LOG("Unknown domain '%u'", domid);
2478 okay = 0;
2479 break;
2481 if ( !IS_PRIV_FOR(d, e) )
2483 MEM_LOG("Cannot set foreign dom");
2484 okay = 0;
2485 rcu_unlock_domain(e);
2486 break;
2488 info->foreign = e;
2489 break;
2492 out:
2493 return okay;
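/*
 * Turn a guest-supplied bitmap of vCPU IDs into the set of physical CPUs
 * those vCPUs may have run on recently (the union of their dirty cpumasks);
 * used by the TLB-flush-multi operations and do_update_va_mapping() below.
 */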
2496 static inline cpumask_t vcpumask_to_pcpumask(
2497 struct domain *d, unsigned long vmask)
2499 unsigned int vcpu_id;
2500 cpumask_t pmask = CPU_MASK_NONE;
2501 struct vcpu *v;
2503 /*
2504 * Callers copy only a single guest-sized longword from the guest.
2505 * This must be wide enough to reference all VCPUs. Worst case is 32 bits.
2506 */
2507 BUILD_BUG_ON(MAX_VIRT_CPUS > 32);
2509 while ( vmask != 0 )
2511 vcpu_id = find_first_set_bit(vmask);
2512 vmask &= ~(1UL << vcpu_id);
2513 if ( (vcpu_id < MAX_VIRT_CPUS) &&
2514 ((v = d->vcpu[vcpu_id]) != NULL) )
2515 cpus_or(pmask, pmask, v->vcpu_dirty_cpumask);
2518 return pmask;
2521 #ifdef __i386__
2522 static inline void *fixmap_domain_page(unsigned long mfn)
2524 unsigned int cpu = smp_processor_id();
2525 void *ptr = (void *)fix_to_virt(FIX_PAE_HIGHMEM_0 + cpu);
2527 l1e_write(fix_pae_highmem_pl1e - cpu,
2528 l1e_from_pfn(mfn, __PAGE_HYPERVISOR));
2529 flush_tlb_one_local(ptr);
2530 return ptr;
2532 static inline void fixunmap_domain_page(const void *ptr)
2534 unsigned int cpu = virt_to_fix((unsigned long)ptr) - FIX_PAE_HIGHMEM_0;
2536 l1e_write(fix_pae_highmem_pl1e - cpu, l1e_empty());
2537 this_cpu(make_cr3_timestamp) = this_cpu(tlbflush_time);
2539 #else
2540 #define fixmap_domain_page(mfn) mfn_to_virt(mfn)
2541 #define fixunmap_domain_page(ptr) ((void)(ptr))
2542 #endif
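/*
 * On 64-bit builds every machine frame is reachable through Xen's direct
 * mapping, so mfn_to_virt() is sufficient.  The 32-bit variant above instead
 * borrows a per-CPU fixmap slot, since an arbitrary domain frame (e.g. one
 * above 4GB under PAE) need not have a permanent mapping in the
 * hypervisor's address space.
 */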
2544 int do_mmuext_op(
2545 XEN_GUEST_HANDLE(mmuext_op_t) uops,
2546 unsigned int count,
2547 XEN_GUEST_HANDLE(uint) pdone,
2548 unsigned int foreigndom)
2550 struct mmuext_op op;
2551 int rc = 0, i = 0, okay;
2552 unsigned long mfn = 0, gmfn = 0, type;
2553 unsigned int done = 0;
2554 struct page_info *page;
2555 struct vcpu *v = current;
2556 struct domain *d = v->domain;
2558 if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
2560 count &= ~MMU_UPDATE_PREEMPTED;
2561 if ( unlikely(!guest_handle_is_null(pdone)) )
2562 (void)copy_from_guest(&done, pdone, 1);
2564 else
2565 perfc_incr(calls_to_mmuext_op);
2567 if ( unlikely(!guest_handle_okay(uops, count)) )
2569 rc = -EFAULT;
2570 goto out;
2573 if ( !set_foreigndom(foreigndom) )
2575 rc = -ESRCH;
2576 goto out;
2579 for ( i = 0; i < count; i++ )
2581 if ( hypercall_preempt_check() )
2583 rc = -EAGAIN;
2584 break;
2587 if ( unlikely(__copy_from_guest(&op, uops, 1) != 0) )
2589 MEM_LOG("Bad __copy_from_guest");
2590 rc = -EFAULT;
2591 break;
2594 okay = 1;
2595 gmfn = op.arg1.mfn;
2596 mfn = gmfn_to_mfn(FOREIGNDOM, gmfn);
2597 page = mfn_to_page(mfn);
2599 switch ( op.cmd )
2601 case MMUEXT_PIN_L1_TABLE:
2602 type = PGT_l1_page_table;
2603 goto pin_page;
2605 case MMUEXT_PIN_L2_TABLE:
2606 type = PGT_l2_page_table;
2607 goto pin_page;
2609 case MMUEXT_PIN_L3_TABLE:
2610 type = PGT_l3_page_table;
2611 goto pin_page;
2613 case MMUEXT_PIN_L4_TABLE:
2614 if ( is_pv_32bit_domain(FOREIGNDOM) )
2615 break;
2616 type = PGT_l4_page_table;
2618 pin_page:
2619 rc = xsm_memory_pin_page(d, page);
2620 if ( rc )
2621 break;
2623 /* Ignore pinning of invalid paging levels. */
2624 if ( (op.cmd - MMUEXT_PIN_L1_TABLE) > (CONFIG_PAGING_LEVELS - 1) )
2625 break;
2627 if ( paging_mode_refcounts(FOREIGNDOM) )
2628 break;
2630 rc = get_page_and_type_from_pagenr(mfn, type, FOREIGNDOM, 0, 1);
2631 okay = !rc;
2632 if ( unlikely(!okay) )
2634 if ( rc == -EINTR )
2635 rc = -EAGAIN;
2636 else if ( rc != -EAGAIN )
2637 MEM_LOG("Error while pinning mfn %lx", mfn);
2638 break;
2641 if ( unlikely(test_and_set_bit(_PGT_pinned,
2642 &page->u.inuse.type_info)) )
2644 MEM_LOG("Mfn %lx already pinned", mfn);
2645 put_page_and_type(page);
2646 okay = 0;
2647 break;
2650 /* A page is dirtied when its pin status is set. */
2651 paging_mark_dirty(d, mfn);
2653 /* We can race domain destruction (domain_relinquish_resources). */
2654 if ( unlikely(this_cpu(percpu_mm_info).foreign != NULL) )
2656 int drop_ref;
2657 spin_lock(&FOREIGNDOM->page_alloc_lock);
2658 drop_ref = (FOREIGNDOM->is_dying &&
2659 test_and_clear_bit(_PGT_pinned,
2660 &page->u.inuse.type_info));
2661 spin_unlock(&FOREIGNDOM->page_alloc_lock);
2662 if ( drop_ref )
2663 put_page_and_type(page);
2666 break;
2668 case MMUEXT_UNPIN_TABLE:
2669 if ( paging_mode_refcounts(d) )
2670 break;
2672 if ( unlikely(!(okay = get_page_from_pagenr(mfn, d))) )
2674 MEM_LOG("Mfn %lx bad domain (dom=%p)",
2675 mfn, page_get_owner(page));
2677 else if ( likely(test_and_clear_bit(_PGT_pinned,
2678 &page->u.inuse.type_info)) )
2680 put_page_and_type(page);
2681 put_page(page);
2682 if ( !rc )
2684 /* A page is dirtied when its pin status is cleared. */
2685 paging_mark_dirty(d, mfn);
2688 else
2690 okay = 0;
2691 put_page(page);
2692 MEM_LOG("Mfn %lx not pinned", mfn);
2694 break;
2696 case MMUEXT_NEW_BASEPTR:
2697 okay = new_guest_cr3(mfn);
2698 this_cpu(percpu_mm_info).deferred_ops &= ~DOP_FLUSH_TLB;
2699 break;
2701 #ifdef __x86_64__
2702 case MMUEXT_NEW_USER_BASEPTR: {
2703 unsigned long old_mfn;
2705 if ( mfn != 0 )
2707 if ( paging_mode_refcounts(d) )
2708 okay = get_page_from_pagenr(mfn, d);
2709 else
2710 okay = !get_page_and_type_from_pagenr(
2711 mfn, PGT_root_page_table, d, 0, 0);
2712 if ( unlikely(!okay) )
2714 MEM_LOG("Error while installing new mfn %lx", mfn);
2715 break;
2719 old_mfn = pagetable_get_pfn(v->arch.guest_table_user);
2720 v->arch.guest_table_user = pagetable_from_pfn(mfn);
2722 if ( old_mfn != 0 )
2724 if ( paging_mode_refcounts(d) )
2725 put_page(mfn_to_page(old_mfn));
2726 else
2727 put_page_and_type(mfn_to_page(old_mfn));
2730 break;
2732 #endif
2734 case MMUEXT_TLB_FLUSH_LOCAL:
2735 this_cpu(percpu_mm_info).deferred_ops |= DOP_FLUSH_TLB;
2736 break;
2738 case MMUEXT_INVLPG_LOCAL:
2739 if ( !paging_mode_enabled(d)
2740 || paging_invlpg(v, op.arg1.linear_addr) != 0 )
2741 flush_tlb_one_local(op.arg1.linear_addr);
2742 break;
2744 case MMUEXT_TLB_FLUSH_MULTI:
2745 case MMUEXT_INVLPG_MULTI:
2747 unsigned long vmask;
2748 cpumask_t pmask;
2749 if ( unlikely(copy_from_guest(&vmask, op.arg2.vcpumask, 1)) )
2751 okay = 0;
2752 break;
2754 pmask = vcpumask_to_pcpumask(d, vmask);
2755 if ( op.cmd == MMUEXT_TLB_FLUSH_MULTI )
2756 flush_tlb_mask(pmask);
2757 else
2758 flush_tlb_one_mask(pmask, op.arg1.linear_addr);
2759 break;
2762 case MMUEXT_TLB_FLUSH_ALL:
2763 this_cpu(percpu_mm_info).deferred_ops |= DOP_FLUSH_ALL_TLBS;
2764 break;
2766 case MMUEXT_INVLPG_ALL:
2767 flush_tlb_one_mask(d->domain_dirty_cpumask, op.arg1.linear_addr);
2768 break;
2770 case MMUEXT_FLUSH_CACHE:
2771 if ( unlikely(!cache_flush_permitted(d)) )
2773 MEM_LOG("Non-physdev domain tried to FLUSH_CACHE.");
2774 okay = 0;
2776 else
2778 wbinvd();
2780 break;
2782 case MMUEXT_SET_LDT:
2784 unsigned long ptr = op.arg1.linear_addr;
2785 unsigned long ents = op.arg2.nr_ents;
2787 if ( paging_mode_external(d) )
2789 MEM_LOG("ignoring SET_LDT hypercall from external domain");
2790 okay = 0;
2792 else if ( ((ptr & (PAGE_SIZE-1)) != 0) ||
2793 (ents > 8192) ||
2794 !array_access_ok(ptr, ents, LDT_ENTRY_SIZE) )
2796 okay = 0;
2797 MEM_LOG("Bad args to SET_LDT: ptr=%lx, ents=%lx", ptr, ents);
2799 else if ( (v->arch.guest_context.ldt_ents != ents) ||
2800 (v->arch.guest_context.ldt_base != ptr) )
2802 invalidate_shadow_ldt(v);
2803 v->arch.guest_context.ldt_base = ptr;
2804 v->arch.guest_context.ldt_ents = ents;
2805 load_LDT(v);
2806 this_cpu(percpu_mm_info).deferred_ops &= ~DOP_RELOAD_LDT;
2807 if ( ents != 0 )
2808 this_cpu(percpu_mm_info).deferred_ops |= DOP_RELOAD_LDT;
2810 break;
2813 case MMUEXT_CLEAR_PAGE:
2815 unsigned char *ptr;
2817 okay = !get_page_and_type_from_pagenr(mfn, PGT_writable_page,
2818 FOREIGNDOM, 0, 0);
2819 if ( unlikely(!okay) )
2821 MEM_LOG("Error while clearing mfn %lx", mfn);
2822 break;
2825 /* A page is dirtied when it's being cleared. */
2826 paging_mark_dirty(d, mfn);
2828 ptr = fixmap_domain_page(mfn);
2829 clear_page(ptr);
2830 fixunmap_domain_page(ptr);
2832 put_page_and_type(page);
2833 break;
2836 case MMUEXT_COPY_PAGE:
2838 const unsigned char *src;
2839 unsigned char *dst;
2840 unsigned long src_mfn;
2842 src_mfn = gmfn_to_mfn(FOREIGNDOM, op.arg2.src_mfn);
2843 okay = get_page_from_pagenr(src_mfn, FOREIGNDOM);
2844 if ( unlikely(!okay) )
2846 MEM_LOG("Error while copying from mfn %lx", src_mfn);
2847 break;
2850 okay = !get_page_and_type_from_pagenr(mfn, PGT_writable_page,
2851 FOREIGNDOM, 0, 0);
2852 if ( unlikely(!okay) )
2854 put_page(mfn_to_page(src_mfn));
2855 MEM_LOG("Error while copying to mfn %lx", mfn);
2856 break;
2859 /* A page is dirtied when it's being copied to. */
2860 paging_mark_dirty(d, mfn);
2862 src = map_domain_page(src_mfn);
2863 dst = fixmap_domain_page(mfn);
2864 copy_page(dst, src);
2865 fixunmap_domain_page(dst);
2866 unmap_domain_page(src);
2868 put_page_and_type(page);
2869 put_page(mfn_to_page(src_mfn));
2870 break;
2873 default:
2874 MEM_LOG("Invalid extended pt command 0x%x", op.cmd);
2875 rc = -ENOSYS;
2876 okay = 0;
2877 break;
2880 if ( unlikely(!okay) )
2882 rc = rc ? rc : -EINVAL;
2883 break;
2886 guest_handle_add_offset(uops, 1);
2889 if ( rc == -EAGAIN )
2890 rc = hypercall_create_continuation(
2891 __HYPERVISOR_mmuext_op, "hihi",
2892 uops, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
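/*
 * Preempted: re-encode the remaining work as a continuation.  'count - i'
 * operations are still to do, and MMU_UPDATE_PREEMPTED tells the re-entered
 * hypercall to pick up the running total already written to @pdone (see the
 * top of this function).
 */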
2894 process_deferred_ops();
2896 perfc_add(num_mmuext_ops, i);
2898 out:
2899 /* Add incremental work we have done to the @done output parameter. */
2900 if ( unlikely(!guest_handle_is_null(pdone)) )
2902 done += i;
2903 copy_to_guest(pdone, &done, 1);
2906 return rc;
2909 int do_mmu_update(
2910 XEN_GUEST_HANDLE(mmu_update_t) ureqs,
2911 unsigned int count,
2912 XEN_GUEST_HANDLE(uint) pdone,
2913 unsigned int foreigndom)
2915 struct mmu_update req;
2916 void *va;
2917 unsigned long gpfn, gmfn, mfn;
2918 struct page_info *page;
2919 int rc = 0, okay = 1, i = 0;
2920 unsigned int cmd, done = 0;
2921 struct vcpu *v = current;
2922 struct domain *d = v->domain;
2923 struct domain_mmap_cache mapcache;
2925 if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
2927 count &= ~MMU_UPDATE_PREEMPTED;
2928 if ( unlikely(!guest_handle_is_null(pdone)) )
2929 (void)copy_from_guest(&done, pdone, 1);
2931 else
2932 perfc_incr(calls_to_mmu_update);
2934 if ( unlikely(!guest_handle_okay(ureqs, count)) )
2936 rc = -EFAULT;
2937 goto out;
2940 if ( !set_foreigndom(foreigndom) )
2942 rc = -ESRCH;
2943 goto out;
2946 domain_mmap_cache_init(&mapcache);
2948 for ( i = 0; i < count; i++ )
2950 if ( hypercall_preempt_check() )
2952 rc = -EAGAIN;
2953 break;
2956 if ( unlikely(__copy_from_guest(&req, ureqs, 1) != 0) )
2958 MEM_LOG("Bad __copy_from_guest");
2959 rc = -EFAULT;
2960 break;
2963 cmd = req.ptr & (sizeof(l1_pgentry_t)-1);
2964 okay = 0;
2966 switch ( cmd )
2968 /*
2969 * MMU_NORMAL_PT_UPDATE: Normal update to any level of page table.
2970 * MMU_PT_UPDATE_PRESERVE_AD: As above but also preserve (OR)
2971 * current A/D bits.
2972 */
2973 case MMU_NORMAL_PT_UPDATE:
2974 case MMU_PT_UPDATE_PRESERVE_AD:
2975 rc = xsm_mmu_normal_update(d, FOREIGNDOM, req.val);
2976 if ( rc )
2977 break;
2979 req.ptr -= cmd;
2980 gmfn = req.ptr >> PAGE_SHIFT;
2981 mfn = gmfn_to_mfn(d, gmfn);
2983 if ( unlikely(!get_page_from_pagenr(mfn, d)) )
2985 MEM_LOG("Could not get page for normal update");
2986 break;
2989 va = map_domain_page_with_cache(mfn, &mapcache);
2990 va = (void *)((unsigned long)va +
2991 (unsigned long)(req.ptr & ~PAGE_MASK));
2992 page = mfn_to_page(mfn);
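/*
 * Two update paths: if the frame can be page_lock()ed we dispatch on its
 * current type -- validated page tables go through the mod_lN_entry()
 * helpers, plain writable data through paging_write_guest_entry().  If the
 * lock cannot be taken we instead try a PGT_writable_page type reference
 * and perform the same data write.
 */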
2994 if ( page_lock(page) )
2996 switch ( page->u.inuse.type_info & PGT_type_mask )
2998 case PGT_l1_page_table:
3000 l1_pgentry_t l1e = l1e_from_intpte(req.val);
3001 okay = mod_l1_entry(va, l1e, mfn,
3002 cmd == MMU_PT_UPDATE_PRESERVE_AD);
3004 break;
3005 case PGT_l2_page_table:
3007 l2_pgentry_t l2e = l2e_from_intpte(req.val);
3008 okay = mod_l2_entry(va, l2e, mfn,
3009 cmd == MMU_PT_UPDATE_PRESERVE_AD);
3011 break;
3012 case PGT_l3_page_table:
3014 l3_pgentry_t l3e = l3e_from_intpte(req.val);
3015 rc = mod_l3_entry(va, l3e, mfn,
3016 cmd == MMU_PT_UPDATE_PRESERVE_AD, 1);
3017 okay = !rc;
3019 break;
3020 #if CONFIG_PAGING_LEVELS >= 4
3021 case PGT_l4_page_table:
3023 l4_pgentry_t l4e = l4e_from_intpte(req.val);
3024 rc = mod_l4_entry(va, l4e, mfn,
3025 cmd == MMU_PT_UPDATE_PRESERVE_AD, 1);
3026 okay = !rc;
3028 break;
3029 #endif
3030 case PGT_writable_page:
3031 perfc_incr(writable_mmu_updates);
3032 okay = paging_write_guest_entry(v, va, req.val, _mfn(mfn));
3033 break;
3035 page_unlock(page);
3036 if ( rc == -EINTR )
3037 rc = -EAGAIN;
3039 else if ( get_page_type(page, PGT_writable_page) )
3041 perfc_incr(writable_mmu_updates);
3042 okay = paging_write_guest_entry(v, va, req.val, _mfn(mfn));
3043 put_page_type(page);
3046 unmap_domain_page_with_cache(va, &mapcache);
3047 put_page(page);
3048 break;
3050 case MMU_MACHPHYS_UPDATE:
3052 mfn = req.ptr >> PAGE_SHIFT;
3053 gpfn = req.val;
3055 rc = xsm_mmu_machphys_update(d, mfn);
3056 if ( rc )
3057 break;
3059 if ( unlikely(!get_page_from_pagenr(mfn, FOREIGNDOM)) )
3061 MEM_LOG("Could not get page for mach->phys update");
3062 break;
3065 if ( unlikely(paging_mode_translate(FOREIGNDOM)) )
3067 MEM_LOG("Mach-phys update on auto-translate guest");
3068 break;
3071 set_gpfn_from_mfn(mfn, gpfn);
3072 okay = 1;
3074 paging_mark_dirty(FOREIGNDOM, mfn);
3076 put_page(mfn_to_page(mfn));
3077 break;
3079 default:
3080 MEM_LOG("Invalid page update command %x", cmd);
3081 rc = -ENOSYS;
3082 okay = 0;
3083 break;
3086 if ( unlikely(!okay) )
3088 rc = rc ? rc : -EINVAL;
3089 break;
3092 guest_handle_add_offset(ureqs, 1);
3095 if ( rc == -EAGAIN )
3096 rc = hypercall_create_continuation(
3097 __HYPERVISOR_mmu_update, "hihi",
3098 ureqs, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
3100 process_deferred_ops();
3102 domain_mmap_cache_destroy(&mapcache);
3104 perfc_add(num_page_updates, i);
3106 out:
3107 /* Add incremental work we have done to the @done output parameter. */
3108 if ( unlikely(!guest_handle_is_null(pdone)) )
3110 done += i;
3111 copy_to_guest(pdone, &done, 1);
3114 return rc;
3118 static int create_grant_pte_mapping(
3119 uint64_t pte_addr, l1_pgentry_t nl1e, struct vcpu *v)
3121 int rc = GNTST_okay;
3122 void *va;
3123 unsigned long gmfn, mfn;
3124 struct page_info *page;
3125 l1_pgentry_t ol1e;
3126 struct domain *d = v->domain;
3128 ASSERT(domain_is_locked(d));
3130 adjust_guest_l1e(nl1e, d);
3132 gmfn = pte_addr >> PAGE_SHIFT;
3133 mfn = gmfn_to_mfn(d, gmfn);
3135 if ( unlikely(!get_page_from_pagenr(mfn, current->domain)) )
3137 MEM_LOG("Could not get page for normal update");
3138 return GNTST_general_error;
3141 va = map_domain_page(mfn);
3142 va = (void *)((unsigned long)va + ((unsigned long)pte_addr & ~PAGE_MASK));
3143 page = mfn_to_page(mfn);
3145 if ( !page_lock(page) )
3147 rc = GNTST_general_error;
3148 goto failed;
3151 if ( (page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table )
3153 page_unlock(page);
3154 rc = GNTST_general_error;
3155 goto failed;
3158 ol1e = *(l1_pgentry_t *)va;
3159 if ( !UPDATE_ENTRY(l1, (l1_pgentry_t *)va, ol1e, nl1e, mfn, v, 0) )
3161 page_unlock(page);
3162 rc = GNTST_general_error;
3163 goto failed;
3166 page_unlock(page);
3168 if ( !paging_mode_refcounts(d) )
3169 put_page_from_l1e(ol1e, d);
3171 failed:
3172 unmap_domain_page(va);
3173 put_page(page);
3175 return rc;
3178 static int destroy_grant_pte_mapping(
3179 uint64_t addr, unsigned long frame, struct domain *d)
3181 int rc = GNTST_okay;
3182 void *va;
3183 unsigned long gmfn, mfn;
3184 struct page_info *page;
3185 l1_pgentry_t ol1e;
3187 gmfn = addr >> PAGE_SHIFT;
3188 mfn = gmfn_to_mfn(d, gmfn);
3190 if ( unlikely(!get_page_from_pagenr(mfn, current->domain)) )
3192 MEM_LOG("Could not get page for normal update");
3193 return GNTST_general_error;
3196 va = map_domain_page(mfn);
3197 va = (void *)((unsigned long)va + ((unsigned long)addr & ~PAGE_MASK));
3198 page = mfn_to_page(mfn);
3200 if ( !page_lock(page) )
3202 rc = GNTST_general_error;
3203 goto failed;
3206 if ( (page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table )
3208 page_unlock(page);
3209 rc = GNTST_general_error;
3210 goto failed;
3213 ol1e = *(l1_pgentry_t *)va;
3215 /* Check that the virtual address supplied is actually mapped to frame. */
3216 if ( unlikely(l1e_get_pfn(ol1e) != frame) )
3218 page_unlock(page);
3219 MEM_LOG("PTE entry %lx for address %"PRIx64" doesn't match frame %lx",
3220 (unsigned long)l1e_get_intpte(ol1e), addr, frame);
3221 rc = GNTST_general_error;
3222 goto failed;
3225 /* Delete pagetable entry. */
3226 if ( unlikely(!UPDATE_ENTRY
3227 (l1,
3228 (l1_pgentry_t *)va, ol1e, l1e_empty(), mfn,
3229 d->vcpu[0] /* Change if we go to per-vcpu shadows. */,
3230 0)) )
3232 page_unlock(page);
3233 MEM_LOG("Cannot delete PTE entry at %p", va);
3234 rc = GNTST_general_error;
3235 goto failed;
3238 page_unlock(page);
3240 failed:
3241 unmap_domain_page(va);
3242 put_page(page);
3243 return rc;
3247 static int create_grant_va_mapping(
3248 unsigned long va, l1_pgentry_t nl1e, struct vcpu *v)
3250 l1_pgentry_t *pl1e, ol1e;
3251 struct domain *d = v->domain;
3252 unsigned long gl1mfn;
3253 struct page_info *l1pg;
3254 int okay;
3256 ASSERT(domain_is_locked(d));
3258 adjust_guest_l1e(nl1e, d);
3260 pl1e = guest_map_l1e(v, va, &gl1mfn);
3261 if ( !pl1e )
3263 MEM_LOG("Could not find L1 PTE for address %lx", va);
3264 return GNTST_general_error;
3267 if ( !get_page_from_pagenr(gl1mfn, current->domain) )
3269 guest_unmap_l1e(v, pl1e);
3270 return GNTST_general_error;
3273 l1pg = mfn_to_page(gl1mfn);
3274 if ( !page_lock(l1pg) )
3276 put_page(l1pg);
3277 guest_unmap_l1e(v, pl1e);
3278 return GNTST_general_error;
3281 if ( (l1pg->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table )
3283 page_unlock(l1pg);
3284 put_page(l1pg);
3285 guest_unmap_l1e(v, pl1e);
3286 return GNTST_general_error;
3289 ol1e = *pl1e;
3290 okay = UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, v, 0);
3292 page_unlock(l1pg);
3293 put_page(l1pg);
3294 guest_unmap_l1e(v, pl1e);
3296 if ( okay && !paging_mode_refcounts(d) )
3297 put_page_from_l1e(ol1e, d);
3299 return okay ? GNTST_okay : GNTST_general_error;
3302 static int replace_grant_va_mapping(
3303 unsigned long addr, unsigned long frame, l1_pgentry_t nl1e, struct vcpu *v)
3305 l1_pgentry_t *pl1e, ol1e;
3306 unsigned long gl1mfn;
3307 struct page_info *l1pg;
3308 int rc = 0;
3310 pl1e = guest_map_l1e(v, addr, &gl1mfn);
3311 if ( !pl1e )
3313 MEM_LOG("Could not find L1 PTE for address %lx", addr);
3314 return GNTST_general_error;
3317 if ( !get_page_from_pagenr(gl1mfn, current->domain) )
3319 rc = GNTST_general_error;
3320 goto out;
3323 l1pg = mfn_to_page(gl1mfn);
3324 if ( !page_lock(l1pg) )
3326 rc = GNTST_general_error;
3327 put_page(l1pg);
3328 goto out;
3331 if ( (l1pg->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table )
3333 rc = GNTST_general_error;
3334 goto unlock_and_out;
3337 ol1e = *pl1e;
3339 /* Check that the virtual address supplied is actually mapped to frame. */
3340 if ( unlikely(l1e_get_pfn(ol1e) != frame) )
3342 MEM_LOG("PTE entry %lx for address %lx doesn't match frame %lx",
3343 l1e_get_pfn(ol1e), addr, frame);
3344 rc = GNTST_general_error;
3345 goto unlock_and_out;
3348 /* Delete pagetable entry. */
3349 if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, v, 0)) )
3351 MEM_LOG("Cannot delete PTE entry at %p", (unsigned long *)pl1e);
3352 rc = GNTST_general_error;
3353 goto unlock_and_out;
3356 unlock_and_out:
3357 page_unlock(l1pg);
3358 put_page(l1pg);
3359 out:
3360 guest_unmap_l1e(v, pl1e);
3361 return rc;
3364 static int destroy_grant_va_mapping(
3365 unsigned long addr, unsigned long frame, struct vcpu *v)
3367 return replace_grant_va_mapping(addr, frame, l1e_empty(), v);
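/*
 * For the grant-mapping operations below, @addr is interpreted two ways:
 * with GNTMAP_contains_pte it is the guest-physical address of the PTE to
 * rewrite (the *_grant_pte_mapping() helpers); otherwise it is a guest
 * virtual address (the *_grant_va_mapping() helpers).
 */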
3370 int create_grant_host_mapping(uint64_t addr, unsigned long frame,
3371 unsigned int flags, unsigned int cache_flags)
3373 l1_pgentry_t pte = l1e_from_pfn(frame, GRANT_PTE_FLAGS);
3375 if ( (flags & GNTMAP_application_map) )
3376 l1e_add_flags(pte, _PAGE_USER);
3377 if ( !(flags & GNTMAP_readonly) )
3378 l1e_add_flags(pte, _PAGE_RW);
3380 l1e_add_flags(pte,
3381 ((flags >> _GNTMAP_guest_avail0) * _PAGE_AVAIL0)
3382 & _PAGE_AVAIL);
3384 l1e_add_flags(pte, cacheattr_to_pte_flags(cache_flags >> 5));
3386 if ( flags & GNTMAP_contains_pte )
3387 return create_grant_pte_mapping(addr, pte, current);
3388 return create_grant_va_mapping(addr, pte, current);
3391 int replace_grant_host_mapping(
3392 uint64_t addr, unsigned long frame, uint64_t new_addr, unsigned int flags)
3394 struct vcpu *curr = current;
3395 l1_pgentry_t *pl1e, ol1e;
3396 unsigned long gl1mfn;
3397 struct page_info *l1pg;
3398 int rc;
3400 if ( flags & GNTMAP_contains_pte )
3402 if ( !new_addr )
3403 return destroy_grant_pte_mapping(addr, frame, curr->domain);
3405 MEM_LOG("Unsupported grant table operation");
3406 return GNTST_general_error;
3409 if ( !new_addr )
3410 return destroy_grant_va_mapping(addr, frame, curr);
3412 pl1e = guest_map_l1e(curr, new_addr, &gl1mfn);
3413 if ( !pl1e )
3415 MEM_LOG("Could not find L1 PTE for address %lx",
3416 (unsigned long)new_addr);
3417 return GNTST_general_error;
3420 if ( !get_page_from_pagenr(gl1mfn, current->domain) )
3422 guest_unmap_l1e(curr, pl1e);
3423 return GNTST_general_error;
3426 l1pg = mfn_to_page(gl1mfn);
3427 if ( !page_lock(l1pg) )
3429 put_page(l1pg);
3430 guest_unmap_l1e(curr, pl1e);
3431 return GNTST_general_error;
3434 if ( (l1pg->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table )
3436 page_unlock(l1pg);
3437 put_page(l1pg);
3438 guest_unmap_l1e(curr, pl1e);
3439 return GNTST_general_error;
3442 ol1e = *pl1e;
3444 if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, l1e_empty(),
3445 gl1mfn, curr, 0)) )
3447 page_unlock(l1pg);
3448 put_page(l1pg);
3449 MEM_LOG("Cannot delete PTE entry at %p", (unsigned long *)pl1e);
3450 guest_unmap_l1e(curr, pl1e);
3451 return GNTST_general_error;
3454 page_unlock(l1pg);
3455 put_page(l1pg);
3456 guest_unmap_l1e(curr, pl1e);
3458 rc = replace_grant_va_mapping(addr, frame, ol1e, curr);
3459 if ( rc && !paging_mode_refcounts(curr->domain) )
3460 put_page_from_l1e(ol1e, curr->domain);
3462 return rc;
3465 int steal_page(
3466 struct domain *d, struct page_info *page, unsigned int memflags)
3468 unsigned long x, y;
3470 spin_lock(&d->page_alloc_lock);
3472 if ( is_xen_heap_page(page) || (page_get_owner(page) != d) )
3473 goto fail;
3475 /*
3476 * We require there is just one reference (PGC_allocated). We temporarily
3477 * drop this reference now so that we can safely swizzle the owner.
3478 */
3479 y = page->count_info;
3480 do {
3481 x = y;
3482 if ( (x & (PGC_count_mask|PGC_allocated)) != (1 | PGC_allocated) )
3483 goto fail;
3484 y = cmpxchg(&page->count_info, x, x & ~PGC_count_mask);
3485 } while ( y != x );
3487 /* Swizzle the owner then reinstate the PGC_allocated reference. */
3488 page_set_owner(page, NULL);
3489 y = page->count_info;
3490 do {
3491 x = y;
3492 BUG_ON((x & (PGC_count_mask|PGC_allocated)) != PGC_allocated);
3493 } while ( (y = cmpxchg(&page->count_info, x, x | 1)) != x );
3495 /* Unlink from original owner. */
3496 if ( !(memflags & MEMF_no_refcount) )
3497 d->tot_pages--;
3498 page_list_del(page, &d->page_list);
3500 spin_unlock(&d->page_alloc_lock);
3501 return 0;
3503 fail:
3504 spin_unlock(&d->page_alloc_lock);
3505 MEM_LOG("Bad page %p: ed=%p(%u), sd=%p, caf=%08lx, taf=%" PRtype_info,
3506 (void *)page_to_mfn(page), d, d->domain_id,
3507 page_get_owner(page), page->count_info, page->u.inuse.type_info);
3508 return -1;
3511 int do_update_va_mapping(unsigned long va, u64 val64,
3512 unsigned long flags)
3514 l1_pgentry_t val = l1e_from_intpte(val64);
3515 struct vcpu *v = current;
3516 struct domain *d = v->domain;
3517 struct page_info *gl1pg;
3518 l1_pgentry_t *pl1e;
3519 unsigned long vmask, bmap_ptr, gl1mfn;
3520 cpumask_t pmask;
3521 int rc;
3523 perfc_incr(calls_to_update_va);
3525 rc = xsm_update_va_mapping(d, FOREIGNDOM, val);
3526 if ( rc )
3527 return rc;
3529 rc = -EINVAL;
3530 pl1e = guest_map_l1e(v, va, &gl1mfn);
3531 if ( unlikely(!pl1e || !get_page_from_pagenr(gl1mfn, d)) )
3532 goto out;
3534 gl1pg = mfn_to_page(gl1mfn);
3535 if ( !page_lock(gl1pg) )
3537 put_page(gl1pg);
3538 goto out;
3541 if ( (gl1pg->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table )
3543 page_unlock(gl1pg);
3544 put_page(gl1pg);
3545 goto out;
3548 rc = mod_l1_entry(pl1e, val, gl1mfn, 0) ? 0 : -EINVAL;
3550 page_unlock(gl1pg);
3551 put_page(gl1pg);
3553 out:
3554 if ( pl1e )
3555 guest_unmap_l1e(v, pl1e);
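/*
 * Flush decoding: the UVMF_FLUSHTYPE_MASK bits of @flags select no flush,
 * a full TLB flush, or a single-address INVLPG; the remaining bits are
 * UVMF_LOCAL, UVMF_ALL, or a guest pointer to a vCPU bitmap naming which
 * vCPUs' (physical) CPUs to flush.
 */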
3557 switch ( flags & UVMF_FLUSHTYPE_MASK )
3559 case UVMF_TLB_FLUSH:
3560 switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
3562 case UVMF_LOCAL:
3563 this_cpu(percpu_mm_info).deferred_ops |= DOP_FLUSH_TLB;
3564 break;
3565 case UVMF_ALL:
3566 this_cpu(percpu_mm_info).deferred_ops |= DOP_FLUSH_ALL_TLBS;
3567 break;
3568 default:
3569 if ( this_cpu(percpu_mm_info).deferred_ops & DOP_FLUSH_ALL_TLBS )
3570 break;
3571 if ( unlikely(!is_pv_32on64_domain(d) ?
3572 get_user(vmask, (unsigned long *)bmap_ptr) :
3573 get_user(vmask, (unsigned int *)bmap_ptr)) )
3574 rc = -EFAULT, vmask = 0;
3575 pmask = vcpumask_to_pcpumask(d, vmask);
3576 if ( cpu_isset(smp_processor_id(), pmask) )
3577 this_cpu(percpu_mm_info).deferred_ops &= ~DOP_FLUSH_TLB;
3578 flush_tlb_mask(pmask);
3579 break;
3581 break;
3583 case UVMF_INVLPG:
3584 if ( this_cpu(percpu_mm_info).deferred_ops & DOP_FLUSH_ALL_TLBS )
3585 break;
3586 switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
3588 case UVMF_LOCAL:
3589 if ( this_cpu(percpu_mm_info).deferred_ops & DOP_FLUSH_TLB )
3590 break;
3591 if ( !paging_mode_enabled(d) ||
3592 (paging_invlpg(v, va) != 0) )
3593 flush_tlb_one_local(va);
3594 break;
3595 case UVMF_ALL:
3596 flush_tlb_one_mask(d->domain_dirty_cpumask, va);
3597 break;
3598 default:
3599 if ( unlikely(!is_pv_32on64_domain(d) ?
3600 get_user(vmask, (unsigned long *)bmap_ptr) :
3601 get_user(vmask, (unsigned int *)bmap_ptr)) )
3602 rc = -EFAULT, vmask = 0;
3603 pmask = vcpumask_to_pcpumask(d, vmask);
3604 if ( this_cpu(percpu_mm_info).deferred_ops & DOP_FLUSH_TLB )
3605 cpu_clear(smp_processor_id(), pmask);
3606 flush_tlb_one_mask(pmask, va);
3607 break;
3609 break;
3612 process_deferred_ops();
3614 return rc;
3617 int do_update_va_mapping_otherdomain(unsigned long va, u64 val64,
3618 unsigned long flags,
3619 domid_t domid)
3621 int rc;
3623 if ( !set_foreigndom(domid) )
3624 return -ESRCH;
3626 rc = do_update_va_mapping(va, val64, flags);
3628 BUG_ON(this_cpu(percpu_mm_info).deferred_ops);
3629 process_deferred_ops(); /* only to clear foreigndom */
3631 return rc;
3636 /*************************
3637 * Descriptor Tables
3638 */
3640 void destroy_gdt(struct vcpu *v)
3642 int i;
3643 unsigned long pfn;
3645 v->arch.guest_context.gdt_ents = 0;
3646 for ( i = 0; i < FIRST_RESERVED_GDT_PAGE; i++ )
3648 if ( (pfn = l1e_get_pfn(v->arch.perdomain_ptes[i])) != 0 )
3649 put_page_and_type(mfn_to_page(pfn));
3650 l1e_write(&v->arch.perdomain_ptes[i], l1e_empty());
3651 v->arch.guest_context.gdt_frames[i] = 0;
3656 long set_gdt(struct vcpu *v,
3657 unsigned long *frames,
3658 unsigned int entries)
3660 struct domain *d = v->domain;
3661 /* NB. There are 512 8-byte entries per GDT page. */
3662 int i, nr_pages = (entries + 511) / 512;
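/* e.g. entries = 1..512 -> 1 page, 513..1024 -> 2 pages, and so on. */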
3663 unsigned long mfn;
3665 if ( entries > FIRST_RESERVED_GDT_ENTRY )
3666 return -EINVAL;
3668 /* Check the pages in the new GDT. */
3669 for ( i = 0; i < nr_pages; i++ )
3671 mfn = frames[i] = gmfn_to_mfn(d, frames[i]);
3672 if ( !mfn_valid(mfn) ||
3673 !get_page_and_type(mfn_to_page(mfn), d, PGT_seg_desc_page) )
3674 goto fail;
3677 /* Tear down the old GDT. */
3678 destroy_gdt(v);
3680 /* Install the new GDT. */
3681 v->arch.guest_context.gdt_ents = entries;
3682 for ( i = 0; i < nr_pages; i++ )
3684 v->arch.guest_context.gdt_frames[i] = frames[i];
3685 l1e_write(&v->arch.perdomain_ptes[i],
3686 l1e_from_pfn(frames[i], __PAGE_HYPERVISOR));
3689 return 0;
3691 fail:
3692 while ( i-- > 0 )
3693 put_page_and_type(mfn_to_page(frames[i]));
3694 return -EINVAL;
3698 long do_set_gdt(XEN_GUEST_HANDLE(ulong) frame_list, unsigned int entries)
3700 int nr_pages = (entries + 511) / 512;
3701 unsigned long frames[16];
3702 struct vcpu *curr = current;
3703 long ret;
3705 /* Rechecked in set_gdt, but ensures a sane limit for copy_from_user(). */
3706 if ( entries > FIRST_RESERVED_GDT_ENTRY )
3707 return -EINVAL;
3709 if ( copy_from_guest(frames, frame_list, nr_pages) )
3710 return -EFAULT;
3712 domain_lock(curr->domain);
3714 if ( (ret = set_gdt(curr, frames, entries)) == 0 )
3715 flush_tlb_local();
3717 domain_unlock(curr->domain);
3719 return ret;
3723 long do_update_descriptor(u64 pa, u64 desc)
3725 struct domain *dom = current->domain;
3726 unsigned long gmfn = pa >> PAGE_SHIFT;
3727 unsigned long mfn;
3728 unsigned int offset;
3729 struct desc_struct *gdt_pent, d;
3730 struct page_info *page;
3731 long ret = -EINVAL;
3733 offset = ((unsigned int)pa & ~PAGE_MASK) / sizeof(struct desc_struct);
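/* Each descriptor is 8 bytes: e.g. a page offset of 0x10 gives array offset 2. */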
3735 *(u64 *)&d = desc;
3737 mfn = gmfn_to_mfn(dom, gmfn);
3738 if ( (((unsigned int)pa % sizeof(struct desc_struct)) != 0) ||
3739 !mfn_valid(mfn) ||
3740 !check_descriptor(dom, &d) )
3741 return -EINVAL;
3743 page = mfn_to_page(mfn);
3744 if ( unlikely(!get_page(page, dom)) )
3745 return -EINVAL;
3747 /* Check if the given frame is in use in an unsafe context. */
3748 switch ( page->u.inuse.type_info & PGT_type_mask )
3750 case PGT_seg_desc_page:
3751 if ( unlikely(!get_page_type(page, PGT_seg_desc_page)) )
3752 goto out;
3753 break;
3754 default:
3755 if ( unlikely(!get_page_type(page, PGT_writable_page)) )
3756 goto out;
3757 break;
3760 paging_mark_dirty(dom, mfn);
3762 /* All is good so make the update. */
3763 gdt_pent = map_domain_page(mfn);
3764 atomic_write64((uint64_t *)&gdt_pent[offset], *(uint64_t *)&d);
3765 unmap_domain_page(gdt_pent);
3767 put_page_type(page);
3769 ret = 0; /* success */
3771 out:
3772 put_page(page);
3774 return ret;
3777 typedef struct e820entry e820entry_t;
3778 DEFINE_XEN_GUEST_HANDLE(e820entry_t);
3780 long arch_memory_op(int op, XEN_GUEST_HANDLE(void) arg)
3782 struct page_info *page = NULL;
3783 int rc;
3785 switch ( op )
3787 case XENMEM_add_to_physmap:
3789 struct xen_add_to_physmap xatp;
3790 unsigned long prev_mfn, mfn = 0, gpfn;
3791 struct domain *d;
3793 if ( copy_from_guest(&xatp, arg, 1) )
3794 return -EFAULT;
3796 rc = rcu_lock_target_domain_by_id(xatp.domid, &d);
3797 if ( rc != 0 )
3798 return rc;
3800 if ( xsm_add_to_physmap(current->domain, d) )
3802 rcu_unlock_domain(d);
3803 return -EPERM;
3806 switch ( xatp.space )
3808 case XENMAPSPACE_shared_info:
3809 if ( xatp.idx == 0 )
3810 mfn = virt_to_mfn(d->shared_info);
3811 break;
3812 case XENMAPSPACE_grant_table:
3813 spin_lock(&d->grant_table->lock);
3815 if ( (xatp.idx >= nr_grant_frames(d->grant_table)) &&
3816 (xatp.idx < max_nr_grant_frames) )
3817 gnttab_grow_table(d, xatp.idx + 1);
3819 if ( xatp.idx < nr_grant_frames(d->grant_table) )
3820 mfn = virt_to_mfn(d->grant_table->shared[xatp.idx]);
3822 spin_unlock(&d->grant_table->lock);
3823 break;
3824 case XENMAPSPACE_gmfn:
3825 xatp.idx = gmfn_to_mfn(d, xatp.idx);
3826 if ( !get_page_from_pagenr(xatp.idx, d) )
3827 break;
3828 mfn = xatp.idx;
3829 page = mfn_to_page(mfn);
3830 break;
3831 default:
3832 break;
3835 if ( !paging_mode_translate(d) || (mfn == 0) )
3837 if ( page )
3838 put_page(page);
3839 rcu_unlock_domain(d);
3840 return -EINVAL;
3843 domain_lock(d);
3845 /* Remove previously mapped page if it was present. */
3846 prev_mfn = gmfn_to_mfn(d, xatp.gpfn);
3847 if ( mfn_valid(prev_mfn) )
3849 if ( is_xen_heap_mfn(prev_mfn) )
3850 /* Xen heap frames are simply unhooked from this phys slot. */
3851 guest_physmap_remove_page(d, xatp.gpfn, prev_mfn, 0);
3852 else
3853 /* Normal domain memory is freed, to avoid leaking memory. */
3854 guest_remove_page(d, xatp.gpfn);
3857 /* Unmap from old location, if any. */
3858 gpfn = get_gpfn_from_mfn(mfn);
3859 if ( gpfn != INVALID_M2P_ENTRY )
3860 guest_physmap_remove_page(d, gpfn, mfn, 0);
3862 /* Map at new location. */
3863 guest_physmap_add_page(d, xatp.gpfn, mfn, 0);
3865 domain_unlock(d);
3867 if ( page )
3868 put_page(page);
3870 rcu_unlock_domain(d);
3872 break;
3875 case XENMEM_set_memory_map:
3877 struct xen_foreign_memory_map fmap;
3878 struct domain *d;
3879 int rc;
3881 if ( copy_from_guest(&fmap, arg, 1) )
3882 return -EFAULT;
3884 if ( fmap.map.nr_entries > ARRAY_SIZE(d->arch.e820) )
3885 return -EINVAL;
3887 rc = rcu_lock_target_domain_by_id(fmap.domid, &d);
3888 if ( rc != 0 )
3889 return rc;
3891 rc = xsm_domain_memory_map(d);
3892 if ( rc )
3894 rcu_unlock_domain(d);
3895 return rc;
3898 rc = copy_from_guest(d->arch.e820, fmap.map.buffer,
3899 fmap.map.nr_entries) ? -EFAULT : 0;
3900 d->arch.nr_e820 = fmap.map.nr_entries;
3902 rcu_unlock_domain(d);
3903 return rc;
3906 case XENMEM_memory_map:
3908 struct xen_memory_map map;
3909 struct domain *d = current->domain;
3911 /* Backwards compatibility. */
3912 if ( d->arch.nr_e820 == 0 )
3913 return -ENOSYS;
3915 if ( copy_from_guest(&map, arg, 1) )
3916 return -EFAULT;
3918 map.nr_entries = min(map.nr_entries, d->arch.nr_e820);
3919 if ( copy_to_guest(map.buffer, d->arch.e820, map.nr_entries) ||
3920 copy_to_guest(arg, &map, 1) )
3921 return -EFAULT;
3923 return 0;
3926 case XENMEM_machine_memory_map:
3928 struct xen_memory_map memmap;
3929 XEN_GUEST_HANDLE(e820entry_t) buffer;
3930 int count;
3931 int rc;
3933 if ( !IS_PRIV(current->domain) )
3934 return -EINVAL;
3936 rc = xsm_machine_memory_map();
3937 if ( rc )
3938 return rc;
3940 if ( copy_from_guest(&memmap, arg, 1) )
3941 return -EFAULT;
3942 if ( memmap.nr_entries < e820.nr_map + 1 )
3943 return -EINVAL;
3945 buffer = guest_handle_cast(memmap.buffer, e820entry_t);
3947 count = min((unsigned int)e820.nr_map, memmap.nr_entries);
3948 if ( copy_to_guest(buffer, e820.map, count) < 0 )
3949 return -EFAULT;
3951 memmap.nr_entries = count;
3953 if ( copy_to_guest(arg, &memmap, 1) )
3954 return -EFAULT;
3956 return 0;
3959 case XENMEM_machphys_mapping:
3961 static const struct xen_machphys_mapping mapping = {
3962 .v_start = MACH2PHYS_VIRT_START,
3963 .v_end = MACH2PHYS_VIRT_END,
3964 .max_mfn = MACH2PHYS_NR_ENTRIES - 1
3965 };
3967 if ( copy_to_guest(arg, &mapping, 1) )
3968 return -EFAULT;
3970 return 0;
3973 case XENMEM_set_pod_target:
3974 case XENMEM_get_pod_target:
3976 xen_pod_target_t target;
3977 struct domain *d;
3979 /* Support DOMID_SELF? */
3980 if ( !IS_PRIV(current->domain) )
3981 return -EINVAL;
3983 if ( copy_from_guest(&target, arg, 1) )
3984 return -EFAULT;
3986 rc = rcu_lock_target_domain_by_id(target.domid, &d);
3987 if ( rc != 0 )
3988 return rc;
3990 if ( op == XENMEM_set_pod_target )
3992 if ( target.target_pages > d->max_pages )
3994 rc = -EINVAL;
3995 goto pod_target_out_unlock;
3998 rc = p2m_pod_set_mem_target(d, target.target_pages);
4001 target.tot_pages = d->tot_pages;
4002 target.pod_cache_pages = d->arch.p2m->pod.count;
4003 target.pod_entries = d->arch.p2m->pod.entry_count;
4005 if ( copy_to_guest(arg, &target, 1) )
4007 rc = -EFAULT;
4008 goto pod_target_out_unlock;
4011 pod_target_out_unlock:
4012 rcu_unlock_domain(d);
4013 return rc;
4016 default:
4017 return subarch_memory_op(op, arg);
4020 return 0;
4024 /*************************
4025 * Writable Pagetables
4026 */
4028 struct ptwr_emulate_ctxt {
4029 struct x86_emulate_ctxt ctxt;
4030 unsigned long cr2;
4031 l1_pgentry_t pte;
4032 };
4034 static int ptwr_emulated_read(
4035 enum x86_segment seg,
4036 unsigned long offset,
4037 void *p_data,
4038 unsigned int bytes,
4039 struct x86_emulate_ctxt *ctxt)
4041 unsigned int rc;
4042 unsigned long addr = offset;
4044 if ( (rc = copy_from_user(p_data, (void *)addr, bytes)) != 0 )
4046 propagate_page_fault(addr + bytes - rc, 0); /* read fault */
4047 return X86EMUL_EXCEPTION;
4050 return X86EMUL_OKAY;
4053 static int ptwr_emulated_update(
4054 unsigned long addr,
4055 paddr_t old,
4056 paddr_t val,
4057 unsigned int bytes,
4058 unsigned int do_cmpxchg,
4059 struct ptwr_emulate_ctxt *ptwr_ctxt)
4061 unsigned long mfn;
4062 unsigned long unaligned_addr = addr;
4063 struct page_info *page;
4064 l1_pgentry_t pte, ol1e, nl1e, *pl1e;
4065 struct vcpu *v = current;
4066 struct domain *d = v->domain;
4068 /* Only allow naturally-aligned stores within the original %cr2 page. */
4069 if ( unlikely(((addr^ptwr_ctxt->cr2) & PAGE_MASK) || (addr & (bytes-1))) )
4071 MEM_LOG("ptwr_emulate: bad access (cr2=%lx, addr=%lx, bytes=%u)",
4072 ptwr_ctxt->cr2, addr, bytes);
4073 return X86EMUL_UNHANDLEABLE;
4076 /* Turn a sub-word access into a full-word access. */
4077 if ( bytes != sizeof(paddr_t) )
4079 paddr_t full;
4080 unsigned int rc, offset = addr & (sizeof(paddr_t)-1);
4082 /* Align address; read full word. */
4083 addr &= ~(sizeof(paddr_t)-1);
4084 if ( (rc = copy_from_user(&full, (void *)addr, sizeof(paddr_t))) != 0 )
4086 propagate_page_fault(addr+sizeof(paddr_t)-rc, 0); /* read fault */
4087 return X86EMUL_EXCEPTION;
4089 /* Mask out bits provided by caller. */
4090 full &= ~((((paddr_t)1 << (bytes*8)) - 1) << (offset*8));
4091 /* Shift the caller value and OR in the missing bits. */
4092 val &= (((paddr_t)1 << (bytes*8)) - 1);
4093 val <<= (offset)*8;
4094 val |= full;
4095 /* Also fill in missing parts of the cmpxchg old value. */
4096 old &= (((paddr_t)1 << (bytes*8)) - 1);
4097 old <<= (offset)*8;
4098 old |= full;
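/*
 * Worked example with 8-byte PTEs: a 4-byte write of 0xdeadbeef to the
 * upper half of an entry arrives with bytes == 4 and offset == 4.  We read
 * the full entry, clear its top 32 bits, and OR in 0xdeadbeef00000000, so
 * 'val' (and, for cmpxchg, 'old') now describe the whole entry.
 */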
4101 pte = ptwr_ctxt->pte;
4102 mfn = l1e_get_pfn(pte);
4103 page = mfn_to_page(mfn);
4105 /* We are looking only for read-only mappings of p.t. pages. */
4106 ASSERT((l1e_get_flags(pte) & (_PAGE_RW|_PAGE_PRESENT)) == _PAGE_PRESENT);
4107 ASSERT(mfn_valid(mfn));
4108 ASSERT((page->u.inuse.type_info & PGT_type_mask) == PGT_l1_page_table);
4109 ASSERT((page->u.inuse.type_info & PGT_count_mask) != 0);
4110 ASSERT(page_get_owner(page) == d);
4112 /* Check the new PTE. */
4113 nl1e = l1e_from_intpte(val);
4114 if ( unlikely(!get_page_from_l1e(nl1e, d)) )
4116 if ( is_pv_32bit_domain(d) && (bytes == 4) && (unaligned_addr & 4) &&
4117 !do_cmpxchg && (l1e_get_flags(nl1e) & _PAGE_PRESENT) )
4119 /*
4120 * If this is an upper-half write to a PAE PTE then we assume that
4121 * the guest has simply got the two writes the wrong way round. We
4122 * zap the PRESENT bit on the assumption that the bottom half will
4123 * be written immediately after we return to the guest.
4124 */
4125 MEM_LOG("ptwr_emulate: fixing up invalid PAE PTE %"PRIpte,
4126 l1e_get_intpte(nl1e));
4127 l1e_remove_flags(nl1e, _PAGE_PRESENT);
4129 else
4131 MEM_LOG("ptwr_emulate: could not get_page_from_l1e()");
4132 return X86EMUL_UNHANDLEABLE;
4136 adjust_guest_l1e(nl1e, d);
4138 /* Checked successfully: do the update (write or cmpxchg). */
4139 pl1e = map_domain_page(mfn);
4140 pl1e = (l1_pgentry_t *)((unsigned long)pl1e + (addr & ~PAGE_MASK));
4141 if ( do_cmpxchg )
4143 int okay;
4144 intpte_t t = old;
4145 ol1e = l1e_from_intpte(old);
4147 okay = paging_cmpxchg_guest_entry(v, &l1e_get_intpte(*pl1e),
4148 &t, l1e_get_intpte(nl1e), _mfn(mfn));
4149 okay = (okay && t == old);
4151 if ( !okay )
4153 unmap_domain_page(pl1e);
4154 put_page_from_l1e(nl1e, d);
4155 return X86EMUL_CMPXCHG_FAILED;
4158 else
4160 ol1e = *pl1e;
4161 if ( !UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, mfn, v, 0) )
4162 BUG();
4165 trace_ptwr_emulation(addr, nl1e);
4167 unmap_domain_page(pl1e);
4169 /* Finally, drop the old PTE. */
4170 put_page_from_l1e(ol1e, d);
4172 return X86EMUL_OKAY;
4175 static int ptwr_emulated_write(
4176 enum x86_segment seg,
4177 unsigned long offset,
4178 void *p_data,
4179 unsigned int bytes,
4180 struct x86_emulate_ctxt *ctxt)
4182 paddr_t val = 0;
4184 if ( (bytes > sizeof(paddr_t)) || (bytes & (bytes - 1)) )
4186 MEM_LOG("ptwr_emulate: bad write size (addr=%lx, bytes=%u)",
4187 offset, bytes);
4188 return X86EMUL_UNHANDLEABLE;
4191 memcpy(&val, p_data, bytes);
4193 return ptwr_emulated_update(
4194 offset, 0, val, bytes, 0,
4195 container_of(ctxt, struct ptwr_emulate_ctxt, ctxt));
4198 static int ptwr_emulated_cmpxchg(
4199 enum x86_segment seg,
4200 unsigned long offset,
4201 void *p_old,
4202 void *p_new,
4203 unsigned int bytes,
4204 struct x86_emulate_ctxt *ctxt)
4206 paddr_t old = 0, new = 0;
4208 if ( (bytes > sizeof(paddr_t)) || (bytes & (bytes - 1)) )
4210 MEM_LOG("ptwr_emulate: bad cmpxchg size (addr=%lx, bytes=%u)",
4211 offset, bytes);
4212 return X86EMUL_UNHANDLEABLE;
4215 memcpy(&old, p_old, bytes);
4216 memcpy(&new, p_new, bytes);
4218 return ptwr_emulated_update(
4219 offset, old, new, bytes, 1,
4220 container_of(ctxt, struct ptwr_emulate_ctxt, ctxt));
4223 static struct x86_emulate_ops ptwr_emulate_ops = {
4224 .read = ptwr_emulated_read,
4225 .insn_fetch = ptwr_emulated_read,
4226 .write = ptwr_emulated_write,
4227 .cmpxchg = ptwr_emulated_cmpxchg,
4228 };
4230 /* Write page fault handler: check if guest is trying to modify a PTE. */
4231 int ptwr_do_page_fault(struct vcpu *v, unsigned long addr,
4232 struct cpu_user_regs *regs)
4234 struct domain *d = v->domain;
4235 struct page_info *page;
4236 l1_pgentry_t pte;
4237 struct ptwr_emulate_ctxt ptwr_ctxt;
4238 int rc;
4240 /* Attempt to read the PTE that maps the VA being accessed. */
4241 guest_get_eff_l1e(v, addr, &pte);
4243 /* We are looking only for read-only mappings of p.t. pages. */
4244 if ( ((l1e_get_flags(pte) & (_PAGE_PRESENT|_PAGE_RW)) != _PAGE_PRESENT) ||
4245 !get_page_from_pagenr(l1e_get_pfn(pte), d) )
4246 goto bail;
4248 page = l1e_get_page(pte);
4249 if ( !page_lock(page) )
4251 put_page(page);
4252 goto bail;
4255 if ( (page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table )
4257 page_unlock(page);
4258 put_page(page);
4259 goto bail;
4262 ptwr_ctxt.ctxt.regs = regs;
4263 ptwr_ctxt.ctxt.force_writeback = 0;
4264 ptwr_ctxt.ctxt.addr_size = ptwr_ctxt.ctxt.sp_size =
4265 is_pv_32on64_domain(d) ? 32 : BITS_PER_LONG;
4266 ptwr_ctxt.cr2 = addr;
4267 ptwr_ctxt.pte = pte;
4269 rc = x86_emulate(&ptwr_ctxt.ctxt, &ptwr_emulate_ops);
4271 page_unlock(page);
4272 put_page(page);
4274 if ( rc == X86EMUL_UNHANDLEABLE )
4275 goto bail;
4277 perfc_incr(ptwr_emulations);
4278 return EXCRET_fault_fixed;
4280 bail:
4281 return 0;
4284 void free_xen_pagetable(void *v)
4286 extern int early_boot;
4288 if ( early_boot )
4289 return;
4291 if ( is_xen_heap_page(virt_to_page(v)) )
4292 free_xenheap_page(v);
4293 else
4294 free_domheap_page(virt_to_page(v));
4297 /* Convert to/from superpage-mapping flags for map_pages_to_xen(). */
4298 #define l1f_to_lNf(f) (((f) & _PAGE_PRESENT) ? ((f) | _PAGE_PSE) : (f))
4299 #define lNf_to_l1f(f) (((f) & _PAGE_PRESENT) ? ((f) & ~_PAGE_PSE) : (f))
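/*
 * Note: bit 7 is interpreted by the hardware as PAT in a 4kB L1 entry but
 * as the superpage (PSE) bit in L2/L3 entries, which is why callers passing
 * _PAGE_PAT are never given superpage mappings below; non-present flag
 * words pass through unchanged.
 */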
4301 /*
4302 * map_pages_to_xen() can be called with interrupts disabled:
4303 * * During early bootstrap; or
4304 * * alloc_xenheap_pages() via memguard_guard_range
4305 * In these cases it is safe to use flush_area_local():
4306 * * Because only the local CPU is online; or
4307 * * Because stale TLB entries do not matter for memguard_[un]guard_range().
4308 */
4309 #define flush_area(v,f) (!local_irq_is_enabled() ? \
4310 flush_area_local((const void *)v, f) : \
4311 flush_area_all((const void *)v, f))
4313 int map_pages_to_xen(
4314 unsigned long virt,
4315 unsigned long mfn,
4316 unsigned long nr_mfns,
4317 unsigned int flags)
4319 l2_pgentry_t *pl2e, ol2e;
4320 l1_pgentry_t *pl1e, ol1e;
4321 unsigned int i;
4323 while ( nr_mfns != 0 )
4325 #ifdef __x86_64__
4326 l3_pgentry_t *pl3e = virt_to_xen_l3e(virt);
4327 l3_pgentry_t ol3e = *pl3e;
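/*
 * Mapping-size selection, sketched: a 1GB superpage is used only if the CPU
 * supports it, both the virtual address and the MFN are 1GB-aligned, at
 * least a full 1GB of frames remains, and no PAT/small-page override was
 * requested; otherwise we fall through to the 2MB and then 4kB cases
 * further down, shattering any existing larger mapping that cannot be
 * reused.
 */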
4329 if ( cpu_has_page1gb &&
4330 !(((virt >> PAGE_SHIFT) | mfn) &
4331 ((1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1)) &&
4332 nr_mfns >= (1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) &&
4333 !(flags & (_PAGE_PAT | MAP_SMALL_PAGES)) )
4335 /* 1GB-page mapping. */
4336 l3e_write_atomic(pl3e, l3e_from_pfn(mfn, l1f_to_lNf(flags)));
4338 if ( (l3e_get_flags(ol3e) & _PAGE_PRESENT) )
4340 unsigned int flush_flags =
4341 FLUSH_TLB | FLUSH_ORDER(2 * PAGETABLE_ORDER);
4343 if ( l3e_get_flags(ol3e) & _PAGE_PSE )
4345 if ( l3e_get_flags(ol3e) & _PAGE_GLOBAL )
4346 flush_flags |= FLUSH_TLB_GLOBAL;
4347 if ( (lNf_to_l1f(l3e_get_flags(ol3e)) ^ flags) &
4348 PAGE_CACHE_ATTRS )
4349 flush_flags |= FLUSH_CACHE;
4350 flush_area(virt, flush_flags);
4352 else
4354 pl2e = l3e_to_l2e(ol3e);
4355 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
4357 ol2e = pl2e[i];
4358 if ( !(l2e_get_flags(ol2e) & _PAGE_PRESENT) )
4359 continue;
4360 if ( l2e_get_flags(ol2e) & _PAGE_PSE )
4362 if ( l2e_get_flags(ol2e) & _PAGE_GLOBAL )
4363 flush_flags |= FLUSH_TLB_GLOBAL;
4364 if ( (lNf_to_l1f(l2e_get_flags(ol2e)) ^ flags) &
4365 PAGE_CACHE_ATTRS )
4366 flush_flags |= FLUSH_CACHE;
4368 else
4370 unsigned int j;
4372 pl1e = l2e_to_l1e(ol2e);
4373 for ( j = 0; j < L1_PAGETABLE_ENTRIES; j++ )
4375 ol1e = pl1e[j];
4376 if ( l1e_get_flags(ol1e) & _PAGE_GLOBAL )
4377 flush_flags |= FLUSH_TLB_GLOBAL;
4378 if ( (l1e_get_flags(ol1e) ^ flags) &
4379 PAGE_CACHE_ATTRS )
4380 flush_flags |= FLUSH_CACHE;
4384 flush_area(virt, flush_flags);
4385 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
4387 ol2e = pl2e[i];
4388 if ( (l2e_get_flags(ol2e) & _PAGE_PRESENT) &&
4389 !(l2e_get_flags(ol2e) & _PAGE_PSE) )
4390 free_xen_pagetable(l2e_to_l1e(ol2e));
4392 free_xen_pagetable(pl2e);
4396 virt += 1UL << L3_PAGETABLE_SHIFT;
4397 mfn += 1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT);
4398 nr_mfns -= 1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT);
4399 continue;
4402 if ( (l3e_get_flags(ol3e) & _PAGE_PRESENT) &&
4403 (l3e_get_flags(ol3e) & _PAGE_PSE) )
4405 unsigned int flush_flags =
4406 FLUSH_TLB | FLUSH_ORDER(2 * PAGETABLE_ORDER);
4408 /* Skip this PTE if there is no change. */
4409 if ( ((l3e_get_pfn(ol3e) & ~(L2_PAGETABLE_ENTRIES *
4410 L1_PAGETABLE_ENTRIES - 1)) +
4411 (l2_table_offset(virt) << PAGETABLE_ORDER) +
4412 l1_table_offset(virt) == mfn) &&
4413 ((lNf_to_l1f(l3e_get_flags(ol3e)) ^ flags) &
4414 ~(_PAGE_ACCESSED|_PAGE_DIRTY)) == 0 )
4416 /* We can skip to the end of the L3 superpage if we got a match. */
4417 i = (1 << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) -
4418 (mfn & ((1 << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1));
4419 if ( i > nr_mfns )
4420 i = nr_mfns;
4421 virt += i << PAGE_SHIFT;
4422 mfn += i;
4423 nr_mfns -= i;
4424 continue;
4427 pl2e = alloc_xen_pagetable();
4428 if ( pl2e == NULL )
4429 return -ENOMEM;
4431 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
4432 l2e_write(pl2e + i,
4433 l2e_from_pfn(l3e_get_pfn(ol3e) +
4434 (i << PAGETABLE_ORDER),
4435 l3e_get_flags(ol3e)));
4437 if ( l3e_get_flags(ol3e) & _PAGE_GLOBAL )
4438 flush_flags |= FLUSH_TLB_GLOBAL;
4440 l3e_write_atomic(pl3e, l3e_from_pfn(virt_to_mfn(pl2e),
4441 __PAGE_HYPERVISOR));
4442 flush_area(virt, flush_flags);
4444 #endif
4446 pl2e = virt_to_xen_l2e(virt);
4448 if ( ((((virt>>PAGE_SHIFT) | mfn) & ((1<<PAGETABLE_ORDER)-1)) == 0) &&
4449 (nr_mfns >= (1<<PAGETABLE_ORDER)) &&
4450 !(flags & (_PAGE_PAT|MAP_SMALL_PAGES)) )
4452 /* Super-page mapping. */
4453 ol2e = *pl2e;
4454 l2e_write_atomic(pl2e, l2e_from_pfn(mfn, l1f_to_lNf(flags)));
4456 if ( (l2e_get_flags(ol2e) & _PAGE_PRESENT) )
4458 unsigned int flush_flags =
4459 FLUSH_TLB | FLUSH_ORDER(PAGETABLE_ORDER);
4461 if ( l2e_get_flags(ol2e) & _PAGE_PSE )
4463 if ( l2e_get_flags(ol2e) & _PAGE_GLOBAL )
4464 flush_flags |= FLUSH_TLB_GLOBAL;
4465 if ( (lNf_to_l1f(l2e_get_flags(ol2e)) ^ flags) &
4466 PAGE_CACHE_ATTRS )
4467 flush_flags |= FLUSH_CACHE;
4468 flush_area(virt, flush_flags);
4470 else
4472 pl1e = l2e_to_l1e(ol2e);
4473 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
4475 if ( l1e_get_flags(pl1e[i]) & _PAGE_GLOBAL )
4476 flush_flags |= FLUSH_TLB_GLOBAL;
4477 if ( (l1e_get_flags(pl1e[i]) ^ flags) &
4478 PAGE_CACHE_ATTRS )
4479 flush_flags |= FLUSH_CACHE;
4481 flush_area(virt, flush_flags);
4482 free_xen_pagetable(pl1e);
4486 virt += 1UL << L2_PAGETABLE_SHIFT;
4487 mfn += 1UL << PAGETABLE_ORDER;
4488 nr_mfns -= 1UL << PAGETABLE_ORDER;
4490 else
4492 /* Normal page mapping. */
4493 if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) )
4495 pl1e = alloc_xen_pagetable();
4496 if ( pl1e == NULL )
4497 return -ENOMEM;
4498 clear_page(pl1e);
4499 l2e_write(pl2e, l2e_from_pfn(virt_to_mfn(pl1e),
4500 __PAGE_HYPERVISOR));
4502 else if ( l2e_get_flags(*pl2e) & _PAGE_PSE )
4504 unsigned int flush_flags =
4505 FLUSH_TLB | FLUSH_ORDER(PAGETABLE_ORDER);
4507 /* Skip this PTE if there is no change. */
4508 if ( (((l2e_get_pfn(*pl2e) & ~(L1_PAGETABLE_ENTRIES - 1)) +
4509 l1_table_offset(virt)) == mfn) &&
4510 (((lNf_to_l1f(l2e_get_flags(*pl2e)) ^ flags) &
4511 ~(_PAGE_ACCESSED|_PAGE_DIRTY)) == 0) )
4513 /* We can skip to the end of the L2 superpage if we got a match. */
4514 i = (1 << (L2_PAGETABLE_SHIFT - PAGE_SHIFT)) -
4515 (mfn & ((1 << (L2_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1));
4516 if ( i > nr_mfns )
4517 i = nr_mfns;
4518 virt += i << L1_PAGETABLE_SHIFT;
4519 mfn += i;
4520 nr_mfns -= i;
4521 goto check_l3;
4524 pl1e = alloc_xen_pagetable();
4525 if ( pl1e == NULL )
4526 return -ENOMEM;
4528 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
4529 l1e_write(&pl1e[i],
4530 l1e_from_pfn(l2e_get_pfn(*pl2e) + i,
4531 lNf_to_l1f(l2e_get_flags(*pl2e))));
4533 if ( l2e_get_flags(*pl2e) & _PAGE_GLOBAL )
4534 flush_flags |= FLUSH_TLB_GLOBAL;
4536 l2e_write_atomic(pl2e, l2e_from_pfn(virt_to_mfn(pl1e),
4537 __PAGE_HYPERVISOR));
4538 flush_area(virt, flush_flags);
4541 pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(virt);
4542 ol1e = *pl1e;
4543 l1e_write_atomic(pl1e, l1e_from_pfn(mfn, flags));
4544 if ( (l1e_get_flags(ol1e) & _PAGE_PRESENT) )
4546 unsigned int flush_flags = FLUSH_TLB | FLUSH_ORDER(0);
4547 if ( l1e_get_flags(ol1e) & _PAGE_GLOBAL )
4548 flush_flags |= FLUSH_TLB_GLOBAL;
4549 if ( (l1e_get_flags(ol1e) ^ flags) & PAGE_CACHE_ATTRS )
4550 flush_flags |= FLUSH_CACHE;
4551 flush_area(virt, flush_flags);
4554 virt += 1UL << L1_PAGETABLE_SHIFT;
4555 mfn += 1UL;
4556 nr_mfns -= 1UL;
4558 if ( (flags == PAGE_HYPERVISOR) &&
4559 ((nr_mfns == 0) ||
4560 ((((virt >> PAGE_SHIFT) | mfn) &
4561 ((1 << PAGETABLE_ORDER) - 1)) == 0)) )
4563 unsigned long base_mfn;
4564 pl1e = l2e_to_l1e(*pl2e);
4565 base_mfn = l1e_get_pfn(*pl1e) & ~(L1_PAGETABLE_ENTRIES - 1);
4566 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++, pl1e++ )
4567 if ( (l1e_get_pfn(*pl1e) != (base_mfn + i)) ||
4568 (l1e_get_flags(*pl1e) != flags) )
4569 break;
4570 if ( i == L1_PAGETABLE_ENTRIES )
4572 ol2e = *pl2e;
4573 l2e_write_atomic(pl2e, l2e_from_pfn(base_mfn,
4574 l1f_to_lNf(flags)));
4575 flush_area(virt - PAGE_SIZE,
4576 FLUSH_TLB_GLOBAL |
4577 FLUSH_ORDER(PAGETABLE_ORDER));
4578 free_xen_pagetable(l2e_to_l1e(ol2e));
4583 check_l3: ;
4584 #ifdef __x86_64__
4585 if ( cpu_has_page1gb &&
4586 (flags == PAGE_HYPERVISOR) &&
4587 ((nr_mfns == 0) ||
4588 !(((virt >> PAGE_SHIFT) | mfn) &
4589 ((1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1))) )
4591 unsigned long base_mfn;
4593 ol3e = *pl3e;
4594 pl2e = l3e_to_l2e(ol3e);
4595 base_mfn = l2e_get_pfn(*pl2e) & ~(L2_PAGETABLE_ENTRIES *
4596 L1_PAGETABLE_ENTRIES - 1);
4597 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++, pl2e++ )
4598 if ( (l2e_get_pfn(*pl2e) !=
4599 (base_mfn + (i << PAGETABLE_ORDER))) ||
4600 (l2e_get_flags(*pl2e) != l1f_to_lNf(flags)) )
4601 break;
4602 if ( i == L2_PAGETABLE_ENTRIES )
4604 l3e_write_atomic(pl3e, l3e_from_pfn(base_mfn,
4605 l1f_to_lNf(flags)));
4606 flush_area(virt - PAGE_SIZE,
4607 FLUSH_TLB_GLOBAL |
4608 FLUSH_ORDER(2*PAGETABLE_ORDER));
4609 free_xen_pagetable(l3e_to_l2e(ol3e));
4612 #endif
4615 return 0;
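/*
 * Illustrative sketch (hypothetical caller): map nr machine frames starting
 * at mfn at virtual address virt.  If virt, mfn and nr are 2MB-aligned (or
 * 1GB-aligned on capable CPUs) and neither _PAGE_PAT nor MAP_SMALL_PAGES is
 * requested, superpage mappings are installed automatically.
 */
static int example_map_frames(unsigned long virt, unsigned long mfn,
                              unsigned long nr)
{
    /* -ENOMEM means an intermediate pagetable could not be allocated;
     * entries installed before the failure are left in place. */
    return map_pages_to_xen(virt, mfn, nr, PAGE_HYPERVISOR);
}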
4618 void destroy_xen_mappings(unsigned long s, unsigned long e)
4620 l2_pgentry_t *pl2e;
4621 l1_pgentry_t *pl1e;
4622 unsigned int i;
4623 unsigned long v = s;
4625 ASSERT((s & ~PAGE_MASK) == 0);
4626 ASSERT((e & ~PAGE_MASK) == 0);
4628 while ( v < e )
4630 #ifdef __x86_64__
4631 l3_pgentry_t *pl3e = virt_to_xen_l3e(v);
4633 if ( !(l3e_get_flags(*pl3e) & _PAGE_PRESENT) )
4635 v += 1UL << L3_PAGETABLE_SHIFT;
4636 v &= ~((1UL << L3_PAGETABLE_SHIFT) - 1);
4637 continue;
4640 if ( l3e_get_flags(*pl3e) & _PAGE_PSE )
4642 if ( l2_table_offset(v) == 0 &&
4643 l1_table_offset(v) == 0 &&
4644 ((e - v) >= (1UL << L3_PAGETABLE_SHIFT)) )
4646 /* PAGE1GB: whole superpage is destroyed. */
4647 l3e_write_atomic(pl3e, l3e_empty());
4648 v += 1UL << L3_PAGETABLE_SHIFT;
4649 continue;
4652 /* PAGE1GB: shatter the superpage and fall through. */
4653 pl2e = alloc_xen_pagetable();
4654 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
4655 l2e_write(pl2e + i,
4656 l2e_from_pfn(l3e_get_pfn(*pl3e) +
4657 (i << PAGETABLE_ORDER),
4658 l3e_get_flags(*pl3e)));
4659 l3e_write_atomic(pl3e, l3e_from_pfn(virt_to_mfn(pl2e),
4660 __PAGE_HYPERVISOR));
4662 #endif
4664 pl2e = virt_to_xen_l2e(v);
4666 if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) )
4668 v += 1UL << L2_PAGETABLE_SHIFT;
4669 v &= ~((1UL << L2_PAGETABLE_SHIFT) - 1);
4670 continue;
4673 if ( l2e_get_flags(*pl2e) & _PAGE_PSE )
4675 if ( (l1_table_offset(v) == 0) &&
4676 ((e-v) >= (1UL << L2_PAGETABLE_SHIFT)) )
4678 /* PSE: whole superpage is destroyed. */
4679 l2e_write_atomic(pl2e, l2e_empty());
4680 v += 1UL << L2_PAGETABLE_SHIFT;
4682 else
4684 /* PSE: shatter the superpage and try again. */
4685 pl1e = alloc_xen_pagetable();
4686 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
4687 l1e_write(&pl1e[i],
4688 l1e_from_pfn(l2e_get_pfn(*pl2e) + i,
4689 l2e_get_flags(*pl2e) & ~_PAGE_PSE));
4690 l2e_write_atomic(pl2e, l2e_from_pfn(virt_to_mfn(pl1e),
4691 __PAGE_HYPERVISOR));
4694 else
4696 /* Ordinary 4kB mapping. */
4697 pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(v);
4698 l1e_write_atomic(pl1e, l1e_empty());
4699 v += PAGE_SIZE;
4701 /* If we are done with the L2E, check if it is now empty. */
4702 if ( (v != e) && (l1_table_offset(v) != 0) )
4703 continue;
4704 pl1e = l2e_to_l1e(*pl2e);
4705 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
4706 if ( l1e_get_intpte(pl1e[i]) != 0 )
4707 break;
4708 if ( i == L1_PAGETABLE_ENTRIES )
4710 /* Empty: zap the L2E and free the L1 page. */
4711 l2e_write_atomic(pl2e, l2e_empty());
4712 flush_area(NULL, FLUSH_TLB_GLOBAL); /* flush before free */
4713 free_xen_pagetable(pl1e);
4717 #ifdef __x86_64__
4718 /* If we are done with the L3E, check if it is now empty. */
4719 if ( (v != e) && (l2_table_offset(v) + l1_table_offset(v) != 0) )
4720 continue;
4721 pl2e = l3e_to_l2e(*pl3e);
4722 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
4723 if ( l2e_get_intpte(pl2e[i]) != 0 )
4724 break;
4725 if ( i == L2_PAGETABLE_ENTRIES )
4727 /* Empty: zap the L3E and free the L2 page. */
4728 l3e_write_atomic(pl3e, l3e_empty());
4729 flush_area(NULL, FLUSH_TLB_GLOBAL); /* flush before free */
4730 free_xen_pagetable(pl2e);
4732 #endif
4735 flush_area(NULL, FLUSH_TLB_GLOBAL);
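/*
 * Illustrative sketch (hypothetical caller): the usual teardown pairing
 * with map_pages_to_xen().  Both bounds must be page-aligned; superpages
 * straddling a bound are shattered, and intermediate pagetables that
 * become empty are freed.
 */
static void example_unmap_frames(unsigned long vaddr, unsigned long npages)
{
    destroy_xen_mappings(vaddr, vaddr + (npages << PAGE_SHIFT));
}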
4738 void __set_fixmap(
4739 enum fixed_addresses idx, unsigned long mfn, unsigned long flags)
4741 BUG_ON(idx >= __end_of_fixed_addresses);
4742 map_pages_to_xen(fix_to_virt(idx), mfn, 1, flags);
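/*
 * Illustrative sketch (FIX_EXAMPLE_IDX is a made-up fixmap slot): a fixmap
 * user installs a single frame at its compile-time virtual slot and then
 * accesses it via fix_to_virt().
 */
static void *example_fixmap(unsigned long mfn)
{
    __set_fixmap(FIX_EXAMPLE_IDX, mfn, PAGE_HYPERVISOR);
    return (void *)fix_to_virt(FIX_EXAMPLE_IDX);
}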
4745 #ifdef MEMORY_GUARD
4747 void memguard_init(void)
4749 unsigned long start = max_t(unsigned long, xen_phys_start, 1UL << 20);
4750 #ifdef __i386__
4751 map_pages_to_xen(
4752 (unsigned long)__va(start),
4753 start >> PAGE_SHIFT,
4754 (xenheap_phys_end - start) >> PAGE_SHIFT,
4755 __PAGE_HYPERVISOR|MAP_SMALL_PAGES);
4756 #else
4757 map_pages_to_xen(
4758 (unsigned long)__va(start),
4759 start >> PAGE_SHIFT,
4760 (__pa(&_end) + PAGE_SIZE - 1 - start) >> PAGE_SHIFT,
4761 __PAGE_HYPERVISOR|MAP_SMALL_PAGES);
4762 BUG_ON(start != xen_phys_start);
4763 map_pages_to_xen(
4764 XEN_VIRT_START,
4765 start >> PAGE_SHIFT,
4766 (__pa(&_end) + PAGE_SIZE - 1 - start) >> PAGE_SHIFT,
4767 __PAGE_HYPERVISOR|MAP_SMALL_PAGES);
4768 #endif
4771 static void __memguard_change_range(void *p, unsigned long l, int guard)
4773 unsigned long _p = (unsigned long)p;
4774 unsigned long _l = (unsigned long)l;
4775 unsigned int flags = __PAGE_HYPERVISOR | MAP_SMALL_PAGES;
4777 /* Ensure we are dealing with a page-aligned whole number of pages. */
4778 ASSERT((_p&~PAGE_MASK) == 0);
4779 ASSERT((_l&~PAGE_MASK) == 0);
4781 if ( guard )
4782 flags &= ~_PAGE_PRESENT;
4784 map_pages_to_xen(
4785 _p, virt_to_maddr(p) >> PAGE_SHIFT, _l >> PAGE_SHIFT, flags);
4788 void memguard_guard_range(void *p, unsigned long l)
4790 __memguard_change_range(p, l, 1);
4793 void memguard_unguard_range(void *p, unsigned long l)
4795 __memguard_change_range(p, l, 0);
4798 #endif
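/*
 * Illustrative sketch (hypothetical caller): memguard remaps a range with
 * _PAGE_PRESENT cleared (guard) or restored (unguard), so stray accesses
 * while the guard is in place fault inside the hypervisor.  The range must
 * be a page-aligned whole number of pages.
 */
#ifdef MEMORY_GUARD
static void example_guard_scratch(void *buf, unsigned long bytes)
{
    memguard_guard_range(buf, bytes);     /* accesses now fault */
    /* ... catch use-after-free or overruns here ... */
    memguard_unguard_range(buf, bytes);   /* accesses valid again */
}
#endif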
4800 void memguard_guard_stack(void *p)
4802 BUILD_BUG_ON((PRIMARY_STACK_SIZE + PAGE_SIZE) > STACK_SIZE);
4803 p = (void *)((unsigned long)p + STACK_SIZE -
4804 PRIMARY_STACK_SIZE - PAGE_SIZE);
4805 memguard_guard_range(p, PAGE_SIZE);
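/*
 * Layout note (inferred from the arithmetic above; treat as a sketch):
 * the primary stack presumably occupies the top PRIMARY_STACK_SIZE bytes
 * of the STACK_SIZE region, and the single page immediately below it, at
 * offset STACK_SIZE - PRIMARY_STACK_SIZE - PAGE_SIZE, is made not present.
 * A primary stack overflowing downwards therefore hits the guard page and
 * faults instead of silently corrupting whatever sits below it.
 */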
4808 /*
4809 * Local variables:
4810 * mode: C
4811 * c-set-style: "BSD"
4812 * c-basic-offset: 4
4813 * tab-width: 4
4814 * indent-tabs-mode: nil
4815 * End:
4816 */