ia64/xen-unstable

view xen/arch/x86/mm.c @ 19632:b0966b6f5180

x86-64: also handle virtual aliases of Xen image pages

With the unification of the heaps, the pages freed from the Xen boot
image can now also end up being allocated to a domain, and hence the
respective aliases need handling when such pages get their
cacheability attributes changed.

Rather than establishing multiple mappings with non-WB attributes
(which can still cause transient aliasing issues), simply unmap those
pages from the Xen virtual space, and re-map them (to allow any
large-page mappings to be re-established) when the cacheability
attribute for them gets restored to normal (WB).

Signed-off-by: Jan Beulich <jbeulich@novell.com>
author Keir Fraser <keir.fraser@citrix.com>
date Wed May 20 15:38:34 2009 +0100 (2009-05-20)
parents cafab2084410
children f210a633571c
line source
1 /******************************************************************************
2 * arch/x86/mm.c
3 *
4 * Copyright (c) 2002-2005 K A Fraser
5 * Copyright (c) 2004 Christian Limpach
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
22 /*
23 * A description of the x86 page table API:
24 *
25 * Domains trap to do_mmu_update with a list of update requests.
26 * This is a list of (ptr, val) pairs, where the requested operation
27 * is *ptr = val.
28 *
29 * Reference counting of pages:
30 * ----------------------------
31 * Each page has two refcounts: tot_count and type_count.
32 *
33 * TOT_COUNT is the obvious reference count. It counts all uses of a
34 * physical page frame by a domain, including uses as a page directory,
35 * a page table, or simple mappings via a PTE. This count prevents a
36 * domain from releasing a frame back to the free pool when it still holds
37 * a reference to it.
38 *
39 * TYPE_COUNT is more subtle. A frame can be put to one of three
40 * mutually-exclusive uses: it might be used as a page directory, or a
41 * page table, or it may be mapped writable by the domain [of course, a
42 * frame might not be used in any of these three ways!].
43 * So, type_count is a count of the number of times a frame is being
44 * referred to in its current incarnation. Therefore, a page can only
45 * change its type when its type count is zero.
46 *
47 * Pinning the page type:
48 * ----------------------
49 * The type of a page can be pinned/unpinned with the commands
50 * MMUEXT_[UN]PIN_L?_TABLE. Each page can be pinned exactly once (that is,
51 * pinning is not reference counted, so it can't be nested).
52 * This is useful to prevent a page's type count falling to zero, at which
53 * point safety checks would need to be carried out next time the count
54 * is increased again.
55 *
56 * A further note on writable page mappings:
57 * -----------------------------------------
58 * For simplicity, the count of writable mappings for a page may not
59 * correspond to reality. The 'writable count' is incremented for every
60 * PTE which maps the page with the _PAGE_RW flag set. However, for
61 * write access to be possible the page directory entry must also have
62 * its _PAGE_RW bit set. We do not check this as it complicates the
63 * reference counting considerably [consider the case of multiple
64 * directory entries referencing a single page table, some with the RW
65 * bit set, others not -- it starts getting a bit messy].
66 * In normal use, this simplification shouldn't be a problem.
67 * However, the logic can be added if required.
68 *
69 * One more note on read-only page mappings:
70 * -----------------------------------------
71 * We want domains to be able to map pages for read-only access. The
72 * main reason is that page tables and directories should be readable
73 * by a domain, but it would not be safe for them to be writable.
74 * However, domains have free access to rings 1 & 2 of the Intel
75 * privilege model. In terms of page protection, these are considered
76 * to be part of 'supervisor mode'. The WP bit in CR0 controls whether
77 * read-only restrictions are respected in supervisor mode -- if the
78 * bit is clear then any mapped page is writable.
79 *
80 * We get round this by always setting the WP bit and disallowing
81 * updates to it. This is very unlikely to cause a problem for guest
82 * OSes, which will generally use the WP bit to simplify copy-on-write
83 * implementation (in that case, the OS wants a fault when it writes to
84 * an application-supplied buffer).
85 */
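/*
 * Illustrative guest-side sketch (not part of mm.c): submitting one
 * (ptr, val) request through the interface described above. It assumes
 * the guest's Xen interface headers (struct mmu_update, DOMID_SELF,
 * MMU_NORMAL_PT_UPDATE) and the usual HYPERVISOR_mmu_update() wrapper;
 * the wrapper's exact declaration is guest-OS specific.
 */
static inline int example_update_pte(uint64_t pte_maddr, uint64_t new_val)
{
    struct mmu_update req;
    int done = 0;

    /* Low bits of ptr encode the command; this one means "*ptr = val". */
    req.ptr = pte_maddr | MMU_NORMAL_PT_UPDATE;
    req.val = new_val;

    /* One request; the count of successful updates is returned in 'done'. */
    return HYPERVISOR_mmu_update(&req, 1, &done, DOMID_SELF);
}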
87 #include <xen/config.h>
88 #include <xen/init.h>
89 #include <xen/kernel.h>
90 #include <xen/lib.h>
91 #include <xen/mm.h>
92 #include <xen/domain.h>
93 #include <xen/sched.h>
94 #include <xen/errno.h>
95 #include <xen/perfc.h>
96 #include <xen/irq.h>
97 #include <xen/softirq.h>
98 #include <xen/domain_page.h>
99 #include <xen/event.h>
100 #include <xen/iocap.h>
101 #include <xen/guest_access.h>
102 #include <asm/paging.h>
103 #include <asm/shadow.h>
104 #include <asm/page.h>
105 #include <asm/flushtlb.h>
106 #include <asm/io.h>
107 #include <asm/ldt.h>
108 #include <asm/x86_emulate.h>
109 #include <asm/e820.h>
110 #include <asm/hypercall.h>
111 #include <asm/shared.h>
112 #include <public/memory.h>
113 #include <xsm/xsm.h>
114 #include <xen/trace.h>
116 /*
117 * Mapping of first 2 or 4 megabytes of memory. This is mapped with 4kB
118 * mappings to avoid type conflicts with fixed-range MTRRs covering the
119 * lowest megabyte of physical memory. In any case the VGA hole should be
120 * mapped with type UC.
121 */
122 l1_pgentry_t __attribute__ ((__section__ (".bss.page_aligned")))
123 l1_identmap[L1_PAGETABLE_ENTRIES];
125 #define MEM_LOG(_f, _a...) gdprintk(XENLOG_WARNING , _f "\n" , ## _a)
127 /*
128 * PTE updates can be done with ordinary writes except:
129 * 1. Debug builds get extra checking by using CMPXCHG[8B].
130 * 2. PAE builds perform an atomic 8-byte store with CMPXCHG8B.
131 */
132 #if !defined(NDEBUG) || defined(__i386__)
133 #define PTE_UPDATE_WITH_CMPXCHG
134 #endif
136 /* Used to defer flushing of memory structures. */
137 struct percpu_mm_info {
138 #define DOP_FLUSH_TLB (1<<0) /* Flush the local TLB. */
139 #define DOP_FLUSH_ALL_TLBS (1<<1) /* Flush TLBs of all VCPUs of current dom. */
140 #define DOP_RELOAD_LDT (1<<2) /* Reload the LDT shadow mapping. */
141 unsigned int deferred_ops;
142 /* If non-NULL, specifies a foreign subject domain for some operations. */
143 struct domain *foreign;
144 };
145 static DEFINE_PER_CPU(struct percpu_mm_info, percpu_mm_info);
147 /*
148 * Returns the current foreign domain; defaults to the currently-executing
149 * domain if a foreign override hasn't been specified.
150 */
151 #define FOREIGNDOM (this_cpu(percpu_mm_info).foreign ?: current->domain)
153 /* Private domain structs for DOMID_XEN and DOMID_IO. */
154 struct domain *dom_xen, *dom_io;
156 /* Frame table and its size in pages. */
157 struct page_info *__read_mostly frame_table;
158 unsigned long max_page;
159 unsigned long total_pages;
161 #define PAGE_CACHE_ATTRS (_PAGE_PAT|_PAGE_PCD|_PAGE_PWT)
163 int opt_allow_hugepage;
164 boolean_param("allowhugepage", opt_allow_hugepage);
166 #define l1_disallow_mask(d) \
167 ((d != dom_io) && \
168 (rangeset_is_empty((d)->iomem_caps) && \
169 rangeset_is_empty((d)->arch.ioport_caps) && \
170 !has_arch_pdevs(d)) ? \
171 L1_DISALLOW_MASK : (L1_DISALLOW_MASK & ~PAGE_CACHE_ATTRS))
173 #ifdef CONFIG_COMPAT
174 l2_pgentry_t *compat_idle_pg_table_l2 = NULL;
175 #define l3_disallow_mask(d) (!is_pv_32on64_domain(d) ? \
176 L3_DISALLOW_MASK : \
177 COMPAT_L3_DISALLOW_MASK)
178 #else
179 #define l3_disallow_mask(d) L3_DISALLOW_MASK
180 #endif
182 void __init init_frametable(void)
183 {
184 unsigned long nr_pages, page_step, i, mfn;
186 frame_table = (struct page_info *)FRAMETABLE_VIRT_START;
188 nr_pages = PFN_UP(max_page * sizeof(*frame_table));
189 page_step = 1 << (cpu_has_page1gb ? L3_PAGETABLE_SHIFT - PAGE_SHIFT
190 : L2_PAGETABLE_SHIFT - PAGE_SHIFT);
192 for ( i = 0; i < nr_pages; i += page_step )
193 {
194 /*
195 * The hardcoded 4 below is arbitrary - just pick whatever you think
196 * is reasonable to waste as a trade-off for using a large page.
197 */
198 while (nr_pages + 4 - i < page_step)
199 page_step >>= PAGETABLE_ORDER;
200 mfn = alloc_boot_pages(page_step, page_step);
201 if ( mfn == 0 )
202 panic("Not enough memory for frame table\n");
203 map_pages_to_xen(
204 FRAMETABLE_VIRT_START + (i << PAGE_SHIFT),
205 mfn, page_step, PAGE_HYPERVISOR);
206 }
208 memset(frame_table, 0, nr_pages << PAGE_SHIFT);
209 }
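/*
 * Sizing example (illustrative, assuming a 32-byte struct page_info): with
 * 4GiB of RAM, max_page = 2^20, so the frame table needs 2^20 * 32 bytes =
 * 32MiB, i.e. 8192 4kB pages. The loop above covers that with sixteen 2MiB
 * mappings; the waste check shrinks page_step rather than spend most of a
 * 1GiB superpage on it when cpu_has_page1gb.
 */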
211 void __init arch_init_memory(void)
212 {
213 extern void subarch_init_memory(void);
215 unsigned long i, pfn, rstart_pfn, rend_pfn, iostart_pfn, ioend_pfn;
217 /*
218 * Initialise our DOMID_XEN domain.
219 * Any Xen-heap pages that we will allow to be mapped will have
220 * their domain field set to dom_xen.
221 */
222 dom_xen = domain_create(DOMID_XEN, DOMCRF_dummy, 0);
223 BUG_ON(dom_xen == NULL);
225 /*
226 * Initialise our DOMID_IO domain.
227 * This domain owns I/O pages that are within the range of the page_info
228 * array. Mappings occur at the privilege level of the caller.
229 */
230 dom_io = domain_create(DOMID_IO, DOMCRF_dummy, 0);
231 BUG_ON(dom_io == NULL);
233 /* First 1MB of RAM is historically marked as I/O. */
234 for ( i = 0; i < 0x100; i++ )
235 share_xen_page_with_guest(mfn_to_page(i), dom_io, XENSHARE_writable);
237 /* Any areas not specified as RAM by the e820 map are considered I/O. */
238 for ( i = 0, pfn = 0; pfn < max_page; i++ )
239 {
240 while ( (i < e820.nr_map) &&
241 (e820.map[i].type != E820_RAM) &&
242 (e820.map[i].type != E820_UNUSABLE) )
243 i++;
245 if ( i >= e820.nr_map )
246 {
247 /* No more RAM regions: mark as I/O right to end of memory map. */
248 rstart_pfn = rend_pfn = max_page;
249 }
250 else
251 {
252 /* Mark as I/O just up to the next RAM region. */
253 rstart_pfn = min_t(unsigned long, max_page,
254 PFN_UP(e820.map[i].addr));
255 rend_pfn = max_t(unsigned long, rstart_pfn,
256 PFN_DOWN(e820.map[i].addr + e820.map[i].size));
257 }
259 /*
260 * Make sure any Xen mappings of RAM holes above 1MB are blown away.
261 * In particular this ensures that RAM holes are respected even in
262 * the statically-initialised 1-16MB mapping area.
263 */
264 iostart_pfn = max_t(unsigned long, pfn, 1UL << (20 - PAGE_SHIFT));
265 ioend_pfn = rstart_pfn;
266 #if defined(CONFIG_X86_32)
267 ioend_pfn = min_t(unsigned long, ioend_pfn,
268 DIRECTMAP_MBYTES << (20 - PAGE_SHIFT));
269 #endif
270 if ( iostart_pfn < ioend_pfn )
271 destroy_xen_mappings((unsigned long)mfn_to_virt(iostart_pfn),
272 (unsigned long)mfn_to_virt(ioend_pfn));
274 /* Mark as I/O up to next RAM region. */
275 for ( ; pfn < rstart_pfn; pfn++ )
276 {
277 BUG_ON(!mfn_valid(pfn));
278 share_xen_page_with_guest(
279 mfn_to_page(pfn), dom_io, XENSHARE_writable);
280 }
282 /* Skip the RAM region. */
283 pfn = rend_pfn;
284 }
286 subarch_init_memory();
287 }
289 int page_is_ram_type(unsigned long mfn, unsigned long mem_type)
290 {
291 uint64_t maddr = pfn_to_paddr(mfn);
292 int i;
294 for ( i = 0; i < e820.nr_map; i++ )
295 {
296 switch ( e820.map[i].type )
297 {
298 case E820_RAM:
299 if ( mem_type & RAM_TYPE_CONVENTIONAL )
300 break;
301 continue;
302 case E820_RESERVED:
303 if ( mem_type & RAM_TYPE_RESERVED )
304 break;
305 continue;
306 case E820_UNUSABLE:
307 if ( mem_type & RAM_TYPE_UNUSABLE )
308 break;
309 continue;
310 case E820_ACPI:
311 case E820_NVS:
312 if ( mem_type & RAM_TYPE_ACPI )
313 break;
314 continue;
315 default:
316 /* unknown */
317 continue;
318 }
320 /* Test the range. */
321 if ( (e820.map[i].addr <= maddr) &&
322 ((e820.map[i].addr + e820.map[i].size) >= (maddr + PAGE_SIZE)) )
323 return 1;
324 }
326 return 0;
327 }
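/*
 * Usage example (illustrative): page_is_ram_type(mfn, RAM_TYPE_CONVENTIONAL)
 * returns 1 only if the whole 4kB frame lies within an E820_RAM region;
 * several RAM_TYPE_* bits can be OR-ed together to accept more than one
 * e820 type in a single query.
 */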
329 unsigned long domain_get_maximum_gpfn(struct domain *d)
330 {
331 if ( is_hvm_domain(d) )
332 return d->arch.p2m->max_mapped_pfn;
333 /* NB. PV guests specify nr_pfns rather than max_pfn so we adjust here. */
334 return arch_get_max_pfn(d) - 1;
335 }
337 void share_xen_page_with_guest(
338 struct page_info *page, struct domain *d, int readonly)
339 {
340 if ( page_get_owner(page) == d )
341 return;
343 set_gpfn_from_mfn(page_to_mfn(page), INVALID_M2P_ENTRY);
345 spin_lock(&d->page_alloc_lock);
347 /* The incremented type count pins as writable or read-only. */
348 page->u.inuse.type_info = (readonly ? PGT_none : PGT_writable_page);
349 page->u.inuse.type_info |= PGT_validated | 1;
351 page_set_owner(page, d);
352 wmb(); /* install valid domain ptr before updating refcnt. */
353 ASSERT((page->count_info & ~PGC_xen_heap) == 0);
355 /* Only add to the allocation list if the domain isn't dying. */
356 if ( !d->is_dying )
357 {
358 page->count_info |= PGC_allocated | 1;
359 if ( unlikely(d->xenheap_pages++ == 0) )
360 get_knownalive_domain(d);
361 page_list_add_tail(page, &d->xenpage_list);
362 }
364 spin_unlock(&d->page_alloc_lock);
365 }
367 void share_xen_page_with_privileged_guests(
368 struct page_info *page, int readonly)
369 {
370 share_xen_page_with_guest(page, dom_xen, readonly);
371 }
373 #if defined(__i386__)
375 #ifdef NDEBUG
376 /* Only PDPTs above the 4GB boundary need to be shadowed in low memory. */
377 #define l3tab_needs_shadow(mfn) ((mfn) >= 0x100000)
378 #else
379 /*
380 * In debug builds we shadow a selection of <4GB PDPTs to exercise code paths.
381 * We cannot safely shadow the idle page table, nor shadow page tables
382 * (detected by zero reference count). As required for correctness, we
383 * always shadow PDPTs above 4GB.
384 */
385 #define l3tab_needs_shadow(mfn) \
386 (((((mfn) << PAGE_SHIFT) != __pa(idle_pg_table)) && \
387 (mfn_to_page(mfn)->count_info & PGC_count_mask) && \
388 ((mfn) & 1)) || /* odd MFNs are shadowed */ \
389 ((mfn) >= 0x100000))
390 #endif
392 static l1_pgentry_t *fix_pae_highmem_pl1e;
394 /* Cache the address of PAE high-memory fixmap page tables. */
395 static int __init cache_pae_fixmap_address(void)
396 {
397 unsigned long fixmap_base = fix_to_virt(FIX_PAE_HIGHMEM_0);
398 l2_pgentry_t *pl2e = virt_to_xen_l2e(fixmap_base);
399 fix_pae_highmem_pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(fixmap_base);
400 return 0;
401 }
402 __initcall(cache_pae_fixmap_address);
404 static DEFINE_PER_CPU(u32, make_cr3_timestamp);
406 void make_cr3(struct vcpu *v, unsigned long mfn)
407 /* Takes the MFN of a PAE l3 table, copies the contents to below 4GB if
408 * necessary, and sets v->arch.cr3 to the value to load in CR3. */
409 {
410 l3_pgentry_t *highmem_l3tab, *lowmem_l3tab;
411 struct pae_l3_cache *cache = &v->arch.pae_l3_cache;
412 unsigned int cpu = smp_processor_id();
414 /* Fast path: does this mfn need a shadow at all? */
415 if ( !l3tab_needs_shadow(mfn) )
416 {
417 v->arch.cr3 = mfn << PAGE_SHIFT;
418 /* Cache is no longer in use or valid */
419 cache->high_mfn = 0;
420 return;
421 }
423 /* Caching logic is not interrupt safe. */
424 ASSERT(!in_irq());
426 /* Protects against pae_flush_pgd(). */
427 spin_lock(&cache->lock);
429 cache->inuse_idx ^= 1;
430 cache->high_mfn = mfn;
432 /* Map the guest L3 table and copy to the chosen low-memory cache. */
433 l1e_write(fix_pae_highmem_pl1e-cpu, l1e_from_pfn(mfn, __PAGE_HYPERVISOR));
434 /* First check the previous high mapping can't be in the TLB.
435 * (i.e. have we loaded CR3 since we last did this?) */
436 if ( unlikely(this_cpu(make_cr3_timestamp) == this_cpu(tlbflush_time)) )
437 flush_tlb_one_local(fix_to_virt(FIX_PAE_HIGHMEM_0 + cpu));
438 highmem_l3tab = (l3_pgentry_t *)fix_to_virt(FIX_PAE_HIGHMEM_0 + cpu);
439 lowmem_l3tab = cache->table[cache->inuse_idx];
440 memcpy(lowmem_l3tab, highmem_l3tab, sizeof(cache->table[0]));
441 l1e_write(fix_pae_highmem_pl1e-cpu, l1e_empty());
442 this_cpu(make_cr3_timestamp) = this_cpu(tlbflush_time);
444 v->arch.cr3 = __pa(lowmem_l3tab);
446 spin_unlock(&cache->lock);
447 }
449 #else /* !defined(__i386__) */
451 void make_cr3(struct vcpu *v, unsigned long mfn)
452 {
453 v->arch.cr3 = mfn << PAGE_SHIFT;
454 }
456 #endif /* !defined(__i386__) */
458 void write_ptbase(struct vcpu *v)
459 {
460 write_cr3(v->arch.cr3);
461 }
463 /*
464 * Should be called after CR3 is updated.
465 *
466 * Uses values found in vcpu->arch.(guest_table and guest_table_user), and
467 * for HVM guests, arch.monitor_table and hvm's guest CR3.
468 *
469 * Update ref counts to shadow tables appropriately.
470 */
471 void update_cr3(struct vcpu *v)
472 {
473 unsigned long cr3_mfn=0;
475 if ( paging_mode_enabled(v->domain) )
476 {
477 paging_update_cr3(v);
478 return;
479 }
481 #if CONFIG_PAGING_LEVELS == 4
482 if ( !(v->arch.flags & TF_kernel_mode) )
483 cr3_mfn = pagetable_get_pfn(v->arch.guest_table_user);
484 else
485 #endif
486 cr3_mfn = pagetable_get_pfn(v->arch.guest_table);
488 make_cr3(v, cr3_mfn);
489 }
492 static void invalidate_shadow_ldt(struct vcpu *v, int flush)
493 {
494 int i;
495 unsigned long pfn;
496 struct page_info *page;
498 BUG_ON(unlikely(in_irq()));
500 spin_lock(&v->arch.shadow_ldt_lock);
502 if ( v->arch.shadow_ldt_mapcnt == 0 )
503 goto out;
505 v->arch.shadow_ldt_mapcnt = 0;
507 for ( i = 16; i < 32; i++ )
508 {
509 pfn = l1e_get_pfn(v->arch.perdomain_ptes[i]);
510 if ( pfn == 0 ) continue;
511 l1e_write(&v->arch.perdomain_ptes[i], l1e_empty());
512 page = mfn_to_page(pfn);
513 ASSERT_PAGE_IS_TYPE(page, PGT_seg_desc_page);
514 ASSERT_PAGE_IS_DOMAIN(page, v->domain);
515 put_page_and_type(page);
516 }
518 /* Rid TLBs of stale mappings (guest mappings and shadow mappings). */
519 if ( flush )
520 flush_tlb_mask(v->vcpu_dirty_cpumask);
522 out:
523 spin_unlock(&v->arch.shadow_ldt_lock);
524 }
527 static int alloc_segdesc_page(struct page_info *page)
528 {
529 struct desc_struct *descs;
530 int i;
532 descs = map_domain_page(page_to_mfn(page));
534 for ( i = 0; i < 512; i++ )
535 if ( unlikely(!check_descriptor(page_get_owner(page), &descs[i])) )
536 goto fail;
538 unmap_domain_page(descs);
539 return 0;
541 fail:
542 unmap_domain_page(descs);
543 return -EINVAL;
544 }
547 /* Map shadow page at offset @off. */
548 int map_ldt_shadow_page(unsigned int off)
549 {
550 struct vcpu *v = current;
551 struct domain *d = v->domain;
552 unsigned long gmfn, mfn;
553 l1_pgentry_t l1e, nl1e;
554 unsigned long gva = v->arch.guest_context.ldt_base + (off << PAGE_SHIFT);
555 int okay;
557 BUG_ON(unlikely(in_irq()));
559 guest_get_eff_kern_l1e(v, gva, &l1e);
560 if ( unlikely(!(l1e_get_flags(l1e) & _PAGE_PRESENT)) )
561 return 0;
563 gmfn = l1e_get_pfn(l1e);
564 mfn = gmfn_to_mfn(d, gmfn);
565 if ( unlikely(!mfn_valid(mfn)) )
566 return 0;
568 okay = get_page_and_type(mfn_to_page(mfn), d, PGT_seg_desc_page);
569 if ( unlikely(!okay) )
570 return 0;
572 nl1e = l1e_from_pfn(mfn, l1e_get_flags(l1e) | _PAGE_RW);
574 spin_lock(&v->arch.shadow_ldt_lock);
575 l1e_write(&v->arch.perdomain_ptes[off + 16], nl1e);
576 v->arch.shadow_ldt_mapcnt++;
577 spin_unlock(&v->arch.shadow_ldt_lock);
579 return 1;
580 }
583 static int get_page_from_pagenr(unsigned long page_nr, struct domain *d)
584 {
585 struct page_info *page = mfn_to_page(page_nr);
587 if ( unlikely(!mfn_valid(page_nr)) || unlikely(!get_page(page, d)) )
588 {
589 MEM_LOG("Could not get page ref for pfn %lx", page_nr);
590 return 0;
591 }
593 return 1;
594 }
597 static int get_page_and_type_from_pagenr(unsigned long page_nr,
598 unsigned long type,
599 struct domain *d,
600 int partial,
601 int preemptible)
602 {
603 struct page_info *page = mfn_to_page(page_nr);
604 int rc;
606 if ( likely(partial >= 0) &&
607 unlikely(!get_page_from_pagenr(page_nr, d)) )
608 return -EINVAL;
610 rc = (preemptible ?
611 get_page_type_preemptible(page, type) :
612 (get_page_type(page, type) ? 0 : -EINVAL));
614 if ( unlikely(rc) && partial >= 0 )
615 put_page(page);
617 return rc;
618 }
620 static int get_data_page(
621 struct page_info *page, struct domain *d, int writeable)
622 {
623 int rc;
625 if ( writeable )
626 rc = get_page_and_type(page, d, PGT_writable_page);
627 else
628 rc = get_page(page, d);
630 return rc;
631 }
633 static void put_data_page(
634 struct page_info *page, int writeable)
635 {
636 if ( writeable )
637 put_page_and_type(page);
638 else
639 put_page(page);
640 }
642 /*
643 * We allow root tables to map each other (a.k.a. linear page tables). It
644 * needs some special care with reference counts and access permissions:
645 * 1. The mapping entry must be read-only, or the guest may get write access
646 * to its own PTEs.
647 * 2. We must only bump the reference counts for an *already validated*
648 * L2 table, or we can end up in a deadlock in get_page_type() by waiting
649 * on a validation that is required to complete that validation.
650 * 3. We only need to increment the reference counts for the mapped page
651 * frame if it is mapped by a different root table. This is sufficient and
652 * also necessary to allow validation of a root table mapping itself.
653 */
654 #define define_get_linear_pagetable(level) \
655 static int \
656 get_##level##_linear_pagetable( \
657 level##_pgentry_t pde, unsigned long pde_pfn, struct domain *d) \
658 { \
659 unsigned long x, y; \
660 struct page_info *page; \
661 unsigned long pfn; \
662 \
663 if ( (level##e_get_flags(pde) & _PAGE_RW) ) \
664 { \
665 MEM_LOG("Attempt to create linear p.t. with write perms"); \
666 return 0; \
667 } \
668 \
669 if ( (pfn = level##e_get_pfn(pde)) != pde_pfn ) \
670 { \
671 /* Make sure the mapped frame belongs to the correct domain. */ \
672 if ( unlikely(!get_page_from_pagenr(pfn, d)) ) \
673 return 0; \
674 \
675 /* \
676 * Ensure that the mapped frame is an already-validated page table. \
677 * If so, atomically increment the count (checking for overflow). \
678 */ \
679 page = mfn_to_page(pfn); \
680 y = page->u.inuse.type_info; \
681 do { \
682 x = y; \
683 if ( unlikely((x & PGT_count_mask) == PGT_count_mask) || \
684 unlikely((x & (PGT_type_mask|PGT_validated)) != \
685 (PGT_##level##_page_table|PGT_validated)) ) \
686 { \
687 put_page(page); \
688 return 0; \
689 } \
690 } \
691 while ( (y = cmpxchg(&page->u.inuse.type_info, x, x + 1)) != x ); \
692 } \
693 \
694 return 1; \
695 }
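/*
 * Concrete example of the rules above (illustrative): a 64-bit PV guest can
 * install a read-only L4 entry pointing back at the L4 page itself, making
 * every page-table page readable through that self-mapping (the "linear
 * page table" case). When the entry references its own table (pfn ==
 * pde_pfn) no extra reference is taken; pointing at a *different*,
 * already-validated root table bumps its type count via the cmpxchg loop
 * above.
 */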
698 int is_iomem_page(unsigned long mfn)
699 {
700 struct page_info *page;
702 if ( !mfn_valid(mfn) )
703 return 1;
705 /* Caller must know that it is an iomem page, or a reference is held. */
706 page = mfn_to_page(mfn);
707 ASSERT((page->count_info & PGC_count_mask) != 0);
709 return (page_get_owner(page) == dom_io);
710 }
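/*
 * Companion note for the changeset above: pages that fall inside the
 * physical range occupied by the Xen image are reachable both through the
 * 1:1 directmap and through the XEN_VIRT_START alias. Rather than keeping
 * a second mapping with a non-WB attribute, update_xen_mappings() below
 * unmaps the alias while any non-WB cacheability is in force and re-maps
 * it (PAGE_HYPERVISOR) once the page's attribute returns to WB.
 */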
712 static void update_xen_mappings(unsigned long mfn, unsigned long cacheattr)
713 {
714 #ifdef __x86_64__
715 bool_t alias = mfn >= PFN_DOWN(xen_phys_start) &&
716 mfn < PFN_UP(xen_phys_start + (unsigned long)_end - XEN_VIRT_START);
717 unsigned long xen_va =
718 XEN_VIRT_START + ((mfn - PFN_DOWN(xen_phys_start)) << PAGE_SHIFT);
720 if ( unlikely(alias) && cacheattr )
721 map_pages_to_xen(xen_va, mfn, 1, 0);
722 map_pages_to_xen((unsigned long)mfn_to_virt(mfn), mfn, 1,
723 PAGE_HYPERVISOR | cacheattr_to_pte_flags(cacheattr));
724 if ( unlikely(alias) && !cacheattr )
725 map_pages_to_xen(xen_va, mfn, 1, PAGE_HYPERVISOR);
726 #endif
727 }
730 int
731 get_page_from_l1e(
732 l1_pgentry_t l1e, struct domain *d)
733 {
734 unsigned long mfn = l1e_get_pfn(l1e);
735 struct page_info *page = mfn_to_page(mfn);
736 uint32_t l1f = l1e_get_flags(l1e);
737 struct vcpu *curr = current;
738 struct domain *owner;
740 if ( !(l1f & _PAGE_PRESENT) )
741 return 1;
743 if ( unlikely(l1f & l1_disallow_mask(d)) )
744 {
745 MEM_LOG("Bad L1 flags %x", l1f & l1_disallow_mask(d));
746 return 0;
747 }
749 if ( !mfn_valid(mfn) ||
750 (owner = page_get_owner_and_reference(page)) == dom_io )
751 {
752 /* Only needed the reference to confirm dom_io ownership. */
753 if ( mfn_valid(mfn) )
754 put_page(page);
756 /* DOMID_IO reverts to caller for privilege checks. */
757 if ( d == dom_io )
758 d = curr->domain;
760 if ( !iomem_access_permitted(d, mfn, mfn) )
761 {
762 if ( mfn != (PADDR_MASK >> PAGE_SHIFT) ) /* INVALID_MFN? */
763 MEM_LOG("Non-privileged (%u) attempt to map I/O space %08lx",
764 d->domain_id, mfn);
765 return 0;
766 }
768 return 1;
769 }
771 if ( owner == NULL )
772 goto could_not_pin;
774 /*
775 * Let privileged domains transfer the right to map their target
776 * domain's pages. This is used to allow stub-domain pvfb export to dom0,
777 * until pvfb supports granted mappings. At that time this minor hack
778 * can go away.
779 */
780 if ( unlikely(d != owner) && (d != curr->domain) && IS_PRIV_FOR(d, owner) )
781 d = owner;
783 /* Foreign mappings into guests in shadow external mode don't
784 * contribute to writeable mapping refcounts. (This allows the
785 * qemu-dm helper process in dom0 to map the domain's memory without
786 * messing up the count of "real" writable mappings.) */
787 if ( (l1f & _PAGE_RW) &&
788 !(paging_mode_external(d) && (d != curr->domain)) &&
789 !get_page_type(page, PGT_writable_page) )
790 goto could_not_pin;
792 if ( pte_flags_to_cacheattr(l1f) !=
793 ((page->count_info & PGC_cacheattr_mask) >> PGC_cacheattr_base) )
794 {
795 unsigned long x, nx, y = page->count_info;
796 unsigned long cacheattr = pte_flags_to_cacheattr(l1f);
798 if ( is_xen_heap_page(page) )
799 {
800 if ( (l1f & _PAGE_RW) &&
801 !(unlikely(paging_mode_external(d) &&
802 (d != curr->domain))) )
803 put_page_type(page);
804 put_page(page);
805 MEM_LOG("Attempt to change cache attributes of Xen heap page");
806 return 0;
807 }
809 while ( ((y & PGC_cacheattr_mask) >> PGC_cacheattr_base) != cacheattr )
810 {
811 x = y;
812 nx = (x & ~PGC_cacheattr_mask) | (cacheattr << PGC_cacheattr_base);
813 y = cmpxchg(&page->count_info, x, nx);
814 }
816 update_xen_mappings(mfn, cacheattr);
817 }
819 return 1;
821 could_not_pin:
822 MEM_LOG("Error getting mfn %lx (pfn %lx) from L1 entry %" PRIpte
823 " for dom%d",
824 mfn, get_gpfn_from_mfn(mfn),
825 l1e_get_intpte(l1e), d->domain_id);
826 if ( owner != NULL )
827 put_page(page);
828 return 0;
829 }
832 /* NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. */
833 define_get_linear_pagetable(l2);
834 static int
835 get_page_from_l2e(
836 l2_pgentry_t l2e, unsigned long pfn, struct domain *d)
837 {
838 unsigned long mfn = l2e_get_pfn(l2e);
839 int rc;
841 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
842 return 1;
844 if ( unlikely((l2e_get_flags(l2e) & L2_DISALLOW_MASK)) )
845 {
846 MEM_LOG("Bad L2 flags %x", l2e_get_flags(l2e) & L2_DISALLOW_MASK);
847 return -EINVAL;
848 }
850 if ( !(l2e_get_flags(l2e) & _PAGE_PSE) )
851 {
852 rc = get_page_and_type_from_pagenr(mfn, PGT_l1_page_table, d, 0, 0);
853 if ( unlikely(rc == -EINVAL) && get_l2_linear_pagetable(l2e, pfn, d) )
854 rc = 0;
855 }
856 else if ( !opt_allow_hugepage || (mfn & (L1_PAGETABLE_ENTRIES-1)) )
857 {
858 rc = -EINVAL;
859 }
860 else
861 {
862 unsigned long m = mfn;
863 int writeable = !!(l2e_get_flags(l2e) & _PAGE_RW);
865 do {
866 rc = get_data_page(mfn_to_page(m), d, writeable);
867 if ( unlikely(!rc) )
868 {
869 while ( m-- > mfn )
870 put_data_page(mfn_to_page(m), writeable);
871 return -EINVAL;
872 }
873 } while ( m++ < (mfn + (L1_PAGETABLE_ENTRIES-1)) );
874 }
876 return rc;
877 }
880 define_get_linear_pagetable(l3);
881 static int
882 get_page_from_l3e(
883 l3_pgentry_t l3e, unsigned long pfn, struct domain *d, int partial, int preemptible)
884 {
885 int rc;
887 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
888 return 1;
890 if ( unlikely((l3e_get_flags(l3e) & l3_disallow_mask(d))) )
891 {
892 MEM_LOG("Bad L3 flags %x", l3e_get_flags(l3e) & l3_disallow_mask(d));
893 return -EINVAL;
894 }
896 rc = get_page_and_type_from_pagenr(
897 l3e_get_pfn(l3e), PGT_l2_page_table, d, partial, preemptible);
898 if ( unlikely(rc == -EINVAL) && get_l3_linear_pagetable(l3e, pfn, d) )
899 rc = 0;
901 return rc;
902 }
904 #if CONFIG_PAGING_LEVELS >= 4
905 define_get_linear_pagetable(l4);
906 static int
907 get_page_from_l4e(
908 l4_pgentry_t l4e, unsigned long pfn, struct domain *d, int partial, int preemptible)
909 {
910 int rc;
912 if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
913 return 1;
915 if ( unlikely((l4e_get_flags(l4e) & L4_DISALLOW_MASK)) )
916 {
917 MEM_LOG("Bad L4 flags %x", l4e_get_flags(l4e) & L4_DISALLOW_MASK);
918 return -EINVAL;
919 }
921 rc = get_page_and_type_from_pagenr(
922 l4e_get_pfn(l4e), PGT_l3_page_table, d, partial, preemptible);
923 if ( unlikely(rc == -EINVAL) && get_l4_linear_pagetable(l4e, pfn, d) )
924 rc = 0;
926 return rc;
927 }
928 #endif /* 4 level */
930 #ifdef __x86_64__
932 #ifdef USER_MAPPINGS_ARE_GLOBAL
933 #define adjust_guest_l1e(pl1e, d) \
934 do { \
935 if ( likely(l1e_get_flags((pl1e)) & _PAGE_PRESENT) && \
936 likely(!is_pv_32on64_domain(d)) ) \
937 { \
938 /* _PAGE_GUEST_KERNEL page cannot have the Global bit set. */ \
939 if ( (l1e_get_flags((pl1e)) & (_PAGE_GUEST_KERNEL|_PAGE_GLOBAL)) \
940 == (_PAGE_GUEST_KERNEL|_PAGE_GLOBAL) ) \
941 MEM_LOG("Global bit is set to kernel page %lx", \
942 l1e_get_pfn((pl1e))); \
943 if ( !(l1e_get_flags((pl1e)) & _PAGE_USER) ) \
944 l1e_add_flags((pl1e), (_PAGE_GUEST_KERNEL|_PAGE_USER)); \
945 if ( !(l1e_get_flags((pl1e)) & _PAGE_GUEST_KERNEL) ) \
946 l1e_add_flags((pl1e), (_PAGE_GLOBAL|_PAGE_USER)); \
947 } \
948 } while ( 0 )
949 #else
950 #define adjust_guest_l1e(pl1e, d) \
951 do { \
952 if ( likely(l1e_get_flags((pl1e)) & _PAGE_PRESENT) && \
953 likely(!is_pv_32on64_domain(d)) ) \
954 l1e_add_flags((pl1e), _PAGE_USER); \
955 } while ( 0 )
956 #endif
958 #define adjust_guest_l2e(pl2e, d) \
959 do { \
960 if ( likely(l2e_get_flags((pl2e)) & _PAGE_PRESENT) && \
961 likely(!is_pv_32on64_domain(d)) ) \
962 l2e_add_flags((pl2e), _PAGE_USER); \
963 } while ( 0 )
965 #define adjust_guest_l3e(pl3e, d) \
966 do { \
967 if ( likely(l3e_get_flags((pl3e)) & _PAGE_PRESENT) ) \
968 l3e_add_flags((pl3e), likely(!is_pv_32on64_domain(d)) ? \
969 _PAGE_USER : \
970 _PAGE_USER|_PAGE_RW); \
971 } while ( 0 )
973 #define adjust_guest_l4e(pl4e, d) \
974 do { \
975 if ( likely(l4e_get_flags((pl4e)) & _PAGE_PRESENT) && \
976 likely(!is_pv_32on64_domain(d)) ) \
977 l4e_add_flags((pl4e), _PAGE_USER); \
978 } while ( 0 )
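/*
 * Background sketch (illustrative): 64-bit PV guest kernels execute in ring
 * 3, so both their kernel and user mappings must carry _PAGE_USER; the
 * adjust_guest_l?e() macros above add it transparently. Under
 * USER_MAPPINGS_ARE_GLOBAL, guest-user entries (those without
 * _PAGE_GUEST_KERNEL) additionally get _PAGE_GLOBAL, while guest-kernel
 * entries must not, as noted in the comment inside adjust_guest_l1e().
 */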
980 #else /* !defined(__x86_64__) */
982 #define adjust_guest_l1e(_p, _d) ((void)(_d))
983 #define adjust_guest_l2e(_p, _d) ((void)(_d))
984 #define adjust_guest_l3e(_p, _d) ((void)(_d))
986 #endif
988 #ifdef CONFIG_COMPAT
989 #define unadjust_guest_l3e(pl3e, d) \
990 do { \
991 if ( unlikely(is_pv_32on64_domain(d)) && \
992 likely(l3e_get_flags((pl3e)) & _PAGE_PRESENT) ) \
993 l3e_remove_flags((pl3e), _PAGE_USER|_PAGE_RW|_PAGE_ACCESSED); \
994 } while ( 0 )
995 #else
996 #define unadjust_guest_l3e(_p, _d) ((void)(_d))
997 #endif
999 void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d)
1001 unsigned long pfn = l1e_get_pfn(l1e);
1002 struct page_info *page;
1003 struct domain *e;
1004 struct vcpu *v;
1006 if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) || is_iomem_page(pfn) )
1007 return;
1009 page = mfn_to_page(pfn);
1011 e = page_get_owner(page);
1013 /*
1014 * Check if this is a mapping that was established via a grant reference.
1015 * If it was then we should not be here: we require that such mappings are
1016 * explicitly destroyed via the grant-table interface.
1018 * The upshot of this is that the guest can end up with active grants that
1019 * it cannot destroy (because it no longer has a PTE to present to the
1020 * grant-table interface). This can lead to subtle hard-to-catch bugs,
1021 * hence a special grant PTE flag can be enabled to catch the bug early.
1023 * (Note that the undestroyable active grants are not a security hole in
1024 * Xen. All active grants can safely be cleaned up when the domain dies.)
1025 */
1026 if ( (l1e_get_flags(l1e) & _PAGE_GNTTAB) &&
1027 !d->is_shutting_down && !d->is_dying )
1029 MEM_LOG("Attempt to implicitly unmap a granted PTE %" PRIpte,
1030 l1e_get_intpte(l1e));
1031 domain_crash(d);
1034 /* Remember we didn't take a type-count of foreign writable mappings
1035 * to paging-external domains */
1036 if ( (l1e_get_flags(l1e) & _PAGE_RW) &&
1037 !(unlikely((e != d) && paging_mode_external(e))) )
1039 put_page_and_type(page);
1041 else
1043 /* We expect this is rare so we blow the entire shadow LDT. */
1044 if ( unlikely(((page->u.inuse.type_info & PGT_type_mask) ==
1045 PGT_seg_desc_page)) &&
1046 unlikely(((page->u.inuse.type_info & PGT_count_mask) != 0)) &&
1047 (d == e) )
1049 for_each_vcpu ( d, v )
1050 invalidate_shadow_ldt(v, 1);
1052 put_page(page);
1057 /*
1058 * NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'.
1059 * Note also that this automatically deals correctly with linear p.t.'s.
1060 */
1061 static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
1063 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) || (l2e_get_pfn(l2e) == pfn) )
1064 return 1;
1066 if ( l2e_get_flags(l2e) & _PAGE_PSE )
1068 unsigned long mfn = l2e_get_pfn(l2e), m = mfn;
1069 int writeable = l2e_get_flags(l2e) & _PAGE_RW;
1071 ASSERT(!(mfn & (L1_PAGETABLE_ENTRIES-1)));
1072 do {
1073 put_data_page(mfn_to_page(m), writeable);
1074 } while ( m++ < (mfn + (L1_PAGETABLE_ENTRIES-1)) );
1076 else
1078 put_page_and_type(l2e_get_page(l2e));
1081 return 0;
1084 static int __put_page_type(struct page_info *, int preemptible);
1086 static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn,
1087 int partial, int preemptible)
1089 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || (l3e_get_pfn(l3e) == pfn) )
1090 return 1;
1092 #ifdef __x86_64__
1093 if ( unlikely(l3e_get_flags(l3e) & _PAGE_PSE) )
1095 unsigned long mfn = l3e_get_pfn(l3e);
1096 int writeable = l3e_get_flags(l3e) & _PAGE_RW;
1098 ASSERT(!(mfn & ((1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1)));
1099 do {
1100 put_data_page(mfn_to_page(mfn), writeable);
1101 } while ( ++mfn & ((1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1) );
1103 return 0;
1105 #endif
1107 if ( unlikely(partial > 0) )
1108 return __put_page_type(l3e_get_page(l3e), preemptible);
1110 return put_page_and_type_preemptible(l3e_get_page(l3e), preemptible);
1113 #if CONFIG_PAGING_LEVELS >= 4
1114 static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn,
1115 int partial, int preemptible)
1117 if ( (l4e_get_flags(l4e) & _PAGE_PRESENT) &&
1118 (l4e_get_pfn(l4e) != pfn) )
1120 if ( unlikely(partial > 0) )
1121 return __put_page_type(l4e_get_page(l4e), preemptible);
1122 return put_page_and_type_preemptible(l4e_get_page(l4e), preemptible);
1124 return 1;
1126 #endif
1128 static int alloc_l1_table(struct page_info *page)
1130 struct domain *d = page_get_owner(page);
1131 unsigned long pfn = page_to_mfn(page);
1132 l1_pgentry_t *pl1e;
1133 unsigned int i;
1135 pl1e = map_domain_page(pfn);
1137 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
1139 if ( is_guest_l1_slot(i) &&
1140 unlikely(!get_page_from_l1e(pl1e[i], d)) )
1141 goto fail;
1143 adjust_guest_l1e(pl1e[i], d);
1146 unmap_domain_page(pl1e);
1147 return 0;
1149 fail:
1150 MEM_LOG("Failure in alloc_l1_table: entry %d", i);
1151 while ( i-- > 0 )
1152 if ( is_guest_l1_slot(i) )
1153 put_page_from_l1e(pl1e[i], d);
1155 unmap_domain_page(pl1e);
1156 return -EINVAL;
1159 static int create_pae_xen_mappings(struct domain *d, l3_pgentry_t *pl3e)
1161 struct page_info *page;
1162 l2_pgentry_t *pl2e;
1163 l3_pgentry_t l3e3;
1164 #ifndef CONFIG_COMPAT
1165 l2_pgentry_t l2e;
1166 int i;
1167 #endif
1169 if ( !is_pv_32bit_domain(d) )
1170 return 1;
1172 pl3e = (l3_pgentry_t *)((unsigned long)pl3e & PAGE_MASK);
1174 /* 3rd L3 slot contains L2 with Xen-private mappings. It *must* exist. */
1175 l3e3 = pl3e[3];
1176 if ( !(l3e_get_flags(l3e3) & _PAGE_PRESENT) )
1178 MEM_LOG("PAE L3 3rd slot is empty");
1179 return 0;
1182 /*
1183 * The Xen-private mappings include linear mappings. The L2 thus cannot
1184 * be shared by multiple L3 tables. The test here is adequate because:
1185 * 1. Cannot appear in slots != 3 because get_page_type() checks the
1186 * PGT_pae_xen_l2 flag, which is asserted iff the L2 appears in slot 3
1187 * 2. Cannot appear in another page table's L3:
1188 * a. alloc_l3_table() calls this function and this check will fail
1189 * b. mod_l3_entry() disallows updates to slot 3 in an existing table
1190 */
1191 page = l3e_get_page(l3e3);
1192 BUG_ON(page->u.inuse.type_info & PGT_pinned);
1193 BUG_ON((page->u.inuse.type_info & PGT_count_mask) == 0);
1194 BUG_ON(!(page->u.inuse.type_info & PGT_pae_xen_l2));
1195 if ( (page->u.inuse.type_info & PGT_count_mask) != 1 )
1197 MEM_LOG("PAE L3 3rd slot is shared");
1198 return 0;
1201 /* Xen private mappings. */
1202 pl2e = map_domain_page(l3e_get_pfn(l3e3));
1203 #ifndef CONFIG_COMPAT
1204 memcpy(&pl2e[L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1)],
1205 &idle_pg_table_l2[L2_PAGETABLE_FIRST_XEN_SLOT],
1206 L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
1207 for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
1209 l2e = l2e_from_page(
1210 virt_to_page(d->arch.mm_perdomain_pt) + i,
1211 __PAGE_HYPERVISOR);
1212 l2e_write(&pl2e[l2_table_offset(PERDOMAIN_VIRT_START) + i], l2e);
1214 for ( i = 0; i < (LINEARPT_MBYTES >> (L2_PAGETABLE_SHIFT - 20)); i++ )
1216 l2e = l2e_empty();
1217 if ( l3e_get_flags(pl3e[i]) & _PAGE_PRESENT )
1218 l2e = l2e_from_pfn(l3e_get_pfn(pl3e[i]), __PAGE_HYPERVISOR);
1219 l2e_write(&pl2e[l2_table_offset(LINEAR_PT_VIRT_START) + i], l2e);
1221 #else
1222 memcpy(&pl2e[COMPAT_L2_PAGETABLE_FIRST_XEN_SLOT(d)],
1223 &compat_idle_pg_table_l2[
1224 l2_table_offset(HIRO_COMPAT_MPT_VIRT_START)],
1225 COMPAT_L2_PAGETABLE_XEN_SLOTS(d) * sizeof(*pl2e));
1226 #endif
1227 unmap_domain_page(pl2e);
1229 return 1;
1232 #ifdef __i386__
1233 /* Flush a pgdir update into low-memory caches. */
1234 static void pae_flush_pgd(
1235 unsigned long mfn, unsigned int idx, l3_pgentry_t nl3e)
1237 struct domain *d = page_get_owner(mfn_to_page(mfn));
1238 struct vcpu *v;
1239 intpte_t _ol3e, _nl3e, _pl3e;
1240 l3_pgentry_t *l3tab_ptr;
1241 struct pae_l3_cache *cache;
1243 if ( unlikely(shadow_mode_enabled(d)) )
1245 cpumask_t m = CPU_MASK_NONE;
1246 /* Re-shadow this l3 table on any vcpus that are using it */
1247 for_each_vcpu ( d, v )
1248 if ( pagetable_get_pfn(v->arch.guest_table) == mfn )
1250 paging_update_cr3(v);
1251 cpus_or(m, m, v->vcpu_dirty_cpumask);
1253 flush_tlb_mask(m);
1256 /* If below 4GB then the pgdir is not shadowed in low memory. */
1257 if ( !l3tab_needs_shadow(mfn) )
1258 return;
1260 for_each_vcpu ( d, v )
1262 cache = &v->arch.pae_l3_cache;
1264 spin_lock(&cache->lock);
1266 if ( cache->high_mfn == mfn )
1268 l3tab_ptr = &cache->table[cache->inuse_idx][idx];
1269 _ol3e = l3e_get_intpte(*l3tab_ptr);
1270 _nl3e = l3e_get_intpte(nl3e);
1271 _pl3e = cmpxchg(&l3e_get_intpte(*l3tab_ptr), _ol3e, _nl3e);
1272 BUG_ON(_pl3e != _ol3e);
1275 spin_unlock(&cache->lock);
1278 flush_tlb_mask(d->domain_dirty_cpumask);
1280 #else
1281 # define pae_flush_pgd(mfn, idx, nl3e) ((void)0)
1282 #endif
1284 static int alloc_l2_table(struct page_info *page, unsigned long type,
1285 int preemptible)
1287 struct domain *d = page_get_owner(page);
1288 unsigned long pfn = page_to_mfn(page);
1289 l2_pgentry_t *pl2e;
1290 unsigned int i;
1291 int rc = 0;
1293 pl2e = map_domain_page(pfn);
1295 for ( i = page->nr_validated_ptes; i < L2_PAGETABLE_ENTRIES; i++ )
1297 if ( preemptible && i && hypercall_preempt_check() )
1299 page->nr_validated_ptes = i;
1300 rc = -EAGAIN;
1301 break;
1304 if ( !is_guest_l2_slot(d, type, i) ||
1305 (rc = get_page_from_l2e(pl2e[i], pfn, d)) > 0 )
1306 continue;
1308 if ( rc < 0 )
1310 MEM_LOG("Failure in alloc_l2_table: entry %d", i);
1311 while ( i-- > 0 )
1312 if ( is_guest_l2_slot(d, type, i) )
1313 put_page_from_l2e(pl2e[i], pfn);
1314 break;
1317 adjust_guest_l2e(pl2e[i], d);
1320 unmap_domain_page(pl2e);
1321 return rc > 0 ? 0 : rc;
1324 static int alloc_l3_table(struct page_info *page, int preemptible)
1326 struct domain *d = page_get_owner(page);
1327 unsigned long pfn = page_to_mfn(page);
1328 l3_pgentry_t *pl3e;
1329 unsigned int i;
1330 int rc = 0, partial = page->partial_pte;
1332 #if CONFIG_PAGING_LEVELS == 3
1333 /*
1334 * PAE pgdirs above 4GB are unacceptable if the guest does not understand
1335 * the weird 'extended cr3' format for dealing with high-order address
1336 * bits. We cut some slack for control tools (before vcpu0 is initialised).
1337 */
1338 if ( (pfn >= 0x100000) &&
1339 unlikely(!VM_ASSIST(d, VMASST_TYPE_pae_extended_cr3)) &&
1340 d->vcpu[0] && d->vcpu[0]->is_initialised )
1342 MEM_LOG("PAE pgd must be below 4GB (0x%lx >= 0x100000)", pfn);
1343 return -EINVAL;
1345 #endif
1347 pl3e = map_domain_page(pfn);
1349 /*
1350 * PAE guests allocate full pages, but aren't required to initialize
1351 * more than the first four entries; when running in compatibility
1352 * mode, however, the full page is visible to the MMU, and hence all
1353 * 512 entries must be valid/verified, which is most easily achieved
1354 * by clearing them out.
1355 */
1356 if ( is_pv_32on64_domain(d) )
1357 memset(pl3e + 4, 0, (L3_PAGETABLE_ENTRIES - 4) * sizeof(*pl3e));
1359 for ( i = page->nr_validated_ptes; i < L3_PAGETABLE_ENTRIES;
1360 i++, partial = 0 )
1362 if ( is_pv_32bit_domain(d) && (i == 3) )
1364 if ( !(l3e_get_flags(pl3e[i]) & _PAGE_PRESENT) ||
1365 (l3e_get_flags(pl3e[i]) & l3_disallow_mask(d)) )
1366 rc = -EINVAL;
1367 else
1368 rc = get_page_and_type_from_pagenr(l3e_get_pfn(pl3e[i]),
1369 PGT_l2_page_table |
1370 PGT_pae_xen_l2,
1371 d, partial, preemptible);
1373 else if ( !is_guest_l3_slot(i) ||
1374 (rc = get_page_from_l3e(pl3e[i], pfn, d,
1375 partial, preemptible)) > 0 )
1376 continue;
1378 if ( rc == -EAGAIN )
1380 page->nr_validated_ptes = i;
1381 page->partial_pte = partial ?: 1;
1383 else if ( rc == -EINTR && i )
1385 page->nr_validated_ptes = i;
1386 page->partial_pte = 0;
1387 rc = -EAGAIN;
1389 if ( rc < 0 )
1390 break;
1392 adjust_guest_l3e(pl3e[i], d);
1395 if ( rc >= 0 && !create_pae_xen_mappings(d, pl3e) )
1396 rc = -EINVAL;
1397 if ( rc < 0 && rc != -EAGAIN && rc != -EINTR )
1399 MEM_LOG("Failure in alloc_l3_table: entry %d", i);
1400 while ( i-- > 0 )
1402 if ( !is_guest_l3_slot(i) )
1403 continue;
1404 unadjust_guest_l3e(pl3e[i], d);
1405 put_page_from_l3e(pl3e[i], pfn, 0, 0);
1409 unmap_domain_page(pl3e);
1410 return rc > 0 ? 0 : rc;
1413 #if CONFIG_PAGING_LEVELS >= 4
1414 static int alloc_l4_table(struct page_info *page, int preemptible)
1416 struct domain *d = page_get_owner(page);
1417 unsigned long pfn = page_to_mfn(page);
1418 l4_pgentry_t *pl4e = page_to_virt(page);
1419 unsigned int i;
1420 int rc = 0, partial = page->partial_pte;
1422 for ( i = page->nr_validated_ptes; i < L4_PAGETABLE_ENTRIES;
1423 i++, partial = 0 )
1425 if ( !is_guest_l4_slot(d, i) ||
1426 (rc = get_page_from_l4e(pl4e[i], pfn, d,
1427 partial, preemptible)) > 0 )
1428 continue;
1430 if ( rc == -EAGAIN )
1432 page->nr_validated_ptes = i;
1433 page->partial_pte = partial ?: 1;
1435 else if ( rc == -EINTR )
1437 if ( i )
1439 page->nr_validated_ptes = i;
1440 page->partial_pte = 0;
1441 rc = -EAGAIN;
1444 else if ( rc < 0 )
1446 MEM_LOG("Failure in alloc_l4_table: entry %d", i);
1447 while ( i-- > 0 )
1448 if ( is_guest_l4_slot(d, i) )
1449 put_page_from_l4e(pl4e[i], pfn, 0, 0);
1451 if ( rc < 0 )
1452 return rc;
1454 adjust_guest_l4e(pl4e[i], d);
1457 /* Xen private mappings. */
1458 memcpy(&pl4e[ROOT_PAGETABLE_FIRST_XEN_SLOT],
1459 &idle_pg_table[ROOT_PAGETABLE_FIRST_XEN_SLOT],
1460 ROOT_PAGETABLE_XEN_SLOTS * sizeof(l4_pgentry_t));
1461 pl4e[l4_table_offset(LINEAR_PT_VIRT_START)] =
1462 l4e_from_pfn(pfn, __PAGE_HYPERVISOR);
1463 pl4e[l4_table_offset(PERDOMAIN_VIRT_START)] =
1464 l4e_from_page(virt_to_page(d->arch.mm_perdomain_l3),
1465 __PAGE_HYPERVISOR);
1467 return rc > 0 ? 0 : rc;
1469 #else
1470 #define alloc_l4_table(page, preemptible) (-EINVAL)
1471 #endif
1474 static void free_l1_table(struct page_info *page)
1476 struct domain *d = page_get_owner(page);
1477 unsigned long pfn = page_to_mfn(page);
1478 l1_pgentry_t *pl1e;
1479 unsigned int i;
1481 pl1e = map_domain_page(pfn);
1483 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
1484 if ( is_guest_l1_slot(i) )
1485 put_page_from_l1e(pl1e[i], d);
1487 unmap_domain_page(pl1e);
1491 static int free_l2_table(struct page_info *page, int preemptible)
1493 #ifdef CONFIG_COMPAT
1494 struct domain *d = page_get_owner(page);
1495 #endif
1496 unsigned long pfn = page_to_mfn(page);
1497 l2_pgentry_t *pl2e;
1498 unsigned int i = page->nr_validated_ptes - 1;
1499 int err = 0;
1501 pl2e = map_domain_page(pfn);
1503 ASSERT(page->nr_validated_ptes);
1504 do {
1505 if ( is_guest_l2_slot(d, page->u.inuse.type_info, i) &&
1506 put_page_from_l2e(pl2e[i], pfn) == 0 &&
1507 preemptible && i && hypercall_preempt_check() )
1509 page->nr_validated_ptes = i;
1510 err = -EAGAIN;
1512 } while ( !err && i-- );
1514 unmap_domain_page(pl2e);
1516 if ( !err )
1517 page->u.inuse.type_info &= ~PGT_pae_xen_l2;
1519 return err;
1522 static int free_l3_table(struct page_info *page, int preemptible)
1524 struct domain *d = page_get_owner(page);
1525 unsigned long pfn = page_to_mfn(page);
1526 l3_pgentry_t *pl3e;
1527 int rc = 0, partial = page->partial_pte;
1528 unsigned int i = page->nr_validated_ptes - !partial;
1530 pl3e = map_domain_page(pfn);
1532 do {
1533 if ( is_guest_l3_slot(i) )
1535 rc = put_page_from_l3e(pl3e[i], pfn, partial, preemptible);
1536 if ( rc < 0 )
1537 break;
1538 partial = 0;
1539 if ( rc > 0 )
1540 continue;
1541 unadjust_guest_l3e(pl3e[i], d);
1543 } while ( i-- );
1545 unmap_domain_page(pl3e);
1547 if ( rc == -EAGAIN )
1549 page->nr_validated_ptes = i;
1550 page->partial_pte = partial ?: -1;
1552 else if ( rc == -EINTR && i < L3_PAGETABLE_ENTRIES - 1 )
1554 page->nr_validated_ptes = i + 1;
1555 page->partial_pte = 0;
1556 rc = -EAGAIN;
1558 return rc > 0 ? 0 : rc;
1561 #if CONFIG_PAGING_LEVELS >= 4
1562 static int free_l4_table(struct page_info *page, int preemptible)
1564 struct domain *d = page_get_owner(page);
1565 unsigned long pfn = page_to_mfn(page);
1566 l4_pgentry_t *pl4e = page_to_virt(page);
1567 int rc = 0, partial = page->partial_pte;
1568 unsigned int i = page->nr_validated_ptes - !partial;
1570 do {
1571 if ( is_guest_l4_slot(d, i) )
1572 rc = put_page_from_l4e(pl4e[i], pfn, partial, preemptible);
1573 if ( rc < 0 )
1574 break;
1575 partial = 0;
1576 } while ( i-- );
1578 if ( rc == -EAGAIN )
1580 page->nr_validated_ptes = i;
1581 page->partial_pte = partial ?: -1;
1583 else if ( rc == -EINTR && i < L4_PAGETABLE_ENTRIES - 1 )
1585 page->nr_validated_ptes = i + 1;
1586 page->partial_pte = 0;
1587 rc = -EAGAIN;
1589 return rc > 0 ? 0 : rc;
1591 #else
1592 #define free_l4_table(page, preemptible) (-EINVAL)
1593 #endif
1595 static int page_lock(struct page_info *page)
1597 unsigned long x, nx;
1599 do {
1600 while ( (x = page->u.inuse.type_info) & PGT_locked )
1601 cpu_relax();
1602 nx = x + (1 | PGT_locked);
1603 if ( !(x & PGT_validated) ||
1604 !(x & PGT_count_mask) ||
1605 !(nx & PGT_count_mask) )
1606 return 0;
1607 } while ( cmpxchg(&page->u.inuse.type_info, x, nx) != x );
1609 return 1;
1612 static void page_unlock(struct page_info *page)
1614 unsigned long x, nx, y = page->u.inuse.type_info;
1616 do {
1617 x = y;
1618 nx = x - (1 | PGT_locked);
1619 } while ( (y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x );
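/*
 * Typical usage pattern (illustrative sketch):
 *
 *     if ( !page_lock(page) )
 *         return 0;
 *     ... update a PTE within the locked page ...
 *     page_unlock(page);
 *
 * Besides mutual exclusion, page_lock() fails on pages whose type is not
 * validated or whose type count is zero (or would overflow), so callers
 * get a basic sanity check as well.
 */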
1622 /* How to write an entry to the guest pagetables.
1623 * Returns 0 for failure (pointer not valid), 1 for success. */
1624 static inline int update_intpte(intpte_t *p,
1625 intpte_t old,
1626 intpte_t new,
1627 unsigned long mfn,
1628 struct vcpu *v,
1629 int preserve_ad)
1631 int rv = 1;
1632 #ifndef PTE_UPDATE_WITH_CMPXCHG
1633 if ( !preserve_ad )
1635 rv = paging_write_guest_entry(v, p, new, _mfn(mfn));
1637 else
1638 #endif
1640 intpte_t t = old;
1641 for ( ; ; )
1643 intpte_t _new = new;
1644 if ( preserve_ad )
1645 _new |= old & (_PAGE_ACCESSED | _PAGE_DIRTY);
1647 rv = paging_cmpxchg_guest_entry(v, p, &t, _new, _mfn(mfn));
1648 if ( unlikely(rv == 0) )
1650 MEM_LOG("Failed to update %" PRIpte " -> %" PRIpte
1651 ": saw %" PRIpte, old, _new, t);
1652 break;
1655 if ( t == old )
1656 break;
1658 /* Only the Accessed/Dirty flags are allowed to change. */
1659 BUG_ON((t ^ old) & ~(intpte_t)(_PAGE_ACCESSED|_PAGE_DIRTY));
1661 old = t;
1664 return rv;
1667 /* Macro that wraps the appropriate type-changes around update_intpte().
1668 * Arguments are: type, ptr, old, new, mfn, vcpu */
1669 #define UPDATE_ENTRY(_t,_p,_o,_n,_m,_v,_ad) \
1670 update_intpte(&_t ## e_get_intpte(*(_p)), \
1671 _t ## e_get_intpte(_o), _t ## e_get_intpte(_n), \
1672 (_m), (_v), (_ad))
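/*
 * Expansion example (illustrative): UPDATE_ENTRY(l1, pl1e, ol1e, nl1e,
 * gl1mfn, vcpu, 0) becomes
 *
 *     update_intpte(&l1e_get_intpte(*(pl1e)), l1e_get_intpte(ol1e),
 *                   l1e_get_intpte(nl1e), (gl1mfn), (vcpu), (0))
 *
 * i.e. the typed l?e wrappers are stripped and the raw intpte_t values are
 * passed to update_intpte() above.
 */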
1674 /* Update the L1 entry at pl1e to new value nl1e. */
1675 static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e,
1676 unsigned long gl1mfn, int preserve_ad,
1677 struct vcpu *vcpu)
1679 l1_pgentry_t ol1e;
1680 struct domain *d = vcpu->domain;
1681 unsigned long mfn;
1682 p2m_type_t p2mt;
1683 int rc = 1;
1685 if ( unlikely(__copy_from_user(&ol1e, pl1e, sizeof(ol1e)) != 0) )
1686 return 0;
1688 if ( unlikely(paging_mode_refcounts(d)) )
1690 rc = UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, vcpu, preserve_ad);
1691 return rc;
1694 if ( l1e_get_flags(nl1e) & _PAGE_PRESENT )
1696 /* Translate foreign guest addresses. */
1697 mfn = mfn_x(gfn_to_mfn(FOREIGNDOM, l1e_get_pfn(nl1e), &p2mt));
1698 if ( !p2m_is_ram(p2mt) || unlikely(mfn == INVALID_MFN) )
1699 return 0;
1700 ASSERT((mfn & ~(PADDR_MASK >> PAGE_SHIFT)) == 0);
1701 nl1e = l1e_from_pfn(mfn, l1e_get_flags(nl1e));
1703 if ( unlikely(l1e_get_flags(nl1e) & l1_disallow_mask(d)) )
1705 MEM_LOG("Bad L1 flags %x",
1706 l1e_get_flags(nl1e) & l1_disallow_mask(d));
1707 return 0;
1710 /* Fast path for identical mapping, r/w and presence. */
1711 if ( !l1e_has_changed(ol1e, nl1e, _PAGE_RW | _PAGE_PRESENT) )
1713 adjust_guest_l1e(nl1e, d);
1714 rc = UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, vcpu,
1715 preserve_ad);
1716 return rc;
1719 if ( unlikely(!get_page_from_l1e(nl1e, FOREIGNDOM)) )
1720 return 0;
1722 adjust_guest_l1e(nl1e, d);
1723 if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, vcpu,
1724 preserve_ad)) )
1726 ol1e = nl1e;
1727 rc = 0;
1730 else if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, vcpu,
1731 preserve_ad)) )
1733 return 0;
1736 put_page_from_l1e(ol1e, d);
1737 return rc;
1741 /* Update the L2 entry at pl2e to new value nl2e. pl2e is within frame pfn. */
1742 static int mod_l2_entry(l2_pgentry_t *pl2e,
1743 l2_pgentry_t nl2e,
1744 unsigned long pfn,
1745 int preserve_ad,
1746 struct vcpu *vcpu)
1748 l2_pgentry_t ol2e;
1749 struct domain *d = vcpu->domain;
1750 struct page_info *l2pg = mfn_to_page(pfn);
1751 unsigned long type = l2pg->u.inuse.type_info;
1752 int rc = 1;
1754 if ( unlikely(!is_guest_l2_slot(d, type, pgentry_ptr_to_slot(pl2e))) )
1756 MEM_LOG("Illegal L2 update attempt in Xen-private area %p", pl2e);
1757 return 0;
1760 if ( unlikely(__copy_from_user(&ol2e, pl2e, sizeof(ol2e)) != 0) )
1761 return 0;
1763 if ( l2e_get_flags(nl2e) & _PAGE_PRESENT )
1765 if ( unlikely(l2e_get_flags(nl2e) & L2_DISALLOW_MASK) )
1767 MEM_LOG("Bad L2 flags %x",
1768 l2e_get_flags(nl2e) & L2_DISALLOW_MASK);
1769 return 0;
1772 /* Fast path for identical mapping and presence. */
1773 if ( !l2e_has_changed(ol2e, nl2e, _PAGE_PRESENT) )
1775 adjust_guest_l2e(nl2e, d);
1776 rc = UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, vcpu, preserve_ad);
1777 return rc;
1780 if ( unlikely(get_page_from_l2e(nl2e, pfn, d) < 0) )
1781 return 0;
1783 adjust_guest_l2e(nl2e, d);
1784 if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, vcpu,
1785 preserve_ad)) )
1787 ol2e = nl2e;
1788 rc = 0;
1791 else if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, vcpu,
1792 preserve_ad)) )
1794 return 0;
1797 put_page_from_l2e(ol2e, pfn);
1798 return rc;
1801 /* Update the L3 entry at pl3e to new value nl3e. pl3e is within frame pfn. */
1802 static int mod_l3_entry(l3_pgentry_t *pl3e,
1803 l3_pgentry_t nl3e,
1804 unsigned long pfn,
1805 int preserve_ad,
1806 int preemptible,
1807 struct vcpu *vcpu)
1809 l3_pgentry_t ol3e;
1810 struct domain *d = vcpu->domain;
1811 int rc = 0;
1813 if ( unlikely(!is_guest_l3_slot(pgentry_ptr_to_slot(pl3e))) )
1815 MEM_LOG("Illegal L3 update attempt in Xen-private area %p", pl3e);
1816 return -EINVAL;
1819 /*
1820 * Disallow updates to final L3 slot. It contains Xen mappings, and it
1821 * would be a pain to ensure they remain continuously valid throughout.
1822 */
1823 if ( is_pv_32bit_domain(d) && (pgentry_ptr_to_slot(pl3e) >= 3) )
1824 return -EINVAL;
1826 if ( unlikely(__copy_from_user(&ol3e, pl3e, sizeof(ol3e)) != 0) )
1827 return -EFAULT;
1829 if ( l3e_get_flags(nl3e) & _PAGE_PRESENT )
1831 if ( unlikely(l3e_get_flags(nl3e) & l3_disallow_mask(d)) )
1833 MEM_LOG("Bad L3 flags %x",
1834 l3e_get_flags(nl3e) & l3_disallow_mask(d));
1835 return -EINVAL;
1838 /* Fast path for identical mapping and presence. */
1839 if ( !l3e_has_changed(ol3e, nl3e, _PAGE_PRESENT) )
1841 adjust_guest_l3e(nl3e, d);
1842 rc = UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, vcpu, preserve_ad);
1843 return rc ? 0 : -EFAULT;
1846 rc = get_page_from_l3e(nl3e, pfn, d, 0, preemptible);
1847 if ( unlikely(rc < 0) )
1848 return rc;
1849 rc = 0;
1851 adjust_guest_l3e(nl3e, d);
1852 if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, vcpu,
1853 preserve_ad)) )
1855 ol3e = nl3e;
1856 rc = -EFAULT;
1859 else if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, vcpu,
1860 preserve_ad)) )
1862 return -EFAULT;
1865 if ( likely(rc == 0) )
1867 if ( !create_pae_xen_mappings(d, pl3e) )
1868 BUG();
1870 pae_flush_pgd(pfn, pgentry_ptr_to_slot(pl3e), nl3e);
1873 put_page_from_l3e(ol3e, pfn, 0, 0);
1874 return rc;
1877 #if CONFIG_PAGING_LEVELS >= 4
1879 /* Update the L4 entry at pl4e to new value nl4e. pl4e is within frame pfn. */
1880 static int mod_l4_entry(l4_pgentry_t *pl4e,
1881 l4_pgentry_t nl4e,
1882 unsigned long pfn,
1883 int preserve_ad,
1884 int preemptible,
1885 struct vcpu *vcpu)
1887 struct domain *d = vcpu->domain;
1888 l4_pgentry_t ol4e;
1889 int rc = 0;
1891 if ( unlikely(!is_guest_l4_slot(d, pgentry_ptr_to_slot(pl4e))) )
1893 MEM_LOG("Illegal L4 update attempt in Xen-private area %p", pl4e);
1894 return -EINVAL;
1897 if ( unlikely(__copy_from_user(&ol4e, pl4e, sizeof(ol4e)) != 0) )
1898 return -EFAULT;
1900 if ( l4e_get_flags(nl4e) & _PAGE_PRESENT )
1902 if ( unlikely(l4e_get_flags(nl4e) & L4_DISALLOW_MASK) )
1904 MEM_LOG("Bad L4 flags %x",
1905 l4e_get_flags(nl4e) & L4_DISALLOW_MASK);
1906 return -EINVAL;
1909 /* Fast path for identical mapping and presence. */
1910 if ( !l4e_has_changed(ol4e, nl4e, _PAGE_PRESENT) )
1912 adjust_guest_l4e(nl4e, d);
1913 rc = UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, vcpu, preserve_ad);
1914 return rc ? 0 : -EFAULT;
1917 rc = get_page_from_l4e(nl4e, pfn, d, 0, preemptible);
1918 if ( unlikely(rc < 0) )
1919 return rc;
1920 rc = 0;
1922 adjust_guest_l4e(nl4e, d);
1923 if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, vcpu,
1924 preserve_ad)) )
1926 ol4e = nl4e;
1927 rc = -EFAULT;
1930 else if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, vcpu,
1931 preserve_ad)) )
1933 return -EFAULT;
1936 put_page_from_l4e(ol4e, pfn, 0, 0);
1937 return rc;
1940 #endif
1942 void put_page(struct page_info *page)
1944 unsigned long nx, x, y = page->count_info;
1946 do {
1947 ASSERT((y & PGC_count_mask) != 0);
1948 x = y;
1949 nx = x - 1;
1951 while ( unlikely((y = cmpxchg(&page->count_info, x, nx)) != x) );
1953 if ( unlikely((nx & PGC_count_mask) == 0) )
1955 cleanup_page_cacheattr(page);
1956 free_domheap_page(page);
1961 struct domain *page_get_owner_and_reference(struct page_info *page)
1963 unsigned long x, y = page->count_info;
1965 do {
1966 x = y;
1967 /*
1968 * Count == 0: Page is not allocated, so we cannot take a reference.
1969 * Count == -1: Reference count would wrap, which is invalid.
1970 * Count == -2: Remaining unused ref is reserved for get_page_light().
1971 */
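/*
 * Worked check (illustrative), looking only at the bits under
 * PGC_count_mask: a count of 0 gives (0 + 2) & mask == 2, a count of -1
 * (i.e. mask) gives 1, and a count of -2 gives 0, so the "<= 2" test below
 * rejects exactly the three cases listed above and accepts everything else.
 */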
1972 if ( unlikely(((x + 2) & PGC_count_mask) <= 2) )
1973 return NULL;
1975 while ( (y = cmpxchg(&page->count_info, x, x + 1)) != x );
1977 return page_get_owner(page);
1981 int get_page(struct page_info *page, struct domain *domain)
1983 struct domain *owner = page_get_owner_and_reference(page);
1985 if ( likely(owner == domain) )
1986 return 1;
1988 if ( owner != NULL )
1989 put_page(page);
1991 if ( !_shadow_mode_refcounts(domain) && !domain->is_dying )
1992 gdprintk(XENLOG_INFO,
1993 "Error pfn %lx: rd=%p, od=%p, caf=%08lx, taf=%"
1994 PRtype_info "\n",
1995 page_to_mfn(page), domain, owner,
1996 page->count_info, page->u.inuse.type_info);
1997 return 0;
2000 /*
2001 * Special version of get_page() to be used exclusively when
2002 * - a page is known to already have a non-zero reference count
2003 * - the page does not need its owner to be checked
2004 * - it will not be called more than once without dropping the thus
2005 * acquired reference again.
2006 * Due to get_page() reserving one reference, this call cannot fail.
2007 */
2008 static void get_page_light(struct page_info *page)
2010 unsigned long x, nx, y = page->count_info;
2012 do {
2013 x = y;
2014 nx = x + 1;
2015 BUG_ON(!(x & PGC_count_mask)); /* Not allocated? */
2016 BUG_ON(!(nx & PGC_count_mask)); /* Overflow? */
2017 y = cmpxchg(&page->count_info, x, nx);
2019 while ( unlikely(y != x) );
2022 static int alloc_page_type(struct page_info *page, unsigned long type,
2023 int preemptible)
2025 struct domain *owner = page_get_owner(page);
2026 int rc;
2028 /* A page table is dirtied when its type count becomes non-zero. */
2029 if ( likely(owner != NULL) )
2030 paging_mark_dirty(owner, page_to_mfn(page));
2032 switch ( type & PGT_type_mask )
2034 case PGT_l1_page_table:
2035 rc = alloc_l1_table(page);
2036 break;
2037 case PGT_l2_page_table:
2038 rc = alloc_l2_table(page, type, preemptible);
2039 break;
2040 case PGT_l3_page_table:
2041 rc = alloc_l3_table(page, preemptible);
2042 break;
2043 case PGT_l4_page_table:
2044 rc = alloc_l4_table(page, preemptible);
2045 break;
2046 case PGT_seg_desc_page:
2047 rc = alloc_segdesc_page(page);
2048 break;
2049 default:
2050 printk("Bad type in alloc_page_type %lx t=%" PRtype_info " c=%lx\n",
2051 type, page->u.inuse.type_info,
2052 page->count_info);
2053 rc = -EINVAL;
2054 BUG();
2057 /* No need for atomic update of type_info here: no one else updates it. */
2058 wmb();
2059 if ( rc == -EAGAIN )
2061 get_page_light(page);
2062 page->u.inuse.type_info |= PGT_partial;
2064 else if ( rc == -EINTR )
2066 ASSERT((page->u.inuse.type_info &
2067 (PGT_count_mask|PGT_validated|PGT_partial)) == 1);
2068 page->u.inuse.type_info &= ~PGT_count_mask;
2070 else if ( rc )
2072 ASSERT(rc < 0);
2073 MEM_LOG("Error while validating mfn %lx (pfn %lx) for type %"
2074 PRtype_info ": caf=%08lx taf=%" PRtype_info,
2075 page_to_mfn(page), get_gpfn_from_mfn(page_to_mfn(page)),
2076 type, page->count_info, page->u.inuse.type_info);
2077 page->u.inuse.type_info = 0;
2079 else
2081 page->u.inuse.type_info |= PGT_validated;
2084 return rc;
2088 int free_page_type(struct page_info *page, unsigned long type,
2089 int preemptible)
2091 struct domain *owner = page_get_owner(page);
2092 unsigned long gmfn;
2093 int rc;
2095 if ( likely(owner != NULL) && unlikely(paging_mode_enabled(owner)) )
2097 /* A page table is dirtied when its type count becomes zero. */
2098 paging_mark_dirty(owner, page_to_mfn(page));
2100 if ( shadow_mode_refcounts(owner) )
2101 return 0;
2103 gmfn = mfn_to_gmfn(owner, page_to_mfn(page));
2104 ASSERT(VALID_M2P(gmfn));
2105 shadow_remove_all_shadows(owner->vcpu[0], _mfn(gmfn));
2108 if ( !(type & PGT_partial) )
2110 page->nr_validated_ptes = 1U << PAGETABLE_ORDER;
2111 page->partial_pte = 0;
2114 switch ( type & PGT_type_mask )
2116 case PGT_l1_page_table:
2117 free_l1_table(page);
2118 rc = 0;
2119 break;
2120 case PGT_l2_page_table:
2121 rc = free_l2_table(page, preemptible);
2122 break;
2123 case PGT_l3_page_table:
2124 #if CONFIG_PAGING_LEVELS == 3
2125 if ( !(type & PGT_partial) )
2126 page->nr_validated_ptes = L3_PAGETABLE_ENTRIES;
2127 #endif
2128 rc = free_l3_table(page, preemptible);
2129 break;
2130 case PGT_l4_page_table:
2131 rc = free_l4_table(page, preemptible);
2132 break;
2133 default:
2134 MEM_LOG("type %lx pfn %lx\n", type, page_to_mfn(page));
2135 rc = -EINVAL;
2136 BUG();
2139 return rc;
2143 static int __put_final_page_type(
2144 struct page_info *page, unsigned long type, int preemptible)
2146 int rc = free_page_type(page, type, preemptible);
2148 /* No need for atomic update of type_info here: no one else updates it. */
2149 if ( rc == 0 )
2151 /*
2152 * Record TLB information for flush later. We do not stamp page tables
2153 * when running in shadow mode:
2154 * 1. Pointless, since it is the shadow page tables that must be tracked.
2155 * 2. Shadow mode reuses this field for shadowed page tables to
2156 * store flags info -- we don't want to conflict with that.
2157 */
2158 if ( !(shadow_mode_enabled(page_get_owner(page)) &&
2159 (page->count_info & PGC_page_table)) )
2160 page->tlbflush_timestamp = tlbflush_current_time();
2161 wmb();
2162 page->u.inuse.type_info--;
2164 else if ( rc == -EINTR )
2166 ASSERT((page->u.inuse.type_info &
2167 (PGT_count_mask|PGT_validated|PGT_partial)) == 1);
2168 if ( !(shadow_mode_enabled(page_get_owner(page)) &&
2169 (page->count_info & PGC_page_table)) )
2170 page->tlbflush_timestamp = tlbflush_current_time();
2171 wmb();
2172 page->u.inuse.type_info |= PGT_validated;
2174 else
2176 BUG_ON(rc != -EAGAIN);
2177 wmb();
2178 get_page_light(page);
2179 page->u.inuse.type_info |= PGT_partial;
2182 return rc;
2186 static int __put_page_type(struct page_info *page,
2187 int preemptible)
2189 unsigned long nx, x, y = page->u.inuse.type_info;
2190 int rc = 0;
2192 for ( ; ; )
2194 x = y;
2195 nx = x - 1;
2197 ASSERT((x & PGT_count_mask) != 0);
2199 if ( unlikely((nx & PGT_count_mask) == 0) )
2201 if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
2202 likely(nx & (PGT_validated|PGT_partial)) )
2204 /*
2205 * Page-table pages must be unvalidated when count is zero. The
2206 * 'free' is safe because the refcnt is non-zero and validated
2207 * bit is clear => other ops will spin or fail.
2208 */
2209 nx = x & ~(PGT_validated|PGT_partial);
2210 if ( unlikely((y = cmpxchg(&page->u.inuse.type_info,
2211 x, nx)) != x) )
2212 continue;
2213 /* We cleared the 'validated' bit, so we do the clean-up. */
2214 rc = __put_final_page_type(page, x, preemptible);
2215 if ( x & PGT_partial )
2216 put_page(page);
2217 break;
2220 /*
2221 * Record TLB information for flush later. We do not stamp page
2222 * tables when running in shadow mode:
2223 * 1. Pointless, since it is the shadow page tables that must be tracked.
2224 * 2. Shadow mode reuses this field for shadowed page tables to
2225 * store flags info -- we don't want to conflict with that.
2226 */
2227 if ( !(shadow_mode_enabled(page_get_owner(page)) &&
2228 (page->count_info & PGC_page_table)) )
2229 page->tlbflush_timestamp = tlbflush_current_time();
2232 if ( likely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) == x) )
2233 break;
2235 if ( preemptible && hypercall_preempt_check() )
2236 return -EINTR;
2239 return rc;
2243 static int __get_page_type(struct page_info *page, unsigned long type,
2244 int preemptible)
2246 unsigned long nx, x, y = page->u.inuse.type_info;
2247 int rc = 0;
2249 ASSERT(!(type & ~(PGT_type_mask | PGT_pae_xen_l2)));
2251 for ( ; ; )
2253 x = y;
2254 nx = x + 1;
2255 if ( unlikely((nx & PGT_count_mask) == 0) )
2257 MEM_LOG("Type count overflow on pfn %lx", page_to_mfn(page));
2258 return -EINVAL;
2260 else if ( unlikely((x & PGT_count_mask) == 0) )
2262 struct domain *d = page_get_owner(page);
2264 /* Normally we should never let a page go from type count 0
2265 * to type count 1 when it is shadowed. One exception:
2266 * out-of-sync shadowed pages are allowed to become
2267 * writable. */
2268 if ( d && shadow_mode_enabled(d)
2269 && (page->count_info & PGC_page_table)
2270 && !((page->shadow_flags & (1u<<29))
2271 && type == PGT_writable_page) )
2272 shadow_remove_all_shadows(d->vcpu[0], _mfn(page_to_mfn(page)));
2274 ASSERT(!(x & PGT_pae_xen_l2));
2275 if ( (x & PGT_type_mask) != type )
2277 /*
2278 * On a type change we check whether stale TLB entries need to be
2279 * flushed. This may be unnecessary (e.g., the page was a GDT/LDT), but
2280 * such circumstances should be very rare.
2281 */
2282 cpumask_t mask = d->domain_dirty_cpumask;
2284 /* Don't flush if the timestamp is old enough */
2285 tlbflush_filter(mask, page->tlbflush_timestamp);
2287 if ( unlikely(!cpus_empty(mask)) &&
2288 /* Shadow mode: track only writable pages. */
2289 (!shadow_mode_enabled(page_get_owner(page)) ||
2290 ((nx & PGT_type_mask) == PGT_writable_page)) )
2292 perfc_incr(need_flush_tlb_flush);
2293 flush_tlb_mask(mask);
2296 /* We lose existing type and validity. */
2297 nx &= ~(PGT_type_mask | PGT_validated);
2298 nx |= type;
2300 /* No special validation needed for writable pages. */
2301 /* Page tables and GDT/LDT need to be scanned for validity. */
2302 if ( type == PGT_writable_page )
2303 nx |= PGT_validated;
2306 else if ( unlikely((x & (PGT_type_mask|PGT_pae_xen_l2)) != type) )
2308 /* Don't log failure if it could be a recursive-mapping attempt. */
2309 if ( ((x & PGT_type_mask) == PGT_l2_page_table) &&
2310 (type == PGT_l1_page_table) )
2311 return -EINVAL;
2312 if ( ((x & PGT_type_mask) == PGT_l3_page_table) &&
2313 (type == PGT_l2_page_table) )
2314 return -EINVAL;
2315 if ( ((x & PGT_type_mask) == PGT_l4_page_table) &&
2316 (type == PGT_l3_page_table) )
2317 return -EINVAL;
2318 MEM_LOG("Bad type (saw %" PRtype_info " != exp %" PRtype_info ") "
2319 "for mfn %lx (pfn %lx)",
2320 x, type, page_to_mfn(page),
2321 get_gpfn_from_mfn(page_to_mfn(page)));
2322 return -EINVAL;
2324 else if ( unlikely(!(x & PGT_validated)) )
2326 if ( !(x & PGT_partial) )
2328 /* Someone else is updating validation of this page. Wait... */
2329 while ( (y = page->u.inuse.type_info) == x )
2331 if ( preemptible && hypercall_preempt_check() )
2332 return -EINTR;
2333 cpu_relax();
2335 continue;
2337 /* Type ref count was left at 1 when PGT_partial got set. */
2338 ASSERT((x & PGT_count_mask) == 1);
2339 nx = x & ~PGT_partial;
2342 if ( likely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) == x) )
2343 break;
2345 if ( preemptible && hypercall_preempt_check() )
2346 return -EINTR;
2349 if ( unlikely((x & PGT_type_mask) != type) )
2351 /* Special pages should not be accessible from devices. */
2352 struct domain *d = page_get_owner(page);
2353 if ( d && unlikely(need_iommu(d)) )
2355 if ( (x & PGT_type_mask) == PGT_writable_page )
2356 iommu_unmap_page(d, mfn_to_gmfn(d, page_to_mfn(page)));
2357 else if ( type == PGT_writable_page )
2358 iommu_map_page(d, mfn_to_gmfn(d, page_to_mfn(page)),
2359 page_to_mfn(page));
2363 if ( unlikely(!(nx & PGT_validated)) )
2365 if ( !(x & PGT_partial) )
2367 page->nr_validated_ptes = 0;
2368 page->partial_pte = 0;
2370 rc = alloc_page_type(page, type, preemptible);
2373 if ( (x & PGT_partial) && !(nx & PGT_partial) )
2374 put_page(page);
2376 return rc;
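
__get_page_type() and __put_page_type() work because the type, the type-use count, and the PGT_validated/PGT_partial state all live in the single word u.inuse.type_info, so one cmpxchg can test and advance them together. A reduced sketch with toy bit positions (the real PGT_* layout in the headers differs) showing the count 0 -> 1 retype path and the rejection of a mismatched type; the preemption and PGT_partial handling of the real code is omitted:

#include <stdio.h>

/* Toy layout -- the real PGT_* constants live in the headers and differ. */
#define T_COUNT_MASK   0x00ffffffUL
#define T_TYPE_MASK    0x0f000000UL
#define T_WRITABLE     0x01000000UL
#define T_L1_TABLE     0x02000000UL
#define T_VALIDATED    0x10000000UL

static int get_type(unsigned long *type_info, unsigned long type)
{
    unsigned long x, nx, y = *type_info;

    for ( ; ; )
    {
        x = y;
        nx = x + 1;                                 /* take a type reference */
        if ( (nx & T_COUNT_MASK) == 0 )
            return -1;                              /* count overflow */
        if ( (x & T_COUNT_MASK) == 0 )
            nx = (nx & ~(T_TYPE_MASK | T_VALIDATED)) | type;   /* retype */
        else if ( (x & T_TYPE_MASK) != type )
            return -1;                              /* different type in use */
        if ( (y = __sync_val_compare_and_swap(type_info, x, nx)) == x )
            break;
    }

    if ( !(nx & T_VALIDATED) )      /* first user validates the page ...     */
        *type_info |= T_VALIDATED;  /* ... no concurrent updater in this toy */
    return 0;
}

int main(void)
{
    unsigned long ti = 0;
    int rc = get_type(&ti, T_L1_TABLE);

    printf("rc=%d type_info=%#lx\n", rc, ti);
    rc = get_type(&ti, T_WRITABLE);                 /* rejected: wrong type */
    printf("rc=%d type_info=%#lx\n", rc, ti);
    return 0;
}
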
2379 void put_page_type(struct page_info *page)
2381 int rc = __put_page_type(page, 0);
2382 ASSERT(rc == 0);
2383 (void)rc;
2386 int get_page_type(struct page_info *page, unsigned long type)
2388 int rc = __get_page_type(page, type, 0);
2389 if ( likely(rc == 0) )
2390 return 1;
2391 ASSERT(rc == -EINVAL);
2392 return 0;
2395 int put_page_type_preemptible(struct page_info *page)
2397 return __put_page_type(page, 1);
2400 int get_page_type_preemptible(struct page_info *page, unsigned long type)
2402 return __get_page_type(page, type, 1);
2405 void cleanup_page_cacheattr(struct page_info *page)
2407 uint32_t cacheattr =
2408 (page->count_info & PGC_cacheattr_mask) >> PGC_cacheattr_base;
2410 if ( likely(cacheattr == 0) )
2411 return;
2413 page->count_info &= ~PGC_cacheattr_mask;
2415 BUG_ON(is_xen_heap_page(page));
2417 update_xen_mappings(page_to_mfn(page), 0);
2421 int new_guest_cr3(unsigned long mfn)
2423 struct vcpu *curr = current;
2424 struct domain *d = curr->domain;
2425 int okay;
2426 unsigned long old_base_mfn;
2428 #ifdef CONFIG_COMPAT
2429 if ( is_pv_32on64_domain(d) )
2431 okay = paging_mode_refcounts(d)
2432 ? 0 /* Old code was broken, but what should it be? */
2433 : mod_l4_entry(
2434 __va(pagetable_get_paddr(curr->arch.guest_table)),
2435 l4e_from_pfn(
2436 mfn,
2437 (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_ACCESSED)),
2438 pagetable_get_pfn(curr->arch.guest_table), 0, 0, curr) == 0;
2439 if ( unlikely(!okay) )
2441 MEM_LOG("Error while installing new compat baseptr %lx", mfn);
2442 return 0;
2445 invalidate_shadow_ldt(curr, 0);
2446 write_ptbase(curr);
2448 return 1;
2450 #endif
2451 okay = paging_mode_refcounts(d)
2452 ? get_page_from_pagenr(mfn, d)
2453 : !get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d, 0, 0);
2454 if ( unlikely(!okay) )
2456 MEM_LOG("Error while installing new baseptr %lx", mfn);
2457 return 0;
2460 invalidate_shadow_ldt(curr, 0);
2462 old_base_mfn = pagetable_get_pfn(curr->arch.guest_table);
2464 curr->arch.guest_table = pagetable_from_pfn(mfn);
2465 update_cr3(curr);
2467 write_ptbase(curr);
2469 if ( likely(old_base_mfn != 0) )
2471 if ( paging_mode_refcounts(d) )
2472 put_page(mfn_to_page(old_base_mfn));
2473 else
2474 put_page_and_type(mfn_to_page(old_base_mfn));
2477 return 1;
2480 static void process_deferred_ops(void)
2482 unsigned int deferred_ops;
2483 struct domain *d = current->domain;
2484 struct percpu_mm_info *info = &this_cpu(percpu_mm_info);
2486 deferred_ops = info->deferred_ops;
2487 info->deferred_ops = 0;
2489 if ( deferred_ops & (DOP_FLUSH_ALL_TLBS|DOP_FLUSH_TLB) )
2491 if ( deferred_ops & DOP_FLUSH_ALL_TLBS )
2492 flush_tlb_mask(d->domain_dirty_cpumask);
2493 else
2494 flush_tlb_local();
2497 /*
2498 * Do this after flushing TLBs, to ensure we see fresh LDT mappings
2499 * via the linear pagetable mapping.
2500 */
2501 if ( deferred_ops & DOP_RELOAD_LDT )
2502 (void)map_ldt_shadow_page(0);
2504 if ( unlikely(info->foreign != NULL) )
2506 rcu_unlock_domain(info->foreign);
2507 info->foreign = NULL;
2511 static int set_foreigndom(domid_t domid)
2513 struct domain *e, *d = current->domain;
2514 struct percpu_mm_info *info = &this_cpu(percpu_mm_info);
2515 int okay = 1;
2517 ASSERT(info->foreign == NULL);
2519 if ( likely(domid == DOMID_SELF) )
2520 goto out;
2522 if ( unlikely(domid == d->domain_id) )
2524 MEM_LOG("Cannot specify itself as foreign domain");
2525 okay = 0;
2527 else if ( unlikely(paging_mode_translate(d)) )
2529 MEM_LOG("Cannot mix foreign mappings with translated domains");
2530 okay = 0;
2532 else switch ( domid )
2534 case DOMID_IO:
2535 info->foreign = rcu_lock_domain(dom_io);
2536 break;
2537 case DOMID_XEN:
2538 if (!IS_PRIV(d)) {
2539 MEM_LOG("Cannot set foreign dom");
2540 okay = 0;
2541 break;
2543 info->foreign = rcu_lock_domain(dom_xen);
2544 break;
2545 default:
2546 if ( (e = rcu_lock_domain_by_id(domid)) == NULL )
2548 MEM_LOG("Unknown domain '%u'", domid);
2549 okay = 0;
2550 break;
2552 if ( !IS_PRIV_FOR(d, e) )
2554 MEM_LOG("Cannot set foreign dom");
2555 okay = 0;
2556 rcu_unlock_domain(e);
2557 break;
2559 info->foreign = e;
2560 break;
2563 out:
2564 return okay;
2567 static inline cpumask_t vcpumask_to_pcpumask(
2568 struct domain *d, unsigned long vmask)
2570 unsigned int vcpu_id;
2571 cpumask_t pmask = CPU_MASK_NONE;
2572 struct vcpu *v;
2574 /*
2575 * Callers copy only a single guest-sized longword from the guest.
2576 * This must be wide enough to reference all VCPUs. Worst case is 32 bits.
2577 */
2578 BUILD_BUG_ON(MAX_VIRT_CPUS > 32);
2580 while ( vmask != 0 )
2582 vcpu_id = find_first_set_bit(vmask);
2583 vmask &= ~(1UL << vcpu_id);
2584 if ( (vcpu_id < MAX_VIRT_CPUS) &&
2585 ((v = d->vcpu[vcpu_id]) != NULL) )
2586 cpus_or(pmask, pmask, v->vcpu_dirty_cpumask);
2589 return pmask;
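
vcpumask_to_pcpumask() peels one set bit at a time off the guest-supplied VCPU bitmap with find_first_set_bit() and clears it before the next iteration. The same walk as a standalone program, with __builtin_ctzl standing in for find_first_set_bit and a printf in place of the per-VCPU dirty-CPU merge:

#include <stdio.h>

int main(void)
{
    unsigned long vmask = 0x29;                    /* VCPUs 0, 3 and 5 */

    while ( vmask != 0 )
    {
        unsigned int vcpu_id = __builtin_ctzl(vmask);  /* lowest set bit */

        vmask &= ~(1UL << vcpu_id);
        /* Real code: cpus_or(pmask, pmask, d->vcpu[vcpu_id]->vcpu_dirty_cpumask) */
        printf("vcpu %u selected\n", vcpu_id);
    }
    return 0;
}
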
2592 #ifdef __i386__
2593 static inline void *fixmap_domain_page(unsigned long mfn)
2595 unsigned int cpu = smp_processor_id();
2596 void *ptr = (void *)fix_to_virt(FIX_PAE_HIGHMEM_0 + cpu);
2598 l1e_write(fix_pae_highmem_pl1e - cpu,
2599 l1e_from_pfn(mfn, __PAGE_HYPERVISOR));
2600 flush_tlb_one_local(ptr);
2601 return ptr;
2603 static inline void fixunmap_domain_page(const void *ptr)
2605 unsigned int cpu = virt_to_fix((unsigned long)ptr) - FIX_PAE_HIGHMEM_0;
2607 l1e_write(fix_pae_highmem_pl1e - cpu, l1e_empty());
2608 this_cpu(make_cr3_timestamp) = this_cpu(tlbflush_time);
2610 #else
2611 #define fixmap_domain_page(mfn) mfn_to_virt(mfn)
2612 #define fixunmap_domain_page(ptr) ((void)(ptr))
2613 #endif
2615 int do_mmuext_op(
2616 XEN_GUEST_HANDLE(mmuext_op_t) uops,
2617 unsigned int count,
2618 XEN_GUEST_HANDLE(uint) pdone,
2619 unsigned int foreigndom)
2621 struct mmuext_op op;
2622 int rc = 0, i = 0, okay;
2623 unsigned long mfn = 0, gmfn = 0, type;
2624 unsigned int done = 0;
2625 struct page_info *page;
2626 struct vcpu *curr = current;
2627 struct domain *d = curr->domain;
2629 if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
2631 count &= ~MMU_UPDATE_PREEMPTED;
2632 if ( unlikely(!guest_handle_is_null(pdone)) )
2633 (void)copy_from_guest(&done, pdone, 1);
2635 else
2636 perfc_incr(calls_to_mmuext_op);
2638 if ( unlikely(!guest_handle_okay(uops, count)) )
2640 rc = -EFAULT;
2641 goto out;
2644 if ( !set_foreigndom(foreigndom) )
2646 rc = -ESRCH;
2647 goto out;
2650 for ( i = 0; i < count; i++ )
2652 if ( hypercall_preempt_check() )
2654 rc = -EAGAIN;
2655 break;
2658 if ( unlikely(__copy_from_guest(&op, uops, 1) != 0) )
2660 MEM_LOG("Bad __copy_from_guest");
2661 rc = -EFAULT;
2662 break;
2665 okay = 1;
2666 gmfn = op.arg1.mfn;
2667 mfn = gmfn_to_mfn(FOREIGNDOM, gmfn);
2668 page = mfn_to_page(mfn);
2670 switch ( op.cmd )
2672 case MMUEXT_PIN_L1_TABLE:
2673 type = PGT_l1_page_table;
2674 goto pin_page;
2676 case MMUEXT_PIN_L2_TABLE:
2677 type = PGT_l2_page_table;
2678 goto pin_page;
2680 case MMUEXT_PIN_L3_TABLE:
2681 type = PGT_l3_page_table;
2682 goto pin_page;
2684 case MMUEXT_PIN_L4_TABLE:
2685 if ( is_pv_32bit_domain(FOREIGNDOM) )
2686 break;
2687 type = PGT_l4_page_table;
2689 pin_page:
2690 rc = xsm_memory_pin_page(d, page);
2691 if ( rc )
2692 break;
2694 /* Ignore pinning of invalid paging levels. */
2695 if ( (op.cmd - MMUEXT_PIN_L1_TABLE) > (CONFIG_PAGING_LEVELS - 1) )
2696 break;
2698 if ( paging_mode_refcounts(FOREIGNDOM) )
2699 break;
2701 rc = get_page_and_type_from_pagenr(mfn, type, FOREIGNDOM, 0, 1);
2702 okay = !rc;
2703 if ( unlikely(!okay) )
2705 if ( rc == -EINTR )
2706 rc = -EAGAIN;
2707 else if ( rc != -EAGAIN )
2708 MEM_LOG("Error while pinning mfn %lx", mfn);
2709 break;
2712 if ( unlikely(test_and_set_bit(_PGT_pinned,
2713 &page->u.inuse.type_info)) )
2715 MEM_LOG("Mfn %lx already pinned", mfn);
2716 put_page_and_type(page);
2717 okay = 0;
2718 break;
2721 /* A page is dirtied when its pin status is set. */
2722 paging_mark_dirty(d, mfn);
2724 /* We can race domain destruction (domain_relinquish_resources). */
2725 if ( unlikely(this_cpu(percpu_mm_info).foreign != NULL) )
2727 int drop_ref;
2728 spin_lock(&FOREIGNDOM->page_alloc_lock);
2729 drop_ref = (FOREIGNDOM->is_dying &&
2730 test_and_clear_bit(_PGT_pinned,
2731 &page->u.inuse.type_info));
2732 spin_unlock(&FOREIGNDOM->page_alloc_lock);
2733 if ( drop_ref )
2734 put_page_and_type(page);
2737 break;
2739 case MMUEXT_UNPIN_TABLE:
2740 if ( paging_mode_refcounts(d) )
2741 break;
2743 if ( unlikely(!(okay = get_page_from_pagenr(mfn, d))) )
2745 MEM_LOG("Mfn %lx bad domain (dom=%p)",
2746 mfn, page_get_owner(page));
2748 else if ( likely(test_and_clear_bit(_PGT_pinned,
2749 &page->u.inuse.type_info)) )
2751 put_page_and_type(page);
2752 put_page(page);
2753 if ( !rc )
2755 /* A page is dirtied when its pin status is cleared. */
2756 paging_mark_dirty(d, mfn);
2759 else
2761 okay = 0;
2762 put_page(page);
2763 MEM_LOG("Mfn %lx not pinned", mfn);
2765 break;
2767 case MMUEXT_NEW_BASEPTR:
2768 okay = new_guest_cr3(mfn);
2769 this_cpu(percpu_mm_info).deferred_ops &= ~DOP_FLUSH_TLB;
2770 break;
2772 #ifdef __x86_64__
2773 case MMUEXT_NEW_USER_BASEPTR: {
2774 unsigned long old_mfn;
2776 if ( mfn != 0 )
2778 if ( paging_mode_refcounts(d) )
2779 okay = get_page_from_pagenr(mfn, d);
2780 else
2781 okay = !get_page_and_type_from_pagenr(
2782 mfn, PGT_root_page_table, d, 0, 0);
2783 if ( unlikely(!okay) )
2785 MEM_LOG("Error while installing new mfn %lx", mfn);
2786 break;
2790 old_mfn = pagetable_get_pfn(curr->arch.guest_table_user);
2791 curr->arch.guest_table_user = pagetable_from_pfn(mfn);
2793 if ( old_mfn != 0 )
2795 if ( paging_mode_refcounts(d) )
2796 put_page(mfn_to_page(old_mfn));
2797 else
2798 put_page_and_type(mfn_to_page(old_mfn));
2801 break;
2803 #endif
2805 case MMUEXT_TLB_FLUSH_LOCAL:
2806 this_cpu(percpu_mm_info).deferred_ops |= DOP_FLUSH_TLB;
2807 break;
2809 case MMUEXT_INVLPG_LOCAL:
2810 if ( !paging_mode_enabled(d)
2811 || paging_invlpg(curr, op.arg1.linear_addr) != 0 )
2812 flush_tlb_one_local(op.arg1.linear_addr);
2813 break;
2815 case MMUEXT_TLB_FLUSH_MULTI:
2816 case MMUEXT_INVLPG_MULTI:
2818 unsigned long vmask;
2819 cpumask_t pmask;
2820 if ( unlikely(copy_from_guest(&vmask, op.arg2.vcpumask, 1)) )
2822 okay = 0;
2823 break;
2825 pmask = vcpumask_to_pcpumask(d, vmask);
2826 if ( op.cmd == MMUEXT_TLB_FLUSH_MULTI )
2827 flush_tlb_mask(pmask);
2828 else
2829 flush_tlb_one_mask(pmask, op.arg1.linear_addr);
2830 break;
2833 case MMUEXT_TLB_FLUSH_ALL:
2834 this_cpu(percpu_mm_info).deferred_ops |= DOP_FLUSH_ALL_TLBS;
2835 break;
2837 case MMUEXT_INVLPG_ALL:
2838 flush_tlb_one_mask(d->domain_dirty_cpumask, op.arg1.linear_addr);
2839 break;
2841 case MMUEXT_FLUSH_CACHE:
2842 if ( unlikely(!cache_flush_permitted(d)) )
2844 MEM_LOG("Non-physdev domain tried to FLUSH_CACHE.");
2845 okay = 0;
2847 else
2849 wbinvd();
2851 break;
2853 case MMUEXT_SET_LDT:
2855 unsigned long ptr = op.arg1.linear_addr;
2856 unsigned long ents = op.arg2.nr_ents;
2858 if ( paging_mode_external(d) )
2860 MEM_LOG("ignoring SET_LDT hypercall from external domain");
2861 okay = 0;
2863 else if ( ((ptr & (PAGE_SIZE-1)) != 0) ||
2864 (ents > 8192) ||
2865 !array_access_ok(ptr, ents, LDT_ENTRY_SIZE) )
2867 okay = 0;
2868 MEM_LOG("Bad args to SET_LDT: ptr=%lx, ents=%lx", ptr, ents);
2870 else if ( (curr->arch.guest_context.ldt_ents != ents) ||
2871 (curr->arch.guest_context.ldt_base != ptr) )
2873 invalidate_shadow_ldt(curr, 0);
2874 this_cpu(percpu_mm_info).deferred_ops |= DOP_FLUSH_TLB;
2875 curr->arch.guest_context.ldt_base = ptr;
2876 curr->arch.guest_context.ldt_ents = ents;
2877 load_LDT(curr);
2878 this_cpu(percpu_mm_info).deferred_ops &= ~DOP_RELOAD_LDT;
2879 if ( ents != 0 )
2880 this_cpu(percpu_mm_info).deferred_ops |= DOP_RELOAD_LDT;
2882 break;
2885 case MMUEXT_CLEAR_PAGE:
2887 unsigned char *ptr;
2889 okay = !get_page_and_type_from_pagenr(mfn, PGT_writable_page,
2890 FOREIGNDOM, 0, 0);
2891 if ( unlikely(!okay) )
2893 MEM_LOG("Error while clearing mfn %lx", mfn);
2894 break;
2897 /* A page is dirtied when it's being cleared. */
2898 paging_mark_dirty(d, mfn);
2900 ptr = fixmap_domain_page(mfn);
2901 clear_page(ptr);
2902 fixunmap_domain_page(ptr);
2904 put_page_and_type(page);
2905 break;
2908 case MMUEXT_COPY_PAGE:
2910 const unsigned char *src;
2911 unsigned char *dst;
2912 unsigned long src_mfn;
2914 src_mfn = gmfn_to_mfn(FOREIGNDOM, op.arg2.src_mfn);
2915 okay = get_page_from_pagenr(src_mfn, FOREIGNDOM);
2916 if ( unlikely(!okay) )
2918 MEM_LOG("Error while copying from mfn %lx", src_mfn);
2919 break;
2922 okay = !get_page_and_type_from_pagenr(mfn, PGT_writable_page,
2923 FOREIGNDOM, 0, 0);
2924 if ( unlikely(!okay) )
2926 put_page(mfn_to_page(src_mfn));
2927 MEM_LOG("Error while copying to mfn %lx", mfn);
2928 break;
2931 /* A page is dirtied when it's being copied to. */
2932 paging_mark_dirty(d, mfn);
2934 src = map_domain_page(src_mfn);
2935 dst = fixmap_domain_page(mfn);
2936 copy_page(dst, src);
2937 fixunmap_domain_page(dst);
2938 unmap_domain_page(src);
2940 put_page_and_type(page);
2941 put_page(mfn_to_page(src_mfn));
2942 break;
2945 default:
2946 MEM_LOG("Invalid extended pt command 0x%x", op.cmd);
2947 rc = -ENOSYS;
2948 okay = 0;
2949 break;
2952 if ( unlikely(!okay) )
2954 rc = rc ? rc : -EINVAL;
2955 break;
2958 guest_handle_add_offset(uops, 1);
2961 if ( rc == -EAGAIN )
2962 rc = hypercall_create_continuation(
2963 __HYPERVISOR_mmuext_op, "hihi",
2964 uops, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
2966 process_deferred_ops();
2968 perfc_add(num_mmuext_ops, i);
2970 out:
2971 /* Add incremental work we have done to the @done output parameter. */
2972 if ( unlikely(!guest_handle_is_null(pdone)) )
2974 done += i;
2975 copy_to_guest(pdone, &done, 1);
2978 return rc;
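
The restart machinery in do_mmuext_op() (and in do_mmu_update() below) encodes "this is a resumed continuation" as a flag OR-ed into the count argument and accumulates finished work in *pdone, so a preempted batch can be re-issued with only the remaining operations. A toy model of that encoding; the numeric value of PREEMPTED is a stand-in for MMU_UPDATE_PREEMPTED and hypercall_create_continuation() is replaced by an ordinary retry loop:

#include <stdio.h>

#define PREEMPTED (1u << 31)            /* stand-in for MMU_UPDATE_PREEMPTED */

/* Process up to 'budget' ops per pass; return how many remain. */
static unsigned int run_batch(unsigned int count, unsigned int budget,
                              unsigned int *done)
{
    unsigned int i;

    if ( count & PREEMPTED )            /* resumed continuation: keep total */
        count &= ~PREEMPTED;
    else
        *done = 0;                      /* fresh hypercall */

    for ( i = 0; i < count && i < budget; i++ )
        ;                               /* ... perform one MMUEXT op ... */

    *done += i;
    return count - i;                   /* remaining ops, 0 when finished */
}

int main(void)
{
    unsigned int done = 0, left = run_batch(10, 4, &done);

    while ( left )                      /* what the continuation re-issues */
        left = run_batch(left | PREEMPTED, 4, &done);

    printf("done=%u left=%u\n", done, left);       /* done=10 left=0 */
    return 0;
}
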
2981 int do_mmu_update(
2982 XEN_GUEST_HANDLE(mmu_update_t) ureqs,
2983 unsigned int count,
2984 XEN_GUEST_HANDLE(uint) pdone,
2985 unsigned int foreigndom)
2987 struct mmu_update req;
2988 void *va;
2989 unsigned long gpfn, gmfn, mfn;
2990 struct page_info *page;
2991 int rc = 0, okay = 1, i = 0;
2992 unsigned int cmd, done = 0;
2993 struct domain *d = current->domain;
2994 struct domain_mmap_cache mapcache;
2996 if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
2998 count &= ~MMU_UPDATE_PREEMPTED;
2999 if ( unlikely(!guest_handle_is_null(pdone)) )
3000 (void)copy_from_guest(&done, pdone, 1);
3002 else
3003 perfc_incr(calls_to_mmu_update);
3005 if ( unlikely(!guest_handle_okay(ureqs, count)) )
3007 rc = -EFAULT;
3008 goto out;
3011 if ( !set_foreigndom(foreigndom) )
3013 rc = -ESRCH;
3014 goto out;
3017 domain_mmap_cache_init(&mapcache);
3019 for ( i = 0; i < count; i++ )
3021 if ( hypercall_preempt_check() )
3023 rc = -EAGAIN;
3024 break;
3027 if ( unlikely(__copy_from_guest(&req, ureqs, 1) != 0) )
3029 MEM_LOG("Bad __copy_from_guest");
3030 rc = -EFAULT;
3031 break;
3034 cmd = req.ptr & (sizeof(l1_pgentry_t)-1);
3035 okay = 0;
3037 switch ( cmd )
3039 /*
3040 * MMU_NORMAL_PT_UPDATE: Normal update to any level of page table.
3041 * MMU_PT_UPDATE_PRESERVE_AD: As above, but also preserve (OR in) the
3042 * current A/D bits.
3043 */
3044 case MMU_NORMAL_PT_UPDATE:
3045 case MMU_PT_UPDATE_PRESERVE_AD:
3046 rc = xsm_mmu_normal_update(d, FOREIGNDOM, req.val);
3047 if ( rc )
3048 break;
3050 req.ptr -= cmd;
3051 gmfn = req.ptr >> PAGE_SHIFT;
3052 mfn = gmfn_to_mfn(d, gmfn);
3054 if ( unlikely(!get_page_from_pagenr(mfn, d)) )
3056 MEM_LOG("Could not get page for normal update");
3057 break;
3060 va = map_domain_page_with_cache(mfn, &mapcache);
3061 va = (void *)((unsigned long)va +
3062 (unsigned long)(req.ptr & ~PAGE_MASK));
3063 page = mfn_to_page(mfn);
3065 if ( page_lock(page) )
3067 switch ( page->u.inuse.type_info & PGT_type_mask )
3069 case PGT_l1_page_table:
3071 l1_pgentry_t l1e = l1e_from_intpte(req.val);
3072 okay = mod_l1_entry(va, l1e, mfn,
3073 cmd == MMU_PT_UPDATE_PRESERVE_AD,
3074 current);
3076 break;
3077 case PGT_l2_page_table:
3079 l2_pgentry_t l2e = l2e_from_intpte(req.val);
3080 okay = mod_l2_entry(va, l2e, mfn,
3081 cmd == MMU_PT_UPDATE_PRESERVE_AD,
3082 current);
3084 break;
3085 case PGT_l3_page_table:
3087 l3_pgentry_t l3e = l3e_from_intpte(req.val);
3088 rc = mod_l3_entry(va, l3e, mfn,
3089 cmd == MMU_PT_UPDATE_PRESERVE_AD, 1,
3090 current);
3091 okay = !rc;
3093 break;
3094 #if CONFIG_PAGING_LEVELS >= 4
3095 case PGT_l4_page_table:
3097 l4_pgentry_t l4e = l4e_from_intpte(req.val);
3098 rc = mod_l4_entry(va, l4e, mfn,
3099 cmd == MMU_PT_UPDATE_PRESERVE_AD, 1,
3100 current);
3101 okay = !rc;
3103 break;
3104 #endif
3105 case PGT_writable_page:
3106 perfc_incr(writable_mmu_updates);
3107 okay = paging_write_guest_entry(
3108 current, va, req.val, _mfn(mfn));
3109 break;
3111 page_unlock(page);
3112 if ( rc == -EINTR )
3113 rc = -EAGAIN;
3115 else if ( get_page_type(page, PGT_writable_page) )
3117 perfc_incr(writable_mmu_updates);
3118 okay = paging_write_guest_entry(
3119 current, va, req.val, _mfn(mfn));
3120 put_page_type(page);
3123 unmap_domain_page_with_cache(va, &mapcache);
3124 put_page(page);
3125 break;
3127 case MMU_MACHPHYS_UPDATE:
3129 mfn = req.ptr >> PAGE_SHIFT;
3130 gpfn = req.val;
3132 rc = xsm_mmu_machphys_update(d, mfn);
3133 if ( rc )
3134 break;
3136 if ( unlikely(!get_page_from_pagenr(mfn, FOREIGNDOM)) )
3138 MEM_LOG("Could not get page for mach->phys update");
3139 break;
3142 if ( unlikely(paging_mode_translate(FOREIGNDOM)) )
3144 MEM_LOG("Mach-phys update on auto-translate guest");
3145 break;
3148 set_gpfn_from_mfn(mfn, gpfn);
3149 okay = 1;
3151 paging_mark_dirty(FOREIGNDOM, mfn);
3153 put_page(mfn_to_page(mfn));
3154 break;
3156 default:
3157 MEM_LOG("Invalid page update command %x", cmd);
3158 rc = -ENOSYS;
3159 okay = 0;
3160 break;
3163 if ( unlikely(!okay) )
3165 rc = rc ? rc : -EINVAL;
3166 break;
3169 guest_handle_add_offset(ureqs, 1);
3172 if ( rc == -EAGAIN )
3173 rc = hypercall_create_continuation(
3174 __HYPERVISOR_mmu_update, "hihi",
3175 ureqs, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
3177 process_deferred_ops();
3179 domain_mmap_cache_destroy(&mapcache);
3181 perfc_add(num_page_updates, i);
3183 out:
3184 /* Add incremental work we have done to the @done output parameter. */
3185 if ( unlikely(!guest_handle_is_null(pdone)) )
3187 done += i;
3188 copy_to_guest(pdone, &done, 1);
3191 return rc;
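
In do_mmu_update() the request's ptr field carries both a machine address and a sub-command: page-table entries are 8-byte aligned, so the low bits are free, and the handler recovers them with "cmd = req.ptr & (sizeof(l1_pgentry_t)-1)" and then subtracts cmd back out before using the address. A standalone round-trip of that encoding (the command value used here is arbitrary):

#include <stdint.h>
#include <stdio.h>

#define CMD_MASK ((uint64_t)sizeof(uint64_t) - 1)  /* low bits of aligned ptr */

int main(void)
{
    uint64_t pte_machine_addr = 0x12345678ULL & ~CMD_MASK;  /* 8-byte aligned */
    unsigned int cmd = 1;                          /* some MMU_* sub-command */

    uint64_t req_ptr = pte_machine_addr | cmd;     /* what the guest sends */

    unsigned int decoded_cmd = req_ptr & CMD_MASK; /* hypervisor side */
    uint64_t decoded_addr = req_ptr - decoded_cmd;

    printf("cmd=%u addr=%#llx\n", decoded_cmd,
           (unsigned long long)decoded_addr);
    return 0;
}
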
3195 static int create_grant_pte_mapping(
3196 uint64_t pte_addr, l1_pgentry_t nl1e, struct vcpu *v)
3198 int rc = GNTST_okay;
3199 void *va;
3200 unsigned long gmfn, mfn;
3201 struct page_info *page;
3202 l1_pgentry_t ol1e;
3203 struct domain *d = v->domain;
3205 ASSERT(domain_is_locked(d));
3207 adjust_guest_l1e(nl1e, d);
3209 gmfn = pte_addr >> PAGE_SHIFT;
3210 mfn = gmfn_to_mfn(d, gmfn);
3212 if ( unlikely(!get_page_from_pagenr(mfn, current->domain)) )
3214 MEM_LOG("Could not get page for normal update");
3215 return GNTST_general_error;
3218 va = map_domain_page(mfn);
3219 va = (void *)((unsigned long)va + ((unsigned long)pte_addr & ~PAGE_MASK));
3220 page = mfn_to_page(mfn);
3222 if ( !page_lock(page) )
3224 rc = GNTST_general_error;
3225 goto failed;
3228 if ( (page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table )
3230 page_unlock(page);
3231 rc = GNTST_general_error;
3232 goto failed;
3235 ol1e = *(l1_pgentry_t *)va;
3236 if ( !UPDATE_ENTRY(l1, (l1_pgentry_t *)va, ol1e, nl1e, mfn, v, 0) )
3238 page_unlock(page);
3239 rc = GNTST_general_error;
3240 goto failed;
3243 page_unlock(page);
3245 if ( !paging_mode_refcounts(d) )
3246 put_page_from_l1e(ol1e, d);
3248 failed:
3249 unmap_domain_page(va);
3250 put_page(page);
3252 return rc;
3255 static int destroy_grant_pte_mapping(
3256 uint64_t addr, unsigned long frame, struct domain *d)
3258 int rc = GNTST_okay;
3259 void *va;
3260 unsigned long gmfn, mfn;
3261 struct page_info *page;
3262 l1_pgentry_t ol1e;
3264 gmfn = addr >> PAGE_SHIFT;
3265 mfn = gmfn_to_mfn(d, gmfn);
3267 if ( unlikely(!get_page_from_pagenr(mfn, current->domain)) )
3269 MEM_LOG("Could not get page for normal update");
3270 return GNTST_general_error;
3273 va = map_domain_page(mfn);
3274 va = (void *)((unsigned long)va + ((unsigned long)addr & ~PAGE_MASK));
3275 page = mfn_to_page(mfn);
3277 if ( !page_lock(page) )
3279 rc = GNTST_general_error;
3280 goto failed;
3283 if ( (page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table )
3285 page_unlock(page);
3286 rc = GNTST_general_error;
3287 goto failed;
3290 ol1e = *(l1_pgentry_t *)va;
3292 /* Check that the PTE at the supplied address actually maps the expected frame. */
3293 if ( unlikely(l1e_get_pfn(ol1e) != frame) )
3295 page_unlock(page);
3296 MEM_LOG("PTE entry %lx for address %"PRIx64" doesn't match frame %lx",
3297 (unsigned long)l1e_get_intpte(ol1e), addr, frame);
3298 rc = GNTST_general_error;
3299 goto failed;
3302 /* Delete pagetable entry. */
3303 if ( unlikely(!UPDATE_ENTRY
3304 (l1,
3305 (l1_pgentry_t *)va, ol1e, l1e_empty(), mfn,
3306 d->vcpu[0] /* Change if we go to per-vcpu shadows. */,
3307 0)) )
3309 page_unlock(page);
3310 MEM_LOG("Cannot delete PTE entry at %p", va);
3311 rc = GNTST_general_error;
3312 goto failed;
3315 page_unlock(page);
3317 failed:
3318 unmap_domain_page(va);
3319 put_page(page);
3320 return rc;
3324 static int create_grant_va_mapping(
3325 unsigned long va, l1_pgentry_t nl1e, struct vcpu *v)
3327 l1_pgentry_t *pl1e, ol1e;
3328 struct domain *d = v->domain;
3329 unsigned long gl1mfn;
3330 struct page_info *l1pg;
3331 int okay;
3333 ASSERT(domain_is_locked(d));
3335 adjust_guest_l1e(nl1e, d);
3337 pl1e = guest_map_l1e(v, va, &gl1mfn);
3338 if ( !pl1e )
3340 MEM_LOG("Could not find L1 PTE for address %lx", va);
3341 return GNTST_general_error;
3344 if ( !get_page_from_pagenr(gl1mfn, current->domain) )
3346 guest_unmap_l1e(v, pl1e);
3347 return GNTST_general_error;
3350 l1pg = mfn_to_page(gl1mfn);
3351 if ( !page_lock(l1pg) )
3353 put_page(l1pg);
3354 guest_unmap_l1e(v, pl1e);
3355 return GNTST_general_error;
3358 if ( (l1pg->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table )
3360 page_unlock(l1pg);
3361 put_page(l1pg);
3362 guest_unmap_l1e(v, pl1e);
3363 return GNTST_general_error;
3366 ol1e = *pl1e;
3367 okay = UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, v, 0);
3369 page_unlock(l1pg);
3370 put_page(l1pg);
3371 guest_unmap_l1e(v, pl1e);
3373 if ( okay && !paging_mode_refcounts(d) )
3374 put_page_from_l1e(ol1e, d);
3376 return okay ? GNTST_okay : GNTST_general_error;
3379 static int replace_grant_va_mapping(
3380 unsigned long addr, unsigned long frame, l1_pgentry_t nl1e, struct vcpu *v)
3382 l1_pgentry_t *pl1e, ol1e;
3383 unsigned long gl1mfn;
3384 struct page_info *l1pg;
3385 int rc = 0;
3387 pl1e = guest_map_l1e(v, addr, &gl1mfn);
3388 if ( !pl1e )
3390 MEM_LOG("Could not find L1 PTE for address %lx", addr);
3391 return GNTST_general_error;
3394 if ( !get_page_from_pagenr(gl1mfn, current->domain) )
3396 rc = GNTST_general_error;
3397 goto out;
3400 l1pg = mfn_to_page(gl1mfn);
3401 if ( !page_lock(l1pg) )
3403 rc = GNTST_general_error;
3404 put_page(l1pg);
3405 goto out;
3408 if ( (l1pg->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table )
3410 rc = GNTST_general_error;
3411 goto unlock_and_out;
3414 ol1e = *pl1e;
3416 /* Check that the virtual address supplied is actually mapped to frame. */
3417 if ( unlikely(l1e_get_pfn(ol1e) != frame) )
3419 MEM_LOG("PTE entry %lx for address %lx doesn't match frame %lx",
3420 l1e_get_pfn(ol1e), addr, frame);
3421 rc = GNTST_general_error;
3422 goto unlock_and_out;
3425 /* Delete pagetable entry. */
3426 if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, v, 0)) )
3428 MEM_LOG("Cannot delete PTE entry at %p", (unsigned long *)pl1e);
3429 rc = GNTST_general_error;
3430 goto unlock_and_out;
3433 unlock_and_out:
3434 page_unlock(l1pg);
3435 put_page(l1pg);
3436 out:
3437 guest_unmap_l1e(v, pl1e);
3438 return rc;
3441 static int destroy_grant_va_mapping(
3442 unsigned long addr, unsigned long frame, struct vcpu *v)
3444 return replace_grant_va_mapping(addr, frame, l1e_empty(), v);
3447 int create_grant_host_mapping(uint64_t addr, unsigned long frame,
3448 unsigned int flags, unsigned int cache_flags)
3450 l1_pgentry_t pte = l1e_from_pfn(frame, GRANT_PTE_FLAGS);
3452 if ( (flags & GNTMAP_application_map) )
3453 l1e_add_flags(pte,_PAGE_USER);
3454 if ( !(flags & GNTMAP_readonly) )
3455 l1e_add_flags(pte,_PAGE_RW);
3457 l1e_add_flags(pte,
3458 ((flags >> _GNTMAP_guest_avail0) * _PAGE_AVAIL0)
3459 & _PAGE_AVAIL);
3461 l1e_add_flags(pte, cacheattr_to_pte_flags(cache_flags >> 5));
3463 if ( flags & GNTMAP_contains_pte )
3464 return create_grant_pte_mapping(addr, pte, current);
3465 return create_grant_va_mapping(addr, pte, current);
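
create_grant_host_mapping() builds the grant PTE by translating grant-map flags into x86 PTE flag bits: GNTMAP_application_map adds _PAGE_USER and the absence of GNTMAP_readonly adds _PAGE_RW. A reduced sketch with invented flag values (the real GNTMAP_*/_PAGE_* definitions differ, and the available-bits and cache-attribute handling is omitted):

#include <stdint.h>
#include <stdio.h>

/* Stand-in flag values for illustration only. */
#define GM_READONLY   0x1u
#define GM_APP_MAP    0x2u
#define PTE_PRESENT   0x001u
#define PTE_RW        0x002u
#define PTE_USER      0x004u

static uint64_t grant_pte(uint64_t frame, unsigned int flags)
{
    uint64_t pte = (frame << 12) | PTE_PRESENT;    /* base grant PTE */

    if ( flags & GM_APP_MAP )
        pte |= PTE_USER;                           /* guest userspace visible */
    if ( !(flags & GM_READONLY) )
        pte |= PTE_RW;                             /* writable unless RO */
    return pte;
}

int main(void)
{
    printf("%#llx\n", (unsigned long long)grant_pte(0x1234, GM_APP_MAP));
    printf("%#llx\n", (unsigned long long)grant_pte(0x1234, GM_READONLY));
    return 0;
}
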
3468 int replace_grant_host_mapping(
3469 uint64_t addr, unsigned long frame, uint64_t new_addr, unsigned int flags)
3471 struct vcpu *curr = current;
3472 l1_pgentry_t *pl1e, ol1e;
3473 unsigned long gl1mfn;
3474 struct page_info *l1pg;
3475 int rc;
3477 if ( flags & GNTMAP_contains_pte )
3479 if ( !new_addr )
3480 return destroy_grant_pte_mapping(addr, frame, curr->domain);
3482 MEM_LOG("Unsupported grant table operation");
3483 return GNTST_general_error;
3486 if ( !new_addr )
3487 return destroy_grant_va_mapping(addr, frame, curr);
3489 pl1e = guest_map_l1e(curr, new_addr, &gl1mfn);
3490 if ( !pl1e )
3492 MEM_LOG("Could not find L1 PTE for address %lx",
3493 (unsigned long)new_addr);
3494 return GNTST_general_error;
3497 if ( !get_page_from_pagenr(gl1mfn, current->domain) )
3499 guest_unmap_l1e(curr, pl1e);
3500 return GNTST_general_error;
3503 l1pg = mfn_to_page(gl1mfn);
3504 if ( !page_lock(l1pg) )
3506 put_page(l1pg);
3507 guest_unmap_l1e(curr, pl1e);
3508 return GNTST_general_error;
3511 if ( (l1pg->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table )
3513 page_unlock(l1pg);
3514 put_page(l1pg);
3515 guest_unmap_l1e(curr, pl1e);
3516 return GNTST_general_error;
3519 ol1e = *pl1e;
3521 if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, l1e_empty(),
3522 gl1mfn, curr, 0)) )
3524 page_unlock(l1pg);
3525 put_page(l1pg);
3526 MEM_LOG("Cannot delete PTE entry at %p", (unsigned long *)pl1e);
3527 guest_unmap_l1e(curr, pl1e);
3528 return GNTST_general_error;
3531 page_unlock(l1pg);
3532 put_page(l1pg);
3533 guest_unmap_l1e(curr, pl1e);
3535 rc = replace_grant_va_mapping(addr, frame, ol1e, curr);
3536 if ( rc && !paging_mode_refcounts(curr->domain) )
3537 put_page_from_l1e(ol1e, curr->domain);
3539 return rc;
3542 int steal_page(
3543 struct domain *d, struct page_info *page, unsigned int memflags)
3545 unsigned long x, y;
3547 spin_lock(&d->page_alloc_lock);
3549 if ( is_xen_heap_page(page) || (page_get_owner(page) != d) )
3550 goto fail;
3552 /*
3553 * We require that there be exactly one reference (PGC_allocated). We
3554 * temporarily drop this reference now so that we can safely swizzle the owner.
3555 */
3556 y = page->count_info;
3557 do {
3558 x = y;
3559 if ( (x & (PGC_count_mask|PGC_allocated)) != (1 | PGC_allocated) )
3560 goto fail;
3561 y = cmpxchg(&page->count_info, x, x & ~PGC_count_mask);
3562 } while ( y != x );
3564 /* Swizzle the owner then reinstate the PGC_allocated reference. */
3565 page_set_owner(page, NULL);
3566 y = page->count_info;
3567 do {
3568 x = y;
3569 BUG_ON((x & (PGC_count_mask|PGC_allocated)) != PGC_allocated);
3570 } while ( (y = cmpxchg(&page->count_info, x, x | 1)) != x );
3572 /* Unlink from original owner. */
3573 if ( !(memflags & MEMF_no_refcount) )
3574 d->tot_pages--;
3575 page_list_del(page, &d->page_list);
3577 spin_unlock(&d->page_alloc_lock);
3578 return 0;
3580 fail:
3581 spin_unlock(&d->page_alloc_lock);
3582 MEM_LOG("Bad page %p: ed=%p(%u), sd=%p, caf=%08lx, taf=%" PRtype_info,
3583 (void *)page_to_mfn(page), d, d->domain_id,
3584 page_get_owner(page), page->count_info, page->u.inuse.type_info);
3585 return -1;
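
steal_page() changes a page's owner in two cmpxchg passes: first require exactly one reference plus PGC_allocated and drop the count to zero, then swizzle the owner and reinstate the single reference. A simplified sketch of those two passes on a toy page structure; the real code does all of this under d->page_alloc_lock and checks the owner before touching the count:

#include <stdio.h>

#define ALLOCATED  (1UL << 31)              /* stand-in for PGC_allocated */
#define COUNT_MASK 0x7fffffffUL             /* stand-in for PGC_count_mask */

struct toy_page { unsigned long count_info; int owner; };

/* Move 'pg' away from owner 'from', as steal_page() does. */
static int steal(struct toy_page *pg, int from)
{
    unsigned long x, y = pg->count_info;

    /* Phase 1: require exactly one ref plus ALLOCATED, and drop the ref. */
    do {
        x = y;
        if ( (x & (COUNT_MASK | ALLOCATED)) != (1 | ALLOCATED) )
            return -1;
    } while ( (y = __sync_val_compare_and_swap(&pg->count_info,
                                               x, x & ~COUNT_MASK)) != x );

    /* Phase 2: swizzle the owner, then reinstate the single reference. */
    if ( pg->owner != from )
        return -1;                  /* real code checks this before phase 1 */
    pg->owner = -1;
    __sync_fetch_and_add(&pg->count_info, 1);
    return 0;
}

int main(void)
{
    struct toy_page pg = { .count_info = 1 | ALLOCATED, .owner = 7 };
    int rc = steal(&pg, 7);

    printf("rc=%d count=%#lx owner=%d\n", rc, pg.count_info, pg.owner);
    return 0;
}
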
3588 int do_update_va_mapping(unsigned long va, u64 val64,
3589 unsigned long flags)
3591 l1_pgentry_t val = l1e_from_intpte(val64);
3592 struct vcpu *v = current;
3593 struct domain *d = v->domain;
3594 struct page_info *gl1pg;
3595 l1_pgentry_t *pl1e;
3596 unsigned long vmask, bmap_ptr, gl1mfn;
3597 cpumask_t pmask;
3598 int rc;
3600 perfc_incr(calls_to_update_va);
3602 rc = xsm_update_va_mapping(d, FOREIGNDOM, val);
3603 if ( rc )
3604 return rc;
3606 rc = -EINVAL;
3607 pl1e = guest_map_l1e(v, va, &gl1mfn);
3608 if ( unlikely(!pl1e || !get_page_from_pagenr(gl1mfn, d)) )
3609 goto out;
3611 gl1pg = mfn_to_page(gl1mfn);
3612 if ( !page_lock(gl1pg) )
3614 put_page(gl1pg);
3615 goto out;
3618 if ( (gl1pg->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table )
3620 page_unlock(gl1pg);
3621 put_page(gl1pg);
3622 goto out;
3625 rc = mod_l1_entry(pl1e, val, gl1mfn, 0, v) ? 0 : -EINVAL;
3627 page_unlock(gl1pg);
3628 put_page(gl1pg);
3630 out:
3631 if ( pl1e )
3632 guest_unmap_l1e(v, pl1e);
3634 switch ( flags & UVMF_FLUSHTYPE_MASK )
3636 case UVMF_TLB_FLUSH:
3637 switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
3639 case UVMF_LOCAL:
3640 this_cpu(percpu_mm_info).deferred_ops |= DOP_FLUSH_TLB;
3641 break;
3642 case UVMF_ALL:
3643 this_cpu(percpu_mm_info).deferred_ops |= DOP_FLUSH_ALL_TLBS;
3644 break;
3645 default:
3646 if ( this_cpu(percpu_mm_info).deferred_ops & DOP_FLUSH_ALL_TLBS )
3647 break;
3648 if ( unlikely(!is_pv_32on64_domain(d) ?
3649 get_user(vmask, (unsigned long *)bmap_ptr) :
3650 get_user(vmask, (unsigned int *)bmap_ptr)) )
3651 rc = -EFAULT, vmask = 0;
3652 pmask = vcpumask_to_pcpumask(d, vmask);
3653 if ( cpu_isset(smp_processor_id(), pmask) )
3654 this_cpu(percpu_mm_info).deferred_ops &= ~DOP_FLUSH_TLB;
3655 flush_tlb_mask(pmask);
3656 break;
3658 break;
3660 case UVMF_INVLPG:
3661 if ( this_cpu(percpu_mm_info).deferred_ops & DOP_FLUSH_ALL_TLBS )
3662 break;
3663 switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
3665 case UVMF_LOCAL:
3666 if ( this_cpu(percpu_mm_info).deferred_ops & DOP_FLUSH_TLB )
3667 break;
3668 if ( !paging_mode_enabled(d) ||
3669 (paging_invlpg(v, va) != 0) )
3670 flush_tlb_one_local(va);
3671 break;
3672 case UVMF_ALL:
3673 flush_tlb_one_mask(d->domain_dirty_cpumask, va);
3674 break;
3675 default:
3676 if ( unlikely(!is_pv_32on64_domain(d) ?
3677 get_user(vmask, (unsigned long *)bmap_ptr) :
3678 get_user(vmask, (unsigned int *)bmap_ptr)) )
3679 rc = -EFAULT, vmask = 0;
3680 pmask = vcpumask_to_pcpumask(d, vmask);
3681 if ( this_cpu(percpu_mm_info).deferred_ops & DOP_FLUSH_TLB )
3682 cpu_clear(smp_processor_id(), pmask);
3683 flush_tlb_one_mask(pmask, va);
3684 break;
3686 break;
3689 process_deferred_ops();
3691 return rc;
3694 int do_update_va_mapping_otherdomain(unsigned long va, u64 val64,
3695 unsigned long flags,
3696 domid_t domid)
3698 int rc;
3700 if ( !set_foreigndom(domid) )
3701 return -ESRCH;
3703 rc = do_update_va_mapping(va, val64, flags);
3705 BUG_ON(this_cpu(percpu_mm_info).deferred_ops);
3706 process_deferred_ops(); /* only to clear foreigndom */
3708 return rc;
3713 /*************************
3714 * Descriptor Tables
3715 */
3717 void destroy_gdt(struct vcpu *v)
3719 int i;
3720 unsigned long pfn;
3722 v->arch.guest_context.gdt_ents = 0;
3723 for ( i = 0; i < FIRST_RESERVED_GDT_PAGE; i++ )
3725 if ( (pfn = l1e_get_pfn(v->arch.perdomain_ptes[i])) != 0 )
3726 put_page_and_type(mfn_to_page(pfn));
3727 l1e_write(&v->arch.perdomain_ptes[i], l1e_empty());
3728 v->arch.guest_context.gdt_frames[i] = 0;
3733 long set_gdt(struct vcpu *v,
3734 unsigned long *frames,
3735 unsigned int entries)
3737 struct domain *d = v->domain;
3738 /* NB. There are 512 8-byte entries per GDT page. */
3739 int i, nr_pages = (entries + 511) / 512;
3740 unsigned long mfn;
3742 if ( entries > FIRST_RESERVED_GDT_ENTRY )
3743 return -EINVAL;
3745 /* Check the pages in the new GDT. */
3746 for ( i = 0; i < nr_pages; i++ )
3748 mfn = frames[i] = gmfn_to_mfn(d, frames[i]);
3749 if ( !mfn_valid(mfn) ||
3750 !get_page_and_type(mfn_to_page(mfn), d, PGT_seg_desc_page) )
3751 goto fail;
3754 /* Tear down the old GDT. */
3755 destroy_gdt(v);
3757 /* Install the new GDT. */
3758 v->arch.guest_context.gdt_ents = entries;
3759 for ( i = 0; i < nr_pages; i++ )
3761 v->arch.guest_context.gdt_frames[i] = frames[i];
3762 l1e_write(&v->arch.perdomain_ptes[i],
3763 l1e_from_pfn(frames[i], __PAGE_HYPERVISOR));
3766 return 0;
3768 fail:
3769 while ( i-- > 0 )
3770 put_page_and_type(mfn_to_page(frames[i]));
3771 return -EINVAL;
3775 long do_set_gdt(XEN_GUEST_HANDLE(ulong) frame_list, unsigned int entries)
3777 int nr_pages = (entries + 511) / 512;
3778 unsigned long frames[16];
3779 struct vcpu *curr = current;
3780 long ret;
3782 /* Rechecked in set_gdt, but ensures a sane limit for copy_from_guest(). */
3783 if ( entries > FIRST_RESERVED_GDT_ENTRY )
3784 return -EINVAL;
3786 if ( copy_from_guest(frames, frame_list, nr_pages) )
3787 return -EFAULT;
3789 domain_lock(curr->domain);
3791 if ( (ret = set_gdt(curr, frames, entries)) == 0 )
3792 flush_tlb_local();
3794 domain_unlock(curr->domain);
3796 return ret;
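
set_gdt()/do_set_gdt() size the guest GDT in whole pages: with 512 eight-byte descriptors per page, nr_pages = (entries + 511) / 512, which is why the 16-entry frames[] array in do_set_gdt() covers up to 8192 descriptors (entries itself is additionally bounded by FIRST_RESERVED_GDT_ENTRY). A quick check of that arithmetic:

#include <stdio.h>

int main(void)
{
    /* 512 eight-byte descriptors fit in one 4kB GDT page. */
    unsigned int entries[] = { 1, 512, 513, 8192 };
    unsigned int i;

    for ( i = 0; i < sizeof(entries) / sizeof(entries[0]); i++ )
        printf("%4u entries -> %u page(s)\n",
               entries[i], (entries[i] + 511) / 512);
    return 0;
}
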
3800 long do_update_descriptor(u64 pa, u64 desc)
3802 struct domain *dom = current->domain;
3803 unsigned long gmfn = pa >> PAGE_SHIFT;
3804 unsigned long mfn;
3805 unsigned int offset;
3806 struct desc_struct *gdt_pent, d;
3807 struct page_info *page;
3808 long ret = -EINVAL;
3810 offset = ((unsigned int)pa & ~PAGE_MASK) / sizeof(struct desc_struct);
3812 *(u64 *)&d = desc;
3814 mfn = gmfn_to_mfn(dom, gmfn);
3815 if ( (((unsigned int)pa % sizeof(struct desc_struct)) != 0) ||
3816 !mfn_valid(mfn) ||
3817 !check_descriptor(dom, &d) )
3818 return -EINVAL;
3820 page = mfn_to_page(mfn);
3821 if ( unlikely(!get_page(page, dom)) )
3822 return -EINVAL;
3824 /* Check if the given frame is in use in an unsafe context. */
3825 switch ( page->u.inuse.type_info & PGT_type_mask )
3827 case PGT_seg_desc_page:
3828 if ( unlikely(!get_page_type(page, PGT_seg_desc_page)) )
3829 goto out;
3830 break;
3831 default:
3832 if ( unlikely(!get_page_type(page, PGT_writable_page)) )
3833 goto out;
3834 break;
3837 paging_mark_dirty(dom, mfn);
3839 /* All is good so make the update. */
3840 gdt_pent = map_domain_page(mfn);
3841 atomic_write64((uint64_t *)&gdt_pent[offset], *(uint64_t *)&d);
3842 unmap_domain_page(gdt_pent);
3844 put_page_type(page);
3846 ret = 0; /* success */
3848 out:
3849 put_page(page);
3851 return ret;
3854 typedef struct e820entry e820entry_t;
3855 DEFINE_XEN_GUEST_HANDLE(e820entry_t);
3857 long arch_memory_op(int op, XEN_GUEST_HANDLE(void) arg)
3859 struct page_info *page = NULL;
3860 int rc;
3862 switch ( op )
3864 case XENMEM_add_to_physmap:
3866 struct xen_add_to_physmap xatp;
3867 unsigned long prev_mfn, mfn = 0, gpfn;
3868 struct domain *d;
3870 if ( copy_from_guest(&xatp, arg, 1) )
3871 return -EFAULT;
3873 rc = rcu_lock_target_domain_by_id(xatp.domid, &d);
3874 if ( rc != 0 )
3875 return rc;
3877 if ( xsm_add_to_physmap(current->domain, d) )
3879 rcu_unlock_domain(d);
3880 return -EPERM;
3883 switch ( xatp.space )
3885 case XENMAPSPACE_shared_info:
3886 if ( xatp.idx == 0 )
3887 mfn = virt_to_mfn(d->shared_info);
3888 break;
3889 case XENMAPSPACE_grant_table:
3890 spin_lock(&d->grant_table->lock);
3892 if ( (xatp.idx >= nr_grant_frames(d->grant_table)) &&
3893 (xatp.idx < max_nr_grant_frames) )
3894 gnttab_grow_table(d, xatp.idx + 1);
3896 if ( xatp.idx < nr_grant_frames(d->grant_table) )
3897 mfn = virt_to_mfn(d->grant_table->shared[xatp.idx]);
3899 spin_unlock(&d->grant_table->lock);
3900 break;
3901 case XENMAPSPACE_gmfn:
3902 xatp.idx = gmfn_to_mfn(d, xatp.idx);
3903 if ( !get_page_from_pagenr(xatp.idx, d) )
3904 break;
3905 mfn = xatp.idx;
3906 page = mfn_to_page(mfn);
3907 break;
3908 default:
3909 break;
3912 if ( !paging_mode_translate(d) || (mfn == 0) )
3914 if ( page )
3915 put_page(page);
3916 rcu_unlock_domain(d);
3917 return -EINVAL;
3920 domain_lock(d);
3922 /* Remove previously mapped page if it was present. */
3923 prev_mfn = gmfn_to_mfn(d, xatp.gpfn);
3924 if ( mfn_valid(prev_mfn) )
3926 if ( is_xen_heap_mfn(prev_mfn) )
3927 /* Xen heap frames are simply unhooked from this phys slot. */
3928 guest_physmap_remove_page(d, xatp.gpfn, prev_mfn, 0);
3929 else
3930 /* Normal domain memory is freed, to avoid leaking memory. */
3931 guest_remove_page(d, xatp.gpfn);
3934 /* Unmap from old location, if any. */
3935 gpfn = get_gpfn_from_mfn(mfn);
3936 if ( gpfn != INVALID_M2P_ENTRY )
3937 guest_physmap_remove_page(d, gpfn, mfn, 0);
3939 /* Map at new location. */
3940 guest_physmap_add_page(d, xatp.gpfn, mfn, 0);
3942 domain_unlock(d);
3944 if ( page )
3945 put_page(page);
3947 rcu_unlock_domain(d);
3949 break;
3952 case XENMEM_set_memory_map:
3954 struct xen_foreign_memory_map fmap;
3955 struct domain *d;
3956 int rc;
3958 if ( copy_from_guest(&fmap, arg, 1) )
3959 return -EFAULT;
3961 if ( fmap.map.nr_entries > ARRAY_SIZE(d->arch.e820) )
3962 return -EINVAL;
3964 rc = rcu_lock_target_domain_by_id(fmap.domid, &d);
3965 if ( rc != 0 )
3966 return rc;
3968 rc = xsm_domain_memory_map(d);
3969 if ( rc )
3971 rcu_unlock_domain(d);
3972 return rc;
3975 rc = copy_from_guest(d->arch.e820, fmap.map.buffer,
3976 fmap.map.nr_entries) ? -EFAULT : 0;
3977 d->arch.nr_e820 = fmap.map.nr_entries;
3979 rcu_unlock_domain(d);
3980 return rc;
3983 case XENMEM_memory_map:
3985 struct xen_memory_map map;
3986 struct domain *d = current->domain;
3988 /* Backwards compatibility. */
3989 if ( d->arch.nr_e820 == 0 )
3990 return -ENOSYS;
3992 if ( copy_from_guest(&map, arg, 1) )
3993 return -EFAULT;
3995 map.nr_entries = min(map.nr_entries, d->arch.nr_e820);
3996 if ( copy_to_guest(map.buffer, d->arch.e820, map.nr_entries) ||
3997 copy_to_guest(arg, &map, 1) )
3998 return -EFAULT;
4000 return 0;
4003 case XENMEM_machine_memory_map:
4005 struct xen_memory_map memmap;
4006 XEN_GUEST_HANDLE(e820entry_t) buffer;
4007 int count;
4008 int rc;
4010 if ( !IS_PRIV(current->domain) )
4011 return -EINVAL;
4013 rc = xsm_machine_memory_map();
4014 if ( rc )
4015 return rc;
4017 if ( copy_from_guest(&memmap, arg, 1) )
4018 return -EFAULT;
4019 if ( memmap.nr_entries < e820.nr_map + 1 )
4020 return -EINVAL;
4022 buffer = guest_handle_cast(memmap.buffer, e820entry_t);
4024 count = min((unsigned int)e820.nr_map, memmap.nr_entries);
4025 if ( copy_to_guest(buffer, e820.map, count) < 0 )
4026 return -EFAULT;
4028 memmap.nr_entries = count;
4030 if ( copy_to_guest(arg, &memmap, 1) )
4031 return -EFAULT;
4033 return 0;
4036 case XENMEM_machphys_mapping:
4038 static const struct xen_machphys_mapping mapping = {
4039 .v_start = MACH2PHYS_VIRT_START,
4040 .v_end = MACH2PHYS_VIRT_END,
4041 .max_mfn = MACH2PHYS_NR_ENTRIES - 1
4042 };
4044 if ( copy_to_guest(arg, &mapping, 1) )
4045 return -EFAULT;
4047 return 0;
4050 case XENMEM_set_pod_target:
4051 case XENMEM_get_pod_target:
4053 xen_pod_target_t target;
4054 struct domain *d;
4056 /* Support DOMID_SELF? */
4057 if ( !IS_PRIV(current->domain) )
4058 return -EINVAL;
4060 if ( copy_from_guest(&target, arg, 1) )
4061 return -EFAULT;
4063 rc = rcu_lock_target_domain_by_id(target.domid, &d);
4064 if ( rc != 0 )
4065 return rc;
4067 if ( op == XENMEM_set_pod_target )
4069 if ( target.target_pages > d->max_pages )
4071 rc = -EINVAL;
4072 goto pod_target_out_unlock;
4075 rc = p2m_pod_set_mem_target(d, target.target_pages);
4078 target.tot_pages = d->tot_pages;
4079 target.pod_cache_pages = d->arch.p2m->pod.count;
4080 target.pod_entries = d->arch.p2m->pod.entry_count;
4082 if ( copy_to_guest(arg, &target, 1) )
4084 rc= -EFAULT;
4085 goto pod_target_out_unlock;
4088 pod_target_out_unlock:
4089 rcu_unlock_domain(d);
4090 return rc;
4093 default:
4094 return subarch_memory_op(op, arg);
4097 return 0;
4101 /*************************
4102 * Writable Pagetables
4103 */
4105 struct ptwr_emulate_ctxt {
4106 struct x86_emulate_ctxt ctxt;
4107 unsigned long cr2;
4108 l1_pgentry_t pte;
4109 };
4111 static int ptwr_emulated_read(
4112 enum x86_segment seg,
4113 unsigned long offset,
4114 void *p_data,
4115 unsigned int bytes,
4116 struct x86_emulate_ctxt *ctxt)
4118 unsigned int rc;
4119 unsigned long addr = offset;
4121 if ( (rc = copy_from_user(p_data, (void *)addr, bytes)) != 0 )
4123 propagate_page_fault(addr + bytes - rc, 0); /* read fault */
4124 return X86EMUL_EXCEPTION;
4127 return X86EMUL_OKAY;
4130 static int ptwr_emulated_update(
4131 unsigned long addr,
4132 paddr_t old,
4133 paddr_t val,
4134 unsigned int bytes,
4135 unsigned int do_cmpxchg,
4136 struct ptwr_emulate_ctxt *ptwr_ctxt)
4138 unsigned long mfn;
4139 unsigned long unaligned_addr = addr;
4140 struct page_info *page;
4141 l1_pgentry_t pte, ol1e, nl1e, *pl1e;
4142 struct vcpu *v = current;
4143 struct domain *d = v->domain;
4145 /* Only allow naturally-aligned stores within the original %cr2 page. */
4146 if ( unlikely(((addr^ptwr_ctxt->cr2) & PAGE_MASK) || (addr & (bytes-1))) )
4148 MEM_LOG("ptwr_emulate: bad access (cr2=%lx, addr=%lx, bytes=%u)",
4149 ptwr_ctxt->cr2, addr, bytes);
4150 return X86EMUL_UNHANDLEABLE;
4153 /* Turn a sub-word access into a full-word access. */
4154 if ( bytes != sizeof(paddr_t) )
4156 paddr_t full;
4157 unsigned int rc, offset = addr & (sizeof(paddr_t)-1);
4159 /* Align address; read full word. */
4160 addr &= ~(sizeof(paddr_t)-1);
4161 if ( (rc = copy_from_user(&full, (void *)addr, sizeof(paddr_t))) != 0 )
4163 propagate_page_fault(addr+sizeof(paddr_t)-rc, 0); /* read fault */
4164 return X86EMUL_EXCEPTION;
4166 /* Mask out bits provided by caller. */
4167 full &= ~((((paddr_t)1 << (bytes*8)) - 1) << (offset*8));
4168 /* Shift the caller value and OR in the missing bits. */
4169 val &= (((paddr_t)1 << (bytes*8)) - 1);
4170 val <<= (offset)*8;
4171 val |= full;
4172 /* Also fill in missing parts of the cmpxchg old value. */
4173 old &= (((paddr_t)1 << (bytes*8)) - 1);
4174 old <<= (offset)*8;
4175 old |= full;
4178 pte = ptwr_ctxt->pte;
4179 mfn = l1e_get_pfn(pte);
4180 page = mfn_to_page(mfn);
4182 /* We are looking only for read-only mappings of p.t. pages. */
4183 ASSERT((l1e_get_flags(pte) & (_PAGE_RW|_PAGE_PRESENT)) == _PAGE_PRESENT);
4184 ASSERT(mfn_valid(mfn));
4185 ASSERT((page->u.inuse.type_info & PGT_type_mask) == PGT_l1_page_table);
4186 ASSERT((page->u.inuse.type_info & PGT_count_mask) != 0);
4187 ASSERT(page_get_owner(page) == d);
4189 /* Check the new PTE. */
4190 nl1e = l1e_from_intpte(val);
4191 if ( unlikely(!get_page_from_l1e(nl1e, d)) )
4193 if ( is_pv_32bit_domain(d) && (bytes == 4) && (unaligned_addr & 4) &&
4194 !do_cmpxchg && (l1e_get_flags(nl1e) & _PAGE_PRESENT) )
4196 /*
4197 * If this is an upper-half write to a PAE PTE then we assume that
4198 * the guest has simply got the two writes the wrong way round. We
4199 * zap the PRESENT bit on the assumption that the bottom half will
4200 * be written immediately after we return to the guest.
4201 */
4202 MEM_LOG("ptwr_emulate: fixing up invalid PAE PTE %"PRIpte,
4203 l1e_get_intpte(nl1e));
4204 l1e_remove_flags(nl1e, _PAGE_PRESENT);
4206 else
4208 MEM_LOG("ptwr_emulate: could not get_page_from_l1e()");
4209 return X86EMUL_UNHANDLEABLE;
4213 adjust_guest_l1e(nl1e, d);
4215 /* Checked successfully: do the update (write or cmpxchg). */
4216 pl1e = map_domain_page(mfn);
4217 pl1e = (l1_pgentry_t *)((unsigned long)pl1e + (addr & ~PAGE_MASK));
4218 if ( do_cmpxchg )
4220 int okay;
4221 intpte_t t = old;
4222 ol1e = l1e_from_intpte(old);
4224 okay = paging_cmpxchg_guest_entry(v, &l1e_get_intpte(*pl1e),
4225 &t, l1e_get_intpte(nl1e), _mfn(mfn));
4226 okay = (okay && t == old);
4228 if ( !okay )
4230 unmap_domain_page(pl1e);
4231 put_page_from_l1e(nl1e, d);
4232 return X86EMUL_CMPXCHG_FAILED;
4235 else
4237 ol1e = *pl1e;
4238 if ( !UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, mfn, v, 0) )
4239 BUG();
4242 trace_ptwr_emulation(addr, nl1e);
4244 unmap_domain_page(pl1e);
4246 /* Finally, drop the old PTE. */
4247 put_page_from_l1e(ol1e, d);
4249 return X86EMUL_OKAY;
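
The sub-word handling in ptwr_emulated_update() turns a 1-, 2- or 4-byte guest store into a full PTE-sized write: read the aligned word, clear the bytes the guest is supplying, shift the guest value into place and OR the remainder back in (the cmpxchg old value gets the same treatment). The same arithmetic as a standalone worked example for a 2-byte store at byte offset 2:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint64_t full   = 0x1122334455667788ULL;    /* current aligned word */
    uint64_t val    = 0xBEEF;                   /* guest's 2-byte store */
    unsigned int bytes = 2, offset = 2;         /* write hits bytes 2..3 */

    /* Mask out the bytes the caller is providing ... */
    full &= ~((((uint64_t)1 << (bytes * 8)) - 1) << (offset * 8));
    /* ... shift the caller's value into place and OR in the missing bits. */
    val  &= (((uint64_t)1 << (bytes * 8)) - 1);
    val <<= offset * 8;
    val  |= full;

    printf("%#llx\n", (unsigned long long)val); /* 0x11223344beef7788 */
    return 0;
}
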
4252 static int ptwr_emulated_write(
4253 enum x86_segment seg,
4254 unsigned long offset,
4255 void *p_data,
4256 unsigned int bytes,
4257 struct x86_emulate_ctxt *ctxt)
4259 paddr_t val = 0;
4261 if ( (bytes > sizeof(paddr_t)) || (bytes & (bytes -1)) )
4263 MEM_LOG("ptwr_emulate: bad write size (addr=%lx, bytes=%u)",
4264 offset, bytes);
4265 return X86EMUL_UNHANDLEABLE;
4268 memcpy(&val, p_data, bytes);
4270 return ptwr_emulated_update(
4271 offset, 0, val, bytes, 0,
4272 container_of(ctxt, struct ptwr_emulate_ctxt, ctxt));
4275 static int ptwr_emulated_cmpxchg(
4276 enum x86_segment seg,
4277 unsigned long offset,
4278 void *p_old,
4279 void *p_new,
4280 unsigned int bytes,
4281 struct x86_emulate_ctxt *ctxt)
4283 paddr_t old = 0, new = 0;
4285 if ( (bytes > sizeof(paddr_t)) || (bytes & (bytes -1)) )
4287 MEM_LOG("ptwr_emulate: bad cmpxchg size (addr=%lx, bytes=%u)",
4288 offset, bytes);
4289 return X86EMUL_UNHANDLEABLE;
4292 memcpy(&old, p_old, bytes);
4293 memcpy(&new, p_new, bytes);
4295 return ptwr_emulated_update(
4296 offset, old, new, bytes, 1,
4297 container_of(ctxt, struct ptwr_emulate_ctxt, ctxt));
4300 static struct x86_emulate_ops ptwr_emulate_ops = {
4301 .read = ptwr_emulated_read,
4302 .insn_fetch = ptwr_emulated_read,
4303 .write = ptwr_emulated_write,
4304 .cmpxchg = ptwr_emulated_cmpxchg,
4305 };
4307 /* Write page fault handler: check if guest is trying to modify a PTE. */
4308 int ptwr_do_page_fault(struct vcpu *v, unsigned long addr,
4309 struct cpu_user_regs *regs)
4311 struct domain *d = v->domain;
4312 struct page_info *page;
4313 l1_pgentry_t pte;
4314 struct ptwr_emulate_ctxt ptwr_ctxt;
4315 int rc;
4317 /* Attempt to read the PTE that maps the VA being accessed. */
4318 guest_get_eff_l1e(v, addr, &pte);
4320 /* We are looking only for read-only mappings of p.t. pages. */
4321 if ( ((l1e_get_flags(pte) & (_PAGE_PRESENT|_PAGE_RW)) != _PAGE_PRESENT) ||
4322 !get_page_from_pagenr(l1e_get_pfn(pte), d) )
4323 goto bail;
4325 page = l1e_get_page(pte);
4326 if ( !page_lock(page) )
4328 put_page(page);
4329 goto bail;
4332 if ( (page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table )
4334 page_unlock(page);
4335 put_page(page);
4336 goto bail;
4339 ptwr_ctxt.ctxt.regs = regs;
4340 ptwr_ctxt.ctxt.force_writeback = 0;
4341 ptwr_ctxt.ctxt.addr_size = ptwr_ctxt.ctxt.sp_size =
4342 is_pv_32on64_domain(d) ? 32 : BITS_PER_LONG;
4343 ptwr_ctxt.cr2 = addr;
4344 ptwr_ctxt.pte = pte;
4346 rc = x86_emulate(&ptwr_ctxt.ctxt, &ptwr_emulate_ops);
4348 page_unlock(page);
4349 put_page(page);
4351 if ( rc == X86EMUL_UNHANDLEABLE )
4352 goto bail;
4354 perfc_incr(ptwr_emulations);
4355 return EXCRET_fault_fixed;
4357 bail:
4358 return 0;
4361 void free_xen_pagetable(void *v)
4363 extern int early_boot;
4365 if ( early_boot )
4366 return;
4368 if ( is_xen_heap_page(virt_to_page(v)) )
4369 free_xenheap_page(v);
4370 else
4371 free_domheap_page(virt_to_page(v));
4374 /* Convert to/from superpage-mapping flags for map_pages_to_xen(). */
4375 #define l1f_to_lNf(f) (((f) & _PAGE_PRESENT) ? ((f) | _PAGE_PSE) : (f))
4376 #define lNf_to_l1f(f) (((f) & _PAGE_PRESENT) ? ((f) & ~_PAGE_PSE) : (f))
4378 /*
4379 * map_pages_to_xen() can be called with interrupts disabled:
4380 * * During early bootstrap; or
4381 * * From alloc_xenheap_pages() via memguard_guard_range().
4382 * In these cases it is safe to use flush_area_local():
4383 * * Because only the local CPU is online; or
4384 * * Because stale TLB entries do not matter for memguard_[un]guard_range().
4385 */
4386 #define flush_area(v,f) (!local_irq_is_enabled() ? \
4387 flush_area_local((const void *)v, f) : \
4388 flush_area_all((const void *)v, f))
4390 int map_pages_to_xen(
4391 unsigned long virt,
4392 unsigned long mfn,
4393 unsigned long nr_mfns,
4394 unsigned int flags)
4396 l2_pgentry_t *pl2e, ol2e;
4397 l1_pgentry_t *pl1e, ol1e;
4398 unsigned int i;
4400 while ( nr_mfns != 0 )
4402 #ifdef __x86_64__
4403 l3_pgentry_t *pl3e = virt_to_xen_l3e(virt);
4404 l3_pgentry_t ol3e = *pl3e;
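/* Use a 1GB mapping when the CPU supports it, virt and mfn are both 1GB-aligned, at least 1GB worth of frames remains, and the caller did not request PAT or small pages. */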
4406 if ( cpu_has_page1gb &&
4407 !(((virt >> PAGE_SHIFT) | mfn) &
4408 ((1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1)) &&
4409 nr_mfns >= (1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) &&
4410 !(flags & (_PAGE_PAT | MAP_SMALL_PAGES)) )
4412 /* 1GB-page mapping. */
4413 l3e_write_atomic(pl3e, l3e_from_pfn(mfn, l1f_to_lNf(flags)));
4415 if ( (l3e_get_flags(ol3e) & _PAGE_PRESENT) )
4417 unsigned int flush_flags =
4418 FLUSH_TLB | FLUSH_ORDER(2 * PAGETABLE_ORDER);
4420 if ( l3e_get_flags(ol3e) & _PAGE_PSE )
4422 if ( l3e_get_flags(ol3e) & _PAGE_GLOBAL )
4423 flush_flags |= FLUSH_TLB_GLOBAL;
4424 if ( (lNf_to_l1f(l3e_get_flags(ol3e)) ^ flags) &
4425 PAGE_CACHE_ATTRS )
4426 flush_flags |= FLUSH_CACHE;
4427 flush_area(virt, flush_flags);
4429 else
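/* The old L3E pointed to an L2 table: accumulate TLB/cache flush requirements from every entry beneath it, flush, then free the now-unreferenced L1 and L2 tables. */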
4431 pl2e = l3e_to_l2e(ol3e);
4432 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
4434 ol2e = pl2e[i];
4435 if ( !(l2e_get_flags(ol2e) & _PAGE_PRESENT) )
4436 continue;
4437 if ( l2e_get_flags(ol2e) & _PAGE_PSE )
4439 if ( l2e_get_flags(ol2e) & _PAGE_GLOBAL )
4440 flush_flags |= FLUSH_TLB_GLOBAL;
4441 if ( (lNf_to_l1f(l2e_get_flags(ol2e)) ^ flags) &
4442 PAGE_CACHE_ATTRS )
4443 flush_flags |= FLUSH_CACHE;
4445 else
4447 unsigned int j;
4449 pl1e = l2e_to_l1e(ol2e);
4450 for ( j = 0; j < L1_PAGETABLE_ENTRIES; j++ )
4452 ol1e = pl1e[j];
4453 if ( l1e_get_flags(ol1e) & _PAGE_GLOBAL )
4454 flush_flags |= FLUSH_TLB_GLOBAL;
4455 if ( (l1e_get_flags(ol1e) ^ flags) &
4456 PAGE_CACHE_ATTRS )
4457 flush_flags |= FLUSH_CACHE;
4461 flush_area(virt, flush_flags);
4462 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
4464 ol2e = pl2e[i];
4465 if ( (l2e_get_flags(ol2e) & _PAGE_PRESENT) &&
4466 !(l2e_get_flags(ol2e) & _PAGE_PSE) )
4467 free_xen_pagetable(l2e_to_l1e(ol2e));
4469 free_xen_pagetable(pl2e);
4473 virt += 1UL << L3_PAGETABLE_SHIFT;
4474 mfn += 1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT);
4475 nr_mfns -= 1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT);
4476 continue;
4479 if ( (l3e_get_flags(ol3e) & _PAGE_PRESENT) &&
4480 (l3e_get_flags(ol3e) & _PAGE_PSE) )
4482 unsigned int flush_flags =
4483 FLUSH_TLB | FLUSH_ORDER(2 * PAGETABLE_ORDER);
4485 /* Skip this PTE if there is no change. */
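/* i.e. the existing 1GB mapping already yields this mfn for virt, and its flags differ at most in the accessed/dirty bits. */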
4486 if ( ((l3e_get_pfn(ol3e) & ~(L2_PAGETABLE_ENTRIES *
4487 L1_PAGETABLE_ENTRIES - 1)) +
4488 (l2_table_offset(virt) << PAGETABLE_ORDER) +
4489 l1_table_offset(virt) == mfn) &&
4490 ((lNf_to_l1f(l3e_get_flags(ol3e)) ^ flags) &
4491 ~(_PAGE_ACCESSED|_PAGE_DIRTY)) == 0 )
4493 /* We can skip to the end of the L3 superpage if we got a match. */
4494 i = (1 << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) -
4495 (mfn & ((1 << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1));
4496 if ( i > nr_mfns )
4497 i = nr_mfns;
4498 virt += i << PAGE_SHIFT;
4499 mfn += i;
4500 nr_mfns -= i;
4501 continue;
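/* Otherwise shatter the 1GB mapping: build an L2 table of 2MB entries replicating it, then install that table in place of the superpage. */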
4504 pl2e = alloc_xen_pagetable();
4505 if ( pl2e == NULL )
4506 return -ENOMEM;
4508 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
4509 l2e_write(pl2e + i,
4510 l2e_from_pfn(l3e_get_pfn(ol3e) +
4511 (i << PAGETABLE_ORDER),
4512 l3e_get_flags(ol3e)));
4514 if ( l3e_get_flags(ol3e) & _PAGE_GLOBAL )
4515 flush_flags |= FLUSH_TLB_GLOBAL;
4517 l3e_write_atomic(pl3e, l3e_from_pfn(virt_to_mfn(pl2e),
4518 __PAGE_HYPERVISOR));
4519 flush_area(virt, flush_flags);
4521 #endif
4523 pl2e = virt_to_xen_l2e(virt);
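/* Use a 2MB superpage when virt and mfn share 2MB alignment, at least 512 frames remain, and PAT/small pages were not requested. */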
4525 if ( ((((virt>>PAGE_SHIFT) | mfn) & ((1<<PAGETABLE_ORDER)-1)) == 0) &&
4526 (nr_mfns >= (1<<PAGETABLE_ORDER)) &&
4527 !(flags & (_PAGE_PAT|MAP_SMALL_PAGES)) )
4529 /* Super-page mapping. */
4530 ol2e = *pl2e;
4531 l2e_write_atomic(pl2e, l2e_from_pfn(mfn, l1f_to_lNf(flags)));
4533 if ( (l2e_get_flags(ol2e) & _PAGE_PRESENT) )
4535 unsigned int flush_flags =
4536 FLUSH_TLB | FLUSH_ORDER(PAGETABLE_ORDER);
4538 if ( l2e_get_flags(ol2e) & _PAGE_PSE )
4540 if ( l2e_get_flags(ol2e) & _PAGE_GLOBAL )
4541 flush_flags |= FLUSH_TLB_GLOBAL;
4542 if ( (lNf_to_l1f(l2e_get_flags(ol2e)) ^ flags) &
4543 PAGE_CACHE_ATTRS )
4544 flush_flags |= FLUSH_CACHE;
4545 flush_area(virt, flush_flags);
4547 else
4549 pl1e = l2e_to_l1e(ol2e);
4550 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
4552 if ( l1e_get_flags(pl1e[i]) & _PAGE_GLOBAL )
4553 flush_flags |= FLUSH_TLB_GLOBAL;
4554 if ( (l1e_get_flags(pl1e[i]) ^ flags) &
4555 PAGE_CACHE_ATTRS )
4556 flush_flags |= FLUSH_CACHE;
4558 flush_area(virt, flush_flags);
4559 free_xen_pagetable(pl1e);
4563 virt += 1UL << L2_PAGETABLE_SHIFT;
4564 mfn += 1UL << PAGETABLE_ORDER;
4565 nr_mfns -= 1UL << PAGETABLE_ORDER;
4567 else
4569 /* Normal page mapping. */
4570 if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) )
4572 pl1e = alloc_xen_pagetable();
4573 if ( pl1e == NULL )
4574 return -ENOMEM;
4575 clear_page(pl1e);
4576 l2e_write(pl2e, l2e_from_pfn(virt_to_mfn(pl1e),
4577 __PAGE_HYPERVISOR));
4579 else if ( l2e_get_flags(*pl2e) & _PAGE_PSE )
4581 unsigned int flush_flags =
4582 FLUSH_TLB | FLUSH_ORDER(PAGETABLE_ORDER);
4584 /* Skip this PTE if there is no change. */
4585 if ( (((l2e_get_pfn(*pl2e) & ~(L1_PAGETABLE_ENTRIES - 1)) +
4586 l1_table_offset(virt)) == mfn) &&
4587 (((lNf_to_l1f(l2e_get_flags(*pl2e)) ^ flags) &
4588 ~(_PAGE_ACCESSED|_PAGE_DIRTY)) == 0) )
4590 /* We can skip to the end of the L2 superpage if we got a match. */
4591 i = (1 << (L2_PAGETABLE_SHIFT - PAGE_SHIFT)) -
4592 (mfn & ((1 << (L2_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1));
4593 if ( i > nr_mfns )
4594 i = nr_mfns;
4595 virt += i << L1_PAGETABLE_SHIFT;
4596 mfn += i;
4597 nr_mfns -= i;
4598 goto check_l3;
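/* Otherwise shatter the 2MB mapping into an L1 table of 4kB entries and install it in place of the superpage. */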
4601 pl1e = alloc_xen_pagetable();
4602 if ( pl1e == NULL )
4603 return -ENOMEM;
4605 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
4606 l1e_write(&pl1e[i],
4607 l1e_from_pfn(l2e_get_pfn(*pl2e) + i,
4608 lNf_to_l1f(l2e_get_flags(*pl2e))));
4610 if ( l2e_get_flags(*pl2e) & _PAGE_GLOBAL )
4611 flush_flags |= FLUSH_TLB_GLOBAL;
4613 l2e_write_atomic(pl2e, l2e_from_pfn(virt_to_mfn(pl1e),
4614 __PAGE_HYPERVISOR));
4615 flush_area(virt, flush_flags);
4618 pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(virt);
4619 ol1e = *pl1e;
4620 l1e_write_atomic(pl1e, l1e_from_pfn(mfn, flags));
4621 if ( (l1e_get_flags(ol1e) & _PAGE_PRESENT) )
4623 unsigned int flush_flags = FLUSH_TLB | FLUSH_ORDER(0);
4624 if ( l1e_get_flags(ol1e) & _PAGE_GLOBAL )
4625 flush_flags |= FLUSH_TLB_GLOBAL;
4626 if ( (l1e_get_flags(ol1e) ^ flags) & PAGE_CACHE_ATTRS )
4627 flush_flags |= FLUSH_CACHE;
4628 flush_area(virt, flush_flags);
4631 virt += 1UL << L1_PAGETABLE_SHIFT;
4632 mfn += 1UL;
4633 nr_mfns -= 1UL;
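/* When the mapping is complete or a 2MB boundary is reached (and plain PAGE_HYPERVISOR flags are in use), try to fold the L1 table back into a superpage: every entry must map a contiguous run of frames with identical flags. */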
4635 if ( (flags == PAGE_HYPERVISOR) &&
4636 ((nr_mfns == 0) ||
4637 ((((virt >> PAGE_SHIFT) | mfn) &
4638 ((1 << PAGETABLE_ORDER) - 1)) == 0)) )
4640 unsigned long base_mfn;
4641 pl1e = l2e_to_l1e(*pl2e);
4642 base_mfn = l1e_get_pfn(*pl1e) & ~(L1_PAGETABLE_ENTRIES - 1);
4643 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++, pl1e++ )
4644 if ( (l1e_get_pfn(*pl1e) != (base_mfn + i)) ||
4645 (l1e_get_flags(*pl1e) != flags) )
4646 break;
4647 if ( i == L1_PAGETABLE_ENTRIES )
4649 ol2e = *pl2e;
4650 l2e_write_atomic(pl2e, l2e_from_pfn(base_mfn,
4651 l1f_to_lNf(flags)));
4652 flush_area(virt - PAGE_SIZE,
4653 FLUSH_TLB_GLOBAL |
4654 FLUSH_ORDER(PAGETABLE_ORDER));
4655 free_xen_pagetable(l2e_to_l1e(ol2e));
4660 check_l3: ;
4661 #ifdef __x86_64__
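/* Similarly, try to re-establish a 1GB mapping once all 512 L2 entries are contiguous 2MB superpages with identical flags. */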
4662 if ( cpu_has_page1gb &&
4663 (flags == PAGE_HYPERVISOR) &&
4664 ((nr_mfns == 0) ||
4665 !(((virt >> PAGE_SHIFT) | mfn) &
4666 ((1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1))) )
4668 unsigned long base_mfn;
4670 ol3e = *pl3e;
4671 pl2e = l3e_to_l2e(ol3e);
4672 base_mfn = l2e_get_pfn(*pl2e) & ~(L2_PAGETABLE_ENTRIES *
4673 L1_PAGETABLE_ENTRIES - 1);
4674 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++, pl2e++ )
4675 if ( (l2e_get_pfn(*pl2e) !=
4676 (base_mfn + (i << PAGETABLE_ORDER))) ||
4677 (l2e_get_flags(*pl2e) != l1f_to_lNf(flags)) )
4678 break;
4679 if ( i == L2_PAGETABLE_ENTRIES )
4681 l3e_write_atomic(pl3e, l3e_from_pfn(base_mfn,
4682 l1f_to_lNf(flags)));
4683 flush_area(virt - PAGE_SIZE,
4684 FLUSH_TLB_GLOBAL |
4685 FLUSH_ORDER(2*PAGETABLE_ORDER));
4686 free_xen_pagetable(l3e_to_l2e(ol3e));
4689 #endif
4692 return 0;
4695 void destroy_xen_mappings(unsigned long s, unsigned long e)
4697 l2_pgentry_t *pl2e;
4698 l1_pgentry_t *pl1e;
4699 unsigned int i;
4700 unsigned long v = s;
4702 ASSERT((s & ~PAGE_MASK) == 0);
4703 ASSERT((e & ~PAGE_MASK) == 0);
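/* Walk [s, e) tearing down mappings: superpages that straddle the range are shattered first, and page tables that become empty are zapped and freed. */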
4705 while ( v < e )
4707 #ifdef __x86_64__
4708 l3_pgentry_t *pl3e = virt_to_xen_l3e(v);
4710 if ( !(l3e_get_flags(*pl3e) & _PAGE_PRESENT) )
4712 v += 1UL << L3_PAGETABLE_SHIFT;
4713 v &= ~((1UL << L3_PAGETABLE_SHIFT) - 1);
4714 continue;
4717 if ( l3e_get_flags(*pl3e) & _PAGE_PSE )
4719 if ( l2_table_offset(v) == 0 &&
4720 l1_table_offset(v) == 0 &&
4721 ((e - v) >= (1UL << L3_PAGETABLE_SHIFT)) )
4723 /* PAGE1GB: whole superpage is destroyed. */
4724 l3e_write_atomic(pl3e, l3e_empty());
4725 v += 1UL << L3_PAGETABLE_SHIFT;
4726 continue;
4729 /* PAGE1GB: shatter the superpage and fall through. */
4730 pl2e = alloc_xen_pagetable();
4731 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
4732 l2e_write(pl2e + i,
4733 l2e_from_pfn(l3e_get_pfn(*pl3e) +
4734 (i << PAGETABLE_ORDER),
4735 l3e_get_flags(*pl3e)));
4736 l3e_write_atomic(pl3e, l3e_from_pfn(virt_to_mfn(pl2e),
4737 __PAGE_HYPERVISOR));
4739 #endif
4741 pl2e = virt_to_xen_l2e(v);
4743 if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) )
4745 v += 1UL << L2_PAGETABLE_SHIFT;
4746 v &= ~((1UL << L2_PAGETABLE_SHIFT) - 1);
4747 continue;
4750 if ( l2e_get_flags(*pl2e) & _PAGE_PSE )
4752 if ( (l1_table_offset(v) == 0) &&
4753 ((e-v) >= (1UL << L2_PAGETABLE_SHIFT)) )
4755 /* PSE: whole superpage is destroyed. */
4756 l2e_write_atomic(pl2e, l2e_empty());
4757 v += 1UL << L2_PAGETABLE_SHIFT;
4759 else
4761 /* PSE: shatter the superpage and try again. */
4762 pl1e = alloc_xen_pagetable();
4763 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
4764 l1e_write(&pl1e[i],
4765 l1e_from_pfn(l2e_get_pfn(*pl2e) + i,
4766 l2e_get_flags(*pl2e) & ~_PAGE_PSE));
4767 l2e_write_atomic(pl2e, l2e_from_pfn(virt_to_mfn(pl1e),
4768 __PAGE_HYPERVISOR));
4771 else
4773 /* Ordinary 4kB mapping. */
4774 pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(v);
4775 l1e_write_atomic(pl1e, l1e_empty());
4776 v += PAGE_SIZE;
4778 /* If we are done with the L2E, check if it is now empty. */
4779 if ( (v != e) && (l1_table_offset(v) != 0) )
4780 continue;
4781 pl1e = l2e_to_l1e(*pl2e);
4782 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
4783 if ( l1e_get_intpte(pl1e[i]) != 0 )
4784 break;
4785 if ( i == L1_PAGETABLE_ENTRIES )
4787 /* Empty: zap the L2E and free the L1 page. */
4788 l2e_write_atomic(pl2e, l2e_empty());
4789 flush_area(NULL, FLUSH_TLB_GLOBAL); /* flush before free */
4790 free_xen_pagetable(pl1e);
4794 #ifdef __x86_64__
4795 /* If we are done with the L3E, check if it is now empty. */
4796 if ( (v != e) && (l2_table_offset(v) + l1_table_offset(v) != 0) )
4797 continue;
4798 pl2e = l3e_to_l2e(*pl3e);
4799 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
4800 if ( l2e_get_intpte(pl2e[i]) != 0 )
4801 break;
4802 if ( i == L2_PAGETABLE_ENTRIES )
4804 /* Empty: zap the L3E and free the L2 page. */
4805 l3e_write_atomic(pl3e, l3e_empty());
4806 flush_area(NULL, FLUSH_TLB_GLOBAL); /* flush before free */
4807 free_xen_pagetable(pl2e);
4809 #endif
4812 flush_area(NULL, FLUSH_TLB_GLOBAL);
4815 void __set_fixmap(
4816 enum fixed_addresses idx, unsigned long mfn, unsigned long flags)
4818 BUG_ON(idx >= __end_of_fixed_addresses);
4819 map_pages_to_xen(fix_to_virt(idx), mfn, 1, flags);
4822 #ifdef MEMORY_GUARD
4824 void memguard_init(void)
4826 unsigned long start = max_t(unsigned long, xen_phys_start, 1UL << 20);
4827 #ifdef __i386__
4828 map_pages_to_xen(
4829 (unsigned long)__va(start),
4830 start >> PAGE_SHIFT,
4831 (xenheap_phys_end - start) >> PAGE_SHIFT,
4832 __PAGE_HYPERVISOR|MAP_SMALL_PAGES);
4833 #else
4834 map_pages_to_xen(
4835 (unsigned long)__va(start),
4836 start >> PAGE_SHIFT,
4837 (__pa(&_end) + PAGE_SIZE - 1 - start) >> PAGE_SHIFT,
4838 __PAGE_HYPERVISOR|MAP_SMALL_PAGES);
4839 BUG_ON(start != xen_phys_start);
4840 map_pages_to_xen(
4841 XEN_VIRT_START,
4842 start >> PAGE_SHIFT,
4843 (__pa(&_end) + PAGE_SIZE - 1 - start) >> PAGE_SHIFT,
4844 __PAGE_HYPERVISOR|MAP_SMALL_PAGES);
4845 #endif
4848 static void __memguard_change_range(void *p, unsigned long l, int guard)
4850 unsigned long _p = (unsigned long)p;
4851 unsigned long _l = (unsigned long)l;
4852 unsigned int flags = __PAGE_HYPERVISOR | MAP_SMALL_PAGES;
4854 /* Ensure we are dealing with a page-aligned whole number of pages. */
4855 ASSERT((_p&~PAGE_MASK) == 0);
4856 ASSERT((_l&~PAGE_MASK) == 0);
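/* Guarding remaps the range with _PAGE_PRESENT cleared so that any access faults; unguarding restores a normal small-page hypervisor mapping. */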
4858 if ( guard )
4859 flags &= ~_PAGE_PRESENT;
4861 map_pages_to_xen(
4862 _p, virt_to_maddr(p) >> PAGE_SHIFT, _l >> PAGE_SHIFT, flags);
4865 void memguard_guard_range(void *p, unsigned long l)
4867 __memguard_change_range(p, l, 1);
4870 void memguard_unguard_range(void *p, unsigned long l)
4872 __memguard_change_range(p, l, 0);
4875 #endif
4877 void memguard_guard_stack(void *p)
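/* Place a single guard page immediately below the primary stack area within the stack allocation, so that an overflow faults rather than silently corrupting adjacent memory. */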
4879 BUILD_BUG_ON((PRIMARY_STACK_SIZE + PAGE_SIZE) > STACK_SIZE);
4880 p = (void *)((unsigned long)p + STACK_SIZE -
4881 PRIMARY_STACK_SIZE - PAGE_SIZE);
4882 memguard_guard_range(p, PAGE_SIZE);
4885 /*
4886 * Local variables:
4887 * mode: C
4888 * c-set-style: "BSD"
4889 * c-basic-offset: 4
4890 * tab-width: 4
4891 * indent-tabs-mode: nil
4892 * End:
4893 */