ia64/xen-unstable

view xen/arch/x86/mm.c @ 19787:cecc76506afc

x86_64: don't allocate L1 per-domain page table pages in a single chunk

Instead, allocate them on demand, and adjust the consumer to no longer
assume the allocated space is contiguous.

This is another prerequisite for extending the number of vCPUs the
hypervisor can support per guest.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
author Keir Fraser <keir.fraser@citrix.com>
date Thu Jun 18 10:05:23 2009 +0100 (2009-06-18)
parents d835ad2f6980
children 2f9e1348aa98
line source
1 /******************************************************************************
2 * arch/x86/mm.c
3 *
4 * Copyright (c) 2002-2005 K A Fraser
5 * Copyright (c) 2004 Christian Limpach
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
22 /*
23 * A description of the x86 page table API:
24 *
25 * Domains trap to do_mmu_update with a list of update requests.
26 * This is a list of (ptr, val) pairs, where the requested operation
27 * is *ptr = val.
28 *
29 * Reference counting of pages:
30 * ----------------------------
31 * Each page has two refcounts: tot_count and type_count.
32 *
33 * TOT_COUNT is the obvious reference count. It counts all uses of a
34 * physical page frame by a domain, including uses as a page directory,
35 * a page table, or simple mappings via a PTE. This count prevents a
36 * domain from releasing a frame back to the free pool when it still holds
37 * a reference to it.
38 *
39 * TYPE_COUNT is more subtle. A frame can be put to one of three
40 * mutually-exclusive uses: it might be used as a page directory, or a
41 * page table, or it may be mapped writable by the domain [of course, a
42 * frame may not be used in any of these three ways!].
43 * So, type_count is a count of the number of times a frame is being
44 * referred to in its current incarnation. Therefore, a page can only
45 * change its type when its type count is zero.
46 *
47 * Pinning the page type:
48 * ----------------------
49 * The type of a page can be pinned/unpinned with the commands
50 * MMUEXT_[UN]PIN_L?_TABLE. Each page can be pinned exactly once (that is,
51 * pinning is not reference counted, so it can't be nested).
52 * This is useful to prevent a page's type count falling to zero, at which
53 * point safety checks would need to be carried out next time the count
54 * is increased again.
55 *
56 * A further note on writable page mappings:
57 * -----------------------------------------
58 * For simplicity, the count of writable mappings for a page may not
59 * correspond to reality. The 'writable count' is incremented for every
60 * PTE which maps the page with the _PAGE_RW flag set. However, for
61 * write access to be possible the page directory entry must also have
62 * its _PAGE_RW bit set. We do not check this as it complicates the
63 * reference counting considerably [consider the case of multiple
64 * directory entries referencing a single page table, some with the RW
65 * bit set, others not -- it starts getting a bit messy].
66 * In normal use, this simplification shouldn't be a problem.
67 * However, the logic can be added if required.
68 *
69 * One more note on read-only page mappings:
70 * -----------------------------------------
71 * We want domains to be able to map pages for read-only access. The
72 * main reason is that page tables and directories should be readable
73 * by a domain, but it would not be safe for them to be writable.
74 * However, domains have free access to rings 1 & 2 of the Intel
75 * privilege model. In terms of page protection, these are considered
76 * to be part of 'supervisor mode'. The WP bit in CR0 controls whether
77 * read-only restrictions are respected in supervisor mode -- if the
78 * bit is clear then any mapped page is writable.
79 *
80 * We get round this by always setting the WP bit and disallowing
81 * updates to it. This is very unlikely to cause a problem for guest
82 * OS's, which will generally use the WP bit to simplify copy-on-write
83 * implementation (in that case, OS wants a fault when it writes to
84 * an application-supplied buffer).
85 */
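/*
 * Illustrative guest-side sketch (not part of this file): the API described
 * above is driven from a PV guest by batching (ptr, val) pairs through
 * HYPERVISOR_mmu_update -- which arrives at do_mmu_update() -- and by pinning
 * page-table types with HYPERVISOR_mmuext_op. The helper name and argument
 * values below are hypothetical; only struct mmu_update, struct mmuext_op,
 * MMU_NORMAL_PT_UPDATE and MMUEXT_PIN_L1_TABLE come from the public
 * interface (public/xen.h).
 */
#if 0 /* guest-side example only -- never compiled here */
static void example_pin_and_install_l1(xen_pfn_t l1_mfn, uint64_t l2e_maddr,
                                       uint64_t l2e_val)
{
    /* Ask Xen to validate and pin the frame as an L1 page table. */
    struct mmuext_op pin = {
        .cmd      = MMUEXT_PIN_L1_TABLE,
        .arg1.mfn = l1_mfn,
    };
    /* One (ptr, val) update request: *ptr = val, checked by Xen. The low
     * two bits of ptr select the command (MMU_NORMAL_PT_UPDATE == 0). */
    struct mmu_update upd = {
        .ptr = l2e_maddr | MMU_NORMAL_PT_UPDATE,
        .val = l2e_val, /* e.g. ((uint64_t)l1_mfn << PAGE_SHIFT) | flags */
    };

    (void)HYPERVISOR_mmuext_op(&pin, 1, NULL, DOMID_SELF);
    (void)HYPERVISOR_mmu_update(&upd, 1, NULL, DOMID_SELF);
}
#endif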
87 #include <xen/config.h>
88 #include <xen/init.h>
89 #include <xen/kernel.h>
90 #include <xen/lib.h>
91 #include <xen/mm.h>
92 #include <xen/domain.h>
93 #include <xen/sched.h>
94 #include <xen/errno.h>
95 #include <xen/perfc.h>
96 #include <xen/irq.h>
97 #include <xen/softirq.h>
98 #include <xen/domain_page.h>
99 #include <xen/event.h>
100 #include <xen/iocap.h>
101 #include <xen/guest_access.h>
102 #include <asm/paging.h>
103 #include <asm/shadow.h>
104 #include <asm/page.h>
105 #include <asm/flushtlb.h>
106 #include <asm/io.h>
107 #include <asm/ldt.h>
108 #include <asm/x86_emulate.h>
109 #include <asm/e820.h>
110 #include <asm/hypercall.h>
111 #include <asm/shared.h>
112 #include <public/memory.h>
113 #include <xsm/xsm.h>
114 #include <xen/trace.h>
116 /*
117 * Mapping of first 2 or 4 megabytes of memory. This is mapped with 4kB
118 * mappings to avoid type conflicts with fixed-range MTRRs covering the
119 * lowest megabyte of physical memory. In any case the VGA hole should be
120 * mapped with type UC.
121 */
122 l1_pgentry_t __attribute__ ((__section__ (".bss.page_aligned")))
123 l1_identmap[L1_PAGETABLE_ENTRIES];
125 #define MEM_LOG(_f, _a...) gdprintk(XENLOG_WARNING , _f "\n" , ## _a)
127 /*
128 * PTE updates can be done with ordinary writes except:
129 * 1. Debug builds get extra checking by using CMPXCHG[8B].
130 * 2. PAE builds perform an atomic 8-byte store with CMPXCHG8B.
131 */
132 #if !defined(NDEBUG) || defined(__i386__)
133 #define PTE_UPDATE_WITH_CMPXCHG
134 #endif
136 /* Used to defer flushing of memory structures. */
137 struct percpu_mm_info {
138 #define DOP_FLUSH_TLB (1<<0) /* Flush the local TLB. */
139 #define DOP_FLUSH_ALL_TLBS (1<<1) /* Flush TLBs of all VCPUs of current dom. */
140 #define DOP_RELOAD_LDT (1<<2) /* Reload the LDT shadow mapping. */
141 unsigned int deferred_ops;
142 /* If non-NULL, specifies a foreign subject domain for some operations. */
143 struct domain *foreign;
144 };
145 static DEFINE_PER_CPU(struct percpu_mm_info, percpu_mm_info);
147 /*
148 * Returns the current foreign domain; defaults to the currently-executing
149 * domain if a foreign override hasn't been specified.
150 */
151 #define FOREIGNDOM (this_cpu(percpu_mm_info).foreign ?: current->domain)
153 /* Private domain structs for DOMID_XEN and DOMID_IO. */
154 struct domain *dom_xen, *dom_io;
156 /* Frame table and its size in pages. */
157 struct page_info *__read_mostly frame_table;
158 unsigned long max_page;
159 unsigned long total_pages;
161 #define PAGE_CACHE_ATTRS (_PAGE_PAT|_PAGE_PCD|_PAGE_PWT)
163 int opt_allow_hugepage;
164 boolean_param("allowhugepage", opt_allow_hugepage);
166 #define l1_disallow_mask(d) \
167 ((d != dom_io) && \
168 (rangeset_is_empty((d)->iomem_caps) && \
169 rangeset_is_empty((d)->arch.ioport_caps) && \
170 !has_arch_pdevs(d)) ? \
171 L1_DISALLOW_MASK : (L1_DISALLOW_MASK & ~PAGE_CACHE_ATTRS))
173 #ifdef CONFIG_COMPAT
174 l2_pgentry_t *compat_idle_pg_table_l2 = NULL;
175 #define l3_disallow_mask(d) (!is_pv_32on64_domain(d) ? \
176 L3_DISALLOW_MASK : \
177 COMPAT_L3_DISALLOW_MASK)
178 #else
179 #define l3_disallow_mask(d) L3_DISALLOW_MASK
180 #endif
182 void __init init_frametable(void)
183 {
184 unsigned long nr_pages, page_step, i, mfn;
186 frame_table = (struct page_info *)FRAMETABLE_VIRT_START;
188 nr_pages = PFN_UP(max_page * sizeof(*frame_table));
189 page_step = 1 << (cpu_has_page1gb ? L3_PAGETABLE_SHIFT - PAGE_SHIFT
190 : L2_PAGETABLE_SHIFT - PAGE_SHIFT);
192 for ( i = 0; i < nr_pages; i += page_step )
193 {
194 /*
195 * The hardcoded 4 below is arbitrary - just pick whatever you think
196 * is reasonable to waste as a trade-off for using a large page.
197 */
198 while (nr_pages + 4 - i < page_step)
199 page_step >>= PAGETABLE_ORDER;
200 mfn = alloc_boot_pages(page_step, page_step);
201 if ( mfn == 0 )
202 panic("Not enough memory for frame table\n");
203 map_pages_to_xen(
204 FRAMETABLE_VIRT_START + (i << PAGE_SHIFT),
205 mfn, page_step, PAGE_HYPERVISOR);
206 }
208 memset(frame_table, 0, nr_pages << PAGE_SHIFT);
209 }
211 void __init arch_init_memory(void)
212 {
213 extern void subarch_init_memory(void);
215 unsigned long i, pfn, rstart_pfn, rend_pfn, iostart_pfn, ioend_pfn;
217 /*
218 * Initialise our DOMID_XEN domain.
219 * Any Xen-heap pages that we will allow to be mapped will have
220 * their domain field set to dom_xen.
221 */
222 dom_xen = domain_create(DOMID_XEN, DOMCRF_dummy, 0);
223 BUG_ON(dom_xen == NULL);
225 /*
226 * Initialise our DOMID_IO domain.
227 * This domain owns I/O pages that are within the range of the page_info
228 * array. Mappings occur at the priv of the caller.
229 */
230 dom_io = domain_create(DOMID_IO, DOMCRF_dummy, 0);
231 BUG_ON(dom_io == NULL);
233 /* First 1MB of RAM is historically marked as I/O. */
234 for ( i = 0; i < 0x100; i++ )
235 share_xen_page_with_guest(mfn_to_page(i), dom_io, XENSHARE_writable);
237 /* Any areas not specified as RAM by the e820 map are considered I/O. */
238 for ( i = 0, pfn = 0; pfn < max_page; i++ )
239 {
240 while ( (i < e820.nr_map) &&
241 (e820.map[i].type != E820_RAM) &&
242 (e820.map[i].type != E820_UNUSABLE) )
243 i++;
245 if ( i >= e820.nr_map )
246 {
247 /* No more RAM regions: mark as I/O right to end of memory map. */
248 rstart_pfn = rend_pfn = max_page;
249 }
250 else
251 {
252 /* Mark as I/O just up as far as next RAM region. */
253 rstart_pfn = min_t(unsigned long, max_page,
254 PFN_UP(e820.map[i].addr));
255 rend_pfn = max_t(unsigned long, rstart_pfn,
256 PFN_DOWN(e820.map[i].addr + e820.map[i].size));
257 }
259 /*
260 * Make sure any Xen mappings of RAM holes above 1MB are blown away.
261 * In particular this ensures that RAM holes are respected even in
262 * the statically-initialised 1-16MB mapping area.
263 */
264 iostart_pfn = max_t(unsigned long, pfn, 1UL << (20 - PAGE_SHIFT));
265 ioend_pfn = rstart_pfn;
266 #if defined(CONFIG_X86_32)
267 ioend_pfn = min_t(unsigned long, ioend_pfn,
268 DIRECTMAP_MBYTES << (20 - PAGE_SHIFT));
269 #endif
270 if ( iostart_pfn < ioend_pfn )
271 destroy_xen_mappings((unsigned long)mfn_to_virt(iostart_pfn),
272 (unsigned long)mfn_to_virt(ioend_pfn));
274 /* Mark as I/O up to next RAM region. */
275 for ( ; pfn < rstart_pfn; pfn++ )
276 {
277 BUG_ON(!mfn_valid(pfn));
278 share_xen_page_with_guest(
279 mfn_to_page(pfn), dom_io, XENSHARE_writable);
280 }
282 /* Skip the RAM region. */
283 pfn = rend_pfn;
284 }
286 subarch_init_memory();
287 }
289 int page_is_ram_type(unsigned long mfn, unsigned long mem_type)
290 {
291 uint64_t maddr = pfn_to_paddr(mfn);
292 int i;
294 for ( i = 0; i < e820.nr_map; i++ )
295 {
296 switch ( e820.map[i].type )
297 {
298 case E820_RAM:
299 if ( mem_type & RAM_TYPE_CONVENTIONAL )
300 break;
301 continue;
302 case E820_RESERVED:
303 if ( mem_type & RAM_TYPE_RESERVED )
304 break;
305 continue;
306 case E820_UNUSABLE:
307 if ( mem_type & RAM_TYPE_UNUSABLE )
308 break;
309 continue;
310 case E820_ACPI:
311 case E820_NVS:
312 if ( mem_type & RAM_TYPE_ACPI )
313 break;
314 continue;
315 default:
316 /* unknown */
317 continue;
318 }
320 /* Test the range. */
321 if ( (e820.map[i].addr <= maddr) &&
322 ((e820.map[i].addr + e820.map[i].size) >= (maddr + PAGE_SIZE)) )
323 return 1;
324 }
326 return 0;
327 }
329 unsigned long domain_get_maximum_gpfn(struct domain *d)
330 {
331 if ( is_hvm_domain(d) )
332 return d->arch.p2m->max_mapped_pfn;
333 /* NB. PV guests specify nr_pfns rather than max_pfn so we adjust here. */
334 return arch_get_max_pfn(d) - 1;
335 }
337 void share_xen_page_with_guest(
338 struct page_info *page, struct domain *d, int readonly)
339 {
340 if ( page_get_owner(page) == d )
341 return;
343 set_gpfn_from_mfn(page_to_mfn(page), INVALID_M2P_ENTRY);
345 spin_lock(&d->page_alloc_lock);
347 /* The incremented type count pins as writable or read-only. */
348 page->u.inuse.type_info = (readonly ? PGT_none : PGT_writable_page);
349 page->u.inuse.type_info |= PGT_validated | 1;
351 page_set_owner(page, d);
352 wmb(); /* install valid domain ptr before updating refcnt. */
353 ASSERT((page->count_info & ~PGC_xen_heap) == 0);
355 /* Only add to the allocation list if the domain isn't dying. */
356 if ( !d->is_dying )
357 {
358 page->count_info |= PGC_allocated | 1;
359 if ( unlikely(d->xenheap_pages++ == 0) )
360 get_knownalive_domain(d);
361 page_list_add_tail(page, &d->xenpage_list);
362 }
364 spin_unlock(&d->page_alloc_lock);
365 }
367 void share_xen_page_with_privileged_guests(
368 struct page_info *page, int readonly)
369 {
370 share_xen_page_with_guest(page, dom_xen, readonly);
371 }
373 #if defined(__i386__)
375 #ifdef NDEBUG
376 /* Only PDPTs above 4GB boundary need to be shadowed in low memory. */
377 #define l3tab_needs_shadow(mfn) ((mfn) >= 0x100000)
378 #else
379 /*
380 * In debug builds we shadow a selection of <4GB PDPTs to exercise code paths.
381 * We cannot safely shadow the idle page table, nor shadow page tables
382 * (detected by zero reference count). As required for correctness, we
383 * always shadow PDPTs above 4GB.
384 */
385 #define l3tab_needs_shadow(mfn) \
386 (((((mfn) << PAGE_SHIFT) != __pa(idle_pg_table)) && \
387 (mfn_to_page(mfn)->count_info & PGC_count_mask) && \
388 ((mfn) & 1)) || /* odd MFNs are shadowed */ \
389 ((mfn) >= 0x100000))
390 #endif
392 static l1_pgentry_t *fix_pae_highmem_pl1e;
394 /* Cache the address of PAE high-memory fixmap page tables. */
395 static int __init cache_pae_fixmap_address(void)
396 {
397 unsigned long fixmap_base = fix_to_virt(FIX_PAE_HIGHMEM_0);
398 l2_pgentry_t *pl2e = virt_to_xen_l2e(fixmap_base);
399 fix_pae_highmem_pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(fixmap_base);
400 return 0;
401 }
402 __initcall(cache_pae_fixmap_address);
404 static DEFINE_PER_CPU(u32, make_cr3_timestamp);
406 void make_cr3(struct vcpu *v, unsigned long mfn)
407 /* Takes the MFN of a PAE l3 table, copies the contents to below 4GB if
408 * necessary, and sets v->arch.cr3 to the value to load in CR3. */
409 {
410 l3_pgentry_t *highmem_l3tab, *lowmem_l3tab;
411 struct pae_l3_cache *cache = &v->arch.pae_l3_cache;
412 unsigned int cpu = smp_processor_id();
414 /* Fast path: does this mfn need a shadow at all? */
415 if ( !l3tab_needs_shadow(mfn) )
416 {
417 v->arch.cr3 = mfn << PAGE_SHIFT;
418 /* Cache is no longer in use or valid */
419 cache->high_mfn = 0;
420 return;
421 }
423 /* Caching logic is not interrupt safe. */
424 ASSERT(!in_irq());
426 /* Protects against pae_flush_pgd(). */
427 spin_lock(&cache->lock);
429 cache->inuse_idx ^= 1;
430 cache->high_mfn = mfn;
432 /* Map the guest L3 table and copy to the chosen low-memory cache. */
433 l1e_write(fix_pae_highmem_pl1e-cpu, l1e_from_pfn(mfn, __PAGE_HYPERVISOR));
434 /* First check the previous high mapping can't be in the TLB.
435 * (i.e. have we loaded CR3 since we last did this?) */
436 if ( unlikely(this_cpu(make_cr3_timestamp) == this_cpu(tlbflush_time)) )
437 flush_tlb_one_local(fix_to_virt(FIX_PAE_HIGHMEM_0 + cpu));
438 highmem_l3tab = (l3_pgentry_t *)fix_to_virt(FIX_PAE_HIGHMEM_0 + cpu);
439 lowmem_l3tab = cache->table[cache->inuse_idx];
440 memcpy(lowmem_l3tab, highmem_l3tab, sizeof(cache->table[0]));
441 l1e_write(fix_pae_highmem_pl1e-cpu, l1e_empty());
442 this_cpu(make_cr3_timestamp) = this_cpu(tlbflush_time);
444 v->arch.cr3 = __pa(lowmem_l3tab);
446 spin_unlock(&cache->lock);
447 }
449 #else /* !defined(__i386__) */
451 void make_cr3(struct vcpu *v, unsigned long mfn)
452 {
453 v->arch.cr3 = mfn << PAGE_SHIFT;
454 }
456 #endif /* !defined(__i386__) */
458 void write_ptbase(struct vcpu *v)
459 {
460 write_cr3(v->arch.cr3);
461 }
463 /*
464 * Should be called after CR3 is updated.
465 *
466 * Uses values found in vcpu->arch.(guest_table and guest_table_user), and
467 * for HVM guests, arch.monitor_table and hvm's guest CR3.
468 *
469 * Update ref counts to shadow tables appropriately.
470 */
471 void update_cr3(struct vcpu *v)
472 {
473 unsigned long cr3_mfn=0;
475 if ( paging_mode_enabled(v->domain) )
476 {
477 paging_update_cr3(v);
478 return;
479 }
481 #if CONFIG_PAGING_LEVELS == 4
482 if ( !(v->arch.flags & TF_kernel_mode) )
483 cr3_mfn = pagetable_get_pfn(v->arch.guest_table_user);
484 else
485 #endif
486 cr3_mfn = pagetable_get_pfn(v->arch.guest_table);
488 make_cr3(v, cr3_mfn);
489 }
492 static void invalidate_shadow_ldt(struct vcpu *v, int flush)
493 {
494 int i;
495 unsigned long pfn;
496 struct page_info *page;
498 BUG_ON(unlikely(in_irq()));
500 spin_lock(&v->arch.shadow_ldt_lock);
502 if ( v->arch.shadow_ldt_mapcnt == 0 )
503 goto out;
505 v->arch.shadow_ldt_mapcnt = 0;
507 for ( i = 16; i < 32; i++ )
508 {
509 pfn = l1e_get_pfn(v->arch.perdomain_ptes[i]);
510 if ( pfn == 0 ) continue;
511 l1e_write(&v->arch.perdomain_ptes[i], l1e_empty());
512 page = mfn_to_page(pfn);
513 ASSERT_PAGE_IS_TYPE(page, PGT_seg_desc_page);
514 ASSERT_PAGE_IS_DOMAIN(page, v->domain);
515 put_page_and_type(page);
516 }
518 /* Rid TLBs of stale mappings (guest mappings and shadow mappings). */
519 if ( flush )
520 flush_tlb_mask(&v->vcpu_dirty_cpumask);
522 out:
523 spin_unlock(&v->arch.shadow_ldt_lock);
524 }
527 static int alloc_segdesc_page(struct page_info *page)
528 {
529 struct desc_struct *descs;
530 int i;
532 descs = map_domain_page(page_to_mfn(page));
534 for ( i = 0; i < 512; i++ )
535 if ( unlikely(!check_descriptor(page_get_owner(page), &descs[i])) )
536 goto fail;
538 unmap_domain_page(descs);
539 return 0;
541 fail:
542 unmap_domain_page(descs);
543 return -EINVAL;
544 }
547 /* Map shadow page at offset @off. */
548 int map_ldt_shadow_page(unsigned int off)
549 {
550 struct vcpu *v = current;
551 struct domain *d = v->domain;
552 unsigned long gmfn, mfn;
553 l1_pgentry_t l1e, nl1e;
554 unsigned long gva = v->arch.guest_context.ldt_base + (off << PAGE_SHIFT);
555 int okay;
557 BUG_ON(unlikely(in_irq()));
559 guest_get_eff_kern_l1e(v, gva, &l1e);
560 if ( unlikely(!(l1e_get_flags(l1e) & _PAGE_PRESENT)) )
561 return 0;
563 gmfn = l1e_get_pfn(l1e);
564 mfn = gmfn_to_mfn(d, gmfn);
565 if ( unlikely(!mfn_valid(mfn)) )
566 return 0;
568 okay = get_page_and_type(mfn_to_page(mfn), d, PGT_seg_desc_page);
569 if ( unlikely(!okay) )
570 return 0;
572 nl1e = l1e_from_pfn(mfn, l1e_get_flags(l1e) | _PAGE_RW);
574 spin_lock(&v->arch.shadow_ldt_lock);
575 l1e_write(&v->arch.perdomain_ptes[off + 16], nl1e);
576 v->arch.shadow_ldt_mapcnt++;
577 spin_unlock(&v->arch.shadow_ldt_lock);
579 return 1;
580 }
583 static int get_page_from_pagenr(unsigned long page_nr, struct domain *d)
584 {
585 struct page_info *page = mfn_to_page(page_nr);
587 if ( unlikely(!mfn_valid(page_nr)) || unlikely(!get_page(page, d)) )
588 {
589 MEM_LOG("Could not get page ref for pfn %lx", page_nr);
590 return 0;
591 }
593 return 1;
594 }
597 static int get_page_and_type_from_pagenr(unsigned long page_nr,
598 unsigned long type,
599 struct domain *d,
600 int partial,
601 int preemptible)
602 {
603 struct page_info *page = mfn_to_page(page_nr);
604 int rc;
606 if ( likely(partial >= 0) &&
607 unlikely(!get_page_from_pagenr(page_nr, d)) )
608 return -EINVAL;
610 rc = (preemptible ?
611 get_page_type_preemptible(page, type) :
612 (get_page_type(page, type) ? 0 : -EINVAL));
614 if ( unlikely(rc) && partial >= 0 )
615 put_page(page);
617 return rc;
618 }
620 static int get_data_page(
621 struct page_info *page, struct domain *d, int writeable)
622 {
623 int rc;
625 if ( writeable )
626 rc = get_page_and_type(page, d, PGT_writable_page);
627 else
628 rc = get_page(page, d);
630 return rc;
631 }
633 static void put_data_page(
634 struct page_info *page, int writeable)
635 {
636 if ( writeable )
637 put_page_and_type(page);
638 else
639 put_page(page);
640 }
642 /*
643 * We allow root tables to map each other (a.k.a. linear page tables). It
644 * needs some special care with reference counts and access permissions:
645 * 1. The mapping entry must be read-only, or the guest may get write access
646 * to its own PTEs.
647 * 2. We must only bump the reference counts for an *already validated*
648 * L2 table, or we can end up in a deadlock in get_page_type() by waiting
649 * on a validation that is required to complete that validation.
650 * 3. We only need to increment the reference counts for the mapped page
651 * frame if it is mapped by a different root table. This is sufficient and
652 * also necessary to allow validation of a root table mapping itself.
653 */
654 #define define_get_linear_pagetable(level) \
655 static int \
656 get_##level##_linear_pagetable( \
657 level##_pgentry_t pde, unsigned long pde_pfn, struct domain *d) \
658 { \
659 unsigned long x, y; \
660 struct page_info *page; \
661 unsigned long pfn; \
662 \
663 if ( (level##e_get_flags(pde) & _PAGE_RW) ) \
664 { \
665 MEM_LOG("Attempt to create linear p.t. with write perms"); \
666 return 0; \
667 } \
668 \
669 if ( (pfn = level##e_get_pfn(pde)) != pde_pfn ) \
670 { \
671 /* Make sure the mapped frame belongs to the correct domain. */ \
672 if ( unlikely(!get_page_from_pagenr(pfn, d)) ) \
673 return 0; \
674 \
675 /* \
676 * Ensure that the mapped frame is an already-validated page table. \
677 * If so, atomically increment the count (checking for overflow). \
678 */ \
679 page = mfn_to_page(pfn); \
680 y = page->u.inuse.type_info; \
681 do { \
682 x = y; \
683 if ( unlikely((x & PGT_count_mask) == PGT_count_mask) || \
684 unlikely((x & (PGT_type_mask|PGT_validated)) != \
685 (PGT_##level##_page_table|PGT_validated)) ) \
686 { \
687 put_page(page); \
688 return 0; \
689 } \
690 } \
691 while ( (y = cmpxchg(&page->u.inuse.type_info, x, x + 1)) != x ); \
692 } \
693 \
694 return 1; \
695 }
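/*
 * Illustrative guest-side sketch (not part of this file) of rule 1 above:
 * a 64-bit PV guest creates a linear ("recursive") mapping by pointing a
 * slot of its own top-level table back at that same table, read-only. If
 * _PAGE_RW were set, get_l4_linear_pagetable() would reject the update.
 * The helper name and slot choice are hypothetical; the hypercall comes
 * from the public interface.
 */
#if 0 /* guest-side example only -- never compiled here */
static void example_make_linear_mapping(xen_pfn_t l4_mfn, unsigned int slot)
{
    struct mmu_update upd = {
        /* Machine address of the L4 slot being written (low 2 bits encode
         * the command; MMU_NORMAL_PT_UPDATE == 0, so they stay clear). */
        .ptr = ((uint64_t)l4_mfn << PAGE_SHIFT) + slot * sizeof(uint64_t),
        /* New entry refers back to the same L4 frame, without _PAGE_RW. */
        .val = ((uint64_t)l4_mfn << PAGE_SHIFT) | _PAGE_PRESENT | _PAGE_USER,
    };

    (void)HYPERVISOR_mmu_update(&upd, 1, NULL, DOMID_SELF);
}
#endif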
698 int is_iomem_page(unsigned long mfn)
699 {
700 struct page_info *page;
702 if ( !mfn_valid(mfn) )
703 return 1;
705 /* Caller must know that it is an iomem page, or a reference is held. */
706 page = mfn_to_page(mfn);
707 ASSERT((page->count_info & PGC_count_mask) != 0);
709 return (page_get_owner(page) == dom_io);
710 }
712 static void update_xen_mappings(unsigned long mfn, unsigned long cacheattr)
713 {
714 #ifdef __x86_64__
715 bool_t alias = mfn >= PFN_DOWN(xen_phys_start) &&
716 mfn < PFN_UP(xen_phys_start + (unsigned long)_end - XEN_VIRT_START);
717 unsigned long xen_va =
718 XEN_VIRT_START + ((mfn - PFN_DOWN(xen_phys_start)) << PAGE_SHIFT);
720 if ( unlikely(alias) && cacheattr )
721 map_pages_to_xen(xen_va, mfn, 1, 0);
722 map_pages_to_xen((unsigned long)mfn_to_virt(mfn), mfn, 1,
723 PAGE_HYPERVISOR | cacheattr_to_pte_flags(cacheattr));
724 if ( unlikely(alias) && !cacheattr )
725 map_pages_to_xen(xen_va, mfn, 1, PAGE_HYPERVISOR);
726 #endif
727 }
729 int
730 get_page_from_l1e(
731 l1_pgentry_t l1e, struct domain *l1e_owner, struct domain *pg_owner)
732 {
733 unsigned long mfn = l1e_get_pfn(l1e);
734 struct page_info *page = mfn_to_page(mfn);
735 uint32_t l1f = l1e_get_flags(l1e);
736 struct vcpu *curr = current;
737 struct domain *real_pg_owner;
739 if ( !(l1f & _PAGE_PRESENT) )
740 return 1;
742 if ( unlikely(l1f & l1_disallow_mask(l1e_owner)) )
743 {
744 MEM_LOG("Bad L1 flags %x", l1f & l1_disallow_mask(l1e_owner));
745 return 0;
746 }
748 if ( !mfn_valid(mfn) ||
749 (real_pg_owner = page_get_owner_and_reference(page)) == dom_io )
750 {
751 /* Only needed the reference to confirm dom_io ownership. */
752 if ( mfn_valid(mfn) )
753 put_page(page);
755 /* DOMID_IO reverts to caller for privilege checks. */
756 if ( pg_owner == dom_io )
757 pg_owner = curr->domain;
759 if ( !iomem_access_permitted(pg_owner, mfn, mfn) )
760 {
761 if ( mfn != (PADDR_MASK >> PAGE_SHIFT) ) /* INVALID_MFN? */
762 MEM_LOG("Non-privileged (%u) attempt to map I/O space %08lx",
763 pg_owner->domain_id, mfn);
764 return 0;
765 }
767 return 1;
768 }
770 if ( real_pg_owner == NULL )
771 goto could_not_pin;
773 if ( unlikely(real_pg_owner != pg_owner) )
774 {
775 /*
776 * Let privileged domains transfer the right to map their target
777 * domain's pages. This is used to allow stub-domain pvfb export to
778 * dom0, until pvfb supports granted mappings. At that time this
779 * minor hack can go away.
780 */
781 if ( (pg_owner == l1e_owner) || !IS_PRIV_FOR(pg_owner, real_pg_owner) )
782 goto could_not_pin;
783 pg_owner = real_pg_owner;
784 }
786 /* Foreign mappings into guests in shadow external mode don't
787 * contribute to writeable mapping refcounts. (This allows the
788 * qemu-dm helper process in dom0 to map the domain's memory without
789 * messing up the count of "real" writable mappings.) */
790 if ( (l1f & _PAGE_RW) &&
791 ((l1e_owner == pg_owner) || !paging_mode_external(pg_owner)) &&
792 !get_page_type(page, PGT_writable_page) )
793 goto could_not_pin;
795 if ( pte_flags_to_cacheattr(l1f) !=
796 ((page->count_info & PGC_cacheattr_mask) >> PGC_cacheattr_base) )
797 {
798 unsigned long x, nx, y = page->count_info;
799 unsigned long cacheattr = pte_flags_to_cacheattr(l1f);
801 if ( is_xen_heap_page(page) )
802 {
803 if ( (l1f & _PAGE_RW) &&
804 ((l1e_owner == pg_owner) || !paging_mode_external(pg_owner)) )
805 put_page_type(page);
806 put_page(page);
807 MEM_LOG("Attempt to change cache attributes of Xen heap page");
808 return 0;
809 }
811 while ( ((y & PGC_cacheattr_mask) >> PGC_cacheattr_base) != cacheattr )
812 {
813 x = y;
814 nx = (x & ~PGC_cacheattr_mask) | (cacheattr << PGC_cacheattr_base);
815 y = cmpxchg(&page->count_info, x, nx);
816 }
818 update_xen_mappings(mfn, cacheattr);
819 }
821 return 1;
823 could_not_pin:
824 MEM_LOG("Error getting mfn %lx (pfn %lx) from L1 entry %" PRIpte
825 " for l1e_owner=%d, pg_owner=%d",
826 mfn, get_gpfn_from_mfn(mfn),
827 l1e_get_intpte(l1e), l1e_owner->domain_id, pg_owner->domain_id);
828 if ( real_pg_owner != NULL )
829 put_page(page);
830 return 0;
831 }
834 /* NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. */
835 define_get_linear_pagetable(l2);
836 static int
837 get_page_from_l2e(
838 l2_pgentry_t l2e, unsigned long pfn, struct domain *d)
839 {
840 unsigned long mfn = l2e_get_pfn(l2e);
841 int rc;
843 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
844 return 1;
846 if ( unlikely((l2e_get_flags(l2e) & L2_DISALLOW_MASK)) )
847 {
848 MEM_LOG("Bad L2 flags %x", l2e_get_flags(l2e) & L2_DISALLOW_MASK);
849 return -EINVAL;
850 }
852 if ( !(l2e_get_flags(l2e) & _PAGE_PSE) )
853 {
854 rc = get_page_and_type_from_pagenr(mfn, PGT_l1_page_table, d, 0, 0);
855 if ( unlikely(rc == -EINVAL) && get_l2_linear_pagetable(l2e, pfn, d) )
856 rc = 0;
857 }
858 else if ( !opt_allow_hugepage || (mfn & (L1_PAGETABLE_ENTRIES-1)) )
859 {
860 rc = -EINVAL;
861 }
862 else
863 {
864 unsigned long m = mfn;
865 int writeable = !!(l2e_get_flags(l2e) & _PAGE_RW);
867 do {
868 rc = get_data_page(mfn_to_page(m), d, writeable);
869 if ( unlikely(!rc) )
870 {
871 while ( m-- > mfn )
872 put_data_page(mfn_to_page(m), writeable);
873 return -EINVAL;
874 }
875 } while ( m++ < (mfn + (L1_PAGETABLE_ENTRIES-1)) );
876 }
878 return rc;
879 }
882 define_get_linear_pagetable(l3);
883 static int
884 get_page_from_l3e(
885 l3_pgentry_t l3e, unsigned long pfn, struct domain *d, int partial, int preemptible)
886 {
887 int rc;
889 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
890 return 1;
892 if ( unlikely((l3e_get_flags(l3e) & l3_disallow_mask(d))) )
893 {
894 MEM_LOG("Bad L3 flags %x", l3e_get_flags(l3e) & l3_disallow_mask(d));
895 return -EINVAL;
896 }
898 rc = get_page_and_type_from_pagenr(
899 l3e_get_pfn(l3e), PGT_l2_page_table, d, partial, preemptible);
900 if ( unlikely(rc == -EINVAL) && get_l3_linear_pagetable(l3e, pfn, d) )
901 rc = 0;
903 return rc;
904 }
906 #if CONFIG_PAGING_LEVELS >= 4
907 define_get_linear_pagetable(l4);
908 static int
909 get_page_from_l4e(
910 l4_pgentry_t l4e, unsigned long pfn, struct domain *d, int partial, int preemptible)
911 {
912 int rc;
914 if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
915 return 1;
917 if ( unlikely((l4e_get_flags(l4e) & L4_DISALLOW_MASK)) )
918 {
919 MEM_LOG("Bad L4 flags %x", l4e_get_flags(l4e) & L4_DISALLOW_MASK);
920 return -EINVAL;
921 }
923 rc = get_page_and_type_from_pagenr(
924 l4e_get_pfn(l4e), PGT_l3_page_table, d, partial, preemptible);
925 if ( unlikely(rc == -EINVAL) && get_l4_linear_pagetable(l4e, pfn, d) )
926 rc = 0;
928 return rc;
929 }
930 #endif /* 4 level */
932 #ifdef __x86_64__
934 #ifdef USER_MAPPINGS_ARE_GLOBAL
935 #define adjust_guest_l1e(pl1e, d) \
936 do { \
937 if ( likely(l1e_get_flags((pl1e)) & _PAGE_PRESENT) && \
938 likely(!is_pv_32on64_domain(d)) ) \
939 { \
940 /* _PAGE_GUEST_KERNEL page cannot have the Global bit set. */ \
941 if ( (l1e_get_flags((pl1e)) & (_PAGE_GUEST_KERNEL|_PAGE_GLOBAL)) \
942 == (_PAGE_GUEST_KERNEL|_PAGE_GLOBAL) ) \
943 MEM_LOG("Global bit is set to kernel page %lx", \
944 l1e_get_pfn((pl1e))); \
945 if ( !(l1e_get_flags((pl1e)) & _PAGE_USER) ) \
946 l1e_add_flags((pl1e), (_PAGE_GUEST_KERNEL|_PAGE_USER)); \
947 if ( !(l1e_get_flags((pl1e)) & _PAGE_GUEST_KERNEL) ) \
948 l1e_add_flags((pl1e), (_PAGE_GLOBAL|_PAGE_USER)); \
949 } \
950 } while ( 0 )
951 #else
952 #define adjust_guest_l1e(pl1e, d) \
953 do { \
954 if ( likely(l1e_get_flags((pl1e)) & _PAGE_PRESENT) && \
955 likely(!is_pv_32on64_domain(d)) ) \
956 l1e_add_flags((pl1e), _PAGE_USER); \
957 } while ( 0 )
958 #endif
960 #define adjust_guest_l2e(pl2e, d) \
961 do { \
962 if ( likely(l2e_get_flags((pl2e)) & _PAGE_PRESENT) && \
963 likely(!is_pv_32on64_domain(d)) ) \
964 l2e_add_flags((pl2e), _PAGE_USER); \
965 } while ( 0 )
967 #define adjust_guest_l3e(pl3e, d) \
968 do { \
969 if ( likely(l3e_get_flags((pl3e)) & _PAGE_PRESENT) ) \
970 l3e_add_flags((pl3e), likely(!is_pv_32on64_domain(d)) ? \
971 _PAGE_USER : \
972 _PAGE_USER|_PAGE_RW); \
973 } while ( 0 )
975 #define adjust_guest_l4e(pl4e, d) \
976 do { \
977 if ( likely(l4e_get_flags((pl4e)) & _PAGE_PRESENT) && \
978 likely(!is_pv_32on64_domain(d)) ) \
979 l4e_add_flags((pl4e), _PAGE_USER); \
980 } while ( 0 )
982 #else /* !defined(__x86_64__) */
984 #define adjust_guest_l1e(_p, _d) ((void)(_d))
985 #define adjust_guest_l2e(_p, _d) ((void)(_d))
986 #define adjust_guest_l3e(_p, _d) ((void)(_d))
988 #endif
990 #ifdef CONFIG_COMPAT
991 #define unadjust_guest_l3e(pl3e, d) \
992 do { \
993 if ( unlikely(is_pv_32on64_domain(d)) && \
994 likely(l3e_get_flags((pl3e)) & _PAGE_PRESENT) ) \
995 l3e_remove_flags((pl3e), _PAGE_USER|_PAGE_RW|_PAGE_ACCESSED); \
996 } while ( 0 )
997 #else
998 #define unadjust_guest_l3e(_p, _d) ((void)(_d))
999 #endif
1001 void put_page_from_l1e(l1_pgentry_t l1e, struct domain *l1e_owner)
1002 {
1003 unsigned long pfn = l1e_get_pfn(l1e);
1004 struct page_info *page;
1005 struct domain *pg_owner;
1006 struct vcpu *v;
1008 if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) || is_iomem_page(pfn) )
1009 return;
1011 page = mfn_to_page(pfn);
1012 pg_owner = page_get_owner(page);
1014 /*
1015 * Check if this is a mapping that was established via a grant reference.
1016 * If it was then we should not be here: we require that such mappings are
1017 * explicitly destroyed via the grant-table interface.
1019 * The upshot of this is that the guest can end up with active grants that
1020 * it cannot destroy (because it no longer has a PTE to present to the
1021 * grant-table interface). This can lead to subtle hard-to-catch bugs,
1022 * hence a special grant PTE flag can be enabled to catch the bug early.
1024 * (Note that the undestroyable active grants are not a security hole in
1025 * Xen. All active grants can safely be cleaned up when the domain dies.)
1026 */
1027 if ( (l1e_get_flags(l1e) & _PAGE_GNTTAB) &&
1028 !l1e_owner->is_shutting_down && !l1e_owner->is_dying )
1029 {
1030 MEM_LOG("Attempt to implicitly unmap a granted PTE %" PRIpte,
1031 l1e_get_intpte(l1e));
1032 domain_crash(l1e_owner);
1033 }
1035 /* Remember we didn't take a type-count of foreign writable mappings
1036 * to paging-external domains */
1037 if ( (l1e_get_flags(l1e) & _PAGE_RW) &&
1038 ((l1e_owner == pg_owner) || !paging_mode_external(pg_owner)) )
1039 {
1040 put_page_and_type(page);
1041 }
1042 else
1043 {
1044 /* We expect this is rare so we blow the entire shadow LDT. */
1045 if ( unlikely(((page->u.inuse.type_info & PGT_type_mask) ==
1046 PGT_seg_desc_page)) &&
1047 unlikely(((page->u.inuse.type_info & PGT_count_mask) != 0)) &&
1048 (l1e_owner == pg_owner) )
1049 {
1050 for_each_vcpu ( pg_owner, v )
1051 invalidate_shadow_ldt(v, 1);
1052 }
1053 put_page(page);
1054 }
1055 }
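/*
 * Illustrative guest-side sketch (not part of this file): the "grant-table
 * interface" referred to in the comment above. A mapping established with
 * GNTTABOP_map_grant_ref must be torn down with GNTTABOP_unmap_grant_ref
 * rather than by overwriting the PTE directly. The variable names are
 * hypothetical; the structure and hypercall come from the public
 * grant-table interface.
 */
#if 0 /* guest-side example only -- never compiled here */
static int example_unmap_granted_page(uint64_t host_addr, grant_handle_t handle)
{
    struct gnttab_unmap_grant_ref unmap = {
        .host_addr    = host_addr, /* where the grant was mapped */
        .dev_bus_addr = 0,         /* no device mapping to remove */
        .handle       = handle,    /* returned by the earlier map op */
    };

    if ( HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &unmap, 1) )
        return -1;
    return unmap.status; /* GNTST_okay (0) on success */
}
#endif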
1058 /*
1059 * NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'.
1060 * Note also that this automatically deals correctly with linear p.t.'s.
1061 */
1062 static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
1064 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) || (l2e_get_pfn(l2e) == pfn) )
1065 return 1;
1067 if ( l2e_get_flags(l2e) & _PAGE_PSE )
1069 unsigned long mfn = l2e_get_pfn(l2e), m = mfn;
1070 int writeable = l2e_get_flags(l2e) & _PAGE_RW;
1072 ASSERT(!(mfn & (L1_PAGETABLE_ENTRIES-1)));
1073 do {
1074 put_data_page(mfn_to_page(m), writeable);
1075 } while ( m++ < (mfn + (L1_PAGETABLE_ENTRIES-1)) );
1077 else
1079 put_page_and_type(l2e_get_page(l2e));
1082 return 0;
1085 static int __put_page_type(struct page_info *, int preemptible);
1087 static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn,
1088 int partial, int preemptible)
1090 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || (l3e_get_pfn(l3e) == pfn) )
1091 return 1;
1093 #ifdef __x86_64__
1094 if ( unlikely(l3e_get_flags(l3e) & _PAGE_PSE) )
1096 unsigned long mfn = l3e_get_pfn(l3e);
1097 int writeable = l3e_get_flags(l3e) & _PAGE_RW;
1099 ASSERT(!(mfn & ((1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1)));
1100 do {
1101 put_data_page(mfn_to_page(mfn), writeable);
1102 } while ( ++mfn & ((1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1) );
1104 return 0;
1106 #endif
1108 if ( unlikely(partial > 0) )
1109 return __put_page_type(l3e_get_page(l3e), preemptible);
1111 return put_page_and_type_preemptible(l3e_get_page(l3e), preemptible);
1114 #if CONFIG_PAGING_LEVELS >= 4
1115 static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn,
1116 int partial, int preemptible)
1118 if ( (l4e_get_flags(l4e) & _PAGE_PRESENT) &&
1119 (l4e_get_pfn(l4e) != pfn) )
1121 if ( unlikely(partial > 0) )
1122 return __put_page_type(l4e_get_page(l4e), preemptible);
1123 return put_page_and_type_preemptible(l4e_get_page(l4e), preemptible);
1125 return 1;
1127 #endif
1129 static int alloc_l1_table(struct page_info *page)
1131 struct domain *d = page_get_owner(page);
1132 unsigned long pfn = page_to_mfn(page);
1133 l1_pgentry_t *pl1e;
1134 unsigned int i;
1136 pl1e = map_domain_page(pfn);
1138 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
1140 if ( is_guest_l1_slot(i) &&
1141 unlikely(!get_page_from_l1e(pl1e[i], d, d)) )
1142 goto fail;
1144 adjust_guest_l1e(pl1e[i], d);
1147 unmap_domain_page(pl1e);
1148 return 0;
1150 fail:
1151 MEM_LOG("Failure in alloc_l1_table: entry %d", i);
1152 while ( i-- > 0 )
1153 if ( is_guest_l1_slot(i) )
1154 put_page_from_l1e(pl1e[i], d);
1156 unmap_domain_page(pl1e);
1157 return -EINVAL;
1160 static int create_pae_xen_mappings(struct domain *d, l3_pgentry_t *pl3e)
1162 struct page_info *page;
1163 l2_pgentry_t *pl2e;
1164 l3_pgentry_t l3e3;
1165 #ifndef CONFIG_COMPAT
1166 l2_pgentry_t l2e;
1167 int i;
1168 #endif
1170 if ( !is_pv_32bit_domain(d) )
1171 return 1;
1173 pl3e = (l3_pgentry_t *)((unsigned long)pl3e & PAGE_MASK);
1175 /* 3rd L3 slot contains L2 with Xen-private mappings. It *must* exist. */
1176 l3e3 = pl3e[3];
1177 if ( !(l3e_get_flags(l3e3) & _PAGE_PRESENT) )
1179 MEM_LOG("PAE L3 3rd slot is empty");
1180 return 0;
1183 /*
1184 * The Xen-private mappings include linear mappings. The L2 thus cannot
1185 * be shared by multiple L3 tables. The test here is adequate because:
1186 * 1. Cannot appear in slots != 3 because get_page_type() checks the
1187 * PGT_pae_xen_l2 flag, which is asserted iff the L2 appears in slot 3
1188 * 2. Cannot appear in another page table's L3:
1189 * a. alloc_l3_table() calls this function and this check will fail
1190 * b. mod_l3_entry() disallows updates to slot 3 in an existing table
1191 */
1192 page = l3e_get_page(l3e3);
1193 BUG_ON(page->u.inuse.type_info & PGT_pinned);
1194 BUG_ON((page->u.inuse.type_info & PGT_count_mask) == 0);
1195 BUG_ON(!(page->u.inuse.type_info & PGT_pae_xen_l2));
1196 if ( (page->u.inuse.type_info & PGT_count_mask) != 1 )
1198 MEM_LOG("PAE L3 3rd slot is shared");
1199 return 0;
1202 /* Xen private mappings. */
1203 pl2e = map_domain_page(l3e_get_pfn(l3e3));
1204 #ifndef CONFIG_COMPAT
1205 memcpy(&pl2e[L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1)],
1206 &idle_pg_table_l2[L2_PAGETABLE_FIRST_XEN_SLOT],
1207 L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
1208 for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
1210 l2e = l2e_from_page(perdomain_pt_page(d, i), __PAGE_HYPERVISOR);
1211 l2e_write(&pl2e[l2_table_offset(PERDOMAIN_VIRT_START) + i], l2e);
1213 for ( i = 0; i < (LINEARPT_MBYTES >> (L2_PAGETABLE_SHIFT - 20)); i++ )
1215 l2e = l2e_empty();
1216 if ( l3e_get_flags(pl3e[i]) & _PAGE_PRESENT )
1217 l2e = l2e_from_pfn(l3e_get_pfn(pl3e[i]), __PAGE_HYPERVISOR);
1218 l2e_write(&pl2e[l2_table_offset(LINEAR_PT_VIRT_START) + i], l2e);
1220 #else
1221 memcpy(&pl2e[COMPAT_L2_PAGETABLE_FIRST_XEN_SLOT(d)],
1222 &compat_idle_pg_table_l2[
1223 l2_table_offset(HIRO_COMPAT_MPT_VIRT_START)],
1224 COMPAT_L2_PAGETABLE_XEN_SLOTS(d) * sizeof(*pl2e));
1225 #endif
1226 unmap_domain_page(pl2e);
1228 return 1;
1231 #ifdef __i386__
1232 /* Flush a pgdir update into low-memory caches. */
1233 static void pae_flush_pgd(
1234 unsigned long mfn, unsigned int idx, l3_pgentry_t nl3e)
1236 struct domain *d = page_get_owner(mfn_to_page(mfn));
1237 struct vcpu *v;
1238 intpte_t _ol3e, _nl3e, _pl3e;
1239 l3_pgentry_t *l3tab_ptr;
1240 struct pae_l3_cache *cache;
1242 if ( unlikely(shadow_mode_enabled(d)) )
1244 cpumask_t m = CPU_MASK_NONE;
1245 /* Re-shadow this l3 table on any vcpus that are using it */
1246 for_each_vcpu ( d, v )
1247 if ( pagetable_get_pfn(v->arch.guest_table) == mfn )
1249 paging_update_cr3(v);
1250 cpus_or(m, m, v->vcpu_dirty_cpumask);
1252 flush_tlb_mask(&m);
1255 /* If below 4GB then the pgdir is not shadowed in low memory. */
1256 if ( !l3tab_needs_shadow(mfn) )
1257 return;
1259 for_each_vcpu ( d, v )
1261 cache = &v->arch.pae_l3_cache;
1263 spin_lock(&cache->lock);
1265 if ( cache->high_mfn == mfn )
1267 l3tab_ptr = &cache->table[cache->inuse_idx][idx];
1268 _ol3e = l3e_get_intpte(*l3tab_ptr);
1269 _nl3e = l3e_get_intpte(nl3e);
1270 _pl3e = cmpxchg(&l3e_get_intpte(*l3tab_ptr), _ol3e, _nl3e);
1271 BUG_ON(_pl3e != _ol3e);
1274 spin_unlock(&cache->lock);
1277 flush_tlb_mask(&d->domain_dirty_cpumask);
1279 #else
1280 # define pae_flush_pgd(mfn, idx, nl3e) ((void)0)
1281 #endif
1283 static int alloc_l2_table(struct page_info *page, unsigned long type,
1284 int preemptible)
1286 struct domain *d = page_get_owner(page);
1287 unsigned long pfn = page_to_mfn(page);
1288 l2_pgentry_t *pl2e;
1289 unsigned int i;
1290 int rc = 0;
1292 pl2e = map_domain_page(pfn);
1294 for ( i = page->nr_validated_ptes; i < L2_PAGETABLE_ENTRIES; i++ )
1296 if ( preemptible && i && hypercall_preempt_check() )
1298 page->nr_validated_ptes = i;
1299 rc = -EAGAIN;
1300 break;
1303 if ( !is_guest_l2_slot(d, type, i) ||
1304 (rc = get_page_from_l2e(pl2e[i], pfn, d)) > 0 )
1305 continue;
1307 if ( rc < 0 )
1309 MEM_LOG("Failure in alloc_l2_table: entry %d", i);
1310 while ( i-- > 0 )
1311 if ( is_guest_l2_slot(d, type, i) )
1312 put_page_from_l2e(pl2e[i], pfn);
1313 break;
1316 adjust_guest_l2e(pl2e[i], d);
1319 unmap_domain_page(pl2e);
1320 return rc > 0 ? 0 : rc;
1323 static int alloc_l3_table(struct page_info *page, int preemptible)
1325 struct domain *d = page_get_owner(page);
1326 unsigned long pfn = page_to_mfn(page);
1327 l3_pgentry_t *pl3e;
1328 unsigned int i;
1329 int rc = 0, partial = page->partial_pte;
1331 #if CONFIG_PAGING_LEVELS == 3
1332 /*
1333 * PAE pgdirs above 4GB are unacceptable if the guest does not understand
1334 * the weird 'extended cr3' format for dealing with high-order address
1335 * bits. We cut some slack for control tools (before vcpu0 is initialised).
1336 */
1337 if ( (pfn >= 0x100000) &&
1338 unlikely(!VM_ASSIST(d, VMASST_TYPE_pae_extended_cr3)) &&
1339 d->vcpu[0] && d->vcpu[0]->is_initialised )
1341 MEM_LOG("PAE pgd must be below 4GB (0x%lx >= 0x100000)", pfn);
1342 return -EINVAL;
1344 #endif
1346 pl3e = map_domain_page(pfn);
1348 /*
1349 * PAE guests allocate full pages, but aren't required to initialize
1350 * more than the first four entries; when running in compatibility
1351 * mode, however, the full page is visible to the MMU, and hence all
1352 * 512 entries must be valid/verified, which is most easily achieved
1353 * by clearing them out.
1354 */
1355 if ( is_pv_32on64_domain(d) )
1356 memset(pl3e + 4, 0, (L3_PAGETABLE_ENTRIES - 4) * sizeof(*pl3e));
1358 for ( i = page->nr_validated_ptes; i < L3_PAGETABLE_ENTRIES;
1359 i++, partial = 0 )
1361 if ( is_pv_32bit_domain(d) && (i == 3) )
1363 if ( !(l3e_get_flags(pl3e[i]) & _PAGE_PRESENT) ||
1364 (l3e_get_flags(pl3e[i]) & l3_disallow_mask(d)) )
1365 rc = -EINVAL;
1366 else
1367 rc = get_page_and_type_from_pagenr(l3e_get_pfn(pl3e[i]),
1368 PGT_l2_page_table |
1369 PGT_pae_xen_l2,
1370 d, partial, preemptible);
1372 else if ( !is_guest_l3_slot(i) ||
1373 (rc = get_page_from_l3e(pl3e[i], pfn, d,
1374 partial, preemptible)) > 0 )
1375 continue;
1377 if ( rc == -EAGAIN )
1379 page->nr_validated_ptes = i;
1380 page->partial_pte = partial ?: 1;
1382 else if ( rc == -EINTR && i )
1384 page->nr_validated_ptes = i;
1385 page->partial_pte = 0;
1386 rc = -EAGAIN;
1388 if ( rc < 0 )
1389 break;
1391 adjust_guest_l3e(pl3e[i], d);
1394 if ( rc >= 0 && !create_pae_xen_mappings(d, pl3e) )
1395 rc = -EINVAL;
1396 if ( rc < 0 && rc != -EAGAIN && rc != -EINTR )
1398 MEM_LOG("Failure in alloc_l3_table: entry %d", i);
1399 while ( i-- > 0 )
1401 if ( !is_guest_l3_slot(i) )
1402 continue;
1403 unadjust_guest_l3e(pl3e[i], d);
1404 put_page_from_l3e(pl3e[i], pfn, 0, 0);
1408 unmap_domain_page(pl3e);
1409 return rc > 0 ? 0 : rc;
1412 #if CONFIG_PAGING_LEVELS >= 4
1413 static int alloc_l4_table(struct page_info *page, int preemptible)
1415 struct domain *d = page_get_owner(page);
1416 unsigned long pfn = page_to_mfn(page);
1417 l4_pgentry_t *pl4e = page_to_virt(page);
1418 unsigned int i;
1419 int rc = 0, partial = page->partial_pte;
1421 for ( i = page->nr_validated_ptes; i < L4_PAGETABLE_ENTRIES;
1422 i++, partial = 0 )
1424 if ( !is_guest_l4_slot(d, i) ||
1425 (rc = get_page_from_l4e(pl4e[i], pfn, d,
1426 partial, preemptible)) > 0 )
1427 continue;
1429 if ( rc == -EAGAIN )
1431 page->nr_validated_ptes = i;
1432 page->partial_pte = partial ?: 1;
1434 else if ( rc == -EINTR )
1436 if ( i )
1438 page->nr_validated_ptes = i;
1439 page->partial_pte = 0;
1440 rc = -EAGAIN;
1443 else if ( rc < 0 )
1445 MEM_LOG("Failure in alloc_l4_table: entry %d", i);
1446 while ( i-- > 0 )
1447 if ( is_guest_l4_slot(d, i) )
1448 put_page_from_l4e(pl4e[i], pfn, 0, 0);
1450 if ( rc < 0 )
1451 return rc;
1453 adjust_guest_l4e(pl4e[i], d);
1456 /* Xen private mappings. */
1457 memcpy(&pl4e[ROOT_PAGETABLE_FIRST_XEN_SLOT],
1458 &idle_pg_table[ROOT_PAGETABLE_FIRST_XEN_SLOT],
1459 ROOT_PAGETABLE_XEN_SLOTS * sizeof(l4_pgentry_t));
1460 pl4e[l4_table_offset(LINEAR_PT_VIRT_START)] =
1461 l4e_from_pfn(pfn, __PAGE_HYPERVISOR);
1462 pl4e[l4_table_offset(PERDOMAIN_VIRT_START)] =
1463 l4e_from_page(virt_to_page(d->arch.mm_perdomain_l3),
1464 __PAGE_HYPERVISOR);
1466 return rc > 0 ? 0 : rc;
1468 #else
1469 #define alloc_l4_table(page, preemptible) (-EINVAL)
1470 #endif
1473 static void free_l1_table(struct page_info *page)
1475 struct domain *d = page_get_owner(page);
1476 unsigned long pfn = page_to_mfn(page);
1477 l1_pgentry_t *pl1e;
1478 unsigned int i;
1480 pl1e = map_domain_page(pfn);
1482 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
1483 if ( is_guest_l1_slot(i) )
1484 put_page_from_l1e(pl1e[i], d);
1486 unmap_domain_page(pl1e);
1490 static int free_l2_table(struct page_info *page, int preemptible)
1492 #ifdef CONFIG_COMPAT
1493 struct domain *d = page_get_owner(page);
1494 #endif
1495 unsigned long pfn = page_to_mfn(page);
1496 l2_pgentry_t *pl2e;
1497 unsigned int i = page->nr_validated_ptes - 1;
1498 int err = 0;
1500 pl2e = map_domain_page(pfn);
1502 ASSERT(page->nr_validated_ptes);
1503 do {
1504 if ( is_guest_l2_slot(d, page->u.inuse.type_info, i) &&
1505 put_page_from_l2e(pl2e[i], pfn) == 0 &&
1506 preemptible && i && hypercall_preempt_check() )
1508 page->nr_validated_ptes = i;
1509 err = -EAGAIN;
1511 } while ( !err && i-- );
1513 unmap_domain_page(pl2e);
1515 if ( !err )
1516 page->u.inuse.type_info &= ~PGT_pae_xen_l2;
1518 return err;
1521 static int free_l3_table(struct page_info *page, int preemptible)
1523 struct domain *d = page_get_owner(page);
1524 unsigned long pfn = page_to_mfn(page);
1525 l3_pgentry_t *pl3e;
1526 int rc = 0, partial = page->partial_pte;
1527 unsigned int i = page->nr_validated_ptes - !partial;
1529 pl3e = map_domain_page(pfn);
1531 do {
1532 if ( is_guest_l3_slot(i) )
1534 rc = put_page_from_l3e(pl3e[i], pfn, partial, preemptible);
1535 if ( rc < 0 )
1536 break;
1537 partial = 0;
1538 if ( rc > 0 )
1539 continue;
1540 unadjust_guest_l3e(pl3e[i], d);
1542 } while ( i-- );
1544 unmap_domain_page(pl3e);
1546 if ( rc == -EAGAIN )
1548 page->nr_validated_ptes = i;
1549 page->partial_pte = partial ?: -1;
1551 else if ( rc == -EINTR && i < L3_PAGETABLE_ENTRIES - 1 )
1553 page->nr_validated_ptes = i + 1;
1554 page->partial_pte = 0;
1555 rc = -EAGAIN;
1557 return rc > 0 ? 0 : rc;
1560 #if CONFIG_PAGING_LEVELS >= 4
1561 static int free_l4_table(struct page_info *page, int preemptible)
1563 struct domain *d = page_get_owner(page);
1564 unsigned long pfn = page_to_mfn(page);
1565 l4_pgentry_t *pl4e = page_to_virt(page);
1566 int rc = 0, partial = page->partial_pte;
1567 unsigned int i = page->nr_validated_ptes - !partial;
1569 do {
1570 if ( is_guest_l4_slot(d, i) )
1571 rc = put_page_from_l4e(pl4e[i], pfn, partial, preemptible);
1572 if ( rc < 0 )
1573 break;
1574 partial = 0;
1575 } while ( i-- );
1577 if ( rc == -EAGAIN )
1579 page->nr_validated_ptes = i;
1580 page->partial_pte = partial ?: -1;
1582 else if ( rc == -EINTR && i < L4_PAGETABLE_ENTRIES - 1 )
1584 page->nr_validated_ptes = i + 1;
1585 page->partial_pte = 0;
1586 rc = -EAGAIN;
1588 return rc > 0 ? 0 : rc;
1590 #else
1591 #define free_l4_table(page, preemptible) (-EINVAL)
1592 #endif
1594 static int page_lock(struct page_info *page)
1596 unsigned long x, nx;
1598 do {
1599 while ( (x = page->u.inuse.type_info) & PGT_locked )
1600 cpu_relax();
1601 nx = x + (1 | PGT_locked);
1602 if ( !(x & PGT_validated) ||
1603 !(x & PGT_count_mask) ||
1604 !(nx & PGT_count_mask) )
1605 return 0;
1606 } while ( cmpxchg(&page->u.inuse.type_info, x, nx) != x );
1608 return 1;
1611 static void page_unlock(struct page_info *page)
1613 unsigned long x, nx, y = page->u.inuse.type_info;
1615 do {
1616 x = y;
1617 nx = x - (1 | PGT_locked);
1618 } while ( (y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x );
1621 /* How to write an entry to the guest pagetables.
1622 * Returns 0 for failure (pointer not valid), 1 for success. */
1623 static inline int update_intpte(intpte_t *p,
1624 intpte_t old,
1625 intpte_t new,
1626 unsigned long mfn,
1627 struct vcpu *v,
1628 int preserve_ad)
1630 int rv = 1;
1631 #ifndef PTE_UPDATE_WITH_CMPXCHG
1632 if ( !preserve_ad )
1634 rv = paging_write_guest_entry(v, p, new, _mfn(mfn));
1636 else
1637 #endif
1639 intpte_t t = old;
1640 for ( ; ; )
1642 intpte_t _new = new;
1643 if ( preserve_ad )
1644 _new |= old & (_PAGE_ACCESSED | _PAGE_DIRTY);
1646 rv = paging_cmpxchg_guest_entry(v, p, &t, _new, _mfn(mfn));
1647 if ( unlikely(rv == 0) )
1649 MEM_LOG("Failed to update %" PRIpte " -> %" PRIpte
1650 ": saw %" PRIpte, old, _new, t);
1651 break;
1654 if ( t == old )
1655 break;
1657 /* Allowed to change in Accessed/Dirty flags only. */
1658 BUG_ON((t ^ old) & ~(intpte_t)(_PAGE_ACCESSED|_PAGE_DIRTY));
1660 old = t;
1663 return rv;
1666 /* Macro that wraps the appropriate type-changes around update_intpte().
1667 * Arguments are: type, ptr, old, new, mfn, vcpu */
1668 #define UPDATE_ENTRY(_t,_p,_o,_n,_m,_v,_ad) \
1669 update_intpte(&_t ## e_get_intpte(*(_p)), \
1670 _t ## e_get_intpte(_o), _t ## e_get_intpte(_n), \
1671 (_m), (_v), (_ad))
1673 /* Update the L1 entry at pl1e to new value nl1e. */
1674 static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e,
1675 unsigned long gl1mfn, int preserve_ad,
1676 struct vcpu *vcpu)
1678 l1_pgentry_t ol1e;
1679 struct domain *d = vcpu->domain;
1680 unsigned long mfn;
1681 p2m_type_t p2mt;
1682 int rc = 1;
1684 if ( unlikely(__copy_from_user(&ol1e, pl1e, sizeof(ol1e)) != 0) )
1685 return 0;
1687 if ( unlikely(paging_mode_refcounts(d)) )
1689 rc = UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, vcpu, preserve_ad);
1690 return rc;
1693 if ( l1e_get_flags(nl1e) & _PAGE_PRESENT )
1695 /* Translate foreign guest addresses. */
1696 mfn = mfn_x(gfn_to_mfn(FOREIGNDOM, l1e_get_pfn(nl1e), &p2mt));
1697 if ( !p2m_is_ram(p2mt) || unlikely(mfn == INVALID_MFN) )
1698 return 0;
1699 ASSERT((mfn & ~(PADDR_MASK >> PAGE_SHIFT)) == 0);
1700 nl1e = l1e_from_pfn(mfn, l1e_get_flags(nl1e));
1702 if ( unlikely(l1e_get_flags(nl1e) & l1_disallow_mask(d)) )
1704 MEM_LOG("Bad L1 flags %x",
1705 l1e_get_flags(nl1e) & l1_disallow_mask(d));
1706 return 0;
1709 /* Fast path for identical mapping, r/w and presence. */
1710 if ( !l1e_has_changed(ol1e, nl1e, _PAGE_RW | _PAGE_PRESENT) )
1712 adjust_guest_l1e(nl1e, d);
1713 rc = UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, vcpu,
1714 preserve_ad);
1715 return rc;
1718 if ( unlikely(!get_page_from_l1e(nl1e, d, FOREIGNDOM)) )
1719 return 0;
1721 adjust_guest_l1e(nl1e, d);
1722 if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, vcpu,
1723 preserve_ad)) )
1725 ol1e = nl1e;
1726 rc = 0;
1729 else if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, vcpu,
1730 preserve_ad)) )
1732 return 0;
1735 put_page_from_l1e(ol1e, d);
1736 return rc;
1740 /* Update the L2 entry at pl2e to new value nl2e. pl2e is within frame pfn. */
1741 static int mod_l2_entry(l2_pgentry_t *pl2e,
1742 l2_pgentry_t nl2e,
1743 unsigned long pfn,
1744 int preserve_ad,
1745 struct vcpu *vcpu)
1747 l2_pgentry_t ol2e;
1748 struct domain *d = vcpu->domain;
1749 struct page_info *l2pg = mfn_to_page(pfn);
1750 unsigned long type = l2pg->u.inuse.type_info;
1751 int rc = 1;
1753 if ( unlikely(!is_guest_l2_slot(d, type, pgentry_ptr_to_slot(pl2e))) )
1755 MEM_LOG("Illegal L2 update attempt in Xen-private area %p", pl2e);
1756 return 0;
1759 if ( unlikely(__copy_from_user(&ol2e, pl2e, sizeof(ol2e)) != 0) )
1760 return 0;
1762 if ( l2e_get_flags(nl2e) & _PAGE_PRESENT )
1764 if ( unlikely(l2e_get_flags(nl2e) & L2_DISALLOW_MASK) )
1766 MEM_LOG("Bad L2 flags %x",
1767 l2e_get_flags(nl2e) & L2_DISALLOW_MASK);
1768 return 0;
1771 /* Fast path for identical mapping and presence. */
1772 if ( !l2e_has_changed(ol2e, nl2e, _PAGE_PRESENT) )
1774 adjust_guest_l2e(nl2e, d);
1775 rc = UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, vcpu, preserve_ad);
1776 return rc;
1779 if ( unlikely(get_page_from_l2e(nl2e, pfn, d) < 0) )
1780 return 0;
1782 adjust_guest_l2e(nl2e, d);
1783 if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, vcpu,
1784 preserve_ad)) )
1786 ol2e = nl2e;
1787 rc = 0;
1790 else if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, vcpu,
1791 preserve_ad)) )
1793 return 0;
1796 put_page_from_l2e(ol2e, pfn);
1797 return rc;
1800 /* Update the L3 entry at pl3e to new value nl3e. pl3e is within frame pfn. */
1801 static int mod_l3_entry(l3_pgentry_t *pl3e,
1802 l3_pgentry_t nl3e,
1803 unsigned long pfn,
1804 int preserve_ad,
1805 int preemptible,
1806 struct vcpu *vcpu)
1808 l3_pgentry_t ol3e;
1809 struct domain *d = vcpu->domain;
1810 int rc = 0;
1812 if ( unlikely(!is_guest_l3_slot(pgentry_ptr_to_slot(pl3e))) )
1814 MEM_LOG("Illegal L3 update attempt in Xen-private area %p", pl3e);
1815 return -EINVAL;
1818 /*
1819 * Disallow updates to final L3 slot. It contains Xen mappings, and it
1820 * would be a pain to ensure they remain continuously valid throughout.
1821 */
1822 if ( is_pv_32bit_domain(d) && (pgentry_ptr_to_slot(pl3e) >= 3) )
1823 return -EINVAL;
1825 if ( unlikely(__copy_from_user(&ol3e, pl3e, sizeof(ol3e)) != 0) )
1826 return -EFAULT;
1828 if ( l3e_get_flags(nl3e) & _PAGE_PRESENT )
1830 if ( unlikely(l3e_get_flags(nl3e) & l3_disallow_mask(d)) )
1832 MEM_LOG("Bad L3 flags %x",
1833 l3e_get_flags(nl3e) & l3_disallow_mask(d));
1834 return -EINVAL;
1837 /* Fast path for identical mapping and presence. */
1838 if ( !l3e_has_changed(ol3e, nl3e, _PAGE_PRESENT) )
1840 adjust_guest_l3e(nl3e, d);
1841 rc = UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, vcpu, preserve_ad);
1842 return rc ? 0 : -EFAULT;
1845 rc = get_page_from_l3e(nl3e, pfn, d, 0, preemptible);
1846 if ( unlikely(rc < 0) )
1847 return rc;
1848 rc = 0;
1850 adjust_guest_l3e(nl3e, d);
1851 if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, vcpu,
1852 preserve_ad)) )
1854 ol3e = nl3e;
1855 rc = -EFAULT;
1858 else if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, vcpu,
1859 preserve_ad)) )
1861 return -EFAULT;
1864 if ( likely(rc == 0) )
1866 if ( !create_pae_xen_mappings(d, pl3e) )
1867 BUG();
1869 pae_flush_pgd(pfn, pgentry_ptr_to_slot(pl3e), nl3e);
1872 put_page_from_l3e(ol3e, pfn, 0, 0);
1873 return rc;
1876 #if CONFIG_PAGING_LEVELS >= 4
1878 /* Update the L4 entry at pl4e to new value nl4e. pl4e is within frame pfn. */
1879 static int mod_l4_entry(l4_pgentry_t *pl4e,
1880 l4_pgentry_t nl4e,
1881 unsigned long pfn,
1882 int preserve_ad,
1883 int preemptible,
1884 struct vcpu *vcpu)
1886 struct domain *d = vcpu->domain;
1887 l4_pgentry_t ol4e;
1888 int rc = 0;
1890 if ( unlikely(!is_guest_l4_slot(d, pgentry_ptr_to_slot(pl4e))) )
1892 MEM_LOG("Illegal L4 update attempt in Xen-private area %p", pl4e);
1893 return -EINVAL;
1896 if ( unlikely(__copy_from_user(&ol4e, pl4e, sizeof(ol4e)) != 0) )
1897 return -EFAULT;
1899 if ( l4e_get_flags(nl4e) & _PAGE_PRESENT )
1901 if ( unlikely(l4e_get_flags(nl4e) & L4_DISALLOW_MASK) )
1903 MEM_LOG("Bad L4 flags %x",
1904 l4e_get_flags(nl4e) & L4_DISALLOW_MASK);
1905 return -EINVAL;
1908 /* Fast path for identical mapping and presence. */
1909 if ( !l4e_has_changed(ol4e, nl4e, _PAGE_PRESENT) )
1911 adjust_guest_l4e(nl4e, d);
1912 rc = UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, vcpu, preserve_ad);
1913 return rc ? 0 : -EFAULT;
1916 rc = get_page_from_l4e(nl4e, pfn, d, 0, preemptible);
1917 if ( unlikely(rc < 0) )
1918 return rc;
1919 rc = 0;
1921 adjust_guest_l4e(nl4e, d);
1922 if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, vcpu,
1923 preserve_ad)) )
1925 ol4e = nl4e;
1926 rc = -EFAULT;
1929 else if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, vcpu,
1930 preserve_ad)) )
1932 return -EFAULT;
1935 put_page_from_l4e(ol4e, pfn, 0, 0);
1936 return rc;
1939 #endif
1941 void put_page(struct page_info *page)
1943 unsigned long nx, x, y = page->count_info;
1945 do {
1946 ASSERT((y & PGC_count_mask) != 0);
1947 x = y;
1948 nx = x - 1;
1950 while ( unlikely((y = cmpxchg(&page->count_info, x, nx)) != x) );
1952 if ( unlikely((nx & PGC_count_mask) == 0) )
1954 cleanup_page_cacheattr(page);
1955 free_domheap_page(page);
1960 struct domain *page_get_owner_and_reference(struct page_info *page)
1962 unsigned long x, y = page->count_info;
1964 do {
1965 x = y;
1966 /*
1967 * Count == 0: Page is not allocated, so we cannot take a reference.
1968 * Count == -1: Reference count would wrap, which is invalid.
1969 * Count == -2: Remaining unused ref is reserved for get_page_light().
1970 */
1971 if ( unlikely(((x + 2) & PGC_count_mask) <= 2) )
1972 return NULL;
1974 while ( (y = cmpxchg(&page->count_info, x, x + 1)) != x );
1976 return page_get_owner(page);
1980 int get_page(struct page_info *page, struct domain *domain)
1982 struct domain *owner = page_get_owner_and_reference(page);
1984 if ( likely(owner == domain) )
1985 return 1;
1987 if ( owner != NULL )
1988 put_page(page);
1990 if ( !_shadow_mode_refcounts(domain) && !domain->is_dying )
1991 gdprintk(XENLOG_INFO,
1992 "Error pfn %lx: rd=%p, od=%p, caf=%08lx, taf=%"
1993 PRtype_info "\n",
1994 page_to_mfn(page), domain, owner,
1995 page->count_info, page->u.inuse.type_info);
1996 return 0;
1999 /*
2000 * Special version of get_page() to be used exclusively when
2001 * - a page is known to already have a non-zero reference count
2002 * - the page does not need its owner to be checked
2003 * - it will not be called more than once without dropping the thus
2004 * acquired reference again.
2005 * Due to get_page() reserving one reference, this call cannot fail.
2006 */
2007 static void get_page_light(struct page_info *page)
2009 unsigned long x, nx, y = page->count_info;
2011 do {
2012 x = y;
2013 nx = x + 1;
2014 BUG_ON(!(x & PGC_count_mask)); /* Not allocated? */
2015 BUG_ON(!(nx & PGC_count_mask)); /* Overflow? */
2016 y = cmpxchg(&page->count_info, x, nx);
2018 while ( unlikely(y != x) );
2021 static int alloc_page_type(struct page_info *page, unsigned long type,
2022 int preemptible)
2024 struct domain *owner = page_get_owner(page);
2025 int rc;
2027 /* A page table is dirtied when its type count becomes non-zero. */
2028 if ( likely(owner != NULL) )
2029 paging_mark_dirty(owner, page_to_mfn(page));
2031 switch ( type & PGT_type_mask )
2033 case PGT_l1_page_table:
2034 rc = alloc_l1_table(page);
2035 break;
2036 case PGT_l2_page_table:
2037 rc = alloc_l2_table(page, type, preemptible);
2038 break;
2039 case PGT_l3_page_table:
2040 rc = alloc_l3_table(page, preemptible);
2041 break;
2042 case PGT_l4_page_table:
2043 rc = alloc_l4_table(page, preemptible);
2044 break;
2045 case PGT_seg_desc_page:
2046 rc = alloc_segdesc_page(page);
2047 break;
2048 default:
2049 printk("Bad type in alloc_page_type %lx t=%" PRtype_info " c=%lx\n",
2050 type, page->u.inuse.type_info,
2051 page->count_info);
2052 rc = -EINVAL;
2053 BUG();
2056 /* No need for atomic update of type_info here: no one else updates it. */
2057 wmb();
2058 if ( rc == -EAGAIN )
2060 get_page_light(page);
2061 page->u.inuse.type_info |= PGT_partial;
2063 else if ( rc == -EINTR )
2065 ASSERT((page->u.inuse.type_info &
2066 (PGT_count_mask|PGT_validated|PGT_partial)) == 1);
2067 page->u.inuse.type_info &= ~PGT_count_mask;
2069 else if ( rc )
2071 ASSERT(rc < 0);
2072 MEM_LOG("Error while validating mfn %lx (pfn %lx) for type %"
2073 PRtype_info ": caf=%08lx taf=%" PRtype_info,
2074 page_to_mfn(page), get_gpfn_from_mfn(page_to_mfn(page)),
2075 type, page->count_info, page->u.inuse.type_info);
2076 page->u.inuse.type_info = 0;
2078 else
2080 page->u.inuse.type_info |= PGT_validated;
2083 return rc;
2087 int free_page_type(struct page_info *page, unsigned long type,
2088 int preemptible)
2090 struct domain *owner = page_get_owner(page);
2091 unsigned long gmfn;
2092 int rc;
2094 if ( likely(owner != NULL) && unlikely(paging_mode_enabled(owner)) )
2096 /* A page table is dirtied when its type count becomes zero. */
2097 paging_mark_dirty(owner, page_to_mfn(page));
2099 if ( shadow_mode_refcounts(owner) )
2100 return 0;
2102 gmfn = mfn_to_gmfn(owner, page_to_mfn(page));
2103 ASSERT(VALID_M2P(gmfn));
2104 shadow_remove_all_shadows(owner->vcpu[0], _mfn(gmfn));
2107 if ( !(type & PGT_partial) )
2109 page->nr_validated_ptes = 1U << PAGETABLE_ORDER;
2110 page->partial_pte = 0;
2113 switch ( type & PGT_type_mask )
2115 case PGT_l1_page_table:
2116 free_l1_table(page);
2117 rc = 0;
2118 break;
2119 case PGT_l2_page_table:
2120 rc = free_l2_table(page, preemptible);
2121 break;
2122 case PGT_l3_page_table:
2123 #if CONFIG_PAGING_LEVELS == 3
2124 if ( !(type & PGT_partial) )
2125 page->nr_validated_ptes = L3_PAGETABLE_ENTRIES;
2126 #endif
2127 rc = free_l3_table(page, preemptible);
2128 break;
2129 case PGT_l4_page_table:
2130 rc = free_l4_table(page, preemptible);
2131 break;
2132 default:
2133 MEM_LOG("type %lx pfn %lx\n", type, page_to_mfn(page));
2134 rc = -EINVAL;
2135 BUG();
2138 return rc;
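/*
 * Sketch of the return-value handling in __put_final_page_type() below
 * (descriptive only, derived from the code that follows):
 *   rc == 0       -> the free completed; stamp tlbflush_timestamp (unless the
 *                    page is a shadowed page table) and drop the final type
 *                    reference.
 *   rc == -EINTR  -> the free was preempted before making progress; the page
 *                    stays validated with its single type reference.
 *   rc == -EAGAIN -> the free was preempted part-way; mark the page
 *                    PGT_partial and take a light general reference so it
 *                    cannot disappear before the operation is restarted.
 */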
2142 static int __put_final_page_type(
2143 struct page_info *page, unsigned long type, int preemptible)
2145 int rc = free_page_type(page, type, preemptible);
2147 /* No need for atomic update of type_info here: no one else updates it. */
2148 if ( rc == 0 )
2150 /*
2151 * Record TLB information for flush later. We do not stamp page tables
2152 * when running in shadow mode:
2153 * 1. Pointless, since it's the shadow page tables which must be tracked.
2154 * 2. Shadow mode reuses this field for shadowed page tables to
2155 * store flags info -- we don't want to conflict with that.
2156 */
2157 if ( !(shadow_mode_enabled(page_get_owner(page)) &&
2158 (page->count_info & PGC_page_table)) )
2159 page->tlbflush_timestamp = tlbflush_current_time();
2160 wmb();
2161 page->u.inuse.type_info--;
2163 else if ( rc == -EINTR )
2165 ASSERT((page->u.inuse.type_info &
2166 (PGT_count_mask|PGT_validated|PGT_partial)) == 1);
2167 if ( !(shadow_mode_enabled(page_get_owner(page)) &&
2168 (page->count_info & PGC_page_table)) )
2169 page->tlbflush_timestamp = tlbflush_current_time();
2170 wmb();
2171 page->u.inuse.type_info |= PGT_validated;
2173 else
2175 BUG_ON(rc != -EAGAIN);
2176 wmb();
2177 get_page_light(page);
2178 page->u.inuse.type_info |= PGT_partial;
2181 return rc;
2185 static int __put_page_type(struct page_info *page,
2186 int preemptible)
2188 unsigned long nx, x, y = page->u.inuse.type_info;
2189 int rc = 0;
2191 for ( ; ; )
2193 x = y;
2194 nx = x - 1;
2196 ASSERT((x & PGT_count_mask) != 0);
2198 if ( unlikely((nx & PGT_count_mask) == 0) )
2200 if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
2201 likely(nx & (PGT_validated|PGT_partial)) )
2203 /*
2204 * Page-table pages must be unvalidated when count is zero. The
2205 * 'free' is safe because the refcnt is non-zero and validated
2206 * bit is clear => other ops will spin or fail.
2207 */
2208 nx = x & ~(PGT_validated|PGT_partial);
2209 if ( unlikely((y = cmpxchg(&page->u.inuse.type_info,
2210 x, nx)) != x) )
2211 continue;
2212 /* We cleared the 'valid bit' so we do the cleanup. */
2213 rc = __put_final_page_type(page, x, preemptible);
2214 if ( x & PGT_partial )
2215 put_page(page);
2216 break;
2219 /*
2220 * Record TLB information for flush later. We do not stamp page
2221 * tables when running in shadow mode:
2222 * 1. Pointless, since it's the shadow page tables which must be tracked.
2223 * 2. Shadow mode reuses this field for shadowed page tables to
2224 * store flags info -- we don't want to conflict with that.
2225 */
2226 if ( !(shadow_mode_enabled(page_get_owner(page)) &&
2227 (page->count_info & PGC_page_table)) )
2228 page->tlbflush_timestamp = tlbflush_current_time();
2231 if ( likely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) == x) )
2232 break;
2234 if ( preemptible && hypercall_preempt_check() )
2235 return -EINTR;
2238 return rc;
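/*
 * Informal summary of the type_info fields exercised by __get_page_type()
 * below (descriptive only):
 *   PGT_count_mask  - number of outstanding type references.
 *   PGT_type_mask   - current type (L1..L4 page table, segdesc, writable).
 *   PGT_validated   - the page's contents have been validated for its type.
 *   PGT_partial     - a preempted validation/devalidation is in progress.
 *   PGT_pae_xen_l2  - set on a PAE L2 that carries the Xen mappings
 *                     (asserted clear whenever the count is zero).
 * A type change is only permitted when the count rises from zero, and a
 * caller racing with an in-progress validation either waits or resumes the
 * partial work.
 */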
2242 static int __get_page_type(struct page_info *page, unsigned long type,
2243 int preemptible)
2245 unsigned long nx, x, y = page->u.inuse.type_info;
2246 int rc = 0;
2248 ASSERT(!(type & ~(PGT_type_mask | PGT_pae_xen_l2)));
2250 for ( ; ; )
2252 x = y;
2253 nx = x + 1;
2254 if ( unlikely((nx & PGT_count_mask) == 0) )
2256 MEM_LOG("Type count overflow on pfn %lx", page_to_mfn(page));
2257 return -EINVAL;
2259 else if ( unlikely((x & PGT_count_mask) == 0) )
2261 struct domain *d = page_get_owner(page);
2263 /* Normally we should never let a page go from type count 0
2264 * to type count 1 when it is shadowed. One exception:
2265 * out-of-sync shadowed pages are allowed to become
2266 * writeable. */
2267 if ( d && shadow_mode_enabled(d)
2268 && (page->count_info & PGC_page_table)
2269 && !((page->shadow_flags & (1u<<29))
2270 && type == PGT_writable_page) )
2271 shadow_remove_all_shadows(d->vcpu[0], _mfn(page_to_mfn(page)));
2273 ASSERT(!(x & PGT_pae_xen_l2));
2274 if ( (x & PGT_type_mask) != type )
2276 /*
2277 * On a type change we check whether stale TLB entries need
2278 * flushing. This may be unnecessary (e.g., the page was a
2279 * GDT/LDT) but such circumstances should be very rare.
2280 */
2281 cpumask_t mask = d->domain_dirty_cpumask;
2283 /* Don't flush if the timestamp is old enough */
2284 tlbflush_filter(mask, page->tlbflush_timestamp);
2286 if ( unlikely(!cpus_empty(mask)) &&
2287 /* Shadow mode: track only writable pages. */
2288 (!shadow_mode_enabled(page_get_owner(page)) ||
2289 ((nx & PGT_type_mask) == PGT_writable_page)) )
2291 perfc_incr(need_flush_tlb_flush);
2292 flush_tlb_mask(&mask);
2295 /* We lose existing type and validity. */
2296 nx &= ~(PGT_type_mask | PGT_validated);
2297 nx |= type;
2299 /* No special validation needed for writable pages. */
2300 /* Page tables and GDT/LDT need to be scanned for validity. */
2301 if ( type == PGT_writable_page )
2302 nx |= PGT_validated;
2305 else if ( unlikely((x & (PGT_type_mask|PGT_pae_xen_l2)) != type) )
2307 /* Don't log failure if it could be a recursive-mapping attempt. */
2308 if ( ((x & PGT_type_mask) == PGT_l2_page_table) &&
2309 (type == PGT_l1_page_table) )
2310 return -EINVAL;
2311 if ( ((x & PGT_type_mask) == PGT_l3_page_table) &&
2312 (type == PGT_l2_page_table) )
2313 return -EINVAL;
2314 if ( ((x & PGT_type_mask) == PGT_l4_page_table) &&
2315 (type == PGT_l3_page_table) )
2316 return -EINVAL;
2317 MEM_LOG("Bad type (saw %" PRtype_info " != exp %" PRtype_info ") "
2318 "for mfn %lx (pfn %lx)",
2319 x, type, page_to_mfn(page),
2320 get_gpfn_from_mfn(page_to_mfn(page)));
2321 return -EINVAL;
2323 else if ( unlikely(!(x & PGT_validated)) )
2325 if ( !(x & PGT_partial) )
2327 /* Someone else is updating validation of this page. Wait... */
2328 while ( (y = page->u.inuse.type_info) == x )
2330 if ( preemptible && hypercall_preempt_check() )
2331 return -EINTR;
2332 cpu_relax();
2334 continue;
2336 /* Type ref count was left at 1 when PGT_partial got set. */
2337 ASSERT((x & PGT_count_mask) == 1);
2338 nx = x & ~PGT_partial;
2341 if ( likely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) == x) )
2342 break;
2344 if ( preemptible && hypercall_preempt_check() )
2345 return -EINTR;
2348 if ( unlikely((x & PGT_type_mask) != type) )
2350 /* Special pages should not be accessible from devices. */
2351 struct domain *d = page_get_owner(page);
2352 if ( d && unlikely(need_iommu(d)) )
2354 if ( (x & PGT_type_mask) == PGT_writable_page )
2355 iommu_unmap_page(d, mfn_to_gmfn(d, page_to_mfn(page)));
2356 else if ( type == PGT_writable_page )
2357 iommu_map_page(d, mfn_to_gmfn(d, page_to_mfn(page)),
2358 page_to_mfn(page));
2362 if ( unlikely(!(nx & PGT_validated)) )
2364 if ( !(x & PGT_partial) )
2366 page->nr_validated_ptes = 0;
2367 page->partial_pte = 0;
2369 rc = alloc_page_type(page, type, preemptible);
2372 if ( (x & PGT_partial) && !(nx & PGT_partial) )
2373 put_page(page);
2375 return rc;
2378 void put_page_type(struct page_info *page)
2380 int rc = __put_page_type(page, 0);
2381 ASSERT(rc == 0);
2382 (void)rc;
2385 int get_page_type(struct page_info *page, unsigned long type)
2387 int rc = __get_page_type(page, type, 0);
2388 if ( likely(rc == 0) )
2389 return 1;
2390 ASSERT(rc == -EINVAL);
2391 return 0;
2394 int put_page_type_preemptible(struct page_info *page)
2396 return __put_page_type(page, 1);
2399 int get_page_type_preemptible(struct page_info *page, unsigned long type)
2401 return __get_page_type(page, type, 1);
2404 void cleanup_page_cacheattr(struct page_info *page)
2406 uint32_t cacheattr =
2407 (page->count_info & PGC_cacheattr_mask) >> PGC_cacheattr_base;
2409 if ( likely(cacheattr == 0) )
2410 return;
2412 page->count_info &= ~PGC_cacheattr_mask;
2414 BUG_ON(is_xen_heap_page(page));
2416 update_xen_mappings(page_to_mfn(page), 0);
2420 int new_guest_cr3(unsigned long mfn)
2422 struct vcpu *curr = current;
2423 struct domain *d = curr->domain;
2424 int okay;
2425 unsigned long old_base_mfn;
2427 #ifdef CONFIG_COMPAT
2428 if ( is_pv_32on64_domain(d) )
2430 okay = paging_mode_refcounts(d)
2431 ? 0 /* Old code was broken, but what should it be? */
2432 : mod_l4_entry(
2433 __va(pagetable_get_paddr(curr->arch.guest_table)),
2434 l4e_from_pfn(
2435 mfn,
2436 (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_ACCESSED)),
2437 pagetable_get_pfn(curr->arch.guest_table), 0, 0, curr) == 0;
2438 if ( unlikely(!okay) )
2440 MEM_LOG("Error while installing new compat baseptr %lx", mfn);
2441 return 0;
2444 invalidate_shadow_ldt(curr, 0);
2445 write_ptbase(curr);
2447 return 1;
2449 #endif
2450 okay = paging_mode_refcounts(d)
2451 ? get_page_from_pagenr(mfn, d)
2452 : !get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d, 0, 0);
2453 if ( unlikely(!okay) )
2455 MEM_LOG("Error while installing new baseptr %lx", mfn);
2456 return 0;
2459 invalidate_shadow_ldt(curr, 0);
2461 old_base_mfn = pagetable_get_pfn(curr->arch.guest_table);
2463 curr->arch.guest_table = pagetable_from_pfn(mfn);
2464 update_cr3(curr);
2466 write_ptbase(curr);
2468 if ( likely(old_base_mfn != 0) )
2470 if ( paging_mode_refcounts(d) )
2471 put_page(mfn_to_page(old_base_mfn));
2472 else
2473 put_page_and_type(mfn_to_page(old_base_mfn));
2476 return 1;
2479 static void process_deferred_ops(void)
2481 unsigned int deferred_ops;
2482 struct domain *d = current->domain;
2483 struct percpu_mm_info *info = &this_cpu(percpu_mm_info);
2485 deferred_ops = info->deferred_ops;
2486 info->deferred_ops = 0;
2488 if ( deferred_ops & (DOP_FLUSH_ALL_TLBS|DOP_FLUSH_TLB) )
2490 if ( deferred_ops & DOP_FLUSH_ALL_TLBS )
2491 flush_tlb_mask(&d->domain_dirty_cpumask);
2492 else
2493 flush_tlb_local();
2496 /*
2497 * Do this after flushing TLBs, to ensure we see fresh LDT mappings
2498 * via the linear pagetable mapping.
2499 */
2500 if ( deferred_ops & DOP_RELOAD_LDT )
2501 (void)map_ldt_shadow_page(0);
2503 if ( unlikely(info->foreign != NULL) )
2505 rcu_unlock_domain(info->foreign);
2506 info->foreign = NULL;
2510 static int set_foreigndom(domid_t domid)
2512 struct domain *e, *d = current->domain;
2513 struct percpu_mm_info *info = &this_cpu(percpu_mm_info);
2514 int okay = 1;
2516 ASSERT(info->foreign == NULL);
2518 if ( likely(domid == DOMID_SELF) )
2519 goto out;
2521 if ( unlikely(domid == d->domain_id) )
2523 MEM_LOG("Cannot specify itself as foreign domain");
2524 okay = 0;
2526 else if ( unlikely(paging_mode_translate(d)) )
2528 MEM_LOG("Cannot mix foreign mappings with translated domains");
2529 okay = 0;
2531 else switch ( domid )
2533 case DOMID_IO:
2534 info->foreign = rcu_lock_domain(dom_io);
2535 break;
2536 case DOMID_XEN:
2537 if (!IS_PRIV(d)) {
2538 MEM_LOG("Cannot set foreign dom");
2539 okay = 0;
2540 break;
2542 info->foreign = rcu_lock_domain(dom_xen);
2543 break;
2544 default:
2545 if ( (e = rcu_lock_domain_by_id(domid)) == NULL )
2547 MEM_LOG("Unknown domain '%u'", domid);
2548 okay = 0;
2549 break;
2551 if ( !IS_PRIV_FOR(d, e) )
2553 MEM_LOG("Cannot set foreign dom");
2554 okay = 0;
2555 rcu_unlock_domain(e);
2556 break;
2558 info->foreign = e;
2559 break;
2562 out:
2563 return okay;
2566 static inline int vcpumask_to_pcpumask(
2567 struct domain *d, XEN_GUEST_HANDLE(const_void) bmap, cpumask_t *pmask)
2569 unsigned int vcpu_id, vcpu_bias, offs;
2570 unsigned long vmask;
2571 struct vcpu *v;
2572 bool_t is_native = !is_pv_32on64_domain(d);
2574 cpus_clear(*pmask);
2575 for ( vmask = 0, offs = 0; ; ++offs)
2577 vcpu_bias = offs * (is_native ? BITS_PER_LONG : 32);
2578 if ( vcpu_bias >= MAX_VIRT_CPUS )
2579 return 0;
2581 if ( unlikely(is_native ?
2582 copy_from_guest_offset(&vmask, bmap, offs, 1) :
2583 copy_from_guest_offset((unsigned int *)&vmask, bmap,
2584 offs, 1)) )
2586 cpus_clear(*pmask);
2587 return -EFAULT;
2590 while ( vmask )
2592 vcpu_id = find_first_set_bit(vmask);
2593 vmask &= ~(1UL << vcpu_id);
2594 vcpu_id += vcpu_bias;
2595 if ( (vcpu_id >= MAX_VIRT_CPUS) )
2596 return 0;
2597 if ( ((v = d->vcpu[vcpu_id]) != NULL) )
2598 cpus_or(*pmask, *pmask, v->vcpu_dirty_cpumask);
2603 #ifdef __i386__
2604 static inline void *fixmap_domain_page(unsigned long mfn)
2606 unsigned int cpu = smp_processor_id();
2607 void *ptr = (void *)fix_to_virt(FIX_PAE_HIGHMEM_0 + cpu);
2609 l1e_write(fix_pae_highmem_pl1e - cpu,
2610 l1e_from_pfn(mfn, __PAGE_HYPERVISOR));
2611 flush_tlb_one_local(ptr);
2612 return ptr;
2614 static inline void fixunmap_domain_page(const void *ptr)
2616 unsigned int cpu = virt_to_fix((unsigned long)ptr) - FIX_PAE_HIGHMEM_0;
2618 l1e_write(fix_pae_highmem_pl1e - cpu, l1e_empty());
2619 this_cpu(make_cr3_timestamp) = this_cpu(tlbflush_time);
2621 #else
2622 #define fixmap_domain_page(mfn) mfn_to_virt(mfn)
2623 #define fixunmap_domain_page(ptr) ((void)(ptr))
2624 #endif
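/*
 * Note (descriptive): on 32-bit builds a domain page may lie above the
 * hypervisor's direct-mapped region, so the helpers above temporarily point
 * a per-CPU fixmap slot (FIX_PAE_HIGHMEM_0 + cpu) at the target mfn and
 * flush the local TLB entry. On 64-bit builds every mfn is covered by the
 * 1:1 map, so fixmap_domain_page() degenerates to mfn_to_virt() and the
 * unmap is a no-op.
 */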
2626 int do_mmuext_op(
2627 XEN_GUEST_HANDLE(mmuext_op_t) uops,
2628 unsigned int count,
2629 XEN_GUEST_HANDLE(uint) pdone,
2630 unsigned int foreigndom)
2632 struct mmuext_op op;
2633 int rc = 0, i = 0, okay;
2634 unsigned long mfn = 0, gmfn = 0, type;
2635 unsigned int done = 0;
2636 struct page_info *page;
2637 struct vcpu *curr = current;
2638 struct domain *d = curr->domain;
2640 if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
2642 count &= ~MMU_UPDATE_PREEMPTED;
2643 if ( unlikely(!guest_handle_is_null(pdone)) )
2644 (void)copy_from_guest(&done, pdone, 1);
2646 else
2647 perfc_incr(calls_to_mmuext_op);
2649 if ( unlikely(!guest_handle_okay(uops, count)) )
2651 rc = -EFAULT;
2652 goto out;
2655 if ( !set_foreigndom(foreigndom) )
2657 rc = -ESRCH;
2658 goto out;
2661 for ( i = 0; i < count; i++ )
2663 if ( hypercall_preempt_check() )
2665 rc = -EAGAIN;
2666 break;
2669 if ( unlikely(__copy_from_guest(&op, uops, 1) != 0) )
2671 MEM_LOG("Bad __copy_from_guest");
2672 rc = -EFAULT;
2673 break;
2676 okay = 1;
2677 gmfn = op.arg1.mfn;
2678 mfn = gmfn_to_mfn(FOREIGNDOM, gmfn);
2679 page = mfn_to_page(mfn);
2681 switch ( op.cmd )
2683 case MMUEXT_PIN_L1_TABLE:
2684 type = PGT_l1_page_table;
2685 goto pin_page;
2687 case MMUEXT_PIN_L2_TABLE:
2688 type = PGT_l2_page_table;
2689 goto pin_page;
2691 case MMUEXT_PIN_L3_TABLE:
2692 type = PGT_l3_page_table;
2693 goto pin_page;
2695 case MMUEXT_PIN_L4_TABLE:
2696 if ( is_pv_32bit_domain(FOREIGNDOM) )
2697 break;
2698 type = PGT_l4_page_table;
2700 pin_page:
2701 rc = xsm_memory_pin_page(d, page);
2702 if ( rc )
2703 break;
2705 /* Ignore pinning of invalid paging levels. */
2706 if ( (op.cmd - MMUEXT_PIN_L1_TABLE) > (CONFIG_PAGING_LEVELS - 1) )
2707 break;
2709 if ( paging_mode_refcounts(FOREIGNDOM) )
2710 break;
2712 rc = get_page_and_type_from_pagenr(mfn, type, FOREIGNDOM, 0, 1);
2713 okay = !rc;
2714 if ( unlikely(!okay) )
2716 if ( rc == -EINTR )
2717 rc = -EAGAIN;
2718 else if ( rc != -EAGAIN )
2719 MEM_LOG("Error while pinning mfn %lx", mfn);
2720 break;
2723 if ( unlikely(test_and_set_bit(_PGT_pinned,
2724 &page->u.inuse.type_info)) )
2726 MEM_LOG("Mfn %lx already pinned", mfn);
2727 put_page_and_type(page);
2728 okay = 0;
2729 break;
2732 /* A page is dirtied when its pin status is set. */
2733 paging_mark_dirty(d, mfn);
2735 /* We can race domain destruction (domain_relinquish_resources). */
2736 if ( unlikely(this_cpu(percpu_mm_info).foreign != NULL) )
2738 int drop_ref;
2739 spin_lock(&FOREIGNDOM->page_alloc_lock);
2740 drop_ref = (FOREIGNDOM->is_dying &&
2741 test_and_clear_bit(_PGT_pinned,
2742 &page->u.inuse.type_info));
2743 spin_unlock(&FOREIGNDOM->page_alloc_lock);
2744 if ( drop_ref )
2745 put_page_and_type(page);
2748 break;
2750 case MMUEXT_UNPIN_TABLE:
2751 if ( paging_mode_refcounts(d) )
2752 break;
2754 if ( unlikely(!(okay = get_page_from_pagenr(mfn, d))) )
2756 MEM_LOG("Mfn %lx bad domain (dom=%p)",
2757 mfn, page_get_owner(page));
2759 else if ( likely(test_and_clear_bit(_PGT_pinned,
2760 &page->u.inuse.type_info)) )
2762 put_page_and_type(page);
2763 put_page(page);
2764 if ( !rc )
2766 /* A page is dirtied when its pin status is cleared. */
2767 paging_mark_dirty(d, mfn);
2770 else
2772 okay = 0;
2773 put_page(page);
2774 MEM_LOG("Mfn %lx not pinned", mfn);
2776 break;
2778 case MMUEXT_NEW_BASEPTR:
2779 okay = new_guest_cr3(mfn);
2780 this_cpu(percpu_mm_info).deferred_ops &= ~DOP_FLUSH_TLB;
2781 break;
2783 #ifdef __x86_64__
2784 case MMUEXT_NEW_USER_BASEPTR: {
2785 unsigned long old_mfn;
2787 if ( mfn != 0 )
2789 if ( paging_mode_refcounts(d) )
2790 okay = get_page_from_pagenr(mfn, d);
2791 else
2792 okay = !get_page_and_type_from_pagenr(
2793 mfn, PGT_root_page_table, d, 0, 0);
2794 if ( unlikely(!okay) )
2796 MEM_LOG("Error while installing new mfn %lx", mfn);
2797 break;
2801 old_mfn = pagetable_get_pfn(curr->arch.guest_table_user);
2802 curr->arch.guest_table_user = pagetable_from_pfn(mfn);
2804 if ( old_mfn != 0 )
2806 if ( paging_mode_refcounts(d) )
2807 put_page(mfn_to_page(old_mfn));
2808 else
2809 put_page_and_type(mfn_to_page(old_mfn));
2812 break;
2814 #endif
2816 case MMUEXT_TLB_FLUSH_LOCAL:
2817 this_cpu(percpu_mm_info).deferred_ops |= DOP_FLUSH_TLB;
2818 break;
2820 case MMUEXT_INVLPG_LOCAL:
2821 if ( !paging_mode_enabled(d)
2822 || paging_invlpg(curr, op.arg1.linear_addr) != 0 )
2823 flush_tlb_one_local(op.arg1.linear_addr);
2824 break;
2826 case MMUEXT_TLB_FLUSH_MULTI:
2827 case MMUEXT_INVLPG_MULTI:
2829 cpumask_t pmask;
2831 if ( unlikely(vcpumask_to_pcpumask(d, op.arg2.vcpumask, &pmask)) )
2833 okay = 0;
2834 break;
2836 if ( op.cmd == MMUEXT_TLB_FLUSH_MULTI )
2837 flush_tlb_mask(&pmask);
2838 else
2839 flush_tlb_one_mask(&pmask, op.arg1.linear_addr);
2840 break;
2843 case MMUEXT_TLB_FLUSH_ALL:
2844 this_cpu(percpu_mm_info).deferred_ops |= DOP_FLUSH_ALL_TLBS;
2845 break;
2847 case MMUEXT_INVLPG_ALL:
2848 flush_tlb_one_mask(&d->domain_dirty_cpumask, op.arg1.linear_addr);
2849 break;
2851 case MMUEXT_FLUSH_CACHE:
2852 if ( unlikely(!cache_flush_permitted(d)) )
2854 MEM_LOG("Non-physdev domain tried to FLUSH_CACHE.");
2855 okay = 0;
2857 else
2859 wbinvd();
2861 break;
2863 case MMUEXT_SET_LDT:
2865 unsigned long ptr = op.arg1.linear_addr;
2866 unsigned long ents = op.arg2.nr_ents;
2868 if ( paging_mode_external(d) )
2870 MEM_LOG("ignoring SET_LDT hypercall from external domain");
2871 okay = 0;
2873 else if ( ((ptr & (PAGE_SIZE-1)) != 0) ||
2874 (ents > 8192) ||
2875 !array_access_ok(ptr, ents, LDT_ENTRY_SIZE) )
2877 okay = 0;
2878 MEM_LOG("Bad args to SET_LDT: ptr=%lx, ents=%lx", ptr, ents);
2880 else if ( (curr->arch.guest_context.ldt_ents != ents) ||
2881 (curr->arch.guest_context.ldt_base != ptr) )
2883 invalidate_shadow_ldt(curr, 0);
2884 this_cpu(percpu_mm_info).deferred_ops |= DOP_FLUSH_TLB;
2885 curr->arch.guest_context.ldt_base = ptr;
2886 curr->arch.guest_context.ldt_ents = ents;
2887 load_LDT(curr);
2888 this_cpu(percpu_mm_info).deferred_ops &= ~DOP_RELOAD_LDT;
2889 if ( ents != 0 )
2890 this_cpu(percpu_mm_info).deferred_ops |= DOP_RELOAD_LDT;
2892 break;
2895 case MMUEXT_CLEAR_PAGE:
2897 unsigned char *ptr;
2899 okay = !get_page_and_type_from_pagenr(mfn, PGT_writable_page,
2900 FOREIGNDOM, 0, 0);
2901 if ( unlikely(!okay) )
2903 MEM_LOG("Error while clearing mfn %lx", mfn);
2904 break;
2907 /* A page is dirtied when it's being cleared. */
2908 paging_mark_dirty(d, mfn);
2910 ptr = fixmap_domain_page(mfn);
2911 clear_page(ptr);
2912 fixunmap_domain_page(ptr);
2914 put_page_and_type(page);
2915 break;
2918 case MMUEXT_COPY_PAGE:
2920 const unsigned char *src;
2921 unsigned char *dst;
2922 unsigned long src_mfn;
2924 src_mfn = gmfn_to_mfn(FOREIGNDOM, op.arg2.src_mfn);
2925 okay = get_page_from_pagenr(src_mfn, FOREIGNDOM);
2926 if ( unlikely(!okay) )
2928 MEM_LOG("Error while copying from mfn %lx", src_mfn);
2929 break;
2932 okay = !get_page_and_type_from_pagenr(mfn, PGT_writable_page,
2933 FOREIGNDOM, 0, 0);
2934 if ( unlikely(!okay) )
2936 put_page(mfn_to_page(src_mfn));
2937 MEM_LOG("Error while copying to mfn %lx", mfn);
2938 break;
2941 /* A page is dirtied when it's being copied to. */
2942 paging_mark_dirty(d, mfn);
2944 src = map_domain_page(src_mfn);
2945 dst = fixmap_domain_page(mfn);
2946 copy_page(dst, src);
2947 fixunmap_domain_page(dst);
2948 unmap_domain_page(src);
2950 put_page_and_type(page);
2951 put_page(mfn_to_page(src_mfn));
2952 break;
2955 default:
2956 MEM_LOG("Invalid extended pt command 0x%x", op.cmd);
2957 rc = -ENOSYS;
2958 okay = 0;
2959 break;
2962 if ( unlikely(!okay) )
2964 rc = rc ? rc : -EINVAL;
2965 break;
2968 guest_handle_add_offset(uops, 1);
2971 if ( rc == -EAGAIN )
2972 rc = hypercall_create_continuation(
2973 __HYPERVISOR_mmuext_op, "hihi",
2974 uops, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
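/*
 * Illustrative example of the continuation encoding above: if the guest
 * asked for count = 100 operations and we were preempted after i = 40, the
 * continuation is re-issued with a count argument of
 * 60 | MMU_UPDATE_PREEMPTED. On re-entry the flag is stripped and the
 * running total is reloaded from pdone (see the top of this function), so
 * the guest-visible @done value accumulates across restarts.
 */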
2976 process_deferred_ops();
2978 perfc_add(num_mmuext_ops, i);
2980 out:
2981 /* Add incremental work we have done to the @done output parameter. */
2982 if ( unlikely(!guest_handle_is_null(pdone)) )
2984 done += i;
2985 copy_to_guest(pdone, &done, 1);
2988 return rc;
2991 int do_mmu_update(
2992 XEN_GUEST_HANDLE(mmu_update_t) ureqs,
2993 unsigned int count,
2994 XEN_GUEST_HANDLE(uint) pdone,
2995 unsigned int foreigndom)
2997 struct mmu_update req;
2998 void *va;
2999 unsigned long gpfn, gmfn, mfn;
3000 struct page_info *page;
3001 int rc = 0, okay = 1, i = 0;
3002 unsigned int cmd, done = 0;
3003 struct domain *d = current->domain;
3004 struct domain_mmap_cache mapcache;
3006 if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
3008 count &= ~MMU_UPDATE_PREEMPTED;
3009 if ( unlikely(!guest_handle_is_null(pdone)) )
3010 (void)copy_from_guest(&done, pdone, 1);
3012 else
3013 perfc_incr(calls_to_mmu_update);
3015 if ( unlikely(!guest_handle_okay(ureqs, count)) )
3017 rc = -EFAULT;
3018 goto out;
3021 if ( !set_foreigndom(foreigndom) )
3023 rc = -ESRCH;
3024 goto out;
3027 domain_mmap_cache_init(&mapcache);
3029 for ( i = 0; i < count; i++ )
3031 if ( hypercall_preempt_check() )
3033 rc = -EAGAIN;
3034 break;
3037 if ( unlikely(__copy_from_guest(&req, ureqs, 1) != 0) )
3039 MEM_LOG("Bad __copy_from_guest");
3040 rc = -EFAULT;
3041 break;
3044 cmd = req.ptr & (sizeof(l1_pgentry_t)-1);
3045 okay = 0;
3047 switch ( cmd )
3049 /*
3050 * MMU_NORMAL_PT_UPDATE: Normal update to any level of page table.
3051 * MMU_PT_UPDATE_PRESERVE_AD: As above but also preserve (OR)
3052 * current A/D bits.
3053 */
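/*
 * Illustrative request encoding (hypothetical values): the command is
 * carried in the low bits of req.ptr, so a request to update the PTE at
 * machine address pte_maddr while preserving A/D bits would set
 *   req.ptr = pte_maddr | MMU_PT_UPDATE_PRESERVE_AD;
 *   req.val = <new PTE contents>;
 * The decode below recovers cmd via req.ptr & (sizeof(l1_pgentry_t)-1) and
 * subtracts it back off before translating the frame address.
 */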
3054 case MMU_NORMAL_PT_UPDATE:
3055 case MMU_PT_UPDATE_PRESERVE_AD:
3056 rc = xsm_mmu_normal_update(d, FOREIGNDOM, req.val);
3057 if ( rc )
3058 break;
3060 req.ptr -= cmd;
3061 gmfn = req.ptr >> PAGE_SHIFT;
3062 mfn = gmfn_to_mfn(d, gmfn);
3064 if ( unlikely(!get_page_from_pagenr(mfn, d)) )
3066 MEM_LOG("Could not get page for normal update");
3067 break;
3070 va = map_domain_page_with_cache(mfn, &mapcache);
3071 va = (void *)((unsigned long)va +
3072 (unsigned long)(req.ptr & ~PAGE_MASK));
3073 page = mfn_to_page(mfn);
3075 if ( page_lock(page) )
3077 switch ( page->u.inuse.type_info & PGT_type_mask )
3079 case PGT_l1_page_table:
3081 l1_pgentry_t l1e = l1e_from_intpte(req.val);
3082 okay = mod_l1_entry(va, l1e, mfn,
3083 cmd == MMU_PT_UPDATE_PRESERVE_AD,
3084 current);
3086 break;
3087 case PGT_l2_page_table:
3089 l2_pgentry_t l2e = l2e_from_intpte(req.val);
3090 okay = mod_l2_entry(va, l2e, mfn,
3091 cmd == MMU_PT_UPDATE_PRESERVE_AD,
3092 current);
3094 break;
3095 case PGT_l3_page_table:
3097 l3_pgentry_t l3e = l3e_from_intpte(req.val);
3098 rc = mod_l3_entry(va, l3e, mfn,
3099 cmd == MMU_PT_UPDATE_PRESERVE_AD, 1,
3100 current);
3101 okay = !rc;
3103 break;
3104 #if CONFIG_PAGING_LEVELS >= 4
3105 case PGT_l4_page_table:
3107 l4_pgentry_t l4e = l4e_from_intpte(req.val);
3108 rc = mod_l4_entry(va, l4e, mfn,
3109 cmd == MMU_PT_UPDATE_PRESERVE_AD, 1,
3110 current);
3111 okay = !rc;
3113 break;
3114 #endif
3115 case PGT_writable_page:
3116 perfc_incr(writable_mmu_updates);
3117 okay = paging_write_guest_entry(
3118 current, va, req.val, _mfn(mfn));
3119 break;
3121 page_unlock(page);
3122 if ( rc == -EINTR )
3123 rc = -EAGAIN;
3125 else if ( get_page_type(page, PGT_writable_page) )
3127 perfc_incr(writable_mmu_updates);
3128 okay = paging_write_guest_entry(
3129 current, va, req.val, _mfn(mfn));
3130 put_page_type(page);
3133 unmap_domain_page_with_cache(va, &mapcache);
3134 put_page(page);
3135 break;
3137 case MMU_MACHPHYS_UPDATE:
3139 mfn = req.ptr >> PAGE_SHIFT;
3140 gpfn = req.val;
3142 rc = xsm_mmu_machphys_update(d, mfn);
3143 if ( rc )
3144 break;
3146 if ( unlikely(!get_page_from_pagenr(mfn, FOREIGNDOM)) )
3148 MEM_LOG("Could not get page for mach->phys update");
3149 break;
3152 if ( unlikely(paging_mode_translate(FOREIGNDOM)) )
3154 MEM_LOG("Mach-phys update on auto-translate guest");
3155 break;
3158 set_gpfn_from_mfn(mfn, gpfn);
3159 okay = 1;
3161 paging_mark_dirty(FOREIGNDOM, mfn);
3163 put_page(mfn_to_page(mfn));
3164 break;
3166 default:
3167 MEM_LOG("Invalid page update command %x", cmd);
3168 rc = -ENOSYS;
3169 okay = 0;
3170 break;
3173 if ( unlikely(!okay) )
3175 rc = rc ? rc : -EINVAL;
3176 break;
3179 guest_handle_add_offset(ureqs, 1);
3182 if ( rc == -EAGAIN )
3183 rc = hypercall_create_continuation(
3184 __HYPERVISOR_mmu_update, "hihi",
3185 ureqs, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
3187 process_deferred_ops();
3189 domain_mmap_cache_destroy(&mapcache);
3191 perfc_add(num_page_updates, i);
3193 out:
3194 /* Add incremental work we have done to the @done output parameter. */
3195 if ( unlikely(!guest_handle_is_null(pdone)) )
3197 done += i;
3198 copy_to_guest(pdone, &done, 1);
3201 return rc;
3205 static int create_grant_pte_mapping(
3206 uint64_t pte_addr, l1_pgentry_t nl1e, struct vcpu *v)
3208 int rc = GNTST_okay;
3209 void *va;
3210 unsigned long gmfn, mfn;
3211 struct page_info *page;
3212 l1_pgentry_t ol1e;
3213 struct domain *d = v->domain;
3215 ASSERT(domain_is_locked(d));
3217 adjust_guest_l1e(nl1e, d);
3219 gmfn = pte_addr >> PAGE_SHIFT;
3220 mfn = gmfn_to_mfn(d, gmfn);
3222 if ( unlikely(!get_page_from_pagenr(mfn, current->domain)) )
3224 MEM_LOG("Could not get page for normal update");
3225 return GNTST_general_error;
3228 va = map_domain_page(mfn);
3229 va = (void *)((unsigned long)va + ((unsigned long)pte_addr & ~PAGE_MASK));
3230 page = mfn_to_page(mfn);
3232 if ( !page_lock(page) )
3234 rc = GNTST_general_error;
3235 goto failed;
3238 if ( (page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table )
3240 page_unlock(page);
3241 rc = GNTST_general_error;
3242 goto failed;
3245 ol1e = *(l1_pgentry_t *)va;
3246 if ( !UPDATE_ENTRY(l1, (l1_pgentry_t *)va, ol1e, nl1e, mfn, v, 0) )
3248 page_unlock(page);
3249 rc = GNTST_general_error;
3250 goto failed;
3253 page_unlock(page);
3255 if ( !paging_mode_refcounts(d) )
3256 put_page_from_l1e(ol1e, d);
3258 failed:
3259 unmap_domain_page(va);
3260 put_page(page);
3262 return rc;
3265 static int destroy_grant_pte_mapping(
3266 uint64_t addr, unsigned long frame, struct domain *d)
3268 int rc = GNTST_okay;
3269 void *va;
3270 unsigned long gmfn, mfn;
3271 struct page_info *page;
3272 l1_pgentry_t ol1e;
3274 gmfn = addr >> PAGE_SHIFT;
3275 mfn = gmfn_to_mfn(d, gmfn);
3277 if ( unlikely(!get_page_from_pagenr(mfn, current->domain)) )
3279 MEM_LOG("Could not get page for normal update");
3280 return GNTST_general_error;
3283 va = map_domain_page(mfn);
3284 va = (void *)((unsigned long)va + ((unsigned long)addr & ~PAGE_MASK));
3285 page = mfn_to_page(mfn);
3287 if ( !page_lock(page) )
3289 rc = GNTST_general_error;
3290 goto failed;
3293 if ( (page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table )
3295 page_unlock(page);
3296 rc = GNTST_general_error;
3297 goto failed;
3300 ol1e = *(l1_pgentry_t *)va;
3302 /* Check that the virtual address supplied is actually mapped to frame. */
3303 if ( unlikely(l1e_get_pfn(ol1e) != frame) )
3305 page_unlock(page);
3306 MEM_LOG("PTE entry %lx for address %"PRIx64" doesn't match frame %lx",
3307 (unsigned long)l1e_get_intpte(ol1e), addr, frame);
3308 rc = GNTST_general_error;
3309 goto failed;
3312 /* Delete pagetable entry. */
3313 if ( unlikely(!UPDATE_ENTRY
3314 (l1,
3315 (l1_pgentry_t *)va, ol1e, l1e_empty(), mfn,
3316 d->vcpu[0] /* Change if we go to per-vcpu shadows. */,
3317 0)) )
3319 page_unlock(page);
3320 MEM_LOG("Cannot delete PTE entry at %p", va);
3321 rc = GNTST_general_error;
3322 goto failed;
3325 page_unlock(page);
3327 failed:
3328 unmap_domain_page(va);
3329 put_page(page);
3330 return rc;
3334 static int create_grant_va_mapping(
3335 unsigned long va, l1_pgentry_t nl1e, struct vcpu *v)
3337 l1_pgentry_t *pl1e, ol1e;
3338 struct domain *d = v->domain;
3339 unsigned long gl1mfn;
3340 struct page_info *l1pg;
3341 int okay;
3343 ASSERT(domain_is_locked(d));
3345 adjust_guest_l1e(nl1e, d);
3347 pl1e = guest_map_l1e(v, va, &gl1mfn);
3348 if ( !pl1e )
3350 MEM_LOG("Could not find L1 PTE for address %lx", va);
3351 return GNTST_general_error;
3354 if ( !get_page_from_pagenr(gl1mfn, current->domain) )
3356 guest_unmap_l1e(v, pl1e);
3357 return GNTST_general_error;
3360 l1pg = mfn_to_page(gl1mfn);
3361 if ( !page_lock(l1pg) )
3363 put_page(l1pg);
3364 guest_unmap_l1e(v, pl1e);
3365 return GNTST_general_error;
3368 if ( (l1pg->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table )
3370 page_unlock(l1pg);
3371 put_page(l1pg);
3372 guest_unmap_l1e(v, pl1e);
3373 return GNTST_general_error;
3376 ol1e = *pl1e;
3377 okay = UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, v, 0);
3379 page_unlock(l1pg);
3380 put_page(l1pg);
3381 guest_unmap_l1e(v, pl1e);
3383 if ( okay && !paging_mode_refcounts(d) )
3384 put_page_from_l1e(ol1e, d);
3386 return okay ? GNTST_okay : GNTST_general_error;
3389 static int replace_grant_va_mapping(
3390 unsigned long addr, unsigned long frame, l1_pgentry_t nl1e, struct vcpu *v)
3392 l1_pgentry_t *pl1e, ol1e;
3393 unsigned long gl1mfn;
3394 struct page_info *l1pg;
3395 int rc = 0;
3397 pl1e = guest_map_l1e(v, addr, &gl1mfn);
3398 if ( !pl1e )
3400 MEM_LOG("Could not find L1 PTE for address %lx", addr);
3401 return GNTST_general_error;
3404 if ( !get_page_from_pagenr(gl1mfn, current->domain) )
3406 rc = GNTST_general_error;
3407 goto out;
3410 l1pg = mfn_to_page(gl1mfn);
3411 if ( !page_lock(l1pg) )
3413 rc = GNTST_general_error;
3414 put_page(l1pg);
3415 goto out;
3418 if ( (l1pg->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table )
3420 rc = GNTST_general_error;
3421 goto unlock_and_out;
3424 ol1e = *pl1e;
3426 /* Check that the virtual address supplied is actually mapped to frame. */
3427 if ( unlikely(l1e_get_pfn(ol1e) != frame) )
3429 MEM_LOG("PTE entry %lx for address %lx doesn't match frame %lx",
3430 l1e_get_pfn(ol1e), addr, frame);
3431 rc = GNTST_general_error;
3432 goto unlock_and_out;
3435 /* Delete pagetable entry. */
3436 if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, v, 0)) )
3438 MEM_LOG("Cannot delete PTE entry at %p", (unsigned long *)pl1e);
3439 rc = GNTST_general_error;
3440 goto unlock_and_out;
3443 unlock_and_out:
3444 page_unlock(l1pg);
3445 put_page(l1pg);
3446 out:
3447 guest_unmap_l1e(v, pl1e);
3448 return rc;
3451 static int destroy_grant_va_mapping(
3452 unsigned long addr, unsigned long frame, struct vcpu *v)
3454 return replace_grant_va_mapping(addr, frame, l1e_empty(), v);
3457 int create_grant_host_mapping(uint64_t addr, unsigned long frame,
3458 unsigned int flags, unsigned int cache_flags)
3460 l1_pgentry_t pte = l1e_from_pfn(frame, GRANT_PTE_FLAGS);
3462 if ( (flags & GNTMAP_application_map) )
3463 l1e_add_flags(pte,_PAGE_USER);
3464 if ( !(flags & GNTMAP_readonly) )
3465 l1e_add_flags(pte,_PAGE_RW);
3467 l1e_add_flags(pte,
3468 ((flags >> _GNTMAP_guest_avail0) * _PAGE_AVAIL0)
3469 & _PAGE_AVAIL);
3471 l1e_add_flags(pte, cacheattr_to_pte_flags(cache_flags >> 5));
3473 if ( flags & GNTMAP_contains_pte )
3474 return create_grant_pte_mapping(addr, pte, current);
3475 return create_grant_va_mapping(addr, pte, current);
3478 int replace_grant_host_mapping(
3479 uint64_t addr, unsigned long frame, uint64_t new_addr, unsigned int flags)
3481 struct vcpu *curr = current;
3482 l1_pgentry_t *pl1e, ol1e;
3483 unsigned long gl1mfn;
3484 struct page_info *l1pg;
3485 int rc;
3487 if ( flags & GNTMAP_contains_pte )
3489 if ( !new_addr )
3490 return destroy_grant_pte_mapping(addr, frame, curr->domain);
3492 MEM_LOG("Unsupported grant table operation");
3493 return GNTST_general_error;
3496 if ( !new_addr )
3497 return destroy_grant_va_mapping(addr, frame, curr);
3499 pl1e = guest_map_l1e(curr, new_addr, &gl1mfn);
3500 if ( !pl1e )
3502 MEM_LOG("Could not find L1 PTE for address %lx",
3503 (unsigned long)new_addr);
3504 return GNTST_general_error;
3507 if ( !get_page_from_pagenr(gl1mfn, current->domain) )
3509 guest_unmap_l1e(curr, pl1e);
3510 return GNTST_general_error;
3513 l1pg = mfn_to_page(gl1mfn);
3514 if ( !page_lock(l1pg) )
3516 put_page(l1pg);
3517 guest_unmap_l1e(curr, pl1e);
3518 return GNTST_general_error;
3521 if ( (l1pg->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table )
3523 page_unlock(l1pg);
3524 put_page(l1pg);
3525 guest_unmap_l1e(curr, pl1e);
3526 return GNTST_general_error;
3529 ol1e = *pl1e;
3531 if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, l1e_empty(),
3532 gl1mfn, curr, 0)) )
3534 page_unlock(l1pg);
3535 put_page(l1pg);
3536 MEM_LOG("Cannot delete PTE entry at %p", (unsigned long *)pl1e);
3537 guest_unmap_l1e(curr, pl1e);
3538 return GNTST_general_error;
3541 page_unlock(l1pg);
3542 put_page(l1pg);
3543 guest_unmap_l1e(curr, pl1e);
3545 rc = replace_grant_va_mapping(addr, frame, ol1e, curr);
3546 if ( rc && !paging_mode_refcounts(curr->domain) )
3547 put_page_from_l1e(ol1e, curr->domain);
3549 return rc;
3552 int donate_page(
3553 struct domain *d, struct page_info *page, unsigned int memflags)
3555 spin_lock(&d->page_alloc_lock);
3557 if ( is_xen_heap_page(page) || (page_get_owner(page) != NULL) )
3558 goto fail;
3560 if ( d->is_dying )
3561 goto fail;
3563 if ( page->count_info & ~(PGC_allocated | 1) )
3564 goto fail;
3566 if ( !(memflags & MEMF_no_refcount) )
3568 if ( d->tot_pages >= d->max_pages )
3569 goto fail;
3570 d->tot_pages++;
3573 page->count_info = PGC_allocated | 1;
3574 page_set_owner(page, d);
3575 page_list_add_tail(page,&d->page_list);
3577 spin_unlock(&d->page_alloc_lock);
3578 return 0;
3580 fail:
3581 spin_unlock(&d->page_alloc_lock);
3582 MEM_LOG("Bad donate %p: ed=%p(%u), sd=%p, caf=%08lx, taf=%" PRtype_info,
3583 (void *)page_to_mfn(page), d, d->domain_id,
3584 page_get_owner(page), page->count_info, page->u.inuse.type_info);
3585 return -1;
3588 int steal_page(
3589 struct domain *d, struct page_info *page, unsigned int memflags)
3591 unsigned long x, y;
3593 spin_lock(&d->page_alloc_lock);
3595 if ( is_xen_heap_page(page) || (page_get_owner(page) != d) )
3596 goto fail;
3598 /*
3599 * We require there is just one reference (PGC_allocated). We temporarily
3600 * drop this reference now so that we can safely swizzle the owner.
3601 */
3602 y = page->count_info;
3603 do {
3604 x = y;
3605 if ( (x & (PGC_count_mask|PGC_allocated)) != (1 | PGC_allocated) )
3606 goto fail;
3607 y = cmpxchg(&page->count_info, x, x & ~PGC_count_mask);
3608 } while ( y != x );
3610 /* Swizzle the owner then reinstate the PGC_allocated reference. */
3611 page_set_owner(page, NULL);
3612 y = page->count_info;
3613 do {
3614 x = y;
3615 BUG_ON((x & (PGC_count_mask|PGC_allocated)) != PGC_allocated);
3616 } while ( (y = cmpxchg(&page->count_info, x, x | 1)) != x );
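/*
 * Illustrative count_info sequence for a successful steal (derived from the
 * two loops above): (PGC_allocated | 1) -> PGC_allocated (reference
 * temporarily dropped) -> owner cleared -> (PGC_allocated | 1) again, now
 * with no owner, ready to be assigned to the new domain.
 */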
3618 /* Unlink from original owner. */
3619 if ( !(memflags & MEMF_no_refcount) )
3620 d->tot_pages--;
3621 page_list_del(page, &d->page_list);
3623 spin_unlock(&d->page_alloc_lock);
3624 return 0;
3626 fail:
3627 spin_unlock(&d->page_alloc_lock);
3628 MEM_LOG("Bad page %p: ed=%p(%u), sd=%p, caf=%08lx, taf=%" PRtype_info,
3629 (void *)page_to_mfn(page), d, d->domain_id,
3630 page_get_owner(page), page->count_info, page->u.inuse.type_info);
3631 return -1;
3634 int do_update_va_mapping(unsigned long va, u64 val64,
3635 unsigned long flags)
3637 l1_pgentry_t val = l1e_from_intpte(val64);
3638 struct vcpu *v = current;
3639 struct domain *d = v->domain;
3640 struct page_info *gl1pg;
3641 l1_pgentry_t *pl1e;
3642 unsigned long bmap_ptr, gl1mfn;
3643 cpumask_t pmask;
3644 int rc;
3646 perfc_incr(calls_to_update_va);
3648 rc = xsm_update_va_mapping(d, FOREIGNDOM, val);
3649 if ( rc )
3650 return rc;
3652 rc = -EINVAL;
3653 pl1e = guest_map_l1e(v, va, &gl1mfn);
3654 if ( unlikely(!pl1e || !get_page_from_pagenr(gl1mfn, d)) )
3655 goto out;
3657 gl1pg = mfn_to_page(gl1mfn);
3658 if ( !page_lock(gl1pg) )
3660 put_page(gl1pg);
3661 goto out;
3664 if ( (gl1pg->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table )
3666 page_unlock(gl1pg);
3667 put_page(gl1pg);
3668 goto out;
3671 rc = mod_l1_entry(pl1e, val, gl1mfn, 0, v) ? 0 : -EINVAL;
3673 page_unlock(gl1pg);
3674 put_page(gl1pg);
3676 out:
3677 if ( pl1e )
3678 guest_unmap_l1e(v, pl1e);
3680 switch ( flags & UVMF_FLUSHTYPE_MASK )
3682 case UVMF_TLB_FLUSH:
3683 switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
3685 case UVMF_LOCAL:
3686 this_cpu(percpu_mm_info).deferred_ops |= DOP_FLUSH_TLB;
3687 break;
3688 case UVMF_ALL:
3689 this_cpu(percpu_mm_info).deferred_ops |= DOP_FLUSH_ALL_TLBS;
3690 break;
3691 default:
3692 if ( this_cpu(percpu_mm_info).deferred_ops & DOP_FLUSH_ALL_TLBS )
3693 break;
3694 rc = vcpumask_to_pcpumask(d, const_guest_handle_from_ptr(bmap_ptr,
3695 void),
3696 &pmask);
3697 if ( cpu_isset(smp_processor_id(), pmask) )
3698 this_cpu(percpu_mm_info).deferred_ops &= ~DOP_FLUSH_TLB;
3699 flush_tlb_mask(&pmask);
3700 break;
3702 break;
3704 case UVMF_INVLPG:
3705 if ( this_cpu(percpu_mm_info).deferred_ops & DOP_FLUSH_ALL_TLBS )
3706 break;
3707 switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
3709 case UVMF_LOCAL:
3710 if ( this_cpu(percpu_mm_info).deferred_ops & DOP_FLUSH_TLB )
3711 break;
3712 if ( !paging_mode_enabled(d) ||
3713 (paging_invlpg(v, va) != 0) )
3714 flush_tlb_one_local(va);
3715 break;
3716 case UVMF_ALL:
3717 flush_tlb_one_mask(&d->domain_dirty_cpumask, va);
3718 break;
3719 default:
3720 rc = vcpumask_to_pcpumask(d, const_guest_handle_from_ptr(bmap_ptr,
3721 void),
3722 &pmask);
3723 if ( this_cpu(percpu_mm_info).deferred_ops & DOP_FLUSH_TLB )
3724 cpu_clear(smp_processor_id(), pmask);
3725 flush_tlb_one_mask(&pmask, va);
3726 break;
3728 break;
3731 process_deferred_ops();
3733 return rc;
3736 int do_update_va_mapping_otherdomain(unsigned long va, u64 val64,
3737 unsigned long flags,
3738 domid_t domid)
3740 int rc;
3742 if ( !set_foreigndom(domid) )
3743 return -ESRCH;
3745 rc = do_update_va_mapping(va, val64, flags);
3747 BUG_ON(this_cpu(percpu_mm_info).deferred_ops);
3748 process_deferred_ops(); /* only to clear foreigndom */
3750 return rc;
3755 /*************************
3756 * Descriptor Tables
3757 */
3759 void destroy_gdt(struct vcpu *v)
3761 int i;
3762 unsigned long pfn;
3764 v->arch.guest_context.gdt_ents = 0;
3765 for ( i = 0; i < FIRST_RESERVED_GDT_PAGE; i++ )
3767 if ( (pfn = l1e_get_pfn(v->arch.perdomain_ptes[i])) != 0 )
3768 put_page_and_type(mfn_to_page(pfn));
3769 l1e_write(&v->arch.perdomain_ptes[i], l1e_empty());
3770 v->arch.guest_context.gdt_frames[i] = 0;
3775 long set_gdt(struct vcpu *v,
3776 unsigned long *frames,
3777 unsigned int entries)
3779 struct domain *d = v->domain;
3780 /* NB. There are 512 8-byte entries per GDT page. */
3781 int i, nr_pages = (entries + 511) / 512;
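/* e.g. (illustrative) entries = 513 -> nr_pages = (513 + 511) / 512 = 2. */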
3782 unsigned long mfn;
3784 if ( entries > FIRST_RESERVED_GDT_ENTRY )
3785 return -EINVAL;
3787 /* Check the pages in the new GDT. */
3788 for ( i = 0; i < nr_pages; i++ )
3790 mfn = frames[i] = gmfn_to_mfn(d, frames[i]);
3791 if ( !mfn_valid(mfn) ||
3792 !get_page_and_type(mfn_to_page(mfn), d, PGT_seg_desc_page) )
3793 goto fail;
3796 /* Tear down the old GDT. */
3797 destroy_gdt(v);
3799 /* Install the new GDT. */
3800 v->arch.guest_context.gdt_ents = entries;
3801 for ( i = 0; i < nr_pages; i++ )
3803 v->arch.guest_context.gdt_frames[i] = frames[i];
3804 l1e_write(&v->arch.perdomain_ptes[i],
3805 l1e_from_pfn(frames[i], __PAGE_HYPERVISOR));
3808 return 0;
3810 fail:
3811 while ( i-- > 0 )
3812 put_page_and_type(mfn_to_page(frames[i]));
3813 return -EINVAL;
3817 long do_set_gdt(XEN_GUEST_HANDLE(ulong) frame_list, unsigned int entries)
3819 int nr_pages = (entries + 511) / 512;
3820 unsigned long frames[16];
3821 struct vcpu *curr = current;
3822 long ret;
3824 /* Rechecked in set_gdt, but ensures a sane limit for copy_from_user(). */
3825 if ( entries > FIRST_RESERVED_GDT_ENTRY )
3826 return -EINVAL;
3828 if ( copy_from_guest(frames, frame_list, nr_pages) )
3829 return -EFAULT;
3831 domain_lock(curr->domain);
3833 if ( (ret = set_gdt(curr, frames, entries)) == 0 )
3834 flush_tlb_local();
3836 domain_unlock(curr->domain);
3838 return ret;
3842 long do_update_descriptor(u64 pa, u64 desc)
3844 struct domain *dom = current->domain;
3845 unsigned long gmfn = pa >> PAGE_SHIFT;
3846 unsigned long mfn;
3847 unsigned int offset;
3848 struct desc_struct *gdt_pent, d;
3849 struct page_info *page;
3850 long ret = -EINVAL;
3852 offset = ((unsigned int)pa & ~PAGE_MASK) / sizeof(struct desc_struct);
3854 *(u64 *)&d = desc;
3856 mfn = gmfn_to_mfn(dom, gmfn);
3857 if ( (((unsigned int)pa % sizeof(struct desc_struct)) != 0) ||
3858 !mfn_valid(mfn) ||
3859 !check_descriptor(dom, &d) )
3860 return -EINVAL;
3862 page = mfn_to_page(mfn);
3863 if ( unlikely(!get_page(page, dom)) )
3864 return -EINVAL;
3866 /* Check if the given frame is in use in an unsafe context. */
3867 switch ( page->u.inuse.type_info & PGT_type_mask )
3869 case PGT_seg_desc_page:
3870 if ( unlikely(!get_page_type(page, PGT_seg_desc_page)) )
3871 goto out;
3872 break;
3873 default:
3874 if ( unlikely(!get_page_type(page, PGT_writable_page)) )
3875 goto out;
3876 break;
3879 paging_mark_dirty(dom, mfn);
3881 /* All is good so make the update. */
3882 gdt_pent = map_domain_page(mfn);
3883 atomic_write64((uint64_t *)&gdt_pent[offset], *(uint64_t *)&d);
3884 unmap_domain_page(gdt_pent);
3886 put_page_type(page);
3888 ret = 0; /* success */
3890 out:
3891 put_page(page);
3893 return ret;
3896 typedef struct e820entry e820entry_t;
3897 DEFINE_XEN_GUEST_HANDLE(e820entry_t);
3899 long arch_memory_op(int op, XEN_GUEST_HANDLE(void) arg)
3901 struct page_info *page = NULL;
3902 int rc;
3904 switch ( op )
3906 case XENMEM_add_to_physmap:
3908 struct xen_add_to_physmap xatp;
3909 unsigned long prev_mfn, mfn = 0, gpfn;
3910 struct domain *d;
3912 if ( copy_from_guest(&xatp, arg, 1) )
3913 return -EFAULT;
3915 rc = rcu_lock_target_domain_by_id(xatp.domid, &d);
3916 if ( rc != 0 )
3917 return rc;
3919 if ( xsm_add_to_physmap(current->domain, d) )
3921 rcu_unlock_domain(d);
3922 return -EPERM;
3925 switch ( xatp.space )
3927 case XENMAPSPACE_shared_info:
3928 if ( xatp.idx == 0 )
3929 mfn = virt_to_mfn(d->shared_info);
3930 break;
3931 case XENMAPSPACE_grant_table:
3932 spin_lock(&d->grant_table->lock);
3934 if ( (xatp.idx >= nr_grant_frames(d->grant_table)) &&
3935 (xatp.idx < max_nr_grant_frames) )
3936 gnttab_grow_table(d, xatp.idx + 1);
3938 if ( xatp.idx < nr_grant_frames(d->grant_table) )
3939 mfn = virt_to_mfn(d->grant_table->shared[xatp.idx]);
3941 spin_unlock(&d->grant_table->lock);
3942 break;
3943 case XENMAPSPACE_gmfn:
3944 xatp.idx = gmfn_to_mfn(d, xatp.idx);
3945 if ( !get_page_from_pagenr(xatp.idx, d) )
3946 break;
3947 mfn = xatp.idx;
3948 page = mfn_to_page(mfn);
3949 break;
3950 default:
3951 break;
3954 if ( !paging_mode_translate(d) || (mfn == 0) )
3956 if ( page )
3957 put_page(page);
3958 rcu_unlock_domain(d);
3959 return -EINVAL;
3962 domain_lock(d);
3964 /* Remove previously mapped page if it was present. */
3965 prev_mfn = gmfn_to_mfn(d, xatp.gpfn);
3966 if ( mfn_valid(prev_mfn) )
3968 if ( is_xen_heap_mfn(prev_mfn) )
3969 /* Xen heap frames are simply unhooked from this phys slot. */
3970 guest_physmap_remove_page(d, xatp.gpfn, prev_mfn, 0);
3971 else
3972 /* Normal domain memory is freed, to avoid leaking memory. */
3973 guest_remove_page(d, xatp.gpfn);
3976 /* Unmap from old location, if any. */
3977 gpfn = get_gpfn_from_mfn(mfn);
3978 if ( gpfn != INVALID_M2P_ENTRY )
3979 guest_physmap_remove_page(d, gpfn, mfn, 0);
3981 /* Map at new location. */
3982 guest_physmap_add_page(d, xatp.gpfn, mfn, 0);
3984 domain_unlock(d);
3986 if ( page )
3987 put_page(page);
3989 rcu_unlock_domain(d);
3991 break;
3994 case XENMEM_set_memory_map:
3996 struct xen_foreign_memory_map fmap;
3997 struct domain *d;
3998 int rc;
4000 if ( copy_from_guest(&fmap, arg, 1) )
4001 return -EFAULT;
4003 if ( fmap.map.nr_entries > ARRAY_SIZE(d->arch.e820) )
4004 return -EINVAL;
4006 rc = rcu_lock_target_domain_by_id(fmap.domid, &d);
4007 if ( rc != 0 )
4008 return rc;
4010 rc = xsm_domain_memory_map(d);
4011 if ( rc )
4013 rcu_unlock_domain(d);
4014 return rc;
4017 rc = copy_from_guest(d->arch.e820, fmap.map.buffer,
4018 fmap.map.nr_entries) ? -EFAULT : 0;
4019 d->arch.nr_e820 = fmap.map.nr_entries;
4021 rcu_unlock_domain(d);
4022 return rc;
4025 case XENMEM_memory_map:
4027 struct xen_memory_map map;
4028 struct domain *d = current->domain;
4030 /* Backwards compatibility. */
4031 if ( d->arch.nr_e820 == 0 )
4032 return -ENOSYS;
4034 if ( copy_from_guest(&map, arg, 1) )
4035 return -EFAULT;
4037 map.nr_entries = min(map.nr_entries, d->arch.nr_e820);
4038 if ( copy_to_guest(map.buffer, d->arch.e820, map.nr_entries) ||
4039 copy_to_guest(arg, &map, 1) )
4040 return -EFAULT;
4042 return 0;
4045 case XENMEM_machine_memory_map:
4047 struct xen_memory_map memmap;
4048 XEN_GUEST_HANDLE(e820entry_t) buffer;
4049 int count;
4050 int rc;
4052 if ( !IS_PRIV(current->domain) )
4053 return -EINVAL;
4055 rc = xsm_machine_memory_map();
4056 if ( rc )
4057 return rc;
4059 if ( copy_from_guest(&memmap, arg, 1) )
4060 return -EFAULT;
4061 if ( memmap.nr_entries < e820.nr_map + 1 )
4062 return -EINVAL;
4064 buffer = guest_handle_cast(memmap.buffer, e820entry_t);
4066 count = min((unsigned int)e820.nr_map, memmap.nr_entries);
4067 if ( copy_to_guest(buffer, e820.map, count) < 0 )
4068 return -EFAULT;
4070 memmap.nr_entries = count;
4072 if ( copy_to_guest(arg, &memmap, 1) )
4073 return -EFAULT;
4075 return 0;
4078 case XENMEM_machphys_mapping:
4080 static const struct xen_machphys_mapping mapping = {
4081 .v_start = MACH2PHYS_VIRT_START,
4082 .v_end = MACH2PHYS_VIRT_END,
4083 .max_mfn = MACH2PHYS_NR_ENTRIES - 1
4084 };
4086 if ( copy_to_guest(arg, &mapping, 1) )
4087 return -EFAULT;
4089 return 0;
4092 case XENMEM_set_pod_target:
4093 case XENMEM_get_pod_target:
4095 xen_pod_target_t target;
4096 struct domain *d;
4098 /* Support DOMID_SELF? */
4099 if ( !IS_PRIV(current->domain) )
4100 return -EINVAL;
4102 if ( copy_from_guest(&target, arg, 1) )
4103 return -EFAULT;
4105 rc = rcu_lock_target_domain_by_id(target.domid, &d);
4106 if ( rc != 0 )
4107 return rc;
4109 if ( op == XENMEM_set_pod_target )
4111 if ( target.target_pages > d->max_pages )
4113 rc = -EINVAL;
4114 goto pod_target_out_unlock;
4117 rc = p2m_pod_set_mem_target(d, target.target_pages);
4120 target.tot_pages = d->tot_pages;
4121 target.pod_cache_pages = d->arch.p2m->pod.count;
4122 target.pod_entries = d->arch.p2m->pod.entry_count;
4124 if ( copy_to_guest(arg, &target, 1) )
4126 rc = -EFAULT;
4127 goto pod_target_out_unlock;
4130 pod_target_out_unlock:
4131 rcu_unlock_domain(d);
4132 return rc;
4135 default:
4136 return subarch_memory_op(op, arg);
4139 return 0;
4143 /*************************
4144 * Writable Pagetables
4145 */
4147 struct ptwr_emulate_ctxt {
4148 struct x86_emulate_ctxt ctxt;
4149 unsigned long cr2;
4150 l1_pgentry_t pte;
4151 };
4153 static int ptwr_emulated_read(
4154 enum x86_segment seg,
4155 unsigned long offset,
4156 void *p_data,
4157 unsigned int bytes,
4158 struct x86_emulate_ctxt *ctxt)
4160 unsigned int rc;
4161 unsigned long addr = offset;
4163 if ( (rc = copy_from_user(p_data, (void *)addr, bytes)) != 0 )
4165 propagate_page_fault(addr + bytes - rc, 0); /* read fault */
4166 return X86EMUL_EXCEPTION;
4169 return X86EMUL_OKAY;
4172 static int ptwr_emulated_update(
4173 unsigned long addr,
4174 paddr_t old,
4175 paddr_t val,
4176 unsigned int bytes,
4177 unsigned int do_cmpxchg,
4178 struct ptwr_emulate_ctxt *ptwr_ctxt)
4180 unsigned long mfn;
4181 unsigned long unaligned_addr = addr;
4182 struct page_info *page;
4183 l1_pgentry_t pte, ol1e, nl1e, *pl1e;
4184 struct vcpu *v = current;
4185 struct domain *d = v->domain;
4187 /* Only allow naturally-aligned stores within the original %cr2 page. */
4188 if ( unlikely(((addr^ptwr_ctxt->cr2) & PAGE_MASK) || (addr & (bytes-1))) )
4190 MEM_LOG("ptwr_emulate: bad access (cr2=%lx, addr=%lx, bytes=%u)",
4191 ptwr_ctxt->cr2, addr, bytes);
4192 return X86EMUL_UNHANDLEABLE;
4195 /* Turn a sub-word access into a full-word access. */
4196 if ( bytes != sizeof(paddr_t) )
4198 paddr_t full;
4199 unsigned int rc, offset = addr & (sizeof(paddr_t)-1);
4201 /* Align address; read full word. */
4202 addr &= ~(sizeof(paddr_t)-1);
4203 if ( (rc = copy_from_user(&full, (void *)addr, sizeof(paddr_t))) != 0 )
4205 propagate_page_fault(addr+sizeof(paddr_t)-rc, 0); /* read fault */
4206 return X86EMUL_EXCEPTION;
4208 /* Mask out bits provided by caller. */
4209 full &= ~((((paddr_t)1 << (bytes*8)) - 1) << (offset*8));
4210 /* Shift the caller value and OR in the missing bits. */
4211 val &= (((paddr_t)1 << (bytes*8)) - 1);
4212 val <<= (offset)*8;
4213 val |= full;
4214 /* Also fill in missing parts of the cmpxchg old value. */
4215 old &= (((paddr_t)1 << (bytes*8)) - 1);
4216 old <<= (offset)*8;
4217 old |= full;
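/*
 * Worked example (illustrative): a 4-byte write of 0x11223344 to the upper
 * half of an 8-byte PAE entry has bytes = 4, offset = 4. The read-back word
 * 'full' keeps only its low 32 bits, val becomes
 * 0x1122334400000000 | (full & 0xffffffff), and 'old' is widened the same
 * way so the cmpxchg path below compares against a full entry.
 */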
4220 pte = ptwr_ctxt->pte;
4221 mfn = l1e_get_pfn(pte);
4222 page = mfn_to_page(mfn);
4224 /* We are looking only for read-only mappings of p.t. pages. */
4225 ASSERT((l1e_get_flags(pte) & (_PAGE_RW|_PAGE_PRESENT)) == _PAGE_PRESENT);
4226 ASSERT(mfn_valid(mfn));
4227 ASSERT((page->u.inuse.type_info & PGT_type_mask) == PGT_l1_page_table);
4228 ASSERT((page->u.inuse.type_info & PGT_count_mask) != 0);
4229 ASSERT(page_get_owner(page) == d);
4231 /* Check the new PTE. */
4232 nl1e = l1e_from_intpte(val);
4233 if ( unlikely(!get_page_from_l1e(nl1e, d, d)) )
4235 if ( is_pv_32bit_domain(d) && (bytes == 4) && (unaligned_addr & 4) &&
4236 !do_cmpxchg && (l1e_get_flags(nl1e) & _PAGE_PRESENT) )
4238 /*
4239 * If this is an upper-half write to a PAE PTE then we assume that
4240 * the guest has simply got the two writes the wrong way round. We
4241 * zap the PRESENT bit on the assumption that the bottom half will
4242 * be written immediately after we return to the guest.
4243 */
4244 MEM_LOG("ptwr_emulate: fixing up invalid PAE PTE %"PRIpte,
4245 l1e_get_intpte(nl1e));
4246 l1e_remove_flags(nl1e, _PAGE_PRESENT);
4248 else
4250 MEM_LOG("ptwr_emulate: could not get_page_from_l1e()");
4251 return X86EMUL_UNHANDLEABLE;
4255 adjust_guest_l1e(nl1e, d);
4257 /* Checked successfully: do the update (write or cmpxchg). */
4258 pl1e = map_domain_page(mfn);
4259 pl1e = (l1_pgentry_t *)((unsigned long)pl1e + (addr & ~PAGE_MASK));
4260 if ( do_cmpxchg )
4262 int okay;
4263 intpte_t t = old;
4264 ol1e = l1e_from_intpte(old);
4266 okay = paging_cmpxchg_guest_entry(v, &l1e_get_intpte(*pl1e),
4267 &t, l1e_get_intpte(nl1e), _mfn(mfn));
4268 okay = (okay && t == old);
4270 if ( !okay )
4272 unmap_domain_page(pl1e);
4273 put_page_from_l1e(nl1e, d);
4274 return X86EMUL_CMPXCHG_FAILED;
4277 else
4279 ol1e = *pl1e;
4280 if ( !UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, mfn, v, 0) )
4281 BUG();
4284 trace_ptwr_emulation(addr, nl1e);
4286 unmap_domain_page(pl1e);
4288 /* Finally, drop the old PTE. */
4289 put_page_from_l1e(ol1e, d);
4291 return X86EMUL_OKAY;
4294 static int ptwr_emulated_write(
4295 enum x86_segment seg,
4296 unsigned long offset,
4297 void *p_data,
4298 unsigned int bytes,
4299 struct x86_emulate_ctxt *ctxt)
4301 paddr_t val = 0;
4303 if ( (bytes > sizeof(paddr_t)) || (bytes & (bytes -1)) )
4305 MEM_LOG("ptwr_emulate: bad write size (addr=%lx, bytes=%u)",
4306 offset, bytes);
4307 return X86EMUL_UNHANDLEABLE;
4310 memcpy(&val, p_data, bytes);
4312 return ptwr_emulated_update(
4313 offset, 0, val, bytes, 0,
4314 container_of(ctxt, struct ptwr_emulate_ctxt, ctxt));
4317 static int ptwr_emulated_cmpxchg(
4318 enum x86_segment seg,
4319 unsigned long offset,
4320 void *p_old,
4321 void *p_new,
4322 unsigned int bytes,
4323 struct x86_emulate_ctxt *ctxt)
4325 paddr_t old = 0, new = 0;
4327 if ( (bytes > sizeof(paddr_t)) || (bytes & (bytes -1)) )
4329 MEM_LOG("ptwr_emulate: bad cmpxchg size (addr=%lx, bytes=%u)",
4330 offset, bytes);
4331 return X86EMUL_UNHANDLEABLE;
4334 memcpy(&old, p_old, bytes);
4335 memcpy(&new, p_new, bytes);
4337 return ptwr_emulated_update(
4338 offset, old, new, bytes, 1,
4339 container_of(ctxt, struct ptwr_emulate_ctxt, ctxt));
4342 static struct x86_emulate_ops ptwr_emulate_ops = {
4343 .read = ptwr_emulated_read,
4344 .insn_fetch = ptwr_emulated_read,
4345 .write = ptwr_emulated_write,
4346 .cmpxchg = ptwr_emulated_cmpxchg,
4347 };
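/*
 * Descriptive note: ptwr_do_page_fault() below wires these ops into
 * x86_emulate() so that a faulting write to a read-only L1 page-table
 * mapping is decoded and replayed through ptwr_emulated_update(), where the
 * new PTE is validated with get_page_from_l1e() before being installed.
 */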
/* Write page fault handler: check if guest is trying to modify a PTE. */
int ptwr_do_page_fault(struct vcpu *v, unsigned long addr,
                       struct cpu_user_regs *regs)
{
    struct domain *d = v->domain;
    struct page_info *page;
    l1_pgentry_t pte;
    struct ptwr_emulate_ctxt ptwr_ctxt;
    int rc;

    /* Attempt to read the PTE that maps the VA being accessed. */
    guest_get_eff_l1e(v, addr, &pte);

    /* We are looking only for read-only mappings of p.t. pages. */
    if ( ((l1e_get_flags(pte) & (_PAGE_PRESENT|_PAGE_RW)) != _PAGE_PRESENT) ||
         !get_page_from_pagenr(l1e_get_pfn(pte), d) )
        goto bail;

    page = l1e_get_page(pte);
    if ( !page_lock(page) )
    {
        put_page(page);
        goto bail;
    }

    if ( (page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table )
    {
        page_unlock(page);
        put_page(page);
        goto bail;
    }

    ptwr_ctxt.ctxt.regs = regs;
    ptwr_ctxt.ctxt.force_writeback = 0;
    ptwr_ctxt.ctxt.addr_size = ptwr_ctxt.ctxt.sp_size =
        is_pv_32on64_domain(d) ? 32 : BITS_PER_LONG;
    ptwr_ctxt.cr2 = addr;
    ptwr_ctxt.pte = pte;

    rc = x86_emulate(&ptwr_ctxt.ctxt, &ptwr_emulate_ops);

    page_unlock(page);
    put_page(page);

    if ( rc == X86EMUL_UNHANDLEABLE )
        goto bail;

    perfc_incr(ptwr_emulations);
    return EXCRET_fault_fixed;

 bail:
    return 0;
}
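/*
 * Caller sketch (assumption, for illustration only -- the real dispatch
 * lives in the #PF handler, not in this file; the helper name is
 * hypothetical and PFEC_write_access is the architectural write bit of the
 * page-fault error code): emulation is only worth attempting for write
 * faults, since read-only p.t. mappings are exactly what this path repairs.
 */
static inline int example_try_ptwr_fixup(struct vcpu *v, unsigned long addr,
                                         struct cpu_user_regs *regs,
                                         unsigned long error_code)
{
    if ( !(error_code & PFEC_write_access) )
        return 0;
    return ptwr_do_page_fault(v, addr, regs); /* EXCRET_fault_fixed on success */
}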
void free_xen_pagetable(void *v)
{
    extern int early_boot;

    if ( early_boot )
        return;

    if ( is_xen_heap_page(virt_to_page(v)) )
        free_xenheap_page(v);
    else
        free_domheap_page(virt_to_page(v));
}
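/*
 * Illustrative pairing (example only; the helper name is hypothetical):
 * page-table pages for Xen's own mappings come from alloc_xen_pagetable()
 * and are returned via free_xen_pagetable() above, which is a no-op during
 * early boot when the pages still come from boot-time memory.
 */
static inline l1_pgentry_t *example_new_empty_l1_table(void)
{
    l1_pgentry_t *pl1e = alloc_xen_pagetable();

    if ( pl1e != NULL )
        clear_page(pl1e); /* start with every entry not present */
    return pl1e;
}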
/* Convert to/from superpage-mapping flags for map_pages_to_xen(). */
#define l1f_to_lNf(f) (((f) & _PAGE_PRESENT) ? ((f) |  _PAGE_PSE) : (f))
#define lNf_to_l1f(f) (((f) & _PAGE_PRESENT) ? ((f) & ~_PAGE_PSE) : (f))

/*
 * map_pages_to_xen() can be called with interrupts disabled:
 *  * During early bootstrap; or
 *  * alloc_xenheap_pages() via memguard_guard_range().
 * In these cases it is safe to use flush_area_local():
 *  * Because only the local CPU is online; or
 *  * Because stale TLB entries do not matter for memguard_[un]guard_range().
 */
#define flush_area(v,f) (!local_irq_is_enabled() ?              \
                         flush_area_local((const void *)v, f) : \
                         flush_area_all((const void *)v, f))
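/*
 * Worked example (illustrative; not part of the original file, the ASSERTs
 * only document the expected values): l1f_to_lNf() adds _PAGE_PSE to
 * present flag words so they can be used in a superpage (L2/L3) entry,
 * lNf_to_l1f() removes it again, and both leave non-present flag words
 * untouched.
 */
static inline void example_lNf_round_trip(void)
{
    ASSERT(l1f_to_lNf(__PAGE_HYPERVISOR) == (__PAGE_HYPERVISOR | _PAGE_PSE));
    ASSERT(lNf_to_l1f(__PAGE_HYPERVISOR | _PAGE_PSE) == __PAGE_HYPERVISOR);
    ASSERT(l1f_to_lNf(0) == 0); /* non-present flags pass through unchanged */
}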
int map_pages_to_xen(
    unsigned long virt,
    unsigned long mfn,
    unsigned long nr_mfns,
    unsigned int flags)
{
    l2_pgentry_t *pl2e, ol2e;
    l1_pgentry_t *pl1e, ol1e;
    unsigned int  i;

    while ( nr_mfns != 0 )
    {
#ifdef __x86_64__
        l3_pgentry_t *pl3e = virt_to_xen_l3e(virt);
        l3_pgentry_t ol3e = *pl3e;

        if ( cpu_has_page1gb &&
             !(((virt >> PAGE_SHIFT) | mfn) &
               ((1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1)) &&
             nr_mfns >= (1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) &&
             !(flags & (_PAGE_PAT | MAP_SMALL_PAGES)) )
        {
            /* 1GB-page mapping. */
            l3e_write_atomic(pl3e, l3e_from_pfn(mfn, l1f_to_lNf(flags)));

            if ( (l3e_get_flags(ol3e) & _PAGE_PRESENT) )
            {
                unsigned int flush_flags =
                    FLUSH_TLB | FLUSH_ORDER(2 * PAGETABLE_ORDER);

                if ( l3e_get_flags(ol3e) & _PAGE_PSE )
                {
                    if ( l3e_get_flags(ol3e) & _PAGE_GLOBAL )
                        flush_flags |= FLUSH_TLB_GLOBAL;
                    if ( (lNf_to_l1f(l3e_get_flags(ol3e)) ^ flags) &
                         PAGE_CACHE_ATTRS )
                        flush_flags |= FLUSH_CACHE;
                    flush_area(virt, flush_flags);
                }
                else
                {
                    pl2e = l3e_to_l2e(ol3e);
                    for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
                    {
                        ol2e = pl2e[i];
                        if ( !(l2e_get_flags(ol2e) & _PAGE_PRESENT) )
                            continue;
                        if ( l2e_get_flags(ol2e) & _PAGE_PSE )
                        {
                            if ( l2e_get_flags(ol2e) & _PAGE_GLOBAL )
                                flush_flags |= FLUSH_TLB_GLOBAL;
                            if ( (lNf_to_l1f(l2e_get_flags(ol2e)) ^ flags) &
                                 PAGE_CACHE_ATTRS )
                                flush_flags |= FLUSH_CACHE;
                        }
                        else
                        {
                            unsigned int j;

                            pl1e = l2e_to_l1e(ol2e);
                            for ( j = 0; j < L1_PAGETABLE_ENTRIES; j++ )
                            {
                                ol1e = pl1e[j];
                                if ( l1e_get_flags(ol1e) & _PAGE_GLOBAL )
                                    flush_flags |= FLUSH_TLB_GLOBAL;
                                if ( (l1e_get_flags(ol1e) ^ flags) &
                                     PAGE_CACHE_ATTRS )
                                    flush_flags |= FLUSH_CACHE;
                            }
                        }
                    }
                    flush_area(virt, flush_flags);
                    for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
                    {
                        ol2e = pl2e[i];
                        if ( (l2e_get_flags(ol2e) & _PAGE_PRESENT) &&
                             !(l2e_get_flags(ol2e) & _PAGE_PSE) )
                            free_xen_pagetable(l2e_to_l1e(ol2e));
                    }
                    free_xen_pagetable(pl2e);
                }
            }

            virt    += 1UL << L3_PAGETABLE_SHIFT;
            mfn     += 1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT);
            nr_mfns -= 1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT);
            continue;
        }

        if ( (l3e_get_flags(ol3e) & _PAGE_PRESENT) &&
             (l3e_get_flags(ol3e) & _PAGE_PSE) )
        {
            unsigned int flush_flags =
                FLUSH_TLB | FLUSH_ORDER(2 * PAGETABLE_ORDER);

            /* Skip this PTE if there is no change. */
            if ( ((l3e_get_pfn(ol3e) & ~(L2_PAGETABLE_ENTRIES *
                                         L1_PAGETABLE_ENTRIES - 1)) +
                  (l2_table_offset(virt) << PAGETABLE_ORDER) +
                  l1_table_offset(virt) == mfn) &&
                 ((lNf_to_l1f(l3e_get_flags(ol3e)) ^ flags) &
                  ~(_PAGE_ACCESSED|_PAGE_DIRTY)) == 0 )
            {
                /* We can skip to end of L3 superpage if we got a match. */
                i = (1 << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) -
                    (mfn & ((1 << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1));
                if ( i > nr_mfns )
                    i = nr_mfns;
                virt    += i << PAGE_SHIFT;
                mfn     += i;
                nr_mfns -= i;
                continue;
            }

            pl2e = alloc_xen_pagetable();
            if ( pl2e == NULL )
                return -ENOMEM;

            for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
                l2e_write(pl2e + i,
                          l2e_from_pfn(l3e_get_pfn(ol3e) +
                                       (i << PAGETABLE_ORDER),
                                       l3e_get_flags(ol3e)));

            if ( l3e_get_flags(ol3e) & _PAGE_GLOBAL )
                flush_flags |= FLUSH_TLB_GLOBAL;

            l3e_write_atomic(pl3e, l3e_from_pfn(virt_to_mfn(pl2e),
                                                __PAGE_HYPERVISOR));
            flush_area(virt, flush_flags);
        }
#endif

        pl2e = virt_to_xen_l2e(virt);

        if ( ((((virt>>PAGE_SHIFT) | mfn) & ((1<<PAGETABLE_ORDER)-1)) == 0) &&
             (nr_mfns >= (1<<PAGETABLE_ORDER)) &&
             !(flags & (_PAGE_PAT|MAP_SMALL_PAGES)) )
        {
            /* Super-page mapping. */
            ol2e = *pl2e;
            l2e_write_atomic(pl2e, l2e_from_pfn(mfn, l1f_to_lNf(flags)));

            if ( (l2e_get_flags(ol2e) & _PAGE_PRESENT) )
            {
                unsigned int flush_flags =
                    FLUSH_TLB | FLUSH_ORDER(PAGETABLE_ORDER);

                if ( l2e_get_flags(ol2e) & _PAGE_PSE )
                {
                    if ( l2e_get_flags(ol2e) & _PAGE_GLOBAL )
                        flush_flags |= FLUSH_TLB_GLOBAL;
                    if ( (lNf_to_l1f(l2e_get_flags(ol2e)) ^ flags) &
                         PAGE_CACHE_ATTRS )
                        flush_flags |= FLUSH_CACHE;
                    flush_area(virt, flush_flags);
                }
                else
                {
                    pl1e = l2e_to_l1e(ol2e);
                    for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
                    {
                        if ( l1e_get_flags(pl1e[i]) & _PAGE_GLOBAL )
                            flush_flags |= FLUSH_TLB_GLOBAL;
                        if ( (l1e_get_flags(pl1e[i]) ^ flags) &
                             PAGE_CACHE_ATTRS )
                            flush_flags |= FLUSH_CACHE;
                    }
                    flush_area(virt, flush_flags);
                    free_xen_pagetable(pl1e);
                }
            }

            virt    += 1UL << L2_PAGETABLE_SHIFT;
            mfn     += 1UL << PAGETABLE_ORDER;
            nr_mfns -= 1UL << PAGETABLE_ORDER;
        }
        else
        {
            /* Normal page mapping. */
            if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) )
            {
                pl1e = alloc_xen_pagetable();
                if ( pl1e == NULL )
                    return -ENOMEM;
                clear_page(pl1e);
                l2e_write(pl2e, l2e_from_pfn(virt_to_mfn(pl1e),
                                             __PAGE_HYPERVISOR));
            }
            else if ( l2e_get_flags(*pl2e) & _PAGE_PSE )
            {
                unsigned int flush_flags =
                    FLUSH_TLB | FLUSH_ORDER(PAGETABLE_ORDER);

                /* Skip this PTE if there is no change. */
                if ( (((l2e_get_pfn(*pl2e) & ~(L1_PAGETABLE_ENTRIES - 1)) +
                       l1_table_offset(virt)) == mfn) &&
                     (((lNf_to_l1f(l2e_get_flags(*pl2e)) ^ flags) &
                       ~(_PAGE_ACCESSED|_PAGE_DIRTY)) == 0) )
                {
                    /* We can skip to end of L2 superpage if we got a match. */
                    i = (1 << (L2_PAGETABLE_SHIFT - PAGE_SHIFT)) -
                        (mfn & ((1 << (L2_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1));
                    if ( i > nr_mfns )
                        i = nr_mfns;
                    virt    += i << L1_PAGETABLE_SHIFT;
                    mfn     += i;
                    nr_mfns -= i;
                    goto check_l3;
                }

                pl1e = alloc_xen_pagetable();
                if ( pl1e == NULL )
                    return -ENOMEM;

                for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
                    l1e_write(&pl1e[i],
                              l1e_from_pfn(l2e_get_pfn(*pl2e) + i,
                                           lNf_to_l1f(l2e_get_flags(*pl2e))));

                if ( l2e_get_flags(*pl2e) & _PAGE_GLOBAL )
                    flush_flags |= FLUSH_TLB_GLOBAL;

                l2e_write_atomic(pl2e, l2e_from_pfn(virt_to_mfn(pl1e),
                                                    __PAGE_HYPERVISOR));
                flush_area(virt, flush_flags);
            }

            pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(virt);
            ol1e = *pl1e;
            l1e_write_atomic(pl1e, l1e_from_pfn(mfn, flags));
            if ( (l1e_get_flags(ol1e) & _PAGE_PRESENT) )
            {
                unsigned int flush_flags = FLUSH_TLB | FLUSH_ORDER(0);
                if ( l1e_get_flags(ol1e) & _PAGE_GLOBAL )
                    flush_flags |= FLUSH_TLB_GLOBAL;
                if ( (l1e_get_flags(ol1e) ^ flags) & PAGE_CACHE_ATTRS )
                    flush_flags |= FLUSH_CACHE;
                flush_area(virt, flush_flags);
            }

            virt    += 1UL << L1_PAGETABLE_SHIFT;
            mfn     += 1UL;
            nr_mfns -= 1UL;

            if ( (flags == PAGE_HYPERVISOR) &&
                 ((nr_mfns == 0) ||
                  ((((virt >> PAGE_SHIFT) | mfn) &
                    ((1 << PAGETABLE_ORDER) - 1)) == 0)) )
            {
                unsigned long base_mfn;
                pl1e = l2e_to_l1e(*pl2e);
                base_mfn = l1e_get_pfn(*pl1e) & ~(L1_PAGETABLE_ENTRIES - 1);
                for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++, pl1e++ )
                    if ( (l1e_get_pfn(*pl1e) != (base_mfn + i)) ||
                         (l1e_get_flags(*pl1e) != flags) )
                        break;
                if ( i == L1_PAGETABLE_ENTRIES )
                {
                    ol2e = *pl2e;
                    l2e_write_atomic(pl2e, l2e_from_pfn(base_mfn,
                                                        l1f_to_lNf(flags)));
                    flush_area(virt - PAGE_SIZE,
                               FLUSH_TLB_GLOBAL |
                               FLUSH_ORDER(PAGETABLE_ORDER));
                    free_xen_pagetable(l2e_to_l1e(ol2e));
                }
            }
        }

 check_l3: ;
#ifdef __x86_64__
        if ( cpu_has_page1gb &&
             (flags == PAGE_HYPERVISOR) &&
             ((nr_mfns == 0) ||
              !(((virt >> PAGE_SHIFT) | mfn) &
                ((1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1))) )
        {
            unsigned long base_mfn;

            ol3e = *pl3e;
            pl2e = l3e_to_l2e(ol3e);
            base_mfn = l2e_get_pfn(*pl2e) & ~(L2_PAGETABLE_ENTRIES *
                                              L1_PAGETABLE_ENTRIES - 1);
            for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++, pl2e++ )
                if ( (l2e_get_pfn(*pl2e) !=
                      (base_mfn + (i << PAGETABLE_ORDER))) ||
                     (l2e_get_flags(*pl2e) != l1f_to_lNf(flags)) )
                    break;
            if ( i == L2_PAGETABLE_ENTRIES )
            {
                l3e_write_atomic(pl3e, l3e_from_pfn(base_mfn,
                                                    l1f_to_lNf(flags)));
                flush_area(virt - PAGE_SIZE,
                           FLUSH_TLB_GLOBAL |
                           FLUSH_ORDER(2*PAGETABLE_ORDER));
                free_xen_pagetable(l3e_to_l2e(ol3e));
            }
        }
#endif
    }

    return 0;
}
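/*
 * Usage sketch (illustrative; the wrapper name is hypothetical): callers
 * pass a Xen virtual address, the first machine frame number and a frame
 * count.  Unless MAP_SMALL_PAGES or a PAT-based cache attribute is given,
 * the function above picks 4kB, 2MB or (with cpu_has_page1gb) 1GB mappings
 * by itself, and later re-coalesces shattered superpages when it can.
 */
static inline int example_map_range(unsigned long va, unsigned long first_mfn,
                                    unsigned long nr_frames)
{
    return map_pages_to_xen(va, first_mfn, nr_frames, PAGE_HYPERVISOR);
}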
void destroy_xen_mappings(unsigned long s, unsigned long e)
{
    l2_pgentry_t *pl2e;
    l1_pgentry_t *pl1e;
    unsigned int  i;
    unsigned long v = s;

    ASSERT((s & ~PAGE_MASK) == 0);
    ASSERT((e & ~PAGE_MASK) == 0);

    while ( v < e )
    {
#ifdef __x86_64__
        l3_pgentry_t *pl3e = virt_to_xen_l3e(v);

        if ( !(l3e_get_flags(*pl3e) & _PAGE_PRESENT) )
        {
            v += 1UL << L3_PAGETABLE_SHIFT;
            v &= ~((1UL << L3_PAGETABLE_SHIFT) - 1);
            continue;
        }

        if ( l3e_get_flags(*pl3e) & _PAGE_PSE )
        {
            if ( l2_table_offset(v) == 0 &&
                 l1_table_offset(v) == 0 &&
                 ((e - v) >= (1UL << L3_PAGETABLE_SHIFT)) )
            {
                /* PAGE1GB: whole superpage is destroyed. */
                l3e_write_atomic(pl3e, l3e_empty());
                v += 1UL << L3_PAGETABLE_SHIFT;
                continue;
            }

            /* PAGE1GB: shatter the superpage and fall through. */
            pl2e = alloc_xen_pagetable();
            for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
                l2e_write(pl2e + i,
                          l2e_from_pfn(l3e_get_pfn(*pl3e) +
                                       (i << PAGETABLE_ORDER),
                                       l3e_get_flags(*pl3e)));
            l3e_write_atomic(pl3e, l3e_from_pfn(virt_to_mfn(pl2e),
                                                __PAGE_HYPERVISOR));
        }
#endif

        pl2e = virt_to_xen_l2e(v);

        if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) )
        {
            v += 1UL << L2_PAGETABLE_SHIFT;
            v &= ~((1UL << L2_PAGETABLE_SHIFT) - 1);
            continue;
        }

        if ( l2e_get_flags(*pl2e) & _PAGE_PSE )
        {
            if ( (l1_table_offset(v) == 0) &&
                 ((e-v) >= (1UL << L2_PAGETABLE_SHIFT)) )
            {
                /* PSE: whole superpage is destroyed. */
                l2e_write_atomic(pl2e, l2e_empty());
                v += 1UL << L2_PAGETABLE_SHIFT;
            }
            else
            {
                /* PSE: shatter the superpage and try again. */
                pl1e = alloc_xen_pagetable();
                for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
                    l1e_write(&pl1e[i],
                              l1e_from_pfn(l2e_get_pfn(*pl2e) + i,
                                           l2e_get_flags(*pl2e) & ~_PAGE_PSE));
                l2e_write_atomic(pl2e, l2e_from_pfn(virt_to_mfn(pl1e),
                                                    __PAGE_HYPERVISOR));
            }
        }
        else
        {
            /* Ordinary 4kB mapping. */
            pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(v);
            l1e_write_atomic(pl1e, l1e_empty());
            v += PAGE_SIZE;

            /* If we are done with the L2E, check if it is now empty. */
            if ( (v != e) && (l1_table_offset(v) != 0) )
                continue;
            pl1e = l2e_to_l1e(*pl2e);
            for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
                if ( l1e_get_intpte(pl1e[i]) != 0 )
                    break;
            if ( i == L1_PAGETABLE_ENTRIES )
            {
                /* Empty: zap the L2E and free the L1 page. */
                l2e_write_atomic(pl2e, l2e_empty());
                flush_area(NULL, FLUSH_TLB_GLOBAL); /* flush before free */
                free_xen_pagetable(pl1e);
            }
        }

#ifdef __x86_64__
        /* If we are done with the L3E, check if it is now empty. */
        if ( (v != e) && (l2_table_offset(v) + l1_table_offset(v) != 0) )
            continue;
        pl2e = l3e_to_l2e(*pl3e);
        for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
            if ( l2e_get_intpte(pl2e[i]) != 0 )
                break;
        if ( i == L2_PAGETABLE_ENTRIES )
        {
            /* Empty: zap the L3E and free the L2 page. */
            l3e_write_atomic(pl3e, l3e_empty());
            flush_area(NULL, FLUSH_TLB_GLOBAL); /* flush before free */
            free_xen_pagetable(pl2e);
        }
#endif
    }

    flush_area(NULL, FLUSH_TLB_GLOBAL);
}
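/*
 * Usage sketch (illustrative; the wrapper name is hypothetical): both bounds
 * must be page-aligned (ASSERTed above).  Superpages wholly inside the range
 * are zapped in one go; ones that straddle a boundary are shattered into
 * smaller mappings first, and emptied L1/L2 tables are freed on the way out.
 */
static inline void example_unmap_range(unsigned long va,
                                       unsigned long nr_frames)
{
    destroy_xen_mappings(va, va + (nr_frames << PAGE_SHIFT));
}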
void __set_fixmap(
    enum fixed_addresses idx, unsigned long mfn, unsigned long flags)
{
    BUG_ON(idx >= __end_of_fixed_addresses);
    map_pages_to_xen(fix_to_virt(idx), mfn, 1, flags);
}
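/*
 * Usage sketch (illustrative): fixmap slots are compile-time indices, so a
 * caller maps a known frame at its fixed virtual address and then refers to
 * it via fix_to_virt().  'some_mfn' is a placeholder, and FIX_APIC_BASE and
 * PAGE_HYPERVISOR_NOCACHE are used here only on the assumption that the
 * fixmap enumeration and page-flag definitions provide them.
 */
static inline void example_set_fixmap_slot(unsigned long some_mfn)
{
    __set_fixmap(FIX_APIC_BASE, some_mfn, PAGE_HYPERVISOR_NOCACHE);
    /* The frame is now accessible at fix_to_virt(FIX_APIC_BASE). */
}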
#ifdef MEMORY_GUARD

void memguard_init(void)
{
    unsigned long start = max_t(unsigned long, xen_phys_start, 1UL << 20);
#ifdef __i386__
    map_pages_to_xen(
        (unsigned long)__va(start),
        start >> PAGE_SHIFT,
        (xenheap_phys_end - start) >> PAGE_SHIFT,
        __PAGE_HYPERVISOR|MAP_SMALL_PAGES);
#else
    map_pages_to_xen(
        (unsigned long)__va(start),
        start >> PAGE_SHIFT,
        (__pa(&_end) + PAGE_SIZE - 1 - start) >> PAGE_SHIFT,
        __PAGE_HYPERVISOR|MAP_SMALL_PAGES);
    BUG_ON(start != xen_phys_start);
    map_pages_to_xen(
        XEN_VIRT_START,
        start >> PAGE_SHIFT,
        (__pa(&_end) + PAGE_SIZE - 1 - start) >> PAGE_SHIFT,
        __PAGE_HYPERVISOR|MAP_SMALL_PAGES);
#endif
}
static void __memguard_change_range(void *p, unsigned long l, int guard)
{
    unsigned long _p = (unsigned long)p;
    unsigned long _l = (unsigned long)l;
    unsigned int flags = __PAGE_HYPERVISOR | MAP_SMALL_PAGES;

    /* Ensure we are dealing with a page-aligned whole number of pages. */
    ASSERT((_p&~PAGE_MASK) == 0);
    ASSERT((_l&~PAGE_MASK) == 0);

    if ( guard )
        flags &= ~_PAGE_PRESENT;

    map_pages_to_xen(
        _p, virt_to_maddr(p) >> PAGE_SHIFT, _l >> PAGE_SHIFT, flags);
}

void memguard_guard_range(void *p, unsigned long l)
{
    __memguard_change_range(p, l, 1);
}

void memguard_unguard_range(void *p, unsigned long l)
{
    __memguard_change_range(p, l, 0);
}

#endif
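/*
 * Usage sketch (illustrative; the helper name is hypothetical): guarding a
 * region simply remaps it without _PAGE_PRESENT, so any stray access faults
 * immediately; unguarding restores the normal mapping.  Both the pointer
 * and the length must be page-aligned.
 */
static inline void example_trap_accesses(void *buf, unsigned long bytes)
{
    memguard_guard_range(buf, bytes);   /* region becomes a tripwire */
    /* ... nothing may touch 'buf' here ... */
    memguard_unguard_range(buf, bytes); /* region is usable again */
}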
void memguard_guard_stack(void *p)
{
    BUILD_BUG_ON((PRIMARY_STACK_SIZE + PAGE_SIZE) > STACK_SIZE);
    p = (void *)((unsigned long)p + STACK_SIZE -
                 PRIMARY_STACK_SIZE - PAGE_SIZE);
    memguard_guard_range(p, PAGE_SIZE);
}
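/*
 * Layout sketch (illustrative; the helper name is hypothetical): within a
 * per-CPU stack allocation of STACK_SIZE bytes, the top PRIMARY_STACK_SIZE
 * bytes form the primary stack, and the single page immediately below it is
 * the one guarded above, so a primary-stack overflow faults instead of
 * silently corrupting whatever sits lower in the allocation.
 */
static inline void *example_guard_page_of(void *stack_base)
{
    return (void *)((unsigned long)stack_base + STACK_SIZE -
                    PRIMARY_STACK_SIZE - PAGE_SIZE);
}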
/*
 * Local variables:
 * mode: C
 * c-set-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */