ia64/xen-unstable: xen/arch/x86/mm.c @ changeset 18806:ed8524f4a044

x86: Re-initialise HPET on resume from S3

Signed-off-by: Guanqun Lu <guanqun.lu@intel.com>
Signed-off-by: Kevin Tian <kevin.tian@intel.com>
Signed-off-by: Keir Fraser <keir.fraser@citrix.com>

author:   Keir Fraser <keir.fraser@citrix.com>
date:     Tue Nov 18 15:55:14 2008 +0000
parents:  6b6610c115fc
children: f09a1d5d4338
1 /******************************************************************************
2 * arch/x86/mm.c
3 *
4 * Copyright (c) 2002-2005 K A Fraser
5 * Copyright (c) 2004 Christian Limpach
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
22 /*
23 * A description of the x86 page table API:
24 *
25 * Domains trap to do_mmu_update with a list of update requests.
26 * This is a list of (ptr, val) pairs, where the requested operation
27 * is *ptr = val.
28 *
29 * Reference counting of pages:
30 * ----------------------------
31 * Each page has two refcounts: tot_count and type_count.
32 *
33 * TOT_COUNT is the obvious reference count. It counts all uses of a
34 * physical page frame by a domain, including uses as a page directory,
35 * a page table, or simple mappings via a PTE. This count prevents a
36 * domain from releasing a frame back to the free pool when it still holds
37 * a reference to it.
38 *
39 * TYPE_COUNT is more subtle. A frame can be put to one of three
40 * mutually-exclusive uses: it might be used as a page directory, or a
41 * page table, or it may be mapped writable by the domain [of course, a
42 * frame may be used in none of these three ways!].
43 * So, type_count is a count of the number of times a frame is being
44 * referred to in its current incarnation. Therefore, a page can only
45 * change its type when its type count is zero.
46 *
47 * Pinning the page type:
48 * ----------------------
49 * The type of a page can be pinned/unpinned with the commands
50 * MMUEXT_[UN]PIN_L?_TABLE. Each page can be pinned exactly once (that is,
51 * pinning is not reference counted, so it can't be nested).
52 * This is useful to prevent a page's type count falling to zero, at which
53 * point safety checks would need to be carried out next time the count
54 * is increased again.
55 *
56 * A further note on writable page mappings:
57 * -----------------------------------------
58 * For simplicity, the count of writable mappings for a page may not
59 * correspond to reality. The 'writable count' is incremented for every
60 * PTE which maps the page with the _PAGE_RW flag set. However, for
61 * write access to be possible the page directory entry must also have
62 * its _PAGE_RW bit set. We do not check this as it complicates the
63 * reference counting considerably [consider the case of multiple
64 * directory entries referencing a single page table, some with the RW
65 * bit set, others not -- it starts getting a bit messy].
66 * In normal use, this simplification shouldn't be a problem.
67 * However, the logic can be added if required.
68 *
69 * One more note on read-only page mappings:
70 * -----------------------------------------
71 * We want domains to be able to map pages for read-only access. The
72 * main reason is that page tables and directories should be readable
73 * by a domain, but it would not be safe for them to be writable.
74 * However, domains have free access to rings 1 & 2 of the Intel
75 * privilege model. In terms of page protection, these are considered
76 * to be part of 'supervisor mode'. The WP bit in CR0 controls whether
77 * read-only restrictions are respected in supervisor mode -- if the
78 * bit is clear then any mapped page is writable.
79 *
80 * We get round this by always setting the WP bit and disallowing
81 * updates to it. This is very unlikely to cause a problem for guest
82 * OS's, which will generally use the WP bit to simplify copy-on-write
83 * implementation (in that case, OS wants a fault when it writes to
84 * an application-supplied buffer).
85 */
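/*
 * Illustrative, non-authoritative sketch (kept out of the build with #if 0):
 * how a PV guest might exercise the interface described above, first pinning
 * a freshly built L1 table and then batching a '*ptr = val' request through
 * do_mmu_update(). HYPERVISOR_mmuext_op()/HYPERVISOR_mmu_update() are the
 * guest kernel's own hypercall wrappers (not defined in this file), and
 * 'l1_mfn', 'pte_maddr' and 'new_pte' are hypothetical inputs.
 */
#if 0
static int example_pin_and_update(xen_pfn_t l1_mfn,
                                  uint64_t pte_maddr,
                                  uint64_t new_pte)
{
    struct mmuext_op pin;
    struct mmu_update req;
    int rc;

    /* Pin the frame as an L1 table so its type count cannot fall to zero. */
    pin.cmd = MMUEXT_PIN_L1_TABLE;
    pin.arg1.mfn = l1_mfn;
    rc = HYPERVISOR_mmuext_op(&pin, 1, NULL, DOMID_SELF);
    if ( rc )
        return rc;

    /* One (ptr, val) pair: machine address of the PTE and its new contents. */
    req.ptr = pte_maddr | MMU_NORMAL_PT_UPDATE;
    req.val = new_pte;
    return HYPERVISOR_mmu_update(&req, 1, NULL, DOMID_SELF);
}
#endif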
87 #include <xen/config.h>
88 #include <xen/init.h>
89 #include <xen/kernel.h>
90 #include <xen/lib.h>
91 #include <xen/mm.h>
92 #include <xen/domain.h>
93 #include <xen/sched.h>
94 #include <xen/errno.h>
95 #include <xen/perfc.h>
96 #include <xen/irq.h>
97 #include <xen/softirq.h>
98 #include <xen/domain_page.h>
99 #include <xen/event.h>
100 #include <xen/iocap.h>
101 #include <xen/guest_access.h>
102 #include <asm/paging.h>
103 #include <asm/shadow.h>
104 #include <asm/page.h>
105 #include <asm/flushtlb.h>
106 #include <asm/io.h>
107 #include <asm/ldt.h>
108 #include <asm/x86_emulate.h>
109 #include <asm/e820.h>
110 #include <asm/hypercall.h>
111 #include <asm/shared.h>
112 #include <public/memory.h>
113 #include <xsm/xsm.h>
114 #include <xen/trace.h>
116 /*
117 * Mapping of first 2 or 4 megabytes of memory. This is mapped with 4kB
118 * mappings to avoid type conflicts with fixed-range MTRRs covering the
119 * lowest megabyte of physical memory. In any case the VGA hole should be
120 * mapped with type UC.
121 */
122 l1_pgentry_t __attribute__ ((__section__ (".bss.page_aligned")))
123 l1_identmap[L1_PAGETABLE_ENTRIES];
125 #define MEM_LOG(_f, _a...) gdprintk(XENLOG_WARNING , _f "\n" , ## _a)
127 /*
128 * PTE updates can be done with ordinary writes except:
129 * 1. Debug builds get extra checking by using CMPXCHG[8B].
130 * 2. PAE builds perform an atomic 8-byte store with CMPXCHG8B.
131 */
132 #if !defined(NDEBUG) || defined(__i386__)
133 #define PTE_UPDATE_WITH_CMPXCHG
134 #endif
136 /* Used to defer flushing of memory structures. */
137 struct percpu_mm_info {
138 #define DOP_FLUSH_TLB (1<<0) /* Flush the local TLB. */
139 #define DOP_FLUSH_ALL_TLBS (1<<1) /* Flush TLBs of all VCPUs of current dom. */
140 #define DOP_RELOAD_LDT (1<<2) /* Reload the LDT shadow mapping. */
141 unsigned int deferred_ops;
142 /* If non-NULL, specifies a foreign subject domain for some operations. */
143 struct domain *foreign;
144 };
145 static DEFINE_PER_CPU(struct percpu_mm_info, percpu_mm_info);
147 /*
148 * Returns the current foreign domain; defaults to the currently-executing
149 * domain if a foreign override hasn't been specified.
150 */
151 #define FOREIGNDOM (this_cpu(percpu_mm_info).foreign ?: current->domain)
153 /* Private domain structs for DOMID_XEN and DOMID_IO. */
154 struct domain *dom_xen, *dom_io;
156 /* Frame table and its size in pages. */
157 struct page_info *frame_table;
158 unsigned long max_page;
159 unsigned long total_pages;
161 #define PAGE_CACHE_ATTRS (_PAGE_PAT|_PAGE_PCD|_PAGE_PWT)
163 int opt_allow_hugepage;
164 boolean_param("allowhugepage", opt_allow_hugepage);
166 #define l1_disallow_mask(d) \
167 ((d != dom_io) && \
168 (rangeset_is_empty((d)->iomem_caps) && \
169 rangeset_is_empty((d)->arch.ioport_caps) && \
170 !has_arch_pdevs(d)) ? \
171 L1_DISALLOW_MASK : (L1_DISALLOW_MASK & ~PAGE_CACHE_ATTRS))
173 #ifdef CONFIG_COMPAT
174 l2_pgentry_t *compat_idle_pg_table_l2 = NULL;
175 #define l3_disallow_mask(d) (!is_pv_32on64_domain(d) ? \
176 L3_DISALLOW_MASK : \
177 COMPAT_L3_DISALLOW_MASK)
178 #else
179 #define l3_disallow_mask(d) L3_DISALLOW_MASK
180 #endif
182 static void queue_deferred_ops(struct domain *d, unsigned int ops)
183 {
184 ASSERT(d == current->domain);
185 this_cpu(percpu_mm_info).deferred_ops |= ops;
186 }
188 void __init init_frametable(void)
189 {
190 unsigned long nr_pages, page_step, i, mfn;
192 frame_table = (struct page_info *)FRAMETABLE_VIRT_START;
194 nr_pages = PFN_UP(max_page * sizeof(*frame_table));
195 page_step = (1 << L2_PAGETABLE_SHIFT) >> PAGE_SHIFT;
197 for ( i = 0; i < nr_pages; i += page_step )
198 {
199 mfn = alloc_boot_pages(min(nr_pages - i, page_step), page_step);
200 if ( mfn == 0 )
201 panic("Not enough memory for frame table\n");
202 map_pages_to_xen(
203 FRAMETABLE_VIRT_START + (i << PAGE_SHIFT),
204 mfn, page_step, PAGE_HYPERVISOR);
205 }
207 memset(frame_table, 0, nr_pages << PAGE_SHIFT);
209 #if defined(__x86_64__)
210 for ( i = 0; i < max_page; i ++ )
211 spin_lock_init(&frame_table[i].lock);
212 #endif
213 }
215 void __init arch_init_memory(void)
216 {
217 extern void subarch_init_memory(void);
219 unsigned long i, pfn, rstart_pfn, rend_pfn, iostart_pfn, ioend_pfn;
221 /*
222 * Initialise our DOMID_XEN domain.
223 * Any Xen-heap pages that we will allow to be mapped will have
224 * their domain field set to dom_xen.
225 */
226 dom_xen = domain_create(DOMID_XEN, DOMCRF_dummy, 0);
227 BUG_ON(dom_xen == NULL);
229 /*
230 * Initialise our DOMID_IO domain.
231 * This domain owns I/O pages that are within the range of the page_info
232 * array. Mappings occur at the privilege level of the caller.
233 */
234 dom_io = domain_create(DOMID_IO, DOMCRF_dummy, 0);
235 BUG_ON(dom_io == NULL);
237 /* First 1MB of RAM is historically marked as I/O. */
238 for ( i = 0; i < 0x100; i++ )
239 share_xen_page_with_guest(mfn_to_page(i), dom_io, XENSHARE_writable);
241 /* Any areas not specified as RAM by the e820 map are considered I/O. */
242 for ( i = 0, pfn = 0; pfn < max_page; i++ )
243 {
244 while ( (i < e820.nr_map) &&
245 (e820.map[i].type != E820_RAM) &&
246 (e820.map[i].type != E820_UNUSABLE) )
247 i++;
249 if ( i >= e820.nr_map )
250 {
251 /* No more RAM regions: mark as I/O right to end of memory map. */
252 rstart_pfn = rend_pfn = max_page;
253 }
254 else
255 {
256 /* Mark as I/O just up to the next RAM region. */
257 rstart_pfn = min_t(unsigned long, max_page,
258 PFN_UP(e820.map[i].addr));
259 rend_pfn = max_t(unsigned long, rstart_pfn,
260 PFN_DOWN(e820.map[i].addr + e820.map[i].size));
261 }
263 /*
264 * Make sure any Xen mappings of RAM holes above 1MB are blown away.
265 * In particular this ensures that RAM holes are respected even in
266 * the statically-initialised 1-16MB mapping area.
267 */
268 iostart_pfn = max_t(unsigned long, pfn, 1UL << (20 - PAGE_SHIFT));
269 ioend_pfn = rstart_pfn;
270 #if defined(CONFIG_X86_32)
271 ioend_pfn = min_t(unsigned long, ioend_pfn,
272 DIRECTMAP_MBYTES << (20 - PAGE_SHIFT));
273 #endif
274 if ( iostart_pfn < ioend_pfn )
275 destroy_xen_mappings((unsigned long)mfn_to_virt(iostart_pfn),
276 (unsigned long)mfn_to_virt(ioend_pfn));
278 /* Mark as I/O up to next RAM region. */
279 for ( ; pfn < rstart_pfn; pfn++ )
280 {
281 BUG_ON(!mfn_valid(pfn));
282 share_xen_page_with_guest(
283 mfn_to_page(pfn), dom_io, XENSHARE_writable);
284 }
286 /* Skip the RAM region. */
287 pfn = rend_pfn;
288 }
290 subarch_init_memory();
291 }
293 int memory_is_conventional_ram(paddr_t p)
294 {
295 int i;
297 for ( i = 0; i < e820.nr_map; i++ )
298 {
299 if ( (e820.map[i].type == E820_RAM) &&
300 (e820.map[i].addr <= p) &&
301 (e820.map[i].size > p) )
302 return 1;
303 }
305 return 0;
306 }
308 unsigned long domain_get_maximum_gpfn(struct domain *d)
309 {
310 if ( is_hvm_domain(d) )
311 return d->arch.p2m->max_mapped_pfn;
312 /* NB. PV guests specify nr_pfns rather than max_pfn so we adjust here. */
313 return arch_get_max_pfn(d) - 1;
314 }
316 void share_xen_page_with_guest(
317 struct page_info *page, struct domain *d, int readonly)
318 {
319 if ( page_get_owner(page) == d )
320 return;
322 set_gpfn_from_mfn(page_to_mfn(page), INVALID_M2P_ENTRY);
324 spin_lock(&d->page_alloc_lock);
326 /* The incremented type count pins as writable or read-only. */
327 page->u.inuse.type_info = (readonly ? PGT_none : PGT_writable_page);
328 page->u.inuse.type_info |= PGT_validated | 1;
330 page_set_owner(page, d);
331 wmb(); /* install valid domain ptr before updating refcnt. */
332 ASSERT(page->count_info == 0);
334 /* Only add to the allocation list if the domain isn't dying. */
335 if ( !d->is_dying )
336 {
337 page->count_info |= PGC_allocated | 1;
338 if ( unlikely(d->xenheap_pages++ == 0) )
339 get_knownalive_domain(d);
340 list_add_tail(&page->list, &d->xenpage_list);
341 }
343 spin_unlock(&d->page_alloc_lock);
344 }
346 void share_xen_page_with_privileged_guests(
347 struct page_info *page, int readonly)
348 {
349 share_xen_page_with_guest(page, dom_xen, readonly);
350 }
352 #if defined(__i386__)
354 #ifdef NDEBUG
355 /* Only PDPTs above 4GB boundary need to be shadowed in low memory. */
356 #define l3tab_needs_shadow(mfn) ((mfn) >= 0x100000)
357 #else
358 /*
359 * In debug builds we shadow a selection of <4GB PDPTs to exercise code paths.
360 * We cannot safely shadow the idle page table, nor shadow (v1) page tables
361 * (detected by lack of an owning domain). As required for correctness, we
362 * always shadow PDPTs above 4GB.
363 */
364 #define l3tab_needs_shadow(mfn) \
365 (((((mfn) << PAGE_SHIFT) != __pa(idle_pg_table)) && \
366 (page_get_owner(mfn_to_page(mfn)) != NULL) && \
367 ((mfn) & 1)) || /* odd MFNs are shadowed */ \
368 ((mfn) >= 0x100000))
369 #endif
371 static l1_pgentry_t *fix_pae_highmem_pl1e;
373 /* Cache the address of PAE high-memory fixmap page tables. */
374 static int __init cache_pae_fixmap_address(void)
375 {
376 unsigned long fixmap_base = fix_to_virt(FIX_PAE_HIGHMEM_0);
377 l2_pgentry_t *pl2e = virt_to_xen_l2e(fixmap_base);
378 fix_pae_highmem_pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(fixmap_base);
379 return 0;
380 }
381 __initcall(cache_pae_fixmap_address);
383 static DEFINE_PER_CPU(u32, make_cr3_timestamp);
385 void make_cr3(struct vcpu *v, unsigned long mfn)
386 /* Takes the MFN of a PAE l3 table, copies the contents to below 4GB if
387 * necessary, and sets v->arch.cr3 to the value to load in CR3. */
388 {
389 l3_pgentry_t *highmem_l3tab, *lowmem_l3tab;
390 struct pae_l3_cache *cache = &v->arch.pae_l3_cache;
391 unsigned int cpu = smp_processor_id();
393 /* Fast path: does this mfn need a shadow at all? */
394 if ( !l3tab_needs_shadow(mfn) )
395 {
396 v->arch.cr3 = mfn << PAGE_SHIFT;
397 /* Cache is no longer in use or valid */
398 cache->high_mfn = 0;
399 return;
400 }
402 /* Caching logic is not interrupt safe. */
403 ASSERT(!in_irq());
405 /* Protects against pae_flush_pgd(). */
406 spin_lock(&cache->lock);
408 cache->inuse_idx ^= 1;
409 cache->high_mfn = mfn;
411 /* Map the guest L3 table and copy to the chosen low-memory cache. */
412 l1e_write(fix_pae_highmem_pl1e-cpu, l1e_from_pfn(mfn, __PAGE_HYPERVISOR));
413 /* First check the previous high mapping can't be in the TLB.
414 * (i.e. have we loaded CR3 since we last did this?) */
415 if ( unlikely(this_cpu(make_cr3_timestamp) == this_cpu(tlbflush_time)) )
416 flush_tlb_one_local(fix_to_virt(FIX_PAE_HIGHMEM_0 + cpu));
417 highmem_l3tab = (l3_pgentry_t *)fix_to_virt(FIX_PAE_HIGHMEM_0 + cpu);
418 lowmem_l3tab = cache->table[cache->inuse_idx];
419 memcpy(lowmem_l3tab, highmem_l3tab, sizeof(cache->table[0]));
420 l1e_write(fix_pae_highmem_pl1e-cpu, l1e_empty());
421 this_cpu(make_cr3_timestamp) = this_cpu(tlbflush_time);
423 v->arch.cr3 = __pa(lowmem_l3tab);
425 spin_unlock(&cache->lock);
426 }
428 #else /* !defined(__i386__) */
430 void make_cr3(struct vcpu *v, unsigned long mfn)
431 {
432 v->arch.cr3 = mfn << PAGE_SHIFT;
433 }
435 #endif /* !defined(__i386__) */
437 void write_ptbase(struct vcpu *v)
438 {
439 write_cr3(v->arch.cr3);
440 }
442 /*
443 * Should be called after CR3 is updated.
444 *
445 * Uses values found in vcpu->arch.(guest_table and guest_table_user), and
446 * for HVM guests, arch.monitor_table and hvm's guest CR3.
447 *
448 * Update ref counts to shadow tables appropriately.
449 */
450 void update_cr3(struct vcpu *v)
451 {
452 unsigned long cr3_mfn=0;
454 if ( paging_mode_enabled(v->domain) )
455 {
456 paging_update_cr3(v);
457 return;
458 }
460 #if CONFIG_PAGING_LEVELS == 4
461 if ( !(v->arch.flags & TF_kernel_mode) )
462 cr3_mfn = pagetable_get_pfn(v->arch.guest_table_user);
463 else
464 #endif
465 cr3_mfn = pagetable_get_pfn(v->arch.guest_table);
467 make_cr3(v, cr3_mfn);
468 }
471 static void invalidate_shadow_ldt(struct vcpu *v)
472 {
473 int i;
474 unsigned long pfn;
475 struct page_info *page;
477 if ( v->arch.shadow_ldt_mapcnt == 0 )
478 return;
480 v->arch.shadow_ldt_mapcnt = 0;
482 for ( i = 16; i < 32; i++ )
483 {
484 pfn = l1e_get_pfn(v->arch.perdomain_ptes[i]);
485 if ( pfn == 0 ) continue;
486 l1e_write(&v->arch.perdomain_ptes[i], l1e_empty());
487 page = mfn_to_page(pfn);
488 ASSERT_PAGE_IS_TYPE(page, PGT_seg_desc_page);
489 ASSERT_PAGE_IS_DOMAIN(page, v->domain);
490 put_page_and_type(page);
491 }
493 /* Dispose of the (now possibly invalid) mappings from the TLB. */
494 if ( v == current )
495 queue_deferred_ops(v->domain, DOP_FLUSH_TLB | DOP_RELOAD_LDT);
496 else
497 flush_tlb_mask(v->domain->domain_dirty_cpumask);
498 }
501 static int alloc_segdesc_page(struct page_info *page)
502 {
503 struct desc_struct *descs;
504 int i;
506 descs = map_domain_page(page_to_mfn(page));
508 for ( i = 0; i < 512; i++ )
509 if ( unlikely(!check_descriptor(page_get_owner(page), &descs[i])) )
510 goto fail;
512 unmap_domain_page(descs);
513 return 0;
515 fail:
516 unmap_domain_page(descs);
517 return -EINVAL;
518 }
521 /* Map shadow page at offset @off. */
522 int map_ldt_shadow_page(unsigned int off)
523 {
524 struct vcpu *v = current;
525 struct domain *d = v->domain;
526 unsigned long gmfn, mfn;
527 l1_pgentry_t l1e, nl1e;
528 unsigned long gva = v->arch.guest_context.ldt_base + (off << PAGE_SHIFT);
529 int okay;
531 BUG_ON(unlikely(in_irq()));
533 guest_get_eff_kern_l1e(v, gva, &l1e);
534 if ( unlikely(!(l1e_get_flags(l1e) & _PAGE_PRESENT)) )
535 return 0;
537 gmfn = l1e_get_pfn(l1e);
538 mfn = gmfn_to_mfn(d, gmfn);
539 if ( unlikely(!mfn_valid(mfn)) )
540 return 0;
542 okay = get_page_and_type(mfn_to_page(mfn), d, PGT_seg_desc_page);
543 if ( unlikely(!okay) )
544 return 0;
546 nl1e = l1e_from_pfn(mfn, l1e_get_flags(l1e) | _PAGE_RW);
548 l1e_write(&v->arch.perdomain_ptes[off + 16], nl1e);
549 v->arch.shadow_ldt_mapcnt++;
551 return 1;
552 }
555 static int get_page_from_pagenr(unsigned long page_nr, struct domain *d)
556 {
557 struct page_info *page = mfn_to_page(page_nr);
559 if ( unlikely(!mfn_valid(page_nr)) || unlikely(!get_page(page, d)) )
560 {
561 MEM_LOG("Could not get page ref for pfn %lx", page_nr);
562 return 0;
563 }
565 return 1;
566 }
569 static int get_page_and_type_from_pagenr(unsigned long page_nr,
570 unsigned long type,
571 struct domain *d,
572 int partial,
573 int preemptible)
574 {
575 struct page_info *page = mfn_to_page(page_nr);
576 int rc;
578 if ( likely(partial >= 0) &&
579 unlikely(!get_page_from_pagenr(page_nr, d)) )
580 return -EINVAL;
582 rc = (preemptible ?
583 get_page_type_preemptible(page, type) :
584 (get_page_type(page, type) ? 0 : -EINVAL));
586 if ( unlikely(rc) && partial >= 0 )
587 put_page(page);
589 return rc;
590 }
592 static int get_data_page(
593 struct page_info *page, struct domain *d, int writeable)
594 {
595 int rc;
597 if ( writeable )
598 rc = get_page_and_type(page, d, PGT_writable_page);
599 else
600 rc = get_page(page, d);
602 return rc;
603 }
605 static void put_data_page(
606 struct page_info *page, int writeable)
607 {
608 if ( writeable )
609 put_page_and_type(page);
610 else
611 put_page(page);
612 }
614 /*
615 * We allow root tables to map each other (a.k.a. linear page tables). It
616 * needs some special care with reference counts and access permissions:
617 * 1. The mapping entry must be read-only, or the guest may get write access
618 * to its own PTEs.
619 * 2. We must only bump the reference counts for an *already validated*
620 * L2 table, or we can end up in a deadlock in get_page_type() by waiting
621 * on a validation that is required to complete that validation.
622 * 3. We only need to increment the reference counts for the mapped page
623 * frame if it is mapped by a different root table. This is sufficient and
624 * also necessary to allow validation of a root table mapping itself.
625 */
626 #define define_get_linear_pagetable(level) \
627 static int \
628 get_##level##_linear_pagetable( \
629 level##_pgentry_t pde, unsigned long pde_pfn, struct domain *d) \
630 { \
631 unsigned long x, y; \
632 struct page_info *page; \
633 unsigned long pfn; \
634 \
635 if ( (level##e_get_flags(pde) & _PAGE_RW) ) \
636 { \
637 MEM_LOG("Attempt to create linear p.t. with write perms"); \
638 return 0; \
639 } \
640 \
641 if ( (pfn = level##e_get_pfn(pde)) != pde_pfn ) \
642 { \
643 /* Make sure the mapped frame belongs to the correct domain. */ \
644 if ( unlikely(!get_page_from_pagenr(pfn, d)) ) \
645 return 0; \
646 \
647 /* \
648 * Ensure that the mapped frame is an already-validated page table. \
649 * If so, atomically increment the count (checking for overflow). \
650 */ \
651 page = mfn_to_page(pfn); \
652 y = page->u.inuse.type_info; \
653 do { \
654 x = y; \
655 if ( unlikely((x & PGT_count_mask) == PGT_count_mask) || \
656 unlikely((x & (PGT_type_mask|PGT_validated)) != \
657 (PGT_##level##_page_table|PGT_validated)) ) \
658 { \
659 put_page(page); \
660 return 0; \
661 } \
662 } \
663 while ( (y = cmpxchg(&page->u.inuse.type_info, x, x + 1)) != x ); \
664 } \
665 \
666 return 1; \
667 }
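/*
 * Illustrative, non-authoritative sketch (kept out of the build with #if 0):
 * the kind of linear ("self-referencing") mapping the macro above validates.
 * A 64-bit PV guest points one slot of its top-level table back at that same
 * table, read-only per rule 1 above, gaining a virtual window onto all of its
 * own page-table entries. The hypercall wrapper and 'l4_mfn'/'slot' are
 * hypothetical guest-side names.
 */
#if 0
static int example_install_linear_slot(xen_pfn_t l4_mfn, unsigned int slot)
{
    struct mmu_update req;

    /* ptr: machine address of the L4 slot to rewrite. */
    req.ptr = ((uint64_t)l4_mfn << PAGE_SHIFT) + slot * sizeof(l4_pgentry_t);
    /* val: refer back to the same L4; _PAGE_RW must NOT be set. */
    req.val = ((uint64_t)l4_mfn << PAGE_SHIFT) | _PAGE_PRESENT | _PAGE_USER;
    return HYPERVISOR_mmu_update(&req, 1, NULL, DOMID_SELF);
}
#endif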
670 int is_iomem_page(unsigned long mfn)
671 {
672 return (!mfn_valid(mfn) || (page_get_owner(mfn_to_page(mfn)) == dom_io));
673 }
676 int
677 get_page_from_l1e(
678 l1_pgentry_t l1e, struct domain *d)
679 {
680 unsigned long mfn = l1e_get_pfn(l1e);
681 struct page_info *page = mfn_to_page(mfn);
682 uint32_t l1f = l1e_get_flags(l1e);
683 struct vcpu *curr = current;
684 struct domain *owner;
685 int okay;
687 if ( !(l1f & _PAGE_PRESENT) )
688 return 1;
690 if ( unlikely(l1f & l1_disallow_mask(d)) )
691 {
692 MEM_LOG("Bad L1 flags %x", l1f & l1_disallow_mask(d));
693 return 0;
694 }
696 if ( is_iomem_page(mfn) )
697 {
698 /* DOMID_IO reverts to caller for privilege checks. */
699 if ( d == dom_io )
700 d = curr->domain;
702 if ( !iomem_access_permitted(d, mfn, mfn) )
703 {
704 if ( mfn != (PADDR_MASK >> PAGE_SHIFT) ) /* INVALID_MFN? */
705 MEM_LOG("Non-privileged (%u) attempt to map I/O space %08lx",
706 d->domain_id, mfn);
707 return 0;
708 }
710 return 1;
711 }
713 /*
714 * Let privileged domains transfer the right to map their target
715 * domain's pages. This is used to allow stub-domain pvfb export to dom0,
716 * until pvfb supports granted mappings. At that time this minor hack
717 * can go away.
718 */
719 owner = page_get_owner(page);
720 if ( unlikely(d != owner) && (owner != NULL) &&
721 (d != curr->domain) && IS_PRIV_FOR(d, owner) )
722 d = owner;
724 /* Foreign mappings into guests in shadow external mode don't
725 * contribute to writeable mapping refcounts. (This allows the
726 * qemu-dm helper process in dom0 to map the domain's memory without
727 * messing up the count of "real" writable mappings.) */
728 okay = get_data_page(
729 page, d,
730 (l1f & _PAGE_RW) && !(paging_mode_external(d) && (d != curr->domain)));
731 if ( !okay )
732 {
733 MEM_LOG("Error getting mfn %lx (pfn %lx) from L1 entry %" PRIpte
734 " for dom%d",
735 mfn, get_gpfn_from_mfn(mfn),
736 l1e_get_intpte(l1e), d->domain_id);
737 }
738 else if ( pte_flags_to_cacheattr(l1f) !=
739 ((page->count_info >> PGC_cacheattr_base) & 7) )
740 {
741 uint32_t x, nx, y = page->count_info;
742 uint32_t cacheattr = pte_flags_to_cacheattr(l1f);
744 if ( is_xen_heap_page(page) )
745 {
746 if ( (l1f & _PAGE_RW) &&
747 !(unlikely(paging_mode_external(d) &&
748 (d != curr->domain))) )
749 put_page_type(page);
750 put_page(page);
751 MEM_LOG("Attempt to change cache attributes of Xen heap page");
752 return 0;
753 }
755 while ( ((y >> PGC_cacheattr_base) & 7) != cacheattr )
756 {
757 x = y;
758 nx = (x & ~PGC_cacheattr_mask) | (cacheattr << PGC_cacheattr_base);
759 y = cmpxchg(&page->count_info, x, nx);
760 }
762 #ifdef __x86_64__
763 map_pages_to_xen((unsigned long)mfn_to_virt(mfn), mfn, 1,
764 PAGE_HYPERVISOR | cacheattr_to_pte_flags(cacheattr));
765 #endif
766 }
768 return okay;
769 }
772 /* NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. */
773 define_get_linear_pagetable(l2);
774 static int
775 get_page_from_l2e(
776 l2_pgentry_t l2e, unsigned long pfn, struct domain *d)
777 {
778 unsigned long mfn = l2e_get_pfn(l2e);
779 int rc;
781 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
782 return 1;
784 if ( unlikely((l2e_get_flags(l2e) & L2_DISALLOW_MASK)) )
785 {
786 MEM_LOG("Bad L2 flags %x", l2e_get_flags(l2e) & L2_DISALLOW_MASK);
787 return -EINVAL;
788 }
790 if ( !(l2e_get_flags(l2e) & _PAGE_PSE) )
791 {
792 rc = get_page_and_type_from_pagenr(mfn, PGT_l1_page_table, d, 0, 0);
793 if ( unlikely(rc == -EINVAL) && get_l2_linear_pagetable(l2e, pfn, d) )
794 rc = 0;
795 }
796 else if ( !opt_allow_hugepage || (mfn & (L1_PAGETABLE_ENTRIES-1)) )
797 {
798 rc = -EINVAL;
799 }
800 else
801 {
802 unsigned long m = mfn;
803 int writeable = !!(l2e_get_flags(l2e) & _PAGE_RW);
805 do {
806 rc = get_data_page(mfn_to_page(m), d, writeable);
807 if ( unlikely(!rc) )
808 {
809 while ( m-- > mfn )
810 put_data_page(mfn_to_page(m), writeable);
811 return -EINVAL;
812 }
813 } while ( m++ < (mfn + (L1_PAGETABLE_ENTRIES-1)) );
815 #ifdef __x86_64__
816 map_pages_to_xen(
817 (unsigned long)mfn_to_virt(mfn), mfn, L1_PAGETABLE_ENTRIES,
818 PAGE_HYPERVISOR | l2e_get_flags(l2e));
819 #endif
820 }
822 return rc;
823 }
826 define_get_linear_pagetable(l3);
827 static int
828 get_page_from_l3e(
829 l3_pgentry_t l3e, unsigned long pfn, struct domain *d, int partial, int preemptible)
830 {
831 int rc;
833 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
834 return 1;
836 if ( unlikely((l3e_get_flags(l3e) & l3_disallow_mask(d))) )
837 {
838 MEM_LOG("Bad L3 flags %x", l3e_get_flags(l3e) & l3_disallow_mask(d));
839 return -EINVAL;
840 }
842 rc = get_page_and_type_from_pagenr(
843 l3e_get_pfn(l3e), PGT_l2_page_table, d, partial, preemptible);
844 if ( unlikely(rc == -EINVAL) && get_l3_linear_pagetable(l3e, pfn, d) )
845 rc = 0;
847 return rc;
848 }
850 #if CONFIG_PAGING_LEVELS >= 4
851 define_get_linear_pagetable(l4);
852 static int
853 get_page_from_l4e(
854 l4_pgentry_t l4e, unsigned long pfn, struct domain *d, int partial, int preemptible)
855 {
856 int rc;
858 if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
859 return 1;
861 if ( unlikely((l4e_get_flags(l4e) & L4_DISALLOW_MASK)) )
862 {
863 MEM_LOG("Bad L4 flags %x", l4e_get_flags(l4e) & L4_DISALLOW_MASK);
864 return -EINVAL;
865 }
867 rc = get_page_and_type_from_pagenr(
868 l4e_get_pfn(l4e), PGT_l3_page_table, d, partial, preemptible);
869 if ( unlikely(rc == -EINVAL) && get_l4_linear_pagetable(l4e, pfn, d) )
870 rc = 0;
872 return rc;
873 }
874 #endif /* 4 level */
876 #ifdef __x86_64__
878 #ifdef USER_MAPPINGS_ARE_GLOBAL
879 #define adjust_guest_l1e(pl1e, d) \
880 do { \
881 if ( likely(l1e_get_flags((pl1e)) & _PAGE_PRESENT) && \
882 likely(!is_pv_32on64_domain(d)) ) \
883 { \
884 /* _PAGE_GUEST_KERNEL page cannot have the Global bit set. */ \
885 if ( (l1e_get_flags((pl1e)) & (_PAGE_GUEST_KERNEL|_PAGE_GLOBAL)) \
886 == (_PAGE_GUEST_KERNEL|_PAGE_GLOBAL) ) \
887 MEM_LOG("Global bit is set to kernel page %lx", \
888 l1e_get_pfn((pl1e))); \
889 if ( !(l1e_get_flags((pl1e)) & _PAGE_USER) ) \
890 l1e_add_flags((pl1e), (_PAGE_GUEST_KERNEL|_PAGE_USER)); \
891 if ( !(l1e_get_flags((pl1e)) & _PAGE_GUEST_KERNEL) ) \
892 l1e_add_flags((pl1e), (_PAGE_GLOBAL|_PAGE_USER)); \
893 } \
894 } while ( 0 )
895 #else
896 #define adjust_guest_l1e(pl1e, d) \
897 do { \
898 if ( likely(l1e_get_flags((pl1e)) & _PAGE_PRESENT) && \
899 likely(!is_pv_32on64_domain(d)) ) \
900 l1e_add_flags((pl1e), _PAGE_USER); \
901 } while ( 0 )
902 #endif
904 #define adjust_guest_l2e(pl2e, d) \
905 do { \
906 if ( likely(l2e_get_flags((pl2e)) & _PAGE_PRESENT) && \
907 likely(!is_pv_32on64_domain(d)) ) \
908 l2e_add_flags((pl2e), _PAGE_USER); \
909 } while ( 0 )
911 #define adjust_guest_l3e(pl3e, d) \
912 do { \
913 if ( likely(l3e_get_flags((pl3e)) & _PAGE_PRESENT) ) \
914 l3e_add_flags((pl3e), likely(!is_pv_32on64_domain(d)) ? \
915 _PAGE_USER : \
916 _PAGE_USER|_PAGE_RW); \
917 } while ( 0 )
919 #define adjust_guest_l4e(pl4e, d) \
920 do { \
921 if ( likely(l4e_get_flags((pl4e)) & _PAGE_PRESENT) && \
922 likely(!is_pv_32on64_domain(d)) ) \
923 l4e_add_flags((pl4e), _PAGE_USER); \
924 } while ( 0 )
926 #else /* !defined(__x86_64__) */
928 #define adjust_guest_l1e(_p, _d) ((void)(_d))
929 #define adjust_guest_l2e(_p, _d) ((void)(_d))
930 #define adjust_guest_l3e(_p, _d) ((void)(_d))
932 #endif
934 #ifdef CONFIG_COMPAT
935 #define unadjust_guest_l3e(pl3e, d) \
936 do { \
937 if ( unlikely(is_pv_32on64_domain(d)) && \
938 likely(l3e_get_flags((pl3e)) & _PAGE_PRESENT) ) \
939 l3e_remove_flags((pl3e), _PAGE_USER|_PAGE_RW|_PAGE_ACCESSED); \
940 } while ( 0 )
941 #else
942 #define unadjust_guest_l3e(_p, _d) ((void)(_d))
943 #endif
945 void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d)
946 {
947 unsigned long pfn = l1e_get_pfn(l1e);
948 struct page_info *page;
949 struct domain *e;
950 struct vcpu *v;
952 if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) || is_iomem_page(pfn) )
953 return;
955 page = mfn_to_page(pfn);
957 e = page_get_owner(page);
959 /*
960 * Check if this is a mapping that was established via a grant reference.
961 * If it was then we should not be here: we require that such mappings are
962 * explicitly destroyed via the grant-table interface.
963 *
964 * The upshot of this is that the guest can end up with active grants that
965 * it cannot destroy (because it no longer has a PTE to present to the
966 * grant-table interface). This can lead to subtle hard-to-catch bugs,
967 * hence a special grant PTE flag can be enabled to catch the bug early.
968 *
969 * (Note that the undestroyable active grants are not a security hole in
970 * Xen. All active grants can safely be cleaned up when the domain dies.)
971 */
972 if ( (l1e_get_flags(l1e) & _PAGE_GNTTAB) &&
973 !d->is_shutting_down && !d->is_dying )
974 {
975 MEM_LOG("Attempt to implicitly unmap a granted PTE %" PRIpte,
976 l1e_get_intpte(l1e));
977 domain_crash(d);
978 }
980 /* Remember we didn't take a type-count of foreign writable mappings
981 * to paging-external domains */
982 if ( (l1e_get_flags(l1e) & _PAGE_RW) &&
983 !(unlikely((e != d) && paging_mode_external(e))) )
984 {
985 put_page_and_type(page);
986 }
987 else
988 {
989 /* We expect this is rare so we blow the entire shadow LDT. */
990 if ( unlikely(((page->u.inuse.type_info & PGT_type_mask) ==
991 PGT_seg_desc_page)) &&
992 unlikely(((page->u.inuse.type_info & PGT_count_mask) != 0)) &&
993 (d == e) )
994 {
995 for_each_vcpu ( d, v )
996 invalidate_shadow_ldt(v);
997 }
998 put_page(page);
999 }
1003 /*
1004 * NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'.
1005 * Note also that this automatically deals correctly with linear p.t.'s.
1006 */
1007 static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
1009 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) || (l2e_get_pfn(l2e) == pfn) )
1010 return 1;
1012 if ( l2e_get_flags(l2e) & _PAGE_PSE )
1014 unsigned long mfn = l2e_get_pfn(l2e), m = mfn;
1015 int writeable = l2e_get_flags(l2e) & _PAGE_RW;
1016 ASSERT(opt_allow_hugepage && !(mfn & (L1_PAGETABLE_ENTRIES-1)));
1017 do {
1018 put_data_page(mfn_to_page(m), writeable);
1019 } while ( m++ < (mfn + (L1_PAGETABLE_ENTRIES-1)) );
1021 else
1023 put_page_and_type(l2e_get_page(l2e));
1026 return 0;
1029 static int __put_page_type(struct page_info *, int preemptible);
1031 static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn,
1032 int partial, int preemptible)
1034 if ( (l3e_get_flags(l3e) & _PAGE_PRESENT) &&
1035 (l3e_get_pfn(l3e) != pfn) )
1037 if ( unlikely(partial > 0) )
1038 return __put_page_type(l3e_get_page(l3e), preemptible);
1039 return put_page_and_type_preemptible(l3e_get_page(l3e), preemptible);
1041 return 1;
1044 #if CONFIG_PAGING_LEVELS >= 4
1045 static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn,
1046 int partial, int preemptible)
1048 if ( (l4e_get_flags(l4e) & _PAGE_PRESENT) &&
1049 (l4e_get_pfn(l4e) != pfn) )
1051 if ( unlikely(partial > 0) )
1052 return __put_page_type(l4e_get_page(l4e), preemptible);
1053 return put_page_and_type_preemptible(l4e_get_page(l4e), preemptible);
1055 return 1;
1057 #endif
1059 static int alloc_l1_table(struct page_info *page)
1061 struct domain *d = page_get_owner(page);
1062 unsigned long pfn = page_to_mfn(page);
1063 l1_pgentry_t *pl1e;
1064 unsigned int i;
1066 pl1e = map_domain_page(pfn);
1068 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
1070 if ( is_guest_l1_slot(i) &&
1071 unlikely(!get_page_from_l1e(pl1e[i], d)) )
1072 goto fail;
1074 adjust_guest_l1e(pl1e[i], d);
1077 unmap_domain_page(pl1e);
1078 return 0;
1080 fail:
1081 MEM_LOG("Failure in alloc_l1_table: entry %d", i);
1082 while ( i-- > 0 )
1083 if ( is_guest_l1_slot(i) )
1084 put_page_from_l1e(pl1e[i], d);
1086 unmap_domain_page(pl1e);
1087 return -EINVAL;
1090 static int create_pae_xen_mappings(struct domain *d, l3_pgentry_t *pl3e)
1092 struct page_info *page;
1093 l2_pgentry_t *pl2e;
1094 l3_pgentry_t l3e3;
1095 #ifndef CONFIG_COMPAT
1096 l2_pgentry_t l2e;
1097 int i;
1098 #endif
1100 if ( !is_pv_32bit_domain(d) )
1101 return 1;
1103 pl3e = (l3_pgentry_t *)((unsigned long)pl3e & PAGE_MASK);
1105 /* 3rd L3 slot contains L2 with Xen-private mappings. It *must* exist. */
1106 l3e3 = pl3e[3];
1107 if ( !(l3e_get_flags(l3e3) & _PAGE_PRESENT) )
1109 MEM_LOG("PAE L3 3rd slot is empty");
1110 return 0;
1113 /*
1114 * The Xen-private mappings include linear mappings. The L2 thus cannot
1115 * be shared by multiple L3 tables. The test here is adequate because:
1116 * 1. Cannot appear in slots != 3 because get_page_type() checks the
1117 * PGT_pae_xen_l2 flag, which is asserted iff the L2 appears in slot 3
1118 * 2. Cannot appear in another page table's L3:
1119 * a. alloc_l3_table() calls this function and this check will fail
1120 * b. mod_l3_entry() disallows updates to slot 3 in an existing table
1121 */
1122 page = l3e_get_page(l3e3);
1123 BUG_ON(page->u.inuse.type_info & PGT_pinned);
1124 BUG_ON((page->u.inuse.type_info & PGT_count_mask) == 0);
1125 BUG_ON(!(page->u.inuse.type_info & PGT_pae_xen_l2));
1126 if ( (page->u.inuse.type_info & PGT_count_mask) != 1 )
1128 MEM_LOG("PAE L3 3rd slot is shared");
1129 return 0;
1132 /* Xen private mappings. */
1133 pl2e = map_domain_page(l3e_get_pfn(l3e3));
1134 #ifndef CONFIG_COMPAT
1135 memcpy(&pl2e[L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1)],
1136 &idle_pg_table_l2[L2_PAGETABLE_FIRST_XEN_SLOT],
1137 L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
1138 for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
1140 l2e = l2e_from_page(
1141 virt_to_page(page_get_owner(page)->arch.mm_perdomain_pt) + i,
1142 __PAGE_HYPERVISOR);
1143 l2e_write(&pl2e[l2_table_offset(PERDOMAIN_VIRT_START) + i], l2e);
1145 for ( i = 0; i < (LINEARPT_MBYTES >> (L2_PAGETABLE_SHIFT - 20)); i++ )
1147 l2e = l2e_empty();
1148 if ( l3e_get_flags(pl3e[i]) & _PAGE_PRESENT )
1149 l2e = l2e_from_pfn(l3e_get_pfn(pl3e[i]), __PAGE_HYPERVISOR);
1150 l2e_write(&pl2e[l2_table_offset(LINEAR_PT_VIRT_START) + i], l2e);
1152 #else
1153 memcpy(&pl2e[COMPAT_L2_PAGETABLE_FIRST_XEN_SLOT(d)],
1154 &compat_idle_pg_table_l2[
1155 l2_table_offset(HIRO_COMPAT_MPT_VIRT_START)],
1156 COMPAT_L2_PAGETABLE_XEN_SLOTS(d) * sizeof(*pl2e));
1157 #endif
1158 unmap_domain_page(pl2e);
1160 return 1;
1163 #ifdef __i386__
1164 /* Flush a pgdir update into low-memory caches. */
1165 static void pae_flush_pgd(
1166 unsigned long mfn, unsigned int idx, l3_pgentry_t nl3e)
1168 struct domain *d = page_get_owner(mfn_to_page(mfn));
1169 struct vcpu *v;
1170 intpte_t _ol3e, _nl3e, _pl3e;
1171 l3_pgentry_t *l3tab_ptr;
1172 struct pae_l3_cache *cache;
1174 if ( unlikely(shadow_mode_enabled(d)) )
1176 cpumask_t m = CPU_MASK_NONE;
1177 /* Re-shadow this l3 table on any vcpus that are using it */
1178 for_each_vcpu ( d, v )
1179 if ( pagetable_get_pfn(v->arch.guest_table) == mfn )
1181 paging_update_cr3(v);
1182 cpus_or(m, m, v->vcpu_dirty_cpumask);
1184 flush_tlb_mask(m);
1187 /* If below 4GB then the pgdir is not shadowed in low memory. */
1188 if ( !l3tab_needs_shadow(mfn) )
1189 return;
1191 for_each_vcpu ( d, v )
1193 cache = &v->arch.pae_l3_cache;
1195 spin_lock(&cache->lock);
1197 if ( cache->high_mfn == mfn )
1199 l3tab_ptr = &cache->table[cache->inuse_idx][idx];
1200 _ol3e = l3e_get_intpte(*l3tab_ptr);
1201 _nl3e = l3e_get_intpte(nl3e);
1202 _pl3e = cmpxchg(&l3e_get_intpte(*l3tab_ptr), _ol3e, _nl3e);
1203 BUG_ON(_pl3e != _ol3e);
1206 spin_unlock(&cache->lock);
1209 flush_tlb_mask(d->domain_dirty_cpumask);
1211 #else
1212 # define pae_flush_pgd(mfn, idx, nl3e) ((void)0)
1213 #endif
1215 static int alloc_l2_table(struct page_info *page, unsigned long type,
1216 int preemptible)
1218 struct domain *d = page_get_owner(page);
1219 unsigned long pfn = page_to_mfn(page);
1220 l2_pgentry_t *pl2e;
1221 unsigned int i;
1222 int rc = 0;
1224 pl2e = map_domain_page(pfn);
1226 for ( i = page->nr_validated_ptes; i < L2_PAGETABLE_ENTRIES; i++ )
1228 if ( preemptible && i && hypercall_preempt_check() )
1230 page->nr_validated_ptes = i;
1231 rc = -EAGAIN;
1232 break;
1235 if ( !is_guest_l2_slot(d, type, i) ||
1236 (rc = get_page_from_l2e(pl2e[i], pfn, d)) > 0 )
1237 continue;
1239 if ( rc < 0 )
1241 MEM_LOG("Failure in alloc_l2_table: entry %d", i);
1242 while ( i-- > 0 )
1243 if ( is_guest_l2_slot(d, type, i) )
1244 put_page_from_l2e(pl2e[i], pfn);
1245 break;
1248 adjust_guest_l2e(pl2e[i], d);
1251 unmap_domain_page(pl2e);
1252 return rc > 0 ? 0 : rc;
1255 static int alloc_l3_table(struct page_info *page, int preemptible)
1257 struct domain *d = page_get_owner(page);
1258 unsigned long pfn = page_to_mfn(page);
1259 l3_pgentry_t *pl3e;
1260 unsigned int i;
1261 int rc = 0, partial = page->partial_pte;
1263 #if CONFIG_PAGING_LEVELS == 3
1264 /*
1265 * PAE pgdirs above 4GB are unacceptable if the guest does not understand
1266 * the weird 'extended cr3' format for dealing with high-order address
1267 * bits. We cut some slack for control tools (before vcpu0 is initialised).
1268 */
1269 if ( (pfn >= 0x100000) &&
1270 unlikely(!VM_ASSIST(d, VMASST_TYPE_pae_extended_cr3)) &&
1271 d->vcpu[0] && d->vcpu[0]->is_initialised )
1273 MEM_LOG("PAE pgd must be below 4GB (0x%lx >= 0x100000)", pfn);
1274 return -EINVAL;
1276 #endif
1278 pl3e = map_domain_page(pfn);
1280 /*
1281 * PAE guests allocate full pages, but aren't required to initialize
1282 * more than the first four entries; when running in compatibility
1283 * mode, however, the full page is visible to the MMU, and hence all
1284 * 512 entries must be valid/verified, which is most easily achieved
1285 * by clearing them out.
1286 */
1287 if ( is_pv_32on64_domain(d) )
1288 memset(pl3e + 4, 0, (L3_PAGETABLE_ENTRIES - 4) * sizeof(*pl3e));
1290 for ( i = page->nr_validated_ptes; i < L3_PAGETABLE_ENTRIES;
1291 i++, partial = 0 )
1293 if ( is_pv_32bit_domain(d) && (i == 3) )
1295 if ( !(l3e_get_flags(pl3e[i]) & _PAGE_PRESENT) ||
1296 (l3e_get_flags(pl3e[i]) & l3_disallow_mask(d)) )
1297 rc = -EINVAL;
1298 else
1299 rc = get_page_and_type_from_pagenr(l3e_get_pfn(pl3e[i]),
1300 PGT_l2_page_table |
1301 PGT_pae_xen_l2,
1302 d, partial, preemptible);
1304 else if ( !is_guest_l3_slot(i) ||
1305 (rc = get_page_from_l3e(pl3e[i], pfn, d,
1306 partial, preemptible)) > 0 )
1307 continue;
1309 if ( rc == -EAGAIN )
1311 page->nr_validated_ptes = i;
1312 page->partial_pte = partial ?: 1;
1314 else if ( rc == -EINTR && i )
1316 page->nr_validated_ptes = i;
1317 page->partial_pte = 0;
1318 rc = -EAGAIN;
1320 if ( rc < 0 )
1321 break;
1323 adjust_guest_l3e(pl3e[i], d);
1326 if ( rc >= 0 && !create_pae_xen_mappings(d, pl3e) )
1327 rc = -EINVAL;
1328 if ( rc < 0 && rc != -EAGAIN && rc != -EINTR )
1330 MEM_LOG("Failure in alloc_l3_table: entry %d", i);
1331 while ( i-- > 0 )
1333 if ( !is_guest_l3_slot(i) )
1334 continue;
1335 unadjust_guest_l3e(pl3e[i], d);
1336 put_page_from_l3e(pl3e[i], pfn, 0, 0);
1340 unmap_domain_page(pl3e);
1341 return rc > 0 ? 0 : rc;
1344 #if CONFIG_PAGING_LEVELS >= 4
1345 static int alloc_l4_table(struct page_info *page, int preemptible)
1347 struct domain *d = page_get_owner(page);
1348 unsigned long pfn = page_to_mfn(page);
1349 l4_pgentry_t *pl4e = page_to_virt(page);
1350 unsigned int i;
1351 int rc = 0, partial = page->partial_pte;
1353 for ( i = page->nr_validated_ptes; i < L4_PAGETABLE_ENTRIES;
1354 i++, partial = 0 )
1356 if ( !is_guest_l4_slot(d, i) ||
1357 (rc = get_page_from_l4e(pl4e[i], pfn, d,
1358 partial, preemptible)) > 0 )
1359 continue;
1361 if ( rc == -EAGAIN )
1363 page->nr_validated_ptes = i;
1364 page->partial_pte = partial ?: 1;
1366 else if ( rc == -EINTR )
1368 if ( i )
1370 page->nr_validated_ptes = i;
1371 page->partial_pte = 0;
1372 rc = -EAGAIN;
1375 else if ( rc < 0 )
1377 MEM_LOG("Failure in alloc_l4_table: entry %d", i);
1378 while ( i-- > 0 )
1379 if ( is_guest_l4_slot(d, i) )
1380 put_page_from_l4e(pl4e[i], pfn, 0, 0);
1382 if ( rc < 0 )
1383 return rc;
1385 adjust_guest_l4e(pl4e[i], d);
1388 /* Xen private mappings. */
1389 memcpy(&pl4e[ROOT_PAGETABLE_FIRST_XEN_SLOT],
1390 &idle_pg_table[ROOT_PAGETABLE_FIRST_XEN_SLOT],
1391 ROOT_PAGETABLE_XEN_SLOTS * sizeof(l4_pgentry_t));
1392 pl4e[l4_table_offset(LINEAR_PT_VIRT_START)] =
1393 l4e_from_pfn(pfn, __PAGE_HYPERVISOR);
1394 pl4e[l4_table_offset(PERDOMAIN_VIRT_START)] =
1395 l4e_from_page(virt_to_page(d->arch.mm_perdomain_l3),
1396 __PAGE_HYPERVISOR);
1398 return rc > 0 ? 0 : rc;
1400 #else
1401 #define alloc_l4_table(page, preemptible) (-EINVAL)
1402 #endif
1405 static void free_l1_table(struct page_info *page)
1407 struct domain *d = page_get_owner(page);
1408 unsigned long pfn = page_to_mfn(page);
1409 l1_pgentry_t *pl1e;
1410 unsigned int i;
1412 pl1e = map_domain_page(pfn);
1414 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
1415 if ( is_guest_l1_slot(i) )
1416 put_page_from_l1e(pl1e[i], d);
1418 unmap_domain_page(pl1e);
1422 static int free_l2_table(struct page_info *page, int preemptible)
1424 #ifdef CONFIG_COMPAT
1425 struct domain *d = page_get_owner(page);
1426 #endif
1427 unsigned long pfn = page_to_mfn(page);
1428 l2_pgentry_t *pl2e;
1429 unsigned int i = page->nr_validated_ptes - 1;
1430 int err = 0;
1432 pl2e = map_domain_page(pfn);
1434 ASSERT(page->nr_validated_ptes);
1435 do {
1436 if ( is_guest_l2_slot(d, page->u.inuse.type_info, i) &&
1437 put_page_from_l2e(pl2e[i], pfn) == 0 &&
1438 preemptible && i && hypercall_preempt_check() )
1440 page->nr_validated_ptes = i;
1441 err = -EAGAIN;
1443 } while ( !err && i-- );
1445 unmap_domain_page(pl2e);
1447 if ( !err )
1448 page->u.inuse.type_info &= ~PGT_pae_xen_l2;
1450 return err;
1453 static int free_l3_table(struct page_info *page, int preemptible)
1455 struct domain *d = page_get_owner(page);
1456 unsigned long pfn = page_to_mfn(page);
1457 l3_pgentry_t *pl3e;
1458 int rc = 0, partial = page->partial_pte;
1459 unsigned int i = page->nr_validated_ptes - !partial;
1461 pl3e = map_domain_page(pfn);
1463 do {
1464 if ( is_guest_l3_slot(i) )
1466 rc = put_page_from_l3e(pl3e[i], pfn, partial, preemptible);
1467 if ( rc < 0 )
1468 break;
1469 partial = 0;
1470 if ( rc > 0 )
1471 continue;
1472 unadjust_guest_l3e(pl3e[i], d);
1474 } while ( i-- );
1476 unmap_domain_page(pl3e);
1478 if ( rc == -EAGAIN )
1480 page->nr_validated_ptes = i;
1481 page->partial_pte = partial ?: -1;
1483 else if ( rc == -EINTR && i < L3_PAGETABLE_ENTRIES - 1 )
1485 page->nr_validated_ptes = i + 1;
1486 page->partial_pte = 0;
1487 rc = -EAGAIN;
1489 return rc > 0 ? 0 : rc;
1492 #if CONFIG_PAGING_LEVELS >= 4
1493 static int free_l4_table(struct page_info *page, int preemptible)
1495 struct domain *d = page_get_owner(page);
1496 unsigned long pfn = page_to_mfn(page);
1497 l4_pgentry_t *pl4e = page_to_virt(page);
1498 int rc = 0, partial = page->partial_pte;
1499 unsigned int i = page->nr_validated_ptes - !partial;
1501 do {
1502 if ( is_guest_l4_slot(d, i) )
1503 rc = put_page_from_l4e(pl4e[i], pfn, partial, preemptible);
1504 if ( rc < 0 )
1505 break;
1506 partial = 0;
1507 } while ( i-- );
1509 if ( rc == -EAGAIN )
1511 page->nr_validated_ptes = i;
1512 page->partial_pte = partial ?: -1;
1514 else if ( rc == -EINTR && i < L4_PAGETABLE_ENTRIES - 1 )
1516 page->nr_validated_ptes = i + 1;
1517 page->partial_pte = 0;
1518 rc = -EAGAIN;
1520 return rc > 0 ? 0 : rc;
1522 #else
1523 #define free_l4_table(page, preemptible) (-EINVAL)
1524 #endif
1526 static void page_lock(struct page_info *page)
1528 #if defined(__i386__)
1529 while ( unlikely(test_and_set_bit(_PGC_locked, &page->count_info)) )
1530 while ( test_bit(_PGC_locked, &page->count_info) )
1531 cpu_relax();
1532 #else
1533 spin_lock(&page->lock);
1534 #endif
1537 static void page_unlock(struct page_info *page)
1539 #if defined(__i386__)
1540 clear_bit(_PGC_locked, &page->count_info);
1541 #else
1542 spin_unlock(&page->lock);
1543 #endif
1546 /* How to write an entry to the guest pagetables.
1547 * Returns 0 for failure (pointer not valid), 1 for success. */
1548 static inline int update_intpte(intpte_t *p,
1549 intpte_t old,
1550 intpte_t new,
1551 unsigned long mfn,
1552 struct vcpu *v,
1553 int preserve_ad)
1555 int rv = 1;
1556 #ifndef PTE_UPDATE_WITH_CMPXCHG
1557 if ( !preserve_ad )
1559 rv = paging_write_guest_entry(v, p, new, _mfn(mfn));
1561 else
1562 #endif
1564 intpte_t t = old;
1565 for ( ; ; )
1567 intpte_t _new = new;
1568 if ( preserve_ad )
1569 _new |= old & (_PAGE_ACCESSED | _PAGE_DIRTY);
1571 rv = paging_cmpxchg_guest_entry(v, p, &t, _new, _mfn(mfn));
1572 if ( unlikely(rv == 0) )
1574 MEM_LOG("Failed to update %" PRIpte " -> %" PRIpte
1575 ": saw %" PRIpte, old, _new, t);
1576 break;
1579 if ( t == old )
1580 break;
1582 /* Allowed to change in Accessed/Dirty flags only. */
1583 BUG_ON((t ^ old) & ~(intpte_t)(_PAGE_ACCESSED|_PAGE_DIRTY));
1585 old = t;
1588 return rv;
1591 /* Macro that wraps the appropriate type-changes around update_intpte().
1592 * Arguments are: type, ptr, old, new, mfn, vcpu */
1593 #define UPDATE_ENTRY(_t,_p,_o,_n,_m,_v,_ad) \
1594 update_intpte(&_t ## e_get_intpte(*(_p)), \
1595 _t ## e_get_intpte(_o), _t ## e_get_intpte(_n), \
1596 (_m), (_v), (_ad))
1598 /* Update the L1 entry at pl1e to new value nl1e. */
1599 static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e,
1600 unsigned long gl1mfn, int preserve_ad)
1602 l1_pgentry_t ol1e;
1603 struct vcpu *curr = current;
1604 struct domain *d = curr->domain;
1605 unsigned long mfn;
1606 struct page_info *l1pg = mfn_to_page(gl1mfn);
1607 int rc = 1;
1609 page_lock(l1pg);
1611 if ( unlikely(__copy_from_user(&ol1e, pl1e, sizeof(ol1e)) != 0) )
1612 return page_unlock(l1pg), 0;
1614 if ( unlikely(paging_mode_refcounts(d)) )
1616 rc = UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, curr, preserve_ad);
1617 page_unlock(l1pg);
1618 return rc;
1621 if ( l1e_get_flags(nl1e) & _PAGE_PRESENT )
1623 /* Translate foreign guest addresses. */
1624 mfn = gmfn_to_mfn(FOREIGNDOM, l1e_get_pfn(nl1e));
1625 if ( unlikely(mfn == INVALID_MFN) )
1626 return page_unlock(l1pg), 0;
1627 ASSERT((mfn & ~(PADDR_MASK >> PAGE_SHIFT)) == 0);
1628 nl1e = l1e_from_pfn(mfn, l1e_get_flags(nl1e));
1630 if ( unlikely(l1e_get_flags(nl1e) & l1_disallow_mask(d)) )
1632 page_unlock(l1pg);
1633 MEM_LOG("Bad L1 flags %x",
1634 l1e_get_flags(nl1e) & l1_disallow_mask(d));
1635 return 0;
1638 /* Fast path for identical mapping, r/w and presence. */
1639 if ( !l1e_has_changed(ol1e, nl1e, _PAGE_RW | _PAGE_PRESENT) )
1641 adjust_guest_l1e(nl1e, d);
1642 rc = UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, curr,
1643 preserve_ad);
1644 page_unlock(l1pg);
1645 return rc;
1648 if ( unlikely(!get_page_from_l1e(nl1e, FOREIGNDOM)) )
1649 return page_unlock(l1pg), 0;
1651 adjust_guest_l1e(nl1e, d);
1652 if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, curr,
1653 preserve_ad)) )
1655 ol1e = nl1e;
1656 rc = 0;
1659 else if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, curr,
1660 preserve_ad)) )
1662 page_unlock(l1pg);
1663 return 0;
1666 page_unlock(l1pg);
1667 put_page_from_l1e(ol1e, d);
1668 return rc;
1672 /* Update the L2 entry at pl2e to new value nl2e. pl2e is within frame pfn. */
1673 static int mod_l2_entry(l2_pgentry_t *pl2e,
1674 l2_pgentry_t nl2e,
1675 unsigned long pfn,
1676 unsigned long type,
1677 int preserve_ad)
1679 l2_pgentry_t ol2e;
1680 struct vcpu *curr = current;
1681 struct domain *d = curr->domain;
1682 struct page_info *l2pg = mfn_to_page(pfn);
1683 int rc = 1;
1685 if ( unlikely(!is_guest_l2_slot(d, type, pgentry_ptr_to_slot(pl2e))) )
1687 MEM_LOG("Illegal L2 update attempt in Xen-private area %p", pl2e);
1688 return 0;
1691 page_lock(l2pg);
1693 if ( unlikely(__copy_from_user(&ol2e, pl2e, sizeof(ol2e)) != 0) )
1694 return page_unlock(l2pg), 0;
1696 if ( l2e_get_flags(nl2e) & _PAGE_PRESENT )
1698 if ( unlikely(l2e_get_flags(nl2e) & L2_DISALLOW_MASK) )
1700 page_unlock(l2pg);
1701 MEM_LOG("Bad L2 flags %x",
1702 l2e_get_flags(nl2e) & L2_DISALLOW_MASK);
1703 return 0;
1706 /* Fast path for identical mapping and presence. */
1707 if ( !l2e_has_changed(ol2e, nl2e, _PAGE_PRESENT) )
1709 adjust_guest_l2e(nl2e, d);
1710 rc = UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, curr, preserve_ad);
1711 page_unlock(l2pg);
1712 return rc;
1715 if ( unlikely(get_page_from_l2e(nl2e, pfn, d) < 0) )
1716 return page_unlock(l2pg), 0;
1718 adjust_guest_l2e(nl2e, d);
1719 if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, curr,
1720 preserve_ad)) )
1722 ol2e = nl2e;
1723 rc = 0;
1726 else if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, curr,
1727 preserve_ad)) )
1729 page_unlock(l2pg);
1730 return 0;
1733 page_unlock(l2pg);
1734 put_page_from_l2e(ol2e, pfn);
1735 return rc;
1738 /* Update the L3 entry at pl3e to new value nl3e. pl3e is within frame pfn. */
1739 static int mod_l3_entry(l3_pgentry_t *pl3e,
1740 l3_pgentry_t nl3e,
1741 unsigned long pfn,
1742 int preserve_ad,
1743 int preemptible)
1745 l3_pgentry_t ol3e;
1746 struct vcpu *curr = current;
1747 struct domain *d = curr->domain;
1748 struct page_info *l3pg = mfn_to_page(pfn);
1749 int rc = 0;
1751 if ( unlikely(!is_guest_l3_slot(pgentry_ptr_to_slot(pl3e))) )
1753 MEM_LOG("Illegal L3 update attempt in Xen-private area %p", pl3e);
1754 return -EINVAL;
1757 /*
1758 * Disallow updates to final L3 slot. It contains Xen mappings, and it
1759 * would be a pain to ensure they remain continuously valid throughout.
1760 */
1761 if ( is_pv_32bit_domain(d) && (pgentry_ptr_to_slot(pl3e) >= 3) )
1762 return -EINVAL;
1764 page_lock(l3pg);
1766 if ( unlikely(__copy_from_user(&ol3e, pl3e, sizeof(ol3e)) != 0) )
1767 return page_unlock(l3pg), -EFAULT;
1769 if ( l3e_get_flags(nl3e) & _PAGE_PRESENT )
1771 if ( unlikely(l3e_get_flags(nl3e) & l3_disallow_mask(d)) )
1773 page_unlock(l3pg);
1774 MEM_LOG("Bad L3 flags %x",
1775 l3e_get_flags(nl3e) & l3_disallow_mask(d));
1776 return -EINVAL;
1779 /* Fast path for identical mapping and presence. */
1780 if ( !l3e_has_changed(ol3e, nl3e, _PAGE_PRESENT) )
1782 adjust_guest_l3e(nl3e, d);
1783 rc = UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, curr, preserve_ad);
1784 page_unlock(l3pg);
1785 return rc ? 0 : -EFAULT;
1788 rc = get_page_from_l3e(nl3e, pfn, d, 0, preemptible);
1789 if ( unlikely(rc < 0) )
1790 return page_unlock(l3pg), rc;
1791 rc = 0;
1793 adjust_guest_l3e(nl3e, d);
1794 if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, curr,
1795 preserve_ad)) )
1797 ol3e = nl3e;
1798 rc = -EFAULT;
1801 else if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, curr,
1802 preserve_ad)) )
1804 page_unlock(l3pg);
1805 return -EFAULT;
1808 if ( likely(rc == 0) )
1810 if ( !create_pae_xen_mappings(d, pl3e) )
1811 BUG();
1813 pae_flush_pgd(pfn, pgentry_ptr_to_slot(pl3e), nl3e);
1816 page_unlock(l3pg);
1817 put_page_from_l3e(ol3e, pfn, 0, 0);
1818 return rc;
1821 #if CONFIG_PAGING_LEVELS >= 4
1823 /* Update the L4 entry at pl4e to new value nl4e. pl4e is within frame pfn. */
1824 static int mod_l4_entry(l4_pgentry_t *pl4e,
1825 l4_pgentry_t nl4e,
1826 unsigned long pfn,
1827 int preserve_ad,
1828 int preemptible)
1830 struct vcpu *curr = current;
1831 struct domain *d = curr->domain;
1832 l4_pgentry_t ol4e;
1833 struct page_info *l4pg = mfn_to_page(pfn);
1834 int rc = 0;
1836 if ( unlikely(!is_guest_l4_slot(d, pgentry_ptr_to_slot(pl4e))) )
1838 MEM_LOG("Illegal L4 update attempt in Xen-private area %p", pl4e);
1839 return -EINVAL;
1842 page_lock(l4pg);
1844 if ( unlikely(__copy_from_user(&ol4e, pl4e, sizeof(ol4e)) != 0) )
1845 return page_unlock(l4pg), -EFAULT;
1847 if ( l4e_get_flags(nl4e) & _PAGE_PRESENT )
1849 if ( unlikely(l4e_get_flags(nl4e) & L4_DISALLOW_MASK) )
1851 page_unlock(l4pg);
1852 MEM_LOG("Bad L4 flags %x",
1853 l4e_get_flags(nl4e) & L4_DISALLOW_MASK);
1854 return -EINVAL;
1857 /* Fast path for identical mapping and presence. */
1858 if ( !l4e_has_changed(ol4e, nl4e, _PAGE_PRESENT) )
1860 adjust_guest_l4e(nl4e, d);
1861 rc = UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, curr, preserve_ad);
1862 page_unlock(l4pg);
1863 return rc ? 0 : -EFAULT;
1866 rc = get_page_from_l4e(nl4e, pfn, d, 0, preemptible);
1867 if ( unlikely(rc < 0) )
1868 return page_unlock(l4pg), rc;
1869 rc = 0;
1871 adjust_guest_l4e(nl4e, d);
1872 if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, curr,
1873 preserve_ad)) )
1875 ol4e = nl4e;
1876 rc = -EFAULT;
1879 else if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, curr,
1880 preserve_ad)) )
1882 page_unlock(l4pg);
1883 return -EFAULT;
1886 page_unlock(l4pg);
1887 put_page_from_l4e(ol4e, pfn, 0, 0);
1888 return rc;
1891 #endif
1893 void put_page(struct page_info *page)
1895 u32 nx, x, y = page->count_info;
1897 do {
1898 x = y;
1899 nx = x - 1;
1901 while ( unlikely((y = cmpxchg(&page->count_info, x, nx)) != x) );
1903 if ( unlikely((nx & PGC_count_mask) == 0) )
1905 cleanup_page_cacheattr(page);
1906 free_domheap_page(page);
1911 int get_page(struct page_info *page, struct domain *domain)
1913 u32 x, nx, y = page->count_info;
1914 u32 d, nd = page->u.inuse._domain;
1915 u32 _domain = pickle_domptr(domain);
1917 do {
1918 x = y;
1919 nx = x + 1;
1920 d = nd;
1921 if ( unlikely((x & PGC_count_mask) == 0) || /* Not allocated? */
1922 /* Keep one spare reference to be acquired by get_page_light(). */
1923 unlikely(((nx + 1) & PGC_count_mask) <= 1) || /* Overflow? */
1924 unlikely(d != _domain) ) /* Wrong owner? */
1926 if ( !_shadow_mode_refcounts(domain) && !domain->is_dying )
1927 gdprintk(XENLOG_INFO,
1928 "Error pfn %lx: rd=%p, od=%p, caf=%08x, taf=%"
1929 PRtype_info "\n",
1930 page_to_mfn(page), domain, unpickle_domptr(d),
1931 x, page->u.inuse.type_info);
1932 return 0;
1934 asm volatile (
1935 LOCK_PREFIX "cmpxchg8b %2"
1936 : "=d" (nd), "=a" (y),
1937 "=m" (*(volatile u64 *)(&page->count_info))
1938 : "0" (d), "1" (x), "c" (d), "b" (nx) );
1940 while ( unlikely(nd != d) || unlikely(y != x) );
1942 return 1;
1945 /*
1946 * Special version of get_page() to be used exclusively when
1947 * - a page is known to already have a non-zero reference count
1948 * - the page does not need its owner to be checked
1949 * - it will not be called more than once without dropping the thus
1950 * acquired reference again.
1951 * Due to get_page() reserving one reference, this call cannot fail.
1952 */
1953 static void get_page_light(struct page_info *page)
1955 u32 x, nx, y = page->count_info;
1957 do {
1958 x = y;
1959 nx = x + 1;
1960 BUG_ON(!(x & PGC_count_mask)); /* Not allocated? */
1961 BUG_ON(!(nx & PGC_count_mask)); /* Overflow? */
1962 y = cmpxchg(&page->count_info, x, nx);
1964 while ( unlikely(y != x) );
1968 static int alloc_page_type(struct page_info *page, unsigned long type,
1969 int preemptible)
1971 struct domain *owner = page_get_owner(page);
1972 int rc;
1974 /* A page table is dirtied when its type count becomes non-zero. */
1975 if ( likely(owner != NULL) )
1976 paging_mark_dirty(owner, page_to_mfn(page));
1978 switch ( type & PGT_type_mask )
1980 case PGT_l1_page_table:
1981 rc = alloc_l1_table(page);
1982 break;
1983 case PGT_l2_page_table:
1984 rc = alloc_l2_table(page, type, preemptible);
1985 break;
1986 case PGT_l3_page_table:
1987 rc = alloc_l3_table(page, preemptible);
1988 break;
1989 case PGT_l4_page_table:
1990 rc = alloc_l4_table(page, preemptible);
1991 break;
1992 case PGT_seg_desc_page:
1993 rc = alloc_segdesc_page(page);
1994 break;
1995 default:
1996 printk("Bad type in alloc_page_type %lx t=%" PRtype_info " c=%x\n",
1997 type, page->u.inuse.type_info,
1998 page->count_info);
1999 rc = -EINVAL;
2000 BUG();
2003 /* No need for atomic update of type_info here: no one else updates it. */
2004 wmb();
2005 if ( rc == -EAGAIN )
2007 get_page_light(page);
2008 page->u.inuse.type_info |= PGT_partial;
2010 else if ( rc == -EINTR )
2012 ASSERT((page->u.inuse.type_info &
2013 (PGT_count_mask|PGT_validated|PGT_partial)) == 1);
2014 page->u.inuse.type_info &= ~PGT_count_mask;
2016 else if ( rc )
2018 ASSERT(rc < 0);
2019 MEM_LOG("Error while validating mfn %lx (pfn %lx) for type %"
2020 PRtype_info ": caf=%08x taf=%" PRtype_info,
2021 page_to_mfn(page), get_gpfn_from_mfn(page_to_mfn(page)),
2022 type, page->count_info, page->u.inuse.type_info);
2023 page->u.inuse.type_info = 0;
2025 else
2027 page->u.inuse.type_info |= PGT_validated;
2030 return rc;
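/*
 * Summary of the return-value handling above:
 *   0       - validation complete; PGT_validated is set.
 *   -EAGAIN - validation was preempted; PGT_partial is set and an extra
 *             reference is held via get_page_light().
 *   -EINTR  - validation stopped before doing any work; the type count is
 *             rolled back to zero.
 *   other   - validation failed; type_info is cleared.
 */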
2034 int free_page_type(struct page_info *page, unsigned long type,
2035 int preemptible)
2037 struct domain *owner = page_get_owner(page);
2038 unsigned long gmfn;
2039 int rc;
2041 if ( likely(owner != NULL) )
2043 /*
2044 * We have to flush before the next use of the linear mapping
2045 * (e.g., update_va_mapping()) or we could end up modifying a page
2046 * that is no longer a page table (and hence screw up ref counts).
2047 */
2048 if ( current->domain == owner )
2049 queue_deferred_ops(owner, DOP_FLUSH_ALL_TLBS);
2050 else
2051 flush_tlb_mask(owner->domain_dirty_cpumask);
2053 if ( unlikely(paging_mode_enabled(owner)) )
2055 /* A page table is dirtied when its type count becomes zero. */
2056 paging_mark_dirty(owner, page_to_mfn(page));
2058 if ( shadow_mode_refcounts(owner) )
2059 return 0;
2061 gmfn = mfn_to_gmfn(owner, page_to_mfn(page));
2062 ASSERT(VALID_M2P(gmfn));
2063 shadow_remove_all_shadows(owner->vcpu[0], _mfn(gmfn));
2067 if ( !(type & PGT_partial) )
2069 page->nr_validated_ptes = 1U << PAGETABLE_ORDER;
2070 page->partial_pte = 0;
2073 switch ( type & PGT_type_mask )
2075 case PGT_l1_page_table:
2076 free_l1_table(page);
2077 rc = 0;
2078 break;
2079 case PGT_l2_page_table:
2080 rc = free_l2_table(page, preemptible);
2081 break;
2082 case PGT_l3_page_table:
2083 #if CONFIG_PAGING_LEVELS == 3
2084 if ( !(type & PGT_partial) )
2085 page->nr_validated_ptes = L3_PAGETABLE_ENTRIES;
2086 #endif
2087 rc = free_l3_table(page, preemptible);
2088 break;
2089 case PGT_l4_page_table:
2090 rc = free_l4_table(page, preemptible);
2091 break;
2092 default:
2093 MEM_LOG("type %lx pfn %lx\n", type, page_to_mfn(page));
2094 rc = -EINVAL;
2095 BUG();
2098 return rc;
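/*
 * When the owner has paging (shadow) support enabled, dropping the final
 * type reference also removes any shadows of this frame; with refcounted
 * shadows the explicit free_l?_table() teardown is skipped entirely.
 */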
2102 static int __put_final_page_type(
2103 struct page_info *page, unsigned long type, int preemptible)
2105 int rc = free_page_type(page, type, preemptible);
2107 /* No need for atomic update of type_info here: no one else updates it. */
2108 if ( rc == 0 )
2110 /*
2111 * Record TLB information for flush later. We do not stamp page tables
2112 * when running in shadow mode:
2113 * 1. Pointless, since it's the shadow pt's which must be tracked.
2114 * 2. Shadow mode reuses this field for shadowed page tables to
2115 * store flags info -- we don't want to conflict with that.
2116 */
2117 if ( !(shadow_mode_enabled(page_get_owner(page)) &&
2118 (page->count_info & PGC_page_table)) )
2119 page->tlbflush_timestamp = tlbflush_current_time();
2120 wmb();
2121 page->u.inuse.type_info--;
2123 else if ( rc == -EINTR )
2125 ASSERT((page->u.inuse.type_info &
2126 (PGT_count_mask|PGT_validated|PGT_partial)) == 1);
2127 if ( !(shadow_mode_enabled(page_get_owner(page)) &&
2128 (page->count_info & PGC_page_table)) )
2129 page->tlbflush_timestamp = tlbflush_current_time();
2130 wmb();
2131 page->u.inuse.type_info |= PGT_validated;
2133 else
2135 BUG_ON(rc != -EAGAIN);
2136 wmb();
2137 get_page_light(page);
2138 page->u.inuse.type_info |= PGT_partial;
2141 return rc;
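/*
 * Mirror of alloc_page_type(): success drops the final type count after
 * stamping the TLB-flush timestamp, -EINTR restores PGT_validated, and
 * -EAGAIN leaves PGT_partial set with an extra page reference held.
 */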
2145 static int __put_page_type(struct page_info *page,
2146 int preemptible)
2148 unsigned long nx, x, y = page->u.inuse.type_info;
2149 int rc = 0;
2151 for ( ; ; )
2153 x = y;
2154 nx = x - 1;
2156 ASSERT((x & PGT_count_mask) != 0);
2158 if ( unlikely((nx & PGT_count_mask) == 0) )
2160 if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
2161 likely(nx & (PGT_validated|PGT_partial)) )
2163 /*
2164 * Page-table pages must be unvalidated when count is zero. The
2165 * 'free' is safe because the refcnt is non-zero and validated
2166 * bit is clear => other ops will spin or fail.
2167 */
2168 nx = x & ~(PGT_validated|PGT_partial);
2169 if ( unlikely((y = cmpxchg(&page->u.inuse.type_info,
2170 x, nx)) != x) )
2171 continue;
2172 /* We cleared the 'valid bit' so we do the clean up. */
2173 rc = __put_final_page_type(page, x, preemptible);
2174 if ( x & PGT_partial )
2175 put_page(page);
2176 break;
2179 /*
2180 * Record TLB information for flush later. We do not stamp page
2181 * tables when running in shadow mode:
2182 * 1. Pointless, since it's the shadow pt's which must be tracked.
2183 * 2. Shadow mode reuses this field for shadowed page tables to
2184 * store flags info -- we don't want to conflict with that.
2185 */
2186 if ( !(shadow_mode_enabled(page_get_owner(page)) &&
2187 (page->count_info & PGC_page_table)) )
2188 page->tlbflush_timestamp = tlbflush_current_time();
2191 if ( likely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) == x) )
2192 break;
2194 if ( preemptible && hypercall_preempt_check() )
2195 return -EINTR;
2198 return rc;
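/*
 * The loop above retries the cmpxchg until type_info is updated atomically;
 * in the preemptible case it may instead return -EINTR so that the caller
 * can create a hypercall continuation.
 */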
2202 static int __get_page_type(struct page_info *page, unsigned long type,
2203 int preemptible)
2205 unsigned long nx, x, y = page->u.inuse.type_info;
2206 int rc = 0;
2208 ASSERT(!(type & ~(PGT_type_mask | PGT_pae_xen_l2)));
2210 for ( ; ; )
2212 x = y;
2213 nx = x + 1;
2214 if ( unlikely((nx & PGT_count_mask) == 0) )
2216 MEM_LOG("Type count overflow on pfn %lx", page_to_mfn(page));
2217 return -EINVAL;
2219 else if ( unlikely((x & PGT_count_mask) == 0) )
2221 struct domain *d = page_get_owner(page);
2223 /* Normally we should never let a page go from type count 0
2224 * to type count 1 when it is shadowed. One exception:
2225 * out-of-sync shadowed pages are allowed to become
2226 * writeable. */
2227 if ( d && shadow_mode_enabled(d)
2228 && (page->count_info & PGC_page_table)
2229 && !((page->shadow_flags & (1u<<29))
2230 && type == PGT_writable_page) )
2231 shadow_remove_all_shadows(d->vcpu[0], _mfn(page_to_mfn(page)));
2233 ASSERT(!(x & PGT_pae_xen_l2));
2234 if ( (x & PGT_type_mask) != type )
2236 /*
2237 * On type change we check to flush stale TLB entries. This
2238 * may be unnecessary (e.g., page was GDT/LDT) but those
2239 * circumstances should be very rare.
2240 */
2241 cpumask_t mask = d->domain_dirty_cpumask;
2243 /* Don't flush if the timestamp is old enough */
2244 tlbflush_filter(mask, page->tlbflush_timestamp);
2246 if ( unlikely(!cpus_empty(mask)) &&
2247 /* Shadow mode: track only writable pages. */
2248 (!shadow_mode_enabled(page_get_owner(page)) ||
2249 ((nx & PGT_type_mask) == PGT_writable_page)) )
2251 perfc_incr(need_flush_tlb_flush);
2252 flush_tlb_mask(mask);
2255 /* We lose existing type and validity. */
2256 nx &= ~(PGT_type_mask | PGT_validated);
2257 nx |= type;
2259 /* No special validation needed for writable pages. */
2260 /* Page tables and GDT/LDT need to be scanned for validity. */
2261 if ( type == PGT_writable_page )
2262 nx |= PGT_validated;
2265 else if ( unlikely((x & (PGT_type_mask|PGT_pae_xen_l2)) != type) )
2267 /* Don't log failure if it could be a recursive-mapping attempt. */
2268 if ( ((x & PGT_type_mask) == PGT_l2_page_table) &&
2269 (type == PGT_l1_page_table) )
2270 return -EINVAL;
2271 if ( ((x & PGT_type_mask) == PGT_l3_page_table) &&
2272 (type == PGT_l2_page_table) )
2273 return -EINVAL;
2274 if ( ((x & PGT_type_mask) == PGT_l4_page_table) &&
2275 (type == PGT_l3_page_table) )
2276 return -EINVAL;
2277 MEM_LOG("Bad type (saw %" PRtype_info " != exp %" PRtype_info ") "
2278 "for mfn %lx (pfn %lx)",
2279 x, type, page_to_mfn(page),
2280 get_gpfn_from_mfn(page_to_mfn(page)));
2281 return -EINVAL;
2283 else if ( unlikely(!(x & PGT_validated)) )
2285 if ( !(x & PGT_partial) )
2287 /* Someone else is updating validation of this page. Wait... */
2288 while ( (y = page->u.inuse.type_info) == x )
2290 if ( preemptible && hypercall_preempt_check() )
2291 return -EINTR;
2292 cpu_relax();
2294 continue;
2296 /* Type ref count was left at 1 when PGT_partial got set. */
2297 ASSERT((x & PGT_count_mask) == 1);
2298 nx = x & ~PGT_partial;
2301 if ( likely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) == x) )
2302 break;
2304 if ( preemptible && hypercall_preempt_check() )
2305 return -EINTR;
2308 if ( unlikely((x & PGT_type_mask) != type) )
2310 /* Special pages should not be accessible from devices. */
2311 struct domain *d = page_get_owner(page);
2312 if ( d && unlikely(need_iommu(d)) )
2314 if ( (x & PGT_type_mask) == PGT_writable_page )
2315 iommu_unmap_page(d, mfn_to_gmfn(d, page_to_mfn(page)));
2316 else if ( type == PGT_writable_page )
2317 iommu_map_page(d, mfn_to_gmfn(d, page_to_mfn(page)),
2318 page_to_mfn(page));
2322 if ( unlikely(!(nx & PGT_validated)) )
2324 if ( !(x & PGT_partial) )
2326 page->nr_validated_ptes = 0;
2327 page->partial_pte = 0;
2329 rc = alloc_page_type(page, type, preemptible);
2332 if ( (x & PGT_partial) && !(nx & PGT_partial) )
2333 put_page(page);
2335 return rc;
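/*
 * Two side effects of a genuine type change, as handled above: stale TLB
 * entries for the old type are flushed (filtered by tlbflush_timestamp),
 * and the IOMMU mapping is updated so that only PGT_writable_page frames
 * remain accessible to devices.
 */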
2338 void put_page_type(struct page_info *page)
2339 {
2340     int rc = __put_page_type(page, 0);
2341     ASSERT(rc == 0);
2342     (void)rc;
2343 }
2345 int get_page_type(struct page_info *page, unsigned long type)
2346 {
2347     int rc = __get_page_type(page, type, 0);
2348     if ( likely(rc == 0) )
2349         return 1;
2350     ASSERT(rc == -EINVAL);
2351     return 0;
2352 }
2354 int put_page_type_preemptible(struct page_info *page)
2355 {
2356     return __put_page_type(page, 1);
2357 }
2359 int get_page_type_preemptible(struct page_info *page, unsigned long type)
2360 {
2361     return __get_page_type(page, type, 1);
2362 }
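/*
 * Illustrative usage sketch (not part of the original file): callers pair a
 * successful get_page_type() with a matching put_page_type(), e.g. as
 * do_mmu_update() does for PGT_writable_page further below:
 *
 *     if ( get_page_type(page, PGT_writable_page) )
 *     {
 *         ... treat the frame as ordinary writable memory ...
 *         put_page_type(page);
 *     }
 *
 * The *_preemptible variants may return -EINTR/-EAGAIN instead, allowing
 * the enclosing hypercall to be continued.
 */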
2364 void cleanup_page_cacheattr(struct page_info *page)
2366 uint32_t cacheattr = (page->count_info >> PGC_cacheattr_base) & 7;
2368 if ( likely(cacheattr == 0) )
2369 return;
2371 page->count_info &= ~PGC_cacheattr_mask;
2373 BUG_ON(is_xen_heap_page(page));
2375 #ifdef __x86_64__
2376 map_pages_to_xen((unsigned long)page_to_virt(page), page_to_mfn(page),
2377 1, PAGE_HYPERVISOR);
2378 #endif
2382 int new_guest_cr3(unsigned long mfn)
2384 struct vcpu *v = current;
2385 struct domain *d = v->domain;
2386 int okay;
2387 unsigned long old_base_mfn;
2389 #ifdef CONFIG_COMPAT
2390 if ( is_pv_32on64_domain(d) )
2392 okay = paging_mode_refcounts(d)
2393 ? 0 /* Old code was broken, but what should it be? */
2394 : mod_l4_entry(
2395 __va(pagetable_get_paddr(v->arch.guest_table)),
2396 l4e_from_pfn(
2397 mfn,
2398 (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_ACCESSED)),
2399 pagetable_get_pfn(v->arch.guest_table), 0, 0) == 0;
2400 if ( unlikely(!okay) )
2402 MEM_LOG("Error while installing new compat baseptr %lx", mfn);
2403 return 0;
2406 invalidate_shadow_ldt(v);
2407 write_ptbase(v);
2409 return 1;
2411 #endif
2412 okay = paging_mode_refcounts(d)
2413 ? get_page_from_pagenr(mfn, d)
2414 : !get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d, 0, 0);
2415 if ( unlikely(!okay) )
2417 MEM_LOG("Error while installing new baseptr %lx", mfn);
2418 return 0;
2421 invalidate_shadow_ldt(v);
2423 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
2425 v->arch.guest_table = pagetable_from_pfn(mfn);
2426 update_cr3(v);
2428 write_ptbase(v);
2430 if ( likely(old_base_mfn != 0) )
2432 if ( paging_mode_refcounts(d) )
2433 put_page(mfn_to_page(old_base_mfn));
2434 else
2435 put_page_and_type(mfn_to_page(old_base_mfn));
2438 return 1;
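/*
 * Note the ordering above: the reference on the new base frame is acquired
 * and CR3 switched before the reference on the old base frame is dropped,
 * so the page tables currently in use are never left unreferenced.
 */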
2441 static void process_deferred_ops(void)
2443 unsigned int deferred_ops;
2444 struct domain *d = current->domain;
2445 struct percpu_mm_info *info = &this_cpu(percpu_mm_info);
2447 deferred_ops = info->deferred_ops;
2448 info->deferred_ops = 0;
2450 if ( deferred_ops & (DOP_FLUSH_ALL_TLBS|DOP_FLUSH_TLB) )
2452 if ( deferred_ops & DOP_FLUSH_ALL_TLBS )
2453 flush_tlb_mask(d->domain_dirty_cpumask);
2454 else
2455 flush_tlb_local();
2458 if ( deferred_ops & DOP_RELOAD_LDT )
2459 (void)map_ldt_shadow_page(0);
2461 if ( unlikely(info->foreign != NULL) )
2463 rcu_unlock_domain(info->foreign);
2464 info->foreign = NULL;
2468 static int set_foreigndom(domid_t domid)
2470 struct domain *e, *d = current->domain;
2471 struct percpu_mm_info *info = &this_cpu(percpu_mm_info);
2472 int okay = 1;
2474 ASSERT(info->foreign == NULL);
2476 if ( likely(domid == DOMID_SELF) )
2477 goto out;
2479 if ( unlikely(domid == d->domain_id) )
2481 MEM_LOG("Cannot specify itself as foreign domain");
2482 okay = 0;
2484 else if ( unlikely(paging_mode_translate(d)) )
2486 MEM_LOG("Cannot mix foreign mappings with translated domains");
2487 okay = 0;
2489 else switch ( domid )
2491 case DOMID_IO:
2492 info->foreign = rcu_lock_domain(dom_io);
2493 break;
2494 case DOMID_XEN:
2495 if (!IS_PRIV(d)) {
2496 MEM_LOG("Cannot set foreign dom");
2497 okay = 0;
2498 break;
2500 info->foreign = rcu_lock_domain(dom_xen);
2501 break;
2502 default:
2503 if ( (e = rcu_lock_domain_by_id(domid)) == NULL )
2505 MEM_LOG("Unknown domain '%u'", domid);
2506 okay = 0;
2507 break;
2509 if ( !IS_PRIV_FOR(d, e) )
2511 MEM_LOG("Cannot set foreign dom");
2512 okay = 0;
2513 rcu_unlock_domain(e);
2514 break;
2516 info->foreign = e;
2517 break;
2520 out:
2521 return okay;
2524 static inline cpumask_t vcpumask_to_pcpumask(
2525 struct domain *d, unsigned long vmask)
2527 unsigned int vcpu_id;
2528 cpumask_t pmask = CPU_MASK_NONE;
2529 struct vcpu *v;
2531 /*
2532 * Callers copy only a single guest-sized longword from the guest.
2533 * This must be wide enough to reference all VCPUs. Worst case is 32 bits.
2534 */
2535 BUILD_BUG_ON(MAX_VIRT_CPUS > 32);
2537 while ( vmask != 0 )
2539 vcpu_id = find_first_set_bit(vmask);
2540 vmask &= ~(1UL << vcpu_id);
2541 if ( (vcpu_id < MAX_VIRT_CPUS) &&
2542 ((v = d->vcpu[vcpu_id]) != NULL) )
2543 cpus_or(pmask, pmask, v->vcpu_dirty_cpumask);
2546 return pmask;
2549 #ifdef __i386__
2550 static inline void *fixmap_domain_page(unsigned long mfn)
2552 unsigned int cpu = smp_processor_id();
2553 void *ptr = (void *)fix_to_virt(FIX_PAE_HIGHMEM_0 + cpu);
2555 l1e_write(fix_pae_highmem_pl1e - cpu,
2556 l1e_from_pfn(mfn, __PAGE_HYPERVISOR));
2557 flush_tlb_one_local(ptr);
2558 return ptr;
2560 static inline void fixunmap_domain_page(const void *ptr)
2562 unsigned int cpu = virt_to_fix((unsigned long)ptr) - FIX_PAE_HIGHMEM_0;
2564 l1e_write(fix_pae_highmem_pl1e - cpu, l1e_empty());
2565 this_cpu(make_cr3_timestamp) = this_cpu(tlbflush_time);
2567 #else
2568 #define fixmap_domain_page(mfn) mfn_to_virt(mfn)
2569 #define fixunmap_domain_page(ptr) ((void)(ptr))
2570 #endif
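/*
 * On i386/PAE the target frame may lie above Xen's permanent mappings, so
 * CLEAR_PAGE/COPY_PAGE borrow a per-CPU fixmap slot; on x86_64 every frame
 * is reachable through the direct mapping, hence the trivial mfn_to_virt()
 * definition.
 */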
2572 int do_mmuext_op(
2573 XEN_GUEST_HANDLE(mmuext_op_t) uops,
2574 unsigned int count,
2575 XEN_GUEST_HANDLE(uint) pdone,
2576 unsigned int foreigndom)
2578 struct mmuext_op op;
2579 int rc = 0, i = 0, okay;
2580 unsigned long mfn = 0, gmfn = 0, type;
2581 unsigned int done = 0;
2582 struct page_info *page;
2583 struct vcpu *v = current;
2584 struct domain *d = v->domain;
2586 if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
2588 count &= ~MMU_UPDATE_PREEMPTED;
2589 if ( unlikely(!guest_handle_is_null(pdone)) )
2590 (void)copy_from_guest(&done, pdone, 1);
2592 else
2593 perfc_incr(calls_to_mmuext_op);
2595 if ( unlikely(!guest_handle_okay(uops, count)) )
2597 rc = -EFAULT;
2598 goto out;
2601 if ( !set_foreigndom(foreigndom) )
2603 rc = -ESRCH;
2604 goto out;
2607 for ( i = 0; i < count; i++ )
2609 if ( hypercall_preempt_check() )
2611 rc = -EAGAIN;
2612 break;
2615 if ( unlikely(__copy_from_guest(&op, uops, 1) != 0) )
2617 MEM_LOG("Bad __copy_from_guest");
2618 rc = -EFAULT;
2619 break;
2622 okay = 1;
2623 gmfn = op.arg1.mfn;
2624 mfn = gmfn_to_mfn(FOREIGNDOM, gmfn);
2625 page = mfn_to_page(mfn);
2627 switch ( op.cmd )
2629 case MMUEXT_PIN_L1_TABLE:
2630 type = PGT_l1_page_table;
2631 goto pin_page;
2633 case MMUEXT_PIN_L2_TABLE:
2634 type = PGT_l2_page_table;
2635 goto pin_page;
2637 case MMUEXT_PIN_L3_TABLE:
2638 type = PGT_l3_page_table;
2639 goto pin_page;
2641 case MMUEXT_PIN_L4_TABLE:
2642 if ( is_pv_32bit_domain(FOREIGNDOM) )
2643 break;
2644 type = PGT_l4_page_table;
2646 pin_page:
2647 rc = xsm_memory_pin_page(d, page);
2648 if ( rc )
2649 break;
2651 /* Ignore pinning of invalid paging levels. */
2652 if ( (op.cmd - MMUEXT_PIN_L1_TABLE) > (CONFIG_PAGING_LEVELS - 1) )
2653 break;
2655 if ( paging_mode_refcounts(FOREIGNDOM) )
2656 break;
2658 rc = get_page_and_type_from_pagenr(mfn, type, FOREIGNDOM, 0, 1);
2659 okay = !rc;
2660 if ( unlikely(!okay) )
2662 if ( rc == -EINTR )
2663 rc = -EAGAIN;
2664 else if ( rc != -EAGAIN )
2665 MEM_LOG("Error while pinning mfn %lx", mfn);
2666 break;
2669 if ( unlikely(test_and_set_bit(_PGT_pinned,
2670 &page->u.inuse.type_info)) )
2672 MEM_LOG("Mfn %lx already pinned", mfn);
2673 put_page_and_type(page);
2674 okay = 0;
2675 break;
2678 /* A page is dirtied when its pin status is set. */
2679 paging_mark_dirty(d, mfn);
2681 /* We can race domain destruction (domain_relinquish_resources). */
2682 if ( unlikely(this_cpu(percpu_mm_info).foreign != NULL) )
2684 int drop_ref;
2685 spin_lock(&FOREIGNDOM->page_alloc_lock);
2686 drop_ref = (FOREIGNDOM->is_dying &&
2687 test_and_clear_bit(_PGT_pinned,
2688 &page->u.inuse.type_info));
2689 spin_unlock(&FOREIGNDOM->page_alloc_lock);
2690 if ( drop_ref )
2691 put_page_and_type(page);
2694 break;
2696 case MMUEXT_UNPIN_TABLE:
2697 if ( paging_mode_refcounts(d) )
2698 break;
2700 if ( unlikely(!(okay = get_page_from_pagenr(mfn, d))) )
2702 MEM_LOG("Mfn %lx bad domain (dom=%p)",
2703 mfn, page_get_owner(page));
2705 else if ( likely(test_and_clear_bit(_PGT_pinned,
2706 &page->u.inuse.type_info)) )
2708 put_page_and_type(page);
2709 put_page(page);
2710 if ( !rc )
2712 /* A page is dirtied when its pin status is cleared. */
2713 paging_mark_dirty(d, mfn);
2716 else
2718 okay = 0;
2719 put_page(page);
2720 MEM_LOG("Mfn %lx not pinned", mfn);
2722 break;
2724 case MMUEXT_NEW_BASEPTR:
2725 okay = new_guest_cr3(mfn);
2726 this_cpu(percpu_mm_info).deferred_ops &= ~DOP_FLUSH_TLB;
2727 break;
2729 #ifdef __x86_64__
2730 case MMUEXT_NEW_USER_BASEPTR: {
2731 unsigned long old_mfn;
2733 if ( mfn != 0 )
2735 if ( paging_mode_refcounts(d) )
2736 okay = get_page_from_pagenr(mfn, d);
2737 else
2738 okay = !get_page_and_type_from_pagenr(
2739 mfn, PGT_root_page_table, d, 0, 0);
2740 if ( unlikely(!okay) )
2742 MEM_LOG("Error while installing new mfn %lx", mfn);
2743 break;
2747 old_mfn = pagetable_get_pfn(v->arch.guest_table_user);
2748 v->arch.guest_table_user = pagetable_from_pfn(mfn);
2750 if ( old_mfn != 0 )
2752 if ( paging_mode_refcounts(d) )
2753 put_page(mfn_to_page(old_mfn));
2754 else
2755 put_page_and_type(mfn_to_page(old_mfn));
2758 break;
2760 #endif
2762 case MMUEXT_TLB_FLUSH_LOCAL:
2763 this_cpu(percpu_mm_info).deferred_ops |= DOP_FLUSH_TLB;
2764 break;
2766 case MMUEXT_INVLPG_LOCAL:
2767 if ( !paging_mode_enabled(d)
2768 || paging_invlpg(v, op.arg1.linear_addr) != 0 )
2769 flush_tlb_one_local(op.arg1.linear_addr);
2770 break;
2772 case MMUEXT_TLB_FLUSH_MULTI:
2773 case MMUEXT_INVLPG_MULTI:
2775 unsigned long vmask;
2776 cpumask_t pmask;
2777 if ( unlikely(copy_from_guest(&vmask, op.arg2.vcpumask, 1)) )
2779 okay = 0;
2780 break;
2782 pmask = vcpumask_to_pcpumask(d, vmask);
2783 if ( op.cmd == MMUEXT_TLB_FLUSH_MULTI )
2784 flush_tlb_mask(pmask);
2785 else
2786 flush_tlb_one_mask(pmask, op.arg1.linear_addr);
2787 break;
2790 case MMUEXT_TLB_FLUSH_ALL:
2791 flush_tlb_mask(d->domain_dirty_cpumask);
2792 break;
2794 case MMUEXT_INVLPG_ALL:
2795 flush_tlb_one_mask(d->domain_dirty_cpumask, op.arg1.linear_addr);
2796 break;
2798 case MMUEXT_FLUSH_CACHE:
2799 if ( unlikely(!cache_flush_permitted(d)) )
2801 MEM_LOG("Non-physdev domain tried to FLUSH_CACHE.");
2802 okay = 0;
2804 else
2806 wbinvd();
2808 break;
2810 case MMUEXT_SET_LDT:
2812 unsigned long ptr = op.arg1.linear_addr;
2813 unsigned long ents = op.arg2.nr_ents;
2815 if ( paging_mode_external(d) )
2817 MEM_LOG("ignoring SET_LDT hypercall from external domain");
2818 okay = 0;
2820 else if ( ((ptr & (PAGE_SIZE-1)) != 0) ||
2821 (ents > 8192) ||
2822 !array_access_ok(ptr, ents, LDT_ENTRY_SIZE) )
2824 okay = 0;
2825 MEM_LOG("Bad args to SET_LDT: ptr=%lx, ents=%lx", ptr, ents);
2827 else if ( (v->arch.guest_context.ldt_ents != ents) ||
2828 (v->arch.guest_context.ldt_base != ptr) )
2830 invalidate_shadow_ldt(v);
2831 v->arch.guest_context.ldt_base = ptr;
2832 v->arch.guest_context.ldt_ents = ents;
2833 load_LDT(v);
2834 this_cpu(percpu_mm_info).deferred_ops &= ~DOP_RELOAD_LDT;
2835 if ( ents != 0 )
2836 this_cpu(percpu_mm_info).deferred_ops |= DOP_RELOAD_LDT;
2838 break;
2841 case MMUEXT_CLEAR_PAGE:
2843 unsigned char *ptr;
2845 okay = !get_page_and_type_from_pagenr(mfn, PGT_writable_page,
2846 FOREIGNDOM, 0, 0);
2847 if ( unlikely(!okay) )
2849 MEM_LOG("Error while clearing mfn %lx", mfn);
2850 break;
2853 /* A page is dirtied when it's being cleared. */
2854 paging_mark_dirty(d, mfn);
2856 ptr = fixmap_domain_page(mfn);
2857 clear_page(ptr);
2858 fixunmap_domain_page(ptr);
2860 put_page_and_type(page);
2861 break;
2864 case MMUEXT_COPY_PAGE:
2866 const unsigned char *src;
2867 unsigned char *dst;
2868 unsigned long src_mfn;
2870 src_mfn = gmfn_to_mfn(FOREIGNDOM, op.arg2.src_mfn);
2871 okay = get_page_from_pagenr(src_mfn, FOREIGNDOM);
2872 if ( unlikely(!okay) )
2874 MEM_LOG("Error while copying from mfn %lx", src_mfn);
2875 break;
2878 okay = !get_page_and_type_from_pagenr(mfn, PGT_writable_page,
2879 FOREIGNDOM, 0, 0);
2880 if ( unlikely(!okay) )
2882 put_page(mfn_to_page(src_mfn));
2883 MEM_LOG("Error while copying to mfn %lx", mfn);
2884 break;
2887 /* A page is dirtied when it's being copied to. */
2888 paging_mark_dirty(d, mfn);
2890 src = map_domain_page(src_mfn);
2891 dst = fixmap_domain_page(mfn);
2892 copy_page(dst, src);
2893 fixunmap_domain_page(dst);
2894 unmap_domain_page(src);
2896 put_page_and_type(page);
2897 put_page(mfn_to_page(src_mfn));
2898 break;
2901 default:
2902 MEM_LOG("Invalid extended pt command 0x%x", op.cmd);
2903 rc = -ENOSYS;
2904 okay = 0;
2905 break;
2908 if ( unlikely(!okay) )
2910 rc = rc ? rc : -EINVAL;
2911 break;
2914 guest_handle_add_offset(uops, 1);
2917 if ( rc == -EAGAIN )
2918 rc = hypercall_create_continuation(
2919 __HYPERVISOR_mmuext_op, "hihi",
2920 uops, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
2922 process_deferred_ops();
2924 perfc_add(num_mmuext_ops, i);
2926 out:
2927 /* Add incremental work we have done to the @done output parameter. */
2928 if ( unlikely(!guest_handle_is_null(pdone)) )
2930 done += i;
2931 copy_to_guest(pdone, &done, 1);
2934 return rc;
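/*
 * Preemption model used above (and by do_mmu_update() below): if the batch
 * is interrupted, a continuation is created with the remaining count tagged
 * MMU_UPDATE_PREEMPTED, and the number of ops already completed is
 * accumulated into the guest's @pdone counter.
 */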
2937 int do_mmu_update(
2938 XEN_GUEST_HANDLE(mmu_update_t) ureqs,
2939 unsigned int count,
2940 XEN_GUEST_HANDLE(uint) pdone,
2941 unsigned int foreigndom)
2943 struct mmu_update req;
2944 void *va;
2945 unsigned long gpfn, gmfn, mfn;
2946 struct page_info *page;
2947 int rc = 0, okay = 1, i = 0;
2948 unsigned int cmd, done = 0;
2949 struct vcpu *v = current;
2950 struct domain *d = v->domain;
2951 unsigned long type_info;
2952 struct domain_mmap_cache mapcache;
2954 if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
2956 count &= ~MMU_UPDATE_PREEMPTED;
2957 if ( unlikely(!guest_handle_is_null(pdone)) )
2958 (void)copy_from_guest(&done, pdone, 1);
2960 else
2961 perfc_incr(calls_to_mmu_update);
2963 if ( unlikely(!guest_handle_okay(ureqs, count)) )
2965 rc = -EFAULT;
2966 goto out;
2969 if ( !set_foreigndom(foreigndom) )
2971 rc = -ESRCH;
2972 goto out;
2975 domain_mmap_cache_init(&mapcache);
2977 for ( i = 0; i < count; i++ )
2979 if ( hypercall_preempt_check() )
2981 rc = -EAGAIN;
2982 break;
2985 if ( unlikely(__copy_from_guest(&req, ureqs, 1) != 0) )
2987 MEM_LOG("Bad __copy_from_guest");
2988 rc = -EFAULT;
2989 break;
2992 cmd = req.ptr & (sizeof(l1_pgentry_t)-1);
2993 okay = 0;
2995 switch ( cmd )
2997 /*
2998 * MMU_NORMAL_PT_UPDATE: Normal update to any level of page table.
2999 * MMU_PT_UPDATE_PRESERVE_AD: As above, but also preserve (OR)
3000 * current A/D bits.
3001 */
3002 case MMU_NORMAL_PT_UPDATE:
3003 case MMU_PT_UPDATE_PRESERVE_AD:
3004 rc = xsm_mmu_normal_update(d, FOREIGNDOM, req.val);
3005 if ( rc )
3006 break;
3008 req.ptr -= cmd;
3009 gmfn = req.ptr >> PAGE_SHIFT;
3010 mfn = gmfn_to_mfn(d, gmfn);
3012 if ( unlikely(!get_page_from_pagenr(mfn, d)) )
3014 MEM_LOG("Could not get page for normal update");
3015 break;
3018 va = map_domain_page_with_cache(mfn, &mapcache);
3019 va = (void *)((unsigned long)va +
3020 (unsigned long)(req.ptr & ~PAGE_MASK));
3021 page = mfn_to_page(mfn);
3023 switch ( (type_info = page->u.inuse.type_info) & PGT_type_mask )
3025 case PGT_l1_page_table:
3026 case PGT_l2_page_table:
3027 case PGT_l3_page_table:
3028 case PGT_l4_page_table:
3030 if ( paging_mode_refcounts(d) )
3032 MEM_LOG("mmu update on auto-refcounted domain!");
3033 break;
3036 if ( unlikely(!get_page_type(
3037 page, type_info & (PGT_type_mask|PGT_pae_xen_l2))) )
3038 goto not_a_pt;
3040 switch ( type_info & PGT_type_mask )
3042 case PGT_l1_page_table:
3044 l1_pgentry_t l1e = l1e_from_intpte(req.val);
3045 okay = mod_l1_entry(va, l1e, mfn,
3046 cmd == MMU_PT_UPDATE_PRESERVE_AD);
3048 break;
3049 case PGT_l2_page_table:
3051 l2_pgentry_t l2e = l2e_from_intpte(req.val);
3052 okay = mod_l2_entry(va, l2e, mfn, type_info,
3053 cmd == MMU_PT_UPDATE_PRESERVE_AD);
3055 break;
3056 case PGT_l3_page_table:
3058 l3_pgentry_t l3e = l3e_from_intpte(req.val);
3059 rc = mod_l3_entry(va, l3e, mfn,
3060 cmd == MMU_PT_UPDATE_PRESERVE_AD, 1);
3061 okay = !rc;
3063 break;
3064 #if CONFIG_PAGING_LEVELS >= 4
3065 case PGT_l4_page_table:
3067 l4_pgentry_t l4e = l4e_from_intpte(req.val);
3068 rc = mod_l4_entry(va, l4e, mfn,
3069 cmd == MMU_PT_UPDATE_PRESERVE_AD, 1);
3070 okay = !rc;
3072 break;
3073 #endif
3076 put_page_type(page);
3077 if ( rc == -EINTR )
3078 rc = -EAGAIN;
3080 break;
3082 default:
3083 not_a_pt:
3085 if ( unlikely(!get_page_type(page, PGT_writable_page)) )
3086 break;
3088 perfc_incr(writable_mmu_updates);
3090 okay = paging_write_guest_entry(v, va, req.val, _mfn(mfn));
3092 put_page_type(page);
3094 break;
3097 unmap_domain_page_with_cache(va, &mapcache);
3099 put_page(page);
3100 break;
3102 case MMU_MACHPHYS_UPDATE:
3104 mfn = req.ptr >> PAGE_SHIFT;
3105 gpfn = req.val;
3107 rc = xsm_mmu_machphys_update(d, mfn);
3108 if ( rc )
3109 break;
3111 if ( unlikely(!get_page_from_pagenr(mfn, FOREIGNDOM)) )
3113 MEM_LOG("Could not get page for mach->phys update");
3114 break;
3117 if ( unlikely(paging_mode_translate(FOREIGNDOM)) )
3119 MEM_LOG("Mach-phys update on auto-translate guest");
3120 break;
3123 set_gpfn_from_mfn(mfn, gpfn);
3124 okay = 1;
3126 paging_mark_dirty(FOREIGNDOM, mfn);
3128 put_page(mfn_to_page(mfn));
3129 break;
3131 default:
3132 MEM_LOG("Invalid page update command %x", cmd);
3133 rc = -ENOSYS;
3134 okay = 0;
3135 break;
3138 if ( unlikely(!okay) )
3140 rc = rc ? rc : -EINVAL;
3141 break;
3144 guest_handle_add_offset(ureqs, 1);
3147 if ( rc == -EAGAIN )
3148 rc = hypercall_create_continuation(
3149 __HYPERVISOR_mmu_update, "hihi",
3150 ureqs, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
3152 process_deferred_ops();
3154 domain_mmap_cache_destroy(&mapcache);
3156 perfc_add(num_page_updates, i);
3158 out:
3159 /* Add incremental work we have done to the @done output parameter. */
3160 if ( unlikely(!guest_handle_is_null(pdone)) )
3162 done += i;
3163 copy_to_guest(pdone, &done, 1);
3166 return rc;
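/*
 * Note: the update command is encoded in the low bits of req.ptr (which must
 * otherwise be sizeof(l1_pgentry_t)-aligned), so each (ptr, val) pair can
 * request a page-table update, an A/D-preserving update, or an M2P table
 * update.
 */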
3170 static int create_grant_pte_mapping(
3171 uint64_t pte_addr, l1_pgentry_t nl1e, struct vcpu *v)
3173 int rc = GNTST_okay;
3174 void *va;
3175 unsigned long gmfn, mfn;
3176 struct page_info *page;
3177 u32 type;
3178 l1_pgentry_t ol1e;
3179 struct domain *d = v->domain;
3181 ASSERT(domain_is_locked(d));
3183 adjust_guest_l1e(nl1e, d);
3185 gmfn = pte_addr >> PAGE_SHIFT;
3186 mfn = gmfn_to_mfn(d, gmfn);
3188 if ( unlikely(!get_page_from_pagenr(mfn, current->domain)) )
3190 MEM_LOG("Could not get page for normal update");
3191 return GNTST_general_error;
3194 va = map_domain_page(mfn);
3195 va = (void *)((unsigned long)va + ((unsigned long)pte_addr & ~PAGE_MASK));
3196 page = mfn_to_page(mfn);
3198 type = page->u.inuse.type_info & PGT_type_mask;
3199 if ( (type != PGT_l1_page_table) || !get_page_type(page, type) )
3201 MEM_LOG("Grant map attempted to update a non-L1 page");
3202 rc = GNTST_general_error;
3203 goto failed;
3206 page_lock(page);
3208 ol1e = *(l1_pgentry_t *)va;
3209 if ( !UPDATE_ENTRY(l1, (l1_pgentry_t *)va, ol1e, nl1e, mfn, v, 0) )
3211 page_unlock(page);
3212 put_page_type(page);
3213 rc = GNTST_general_error;
3214 goto failed;
3217 page_unlock(page);
3219 if ( !paging_mode_refcounts(d) )
3220 put_page_from_l1e(ol1e, d);
3222 put_page_type(page);
3224 failed:
3225 unmap_domain_page(va);
3226 put_page(page);
3228 return rc;
3231 static int destroy_grant_pte_mapping(
3232 uint64_t addr, unsigned long frame, struct domain *d)
3234 int rc = GNTST_okay;
3235 void *va;
3236 unsigned long gmfn, mfn;
3237 struct page_info *page;
3238 u32 type;
3239 l1_pgentry_t ol1e;
3241 gmfn = addr >> PAGE_SHIFT;
3242 mfn = gmfn_to_mfn(d, gmfn);
3244 if ( unlikely(!get_page_from_pagenr(mfn, current->domain)) )
3246 MEM_LOG("Could not get page for normal update");
3247 return GNTST_general_error;
3250 va = map_domain_page(mfn);
3251 va = (void *)((unsigned long)va + ((unsigned long)addr & ~PAGE_MASK));
3252 page = mfn_to_page(mfn);
3254 type = page->u.inuse.type_info & PGT_type_mask;
3255 if ( (type != PGT_l1_page_table) || !get_page_type(page, type) )
3257 MEM_LOG("Grant map attempted to update a non-L1 page");
3258 rc = GNTST_general_error;
3259 goto failed;
3262 page_lock(page);
3264 ol1e = *(l1_pgentry_t *)va;
3266 /* Check that the virtual address supplied is actually mapped to frame. */
3267 if ( unlikely(l1e_get_pfn(ol1e) != frame) )
3269 page_unlock(page);
3270 MEM_LOG("PTE entry %lx for address %"PRIx64" doesn't match frame %lx",
3271 (unsigned long)l1e_get_intpte(ol1e), addr, frame);
3272 put_page_type(page);
3273 rc = GNTST_general_error;
3274 goto failed;
3277 /* Delete pagetable entry. */
3278 if ( unlikely(!UPDATE_ENTRY
3279 (l1,
3280 (l1_pgentry_t *)va, ol1e, l1e_empty(), mfn,
3281 d->vcpu[0] /* Change if we go to per-vcpu shadows. */,
3282 0)) )
3284 page_unlock(page);
3285 MEM_LOG("Cannot delete PTE entry at %p", va);
3286 put_page_type(page);
3287 rc = GNTST_general_error;
3288 goto failed;
3291 page_unlock(page);
3292 put_page_type(page);
3294 failed:
3295 unmap_domain_page(va);
3296 put_page(page);
3297 return rc;
3301 static int create_grant_va_mapping(
3302 unsigned long va, l1_pgentry_t nl1e, struct vcpu *v)
3304 l1_pgentry_t *pl1e, ol1e;
3305 struct domain *d = v->domain;
3306 unsigned long gl1mfn;
3307 struct page_info *l1pg;
3308 int okay;
3310 ASSERT(domain_is_locked(d));
3312 adjust_guest_l1e(nl1e, d);
3314 pl1e = guest_map_l1e(v, va, &gl1mfn);
3315 if ( !pl1e )
3317 MEM_LOG("Could not find L1 PTE for address %lx", va);
3318 return GNTST_general_error;
3320 l1pg = mfn_to_page(gl1mfn);
3321 page_lock(l1pg);
3322 ol1e = *pl1e;
3323 okay = UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, v, 0);
3324 page_unlock(l1pg);
3325 guest_unmap_l1e(v, pl1e);
3326 pl1e = NULL;
3328 if ( !okay )
3329 return GNTST_general_error;
3331 if ( !paging_mode_refcounts(d) )
3332 put_page_from_l1e(ol1e, d);
3334 return GNTST_okay;
3337 static int replace_grant_va_mapping(
3338 unsigned long addr, unsigned long frame, l1_pgentry_t nl1e, struct vcpu *v)
3340 l1_pgentry_t *pl1e, ol1e;
3341 unsigned long gl1mfn;
3342 struct page_info *l1pg;
3343 int rc = 0;
3345 pl1e = guest_map_l1e(v, addr, &gl1mfn);
3346 if ( !pl1e )
3348 MEM_LOG("Could not find L1 PTE for address %lx", addr);
3349 return GNTST_general_error;
3352 l1pg = mfn_to_page(gl1mfn);
3353 page_lock(l1pg);
3354 ol1e = *pl1e;
3356 /* Check that the virtual address supplied is actually mapped to frame. */
3357 if ( unlikely(l1e_get_pfn(ol1e) != frame) )
3359 page_unlock(l1pg);
3360 MEM_LOG("PTE entry %lx for address %lx doesn't match frame %lx",
3361 l1e_get_pfn(ol1e), addr, frame);
3362 rc = GNTST_general_error;
3363 goto out;
3366 /* Delete pagetable entry. */
3367 if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, v, 0)) )
3369 page_unlock(l1pg);
3370 MEM_LOG("Cannot delete PTE entry at %p", (unsigned long *)pl1e);
3371 rc = GNTST_general_error;
3372 goto out;
3375 page_unlock(l1pg);
3377 out:
3378 guest_unmap_l1e(v, pl1e);
3379 return rc;
3382 static int destroy_grant_va_mapping(
3383 unsigned long addr, unsigned long frame, struct vcpu *v)
3385 return replace_grant_va_mapping(addr, frame, l1e_empty(), v);
3388 int create_grant_host_mapping(uint64_t addr, unsigned long frame,
3389 unsigned int flags, unsigned int cache_flags)
3391 l1_pgentry_t pte = l1e_from_pfn(frame, GRANT_PTE_FLAGS);
3393 if ( (flags & GNTMAP_application_map) )
3394 l1e_add_flags(pte,_PAGE_USER);
3395 if ( !(flags & GNTMAP_readonly) )
3396 l1e_add_flags(pte,_PAGE_RW);
3398 l1e_add_flags(pte,
3399 ((flags >> _GNTMAP_guest_avail0) * _PAGE_AVAIL0)
3400 & _PAGE_AVAIL);
3402 l1e_add_flags(pte, cacheattr_to_pte_flags(cache_flags >> 5));
3404 if ( flags & GNTMAP_contains_pte )
3405 return create_grant_pte_mapping(addr, pte, current);
3406 return create_grant_va_mapping(addr, pte, current);
3409 int replace_grant_host_mapping(
3410 uint64_t addr, unsigned long frame, uint64_t new_addr, unsigned int flags)
3412 struct vcpu *curr = current;
3413 l1_pgentry_t *pl1e, ol1e;
3414 unsigned long gl1mfn;
3415 struct page_info *l1pg;
3416 int rc;
3418 if ( flags & GNTMAP_contains_pte )
3420 if ( !new_addr )
3421 return destroy_grant_pte_mapping(addr, frame, curr->domain);
3423 MEM_LOG("Unsupported grant table operation");
3424 return GNTST_general_error;
3427 if ( !new_addr )
3428 return destroy_grant_va_mapping(addr, frame, curr);
3430 pl1e = guest_map_l1e(curr, new_addr, &gl1mfn);
3431 if ( !pl1e )
3433 MEM_LOG("Could not find L1 PTE for address %lx",
3434 (unsigned long)new_addr);
3435 return GNTST_general_error;
3438 l1pg = mfn_to_page(gl1mfn);
3439 page_lock(l1pg);
3440 ol1e = *pl1e;
3442 if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, l1e_empty(),
3443 gl1mfn, curr, 0)) )
3445 page_unlock(l1pg);
3446 MEM_LOG("Cannot delete PTE entry at %p", (unsigned long *)pl1e);
3447 guest_unmap_l1e(curr, pl1e);
3448 return GNTST_general_error;
3451 page_unlock(l1pg);
3452 guest_unmap_l1e(curr, pl1e);
3454 rc = replace_grant_va_mapping(addr, frame, ol1e, curr);
3455 if ( rc && !paging_mode_refcounts(curr->domain) )
3456 put_page_from_l1e(ol1e, curr->domain);
3458 return rc;
3461 int steal_page(
3462 struct domain *d, struct page_info *page, unsigned int memflags)
3464 u32 _d, _nd, x, y;
3466 spin_lock(&d->page_alloc_lock);
3468 /*
3469 * The tricky bit: atomically release ownership while there is just one
3470 * benign reference to the page (PGC_allocated). If that reference
3471 * disappears then the deallocation routine will safely spin.
3472 */
3473 _d = pickle_domptr(d);
3474 _nd = page->u.inuse._domain;
3475 y = page->count_info;
3476 do {
3477 x = y;
3478 if ( unlikely((x & (PGC_count_mask|PGC_allocated)) !=
3479 (1 | PGC_allocated)) || unlikely(_nd != _d) )
3481 MEM_LOG("gnttab_transfer: Bad page %p: ed=%p(%u), sd=%p,"
3482 " caf=%08x, taf=%" PRtype_info "\n",
3483 (void *) page_to_mfn(page),
3484 d, d->domain_id, unpickle_domptr(_nd), x,
3485 page->u.inuse.type_info);
3486 spin_unlock(&d->page_alloc_lock);
3487 return -1;
3489 asm volatile (
3490 LOCK_PREFIX "cmpxchg8b %2"
3491 : "=d" (_nd), "=a" (y),
3492 "=m" (*(volatile u64 *)(&page->count_info))
3493 : "0" (_d), "1" (x), "c" (NULL), "b" (x) );
3494 } while (unlikely(_nd != _d) || unlikely(y != x));
3496 /*
3497 * Unlink from 'd'. At least one reference remains (now anonymous), so
3498 * no one else is spinning to try to delete this page from 'd'.
3499 */
3500 if ( !(memflags & MEMF_no_refcount) )
3501 d->tot_pages--;
3502 list_del(&page->list);
3504 spin_unlock(&d->page_alloc_lock);
3506 return 0;
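/*
 * steal_page() uses the same 64-bit compare-and-exchange trick as
 * get_page(): ownership is transferred away from 'd' only if the page still
 * has exactly the single PGC_allocated reference and the expected owner at
 * the instant of the exchange.
 */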
3509 int do_update_va_mapping(unsigned long va, u64 val64,
3510 unsigned long flags)
3512 l1_pgentry_t val = l1e_from_intpte(val64);
3513 struct vcpu *v = current;
3514 struct domain *d = v->domain;
3515 l1_pgentry_t *pl1e;
3516 unsigned long vmask, bmap_ptr, gl1mfn;
3517 cpumask_t pmask;
3518 int rc = 0;
3520 perfc_incr(calls_to_update_va);
3522 if ( unlikely(!access_ok(va, 1) && !paging_mode_external(d)) )
3523 return -EINVAL;
3525 rc = xsm_update_va_mapping(d, FOREIGNDOM, val);
3526 if ( rc )
3527 return rc;
3529 pl1e = guest_map_l1e(v, va, &gl1mfn);
3531 if ( unlikely(!pl1e || !mod_l1_entry(pl1e, val, gl1mfn, 0)) )
3532 rc = -EINVAL;
3534 if ( pl1e )
3535 guest_unmap_l1e(v, pl1e);
3536 pl1e = NULL;
3538 process_deferred_ops();
3540 switch ( flags & UVMF_FLUSHTYPE_MASK )
3542 case UVMF_TLB_FLUSH:
3543 switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
3545 case UVMF_LOCAL:
3546 flush_tlb_local();
3547 break;
3548 case UVMF_ALL:
3549 flush_tlb_mask(d->domain_dirty_cpumask);
3550 break;
3551 default:
3552 if ( unlikely(!is_pv_32on64_domain(d) ?
3553 get_user(vmask, (unsigned long *)bmap_ptr) :
3554 get_user(vmask, (unsigned int *)bmap_ptr)) )
3555 rc = -EFAULT;
3556 pmask = vcpumask_to_pcpumask(d, vmask);
3557 flush_tlb_mask(pmask);
3558 break;
3560 break;
3562 case UVMF_INVLPG:
3563 switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
3565 case UVMF_LOCAL:
3566 if ( !paging_mode_enabled(d) ||
3567 (paging_invlpg(v, va) != 0) )
3568 flush_tlb_one_local(va);
3569 break;
3570 case UVMF_ALL:
3571 flush_tlb_one_mask(d->domain_dirty_cpumask, va);
3572 break;
3573 default:
3574 if ( unlikely(!is_pv_32on64_domain(d) ?
3575 get_user(vmask, (unsigned long *)bmap_ptr) :
3576 get_user(vmask, (unsigned int *)bmap_ptr)) )
3577 rc = -EFAULT;
3578 pmask = vcpumask_to_pcpumask(d, vmask);
3579 flush_tlb_one_mask(pmask, va);
3580 break;
3582 break;
3585 return rc;
3588 int do_update_va_mapping_otherdomain(unsigned long va, u64 val64,
3589 unsigned long flags,
3590 domid_t domid)
3592 int rc;
3594 if ( !set_foreigndom(domid) )
3595 return -ESRCH;
3597 rc = do_update_va_mapping(va, val64, flags);
3599 BUG_ON(this_cpu(percpu_mm_info).deferred_ops);
3600 process_deferred_ops(); /* only to clear foreigndom */
3602 return rc;
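/*
 * The UVMF_* flags accepted above select the TLB maintenance performed after
 * the PTE update: no flush, a full TLB flush, or a single-entry INVLPG, each
 * applied locally, to all dirty CPUs, or to an explicit VCPU bitmap read
 * from guest memory.
 */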
3607 /*************************
3608 * Descriptor Tables
3609 */
3611 void destroy_gdt(struct vcpu *v)
3613 int i;
3614 unsigned long pfn;
3616 v->arch.guest_context.gdt_ents = 0;
3617 for ( i = 0; i < FIRST_RESERVED_GDT_PAGE; i++ )
3619 if ( (pfn = l1e_get_pfn(v->arch.perdomain_ptes[i])) != 0 )
3620 put_page_and_type(mfn_to_page(pfn));
3621 l1e_write(&v->arch.perdomain_ptes[i], l1e_empty());
3622 v->arch.guest_context.gdt_frames[i] = 0;
3627 long set_gdt(struct vcpu *v,
3628 unsigned long *frames,
3629 unsigned int entries)
3631 struct domain *d = v->domain;
3632 /* NB. There are 512 8-byte entries per GDT page. */
3633 int i, nr_pages = (entries + 511) / 512;
3634 unsigned long mfn;
3636 if ( entries > FIRST_RESERVED_GDT_ENTRY )
3637 return -EINVAL;
3639 /* Check the pages in the new GDT. */
3640 for ( i = 0; i < nr_pages; i++ )
3642 mfn = frames[i] = gmfn_to_mfn(d, frames[i]);
3643 if ( !mfn_valid(mfn) ||
3644 !get_page_and_type(mfn_to_page(mfn), d, PGT_seg_desc_page) )
3645 goto fail;
3648 /* Tear down the old GDT. */
3649 destroy_gdt(v);
3651 /* Install the new GDT. */
3652 v->arch.guest_context.gdt_ents = entries;
3653 for ( i = 0; i < nr_pages; i++ )
3655 v->arch.guest_context.gdt_frames[i] = frames[i];
3656 l1e_write(&v->arch.perdomain_ptes[i],
3657 l1e_from_pfn(frames[i], __PAGE_HYPERVISOR));
3660 return 0;
3662 fail:
3663 while ( i-- > 0 )
3664 put_page_and_type(mfn_to_page(frames[i]));
3665 return -EINVAL;
3669 long do_set_gdt(XEN_GUEST_HANDLE(ulong) frame_list, unsigned int entries)
3671 int nr_pages = (entries + 511) / 512;
3672 unsigned long frames[16];
3673 struct vcpu *curr = current;
3674 long ret;
3676 /* Rechecked in set_gdt, but ensures a sane limit for copy_from_guest(). */
3677 if ( entries > FIRST_RESERVED_GDT_ENTRY )
3678 return -EINVAL;
3680 if ( copy_from_guest(frames, frame_list, nr_pages) )
3681 return -EFAULT;
3683 domain_lock(curr->domain);
3685 if ( (ret = set_gdt(curr, frames, entries)) == 0 )
3686 flush_tlb_local();
3688 domain_unlock(curr->domain);
3690 return ret;
3694 long do_update_descriptor(u64 pa, u64 desc)
3696 struct domain *dom = current->domain;
3697 unsigned long gmfn = pa >> PAGE_SHIFT;
3698 unsigned long mfn;
3699 unsigned int offset;
3700 struct desc_struct *gdt_pent, d;
3701 struct page_info *page;
3702 long ret = -EINVAL;
3704 offset = ((unsigned int)pa & ~PAGE_MASK) / sizeof(struct desc_struct);
3706 *(u64 *)&d = desc;
3708 mfn = gmfn_to_mfn(dom, gmfn);
3709 if ( (((unsigned int)pa % sizeof(struct desc_struct)) != 0) ||
3710 !mfn_valid(mfn) ||
3711 !check_descriptor(dom, &d) )
3712 return -EINVAL;
3714 page = mfn_to_page(mfn);
3715 if ( unlikely(!get_page(page, dom)) )
3716 return -EINVAL;
3718 /* Check if the given frame is in use in an unsafe context. */
3719 switch ( page->u.inuse.type_info & PGT_type_mask )
3721 case PGT_seg_desc_page:
3722 if ( unlikely(!get_page_type(page, PGT_seg_desc_page)) )
3723 goto out;
3724 break;
3725 default:
3726 if ( unlikely(!get_page_type(page, PGT_writable_page)) )
3727 goto out;
3728 break;
3731 paging_mark_dirty(dom, mfn);
3733 /* All is good so make the update. */
3734 gdt_pent = map_domain_page(mfn);
3735 atomic_write64((uint64_t *)&gdt_pent[offset], *(uint64_t *)&d);
3736 unmap_domain_page(gdt_pent);
3738 put_page_type(page);
3740 ret = 0; /* success */
3742 out:
3743 put_page(page);
3745 return ret;
3748 typedef struct e820entry e820entry_t;
3749 DEFINE_XEN_GUEST_HANDLE(e820entry_t);
3751 long arch_memory_op(int op, XEN_GUEST_HANDLE(void) arg)
3753 struct page_info *page = NULL;
3754 int rc;
3756 switch ( op )
3758 case XENMEM_add_to_physmap:
3760 struct xen_add_to_physmap xatp;
3761 unsigned long prev_mfn, mfn = 0, gpfn;
3762 struct domain *d;
3764 if ( copy_from_guest(&xatp, arg, 1) )
3765 return -EFAULT;
3767 rc = rcu_lock_target_domain_by_id(xatp.domid, &d);
3768 if ( rc != 0 )
3769 return rc;
3771 if ( xsm_add_to_physmap(current->domain, d) )
3773 rcu_unlock_domain(d);
3774 return -EPERM;
3777 switch ( xatp.space )
3779 case XENMAPSPACE_shared_info:
3780 if ( xatp.idx == 0 )
3781 mfn = virt_to_mfn(d->shared_info);
3782 break;
3783 case XENMAPSPACE_grant_table:
3784 spin_lock(&d->grant_table->lock);
3786 if ( (xatp.idx >= nr_grant_frames(d->grant_table)) &&
3787 (xatp.idx < max_nr_grant_frames) )
3788 gnttab_grow_table(d, xatp.idx + 1);
3790 if ( xatp.idx < nr_grant_frames(d->grant_table) )
3791 mfn = virt_to_mfn(d->grant_table->shared[xatp.idx]);
3793 spin_unlock(&d->grant_table->lock);
3794 break;
3795 case XENMAPSPACE_mfn:
3797 if ( get_page_from_pagenr(xatp.idx, d) ) {
3798 mfn = xatp.idx;
3799 page = mfn_to_page(mfn);
3801 break;
3803 default:
3804 break;
3807 if ( !paging_mode_translate(d) || (mfn == 0) )
3809 if ( page )
3810 put_page(page);
3811 rcu_unlock_domain(d);
3812 return -EINVAL;
3815 domain_lock(d);
3817 /* Remove previously mapped page if it was present. */
3818 prev_mfn = gmfn_to_mfn(d, xatp.gpfn);
3819 if ( mfn_valid(prev_mfn) )
3821 if ( is_xen_heap_mfn(prev_mfn) )
3822 /* Xen heap frames are simply unhooked from this phys slot. */
3823 guest_physmap_remove_page(d, xatp.gpfn, prev_mfn, 0);
3824 else
3825 /* Normal domain memory is freed, to avoid leaking memory. */
3826 guest_remove_page(d, xatp.gpfn);
3829 /* Unmap from old location, if any. */
3830 gpfn = get_gpfn_from_mfn(mfn);
3831 if ( gpfn != INVALID_M2P_ENTRY )
3832 guest_physmap_remove_page(d, gpfn, mfn, 0);
3834 /* Map at new location. */
3835 guest_physmap_add_page(d, xatp.gpfn, mfn, 0);
3837 domain_unlock(d);
3839 if ( page )
3840 put_page(page);
3842 rcu_unlock_domain(d);
3844 break;
3847 case XENMEM_remove_from_physmap:
3849 struct xen_remove_from_physmap xrfp;
3850 unsigned long mfn;
3851 struct domain *d;
3853 if ( copy_from_guest(&xrfp, arg, 1) )
3854 return -EFAULT;
3856 rc = rcu_lock_target_domain_by_id(xrfp.domid, &d);
3857 if ( rc != 0 )
3858 return rc;
3860 if ( xsm_remove_from_physmap(current->domain, d) )
3862 rcu_unlock_domain(d);
3863 return -EPERM;
3866 domain_lock(d);
3868 mfn = gmfn_to_mfn(d, xrfp.gpfn);
3870 if ( mfn_valid(mfn) )
3871 guest_physmap_remove_page(d, xrfp.gpfn, mfn, 0);
3873 domain_unlock(d);
3875 rcu_unlock_domain(d);
3877 break;
3880 case XENMEM_set_memory_map:
3882 struct xen_foreign_memory_map fmap;
3883 struct domain *d;
3884 int rc;
3886 if ( copy_from_guest(&fmap, arg, 1) )
3887 return -EFAULT;
3889 if ( fmap.map.nr_entries > ARRAY_SIZE(d->arch.e820) )
3890 return -EINVAL;
3892 rc = rcu_lock_target_domain_by_id(fmap.domid, &d);
3893 if ( rc != 0 )
3894 return rc;
3896 rc = xsm_domain_memory_map(d);
3897 if ( rc )
3899 rcu_unlock_domain(d);
3900 return rc;
3903 rc = copy_from_guest(d->arch.e820, fmap.map.buffer,
3904 fmap.map.nr_entries) ? -EFAULT : 0;
3905 d->arch.nr_e820 = fmap.map.nr_entries;
3907 rcu_unlock_domain(d);
3908 return rc;
3911 case XENMEM_memory_map:
3913 struct xen_memory_map map;
3914 struct domain *d = current->domain;
3916 /* Backwards compatibility. */
3917 if ( d->arch.nr_e820 == 0 )
3918 return -ENOSYS;
3920 if ( copy_from_guest(&map, arg, 1) )
3921 return -EFAULT;
3923 map.nr_entries = min(map.nr_entries, d->arch.nr_e820);
3924 if ( copy_to_guest(map.buffer, d->arch.e820, map.nr_entries) ||
3925 copy_to_guest(arg, &map, 1) )
3926 return -EFAULT;
3928 return 0;
3931 case XENMEM_machine_memory_map:
3933 struct xen_memory_map memmap;
3934 XEN_GUEST_HANDLE(e820entry_t) buffer;
3935 int count;
3936 int rc;
3938 if ( !IS_PRIV(current->domain) )
3939 return -EINVAL;
3941 rc = xsm_machine_memory_map();
3942 if ( rc )
3943 return rc;
3945 if ( copy_from_guest(&memmap, arg, 1) )
3946 return -EFAULT;
3947 if ( memmap.nr_entries < e820.nr_map + 1 )
3948 return -EINVAL;
3950 buffer = guest_handle_cast(memmap.buffer, e820entry_t);
3952 count = min((unsigned int)e820.nr_map, memmap.nr_entries);
3953 if ( copy_to_guest(buffer, e820.map, count) < 0 )
3954 return -EFAULT;
3956 memmap.nr_entries = count;
3958 if ( copy_to_guest(arg, &memmap, 1) )
3959 return -EFAULT;
3961 return 0;
3964 case XENMEM_machphys_mapping:
3966 static const struct xen_machphys_mapping mapping = {
3967 .v_start = MACH2PHYS_VIRT_START,
3968 .v_end = MACH2PHYS_VIRT_END,
3969 .max_mfn = MACH2PHYS_NR_ENTRIES - 1
3970 };
3972 if ( copy_to_guest(arg, &mapping, 1) )
3973 return -EFAULT;
3975 return 0;
3978 default:
3979 return subarch_memory_op(op, arg);
3982 return 0;
3986 /*************************
3987 * Writable Pagetables
3988 */
3990 struct ptwr_emulate_ctxt {
3991 struct x86_emulate_ctxt ctxt;
3992 unsigned long cr2;
3993 l1_pgentry_t pte;
3994 };
3996 static int ptwr_emulated_read(
3997 enum x86_segment seg,
3998 unsigned long offset,
3999 void *p_data,
4000 unsigned int bytes,
4001 struct x86_emulate_ctxt *ctxt)
4003 unsigned int rc;
4004 unsigned long addr = offset;
4006 if ( (rc = copy_from_user(p_data, (void *)addr, bytes)) != 0 )
4008 propagate_page_fault(addr + bytes - rc, 0); /* read fault */
4009 return X86EMUL_EXCEPTION;
4012 return X86EMUL_OKAY;
4015 static int ptwr_emulated_update(
4016 unsigned long addr,
4017 paddr_t old,
4018 paddr_t val,
4019 unsigned int bytes,
4020 unsigned int do_cmpxchg,
4021 struct ptwr_emulate_ctxt *ptwr_ctxt)
4023 unsigned long mfn;
4024 unsigned long unaligned_addr = addr;
4025 struct page_info *page;
4026 l1_pgentry_t pte, ol1e, nl1e, *pl1e;
4027 struct vcpu *v = current;
4028 struct domain *d = v->domain;
4030 /* Only allow naturally-aligned stores within the original %cr2 page. */
4031 if ( unlikely(((addr^ptwr_ctxt->cr2) & PAGE_MASK) || (addr & (bytes-1))) )
4033 MEM_LOG("ptwr_emulate: bad access (cr2=%lx, addr=%lx, bytes=%u)",
4034 ptwr_ctxt->cr2, addr, bytes);
4035 return X86EMUL_UNHANDLEABLE;
4038 /* Turn a sub-word access into a full-word access. */
4039 if ( bytes != sizeof(paddr_t) )
4041 paddr_t full;
4042 unsigned int rc, offset = addr & (sizeof(paddr_t)-1);
4044 /* Align address; read full word. */
4045 addr &= ~(sizeof(paddr_t)-1);
4046 if ( (rc = copy_from_user(&full, (void *)addr, sizeof(paddr_t))) != 0 )
4048 propagate_page_fault(addr+sizeof(paddr_t)-rc, 0); /* read fault */
4049 return X86EMUL_EXCEPTION;
4051 /* Mask out bits provided by caller. */
4052 full &= ~((((paddr_t)1 << (bytes*8)) - 1) << (offset*8));
4053 /* Shift the caller value and OR in the missing bits. */
4054 val &= (((paddr_t)1 << (bytes*8)) - 1);
4055 val <<= (offset)*8;
4056 val |= full;
4057 /* Also fill in missing parts of the cmpxchg old value. */
4058 old &= (((paddr_t)1 << (bytes*8)) - 1);
4059 old <<= (offset)*8;
4060 old |= full;
4063 pte = ptwr_ctxt->pte;
4064 mfn = l1e_get_pfn(pte);
4065 page = mfn_to_page(mfn);
4067 /* We are looking only for read-only mappings of p.t. pages. */
4068 ASSERT((l1e_get_flags(pte) & (_PAGE_RW|_PAGE_PRESENT)) == _PAGE_PRESENT);
4069 ASSERT(mfn_valid(mfn));
4070 ASSERT((page->u.inuse.type_info & PGT_type_mask) == PGT_l1_page_table);
4071 ASSERT((page->u.inuse.type_info & PGT_count_mask) != 0);
4072 ASSERT(page_get_owner(page) == d);
4074 /* Check the new PTE. */
4075 nl1e = l1e_from_intpte(val);
4076 if ( unlikely(!get_page_from_l1e(nl1e, d)) )
4078 if ( is_pv_32bit_domain(d) && (bytes == 4) && (unaligned_addr & 4) &&
4079 !do_cmpxchg && (l1e_get_flags(nl1e) & _PAGE_PRESENT) )
4081 /*
4082 * If this is an upper-half write to a PAE PTE then we assume that
4083 * the guest has simply got the two writes the wrong way round. We
4084 * zap the PRESENT bit on the assumption that the bottom half will
4085 * be written immediately after we return to the guest.
4086 */
4087 MEM_LOG("ptwr_emulate: fixing up invalid PAE PTE %"PRIpte,
4088 l1e_get_intpte(nl1e));
4089 l1e_remove_flags(nl1e, _PAGE_PRESENT);
4091 else
4093 MEM_LOG("ptwr_emulate: could not get_page_from_l1e()");
4094 return X86EMUL_UNHANDLEABLE;
4098 adjust_guest_l1e(nl1e, d);
4100 /* Checked successfully: do the update (write or cmpxchg). */
4101 pl1e = map_domain_page(mfn);
4102 pl1e = (l1_pgentry_t *)((unsigned long)pl1e + (addr & ~PAGE_MASK));
4103 if ( do_cmpxchg )
4105 int okay;
4106 intpte_t t = old;
4107 ol1e = l1e_from_intpte(old);
4109 okay = paging_cmpxchg_guest_entry(v, &l1e_get_intpte(*pl1e),
4110 &t, l1e_get_intpte(nl1e), _mfn(mfn));
4111 okay = (okay && t == old);
4113 if ( !okay )
4115 unmap_domain_page(pl1e);
4116 put_page_from_l1e(nl1e, d);
4117 return X86EMUL_CMPXCHG_FAILED;
4120 else
4122 ol1e = *pl1e;
4123 if ( !UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, mfn, v, 0) )
4124 BUG();
4127 trace_ptwr_emulation(addr, nl1e);
4129 unmap_domain_page(pl1e);
4131 /* Finally, drop the old PTE. */
4132 put_page_from_l1e(ol1e, d);
4134 return X86EMUL_OKAY;
4137 static int ptwr_emulated_write(
4138 enum x86_segment seg,
4139 unsigned long offset,
4140 void *p_data,
4141 unsigned int bytes,
4142 struct x86_emulate_ctxt *ctxt)
4144 paddr_t val = 0;
4146 if ( (bytes > sizeof(paddr_t)) || (bytes & (bytes -1)) )
4148 MEM_LOG("ptwr_emulate: bad write size (addr=%lx, bytes=%u)",
4149 offset, bytes);
4150 return X86EMUL_UNHANDLEABLE;
4153 memcpy(&val, p_data, bytes);
4155 return ptwr_emulated_update(
4156 offset, 0, val, bytes, 0,
4157 container_of(ctxt, struct ptwr_emulate_ctxt, ctxt));
4160 static int ptwr_emulated_cmpxchg(
4161 enum x86_segment seg,
4162 unsigned long offset,
4163 void *p_old,
4164 void *p_new,
4165 unsigned int bytes,
4166 struct x86_emulate_ctxt *ctxt)
4168 paddr_t old = 0, new = 0;
4170 if ( (bytes > sizeof(paddr_t)) || (bytes & (bytes -1)) )
4172 MEM_LOG("ptwr_emulate: bad cmpxchg size (addr=%lx, bytes=%u)",
4173 offset, bytes);
4174 return X86EMUL_UNHANDLEABLE;
4177 memcpy(&old, p_old, bytes);
4178 memcpy(&new, p_new, bytes);
4180 return ptwr_emulated_update(
4181 offset, old, new, bytes, 1,
4182 container_of(ctxt, struct ptwr_emulate_ctxt, ctxt));
4185 static struct x86_emulate_ops ptwr_emulate_ops = {
4186 .read = ptwr_emulated_read,
4187 .insn_fetch = ptwr_emulated_read,
4188 .write = ptwr_emulated_write,
4189 .cmpxchg = ptwr_emulated_cmpxchg,
4190 };
4192 /* Write page fault handler: check if guest is trying to modify a PTE. */
4193 int ptwr_do_page_fault(struct vcpu *v, unsigned long addr,
4194 struct cpu_user_regs *regs)
4196 struct domain *d = v->domain;
4197 struct page_info *page;
4198 l1_pgentry_t pte;
4199 struct ptwr_emulate_ctxt ptwr_ctxt;
4200 int rc;
4202 /* Attempt to read the PTE that maps the VA being accessed. */
4203 guest_get_eff_l1e(v, addr, &pte);
4204 page = l1e_get_page(pte);
4206 /* We are looking only for read-only mappings of p.t. pages. */
4207 if ( ((l1e_get_flags(pte) & (_PAGE_PRESENT|_PAGE_RW)) != _PAGE_PRESENT) ||
4208 !mfn_valid(l1e_get_pfn(pte)) ||
4209 ((page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table) ||
4210 ((page->u.inuse.type_info & PGT_count_mask) == 0) ||
4211 (page_get_owner(page) != d) )
4212 goto bail;
4214 ptwr_ctxt.ctxt.regs = regs;
4215 ptwr_ctxt.ctxt.force_writeback = 0;
4216 ptwr_ctxt.ctxt.addr_size = ptwr_ctxt.ctxt.sp_size =
4217 is_pv_32on64_domain(d) ? 32 : BITS_PER_LONG;
4218 ptwr_ctxt.cr2 = addr;
4219 ptwr_ctxt.pte = pte;
4221 page_lock(page);
4222 rc = x86_emulate(&ptwr_ctxt.ctxt, &ptwr_emulate_ops);
4223 page_unlock(page);
4224 if ( rc == X86EMUL_UNHANDLEABLE )
4225 goto bail;
4227 perfc_incr(ptwr_emulations);
4228 return EXCRET_fault_fixed;
4230 bail:
4231 return 0;
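/*
 * Writable-pagetable emulation in outline: a write fault on a read-only
 * mapping of a validated L1 page table is handed to x86_emulate() with the
 * ptwr_emulate_ops above; the emulated store is re-checked via
 * get_page_from_l1e() and applied with the page lock held.
 */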
4234 void free_xen_pagetable(void *v)
4236 extern int early_boot;
4238 if ( early_boot )
4239 return;
4241 if ( is_xen_heap_page(virt_to_page(v)) )
4242 free_xenheap_page(v);
4243 else
4244 free_domheap_page(virt_to_page(v));
4247 /* Convert to/from superpage-mapping flags for map_pages_to_xen(). */
4248 #define l1f_to_lNf(f) (((f) & _PAGE_PRESENT) ? ((f) | _PAGE_PSE) : (f))
4249 #define lNf_to_l1f(f) (((f) & _PAGE_PRESENT) ? ((f) & ~_PAGE_PSE) : (f))
4251 /*
4252 * map_pages_to_xen() can be called with interrupts disabled:
4253 * * During early bootstrap; or
4254 * * alloc_xenheap_pages() via memguard_guard_range
4255 * In these cases it is safe to use flush_area_local():
4256 * * Because only the local CPU is online; or
4257 * * Because stale TLB entries do not matter for memguard_[un]guard_range().
4258 */
4259 #define flush_area(v,f) (!local_irq_is_enabled() ? \
4260 flush_area_local((const void *)v, f) : \
4261 flush_area_all((const void *)v, f))
4263 int map_pages_to_xen(
4264 unsigned long virt,
4265 unsigned long mfn,
4266 unsigned long nr_mfns,
4267 unsigned int flags)
4269 l2_pgentry_t *pl2e, ol2e;
4270 l1_pgentry_t *pl1e, ol1e;
4271 unsigned int i;
4273 while ( nr_mfns != 0 )
4275 #ifdef __x86_64__
4276 l3_pgentry_t *pl3e = virt_to_xen_l3e(virt);
4277 l3_pgentry_t ol3e = *pl3e;
4279 if ( cpu_has_page1gb &&
4280 !(((virt >> PAGE_SHIFT) | mfn) &
4281 ((1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1)) &&
4282 nr_mfns >= (1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) &&
4283 !(flags & (_PAGE_PAT | MAP_SMALL_PAGES)) )
4285 /* 1GB-page mapping. */
4286 l3e_write_atomic(pl3e, l3e_from_pfn(mfn, l1f_to_lNf(flags)));
4288 if ( (l3e_get_flags(ol3e) & _PAGE_PRESENT) )
4290 unsigned int flush_flags =
4291 FLUSH_TLB | FLUSH_ORDER(2 * PAGETABLE_ORDER);
4293 if ( l3e_get_flags(ol3e) & _PAGE_PSE )
4295 if ( l3e_get_flags(ol3e) & _PAGE_GLOBAL )
4296 flush_flags |= FLUSH_TLB_GLOBAL;
4297 if ( (lNf_to_l1f(l3e_get_flags(ol3e)) ^ flags) &
4298 PAGE_CACHE_ATTRS )
4299 flush_flags |= FLUSH_CACHE;
4300 flush_area(virt, flush_flags);
4302 else
4304 pl2e = l3e_to_l2e(ol3e);
4305 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
4307 ol2e = pl2e[i];
4308 if ( !(l2e_get_flags(ol2e) & _PAGE_PRESENT) )
4309 continue;
4310 if ( l2e_get_flags(ol2e) & _PAGE_PSE )
4312 if ( l2e_get_flags(ol2e) & _PAGE_GLOBAL )
4313 flush_flags |= FLUSH_TLB_GLOBAL;
4314 if ( (lNf_to_l1f(l2e_get_flags(ol2e)) ^ flags) &
4315 PAGE_CACHE_ATTRS )
4316 flush_flags |= FLUSH_CACHE;
4318 else
4320 unsigned int j;
4322 pl1e = l2e_to_l1e(ol2e);
4323 for ( j = 0; j < L1_PAGETABLE_ENTRIES; j++ )
4325 ol1e = pl1e[j];
4326 if ( l1e_get_flags(ol1e) & _PAGE_GLOBAL )
4327 flush_flags |= FLUSH_TLB_GLOBAL;
4328 if ( (l1e_get_flags(ol1e) ^ flags) &
4329 PAGE_CACHE_ATTRS )
4330 flush_flags |= FLUSH_CACHE;
4334 flush_area(virt, flush_flags);
4335 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
4337 ol2e = pl2e[i];
4338 if ( (l2e_get_flags(ol2e) & _PAGE_PRESENT) &&
4339 !(l2e_get_flags(ol2e) & _PAGE_PSE) )
4340 free_xen_pagetable(l2e_to_l1e(ol2e));
4342 free_xen_pagetable(pl2e);
4346 virt += 1UL << L3_PAGETABLE_SHIFT;
4347 mfn += 1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT);
4348 nr_mfns -= 1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT);
4349 continue;
4352 if ( (l3e_get_flags(ol3e) & _PAGE_PRESENT) &&
4353 (l3e_get_flags(ol3e) & _PAGE_PSE) )
4355 unsigned int flush_flags =
4356 FLUSH_TLB | FLUSH_ORDER(2 * PAGETABLE_ORDER);
4358 /* Skip this PTE if there is no change. */
4359 if ( ((l3e_get_pfn(ol3e) & ~(L2_PAGETABLE_ENTRIES *
4360 L1_PAGETABLE_ENTRIES - 1)) +
4361 (l2_table_offset(virt) << PAGETABLE_ORDER) +
4362 l1_table_offset(virt) == mfn) &&
4363 ((lNf_to_l1f(l3e_get_flags(ol3e)) ^ flags) &
4364 ~(_PAGE_ACCESSED|_PAGE_DIRTY)) == 0 )
4366 /* We can skip to end of L3 superpage if we got a match. */
4367 i = (1 << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) -
4368 (mfn & ((1 << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1));
4369 if ( i > nr_mfns )
4370 i = nr_mfns;
4371 virt += i << PAGE_SHIFT;
4372 mfn += i;
4373 nr_mfns -= i;
4374 continue;
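/*
 * No match: shatter the existing 1GB superpage into a freshly allocated
 * L2 table of 2MB entries so the affected sub-range can be remapped.
 */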
4377 pl2e = alloc_xen_pagetable();
4378 if ( pl2e == NULL )
4379 return -ENOMEM;
4381 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
4382 l2e_write(pl2e + i,
4383 l2e_from_pfn(l3e_get_pfn(ol3e) +
4384 (i << PAGETABLE_ORDER),
4385 l3e_get_flags(ol3e)));
4387 if ( l3e_get_flags(ol3e) & _PAGE_GLOBAL )
4388 flush_flags |= FLUSH_TLB_GLOBAL;
4390 l3e_write_atomic(pl3e, l3e_from_pfn(virt_to_mfn(pl2e),
4391 __PAGE_HYPERVISOR));
4392 flush_area(virt, flush_flags);
4394 #endif
4396 pl2e = virt_to_xen_l2e(virt);
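/*
 * Use a 2MB superpage when virt and mfn are 2MB-aligned, at least a full
 * superpage worth of frames remains, and the caller requested neither
 * PAT-based attributes nor MAP_SMALL_PAGES.
 */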
4398 if ( ((((virt>>PAGE_SHIFT) | mfn) & ((1<<PAGETABLE_ORDER)-1)) == 0) &&
4399 (nr_mfns >= (1<<PAGETABLE_ORDER)) &&
4400 !(flags & (_PAGE_PAT|MAP_SMALL_PAGES)) )
4402 /* Super-page mapping. */
4403 ol2e = *pl2e;
4404 l2e_write_atomic(pl2e, l2e_from_pfn(mfn, l1f_to_lNf(flags)));
4406 if ( (l2e_get_flags(ol2e) & _PAGE_PRESENT) )
4408 unsigned int flush_flags =
4409 FLUSH_TLB | FLUSH_ORDER(PAGETABLE_ORDER);
4411 if ( l2e_get_flags(ol2e) & _PAGE_PSE )
4413 if ( l2e_get_flags(ol2e) & _PAGE_GLOBAL )
4414 flush_flags |= FLUSH_TLB_GLOBAL;
4415 if ( (lNf_to_l1f(l2e_get_flags(ol2e)) ^ flags) &
4416 PAGE_CACHE_ATTRS )
4417 flush_flags |= FLUSH_CACHE;
4418 flush_area(virt, flush_flags);
4420 else
4422 pl1e = l2e_to_l1e(ol2e);
4423 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
4425 if ( l1e_get_flags(pl1e[i]) & _PAGE_GLOBAL )
4426 flush_flags |= FLUSH_TLB_GLOBAL;
4427 if ( (l1e_get_flags(pl1e[i]) ^ flags) &
4428 PAGE_CACHE_ATTRS )
4429 flush_flags |= FLUSH_CACHE;
4431 flush_area(virt, flush_flags);
4432 free_xen_pagetable(pl1e);
4436 virt += 1UL << L2_PAGETABLE_SHIFT;
4437 mfn += 1UL << PAGETABLE_ORDER;
4438 nr_mfns -= 1UL << PAGETABLE_ORDER;
4440 else
4442 /* Normal page mapping. */
4443 if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) )
4445 pl1e = alloc_xen_pagetable();
4446 if ( pl1e == NULL )
4447 return -ENOMEM;
4448 clear_page(pl1e);
4449 l2e_write(pl2e, l2e_from_pfn(virt_to_mfn(pl1e),
4450 __PAGE_HYPERVISOR));
4452 else if ( l2e_get_flags(*pl2e) & _PAGE_PSE )
4454 unsigned int flush_flags =
4455 FLUSH_TLB | FLUSH_ORDER(PAGETABLE_ORDER);
4457 /* Skip this PTE if there is no change. */
4458 if ( (((l2e_get_pfn(*pl2e) & ~(L1_PAGETABLE_ENTRIES - 1)) +
4459 l1_table_offset(virt)) == mfn) &&
4460 (((lNf_to_l1f(l2e_get_flags(*pl2e)) ^ flags) &
4461 ~(_PAGE_ACCESSED|_PAGE_DIRTY)) == 0) )
4463 /* We can skip to end of L2 superpage if we got a match. */
4464 i = (1 << (L2_PAGETABLE_SHIFT - PAGE_SHIFT)) -
4465 (mfn & ((1 << (L2_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1));
4466 if ( i > nr_mfns )
4467 i = nr_mfns;
4468 virt += i << L1_PAGETABLE_SHIFT;
4469 mfn += i;
4470 nr_mfns -= i;
4471 goto check_l3;
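/*
 * Otherwise shatter the existing 2MB superpage into a new L1 table so
 * that individual 4kB entries can be rewritten below.
 */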
4474 pl1e = alloc_xen_pagetable();
4475 if ( pl1e == NULL )
4476 return -ENOMEM;
4478 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
4479 l1e_write(&pl1e[i],
4480 l1e_from_pfn(l2e_get_pfn(*pl2e) + i,
4481 lNf_to_l1f(l2e_get_flags(*pl2e))));
4483 if ( l2e_get_flags(*pl2e) & _PAGE_GLOBAL )
4484 flush_flags |= FLUSH_TLB_GLOBAL;
4486 l2e_write_atomic(pl2e, l2e_from_pfn(virt_to_mfn(pl1e),
4487 __PAGE_HYPERVISOR));
4488 flush_area(virt, flush_flags);
4491 pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(virt);
4492 ol1e = *pl1e;
4493 l1e_write_atomic(pl1e, l1e_from_pfn(mfn, flags));
4494 if ( (l1e_get_flags(ol1e) & _PAGE_PRESENT) )
4496 unsigned int flush_flags = FLUSH_TLB | FLUSH_ORDER(0);
4497 if ( l1e_get_flags(ol1e) & _PAGE_GLOBAL )
4498 flush_flags |= FLUSH_TLB_GLOBAL;
4499 if ( (l1e_get_flags(ol1e) ^ flags) & PAGE_CACHE_ATTRS )
4500 flush_flags |= FLUSH_CACHE;
4501 flush_area(virt, flush_flags);
4504 virt += 1UL << L1_PAGETABLE_SHIFT;
4505 mfn += 1UL;
4506 nr_mfns -= 1UL;
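/*
 * Opportunistic re-coalescing: when virt and mfn reach a 2MB boundary (or
 * the request is complete), and only for plain PAGE_HYPERVISOR mappings,
 * check whether the L1 table now maps 512 contiguous frames with identical
 * flags; if so, install a 2MB superpage in the L2 entry and free the
 * redundant L1 table.
 */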
4508 if ( (flags == PAGE_HYPERVISOR) &&
4509 ((nr_mfns == 0) ||
4510 ((((virt >> PAGE_SHIFT) | mfn) &
4511 ((1 << PAGETABLE_ORDER) - 1)) == 0)) )
4513 unsigned long base_mfn;
4514 pl1e = l2e_to_l1e(*pl2e);
4515 base_mfn = l1e_get_pfn(*pl1e) & ~(L1_PAGETABLE_ENTRIES - 1);
4516 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++, pl1e++ )
4517 if ( (l1e_get_pfn(*pl1e) != (base_mfn + i)) ||
4518 (l1e_get_flags(*pl1e) != flags) )
4519 break;
4520 if ( i == L1_PAGETABLE_ENTRIES )
4522 ol2e = *pl2e;
4523 l2e_write_atomic(pl2e, l2e_from_pfn(base_mfn,
4524 l1f_to_lNf(flags)));
4525 flush_area(virt - PAGE_SIZE,
4526 FLUSH_TLB_GLOBAL |
4527 FLUSH_ORDER(PAGETABLE_ORDER));
4528 free_xen_pagetable(l2e_to_l1e(ol2e));
4533 check_l3: ;
4534 #ifdef __x86_64__
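/*
 * Similarly, when 1GB pages are supported, try to re-coalesce 512
 * contiguous, identically mapped 2MB entries back into a single 1GB
 * superpage and free the now-redundant L2 table.
 */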
4535 if ( cpu_has_page1gb &&
4536 (flags == PAGE_HYPERVISOR) &&
4537 ((nr_mfns == 0) ||
4538 !(((virt >> PAGE_SHIFT) | mfn) &
4539 ((1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1))) )
4541 unsigned long base_mfn;
4543 ol3e = *pl3e;
4544 pl2e = l3e_to_l2e(ol3e);
4545 base_mfn = l2e_get_pfn(*pl2e) & ~(L2_PAGETABLE_ENTRIES *
4546 L1_PAGETABLE_ENTRIES - 1);
4547 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++, pl2e++ )
4548 if ( (l2e_get_pfn(*pl2e) !=
4549 (base_mfn + (i << PAGETABLE_ORDER))) ||
4550 (l2e_get_flags(*pl2e) != l1f_to_lNf(flags)) )
4551 break;
4552 if ( i == L2_PAGETABLE_ENTRIES )
4554 l3e_write_atomic(pl3e, l3e_from_pfn(base_mfn,
4555 l1f_to_lNf(flags)));
4556 flush_area(virt - PAGE_SIZE,
4557 FLUSH_TLB_GLOBAL |
4558 FLUSH_ORDER(2*PAGETABLE_ORDER));
4559 free_xen_pagetable(l3e_to_l2e(ol3e));
4562 #endif
4565 return 0;
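/*
 * destroy_xen_mappings() removes all mappings in the page-aligned virtual
 * range [s, e) from Xen's page tables, shattering any 1GB or 2MB superpage
 * that only partially overlaps the range, freeing L1/L2 tables that become
 * empty, and finishing with a global TLB flush.
 */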
4568 void destroy_xen_mappings(unsigned long s, unsigned long e)
4570 l2_pgentry_t *pl2e;
4571 l1_pgentry_t *pl1e;
4572 unsigned int i;
4573 unsigned long v = s;
4575 ASSERT((s & ~PAGE_MASK) == 0);
4576 ASSERT((e & ~PAGE_MASK) == 0);
4578 while ( v < e )
4580 #ifdef __x86_64__
4581 l3_pgentry_t *pl3e = virt_to_xen_l3e(v);
4583 if ( !(l3e_get_flags(*pl3e) & _PAGE_PRESENT) )
4585 v += 1UL << L3_PAGETABLE_SHIFT;
4586 v &= ~((1UL << L3_PAGETABLE_SHIFT) - 1);
4587 continue;
4590 if ( l3e_get_flags(*pl3e) & _PAGE_PSE )
4592 if ( l2_table_offset(v) == 0 &&
4593 l1_table_offset(v) == 0 &&
4594 ((e - v) >= (1UL << L3_PAGETABLE_SHIFT)) )
4596 /* PAGE1GB: whole superpage is destroyed. */
4597 l3e_write_atomic(pl3e, l3e_empty());
4598 v += 1UL << L3_PAGETABLE_SHIFT;
4599 continue;
4602 /* PAGE1GB: shatter the superpage and fall through. */
4603 pl2e = alloc_xen_pagetable();
4604 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
4605 l2e_write(pl2e + i,
4606 l2e_from_pfn(l3e_get_pfn(*pl3e) +
4607 (i << PAGETABLE_ORDER),
4608 l3e_get_flags(*pl3e)));
4609 l3e_write_atomic(pl3e, l3e_from_pfn(virt_to_mfn(pl2e),
4610 __PAGE_HYPERVISOR));
4612 #endif
4614 pl2e = virt_to_xen_l2e(v);
4616 if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) )
4618 v += 1UL << L2_PAGETABLE_SHIFT;
4619 v &= ~((1UL << L2_PAGETABLE_SHIFT) - 1);
4620 continue;
4623 if ( l2e_get_flags(*pl2e) & _PAGE_PSE )
4625 if ( (l1_table_offset(v) == 0) &&
4626 ((e-v) >= (1UL << L2_PAGETABLE_SHIFT)) )
4628 /* PSE: whole superpage is destroyed. */
4629 l2e_write_atomic(pl2e, l2e_empty());
4630 v += 1UL << L2_PAGETABLE_SHIFT;
4632 else
4634 /* PSE: shatter the superpage and try again. */
4635 pl1e = alloc_xen_pagetable();
4636 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
4637 l1e_write(&pl1e[i],
4638 l1e_from_pfn(l2e_get_pfn(*pl2e) + i,
4639 l2e_get_flags(*pl2e) & ~_PAGE_PSE));
4640 l2e_write_atomic(pl2e, l2e_from_pfn(virt_to_mfn(pl1e),
4641 __PAGE_HYPERVISOR));
4644 else
4646 /* Ordinary 4kB mapping. */
4647 pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(v);
4648 l1e_write_atomic(pl1e, l1e_empty());
4649 v += PAGE_SIZE;
4651 /* If we are done with the L2E, check if it is now empty. */
4652 if ( (v != e) && (l1_table_offset(v) != 0) )
4653 continue;
4654 pl1e = l2e_to_l1e(*pl2e);
4655 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
4656 if ( l1e_get_intpte(pl1e[i]) != 0 )
4657 break;
4658 if ( i == L1_PAGETABLE_ENTRIES )
4660 /* Empty: zap the L2E and free the L1 page. */
4661 l2e_write_atomic(pl2e, l2e_empty());
4662 flush_area(NULL, FLUSH_TLB_GLOBAL); /* flush before free */
4663 free_xen_pagetable(pl1e);
4667 #ifdef __x86_64__
4668 /* If we are done with the L3E, check if it is now empty. */
4669 if ( (v != e) && (l2_table_offset(v) + l1_table_offset(v) != 0) )
4670 continue;
4671 pl2e = l3e_to_l2e(*pl3e);
4672 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
4673 if ( l2e_get_intpte(pl2e[i]) != 0 )
4674 break;
4675 if ( i == L2_PAGETABLE_ENTRIES )
4677 /* Empty: zap the L3E and free the L2 page. */
4678 l3e_write_atomic(pl3e, l3e_empty());
4679 flush_area(NULL, FLUSH_TLB_GLOBAL); /* flush before free */
4680 free_xen_pagetable(pl2e);
4682 #endif
4685 flush_area(NULL, FLUSH_TLB_GLOBAL);
4688 void __set_fixmap(
4689 enum fixed_addresses idx, unsigned long mfn, unsigned long flags)
4691 BUG_ON(idx >= __end_of_fixed_addresses);
4692 map_pages_to_xen(fix_to_virt(idx), mfn, 1, flags);
4695 #ifdef MEMORY_GUARD
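/*
 * memguard_init() remaps the Xen heap (and, on x86-64, the Xen image) with
 * 4kB pages via MAP_SMALL_PAGES, so that individual pages can later be
 * guarded or unguarded without having to shatter superpages at that point.
 */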
4697 void memguard_init(void)
4699 unsigned long start = max_t(unsigned long, xen_phys_start, 1UL << 20);
4700 map_pages_to_xen(
4701 (unsigned long)__va(start),
4702 start >> PAGE_SHIFT,
4703 (xenheap_phys_end - start) >> PAGE_SHIFT,
4704 __PAGE_HYPERVISOR|MAP_SMALL_PAGES);
4705 #ifdef __x86_64__
4706 BUG_ON(start != xen_phys_start);
4707 map_pages_to_xen(
4708 XEN_VIRT_START,
4709 start >> PAGE_SHIFT,
4710 (__pa(&_end) + PAGE_SIZE - 1 - start) >> PAGE_SHIFT,
4711 __PAGE_HYPERVISOR|MAP_SMALL_PAGES);
4712 #endif
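/*
 * Guarding a range rewrites its 4kB mappings with _PAGE_PRESENT cleared,
 * so any stray access faults; unguarding restores normal __PAGE_HYPERVISOR
 * mappings. Callers must pass a page-aligned pointer and a length that is
 * a whole number of pages, e.g. (illustrative, with 'order' standing for
 * an allocation order):
 *
 *     memguard_guard_range(p, 1UL << (order + PAGE_SHIFT));
 *     ...
 *     memguard_unguard_range(p, 1UL << (order + PAGE_SHIFT));
 */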
4715 static void __memguard_change_range(void *p, unsigned long l, int guard)
4717 unsigned long _p = (unsigned long)p;
4718 unsigned long _l = (unsigned long)l;
4719 unsigned int flags = __PAGE_HYPERVISOR | MAP_SMALL_PAGES;
4721 /* Ensure we are dealing with a page-aligned whole number of pages. */
4722 ASSERT((_p&~PAGE_MASK) == 0);
4723 ASSERT((_l&~PAGE_MASK) == 0);
4725 if ( guard )
4726 flags &= ~_PAGE_PRESENT;
4728 map_pages_to_xen(
4729 _p, virt_to_maddr(p) >> PAGE_SHIFT, _l >> PAGE_SHIFT, flags);
4732 void memguard_guard_range(void *p, unsigned long l)
4734 __memguard_change_range(p, l, 1);
4737 void memguard_unguard_range(void *p, unsigned long l)
4739 __memguard_change_range(p, l, 0);
4742 #endif
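/*
 * memguard_guard_stack() guards the single page immediately below the
 * primary stack area within a STACK_SIZE stack allocation, so that an
 * overflow of the primary stack faults instead of silently corrupting
 * adjacent memory.
 */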
4744 void memguard_guard_stack(void *p)
4746 BUILD_BUG_ON((PRIMARY_STACK_SIZE + PAGE_SIZE) > STACK_SIZE);
4747 p = (void *)((unsigned long)p + STACK_SIZE -
4748 PRIMARY_STACK_SIZE - PAGE_SIZE);
4749 memguard_guard_range(p, PAGE_SIZE);
4752 /*
4753 * Local variables:
4754 * mode: C
4755 * c-set-style: "BSD"
4756 * c-basic-offset: 4
4757 * tab-width: 4
4758 * indent-tabs-mode: nil
4759 * End:
4760 */