ia64/xen-unstable

view xen/arch/x86/mm.c @ 18433:c9db93b0660a

x86: Fix interpretation of get_l*e_linear_pagetable().

Broken by get_page_type() preemption patch (c/s 18412).

Signed-off-by: Keir Fraser <keir.fraser@citrix.com>
author Keir Fraser <keir.fraser@citrix.com>
date Wed Sep 03 14:56:08 2008 +0100
parents 86b956d8cf04
children e5766aea2907
1 /******************************************************************************
2 * arch/x86/mm.c
3 *
4 * Copyright (c) 2002-2005 K A Fraser
5 * Copyright (c) 2004 Christian Limpach
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
22 /*
23 * A description of the x86 page table API:
24 *
25 * Domains trap to do_mmu_update with a list of update requests.
26 * This is a list of (ptr, val) pairs, where the requested operation
27 * is *ptr = val.
28 *
29 * Reference counting of pages:
30 * ----------------------------
31 * Each page has two refcounts: tot_count and type_count.
32 *
33 * TOT_COUNT is the obvious reference count. It counts all uses of a
34 * physical page frame by a domain, including uses as a page directory,
35 * a page table, or simple mappings via a PTE. This count prevents a
36 * domain from releasing a frame back to the free pool when it still holds
37 * a reference to it.
38 *
39 * TYPE_COUNT is more subtle. A frame can be put to one of three
40 * mutually-exclusive uses: it might be used as a page directory, or a
41 * page table, or it may be mapped writable by the domain [of course, a
42 * frame may equally be used in none of these three ways].
43 * So, type_count is a count of the number of times a frame is being
44 * referred to in its current incarnation. Therefore, a page can only
45 * change its type when its type count is zero.
46 *
47 * Pinning the page type:
48 * ----------------------
49 * The type of a page can be pinned/unpinned with the commands
50 * MMUEXT_[UN]PIN_L?_TABLE. Each page can be pinned exactly once (that is,
51 * pinning is not reference counted, so it can't be nested).
52 * This is useful to prevent a page's type count falling to zero, at which
53 * point safety checks would need to be carried out next time the count
54 * is increased again.
55 *
56 * A further note on writable page mappings:
57 * -----------------------------------------
58 * For simplicity, the count of writable mappings for a page may not
59 * correspond to reality. The 'writable count' is incremented for every
60 * PTE which maps the page with the _PAGE_RW flag set. However, for
61 * write access to be possible the page directory entry must also have
62 * its _PAGE_RW bit set. We do not check this as it complicates the
63 * reference counting considerably [consider the case of multiple
64 * directory entries referencing a single page table, some with the RW
65 * bit set, others not -- it starts getting a bit messy].
66 * In normal use, this simplification shouldn't be a problem.
67 * However, the logic can be added if required.
68 *
69 * One more note on read-only page mappings:
70 * -----------------------------------------
71 * We want domains to be able to map pages for read-only access. The
72 * main reason is that page tables and directories should be readable
73 * by a domain, but it would not be safe for them to be writable.
74 * However, domains have free access to rings 1 & 2 of the Intel
75 * privilege model. In terms of page protection, these are considered
76 * to be part of 'supervisor mode'. The WP bit in CR0 controls whether
77 * read-only restrictions are respected in supervisor mode -- if the
78 * bit is clear then any mapped page is writable.
79 *
80 * We get round this by always setting the WP bit and disallowing
81 * updates to it. This is very unlikely to cause a problem for guest
82 * OS's, which will generally use the WP bit to simplify copy-on-write
83 * implementation (in that case, OS wants a fault when it writes to
84 * an application-supplied buffer).
85 */
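/*
 * Illustrative sketch: how a PV guest typically drives the (ptr, val)
 * interface described above. It batches requests into mmu_update_t
 * structures and issues HYPERVISOR_mmu_update(); the exact wrapper name
 * and the way the PTE's machine address is obtained depend on the guest's
 * hypercall glue, so treat those details as assumptions.
 *
 *     mmu_update_t req;
 *     req.ptr = pte_maddr;   // machine address of the PTE to update
 *     req.val = new_pte;     // value to store, i.e. *ptr = val
 *     if ( HYPERVISOR_mmu_update(&req, 1, NULL, DOMID_SELF) < 0 )
 *         ;                  // update rejected by the checks in this file
 *
 * Xen validates each request against the reference-counting rules above
 * before performing the write on the guest's behalf.
 */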
87 #include <xen/config.h>
88 #include <xen/init.h>
89 #include <xen/kernel.h>
90 #include <xen/lib.h>
91 #include <xen/mm.h>
92 #include <xen/domain.h>
93 #include <xen/sched.h>
94 #include <xen/errno.h>
95 #include <xen/perfc.h>
96 #include <xen/irq.h>
97 #include <xen/softirq.h>
98 #include <xen/domain_page.h>
99 #include <xen/event.h>
100 #include <xen/iocap.h>
101 #include <xen/guest_access.h>
102 #include <asm/paging.h>
103 #include <asm/shadow.h>
104 #include <asm/page.h>
105 #include <asm/flushtlb.h>
106 #include <asm/io.h>
107 #include <asm/ldt.h>
108 #include <asm/x86_emulate.h>
109 #include <asm/e820.h>
110 #include <asm/hypercall.h>
111 #include <asm/shared.h>
112 #include <public/memory.h>
113 #include <xsm/xsm.h>
114 #include <xen/trace.h>
116 /*
117 * Mapping of first 2 or 4 megabytes of memory. This is mapped with 4kB
118 * mappings to avoid type conflicts with fixed-range MTRRs covering the
119 * lowest megabyte of physical memory. In any case the VGA hole should be
120 * mapped with type UC.
121 */
122 l1_pgentry_t __attribute__ ((__section__ (".bss.page_aligned")))
123 l1_identmap[L1_PAGETABLE_ENTRIES];
125 #define MEM_LOG(_f, _a...) gdprintk(XENLOG_WARNING , _f "\n" , ## _a)
127 /*
128 * PTE updates can be done with ordinary writes except:
129 * 1. Debug builds get extra checking by using CMPXCHG[8B].
130 * 2. PAE builds perform an atomic 8-byte store with CMPXCHG8B.
131 */
132 #if !defined(NDEBUG) || defined(__i386__)
133 #define PTE_UPDATE_WITH_CMPXCHG
134 #endif
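/*
 * A minimal sketch of why CMPXCHG8B matters on PAE: a present PAE entry is
 * 64 bits wide, but i386 Xen would otherwise store it as two 32-bit writes,
 * so another CPU (or the hardware page walker) could observe a torn entry.
 * The real logic lives in update_intpte() further down; the loop below is
 * only an outline of the idea, with illustrative variable names.
 *
 *     intpte_t seen = *p, expected;
 *     do {
 *         expected = seen;
 *         seen = cmpxchg(p, expected, new);  // atomic 8-byte compare+swap
 *     } while ( seen != expected );
 *
 * Debug builds take the same path on all architectures purely for the extra
 * consistency checking it allows.
 */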
136 /* Used to defer flushing of memory structures. */
137 struct percpu_mm_info {
138 #define DOP_FLUSH_TLB (1<<0) /* Flush the local TLB. */
139 #define DOP_FLUSH_ALL_TLBS (1<<1) /* Flush TLBs of all VCPUs of current dom. */
140 #define DOP_RELOAD_LDT (1<<2) /* Reload the LDT shadow mapping. */
141 unsigned int deferred_ops;
142 /* If non-NULL, specifies a foreign subject domain for some operations. */
143 struct domain *foreign;
144 };
145 static DEFINE_PER_CPU(struct percpu_mm_info, percpu_mm_info);
147 /*
148 * Returns the current foreign domain; defaults to the currently-executing
149 * domain if a foreign override hasn't been specified.
150 */
151 #define FOREIGNDOM (this_cpu(percpu_mm_info).foreign ?: current->domain)
153 /* Private domain structs for DOMID_XEN and DOMID_IO. */
154 struct domain *dom_xen, *dom_io;
156 /* Frame table and its size in pages. */
157 struct page_info *frame_table;
158 unsigned long max_page;
159 unsigned long total_pages;
161 #define PAGE_CACHE_ATTRS (_PAGE_PAT|_PAGE_PCD|_PAGE_PWT)
163 #define l1_disallow_mask(d) \
164 ((d != dom_io) && \
165 (rangeset_is_empty((d)->iomem_caps) && \
166 rangeset_is_empty((d)->arch.ioport_caps) && \
167 !has_arch_pdevs(d)) ? \
168 L1_DISALLOW_MASK : (L1_DISALLOW_MASK & ~PAGE_CACHE_ATTRS))
170 #ifdef CONFIG_COMPAT
171 l2_pgentry_t *compat_idle_pg_table_l2 = NULL;
172 #define l3_disallow_mask(d) (!is_pv_32on64_domain(d) ? \
173 L3_DISALLOW_MASK : \
174 COMPAT_L3_DISALLOW_MASK)
175 #else
176 #define l3_disallow_mask(d) L3_DISALLOW_MASK
177 #endif
179 static void queue_deferred_ops(struct domain *d, unsigned int ops)
180 {
181 ASSERT(d == current->domain);
182 this_cpu(percpu_mm_info).deferred_ops |= ops;
183 }
185 void __init init_frametable(void)
186 {
187 unsigned long nr_pages, page_step, i, mfn;
189 frame_table = (struct page_info *)FRAMETABLE_VIRT_START;
191 nr_pages = PFN_UP(max_page * sizeof(*frame_table));
192 page_step = (1 << L2_PAGETABLE_SHIFT) >> PAGE_SHIFT;
194 for ( i = 0; i < nr_pages; i += page_step )
195 {
196 mfn = alloc_boot_pages(min(nr_pages - i, page_step), page_step);
197 if ( mfn == 0 )
198 panic("Not enough memory for frame table\n");
199 map_pages_to_xen(
200 FRAMETABLE_VIRT_START + (i << PAGE_SHIFT),
201 mfn, page_step, PAGE_HYPERVISOR);
202 }
204 memset(frame_table, 0, nr_pages << PAGE_SHIFT);
206 #if defined(__x86_64__)
207 for ( i = 0; i < max_page; i ++ )
208 spin_lock_init(&frame_table[i].lock);
209 #endif
210 }
212 void __init arch_init_memory(void)
213 {
214 extern void subarch_init_memory(void);
216 unsigned long i, pfn, rstart_pfn, rend_pfn, iostart_pfn, ioend_pfn;
218 /*
219 * Initialise our DOMID_XEN domain.
220 * Any Xen-heap pages that we will allow to be mapped will have
221 * their domain field set to dom_xen.
222 */
223 dom_xen = domain_create(DOMID_XEN, DOMCRF_dummy, 0);
224 BUG_ON(dom_xen == NULL);
226 /*
227 * Initialise our DOMID_IO domain.
228 * This domain owns I/O pages that are within the range of the page_info
229 * array. Mappings occur at the privilege level of the caller.
230 */
231 dom_io = domain_create(DOMID_IO, DOMCRF_dummy, 0);
232 BUG_ON(dom_io == NULL);
234 /* First 1MB of RAM is historically marked as I/O. */
235 for ( i = 0; i < 0x100; i++ )
236 share_xen_page_with_guest(mfn_to_page(i), dom_io, XENSHARE_writable);
238 /* Any areas not specified as RAM by the e820 map are considered I/O. */
239 for ( i = 0, pfn = 0; pfn < max_page; i++ )
240 {
241 while ( (i < e820.nr_map) &&
242 (e820.map[i].type != E820_RAM) &&
243 (e820.map[i].type != E820_UNUSABLE) )
244 i++;
246 if ( i >= e820.nr_map )
247 {
248 /* No more RAM regions: mark as I/O right to end of memory map. */
249 rstart_pfn = rend_pfn = max_page;
250 }
251 else
252 {
253 /* Mark as I/O just up to the start of the next RAM region. */
254 rstart_pfn = min_t(unsigned long, max_page,
255 PFN_UP(e820.map[i].addr));
256 rend_pfn = max_t(unsigned long, rstart_pfn,
257 PFN_DOWN(e820.map[i].addr + e820.map[i].size));
258 }
260 /*
261 * Make sure any Xen mappings of RAM holes above 1MB are blown away.
262 * In particular this ensures that RAM holes are respected even in
263 * the statically-initialised 1-16MB mapping area.
264 */
265 iostart_pfn = max_t(unsigned long, pfn, 1UL << (20 - PAGE_SHIFT));
266 ioend_pfn = rstart_pfn;
267 #if defined(CONFIG_X86_32)
268 ioend_pfn = min_t(unsigned long, ioend_pfn,
269 DIRECTMAP_MBYTES << (20 - PAGE_SHIFT));
270 #endif
271 if ( iostart_pfn < ioend_pfn )
272 destroy_xen_mappings((unsigned long)mfn_to_virt(iostart_pfn),
273 (unsigned long)mfn_to_virt(ioend_pfn));
275 /* Mark as I/O up to next RAM region. */
276 for ( ; pfn < rstart_pfn; pfn++ )
277 {
278 BUG_ON(!mfn_valid(pfn));
279 share_xen_page_with_guest(
280 mfn_to_page(pfn), dom_io, XENSHARE_writable);
281 }
283 /* Skip the RAM region. */
284 pfn = rend_pfn;
285 }
287 subarch_init_memory();
288 }
290 int memory_is_conventional_ram(paddr_t p)
291 {
292 int i;
294 for ( i = 0; i < e820.nr_map; i++ )
295 {
296 if ( (e820.map[i].type == E820_RAM) &&
297 (e820.map[i].addr <= p) &&
298 (e820.map[i].size > p) )
299 return 1;
300 }
302 return 0;
303 }
305 unsigned long domain_get_maximum_gpfn(struct domain *d)
306 {
307 if ( is_hvm_domain(d) )
308 return d->arch.p2m->max_mapped_pfn;
309 /* NB. PV guests specify nr_pfns rather than max_pfn so we adjust here. */
310 return arch_get_max_pfn(d) - 1;
311 }
313 void share_xen_page_with_guest(
314 struct page_info *page, struct domain *d, int readonly)
315 {
316 if ( page_get_owner(page) == d )
317 return;
319 set_gpfn_from_mfn(page_to_mfn(page), INVALID_M2P_ENTRY);
321 spin_lock(&d->page_alloc_lock);
323 /* The incremented type count pins as writable or read-only. */
324 page->u.inuse.type_info = (readonly ? PGT_none : PGT_writable_page);
325 page->u.inuse.type_info |= PGT_validated | 1;
327 page_set_owner(page, d);
328 wmb(); /* install valid domain ptr before updating refcnt. */
329 ASSERT(page->count_info == 0);
331 /* Only add to the allocation list if the domain isn't dying. */
332 if ( !d->is_dying )
333 {
334 page->count_info |= PGC_allocated | 1;
335 if ( unlikely(d->xenheap_pages++ == 0) )
336 get_knownalive_domain(d);
337 list_add_tail(&page->list, &d->xenpage_list);
338 }
340 spin_unlock(&d->page_alloc_lock);
341 }
343 void share_xen_page_with_privileged_guests(
344 struct page_info *page, int readonly)
345 {
346 share_xen_page_with_guest(page, dom_xen, readonly);
347 }
349 #if defined(__i386__)
351 #ifdef NDEBUG
352 /* Only PDPTs above 4GB boundary need to be shadowed in low memory. */
353 #define l3tab_needs_shadow(mfn) ((mfn) >= 0x100000)
354 #else
355 /*
356 * In debug builds we shadow a selection of <4GB PDPTs to exercise code paths.
357 * We cannot safely shadow the idle page table, nor shadow (v1) page tables
358 * (detected by lack of an owning domain). As required for correctness, we
359 * always shadow PDPTs above 4GB.
360 */
361 #define l3tab_needs_shadow(mfn) \
362 (((((mfn) << PAGE_SHIFT) != __pa(idle_pg_table)) && \
363 (page_get_owner(mfn_to_page(mfn)) != NULL) && \
364 ((mfn) & 1)) || /* odd MFNs are shadowed */ \
365 ((mfn) >= 0x100000))
366 #endif
368 static l1_pgentry_t *fix_pae_highmem_pl1e;
370 /* Cache the address of PAE high-memory fixmap page tables. */
371 static int __init cache_pae_fixmap_address(void)
372 {
373 unsigned long fixmap_base = fix_to_virt(FIX_PAE_HIGHMEM_0);
374 l2_pgentry_t *pl2e = virt_to_xen_l2e(fixmap_base);
375 fix_pae_highmem_pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(fixmap_base);
376 return 0;
377 }
378 __initcall(cache_pae_fixmap_address);
380 static DEFINE_PER_CPU(u32, make_cr3_timestamp);
382 void make_cr3(struct vcpu *v, unsigned long mfn)
383 /* Takes the MFN of a PAE l3 table, copies the contents to below 4GB if
384 * necessary, and sets v->arch.cr3 to the value to load in CR3. */
385 {
386 l3_pgentry_t *highmem_l3tab, *lowmem_l3tab;
387 struct pae_l3_cache *cache = &v->arch.pae_l3_cache;
388 unsigned int cpu = smp_processor_id();
390 /* Fast path: does this mfn need a shadow at all? */
391 if ( !l3tab_needs_shadow(mfn) )
392 {
393 v->arch.cr3 = mfn << PAGE_SHIFT;
394 /* Cache is no longer in use or valid */
395 cache->high_mfn = 0;
396 return;
397 }
399 /* Caching logic is not interrupt safe. */
400 ASSERT(!in_irq());
402 /* Protects against pae_flush_pgd(). */
403 spin_lock(&cache->lock);
405 cache->inuse_idx ^= 1;
406 cache->high_mfn = mfn;
408 /* Map the guest L3 table and copy to the chosen low-memory cache. */
409 l1e_write(fix_pae_highmem_pl1e-cpu, l1e_from_pfn(mfn, __PAGE_HYPERVISOR));
410 /* First check the previous high mapping can't be in the TLB.
411 * (i.e. have we loaded CR3 since we last did this?) */
412 if ( unlikely(this_cpu(make_cr3_timestamp) == this_cpu(tlbflush_time)) )
413 flush_tlb_one_local(fix_to_virt(FIX_PAE_HIGHMEM_0 + cpu));
414 highmem_l3tab = (l3_pgentry_t *)fix_to_virt(FIX_PAE_HIGHMEM_0 + cpu);
415 lowmem_l3tab = cache->table[cache->inuse_idx];
416 memcpy(lowmem_l3tab, highmem_l3tab, sizeof(cache->table[0]));
417 l1e_write(fix_pae_highmem_pl1e-cpu, l1e_empty());
418 this_cpu(make_cr3_timestamp) = this_cpu(tlbflush_time);
420 v->arch.cr3 = __pa(lowmem_l3tab);
422 spin_unlock(&cache->lock);
423 }
425 #else /* !defined(__i386__) */
427 void make_cr3(struct vcpu *v, unsigned long mfn)
428 {
429 v->arch.cr3 = mfn << PAGE_SHIFT;
430 }
432 #endif /* !defined(__i386__) */
434 void write_ptbase(struct vcpu *v)
435 {
436 write_cr3(v->arch.cr3);
437 }
439 /*
440 * Should be called after CR3 is updated.
441 *
442 * Uses values found in vcpu->arch.(guest_table and guest_table_user), and
443 * for HVM guests, arch.monitor_table and hvm's guest CR3.
444 *
445 * Update ref counts to shadow tables appropriately.
446 */
447 void update_cr3(struct vcpu *v)
448 {
449 unsigned long cr3_mfn=0;
451 if ( paging_mode_enabled(v->domain) )
452 {
453 paging_update_cr3(v);
454 return;
455 }
457 #if CONFIG_PAGING_LEVELS == 4
458 if ( !(v->arch.flags & TF_kernel_mode) )
459 cr3_mfn = pagetable_get_pfn(v->arch.guest_table_user);
460 else
461 #endif
462 cr3_mfn = pagetable_get_pfn(v->arch.guest_table);
464 make_cr3(v, cr3_mfn);
465 }
468 static void invalidate_shadow_ldt(struct vcpu *v)
469 {
470 int i;
471 unsigned long pfn;
472 struct page_info *page;
474 if ( v->arch.shadow_ldt_mapcnt == 0 )
475 return;
477 v->arch.shadow_ldt_mapcnt = 0;
479 for ( i = 16; i < 32; i++ )
480 {
481 pfn = l1e_get_pfn(v->arch.perdomain_ptes[i]);
482 if ( pfn == 0 ) continue;
483 l1e_write(&v->arch.perdomain_ptes[i], l1e_empty());
484 page = mfn_to_page(pfn);
485 ASSERT_PAGE_IS_TYPE(page, PGT_seg_desc_page);
486 ASSERT_PAGE_IS_DOMAIN(page, v->domain);
487 put_page_and_type(page);
488 }
490 /* Dispose of the (now possibly invalid) mappings from the TLB. */
491 if ( v == current )
492 queue_deferred_ops(v->domain, DOP_FLUSH_TLB | DOP_RELOAD_LDT);
493 else
494 flush_tlb_mask(v->domain->domain_dirty_cpumask);
495 }
498 static int alloc_segdesc_page(struct page_info *page)
499 {
500 struct desc_struct *descs;
501 int i;
503 descs = map_domain_page(page_to_mfn(page));
505 for ( i = 0; i < 512; i++ )
506 if ( unlikely(!check_descriptor(page_get_owner(page), &descs[i])) )
507 goto fail;
509 unmap_domain_page(descs);
510 return 0;
512 fail:
513 unmap_domain_page(descs);
514 return -EINVAL;
515 }
518 /* Map a page of the LDT's shadow mapping at page offset @off. */
519 int map_ldt_shadow_page(unsigned int off)
520 {
521 struct vcpu *v = current;
522 struct domain *d = v->domain;
523 unsigned long gmfn, mfn;
524 l1_pgentry_t l1e, nl1e;
525 unsigned long gva = v->arch.guest_context.ldt_base + (off << PAGE_SHIFT);
526 int okay;
528 BUG_ON(unlikely(in_irq()));
530 guest_get_eff_kern_l1e(v, gva, &l1e);
531 if ( unlikely(!(l1e_get_flags(l1e) & _PAGE_PRESENT)) )
532 return 0;
534 gmfn = l1e_get_pfn(l1e);
535 mfn = gmfn_to_mfn(d, gmfn);
536 if ( unlikely(!mfn_valid(mfn)) )
537 return 0;
539 okay = get_page_and_type(mfn_to_page(mfn), d, PGT_seg_desc_page);
540 if ( unlikely(!okay) )
541 return 0;
543 nl1e = l1e_from_pfn(mfn, l1e_get_flags(l1e) | _PAGE_RW);
545 l1e_write(&v->arch.perdomain_ptes[off + 16], nl1e);
546 v->arch.shadow_ldt_mapcnt++;
548 return 1;
549 }
552 static int get_page_from_pagenr(unsigned long page_nr, struct domain *d)
553 {
554 struct page_info *page = mfn_to_page(page_nr);
556 if ( unlikely(!mfn_valid(page_nr)) || unlikely(!get_page(page, d)) )
557 {
558 MEM_LOG("Could not get page ref for pfn %lx", page_nr);
559 return 0;
560 }
562 return 1;
563 }
566 static int get_page_and_type_from_pagenr(unsigned long page_nr,
567 unsigned long type,
568 struct domain *d,
569 int preemptible)
570 {
571 struct page_info *page = mfn_to_page(page_nr);
572 int rc;
574 if ( unlikely(!get_page_from_pagenr(page_nr, d)) )
575 return -EINVAL;
577 rc = (preemptible ?
578 get_page_type_preemptible(page, type) :
579 (get_page_type(page, type) ? 0 : -EINVAL));
581 if ( rc )
582 put_page(page);
584 return rc;
585 }
587 /*
588 * We allow root tables to map each other (a.k.a. linear page tables). This
589 * needs some special care with reference counts and access permissions:
590 * 1. The mapping entry must be read-only, or the guest may get write access
591 * to its own PTEs.
592 * 2. We must only bump the reference counts for an *already validated*
593 * L2 table, or we can end up in a deadlock in get_page_type() by waiting
594 * on a validation that is required to complete that validation.
595 * 3. We only need to increment the reference counts for the mapped page
596 * frame if it is mapped by a different root table. This is sufficient and
597 * also necessary to allow validation of a root table mapping itself.
598 */
599 #define define_get_linear_pagetable(level) \
600 static int \
601 get_##level##_linear_pagetable( \
602 level##_pgentry_t pde, unsigned long pde_pfn, struct domain *d) \
603 { \
604 unsigned long x, y; \
605 struct page_info *page; \
606 unsigned long pfn; \
607 \
608 if ( (level##e_get_flags(pde) & _PAGE_RW) ) \
609 { \
610 MEM_LOG("Attempt to create linear p.t. with write perms"); \
611 return 0; \
612 } \
613 \
614 if ( (pfn = level##e_get_pfn(pde)) != pde_pfn ) \
615 { \
616 /* Make sure the mapped frame belongs to the correct domain. */ \
617 if ( unlikely(!get_page_from_pagenr(pfn, d)) ) \
618 return 0; \
619 \
620 /* \
621 * Ensure that the mapped frame is an already-validated page table. \
622 * If so, atomically increment the count (checking for overflow). \
623 */ \
624 page = mfn_to_page(pfn); \
625 y = page->u.inuse.type_info; \
626 do { \
627 x = y; \
628 if ( unlikely((x & PGT_count_mask) == PGT_count_mask) || \
629 unlikely((x & (PGT_type_mask|PGT_validated)) != \
630 (PGT_##level##_page_table|PGT_validated)) ) \
631 { \
632 put_page(page); \
633 return 0; \
634 } \
635 } \
636 while ( (y = cmpxchg(&page->u.inuse.type_info, x, x + 1)) != x ); \
637 } \
638 \
639 return 1; \
640 }
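/*
 * Example of what the checks above permit (a sketch; the slot index and
 * flag set are illustrative, and a guest would install the entry via
 * mmu_update rather than a direct write): a 64-bit PV guest can point a
 * read-only L4 slot back at the L4 itself, giving it a linear view of all
 * of its own page tables:
 *
 *     l4tab[linear_slot] = l4e_from_pfn(l4_mfn, _PAGE_PRESENT|_PAGE_ACCESSED);
 *
 * Because the entry is not writable (rule 1) and refers to the table it
 * lives in (rule 3), get_l4_linear_pagetable() accepts it without taking an
 * extra type reference.
 */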
643 int is_iomem_page(unsigned long mfn)
644 {
645 return (!mfn_valid(mfn) || (page_get_owner(mfn_to_page(mfn)) == dom_io));
646 }
649 int
650 get_page_from_l1e(
651 l1_pgentry_t l1e, struct domain *d)
652 {
653 unsigned long mfn = l1e_get_pfn(l1e);
654 struct page_info *page = mfn_to_page(mfn);
655 uint32_t l1f = l1e_get_flags(l1e);
656 struct vcpu *curr = current;
657 struct domain *owner;
658 int okay;
660 if ( !(l1f & _PAGE_PRESENT) )
661 return 1;
663 if ( unlikely(l1f & l1_disallow_mask(d)) )
664 {
665 MEM_LOG("Bad L1 flags %x", l1f & l1_disallow_mask(d));
666 return 0;
667 }
669 if ( is_iomem_page(mfn) )
670 {
671 /* DOMID_IO reverts to caller for privilege checks. */
672 if ( d == dom_io )
673 d = curr->domain;
675 if ( !iomem_access_permitted(d, mfn, mfn) )
676 {
677 if ( mfn != (PADDR_MASK >> PAGE_SHIFT) ) /* INVALID_MFN? */
678 MEM_LOG("Non-privileged (%u) attempt to map I/O space %08lx",
679 d->domain_id, mfn);
680 return 0;
681 }
683 return 1;
684 }
686 /*
687 * Let privileged domains transfer the right to map their target
688 * domain's pages. This is used to allow stub-domain pvfb export to dom0,
689 * until pvfb supports granted mappings. At that time this minor hack
690 * can go away.
691 */
692 owner = page_get_owner(page);
693 if ( unlikely(d != owner) && (owner != NULL) &&
694 (d != curr->domain) && IS_PRIV_FOR(d, owner) )
695 d = owner;
697 /* Foreign mappings into guests in shadow external mode don't
698 * contribute to writeable mapping refcounts. (This allows the
699 * qemu-dm helper process in dom0 to map the domain's memory without
700 * messing up the count of "real" writable mappings.) */
701 okay = (((l1f & _PAGE_RW) &&
702 !(unlikely(paging_mode_external(d) && (d != curr->domain))))
703 ? get_page_and_type(page, d, PGT_writable_page)
704 : get_page(page, d));
705 if ( !okay )
706 {
707 MEM_LOG("Error getting mfn %lx (pfn %lx) from L1 entry %" PRIpte
708 " for dom%d",
709 mfn, get_gpfn_from_mfn(mfn),
710 l1e_get_intpte(l1e), d->domain_id);
711 }
712 else if ( pte_flags_to_cacheattr(l1f) !=
713 ((page->count_info >> PGC_cacheattr_base) & 7) )
714 {
715 uint32_t x, nx, y = page->count_info;
716 uint32_t cacheattr = pte_flags_to_cacheattr(l1f);
718 if ( is_xen_heap_page(page) )
719 {
720 if ( (l1f & _PAGE_RW) &&
721 !(unlikely(paging_mode_external(d) &&
722 (d != curr->domain))) )
723 put_page_type(page);
724 put_page(page);
725 MEM_LOG("Attempt to change cache attributes of Xen heap page");
726 return 0;
727 }
729 while ( ((y >> PGC_cacheattr_base) & 7) != cacheattr )
730 {
731 x = y;
732 nx = (x & ~PGC_cacheattr_mask) | (cacheattr << PGC_cacheattr_base);
733 y = cmpxchg(&page->count_info, x, nx);
734 }
736 #ifdef __x86_64__
737 map_pages_to_xen((unsigned long)mfn_to_virt(mfn), mfn, 1,
738 PAGE_HYPERVISOR | cacheattr_to_pte_flags(cacheattr));
739 #endif
740 }
742 return okay;
743 }
746 /* NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. */
747 define_get_linear_pagetable(l2);
748 static int
749 get_page_from_l2e(
750 l2_pgentry_t l2e, unsigned long pfn, struct domain *d)
751 {
752 int rc;
754 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
755 return 1;
757 if ( unlikely((l2e_get_flags(l2e) & L2_DISALLOW_MASK)) )
758 {
759 MEM_LOG("Bad L2 flags %x", l2e_get_flags(l2e) & L2_DISALLOW_MASK);
760 return -EINVAL;
761 }
763 rc = get_page_and_type_from_pagenr(
764 l2e_get_pfn(l2e), PGT_l1_page_table, d, 0);
765 if ( unlikely(rc == -EINVAL) && get_l2_linear_pagetable(l2e, pfn, d) )
766 rc = 0;
768 return rc;
769 }
772 define_get_linear_pagetable(l3);
773 static int
774 get_page_from_l3e(
775 l3_pgentry_t l3e, unsigned long pfn, struct domain *d, int preemptible)
776 {
777 int rc;
779 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
780 return 1;
782 if ( unlikely((l3e_get_flags(l3e) & l3_disallow_mask(d))) )
783 {
784 MEM_LOG("Bad L3 flags %x", l3e_get_flags(l3e) & l3_disallow_mask(d));
785 return -EINVAL;
786 }
788 rc = get_page_and_type_from_pagenr(
789 l3e_get_pfn(l3e), PGT_l2_page_table, d, preemptible);
790 if ( unlikely(rc == -EINVAL) && get_l3_linear_pagetable(l3e, pfn, d) )
791 rc = 0;
793 return rc;
794 }
796 #if CONFIG_PAGING_LEVELS >= 4
797 define_get_linear_pagetable(l4);
798 static int
799 get_page_from_l4e(
800 l4_pgentry_t l4e, unsigned long pfn, struct domain *d, int preemptible)
801 {
802 int rc;
804 if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
805 return 1;
807 if ( unlikely((l4e_get_flags(l4e) & L4_DISALLOW_MASK)) )
808 {
809 MEM_LOG("Bad L4 flags %x", l4e_get_flags(l4e) & L4_DISALLOW_MASK);
810 return -EINVAL;
811 }
813 rc = get_page_and_type_from_pagenr(
814 l4e_get_pfn(l4e), PGT_l3_page_table, d, preemptible);
815 if ( unlikely(rc == -EINVAL) && get_l4_linear_pagetable(l4e, pfn, d) )
816 rc = 0;
818 return rc;
819 }
820 #endif /* 4 level */
822 #ifdef __x86_64__
824 #ifdef USER_MAPPINGS_ARE_GLOBAL
825 #define adjust_guest_l1e(pl1e, d) \
826 do { \
827 if ( likely(l1e_get_flags((pl1e)) & _PAGE_PRESENT) && \
828 likely(!is_pv_32on64_domain(d)) ) \
829 { \
830 /* _PAGE_GUEST_KERNEL page cannot have the Global bit set. */ \
831 if ( (l1e_get_flags((pl1e)) & (_PAGE_GUEST_KERNEL|_PAGE_GLOBAL)) \
832 == (_PAGE_GUEST_KERNEL|_PAGE_GLOBAL) ) \
833 MEM_LOG("Global bit is set on guest kernel page %lx", \
834 l1e_get_pfn((pl1e))); \
835 if ( !(l1e_get_flags((pl1e)) & _PAGE_USER) ) \
836 l1e_add_flags((pl1e), (_PAGE_GUEST_KERNEL|_PAGE_USER)); \
837 if ( !(l1e_get_flags((pl1e)) & _PAGE_GUEST_KERNEL) ) \
838 l1e_add_flags((pl1e), (_PAGE_GLOBAL|_PAGE_USER)); \
839 } \
840 } while ( 0 )
841 #else
842 #define adjust_guest_l1e(pl1e, d) \
843 do { \
844 if ( likely(l1e_get_flags((pl1e)) & _PAGE_PRESENT) && \
845 likely(!is_pv_32on64_domain(d)) ) \
846 l1e_add_flags((pl1e), _PAGE_USER); \
847 } while ( 0 )
848 #endif
850 #define adjust_guest_l2e(pl2e, d) \
851 do { \
852 if ( likely(l2e_get_flags((pl2e)) & _PAGE_PRESENT) && \
853 likely(!is_pv_32on64_domain(d)) ) \
854 l2e_add_flags((pl2e), _PAGE_USER); \
855 } while ( 0 )
857 #define adjust_guest_l3e(pl3e, d) \
858 do { \
859 if ( likely(l3e_get_flags((pl3e)) & _PAGE_PRESENT) ) \
860 l3e_add_flags((pl3e), likely(!is_pv_32on64_domain(d)) ? \
861 _PAGE_USER : \
862 _PAGE_USER|_PAGE_RW); \
863 } while ( 0 )
865 #define adjust_guest_l4e(pl4e, d) \
866 do { \
867 if ( likely(l4e_get_flags((pl4e)) & _PAGE_PRESENT) && \
868 likely(!is_pv_32on64_domain(d)) ) \
869 l4e_add_flags((pl4e), _PAGE_USER); \
870 } while ( 0 )
872 #else /* !defined(__x86_64__) */
874 #define adjust_guest_l1e(_p, _d) ((void)(_d))
875 #define adjust_guest_l2e(_p, _d) ((void)(_d))
876 #define adjust_guest_l3e(_p, _d) ((void)(_d))
878 #endif
880 #ifdef CONFIG_COMPAT
881 #define unadjust_guest_l3e(pl3e, d) \
882 do { \
883 if ( unlikely(is_pv_32on64_domain(d)) && \
884 likely(l3e_get_flags((pl3e)) & _PAGE_PRESENT) ) \
885 l3e_remove_flags((pl3e), _PAGE_USER|_PAGE_RW|_PAGE_ACCESSED); \
886 } while ( 0 )
887 #else
888 #define unadjust_guest_l3e(_p, _d) ((void)(_d))
889 #endif
891 void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d)
892 {
893 unsigned long pfn = l1e_get_pfn(l1e);
894 struct page_info *page;
895 struct domain *e;
896 struct vcpu *v;
898 if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) || is_iomem_page(pfn) )
899 return;
901 page = mfn_to_page(pfn);
903 e = page_get_owner(page);
905 /*
906 * Check if this is a mapping that was established via a grant reference.
907 * If it was then we should not be here: we require that such mappings are
908 * explicitly destroyed via the grant-table interface.
909 *
910 * The upshot of this is that the guest can end up with active grants that
911 * it cannot destroy (because it no longer has a PTE to present to the
912 * grant-table interface). This can lead to subtle hard-to-catch bugs,
913 * hence a special grant PTE flag can be enabled to catch the bug early.
914 *
915 * (Note that the undestroyable active grants are not a security hole in
916 * Xen. All active grants can safely be cleaned up when the domain dies.)
917 */
918 if ( (l1e_get_flags(l1e) & _PAGE_GNTTAB) &&
919 !d->is_shutting_down && !d->is_dying )
920 {
921 MEM_LOG("Attempt to implicitly unmap a granted PTE %" PRIpte,
922 l1e_get_intpte(l1e));
923 domain_crash(d);
924 }
926 /* Remember that we did not take a type count for foreign writable mappings
927 * to paging-external domains. */
928 if ( (l1e_get_flags(l1e) & _PAGE_RW) &&
929 !(unlikely((e != d) && paging_mode_external(e))) )
930 {
931 put_page_and_type(page);
932 }
933 else
934 {
935 /* We expect this to be rare, so we just blow away the entire shadow LDT. */
936 if ( unlikely(((page->u.inuse.type_info & PGT_type_mask) ==
937 PGT_seg_desc_page)) &&
938 unlikely(((page->u.inuse.type_info & PGT_count_mask) != 0)) &&
939 (d == e) )
940 {
941 for_each_vcpu ( d, v )
942 invalidate_shadow_ldt(v);
943 }
944 put_page(page);
945 }
946 }
949 /*
950 * NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'.
951 * Note also that this automatically deals correctly with linear p.t.'s.
952 */
953 static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
954 {
955 if ( (l2e_get_flags(l2e) & _PAGE_PRESENT) &&
956 (l2e_get_pfn(l2e) != pfn) )
957 {
958 put_page_and_type(l2e_get_page(l2e));
959 return 0;
960 }
961 return 1;
962 }
965 static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn,
966 int preemptible)
967 {
968 if ( (l3e_get_flags(l3e) & _PAGE_PRESENT) &&
969 (l3e_get_pfn(l3e) != pfn) )
970 return put_page_and_type_preemptible(l3e_get_page(l3e), preemptible);
971 return 1;
972 }
974 #if CONFIG_PAGING_LEVELS >= 4
975 static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn,
976 int preemptible)
977 {
978 if ( (l4e_get_flags(l4e) & _PAGE_PRESENT) &&
979 (l4e_get_pfn(l4e) != pfn) )
980 return put_page_and_type_preemptible(l4e_get_page(l4e), preemptible);
981 return 1;
982 }
983 #endif
985 static int alloc_l1_table(struct page_info *page)
986 {
987 struct domain *d = page_get_owner(page);
988 unsigned long pfn = page_to_mfn(page);
989 l1_pgentry_t *pl1e;
990 unsigned int i;
992 pl1e = map_domain_page(pfn);
994 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
995 {
996 if ( is_guest_l1_slot(i) &&
997 unlikely(!get_page_from_l1e(pl1e[i], d)) )
998 goto fail;
1000 adjust_guest_l1e(pl1e[i], d);
1003 unmap_domain_page(pl1e);
1004 return 0;
1006 fail:
1007 MEM_LOG("Failure in alloc_l1_table: entry %d", i);
1008 while ( i-- > 0 )
1009 if ( is_guest_l1_slot(i) )
1010 put_page_from_l1e(pl1e[i], d);
1012 unmap_domain_page(pl1e);
1013 return -EINVAL;
1016 static int create_pae_xen_mappings(struct domain *d, l3_pgentry_t *pl3e)
1018 struct page_info *page;
1019 l2_pgentry_t *pl2e;
1020 l3_pgentry_t l3e3;
1021 #ifndef CONFIG_COMPAT
1022 l2_pgentry_t l2e;
1023 int i;
1024 #endif
1026 if ( !is_pv_32bit_domain(d) )
1027 return 1;
1029 pl3e = (l3_pgentry_t *)((unsigned long)pl3e & PAGE_MASK);
1031 /* 3rd L3 slot contains L2 with Xen-private mappings. It *must* exist. */
1032 l3e3 = pl3e[3];
1033 if ( !(l3e_get_flags(l3e3) & _PAGE_PRESENT) )
1035 MEM_LOG("PAE L3 3rd slot is empty");
1036 return 0;
1039 /*
1040 * The Xen-private mappings include linear mappings. The L2 thus cannot
1041 * be shared by multiple L3 tables. The test here is adequate because:
1042 * 1. Cannot appear in slots != 3 because get_page_type() checks the
1043 * PGT_pae_xen_l2 flag, which is asserted iff the L2 appears in slot 3
1044 * 2. Cannot appear in another page table's L3:
1045 * a. alloc_l3_table() calls this function and this check will fail
1046 * b. mod_l3_entry() disallows updates to slot 3 in an existing table
1047 */
1048 page = l3e_get_page(l3e3);
1049 BUG_ON(page->u.inuse.type_info & PGT_pinned);
1050 BUG_ON((page->u.inuse.type_info & PGT_count_mask) == 0);
1051 BUG_ON(!(page->u.inuse.type_info & PGT_pae_xen_l2));
1052 if ( (page->u.inuse.type_info & PGT_count_mask) != 1 )
1054 MEM_LOG("PAE L3 3rd slot is shared");
1055 return 0;
1058 /* Xen private mappings. */
1059 pl2e = map_domain_page(l3e_get_pfn(l3e3));
1060 #ifndef CONFIG_COMPAT
1061 memcpy(&pl2e[L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1)],
1062 &idle_pg_table_l2[L2_PAGETABLE_FIRST_XEN_SLOT],
1063 L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
1064 for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
1066 l2e = l2e_from_page(
1067 virt_to_page(page_get_owner(page)->arch.mm_perdomain_pt) + i,
1068 __PAGE_HYPERVISOR);
1069 l2e_write(&pl2e[l2_table_offset(PERDOMAIN_VIRT_START) + i], l2e);
1071 for ( i = 0; i < (LINEARPT_MBYTES >> (L2_PAGETABLE_SHIFT - 20)); i++ )
1073 l2e = l2e_empty();
1074 if ( l3e_get_flags(pl3e[i]) & _PAGE_PRESENT )
1075 l2e = l2e_from_pfn(l3e_get_pfn(pl3e[i]), __PAGE_HYPERVISOR);
1076 l2e_write(&pl2e[l2_table_offset(LINEAR_PT_VIRT_START) + i], l2e);
1078 #else
1079 memcpy(&pl2e[COMPAT_L2_PAGETABLE_FIRST_XEN_SLOT(d)],
1080 &compat_idle_pg_table_l2[
1081 l2_table_offset(HIRO_COMPAT_MPT_VIRT_START)],
1082 COMPAT_L2_PAGETABLE_XEN_SLOTS(d) * sizeof(*pl2e));
1083 #endif
1084 unmap_domain_page(pl2e);
1086 return 1;
1089 #ifdef __i386__
1090 /* Flush a pgdir update into low-memory caches. */
1091 static void pae_flush_pgd(
1092 unsigned long mfn, unsigned int idx, l3_pgentry_t nl3e)
1094 struct domain *d = page_get_owner(mfn_to_page(mfn));
1095 struct vcpu *v;
1096 intpte_t _ol3e, _nl3e, _pl3e;
1097 l3_pgentry_t *l3tab_ptr;
1098 struct pae_l3_cache *cache;
1100 if ( unlikely(shadow_mode_enabled(d)) )
1102 cpumask_t m = CPU_MASK_NONE;
1103 /* Re-shadow this l3 table on any vcpus that are using it */
1104 for_each_vcpu ( d, v )
1105 if ( pagetable_get_pfn(v->arch.guest_table) == mfn )
1107 paging_update_cr3(v);
1108 cpus_or(m, m, v->vcpu_dirty_cpumask);
1110 flush_tlb_mask(m);
1113 /* If below 4GB then the pgdir is not shadowed in low memory. */
1114 if ( !l3tab_needs_shadow(mfn) )
1115 return;
1117 for_each_vcpu ( d, v )
1119 cache = &v->arch.pae_l3_cache;
1121 spin_lock(&cache->lock);
1123 if ( cache->high_mfn == mfn )
1125 l3tab_ptr = &cache->table[cache->inuse_idx][idx];
1126 _ol3e = l3e_get_intpte(*l3tab_ptr);
1127 _nl3e = l3e_get_intpte(nl3e);
1128 _pl3e = cmpxchg(&l3e_get_intpte(*l3tab_ptr), _ol3e, _nl3e);
1129 BUG_ON(_pl3e != _ol3e);
1132 spin_unlock(&cache->lock);
1135 flush_tlb_mask(d->domain_dirty_cpumask);
1137 #else
1138 # define pae_flush_pgd(mfn, idx, nl3e) ((void)0)
1139 #endif
1141 static int alloc_l2_table(struct page_info *page, unsigned long type,
1142 int preemptible)
1144 struct domain *d = page_get_owner(page);
1145 unsigned long pfn = page_to_mfn(page);
1146 l2_pgentry_t *pl2e;
1147 unsigned int i;
1148 int rc = 0;
1150 pl2e = map_domain_page(pfn);
1152 for ( i = page->nr_validated_ptes; i < L2_PAGETABLE_ENTRIES; i++ )
1154 if ( preemptible && i && hypercall_preempt_check() )
1156 page->nr_validated_ptes = i;
1157 rc = -EAGAIN;
1158 break;
1161 if ( !is_guest_l2_slot(d, type, i) ||
1162 (rc = get_page_from_l2e(pl2e[i], pfn, d)) > 0 )
1163 continue;
1165 if ( rc < 0 )
1167 MEM_LOG("Failure in alloc_l2_table: entry %d", i);
1168 while ( i-- > 0 )
1169 if ( is_guest_l2_slot(d, type, i) )
1170 put_page_from_l2e(pl2e[i], pfn);
1171 break;
1174 adjust_guest_l2e(pl2e[i], d);
1177 unmap_domain_page(pl2e);
1178 return rc > 0 ? 0 : rc;
1181 static int alloc_l3_table(struct page_info *page, int preemptible)
1183 struct domain *d = page_get_owner(page);
1184 unsigned long pfn = page_to_mfn(page);
1185 l3_pgentry_t *pl3e;
1186 unsigned int i;
1187 int rc = 0;
1189 #if CONFIG_PAGING_LEVELS == 3
1190 /*
1191 * PAE pgdirs above 4GB are unacceptable if the guest does not understand
1192 * the weird 'extended cr3' format for dealing with high-order address
1193 * bits. We cut some slack for control tools (before vcpu0 is initialised).
1194 */
1195 if ( (pfn >= 0x100000) &&
1196 unlikely(!VM_ASSIST(d, VMASST_TYPE_pae_extended_cr3)) &&
1197 d->vcpu[0] && d->vcpu[0]->is_initialised )
1199 MEM_LOG("PAE pgd must be below 4GB (0x%lx >= 0x100000)", pfn);
1200 return -EINVAL;
1202 #endif
1204 pl3e = map_domain_page(pfn);
1206 /*
1207 * PAE guests allocate full pages, but aren't required to initialize
1208 * more than the first four entries; when running in compatibility
1209 * mode, however, the full page is visible to the MMU, and hence all
1210 * 512 entries must be valid/verified, which is most easily achieved
1211 * by clearing them out.
1212 */
1213 if ( is_pv_32on64_domain(d) )
1214 memset(pl3e + 4, 0, (L3_PAGETABLE_ENTRIES - 4) * sizeof(*pl3e));
1216 for ( i = page->nr_validated_ptes; i < L3_PAGETABLE_ENTRIES; i++ )
1218 if ( is_pv_32bit_domain(d) && (i == 3) )
1220 if ( !(l3e_get_flags(pl3e[i]) & _PAGE_PRESENT) ||
1221 (l3e_get_flags(pl3e[i]) & l3_disallow_mask(d)) )
1222 rc = -EINVAL;
1223 else
1224 rc = get_page_and_type_from_pagenr(l3e_get_pfn(pl3e[i]),
1225 PGT_l2_page_table |
1226 PGT_pae_xen_l2,
1227 d, preemptible);
1229 else if ( !is_guest_l3_slot(i) ||
1230 (rc = get_page_from_l3e(pl3e[i], pfn, d, preemptible)) > 0 )
1231 continue;
1233 if ( rc == -EAGAIN )
1235 page->nr_validated_ptes = i;
1236 page->partial_pte = 1;
1238 else if ( rc == -EINTR && i )
1240 page->nr_validated_ptes = i;
1241 page->partial_pte = 0;
1242 rc = -EAGAIN;
1244 if ( rc < 0 )
1245 break;
1247 adjust_guest_l3e(pl3e[i], d);
1250 if ( rc >= 0 && !create_pae_xen_mappings(d, pl3e) )
1251 rc = -EINVAL;
1252 if ( rc < 0 && rc != -EAGAIN && rc != -EINTR )
1254 MEM_LOG("Failure in alloc_l3_table: entry %d", i);
1255 while ( i-- > 0 )
1257 if ( !is_guest_l3_slot(i) )
1258 continue;
1259 unadjust_guest_l3e(pl3e[i], d);
1260 put_page_from_l3e(pl3e[i], pfn, 0);
1264 unmap_domain_page(pl3e);
1265 return rc > 0 ? 0 : rc;
1268 #if CONFIG_PAGING_LEVELS >= 4
1269 static int alloc_l4_table(struct page_info *page, int preemptible)
1271 struct domain *d = page_get_owner(page);
1272 unsigned long pfn = page_to_mfn(page);
1273 l4_pgentry_t *pl4e = page_to_virt(page);
1274 unsigned int i;
1275 int rc = 0;
1277 for ( i = page->nr_validated_ptes; i < L4_PAGETABLE_ENTRIES; i++ )
1279 if ( !is_guest_l4_slot(d, i) ||
1280 (rc = get_page_from_l4e(pl4e[i], pfn, d, preemptible)) > 0 )
1281 continue;
1283 if ( rc == -EAGAIN )
1285 page->nr_validated_ptes = i;
1286 page->partial_pte = 1;
1288 else if ( rc == -EINTR )
1290 if ( i )
1292 page->nr_validated_ptes = i;
1293 page->partial_pte = 0;
1294 rc = -EAGAIN;
1297 else if ( rc < 0 )
1299 MEM_LOG("Failure in alloc_l4_table: entry %d", i);
1300 while ( i-- > 0 )
1301 if ( is_guest_l4_slot(d, i) )
1302 put_page_from_l4e(pl4e[i], pfn, 0);
1304 if ( rc < 0 )
1305 return rc;
1307 adjust_guest_l4e(pl4e[i], d);
1310 /* Xen private mappings. */
1311 memcpy(&pl4e[ROOT_PAGETABLE_FIRST_XEN_SLOT],
1312 &idle_pg_table[ROOT_PAGETABLE_FIRST_XEN_SLOT],
1313 ROOT_PAGETABLE_XEN_SLOTS * sizeof(l4_pgentry_t));
1314 pl4e[l4_table_offset(LINEAR_PT_VIRT_START)] =
1315 l4e_from_pfn(pfn, __PAGE_HYPERVISOR);
1316 pl4e[l4_table_offset(PERDOMAIN_VIRT_START)] =
1317 l4e_from_page(virt_to_page(d->arch.mm_perdomain_l3),
1318 __PAGE_HYPERVISOR);
1320 return rc > 0 ? 0 : rc;
1322 #else
1323 #define alloc_l4_table(page, preemptible) (-EINVAL)
1324 #endif
1327 static void free_l1_table(struct page_info *page)
1329 struct domain *d = page_get_owner(page);
1330 unsigned long pfn = page_to_mfn(page);
1331 l1_pgentry_t *pl1e;
1332 unsigned int i;
1334 pl1e = map_domain_page(pfn);
1336 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
1337 if ( is_guest_l1_slot(i) )
1338 put_page_from_l1e(pl1e[i], d);
1340 unmap_domain_page(pl1e);
1344 static int free_l2_table(struct page_info *page, int preemptible)
1346 #ifdef CONFIG_COMPAT
1347 struct domain *d = page_get_owner(page);
1348 #endif
1349 unsigned long pfn = page_to_mfn(page);
1350 l2_pgentry_t *pl2e;
1351 unsigned int i = page->nr_validated_ptes - 1;
1352 int err = 0;
1354 pl2e = map_domain_page(pfn);
1356 ASSERT(page->nr_validated_ptes);
1357 do {
1358 if ( is_guest_l2_slot(d, page->u.inuse.type_info, i) &&
1359 put_page_from_l2e(pl2e[i], pfn) == 0 &&
1360 preemptible && i && hypercall_preempt_check() )
1362 page->nr_validated_ptes = i;
1363 err = -EAGAIN;
1365 } while ( !err && i-- );
1367 unmap_domain_page(pl2e);
1369 if ( !err )
1370 page->u.inuse.type_info &= ~PGT_pae_xen_l2;
1372 return err;
1375 static int free_l3_table(struct page_info *page, int preemptible)
1377 struct domain *d = page_get_owner(page);
1378 unsigned long pfn = page_to_mfn(page);
1379 l3_pgentry_t *pl3e;
1380 unsigned int i = page->nr_validated_ptes - !page->partial_pte;
1381 int rc = 0;
1383 #ifdef DOMAIN_DESTRUCT_AVOID_RECURSION
1384 if ( d->arch.relmem == RELMEM_l3 )
1385 return 0;
1386 #endif
1388 pl3e = map_domain_page(pfn);
1390 do {
1391 if ( is_guest_l3_slot(i) )
1393 rc = put_page_from_l3e(pl3e[i], pfn, preemptible);
1394 if ( rc > 0 )
1395 continue;
1396 if ( rc )
1397 break;
1398 unadjust_guest_l3e(pl3e[i], d);
1400 } while ( i-- );
1402 unmap_domain_page(pl3e);
1404 if ( rc == -EAGAIN )
1406 page->nr_validated_ptes = i;
1407 page->partial_pte = 1;
1409 else if ( rc == -EINTR && i < L3_PAGETABLE_ENTRIES - 1 )
1411 page->nr_validated_ptes = i + 1;
1412 page->partial_pte = 0;
1413 rc = -EAGAIN;
1415 return rc > 0 ? 0 : rc;
1418 #if CONFIG_PAGING_LEVELS >= 4
1419 static int free_l4_table(struct page_info *page, int preemptible)
1421 struct domain *d = page_get_owner(page);
1422 unsigned long pfn = page_to_mfn(page);
1423 l4_pgentry_t *pl4e = page_to_virt(page);
1424 unsigned int i = page->nr_validated_ptes - !page->partial_pte;
1425 int rc = 0;
1427 #ifdef DOMAIN_DESTRUCT_AVOID_RECURSION
1428 if ( d->arch.relmem == RELMEM_l4 )
1429 return 0;
1430 #endif
1432 do {
1433 if ( is_guest_l4_slot(d, i) )
1434 rc = put_page_from_l4e(pl4e[i], pfn, preemptible);
1435 } while ( rc >= 0 && i-- );
1437 if ( rc == -EAGAIN )
1439 page->nr_validated_ptes = i;
1440 page->partial_pte = 1;
1442 else if ( rc == -EINTR && i < L4_PAGETABLE_ENTRIES - 1 )
1444 page->nr_validated_ptes = i + 1;
1445 page->partial_pte = 0;
1446 rc = -EAGAIN;
1448 return rc > 0 ? 0 : rc;
1450 #else
1451 #define free_l4_table(page, preemptible) (-EINVAL)
1452 #endif
1454 static void page_lock(struct page_info *page)
1456 #if defined(__i386__)
1457 while ( unlikely(test_and_set_bit(_PGC_locked, &page->count_info)) )
1458 while ( test_bit(_PGC_locked, &page->count_info) )
1459 cpu_relax();
1460 #else
1461 spin_lock(&page->lock);
1462 #endif
1465 static void page_unlock(struct page_info *page)
1467 #if defined(__i386__)
1468 clear_bit(_PGC_locked, &page->count_info);
1469 #else
1470 spin_unlock(&page->lock);
1471 #endif
1474 /* How to write an entry to the guest pagetables.
1475 * Returns 0 for failure (pointer not valid), 1 for success. */
1476 static inline int update_intpte(intpte_t *p,
1477 intpte_t old,
1478 intpte_t new,
1479 unsigned long mfn,
1480 struct vcpu *v,
1481 int preserve_ad)
1483 int rv = 1;
1484 #ifndef PTE_UPDATE_WITH_CMPXCHG
1485 if ( !preserve_ad )
1487 rv = paging_write_guest_entry(v, p, new, _mfn(mfn));
1489 else
1490 #endif
1492 intpte_t t = old;
1493 for ( ; ; )
1495 intpte_t _new = new;
1496 if ( preserve_ad )
1497 _new |= old & (_PAGE_ACCESSED | _PAGE_DIRTY);
1499 rv = paging_cmpxchg_guest_entry(v, p, &t, _new, _mfn(mfn));
1500 if ( unlikely(rv == 0) )
1502 MEM_LOG("Failed to update %" PRIpte " -> %" PRIpte
1503 ": saw %" PRIpte, old, _new, t);
1504 break;
1507 if ( t == old )
1508 break;
1510 /* The entry is only allowed to have changed in its Accessed/Dirty flags. */
1511 BUG_ON((t ^ old) & ~(intpte_t)(_PAGE_ACCESSED|_PAGE_DIRTY));
1513 old = t;
1516 return rv;
1519 /* Macro that wraps the appropriate type-changes around update_intpte().
1520 * Arguments are: type, ptr, old, new, mfn, vcpu, preserve_ad */
1521 #define UPDATE_ENTRY(_t,_p,_o,_n,_m,_v,_ad) \
1522 update_intpte(&_t ## e_get_intpte(*(_p)), \
1523 _t ## e_get_intpte(_o), _t ## e_get_intpte(_n), \
1524 (_m), (_v), (_ad))
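/*
 * For reference, UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, curr, ad)
 * expands to:
 *
 *     update_intpte(&l1e_get_intpte(*(pl1e)),
 *                   l1e_get_intpte(ol1e), l1e_get_intpte(nl1e),
 *                   (gl1mfn), (curr), (ad))
 *
 * i.e. the typed per-level entry is reduced to its raw intpte_t before the
 * (possibly cmpxchg-based) write in update_intpte() is attempted.
 */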
1526 /* Update the L1 entry at pl1e to new value nl1e. */
1527 static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e,
1528 unsigned long gl1mfn, int preserve_ad)
1530 l1_pgentry_t ol1e;
1531 struct vcpu *curr = current;
1532 struct domain *d = curr->domain;
1533 unsigned long mfn;
1534 struct page_info *l1pg = mfn_to_page(gl1mfn);
1535 int rc = 1;
1537 page_lock(l1pg);
1539 if ( unlikely(__copy_from_user(&ol1e, pl1e, sizeof(ol1e)) != 0) )
1540 return page_unlock(l1pg), 0;
1542 if ( unlikely(paging_mode_refcounts(d)) )
1544 rc = UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, curr, preserve_ad);
1545 page_unlock(l1pg);
1546 return rc;
1549 if ( l1e_get_flags(nl1e) & _PAGE_PRESENT )
1551 /* Translate foreign guest addresses. */
1552 mfn = gmfn_to_mfn(FOREIGNDOM, l1e_get_pfn(nl1e));
1553 if ( unlikely(mfn == INVALID_MFN) )
1554 return page_unlock(l1pg), 0;
1555 ASSERT((mfn & ~(PADDR_MASK >> PAGE_SHIFT)) == 0);
1556 nl1e = l1e_from_pfn(mfn, l1e_get_flags(nl1e));
1558 if ( unlikely(l1e_get_flags(nl1e) & l1_disallow_mask(d)) )
1560 page_unlock(l1pg);
1561 MEM_LOG("Bad L1 flags %x",
1562 l1e_get_flags(nl1e) & l1_disallow_mask(d));
1563 return 0;
1566 /* Fast path for identical mapping, r/w and presence. */
1567 if ( !l1e_has_changed(ol1e, nl1e, _PAGE_RW | _PAGE_PRESENT) )
1569 adjust_guest_l1e(nl1e, d);
1570 rc = UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, curr,
1571 preserve_ad);
1572 page_unlock(l1pg);
1573 return rc;
1576 if ( unlikely(!get_page_from_l1e(nl1e, FOREIGNDOM)) )
1577 return page_unlock(l1pg), 0;
1579 adjust_guest_l1e(nl1e, d);
1580 if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, curr,
1581 preserve_ad)) )
1583 ol1e = nl1e;
1584 rc = 0;
1587 else if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, curr,
1588 preserve_ad)) )
1590 page_unlock(l1pg);
1591 return 0;
1594 page_unlock(l1pg);
1595 put_page_from_l1e(ol1e, d);
1596 return rc;
1600 /* Update the L2 entry at pl2e to new value nl2e. pl2e is within frame pfn. */
1601 static int mod_l2_entry(l2_pgentry_t *pl2e,
1602 l2_pgentry_t nl2e,
1603 unsigned long pfn,
1604 unsigned long type,
1605 int preserve_ad)
1607 l2_pgentry_t ol2e;
1608 struct vcpu *curr = current;
1609 struct domain *d = curr->domain;
1610 struct page_info *l2pg = mfn_to_page(pfn);
1611 int rc = 1;
1613 if ( unlikely(!is_guest_l2_slot(d, type, pgentry_ptr_to_slot(pl2e))) )
1615 MEM_LOG("Illegal L2 update attempt in Xen-private area %p", pl2e);
1616 return 0;
1619 page_lock(l2pg);
1621 if ( unlikely(__copy_from_user(&ol2e, pl2e, sizeof(ol2e)) != 0) )
1622 return page_unlock(l2pg), 0;
1624 if ( l2e_get_flags(nl2e) & _PAGE_PRESENT )
1626 if ( unlikely(l2e_get_flags(nl2e) & L2_DISALLOW_MASK) )
1628 page_unlock(l2pg);
1629 MEM_LOG("Bad L2 flags %x",
1630 l2e_get_flags(nl2e) & L2_DISALLOW_MASK);
1631 return 0;
1634 /* Fast path for identical mapping and presence. */
1635 if ( !l2e_has_changed(ol2e, nl2e, _PAGE_PRESENT) )
1637 adjust_guest_l2e(nl2e, d);
1638 rc = UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, curr, preserve_ad);
1639 page_unlock(l2pg);
1640 return rc;
1643 if ( unlikely(get_page_from_l2e(nl2e, pfn, d) < 0) )
1644 return page_unlock(l2pg), 0;
1646 adjust_guest_l2e(nl2e, d);
1647 if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, curr,
1648 preserve_ad)) )
1650 ol2e = nl2e;
1651 rc = 0;
1654 else if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, curr,
1655 preserve_ad)) )
1657 page_unlock(l2pg);
1658 return 0;
1661 page_unlock(l2pg);
1662 put_page_from_l2e(ol2e, pfn);
1663 return rc;
1666 /* Update the L3 entry at pl3e to new value nl3e. pl3e is within frame pfn. */
1667 static int mod_l3_entry(l3_pgentry_t *pl3e,
1668 l3_pgentry_t nl3e,
1669 unsigned long pfn,
1670 int preserve_ad,
1671 int preemptible)
1673 l3_pgentry_t ol3e;
1674 struct vcpu *curr = current;
1675 struct domain *d = curr->domain;
1676 struct page_info *l3pg = mfn_to_page(pfn);
1677 int rc = 0;
1679 if ( unlikely(!is_guest_l3_slot(pgentry_ptr_to_slot(pl3e))) )
1681 MEM_LOG("Illegal L3 update attempt in Xen-private area %p", pl3e);
1682 return -EINVAL;
1685 /*
1686 * Disallow updates to final L3 slot. It contains Xen mappings, and it
1687 * would be a pain to ensure they remain continuously valid throughout.
1688 */
1689 if ( is_pv_32bit_domain(d) && (pgentry_ptr_to_slot(pl3e) >= 3) )
1690 return -EINVAL;
1692 page_lock(l3pg);
1694 if ( unlikely(__copy_from_user(&ol3e, pl3e, sizeof(ol3e)) != 0) )
1695 return page_unlock(l3pg), -EFAULT;
1697 if ( l3e_get_flags(nl3e) & _PAGE_PRESENT )
1699 if ( unlikely(l3e_get_flags(nl3e) & l3_disallow_mask(d)) )
1701 page_unlock(l3pg);
1702 MEM_LOG("Bad L3 flags %x",
1703 l3e_get_flags(nl3e) & l3_disallow_mask(d));
1704 return -EINVAL;
1707 /* Fast path for identical mapping and presence. */
1708 if ( !l3e_has_changed(ol3e, nl3e, _PAGE_PRESENT) )
1710 adjust_guest_l3e(nl3e, d);
1711 rc = UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, curr, preserve_ad);
1712 page_unlock(l3pg);
1713 return rc ? 0 : -EFAULT;
1716 rc = get_page_from_l3e(nl3e, pfn, d, preemptible);
1717 if ( unlikely(rc < 0) )
1718 return page_unlock(l3pg), rc;
1719 rc = 0;
1721 adjust_guest_l3e(nl3e, d);
1722 if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, curr,
1723 preserve_ad)) )
1725 ol3e = nl3e;
1726 rc = -EFAULT;
1729 else if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, curr,
1730 preserve_ad)) )
1732 page_unlock(l3pg);
1733 return -EFAULT;
1736 if ( likely(rc == 0) )
1738 if ( !create_pae_xen_mappings(d, pl3e) )
1739 BUG();
1741 pae_flush_pgd(pfn, pgentry_ptr_to_slot(pl3e), nl3e);
1744 page_unlock(l3pg);
1745 put_page_from_l3e(ol3e, pfn, 0);
1746 return rc;
1749 #if CONFIG_PAGING_LEVELS >= 4
1751 /* Update the L4 entry at pl4e to new value nl4e. pl4e is within frame pfn. */
1752 static int mod_l4_entry(l4_pgentry_t *pl4e,
1753 l4_pgentry_t nl4e,
1754 unsigned long pfn,
1755 int preserve_ad,
1756 int preemptible)
1758 struct vcpu *curr = current;
1759 struct domain *d = curr->domain;
1760 l4_pgentry_t ol4e;
1761 struct page_info *l4pg = mfn_to_page(pfn);
1762 int rc = 0;
1764 if ( unlikely(!is_guest_l4_slot(d, pgentry_ptr_to_slot(pl4e))) )
1766 MEM_LOG("Illegal L4 update attempt in Xen-private area %p", pl4e);
1767 return -EINVAL;
1770 page_lock(l4pg);
1772 if ( unlikely(__copy_from_user(&ol4e, pl4e, sizeof(ol4e)) != 0) )
1773 return page_unlock(l4pg), -EFAULT;
1775 if ( l4e_get_flags(nl4e) & _PAGE_PRESENT )
1777 if ( unlikely(l4e_get_flags(nl4e) & L4_DISALLOW_MASK) )
1779 page_unlock(l4pg);
1780 MEM_LOG("Bad L4 flags %x",
1781 l4e_get_flags(nl4e) & L4_DISALLOW_MASK);
1782 return -EINVAL;
1785 /* Fast path for identical mapping and presence. */
1786 if ( !l4e_has_changed(ol4e, nl4e, _PAGE_PRESENT) )
1788 adjust_guest_l4e(nl4e, d);
1789 rc = UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, curr, preserve_ad);
1790 page_unlock(l4pg);
1791 return rc ? 0 : -EFAULT;
1794 rc = get_page_from_l4e(nl4e, pfn, d, preemptible);
1795 if ( unlikely(rc < 0) )
1796 return page_unlock(l4pg), rc;
1797 rc = 0;
1799 adjust_guest_l4e(nl4e, d);
1800 if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, curr,
1801 preserve_ad)) )
1803 ol4e = nl4e;
1804 rc = -EFAULT;
1807 else if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, curr,
1808 preserve_ad)) )
1810 page_unlock(l4pg);
1811 return -EFAULT;
1814 page_unlock(l4pg);
1815 put_page_from_l4e(ol4e, pfn, 0);
1816 return rc;
1819 #endif
1821 void put_page(struct page_info *page)
1823 u32 nx, x, y = page->count_info;
1825 do {
1826 x = y;
1827 nx = x - 1;
1829 while ( unlikely((y = cmpxchg(&page->count_info, x, nx)) != x) );
1831 if ( unlikely((nx & PGC_count_mask) == 0) )
1833 cleanup_page_cacheattr(page);
1834 free_domheap_page(page);
1839 int get_page(struct page_info *page, struct domain *domain)
1841 u32 x, nx, y = page->count_info;
1842 u32 d, nd = page->u.inuse._domain;
1843 u32 _domain = pickle_domptr(domain);
1845 do {
1846 x = y;
1847 nx = x + 1;
1848 d = nd;
1849 if ( unlikely((x & PGC_count_mask) == 0) || /* Not allocated? */
1850 unlikely((nx & PGC_count_mask) == 0) || /* Count overflow? */
1851 unlikely(d != _domain) ) /* Wrong owner? */
1853 if ( !_shadow_mode_refcounts(domain) && !domain->is_dying )
1854 gdprintk(XENLOG_INFO,
1855 "Error pfn %lx: rd=%p, od=%p, caf=%08x, taf=%"
1856 PRtype_info "\n",
1857 page_to_mfn(page), domain, unpickle_domptr(d),
1858 x, page->u.inuse.type_info);
1859 return 0;
1861 asm volatile (
1862 LOCK_PREFIX "cmpxchg8b %2"
1863 : "=d" (nd), "=a" (y),
1864 "=m" (*(volatile u64 *)(&page->count_info))
1865 : "0" (d), "1" (x), "c" (d), "b" (nx) );
1867 while ( unlikely(nd != d) || unlikely(y != x) );
1869 return 1;
1873 static int alloc_page_type(struct page_info *page, unsigned long type,
1874 int preemptible)
1876 struct domain *owner = page_get_owner(page);
1877 int rc;
1879 /* A page table is dirtied when its type count becomes non-zero. */
1880 if ( likely(owner != NULL) )
1881 paging_mark_dirty(owner, page_to_mfn(page));
1883 switch ( type & PGT_type_mask )
1885 case PGT_l1_page_table:
1886 alloc_l1_table(page);
1887 rc = 0;
1888 break;
1889 case PGT_l2_page_table:
1890 rc = alloc_l2_table(page, type, preemptible);
1891 break;
1892 case PGT_l3_page_table:
1893 rc = alloc_l3_table(page, preemptible);
1894 break;
1895 case PGT_l4_page_table:
1896 rc = alloc_l4_table(page, preemptible);
1897 break;
1898 case PGT_seg_desc_page:
1899 rc = alloc_segdesc_page(page);
1900 break;
1901 default:
1902 printk("Bad type in alloc_page_type %lx t=%" PRtype_info " c=%x\n",
1903 type, page->u.inuse.type_info,
1904 page->count_info);
1905 rc = -EINVAL;
1906 BUG();
1909 /* No need for atomic update of type_info here: no one else updates it. */
1910 wmb();
1911 if ( rc == -EAGAIN )
1913 page->u.inuse.type_info |= PGT_partial;
1915 else if ( rc == -EINTR )
1917 ASSERT((page->u.inuse.type_info &
1918 (PGT_count_mask|PGT_validated|PGT_partial)) == 1);
1919 page->u.inuse.type_info &= ~PGT_count_mask;
1921 else if ( rc )
1923 ASSERT(rc < 0);
1924 MEM_LOG("Error while validating mfn %lx (pfn %lx) for type %"
1925 PRtype_info ": caf=%08x taf=%" PRtype_info,
1926 page_to_mfn(page), get_gpfn_from_mfn(page_to_mfn(page)),
1927 type, page->count_info, page->u.inuse.type_info);
1928 page->u.inuse.type_info = 0;
1930 else
1932 page->u.inuse.type_info |= PGT_validated;
1935 return rc;
1939 int free_page_type(struct page_info *page, unsigned long type,
1940 int preemptible)
1942 struct domain *owner = page_get_owner(page);
1943 unsigned long gmfn;
1944 int rc;
1946 if ( likely(owner != NULL) )
1948 /*
1949 * We have to flush before the next use of the linear mapping
1950 * (e.g., update_va_mapping()) or we could end up modifying a page
1951 * that is no longer a page table (and hence screw up ref counts).
1952 */
1953 if ( current->domain == owner )
1954 queue_deferred_ops(owner, DOP_FLUSH_ALL_TLBS);
1955 else
1956 flush_tlb_mask(owner->domain_dirty_cpumask);
1958 if ( unlikely(paging_mode_enabled(owner)) )
1960 /* A page table is dirtied when its type count becomes zero. */
1961 paging_mark_dirty(owner, page_to_mfn(page));
1963 if ( shadow_mode_refcounts(owner) )
1964 return 0;
1966 gmfn = mfn_to_gmfn(owner, page_to_mfn(page));
1967 ASSERT(VALID_M2P(gmfn));
1968 shadow_remove_all_shadows(owner->vcpu[0], _mfn(gmfn));
1972 if ( !(type & PGT_partial) )
1974 page->nr_validated_ptes = 1U << PAGETABLE_ORDER;
1975 page->partial_pte = 0;
1977 switch ( type & PGT_type_mask )
1979 case PGT_l1_page_table:
1980 free_l1_table(page);
1981 rc = 0;
1982 break;
1983 case PGT_l2_page_table:
1984 rc = free_l2_table(page, preemptible);
1985 break;
1986 case PGT_l3_page_table:
1987 #if CONFIG_PAGING_LEVELS == 3
1988 if ( !(type & PGT_partial) )
1989 page->nr_validated_ptes = L3_PAGETABLE_ENTRIES;
1990 #endif
1991 rc = free_l3_table(page, preemptible);
1992 break;
1993 case PGT_l4_page_table:
1994 rc = free_l4_table(page, preemptible);
1995 break;
1996 default:
1997 MEM_LOG("type %lx pfn %lx", type, page_to_mfn(page));
1998 rc = -EINVAL;
1999 BUG();
2002 /* No need for atomic update of type_info here: no one else updates it. */
2003 if ( rc == 0 )
2005 /*
2006 * Record TLB information for flush later. We do not stamp page tables
2007 * when running in shadow mode:
2008 * 1. Pointless, since it's the shadow pt's which must be tracked.
2009 * 2. Shadow mode reuses this field for shadowed page tables to
2010 * store flags info -- we don't want to conflict with that.
2011 */
2012 if ( !(shadow_mode_enabled(page_get_owner(page)) &&
2013 (page->count_info & PGC_page_table)) )
2014 page->tlbflush_timestamp = tlbflush_current_time();
2015 wmb();
2016 page->u.inuse.type_info--;
2018 else if ( rc == -EINTR )
2020 ASSERT(!(page->u.inuse.type_info &
2021 (PGT_count_mask|PGT_validated|PGT_partial)));
2022 if ( !(shadow_mode_enabled(page_get_owner(page)) &&
2023 (page->count_info & PGC_page_table)) )
2024 page->tlbflush_timestamp = tlbflush_current_time();
2025 wmb();
2026 page->u.inuse.type_info |= PGT_validated;
2028 else
2030 BUG_ON(rc != -EAGAIN);
2031 wmb();
2032 page->u.inuse.type_info |= PGT_partial;
2035 return rc;
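/*
 * __put_page_type(): drop one type reference using a lock-free cmpxchg
 * loop.  When the count would reach zero on a page-table page that is
 * still validated (or partially validated), the PGT_validated/PGT_partial
 * bits are cleared first and free_page_type() is called to devalidate it;
 * other operations spin or fail while the count is non-zero with the
 * validated bit clear.  With 'preemptible' set the loop may bail out with
 * -EINTR at its preemption point.
 */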
2039 static int __put_page_type(struct page_info *page,
2040 int preemptible)
2042 unsigned long nx, x, y = page->u.inuse.type_info;
2044 for ( ; ; )
2046 x = y;
2047 nx = x - 1;
2049 ASSERT((x & PGT_count_mask) != 0);
2051 if ( unlikely((nx & PGT_count_mask) == 0) )
2053 if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
2054 likely(nx & (PGT_validated|PGT_partial)) )
2056 /*
2057 * Page-table pages must be unvalidated when count is zero. The
2058 * 'free' is safe because the refcnt is non-zero and validated
2059 * bit is clear => other ops will spin or fail.
2060 */
2061 nx = x & ~(PGT_validated|PGT_partial);
2062 if ( unlikely((y = cmpxchg(&page->u.inuse.type_info,
2063 x, nx)) != x) )
2064 continue;
2065 /* We cleared the 'valid' bit so we do the cleanup. */
2066 return free_page_type(page, x, preemptible);
2069 /*
2070 * Record TLB information for flush later. We do not stamp page
2071 * tables when running in shadow mode:
2072 * 1. Pointless, since it is the shadow PTs which must be tracked.
2073 * 2. Shadow mode reuses this field for shadowed page tables to
2074 * store flags info -- we don't want to conflict with that.
2075 */
2076 if ( !(shadow_mode_enabled(page_get_owner(page)) &&
2077 (page->count_info & PGC_page_table)) )
2078 page->tlbflush_timestamp = tlbflush_current_time();
2081 if ( likely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) == x) )
2082 break;
2084 if ( preemptible && hypercall_preempt_check() )
2085 return -EINTR;
2088 return 0;
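/*
 * __get_page_type(): take one type reference, again via a cmpxchg loop.
 * A 0 -> 1 transition may change the page's type: stale TLB entries are
 * flushed (filtered by tlbflush_timestamp) and, for shadowed domains,
 * existing shadows of the page are normally removed first.  If another CPU
 * is mid-validation the loop waits; if a previous validation left
 * PGT_partial set, it is resumed.  On a change to or from
 * PGT_writable_page the frame is also mapped into or unmapped from the
 * IOMMU for domains that need it.  A page whose new type is not yet
 * validated is finally handed to alloc_page_type().  With 'preemptible'
 * set, -EINTR may be returned at the preemption points.
 */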
2092 static int __get_page_type(struct page_info *page, unsigned long type,
2093 int preemptible)
2095 unsigned long nx, x, y = page->u.inuse.type_info;
2097 ASSERT(!(type & ~(PGT_type_mask | PGT_pae_xen_l2)));
2099 for ( ; ; )
2101 x = y;
2102 nx = x + 1;
2103 if ( unlikely((nx & PGT_count_mask) == 0) )
2105 MEM_LOG("Type count overflow on pfn %lx", page_to_mfn(page));
2106 return -EINVAL;
2108 else if ( unlikely((x & PGT_count_mask) == 0) )
2110 struct domain *d = page_get_owner(page);
2112 /* Normally we should never let a page go from type count 0
2113 * to type count 1 when it is shadowed. One exception:
2114 * out-of-sync shadowed pages are allowed to become
2115 * writable. */
2116 if ( d && shadow_mode_enabled(d)
2117 && (page->count_info & PGC_page_table)
2118 && !((page->shadow_flags & (1u<<29))
2119 && type == PGT_writable_page) )
2120 shadow_remove_all_shadows(d->vcpu[0], _mfn(page_to_mfn(page)));
2122 ASSERT(!(x & PGT_pae_xen_l2));
2123 if ( (x & PGT_type_mask) != type )
2125 /*
2126 * On a type change we check whether to flush stale TLB entries. This
2127 * may be unnecessary (e.g., page was GDT/LDT) but those
2128 * circumstances should be very rare.
2129 */
2130 cpumask_t mask = d->domain_dirty_cpumask;
2132 /* Don't flush if the timestamp is old enough */
2133 tlbflush_filter(mask, page->tlbflush_timestamp);
2135 if ( unlikely(!cpus_empty(mask)) &&
2136 /* Shadow mode: track only writable pages. */
2137 (!shadow_mode_enabled(page_get_owner(page)) ||
2138 ((nx & PGT_type_mask) == PGT_writable_page)) )
2140 perfc_incr(need_flush_tlb_flush);
2141 flush_tlb_mask(mask);
2144 /* We lose existing type and validity. */
2145 nx &= ~(PGT_type_mask | PGT_validated);
2146 nx |= type;
2148 /* No special validation needed for writable pages. */
2149 /* Page tables and GDT/LDT need to be scanned for validity. */
2150 if ( type == PGT_writable_page )
2151 nx |= PGT_validated;
2154 else if ( unlikely((x & (PGT_type_mask|PGT_pae_xen_l2)) != type) )
2156 /* Don't log failure if it could be a recursive-mapping attempt. */
2157 if ( ((x & PGT_type_mask) == PGT_l2_page_table) &&
2158 (type == PGT_l1_page_table) )
2159 return -EINVAL;
2160 if ( ((x & PGT_type_mask) == PGT_l3_page_table) &&
2161 (type == PGT_l2_page_table) )
2162 return -EINVAL;
2163 if ( ((x & PGT_type_mask) == PGT_l4_page_table) &&
2164 (type == PGT_l3_page_table) )
2165 return -EINVAL;
2166 MEM_LOG("Bad type (saw %" PRtype_info " != exp %" PRtype_info ") "
2167 "for mfn %lx (pfn %lx)",
2168 x, type, page_to_mfn(page),
2169 get_gpfn_from_mfn(page_to_mfn(page)));
2170 return -EINVAL;
2172 else if ( unlikely(!(x & PGT_validated)) )
2174 if ( !(x & PGT_partial) )
2176 /* Someone else is updating validation of this page. Wait... */
2177 while ( (y = page->u.inuse.type_info) == x )
2179 if ( preemptible && hypercall_preempt_check() )
2180 return -EINTR;
2181 cpu_relax();
2183 continue;
2185 /* Type ref count was left at 1 when PGT_partial got set. */
2186 ASSERT((x & PGT_count_mask) == 1);
2187 nx = x & ~PGT_partial;
2190 if ( likely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) == x) )
2191 break;
2193 if ( preemptible && hypercall_preempt_check() )
2194 return -EINTR;
2197 if ( unlikely((x & PGT_type_mask) != type) )
2199 /* Special pages should not be accessible from devices. */
2200 struct domain *d = page_get_owner(page);
2201 if ( d && unlikely(need_iommu(d)) )
2203 if ( (x & PGT_type_mask) == PGT_writable_page )
2204 iommu_unmap_page(d, mfn_to_gmfn(d, page_to_mfn(page)));
2205 else if ( type == PGT_writable_page )
2206 iommu_map_page(d, mfn_to_gmfn(d, page_to_mfn(page)),
2207 page_to_mfn(page));
2211 if ( unlikely(!(nx & PGT_validated)) )
2213 if ( !(x & PGT_partial) )
2215 page->nr_validated_ptes = 0;
2216 page->partial_pte = 0;
2218 return alloc_page_type(page, type, preemptible);
2221 return 0;
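/*
 * Public wrappers.  The non-preemptible variants below must not see
 * -EINTR/-EAGAIN: put_page_type() asserts success and get_page_type()
 * folds the result into a boolean.  The *_preemptible variants pass the
 * error code straight back so callers can retry or continue the hypercall.
 */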
2224 void put_page_type(struct page_info *page)
2226 int rc = __put_page_type(page, 0);
2227 ASSERT(rc == 0);
2228 (void)rc;
2231 int get_page_type(struct page_info *page, unsigned long type)
2233 int rc = __get_page_type(page, type, 0);
2234 if ( likely(rc == 0) )
2235 return 1;
2236 ASSERT(rc == -EINVAL);
2237 return 0;
2240 int put_page_type_preemptible(struct page_info *page)
2242 return __put_page_type(page, 1);
2245 int get_page_type_preemptible(struct page_info *page, unsigned long type)
2247 return __get_page_type(page, type, 1);
2250 void cleanup_page_cacheattr(struct page_info *page)
2252 uint32_t cacheattr = (page->count_info >> PGC_cacheattr_base) & 7;
2254 if ( likely(cacheattr == 0) )
2255 return;
2257 page->count_info &= ~PGC_cacheattr_mask;
2259 BUG_ON(is_xen_heap_page(page));
2261 #ifdef __x86_64__
2262 map_pages_to_xen((unsigned long)page_to_virt(page), page_to_mfn(page),
2263 1, PAGE_HYPERVISOR);
2264 #endif
2268 int new_guest_cr3(unsigned long mfn)
2270 struct vcpu *v = current;
2271 struct domain *d = v->domain;
2272 int okay;
2273 unsigned long old_base_mfn;
2275 #ifdef CONFIG_COMPAT
2276 if ( is_pv_32on64_domain(d) )
2278 okay = paging_mode_refcounts(d)
2279 ? 0 /* Old code was broken, but what should it be? */
2280 : mod_l4_entry(
2281 __va(pagetable_get_paddr(v->arch.guest_table)),
2282 l4e_from_pfn(
2283 mfn,
2284 (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_ACCESSED)),
2285 pagetable_get_pfn(v->arch.guest_table), 0, 0) == 0;
2286 if ( unlikely(!okay) )
2288 MEM_LOG("Error while installing new compat baseptr %lx", mfn);
2289 return 0;
2292 invalidate_shadow_ldt(v);
2293 write_ptbase(v);
2295 return 1;
2297 #endif
2298 okay = paging_mode_refcounts(d)
2299 ? get_page_from_pagenr(mfn, d)
2300 : !get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d, 0);
2301 if ( unlikely(!okay) )
2303 MEM_LOG("Error while installing new baseptr %lx", mfn);
2304 return 0;
2307 invalidate_shadow_ldt(v);
2309 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
2311 v->arch.guest_table = pagetable_from_pfn(mfn);
2312 update_cr3(v);
2314 write_ptbase(v);
2316 if ( likely(old_base_mfn != 0) )
2318 if ( paging_mode_refcounts(d) )
2319 put_page(mfn_to_page(old_base_mfn));
2320 else
2321 put_page_and_type(mfn_to_page(old_base_mfn));
2324 return 1;
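/*
 * process_deferred_ops(): perform the TLB flushes and LDT reload requested
 * via this CPU's percpu_mm_info during the current hypercall, and drop the
 * foreign-domain reference taken by set_foreigndom(), if any.
 */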
2327 static void process_deferred_ops(void)
2329 unsigned int deferred_ops;
2330 struct domain *d = current->domain;
2331 struct percpu_mm_info *info = &this_cpu(percpu_mm_info);
2333 deferred_ops = info->deferred_ops;
2334 info->deferred_ops = 0;
2336 if ( deferred_ops & (DOP_FLUSH_ALL_TLBS|DOP_FLUSH_TLB) )
2338 if ( deferred_ops & DOP_FLUSH_ALL_TLBS )
2339 flush_tlb_mask(d->domain_dirty_cpumask);
2340 else
2341 flush_tlb_local();
2344 if ( deferred_ops & DOP_RELOAD_LDT )
2345 (void)map_ldt_shadow_page(0);
2347 if ( unlikely(info->foreign != NULL) )
2349 rcu_unlock_domain(info->foreign);
2350 info->foreign = NULL;
2354 static int set_foreigndom(domid_t domid)
2356 struct domain *e, *d = current->domain;
2357 struct percpu_mm_info *info = &this_cpu(percpu_mm_info);
2358 int okay = 1;
2360 ASSERT(info->foreign == NULL);
2362 if ( likely(domid == DOMID_SELF) )
2363 goto out;
2365 if ( unlikely(domid == d->domain_id) )
2367 MEM_LOG("Cannot specify itself as foreign domain");
2368 okay = 0;
2370 else if ( unlikely(paging_mode_translate(d)) )
2372 MEM_LOG("Cannot mix foreign mappings with translated domains");
2373 okay = 0;
2375 else switch ( domid )
2377 case DOMID_IO:
2378 info->foreign = rcu_lock_domain(dom_io);
2379 break;
2380 case DOMID_XEN:
2381 if (!IS_PRIV(d)) {
2382 MEM_LOG("Cannot set foreign dom");
2383 okay = 0;
2384 break;
2386 info->foreign = rcu_lock_domain(dom_xen);
2387 break;
2388 default:
2389 if ( (e = rcu_lock_domain_by_id(domid)) == NULL )
2391 MEM_LOG("Unknown domain '%u'", domid);
2392 okay = 0;
2393 break;
2395 if ( !IS_PRIV_FOR(d, e) )
2397 MEM_LOG("Cannot set foreign dom");
2398 okay = 0;
2399 rcu_unlock_domain(e);
2400 break;
2402 info->foreign = e;
2403 break;
2406 out:
2407 return okay;
2410 static inline cpumask_t vcpumask_to_pcpumask(
2411 struct domain *d, unsigned long vmask)
2413 unsigned int vcpu_id;
2414 cpumask_t pmask = CPU_MASK_NONE;
2415 struct vcpu *v;
2417 /*
2418 * Callers copy only a single guest-sized longword from the guest.
2419 * This must be wide enough to reference all VCPUs. Worst case is 32 bits.
2420 */
2421 BUILD_BUG_ON(MAX_VIRT_CPUS > 32);
2423 while ( vmask != 0 )
2425 vcpu_id = find_first_set_bit(vmask);
2426 vmask &= ~(1UL << vcpu_id);
2427 if ( (vcpu_id < MAX_VIRT_CPUS) &&
2428 ((v = d->vcpu[vcpu_id]) != NULL) )
2429 cpus_or(pmask, pmask, v->vcpu_dirty_cpumask);
2432 return pmask;
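/*
 * do_mmuext_op(): process a batch of extended MMU operations.  If the
 * batch is preempted, the number of remaining operations is re-encoded
 * with MMU_UPDATE_PREEMPTED and a hypercall continuation is created; the
 * count of completed operations accumulates in *pdone across restarts.
 */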
2435 int do_mmuext_op(
2436 XEN_GUEST_HANDLE(mmuext_op_t) uops,
2437 unsigned int count,
2438 XEN_GUEST_HANDLE(uint) pdone,
2439 unsigned int foreigndom)
2441 struct mmuext_op op;
2442 int rc = 0, i = 0, okay;
2443 unsigned long mfn = 0, gmfn = 0, type;
2444 unsigned int done = 0;
2445 struct page_info *page;
2446 struct vcpu *v = current;
2447 struct domain *d = v->domain;
2449 if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
2451 count &= ~MMU_UPDATE_PREEMPTED;
2452 if ( unlikely(!guest_handle_is_null(pdone)) )
2453 (void)copy_from_guest(&done, pdone, 1);
2455 else
2456 perfc_incr(calls_to_mmuext_op);
2458 if ( unlikely(!guest_handle_okay(uops, count)) )
2460 rc = -EFAULT;
2461 goto out;
2464 if ( !set_foreigndom(foreigndom) )
2466 rc = -ESRCH;
2467 goto out;
2470 for ( i = 0; i < count; i++ )
2472 if ( hypercall_preempt_check() )
2474 rc = -EAGAIN;
2475 break;
2478 if ( unlikely(__copy_from_guest(&op, uops, 1) != 0) )
2480 MEM_LOG("Bad __copy_from_guest");
2481 rc = -EFAULT;
2482 break;
2485 okay = 1;
2486 gmfn = op.arg1.mfn;
2487 mfn = gmfn_to_mfn(FOREIGNDOM, gmfn);
2488 page = mfn_to_page(mfn);
2490 switch ( op.cmd )
2492 case MMUEXT_PIN_L1_TABLE:
2493 type = PGT_l1_page_table;
2494 goto pin_page;
2496 case MMUEXT_PIN_L2_TABLE:
2497 type = PGT_l2_page_table;
2498 goto pin_page;
2500 case MMUEXT_PIN_L3_TABLE:
2501 type = PGT_l3_page_table;
2502 goto pin_page;
2504 case MMUEXT_PIN_L4_TABLE:
2505 if ( is_pv_32bit_domain(FOREIGNDOM) )
2506 break;
2507 type = PGT_l4_page_table;
2509 pin_page:
2510 rc = xsm_memory_pin_page(d, page);
2511 if ( rc )
2512 break;
2514 /* Ignore pinning of invalid paging levels. */
2515 if ( (op.cmd - MMUEXT_PIN_L1_TABLE) > (CONFIG_PAGING_LEVELS - 1) )
2516 break;
2518 if ( paging_mode_refcounts(FOREIGNDOM) )
2519 break;
2521 rc = get_page_and_type_from_pagenr(mfn, type, FOREIGNDOM, 1);
2522 okay = !rc;
2523 if ( unlikely(!okay) )
2525 if ( rc == -EINTR )
2526 rc = -EAGAIN;
2527 else if ( rc != -EAGAIN )
2528 MEM_LOG("Error while pinning mfn %lx", mfn);
2529 break;
2532 if ( unlikely(test_and_set_bit(_PGT_pinned,
2533 &page->u.inuse.type_info)) )
2535 MEM_LOG("Mfn %lx already pinned", mfn);
2536 put_page_and_type(page);
2537 okay = 0;
2538 break;
2541 /* A page is dirtied when its pin status is set. */
2542 paging_mark_dirty(d, mfn);
2544 /* We can race domain destruction (domain_relinquish_resources). */
2545 if ( unlikely(this_cpu(percpu_mm_info).foreign != NULL) )
2547 int drop_ref;
2548 spin_lock(&FOREIGNDOM->page_alloc_lock);
2549 drop_ref = (FOREIGNDOM->is_dying &&
2550 test_and_clear_bit(_PGT_pinned,
2551 &page->u.inuse.type_info));
2552 spin_unlock(&FOREIGNDOM->page_alloc_lock);
2553 if ( drop_ref )
2554 put_page_and_type(page);
2557 break;
2559 case MMUEXT_UNPIN_TABLE:
2560 if ( paging_mode_refcounts(d) )
2561 break;
2563 if ( unlikely(!(okay = get_page_from_pagenr(mfn, d))) )
2565 MEM_LOG("Mfn %lx bad domain (dom=%p)",
2566 mfn, page_get_owner(page));
2568 else if ( likely(test_and_clear_bit(_PGT_pinned,
2569 &page->u.inuse.type_info)) )
2571 put_page_and_type(page);
2572 put_page(page);
2573 if ( !rc )
2575 /* A page is dirtied when its pin status is cleared. */
2576 paging_mark_dirty(d, mfn);
2579 else
2581 okay = 0;
2582 put_page(page);
2583 MEM_LOG("Mfn %lx not pinned", mfn);
2585 break;
2587 case MMUEXT_NEW_BASEPTR:
2588 okay = new_guest_cr3(mfn);
2589 this_cpu(percpu_mm_info).deferred_ops &= ~DOP_FLUSH_TLB;
2590 break;
2592 #ifdef __x86_64__
2593 case MMUEXT_NEW_USER_BASEPTR: {
2594 unsigned long old_mfn;
2596 if ( mfn != 0 )
2598 if ( paging_mode_refcounts(d) )
2599 okay = get_page_from_pagenr(mfn, d);
2600 else
2601 okay = !get_page_and_type_from_pagenr(
2602 mfn, PGT_root_page_table, d, 0);
2603 if ( unlikely(!okay) )
2605 MEM_LOG("Error while installing new mfn %lx", mfn);
2606 break;
2610 old_mfn = pagetable_get_pfn(v->arch.guest_table_user);
2611 v->arch.guest_table_user = pagetable_from_pfn(mfn);
2613 if ( old_mfn != 0 )
2615 if ( paging_mode_refcounts(d) )
2616 put_page(mfn_to_page(old_mfn));
2617 else
2618 put_page_and_type(mfn_to_page(old_mfn));
2621 break;
2623 #endif
2625 case MMUEXT_TLB_FLUSH_LOCAL:
2626 this_cpu(percpu_mm_info).deferred_ops |= DOP_FLUSH_TLB;
2627 break;
2629 case MMUEXT_INVLPG_LOCAL:
2630 if ( !paging_mode_enabled(d)
2631 || paging_invlpg(v, op.arg1.linear_addr) != 0 )
2632 flush_tlb_one_local(op.arg1.linear_addr);
2633 break;
2635 case MMUEXT_TLB_FLUSH_MULTI:
2636 case MMUEXT_INVLPG_MULTI:
2638 unsigned long vmask;
2639 cpumask_t pmask;
2640 if ( unlikely(copy_from_guest(&vmask, op.arg2.vcpumask, 1)) )
2642 okay = 0;
2643 break;
2645 pmask = vcpumask_to_pcpumask(d, vmask);
2646 if ( op.cmd == MMUEXT_TLB_FLUSH_MULTI )
2647 flush_tlb_mask(pmask);
2648 else
2649 flush_tlb_one_mask(pmask, op.arg1.linear_addr);
2650 break;
2653 case MMUEXT_TLB_FLUSH_ALL:
2654 flush_tlb_mask(d->domain_dirty_cpumask);
2655 break;
2657 case MMUEXT_INVLPG_ALL:
2658 flush_tlb_one_mask(d->domain_dirty_cpumask, op.arg1.linear_addr);
2659 break;
2661 case MMUEXT_FLUSH_CACHE:
2662 if ( unlikely(!cache_flush_permitted(d)) )
2664 MEM_LOG("Non-physdev domain tried to FLUSH_CACHE.");
2665 okay = 0;
2667 else
2669 wbinvd();
2671 break;
2673 case MMUEXT_SET_LDT:
2675 unsigned long ptr = op.arg1.linear_addr;
2676 unsigned long ents = op.arg2.nr_ents;
2678 if ( paging_mode_external(d) )
2680 MEM_LOG("ignoring SET_LDT hypercall from external domain");
2681 okay = 0;
2683 else if ( ((ptr & (PAGE_SIZE-1)) != 0) ||
2684 (ents > 8192) ||
2685 !array_access_ok(ptr, ents, LDT_ENTRY_SIZE) )
2687 okay = 0;
2688 MEM_LOG("Bad args to SET_LDT: ptr=%lx, ents=%lx", ptr, ents);
2690 else if ( (v->arch.guest_context.ldt_ents != ents) ||
2691 (v->arch.guest_context.ldt_base != ptr) )
2693 invalidate_shadow_ldt(v);
2694 v->arch.guest_context.ldt_base = ptr;
2695 v->arch.guest_context.ldt_ents = ents;
2696 load_LDT(v);
2697 this_cpu(percpu_mm_info).deferred_ops &= ~DOP_RELOAD_LDT;
2698 if ( ents != 0 )
2699 this_cpu(percpu_mm_info).deferred_ops |= DOP_RELOAD_LDT;
2701 break;
2704 default:
2705 MEM_LOG("Invalid extended pt command 0x%x", op.cmd);
2706 rc = -ENOSYS;
2707 okay = 0;
2708 break;
2711 if ( unlikely(!okay) )
2713 rc = rc ? rc : -EINVAL;
2714 break;
2717 guest_handle_add_offset(uops, 1);
2720 if ( rc == -EAGAIN )
2721 rc = hypercall_create_continuation(
2722 __HYPERVISOR_mmuext_op, "hihi",
2723 uops, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
2725 process_deferred_ops();
2727 perfc_add(num_mmuext_ops, i);
2729 out:
2730 /* Add incremental work we have done to the @done output parameter. */
2731 if ( unlikely(!guest_handle_is_null(pdone)) )
2733 done += i;
2734 copy_to_guest(pdone, &done, 1);
2737 return rc;
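/*
 * do_mmu_update(): process a batch of (ptr, val) update requests.  The
 * command is encoded in the low bits of req.ptr; for example (illustrative
 * only), a normal PTE write passes the guest's machine address of the PTE,
 * with MMU_NORMAL_PT_UPDATE in the low bits, in req.ptr and the new PTE
 * value in req.val.  Updates to page-table pages take a matching type
 * reference and go through mod_l?_entry(); updates to PGT_writable_page
 * frames go through paging_write_guest_entry(); MMU_MACHPHYS_UPDATE
 * rewrites an M2P entry.  Preemption is handled as in do_mmuext_op().
 */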
2740 int do_mmu_update(
2741 XEN_GUEST_HANDLE(mmu_update_t) ureqs,
2742 unsigned int count,
2743 XEN_GUEST_HANDLE(uint) pdone,
2744 unsigned int foreigndom)
2746 struct mmu_update req;
2747 void *va;
2748 unsigned long gpfn, gmfn, mfn;
2749 struct page_info *page;
2750 int rc = 0, okay = 1, i = 0;
2751 unsigned int cmd, done = 0;
2752 struct vcpu *v = current;
2753 struct domain *d = v->domain;
2754 unsigned long type_info;
2755 struct domain_mmap_cache mapcache;
2757 if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
2759 count &= ~MMU_UPDATE_PREEMPTED;
2760 if ( unlikely(!guest_handle_is_null(pdone)) )
2761 (void)copy_from_guest(&done, pdone, 1);
2763 else
2764 perfc_incr(calls_to_mmu_update);
2766 if ( unlikely(!guest_handle_okay(ureqs, count)) )
2768 rc = -EFAULT;
2769 goto out;
2772 if ( !set_foreigndom(foreigndom) )
2774 rc = -ESRCH;
2775 goto out;
2778 domain_mmap_cache_init(&mapcache);
2780 for ( i = 0; i < count; i++ )
2782 if ( hypercall_preempt_check() )
2784 rc = -EAGAIN;
2785 break;
2788 if ( unlikely(__copy_from_guest(&req, ureqs, 1) != 0) )
2790 MEM_LOG("Bad __copy_from_guest");
2791 rc = -EFAULT;
2792 break;
2795 cmd = req.ptr & (sizeof(l1_pgentry_t)-1);
2796 okay = 0;
2798 switch ( cmd )
2800 /*
2801 * MMU_NORMAL_PT_UPDATE: Normal update to any level of page table.
2802 * MMU_PT_UPDATE_PRESERVE_AD: As above, but also preserve (OR in) the
2803 * current A/D bits.
2804 */
2805 case MMU_NORMAL_PT_UPDATE:
2806 case MMU_PT_UPDATE_PRESERVE_AD:
2807 rc = xsm_mmu_normal_update(d, req.val);
2808 if ( rc )
2809 break;
2811 req.ptr -= cmd;
2812 gmfn = req.ptr >> PAGE_SHIFT;
2813 mfn = gmfn_to_mfn(d, gmfn);
2815 if ( unlikely(!get_page_from_pagenr(mfn, d)) )
2817 MEM_LOG("Could not get page for normal update");
2818 break;
2821 va = map_domain_page_with_cache(mfn, &mapcache);
2822 va = (void *)((unsigned long)va +
2823 (unsigned long)(req.ptr & ~PAGE_MASK));
2824 page = mfn_to_page(mfn);
2826 switch ( (type_info = page->u.inuse.type_info) & PGT_type_mask )
2828 case PGT_l1_page_table:
2829 case PGT_l2_page_table:
2830 case PGT_l3_page_table:
2831 case PGT_l4_page_table:
2833 if ( paging_mode_refcounts(d) )
2835 MEM_LOG("mmu update on auto-refcounted domain!");
2836 break;
2839 if ( unlikely(!get_page_type(
2840 page, type_info & (PGT_type_mask|PGT_pae_xen_l2))) )
2841 goto not_a_pt;
2843 switch ( type_info & PGT_type_mask )
2845 case PGT_l1_page_table:
2847 l1_pgentry_t l1e = l1e_from_intpte(req.val);
2848 okay = mod_l1_entry(va, l1e, mfn,
2849 cmd == MMU_PT_UPDATE_PRESERVE_AD);
2851 break;
2852 case PGT_l2_page_table:
2854 l2_pgentry_t l2e = l2e_from_intpte(req.val);
2855 okay = mod_l2_entry(va, l2e, mfn, type_info,
2856 cmd == MMU_PT_UPDATE_PRESERVE_AD);
2858 break;
2859 case PGT_l3_page_table:
2861 l3_pgentry_t l3e = l3e_from_intpte(req.val);
2862 rc = mod_l3_entry(va, l3e, mfn,
2863 cmd == MMU_PT_UPDATE_PRESERVE_AD, 1);
2864 okay = !rc;
2866 break;
2867 #if CONFIG_PAGING_LEVELS >= 4
2868 case PGT_l4_page_table:
2870 l4_pgentry_t l4e = l4e_from_intpte(req.val);
2871 rc = mod_l4_entry(va, l4e, mfn,
2872 cmd == MMU_PT_UPDATE_PRESERVE_AD, 1);
2873 okay = !rc;
2875 break;
2876 #endif
2879 put_page_type(page);
2880 if ( rc == -EINTR )
2881 rc = -EAGAIN;
2883 break;
2885 default:
2886 not_a_pt:
2888 if ( unlikely(!get_page_type(page, PGT_writable_page)) )
2889 break;
2891 perfc_incr(writable_mmu_updates);
2893 okay = paging_write_guest_entry(v, va, req.val, _mfn(mfn));
2895 put_page_type(page);
2897 break;
2900 unmap_domain_page_with_cache(va, &mapcache);
2902 put_page(page);
2903 break;
2905 case MMU_MACHPHYS_UPDATE:
2907 mfn = req.ptr >> PAGE_SHIFT;
2908 gpfn = req.val;
2910 rc = xsm_mmu_machphys_update(d, mfn);
2911 if ( rc )
2912 break;
2914 if ( unlikely(!get_page_from_pagenr(mfn, FOREIGNDOM)) )
2916 MEM_LOG("Could not get page for mach->phys update");
2917 break;
2920 if ( unlikely(paging_mode_translate(FOREIGNDOM)) )
2922 MEM_LOG("Mach-phys update on auto-translate guest");
2923 break;
2926 set_gpfn_from_mfn(mfn, gpfn);
2927 okay = 1;
2929 paging_mark_dirty(FOREIGNDOM, mfn);
2931 put_page(mfn_to_page(mfn));
2932 break;
2934 default:
2935 MEM_LOG("Invalid page update command %x", cmd);
2936 rc = -ENOSYS;
2937 okay = 0;
2938 break;
2941 if ( unlikely(!okay) )
2943 rc = rc ? rc : -EINVAL;
2944 break;
2947 guest_handle_add_offset(ureqs, 1);
2950 if ( rc == -EAGAIN )
2951 rc = hypercall_create_continuation(
2952 __HYPERVISOR_mmu_update, "hihi",
2953 ureqs, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
2955 process_deferred_ops();
2957 domain_mmap_cache_destroy(&mapcache);
2959 perfc_add(num_page_updates, i);
2961 out:
2962 /* Add incremental work we have done to the @done output parameter. */
2963 if ( unlikely(!guest_handle_is_null(pdone)) )
2965 done += i;
2966 copy_to_guest(pdone, &done, 1);
2969 return rc;
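/*
 * Grant-table mapping helpers.  The *_pte_* functions below are used when
 * the grant is mapped by the address of the PTE itself
 * (GNTMAP_contains_pte); the *_va_* functions take a linear address and
 * locate the PTE with guest_map_l1e().  In all cases the L1 entry is
 * updated under page_lock() of the page containing it.
 */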
2973 static int create_grant_pte_mapping(
2974 uint64_t pte_addr, l1_pgentry_t nl1e, struct vcpu *v)
2976 int rc = GNTST_okay;
2977 void *va;
2978 unsigned long gmfn, mfn;
2979 struct page_info *page;
2980 u32 type;
2981 l1_pgentry_t ol1e;
2982 struct domain *d = v->domain;
2984 ASSERT(domain_is_locked(d));
2986 adjust_guest_l1e(nl1e, d);
2988 gmfn = pte_addr >> PAGE_SHIFT;
2989 mfn = gmfn_to_mfn(d, gmfn);
2991 if ( unlikely(!get_page_from_pagenr(mfn, current->domain)) )
2993 MEM_LOG("Could not get page for normal update");
2994 return GNTST_general_error;
2997 va = map_domain_page(mfn);
2998 va = (void *)((unsigned long)va + ((unsigned long)pte_addr & ~PAGE_MASK));
2999 page = mfn_to_page(mfn);
3001 type = page->u.inuse.type_info & PGT_type_mask;
3002 if ( (type != PGT_l1_page_table) || !get_page_type(page, type) )
3004 MEM_LOG("Grant map attempted to update a non-L1 page");
3005 rc = GNTST_general_error;
3006 goto failed;
3009 page_lock(page);
3011 ol1e = *(l1_pgentry_t *)va;
3012 if ( !UPDATE_ENTRY(l1, (l1_pgentry_t *)va, ol1e, nl1e, mfn, v, 0) )
3014 page_unlock(page);
3015 put_page_type(page);
3016 rc = GNTST_general_error;
3017 goto failed;
3020 page_unlock(page);
3022 if ( !paging_mode_refcounts(d) )
3023 put_page_from_l1e(ol1e, d);
3025 put_page_type(page);
3027 failed:
3028 unmap_domain_page(va);
3029 put_page(page);
3031 return rc;
3034 static int destroy_grant_pte_mapping(
3035 uint64_t addr, unsigned long frame, struct domain *d)
3037 int rc = GNTST_okay;
3038 void *va;
3039 unsigned long gmfn, mfn;
3040 struct page_info *page;
3041 u32 type;
3042 l1_pgentry_t ol1e;
3044 gmfn = addr >> PAGE_SHIFT;
3045 mfn = gmfn_to_mfn(d, gmfn);
3047 if ( unlikely(!get_page_from_pagenr(mfn, current->domain)) )
3049 MEM_LOG("Could not get page for normal update");
3050 return GNTST_general_error;
3053 va = map_domain_page(mfn);
3054 va = (void *)((unsigned long)va + ((unsigned long)addr & ~PAGE_MASK));
3055 page = mfn_to_page(mfn);
3057 type = page->u.inuse.type_info & PGT_type_mask;
3058 if ( (type != PGT_l1_page_table) || !get_page_type(page, type) )
3060 MEM_LOG("Grant map attempted to update a non-L1 page");
3061 rc = GNTST_general_error;
3062 goto failed;
3065 page_lock(page);
3067 ol1e = *(l1_pgentry_t *)va;
3069 /* Check that the virtual address supplied is actually mapped to frame. */
3070 if ( unlikely((l1e_get_intpte(ol1e) >> PAGE_SHIFT) != frame) )
3072 page_unlock(page);
3073 MEM_LOG("PTE entry %lx for address %"PRIx64" doesn't match frame %lx",
3074 (unsigned long)l1e_get_intpte(ol1e), addr, frame);
3075 put_page_type(page);
3076 rc = GNTST_general_error;
3077 goto failed;
3080 /* Delete pagetable entry. */
3081 if ( unlikely(!UPDATE_ENTRY
3082 (l1,
3083 (l1_pgentry_t *)va, ol1e, l1e_empty(), mfn,
3084 d->vcpu[0] /* Change if we go to per-vcpu shadows. */,
3085 0)) )
3087 page_unlock(page);
3088 MEM_LOG("Cannot delete PTE entry at %p", va);
3089 put_page_type(page);
3090 rc = GNTST_general_error;
3091 goto failed;
3094 page_unlock(page);
3095 put_page_type(page);
3097 failed:
3098 unmap_domain_page(va);
3099 put_page(page);
3100 return rc;
3104 static int create_grant_va_mapping(
3105 unsigned long va, l1_pgentry_t nl1e, struct vcpu *v)
3107 l1_pgentry_t *pl1e, ol1e;
3108 struct domain *d = v->domain;
3109 unsigned long gl1mfn;
3110 struct page_info *l1pg;
3111 int okay;
3113 ASSERT(domain_is_locked(d));
3115 adjust_guest_l1e(nl1e, d);
3117 pl1e = guest_map_l1e(v, va, &gl1mfn);
3118 if ( !pl1e )
3120 MEM_LOG("Could not find L1 PTE for address %lx", va);
3121 return GNTST_general_error;
3123 l1pg = mfn_to_page(gl1mfn);
3124 page_lock(l1pg);
3125 ol1e = *pl1e;
3126 okay = UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, v, 0);
3127 page_unlock(l1pg);
3128 guest_unmap_l1e(v, pl1e);
3129 pl1e = NULL;
3131 if ( !okay )
3132 return GNTST_general_error;
3134 if ( !paging_mode_refcounts(d) )
3135 put_page_from_l1e(ol1e, d);
3137 return GNTST_okay;
3140 static int replace_grant_va_mapping(
3141 unsigned long addr, unsigned long frame, l1_pgentry_t nl1e, struct vcpu *v)
3143 l1_pgentry_t *pl1e, ol1e;
3144 unsigned long gl1mfn;
3145 struct page_info *l1pg;
3146 int rc = 0;
3148 pl1e = guest_map_l1e(v, addr, &gl1mfn);
3149 if ( !pl1e )
3151 MEM_LOG("Could not find L1 PTE for address %lx", addr);
3152 return GNTST_general_error;
3155 l1pg = mfn_to_page(gl1mfn);
3156 page_lock(l1pg);
3157 ol1e = *pl1e;
3159 /* Check that the virtual address supplied is actually mapped to frame. */
3160 if ( unlikely(l1e_get_pfn(ol1e) != frame) )
3162 page_unlock(l1pg);
3163 MEM_LOG("PTE entry %lx for address %lx doesn't match frame %lx",
3164 l1e_get_pfn(ol1e), addr, frame);
3165 rc = GNTST_general_error;
3166 goto out;
3169 /* Delete pagetable entry. */
3170 if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, v, 0)) )
3172 page_unlock(l1pg);
3173 MEM_LOG("Cannot delete PTE entry at %p", (unsigned long *)pl1e);
3174 rc = GNTST_general_error;
3175 goto out;
3178 page_unlock(l1pg);
3180 out:
3181 guest_unmap_l1e(v, pl1e);
3182 return rc;
3185 static int destroy_grant_va_mapping(
3186 unsigned long addr, unsigned long frame, struct vcpu *v)
3188 return replace_grant_va_mapping(addr, frame, l1e_empty(), v);
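/*
 * create_grant_host_mapping() builds the L1 entry for a granted frame from
 * the requested flags and cache attributes and dispatches to the
 * PTE-address or virtual-address variant above.  replace_grant_host_mapping()
 * removes a grant mapping; when new_addr is given, the L1 entry previously
 * installed at new_addr is cleared there and installed at the original
 * address in place of the grant mapping.
 */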
3191 int create_grant_host_mapping(uint64_t addr, unsigned long frame,
3192 unsigned int flags, unsigned int cache_flags)
3194 l1_pgentry_t pte = l1e_from_pfn(frame, GRANT_PTE_FLAGS);
3196 if ( (flags & GNTMAP_application_map) )
3197 l1e_add_flags(pte,_PAGE_USER);
3198 if ( !(flags & GNTMAP_readonly) )
3199 l1e_add_flags(pte,_PAGE_RW);
3201 l1e_add_flags(pte, cacheattr_to_pte_flags(cache_flags >> 5));
3203 if ( flags & GNTMAP_contains_pte )
3204 return create_grant_pte_mapping(addr, pte, current);
3205 return create_grant_va_mapping(addr, pte, current);
3208 int replace_grant_host_mapping(
3209 uint64_t addr, unsigned long frame, uint64_t new_addr, unsigned int flags)
3211 struct vcpu *curr = current;
3212 l1_pgentry_t *pl1e, ol1e;
3213 unsigned long gl1mfn;
3214 struct page_info *l1pg;
3215 int rc;
3217 if ( flags & GNTMAP_contains_pte )
3219 if ( !new_addr )
3220 return destroy_grant_pte_mapping(addr, frame, curr->domain);
3222 MEM_LOG("Unsupported grant table operation");
3223 return GNTST_general_error;
3226 if ( !new_addr )
3227 return destroy_grant_va_mapping(addr, frame, curr);
3229 pl1e = guest_map_l1e(curr, new_addr, &gl1mfn);
3230 if ( !pl1e )
3232 MEM_LOG("Could not find L1 PTE for address %lx",
3233 (unsigned long)new_addr);
3234 return GNTST_general_error;
3237 l1pg = mfn_to_page(gl1mfn);
3238 page_lock(l1pg);
3239 ol1e = *pl1e;
3241 if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, l1e_empty(),
3242 gl1mfn, curr, 0)) )
3244 page_unlock(l1pg);
3245 MEM_LOG("Cannot delete PTE entry at %p", (unsigned long *)pl1e);
3246 guest_unmap_l1e(curr, pl1e);
3247 return GNTST_general_error;
3250 page_unlock(l1pg);
3251 guest_unmap_l1e(curr, pl1e);
3253 rc = replace_grant_va_mapping(addr, frame, ol1e, curr);
3254 if ( rc && !paging_mode_refcounts(curr->domain) )
3255 put_page_from_l1e(ol1e, curr->domain);
3257 return rc;
3260 int steal_page(
3261 struct domain *d, struct page_info *page, unsigned int memflags)
3263 u32 _d, _nd, x, y;
3265 spin_lock(&d->page_alloc_lock);
3267 /*
3268 * The tricky bit: atomically release ownership while there is just one
3269 * benign reference to the page (PGC_allocated). If that reference
3270 * disappears then the deallocation routine will safely spin.
3271 */
3272 _d = pickle_domptr(d);
3273 _nd = page->u.inuse._domain;
3274 y = page->count_info;
3275 do {
3276 x = y;
3277 if ( unlikely((x & (PGC_count_mask|PGC_allocated)) !=
3278 (1 | PGC_allocated)) || unlikely(_nd != _d) )
3280 MEM_LOG("gnttab_transfer: Bad page %p: ed=%p(%u), sd=%p,"
3281 " caf=%08x, taf=%" PRtype_info "\n",
3282 (void *) page_to_mfn(page),
3283 d, d->domain_id, unpickle_domptr(_nd), x,
3284 page->u.inuse.type_info);
3285 spin_unlock(&d->page_alloc_lock);
3286 return -1;
3288 asm volatile (
3289 LOCK_PREFIX "cmpxchg8b %2"
3290 : "=d" (_nd), "=a" (y),
3291 "=m" (*(volatile u64 *)(&page->count_info))
3292 : "0" (_d), "1" (x), "c" (NULL), "b" (x) );
3293 } while (unlikely(_nd != _d) || unlikely(y != x));
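    /*
     * The cmpxchg8b above compares the 64 bits covering count_info and
     * _domain with {x, _d} and, if they match, stores {x, NULL}: the owner
     * field is cleared while the reference count is left unchanged.
     */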
3295 /*
3296 * Unlink from 'd'. At least one reference remains (now anonymous), so
3297 * no one else is spinning to try to delete this page from 'd'.
3298 */
3299 if ( !(memflags & MEMF_no_refcount) )
3300 d->tot_pages--;
3301 list_del(&page->list);
3303 spin_unlock(&d->page_alloc_lock);
3305 return 0;
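/*
 * do_update_va_mapping(): update the single L1 entry mapping @va via
 * mod_l1_entry(), then perform the flush requested in @flags -- a TLB
 * flush or INVLPG, targeted at the local CPU, all dirty CPUs, or a vcpu
 * bitmap read from guest memory.
 */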
3308 int do_update_va_mapping(unsigned long va, u64 val64,
3309 unsigned long flags)
3311 l1_pgentry_t val = l1e_from_intpte(val64);
3312 struct vcpu *v = current;
3313 struct domain *d = v->domain;
3314 l1_pgentry_t *pl1e;
3315 unsigned long vmask, bmap_ptr, gl1mfn;
3316 cpumask_t pmask;
3317 int rc = 0;
3319 perfc_incr(calls_to_update_va);
3321 if ( unlikely(!access_ok(va, 1) && !paging_mode_external(d)) )
3322 return -EINVAL;
3324 rc = xsm_update_va_mapping(d, val);
3325 if ( rc )
3326 return rc;
3328 pl1e = guest_map_l1e(v, va, &gl1mfn);
3330 if ( unlikely(!pl1e || !mod_l1_entry(pl1e, val, gl1mfn, 0)) )
3331 rc = -EINVAL;
3333 if ( pl1e )
3334 guest_unmap_l1e(v, pl1e);
3335 pl1e = NULL;
3337 process_deferred_ops();
3339 switch ( flags & UVMF_FLUSHTYPE_MASK )
3341 case UVMF_TLB_FLUSH:
3342 switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
3344 case UVMF_LOCAL:
3345 flush_tlb_local();
3346 break;
3347 case UVMF_ALL:
3348 flush_tlb_mask(d->domain_dirty_cpumask);
3349 break;
3350 default:
3351 if ( unlikely(!is_pv_32on64_domain(d) ?
3352 get_user(vmask, (unsigned long *)bmap_ptr) :
3353 get_user(vmask, (unsigned int *)bmap_ptr)) )
3354 rc = -EFAULT;
3355 pmask = vcpumask_to_pcpumask(d, vmask);
3356 flush_tlb_mask(pmask);
3357 break;
3359 break;
3361 case UVMF_INVLPG:
3362 switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
3364 case UVMF_LOCAL:
3365 if ( !paging_mode_enabled(d) ||
3366 (paging_invlpg(v, va) != 0) )
3367 flush_tlb_one_local(va);
3368 break;
3369 case UVMF_ALL:
3370 flush_tlb_one_mask(d->domain_dirty_cpumask, va);
3371 break;
3372 default:
3373 if ( unlikely(!is_pv_32on64_domain(d) ?
3374 get_user(vmask, (unsigned long *)bmap_ptr) :
3375 get_user(vmask, (unsigned int *)bmap_ptr)) )
3376 rc = -EFAULT;
3377 pmask = vcpumask_to_pcpumask(d, vmask);
3378 flush_tlb_one_mask(pmask, va);
3379 break;
3381 break;
3384 return rc;
3387 int do_update_va_mapping_otherdomain(unsigned long va, u64 val64,
3388 unsigned long flags,
3389 domid_t domid)
3391 int rc;
3393 if ( !set_foreigndom(domid) )
3394 return -ESRCH;
3396 rc = do_update_va_mapping(va, val64, flags);
3398 BUG_ON(this_cpu(percpu_mm_info).deferred_ops);
3399 process_deferred_ops(); /* only to clear foreigndom */
3401 return rc;
3406 /*************************
3407 * Descriptor Tables
3408 */
3410 void destroy_gdt(struct vcpu *v)
3412 int i;
3413 unsigned long pfn;
3415 v->arch.guest_context.gdt_ents = 0;
3416 for ( i = 0; i < FIRST_RESERVED_GDT_PAGE; i++ )
3418 if ( (pfn = l1e_get_pfn(v->arch.perdomain_ptes[i])) != 0 )
3419 put_page_and_type(mfn_to_page(pfn));
3420 l1e_write(&v->arch.perdomain_ptes[i], l1e_empty());
3421 v->arch.guest_context.gdt_frames[i] = 0;
3426 long set_gdt(struct vcpu *v,
3427 unsigned long *frames,
3428 unsigned int entries)
3430 struct domain *d = v->domain;
3431 /* NB. There are 512 8-byte entries per GDT page. */
3432 int i, nr_pages = (entries + 511) / 512;
3433 unsigned long mfn;
3435 if ( entries > FIRST_RESERVED_GDT_ENTRY )
3436 return -EINVAL;
3438 /* Check the pages in the new GDT. */
3439 for ( i = 0; i < nr_pages; i++ )
3441 mfn = frames[i] = gmfn_to_mfn(d, frames[i]);
3442 if ( !mfn_valid(mfn) ||
3443 !get_page_and_type(mfn_to_page(mfn), d, PGT_seg_desc_page) )
3444 goto fail;
3447 /* Tear down the old GDT. */
3448 destroy_gdt(v);
3450 /* Install the new GDT. */
3451 v->arch.guest_context.gdt_ents = entries;
3452 for ( i = 0; i < nr_pages; i++ )
3454 v->arch.guest_context.gdt_frames[i] = frames[i];
3455 l1e_write(&v->arch.perdomain_ptes[i],
3456 l1e_from_pfn(frames[i], __PAGE_HYPERVISOR));
3459 return 0;
3461 fail:
3462 while ( i-- > 0 )
3463 put_page_and_type(mfn_to_page(frames[i]));
3464 return -EINVAL;
3468 long do_set_gdt(XEN_GUEST_HANDLE(ulong) frame_list, unsigned int entries)
3470 int nr_pages = (entries + 511) / 512;
3471 unsigned long frames[16];
3472 struct vcpu *curr = current;
3473 long ret;
3475 /* Rechecked in set_gdt, but ensures a sane limit for copy_from_user(). */
3476 if ( entries > FIRST_RESERVED_GDT_ENTRY )
3477 return -EINVAL;
3479 if ( copy_from_guest(frames, frame_list, nr_pages) )
3480 return -EFAULT;
3482 domain_lock(curr->domain);
3484 if ( (ret = set_gdt(curr, frames, entries)) == 0 )
3485 flush_tlb_local();
3487 domain_unlock(curr->domain);
3489 return ret;
3493 long do_update_descriptor(u64 pa, u64 desc)
3495 struct domain *dom = current->domain;
3496 unsigned long gmfn = pa >> PAGE_SHIFT;
3497 unsigned long mfn;
3498 unsigned int offset;
3499 struct desc_struct *gdt_pent, d;
3500 struct page_info *page;
3501 long ret = -EINVAL;
3503 offset = ((unsigned int)pa & ~PAGE_MASK) / sizeof(struct desc_struct);
3505 *(u64 *)&d = desc;
3507 mfn = gmfn_to_mfn(dom, gmfn);
3508 if ( (((unsigned int)pa % sizeof(struct desc_struct)) != 0) ||
3509 !mfn_valid(mfn) ||
3510 !check_descriptor(dom, &d) )
3511 return -EINVAL;
3513 page = mfn_to_page(mfn);
3514 if ( unlikely(!get_page(page, dom)) )
3515 return -EINVAL;
3517 /* Check if the given frame is in use in an unsafe context. */
3518 switch ( page->u.inuse.type_info & PGT_type_mask )
3520 case PGT_seg_desc_page:
3521 if ( unlikely(!get_page_type(page, PGT_seg_desc_page)) )
3522 goto out;
3523 break;
3524 default:
3525 if ( unlikely(!get_page_type(page, PGT_writable_page)) )
3526 goto out;
3527 break;
3530 paging_mark_dirty(dom, mfn);
3532 /* All is good so make the update. */
3533 gdt_pent = map_domain_page(mfn);
3534 atomic_write64((uint64_t *)&gdt_pent[offset], *(uint64_t *)&d);
3535 unmap_domain_page(gdt_pent);
3537 put_page_type(page);
3539 ret = 0; /* success */
3541 out:
3542 put_page(page);
3544 return ret;
3547 typedef struct e820entry e820entry_t;
3548 DEFINE_XEN_GUEST_HANDLE(e820entry_t);
3550 long arch_memory_op(int op, XEN_GUEST_HANDLE(void) arg)
3552 struct page_info *page = NULL;
3553 switch ( op )
3555 case XENMEM_add_to_physmap:
3557 struct xen_add_to_physmap xatp;
3558 unsigned long prev_mfn, mfn = 0, gpfn;
3559 struct domain *d;
3561 if ( copy_from_guest(&xatp, arg, 1) )
3562 return -EFAULT;
3564 if ( xatp.domid == DOMID_SELF )
3566 d = rcu_lock_current_domain();
3568 else
3570 if ( (d = rcu_lock_domain_by_id(xatp.domid)) == NULL )
3571 return -ESRCH;
3572 if ( !IS_PRIV_FOR(current->domain, d) )
3574 rcu_unlock_domain(d);
3575 return -EPERM;
3579 if ( xsm_add_to_physmap(current->domain, d) )
3581 rcu_unlock_domain(d);
3582 return -EPERM;
3585 switch ( xatp.space )
3587 case XENMAPSPACE_shared_info:
3588 if ( xatp.idx == 0 )
3589 mfn = virt_to_mfn(d->shared_info);
3590 break;
3591 case XENMAPSPACE_grant_table:
3592 spin_lock(&d->grant_table->lock);
3594 if ( (xatp.idx >= nr_grant_frames(d->grant_table)) &&
3595 (xatp.idx < max_nr_grant_frames) )
3596 gnttab_grow_table(d, xatp.idx + 1);
3598 if ( xatp.idx < nr_grant_frames(d->grant_table) )
3599 mfn = virt_to_mfn(d->grant_table->shared[xatp.idx]);
3601 spin_unlock(&d->grant_table->lock);
3602 break;
3603 case XENMAPSPACE_mfn:
3605 if ( get_page_from_pagenr(xatp.idx, d) ) {
3606 mfn = xatp.idx;
3607 page = mfn_to_page(mfn);
3609 break;
3611 default:
3612 break;
3615 if ( !paging_mode_translate(d) || (mfn == 0) )
3617 if ( page )
3618 put_page(page);
3619 rcu_unlock_domain(d);
3620 return -EINVAL;
3623 domain_lock(d);
3625 /* Remove previously mapped page if it was present. */
3626 prev_mfn = gmfn_to_mfn(d, xatp.gpfn);
3627 if ( mfn_valid(prev_mfn) )
3629 if ( is_xen_heap_mfn(prev_mfn) )
3630 /* Xen heap frames are simply unhooked from this phys slot. */
3631 guest_physmap_remove_page(d, xatp.gpfn, prev_mfn, 0);
3632 else
3633 /* Normal domain memory is freed, to avoid leaking memory. */
3634 guest_remove_page(d, xatp.gpfn);
3637 /* Unmap from old location, if any. */
3638 gpfn = get_gpfn_from_mfn(mfn);
3639 if ( gpfn != INVALID_M2P_ENTRY )
3640 guest_physmap_remove_page(d, gpfn, mfn, 0);
3642 /* Map at new location. */
3643 guest_physmap_add_page(d, xatp.gpfn, mfn, 0);
3645 domain_unlock(d);
3647 if ( page )
3648 put_page(page);
3650 rcu_unlock_domain(d);
3652 break;
3655 case XENMEM_remove_from_physmap:
3657 struct xen_remove_from_physmap xrfp;
3658 unsigned long mfn;
3659 struct domain *d;
3661 if ( copy_from_guest(&xrfp, arg, 1) )
3662 return -EFAULT;
3664 if ( xrfp.domid == DOMID_SELF )
3666 d = rcu_lock_current_domain();
3668 else
3670 if ( (d = rcu_lock_domain_by_id(xrfp.domid)) == NULL )
3671 return -ESRCH;
3672 if ( !IS_PRIV_FOR(current->domain, d) )
3674 rcu_unlock_domain(d);
3675 return -EPERM;
3679 if ( xsm_remove_from_physmap(current->domain, d) )
3681 rcu_unlock_domain(d);
3682 return -EPERM;
3685 domain_lock(d);
3687 mfn = gmfn_to_mfn(d, xrfp.gpfn);
3689 if ( mfn_valid(mfn) )
3690 guest_physmap_remove_page(d, xrfp.gpfn, mfn, 0);
3692 domain_unlock(d);
3694 rcu_unlock_domain(d);
3696 break;
3699 case XENMEM_set_memory_map:
3701 struct xen_foreign_memory_map fmap;
3702 struct domain *d;
3703 int rc;
3705 if ( copy_from_guest(&fmap, arg, 1) )
3706 return -EFAULT;
3708 if ( fmap.map.nr_entries > ARRAY_SIZE(d->arch.e820) )
3709 return -EINVAL;
3711 if ( fmap.domid == DOMID_SELF )
3713 d = rcu_lock_current_domain();
3715 else
3717 if ( (d = rcu_lock_domain_by_id(fmap.domid)) == NULL )
3718 return -ESRCH;
3719 if ( !IS_PRIV_FOR(current->domain, d) )
3721 rcu_unlock_domain(d);
3722 return -EPERM;
3726 rc = xsm_domain_memory_map(d);
3727 if ( rc )
3729 rcu_unlock_domain(d);
3730 return rc;
3733 rc = copy_from_guest(d->arch.e820, fmap.map.buffer,
3734 fmap.map.nr_entries) ? -EFAULT : 0;
3735 d->arch.nr_e820 = fmap.map.nr_entries;
3737 rcu_unlock_domain(d);
3738 return rc;
3741 case XENMEM_memory_map:
3743 struct xen_memory_map map;
3744 struct domain *d = current->domain;
3746 /* Backwards compatibility. */
3747 if ( d->arch.nr_e820 == 0 )
3748 return -ENOSYS;
3750 if ( copy_from_guest(&map, arg, 1) )
3751 return -EFAULT;
3753 map.nr_entries = min(map.nr_entries, d->arch.nr_e820);
3754 if ( copy_to_guest(map.buffer, d->arch.e820, map.nr_entries) ||
3755 copy_to_guest(arg, &map, 1) )
3756 return -EFAULT;
3758 return 0;
3761 case XENMEM_machine_memory_map:
3763 struct xen_memory_map memmap;
3764 XEN_GUEST_HANDLE(e820entry_t) buffer;
3765 int count;
3766 int rc;
3768 if ( !IS_PRIV(current->domain) )
3769 return -EINVAL;
3771 rc = xsm_machine_memory_map();
3772 if ( rc )
3773 return rc;
3775 if ( copy_from_guest(&memmap, arg, 1) )
3776 return -EFAULT;
3777 if ( memmap.nr_entries < e820.nr_map + 1 )
3778 return -EINVAL;
3780 buffer = guest_handle_cast(memmap.buffer, e820entry_t);
3782 count = min((unsigned int)e820.nr_map, memmap.nr_entries);
3783 if ( copy_to_guest(buffer, e820.map, count) < 0 )
3784 return -EFAULT;
3786 memmap.nr_entries = count;
3788 if ( copy_to_guest(arg, &memmap, 1) )
3789 return -EFAULT;
3791 return 0;
3794 case XENMEM_machphys_mapping:
3796 static const struct xen_machphys_mapping mapping = {
3797 .v_start = MACH2PHYS_VIRT_START,
3798 .v_end = MACH2PHYS_VIRT_END,
3799 .max_mfn = MACH2PHYS_NR_ENTRIES - 1
3800 };
3802 if ( copy_to_guest(arg, &mapping, 1) )
3803 return -EFAULT;
3805 return 0;
3808 default:
3809 return subarch_memory_op(op, arg);
3812 return 0;
3816 /*************************
3817 * Writable Pagetables
3818 */
3820 struct ptwr_emulate_ctxt {
3821 struct x86_emulate_ctxt ctxt;
3822 unsigned long cr2;
3823 l1_pgentry_t pte;
3824 };
3826 static int ptwr_emulated_read(
3827 enum x86_segment seg,
3828 unsigned long offset,
3829 void *p_data,
3830 unsigned int bytes,
3831 struct x86_emulate_ctxt *ctxt)
3833 unsigned int rc;
3834 unsigned long addr = offset;
3836 if ( (rc = copy_from_user(p_data, (void *)addr, bytes)) != 0 )
3838 propagate_page_fault(addr + bytes - rc, 0); /* read fault */
3839 return X86EMUL_EXCEPTION;
3842 return X86EMUL_OKAY;
3845 static int ptwr_emulated_update(
3846 unsigned long addr,
3847 paddr_t old,
3848 paddr_t val,
3849 unsigned int bytes,
3850 unsigned int do_cmpxchg,
3851 struct ptwr_emulate_ctxt *ptwr_ctxt)
3853 unsigned long mfn;
3854 unsigned long unaligned_addr = addr;
3855 struct page_info *page;
3856 l1_pgentry_t pte, ol1e, nl1e, *pl1e;
3857 struct vcpu *v = current;
3858 struct domain *d = v->domain;
3860 /* Only allow naturally-aligned stores within the original %cr2 page. */
3861 if ( unlikely(((addr^ptwr_ctxt->cr2) & PAGE_MASK) || (addr & (bytes-1))) )
3863 MEM_LOG("ptwr_emulate: bad access (cr2=%lx, addr=%lx, bytes=%u)",
3864 ptwr_ctxt->cr2, addr, bytes);
3865 return X86EMUL_UNHANDLEABLE;
3868 /* Turn a sub-word access into a full-word access. */
3869 if ( bytes != sizeof(paddr_t) )
3871 paddr_t full;
3872 unsigned int rc, offset = addr & (sizeof(paddr_t)-1);
3874 /* Align address; read full word. */
3875 addr &= ~(sizeof(paddr_t)-1);
3876 if ( (rc = copy_from_user(&full, (void *)addr, sizeof(paddr_t))) != 0 )
3878 propagate_page_fault(addr+sizeof(paddr_t)-rc, 0); /* read fault */
3879 return X86EMUL_EXCEPTION;
3881 /* Mask out bits provided by caller. */
3882 full &= ~((((paddr_t)1 << (bytes*8)) - 1) << (offset*8));
3883 /* Shift the caller value and OR in the missing bits. */
3884 val &= (((paddr_t)1 << (bytes*8)) - 1);
3885 val <<= (offset)*8;
3886 val |= full;
3887 /* Also fill in missing parts of the cmpxchg old value. */
3888 old &= (((paddr_t)1 << (bytes*8)) - 1);
3889 old <<= (offset)*8;
3890 old |= full;
3893 pte = ptwr_ctxt->pte;
3894 mfn = l1e_get_pfn(pte);
3895 page = mfn_to_page(mfn);
3897 /* We are looking only for read-only mappings of p.t. pages. */
3898 ASSERT((l1e_get_flags(pte) & (_PAGE_RW|_PAGE_PRESENT)) == _PAGE_PRESENT);
3899 ASSERT(mfn_valid(mfn));
3900 ASSERT((page->u.inuse.type_info & PGT_type_mask) == PGT_l1_page_table);
3901 ASSERT((page->u.inuse.type_info & PGT_count_mask) != 0);
3902 ASSERT(page_get_owner(page) == d);
3904 /* Check the new PTE. */
3905 nl1e = l1e_from_intpte(val);
3906 if ( unlikely(!get_page_from_l1e(nl1e, d)) )
3908 if ( is_pv_32bit_domain(d) && (bytes == 4) && (unaligned_addr & 4) &&
3909 !do_cmpxchg && (l1e_get_flags(nl1e) & _PAGE_PRESENT) )
3911 /*
3912 * If this is an upper-half write to a PAE PTE then we assume that
3913 * the guest has simply got the two writes the wrong way round. We
3914 * zap the PRESENT bit on the assumption that the bottom half will
3915 * be written immediately after we return to the guest.
3916 */
3917 MEM_LOG("ptwr_emulate: fixing up invalid PAE PTE %"PRIpte,
3918 l1e_get_intpte(nl1e));
3919 l1e_remove_flags(nl1e, _PAGE_PRESENT);
3921 else
3923 MEM_LOG("ptwr_emulate: could not get_page_from_l1e()");
3924 return X86EMUL_UNHANDLEABLE;
3928 adjust_guest_l1e(nl1e, d);
3930 /* Checked successfully: do the update (write or cmpxchg). */
3931 pl1e = map_domain_page(mfn);
3932 pl1e = (l1_pgentry_t *)((unsigned long)pl1e + (addr & ~PAGE_MASK));
3933 if ( do_cmpxchg )
3935 int okay;
3936 intpte_t t = old;
3937 ol1e = l1e_from_intpte(old);
3939 okay = paging_cmpxchg_guest_entry(v, &l1e_get_intpte(*pl1e),
3940 &t, l1e_get_intpte(nl1e), _mfn(mfn));
3941 okay = (okay && t == old);
3943 if ( !okay )
3945 unmap_domain_page(pl1e);
3946 put_page_from_l1e(nl1e, d);
3947 return X86EMUL_CMPXCHG_FAILED;
3950 else
3952 ol1e = *pl1e;
3953 if ( !UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, mfn, v, 0) )
3954 BUG();
3957 trace_ptwr_emulation(addr, nl1e);
3959 unmap_domain_page(pl1e);
3961 /* Finally, drop the old PTE. */
3962 put_page_from_l1e(ol1e, d);
3964 return X86EMUL_OKAY;
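/*
 * ptwr_emulated_write()/ptwr_emulated_cmpxchg() below are thin x86_emulate
 * callbacks: they sanity-check the access size, pack the bytes into a
 * paddr_t and hand the update to ptwr_emulated_update() above.
 */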
3967 static int ptwr_emulated_write(
3968 enum x86_segment seg,
3969 unsigned long offset,
3970 void *p_data,
3971 unsigned int bytes,
3972 struct x86_emulate_ctxt *ctxt)
3974 paddr_t val = 0;
3976 if ( (bytes > sizeof(paddr_t)) || (bytes & (bytes -1)) )
3978 MEM_LOG("ptwr_emulate: bad write size (addr=%lx, bytes=%u)",
3979 offset, bytes);
3980 return X86EMUL_UNHANDLEABLE;
3983 memcpy(&val, p_data, bytes);
3985 return ptwr_emulated_update(
3986 offset, 0, val, bytes, 0,
3987 container_of(ctxt, struct ptwr_emulate_ctxt, ctxt));
3990 static int ptwr_emulated_cmpxchg(
3991 enum x86_segment seg,
3992 unsigned long offset,
3993 void *p_old,
3994 void *p_new,
3995 unsigned int bytes,
3996 struct x86_emulate_ctxt *ctxt)
3998 paddr_t old = 0, new = 0;
4000 if ( (bytes > sizeof(paddr_t)) || (bytes & (bytes -1)) )
4002 MEM_LOG("ptwr_emulate: bad cmpxchg size (addr=%lx, bytes=%u)",
4003 offset, bytes);
4004 return X86EMUL_UNHANDLEABLE;
4007 memcpy(&old, p_old, bytes);
4008 memcpy(&new, p_new, bytes);
4010 return ptwr_emulated_update(
4011 offset, old, new, bytes, 1,
4012 container_of(ctxt, struct ptwr_emulate_ctxt, ctxt));
4015 static struct x86_emulate_ops ptwr_emulate_ops = {
4016 .read = ptwr_emulated_read,
4017 .insn_fetch = ptwr_emulated_read,
4018 .write = ptwr_emulated_write,
4019 .cmpxchg = ptwr_emulated_cmpxchg,
4020 };
4022 /* Write page fault handler: check if guest is trying to modify a PTE. */
4023 int ptwr_do_page_fault(struct vcpu *v, unsigned long addr,
4024 struct cpu_user_regs *regs)
4026 struct domain *d = v->domain;
4027 struct page_info *page;
4028 l1_pgentry_t pte;
4029 struct ptwr_emulate_ctxt ptwr_ctxt;
4030 int rc;
4032 /* Attempt to read the PTE that maps the VA being accessed. */
4033 guest_get_eff_l1e(v, addr, &pte);
4034 page = l1e_get_page(pte);
4036 /* We are looking only for read-only mappings of p.t. pages. */
4037 if ( ((l1e_get_flags(pte) & (_PAGE_PRESENT|_PAGE_RW)) != _PAGE_PRESENT) ||
4038 !mfn_valid(l1e_get_pfn(pte)) ||
4039 ((page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table) ||
4040 ((page->u.inuse.type_info & PGT_count_mask) == 0) ||
4041 (page_get_owner(page) != d) )
4042 goto bail;
4044 ptwr_ctxt.ctxt.regs = regs;
4045 ptwr_ctxt.ctxt.force_writeback = 0;
4046 ptwr_ctxt.ctxt.addr_size = ptwr_ctxt.ctxt.sp_size =
4047 is_pv_32on64_domain(d) ? 32 : BITS_PER_LONG;
4048 ptwr_ctxt.cr2 = addr;
4049 ptwr_ctxt.pte = pte;
4051 page_lock(page);
4052 rc = x86_emulate(&ptwr_ctxt.ctxt, &ptwr_emulate_ops);
4053 page_unlock(page);
4054 if ( rc == X86EMUL_UNHANDLEABLE )
4055 goto bail;
4057 perfc_incr(ptwr_emulations);
4058 return EXCRET_fault_fixed;
4060 bail:
4061 return 0;
4064 void free_xen_pagetable(void *v)
4066 extern int early_boot;
4068 if ( early_boot )
4069 return;
4071 if ( is_xen_heap_page(virt_to_page(v)) )
4072 free_xenheap_page(v);
4073 else
4074 free_domheap_page(virt_to_page(v));
4077 /* Convert to/from superpage-mapping flags for map_pages_to_xen(). */
4078 #define l1f_to_lNf(f) (((f) & _PAGE_PRESENT) ? ((f) | _PAGE_PSE) : (f))
4079 #define lNf_to_l1f(f) (((f) & _PAGE_PRESENT) ? ((f) & ~_PAGE_PSE) : (f))
4081 /*
4082 * map_pages_to_xen() can be called with interrupts disabled:
4083 * * During early bootstrap; or
4084 * * alloc_xenheap_pages() via memguard_guard_range
4085 * In these cases it is safe to use flush_area_local():
4086 * * Because only the local CPU is online; or
4087 * * Because stale TLB entries do not matter for memguard_[un]guard_range().
4088 */
4089 #define flush_area(v,f) (!local_irq_is_enabled() ? \
4090 flush_area_local((const void *)v, f) : \
4091 flush_area_all((const void *)v, f))
4093 int map_pages_to_xen(
4094 unsigned long virt,
4095 unsigned long mfn,
4096 unsigned long nr_mfns,
4097 unsigned int flags)
4099 l2_pgentry_t *pl2e, ol2e;
4100 l1_pgentry_t *pl1e, ol1e;
4101 unsigned int i;
4103 while ( nr_mfns != 0 )
4105 #ifdef __x86_64__
4106 l3_pgentry_t *pl3e = virt_to_xen_l3e(virt);
4107 l3_pgentry_t ol3e = *pl3e;
4109 if ( cpu_has_page1gb &&
4110 !(((virt >> PAGE_SHIFT) | mfn) &
4111 ((1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1)) &&
4112 nr_mfns >= (1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) &&
4113 !(flags & (_PAGE_PAT | MAP_SMALL_PAGES)) )
4115 /* 1GB-page mapping. */
4116 l3e_write_atomic(pl3e, l3e_from_pfn(mfn, l1f_to_lNf(flags)));
4118 if ( (l3e_get_flags(ol3e) & _PAGE_PRESENT) )
4120 unsigned int flush_flags =
4121 FLUSH_TLB | FLUSH_ORDER(2 * PAGETABLE_ORDER);
4123 if ( l3e_get_flags(ol3e) & _PAGE_PSE )
4125 if ( l3e_get_flags(ol3e) & _PAGE_GLOBAL )
4126 flush_flags |= FLUSH_TLB_GLOBAL;
4127 if ( (l1f_to_lNf(l3e_get_flags(ol3e)) ^ flags) &
4128 PAGE_CACHE_ATTRS )
4129 flush_flags |= FLUSH_CACHE;
4130 flush_area(virt, flush_flags);
4132 else
4134 pl2e = l3e_to_l2e(ol3e);
4135 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
4137 ol2e = pl2e[i];
4138 if ( !(l2e_get_flags(ol2e) & _PAGE_PRESENT) )
4139 continue;
4140 if ( l2e_get_flags(ol2e) & _PAGE_PSE )
4142 if ( l2e_get_flags(ol2e) & _PAGE_GLOBAL )
4143 flush_flags |= FLUSH_TLB_GLOBAL;
4144 if ( (lNf_to_l1f(l2e_get_flags(ol2e)) ^ flags) &
4145 PAGE_CACHE_ATTRS )
4146 flush_flags |= FLUSH_CACHE;
4148 else
4150 unsigned int j;
4152 pl1e = l2e_to_l1e(ol2e);
4153 for ( j = 0; j < L1_PAGETABLE_ENTRIES; j++ )
4155 ol1e = pl1e[j];
4156 if ( l1e_get_flags(ol1e) & _PAGE_GLOBAL )
4157 flush_flags |= FLUSH_TLB_GLOBAL;
4158 if ( (l1e_get_flags(ol1e) ^ flags) &
4159 PAGE_CACHE_ATTRS )
4160 flush_flags |= FLUSH_CACHE;
4164 flush_area(virt, flush_flags);
4165 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
4167 ol2e = pl2e[i];
4168 if ( (l2e_get_flags(ol2e) & _PAGE_PRESENT) &&
4169 !(l2e_get_flags(ol2e) & _PAGE_PSE) )
4170 free_xen_pagetable(l2e_to_l1e(ol2e));
4172 free_xen_pagetable(pl2e);
4176 virt += 1UL << L3_PAGETABLE_SHIFT;
4177 mfn += 1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT);
4178 nr_mfns -= 1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT);
4179 continue;
4182 if ( (l3e_get_flags(ol3e) & _PAGE_PRESENT) &&
4183 (l3e_get_flags(ol3e) & _PAGE_PSE) )
4185 unsigned int flush_flags =
4186 FLUSH_TLB | FLUSH_ORDER(2 * PAGETABLE_ORDER);
4188 /* Skip this PTE if there is no change. */
4189 if ( ((l3e_get_pfn(ol3e) & ~(L2_PAGETABLE_ENTRIES *
4190 L1_PAGETABLE_ENTRIES - 1)) +
4191 (l2_table_offset(virt) << PAGETABLE_ORDER) +
4192 l1_table_offset(virt) == mfn) &&
4193 ((lNf_to_l1f(l3e_get_flags(ol3e)) ^ flags) &
4194 ~(_PAGE_ACCESSED|_PAGE_DIRTY)) == 0 )
4196 /* We can skip to end of L3 superpage if we got a match. */
4197 i = (1 << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) -
4198 (mfn & ((1 << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1));
4199 if ( i > nr_mfns )
4200 i = nr_mfns;
4201 virt += i << PAGE_SHIFT;
4202 mfn += i;
4203 nr_mfns -= i;
4204 continue;
4207 pl2e = alloc_xen_pagetable();
4208 if ( pl2e == NULL )
4209 return -ENOMEM;
4211 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
4212 l2e_write(pl2e + i,
4213 l2e_from_pfn(l3e_get_pfn(ol3e) +
4214 (i << PAGETABLE_ORDER),
4215 l3e_get_flags(ol3e)));
4217 if ( l3e_get_flags(ol3e) & _PAGE_GLOBAL )
4218 flush_flags |= FLUSH_TLB_GLOBAL;
4220 l3e_write_atomic(pl3e, l3e_from_pfn(virt_to_mfn(pl2e),
4221 __PAGE_HYPERVISOR));
4222 flush_area(virt, flush_flags);
4224 #endif
4226 pl2e = virt_to_xen_l2e(virt);
4228 if ( ((((virt>>PAGE_SHIFT) | mfn) & ((1<<PAGETABLE_ORDER)-1)) == 0) &&
4229 (nr_mfns >= (1<<PAGETABLE_ORDER)) &&
4230 !(flags & (_PAGE_PAT|MAP_SMALL_PAGES)) )
4232 /* Super-page mapping. */
4233 ol2e = *pl2e;
4234 l2e_write_atomic(pl2e, l2e_from_pfn(mfn, l1f_to_lNf(flags)));
4236 if ( (l2e_get_flags(ol2e) & _PAGE_PRESENT) )
4238 unsigned int flush_flags =
4239 FLUSH_TLB | FLUSH_ORDER(PAGETABLE_ORDER);
4241 if ( l2e_get_flags(ol2e) & _PAGE_PSE )
4243 if ( l2e_get_flags(ol2e) & _PAGE_GLOBAL )
4244 flush_flags |= FLUSH_TLB_GLOBAL;
4245 if ( (lNf_to_l1f(l2e_get_flags(ol2e)) ^ flags) &
4246 PAGE_CACHE_ATTRS )
4247 flush_flags |= FLUSH_CACHE;
4248 flush_area(virt, flush_flags);
4250 else
4252 pl1e = l2e_to_l1e(ol2e);
4253 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
4255 if ( l1e_get_flags(pl1e[i]) & _PAGE_GLOBAL )
4256 flush_flags |= FLUSH_TLB_GLOBAL;
4257 if ( (l1e_get_flags(pl1e[i]) ^ flags) &
4258 PAGE_CACHE_ATTRS )
4259 flush_flags |= FLUSH_CACHE;
4261 flush_area(virt, flush_flags);
4262 free_xen_pagetable(pl1e);
4266 virt += 1UL << L2_PAGETABLE_SHIFT;
4267 mfn += 1UL << PAGETABLE_ORDER;
4268 nr_mfns -= 1UL << PAGETABLE_ORDER;
4270 else
4272 /* Normal page mapping. */
4273 if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) )
4275 pl1e = alloc_xen_pagetable();
4276 if ( pl1e == NULL )
4277 return -ENOMEM;
4278 clear_page(pl1e);
4279 l2e_write(pl2e, l2e_from_pfn(virt_to_mfn(pl1e),
4280 __PAGE_HYPERVISOR));
4282 else if ( l2e_get_flags(*pl2e) & _PAGE_PSE )
4284 unsigned int flush_flags =
4285 FLUSH_TLB | FLUSH_ORDER(PAGETABLE_ORDER);
4287 /* Skip this PTE if there is no change. */
4288 if ( (((l2e_get_pfn(*pl2e) & ~(L1_PAGETABLE_ENTRIES - 1)) +
4289 l1_table_offset(virt)) == mfn) &&
4290 (((lNf_to_l1f(l2e_get_flags(*pl2e)) ^ flags) &
4291 ~(_PAGE_ACCESSED|_PAGE_DIRTY)) == 0) )
4293 /* We can skip to end of L2 superpage if we got a match. */
4294 i = (1 << (L2_PAGETABLE_SHIFT - PAGE_SHIFT)) -
4295 (mfn & ((1 << (L2_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1));
4296 if ( i > nr_mfns )
4297 i = nr_mfns;
4298 virt += i << L1_PAGETABLE_SHIFT;
4299 mfn += i;
4300 nr_mfns -= i;
4301 goto check_l3;
4304 pl1e = alloc_xen_pagetable();
4305 if ( pl1e == NULL )
4306 return -ENOMEM;
4308 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
4309 l1e_write(&pl1e[i],
4310 l1e_from_pfn(l2e_get_pfn(*pl2e) + i,
4311 lNf_to_l1f(l2e_get_flags(*pl2e))));
4313 if ( l2e_get_flags(*pl2e) & _PAGE_GLOBAL )
4314 flush_flags |= FLUSH_TLB_GLOBAL;
4316 l2e_write_atomic(pl2e, l2e_from_pfn(virt_to_mfn(pl1e),
4317 __PAGE_HYPERVISOR));
4318 flush_area(virt, flush_flags);
4321 pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(virt);
4322 ol1e = *pl1e;
4323 l1e_write_atomic(pl1e, l1e_from_pfn(mfn, flags));
4324 if ( (l1e_get_flags(ol1e) & _PAGE_PRESENT) )
4326 unsigned int flush_flags = FLUSH_TLB | FLUSH_ORDER(0);
4327 if ( l1e_get_flags(ol1e) & _PAGE_GLOBAL )
4328 flush_flags |= FLUSH_TLB_GLOBAL;
4329 if ( (l1e_get_flags(ol1e) ^ flags) & PAGE_CACHE_ATTRS )
4330 flush_flags |= FLUSH_CACHE;
4331 flush_area(virt, flush_flags);
4334 virt += 1UL << L1_PAGETABLE_SHIFT;
4335 mfn += 1UL;
4336 nr_mfns -= 1UL;
4338 if ( (flags == PAGE_HYPERVISOR) &&
4339 ((nr_mfns == 0) ||
4340 ((((virt >> PAGE_SHIFT) | mfn) &
4341 ((1 << PAGETABLE_ORDER) - 1)) == 0)) )
4343 unsigned long base_mfn;
4344 pl1e = l2e_to_l1e(*pl2e);
4345 base_mfn = l1e_get_pfn(*pl1e) & ~(L1_PAGETABLE_ENTRIES - 1);
4346 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++, pl1e++ )
4347 if ( (l1e_get_pfn(*pl1e) != (base_mfn + i)) ||
4348 (l1e_get_flags(*pl1e) != flags) )
4349 break;
4350 if ( i == L1_PAGETABLE_ENTRIES )
4352 ol2e = *pl2e;
4353 l2e_write_atomic(pl2e, l2e_from_pfn(base_mfn,
4354 l1f_to_lNf(flags)));
4355 flush_area(virt - PAGE_SIZE,
4356 FLUSH_TLB_GLOBAL |
4357 FLUSH_ORDER(PAGETABLE_ORDER));
4358 free_xen_pagetable(l2e_to_l1e(ol2e));
 check_l3: ;
#ifdef __x86_64__
        if ( cpu_has_page1gb &&
             (flags == PAGE_HYPERVISOR) &&
             ((nr_mfns == 0) ||
              !(((virt >> PAGE_SHIFT) | mfn) &
                ((1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1))) )
        {
            unsigned long base_mfn;

            ol3e = *pl3e;
            pl2e = l3e_to_l2e(ol3e);
            base_mfn = l2e_get_pfn(*pl2e) & ~(L2_PAGETABLE_ENTRIES *
                                              L1_PAGETABLE_ENTRIES - 1);
            for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++, pl2e++ )
                if ( (l2e_get_pfn(*pl2e) !=
                      (base_mfn + (i << PAGETABLE_ORDER))) ||
                     (l2e_get_flags(*pl2e) != l1f_to_lNf(flags)) )
                    break;
            if ( i == L2_PAGETABLE_ENTRIES )
            {
                l3e_write_atomic(pl3e, l3e_from_pfn(base_mfn,
                                                    l1f_to_lNf(flags)));
                flush_area(virt - PAGE_SIZE,
                           FLUSH_TLB_GLOBAL |
                           FLUSH_ORDER(2*PAGETABLE_ORDER));
                free_xen_pagetable(l3e_to_l2e(ol3e));
            }
        }
#endif
    }

    return 0;
}
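
/*
 * Tear down Xen mappings in the address range [s, e).  Superpages that
 * are only partially covered are shattered first; L1/L2 page tables left
 * completely empty afterwards are freed, with a TLB flush before each
 * free.
 */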
void destroy_xen_mappings(unsigned long s, unsigned long e)
{
    l2_pgentry_t *pl2e;
    l1_pgentry_t *pl1e;
    unsigned int  i;
    unsigned long v = s;

    ASSERT((s & ~PAGE_MASK) == 0);
    ASSERT((e & ~PAGE_MASK) == 0);

    while ( v < e )
    {
#ifdef __x86_64__
        l3_pgentry_t *pl3e = virt_to_xen_l3e(v);

        if ( !(l3e_get_flags(*pl3e) & _PAGE_PRESENT) )
        {
            v += 1UL << L3_PAGETABLE_SHIFT;
            v &= ~((1UL << L3_PAGETABLE_SHIFT) - 1);
            continue;
        }

        if ( l3e_get_flags(*pl3e) & _PAGE_PSE )
        {
            if ( l2_table_offset(v) == 0 &&
                 l1_table_offset(v) == 0 &&
                 ((e - v) >= (1UL << L3_PAGETABLE_SHIFT)) )
            {
                /* PAGE1GB: whole superpage is destroyed. */
                l3e_write_atomic(pl3e, l3e_empty());
                v += 1UL << L3_PAGETABLE_SHIFT;
                continue;
            }

            /* PAGE1GB: shatter the superpage and fall through. */
            pl2e = alloc_xen_pagetable();
            for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
                l2e_write(pl2e + i,
                          l2e_from_pfn(l3e_get_pfn(*pl3e) +
                                       (i << PAGETABLE_ORDER),
                                       l3e_get_flags(*pl3e)));
            l3e_write_atomic(pl3e, l3e_from_pfn(virt_to_mfn(pl2e),
                                                __PAGE_HYPERVISOR));
        }
#endif
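
        /*
         * Any 1GB mapping has by now been reduced to 2MB entries.  Handle
         * the L2 entry: skip non-present ranges, destroy or shatter PSE
         * superpages, or clear the individual 4kB entry.
         */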
        pl2e = virt_to_xen_l2e(v);

        if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) )
        {
            v += 1UL << L2_PAGETABLE_SHIFT;
            v &= ~((1UL << L2_PAGETABLE_SHIFT) - 1);
            continue;
        }

        if ( l2e_get_flags(*pl2e) & _PAGE_PSE )
        {
            if ( (l1_table_offset(v) == 0) &&
                 ((e-v) >= (1UL << L2_PAGETABLE_SHIFT)) )
            {
                /* PSE: whole superpage is destroyed. */
                l2e_write_atomic(pl2e, l2e_empty());
                v += 1UL << L2_PAGETABLE_SHIFT;
            }
            else
            {
                /* PSE: shatter the superpage and try again. */
                pl1e = alloc_xen_pagetable();
                for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
                    l1e_write(&pl1e[i],
                              l1e_from_pfn(l2e_get_pfn(*pl2e) + i,
                                           l2e_get_flags(*pl2e) & ~_PAGE_PSE));
                l2e_write_atomic(pl2e, l2e_from_pfn(virt_to_mfn(pl1e),
                                                    __PAGE_HYPERVISOR));
            }
        }
        else
        {
            /* Ordinary 4kB mapping. */
            pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(v);
            l1e_write_atomic(pl1e, l1e_empty());
            v += PAGE_SIZE;

            /* If we are done with the L2E, check if it is now empty. */
            if ( (v != e) && (l1_table_offset(v) != 0) )
                continue;
            pl1e = l2e_to_l1e(*pl2e);
            for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
                if ( l1e_get_intpte(pl1e[i]) != 0 )
                    break;
            if ( i == L1_PAGETABLE_ENTRIES )
            {
                /* Empty: zap the L2E and free the L1 page. */
                l2e_write_atomic(pl2e, l2e_empty());
                flush_area(NULL, FLUSH_TLB_GLOBAL); /* flush before free */
                free_xen_pagetable(pl1e);
            }
        }

#ifdef __x86_64__
        /* If we are done with the L3E, check if it is now empty. */
        if ( (v != e) && (l2_table_offset(v) + l1_table_offset(v) != 0) )
            continue;
        pl2e = l3e_to_l2e(*pl3e);
        for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
            if ( l2e_get_intpte(pl2e[i]) != 0 )
                break;
        if ( i == L2_PAGETABLE_ENTRIES )
        {
            /* Empty: zap the L3E and free the L2 page. */
            l3e_write_atomic(pl3e, l3e_empty());
            flush_area(NULL, FLUSH_TLB_GLOBAL); /* flush before free */
            free_xen_pagetable(pl2e);
        }
#endif
    }

    flush_area(NULL, FLUSH_TLB_GLOBAL);
}
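
/*
 * Install a fixmap entry: map a single frame at the virtual address
 * reserved for the given fixmap slot, with the caller's flags.
 */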
void __set_fixmap(
    enum fixed_addresses idx, unsigned long mfn, unsigned long flags)
{
    BUG_ON(idx >= __end_of_fixed_addresses);
    map_pages_to_xen(fix_to_virt(idx), mfn, 1, flags);
}

#ifdef MEMORY_GUARD
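
/*
 * Memory guarding: map the relevant ranges with 4kB granularity
 * (MAP_SMALL_PAGES) so that individual pages can later be marked
 * not-present; accesses to a guarded range then fault instead of
 * silently corrupting memory.
 */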
void memguard_init(void)
{
    unsigned long start = max_t(unsigned long, xen_phys_start, 1UL << 20);
    map_pages_to_xen(
        (unsigned long)__va(start),
        start >> PAGE_SHIFT,
        (xenheap_phys_end - start) >> PAGE_SHIFT,
        __PAGE_HYPERVISOR|MAP_SMALL_PAGES);
#ifdef __x86_64__
    BUG_ON(start != xen_phys_start);
    map_pages_to_xen(
        XEN_VIRT_START,
        start >> PAGE_SHIFT,
        (__pa(&_end) + PAGE_SIZE - 1 - start) >> PAGE_SHIFT,
        __PAGE_HYPERVISOR|MAP_SMALL_PAGES);
#endif
}
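
/*
 * Remap [p, p+l) with 4kB pages; when guarding, clear _PAGE_PRESENT so
 * that any access to the range faults.
 */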
static void __memguard_change_range(void *p, unsigned long l, int guard)
{
    unsigned long _p = (unsigned long)p;
    unsigned long _l = (unsigned long)l;
    unsigned int flags = __PAGE_HYPERVISOR | MAP_SMALL_PAGES;

    /* Ensure we are dealing with a page-aligned whole number of pages. */
    ASSERT((_p&~PAGE_MASK) == 0);
    ASSERT((_l&~PAGE_MASK) == 0);

    if ( guard )
        flags &= ~_PAGE_PRESENT;

    map_pages_to_xen(
        _p, virt_to_maddr(p) >> PAGE_SHIFT, _l >> PAGE_SHIFT, flags);
}

void memguard_guard_range(void *p, unsigned long l)
{
    __memguard_change_range(p, l, 1);
}

void memguard_unguard_range(void *p, unsigned long l)
{
    __memguard_change_range(p, l, 0);
}

#endif
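
/*
 * Guard one page just below the primary stack area so that a stack
 * overflow faults immediately rather than silently corrupting adjacent
 * memory.  The BUILD_BUG_ON ensures the guard page fits within STACK_SIZE.
 */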
void memguard_guard_stack(void *p)
{
    BUILD_BUG_ON((PRIMARY_STACK_SIZE + PAGE_SIZE) > STACK_SIZE);
    p = (void *)((unsigned long)p + STACK_SIZE -
                 PRIMARY_STACK_SIZE - PAGE_SIZE);
    memguard_guard_range(p, PAGE_SIZE);
}

/*
 * Local variables:
 * mode: C
 * c-set-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */