ia64/xen-unstable

view xen/arch/x86/mm.c @ 17256:6ac99b961bff

x86: stub domains and page ownership for mapping in dom0

In the case of an ioemu stubdomain with SDL or vnc server running in
dom0, we want to have the stubdomain expose the HVM guest's video RAM
through PVFB.

However, to map the pages from dom0, xenfb uses xc_map_foreign_pages
with the stubdomain ID as domid (since that is what is advertised in
the PVFB protocol, and needed for other parts of the protocol), and
thus get_page_from_l1e() complains because the stubdomain is not the
owner of the pages. In that case, this patch checks that the
stubdomain has privileges over the actual owner of the pages, and
then accepts the mapping.

Signed-off-by: Samuel Thibault <samuel.thibault@eu.citrix.com>
author Keir Fraser <keir.fraser@citrix.com>
date Tue Mar 18 16:15:24 2008 +0000 (2008-03-18)
parents f1a107ec62b6
children 4af873c95d0b
1 /******************************************************************************
2 * arch/x86/mm.c
3 *
4 * Copyright (c) 2002-2005 K A Fraser
5 * Copyright (c) 2004 Christian Limpach
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
22 /*
23 * A description of the x86 page table API:
24 *
25 * Domains trap to do_mmu_update with a list of update requests.
26 * This is a list of (ptr, val) pairs, where the requested operation
27 * is *ptr = val.
28 *
29 * Reference counting of pages:
30 * ----------------------------
31 * Each page has two refcounts: tot_count and type_count.
32 *
33 * TOT_COUNT is the obvious reference count. It counts all uses of a
34 * physical page frame by a domain, including uses as a page directory,
35 * a page table, or simple mappings via a PTE. This count prevents a
36 * domain from releasing a frame back to the free pool when it still holds
37 * a reference to it.
38 *
39 * TYPE_COUNT is more subtle. A frame can be put to one of three
40 * mutually-exclusive uses: it might be used as a page directory, or a
41 * page table, or it may be mapped writable by the domain [of course, a
42 * frame may not be used in any of these three ways!].
43 * So, type_count is a count of the number of times a frame is being
44 * referred to in its current incarnation. Therefore, a page can only
45 * change its type when its type count is zero.
46 *
47 * Pinning the page type:
48 * ----------------------
49 * The type of a page can be pinned/unpinned with the commands
50 * MMUEXT_[UN]PIN_L?_TABLE. Each page can be pinned exactly once (that is,
51 * pinning is not reference counted, so it can't be nested).
52 * This is useful to prevent a page's type count falling to zero, at which
53 * point safety checks would need to be carried out next time the count
54 * is increased again.
55 *
56 * A further note on writable page mappings:
57 * -----------------------------------------
58 * For simplicity, the count of writable mappings for a page may not
59 * correspond to reality. The 'writable count' is incremented for every
60 * PTE which maps the page with the _PAGE_RW flag set. However, for
61 * write access to be possible the page directory entry must also have
62 * its _PAGE_RW bit set. We do not check this as it complicates the
63 * reference counting considerably [consider the case of multiple
64 * directory entries referencing a single page table, some with the RW
65 * bit set, others not -- it starts getting a bit messy].
66 * In normal use, this simplification shouldn't be a problem.
67 * However, the logic can be added if required.
68 *
69 * One more note on read-only page mappings:
70 * -----------------------------------------
71 * We want domains to be able to map pages for read-only access. The
72 * main reason is that page tables and directories should be readable
73 * by a domain, but it would not be safe for them to be writable.
74 * However, domains have free access to rings 1 & 2 of the Intel
75 * privilege model. In terms of page protection, these are considered
76 * to be part of 'supervisor mode'. The WP bit in CR0 controls whether
77 * read-only restrictions are respected in supervisor mode -- if the
78 * bit is clear then any mapped page is writable.
79 *
80 * We get round this by always setting the WP bit and disallowing
81 * updates to it. This is very unlikely to cause a problem for guest
82 * OS's, which will generally use the WP bit to simplify copy-on-write
83 * implementation (in that case, OS wants a fault when it writes to
84 * an application-supplied buffer).
85 */
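/*
 * Illustrative sketch only (not part of mm.c): how a PV guest kernel would
 * drive the (ptr, val) interface and the pinning commands described above.
 * Assumes the Linux-style HYPERVISOR_mmu_update()/HYPERVISOR_mmuext_op()
 * wrappers and the public definitions from xen.h; the example_* names are
 * hypothetical helpers, guarded out so they are never compiled here.
 */
#if 0
static void example_write_pte(uint64_t pte_maddr, uint64_t new_val)
{
    struct mmu_update req = {
        .ptr = pte_maddr | MMU_NORMAL_PT_UPDATE,  /* request "*ptr = val" */
        .val = new_val,
    };
    (void)HYPERVISOR_mmu_update(&req, 1, NULL, DOMID_SELF);
}

static void example_pin_l2(xen_pfn_t l2_mfn)
{
    struct mmuext_op op = {
        .cmd      = MMUEXT_PIN_L2_TABLE,  /* hold the type at "L2 page table" */
        .arg1.mfn = l2_mfn,
    };
    (void)HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF);
}
#endif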
87 #include <xen/config.h>
88 #include <xen/init.h>
89 #include <xen/kernel.h>
90 #include <xen/lib.h>
91 #include <xen/mm.h>
92 #include <xen/domain.h>
93 #include <xen/sched.h>
94 #include <xen/errno.h>
95 #include <xen/perfc.h>
96 #include <xen/irq.h>
97 #include <xen/softirq.h>
98 #include <xen/domain_page.h>
99 #include <xen/event.h>
100 #include <xen/iocap.h>
101 #include <xen/guest_access.h>
102 #include <asm/paging.h>
103 #include <asm/shadow.h>
104 #include <asm/page.h>
105 #include <asm/flushtlb.h>
106 #include <asm/io.h>
107 #include <asm/ldt.h>
108 #include <asm/x86_emulate.h>
109 #include <asm/e820.h>
110 #include <asm/hypercall.h>
111 #include <asm/shared.h>
112 #include <public/memory.h>
113 #include <xsm/xsm.h>
114 #include <xen/trace.h>
116 /*
117 * Mapping of first 2 or 4 megabytes of memory. This is mapped with 4kB
118 * mappings to avoid type conflicts with fixed-range MTRRs covering the
119 * lowest megabyte of physical memory. In any case the VGA hole should be
120 * mapped with type UC.
121 */
122 l1_pgentry_t __attribute__ ((__section__ (".bss.page_aligned")))
123 l1_identmap[L1_PAGETABLE_ENTRIES];
125 #define MEM_LOG(_f, _a...) gdprintk(XENLOG_WARNING , _f "\n" , ## _a)
127 /*
128 * PTE updates can be done with ordinary writes except:
129 * 1. Debug builds get extra checking by using CMPXCHG[8B].
130 * 2. PAE builds perform an atomic 8-byte store with CMPXCHG8B.
131 */
132 #if !defined(NDEBUG) || defined(CONFIG_X86_PAE)
133 #define PTE_UPDATE_WITH_CMPXCHG
134 #endif
136 /* Used to defer flushing of memory structures. */
137 struct percpu_mm_info {
138 #define DOP_FLUSH_TLB (1<<0) /* Flush the local TLB. */
139 #define DOP_FLUSH_ALL_TLBS (1<<1) /* Flush TLBs of all VCPUs of current dom. */
140 #define DOP_RELOAD_LDT (1<<2) /* Reload the LDT shadow mapping. */
141 unsigned int deferred_ops;
142 /* If non-NULL, specifies a foreign subject domain for some operations. */
143 struct domain *foreign;
144 };
145 static DEFINE_PER_CPU(struct percpu_mm_info, percpu_mm_info);
147 /*
148 * Returns the current foreign domain; defaults to the currently-executing
149 * domain if a foreign override hasn't been specified.
150 */
151 #define FOREIGNDOM (this_cpu(percpu_mm_info).foreign ?: current->domain)
153 /* Private domain structs for DOMID_XEN and DOMID_IO. */
154 static struct domain *dom_xen, *dom_io;
156 /* Frame table and its size in pages. */
157 struct page_info *frame_table;
158 unsigned long max_page;
159 unsigned long total_pages;
161 #define PAGE_CACHE_ATTRS (_PAGE_PAT|_PAGE_PCD|_PAGE_PWT)
163 #define l1_disallow_mask(d) \
164 ((d != dom_io) && \
165 (rangeset_is_empty((d)->iomem_caps) && \
166 rangeset_is_empty((d)->arch.ioport_caps)) ? \
167 L1_DISALLOW_MASK : (L1_DISALLOW_MASK & ~PAGE_CACHE_ATTRS))
169 #ifdef CONFIG_COMPAT
170 l2_pgentry_t *compat_idle_pg_table_l2 = NULL;
171 #define l3_disallow_mask(d) (!is_pv_32on64_domain(d) ? \
172 L3_DISALLOW_MASK : \
173 COMPAT_L3_DISALLOW_MASK)
174 #else
175 #define l3_disallow_mask(d) L3_DISALLOW_MASK
176 #endif
178 static void queue_deferred_ops(struct domain *d, unsigned int ops)
179 {
180 ASSERT(d == current->domain);
181 this_cpu(percpu_mm_info).deferred_ops |= ops;
182 }
184 void __init init_frametable(void)
185 {
186 unsigned long nr_pages, page_step, i, mfn;
188 frame_table = (struct page_info *)FRAMETABLE_VIRT_START;
190 nr_pages = PFN_UP(max_page * sizeof(*frame_table));
191 page_step = (1 << L2_PAGETABLE_SHIFT) >> PAGE_SHIFT;
193 for ( i = 0; i < nr_pages; i += page_step )
194 {
195 mfn = alloc_boot_pages(min(nr_pages - i, page_step), page_step);
196 if ( mfn == 0 )
197 panic("Not enough memory for frame table\n");
198 map_pages_to_xen(
199 FRAMETABLE_VIRT_START + (i << PAGE_SHIFT),
200 mfn, page_step, PAGE_HYPERVISOR);
201 }
203 memset(frame_table, 0, nr_pages << PAGE_SHIFT);
204 }
206 void __init arch_init_memory(void)
207 {
208 extern void subarch_init_memory(void);
210 unsigned long i, pfn, rstart_pfn, rend_pfn, iostart_pfn, ioend_pfn;
212 /*
213 * Initialise our DOMID_XEN domain.
214 * Any Xen-heap pages that we will allow to be mapped will have
215 * their domain field set to dom_xen.
216 */
217 dom_xen = alloc_domain(DOMID_XEN);
218 BUG_ON(dom_xen == NULL);
220 /*
221 * Initialise our DOMID_IO domain.
222 * This domain owns I/O pages that are within the range of the page_info
223 * array. Mappings occur at the privilege level of the caller.
224 */
225 dom_io = alloc_domain(DOMID_IO);
226 BUG_ON(dom_io == NULL);
228 /* First 1MB of RAM is historically marked as I/O. */
229 for ( i = 0; i < 0x100; i++ )
230 share_xen_page_with_guest(mfn_to_page(i), dom_io, XENSHARE_writable);
232 /* Any areas not specified as RAM by the e820 map are considered I/O. */
233 for ( i = 0, pfn = 0; pfn < max_page; i++ )
234 {
235 while ( (i < e820.nr_map) &&
236 (e820.map[i].type != E820_RAM) &&
237 (e820.map[i].type != E820_UNUSABLE) )
238 i++;
240 if ( i >= e820.nr_map )
241 {
242 /* No more RAM regions: mark as I/O right to end of memory map. */
243 rstart_pfn = rend_pfn = max_page;
244 }
245 else
246 {
247 /* Mark as I/O just up as far as next RAM region. */
248 rstart_pfn = min_t(unsigned long, max_page,
249 PFN_UP(e820.map[i].addr));
250 rend_pfn = max_t(unsigned long, rstart_pfn,
251 PFN_DOWN(e820.map[i].addr + e820.map[i].size));
252 }
254 /*
255 * Make sure any Xen mappings of RAM holes above 1MB are blown away.
256 * In particular this ensures that RAM holes are respected even in
257 * the statically-initialised 1-16MB mapping area.
258 */
259 iostart_pfn = max_t(unsigned long, pfn, 1UL << (20 - PAGE_SHIFT));
260 ioend_pfn = rstart_pfn;
261 #if defined(CONFIG_X86_32)
262 ioend_pfn = min_t(unsigned long, ioend_pfn,
263 DIRECTMAP_MBYTES << (20 - PAGE_SHIFT));
264 #endif
265 if ( iostart_pfn < ioend_pfn )
266 destroy_xen_mappings((unsigned long)mfn_to_virt(iostart_pfn),
267 (unsigned long)mfn_to_virt(ioend_pfn));
269 /* Mark as I/O up to next RAM region. */
270 for ( ; pfn < rstart_pfn; pfn++ )
271 {
272 BUG_ON(!mfn_valid(pfn));
273 share_xen_page_with_guest(
274 mfn_to_page(pfn), dom_io, XENSHARE_writable);
275 }
277 /* Skip the RAM region. */
278 pfn = rend_pfn;
279 }
281 subarch_init_memory();
282 }
284 int memory_is_conventional_ram(paddr_t p)
285 {
286 int i;
288 for ( i = 0; i < e820.nr_map; i++ )
289 {
290 if ( (e820.map[i].type == E820_RAM) &&
291 (e820.map[i].addr <= p) &&
292 (e820.map[i].size > p) )
293 return 1;
294 }
296 return 0;
297 }
299 unsigned long domain_get_maximum_gpfn(struct domain *d)
300 {
301 if ( is_hvm_domain(d) )
302 return d->arch.p2m.max_mapped_pfn;
303 /* NB. PV guests specify nr_pfns rather than max_pfn so we adjust here. */
304 return arch_get_max_pfn(d) - 1;
305 }
307 void share_xen_page_with_guest(
308 struct page_info *page, struct domain *d, int readonly)
309 {
310 if ( page_get_owner(page) == d )
311 return;
313 set_gpfn_from_mfn(page_to_mfn(page), INVALID_M2P_ENTRY);
315 spin_lock(&d->page_alloc_lock);
317 /* The incremented type count pins as writable or read-only. */
318 page->u.inuse.type_info = (readonly ? PGT_none : PGT_writable_page);
319 page->u.inuse.type_info |= PGT_validated | 1;
321 page_set_owner(page, d);
322 wmb(); /* install valid domain ptr before updating refcnt. */
323 ASSERT(page->count_info == 0);
325 /* Only add to the allocation list if the domain isn't dying. */
326 if ( !d->is_dying )
327 {
328 page->count_info |= PGC_allocated | 1;
329 if ( unlikely(d->xenheap_pages++ == 0) )
330 get_knownalive_domain(d);
331 list_add_tail(&page->list, &d->xenpage_list);
332 }
334 spin_unlock(&d->page_alloc_lock);
335 }
337 void share_xen_page_with_privileged_guests(
338 struct page_info *page, int readonly)
339 {
340 share_xen_page_with_guest(page, dom_xen, readonly);
341 }
343 #if defined(CONFIG_X86_PAE)
345 #ifdef NDEBUG
346 /* Only PDPTs above 4GB boundary need to be shadowed in low memory. */
347 #define l3tab_needs_shadow(mfn) ((mfn) >= 0x100000)
348 #else
349 /*
350 * In debug builds we shadow a selection of <4GB PDPTs to exercise code paths.
351 * We cannot safely shadow the idle page table, nor shadow (v1) page tables
352 * (detected by lack of an owning domain). As required for correctness, we
353 * always shadow PDPTs above 4GB.
354 */
355 #define l3tab_needs_shadow(mfn) \
356 (((((mfn) << PAGE_SHIFT) != __pa(idle_pg_table)) && \
357 (page_get_owner(mfn_to_page(mfn)) != NULL) && \
358 ((mfn) & 1)) || /* odd MFNs are shadowed */ \
359 ((mfn) >= 0x100000))
360 #endif
362 static l1_pgentry_t *fix_pae_highmem_pl1e;
364 /* Cache the address of PAE high-memory fixmap page tables. */
365 static int __init cache_pae_fixmap_address(void)
366 {
367 unsigned long fixmap_base = fix_to_virt(FIX_PAE_HIGHMEM_0);
368 l2_pgentry_t *pl2e = virt_to_xen_l2e(fixmap_base);
369 fix_pae_highmem_pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(fixmap_base);
370 return 0;
371 }
372 __initcall(cache_pae_fixmap_address);
374 static DEFINE_PER_CPU(u32, make_cr3_timestamp);
376 void make_cr3(struct vcpu *v, unsigned long mfn)
377 /* Takes the MFN of a PAE l3 table, copies the contents to below 4GB if
378 * necessary, and sets v->arch.cr3 to the value to load in CR3. */
379 {
380 l3_pgentry_t *highmem_l3tab, *lowmem_l3tab;
381 struct pae_l3_cache *cache = &v->arch.pae_l3_cache;
382 unsigned int cpu = smp_processor_id();
384 /* Fast path: does this mfn need a shadow at all? */
385 if ( !l3tab_needs_shadow(mfn) )
386 {
387 v->arch.cr3 = mfn << PAGE_SHIFT;
388 /* Cache is no longer in use or valid */
389 cache->high_mfn = 0;
390 return;
391 }
393 /* Caching logic is not interrupt safe. */
394 ASSERT(!in_irq());
396 /* Protects against pae_flush_pgd(). */
397 spin_lock(&cache->lock);
399 cache->inuse_idx ^= 1;
400 cache->high_mfn = mfn;
402 /* Map the guest L3 table and copy to the chosen low-memory cache. */
403 l1e_write(fix_pae_highmem_pl1e-cpu, l1e_from_pfn(mfn, __PAGE_HYPERVISOR));
404 /* First check the previous high mapping can't be in the TLB.
405 * (i.e. have we loaded CR3 since we last did this?) */
406 if ( unlikely(this_cpu(make_cr3_timestamp) == this_cpu(tlbflush_time)) )
407 flush_tlb_one_local(fix_to_virt(FIX_PAE_HIGHMEM_0 + cpu));
408 highmem_l3tab = (l3_pgentry_t *)fix_to_virt(FIX_PAE_HIGHMEM_0 + cpu);
409 lowmem_l3tab = cache->table[cache->inuse_idx];
410 memcpy(lowmem_l3tab, highmem_l3tab, sizeof(cache->table[0]));
411 l1e_write(fix_pae_highmem_pl1e-cpu, l1e_empty());
412 this_cpu(make_cr3_timestamp) = this_cpu(tlbflush_time);
414 v->arch.cr3 = __pa(lowmem_l3tab);
416 spin_unlock(&cache->lock);
417 }
419 #else /* !CONFIG_X86_PAE */
421 void make_cr3(struct vcpu *v, unsigned long mfn)
422 {
423 v->arch.cr3 = mfn << PAGE_SHIFT;
424 }
426 #endif /* !CONFIG_X86_PAE */
428 void write_ptbase(struct vcpu *v)
429 {
430 write_cr3(v->arch.cr3);
431 }
433 /*
434 * Should be called after CR3 is updated.
435 *
436 * Uses values found in vcpu->arch.(guest_table and guest_table_user), and
437 * for HVM guests, arch.monitor_table and hvm's guest CR3.
438 *
439 * Update ref counts to shadow tables appropriately.
440 */
441 void update_cr3(struct vcpu *v)
442 {
443 unsigned long cr3_mfn=0;
445 if ( paging_mode_enabled(v->domain) )
446 {
447 paging_update_cr3(v);
448 return;
449 }
451 #if CONFIG_PAGING_LEVELS == 4
452 if ( !(v->arch.flags & TF_kernel_mode) )
453 cr3_mfn = pagetable_get_pfn(v->arch.guest_table_user);
454 else
455 #endif
456 cr3_mfn = pagetable_get_pfn(v->arch.guest_table);
458 make_cr3(v, cr3_mfn);
459 }
462 static void invalidate_shadow_ldt(struct vcpu *v)
463 {
464 int i;
465 unsigned long pfn;
466 struct page_info *page;
468 if ( v->arch.shadow_ldt_mapcnt == 0 )
469 return;
471 v->arch.shadow_ldt_mapcnt = 0;
473 for ( i = 16; i < 32; i++ )
474 {
475 pfn = l1e_get_pfn(v->arch.perdomain_ptes[i]);
476 if ( pfn == 0 ) continue;
477 l1e_write(&v->arch.perdomain_ptes[i], l1e_empty());
478 page = mfn_to_page(pfn);
479 ASSERT_PAGE_IS_TYPE(page, PGT_ldt_page);
480 ASSERT_PAGE_IS_DOMAIN(page, v->domain);
481 put_page_and_type(page);
482 }
484 /* Dispose of the (now possibly invalid) mappings from the TLB. */
485 if ( v == current )
486 queue_deferred_ops(v->domain, DOP_FLUSH_TLB | DOP_RELOAD_LDT);
487 else
488 flush_tlb_mask(v->domain->domain_dirty_cpumask);
489 }
492 static int alloc_segdesc_page(struct page_info *page)
493 {
494 struct desc_struct *descs;
495 int i;
497 descs = map_domain_page(page_to_mfn(page));
499 for ( i = 0; i < 512; i++ )
500 if ( unlikely(!check_descriptor(page_get_owner(page), &descs[i])) )
501 goto fail;
503 unmap_domain_page(descs);
504 return 1;
506 fail:
507 unmap_domain_page(descs);
508 return 0;
509 }
512 /* Map shadow page at offset @off. */
513 int map_ldt_shadow_page(unsigned int off)
514 {
515 struct vcpu *v = current;
516 struct domain *d = v->domain;
517 unsigned long gmfn, mfn;
518 l1_pgentry_t l1e, nl1e;
519 unsigned long gva = v->arch.guest_context.ldt_base + (off << PAGE_SHIFT);
520 int okay;
522 BUG_ON(unlikely(in_irq()));
524 guest_get_eff_kern_l1e(v, gva, &l1e);
525 if ( unlikely(!(l1e_get_flags(l1e) & _PAGE_PRESENT)) )
526 return 0;
528 gmfn = l1e_get_pfn(l1e);
529 mfn = gmfn_to_mfn(d, gmfn);
530 if ( unlikely(!mfn_valid(mfn)) )
531 return 0;
533 okay = get_page_and_type(mfn_to_page(mfn), d, PGT_ldt_page);
534 if ( unlikely(!okay) )
535 return 0;
537 nl1e = l1e_from_pfn(mfn, l1e_get_flags(l1e) | _PAGE_RW);
539 l1e_write(&v->arch.perdomain_ptes[off + 16], nl1e);
540 v->arch.shadow_ldt_mapcnt++;
542 return 1;
543 }
546 static int get_page_from_pagenr(unsigned long page_nr, struct domain *d)
547 {
548 struct page_info *page = mfn_to_page(page_nr);
550 if ( unlikely(!mfn_valid(page_nr)) || unlikely(!get_page(page, d)) )
551 {
552 MEM_LOG("Could not get page ref for pfn %lx", page_nr);
553 return 0;
554 }
556 return 1;
557 }
560 static int get_page_and_type_from_pagenr(unsigned long page_nr,
561 unsigned long type,
562 struct domain *d)
563 {
564 struct page_info *page = mfn_to_page(page_nr);
566 if ( unlikely(!get_page_from_pagenr(page_nr, d)) )
567 return 0;
569 if ( unlikely(!get_page_type(page, type)) )
570 {
571 put_page(page);
572 return 0;
573 }
575 return 1;
576 }
578 /*
579 * We allow root tables to map each other (a.k.a. linear page tables). It
580 * needs some special care with reference counts and access permissions:
581 * 1. The mapping entry must be read-only, or the guest may get write access
582 * to its own PTEs.
583 * 2. We must only bump the reference counts for an *already validated*
584 * L2 table, or we can end up in a deadlock in get_page_type() by waiting
585 * on a validation that is required to complete that validation.
586 * 3. We only need to increment the reference counts for the mapped page
587 * frame if it is mapped by a different root table. This is sufficient and
588 * also necessary to allow validation of a root table mapping itself.
589 */
590 #define define_get_linear_pagetable(level) \
591 static int \
592 get_##level##_linear_pagetable( \
593 level##_pgentry_t pde, unsigned long pde_pfn, struct domain *d) \
594 { \
595 unsigned long x, y; \
596 struct page_info *page; \
597 unsigned long pfn; \
598 \
599 if ( (level##e_get_flags(pde) & _PAGE_RW) ) \
600 { \
601 MEM_LOG("Attempt to create linear p.t. with write perms"); \
602 return 0; \
603 } \
604 \
605 if ( (pfn = level##e_get_pfn(pde)) != pde_pfn ) \
606 { \
607 /* Make sure the mapped frame belongs to the correct domain. */ \
608 if ( unlikely(!get_page_from_pagenr(pfn, d)) ) \
609 return 0; \
610 \
611 /* \
612 * Ensure that the mapped frame is an already-validated page table. \
613 * If so, atomically increment the count (checking for overflow). \
614 */ \
615 page = mfn_to_page(pfn); \
616 y = page->u.inuse.type_info; \
617 do { \
618 x = y; \
619 if ( unlikely((x & PGT_count_mask) == PGT_count_mask) || \
620 unlikely((x & (PGT_type_mask|PGT_validated)) != \
621 (PGT_##level##_page_table|PGT_validated)) ) \
622 { \
623 put_page(page); \
624 return 0; \
625 } \
626 } \
627 while ( (y = cmpxchg(&page->u.inuse.type_info, x, x + 1)) != x ); \
628 } \
629 \
630 return 1; \
631 }
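/*
 * Illustrative sketch only: the kind of guest request that the checks above
 * are designed to admit -- a read-only "linear" (self-referencing) mapping
 * of the guest's own top-level page table.  Assumes the same Linux-style
 * hypercall wrapper as the earlier sketch; example_* is a hypothetical
 * helper, and the _PAGE_* flag names stand in for the guest's own copies.
 */
#if 0
static void example_make_linear_mapping(uint64_t l4_maddr, unsigned int slot)
{
    struct mmu_update req = {
        /* Machine address of the L4 entry to rewrite. */
        .ptr = (l4_maddr + slot * sizeof(uint64_t)) | MMU_NORMAL_PT_UPDATE,
        /* Point it back at the L4 itself; _PAGE_RW must be clear (rule 1). */
        .val = l4_maddr | _PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED,
    };
    (void)HYPERVISOR_mmu_update(&req, 1, NULL, DOMID_SELF);
}
#endif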
634 int is_iomem_page(unsigned long mfn)
635 {
636 return (!mfn_valid(mfn) || (page_get_owner(mfn_to_page(mfn)) == dom_io));
637 }
640 int
641 get_page_from_l1e(
642 l1_pgentry_t l1e, struct domain *d)
643 {
644 unsigned long mfn = l1e_get_pfn(l1e);
645 struct page_info *page = mfn_to_page(mfn);
646 uint32_t l1f = l1e_get_flags(l1e);
647 struct vcpu *curr = current;
648 struct domain *owner = page_get_owner(page);
649 int okay;
651 if ( !(l1f & _PAGE_PRESENT) )
652 return 1;
654 if ( unlikely(l1f & l1_disallow_mask(d)) )
655 {
656 MEM_LOG("Bad L1 flags %x", l1f & l1_disallow_mask(d));
657 return 0;
658 }
660 if ( is_iomem_page(mfn) )
661 {
662 /* DOMID_IO reverts to caller for privilege checks. */
663 if ( d == dom_io )
664 d = curr->domain;
666 if ( !iomem_access_permitted(d, mfn, mfn) )
667 {
668 if ( mfn != (PADDR_MASK >> PAGE_SHIFT) ) /* INVALID_MFN? */
669 MEM_LOG("Non-privileged (%u) attempt to map I/O space %08lx",
670 d->domain_id, mfn);
671 return 0;
672 }
674 return 1;
675 }
677 /*
678 * Let privileged domains transfer the right to map their target
679 * domain's pages. This is used to allow stub-domain pvfb export to dom0,
680 * until pvfb supports granted mappings. At that time this minor hack
681 * can go away.
682 */
683 if ( unlikely(d != owner) && (owner != NULL) &&
684 (d != curr->domain) && IS_PRIV_FOR(d, owner) )
685 d = owner;
687 /* Foreign mappings into guests in shadow external mode don't
688 * contribute to writeable mapping refcounts. (This allows the
689 * qemu-dm helper process in dom0 to map the domain's memory without
690 * messing up the count of "real" writable mappings.) */
691 okay = (((l1f & _PAGE_RW) &&
692 !(unlikely(paging_mode_external(d) && (d != curr->domain))))
693 ? get_page_and_type(page, d, PGT_writable_page)
694 : get_page(page, d));
695 if ( !okay )
696 {
697 MEM_LOG("Error getting mfn %lx (pfn %lx) from L1 entry %" PRIpte
698 " for dom%d",
699 mfn, get_gpfn_from_mfn(mfn),
700 l1e_get_intpte(l1e), d->domain_id);
701 }
702 else if ( pte_flags_to_cacheattr(l1f) !=
703 ((page->count_info >> PGC_cacheattr_base) & 7) )
704 {
705 uint32_t x, nx, y = page->count_info;
706 uint32_t cacheattr = pte_flags_to_cacheattr(l1f);
708 if ( is_xen_heap_page(page) )
709 {
710 if ( (l1f & _PAGE_RW) &&
711 !(unlikely(paging_mode_external(d) &&
712 (d != curr->domain))) )
713 put_page_type(page);
714 put_page(page);
715 MEM_LOG("Attempt to change cache attributes of Xen heap page");
716 return 0;
717 }
719 while ( ((y >> PGC_cacheattr_base) & 7) != cacheattr )
720 {
721 x = y;
722 nx = (x & ~PGC_cacheattr_mask) | (cacheattr << PGC_cacheattr_base);
723 y = cmpxchg(&page->count_info, x, nx);
724 }
726 #ifdef __x86_64__
727 map_pages_to_xen((unsigned long)mfn_to_virt(mfn), mfn, 1,
728 PAGE_HYPERVISOR | cacheattr_to_pte_flags(cacheattr));
729 #endif
730 }
732 return okay;
733 }
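/*
 * Illustrative sketch only: the dom0-side call that the ownership relaxation
 * above exists to support.  xenfb in dom0 maps the HVM guest's video RAM
 * using the *stub domain's* ID (as advertised in the PVFB protocol), even
 * though the frames are owned by the HVM guest; the IS_PRIV_FOR() check lets
 * this through.  Assumes the xc_map_foreign_pages() signature of this era's
 * libxc; example_* is a hypothetical helper.
 */
#if 0
#include <sys/mman.h>
#include <xenctrl.h>

static void *example_map_stubdom_fb(int xc_handle, uint32_t stubdom_id,
                                    const xen_pfn_t *fb_mfns, int num)
{
    return xc_map_foreign_pages(xc_handle, stubdom_id,
                                PROT_READ | PROT_WRITE, fb_mfns, num);
}
#endif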
736 /* NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. */
737 define_get_linear_pagetable(l2);
738 static int
739 get_page_from_l2e(
740 l2_pgentry_t l2e, unsigned long pfn, struct domain *d)
741 {
742 int rc;
744 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
745 return 1;
747 if ( unlikely((l2e_get_flags(l2e) & L2_DISALLOW_MASK)) )
748 {
749 MEM_LOG("Bad L2 flags %x", l2e_get_flags(l2e) & L2_DISALLOW_MASK);
750 return 0;
751 }
753 rc = get_page_and_type_from_pagenr(l2e_get_pfn(l2e), PGT_l1_page_table, d);
754 if ( unlikely(!rc) )
755 rc = get_l2_linear_pagetable(l2e, pfn, d);
757 return rc;
758 }
761 #if CONFIG_PAGING_LEVELS >= 3
762 define_get_linear_pagetable(l3);
763 static int
764 get_page_from_l3e(
765 l3_pgentry_t l3e, unsigned long pfn, struct domain *d)
766 {
767 int rc;
769 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
770 return 1;
772 if ( unlikely((l3e_get_flags(l3e) & l3_disallow_mask(d))) )
773 {
774 MEM_LOG("Bad L3 flags %x", l3e_get_flags(l3e) & l3_disallow_mask(d));
775 return 0;
776 }
778 rc = get_page_and_type_from_pagenr(l3e_get_pfn(l3e), PGT_l2_page_table, d);
779 if ( unlikely(!rc) )
780 rc = get_l3_linear_pagetable(l3e, pfn, d);
782 return rc;
783 }
784 #endif /* 3 level */
786 #if CONFIG_PAGING_LEVELS >= 4
787 define_get_linear_pagetable(l4);
788 static int
789 get_page_from_l4e(
790 l4_pgentry_t l4e, unsigned long pfn, struct domain *d)
791 {
792 int rc;
794 if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
795 return 1;
797 if ( unlikely((l4e_get_flags(l4e) & L4_DISALLOW_MASK)) )
798 {
799 MEM_LOG("Bad L4 flags %x", l4e_get_flags(l4e) & L4_DISALLOW_MASK);
800 return 0;
801 }
803 rc = get_page_and_type_from_pagenr(l4e_get_pfn(l4e), PGT_l3_page_table, d);
804 if ( unlikely(!rc) )
805 rc = get_l4_linear_pagetable(l4e, pfn, d);
807 return rc;
808 }
809 #endif /* 4 level */
811 #ifdef __x86_64__
813 #ifdef USER_MAPPINGS_ARE_GLOBAL
814 #define adjust_guest_l1e(pl1e, d) \
815 do { \
816 if ( likely(l1e_get_flags((pl1e)) & _PAGE_PRESENT) && \
817 likely(!is_pv_32on64_domain(d)) ) \
818 { \
819 /* _PAGE_GUEST_KERNEL page cannot have the Global bit set. */ \
820 if ( (l1e_get_flags((pl1e)) & (_PAGE_GUEST_KERNEL|_PAGE_GLOBAL)) \
821 == (_PAGE_GUEST_KERNEL|_PAGE_GLOBAL) ) \
822 MEM_LOG("Global bit is set to kernel page %lx", \
823 l1e_get_pfn((pl1e))); \
824 if ( !(l1e_get_flags((pl1e)) & _PAGE_USER) ) \
825 l1e_add_flags((pl1e), (_PAGE_GUEST_KERNEL|_PAGE_USER)); \
826 if ( !(l1e_get_flags((pl1e)) & _PAGE_GUEST_KERNEL) ) \
827 l1e_add_flags((pl1e), (_PAGE_GLOBAL|_PAGE_USER)); \
828 } \
829 } while ( 0 )
830 #else
831 #define adjust_guest_l1e(pl1e, d) \
832 do { \
833 if ( likely(l1e_get_flags((pl1e)) & _PAGE_PRESENT) && \
834 likely(!is_pv_32on64_domain(d)) ) \
835 l1e_add_flags((pl1e), _PAGE_USER); \
836 } while ( 0 )
837 #endif
839 #define adjust_guest_l2e(pl2e, d) \
840 do { \
841 if ( likely(l2e_get_flags((pl2e)) & _PAGE_PRESENT) && \
842 likely(!is_pv_32on64_domain(d)) ) \
843 l2e_add_flags((pl2e), _PAGE_USER); \
844 } while ( 0 )
846 #define adjust_guest_l3e(pl3e, d) \
847 do { \
848 if ( likely(l3e_get_flags((pl3e)) & _PAGE_PRESENT) ) \
849 l3e_add_flags((pl3e), likely(!is_pv_32on64_domain(d)) ? \
850 _PAGE_USER : \
851 _PAGE_USER|_PAGE_RW); \
852 } while ( 0 )
854 #define adjust_guest_l4e(pl4e, d) \
855 do { \
856 if ( likely(l4e_get_flags((pl4e)) & _PAGE_PRESENT) && \
857 likely(!is_pv_32on64_domain(d)) ) \
858 l4e_add_flags((pl4e), _PAGE_USER); \
859 } while ( 0 )
861 #else /* !defined(__x86_64__) */
863 #define adjust_guest_l1e(_p, _d) ((void)(_d))
864 #define adjust_guest_l2e(_p, _d) ((void)(_d))
865 #define adjust_guest_l3e(_p, _d) ((void)(_d))
867 #endif
869 #ifdef CONFIG_COMPAT
870 #define unadjust_guest_l3e(pl3e, d) \
871 do { \
872 if ( unlikely(is_pv_32on64_domain(d)) && \
873 likely(l3e_get_flags((pl3e)) & _PAGE_PRESENT) ) \
874 l3e_remove_flags((pl3e), _PAGE_USER|_PAGE_RW|_PAGE_ACCESSED); \
875 } while ( 0 )
876 #else
877 #define unadjust_guest_l3e(_p, _d) ((void)(_d))
878 #endif
880 void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d)
881 {
882 unsigned long pfn = l1e_get_pfn(l1e);
883 struct page_info *page;
884 struct domain *e;
885 struct vcpu *v;
887 if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) || is_iomem_page(pfn) )
888 return;
890 page = mfn_to_page(pfn);
892 e = page_get_owner(page);
894 /*
895 * Check if this is a mapping that was established via a grant reference.
896 * If it was then we should not be here: we require that such mappings are
897 * explicitly destroyed via the grant-table interface.
898 *
899 * The upshot of this is that the guest can end up with active grants that
900 * it cannot destroy (because it no longer has a PTE to present to the
901 * grant-table interface). This can lead to subtle hard-to-catch bugs,
902 * hence a special grant PTE flag can be enabled to catch the bug early.
903 *
904 * (Note that the undestroyable active grants are not a security hole in
905 * Xen. All active grants can safely be cleaned up when the domain dies.)
906 */
907 if ( (l1e_get_flags(l1e) & _PAGE_GNTTAB) &&
908 !d->is_shutting_down && !d->is_dying )
909 {
910 MEM_LOG("Attempt to implicitly unmap a granted PTE %" PRIpte,
911 l1e_get_intpte(l1e));
912 domain_crash(d);
913 }
915 /* Remember we didn't take a type-count of foreign writable mappings
916 * to paging-external domains */
917 if ( (l1e_get_flags(l1e) & _PAGE_RW) &&
918 !(unlikely((e != d) && paging_mode_external(e))) )
919 {
920 put_page_and_type(page);
921 }
922 else
923 {
924 /* We expect this is rare so we blow the entire shadow LDT. */
925 if ( unlikely(((page->u.inuse.type_info & PGT_type_mask) ==
926 PGT_ldt_page)) &&
927 unlikely(((page->u.inuse.type_info & PGT_count_mask) != 0)) &&
928 (d == e) )
929 {
930 for_each_vcpu ( d, v )
931 invalidate_shadow_ldt(v);
932 }
933 put_page(page);
934 }
935 }
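/*
 * Illustrative sketch only: the explicit teardown that the _PAGE_GNTTAB check
 * above is trying to enforce.  A guest that mapped a page through the
 * grant-table interface should release it with GNTTABOP_unmap_grant_ref
 * rather than by zapping the PTE directly.  Assumes the Linux-style hypercall
 * wrapper; example_* is a hypothetical helper.
 */
#if 0
static void example_unmap_grant(uint64_t host_addr, grant_handle_t handle)
{
    struct gnttab_unmap_grant_ref op = {
        .host_addr    = host_addr,  /* virtual address the grant was mapped at */
        .dev_bus_addr = 0,
        .handle       = handle,     /* returned by GNTTABOP_map_grant_ref */
    };
    (void)HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1);
}
#endif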
938 /*
939 * NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'.
940 * Note also that this automatically deals correctly with linear p.t.'s.
941 */
942 static void put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
943 {
944 if ( (l2e_get_flags(l2e) & _PAGE_PRESENT) &&
945 (l2e_get_pfn(l2e) != pfn) )
946 put_page_and_type(l2e_get_page(l2e));
947 }
950 #if CONFIG_PAGING_LEVELS >= 3
951 static void put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn)
952 {
953 if ( (l3e_get_flags(l3e) & _PAGE_PRESENT) &&
954 (l3e_get_pfn(l3e) != pfn) )
955 put_page_and_type(l3e_get_page(l3e));
956 }
957 #endif
959 #if CONFIG_PAGING_LEVELS >= 4
960 static void put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn)
961 {
962 if ( (l4e_get_flags(l4e) & _PAGE_PRESENT) &&
963 (l4e_get_pfn(l4e) != pfn) )
964 put_page_and_type(l4e_get_page(l4e));
965 }
966 #endif
968 static int alloc_l1_table(struct page_info *page)
969 {
970 struct domain *d = page_get_owner(page);
971 unsigned long pfn = page_to_mfn(page);
972 l1_pgentry_t *pl1e;
973 int i;
975 pl1e = map_domain_page(pfn);
977 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
978 {
979 if ( is_guest_l1_slot(i) &&
980 unlikely(!get_page_from_l1e(pl1e[i], d)) )
981 goto fail;
983 adjust_guest_l1e(pl1e[i], d);
984 }
986 unmap_domain_page(pl1e);
987 return 1;
989 fail:
990 MEM_LOG("Failure in alloc_l1_table: entry %d", i);
991 while ( i-- > 0 )
992 if ( is_guest_l1_slot(i) )
993 put_page_from_l1e(pl1e[i], d);
995 unmap_domain_page(pl1e);
996 return 0;
997 }
999 #if defined(CONFIG_X86_PAE) || defined(CONFIG_COMPAT)
1000 static int create_pae_xen_mappings(struct domain *d, l3_pgentry_t *pl3e)
1002 struct page_info *page;
1003 l2_pgentry_t *pl2e;
1004 l3_pgentry_t l3e3;
1005 #ifndef CONFIG_COMPAT
1006 l2_pgentry_t l2e;
1007 int i;
1008 #endif
1010 if ( !is_pv_32bit_domain(d) )
1011 return 1;
1013 pl3e = (l3_pgentry_t *)((unsigned long)pl3e & PAGE_MASK);
1015 /* 3rd L3 slot contains L2 with Xen-private mappings. It *must* exist. */
1016 l3e3 = pl3e[3];
1017 if ( !(l3e_get_flags(l3e3) & _PAGE_PRESENT) )
1019 MEM_LOG("PAE L3 3rd slot is empty");
1020 return 0;
1023 /*
1024 * The Xen-private mappings include linear mappings. The L2 thus cannot
1025 * be shared by multiple L3 tables. The test here is adequate because:
1026 * 1. Cannot appear in slots != 3 because get_page_type() checks the
1027 * PGT_pae_xen_l2 flag, which is asserted iff the L2 appears in slot 3
1028 * 2. Cannot appear in another page table's L3:
1029 * a. alloc_l3_table() calls this function and this check will fail
1030 * b. mod_l3_entry() disallows updates to slot 3 in an existing table
1031 */
1032 page = l3e_get_page(l3e3);
1033 BUG_ON(page->u.inuse.type_info & PGT_pinned);
1034 BUG_ON((page->u.inuse.type_info & PGT_count_mask) == 0);
1035 BUG_ON(!(page->u.inuse.type_info & PGT_pae_xen_l2));
1036 if ( (page->u.inuse.type_info & PGT_count_mask) != 1 )
1038 MEM_LOG("PAE L3 3rd slot is shared");
1039 return 0;
1042 /* Xen private mappings. */
1043 pl2e = map_domain_page(l3e_get_pfn(l3e3));
1044 #ifndef CONFIG_COMPAT
1045 memcpy(&pl2e[L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1)],
1046 &idle_pg_table_l2[L2_PAGETABLE_FIRST_XEN_SLOT],
1047 L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
1048 for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
1050 l2e = l2e_from_page(
1051 virt_to_page(page_get_owner(page)->arch.mm_perdomain_pt) + i,
1052 __PAGE_HYPERVISOR);
1053 l2e_write(&pl2e[l2_table_offset(PERDOMAIN_VIRT_START) + i], l2e);
1055 for ( i = 0; i < (LINEARPT_MBYTES >> (L2_PAGETABLE_SHIFT - 20)); i++ )
1057 l2e = l2e_empty();
1058 if ( l3e_get_flags(pl3e[i]) & _PAGE_PRESENT )
1059 l2e = l2e_from_pfn(l3e_get_pfn(pl3e[i]), __PAGE_HYPERVISOR);
1060 l2e_write(&pl2e[l2_table_offset(LINEAR_PT_VIRT_START) + i], l2e);
1062 #else
1063 memcpy(&pl2e[COMPAT_L2_PAGETABLE_FIRST_XEN_SLOT(d)],
1064 &compat_idle_pg_table_l2[
1065 l2_table_offset(HIRO_COMPAT_MPT_VIRT_START)],
1066 COMPAT_L2_PAGETABLE_XEN_SLOTS(d) * sizeof(*pl2e));
1067 #endif
1068 unmap_domain_page(pl2e);
1070 return 1;
1072 #else
1073 # define create_pae_xen_mappings(d, pl3e) (1)
1074 #endif
1076 #ifdef CONFIG_X86_PAE
1077 /* Flush a pgdir update into low-memory caches. */
1078 static void pae_flush_pgd(
1079 unsigned long mfn, unsigned int idx, l3_pgentry_t nl3e)
1081 struct domain *d = page_get_owner(mfn_to_page(mfn));
1082 struct vcpu *v;
1083 intpte_t _ol3e, _nl3e, _pl3e;
1084 l3_pgentry_t *l3tab_ptr;
1085 struct pae_l3_cache *cache;
1087 if ( unlikely(shadow_mode_enabled(d)) )
1089 cpumask_t m = CPU_MASK_NONE;
1090 /* Re-shadow this l3 table on any vcpus that are using it */
1091 for_each_vcpu ( d, v )
1092 if ( pagetable_get_pfn(v->arch.guest_table) == mfn )
1094 paging_update_cr3(v);
1095 cpus_or(m, m, v->vcpu_dirty_cpumask);
1097 flush_tlb_mask(m);
1100 /* If below 4GB then the pgdir is not shadowed in low memory. */
1101 if ( !l3tab_needs_shadow(mfn) )
1102 return;
1104 for_each_vcpu ( d, v )
1106 cache = &v->arch.pae_l3_cache;
1108 spin_lock(&cache->lock);
1110 if ( cache->high_mfn == mfn )
1112 l3tab_ptr = &cache->table[cache->inuse_idx][idx];
1113 _ol3e = l3e_get_intpte(*l3tab_ptr);
1114 _nl3e = l3e_get_intpte(nl3e);
1115 _pl3e = cmpxchg(&l3e_get_intpte(*l3tab_ptr), _ol3e, _nl3e);
1116 BUG_ON(_pl3e != _ol3e);
1119 spin_unlock(&cache->lock);
1122 flush_tlb_mask(d->domain_dirty_cpumask);
1124 #else
1125 # define pae_flush_pgd(mfn, idx, nl3e) ((void)0)
1126 #endif
1128 static int alloc_l2_table(struct page_info *page, unsigned long type)
1130 struct domain *d = page_get_owner(page);
1131 unsigned long pfn = page_to_mfn(page);
1132 l2_pgentry_t *pl2e;
1133 int i;
1135 pl2e = map_domain_page(pfn);
1137 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
1139 if ( is_guest_l2_slot(d, type, i) &&
1140 unlikely(!get_page_from_l2e(pl2e[i], pfn, d)) )
1141 goto fail;
1143 adjust_guest_l2e(pl2e[i], d);
1146 #if CONFIG_PAGING_LEVELS == 2
1147 /* Xen private mappings. */
1148 memcpy(&pl2e[L2_PAGETABLE_FIRST_XEN_SLOT],
1149 &idle_pg_table[L2_PAGETABLE_FIRST_XEN_SLOT],
1150 L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
1151 pl2e[l2_table_offset(LINEAR_PT_VIRT_START)] =
1152 l2e_from_pfn(pfn, __PAGE_HYPERVISOR);
1153 for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
1154 pl2e[l2_table_offset(PERDOMAIN_VIRT_START) + i] =
1155 l2e_from_page(
1156 virt_to_page(page_get_owner(page)->arch.mm_perdomain_pt) + i,
1157 __PAGE_HYPERVISOR);
1158 #endif
1160 unmap_domain_page(pl2e);
1161 return 1;
1163 fail:
1164 MEM_LOG("Failure in alloc_l2_table: entry %d", i);
1165 while ( i-- > 0 )
1166 if ( is_guest_l2_slot(d, type, i) )
1167 put_page_from_l2e(pl2e[i], pfn);
1169 unmap_domain_page(pl2e);
1170 return 0;
1174 #if CONFIG_PAGING_LEVELS >= 3
1175 static int alloc_l3_table(struct page_info *page)
1177 struct domain *d = page_get_owner(page);
1178 unsigned long pfn = page_to_mfn(page);
1179 l3_pgentry_t *pl3e;
1180 int i;
1182 #ifdef CONFIG_X86_PAE
1183 /*
1184 * PAE pgdirs above 4GB are unacceptable if the guest does not understand
1185 * the weird 'extended cr3' format for dealing with high-order address
1186 * bits. We cut some slack for control tools (before vcpu0 is initialised).
1187 */
1188 if ( (pfn >= 0x100000) &&
1189 unlikely(!VM_ASSIST(d, VMASST_TYPE_pae_extended_cr3)) &&
1190 d->vcpu[0] && d->vcpu[0]->is_initialised )
1192 MEM_LOG("PAE pgd must be below 4GB (0x%lx >= 0x100000)", pfn);
1193 return 0;
1195 #endif
1197 pl3e = map_domain_page(pfn);
1199 /*
1200 * PAE guests allocate full pages, but aren't required to initialize
1201 * more than the first four entries; when running in compatibility
1202 * mode, however, the full page is visible to the MMU, and hence all
1203 * 512 entries must be valid/verified, which is most easily achieved
1204 * by clearing them out.
1205 */
1206 if ( is_pv_32on64_domain(d) )
1207 memset(pl3e + 4, 0, (L3_PAGETABLE_ENTRIES - 4) * sizeof(*pl3e));
1209 for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
1211 #if defined(CONFIG_X86_PAE) || defined(CONFIG_COMPAT)
1212 if ( is_pv_32bit_domain(d) && (i == 3) )
1214 if ( !(l3e_get_flags(pl3e[i]) & _PAGE_PRESENT) ||
1215 (l3e_get_flags(pl3e[i]) & l3_disallow_mask(d)) ||
1216 !get_page_and_type_from_pagenr(l3e_get_pfn(pl3e[i]),
1217 PGT_l2_page_table |
1218 PGT_pae_xen_l2,
1219 d) )
1220 goto fail;
1222 else
1223 #endif
1224 if ( is_guest_l3_slot(i) &&
1225 unlikely(!get_page_from_l3e(pl3e[i], pfn, d)) )
1226 goto fail;
1228 adjust_guest_l3e(pl3e[i], d);
1231 if ( !create_pae_xen_mappings(d, pl3e) )
1232 goto fail;
1234 unmap_domain_page(pl3e);
1235 return 1;
1237 fail:
1238 MEM_LOG("Failure in alloc_l3_table: entry %d", i);
1239 while ( i-- > 0 )
1240 if ( is_guest_l3_slot(i) )
1241 put_page_from_l3e(pl3e[i], pfn);
1243 unmap_domain_page(pl3e);
1244 return 0;
1246 #else
1247 #define alloc_l3_table(page) (0)
1248 #endif
1250 #if CONFIG_PAGING_LEVELS >= 4
1251 static int alloc_l4_table(struct page_info *page)
1253 struct domain *d = page_get_owner(page);
1254 unsigned long pfn = page_to_mfn(page);
1255 l4_pgentry_t *pl4e = page_to_virt(page);
1256 int i;
1258 for ( i = 0; i < L4_PAGETABLE_ENTRIES; i++ )
1260 if ( is_guest_l4_slot(d, i) &&
1261 unlikely(!get_page_from_l4e(pl4e[i], pfn, d)) )
1262 goto fail;
1264 adjust_guest_l4e(pl4e[i], d);
1267 /* Xen private mappings. */
1268 memcpy(&pl4e[ROOT_PAGETABLE_FIRST_XEN_SLOT],
1269 &idle_pg_table[ROOT_PAGETABLE_FIRST_XEN_SLOT],
1270 ROOT_PAGETABLE_XEN_SLOTS * sizeof(l4_pgentry_t));
1271 pl4e[l4_table_offset(LINEAR_PT_VIRT_START)] =
1272 l4e_from_pfn(pfn, __PAGE_HYPERVISOR);
1273 pl4e[l4_table_offset(PERDOMAIN_VIRT_START)] =
1274 l4e_from_page(virt_to_page(d->arch.mm_perdomain_l3),
1275 __PAGE_HYPERVISOR);
1276 if ( is_pv_32on64_domain(d) )
1277 pl4e[l4_table_offset(COMPAT_ARG_XLAT_VIRT_BASE)] =
1278 l4e_from_page(virt_to_page(d->arch.mm_arg_xlat_l3),
1279 __PAGE_HYPERVISOR);
1281 return 1;
1283 fail:
1284 MEM_LOG("Failure in alloc_l4_table: entry %d", i);
1285 while ( i-- > 0 )
1286 if ( is_guest_l4_slot(d, i) )
1287 put_page_from_l4e(pl4e[i], pfn);
1289 return 0;
1291 #else
1292 #define alloc_l4_table(page) (0)
1293 #endif
1296 static void free_l1_table(struct page_info *page)
1297 {
1298 struct domain *d = page_get_owner(page);
1299 unsigned long pfn = page_to_mfn(page);
1300 l1_pgentry_t *pl1e;
1301 int i;
1303 pl1e = map_domain_page(pfn);
1305 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
1306 if ( is_guest_l1_slot(i) )
1307 put_page_from_l1e(pl1e[i], d);
1309 unmap_domain_page(pl1e);
1310 }
1313 static void free_l2_table(struct page_info *page)
1314 {
1315 #ifdef CONFIG_COMPAT
1316 struct domain *d = page_get_owner(page);
1317 #endif
1318 unsigned long pfn = page_to_mfn(page);
1319 l2_pgentry_t *pl2e;
1320 int i;
1322 pl2e = map_domain_page(pfn);
1324 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
1325 if ( is_guest_l2_slot(d, page->u.inuse.type_info, i) )
1326 put_page_from_l2e(pl2e[i], pfn);
1328 unmap_domain_page(pl2e);
1330 page->u.inuse.type_info &= ~PGT_pae_xen_l2;
1331 }
1334 #if CONFIG_PAGING_LEVELS >= 3
1336 static void free_l3_table(struct page_info *page)
1337 {
1338 struct domain *d = page_get_owner(page);
1339 unsigned long pfn = page_to_mfn(page);
1340 l3_pgentry_t *pl3e;
1341 int i;
1343 pl3e = map_domain_page(pfn);
1345 for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
1346 if ( is_guest_l3_slot(i) )
1347 {
1348 put_page_from_l3e(pl3e[i], pfn);
1349 unadjust_guest_l3e(pl3e[i], d);
1350 }
1352 unmap_domain_page(pl3e);
1353 }
1355 #endif
1357 #if CONFIG_PAGING_LEVELS >= 4
1359 static void free_l4_table(struct page_info *page)
1360 {
1361 struct domain *d = page_get_owner(page);
1362 unsigned long pfn = page_to_mfn(page);
1363 l4_pgentry_t *pl4e = page_to_virt(page);
1364 int i;
1366 for ( i = 0; i < L4_PAGETABLE_ENTRIES; i++ )
1367 if ( is_guest_l4_slot(d, i) )
1368 put_page_from_l4e(pl4e[i], pfn);
1369 }
1371 #endif
1374 /* How to write an entry to the guest pagetables.
1375 * Returns 0 for failure (pointer not valid), 1 for success. */
1376 static inline int update_intpte(intpte_t *p,
1377 intpte_t old,
1378 intpte_t new,
1379 unsigned long mfn,
1380 struct vcpu *v,
1381 int preserve_ad)
1383 int rv = 1;
1384 #ifndef PTE_UPDATE_WITH_CMPXCHG
1385 if ( !preserve_ad )
1387 rv = paging_write_guest_entry(v, p, new, _mfn(mfn));
1389 else
1390 #endif
1392 intpte_t t = old;
1393 for ( ; ; )
1395 intpte_t _new = new;
1396 if ( preserve_ad )
1397 _new |= old & (_PAGE_ACCESSED | _PAGE_DIRTY);
1399 rv = paging_cmpxchg_guest_entry(v, p, &t, _new, _mfn(mfn));
1400 if ( unlikely(rv == 0) )
1402 MEM_LOG("Failed to update %" PRIpte " -> %" PRIpte
1403 ": saw %" PRIpte, old, _new, t);
1404 break;
1407 if ( t == old )
1408 break;
1410 /* Allowed to change in Accessed/Dirty flags only. */
1411 BUG_ON((t ^ old) & ~(intpte_t)(_PAGE_ACCESSED|_PAGE_DIRTY));
1413 old = t;
1416 return rv;
1419 /* Macro that wraps the appropriate type-changes around update_intpte().
1420 * Arguments are: type, ptr, old, new, mfn, vcpu */
1421 #define UPDATE_ENTRY(_t,_p,_o,_n,_m,_v,_ad) \
1422 update_intpte(&_t ## e_get_intpte(*(_p)), \
1423 _t ## e_get_intpte(_o), _t ## e_get_intpte(_n), \
1424 (_m), (_v), (_ad))
1426 /* Update the L1 entry at pl1e to new value nl1e. */
1427 static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e,
1428 unsigned long gl1mfn, int preserve_ad)
1430 l1_pgentry_t ol1e;
1431 struct vcpu *curr = current;
1432 struct domain *d = curr->domain;
1433 unsigned long mfn;
1435 if ( unlikely(__copy_from_user(&ol1e, pl1e, sizeof(ol1e)) != 0) )
1436 return 0;
1438 if ( unlikely(paging_mode_refcounts(d)) )
1439 return UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, curr, preserve_ad);
1441 if ( l1e_get_flags(nl1e) & _PAGE_PRESENT )
1443 /* Translate foreign guest addresses. */
1444 mfn = gmfn_to_mfn(FOREIGNDOM, l1e_get_pfn(nl1e));
1445 if ( unlikely(mfn == INVALID_MFN) )
1446 return 0;
1447 ASSERT((mfn & ~(PADDR_MASK >> PAGE_SHIFT)) == 0);
1448 nl1e = l1e_from_pfn(mfn, l1e_get_flags(nl1e));
1450 if ( unlikely(l1e_get_flags(nl1e) & l1_disallow_mask(d)) )
1452 MEM_LOG("Bad L1 flags %x",
1453 l1e_get_flags(nl1e) & l1_disallow_mask(d));
1454 return 0;
1457 /* Fast path for identical mapping, r/w and presence. */
1458 if ( !l1e_has_changed(ol1e, nl1e, _PAGE_RW | _PAGE_PRESENT) )
1460 adjust_guest_l1e(nl1e, d);
1461 return UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, curr,
1462 preserve_ad);
1465 if ( unlikely(!get_page_from_l1e(nl1e, FOREIGNDOM)) )
1466 return 0;
1468 adjust_guest_l1e(nl1e, d);
1469 if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, curr,
1470 preserve_ad)) )
1472 put_page_from_l1e(nl1e, d);
1473 return 0;
1476 else
1478 if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, curr,
1479 preserve_ad)) )
1480 return 0;
1483 put_page_from_l1e(ol1e, d);
1484 return 1;
1488 /* Update the L2 entry at pl2e to new value nl2e. pl2e is within frame pfn. */
1489 static int mod_l2_entry(l2_pgentry_t *pl2e,
1490 l2_pgentry_t nl2e,
1491 unsigned long pfn,
1492 unsigned long type,
1493 int preserve_ad)
1495 l2_pgentry_t ol2e;
1496 struct vcpu *curr = current;
1497 struct domain *d = curr->domain;
1499 if ( unlikely(!is_guest_l2_slot(d, type, pgentry_ptr_to_slot(pl2e))) )
1501 MEM_LOG("Illegal L2 update attempt in Xen-private area %p", pl2e);
1502 return 0;
1505 if ( unlikely(__copy_from_user(&ol2e, pl2e, sizeof(ol2e)) != 0) )
1506 return 0;
1508 if ( l2e_get_flags(nl2e) & _PAGE_PRESENT )
1510 if ( unlikely(l2e_get_flags(nl2e) & L2_DISALLOW_MASK) )
1512 MEM_LOG("Bad L2 flags %x",
1513 l2e_get_flags(nl2e) & L2_DISALLOW_MASK);
1514 return 0;
1517 /* Fast path for identical mapping and presence. */
1518 if ( !l2e_has_changed(ol2e, nl2e, _PAGE_PRESENT) )
1520 adjust_guest_l2e(nl2e, d);
1521 return UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, curr, preserve_ad);
1524 if ( unlikely(!get_page_from_l2e(nl2e, pfn, d)) )
1525 return 0;
1527 adjust_guest_l2e(nl2e, d);
1528 if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, curr,
1529 preserve_ad)) )
1531 put_page_from_l2e(nl2e, pfn);
1532 return 0;
1535 else if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, curr,
1536 preserve_ad)) )
1538 return 0;
1541 put_page_from_l2e(ol2e, pfn);
1542 return 1;
1545 #if CONFIG_PAGING_LEVELS >= 3
1547 /* Update the L3 entry at pl3e to new value nl3e. pl3e is within frame pfn. */
1548 static int mod_l3_entry(l3_pgentry_t *pl3e,
1549 l3_pgentry_t nl3e,
1550 unsigned long pfn,
1551 int preserve_ad)
1553 l3_pgentry_t ol3e;
1554 struct vcpu *curr = current;
1555 struct domain *d = curr->domain;
1556 int okay;
1558 if ( unlikely(!is_guest_l3_slot(pgentry_ptr_to_slot(pl3e))) )
1560 MEM_LOG("Illegal L3 update attempt in Xen-private area %p", pl3e);
1561 return 0;
1564 #if defined(CONFIG_X86_PAE) || defined(CONFIG_COMPAT)
1565 /*
1566 * Disallow updates to final L3 slot. It contains Xen mappings, and it
1567 * would be a pain to ensure they remain continuously valid throughout.
1568 */
1569 if ( is_pv_32bit_domain(d) && (pgentry_ptr_to_slot(pl3e) >= 3) )
1570 return 0;
1571 #endif
1573 if ( unlikely(__copy_from_user(&ol3e, pl3e, sizeof(ol3e)) != 0) )
1574 return 0;
1576 if ( l3e_get_flags(nl3e) & _PAGE_PRESENT )
1578 if ( unlikely(l3e_get_flags(nl3e) & l3_disallow_mask(d)) )
1580 MEM_LOG("Bad L3 flags %x",
1581 l3e_get_flags(nl3e) & l3_disallow_mask(d));
1582 return 0;
1585 /* Fast path for identical mapping and presence. */
1586 if ( !l3e_has_changed(ol3e, nl3e, _PAGE_PRESENT) )
1588 adjust_guest_l3e(nl3e, d);
1589 return UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, curr, preserve_ad);
1592 if ( unlikely(!get_page_from_l3e(nl3e, pfn, d)) )
1593 return 0;
1595 adjust_guest_l3e(nl3e, d);
1596 if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, curr,
1597 preserve_ad)) )
1599 put_page_from_l3e(nl3e, pfn);
1600 return 0;
1603 else if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, curr,
1604 preserve_ad)) )
1606 return 0;
1609 okay = create_pae_xen_mappings(d, pl3e);
1610 BUG_ON(!okay);
1612 pae_flush_pgd(pfn, pgentry_ptr_to_slot(pl3e), nl3e);
1614 put_page_from_l3e(ol3e, pfn);
1615 return 1;
1618 #endif
1620 #if CONFIG_PAGING_LEVELS >= 4
1622 /* Update the L4 entry at pl4e to new value nl4e. pl4e is within frame pfn. */
1623 static int mod_l4_entry(l4_pgentry_t *pl4e,
1624 l4_pgentry_t nl4e,
1625 unsigned long pfn,
1626 int preserve_ad)
1628 struct vcpu *curr = current;
1629 struct domain *d = curr->domain;
1630 l4_pgentry_t ol4e;
1632 if ( unlikely(!is_guest_l4_slot(d, pgentry_ptr_to_slot(pl4e))) )
1634 MEM_LOG("Illegal L4 update attempt in Xen-private area %p", pl4e);
1635 return 0;
1638 if ( unlikely(__copy_from_user(&ol4e, pl4e, sizeof(ol4e)) != 0) )
1639 return 0;
1641 if ( l4e_get_flags(nl4e) & _PAGE_PRESENT )
1643 if ( unlikely(l4e_get_flags(nl4e) & L4_DISALLOW_MASK) )
1645 MEM_LOG("Bad L4 flags %x",
1646 l4e_get_flags(nl4e) & L4_DISALLOW_MASK);
1647 return 0;
1650 /* Fast path for identical mapping and presence. */
1651 if ( !l4e_has_changed(ol4e, nl4e, _PAGE_PRESENT) )
1653 adjust_guest_l4e(nl4e, d);
1654 return UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, curr, preserve_ad);
1657 if ( unlikely(!get_page_from_l4e(nl4e, pfn, d)) )
1658 return 0;
1660 adjust_guest_l4e(nl4e, d);
1661 if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, curr,
1662 preserve_ad)) )
1664 put_page_from_l4e(nl4e, pfn);
1665 return 0;
1668 else if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, curr,
1669 preserve_ad)) )
1671 return 0;
1674 put_page_from_l4e(ol4e, pfn);
1675 return 1;
1678 #endif
1680 void put_page(struct page_info *page)
1681 {
1682 u32 nx, x, y = page->count_info;
1684 do {
1685 x = y;
1686 nx = x - 1;
1687 }
1688 while ( unlikely((y = cmpxchg(&page->count_info, x, nx)) != x) );
1690 if ( unlikely((nx & PGC_count_mask) == 0) )
1691 {
1692 cleanup_page_cacheattr(page);
1693 free_domheap_page(page);
1694 }
1695 }
1698 int get_page(struct page_info *page, struct domain *domain)
1699 {
1700 u32 x, nx, y = page->count_info;
1701 u32 d, nd = page->u.inuse._domain;
1702 u32 _domain = pickle_domptr(domain);
1704 do {
1705 x = y;
1706 nx = x + 1;
1707 d = nd;
1708 if ( unlikely((x & PGC_count_mask) == 0) || /* Not allocated? */
1709 unlikely((nx & PGC_count_mask) == 0) || /* Count overflow? */
1710 unlikely(d != _domain) ) /* Wrong owner? */
1711 {
1712 if ( !_shadow_mode_refcounts(domain) && !domain->is_dying )
1713 gdprintk(XENLOG_INFO,
1714 "Error pfn %lx: rd=%p, od=%p, caf=%08x, taf=%"
1715 PRtype_info "\n",
1716 page_to_mfn(page), domain, unpickle_domptr(d),
1717 x, page->u.inuse.type_info);
1718 return 0;
1719 }
1720 asm volatile (
1721 LOCK_PREFIX "cmpxchg8b %3"
1722 : "=d" (nd), "=a" (y), "=c" (d),
1723 "=m" (*(volatile u64 *)(&page->count_info))
1724 : "0" (d), "1" (x), "c" (d), "b" (nx) );
1725 }
1726 while ( unlikely(nd != d) || unlikely(y != x) );
1728 return 1;
1729 }
1732 static int alloc_page_type(struct page_info *page, unsigned long type)
1733 {
1734 struct domain *owner = page_get_owner(page);
1736 /* A page table is dirtied when its type count becomes non-zero. */
1737 if ( likely(owner != NULL) )
1738 paging_mark_dirty(owner, page_to_mfn(page));
1740 switch ( type & PGT_type_mask )
1741 {
1742 case PGT_l1_page_table:
1743 return alloc_l1_table(page);
1744 case PGT_l2_page_table:
1745 return alloc_l2_table(page, type);
1746 case PGT_l3_page_table:
1747 return alloc_l3_table(page);
1748 case PGT_l4_page_table:
1749 return alloc_l4_table(page);
1750 case PGT_gdt_page:
1751 case PGT_ldt_page:
1752 return alloc_segdesc_page(page);
1753 default:
1754 printk("Bad type in alloc_page_type %lx t=%" PRtype_info " c=%x\n",
1755 type, page->u.inuse.type_info,
1756 page->count_info);
1757 BUG();
1758 }
1760 return 0;
1761 }
1764 void free_page_type(struct page_info *page, unsigned long type)
1765 {
1766 struct domain *owner = page_get_owner(page);
1767 unsigned long gmfn;
1769 if ( likely(owner != NULL) )
1770 {
1771 /*
1772 * We have to flush before the next use of the linear mapping
1773 * (e.g., update_va_mapping()) or we could end up modifying a page
1774 * that is no longer a page table (and hence screw up ref counts).
1775 */
1776 if ( current->domain == owner )
1777 queue_deferred_ops(owner, DOP_FLUSH_ALL_TLBS);
1778 else
1779 flush_tlb_mask(owner->domain_dirty_cpumask);
1781 if ( unlikely(paging_mode_enabled(owner)) )
1782 {
1783 /* A page table is dirtied when its type count becomes zero. */
1784 paging_mark_dirty(owner, page_to_mfn(page));
1786 if ( shadow_mode_refcounts(owner) )
1787 return;
1789 gmfn = mfn_to_gmfn(owner, page_to_mfn(page));
1790 ASSERT(VALID_M2P(gmfn));
1791 shadow_remove_all_shadows(owner->vcpu[0], _mfn(gmfn));
1792 }
1793 }
1795 switch ( type & PGT_type_mask )
1796 {
1797 case PGT_l1_page_table:
1798 free_l1_table(page);
1799 break;
1801 case PGT_l2_page_table:
1802 free_l2_table(page);
1803 break;
1805 #if CONFIG_PAGING_LEVELS >= 3
1806 case PGT_l3_page_table:
1807 free_l3_table(page);
1808 break;
1809 #endif
1811 #if CONFIG_PAGING_LEVELS >= 4
1812 case PGT_l4_page_table:
1813 free_l4_table(page);
1814 break;
1815 #endif
1817 default:
1818 printk("%s: type %lx pfn %lx\n",__FUNCTION__,
1819 type, page_to_mfn(page));
1820 BUG();
1821 }
1822 }
1825 void put_page_type(struct page_info *page)
1826 {
1827 unsigned long nx, x, y = page->u.inuse.type_info;
1829 again:
1830 do {
1831 x = y;
1832 nx = x - 1;
1834 ASSERT((x & PGT_count_mask) != 0);
1836 if ( unlikely((nx & PGT_count_mask) == 0) )
1837 {
1838 if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
1839 likely(nx & PGT_validated) )
1840 {
1841 /*
1842 * Page-table pages must be unvalidated when count is zero. The
1843 * 'free' is safe because the refcnt is non-zero and validated
1844 * bit is clear => other ops will spin or fail.
1845 */
1846 if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x,
1847 x & ~PGT_validated)) != x) )
1848 goto again;
1849 /* We cleared the 'valid bit' so we do the clean up. */
1850 free_page_type(page, x);
1851 /* Carry on, but with the 'valid bit' now clear. */
1852 x &= ~PGT_validated;
1853 nx &= ~PGT_validated;
1854 }
1856 /*
1857 * Record TLB information for flush later. We do not stamp page
1858 * tables when running in shadow mode:
1859 * 1. Pointless, since it's the shadow pt's which must be tracked.
1860 * 2. Shadow mode reuses this field for shadowed page tables to
1861 * store flags info -- we don't want to conflict with that.
1862 */
1863 if ( !(shadow_mode_enabled(page_get_owner(page)) &&
1864 (page->count_info & PGC_page_table)) )
1865 page->tlbflush_timestamp = tlbflush_current_time();
1866 }
1867 }
1868 while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
1869 }
1872 int get_page_type(struct page_info *page, unsigned long type)
1874 unsigned long nx, x, y = page->u.inuse.type_info;
1876 ASSERT(!(type & ~(PGT_type_mask | PGT_pae_xen_l2)));
1878 again:
1879 do {
1880 x = y;
1881 nx = x + 1;
1882 if ( unlikely((nx & PGT_count_mask) == 0) )
1884 MEM_LOG("Type count overflow on pfn %lx", page_to_mfn(page));
1885 return 0;
1887 else if ( unlikely((x & PGT_count_mask) == 0) )
1889 struct domain *d = page_get_owner(page);
1891 /* Never allow a shadowed frame to go from type count 0 to 1 */
1892 if ( d && shadow_mode_enabled(d) )
1893 shadow_remove_all_shadows(d->vcpu[0], _mfn(page_to_mfn(page)));
1895 ASSERT(!(x & PGT_pae_xen_l2));
1896 if ( (x & PGT_type_mask) != type )
1898 /*
1899 * On type change we check to flush stale TLB entries. This
1900 * may be unnecessary (e.g., page was GDT/LDT) but those
1901 * circumstances should be very rare.
1902 */
1903 cpumask_t mask = d->domain_dirty_cpumask;
1905 /* Don't flush if the timestamp is old enough */
1906 tlbflush_filter(mask, page->tlbflush_timestamp);
1908 if ( unlikely(!cpus_empty(mask)) &&
1909 /* Shadow mode: track only writable pages. */
1910 (!shadow_mode_enabled(page_get_owner(page)) ||
1911 ((nx & PGT_type_mask) == PGT_writable_page)) )
1913 perfc_incr(need_flush_tlb_flush);
1914 flush_tlb_mask(mask);
1917 /* We lose existing type and validity. */
1918 nx &= ~(PGT_type_mask | PGT_validated);
1919 nx |= type;
1921 /* No special validation needed for writable pages. */
1922 /* Page tables and GDT/LDT need to be scanned for validity. */
1923 if ( type == PGT_writable_page )
1924 nx |= PGT_validated;
1927 else if ( unlikely((x & (PGT_type_mask|PGT_pae_xen_l2)) != type) )
1929 /* Don't log failure if it could be a recursive-mapping attempt. */
1930 if ( ((x & PGT_type_mask) == PGT_l2_page_table) &&
1931 (type == PGT_l1_page_table) )
1932 return 0;
1933 if ( ((x & PGT_type_mask) == PGT_l3_page_table) &&
1934 (type == PGT_l2_page_table) )
1935 return 0;
1936 if ( ((x & PGT_type_mask) == PGT_l4_page_table) &&
1937 (type == PGT_l3_page_table) )
1938 return 0;
1939 MEM_LOG("Bad type (saw %" PRtype_info " != exp %" PRtype_info ") "
1940 "for mfn %lx (pfn %lx)",
1941 x, type, page_to_mfn(page),
1942 get_gpfn_from_mfn(page_to_mfn(page)));
1943 return 0;
1945 else if ( unlikely(!(x & PGT_validated)) )
1947 /* Someone else is updating validation of this page. Wait... */
1948 while ( (y = page->u.inuse.type_info) == x )
1949 cpu_relax();
1950 goto again;
1953 while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
1955 if ( unlikely(!(nx & PGT_validated)) )
1957 /* Try to validate page type; drop the new reference on failure. */
1958 if ( unlikely(!alloc_page_type(page, type)) )
1960 MEM_LOG("Error while validating mfn %lx (pfn %lx) for type %"
1961 PRtype_info ": caf=%08x taf=%" PRtype_info,
1962 page_to_mfn(page), get_gpfn_from_mfn(page_to_mfn(page)),
1963 type, page->count_info, page->u.inuse.type_info);
1964 /* No one else can get a reference. We hold the only ref. */
1965 page->u.inuse.type_info = 0;
1966 return 0;
1969 /* No one else is updating simultaneously. */
1970 __set_bit(_PGT_validated, &page->u.inuse.type_info);
1973 return 1;
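/*
 * A minimal standalone sketch of the lock-free protocol used by
 * get_page_type() above: the reference count lives in the low bits of
 * type_info, the type in the high bits, and a transition only commits when a
 * compare-and-swap on the whole word succeeds.  The DEMO_ masks are
 * simplified stand-ins, not the real PGT_ values, and the validation and TLB
 * handling of the real function are omitted.
 */
#include <stdatomic.h>

#define DEMO_COUNT_MASK 0x0000fffful   /* stand-in for PGT_count_mask */
#define DEMO_TYPE_MASK  0xffff0000ul   /* stand-in for PGT_type_mask  */

static int demo_get_type(_Atomic unsigned long *type_info, unsigned long type)
{
    unsigned long x, nx, y = atomic_load(type_info);

    do {
        x  = y;
        nx = x + 1;
        if ( (nx & DEMO_COUNT_MASK) == 0 )
            return 0;                                /* count overflow     */
        if ( (x & DEMO_COUNT_MASK) == 0 )
            nx = (nx & ~DEMO_TYPE_MASK) | type;      /* first user sets it */
        else if ( (x & DEMO_TYPE_MASK) != type )
            return 0;                                /* conflicting type   */
    } while ( !atomic_compare_exchange_weak(type_info, &y, nx) );

    return 1;
}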
1977 void cleanup_page_cacheattr(struct page_info *page)
1979 uint32_t cacheattr = (page->count_info >> PGC_cacheattr_base) & 7;
1981 if ( likely(cacheattr == 0) )
1982 return;
1984 page->count_info &= ~PGC_cacheattr_mask;
1986 BUG_ON(is_xen_heap_page(page));
1988 #ifdef __x86_64__
1989 map_pages_to_xen((unsigned long)page_to_virt(page), page_to_mfn(page),
1990 1, PAGE_HYPERVISOR);
1991 #endif
1995 int new_guest_cr3(unsigned long mfn)
1997 struct vcpu *v = current;
1998 struct domain *d = v->domain;
1999 int okay;
2000 unsigned long old_base_mfn;
2002 #ifdef CONFIG_COMPAT
2003 if ( is_pv_32on64_domain(d) )
2005 okay = paging_mode_refcounts(d)
2006 ? 0 /* Old code was broken, but what should it be? */
2007 : mod_l4_entry(
2008 __va(pagetable_get_paddr(v->arch.guest_table)),
2009 l4e_from_pfn(
2010 mfn,
2011 (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_ACCESSED)),
2012 pagetable_get_pfn(v->arch.guest_table), 0);
2013 if ( unlikely(!okay) )
2015 MEM_LOG("Error while installing new compat baseptr %lx", mfn);
2016 return 0;
2019 invalidate_shadow_ldt(v);
2020 write_ptbase(v);
2022 return 1;
2024 #endif
2025 okay = paging_mode_refcounts(d)
2026 ? get_page_from_pagenr(mfn, d)
2027 : get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d);
2028 if ( unlikely(!okay) )
2030 MEM_LOG("Error while installing new baseptr %lx", mfn);
2031 return 0;
2034 invalidate_shadow_ldt(v);
2036 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
2038 v->arch.guest_table = pagetable_from_pfn(mfn);
2039 update_cr3(v);
2041 write_ptbase(v);
2043 if ( likely(old_base_mfn != 0) )
2045 if ( paging_mode_refcounts(d) )
2046 put_page(mfn_to_page(old_base_mfn));
2047 else
2048 put_page_and_type(mfn_to_page(old_base_mfn));
2051 return 1;
2054 static void process_deferred_ops(void)
2056 unsigned int deferred_ops;
2057 struct domain *d = current->domain;
2058 struct percpu_mm_info *info = &this_cpu(percpu_mm_info);
2060 deferred_ops = info->deferred_ops;
2061 info->deferred_ops = 0;
2063 if ( deferred_ops & (DOP_FLUSH_ALL_TLBS|DOP_FLUSH_TLB) )
2065 if ( deferred_ops & DOP_FLUSH_ALL_TLBS )
2066 flush_tlb_mask(d->domain_dirty_cpumask);
2067 else
2068 flush_tlb_local();
2071 if ( deferred_ops & DOP_RELOAD_LDT )
2072 (void)map_ldt_shadow_page(0);
2074 if ( unlikely(info->foreign != NULL) )
2076 rcu_unlock_domain(info->foreign);
2077 info->foreign = NULL;
2081 static int set_foreigndom(domid_t domid)
2083 struct domain *e, *d = current->domain;
2084 struct percpu_mm_info *info = &this_cpu(percpu_mm_info);
2085 int okay = 1;
2087 ASSERT(info->foreign == NULL);
2089 if ( likely(domid == DOMID_SELF) )
2090 goto out;
2092 if ( unlikely(domid == d->domain_id) )
2094 MEM_LOG("Cannot specify itself as foreign domain");
2095 okay = 0;
2097 else if ( unlikely(paging_mode_translate(d)) )
2099 MEM_LOG("Cannot mix foreign mappings with translated domains");
2100 okay = 0;
2102 else switch ( domid )
2104 case DOMID_IO:
2105 info->foreign = rcu_lock_domain(dom_io);
2106 break;
2107 case DOMID_XEN:
2108 if (!IS_PRIV(d)) {
2109 MEM_LOG("Cannot set foreign dom");
2110 okay = 0;
2111 break;
2113 info->foreign = rcu_lock_domain(dom_xen);
2114 break;
2115 default:
2116 e = rcu_lock_domain_by_id(domid);
2117 if ( e == NULL )
2119 MEM_LOG("Unknown domain '%u'", domid);
2120 okay = 0;
2121 break;
2123 if (!IS_PRIV_FOR(d, e)) {
2124 MEM_LOG("Cannot set foreign dom");
2125 okay = 0;
2126 rcu_unlock_domain(e);
2127 break;
2129 info->foreign = e;
2130 break;
2133 out:
2134 return okay;
2137 static inline cpumask_t vcpumask_to_pcpumask(
2138 struct domain *d, unsigned long vmask)
2140 unsigned int vcpu_id;
2141 cpumask_t pmask = CPU_MASK_NONE;
2142 struct vcpu *v;
2144 while ( vmask != 0 )
2146 vcpu_id = find_first_set_bit(vmask);
2147 vmask &= ~(1UL << vcpu_id);
2148 if ( (vcpu_id < MAX_VIRT_CPUS) &&
2149 ((v = d->vcpu[vcpu_id]) != NULL) )
2150 cpus_or(pmask, pmask, v->vcpu_dirty_cpumask);
2153 return pmask;
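/*
 * A standalone model of vcpumask_to_pcpumask() above: walk the set bits of a
 * guest-supplied VCPU bitmask and OR together the dirty physical-CPU masks
 * of the corresponding VCPUs.  Plain unsigned longs stand in for cpumask_t
 * and __builtin_ctzl stands in for find_first_set_bit(); DEMO_MAX_VCPUS is
 * an arbitrary illustrative limit.
 */
#define DEMO_MAX_VCPUS 32

static unsigned long demo_vcpumask_to_pcpumask(
    unsigned long vmask, const unsigned long vcpu_dirty[DEMO_MAX_VCPUS])
{
    unsigned long pmask = 0;

    while ( vmask != 0 )
    {
        unsigned int vcpu_id = __builtin_ctzl(vmask);
        vmask &= ~(1UL << vcpu_id);
        if ( vcpu_id < DEMO_MAX_VCPUS )
            pmask |= vcpu_dirty[vcpu_id];
    }
    return pmask;
}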
2156 int do_mmuext_op(
2157 XEN_GUEST_HANDLE(mmuext_op_t) uops,
2158 unsigned int count,
2159 XEN_GUEST_HANDLE(uint) pdone,
2160 unsigned int foreigndom)
2162 struct mmuext_op op;
2163 int rc = 0, i = 0, okay;
2164 unsigned long mfn = 0, gmfn = 0, type;
2165 unsigned int done = 0;
2166 struct page_info *page;
2167 struct vcpu *v = current;
2168 struct domain *d = v->domain;
2170 if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
2172 count &= ~MMU_UPDATE_PREEMPTED;
2173 if ( unlikely(!guest_handle_is_null(pdone)) )
2174 (void)copy_from_guest(&done, pdone, 1);
2176 else
2177 perfc_incr(calls_to_mmuext_op);
2179 if ( unlikely(!guest_handle_okay(uops, count)) )
2181 rc = -EFAULT;
2182 goto out;
2185 if ( !set_foreigndom(foreigndom) )
2187 rc = -ESRCH;
2188 goto out;
2191 LOCK_BIGLOCK(d);
2193 for ( i = 0; i < count; i++ )
2195 if ( hypercall_preempt_check() )
2197 rc = hypercall_create_continuation(
2198 __HYPERVISOR_mmuext_op, "hihi",
2199 uops, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
2200 break;
2203 if ( unlikely(__copy_from_guest(&op, uops, 1) != 0) )
2205 MEM_LOG("Bad __copy_from_guest");
2206 rc = -EFAULT;
2207 break;
2210 okay = 1;
2211 gmfn = op.arg1.mfn;
2212 mfn = gmfn_to_mfn(FOREIGNDOM, gmfn);
2213 page = mfn_to_page(mfn);
2215 switch ( op.cmd )
2217 case MMUEXT_PIN_L1_TABLE:
2218 type = PGT_l1_page_table;
2219 goto pin_page;
2221 case MMUEXT_PIN_L2_TABLE:
2222 type = PGT_l2_page_table;
2223 goto pin_page;
2225 case MMUEXT_PIN_L3_TABLE:
2226 type = PGT_l3_page_table;
2227 goto pin_page;
2229 case MMUEXT_PIN_L4_TABLE:
2230 if ( is_pv_32bit_domain(FOREIGNDOM) )
2231 break;
2232 type = PGT_l4_page_table;
2234 pin_page:
2235 rc = xsm_memory_pin_page(d, page);
2236 if ( rc )
2237 break;
2239 /* Ignore pinning of invalid paging levels. */
2240 if ( (op.cmd - MMUEXT_PIN_L1_TABLE) > (CONFIG_PAGING_LEVELS - 1) )
2241 break;
2243 if ( paging_mode_refcounts(FOREIGNDOM) )
2244 break;
2246 okay = get_page_and_type_from_pagenr(mfn, type, FOREIGNDOM);
2247 if ( unlikely(!okay) )
2249 MEM_LOG("Error while pinning mfn %lx", mfn);
2250 break;
2253 if ( unlikely(test_and_set_bit(_PGT_pinned,
2254 &page->u.inuse.type_info)) )
2256 MEM_LOG("Mfn %lx already pinned", mfn);
2257 put_page_and_type(page);
2258 okay = 0;
2259 break;
2262 /* A page is dirtied when its pin status is set. */
2263 paging_mark_dirty(d, mfn);
2265 /* We can race domain destruction (domain_relinquish_resources). */
2266 if ( unlikely(this_cpu(percpu_mm_info).foreign != NULL) )
2268 int drop_ref;
2269 spin_lock(&FOREIGNDOM->page_alloc_lock);
2270 drop_ref = (FOREIGNDOM->is_dying &&
2271 test_and_clear_bit(_PGT_pinned,
2272 &page->u.inuse.type_info));
2273 spin_unlock(&FOREIGNDOM->page_alloc_lock);
2274 if ( drop_ref )
2275 put_page_and_type(page);
2278 break;
2280 case MMUEXT_UNPIN_TABLE:
2281 if ( paging_mode_refcounts(d) )
2282 break;
2284 if ( unlikely(!(okay = get_page_from_pagenr(mfn, d))) )
2286 MEM_LOG("Mfn %lx bad domain (dom=%p)",
2287 mfn, page_get_owner(page));
2289 else if ( likely(test_and_clear_bit(_PGT_pinned,
2290 &page->u.inuse.type_info)) )
2292 put_page_and_type(page);
2293 put_page(page);
2294 /* A page is dirtied when its pin status is cleared. */
2295 paging_mark_dirty(d, mfn);
2297 else
2299 okay = 0;
2300 put_page(page);
2301 MEM_LOG("Mfn %lx not pinned", mfn);
2303 break;
2305 case MMUEXT_NEW_BASEPTR:
2306 okay = new_guest_cr3(mfn);
2307 this_cpu(percpu_mm_info).deferred_ops &= ~DOP_FLUSH_TLB;
2308 break;
2310 #ifdef __x86_64__
2311 case MMUEXT_NEW_USER_BASEPTR: {
2312 unsigned long old_mfn;
2314 if ( mfn != 0 )
2316 if ( paging_mode_refcounts(d) )
2317 okay = get_page_from_pagenr(mfn, d);
2318 else
2319 okay = get_page_and_type_from_pagenr(
2320 mfn, PGT_root_page_table, d);
2321 if ( unlikely(!okay) )
2323 MEM_LOG("Error while installing new mfn %lx", mfn);
2324 break;
2328 old_mfn = pagetable_get_pfn(v->arch.guest_table_user);
2329 v->arch.guest_table_user = pagetable_from_pfn(mfn);
2331 if ( old_mfn != 0 )
2333 if ( paging_mode_refcounts(d) )
2334 put_page(mfn_to_page(old_mfn));
2335 else
2336 put_page_and_type(mfn_to_page(old_mfn));
2339 break;
2341 #endif
2343 case MMUEXT_TLB_FLUSH_LOCAL:
2344 this_cpu(percpu_mm_info).deferred_ops |= DOP_FLUSH_TLB;
2345 break;
2347 case MMUEXT_INVLPG_LOCAL:
2348 if ( !paging_mode_enabled(d)
2349 || paging_invlpg(v, op.arg1.linear_addr) != 0 )
2350 flush_tlb_one_local(op.arg1.linear_addr);
2351 break;
2353 case MMUEXT_TLB_FLUSH_MULTI:
2354 case MMUEXT_INVLPG_MULTI:
2356 unsigned long vmask;
2357 cpumask_t pmask;
2358 if ( unlikely(copy_from_guest(&vmask, op.arg2.vcpumask, 1)) )
2360 okay = 0;
2361 break;
2363 pmask = vcpumask_to_pcpumask(d, vmask);
2364 if ( op.cmd == MMUEXT_TLB_FLUSH_MULTI )
2365 flush_tlb_mask(pmask);
2366 else
2367 flush_tlb_one_mask(pmask, op.arg1.linear_addr);
2368 break;
2371 case MMUEXT_TLB_FLUSH_ALL:
2372 flush_tlb_mask(d->domain_dirty_cpumask);
2373 break;
2375 case MMUEXT_INVLPG_ALL:
2376 flush_tlb_one_mask(d->domain_dirty_cpumask, op.arg1.linear_addr);
2377 break;
2379 case MMUEXT_FLUSH_CACHE:
2380 if ( unlikely(!cache_flush_permitted(d)) )
2382 MEM_LOG("Non-physdev domain tried to FLUSH_CACHE.");
2383 okay = 0;
2385 else
2387 wbinvd();
2389 break;
2391 case MMUEXT_SET_LDT:
2393 unsigned long ptr = op.arg1.linear_addr;
2394 unsigned long ents = op.arg2.nr_ents;
2396 if ( paging_mode_external(d) )
2398 MEM_LOG("ignoring SET_LDT hypercall from external domain");
2399 okay = 0;
2401 else if ( ((ptr & (PAGE_SIZE-1)) != 0) ||
2402 (ents > 8192) ||
2403 !array_access_ok(ptr, ents, LDT_ENTRY_SIZE) )
2405 okay = 0;
2406 MEM_LOG("Bad args to SET_LDT: ptr=%lx, ents=%lx", ptr, ents);
2408 else if ( (v->arch.guest_context.ldt_ents != ents) ||
2409 (v->arch.guest_context.ldt_base != ptr) )
2411 invalidate_shadow_ldt(v);
2412 v->arch.guest_context.ldt_base = ptr;
2413 v->arch.guest_context.ldt_ents = ents;
2414 load_LDT(v);
2415 this_cpu(percpu_mm_info).deferred_ops &= ~DOP_RELOAD_LDT;
2416 if ( ents != 0 )
2417 this_cpu(percpu_mm_info).deferred_ops |= DOP_RELOAD_LDT;
2419 break;
2422 default:
2423 MEM_LOG("Invalid extended pt command 0x%x", op.cmd);
2424 rc = -ENOSYS;
2425 okay = 0;
2426 break;
2429 if ( unlikely(!okay) )
2431 rc = rc ? rc : -EINVAL;
2432 break;
2435 guest_handle_add_offset(uops, 1);
2438 process_deferred_ops();
2440 UNLOCK_BIGLOCK(d);
2442 perfc_add(num_mmuext_ops, i);
2444 out:
2445 /* Add incremental work we have done to the @done output parameter. */
2446 if ( unlikely(!guest_handle_is_null(pdone)) )
2448 done += i;
2449 copy_to_guest(pdone, &done, 1);
2452 return rc;
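/*
 * A guest-side sketch of driving do_mmuext_op() above: pin a new top-level
 * page table and then load it as the base pointer in one batched call.  The
 * struct and command values are simplified stand-ins for the public
 * mmuext_op interface, and demo_mmuext_op() is a hypothetical stub standing
 * in for the guest's hypercall wrapper.
 */
#include <stdio.h>

struct demo_mmuext_op {
    unsigned int  cmd;
    unsigned long mfn;                 /* stand-in for arg1.mfn */
};

#define DEMO_PIN_L4_TABLE 0            /* stand-in for MMUEXT_PIN_L4_TABLE */
#define DEMO_NEW_BASEPTR  1            /* stand-in for MMUEXT_NEW_BASEPTR  */

/* Hypothetical wrapper; a real guest would trap to the hypervisor here. */
static int demo_mmuext_op(const struct demo_mmuext_op *ops, unsigned int count,
                          unsigned int *done)
{
    for ( unsigned int i = 0; i < count; i++ )
        printf("mmuext op %u: cmd=%u mfn=%lx\n", i, ops[i].cmd, ops[i].mfn);
    if ( done != NULL )
        *done = count;
    return 0;
}

static int demo_switch_pagetable(unsigned long new_root_mfn)
{
    struct demo_mmuext_op ops[2] = {
        { DEMO_PIN_L4_TABLE, new_root_mfn },   /* validate and pin the root */
        { DEMO_NEW_BASEPTR,  new_root_mfn },   /* then make it the new CR3  */
    };
    unsigned int done = 0;

    return demo_mmuext_op(ops, 2, &done);
}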
2455 int do_mmu_update(
2456 XEN_GUEST_HANDLE(mmu_update_t) ureqs,
2457 unsigned int count,
2458 XEN_GUEST_HANDLE(uint) pdone,
2459 unsigned int foreigndom)
2461 struct mmu_update req;
2462 void *va;
2463 unsigned long gpfn, gmfn, mfn;
2464 struct page_info *page;
2465 int rc = 0, okay = 1, i = 0;
2466 unsigned int cmd, done = 0;
2467 struct vcpu *v = current;
2468 struct domain *d = v->domain;
2469 unsigned long type_info;
2470 struct domain_mmap_cache mapcache;
2472 if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
2474 count &= ~MMU_UPDATE_PREEMPTED;
2475 if ( unlikely(!guest_handle_is_null(pdone)) )
2476 (void)copy_from_guest(&done, pdone, 1);
2478 else
2479 perfc_incr(calls_to_mmu_update);
2481 if ( unlikely(!guest_handle_okay(ureqs, count)) )
2483 rc = -EFAULT;
2484 goto out;
2487 if ( !set_foreigndom(foreigndom) )
2489 rc = -ESRCH;
2490 goto out;
2493 domain_mmap_cache_init(&mapcache);
2495 LOCK_BIGLOCK(d);
2497 for ( i = 0; i < count; i++ )
2499 if ( hypercall_preempt_check() )
2501 rc = hypercall_create_continuation(
2502 __HYPERVISOR_mmu_update, "hihi",
2503 ureqs, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
2504 break;
2507 if ( unlikely(__copy_from_guest(&req, ureqs, 1) != 0) )
2509 MEM_LOG("Bad __copy_from_guest");
2510 rc = -EFAULT;
2511 break;
2514 cmd = req.ptr & (sizeof(l1_pgentry_t)-1);
2515 okay = 0;
2517 switch ( cmd )
2519 /*
2520 * MMU_NORMAL_PT_UPDATE: Normal update to any level of page table.
2521 * MMU_PT_UPDATE_PRESERVE_AD: As above but also preserve (OR)
2522 * current A/D bits.
2523 */
2524 case MMU_NORMAL_PT_UPDATE:
2525 case MMU_PT_UPDATE_PRESERVE_AD:
2526 rc = xsm_mmu_normal_update(d, req.val);
2527 if ( rc )
2528 break;
2530 req.ptr -= cmd;
2531 gmfn = req.ptr >> PAGE_SHIFT;
2532 mfn = gmfn_to_mfn(d, gmfn);
2534 if ( unlikely(!get_page_from_pagenr(mfn, d)) )
2536 MEM_LOG("Could not get page for normal update");
2537 break;
2540 va = map_domain_page_with_cache(mfn, &mapcache);
2541 va = (void *)((unsigned long)va +
2542 (unsigned long)(req.ptr & ~PAGE_MASK));
2543 page = mfn_to_page(mfn);
2545 switch ( (type_info = page->u.inuse.type_info) & PGT_type_mask )
2547 case PGT_l1_page_table:
2548 case PGT_l2_page_table:
2549 case PGT_l3_page_table:
2550 case PGT_l4_page_table:
2552 if ( paging_mode_refcounts(d) )
2554 MEM_LOG("mmu update on auto-refcounted domain!");
2555 break;
2558 if ( unlikely(!get_page_type(
2559 page, type_info & (PGT_type_mask|PGT_pae_xen_l2))) )
2560 goto not_a_pt;
2562 switch ( type_info & PGT_type_mask )
2564 case PGT_l1_page_table:
2566 l1_pgentry_t l1e = l1e_from_intpte(req.val);
2567 okay = mod_l1_entry(va, l1e, mfn,
2568 cmd == MMU_PT_UPDATE_PRESERVE_AD);
2570 break;
2571 case PGT_l2_page_table:
2573 l2_pgentry_t l2e = l2e_from_intpte(req.val);
2574 okay = mod_l2_entry(va, l2e, mfn, type_info,
2575 cmd == MMU_PT_UPDATE_PRESERVE_AD);
2577 break;
2578 #if CONFIG_PAGING_LEVELS >= 3
2579 case PGT_l3_page_table:
2581 l3_pgentry_t l3e = l3e_from_intpte(req.val);
2582 okay = mod_l3_entry(va, l3e, mfn,
2583 cmd == MMU_PT_UPDATE_PRESERVE_AD);
2585 break;
2586 #endif
2587 #if CONFIG_PAGING_LEVELS >= 4
2588 case PGT_l4_page_table:
2590 l4_pgentry_t l4e = l4e_from_intpte(req.val);
2591 okay = mod_l4_entry(va, l4e, mfn,
2592 cmd == MMU_PT_UPDATE_PRESERVE_AD);
2594 break;
2595 #endif
2598 put_page_type(page);
2600 break;
2602 default:
2603 not_a_pt:
2605 if ( unlikely(!get_page_type(page, PGT_writable_page)) )
2606 break;
2608 okay = paging_write_guest_entry(v, va, req.val, _mfn(mfn));
2610 put_page_type(page);
2612 break;
2615 unmap_domain_page_with_cache(va, &mapcache);
2617 put_page(page);
2618 break;
2620 case MMU_MACHPHYS_UPDATE:
2622 mfn = req.ptr >> PAGE_SHIFT;
2623 gpfn = req.val;
2625 rc = xsm_mmu_machphys_update(d, mfn);
2626 if ( rc )
2627 break;
2629 if ( unlikely(!get_page_from_pagenr(mfn, FOREIGNDOM)) )
2631 MEM_LOG("Could not get page for mach->phys update");
2632 break;
2635 if ( unlikely(paging_mode_translate(FOREIGNDOM)) )
2637 MEM_LOG("Mach-phys update on auto-translate guest");
2638 break;
2641 set_gpfn_from_mfn(mfn, gpfn);
2642 okay = 1;
2644 paging_mark_dirty(FOREIGNDOM, mfn);
2646 put_page(mfn_to_page(mfn));
2647 break;
2649 default:
2650 MEM_LOG("Invalid page update command %x", cmd);
2651 rc = -ENOSYS;
2652 okay = 0;
2653 break;
2656 if ( unlikely(!okay) )
2658 rc = rc ? rc : -EINVAL;
2659 break;
2662 guest_handle_add_offset(ureqs, 1);
2665 process_deferred_ops();
2667 UNLOCK_BIGLOCK(d);
2669 domain_mmap_cache_destroy(&mapcache);
2671 perfc_add(num_page_updates, i);
2673 out:
2674 /* Add incremental work we have done to the @done output parameter. */
2675 if ( unlikely(!guest_handle_is_null(pdone)) )
2677 done += i;
2678 copy_to_guest(pdone, &done, 1);
2681 return rc;
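/*
 * A small sketch of how a do_mmu_update() request is encoded: the command is
 * carried in the low bits of 'ptr' (below sizeof(l1_pgentry_t)) while the
 * remaining bits give the machine address of the entry to update, matching
 * the "cmd = req.ptr & (sizeof(l1_pgentry_t)-1)" decode above.  The 8-byte
 * entry size and the zero command value are assumptions for illustration.
 */
#include <stdint.h>

#define DEMO_PTE_SIZE         8u       /* assumed sizeof(l1_pgentry_t)      */
#define DEMO_NORMAL_PT_UPDATE 0u       /* stand-in for MMU_NORMAL_PT_UPDATE */

struct demo_mmu_update {
    uint64_t ptr;                      /* PTE machine address | command     */
    uint64_t val;                      /* new entry contents                */
};

static struct demo_mmu_update demo_make_pt_update(uint64_t pte_maddr,
                                                  uint64_t new_pte)
{
    struct demo_mmu_update req;

    req.ptr = (pte_maddr & ~(uint64_t)(DEMO_PTE_SIZE - 1)) |
              DEMO_NORMAL_PT_UPDATE;
    req.val = new_pte;
    return req;
}

static unsigned int demo_decode_cmd(uint64_t ptr)
{
    return (unsigned int)(ptr & (DEMO_PTE_SIZE - 1));
}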
2685 static int create_grant_pte_mapping(
2686 uint64_t pte_addr, l1_pgentry_t nl1e, struct vcpu *v)
2688 int rc = GNTST_okay;
2689 void *va;
2690 unsigned long gmfn, mfn;
2691 struct page_info *page;
2692 u32 type;
2693 l1_pgentry_t ol1e;
2694 struct domain *d = v->domain;
2696 ASSERT(spin_is_locked(&d->big_lock));
2698 adjust_guest_l1e(nl1e, d);
2700 gmfn = pte_addr >> PAGE_SHIFT;
2701 mfn = gmfn_to_mfn(d, gmfn);
2703 if ( unlikely(!get_page_from_pagenr(mfn, current->domain)) )
2705 MEM_LOG("Could not get page for normal update");
2706 return GNTST_general_error;
2709 va = map_domain_page(mfn);
2710 va = (void *)((unsigned long)va + ((unsigned long)pte_addr & ~PAGE_MASK));
2711 page = mfn_to_page(mfn);
2713 type = page->u.inuse.type_info & PGT_type_mask;
2714 if ( (type != PGT_l1_page_table) || !get_page_type(page, type) )
2716 MEM_LOG("Grant map attempted to update a non-L1 page");
2717 rc = GNTST_general_error;
2718 goto failed;
2721 ol1e = *(l1_pgentry_t *)va;
2722 if ( !UPDATE_ENTRY(l1, (l1_pgentry_t *)va, ol1e, nl1e, mfn, v, 0) )
2724 put_page_type(page);
2725 rc = GNTST_general_error;
2726 goto failed;
2729 if ( !paging_mode_refcounts(d) )
2730 put_page_from_l1e(ol1e, d);
2732 put_page_type(page);
2734 failed:
2735 unmap_domain_page(va);
2736 put_page(page);
2738 return rc;
2741 static int destroy_grant_pte_mapping(
2742 uint64_t addr, unsigned long frame, struct domain *d)
2744 int rc = GNTST_okay;
2745 void *va;
2746 unsigned long gmfn, mfn;
2747 struct page_info *page;
2748 u32 type;
2749 l1_pgentry_t ol1e;
2751 gmfn = addr >> PAGE_SHIFT;
2752 mfn = gmfn_to_mfn(d, gmfn);
2754 if ( unlikely(!get_page_from_pagenr(mfn, current->domain)) )
2756 MEM_LOG("Could not get page for normal update");
2757 return GNTST_general_error;
2760 va = map_domain_page(mfn);
2761 va = (void *)((unsigned long)va + ((unsigned long)addr & ~PAGE_MASK));
2762 page = mfn_to_page(mfn);
2764 type = page->u.inuse.type_info & PGT_type_mask;
2765 if ( (type != PGT_l1_page_table) || !get_page_type(page, type) )
2767 MEM_LOG("Grant map attempted to update a non-L1 page");
2768 rc = GNTST_general_error;
2769 goto failed;
2772 if ( __copy_from_user(&ol1e, (l1_pgentry_t *)va, sizeof(ol1e)) )
2774 put_page_type(page);
2775 rc = GNTST_general_error;
2776 goto failed;
2779 /* Check that the virtual address supplied is actually mapped to frame. */
2780 if ( unlikely((l1e_get_intpte(ol1e) >> PAGE_SHIFT) != frame) )
2782 MEM_LOG("PTE entry %lx for address %"PRIx64" doesn't match frame %lx",
2783 (unsigned long)l1e_get_intpte(ol1e), addr, frame);
2784 put_page_type(page);
2785 rc = GNTST_general_error;
2786 goto failed;
2789 /* Delete pagetable entry. */
2790 if ( unlikely(!UPDATE_ENTRY
2791 (l1,
2792 (l1_pgentry_t *)va, ol1e, l1e_empty(), mfn,
2793 d->vcpu[0] /* Change if we go to per-vcpu shadows. */,
2794 0)) )
2796 MEM_LOG("Cannot delete PTE entry at %p", va);
2797 put_page_type(page);
2798 rc = GNTST_general_error;
2799 goto failed;
2802 put_page_type(page);
2804 failed:
2805 unmap_domain_page(va);
2806 put_page(page);
2807 return rc;
2811 static int create_grant_va_mapping(
2812 unsigned long va, l1_pgentry_t nl1e, struct vcpu *v)
2814 l1_pgentry_t *pl1e, ol1e;
2815 struct domain *d = v->domain;
2816 unsigned long gl1mfn;
2817 int okay;
2819 ASSERT(spin_is_locked(&d->big_lock));
2821 adjust_guest_l1e(nl1e, d);
2823 pl1e = guest_map_l1e(v, va, &gl1mfn);
2824 if ( !pl1e )
2826 MEM_LOG("Could not find L1 PTE for address %lx", va);
2827 return GNTST_general_error;
2829 ol1e = *pl1e;
2830 okay = UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, v, 0);
2831 guest_unmap_l1e(v, pl1e);
2832 pl1e = NULL;
2834 if ( !okay )
2835 return GNTST_general_error;
2837 if ( !paging_mode_refcounts(d) )
2838 put_page_from_l1e(ol1e, d);
2840 return GNTST_okay;
2843 static int replace_grant_va_mapping(
2844 unsigned long addr, unsigned long frame, l1_pgentry_t nl1e, struct vcpu *v)
2846 l1_pgentry_t *pl1e, ol1e;
2847 unsigned long gl1mfn;
2848 int rc = 0;
2850 pl1e = guest_map_l1e(v, addr, &gl1mfn);
2851 if ( !pl1e )
2853 MEM_LOG("Could not find L1 PTE for address %lx", addr);
2854 return GNTST_general_error;
2856 ol1e = *pl1e;
2858 /* Check that the virtual address supplied is actually mapped to frame. */
2859 if ( unlikely(l1e_get_pfn(ol1e) != frame) )
2861 MEM_LOG("PTE entry %lx for address %lx doesn't match frame %lx",
2862 l1e_get_pfn(ol1e), addr, frame);
2863 rc = GNTST_general_error;
2864 goto out;
2867 /* Delete pagetable entry. */
2868 if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, v, 0)) )
2870 MEM_LOG("Cannot delete PTE entry at %p", (unsigned long *)pl1e);
2871 rc = GNTST_general_error;
2872 goto out;
2875 out:
2876 guest_unmap_l1e(v, pl1e);
2877 return rc;
2880 static int destroy_grant_va_mapping(
2881 unsigned long addr, unsigned long frame, struct vcpu *v)
2883 return replace_grant_va_mapping(addr, frame, l1e_empty(), v);
2886 int create_grant_host_mapping(uint64_t addr, unsigned long frame,
2887 unsigned int flags, unsigned int cache_flags)
2889 l1_pgentry_t pte = l1e_from_pfn(frame, GRANT_PTE_FLAGS);
2891 if ( (flags & GNTMAP_application_map) )
2892 l1e_add_flags(pte,_PAGE_USER);
2893 if ( !(flags & GNTMAP_readonly) )
2894 l1e_add_flags(pte,_PAGE_RW);
2896 l1e_add_flags(pte, cacheattr_to_pte_flags(cache_flags >> 5));
2898 if ( flags & GNTMAP_contains_pte )
2899 return create_grant_pte_mapping(addr, pte, current);
2900 return create_grant_va_mapping(addr, pte, current);
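/*
 * A condensed model of the flag handling in create_grant_host_mapping()
 * above: the grant-map flags decide whether the new PTE is user-accessible
 * and writable, while GNTMAP_contains_pte selects the PTE-address path over
 * the virtual-address path.  The DEMO_ bit values are illustrative stand-ins
 * rather than the real GNTMAP_ and _PAGE_ constants.
 */
#define DEMO_GNTMAP_readonly        0x1u
#define DEMO_GNTMAP_application_map 0x2u
#define DEMO_GNTMAP_contains_pte    0x4u

#define DEMO_PAGE_PRESENT 0x001ul
#define DEMO_PAGE_RW      0x002ul
#define DEMO_PAGE_USER    0x004ul

static unsigned long demo_grant_pte_flags(unsigned int flags)
{
    unsigned long pte_flags = DEMO_PAGE_PRESENT;     /* base grant PTE      */

    if ( flags & DEMO_GNTMAP_application_map )
        pte_flags |= DEMO_PAGE_USER;                 /* guest user mappings */
    if ( !(flags & DEMO_GNTMAP_readonly) )
        pte_flags |= DEMO_PAGE_RW;                   /* writable by default */
    return pte_flags;
}

/* Whether 'addr' names a PTE machine address rather than a linear address. */
static int demo_grant_addr_is_pte(unsigned int flags)
{
    return (flags & DEMO_GNTMAP_contains_pte) != 0;
}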
2903 int replace_grant_host_mapping(
2904 uint64_t addr, unsigned long frame, uint64_t new_addr, unsigned int flags)
2906 struct vcpu *curr = current;
2907 l1_pgentry_t *pl1e, ol1e;
2908 unsigned long gl1mfn;
2909 int rc;
2911 if ( flags & GNTMAP_contains_pte )
2913 if ( !new_addr )
2914 return destroy_grant_pte_mapping(addr, frame, curr->domain);
2916 MEM_LOG("Unsupported grant table operation");
2917 return GNTST_general_error;
2920 if ( !new_addr )
2921 return destroy_grant_va_mapping(addr, frame, curr);
2923 pl1e = guest_map_l1e(curr, new_addr, &gl1mfn);
2924 if ( !pl1e )
2926 MEM_LOG("Could not find L1 PTE for address %lx",
2927 (unsigned long)new_addr);
2928 return GNTST_general_error;
2930 ol1e = *pl1e;
2932 if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, l1e_empty(),
2933 gl1mfn, curr, 0)) )
2935 MEM_LOG("Cannot delete PTE entry at %p", (unsigned long *)pl1e);
2936 guest_unmap_l1e(curr, pl1e);
2937 return GNTST_general_error;
2940 guest_unmap_l1e(curr, pl1e);
2942 rc = replace_grant_va_mapping(addr, frame, ol1e, curr);
2943 if ( rc && !paging_mode_refcounts(curr->domain) )
2944 put_page_from_l1e(ol1e, curr->domain);
2946 return rc;
2949 int steal_page(
2950 struct domain *d, struct page_info *page, unsigned int memflags)
2952 u32 _d, _nd, x, y;
2954 spin_lock(&d->page_alloc_lock);
2956 /*
2957 * The tricky bit: atomically release ownership while there is just one
2958 * benign reference to the page (PGC_allocated). If that reference
2959 * disappears then the deallocation routine will safely spin.
2960 */
2961 _d = pickle_domptr(d);
2962 _nd = page->u.inuse._domain;
2963 y = page->count_info;
2964 do {
2965 x = y;
2966 if ( unlikely((x & (PGC_count_mask|PGC_allocated)) !=
2967 (1 | PGC_allocated)) || unlikely(_nd != _d) )
2969 MEM_LOG("gnttab_transfer: Bad page %p: ed=%p(%u), sd=%p,"
2970 " caf=%08x, taf=%" PRtype_info "\n",
2971 (void *) page_to_mfn(page),
2972 d, d->domain_id, unpickle_domptr(_nd), x,
2973 page->u.inuse.type_info);
2974 spin_unlock(&d->page_alloc_lock);
2975 return -1;
2977 asm volatile (
2978 LOCK_PREFIX "cmpxchg8b %2"
2979 : "=d" (_nd), "=a" (y),
2980 "=m" (*(volatile u64 *)(&page->count_info))
2981 : "0" (_d), "1" (x), "c" (NULL), "b" (x) );
2982 } while (unlikely(_nd != _d) || unlikely(y != x));
2984 /*
2985 * Unlink from 'd'. At least one reference remains (now anonymous), so
2986 * no one else is spinning to try to delete this page from 'd'.
2987 */
2988 if ( !(memflags & MEMF_no_refcount) )
2989 d->tot_pages--;
2990 list_del(&page->list);
2992 spin_unlock(&d->page_alloc_lock);
2994 return 0;
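/*
 * A simplified standalone model of the ownership hand-off in steal_page()
 * above: the reference count and the (pickled) owner are packed into one
 * 8-byte word and changed with a single compare-and-swap, so the owner can
 * only be cleared while exactly the expected count/owner pair is still in
 * place.  Field widths and the allocated flag are illustrative stand-ins.
 */
#include <stdatomic.h>
#include <stdint.h>

#define DEMO_PGC_ALLOCATED 0x80000000u

/* Low 32 bits: count_info.  High 32 bits: pickled owner. */
static inline uint64_t demo_pack(uint32_t count_info, uint32_t owner)
{
    return ((uint64_t)owner << 32) | count_info;
}

static int demo_steal_page(_Atomic uint64_t *page_word, uint32_t owner)
{
    uint64_t old = atomic_load(page_word);

    do {
        if ( (uint32_t)old != (1u | DEMO_PGC_ALLOCATED) ||
             (uint32_t)(old >> 32) != owner )
            return -1;          /* unexpected count or foreign owner: bail  */
        /* Keep count_info, clear the owner -- the page becomes anonymous.  */
    } while ( !atomic_compare_exchange_weak(page_word, &old,
                                            demo_pack((uint32_t)old, 0)) );

    return 0;
}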
2997 int do_update_va_mapping(unsigned long va, u64 val64,
2998 unsigned long flags)
3000 l1_pgentry_t val = l1e_from_intpte(val64);
3001 struct vcpu *v = current;
3002 struct domain *d = v->domain;
3003 l1_pgentry_t *pl1e;
3004 unsigned long vmask, bmap_ptr, gl1mfn;
3005 cpumask_t pmask;
3006 int rc = 0;
3008 perfc_incr(calls_to_update_va);
3010 if ( unlikely(!__addr_ok(va) && !paging_mode_external(d)) )
3011 return -EINVAL;
3013 rc = xsm_update_va_mapping(current->domain, val);
3014 if ( rc )
3015 return rc;
3017 LOCK_BIGLOCK(d);
3019 pl1e = guest_map_l1e(v, va, &gl1mfn);
3021 if ( unlikely(!pl1e || !mod_l1_entry(pl1e, val, gl1mfn, 0)) )
3022 rc = -EINVAL;
3024 if ( pl1e )
3025 guest_unmap_l1e(v, pl1e);
3026 pl1e = NULL;
3028 process_deferred_ops();
3030 UNLOCK_BIGLOCK(d);
3032 switch ( flags & UVMF_FLUSHTYPE_MASK )
3034 case UVMF_TLB_FLUSH:
3035 switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
3037 case UVMF_LOCAL:
3038 flush_tlb_local();
3039 break;
3040 case UVMF_ALL:
3041 flush_tlb_mask(d->domain_dirty_cpumask);
3042 break;
3043 default:
3044 if ( unlikely(!is_pv_32on64_domain(d) ?
3045 get_user(vmask, (unsigned long *)bmap_ptr) :
3046 get_user(vmask, (unsigned int *)bmap_ptr)) )
3047 rc = -EFAULT;
3048 pmask = vcpumask_to_pcpumask(d, vmask);
3049 flush_tlb_mask(pmask);
3050 break;
3052 break;
3054 case UVMF_INVLPG:
3055 switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
3057 case UVMF_LOCAL:
3058 if ( !paging_mode_enabled(d) ||
3059 (paging_invlpg(v, va) != 0) )
3060 flush_tlb_one_local(va);
3061 break;
3062 case UVMF_ALL:
3063 flush_tlb_one_mask(d->domain_dirty_cpumask, va);
3064 break;
3065 default:
3066 if ( unlikely(!is_pv_32on64_domain(d) ?
3067 get_user(vmask, (unsigned long *)bmap_ptr) :
3068 get_user(vmask, (unsigned int *)bmap_ptr)) )
3069 rc = -EFAULT;
3070 pmask = vcpumask_to_pcpumask(d, vmask);
3071 flush_tlb_one_mask(pmask, va);
3072 break;
3074 break;
3077 return rc;
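/*
 * A sketch of how do_update_va_mapping() above interprets its 'flags'
 * argument: the low bits pick the flush type (full TLB flush vs. INVLPG),
 * another bit picks local/all scope, and in the remaining case the upper
 * bits are reused as a guest pointer to a VCPU bitmap (bmap_ptr).  The bit
 * values below are illustrative stand-ins for the UVMF_ constants.
 */
#define DEMO_UVMF_FLUSHTYPE_MASK 0x3ul
#define DEMO_UVMF_ALL            0x4ul

struct demo_flush_request {
    unsigned int  flushtype;   /* 0 none, 1 TLB flush, 2 INVLPG (stand-ins) */
    int           all_cpus;    /* flush all dirty CPUs vs. a listed subset  */
    unsigned long bmap_ptr;    /* guest VA of the VCPU bitmap otherwise     */
};

static struct demo_flush_request demo_decode_uvmf(unsigned long flags)
{
    struct demo_flush_request r;

    r.flushtype = (unsigned int)(flags & DEMO_UVMF_FLUSHTYPE_MASK);
    r.all_cpus  = (flags & DEMO_UVMF_ALL) != 0;
    r.bmap_ptr  = flags & ~DEMO_UVMF_FLUSHTYPE_MASK;
    return r;
}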
3080 int do_update_va_mapping_otherdomain(unsigned long va, u64 val64,
3081 unsigned long flags,
3082 domid_t domid)
3084 int rc;
3086 if ( !set_foreigndom(domid) )
3087 return -ESRCH;
3089 rc = do_update_va_mapping(va, val64, flags);
3091 BUG_ON(this_cpu(percpu_mm_info).deferred_ops);
3092 process_deferred_ops(); /* only to clear foreigndom */
3094 return rc;
3099 /*************************
3100 * Descriptor Tables
3101 */
3103 void destroy_gdt(struct vcpu *v)
3105 int i;
3106 unsigned long pfn;
3108 v->arch.guest_context.gdt_ents = 0;
3109 for ( i = 0; i < FIRST_RESERVED_GDT_PAGE; i++ )
3111 if ( (pfn = l1e_get_pfn(v->arch.perdomain_ptes[i])) != 0 )
3112 put_page_and_type(mfn_to_page(pfn));
3113 l1e_write(&v->arch.perdomain_ptes[i], l1e_empty());
3114 v->arch.guest_context.gdt_frames[i] = 0;
3119 long set_gdt(struct vcpu *v,
3120 unsigned long *frames,
3121 unsigned int entries)
3123 struct domain *d = v->domain;
3124 /* NB. There are 512 8-byte entries per GDT page. */
3125 int i, nr_pages = (entries + 511) / 512;
3126 unsigned long mfn;
3128 if ( entries > FIRST_RESERVED_GDT_ENTRY )
3129 return -EINVAL;
3131 /* Check the pages in the new GDT. */
3132 for ( i = 0; i < nr_pages; i++ )
3134 mfn = frames[i] = gmfn_to_mfn(d, frames[i]);
3135 if ( !mfn_valid(mfn) ||
3136 !get_page_and_type(mfn_to_page(mfn), d, PGT_gdt_page) )
3137 goto fail;
3140 /* Tear down the old GDT. */
3141 destroy_gdt(v);
3143 /* Install the new GDT. */
3144 v->arch.guest_context.gdt_ents = entries;
3145 for ( i = 0; i < nr_pages; i++ )
3147 v->arch.guest_context.gdt_frames[i] = frames[i];
3148 l1e_write(&v->arch.perdomain_ptes[i],
3149 l1e_from_pfn(frames[i], __PAGE_HYPERVISOR));
3152 return 0;
3154 fail:
3155 while ( i-- > 0 )
3156 put_page_and_type(mfn_to_page(frames[i]));
3157 return -EINVAL;
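/*
 * A tiny sketch of the sizing rule used by set_gdt() above: each GDT frame
 * holds 512 eight-byte descriptors, so the number of frames a guest must
 * supply is the entry count rounded up to a multiple of 512.  The
 * FIRST_RESERVED_GDT_ENTRY limit enforced by the real code is not
 * reproduced here.
 */
static unsigned int demo_gdt_frames_needed(unsigned int entries)
{
    return (entries + 511) / 512;      /* 512 descriptors per 4kB frame */
}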
3161 long do_set_gdt(XEN_GUEST_HANDLE(ulong) frame_list, unsigned int entries)
3163 int nr_pages = (entries + 511) / 512;
3164 unsigned long frames[16];
3165 struct vcpu *curr = current;
3166 long ret;
3168 /* Rechecked in set_gdt, but ensures a sane limit for copy_from_user(). */
3169 if ( entries > FIRST_RESERVED_GDT_ENTRY )
3170 return -EINVAL;
3172 if ( copy_from_guest(frames, frame_list, nr_pages) )
3173 return -EFAULT;
3175 LOCK_BIGLOCK(curr->domain);
3177 if ( (ret = set_gdt(curr, frames, entries)) == 0 )
3178 flush_tlb_local();
3180 UNLOCK_BIGLOCK(curr->domain);
3182 return ret;
3186 long do_update_descriptor(u64 pa, u64 desc)
3188 struct domain *dom = current->domain;
3189 unsigned long gmfn = pa >> PAGE_SHIFT;
3190 unsigned long mfn;
3191 unsigned int offset;
3192 struct desc_struct *gdt_pent, d;
3193 struct page_info *page;
3194 long ret = -EINVAL;
3196 offset = ((unsigned int)pa & ~PAGE_MASK) / sizeof(struct desc_struct);
3198 *(u64 *)&d = desc;
3200 mfn = gmfn_to_mfn(dom, gmfn);
3201 if ( (((unsigned int)pa % sizeof(struct desc_struct)) != 0) ||
3202 !mfn_valid(mfn) ||
3203 !check_descriptor(dom, &d) )
3204 return -EINVAL;
3206 page = mfn_to_page(mfn);
3207 if ( unlikely(!get_page(page, dom)) )
3208 return -EINVAL;
3210 /* Check if the given frame is in use in an unsafe context. */
3211 switch ( page->u.inuse.type_info & PGT_type_mask )
3213 case PGT_gdt_page:
3214 if ( unlikely(!get_page_type(page, PGT_gdt_page)) )
3215 goto out;
3216 break;
3217 case PGT_ldt_page:
3218 if ( unlikely(!get_page_type(page, PGT_ldt_page)) )
3219 goto out;
3220 break;
3221 default:
3222 if ( unlikely(!get_page_type(page, PGT_writable_page)) )
3223 goto out;
3224 break;
3227 paging_mark_dirty(dom, mfn);
3229 /* All is good so make the update. */
3230 gdt_pent = map_domain_page(mfn);
3231 atomic_write64((uint64_t *)&gdt_pent[offset], *(uint64_t *)&d);
3232 unmap_domain_page(gdt_pent);
3234 put_page_type(page);
3236 ret = 0; /* success */
3238 out:
3239 put_page(page);
3241 return ret;
3244 typedef struct e820entry e820entry_t;
3245 DEFINE_XEN_GUEST_HANDLE(e820entry_t);
3247 long arch_memory_op(int op, XEN_GUEST_HANDLE(void) arg)
3249 switch ( op )
3251 case XENMEM_add_to_physmap:
3253 struct xen_add_to_physmap xatp;
3254 unsigned long prev_mfn, mfn = 0, gpfn;
3255 struct domain *d;
3257 if ( copy_from_guest(&xatp, arg, 1) )
3258 return -EFAULT;
3260 if ( xatp.domid == DOMID_SELF )
3261 d = rcu_lock_current_domain();
3262 else {
3263 d = rcu_lock_domain_by_id(xatp.domid);
3264 if ( d == NULL )
3265 return -ESRCH;
3266 if ( !IS_PRIV_FOR(current->domain, d) ) {
3267 rcu_unlock_domain(d);
3268 return -EPERM;
3272 if ( xsm_add_to_physmap(current->domain, d) )
3274 rcu_unlock_domain(d);
3275 return -EPERM;
3278 switch ( xatp.space )
3280 case XENMAPSPACE_shared_info:
3281 if ( xatp.idx == 0 )
3282 mfn = virt_to_mfn(d->shared_info);
3283 /* XXX: assumes this is called after the E820 table has been built,
3284 * since the E820 map is needed to initialise the MTRRs.
3285 */
3286 if ( is_hvm_domain(d) ) {
3287 extern void init_mtrr_in_hyper(struct vcpu *);
3288 struct vcpu *vs;
3289 for_each_vcpu(d, vs)
3290 init_mtrr_in_hyper(vs);
3292 break;
3293 case XENMAPSPACE_grant_table:
3294 spin_lock(&d->grant_table->lock);
3296 if ( (xatp.idx >= nr_grant_frames(d->grant_table)) &&
3297 (xatp.idx < max_nr_grant_frames) )
3298 gnttab_grow_table(d, xatp.idx + 1);
3300 if ( xatp.idx < nr_grant_frames(d->grant_table) )
3301 mfn = virt_to_mfn(d->grant_table->shared[xatp.idx]);
3303 spin_unlock(&d->grant_table->lock);
3304 break;
3305 default:
3306 break;
3309 if ( !paging_mode_translate(d) || (mfn == 0) )
3311 rcu_unlock_domain(d);
3312 return -EINVAL;
3315 LOCK_BIGLOCK(d);
3317 /* Remove previously mapped page if it was present. */
3318 prev_mfn = gmfn_to_mfn(d, xatp.gpfn);
3319 if ( mfn_valid(prev_mfn) )
3321 if ( is_xen_heap_mfn(prev_mfn) )
3322 /* Xen heap frames are simply unhooked from this phys slot. */
3323 guest_physmap_remove_page(d, xatp.gpfn, prev_mfn);
3324 else
3325 /* Normal domain memory is freed, to avoid leaking memory. */
3326 guest_remove_page(d, xatp.gpfn);
3329 /* Unmap from old location, if any. */
3330 gpfn = get_gpfn_from_mfn(mfn);
3331 if ( gpfn != INVALID_M2P_ENTRY )
3332 guest_physmap_remove_page(d, gpfn, mfn);
3334 /* Map at new location. */
3335 guest_physmap_add_page(d, xatp.gpfn, mfn);
3337 UNLOCK_BIGLOCK(d);
3339 rcu_unlock_domain(d);
3341 break;
3344 case XENMEM_set_memory_map:
3346 struct xen_foreign_memory_map fmap;
3347 struct domain *d;
3348 int rc;
3350 if ( copy_from_guest(&fmap, arg, 1) )
3351 return -EFAULT;
3353 if ( fmap.map.nr_entries > ARRAY_SIZE(d->arch.e820) )
3354 return -EINVAL;
3356 if ( fmap.domid == DOMID_SELF )
3357 d = rcu_lock_current_domain();
3358 else {
3359 d = rcu_lock_domain_by_id(fmap.domid);
3360 if ( d == NULL )
3361 return -ESRCH;
3362 if ( !IS_PRIV_FOR(current->domain, d) ) {
3363 rcu_unlock_domain(d);
3364 return -EPERM;
3368 rc = xsm_domain_memory_map(d);
3369 if ( rc )
3371 rcu_unlock_domain(d);
3372 return rc;
3375 rc = copy_from_guest(d->arch.e820, fmap.map.buffer,
3376 fmap.map.nr_entries) ? -EFAULT : 0;
3377 d->arch.nr_e820 = fmap.map.nr_entries;
3379 rcu_unlock_domain(d);
3380 return rc;
3383 case XENMEM_memory_map:
3385 struct xen_memory_map map;
3386 struct domain *d = current->domain;
3388 /* Backwards compatibility. */
3389 if ( d->arch.nr_e820 == 0 )
3390 return -ENOSYS;
3392 if ( copy_from_guest(&map, arg, 1) )
3393 return -EFAULT;
3395 map.nr_entries = min(map.nr_entries, d->arch.nr_e820);
3396 if ( copy_to_guest(map.buffer, d->arch.e820, map.nr_entries) ||
3397 copy_to_guest(arg, &map, 1) )
3398 return -EFAULT;
3400 return 0;
3403 case XENMEM_machine_memory_map:
3405 struct xen_memory_map memmap;
3406 XEN_GUEST_HANDLE(e820entry_t) buffer;
3407 int count;
3408 int rc;
3410 if ( !IS_PRIV(current->domain) )
3411 return -EINVAL;
3413 rc = xsm_machine_memory_map();
3414 if ( rc )
3415 return rc;
3417 if ( copy_from_guest(&memmap, arg, 1) )
3418 return -EFAULT;
3419 if ( memmap.nr_entries < e820.nr_map + 1 )
3420 return -EINVAL;
3422 buffer = guest_handle_cast(memmap.buffer, e820entry_t);
3424 count = min((unsigned int)e820.nr_map, memmap.nr_entries);
3425 if ( copy_to_guest(buffer, e820.map, count) < 0 )
3426 return -EFAULT;
3428 memmap.nr_entries = count;
3430 if ( copy_to_guest(arg, &memmap, 1) )
3431 return -EFAULT;
3433 return 0;
3436 case XENMEM_machphys_mapping:
3438 static const struct xen_machphys_mapping mapping = {
3439 .v_start = MACH2PHYS_VIRT_START,
3440 .v_end = MACH2PHYS_VIRT_END,
3441 .max_mfn = MACH2PHYS_NR_ENTRIES - 1
3442 };
3444 if ( copy_to_guest(arg, &mapping, 1) )
3445 return -EFAULT;
3447 return 0;
3450 default:
3451 return subarch_memory_op(op, arg);
3454 return 0;
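/*
 * A sketch of how a guest might consume the XENMEM_machphys_mapping reply
 * built above: the window [v_start, v_end) is assumed to hold one
 * unsigned-long machine-to-physical entry per MFN, so the entry for a given
 * MFN is found by simple array indexing.  The struct is a local stand-in for
 * xen_machphys_mapping, and the array layout is an assumption stated here,
 * not taken from this file.
 */
#include <stdint.h>

struct demo_machphys_mapping {
    uint64_t v_start, v_end;           /* virtual window of the M2P table */
    uint64_t max_mfn;                  /* highest MFN the table covers    */
};

static uint64_t demo_m2p_entry_vaddr(const struct demo_machphys_mapping *m,
                                     uint64_t mfn)
{
    if ( mfn > m->max_mfn )
        return 0;                      /* outside the mapped window */
    return m->v_start + mfn * sizeof(unsigned long);
}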
3458 /*************************
3459 * Writable Pagetables
3460 */
3462 struct ptwr_emulate_ctxt {
3463 struct x86_emulate_ctxt ctxt;
3464 unsigned long cr2;
3465 l1_pgentry_t pte;
3466 };
3468 static int ptwr_emulated_read(
3469 enum x86_segment seg,
3470 unsigned long offset,
3471 unsigned long *val,
3472 unsigned int bytes,
3473 struct x86_emulate_ctxt *ctxt)
3475 unsigned int rc;
3476 unsigned long addr = offset;
3478 *val = 0;
3479 if ( (rc = copy_from_user((void *)val, (void *)addr, bytes)) != 0 )
3481 propagate_page_fault(addr + bytes - rc, 0); /* read fault */
3482 return X86EMUL_EXCEPTION;
3485 return X86EMUL_OKAY;
3488 static int ptwr_emulated_update(
3489 unsigned long addr,
3490 paddr_t old,
3491 paddr_t val,
3492 unsigned int bytes,
3493 unsigned int do_cmpxchg,
3494 struct ptwr_emulate_ctxt *ptwr_ctxt)
3496 unsigned long mfn;
3497 unsigned long unaligned_addr = addr;
3498 struct page_info *page;
3499 l1_pgentry_t pte, ol1e, nl1e, *pl1e;
3500 struct vcpu *v = current;
3501 struct domain *d = v->domain;
3503 /* Only allow naturally-aligned stores within the original %cr2 page. */
3504 if ( unlikely(((addr^ptwr_ctxt->cr2) & PAGE_MASK) || (addr & (bytes-1))) )
3506 MEM_LOG("Bad ptwr access (cr2=%lx, addr=%lx, bytes=%u)",
3507 ptwr_ctxt->cr2, addr, bytes);
3508 return X86EMUL_UNHANDLEABLE;
3511 /* Turn a sub-word access into a full-word access. */
3512 if ( bytes != sizeof(paddr_t) )
3514 paddr_t full;
3515 unsigned int rc, offset = addr & (sizeof(paddr_t)-1);
3517 /* Align address; read full word. */
3518 addr &= ~(sizeof(paddr_t)-1);
3519 if ( (rc = copy_from_user(&full, (void *)addr, sizeof(paddr_t))) != 0 )
3521 propagate_page_fault(addr+sizeof(paddr_t)-rc, 0); /* read fault */
3522 return X86EMUL_EXCEPTION;
3524 /* Mask out bits provided by caller. */
3525 full &= ~((((paddr_t)1 << (bytes*8)) - 1) << (offset*8));
3526 /* Shift the caller value and OR in the missing bits. */
3527 val &= (((paddr_t)1 << (bytes*8)) - 1);
3528 val <<= (offset)*8;
3529 val |= full;
3530 /* Also fill in missing parts of the cmpxchg old value. */
3531 old &= (((paddr_t)1 << (bytes*8)) - 1);
3532 old <<= (offset)*8;
3533 old |= full;
3536 pte = ptwr_ctxt->pte;
3537 mfn = l1e_get_pfn(pte);
3538 page = mfn_to_page(mfn);
3540 /* We are looking only for read-only mappings of p.t. pages. */
3541 ASSERT((l1e_get_flags(pte) & (_PAGE_RW|_PAGE_PRESENT)) == _PAGE_PRESENT);
3542 ASSERT(mfn_valid(mfn));
3543 ASSERT((page->u.inuse.type_info & PGT_type_mask) == PGT_l1_page_table);
3544 ASSERT((page->u.inuse.type_info & PGT_count_mask) != 0);
3545 ASSERT(page_get_owner(page) == d);
3547 /* Check the new PTE. */
3548 nl1e = l1e_from_intpte(val);
3549 if ( unlikely(!get_page_from_l1e(nl1e, d)) )
3551 if ( (CONFIG_PAGING_LEVELS >= 3) && is_pv_32bit_domain(d) &&
3552 (bytes == 4) && (unaligned_addr & 4) && !do_cmpxchg &&
3553 (l1e_get_flags(nl1e) & _PAGE_PRESENT) )
3555 /*
3556 * If this is an upper-half write to a PAE PTE then we assume that
3557 * the guest has simply got the two writes the wrong way round. We
3558 * zap the PRESENT bit on the assumption that the bottom half will
3559 * be written immediately after we return to the guest.
3560 */
3561 MEM_LOG("ptwr_emulate: fixing up invalid PAE PTE %"PRIpte,
3562 l1e_get_intpte(nl1e));
3563 l1e_remove_flags(nl1e, _PAGE_PRESENT);
3565 else
3567 MEM_LOG("ptwr_emulate: could not get_page_from_l1e()");
3568 return X86EMUL_UNHANDLEABLE;
3572 adjust_guest_l1e(nl1e, d);
3574 /* Checked successfully: do the update (write or cmpxchg). */
3575 pl1e = map_domain_page(mfn);
3576 pl1e = (l1_pgentry_t *)((unsigned long)pl1e + (addr & ~PAGE_MASK));
3577 if ( do_cmpxchg )
3579 int okay;
3580 intpte_t t = old;
3581 ol1e = l1e_from_intpte(old);
3583 okay = paging_cmpxchg_guest_entry(v, &l1e_get_intpte(*pl1e),
3584 &t, l1e_get_intpte(nl1e), _mfn(mfn));
3585 okay = (okay && t == old);
3587 if ( !okay )
3589 unmap_domain_page(pl1e);
3590 put_page_from_l1e(nl1e, d);
3591 return X86EMUL_CMPXCHG_FAILED;
3594 else
3596 ol1e = *pl1e;
3597 if ( !UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, mfn, v, 0) )
3598 BUG();
3601 trace_ptwr_emulation(addr, nl1e);
3603 unmap_domain_page(pl1e);
3605 /* Finally, drop the old PTE. */
3606 put_page_from_l1e(ol1e, d);
3608 return X86EMUL_OKAY;
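/*
 * A standalone model of the sub-word handling in ptwr_emulated_update()
 * above: a 1/2/4-byte store is widened to a full PTE-sized write by reading
 * the existing word, masking out the bytes the caller supplies, and OR-ing
 * the shifted caller value back in.  'demo_paddr_t' stands in for paddr_t.
 */
#include <stdint.h>

typedef uint64_t demo_paddr_t;

static demo_paddr_t demo_widen_write(demo_paddr_t full,   /* existing word  */
                                     demo_paddr_t val,    /* caller's bytes */
                                     unsigned long addr,  /* faulting addr  */
                                     unsigned int bytes)  /* 1, 2 or 4      */
{
    unsigned int offset = addr & (sizeof(demo_paddr_t) - 1);
    demo_paddr_t keep_mask =
        ~((((demo_paddr_t)1 << (bytes * 8)) - 1) << (offset * 8));

    val &= ((demo_paddr_t)1 << (bytes * 8)) - 1;           /* trim to size  */
    return (full & keep_mask) | (val << (offset * 8));
}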
3611 static int ptwr_emulated_write(
3612 enum x86_segment seg,
3613 unsigned long offset,
3614 unsigned long val,
3615 unsigned int bytes,
3616 struct x86_emulate_ctxt *ctxt)
3618 return ptwr_emulated_update(
3619 offset, 0, val, bytes, 0,
3620 container_of(ctxt, struct ptwr_emulate_ctxt, ctxt));
3623 static int ptwr_emulated_cmpxchg(
3624 enum x86_segment seg,
3625 unsigned long offset,
3626 unsigned long old,
3627 unsigned long new,
3628 unsigned int bytes,
3629 struct x86_emulate_ctxt *ctxt)
3631 return ptwr_emulated_update(
3632 offset, old, new, bytes, 1,
3633 container_of(ctxt, struct ptwr_emulate_ctxt, ctxt));
3636 static int ptwr_emulated_cmpxchg8b(
3637 enum x86_segment seg,
3638 unsigned long offset,
3639 unsigned long old,
3640 unsigned long old_hi,
3641 unsigned long new,
3642 unsigned long new_hi,
3643 struct x86_emulate_ctxt *ctxt)
3645 if ( CONFIG_PAGING_LEVELS == 2 )
3646 return X86EMUL_UNHANDLEABLE;
3647 return ptwr_emulated_update(
3648 offset, ((u64)old_hi << 32) | old, ((u64)new_hi << 32) | new, 8, 1,
3649 container_of(ctxt, struct ptwr_emulate_ctxt, ctxt));
3652 static struct x86_emulate_ops ptwr_emulate_ops = {
3653 .read = ptwr_emulated_read,
3654 .insn_fetch = ptwr_emulated_read,
3655 .write = ptwr_emulated_write,
3656 .cmpxchg = ptwr_emulated_cmpxchg,
3657 .cmpxchg8b = ptwr_emulated_cmpxchg8b
3658 };
3660 /* Write page fault handler: check if guest is trying to modify a PTE. */
3661 int ptwr_do_page_fault(struct vcpu *v, unsigned long addr,
3662 struct cpu_user_regs *regs)
3664 struct domain *d = v->domain;
3665 struct page_info *page;
3666 l1_pgentry_t pte;
3667 struct ptwr_emulate_ctxt ptwr_ctxt;
3668 int rc;
3670 LOCK_BIGLOCK(d);
3672 /* Attempt to read the PTE that maps the VA being accessed. */
3673 guest_get_eff_l1e(v, addr, &pte);
3674 page = l1e_get_page(pte);
3676 /* We are looking only for read-only mappings of p.t. pages. */
3677 if ( ((l1e_get_flags(pte) & (_PAGE_PRESENT|_PAGE_RW)) != _PAGE_PRESENT) ||
3678 !mfn_valid(l1e_get_pfn(pte)) ||
3679 ((page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table) ||
3680 ((page->u.inuse.type_info & PGT_count_mask) == 0) ||
3681 (page_get_owner(page) != d) )
3682 goto bail;
3684 ptwr_ctxt.ctxt.regs = regs;
3685 ptwr_ctxt.ctxt.force_writeback = 0;
3686 ptwr_ctxt.ctxt.addr_size = ptwr_ctxt.ctxt.sp_size =
3687 is_pv_32on64_domain(d) ? 32 : BITS_PER_LONG;
3688 ptwr_ctxt.cr2 = addr;
3689 ptwr_ctxt.pte = pte;
3691 rc = x86_emulate(&ptwr_ctxt.ctxt, &ptwr_emulate_ops);
3692 if ( rc == X86EMUL_UNHANDLEABLE )
3693 goto bail;
3695 UNLOCK_BIGLOCK(d);
3696 perfc_incr(ptwr_emulations);
3697 return EXCRET_fault_fixed;
3699 bail:
3700 UNLOCK_BIGLOCK(d);
3701 return 0;
3704 void free_xen_pagetable(void *v)
3706 extern int early_boot;
3708 if ( early_boot )
3709 return;
3711 if ( is_xen_heap_page(virt_to_page(v)) )
3712 free_xenheap_page(v);
3713 else
3714 free_domheap_page(virt_to_page(v));
3717 /* Convert to/from superpage-mapping flags for map_pages_to_xen(). */
3718 #define l1f_to_lNf(f) (((f) & _PAGE_PRESENT) ? ((f) | _PAGE_PSE) : (f))
3719 #define lNf_to_l1f(f) (((f) & _PAGE_PRESENT) ? ((f) & ~_PAGE_PSE) : (f))
3721 /*
3722 * map_pages_to_xen() can be called with interrupts disabled:
3723 * * During early bootstrap; or
3724 * * alloc_xenheap_pages() via memguard_guard_range
3725 * In these cases it is safe to use flush_area_local():
3726 * * Because only the local CPU is online; or
3727 * * Because stale TLB entries do not matter for memguard_[un]guard_range().
3728 */
3729 #define flush_area(v,f) (!local_irq_is_enabled() ? \
3730 flush_area_local((const void *)v, f) : \
3731 flush_area_all((const void *)v, f))
3733 int map_pages_to_xen(
3734 unsigned long virt,
3735 unsigned long mfn,
3736 unsigned long nr_mfns,
3737 unsigned int flags)
3739 l2_pgentry_t *pl2e, ol2e;
3740 l1_pgentry_t *pl1e, ol1e;
3741 unsigned int i;
3743 while ( nr_mfns != 0 )
3745 #ifdef __x86_64__
3746 l3_pgentry_t *pl3e = virt_to_xen_l3e(virt);
3747 l3_pgentry_t ol3e = *pl3e;
3749 if ( cpu_has_page1gb &&
3750 !(((virt >> PAGE_SHIFT) | mfn) &
3751 ((1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1)) &&
3752 nr_mfns >= (1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) &&
3753 !(flags & (_PAGE_PAT | MAP_SMALL_PAGES)) )
3755 /* 1GB-page mapping. */
3756 l3e_write_atomic(pl3e, l3e_from_pfn(mfn, l1f_to_lNf(flags)));
3758 if ( (l3e_get_flags(ol3e) & _PAGE_PRESENT) )
3760 unsigned int flush_flags =
3761 FLUSH_TLB | FLUSH_ORDER(2 * PAGETABLE_ORDER);
3763 if ( l3e_get_flags(ol3e) & _PAGE_PSE )
3765 if ( l3e_get_flags(ol3e) & _PAGE_GLOBAL )
3766 flush_flags |= FLUSH_TLB_GLOBAL;
3767 if ( (l1f_to_lNf(l3e_get_flags(ol3e)) ^ flags) &
3768 PAGE_CACHE_ATTRS )
3769 flush_flags |= FLUSH_CACHE;
3770 flush_area(virt, flush_flags);
3772 else
3774 pl2e = l3e_to_l2e(ol3e);
3775 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
3777 ol2e = pl2e[i];
3778 if ( !(l2e_get_flags(ol2e) & _PAGE_PRESENT) )
3779 continue;
3780 if ( l2e_get_flags(ol2e) & _PAGE_PSE )
3782 if ( l2e_get_flags(ol2e) & _PAGE_GLOBAL )
3783 flush_flags |= FLUSH_TLB_GLOBAL;
3784 if ( (lNf_to_l1f(l2e_get_flags(ol2e)) ^ flags) &
3785 PAGE_CACHE_ATTRS )
3786 flush_flags |= FLUSH_CACHE;
3788 else
3790 unsigned int j;
3792 pl1e = l2e_to_l1e(ol2e);
3793 for ( j = 0; j < L1_PAGETABLE_ENTRIES; j++ )
3795 ol1e = pl1e[j];
3796 if ( l1e_get_flags(ol1e) & _PAGE_GLOBAL )
3797 flush_flags |= FLUSH_TLB_GLOBAL;
3798 if ( (l1e_get_flags(ol1e) ^ flags) &
3799 PAGE_CACHE_ATTRS )
3800 flush_flags |= FLUSH_CACHE;
3804 flush_area(virt, flush_flags);
3805 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
3807 ol2e = pl2e[i];
3808 if ( (l2e_get_flags(ol2e) & _PAGE_PRESENT) &&
3809 !(l2e_get_flags(ol2e) & _PAGE_PSE) )
3810 free_xen_pagetable(l2e_to_l1e(ol2e));
3812 free_xen_pagetable(pl2e);
3816 virt += 1UL << L3_PAGETABLE_SHIFT;
3817 mfn += 1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT);
3818 nr_mfns -= 1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT);
3819 continue;
3822 if ( (l3e_get_flags(ol3e) & _PAGE_PRESENT) &&
3823 (l3e_get_flags(ol3e) & _PAGE_PSE) )
3825 unsigned int flush_flags =
3826 FLUSH_TLB | FLUSH_ORDER(2 * PAGETABLE_ORDER);
3828 /* Skip this PTE if there is no change. */
3829 if ( ((l3e_get_pfn(ol3e) & ~(L2_PAGETABLE_ENTRIES *
3830 L1_PAGETABLE_ENTRIES - 1)) +
3831 (l2_table_offset(virt) << PAGETABLE_ORDER) +
3832 l1_table_offset(virt) == mfn) &&
3833 ((lNf_to_l1f(l3e_get_flags(ol3e)) ^ flags) &
3834 ~(_PAGE_ACCESSED|_PAGE_DIRTY)) == 0 )
3836 /* We can skip to end of L3 superpage if we got a match. */
3837 i = (1 << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) -
3838 (mfn & ((1 << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1));
3839 if ( i > nr_mfns )
3840 i = nr_mfns;
3841 virt += i << PAGE_SHIFT;
3842 mfn += i;
3843 nr_mfns -= i;
3844 continue;
3847 pl2e = alloc_xen_pagetable();
3848 if ( pl2e == NULL )
3849 return -ENOMEM;
3851 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
3852 l2e_write(pl2e + i,
3853 l2e_from_pfn(l3e_get_pfn(ol3e) +
3854 (i << PAGETABLE_ORDER),
3855 l3e_get_flags(ol3e)));
3857 if ( l3e_get_flags(ol3e) & _PAGE_GLOBAL )
3858 flush_flags |= FLUSH_TLB_GLOBAL;
3860 l3e_write_atomic(pl3e, l3e_from_pfn(virt_to_mfn(pl2e),
3861 __PAGE_HYPERVISOR));
3862 flush_area(virt, flush_flags);
3864 #endif
3866 pl2e = virt_to_xen_l2e(virt);
3868 if ( ((((virt>>PAGE_SHIFT) | mfn) & ((1<<PAGETABLE_ORDER)-1)) == 0) &&
3869 (nr_mfns >= (1<<PAGETABLE_ORDER)) &&
3870 !(flags & (_PAGE_PAT|MAP_SMALL_PAGES)) )
3872 /* Super-page mapping. */
3873 ol2e = *pl2e;
3874 l2e_write_atomic(pl2e, l2e_from_pfn(mfn, l1f_to_lNf(flags)));
3876 if ( (l2e_get_flags(ol2e) & _PAGE_PRESENT) )
3878 unsigned int flush_flags =
3879 FLUSH_TLB | FLUSH_ORDER(PAGETABLE_ORDER);
3881 if ( l2e_get_flags(ol2e) & _PAGE_PSE )
3883 if ( l2e_get_flags(ol2e) & _PAGE_GLOBAL )
3884 flush_flags |= FLUSH_TLB_GLOBAL;
3885 if ( (lNf_to_l1f(l2e_get_flags(ol2e)) ^ flags) &
3886 PAGE_CACHE_ATTRS )
3887 flush_flags |= FLUSH_CACHE;
3888 flush_area(virt, flush_flags);
3890 else
3892 pl1e = l2e_to_l1e(ol2e);
3893 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
3895 if ( l1e_get_flags(pl1e[i]) & _PAGE_GLOBAL )
3896 flush_flags |= FLUSH_TLB_GLOBAL;
3897 if ( (l1e_get_flags(pl1e[i]) ^ flags) &
3898 PAGE_CACHE_ATTRS )
3899 flush_flags |= FLUSH_CACHE;
3901 flush_area(virt, flush_flags);
3902 free_xen_pagetable(pl1e);
3906 virt += 1UL << L2_PAGETABLE_SHIFT;
3907 mfn += 1UL << PAGETABLE_ORDER;
3908 nr_mfns -= 1UL << PAGETABLE_ORDER;
3910 else
3912 /* Normal page mapping. */
3913 if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) )
3915 pl1e = alloc_xen_pagetable();
3916 if ( pl1e == NULL )
3917 return -ENOMEM;
3918 clear_page(pl1e);
3919 l2e_write(pl2e, l2e_from_pfn(virt_to_mfn(pl1e),
3920 __PAGE_HYPERVISOR));
3922 else if ( l2e_get_flags(*pl2e) & _PAGE_PSE )
3924 unsigned int flush_flags =
3925 FLUSH_TLB | FLUSH_ORDER(PAGETABLE_ORDER);
3927 /* Skip this PTE if there is no change. */
3928 if ( (((l2e_get_pfn(*pl2e) & ~(L1_PAGETABLE_ENTRIES - 1)) +
3929 l1_table_offset(virt)) == mfn) &&
3930 (((lNf_to_l1f(l2e_get_flags(*pl2e)) ^ flags) &
3931 ~(_PAGE_ACCESSED|_PAGE_DIRTY)) == 0) )
3933 /* We can skip to end of L2 superpage if we got a match. */
3934 i = (1 << (L2_PAGETABLE_SHIFT - PAGE_SHIFT)) -
3935 (mfn & ((1 << (L2_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1));
3936 if ( i > nr_mfns )
3937 i = nr_mfns;
3938 virt += i << L1_PAGETABLE_SHIFT;
3939 mfn += i;
3940 nr_mfns -= i;
3941 goto check_l3;
3944 pl1e = alloc_xen_pagetable();
3945 if ( pl1e == NULL )
3946 return -ENOMEM;
3948 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
3949 l1e_write(&pl1e[i],
3950 l1e_from_pfn(l2e_get_pfn(*pl2e) + i,
3951 lNf_to_l1f(l2e_get_flags(*pl2e))));
3953 if ( l2e_get_flags(*pl2e) & _PAGE_GLOBAL )
3954 flush_flags |= FLUSH_TLB_GLOBAL;
3956 l2e_write_atomic(pl2e, l2e_from_pfn(virt_to_mfn(pl1e),
3957 __PAGE_HYPERVISOR));
3958 flush_area(virt, flush_flags);
3961 pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(virt);
3962 ol1e = *pl1e;
3963 l1e_write_atomic(pl1e, l1e_from_pfn(mfn, flags));
3964 if ( (l1e_get_flags(ol1e) & _PAGE_PRESENT) )
3966 unsigned int flush_flags = FLUSH_TLB | FLUSH_ORDER(0);
3967 if ( l1e_get_flags(ol1e) & _PAGE_GLOBAL )
3968 flush_flags |= FLUSH_TLB_GLOBAL;
3969 if ( (l1e_get_flags(ol1e) ^ flags) & PAGE_CACHE_ATTRS )
3970 flush_flags |= FLUSH_CACHE;
3971 flush_area(virt, flush_flags);
3974 virt += 1UL << L1_PAGETABLE_SHIFT;
3975 mfn += 1UL;
3976 nr_mfns -= 1UL;
3978 if ( (flags == PAGE_HYPERVISOR) &&
3979 ((nr_mfns == 0) ||
3980 ((((virt >> PAGE_SHIFT) | mfn) &
3981 ((1 << PAGETABLE_ORDER) - 1)) == 0)) )
3983 unsigned long base_mfn;
3984 pl1e = l2e_to_l1e(*pl2e);
3985 base_mfn = l1e_get_pfn(*pl1e) & ~(L1_PAGETABLE_ENTRIES - 1);
3986 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++, pl1e++ )
3987 if ( (l1e_get_pfn(*pl1e) != (base_mfn + i)) ||
3988 (l1e_get_flags(*pl1e) != flags) )
3989 break;
3990 if ( i == L1_PAGETABLE_ENTRIES )
3992 ol2e = *pl2e;
3993 l2e_write_atomic(pl2e, l2e_from_pfn(base_mfn,
3994 l1f_to_lNf(flags)));
3995 flush_area(virt - PAGE_SIZE,
3996 FLUSH_TLB_GLOBAL |
3997 FLUSH_ORDER(PAGETABLE_ORDER));
3998 free_xen_pagetable(l2e_to_l1e(ol2e));
4003 check_l3: ;
4004 #ifdef __x86_64__
4005 if ( cpu_has_page1gb &&
4006 (flags == PAGE_HYPERVISOR) &&
4007 ((nr_mfns == 0) ||
4008 !(((virt >> PAGE_SHIFT) | mfn) &
4009 ((1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1))) )
4011 unsigned long base_mfn;
4013 ol3e = *pl3e;
4014 pl2e = l3e_to_l2e(ol3e);
4015 base_mfn = l2e_get_pfn(*pl2e) & ~(L2_PAGETABLE_ENTRIES *
4016 L1_PAGETABLE_ENTRIES - 1);
4017 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++, pl2e++ )
4018 if ( (l2e_get_pfn(*pl2e) !=
4019 (base_mfn + (i << PAGETABLE_ORDER))) ||
4020 (l2e_get_flags(*pl2e) != l1f_to_lNf(flags)) )
4021 break;
4022 if ( i == L2_PAGETABLE_ENTRIES )
4024 l3e_write_atomic(pl3e, l3e_from_pfn(base_mfn,
4025 l1f_to_lNf(flags)));
4026 flush_area(virt - PAGE_SIZE,
4027 FLUSH_TLB_GLOBAL |
4028 FLUSH_ORDER(2*PAGETABLE_ORDER));
4029 free_xen_pagetable(l3e_to_l2e(ol3e));
4032 #endif
4035 return 0;
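/*
 * A small sketch of the superpage test used by map_pages_to_xen() above: a
 * large mapping is only attempted when the virtual frame number and the MFN
 * are equally aligned (both have their low PAGETABLE_ORDER bits clear) and
 * enough frames remain to fill the whole superpage.  The order value of 9
 * (512 entries per level) is an assumption for 64-bit/PAE layouts.
 */
#define DEMO_PAGE_SHIFT      12
#define DEMO_PAGETABLE_ORDER 9         /* assumed entries-per-table order */

static int demo_can_use_superpage(unsigned long virt, unsigned long mfn,
                                  unsigned long nr_mfns)
{
    unsigned long align_mask = (1UL << DEMO_PAGETABLE_ORDER) - 1;

    return !(((virt >> DEMO_PAGE_SHIFT) | mfn) & align_mask) &&
           (nr_mfns >= (1UL << DEMO_PAGETABLE_ORDER));
}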
4038 void destroy_xen_mappings(unsigned long s, unsigned long e)
4040 l2_pgentry_t *pl2e;
4041 l1_pgentry_t *pl1e;
4042 unsigned int i;
4043 unsigned long v = s;
4045 ASSERT((s & ~PAGE_MASK) == 0);
4046 ASSERT((e & ~PAGE_MASK) == 0);
4048 while ( v < e )
4050 #ifdef __x86_64__
4051 l3_pgentry_t *pl3e = virt_to_xen_l3e(v);
4053 if ( !(l3e_get_flags(*pl3e) & _PAGE_PRESENT) )
4055 v += 1UL << L3_PAGETABLE_SHIFT;
4056 v &= ~((1UL << L3_PAGETABLE_SHIFT) - 1);
4057 continue;
4060 if ( l3e_get_flags(*pl3e) & _PAGE_PSE )
4062 if ( l2_table_offset(v) == 0 &&
4063 l1_table_offset(v) == 0 &&
4064 ((e - v) >= (1UL << L3_PAGETABLE_SHIFT)) )
4066 /* PAGE1GB: whole superpage is destroyed. */
4067 l3e_write_atomic(pl3e, l3e_empty());
4068 v += 1UL << L3_PAGETABLE_SHIFT;
4069 continue;
4072 /* PAGE1GB: shatter the superpage and fall through. */
4073 pl2e = alloc_xen_pagetable();
4074 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
4075 l2e_write(pl2e + i,
4076 l2e_from_pfn(l3e_get_pfn(*pl3e) +
4077 (i << PAGETABLE_ORDER),
4078 l3e_get_flags(*pl3e)));
4079 l3e_write_atomic(pl3e, l3e_from_pfn(virt_to_mfn(pl2e),
4080 __PAGE_HYPERVISOR));
4082 #endif
4084 pl2e = virt_to_xen_l2e(v);
4086 if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) )
4088 v += 1UL << L2_PAGETABLE_SHIFT;
4089 v &= ~((1UL << L2_PAGETABLE_SHIFT) - 1);
4090 continue;
4093 if ( l2e_get_flags(*pl2e) & _PAGE_PSE )
4095 if ( (l1_table_offset(v) == 0) &&
4096 ((e-v) >= (1UL << L2_PAGETABLE_SHIFT)) )
4098 /* PSE: whole superpage is destroyed. */
4099 l2e_write_atomic(pl2e, l2e_empty());
4100 v += 1UL << L2_PAGETABLE_SHIFT;
4102 else
4104 /* PSE: shatter the superpage and try again. */
4105 pl1e = alloc_xen_pagetable();
4106 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
4107 l1e_write(&pl1e[i],
4108 l1e_from_pfn(l2e_get_pfn(*pl2e) + i,
4109 l2e_get_flags(*pl2e) & ~_PAGE_PSE));
4110 l2e_write_atomic(pl2e, l2e_from_pfn(virt_to_mfn(pl1e),
4111 __PAGE_HYPERVISOR));
4114 else
4116 /* Ordinary 4kB mapping. */
4117 pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(v);
4118 l1e_write_atomic(pl1e, l1e_empty());
4119 v += PAGE_SIZE;
4121 /* If we are done with the L2E, check if it is now empty. */
4122 if ( (v != e) && (l1_table_offset(v) != 0) )
4123 continue;
4124 pl1e = l2e_to_l1e(*pl2e);
4125 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
4126 if ( l1e_get_intpte(pl1e[i]) != 0 )
4127 break;
4128 if ( i == L1_PAGETABLE_ENTRIES )
4130 /* Empty: zap the L2E and free the L1 page. */
4131 l2e_write_atomic(pl2e, l2e_empty());
4132 flush_area(NULL, FLUSH_TLB_GLOBAL); /* flush before free */
4133 free_xen_pagetable(pl1e);
4137 #ifdef __x86_64__
4138 /* If we are done with the L3E, check if it is now empty. */
4139 if ( (v != e) && (l2_table_offset(v) + l1_table_offset(v) != 0) )
4140 continue;
4141 pl2e = l3e_to_l2e(*pl3e);
4142 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
4143 if ( l2e_get_intpte(pl2e[i]) != 0 )
4144 break;
4145 if ( i == L2_PAGETABLE_ENTRIES )
4147 /* Empty: zap the L3E and free the L2 page. */
4148 l3e_write_atomic(pl3e, l3e_empty());
4149 flush_area(NULL, FLUSH_TLB_GLOBAL); /* flush before free */
4150 free_xen_pagetable(pl2e);
4152 #endif
4155 flush_area(NULL, FLUSH_TLB_GLOBAL);
4158 void __set_fixmap(
4159 enum fixed_addresses idx, unsigned long mfn, unsigned long flags)
4161 BUG_ON(idx >= __end_of_fixed_addresses);
4162 map_pages_to_xen(fix_to_virt(idx), mfn, 1, flags);
4165 #ifdef MEMORY_GUARD
4167 void memguard_init(void)
4169 unsigned long start = max_t(unsigned long, xen_phys_start, 1UL << 20);
4170 map_pages_to_xen(
4171 (unsigned long)__va(start),
4172 start >> PAGE_SHIFT,
4173 (xenheap_phys_end - start) >> PAGE_SHIFT,
4174 __PAGE_HYPERVISOR|MAP_SMALL_PAGES);
4175 #ifdef __x86_64__
4176 BUG_ON(start != xen_phys_start);
4177 map_pages_to_xen(
4178 XEN_VIRT_START,
4179 start >> PAGE_SHIFT,
4180 (__pa(&_end) + PAGE_SIZE - 1 - start) >> PAGE_SHIFT,
4181 __PAGE_HYPERVISOR|MAP_SMALL_PAGES);
4182 #endif
4185 static void __memguard_change_range(void *p, unsigned long l, int guard)
4187 unsigned long _p = (unsigned long)p;
4188 unsigned long _l = (unsigned long)l;
4189 unsigned int flags = __PAGE_HYPERVISOR | MAP_SMALL_PAGES;
4191 /* Ensure we are dealing with a page-aligned whole number of pages. */
4192 ASSERT((_p&~PAGE_MASK) == 0);
4193 ASSERT((_l&~PAGE_MASK) == 0);
4195 if ( guard )
4196 flags &= ~_PAGE_PRESENT;
4198 map_pages_to_xen(
4199 _p, virt_to_maddr(p) >> PAGE_SHIFT, _l >> PAGE_SHIFT, flags);
4202 void memguard_guard_range(void *p, unsigned long l)
4204 __memguard_change_range(p, l, 1);
4207 void memguard_unguard_range(void *p, unsigned long l)
4209 __memguard_change_range(p, l, 0);
4212 #endif
4214 void memguard_guard_stack(void *p)
4216 BUILD_BUG_ON((PRIMARY_STACK_SIZE + PAGE_SIZE) > STACK_SIZE);
4217 p = (void *)((unsigned long)p + STACK_SIZE -
4218 PRIMARY_STACK_SIZE - PAGE_SIZE);
4219 memguard_guard_range(p, PAGE_SIZE);
4222 /*
4223 * Local variables:
4224 * mode: C
4225 * c-set-style: "BSD"
4226 * c-basic-offset: 4
4227 * tab-width: 4
4228 * indent-tabs-mode: nil
4229 * End:
4230 */