ia64/xen-unstable

xen/arch/x86/mm.c @ 17845:7cf53a91c3a3

x86: minor adjustment to asm constraint in get_page()

While not wrong, avoiding the unnecessary output allows the compiler a
little more freedom.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
author Keir Fraser <keir.fraser@citrix.com>
date Thu Jun 12 16:17:19 2008 +0100 (2008-06-12)
parents 882eb6186c2d
children 09dd5999401b
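
For illustration only (the pre-change form of the asm is not shown on this page, so this is just one way such an adjustment can look): CMPXCHG8B writes EDX:EAX and the memory operand but never modifies ECX or EBX, so a register such as ECX, which merely supplies half of the proposed new value, can be listed as a plain input instead of an output. The variable names mirror get_page() further down this file; 'dummy' and 'pcount' are hypothetical placeholders.

    /* Over-constrained: ECX declared as an output it does not need to be. */
    asm volatile ( LOCK_PREFIX "cmpxchg8b %3"
                   : "=d" (nd), "=a" (y), "=c" (dummy), "=m" (*pcount)
                   : "0" (d), "1" (x), "2" (d), "b" (nx) );

    /* Input-only ECX, as in get_page() below: the compiler may assume the
       register still holds 'd' after the asm and need not reload it. */
    asm volatile ( LOCK_PREFIX "cmpxchg8b %2"
                   : "=d" (nd), "=a" (y), "=m" (*pcount)
                   : "0" (d), "1" (x), "c" (d), "b" (nx) );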
line source
1 /******************************************************************************
2 * arch/x86/mm.c
3 *
4 * Copyright (c) 2002-2005 K A Fraser
5 * Copyright (c) 2004 Christian Limpach
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
22 /*
23 * A description of the x86 page table API:
24 *
25 * Domains trap to do_mmu_update with a list of update requests.
26 * This is a list of (ptr, val) pairs, where the requested operation
27 * is *ptr = val.
28 *
29 * Reference counting of pages:
30 * ----------------------------
31 * Each page has two refcounts: tot_count and type_count.
32 *
33 * TOT_COUNT is the obvious reference count. It counts all uses of a
34 * physical page frame by a domain, including uses as a page directory,
35 * a page table, or simple mappings via a PTE. This count prevents a
36 * domain from releasing a frame back to the free pool when it still holds
37 * a reference to it.
38 *
39 * TYPE_COUNT is more subtle. A frame can be put to one of three
40 * mutually-exclusive uses: it might be used as a page directory, or a
41 * page table, or it may be mapped writable by the domain [of course, a
42 * frame may equally be used in none of these three ways!].
43 * So, type_count is a count of the number of times a frame is being
44 * referred to in its current incarnation. Therefore, a page can only
45 * change its type when its type count is zero.
46 *
47 * Pinning the page type:
48 * ----------------------
49 * The type of a page can be pinned/unpinned with the commands
50 * MMUEXT_[UN]PIN_L?_TABLE. Each page can be pinned exactly once (that is,
51 * pinning is not reference counted, so it can't be nested).
52 * This is useful to prevent a page's type count falling to zero, at which
53 * point safety checks would need to be carried out next time the count
54 * is increased again.
55 *
56 * A further note on writable page mappings:
57 * -----------------------------------------
58 * For simplicity, the count of writable mappings for a page may not
59 * correspond to reality. The 'writable count' is incremented for every
60 * PTE which maps the page with the _PAGE_RW flag set. However, for
61 * write access to be possible the page directory entry must also have
62 * its _PAGE_RW bit set. We do not check this as it complicates the
63 * reference counting considerably [consider the case of multiple
64 * directory entries referencing a single page table, some with the RW
65 * bit set, others not -- it starts getting a bit messy].
66 * In normal use, this simplification shouldn't be a problem.
67 * However, the logic can be added if required.
68 *
69 * One more note on read-only page mappings:
70 * -----------------------------------------
71 * We want domains to be able to map pages for read-only access. The
72 * main reason is that page tables and directories should be readable
73 * by a domain, but it would not be safe for them to be writable.
74 * However, domains have free access to rings 1 & 2 of the Intel
75 * privilege model. In terms of page protection, these are considered
76 * to be part of 'supervisor mode'. The WP bit in CR0 controls whether
77 * read-only restrictions are respected in supervisor mode -- if the
78 * bit is clear then any mapped page is writable.
79 *
80 * We get round this by always setting the WP bit and disallowing
81 * updates to it. This is very unlikely to cause a problem for guest
82 * OS's, which will generally use the WP bit to simplify copy-on-write
83 * implementation (in that case, OS wants a fault when it writes to
84 * an application-supplied buffer).
85 */
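/*
 * Illustrative sketch, for exposition only (not functional code in this
 * file): how a PV guest might drive the interface described above. The
 * structures, commands and DOMID_SELF come from the public ABI
 * (public/xen.h); the HYPERVISOR_* wrappers and the names pte_maddr,
 * new_pte and l1_mfn stand in for whatever the guest kernel provides.
 *
 *     struct mmu_update req;
 *     unsigned int done = 0;
 *
 *     req.ptr = pte_maddr | MMU_NORMAL_PT_UPDATE;  // request is *ptr = val
 *     req.val = new_pte;
 *     if ( HYPERVISOR_mmu_update(&req, 1, &done, DOMID_SELF) || (done != 1) )
 *         BUG();
 *
 * Pinning a frame as an L1 page table ("Pinning the page type" above):
 *
 *     struct mmuext_op pin = { .cmd = MMUEXT_PIN_L1_TABLE, .arg1.mfn = l1_mfn };
 *     HYPERVISOR_mmuext_op(&pin, 1, NULL, DOMID_SELF);
 */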
87 #include <xen/config.h>
88 #include <xen/init.h>
89 #include <xen/kernel.h>
90 #include <xen/lib.h>
91 #include <xen/mm.h>
92 #include <xen/domain.h>
93 #include <xen/sched.h>
94 #include <xen/errno.h>
95 #include <xen/perfc.h>
96 #include <xen/irq.h>
97 #include <xen/softirq.h>
98 #include <xen/domain_page.h>
99 #include <xen/event.h>
100 #include <xen/iocap.h>
101 #include <xen/guest_access.h>
102 #include <asm/paging.h>
103 #include <asm/shadow.h>
104 #include <asm/page.h>
105 #include <asm/flushtlb.h>
106 #include <asm/io.h>
107 #include <asm/ldt.h>
108 #include <asm/x86_emulate.h>
109 #include <asm/e820.h>
110 #include <asm/hypercall.h>
111 #include <asm/shared.h>
112 #include <public/memory.h>
113 #include <xsm/xsm.h>
114 #include <xen/trace.h>
116 /*
117 * Mapping of first 2 or 4 megabytes of memory. This is mapped with 4kB
118 * mappings to avoid type conflicts with fixed-range MTRRs covering the
119 * lowest megabyte of physical memory. In any case the VGA hole should be
120 * mapped with type UC.
121 */
122 l1_pgentry_t __attribute__ ((__section__ (".bss.page_aligned")))
123 l1_identmap[L1_PAGETABLE_ENTRIES];
125 #define MEM_LOG(_f, _a...) gdprintk(XENLOG_WARNING , _f "\n" , ## _a)
127 /*
128 * PTE updates can be done with ordinary writes except:
129 * 1. Debug builds get extra checking by using CMPXCHG[8B].
130 * 2. PAE builds perform an atomic 8-byte store with CMPXCHG8B.
131 */
132 #if !defined(NDEBUG) || defined(__i386__)
133 #define PTE_UPDATE_WITH_CMPXCHG
134 #endif
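/*
 * For exposition only: the two update styles selected above reduce to
 *
 *     plain write:   *p = new;
 *     cmpxchg form:  succeeded = (cmpxchg(p, old, new) == old);
 *
 * The cmpxchg form lets debug builds check that the value actually being
 * replaced is the one the caller read earlier, and on PAE it provides the
 * atomic 8-byte store; see update_intpte() below for the real thing.
 */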
136 /* Used to defer flushing of memory structures. */
137 struct percpu_mm_info {
138 #define DOP_FLUSH_TLB (1<<0) /* Flush the local TLB. */
139 #define DOP_FLUSH_ALL_TLBS (1<<1) /* Flush TLBs of all VCPUs of current dom. */
140 #define DOP_RELOAD_LDT (1<<2) /* Reload the LDT shadow mapping. */
141 unsigned int deferred_ops;
142 /* If non-NULL, specifies a foreign subject domain for some operations. */
143 struct domain *foreign;
144 };
145 static DEFINE_PER_CPU(struct percpu_mm_info, percpu_mm_info);
147 /*
148 * Returns the current foreign domain; defaults to the currently-executing
149 * domain if a foreign override hasn't been specified.
150 */
151 #define FOREIGNDOM (this_cpu(percpu_mm_info).foreign ?: current->domain)
153 /* Private domain structs for DOMID_XEN and DOMID_IO. */
154 static struct domain *dom_xen, *dom_io;
156 /* Frame table and its size in pages. */
157 struct page_info *frame_table;
158 unsigned long max_page;
159 unsigned long total_pages;
161 #define PAGE_CACHE_ATTRS (_PAGE_PAT|_PAGE_PCD|_PAGE_PWT)
163 #define l1_disallow_mask(d) \
164 ((d != dom_io) && \
165 (rangeset_is_empty((d)->iomem_caps) && \
166 rangeset_is_empty((d)->arch.ioport_caps)) ? \
167 L1_DISALLOW_MASK : (L1_DISALLOW_MASK & ~PAGE_CACHE_ATTRS))
169 #ifdef CONFIG_COMPAT
170 l2_pgentry_t *compat_idle_pg_table_l2 = NULL;
171 #define l3_disallow_mask(d) (!is_pv_32on64_domain(d) ? \
172 L3_DISALLOW_MASK : \
173 COMPAT_L3_DISALLOW_MASK)
174 #else
175 #define l3_disallow_mask(d) L3_DISALLOW_MASK
176 #endif
178 static void queue_deferred_ops(struct domain *d, unsigned int ops)
179 {
180 ASSERT(d == current->domain);
181 this_cpu(percpu_mm_info).deferred_ops |= ops;
182 }
184 void __init init_frametable(void)
185 {
186 unsigned long nr_pages, page_step, i, mfn;
188 frame_table = (struct page_info *)FRAMETABLE_VIRT_START;
190 nr_pages = PFN_UP(max_page * sizeof(*frame_table));
191 page_step = (1 << L2_PAGETABLE_SHIFT) >> PAGE_SHIFT;
193 for ( i = 0; i < nr_pages; i += page_step )
194 {
195 mfn = alloc_boot_pages(min(nr_pages - i, page_step), page_step);
196 if ( mfn == 0 )
197 panic("Not enough memory for frame table\n");
198 map_pages_to_xen(
199 FRAMETABLE_VIRT_START + (i << PAGE_SHIFT),
200 mfn, page_step, PAGE_HYPERVISOR);
201 }
203 memset(frame_table, 0, nr_pages << PAGE_SHIFT);
204 }
206 void __init arch_init_memory(void)
207 {
208 extern void subarch_init_memory(void);
210 unsigned long i, pfn, rstart_pfn, rend_pfn, iostart_pfn, ioend_pfn;
212 /*
213 * Initialise our DOMID_XEN domain.
214 * Any Xen-heap pages that we will allow to be mapped will have
215 * their domain field set to dom_xen.
216 */
217 dom_xen = alloc_domain(DOMID_XEN);
218 BUG_ON(dom_xen == NULL);
220 /*
221 * Initialise our DOMID_IO domain.
222 * This domain owns I/O pages that are within the range of the page_info
223 * array. Mappings occur at the privilege level of the caller.
224 */
225 dom_io = alloc_domain(DOMID_IO);
226 BUG_ON(dom_io == NULL);
228 /* First 1MB of RAM is historically marked as I/O. */
229 for ( i = 0; i < 0x100; i++ )
230 share_xen_page_with_guest(mfn_to_page(i), dom_io, XENSHARE_writable);
232 /* Any areas not specified as RAM by the e820 map are considered I/O. */
233 for ( i = 0, pfn = 0; pfn < max_page; i++ )
234 {
235 while ( (i < e820.nr_map) &&
236 (e820.map[i].type != E820_RAM) &&
237 (e820.map[i].type != E820_UNUSABLE) )
238 i++;
240 if ( i >= e820.nr_map )
241 {
242 /* No more RAM regions: mark as I/O right to end of memory map. */
243 rstart_pfn = rend_pfn = max_page;
244 }
245 else
246 {
247 /* Mark as I/O just up to the next RAM region. */
248 rstart_pfn = min_t(unsigned long, max_page,
249 PFN_UP(e820.map[i].addr));
250 rend_pfn = max_t(unsigned long, rstart_pfn,
251 PFN_DOWN(e820.map[i].addr + e820.map[i].size));
252 }
254 /*
255 * Make sure any Xen mappings of RAM holes above 1MB are blown away.
256 * In particular this ensures that RAM holes are respected even in
257 * the statically-initialised 1-16MB mapping area.
258 */
259 iostart_pfn = max_t(unsigned long, pfn, 1UL << (20 - PAGE_SHIFT));
260 ioend_pfn = rstart_pfn;
261 #if defined(CONFIG_X86_32)
262 ioend_pfn = min_t(unsigned long, ioend_pfn,
263 DIRECTMAP_MBYTES << (20 - PAGE_SHIFT));
264 #endif
265 if ( iostart_pfn < ioend_pfn )
266 destroy_xen_mappings((unsigned long)mfn_to_virt(iostart_pfn),
267 (unsigned long)mfn_to_virt(ioend_pfn));
269 /* Mark as I/O up to next RAM region. */
270 for ( ; pfn < rstart_pfn; pfn++ )
271 {
272 BUG_ON(!mfn_valid(pfn));
273 share_xen_page_with_guest(
274 mfn_to_page(pfn), dom_io, XENSHARE_writable);
275 }
277 /* Skip the RAM region. */
278 pfn = rend_pfn;
279 }
281 subarch_init_memory();
282 }
284 int memory_is_conventional_ram(paddr_t p)
285 {
286 int i;
288 for ( i = 0; i < e820.nr_map; i++ )
289 {
290 if ( (e820.map[i].type == E820_RAM) &&
291 (e820.map[i].addr <= p) &&
292 ((e820.map[i].addr + e820.map[i].size) > p) )
293 return 1;
294 }
296 return 0;
297 }
299 unsigned long domain_get_maximum_gpfn(struct domain *d)
300 {
301 if ( is_hvm_domain(d) )
302 return d->arch.p2m->max_mapped_pfn;
303 /* NB. PV guests specify nr_pfns rather than max_pfn so we adjust here. */
304 return arch_get_max_pfn(d) - 1;
305 }
307 void share_xen_page_with_guest(
308 struct page_info *page, struct domain *d, int readonly)
309 {
310 if ( page_get_owner(page) == d )
311 return;
313 set_gpfn_from_mfn(page_to_mfn(page), INVALID_M2P_ENTRY);
315 spin_lock(&d->page_alloc_lock);
317 /* The incremented type count pins as writable or read-only. */
318 page->u.inuse.type_info = (readonly ? PGT_none : PGT_writable_page);
319 page->u.inuse.type_info |= PGT_validated | 1;
321 page_set_owner(page, d);
322 wmb(); /* install valid domain ptr before updating refcnt. */
323 ASSERT(page->count_info == 0);
325 /* Only add to the allocation list if the domain isn't dying. */
326 if ( !d->is_dying )
327 {
328 page->count_info |= PGC_allocated | 1;
329 if ( unlikely(d->xenheap_pages++ == 0) )
330 get_knownalive_domain(d);
331 list_add_tail(&page->list, &d->xenpage_list);
332 }
334 spin_unlock(&d->page_alloc_lock);
335 }
337 void share_xen_page_with_privileged_guests(
338 struct page_info *page, int readonly)
339 {
340 share_xen_page_with_guest(page, dom_xen, readonly);
341 }
343 #if defined(__i386__)
345 #ifdef NDEBUG
346 /* Only PDPTs above 4GB boundary need to be shadowed in low memory. */
347 #define l3tab_needs_shadow(mfn) ((mfn) >= 0x100000)
348 #else
349 /*
350 * In debug builds we shadow a selection of <4GB PDPTs to exercise code paths.
351 * We cannot safely shadow the idle page table, nor shadow (v1) page tables
352 * (detected by lack of an owning domain). As required for correctness, we
353 * always shadow PDPTs above 4GB.
354 */
355 #define l3tab_needs_shadow(mfn) \
356 (((((mfn) << PAGE_SHIFT) != __pa(idle_pg_table)) && \
357 (page_get_owner(mfn_to_page(mfn)) != NULL) && \
358 ((mfn) & 1)) || /* odd MFNs are shadowed */ \
359 ((mfn) >= 0x100000))
360 #endif
362 static l1_pgentry_t *fix_pae_highmem_pl1e;
364 /* Cache the address of PAE high-memory fixmap page tables. */
365 static int __init cache_pae_fixmap_address(void)
366 {
367 unsigned long fixmap_base = fix_to_virt(FIX_PAE_HIGHMEM_0);
368 l2_pgentry_t *pl2e = virt_to_xen_l2e(fixmap_base);
369 fix_pae_highmem_pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(fixmap_base);
370 return 0;
371 }
372 __initcall(cache_pae_fixmap_address);
374 static DEFINE_PER_CPU(u32, make_cr3_timestamp);
376 void make_cr3(struct vcpu *v, unsigned long mfn)
377 /* Takes the MFN of a PAE l3 table, copies the contents to below 4GB if
378 * necessary, and sets v->arch.cr3 to the value to load in CR3. */
379 {
380 l3_pgentry_t *highmem_l3tab, *lowmem_l3tab;
381 struct pae_l3_cache *cache = &v->arch.pae_l3_cache;
382 unsigned int cpu = smp_processor_id();
384 /* Fast path: does this mfn need a shadow at all? */
385 if ( !l3tab_needs_shadow(mfn) )
386 {
387 v->arch.cr3 = mfn << PAGE_SHIFT;
388 /* Cache is no longer in use or valid */
389 cache->high_mfn = 0;
390 return;
391 }
393 /* Caching logic is not interrupt safe. */
394 ASSERT(!in_irq());
396 /* Protects against pae_flush_pgd(). */
397 spin_lock(&cache->lock);
399 cache->inuse_idx ^= 1;
400 cache->high_mfn = mfn;
402 /* Map the guest L3 table and copy to the chosen low-memory cache. */
403 l1e_write(fix_pae_highmem_pl1e-cpu, l1e_from_pfn(mfn, __PAGE_HYPERVISOR));
404 /* First make sure the previous high mapping can't still be in the TLB
405 * (i.e. has CR3 been reloaded since we last did this?) */
406 if ( unlikely(this_cpu(make_cr3_timestamp) == this_cpu(tlbflush_time)) )
407 flush_tlb_one_local(fix_to_virt(FIX_PAE_HIGHMEM_0 + cpu));
408 highmem_l3tab = (l3_pgentry_t *)fix_to_virt(FIX_PAE_HIGHMEM_0 + cpu);
409 lowmem_l3tab = cache->table[cache->inuse_idx];
410 memcpy(lowmem_l3tab, highmem_l3tab, sizeof(cache->table[0]));
411 l1e_write(fix_pae_highmem_pl1e-cpu, l1e_empty());
412 this_cpu(make_cr3_timestamp) = this_cpu(tlbflush_time);
414 v->arch.cr3 = __pa(lowmem_l3tab);
416 spin_unlock(&cache->lock);
417 }
419 #else /* !defined(__i386__) */
421 void make_cr3(struct vcpu *v, unsigned long mfn)
422 {
423 v->arch.cr3 = mfn << PAGE_SHIFT;
424 }
426 #endif /* !defined(__i386__) */
428 void write_ptbase(struct vcpu *v)
429 {
430 write_cr3(v->arch.cr3);
431 }
433 /*
434 * Should be called after CR3 is updated.
435 *
436 * Uses values found in vcpu->arch.(guest_table and guest_table_user), and
437 * for HVM guests, arch.monitor_table and hvm's guest CR3.
438 *
439 * Update ref counts to shadow tables appropriately.
440 */
441 void update_cr3(struct vcpu *v)
442 {
443 unsigned long cr3_mfn=0;
445 if ( paging_mode_enabled(v->domain) )
446 {
447 paging_update_cr3(v);
448 return;
449 }
451 #if CONFIG_PAGING_LEVELS == 4
452 if ( !(v->arch.flags & TF_kernel_mode) )
453 cr3_mfn = pagetable_get_pfn(v->arch.guest_table_user);
454 else
455 #endif
456 cr3_mfn = pagetable_get_pfn(v->arch.guest_table);
458 make_cr3(v, cr3_mfn);
459 }
462 static void invalidate_shadow_ldt(struct vcpu *v)
463 {
464 int i;
465 unsigned long pfn;
466 struct page_info *page;
468 if ( v->arch.shadow_ldt_mapcnt == 0 )
469 return;
471 v->arch.shadow_ldt_mapcnt = 0;
473 for ( i = 16; i < 32; i++ )
474 {
475 pfn = l1e_get_pfn(v->arch.perdomain_ptes[i]);
476 if ( pfn == 0 ) continue;
477 l1e_write(&v->arch.perdomain_ptes[i], l1e_empty());
478 page = mfn_to_page(pfn);
479 ASSERT_PAGE_IS_TYPE(page, PGT_seg_desc_page);
480 ASSERT_PAGE_IS_DOMAIN(page, v->domain);
481 put_page_and_type(page);
482 }
484 /* Dispose of the (now possibly invalid) mappings from the TLB. */
485 if ( v == current )
486 queue_deferred_ops(v->domain, DOP_FLUSH_TLB | DOP_RELOAD_LDT);
487 else
488 flush_tlb_mask(v->domain->domain_dirty_cpumask);
489 }
492 static int alloc_segdesc_page(struct page_info *page)
493 {
494 struct desc_struct *descs;
495 int i;
497 descs = map_domain_page(page_to_mfn(page));
499 for ( i = 0; i < 512; i++ )
500 if ( unlikely(!check_descriptor(page_get_owner(page), &descs[i])) )
501 goto fail;
503 unmap_domain_page(descs);
504 return 1;
506 fail:
507 unmap_domain_page(descs);
508 return 0;
509 }
512 /* Map shadow page at offset @off. */
513 int map_ldt_shadow_page(unsigned int off)
514 {
515 struct vcpu *v = current;
516 struct domain *d = v->domain;
517 unsigned long gmfn, mfn;
518 l1_pgentry_t l1e, nl1e;
519 unsigned long gva = v->arch.guest_context.ldt_base + (off << PAGE_SHIFT);
520 int okay;
522 BUG_ON(unlikely(in_irq()));
524 guest_get_eff_kern_l1e(v, gva, &l1e);
525 if ( unlikely(!(l1e_get_flags(l1e) & _PAGE_PRESENT)) )
526 return 0;
528 gmfn = l1e_get_pfn(l1e);
529 mfn = gmfn_to_mfn(d, gmfn);
530 if ( unlikely(!mfn_valid(mfn)) )
531 return 0;
533 okay = get_page_and_type(mfn_to_page(mfn), d, PGT_seg_desc_page);
534 if ( unlikely(!okay) )
535 return 0;
537 nl1e = l1e_from_pfn(mfn, l1e_get_flags(l1e) | _PAGE_RW);
539 l1e_write(&v->arch.perdomain_ptes[off + 16], nl1e);
540 v->arch.shadow_ldt_mapcnt++;
542 return 1;
543 }
546 static int get_page_from_pagenr(unsigned long page_nr, struct domain *d)
547 {
548 struct page_info *page = mfn_to_page(page_nr);
550 if ( unlikely(!mfn_valid(page_nr)) || unlikely(!get_page(page, d)) )
551 {
552 MEM_LOG("Could not get page ref for pfn %lx", page_nr);
553 return 0;
554 }
556 return 1;
557 }
560 static int get_page_and_type_from_pagenr(unsigned long page_nr,
561 unsigned long type,
562 struct domain *d)
563 {
564 struct page_info *page = mfn_to_page(page_nr);
566 if ( unlikely(!get_page_from_pagenr(page_nr, d)) )
567 return 0;
569 if ( unlikely(!get_page_type(page, type)) )
570 {
571 put_page(page);
572 return 0;
573 }
575 return 1;
576 }
578 /*
579 * We allow root tables to map each other (a.k.a. linear page tables). It
580 * needs some special care with reference counts and access permissions:
581 * 1. The mapping entry must be read-only, or the guest may get write access
582 * to its own PTEs.
583 * 2. We must only bump the reference counts for an *already validated*
584 * L2 table, or we can end up in a deadlock in get_page_type() by waiting
585 * on a validation that is required to complete that validation.
586 * 3. We only need to increment the reference counts for the mapped page
587 * frame if it is mapped by a different root table. This is sufficient and
588 * also necessary to allow validation of a root table mapping itself.
589 */
590 #define define_get_linear_pagetable(level) \
591 static int \
592 get_##level##_linear_pagetable( \
593 level##_pgentry_t pde, unsigned long pde_pfn, struct domain *d) \
594 { \
595 unsigned long x, y; \
596 struct page_info *page; \
597 unsigned long pfn; \
598 \
599 if ( (level##e_get_flags(pde) & _PAGE_RW) ) \
600 { \
601 MEM_LOG("Attempt to create linear p.t. with write perms"); \
602 return 0; \
603 } \
604 \
605 if ( (pfn = level##e_get_pfn(pde)) != pde_pfn ) \
606 { \
607 /* Make sure the mapped frame belongs to the correct domain. */ \
608 if ( unlikely(!get_page_from_pagenr(pfn, d)) ) \
609 return 0; \
610 \
611 /* \
612 * Ensure that the mapped frame is an already-validated page table. \
613 * If so, atomically increment the count (checking for overflow). \
614 */ \
615 page = mfn_to_page(pfn); \
616 y = page->u.inuse.type_info; \
617 do { \
618 x = y; \
619 if ( unlikely((x & PGT_count_mask) == PGT_count_mask) || \
620 unlikely((x & (PGT_type_mask|PGT_validated)) != \
621 (PGT_##level##_page_table|PGT_validated)) ) \
622 { \
623 put_page(page); \
624 return 0; \
625 } \
626 } \
627 while ( (y = cmpxchg(&page->u.inuse.type_info, x, x + 1)) != x ); \
628 } \
629 \
630 return 1; \
631 }
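/*
 * Illustrative sketch, for exposition only: the kind of request the rules
 * above allow. A 64-bit PV guest maps its own (already validated) top-level
 * table into one of its L4 slots, read-only. l4_maddr and SLOT are
 * placeholder names; the request format is the public mmu_update ABI.
 *
 *     struct mmu_update req;
 *     unsigned int done = 0;
 *
 *     req.ptr = (l4_maddr + SLOT * sizeof(l4_pgentry_t)) | MMU_NORMAL_PT_UPDATE;
 *     req.val = l4_maddr | _PAGE_PRESENT;          // note: no _PAGE_RW
 *     if ( HYPERVISOR_mmu_update(&req, 1, &done, DOMID_SELF) || (done != 1) )
 *         BUG();
 *
 * A writable self-mapping would be rejected by the _PAGE_RW check in the
 * macro above; mapping a not-yet-validated table would fail the
 * PGT_validated test.
 */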
634 int is_iomem_page(unsigned long mfn)
635 {
636 return (!mfn_valid(mfn) || (page_get_owner(mfn_to_page(mfn)) == dom_io));
637 }
640 int
641 get_page_from_l1e(
642 l1_pgentry_t l1e, struct domain *d)
643 {
644 unsigned long mfn = l1e_get_pfn(l1e);
645 struct page_info *page = mfn_to_page(mfn);
646 uint32_t l1f = l1e_get_flags(l1e);
647 struct vcpu *curr = current;
648 struct domain *owner;
649 int okay;
651 if ( !(l1f & _PAGE_PRESENT) )
652 return 1;
654 if ( unlikely(l1f & l1_disallow_mask(d)) )
655 {
656 MEM_LOG("Bad L1 flags %x", l1f & l1_disallow_mask(d));
657 return 0;
658 }
660 if ( is_iomem_page(mfn) )
661 {
662 /* DOMID_IO reverts to caller for privilege checks. */
663 if ( d == dom_io )
664 d = curr->domain;
666 if ( !iomem_access_permitted(d, mfn, mfn) )
667 {
668 if ( mfn != (PADDR_MASK >> PAGE_SHIFT) ) /* INVALID_MFN? */
669 MEM_LOG("Non-privileged (%u) attempt to map I/O space %08lx",
670 d->domain_id, mfn);
671 return 0;
672 }
674 return 1;
675 }
677 /*
678 * Let privileged domains transfer the right to map their target
679 * domain's pages. This is used to allow stub-domain pvfb export to dom0,
680 * until pvfb supports granted mappings. At that time this minor hack
681 * can go away.
682 */
683 owner = page_get_owner(page);
684 if ( unlikely(d != owner) && (owner != NULL) &&
685 (d != curr->domain) && IS_PRIV_FOR(d, owner) )
686 d = owner;
688 /* Foreign mappings into guests in shadow external mode don't
689 * contribute to writeable mapping refcounts. (This allows the
690 * qemu-dm helper process in dom0 to map the domain's memory without
691 * messing up the count of "real" writable mappings.) */
692 okay = (((l1f & _PAGE_RW) &&
693 !(unlikely(paging_mode_external(d) && (d != curr->domain))))
694 ? get_page_and_type(page, d, PGT_writable_page)
695 : get_page(page, d));
696 if ( !okay )
697 {
698 MEM_LOG("Error getting mfn %lx (pfn %lx) from L1 entry %" PRIpte
699 " for dom%d",
700 mfn, get_gpfn_from_mfn(mfn),
701 l1e_get_intpte(l1e), d->domain_id);
702 }
703 else if ( pte_flags_to_cacheattr(l1f) !=
704 ((page->count_info >> PGC_cacheattr_base) & 7) )
705 {
706 uint32_t x, nx, y = page->count_info;
707 uint32_t cacheattr = pte_flags_to_cacheattr(l1f);
709 if ( is_xen_heap_page(page) )
710 {
711 if ( (l1f & _PAGE_RW) &&
712 !(unlikely(paging_mode_external(d) &&
713 (d != curr->domain))) )
714 put_page_type(page);
715 put_page(page);
716 MEM_LOG("Attempt to change cache attributes of Xen heap page");
717 return 0;
718 }
720 while ( ((y >> PGC_cacheattr_base) & 7) != cacheattr )
721 {
722 x = y;
723 nx = (x & ~PGC_cacheattr_mask) | (cacheattr << PGC_cacheattr_base);
724 y = cmpxchg(&page->count_info, x, nx);
725 }
727 #ifdef __x86_64__
728 map_pages_to_xen((unsigned long)mfn_to_virt(mfn), mfn, 1,
729 PAGE_HYPERVISOR | cacheattr_to_pte_flags(cacheattr));
730 #endif
731 }
733 return okay;
734 }
737 /* NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. */
738 define_get_linear_pagetable(l2);
739 static int
740 get_page_from_l2e(
741 l2_pgentry_t l2e, unsigned long pfn, struct domain *d)
742 {
743 int rc;
745 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
746 return 1;
748 if ( unlikely((l2e_get_flags(l2e) & L2_DISALLOW_MASK)) )
749 {
750 MEM_LOG("Bad L2 flags %x", l2e_get_flags(l2e) & L2_DISALLOW_MASK);
751 return 0;
752 }
754 rc = get_page_and_type_from_pagenr(l2e_get_pfn(l2e), PGT_l1_page_table, d);
755 if ( unlikely(!rc) )
756 rc = get_l2_linear_pagetable(l2e, pfn, d);
758 return rc;
759 }
762 #if CONFIG_PAGING_LEVELS >= 3
763 define_get_linear_pagetable(l3);
764 static int
765 get_page_from_l3e(
766 l3_pgentry_t l3e, unsigned long pfn, struct domain *d)
767 {
768 int rc;
770 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
771 return 1;
773 if ( unlikely((l3e_get_flags(l3e) & l3_disallow_mask(d))) )
774 {
775 MEM_LOG("Bad L3 flags %x", l3e_get_flags(l3e) & l3_disallow_mask(d));
776 return 0;
777 }
779 rc = get_page_and_type_from_pagenr(l3e_get_pfn(l3e), PGT_l2_page_table, d);
780 if ( unlikely(!rc) )
781 rc = get_l3_linear_pagetable(l3e, pfn, d);
783 return rc;
784 }
785 #endif /* 3 level */
787 #if CONFIG_PAGING_LEVELS >= 4
788 define_get_linear_pagetable(l4);
789 static int
790 get_page_from_l4e(
791 l4_pgentry_t l4e, unsigned long pfn, struct domain *d)
792 {
793 int rc;
795 if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
796 return 1;
798 if ( unlikely((l4e_get_flags(l4e) & L4_DISALLOW_MASK)) )
799 {
800 MEM_LOG("Bad L4 flags %x", l4e_get_flags(l4e) & L4_DISALLOW_MASK);
801 return 0;
802 }
804 rc = get_page_and_type_from_pagenr(l4e_get_pfn(l4e), PGT_l3_page_table, d);
805 if ( unlikely(!rc) )
806 rc = get_l4_linear_pagetable(l4e, pfn, d);
808 return rc;
809 }
810 #endif /* 4 level */
812 #ifdef __x86_64__
814 #ifdef USER_MAPPINGS_ARE_GLOBAL
815 #define adjust_guest_l1e(pl1e, d) \
816 do { \
817 if ( likely(l1e_get_flags((pl1e)) & _PAGE_PRESENT) && \
818 likely(!is_pv_32on64_domain(d)) ) \
819 { \
820 /* _PAGE_GUEST_KERNEL page cannot have the Global bit set. */ \
821 if ( (l1e_get_flags((pl1e)) & (_PAGE_GUEST_KERNEL|_PAGE_GLOBAL)) \
822 == (_PAGE_GUEST_KERNEL|_PAGE_GLOBAL) ) \
823 MEM_LOG("Global bit is set to kernel page %lx", \
824 l1e_get_pfn((pl1e))); \
825 if ( !(l1e_get_flags((pl1e)) & _PAGE_USER) ) \
826 l1e_add_flags((pl1e), (_PAGE_GUEST_KERNEL|_PAGE_USER)); \
827 if ( !(l1e_get_flags((pl1e)) & _PAGE_GUEST_KERNEL) ) \
828 l1e_add_flags((pl1e), (_PAGE_GLOBAL|_PAGE_USER)); \
829 } \
830 } while ( 0 )
831 #else
832 #define adjust_guest_l1e(pl1e, d) \
833 do { \
834 if ( likely(l1e_get_flags((pl1e)) & _PAGE_PRESENT) && \
835 likely(!is_pv_32on64_domain(d)) ) \
836 l1e_add_flags((pl1e), _PAGE_USER); \
837 } while ( 0 )
838 #endif
840 #define adjust_guest_l2e(pl2e, d) \
841 do { \
842 if ( likely(l2e_get_flags((pl2e)) & _PAGE_PRESENT) && \
843 likely(!is_pv_32on64_domain(d)) ) \
844 l2e_add_flags((pl2e), _PAGE_USER); \
845 } while ( 0 )
847 #define adjust_guest_l3e(pl3e, d) \
848 do { \
849 if ( likely(l3e_get_flags((pl3e)) & _PAGE_PRESENT) ) \
850 l3e_add_flags((pl3e), likely(!is_pv_32on64_domain(d)) ? \
851 _PAGE_USER : \
852 _PAGE_USER|_PAGE_RW); \
853 } while ( 0 )
855 #define adjust_guest_l4e(pl4e, d) \
856 do { \
857 if ( likely(l4e_get_flags((pl4e)) & _PAGE_PRESENT) && \
858 likely(!is_pv_32on64_domain(d)) ) \
859 l4e_add_flags((pl4e), _PAGE_USER); \
860 } while ( 0 )
862 #else /* !defined(__x86_64__) */
864 #define adjust_guest_l1e(_p, _d) ((void)(_d))
865 #define adjust_guest_l2e(_p, _d) ((void)(_d))
866 #define adjust_guest_l3e(_p, _d) ((void)(_d))
868 #endif
870 #ifdef CONFIG_COMPAT
871 #define unadjust_guest_l3e(pl3e, d) \
872 do { \
873 if ( unlikely(is_pv_32on64_domain(d)) && \
874 likely(l3e_get_flags((pl3e)) & _PAGE_PRESENT) ) \
875 l3e_remove_flags((pl3e), _PAGE_USER|_PAGE_RW|_PAGE_ACCESSED); \
876 } while ( 0 )
877 #else
878 #define unadjust_guest_l3e(_p, _d) ((void)(_d))
879 #endif
881 void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d)
882 {
883 unsigned long pfn = l1e_get_pfn(l1e);
884 struct page_info *page;
885 struct domain *e;
886 struct vcpu *v;
888 if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) || is_iomem_page(pfn) )
889 return;
891 page = mfn_to_page(pfn);
893 e = page_get_owner(page);
895 /*
896 * Check if this is a mapping that was established via a grant reference.
897 * If it was then we should not be here: we require that such mappings are
898 * explicitly destroyed via the grant-table interface.
899 *
900 * The upshot of this is that the guest can end up with active grants that
901 * it cannot destroy (because it no longer has a PTE to present to the
902 * grant-table interface). This can lead to subtle hard-to-catch bugs,
903 * hence a special grant PTE flag can be enabled to catch the bug early.
904 *
905 * (Note that the undestroyable active grants are not a security hole in
906 * Xen. All active grants can safely be cleaned up when the domain dies.)
907 */
908 if ( (l1e_get_flags(l1e) & _PAGE_GNTTAB) &&
909 !d->is_shutting_down && !d->is_dying )
910 {
911 MEM_LOG("Attempt to implicitly unmap a granted PTE %" PRIpte,
912 l1e_get_intpte(l1e));
913 domain_crash(d);
914 }
916 /* Remember we didn't take a type-count of foreign writable mappings
917 * to paging-external domains */
918 if ( (l1e_get_flags(l1e) & _PAGE_RW) &&
919 !(unlikely((e != d) && paging_mode_external(e))) )
920 {
921 put_page_and_type(page);
922 }
923 else
924 {
925 /* We expect this is rare so we blow the entire shadow LDT. */
926 if ( unlikely(((page->u.inuse.type_info & PGT_type_mask) ==
927 PGT_seg_desc_page)) &&
928 unlikely(((page->u.inuse.type_info & PGT_count_mask) != 0)) &&
929 (d == e) )
930 {
931 for_each_vcpu ( d, v )
932 invalidate_shadow_ldt(v);
933 }
934 put_page(page);
935 }
936 }
939 /*
940 * NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'.
941 * Note also that this automatically deals correctly with linear p.t.'s.
942 */
943 static void put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
944 {
945 if ( (l2e_get_flags(l2e) & _PAGE_PRESENT) &&
946 (l2e_get_pfn(l2e) != pfn) )
947 put_page_and_type(l2e_get_page(l2e));
948 }
951 #if CONFIG_PAGING_LEVELS >= 3
952 static void put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn)
953 {
954 if ( (l3e_get_flags(l3e) & _PAGE_PRESENT) &&
955 (l3e_get_pfn(l3e) != pfn) )
956 put_page_and_type(l3e_get_page(l3e));
957 }
958 #endif
960 #if CONFIG_PAGING_LEVELS >= 4
961 static void put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn)
962 {
963 if ( (l4e_get_flags(l4e) & _PAGE_PRESENT) &&
964 (l4e_get_pfn(l4e) != pfn) )
965 put_page_and_type(l4e_get_page(l4e));
966 }
967 #endif
969 static int alloc_l1_table(struct page_info *page)
970 {
971 struct domain *d = page_get_owner(page);
972 unsigned long pfn = page_to_mfn(page);
973 l1_pgentry_t *pl1e;
974 int i;
976 pl1e = map_domain_page(pfn);
978 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
979 {
980 if ( is_guest_l1_slot(i) &&
981 unlikely(!get_page_from_l1e(pl1e[i], d)) )
982 goto fail;
984 adjust_guest_l1e(pl1e[i], d);
985 }
987 unmap_domain_page(pl1e);
988 return 1;
990 fail:
991 MEM_LOG("Failure in alloc_l1_table: entry %d", i);
992 while ( i-- > 0 )
993 if ( is_guest_l1_slot(i) )
994 put_page_from_l1e(pl1e[i], d);
996 unmap_domain_page(pl1e);
997 return 0;
998 }
1000 static int create_pae_xen_mappings(struct domain *d, l3_pgentry_t *pl3e)
1002 struct page_info *page;
1003 l2_pgentry_t *pl2e;
1004 l3_pgentry_t l3e3;
1005 #ifndef CONFIG_COMPAT
1006 l2_pgentry_t l2e;
1007 int i;
1008 #endif
1010 if ( !is_pv_32bit_domain(d) )
1011 return 1;
1013 pl3e = (l3_pgentry_t *)((unsigned long)pl3e & PAGE_MASK);
1015 /* 3rd L3 slot contains L2 with Xen-private mappings. It *must* exist. */
1016 l3e3 = pl3e[3];
1017 if ( !(l3e_get_flags(l3e3) & _PAGE_PRESENT) )
1019 MEM_LOG("PAE L3 3rd slot is empty");
1020 return 0;
1023 /*
1024 * The Xen-private mappings include linear mappings. The L2 thus cannot
1025 * be shared by multiple L3 tables. The test here is adequate because:
1026 * 1. Cannot appear in slots != 3 because get_page_type() checks the
1027 * PGT_pae_xen_l2 flag, which is asserted iff the L2 appears in slot 3
1028 * 2. Cannot appear in another page table's L3:
1029 * a. alloc_l3_table() calls this function and this check will fail
1030 * b. mod_l3_entry() disallows updates to slot 3 in an existing table
1031 */
1032 page = l3e_get_page(l3e3);
1033 BUG_ON(page->u.inuse.type_info & PGT_pinned);
1034 BUG_ON((page->u.inuse.type_info & PGT_count_mask) == 0);
1035 BUG_ON(!(page->u.inuse.type_info & PGT_pae_xen_l2));
1036 if ( (page->u.inuse.type_info & PGT_count_mask) != 1 )
1038 MEM_LOG("PAE L3 3rd slot is shared");
1039 return 0;
1042 /* Xen private mappings. */
1043 pl2e = map_domain_page(l3e_get_pfn(l3e3));
1044 #ifndef CONFIG_COMPAT
1045 memcpy(&pl2e[L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1)],
1046 &idle_pg_table_l2[L2_PAGETABLE_FIRST_XEN_SLOT],
1047 L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
1048 for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
1050 l2e = l2e_from_page(
1051 virt_to_page(page_get_owner(page)->arch.mm_perdomain_pt) + i,
1052 __PAGE_HYPERVISOR);
1053 l2e_write(&pl2e[l2_table_offset(PERDOMAIN_VIRT_START) + i], l2e);
1055 for ( i = 0; i < (LINEARPT_MBYTES >> (L2_PAGETABLE_SHIFT - 20)); i++ )
1057 l2e = l2e_empty();
1058 if ( l3e_get_flags(pl3e[i]) & _PAGE_PRESENT )
1059 l2e = l2e_from_pfn(l3e_get_pfn(pl3e[i]), __PAGE_HYPERVISOR);
1060 l2e_write(&pl2e[l2_table_offset(LINEAR_PT_VIRT_START) + i], l2e);
1062 #else
1063 memcpy(&pl2e[COMPAT_L2_PAGETABLE_FIRST_XEN_SLOT(d)],
1064 &compat_idle_pg_table_l2[
1065 l2_table_offset(HIRO_COMPAT_MPT_VIRT_START)],
1066 COMPAT_L2_PAGETABLE_XEN_SLOTS(d) * sizeof(*pl2e));
1067 #endif
1068 unmap_domain_page(pl2e);
1070 return 1;
1073 #ifdef __i386__
1074 /* Flush a pgdir update into low-memory caches. */
1075 static void pae_flush_pgd(
1076 unsigned long mfn, unsigned int idx, l3_pgentry_t nl3e)
1078 struct domain *d = page_get_owner(mfn_to_page(mfn));
1079 struct vcpu *v;
1080 intpte_t _ol3e, _nl3e, _pl3e;
1081 l3_pgentry_t *l3tab_ptr;
1082 struct pae_l3_cache *cache;
1084 if ( unlikely(shadow_mode_enabled(d)) )
1086 cpumask_t m = CPU_MASK_NONE;
1087 /* Re-shadow this l3 table on any vcpus that are using it */
1088 for_each_vcpu ( d, v )
1089 if ( pagetable_get_pfn(v->arch.guest_table) == mfn )
1091 paging_update_cr3(v);
1092 cpus_or(m, m, v->vcpu_dirty_cpumask);
1094 flush_tlb_mask(m);
1097 /* If below 4GB then the pgdir is not shadowed in low memory. */
1098 if ( !l3tab_needs_shadow(mfn) )
1099 return;
1101 for_each_vcpu ( d, v )
1103 cache = &v->arch.pae_l3_cache;
1105 spin_lock(&cache->lock);
1107 if ( cache->high_mfn == mfn )
1109 l3tab_ptr = &cache->table[cache->inuse_idx][idx];
1110 _ol3e = l3e_get_intpte(*l3tab_ptr);
1111 _nl3e = l3e_get_intpte(nl3e);
1112 _pl3e = cmpxchg(&l3e_get_intpte(*l3tab_ptr), _ol3e, _nl3e);
1113 BUG_ON(_pl3e != _ol3e);
1116 spin_unlock(&cache->lock);
1119 flush_tlb_mask(d->domain_dirty_cpumask);
1121 #else
1122 # define pae_flush_pgd(mfn, idx, nl3e) ((void)0)
1123 #endif
1125 static int alloc_l2_table(struct page_info *page, unsigned long type)
1127 struct domain *d = page_get_owner(page);
1128 unsigned long pfn = page_to_mfn(page);
1129 l2_pgentry_t *pl2e;
1130 int i;
1132 pl2e = map_domain_page(pfn);
1134 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
1136 if ( is_guest_l2_slot(d, type, i) &&
1137 unlikely(!get_page_from_l2e(pl2e[i], pfn, d)) )
1138 goto fail;
1140 adjust_guest_l2e(pl2e[i], d);
1143 unmap_domain_page(pl2e);
1144 return 1;
1146 fail:
1147 MEM_LOG("Failure in alloc_l2_table: entry %d", i);
1148 while ( i-- > 0 )
1149 if ( is_guest_l2_slot(d, type, i) )
1150 put_page_from_l2e(pl2e[i], pfn);
1152 unmap_domain_page(pl2e);
1153 return 0;
1157 #if CONFIG_PAGING_LEVELS >= 3
1158 static int alloc_l3_table(struct page_info *page)
1160 struct domain *d = page_get_owner(page);
1161 unsigned long pfn = page_to_mfn(page);
1162 l3_pgentry_t *pl3e;
1163 int i;
1165 #if CONFIG_PAGING_LEVELS == 3
1166 /*
1167 * PAE pgdirs above 4GB are unacceptable if the guest does not understand
1168 * the weird 'extended cr3' format for dealing with high-order address
1169 * bits. We cut some slack for control tools (before vcpu0 is initialised).
1170 */
1171 if ( (pfn >= 0x100000) &&
1172 unlikely(!VM_ASSIST(d, VMASST_TYPE_pae_extended_cr3)) &&
1173 d->vcpu[0] && d->vcpu[0]->is_initialised )
1175 MEM_LOG("PAE pgd must be below 4GB (0x%lx >= 0x100000)", pfn);
1176 return 0;
1178 #endif
1180 pl3e = map_domain_page(pfn);
1182 /*
1183 * PAE guests allocate full pages, but aren't required to initialize
1184 * more than the first four entries; when running in compatibility
1185 * mode, however, the full page is visible to the MMU, and hence all
1186 * 512 entries must be valid/verified, which is most easily achieved
1187 * by clearing them out.
1188 */
1189 if ( is_pv_32on64_domain(d) )
1190 memset(pl3e + 4, 0, (L3_PAGETABLE_ENTRIES - 4) * sizeof(*pl3e));
1192 for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
1194 if ( is_pv_32bit_domain(d) && (i == 3) )
1196 if ( !(l3e_get_flags(pl3e[i]) & _PAGE_PRESENT) ||
1197 (l3e_get_flags(pl3e[i]) & l3_disallow_mask(d)) ||
1198 !get_page_and_type_from_pagenr(l3e_get_pfn(pl3e[i]),
1199 PGT_l2_page_table |
1200 PGT_pae_xen_l2,
1201 d) )
1202 goto fail;
1204 else if ( is_guest_l3_slot(i) &&
1205 unlikely(!get_page_from_l3e(pl3e[i], pfn, d)) )
1206 goto fail;
1208 adjust_guest_l3e(pl3e[i], d);
1211 if ( !create_pae_xen_mappings(d, pl3e) )
1212 goto fail;
1214 unmap_domain_page(pl3e);
1215 return 1;
1217 fail:
1218 MEM_LOG("Failure in alloc_l3_table: entry %d", i);
1219 while ( i-- > 0 )
1220 if ( is_guest_l3_slot(i) )
1221 put_page_from_l3e(pl3e[i], pfn);
1223 unmap_domain_page(pl3e);
1224 return 0;
1226 #else
1227 #define alloc_l3_table(page) (0)
1228 #endif
1230 #if CONFIG_PAGING_LEVELS >= 4
1231 static int alloc_l4_table(struct page_info *page)
1233 struct domain *d = page_get_owner(page);
1234 unsigned long pfn = page_to_mfn(page);
1235 l4_pgentry_t *pl4e = page_to_virt(page);
1236 int i;
1238 for ( i = 0; i < L4_PAGETABLE_ENTRIES; i++ )
1240 if ( is_guest_l4_slot(d, i) &&
1241 unlikely(!get_page_from_l4e(pl4e[i], pfn, d)) )
1242 goto fail;
1244 adjust_guest_l4e(pl4e[i], d);
1247 /* Xen private mappings. */
1248 memcpy(&pl4e[ROOT_PAGETABLE_FIRST_XEN_SLOT],
1249 &idle_pg_table[ROOT_PAGETABLE_FIRST_XEN_SLOT],
1250 ROOT_PAGETABLE_XEN_SLOTS * sizeof(l4_pgentry_t));
1251 pl4e[l4_table_offset(LINEAR_PT_VIRT_START)] =
1252 l4e_from_pfn(pfn, __PAGE_HYPERVISOR);
1253 pl4e[l4_table_offset(PERDOMAIN_VIRT_START)] =
1254 l4e_from_page(virt_to_page(d->arch.mm_perdomain_l3),
1255 __PAGE_HYPERVISOR);
1257 return 1;
1259 fail:
1260 MEM_LOG("Failure in alloc_l4_table: entry %d", i);
1261 while ( i-- > 0 )
1262 if ( is_guest_l4_slot(d, i) )
1263 put_page_from_l4e(pl4e[i], pfn);
1265 return 0;
1267 #else
1268 #define alloc_l4_table(page) (0)
1269 #endif
1272 static void free_l1_table(struct page_info *page)
1274 struct domain *d = page_get_owner(page);
1275 unsigned long pfn = page_to_mfn(page);
1276 l1_pgentry_t *pl1e;
1277 int i;
1279 pl1e = map_domain_page(pfn);
1281 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
1282 if ( is_guest_l1_slot(i) )
1283 put_page_from_l1e(pl1e[i], d);
1285 unmap_domain_page(pl1e);
1289 static void free_l2_table(struct page_info *page)
1291 #ifdef CONFIG_COMPAT
1292 struct domain *d = page_get_owner(page);
1293 #endif
1294 unsigned long pfn = page_to_mfn(page);
1295 l2_pgentry_t *pl2e;
1296 int i;
1298 pl2e = map_domain_page(pfn);
1300 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
1301 if ( is_guest_l2_slot(d, page->u.inuse.type_info, i) )
1302 put_page_from_l2e(pl2e[i], pfn);
1304 unmap_domain_page(pl2e);
1306 page->u.inuse.type_info &= ~PGT_pae_xen_l2;
1310 #if CONFIG_PAGING_LEVELS >= 3
1312 static void free_l3_table(struct page_info *page)
1314 struct domain *d = page_get_owner(page);
1315 unsigned long pfn = page_to_mfn(page);
1316 l3_pgentry_t *pl3e;
1317 int i;
1319 #ifdef DOMAIN_DESTRUCT_AVOID_RECURSION
1320 if ( d->arch.relmem == RELMEM_l3 )
1321 return;
1322 #endif
1324 pl3e = map_domain_page(pfn);
1326 for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
1327 if ( is_guest_l3_slot(i) )
1329 put_page_from_l3e(pl3e[i], pfn);
1330 unadjust_guest_l3e(pl3e[i], d);
1333 unmap_domain_page(pl3e);
1336 #endif
1338 #if CONFIG_PAGING_LEVELS >= 4
1340 static void free_l4_table(struct page_info *page)
1342 struct domain *d = page_get_owner(page);
1343 unsigned long pfn = page_to_mfn(page);
1344 l4_pgentry_t *pl4e = page_to_virt(page);
1345 int i;
1347 #ifdef DOMAIN_DESTRUCT_AVOID_RECURSION
1348 if ( d->arch.relmem == RELMEM_l4 )
1349 return;
1350 #endif
1352 for ( i = 0; i < L4_PAGETABLE_ENTRIES; i++ )
1353 if ( is_guest_l4_slot(d, i) )
1354 put_page_from_l4e(pl4e[i], pfn);
1357 #endif
1360 /* How to write an entry to the guest pagetables.
1361 * Returns 0 for failure (pointer not valid), 1 for success. */
1362 static inline int update_intpte(intpte_t *p,
1363 intpte_t old,
1364 intpte_t new,
1365 unsigned long mfn,
1366 struct vcpu *v,
1367 int preserve_ad)
1368 {
1369 int rv = 1;
1370 #ifndef PTE_UPDATE_WITH_CMPXCHG
1371 if ( !preserve_ad )
1372 {
1373 rv = paging_write_guest_entry(v, p, new, _mfn(mfn));
1374 }
1375 else
1376 #endif
1377 {
1378 intpte_t t = old;
1379 for ( ; ; )
1380 {
1381 intpte_t _new = new;
1382 if ( preserve_ad )
1383 _new |= old & (_PAGE_ACCESSED | _PAGE_DIRTY);
1385 rv = paging_cmpxchg_guest_entry(v, p, &t, _new, _mfn(mfn));
1386 if ( unlikely(rv == 0) )
1387 {
1388 MEM_LOG("Failed to update %" PRIpte " -> %" PRIpte
1389 ": saw %" PRIpte, old, _new, t);
1390 break;
1391 }
1393 if ( t == old )
1394 break;
1396 /* Allowed to change in Accessed/Dirty flags only. */
1397 BUG_ON((t ^ old) & ~(intpte_t)(_PAGE_ACCESSED|_PAGE_DIRTY));
1399 old = t;
1400 }
1401 }
1402 return rv;
1403 }
1405 /* Macro that wraps the appropriate type-changes around update_intpte().
1406 * Arguments are: type, ptr, old, new, mfn, vcpu */
1407 #define UPDATE_ENTRY(_t,_p,_o,_n,_m,_v,_ad) \
1408 update_intpte(&_t ## e_get_intpte(*(_p)), \
1409 _t ## e_get_intpte(_o), _t ## e_get_intpte(_n), \
1410 (_m), (_v), (_ad))
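/*
 * For exposition only: the first call site in mod_l1_entry() below,
 *
 *     UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, curr, preserve_ad)
 *
 * token-pastes into
 *
 *     update_intpte(&l1e_get_intpte(*(pl1e)),
 *                   l1e_get_intpte(ol1e), l1e_get_intpte(nl1e),
 *                   (gl1mfn), (curr), (preserve_ad))
 */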
1412 /* Update the L1 entry at pl1e to new value nl1e. */
1413 static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e,
1414 unsigned long gl1mfn, int preserve_ad)
1416 l1_pgentry_t ol1e;
1417 struct vcpu *curr = current;
1418 struct domain *d = curr->domain;
1419 unsigned long mfn;
1421 if ( unlikely(__copy_from_user(&ol1e, pl1e, sizeof(ol1e)) != 0) )
1422 return 0;
1424 if ( unlikely(paging_mode_refcounts(d)) )
1425 return UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, curr, preserve_ad);
1427 if ( l1e_get_flags(nl1e) & _PAGE_PRESENT )
1429 /* Translate foreign guest addresses. */
1430 mfn = gmfn_to_mfn(FOREIGNDOM, l1e_get_pfn(nl1e));
1431 if ( unlikely(mfn == INVALID_MFN) )
1432 return 0;
1433 ASSERT((mfn & ~(PADDR_MASK >> PAGE_SHIFT)) == 0);
1434 nl1e = l1e_from_pfn(mfn, l1e_get_flags(nl1e));
1436 if ( unlikely(l1e_get_flags(nl1e) & l1_disallow_mask(d)) )
1438 MEM_LOG("Bad L1 flags %x",
1439 l1e_get_flags(nl1e) & l1_disallow_mask(d));
1440 return 0;
1443 /* Fast path for identical mapping, r/w and presence. */
1444 if ( !l1e_has_changed(ol1e, nl1e, _PAGE_RW | _PAGE_PRESENT) )
1446 adjust_guest_l1e(nl1e, d);
1447 return UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, curr,
1448 preserve_ad);
1451 if ( unlikely(!get_page_from_l1e(nl1e, FOREIGNDOM)) )
1452 return 0;
1454 adjust_guest_l1e(nl1e, d);
1455 if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, curr,
1456 preserve_ad)) )
1458 put_page_from_l1e(nl1e, d);
1459 return 0;
1462 else
1464 if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, curr,
1465 preserve_ad)) )
1466 return 0;
1469 put_page_from_l1e(ol1e, d);
1470 return 1;
1474 /* Update the L2 entry at pl2e to new value nl2e. pl2e is within frame pfn. */
1475 static int mod_l2_entry(l2_pgentry_t *pl2e,
1476 l2_pgentry_t nl2e,
1477 unsigned long pfn,
1478 unsigned long type,
1479 int preserve_ad)
1481 l2_pgentry_t ol2e;
1482 struct vcpu *curr = current;
1483 struct domain *d = curr->domain;
1485 if ( unlikely(!is_guest_l2_slot(d, type, pgentry_ptr_to_slot(pl2e))) )
1487 MEM_LOG("Illegal L2 update attempt in Xen-private area %p", pl2e);
1488 return 0;
1491 if ( unlikely(__copy_from_user(&ol2e, pl2e, sizeof(ol2e)) != 0) )
1492 return 0;
1494 if ( l2e_get_flags(nl2e) & _PAGE_PRESENT )
1496 if ( unlikely(l2e_get_flags(nl2e) & L2_DISALLOW_MASK) )
1498 MEM_LOG("Bad L2 flags %x",
1499 l2e_get_flags(nl2e) & L2_DISALLOW_MASK);
1500 return 0;
1503 /* Fast path for identical mapping and presence. */
1504 if ( !l2e_has_changed(ol2e, nl2e, _PAGE_PRESENT) )
1506 adjust_guest_l2e(nl2e, d);
1507 return UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, curr, preserve_ad);
1510 if ( unlikely(!get_page_from_l2e(nl2e, pfn, d)) )
1511 return 0;
1513 adjust_guest_l2e(nl2e, d);
1514 if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, curr,
1515 preserve_ad)) )
1517 put_page_from_l2e(nl2e, pfn);
1518 return 0;
1521 else if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, curr,
1522 preserve_ad)) )
1524 return 0;
1527 put_page_from_l2e(ol2e, pfn);
1528 return 1;
1531 #if CONFIG_PAGING_LEVELS >= 3
1533 /* Update the L3 entry at pl3e to new value nl3e. pl3e is within frame pfn. */
1534 static int mod_l3_entry(l3_pgentry_t *pl3e,
1535 l3_pgentry_t nl3e,
1536 unsigned long pfn,
1537 int preserve_ad)
1539 l3_pgentry_t ol3e;
1540 struct vcpu *curr = current;
1541 struct domain *d = curr->domain;
1542 int okay;
1544 if ( unlikely(!is_guest_l3_slot(pgentry_ptr_to_slot(pl3e))) )
1546 MEM_LOG("Illegal L3 update attempt in Xen-private area %p", pl3e);
1547 return 0;
1550 /*
1551 * Disallow updates to final L3 slot. It contains Xen mappings, and it
1552 * would be a pain to ensure they remain continuously valid throughout.
1553 */
1554 if ( is_pv_32bit_domain(d) && (pgentry_ptr_to_slot(pl3e) >= 3) )
1555 return 0;
1557 if ( unlikely(__copy_from_user(&ol3e, pl3e, sizeof(ol3e)) != 0) )
1558 return 0;
1560 if ( l3e_get_flags(nl3e) & _PAGE_PRESENT )
1562 if ( unlikely(l3e_get_flags(nl3e) & l3_disallow_mask(d)) )
1564 MEM_LOG("Bad L3 flags %x",
1565 l3e_get_flags(nl3e) & l3_disallow_mask(d));
1566 return 0;
1569 /* Fast path for identical mapping and presence. */
1570 if ( !l3e_has_changed(ol3e, nl3e, _PAGE_PRESENT) )
1572 adjust_guest_l3e(nl3e, d);
1573 return UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, curr, preserve_ad);
1576 if ( unlikely(!get_page_from_l3e(nl3e, pfn, d)) )
1577 return 0;
1579 adjust_guest_l3e(nl3e, d);
1580 if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, curr,
1581 preserve_ad)) )
1583 put_page_from_l3e(nl3e, pfn);
1584 return 0;
1587 else if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, curr,
1588 preserve_ad)) )
1590 return 0;
1593 okay = create_pae_xen_mappings(d, pl3e);
1594 BUG_ON(!okay);
1596 pae_flush_pgd(pfn, pgentry_ptr_to_slot(pl3e), nl3e);
1598 put_page_from_l3e(ol3e, pfn);
1599 return 1;
1602 #endif
1604 #if CONFIG_PAGING_LEVELS >= 4
1606 /* Update the L4 entry at pl4e to new value nl4e. pl4e is within frame pfn. */
1607 static int mod_l4_entry(l4_pgentry_t *pl4e,
1608 l4_pgentry_t nl4e,
1609 unsigned long pfn,
1610 int preserve_ad)
1612 struct vcpu *curr = current;
1613 struct domain *d = curr->domain;
1614 l4_pgentry_t ol4e;
1616 if ( unlikely(!is_guest_l4_slot(d, pgentry_ptr_to_slot(pl4e))) )
1618 MEM_LOG("Illegal L4 update attempt in Xen-private area %p", pl4e);
1619 return 0;
1622 if ( unlikely(__copy_from_user(&ol4e, pl4e, sizeof(ol4e)) != 0) )
1623 return 0;
1625 if ( l4e_get_flags(nl4e) & _PAGE_PRESENT )
1627 if ( unlikely(l4e_get_flags(nl4e) & L4_DISALLOW_MASK) )
1629 MEM_LOG("Bad L4 flags %x",
1630 l4e_get_flags(nl4e) & L4_DISALLOW_MASK);
1631 return 0;
1634 /* Fast path for identical mapping and presence. */
1635 if ( !l4e_has_changed(ol4e, nl4e, _PAGE_PRESENT) )
1637 adjust_guest_l4e(nl4e, d);
1638 return UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, curr, preserve_ad);
1641 if ( unlikely(!get_page_from_l4e(nl4e, pfn, d)) )
1642 return 0;
1644 adjust_guest_l4e(nl4e, d);
1645 if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, curr,
1646 preserve_ad)) )
1648 put_page_from_l4e(nl4e, pfn);
1649 return 0;
1652 else if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, curr,
1653 preserve_ad)) )
1655 return 0;
1658 put_page_from_l4e(ol4e, pfn);
1659 return 1;
1662 #endif
1664 void put_page(struct page_info *page)
1665 {
1666 u32 nx, x, y = page->count_info;
1668 do {
1669 x = y;
1670 nx = x - 1;
1671 }
1672 while ( unlikely((y = cmpxchg(&page->count_info, x, nx)) != x) );
1674 if ( unlikely((nx & PGC_count_mask) == 0) )
1675 {
1676 cleanup_page_cacheattr(page);
1677 free_domheap_page(page);
1678 }
1679 }
1682 int get_page(struct page_info *page, struct domain *domain)
1683 {
1684 u32 x, nx, y = page->count_info;
1685 u32 d, nd = page->u.inuse._domain;
1686 u32 _domain = pickle_domptr(domain);
1688 do {
1689 x = y;
1690 nx = x + 1;
1691 d = nd;
1692 if ( unlikely((x & PGC_count_mask) == 0) || /* Not allocated? */
1693 unlikely((nx & PGC_count_mask) == 0) || /* Count overflow? */
1694 unlikely(d != _domain) ) /* Wrong owner? */
1695 {
1696 if ( !_shadow_mode_refcounts(domain) && !domain->is_dying )
1697 gdprintk(XENLOG_INFO,
1698 "Error pfn %lx: rd=%p, od=%p, caf=%08x, taf=%"
1699 PRtype_info "\n",
1700 page_to_mfn(page), domain, unpickle_domptr(d),
1701 x, page->u.inuse.type_info);
1702 return 0;
1703 }
1704 asm volatile (
1705 LOCK_PREFIX "cmpxchg8b %2"
1706 : "=d" (nd), "=a" (y),
1707 "=m" (*(volatile u64 *)(&page->count_info))
1708 : "0" (d), "1" (x), "c" (d), "b" (nx) );
1709 }
1710 while ( unlikely(nd != d) || unlikely(y != x) );
1712 return 1;
1713 }
1716 static int alloc_page_type(struct page_info *page, unsigned long type)
1718 struct domain *owner = page_get_owner(page);
1720 /* A page table is dirtied when its type count becomes non-zero. */
1721 if ( likely(owner != NULL) )
1722 paging_mark_dirty(owner, page_to_mfn(page));
1724 switch ( type & PGT_type_mask )
1726 case PGT_l1_page_table:
1727 return alloc_l1_table(page);
1728 case PGT_l2_page_table:
1729 return alloc_l2_table(page, type);
1730 case PGT_l3_page_table:
1731 return alloc_l3_table(page);
1732 case PGT_l4_page_table:
1733 return alloc_l4_table(page);
1734 case PGT_seg_desc_page:
1735 return alloc_segdesc_page(page);
1736 default:
1737 printk("Bad type in alloc_page_type %lx t=%" PRtype_info " c=%x\n",
1738 type, page->u.inuse.type_info,
1739 page->count_info);
1740 BUG();
1743 return 0;
1747 void free_page_type(struct page_info *page, unsigned long type)
1749 struct domain *owner = page_get_owner(page);
1750 unsigned long gmfn;
1752 if ( likely(owner != NULL) )
1754 /*
1755 * We have to flush before the next use of the linear mapping
1756 * (e.g., update_va_mapping()) or we could end up modifying a page
1757 * that is no longer a page table (and hence screw up ref counts).
1758 */
1759 if ( current->domain == owner )
1760 queue_deferred_ops(owner, DOP_FLUSH_ALL_TLBS);
1761 else
1762 flush_tlb_mask(owner->domain_dirty_cpumask);
1764 if ( unlikely(paging_mode_enabled(owner)) )
1766 /* A page table is dirtied when its type count becomes zero. */
1767 paging_mark_dirty(owner, page_to_mfn(page));
1769 if ( shadow_mode_refcounts(owner) )
1770 return;
1772 gmfn = mfn_to_gmfn(owner, page_to_mfn(page));
1773 ASSERT(VALID_M2P(gmfn));
1774 shadow_remove_all_shadows(owner->vcpu[0], _mfn(gmfn));
1778 switch ( type & PGT_type_mask )
1780 case PGT_l1_page_table:
1781 free_l1_table(page);
1782 break;
1784 case PGT_l2_page_table:
1785 free_l2_table(page);
1786 break;
1788 #if CONFIG_PAGING_LEVELS >= 3
1789 case PGT_l3_page_table:
1790 free_l3_table(page);
1791 break;
1792 #endif
1794 #if CONFIG_PAGING_LEVELS >= 4
1795 case PGT_l4_page_table:
1796 free_l4_table(page);
1797 break;
1798 #endif
1800 default:
1801 printk("%s: type %lx pfn %lx\n",__FUNCTION__,
1802 type, page_to_mfn(page));
1803 BUG();
1808 void put_page_type(struct page_info *page)
1810 unsigned long nx, x, y = page->u.inuse.type_info;
1812 again:
1813 do {
1814 x = y;
1815 nx = x - 1;
1817 ASSERT((x & PGT_count_mask) != 0);
1819 if ( unlikely((nx & PGT_count_mask) == 0) )
1821 if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
1822 likely(nx & PGT_validated) )
1824 /*
1825 * Page-table pages must be unvalidated when count is zero. The
1826 * 'free' is safe because the refcnt is non-zero and validated
1827 * bit is clear => other ops will spin or fail.
1828 */
1829 if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x,
1830 x & ~PGT_validated)) != x) )
1831 goto again;
1832 /* We cleared the 'valid bit' so we do the clean up. */
1833 free_page_type(page, x);
1834 /* Carry on, but with the 'valid bit' now clear. */
1835 x &= ~PGT_validated;
1836 nx &= ~PGT_validated;
1839 /*
1840 * Record TLB information for flush later. We do not stamp page
1841 * tables when running in shadow mode:
1842 * 1. Pointless, since it's the shadow pt's which must be tracked.
1843 * 2. Shadow mode reuses this field for shadowed page tables to
1844 * store flags info -- we don't want to conflict with that.
1845 */
1846 if ( !(shadow_mode_enabled(page_get_owner(page)) &&
1847 (page->count_info & PGC_page_table)) )
1848 page->tlbflush_timestamp = tlbflush_current_time();
1851 while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
1855 int get_page_type(struct page_info *page, unsigned long type)
1857 unsigned long nx, x, y = page->u.inuse.type_info;
1859 ASSERT(!(type & ~(PGT_type_mask | PGT_pae_xen_l2)));
1861 again:
1862 do {
1863 x = y;
1864 nx = x + 1;
1865 if ( unlikely((nx & PGT_count_mask) == 0) )
1867 MEM_LOG("Type count overflow on pfn %lx", page_to_mfn(page));
1868 return 0;
1870 else if ( unlikely((x & PGT_count_mask) == 0) )
1872 struct domain *d = page_get_owner(page);
1874 /* Never allow a shadowed frame to go from type count 0 to 1 */
1875 if ( d && shadow_mode_enabled(d) )
1876 shadow_remove_all_shadows(d->vcpu[0], _mfn(page_to_mfn(page)));
1878 ASSERT(!(x & PGT_pae_xen_l2));
1879 if ( (x & PGT_type_mask) != type )
1881 /*
1882 * On type change we check to flush stale TLB entries. This
1883 * may be unnecessary (e.g., page was GDT/LDT) but those
1884 * circumstances should be very rare.
1885 */
1886 cpumask_t mask = d->domain_dirty_cpumask;
1888 /* Don't flush if the timestamp is old enough */
1889 tlbflush_filter(mask, page->tlbflush_timestamp);
1891 if ( unlikely(!cpus_empty(mask)) &&
1892 /* Shadow mode: track only writable pages. */
1893 (!shadow_mode_enabled(page_get_owner(page)) ||
1894 ((nx & PGT_type_mask) == PGT_writable_page)) )
1896 perfc_incr(need_flush_tlb_flush);
1897 flush_tlb_mask(mask);
1900 /* We lose existing type and validity. */
1901 nx &= ~(PGT_type_mask | PGT_validated);
1902 nx |= type;
1904 /* No special validation needed for writable pages. */
1905 /* Page tables and GDT/LDT need to be scanned for validity. */
1906 if ( type == PGT_writable_page )
1907 nx |= PGT_validated;
1910 else if ( unlikely((x & (PGT_type_mask|PGT_pae_xen_l2)) != type) )
1912 /* Don't log failure if it could be a recursive-mapping attempt. */
1913 if ( ((x & PGT_type_mask) == PGT_l2_page_table) &&
1914 (type == PGT_l1_page_table) )
1915 return 0;
1916 if ( ((x & PGT_type_mask) == PGT_l3_page_table) &&
1917 (type == PGT_l2_page_table) )
1918 return 0;
1919 if ( ((x & PGT_type_mask) == PGT_l4_page_table) &&
1920 (type == PGT_l3_page_table) )
1921 return 0;
1922 MEM_LOG("Bad type (saw %" PRtype_info " != exp %" PRtype_info ") "
1923 "for mfn %lx (pfn %lx)",
1924 x, type, page_to_mfn(page),
1925 get_gpfn_from_mfn(page_to_mfn(page)));
1926 return 0;
1928 else if ( unlikely(!(x & PGT_validated)) )
1930 /* Someone else is updating validation of this page. Wait... */
1931 while ( (y = page->u.inuse.type_info) == x )
1932 cpu_relax();
1933 goto again;
1936 while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
1938 if ( unlikely((x & PGT_type_mask) != type) )
1940 /* Special pages should not be accessible from devices. */
1941 struct domain *d = page_get_owner(page);
1942 if ( d && unlikely(need_iommu(d)) )
1944 if ( (x & PGT_type_mask) == PGT_writable_page )
1945 iommu_unmap_page(d, mfn_to_gmfn(d, page_to_mfn(page)));
1946 else if ( type == PGT_writable_page )
1947 iommu_map_page(d, mfn_to_gmfn(d, page_to_mfn(page)),
1948 page_to_mfn(page));
1952 if ( unlikely(!(nx & PGT_validated)) )
1954 /* Try to validate page type; drop the new reference on failure. */
1955 if ( unlikely(!alloc_page_type(page, type)) )
1957 MEM_LOG("Error while validating mfn %lx (pfn %lx) for type %"
1958 PRtype_info ": caf=%08x taf=%" PRtype_info,
1959 page_to_mfn(page), get_gpfn_from_mfn(page_to_mfn(page)),
1960 type, page->count_info, page->u.inuse.type_info);
1961 /* No one else can get a reference. We hold the only ref. */
1962 page->u.inuse.type_info = 0;
1963 return 0;
1966 /* No one else is updating simultaneously. */
1967 __set_bit(_PGT_validated, &page->u.inuse.type_info);
1970 return 1;
1974 void cleanup_page_cacheattr(struct page_info *page)
1976 uint32_t cacheattr = (page->count_info >> PGC_cacheattr_base) & 7;
1978 if ( likely(cacheattr == 0) )
1979 return;
1981 page->count_info &= ~PGC_cacheattr_mask;
1983 BUG_ON(is_xen_heap_page(page));
1985 #ifdef __x86_64__
1986 map_pages_to_xen((unsigned long)page_to_virt(page), page_to_mfn(page),
1987 1, PAGE_HYPERVISOR);
1988 #endif
1992 int new_guest_cr3(unsigned long mfn)
1994 struct vcpu *v = current;
1995 struct domain *d = v->domain;
1996 int okay;
1997 unsigned long old_base_mfn;
1999 #ifdef CONFIG_COMPAT
2000 if ( is_pv_32on64_domain(d) )
2002 okay = paging_mode_refcounts(d)
2003 ? 0 /* Old code was broken, but what should it be? */
2004 : mod_l4_entry(
2005 __va(pagetable_get_paddr(v->arch.guest_table)),
2006 l4e_from_pfn(
2007 mfn,
2008 (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_ACCESSED)),
2009 pagetable_get_pfn(v->arch.guest_table), 0);
2010 if ( unlikely(!okay) )
2012 MEM_LOG("Error while installing new compat baseptr %lx", mfn);
2013 return 0;
2016 invalidate_shadow_ldt(v);
2017 write_ptbase(v);
2019 return 1;
2021 #endif
2022 okay = paging_mode_refcounts(d)
2023 ? get_page_from_pagenr(mfn, d)
2024 : get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d);
2025 if ( unlikely(!okay) )
2027 MEM_LOG("Error while installing new baseptr %lx", mfn);
2028 return 0;
2031 invalidate_shadow_ldt(v);
2033 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
2035 v->arch.guest_table = pagetable_from_pfn(mfn);
2036 update_cr3(v);
2038 write_ptbase(v);
2040 if ( likely(old_base_mfn != 0) )
2042 if ( paging_mode_refcounts(d) )
2043 put_page(mfn_to_page(old_base_mfn));
2044 else
2045 put_page_and_type(mfn_to_page(old_base_mfn));
2048 return 1;
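new_guest_cr3() is careful about ordering: the reference on the new base frame is taken (and validated) before the pointer is switched, and the reference on the old base frame is dropped only afterwards. A simplified sketch of that ordering, with hypothetical get_ref()/put_ref() helpers standing in for the real get/put machinery (which can also fail validation):

#include <stdatomic.h>
#include <stddef.h>

struct frame { _Atomic int refcnt; };

static void get_ref(struct frame *f) { atomic_fetch_add(&f->refcnt, 1); }
static void put_ref(struct frame *f) { atomic_fetch_sub(&f->refcnt, 1); }

/*
 * Swap a "current root" pointer over to a new frame: take the new
 * reference first, install the pointer, and only then drop the old
 * reference, so the root never points at an unreferenced frame.
 */
static void switch_root(struct frame **root, struct frame *new_frame)
{
    struct frame *old = *root;

    if ( new_frame != NULL )
        get_ref(new_frame);      /* reference the new root first */

    *root = new_frame;           /* install the new base pointer */

    if ( old != NULL )
        put_ref(old);            /* the old reference is dropped last */
}

int main(void)
{
    struct frame a = { 1 }, b = { 0 };
    struct frame *root = &a;

    switch_root(&root, &b);      /* b.refcnt becomes 1, a.refcnt drops to 0 */
    return 0;
}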
2051 static void process_deferred_ops(void)
2053 unsigned int deferred_ops;
2054 struct domain *d = current->domain;
2055 struct percpu_mm_info *info = &this_cpu(percpu_mm_info);
2057 deferred_ops = info->deferred_ops;
2058 info->deferred_ops = 0;
2060 if ( deferred_ops & (DOP_FLUSH_ALL_TLBS|DOP_FLUSH_TLB) )
2062 if ( deferred_ops & DOP_FLUSH_ALL_TLBS )
2063 flush_tlb_mask(d->domain_dirty_cpumask);
2064 else
2065 flush_tlb_local();
2068 if ( deferred_ops & DOP_RELOAD_LDT )
2069 (void)map_ldt_shadow_page(0);
2071 if ( unlikely(info->foreign != NULL) )
2073 rcu_unlock_domain(info->foreign);
2074 info->foreign = NULL;
2078 static int set_foreigndom(domid_t domid)
2080 struct domain *e, *d = current->domain;
2081 struct percpu_mm_info *info = &this_cpu(percpu_mm_info);
2082 int okay = 1;
2084 ASSERT(info->foreign == NULL);
2086 if ( likely(domid == DOMID_SELF) )
2087 goto out;
2089 if ( unlikely(domid == d->domain_id) )
2091 MEM_LOG("Cannot specify itself as foreign domain");
2092 okay = 0;
2094 else if ( unlikely(paging_mode_translate(d)) )
2096 MEM_LOG("Cannot mix foreign mappings with translated domains");
2097 okay = 0;
2099 else switch ( domid )
2101 case DOMID_IO:
2102 info->foreign = rcu_lock_domain(dom_io);
2103 break;
2104 case DOMID_XEN:
2105 if (!IS_PRIV(d)) {
2106 MEM_LOG("Cannot set foreign dom");
2107 okay = 0;
2108 break;
2110 info->foreign = rcu_lock_domain(dom_xen);
2111 break;
2112 default:
2113 if ( (e = rcu_lock_domain_by_id(domid)) == NULL )
2115 MEM_LOG("Unknown domain '%u'", domid);
2116 okay = 0;
2117 break;
2119 if ( !IS_PRIV_FOR(d, e) )
2121 MEM_LOG("Cannot set foreign dom");
2122 okay = 0;
2123 rcu_unlock_domain(e);
2124 break;
2126 info->foreign = e;
2127 break;
2130 out:
2131 return okay;
2134 static inline cpumask_t vcpumask_to_pcpumask(
2135 struct domain *d, unsigned long vmask)
2137 unsigned int vcpu_id;
2138 cpumask_t pmask = CPU_MASK_NONE;
2139 struct vcpu *v;
2141 while ( vmask != 0 )
2143 vcpu_id = find_first_set_bit(vmask);
2144 vmask &= ~(1UL << vcpu_id);
2145 if ( (vcpu_id < MAX_VIRT_CPUS) &&
2146 ((v = d->vcpu[vcpu_id]) != NULL) )
2147 cpus_or(pmask, pmask, v->vcpu_dirty_cpumask);
2150 return pmask;
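The helper above walks the set bits of the guest-supplied vCPU mask, lowest bit first, clearing each bit as it is consumed and OR-ing the matching vCPU's dirty CPU mask into the result. The same walk in isolation (printing the bit indices instead of merging CPU masks) looks like this; __builtin_ctzl() stands in for find_first_set_bit():

#include <stdio.h>

/* Visit every set bit of 'mask', lowest first, clearing each as we go. */
static void for_each_set_bit(unsigned long mask)
{
    while ( mask != 0 )
    {
        unsigned int bit = __builtin_ctzl(mask);  /* lowest set bit */
        mask &= ~(1UL << bit);                    /* clear it before the next round */
        printf("bit %u is set\n", bit);   /* the real loop ORs in a CPU mask here */
    }
}

int main(void)
{
    for_each_set_bit(0x29UL);             /* prints 0, 3 and 5 */
    return 0;
}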
2153 int do_mmuext_op(
2154 XEN_GUEST_HANDLE(mmuext_op_t) uops,
2155 unsigned int count,
2156 XEN_GUEST_HANDLE(uint) pdone,
2157 unsigned int foreigndom)
2159 struct mmuext_op op;
2160 int rc = 0, i = 0, okay;
2161 unsigned long mfn = 0, gmfn = 0, type;
2162 unsigned int done = 0;
2163 struct page_info *page;
2164 struct vcpu *v = current;
2165 struct domain *d = v->domain;
2167 if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
2169 count &= ~MMU_UPDATE_PREEMPTED;
2170 if ( unlikely(!guest_handle_is_null(pdone)) )
2171 (void)copy_from_guest(&done, pdone, 1);
2173 else
2174 perfc_incr(calls_to_mmuext_op);
2176 if ( unlikely(!guest_handle_okay(uops, count)) )
2178 rc = -EFAULT;
2179 goto out;
2182 if ( !set_foreigndom(foreigndom) )
2184 rc = -ESRCH;
2185 goto out;
2188 domain_lock(d);
2190 for ( i = 0; i < count; i++ )
2192 if ( hypercall_preempt_check() )
2194 rc = hypercall_create_continuation(
2195 __HYPERVISOR_mmuext_op, "hihi",
2196 uops, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
2197 break;
2200 if ( unlikely(__copy_from_guest(&op, uops, 1) != 0) )
2202 MEM_LOG("Bad __copy_from_guest");
2203 rc = -EFAULT;
2204 break;
2207 okay = 1;
2208 gmfn = op.arg1.mfn;
2209 mfn = gmfn_to_mfn(FOREIGNDOM, gmfn);
2210 page = mfn_to_page(mfn);
2212 switch ( op.cmd )
2214 case MMUEXT_PIN_L1_TABLE:
2215 type = PGT_l1_page_table;
2216 goto pin_page;
2218 case MMUEXT_PIN_L2_TABLE:
2219 type = PGT_l2_page_table;
2220 goto pin_page;
2222 case MMUEXT_PIN_L3_TABLE:
2223 type = PGT_l3_page_table;
2224 goto pin_page;
2226 case MMUEXT_PIN_L4_TABLE:
2227 if ( is_pv_32bit_domain(FOREIGNDOM) )
2228 break;
2229 type = PGT_l4_page_table;
2231 pin_page:
2232 rc = xsm_memory_pin_page(d, page);
2233 if ( rc )
2234 break;
2236 /* Ignore pinning of invalid paging levels. */
2237 if ( (op.cmd - MMUEXT_PIN_L1_TABLE) > (CONFIG_PAGING_LEVELS - 1) )
2238 break;
2240 if ( paging_mode_refcounts(FOREIGNDOM) )
2241 break;
2243 okay = get_page_and_type_from_pagenr(mfn, type, FOREIGNDOM);
2244 if ( unlikely(!okay) )
2246 MEM_LOG("Error while pinning mfn %lx", mfn);
2247 break;
2250 if ( unlikely(test_and_set_bit(_PGT_pinned,
2251 &page->u.inuse.type_info)) )
2253 MEM_LOG("Mfn %lx already pinned", mfn);
2254 put_page_and_type(page);
2255 okay = 0;
2256 break;
2259 /* A page is dirtied when its pin status is set. */
2260 paging_mark_dirty(d, mfn);
2262 /* We can race domain destruction (domain_relinquish_resources). */
2263 if ( unlikely(this_cpu(percpu_mm_info).foreign != NULL) )
2265 int drop_ref;
2266 spin_lock(&FOREIGNDOM->page_alloc_lock);
2267 drop_ref = (FOREIGNDOM->is_dying &&
2268 test_and_clear_bit(_PGT_pinned,
2269 &page->u.inuse.type_info));
2270 spin_unlock(&FOREIGNDOM->page_alloc_lock);
2271 if ( drop_ref )
2272 put_page_and_type(page);
2275 break;
2277 case MMUEXT_UNPIN_TABLE:
2278 if ( paging_mode_refcounts(d) )
2279 break;
2281 if ( unlikely(!(okay = get_page_from_pagenr(mfn, d))) )
2283 MEM_LOG("Mfn %lx bad domain (dom=%p)",
2284 mfn, page_get_owner(page));
2286 else if ( likely(test_and_clear_bit(_PGT_pinned,
2287 &page->u.inuse.type_info)) )
2289 put_page_and_type(page);
2290 put_page(page);
2291 /* A page is dirtied when its pin status is cleared. */
2292 paging_mark_dirty(d, mfn);
2294 else
2296 okay = 0;
2297 put_page(page);
2298 MEM_LOG("Mfn %lx not pinned", mfn);
2300 break;
2302 case MMUEXT_NEW_BASEPTR:
2303 okay = new_guest_cr3(mfn);
2304 this_cpu(percpu_mm_info).deferred_ops &= ~DOP_FLUSH_TLB;
2305 break;
2307 #ifdef __x86_64__
2308 case MMUEXT_NEW_USER_BASEPTR: {
2309 unsigned long old_mfn;
2311 if ( mfn != 0 )
2313 if ( paging_mode_refcounts(d) )
2314 okay = get_page_from_pagenr(mfn, d);
2315 else
2316 okay = get_page_and_type_from_pagenr(
2317 mfn, PGT_root_page_table, d);
2318 if ( unlikely(!okay) )
2320 MEM_LOG("Error while installing new mfn %lx", mfn);
2321 break;
2325 old_mfn = pagetable_get_pfn(v->arch.guest_table_user);
2326 v->arch.guest_table_user = pagetable_from_pfn(mfn);
2328 if ( old_mfn != 0 )
2330 if ( paging_mode_refcounts(d) )
2331 put_page(mfn_to_page(old_mfn));
2332 else
2333 put_page_and_type(mfn_to_page(old_mfn));
2336 break;
2338 #endif
2340 case MMUEXT_TLB_FLUSH_LOCAL:
2341 this_cpu(percpu_mm_info).deferred_ops |= DOP_FLUSH_TLB;
2342 break;
2344 case MMUEXT_INVLPG_LOCAL:
2345 if ( !paging_mode_enabled(d)
2346 || paging_invlpg(v, op.arg1.linear_addr) != 0 )
2347 flush_tlb_one_local(op.arg1.linear_addr);
2348 break;
2350 case MMUEXT_TLB_FLUSH_MULTI:
2351 case MMUEXT_INVLPG_MULTI:
2353 unsigned long vmask;
2354 cpumask_t pmask;
2355 if ( unlikely(copy_from_guest(&vmask, op.arg2.vcpumask, 1)) )
2357 okay = 0;
2358 break;
2360 pmask = vcpumask_to_pcpumask(d, vmask);
2361 if ( op.cmd == MMUEXT_TLB_FLUSH_MULTI )
2362 flush_tlb_mask(pmask);
2363 else
2364 flush_tlb_one_mask(pmask, op.arg1.linear_addr);
2365 break;
2368 case MMUEXT_TLB_FLUSH_ALL:
2369 flush_tlb_mask(d->domain_dirty_cpumask);
2370 break;
2372 case MMUEXT_INVLPG_ALL:
2373 flush_tlb_one_mask(d->domain_dirty_cpumask, op.arg1.linear_addr);
2374 break;
2376 case MMUEXT_FLUSH_CACHE:
2377 if ( unlikely(!cache_flush_permitted(d)) )
2379 MEM_LOG("Non-physdev domain tried to FLUSH_CACHE.");
2380 okay = 0;
2382 else
2384 wbinvd();
2386 break;
2388 case MMUEXT_SET_LDT:
2390 unsigned long ptr = op.arg1.linear_addr;
2391 unsigned long ents = op.arg2.nr_ents;
2393 if ( paging_mode_external(d) )
2395 MEM_LOG("ignoring SET_LDT hypercall from external domain");
2396 okay = 0;
2398 else if ( ((ptr & (PAGE_SIZE-1)) != 0) ||
2399 (ents > 8192) ||
2400 !array_access_ok(ptr, ents, LDT_ENTRY_SIZE) )
2402 okay = 0;
2403 MEM_LOG("Bad args to SET_LDT: ptr=%lx, ents=%lx", ptr, ents);
2405 else if ( (v->arch.guest_context.ldt_ents != ents) ||
2406 (v->arch.guest_context.ldt_base != ptr) )
2408 invalidate_shadow_ldt(v);
2409 v->arch.guest_context.ldt_base = ptr;
2410 v->arch.guest_context.ldt_ents = ents;
2411 load_LDT(v);
2412 this_cpu(percpu_mm_info).deferred_ops &= ~DOP_RELOAD_LDT;
2413 if ( ents != 0 )
2414 this_cpu(percpu_mm_info).deferred_ops |= DOP_RELOAD_LDT;
2416 break;
2419 default:
2420 MEM_LOG("Invalid extended pt command 0x%x", op.cmd);
2421 rc = -ENOSYS;
2422 okay = 0;
2423 break;
2426 if ( unlikely(!okay) )
2428 rc = rc ? rc : -EINVAL;
2429 break;
2432 guest_handle_add_offset(uops, 1);
2435 process_deferred_ops();
2437 domain_unlock(d);
2439 perfc_add(num_mmuext_ops, i);
2441 out:
2442 /* Add incremental work we have done to the @done output parameter. */
2443 if ( unlikely(!guest_handle_is_null(pdone)) )
2445 done += i;
2446 copy_to_guest(pdone, &done, 1);
2449 return rc;
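do_mmuext_op() (and do_mmu_update() below) use the same preemption idiom: when hypercall_preempt_check() fires mid-batch, the number of remaining operations is OR-ed with MMU_UPDATE_PREEMPTED and handed to a continuation, so the hypercall can restart where it left off. A stripped-down sketch of that encode/strip cycle, with an illustrative flag bit and preemption callback in place of the real ones:

#include <stdbool.h>
#include <stdio.h>

#define PREEMPTED_FLAG (1u << 31)   /* illustrative: top bit marks a resumed call */

/*
 * Process 'count' items, but stop early when 'need_preempt' fires; the
 * remaining work is re-encoded into the count so the call can be restarted.
 */
static unsigned int process_batch(unsigned int count, bool (*need_preempt)(void))
{
    unsigned int done = 0;

    if ( count & PREEMPTED_FLAG )
        count &= ~PREEMPTED_FLAG;       /* resuming: strip the marker */

    for ( ; done < count; done++ )
    {
        if ( need_preempt() )
            return (count - done) | PREEMPTED_FLAG;   /* "call me again" */
        /* ... handle item 'done' here ... */
    }

    return 0;                           /* everything processed */
}

static bool preempt_every_third(void)
{
    static unsigned int calls;
    return (++calls % 3) == 0;
}

int main(void)
{
    unsigned int rc = process_batch(10, preempt_every_third);
    printf("left over (with flag): %#x\n", rc);
    return 0;
}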
2452 int do_mmu_update(
2453 XEN_GUEST_HANDLE(mmu_update_t) ureqs,
2454 unsigned int count,
2455 XEN_GUEST_HANDLE(uint) pdone,
2456 unsigned int foreigndom)
2458 struct mmu_update req;
2459 void *va;
2460 unsigned long gpfn, gmfn, mfn;
2461 struct page_info *page;
2462 int rc = 0, okay = 1, i = 0;
2463 unsigned int cmd, done = 0;
2464 struct vcpu *v = current;
2465 struct domain *d = v->domain;
2466 unsigned long type_info;
2467 struct domain_mmap_cache mapcache;
2469 if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
2471 count &= ~MMU_UPDATE_PREEMPTED;
2472 if ( unlikely(!guest_handle_is_null(pdone)) )
2473 (void)copy_from_guest(&done, pdone, 1);
2475 else
2476 perfc_incr(calls_to_mmu_update);
2478 if ( unlikely(!guest_handle_okay(ureqs, count)) )
2480 rc = -EFAULT;
2481 goto out;
2484 if ( !set_foreigndom(foreigndom) )
2486 rc = -ESRCH;
2487 goto out;
2490 domain_mmap_cache_init(&mapcache);
2492 domain_lock(d);
2494 for ( i = 0; i < count; i++ )
2496 if ( hypercall_preempt_check() )
2498 rc = hypercall_create_continuation(
2499 __HYPERVISOR_mmu_update, "hihi",
2500 ureqs, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
2501 break;
2504 if ( unlikely(__copy_from_guest(&req, ureqs, 1) != 0) )
2506 MEM_LOG("Bad __copy_from_guest");
2507 rc = -EFAULT;
2508 break;
2511 cmd = req.ptr & (sizeof(l1_pgentry_t)-1);
2512 okay = 0;
2514 switch ( cmd )
2516 /*
2517 * MMU_NORMAL_PT_UPDATE: Normal update to any level of page table.
2518 * MMU_PT_UPDATE_PRESERVE_AD: As above, but also preserve (OR in) the
2519 * current accessed/dirty (A/D) bits.
2520 */
2521 case MMU_NORMAL_PT_UPDATE:
2522 case MMU_PT_UPDATE_PRESERVE_AD:
2523 rc = xsm_mmu_normal_update(d, req.val);
2524 if ( rc )
2525 break;
2527 req.ptr -= cmd;
2528 gmfn = req.ptr >> PAGE_SHIFT;
2529 mfn = gmfn_to_mfn(d, gmfn);
2531 if ( unlikely(!get_page_from_pagenr(mfn, d)) )
2533 MEM_LOG("Could not get page for normal update");
2534 break;
2537 va = map_domain_page_with_cache(mfn, &mapcache);
2538 va = (void *)((unsigned long)va +
2539 (unsigned long)(req.ptr & ~PAGE_MASK));
2540 page = mfn_to_page(mfn);
2542 switch ( (type_info = page->u.inuse.type_info) & PGT_type_mask )
2544 case PGT_l1_page_table:
2545 case PGT_l2_page_table:
2546 case PGT_l3_page_table:
2547 case PGT_l4_page_table:
2549 if ( paging_mode_refcounts(d) )
2551 MEM_LOG("mmu update on auto-refcounted domain!");
2552 break;
2555 if ( unlikely(!get_page_type(
2556 page, type_info & (PGT_type_mask|PGT_pae_xen_l2))) )
2557 goto not_a_pt;
2559 switch ( type_info & PGT_type_mask )
2561 case PGT_l1_page_table:
2563 l1_pgentry_t l1e = l1e_from_intpte(req.val);
2564 okay = mod_l1_entry(va, l1e, mfn,
2565 cmd == MMU_PT_UPDATE_PRESERVE_AD);
2567 break;
2568 case PGT_l2_page_table:
2570 l2_pgentry_t l2e = l2e_from_intpte(req.val);
2571 okay = mod_l2_entry(va, l2e, mfn, type_info,
2572 cmd == MMU_PT_UPDATE_PRESERVE_AD);
2574 break;
2575 #if CONFIG_PAGING_LEVELS >= 3
2576 case PGT_l3_page_table:
2578 l3_pgentry_t l3e = l3e_from_intpte(req.val);
2579 okay = mod_l3_entry(va, l3e, mfn,
2580 cmd == MMU_PT_UPDATE_PRESERVE_AD);
2582 break;
2583 #endif
2584 #if CONFIG_PAGING_LEVELS >= 4
2585 case PGT_l4_page_table:
2587 l4_pgentry_t l4e = l4e_from_intpte(req.val);
2588 okay = mod_l4_entry(va, l4e, mfn,
2589 cmd == MMU_PT_UPDATE_PRESERVE_AD);
2591 break;
2592 #endif
2595 put_page_type(page);
2597 break;
2599 default:
2600 not_a_pt:
2602 if ( unlikely(!get_page_type(page, PGT_writable_page)) )
2603 break;
2605 perfc_incr(writable_mmu_updates);
2607 okay = paging_write_guest_entry(v, va, req.val, _mfn(mfn));
2609 put_page_type(page);
2611 break;
2614 unmap_domain_page_with_cache(va, &mapcache);
2616 put_page(page);
2617 break;
2619 case MMU_MACHPHYS_UPDATE:
2621 mfn = req.ptr >> PAGE_SHIFT;
2622 gpfn = req.val;
2624 rc = xsm_mmu_machphys_update(d, mfn);
2625 if ( rc )
2626 break;
2628 if ( unlikely(!get_page_from_pagenr(mfn, FOREIGNDOM)) )
2630 MEM_LOG("Could not get page for mach->phys update");
2631 break;
2634 if ( unlikely(paging_mode_translate(FOREIGNDOM)) )
2636 MEM_LOG("Mach-phys update on auto-translate guest");
2637 break;
2640 set_gpfn_from_mfn(mfn, gpfn);
2641 okay = 1;
2643 paging_mark_dirty(FOREIGNDOM, mfn);
2645 put_page(mfn_to_page(mfn));
2646 break;
2648 default:
2649 MEM_LOG("Invalid page update command %x", cmd);
2650 rc = -ENOSYS;
2651 okay = 0;
2652 break;
2655 if ( unlikely(!okay) )
2657 rc = rc ? rc : -EINVAL;
2658 break;
2661 guest_handle_add_offset(ureqs, 1);
2664 process_deferred_ops();
2666 domain_unlock(d);
2668 domain_mmap_cache_destroy(&mapcache);
2670 perfc_add(num_page_updates, i);
2672 out:
2673 /* Add incremental work we have done to the @done output parameter. */
2674 if ( unlikely(!guest_handle_is_null(pdone)) )
2676 done += i;
2677 copy_to_guest(pdone, &done, 1);
2680 return rc;
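Each mmu_update request packs its command into the low bits of req.ptr: a PTE address is always at least sizeof(l1_pgentry_t)-aligned, so those bits are otherwise zero, and `cmd = ptr & (sizeof(l1_pgentry_t)-1)` followed by `ptr -= cmd` recovers both halves. A small standalone illustration of the same trick, assuming a 64-bit entry size:

#include <stdint.h>
#include <stdio.h>

/*
 * A pointer to an 8-byte-aligned entry always has its low three bits
 * clear, so they can carry a small command code "for free".
 */
#define CMD_MASK  (sizeof(uint64_t) - 1)

static uint64_t encode(uint64_t aligned_ptr, unsigned int cmd)
{
    return aligned_ptr | (cmd & CMD_MASK);
}

int main(void)
{
    uint64_t req_ptr = encode(0x12345678, 2);     /* aligned address + cmd 2 */

    unsigned int cmd  = req_ptr & CMD_MASK;       /* recover the command */
    uint64_t     addr = req_ptr - cmd;            /* recover the address */

    printf("cmd=%u addr=%#llx\n", cmd, (unsigned long long)addr);
    return 0;
}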
2684 static int create_grant_pte_mapping(
2685 uint64_t pte_addr, l1_pgentry_t nl1e, struct vcpu *v)
2687 int rc = GNTST_okay;
2688 void *va;
2689 unsigned long gmfn, mfn;
2690 struct page_info *page;
2691 u32 type;
2692 l1_pgentry_t ol1e;
2693 struct domain *d = v->domain;
2695 ASSERT(domain_is_locked(d));
2697 adjust_guest_l1e(nl1e, d);
2699 gmfn = pte_addr >> PAGE_SHIFT;
2700 mfn = gmfn_to_mfn(d, gmfn);
2702 if ( unlikely(!get_page_from_pagenr(mfn, current->domain)) )
2704 MEM_LOG("Could not get page for normal update");
2705 return GNTST_general_error;
2708 va = map_domain_page(mfn);
2709 va = (void *)((unsigned long)va + ((unsigned long)pte_addr & ~PAGE_MASK));
2710 page = mfn_to_page(mfn);
2712 type = page->u.inuse.type_info & PGT_type_mask;
2713 if ( (type != PGT_l1_page_table) || !get_page_type(page, type) )
2715 MEM_LOG("Grant map attempted to update a non-L1 page");
2716 rc = GNTST_general_error;
2717 goto failed;
2720 ol1e = *(l1_pgentry_t *)va;
2721 if ( !UPDATE_ENTRY(l1, (l1_pgentry_t *)va, ol1e, nl1e, mfn, v, 0) )
2723 put_page_type(page);
2724 rc = GNTST_general_error;
2725 goto failed;
2728 if ( !paging_mode_refcounts(d) )
2729 put_page_from_l1e(ol1e, d);
2731 put_page_type(page);
2733 failed:
2734 unmap_domain_page(va);
2735 put_page(page);
2737 return rc;
2740 static int destroy_grant_pte_mapping(
2741 uint64_t addr, unsigned long frame, struct domain *d)
2743 int rc = GNTST_okay;
2744 void *va;
2745 unsigned long gmfn, mfn;
2746 struct page_info *page;
2747 u32 type;
2748 l1_pgentry_t ol1e;
2750 gmfn = addr >> PAGE_SHIFT;
2751 mfn = gmfn_to_mfn(d, gmfn);
2753 if ( unlikely(!get_page_from_pagenr(mfn, current->domain)) )
2755 MEM_LOG("Could not get page for normal update");
2756 return GNTST_general_error;
2759 va = map_domain_page(mfn);
2760 va = (void *)((unsigned long)va + ((unsigned long)addr & ~PAGE_MASK));
2761 page = mfn_to_page(mfn);
2763 type = page->u.inuse.type_info & PGT_type_mask;
2764 if ( (type != PGT_l1_page_table) || !get_page_type(page, type) )
2766 MEM_LOG("Grant map attempted to update a non-L1 page");
2767 rc = GNTST_general_error;
2768 goto failed;
2771 if ( __copy_from_user(&ol1e, (l1_pgentry_t *)va, sizeof(ol1e)) )
2773 put_page_type(page);
2774 rc = GNTST_general_error;
2775 goto failed;
2778 /* Check that the virtual address supplied is actually mapped to frame. */
2779 if ( unlikely((l1e_get_intpte(ol1e) >> PAGE_SHIFT) != frame) )
2781 MEM_LOG("PTE entry %lx for address %"PRIx64" doesn't match frame %lx",
2782 (unsigned long)l1e_get_intpte(ol1e), addr, frame);
2783 put_page_type(page);
2784 rc = GNTST_general_error;
2785 goto failed;
2788 /* Delete pagetable entry. */
2789 if ( unlikely(!UPDATE_ENTRY
2790 (l1,
2791 (l1_pgentry_t *)va, ol1e, l1e_empty(), mfn,
2792 d->vcpu[0] /* Change if we go to per-vcpu shadows. */,
2793 0)) )
2795 MEM_LOG("Cannot delete PTE entry at %p", va);
2796 put_page_type(page);
2797 rc = GNTST_general_error;
2798 goto failed;
2801 put_page_type(page);
2803 failed:
2804 unmap_domain_page(va);
2805 put_page(page);
2806 return rc;
2810 static int create_grant_va_mapping(
2811 unsigned long va, l1_pgentry_t nl1e, struct vcpu *v)
2813 l1_pgentry_t *pl1e, ol1e;
2814 struct domain *d = v->domain;
2815 unsigned long gl1mfn;
2816 int okay;
2818 ASSERT(domain_is_locked(d));
2820 adjust_guest_l1e(nl1e, d);
2822 pl1e = guest_map_l1e(v, va, &gl1mfn);
2823 if ( !pl1e )
2825 MEM_LOG("Could not find L1 PTE for address %lx", va);
2826 return GNTST_general_error;
2828 ol1e = *pl1e;
2829 okay = UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, v, 0);
2830 guest_unmap_l1e(v, pl1e);
2831 pl1e = NULL;
2833 if ( !okay )
2834 return GNTST_general_error;
2836 if ( !paging_mode_refcounts(d) )
2837 put_page_from_l1e(ol1e, d);
2839 return GNTST_okay;
2842 static int replace_grant_va_mapping(
2843 unsigned long addr, unsigned long frame, l1_pgentry_t nl1e, struct vcpu *v)
2845 l1_pgentry_t *pl1e, ol1e;
2846 unsigned long gl1mfn;
2847 int rc = 0;
2849 pl1e = guest_map_l1e(v, addr, &gl1mfn);
2850 if ( !pl1e )
2852 MEM_LOG("Could not find L1 PTE for address %lx", addr);
2853 return GNTST_general_error;
2855 ol1e = *pl1e;
2857 /* Check that the virtual address supplied is actually mapped to frame. */
2858 if ( unlikely(l1e_get_pfn(ol1e) != frame) )
2860 MEM_LOG("PTE entry %lx for address %lx doesn't match frame %lx",
2861 l1e_get_pfn(ol1e), addr, frame);
2862 rc = GNTST_general_error;
2863 goto out;
2866 /* Delete pagetable entry. */
2867 if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, v, 0)) )
2869 MEM_LOG("Cannot delete PTE entry at %p", (unsigned long *)pl1e);
2870 rc = GNTST_general_error;
2871 goto out;
2874 out:
2875 guest_unmap_l1e(v, pl1e);
2876 return rc;
2879 static int destroy_grant_va_mapping(
2880 unsigned long addr, unsigned long frame, struct vcpu *v)
2882 return replace_grant_va_mapping(addr, frame, l1e_empty(), v);
2885 int create_grant_host_mapping(uint64_t addr, unsigned long frame,
2886 unsigned int flags, unsigned int cache_flags)
2888 l1_pgentry_t pte = l1e_from_pfn(frame, GRANT_PTE_FLAGS);
2890 if ( (flags & GNTMAP_application_map) )
2891 l1e_add_flags(pte,_PAGE_USER);
2892 if ( !(flags & GNTMAP_readonly) )
2893 l1e_add_flags(pte,_PAGE_RW);
2895 l1e_add_flags(pte, cacheattr_to_pte_flags(cache_flags >> 5));
2897 if ( flags & GNTMAP_contains_pte )
2898 return create_grant_pte_mapping(addr, pte, current);
2899 return create_grant_va_mapping(addr, pte, current);
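create_grant_host_mapping() builds the new L1 entry by starting from GRANT_PTE_FLAGS and then widening or narrowing access according to the caller's GNTMAP_* flags. Below is a simplified, self-contained sketch of that flag composition; the PTE and request flag values are stand-ins for illustration, not the real _PAGE_*/GNTMAP_* definitions:

#include <stdint.h>
#include <stdio.h>

/* Illustrative PTE flag bits (matching the usual x86 bit positions). */
#define PTE_PRESENT  (1u << 0)
#define PTE_RW       (1u << 1)
#define PTE_USER     (1u << 2)

/* Illustrative request flags, standing in for GNTMAP_*. */
#define MAP_APPLICATION (1u << 0)
#define MAP_READONLY    (1u << 1)

/* Build a PTE for 'frame', adjusting access per the mapping request. */
static uint64_t build_pte(uint64_t frame, unsigned int map_flags)
{
    uint64_t pte = (frame << 12) | PTE_PRESENT;

    if ( map_flags & MAP_APPLICATION )
        pte |= PTE_USER;                 /* guest userspace may use the mapping */
    if ( !(map_flags & MAP_READONLY) )
        pte |= PTE_RW;                   /* writable unless read-only requested */
    return pte;
}

int main(void)
{
    printf("%#llx\n", (unsigned long long)build_pte(0x1234, MAP_APPLICATION));
    return 0;
}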
2902 int replace_grant_host_mapping(
2903 uint64_t addr, unsigned long frame, uint64_t new_addr, unsigned int flags)
2905 struct vcpu *curr = current;
2906 l1_pgentry_t *pl1e, ol1e;
2907 unsigned long gl1mfn;
2908 int rc;
2910 if ( flags & GNTMAP_contains_pte )
2912 if ( !new_addr )
2913 return destroy_grant_pte_mapping(addr, frame, curr->domain);
2915 MEM_LOG("Unsupported grant table operation");
2916 return GNTST_general_error;
2919 if ( !new_addr )
2920 return destroy_grant_va_mapping(addr, frame, curr);
2922 pl1e = guest_map_l1e(curr, new_addr, &gl1mfn);
2923 if ( !pl1e )
2925 MEM_LOG("Could not find L1 PTE for address %lx",
2926 (unsigned long)new_addr);
2927 return GNTST_general_error;
2929 ol1e = *pl1e;
2931 if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, l1e_empty(),
2932 gl1mfn, curr, 0)) )
2934 MEM_LOG("Cannot delete PTE entry at %p", (unsigned long *)pl1e);
2935 guest_unmap_l1e(curr, pl1e);
2936 return GNTST_general_error;
2939 guest_unmap_l1e(curr, pl1e);
2941 rc = replace_grant_va_mapping(addr, frame, ol1e, curr);
2942 if ( rc && !paging_mode_refcounts(curr->domain) )
2943 put_page_from_l1e(ol1e, curr->domain);
2945 return rc;
2948 int steal_page(
2949 struct domain *d, struct page_info *page, unsigned int memflags)
2951 u32 _d, _nd, x, y;
2953 spin_lock(&d->page_alloc_lock);
2955 /*
2956 * The tricky bit: atomically release ownership while there is just one
2957 * benign reference to the page (PGC_allocated). If that reference
2958 * disappears then the deallocation routine will safely spin.
2959 */
2960 _d = pickle_domptr(d);
2961 _nd = page->u.inuse._domain;
2962 y = page->count_info;
2963 do {
2964 x = y;
2965 if ( unlikely((x & (PGC_count_mask|PGC_allocated)) !=
2966 (1 | PGC_allocated)) || unlikely(_nd != _d) )
2968 MEM_LOG("gnttab_transfer: Bad page %p: ed=%p(%u), sd=%p,"
2969 " caf=%08x, taf=%" PRtype_info "\n",
2970 (void *) page_to_mfn(page),
2971 d, d->domain_id, unpickle_domptr(_nd), x,
2972 page->u.inuse.type_info);
2973 spin_unlock(&d->page_alloc_lock);
2974 return -1;
2976 asm volatile (
2977 LOCK_PREFIX "cmpxchg8b %2"
2978 : "=d" (_nd), "=a" (y),
2979 "=m" (*(volatile u64 *)(&page->count_info))
2980 : "0" (_d), "1" (x), "c" (NULL), "b" (x) );
2981 } while (unlikely(_nd != _d) || unlikely(y != x));
2983 /*
2984 * Unlink from 'd'. At least one reference remains (now anonymous), so
2985 * no one else is spinning to try to delete this page from 'd'.
2986 */
2987 if ( !(memflags & MEMF_no_refcount) )
2988 d->tot_pages--;
2989 list_del(&page->list);
2991 spin_unlock(&d->page_alloc_lock);
2993 return 0;
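The cmpxchg8b loop above updates the owner and the reference count as a single unit, so ownership can only be released while exactly one (PGC_allocated) reference is held. A rough equivalent using one packed 64-bit word and C11 atomics instead of the raw cmpxchg8b; the field layout here is illustrative only:

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* Pack an owner id and a reference count into one 64-bit word so both
 * can be checked and updated by a single compare-and-swap. */
struct page_meta { _Atomic uint64_t owner_and_count; };

#define PACK(owner, count)   (((uint64_t)(owner) << 32) | (uint32_t)(count))
#define OWNER(word)          ((uint32_t)((word) >> 32))
#define COUNT(word)          ((uint32_t)(word))

/* Release ownership only while exactly one reference is held. */
static int steal(struct page_meta *pg, uint32_t expected_owner)
{
    uint64_t x = atomic_load(&pg->owner_and_count);

    do {
        if ( OWNER(x) != expected_owner || COUNT(x) != 1 )
            return -1;                     /* page busy or owned elsewhere */
    } while ( !atomic_compare_exchange_weak(&pg->owner_and_count, &x,
                                            PACK(0 /* no owner */, 1)) );
    return 0;
}

int main(void)
{
    struct page_meta pg = { PACK(7, 1) };
    printf("steal: %d\n", steal(&pg, 7));  /* 0 on success */
    return 0;
}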
2996 int do_update_va_mapping(unsigned long va, u64 val64,
2997 unsigned long flags)
2999 l1_pgentry_t val = l1e_from_intpte(val64);
3000 struct vcpu *v = current;
3001 struct domain *d = v->domain;
3002 l1_pgentry_t *pl1e;
3003 unsigned long vmask, bmap_ptr, gl1mfn;
3004 cpumask_t pmask;
3005 int rc = 0;
3007 perfc_incr(calls_to_update_va);
3009 if ( unlikely(!access_ok(va, 1) && !paging_mode_external(d)) )
3010 return -EINVAL;
3012 rc = xsm_update_va_mapping(current->domain, val);
3013 if ( rc )
3014 return rc;
3016 domain_lock(d);
3018 pl1e = guest_map_l1e(v, va, &gl1mfn);
3020 if ( unlikely(!pl1e || !mod_l1_entry(pl1e, val, gl1mfn, 0)) )
3021 rc = -EINVAL;
3023 if ( pl1e )
3024 guest_unmap_l1e(v, pl1e);
3025 pl1e = NULL;
3027 process_deferred_ops();
3029 domain_unlock(d);
3031 switch ( flags & UVMF_FLUSHTYPE_MASK )
3033 case UVMF_TLB_FLUSH:
3034 switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
3036 case UVMF_LOCAL:
3037 flush_tlb_local();
3038 break;
3039 case UVMF_ALL:
3040 flush_tlb_mask(d->domain_dirty_cpumask);
3041 break;
3042 default:
3043 if ( unlikely(!is_pv_32on64_domain(d) ?
3044 get_user(vmask, (unsigned long *)bmap_ptr) :
3045 get_user(vmask, (unsigned int *)bmap_ptr)) )
3046 rc = -EFAULT;
3047 pmask = vcpumask_to_pcpumask(d, vmask);
3048 flush_tlb_mask(pmask);
3049 break;
3051 break;
3053 case UVMF_INVLPG:
3054 switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
3056 case UVMF_LOCAL:
3057 if ( !paging_mode_enabled(d) ||
3058 (paging_invlpg(v, va) != 0) )
3059 flush_tlb_one_local(va);
3060 break;
3061 case UVMF_ALL:
3062 flush_tlb_one_mask(d->domain_dirty_cpumask, va);
3063 break;
3064 default:
3065 if ( unlikely(!is_pv_32on64_domain(d) ?
3066 get_user(vmask, (unsigned long *)bmap_ptr) :
3067 get_user(vmask, (unsigned int *)bmap_ptr)) )
3068 rc = -EFAULT;
3069 pmask = vcpumask_to_pcpumask(d, vmask);
3070 flush_tlb_one_mask(pmask, va);
3071 break;
3073 break;
3076 return rc;
3079 int do_update_va_mapping_otherdomain(unsigned long va, u64 val64,
3080 unsigned long flags,
3081 domid_t domid)
3083 int rc;
3085 if ( !set_foreigndom(domid) )
3086 return -ESRCH;
3088 rc = do_update_va_mapping(va, val64, flags);
3090 BUG_ON(this_cpu(percpu_mm_info).deferred_ops);
3091 process_deferred_ops(); /* only to clear foreigndom */
3093 return rc;
3098 /*************************
3099 * Descriptor Tables
3100 */
3102 void destroy_gdt(struct vcpu *v)
3104 int i;
3105 unsigned long pfn;
3107 v->arch.guest_context.gdt_ents = 0;
3108 for ( i = 0; i < FIRST_RESERVED_GDT_PAGE; i++ )
3110 if ( (pfn = l1e_get_pfn(v->arch.perdomain_ptes[i])) != 0 )
3111 put_page_and_type(mfn_to_page(pfn));
3112 l1e_write(&v->arch.perdomain_ptes[i], l1e_empty());
3113 v->arch.guest_context.gdt_frames[i] = 0;
3118 long set_gdt(struct vcpu *v,
3119 unsigned long *frames,
3120 unsigned int entries)
3122 struct domain *d = v->domain;
3123 /* NB. There are 512 8-byte entries per GDT page. */
3124 int i, nr_pages = (entries + 511) / 512;
3125 unsigned long mfn;
3127 if ( entries > FIRST_RESERVED_GDT_ENTRY )
3128 return -EINVAL;
3130 /* Check the pages in the new GDT. */
3131 for ( i = 0; i < nr_pages; i++ )
3133 mfn = frames[i] = gmfn_to_mfn(d, frames[i]);
3134 if ( !mfn_valid(mfn) ||
3135 !get_page_and_type(mfn_to_page(mfn), d, PGT_seg_desc_page) )
3136 goto fail;
3139 /* Tear down the old GDT. */
3140 destroy_gdt(v);
3142 /* Install the new GDT. */
3143 v->arch.guest_context.gdt_ents = entries;
3144 for ( i = 0; i < nr_pages; i++ )
3146 v->arch.guest_context.gdt_frames[i] = frames[i];
3147 l1e_write(&v->arch.perdomain_ptes[i],
3148 l1e_from_pfn(frames[i], __PAGE_HYPERVISOR));
3151 return 0;
3153 fail:
3154 while ( i-- > 0 )
3155 put_page_and_type(mfn_to_page(frames[i]));
3156 return -EINVAL;
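The nr_pages computation is just a ceiling division: with 512 eight-byte descriptors per page, `(entries + 511) / 512` rounds up to the number of GDT frames needed. For example:

#include <stdio.h>

/* Each GDT page holds 512 eight-byte descriptors, so round up. */
static unsigned int gdt_pages(unsigned int entries)
{
    return (entries + 511) / 512;
}

int main(void)
{
    printf("%u\n", gdt_pages(1));     /* 1 page  */
    printf("%u\n", gdt_pages(512));   /* 1 page  */
    printf("%u\n", gdt_pages(513));   /* 2 pages */
    return 0;
}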
3160 long do_set_gdt(XEN_GUEST_HANDLE(ulong) frame_list, unsigned int entries)
3162 int nr_pages = (entries + 511) / 512;
3163 unsigned long frames[16];
3164 struct vcpu *curr = current;
3165 long ret;
3167 /* Rechecked in set_gdt, but ensures a sane limit for copy_from_user(). */
3168 if ( entries > FIRST_RESERVED_GDT_ENTRY )
3169 return -EINVAL;
3171 if ( copy_from_guest(frames, frame_list, nr_pages) )
3172 return -EFAULT;
3174 domain_lock(curr->domain);
3176 if ( (ret = set_gdt(curr, frames, entries)) == 0 )
3177 flush_tlb_local();
3179 domain_unlock(curr->domain);
3181 return ret;
3185 long do_update_descriptor(u64 pa, u64 desc)
3187 struct domain *dom = current->domain;
3188 unsigned long gmfn = pa >> PAGE_SHIFT;
3189 unsigned long mfn;
3190 unsigned int offset;
3191 struct desc_struct *gdt_pent, d;
3192 struct page_info *page;
3193 long ret = -EINVAL;
3195 offset = ((unsigned int)pa & ~PAGE_MASK) / sizeof(struct desc_struct);
3197 *(u64 *)&d = desc;
3199 mfn = gmfn_to_mfn(dom, gmfn);
3200 if ( (((unsigned int)pa % sizeof(struct desc_struct)) != 0) ||
3201 !mfn_valid(mfn) ||
3202 !check_descriptor(dom, &d) )
3203 return -EINVAL;
3205 page = mfn_to_page(mfn);
3206 if ( unlikely(!get_page(page, dom)) )
3207 return -EINVAL;
3209 /* Check if the given frame is in use in an unsafe context. */
3210 switch ( page->u.inuse.type_info & PGT_type_mask )
3212 case PGT_seg_desc_page:
3213 if ( unlikely(!get_page_type(page, PGT_seg_desc_page)) )
3214 goto out;
3215 break;
3216 default:
3217 if ( unlikely(!get_page_type(page, PGT_writable_page)) )
3218 goto out;
3219 break;
3222 paging_mark_dirty(dom, mfn);
3224 /* All is good so make the update. */
3225 gdt_pent = map_domain_page(mfn);
3226 atomic_write64((uint64_t *)&gdt_pent[offset], *(uint64_t *)&d);
3227 unmap_domain_page(gdt_pent);
3229 put_page_type(page);
3231 ret = 0; /* success */
3233 out:
3234 put_page(page);
3236 return ret;
3239 typedef struct e820entry e820entry_t;
3240 DEFINE_XEN_GUEST_HANDLE(e820entry_t);
3242 long arch_memory_op(int op, XEN_GUEST_HANDLE(void) arg)
3244 switch ( op )
3246 case XENMEM_add_to_physmap:
3248 struct xen_add_to_physmap xatp;
3249 unsigned long prev_mfn, mfn = 0, gpfn;
3250 struct domain *d;
3252 if ( copy_from_guest(&xatp, arg, 1) )
3253 return -EFAULT;
3255 if ( xatp.domid == DOMID_SELF )
3257 d = rcu_lock_current_domain();
3259 else
3261 if ( (d = rcu_lock_domain_by_id(xatp.domid)) == NULL )
3262 return -ESRCH;
3263 if ( !IS_PRIV_FOR(current->domain, d) )
3265 rcu_unlock_domain(d);
3266 return -EPERM;
3270 if ( xsm_add_to_physmap(current->domain, d) )
3272 rcu_unlock_domain(d);
3273 return -EPERM;
3276 switch ( xatp.space )
3278 case XENMAPSPACE_shared_info:
3279 if ( xatp.idx == 0 )
3280 mfn = virt_to_mfn(d->shared_info);
3281 break;
3282 case XENMAPSPACE_grant_table:
3283 spin_lock(&d->grant_table->lock);
3285 if ( (xatp.idx >= nr_grant_frames(d->grant_table)) &&
3286 (xatp.idx < max_nr_grant_frames) )
3287 gnttab_grow_table(d, xatp.idx + 1);
3289 if ( xatp.idx < nr_grant_frames(d->grant_table) )
3290 mfn = virt_to_mfn(d->grant_table->shared[xatp.idx]);
3292 spin_unlock(&d->grant_table->lock);
3293 break;
3294 default:
3295 break;
3298 if ( !paging_mode_translate(d) || (mfn == 0) )
3300 rcu_unlock_domain(d);
3301 return -EINVAL;
3304 domain_lock(d);
3306 /* Remove previously mapped page if it was present. */
3307 prev_mfn = gmfn_to_mfn(d, xatp.gpfn);
3308 if ( mfn_valid(prev_mfn) )
3310 if ( is_xen_heap_mfn(prev_mfn) )
3311 /* Xen heap frames are simply unhooked from this phys slot. */
3312 guest_physmap_remove_page(d, xatp.gpfn, prev_mfn, 0);
3313 else
3314 /* Normal domain memory is freed, to avoid leaking memory. */
3315 guest_remove_page(d, xatp.gpfn);
3318 /* Unmap from old location, if any. */
3319 gpfn = get_gpfn_from_mfn(mfn);
3320 if ( gpfn != INVALID_M2P_ENTRY )
3321 guest_physmap_remove_page(d, gpfn, mfn, 0);
3323 /* Map at new location. */
3324 guest_physmap_add_page(d, xatp.gpfn, mfn, 0);
3326 domain_unlock(d);
3328 rcu_unlock_domain(d);
3330 break;
3333 case XENMEM_set_memory_map:
3335 struct xen_foreign_memory_map fmap;
3336 struct domain *d;
3337 int rc;
3339 if ( copy_from_guest(&fmap, arg, 1) )
3340 return -EFAULT;
3342 if ( fmap.map.nr_entries > ARRAY_SIZE(d->arch.e820) )
3343 return -EINVAL;
3345 if ( fmap.domid == DOMID_SELF )
3347 d = rcu_lock_current_domain();
3349 else
3351 if ( (d = rcu_lock_domain_by_id(fmap.domid)) == NULL )
3352 return -ESRCH;
3353 if ( !IS_PRIV_FOR(current->domain, d) )
3355 rcu_unlock_domain(d);
3356 return -EPERM;
3360 rc = xsm_domain_memory_map(d);
3361 if ( rc )
3363 rcu_unlock_domain(d);
3364 return rc;
3367 rc = copy_from_guest(d->arch.e820, fmap.map.buffer,
3368 fmap.map.nr_entries) ? -EFAULT : 0;
3369 d->arch.nr_e820 = fmap.map.nr_entries;
3371 rcu_unlock_domain(d);
3372 return rc;
3375 case XENMEM_memory_map:
3377 struct xen_memory_map map;
3378 struct domain *d = current->domain;
3380 /* Backwards compatibility. */
3381 if ( d->arch.nr_e820 == 0 )
3382 return -ENOSYS;
3384 if ( copy_from_guest(&map, arg, 1) )
3385 return -EFAULT;
3387 map.nr_entries = min(map.nr_entries, d->arch.nr_e820);
3388 if ( copy_to_guest(map.buffer, d->arch.e820, map.nr_entries) ||
3389 copy_to_guest(arg, &map, 1) )
3390 return -EFAULT;
3392 return 0;
3395 case XENMEM_machine_memory_map:
3397 struct xen_memory_map memmap;
3398 XEN_GUEST_HANDLE(e820entry_t) buffer;
3399 int count;
3400 int rc;
3402 if ( !IS_PRIV(current->domain) )
3403 return -EINVAL;
3405 rc = xsm_machine_memory_map();
3406 if ( rc )
3407 return rc;
3409 if ( copy_from_guest(&memmap, arg, 1) )
3410 return -EFAULT;
3411 if ( memmap.nr_entries < e820.nr_map + 1 )
3412 return -EINVAL;
3414 buffer = guest_handle_cast(memmap.buffer, e820entry_t);
3416 count = min((unsigned int)e820.nr_map, memmap.nr_entries);
3417 if ( copy_to_guest(buffer, e820.map, count) < 0 )
3418 return -EFAULT;
3420 memmap.nr_entries = count;
3422 if ( copy_to_guest(arg, &memmap, 1) )
3423 return -EFAULT;
3425 return 0;
3428 case XENMEM_machphys_mapping:
3430 static const struct xen_machphys_mapping mapping = {
3431 .v_start = MACH2PHYS_VIRT_START,
3432 .v_end = MACH2PHYS_VIRT_END,
3433 .max_mfn = MACH2PHYS_NR_ENTRIES - 1
3434 };
3436 if ( copy_to_guest(arg, &mapping, 1) )
3437 return -EFAULT;
3439 return 0;
3442 default:
3443 return subarch_memory_op(op, arg);
3446 return 0;
3450 /*************************
3451 * Writable Pagetables
3452 */
3454 struct ptwr_emulate_ctxt {
3455 struct x86_emulate_ctxt ctxt;
3456 unsigned long cr2;
3457 l1_pgentry_t pte;
3458 };
3460 static int ptwr_emulated_read(
3461 enum x86_segment seg,
3462 unsigned long offset,
3463 unsigned long *val,
3464 unsigned int bytes,
3465 struct x86_emulate_ctxt *ctxt)
3467 unsigned int rc;
3468 unsigned long addr = offset;
3470 *val = 0;
3471 if ( (rc = copy_from_user((void *)val, (void *)addr, bytes)) != 0 )
3473 propagate_page_fault(addr + bytes - rc, 0); /* read fault */
3474 return X86EMUL_EXCEPTION;
3477 return X86EMUL_OKAY;
3480 static int ptwr_emulated_update(
3481 unsigned long addr,
3482 paddr_t old,
3483 paddr_t val,
3484 unsigned int bytes,
3485 unsigned int do_cmpxchg,
3486 struct ptwr_emulate_ctxt *ptwr_ctxt)
3488 unsigned long mfn;
3489 unsigned long unaligned_addr = addr;
3490 struct page_info *page;
3491 l1_pgentry_t pte, ol1e, nl1e, *pl1e;
3492 struct vcpu *v = current;
3493 struct domain *d = v->domain;
3495 /* Only allow naturally-aligned stores within the original %cr2 page. */
3496 if ( unlikely(((addr^ptwr_ctxt->cr2) & PAGE_MASK) || (addr & (bytes-1))) )
3498 MEM_LOG("Bad ptwr access (cr2=%lx, addr=%lx, bytes=%u)",
3499 ptwr_ctxt->cr2, addr, bytes);
3500 return X86EMUL_UNHANDLEABLE;
3503 /* Turn a sub-word access into a full-word access. */
3504 if ( bytes != sizeof(paddr_t) )
3506 paddr_t full;
3507 unsigned int rc, offset = addr & (sizeof(paddr_t)-1);
3509 /* Align address; read full word. */
3510 addr &= ~(sizeof(paddr_t)-1);
3511 if ( (rc = copy_from_user(&full, (void *)addr, sizeof(paddr_t))) != 0 )
3513 propagate_page_fault(addr+sizeof(paddr_t)-rc, 0); /* read fault */
3514 return X86EMUL_EXCEPTION;
3516 /* Mask out bits provided by caller. */
3517 full &= ~((((paddr_t)1 << (bytes*8)) - 1) << (offset*8));
3518 /* Shift the caller value and OR in the missing bits. */
3519 val &= (((paddr_t)1 << (bytes*8)) - 1);
3520 val <<= (offset)*8;
3521 val |= full;
3522 /* Also fill in missing parts of the cmpxchg old value. */
3523 old &= (((paddr_t)1 << (bytes*8)) - 1);
3524 old <<= (offset)*8;
3525 old |= full;
3528 pte = ptwr_ctxt->pte;
3529 mfn = l1e_get_pfn(pte);
3530 page = mfn_to_page(mfn);
3532 /* We are looking only for read-only mappings of p.t. pages. */
3533 ASSERT((l1e_get_flags(pte) & (_PAGE_RW|_PAGE_PRESENT)) == _PAGE_PRESENT);
3534 ASSERT(mfn_valid(mfn));
3535 ASSERT((page->u.inuse.type_info & PGT_type_mask) == PGT_l1_page_table);
3536 ASSERT((page->u.inuse.type_info & PGT_count_mask) != 0);
3537 ASSERT(page_get_owner(page) == d);
3539 /* Check the new PTE. */
3540 nl1e = l1e_from_intpte(val);
3541 if ( unlikely(!get_page_from_l1e(nl1e, d)) )
3543 if ( (CONFIG_PAGING_LEVELS >= 3) && is_pv_32bit_domain(d) &&
3544 (bytes == 4) && (unaligned_addr & 4) && !do_cmpxchg &&
3545 (l1e_get_flags(nl1e) & _PAGE_PRESENT) )
3547 /*
3548 * If this is an upper-half write to a PAE PTE then we assume that
3549 * the guest has simply got the two writes the wrong way round. We
3550 * zap the PRESENT bit on the assumption that the bottom half will
3551 * be written immediately after we return to the guest.
3552 */
3553 MEM_LOG("ptwr_emulate: fixing up invalid PAE PTE %"PRIpte,
3554 l1e_get_intpte(nl1e));
3555 l1e_remove_flags(nl1e, _PAGE_PRESENT);
3557 else
3559 MEM_LOG("ptwr_emulate: could not get_page_from_l1e()");
3560 return X86EMUL_UNHANDLEABLE;
3564 adjust_guest_l1e(nl1e, d);
3566 /* Checked successfully: do the update (write or cmpxchg). */
3567 pl1e = map_domain_page(mfn);
3568 pl1e = (l1_pgentry_t *)((unsigned long)pl1e + (addr & ~PAGE_MASK));
3569 if ( do_cmpxchg )
3571 int okay;
3572 intpte_t t = old;
3573 ol1e = l1e_from_intpte(old);
3575 okay = paging_cmpxchg_guest_entry(v, &l1e_get_intpte(*pl1e),
3576 &t, l1e_get_intpte(nl1e), _mfn(mfn));
3577 okay = (okay && t == old);
3579 if ( !okay )
3581 unmap_domain_page(pl1e);
3582 put_page_from_l1e(nl1e, d);
3583 return X86EMUL_CMPXCHG_FAILED;
3586 else
3588 ol1e = *pl1e;
3589 if ( !UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, mfn, v, 0) )
3590 BUG();
3593 trace_ptwr_emulation(addr, nl1e);
3595 unmap_domain_page(pl1e);
3597 /* Finally, drop the old PTE. */
3598 put_page_from_l1e(ol1e, d);
3600 return X86EMUL_OKAY;
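The sub-word handling above widens a 1/2/4-byte store into a full paddr_t-sized value: read the aligned word, clear the bytes being written, and splice in the caller's value (and, for cmpxchg, the old value) before the PTE is validated and written back. The same merge in isolation, assuming a little-endian 64-bit word:

#include <stdint.h>
#include <stdio.h>

/* Merge a 'bytes'-wide store at byte offset 'offset' into the word 'full'. */
static uint64_t merge_subword(uint64_t full, uint64_t val,
                              unsigned int offset, unsigned int bytes)
{
    uint64_t mask = ((bytes < 8) ? ((uint64_t)1 << (bytes * 8)) : 0) - 1;

    full &= ~(mask << (offset * 8));          /* clear the bytes being written */
    full |= (val & mask) << (offset * 8);     /* splice in the caller's value  */
    return full;
}

int main(void)
{
    uint64_t word = 0x1122334455667788ULL;

    /* Overwrite bytes 4..5 with 0xabcd: prints 0x1122abcd55667788. */
    printf("%#llx\n",
           (unsigned long long)merge_subword(word, 0xabcd, 4, 2));
    return 0;
}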
3603 static int ptwr_emulated_write(
3604 enum x86_segment seg,
3605 unsigned long offset,
3606 unsigned long val,
3607 unsigned int bytes,
3608 struct x86_emulate_ctxt *ctxt)
3610 return ptwr_emulated_update(
3611 offset, 0, val, bytes, 0,
3612 container_of(ctxt, struct ptwr_emulate_ctxt, ctxt));
3615 static int ptwr_emulated_cmpxchg(
3616 enum x86_segment seg,
3617 unsigned long offset,
3618 void *p_old,
3619 void *p_new,
3620 unsigned int bytes,
3621 struct x86_emulate_ctxt *ctxt)
3623 paddr_t old = 0, new = 0;
3624 if ( bytes > sizeof(paddr_t) )
3625 return X86EMUL_UNHANDLEABLE;
3626 memcpy(&old, p_old, bytes);
3627 memcpy(&new, p_new, bytes);
3628 return ptwr_emulated_update(
3629 offset, old, new, bytes, 1,
3630 container_of(ctxt, struct ptwr_emulate_ctxt, ctxt));
3633 static struct x86_emulate_ops ptwr_emulate_ops = {
3634 .read = ptwr_emulated_read,
3635 .insn_fetch = ptwr_emulated_read,
3636 .write = ptwr_emulated_write,
3637 .cmpxchg = ptwr_emulated_cmpxchg,
3638 };
3640 /* Write page fault handler: check if guest is trying to modify a PTE. */
3641 int ptwr_do_page_fault(struct vcpu *v, unsigned long addr,
3642 struct cpu_user_regs *regs)
3644 struct domain *d = v->domain;
3645 struct page_info *page;
3646 l1_pgentry_t pte;
3647 struct ptwr_emulate_ctxt ptwr_ctxt;
3648 int rc;
3650 domain_lock(d);
3652 /* Attempt to read the PTE that maps the VA being accessed. */
3653 guest_get_eff_l1e(v, addr, &pte);
3654 page = l1e_get_page(pte);
3656 /* We are looking only for read-only mappings of p.t. pages. */
3657 if ( ((l1e_get_flags(pte) & (_PAGE_PRESENT|_PAGE_RW)) != _PAGE_PRESENT) ||
3658 !mfn_valid(l1e_get_pfn(pte)) ||
3659 ((page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table) ||
3660 ((page->u.inuse.type_info & PGT_count_mask) == 0) ||
3661 (page_get_owner(page) != d) )
3662 goto bail;
3664 ptwr_ctxt.ctxt.regs = regs;
3665 ptwr_ctxt.ctxt.force_writeback = 0;
3666 ptwr_ctxt.ctxt.addr_size = ptwr_ctxt.ctxt.sp_size =
3667 is_pv_32on64_domain(d) ? 32 : BITS_PER_LONG;
3668 ptwr_ctxt.cr2 = addr;
3669 ptwr_ctxt.pte = pte;
3671 rc = x86_emulate(&ptwr_ctxt.ctxt, &ptwr_emulate_ops);
3672 if ( rc == X86EMUL_UNHANDLEABLE )
3673 goto bail;
3675 domain_unlock(d);
3676 perfc_incr(ptwr_emulations);
3677 return EXCRET_fault_fixed;
3679 bail:
3680 domain_unlock(d);
3681 return 0;
3684 void free_xen_pagetable(void *v)
3686 extern int early_boot;
3688 if ( early_boot )
3689 return;
3691 if ( is_xen_heap_page(virt_to_page(v)) )
3692 free_xenheap_page(v);
3693 else
3694 free_domheap_page(virt_to_page(v));
3698 /* Convert to/from superpage-mapping flags for map_pages_to_xen(). */
3698 #define l1f_to_lNf(f) (((f) & _PAGE_PRESENT) ? ((f) | _PAGE_PSE) : (f))
3699 #define lNf_to_l1f(f) (((f) & _PAGE_PRESENT) ? ((f) & ~_PAGE_PSE) : (f))
3701 /*
3702 * map_pages_to_xen() can be called with interrupts disabled:
3703 * * During early bootstrap; or
3704 * * From alloc_xenheap_pages() via memguard_guard_range().
3705 * In these cases it is safe to use flush_area_local():
3706 * * Because only the local CPU is online; or
3707 * * Because stale TLB entries do not matter for memguard_[un]guard_range().
3708 */
3709 #define flush_area(v,f) (!local_irq_is_enabled() ? \
3710 flush_area_local((const void *)v, f) : \
3711 flush_area_all((const void *)v, f))
3713 int map_pages_to_xen(
3714 unsigned long virt,
3715 unsigned long mfn,
3716 unsigned long nr_mfns,
3717 unsigned int flags)
3719 l2_pgentry_t *pl2e, ol2e;
3720 l1_pgentry_t *pl1e, ol1e;
3721 unsigned int i;
3723 while ( nr_mfns != 0 )
3725 #ifdef __x86_64__
3726 l3_pgentry_t *pl3e = virt_to_xen_l3e(virt);
3727 l3_pgentry_t ol3e = *pl3e;
3729 if ( cpu_has_page1gb &&
3730 !(((virt >> PAGE_SHIFT) | mfn) &
3731 ((1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1)) &&
3732 nr_mfns >= (1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) &&
3733 !(flags & (_PAGE_PAT | MAP_SMALL_PAGES)) )
3735 /* 1GB-page mapping. */
3736 l3e_write_atomic(pl3e, l3e_from_pfn(mfn, l1f_to_lNf(flags)));
3738 if ( (l3e_get_flags(ol3e) & _PAGE_PRESENT) )
3740 unsigned int flush_flags =
3741 FLUSH_TLB | FLUSH_ORDER(2 * PAGETABLE_ORDER);
3743 if ( l3e_get_flags(ol3e) & _PAGE_PSE )
3745 if ( l3e_get_flags(ol3e) & _PAGE_GLOBAL )
3746 flush_flags |= FLUSH_TLB_GLOBAL;
3747 if ( (l1f_to_lNf(l3e_get_flags(ol3e)) ^ flags) &
3748 PAGE_CACHE_ATTRS )
3749 flush_flags |= FLUSH_CACHE;
3750 flush_area(virt, flush_flags);
3752 else
3754 pl2e = l3e_to_l2e(ol3e);
3755 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
3757 ol2e = pl2e[i];
3758 if ( !(l2e_get_flags(ol2e) & _PAGE_PRESENT) )
3759 continue;
3760 if ( l2e_get_flags(ol2e) & _PAGE_PSE )
3762 if ( l2e_get_flags(ol2e) & _PAGE_GLOBAL )
3763 flush_flags |= FLUSH_TLB_GLOBAL;
3764 if ( (lNf_to_l1f(l2e_get_flags(ol2e)) ^ flags) &
3765 PAGE_CACHE_ATTRS )
3766 flush_flags |= FLUSH_CACHE;
3768 else
3770 unsigned int j;
3772 pl1e = l2e_to_l1e(ol2e);
3773 for ( j = 0; j < L1_PAGETABLE_ENTRIES; j++ )
3775 ol1e = pl1e[j];
3776 if ( l1e_get_flags(ol1e) & _PAGE_GLOBAL )
3777 flush_flags |= FLUSH_TLB_GLOBAL;
3778 if ( (l1e_get_flags(ol1e) ^ flags) &
3779 PAGE_CACHE_ATTRS )
3780 flush_flags |= FLUSH_CACHE;
3784 flush_area(virt, flush_flags);
3785 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
3787 ol2e = pl2e[i];
3788 if ( (l2e_get_flags(ol2e) & _PAGE_PRESENT) &&
3789 !(l2e_get_flags(ol2e) & _PAGE_PSE) )
3790 free_xen_pagetable(l2e_to_l1e(ol2e));
3792 free_xen_pagetable(pl2e);
3796 virt += 1UL << L3_PAGETABLE_SHIFT;
3797 mfn += 1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT);
3798 nr_mfns -= 1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT);
3799 continue;
3802 if ( (l3e_get_flags(ol3e) & _PAGE_PRESENT) &&
3803 (l3e_get_flags(ol3e) & _PAGE_PSE) )
3805 unsigned int flush_flags =
3806 FLUSH_TLB | FLUSH_ORDER(2 * PAGETABLE_ORDER);
3808 /* Skip this PTE if there is no change. */
3809 if ( ((l3e_get_pfn(ol3e) & ~(L2_PAGETABLE_ENTRIES *
3810 L1_PAGETABLE_ENTRIES - 1)) +
3811 (l2_table_offset(virt) << PAGETABLE_ORDER) +
3812 l1_table_offset(virt) == mfn) &&
3813 ((lNf_to_l1f(l3e_get_flags(ol3e)) ^ flags) &
3814 ~(_PAGE_ACCESSED|_PAGE_DIRTY)) == 0 )
3816 /* We can skip to the end of the L3 superpage if we got a match. */
3817 i = (1 << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) -
3818 (mfn & ((1 << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1));
3819 if ( i > nr_mfns )
3820 i = nr_mfns;
3821 virt += i << PAGE_SHIFT;
3822 mfn += i;
3823 nr_mfns -= i;
3824 continue;
3827 pl2e = alloc_xen_pagetable();
3828 if ( pl2e == NULL )
3829 return -ENOMEM;
3831 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
3832 l2e_write(pl2e + i,
3833 l2e_from_pfn(l3e_get_pfn(ol3e) +
3834 (i << PAGETABLE_ORDER),
3835 l3e_get_flags(ol3e)));
3837 if ( l3e_get_flags(ol3e) & _PAGE_GLOBAL )
3838 flush_flags |= FLUSH_TLB_GLOBAL;
3840 l3e_write_atomic(pl3e, l3e_from_pfn(virt_to_mfn(pl2e),
3841 __PAGE_HYPERVISOR));
3842 flush_area(virt, flush_flags);
3844 #endif
3846 pl2e = virt_to_xen_l2e(virt);
3848 if ( ((((virt>>PAGE_SHIFT) | mfn) & ((1<<PAGETABLE_ORDER)-1)) == 0) &&
3849 (nr_mfns >= (1<<PAGETABLE_ORDER)) &&
3850 !(flags & (_PAGE_PAT|MAP_SMALL_PAGES)) )
3852 /* Super-page mapping. */
3853 ol2e = *pl2e;
3854 l2e_write_atomic(pl2e, l2e_from_pfn(mfn, l1f_to_lNf(flags)));
3856 if ( (l2e_get_flags(ol2e) & _PAGE_PRESENT) )
3858 unsigned int flush_flags =
3859 FLUSH_TLB | FLUSH_ORDER(PAGETABLE_ORDER);
3861 if ( l2e_get_flags(ol2e) & _PAGE_PSE )
3863 if ( l2e_get_flags(ol2e) & _PAGE_GLOBAL )
3864 flush_flags |= FLUSH_TLB_GLOBAL;
3865 if ( (lNf_to_l1f(l2e_get_flags(ol2e)) ^ flags) &
3866 PAGE_CACHE_ATTRS )
3867 flush_flags |= FLUSH_CACHE;
3868 flush_area(virt, flush_flags);
3870 else
3872 pl1e = l2e_to_l1e(ol2e);
3873 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
3875 if ( l1e_get_flags(pl1e[i]) & _PAGE_GLOBAL )
3876 flush_flags |= FLUSH_TLB_GLOBAL;
3877 if ( (l1e_get_flags(pl1e[i]) ^ flags) &
3878 PAGE_CACHE_ATTRS )
3879 flush_flags |= FLUSH_CACHE;
3881 flush_area(virt, flush_flags);
3882 free_xen_pagetable(pl1e);
3886 virt += 1UL << L2_PAGETABLE_SHIFT;
3887 mfn += 1UL << PAGETABLE_ORDER;
3888 nr_mfns -= 1UL << PAGETABLE_ORDER;
3890 else
3892 /* Normal page mapping. */
3893 if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) )
3895 pl1e = alloc_xen_pagetable();
3896 if ( pl1e == NULL )
3897 return -ENOMEM;
3898 clear_page(pl1e);
3899 l2e_write(pl2e, l2e_from_pfn(virt_to_mfn(pl1e),
3900 __PAGE_HYPERVISOR));
3902 else if ( l2e_get_flags(*pl2e) & _PAGE_PSE )
3904 unsigned int flush_flags =
3905 FLUSH_TLB | FLUSH_ORDER(PAGETABLE_ORDER);
3907 /* Skip this PTE if there is no change. */
3908 if ( (((l2e_get_pfn(*pl2e) & ~(L1_PAGETABLE_ENTRIES - 1)) +
3909 l1_table_offset(virt)) == mfn) &&
3910 (((lNf_to_l1f(l2e_get_flags(*pl2e)) ^ flags) &
3911 ~(_PAGE_ACCESSED|_PAGE_DIRTY)) == 0) )
3913 /* We can skip to the end of the L2 superpage if we got a match. */
3914 i = (1 << (L2_PAGETABLE_SHIFT - PAGE_SHIFT)) -
3915 (mfn & ((1 << (L2_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1));
3916 if ( i > nr_mfns )
3917 i = nr_mfns;
3918 virt += i << L1_PAGETABLE_SHIFT;
3919 mfn += i;
3920 nr_mfns -= i;
3921 goto check_l3;
3924 pl1e = alloc_xen_pagetable();
3925 if ( pl1e == NULL )
3926 return -ENOMEM;
3928 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
3929 l1e_write(&pl1e[i],
3930 l1e_from_pfn(l2e_get_pfn(*pl2e) + i,
3931 lNf_to_l1f(l2e_get_flags(*pl2e))));
3933 if ( l2e_get_flags(*pl2e) & _PAGE_GLOBAL )
3934 flush_flags |= FLUSH_TLB_GLOBAL;
3936 l2e_write_atomic(pl2e, l2e_from_pfn(virt_to_mfn(pl1e),
3937 __PAGE_HYPERVISOR));
3938 flush_area(virt, flush_flags);
3941 pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(virt);
3942 ol1e = *pl1e;
3943 l1e_write_atomic(pl1e, l1e_from_pfn(mfn, flags));
3944 if ( (l1e_get_flags(ol1e) & _PAGE_PRESENT) )
3946 unsigned int flush_flags = FLUSH_TLB | FLUSH_ORDER(0);
3947 if ( l1e_get_flags(ol1e) & _PAGE_GLOBAL )
3948 flush_flags |= FLUSH_TLB_GLOBAL;
3949 if ( (l1e_get_flags(ol1e) ^ flags) & PAGE_CACHE_ATTRS )
3950 flush_flags |= FLUSH_CACHE;
3951 flush_area(virt, flush_flags);
3954 virt += 1UL << L1_PAGETABLE_SHIFT;
3955 mfn += 1UL;
3956 nr_mfns -= 1UL;
3958 if ( (flags == PAGE_HYPERVISOR) &&
3959 ((nr_mfns == 0) ||
3960 ((((virt >> PAGE_SHIFT) | mfn) &
3961 ((1 << PAGETABLE_ORDER) - 1)) == 0)) )
3963 unsigned long base_mfn;
3964 pl1e = l2e_to_l1e(*pl2e);
3965 base_mfn = l1e_get_pfn(*pl1e) & ~(L1_PAGETABLE_ENTRIES - 1);
3966 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++, pl1e++ )
3967 if ( (l1e_get_pfn(*pl1e) != (base_mfn + i)) ||
3968 (l1e_get_flags(*pl1e) != flags) )
3969 break;
3970 if ( i == L1_PAGETABLE_ENTRIES )
3972 ol2e = *pl2e;
3973 l2e_write_atomic(pl2e, l2e_from_pfn(base_mfn,
3974 l1f_to_lNf(flags)));
3975 flush_area(virt - PAGE_SIZE,
3976 FLUSH_TLB_GLOBAL |
3977 FLUSH_ORDER(PAGETABLE_ORDER));
3978 free_xen_pagetable(l2e_to_l1e(ol2e));
3983 check_l3: ;
3984 #ifdef __x86_64__
3985 if ( cpu_has_page1gb &&
3986 (flags == PAGE_HYPERVISOR) &&
3987 ((nr_mfns == 0) ||
3988 !(((virt >> PAGE_SHIFT) | mfn) &
3989 ((1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1))) )
3991 unsigned long base_mfn;
3993 ol3e = *pl3e;
3994 pl2e = l3e_to_l2e(ol3e);
3995 base_mfn = l2e_get_pfn(*pl2e) & ~(L2_PAGETABLE_ENTRIES *
3996 L1_PAGETABLE_ENTRIES - 1);
3997 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++, pl2e++ )
3998 if ( (l2e_get_pfn(*pl2e) !=
3999 (base_mfn + (i << PAGETABLE_ORDER))) ||
4000 (l2e_get_flags(*pl2e) != l1f_to_lNf(flags)) )
4001 break;
4002 if ( i == L2_PAGETABLE_ENTRIES )
4004 l3e_write_atomic(pl3e, l3e_from_pfn(base_mfn,
4005 l1f_to_lNf(flags)));
4006 flush_area(virt - PAGE_SIZE,
4007 FLUSH_TLB_GLOBAL |
4008 FLUSH_ORDER(2*PAGETABLE_ORDER));
4009 free_xen_pagetable(l3e_to_l2e(ol3e));
4012 #endif
4015 return 0;
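map_pages_to_xen() only emits a superpage when both the virtual page number and the target mfn are aligned to the superpage boundary, enough frames remain, and no per-page attributes (_PAGE_PAT, MAP_SMALL_PAGES) force 4kB mappings. The alignment part of that test, restated on its own for a 2MB (order-9) superpage:

#include <stdbool.h>
#include <stdio.h>

#define SUPERPAGE_ORDER 9   /* 2 MB superpages built from 4 kB pages */

/*
 * A large mapping is only possible when the virtual page number and the
 * machine frame number are both aligned to the superpage size and at
 * least a whole superpage worth of frames remains.
 */
static bool can_use_superpage(unsigned long vpn, unsigned long mfn,
                              unsigned long nr_mfns)
{
    unsigned long mask = (1UL << SUPERPAGE_ORDER) - 1;

    return !((vpn | mfn) & mask) && (nr_mfns >= (1UL << SUPERPAGE_ORDER));
}

int main(void)
{
    printf("%d\n", can_use_superpage(0x200, 0x400, 512));  /* 1: aligned, enough */
    printf("%d\n", can_use_superpage(0x201, 0x400, 512));  /* 0: vpn misaligned  */
    printf("%d\n", can_use_superpage(0x200, 0x400, 100));  /* 0: too few frames  */
    return 0;
}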
4018 void destroy_xen_mappings(unsigned long s, unsigned long e)
4020 l2_pgentry_t *pl2e;
4021 l1_pgentry_t *pl1e;
4022 unsigned int i;
4023 unsigned long v = s;
4025 ASSERT((s & ~PAGE_MASK) == 0);
4026 ASSERT((e & ~PAGE_MASK) == 0);
4028 while ( v < e )
4030 #ifdef __x86_64__
4031 l3_pgentry_t *pl3e = virt_to_xen_l3e(v);
4033 if ( !(l3e_get_flags(*pl3e) & _PAGE_PRESENT) )
4035 v += 1UL << L3_PAGETABLE_SHIFT;
4036 v &= ~((1UL << L3_PAGETABLE_SHIFT) - 1);
4037 continue;
4040 if ( l3e_get_flags(*pl3e) & _PAGE_PSE )
4042 if ( l2_table_offset(v) == 0 &&
4043 l1_table_offset(v) == 0 &&
4044 ((e - v) >= (1UL << L3_PAGETABLE_SHIFT)) )
4046 /* PAGE1GB: whole superpage is destroyed. */
4047 l3e_write_atomic(pl3e, l3e_empty());
4048 v += 1UL << L3_PAGETABLE_SHIFT;
4049 continue;
4052 /* PAGE1GB: shatter the superpage and fall through. */
4053 pl2e = alloc_xen_pagetable();
4054 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
4055 l2e_write(pl2e + i,
4056 l2e_from_pfn(l3e_get_pfn(*pl3e) +
4057 (i << PAGETABLE_ORDER),
4058 l3e_get_flags(*pl3e)));
4059 l3e_write_atomic(pl3e, l3e_from_pfn(virt_to_mfn(pl2e),
4060 __PAGE_HYPERVISOR));
4062 #endif
4064 pl2e = virt_to_xen_l2e(v);
4066 if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) )
4068 v += 1UL << L2_PAGETABLE_SHIFT;
4069 v &= ~((1UL << L2_PAGETABLE_SHIFT) - 1);
4070 continue;
4073 if ( l2e_get_flags(*pl2e) & _PAGE_PSE )
4075 if ( (l1_table_offset(v) == 0) &&
4076 ((e-v) >= (1UL << L2_PAGETABLE_SHIFT)) )
4078 /* PSE: whole superpage is destroyed. */
4079 l2e_write_atomic(pl2e, l2e_empty());
4080 v += 1UL << L2_PAGETABLE_SHIFT;
4082 else
4084 /* PSE: shatter the superpage and try again. */
4085 pl1e = alloc_xen_pagetable();
4086 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
4087 l1e_write(&pl1e[i],
4088 l1e_from_pfn(l2e_get_pfn(*pl2e) + i,
4089 l2e_get_flags(*pl2e) & ~_PAGE_PSE));
4090 l2e_write_atomic(pl2e, l2e_from_pfn(virt_to_mfn(pl1e),
4091 __PAGE_HYPERVISOR));
4094 else
4096 /* Ordinary 4kB mapping. */
4097 pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(v);
4098 l1e_write_atomic(pl1e, l1e_empty());
4099 v += PAGE_SIZE;
4101 /* If we are done with the L2E, check if it is now empty. */
4102 if ( (v != e) && (l1_table_offset(v) != 0) )
4103 continue;
4104 pl1e = l2e_to_l1e(*pl2e);
4105 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
4106 if ( l1e_get_intpte(pl1e[i]) != 0 )
4107 break;
4108 if ( i == L1_PAGETABLE_ENTRIES )
4110 /* Empty: zap the L2E and free the L1 page. */
4111 l2e_write_atomic(pl2e, l2e_empty());
4112 flush_area(NULL, FLUSH_TLB_GLOBAL); /* flush before free */
4113 free_xen_pagetable(pl1e);
4117 #ifdef __x86_64__
4118 /* If we are done with the L3E, check if it is now empty. */
4119 if ( (v != e) && (l2_table_offset(v) + l1_table_offset(v) != 0) )
4120 continue;
4121 pl2e = l3e_to_l2e(*pl3e);
4122 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
4123 if ( l2e_get_intpte(pl2e[i]) != 0 )
4124 break;
4125 if ( i == L2_PAGETABLE_ENTRIES )
4127 /* Empty: zap the L3E and free the L2 page. */
4128 l3e_write_atomic(pl3e, l3e_empty());
4129 flush_area(NULL, FLUSH_TLB_GLOBAL); /* flush before free */
4130 free_xen_pagetable(pl2e);
4132 #endif
4135 flush_area(NULL, FLUSH_TLB_GLOBAL);
4138 void __set_fixmap(
4139 enum fixed_addresses idx, unsigned long mfn, unsigned long flags)
4141 BUG_ON(idx >= __end_of_fixed_addresses);
4142 map_pages_to_xen(fix_to_virt(idx), mfn, 1, flags);
4145 #ifdef MEMORY_GUARD
4147 void memguard_init(void)
4149 unsigned long start = max_t(unsigned long, xen_phys_start, 1UL << 20);
4150 map_pages_to_xen(
4151 (unsigned long)__va(start),
4152 start >> PAGE_SHIFT,
4153 (xenheap_phys_end - start) >> PAGE_SHIFT,
4154 __PAGE_HYPERVISOR|MAP_SMALL_PAGES);
4155 #ifdef __x86_64__
4156 BUG_ON(start != xen_phys_start);
4157 map_pages_to_xen(
4158 XEN_VIRT_START,
4159 start >> PAGE_SHIFT,
4160 (__pa(&_end) + PAGE_SIZE - 1 - start) >> PAGE_SHIFT,
4161 __PAGE_HYPERVISOR|MAP_SMALL_PAGES);
4162 #endif
4165 static void __memguard_change_range(void *p, unsigned long l, int guard)
4167 unsigned long _p = (unsigned long)p;
4168 unsigned long _l = (unsigned long)l;
4169 unsigned int flags = __PAGE_HYPERVISOR | MAP_SMALL_PAGES;
4171 /* Ensure we are dealing with a page-aligned whole number of pages. */
4172 ASSERT((_p&~PAGE_MASK) == 0);
4173 ASSERT((_l&~PAGE_MASK) == 0);
4175 if ( guard )
4176 flags &= ~_PAGE_PRESENT;
4178 map_pages_to_xen(
4179 _p, virt_to_maddr(p) >> PAGE_SHIFT, _l >> PAGE_SHIFT, flags);
4182 void memguard_guard_range(void *p, unsigned long l)
4184 __memguard_change_range(p, l, 1);
4187 void memguard_unguard_range(void *p, unsigned long l)
4189 __memguard_change_range(p, l, 0);
4192 #endif
4194 void memguard_guard_stack(void *p)
4196 BUILD_BUG_ON((PRIMARY_STACK_SIZE + PAGE_SIZE) > STACK_SIZE);
4197 p = (void *)((unsigned long)p + STACK_SIZE -
4198 PRIMARY_STACK_SIZE - PAGE_SIZE);
4199 memguard_guard_range(p, PAGE_SIZE);
4202 /*
4203 * Local variables:
4204 * mode: C
4205 * c-set-style: "BSD"
4206 * c-basic-offset: 4
4207 * tab-width: 4
4208 * indent-tabs-mode: nil
4209 * End:
4210 */