ia64/xen-unstable: xen/arch/x86/mm.c @ changeset 16694:15cfd1f8fa38

x86: Fix a comment in get_page_type().
Signed-off-by: Keir Fraser <keir.fraser@citrix.com>

author   Keir Fraser <keir.fraser@citrix.com>
date     Tue Jan 08 16:45:08 2008 +0000 (2008-01-08)
parents  8d5517355aa8
children fba4e7357744
1 /******************************************************************************
2 * arch/x86/mm.c
3 *
4 * Copyright (c) 2002-2005 K A Fraser
5 * Copyright (c) 2004 Christian Limpach
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
22 /*
23 * A description of the x86 page table API:
24 *
25 * Domains trap to do_mmu_update with a list of update requests.
26 * This is a list of (ptr, val) pairs, where the requested operation
27 * is *ptr = val.
28 *
29 * Reference counting of pages:
30 * ----------------------------
31 * Each page has two refcounts: tot_count and type_count.
32 *
33 * TOT_COUNT is the obvious reference count. It counts all uses of a
34 * physical page frame by a domain, including uses as a page directory,
35 * a page table, or simple mappings via a PTE. This count prevents a
36 * domain from releasing a frame back to the free pool when it still holds
37 * a reference to it.
38 *
39 * TYPE_COUNT is more subtle. A frame can be put to one of three
40 * mutually-exclusive uses: it might be used as a page directory, or a
41 * page table, or it may be mapped writable by the domain [of course, a
42 * frame may be used in none of these three ways!].
43 * So, type_count is a count of the number of times a frame is being
44 * referred to in its current incarnation. Therefore, a page can only
45 * change its type when its type count is zero.
46 *
47 * Pinning the page type:
48 * ----------------------
49 * The type of a page can be pinned/unpinned with the commands
50 * MMUEXT_[UN]PIN_L?_TABLE. Each page can be pinned exactly once (that is,
51 * pinning is not reference counted, so it can't be nested).
52 * This is useful to prevent a page's type count falling to zero, at which
53 * point safety checks would need to be carried out next time the count
54 * is increased again.
55 *
56 * A further note on writable page mappings:
57 * -----------------------------------------
58 * For simplicity, the count of writable mappings for a page may not
59 * correspond to reality. The 'writable count' is incremented for every
60 * PTE which maps the page with the _PAGE_RW flag set. However, for
61 * write access to be possible the page directory entry must also have
62 * its _PAGE_RW bit set. We do not check this as it complicates the
63 * reference counting considerably [consider the case of multiple
64 * directory entries referencing a single page table, some with the RW
65 * bit set, others not -- it starts getting a bit messy].
66 * In normal use, this simplification shouldn't be a problem.
67 * However, the logic can be added if required.
68 *
69 * One more note on read-only page mappings:
70 * -----------------------------------------
71 * We want domains to be able to map pages for read-only access. The
72 * main reason is that page tables and directories should be readable
73 * by a domain, but it would not be safe for them to be writable.
74 * However, domains have free access to rings 1 & 2 of the Intel
75 * privilege model. In terms of page protection, these are considered
76 * to be part of 'supervisor mode'. The WP bit in CR0 controls whether
77 * read-only restrictions are respected in supervisor mode -- if the
78 * bit is clear then any mapped page is writable.
79 *
80 * We get round this by always setting the WP bit and disallowing
81 * updates to it. This is very unlikely to cause a problem for guest
82 * OS's, which will generally use the WP bit to simplify copy-on-write
83 * implementation (in that case, OS wants a fault when it writes to
84 * an application-supplied buffer).
85 */
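/*
 * Illustrative sketch, not part of this file: how a PV guest kernel might
 * batch the (ptr, val) update requests described above into a single
 * do_mmu_update hypercall. 'HYPERVISOR_mmu_update' is the guest's own
 * hypercall wrapper, and 'pte_ma'/'new_val' are assumed to be machine
 * addresses of PTEs and their new contents.
 */
#if 0 /* guest-side example only */
static int guest_batch_pte_updates(uint64_t pte_ma[], uint64_t new_val[],
                                   unsigned int nr)
{
    struct mmu_update req[16];
    unsigned int i, done = 0;

    if ( nr > ARRAY_SIZE(req) )
        return -EINVAL;

    for ( i = 0; i < nr; i++ )
    {
        /* Low bits of ptr select the update type; 0 is a normal PT write. */
        req[i].ptr = pte_ma[i] | MMU_NORMAL_PT_UPDATE;
        req[i].val = new_val[i];
    }

    /* Xen validates each request against the rules described above. */
    return HYPERVISOR_mmu_update(req, nr, &done, DOMID_SELF);
}
#endif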
87 #include <xen/config.h>
88 #include <xen/init.h>
89 #include <xen/kernel.h>
90 #include <xen/lib.h>
91 #include <xen/mm.h>
92 #include <xen/domain.h>
93 #include <xen/sched.h>
94 #include <xen/errno.h>
95 #include <xen/perfc.h>
96 #include <xen/irq.h>
97 #include <xen/softirq.h>
98 #include <xen/domain_page.h>
99 #include <xen/event.h>
100 #include <xen/iocap.h>
101 #include <xen/guest_access.h>
102 #include <asm/paging.h>
103 #include <asm/shadow.h>
104 #include <asm/page.h>
105 #include <asm/flushtlb.h>
106 #include <asm/io.h>
107 #include <asm/ldt.h>
108 #include <asm/x86_emulate.h>
109 #include <asm/e820.h>
110 #include <asm/hypercall.h>
111 #include <asm/shared.h>
112 #include <public/memory.h>
113 #include <xsm/xsm.h>
114 #include <xen/trace.h>
116 #define MEM_LOG(_f, _a...) gdprintk(XENLOG_WARNING , _f "\n" , ## _a)
118 /*
119 * PTE updates can be done with ordinary writes except:
120 * 1. Debug builds get extra checking by using CMPXCHG[8B].
121 * 2. PAE builds perform an atomic 8-byte store with CMPXCHG8B.
122 */
123 #if !defined(NDEBUG) || defined(CONFIG_X86_PAE)
124 #define PTE_UPDATE_WITH_CMPXCHG
125 #endif
127 /* Used to defer flushing of memory structures. */
128 struct percpu_mm_info {
129 #define DOP_FLUSH_TLB (1<<0) /* Flush the local TLB. */
130 #define DOP_FLUSH_ALL_TLBS (1<<1) /* Flush TLBs of all VCPUs of current dom. */
131 #define DOP_RELOAD_LDT (1<<2) /* Reload the LDT shadow mapping. */
132 unsigned int deferred_ops;
133 /* If non-NULL, specifies a foreign subject domain for some operations. */
134 struct domain *foreign;
135 };
136 static DEFINE_PER_CPU(struct percpu_mm_info, percpu_mm_info);
138 /*
139 * Returns the current foreign domain; defaults to the currently-executing
140 * domain if a foreign override hasn't been specified.
141 */
142 #define FOREIGNDOM (this_cpu(percpu_mm_info).foreign ?: current->domain)
144 /* Private domain structs for DOMID_XEN and DOMID_IO. */
145 static struct domain *dom_xen, *dom_io;
147 /* Frame table and its size in pages. */
148 struct page_info *frame_table;
149 unsigned long max_page;
150 unsigned long total_pages;
152 #define PAGE_CACHE_ATTRS (_PAGE_PAT|_PAGE_PCD|_PAGE_PWT)
154 #define l1_disallow_mask(d) \
155 ((d != dom_io) && \
156 (rangeset_is_empty((d)->iomem_caps) && \
157 rangeset_is_empty((d)->arch.ioport_caps)) ? \
158 L1_DISALLOW_MASK : (L1_DISALLOW_MASK & ~PAGE_CACHE_ATTRS))
160 #ifdef CONFIG_COMPAT
161 l2_pgentry_t *compat_idle_pg_table_l2 = NULL;
162 #define l3_disallow_mask(d) (!is_pv_32on64_domain(d) ? \
163 L3_DISALLOW_MASK : \
164 COMPAT_L3_DISALLOW_MASK)
165 #else
166 #define l3_disallow_mask(d) L3_DISALLOW_MASK
167 #endif
169 static void queue_deferred_ops(struct domain *d, unsigned int ops)
170 {
171 ASSERT(d == current->domain);
172 this_cpu(percpu_mm_info).deferred_ops |= ops;
173 }
175 void __init init_frametable(void)
176 {
177 unsigned long nr_pages, page_step, i, mfn;
179 frame_table = (struct page_info *)FRAMETABLE_VIRT_START;
181 nr_pages = PFN_UP(max_page * sizeof(*frame_table));
182 page_step = (1 << L2_PAGETABLE_SHIFT) >> PAGE_SHIFT;
184 for ( i = 0; i < nr_pages; i += page_step )
185 {
186 mfn = alloc_boot_pages(min(nr_pages - i, page_step), page_step);
187 if ( mfn == 0 )
188 panic("Not enough memory for frame table\n");
189 map_pages_to_xen(
190 FRAMETABLE_VIRT_START + (i << PAGE_SHIFT),
191 mfn, page_step, PAGE_HYPERVISOR);
192 }
194 memset(frame_table, 0, nr_pages << PAGE_SHIFT);
195 }
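/*
 * Worked example (illustrative assumptions, not from this file): with 4GB of
 * RAM, max_page == 0x100000; if sizeof(struct page_info) were 32 bytes, the
 * loop above would map PFN_UP(0x100000 * 32) == 0x2000 frame-table pages
 * (32MB of virtual space) in 2MB-aligned chunks of 'page_step' pages each.
 */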
197 void __init arch_init_memory(void)
198 {
199 extern void subarch_init_memory(void);
201 unsigned long i, pfn, rstart_pfn, rend_pfn;
203 /*
204 * Initialise our DOMID_XEN domain.
205 * Any Xen-heap pages that we will allow to be mapped will have
206 * their domain field set to dom_xen.
207 */
208 dom_xen = alloc_domain(DOMID_XEN);
209 BUG_ON(dom_xen == NULL);
211 /*
212 * Initialise our DOMID_IO domain.
213 * This domain owns I/O pages that are within the range of the page_info
214 * array. Mappings occur at the privilege level of the caller.
215 */
216 dom_io = alloc_domain(DOMID_IO);
217 BUG_ON(dom_io == NULL);
219 /* First 1MB of RAM is historically marked as I/O. */
220 for ( i = 0; i < 0x100; i++ )
221 share_xen_page_with_guest(mfn_to_page(i), dom_io, XENSHARE_writable);
223 /* Any areas not specified as RAM by the e820 map are considered I/O. */
224 for ( i = 0, pfn = 0; pfn < max_page; i++ )
225 {
226 while ( (i < e820.nr_map) &&
227 (e820.map[i].type != E820_RAM) &&
228 (e820.map[i].type != E820_UNUSABLE) )
229 i++;
231 if ( i >= e820.nr_map )
232 {
233 /* No more RAM regions: mark as I/O right to end of memory map. */
234 rstart_pfn = rend_pfn = max_page;
235 }
236 else
237 {
238 /* Mark as I/O just up as far as next RAM region. */
239 rstart_pfn = min_t(unsigned long, max_page,
240 PFN_UP(e820.map[i].addr));
241 rend_pfn = max_t(unsigned long, rstart_pfn,
242 PFN_DOWN(e820.map[i].addr + e820.map[i].size));
243 }
245 /* Mark as I/O up to next RAM region. */
246 for ( ; pfn < rstart_pfn; pfn++ )
247 {
248 BUG_ON(!mfn_valid(pfn));
249 share_xen_page_with_guest(
250 mfn_to_page(pfn), dom_io, XENSHARE_writable);
251 }
253 /* Skip the RAM region. */
254 pfn = rend_pfn;
255 }
257 subarch_init_memory();
258 }
260 int memory_is_conventional_ram(paddr_t p)
261 {
262 int i;
264 for ( i = 0; i < e820.nr_map; i++ )
265 {
266 if ( (e820.map[i].type == E820_RAM) &&
267 (e820.map[i].addr <= p) &&
268 ((e820.map[i].addr + e820.map[i].size) > p) )
269 return 1;
270 }
272 return 0;
273 }
275 unsigned long domain_get_maximum_gpfn(struct domain *d)
276 {
277 if ( is_hvm_domain(d) )
278 return d->arch.p2m.max_mapped_pfn;
279 /* NB. PV guests specify nr_pfns rather than max_pfn so we adjust here. */
280 return arch_get_max_pfn(d) - 1;
281 }
283 void share_xen_page_with_guest(
284 struct page_info *page, struct domain *d, int readonly)
285 {
286 if ( page_get_owner(page) == d )
287 return;
289 set_gpfn_from_mfn(page_to_mfn(page), INVALID_M2P_ENTRY);
291 spin_lock(&d->page_alloc_lock);
293 /* The incremented type count pins as writable or read-only. */
294 page->u.inuse.type_info = (readonly ? PGT_none : PGT_writable_page);
295 page->u.inuse.type_info |= PGT_validated | 1;
297 page_set_owner(page, d);
298 wmb(); /* install valid domain ptr before updating refcnt. */
299 ASSERT(page->count_info == 0);
301 /* Only add to the allocation list if the domain isn't dying. */
302 if ( !d->is_dying )
303 {
304 page->count_info |= PGC_allocated | 1;
305 if ( unlikely(d->xenheap_pages++ == 0) )
306 get_knownalive_domain(d);
307 list_add_tail(&page->list, &d->xenpage_list);
308 }
310 spin_unlock(&d->page_alloc_lock);
311 }
313 void share_xen_page_with_privileged_guests(
314 struct page_info *page, int readonly)
315 {
316 share_xen_page_with_guest(page, dom_xen, readonly);
317 }
319 #if defined(CONFIG_X86_PAE)
321 #ifdef NDEBUG
322 /* Only PDPTs above the 4GB boundary need to be shadowed in low memory. */
323 #define l3tab_needs_shadow(mfn) ((mfn) >= 0x100000)
324 #else
325 /*
326 * In debug builds we shadow a selection of <4GB PDPTs to exercise code paths.
327 * We cannot safely shadow the idle page table, nor shadow (v1) page tables
328 * (detected by lack of an owning domain). As required for correctness, we
329 * always shadow PDPTs above 4GB.
330 */
331 #define l3tab_needs_shadow(mfn) \
332 (((((mfn) << PAGE_SHIFT) != __pa(idle_pg_table)) && \
333 (page_get_owner(mfn_to_page(mfn)) != NULL) && \
334 ((mfn) & 1)) || /* odd MFNs are shadowed */ \
335 ((mfn) >= 0x100000))
336 #endif
338 static l1_pgentry_t *fix_pae_highmem_pl1e;
340 /* Cache the address of PAE high-memory fixmap page tables. */
341 static int __init cache_pae_fixmap_address(void)
342 {
343 unsigned long fixmap_base = fix_to_virt(FIX_PAE_HIGHMEM_0);
344 l2_pgentry_t *pl2e = virt_to_xen_l2e(fixmap_base);
345 fix_pae_highmem_pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(fixmap_base);
346 return 0;
347 }
348 __initcall(cache_pae_fixmap_address);
350 static DEFINE_PER_CPU(u32, make_cr3_timestamp);
352 void make_cr3(struct vcpu *v, unsigned long mfn)
353 /* Takes the MFN of a PAE l3 table, copies the contents to below 4GB if
354 * necessary, and sets v->arch.cr3 to the value to load in CR3. */
355 {
356 l3_pgentry_t *highmem_l3tab, *lowmem_l3tab;
357 struct pae_l3_cache *cache = &v->arch.pae_l3_cache;
358 unsigned int cpu = smp_processor_id();
360 /* Fast path: does this mfn need a shadow at all? */
361 if ( !l3tab_needs_shadow(mfn) )
362 {
363 v->arch.cr3 = mfn << PAGE_SHIFT;
364 /* Cache is no longer in use or valid */
365 cache->high_mfn = 0;
366 return;
367 }
369 /* Caching logic is not interrupt safe. */
370 ASSERT(!in_irq());
372 /* Protects against pae_flush_pgd(). */
373 spin_lock(&cache->lock);
375 cache->inuse_idx ^= 1;
376 cache->high_mfn = mfn;
378 /* Map the guest L3 table and copy to the chosen low-memory cache. */
379 l1e_write(fix_pae_highmem_pl1e-cpu, l1e_from_pfn(mfn, __PAGE_HYPERVISOR));
380 /* First check the previous high mapping can't be in the TLB.
381 * (i.e. have we loaded CR3 since we last did this?) */
382 if ( unlikely(this_cpu(make_cr3_timestamp) == this_cpu(tlbflush_time)) )
383 flush_tlb_one_local(fix_to_virt(FIX_PAE_HIGHMEM_0 + cpu));
384 highmem_l3tab = (l3_pgentry_t *)fix_to_virt(FIX_PAE_HIGHMEM_0 + cpu);
385 lowmem_l3tab = cache->table[cache->inuse_idx];
386 memcpy(lowmem_l3tab, highmem_l3tab, sizeof(cache->table[0]));
387 l1e_write(fix_pae_highmem_pl1e-cpu, l1e_empty());
388 this_cpu(make_cr3_timestamp) = this_cpu(tlbflush_time);
390 v->arch.cr3 = __pa(lowmem_l3tab);
392 spin_unlock(&cache->lock);
393 }
395 #else /* !CONFIG_X86_PAE */
397 void make_cr3(struct vcpu *v, unsigned long mfn)
398 {
399 v->arch.cr3 = mfn << PAGE_SHIFT;
400 }
402 #endif /* !CONFIG_X86_PAE */
404 void write_ptbase(struct vcpu *v)
405 {
406 write_cr3(v->arch.cr3);
407 }
409 /*
410 * Should be called after CR3 is updated.
411 *
412 * Uses values found in vcpu->arch.(guest_table and guest_table_user), and
413 * for HVM guests, arch.monitor_table and hvm's guest CR3.
414 *
415 * Update ref counts to shadow tables appropriately.
416 */
417 void update_cr3(struct vcpu *v)
418 {
419 unsigned long cr3_mfn=0;
421 if ( paging_mode_enabled(v->domain) )
422 {
423 paging_update_cr3(v);
424 return;
425 }
427 #if CONFIG_PAGING_LEVELS == 4
428 if ( !(v->arch.flags & TF_kernel_mode) )
429 cr3_mfn = pagetable_get_pfn(v->arch.guest_table_user);
430 else
431 #endif
432 cr3_mfn = pagetable_get_pfn(v->arch.guest_table);
434 make_cr3(v, cr3_mfn);
435 }
438 static void invalidate_shadow_ldt(struct vcpu *v)
439 {
440 int i;
441 unsigned long pfn;
442 struct page_info *page;
444 if ( v->arch.shadow_ldt_mapcnt == 0 )
445 return;
447 v->arch.shadow_ldt_mapcnt = 0;
449 for ( i = 16; i < 32; i++ )
450 {
451 pfn = l1e_get_pfn(v->arch.perdomain_ptes[i]);
452 if ( pfn == 0 ) continue;
453 l1e_write(&v->arch.perdomain_ptes[i], l1e_empty());
454 page = mfn_to_page(pfn);
455 ASSERT_PAGE_IS_TYPE(page, PGT_ldt_page);
456 ASSERT_PAGE_IS_DOMAIN(page, v->domain);
457 put_page_and_type(page);
458 }
460 /* Dispose of the (now possibly invalid) mappings from the TLB. */
461 if ( v == current )
462 queue_deferred_ops(v->domain, DOP_FLUSH_TLB | DOP_RELOAD_LDT);
463 else
464 flush_tlb_mask(v->domain->domain_dirty_cpumask);
465 }
468 static int alloc_segdesc_page(struct page_info *page)
469 {
470 struct desc_struct *descs;
471 int i;
473 descs = map_domain_page(page_to_mfn(page));
475 for ( i = 0; i < 512; i++ )
476 if ( unlikely(!check_descriptor(page_get_owner(page), &descs[i])) )
477 goto fail;
479 unmap_domain_page(descs);
480 return 1;
482 fail:
483 unmap_domain_page(descs);
484 return 0;
485 }
488 /* Map shadow page at offset @off. */
489 int map_ldt_shadow_page(unsigned int off)
490 {
491 struct vcpu *v = current;
492 struct domain *d = v->domain;
493 unsigned long gmfn, mfn;
494 l1_pgentry_t l1e, nl1e;
495 unsigned long gva = v->arch.guest_context.ldt_base + (off << PAGE_SHIFT);
496 int okay;
498 BUG_ON(unlikely(in_irq()));
500 guest_get_eff_kern_l1e(v, gva, &l1e);
501 if ( unlikely(!(l1e_get_flags(l1e) & _PAGE_PRESENT)) )
502 return 0;
504 gmfn = l1e_get_pfn(l1e);
505 mfn = gmfn_to_mfn(d, gmfn);
506 if ( unlikely(!mfn_valid(mfn)) )
507 return 0;
509 okay = get_page_and_type(mfn_to_page(mfn), d, PGT_ldt_page);
510 if ( unlikely(!okay) )
511 return 0;
513 nl1e = l1e_from_pfn(mfn, l1e_get_flags(l1e) | _PAGE_RW);
515 l1e_write(&v->arch.perdomain_ptes[off + 16], nl1e);
516 v->arch.shadow_ldt_mapcnt++;
518 return 1;
519 }
522 static int get_page_from_pagenr(unsigned long page_nr, struct domain *d)
523 {
524 struct page_info *page = mfn_to_page(page_nr);
526 if ( unlikely(!mfn_valid(page_nr)) || unlikely(!get_page(page, d)) )
527 {
528 MEM_LOG("Could not get page ref for pfn %lx", page_nr);
529 return 0;
530 }
532 return 1;
533 }
536 static int get_page_and_type_from_pagenr(unsigned long page_nr,
537 unsigned long type,
538 struct domain *d)
539 {
540 struct page_info *page = mfn_to_page(page_nr);
542 if ( unlikely(!get_page_from_pagenr(page_nr, d)) )
543 return 0;
545 if ( unlikely(!get_page_type(page, type)) )
546 {
547 put_page(page);
548 return 0;
549 }
551 return 1;
552 }
554 /*
555 * We allow root tables to map each other (a.k.a. linear page tables). It
556 * needs some special care with reference counts and access permissions:
557 * 1. The mapping entry must be read-only, or the guest may get write access
558 * to its own PTEs.
559 * 2. We must only bump the reference counts for an *already validated*
560 * L2 table, or we can end up in a deadlock in get_page_type() by waiting
561 * on a validation that is required to complete that validation.
562 * 3. We only need to increment the reference counts for the mapped page
563 * frame if it is mapped by a different root table. This is sufficient and
564 * also necessary to allow validation of a root table mapping itself.
565 */
566 #define define_get_linear_pagetable(level) \
567 static int \
568 get_##level##_linear_pagetable( \
569 level##_pgentry_t pde, unsigned long pde_pfn, struct domain *d) \
570 { \
571 unsigned long x, y; \
572 struct page_info *page; \
573 unsigned long pfn; \
574 \
575 if ( (level##e_get_flags(pde) & _PAGE_RW) ) \
576 { \
577 MEM_LOG("Attempt to create linear p.t. with write perms"); \
578 return 0; \
579 } \
580 \
581 if ( (pfn = level##e_get_pfn(pde)) != pde_pfn ) \
582 { \
583 /* Make sure the mapped frame belongs to the correct domain. */ \
584 if ( unlikely(!get_page_from_pagenr(pfn, d)) ) \
585 return 0; \
586 \
587 /* \
588 * Ensure that the mapped frame is an already-validated page table. \
589 * If so, atomically increment the count (checking for overflow). \
590 */ \
591 page = mfn_to_page(pfn); \
592 y = page->u.inuse.type_info; \
593 do { \
594 x = y; \
595 if ( unlikely((x & PGT_count_mask) == PGT_count_mask) || \
596 unlikely((x & (PGT_type_mask|PGT_validated)) != \
597 (PGT_##level##_page_table|PGT_validated)) ) \
598 { \
599 put_page(page); \
600 return 0; \
601 } \
602 } \
603 while ( (y = cmpxchg(&page->u.inuse.type_info, x, x + 1)) != x ); \
604 } \
605 \
606 return 1; \
607 }
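/*
 * Illustrative sketch, not part of this file: a PV guest creating a linear
 * ("self-referencing") mapping of its own top-level page table, which Xen
 * validates via get_<level>_linear_pagetable() above. Per rule 1, the entry
 * must not be writable. 'root_mfn', 'slot' and the hypercall wrapper are the
 * guest's own and are assumed here.
 */
#if 0 /* guest-side example only */
static int guest_map_linear_pagetable(unsigned long root_mfn, unsigned int slot)
{
    struct mmu_update req;
    unsigned int done = 0;

    /* Machine address of the chosen slot within the root table itself. */
    req.ptr = ((uint64_t)root_mfn << PAGE_SHIFT) + slot * sizeof(intpte_t);
    /* Point the slot back at the root table: present, but read-only. */
    req.val = ((uint64_t)root_mfn << PAGE_SHIFT) | _PAGE_PRESENT;

    return HYPERVISOR_mmu_update(&req, 1, &done, DOMID_SELF);
}
#endif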
610 int is_iomem_page(unsigned long mfn)
611 {
612 return (!mfn_valid(mfn) || (page_get_owner(mfn_to_page(mfn)) == dom_io));
613 }
616 int
617 get_page_from_l1e(
618 l1_pgentry_t l1e, struct domain *d)
619 {
620 unsigned long mfn = l1e_get_pfn(l1e);
621 struct page_info *page = mfn_to_page(mfn);
622 uint32_t l1f = l1e_get_flags(l1e);
623 struct vcpu *curr = current;
624 int okay;
626 if ( !(l1f & _PAGE_PRESENT) )
627 return 1;
629 if ( unlikely(l1f & l1_disallow_mask(d)) )
630 {
631 MEM_LOG("Bad L1 flags %x", l1f & l1_disallow_mask(d));
632 return 0;
633 }
635 if ( is_iomem_page(mfn) )
636 {
637 /* DOMID_IO reverts to caller for privilege checks. */
638 if ( d == dom_io )
639 d = curr->domain;
641 if ( !iomem_access_permitted(d, mfn, mfn) )
642 {
643 if ( mfn != (PADDR_MASK >> PAGE_SHIFT) ) /* INVALID_MFN? */
644 MEM_LOG("Non-privileged (%u) attempt to map I/O space %08lx",
645 d->domain_id, mfn);
646 return 0;
647 }
649 return 1;
650 }
652 /* Foreign mappings into guests in shadow external mode don't
653 * contribute to writeable mapping refcounts. (This allows the
654 * qemu-dm helper process in dom0 to map the domain's memory without
655 * messing up the count of "real" writable mappings.) */
656 okay = (((l1f & _PAGE_RW) &&
657 !(unlikely(paging_mode_external(d) && (d != curr->domain))))
658 ? get_page_and_type(page, d, PGT_writable_page)
659 : get_page(page, d));
660 if ( !okay )
661 {
662 MEM_LOG("Error getting mfn %lx (pfn %lx) from L1 entry %" PRIpte
663 " for dom%d",
664 mfn, get_gpfn_from_mfn(mfn),
665 l1e_get_intpte(l1e), d->domain_id);
666 }
667 else if ( pte_flags_to_cacheattr(l1f) !=
668 ((page->count_info >> PGC_cacheattr_base) & 7) )
669 {
670 uint32_t x, nx, y = page->count_info;
671 uint32_t cacheattr = pte_flags_to_cacheattr(l1f);
673 if ( is_xen_heap_page(page) )
674 {
675 if ( (l1f & _PAGE_RW) &&
676 !(unlikely(paging_mode_external(d) &&
677 (d != curr->domain))) )
678 put_page_type(page);
679 put_page(page);
680 MEM_LOG("Attempt to change cache attributes of Xen heap page");
681 return 0;
682 }
684 while ( ((y >> PGC_cacheattr_base) & 7) != cacheattr )
685 {
686 x = y;
687 nx = (x & ~PGC_cacheattr_mask) | (cacheattr << PGC_cacheattr_base);
688 y = cmpxchg(&page->count_info, x, nx);
689 }
691 #ifdef __x86_64__
692 map_pages_to_xen((unsigned long)mfn_to_virt(mfn), mfn, 1,
693 PAGE_HYPERVISOR | cacheattr_to_pte_flags(cacheattr));
694 #endif
695 }
697 return okay;
698 }
701 /* NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. */
702 define_get_linear_pagetable(l2);
703 static int
704 get_page_from_l2e(
705 l2_pgentry_t l2e, unsigned long pfn, struct domain *d)
706 {
707 int rc;
709 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
710 return 1;
712 if ( unlikely((l2e_get_flags(l2e) & L2_DISALLOW_MASK)) )
713 {
714 MEM_LOG("Bad L2 flags %x", l2e_get_flags(l2e) & L2_DISALLOW_MASK);
715 return 0;
716 }
718 rc = get_page_and_type_from_pagenr(l2e_get_pfn(l2e), PGT_l1_page_table, d);
719 if ( unlikely(!rc) )
720 rc = get_l2_linear_pagetable(l2e, pfn, d);
722 return rc;
723 }
726 #if CONFIG_PAGING_LEVELS >= 3
727 define_get_linear_pagetable(l3);
728 static int
729 get_page_from_l3e(
730 l3_pgentry_t l3e, unsigned long pfn, struct domain *d)
731 {
732 int rc;
734 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
735 return 1;
737 if ( unlikely((l3e_get_flags(l3e) & l3_disallow_mask(d))) )
738 {
739 MEM_LOG("Bad L3 flags %x", l3e_get_flags(l3e) & l3_disallow_mask(d));
740 return 0;
741 }
743 rc = get_page_and_type_from_pagenr(l3e_get_pfn(l3e), PGT_l2_page_table, d);
744 if ( unlikely(!rc) )
745 rc = get_l3_linear_pagetable(l3e, pfn, d);
747 return rc;
748 }
749 #endif /* 3 level */
751 #if CONFIG_PAGING_LEVELS >= 4
752 define_get_linear_pagetable(l4);
753 static int
754 get_page_from_l4e(
755 l4_pgentry_t l4e, unsigned long pfn, struct domain *d)
756 {
757 int rc;
759 if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
760 return 1;
762 if ( unlikely((l4e_get_flags(l4e) & L4_DISALLOW_MASK)) )
763 {
764 MEM_LOG("Bad L4 flags %x", l4e_get_flags(l4e) & L4_DISALLOW_MASK);
765 return 0;
766 }
768 rc = get_page_and_type_from_pagenr(l4e_get_pfn(l4e), PGT_l3_page_table, d);
769 if ( unlikely(!rc) )
770 rc = get_l4_linear_pagetable(l4e, pfn, d);
772 return rc;
773 }
774 #endif /* 4 level */
776 #ifdef __x86_64__
778 #ifdef USER_MAPPINGS_ARE_GLOBAL
779 #define adjust_guest_l1e(pl1e, d) \
780 do { \
781 if ( likely(l1e_get_flags((pl1e)) & _PAGE_PRESENT) && \
782 likely(!is_pv_32on64_domain(d)) ) \
783 { \
784 /* _PAGE_GUEST_KERNEL page cannot have the Global bit set. */ \
785 if ( (l1e_get_flags((pl1e)) & (_PAGE_GUEST_KERNEL|_PAGE_GLOBAL)) \
786 == (_PAGE_GUEST_KERNEL|_PAGE_GLOBAL) ) \
787 MEM_LOG("Global bit is set to kernel page %lx", \
788 l1e_get_pfn((pl1e))); \
789 if ( !(l1e_get_flags((pl1e)) & _PAGE_USER) ) \
790 l1e_add_flags((pl1e), (_PAGE_GUEST_KERNEL|_PAGE_USER)); \
791 if ( !(l1e_get_flags((pl1e)) & _PAGE_GUEST_KERNEL) ) \
792 l1e_add_flags((pl1e), (_PAGE_GLOBAL|_PAGE_USER)); \
793 } \
794 } while ( 0 )
795 #else
796 #define adjust_guest_l1e(pl1e, d) \
797 do { \
798 if ( likely(l1e_get_flags((pl1e)) & _PAGE_PRESENT) && \
799 likely(!is_pv_32on64_domain(d)) ) \
800 l1e_add_flags((pl1e), _PAGE_USER); \
801 } while ( 0 )
802 #endif
804 #define adjust_guest_l2e(pl2e, d) \
805 do { \
806 if ( likely(l2e_get_flags((pl2e)) & _PAGE_PRESENT) && \
807 likely(!is_pv_32on64_domain(d)) ) \
808 l2e_add_flags((pl2e), _PAGE_USER); \
809 } while ( 0 )
811 #define adjust_guest_l3e(pl3e, d) \
812 do { \
813 if ( likely(l3e_get_flags((pl3e)) & _PAGE_PRESENT) ) \
814 l3e_add_flags((pl3e), likely(!is_pv_32on64_domain(d)) ? \
815 _PAGE_USER : \
816 _PAGE_USER|_PAGE_RW); \
817 } while ( 0 )
819 #define adjust_guest_l4e(pl4e, d) \
820 do { \
821 if ( likely(l4e_get_flags((pl4e)) & _PAGE_PRESENT) && \
822 likely(!is_pv_32on64_domain(d)) ) \
823 l4e_add_flags((pl4e), _PAGE_USER); \
824 } while ( 0 )
826 #else /* !defined(__x86_64__) */
828 #define adjust_guest_l1e(_p, _d) ((void)(_d))
829 #define adjust_guest_l2e(_p, _d) ((void)(_d))
830 #define adjust_guest_l3e(_p, _d) ((void)(_d))
832 #endif
834 #ifdef CONFIG_COMPAT
835 #define unadjust_guest_l3e(pl3e, d) \
836 do { \
837 if ( unlikely(is_pv_32on64_domain(d)) && \
838 likely(l3e_get_flags((pl3e)) & _PAGE_PRESENT) ) \
839 l3e_remove_flags((pl3e), _PAGE_USER|_PAGE_RW|_PAGE_ACCESSED); \
840 } while ( 0 )
841 #else
842 #define unadjust_guest_l3e(_p, _d) ((void)(_d))
843 #endif
845 void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d)
846 {
847 unsigned long pfn = l1e_get_pfn(l1e);
848 struct page_info *page;
849 struct domain *e;
850 struct vcpu *v;
852 if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) || is_iomem_page(pfn) )
853 return;
855 page = mfn_to_page(pfn);
857 e = page_get_owner(page);
859 /*
860 * Check if this is a mapping that was established via a grant reference.
861 * If it was then we should not be here: we require that such mappings are
862 * explicitly destroyed via the grant-table interface.
863 *
864 * The upshot of this is that the guest can end up with active grants that
865 * it cannot destroy (because it no longer has a PTE to present to the
866 * grant-table interface). This can lead to subtle hard-to-catch bugs,
867 * hence a special grant PTE flag can be enabled to catch the bug early.
868 *
869 * (Note that the undestroyable active grants are not a security hole in
870 * Xen. All active grants can safely be cleaned up when the domain dies.)
871 */
872 if ( (l1e_get_flags(l1e) & _PAGE_GNTTAB) &&
873 !d->is_shutting_down && !d->is_dying )
874 {
875 MEM_LOG("Attempt to implicitly unmap a granted PTE %" PRIpte,
876 l1e_get_intpte(l1e));
877 domain_crash(d);
878 }
880 /* Remember we didn't take a type-count of foreign writable mappings
881 * to paging-external domains */
882 if ( (l1e_get_flags(l1e) & _PAGE_RW) &&
883 !(unlikely((e != d) && paging_mode_external(e))) )
884 {
885 put_page_and_type(page);
886 }
887 else
888 {
889 /* We expect this is rare so we blow the entire shadow LDT. */
890 if ( unlikely(((page->u.inuse.type_info & PGT_type_mask) ==
891 PGT_ldt_page)) &&
892 unlikely(((page->u.inuse.type_info & PGT_count_mask) != 0)) &&
893 (d == e) )
894 {
895 for_each_vcpu ( d, v )
896 invalidate_shadow_ldt(v);
897 }
898 put_page(page);
899 }
900 }
903 /*
904 * NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'.
905 * Note also that this automatically deals correctly with linear p.t.'s.
906 */
907 static void put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
908 {
909 if ( (l2e_get_flags(l2e) & _PAGE_PRESENT) &&
910 (l2e_get_pfn(l2e) != pfn) )
911 put_page_and_type(l2e_get_page(l2e));
912 }
915 #if CONFIG_PAGING_LEVELS >= 3
916 static void put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn)
917 {
918 if ( (l3e_get_flags(l3e) & _PAGE_PRESENT) &&
919 (l3e_get_pfn(l3e) != pfn) )
920 put_page_and_type(l3e_get_page(l3e));
921 }
922 #endif
924 #if CONFIG_PAGING_LEVELS >= 4
925 static void put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn)
926 {
927 if ( (l4e_get_flags(l4e) & _PAGE_PRESENT) &&
928 (l4e_get_pfn(l4e) != pfn) )
929 put_page_and_type(l4e_get_page(l4e));
930 }
931 #endif
933 static int alloc_l1_table(struct page_info *page)
934 {
935 struct domain *d = page_get_owner(page);
936 unsigned long pfn = page_to_mfn(page);
937 l1_pgentry_t *pl1e;
938 int i;
940 pl1e = map_domain_page(pfn);
942 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
943 {
944 if ( is_guest_l1_slot(i) &&
945 unlikely(!get_page_from_l1e(pl1e[i], d)) )
946 goto fail;
948 adjust_guest_l1e(pl1e[i], d);
949 }
951 unmap_domain_page(pl1e);
952 return 1;
954 fail:
955 MEM_LOG("Failure in alloc_l1_table: entry %d", i);
956 while ( i-- > 0 )
957 if ( is_guest_l1_slot(i) )
958 put_page_from_l1e(pl1e[i], d);
960 unmap_domain_page(pl1e);
961 return 0;
962 }
964 #if defined(CONFIG_X86_PAE) || defined(CONFIG_COMPAT)
965 static int create_pae_xen_mappings(struct domain *d, l3_pgentry_t *pl3e)
966 {
967 struct page_info *page;
968 l2_pgentry_t *pl2e;
969 l3_pgentry_t l3e3;
970 #ifndef CONFIG_COMPAT
971 l2_pgentry_t l2e;
972 int i;
973 #endif
975 if ( !is_pv_32bit_domain(d) )
976 return 1;
978 pl3e = (l3_pgentry_t *)((unsigned long)pl3e & PAGE_MASK);
980 /* 3rd L3 slot contains L2 with Xen-private mappings. It *must* exist. */
981 l3e3 = pl3e[3];
982 if ( !(l3e_get_flags(l3e3) & _PAGE_PRESENT) )
983 {
984 MEM_LOG("PAE L3 3rd slot is empty");
985 return 0;
986 }
988 /*
989 * The Xen-private mappings include linear mappings. The L2 thus cannot
990 * be shared by multiple L3 tables. The test here is adequate because:
991 * 1. Cannot appear in slots != 3 because get_page_type() checks the
992 * PGT_pae_xen_l2 flag, which is asserted iff the L2 appears in slot 3
993 * 2. Cannot appear in another page table's L3:
994 * a. alloc_l3_table() calls this function and this check will fail
995 * b. mod_l3_entry() disallows updates to slot 3 in an existing table
996 */
997 page = l3e_get_page(l3e3);
998 BUG_ON(page->u.inuse.type_info & PGT_pinned);
999 BUG_ON((page->u.inuse.type_info & PGT_count_mask) == 0);
1000 BUG_ON(!(page->u.inuse.type_info & PGT_pae_xen_l2));
1001 if ( (page->u.inuse.type_info & PGT_count_mask) != 1 )
1002 {
1003 MEM_LOG("PAE L3 3rd slot is shared");
1004 return 0;
1005 }
1007 /* Xen private mappings. */
1008 pl2e = map_domain_page(l3e_get_pfn(l3e3));
1009 #ifndef CONFIG_COMPAT
1010 memcpy(&pl2e[L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1)],
1011 &idle_pg_table_l2[L2_PAGETABLE_FIRST_XEN_SLOT],
1012 L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
1013 for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
1014 {
1015 l2e = l2e_from_page(
1016 virt_to_page(page_get_owner(page)->arch.mm_perdomain_pt) + i,
1017 __PAGE_HYPERVISOR);
1018 l2e_write(&pl2e[l2_table_offset(PERDOMAIN_VIRT_START) + i], l2e);
1019 }
1020 for ( i = 0; i < (LINEARPT_MBYTES >> (L2_PAGETABLE_SHIFT - 20)); i++ )
1021 {
1022 l2e = l2e_empty();
1023 if ( l3e_get_flags(pl3e[i]) & _PAGE_PRESENT )
1024 l2e = l2e_from_pfn(l3e_get_pfn(pl3e[i]), __PAGE_HYPERVISOR);
1025 l2e_write(&pl2e[l2_table_offset(LINEAR_PT_VIRT_START) + i], l2e);
1026 }
1027 #else
1028 memcpy(&pl2e[COMPAT_L2_PAGETABLE_FIRST_XEN_SLOT(d)],
1029 &compat_idle_pg_table_l2[
1030 l2_table_offset(HIRO_COMPAT_MPT_VIRT_START)],
1031 COMPAT_L2_PAGETABLE_XEN_SLOTS(d) * sizeof(*pl2e));
1032 #endif
1033 unmap_domain_page(pl2e);
1035 return 1;
1036 }
1037 #else
1038 # define create_pae_xen_mappings(d, pl3e) (1)
1039 #endif
1041 #ifdef CONFIG_X86_PAE
1042 /* Flush a pgdir update into low-memory caches. */
1043 static void pae_flush_pgd(
1044 unsigned long mfn, unsigned int idx, l3_pgentry_t nl3e)
1046 struct domain *d = page_get_owner(mfn_to_page(mfn));
1047 struct vcpu *v;
1048 intpte_t _ol3e, _nl3e, _pl3e;
1049 l3_pgentry_t *l3tab_ptr;
1050 struct pae_l3_cache *cache;
1052 if ( unlikely(shadow_mode_enabled(d)) )
1054 cpumask_t m = CPU_MASK_NONE;
1055 /* Re-shadow this l3 table on any vcpus that are using it */
1056 for_each_vcpu ( d, v )
1057 if ( pagetable_get_pfn(v->arch.guest_table) == mfn )
1059 paging_update_cr3(v);
1060 cpus_or(m, m, v->vcpu_dirty_cpumask);
1062 flush_tlb_mask(m);
1065 /* If below 4GB then the pgdir is not shadowed in low memory. */
1066 if ( !l3tab_needs_shadow(mfn) )
1067 return;
1069 for_each_vcpu ( d, v )
1071 cache = &v->arch.pae_l3_cache;
1073 spin_lock(&cache->lock);
1075 if ( cache->high_mfn == mfn )
1077 l3tab_ptr = &cache->table[cache->inuse_idx][idx];
1078 _ol3e = l3e_get_intpte(*l3tab_ptr);
1079 _nl3e = l3e_get_intpte(nl3e);
1080 _pl3e = cmpxchg(&l3e_get_intpte(*l3tab_ptr), _ol3e, _nl3e);
1081 BUG_ON(_pl3e != _ol3e);
1084 spin_unlock(&cache->lock);
1087 flush_tlb_mask(d->domain_dirty_cpumask);
1089 #else
1090 # define pae_flush_pgd(mfn, idx, nl3e) ((void)0)
1091 #endif
1093 static int alloc_l2_table(struct page_info *page, unsigned long type)
1095 struct domain *d = page_get_owner(page);
1096 unsigned long pfn = page_to_mfn(page);
1097 l2_pgentry_t *pl2e;
1098 int i;
1100 pl2e = map_domain_page(pfn);
1102 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
1104 if ( is_guest_l2_slot(d, type, i) &&
1105 unlikely(!get_page_from_l2e(pl2e[i], pfn, d)) )
1106 goto fail;
1108 adjust_guest_l2e(pl2e[i], d);
1111 #if CONFIG_PAGING_LEVELS == 2
1112 /* Xen private mappings. */
1113 memcpy(&pl2e[L2_PAGETABLE_FIRST_XEN_SLOT],
1114 &idle_pg_table[L2_PAGETABLE_FIRST_XEN_SLOT],
1115 L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
1116 pl2e[l2_table_offset(LINEAR_PT_VIRT_START)] =
1117 l2e_from_pfn(pfn, __PAGE_HYPERVISOR);
1118 for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
1119 pl2e[l2_table_offset(PERDOMAIN_VIRT_START) + i] =
1120 l2e_from_page(
1121 virt_to_page(page_get_owner(page)->arch.mm_perdomain_pt) + i,
1122 __PAGE_HYPERVISOR);
1123 #endif
1125 unmap_domain_page(pl2e);
1126 return 1;
1128 fail:
1129 MEM_LOG("Failure in alloc_l2_table: entry %d", i);
1130 while ( i-- > 0 )
1131 if ( is_guest_l2_slot(d, type, i) )
1132 put_page_from_l2e(pl2e[i], pfn);
1134 unmap_domain_page(pl2e);
1135 return 0;
1139 #if CONFIG_PAGING_LEVELS >= 3
1140 static int alloc_l3_table(struct page_info *page)
1142 struct domain *d = page_get_owner(page);
1143 unsigned long pfn = page_to_mfn(page);
1144 l3_pgentry_t *pl3e;
1145 int i;
1147 #ifdef CONFIG_X86_PAE
1148 /*
1149 * PAE pgdirs above 4GB are unacceptable if the guest does not understand
1150 * the weird 'extended cr3' format for dealing with high-order address
1151 * bits. We cut some slack for control tools (before vcpu0 is initialised).
1152 */
1153 if ( (pfn >= 0x100000) &&
1154 unlikely(!VM_ASSIST(d, VMASST_TYPE_pae_extended_cr3)) &&
1155 d->vcpu[0] && d->vcpu[0]->is_initialised )
1157 MEM_LOG("PAE pgd must be below 4GB (0x%lx >= 0x100000)", pfn);
1158 return 0;
1160 #endif
1162 pl3e = map_domain_page(pfn);
1164 /*
1165 * PAE guests allocate full pages, but aren't required to initialize
1166 * more than the first four entries; when running in compatibility
1167 * mode, however, the full page is visible to the MMU, and hence all
1168 * 512 entries must be valid/verified, which is most easily achieved
1169 * by clearing them out.
1170 */
1171 if ( is_pv_32on64_domain(d) )
1172 memset(pl3e + 4, 0, (L3_PAGETABLE_ENTRIES - 4) * sizeof(*pl3e));
1174 for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
1176 #if defined(CONFIG_X86_PAE) || defined(CONFIG_COMPAT)
1177 if ( is_pv_32bit_domain(d) && (i == 3) )
1179 if ( !(l3e_get_flags(pl3e[i]) & _PAGE_PRESENT) ||
1180 (l3e_get_flags(pl3e[i]) & l3_disallow_mask(d)) ||
1181 !get_page_and_type_from_pagenr(l3e_get_pfn(pl3e[i]),
1182 PGT_l2_page_table |
1183 PGT_pae_xen_l2,
1184 d) )
1185 goto fail;
1187 else
1188 #endif
1189 if ( is_guest_l3_slot(i) &&
1190 unlikely(!get_page_from_l3e(pl3e[i], pfn, d)) )
1191 goto fail;
1193 adjust_guest_l3e(pl3e[i], d);
1196 if ( !create_pae_xen_mappings(d, pl3e) )
1197 goto fail;
1199 unmap_domain_page(pl3e);
1200 return 1;
1202 fail:
1203 MEM_LOG("Failure in alloc_l3_table: entry %d", i);
1204 while ( i-- > 0 )
1205 if ( is_guest_l3_slot(i) )
1206 put_page_from_l3e(pl3e[i], pfn);
1208 unmap_domain_page(pl3e);
1209 return 0;
1211 #else
1212 #define alloc_l3_table(page) (0)
1213 #endif
1215 #if CONFIG_PAGING_LEVELS >= 4
1216 static int alloc_l4_table(struct page_info *page)
1218 struct domain *d = page_get_owner(page);
1219 unsigned long pfn = page_to_mfn(page);
1220 l4_pgentry_t *pl4e = page_to_virt(page);
1221 int i;
1223 for ( i = 0; i < L4_PAGETABLE_ENTRIES; i++ )
1225 if ( is_guest_l4_slot(d, i) &&
1226 unlikely(!get_page_from_l4e(pl4e[i], pfn, d)) )
1227 goto fail;
1229 adjust_guest_l4e(pl4e[i], d);
1232 /* Xen private mappings. */
1233 memcpy(&pl4e[ROOT_PAGETABLE_FIRST_XEN_SLOT],
1234 &idle_pg_table[ROOT_PAGETABLE_FIRST_XEN_SLOT],
1235 ROOT_PAGETABLE_XEN_SLOTS * sizeof(l4_pgentry_t));
1236 pl4e[l4_table_offset(LINEAR_PT_VIRT_START)] =
1237 l4e_from_pfn(pfn, __PAGE_HYPERVISOR);
1238 pl4e[l4_table_offset(PERDOMAIN_VIRT_START)] =
1239 l4e_from_page(virt_to_page(d->arch.mm_perdomain_l3),
1240 __PAGE_HYPERVISOR);
1241 if ( is_pv_32on64_domain(d) )
1242 pl4e[l4_table_offset(COMPAT_ARG_XLAT_VIRT_BASE)] =
1243 l4e_from_page(virt_to_page(d->arch.mm_arg_xlat_l3),
1244 __PAGE_HYPERVISOR);
1246 return 1;
1248 fail:
1249 MEM_LOG("Failure in alloc_l4_table: entry %d", i);
1250 while ( i-- > 0 )
1251 if ( is_guest_l4_slot(d, i) )
1252 put_page_from_l4e(pl4e[i], pfn);
1254 return 0;
1256 #else
1257 #define alloc_l4_table(page) (0)
1258 #endif
1261 static void free_l1_table(struct page_info *page)
1263 struct domain *d = page_get_owner(page);
1264 unsigned long pfn = page_to_mfn(page);
1265 l1_pgentry_t *pl1e;
1266 int i;
1268 pl1e = map_domain_page(pfn);
1270 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
1271 if ( is_guest_l1_slot(i) )
1272 put_page_from_l1e(pl1e[i], d);
1274 unmap_domain_page(pl1e);
1278 static void free_l2_table(struct page_info *page)
1280 #ifdef CONFIG_COMPAT
1281 struct domain *d = page_get_owner(page);
1282 #endif
1283 unsigned long pfn = page_to_mfn(page);
1284 l2_pgentry_t *pl2e;
1285 int i;
1287 pl2e = map_domain_page(pfn);
1289 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
1290 if ( is_guest_l2_slot(d, page->u.inuse.type_info, i) )
1291 put_page_from_l2e(pl2e[i], pfn);
1293 unmap_domain_page(pl2e);
1295 page->u.inuse.type_info &= ~PGT_pae_xen_l2;
1299 #if CONFIG_PAGING_LEVELS >= 3
1301 static void free_l3_table(struct page_info *page)
1302 {
1303 struct domain *d = page_get_owner(page);
1304 unsigned long pfn = page_to_mfn(page);
1305 l3_pgentry_t *pl3e;
1306 int i;
1308 pl3e = map_domain_page(pfn);
1310 for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
1311 if ( is_guest_l3_slot(i) )
1312 {
1313 put_page_from_l3e(pl3e[i], pfn);
1314 unadjust_guest_l3e(pl3e[i], d);
1315 }
1317 unmap_domain_page(pl3e);
1318 }
1320 #endif
1322 #if CONFIG_PAGING_LEVELS >= 4
1324 static void free_l4_table(struct page_info *page)
1325 {
1326 struct domain *d = page_get_owner(page);
1327 unsigned long pfn = page_to_mfn(page);
1328 l4_pgentry_t *pl4e = page_to_virt(page);
1329 int i;
1331 for ( i = 0; i < L4_PAGETABLE_ENTRIES; i++ )
1332 if ( is_guest_l4_slot(d, i) )
1333 put_page_from_l4e(pl4e[i], pfn);
1334 }
1336 #endif
1339 /* How to write an entry to the guest pagetables.
1340 * Returns 0 for failure (pointer not valid), 1 for success. */
1341 static inline int update_intpte(intpte_t *p,
1342 intpte_t old,
1343 intpte_t new,
1344 unsigned long mfn,
1345 struct vcpu *v)
1347 int rv = 1;
1348 #ifndef PTE_UPDATE_WITH_CMPXCHG
1349 rv = paging_write_guest_entry(v, p, new, _mfn(mfn));
1350 #else
1352 intpte_t t = old;
1353 for ( ; ; )
1355 rv = paging_cmpxchg_guest_entry(v, p, &t, new, _mfn(mfn));
1356 if ( unlikely(rv == 0) )
1358 MEM_LOG("Failed to update %" PRIpte " -> %" PRIpte
1359 ": saw %" PRIpte, old, new, t);
1360 break;
1363 if ( t == old )
1364 break;
1366 /* Allowed to change in Accessed/Dirty flags only. */
1367 BUG_ON((t ^ old) & ~(intpte_t)(_PAGE_ACCESSED|_PAGE_DIRTY));
1369 old = t;
1372 #endif
1373 return rv;
1376 /* Macro that wraps the appropriate type-changes around update_intpte().
1377 * Arguments are: type, ptr, old, new, mfn, vcpu */
1378 #define UPDATE_ENTRY(_t,_p,_o,_n,_m,_v) \
1379 update_intpte(&_t ## e_get_intpte(*(_p)), \
1380 _t ## e_get_intpte(_o), _t ## e_get_intpte(_n), \
1381 (_m), (_v))
1383 /* Update the L1 entry at pl1e to new value nl1e. */
1384 static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e,
1385 unsigned long gl1mfn)
1387 l1_pgentry_t ol1e;
1388 struct vcpu *curr = current;
1389 struct domain *d = curr->domain;
1390 unsigned long mfn;
1392 if ( unlikely(__copy_from_user(&ol1e, pl1e, sizeof(ol1e)) != 0) )
1393 return 0;
1395 if ( unlikely(paging_mode_refcounts(d)) )
1396 return UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, curr);
1398 if ( l1e_get_flags(nl1e) & _PAGE_PRESENT )
1400 /* Translate foreign guest addresses. */
1401 mfn = gmfn_to_mfn(FOREIGNDOM, l1e_get_pfn(nl1e));
1402 if ( unlikely(mfn == INVALID_MFN) )
1403 return 0;
1404 ASSERT((mfn & ~(PADDR_MASK >> PAGE_SHIFT)) == 0);
1405 nl1e = l1e_from_pfn(mfn, l1e_get_flags(nl1e));
1407 if ( unlikely(l1e_get_flags(nl1e) & l1_disallow_mask(d)) )
1409 MEM_LOG("Bad L1 flags %x",
1410 l1e_get_flags(nl1e) & l1_disallow_mask(d));
1411 return 0;
1414 adjust_guest_l1e(nl1e, d);
1416 /* Fast path for identical mapping, r/w and presence. */
1417 if ( !l1e_has_changed(ol1e, nl1e, _PAGE_RW | _PAGE_PRESENT) )
1418 return UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, curr);
1420 if ( unlikely(!get_page_from_l1e(nl1e, FOREIGNDOM)) )
1421 return 0;
1423 if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, curr)) )
1425 put_page_from_l1e(nl1e, d);
1426 return 0;
1429 else
1431 if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, curr)) )
1432 return 0;
1435 put_page_from_l1e(ol1e, d);
1436 return 1;
1440 /* Update the L2 entry at pl2e to new value nl2e. pl2e is within frame pfn. */
1441 static int mod_l2_entry(l2_pgentry_t *pl2e,
1442 l2_pgentry_t nl2e,
1443 unsigned long pfn,
1444 unsigned long type)
1446 l2_pgentry_t ol2e;
1447 struct vcpu *curr = current;
1448 struct domain *d = curr->domain;
1450 if ( unlikely(!is_guest_l2_slot(d, type, pgentry_ptr_to_slot(pl2e))) )
1452 MEM_LOG("Illegal L2 update attempt in Xen-private area %p", pl2e);
1453 return 0;
1456 if ( unlikely(__copy_from_user(&ol2e, pl2e, sizeof(ol2e)) != 0) )
1457 return 0;
1459 if ( l2e_get_flags(nl2e) & _PAGE_PRESENT )
1461 if ( unlikely(l2e_get_flags(nl2e) & L2_DISALLOW_MASK) )
1463 MEM_LOG("Bad L2 flags %x",
1464 l2e_get_flags(nl2e) & L2_DISALLOW_MASK);
1465 return 0;
1468 adjust_guest_l2e(nl2e, d);
1470 /* Fast path for identical mapping and presence. */
1471 if ( !l2e_has_changed(ol2e, nl2e, _PAGE_PRESENT))
1472 return UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, curr);
1474 if ( unlikely(!get_page_from_l2e(nl2e, pfn, d)) )
1475 return 0;
1477 if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, curr)) )
1479 put_page_from_l2e(nl2e, pfn);
1480 return 0;
1483 else if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, curr)) )
1485 return 0;
1488 put_page_from_l2e(ol2e, pfn);
1489 return 1;
1492 #if CONFIG_PAGING_LEVELS >= 3
1494 /* Update the L3 entry at pl3e to new value nl3e. pl3e is within frame pfn. */
1495 static int mod_l3_entry(l3_pgentry_t *pl3e,
1496 l3_pgentry_t nl3e,
1497 unsigned long pfn)
1499 l3_pgentry_t ol3e;
1500 struct vcpu *curr = current;
1501 struct domain *d = curr->domain;
1502 int okay;
1504 if ( unlikely(!is_guest_l3_slot(pgentry_ptr_to_slot(pl3e))) )
1506 MEM_LOG("Illegal L3 update attempt in Xen-private area %p", pl3e);
1507 return 0;
1510 #if defined(CONFIG_X86_PAE) || defined(CONFIG_COMPAT)
1511 /*
1512 * Disallow updates to final L3 slot. It contains Xen mappings, and it
1513 * would be a pain to ensure they remain continuously valid throughout.
1514 */
1515 if ( is_pv_32bit_domain(d) && (pgentry_ptr_to_slot(pl3e) >= 3) )
1516 return 0;
1517 #endif
1519 if ( unlikely(__copy_from_user(&ol3e, pl3e, sizeof(ol3e)) != 0) )
1520 return 0;
1522 if ( l3e_get_flags(nl3e) & _PAGE_PRESENT )
1524 if ( unlikely(l3e_get_flags(nl3e) & l3_disallow_mask(d)) )
1526 MEM_LOG("Bad L3 flags %x",
1527 l3e_get_flags(nl3e) & l3_disallow_mask(d));
1528 return 0;
1531 adjust_guest_l3e(nl3e, d);
1533 /* Fast path for identical mapping and presence. */
1534 if (!l3e_has_changed(ol3e, nl3e, _PAGE_PRESENT))
1535 return UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, curr);
1537 if ( unlikely(!get_page_from_l3e(nl3e, pfn, d)) )
1538 return 0;
1540 if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, curr)) )
1542 put_page_from_l3e(nl3e, pfn);
1543 return 0;
1546 else if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, curr)) )
1548 return 0;
1551 okay = create_pae_xen_mappings(d, pl3e);
1552 BUG_ON(!okay);
1554 pae_flush_pgd(pfn, pgentry_ptr_to_slot(pl3e), nl3e);
1556 put_page_from_l3e(ol3e, pfn);
1557 return 1;
1560 #endif
1562 #if CONFIG_PAGING_LEVELS >= 4
1564 /* Update the L4 entry at pl4e to new value nl4e. pl4e is within frame pfn. */
1565 static int mod_l4_entry(l4_pgentry_t *pl4e,
1566 l4_pgentry_t nl4e,
1567 unsigned long pfn)
1569 struct vcpu *curr = current;
1570 struct domain *d = curr->domain;
1571 l4_pgentry_t ol4e;
1573 if ( unlikely(!is_guest_l4_slot(d, pgentry_ptr_to_slot(pl4e))) )
1575 MEM_LOG("Illegal L4 update attempt in Xen-private area %p", pl4e);
1576 return 0;
1579 if ( unlikely(__copy_from_user(&ol4e, pl4e, sizeof(ol4e)) != 0) )
1580 return 0;
1582 if ( l4e_get_flags(nl4e) & _PAGE_PRESENT )
1584 if ( unlikely(l4e_get_flags(nl4e) & L4_DISALLOW_MASK) )
1586 MEM_LOG("Bad L4 flags %x",
1587 l4e_get_flags(nl4e) & L4_DISALLOW_MASK);
1588 return 0;
1591 adjust_guest_l4e(nl4e, d);
1593 /* Fast path for identical mapping and presence. */
1594 if (!l4e_has_changed(ol4e, nl4e, _PAGE_PRESENT))
1595 return UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, curr);
1597 if ( unlikely(!get_page_from_l4e(nl4e, pfn, d)) )
1598 return 0;
1600 if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, curr)) )
1602 put_page_from_l4e(nl4e, pfn);
1603 return 0;
1606 else if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, curr)) )
1608 return 0;
1611 put_page_from_l4e(ol4e, pfn);
1612 return 1;
1615 #endif
1617 void put_page(struct page_info *page)
1618 {
1619 u32 nx, x, y = page->count_info;
1621 do {
1622 x = y;
1623 nx = x - 1;
1624 }
1625 while ( unlikely((y = cmpxchg(&page->count_info, x, nx)) != x) );
1627 if ( unlikely((nx & PGC_count_mask) == 0) )
1628 {
1629 cleanup_page_cacheattr(page);
1630 free_domheap_page(page);
1631 }
1632 }
1635 int get_page(struct page_info *page, struct domain *domain)
1637 u32 x, nx, y = page->count_info;
1638 u32 d, nd = page->u.inuse._domain;
1639 u32 _domain = pickle_domptr(domain);
1641 do {
1642 x = y;
1643 nx = x + 1;
1644 d = nd;
1645 if ( unlikely((x & PGC_count_mask) == 0) || /* Not allocated? */
1646 unlikely((nx & PGC_count_mask) == 0) || /* Count overflow? */
1647 unlikely(d != _domain) ) /* Wrong owner? */
1649 if ( !_shadow_mode_refcounts(domain) && !domain->is_dying )
1650 gdprintk(XENLOG_INFO,
1651 "Error pfn %lx: rd=%p, od=%p, caf=%08x, taf=%"
1652 PRtype_info "\n",
1653 page_to_mfn(page), domain, unpickle_domptr(d),
1654 x, page->u.inuse.type_info);
1655 return 0;
1657 asm volatile (
1658 LOCK_PREFIX "cmpxchg8b %3"
1659 : "=d" (nd), "=a" (y), "=c" (d),
1660 "=m" (*(volatile u64 *)(&page->count_info))
1661 : "0" (d), "1" (x), "c" (d), "b" (nx) );
1663 while ( unlikely(nd != d) || unlikely(y != x) );
1665 return 1;
1669 static int alloc_page_type(struct page_info *page, unsigned long type)
1671 struct domain *owner = page_get_owner(page);
1673 /* A page table is dirtied when its type count becomes non-zero. */
1674 if ( likely(owner != NULL) )
1675 paging_mark_dirty(owner, page_to_mfn(page));
1677 switch ( type & PGT_type_mask )
1679 case PGT_l1_page_table:
1680 return alloc_l1_table(page);
1681 case PGT_l2_page_table:
1682 return alloc_l2_table(page, type);
1683 case PGT_l3_page_table:
1684 return alloc_l3_table(page);
1685 case PGT_l4_page_table:
1686 return alloc_l4_table(page);
1687 case PGT_gdt_page:
1688 case PGT_ldt_page:
1689 return alloc_segdesc_page(page);
1690 default:
1691 printk("Bad type in alloc_page_type %lx t=%" PRtype_info " c=%x\n",
1692 type, page->u.inuse.type_info,
1693 page->count_info);
1694 BUG();
1697 return 0;
1701 void free_page_type(struct page_info *page, unsigned long type)
1703 struct domain *owner = page_get_owner(page);
1704 unsigned long gmfn;
1706 if ( likely(owner != NULL) )
1708 /*
1709 * We have to flush before the next use of the linear mapping
1710 * (e.g., update_va_mapping()) or we could end up modifying a page
1711 * that is no longer a page table (and hence screw up ref counts).
1712 */
1713 if ( current->domain == owner )
1714 queue_deferred_ops(owner, DOP_FLUSH_ALL_TLBS);
1715 else
1716 flush_tlb_mask(owner->domain_dirty_cpumask);
1718 if ( unlikely(paging_mode_enabled(owner)) )
1720 /* A page table is dirtied when its type count becomes zero. */
1721 paging_mark_dirty(owner, page_to_mfn(page));
1723 if ( shadow_mode_refcounts(owner) )
1724 return;
1726 gmfn = mfn_to_gmfn(owner, page_to_mfn(page));
1727 ASSERT(VALID_M2P(gmfn));
1728 shadow_remove_all_shadows(owner->vcpu[0], _mfn(gmfn));
1732 switch ( type & PGT_type_mask )
1734 case PGT_l1_page_table:
1735 free_l1_table(page);
1736 break;
1738 case PGT_l2_page_table:
1739 free_l2_table(page);
1740 break;
1742 #if CONFIG_PAGING_LEVELS >= 3
1743 case PGT_l3_page_table:
1744 free_l3_table(page);
1745 break;
1746 #endif
1748 #if CONFIG_PAGING_LEVELS >= 4
1749 case PGT_l4_page_table:
1750 free_l4_table(page);
1751 break;
1752 #endif
1754 default:
1755 printk("%s: type %lx pfn %lx\n",__FUNCTION__,
1756 type, page_to_mfn(page));
1757 BUG();
1762 void put_page_type(struct page_info *page)
1764 unsigned long nx, x, y = page->u.inuse.type_info;
1766 again:
1767 do {
1768 x = y;
1769 nx = x - 1;
1771 ASSERT((x & PGT_count_mask) != 0);
1773 if ( unlikely((nx & PGT_count_mask) == 0) )
1775 if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
1776 likely(nx & PGT_validated) )
1778 /*
1779 * Page-table pages must be unvalidated when count is zero. The
1780 * 'free' is safe because the refcnt is non-zero and validated
1781 * bit is clear => other ops will spin or fail.
1782 */
1783 if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x,
1784 x & ~PGT_validated)) != x) )
1785 goto again;
1786 /* We cleared the 'valid bit' so we do the cleanup. */
1787 free_page_type(page, x);
1788 /* Carry on, but with the 'valid bit' now clear. */
1789 x &= ~PGT_validated;
1790 nx &= ~PGT_validated;
1793 /*
1794 * Record TLB information for flush later. We do not stamp page
1795 * tables when running in shadow mode:
1796 * 1. Pointless, since it's the shadow pt's which must be tracked.
1797 * 2. Shadow mode reuses this field for shadowed page tables to
1798 * store flags info -- we don't want to conflict with that.
1799 */
1800 if ( !(shadow_mode_enabled(page_get_owner(page)) &&
1801 (page->count_info & PGC_page_table)) )
1802 page->tlbflush_timestamp = tlbflush_current_time();
1805 while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
1809 int get_page_type(struct page_info *page, unsigned long type)
1810 {
1811 unsigned long nx, x, y = page->u.inuse.type_info;
1813 ASSERT(!(type & ~(PGT_type_mask | PGT_pae_xen_l2)));
1815 again:
1816 do {
1817 x = y;
1818 nx = x + 1;
1819 if ( unlikely((nx & PGT_count_mask) == 0) )
1820 {
1821 MEM_LOG("Type count overflow on pfn %lx", page_to_mfn(page));
1822 return 0;
1823 }
1824 else if ( unlikely((x & PGT_count_mask) == 0) )
1825 {
1826 struct domain *d = page_get_owner(page);
1828 /* Never allow a shadowed frame to go from type count 0 to 1 */
1829 if ( d && shadow_mode_enabled(d) )
1830 shadow_remove_all_shadows(d->vcpu[0], _mfn(page_to_mfn(page)));
1832 ASSERT(!(x & PGT_pae_xen_l2));
1833 if ( (x & PGT_type_mask) != type )
1834 {
1835 /*
1836 * On type change we check to flush stale TLB entries. This
1837 * may be unnecessary (e.g., page was GDT/LDT) but those
1838 * circumstances should be very rare.
1839 */
1840 cpumask_t mask = d->domain_dirty_cpumask;
1842 /* Don't flush if the timestamp is old enough */
1843 tlbflush_filter(mask, page->tlbflush_timestamp);
1845 if ( unlikely(!cpus_empty(mask)) &&
1846 /* Shadow mode: track only writable pages. */
1847 (!shadow_mode_enabled(page_get_owner(page)) ||
1848 ((nx & PGT_type_mask) == PGT_writable_page)) )
1849 {
1850 perfc_incr(need_flush_tlb_flush);
1851 flush_tlb_mask(mask);
1852 }
1854 /* We lose existing type and validity. */
1855 nx &= ~(PGT_type_mask | PGT_validated);
1856 nx |= type;
1858 /* No special validation needed for writable pages. */
1859 /* Page tables and GDT/LDT need to be scanned for validity. */
1860 if ( type == PGT_writable_page )
1861 nx |= PGT_validated;
1862 }
1863 }
1864 else if ( unlikely((x & (PGT_type_mask|PGT_pae_xen_l2)) != type) )
1865 {
1866 /* Don't log failure if it could be a recursive-mapping attempt. */
1867 if ( ((x & PGT_type_mask) == PGT_l2_page_table) &&
1868 (type == PGT_l1_page_table) )
1869 return 0;
1870 if ( ((x & PGT_type_mask) == PGT_l3_page_table) &&
1871 (type == PGT_l2_page_table) )
1872 return 0;
1873 if ( ((x & PGT_type_mask) == PGT_l4_page_table) &&
1874 (type == PGT_l3_page_table) )
1875 return 0;
1876 MEM_LOG("Bad type (saw %" PRtype_info " != exp %" PRtype_info ") "
1877 "for mfn %lx (pfn %lx)",
1878 x, type, page_to_mfn(page),
1879 get_gpfn_from_mfn(page_to_mfn(page)));
1880 return 0;
1881 }
1882 else if ( unlikely(!(x & PGT_validated)) )
1883 {
1884 /* Someone else is updating validation of this page. Wait... */
1885 while ( (y = page->u.inuse.type_info) == x )
1886 cpu_relax();
1887 goto again;
1888 }
1889 }
1890 while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
1892 if ( unlikely(!(nx & PGT_validated)) )
1893 {
1894 /* Try to validate page type; drop the new reference on failure. */
1895 if ( unlikely(!alloc_page_type(page, type)) )
1896 {
1897 MEM_LOG("Error while validating mfn %lx (pfn %lx) for type %"
1898 PRtype_info ": caf=%08x taf=%" PRtype_info,
1899 page_to_mfn(page), get_gpfn_from_mfn(page_to_mfn(page)),
1900 type, page->count_info, page->u.inuse.type_info);
1901 /* No one else can get a reference. We hold the only ref. */
1902 page->u.inuse.type_info = 0;
1903 return 0;
1904 }
1906 /* No one else is updating simultaneously. */
1907 __set_bit(_PGT_validated, &page->u.inuse.type_info);
1908 }
1910 return 1;
1911 }
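/*
 * Illustrative sketch, not part of this file: the usual caller pattern for
 * the type machinery above. A general reference and a type reference are
 * taken together; the 0 -> 1 type transition triggers validation via
 * alloc_page_type(), and the final put devalidates it again.
 */
#if 0 /* example only */
static int example_writable_use(struct page_info *page, struct domain *d)
{
    if ( !get_page_and_type(page, d, PGT_writable_page) )
        return 0;

    /* ... use the frame as a writable mapping here ... */

    put_page_and_type(page);
    return 1;
}
#endif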
1914 void cleanup_page_cacheattr(struct page_info *page)
1916 uint32_t cacheattr = (page->count_info >> PGC_cacheattr_base) & 7;
1918 if ( likely(cacheattr == 0) )
1919 return;
1921 page->count_info &= ~PGC_cacheattr_mask;
1923 BUG_ON(is_xen_heap_page(page));
1925 #ifdef __x86_64__
1926 map_pages_to_xen((unsigned long)page_to_virt(page), page_to_mfn(page),
1927 1, PAGE_HYPERVISOR);
1928 #endif
1932 int new_guest_cr3(unsigned long mfn)
1934 struct vcpu *v = current;
1935 struct domain *d = v->domain;
1936 int okay;
1937 unsigned long old_base_mfn;
1939 #ifdef CONFIG_COMPAT
1940 if ( is_pv_32on64_domain(d) )
1942 okay = paging_mode_refcounts(d)
1943 ? 0 /* Old code was broken, but what should it be? */
1944 : mod_l4_entry(
1945 __va(pagetable_get_paddr(v->arch.guest_table)),
1946 l4e_from_pfn(
1947 mfn,
1948 (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_ACCESSED)),
1949 pagetable_get_pfn(v->arch.guest_table));
1950 if ( unlikely(!okay) )
1952 MEM_LOG("Error while installing new compat baseptr %lx", mfn);
1953 return 0;
1956 invalidate_shadow_ldt(v);
1957 write_ptbase(v);
1959 return 1;
1961 #endif
1962 okay = paging_mode_refcounts(d)
1963 ? get_page_from_pagenr(mfn, d)
1964 : get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d);
1965 if ( unlikely(!okay) )
1967 MEM_LOG("Error while installing new baseptr %lx", mfn);
1968 return 0;
1971 invalidate_shadow_ldt(v);
1973 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
1975 v->arch.guest_table = pagetable_from_pfn(mfn);
1976 update_cr3(v);
1978 write_ptbase(v);
1980 if ( likely(old_base_mfn != 0) )
1982 if ( paging_mode_refcounts(d) )
1983 put_page(mfn_to_page(old_base_mfn));
1984 else
1985 put_page_and_type(mfn_to_page(old_base_mfn));
1988 return 1;
1991 static void process_deferred_ops(void)
1993 unsigned int deferred_ops;
1994 struct domain *d = current->domain;
1995 struct percpu_mm_info *info = &this_cpu(percpu_mm_info);
1997 deferred_ops = info->deferred_ops;
1998 info->deferred_ops = 0;
2000 if ( deferred_ops & (DOP_FLUSH_ALL_TLBS|DOP_FLUSH_TLB) )
2002 if ( deferred_ops & DOP_FLUSH_ALL_TLBS )
2003 flush_tlb_mask(d->domain_dirty_cpumask);
2004 else
2005 flush_tlb_local();
2008 if ( deferred_ops & DOP_RELOAD_LDT )
2009 (void)map_ldt_shadow_page(0);
2011 if ( unlikely(info->foreign != NULL) )
2013 rcu_unlock_domain(info->foreign);
2014 info->foreign = NULL;
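/*
 * Validate 'domid' as the foreign domain for the current hypercall and take
 * an RCU reference on it. DOMID_SELF is a no-op; a translated caller may
 * not name a foreign domain at all; unprivileged callers may only name
 * DOMID_IO; privileged callers may additionally name DOMID_XEN or any real
 * domain by id.
 */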
2018 static int set_foreigndom(domid_t domid)
2020 struct domain *e, *d = current->domain;
2021 struct percpu_mm_info *info = &this_cpu(percpu_mm_info);
2022 int okay = 1;
2024 ASSERT(info->foreign == NULL);
2026 if ( likely(domid == DOMID_SELF) )
2027 goto out;
2029 if ( unlikely(domid == d->domain_id) )
2031 MEM_LOG("Cannot specify itself as foreign domain");
2032 okay = 0;
2034 else if ( unlikely(paging_mode_translate(d)) )
2036 MEM_LOG("Cannot mix foreign mappings with translated domains");
2037 okay = 0;
2039 else if ( !IS_PRIV(d) )
2041 switch ( domid )
2043 case DOMID_IO:
2044 info->foreign = rcu_lock_domain(dom_io);
2045 break;
2046 default:
2047 MEM_LOG("Cannot set foreign dom");
2048 okay = 0;
2049 break;
2052 else
2054 info->foreign = e = rcu_lock_domain_by_id(domid);
2055 if ( e == NULL )
2057 switch ( domid )
2059 case DOMID_XEN:
2060 info->foreign = rcu_lock_domain(dom_xen);
2061 break;
2062 case DOMID_IO:
2063 info->foreign = rcu_lock_domain(dom_io);
2064 break;
2065 default:
2066 MEM_LOG("Unknown domain '%u'", domid);
2067 okay = 0;
2068 break;
2073 out:
2074 return okay;
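/*
 * Convert a guest-supplied bitmap of vcpu ids into the union of those
 * vcpus' dirty physical-CPU masks, for use by the *_MULTI TLB flush
 * operations below.
 */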
2077 static inline cpumask_t vcpumask_to_pcpumask(
2078 struct domain *d, unsigned long vmask)
2080 unsigned int vcpu_id;
2081 cpumask_t pmask = CPU_MASK_NONE;
2082 struct vcpu *v;
2084 while ( vmask != 0 )
2086 vcpu_id = find_first_set_bit(vmask);
2087 vmask &= ~(1UL << vcpu_id);
2088 if ( (vcpu_id < MAX_VIRT_CPUS) &&
2089 ((v = d->vcpu[vcpu_id]) != NULL) )
2090 cpus_or(pmask, pmask, v->vcpu_dirty_cpumask);
2093 return pmask;
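/*
 * Handle a batch of extended MMU operations: pinning/unpinning of page
 * tables, installing new base pointers, TLB and cache flushes, and
 * SET_LDT. The batch is restartable: on preemption a continuation is
 * created carrying the remaining count with MMU_UPDATE_PREEMPTED set.
 *
 * Illustrative guest-side sketch only (the exact hypercall wrapper name
 * and its availability depend on the guest OS; new_l2_mfn is a
 * placeholder for the frame being pinned):
 *
 *     struct mmuext_op op = { .cmd = MMUEXT_PIN_L2_TABLE };
 *     op.arg1.mfn = new_l2_mfn;
 *     HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF);
 */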
2096 int do_mmuext_op(
2097 XEN_GUEST_HANDLE(mmuext_op_t) uops,
2098 unsigned int count,
2099 XEN_GUEST_HANDLE(uint) pdone,
2100 unsigned int foreigndom)
2102 struct mmuext_op op;
2103 int rc = 0, i = 0, okay;
2104 unsigned long mfn = 0, gmfn = 0, type;
2105 unsigned int done = 0;
2106 struct page_info *page;
2107 struct vcpu *v = current;
2108 struct domain *d = v->domain;
2110 if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
2112 count &= ~MMU_UPDATE_PREEMPTED;
2113 if ( unlikely(!guest_handle_is_null(pdone)) )
2114 (void)copy_from_guest(&done, pdone, 1);
2116 else
2117 perfc_incr(calls_to_mmuext_op);
2119 if ( unlikely(!guest_handle_okay(uops, count)) )
2121 rc = -EFAULT;
2122 goto out;
2125 if ( !set_foreigndom(foreigndom) )
2127 rc = -ESRCH;
2128 goto out;
2131 LOCK_BIGLOCK(d);
2133 for ( i = 0; i < count; i++ )
2135 if ( hypercall_preempt_check() )
2137 rc = hypercall_create_continuation(
2138 __HYPERVISOR_mmuext_op, "hihi",
2139 uops, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
2140 break;
2143 if ( unlikely(__copy_from_guest(&op, uops, 1) != 0) )
2145 MEM_LOG("Bad __copy_from_guest");
2146 rc = -EFAULT;
2147 break;
2150 okay = 1;
2151 gmfn = op.arg1.mfn;
2152 mfn = gmfn_to_mfn(FOREIGNDOM, gmfn);
2153 page = mfn_to_page(mfn);
2155 switch ( op.cmd )
2157 case MMUEXT_PIN_L1_TABLE:
2158 type = PGT_l1_page_table;
2159 goto pin_page;
2161 case MMUEXT_PIN_L2_TABLE:
2162 type = PGT_l2_page_table;
2163 goto pin_page;
2165 case MMUEXT_PIN_L3_TABLE:
2166 type = PGT_l3_page_table;
2167 goto pin_page;
2169 case MMUEXT_PIN_L4_TABLE:
2170 if ( is_pv_32bit_domain(FOREIGNDOM) )
2171 break;
2172 type = PGT_l4_page_table;
2174 pin_page:
2175 rc = xsm_memory_pin_page(d, page);
2176 if ( rc )
2177 break;
2179 /* Ignore pinning of invalid paging levels. */
2180 if ( (op.cmd - MMUEXT_PIN_L1_TABLE) > (CONFIG_PAGING_LEVELS - 1) )
2181 break;
2183 if ( paging_mode_refcounts(FOREIGNDOM) )
2184 break;
2186 okay = get_page_and_type_from_pagenr(mfn, type, FOREIGNDOM);
2187 if ( unlikely(!okay) )
2189 MEM_LOG("Error while pinning mfn %lx", mfn);
2190 break;
2193 if ( unlikely(test_and_set_bit(_PGT_pinned,
2194 &page->u.inuse.type_info)) )
2196 MEM_LOG("Mfn %lx already pinned", mfn);
2197 put_page_and_type(page);
2198 okay = 0;
2199 break;
2202 /* A page is dirtied when its pin status is set. */
2203 paging_mark_dirty(d, mfn);
2205 /* We can race domain destruction (domain_relinquish_resources). */
2206 if ( unlikely(this_cpu(percpu_mm_info).foreign != NULL) )
2208 int drop_ref;
2209 spin_lock(&FOREIGNDOM->page_alloc_lock);
2210 drop_ref = (FOREIGNDOM->is_dying &&
2211 test_and_clear_bit(_PGT_pinned,
2212 &page->u.inuse.type_info));
2213 spin_unlock(&FOREIGNDOM->page_alloc_lock);
2214 if ( drop_ref )
2215 put_page_and_type(page);
2218 break;
2220 case MMUEXT_UNPIN_TABLE:
2221 if ( paging_mode_refcounts(d) )
2222 break;
2224 if ( unlikely(!(okay = get_page_from_pagenr(mfn, d))) )
2226 MEM_LOG("Mfn %lx bad domain (dom=%p)",
2227 mfn, page_get_owner(page));
2229 else if ( likely(test_and_clear_bit(_PGT_pinned,
2230 &page->u.inuse.type_info)) )
2232 put_page_and_type(page);
2233 put_page(page);
2234 /* A page is dirtied when its pin status is cleared. */
2235 paging_mark_dirty(d, mfn);
2237 else
2239 okay = 0;
2240 put_page(page);
2241 MEM_LOG("Mfn %lx not pinned", mfn);
2243 break;
2245 case MMUEXT_NEW_BASEPTR:
2246 okay = new_guest_cr3(mfn);
2247 this_cpu(percpu_mm_info).deferred_ops &= ~DOP_FLUSH_TLB;
2248 break;
2250 #ifdef __x86_64__
2251 case MMUEXT_NEW_USER_BASEPTR: {
2252 unsigned long old_mfn;
2254 if ( mfn != 0 )
2256 if ( paging_mode_refcounts(d) )
2257 okay = get_page_from_pagenr(mfn, d);
2258 else
2259 okay = get_page_and_type_from_pagenr(
2260 mfn, PGT_root_page_table, d);
2261 if ( unlikely(!okay) )
2263 MEM_LOG("Error while installing new mfn %lx", mfn);
2264 break;
2268 old_mfn = pagetable_get_pfn(v->arch.guest_table_user);
2269 v->arch.guest_table_user = pagetable_from_pfn(mfn);
2271 if ( old_mfn != 0 )
2273 if ( paging_mode_refcounts(d) )
2274 put_page(mfn_to_page(old_mfn));
2275 else
2276 put_page_and_type(mfn_to_page(old_mfn));
2279 break;
2281 #endif
2283 case MMUEXT_TLB_FLUSH_LOCAL:
2284 this_cpu(percpu_mm_info).deferred_ops |= DOP_FLUSH_TLB;
2285 break;
2287 case MMUEXT_INVLPG_LOCAL:
2288 if ( !paging_mode_enabled(d)
2289 || paging_invlpg(v, op.arg1.linear_addr) != 0 )
2290 flush_tlb_one_local(op.arg1.linear_addr);
2291 break;
2293 case MMUEXT_TLB_FLUSH_MULTI:
2294 case MMUEXT_INVLPG_MULTI:
2296 unsigned long vmask;
2297 cpumask_t pmask;
2298 if ( unlikely(copy_from_guest(&vmask, op.arg2.vcpumask, 1)) )
2300 okay = 0;
2301 break;
2303 pmask = vcpumask_to_pcpumask(d, vmask);
2304 if ( op.cmd == MMUEXT_TLB_FLUSH_MULTI )
2305 flush_tlb_mask(pmask);
2306 else
2307 flush_tlb_one_mask(pmask, op.arg1.linear_addr);
2308 break;
2311 case MMUEXT_TLB_FLUSH_ALL:
2312 flush_tlb_mask(d->domain_dirty_cpumask);
2313 break;
2315 case MMUEXT_INVLPG_ALL:
2316 flush_tlb_one_mask(d->domain_dirty_cpumask, op.arg1.linear_addr);
2317 break;
2319 case MMUEXT_FLUSH_CACHE:
2320 if ( unlikely(!cache_flush_permitted(d)) )
2322 MEM_LOG("Non-physdev domain tried to FLUSH_CACHE.");
2323 okay = 0;
2325 else
2327 wbinvd();
2329 break;
2331 case MMUEXT_SET_LDT:
2333 unsigned long ptr = op.arg1.linear_addr;
2334 unsigned long ents = op.arg2.nr_ents;
2336 if ( paging_mode_external(d) )
2338 MEM_LOG("ignoring SET_LDT hypercall from external domain");
2339 okay = 0;
2341 else if ( ((ptr & (PAGE_SIZE-1)) != 0) ||
2342 (ents > 8192) ||
2343 !array_access_ok(ptr, ents, LDT_ENTRY_SIZE) )
2345 okay = 0;
2346 MEM_LOG("Bad args to SET_LDT: ptr=%lx, ents=%lx", ptr, ents);
2348 else if ( (v->arch.guest_context.ldt_ents != ents) ||
2349 (v->arch.guest_context.ldt_base != ptr) )
2351 invalidate_shadow_ldt(v);
2352 v->arch.guest_context.ldt_base = ptr;
2353 v->arch.guest_context.ldt_ents = ents;
2354 load_LDT(v);
2355 this_cpu(percpu_mm_info).deferred_ops &= ~DOP_RELOAD_LDT;
2356 if ( ents != 0 )
2357 this_cpu(percpu_mm_info).deferred_ops |= DOP_RELOAD_LDT;
2359 break;
2362 default:
2363 MEM_LOG("Invalid extended pt command 0x%x", op.cmd);
2364 rc = -ENOSYS;
2365 okay = 0;
2366 break;
2369 if ( unlikely(!okay) )
2371 rc = rc ? rc : -EINVAL;
2372 break;
2375 guest_handle_add_offset(uops, 1);
2378 process_deferred_ops();
2380 UNLOCK_BIGLOCK(d);
2382 perfc_add(num_mmuext_ops, i);
2384 out:
2385 /* Add incremental work we have done to the @done output parameter. */
2386 if ( unlikely(!guest_handle_is_null(pdone)) )
2388 done += i;
2389 copy_to_guest(pdone, &done, 1);
2392 return rc;
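/*
 * Process a batch of mmu_update requests. The request type is encoded in
 * the low bits of req.ptr: MMU_NORMAL_PT_UPDATE writes a page-table entry
 * at that machine address, applied via the mod_lN_entry() handler matching
 * the page's current type (or as a plain write if the page is simply
 * writable), while MMU_MACHPHYS_UPDATE sets the machine-to-physical table
 * entry for the named frame. Preemption is handled exactly as in
 * do_mmuext_op() above.
 */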
2395 int do_mmu_update(
2396 XEN_GUEST_HANDLE(mmu_update_t) ureqs,
2397 unsigned int count,
2398 XEN_GUEST_HANDLE(uint) pdone,
2399 unsigned int foreigndom)
2401 struct mmu_update req;
2402 void *va;
2403 unsigned long gpfn, gmfn, mfn;
2404 struct page_info *page;
2405 int rc = 0, okay = 1, i = 0;
2406 unsigned int cmd, done = 0;
2407 struct vcpu *v = current;
2408 struct domain *d = v->domain;
2409 unsigned long type_info;
2410 struct domain_mmap_cache mapcache;
2412 if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
2414 count &= ~MMU_UPDATE_PREEMPTED;
2415 if ( unlikely(!guest_handle_is_null(pdone)) )
2416 (void)copy_from_guest(&done, pdone, 1);
2418 else
2419 perfc_incr(calls_to_mmu_update);
2421 if ( unlikely(!guest_handle_okay(ureqs, count)) )
2423 rc = -EFAULT;
2424 goto out;
2427 if ( !set_foreigndom(foreigndom) )
2429 rc = -ESRCH;
2430 goto out;
2433 domain_mmap_cache_init(&mapcache);
2435 LOCK_BIGLOCK(d);
2437 for ( i = 0; i < count; i++ )
2439 if ( hypercall_preempt_check() )
2441 rc = hypercall_create_continuation(
2442 __HYPERVISOR_mmu_update, "hihi",
2443 ureqs, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
2444 break;
2447 if ( unlikely(__copy_from_guest(&req, ureqs, 1) != 0) )
2449 MEM_LOG("Bad __copy_from_guest");
2450 rc = -EFAULT;
2451 break;
2454 cmd = req.ptr & (sizeof(l1_pgentry_t)-1);
2455 okay = 0;
2457 switch ( cmd )
2459 /*
2460 * MMU_NORMAL_PT_UPDATE: Normal update to any level of page table.
2461 */
2462 case MMU_NORMAL_PT_UPDATE:
2464 rc = xsm_mmu_normal_update(d, req.val);
2465 if ( rc )
2466 break;
2468 gmfn = req.ptr >> PAGE_SHIFT;
2469 mfn = gmfn_to_mfn(d, gmfn);
2471 if ( unlikely(!get_page_from_pagenr(mfn, d)) )
2473 MEM_LOG("Could not get page for normal update");
2474 break;
2477 va = map_domain_page_with_cache(mfn, &mapcache);
2478 va = (void *)((unsigned long)va +
2479 (unsigned long)(req.ptr & ~PAGE_MASK));
2480 page = mfn_to_page(mfn);
2482 switch ( (type_info = page->u.inuse.type_info) & PGT_type_mask )
2484 case PGT_l1_page_table:
2485 case PGT_l2_page_table:
2486 case PGT_l3_page_table:
2487 case PGT_l4_page_table:
2489 if ( paging_mode_refcounts(d) )
2491 MEM_LOG("mmu update on auto-refcounted domain!");
2492 break;
2495 if ( unlikely(!get_page_type(
2496 page, type_info & (PGT_type_mask|PGT_pae_xen_l2))) )
2497 goto not_a_pt;
2499 switch ( type_info & PGT_type_mask )
2501 case PGT_l1_page_table:
2503 l1_pgentry_t l1e = l1e_from_intpte(req.val);
2504 okay = mod_l1_entry(va, l1e, mfn);
2506 break;
2507 case PGT_l2_page_table:
2509 l2_pgentry_t l2e = l2e_from_intpte(req.val);
2510 okay = mod_l2_entry(va, l2e, mfn, type_info);
2512 break;
2513 #if CONFIG_PAGING_LEVELS >= 3
2514 case PGT_l3_page_table:
2516 l3_pgentry_t l3e = l3e_from_intpte(req.val);
2517 okay = mod_l3_entry(va, l3e, mfn);
2519 break;
2520 #endif
2521 #if CONFIG_PAGING_LEVELS >= 4
2522 case PGT_l4_page_table:
2524 l4_pgentry_t l4e = l4e_from_intpte(req.val);
2525 okay = mod_l4_entry(va, l4e, mfn);
2527 break;
2528 #endif
2531 put_page_type(page);
2533 break;
2535 default:
2536 not_a_pt:
2538 if ( unlikely(!get_page_type(page, PGT_writable_page)) )
2539 break;
2541 okay = paging_write_guest_entry(v, va, req.val, _mfn(mfn));
2543 put_page_type(page);
2545 break;
2548 unmap_domain_page_with_cache(va, &mapcache);
2550 put_page(page);
2551 break;
2553 case MMU_MACHPHYS_UPDATE:
2555 mfn = req.ptr >> PAGE_SHIFT;
2556 gpfn = req.val;
2558 rc = xsm_mmu_machphys_update(d, mfn);
2559 if ( rc )
2560 break;
2562 if ( unlikely(!get_page_from_pagenr(mfn, FOREIGNDOM)) )
2564 MEM_LOG("Could not get page for mach->phys update");
2565 break;
2568 if ( unlikely(paging_mode_translate(FOREIGNDOM)) )
2570 MEM_LOG("Mach-phys update on auto-translate guest");
2571 break;
2574 set_gpfn_from_mfn(mfn, gpfn);
2575 okay = 1;
2577 paging_mark_dirty(FOREIGNDOM, mfn);
2579 put_page(mfn_to_page(mfn));
2580 break;
2582 default:
2583 MEM_LOG("Invalid page update command %x", cmd);
2584 rc = -ENOSYS;
2585 okay = 0;
2586 break;
2589 if ( unlikely(!okay) )
2591 rc = rc ? rc : -EINVAL;
2592 break;
2595 guest_handle_add_offset(ureqs, 1);
2598 process_deferred_ops();
2600 UNLOCK_BIGLOCK(d);
2602 domain_mmap_cache_destroy(&mapcache);
2604 perfc_add(num_page_updates, i);
2606 out:
2607 /* Add incremental work we have done to the @done output parameter. */
2608 if ( unlikely(!guest_handle_is_null(pdone)) )
2610 done += i;
2611 copy_to_guest(pdone, &done, 1);
2614 return rc;
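/*
 * Grant-table host-mapping helpers. A grant mapping can be installed
 * either by guest virtual address or, when GNTMAP_contains_pte is set, by
 * the machine address of the pte to be written; the create/destroy/replace
 * pairs below implement both flavours on behalf of
 * create_grant_host_mapping() and replace_grant_host_mapping().
 */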
2618 static int create_grant_pte_mapping(
2619 uint64_t pte_addr, l1_pgentry_t nl1e, struct vcpu *v)
2621 int rc = GNTST_okay;
2622 void *va;
2623 unsigned long gmfn, mfn;
2624 struct page_info *page;
2625 u32 type;
2626 l1_pgentry_t ol1e;
2627 struct domain *d = v->domain;
2629 ASSERT(spin_is_locked(&d->big_lock));
2631 adjust_guest_l1e(nl1e, d);
2633 gmfn = pte_addr >> PAGE_SHIFT;
2634 mfn = gmfn_to_mfn(d, gmfn);
2636 if ( unlikely(!get_page_from_pagenr(mfn, current->domain)) )
2638 MEM_LOG("Could not get page for normal update");
2639 return GNTST_general_error;
2642 va = map_domain_page(mfn);
2643 va = (void *)((unsigned long)va + ((unsigned long)pte_addr & ~PAGE_MASK));
2644 page = mfn_to_page(mfn);
2646 type = page->u.inuse.type_info & PGT_type_mask;
2647 if ( (type != PGT_l1_page_table) || !get_page_type(page, type) )
2649 MEM_LOG("Grant map attempted to update a non-L1 page");
2650 rc = GNTST_general_error;
2651 goto failed;
2654 ol1e = *(l1_pgentry_t *)va;
2655 if ( !UPDATE_ENTRY(l1, (l1_pgentry_t *)va, ol1e, nl1e, mfn, v) )
2657 put_page_type(page);
2658 rc = GNTST_general_error;
2659 goto failed;
2662 if ( !paging_mode_refcounts(d) )
2663 put_page_from_l1e(ol1e, d);
2665 put_page_type(page);
2667 failed:
2668 unmap_domain_page(va);
2669 put_page(page);
2671 return rc;
2674 static int destroy_grant_pte_mapping(
2675 uint64_t addr, unsigned long frame, struct domain *d)
2677 int rc = GNTST_okay;
2678 void *va;
2679 unsigned long gmfn, mfn;
2680 struct page_info *page;
2681 u32 type;
2682 l1_pgentry_t ol1e;
2684 gmfn = addr >> PAGE_SHIFT;
2685 mfn = gmfn_to_mfn(d, gmfn);
2687 if ( unlikely(!get_page_from_pagenr(mfn, current->domain)) )
2689 MEM_LOG("Could not get page for normal update");
2690 return GNTST_general_error;
2693 va = map_domain_page(mfn);
2694 va = (void *)((unsigned long)va + ((unsigned long)addr & ~PAGE_MASK));
2695 page = mfn_to_page(mfn);
2697 type = page->u.inuse.type_info & PGT_type_mask;
2698 if ( (type != PGT_l1_page_table) || !get_page_type(page, type) )
2700 MEM_LOG("Grant map attempted to update a non-L1 page");
2701 rc = GNTST_general_error;
2702 goto failed;
2705 if ( __copy_from_user(&ol1e, (l1_pgentry_t *)va, sizeof(ol1e)) )
2707 put_page_type(page);
2708 rc = GNTST_general_error;
2709 goto failed;
2712 /* Check that the virtual address supplied is actually mapped to frame. */
2713 if ( unlikely((l1e_get_intpte(ol1e) >> PAGE_SHIFT) != frame) )
2715 MEM_LOG("PTE entry %lx for address %"PRIx64" doesn't match frame %lx",
2716 (unsigned long)l1e_get_intpte(ol1e), addr, frame);
2717 put_page_type(page);
2718 rc = GNTST_general_error;
2719 goto failed;
2722 /* Delete pagetable entry. */
2723 if ( unlikely(!UPDATE_ENTRY(l1,
2724 (l1_pgentry_t *)va, ol1e, l1e_empty(), mfn,
2725 d->vcpu[0] /* Change if we go to per-vcpu shadows. */)) )
2727 MEM_LOG("Cannot delete PTE entry at %p", va);
2728 put_page_type(page);
2729 rc = GNTST_general_error;
2730 goto failed;
2733 put_page_type(page);
2735 failed:
2736 unmap_domain_page(va);
2737 put_page(page);
2738 return rc;
2742 static int create_grant_va_mapping(
2743 unsigned long va, l1_pgentry_t nl1e, struct vcpu *v)
2745 l1_pgentry_t *pl1e, ol1e;
2746 struct domain *d = v->domain;
2747 unsigned long gl1mfn;
2748 int okay;
2750 ASSERT(spin_is_locked(&d->big_lock));
2752 adjust_guest_l1e(nl1e, d);
2754 pl1e = guest_map_l1e(v, va, &gl1mfn);
2755 if ( !pl1e )
2757 MEM_LOG("Could not find L1 PTE for address %lx", va);
2758 return GNTST_general_error;
2760 ol1e = *pl1e;
2761 okay = UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, v);
2762 guest_unmap_l1e(v, pl1e);
2763 pl1e = NULL;
2765 if ( !okay )
2766 return GNTST_general_error;
2768 if ( !paging_mode_refcounts(d) )
2769 put_page_from_l1e(ol1e, d);
2771 return GNTST_okay;
2774 static int replace_grant_va_mapping(
2775 unsigned long addr, unsigned long frame, l1_pgentry_t nl1e, struct vcpu *v)
2777 l1_pgentry_t *pl1e, ol1e;
2778 unsigned long gl1mfn;
2779 int rc = 0;
2781 pl1e = guest_map_l1e(v, addr, &gl1mfn);
2782 if ( !pl1e )
2784 MEM_LOG("Could not find L1 PTE for address %lx", addr);
2785 return GNTST_general_error;
2787 ol1e = *pl1e;
2789 /* Check that the virtual address supplied is actually mapped to frame. */
2790 if ( unlikely(l1e_get_pfn(ol1e) != frame) )
2792 MEM_LOG("PTE entry %lx for address %lx doesn't match frame %lx",
2793 l1e_get_pfn(ol1e), addr, frame);
2794 rc = GNTST_general_error;
2795 goto out;
2798 /* Delete pagetable entry. */
2799 if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, v)) )
2801 MEM_LOG("Cannot delete PTE entry at %p", (unsigned long *)pl1e);
2802 rc = GNTST_general_error;
2803 goto out;
2806 out:
2807 guest_unmap_l1e(v, pl1e);
2808 return rc;
2811 static int destroy_grant_va_mapping(
2812 unsigned long addr, unsigned long frame, struct vcpu *v)
2814 return replace_grant_va_mapping(addr, frame, l1e_empty(), v);
2817 int create_grant_host_mapping(uint64_t addr, unsigned long frame,
2818 unsigned int flags, unsigned int cache_flags)
2820 l1_pgentry_t pte = l1e_from_pfn(frame, GRANT_PTE_FLAGS);
2822 if ( (flags & GNTMAP_application_map) )
2823 l1e_add_flags(pte,_PAGE_USER);
2824 if ( !(flags & GNTMAP_readonly) )
2825 l1e_add_flags(pte,_PAGE_RW);
2827 l1e_add_flags(pte, cacheattr_to_pte_flags(cache_flags >> 5));
2829 if ( flags & GNTMAP_contains_pte )
2830 return create_grant_pte_mapping(addr, pte, current);
2831 return create_grant_va_mapping(addr, pte, current);
2834 int replace_grant_host_mapping(
2835 uint64_t addr, unsigned long frame, uint64_t new_addr, unsigned int flags)
2837 struct vcpu *curr = current;
2838 l1_pgentry_t *pl1e, ol1e;
2839 unsigned long gl1mfn;
2840 int rc;
2842 if ( flags & GNTMAP_contains_pte )
2844 if ( !new_addr )
2845 return destroy_grant_pte_mapping(addr, frame, curr->domain);
2847 MEM_LOG("Unsupported grant table operation");
2848 return GNTST_general_error;
2851 if ( !new_addr )
2852 return destroy_grant_va_mapping(addr, frame, curr);
2854 pl1e = guest_map_l1e(curr, new_addr, &gl1mfn);
2855 if ( !pl1e )
2857 MEM_LOG("Could not find L1 PTE for address %lx",
2858 (unsigned long)new_addr);
2859 return GNTST_general_error;
2861 ol1e = *pl1e;
2863 if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, l1e_empty(), gl1mfn, curr)) )
2865 MEM_LOG("Cannot delete PTE entry at %p", (unsigned long *)pl1e);
2866 guest_unmap_l1e(curr, pl1e);
2867 return GNTST_general_error;
2870 guest_unmap_l1e(curr, pl1e);
2872 rc = replace_grant_va_mapping(addr, frame, ol1e, curr);
2873 if ( rc && !paging_mode_refcounts(curr->domain) )
2874 put_page_from_l1e(ol1e, curr->domain);
2876 return rc;
2879 int steal_page(
2880 struct domain *d, struct page_info *page, unsigned int memflags)
2882 u32 _d, _nd, x, y;
2884 spin_lock(&d->page_alloc_lock);
2886 /*
2887 * The tricky bit: atomically release ownership while there is just one
2888 * benign reference to the page (PGC_allocated). If that reference
2889 * disappears then the deallocation routine will safely spin.
2890 */
2891 _d = pickle_domptr(d);
2892 _nd = page->u.inuse._domain;
2893 y = page->count_info;
2894 do {
2895 x = y;
2896 if ( unlikely((x & (PGC_count_mask|PGC_allocated)) !=
2897 (1 | PGC_allocated)) || unlikely(_nd != _d) )
2899 MEM_LOG("gnttab_transfer: Bad page %p: ed=%p(%u), sd=%p,"
2900 " caf=%08x, taf=%" PRtype_info "\n",
2901 (void *) page_to_mfn(page),
2902 d, d->domain_id, unpickle_domptr(_nd), x,
2903 page->u.inuse.type_info);
2904 spin_unlock(&d->page_alloc_lock);
2905 return -1;
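/*
 * The cmpxchg8b below treats {count_info, _domain} as a single 64-bit
 * quantity: it compares it against the values sampled above and, only if
 * both are unchanged, clears the owner field while leaving the count
 * intact. On failure the freshly read values are returned in y/_nd for
 * another pass round the loop.
 */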
2907 asm volatile (
2908 LOCK_PREFIX "cmpxchg8b %2"
2909 : "=d" (_nd), "=a" (y),
2910 "=m" (*(volatile u64 *)(&page->count_info))
2911 : "0" (_d), "1" (x), "c" (NULL), "b" (x) );
2912 } while (unlikely(_nd != _d) || unlikely(y != x));
2914 /*
2915 * Unlink from 'd'. At least one reference remains (now anonymous), so
2916 * no one else is spinning trying to delete this page from 'd'.
2917 */
2918 if ( !(memflags & MEMF_no_refcount) )
2919 d->tot_pages--;
2920 list_del(&page->list);
2922 spin_unlock(&d->page_alloc_lock);
2924 return 0;
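/*
 * Update the single pte mapping 'va' for the current vcpu and then perform
 * the flush requested in 'flags': a full TLB flush or an INVLPG, each of
 * which may be local, domain-wide (all dirty CPUs), or targeted at a vcpu
 * bitmap whose guest address is carried in the non-flush-type bits of
 * 'flags'.
 *
 * Illustrative guest-side sketch only (the wrapper name varies by
 * guest OS):
 *
 *     HYPERVISOR_update_va_mapping(va, new_pte_val, UVMF_INVLPG | UVMF_LOCAL);
 */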
2927 int do_update_va_mapping(unsigned long va, u64 val64,
2928 unsigned long flags)
2930 l1_pgentry_t val = l1e_from_intpte(val64);
2931 struct vcpu *v = current;
2932 struct domain *d = v->domain;
2933 l1_pgentry_t *pl1e;
2934 unsigned long vmask, bmap_ptr, gl1mfn;
2935 cpumask_t pmask;
2936 int rc = 0;
2938 perfc_incr(calls_to_update_va);
2940 if ( unlikely(!__addr_ok(va) && !paging_mode_external(d)) )
2941 return -EINVAL;
2943 rc = xsm_update_va_mapping(current->domain, val);
2944 if ( rc )
2945 return rc;
2947 LOCK_BIGLOCK(d);
2949 pl1e = guest_map_l1e(v, va, &gl1mfn);
2951 if ( unlikely(!pl1e || !mod_l1_entry(pl1e, val, gl1mfn)) )
2952 rc = -EINVAL;
2954 if ( pl1e )
2955 guest_unmap_l1e(v, pl1e);
2956 pl1e = NULL;
2958 process_deferred_ops();
2960 UNLOCK_BIGLOCK(d);
2962 switch ( flags & UVMF_FLUSHTYPE_MASK )
2964 case UVMF_TLB_FLUSH:
2965 switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
2967 case UVMF_LOCAL:
2968 flush_tlb_local();
2969 break;
2970 case UVMF_ALL:
2971 flush_tlb_mask(d->domain_dirty_cpumask);
2972 break;
2973 default:
2974 if ( unlikely(!is_pv_32on64_domain(d) ?
2975 get_user(vmask, (unsigned long *)bmap_ptr) :
2976 get_user(vmask, (unsigned int *)bmap_ptr)) )
2977 rc = -EFAULT;
2978 pmask = vcpumask_to_pcpumask(d, vmask);
2979 flush_tlb_mask(pmask);
2980 break;
2982 break;
2984 case UVMF_INVLPG:
2985 switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
2987 case UVMF_LOCAL:
2988 if ( !paging_mode_enabled(d) ||
2989 (paging_invlpg(v, va) != 0) )
2990 flush_tlb_one_local(va);
2991 break;
2992 case UVMF_ALL:
2993 flush_tlb_one_mask(d->domain_dirty_cpumask, va);
2994 break;
2995 default:
2996 if ( unlikely(!is_pv_32on64_domain(d) ?
2997 get_user(vmask, (unsigned long *)bmap_ptr) :
2998 get_user(vmask, (unsigned int *)bmap_ptr)) )
2999 rc = -EFAULT;
3000 pmask = vcpumask_to_pcpumask(d, vmask);
3001 flush_tlb_one_mask(pmask, va);
3002 break;
3004 break;
3007 return rc;
3010 int do_update_va_mapping_otherdomain(unsigned long va, u64 val64,
3011 unsigned long flags,
3012 domid_t domid)
3014 int rc;
3016 if ( unlikely(!IS_PRIV(current->domain)) )
3017 return -EPERM;
3019 if ( !set_foreigndom(domid) )
3020 return -ESRCH;
3022 rc = do_update_va_mapping(va, val64, flags);
3024 BUG_ON(this_cpu(percpu_mm_info).deferred_ops);
3025 process_deferred_ops(); /* only to clear foreigndom */
3027 return rc;
3032 /*************************
3033 * Descriptor Tables
3034 */
3036 void destroy_gdt(struct vcpu *v)
3038 int i;
3039 unsigned long pfn;
3041 v->arch.guest_context.gdt_ents = 0;
3042 for ( i = 0; i < FIRST_RESERVED_GDT_PAGE; i++ )
3044 if ( (pfn = l1e_get_pfn(v->arch.perdomain_ptes[i])) != 0 )
3045 put_page_and_type(mfn_to_page(pfn));
3046 l1e_write(&v->arch.perdomain_ptes[i], l1e_empty());
3047 v->arch.guest_context.gdt_frames[i] = 0;
3052 long set_gdt(struct vcpu *v,
3053 unsigned long *frames,
3054 unsigned int entries)
3056 struct domain *d = v->domain;
3057 /* NB. There are 512 8-byte entries per GDT page. */
3058 int i, nr_pages = (entries + 511) / 512;
3059 unsigned long mfn;
3061 if ( entries > FIRST_RESERVED_GDT_ENTRY )
3062 return -EINVAL;
3064 /* Check the pages in the new GDT. */
3065 for ( i = 0; i < nr_pages; i++ )
3067 mfn = frames[i] = gmfn_to_mfn(d, frames[i]);
3068 if ( !mfn_valid(mfn) ||
3069 !get_page_and_type(mfn_to_page(mfn), d, PGT_gdt_page) )
3070 goto fail;
3073 /* Tear down the old GDT. */
3074 destroy_gdt(v);
3076 /* Install the new GDT. */
3077 v->arch.guest_context.gdt_ents = entries;
3078 for ( i = 0; i < nr_pages; i++ )
3080 v->arch.guest_context.gdt_frames[i] = frames[i];
3081 l1e_write(&v->arch.perdomain_ptes[i],
3082 l1e_from_pfn(frames[i], __PAGE_HYPERVISOR));
3085 return 0;
3087 fail:
3088 while ( i-- > 0 )
3089 put_page_and_type(mfn_to_page(frames[i]));
3090 return -EINVAL;
3094 long do_set_gdt(XEN_GUEST_HANDLE(ulong) frame_list, unsigned int entries)
3096 int nr_pages = (entries + 511) / 512;
3097 unsigned long frames[16];
3098 struct vcpu *curr = current;
3099 long ret;
3101 /* Rechecked in set_gdt, but ensures a sane limit for copy_from_user(). */
3102 if ( entries > FIRST_RESERVED_GDT_ENTRY )
3103 return -EINVAL;
3105 if ( copy_from_guest(frames, frame_list, nr_pages) )
3106 return -EFAULT;
3108 LOCK_BIGLOCK(curr->domain);
3110 if ( (ret = set_gdt(curr, frames, entries)) == 0 )
3111 flush_tlb_local();
3113 UNLOCK_BIGLOCK(curr->domain);
3115 return ret;
3119 long do_update_descriptor(u64 pa, u64 desc)
3121 struct domain *dom = current->domain;
3122 unsigned long gmfn = pa >> PAGE_SHIFT;
3123 unsigned long mfn;
3124 unsigned int offset;
3125 struct desc_struct *gdt_pent, d;
3126 struct page_info *page;
3127 long ret = -EINVAL;
3129 offset = ((unsigned int)pa & ~PAGE_MASK) / sizeof(struct desc_struct);
3131 *(u64 *)&d = desc;
3133 mfn = gmfn_to_mfn(dom, gmfn);
3134 if ( (((unsigned int)pa % sizeof(struct desc_struct)) != 0) ||
3135 !mfn_valid(mfn) ||
3136 !check_descriptor(dom, &d) )
3137 return -EINVAL;
3139 page = mfn_to_page(mfn);
3140 if ( unlikely(!get_page(page, dom)) )
3141 return -EINVAL;
3143 /* Check if the given frame is in use in an unsafe context. */
3144 switch ( page->u.inuse.type_info & PGT_type_mask )
3146 case PGT_gdt_page:
3147 if ( unlikely(!get_page_type(page, PGT_gdt_page)) )
3148 goto out;
3149 break;
3150 case PGT_ldt_page:
3151 if ( unlikely(!get_page_type(page, PGT_ldt_page)) )
3152 goto out;
3153 break;
3154 default:
3155 if ( unlikely(!get_page_type(page, PGT_writable_page)) )
3156 goto out;
3157 break;
3160 paging_mark_dirty(dom, mfn);
3162 /* All is good so make the update. */
3163 gdt_pent = map_domain_page(mfn);
3164 atomic_write64((uint64_t *)&gdt_pent[offset], *(uint64_t *)&d);
3165 unmap_domain_page(gdt_pent);
3167 put_page_type(page);
3169 ret = 0; /* success */
3171 out:
3172 put_page(page);
3174 return ret;
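/*
 * x86-specific memory-op subcommands: XENMEM_add_to_physmap wires the
 * shared-info frame or a grant-table frame into a translated guest's
 * physmap; XENMEM_set_memory_map and XENMEM_memory_map manage a domain's
 * pseudo-E820 map; XENMEM_machine_memory_map returns the host E820 to a
 * privileged caller; XENMEM_machphys_mapping describes the virtual range
 * of the machine-to-physical table. Anything else is forwarded to
 * subarch_memory_op().
 */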
3177 typedef struct e820entry e820entry_t;
3178 DEFINE_XEN_GUEST_HANDLE(e820entry_t);
3180 long arch_memory_op(int op, XEN_GUEST_HANDLE(void) arg)
3182 switch ( op )
3184 case XENMEM_add_to_physmap:
3186 struct xen_add_to_physmap xatp;
3187 unsigned long prev_mfn, mfn = 0, gpfn;
3188 struct domain *d;
3190 if ( copy_from_guest(&xatp, arg, 1) )
3191 return -EFAULT;
3193 if ( xatp.domid == DOMID_SELF )
3194 d = rcu_lock_current_domain();
3195 else if ( !IS_PRIV(current->domain) )
3196 return -EPERM;
3197 else if ( (d = rcu_lock_domain_by_id(xatp.domid)) == NULL )
3198 return -ESRCH;
3200 if ( xsm_add_to_physmap(current->domain, d) )
3202 rcu_unlock_domain(d);
3203 return -EPERM;
3206 switch ( xatp.space )
3208 case XENMAPSPACE_shared_info:
3209 if ( xatp.idx == 0 )
3210 mfn = virt_to_mfn(d->shared_info);
3211 /* XXX: assumes this is called after the E820 table has been built,
3212 * as the E820 map is needed to initialize the MTRRs.
3213 */
3214 if ( is_hvm_domain(d) ) {
3215 extern void init_mtrr_in_hyper(struct vcpu *);
3216 struct vcpu *vs;
3217 for_each_vcpu(d, vs)
3218 init_mtrr_in_hyper(vs);
3220 break;
3221 case XENMAPSPACE_grant_table:
3222 spin_lock(&d->grant_table->lock);
3224 if ( (xatp.idx >= nr_grant_frames(d->grant_table)) &&
3225 (xatp.idx < max_nr_grant_frames) )
3226 gnttab_grow_table(d, xatp.idx + 1);
3228 if ( xatp.idx < nr_grant_frames(d->grant_table) )
3229 mfn = virt_to_mfn(d->grant_table->shared[xatp.idx]);
3231 spin_unlock(&d->grant_table->lock);
3232 break;
3233 default:
3234 break;
3237 if ( !paging_mode_translate(d) || (mfn == 0) )
3239 rcu_unlock_domain(d);
3240 return -EINVAL;
3243 LOCK_BIGLOCK(d);
3245 /* Remove previously mapped page if it was present. */
3246 prev_mfn = gmfn_to_mfn(d, xatp.gpfn);
3247 if ( mfn_valid(prev_mfn) )
3249 if ( is_xen_heap_mfn(prev_mfn) )
3250 /* Xen heap frames are simply unhooked from this phys slot. */
3251 guest_physmap_remove_page(d, xatp.gpfn, prev_mfn);
3252 else
3253 /* Normal domain memory is freed, to avoid leaking memory. */
3254 guest_remove_page(d, xatp.gpfn);
3257 /* Unmap from old location, if any. */
3258 gpfn = get_gpfn_from_mfn(mfn);
3259 if ( gpfn != INVALID_M2P_ENTRY )
3260 guest_physmap_remove_page(d, gpfn, mfn);
3262 /* Map at new location. */
3263 guest_physmap_add_page(d, xatp.gpfn, mfn);
3265 UNLOCK_BIGLOCK(d);
3267 rcu_unlock_domain(d);
3269 break;
3272 case XENMEM_set_memory_map:
3274 struct xen_foreign_memory_map fmap;
3275 struct domain *d;
3276 int rc;
3278 if ( copy_from_guest(&fmap, arg, 1) )
3279 return -EFAULT;
3281 if ( fmap.map.nr_entries > ARRAY_SIZE(d->arch.e820) )
3282 return -EINVAL;
3284 if ( fmap.domid == DOMID_SELF )
3285 d = rcu_lock_current_domain();
3286 else if ( !IS_PRIV(current->domain) )
3287 return -EPERM;
3288 else if ( (d = rcu_lock_domain_by_id(fmap.domid)) == NULL )
3289 return -ESRCH;
3291 rc = xsm_domain_memory_map(d);
3292 if ( rc )
3294 rcu_unlock_domain(d);
3295 return rc;
3298 rc = copy_from_guest(d->arch.e820, fmap.map.buffer,
3299 fmap.map.nr_entries) ? -EFAULT : 0;
3300 d->arch.nr_e820 = fmap.map.nr_entries;
3302 rcu_unlock_domain(d);
3303 return rc;
3306 case XENMEM_memory_map:
3308 struct xen_memory_map map;
3309 struct domain *d = current->domain;
3311 /* Backwards compatibility. */
3312 if ( d->arch.nr_e820 == 0 )
3313 return -ENOSYS;
3315 if ( copy_from_guest(&map, arg, 1) )
3316 return -EFAULT;
3318 map.nr_entries = min(map.nr_entries, d->arch.nr_e820);
3319 if ( copy_to_guest(map.buffer, d->arch.e820, map.nr_entries) ||
3320 copy_to_guest(arg, &map, 1) )
3321 return -EFAULT;
3323 return 0;
3326 case XENMEM_machine_memory_map:
3328 struct xen_memory_map memmap;
3329 XEN_GUEST_HANDLE(e820entry_t) buffer;
3330 int count;
3331 int rc;
3333 if ( !IS_PRIV(current->domain) )
3334 return -EINVAL;
3336 rc = xsm_machine_memory_map();
3337 if ( rc )
3338 return rc;
3340 if ( copy_from_guest(&memmap, arg, 1) )
3341 return -EFAULT;
3342 if ( memmap.nr_entries < e820.nr_map + 1 )
3343 return -EINVAL;
3345 buffer = guest_handle_cast(memmap.buffer, e820entry_t);
3347 count = min((unsigned int)e820.nr_map, memmap.nr_entries);
3348 if ( copy_to_guest(buffer, e820.map, count) < 0 )
3349 return -EFAULT;
3351 memmap.nr_entries = count;
3353 if ( copy_to_guest(arg, &memmap, 1) )
3354 return -EFAULT;
3356 return 0;
3359 case XENMEM_machphys_mapping:
3361 static const struct xen_machphys_mapping mapping = {
3362 .v_start = MACH2PHYS_VIRT_START,
3363 .v_end = MACH2PHYS_VIRT_END,
3364 .max_mfn = MACH2PHYS_NR_ENTRIES - 1
3365 };
3367 if ( copy_to_guest(arg, &mapping, 1) )
3368 return -EFAULT;
3370 return 0;
3373 default:
3374 return subarch_memory_op(op, arg);
3377 return 0;
3381 /*************************
3382 * Writable Pagetables
3383 */
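/*
 * When a PV guest writes through a read-only mapping of one of its own L1
 * page tables, the fault is handled below by emulating the faulting
 * instruction with x86_emulate() and applying the attempted write through
 * the normal pte validation path.
 */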
3385 struct ptwr_emulate_ctxt {
3386 struct x86_emulate_ctxt ctxt;
3387 unsigned long cr2;
3388 l1_pgentry_t pte;
3389 };
3391 static int ptwr_emulated_read(
3392 enum x86_segment seg,
3393 unsigned long offset,
3394 unsigned long *val,
3395 unsigned int bytes,
3396 struct x86_emulate_ctxt *ctxt)
3398 unsigned int rc;
3399 unsigned long addr = offset;
3401 *val = 0;
3402 if ( (rc = copy_from_user((void *)val, (void *)addr, bytes)) != 0 )
3404 propagate_page_fault(addr + bytes - rc, 0); /* read fault */
3405 return X86EMUL_EXCEPTION;
3408 return X86EMUL_OKAY;
3411 static int ptwr_emulated_update(
3412 unsigned long addr,
3413 paddr_t old,
3414 paddr_t val,
3415 unsigned int bytes,
3416 unsigned int do_cmpxchg,
3417 struct ptwr_emulate_ctxt *ptwr_ctxt)
3419 unsigned long mfn;
3420 unsigned long unaligned_addr = addr;
3421 struct page_info *page;
3422 l1_pgentry_t pte, ol1e, nl1e, *pl1e;
3423 struct vcpu *v = current;
3424 struct domain *d = v->domain;
3426 /* Only allow naturally-aligned stores within the original %cr2 page. */
3427 if ( unlikely(((addr^ptwr_ctxt->cr2) & PAGE_MASK) || (addr & (bytes-1))) )
3429 MEM_LOG("Bad ptwr access (cr2=%lx, addr=%lx, bytes=%u)",
3430 ptwr_ctxt->cr2, addr, bytes);
3431 return X86EMUL_UNHANDLEABLE;
3434 /* Turn a sub-word access into a full-word access. */
3435 if ( bytes != sizeof(paddr_t) )
3437 paddr_t full;
3438 unsigned int rc, offset = addr & (sizeof(paddr_t)-1);
3440 /* Align address; read full word. */
3441 addr &= ~(sizeof(paddr_t)-1);
3442 if ( (rc = copy_from_user(&full, (void *)addr, sizeof(paddr_t))) != 0 )
3444 propagate_page_fault(addr+sizeof(paddr_t)-rc, 0); /* read fault */
3445 return X86EMUL_EXCEPTION;
3447 /* Mask out bits provided by caller. */
3448 full &= ~((((paddr_t)1 << (bytes*8)) - 1) << (offset*8));
3449 /* Shift the caller value and OR in the missing bits. */
3450 val &= (((paddr_t)1 << (bytes*8)) - 1);
3451 val <<= (offset)*8;
3452 val |= full;
3453 /* Also fill in missing parts of the cmpxchg old value. */
3454 old &= (((paddr_t)1 << (bytes*8)) - 1);
3455 old <<= (offset)*8;
3456 old |= full;
3459 pte = ptwr_ctxt->pte;
3460 mfn = l1e_get_pfn(pte);
3461 page = mfn_to_page(mfn);
3463 /* We are looking only for read-only mappings of p.t. pages. */
3464 ASSERT((l1e_get_flags(pte) & (_PAGE_RW|_PAGE_PRESENT)) == _PAGE_PRESENT);
3465 ASSERT(mfn_valid(mfn));
3466 ASSERT((page->u.inuse.type_info & PGT_type_mask) == PGT_l1_page_table);
3467 ASSERT((page->u.inuse.type_info & PGT_count_mask) != 0);
3468 ASSERT(page_get_owner(page) == d);
3470 /* Check the new PTE. */
3471 nl1e = l1e_from_intpte(val);
3472 if ( unlikely(!get_page_from_l1e(nl1e, d)) )
3474 if ( (CONFIG_PAGING_LEVELS >= 3) && is_pv_32bit_domain(d) &&
3475 (bytes == 4) && (unaligned_addr & 4) && !do_cmpxchg &&
3476 (l1e_get_flags(nl1e) & _PAGE_PRESENT) )
3478 /*
3479 * If this is an upper-half write to a PAE PTE then we assume that
3480 * the guest has simply got the two writes the wrong way round. We
3481 * zap the PRESENT bit on the assumption that the bottom half will
3482 * be written immediately after we return to the guest.
3483 */
3484 MEM_LOG("ptwr_emulate: fixing up invalid PAE PTE %"PRIpte,
3485 l1e_get_intpte(nl1e));
3486 l1e_remove_flags(nl1e, _PAGE_PRESENT);
3488 else
3490 MEM_LOG("ptwr_emulate: could not get_page_from_l1e()");
3491 return X86EMUL_UNHANDLEABLE;
3495 adjust_guest_l1e(nl1e, d);
3497 /* Checked successfully: do the update (write or cmpxchg). */
3498 pl1e = map_domain_page(mfn);
3499 pl1e = (l1_pgentry_t *)((unsigned long)pl1e + (addr & ~PAGE_MASK));
3500 if ( do_cmpxchg )
3502 int okay;
3503 intpte_t t = old;
3504 ol1e = l1e_from_intpte(old);
3506 okay = paging_cmpxchg_guest_entry(v, &l1e_get_intpte(*pl1e),
3507 &t, l1e_get_intpte(nl1e), _mfn(mfn));
3508 okay = (okay && t == old);
3510 if ( !okay )
3512 unmap_domain_page(pl1e);
3513 put_page_from_l1e(nl1e, d);
3514 return X86EMUL_CMPXCHG_FAILED;
3517 else
3519 ol1e = *pl1e;
3520 if ( !UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, mfn, v) )
3521 BUG();
3524 trace_ptwr_emulation(addr, nl1e);
3526 unmap_domain_page(pl1e);
3528 /* Finally, drop the old PTE. */
3529 put_page_from_l1e(ol1e, d);
3531 return X86EMUL_OKAY;
3534 static int ptwr_emulated_write(
3535 enum x86_segment seg,
3536 unsigned long offset,
3537 unsigned long val,
3538 unsigned int bytes,
3539 struct x86_emulate_ctxt *ctxt)
3541 return ptwr_emulated_update(
3542 offset, 0, val, bytes, 0,
3543 container_of(ctxt, struct ptwr_emulate_ctxt, ctxt));
3546 static int ptwr_emulated_cmpxchg(
3547 enum x86_segment seg,
3548 unsigned long offset,
3549 unsigned long old,
3550 unsigned long new,
3551 unsigned int bytes,
3552 struct x86_emulate_ctxt *ctxt)
3554 return ptwr_emulated_update(
3555 offset, old, new, bytes, 1,
3556 container_of(ctxt, struct ptwr_emulate_ctxt, ctxt));
3559 static int ptwr_emulated_cmpxchg8b(
3560 enum x86_segment seg,
3561 unsigned long offset,
3562 unsigned long old,
3563 unsigned long old_hi,
3564 unsigned long new,
3565 unsigned long new_hi,
3566 struct x86_emulate_ctxt *ctxt)
3568 if ( CONFIG_PAGING_LEVELS == 2 )
3569 return X86EMUL_UNHANDLEABLE;
3570 return ptwr_emulated_update(
3571 offset, ((u64)old_hi << 32) | old, ((u64)new_hi << 32) | new, 8, 1,
3572 container_of(ctxt, struct ptwr_emulate_ctxt, ctxt));
3575 static struct x86_emulate_ops ptwr_emulate_ops = {
3576 .read = ptwr_emulated_read,
3577 .insn_fetch = ptwr_emulated_read,
3578 .write = ptwr_emulated_write,
3579 .cmpxchg = ptwr_emulated_cmpxchg,
3580 .cmpxchg8b = ptwr_emulated_cmpxchg8b
3581 };
3583 /* Write page fault handler: check if guest is trying to modify a PTE. */
3584 int ptwr_do_page_fault(struct vcpu *v, unsigned long addr,
3585 struct cpu_user_regs *regs)
3587 struct domain *d = v->domain;
3588 struct page_info *page;
3589 l1_pgentry_t pte;
3590 struct ptwr_emulate_ctxt ptwr_ctxt;
3591 int rc;
3593 LOCK_BIGLOCK(d);
3595 /* Attempt to read the PTE that maps the VA being accessed. */
3596 guest_get_eff_l1e(v, addr, &pte);
3597 page = l1e_get_page(pte);
3599 /* We are looking only for read-only mappings of p.t. pages. */
3600 if ( ((l1e_get_flags(pte) & (_PAGE_PRESENT|_PAGE_RW)) != _PAGE_PRESENT) ||
3601 !mfn_valid(l1e_get_pfn(pte)) ||
3602 ((page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table) ||
3603 ((page->u.inuse.type_info & PGT_count_mask) == 0) ||
3604 (page_get_owner(page) != d) )
3605 goto bail;
3607 ptwr_ctxt.ctxt.regs = regs;
3608 ptwr_ctxt.ctxt.addr_size = ptwr_ctxt.ctxt.sp_size =
3609 is_pv_32on64_domain(d) ? 32 : BITS_PER_LONG;
3610 ptwr_ctxt.cr2 = addr;
3611 ptwr_ctxt.pte = pte;
3613 rc = x86_emulate(&ptwr_ctxt.ctxt, &ptwr_emulate_ops);
3614 if ( rc == X86EMUL_UNHANDLEABLE )
3615 goto bail;
3617 UNLOCK_BIGLOCK(d);
3618 perfc_incr(ptwr_emulations);
3619 return EXCRET_fault_fixed;
3621 bail:
3622 UNLOCK_BIGLOCK(d);
3623 return 0;
3626 void free_xen_pagetable(void *v)
3628 extern int early_boot;
3630 BUG_ON(early_boot);
3632 if ( is_xen_heap_page(virt_to_page(v)) )
3633 free_xenheap_page(v);
3634 else
3635 free_domheap_page(virt_to_page(v));
3638 /* Convert to/from superpage-mapping flags for map_pages_to_xen(). */
3639 #define l1f_to_l2f(f) ((f) | _PAGE_PSE)
3640 #define l2f_to_l1f(f) ((f) & ~_PAGE_PSE)
3642 /*
3643 * map_pages_to_xen() can be called with interrupts disabled:
3644 * * During early bootstrap; or
3645 * * alloc_xenheap_pages() via memguard_guard_range
3646 * In these cases it is safe to use flush_area_local():
3647 * * Because only the local CPU is online; or
3648 * * Because stale TLB entries do not matter for memguard_[un]guard_range().
3649 */
3650 #define flush_area(v,f) (!local_irq_is_enabled() ? \
3651 flush_area_local((const void *)v, f) : \
3652 flush_area_all((const void *)v, f))
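/*
 * Create mappings in Xen's own page tables covering nr_mfns frames
 * starting at 'mfn', at virtual address 'virt', with the given flags.
 * 2MB superpages are used whenever alignment and flags allow (and
 * MAP_SMALL_PAGES is not set); an existing superpage that is only
 * partially rewritten is first shattered into an L1 table, and, for
 * default PAGE_HYPERVISOR mappings, a fully populated contiguous L1 range
 * is merged back into a superpage afterwards.
 */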
3654 int map_pages_to_xen(
3655 unsigned long virt,
3656 unsigned long mfn,
3657 unsigned long nr_mfns,
3658 unsigned int flags)
3660 l2_pgentry_t *pl2e, ol2e;
3661 l1_pgentry_t *pl1e, ol1e;
3662 unsigned int i;
3664 while ( nr_mfns != 0 )
3666 pl2e = virt_to_xen_l2e(virt);
3668 if ( ((((virt>>PAGE_SHIFT) | mfn) & ((1<<PAGETABLE_ORDER)-1)) == 0) &&
3669 (nr_mfns >= (1<<PAGETABLE_ORDER)) &&
3670 !(flags & (_PAGE_PAT|MAP_SMALL_PAGES)) )
3672 /* Super-page mapping. */
3673 ol2e = *pl2e;
3674 l2e_write_atomic(pl2e, l2e_from_pfn(mfn, l1f_to_l2f(flags)));
3676 if ( (l2e_get_flags(ol2e) & _PAGE_PRESENT) )
3678 unsigned int flush_flags =
3679 FLUSH_TLB | FLUSH_ORDER(PAGETABLE_ORDER);
3681 if ( l2e_get_flags(ol2e) & _PAGE_PSE )
3683 if ( l2e_get_flags(ol2e) & _PAGE_GLOBAL )
3684 flush_flags |= FLUSH_TLB_GLOBAL;
3685 if ( (l2e_get_flags(ol2e) ^ l1f_to_l2f(flags)) &
3686 l1f_to_l2f(PAGE_CACHE_ATTRS) )
3687 flush_flags |= FLUSH_CACHE;
3688 flush_area(virt, flush_flags);
3690 else
3692 pl1e = l2e_to_l1e(ol2e);
3693 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
3695 if ( l1e_get_flags(pl1e[i]) & _PAGE_GLOBAL )
3696 flush_flags |= FLUSH_TLB_GLOBAL;
3697 if ( (l1e_get_flags(pl1e[i]) ^ flags) &
3698 PAGE_CACHE_ATTRS )
3699 flush_flags |= FLUSH_CACHE;
3701 flush_area(virt, flush_flags);
3702 free_xen_pagetable(pl1e);
3706 virt += 1UL << L2_PAGETABLE_SHIFT;
3707 mfn += 1UL << PAGETABLE_ORDER;
3708 nr_mfns -= 1UL << PAGETABLE_ORDER;
3710 else
3712 /* Normal page mapping. */
3713 if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) )
3715 pl1e = alloc_xen_pagetable();
3716 if ( pl1e == NULL )
3717 return -ENOMEM;
3718 clear_page(pl1e);
3719 l2e_write(pl2e, l2e_from_pfn(virt_to_mfn(pl1e),
3720 __PAGE_HYPERVISOR));
3722 else if ( l2e_get_flags(*pl2e) & _PAGE_PSE )
3724 unsigned int flush_flags =
3725 FLUSH_TLB | FLUSH_ORDER(PAGETABLE_ORDER);
3727 /* Skip this PTE if there is no change. */
3728 if ( (((l2e_get_pfn(*pl2e) & ~(L1_PAGETABLE_ENTRIES - 1)) +
3729 l1_table_offset(virt)) == mfn) &&
3730 (((l2f_to_l1f(l2e_get_flags(*pl2e)) ^ flags) &
3731 ~(_PAGE_ACCESSED|_PAGE_DIRTY)) == 0) )
3733 virt += 1UL << L1_PAGETABLE_SHIFT;
3734 mfn += 1UL;
3735 nr_mfns -= 1UL;
3736 continue;
3739 pl1e = alloc_xen_pagetable();
3740 if ( pl1e == NULL )
3741 return -ENOMEM;
3743 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
3744 l1e_write(&pl1e[i],
3745 l1e_from_pfn(l2e_get_pfn(*pl2e) + i,
3746 l2f_to_l1f(l2e_get_flags(*pl2e))));
3748 if ( l2e_get_flags(*pl2e) & _PAGE_GLOBAL )
3749 flush_flags |= FLUSH_TLB_GLOBAL;
3751 l2e_write_atomic(pl2e, l2e_from_pfn(virt_to_mfn(pl1e),
3752 __PAGE_HYPERVISOR));
3753 flush_area(virt, flush_flags);
3756 pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(virt);
3757 ol1e = *pl1e;
3758 l1e_write_atomic(pl1e, l1e_from_pfn(mfn, flags));
3759 if ( (l1e_get_flags(ol1e) & _PAGE_PRESENT) )
3761 unsigned int flush_flags = FLUSH_TLB | FLUSH_ORDER(0);
3762 if ( l1e_get_flags(ol1e) & _PAGE_GLOBAL )
3763 flush_flags |= FLUSH_TLB_GLOBAL;
3764 if ( (l1e_get_flags(ol1e) ^ flags) & PAGE_CACHE_ATTRS )
3765 flush_flags |= FLUSH_CACHE;
3766 flush_area(virt, flush_flags);
3769 virt += 1UL << L1_PAGETABLE_SHIFT;
3770 mfn += 1UL;
3771 nr_mfns -= 1UL;
3773 if ( (flags == PAGE_HYPERVISOR) &&
3774 ((nr_mfns == 0) ||
3775 ((((virt >> PAGE_SHIFT) | mfn) &
3776 ((1 << PAGETABLE_ORDER) - 1)) == 0)) )
3778 unsigned long base_mfn;
3779 pl1e = l2e_to_l1e(*pl2e);
3780 base_mfn = l1e_get_pfn(*pl1e) & ~(L1_PAGETABLE_ENTRIES - 1);
3781 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++, pl1e++ )
3782 if ( (l1e_get_pfn(*pl1e) != (base_mfn + i)) ||
3783 (l1e_get_flags(*pl1e) != flags) )
3784 break;
3785 if ( i == L1_PAGETABLE_ENTRIES )
3787 ol2e = *pl2e;
3788 l2e_write_atomic(pl2e, l2e_from_pfn(base_mfn,
3789 l1f_to_l2f(flags)));
3790 flush_area(virt, (FLUSH_TLB_GLOBAL |
3791 FLUSH_ORDER(PAGETABLE_ORDER)));
3792 free_xen_pagetable(l2e_to_l1e(ol2e));
3798 return 0;
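/*
 * Remove all Xen mappings in the range [s, e). Superpages wholly inside
 * the range are zapped directly; partially covered superpages are
 * shattered first. Any L1 table left completely empty is freed after a
 * global TLB flush.
 */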
3801 void destroy_xen_mappings(unsigned long s, unsigned long e)
3803 l2_pgentry_t *pl2e;
3804 l1_pgentry_t *pl1e;
3805 unsigned int i;
3806 unsigned long v = s;
3808 ASSERT((s & ~PAGE_MASK) == 0);
3809 ASSERT((e & ~PAGE_MASK) == 0);
3811 while ( v < e )
3813 pl2e = virt_to_xen_l2e(v);
3815 if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) )
3817 v += 1UL << L2_PAGETABLE_SHIFT;
3818 v &= ~((1UL << L2_PAGETABLE_SHIFT) - 1);
3819 continue;
3822 if ( l2e_get_flags(*pl2e) & _PAGE_PSE )
3824 if ( (l1_table_offset(v) == 0) &&
3825 ((e-v) >= (1UL << L2_PAGETABLE_SHIFT)) )
3827 /* PSE: whole superpage is destroyed. */
3828 l2e_write_atomic(pl2e, l2e_empty());
3829 v += 1UL << L2_PAGETABLE_SHIFT;
3831 else
3833 /* PSE: shatter the superpage and try again. */
3834 pl1e = alloc_xen_pagetable();
3835 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
3836 l1e_write(&pl1e[i],
3837 l1e_from_pfn(l2e_get_pfn(*pl2e) + i,
3838 l2e_get_flags(*pl2e) & ~_PAGE_PSE));
3839 l2e_write_atomic(pl2e, l2e_from_pfn(virt_to_mfn(pl1e),
3840 __PAGE_HYPERVISOR));
3843 else
3845 /* Ordinary 4kB mapping. */
3846 pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(v);
3847 l1e_write_atomic(pl1e, l1e_empty());
3848 v += PAGE_SIZE;
3850 /* If we are done with the L2E, check if it is now empty. */
3851 if ( (v != e) && (l1_table_offset(v) != 0) )
3852 continue;
3853 pl1e = l2e_to_l1e(*pl2e);
3854 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
3855 if ( l1e_get_intpte(pl1e[i]) != 0 )
3856 break;
3857 if ( i == L1_PAGETABLE_ENTRIES )
3859 /* Empty: zap the L2E and free the L1 page. */
3860 l2e_write_atomic(pl2e, l2e_empty());
3861 flush_all(FLUSH_TLB_GLOBAL); /* flush before free */
3862 free_xen_pagetable(pl1e);
3867 flush_all(FLUSH_TLB_GLOBAL);
3870 void __set_fixmap(
3871 enum fixed_addresses idx, unsigned long mfn, unsigned long flags)
3873 BUG_ON(idx >= __end_of_fixed_addresses);
3874 map_pages_to_xen(fix_to_virt(idx), mfn, 1, flags);
3877 #ifdef MEMORY_GUARD
3879 void memguard_init(void)
3881 map_pages_to_xen(
3882 (unsigned long)__va(xen_phys_start),
3883 xen_phys_start >> PAGE_SHIFT,
3884 (xenheap_phys_end - xen_phys_start) >> PAGE_SHIFT,
3885 __PAGE_HYPERVISOR|MAP_SMALL_PAGES);
3886 #ifdef __x86_64__
3887 map_pages_to_xen(
3888 XEN_VIRT_START,
3889 xen_phys_start >> PAGE_SHIFT,
3890 (__pa(&_end) + PAGE_SIZE - 1 - xen_phys_start) >> PAGE_SHIFT,
3891 __PAGE_HYPERVISOR|MAP_SMALL_PAGES);
3892 #endif
3895 static void __memguard_change_range(void *p, unsigned long l, int guard)
3897 unsigned long _p = (unsigned long)p;
3898 unsigned long _l = (unsigned long)l;
3899 unsigned int flags = __PAGE_HYPERVISOR | MAP_SMALL_PAGES;
3901 /* Ensure we are dealing with a page-aligned whole number of pages. */
3902 ASSERT((_p&~PAGE_MASK) == 0);
3903 ASSERT((_l&~PAGE_MASK) == 0);
3905 if ( guard )
3906 flags &= ~_PAGE_PRESENT;
3908 map_pages_to_xen(
3909 _p, virt_to_maddr(p) >> PAGE_SHIFT, _l >> PAGE_SHIFT, flags);
3912 void memguard_guard_range(void *p, unsigned long l)
3914 __memguard_change_range(p, l, 1);
3917 void memguard_unguard_range(void *p, unsigned long l)
3919 __memguard_change_range(p, l, 0);
3922 #endif
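/*
 * Reserve a guard page immediately below the primary stack portion of a
 * CPU stack allocation, so that an overrun is caught by the memguard
 * machinery above rather than silently corrupting adjacent data.
 */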
3924 void memguard_guard_stack(void *p)
3926 BUILD_BUG_ON((PRIMARY_STACK_SIZE + PAGE_SIZE) > STACK_SIZE);
3927 p = (void *)((unsigned long)p + STACK_SIZE -
3928 PRIMARY_STACK_SIZE - PAGE_SIZE);
3929 memguard_guard_range(p, PAGE_SIZE);
3932 /*
3933 * Local variables:
3934 * mode: C
3935 * c-set-style: "BSD"
3936 * c-basic-offset: 4
3937 * tab-width: 4
3938 * indent-tabs-mode: nil
3939 * End:
3940 */