ia64/xen-unstable: xen/arch/x86/mm.c @ 13915:a00b8d3800a8

[XEN] Snapshot PAE l3es when they are shadowed.
We don't update the shadows so we mustn't look at the guest l3es
or we'll be confused by them if they change.
Signed-off-by: Tim Deegan <Tim.Deegan@xensource.com>

author   Tim Deegan <Tim.Deegan@xensource.com>
date     Wed Feb 14 14:46:18 2007 +0000
parents  cbbd748c4b58
children 70f05d642a2e
1 /******************************************************************************
2 * arch/x86/mm.c
3 *
4 * Copyright (c) 2002-2005 K A Fraser
5 * Copyright (c) 2004 Christian Limpach
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
22 /*
23 * A description of the x86 page table API:
24 *
25 * Domains trap to do_mmu_update with a list of update requests.
26 * This is a list of (ptr, val) pairs, where the requested operation
27 * is *ptr = val.
28 *
29 * Reference counting of pages:
30 * ----------------------------
31 * Each page has two refcounts: tot_count and type_count.
32 *
33 * TOT_COUNT is the obvious reference count. It counts all uses of a
34 * physical page frame by a domain, including uses as a page directory,
35 * a page table, or simple mappings via a PTE. This count prevents a
36 * domain from releasing a frame back to the free pool when it still holds
37 * a reference to it.
38 *
39 * TYPE_COUNT is more subtle. A frame can be put to one of three
40 * mutually-exclusive uses: it might be used as a page directory, or a
41 * page table, or it may be mapped writable by the domain [of course, a
42 * frame may not be used in any of these three ways!].
43 * So, type_count is a count of the number of times a frame is being
44 * referred to in its current incarnation. Therefore, a page can only
45 * change its type when its type count is zero.
46 *
47 * Pinning the page type:
48 * ----------------------
49 * The type of a page can be pinned/unpinned with the commands
50 * MMUEXT_[UN]PIN_L?_TABLE. Each page can be pinned exactly once (that is,
51 * pinning is not reference counted, so it can't be nested).
52 * This is useful to prevent a page's type count falling to zero, at which
53 * point safety checks would need to be carried out next time the count
54 * is increased again.
55 *
56 * A further note on writable page mappings:
57 * -----------------------------------------
58 * For simplicity, the count of writable mappings for a page may not
59 * correspond to reality. The 'writable count' is incremented for every
60 * PTE which maps the page with the _PAGE_RW flag set. However, for
61 * write access to be possible the page directory entry must also have
62 * its _PAGE_RW bit set. We do not check this as it complicates the
63 * reference counting considerably [consider the case of multiple
64 * directory entries referencing a single page table, some with the RW
65 * bit set, others not -- it starts getting a bit messy].
66 * In normal use, this simplification shouldn't be a problem.
67 * However, the logic can be added if required.
68 *
69 * One more note on read-only page mappings:
70 * -----------------------------------------
71 * We want domains to be able to map pages for read-only access. The
72 * main reason is that page tables and directories should be readable
73 * by a domain, but it would not be safe for them to be writable.
74 * However, domains have free access to rings 1 & 2 of the Intel
75 * privilege model. In terms of page protection, these are considered
76 * to be part of 'supervisor mode'. The WP bit in CR0 controls whether
77 * read-only restrictions are respected in supervisor mode -- if the
78 * bit is clear then any mapped page is writable.
79 *
80 * We get round this by always setting the WP bit and disallowing
81 * updates to it. This is very unlikely to cause a problem for guest
82 * OS's, which will generally use the WP bit to simplify copy-on-write
83 * implementation (in that case, OS wants a fault when it writes to
84 * an application-supplied buffer).
85 */
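/*
 * Illustrative sketch of the (ptr, val) request format described above, as
 * a PV guest might issue it.  The guest-side wrapper and variable names
 * below are illustrative assumptions, not part of this file:
 *
 *     mmu_update_t req;
 *     unsigned int done = 0;
 *     req.ptr = pte_machine_addr | MMU_NORMAL_PT_UPDATE;  // PTE's machine address
 *     req.val = new_pte_value;                            // value to store
 *     HYPERVISOR_mmu_update(&req, 1, &done, DOMID_SELF);  // traps into do_mmu_update()
 *
 * Xen validates each request and maintains tot_count/type_count as
 * described above before committing the write.
 */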
87 #include <xen/config.h>
88 #include <xen/init.h>
89 #include <xen/kernel.h>
90 #include <xen/lib.h>
91 #include <xen/mm.h>
92 #include <xen/domain.h>
93 #include <xen/sched.h>
94 #include <xen/errno.h>
95 #include <xen/perfc.h>
96 #include <xen/irq.h>
97 #include <xen/softirq.h>
98 #include <xen/domain_page.h>
99 #include <xen/event.h>
100 #include <xen/iocap.h>
101 #include <xen/guest_access.h>
102 #include <asm/paging.h>
103 #include <asm/shadow.h>
104 #include <asm/page.h>
105 #include <asm/flushtlb.h>
106 #include <asm/io.h>
107 #include <asm/ldt.h>
108 #include <asm/x86_emulate.h>
109 #include <asm/e820.h>
110 #include <asm/hypercall.h>
111 #include <public/memory.h>
113 #define MEM_LOG(_f, _a...) gdprintk(XENLOG_WARNING , _f "\n" , ## _a)
115 /*
116 * PTE updates can be done with ordinary writes except:
117 * 1. Debug builds get extra checking by using CMPXCHG[8B].
118 * 2. PAE builds perform an atomic 8-byte store with CMPXCHG8B.
119 */
120 #if !defined(NDEBUG) || defined(CONFIG_X86_PAE)
121 #define PTE_UPDATE_WITH_CMPXCHG
122 #endif
124 /* Used to defer flushing of memory structures. */
125 struct percpu_mm_info {
126 #define DOP_FLUSH_TLB (1<<0) /* Flush the local TLB. */
127 #define DOP_FLUSH_ALL_TLBS (1<<1) /* Flush TLBs of all VCPUs of current dom. */
128 #define DOP_RELOAD_LDT (1<<2) /* Reload the LDT shadow mapping. */
129 unsigned int deferred_ops;
130 /* If non-NULL, specifies a foreign subject domain for some operations. */
131 struct domain *foreign;
132 };
133 static DEFINE_PER_CPU(struct percpu_mm_info, percpu_mm_info);
135 /*
136 * Returns the current foreign domain; defaults to the currently-executing
137 * domain if a foreign override hasn't been specified.
138 */
139 #define FOREIGNDOM (this_cpu(percpu_mm_info).foreign ?: current->domain)
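/*
 * Example: when a privileged domain issues a batched update on behalf of
 * another domain, set_foreigndom() (below) records that domain here for the
 * duration of the hypercall, so FOREIGNDOM evaluates to it; for ordinary
 * self-referential updates it simply yields current->domain.
 */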
141 /* Private domain structs for DOMID_XEN and DOMID_IO. */
142 static struct domain *dom_xen, *dom_io;
144 /* Frame table and its size in pages. */
145 struct page_info *frame_table;
146 unsigned long max_page;
147 unsigned long total_pages;
149 #ifdef CONFIG_COMPAT
150 l2_pgentry_t *compat_idle_pg_table_l2 = NULL;
151 #define l3_disallow_mask(d) (!IS_COMPAT(d) ? \
152 L3_DISALLOW_MASK : \
153 COMPAT_L3_DISALLOW_MASK)
154 #else
155 #define l3_disallow_mask(d) L3_DISALLOW_MASK
156 #endif
158 static void queue_deferred_ops(struct domain *d, unsigned int ops)
159 {
160 if ( d == current->domain )
161 this_cpu(percpu_mm_info).deferred_ops |= ops;
162 else
163 BUG_ON(!test_bit(_DOMF_paused, &d->domain_flags) ||
164 !cpus_empty(d->domain_dirty_cpumask));
165 }
167 void __init init_frametable(void)
168 {
169 unsigned long nr_pages, page_step, i, mfn;
171 frame_table = (struct page_info *)FRAMETABLE_VIRT_START;
173 nr_pages = PFN_UP(max_page * sizeof(*frame_table));
174 page_step = (1 << L2_PAGETABLE_SHIFT) >> PAGE_SHIFT;
176 for ( i = 0; i < nr_pages; i += page_step )
177 {
178 mfn = alloc_boot_pages(min(nr_pages - i, page_step), page_step);
179 if ( mfn == 0 )
180 panic("Not enough memory for frame table\n");
181 map_pages_to_xen(
182 FRAMETABLE_VIRT_START + (i << PAGE_SHIFT),
183 mfn, page_step, PAGE_HYPERVISOR);
184 }
186 memset(frame_table, 0, nr_pages << PAGE_SHIFT);
187 }
189 void arch_init_memory(void)
190 {
191 extern void subarch_init_memory(void);
193 unsigned long i, pfn, rstart_pfn, rend_pfn;
195 /*
196 * Initialise our DOMID_XEN domain.
197 * Any Xen-heap pages that we will allow to be mapped will have
198 * their domain field set to dom_xen.
199 */
200 dom_xen = alloc_domain(DOMID_XEN);
201 BUG_ON(dom_xen == NULL);
203 /*
204 * Initialise our DOMID_IO domain.
205 * This domain owns I/O pages that are within the range of the page_info
206 * array. Mappings occur at the privilege level of the caller.
207 */
208 dom_io = alloc_domain(DOMID_IO);
209 BUG_ON(dom_io == NULL);
211 /* First 1MB of RAM is historically marked as I/O. */
212 for ( i = 0; i < 0x100; i++ )
213 share_xen_page_with_guest(mfn_to_page(i), dom_io, XENSHARE_writable);
215 /* Any areas not specified as RAM by the e820 map are considered I/O. */
216 for ( i = 0, pfn = 0; i < e820.nr_map; i++ )
217 {
218 if ( e820.map[i].type != E820_RAM )
219 continue;
220 /* Every page from cursor to start of next RAM region is I/O. */
221 rstart_pfn = PFN_UP(e820.map[i].addr);
222 rend_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
223 for ( ; pfn < rstart_pfn; pfn++ )
224 {
225 BUG_ON(!mfn_valid(pfn));
226 share_xen_page_with_guest(
227 mfn_to_page(pfn), dom_io, XENSHARE_writable);
228 }
229 /* Skip the RAM region. */
230 pfn = rend_pfn;
231 }
232 BUG_ON(pfn != max_page);
234 subarch_init_memory();
235 }
237 int memory_is_conventional_ram(paddr_t p)
238 {
239 int i;
241 for ( i = 0; i < e820.nr_map; i++ )
242 {
243 if ( (e820.map[i].type == E820_RAM) &&
244 (e820.map[i].addr <= p) &&
245 (e820.map[i].addr + e820.map[i].size > p) )
246 return 1;
247 }
249 return 0;
250 }
252 void share_xen_page_with_guest(
253 struct page_info *page, struct domain *d, int readonly)
254 {
255 if ( page_get_owner(page) == d )
256 return;
258 set_gpfn_from_mfn(page_to_mfn(page), INVALID_M2P_ENTRY);
260 spin_lock(&d->page_alloc_lock);
262 /* The incremented type count pins as writable or read-only. */
263 page->u.inuse.type_info = (readonly ? PGT_none : PGT_writable_page);
264 page->u.inuse.type_info |= PGT_validated | 1;
266 page_set_owner(page, d);
267 wmb(); /* install valid domain ptr before updating refcnt. */
268 ASSERT(page->count_info == 0);
269 page->count_info |= PGC_allocated | 1;
271 if ( unlikely(d->xenheap_pages++ == 0) )
272 get_knownalive_domain(d);
273 list_add_tail(&page->list, &d->xenpage_list);
275 spin_unlock(&d->page_alloc_lock);
276 }
278 void share_xen_page_with_privileged_guests(
279 struct page_info *page, int readonly)
280 {
281 share_xen_page_with_guest(page, dom_xen, readonly);
282 }
284 #if defined(CONFIG_X86_PAE)
286 #ifdef NDEBUG
287 /* Only PDPTs above 4GB boundary need to be shadowed in low memory. */
288 #define l3tab_needs_shadow(mfn) ((mfn) >= 0x100000)
289 #else
290 /*
291 * In debug builds we shadow a selection of <4GB PDPTs to exercise code paths.
292 * We cannot safely shadow the idle page table, nor shadow (v1) page tables
293 * (detected by lack of an owning domain). As required for correctness, we
294 * always shadow PDPTs above 4GB.
295 */
296 #define l3tab_needs_shadow(mfn) \
297 (((((mfn) << PAGE_SHIFT) != __pa(idle_pg_table)) && \
298 (page_get_owner(mfn_to_page(mfn)) != NULL) && \
299 ((mfn) & 1)) || /* odd MFNs are shadowed */ \
300 ((mfn) >= 0x100000))
301 #endif
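/*
 * Worked example of the release-build test above (values are hypothetical):
 * a PDPT living in the frame at machine address 0x123456000 has
 * mfn 0x123456 >= 0x100000, i.e. it sits above the 4GB boundary, so
 * make_cr3() below must copy it into the per-vcpu low-memory cache before
 * it can be referenced from CR3.
 */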
303 static l1_pgentry_t *fix_pae_highmem_pl1e;
305 /* Cache the address of PAE high-memory fixmap page tables. */
306 static int __init cache_pae_fixmap_address(void)
307 {
308 unsigned long fixmap_base = fix_to_virt(FIX_PAE_HIGHMEM_0);
309 l2_pgentry_t *pl2e = virt_to_xen_l2e(fixmap_base);
310 fix_pae_highmem_pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(fixmap_base);
311 return 0;
312 }
313 __initcall(cache_pae_fixmap_address);
315 static DEFINE_PER_CPU(u32, make_cr3_timestamp);
317 void make_cr3(struct vcpu *v, unsigned long mfn)
318 /* Takes the MFN of a PAE l3 table, copies the contents to below 4GB if
319 * necessary, and sets v->arch.cr3 to the value to load in CR3. */
320 {
321 l3_pgentry_t *highmem_l3tab, *lowmem_l3tab;
322 struct pae_l3_cache *cache = &v->arch.pae_l3_cache;
323 unsigned int cpu = smp_processor_id();
325 /* Fast path: does this mfn need a shadow at all? */
326 if ( !l3tab_needs_shadow(mfn) )
327 {
328 v->arch.cr3 = mfn << PAGE_SHIFT;
329 /* Cache is no longer in use or valid */
330 cache->high_mfn = 0;
331 return;
332 }
334 /* Caching logic is not interrupt safe. */
335 ASSERT(!in_irq());
337 /* Protects against pae_flush_pgd(). */
338 spin_lock(&cache->lock);
340 cache->inuse_idx ^= 1;
341 cache->high_mfn = mfn;
343 /* Map the guest L3 table and copy to the chosen low-memory cache. */
344 l1e_write(fix_pae_highmem_pl1e-cpu, l1e_from_pfn(mfn, __PAGE_HYPERVISOR));
345 /* First check the previous high mapping can't be in the TLB.
346 * (i.e. have we loaded CR3 since we last did this?) */
347 if ( unlikely(this_cpu(make_cr3_timestamp) == this_cpu(tlbflush_time)) )
348 local_flush_tlb_one(fix_to_virt(FIX_PAE_HIGHMEM_0 + cpu));
349 highmem_l3tab = (l3_pgentry_t *)fix_to_virt(FIX_PAE_HIGHMEM_0 + cpu);
350 lowmem_l3tab = cache->table[cache->inuse_idx];
351 memcpy(lowmem_l3tab, highmem_l3tab, sizeof(cache->table[0]));
352 l1e_write(fix_pae_highmem_pl1e-cpu, l1e_empty());
353 this_cpu(make_cr3_timestamp) = this_cpu(tlbflush_time);
355 v->arch.cr3 = __pa(lowmem_l3tab);
357 spin_unlock(&cache->lock);
358 }
360 #else /* !CONFIG_X86_PAE */
362 void make_cr3(struct vcpu *v, unsigned long mfn)
363 {
364 v->arch.cr3 = mfn << PAGE_SHIFT;
365 }
367 #endif /* !CONFIG_X86_PAE */
369 void write_ptbase(struct vcpu *v)
370 {
371 write_cr3(v->arch.cr3);
372 }
374 /* Should be called after CR3 is updated.
375 * Updates vcpu->arch.cr3 and, for HVM guests, vcpu->arch.hvm_vcpu.cpu_cr3.
376 *
377 * Uses values found in vcpu->arch.(guest_table and guest_table_user), and
378 * for HVM guests, arch.monitor_table and hvm's guest CR3.
379 *
380 * Update ref counts to shadow tables appropriately.
381 */
382 void update_cr3(struct vcpu *v)
383 {
384 unsigned long cr3_mfn=0;
386 if ( paging_mode_enabled(v->domain) )
387 {
388 paging_update_cr3(v);
389 return;
390 }
392 #if CONFIG_PAGING_LEVELS == 4
393 if ( !(v->arch.flags & TF_kernel_mode) )
394 cr3_mfn = pagetable_get_pfn(v->arch.guest_table_user);
395 else
396 #endif
397 cr3_mfn = pagetable_get_pfn(v->arch.guest_table);
399 make_cr3(v, cr3_mfn);
400 }
403 void invalidate_shadow_ldt(struct vcpu *v)
404 {
405 int i;
406 unsigned long pfn;
407 struct page_info *page;
409 if ( v->arch.shadow_ldt_mapcnt == 0 )
410 return;
412 v->arch.shadow_ldt_mapcnt = 0;
414 for ( i = 16; i < 32; i++ )
415 {
416 pfn = l1e_get_pfn(v->arch.perdomain_ptes[i]);
417 if ( pfn == 0 ) continue;
418 l1e_write(&v->arch.perdomain_ptes[i], l1e_empty());
419 page = mfn_to_page(pfn);
420 ASSERT_PAGE_IS_TYPE(page, PGT_ldt_page);
421 ASSERT_PAGE_IS_DOMAIN(page, v->domain);
422 put_page_and_type(page);
423 }
425 /* Dispose of the (now possibly invalid) mappings from the TLB. */
426 queue_deferred_ops(v->domain, DOP_FLUSH_TLB | DOP_RELOAD_LDT);
427 }
430 static int alloc_segdesc_page(struct page_info *page)
431 {
432 struct desc_struct *descs;
433 int i;
435 descs = map_domain_page(page_to_mfn(page));
437 for ( i = 0; i < 512; i++ )
438 if ( unlikely(!check_descriptor(page_get_owner(page), &descs[i])) )
439 goto fail;
441 unmap_domain_page(descs);
442 return 1;
444 fail:
445 unmap_domain_page(descs);
446 return 0;
447 }
450 /* Map shadow page at offset @off. */
451 int map_ldt_shadow_page(unsigned int off)
452 {
453 struct vcpu *v = current;
454 struct domain *d = v->domain;
455 unsigned long gmfn, mfn;
456 l1_pgentry_t l1e, nl1e;
457 unsigned long gva = v->arch.guest_context.ldt_base + (off << PAGE_SHIFT);
458 int okay;
460 BUG_ON(unlikely(in_irq()));
462 guest_get_eff_kern_l1e(v, gva, &l1e);
463 if ( unlikely(!(l1e_get_flags(l1e) & _PAGE_PRESENT)) )
464 return 0;
466 gmfn = l1e_get_pfn(l1e);
467 mfn = gmfn_to_mfn(d, gmfn);
468 if ( unlikely(!mfn_valid(mfn)) )
469 return 0;
471 okay = get_page_and_type(mfn_to_page(mfn), d, PGT_ldt_page);
472 if ( unlikely(!okay) )
473 return 0;
475 nl1e = l1e_from_pfn(mfn, l1e_get_flags(l1e) | _PAGE_RW);
477 l1e_write(&v->arch.perdomain_ptes[off + 16], nl1e);
478 v->arch.shadow_ldt_mapcnt++;
480 return 1;
481 }
484 static int get_page_from_pagenr(unsigned long page_nr, struct domain *d)
485 {
486 struct page_info *page = mfn_to_page(page_nr);
488 if ( unlikely(!mfn_valid(page_nr)) || unlikely(!get_page(page, d)) )
489 {
490 MEM_LOG("Could not get page ref for pfn %lx", page_nr);
491 return 0;
492 }
494 return 1;
495 }
498 static int get_page_and_type_from_pagenr(unsigned long page_nr,
499 unsigned long type,
500 struct domain *d)
501 {
502 struct page_info *page = mfn_to_page(page_nr);
504 if ( unlikely(!get_page_from_pagenr(page_nr, d)) )
505 return 0;
507 if ( unlikely(!get_page_type(page, type)) )
508 {
509 put_page(page);
510 return 0;
511 }
513 return 1;
514 }
516 #ifndef CONFIG_X86_PAE /* We do not support guest linear mappings on PAE. */
517 /*
518 * We allow root tables to map each other (a.k.a. linear page tables). It
519 * needs some special care with reference counts and access permissions:
520 * 1. The mapping entry must be read-only, or the guest may get write access
521 * to its own PTEs.
522 * 2. We must only bump the reference counts for an *already validated*
523 * L2 table, or we can end up in a deadlock in get_page_type() by waiting
524 * on a validation that is required to complete that validation.
525 * 3. We only need to increment the reference counts for the mapped page
526 * frame if it is mapped by a different root table. This is sufficient and
527 * also necessary to allow validation of a root table mapping itself.
528 */
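/*
 * Concrete illustration of the rules above (hypothetical values): if a
 * guest installs, in slot s of the root table at mfn 0x1000, a read-only
 * entry whose target pfn is also 0x1000, the table maps itself and no
 * extra reference is taken (rule 3).  Had the entry pointed at a different,
 * already-validated root table, its type count would be bumped atomically
 * by the cmpxchg loop below (rule 2).
 */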
529 static int
530 get_linear_pagetable(
531 root_pgentry_t re, unsigned long re_pfn, struct domain *d)
532 {
533 unsigned long x, y;
534 struct page_info *page;
535 unsigned long pfn;
537 if ( (root_get_flags(re) & _PAGE_RW) )
538 {
539 MEM_LOG("Attempt to create linear p.t. with write perms");
540 return 0;
541 }
543 if ( (pfn = root_get_pfn(re)) != re_pfn )
544 {
545 /* Make sure the mapped frame belongs to the correct domain. */
546 if ( unlikely(!get_page_from_pagenr(pfn, d)) )
547 return 0;
549 /*
550 * Make sure that the mapped frame is an already-validated L2 table.
551 * If so, atomically increment the count (checking for overflow).
552 */
553 page = mfn_to_page(pfn);
554 y = page->u.inuse.type_info;
555 do {
556 x = y;
557 if ( unlikely((x & PGT_count_mask) == PGT_count_mask) ||
558 unlikely((x & (PGT_type_mask|PGT_validated)) !=
559 (PGT_root_page_table|PGT_validated)) )
560 {
561 put_page(page);
562 return 0;
563 }
564 }
565 while ( (y = cmpxchg(&page->u.inuse.type_info, x, x + 1)) != x );
566 }
568 return 1;
569 }
570 #endif /* !CONFIG_X86_PAE */
572 int
573 get_page_from_l1e(
574 l1_pgentry_t l1e, struct domain *d)
575 {
576 unsigned long mfn = l1e_get_pfn(l1e);
577 struct page_info *page = mfn_to_page(mfn);
578 int okay;
580 if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) )
581 return 1;
583 if ( unlikely(l1e_get_flags(l1e) & L1_DISALLOW_MASK) )
584 {
585 MEM_LOG("Bad L1 flags %x", l1e_get_flags(l1e) & L1_DISALLOW_MASK);
586 return 0;
587 }
589 if ( unlikely(!mfn_valid(mfn)) ||
590 unlikely(page_get_owner(page) == dom_io) )
591 {
592 /* DOMID_IO reverts to caller for privilege checks. */
593 if ( d == dom_io )
594 d = current->domain;
596 if ( !iomem_access_permitted(d, mfn, mfn) )
597 {
598 if ( mfn != (PADDR_MASK >> PAGE_SHIFT) ) /* INVALID_MFN? */
599 MEM_LOG("Non-privileged (%u) attempt to map I/O space %08lx",
600 d->domain_id, mfn);
601 return 0;
602 }
604 /* No reference counting for out-of-range I/O pages. */
605 if ( !mfn_valid(mfn) )
606 return 1;
608 d = dom_io;
609 }
611 /* Foreign mappings into guests in shadow external mode don't
612 * contribute to writeable mapping refcounts. (This allows the
613 * qemu-dm helper process in dom0 to map the domain's memory without
614 * messing up the count of "real" writable mappings.) */
615 okay = (((l1e_get_flags(l1e) & _PAGE_RW) &&
616 !(unlikely(paging_mode_external(d) && (d != current->domain))))
617 ? get_page_and_type(page, d, PGT_writable_page)
618 : get_page(page, d));
619 if ( !okay )
620 {
621 MEM_LOG("Error getting mfn %lx (pfn %lx) from L1 entry %" PRIpte
622 " for dom%d",
623 mfn, get_gpfn_from_mfn(mfn),
624 l1e_get_intpte(l1e), d->domain_id);
625 }
627 return okay;
628 }
631 /* NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. */
632 static int
633 get_page_from_l2e(
634 l2_pgentry_t l2e, unsigned long pfn, struct domain *d)
635 {
636 int rc;
638 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
639 return 1;
641 if ( unlikely((l2e_get_flags(l2e) & L2_DISALLOW_MASK)) )
642 {
643 MEM_LOG("Bad L2 flags %x", l2e_get_flags(l2e) & L2_DISALLOW_MASK);
644 return 0;
645 }
647 rc = get_page_and_type_from_pagenr(l2e_get_pfn(l2e), PGT_l1_page_table, d);
648 #if CONFIG_PAGING_LEVELS == 2
649 if ( unlikely(!rc) )
650 rc = get_linear_pagetable(l2e, pfn, d);
651 #endif
652 return rc;
653 }
656 #if CONFIG_PAGING_LEVELS >= 3
657 static int
658 get_page_from_l3e(
659 l3_pgentry_t l3e, unsigned long pfn, struct domain *d)
660 {
661 int rc;
663 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
664 return 1;
666 if ( unlikely((l3e_get_flags(l3e) & l3_disallow_mask(d))) )
667 {
668 MEM_LOG("Bad L3 flags %x", l3e_get_flags(l3e) & l3_disallow_mask(d));
669 return 0;
670 }
672 rc = get_page_and_type_from_pagenr(l3e_get_pfn(l3e), PGT_l2_page_table, d);
673 return rc;
674 }
675 #endif /* 3 level */
677 #if CONFIG_PAGING_LEVELS >= 4
678 static int
679 get_page_from_l4e(
680 l4_pgentry_t l4e, unsigned long pfn, struct domain *d)
681 {
682 int rc;
684 if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
685 return 1;
687 if ( unlikely((l4e_get_flags(l4e) & L4_DISALLOW_MASK)) )
688 {
689 MEM_LOG("Bad L4 flags %x", l4e_get_flags(l4e) & L4_DISALLOW_MASK);
690 return 0;
691 }
693 rc = get_page_and_type_from_pagenr(l4e_get_pfn(l4e), PGT_l3_page_table, d);
695 if ( unlikely(!rc) )
696 rc = get_linear_pagetable(l4e, pfn, d);
698 return rc;
699 }
700 #endif /* 4 level */
702 #ifdef __x86_64__
704 #ifdef USER_MAPPINGS_ARE_GLOBAL
705 #define adjust_guest_l1e(pl1e, d) \
706 do { \
707 if ( likely(l1e_get_flags((pl1e)) & _PAGE_PRESENT) && \
708 likely(!IS_COMPAT(d)) ) \
709 { \
710 /* _PAGE_GUEST_KERNEL page cannot have the Global bit set. */ \
711 if ( (l1e_get_flags((pl1e)) & (_PAGE_GUEST_KERNEL|_PAGE_GLOBAL)) \
712 == (_PAGE_GUEST_KERNEL|_PAGE_GLOBAL) ) \
713 MEM_LOG("Global bit is set for kernel page %lx", \
714 l1e_get_pfn((pl1e))); \
715 if ( !(l1e_get_flags((pl1e)) & _PAGE_USER) ) \
716 l1e_add_flags((pl1e), (_PAGE_GUEST_KERNEL|_PAGE_USER)); \
717 if ( !(l1e_get_flags((pl1e)) & _PAGE_GUEST_KERNEL) ) \
718 l1e_add_flags((pl1e), (_PAGE_GLOBAL|_PAGE_USER)); \
719 } \
720 } while ( 0 )
721 #else
722 #define adjust_guest_l1e(pl1e, d) \
723 do { \
724 if ( likely(l1e_get_flags((pl1e)) & _PAGE_PRESENT) && \
725 likely(!IS_COMPAT(d)) ) \
726 l1e_add_flags((pl1e), _PAGE_USER); \
727 } while ( 0 )
728 #endif
730 #define adjust_guest_l2e(pl2e, d) \
731 do { \
732 if ( likely(l2e_get_flags((pl2e)) & _PAGE_PRESENT) && \
733 likely(!IS_COMPAT(d)) ) \
734 l2e_add_flags((pl2e), _PAGE_USER); \
735 } while ( 0 )
737 #define adjust_guest_l3e(pl3e, d) \
738 do { \
739 if ( likely(l3e_get_flags((pl3e)) & _PAGE_PRESENT) ) \
740 l3e_add_flags((pl3e), likely(!IS_COMPAT(d)) ? \
741 _PAGE_USER : \
742 _PAGE_USER|_PAGE_RW); \
743 } while ( 0 )
745 #define adjust_guest_l4e(pl4e, d) \
746 do { \
747 if ( likely(l4e_get_flags((pl4e)) & _PAGE_PRESENT) && \
748 likely(!IS_COMPAT(d)) ) \
749 l4e_add_flags((pl4e), _PAGE_USER); \
750 } while ( 0 )
752 #else /* !defined(__x86_64__) */
754 #define adjust_guest_l1e(_p, _d) ((void)(_d))
755 #define adjust_guest_l2e(_p, _d) ((void)(_d))
756 #define adjust_guest_l3e(_p, _d) ((void)(_d))
758 #endif
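/*
 * Background for the adjust_guest_l*e macros above: 64-bit PV guest kernels
 * execute in ring 3, so Xen forces _PAGE_USER onto their mappings.  With
 * USER_MAPPINGS_ARE_GLOBAL, entries the guest marked as user are also made
 * global (they are identical in the kernel and user page tables, so their
 * TLB entries may survive the CR3 switch), while guest-kernel entries are
 * tagged _PAGE_GUEST_KERNEL and must not be global, as the check above
 * enforces.
 */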
760 #ifdef CONFIG_COMPAT
761 #define unadjust_guest_l3e(pl3e, d) \
762 do { \
763 if ( unlikely(IS_COMPAT(d)) && \
764 likely(l3e_get_flags((pl3e)) & _PAGE_PRESENT) ) \
765 l3e_remove_flags((pl3e), _PAGE_USER|_PAGE_RW|_PAGE_ACCESSED); \
766 } while ( 0 )
767 #else
768 #define unadjust_guest_l3e(_p, _d) ((void)(_d))
769 #endif
771 void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d)
772 {
773 unsigned long pfn = l1e_get_pfn(l1e);
774 struct page_info *page = mfn_to_page(pfn);
775 struct domain *e;
776 struct vcpu *v;
778 if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) || !mfn_valid(pfn) )
779 return;
781 e = page_get_owner(page);
783 /*
784 * Check if this is a mapping that was established via a grant reference.
785 * If it was then we should not be here: we require that such mappings are
786 * explicitly destroyed via the grant-table interface.
787 *
788 * The upshot of this is that the guest can end up with active grants that
789 * it cannot destroy (because it no longer has a PTE to present to the
790 * grant-table interface). This can lead to subtle hard-to-catch bugs,
791 * hence a special grant PTE flag can be enabled to catch the bug early.
792 *
793 * (Note that the undestroyable active grants are not a security hole in
794 * Xen. All active grants can safely be cleaned up when the domain dies.)
795 */
796 if ( (l1e_get_flags(l1e) & _PAGE_GNTTAB) &&
797 !(d->domain_flags & (DOMF_shutdown|DOMF_dying)) )
798 {
799 MEM_LOG("Attempt to implicitly unmap a granted PTE %" PRIpte,
800 l1e_get_intpte(l1e));
801 domain_crash(d);
802 }
804 /* Remember we didn't take a type-count of foreign writable mappings
805 * to paging-external domains */
806 if ( (l1e_get_flags(l1e) & _PAGE_RW) &&
807 !(unlikely((e != d) && paging_mode_external(e))) )
808 {
809 put_page_and_type(page);
810 }
811 else
812 {
813 /* We expect this is rare so we blow the entire shadow LDT. */
814 if ( unlikely(((page->u.inuse.type_info & PGT_type_mask) ==
815 PGT_ldt_page)) &&
816 unlikely(((page->u.inuse.type_info & PGT_count_mask) != 0)) &&
817 (d == e) )
818 {
819 for_each_vcpu ( d, v )
820 invalidate_shadow_ldt(v);
821 }
822 put_page(page);
823 }
824 }
827 /*
828 * NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'.
829 * Note also that this automatically deals correctly with linear p.t.'s.
830 */
831 static void put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
832 {
833 if ( (l2e_get_flags(l2e) & _PAGE_PRESENT) &&
834 (l2e_get_pfn(l2e) != pfn) )
835 put_page_and_type(l2e_get_page(l2e));
836 }
839 #if CONFIG_PAGING_LEVELS >= 3
840 static void put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn)
841 {
842 if ( (l3e_get_flags(l3e) & _PAGE_PRESENT) &&
843 (l3e_get_pfn(l3e) != pfn) )
844 put_page_and_type(l3e_get_page(l3e));
845 }
846 #endif
848 #if CONFIG_PAGING_LEVELS >= 4
849 static void put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn)
850 {
851 if ( (l4e_get_flags(l4e) & _PAGE_PRESENT) &&
852 (l4e_get_pfn(l4e) != pfn) )
853 put_page_and_type(l4e_get_page(l4e));
854 }
855 #endif
857 static int alloc_l1_table(struct page_info *page)
858 {
859 struct domain *d = page_get_owner(page);
860 unsigned long pfn = page_to_mfn(page);
861 l1_pgentry_t *pl1e;
862 int i;
864 pl1e = map_domain_page(pfn);
866 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
867 {
868 if ( is_guest_l1_slot(i) &&
869 unlikely(!get_page_from_l1e(pl1e[i], d)) )
870 goto fail;
872 adjust_guest_l1e(pl1e[i], d);
873 }
875 unmap_domain_page(pl1e);
876 return 1;
878 fail:
879 MEM_LOG("Failure in alloc_l1_table: entry %d", i);
880 while ( i-- > 0 )
881 if ( is_guest_l1_slot(i) )
882 put_page_from_l1e(pl1e[i], d);
884 unmap_domain_page(pl1e);
885 return 0;
886 }
888 #if defined(CONFIG_X86_PAE) || defined(CONFIG_COMPAT)
889 static int create_pae_xen_mappings(struct domain *d, l3_pgentry_t *pl3e)
890 {
891 struct page_info *page;
892 l2_pgentry_t *pl2e;
893 l3_pgentry_t l3e3;
894 #ifndef CONFIG_COMPAT
895 l2_pgentry_t l2e;
896 int i;
897 #else
899 if ( !IS_COMPAT(d) )
900 return 1;
901 #endif
903 pl3e = (l3_pgentry_t *)((unsigned long)pl3e & PAGE_MASK);
905 /* 3rd L3 slot contains L2 with Xen-private mappings. It *must* exist. */
906 l3e3 = pl3e[3];
907 if ( !(l3e_get_flags(l3e3) & _PAGE_PRESENT) )
908 {
909 MEM_LOG("PAE L3 3rd slot is empty");
910 return 0;
911 }
913 /*
914 * The Xen-private mappings include linear mappings. The L2 thus cannot
915 * be shared by multiple L3 tables. The test here is adequate because:
916 * 1. Cannot appear in slots != 3 because get_page_type() checks the
917 * PGT_pae_xen_l2 flag, which is asserted iff the L2 appears in slot 3
918 * 2. Cannot appear in another page table's L3:
919 * a. alloc_l3_table() calls this function and this check will fail
920 * b. mod_l3_entry() disallows updates to slot 3 in an existing table
921 */
922 page = l3e_get_page(l3e3);
923 BUG_ON(page->u.inuse.type_info & PGT_pinned);
924 BUG_ON((page->u.inuse.type_info & PGT_count_mask) == 0);
925 BUG_ON(!(page->u.inuse.type_info & PGT_pae_xen_l2));
926 if ( (page->u.inuse.type_info & PGT_count_mask) != 1 )
927 {
928 MEM_LOG("PAE L3 3rd slot is shared");
929 return 0;
930 }
932 /* Xen private mappings. */
933 pl2e = map_domain_page(l3e_get_pfn(l3e3));
934 #ifndef CONFIG_COMPAT
935 memcpy(&pl2e[L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1)],
936 &idle_pg_table_l2[L2_PAGETABLE_FIRST_XEN_SLOT],
937 L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
938 for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
939 {
940 l2e = l2e_from_page(
941 virt_to_page(page_get_owner(page)->arch.mm_perdomain_pt) + i,
942 __PAGE_HYPERVISOR);
943 l2e_write(&pl2e[l2_table_offset(PERDOMAIN_VIRT_START) + i], l2e);
944 }
945 for ( i = 0; i < (LINEARPT_MBYTES >> (L2_PAGETABLE_SHIFT - 20)); i++ )
946 {
947 l2e = l2e_empty();
948 if ( l3e_get_flags(pl3e[i]) & _PAGE_PRESENT )
949 l2e = l2e_from_pfn(l3e_get_pfn(pl3e[i]), __PAGE_HYPERVISOR);
950 l2e_write(&pl2e[l2_table_offset(LINEAR_PT_VIRT_START) + i], l2e);
951 }
952 #else
953 memcpy(&pl2e[COMPAT_L2_PAGETABLE_FIRST_XEN_SLOT(d)],
954 &compat_idle_pg_table_l2[
955 l2_table_offset(HIRO_COMPAT_MPT_VIRT_START)],
956 COMPAT_L2_PAGETABLE_XEN_SLOTS(d) * sizeof(*pl2e));
957 #endif
958 unmap_domain_page(pl2e);
960 return 1;
961 }
962 #else
963 # define create_pae_xen_mappings(d, pl3e) (1)
964 #endif
966 #ifdef CONFIG_X86_PAE
967 /* Flush a pgdir update into low-memory caches. */
968 static void pae_flush_pgd(
969 unsigned long mfn, unsigned int idx, l3_pgentry_t nl3e)
970 {
971 struct domain *d = page_get_owner(mfn_to_page(mfn));
972 struct vcpu *v;
973 intpte_t _ol3e, _nl3e, _pl3e;
974 l3_pgentry_t *l3tab_ptr;
975 struct pae_l3_cache *cache;
977 if ( unlikely(shadow_mode_enabled(d)) )
978 {
979 cpumask_t m = CPU_MASK_NONE;
980 /* Re-shadow this l3 table on any vcpus that are using it */
981 for_each_vcpu ( d, v )
982 if ( pagetable_get_pfn(v->arch.guest_table) == mfn )
983 {
984 paging_update_cr3(v);
985 cpus_or(m, m, v->vcpu_dirty_cpumask);
986 }
987 flush_tlb_mask(m);
988 }
990 /* If below 4GB then the pgdir is not shadowed in low memory. */
991 if ( !l3tab_needs_shadow(mfn) )
992 return;
994 for_each_vcpu ( d, v )
995 {
996 cache = &v->arch.pae_l3_cache;
998 spin_lock(&cache->lock);
1000 if ( cache->high_mfn == mfn )
1001 {
1002 l3tab_ptr = &cache->table[cache->inuse_idx][idx];
1003 _ol3e = l3e_get_intpte(*l3tab_ptr);
1004 _nl3e = l3e_get_intpte(nl3e);
1005 _pl3e = cmpxchg((intpte_t *)l3tab_ptr, _ol3e, _nl3e);
1006 BUG_ON(_pl3e != _ol3e);
1007 }
1009 spin_unlock(&cache->lock);
1010 }
1012 flush_tlb_mask(d->domain_dirty_cpumask);
1013 }
1014 #else
1015 # define pae_flush_pgd(mfn, idx, nl3e) ((void)0)
1016 #endif
1018 static int alloc_l2_table(struct page_info *page, unsigned long type)
1019 {
1020 struct domain *d = page_get_owner(page);
1021 unsigned long pfn = page_to_mfn(page);
1022 l2_pgentry_t *pl2e;
1023 int i;
1025 pl2e = map_domain_page(pfn);
1027 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
1028 {
1029 if ( is_guest_l2_slot(d, type, i) &&
1030 unlikely(!get_page_from_l2e(pl2e[i], pfn, d)) )
1031 goto fail;
1033 adjust_guest_l2e(pl2e[i], d);
1034 }
1036 #if CONFIG_PAGING_LEVELS == 2
1037 /* Xen private mappings. */
1038 memcpy(&pl2e[L2_PAGETABLE_FIRST_XEN_SLOT],
1039 &idle_pg_table[L2_PAGETABLE_FIRST_XEN_SLOT],
1040 L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
1041 pl2e[l2_table_offset(LINEAR_PT_VIRT_START)] =
1042 l2e_from_pfn(pfn, __PAGE_HYPERVISOR);
1043 for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
1044 pl2e[l2_table_offset(PERDOMAIN_VIRT_START) + i] =
1045 l2e_from_page(
1046 virt_to_page(page_get_owner(page)->arch.mm_perdomain_pt) + i,
1047 __PAGE_HYPERVISOR);
1048 #endif
1050 unmap_domain_page(pl2e);
1051 return 1;
1053 fail:
1054 MEM_LOG("Failure in alloc_l2_table: entry %d", i);
1055 while ( i-- > 0 )
1056 if ( is_guest_l2_slot(d, type, i) )
1057 put_page_from_l2e(pl2e[i], pfn);
1059 unmap_domain_page(pl2e);
1060 return 0;
1061 }
1064 #if CONFIG_PAGING_LEVELS >= 3
1065 static int alloc_l3_table(struct page_info *page)
1066 {
1067 struct domain *d = page_get_owner(page);
1068 unsigned long pfn = page_to_mfn(page);
1069 l3_pgentry_t *pl3e;
1070 int i;
1072 #ifdef CONFIG_X86_PAE
1073 /*
1074 * PAE pgdirs above 4GB are unacceptable if the guest does not understand
1075 * the weird 'extended cr3' format for dealing with high-order address
1076 * bits. We cut some slack for control tools (before vcpu0 is initialised).
1077 */
1078 if ( (pfn >= 0x100000) &&
1079 unlikely(!VM_ASSIST(d, VMASST_TYPE_pae_extended_cr3)) &&
1080 d->vcpu[0] && test_bit(_VCPUF_initialised, &d->vcpu[0]->vcpu_flags) )
1081 {
1082 MEM_LOG("PAE pgd must be below 4GB (0x%lx >= 0x100000)", pfn);
1083 return 0;
1084 }
1085 #endif
1087 pl3e = map_domain_page(pfn);
1089 /*
1090 * PAE guests allocate full pages, but aren't required to initialize
1091 * more than the first four entries; when running in compatibility
1092 * mode, however, the full page is visible to the MMU, and hence all
1093 * 512 entries must be valid/verified, which is most easily achieved
1094 * by clearing them out.
1095 */
1096 if ( IS_COMPAT(d) )
1097 memset(pl3e + 4, 0, (L3_PAGETABLE_ENTRIES - 4) * sizeof(*pl3e));
1099 for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
1100 {
1101 #if defined(CONFIG_X86_PAE) || defined(CONFIG_COMPAT)
1102 if ( (CONFIG_PAGING_LEVELS < 4 || IS_COMPAT(d)) && i == 3 )
1103 {
1104 if ( !(l3e_get_flags(pl3e[i]) & _PAGE_PRESENT) ||
1105 (l3e_get_flags(pl3e[i]) & l3_disallow_mask(d)) ||
1106 !get_page_and_type_from_pagenr(l3e_get_pfn(pl3e[i]),
1107 PGT_l2_page_table |
1108 PGT_pae_xen_l2,
1109 d) )
1110 goto fail;
1111 }
1112 else
1113 #endif
1114 if ( is_guest_l3_slot(i) &&
1115 unlikely(!get_page_from_l3e(pl3e[i], pfn, d)) )
1116 goto fail;
1118 adjust_guest_l3e(pl3e[i], d);
1119 }
1121 if ( !create_pae_xen_mappings(d, pl3e) )
1122 goto fail;
1124 unmap_domain_page(pl3e);
1125 return 1;
1127 fail:
1128 MEM_LOG("Failure in alloc_l3_table: entry %d", i);
1129 while ( i-- > 0 )
1130 if ( is_guest_l3_slot(i) )
1131 put_page_from_l3e(pl3e[i], pfn);
1133 unmap_domain_page(pl3e);
1134 return 0;
1135 }
1136 #else
1137 #define alloc_l3_table(page) (0)
1138 #endif
1140 #if CONFIG_PAGING_LEVELS >= 4
1141 static int alloc_l4_table(struct page_info *page)
1142 {
1143 struct domain *d = page_get_owner(page);
1144 unsigned long pfn = page_to_mfn(page);
1145 l4_pgentry_t *pl4e = page_to_virt(page);
1146 int i;
1148 for ( i = 0; i < L4_PAGETABLE_ENTRIES; i++ )
1149 {
1150 if ( is_guest_l4_slot(i) &&
1151 unlikely(!get_page_from_l4e(pl4e[i], pfn, d)) )
1152 goto fail;
1154 adjust_guest_l4e(pl4e[i], d);
1155 }
1157 /* Xen private mappings. */
1158 memcpy(&pl4e[ROOT_PAGETABLE_FIRST_XEN_SLOT],
1159 &idle_pg_table[ROOT_PAGETABLE_FIRST_XEN_SLOT],
1160 ROOT_PAGETABLE_XEN_SLOTS * sizeof(l4_pgentry_t));
1161 pl4e[l4_table_offset(LINEAR_PT_VIRT_START)] =
1162 l4e_from_pfn(pfn, __PAGE_HYPERVISOR);
1163 pl4e[l4_table_offset(PERDOMAIN_VIRT_START)] =
1164 l4e_from_page(virt_to_page(d->arch.mm_perdomain_l3),
1165 __PAGE_HYPERVISOR);
1166 if ( IS_COMPAT(d) )
1167 pl4e[l4_table_offset(COMPAT_ARG_XLAT_VIRT_BASE)] =
1168 l4e_from_page(virt_to_page(d->arch.mm_arg_xlat_l3),
1169 __PAGE_HYPERVISOR);
1171 return 1;
1173 fail:
1174 MEM_LOG("Failure in alloc_l4_table: entry %d", i);
1175 while ( i-- > 0 )
1176 if ( is_guest_l4_slot(i) )
1177 put_page_from_l4e(pl4e[i], pfn);
1179 return 0;
1180 }
1181 #else
1182 #define alloc_l4_table(page) (0)
1183 #endif
1186 static void free_l1_table(struct page_info *page)
1187 {
1188 struct domain *d = page_get_owner(page);
1189 unsigned long pfn = page_to_mfn(page);
1190 l1_pgentry_t *pl1e;
1191 int i;
1193 pl1e = map_domain_page(pfn);
1195 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
1196 if ( is_guest_l1_slot(i) )
1197 put_page_from_l1e(pl1e[i], d);
1199 unmap_domain_page(pl1e);
1200 }
1203 static void free_l2_table(struct page_info *page)
1204 {
1205 #ifdef CONFIG_COMPAT
1206 struct domain *d = page_get_owner(page);
1207 #endif
1208 unsigned long pfn = page_to_mfn(page);
1209 l2_pgentry_t *pl2e;
1210 int i;
1212 pl2e = map_domain_page(pfn);
1214 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
1215 if ( is_guest_l2_slot(d, page->u.inuse.type_info, i) )
1216 put_page_from_l2e(pl2e[i], pfn);
1218 unmap_domain_page(pl2e);
1220 page->u.inuse.type_info &= ~PGT_pae_xen_l2;
1221 }
1224 #if CONFIG_PAGING_LEVELS >= 3
1226 static void free_l3_table(struct page_info *page)
1227 {
1228 struct domain *d = page_get_owner(page);
1229 unsigned long pfn = page_to_mfn(page);
1230 l3_pgentry_t *pl3e;
1231 int i;
1233 pl3e = map_domain_page(pfn);
1235 for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
1236 if ( is_guest_l3_slot(i) )
1237 {
1238 put_page_from_l3e(pl3e[i], pfn);
1239 unadjust_guest_l3e(pl3e[i], d);
1240 }
1242 unmap_domain_page(pl3e);
1243 }
1245 #endif
1247 #if CONFIG_PAGING_LEVELS >= 4
1249 static void free_l4_table(struct page_info *page)
1250 {
1251 unsigned long pfn = page_to_mfn(page);
1252 l4_pgentry_t *pl4e = page_to_virt(page);
1253 int i;
1255 for ( i = 0; i < L4_PAGETABLE_ENTRIES; i++ )
1256 if ( is_guest_l4_slot(i) )
1257 put_page_from_l4e(pl4e[i], pfn);
1258 }
1260 #endif
1263 /* How to write an entry to the guest pagetables.
1264 * Returns 0 for failure (pointer not valid), 1 for success. */
1265 static inline int update_intpte(intpte_t *p,
1266 intpte_t old,
1267 intpte_t new,
1268 unsigned long mfn,
1269 struct vcpu *v)
1270 {
1271 int rv = 1;
1272 #ifndef PTE_UPDATE_WITH_CMPXCHG
1273 rv = paging_write_guest_entry(v, p, new, _mfn(mfn));
1274 #else
1275 {
1276 intpte_t t = old;
1277 for ( ; ; )
1278 {
1279 rv = paging_cmpxchg_guest_entry(v, p, &t, new, _mfn(mfn));
1280 if ( unlikely(rv == 0) )
1281 {
1282 MEM_LOG("Failed to update %" PRIpte " -> %" PRIpte
1283 ": saw %" PRIpte, old, new, t);
1284 break;
1285 }
1287 if ( t == old )
1288 break;
1290 /* Allowed to change in Accessed/Dirty flags only. */
1291 BUG_ON((t ^ old) & ~(intpte_t)(_PAGE_ACCESSED|_PAGE_DIRTY));
1293 old = t;
1294 }
1295 }
1296 #endif
1297 return rv;
1298 }
1300 /* Macro that wraps the appropriate type-changes around update_intpte().
1301 * Arguments are: type, ptr, old, new, mfn, vcpu */
1302 #define UPDATE_ENTRY(_t,_p,_o,_n,_m,_v) \
1303 update_intpte((intpte_t *)(_p), \
1304 _t ## e_get_intpte(_o), _t ## e_get_intpte(_n), \
1305 (_m), (_v))
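/*
 * For instance, UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, current) expands
 * to update_intpte((intpte_t *)(pl1e), l1e_get_intpte(ol1e),
 * l1e_get_intpte(nl1e), (gl1mfn), (current)); this is how mod_l1_entry()
 * below commits a validated new entry.
 */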
1307 /* Update the L1 entry at pl1e to new value nl1e. */
1308 static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e,
1309 unsigned long gl1mfn)
1311 l1_pgentry_t ol1e;
1312 struct domain *d = current->domain;
1314 if ( unlikely(__copy_from_user(&ol1e, pl1e, sizeof(ol1e)) != 0) )
1315 return 0;
1317 if ( unlikely(paging_mode_refcounts(d)) )
1318 return UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, current);
1320 if ( l1e_get_flags(nl1e) & _PAGE_PRESENT )
1322 /* Translate foreign guest addresses. */
1323 nl1e = l1e_from_pfn(gmfn_to_mfn(FOREIGNDOM, l1e_get_pfn(nl1e)),
1324 l1e_get_flags(nl1e));
1326 if ( unlikely(l1e_get_flags(nl1e) & L1_DISALLOW_MASK) )
1328 MEM_LOG("Bad L1 flags %x",
1329 l1e_get_flags(nl1e) & L1_DISALLOW_MASK);
1330 return 0;
1333 adjust_guest_l1e(nl1e, d);
1335 /* Fast path for identical mapping, r/w and presence. */
1336 if ( !l1e_has_changed(ol1e, nl1e, _PAGE_RW | _PAGE_PRESENT) )
1337 return UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, current);
1339 if ( unlikely(!get_page_from_l1e(nl1e, FOREIGNDOM)) )
1340 return 0;
1342 if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, current)) )
1344 put_page_from_l1e(nl1e, d);
1345 return 0;
1348 else
1350 if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, current)) )
1351 return 0;
1354 put_page_from_l1e(ol1e, d);
1355 return 1;
1359 /* Update the L2 entry at pl2e to new value nl2e. pl2e is within frame pfn. */
1360 static int mod_l2_entry(l2_pgentry_t *pl2e,
1361 l2_pgentry_t nl2e,
1362 unsigned long pfn,
1363 unsigned long type)
1365 l2_pgentry_t ol2e;
1366 struct domain *d = current->domain;
1368 if ( unlikely(!is_guest_l2_slot(d, type, pgentry_ptr_to_slot(pl2e))) )
1370 MEM_LOG("Illegal L2 update attempt in Xen-private area %p", pl2e);
1371 return 0;
1374 if ( unlikely(__copy_from_user(&ol2e, pl2e, sizeof(ol2e)) != 0) )
1375 return 0;
1377 if ( l2e_get_flags(nl2e) & _PAGE_PRESENT )
1379 if ( unlikely(l2e_get_flags(nl2e) & L2_DISALLOW_MASK) )
1381 MEM_LOG("Bad L2 flags %x",
1382 l2e_get_flags(nl2e) & L2_DISALLOW_MASK);
1383 return 0;
1386 adjust_guest_l2e(nl2e, d);
1388 /* Fast path for identical mapping and presence. */
1389 if ( !l2e_has_changed(ol2e, nl2e, _PAGE_PRESENT))
1390 return UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, current);
1392 if ( unlikely(!get_page_from_l2e(nl2e, pfn, d)) )
1393 return 0;
1395 if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, current)) )
1397 put_page_from_l2e(nl2e, pfn);
1398 return 0;
1401 else if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, current)) )
1403 return 0;
1406 put_page_from_l2e(ol2e, pfn);
1407 return 1;
1410 #if CONFIG_PAGING_LEVELS >= 3
1412 /* Update the L3 entry at pl3e to new value nl3e. pl3e is within frame pfn. */
1413 static int mod_l3_entry(l3_pgentry_t *pl3e,
1414 l3_pgentry_t nl3e,
1415 unsigned long pfn)
1417 l3_pgentry_t ol3e;
1418 struct domain *d = current->domain;
1419 int okay;
1421 if ( unlikely(!is_guest_l3_slot(pgentry_ptr_to_slot(pl3e))) )
1423 MEM_LOG("Illegal L3 update attempt in Xen-private area %p", pl3e);
1424 return 0;
1427 #if defined(CONFIG_X86_PAE) || defined(CONFIG_COMPAT)
1428 /*
1429 * Disallow updates to final L3 slot. It contains Xen mappings, and it
1430 * would be a pain to ensure they remain continuously valid throughout.
1431 */
1432 if ( (CONFIG_PAGING_LEVELS < 4 || IS_COMPAT(d)) &&
1433 pgentry_ptr_to_slot(pl3e) >= 3 )
1434 return 0;
1435 #endif
1437 if ( unlikely(__copy_from_user(&ol3e, pl3e, sizeof(ol3e)) != 0) )
1438 return 0;
1440 if ( l3e_get_flags(nl3e) & _PAGE_PRESENT )
1442 if ( unlikely(l3e_get_flags(nl3e) & l3_disallow_mask(d)) )
1444 MEM_LOG("Bad L3 flags %x",
1445 l3e_get_flags(nl3e) & l3_disallow_mask(d));
1446 return 0;
1449 adjust_guest_l3e(nl3e, d);
1451 /* Fast path for identical mapping and presence. */
1452 if (!l3e_has_changed(ol3e, nl3e, _PAGE_PRESENT))
1453 return UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, current);
1455 if ( unlikely(!get_page_from_l3e(nl3e, pfn, d)) )
1456 return 0;
1458 if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, current)) )
1460 put_page_from_l3e(nl3e, pfn);
1461 return 0;
1464 else if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, current)) )
1466 return 0;
1469 okay = create_pae_xen_mappings(d, pl3e);
1470 BUG_ON(!okay);
1472 pae_flush_pgd(pfn, pgentry_ptr_to_slot(pl3e), nl3e);
1474 put_page_from_l3e(ol3e, pfn);
1475 return 1;
1478 #endif
1480 #if CONFIG_PAGING_LEVELS >= 4
1482 /* Update the L4 entry at pl4e to new value nl4e. pl4e is within frame pfn. */
1483 static int mod_l4_entry(l4_pgentry_t *pl4e,
1484 l4_pgentry_t nl4e,
1485 unsigned long pfn)
1487 l4_pgentry_t ol4e;
1489 if ( unlikely(!is_guest_l4_slot(pgentry_ptr_to_slot(pl4e))) )
1491 MEM_LOG("Illegal L4 update attempt in Xen-private area %p", pl4e);
1492 return 0;
1495 if ( unlikely(__copy_from_user(&ol4e, pl4e, sizeof(ol4e)) != 0) )
1496 return 0;
1498 if ( l4e_get_flags(nl4e) & _PAGE_PRESENT )
1500 if ( unlikely(l4e_get_flags(nl4e) & L4_DISALLOW_MASK) )
1502 MEM_LOG("Bad L4 flags %x",
1503 l4e_get_flags(nl4e) & L4_DISALLOW_MASK);
1504 return 0;
1507 adjust_guest_l4e(nl4e, current->domain);
1509 /* Fast path for identical mapping and presence. */
1510 if (!l4e_has_changed(ol4e, nl4e, _PAGE_PRESENT))
1511 return UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, current);
1513 if ( unlikely(!get_page_from_l4e(nl4e, pfn, current->domain)) )
1514 return 0;
1516 if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, current)) )
1518 put_page_from_l4e(nl4e, pfn);
1519 return 0;
1522 else if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, current)) )
1524 return 0;
1527 put_page_from_l4e(ol4e, pfn);
1528 return 1;
1531 #endif
1533 int alloc_page_type(struct page_info *page, unsigned long type)
1535 struct domain *owner = page_get_owner(page);
1537 /* A page table is dirtied when its type count becomes non-zero. */
1538 if ( likely(owner != NULL) )
1539 mark_dirty(owner, page_to_mfn(page));
1541 switch ( type & PGT_type_mask )
1543 case PGT_l1_page_table:
1544 return alloc_l1_table(page);
1545 case PGT_l2_page_table:
1546 return alloc_l2_table(page, type);
1547 case PGT_l3_page_table:
1548 return alloc_l3_table(page);
1549 case PGT_l4_page_table:
1550 return alloc_l4_table(page);
1551 case PGT_gdt_page:
1552 case PGT_ldt_page:
1553 return alloc_segdesc_page(page);
1554 default:
1555 printk("Bad type in alloc_page_type %lx t=%" PRtype_info " c=%x\n",
1556 type, page->u.inuse.type_info,
1557 page->count_info);
1558 BUG();
1561 return 0;
1565 void free_page_type(struct page_info *page, unsigned long type)
1567 struct domain *owner = page_get_owner(page);
1568 unsigned long gmfn;
1570 if ( likely(owner != NULL) )
1572 /*
1573 * We have to flush before the next use of the linear mapping
1574 * (e.g., update_va_mapping()) or we could end up modifying a page
1575 * that is no longer a page table (and hence screw up ref counts).
1576 */
1577 queue_deferred_ops(owner, DOP_FLUSH_ALL_TLBS);
1579 if ( unlikely(paging_mode_enabled(owner)) )
1581 /* A page table is dirtied when its type count becomes zero. */
1582 mark_dirty(owner, page_to_mfn(page));
1584 if ( shadow_mode_refcounts(owner) )
1585 return;
1587 gmfn = mfn_to_gmfn(owner, page_to_mfn(page));
1588 ASSERT(VALID_M2P(gmfn));
1589 shadow_remove_all_shadows(owner->vcpu[0], _mfn(gmfn));
1593 switch ( type & PGT_type_mask )
1595 case PGT_l1_page_table:
1596 free_l1_table(page);
1597 break;
1599 case PGT_l2_page_table:
1600 free_l2_table(page);
1601 break;
1603 #if CONFIG_PAGING_LEVELS >= 3
1604 case PGT_l3_page_table:
1605 free_l3_table(page);
1606 break;
1607 #endif
1609 #if CONFIG_PAGING_LEVELS >= 4
1610 case PGT_l4_page_table:
1611 free_l4_table(page);
1612 break;
1613 #endif
1615 default:
1616 printk("%s: type %lx pfn %lx\n",__FUNCTION__,
1617 type, page_to_mfn(page));
1618 BUG();
1623 void put_page_type(struct page_info *page)
1625 unsigned long nx, x, y = page->u.inuse.type_info;
1627 again:
1628 do {
1629 x = y;
1630 nx = x - 1;
1632 ASSERT((x & PGT_count_mask) != 0);
1634 if ( unlikely((nx & PGT_count_mask) == 0) )
1636 if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
1637 likely(nx & PGT_validated) )
1639 /*
1640 * Page-table pages must be unvalidated when count is zero. The
1641 * 'free' is safe because the refcnt is non-zero and validated
1642 * bit is clear => other ops will spin or fail.
1643 */
1644 if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x,
1645 x & ~PGT_validated)) != x) )
1646 goto again;
1647 /* We cleared the 'valid bit' so we do the clean up. */
1648 free_page_type(page, x);
1649 /* Carry on, but with the 'valid bit' now clear. */
1650 x &= ~PGT_validated;
1651 nx &= ~PGT_validated;
1654 /*
1655 * Record TLB information for flush later. We do not stamp page
1656 * tables when running in shadow mode:
1657 * 1. Pointless, since it's the shadow pt's which must be tracked.
1658 * 2. Shadow mode reuses this field for shadowed page tables to
1659 * store flags info -- we don't want to conflict with that.
1660 */
1661 if ( !(shadow_mode_enabled(page_get_owner(page)) &&
1662 (page->count_info & PGC_page_table)) )
1663 page->tlbflush_timestamp = tlbflush_current_time();
1666 while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
1670 int get_page_type(struct page_info *page, unsigned long type)
1672 unsigned long nx, x, y = page->u.inuse.type_info;
1674 ASSERT(!(type & ~(PGT_type_mask | PGT_pae_xen_l2)));
1676 again:
1677 do {
1678 x = y;
1679 nx = x + 1;
1680 if ( unlikely((nx & PGT_count_mask) == 0) )
1682 MEM_LOG("Type count overflow on pfn %lx", page_to_mfn(page));
1683 return 0;
1685 else if ( unlikely((x & PGT_count_mask) == 0) )
1687 struct domain *d = page_get_owner(page);
1689 /* Never allow a shadowed frame to go from type count 0 to 1 */
1690 if ( d && shadow_mode_enabled(d) )
1691 shadow_remove_all_shadows(d->vcpu[0], _mfn(page_to_mfn(page)));
1693 ASSERT(!(x & PGT_pae_xen_l2));
1694 if ( (x & PGT_type_mask) != type )
1696 /*
1697 * On type change we check to flush stale TLB entries. This
1698 * may be unnecessary (e.g., page was GDT/LDT) but those
1699 * circumstances should be very rare.
1700 */
1701 cpumask_t mask = d->domain_dirty_cpumask;
1703 /* Don't flush if the timestamp is old enough */
1704 tlbflush_filter(mask, page->tlbflush_timestamp);
1706 if ( unlikely(!cpus_empty(mask)) &&
1707 /* Shadow mode: track only writable pages. */
1708 (!shadow_mode_enabled(page_get_owner(page)) ||
1709 ((nx & PGT_type_mask) == PGT_writable_page)) )
1711 perfc_incrc(need_flush_tlb_flush);
1712 flush_tlb_mask(mask);
1715 /* We lose existing type, back pointer, and validity. */
1716 nx &= ~(PGT_type_mask | PGT_validated);
1717 nx |= type;
1719 /* No special validation needed for writable pages. */
1720 /* Page tables and GDT/LDT need to be scanned for validity. */
1721 if ( type == PGT_writable_page )
1722 nx |= PGT_validated;
1725 else if ( unlikely((x & (PGT_type_mask|PGT_pae_xen_l2)) != type) )
1727 if ( ((x & PGT_type_mask) != PGT_l2_page_table) ||
1728 (type != PGT_l1_page_table) )
1729 MEM_LOG("Bad type (saw %" PRtype_info
1730 " != exp %" PRtype_info ") "
1731 "for mfn %lx (pfn %lx)",
1732 x, type, page_to_mfn(page),
1733 get_gpfn_from_mfn(page_to_mfn(page)));
1734 return 0;
1736 else if ( unlikely(!(x & PGT_validated)) )
1738 /* Someone else is updating validation of this page. Wait... */
1739 while ( (y = page->u.inuse.type_info) == x )
1740 cpu_relax();
1741 goto again;
1744 while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
1746 if ( unlikely(!(nx & PGT_validated)) )
1748 /* Try to validate page type; drop the new reference on failure. */
1749 if ( unlikely(!alloc_page_type(page, type)) )
1751 MEM_LOG("Error while validating mfn %lx (pfn %lx) for type %"
1752 PRtype_info ": caf=%08x taf=%" PRtype_info,
1753 page_to_mfn(page), get_gpfn_from_mfn(page_to_mfn(page)),
1754 type, page->count_info, page->u.inuse.type_info);
1755 /* No one else can get a reference. We hold the only ref. */
1756 page->u.inuse.type_info = 0;
1757 return 0;
1760 /* No one else is updating simultaneously. */
1761 __set_bit(_PGT_validated, &page->u.inuse.type_info);
1764 return 1;
1768 int new_guest_cr3(unsigned long mfn)
1770 struct vcpu *v = current;
1771 struct domain *d = v->domain;
1772 int okay;
1773 unsigned long old_base_mfn;
1775 #ifdef CONFIG_COMPAT
1776 if ( IS_COMPAT(d) )
1778 okay = paging_mode_refcounts(d)
1779 ? 0 /* Old code was broken, but what should it be? */
1780 : mod_l4_entry(__va(pagetable_get_paddr(v->arch.guest_table)),
1781 l4e_from_pfn(mfn, (_PAGE_PRESENT|_PAGE_RW|
1782 _PAGE_USER|_PAGE_ACCESSED)), 0);
1783 if ( unlikely(!okay) )
1785 MEM_LOG("Error while installing new compat baseptr %lx", mfn);
1786 return 0;
1789 invalidate_shadow_ldt(v);
1790 write_ptbase(v);
1792 return 1;
1794 #endif
1795 okay = paging_mode_refcounts(d)
1796 ? get_page_from_pagenr(mfn, d)
1797 : get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d);
1798 if ( unlikely(!okay) )
1800 MEM_LOG("Error while installing new baseptr %lx", mfn);
1801 return 0;
1804 invalidate_shadow_ldt(v);
1806 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
1808 v->arch.guest_table = pagetable_from_pfn(mfn);
1809 update_cr3(v);
1811 write_ptbase(v);
1813 if ( likely(old_base_mfn != 0) )
1815 if ( paging_mode_refcounts(d) )
1816 put_page(mfn_to_page(old_base_mfn));
1817 else
1818 put_page_and_type(mfn_to_page(old_base_mfn));
1821 return 1;
1824 static void process_deferred_ops(void)
1826 unsigned int deferred_ops;
1827 struct domain *d = current->domain;
1828 struct percpu_mm_info *info = &this_cpu(percpu_mm_info);
1830 deferred_ops = info->deferred_ops;
1831 info->deferred_ops = 0;
1833 if ( deferred_ops & (DOP_FLUSH_ALL_TLBS|DOP_FLUSH_TLB) )
1835 if ( deferred_ops & DOP_FLUSH_ALL_TLBS )
1836 flush_tlb_mask(d->domain_dirty_cpumask);
1837 else
1838 local_flush_tlb();
1841 if ( deferred_ops & DOP_RELOAD_LDT )
1842 (void)map_ldt_shadow_page(0);
1844 if ( unlikely(info->foreign != NULL) )
1846 put_domain(info->foreign);
1847 info->foreign = NULL;
1851 static int set_foreigndom(domid_t domid)
1853 struct domain *e, *d = current->domain;
1854 struct percpu_mm_info *info = &this_cpu(percpu_mm_info);
1855 int okay = 1;
1857 ASSERT(info->foreign == NULL);
1859 if ( likely(domid == DOMID_SELF) )
1860 goto out;
1862 if ( unlikely(domid == d->domain_id) )
1864 MEM_LOG("Dom %u tried to specify itself as foreign domain",
1865 d->domain_id);
1866 okay = 0;
1868 else if ( unlikely(paging_mode_translate(d)) )
1870 MEM_LOG("Cannot mix foreign mappings with translated domains");
1871 okay = 0;
1873 else if ( !IS_PRIV(d) )
1875 switch ( domid )
1877 case DOMID_IO:
1878 get_knownalive_domain(dom_io);
1879 info->foreign = dom_io;
1880 break;
1881 default:
1882 MEM_LOG("Dom %u cannot set foreign dom", d->domain_id);
1883 okay = 0;
1884 break;
1887 else
1889 info->foreign = e = get_domain_by_id(domid);
1890 if ( e == NULL )
1892 switch ( domid )
1894 case DOMID_XEN:
1895 get_knownalive_domain(dom_xen);
1896 info->foreign = dom_xen;
1897 break;
1898 case DOMID_IO:
1899 get_knownalive_domain(dom_io);
1900 info->foreign = dom_io;
1901 break;
1902 default:
1903 MEM_LOG("Unknown domain '%u'", domid);
1904 okay = 0;
1905 break;
1910 out:
1911 return okay;
1914 static inline cpumask_t vcpumask_to_pcpumask(
1915 struct domain *d, unsigned long vmask)
1917 unsigned int vcpu_id;
1918 cpumask_t pmask = CPU_MASK_NONE;
1919 struct vcpu *v;
1921 while ( vmask != 0 )
1923 vcpu_id = find_first_set_bit(vmask);
1924 vmask &= ~(1UL << vcpu_id);
1925 if ( (vcpu_id < MAX_VIRT_CPUS) &&
1926 ((v = d->vcpu[vcpu_id]) != NULL) )
1927 cpus_or(pmask, pmask, v->vcpu_dirty_cpumask);
1930 return pmask;
1933 int do_mmuext_op(
1934 XEN_GUEST_HANDLE(mmuext_op_t) uops,
1935 unsigned int count,
1936 XEN_GUEST_HANDLE(uint) pdone,
1937 unsigned int foreigndom)
1939 struct mmuext_op op;
1940 int rc = 0, i = 0, okay;
1941 unsigned long mfn = 0, gmfn = 0, type;
1942 unsigned int done = 0;
1943 struct page_info *page;
1944 struct vcpu *v = current;
1945 struct domain *d = v->domain;
1947 LOCK_BIGLOCK(d);
1949 if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
1951 count &= ~MMU_UPDATE_PREEMPTED;
1952 if ( unlikely(!guest_handle_is_null(pdone)) )
1953 (void)copy_from_guest(&done, pdone, 1);
1956 if ( !set_foreigndom(foreigndom) )
1958 rc = -ESRCH;
1959 goto out;
1962 if ( unlikely(!guest_handle_okay(uops, count)) )
1964 rc = -EFAULT;
1965 goto out;
1968 for ( i = 0; i < count; i++ )
1970 if ( hypercall_preempt_check() )
1972 rc = hypercall_create_continuation(
1973 __HYPERVISOR_mmuext_op, "hihi",
1974 uops, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
1975 break;
1978 if ( unlikely(__copy_from_guest(&op, uops, 1) != 0) )
1980 MEM_LOG("Bad __copy_from_guest");
1981 rc = -EFAULT;
1982 break;
1985 okay = 1;
1986 gmfn = op.arg1.mfn;
1987 mfn = gmfn_to_mfn(FOREIGNDOM, gmfn);
1988 page = mfn_to_page(mfn);
1990 switch ( op.cmd )
1992 case MMUEXT_PIN_L1_TABLE:
1993 type = PGT_l1_page_table;
1994 goto pin_page;
1996 case MMUEXT_PIN_L2_TABLE:
1997 type = PGT_l2_page_table;
1998 goto pin_page;
2000 case MMUEXT_PIN_L3_TABLE:
2001 type = PGT_l3_page_table;
2002 goto pin_page;
2004 case MMUEXT_PIN_L4_TABLE:
2005 if ( IS_COMPAT(FOREIGNDOM) )
2006 break;
2007 type = PGT_l4_page_table;
2009 pin_page:
2010 /* Ignore pinning of invalid paging levels. */
2011 if ( (op.cmd - MMUEXT_PIN_L1_TABLE) > (CONFIG_PAGING_LEVELS - 1) )
2012 break;
2014 if ( paging_mode_refcounts(FOREIGNDOM) )
2015 break;
2017 okay = get_page_and_type_from_pagenr(mfn, type, FOREIGNDOM);
2018 if ( unlikely(!okay) )
2020 MEM_LOG("Error while pinning mfn %lx", mfn);
2021 break;
2024 if ( unlikely(test_and_set_bit(_PGT_pinned,
2025 &page->u.inuse.type_info)) )
2027 MEM_LOG("Mfn %lx already pinned", mfn);
2028 put_page_and_type(page);
2029 okay = 0;
2030 break;
2033 /* A page is dirtied when its pin status is set. */
2034 mark_dirty(d, mfn);
2036 break;
2038 case MMUEXT_UNPIN_TABLE:
2039 if ( paging_mode_refcounts(d) )
2040 break;
2042 if ( unlikely(!(okay = get_page_from_pagenr(mfn, d))) )
2044 MEM_LOG("Mfn %lx bad domain (dom=%p)",
2045 mfn, page_get_owner(page));
2047 else if ( likely(test_and_clear_bit(_PGT_pinned,
2048 &page->u.inuse.type_info)) )
2050 put_page_and_type(page);
2051 put_page(page);
2052 /* A page is dirtied when its pin status is cleared. */
2053 mark_dirty(d, mfn);
2055 else
2057 okay = 0;
2058 put_page(page);
2059 MEM_LOG("Mfn %lx not pinned", mfn);
2061 break;
2063 case MMUEXT_NEW_BASEPTR:
2064 okay = new_guest_cr3(mfn);
2065 this_cpu(percpu_mm_info).deferred_ops &= ~DOP_FLUSH_TLB;
2066 break;
2068 #ifdef __x86_64__
2069 case MMUEXT_NEW_USER_BASEPTR:
2070 if ( IS_COMPAT(FOREIGNDOM) )
2072 okay = 0;
2073 break;
2075 if (likely(mfn != 0))
2077 if ( paging_mode_refcounts(d) )
2078 okay = get_page_from_pagenr(mfn, d);
2079 else
2080 okay = get_page_and_type_from_pagenr(
2081 mfn, PGT_root_page_table, d);
2083 if ( unlikely(!okay) )
2085 MEM_LOG("Error while installing new mfn %lx", mfn);
2087 else
2089 unsigned long old_mfn =
2090 pagetable_get_pfn(v->arch.guest_table_user);
2091 v->arch.guest_table_user = pagetable_from_pfn(mfn);
2092 if ( old_mfn != 0 )
2094 if ( paging_mode_refcounts(d) )
2095 put_page(mfn_to_page(old_mfn));
2096 else
2097 put_page_and_type(mfn_to_page(old_mfn));
2100 break;
2101 #endif
2103 case MMUEXT_TLB_FLUSH_LOCAL:
2104 this_cpu(percpu_mm_info).deferred_ops |= DOP_FLUSH_TLB;
2105 break;
2107 case MMUEXT_INVLPG_LOCAL:
2108 if ( !paging_mode_enabled(d)
2109 || paging_invlpg(v, op.arg1.linear_addr) != 0 )
2110 local_flush_tlb_one(op.arg1.linear_addr);
2111 break;
2113 case MMUEXT_TLB_FLUSH_MULTI:
2114 case MMUEXT_INVLPG_MULTI:
2116 unsigned long vmask;
2117 cpumask_t pmask;
2118 if ( unlikely(copy_from_guest(&vmask, op.arg2.vcpumask, 1)) )
2120 okay = 0;
2121 break;
2123 pmask = vcpumask_to_pcpumask(d, vmask);
2124 if ( op.cmd == MMUEXT_TLB_FLUSH_MULTI )
2125 flush_tlb_mask(pmask);
2126 else
2127 flush_tlb_one_mask(pmask, op.arg1.linear_addr);
2128 break;
2131 case MMUEXT_TLB_FLUSH_ALL:
2132 flush_tlb_mask(d->domain_dirty_cpumask);
2133 break;
2135 case MMUEXT_INVLPG_ALL:
2136 flush_tlb_one_mask(d->domain_dirty_cpumask, op.arg1.linear_addr);
2137 break;
2139 case MMUEXT_FLUSH_CACHE:
2140 if ( unlikely(!cache_flush_permitted(d)) )
2142 MEM_LOG("Non-physdev domain tried to FLUSH_CACHE.");
2143 okay = 0;
2145 else
2147 wbinvd();
2149 break;
2151 case MMUEXT_SET_LDT:
2153 unsigned long ptr = op.arg1.linear_addr;
2154 unsigned long ents = op.arg2.nr_ents;
2156 if ( paging_mode_external(d) )
2158 MEM_LOG("ignoring SET_LDT hypercall from external "
2159 "domain %u", d->domain_id);
2160 okay = 0;
2162 else if ( ((ptr & (PAGE_SIZE-1)) != 0) ||
2163 (ents > 8192) ||
2164 !array_access_ok(ptr, ents, LDT_ENTRY_SIZE) )
2166 okay = 0;
2167 MEM_LOG("Bad args to SET_LDT: ptr=%lx, ents=%lx", ptr, ents);
2169 else if ( (v->arch.guest_context.ldt_ents != ents) ||
2170 (v->arch.guest_context.ldt_base != ptr) )
2172 invalidate_shadow_ldt(v);
2173 v->arch.guest_context.ldt_base = ptr;
2174 v->arch.guest_context.ldt_ents = ents;
2175 load_LDT(v);
2176 this_cpu(percpu_mm_info).deferred_ops &= ~DOP_RELOAD_LDT;
2177 if ( ents != 0 )
2178 this_cpu(percpu_mm_info).deferred_ops |= DOP_RELOAD_LDT;
2180 break;
2183 default:
2184 MEM_LOG("Invalid extended pt command 0x%x", op.cmd);
2185 rc = -ENOSYS;
2186 okay = 0;
2187 break;
2190 if ( unlikely(!okay) )
2192 rc = rc ? rc : -EINVAL;
2193 break;
2196 guest_handle_add_offset(uops, 1);
2199 out:
2200 process_deferred_ops();
2202 /* Add incremental work we have done to the @done output parameter. */
2203 if ( unlikely(!guest_handle_is_null(pdone)) )
2205 done += i;
2206 copy_to_guest(pdone, &done, 1);
2209 UNLOCK_BIGLOCK(d);
2210 return rc;
2213 int do_mmu_update(
2214 XEN_GUEST_HANDLE(mmu_update_t) ureqs,
2215 unsigned int count,
2216 XEN_GUEST_HANDLE(uint) pdone,
2217 unsigned int foreigndom)
2219 struct mmu_update req;
2220 void *va;
2221 unsigned long gpfn, gmfn, mfn;
2222 struct page_info *page;
2223 int rc = 0, okay = 1, i = 0;
2224 unsigned int cmd, done = 0;
2225 struct vcpu *v = current;
2226 struct domain *d = v->domain;
2227 unsigned long type_info;
2228 struct domain_mmap_cache mapcache, sh_mapcache;
2230 LOCK_BIGLOCK(d);
2232 if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
2234 count &= ~MMU_UPDATE_PREEMPTED;
2235 if ( unlikely(!guest_handle_is_null(pdone)) )
2236 (void)copy_from_guest(&done, pdone, 1);
2239 domain_mmap_cache_init(&mapcache);
2240 domain_mmap_cache_init(&sh_mapcache);
2242 if ( !set_foreigndom(foreigndom) )
2244 rc = -ESRCH;
2245 goto out;
2248 perfc_incrc(calls_to_mmu_update);
2249 perfc_addc(num_page_updates, count);
2251 if ( unlikely(!guest_handle_okay(ureqs, count)) )
2253 rc = -EFAULT;
2254 goto out;
2257 for ( i = 0; i < count; i++ )
2259 if ( hypercall_preempt_check() )
2261 rc = hypercall_create_continuation(
2262 __HYPERVISOR_mmu_update, "hihi",
2263 ureqs, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
2264 break;
2267 if ( unlikely(__copy_from_guest(&req, ureqs, 1) != 0) )
2269 MEM_LOG("Bad __copy_from_guest");
2270 rc = -EFAULT;
2271 break;
2274 cmd = req.ptr & (sizeof(l1_pgentry_t)-1);
2275 okay = 0;
2277 switch ( cmd )
2279 /*
2280 * MMU_NORMAL_PT_UPDATE: Normal update to any level of page table.
2281 */
2282 case MMU_NORMAL_PT_UPDATE:
2284 gmfn = req.ptr >> PAGE_SHIFT;
2285 mfn = gmfn_to_mfn(d, gmfn);
2287 if ( unlikely(!get_page_from_pagenr(mfn, current->domain)) )
2289 MEM_LOG("Could not get page for normal update");
2290 break;
2293 va = map_domain_page_with_cache(mfn, &mapcache);
2294 va = (void *)((unsigned long)va +
2295 (unsigned long)(req.ptr & ~PAGE_MASK));
2296 page = mfn_to_page(mfn);
2298 switch ( (type_info = page->u.inuse.type_info) & PGT_type_mask )
2300 case PGT_l1_page_table:
2301 case PGT_l2_page_table:
2302 case PGT_l3_page_table:
2303 case PGT_l4_page_table:
2305 if ( paging_mode_refcounts(d) )
2307 MEM_LOG("mmu update on auto-refcounted domain!");
2308 break;
2311 if ( unlikely(!get_page_type(
2312 page, type_info & (PGT_type_mask|PGT_pae_xen_l2))) )
2313 goto not_a_pt;
2315 switch ( type_info & PGT_type_mask )
2317 case PGT_l1_page_table:
2319 l1_pgentry_t l1e = l1e_from_intpte(req.val);
2320 okay = mod_l1_entry(va, l1e, mfn);
2322 break;
2323 case PGT_l2_page_table:
2325 l2_pgentry_t l2e = l2e_from_intpte(req.val);
2326 okay = mod_l2_entry(va, l2e, mfn, type_info);
2328 break;
2329 #if CONFIG_PAGING_LEVELS >= 3
2330 case PGT_l3_page_table:
2332 l3_pgentry_t l3e = l3e_from_intpte(req.val);
2333 okay = mod_l3_entry(va, l3e, mfn);
2335 break;
2336 #endif
2337 #if CONFIG_PAGING_LEVELS >= 4
2338 case PGT_l4_page_table:
2339 if ( !IS_COMPAT(FOREIGNDOM) )
2341 l4_pgentry_t l4e = l4e_from_intpte(req.val);
2342 okay = mod_l4_entry(va, l4e, mfn);
2344 break;
2345 #endif
2348 put_page_type(page);
2350 break;
2352 default:
2353 not_a_pt:
2355 if ( unlikely(!get_page_type(page, PGT_writable_page)) )
2356 break;
2358 okay = paging_write_guest_entry(v, va, req.val, _mfn(mfn));
2360 put_page_type(page);
2362 break;
2365 unmap_domain_page_with_cache(va, &mapcache);
2367 put_page(page);
2368 break;
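/*
 * [Editorial note and sketch, not part of the original file.]  The
 * (ptr,val) encoding handled above relies on page-table entries being
 * naturally aligned, so the low bits of req.ptr are free to carry the
 * command (cmd = req.ptr & (sizeof(l1_pgentry_t)-1)).  A guest-side
 * request might look like the fragment below; HYPERVISOR_mmu_update is
 * the conventional guest stub and the variable names are illustrative.
 */
#if 0 /* illustrative guest-side fragment, not compiled */
        struct mmu_update u;
        u.ptr = pte_machine_addr | MMU_NORMAL_PT_UPDATE;  /* which entry   */
        u.val = new_pte_value;                            /* what to write */
        HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF);
#endif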
2370 case MMU_MACHPHYS_UPDATE:
2372 mfn = req.ptr >> PAGE_SHIFT;
2373 gpfn = req.val;
2375 if ( unlikely(!get_page_from_pagenr(mfn, FOREIGNDOM)) )
2377 MEM_LOG("Could not get page for mach->phys update");
2378 break;
2381 if ( unlikely(paging_mode_translate(FOREIGNDOM)) )
2383 MEM_LOG("Mach-phys update on auto-translate guest");
2384 break;
2387 set_gpfn_from_mfn(mfn, gpfn);
2388 okay = 1;
2390 mark_dirty(FOREIGNDOM, mfn);
2392 put_page(mfn_to_page(mfn));
2393 break;
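/*
 * [Editorial note, not part of the original file.]  For this command
 * the request is read the other way round from a page-table update:
 * req.ptr names the machine frame (mfn = ptr >> PAGE_SHIFT) and
 * req.val the pseudo-physical frame to record for it in the M2P table.
 * It is refused for translated guests above because Xen maintains
 * their M2P entries itself.
 */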
2395 default:
2396 MEM_LOG("Invalid page update command %x", cmd);
2397 rc = -ENOSYS;
2398 okay = 0;
2399 break;
2402 if ( unlikely(!okay) )
2404 rc = rc ? rc : -EINVAL;
2405 break;
2408 guest_handle_add_offset(ureqs, 1);
2411 out:
2412 domain_mmap_cache_destroy(&mapcache);
2413 domain_mmap_cache_destroy(&sh_mapcache);
2415 process_deferred_ops();
2417 /* Add incremental work we have done to the @done output parameter. */
2418 if ( unlikely(!guest_handle_is_null(pdone)) )
2420 done += i;
2421 copy_to_guest(pdone, &done, 1);
2424 UNLOCK_BIGLOCK(d);
2425 return rc;
2429 static int create_grant_pte_mapping(
2430 uint64_t pte_addr, l1_pgentry_t nl1e, struct vcpu *v)
2432 int rc = GNTST_okay;
2433 void *va;
2434 unsigned long gmfn, mfn;
2435 struct page_info *page;
2436 u32 type;
2437 l1_pgentry_t ol1e;
2438 struct domain *d = v->domain;
2440 ASSERT(spin_is_locked(&d->big_lock));
2442 adjust_guest_l1e(nl1e, d);
2444 gmfn = pte_addr >> PAGE_SHIFT;
2445 mfn = gmfn_to_mfn(d, gmfn);
2447 if ( unlikely(!get_page_from_pagenr(mfn, current->domain)) )
2449 MEM_LOG("Could not get page for normal update");
2450 return GNTST_general_error;
2453 va = map_domain_page(mfn);
2454 va = (void *)((unsigned long)va + ((unsigned long)pte_addr & ~PAGE_MASK));
2455 page = mfn_to_page(mfn);
2457 type = page->u.inuse.type_info & PGT_type_mask;
2458 if ( (type != PGT_l1_page_table) || !get_page_type(page, type) )
2460 MEM_LOG("Grant map attempted to update a non-L1 page");
2461 rc = GNTST_general_error;
2462 goto failed;
2465 ol1e = *(l1_pgentry_t *)va;
2466 if ( !UPDATE_ENTRY(l1, va, ol1e, nl1e, mfn, v) )
2468 put_page_type(page);
2469 rc = GNTST_general_error;
2470 goto failed;
2473 if ( !paging_mode_refcounts(d) )
2474 put_page_from_l1e(ol1e, d);
2476 put_page_type(page);
2478 failed:
2479 unmap_domain_page(va);
2480 put_page(page);
2482 return rc;
2485 static int destroy_grant_pte_mapping(
2486 uint64_t addr, unsigned long frame, struct domain *d)
2488 int rc = GNTST_okay;
2489 void *va;
2490 unsigned long gmfn, mfn;
2491 struct page_info *page;
2492 u32 type;
2493 l1_pgentry_t ol1e;
2495 gmfn = addr >> PAGE_SHIFT;
2496 mfn = gmfn_to_mfn(d, gmfn);
2498 if ( unlikely(!get_page_from_pagenr(mfn, current->domain)) )
2500 MEM_LOG("Could not get page for normal update");
2501 return GNTST_general_error;
2504 va = map_domain_page(mfn);
2505 va = (void *)((unsigned long)va + ((unsigned long)addr & ~PAGE_MASK));
2506 page = mfn_to_page(mfn);
2508 type = page->u.inuse.type_info & PGT_type_mask;
2509 if ( (type != PGT_l1_page_table) || !get_page_type(page, type) )
2511 MEM_LOG("Grant map attempted to update a non-L1 page");
2512 rc = GNTST_general_error;
2513 goto failed;
2516 if ( __copy_from_user(&ol1e, (l1_pgentry_t *)va, sizeof(ol1e)) )
2518 put_page_type(page);
2519 rc = GNTST_general_error;
2520 goto failed;
2523 /* Check that the virtual address supplied is actually mapped to frame. */
2524 if ( unlikely((l1e_get_intpte(ol1e) >> PAGE_SHIFT) != frame) )
2526 MEM_LOG("PTE entry %lx for address %"PRIx64" doesn't match frame %lx",
2527 (unsigned long)l1e_get_intpte(ol1e), addr, frame);
2528 put_page_type(page);
2529 rc = GNTST_general_error;
2530 goto failed;
2533 /* Delete pagetable entry. */
2534 if ( unlikely(!UPDATE_ENTRY(l1,
2535 (l1_pgentry_t *)va, ol1e, l1e_empty(), mfn,
2536 d->vcpu[0] /* Change if we go to per-vcpu shadows. */)) )
2538 MEM_LOG("Cannot delete PTE entry at %p", va);
2539 put_page_type(page);
2540 rc = GNTST_general_error;
2541 goto failed;
2544 put_page_type(page);
2546 failed:
2547 unmap_domain_page(va);
2548 put_page(page);
2549 return rc;
2553 static int create_grant_va_mapping(
2554 unsigned long va, l1_pgentry_t nl1e, struct vcpu *v)
2556 l1_pgentry_t *pl1e, ol1e;
2557 struct domain *d = v->domain;
2558 unsigned long gl1mfn;
2559 int okay;
2561 ASSERT(spin_is_locked(&d->big_lock));
2563 adjust_guest_l1e(nl1e, d);
2565 pl1e = guest_map_l1e(v, va, &gl1mfn);
2566 if ( !pl1e )
2568 MEM_LOG("Could not find L1 PTE for address %lx", va);
2569 return GNTST_general_error;
2571 ol1e = *pl1e;
2572 okay = UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, v);
2573 guest_unmap_l1e(v, pl1e);
2574 pl1e = NULL;
2576 if ( !okay )
2577 return GNTST_general_error;
2579 if ( !paging_mode_refcounts(d) )
2580 put_page_from_l1e(ol1e, d);
2582 return GNTST_okay;
2585 static int destroy_grant_va_mapping(
2586 unsigned long addr, unsigned long frame, struct vcpu *v)
2588 l1_pgentry_t *pl1e, ol1e;
2589 unsigned long gl1mfn;
2590 int rc = 0;
2592 pl1e = guest_map_l1e(v, addr, &gl1mfn);
2593 if ( !pl1e )
2595 MEM_LOG("Could not find L1 PTE for address %lx", addr);
2596 return GNTST_general_error;
2598 ol1e = *pl1e;
2600 /* Check that the virtual address supplied is actually mapped to frame. */
2601 if ( unlikely(l1e_get_pfn(ol1e) != frame) )
2603 MEM_LOG("PTE entry %lx for address %lx doesn't match frame %lx",
2604 l1e_get_pfn(ol1e), addr, frame);
2605 rc = GNTST_general_error;
2606 goto out;
2609 /* Delete pagetable entry. */
2610 if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, l1e_empty(), gl1mfn, v)) )
2612 MEM_LOG("Cannot delete PTE entry at %p", (unsigned long *)pl1e);
2613 rc = GNTST_general_error;
2614 goto out;
2617 out:
2618 guest_unmap_l1e(v, pl1e);
2619 return rc;
2622 int create_grant_host_mapping(
2623 uint64_t addr, unsigned long frame, unsigned int flags)
2625 l1_pgentry_t pte = l1e_from_pfn(frame, GRANT_PTE_FLAGS);
2627 if ( (flags & GNTMAP_application_map) )
2628 l1e_add_flags(pte,_PAGE_USER);
2629 if ( !(flags & GNTMAP_readonly) )
2630 l1e_add_flags(pte,_PAGE_RW);
2632 if ( flags & GNTMAP_contains_pte )
2633 return create_grant_pte_mapping(addr, pte, current);
2634 return create_grant_va_mapping(addr, pte, current);
2637 int destroy_grant_host_mapping(
2638 uint64_t addr, unsigned long frame, unsigned int flags)
2640 if ( flags & GNTMAP_contains_pte )
2641 return destroy_grant_pte_mapping(addr, frame, current->domain);
2642 return destroy_grant_va_mapping(addr, frame, current);
2645 int steal_page(
2646 struct domain *d, struct page_info *page, unsigned int memflags)
2648 u32 _d, _nd, x, y;
2650 spin_lock(&d->page_alloc_lock);
2652 /*
2653 * The tricky bit: atomically release ownership while there is just one
2654 * benign reference to the page (PGC_allocated). If that reference
2655 * disappears then the deallocation routine will safely spin.
2656 */
2657 _d = pickle_domptr(d);
2658 _nd = page->u.inuse._domain;
2659 y = page->count_info;
2660 do {
2661 x = y;
2662 if (unlikely((x & (PGC_count_mask|PGC_allocated)) !=
2663 (1 | PGC_allocated)) || unlikely(_nd != _d)) {
2664 MEM_LOG("gnttab_transfer: Bad page %p: ed=%p(%u), sd=%p,"
2665 " caf=%08x, taf=%" PRtype_info "\n",
2666 (void *) page_to_mfn(page),
2667 d, d->domain_id, unpickle_domptr(_nd), x,
2668 page->u.inuse.type_info);
2669 spin_unlock(&d->page_alloc_lock);
2670 return -1;
2672 __asm__ __volatile__(
2673 LOCK_PREFIX "cmpxchg8b %2"
2674 : "=d" (_nd), "=a" (y),
2675 "=m" (*(volatile u64 *)(&page->count_info))
2676 : "0" (_d), "1" (x), "c" (NULL), "b" (x) );
2677 } while (unlikely(_nd != _d) || unlikely(y != x));
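/*
 * [Editorial note, not part of the original file.]  The cmpxchg8b
 * above treats page->count_info and the adjacent owner field as a
 * single 64-bit quantity: EDX:EAX holds the expected (owner, count)
 * pair and ECX:EBX the replacement, so the swap succeeds only if the
 * count is still exactly (1 | PGC_allocated) and the page still
 * belongs to 'd', in which case the owner is cleared while the count
 * is left untouched.  Any interleaved get_page()/put_page() changes
 * count_info and forces another trip around the loop.
 */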
2679 /*
2680 * Unlink from 'd'. At least one reference remains (now anonymous), so
2681 * no one else is spinning to try to delete this page from 'd'.
2682 */
2683 if ( !(memflags & MEMF_no_refcount) )
2684 d->tot_pages--;
2685 list_del(&page->list);
2687 spin_unlock(&d->page_alloc_lock);
2689 return 0;
2692 int do_update_va_mapping(unsigned long va, u64 val64,
2693 unsigned long flags)
2695 l1_pgentry_t val = l1e_from_intpte(val64);
2696 struct vcpu *v = current;
2697 struct domain *d = v->domain;
2698 l1_pgentry_t *pl1e;
2699 unsigned long vmask, bmap_ptr, gl1mfn;
2700 cpumask_t pmask;
2701 int rc = 0;
2703 perfc_incrc(calls_to_update_va);
2705 if ( unlikely(!__addr_ok(va) && !paging_mode_external(d)) )
2706 return -EINVAL;
2708 LOCK_BIGLOCK(d);
2710 pl1e = guest_map_l1e(v, va, &gl1mfn);
2712 if ( unlikely(!pl1e || !mod_l1_entry(pl1e, val, gl1mfn)) )
2713 rc = -EINVAL;
2715 if ( pl1e )
2716 guest_unmap_l1e(v, pl1e);
2717 pl1e = NULL;
2719 switch ( flags & UVMF_FLUSHTYPE_MASK )
2721 case UVMF_TLB_FLUSH:
2722 switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
2724 case UVMF_LOCAL:
2725 local_flush_tlb();
2726 break;
2727 case UVMF_ALL:
2728 flush_tlb_mask(d->domain_dirty_cpumask);
2729 break;
2730 default:
2731 if ( unlikely(!IS_COMPAT(d) ?
2732 get_user(vmask, (unsigned long *)bmap_ptr) :
2733 get_user(vmask, (unsigned int *)bmap_ptr)) )
2734 rc = -EFAULT;
2735 pmask = vcpumask_to_pcpumask(d, vmask);
2736 flush_tlb_mask(pmask);
2737 break;
2739 break;
2741 case UVMF_INVLPG:
2742 switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
2744 case UVMF_LOCAL:
2745 if ( !paging_mode_enabled(d)
2746 || (paging_invlpg(current, va) != 0) )
2747 local_flush_tlb_one(va);
2748 break;
2749 case UVMF_ALL:
2750 flush_tlb_one_mask(d->domain_dirty_cpumask, va);
2751 break;
2752 default:
2753 if ( unlikely(get_user(vmask, (unsigned long *)bmap_ptr)) )
2754 rc = -EFAULT;
2755 pmask = vcpumask_to_pcpumask(d, vmask);
2756 flush_tlb_one_mask(pmask, va);
2757 break;
2759 break;
2762 process_deferred_ops();
2764 UNLOCK_BIGLOCK(d);
2766 return rc;
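/*
 * [Editorial sketch, not part of the original file.]  Typical guest
 * usage of this hypercall: rewrite one PTE and invalidate just that
 * TLB entry on the local CPU.  HYPERVISOR_update_va_mapping is the
 * conventional guest-side stub; the fragment is illustrative only.
 */
#if 0 /* illustrative guest-side fragment, not compiled */
    HYPERVISOR_update_va_mapping(va, new_pte_value,
                                 UVMF_INVLPG | UVMF_LOCAL);
#endif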
2769 int do_update_va_mapping_otherdomain(unsigned long va, u64 val64,
2770 unsigned long flags,
2771 domid_t domid)
2773 int rc;
2775 if ( unlikely(!IS_PRIV(current->domain)) )
2776 return -EPERM;
2778 if ( !set_foreigndom(domid) )
2779 return -ESRCH;
2781 rc = do_update_va_mapping(va, val64, flags);
2783 return rc;
2788 /*************************
2789 * Descriptor Tables
2790 */
2792 void destroy_gdt(struct vcpu *v)
2794 int i;
2795 unsigned long pfn;
2797 v->arch.guest_context.gdt_ents = 0;
2798 for ( i = 0; i < FIRST_RESERVED_GDT_PAGE; i++ )
2800 if ( (pfn = l1e_get_pfn(v->arch.perdomain_ptes[i])) != 0 )
2801 put_page_and_type(mfn_to_page(pfn));
2802 l1e_write(&v->arch.perdomain_ptes[i], l1e_empty());
2803 v->arch.guest_context.gdt_frames[i] = 0;
2808 long set_gdt(struct vcpu *v,
2809 unsigned long *frames,
2810 unsigned int entries)
2812 struct domain *d = v->domain;
2813 /* NB. There are 512 8-byte entries per GDT page. */
2814 int i, nr_pages = (entries + 511) / 512;
2815 unsigned long mfn;
2817 if ( entries > FIRST_RESERVED_GDT_ENTRY )
2818 return -EINVAL;
2820 /* Check the pages in the new GDT. */
2821 for ( i = 0; i < nr_pages; i++ ) {
2822 mfn = frames[i] = gmfn_to_mfn(d, frames[i]);
2823 if ( !mfn_valid(mfn) ||
2824 !get_page_and_type(mfn_to_page(mfn), d, PGT_gdt_page) )
2825 goto fail;
2828 /* Tear down the old GDT. */
2829 destroy_gdt(v);
2831 /* Install the new GDT. */
2832 v->arch.guest_context.gdt_ents = entries;
2833 for ( i = 0; i < nr_pages; i++ )
2835 v->arch.guest_context.gdt_frames[i] = frames[i];
2836 l1e_write(&v->arch.perdomain_ptes[i],
2837 l1e_from_pfn(frames[i], __PAGE_HYPERVISOR));
2840 return 0;
2842 fail:
2843 while ( i-- > 0 )
2844 put_page_and_type(mfn_to_page(frames[i]));
2845 return -EINVAL;
2849 long do_set_gdt(XEN_GUEST_HANDLE(ulong) frame_list, unsigned int entries)
2851 int nr_pages = (entries + 511) / 512;
2852 unsigned long frames[16];
2853 long ret;
2855 /* Rechecked in set_gdt, but ensures a sane limit for copy_from_user(). */
2856 if ( entries > FIRST_RESERVED_GDT_ENTRY )
2857 return -EINVAL;
2859 if ( copy_from_guest((unsigned long *)frames, frame_list, nr_pages) )
2860 return -EFAULT;
2862 LOCK_BIGLOCK(current->domain);
2864 if ( (ret = set_gdt(current, frames, entries)) == 0 )
2865 local_flush_tlb();
2867 UNLOCK_BIGLOCK(current->domain);
2869 return ret;
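/*
 * [Editorial sketch, not part of the original file.]  A guest replaces
 * its GDT by handing Xen a list of frames holding the descriptors;
 * set_gdt() above validates each frame as PGT_gdt_page and wires it
 * into the per-domain mapping.  HYPERVISOR_set_gdt is the conventional
 * guest-side stub; the fragment is illustrative only.
 */
#if 0 /* illustrative guest-side fragment, not compiled */
    unsigned long frames[1] = { gdt_gmfn };   /* one page: up to 512 entries */
    HYPERVISOR_set_gdt(frames, nr_gdt_entries);
#endif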
2873 long do_update_descriptor(u64 pa, u64 desc)
2875 struct domain *dom = current->domain;
2876 unsigned long gmfn = pa >> PAGE_SHIFT;
2877 unsigned long mfn;
2878 unsigned int offset;
2879 struct desc_struct *gdt_pent, d;
2880 struct page_info *page;
2881 long ret = -EINVAL;
2883 offset = ((unsigned int)pa & ~PAGE_MASK) / sizeof(struct desc_struct);
2885 *(u64 *)&d = desc;
2887 LOCK_BIGLOCK(dom);
2889 mfn = gmfn_to_mfn(dom, gmfn);
2890 if ( (((unsigned int)pa % sizeof(struct desc_struct)) != 0) ||
2891 !mfn_valid(mfn) ||
2892 !check_descriptor(dom, &d) )
2894 UNLOCK_BIGLOCK(dom);
2895 return -EINVAL;
2898 page = mfn_to_page(mfn);
2899 if ( unlikely(!get_page(page, dom)) )
2901 UNLOCK_BIGLOCK(dom);
2902 return -EINVAL;
2905 /* Check if the given frame is in use in an unsafe context. */
2906 switch ( page->u.inuse.type_info & PGT_type_mask )
2908 case PGT_gdt_page:
2909 if ( unlikely(!get_page_type(page, PGT_gdt_page)) )
2910 goto out;
2911 break;
2912 case PGT_ldt_page:
2913 if ( unlikely(!get_page_type(page, PGT_ldt_page)) )
2914 goto out;
2915 break;
2916 default:
2917 if ( unlikely(!get_page_type(page, PGT_writable_page)) )
2918 goto out;
2919 break;
2922 mark_dirty(dom, mfn);
2924 /* All is good so make the update. */
2925 gdt_pent = map_domain_page(mfn);
2926 memcpy(&gdt_pent[offset], &d, 8);
2927 unmap_domain_page(gdt_pent);
2929 put_page_type(page);
2931 ret = 0; /* success */
2933 out:
2934 put_page(page);
2936 UNLOCK_BIGLOCK(dom);
2938 return ret;
2941 typedef struct e820entry e820entry_t;
2942 DEFINE_XEN_GUEST_HANDLE(e820entry_t);
2944 long arch_memory_op(int op, XEN_GUEST_HANDLE(void) arg)
2946 switch ( op )
2948 case XENMEM_add_to_physmap:
2950 struct xen_add_to_physmap xatp;
2951 unsigned long prev_mfn, mfn = 0, gpfn;
2952 struct domain *d;
2954 if ( copy_from_guest(&xatp, arg, 1) )
2955 return -EFAULT;
2957 if ( xatp.domid == DOMID_SELF )
2959 d = current->domain;
2960 get_knownalive_domain(d);
2962 else if ( !IS_PRIV(current->domain) )
2963 return -EPERM;
2964 else if ( (d = get_domain_by_id(xatp.domid)) == NULL )
2965 return -ESRCH;
2967 switch ( xatp.space )
2969 case XENMAPSPACE_shared_info:
2970 if ( xatp.idx == 0 )
2971 mfn = virt_to_mfn(d->shared_info);
2972 break;
2973 case XENMAPSPACE_grant_table:
2974 if ( xatp.idx < NR_GRANT_FRAMES )
2975 mfn = virt_to_mfn(d->grant_table->shared) + xatp.idx;
2976 break;
2977 default:
2978 break;
2981 if ( !paging_mode_translate(d) || (mfn == 0) )
2983 put_domain(d);
2984 return -EINVAL;
2987 LOCK_BIGLOCK(d);
2989 /* Remove previously mapped page if it was present. */
2990 prev_mfn = gmfn_to_mfn(d, xatp.gpfn);
2991 if ( mfn_valid(prev_mfn) )
2993 if ( IS_XEN_HEAP_FRAME(mfn_to_page(prev_mfn)) )
2994 /* Xen heap frames are simply unhooked from this phys slot. */
2995 guest_physmap_remove_page(d, xatp.gpfn, prev_mfn);
2996 else
2997 /* Normal domain memory is freed, to avoid leaking memory. */
2998 guest_remove_page(d, xatp.gpfn);
3001 /* Unmap from old location, if any. */
3002 gpfn = get_gpfn_from_mfn(mfn);
3003 if ( gpfn != INVALID_M2P_ENTRY )
3004 guest_physmap_remove_page(d, gpfn, mfn);
3006 /* Map at new location. */
3007 guest_physmap_add_page(d, xatp.gpfn, mfn);
3009 UNLOCK_BIGLOCK(d);
3011 put_domain(d);
3013 break;
3016 case XENMEM_set_memory_map:
3018 struct xen_foreign_memory_map fmap;
3019 struct domain *d;
3020 int rc;
3022 if ( copy_from_guest(&fmap, arg, 1) )
3023 return -EFAULT;
3025 if ( fmap.map.nr_entries > ARRAY_SIZE(d->arch.e820) )
3026 return -EINVAL;
3028 if ( fmap.domid == DOMID_SELF )
3030 d = current->domain;
3031 get_knownalive_domain(d);
3033 else if ( !IS_PRIV(current->domain) )
3034 return -EPERM;
3035 else if ( (d = get_domain_by_id(fmap.domid)) == NULL )
3036 return -ESRCH;
3038 rc = copy_from_guest(&d->arch.e820[0], fmap.map.buffer,
3039 fmap.map.nr_entries) ? -EFAULT : 0;
3040 d->arch.nr_e820 = fmap.map.nr_entries;
3042 put_domain(d);
3043 return rc;
3046 case XENMEM_memory_map:
3048 struct xen_memory_map map;
3049 struct domain *d = current->domain;
3051 /* Backwards compatibility. */
3052 if ( d->arch.nr_e820 == 0 )
3053 return -ENOSYS;
3055 if ( copy_from_guest(&map, arg, 1) )
3056 return -EFAULT;
3058 map.nr_entries = min(map.nr_entries, d->arch.nr_e820);
3059 if ( copy_to_guest(map.buffer, &d->arch.e820[0], map.nr_entries) ||
3060 copy_to_guest(arg, &map, 1) )
3061 return -EFAULT;
3063 return 0;
3066 case XENMEM_machine_memory_map:
3068 struct xen_memory_map memmap;
3069 XEN_GUEST_HANDLE(e820entry_t) buffer;
3070 int count;
3072 if ( !IS_PRIV(current->domain) )
3073 return -EINVAL;
3075 if ( copy_from_guest(&memmap, arg, 1) )
3076 return -EFAULT;
3077 if ( memmap.nr_entries < e820.nr_map + 1 )
3078 return -EINVAL;
3080 buffer = guest_handle_cast(memmap.buffer, e820entry_t);
3082 count = min((unsigned int)e820.nr_map, memmap.nr_entries);
3083 if ( copy_to_guest(buffer, &e820.map[0], count) < 0 )
3084 return -EFAULT;
3086 memmap.nr_entries = count;
3088 if ( copy_to_guest(arg, &memmap, 1) )
3089 return -EFAULT;
3091 return 0;
3094 case XENMEM_machphys_mapping:
3096 struct xen_machphys_mapping mapping = {
3097 .v_start = MACH2PHYS_VIRT_START,
3098 .v_end = MACH2PHYS_VIRT_END,
3099 .max_mfn = MACH2PHYS_NR_ENTRIES - 1
3100 };
3102 if ( copy_to_guest(arg, &mapping, 1) )
3103 return -EFAULT;
3105 return 0;
3108 default:
3109 return subarch_memory_op(op, arg);
3112 return 0;
3116 /*************************
3117 * Writable Pagetables
3118 */
3120 struct ptwr_emulate_ctxt {
3121 struct x86_emulate_ctxt ctxt;
3122 unsigned long cr2;
3123 l1_pgentry_t pte;
3124 };
3126 static int ptwr_emulated_read(
3127 enum x86_segment seg,
3128 unsigned long offset,
3129 unsigned long *val,
3130 unsigned int bytes,
3131 struct x86_emulate_ctxt *ctxt)
3133 unsigned int rc;
3134 unsigned long addr = offset;
3136 *val = 0;
3137 if ( (rc = copy_from_user((void *)val, (void *)addr, bytes)) != 0 )
3139 propagate_page_fault(addr + bytes - rc, 0); /* read fault */
3140 return X86EMUL_PROPAGATE_FAULT;
3143 return X86EMUL_CONTINUE;
3146 static int ptwr_emulated_update(
3147 unsigned long addr,
3148 paddr_t old,
3149 paddr_t val,
3150 unsigned int bytes,
3151 unsigned int do_cmpxchg,
3152 struct ptwr_emulate_ctxt *ptwr_ctxt)
3154 unsigned long mfn;
3155 struct page_info *page;
3156 l1_pgentry_t pte, ol1e, nl1e, *pl1e;
3157 struct vcpu *v = current;
3158 struct domain *d = v->domain;
3160 /* Only allow naturally-aligned stores within the original %cr2 page. */
3161 if ( unlikely(((addr^ptwr_ctxt->cr2) & PAGE_MASK) || (addr & (bytes-1))) )
3163 MEM_LOG("Bad ptwr access (cr2=%lx, addr=%lx, bytes=%u)",
3164 ptwr_ctxt->cr2, addr, bytes);
3165 return X86EMUL_UNHANDLEABLE;
3168 /* Turn a sub-word access into a full-word access. */
3169 if ( bytes != sizeof(paddr_t) )
3171 paddr_t full;
3172 unsigned int rc, offset = addr & (sizeof(paddr_t)-1);
3174 /* Align address; read full word. */
3175 addr &= ~(sizeof(paddr_t)-1);
3176 if ( (rc = copy_from_user(&full, (void *)addr, sizeof(paddr_t))) != 0 )
3178 propagate_page_fault(addr+sizeof(paddr_t)-rc, 0); /* read fault */
3179 return X86EMUL_PROPAGATE_FAULT;
3181 /* Mask out bits provided by caller. */
3182 full &= ~((((paddr_t)1 << (bytes*8)) - 1) << (offset*8));
3183 /* Shift the caller value and OR in the missing bits. */
3184 val &= (((paddr_t)1 << (bytes*8)) - 1);
3185 val <<= (offset)*8;
3186 val |= full;
3187 /* Also fill in missing parts of the cmpxchg old value. */
3188 old &= (((paddr_t)1 << (bytes*8)) - 1);
3189 old <<= (offset)*8;
3190 old |= full;
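/*
 * [Editorial note, not part of the original file.]  Worked example for
 * an 8-byte (PAE) entry: a 4-byte write of 0x12345678 at
 * addr == entry_base + 4 gives offset == 4; addr is rounded down to
 * entry_base, the current 8-byte entry is read into 'full', the half
 * being rewritten is masked out of it, and val becomes
 * ((paddr_t)0x12345678 << 32) | (preserved low half).  'old' is
 * widened the same way so a sub-word cmpxchg still compares the whole
 * entry.
 */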
3193 pte = ptwr_ctxt->pte;
3194 mfn = l1e_get_pfn(pte);
3195 page = mfn_to_page(mfn);
3197 /* We are looking only for read-only mappings of p.t. pages. */
3198 ASSERT((l1e_get_flags(pte) & (_PAGE_RW|_PAGE_PRESENT)) == _PAGE_PRESENT);
3199 ASSERT((page->u.inuse.type_info & PGT_type_mask) == PGT_l1_page_table);
3200 ASSERT((page->u.inuse.type_info & PGT_count_mask) != 0);
3201 ASSERT(page_get_owner(page) == d);
3203 /* Check the new PTE. */
3204 nl1e = l1e_from_intpte(val);
3205 if ( unlikely(!get_page_from_l1e(gl1e_to_ml1e(d, nl1e), d)) )
3207 if ( (CONFIG_PAGING_LEVELS == 3 || IS_COMPAT(d)) &&
3208 (bytes == 4) && (addr & 4) && !do_cmpxchg &&
3209 (l1e_get_flags(nl1e) & _PAGE_PRESENT) )
3211 /*
3212 * If this is an upper-half write to a PAE PTE then we assume that
3213 * the guest has simply got the two writes the wrong way round. We
3214 * zap the PRESENT bit on the assumption that the bottom half will
3215 * be written immediately after we return to the guest.
3216 */
3217 MEM_LOG("ptwr_emulate: fixing up invalid PAE PTE %"PRIpte,
3218 l1e_get_intpte(nl1e));
3219 l1e_remove_flags(nl1e, _PAGE_PRESENT);
3221 else
3223 MEM_LOG("ptwr_emulate: could not get_page_from_l1e()");
3224 return X86EMUL_UNHANDLEABLE;
3228 adjust_guest_l1e(nl1e, d);
3230 /* Checked successfully: do the update (write or cmpxchg). */
3231 pl1e = map_domain_page(page_to_mfn(page));
3232 pl1e = (l1_pgentry_t *)((unsigned long)pl1e + (addr & ~PAGE_MASK));
3233 if ( do_cmpxchg )
3235 int okay;
3236 intpte_t t = old;
3237 ol1e = l1e_from_intpte(old);
3239 okay = paging_cmpxchg_guest_entry(v, (intpte_t *) pl1e,
3240 &t, val, _mfn(mfn));
3241 okay = (okay && t == old);
3243 if ( !okay )
3245 unmap_domain_page(pl1e);
3246 put_page_from_l1e(gl1e_to_ml1e(d, nl1e), d);
3247 return X86EMUL_CMPXCHG_FAILED;
3250 else
3252 ol1e = *pl1e;
3253 if ( !UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, page_to_mfn(page), v) )
3254 BUG();
3257 unmap_domain_page(pl1e);
3259 /* Finally, drop the old PTE. */
3260 put_page_from_l1e(gl1e_to_ml1e(d, ol1e), d);
3262 return X86EMUL_CONTINUE;
3265 static int ptwr_emulated_write(
3266 enum x86_segment seg,
3267 unsigned long offset,
3268 unsigned long val,
3269 unsigned int bytes,
3270 struct x86_emulate_ctxt *ctxt)
3272 return ptwr_emulated_update(
3273 offset, 0, val, bytes, 0,
3274 container_of(ctxt, struct ptwr_emulate_ctxt, ctxt));
3277 static int ptwr_emulated_cmpxchg(
3278 enum x86_segment seg,
3279 unsigned long offset,
3280 unsigned long old,
3281 unsigned long new,
3282 unsigned int bytes,
3283 struct x86_emulate_ctxt *ctxt)
3285 return ptwr_emulated_update(
3286 offset, old, new, bytes, 1,
3287 container_of(ctxt, struct ptwr_emulate_ctxt, ctxt));
3290 static int ptwr_emulated_cmpxchg8b(
3291 enum x86_segment seg,
3292 unsigned long offset,
3293 unsigned long old,
3294 unsigned long old_hi,
3295 unsigned long new,
3296 unsigned long new_hi,
3297 struct x86_emulate_ctxt *ctxt)
3299 if ( CONFIG_PAGING_LEVELS == 2 )
3300 return X86EMUL_UNHANDLEABLE;
3301 return ptwr_emulated_update(
3302 offset, ((u64)old_hi << 32) | old, ((u64)new_hi << 32) | new, 8, 1,
3303 container_of(ctxt, struct ptwr_emulate_ctxt, ctxt));
3306 static struct x86_emulate_ops ptwr_emulate_ops = {
3307 .read = ptwr_emulated_read,
3308 .insn_fetch = ptwr_emulated_read,
3309 .write = ptwr_emulated_write,
3310 .cmpxchg = ptwr_emulated_cmpxchg,
3311 .cmpxchg8b = ptwr_emulated_cmpxchg8b
3312 };
3314 /* Write page fault handler: check if guest is trying to modify a PTE. */
3315 int ptwr_do_page_fault(struct vcpu *v, unsigned long addr,
3316 struct cpu_user_regs *regs)
3318 struct domain *d = v->domain;
3319 struct page_info *page;
3320 l1_pgentry_t pte;
3321 struct ptwr_emulate_ctxt ptwr_ctxt;
3323 LOCK_BIGLOCK(d);
3325 /*
3326 * Attempt to read the PTE that maps the VA being accessed. By checking for
3327 * PDE validity in the L2 we avoid many expensive fixups in __get_user().
3328 */
3329 guest_get_eff_l1e(v, addr, &pte);
3330 if ( !(l1e_get_flags(pte) & _PAGE_PRESENT) )
3331 goto bail;
3332 page = l1e_get_page(pte);
3334 /* We are looking only for read-only mappings of p.t. pages. */
3335 if ( ((l1e_get_flags(pte) & (_PAGE_PRESENT|_PAGE_RW)) != _PAGE_PRESENT) ||
3336 ((page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table) ||
3337 ((page->u.inuse.type_info & PGT_count_mask) == 0) ||
3338 (page_get_owner(page) != d) )
3339 goto bail;
3341 ptwr_ctxt.ctxt.regs = regs;
3342 ptwr_ctxt.ctxt.addr_size = ptwr_ctxt.ctxt.sp_size =
3343 IS_COMPAT(d) ? 32 : BITS_PER_LONG;
3344 ptwr_ctxt.cr2 = addr;
3345 ptwr_ctxt.pte = pte;
3346 if ( x86_emulate(&ptwr_ctxt.ctxt, &ptwr_emulate_ops) )
3347 goto bail;
3349 UNLOCK_BIGLOCK(d);
3350 perfc_incrc(ptwr_emulations);
3351 return EXCRET_fault_fixed;
3353 bail:
3354 UNLOCK_BIGLOCK(d);
3355 return 0;
3358 int map_pages_to_xen(
3359 unsigned long virt,
3360 unsigned long mfn,
3361 unsigned long nr_mfns,
3362 unsigned long flags)
3364 l2_pgentry_t *pl2e, ol2e;
3365 l1_pgentry_t *pl1e, ol1e;
3366 unsigned int i;
3368 unsigned int map_small_pages = !!(flags & MAP_SMALL_PAGES);
3369 flags &= ~MAP_SMALL_PAGES;
3371 while ( nr_mfns != 0 )
3373 pl2e = virt_to_xen_l2e(virt);
3375 if ( ((((virt>>PAGE_SHIFT) | mfn) & ((1<<PAGETABLE_ORDER)-1)) == 0) &&
3376 (nr_mfns >= (1<<PAGETABLE_ORDER)) &&
3377 !map_small_pages )
3379 /* Super-page mapping. */
3380 ol2e = *pl2e;
3381 l2e_write(pl2e, l2e_from_pfn(mfn, flags|_PAGE_PSE));
3383 if ( (l2e_get_flags(ol2e) & _PAGE_PRESENT) )
3385 local_flush_tlb_pge();
3386 if ( !(l2e_get_flags(ol2e) & _PAGE_PSE) )
3387 free_xen_pagetable(l2e_get_page(ol2e));
3390 virt += 1UL << L2_PAGETABLE_SHIFT;
3391 mfn += 1UL << PAGETABLE_ORDER;
3392 nr_mfns -= 1UL << PAGETABLE_ORDER;
3394 else
3396 /* Normal page mapping. */
3397 if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) )
3399 pl1e = page_to_virt(alloc_xen_pagetable());
3400 clear_page(pl1e);
3401 l2e_write(pl2e, l2e_from_page(virt_to_page(pl1e),
3402 __PAGE_HYPERVISOR));
3404 else if ( l2e_get_flags(*pl2e) & _PAGE_PSE )
3406 pl1e = page_to_virt(alloc_xen_pagetable());
3407 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
3408 l1e_write(&pl1e[i],
3409 l1e_from_pfn(l2e_get_pfn(*pl2e) + i,
3410 l2e_get_flags(*pl2e) & ~_PAGE_PSE));
3411 l2e_write(pl2e, l2e_from_page(virt_to_page(pl1e),
3412 __PAGE_HYPERVISOR));
3413 local_flush_tlb_pge();
3416 pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(virt);
3417 ol1e = *pl1e;
3418 l1e_write(pl1e, l1e_from_pfn(mfn, flags));
3419 if ( (l1e_get_flags(ol1e) & _PAGE_PRESENT) )
3420 local_flush_tlb_one(virt);
3422 virt += 1UL << L1_PAGETABLE_SHIFT;
3423 mfn += 1UL;
3424 nr_mfns -= 1UL;
3428 return 0;
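/*
 * [Editorial note, not part of the original file.]  The superpage test
 * above requires both the virtual address and the machine frame to be
 * aligned to (1 << PAGETABLE_ORDER) frames (2MB with PAE or x86-64
 * paging) and at least that many frames still to map; otherwise the
 * mapping is broken down into individual L1 entries, splitting an
 * existing superpage if necessary.  Callers that must keep 4kB
 * granularity, such as the memguard code below, pass MAP_SMALL_PAGES
 * to force the L1 path.
 */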
3431 void __set_fixmap(
3432 enum fixed_addresses idx, unsigned long mfn, unsigned long flags)
3434 BUG_ON(idx >= __end_of_fixed_addresses);
3435 map_pages_to_xen(fix_to_virt(idx), mfn, 1, flags);
3438 #ifdef MEMORY_GUARD
3440 void memguard_init(void)
3442 map_pages_to_xen(
3443 PAGE_OFFSET, 0, xenheap_phys_end >> PAGE_SHIFT,
3444 __PAGE_HYPERVISOR|MAP_SMALL_PAGES);
3447 static void __memguard_change_range(void *p, unsigned long l, int guard)
3449 unsigned long _p = (unsigned long)p;
3450 unsigned long _l = (unsigned long)l;
3451 unsigned long flags = __PAGE_HYPERVISOR | MAP_SMALL_PAGES;
3453 /* Ensure we are dealing with a page-aligned whole number of pages. */
3454 ASSERT((_p&PAGE_MASK) != 0);
3455 ASSERT((_l&PAGE_MASK) != 0);
3456 ASSERT((_p&~PAGE_MASK) == 0);
3457 ASSERT((_l&~PAGE_MASK) == 0);
3459 if ( guard )
3460 flags &= ~_PAGE_PRESENT;
3462 map_pages_to_xen(
3463 _p, virt_to_maddr(p) >> PAGE_SHIFT, _l >> PAGE_SHIFT, flags);
3466 void memguard_guard_range(void *p, unsigned long l)
3468 __memguard_change_range(p, l, 1);
3471 void memguard_unguard_range(void *p, unsigned long l)
3473 __memguard_change_range(p, l, 0);
3476 #endif
3478 void memguard_guard_stack(void *p)
3480 BUILD_BUG_ON((DEBUG_STACK_SIZE + PAGE_SIZE) > STACK_SIZE);
3481 p = (void *)((unsigned long)p + STACK_SIZE - DEBUG_STACK_SIZE - PAGE_SIZE);
3482 memguard_guard_range(p, PAGE_SIZE);
3485 /*
3486 * Local variables:
3487 * mode: C
3488 * c-set-style: "BSD"
3489 * c-basic-offset: 4
3490 * tab-width: 4
3491 * indent-tabs-mode: nil
3492 * End:
3493 */