direct-io.hg

view xen/arch/x86/mm.c @ 15457:08bcc54aee8e

x86: Improve e820 scanning for all I/O regions.
Signed-off-by: Keir Fraser <keir@xensource.com>
author kfraser@localhost.localdomain
date Tue Jul 03 12:02:31 2007 +0100 (2007-07-03)
parents 83cbda5c1e1b
children 842e085dbb77
1 /******************************************************************************
2 * arch/x86/mm.c
3 *
4 * Copyright (c) 2002-2005 K A Fraser
5 * Copyright (c) 2004 Christian Limpach
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
22 /*
23 * A description of the x86 page table API:
24 *
25 * Domains trap to do_mmu_update with a list of update requests.
26 * This is a list of (ptr, val) pairs, where the requested operation
27 * is *ptr = val.
28 *
29 * Reference counting of pages:
30 * ----------------------------
31 * Each page has two refcounts: tot_count and type_count.
32 *
33 * TOT_COUNT is the obvious reference count. It counts all uses of a
34 * physical page frame by a domain, including uses as a page directory,
35 * a page table, or simple mappings via a PTE. This count prevents a
36 * domain from releasing a frame back to the free pool when it still holds
37 * a reference to it.
38 *
39 * TYPE_COUNT is more subtle. A frame can be put to one of three
40 * mutually-exclusive uses: it might be used as a page directory, or a
41 * page table, or it may be mapped writable by the domain [of course, a
42 * frame may not be used in any of these three ways!].
43 * So, type_count is a count of the number of times a frame is being
44 * referred to in its current incarnation. Therefore, a page can only
45 * change its type when its type count is zero.
46 *
47 * Pinning the page type:
48 * ----------------------
49 * The type of a page can be pinned/unpinned with the commands
50 * MMUEXT_[UN]PIN_L?_TABLE. Each page can be pinned exactly once (that is,
51 * pinning is not reference counted, so it can't be nested).
52 * This is useful to prevent a page's type count falling to zero, at which
53 * point safety checks would need to be carried out next time the count
54 * is increased again.
55 *
56 * A further note on writable page mappings:
57 * -----------------------------------------
58 * For simplicity, the count of writable mappings for a page may not
59 * correspond to reality. The 'writable count' is incremented for every
60 * PTE which maps the page with the _PAGE_RW flag set. However, for
61 * write access to be possible the page directory entry must also have
62 * its _PAGE_RW bit set. We do not check this as it complicates the
63 * reference counting considerably [consider the case of multiple
64 * directory entries referencing a single page table, some with the RW
65 * bit set, others not -- it starts getting a bit messy].
66 * In normal use, this simplification shouldn't be a problem.
67 * However, the logic can be added if required.
68 *
69 * One more note on read-only page mappings:
70 * -----------------------------------------
71 * We want domains to be able to map pages for read-only access. The
72 * main reason is that page tables and directories should be readable
73 * by a domain, but it would not be safe for them to be writable.
74 * However, domains have free access to rings 1 & 2 of the Intel
75 * privilege model. In terms of page protection, these are considered
76 * to be part of 'supervisor mode'. The WP bit in CR0 controls whether
77 * read-only restrictions are respected in supervisor mode -- if the
78 * bit is clear then any mapped page is writable.
79 *
80 * We get round this by always setting the WP bit and disallowing
81 * updates to it. This is very unlikely to cause a problem for guest
82 * OS's, which will generally use the WP bit to simplify copy-on-write
83 * implementation (in that case, OS wants a fault when it writes to
84 * an application-supplied buffer).
85 */
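/*
 * Illustrative sketch of the update interface described above: a guest
 * queues (ptr, val) pairs and, once each request has been validated, the
 * effect is simply *ptr = val. The struct below is modelled loosely on the
 * public mmu_update interface (struct mmu_update / HYPERVISOR_mmu_update in
 * the public headers); the sketch is illustrative only and is not part of
 * this file's build.
 */
#if 0 /* illustrative only */
#include <stdint.h>

struct pte_update {
    uint64_t ptr;   /* machine address of the PTE (the real interface also
                     * encodes a command in the low bits) */
    uint64_t val;   /* new contents for that PTE */
};

/* Conceptual effect of one validated request. */
static void apply_one_update(const struct pte_update *req)
{
    *(volatile uint64_t *)(uintptr_t)req->ptr = req->val;
}
#endif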
87 #include <xen/config.h>
88 #include <xen/init.h>
89 #include <xen/kernel.h>
90 #include <xen/lib.h>
91 #include <xen/mm.h>
92 #include <xen/domain.h>
93 #include <xen/sched.h>
94 #include <xen/errno.h>
95 #include <xen/perfc.h>
96 #include <xen/irq.h>
97 #include <xen/softirq.h>
98 #include <xen/domain_page.h>
99 #include <xen/event.h>
100 #include <xen/iocap.h>
101 #include <xen/guest_access.h>
102 #include <asm/paging.h>
103 #include <asm/shadow.h>
104 #include <asm/page.h>
105 #include <asm/flushtlb.h>
106 #include <asm/io.h>
107 #include <asm/ldt.h>
108 #include <asm/x86_emulate.h>
109 #include <asm/e820.h>
110 #include <asm/hypercall.h>
111 #include <asm/shared.h>
112 #include <public/memory.h>
114 #define MEM_LOG(_f, _a...) gdprintk(XENLOG_WARNING , _f "\n" , ## _a)
116 /*
117 * PTE updates can be done with ordinary writes except:
118 * 1. Debug builds get extra checking by using CMPXCHG[8B].
119 * 2. PAE builds perform an atomic 8-byte store with CMPXCHG8B.
120 */
121 #if !defined(NDEBUG) || defined(CONFIG_X86_PAE)
122 #define PTE_UPDATE_WITH_CMPXCHG
123 #endif
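/*
 * Why CMPXCHG here: a PAE PTE is 8 bytes wide on a 32-bit build, so it must
 * be updated with a single atomic 8-byte operation or the MMU could observe
 * a half-written entry; the compare-and-swap form also detects concurrent
 * hardware updates to the Accessed/Dirty bits (see update_intpte() below).
 * A minimal user-space sketch of that retry pattern, using GCC's __sync
 * builtin rather than Xen's helpers; illustrative only.
 */
#if 0 /* illustrative only */
#include <stdint.h>

/* Swap in 'newval', retrying if the entry changed under us (e.g. A/D bits).
 * Returns the value that was actually replaced. */
static uint64_t set_pte_atomically(volatile uint64_t *pte, uint64_t newval)
{
    uint64_t old = *pte;
    for ( ; ; )
    {
        uint64_t seen = __sync_val_compare_and_swap(pte, old, newval);
        if ( seen == old )
            return old;
        old = seen;
    }
}
#endif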
125 /* Used to defer flushing of memory structures. */
126 struct percpu_mm_info {
127 #define DOP_FLUSH_TLB (1<<0) /* Flush the local TLB. */
128 #define DOP_FLUSH_ALL_TLBS (1<<1) /* Flush TLBs of all VCPUs of current dom. */
129 #define DOP_RELOAD_LDT (1<<2) /* Reload the LDT shadow mapping. */
130 unsigned int deferred_ops;
131 /* If non-NULL, specifies a foreign subject domain for some operations. */
132 struct domain *foreign;
133 };
134 static DEFINE_PER_CPU(struct percpu_mm_info, percpu_mm_info);
136 /*
137 * Returns the current foreign domain; defaults to the currently-executing
138 * domain if a foreign override hasn't been specified.
139 */
140 #define FOREIGNDOM (this_cpu(percpu_mm_info).foreign ?: current->domain)
142 /* Private domain structs for DOMID_XEN and DOMID_IO. */
143 static struct domain *dom_xen, *dom_io;
145 /* Frame table and its size in pages. */
146 struct page_info *frame_table;
147 unsigned long max_page;
148 unsigned long total_pages;
150 #ifdef CONFIG_COMPAT
151 l2_pgentry_t *compat_idle_pg_table_l2 = NULL;
152 #define l3_disallow_mask(d) (!is_pv_32on64_domain(d) ? \
153 L3_DISALLOW_MASK : \
154 COMPAT_L3_DISALLOW_MASK)
155 #else
156 #define l3_disallow_mask(d) L3_DISALLOW_MASK
157 #endif
159 static void queue_deferred_ops(struct domain *d, unsigned int ops)
160 {
161 ASSERT(d == current->domain);
162 this_cpu(percpu_mm_info).deferred_ops |= ops;
163 }
165 void __init init_frametable(void)
166 {
167 unsigned long nr_pages, page_step, i, mfn;
169 frame_table = (struct page_info *)FRAMETABLE_VIRT_START;
171 nr_pages = PFN_UP(max_page * sizeof(*frame_table));
172 page_step = (1 << L2_PAGETABLE_SHIFT) >> PAGE_SHIFT;
174 for ( i = 0; i < nr_pages; i += page_step )
175 {
176 mfn = alloc_boot_pages(min(nr_pages - i, page_step), page_step);
177 if ( mfn == 0 )
178 panic("Not enough memory for frame table\n");
179 map_pages_to_xen(
180 FRAMETABLE_VIRT_START + (i << PAGE_SHIFT),
181 mfn, page_step, PAGE_HYPERVISOR);
182 }
184 memset(frame_table, 0, nr_pages << PAGE_SHIFT);
185 }
187 void __init arch_init_memory(void)
188 {
189 extern void subarch_init_memory(void);
191 unsigned long i, pfn, rstart_pfn, rend_pfn;
193 /*
194 * Initialise our DOMID_XEN domain.
195 * Any Xen-heap pages that we will allow to be mapped will have
196 * their domain field set to dom_xen.
197 */
198 dom_xen = alloc_domain(DOMID_XEN);
199 BUG_ON(dom_xen == NULL);
201 /*
202 * Initialise our DOMID_IO domain.
203 * This domain owns I/O pages that are within the range of the page_info
204 * array. Mappings occur at the privilege level of the caller.
205 */
206 dom_io = alloc_domain(DOMID_IO);
207 BUG_ON(dom_io == NULL);
209 /* First 1MB of RAM is historically marked as I/O. */
210 for ( i = 0; i < 0x100; i++ )
211 share_xen_page_with_guest(mfn_to_page(i), dom_io, XENSHARE_writable);
213 /* Any areas not specified as RAM by the e820 map are considered I/O. */
214 pfn = i = 0;
215 while ( pfn < max_page )
216 {
217 while ( (i < e820.nr_map) && (e820.map[i].type != E820_RAM) )
218 i++;
220 if ( i == e820.nr_map )
221 {
222 /* No more RAM regions: mark as I/O right to end of memory map. */
223 rstart_pfn = rend_pfn = max_page;
224 }
225 else
226 {
227 /* Mark as I/O up to the start of the next RAM region. */
228 rstart_pfn = min_t(unsigned long, max_page,
229 PFN_UP(e820.map[i].addr));
230 rend_pfn = max_t(unsigned long, rstart_pfn,
231 PFN_DOWN(e820.map[i].addr + e820.map[i].size));
232 }
234 /* Mark as I/O up to next RAM region. */
235 for ( ; pfn < rstart_pfn; pfn++ )
236 {
237 BUG_ON(!mfn_valid(pfn));
238 share_xen_page_with_guest(
239 mfn_to_page(pfn), dom_io, XENSHARE_writable);
240 }
242 /* Skip the RAM region. */
243 pfn = rend_pfn;
i++; /* Advance past the e820 entry just consumed, so the scan terminates. */
244 }
246 subarch_init_memory();
247 }
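/*
 * Self-contained sketch of the I/O-region scan performed above (the subject
 * of this changeset): every page below max_page that no RAM entry covers is
 * handed to dom_io. The toy map, page size and printing below are
 * illustrative only and are not part of this file's build.
 */
#if 0 /* illustrative only */
#include <stdint.h>
#include <stdio.h>

#define TOY_PAGE_SIZE 4096ULL

struct toy_e820 { uint64_t addr, size; int is_ram; };

static void mark_io_ranges(const struct toy_e820 *map, int nr,
                           uint64_t max_page)
{
    uint64_t pfn = 0, rstart, rend;
    int i = 0;

    while ( pfn < max_page )
    {
        while ( (i < nr) && !map[i].is_ram )
            i++;
        if ( i == nr )
            rstart = rend = max_page;   /* no more RAM: I/O to end of map */
        else
        {
            rstart = (map[i].addr + TOY_PAGE_SIZE - 1) / TOY_PAGE_SIZE;
            rend   = (map[i].addr + map[i].size) / TOY_PAGE_SIZE;
            if ( rstart > max_page ) rstart = max_page;
            if ( rend < rstart )     rend = rstart;
        }
        if ( pfn < rstart )
            printf("I/O pfns %#llx-%#llx\n", (unsigned long long)pfn,
                   (unsigned long long)(rstart - 1));
        pfn = rend;   /* skip the RAM region... */
        i++;          /* ...and the e820 entry that described it */
    }
}

int main(void)
{
    /* RAM below 640K, a hole, then RAM from 1MB to 64MB. */
    const struct toy_e820 map[] = {
        { 0x00000000ULL, 0x0009f000ULL, 1 },
        { 0x0009f000ULL, 0x00061000ULL, 0 },
        { 0x00100000ULL, 0x03f00000ULL, 1 },
    };
    mark_io_ranges(map, 3, 0x04000000ULL / TOY_PAGE_SIZE);
    return 0;
}
#endif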
249 int memory_is_conventional_ram(paddr_t p)
250 {
251 int i;
253 for ( i = 0; i < e820.nr_map; i++ )
254 {
255 if ( (e820.map[i].type == E820_RAM) &&
256 (e820.map[i].addr <= p) &&
257 ((e820.map[i].addr + e820.map[i].size) > p) )
258 return 1;
259 }
261 return 0;
262 }
264 unsigned long domain_get_maximum_gpfn(struct domain *d)
265 {
266 if ( is_hvm_domain(d) )
267 return d->arch.p2m.max_mapped_pfn;
268 /* NB. PV guests specify nr_pfns rather than max_pfn so we adjust here. */
269 return arch_get_max_pfn(d) - 1;
270 }
272 void share_xen_page_with_guest(
273 struct page_info *page, struct domain *d, int readonly)
274 {
275 if ( page_get_owner(page) == d )
276 return;
278 set_gpfn_from_mfn(page_to_mfn(page), INVALID_M2P_ENTRY);
280 spin_lock(&d->page_alloc_lock);
282 /* The incremented type count pins as writable or read-only. */
283 page->u.inuse.type_info = (readonly ? PGT_none : PGT_writable_page);
284 page->u.inuse.type_info |= PGT_validated | 1;
286 page_set_owner(page, d);
287 wmb(); /* install valid domain ptr before updating refcnt. */
288 ASSERT(page->count_info == 0);
290 /* Only add to the allocation list if the domain isn't dying. */
291 if ( !d->is_dying )
292 {
293 page->count_info |= PGC_allocated | 1;
294 if ( unlikely(d->xenheap_pages++ == 0) )
295 get_knownalive_domain(d);
296 list_add_tail(&page->list, &d->xenpage_list);
297 }
299 spin_unlock(&d->page_alloc_lock);
300 }
302 void share_xen_page_with_privileged_guests(
303 struct page_info *page, int readonly)
304 {
305 share_xen_page_with_guest(page, dom_xen, readonly);
306 }
308 #if defined(CONFIG_X86_PAE)
310 #ifdef NDEBUG
311 /* Only PDPTs above 4GB boundary need to be shadowed in low memory. */
312 #define l3tab_needs_shadow(mfn) ((mfn) >= 0x100000)
313 #else
314 /*
315 * In debug builds we shadow a selection of <4GB PDPTs to exercise code paths.
316 * We cannot safely shadow the idle page table, nor shadow (v1) page tables
317 * (detected by lack of an owning domain). As required for correctness, we
318 * always shadow PDPTs above 4GB.
319 */
320 #define l3tab_needs_shadow(mfn) \
321 (((((mfn) << PAGE_SHIFT) != __pa(idle_pg_table)) && \
322 (page_get_owner(mfn_to_page(mfn)) != NULL) && \
323 ((mfn) & 1)) || /* odd MFNs are shadowed */ \
324 ((mfn) >= 0x100000))
325 #endif
327 static l1_pgentry_t *fix_pae_highmem_pl1e;
329 /* Cache the address of PAE high-memory fixmap page tables. */
330 static int __init cache_pae_fixmap_address(void)
331 {
332 unsigned long fixmap_base = fix_to_virt(FIX_PAE_HIGHMEM_0);
333 l2_pgentry_t *pl2e = virt_to_xen_l2e(fixmap_base);
334 fix_pae_highmem_pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(fixmap_base);
335 return 0;
336 }
337 __initcall(cache_pae_fixmap_address);
339 static DEFINE_PER_CPU(u32, make_cr3_timestamp);
341 void make_cr3(struct vcpu *v, unsigned long mfn)
342 /* Takes the MFN of a PAE l3 table, copies the contents to below 4GB if
343 * necessary, and sets v->arch.cr3 to the value to load in CR3. */
344 {
345 l3_pgentry_t *highmem_l3tab, *lowmem_l3tab;
346 struct pae_l3_cache *cache = &v->arch.pae_l3_cache;
347 unsigned int cpu = smp_processor_id();
349 /* Fast path: does this mfn need a shadow at all? */
350 if ( !l3tab_needs_shadow(mfn) )
351 {
352 v->arch.cr3 = mfn << PAGE_SHIFT;
353 /* Cache is no longer in use or valid */
354 cache->high_mfn = 0;
355 return;
356 }
358 /* Caching logic is not interrupt safe. */
359 ASSERT(!in_irq());
361 /* Protects against pae_flush_pgd(). */
362 spin_lock(&cache->lock);
364 cache->inuse_idx ^= 1;
365 cache->high_mfn = mfn;
367 /* Map the guest L3 table and copy to the chosen low-memory cache. */
368 l1e_write(fix_pae_highmem_pl1e-cpu, l1e_from_pfn(mfn, __PAGE_HYPERVISOR));
369 /* First check the previous high mapping can't be in the TLB.
370 * (i.e. have we loaded CR3 since we last did this?) */
371 if ( unlikely(this_cpu(make_cr3_timestamp) == this_cpu(tlbflush_time)) )
372 local_flush_tlb_one(fix_to_virt(FIX_PAE_HIGHMEM_0 + cpu));
373 highmem_l3tab = (l3_pgentry_t *)fix_to_virt(FIX_PAE_HIGHMEM_0 + cpu);
374 lowmem_l3tab = cache->table[cache->inuse_idx];
375 memcpy(lowmem_l3tab, highmem_l3tab, sizeof(cache->table[0]));
376 l1e_write(fix_pae_highmem_pl1e-cpu, l1e_empty());
377 this_cpu(make_cr3_timestamp) = this_cpu(tlbflush_time);
379 v->arch.cr3 = __pa(lowmem_l3tab);
381 spin_unlock(&cache->lock);
382 }
384 #else /* !CONFIG_X86_PAE */
386 void make_cr3(struct vcpu *v, unsigned long mfn)
387 {
388 v->arch.cr3 = mfn << PAGE_SHIFT;
389 }
391 #endif /* !CONFIG_X86_PAE */
393 void write_ptbase(struct vcpu *v)
394 {
395 write_cr3(v->arch.cr3);
396 }
398 /* Should be called after CR3 is updated.
399 * Updates vcpu->arch.cr3 and, for HVM guests, vcpu->arch.hvm_vcpu.cpu_cr3.
400 *
401 * Uses values found in vcpu->arch.(guest_table and guest_table_user), and
402 * for HVM guests, arch.monitor_table and hvm's guest CR3.
403 *
404 * Update ref counts to shadow tables appropriately.
405 */
406 void update_cr3(struct vcpu *v)
407 {
408 unsigned long cr3_mfn=0;
410 if ( paging_mode_enabled(v->domain) )
411 {
412 paging_update_cr3(v);
413 return;
414 }
416 #if CONFIG_PAGING_LEVELS == 4
417 if ( !(v->arch.flags & TF_kernel_mode) )
418 cr3_mfn = pagetable_get_pfn(v->arch.guest_table_user);
419 else
420 #endif
421 cr3_mfn = pagetable_get_pfn(v->arch.guest_table);
423 make_cr3(v, cr3_mfn);
424 }
427 static void invalidate_shadow_ldt(struct vcpu *v)
428 {
429 int i;
430 unsigned long pfn;
431 struct page_info *page;
433 if ( v->arch.shadow_ldt_mapcnt == 0 )
434 return;
436 v->arch.shadow_ldt_mapcnt = 0;
438 for ( i = 16; i < 32; i++ )
439 {
440 pfn = l1e_get_pfn(v->arch.perdomain_ptes[i]);
441 if ( pfn == 0 ) continue;
442 l1e_write(&v->arch.perdomain_ptes[i], l1e_empty());
443 page = mfn_to_page(pfn);
444 ASSERT_PAGE_IS_TYPE(page, PGT_ldt_page);
445 ASSERT_PAGE_IS_DOMAIN(page, v->domain);
446 put_page_and_type(page);
447 }
449 /* Dispose of the (now possibly invalid) mappings from the TLB. */
450 if ( v == current )
451 queue_deferred_ops(v->domain, DOP_FLUSH_TLB | DOP_RELOAD_LDT);
452 else
453 flush_tlb_mask(v->domain->domain_dirty_cpumask);
454 }
457 static int alloc_segdesc_page(struct page_info *page)
458 {
459 struct desc_struct *descs;
460 int i;
462 descs = map_domain_page(page_to_mfn(page));
464 for ( i = 0; i < 512; i++ )
465 if ( unlikely(!check_descriptor(page_get_owner(page), &descs[i])) )
466 goto fail;
468 unmap_domain_page(descs);
469 return 1;
471 fail:
472 unmap_domain_page(descs);
473 return 0;
474 }
477 /* Map shadow page at offset @off. */
478 int map_ldt_shadow_page(unsigned int off)
479 {
480 struct vcpu *v = current;
481 struct domain *d = v->domain;
482 unsigned long gmfn, mfn;
483 l1_pgentry_t l1e, nl1e;
484 unsigned long gva = v->arch.guest_context.ldt_base + (off << PAGE_SHIFT);
485 int okay;
487 BUG_ON(unlikely(in_irq()));
489 guest_get_eff_kern_l1e(v, gva, &l1e);
490 if ( unlikely(!(l1e_get_flags(l1e) & _PAGE_PRESENT)) )
491 return 0;
493 gmfn = l1e_get_pfn(l1e);
494 mfn = gmfn_to_mfn(d, gmfn);
495 if ( unlikely(!mfn_valid(mfn)) )
496 return 0;
498 okay = get_page_and_type(mfn_to_page(mfn), d, PGT_ldt_page);
499 if ( unlikely(!okay) )
500 return 0;
502 nl1e = l1e_from_pfn(mfn, l1e_get_flags(l1e) | _PAGE_RW);
504 l1e_write(&v->arch.perdomain_ptes[off + 16], nl1e);
505 v->arch.shadow_ldt_mapcnt++;
507 return 1;
508 }
511 static int get_page_from_pagenr(unsigned long page_nr, struct domain *d)
512 {
513 struct page_info *page = mfn_to_page(page_nr);
515 if ( unlikely(!mfn_valid(page_nr)) || unlikely(!get_page(page, d)) )
516 {
517 MEM_LOG("Could not get page ref for pfn %lx", page_nr);
518 return 0;
519 }
521 return 1;
522 }
525 static int get_page_and_type_from_pagenr(unsigned long page_nr,
526 unsigned long type,
527 struct domain *d)
528 {
529 struct page_info *page = mfn_to_page(page_nr);
531 if ( unlikely(!get_page_from_pagenr(page_nr, d)) )
532 return 0;
534 if ( unlikely(!get_page_type(page, type)) )
535 {
536 put_page(page);
537 return 0;
538 }
540 return 1;
541 }
543 /*
544 * We allow root tables to map each other (a.k.a. linear page tables). It
545 * needs some special care with reference counts and access permissions:
546 * 1. The mapping entry must be read-only, or the guest may get write access
547 * to its own PTEs.
548 * 2. We must only bump the reference counts for an *already validated*
549 * L2 table, or we can end up in a deadlock in get_page_type() by waiting
550 * on a validation that is required to complete that validation.
551 * 3. We only need to increment the reference counts for the mapped page
552 * frame if it is mapped by a different root table. This is sufficient and
553 * also necessary to allow validation of a root table mapping itself.
554 */
555 #define define_get_linear_pagetable(level) \
556 static int \
557 get_##level##_linear_pagetable( \
558 level##_pgentry_t pde, unsigned long pde_pfn, struct domain *d) \
559 { \
560 unsigned long x, y; \
561 struct page_info *page; \
562 unsigned long pfn; \
563 \
564 if ( (level##e_get_flags(pde) & _PAGE_RW) ) \
565 { \
566 MEM_LOG("Attempt to create linear p.t. with write perms"); \
567 return 0; \
568 } \
569 \
570 if ( (pfn = level##e_get_pfn(pde)) != pde_pfn ) \
571 { \
572 /* Make sure the mapped frame belongs to the correct domain. */ \
573 if ( unlikely(!get_page_from_pagenr(pfn, d)) ) \
574 return 0; \
575 \
576 /* \
577 * Ensure that the mapped frame is an already-validated page table. \
578 * If so, atomically increment the count (checking for overflow). \
579 */ \
580 page = mfn_to_page(pfn); \
581 y = page->u.inuse.type_info; \
582 do { \
583 x = y; \
584 if ( unlikely((x & PGT_count_mask) == PGT_count_mask) || \
585 unlikely((x & (PGT_type_mask|PGT_validated)) != \
586 (PGT_##level##_page_table|PGT_validated)) ) \
587 { \
588 put_page(page); \
589 return 0; \
590 } \
591 } \
592 while ( (y = cmpxchg(&page->u.inuse.type_info, x, x + 1)) != x ); \
593 } \
594 \
595 return 1; \
596 }
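/*
 * The "linear page table" arrangement accepted above: if root-table slot S
 * maps the root table itself (read-only, per rule 1), every leaf PTE becomes
 * readable through a fixed virtual window. A sketch of the address
 * arithmetic for classic 2-level 32-bit paging; the slot number and macro
 * names are made up for illustration and are not Xen's.
 */
#if 0 /* illustrative only */
#include <stdint.h>

#define SELF_MAP_SLOT   0x300u                           /* hypothetical L2 slot */
#define LINEAR_PT_BASE  ((uint32_t)SELF_MAP_SLOT << 22)  /* 4MB window */

/* Virtual address at which the PTE that maps 'va' can itself be read. */
static inline uint32_t linear_pte_vaddr(uint32_t va)
{
    return LINEAR_PT_BASE + (va >> 12) * (uint32_t)sizeof(uint32_t);
}
#endif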
598 int
599 get_page_from_l1e(
600 l1_pgentry_t l1e, struct domain *d)
601 {
602 unsigned long mfn = l1e_get_pfn(l1e);
603 struct page_info *page = mfn_to_page(mfn);
604 int okay;
606 if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) )
607 return 1;
609 if ( unlikely(l1e_get_flags(l1e) & L1_DISALLOW_MASK) )
610 {
611 MEM_LOG("Bad L1 flags %x", l1e_get_flags(l1e) & L1_DISALLOW_MASK);
612 return 0;
613 }
615 if ( unlikely(!mfn_valid(mfn)) ||
616 unlikely(page_get_owner(page) == dom_io) )
617 {
618 /* DOMID_IO reverts to caller for privilege checks. */
619 if ( d == dom_io )
620 d = current->domain;
622 if ( !iomem_access_permitted(d, mfn, mfn) )
623 {
624 if ( mfn != (PADDR_MASK >> PAGE_SHIFT) ) /* INVALID_MFN? */
625 MEM_LOG("Non-privileged (%u) attempt to map I/O space %08lx",
626 d->domain_id, mfn);
627 return 0;
628 }
630 /* No reference counting for out-of-range I/O pages. */
631 if ( !mfn_valid(mfn) )
632 return 1;
634 d = dom_io;
635 }
637 /* Foreign mappings into guests in shadow external mode don't
638 * contribute to writeable mapping refcounts. (This allows the
639 * qemu-dm helper process in dom0 to map the domain's memory without
640 * messing up the count of "real" writable mappings.) */
641 okay = (((l1e_get_flags(l1e) & _PAGE_RW) &&
642 !(unlikely(paging_mode_external(d) && (d != current->domain))))
643 ? get_page_and_type(page, d, PGT_writable_page)
644 : get_page(page, d));
645 if ( !okay )
646 {
647 MEM_LOG("Error getting mfn %lx (pfn %lx) from L1 entry %" PRIpte
648 " for dom%d",
649 mfn, get_gpfn_from_mfn(mfn),
650 l1e_get_intpte(l1e), d->domain_id);
651 }
653 return okay;
654 }
657 /* NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. */
658 define_get_linear_pagetable(l2);
659 static int
660 get_page_from_l2e(
661 l2_pgentry_t l2e, unsigned long pfn, struct domain *d)
662 {
663 int rc;
665 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
666 return 1;
668 if ( unlikely((l2e_get_flags(l2e) & L2_DISALLOW_MASK)) )
669 {
670 MEM_LOG("Bad L2 flags %x", l2e_get_flags(l2e) & L2_DISALLOW_MASK);
671 return 0;
672 }
674 rc = get_page_and_type_from_pagenr(l2e_get_pfn(l2e), PGT_l1_page_table, d);
675 if ( unlikely(!rc) )
676 rc = get_l2_linear_pagetable(l2e, pfn, d);
678 return rc;
679 }
682 #if CONFIG_PAGING_LEVELS >= 3
683 define_get_linear_pagetable(l3);
684 static int
685 get_page_from_l3e(
686 l3_pgentry_t l3e, unsigned long pfn, struct domain *d)
687 {
688 int rc;
690 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
691 return 1;
693 if ( unlikely((l3e_get_flags(l3e) & l3_disallow_mask(d))) )
694 {
695 MEM_LOG("Bad L3 flags %x", l3e_get_flags(l3e) & l3_disallow_mask(d));
696 return 0;
697 }
699 rc = get_page_and_type_from_pagenr(l3e_get_pfn(l3e), PGT_l2_page_table, d);
700 if ( unlikely(!rc) )
701 rc = get_l3_linear_pagetable(l3e, pfn, d);
703 return rc;
704 }
705 #endif /* 3 level */
707 #if CONFIG_PAGING_LEVELS >= 4
708 define_get_linear_pagetable(l4);
709 static int
710 get_page_from_l4e(
711 l4_pgentry_t l4e, unsigned long pfn, struct domain *d)
712 {
713 int rc;
715 if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
716 return 1;
718 if ( unlikely((l4e_get_flags(l4e) & L4_DISALLOW_MASK)) )
719 {
720 MEM_LOG("Bad L4 flags %x", l4e_get_flags(l4e) & L4_DISALLOW_MASK);
721 return 0;
722 }
724 rc = get_page_and_type_from_pagenr(l4e_get_pfn(l4e), PGT_l3_page_table, d);
725 if ( unlikely(!rc) )
726 rc = get_l4_linear_pagetable(l4e, pfn, d);
728 return rc;
729 }
730 #endif /* 4 level */
732 #ifdef __x86_64__
734 #ifdef USER_MAPPINGS_ARE_GLOBAL
735 #define adjust_guest_l1e(pl1e, d) \
736 do { \
737 if ( likely(l1e_get_flags((pl1e)) & _PAGE_PRESENT) && \
738 likely(!is_pv_32on64_domain(d)) ) \
739 { \
740 /* _PAGE_GUEST_KERNEL page cannot have the Global bit set. */ \
741 if ( (l1e_get_flags((pl1e)) & (_PAGE_GUEST_KERNEL|_PAGE_GLOBAL)) \
742 == (_PAGE_GUEST_KERNEL|_PAGE_GLOBAL) ) \
743 MEM_LOG("Global bit is set to kernel page %lx", \
744 l1e_get_pfn((pl1e))); \
745 if ( !(l1e_get_flags((pl1e)) & _PAGE_USER) ) \
746 l1e_add_flags((pl1e), (_PAGE_GUEST_KERNEL|_PAGE_USER)); \
747 if ( !(l1e_get_flags((pl1e)) & _PAGE_GUEST_KERNEL) ) \
748 l1e_add_flags((pl1e), (_PAGE_GLOBAL|_PAGE_USER)); \
749 } \
750 } while ( 0 )
751 #else
752 #define adjust_guest_l1e(pl1e, d) \
753 do { \
754 if ( likely(l1e_get_flags((pl1e)) & _PAGE_PRESENT) && \
755 likely(!is_pv_32on64_domain(d)) ) \
756 l1e_add_flags((pl1e), _PAGE_USER); \
757 } while ( 0 )
758 #endif
760 #define adjust_guest_l2e(pl2e, d) \
761 do { \
762 if ( likely(l2e_get_flags((pl2e)) & _PAGE_PRESENT) && \
763 likely(!is_pv_32on64_domain(d)) ) \
764 l2e_add_flags((pl2e), _PAGE_USER); \
765 } while ( 0 )
767 #define adjust_guest_l3e(pl3e, d) \
768 do { \
769 if ( likely(l3e_get_flags((pl3e)) & _PAGE_PRESENT) ) \
770 l3e_add_flags((pl3e), likely(!is_pv_32on64_domain(d)) ? \
771 _PAGE_USER : \
772 _PAGE_USER|_PAGE_RW); \
773 } while ( 0 )
775 #define adjust_guest_l4e(pl4e, d) \
776 do { \
777 if ( likely(l4e_get_flags((pl4e)) & _PAGE_PRESENT) && \
778 likely(!is_pv_32on64_domain(d)) ) \
779 l4e_add_flags((pl4e), _PAGE_USER); \
780 } while ( 0 )
782 #else /* !defined(__x86_64__) */
784 #define adjust_guest_l1e(_p, _d) ((void)(_d))
785 #define adjust_guest_l2e(_p, _d) ((void)(_d))
786 #define adjust_guest_l3e(_p, _d) ((void)(_d))
788 #endif
790 #ifdef CONFIG_COMPAT
791 #define unadjust_guest_l3e(pl3e, d) \
792 do { \
793 if ( unlikely(is_pv_32on64_domain(d)) && \
794 likely(l3e_get_flags((pl3e)) & _PAGE_PRESENT) ) \
795 l3e_remove_flags((pl3e), _PAGE_USER|_PAGE_RW|_PAGE_ACCESSED); \
796 } while ( 0 )
797 #else
798 #define unadjust_guest_l3e(_p, _d) ((void)(_d))
799 #endif
801 void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d)
802 {
803 unsigned long pfn = l1e_get_pfn(l1e);
804 struct page_info *page = mfn_to_page(pfn);
805 struct domain *e;
806 struct vcpu *v;
808 if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) || !mfn_valid(pfn) )
809 return;
811 e = page_get_owner(page);
813 /*
814 * Check if this is a mapping that was established via a grant reference.
815 * If it was then we should not be here: we require that such mappings are
816 * explicitly destroyed via the grant-table interface.
817 *
818 * The upshot of this is that the guest can end up with active grants that
819 * it cannot destroy (because it no longer has a PTE to present to the
820 * grant-table interface). This can lead to subtle hard-to-catch bugs,
821 * hence a special grant PTE flag can be enabled to catch the bug early.
822 *
823 * (Note that the undestroyable active grants are not a security hole in
824 * Xen. All active grants can safely be cleaned up when the domain dies.)
825 */
826 if ( (l1e_get_flags(l1e) & _PAGE_GNTTAB) &&
827 !d->is_shutting_down && !d->is_dying )
828 {
829 MEM_LOG("Attempt to implicitly unmap a granted PTE %" PRIpte,
830 l1e_get_intpte(l1e));
831 domain_crash(d);
832 }
834 /* Remember we didn't take a type-count of foreign writable mappings
835 * to paging-external domains */
836 if ( (l1e_get_flags(l1e) & _PAGE_RW) &&
837 !(unlikely((e != d) && paging_mode_external(e))) )
838 {
839 put_page_and_type(page);
840 }
841 else
842 {
843 /* We expect this is rare so we blow the entire shadow LDT. */
844 if ( unlikely(((page->u.inuse.type_info & PGT_type_mask) ==
845 PGT_ldt_page)) &&
846 unlikely(((page->u.inuse.type_info & PGT_count_mask) != 0)) &&
847 (d == e) )
848 {
849 for_each_vcpu ( d, v )
850 invalidate_shadow_ldt(v);
851 }
852 put_page(page);
853 }
854 }
857 /*
858 * NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'.
859 * Note also that this automatically deals correctly with linear p.t.'s.
860 */
861 static void put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
862 {
863 if ( (l2e_get_flags(l2e) & _PAGE_PRESENT) &&
864 (l2e_get_pfn(l2e) != pfn) )
865 put_page_and_type(l2e_get_page(l2e));
866 }
869 #if CONFIG_PAGING_LEVELS >= 3
870 static void put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn)
871 {
872 if ( (l3e_get_flags(l3e) & _PAGE_PRESENT) &&
873 (l3e_get_pfn(l3e) != pfn) )
874 put_page_and_type(l3e_get_page(l3e));
875 }
876 #endif
878 #if CONFIG_PAGING_LEVELS >= 4
879 static void put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn)
880 {
881 if ( (l4e_get_flags(l4e) & _PAGE_PRESENT) &&
882 (l4e_get_pfn(l4e) != pfn) )
883 put_page_and_type(l4e_get_page(l4e));
884 }
885 #endif
887 static int alloc_l1_table(struct page_info *page)
888 {
889 struct domain *d = page_get_owner(page);
890 unsigned long pfn = page_to_mfn(page);
891 l1_pgentry_t *pl1e;
892 int i;
894 pl1e = map_domain_page(pfn);
896 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
897 {
898 if ( is_guest_l1_slot(i) &&
899 unlikely(!get_page_from_l1e(pl1e[i], d)) )
900 goto fail;
902 adjust_guest_l1e(pl1e[i], d);
903 }
905 unmap_domain_page(pl1e);
906 return 1;
908 fail:
909 MEM_LOG("Failure in alloc_l1_table: entry %d", i);
910 while ( i-- > 0 )
911 if ( is_guest_l1_slot(i) )
912 put_page_from_l1e(pl1e[i], d);
914 unmap_domain_page(pl1e);
915 return 0;
916 }
918 #if defined(CONFIG_X86_PAE) || defined(CONFIG_COMPAT)
919 static int create_pae_xen_mappings(struct domain *d, l3_pgentry_t *pl3e)
920 {
921 struct page_info *page;
922 l2_pgentry_t *pl2e;
923 l3_pgentry_t l3e3;
924 #ifndef CONFIG_COMPAT
925 l2_pgentry_t l2e;
926 int i;
927 #endif
929 if ( !is_pv_32bit_domain(d) )
930 return 1;
932 pl3e = (l3_pgentry_t *)((unsigned long)pl3e & PAGE_MASK);
934 /* 3rd L3 slot contains L2 with Xen-private mappings. It *must* exist. */
935 l3e3 = pl3e[3];
936 if ( !(l3e_get_flags(l3e3) & _PAGE_PRESENT) )
937 {
938 MEM_LOG("PAE L3 3rd slot is empty");
939 return 0;
940 }
942 /*
943 * The Xen-private mappings include linear mappings. The L2 thus cannot
944 * be shared by multiple L3 tables. The test here is adequate because:
945 * 1. Cannot appear in slots != 3 because get_page_type() checks the
946 * PGT_pae_xen_l2 flag, which is asserted iff the L2 appears in slot 3
947 * 2. Cannot appear in another page table's L3:
948 * a. alloc_l3_table() calls this function and this check will fail
949 * b. mod_l3_entry() disallows updates to slot 3 in an existing table
950 */
951 page = l3e_get_page(l3e3);
952 BUG_ON(page->u.inuse.type_info & PGT_pinned);
953 BUG_ON((page->u.inuse.type_info & PGT_count_mask) == 0);
954 BUG_ON(!(page->u.inuse.type_info & PGT_pae_xen_l2));
955 if ( (page->u.inuse.type_info & PGT_count_mask) != 1 )
956 {
957 MEM_LOG("PAE L3 3rd slot is shared");
958 return 0;
959 }
961 /* Xen private mappings. */
962 pl2e = map_domain_page(l3e_get_pfn(l3e3));
963 #ifndef CONFIG_COMPAT
964 memcpy(&pl2e[L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1)],
965 &idle_pg_table_l2[L2_PAGETABLE_FIRST_XEN_SLOT],
966 L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
967 for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
968 {
969 l2e = l2e_from_page(
970 virt_to_page(page_get_owner(page)->arch.mm_perdomain_pt) + i,
971 __PAGE_HYPERVISOR);
972 l2e_write(&pl2e[l2_table_offset(PERDOMAIN_VIRT_START) + i], l2e);
973 }
974 for ( i = 0; i < (LINEARPT_MBYTES >> (L2_PAGETABLE_SHIFT - 20)); i++ )
975 {
976 l2e = l2e_empty();
977 if ( l3e_get_flags(pl3e[i]) & _PAGE_PRESENT )
978 l2e = l2e_from_pfn(l3e_get_pfn(pl3e[i]), __PAGE_HYPERVISOR);
979 l2e_write(&pl2e[l2_table_offset(LINEAR_PT_VIRT_START) + i], l2e);
980 }
981 #else
982 memcpy(&pl2e[COMPAT_L2_PAGETABLE_FIRST_XEN_SLOT(d)],
983 &compat_idle_pg_table_l2[
984 l2_table_offset(HIRO_COMPAT_MPT_VIRT_START)],
985 COMPAT_L2_PAGETABLE_XEN_SLOTS(d) * sizeof(*pl2e));
986 #endif
987 unmap_domain_page(pl2e);
989 return 1;
990 }
991 #else
992 # define create_pae_xen_mappings(d, pl3e) (1)
993 #endif
995 #ifdef CONFIG_X86_PAE
996 /* Flush a pgdir update into low-memory caches. */
997 static void pae_flush_pgd(
998 unsigned long mfn, unsigned int idx, l3_pgentry_t nl3e)
999 {
1000 struct domain *d = page_get_owner(mfn_to_page(mfn));
1001 struct vcpu *v;
1002 intpte_t _ol3e, _nl3e, _pl3e;
1003 l3_pgentry_t *l3tab_ptr;
1004 struct pae_l3_cache *cache;
1006 if ( unlikely(shadow_mode_enabled(d)) )
1007 {
1008 cpumask_t m = CPU_MASK_NONE;
1009 /* Re-shadow this l3 table on any vcpus that are using it */
1010 for_each_vcpu ( d, v )
1011 if ( pagetable_get_pfn(v->arch.guest_table) == mfn )
1012 {
1013 paging_update_cr3(v);
1014 cpus_or(m, m, v->vcpu_dirty_cpumask);
1015 }
1016 flush_tlb_mask(m);
1017 }
1019 /* If below 4GB then the pgdir is not shadowed in low memory. */
1020 if ( !l3tab_needs_shadow(mfn) )
1021 return;
1023 for_each_vcpu ( d, v )
1024 {
1025 cache = &v->arch.pae_l3_cache;
1027 spin_lock(&cache->lock);
1029 if ( cache->high_mfn == mfn )
1030 {
1031 l3tab_ptr = &cache->table[cache->inuse_idx][idx];
1032 _ol3e = l3e_get_intpte(*l3tab_ptr);
1033 _nl3e = l3e_get_intpte(nl3e);
1034 _pl3e = cmpxchg(&l3e_get_intpte(*l3tab_ptr), _ol3e, _nl3e);
1035 BUG_ON(_pl3e != _ol3e);
1036 }
1038 spin_unlock(&cache->lock);
1039 }
1041 flush_tlb_mask(d->domain_dirty_cpumask);
1042 }
1043 #else
1044 # define pae_flush_pgd(mfn, idx, nl3e) ((void)0)
1045 #endif
1047 static int alloc_l2_table(struct page_info *page, unsigned long type)
1049 struct domain *d = page_get_owner(page);
1050 unsigned long pfn = page_to_mfn(page);
1051 l2_pgentry_t *pl2e;
1052 int i;
1054 pl2e = map_domain_page(pfn);
1056 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
1058 if ( is_guest_l2_slot(d, type, i) &&
1059 unlikely(!get_page_from_l2e(pl2e[i], pfn, d)) )
1060 goto fail;
1062 adjust_guest_l2e(pl2e[i], d);
1065 #if CONFIG_PAGING_LEVELS == 2
1066 /* Xen private mappings. */
1067 memcpy(&pl2e[L2_PAGETABLE_FIRST_XEN_SLOT],
1068 &idle_pg_table[L2_PAGETABLE_FIRST_XEN_SLOT],
1069 L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
1070 pl2e[l2_table_offset(LINEAR_PT_VIRT_START)] =
1071 l2e_from_pfn(pfn, __PAGE_HYPERVISOR);
1072 for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
1073 pl2e[l2_table_offset(PERDOMAIN_VIRT_START) + i] =
1074 l2e_from_page(
1075 virt_to_page(page_get_owner(page)->arch.mm_perdomain_pt) + i,
1076 __PAGE_HYPERVISOR);
1077 #endif
1079 unmap_domain_page(pl2e);
1080 return 1;
1082 fail:
1083 MEM_LOG("Failure in alloc_l2_table: entry %d", i);
1084 while ( i-- > 0 )
1085 if ( is_guest_l2_slot(d, type, i) )
1086 put_page_from_l2e(pl2e[i], pfn);
1088 unmap_domain_page(pl2e);
1089 return 0;
1093 #if CONFIG_PAGING_LEVELS >= 3
1094 static int alloc_l3_table(struct page_info *page)
1096 struct domain *d = page_get_owner(page);
1097 unsigned long pfn = page_to_mfn(page);
1098 l3_pgentry_t *pl3e;
1099 int i;
1101 #ifdef CONFIG_X86_PAE
1102 /*
1103 * PAE pgdirs above 4GB are unacceptable if the guest does not understand
1104 * the weird 'extended cr3' format for dealing with high-order address
1105 * bits. We cut some slack for control tools (before vcpu0 is initialised).
1106 */
1107 if ( (pfn >= 0x100000) &&
1108 unlikely(!VM_ASSIST(d, VMASST_TYPE_pae_extended_cr3)) &&
1109 d->vcpu[0] && d->vcpu[0]->is_initialised )
1111 MEM_LOG("PAE pgd must be below 4GB (0x%lx >= 0x100000)", pfn);
1112 return 0;
1114 #endif
1116 pl3e = map_domain_page(pfn);
1118 /*
1119 * PAE guests allocate full pages, but aren't required to initialize
1120 * more than the first four entries; when running in compatibility
1121 * mode, however, the full page is visible to the MMU, and hence all
1122 * 512 entries must be valid/verified, which is most easily achieved
1123 * by clearing them out.
1124 */
1125 if ( is_pv_32on64_domain(d) )
1126 memset(pl3e + 4, 0, (L3_PAGETABLE_ENTRIES - 4) * sizeof(*pl3e));
1128 for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
1130 #if defined(CONFIG_X86_PAE) || defined(CONFIG_COMPAT)
1131 if ( is_pv_32bit_domain(d) && (i == 3) )
1133 if ( !(l3e_get_flags(pl3e[i]) & _PAGE_PRESENT) ||
1134 (l3e_get_flags(pl3e[i]) & l3_disallow_mask(d)) ||
1135 !get_page_and_type_from_pagenr(l3e_get_pfn(pl3e[i]),
1136 PGT_l2_page_table |
1137 PGT_pae_xen_l2,
1138 d) )
1139 goto fail;
1141 else
1142 #endif
1143 if ( is_guest_l3_slot(i) &&
1144 unlikely(!get_page_from_l3e(pl3e[i], pfn, d)) )
1145 goto fail;
1147 adjust_guest_l3e(pl3e[i], d);
1150 if ( !create_pae_xen_mappings(d, pl3e) )
1151 goto fail;
1153 unmap_domain_page(pl3e);
1154 return 1;
1156 fail:
1157 MEM_LOG("Failure in alloc_l3_table: entry %d", i);
1158 while ( i-- > 0 )
1159 if ( is_guest_l3_slot(i) )
1160 put_page_from_l3e(pl3e[i], pfn);
1162 unmap_domain_page(pl3e);
1163 return 0;
1165 #else
1166 #define alloc_l3_table(page) (0)
1167 #endif
1169 #if CONFIG_PAGING_LEVELS >= 4
1170 static int alloc_l4_table(struct page_info *page)
1172 struct domain *d = page_get_owner(page);
1173 unsigned long pfn = page_to_mfn(page);
1174 l4_pgentry_t *pl4e = page_to_virt(page);
1175 int i;
1177 for ( i = 0; i < L4_PAGETABLE_ENTRIES; i++ )
1179 if ( is_guest_l4_slot(d, i) &&
1180 unlikely(!get_page_from_l4e(pl4e[i], pfn, d)) )
1181 goto fail;
1183 adjust_guest_l4e(pl4e[i], d);
1186 /* Xen private mappings. */
1187 memcpy(&pl4e[ROOT_PAGETABLE_FIRST_XEN_SLOT],
1188 &idle_pg_table[ROOT_PAGETABLE_FIRST_XEN_SLOT],
1189 ROOT_PAGETABLE_XEN_SLOTS * sizeof(l4_pgentry_t));
1190 pl4e[l4_table_offset(LINEAR_PT_VIRT_START)] =
1191 l4e_from_pfn(pfn, __PAGE_HYPERVISOR);
1192 pl4e[l4_table_offset(PERDOMAIN_VIRT_START)] =
1193 l4e_from_page(virt_to_page(d->arch.mm_perdomain_l3),
1194 __PAGE_HYPERVISOR);
1195 if ( is_pv_32on64_domain(d) )
1196 pl4e[l4_table_offset(COMPAT_ARG_XLAT_VIRT_BASE)] =
1197 l4e_from_page(virt_to_page(d->arch.mm_arg_xlat_l3),
1198 __PAGE_HYPERVISOR);
1200 return 1;
1202 fail:
1203 MEM_LOG("Failure in alloc_l4_table: entry %d", i);
1204 while ( i-- > 0 )
1205 if ( is_guest_l4_slot(d, i) )
1206 put_page_from_l4e(pl4e[i], pfn);
1208 return 0;
1210 #else
1211 #define alloc_l4_table(page) (0)
1212 #endif
1215 static void free_l1_table(struct page_info *page)
1217 struct domain *d = page_get_owner(page);
1218 unsigned long pfn = page_to_mfn(page);
1219 l1_pgentry_t *pl1e;
1220 int i;
1222 pl1e = map_domain_page(pfn);
1224 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
1225 if ( is_guest_l1_slot(i) )
1226 put_page_from_l1e(pl1e[i], d);
1228 unmap_domain_page(pl1e);
1232 static void free_l2_table(struct page_info *page)
1234 #ifdef CONFIG_COMPAT
1235 struct domain *d = page_get_owner(page);
1236 #endif
1237 unsigned long pfn = page_to_mfn(page);
1238 l2_pgentry_t *pl2e;
1239 int i;
1241 pl2e = map_domain_page(pfn);
1243 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
1244 if ( is_guest_l2_slot(d, page->u.inuse.type_info, i) )
1245 put_page_from_l2e(pl2e[i], pfn);
1247 unmap_domain_page(pl2e);
1249 page->u.inuse.type_info &= ~PGT_pae_xen_l2;
1253 #if CONFIG_PAGING_LEVELS >= 3
1255 static void free_l3_table(struct page_info *page)
1257 struct domain *d = page_get_owner(page);
1258 unsigned long pfn = page_to_mfn(page);
1259 l3_pgentry_t *pl3e;
1260 int i;
1262 pl3e = map_domain_page(pfn);
1264 for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
1265 if ( is_guest_l3_slot(i) )
1267 put_page_from_l3e(pl3e[i], pfn);
1268 unadjust_guest_l3e(pl3e[i], d);
1271 unmap_domain_page(pl3e);
1274 #endif
1276 #if CONFIG_PAGING_LEVELS >= 4
1278 static void free_l4_table(struct page_info *page)
1280 struct domain *d = page_get_owner(page);
1281 unsigned long pfn = page_to_mfn(page);
1282 l4_pgentry_t *pl4e = page_to_virt(page);
1283 int i;
1285 for ( i = 0; i < L4_PAGETABLE_ENTRIES; i++ )
1286 if ( is_guest_l4_slot(d, i) )
1287 put_page_from_l4e(pl4e[i], pfn);
1290 #endif
1293 /* How to write an entry to the guest pagetables.
1294 * Returns 0 for failure (pointer not valid), 1 for success. */
1295 static inline int update_intpte(intpte_t *p,
1296 intpte_t old,
1297 intpte_t new,
1298 unsigned long mfn,
1299 struct vcpu *v)
1301 int rv = 1;
1302 #ifndef PTE_UPDATE_WITH_CMPXCHG
1303 rv = paging_write_guest_entry(v, p, new, _mfn(mfn));
1304 #else
1306 intpte_t t = old;
1307 for ( ; ; )
1309 rv = paging_cmpxchg_guest_entry(v, p, &t, new, _mfn(mfn));
1310 if ( unlikely(rv == 0) )
1312 MEM_LOG("Failed to update %" PRIpte " -> %" PRIpte
1313 ": saw %" PRIpte, old, new, t);
1314 break;
1317 if ( t == old )
1318 break;
1320 /* Allowed to change in Accessed/Dirty flags only. */
1321 BUG_ON((t ^ old) & ~(intpte_t)(_PAGE_ACCESSED|_PAGE_DIRTY));
1323 old = t;
1326 #endif
1327 return rv;
1330 /* Macro that wraps the appropriate type-changes around update_intpte().
1331 * Arguments are: type, ptr, old, new, mfn, vcpu */
1332 #define UPDATE_ENTRY(_t,_p,_o,_n,_m,_v) \
1333 update_intpte(&_t ## e_get_intpte(*(_p)), \
1334 _t ## e_get_intpte(_o), _t ## e_get_intpte(_n), \
1335 (_m), (_v))
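/*
 * For reference, UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, mfn, v) expands to
 *   update_intpte(&l1e_get_intpte(*(pl1e)),
 *                 l1e_get_intpte(ol1e), l1e_get_intpte(nl1e), (mfn), (v))
 * so each mod_lN_entry() below funnels through update_intpte(), which either
 * writes the entry directly or takes the cmpxchg path depending on
 * PTE_UPDATE_WITH_CMPXCHG.
 */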
1337 /* Update the L1 entry at pl1e to new value nl1e. */
1338 static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e,
1339 unsigned long gl1mfn)
1341 l1_pgentry_t ol1e;
1342 struct domain *d = current->domain;
1343 unsigned long mfn;
1345 if ( unlikely(__copy_from_user(&ol1e, pl1e, sizeof(ol1e)) != 0) )
1346 return 0;
1348 if ( unlikely(paging_mode_refcounts(d)) )
1349 return UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, current);
1351 if ( l1e_get_flags(nl1e) & _PAGE_PRESENT )
1353 /* Translate foreign guest addresses. */
1354 mfn = gmfn_to_mfn(FOREIGNDOM, l1e_get_pfn(nl1e));
1355 if ( unlikely(mfn == INVALID_MFN) )
1356 return 0;
1357 ASSERT((mfn & ~(PADDR_MASK >> PAGE_SHIFT)) == 0);
1358 nl1e = l1e_from_pfn(mfn, l1e_get_flags(nl1e));
1360 if ( unlikely(l1e_get_flags(nl1e) & L1_DISALLOW_MASK) )
1362 MEM_LOG("Bad L1 flags %x",
1363 l1e_get_flags(nl1e) & L1_DISALLOW_MASK);
1364 return 0;
1367 adjust_guest_l1e(nl1e, d);
1369 /* Fast path for identical mapping, r/w and presence. */
1370 if ( !l1e_has_changed(ol1e, nl1e, _PAGE_RW | _PAGE_PRESENT) )
1371 return UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, current);
1373 if ( unlikely(!get_page_from_l1e(nl1e, FOREIGNDOM)) )
1374 return 0;
1376 if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, current)) )
1378 put_page_from_l1e(nl1e, d);
1379 return 0;
1382 else
1384 if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, current)) )
1385 return 0;
1388 put_page_from_l1e(ol1e, d);
1389 return 1;
1393 /* Update the L2 entry at pl2e to new value nl2e. pl2e is within frame pfn. */
1394 static int mod_l2_entry(l2_pgentry_t *pl2e,
1395 l2_pgentry_t nl2e,
1396 unsigned long pfn,
1397 unsigned long type)
1399 l2_pgentry_t ol2e;
1400 struct domain *d = current->domain;
1402 if ( unlikely(!is_guest_l2_slot(d, type, pgentry_ptr_to_slot(pl2e))) )
1404 MEM_LOG("Illegal L2 update attempt in Xen-private area %p", pl2e);
1405 return 0;
1408 if ( unlikely(__copy_from_user(&ol2e, pl2e, sizeof(ol2e)) != 0) )
1409 return 0;
1411 if ( l2e_get_flags(nl2e) & _PAGE_PRESENT )
1413 if ( unlikely(l2e_get_flags(nl2e) & L2_DISALLOW_MASK) )
1415 MEM_LOG("Bad L2 flags %x",
1416 l2e_get_flags(nl2e) & L2_DISALLOW_MASK);
1417 return 0;
1420 adjust_guest_l2e(nl2e, d);
1422 /* Fast path for identical mapping and presence. */
1423 if ( !l2e_has_changed(ol2e, nl2e, _PAGE_PRESENT))
1424 return UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, current);
1426 if ( unlikely(!get_page_from_l2e(nl2e, pfn, d)) )
1427 return 0;
1429 if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, current)) )
1431 put_page_from_l2e(nl2e, pfn);
1432 return 0;
1435 else if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, current)) )
1437 return 0;
1440 put_page_from_l2e(ol2e, pfn);
1441 return 1;
1444 #if CONFIG_PAGING_LEVELS >= 3
1446 /* Update the L3 entry at pl3e to new value nl3e. pl3e is within frame pfn. */
1447 static int mod_l3_entry(l3_pgentry_t *pl3e,
1448 l3_pgentry_t nl3e,
1449 unsigned long pfn)
1451 l3_pgentry_t ol3e;
1452 struct domain *d = current->domain;
1453 int okay;
1455 if ( unlikely(!is_guest_l3_slot(pgentry_ptr_to_slot(pl3e))) )
1457 MEM_LOG("Illegal L3 update attempt in Xen-private area %p", pl3e);
1458 return 0;
1461 #if defined(CONFIG_X86_PAE) || defined(CONFIG_COMPAT)
1462 /*
1463 * Disallow updates to final L3 slot. It contains Xen mappings, and it
1464 * would be a pain to ensure they remain continuously valid throughout.
1465 */
1466 if ( is_pv_32bit_domain(d) && (pgentry_ptr_to_slot(pl3e) >= 3) )
1467 return 0;
1468 #endif
1470 if ( unlikely(__copy_from_user(&ol3e, pl3e, sizeof(ol3e)) != 0) )
1471 return 0;
1473 if ( l3e_get_flags(nl3e) & _PAGE_PRESENT )
1475 if ( unlikely(l3e_get_flags(nl3e) & l3_disallow_mask(d)) )
1477 MEM_LOG("Bad L3 flags %x",
1478 l3e_get_flags(nl3e) & l3_disallow_mask(d));
1479 return 0;
1482 adjust_guest_l3e(nl3e, d);
1484 /* Fast path for identical mapping and presence. */
1485 if (!l3e_has_changed(ol3e, nl3e, _PAGE_PRESENT))
1486 return UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, current);
1488 if ( unlikely(!get_page_from_l3e(nl3e, pfn, d)) )
1489 return 0;
1491 if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, current)) )
1493 put_page_from_l3e(nl3e, pfn);
1494 return 0;
1497 else if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, current)) )
1499 return 0;
1502 okay = create_pae_xen_mappings(d, pl3e);
1503 BUG_ON(!okay);
1505 pae_flush_pgd(pfn, pgentry_ptr_to_slot(pl3e), nl3e);
1507 put_page_from_l3e(ol3e, pfn);
1508 return 1;
1511 #endif
1513 #if CONFIG_PAGING_LEVELS >= 4
1515 /* Update the L4 entry at pl4e to new value nl4e. pl4e is within frame pfn. */
1516 static int mod_l4_entry(struct domain *d,
1517 l4_pgentry_t *pl4e,
1518 l4_pgentry_t nl4e,
1519 unsigned long pfn)
1521 l4_pgentry_t ol4e;
1523 if ( unlikely(!is_guest_l4_slot(d, pgentry_ptr_to_slot(pl4e))) )
1525 MEM_LOG("Illegal L4 update attempt in Xen-private area %p", pl4e);
1526 return 0;
1529 if ( unlikely(__copy_from_user(&ol4e, pl4e, sizeof(ol4e)) != 0) )
1530 return 0;
1532 if ( l4e_get_flags(nl4e) & _PAGE_PRESENT )
1534 if ( unlikely(l4e_get_flags(nl4e) & L4_DISALLOW_MASK) )
1536 MEM_LOG("Bad L4 flags %x",
1537 l4e_get_flags(nl4e) & L4_DISALLOW_MASK);
1538 return 0;
1541 adjust_guest_l4e(nl4e, current->domain);
1543 /* Fast path for identical mapping and presence. */
1544 if (!l4e_has_changed(ol4e, nl4e, _PAGE_PRESENT))
1545 return UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, current);
1547 if ( unlikely(!get_page_from_l4e(nl4e, pfn, current->domain)) )
1548 return 0;
1550 if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, current)) )
1552 put_page_from_l4e(nl4e, pfn);
1553 return 0;
1556 else if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, current)) )
1558 return 0;
1561 put_page_from_l4e(ol4e, pfn);
1562 return 1;
1565 #endif
1567 int alloc_page_type(struct page_info *page, unsigned long type)
1569 struct domain *owner = page_get_owner(page);
1571 /* A page table is dirtied when its type count becomes non-zero. */
1572 if ( likely(owner != NULL) )
1573 paging_mark_dirty(owner, page_to_mfn(page));
1575 switch ( type & PGT_type_mask )
1577 case PGT_l1_page_table:
1578 return alloc_l1_table(page);
1579 case PGT_l2_page_table:
1580 return alloc_l2_table(page, type);
1581 case PGT_l3_page_table:
1582 return alloc_l3_table(page);
1583 case PGT_l4_page_table:
1584 return alloc_l4_table(page);
1585 case PGT_gdt_page:
1586 case PGT_ldt_page:
1587 return alloc_segdesc_page(page);
1588 default:
1589 printk("Bad type in alloc_page_type %lx t=%" PRtype_info " c=%x\n",
1590 type, page->u.inuse.type_info,
1591 page->count_info);
1592 BUG();
1595 return 0;
1599 void free_page_type(struct page_info *page, unsigned long type)
1601 struct domain *owner = page_get_owner(page);
1602 unsigned long gmfn;
1604 if ( likely(owner != NULL) )
1606 /*
1607 * We have to flush before the next use of the linear mapping
1608 * (e.g., update_va_mapping()) or we could end up modifying a page
1609 * that is no longer a page table (and hence screw up ref counts).
1610 */
1611 if ( current->domain == owner )
1612 queue_deferred_ops(owner, DOP_FLUSH_ALL_TLBS);
1613 else
1614 flush_tlb_mask(owner->domain_dirty_cpumask);
1616 if ( unlikely(paging_mode_enabled(owner)) )
1618 /* A page table is dirtied when its type count becomes zero. */
1619 paging_mark_dirty(owner, page_to_mfn(page));
1621 if ( shadow_mode_refcounts(owner) )
1622 return;
1624 gmfn = mfn_to_gmfn(owner, page_to_mfn(page));
1625 ASSERT(VALID_M2P(gmfn));
1626 shadow_remove_all_shadows(owner->vcpu[0], _mfn(gmfn));
1630 switch ( type & PGT_type_mask )
1632 case PGT_l1_page_table:
1633 free_l1_table(page);
1634 break;
1636 case PGT_l2_page_table:
1637 free_l2_table(page);
1638 break;
1640 #if CONFIG_PAGING_LEVELS >= 3
1641 case PGT_l3_page_table:
1642 free_l3_table(page);
1643 break;
1644 #endif
1646 #if CONFIG_PAGING_LEVELS >= 4
1647 case PGT_l4_page_table:
1648 free_l4_table(page);
1649 break;
1650 #endif
1652 default:
1653 printk("%s: type %lx pfn %lx\n",__FUNCTION__,
1654 type, page_to_mfn(page));
1655 BUG();
1660 void put_page_type(struct page_info *page)
1662 unsigned long nx, x, y = page->u.inuse.type_info;
1664 again:
1665 do {
1666 x = y;
1667 nx = x - 1;
1669 ASSERT((x & PGT_count_mask) != 0);
1671 if ( unlikely((nx & PGT_count_mask) == 0) )
1673 if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
1674 likely(nx & PGT_validated) )
1676 /*
1677 * Page-table pages must be unvalidated when count is zero. The
1678 * 'free' is safe because the refcnt is non-zero and validated
1679 * bit is clear => other ops will spin or fail.
1680 */
1681 if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x,
1682 x & ~PGT_validated)) != x) )
1683 goto again;
1684 /* We cleared the 'valid bit' so we do the clean up. */
1685 free_page_type(page, x);
1686 /* Carry on, but with the 'valid bit' now clear. */
1687 x &= ~PGT_validated;
1688 nx &= ~PGT_validated;
1691 /*
1692 * Record TLB information for flush later. We do not stamp page
1693 * tables when running in shadow mode:
1694 * 1. Pointless, since it's the shadow pt's which must be tracked.
1695 * 2. Shadow mode reuses this field for shadowed page tables to
1696 * store flags info -- we don't want to conflict with that.
1697 */
1698 if ( !(shadow_mode_enabled(page_get_owner(page)) &&
1699 (page->count_info & PGC_page_table)) )
1700 page->tlbflush_timestamp = tlbflush_current_time();
1703 while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
1707 int get_page_type(struct page_info *page, unsigned long type)
1709 unsigned long nx, x, y = page->u.inuse.type_info;
1711 ASSERT(!(type & ~(PGT_type_mask | PGT_pae_xen_l2)));
1713 again:
1714 do {
1715 x = y;
1716 nx = x + 1;
1717 if ( unlikely((nx & PGT_count_mask) == 0) )
1719 MEM_LOG("Type count overflow on pfn %lx", page_to_mfn(page));
1720 return 0;
1722 else if ( unlikely((x & PGT_count_mask) == 0) )
1724 struct domain *d = page_get_owner(page);
1726 /* Never allow a shadowed frame to go from type count 0 to 1 */
1727 if ( d && shadow_mode_enabled(d) )
1728 shadow_remove_all_shadows(d->vcpu[0], _mfn(page_to_mfn(page)));
1730 ASSERT(!(x & PGT_pae_xen_l2));
1731 if ( (x & PGT_type_mask) != type )
1733 /*
1734 * On type change we check to flush stale TLB entries. This
1735 * may be unnecessary (e.g., page was GDT/LDT) but those
1736 * circumstances should be very rare.
1737 */
1738 cpumask_t mask = d->domain_dirty_cpumask;
1740 /* Don't flush if the timestamp is old enough */
1741 tlbflush_filter(mask, page->tlbflush_timestamp);
1743 if ( unlikely(!cpus_empty(mask)) &&
1744 /* Shadow mode: track only writable pages. */
1745 (!shadow_mode_enabled(page_get_owner(page)) ||
1746 ((nx & PGT_type_mask) == PGT_writable_page)) )
1748 perfc_incr(need_flush_tlb_flush);
1749 flush_tlb_mask(mask);
1752 /* We lose existing type, back pointer, and validity. */
1753 nx &= ~(PGT_type_mask | PGT_validated);
1754 nx |= type;
1756 /* No special validation needed for writable pages. */
1757 /* Page tables and GDT/LDT need to be scanned for validity. */
1758 if ( type == PGT_writable_page )
1759 nx |= PGT_validated;
1762 else if ( unlikely((x & (PGT_type_mask|PGT_pae_xen_l2)) != type) )
1764 if ( ((x & PGT_type_mask) != PGT_l2_page_table) ||
1765 (type != PGT_l1_page_table) )
1766 MEM_LOG("Bad type (saw %" PRtype_info
1767 " != exp %" PRtype_info ") "
1768 "for mfn %lx (pfn %lx)",
1769 x, type, page_to_mfn(page),
1770 get_gpfn_from_mfn(page_to_mfn(page)));
1771 return 0;
1773 else if ( unlikely(!(x & PGT_validated)) )
1775 /* Someone else is updating validation of this page. Wait... */
1776 while ( (y = page->u.inuse.type_info) == x )
1777 cpu_relax();
1778 goto again;
1781 while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
1783 if ( unlikely(!(nx & PGT_validated)) )
1785 /* Try to validate page type; drop the new reference on failure. */
1786 if ( unlikely(!alloc_page_type(page, type)) )
1788 MEM_LOG("Error while validating mfn %lx (pfn %lx) for type %"
1789 PRtype_info ": caf=%08x taf=%" PRtype_info,
1790 page_to_mfn(page), get_gpfn_from_mfn(page_to_mfn(page)),
1791 type, page->count_info, page->u.inuse.type_info);
1792 /* No one else can get a reference. We hold the only ref. */
1793 page->u.inuse.type_info = 0;
1794 return 0;
1797 /* No one else is updating simultaneously. */
1798 __set_bit(_PGT_validated, &page->u.inuse.type_info);
1801 return 1;
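/*
 * get_page_type()/put_page_type() above implement a lock-free state machine
 * on type_info: a use count in the low bits, with the current type and
 * status flags above it. A stripped-down sketch of the same compare-and-swap
 * pattern follows; the field layout is invented for illustration (it is not
 * Xen's PGT_* encoding) and the validation/wait logic is omitted.
 */
#if 0 /* illustrative only */
#include <stdint.h>

#define CNT_MASK   0x0000ffffu   /* reference count */
#define TYPE_MASK  0x00ff0000u   /* current type */

static int get_type_ref(volatile uint32_t *ti, uint32_t type)
{
    uint32_t x, nx, y = *ti;
    do {
        x = y;
        nx = x + 1;
        if ( (nx & CNT_MASK) == 0 )
            return 0;                              /* count would overflow */
        if ( (x & CNT_MASK) == 0 )
            nx = (nx & ~TYPE_MASK) | type;         /* unused: retype freely */
        else if ( (x & TYPE_MASK) != type )
            return 0;                              /* busy as another type */
    } while ( (y = __sync_val_compare_and_swap(ti, x, nx)) != x );
    return 1;
}

static void put_type_ref(volatile uint32_t *ti)
{
    __sync_fetch_and_sub(ti, 1);                   /* drop one reference */
}
#endif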
1805 int new_guest_cr3(unsigned long mfn)
1807 struct vcpu *v = current;
1808 struct domain *d = v->domain;
1809 int okay;
1810 unsigned long old_base_mfn;
1812 #ifdef CONFIG_COMPAT
1813 if ( is_pv_32on64_domain(d) )
1815 okay = paging_mode_refcounts(d)
1816 ? 0 /* Old code was broken, but what should it be? */
1817 : mod_l4_entry(
1818 d,
1819 __va(pagetable_get_paddr(v->arch.guest_table)),
1820 l4e_from_pfn(
1821 mfn,
1822 (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_ACCESSED)),
1823 pagetable_get_pfn(v->arch.guest_table));
1824 if ( unlikely(!okay) )
1826 MEM_LOG("Error while installing new compat baseptr %lx", mfn);
1827 return 0;
1830 invalidate_shadow_ldt(v);
1831 write_ptbase(v);
1833 return 1;
1835 #endif
1836 okay = paging_mode_refcounts(d)
1837 ? get_page_from_pagenr(mfn, d)
1838 : get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d);
1839 if ( unlikely(!okay) )
1841 MEM_LOG("Error while installing new baseptr %lx", mfn);
1842 return 0;
1845 invalidate_shadow_ldt(v);
1847 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
1849 v->arch.guest_table = pagetable_from_pfn(mfn);
1850 update_cr3(v);
1852 write_ptbase(v);
1854 if ( likely(old_base_mfn != 0) )
1856 if ( paging_mode_refcounts(d) )
1857 put_page(mfn_to_page(old_base_mfn));
1858 else
1859 put_page_and_type(mfn_to_page(old_base_mfn));
1862 return 1;
1865 static void process_deferred_ops(void)
1867 unsigned int deferred_ops;
1868 struct domain *d = current->domain;
1869 struct percpu_mm_info *info = &this_cpu(percpu_mm_info);
1871 deferred_ops = info->deferred_ops;
1872 info->deferred_ops = 0;
1874 if ( deferred_ops & (DOP_FLUSH_ALL_TLBS|DOP_FLUSH_TLB) )
1876 if ( deferred_ops & DOP_FLUSH_ALL_TLBS )
1877 flush_tlb_mask(d->domain_dirty_cpumask);
1878 else
1879 local_flush_tlb();
1882 if ( deferred_ops & DOP_RELOAD_LDT )
1883 (void)map_ldt_shadow_page(0);
1885 if ( unlikely(info->foreign != NULL) )
1887 rcu_unlock_domain(info->foreign);
1888 info->foreign = NULL;
1892 static int set_foreigndom(domid_t domid)
1894 struct domain *e, *d = current->domain;
1895 struct percpu_mm_info *info = &this_cpu(percpu_mm_info);
1896 int okay = 1;
1898 ASSERT(info->foreign == NULL);
1900 if ( likely(domid == DOMID_SELF) )
1901 goto out;
1903 if ( unlikely(domid == d->domain_id) )
1905 MEM_LOG("Dom %u tried to specify itself as foreign domain",
1906 d->domain_id);
1907 okay = 0;
1909 else if ( unlikely(paging_mode_translate(d)) )
1911 MEM_LOG("Cannot mix foreign mappings with translated domains");
1912 okay = 0;
1914 else if ( !IS_PRIV(d) )
1916 switch ( domid )
1918 case DOMID_IO:
1919 info->foreign = rcu_lock_domain(dom_io);
1920 break;
1921 default:
1922 MEM_LOG("Dom %u cannot set foreign dom", d->domain_id);
1923 okay = 0;
1924 break;
1927 else
1929 info->foreign = e = rcu_lock_domain_by_id(domid);
1930 if ( e == NULL )
1932 switch ( domid )
1934 case DOMID_XEN:
1935 info->foreign = rcu_lock_domain(dom_xen);
1936 break;
1937 case DOMID_IO:
1938 info->foreign = rcu_lock_domain(dom_io);
1939 break;
1940 default:
1941 MEM_LOG("Unknown domain '%u'", domid);
1942 okay = 0;
1943 break;
1948 out:
1949 return okay;
1952 static inline cpumask_t vcpumask_to_pcpumask(
1953 struct domain *d, unsigned long vmask)
1955 unsigned int vcpu_id;
1956 cpumask_t pmask = CPU_MASK_NONE;
1957 struct vcpu *v;
1959 while ( vmask != 0 )
1961 vcpu_id = find_first_set_bit(vmask);
1962 vmask &= ~(1UL << vcpu_id);
1963 if ( (vcpu_id < MAX_VIRT_CPUS) &&
1964 ((v = d->vcpu[vcpu_id]) != NULL) )
1965 cpus_or(pmask, pmask, v->vcpu_dirty_cpumask);
1968 return pmask;
1971 int do_mmuext_op(
1972 XEN_GUEST_HANDLE(mmuext_op_t) uops,
1973 unsigned int count,
1974 XEN_GUEST_HANDLE(uint) pdone,
1975 unsigned int foreigndom)
1977 struct mmuext_op op;
1978 int rc = 0, i = 0, okay;
1979 unsigned long mfn = 0, gmfn = 0, type;
1980 unsigned int done = 0;
1981 struct page_info *page;
1982 struct vcpu *v = current;
1983 struct domain *d = v->domain;
1985 if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
1987 count &= ~MMU_UPDATE_PREEMPTED;
1988 if ( unlikely(!guest_handle_is_null(pdone)) )
1989 (void)copy_from_guest(&done, pdone, 1);
1991 else
1992 perfc_incr(calls_to_mmuext_op);
1994 if ( unlikely(!guest_handle_okay(uops, count)) )
1996 rc = -EFAULT;
1997 goto out;
2000 if ( !set_foreigndom(foreigndom) )
2002 rc = -ESRCH;
2003 goto out;
2006 LOCK_BIGLOCK(d);
2008 for ( i = 0; i < count; i++ )
2010 if ( hypercall_preempt_check() )
2012 rc = hypercall_create_continuation(
2013 __HYPERVISOR_mmuext_op, "hihi",
2014 uops, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
2015 break;
2018 if ( unlikely(__copy_from_guest(&op, uops, 1) != 0) )
2020 MEM_LOG("Bad __copy_from_guest");
2021 rc = -EFAULT;
2022 break;
2025 okay = 1;
2026 gmfn = op.arg1.mfn;
2027 mfn = gmfn_to_mfn(FOREIGNDOM, gmfn);
2028 page = mfn_to_page(mfn);
2030 switch ( op.cmd )
2032 case MMUEXT_PIN_L1_TABLE:
2033 type = PGT_l1_page_table;
2034 goto pin_page;
2036 case MMUEXT_PIN_L2_TABLE:
2037 type = PGT_l2_page_table;
2038 goto pin_page;
2040 case MMUEXT_PIN_L3_TABLE:
2041 type = PGT_l3_page_table;
2042 goto pin_page;
2044 case MMUEXT_PIN_L4_TABLE:
2045 if ( is_pv_32bit_domain(FOREIGNDOM) )
2046 break;
2047 type = PGT_l4_page_table;
2049 pin_page:
2050 /* Ignore pinning of invalid paging levels. */
2051 if ( (op.cmd - MMUEXT_PIN_L1_TABLE) > (CONFIG_PAGING_LEVELS - 1) )
2052 break;
2054 if ( paging_mode_refcounts(FOREIGNDOM) )
2055 break;
2057 okay = get_page_and_type_from_pagenr(mfn, type, FOREIGNDOM);
2058 if ( unlikely(!okay) )
2060 MEM_LOG("Error while pinning mfn %lx", mfn);
2061 break;
2064 if ( unlikely(test_and_set_bit(_PGT_pinned,
2065 &page->u.inuse.type_info)) )
2067 MEM_LOG("Mfn %lx already pinned", mfn);
2068 put_page_and_type(page);
2069 okay = 0;
2070 break;
2073 /* A page is dirtied when its pin status is set. */
2074 paging_mark_dirty(d, mfn);
2076 /* We can race domain destruction (domain_relinquish_resources). */
2077 if ( unlikely(this_cpu(percpu_mm_info).foreign != NULL) )
2079 int drop_ref;
2080 spin_lock(&FOREIGNDOM->page_alloc_lock);
2081 drop_ref = (FOREIGNDOM->is_dying &&
2082 test_and_clear_bit(_PGT_pinned,
2083 &page->u.inuse.type_info));
2084 spin_unlock(&FOREIGNDOM->page_alloc_lock);
2085 if ( drop_ref )
2086 put_page_and_type(page);
2089 break;
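/*
 * Illustrative guest-side usage (not part of this file; 'new_pgd_gmfn'
 * is a placeholder): a PV kernel normally pins a page table before
 * installing it, e.g.
 *
 *     struct mmuext_op op;
 *     op.cmd      = MMUEXT_PIN_L2_TABLE;   [ or L3/L4 as appropriate ]
 *     op.arg1.mfn = new_pgd_gmfn;
 *     HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF);
 *
 * which is handled by the pin_page path above.
 */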
2091 case MMUEXT_UNPIN_TABLE:
2092 if ( paging_mode_refcounts(d) )
2093 break;
2095 if ( unlikely(!(okay = get_page_from_pagenr(mfn, d))) )
2097 MEM_LOG("Mfn %lx bad domain (dom=%p)",
2098 mfn, page_get_owner(page));
2100 else if ( likely(test_and_clear_bit(_PGT_pinned,
2101 &page->u.inuse.type_info)) )
2103 put_page_and_type(page);
2104 put_page(page);
2105 /* A page is dirtied when its pin status is cleared. */
2106 paging_mark_dirty(d, mfn);
2108 else
2110 okay = 0;
2111 put_page(page);
2112 MEM_LOG("Mfn %lx not pinned", mfn);
2114 break;
2116 case MMUEXT_NEW_BASEPTR:
2117 okay = new_guest_cr3(mfn);
2118 this_cpu(percpu_mm_info).deferred_ops &= ~DOP_FLUSH_TLB;
2119 break;
2121 #ifdef __x86_64__
2122 case MMUEXT_NEW_USER_BASEPTR: {
2123 unsigned long old_mfn;
2125 if ( mfn != 0 )
2127 if ( paging_mode_refcounts(d) )
2128 okay = get_page_from_pagenr(mfn, d);
2129 else
2130 okay = get_page_and_type_from_pagenr(
2131 mfn, PGT_root_page_table, d);
2132 if ( unlikely(!okay) )
2134 MEM_LOG("Error while installing new mfn %lx", mfn);
2135 break;
2139 old_mfn = pagetable_get_pfn(v->arch.guest_table_user);
2140 v->arch.guest_table_user = pagetable_from_pfn(mfn);
2142 if ( old_mfn != 0 )
2144 if ( paging_mode_refcounts(d) )
2145 put_page(mfn_to_page(old_mfn));
2146 else
2147 put_page_and_type(mfn_to_page(old_mfn));
2150 break;
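/*
 * On x86-64, PV guests keep separate kernel and user page tables:
 * MMUEXT_NEW_BASEPTR installs guest_table (kernel), while the
 * MMUEXT_NEW_USER_BASEPTR case above swaps guest_table_user, taking a
 * reference on the new root and dropping the reference on the old one.
 */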
2152 #endif
2154 case MMUEXT_TLB_FLUSH_LOCAL:
2155 this_cpu(percpu_mm_info).deferred_ops |= DOP_FLUSH_TLB;
2156 break;
2158 case MMUEXT_INVLPG_LOCAL:
2159 if ( !paging_mode_enabled(d)
2160 || paging_invlpg(v, op.arg1.linear_addr) != 0 )
2161 local_flush_tlb_one(op.arg1.linear_addr);
2162 break;
2164 case MMUEXT_TLB_FLUSH_MULTI:
2165 case MMUEXT_INVLPG_MULTI:
2167 unsigned long vmask;
2168 cpumask_t pmask;
2169 if ( unlikely(copy_from_guest(&vmask, op.arg2.vcpumask, 1)) )
2171 okay = 0;
2172 break;
2174 pmask = vcpumask_to_pcpumask(d, vmask);
2175 if ( op.cmd == MMUEXT_TLB_FLUSH_MULTI )
2176 flush_tlb_mask(pmask);
2177 else
2178 flush_tlb_one_mask(pmask, op.arg1.linear_addr);
2179 break;
2182 case MMUEXT_TLB_FLUSH_ALL:
2183 flush_tlb_mask(d->domain_dirty_cpumask);
2184 break;
2186 case MMUEXT_INVLPG_ALL:
2187 flush_tlb_one_mask(d->domain_dirty_cpumask, op.arg1.linear_addr);
2188 break;
2190 case MMUEXT_FLUSH_CACHE:
2191 if ( unlikely(!cache_flush_permitted(d)) )
2193 MEM_LOG("Non-physdev domain tried to FLUSH_CACHE.");
2194 okay = 0;
2196 else
2198 wbinvd();
2200 break;
2202 case MMUEXT_SET_LDT:
2204 unsigned long ptr = op.arg1.linear_addr;
2205 unsigned long ents = op.arg2.nr_ents;
2207 if ( paging_mode_external(d) )
2209 MEM_LOG("ignoring SET_LDT hypercall from external "
2210 "domain %u", d->domain_id);
2211 okay = 0;
2213 else if ( ((ptr & (PAGE_SIZE-1)) != 0) ||
2214 (ents > 8192) ||
2215 !array_access_ok(ptr, ents, LDT_ENTRY_SIZE) )
2217 okay = 0;
2218 MEM_LOG("Bad args to SET_LDT: ptr=%lx, ents=%lx", ptr, ents);
2220 else if ( (v->arch.guest_context.ldt_ents != ents) ||
2221 (v->arch.guest_context.ldt_base != ptr) )
2223 invalidate_shadow_ldt(v);
2224 v->arch.guest_context.ldt_base = ptr;
2225 v->arch.guest_context.ldt_ents = ents;
2226 load_LDT(v);
2227 this_cpu(percpu_mm_info).deferred_ops &= ~DOP_RELOAD_LDT;
2228 if ( ents != 0 )
2229 this_cpu(percpu_mm_info).deferred_ops |= DOP_RELOAD_LDT;
2231 break;
2234 default:
2235 MEM_LOG("Invalid extended pt command 0x%x", op.cmd);
2236 rc = -ENOSYS;
2237 okay = 0;
2238 break;
2241 if ( unlikely(!okay) )
2243 rc = rc ? rc : -EINVAL;
2244 break;
2247 guest_handle_add_offset(uops, 1);
2250 process_deferred_ops();
2252 UNLOCK_BIGLOCK(d);
2254 perfc_add(num_mmuext_ops, i);
2256 out:
2257 /* Add incremental work we have done to the @done output parameter. */
2258 if ( unlikely(!guest_handle_is_null(pdone)) )
2260 done += i;
2261 copy_to_guest(pdone, &done, 1);
2264 return rc;
2267 int do_mmu_update(
2268 XEN_GUEST_HANDLE(mmu_update_t) ureqs,
2269 unsigned int count,
2270 XEN_GUEST_HANDLE(uint) pdone,
2271 unsigned int foreigndom)
2273 struct mmu_update req;
2274 void *va;
2275 unsigned long gpfn, gmfn, mfn;
2276 struct page_info *page;
2277 int rc = 0, okay = 1, i = 0;
2278 unsigned int cmd, done = 0;
2279 struct vcpu *v = current;
2280 struct domain *d = v->domain;
2281 unsigned long type_info;
2282 struct domain_mmap_cache mapcache;
2284 if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
2286 count &= ~MMU_UPDATE_PREEMPTED;
2287 if ( unlikely(!guest_handle_is_null(pdone)) )
2288 (void)copy_from_guest(&done, pdone, 1);
2290 else
2291 perfc_incr(calls_to_mmu_update);
2293 if ( unlikely(!guest_handle_okay(ureqs, count)) )
2295 rc = -EFAULT;
2296 goto out;
2299 if ( !set_foreigndom(foreigndom) )
2301 rc = -ESRCH;
2302 goto out;
2305 domain_mmap_cache_init(&mapcache);
2307 LOCK_BIGLOCK(d);
2309 for ( i = 0; i < count; i++ )
2311 if ( hypercall_preempt_check() )
2313 rc = hypercall_create_continuation(
2314 __HYPERVISOR_mmu_update, "hihi",
2315 ureqs, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
2316 break;
2319 if ( unlikely(__copy_from_guest(&req, ureqs, 1) != 0) )
2321 MEM_LOG("Bad __copy_from_guest");
2322 rc = -EFAULT;
2323 break;
2326 cmd = req.ptr & (sizeof(l1_pgentry_t)-1);
2327 okay = 0;
2329 switch ( cmd )
2331 /*
2332 * MMU_NORMAL_PT_UPDATE: Normal update to any level of page table.
2333 */
2334 case MMU_NORMAL_PT_UPDATE:
2336 gmfn = req.ptr >> PAGE_SHIFT;
2337 mfn = gmfn_to_mfn(d, gmfn);
2339 if ( unlikely(!get_page_from_pagenr(mfn, current->domain)) )
2341 MEM_LOG("Could not get page for normal update");
2342 break;
2345 va = map_domain_page_with_cache(mfn, &mapcache);
2346 va = (void *)((unsigned long)va +
2347 (unsigned long)(req.ptr & ~PAGE_MASK));
2348 page = mfn_to_page(mfn);
2350 switch ( (type_info = page->u.inuse.type_info) & PGT_type_mask )
2352 case PGT_l1_page_table:
2353 case PGT_l2_page_table:
2354 case PGT_l3_page_table:
2355 case PGT_l4_page_table:
2357 if ( paging_mode_refcounts(d) )
2359 MEM_LOG("mmu update on auto-refcounted domain!");
2360 break;
2363 if ( unlikely(!get_page_type(
2364 page, type_info & (PGT_type_mask|PGT_pae_xen_l2))) )
2365 goto not_a_pt;
2367 switch ( type_info & PGT_type_mask )
2369 case PGT_l1_page_table:
2371 l1_pgentry_t l1e = l1e_from_intpte(req.val);
2372 okay = mod_l1_entry(va, l1e, mfn);
2374 break;
2375 case PGT_l2_page_table:
2377 l2_pgentry_t l2e = l2e_from_intpte(req.val);
2378 okay = mod_l2_entry(va, l2e, mfn, type_info);
2380 break;
2381 #if CONFIG_PAGING_LEVELS >= 3
2382 case PGT_l3_page_table:
2384 l3_pgentry_t l3e = l3e_from_intpte(req.val);
2385 okay = mod_l3_entry(va, l3e, mfn);
2387 break;
2388 #endif
2389 #if CONFIG_PAGING_LEVELS >= 4
2390 case PGT_l4_page_table:
2392 l4_pgentry_t l4e = l4e_from_intpte(req.val);
2393 okay = mod_l4_entry(d, va, l4e, mfn);
2395 break;
2396 #endif
2399 put_page_type(page);
2401 break;
2403 default:
2404 not_a_pt:
2406 if ( unlikely(!get_page_type(page, PGT_writable_page)) )
2407 break;
2409 okay = paging_write_guest_entry(v, va, req.val, _mfn(mfn));
2411 put_page_type(page);
2413 break;
2416 unmap_domain_page_with_cache(va, &mapcache);
2418 put_page(page);
2419 break;
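/*
 * Illustrative guest-side usage (not part of this file; 'pte_maddr'
 * and 'new_val' are placeholders): updating a single PTE through the
 * hypervisor looks like
 *
 *     mmu_update_t u;
 *     u.ptr = pte_maddr | MMU_NORMAL_PT_UPDATE;
 *     u.val = new_val;
 *     HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF);
 *
 * The low bits of u.ptr carry the command decoded above.
 */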
2421 case MMU_MACHPHYS_UPDATE:
2423 mfn = req.ptr >> PAGE_SHIFT;
2424 gpfn = req.val;
2426 if ( unlikely(!get_page_from_pagenr(mfn, FOREIGNDOM)) )
2428 MEM_LOG("Could not get page for mach->phys update");
2429 break;
2432 if ( unlikely(paging_mode_translate(FOREIGNDOM)) )
2434 MEM_LOG("Mach-phys update on auto-translate guest");
2435 break;
2438 set_gpfn_from_mfn(mfn, gpfn);
2439 okay = 1;
2441 paging_mark_dirty(FOREIGNDOM, mfn);
2443 put_page(mfn_to_page(mfn));
2444 break;
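/*
 * MMU_MACHPHYS_UPDATE rewrites the machine-to-physical (M2P) entry for
 * 'mfn' so that it points at 'gpfn'.  It is used when frames change
 * pseudophysical position, e.g. after a grant transfer or a ballooning
 * operation, and is refused for auto-translated guests.
 */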
2446 default:
2447 MEM_LOG("Invalid page update command %x", cmd);
2448 rc = -ENOSYS;
2449 okay = 0;
2450 break;
2453 if ( unlikely(!okay) )
2455 rc = rc ? rc : -EINVAL;
2456 break;
2459 guest_handle_add_offset(ureqs, 1);
2462 process_deferred_ops();
2464 UNLOCK_BIGLOCK(d);
2466 domain_mmap_cache_destroy(&mapcache);
2468 perfc_add(num_page_updates, i);
2470 out:
2471 /* Add incremental work we have done to the @done output parameter. */
2472 if ( unlikely(!guest_handle_is_null(pdone)) )
2474 done += i;
2475 copy_to_guest(pdone, &done, 1);
2478 return rc;
2482 static int create_grant_pte_mapping(
2483 uint64_t pte_addr, l1_pgentry_t nl1e, struct vcpu *v)
2485 int rc = GNTST_okay;
2486 void *va;
2487 unsigned long gmfn, mfn;
2488 struct page_info *page;
2489 u32 type;
2490 l1_pgentry_t ol1e;
2491 struct domain *d = v->domain;
2493 ASSERT(spin_is_locked(&d->big_lock));
2495 adjust_guest_l1e(nl1e, d);
2497 gmfn = pte_addr >> PAGE_SHIFT;
2498 mfn = gmfn_to_mfn(d, gmfn);
2500 if ( unlikely(!get_page_from_pagenr(mfn, current->domain)) )
2502 MEM_LOG("Could not get page for normal update");
2503 return GNTST_general_error;
2506 va = map_domain_page(mfn);
2507 va = (void *)((unsigned long)va + ((unsigned long)pte_addr & ~PAGE_MASK));
2508 page = mfn_to_page(mfn);
2510 type = page->u.inuse.type_info & PGT_type_mask;
2511 if ( (type != PGT_l1_page_table) || !get_page_type(page, type) )
2513 MEM_LOG("Grant map attempted to update a non-L1 page");
2514 rc = GNTST_general_error;
2515 goto failed;
2518 ol1e = *(l1_pgentry_t *)va;
2519 if ( !UPDATE_ENTRY(l1, (l1_pgentry_t *)va, ol1e, nl1e, mfn, v) )
2521 put_page_type(page);
2522 rc = GNTST_general_error;
2523 goto failed;
2526 if ( !paging_mode_refcounts(d) )
2527 put_page_from_l1e(ol1e, d);
2529 put_page_type(page);
2531 failed:
2532 unmap_domain_page(va);
2533 put_page(page);
2535 return rc;
2538 static int destroy_grant_pte_mapping(
2539 uint64_t addr, unsigned long frame, struct domain *d)
2541 int rc = GNTST_okay;
2542 void *va;
2543 unsigned long gmfn, mfn;
2544 struct page_info *page;
2545 u32 type;
2546 l1_pgentry_t ol1e;
2548 gmfn = addr >> PAGE_SHIFT;
2549 mfn = gmfn_to_mfn(d, gmfn);
2551 if ( unlikely(!get_page_from_pagenr(mfn, current->domain)) )
2553 MEM_LOG("Could not get page for normal update");
2554 return GNTST_general_error;
2557 va = map_domain_page(mfn);
2558 va = (void *)((unsigned long)va + ((unsigned long)addr & ~PAGE_MASK));
2559 page = mfn_to_page(mfn);
2561 type = page->u.inuse.type_info & PGT_type_mask;
2562 if ( (type != PGT_l1_page_table) || !get_page_type(page, type) )
2564 MEM_LOG("Grant map attempted to update a non-L1 page");
2565 rc = GNTST_general_error;
2566 goto failed;
2569 if ( __copy_from_user(&ol1e, (l1_pgentry_t *)va, sizeof(ol1e)) )
2571 put_page_type(page);
2572 rc = GNTST_general_error;
2573 goto failed;
2576 /* Check that the virtual address supplied is actually mapped to frame. */
2577 if ( unlikely((l1e_get_intpte(ol1e) >> PAGE_SHIFT) != frame) )
2579 MEM_LOG("PTE entry %lx for address %"PRIx64" doesn't match frame %lx",
2580 (unsigned long)l1e_get_intpte(ol1e), addr, frame);
2581 put_page_type(page);
2582 rc = GNTST_general_error;
2583 goto failed;
2586 /* Delete pagetable entry. */
2587 if ( unlikely(!UPDATE_ENTRY(l1,
2588 (l1_pgentry_t *)va, ol1e, l1e_empty(), mfn,
2589 d->vcpu[0] /* Change if we go to per-vcpu shadows. */)) )
2591 MEM_LOG("Cannot delete PTE entry at %p", va);
2592 put_page_type(page);
2593 rc = GNTST_general_error;
2594 goto failed;
2597 put_page_type(page);
2599 failed:
2600 unmap_domain_page(va);
2601 put_page(page);
2602 return rc;
2606 static int create_grant_va_mapping(
2607 unsigned long va, l1_pgentry_t nl1e, struct vcpu *v)
2609 l1_pgentry_t *pl1e, ol1e;
2610 struct domain *d = v->domain;
2611 unsigned long gl1mfn;
2612 int okay;
2614 ASSERT(spin_is_locked(&d->big_lock));
2616 adjust_guest_l1e(nl1e, d);
2618 pl1e = guest_map_l1e(v, va, &gl1mfn);
2619 if ( !pl1e )
2621 MEM_LOG("Could not find L1 PTE for address %lx", va);
2622 return GNTST_general_error;
2624 ol1e = *pl1e;
2625 okay = UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, v);
2626 guest_unmap_l1e(v, pl1e);
2627 pl1e = NULL;
2629 if ( !okay )
2630 return GNTST_general_error;
2632 if ( !paging_mode_refcounts(d) )
2633 put_page_from_l1e(ol1e, d);
2635 return GNTST_okay;
2638 static int replace_grant_va_mapping(
2639 unsigned long addr, unsigned long frame, l1_pgentry_t nl1e, struct vcpu *v)
2641 l1_pgentry_t *pl1e, ol1e;
2642 unsigned long gl1mfn;
2643 int rc = 0;
2645 pl1e = guest_map_l1e(v, addr, &gl1mfn);
2646 if ( !pl1e )
2648 MEM_LOG("Could not find L1 PTE for address %lx", addr);
2649 return GNTST_general_error;
2651 ol1e = *pl1e;
2653 /* Check that the virtual address supplied is actually mapped to frame. */
2654 if ( unlikely(l1e_get_pfn(ol1e) != frame) )
2656 MEM_LOG("PTE entry %lx for address %lx doesn't match frame %lx",
2657 l1e_get_pfn(ol1e), addr, frame);
2658 rc = GNTST_general_error;
2659 goto out;
2662 /* Delete pagetable entry. */
2663 if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, v)) )
2665 MEM_LOG("Cannot delete PTE entry at %p", (unsigned long *)pl1e);
2666 rc = GNTST_general_error;
2667 goto out;
2670 out:
2671 guest_unmap_l1e(v, pl1e);
2672 return rc;
2675 static int destroy_grant_va_mapping(
2676 unsigned long addr, unsigned long frame, struct vcpu *v)
2678 return replace_grant_va_mapping(addr, frame, l1e_empty(), v);
2681 int create_grant_host_mapping(
2682 uint64_t addr, unsigned long frame, unsigned int flags)
2684 l1_pgentry_t pte = l1e_from_pfn(frame, GRANT_PTE_FLAGS);
2686 if ( (flags & GNTMAP_application_map) )
2687 l1e_add_flags(pte,_PAGE_USER);
2688 if ( !(flags & GNTMAP_readonly) )
2689 l1e_add_flags(pte,_PAGE_RW);
2691 if ( flags & GNTMAP_contains_pte )
2692 return create_grant_pte_mapping(addr, pte, current);
2693 return create_grant_va_mapping(addr, pte, current);
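/*
 * Grant mappings come in two flavours, selected by GNTMAP_contains_pte
 * above: 'addr' is either the machine address of the PTE slot to write
 * (create_grant_pte_mapping) or a guest virtual address whose L1 entry
 * is found by walking the guest tables (create_grant_va_mapping).
 */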
2696 int replace_grant_host_mapping(
2697 uint64_t addr, unsigned long frame, uint64_t new_addr, unsigned int flags)
2699 l1_pgentry_t *pl1e, ol1e;
2700 unsigned long gl1mfn;
2701 int rc;
2703 if ( flags & GNTMAP_contains_pte )
2705 if (!new_addr)
2706 return destroy_grant_pte_mapping(addr, frame, current->domain);
2708 MEM_LOG("Unsupported grant table operation");
2709 return GNTST_general_error;
2712 if (!new_addr)
2713 return destroy_grant_va_mapping(addr, frame, current);
2715 pl1e = guest_map_l1e(current, new_addr, &gl1mfn);
2716 if ( !pl1e )
2718 MEM_LOG("Could not find L1 PTE for address %lx",
2719 (unsigned long)new_addr);
2720 return GNTST_general_error;
2722 ol1e = *pl1e;
2724 if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, l1e_empty(), gl1mfn, current)) )
2726 MEM_LOG("Cannot delete PTE entry at %p", (unsigned long *)pl1e);
2727 guest_unmap_l1e(current, pl1e);
2728 return GNTST_general_error;
2731 guest_unmap_l1e(current, pl1e);
2733 rc = replace_grant_va_mapping(addr, frame, ol1e, current);
2734 if ( rc && !paging_mode_refcounts(current->domain) )
2735 put_page_from_l1e(ol1e, current->domain);
2737 return rc;
2740 int steal_page(
2741 struct domain *d, struct page_info *page, unsigned int memflags)
2743 u32 _d, _nd, x, y;
2745 spin_lock(&d->page_alloc_lock);
2747 /*
2748 * The tricky bit: atomically release ownership while there is just one
2749 * benign reference to the page (PGC_allocated). If that reference
2750 * disappears then the deallocation routine will safely spin.
2751 */
2752 _d = pickle_domptr(d);
2753 _nd = page->u.inuse._domain;
2754 y = page->count_info;
2755 do {
2756 x = y;
2757 if (unlikely((x & (PGC_count_mask|PGC_allocated)) !=
2758 (1 | PGC_allocated)) || unlikely(_nd != _d)) {
2759 MEM_LOG("gnttab_transfer: Bad page %p: ed=%p(%u), sd=%p,"
2760 " caf=%08x, taf=%" PRtype_info "\n",
2761 (void *) page_to_mfn(page),
2762 d, d->domain_id, unpickle_domptr(_nd), x,
2763 page->u.inuse.type_info);
2764 spin_unlock(&d->page_alloc_lock);
2765 return -1;
2767 __asm__ __volatile__(
2768 LOCK_PREFIX "cmpxchg8b %2"
2769 : "=d" (_nd), "=a" (y),
2770 "=m" (*(volatile u64 *)(&page->count_info))
2771 : "0" (_d), "1" (x), "c" (NULL), "b" (x) );
2772 } while (unlikely(_nd != _d) || unlikely(y != x));
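/*
 * The cmpxchg8b above compares and exchanges the reference-count word
 * and the owner word as a single 64-bit unit: it succeeds only if both
 * still hold the values just validated (a single PGC_allocated
 * reference, owned by 'd'), in which case the owner is cleared to NULL
 * while the count word is left unchanged.
 */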
2774 /*
2775 * Unlink from 'd'. At least one reference remains (now anonymous), so
2776 * no one else is spinning to try to delete this page from 'd'.
2777 */
2778 if ( !(memflags & MEMF_no_refcount) )
2779 d->tot_pages--;
2780 list_del(&page->list);
2782 spin_unlock(&d->page_alloc_lock);
2784 return 0;
2787 int do_update_va_mapping(unsigned long va, u64 val64,
2788 unsigned long flags)
2790 l1_pgentry_t val = l1e_from_intpte(val64);
2791 struct vcpu *v = current;
2792 struct domain *d = v->domain;
2793 l1_pgentry_t *pl1e;
2794 unsigned long vmask, bmap_ptr, gl1mfn;
2795 cpumask_t pmask;
2796 int rc = 0;
2798 perfc_incr(calls_to_update_va);
2800 if ( unlikely(!__addr_ok(va) && !paging_mode_external(d)) )
2801 return -EINVAL;
2803 LOCK_BIGLOCK(d);
2805 pl1e = guest_map_l1e(v, va, &gl1mfn);
2807 if ( unlikely(!pl1e || !mod_l1_entry(pl1e, val, gl1mfn)) )
2808 rc = -EINVAL;
2810 if ( pl1e )
2811 guest_unmap_l1e(v, pl1e);
2812 pl1e = NULL;
2814 process_deferred_ops();
2816 UNLOCK_BIGLOCK(d);
2818 switch ( flags & UVMF_FLUSHTYPE_MASK )
2820 case UVMF_TLB_FLUSH:
2821 switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
2823 case UVMF_LOCAL:
2824 local_flush_tlb();
2825 break;
2826 case UVMF_ALL:
2827 flush_tlb_mask(d->domain_dirty_cpumask);
2828 break;
2829 default:
2830 if ( unlikely(!is_pv_32on64_domain(d) ?
2831 get_user(vmask, (unsigned long *)bmap_ptr) :
2832 get_user(vmask, (unsigned int *)bmap_ptr)) )
2833 rc = -EFAULT;
2834 pmask = vcpumask_to_pcpumask(d, vmask);
2835 flush_tlb_mask(pmask);
2836 break;
2838 break;
2840 case UVMF_INVLPG:
2841 switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
2843 case UVMF_LOCAL:
2844 if ( !paging_mode_enabled(d)
2845 || (paging_invlpg(current, va) != 0) )
2846 local_flush_tlb_one(va);
2847 break;
2848 case UVMF_ALL:
2849 flush_tlb_one_mask(d->domain_dirty_cpumask, va);
2850 break;
2851 default:
2852 if ( unlikely(!is_pv_32on64_domain(d) ?
2853 get_user(vmask, (unsigned long *)bmap_ptr) :
2854 get_user(vmask, (unsigned int *)bmap_ptr)) )
2855 rc = -EFAULT;
2856 pmask = vcpumask_to_pcpumask(d, vmask);
2857 flush_tlb_one_mask(pmask, va);
2858 break;
2860 break;
2863 return rc;
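/*
 * Illustrative guest-side usage (not part of this file; 'va' and
 * 'new_pte' are placeholders): remap one page and flush just the local
 * TLB entry in a single hypercall:
 *
 *     HYPERVISOR_update_va_mapping(va, new_pte, UVMF_INVLPG | UVMF_LOCAL);
 */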
2866 int do_update_va_mapping_otherdomain(unsigned long va, u64 val64,
2867 unsigned long flags,
2868 domid_t domid)
2870 int rc;
2872 if ( unlikely(!IS_PRIV(current->domain)) )
2873 return -EPERM;
2875 if ( !set_foreigndom(domid) )
2876 return -ESRCH;
2878 rc = do_update_va_mapping(va, val64, flags);
2880 BUG_ON(this_cpu(percpu_mm_info).deferred_ops);
2881 process_deferred_ops(); /* only to clear foreigndom */
2883 return rc;
2888 /*************************
2889 * Descriptor Tables
2890 */
2892 void destroy_gdt(struct vcpu *v)
2894 int i;
2895 unsigned long pfn;
2897 v->arch.guest_context.gdt_ents = 0;
2898 for ( i = 0; i < FIRST_RESERVED_GDT_PAGE; i++ )
2900 if ( (pfn = l1e_get_pfn(v->arch.perdomain_ptes[i])) != 0 )
2901 put_page_and_type(mfn_to_page(pfn));
2902 l1e_write(&v->arch.perdomain_ptes[i], l1e_empty());
2903 v->arch.guest_context.gdt_frames[i] = 0;
2908 long set_gdt(struct vcpu *v,
2909 unsigned long *frames,
2910 unsigned int entries)
2912 struct domain *d = v->domain;
2913 /* NB. There are 512 8-byte entries per GDT page. */
2914 int i, nr_pages = (entries + 511) / 512;
2915 unsigned long mfn;
2917 if ( entries > FIRST_RESERVED_GDT_ENTRY )
2918 return -EINVAL;
2920 /* Check the pages in the new GDT. */
2921 for ( i = 0; i < nr_pages; i++ ) {
2922 mfn = frames[i] = gmfn_to_mfn(d, frames[i]);
2923 if ( !mfn_valid(mfn) ||
2924 !get_page_and_type(mfn_to_page(mfn), d, PGT_gdt_page) )
2925 goto fail;
2928 /* Tear down the old GDT. */
2929 destroy_gdt(v);
2931 /* Install the new GDT. */
2932 v->arch.guest_context.gdt_ents = entries;
2933 for ( i = 0; i < nr_pages; i++ )
2935 v->arch.guest_context.gdt_frames[i] = frames[i];
2936 l1e_write(&v->arch.perdomain_ptes[i],
2937 l1e_from_pfn(frames[i], __PAGE_HYPERVISOR));
2940 return 0;
2942 fail:
2943 while ( i-- > 0 )
2944 put_page_and_type(mfn_to_page(frames[i]));
2945 return -EINVAL;
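/*
 * Example: a 1024-entry guest GDT needs nr_pages = (1024 + 511) / 512
 * = 2 frames.  Each frame must validate as PGT_gdt_page in set_gdt()
 * above and is then mapped through perdomain_ptes[] so the descriptor
 * table is reachable while this VCPU runs.
 */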
2949 long do_set_gdt(XEN_GUEST_HANDLE(ulong) frame_list, unsigned int entries)
2951 int nr_pages = (entries + 511) / 512;
2952 unsigned long frames[16];
2953 long ret;
2955 /* Rechecked in set_gdt, but ensures a sane limit for copy_from_user(). */
2956 if ( entries > FIRST_RESERVED_GDT_ENTRY )
2957 return -EINVAL;
2959 if ( copy_from_guest(frames, frame_list, nr_pages) )
2960 return -EFAULT;
2962 LOCK_BIGLOCK(current->domain);
2964 if ( (ret = set_gdt(current, frames, entries)) == 0 )
2965 local_flush_tlb();
2967 UNLOCK_BIGLOCK(current->domain);
2969 return ret;
2973 long do_update_descriptor(u64 pa, u64 desc)
2975 struct domain *dom = current->domain;
2976 unsigned long gmfn = pa >> PAGE_SHIFT;
2977 unsigned long mfn;
2978 unsigned int offset;
2979 struct desc_struct *gdt_pent, d;
2980 struct page_info *page;
2981 long ret = -EINVAL;
2983 offset = ((unsigned int)pa & ~PAGE_MASK) / sizeof(struct desc_struct);
2985 *(u64 *)&d = desc;
2987 LOCK_BIGLOCK(dom);
2989 mfn = gmfn_to_mfn(dom, gmfn);
2990 if ( (((unsigned int)pa % sizeof(struct desc_struct)) != 0) ||
2991 !mfn_valid(mfn) ||
2992 !check_descriptor(dom, &d) )
2994 UNLOCK_BIGLOCK(dom);
2995 return -EINVAL;
2998 page = mfn_to_page(mfn);
2999 if ( unlikely(!get_page(page, dom)) )
3001 UNLOCK_BIGLOCK(dom);
3002 return -EINVAL;
3005 /* Check if the given frame is in use in an unsafe context. */
3006 switch ( page->u.inuse.type_info & PGT_type_mask )
3008 case PGT_gdt_page:
3009 if ( unlikely(!get_page_type(page, PGT_gdt_page)) )
3010 goto out;
3011 break;
3012 case PGT_ldt_page:
3013 if ( unlikely(!get_page_type(page, PGT_ldt_page)) )
3014 goto out;
3015 break;
3016 default:
3017 if ( unlikely(!get_page_type(page, PGT_writable_page)) )
3018 goto out;
3019 break;
3022 paging_mark_dirty(dom, mfn);
3024 /* All is good so make the update. */
3025 gdt_pent = map_domain_page(mfn);
3026 memcpy(&gdt_pent[offset], &d, 8);
3027 unmap_domain_page(gdt_pent);
3029 put_page_type(page);
3031 ret = 0; /* success */
3033 out:
3034 put_page(page);
3036 UNLOCK_BIGLOCK(dom);
3038 return ret;
3041 typedef struct e820entry e820entry_t;
3042 DEFINE_XEN_GUEST_HANDLE(e820entry_t);
3044 long arch_memory_op(int op, XEN_GUEST_HANDLE(void) arg)
3046 switch ( op )
3048 case XENMEM_add_to_physmap:
3050 struct xen_add_to_physmap xatp;
3051 unsigned long prev_mfn, mfn = 0, gpfn;
3052 struct domain *d;
3054 if ( copy_from_guest(&xatp, arg, 1) )
3055 return -EFAULT;
3057 if ( xatp.domid == DOMID_SELF )
3058 d = rcu_lock_current_domain();
3059 else if ( !IS_PRIV(current->domain) )
3060 return -EPERM;
3061 else if ( (d = rcu_lock_domain_by_id(xatp.domid)) == NULL )
3062 return -ESRCH;
3064 switch ( xatp.space )
3066 case XENMAPSPACE_shared_info:
3067 if ( xatp.idx == 0 )
3068 mfn = virt_to_mfn(d->shared_info);
3069 break;
3070 case XENMAPSPACE_grant_table:
3071 spin_lock(&d->grant_table->lock);
3073 if ( (xatp.idx >= nr_grant_frames(d->grant_table)) &&
3074 (xatp.idx < max_nr_grant_frames) )
3075 gnttab_grow_table(d, xatp.idx + 1);
3077 if ( xatp.idx < nr_grant_frames(d->grant_table) )
3078 mfn = virt_to_mfn(d->grant_table->shared[xatp.idx]);
3080 spin_unlock(&d->grant_table->lock);
3081 break;
3082 default:
3083 break;
3086 if ( !paging_mode_translate(d) || (mfn == 0) )
3088 rcu_unlock_domain(d);
3089 return -EINVAL;
3092 LOCK_BIGLOCK(d);
3094 /* Remove previously mapped page if it was present. */
3095 prev_mfn = gmfn_to_mfn(d, xatp.gpfn);
3096 if ( mfn_valid(prev_mfn) )
3098 if ( is_xen_heap_frame(mfn_to_page(prev_mfn)) )
3099 /* Xen heap frames are simply unhooked from this phys slot. */
3100 guest_physmap_remove_page(d, xatp.gpfn, prev_mfn);
3101 else
3102 /* Normal domain memory is freed, to avoid leaking memory. */
3103 guest_remove_page(d, xatp.gpfn);
3106 /* Unmap from old location, if any. */
3107 gpfn = get_gpfn_from_mfn(mfn);
3108 if ( gpfn != INVALID_M2P_ENTRY )
3109 guest_physmap_remove_page(d, gpfn, mfn);
3111 /* Map at new location. */
3112 guest_physmap_add_page(d, xatp.gpfn, mfn);
3114 UNLOCK_BIGLOCK(d);
3116 rcu_unlock_domain(d);
3118 break;
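/*
 * XENMEM_add_to_physmap is only useful to translated guests: it lets
 * them place special Xen-owned frames (the shared_info page, or
 * grant-table frames, growing the table on demand) at a chosen gpfn,
 * evicting whatever was previously mapped at that slot.
 */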
3121 case XENMEM_set_memory_map:
3123 struct xen_foreign_memory_map fmap;
3124 struct domain *d;
3125 int rc;
3127 if ( copy_from_guest(&fmap, arg, 1) )
3128 return -EFAULT;
3130 if ( fmap.map.nr_entries > ARRAY_SIZE(d->arch.e820) )
3131 return -EINVAL;
3133 if ( fmap.domid == DOMID_SELF )
3134 d = rcu_lock_current_domain();
3135 else if ( !IS_PRIV(current->domain) )
3136 return -EPERM;
3137 else if ( (d = rcu_lock_domain_by_id(fmap.domid)) == NULL )
3138 return -ESRCH;
3140 rc = copy_from_guest(d->arch.e820, fmap.map.buffer,
3141 fmap.map.nr_entries) ? -EFAULT : 0;
3142 d->arch.nr_e820 = fmap.map.nr_entries;
3144 rcu_unlock_domain(d);
3145 return rc;
3148 case XENMEM_memory_map:
3150 struct xen_memory_map map;
3151 struct domain *d = current->domain;
3153 /* Backwards compatibility. */
3154 if ( d->arch.nr_e820 == 0 )
3155 return -ENOSYS;
3157 if ( copy_from_guest(&map, arg, 1) )
3158 return -EFAULT;
3160 map.nr_entries = min(map.nr_entries, d->arch.nr_e820);
3161 if ( copy_to_guest(map.buffer, d->arch.e820, map.nr_entries) ||
3162 copy_to_guest(arg, &map, 1) )
3163 return -EFAULT;
3165 return 0;
3168 case XENMEM_machine_memory_map:
3170 struct xen_memory_map memmap;
3171 XEN_GUEST_HANDLE(e820entry_t) buffer;
3172 int count;
3174 if ( !IS_PRIV(current->domain) )
3175 return -EINVAL;
3177 if ( copy_from_guest(&memmap, arg, 1) )
3178 return -EFAULT;
3179 if ( memmap.nr_entries < e820.nr_map + 1 )
3180 return -EINVAL;
3182 buffer = guest_handle_cast(memmap.buffer, e820entry_t);
3184 count = min((unsigned int)e820.nr_map, memmap.nr_entries);
3185 if ( copy_to_guest(buffer, e820.map, count) < 0 )
3186 return -EFAULT;
3188 memmap.nr_entries = count;
3190 if ( copy_to_guest(arg, &memmap, 1) )
3191 return -EFAULT;
3193 return 0;
3196 case XENMEM_machphys_mapping:
3198 static const struct xen_machphys_mapping mapping = {
3199 .v_start = MACH2PHYS_VIRT_START,
3200 .v_end = MACH2PHYS_VIRT_END,
3201 .max_mfn = MACH2PHYS_NR_ENTRIES - 1
3202 };
3204 if ( copy_to_guest(arg, &mapping, 1) )
3205 return -EFAULT;
3207 return 0;
3210 default:
3211 return subarch_memory_op(op, arg);
3214 return 0;
3218 /*************************
3219 * Writable Pagetables
3220 */
3222 struct ptwr_emulate_ctxt {
3223 struct x86_emulate_ctxt ctxt;
3224 unsigned long cr2;
3225 l1_pgentry_t pte;
3226 };
3228 static int ptwr_emulated_read(
3229 enum x86_segment seg,
3230 unsigned long offset,
3231 unsigned long *val,
3232 unsigned int bytes,
3233 struct x86_emulate_ctxt *ctxt)
3235 unsigned int rc;
3236 unsigned long addr = offset;
3238 *val = 0;
3239 if ( (rc = copy_from_user((void *)val, (void *)addr, bytes)) != 0 )
3241 propagate_page_fault(addr + bytes - rc, 0); /* read fault */
3242 return X86EMUL_EXCEPTION;
3245 return X86EMUL_OKAY;
3248 static int ptwr_emulated_update(
3249 unsigned long addr,
3250 paddr_t old,
3251 paddr_t val,
3252 unsigned int bytes,
3253 unsigned int do_cmpxchg,
3254 struct ptwr_emulate_ctxt *ptwr_ctxt)
3256 unsigned long mfn;
3257 unsigned long unaligned_addr = addr;
3258 struct page_info *page;
3259 l1_pgentry_t pte, ol1e, nl1e, *pl1e;
3260 struct vcpu *v = current;
3261 struct domain *d = v->domain;
3263 /* Only allow naturally-aligned stores within the original %cr2 page. */
3264 if ( unlikely(((addr^ptwr_ctxt->cr2) & PAGE_MASK) || (addr & (bytes-1))) )
3266 MEM_LOG("Bad ptwr access (cr2=%lx, addr=%lx, bytes=%u)",
3267 ptwr_ctxt->cr2, addr, bytes);
3268 return X86EMUL_UNHANDLEABLE;
3271 /* Turn a sub-word access into a full-word access. */
3272 if ( bytes != sizeof(paddr_t) )
3274 paddr_t full;
3275 unsigned int rc, offset = addr & (sizeof(paddr_t)-1);
3277 /* Align address; read full word. */
3278 addr &= ~(sizeof(paddr_t)-1);
3279 if ( (rc = copy_from_user(&full, (void *)addr, sizeof(paddr_t))) != 0 )
3281 propagate_page_fault(addr+sizeof(paddr_t)-rc, 0); /* read fault */
3282 return X86EMUL_EXCEPTION;
3284 /* Mask out bits provided by caller. */
3285 full &= ~((((paddr_t)1 << (bytes*8)) - 1) << (offset*8));
3286 /* Shift the caller value and OR in the missing bits. */
3287 val &= (((paddr_t)1 << (bytes*8)) - 1);
3288 val <<= (offset)*8;
3289 val |= full;
3290 /* Also fill in missing parts of the cmpxchg old value. */
3291 old &= (((paddr_t)1 << (bytes*8)) - 1);
3292 old <<= (offset)*8;
3293 old |= full;
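/*
 * Example: a 4-byte write of 0xdeadbeef to the high half of an 8-byte
 * PAE entry (offset 4) is widened here into a full-word value whose
 * top 32 bits are 0xdeadbeef and whose bottom 32 bits are re-read from
 * the existing entry; 'old' is widened identically so a subsequent
 * cmpxchg still compares the complete entry.
 */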
3296 pte = ptwr_ctxt->pte;
3297 mfn = l1e_get_pfn(pte);
3298 page = mfn_to_page(mfn);
3300 /* We are looking only for read-only mappings of p.t. pages. */
3301 ASSERT((l1e_get_flags(pte) & (_PAGE_RW|_PAGE_PRESENT)) == _PAGE_PRESENT);
3302 ASSERT(mfn_valid(mfn));
3303 ASSERT((page->u.inuse.type_info & PGT_type_mask) == PGT_l1_page_table);
3304 ASSERT((page->u.inuse.type_info & PGT_count_mask) != 0);
3305 ASSERT(page_get_owner(page) == d);
3307 /* Check the new PTE. */
3308 nl1e = l1e_from_intpte(val);
3309 if ( unlikely(!get_page_from_l1e(nl1e, d)) )
3311 if ( (CONFIG_PAGING_LEVELS >= 3) && is_pv_32bit_domain(d) &&
3312 (bytes == 4) && (unaligned_addr & 4) && !do_cmpxchg &&
3313 (l1e_get_flags(nl1e) & _PAGE_PRESENT) )
3315 /*
3316 * If this is an upper-half write to a PAE PTE then we assume that
3317 * the guest has simply got the two writes the wrong way round. We
3318 * zap the PRESENT bit on the assumption that the bottom half will
3319 * be written immediately after we return to the guest.
3320 */
3321 MEM_LOG("ptwr_emulate: fixing up invalid PAE PTE %"PRIpte,
3322 l1e_get_intpte(nl1e));
3323 l1e_remove_flags(nl1e, _PAGE_PRESENT);
3325 else
3327 MEM_LOG("ptwr_emulate: could not get_page_from_l1e()");
3328 return X86EMUL_UNHANDLEABLE;
3332 adjust_guest_l1e(nl1e, d);
3334 /* Checked successfully: do the update (write or cmpxchg). */
3335 pl1e = map_domain_page(mfn);
3336 pl1e = (l1_pgentry_t *)((unsigned long)pl1e + (addr & ~PAGE_MASK));
3337 if ( do_cmpxchg )
3339 int okay;
3340 intpte_t t = old;
3341 ol1e = l1e_from_intpte(old);
3343 okay = paging_cmpxchg_guest_entry(v, &l1e_get_intpte(*pl1e),
3344 &t, val, _mfn(mfn));
3345 okay = (okay && t == old);
3347 if ( !okay )
3349 unmap_domain_page(pl1e);
3350 put_page_from_l1e(nl1e, d);
3351 return X86EMUL_CMPXCHG_FAILED;
3354 else
3356 ol1e = *pl1e;
3357 if ( !UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, mfn, v) )
3358 BUG();
3361 unmap_domain_page(pl1e);
3363 /* Finally, drop the old PTE. */
3364 put_page_from_l1e(ol1e, d);
3366 return X86EMUL_OKAY;
3369 static int ptwr_emulated_write(
3370 enum x86_segment seg,
3371 unsigned long offset,
3372 unsigned long val,
3373 unsigned int bytes,
3374 struct x86_emulate_ctxt *ctxt)
3376 return ptwr_emulated_update(
3377 offset, 0, val, bytes, 0,
3378 container_of(ctxt, struct ptwr_emulate_ctxt, ctxt));
3381 static int ptwr_emulated_cmpxchg(
3382 enum x86_segment seg,
3383 unsigned long offset,
3384 unsigned long old,
3385 unsigned long new,
3386 unsigned int bytes,
3387 struct x86_emulate_ctxt *ctxt)
3389 return ptwr_emulated_update(
3390 offset, old, new, bytes, 1,
3391 container_of(ctxt, struct ptwr_emulate_ctxt, ctxt));
3394 static int ptwr_emulated_cmpxchg8b(
3395 enum x86_segment seg,
3396 unsigned long offset,
3397 unsigned long old,
3398 unsigned long old_hi,
3399 unsigned long new,
3400 unsigned long new_hi,
3401 struct x86_emulate_ctxt *ctxt)
3403 if ( CONFIG_PAGING_LEVELS == 2 )
3404 return X86EMUL_UNHANDLEABLE;
3405 return ptwr_emulated_update(
3406 offset, ((u64)old_hi << 32) | old, ((u64)new_hi << 32) | new, 8, 1,
3407 container_of(ctxt, struct ptwr_emulate_ctxt, ctxt));
3410 static struct x86_emulate_ops ptwr_emulate_ops = {
3411 .read = ptwr_emulated_read,
3412 .insn_fetch = ptwr_emulated_read,
3413 .write = ptwr_emulated_write,
3414 .cmpxchg = ptwr_emulated_cmpxchg,
3415 .cmpxchg8b = ptwr_emulated_cmpxchg8b
3416 };
3418 /* Write page fault handler: check if guest is trying to modify a PTE. */
3419 int ptwr_do_page_fault(struct vcpu *v, unsigned long addr,
3420 struct cpu_user_regs *regs)
3422 struct domain *d = v->domain;
3423 struct page_info *page;
3424 l1_pgentry_t pte;
3425 struct ptwr_emulate_ctxt ptwr_ctxt;
3426 int rc;
3428 LOCK_BIGLOCK(d);
3430 /* Attempt to read the PTE that maps the VA being accessed. */
3431 guest_get_eff_l1e(v, addr, &pte);
3432 page = l1e_get_page(pte);
3434 /* We are looking only for read-only mappings of p.t. pages. */
3435 if ( ((l1e_get_flags(pte) & (_PAGE_PRESENT|_PAGE_RW)) != _PAGE_PRESENT) ||
3436 !mfn_valid(l1e_get_pfn(pte)) ||
3437 ((page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table) ||
3438 ((page->u.inuse.type_info & PGT_count_mask) == 0) ||
3439 (page_get_owner(page) != d) )
3440 goto bail;
3442 ptwr_ctxt.ctxt.regs = regs;
3443 ptwr_ctxt.ctxt.addr_size = ptwr_ctxt.ctxt.sp_size =
3444 is_pv_32on64_domain(d) ? 32 : BITS_PER_LONG;
3445 ptwr_ctxt.cr2 = addr;
3446 ptwr_ctxt.pte = pte;
3448 rc = x86_emulate(&ptwr_ctxt.ctxt, &ptwr_emulate_ops);
3449 if ( rc == X86EMUL_UNHANDLEABLE )
3450 goto bail;
3452 UNLOCK_BIGLOCK(d);
3453 perfc_incr(ptwr_emulations);
3454 return EXCRET_fault_fixed;
3456 bail:
3457 UNLOCK_BIGLOCK(d);
3458 return 0;
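/*
 * Writable-pagetable flow: a PV guest writes directly to a page that
 * is currently typed PGT_l1_page_table and therefore mapped read-only.
 * The resulting page fault lands here; the faulting instruction is
 * decoded and emulated via x86_emulate(), and ptwr_emulated_update()
 * validates the new PTE with get_page_from_l1e() before committing it.
 */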
3461 void free_xen_pagetable(void *v)
3463 extern int early_boot;
3465 BUG_ON(early_boot);
3467 if ( is_xen_heap_frame(virt_to_page(v)) )
3468 free_xenheap_page(v);
3469 else
3470 free_domheap_page(virt_to_page(v));
3473 int map_pages_to_xen(
3474 unsigned long virt,
3475 unsigned long mfn,
3476 unsigned long nr_mfns,
3477 unsigned long flags)
3479 l2_pgentry_t *pl2e, ol2e;
3480 l1_pgentry_t *pl1e, ol1e;
3481 unsigned int i;
3483 unsigned int map_small_pages = !!(flags & MAP_SMALL_PAGES);
3484 flags &= ~MAP_SMALL_PAGES;
3486 while ( nr_mfns != 0 )
3488 pl2e = virt_to_xen_l2e(virt);
3490 if ( ((((virt>>PAGE_SHIFT) | mfn) & ((1<<PAGETABLE_ORDER)-1)) == 0) &&
3491 (nr_mfns >= (1<<PAGETABLE_ORDER)) &&
3492 !map_small_pages )
3494 /* Super-page mapping. */
3495 ol2e = *pl2e;
3496 l2e_write_atomic(pl2e, l2e_from_pfn(mfn, flags|_PAGE_PSE));
3498 if ( (l2e_get_flags(ol2e) & _PAGE_PRESENT) )
3500 local_flush_tlb_pge();
3501 if ( !(l2e_get_flags(ol2e) & _PAGE_PSE) )
3502 free_xen_pagetable(mfn_to_virt(l2e_get_pfn(ol2e)));
3505 virt += 1UL << L2_PAGETABLE_SHIFT;
3506 mfn += 1UL << PAGETABLE_ORDER;
3507 nr_mfns -= 1UL << PAGETABLE_ORDER;
3509 else
3511 /* Normal page mapping. */
3512 if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) )
3514 pl1e = alloc_xen_pagetable();
3515 clear_page(pl1e);
3516 l2e_write(pl2e, l2e_from_pfn(virt_to_mfn(pl1e),
3517 __PAGE_HYPERVISOR));
3519 else if ( l2e_get_flags(*pl2e) & _PAGE_PSE )
3521 pl1e = alloc_xen_pagetable();
3522 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
3523 l1e_write(&pl1e[i],
3524 l1e_from_pfn(l2e_get_pfn(*pl2e) + i,
3525 l2e_get_flags(*pl2e) & ~_PAGE_PSE));
3526 l2e_write_atomic(pl2e, l2e_from_pfn(virt_to_mfn(pl1e),
3527 __PAGE_HYPERVISOR));
3528 local_flush_tlb_pge();
3531 pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(virt);
3532 ol1e = *pl1e;
3533 l1e_write_atomic(pl1e, l1e_from_pfn(mfn, flags));
3534 if ( (l1e_get_flags(ol1e) & _PAGE_PRESENT) )
3535 local_flush_tlb_one(virt);
3537 virt += 1UL << L1_PAGETABLE_SHIFT;
3538 mfn += 1UL;
3539 nr_mfns -= 1UL;
3543 return 0;
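/*
 * Illustrative internal usage ('virt' and 'maddr' are placeholders):
 * mapping 4MiB of memory at a fixed Xen virtual address might look
 * like
 *
 *     map_pages_to_xen(virt, maddr >> PAGE_SHIFT,
 *                      4UL << (20 - PAGE_SHIFT), __PAGE_HYPERVISOR);
 *
 * Where alignment and size permit (and MAP_SMALL_PAGES is not given),
 * the loop above uses superpage (PSE) mappings; otherwise it shatters
 * the affected L2 entries into 4kB L1 mappings.
 */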
3546 void destroy_xen_mappings(unsigned long s, unsigned long e)
3548 l2_pgentry_t *pl2e;
3549 l1_pgentry_t *pl1e;
3550 unsigned int i;
3551 unsigned long v = s;
3553 ASSERT((s & ~PAGE_MASK) == 0);
3554 ASSERT((e & ~PAGE_MASK) == 0);
3556 while ( v < e )
3558 pl2e = virt_to_xen_l2e(v);
3560 if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) )
3562 v += 1UL << L2_PAGETABLE_SHIFT;
3563 v &= ~((1UL << L2_PAGETABLE_SHIFT) - 1);
3564 continue;
3567 if ( l2e_get_flags(*pl2e) & _PAGE_PSE )
3569 if ( (l1_table_offset(v) == 0) &&
3570 ((e-v) >= (1UL << L2_PAGETABLE_SHIFT)) )
3572 /* PSE: whole superpage is destroyed. */
3573 l2e_write_atomic(pl2e, l2e_empty());
3574 v += 1UL << L2_PAGETABLE_SHIFT;
3576 else
3578 /* PSE: shatter the superpage and try again. */
3579 pl1e = alloc_xen_pagetable();
3580 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
3581 l1e_write(&pl1e[i],
3582 l1e_from_pfn(l2e_get_pfn(*pl2e) + i,
3583 l2e_get_flags(*pl2e) & ~_PAGE_PSE));
3584 l2e_write_atomic(pl2e, l2e_from_pfn(virt_to_mfn(pl1e),
3585 __PAGE_HYPERVISOR));
3588 else
3590 /* Ordinary 4kB mapping. */
3591 pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(v);
3592 l1e_write_atomic(pl1e, l1e_empty());
3593 v += PAGE_SIZE;
3595 /* If we are done with the L2E, check if it is now empty. */
3596 if ( (v != e) && (l1_table_offset(v) != 0) )
3597 continue;
3598 pl1e = l2e_to_l1e(*pl2e);
3599 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
3600 if ( l1e_get_intpte(pl1e[i]) != 0 )
3601 break;
3602 if ( i == L1_PAGETABLE_ENTRIES )
3604 /* Empty: zap the L2E and free the L1 page. */
3605 l2e_write_atomic(pl2e, l2e_empty());
3606 free_xen_pagetable(pl1e);
3611 flush_tlb_all_pge();
3614 void __set_fixmap(
3615 enum fixed_addresses idx, unsigned long mfn, unsigned long flags)
3617 BUG_ON(idx >= __end_of_fixed_addresses);
3618 map_pages_to_xen(fix_to_virt(idx), mfn, 1, flags);
3621 #ifdef MEMORY_GUARD
3623 void memguard_init(void)
3625 map_pages_to_xen(
3626 (unsigned long)__va(xen_phys_start),
3627 xen_phys_start >> PAGE_SHIFT,
3628 (xenheap_phys_end - xen_phys_start) >> PAGE_SHIFT,
3629 __PAGE_HYPERVISOR|MAP_SMALL_PAGES);
3630 #ifdef __x86_64__
3631 map_pages_to_xen(
3632 XEN_VIRT_START,
3633 xen_phys_start >> PAGE_SHIFT,
3634 (__pa(&_end) + PAGE_SIZE - 1 - xen_phys_start) >> PAGE_SHIFT,
3635 __PAGE_HYPERVISOR|MAP_SMALL_PAGES);
3636 #endif
3639 static void __memguard_change_range(void *p, unsigned long l, int guard)
3641 unsigned long _p = (unsigned long)p;
3642 unsigned long _l = (unsigned long)l;
3643 unsigned long flags = __PAGE_HYPERVISOR | MAP_SMALL_PAGES;
3645 /* Ensure we are dealing with a page-aligned whole number of pages. */
3646 ASSERT((_p&PAGE_MASK) != 0);
3647 ASSERT((_l&PAGE_MASK) != 0);
3648 ASSERT((_p&~PAGE_MASK) == 0);
3649 ASSERT((_l&~PAGE_MASK) == 0);
3651 if ( guard )
3652 flags &= ~_PAGE_PRESENT;
3654 map_pages_to_xen(
3655 _p, virt_to_maddr(p) >> PAGE_SHIFT, _l >> PAGE_SHIFT, flags);
3658 void memguard_guard_range(void *p, unsigned long l)
3660 __memguard_change_range(p, l, 1);
3663 void memguard_unguard_range(void *p, unsigned long l)
3665 __memguard_change_range(p, l, 0);
3668 #endif
3670 void memguard_guard_stack(void *p)
3672 BUILD_BUG_ON((PRIMARY_STACK_SIZE + PAGE_SIZE) > STACK_SIZE);
3673 p = (void *)((unsigned long)p + STACK_SIZE -
3674 PRIMARY_STACK_SIZE - PAGE_SIZE);
3675 memguard_guard_range(p, PAGE_SIZE);
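/*
 * The guard page sits immediately below the primary stack area within
 * each CPU stack allocation: __memguard_change_range() remaps it with
 * _PAGE_PRESENT clear, so a stack overflow faults immediately instead
 * of silently corrupting adjacent data.
 */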
3678 /*
3679 * Local variables:
3680 * mode: C
3681 * c-set-style: "BSD"
3682 * c-basic-offset: 4
3683 * tab-width: 4
3684 * indent-tabs-mode: nil
3685 * End:
3686 */