xen/arch/x86/mm.c @ 15454:83cbda5c1e1b (direct-io.hg)

changeset:  x86-64: bump STACK_SIZE to 32 so that trampoline and IST stacks
            fit without undue squeezing.
            Signed-off-by: Jan Beulich <jbeulich@novell.com>
author:     kfraser@localhost.localdomain
date:       Tue Jul 03 11:41:25 2007 +0100
parents:    a5360bf18668
children:   08bcc54aee8e
1 /******************************************************************************
2 * arch/x86/mm.c
3 *
4 * Copyright (c) 2002-2005 K A Fraser
5 * Copyright (c) 2004 Christian Limpach
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
22 /*
23 * A description of the x86 page table API:
24 *
25 * Domains trap to do_mmu_update with a list of update requests.
26 * This is a list of (ptr, val) pairs, where the requested operation
27 * is *ptr = val.
28 *
29 * Reference counting of pages:
30 * ----------------------------
31 * Each page has two refcounts: tot_count and type_count.
32 *
33 * TOT_COUNT is the obvious reference count. It counts all uses of a
34 * physical page frame by a domain, including uses as a page directory,
35 * a page table, or simple mappings via a PTE. This count prevents a
36 * domain from releasing a frame back to the free pool when it still holds
37 * a reference to it.
38 *
39 * TYPE_COUNT is more subtle. A frame can be put to one of three
40 * mutually-exclusive uses: it might be used as a page directory, or a
41 * page table, or it may be mapped writable by the domain [of course, a
42 * frame may also be used in none of these three ways!].
43 * So, type_count is a count of the number of times a frame is being
44 * referred to in its current incarnation. Therefore, a page can only
45 * change its type when its type count is zero.
46 *
47 * Pinning the page type:
48 * ----------------------
49 * The type of a page can be pinned/unpinned with the commands
50 * MMUEXT_[UN]PIN_L?_TABLE. Each page can be pinned exactly once (that is,
51 * pinning is not reference counted, so it can't be nested).
52 * This is useful to prevent a page's type count falling to zero, at which
53 * point safety checks would need to be carried out next time the count
54 * is increased again.
55 *
56 * A further note on writable page mappings:
57 * -----------------------------------------
58 * For simplicity, the count of writable mappings for a page may not
59 * correspond to reality. The 'writable count' is incremented for every
60 * PTE which maps the page with the _PAGE_RW flag set. However, for
61 * write access to be possible the page directory entry must also have
62 * its _PAGE_RW bit set. We do not check this as it complicates the
63 * reference counting considerably [consider the case of multiple
64 * directory entries referencing a single page table, some with the RW
65 * bit set, others not -- it starts getting a bit messy].
66 * In normal use, this simplification shouldn't be a problem.
67 * However, the logic can be added if required.
68 *
69 * One more note on read-only page mappings:
70 * -----------------------------------------
71 * We want domains to be able to map pages for read-only access. The
72 * main reason is that page tables and directories should be readable
73 * by a domain, but it would not be safe for them to be writable.
74 * However, domains have free access to rings 1 & 2 of the Intel
75 * privilege model. In terms of page protection, these are considered
76 * to be part of 'supervisor mode'. The WP bit in CR0 controls whether
77 * read-only restrictions are respected in supervisor mode -- if the
78 * bit is clear then any mapped page is writable.
79 *
80 * We get around this by always setting the WP bit and disallowing
81 * updates to it. This is very unlikely to cause a problem for guest
82 * OSes, which will generally use the WP bit to simplify their copy-on-write
83 * implementation (in that case, the OS wants a fault when it writes to
84 * an application-supplied buffer).
85 */
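/*
 * A minimal, illustrative guest-side sketch of the API described above
 * (not part of this file and not compiled, hence the "#if 0").
 * struct mmu_update, MMU_NORMAL_PT_UPDATE and DOMID_SELF come from the
 * public interface; HYPERVISOR_mmu_update() is the usual guest hypercall
 * wrapper, and example_set_pte() is a hypothetical helper name. Xen
 * validates the request in do_mmu_update() and adjusts tot_count /
 * type_count as described above; pinning instead uses the
 * MMUEXT_[UN]PIN_L?_TABLE commands of do_mmuext_op().
 */
#if 0
static int example_set_pte(uint64_t pte_machine_addr, uint64_t new_val)
{
    struct mmu_update req;
    int success_count = 0;

    /* One (ptr, val) pair: request "*ptr = val" on our own page tables. */
    req.ptr = pte_machine_addr | MMU_NORMAL_PT_UPDATE;
    req.val = new_val;

    if ( HYPERVISOR_mmu_update(&req, 1, &success_count, DOMID_SELF) < 0 )
        return -1;

    return (success_count == 1) ? 0 : -1;
}
#endif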
87 #include <xen/config.h>
88 #include <xen/init.h>
89 #include <xen/kernel.h>
90 #include <xen/lib.h>
91 #include <xen/mm.h>
92 #include <xen/domain.h>
93 #include <xen/sched.h>
94 #include <xen/errno.h>
95 #include <xen/perfc.h>
96 #include <xen/irq.h>
97 #include <xen/softirq.h>
98 #include <xen/domain_page.h>
99 #include <xen/event.h>
100 #include <xen/iocap.h>
101 #include <xen/guest_access.h>
102 #include <asm/paging.h>
103 #include <asm/shadow.h>
104 #include <asm/page.h>
105 #include <asm/flushtlb.h>
106 #include <asm/io.h>
107 #include <asm/ldt.h>
108 #include <asm/x86_emulate.h>
109 #include <asm/e820.h>
110 #include <asm/hypercall.h>
111 #include <asm/shared.h>
112 #include <public/memory.h>
114 #define MEM_LOG(_f, _a...) gdprintk(XENLOG_WARNING , _f "\n" , ## _a)
116 /*
117 * PTE updates can be done with ordinary writes except:
118 * 1. Debug builds get extra checking by using CMPXCHG[8B].
119 * 2. PAE builds perform an atomic 8-byte store with CMPXCHG8B.
120 */
121 #if !defined(NDEBUG) || defined(CONFIG_X86_PAE)
122 #define PTE_UPDATE_WITH_CMPXCHG
123 #endif
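/*
 * Illustrative sketch of the two update styles selected above (not
 * compiled). With PTE_UPDATE_WITH_CMPXCHG the old value is compared and
 * swapped, so a concurrently-set Accessed/Dirty bit is noticed rather than
 * silently overwritten, and the 8-byte PAE entry is written atomically.
 * This is a simplified, non-retrying version of what update_intpte()
 * below does; example_pte_write() is a hypothetical name.
 */
#if 0
static void example_pte_write(intpte_t *p, intpte_t old, intpte_t new)
{
#ifndef PTE_UPDATE_WITH_CMPXCHG
    *p = new;                              /* plain store */
#else
    intpte_t seen = cmpxchg(p, old, new);  /* atomic compare-and-swap */
    /* Only the hardware-managed A/D bits may legitimately differ. */
    BUG_ON((seen ^ old) & ~(intpte_t)(_PAGE_ACCESSED|_PAGE_DIRTY));
#endif
}
#endif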
125 /* Used to defer flushing of memory structures. */
126 struct percpu_mm_info {
127 #define DOP_FLUSH_TLB (1<<0) /* Flush the local TLB. */
128 #define DOP_FLUSH_ALL_TLBS (1<<1) /* Flush TLBs of all VCPUs of current dom. */
129 #define DOP_RELOAD_LDT (1<<2) /* Reload the LDT shadow mapping. */
130 unsigned int deferred_ops;
131 /* If non-NULL, specifies a foreign subject domain for some operations. */
132 struct domain *foreign;
133 };
134 static DEFINE_PER_CPU(struct percpu_mm_info, percpu_mm_info);
136 /*
137 * Returns the current foreign domain; defaults to the currently-executing
138 * domain if a foreign override hasn't been specified.
139 */
140 #define FOREIGNDOM (this_cpu(percpu_mm_info).foreign ?: current->domain)
142 /* Private domain structs for DOMID_XEN and DOMID_IO. */
143 static struct domain *dom_xen, *dom_io;
145 /* Frame table and its size in pages. */
146 struct page_info *frame_table;
147 unsigned long max_page;
148 unsigned long total_pages;
150 #ifdef CONFIG_COMPAT
151 l2_pgentry_t *compat_idle_pg_table_l2 = NULL;
152 #define l3_disallow_mask(d) (!is_pv_32on64_domain(d) ? \
153 L3_DISALLOW_MASK : \
154 COMPAT_L3_DISALLOW_MASK)
155 #else
156 #define l3_disallow_mask(d) L3_DISALLOW_MASK
157 #endif
159 static void queue_deferred_ops(struct domain *d, unsigned int ops)
160 {
161 ASSERT(d == current->domain);
162 this_cpu(percpu_mm_info).deferred_ops |= ops;
163 }
165 void __init init_frametable(void)
166 {
167 unsigned long nr_pages, page_step, i, mfn;
169 frame_table = (struct page_info *)FRAMETABLE_VIRT_START;
171 nr_pages = PFN_UP(max_page * sizeof(*frame_table));
172 page_step = (1 << L2_PAGETABLE_SHIFT) >> PAGE_SHIFT;
174 for ( i = 0; i < nr_pages; i += page_step )
175 {
176 mfn = alloc_boot_pages(min(nr_pages - i, page_step), page_step);
177 if ( mfn == 0 )
178 panic("Not enough memory for frame table\n");
179 map_pages_to_xen(
180 FRAMETABLE_VIRT_START + (i << PAGE_SHIFT),
181 mfn, page_step, PAGE_HYPERVISOR);
182 }
184 memset(frame_table, 0, nr_pages << PAGE_SHIFT);
185 }
187 void __init arch_init_memory(void)
188 {
189 extern void subarch_init_memory(void);
191 unsigned long i, pfn, rstart_pfn, rend_pfn;
193 /*
194 * Initialise our DOMID_XEN domain.
195 * Any Xen-heap pages that we will allow to be mapped will have
196 * their domain field set to dom_xen.
197 */
198 dom_xen = alloc_domain(DOMID_XEN);
199 BUG_ON(dom_xen == NULL);
201 /*
202 * Initialise our DOMID_IO domain.
203 * This domain owns I/O pages that are within the range of the page_info
204 * array. Mappings occur at the privilege level of the caller.
205 */
206 dom_io = alloc_domain(DOMID_IO);
207 BUG_ON(dom_io == NULL);
209 /* First 1MB of RAM is historically marked as I/O. */
210 for ( i = 0; i < 0x100; i++ )
211 share_xen_page_with_guest(mfn_to_page(i), dom_io, XENSHARE_writable);
213 /* Any areas not specified as RAM by the e820 map are considered I/O. */
214 for ( i = 0, pfn = 0; i < e820.nr_map; i++ )
215 {
216 if ( e820.map[i].type != E820_RAM )
217 continue;
218 /* Every page from cursor to start of next RAM region is I/O. */
219 rstart_pfn = PFN_UP(e820.map[i].addr);
220 rend_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
221 for ( ; pfn < rstart_pfn; pfn++ )
222 {
223 BUG_ON(!mfn_valid(pfn));
224 share_xen_page_with_guest(
225 mfn_to_page(pfn), dom_io, XENSHARE_writable);
226 }
227 /* Skip the RAM region. */
228 pfn = rend_pfn;
229 }
230 BUG_ON(pfn != max_page);
232 subarch_init_memory();
233 }
235 int memory_is_conventional_ram(paddr_t p)
236 {
237 int i;
239 for ( i = 0; i < e820.nr_map; i++ )
240 {
241 if ( (e820.map[i].type == E820_RAM) &&
242 (e820.map[i].addr <= p) &&
243 (e820.map[i].size > p) )
244 return 1;
245 }
247 return 0;
248 }
250 unsigned long domain_get_maximum_gpfn(struct domain *d)
251 {
252 if ( is_hvm_domain(d) )
253 return d->arch.p2m.max_mapped_pfn;
254 /* NB. PV guests specify nr_pfns rather than max_pfn so we adjust here. */
255 return arch_get_max_pfn(d) - 1;
256 }
258 void share_xen_page_with_guest(
259 struct page_info *page, struct domain *d, int readonly)
260 {
261 if ( page_get_owner(page) == d )
262 return;
264 set_gpfn_from_mfn(page_to_mfn(page), INVALID_M2P_ENTRY);
266 spin_lock(&d->page_alloc_lock);
268 /* The incremented type count pins the page as writable or read-only. */
269 page->u.inuse.type_info = (readonly ? PGT_none : PGT_writable_page);
270 page->u.inuse.type_info |= PGT_validated | 1;
272 page_set_owner(page, d);
273 wmb(); /* install valid domain ptr before updating refcnt. */
274 ASSERT(page->count_info == 0);
276 /* Only add to the allocation list if the domain isn't dying. */
277 if ( !d->is_dying )
278 {
279 page->count_info |= PGC_allocated | 1;
280 if ( unlikely(d->xenheap_pages++ == 0) )
281 get_knownalive_domain(d);
282 list_add_tail(&page->list, &d->xenpage_list);
283 }
285 spin_unlock(&d->page_alloc_lock);
286 }
288 void share_xen_page_with_privileged_guests(
289 struct page_info *page, int readonly)
290 {
291 share_xen_page_with_guest(page, dom_xen, readonly);
292 }
294 #if defined(CONFIG_X86_PAE)
296 #ifdef NDEBUG
297 /* Only PDPTs above 4GB boundary need to be shadowed in low memory. */
298 #define l3tab_needs_shadow(mfn) ((mfn) >= 0x100000)
299 #else
300 /*
301 * In debug builds we shadow a selection of <4GB PDPTs to exercise code paths.
302 * We cannot safely shadow the idle page table, nor shadow (v1) page tables
303 * (detected by lack of an owning domain). As required for correctness, we
304 * always shadow PDPTs above 4GB.
305 */
306 #define l3tab_needs_shadow(mfn) \
307 (((((mfn) << PAGE_SHIFT) != __pa(idle_pg_table)) && \
308 (page_get_owner(mfn_to_page(mfn)) != NULL) && \
309 ((mfn) & 1)) || /* odd MFNs are shadowed */ \
310 ((mfn) >= 0x100000))
311 #endif
313 static l1_pgentry_t *fix_pae_highmem_pl1e;
315 /* Cache the address of PAE high-memory fixmap page tables. */
316 static int __init cache_pae_fixmap_address(void)
317 {
318 unsigned long fixmap_base = fix_to_virt(FIX_PAE_HIGHMEM_0);
319 l2_pgentry_t *pl2e = virt_to_xen_l2e(fixmap_base);
320 fix_pae_highmem_pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(fixmap_base);
321 return 0;
322 }
323 __initcall(cache_pae_fixmap_address);
325 static DEFINE_PER_CPU(u32, make_cr3_timestamp);
327 void make_cr3(struct vcpu *v, unsigned long mfn)
328 /* Takes the MFN of a PAE l3 table, copies the contents to below 4GB if
329 * necessary, and sets v->arch.cr3 to the value to load in CR3. */
330 {
331 l3_pgentry_t *highmem_l3tab, *lowmem_l3tab;
332 struct pae_l3_cache *cache = &v->arch.pae_l3_cache;
333 unsigned int cpu = smp_processor_id();
335 /* Fast path: does this mfn need a shadow at all? */
336 if ( !l3tab_needs_shadow(mfn) )
337 {
338 v->arch.cr3 = mfn << PAGE_SHIFT;
339 /* Cache is no longer in use or valid */
340 cache->high_mfn = 0;
341 return;
342 }
344 /* Caching logic is not interrupt safe. */
345 ASSERT(!in_irq());
347 /* Protects against pae_flush_pgd(). */
348 spin_lock(&cache->lock);
350 cache->inuse_idx ^= 1;
351 cache->high_mfn = mfn;
353 /* Map the guest L3 table and copy to the chosen low-memory cache. */
354 l1e_write(fix_pae_highmem_pl1e-cpu, l1e_from_pfn(mfn, __PAGE_HYPERVISOR));
355 /* First check the previous high mapping can't be in the TLB.
356 * (i.e. have we loaded CR3 since we last did this?) */
357 if ( unlikely(this_cpu(make_cr3_timestamp) == this_cpu(tlbflush_time)) )
358 local_flush_tlb_one(fix_to_virt(FIX_PAE_HIGHMEM_0 + cpu));
359 highmem_l3tab = (l3_pgentry_t *)fix_to_virt(FIX_PAE_HIGHMEM_0 + cpu);
360 lowmem_l3tab = cache->table[cache->inuse_idx];
361 memcpy(lowmem_l3tab, highmem_l3tab, sizeof(cache->table[0]));
362 l1e_write(fix_pae_highmem_pl1e-cpu, l1e_empty());
363 this_cpu(make_cr3_timestamp) = this_cpu(tlbflush_time);
365 v->arch.cr3 = __pa(lowmem_l3tab);
367 spin_unlock(&cache->lock);
368 }
370 #else /* !CONFIG_X86_PAE */
372 void make_cr3(struct vcpu *v, unsigned long mfn)
373 {
374 v->arch.cr3 = mfn << PAGE_SHIFT;
375 }
377 #endif /* !CONFIG_X86_PAE */
379 void write_ptbase(struct vcpu *v)
380 {
381 write_cr3(v->arch.cr3);
382 }
384 /* Should be called after CR3 is updated.
385 * Updates vcpu->arch.cr3 and, for HVM guests, vcpu->arch.hvm_vcpu.cpu_cr3.
386 *
387 * Uses values found in vcpu->arch.(guest_table and guest_table_user), and
388 * for HVM guests, arch.monitor_table and hvm's guest CR3.
389 *
390 * Update ref counts to shadow tables appropriately.
391 */
392 void update_cr3(struct vcpu *v)
393 {
394 unsigned long cr3_mfn=0;
396 if ( paging_mode_enabled(v->domain) )
397 {
398 paging_update_cr3(v);
399 return;
400 }
402 #if CONFIG_PAGING_LEVELS == 4
403 if ( !(v->arch.flags & TF_kernel_mode) )
404 cr3_mfn = pagetable_get_pfn(v->arch.guest_table_user);
405 else
406 #endif
407 cr3_mfn = pagetable_get_pfn(v->arch.guest_table);
409 make_cr3(v, cr3_mfn);
410 }
413 static void invalidate_shadow_ldt(struct vcpu *v)
414 {
415 int i;
416 unsigned long pfn;
417 struct page_info *page;
419 if ( v->arch.shadow_ldt_mapcnt == 0 )
420 return;
422 v->arch.shadow_ldt_mapcnt = 0;
424 for ( i = 16; i < 32; i++ )
425 {
426 pfn = l1e_get_pfn(v->arch.perdomain_ptes[i]);
427 if ( pfn == 0 ) continue;
428 l1e_write(&v->arch.perdomain_ptes[i], l1e_empty());
429 page = mfn_to_page(pfn);
430 ASSERT_PAGE_IS_TYPE(page, PGT_ldt_page);
431 ASSERT_PAGE_IS_DOMAIN(page, v->domain);
432 put_page_and_type(page);
433 }
435 /* Dispose of the (now possibly invalid) mappings from the TLB. */
436 if ( v == current )
437 queue_deferred_ops(v->domain, DOP_FLUSH_TLB | DOP_RELOAD_LDT);
438 else
439 flush_tlb_mask(v->domain->domain_dirty_cpumask);
440 }
443 static int alloc_segdesc_page(struct page_info *page)
444 {
445 struct desc_struct *descs;
446 int i;
448 descs = map_domain_page(page_to_mfn(page));
450 for ( i = 0; i < 512; i++ )
451 if ( unlikely(!check_descriptor(page_get_owner(page), &descs[i])) )
452 goto fail;
454 unmap_domain_page(descs);
455 return 1;
457 fail:
458 unmap_domain_page(descs);
459 return 0;
460 }
463 /* Map shadow page at offset @off. */
464 int map_ldt_shadow_page(unsigned int off)
465 {
466 struct vcpu *v = current;
467 struct domain *d = v->domain;
468 unsigned long gmfn, mfn;
469 l1_pgentry_t l1e, nl1e;
470 unsigned long gva = v->arch.guest_context.ldt_base + (off << PAGE_SHIFT);
471 int okay;
473 BUG_ON(unlikely(in_irq()));
475 guest_get_eff_kern_l1e(v, gva, &l1e);
476 if ( unlikely(!(l1e_get_flags(l1e) & _PAGE_PRESENT)) )
477 return 0;
479 gmfn = l1e_get_pfn(l1e);
480 mfn = gmfn_to_mfn(d, gmfn);
481 if ( unlikely(!mfn_valid(mfn)) )
482 return 0;
484 okay = get_page_and_type(mfn_to_page(mfn), d, PGT_ldt_page);
485 if ( unlikely(!okay) )
486 return 0;
488 nl1e = l1e_from_pfn(mfn, l1e_get_flags(l1e) | _PAGE_RW);
490 l1e_write(&v->arch.perdomain_ptes[off + 16], nl1e);
491 v->arch.shadow_ldt_mapcnt++;
493 return 1;
494 }
497 static int get_page_from_pagenr(unsigned long page_nr, struct domain *d)
498 {
499 struct page_info *page = mfn_to_page(page_nr);
501 if ( unlikely(!mfn_valid(page_nr)) || unlikely(!get_page(page, d)) )
502 {
503 MEM_LOG("Could not get page ref for pfn %lx", page_nr);
504 return 0;
505 }
507 return 1;
508 }
511 static int get_page_and_type_from_pagenr(unsigned long page_nr,
512 unsigned long type,
513 struct domain *d)
514 {
515 struct page_info *page = mfn_to_page(page_nr);
517 if ( unlikely(!get_page_from_pagenr(page_nr, d)) )
518 return 0;
520 if ( unlikely(!get_page_type(page, type)) )
521 {
522 put_page(page);
523 return 0;
524 }
526 return 1;
527 }
529 /*
530 * We allow root tables to map each other (a.k.a. linear page tables). It
531 * needs some special care with reference counts and access permissions:
532 * 1. The mapping entry must be read-only, or the guest may get write access
533 * to its own PTEs.
534 * 2. We must only bump the reference counts for an *already validated*
535 * L2 table, or we can end up in a deadlock in get_page_type() by waiting
536 * on a validation that is required to complete that validation.
537 * 3. We only need to increment the reference counts for the mapped page
538 * frame if it is mapped by a different root table. This is sufficient and
539 * also necessary to allow validation of a root table mapping itself.
540 */
541 #define define_get_linear_pagetable(level) \
542 static int \
543 get_##level##_linear_pagetable( \
544 level##_pgentry_t pde, unsigned long pde_pfn, struct domain *d) \
545 { \
546 unsigned long x, y; \
547 struct page_info *page; \
548 unsigned long pfn; \
549 \
550 if ( (level##e_get_flags(pde) & _PAGE_RW) ) \
551 { \
552 MEM_LOG("Attempt to create linear p.t. with write perms"); \
553 return 0; \
554 } \
555 \
556 if ( (pfn = level##e_get_pfn(pde)) != pde_pfn ) \
557 { \
558 /* Make sure the mapped frame belongs to the correct domain. */ \
559 if ( unlikely(!get_page_from_pagenr(pfn, d)) ) \
560 return 0; \
561 \
562 /* \
563 * Ensure that the mapped frame is an already-validated page table. \
564 * If so, atomically increment the count (checking for overflow). \
565 */ \
566 page = mfn_to_page(pfn); \
567 y = page->u.inuse.type_info; \
568 do { \
569 x = y; \
570 if ( unlikely((x & PGT_count_mask) == PGT_count_mask) || \
571 unlikely((x & (PGT_type_mask|PGT_validated)) != \
572 (PGT_##level##_page_table|PGT_validated)) ) \
573 { \
574 put_page(page); \
575 return 0; \
576 } \
577 } \
578 while ( (y = cmpxchg(&page->u.inuse.type_info, x, x + 1)) != x ); \
579 } \
580 \
581 return 1; \
582 }
584 int
585 get_page_from_l1e(
586 l1_pgentry_t l1e, struct domain *d)
587 {
588 unsigned long mfn = l1e_get_pfn(l1e);
589 struct page_info *page = mfn_to_page(mfn);
590 int okay;
592 if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) )
593 return 1;
595 if ( unlikely(l1e_get_flags(l1e) & L1_DISALLOW_MASK) )
596 {
597 MEM_LOG("Bad L1 flags %x", l1e_get_flags(l1e) & L1_DISALLOW_MASK);
598 return 0;
599 }
601 if ( unlikely(!mfn_valid(mfn)) ||
602 unlikely(page_get_owner(page) == dom_io) )
603 {
604 /* DOMID_IO reverts to caller for privilege checks. */
605 if ( d == dom_io )
606 d = current->domain;
608 if ( !iomem_access_permitted(d, mfn, mfn) )
609 {
610 if ( mfn != (PADDR_MASK >> PAGE_SHIFT) ) /* INVALID_MFN? */
611 MEM_LOG("Non-privileged (%u) attempt to map I/O space %08lx",
612 d->domain_id, mfn);
613 return 0;
614 }
616 /* No reference counting for out-of-range I/O pages. */
617 if ( !mfn_valid(mfn) )
618 return 1;
620 d = dom_io;
621 }
623 /* Foreign mappings into guests in shadow external mode don't
624 * contribute to writeable mapping refcounts. (This allows the
625 * qemu-dm helper process in dom0 to map the domain's memory without
626 * messing up the count of "real" writable mappings.) */
627 okay = (((l1e_get_flags(l1e) & _PAGE_RW) &&
628 !(unlikely(paging_mode_external(d) && (d != current->domain))))
629 ? get_page_and_type(page, d, PGT_writable_page)
630 : get_page(page, d));
631 if ( !okay )
632 {
633 MEM_LOG("Error getting mfn %lx (pfn %lx) from L1 entry %" PRIpte
634 " for dom%d",
635 mfn, get_gpfn_from_mfn(mfn),
636 l1e_get_intpte(l1e), d->domain_id);
637 }
639 return okay;
640 }
643 /* NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. */
644 define_get_linear_pagetable(l2);
645 static int
646 get_page_from_l2e(
647 l2_pgentry_t l2e, unsigned long pfn, struct domain *d)
648 {
649 int rc;
651 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
652 return 1;
654 if ( unlikely((l2e_get_flags(l2e) & L2_DISALLOW_MASK)) )
655 {
656 MEM_LOG("Bad L2 flags %x", l2e_get_flags(l2e) & L2_DISALLOW_MASK);
657 return 0;
658 }
660 rc = get_page_and_type_from_pagenr(l2e_get_pfn(l2e), PGT_l1_page_table, d);
661 if ( unlikely(!rc) )
662 rc = get_l2_linear_pagetable(l2e, pfn, d);
664 return rc;
665 }
668 #if CONFIG_PAGING_LEVELS >= 3
669 define_get_linear_pagetable(l3);
670 static int
671 get_page_from_l3e(
672 l3_pgentry_t l3e, unsigned long pfn, struct domain *d)
673 {
674 int rc;
676 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
677 return 1;
679 if ( unlikely((l3e_get_flags(l3e) & l3_disallow_mask(d))) )
680 {
681 MEM_LOG("Bad L3 flags %x", l3e_get_flags(l3e) & l3_disallow_mask(d));
682 return 0;
683 }
685 rc = get_page_and_type_from_pagenr(l3e_get_pfn(l3e), PGT_l2_page_table, d);
686 if ( unlikely(!rc) )
687 rc = get_l3_linear_pagetable(l3e, pfn, d);
689 return rc;
690 }
691 #endif /* 3 level */
693 #if CONFIG_PAGING_LEVELS >= 4
694 define_get_linear_pagetable(l4);
695 static int
696 get_page_from_l4e(
697 l4_pgentry_t l4e, unsigned long pfn, struct domain *d)
698 {
699 int rc;
701 if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
702 return 1;
704 if ( unlikely((l4e_get_flags(l4e) & L4_DISALLOW_MASK)) )
705 {
706 MEM_LOG("Bad L4 flags %x", l4e_get_flags(l4e) & L4_DISALLOW_MASK);
707 return 0;
708 }
710 rc = get_page_and_type_from_pagenr(l4e_get_pfn(l4e), PGT_l3_page_table, d);
711 if ( unlikely(!rc) )
712 rc = get_l4_linear_pagetable(l4e, pfn, d);
714 return rc;
715 }
716 #endif /* 4 level */
718 #ifdef __x86_64__
720 #ifdef USER_MAPPINGS_ARE_GLOBAL
721 #define adjust_guest_l1e(pl1e, d) \
722 do { \
723 if ( likely(l1e_get_flags((pl1e)) & _PAGE_PRESENT) && \
724 likely(!is_pv_32on64_domain(d)) ) \
725 { \
726 /* _PAGE_GUEST_KERNEL page cannot have the Global bit set. */ \
727 if ( (l1e_get_flags((pl1e)) & (_PAGE_GUEST_KERNEL|_PAGE_GLOBAL)) \
728 == (_PAGE_GUEST_KERNEL|_PAGE_GLOBAL) ) \
729 MEM_LOG("Global bit is set to kernel page %lx", \
730 l1e_get_pfn((pl1e))); \
731 if ( !(l1e_get_flags((pl1e)) & _PAGE_USER) ) \
732 l1e_add_flags((pl1e), (_PAGE_GUEST_KERNEL|_PAGE_USER)); \
733 if ( !(l1e_get_flags((pl1e)) & _PAGE_GUEST_KERNEL) ) \
734 l1e_add_flags((pl1e), (_PAGE_GLOBAL|_PAGE_USER)); \
735 } \
736 } while ( 0 )
737 #else
738 #define adjust_guest_l1e(pl1e, d) \
739 do { \
740 if ( likely(l1e_get_flags((pl1e)) & _PAGE_PRESENT) && \
741 likely(!is_pv_32on64_domain(d)) ) \
742 l1e_add_flags((pl1e), _PAGE_USER); \
743 } while ( 0 )
744 #endif
746 #define adjust_guest_l2e(pl2e, d) \
747 do { \
748 if ( likely(l2e_get_flags((pl2e)) & _PAGE_PRESENT) && \
749 likely(!is_pv_32on64_domain(d)) ) \
750 l2e_add_flags((pl2e), _PAGE_USER); \
751 } while ( 0 )
753 #define adjust_guest_l3e(pl3e, d) \
754 do { \
755 if ( likely(l3e_get_flags((pl3e)) & _PAGE_PRESENT) ) \
756 l3e_add_flags((pl3e), likely(!is_pv_32on64_domain(d)) ? \
757 _PAGE_USER : \
758 _PAGE_USER|_PAGE_RW); \
759 } while ( 0 )
761 #define adjust_guest_l4e(pl4e, d) \
762 do { \
763 if ( likely(l4e_get_flags((pl4e)) & _PAGE_PRESENT) && \
764 likely(!is_pv_32on64_domain(d)) ) \
765 l4e_add_flags((pl4e), _PAGE_USER); \
766 } while ( 0 )
768 #else /* !defined(__x86_64__) */
770 #define adjust_guest_l1e(_p, _d) ((void)(_d))
771 #define adjust_guest_l2e(_p, _d) ((void)(_d))
772 #define adjust_guest_l3e(_p, _d) ((void)(_d))
774 #endif
776 #ifdef CONFIG_COMPAT
777 #define unadjust_guest_l3e(pl3e, d) \
778 do { \
779 if ( unlikely(is_pv_32on64_domain(d)) && \
780 likely(l3e_get_flags((pl3e)) & _PAGE_PRESENT) ) \
781 l3e_remove_flags((pl3e), _PAGE_USER|_PAGE_RW|_PAGE_ACCESSED); \
782 } while ( 0 )
783 #else
784 #define unadjust_guest_l3e(_p, _d) ((void)(_d))
785 #endif
787 void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d)
788 {
789 unsigned long pfn = l1e_get_pfn(l1e);
790 struct page_info *page = mfn_to_page(pfn);
791 struct domain *e;
792 struct vcpu *v;
794 if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) || !mfn_valid(pfn) )
795 return;
797 e = page_get_owner(page);
799 /*
800 * Check if this is a mapping that was established via a grant reference.
801 * If it was then we should not be here: we require that such mappings are
802 * explicitly destroyed via the grant-table interface.
803 *
804 * The upshot of this is that the guest can end up with active grants that
805 * it cannot destroy (because it no longer has a PTE to present to the
806 * grant-table interface). This can lead to subtle hard-to-catch bugs,
807 * hence a special grant PTE flag can be enabled to catch the bug early.
808 *
809 * (Note that the undestroyable active grants are not a security hole in
810 * Xen. All active grants can safely be cleaned up when the domain dies.)
811 */
812 if ( (l1e_get_flags(l1e) & _PAGE_GNTTAB) &&
813 !d->is_shutting_down && !d->is_dying )
814 {
815 MEM_LOG("Attempt to implicitly unmap a granted PTE %" PRIpte,
816 l1e_get_intpte(l1e));
817 domain_crash(d);
818 }
820 /* Remember we didn't take a type-count of foreign writable mappings
821 * to paging-external domains */
822 if ( (l1e_get_flags(l1e) & _PAGE_RW) &&
823 !(unlikely((e != d) && paging_mode_external(e))) )
824 {
825 put_page_and_type(page);
826 }
827 else
828 {
829 /* We expect this is rare so we blow the entire shadow LDT. */
830 if ( unlikely(((page->u.inuse.type_info & PGT_type_mask) ==
831 PGT_ldt_page)) &&
832 unlikely(((page->u.inuse.type_info & PGT_count_mask) != 0)) &&
833 (d == e) )
834 {
835 for_each_vcpu ( d, v )
836 invalidate_shadow_ldt(v);
837 }
838 put_page(page);
839 }
840 }
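/*
 * Illustrative guest-side sketch related to the grant-table note above
 * (not part of this file, not compiled): a grant mapping should be torn
 * down with GNTTABOP_unmap_grant_ref rather than by overwriting the PTE.
 * struct gnttab_unmap_grant_ref and GNTST_okay come from the public
 * headers; HYPERVISOR_grant_table_op() is the usual guest hypercall
 * wrapper, and example_unmap_grant() is a hypothetical helper name.
 */
#if 0
static int example_unmap_grant(uint64_t host_addr, grant_handle_t handle)
{
    struct gnttab_unmap_grant_ref unmap;

    unmap.host_addr    = host_addr; /* address the grant was mapped at   */
    unmap.dev_bus_addr = 0;         /* no device (bus) mapping to remove */
    unmap.handle       = handle;    /* handle returned by the map op     */

    if ( HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &unmap, 1) )
        return -1;

    return (unmap.status == GNTST_okay) ? 0 : -1;
}
#endif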
843 /*
844 * NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'.
845 * Note also that this automatically deals correctly with linear p.t.'s.
846 */
847 static void put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
848 {
849 if ( (l2e_get_flags(l2e) & _PAGE_PRESENT) &&
850 (l2e_get_pfn(l2e) != pfn) )
851 put_page_and_type(l2e_get_page(l2e));
852 }
855 #if CONFIG_PAGING_LEVELS >= 3
856 static void put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn)
857 {
858 if ( (l3e_get_flags(l3e) & _PAGE_PRESENT) &&
859 (l3e_get_pfn(l3e) != pfn) )
860 put_page_and_type(l3e_get_page(l3e));
861 }
862 #endif
864 #if CONFIG_PAGING_LEVELS >= 4
865 static void put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn)
866 {
867 if ( (l4e_get_flags(l4e) & _PAGE_PRESENT) &&
868 (l4e_get_pfn(l4e) != pfn) )
869 put_page_and_type(l4e_get_page(l4e));
870 }
871 #endif
873 static int alloc_l1_table(struct page_info *page)
874 {
875 struct domain *d = page_get_owner(page);
876 unsigned long pfn = page_to_mfn(page);
877 l1_pgentry_t *pl1e;
878 int i;
880 pl1e = map_domain_page(pfn);
882 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
883 {
884 if ( is_guest_l1_slot(i) &&
885 unlikely(!get_page_from_l1e(pl1e[i], d)) )
886 goto fail;
888 adjust_guest_l1e(pl1e[i], d);
889 }
891 unmap_domain_page(pl1e);
892 return 1;
894 fail:
895 MEM_LOG("Failure in alloc_l1_table: entry %d", i);
896 while ( i-- > 0 )
897 if ( is_guest_l1_slot(i) )
898 put_page_from_l1e(pl1e[i], d);
900 unmap_domain_page(pl1e);
901 return 0;
902 }
904 #if defined(CONFIG_X86_PAE) || defined(CONFIG_COMPAT)
905 static int create_pae_xen_mappings(struct domain *d, l3_pgentry_t *pl3e)
906 {
907 struct page_info *page;
908 l2_pgentry_t *pl2e;
909 l3_pgentry_t l3e3;
910 #ifndef CONFIG_COMPAT
911 l2_pgentry_t l2e;
912 int i;
913 #endif
915 if ( !is_pv_32bit_domain(d) )
916 return 1;
918 pl3e = (l3_pgentry_t *)((unsigned long)pl3e & PAGE_MASK);
920 /* 3rd L3 slot contains L2 with Xen-private mappings. It *must* exist. */
921 l3e3 = pl3e[3];
922 if ( !(l3e_get_flags(l3e3) & _PAGE_PRESENT) )
923 {
924 MEM_LOG("PAE L3 3rd slot is empty");
925 return 0;
926 }
928 /*
929 * The Xen-private mappings include linear mappings. The L2 thus cannot
930 * be shared by multiple L3 tables. The test here is adequate because:
931 * 1. Cannot appear in slots != 3 because get_page_type() checks the
932 * PGT_pae_xen_l2 flag, which is asserted iff the L2 appears in slot 3
933 * 2. Cannot appear in another page table's L3:
934 * a. alloc_l3_table() calls this function and this check will fail
935 * b. mod_l3_entry() disallows updates to slot 3 in an existing table
936 */
937 page = l3e_get_page(l3e3);
938 BUG_ON(page->u.inuse.type_info & PGT_pinned);
939 BUG_ON((page->u.inuse.type_info & PGT_count_mask) == 0);
940 BUG_ON(!(page->u.inuse.type_info & PGT_pae_xen_l2));
941 if ( (page->u.inuse.type_info & PGT_count_mask) != 1 )
942 {
943 MEM_LOG("PAE L3 3rd slot is shared");
944 return 0;
945 }
947 /* Xen private mappings. */
948 pl2e = map_domain_page(l3e_get_pfn(l3e3));
949 #ifndef CONFIG_COMPAT
950 memcpy(&pl2e[L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1)],
951 &idle_pg_table_l2[L2_PAGETABLE_FIRST_XEN_SLOT],
952 L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
953 for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
954 {
955 l2e = l2e_from_page(
956 virt_to_page(page_get_owner(page)->arch.mm_perdomain_pt) + i,
957 __PAGE_HYPERVISOR);
958 l2e_write(&pl2e[l2_table_offset(PERDOMAIN_VIRT_START) + i], l2e);
959 }
960 for ( i = 0; i < (LINEARPT_MBYTES >> (L2_PAGETABLE_SHIFT - 20)); i++ )
961 {
962 l2e = l2e_empty();
963 if ( l3e_get_flags(pl3e[i]) & _PAGE_PRESENT )
964 l2e = l2e_from_pfn(l3e_get_pfn(pl3e[i]), __PAGE_HYPERVISOR);
965 l2e_write(&pl2e[l2_table_offset(LINEAR_PT_VIRT_START) + i], l2e);
966 }
967 #else
968 memcpy(&pl2e[COMPAT_L2_PAGETABLE_FIRST_XEN_SLOT(d)],
969 &compat_idle_pg_table_l2[
970 l2_table_offset(HIRO_COMPAT_MPT_VIRT_START)],
971 COMPAT_L2_PAGETABLE_XEN_SLOTS(d) * sizeof(*pl2e));
972 #endif
973 unmap_domain_page(pl2e);
975 return 1;
976 }
977 #else
978 # define create_pae_xen_mappings(d, pl3e) (1)
979 #endif
981 #ifdef CONFIG_X86_PAE
982 /* Flush a pgdir update into low-memory caches. */
983 static void pae_flush_pgd(
984 unsigned long mfn, unsigned int idx, l3_pgentry_t nl3e)
985 {
986 struct domain *d = page_get_owner(mfn_to_page(mfn));
987 struct vcpu *v;
988 intpte_t _ol3e, _nl3e, _pl3e;
989 l3_pgentry_t *l3tab_ptr;
990 struct pae_l3_cache *cache;
992 if ( unlikely(shadow_mode_enabled(d)) )
993 {
994 cpumask_t m = CPU_MASK_NONE;
995 /* Re-shadow this l3 table on any vcpus that are using it */
996 for_each_vcpu ( d, v )
997 if ( pagetable_get_pfn(v->arch.guest_table) == mfn )
998 {
999 paging_update_cr3(v);
1000 cpus_or(m, m, v->vcpu_dirty_cpumask);
1001 }
1002 flush_tlb_mask(m);
1003 return;
1004 }
1005 /* If below 4GB then the pgdir is not shadowed in low memory. */
1006 if ( !l3tab_needs_shadow(mfn) )
1007 return;
1009 for_each_vcpu ( d, v )
1010 {
1011 cache = &v->arch.pae_l3_cache;
1013 spin_lock(&cache->lock);
1015 if ( cache->high_mfn == mfn )
1016 {
1017 l3tab_ptr = &cache->table[cache->inuse_idx][idx];
1018 _ol3e = l3e_get_intpte(*l3tab_ptr);
1019 _nl3e = l3e_get_intpte(nl3e);
1020 _pl3e = cmpxchg(&l3e_get_intpte(*l3tab_ptr), _ol3e, _nl3e);
1021 BUG_ON(_pl3e != _ol3e);
1022 }
1024 spin_unlock(&cache->lock);
1025 }
1027 flush_tlb_mask(d->domain_dirty_cpumask);
1028 }
1029 #else
1030 # define pae_flush_pgd(mfn, idx, nl3e) ((void)0)
1031 #endif
1033 static int alloc_l2_table(struct page_info *page, unsigned long type)
1035 struct domain *d = page_get_owner(page);
1036 unsigned long pfn = page_to_mfn(page);
1037 l2_pgentry_t *pl2e;
1038 int i;
1040 pl2e = map_domain_page(pfn);
1042 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
1044 if ( is_guest_l2_slot(d, type, i) &&
1045 unlikely(!get_page_from_l2e(pl2e[i], pfn, d)) )
1046 goto fail;
1048 adjust_guest_l2e(pl2e[i], d);
1051 #if CONFIG_PAGING_LEVELS == 2
1052 /* Xen private mappings. */
1053 memcpy(&pl2e[L2_PAGETABLE_FIRST_XEN_SLOT],
1054 &idle_pg_table[L2_PAGETABLE_FIRST_XEN_SLOT],
1055 L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
1056 pl2e[l2_table_offset(LINEAR_PT_VIRT_START)] =
1057 l2e_from_pfn(pfn, __PAGE_HYPERVISOR);
1058 for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
1059 pl2e[l2_table_offset(PERDOMAIN_VIRT_START) + i] =
1060 l2e_from_page(
1061 virt_to_page(page_get_owner(page)->arch.mm_perdomain_pt) + i,
1062 __PAGE_HYPERVISOR);
1063 #endif
1065 unmap_domain_page(pl2e);
1066 return 1;
1068 fail:
1069 MEM_LOG("Failure in alloc_l2_table: entry %d", i);
1070 while ( i-- > 0 )
1071 if ( is_guest_l2_slot(d, type, i) )
1072 put_page_from_l2e(pl2e[i], pfn);
1074 unmap_domain_page(pl2e);
1075 return 0;
1079 #if CONFIG_PAGING_LEVELS >= 3
1080 static int alloc_l3_table(struct page_info *page)
1082 struct domain *d = page_get_owner(page);
1083 unsigned long pfn = page_to_mfn(page);
1084 l3_pgentry_t *pl3e;
1085 int i;
1087 #ifdef CONFIG_X86_PAE
1088 /*
1089 * PAE pgdirs above 4GB are unacceptable if the guest does not understand
1090 * the weird 'extended cr3' format for dealing with high-order address
1091 * bits. We cut some slack for control tools (before vcpu0 is initialised).
1092 */
1093 if ( (pfn >= 0x100000) &&
1094 unlikely(!VM_ASSIST(d, VMASST_TYPE_pae_extended_cr3)) &&
1095 d->vcpu[0] && d->vcpu[0]->is_initialised )
1097 MEM_LOG("PAE pgd must be below 4GB (0x%lx >= 0x100000)", pfn);
1098 return 0;
1100 #endif
1102 pl3e = map_domain_page(pfn);
1104 /*
1105 * PAE guests allocate full pages, but aren't required to initialize
1106 * more than the first four entries; when running in compatibility
1107 * mode, however, the full page is visible to the MMU, and hence all
1108 * 512 entries must be valid/verified, which is most easily achieved
1109 * by clearing them out.
1110 */
1111 if ( is_pv_32on64_domain(d) )
1112 memset(pl3e + 4, 0, (L3_PAGETABLE_ENTRIES - 4) * sizeof(*pl3e));
1114 for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
1116 #if defined(CONFIG_X86_PAE) || defined(CONFIG_COMPAT)
1117 if ( is_pv_32bit_domain(d) && (i == 3) )
1119 if ( !(l3e_get_flags(pl3e[i]) & _PAGE_PRESENT) ||
1120 (l3e_get_flags(pl3e[i]) & l3_disallow_mask(d)) ||
1121 !get_page_and_type_from_pagenr(l3e_get_pfn(pl3e[i]),
1122 PGT_l2_page_table |
1123 PGT_pae_xen_l2,
1124 d) )
1125 goto fail;
1127 else
1128 #endif
1129 if ( is_guest_l3_slot(i) &&
1130 unlikely(!get_page_from_l3e(pl3e[i], pfn, d)) )
1131 goto fail;
1133 adjust_guest_l3e(pl3e[i], d);
1136 if ( !create_pae_xen_mappings(d, pl3e) )
1137 goto fail;
1139 unmap_domain_page(pl3e);
1140 return 1;
1142 fail:
1143 MEM_LOG("Failure in alloc_l3_table: entry %d", i);
1144 while ( i-- > 0 )
1145 if ( is_guest_l3_slot(i) )
1146 put_page_from_l3e(pl3e[i], pfn);
1148 unmap_domain_page(pl3e);
1149 return 0;
1151 #else
1152 #define alloc_l3_table(page) (0)
1153 #endif
1155 #if CONFIG_PAGING_LEVELS >= 4
1156 static int alloc_l4_table(struct page_info *page)
1158 struct domain *d = page_get_owner(page);
1159 unsigned long pfn = page_to_mfn(page);
1160 l4_pgentry_t *pl4e = page_to_virt(page);
1161 int i;
1163 for ( i = 0; i < L4_PAGETABLE_ENTRIES; i++ )
1165 if ( is_guest_l4_slot(d, i) &&
1166 unlikely(!get_page_from_l4e(pl4e[i], pfn, d)) )
1167 goto fail;
1169 adjust_guest_l4e(pl4e[i], d);
1172 /* Xen private mappings. */
1173 memcpy(&pl4e[ROOT_PAGETABLE_FIRST_XEN_SLOT],
1174 &idle_pg_table[ROOT_PAGETABLE_FIRST_XEN_SLOT],
1175 ROOT_PAGETABLE_XEN_SLOTS * sizeof(l4_pgentry_t));
1176 pl4e[l4_table_offset(LINEAR_PT_VIRT_START)] =
1177 l4e_from_pfn(pfn, __PAGE_HYPERVISOR);
1178 pl4e[l4_table_offset(PERDOMAIN_VIRT_START)] =
1179 l4e_from_page(virt_to_page(d->arch.mm_perdomain_l3),
1180 __PAGE_HYPERVISOR);
1181 if ( is_pv_32on64_domain(d) )
1182 pl4e[l4_table_offset(COMPAT_ARG_XLAT_VIRT_BASE)] =
1183 l4e_from_page(virt_to_page(d->arch.mm_arg_xlat_l3),
1184 __PAGE_HYPERVISOR);
1186 return 1;
1188 fail:
1189 MEM_LOG("Failure in alloc_l4_table: entry %d", i);
1190 while ( i-- > 0 )
1191 if ( is_guest_l4_slot(d, i) )
1192 put_page_from_l4e(pl4e[i], pfn);
1194 return 0;
1196 #else
1197 #define alloc_l4_table(page) (0)
1198 #endif
1201 static void free_l1_table(struct page_info *page)
1203 struct domain *d = page_get_owner(page);
1204 unsigned long pfn = page_to_mfn(page);
1205 l1_pgentry_t *pl1e;
1206 int i;
1208 pl1e = map_domain_page(pfn);
1210 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
1211 if ( is_guest_l1_slot(i) )
1212 put_page_from_l1e(pl1e[i], d);
1214 unmap_domain_page(pl1e);
1218 static void free_l2_table(struct page_info *page)
1220 #ifdef CONFIG_COMPAT
1221 struct domain *d = page_get_owner(page);
1222 #endif
1223 unsigned long pfn = page_to_mfn(page);
1224 l2_pgentry_t *pl2e;
1225 int i;
1227 pl2e = map_domain_page(pfn);
1229 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
1230 if ( is_guest_l2_slot(d, page->u.inuse.type_info, i) )
1231 put_page_from_l2e(pl2e[i], pfn);
1233 unmap_domain_page(pl2e);
1235 page->u.inuse.type_info &= ~PGT_pae_xen_l2;
1239 #if CONFIG_PAGING_LEVELS >= 3
1241 static void free_l3_table(struct page_info *page)
1242 {
1243 struct domain *d = page_get_owner(page);
1244 unsigned long pfn = page_to_mfn(page);
1245 l3_pgentry_t *pl3e;
1246 int i;
1248 pl3e = map_domain_page(pfn);
1250 for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
1251 if ( is_guest_l3_slot(i) )
1252 {
1253 put_page_from_l3e(pl3e[i], pfn);
1254 unadjust_guest_l3e(pl3e[i], d);
1255 }
1257 unmap_domain_page(pl3e);
1258 }
1260 #endif
1262 #if CONFIG_PAGING_LEVELS >= 4
1264 static void free_l4_table(struct page_info *page)
1266 struct domain *d = page_get_owner(page);
1267 unsigned long pfn = page_to_mfn(page);
1268 l4_pgentry_t *pl4e = page_to_virt(page);
1269 int i;
1271 for ( i = 0; i < L4_PAGETABLE_ENTRIES; i++ )
1272 if ( is_guest_l4_slot(d, i) )
1273 put_page_from_l4e(pl4e[i], pfn);
1276 #endif
1279 /* How to write an entry to the guest pagetables.
1280 * Returns 0 for failure (pointer not valid), 1 for success. */
1281 static inline int update_intpte(intpte_t *p,
1282 intpte_t old,
1283 intpte_t new,
1284 unsigned long mfn,
1285 struct vcpu *v)
1286 {
1287 int rv = 1;
1288 #ifndef PTE_UPDATE_WITH_CMPXCHG
1289 rv = paging_write_guest_entry(v, p, new, _mfn(mfn));
1290 #else
1291 {
1292 intpte_t t = old;
1293 for ( ; ; )
1294 {
1295 rv = paging_cmpxchg_guest_entry(v, p, &t, new, _mfn(mfn));
1296 if ( unlikely(rv == 0) )
1297 {
1298 MEM_LOG("Failed to update %" PRIpte " -> %" PRIpte
1299 ": saw %" PRIpte, old, new, t);
1300 break;
1301 }
1303 if ( t == old )
1304 break;
1306 /* Allowed to change in Accessed/Dirty flags only. */
1307 BUG_ON((t ^ old) & ~(intpte_t)(_PAGE_ACCESSED|_PAGE_DIRTY));
1309 old = t;
1310 }
1311 }
1312 #endif
1313 return rv;
1314 }
1316 /* Macro that wraps the appropriate type-changes around update_intpte().
1317 * Arguments are: type, ptr, old, new, mfn, vcpu */
1318 #define UPDATE_ENTRY(_t,_p,_o,_n,_m,_v) \
1319 update_intpte(&_t ## e_get_intpte(*(_p)), \
1320 _t ## e_get_intpte(_o), _t ## e_get_intpte(_n), \
1321 (_m), (_v))
1323 /* Update the L1 entry at pl1e to new value nl1e. */
1324 static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e,
1325 unsigned long gl1mfn)
1327 l1_pgentry_t ol1e;
1328 struct domain *d = current->domain;
1329 unsigned long mfn;
1331 if ( unlikely(__copy_from_user(&ol1e, pl1e, sizeof(ol1e)) != 0) )
1332 return 0;
1334 if ( unlikely(paging_mode_refcounts(d)) )
1335 return UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, current);
1337 if ( l1e_get_flags(nl1e) & _PAGE_PRESENT )
1339 /* Translate foreign guest addresses. */
1340 mfn = gmfn_to_mfn(FOREIGNDOM, l1e_get_pfn(nl1e));
1341 if ( unlikely(mfn == INVALID_MFN) )
1342 return 0;
1343 ASSERT((mfn & ~(PADDR_MASK >> PAGE_SHIFT)) == 0);
1344 nl1e = l1e_from_pfn(mfn, l1e_get_flags(nl1e));
1346 if ( unlikely(l1e_get_flags(nl1e) & L1_DISALLOW_MASK) )
1348 MEM_LOG("Bad L1 flags %x",
1349 l1e_get_flags(nl1e) & L1_DISALLOW_MASK);
1350 return 0;
1353 adjust_guest_l1e(nl1e, d);
1355 /* Fast path for identical mapping, r/w and presence. */
1356 if ( !l1e_has_changed(ol1e, nl1e, _PAGE_RW | _PAGE_PRESENT) )
1357 return UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, current);
1359 if ( unlikely(!get_page_from_l1e(nl1e, FOREIGNDOM)) )
1360 return 0;
1362 if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, current)) )
1364 put_page_from_l1e(nl1e, d);
1365 return 0;
1368 else
1370 if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, current)) )
1371 return 0;
1374 put_page_from_l1e(ol1e, d);
1375 return 1;
1379 /* Update the L2 entry at pl2e to new value nl2e. pl2e is within frame pfn. */
1380 static int mod_l2_entry(l2_pgentry_t *pl2e,
1381 l2_pgentry_t nl2e,
1382 unsigned long pfn,
1383 unsigned long type)
1385 l2_pgentry_t ol2e;
1386 struct domain *d = current->domain;
1388 if ( unlikely(!is_guest_l2_slot(d, type, pgentry_ptr_to_slot(pl2e))) )
1390 MEM_LOG("Illegal L2 update attempt in Xen-private area %p", pl2e);
1391 return 0;
1394 if ( unlikely(__copy_from_user(&ol2e, pl2e, sizeof(ol2e)) != 0) )
1395 return 0;
1397 if ( l2e_get_flags(nl2e) & _PAGE_PRESENT )
1399 if ( unlikely(l2e_get_flags(nl2e) & L2_DISALLOW_MASK) )
1401 MEM_LOG("Bad L2 flags %x",
1402 l2e_get_flags(nl2e) & L2_DISALLOW_MASK);
1403 return 0;
1406 adjust_guest_l2e(nl2e, d);
1408 /* Fast path for identical mapping and presence. */
1409 if ( !l2e_has_changed(ol2e, nl2e, _PAGE_PRESENT))
1410 return UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, current);
1412 if ( unlikely(!get_page_from_l2e(nl2e, pfn, d)) )
1413 return 0;
1415 if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, current)) )
1417 put_page_from_l2e(nl2e, pfn);
1418 return 0;
1421 else if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, current)) )
1423 return 0;
1426 put_page_from_l2e(ol2e, pfn);
1427 return 1;
1430 #if CONFIG_PAGING_LEVELS >= 3
1432 /* Update the L3 entry at pl3e to new value nl3e. pl3e is within frame pfn. */
1433 static int mod_l3_entry(l3_pgentry_t *pl3e,
1434 l3_pgentry_t nl3e,
1435 unsigned long pfn)
1437 l3_pgentry_t ol3e;
1438 struct domain *d = current->domain;
1439 int okay;
1441 if ( unlikely(!is_guest_l3_slot(pgentry_ptr_to_slot(pl3e))) )
1443 MEM_LOG("Illegal L3 update attempt in Xen-private area %p", pl3e);
1444 return 0;
1447 #if defined(CONFIG_X86_PAE) || defined(CONFIG_COMPAT)
1448 /*
1449 * Disallow updates to final L3 slot. It contains Xen mappings, and it
1450 * would be a pain to ensure they remain continuously valid throughout.
1451 */
1452 if ( is_pv_32bit_domain(d) && (pgentry_ptr_to_slot(pl3e) >= 3) )
1453 return 0;
1454 #endif
1456 if ( unlikely(__copy_from_user(&ol3e, pl3e, sizeof(ol3e)) != 0) )
1457 return 0;
1459 if ( l3e_get_flags(nl3e) & _PAGE_PRESENT )
1461 if ( unlikely(l3e_get_flags(nl3e) & l3_disallow_mask(d)) )
1463 MEM_LOG("Bad L3 flags %x",
1464 l3e_get_flags(nl3e) & l3_disallow_mask(d));
1465 return 0;
1468 adjust_guest_l3e(nl3e, d);
1470 /* Fast path for identical mapping and presence. */
1471 if (!l3e_has_changed(ol3e, nl3e, _PAGE_PRESENT))
1472 return UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, current);
1474 if ( unlikely(!get_page_from_l3e(nl3e, pfn, d)) )
1475 return 0;
1477 if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, current)) )
1479 put_page_from_l3e(nl3e, pfn);
1480 return 0;
1483 else if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, current)) )
1485 return 0;
1488 okay = create_pae_xen_mappings(d, pl3e);
1489 BUG_ON(!okay);
1491 pae_flush_pgd(pfn, pgentry_ptr_to_slot(pl3e), nl3e);
1493 put_page_from_l3e(ol3e, pfn);
1494 return 1;
1497 #endif
1499 #if CONFIG_PAGING_LEVELS >= 4
1501 /* Update the L4 entry at pl4e to new value nl4e. pl4e is within frame pfn. */
1502 static int mod_l4_entry(struct domain *d,
1503 l4_pgentry_t *pl4e,
1504 l4_pgentry_t nl4e,
1505 unsigned long pfn)
1507 l4_pgentry_t ol4e;
1509 if ( unlikely(!is_guest_l4_slot(d, pgentry_ptr_to_slot(pl4e))) )
1511 MEM_LOG("Illegal L4 update attempt in Xen-private area %p", pl4e);
1512 return 0;
1515 if ( unlikely(__copy_from_user(&ol4e, pl4e, sizeof(ol4e)) != 0) )
1516 return 0;
1518 if ( l4e_get_flags(nl4e) & _PAGE_PRESENT )
1520 if ( unlikely(l4e_get_flags(nl4e) & L4_DISALLOW_MASK) )
1522 MEM_LOG("Bad L4 flags %x",
1523 l4e_get_flags(nl4e) & L4_DISALLOW_MASK);
1524 return 0;
1527 adjust_guest_l4e(nl4e, current->domain);
1529 /* Fast path for identical mapping and presence. */
1530 if (!l4e_has_changed(ol4e, nl4e, _PAGE_PRESENT))
1531 return UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, current);
1533 if ( unlikely(!get_page_from_l4e(nl4e, pfn, current->domain)) )
1534 return 0;
1536 if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, current)) )
1538 put_page_from_l4e(nl4e, pfn);
1539 return 0;
1542 else if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, current)) )
1544 return 0;
1547 put_page_from_l4e(ol4e, pfn);
1548 return 1;
1551 #endif
1553 int alloc_page_type(struct page_info *page, unsigned long type)
1555 struct domain *owner = page_get_owner(page);
1557 /* A page table is dirtied when its type count becomes non-zero. */
1558 if ( likely(owner != NULL) )
1559 paging_mark_dirty(owner, page_to_mfn(page));
1561 switch ( type & PGT_type_mask )
1563 case PGT_l1_page_table:
1564 return alloc_l1_table(page);
1565 case PGT_l2_page_table:
1566 return alloc_l2_table(page, type);
1567 case PGT_l3_page_table:
1568 return alloc_l3_table(page);
1569 case PGT_l4_page_table:
1570 return alloc_l4_table(page);
1571 case PGT_gdt_page:
1572 case PGT_ldt_page:
1573 return alloc_segdesc_page(page);
1574 default:
1575 printk("Bad type in alloc_page_type %lx t=%" PRtype_info " c=%x\n",
1576 type, page->u.inuse.type_info,
1577 page->count_info);
1578 BUG();
1581 return 0;
1585 void free_page_type(struct page_info *page, unsigned long type)
1587 struct domain *owner = page_get_owner(page);
1588 unsigned long gmfn;
1590 if ( likely(owner != NULL) )
1592 /*
1593 * We have to flush before the next use of the linear mapping
1594 * (e.g., update_va_mapping()) or we could end up modifying a page
1595 * that is no longer a page table (and hence screw up ref counts).
1596 */
1597 if ( current->domain == owner )
1598 queue_deferred_ops(owner, DOP_FLUSH_ALL_TLBS);
1599 else
1600 flush_tlb_mask(owner->domain_dirty_cpumask);
1602 if ( unlikely(paging_mode_enabled(owner)) )
1604 /* A page table is dirtied when its type count becomes zero. */
1605 paging_mark_dirty(owner, page_to_mfn(page));
1607 if ( shadow_mode_refcounts(owner) )
1608 return;
1610 gmfn = mfn_to_gmfn(owner, page_to_mfn(page));
1611 ASSERT(VALID_M2P(gmfn));
1612 shadow_remove_all_shadows(owner->vcpu[0], _mfn(gmfn));
1616 switch ( type & PGT_type_mask )
1618 case PGT_l1_page_table:
1619 free_l1_table(page);
1620 break;
1622 case PGT_l2_page_table:
1623 free_l2_table(page);
1624 break;
1626 #if CONFIG_PAGING_LEVELS >= 3
1627 case PGT_l3_page_table:
1628 free_l3_table(page);
1629 break;
1630 #endif
1632 #if CONFIG_PAGING_LEVELS >= 4
1633 case PGT_l4_page_table:
1634 free_l4_table(page);
1635 break;
1636 #endif
1638 default:
1639 printk("%s: type %lx pfn %lx\n",__FUNCTION__,
1640 type, page_to_mfn(page));
1641 BUG();
1646 void put_page_type(struct page_info *page)
1648 unsigned long nx, x, y = page->u.inuse.type_info;
1650 again:
1651 do {
1652 x = y;
1653 nx = x - 1;
1655 ASSERT((x & PGT_count_mask) != 0);
1657 if ( unlikely((nx & PGT_count_mask) == 0) )
1659 if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
1660 likely(nx & PGT_validated) )
1662 /*
1663 * Page-table pages must be unvalidated when count is zero. The
1664 * 'free' is safe because the refcnt is non-zero and validated
1665 * bit is clear => other ops will spin or fail.
1666 */
1667 if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x,
1668 x & ~PGT_validated)) != x) )
1669 goto again;
1670 /* We cleared the 'valid bit' so we do the clean up. */
1671 free_page_type(page, x);
1672 /* Carry on, but with the 'valid bit' now clear. */
1673 x &= ~PGT_validated;
1674 nx &= ~PGT_validated;
1677 /*
1678 * Record TLB information for flush later. We do not stamp page
1679 * tables when running in shadow mode:
1680 * 1. Pointless, since it's the shadow pt's which must be tracked.
1681 * 2. Shadow mode reuses this field for shadowed page tables to
1682 * store flags info -- we don't want to conflict with that.
1683 */
1684 if ( !(shadow_mode_enabled(page_get_owner(page)) &&
1685 (page->count_info & PGC_page_table)) )
1686 page->tlbflush_timestamp = tlbflush_current_time();
1689 while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
1693 int get_page_type(struct page_info *page, unsigned long type)
1695 unsigned long nx, x, y = page->u.inuse.type_info;
1697 ASSERT(!(type & ~(PGT_type_mask | PGT_pae_xen_l2)));
1699 again:
1700 do {
1701 x = y;
1702 nx = x + 1;
1703 if ( unlikely((nx & PGT_count_mask) == 0) )
1705 MEM_LOG("Type count overflow on pfn %lx", page_to_mfn(page));
1706 return 0;
1708 else if ( unlikely((x & PGT_count_mask) == 0) )
1710 struct domain *d = page_get_owner(page);
1712 /* Never allow a shadowed frame to go from type count 0 to 1 */
1713 if ( d && shadow_mode_enabled(d) )
1714 shadow_remove_all_shadows(d->vcpu[0], _mfn(page_to_mfn(page)));
1716 ASSERT(!(x & PGT_pae_xen_l2));
1717 if ( (x & PGT_type_mask) != type )
1719 /*
1720 * On type change we check to flush stale TLB entries. This
1721 * may be unnecessary (e.g., page was GDT/LDT) but those
1722 * circumstances should be very rare.
1723 */
1724 cpumask_t mask = d->domain_dirty_cpumask;
1726 /* Don't flush if the timestamp is old enough */
1727 tlbflush_filter(mask, page->tlbflush_timestamp);
1729 if ( unlikely(!cpus_empty(mask)) &&
1730 /* Shadow mode: track only writable pages. */
1731 (!shadow_mode_enabled(page_get_owner(page)) ||
1732 ((nx & PGT_type_mask) == PGT_writable_page)) )
1734 perfc_incr(need_flush_tlb_flush);
1735 flush_tlb_mask(mask);
1738 /* We lose existing type, back pointer, and validity. */
1739 nx &= ~(PGT_type_mask | PGT_validated);
1740 nx |= type;
1742 /* No special validation needed for writable pages. */
1743 /* Page tables and GDT/LDT need to be scanned for validity. */
1744 if ( type == PGT_writable_page )
1745 nx |= PGT_validated;
1748 else if ( unlikely((x & (PGT_type_mask|PGT_pae_xen_l2)) != type) )
1750 if ( ((x & PGT_type_mask) != PGT_l2_page_table) ||
1751 (type != PGT_l1_page_table) )
1752 MEM_LOG("Bad type (saw %" PRtype_info
1753 " != exp %" PRtype_info ") "
1754 "for mfn %lx (pfn %lx)",
1755 x, type, page_to_mfn(page),
1756 get_gpfn_from_mfn(page_to_mfn(page)));
1757 return 0;
1759 else if ( unlikely(!(x & PGT_validated)) )
1761 /* Someone else is updating validation of this page. Wait... */
1762 while ( (y = page->u.inuse.type_info) == x )
1763 cpu_relax();
1764 goto again;
1767 while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
1769 if ( unlikely(!(nx & PGT_validated)) )
1771 /* Try to validate page type; drop the new reference on failure. */
1772 if ( unlikely(!alloc_page_type(page, type)) )
1774 MEM_LOG("Error while validating mfn %lx (pfn %lx) for type %"
1775 PRtype_info ": caf=%08x taf=%" PRtype_info,
1776 page_to_mfn(page), get_gpfn_from_mfn(page_to_mfn(page)),
1777 type, page->count_info, page->u.inuse.type_info);
1778 /* No one else can get a reference. We hold the only ref. */
1779 page->u.inuse.type_info = 0;
1780 return 0;
1783 /* No one else is updating simultaneously. */
1784 __set_bit(_PGT_validated, &page->u.inuse.type_info);
1787 return 1;
1791 int new_guest_cr3(unsigned long mfn)
1793 struct vcpu *v = current;
1794 struct domain *d = v->domain;
1795 int okay;
1796 unsigned long old_base_mfn;
1798 #ifdef CONFIG_COMPAT
1799 if ( is_pv_32on64_domain(d) )
1801 okay = paging_mode_refcounts(d)
1802 ? 0 /* Old code was broken, but what should it be? */
1803 : mod_l4_entry(
1804 d,
1805 __va(pagetable_get_paddr(v->arch.guest_table)),
1806 l4e_from_pfn(
1807 mfn,
1808 (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_ACCESSED)),
1809 pagetable_get_pfn(v->arch.guest_table));
1810 if ( unlikely(!okay) )
1812 MEM_LOG("Error while installing new compat baseptr %lx", mfn);
1813 return 0;
1816 invalidate_shadow_ldt(v);
1817 write_ptbase(v);
1819 return 1;
1821 #endif
1822 okay = paging_mode_refcounts(d)
1823 ? get_page_from_pagenr(mfn, d)
1824 : get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d);
1825 if ( unlikely(!okay) )
1827 MEM_LOG("Error while installing new baseptr %lx", mfn);
1828 return 0;
1831 invalidate_shadow_ldt(v);
1833 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
1835 v->arch.guest_table = pagetable_from_pfn(mfn);
1836 update_cr3(v);
1838 write_ptbase(v);
1840 if ( likely(old_base_mfn != 0) )
1842 if ( paging_mode_refcounts(d) )
1843 put_page(mfn_to_page(old_base_mfn));
1844 else
1845 put_page_and_type(mfn_to_page(old_base_mfn));
1848 return 1;
1851 static void process_deferred_ops(void)
1853 unsigned int deferred_ops;
1854 struct domain *d = current->domain;
1855 struct percpu_mm_info *info = &this_cpu(percpu_mm_info);
1857 deferred_ops = info->deferred_ops;
1858 info->deferred_ops = 0;
1860 if ( deferred_ops & (DOP_FLUSH_ALL_TLBS|DOP_FLUSH_TLB) )
1862 if ( deferred_ops & DOP_FLUSH_ALL_TLBS )
1863 flush_tlb_mask(d->domain_dirty_cpumask);
1864 else
1865 local_flush_tlb();
1868 if ( deferred_ops & DOP_RELOAD_LDT )
1869 (void)map_ldt_shadow_page(0);
1871 if ( unlikely(info->foreign != NULL) )
1873 rcu_unlock_domain(info->foreign);
1874 info->foreign = NULL;
1878 static int set_foreigndom(domid_t domid)
1880 struct domain *e, *d = current->domain;
1881 struct percpu_mm_info *info = &this_cpu(percpu_mm_info);
1882 int okay = 1;
1884 ASSERT(info->foreign == NULL);
1886 if ( likely(domid == DOMID_SELF) )
1887 goto out;
1889 if ( unlikely(domid == d->domain_id) )
1891 MEM_LOG("Dom %u tried to specify itself as foreign domain",
1892 d->domain_id);
1893 okay = 0;
1895 else if ( unlikely(paging_mode_translate(d)) )
1897 MEM_LOG("Cannot mix foreign mappings with translated domains");
1898 okay = 0;
1900 else if ( !IS_PRIV(d) )
1902 switch ( domid )
1904 case DOMID_IO:
1905 info->foreign = rcu_lock_domain(dom_io);
1906 break;
1907 default:
1908 MEM_LOG("Dom %u cannot set foreign dom", d->domain_id);
1909 okay = 0;
1910 break;
1913 else
1915 info->foreign = e = rcu_lock_domain_by_id(domid);
1916 if ( e == NULL )
1918 switch ( domid )
1920 case DOMID_XEN:
1921 info->foreign = rcu_lock_domain(dom_xen);
1922 break;
1923 case DOMID_IO:
1924 info->foreign = rcu_lock_domain(dom_io);
1925 break;
1926 default:
1927 MEM_LOG("Unknown domain '%u'", domid);
1928 okay = 0;
1929 break;
1934 out:
1935 return okay;
1938 static inline cpumask_t vcpumask_to_pcpumask(
1939 struct domain *d, unsigned long vmask)
1941 unsigned int vcpu_id;
1942 cpumask_t pmask = CPU_MASK_NONE;
1943 struct vcpu *v;
1945 while ( vmask != 0 )
1947 vcpu_id = find_first_set_bit(vmask);
1948 vmask &= ~(1UL << vcpu_id);
1949 if ( (vcpu_id < MAX_VIRT_CPUS) &&
1950 ((v = d->vcpu[vcpu_id]) != NULL) )
1951 cpus_or(pmask, pmask, v->vcpu_dirty_cpumask);
1954 return pmask;
1957 int do_mmuext_op(
1958 XEN_GUEST_HANDLE(mmuext_op_t) uops,
1959 unsigned int count,
1960 XEN_GUEST_HANDLE(uint) pdone,
1961 unsigned int foreigndom)
1963 struct mmuext_op op;
1964 int rc = 0, i = 0, okay;
1965 unsigned long mfn = 0, gmfn = 0, type;
1966 unsigned int done = 0;
1967 struct page_info *page;
1968 struct vcpu *v = current;
1969 struct domain *d = v->domain;
1971 if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
1973 count &= ~MMU_UPDATE_PREEMPTED;
1974 if ( unlikely(!guest_handle_is_null(pdone)) )
1975 (void)copy_from_guest(&done, pdone, 1);
1977 else
1978 perfc_incr(calls_to_mmuext_op);
1980 if ( unlikely(!guest_handle_okay(uops, count)) )
1982 rc = -EFAULT;
1983 goto out;
1986 if ( !set_foreigndom(foreigndom) )
1988 rc = -ESRCH;
1989 goto out;
1992 LOCK_BIGLOCK(d);
1994 for ( i = 0; i < count; i++ )
1996 if ( hypercall_preempt_check() )
1998 rc = hypercall_create_continuation(
1999 __HYPERVISOR_mmuext_op, "hihi",
2000 uops, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
2001 break;
2004 if ( unlikely(__copy_from_guest(&op, uops, 1) != 0) )
2006 MEM_LOG("Bad __copy_from_guest");
2007 rc = -EFAULT;
2008 break;
2011 okay = 1;
2012 gmfn = op.arg1.mfn;
2013 mfn = gmfn_to_mfn(FOREIGNDOM, gmfn);
2014 page = mfn_to_page(mfn);
2016 switch ( op.cmd )
2018 case MMUEXT_PIN_L1_TABLE:
2019 type = PGT_l1_page_table;
2020 goto pin_page;
2022 case MMUEXT_PIN_L2_TABLE:
2023 type = PGT_l2_page_table;
2024 goto pin_page;
2026 case MMUEXT_PIN_L3_TABLE:
2027 type = PGT_l3_page_table;
2028 goto pin_page;
2030 case MMUEXT_PIN_L4_TABLE:
2031 if ( is_pv_32bit_domain(FOREIGNDOM) )
2032 break;
2033 type = PGT_l4_page_table;
2035 pin_page:
2036 /* Ignore pinning of invalid paging levels. */
2037 if ( (op.cmd - MMUEXT_PIN_L1_TABLE) > (CONFIG_PAGING_LEVELS - 1) )
2038 break;
2040 if ( paging_mode_refcounts(FOREIGNDOM) )
2041 break;
2043 okay = get_page_and_type_from_pagenr(mfn, type, FOREIGNDOM);
2044 if ( unlikely(!okay) )
2046 MEM_LOG("Error while pinning mfn %lx", mfn);
2047 break;
2050 if ( unlikely(test_and_set_bit(_PGT_pinned,
2051 &page->u.inuse.type_info)) )
2053 MEM_LOG("Mfn %lx already pinned", mfn);
2054 put_page_and_type(page);
2055 okay = 0;
2056 break;
2059 /* A page is dirtied when its pin status is set. */
2060 paging_mark_dirty(d, mfn);
2062 /* We can race domain destruction (domain_relinquish_resources). */
2063 if ( unlikely(this_cpu(percpu_mm_info).foreign != NULL) )
2065 int drop_ref;
2066 spin_lock(&FOREIGNDOM->page_alloc_lock);
2067 drop_ref = (FOREIGNDOM->is_dying &&
2068 test_and_clear_bit(_PGT_pinned,
2069 &page->u.inuse.type_info));
2070 spin_unlock(&FOREIGNDOM->page_alloc_lock);
2071 if ( drop_ref )
2072 put_page_and_type(page);
2075 break;
2077 case MMUEXT_UNPIN_TABLE:
2078 if ( paging_mode_refcounts(d) )
2079 break;
2081 if ( unlikely(!(okay = get_page_from_pagenr(mfn, d))) )
2083 MEM_LOG("Mfn %lx bad domain (dom=%p)",
2084 mfn, page_get_owner(page));
2086 else if ( likely(test_and_clear_bit(_PGT_pinned,
2087 &page->u.inuse.type_info)) )
2089 put_page_and_type(page);
2090 put_page(page);
2091 /* A page is dirtied when its pin status is cleared. */
2092 paging_mark_dirty(d, mfn);
2094 else
2096 okay = 0;
2097 put_page(page);
2098 MEM_LOG("Mfn %lx not pinned", mfn);
2100 break;
2102 case MMUEXT_NEW_BASEPTR:
2103 okay = new_guest_cr3(mfn);
2104 this_cpu(percpu_mm_info).deferred_ops &= ~DOP_FLUSH_TLB;
2105 break;
2107 #ifdef __x86_64__
2108 case MMUEXT_NEW_USER_BASEPTR: {
2109 unsigned long old_mfn;
2111 if ( mfn != 0 )
2113 if ( paging_mode_refcounts(d) )
2114 okay = get_page_from_pagenr(mfn, d);
2115 else
2116 okay = get_page_and_type_from_pagenr(
2117 mfn, PGT_root_page_table, d);
2118 if ( unlikely(!okay) )
2120 MEM_LOG("Error while installing new mfn %lx", mfn);
2121 break;
2125 old_mfn = pagetable_get_pfn(v->arch.guest_table_user);
2126 v->arch.guest_table_user = pagetable_from_pfn(mfn);
2128 if ( old_mfn != 0 )
2130 if ( paging_mode_refcounts(d) )
2131 put_page(mfn_to_page(old_mfn));
2132 else
2133 put_page_and_type(mfn_to_page(old_mfn));
2136 break;
2138 #endif
2140 case MMUEXT_TLB_FLUSH_LOCAL:
2141 this_cpu(percpu_mm_info).deferred_ops |= DOP_FLUSH_TLB;
2142 break;
2144 case MMUEXT_INVLPG_LOCAL:
2145 if ( !paging_mode_enabled(d)
2146 || paging_invlpg(v, op.arg1.linear_addr) != 0 )
2147 local_flush_tlb_one(op.arg1.linear_addr);
2148 break;
2150 case MMUEXT_TLB_FLUSH_MULTI:
2151 case MMUEXT_INVLPG_MULTI:
2153 unsigned long vmask;
2154 cpumask_t pmask;
2155 if ( unlikely(copy_from_guest(&vmask, op.arg2.vcpumask, 1)) )
2157 okay = 0;
2158 break;
2160 pmask = vcpumask_to_pcpumask(d, vmask);
2161 if ( op.cmd == MMUEXT_TLB_FLUSH_MULTI )
2162 flush_tlb_mask(pmask);
2163 else
2164 flush_tlb_one_mask(pmask, op.arg1.linear_addr);
2165 break;
2168 case MMUEXT_TLB_FLUSH_ALL:
2169 flush_tlb_mask(d->domain_dirty_cpumask);
2170 break;
2172 case MMUEXT_INVLPG_ALL:
2173 flush_tlb_one_mask(d->domain_dirty_cpumask, op.arg1.linear_addr);
2174 break;
2176 case MMUEXT_FLUSH_CACHE:
2177 if ( unlikely(!cache_flush_permitted(d)) )
2179 MEM_LOG("Non-physdev domain tried to FLUSH_CACHE.");
2180 okay = 0;
2182 else
2184 wbinvd();
2186 break;
2188 case MMUEXT_SET_LDT:
2190 unsigned long ptr = op.arg1.linear_addr;
2191 unsigned long ents = op.arg2.nr_ents;
2193 if ( paging_mode_external(d) )
2195 MEM_LOG("ignoring SET_LDT hypercall from external "
2196 "domain %u", d->domain_id);
2197 okay = 0;
2199 else if ( ((ptr & (PAGE_SIZE-1)) != 0) ||
2200 (ents > 8192) ||
2201 !array_access_ok(ptr, ents, LDT_ENTRY_SIZE) )
2203 okay = 0;
2204 MEM_LOG("Bad args to SET_LDT: ptr=%lx, ents=%lx", ptr, ents);
2206 else if ( (v->arch.guest_context.ldt_ents != ents) ||
2207 (v->arch.guest_context.ldt_base != ptr) )
2209 invalidate_shadow_ldt(v);
2210 v->arch.guest_context.ldt_base = ptr;
2211 v->arch.guest_context.ldt_ents = ents;
2212 load_LDT(v);
2213 this_cpu(percpu_mm_info).deferred_ops &= ~DOP_RELOAD_LDT;
2214 if ( ents != 0 )
2215 this_cpu(percpu_mm_info).deferred_ops |= DOP_RELOAD_LDT;
2217 break;
2220 default:
2221 MEM_LOG("Invalid extended pt command 0x%x", op.cmd);
2222 rc = -ENOSYS;
2223 okay = 0;
2224 break;
2227 if ( unlikely(!okay) )
2229 rc = rc ? rc : -EINVAL;
2230 break;
2233 guest_handle_add_offset(uops, 1);
2236 process_deferred_ops();
2238 UNLOCK_BIGLOCK(d);
2240 perfc_add(num_mmuext_ops, i);
2242 out:
2243 /* Add incremental work we have done to the @done output parameter. */
2244 if ( unlikely(!guest_handle_is_null(pdone)) )
2246 done += i;
2247 copy_to_guest(pdone, &done, 1);
2250 return rc;
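/*
 * do_mmu_update(): process a batch of (ptr, val) update requests. The low
 * bits of req.ptr encode the command: MMU_NORMAL_PT_UPDATE writes a guest
 * page-table entry at the given machine address (validated according to the
 * page's current type), while MMU_MACHPHYS_UPDATE updates the M2P entry for
 * the given frame. Preemption and the @pdone accounting mirror
 * do_mmuext_op() above.
 *
 * Illustrative guest-side usage, assuming the usual HYPERVISOR_mmu_update
 * wrapper (names other than the struct fields are illustrative):
 *
 *     struct mmu_update u;
 *     u.ptr = pte_machine_addr | MMU_NORMAL_PT_UPDATE;
 *     u.val = new_pte_value;
 *     (void)HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF);
 */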
2253 int do_mmu_update(
2254 XEN_GUEST_HANDLE(mmu_update_t) ureqs,
2255 unsigned int count,
2256 XEN_GUEST_HANDLE(uint) pdone,
2257 unsigned int foreigndom)
2259 struct mmu_update req;
2260 void *va;
2261 unsigned long gpfn, gmfn, mfn;
2262 struct page_info *page;
2263 int rc = 0, okay = 1, i = 0;
2264 unsigned int cmd, done = 0;
2265 struct vcpu *v = current;
2266 struct domain *d = v->domain;
2267 unsigned long type_info;
2268 struct domain_mmap_cache mapcache;
2270 if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
2272 count &= ~MMU_UPDATE_PREEMPTED;
2273 if ( unlikely(!guest_handle_is_null(pdone)) )
2274 (void)copy_from_guest(&done, pdone, 1);
2276 else
2277 perfc_incr(calls_to_mmu_update);
2279 if ( unlikely(!guest_handle_okay(ureqs, count)) )
2281 rc = -EFAULT;
2282 goto out;
2285 if ( !set_foreigndom(foreigndom) )
2287 rc = -ESRCH;
2288 goto out;
2291 domain_mmap_cache_init(&mapcache);
2293 LOCK_BIGLOCK(d);
2295 for ( i = 0; i < count; i++ )
2297 if ( hypercall_preempt_check() )
2299 rc = hypercall_create_continuation(
2300 __HYPERVISOR_mmu_update, "hihi",
2301 ureqs, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
2302 break;
2305 if ( unlikely(__copy_from_guest(&req, ureqs, 1) != 0) )
2307 MEM_LOG("Bad __copy_from_guest");
2308 rc = -EFAULT;
2309 break;
2312 cmd = req.ptr & (sizeof(l1_pgentry_t)-1);
2313 okay = 0;
2315 switch ( cmd )
2317 /*
2318 * MMU_NORMAL_PT_UPDATE: Normal update to any level of page table.
2319 */
2320 case MMU_NORMAL_PT_UPDATE:
2322 gmfn = req.ptr >> PAGE_SHIFT;
2323 mfn = gmfn_to_mfn(d, gmfn);
2325 if ( unlikely(!get_page_from_pagenr(mfn, current->domain)) )
2327 MEM_LOG("Could not get page for normal update");
2328 break;
2331 va = map_domain_page_with_cache(mfn, &mapcache);
2332 va = (void *)((unsigned long)va +
2333 (unsigned long)(req.ptr & ~PAGE_MASK));
2334 page = mfn_to_page(mfn);
2336 switch ( (type_info = page->u.inuse.type_info) & PGT_type_mask )
2338 case PGT_l1_page_table:
2339 case PGT_l2_page_table:
2340 case PGT_l3_page_table:
2341 case PGT_l4_page_table:
2343 if ( paging_mode_refcounts(d) )
2345 MEM_LOG("mmu update on auto-refcounted domain!");
2346 break;
2349 if ( unlikely(!get_page_type(
2350 page, type_info & (PGT_type_mask|PGT_pae_xen_l2))) )
2351 goto not_a_pt;
2353 switch ( type_info & PGT_type_mask )
2355 case PGT_l1_page_table:
2357 l1_pgentry_t l1e = l1e_from_intpte(req.val);
2358 okay = mod_l1_entry(va, l1e, mfn);
2360 break;
2361 case PGT_l2_page_table:
2363 l2_pgentry_t l2e = l2e_from_intpte(req.val);
2364 okay = mod_l2_entry(va, l2e, mfn, type_info);
2366 break;
2367 #if CONFIG_PAGING_LEVELS >= 3
2368 case PGT_l3_page_table:
2370 l3_pgentry_t l3e = l3e_from_intpte(req.val);
2371 okay = mod_l3_entry(va, l3e, mfn);
2373 break;
2374 #endif
2375 #if CONFIG_PAGING_LEVELS >= 4
2376 case PGT_l4_page_table:
2378 l4_pgentry_t l4e = l4e_from_intpte(req.val);
2379 okay = mod_l4_entry(d, va, l4e, mfn);
2381 break;
2382 #endif
2385 put_page_type(page);
2387 break;
2389 default:
2390 not_a_pt:
2392 if ( unlikely(!get_page_type(page, PGT_writable_page)) )
2393 break;
2395 okay = paging_write_guest_entry(v, va, req.val, _mfn(mfn));
2397 put_page_type(page);
2399 break;
2402 unmap_domain_page_with_cache(va, &mapcache);
2404 put_page(page);
2405 break;
2407 case MMU_MACHPHYS_UPDATE:
2409 mfn = req.ptr >> PAGE_SHIFT;
2410 gpfn = req.val;
2412 if ( unlikely(!get_page_from_pagenr(mfn, FOREIGNDOM)) )
2414 MEM_LOG("Could not get page for mach->phys update");
2415 break;
2418 if ( unlikely(paging_mode_translate(FOREIGNDOM)) )
2420 MEM_LOG("Mach-phys update on auto-translate guest");
2421 break;
2424 set_gpfn_from_mfn(mfn, gpfn);
2425 okay = 1;
2427 paging_mark_dirty(FOREIGNDOM, mfn);
2429 put_page(mfn_to_page(mfn));
2430 break;
2432 default:
2433 MEM_LOG("Invalid page update command %x", cmd);
2434 rc = -ENOSYS;
2435 okay = 0;
2436 break;
2439 if ( unlikely(!okay) )
2441 rc = rc ? rc : -EINVAL;
2442 break;
2445 guest_handle_add_offset(ureqs, 1);
2448 process_deferred_ops();
2450 UNLOCK_BIGLOCK(d);
2452 domain_mmap_cache_destroy(&mapcache);
2454 perfc_add(num_page_updates, i);
2456 out:
2457 /* Add incremental work we have done to the @done output parameter. */
2458 if ( unlikely(!guest_handle_is_null(pdone)) )
2460 done += i;
2461 copy_to_guest(pdone, &done, 1);
2464 return rc;
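/*
 * create_grant_pte_mapping(): install a grant mapping by writing @nl1e at
 * the guest-specified PTE machine address @pte_addr. The frame containing
 * that PTE must currently be an L1 page table owned by @v's domain.
 */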
2468 static int create_grant_pte_mapping(
2469 uint64_t pte_addr, l1_pgentry_t nl1e, struct vcpu *v)
2471 int rc = GNTST_okay;
2472 void *va;
2473 unsigned long gmfn, mfn;
2474 struct page_info *page;
2475 u32 type;
2476 l1_pgentry_t ol1e;
2477 struct domain *d = v->domain;
2479 ASSERT(spin_is_locked(&d->big_lock));
2481 adjust_guest_l1e(nl1e, d);
2483 gmfn = pte_addr >> PAGE_SHIFT;
2484 mfn = gmfn_to_mfn(d, gmfn);
2486 if ( unlikely(!get_page_from_pagenr(mfn, current->domain)) )
2488 MEM_LOG("Could not get page for normal update");
2489 return GNTST_general_error;
2492 va = map_domain_page(mfn);
2493 va = (void *)((unsigned long)va + ((unsigned long)pte_addr & ~PAGE_MASK));
2494 page = mfn_to_page(mfn);
2496 type = page->u.inuse.type_info & PGT_type_mask;
2497 if ( (type != PGT_l1_page_table) || !get_page_type(page, type) )
2499 MEM_LOG("Grant map attempted to update a non-L1 page");
2500 rc = GNTST_general_error;
2501 goto failed;
2504 ol1e = *(l1_pgentry_t *)va;
2505 if ( !UPDATE_ENTRY(l1, (l1_pgentry_t *)va, ol1e, nl1e, mfn, v) )
2507 put_page_type(page);
2508 rc = GNTST_general_error;
2509 goto failed;
2512 if ( !paging_mode_refcounts(d) )
2513 put_page_from_l1e(ol1e, d);
2515 put_page_type(page);
2517 failed:
2518 unmap_domain_page(va);
2519 put_page(page);
2521 return rc;
2524 static int destroy_grant_pte_mapping(
2525 uint64_t addr, unsigned long frame, struct domain *d)
2527 int rc = GNTST_okay;
2528 void *va;
2529 unsigned long gmfn, mfn;
2530 struct page_info *page;
2531 u32 type;
2532 l1_pgentry_t ol1e;
2534 gmfn = addr >> PAGE_SHIFT;
2535 mfn = gmfn_to_mfn(d, gmfn);
2537 if ( unlikely(!get_page_from_pagenr(mfn, current->domain)) )
2539 MEM_LOG("Could not get page for normal update");
2540 return GNTST_general_error;
2543 va = map_domain_page(mfn);
2544 va = (void *)((unsigned long)va + ((unsigned long)addr & ~PAGE_MASK));
2545 page = mfn_to_page(mfn);
2547 type = page->u.inuse.type_info & PGT_type_mask;
2548 if ( (type != PGT_l1_page_table) || !get_page_type(page, type) )
2550 MEM_LOG("Grant map attempted to update a non-L1 page");
2551 rc = GNTST_general_error;
2552 goto failed;
2555 if ( __copy_from_user(&ol1e, (l1_pgentry_t *)va, sizeof(ol1e)) )
2557 put_page_type(page);
2558 rc = GNTST_general_error;
2559 goto failed;
2562 /* Check that the virtual address supplied is actually mapped to frame. */
2563 if ( unlikely((l1e_get_intpte(ol1e) >> PAGE_SHIFT) != frame) )
2565 MEM_LOG("PTE entry %lx for address %"PRIx64" doesn't match frame %lx",
2566 (unsigned long)l1e_get_intpte(ol1e), addr, frame);
2567 put_page_type(page);
2568 rc = GNTST_general_error;
2569 goto failed;
2572 /* Delete pagetable entry. */
2573 if ( unlikely(!UPDATE_ENTRY(l1,
2574 (l1_pgentry_t *)va, ol1e, l1e_empty(), mfn,
2575 d->vcpu[0] /* Change if we go to per-vcpu shadows. */)) )
2577 MEM_LOG("Cannot delete PTE entry at %p", va);
2578 put_page_type(page);
2579 rc = GNTST_general_error;
2580 goto failed;
2583 put_page_type(page);
2585 failed:
2586 unmap_domain_page(va);
2587 put_page(page);
2588 return rc;
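/*
 * create_grant_va_mapping() / replace_grant_va_mapping(): linear-address
 * variants of the above. They locate the L1 PTE backing the given address
 * via guest_map_l1e() rather than taking a PTE machine address, and
 * otherwise perform the same checked update.
 */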
2592 static int create_grant_va_mapping(
2593 unsigned long va, l1_pgentry_t nl1e, struct vcpu *v)
2595 l1_pgentry_t *pl1e, ol1e;
2596 struct domain *d = v->domain;
2597 unsigned long gl1mfn;
2598 int okay;
2600 ASSERT(spin_is_locked(&d->big_lock));
2602 adjust_guest_l1e(nl1e, d);
2604 pl1e = guest_map_l1e(v, va, &gl1mfn);
2605 if ( !pl1e )
2607 MEM_LOG("Could not find L1 PTE for address %lx", va);
2608 return GNTST_general_error;
2610 ol1e = *pl1e;
2611 okay = UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, v);
2612 guest_unmap_l1e(v, pl1e);
2613 pl1e = NULL;
2615 if ( !okay )
2616 return GNTST_general_error;
2618 if ( !paging_mode_refcounts(d) )
2619 put_page_from_l1e(ol1e, d);
2621 return GNTST_okay;
2624 static int replace_grant_va_mapping(
2625 unsigned long addr, unsigned long frame, l1_pgentry_t nl1e, struct vcpu *v)
2627 l1_pgentry_t *pl1e, ol1e;
2628 unsigned long gl1mfn;
2629 int rc = 0;
2631 pl1e = guest_map_l1e(v, addr, &gl1mfn);
2632 if ( !pl1e )
2634 MEM_LOG("Could not find L1 PTE for address %lx", addr);
2635 return GNTST_general_error;
2637 ol1e = *pl1e;
2639 /* Check that the virtual address supplied is actually mapped to frame. */
2640 if ( unlikely(l1e_get_pfn(ol1e) != frame) )
2642 MEM_LOG("PTE entry %lx for address %lx doesn't match frame %lx",
2643 l1e_get_pfn(ol1e), addr, frame);
2644 rc = GNTST_general_error;
2645 goto out;
2648 /* Delete pagetable entry. */
2649 if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, v)) )
2651 MEM_LOG("Cannot delete PTE entry at %p", (unsigned long *)pl1e);
2652 rc = GNTST_general_error;
2653 goto out;
2656 out:
2657 guest_unmap_l1e(v, pl1e);
2658 return rc;
2661 static int destroy_grant_va_mapping(
2662 unsigned long addr, unsigned long frame, struct vcpu *v)
2664 return replace_grant_va_mapping(addr, frame, l1e_empty(), v);
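/*
 * create_grant_host_mapping() / replace_grant_host_mapping(): grant-table
 * entry points. GNTMAP_contains_pte selects the PTE-machine-address
 * variants; otherwise @addr is treated as a linear address. In the replace
 * path a zero @new_addr simply destroys the existing mapping, and
 * GNTMAP_contains_pte with a non-zero @new_addr is unsupported.
 */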
2667 int create_grant_host_mapping(
2668 uint64_t addr, unsigned long frame, unsigned int flags)
2670 l1_pgentry_t pte = l1e_from_pfn(frame, GRANT_PTE_FLAGS);
2672 if ( (flags & GNTMAP_application_map) )
2673 l1e_add_flags(pte,_PAGE_USER);
2674 if ( !(flags & GNTMAP_readonly) )
2675 l1e_add_flags(pte,_PAGE_RW);
2677 if ( flags & GNTMAP_contains_pte )
2678 return create_grant_pte_mapping(addr, pte, current);
2679 return create_grant_va_mapping(addr, pte, current);
2682 int replace_grant_host_mapping(
2683 uint64_t addr, unsigned long frame, uint64_t new_addr, unsigned int flags)
2685 l1_pgentry_t *pl1e, ol1e;
2686 unsigned long gl1mfn;
2687 int rc;
2689 if ( flags & GNTMAP_contains_pte )
2691 if ( !new_addr )
2692 return destroy_grant_pte_mapping(addr, frame, current->domain);
2694 MEM_LOG("Unsupported grant table operation");
2695 return GNTST_general_error;
2698 if ( !new_addr )
2699 return destroy_grant_va_mapping(addr, frame, current);
2701 pl1e = guest_map_l1e(current, new_addr, &gl1mfn);
2702 if ( !pl1e )
2704 MEM_LOG("Could not find L1 PTE for address %lx",
2705 (unsigned long)new_addr);
2706 return GNTST_general_error;
2708 ol1e = *pl1e;
2710 if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, l1e_empty(), gl1mfn, current)) )
2712 MEM_LOG("Cannot delete PTE entry at %p", (unsigned long *)pl1e);
2713 guest_unmap_l1e(current, pl1e);
2714 return GNTST_general_error;
2717 guest_unmap_l1e(current, pl1e);
2719 rc = replace_grant_va_mapping(addr, frame, ol1e, current);
2720 if ( rc && !paging_mode_refcounts(current->domain) )
2721 put_page_from_l1e(ol1e, current->domain);
2723 return rc;
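/*
 * steal_page(): atomically detach @page from domain @d (e.g. for a grant
 * transfer). Succeeds only if the page's sole reference is the benign
 * PGC_allocated one and @d is still the owner; on success the page is
 * unlinked from @d's page list, the owner field is cleared, and tot_pages
 * is decremented unless MEMF_no_refcount is set.
 */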
2726 int steal_page(
2727 struct domain *d, struct page_info *page, unsigned int memflags)
2729 u32 _d, _nd, x, y;
2731 spin_lock(&d->page_alloc_lock);
2733 /*
2734 * The tricky bit: atomically release ownership while there is just one
2735 * benign reference to the page (PGC_allocated). If that reference
2736 * disappears then the deallocation routine will safely spin.
2737 */
2738 _d = pickle_domptr(d);
2739 _nd = page->u.inuse._domain;
2740 y = page->count_info;
2741 do {
2742 x = y;
2743 if (unlikely((x & (PGC_count_mask|PGC_allocated)) !=
2744 (1 | PGC_allocated)) || unlikely(_nd != _d)) {
2745 MEM_LOG("gnttab_transfer: Bad page %p: ed=%p(%u), sd=%p,"
2746 " caf=%08x, taf=%" PRtype_info "\n",
2747 (void *) page_to_mfn(page),
2748 d, d->domain_id, unpickle_domptr(_nd), x,
2749 page->u.inuse.type_info);
2750 spin_unlock(&d->page_alloc_lock);
2751 return -1;
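/*
 * The 64-bit compare-and-exchange below covers the adjacent count_info and
 * pickled owner fields: it succeeds only if count_info is still x and the
 * owner is still _d, in which case the owner word is replaced with NULL
 * (anonymous) while the count is left unchanged.
 */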
2753 __asm__ __volatile__(
2754 LOCK_PREFIX "cmpxchg8b %2"
2755 : "=d" (_nd), "=a" (y),
2756 "=m" (*(volatile u64 *)(&page->count_info))
2757 : "0" (_d), "1" (x), "c" (NULL), "b" (x) );
2758 } while (unlikely(_nd != _d) || unlikely(y != x));
2760 /*
2761 * Unlink from 'd'. At least one reference remains (now anonymous), so
2762 * no one else is spinning to try to delete this page from 'd'.
2763 */
2764 if ( !(memflags & MEMF_no_refcount) )
2765 d->tot_pages--;
2766 list_del(&page->list);
2768 spin_unlock(&d->page_alloc_lock);
2770 return 0;
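/*
 * do_update_va_mapping(): update the single L1 PTE backing @va with @val64,
 * then perform the flush requested in @flags: a full TLB flush or a
 * single-entry invalidation, applied locally, to all dirty CPUs of the
 * domain, or to the physical CPUs backing a guest-supplied VCPU bitmap.
 */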
2773 int do_update_va_mapping(unsigned long va, u64 val64,
2774 unsigned long flags)
2776 l1_pgentry_t val = l1e_from_intpte(val64);
2777 struct vcpu *v = current;
2778 struct domain *d = v->domain;
2779 l1_pgentry_t *pl1e;
2780 unsigned long vmask, bmap_ptr, gl1mfn;
2781 cpumask_t pmask;
2782 int rc = 0;
2784 perfc_incr(calls_to_update_va);
2786 if ( unlikely(!__addr_ok(va) && !paging_mode_external(d)) )
2787 return -EINVAL;
2789 LOCK_BIGLOCK(d);
2791 pl1e = guest_map_l1e(v, va, &gl1mfn);
2793 if ( unlikely(!pl1e || !mod_l1_entry(pl1e, val, gl1mfn)) )
2794 rc = -EINVAL;
2796 if ( pl1e )
2797 guest_unmap_l1e(v, pl1e);
2798 pl1e = NULL;
2800 process_deferred_ops();
2802 UNLOCK_BIGLOCK(d);
2804 switch ( flags & UVMF_FLUSHTYPE_MASK )
2806 case UVMF_TLB_FLUSH:
2807 switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
2809 case UVMF_LOCAL:
2810 local_flush_tlb();
2811 break;
2812 case UVMF_ALL:
2813 flush_tlb_mask(d->domain_dirty_cpumask);
2814 break;
2815 default:
2816 if ( unlikely(!is_pv_32on64_domain(d) ?
2817 get_user(vmask, (unsigned long *)bmap_ptr) :
2818 get_user(vmask, (unsigned int *)bmap_ptr)) )
2819 rc = -EFAULT;
2820 pmask = vcpumask_to_pcpumask(d, vmask);
2821 flush_tlb_mask(pmask);
2822 break;
2824 break;
2826 case UVMF_INVLPG:
2827 switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
2829 case UVMF_LOCAL:
2830 if ( !paging_mode_enabled(d)
2831 || (paging_invlpg(current, va) != 0) )
2832 local_flush_tlb_one(va);
2833 break;
2834 case UVMF_ALL:
2835 flush_tlb_one_mask(d->domain_dirty_cpumask, va);
2836 break;
2837 default:
2838 if ( unlikely(!is_pv_32on64_domain(d) ?
2839 get_user(vmask, (unsigned long *)bmap_ptr) :
2840 get_user(vmask, (unsigned int *)bmap_ptr)) )
2841 rc = -EFAULT;
2842 pmask = vcpumask_to_pcpumask(d, vmask);
2843 flush_tlb_one_mask(pmask, va);
2844 break;
2846 break;
2849 return rc;
2852 int do_update_va_mapping_otherdomain(unsigned long va, u64 val64,
2853 unsigned long flags,
2854 domid_t domid)
2856 int rc;
2858 if ( unlikely(!IS_PRIV(current->domain)) )
2859 return -EPERM;
2861 if ( !set_foreigndom(domid) )
2862 return -ESRCH;
2864 rc = do_update_va_mapping(va, val64, flags);
2866 BUG_ON(this_cpu(percpu_mm_info).deferred_ops);
2867 process_deferred_ops(); /* only to clear foreigndom */
2869 return rc;
2874 /*************************
2875 * Descriptor Tables
2876 */
2878 void destroy_gdt(struct vcpu *v)
2880 int i;
2881 unsigned long pfn;
2883 v->arch.guest_context.gdt_ents = 0;
2884 for ( i = 0; i < FIRST_RESERVED_GDT_PAGE; i++ )
2886 if ( (pfn = l1e_get_pfn(v->arch.perdomain_ptes[i])) != 0 )
2887 put_page_and_type(mfn_to_page(pfn));
2888 l1e_write(&v->arch.perdomain_ptes[i], l1e_empty());
2889 v->arch.guest_context.gdt_frames[i] = 0;
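/*
 * set_gdt(): install a new guest GDT of @entries descriptors spread over up
 * to FIRST_RESERVED_GDT_PAGE frames. Each frame is validated and typed as
 * PGT_gdt_page before the old GDT is torn down and the per-domain PTEs are
 * rewritten to point at the new frames.
 */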
2894 long set_gdt(struct vcpu *v,
2895 unsigned long *frames,
2896 unsigned int entries)
2898 struct domain *d = v->domain;
2899 /* NB. There are 512 8-byte entries per GDT page. */
2900 int i, nr_pages = (entries + 511) / 512;
2901 unsigned long mfn;
2903 if ( entries > FIRST_RESERVED_GDT_ENTRY )
2904 return -EINVAL;
2906 /* Check the pages in the new GDT. */
2907 for ( i = 0; i < nr_pages; i++ ) {
2908 mfn = frames[i] = gmfn_to_mfn(d, frames[i]);
2909 if ( !mfn_valid(mfn) ||
2910 !get_page_and_type(mfn_to_page(mfn), d, PGT_gdt_page) )
2911 goto fail;
2914 /* Tear down the old GDT. */
2915 destroy_gdt(v);
2917 /* Install the new GDT. */
2918 v->arch.guest_context.gdt_ents = entries;
2919 for ( i = 0; i < nr_pages; i++ )
2921 v->arch.guest_context.gdt_frames[i] = frames[i];
2922 l1e_write(&v->arch.perdomain_ptes[i],
2923 l1e_from_pfn(frames[i], __PAGE_HYPERVISOR));
2926 return 0;
2928 fail:
2929 while ( i-- > 0 )
2930 put_page_and_type(mfn_to_page(frames[i]));
2931 return -EINVAL;
2935 long do_set_gdt(XEN_GUEST_HANDLE(ulong) frame_list, unsigned int entries)
2937 int nr_pages = (entries + 511) / 512;
2938 unsigned long frames[16];
2939 long ret;
2941 /* Rechecked in set_gdt, but ensures a sane limit for copy_from_user(). */
2942 if ( entries > FIRST_RESERVED_GDT_ENTRY )
2943 return -EINVAL;
2945 if ( copy_from_guest(frames, frame_list, nr_pages) )
2946 return -EFAULT;
2948 LOCK_BIGLOCK(current->domain);
2950 if ( (ret = set_gdt(current, frames, entries)) == 0 )
2951 local_flush_tlb();
2953 UNLOCK_BIGLOCK(current->domain);
2955 return ret;
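/*
 * do_update_descriptor(): validate and install a single 8-byte descriptor
 * at the (guest frame, offset) location encoded in @pa. The containing
 * frame may be a GDT page, an LDT page or a plain writable page, and the
 * descriptor must pass check_descriptor().
 */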
2959 long do_update_descriptor(u64 pa, u64 desc)
2961 struct domain *dom = current->domain;
2962 unsigned long gmfn = pa >> PAGE_SHIFT;
2963 unsigned long mfn;
2964 unsigned int offset;
2965 struct desc_struct *gdt_pent, d;
2966 struct page_info *page;
2967 long ret = -EINVAL;
2969 offset = ((unsigned int)pa & ~PAGE_MASK) / sizeof(struct desc_struct);
2971 *(u64 *)&d = desc;
2973 LOCK_BIGLOCK(dom);
2975 mfn = gmfn_to_mfn(dom, gmfn);
2976 if ( (((unsigned int)pa % sizeof(struct desc_struct)) != 0) ||
2977 !mfn_valid(mfn) ||
2978 !check_descriptor(dom, &d) )
2980 UNLOCK_BIGLOCK(dom);
2981 return -EINVAL;
2984 page = mfn_to_page(mfn);
2985 if ( unlikely(!get_page(page, dom)) )
2987 UNLOCK_BIGLOCK(dom);
2988 return -EINVAL;
2991 /* Check if the given frame is in use in an unsafe context. */
2992 switch ( page->u.inuse.type_info & PGT_type_mask )
2994 case PGT_gdt_page:
2995 if ( unlikely(!get_page_type(page, PGT_gdt_page)) )
2996 goto out;
2997 break;
2998 case PGT_ldt_page:
2999 if ( unlikely(!get_page_type(page, PGT_ldt_page)) )
3000 goto out;
3001 break;
3002 default:
3003 if ( unlikely(!get_page_type(page, PGT_writable_page)) )
3004 goto out;
3005 break;
3008 paging_mark_dirty(dom, mfn);
3010 /* All is good so make the update. */
3011 gdt_pent = map_domain_page(mfn);
3012 memcpy(&gdt_pent[offset], &d, 8);
3013 unmap_domain_page(gdt_pent);
3015 put_page_type(page);
3017 ret = 0; /* success */
3019 out:
3020 put_page(page);
3022 UNLOCK_BIGLOCK(dom);
3024 return ret;
3027 typedef struct e820entry e820entry_t;
3028 DEFINE_XEN_GUEST_HANDLE(e820entry_t);
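/*
 * arch_memory_op(): x86-specific XENMEM sub-ops. Handled here:
 * add_to_physmap (map shared-info or grant-table frames into a translated
 * guest's physmap), set_memory_map / memory_map (per-domain pseudo-e820),
 * machine_memory_map (real e820, privileged callers only) and
 * machphys_mapping. Anything else is forwarded to subarch_memory_op().
 */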
3030 long arch_memory_op(int op, XEN_GUEST_HANDLE(void) arg)
3032 switch ( op )
3034 case XENMEM_add_to_physmap:
3036 struct xen_add_to_physmap xatp;
3037 unsigned long prev_mfn, mfn = 0, gpfn;
3038 struct domain *d;
3040 if ( copy_from_guest(&xatp, arg, 1) )
3041 return -EFAULT;
3043 if ( xatp.domid == DOMID_SELF )
3044 d = rcu_lock_current_domain();
3045 else if ( !IS_PRIV(current->domain) )
3046 return -EPERM;
3047 else if ( (d = rcu_lock_domain_by_id(xatp.domid)) == NULL )
3048 return -ESRCH;
3050 switch ( xatp.space )
3052 case XENMAPSPACE_shared_info:
3053 if ( xatp.idx == 0 )
3054 mfn = virt_to_mfn(d->shared_info);
3055 break;
3056 case XENMAPSPACE_grant_table:
3057 spin_lock(&d->grant_table->lock);
3059 if ( (xatp.idx >= nr_grant_frames(d->grant_table)) &&
3060 (xatp.idx < max_nr_grant_frames) )
3061 gnttab_grow_table(d, xatp.idx + 1);
3063 if ( xatp.idx < nr_grant_frames(d->grant_table) )
3064 mfn = virt_to_mfn(d->grant_table->shared[xatp.idx]);
3066 spin_unlock(&d->grant_table->lock);
3067 break;
3068 default:
3069 break;
3072 if ( !paging_mode_translate(d) || (mfn == 0) )
3074 rcu_unlock_domain(d);
3075 return -EINVAL;
3078 LOCK_BIGLOCK(d);
3080 /* Remove previously mapped page if it was present. */
3081 prev_mfn = gmfn_to_mfn(d, xatp.gpfn);
3082 if ( mfn_valid(prev_mfn) )
3084 if ( is_xen_heap_frame(mfn_to_page(prev_mfn)) )
3085 /* Xen heap frames are simply unhooked from this phys slot. */
3086 guest_physmap_remove_page(d, xatp.gpfn, prev_mfn);
3087 else
3088 /* Normal domain memory is freed, to avoid leaking memory. */
3089 guest_remove_page(d, xatp.gpfn);
3092 /* Unmap from old location, if any. */
3093 gpfn = get_gpfn_from_mfn(mfn);
3094 if ( gpfn != INVALID_M2P_ENTRY )
3095 guest_physmap_remove_page(d, gpfn, mfn);
3097 /* Map at new location. */
3098 guest_physmap_add_page(d, xatp.gpfn, mfn);
3100 UNLOCK_BIGLOCK(d);
3102 rcu_unlock_domain(d);
3104 break;
3107 case XENMEM_set_memory_map:
3109 struct xen_foreign_memory_map fmap;
3110 struct domain *d;
3111 int rc;
3113 if ( copy_from_guest(&fmap, arg, 1) )
3114 return -EFAULT;
3116 if ( fmap.map.nr_entries > ARRAY_SIZE(d->arch.e820) )
3117 return -EINVAL;
3119 if ( fmap.domid == DOMID_SELF )
3120 d = rcu_lock_current_domain();
3121 else if ( !IS_PRIV(current->domain) )
3122 return -EPERM;
3123 else if ( (d = rcu_lock_domain_by_id(fmap.domid)) == NULL )
3124 return -ESRCH;
3126 rc = copy_from_guest(d->arch.e820, fmap.map.buffer,
3127 fmap.map.nr_entries) ? -EFAULT : 0;
3128 d->arch.nr_e820 = fmap.map.nr_entries;
3130 rcu_unlock_domain(d);
3131 return rc;
3134 case XENMEM_memory_map:
3136 struct xen_memory_map map;
3137 struct domain *d = current->domain;
3139 /* Backwards compatibility. */
3140 if ( d->arch.nr_e820 == 0 )
3141 return -ENOSYS;
3143 if ( copy_from_guest(&map, arg, 1) )
3144 return -EFAULT;
3146 map.nr_entries = min(map.nr_entries, d->arch.nr_e820);
3147 if ( copy_to_guest(map.buffer, d->arch.e820, map.nr_entries) ||
3148 copy_to_guest(arg, &map, 1) )
3149 return -EFAULT;
3151 return 0;
3154 case XENMEM_machine_memory_map:
3156 struct xen_memory_map memmap;
3157 XEN_GUEST_HANDLE(e820entry_t) buffer;
3158 int count;
3160 if ( !IS_PRIV(current->domain) )
3161 return -EINVAL;
3163 if ( copy_from_guest(&memmap, arg, 1) )
3164 return -EFAULT;
3165 if ( memmap.nr_entries < e820.nr_map + 1 )
3166 return -EINVAL;
3168 buffer = guest_handle_cast(memmap.buffer, e820entry_t);
3170 count = min((unsigned int)e820.nr_map, memmap.nr_entries);
3171 if ( copy_to_guest(buffer, e820.map, count) < 0 )
3172 return -EFAULT;
3174 memmap.nr_entries = count;
3176 if ( copy_to_guest(arg, &memmap, 1) )
3177 return -EFAULT;
3179 return 0;
3182 case XENMEM_machphys_mapping:
3184 static const struct xen_machphys_mapping mapping = {
3185 .v_start = MACH2PHYS_VIRT_START,
3186 .v_end = MACH2PHYS_VIRT_END,
3187 .max_mfn = MACH2PHYS_NR_ENTRIES - 1
3188 };
3190 if ( copy_to_guest(arg, &mapping, 1) )
3191 return -EFAULT;
3193 return 0;
3196 default:
3197 return subarch_memory_op(op, arg);
3200 return 0;
3204 /*************************
3205 * Writable Pagetables
3206 */
3208 struct ptwr_emulate_ctxt {
3209 struct x86_emulate_ctxt ctxt;
3210 unsigned long cr2;
3211 l1_pgentry_t pte;
3212 };
3214 static int ptwr_emulated_read(
3215 enum x86_segment seg,
3216 unsigned long offset,
3217 unsigned long *val,
3218 unsigned int bytes,
3219 struct x86_emulate_ctxt *ctxt)
3221 unsigned int rc;
3222 unsigned long addr = offset;
3224 *val = 0;
3225 if ( (rc = copy_from_user((void *)val, (void *)addr, bytes)) != 0 )
3227 propagate_page_fault(addr + bytes - rc, 0); /* read fault */
3228 return X86EMUL_EXCEPTION;
3231 return X86EMUL_OKAY;
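/*
 * ptwr_emulated_update(): apply an emulated write (or cmpxchg, when
 * @do_cmpxchg is set) of @bytes bytes at @addr to a read-only-mapped L1
 * page table. Sub-word accesses are widened to a full PTE by reading the
 * current contents, the resulting PTE is validated via get_page_from_l1e(),
 * and the update is then applied to the live entry.
 */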
3234 static int ptwr_emulated_update(
3235 unsigned long addr,
3236 paddr_t old,
3237 paddr_t val,
3238 unsigned int bytes,
3239 unsigned int do_cmpxchg,
3240 struct ptwr_emulate_ctxt *ptwr_ctxt)
3242 unsigned long mfn;
3243 unsigned long unaligned_addr = addr;
3244 struct page_info *page;
3245 l1_pgentry_t pte, ol1e, nl1e, *pl1e;
3246 struct vcpu *v = current;
3247 struct domain *d = v->domain;
3249 /* Only allow naturally-aligned stores within the original %cr2 page. */
3250 if ( unlikely(((addr^ptwr_ctxt->cr2) & PAGE_MASK) || (addr & (bytes-1))) )
3252 MEM_LOG("Bad ptwr access (cr2=%lx, addr=%lx, bytes=%u)",
3253 ptwr_ctxt->cr2, addr, bytes);
3254 return X86EMUL_UNHANDLEABLE;
3257 /* Turn a sub-word access into a full-word access. */
3258 if ( bytes != sizeof(paddr_t) )
3260 paddr_t full;
3261 unsigned int rc, offset = addr & (sizeof(paddr_t)-1);
3263 /* Align address; read full word. */
3264 addr &= ~(sizeof(paddr_t)-1);
3265 if ( (rc = copy_from_user(&full, (void *)addr, sizeof(paddr_t))) != 0 )
3267 propagate_page_fault(addr+sizeof(paddr_t)-rc, 0); /* read fault */
3268 return X86EMUL_EXCEPTION;
3270 /* Mask out bits provided by caller. */
3271 full &= ~((((paddr_t)1 << (bytes*8)) - 1) << (offset*8));
3272 /* Shift the caller value and OR in the missing bits. */
3273 val &= (((paddr_t)1 << (bytes*8)) - 1);
3274 val <<= (offset)*8;
3275 val |= full;
3276 /* Also fill in missing parts of the cmpxchg old value. */
3277 old &= (((paddr_t)1 << (bytes*8)) - 1);
3278 old <<= (offset)*8;
3279 old |= full;
3282 pte = ptwr_ctxt->pte;
3283 mfn = l1e_get_pfn(pte);
3284 page = mfn_to_page(mfn);
3286 /* We are looking only for read-only mappings of p.t. pages. */
3287 ASSERT((l1e_get_flags(pte) & (_PAGE_RW|_PAGE_PRESENT)) == _PAGE_PRESENT);
3288 ASSERT(mfn_valid(mfn));
3289 ASSERT((page->u.inuse.type_info & PGT_type_mask) == PGT_l1_page_table);
3290 ASSERT((page->u.inuse.type_info & PGT_count_mask) != 0);
3291 ASSERT(page_get_owner(page) == d);
3293 /* Check the new PTE. */
3294 nl1e = l1e_from_intpte(val);
3295 if ( unlikely(!get_page_from_l1e(nl1e, d)) )
3297 if ( (CONFIG_PAGING_LEVELS >= 3) && is_pv_32bit_domain(d) &&
3298 (bytes == 4) && (unaligned_addr & 4) && !do_cmpxchg &&
3299 (l1e_get_flags(nl1e) & _PAGE_PRESENT) )
3301 /*
3302 * If this is an upper-half write to a PAE PTE then we assume that
3303 * the guest has simply got the two writes the wrong way round. We
3304 * zap the PRESENT bit on the assumption that the bottom half will
3305 * be written immediately after we return to the guest.
3306 */
3307 MEM_LOG("ptwr_emulate: fixing up invalid PAE PTE %"PRIpte,
3308 l1e_get_intpte(nl1e));
3309 l1e_remove_flags(nl1e, _PAGE_PRESENT);
3311 else
3313 MEM_LOG("ptwr_emulate: could not get_page_from_l1e()");
3314 return X86EMUL_UNHANDLEABLE;
3318 adjust_guest_l1e(nl1e, d);
3320 /* Checked successfully: do the update (write or cmpxchg). */
3321 pl1e = map_domain_page(mfn);
3322 pl1e = (l1_pgentry_t *)((unsigned long)pl1e + (addr & ~PAGE_MASK));
3323 if ( do_cmpxchg )
3325 int okay;
3326 intpte_t t = old;
3327 ol1e = l1e_from_intpte(old);
3329 okay = paging_cmpxchg_guest_entry(v, &l1e_get_intpte(*pl1e),
3330 &t, val, _mfn(mfn));
3331 okay = (okay && t == old);
3333 if ( !okay )
3335 unmap_domain_page(pl1e);
3336 put_page_from_l1e(nl1e, d);
3337 return X86EMUL_CMPXCHG_FAILED;
3340 else
3342 ol1e = *pl1e;
3343 if ( !UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, mfn, v) )
3344 BUG();
3347 unmap_domain_page(pl1e);
3349 /* Finally, drop the old PTE. */
3350 put_page_from_l1e(ol1e, d);
3352 return X86EMUL_OKAY;
3355 static int ptwr_emulated_write(
3356 enum x86_segment seg,
3357 unsigned long offset,
3358 unsigned long val,
3359 unsigned int bytes,
3360 struct x86_emulate_ctxt *ctxt)
3362 return ptwr_emulated_update(
3363 offset, 0, val, bytes, 0,
3364 container_of(ctxt, struct ptwr_emulate_ctxt, ctxt));
3367 static int ptwr_emulated_cmpxchg(
3368 enum x86_segment seg,
3369 unsigned long offset,
3370 unsigned long old,
3371 unsigned long new,
3372 unsigned int bytes,
3373 struct x86_emulate_ctxt *ctxt)
3375 return ptwr_emulated_update(
3376 offset, old, new, bytes, 1,
3377 container_of(ctxt, struct ptwr_emulate_ctxt, ctxt));
3380 static int ptwr_emulated_cmpxchg8b(
3381 enum x86_segment seg,
3382 unsigned long offset,
3383 unsigned long old,
3384 unsigned long old_hi,
3385 unsigned long new,
3386 unsigned long new_hi,
3387 struct x86_emulate_ctxt *ctxt)
3389 if ( CONFIG_PAGING_LEVELS == 2 )
3390 return X86EMUL_UNHANDLEABLE;
3391 return ptwr_emulated_update(
3392 offset, ((u64)old_hi << 32) | old, ((u64)new_hi << 32) | new, 8, 1,
3393 container_of(ctxt, struct ptwr_emulate_ctxt, ctxt));
3396 static struct x86_emulate_ops ptwr_emulate_ops = {
3397 .read = ptwr_emulated_read,
3398 .insn_fetch = ptwr_emulated_read,
3399 .write = ptwr_emulated_write,
3400 .cmpxchg = ptwr_emulated_cmpxchg,
3401 .cmpxchg8b = ptwr_emulated_cmpxchg8b
3402 };
3404 /* Write page fault handler: check if guest is trying to modify a PTE. */
3405 int ptwr_do_page_fault(struct vcpu *v, unsigned long addr,
3406 struct cpu_user_regs *regs)
3408 struct domain *d = v->domain;
3409 struct page_info *page;
3410 l1_pgentry_t pte;
3411 struct ptwr_emulate_ctxt ptwr_ctxt;
3412 int rc;
3414 LOCK_BIGLOCK(d);
3416 /* Attempt to read the PTE that maps the VA being accessed. */
3417 guest_get_eff_l1e(v, addr, &pte);
3418 page = l1e_get_page(pte);
3420 /* We are looking only for read-only mappings of p.t. pages. */
3421 if ( ((l1e_get_flags(pte) & (_PAGE_PRESENT|_PAGE_RW)) != _PAGE_PRESENT) ||
3422 !mfn_valid(l1e_get_pfn(pte)) ||
3423 ((page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table) ||
3424 ((page->u.inuse.type_info & PGT_count_mask) == 0) ||
3425 (page_get_owner(page) != d) )
3426 goto bail;
3428 ptwr_ctxt.ctxt.regs = regs;
3429 ptwr_ctxt.ctxt.addr_size = ptwr_ctxt.ctxt.sp_size =
3430 is_pv_32on64_domain(d) ? 32 : BITS_PER_LONG;
3431 ptwr_ctxt.cr2 = addr;
3432 ptwr_ctxt.pte = pte;
3434 rc = x86_emulate(&ptwr_ctxt.ctxt, &ptwr_emulate_ops);
3435 if ( rc == X86EMUL_UNHANDLEABLE )
3436 goto bail;
3438 UNLOCK_BIGLOCK(d);
3439 perfc_incr(ptwr_emulations);
3440 return EXCRET_fault_fixed;
3442 bail:
3443 UNLOCK_BIGLOCK(d);
3444 return 0;
3447 void free_xen_pagetable(void *v)
3449 extern int early_boot;
3451 BUG_ON(early_boot);
3453 if ( is_xen_heap_frame(virt_to_page(v)) )
3454 free_xenheap_page(v);
3455 else
3456 free_domheap_page(virt_to_page(v));
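/*
 * map_pages_to_xen(): map @nr_mfns machine frames contiguously at Xen
 * virtual address @virt. Superpage (PSE) mappings are used when address,
 * frame number and count are suitably aligned and MAP_SMALL_PAGES is not
 * requested; existing superpages are shattered into L1 tables when a 4kB
 * mapping must be installed inside them.
 */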
3459 int map_pages_to_xen(
3460 unsigned long virt,
3461 unsigned long mfn,
3462 unsigned long nr_mfns,
3463 unsigned long flags)
3465 l2_pgentry_t *pl2e, ol2e;
3466 l1_pgentry_t *pl1e, ol1e;
3467 unsigned int i;
3469 unsigned int map_small_pages = !!(flags & MAP_SMALL_PAGES);
3470 flags &= ~MAP_SMALL_PAGES;
3472 while ( nr_mfns != 0 )
3474 pl2e = virt_to_xen_l2e(virt);
3476 if ( ((((virt>>PAGE_SHIFT) | mfn) & ((1<<PAGETABLE_ORDER)-1)) == 0) &&
3477 (nr_mfns >= (1<<PAGETABLE_ORDER)) &&
3478 !map_small_pages )
3480 /* Super-page mapping. */
3481 ol2e = *pl2e;
3482 l2e_write_atomic(pl2e, l2e_from_pfn(mfn, flags|_PAGE_PSE));
3484 if ( (l2e_get_flags(ol2e) & _PAGE_PRESENT) )
3486 local_flush_tlb_pge();
3487 if ( !(l2e_get_flags(ol2e) & _PAGE_PSE) )
3488 free_xen_pagetable(mfn_to_virt(l2e_get_pfn(ol2e)));
3491 virt += 1UL << L2_PAGETABLE_SHIFT;
3492 mfn += 1UL << PAGETABLE_ORDER;
3493 nr_mfns -= 1UL << PAGETABLE_ORDER;
3495 else
3497 /* Normal page mapping. */
3498 if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) )
3500 pl1e = alloc_xen_pagetable();
3501 clear_page(pl1e);
3502 l2e_write(pl2e, l2e_from_pfn(virt_to_mfn(pl1e),
3503 __PAGE_HYPERVISOR));
3505 else if ( l2e_get_flags(*pl2e) & _PAGE_PSE )
3507 pl1e = alloc_xen_pagetable();
3508 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
3509 l1e_write(&pl1e[i],
3510 l1e_from_pfn(l2e_get_pfn(*pl2e) + i,
3511 l2e_get_flags(*pl2e) & ~_PAGE_PSE));
3512 l2e_write_atomic(pl2e, l2e_from_pfn(virt_to_mfn(pl1e),
3513 __PAGE_HYPERVISOR));
3514 local_flush_tlb_pge();
3517 pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(virt);
3518 ol1e = *pl1e;
3519 l1e_write_atomic(pl1e, l1e_from_pfn(mfn, flags));
3520 if ( (l1e_get_flags(ol1e) & _PAGE_PRESENT) )
3521 local_flush_tlb_one(virt);
3523 virt += 1UL << L1_PAGETABLE_SHIFT;
3524 mfn += 1UL;
3525 nr_mfns -= 1UL;
3529 return 0;
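/*
 * destroy_xen_mappings(): remove all Xen mappings in the page-aligned range
 * [s, e). Whole superpages are zapped directly; partially covered ones are
 * shattered first. L1 tables that become empty are freed and their L2
 * entries cleared, and the TLBs are flushed at the end.
 */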
3532 void destroy_xen_mappings(unsigned long s, unsigned long e)
3534 l2_pgentry_t *pl2e;
3535 l1_pgentry_t *pl1e;
3536 unsigned int i;
3537 unsigned long v = s;
3539 ASSERT((s & ~PAGE_MASK) == 0);
3540 ASSERT((e & ~PAGE_MASK) == 0);
3542 while ( v < e )
3544 pl2e = virt_to_xen_l2e(v);
3546 if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) )
3548 v += 1UL << L2_PAGETABLE_SHIFT;
3549 v &= ~((1UL << L2_PAGETABLE_SHIFT) - 1);
3550 continue;
3553 if ( l2e_get_flags(*pl2e) & _PAGE_PSE )
3555 if ( (l1_table_offset(v) == 0) &&
3556 ((e-v) >= (1UL << L2_PAGETABLE_SHIFT)) )
3558 /* PSE: whole superpage is destroyed. */
3559 l2e_write_atomic(pl2e, l2e_empty());
3560 v += 1UL << L2_PAGETABLE_SHIFT;
3562 else
3564 /* PSE: shatter the superpage and try again. */
3565 pl1e = alloc_xen_pagetable();
3566 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
3567 l1e_write(&pl1e[i],
3568 l1e_from_pfn(l2e_get_pfn(*pl2e) + i,
3569 l2e_get_flags(*pl2e) & ~_PAGE_PSE));
3570 l2e_write_atomic(pl2e, l2e_from_pfn(virt_to_mfn(pl1e),
3571 __PAGE_HYPERVISOR));
3574 else
3576 /* Ordinary 4kB mapping. */
3577 pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(v);
3578 l1e_write_atomic(pl1e, l1e_empty());
3579 v += PAGE_SIZE;
3581 /* If we are done with the L2E, check if it is now empty. */
3582 if ( (v != e) && (l1_table_offset(v) != 0) )
3583 continue;
3584 pl1e = l2e_to_l1e(*pl2e);
3585 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
3586 if ( l1e_get_intpte(pl1e[i]) != 0 )
3587 break;
3588 if ( i == L1_PAGETABLE_ENTRIES )
3590 /* Empty: zap the L2E and free the L1 page. */
3591 l2e_write_atomic(pl2e, l2e_empty());
3592 free_xen_pagetable(pl1e);
3597 flush_tlb_all_pge();
3600 void __set_fixmap(
3601 enum fixed_addresses idx, unsigned long mfn, unsigned long flags)
3603 BUG_ON(idx >= __end_of_fixed_addresses);
3604 map_pages_to_xen(fix_to_virt(idx), mfn, 1, flags);
3607 #ifdef MEMORY_GUARD
3609 void memguard_init(void)
3611 map_pages_to_xen(
3612 (unsigned long)__va(xen_phys_start),
3613 xen_phys_start >> PAGE_SHIFT,
3614 (xenheap_phys_end - xen_phys_start) >> PAGE_SHIFT,
3615 __PAGE_HYPERVISOR|MAP_SMALL_PAGES);
3616 #ifdef __x86_64__
3617 map_pages_to_xen(
3618 XEN_VIRT_START,
3619 xen_phys_start >> PAGE_SHIFT,
3620 (__pa(&_end) + PAGE_SIZE - 1 - xen_phys_start) >> PAGE_SHIFT,
3621 __PAGE_HYPERVISOR|MAP_SMALL_PAGES);
3622 #endif
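/*
 * __memguard_change_range(): remap the given page-aligned range with 4kB
 * mappings, clearing _PAGE_PRESENT when guarding (so any access faults) and
 * restoring a normal hypervisor mapping when unguarding.
 */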
3625 static void __memguard_change_range(void *p, unsigned long l, int guard)
3627 unsigned long _p = (unsigned long)p;
3628 unsigned long _l = (unsigned long)l;
3629 unsigned long flags = __PAGE_HYPERVISOR | MAP_SMALL_PAGES;
3631 /* Ensure we are dealing with a page-aligned whole number of pages. */
3632 ASSERT((_p&PAGE_MASK) != 0);
3633 ASSERT((_l&PAGE_MASK) != 0);
3634 ASSERT((_p&~PAGE_MASK) == 0);
3635 ASSERT((_l&~PAGE_MASK) == 0);
3637 if ( guard )
3638 flags &= ~_PAGE_PRESENT;
3640 map_pages_to_xen(
3641 _p, virt_to_maddr(p) >> PAGE_SHIFT, _l >> PAGE_SHIFT, flags);
3644 void memguard_guard_range(void *p, unsigned long l)
3646 __memguard_change_range(p, l, 1);
3649 void memguard_unguard_range(void *p, unsigned long l)
3651 __memguard_change_range(p, l, 0);
3654 #endif
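/*
 * memguard_guard_stack(): place a guard page immediately below the primary
 * stack area within a STACK_SIZE allocation, so that a primary-stack
 * overrun faults in Xen rather than silently corrupting the pages beneath.
 */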
3656 void memguard_guard_stack(void *p)
3658 BUILD_BUG_ON((PRIMARY_STACK_SIZE + PAGE_SIZE) > STACK_SIZE);
3659 p = (void *)((unsigned long)p + STACK_SIZE -
3660 PRIMARY_STACK_SIZE - PAGE_SIZE);
3661 memguard_guard_range(p, PAGE_SIZE);
3664 /*
3665 * Local variables:
3666 * mode: C
3667 * c-set-style: "BSD"
3668 * c-basic-offset: 4
3669 * tab-width: 4
3670 * indent-tabs-mode: nil
3671 * End:
3672 */