direct-io.hg

view xen/arch/x86/mm.c @ 12803:df5fa63490f4

[XEN] Implement XENMEM_set_memory_map, which specifies memory map to
be returned by XENMEM_memory_map. Hook this into the domain builder.

Based on a patch by Glauber de Oliveira Costa <gcosta@redhat.com>

Signed-off-by: Keir Fraser <keir@xensource.com>
author kfraser@localhost.localdomain
date Fri Dec 08 11:30:30 2006 +0000 (2006-12-08)
parents 1d83974d08b1
children 37141c3a3d39
line source
1 /******************************************************************************
2 * arch/x86/mm.c
3 *
4 * Copyright (c) 2002-2005 K A Fraser
5 * Copyright (c) 2004 Christian Limpach
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
22 /*
23 * A description of the x86 page table API:
24 *
25 * Domains trap to do_mmu_update with a list of update requests.
26 * This is a list of (ptr, val) pairs, where the requested operation
27 * is *ptr = val.
28 *
29 * Reference counting of pages:
30 * ----------------------------
31 * Each page has two refcounts: tot_count and type_count.
32 *
33 * TOT_COUNT is the obvious reference count. It counts all uses of a
34 * physical page frame by a domain, including uses as a page directory,
35 * a page table, or simple mappings via a PTE. This count prevents a
36 * domain from releasing a frame back to the free pool when it still holds
37 * a reference to it.
38 *
39 * TYPE_COUNT is more subtle. A frame can be put to one of three
40 * mutually-exclusive uses: it might be used as a page directory, or a
41 * page table, or it may be mapped writable by the domain [of course, a
42 * frame may not be used in any of these three ways!].
43 * So, type_count is a count of the number of times a frame is being
44 * referred to in its current incarnation. Therefore, a page can only
45 * change its type when its type count is zero.
46 *
47 * Pinning the page type:
48 * ----------------------
49 * The type of a page can be pinned/unpinned with the commands
50 * MMUEXT_[UN]PIN_L?_TABLE. Each page can be pinned exactly once (that is,
51 * pinning is not reference counted, so it can't be nested).
52 * This is useful to prevent a page's type count falling to zero, at which
53 * point safety checks would need to be carried out next time the count
54 * is increased again.
55 *
56 * A further note on writable page mappings:
57 * -----------------------------------------
58 * For simplicity, the count of writable mappings for a page may not
59 * correspond to reality. The 'writable count' is incremented for every
60 * PTE which maps the page with the _PAGE_RW flag set. However, for
61 * write access to be possible the page directory entry must also have
62 * its _PAGE_RW bit set. We do not check this as it complicates the
63 * reference counting considerably [consider the case of multiple
64 * directory entries referencing a single page table, some with the RW
65 * bit set, others not -- it starts getting a bit messy].
66 * In normal use, this simplification shouldn't be a problem.
67 * However, the logic can be added if required.
68 *
69 * One more note on read-only page mappings:
70 * -----------------------------------------
71 * We want domains to be able to map pages for read-only access. The
72 * main reason is that page tables and directories should be readable
73 * by a domain, but it would not be safe for them to be writable.
74 * However, domains have free access to rings 1 & 2 of the Intel
75 * privilege model. In terms of page protection, these are considered
76 * to be part of 'supervisor mode'. The WP bit in CR0 controls whether
77 * read-only restrictions are respected in supervisor mode -- if the
78 * bit is clear then any mapped page is writable.
79 *
80 * We get round this by always setting the WP bit and disallowing
81 * updates to it. This is very unlikely to cause a problem for guest
82 * OS's, which will generally use the WP bit to simplify copy-on-write
83 * implementation (in that case, OS wants a fault when it writes to
84 * an application-supplied buffer).
85 */
87 #include <xen/config.h>
88 #include <xen/init.h>
89 #include <xen/kernel.h>
90 #include <xen/lib.h>
91 #include <xen/mm.h>
92 #include <xen/domain.h>
93 #include <xen/sched.h>
94 #include <xen/errno.h>
95 #include <xen/perfc.h>
96 #include <xen/irq.h>
97 #include <xen/softirq.h>
98 #include <xen/domain_page.h>
99 #include <xen/event.h>
100 #include <xen/iocap.h>
101 #include <xen/guest_access.h>
102 #include <asm/shadow.h>
103 #include <asm/page.h>
104 #include <asm/flushtlb.h>
105 #include <asm/io.h>
106 #include <asm/ldt.h>
107 #include <asm/x86_emulate.h>
108 #include <asm/e820.h>
109 #include <public/memory.h>
111 #define MEM_LOG(_f, _a...) gdprintk(XENLOG_WARNING , _f "\n" , ## _a)
113 /*
114 * PTE updates can be done with ordinary writes except:
115 * 1. Debug builds get extra checking by using CMPXCHG[8B].
116 * 2. PAE builds perform an atomic 8-byte store with CMPXCHG8B.
117 */
118 #if !defined(NDEBUG) || defined(CONFIG_X86_PAE)
119 #define PTE_UPDATE_WITH_CMPXCHG
120 #endif
122 /*
123 * Both do_mmuext_op() and do_mmu_update():
124 * We steal the m.s.b. of the @count parameter to indicate whether this
125 * invocation of do_mmu_update() is resuming a previously preempted call.
126 */
127 #define MMU_UPDATE_PREEMPTED (~(~0U>>1))
129 static void free_l2_table(struct page_info *page);
130 static void free_l1_table(struct page_info *page);
132 static int mod_l2_entry(l2_pgentry_t *, l2_pgentry_t, unsigned long,
133 unsigned long type);
134 static int mod_l1_entry(l1_pgentry_t *, l1_pgentry_t, unsigned long gl1mfn);
136 /* Used to defer flushing of memory structures. */
137 struct percpu_mm_info {
138 #define DOP_FLUSH_TLB (1<<0) /* Flush the local TLB. */
139 #define DOP_FLUSH_ALL_TLBS (1<<1) /* Flush TLBs of all VCPUs of current dom. */
140 #define DOP_RELOAD_LDT (1<<2) /* Reload the LDT shadow mapping. */
141 unsigned int deferred_ops;
142 /* If non-NULL, specifies a foreign subject domain for some operations. */
143 struct domain *foreign;
144 };
145 static DEFINE_PER_CPU(struct percpu_mm_info, percpu_mm_info);
147 /*
148 * Returns the current foreign domain; defaults to the currently-executing
149 * domain if a foreign override hasn't been specified.
150 */
151 #define FOREIGNDOM (this_cpu(percpu_mm_info).foreign ?: current->domain)
153 /* Private domain structs for DOMID_XEN and DOMID_IO. */
154 static struct domain *dom_xen, *dom_io;
156 /* Frame table and its size in pages. */
157 struct page_info *frame_table;
158 unsigned long max_page;
159 unsigned long total_pages;
161 void __init init_frametable(void)
162 {
163 unsigned long nr_pages, page_step, i, mfn;
165 frame_table = (struct page_info *)FRAMETABLE_VIRT_START;
167 nr_pages = PFN_UP(max_page * sizeof(*frame_table));
168 page_step = (1 << L2_PAGETABLE_SHIFT) >> PAGE_SHIFT;
170 for ( i = 0; i < nr_pages; i += page_step )
171 {
172 mfn = alloc_boot_pages(min(nr_pages - i, page_step), page_step);
173 if ( mfn == 0 )
174 panic("Not enough memory for frame table\n");
175 map_pages_to_xen(
176 FRAMETABLE_VIRT_START + (i << PAGE_SHIFT),
177 mfn, page_step, PAGE_HYPERVISOR);
178 }
180 memset(frame_table, 0, nr_pages << PAGE_SHIFT);
181 }
183 void arch_init_memory(void)
184 {
185 extern void subarch_init_memory(void);
187 unsigned long i, pfn, rstart_pfn, rend_pfn;
189 /*
190 * Initialise our DOMID_XEN domain.
191 * Any Xen-heap pages that we will allow to be mapped will have
192 * their domain field set to dom_xen.
193 */
194 dom_xen = alloc_domain(DOMID_XEN);
195 BUG_ON(dom_xen == NULL);
197 /*
198 * Initialise our DOMID_IO domain.
199 * This domain owns I/O pages that are within the range of the page_info
200 * array. Mappings occur at the priv of the caller.
201 */
202 dom_io = alloc_domain(DOMID_IO);
203 BUG_ON(dom_io == NULL);
205 /* First 1MB of RAM is historically marked as I/O. */
206 for ( i = 0; i < 0x100; i++ )
207 share_xen_page_with_guest(mfn_to_page(i), dom_io, XENSHARE_writable);
209 /* Any areas not specified as RAM by the e820 map are considered I/O. */
210 for ( i = 0, pfn = 0; i < e820.nr_map; i++ )
211 {
212 if ( e820.map[i].type != E820_RAM )
213 continue;
214 /* Every page from cursor to start of next RAM region is I/O. */
215 rstart_pfn = PFN_UP(e820.map[i].addr);
216 rend_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
217 for ( ; pfn < rstart_pfn; pfn++ )
218 {
219 BUG_ON(!mfn_valid(pfn));
220 share_xen_page_with_guest(
221 mfn_to_page(pfn), dom_io, XENSHARE_writable);
222 }
223 /* Skip the RAM region. */
224 pfn = rend_pfn;
225 }
226 BUG_ON(pfn != max_page);
228 subarch_init_memory();
229 }
231 int memory_is_conventional_ram(paddr_t p)
232 {
233 int i;
235 for ( i = 0; i < e820.nr_map; i++ )
236 {
237 if ( (e820.map[i].type == E820_RAM) &&
238 (e820.map[i].addr <= p) &&
239 (e820.map[i].size > p) )
240 return 1;
241 }
243 return 0;
244 }
246 void share_xen_page_with_guest(
247 struct page_info *page, struct domain *d, int readonly)
248 {
249 if ( page_get_owner(page) == d )
250 return;
252 set_gpfn_from_mfn(page_to_mfn(page), INVALID_M2P_ENTRY);
254 spin_lock(&d->page_alloc_lock);
256 /* The incremented type count pins as writable or read-only. */
257 page->u.inuse.type_info = (readonly ? PGT_none : PGT_writable_page);
258 page->u.inuse.type_info |= PGT_validated | 1;
260 page_set_owner(page, d);
261 wmb(); /* install valid domain ptr before updating refcnt. */
262 ASSERT(page->count_info == 0);
263 page->count_info |= PGC_allocated | 1;
265 if ( unlikely(d->xenheap_pages++ == 0) )
266 get_knownalive_domain(d);
267 list_add_tail(&page->list, &d->xenpage_list);
269 spin_unlock(&d->page_alloc_lock);
270 }
272 void share_xen_page_with_privileged_guests(
273 struct page_info *page, int readonly)
274 {
275 share_xen_page_with_guest(page, dom_xen, readonly);
276 }
278 #if defined(CONFIG_X86_PAE)
280 #ifdef NDEBUG
281 /* Only PDPTs above 4GB boundary need to be shadowed in low memory. */
282 #define l3tab_needs_shadow(mfn) ((mfn) >= 0x100000)
283 #else
284 /*
285 * In debug builds we shadow a selection of <4GB PDPTs to exercise code paths.
286 * We cannot safely shadow the idle page table, nor shadow (v1) page tables
287 * (detected by lack of an owning domain). As required for correctness, we
288 * always shadow PDPTs above 4GB.
289 */
290 #define l3tab_needs_shadow(mfn) \
291 (((((mfn) << PAGE_SHIFT) != __pa(idle_pg_table)) && \
292 (page_get_owner(mfn_to_page(mfn)) != NULL) && \
293 ((mfn) & 1)) || /* odd MFNs are shadowed */ \
294 ((mfn) >= 0x100000))
295 #endif
297 static l1_pgentry_t *fix_pae_highmem_pl1e;
299 /* Cache the address of PAE high-memory fixmap page tables. */
300 static int __init cache_pae_fixmap_address(void)
301 {
302 unsigned long fixmap_base = fix_to_virt(FIX_PAE_HIGHMEM_0);
303 l2_pgentry_t *pl2e = virt_to_xen_l2e(fixmap_base);
304 fix_pae_highmem_pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(fixmap_base);
305 return 0;
306 }
307 __initcall(cache_pae_fixmap_address);
309 static DEFINE_PER_CPU(u32, make_cr3_timestamp);
311 void make_cr3(struct vcpu *v, unsigned long mfn)
312 /* Takes the MFN of a PAE l3 table, copies the contents to below 4GB if
313 * necessary, and sets v->arch.cr3 to the value to load in CR3. */
314 {
315 l3_pgentry_t *highmem_l3tab, *lowmem_l3tab;
316 struct pae_l3_cache *cache = &v->arch.pae_l3_cache;
317 unsigned int cpu = smp_processor_id();
319 /* Fast path: does this mfn need a shadow at all? */
320 if ( !l3tab_needs_shadow(mfn) )
321 {
322 v->arch.cr3 = mfn << PAGE_SHIFT;
323 /* Cache is no longer in use or valid */
324 cache->high_mfn = 0;
325 return;
326 }
328 /* Caching logic is not interrupt safe. */
329 ASSERT(!in_irq());
331 /* Protects against pae_flush_pgd(). */
332 spin_lock(&cache->lock);
334 cache->inuse_idx ^= 1;
335 cache->high_mfn = mfn;
337 /* Map the guest L3 table and copy to the chosen low-memory cache. */
338 l1e_write(fix_pae_highmem_pl1e-cpu, l1e_from_pfn(mfn, __PAGE_HYPERVISOR));
339 /* First check the previous high mapping can't be in the TLB.
340 * (i.e. have we loaded CR3 since we last did this?) */
341 if ( unlikely(this_cpu(make_cr3_timestamp) == this_cpu(tlbflush_time)) )
342 local_flush_tlb_one(fix_to_virt(FIX_PAE_HIGHMEM_0 + cpu));
343 highmem_l3tab = (l3_pgentry_t *)fix_to_virt(FIX_PAE_HIGHMEM_0 + cpu);
344 lowmem_l3tab = cache->table[cache->inuse_idx];
345 memcpy(lowmem_l3tab, highmem_l3tab, sizeof(cache->table[0]));
346 l1e_write(fix_pae_highmem_pl1e-cpu, l1e_empty());
347 this_cpu(make_cr3_timestamp) = this_cpu(tlbflush_time);
349 v->arch.cr3 = __pa(lowmem_l3tab);
351 spin_unlock(&cache->lock);
352 }
354 #else /* !CONFIG_X86_PAE */
356 void make_cr3(struct vcpu *v, unsigned long mfn)
357 {
358 v->arch.cr3 = mfn << PAGE_SHIFT;
359 }
361 #endif /* !CONFIG_X86_PAE */
363 void write_ptbase(struct vcpu *v)
364 {
365 write_cr3(v->arch.cr3);
366 }
368 void invalidate_shadow_ldt(struct vcpu *v)
369 {
370 int i;
371 unsigned long pfn;
372 struct page_info *page;
374 if ( v->arch.shadow_ldt_mapcnt == 0 )
375 return;
377 v->arch.shadow_ldt_mapcnt = 0;
379 for ( i = 16; i < 32; i++ )
380 {
381 pfn = l1e_get_pfn(v->arch.perdomain_ptes[i]);
382 if ( pfn == 0 ) continue;
383 l1e_write(&v->arch.perdomain_ptes[i], l1e_empty());
384 page = mfn_to_page(pfn);
385 ASSERT_PAGE_IS_TYPE(page, PGT_ldt_page);
386 ASSERT_PAGE_IS_DOMAIN(page, v->domain);
387 put_page_and_type(page);
388 }
390 /* Dispose of the (now possibly invalid) mappings from the TLB. */
391 ASSERT(v->processor == smp_processor_id());
392 this_cpu(percpu_mm_info).deferred_ops |= DOP_FLUSH_TLB | DOP_RELOAD_LDT;
393 }
396 static int alloc_segdesc_page(struct page_info *page)
397 {
398 struct desc_struct *descs;
399 int i;
401 descs = map_domain_page(page_to_mfn(page));
403 for ( i = 0; i < 512; i++ )
404 if ( unlikely(!check_descriptor(&descs[i])) )
405 goto fail;
407 unmap_domain_page(descs);
408 return 1;
410 fail:
411 unmap_domain_page(descs);
412 return 0;
413 }
416 /* Map shadow page at offset @off. */
417 int map_ldt_shadow_page(unsigned int off)
418 {
419 struct vcpu *v = current;
420 struct domain *d = v->domain;
421 unsigned long gmfn, mfn;
422 l1_pgentry_t l1e, nl1e;
423 unsigned long gva = v->arch.guest_context.ldt_base + (off << PAGE_SHIFT);
424 int okay;
426 BUG_ON(unlikely(in_irq()));
428 guest_get_eff_kern_l1e(v, gva, &l1e);
429 if ( unlikely(!(l1e_get_flags(l1e) & _PAGE_PRESENT)) )
430 return 0;
432 gmfn = l1e_get_pfn(l1e);
433 mfn = gmfn_to_mfn(d, gmfn);
434 if ( unlikely(!mfn_valid(mfn)) )
435 return 0;
437 okay = get_page_and_type(mfn_to_page(mfn), d, PGT_ldt_page);
439 if ( !okay && unlikely(shadow_mode_refcounts(d)) )
440 {
441 shadow_lock(d);
442 shadow_remove_write_access(d->vcpu[0], _mfn(mfn), 0, 0);
443 okay = get_page_and_type(mfn_to_page(mfn), d, PGT_ldt_page);
444 shadow_unlock(d);
445 }
447 if ( unlikely(!okay) )
448 return 0;
450 nl1e = l1e_from_pfn(mfn, l1e_get_flags(l1e) | _PAGE_RW);
452 l1e_write(&v->arch.perdomain_ptes[off + 16], nl1e);
453 v->arch.shadow_ldt_mapcnt++;
455 return 1;
456 }
459 static int get_page_from_pagenr(unsigned long page_nr, struct domain *d)
460 {
461 struct page_info *page = mfn_to_page(page_nr);
463 if ( unlikely(!mfn_valid(page_nr)) || unlikely(!get_page(page, d)) )
464 {
465 MEM_LOG("Could not get page ref for pfn %lx", page_nr);
466 return 0;
467 }
469 return 1;
470 }
473 static int get_page_and_type_from_pagenr(unsigned long page_nr,
474 unsigned long type,
475 struct domain *d)
476 {
477 struct page_info *page = mfn_to_page(page_nr);
479 if ( unlikely(!get_page_from_pagenr(page_nr, d)) )
480 return 0;
482 if ( unlikely(!get_page_type(page, type)) )
483 {
484 put_page(page);
485 return 0;
486 }
488 return 1;
489 }
491 #ifndef CONFIG_X86_PAE /* We do not support guest linear mappings on PAE. */
492 /*
493 * We allow root tables to map each other (a.k.a. linear page tables). It
494 * needs some special care with reference counts and access permissions:
495 * 1. The mapping entry must be read-only, or the guest may get write access
496 * to its own PTEs.
497 * 2. We must only bump the reference counts for an *already validated*
498 * L2 table, or we can end up in a deadlock in get_page_type() by waiting
499 * on a validation that is required to complete that validation.
500 * 3. We only need to increment the reference counts for the mapped page
501 * frame if it is mapped by a different root table. This is sufficient and
502 * also necessary to allow validation of a root table mapping itself.
503 */
504 static int
505 get_linear_pagetable(
506 root_pgentry_t re, unsigned long re_pfn, struct domain *d)
507 {
508 unsigned long x, y;
509 struct page_info *page;
510 unsigned long pfn;
512 ASSERT( !shadow_mode_refcounts(d) );
514 if ( (root_get_flags(re) & _PAGE_RW) )
515 {
516 MEM_LOG("Attempt to create linear p.t. with write perms");
517 return 0;
518 }
520 if ( (pfn = root_get_pfn(re)) != re_pfn )
521 {
522 /* Make sure the mapped frame belongs to the correct domain. */
523 if ( unlikely(!get_page_from_pagenr(pfn, d)) )
524 return 0;
526 /*
527 * Make sure that the mapped frame is an already-validated L2 table.
528 * If so, atomically increment the count (checking for overflow).
529 */
530 page = mfn_to_page(pfn);
531 y = page->u.inuse.type_info;
532 do {
533 x = y;
534 if ( unlikely((x & PGT_count_mask) == PGT_count_mask) ||
535 unlikely((x & (PGT_type_mask|PGT_validated)) !=
536 (PGT_root_page_table|PGT_validated)) )
537 {
538 put_page(page);
539 return 0;
540 }
541 }
542 while ( (y = cmpxchg(&page->u.inuse.type_info, x, x + 1)) != x );
543 }
545 return 1;
546 }
547 #endif /* !CONFIG_X86_PAE */
549 int
550 get_page_from_l1e(
551 l1_pgentry_t l1e, struct domain *d)
552 {
553 unsigned long mfn = l1e_get_pfn(l1e);
554 struct page_info *page = mfn_to_page(mfn);
555 int okay;
557 if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) )
558 return 1;
560 if ( unlikely(l1e_get_flags(l1e) & L1_DISALLOW_MASK) )
561 {
562 MEM_LOG("Bad L1 flags %x", l1e_get_flags(l1e) & L1_DISALLOW_MASK);
563 return 0;
564 }
566 if ( unlikely(!mfn_valid(mfn)) ||
567 unlikely(page_get_owner(page) == dom_io) )
568 {
569 /* DOMID_IO reverts to caller for privilege checks. */
570 if ( d == dom_io )
571 d = current->domain;
573 if ( !iomem_access_permitted(d, mfn, mfn) )
574 {
575 if ( mfn != (PADDR_MASK >> PAGE_SHIFT) ) /* INVALID_MFN? */
576 MEM_LOG("Non-privileged (%u) attempt to map I/O space %08lx",
577 d->domain_id, mfn);
578 return 0;
579 }
581 /* No reference counting for out-of-range I/O pages. */
582 if ( !mfn_valid(mfn) )
583 return 1;
585 d = dom_io;
586 }
588 /* Foreign mappings into guests in shadow external mode don't
589 * contribute to writeable mapping refcounts. (This allows the
590 * qemu-dm helper process in dom0 to map the domain's memory without
591 * messing up the count of "real" writable mappings.) */
592 okay = (((l1e_get_flags(l1e) & _PAGE_RW) &&
593 !(unlikely(shadow_mode_external(d) && (d != current->domain))))
594 ? get_page_and_type(page, d, PGT_writable_page)
595 : get_page(page, d));
596 if ( !okay )
597 {
598 MEM_LOG("Error getting mfn %lx (pfn %lx) from L1 entry %" PRIpte
599 " for dom%d",
600 mfn, get_gpfn_from_mfn(mfn),
601 l1e_get_intpte(l1e), d->domain_id);
602 }
604 return okay;
605 }
608 /* NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. */
609 static int
610 get_page_from_l2e(
611 l2_pgentry_t l2e, unsigned long pfn, struct domain *d)
612 {
613 int rc;
615 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
616 return 1;
618 if ( unlikely((l2e_get_flags(l2e) & L2_DISALLOW_MASK)) )
619 {
620 MEM_LOG("Bad L2 flags %x", l2e_get_flags(l2e) & L2_DISALLOW_MASK);
621 return 0;
622 }
624 rc = get_page_and_type_from_pagenr(l2e_get_pfn(l2e), PGT_l1_page_table, d);
625 #if CONFIG_PAGING_LEVELS == 2
626 if ( unlikely(!rc) )
627 rc = get_linear_pagetable(l2e, pfn, d);
628 #endif
629 return rc;
630 }
633 #if CONFIG_PAGING_LEVELS >= 3
634 static int
635 get_page_from_l3e(
636 l3_pgentry_t l3e, unsigned long pfn, struct domain *d)
637 {
638 int rc;
640 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
641 return 1;
643 if ( unlikely((l3e_get_flags(l3e) & L3_DISALLOW_MASK)) )
644 {
645 MEM_LOG("Bad L3 flags %x", l3e_get_flags(l3e) & L3_DISALLOW_MASK);
646 return 0;
647 }
649 rc = get_page_and_type_from_pagenr(l3e_get_pfn(l3e), PGT_l2_page_table, d);
650 return rc;
651 }
652 #endif /* 3 level */
654 #if CONFIG_PAGING_LEVELS >= 4
655 static int
656 get_page_from_l4e(
657 l4_pgentry_t l4e, unsigned long pfn, struct domain *d)
658 {
659 int rc;
661 if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
662 return 1;
664 if ( unlikely((l4e_get_flags(l4e) & L4_DISALLOW_MASK)) )
665 {
666 MEM_LOG("Bad L4 flags %x", l4e_get_flags(l4e) & L4_DISALLOW_MASK);
667 return 0;
668 }
670 rc = get_page_and_type_from_pagenr(l4e_get_pfn(l4e), PGT_l3_page_table, d);
672 if ( unlikely(!rc) )
673 rc = get_linear_pagetable(l4e, pfn, d);
675 return rc;
676 }
677 #endif /* 4 level */
679 #ifdef __x86_64__
681 #ifdef USER_MAPPINGS_ARE_GLOBAL
682 #define adjust_guest_l1e(pl1e) \
683 do { \
684 if ( likely(l1e_get_flags((pl1e)) & _PAGE_PRESENT) ) \
685 { \
686 /* _PAGE_GUEST_KERNEL page cannot have the Global bit set. */ \
687 if ( (l1e_get_flags((pl1e)) & (_PAGE_GUEST_KERNEL|_PAGE_GLOBAL)) \
688 == (_PAGE_GUEST_KERNEL|_PAGE_GLOBAL) ) \
689 MEM_LOG("Global bit is set to kernel page %lx", \
690 l1e_get_pfn((pl1e))); \
691 if ( !(l1e_get_flags((pl1e)) & _PAGE_USER) ) \
692 l1e_add_flags((pl1e), (_PAGE_GUEST_KERNEL|_PAGE_USER)); \
693 if ( !(l1e_get_flags((pl1e)) & _PAGE_GUEST_KERNEL) ) \
694 l1e_add_flags((pl1e), (_PAGE_GLOBAL|_PAGE_USER)); \
695 } \
696 } while ( 0 )
697 #else
698 #define adjust_guest_l1e(pl1e) \
699 do { \
700 if ( likely(l1e_get_flags((pl1e)) & _PAGE_PRESENT) ) \
701 l1e_add_flags((pl1e), _PAGE_USER); \
702 } while ( 0 )
703 #endif
705 #define adjust_guest_l2e(pl2e) \
706 do { \
707 if ( likely(l2e_get_flags((pl2e)) & _PAGE_PRESENT) ) \
708 l2e_add_flags((pl2e), _PAGE_USER); \
709 } while ( 0 )
711 #define adjust_guest_l3e(pl3e) \
712 do { \
713 if ( likely(l3e_get_flags((pl3e)) & _PAGE_PRESENT) ) \
714 l3e_add_flags((pl3e), _PAGE_USER); \
715 } while ( 0 )
717 #define adjust_guest_l4e(pl4e) \
718 do { \
719 if ( likely(l4e_get_flags((pl4e)) & _PAGE_PRESENT) ) \
720 l4e_add_flags((pl4e), _PAGE_USER); \
721 } while ( 0 )
723 #else /* !defined(__x86_64__) */
725 #define adjust_guest_l1e(_p) ((void)0)
726 #define adjust_guest_l2e(_p) ((void)0)
727 #define adjust_guest_l3e(_p) ((void)0)
729 #endif
731 void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d)
732 {
733 unsigned long pfn = l1e_get_pfn(l1e);
734 struct page_info *page = mfn_to_page(pfn);
735 struct domain *e;
736 struct vcpu *v;
738 if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) || !mfn_valid(pfn) )
739 return;
741 e = page_get_owner(page);
743 /*
744 * Check if this is a mapping that was established via a grant reference.
745 * If it was then we should not be here: we require that such mappings are
746 * explicitly destroyed via the grant-table interface.
747 *
748 * The upshot of this is that the guest can end up with active grants that
749 * it cannot destroy (because it no longer has a PTE to present to the
750 * grant-table interface). This can lead to subtle hard-to-catch bugs,
751 * hence a special grant PTE flag can be enabled to catch the bug early.
752 *
753 * (Note that the undestroyable active grants are not a security hole in
754 * Xen. All active grants can safely be cleaned up when the domain dies.)
755 */
756 if ( (l1e_get_flags(l1e) & _PAGE_GNTTAB) &&
757 !(d->domain_flags & (DOMF_shutdown|DOMF_dying)) )
758 {
759 MEM_LOG("Attempt to implicitly unmap a granted PTE %" PRIpte,
760 l1e_get_intpte(l1e));
761 domain_crash(d);
762 }
764 /* Remember we didn't take a type-count of foreign writable mappings
765 * to shadow external domains */
766 if ( (l1e_get_flags(l1e) & _PAGE_RW) &&
767 !(unlikely((e != d) && shadow_mode_external(e))) )
768 {
769 put_page_and_type(page);
770 }
771 else
772 {
773 /* We expect this is rare so we blow the entire shadow LDT. */
774 if ( unlikely(((page->u.inuse.type_info & PGT_type_mask) ==
775 PGT_ldt_page)) &&
776 unlikely(((page->u.inuse.type_info & PGT_count_mask) != 0)) &&
777 (d == e) )
778 {
779 for_each_vcpu ( d, v )
780 invalidate_shadow_ldt(v);
781 }
782 put_page(page);
783 }
784 }
787 /*
788 * NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'.
789 * Note also that this automatically deals correctly with linear p.t.'s.
790 */
791 static void put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
792 {
793 if ( (l2e_get_flags(l2e) & _PAGE_PRESENT) &&
794 (l2e_get_pfn(l2e) != pfn) )
795 put_page_and_type(mfn_to_page(l2e_get_pfn(l2e)));
796 }
799 #if CONFIG_PAGING_LEVELS >= 3
800 static void put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn)
801 {
802 if ( (l3e_get_flags(l3e) & _PAGE_PRESENT) &&
803 (l3e_get_pfn(l3e) != pfn) )
804 put_page_and_type(mfn_to_page(l3e_get_pfn(l3e)));
805 }
806 #endif
808 #if CONFIG_PAGING_LEVELS >= 4
809 static void put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn)
810 {
811 if ( (l4e_get_flags(l4e) & _PAGE_PRESENT) &&
812 (l4e_get_pfn(l4e) != pfn) )
813 put_page_and_type(mfn_to_page(l4e_get_pfn(l4e)));
814 }
815 #endif
817 static int alloc_l1_table(struct page_info *page)
818 {
819 struct domain *d = page_get_owner(page);
820 unsigned long pfn = page_to_mfn(page);
821 l1_pgentry_t *pl1e;
822 int i;
824 ASSERT(!shadow_mode_refcounts(d));
826 pl1e = map_domain_page(pfn);
828 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
829 {
830 if ( is_guest_l1_slot(i) &&
831 unlikely(!get_page_from_l1e(pl1e[i], d)) )
832 goto fail;
834 adjust_guest_l1e(pl1e[i]);
835 }
837 unmap_domain_page(pl1e);
838 return 1;
840 fail:
841 MEM_LOG("Failure in alloc_l1_table: entry %d", i);
842 while ( i-- > 0 )
843 if ( is_guest_l1_slot(i) )
844 put_page_from_l1e(pl1e[i], d);
846 unmap_domain_page(pl1e);
847 return 0;
848 }
850 #ifdef CONFIG_X86_PAE
851 static int create_pae_xen_mappings(l3_pgentry_t *pl3e)
852 {
853 struct page_info *page;
854 l2_pgentry_t *pl2e, l2e;
855 l3_pgentry_t l3e3;
856 int i;
858 pl3e = (l3_pgentry_t *)((unsigned long)pl3e & PAGE_MASK);
860 /* 3rd L3 slot contains L2 with Xen-private mappings. It *must* exist. */
861 l3e3 = pl3e[3];
862 if ( !(l3e_get_flags(l3e3) & _PAGE_PRESENT) )
863 {
864 MEM_LOG("PAE L3 3rd slot is empty");
865 return 0;
866 }
868 /*
869 * The Xen-private mappings include linear mappings. The L2 thus cannot
870 * be shared by multiple L3 tables. The test here is adequate because:
871 * 1. Cannot appear in slots != 3 because get_page_type() checks the
872 * PGT_pae_xen_l2 flag, which is asserted iff the L2 appears in slot 3
873 * 2. Cannot appear in another page table's L3:
874 * a. alloc_l3_table() calls this function and this check will fail
875 * b. mod_l3_entry() disallows updates to slot 3 in an existing table
876 *
877 * XXX -- this needs revisiting for shadow_mode_refcount()==true...
878 */
879 page = l3e_get_page(l3e3);
880 BUG_ON(page->u.inuse.type_info & PGT_pinned);
881 BUG_ON((page->u.inuse.type_info & PGT_count_mask) == 0);
882 BUG_ON(!(page->u.inuse.type_info & PGT_pae_xen_l2));
883 if ( (page->u.inuse.type_info & PGT_count_mask) != 1 )
884 {
885 MEM_LOG("PAE L3 3rd slot is shared");
886 return 0;
887 }
889 /* Xen private mappings. */
890 pl2e = map_domain_page(l3e_get_pfn(l3e3));
891 memcpy(&pl2e[L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1)],
892 &idle_pg_table_l2[L2_PAGETABLE_FIRST_XEN_SLOT],
893 L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
894 for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
895 {
896 l2e = l2e_from_page(
897 virt_to_page(page_get_owner(page)->arch.mm_perdomain_pt) + i,
898 __PAGE_HYPERVISOR);
899 l2e_write(&pl2e[l2_table_offset(PERDOMAIN_VIRT_START) + i], l2e);
900 }
901 for ( i = 0; i < (LINEARPT_MBYTES >> (L2_PAGETABLE_SHIFT - 20)); i++ )
902 {
903 l2e = l2e_empty();
904 if ( l3e_get_flags(pl3e[i]) & _PAGE_PRESENT )
905 l2e = l2e_from_pfn(l3e_get_pfn(pl3e[i]), __PAGE_HYPERVISOR);
906 l2e_write(&pl2e[l2_table_offset(LINEAR_PT_VIRT_START) + i], l2e);
907 }
908 unmap_domain_page(pl2e);
910 return 1;
911 }
913 /* Flush a pgdir update into low-memory caches. */
914 static void pae_flush_pgd(
915 unsigned long mfn, unsigned int idx, l3_pgentry_t nl3e)
916 {
917 struct domain *d = page_get_owner(mfn_to_page(mfn));
918 struct vcpu *v;
919 intpte_t _ol3e, _nl3e, _pl3e;
920 l3_pgentry_t *l3tab_ptr;
921 struct pae_l3_cache *cache;
923 /* If below 4GB then the pgdir is not shadowed in low memory. */
924 if ( !l3tab_needs_shadow(mfn) )
925 return;
927 for_each_vcpu ( d, v )
928 {
929 cache = &v->arch.pae_l3_cache;
931 spin_lock(&cache->lock);
933 if ( cache->high_mfn == mfn )
934 {
935 l3tab_ptr = &cache->table[cache->inuse_idx][idx];
936 _ol3e = l3e_get_intpte(*l3tab_ptr);
937 _nl3e = l3e_get_intpte(nl3e);
938 _pl3e = cmpxchg((intpte_t *)l3tab_ptr, _ol3e, _nl3e);
939 BUG_ON(_pl3e != _ol3e);
940 }
942 spin_unlock(&cache->lock);
943 }
945 flush_tlb_mask(d->domain_dirty_cpumask);
946 }
948 #elif CONFIG_X86_64
949 # define create_pae_xen_mappings(pl3e) (1)
950 # define pae_flush_pgd(mfn, idx, nl3e) ((void)0)
951 #else
952 # define create_pae_xen_mappings(pl3e) (1)
953 #endif
955 static int alloc_l2_table(struct page_info *page, unsigned long type)
956 {
957 struct domain *d = page_get_owner(page);
958 unsigned long pfn = page_to_mfn(page);
959 l2_pgentry_t *pl2e;
960 int i;
962 ASSERT(!shadow_mode_refcounts(d));
964 pl2e = map_domain_page(pfn);
966 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
967 {
968 if ( is_guest_l2_slot(type, i) &&
969 unlikely(!get_page_from_l2e(pl2e[i], pfn, d)) )
970 goto fail;
972 adjust_guest_l2e(pl2e[i]);
973 }
975 #if CONFIG_PAGING_LEVELS == 2
976 /* Xen private mappings. */
977 memcpy(&pl2e[L2_PAGETABLE_FIRST_XEN_SLOT],
978 &idle_pg_table[L2_PAGETABLE_FIRST_XEN_SLOT],
979 L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
980 pl2e[l2_table_offset(LINEAR_PT_VIRT_START)] =
981 l2e_from_pfn(pfn, __PAGE_HYPERVISOR);
982 for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
983 pl2e[l2_table_offset(PERDOMAIN_VIRT_START) + i] =
984 l2e_from_page(
985 virt_to_page(page_get_owner(page)->arch.mm_perdomain_pt) + i,
986 __PAGE_HYPERVISOR);
987 #endif
989 unmap_domain_page(pl2e);
990 return 1;
992 fail:
993 MEM_LOG("Failure in alloc_l2_table: entry %d", i);
994 while ( i-- > 0 )
995 if ( is_guest_l2_slot(type, i) )
996 put_page_from_l2e(pl2e[i], pfn);
998 unmap_domain_page(pl2e);
999 return 0;
1003 #if CONFIG_PAGING_LEVELS >= 3
1004 static int alloc_l3_table(struct page_info *page)
1006 struct domain *d = page_get_owner(page);
1007 unsigned long pfn = page_to_mfn(page);
1008 l3_pgentry_t *pl3e;
1009 int i;
1011 ASSERT(!shadow_mode_refcounts(d));
1013 #ifdef CONFIG_X86_PAE
1014 /*
1015 * PAE pgdirs above 4GB are unacceptable if the guest does not understand
1016 * the weird 'extended cr3' format for dealing with high-order address
1017 * bits. We cut some slack for control tools (before vcpu0 is initialised).
1018 */
1019 if ( (pfn >= 0x100000) &&
1020 unlikely(!VM_ASSIST(d, VMASST_TYPE_pae_extended_cr3)) &&
1021 d->vcpu[0] && test_bit(_VCPUF_initialised, &d->vcpu[0]->vcpu_flags) )
1023 MEM_LOG("PAE pgd must be below 4GB (0x%lx >= 0x100000)", pfn);
1024 return 0;
1026 #endif
1028 pl3e = map_domain_page(pfn);
1029 for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
1031 #ifdef CONFIG_X86_PAE
1032 if ( i == 3 )
1034 if ( !(l3e_get_flags(pl3e[i]) & _PAGE_PRESENT) ||
1035 (l3e_get_flags(pl3e[i]) & L3_DISALLOW_MASK) ||
1036 !get_page_and_type_from_pagenr(l3e_get_pfn(pl3e[i]),
1037 PGT_l2_page_table |
1038 PGT_pae_xen_l2,
1039 d) )
1040 goto fail;
1042 else
1043 #endif
1044 if ( is_guest_l3_slot(i) &&
1045 unlikely(!get_page_from_l3e(pl3e[i], pfn, d)) )
1046 goto fail;
1048 adjust_guest_l3e(pl3e[i]);
1051 if ( !create_pae_xen_mappings(pl3e) )
1052 goto fail;
1054 unmap_domain_page(pl3e);
1055 return 1;
1057 fail:
1058 MEM_LOG("Failure in alloc_l3_table: entry %d", i);
1059 while ( i-- > 0 )
1060 if ( is_guest_l3_slot(i) )
1061 put_page_from_l3e(pl3e[i], pfn);
1063 unmap_domain_page(pl3e);
1064 return 0;
1066 #else
1067 #define alloc_l3_table(page) (0)
1068 #endif
1070 #if CONFIG_PAGING_LEVELS >= 4
1071 static int alloc_l4_table(struct page_info *page)
1073 struct domain *d = page_get_owner(page);
1074 unsigned long pfn = page_to_mfn(page);
1075 l4_pgentry_t *pl4e = page_to_virt(page);
1076 int i;
1078 ASSERT(!shadow_mode_refcounts(d));
1080 for ( i = 0; i < L4_PAGETABLE_ENTRIES; i++ )
1082 if ( is_guest_l4_slot(i) &&
1083 unlikely(!get_page_from_l4e(pl4e[i], pfn, d)) )
1084 goto fail;
1086 adjust_guest_l4e(pl4e[i]);
1089 /* Xen private mappings. */
1090 memcpy(&pl4e[ROOT_PAGETABLE_FIRST_XEN_SLOT],
1091 &idle_pg_table[ROOT_PAGETABLE_FIRST_XEN_SLOT],
1092 ROOT_PAGETABLE_XEN_SLOTS * sizeof(l4_pgentry_t));
1093 pl4e[l4_table_offset(LINEAR_PT_VIRT_START)] =
1094 l4e_from_pfn(pfn, __PAGE_HYPERVISOR);
1095 pl4e[l4_table_offset(PERDOMAIN_VIRT_START)] =
1096 l4e_from_page(
1097 virt_to_page(page_get_owner(page)->arch.mm_perdomain_l3),
1098 __PAGE_HYPERVISOR);
1100 return 1;
1102 fail:
1103 MEM_LOG("Failure in alloc_l4_table: entry %d", i);
1104 while ( i-- > 0 )
1105 if ( is_guest_l4_slot(i) )
1106 put_page_from_l4e(pl4e[i], pfn);
1108 return 0;
1110 #else
1111 #define alloc_l4_table(page) (0)
1112 #endif
1115 static void free_l1_table(struct page_info *page)
1117 struct domain *d = page_get_owner(page);
1118 unsigned long pfn = page_to_mfn(page);
1119 l1_pgentry_t *pl1e;
1120 int i;
1122 pl1e = map_domain_page(pfn);
1124 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
1125 if ( is_guest_l1_slot(i) )
1126 put_page_from_l1e(pl1e[i], d);
1128 unmap_domain_page(pl1e);
1132 static void free_l2_table(struct page_info *page)
1134 unsigned long pfn = page_to_mfn(page);
1135 l2_pgentry_t *pl2e;
1136 int i;
1138 pl2e = map_domain_page(pfn);
1140 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
1141 if ( is_guest_l2_slot(page->u.inuse.type_info, i) )
1142 put_page_from_l2e(pl2e[i], pfn);
1144 unmap_domain_page(pl2e);
1146 page->u.inuse.type_info &= ~PGT_pae_xen_l2;
1150 #if CONFIG_PAGING_LEVELS >= 3
1152 static void free_l3_table(struct page_info *page)
1154 unsigned long pfn = page_to_mfn(page);
1155 l3_pgentry_t *pl3e;
1156 int i;
1158 pl3e = map_domain_page(pfn);
1160 for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
1161 if ( is_guest_l3_slot(i) )
1162 put_page_from_l3e(pl3e[i], pfn);
1164 unmap_domain_page(pl3e);
1167 #endif
1169 #if CONFIG_PAGING_LEVELS >= 4
1171 static void free_l4_table(struct page_info *page)
1173 unsigned long pfn = page_to_mfn(page);
1174 l4_pgentry_t *pl4e = page_to_virt(page);
1175 int i;
1177 for ( i = 0; i < L4_PAGETABLE_ENTRIES; i++ )
1178 if ( is_guest_l4_slot(i) )
1179 put_page_from_l4e(pl4e[i], pfn);
1182 #endif
1184 static inline int update_l1e(l1_pgentry_t *pl1e,
1185 l1_pgentry_t ol1e,
1186 l1_pgentry_t nl1e,
1187 unsigned long gl1mfn,
1188 struct vcpu *v)
1190 int rv = 1;
1191 if ( unlikely(shadow_mode_enabled(v->domain)) )
1192 shadow_lock(v->domain);
1193 #ifndef PTE_UPDATE_WITH_CMPXCHG
1194 rv = (!__copy_to_user(pl1e, &nl1e, sizeof(nl1e)));
1195 #else
1197 intpte_t o = l1e_get_intpte(ol1e);
1198 intpte_t n = l1e_get_intpte(nl1e);
1200 for ( ; ; )
1202 if ( unlikely(cmpxchg_user(pl1e, o, n) != 0) )
1204 MEM_LOG("Failed to update %" PRIpte " -> %" PRIpte
1205 ": saw %" PRIpte,
1206 l1e_get_intpte(ol1e),
1207 l1e_get_intpte(nl1e),
1208 o);
1209 rv = 0;
1210 break;
1213 if ( o == l1e_get_intpte(ol1e) )
1214 break;
1216 /* Allowed to change in Accessed/Dirty flags only. */
1217 BUG_ON((o ^ l1e_get_intpte(ol1e)) &
1218 ~(int)(_PAGE_ACCESSED|_PAGE_DIRTY));
1219 ol1e = l1e_from_intpte(o);
1222 #endif
1223 if ( unlikely(shadow_mode_enabled(v->domain)) && rv )
1225 shadow_validate_guest_entry(v, _mfn(gl1mfn), pl1e);
1226 shadow_unlock(v->domain);
1228 return rv;
1232 /* Update the L1 entry at pl1e to new value nl1e. */
1233 static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e,
1234 unsigned long gl1mfn)
1236 l1_pgentry_t ol1e;
1237 struct domain *d = current->domain;
1239 if ( unlikely(__copy_from_user(&ol1e, pl1e, sizeof(ol1e)) != 0) )
1240 return 0;
1242 if ( unlikely(shadow_mode_refcounts(d)) )
1243 return update_l1e(pl1e, ol1e, nl1e, gl1mfn, current);
1245 if ( l1e_get_flags(nl1e) & _PAGE_PRESENT )
1247 /* Translate foreign guest addresses. */
1248 nl1e = l1e_from_pfn(gmfn_to_mfn(FOREIGNDOM, l1e_get_pfn(nl1e)),
1249 l1e_get_flags(nl1e));
1251 if ( unlikely(l1e_get_flags(nl1e) & L1_DISALLOW_MASK) )
1253 MEM_LOG("Bad L1 flags %x",
1254 l1e_get_flags(nl1e) & L1_DISALLOW_MASK);
1255 return 0;
1258 adjust_guest_l1e(nl1e);
1260 /* Fast path for identical mapping, r/w and presence. */
1261 if ( !l1e_has_changed(ol1e, nl1e, _PAGE_RW | _PAGE_PRESENT) )
1262 return update_l1e(pl1e, ol1e, nl1e, gl1mfn, current);
1264 if ( unlikely(!get_page_from_l1e(nl1e, FOREIGNDOM)) )
1265 return 0;
1267 if ( unlikely(!update_l1e(pl1e, ol1e, nl1e, gl1mfn, current)) )
1269 put_page_from_l1e(nl1e, d);
1270 return 0;
1273 else
1275 if ( unlikely(!update_l1e(pl1e, ol1e, nl1e, gl1mfn, current)) )
1276 return 0;
1279 put_page_from_l1e(ol1e, d);
1280 return 1;
1283 #ifndef PTE_UPDATE_WITH_CMPXCHG
1284 #define _UPDATE_ENTRY(_t,_p,_o,_n) ({ (*(_p) = (_n)); 1; })
1285 #else
1286 #define _UPDATE_ENTRY(_t,_p,_o,_n) ({ \
1287 for ( ; ; ) \
1288 { \
1289 intpte_t __o = cmpxchg((intpte_t *)(_p), \
1290 _t ## e_get_intpte(_o), \
1291 _t ## e_get_intpte(_n)); \
1292 if ( __o == _t ## e_get_intpte(_o) ) \
1293 break; \
1294 /* Allowed to change in Accessed/Dirty flags only. */ \
1295 BUG_ON((__o ^ _t ## e_get_intpte(_o)) & \
1296 ~(int)(_PAGE_ACCESSED|_PAGE_DIRTY)); \
1297 _o = _t ## e_from_intpte(__o); \
1298 } \
1299 1; })
1300 #endif
1301 #define UPDATE_ENTRY(_t,_p,_o,_n,_m) ({ \
1302 int rv; \
1303 if ( unlikely(shadow_mode_enabled(current->domain)) ) \
1304 shadow_lock(current->domain); \
1305 rv = _UPDATE_ENTRY(_t, _p, _o, _n); \
1306 if ( unlikely(shadow_mode_enabled(current->domain)) ) \
1307 { \
1308 shadow_validate_guest_entry(current, _mfn(_m), (_p)); \
1309 shadow_unlock(current->domain); \
1310 } \
1311 rv; \
1312 })
1314 /* Update the L2 entry at pl2e to new value nl2e. pl2e is within frame pfn. */
1315 static int mod_l2_entry(l2_pgentry_t *pl2e,
1316 l2_pgentry_t nl2e,
1317 unsigned long pfn,
1318 unsigned long type)
1320 l2_pgentry_t ol2e;
1322 if ( unlikely(!is_guest_l2_slot(type,pgentry_ptr_to_slot(pl2e))) )
1324 MEM_LOG("Illegal L2 update attempt in Xen-private area %p", pl2e);
1325 return 0;
1328 if ( unlikely(__copy_from_user(&ol2e, pl2e, sizeof(ol2e)) != 0) )
1329 return 0;
1331 if ( l2e_get_flags(nl2e) & _PAGE_PRESENT )
1333 if ( unlikely(l2e_get_flags(nl2e) & L2_DISALLOW_MASK) )
1335 MEM_LOG("Bad L2 flags %x",
1336 l2e_get_flags(nl2e) & L2_DISALLOW_MASK);
1337 return 0;
1340 adjust_guest_l2e(nl2e);
1342 /* Fast path for identical mapping and presence. */
1343 if ( !l2e_has_changed(ol2e, nl2e, _PAGE_PRESENT))
1344 return UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn);
1346 if ( unlikely(!get_page_from_l2e(nl2e, pfn, current->domain)) )
1347 return 0;
1349 if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn)) )
1351 put_page_from_l2e(nl2e, pfn);
1352 return 0;
1355 else if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn)) )
1357 return 0;
1360 put_page_from_l2e(ol2e, pfn);
1361 return 1;
1364 #if CONFIG_PAGING_LEVELS >= 3
1366 /* Update the L3 entry at pl3e to new value nl3e. pl3e is within frame pfn. */
1367 static int mod_l3_entry(l3_pgentry_t *pl3e,
1368 l3_pgentry_t nl3e,
1369 unsigned long pfn)
1371 l3_pgentry_t ol3e;
1372 int okay;
1374 if ( unlikely(!is_guest_l3_slot(pgentry_ptr_to_slot(pl3e))) )
1376 MEM_LOG("Illegal L3 update attempt in Xen-private area %p", pl3e);
1377 return 0;
1380 #ifdef CONFIG_X86_PAE
1381 /*
1382 * Disallow updates to final L3 slot. It contains Xen mappings, and it
1383 * would be a pain to ensure they remain continuously valid throughout.
1384 */
1385 if ( pgentry_ptr_to_slot(pl3e) >= 3 )
1386 return 0;
1387 #endif
1389 if ( unlikely(__copy_from_user(&ol3e, pl3e, sizeof(ol3e)) != 0) )
1390 return 0;
1392 if ( l3e_get_flags(nl3e) & _PAGE_PRESENT )
1394 if ( unlikely(l3e_get_flags(nl3e) & L3_DISALLOW_MASK) )
1396 MEM_LOG("Bad L3 flags %x",
1397 l3e_get_flags(nl3e) & L3_DISALLOW_MASK);
1398 return 0;
1401 adjust_guest_l3e(nl3e);
1403 /* Fast path for identical mapping and presence. */
1404 if (!l3e_has_changed(ol3e, nl3e, _PAGE_PRESENT))
1405 return UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn);
1407 if ( unlikely(!get_page_from_l3e(nl3e, pfn, current->domain)) )
1408 return 0;
1410 if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn)) )
1412 put_page_from_l3e(nl3e, pfn);
1413 return 0;
1416 else if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn)) )
1418 return 0;
1421 okay = create_pae_xen_mappings(pl3e);
1422 BUG_ON(!okay);
1424 pae_flush_pgd(pfn, pgentry_ptr_to_slot(pl3e), nl3e);
1426 put_page_from_l3e(ol3e, pfn);
1427 return 1;
1430 #endif
1432 #if CONFIG_PAGING_LEVELS >= 4
1434 /* Update the L4 entry at pl4e to new value nl4e. pl4e is within frame pfn. */
1435 static int mod_l4_entry(l4_pgentry_t *pl4e,
1436 l4_pgentry_t nl4e,
1437 unsigned long pfn)
1439 l4_pgentry_t ol4e;
1441 if ( unlikely(!is_guest_l4_slot(pgentry_ptr_to_slot(pl4e))) )
1443 MEM_LOG("Illegal L4 update attempt in Xen-private area %p", pl4e);
1444 return 0;
1447 if ( unlikely(__copy_from_user(&ol4e, pl4e, sizeof(ol4e)) != 0) )
1448 return 0;
1450 if ( l4e_get_flags(nl4e) & _PAGE_PRESENT )
1452 if ( unlikely(l4e_get_flags(nl4e) & L4_DISALLOW_MASK) )
1454 MEM_LOG("Bad L4 flags %x",
1455 l4e_get_flags(nl4e) & L4_DISALLOW_MASK);
1456 return 0;
1459 adjust_guest_l4e(nl4e);
1461 /* Fast path for identical mapping and presence. */
1462 if (!l4e_has_changed(ol4e, nl4e, _PAGE_PRESENT))
1463 return UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn);
1465 if ( unlikely(!get_page_from_l4e(nl4e, pfn, current->domain)) )
1466 return 0;
1468 if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn)) )
1470 put_page_from_l4e(nl4e, pfn);
1471 return 0;
1474 else if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn)) )
1476 return 0;
1479 put_page_from_l4e(ol4e, pfn);
1480 return 1;
1483 #endif
1485 int alloc_page_type(struct page_info *page, unsigned long type)
1487 struct domain *owner = page_get_owner(page);
1489 /* A page table is dirtied when its type count becomes non-zero. */
1490 if ( likely(owner != NULL) )
1491 mark_dirty(owner, page_to_mfn(page));
1493 switch ( type & PGT_type_mask )
1495 case PGT_l1_page_table:
1496 return alloc_l1_table(page);
1497 case PGT_l2_page_table:
1498 return alloc_l2_table(page, type);
1499 case PGT_l3_page_table:
1500 return alloc_l3_table(page);
1501 case PGT_l4_page_table:
1502 return alloc_l4_table(page);
1503 case PGT_gdt_page:
1504 case PGT_ldt_page:
1505 return alloc_segdesc_page(page);
1506 default:
1507 printk("Bad type in alloc_page_type %lx t=%" PRtype_info " c=%x\n",
1508 type, page->u.inuse.type_info,
1509 page->count_info);
1510 BUG();
1513 return 0;
1517 void free_page_type(struct page_info *page, unsigned long type)
1519 struct domain *owner = page_get_owner(page);
1520 unsigned long gmfn;
1522 if ( likely(owner != NULL) )
1524 /*
1525 * We have to flush before the next use of the linear mapping
1526 * (e.g., update_va_mapping()) or we could end up modifying a page
1527 * that is no longer a page table (and hence screw up ref counts).
1528 */
1529 this_cpu(percpu_mm_info).deferred_ops |= DOP_FLUSH_ALL_TLBS;
1531 if ( unlikely(shadow_mode_enabled(owner)) )
1533 /* A page table is dirtied when its type count becomes zero. */
1534 mark_dirty(owner, page_to_mfn(page));
1536 if ( shadow_mode_refcounts(owner) )
1537 return;
1539 gmfn = mfn_to_gmfn(owner, page_to_mfn(page));
1540 ASSERT(VALID_M2P(gmfn));
1541 shadow_remove_all_shadows(owner->vcpu[0], _mfn(gmfn));
1545 switch ( type & PGT_type_mask )
1547 case PGT_l1_page_table:
1548 free_l1_table(page);
1549 break;
1551 case PGT_l2_page_table:
1552 free_l2_table(page);
1553 break;
1555 #if CONFIG_PAGING_LEVELS >= 3
1556 case PGT_l3_page_table:
1557 free_l3_table(page);
1558 break;
1559 #endif
1561 #if CONFIG_PAGING_LEVELS >= 4
1562 case PGT_l4_page_table:
1563 free_l4_table(page);
1564 break;
1565 #endif
1567 default:
1568 printk("%s: type %lx pfn %lx\n",__FUNCTION__,
1569 type, page_to_mfn(page));
1570 BUG();
1575 void put_page_type(struct page_info *page)
1577 unsigned long nx, x, y = page->u.inuse.type_info;
1579 again:
1580 do {
1581 x = y;
1582 nx = x - 1;
1584 ASSERT((x & PGT_count_mask) != 0);
1586 if ( unlikely((nx & PGT_count_mask) == 0) )
1588 if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
1589 likely(nx & PGT_validated) )
1591 /*
1592 * Page-table pages must be unvalidated when count is zero. The
1593 * 'free' is safe because the refcnt is non-zero and validated
1594 * bit is clear => other ops will spin or fail.
1595 */
1596 if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x,
1597 x & ~PGT_validated)) != x) )
1598 goto again;
1599 /* We cleared the 'valid bit' so we do the clean up. */
1600 free_page_type(page, x);
1601 /* Carry on, but with the 'valid bit' now clear. */
1602 x &= ~PGT_validated;
1603 nx &= ~PGT_validated;
1606 /*
1607 * Record TLB information for flush later. We do not stamp page
1608 * tables when running in shadow mode:
1609 * 1. Pointless, since it's the shadow pt's which must be tracked.
1610 * 2. Shadow mode reuses this field for shadowed page tables to
1611 * store flags info -- we don't want to conflict with that.
1612 */
1613 if ( !(shadow_mode_enabled(page_get_owner(page)) &&
1614 (page->count_info & PGC_page_table)) )
1615 page->tlbflush_timestamp = tlbflush_current_time();
1618 while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
1622 int get_page_type(struct page_info *page, unsigned long type)
1624 unsigned long nx, x, y = page->u.inuse.type_info;
1626 ASSERT(!(type & ~(PGT_type_mask | PGT_pae_xen_l2)));
1628 again:
1629 do {
1630 x = y;
1631 nx = x + 1;
1632 if ( unlikely((nx & PGT_count_mask) == 0) )
1634 MEM_LOG("Type count overflow on pfn %lx", page_to_mfn(page));
1635 return 0;
1637 else if ( unlikely((x & PGT_count_mask) == 0) )
1639 struct domain *d = page_get_owner(page);
1641 /* Never allow a shadowed frame to go from type count 0 to 1 */
1642 if ( d && shadow_mode_enabled(d) )
1643 shadow_remove_all_shadows(d->vcpu[0], _mfn(page_to_mfn(page)));
1645 ASSERT(!(x & PGT_pae_xen_l2));
1646 if ( (x & PGT_type_mask) != type )
1648 /*
1649 * On type change we check to flush stale TLB entries. This
1650 * may be unnecessary (e.g., page was GDT/LDT) but those
1651 * circumstances should be very rare.
1652 */
1653 cpumask_t mask = d->domain_dirty_cpumask;
1655 /* Don't flush if the timestamp is old enough */
1656 tlbflush_filter(mask, page->tlbflush_timestamp);
1658 if ( unlikely(!cpus_empty(mask)) &&
1659 /* Shadow mode: track only writable pages. */
1660 (!shadow_mode_enabled(page_get_owner(page)) ||
1661 ((nx & PGT_type_mask) == PGT_writable_page)) )
1663 perfc_incrc(need_flush_tlb_flush);
1664 flush_tlb_mask(mask);
1667 /* We lose existing type, back pointer, and validity. */
1668 nx &= ~(PGT_type_mask | PGT_validated);
1669 nx |= type;
1671 /* No special validation needed for writable pages. */
1672 /* Page tables and GDT/LDT need to be scanned for validity. */
1673 if ( type == PGT_writable_page )
1674 nx |= PGT_validated;
1677 else if ( unlikely((x & (PGT_type_mask|PGT_pae_xen_l2)) != type) )
1679 if ( ((x & PGT_type_mask) != PGT_l2_page_table) ||
1680 (type != PGT_l1_page_table) )
1681 MEM_LOG("Bad type (saw %" PRtype_info
1682 " != exp %" PRtype_info ") "
1683 "for mfn %lx (pfn %lx)",
1684 x, type, page_to_mfn(page),
1685 get_gpfn_from_mfn(page_to_mfn(page)));
1686 return 0;
1688 else if ( unlikely(!(x & PGT_validated)) )
1690 /* Someone else is updating validation of this page. Wait... */
1691 while ( (y = page->u.inuse.type_info) == x )
1692 cpu_relax();
1693 goto again;
1696 while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
1698 if ( unlikely(!(nx & PGT_validated)) )
1700 /* Try to validate page type; drop the new reference on failure. */
1701 if ( unlikely(!alloc_page_type(page, type)) )
1703 MEM_LOG("Error while validating mfn %lx (pfn %lx) for type %"
1704 PRtype_info ": caf=%08x taf=%" PRtype_info,
1705 page_to_mfn(page), get_gpfn_from_mfn(page_to_mfn(page)),
1706 type, page->count_info, page->u.inuse.type_info);
1707 /* Noone else can get a reference. We hold the only ref. */
1708 page->u.inuse.type_info = 0;
1709 return 0;
1712 /* Noone else is updating simultaneously. */
1713 __set_bit(_PGT_validated, &page->u.inuse.type_info);
1716 return 1;
1720 int new_guest_cr3(unsigned long mfn)
1722 struct vcpu *v = current;
1723 struct domain *d = v->domain;
1724 int okay;
1725 unsigned long old_base_mfn;
1727 if ( is_hvm_domain(d) && !hvm_paging_enabled(v) )
1728 return 0;
1730 if ( shadow_mode_refcounts(d) )
1732 okay = get_page_from_pagenr(mfn, d);
1733 if ( unlikely(!okay) )
1735 MEM_LOG("Error while installing new baseptr %lx", mfn);
1736 return 0;
1739 else
1741 okay = get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d);
1742 if ( unlikely(!okay) )
1744 /* Switch to idle pagetable: this VCPU has no active p.t. now. */
1745 MEM_LOG("New baseptr %lx: slow path via idle pagetables", mfn);
1746 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
1747 v->arch.guest_table = pagetable_null();
1748 update_cr3(v);
1749 write_cr3(__pa(idle_pg_table));
1750 if ( old_base_mfn != 0 )
1751 put_page_and_type(mfn_to_page(old_base_mfn));
1753 /* Retry the validation with no active p.t. for this VCPU. */
1754 okay = get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d);
1755 if ( !okay )
1757 /* Failure here is unrecoverable: the VCPU has no pagetable! */
1758 MEM_LOG("Fatal error while installing new baseptr %lx", mfn);
1759 domain_crash(d);
1760 ASSERT(v->processor == smp_processor_id());
1761 this_cpu(percpu_mm_info).deferred_ops = 0;
1762 return 0;
1767 invalidate_shadow_ldt(v);
1769 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
1771 v->arch.guest_table = pagetable_from_pfn(mfn);
1772 update_cr3(v); /* update shadow_table and cr3 fields of vcpu struct */
1774 write_ptbase(v);
1776 if ( likely(old_base_mfn != 0) )
1778 if ( shadow_mode_refcounts(d) )
1779 put_page(mfn_to_page(old_base_mfn));
1780 else
1781 put_page_and_type(mfn_to_page(old_base_mfn));
1784 return 1;
1787 static void process_deferred_ops(void)
1789 unsigned int deferred_ops;
1790 struct domain *d = current->domain;
1791 struct percpu_mm_info *info = &this_cpu(percpu_mm_info);
1793 deferred_ops = info->deferred_ops;
1794 info->deferred_ops = 0;
1796 if ( deferred_ops & (DOP_FLUSH_ALL_TLBS|DOP_FLUSH_TLB) )
1798 if ( deferred_ops & DOP_FLUSH_ALL_TLBS )
1799 flush_tlb_mask(d->domain_dirty_cpumask);
1800 else
1801 local_flush_tlb();
1804 if ( deferred_ops & DOP_RELOAD_LDT )
1805 (void)map_ldt_shadow_page(0);
1807 if ( unlikely(info->foreign != NULL) )
1809 put_domain(info->foreign);
1810 info->foreign = NULL;
1814 static int set_foreigndom(domid_t domid)
1816 struct domain *e, *d = current->domain;
1817 struct percpu_mm_info *info = &this_cpu(percpu_mm_info);
1818 int okay = 1;
1820 ASSERT(info->foreign == NULL);
1822 if ( likely(domid == DOMID_SELF) )
1823 goto out;
1825 if ( unlikely(domid == d->domain_id) )
1827 MEM_LOG("Dom %u tried to specify itself as foreign domain",
1828 d->domain_id);
1829 okay = 0;
1831 else if ( unlikely(shadow_mode_translate(d)) )
1833 MEM_LOG("Cannot mix foreign mappings with translated domains");
1834 okay = 0;
1836 else if ( !IS_PRIV(d) )
1838 switch ( domid )
1840 case DOMID_IO:
1841 get_knownalive_domain(dom_io);
1842 info->foreign = dom_io;
1843 break;
1844 default:
1845 MEM_LOG("Dom %u cannot set foreign dom", d->domain_id);
1846 okay = 0;
1847 break;
1850 else
1852 info->foreign = e = find_domain_by_id(domid);
1853 if ( e == NULL )
1855 switch ( domid )
1857 case DOMID_XEN:
1858 get_knownalive_domain(dom_xen);
1859 info->foreign = dom_xen;
1860 break;
1861 case DOMID_IO:
1862 get_knownalive_domain(dom_io);
1863 info->foreign = dom_io;
1864 break;
1865 default:
1866 MEM_LOG("Unknown domain '%u'", domid);
1867 okay = 0;
1868 break;
1873 out:
1874 return okay;
1877 static inline cpumask_t vcpumask_to_pcpumask(
1878 struct domain *d, unsigned long vmask)
1880 unsigned int vcpu_id;
1881 cpumask_t pmask = CPU_MASK_NONE;
1882 struct vcpu *v;
1884 while ( vmask != 0 )
1886 vcpu_id = find_first_set_bit(vmask);
1887 vmask &= ~(1UL << vcpu_id);
1888 if ( (vcpu_id < MAX_VIRT_CPUS) &&
1889 ((v = d->vcpu[vcpu_id]) != NULL) )
1890 cpus_or(pmask, pmask, v->vcpu_dirty_cpumask);
1893 return pmask;
1896 int do_mmuext_op(
1897 XEN_GUEST_HANDLE(mmuext_op_t) uops,
1898 unsigned int count,
1899 XEN_GUEST_HANDLE(uint) pdone,
1900 unsigned int foreigndom)
1902 struct mmuext_op op;
1903 int rc = 0, i = 0, okay;
1904 unsigned long mfn = 0, gmfn = 0, type;
1905 unsigned int done = 0;
1906 struct page_info *page;
1907 struct vcpu *v = current;
1908 struct domain *d = v->domain;
1910 LOCK_BIGLOCK(d);
1912 if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
1914 count &= ~MMU_UPDATE_PREEMPTED;
1915 if ( unlikely(!guest_handle_is_null(pdone)) )
1916 (void)copy_from_guest(&done, pdone, 1);
1919 if ( !set_foreigndom(foreigndom) )
1921 rc = -ESRCH;
1922 goto out;
1925 if ( unlikely(!guest_handle_okay(uops, count)) )
1927 rc = -EFAULT;
1928 goto out;
1931 for ( i = 0; i < count; i++ )
1933 if ( hypercall_preempt_check() )
1935 rc = hypercall_create_continuation(
1936 __HYPERVISOR_mmuext_op, "hihi",
1937 uops, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
1938 break;
1941 if ( unlikely(__copy_from_guest(&op, uops, 1) != 0) )
1943 MEM_LOG("Bad __copy_from_guest");
1944 rc = -EFAULT;
1945 break;
1948 okay = 1;
1949 gmfn = op.arg1.mfn;
1950 mfn = gmfn_to_mfn(FOREIGNDOM, gmfn);
1951 page = mfn_to_page(mfn);
1953 switch ( op.cmd )
1955 case MMUEXT_PIN_L1_TABLE:
1956 type = PGT_l1_page_table;
1957 goto pin_page;
1959 case MMUEXT_PIN_L2_TABLE:
1960 type = PGT_l2_page_table;
1961 goto pin_page;
1963 case MMUEXT_PIN_L3_TABLE:
1964 type = PGT_l3_page_table;
1965 goto pin_page;
1967 case MMUEXT_PIN_L4_TABLE:
1968 type = PGT_l4_page_table;
1970 pin_page:
1971 /* Ignore pinning of invalid paging levels. */
1972 if ( (op.cmd - MMUEXT_PIN_L1_TABLE) > (CONFIG_PAGING_LEVELS - 1) )
1973 break;
1975 if ( shadow_mode_refcounts(FOREIGNDOM) )
1976 break;
1978 okay = get_page_and_type_from_pagenr(mfn, type, FOREIGNDOM);
1979 if ( unlikely(!okay) )
1981 MEM_LOG("Error while pinning mfn %lx", mfn);
1982 break;
1985 if ( unlikely(test_and_set_bit(_PGT_pinned,
1986 &page->u.inuse.type_info)) )
1988 MEM_LOG("Mfn %lx already pinned", mfn);
1989 put_page_and_type(page);
1990 okay = 0;
1991 break;
1994 /* A page is dirtied when its pin status is set. */
1995 mark_dirty(d, mfn);
1997 break;
1999 case MMUEXT_UNPIN_TABLE:
2000 if ( shadow_mode_refcounts(d) )
2001 break;
2003 if ( unlikely(!(okay = get_page_from_pagenr(mfn, d))) )
2005 MEM_LOG("Mfn %lx bad domain (dom=%p)",
2006 mfn, page_get_owner(page));
2008 else if ( likely(test_and_clear_bit(_PGT_pinned,
2009 &page->u.inuse.type_info)) )
2011 put_page_and_type(page);
2012 put_page(page);
2013 /* A page is dirtied when its pin status is cleared. */
2014 mark_dirty(d, mfn);
2016 else
2018 okay = 0;
2019 put_page(page);
2020 MEM_LOG("Mfn %lx not pinned", mfn);
2022 break;
2024 case MMUEXT_NEW_BASEPTR:
2025 okay = new_guest_cr3(mfn);
2026 this_cpu(percpu_mm_info).deferred_ops &= ~DOP_FLUSH_TLB;
2027 break;
2029 #ifdef __x86_64__
2030 case MMUEXT_NEW_USER_BASEPTR:
2031 okay = 1;
2032 if (likely(mfn != 0))
2034 if ( shadow_mode_refcounts(d) )
2035 okay = get_page_from_pagenr(mfn, d);
2036 else
2037 okay = get_page_and_type_from_pagenr(
2038 mfn, PGT_root_page_table, d);
2040 if ( unlikely(!okay) )
2042 MEM_LOG("Error while installing new mfn %lx", mfn);
2044 else
2046 unsigned long old_mfn =
2047 pagetable_get_pfn(v->arch.guest_table_user);
2048 v->arch.guest_table_user = pagetable_from_pfn(mfn);
2049 if ( old_mfn != 0 )
2051 if ( shadow_mode_refcounts(d) )
2052 put_page(mfn_to_page(old_mfn));
2053 else
2054 put_page_and_type(mfn_to_page(old_mfn));
2057 break;
2058 #endif
2060 case MMUEXT_TLB_FLUSH_LOCAL:
2061 this_cpu(percpu_mm_info).deferred_ops |= DOP_FLUSH_TLB;
2062 break;
2064 case MMUEXT_INVLPG_LOCAL:
2065 if ( !shadow_mode_enabled(d)
2066 || shadow_invlpg(v, op.arg1.linear_addr) != 0 )
2067 local_flush_tlb_one(op.arg1.linear_addr);
2068 break;
2070 case MMUEXT_TLB_FLUSH_MULTI:
2071 case MMUEXT_INVLPG_MULTI:
2073 unsigned long vmask;
2074 cpumask_t pmask;
2075 if ( unlikely(copy_from_guest(&vmask, op.arg2.vcpumask, 1)) )
2077 okay = 0;
2078 break;
2080 pmask = vcpumask_to_pcpumask(d, vmask);
2081 if ( op.cmd == MMUEXT_TLB_FLUSH_MULTI )
2082 flush_tlb_mask(pmask);
2083 else
2084 flush_tlb_one_mask(pmask, op.arg1.linear_addr);
2085 break;
2088 case MMUEXT_TLB_FLUSH_ALL:
2089 flush_tlb_mask(d->domain_dirty_cpumask);
2090 break;
2092 case MMUEXT_INVLPG_ALL:
2093 flush_tlb_one_mask(d->domain_dirty_cpumask, op.arg1.linear_addr);
2094 break;
2096 case MMUEXT_FLUSH_CACHE:
2097 if ( unlikely(!cache_flush_permitted(d)) )
2099 MEM_LOG("Non-physdev domain tried to FLUSH_CACHE.");
2100 okay = 0;
2102 else
2104 wbinvd();
2106 break;
2108 case MMUEXT_SET_LDT:
2110 unsigned long ptr = op.arg1.linear_addr;
2111 unsigned long ents = op.arg2.nr_ents;
2113 if ( shadow_mode_external(d) )
2115 MEM_LOG("ignoring SET_LDT hypercall from external "
2116 "domain %u", d->domain_id);
2117 okay = 0;
2119 else if ( ((ptr & (PAGE_SIZE-1)) != 0) ||
2120 (ents > 8192) ||
2121 !array_access_ok(ptr, ents, LDT_ENTRY_SIZE) )
2123 okay = 0;
2124 MEM_LOG("Bad args to SET_LDT: ptr=%lx, ents=%lx", ptr, ents);
2126 else if ( (v->arch.guest_context.ldt_ents != ents) ||
2127 (v->arch.guest_context.ldt_base != ptr) )
2129 invalidate_shadow_ldt(v);
2130 v->arch.guest_context.ldt_base = ptr;
2131 v->arch.guest_context.ldt_ents = ents;
2132 load_LDT(v);
2133 this_cpu(percpu_mm_info).deferred_ops &= ~DOP_RELOAD_LDT;
2134 if ( ents != 0 )
2135 this_cpu(percpu_mm_info).deferred_ops |= DOP_RELOAD_LDT;
2137 break;
2140 default:
2141 MEM_LOG("Invalid extended pt command 0x%x", op.cmd);
2142 rc = -ENOSYS;
2143 okay = 0;
2144 break;
2147 if ( unlikely(!okay) )
2149 rc = rc ? rc : -EINVAL;
2150 break;
2153 guest_handle_add_offset(uops, 1);
2156 out:
2157 process_deferred_ops();
2159 /* Add incremental work we have done to the @done output parameter. */
2160 if ( unlikely(!guest_handle_is_null(pdone)) )
2162 done += i;
2163 copy_to_guest(pdone, &done, 1);
2166 UNLOCK_BIGLOCK(d);
2167 return rc;
2170 int do_mmu_update(
2171 XEN_GUEST_HANDLE(mmu_update_t) ureqs,
2172 unsigned int count,
2173 XEN_GUEST_HANDLE(uint) pdone,
2174 unsigned int foreigndom)
2176 struct mmu_update req;
2177 void *va;
2178 unsigned long gpfn, gmfn, mfn;
2179 struct page_info *page;
2180 int rc = 0, okay = 1, i = 0;
2181 unsigned int cmd, done = 0;
2182 struct vcpu *v = current;
2183 struct domain *d = v->domain;
2184 unsigned long type_info;
2185 struct domain_mmap_cache mapcache, sh_mapcache;
2187 LOCK_BIGLOCK(d);
2189 if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
2191 count &= ~MMU_UPDATE_PREEMPTED;
2192 if ( unlikely(!guest_handle_is_null(pdone)) )
2193 (void)copy_from_guest(&done, pdone, 1);
2196 domain_mmap_cache_init(&mapcache);
2197 domain_mmap_cache_init(&sh_mapcache);
2199 if ( !set_foreigndom(foreigndom) )
2201 rc = -ESRCH;
2202 goto out;
2205 perfc_incrc(calls_to_mmu_update);
2206 perfc_addc(num_page_updates, count);
2208 if ( unlikely(!guest_handle_okay(ureqs, count)) )
2210 rc = -EFAULT;
2211 goto out;
2214 for ( i = 0; i < count; i++ )
2216 if ( hypercall_preempt_check() )
2218 rc = hypercall_create_continuation(
2219 __HYPERVISOR_mmu_update, "hihi",
2220 ureqs, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
2221 break;
2224 if ( unlikely(__copy_from_guest(&req, ureqs, 1) != 0) )
2226 MEM_LOG("Bad __copy_from_guest");
2227 rc = -EFAULT;
2228 break;
2231 cmd = req.ptr & (sizeof(l1_pgentry_t)-1);
2232 okay = 0;
2234 switch ( cmd )
2236 /*
2237 * MMU_NORMAL_PT_UPDATE: Normal update to any level of page table.
2238 */
2239 case MMU_NORMAL_PT_UPDATE:
2241 gmfn = req.ptr >> PAGE_SHIFT;
2242 mfn = gmfn_to_mfn(d, gmfn);
2244 if ( unlikely(!get_page_from_pagenr(mfn, current->domain)) )
2246 MEM_LOG("Could not get page for normal update");
2247 break;
2250 va = map_domain_page_with_cache(mfn, &mapcache);
2251 va = (void *)((unsigned long)va +
2252 (unsigned long)(req.ptr & ~PAGE_MASK));
2253 page = mfn_to_page(mfn);
2255 switch ( (type_info = page->u.inuse.type_info) & PGT_type_mask )
2257 case PGT_l1_page_table:
2258 case PGT_l2_page_table:
2259 case PGT_l3_page_table:
2260 case PGT_l4_page_table:
2262 if ( shadow_mode_refcounts(d) )
2264 MEM_LOG("mmu update on shadow-refcounted domain!");
2265 break;
2268 if ( unlikely(!get_page_type(
2269 page, type_info & (PGT_type_mask|PGT_pae_xen_l2))) )
2270 goto not_a_pt;
2272 switch ( type_info & PGT_type_mask )
2274 case PGT_l1_page_table:
2276 l1_pgentry_t l1e = l1e_from_intpte(req.val);
2277 okay = mod_l1_entry(va, l1e, mfn);
2279 break;
2280 case PGT_l2_page_table:
2282 l2_pgentry_t l2e = l2e_from_intpte(req.val);
2283 okay = mod_l2_entry(
2284 (l2_pgentry_t *)va, l2e, mfn, type_info);
2286 break;
2287 #if CONFIG_PAGING_LEVELS >= 3
2288 case PGT_l3_page_table:
2290 l3_pgentry_t l3e = l3e_from_intpte(req.val);
2291 okay = mod_l3_entry(va, l3e, mfn);
2293 break;
2294 #endif
2295 #if CONFIG_PAGING_LEVELS >= 4
2296 case PGT_l4_page_table:
2298 l4_pgentry_t l4e = l4e_from_intpte(req.val);
2299 okay = mod_l4_entry(va, l4e, mfn);
2301 break;
2302 #endif
2305 put_page_type(page);
2307 break;
2309 default:
2310 not_a_pt:
2312 if ( unlikely(!get_page_type(page, PGT_writable_page)) )
2313 break;
2315 if ( unlikely(shadow_mode_enabled(d)) )
2316 shadow_lock(d);
2318 *(intpte_t *)va = req.val;
2319 okay = 1;
2321 if ( unlikely(shadow_mode_enabled(d)) )
2323 shadow_validate_guest_entry(v, _mfn(mfn), va);
2324 shadow_unlock(d);
2327 put_page_type(page);
2329 break;
2332 unmap_domain_page_with_cache(va, &mapcache);
2334 put_page(page);
2335 break;
2337 case MMU_MACHPHYS_UPDATE:
2339 mfn = req.ptr >> PAGE_SHIFT;
2340 gpfn = req.val;
2342 if ( unlikely(!get_page_from_pagenr(mfn, FOREIGNDOM)) )
2344 MEM_LOG("Could not get page for mach->phys update");
2345 break;
2348 if ( unlikely(shadow_mode_translate(FOREIGNDOM)) )
2350 MEM_LOG("Mach-phys update on shadow-translate guest");
2351 break;
2354 set_gpfn_from_mfn(mfn, gpfn);
2355 okay = 1;
2357 mark_dirty(FOREIGNDOM, mfn);
2359 put_page(mfn_to_page(mfn));
2360 break;
2362 default:
2363 MEM_LOG("Invalid page update command %x", cmd);
2364 rc = -ENOSYS;
2365 okay = 0;
2366 break;
2369 if ( unlikely(!okay) )
2371 rc = rc ? rc : -EINVAL;
2372 break;
2375 guest_handle_add_offset(ureqs, 1);
2378 out:
2379 domain_mmap_cache_destroy(&mapcache);
2380 domain_mmap_cache_destroy(&sh_mapcache);
2382 process_deferred_ops();
2384 /* Add incremental work we have done to the @done output parameter. */
2385 if ( unlikely(!guest_handle_is_null(pdone)) )
2387 done += i;
2388 copy_to_guest(pdone, &done, 1);
2391 UNLOCK_BIGLOCK(d);
2392 return rc;
2396 static int create_grant_pte_mapping(
2397 uint64_t pte_addr, l1_pgentry_t nl1e, struct vcpu *v)
2399 int rc = GNTST_okay;
2400 void *va;
2401 unsigned long gmfn, mfn;
2402 struct page_info *page;
2403 u32 type;
2404 l1_pgentry_t ol1e;
2405 struct domain *d = v->domain;
2407 ASSERT(spin_is_locked(&d->big_lock));
2409 adjust_guest_l1e(nl1e);
2411 gmfn = pte_addr >> PAGE_SHIFT;
2412 mfn = gmfn_to_mfn(d, gmfn);
2414 if ( unlikely(!get_page_from_pagenr(mfn, current->domain)) )
2416 MEM_LOG("Could not get page for normal update");
2417 return GNTST_general_error;
2420 va = map_domain_page(mfn);
2421 va = (void *)((unsigned long)va + ((unsigned long)pte_addr & ~PAGE_MASK));
2422 page = mfn_to_page(mfn);
2424 type = page->u.inuse.type_info & PGT_type_mask;
2425 if ( (type != PGT_l1_page_table) || !get_page_type(page, type) )
2427 MEM_LOG("Grant map attempted to update a non-L1 page");
2428 rc = GNTST_general_error;
2429 goto failed;
2432 ol1e = *(l1_pgentry_t *)va;
2433 if ( !update_l1e(va, ol1e, nl1e, mfn, v) )
2435 put_page_type(page);
2436 rc = GNTST_general_error;
2437 goto failed;
2440 if ( !shadow_mode_refcounts(d) )
2441 put_page_from_l1e(ol1e, d);
2443 put_page_type(page);
2445 failed:
2446 unmap_domain_page(va);
2447 put_page(page);
2449 return rc;
2452 static int destroy_grant_pte_mapping(
2453 uint64_t addr, unsigned long frame, struct domain *d)
2455 int rc = GNTST_okay;
2456 void *va;
2457 unsigned long gmfn, mfn;
2458 struct page_info *page;
2459 u32 type;
2460 l1_pgentry_t ol1e;
2462 gmfn = addr >> PAGE_SHIFT;
2463 mfn = gmfn_to_mfn(d, gmfn);
2465 if ( unlikely(!get_page_from_pagenr(mfn, current->domain)) )
2467 MEM_LOG("Could not get page for normal update");
2468 return GNTST_general_error;
2471 va = map_domain_page(mfn);
2472 va = (void *)((unsigned long)va + ((unsigned long)addr & ~PAGE_MASK));
2473 page = mfn_to_page(mfn);
2475 type = page->u.inuse.type_info & PGT_type_mask;
2476 if ( (type != PGT_l1_page_table) || !get_page_type(page, type) )
2478 MEM_LOG("Grant map attempted to update a non-L1 page");
2479 rc = GNTST_general_error;
2480 goto failed;
2483 if ( __copy_from_user(&ol1e, (l1_pgentry_t *)va, sizeof(ol1e)) )
2485 put_page_type(page);
2486 rc = GNTST_general_error;
2487 goto failed;
2490 /* Check that the virtual address supplied is actually mapped to frame. */
2491 if ( unlikely((l1e_get_intpte(ol1e) >> PAGE_SHIFT) != frame) )
2493 MEM_LOG("PTE entry %lx for address %"PRIx64" doesn't match frame %lx",
2494 (unsigned long)l1e_get_intpte(ol1e), addr, frame);
2495 put_page_type(page);
2496 rc = GNTST_general_error;
2497 goto failed;
2500 /* Delete pagetable entry. */
2501 if ( unlikely(!update_l1e(
2502 (l1_pgentry_t *)va, ol1e, l1e_empty(), mfn,
2503 d->vcpu[0] /* Change if we go to per-vcpu shadows. */)) )
2505 MEM_LOG("Cannot delete PTE entry at %p", va);
2506 put_page_type(page);
2507 rc = GNTST_general_error;
2508 goto failed;
2511 put_page_type(page);
2513 failed:
2514 unmap_domain_page(va);
2515 put_page(page);
2516 return rc;
2520 static int create_grant_va_mapping(
2521 unsigned long va, l1_pgentry_t nl1e, struct vcpu *v)
2523 l1_pgentry_t *pl1e, ol1e;
2524 struct domain *d = v->domain;
2525 unsigned long gl1mfn;
2526 int okay;
2528 ASSERT(spin_is_locked(&d->big_lock));
2530 adjust_guest_l1e(nl1e);
2532 pl1e = guest_map_l1e(v, va, &gl1mfn);
2533 if ( !pl1e )
2535 MEM_LOG("Could not find L1 PTE for address %lx", va);
2536 return GNTST_general_error;
2538 ol1e = *pl1e;
2539 okay = update_l1e(pl1e, ol1e, nl1e, gl1mfn, v);
2540 guest_unmap_l1e(v, pl1e);
2541 pl1e = NULL;
2543 if ( !okay )
2544 return GNTST_general_error;
2546 if ( !shadow_mode_refcounts(d) )
2547 put_page_from_l1e(ol1e, d);
2549 return GNTST_okay;
2552 static int destroy_grant_va_mapping(
2553 unsigned long addr, unsigned long frame, struct vcpu *v)
2555 l1_pgentry_t *pl1e, ol1e;
2556 unsigned long gl1mfn;
2557 int rc = 0;
2559 pl1e = guest_map_l1e(v, addr, &gl1mfn);
2560 if ( !pl1e )
2562 MEM_LOG("Could not find L1 PTE for address %lx", addr);
2563 return GNTST_general_error;
2565 ol1e = *pl1e;
2567 /* Check that the virtual address supplied is actually mapped to frame. */
2568 if ( unlikely(l1e_get_pfn(ol1e) != frame) )
2570 MEM_LOG("PTE entry %lx for address %lx doesn't match frame %lx",
2571 l1e_get_pfn(ol1e), addr, frame);
2572 rc = GNTST_general_error;
2573 goto out;
2576 /* Delete pagetable entry. */
2577 if ( unlikely(!update_l1e(pl1e, ol1e, l1e_empty(), gl1mfn, v)) )
2579 MEM_LOG("Cannot delete PTE entry at %p", (unsigned long *)pl1e);
2580 rc = GNTST_general_error;
2581 goto out;
2584 out:
2585 guest_unmap_l1e(v, pl1e);
2586 return rc;
2589 int create_grant_host_mapping(
2590 uint64_t addr, unsigned long frame, unsigned int flags)
2592 l1_pgentry_t pte = l1e_from_pfn(frame, GRANT_PTE_FLAGS);
2594 if ( (flags & GNTMAP_application_map) )
2595 l1e_add_flags(pte,_PAGE_USER);
2596 if ( !(flags & GNTMAP_readonly) )
2597 l1e_add_flags(pte,_PAGE_RW);
2599 if ( flags & GNTMAP_contains_pte )
2600 return create_grant_pte_mapping(addr, pte, current);
2601 return create_grant_va_mapping(addr, pte, current);
2604 int destroy_grant_host_mapping(
2605 uint64_t addr, unsigned long frame, unsigned int flags)
2607 if ( flags & GNTMAP_contains_pte )
2608 return destroy_grant_pte_mapping(addr, frame, current->domain);
2609 return destroy_grant_va_mapping(addr, frame, current);
2612 int steal_page(
2613 struct domain *d, struct page_info *page, unsigned int memflags)
2615 u32 _d, _nd, x, y;
2617 spin_lock(&d->page_alloc_lock);
2619 /*
2620 * The tricky bit: atomically release ownership while there is just one
2621 * benign reference to the page (PGC_allocated). If that reference
2622 * disappears then the deallocation routine will safely spin.
2623 */
2624 _d = pickle_domptr(d);
2625 _nd = page->u.inuse._domain;
2626 y = page->count_info;
2627 do {
2628 x = y;
2629 if (unlikely((x & (PGC_count_mask|PGC_allocated)) !=
2630 (1 | PGC_allocated)) || unlikely(_nd != _d)) {
2631 MEM_LOG("gnttab_transfer: Bad page %p: ed=%p(%u), sd=%p,"
2632 " caf=%08x, taf=%" PRtype_info "\n",
2633 (void *) page_to_mfn(page),
2634 d, d->domain_id, unpickle_domptr(_nd), x,
2635 page->u.inuse.type_info);
2636 spin_unlock(&d->page_alloc_lock);
2637 return -1;
2639 __asm__ __volatile__(
2640 LOCK_PREFIX "cmpxchg8b %2"
2641 : "=d" (_nd), "=a" (y),
2642 "=m" (*(volatile u64 *)(&page->count_info))
2643 : "0" (_d), "1" (x), "c" (NULL), "b" (x) );
2644 } while (unlikely(_nd != _d) || unlikely(y != x));
2646 /*
2647 * Unlink from 'd'. At least one reference remains (now anonymous), so
2648 * noone else is spinning to try to delete this page from 'd'.
2649 */
2650 if ( !(memflags & MEMF_no_refcount) )
2651 d->tot_pages--;
2652 list_del(&page->list);
2654 spin_unlock(&d->page_alloc_lock);
2656 return 0;
2659 int do_update_va_mapping(unsigned long va, u64 val64,
2660 unsigned long flags)
2662 l1_pgentry_t val = l1e_from_intpte(val64);
2663 struct vcpu *v = current;
2664 struct domain *d = v->domain;
2665 l1_pgentry_t *pl1e;
2666 unsigned long vmask, bmap_ptr, gl1mfn;
2667 cpumask_t pmask;
2668 int rc = 0;
2670 perfc_incrc(calls_to_update_va);
2672 if ( unlikely(!__addr_ok(va) && !shadow_mode_external(d)) )
2673 return -EINVAL;
2675 LOCK_BIGLOCK(d);
2677 pl1e = guest_map_l1e(v, va, &gl1mfn);
2679 if ( unlikely(!pl1e || !mod_l1_entry(pl1e, val, gl1mfn)) )
2680 rc = -EINVAL;
2682 if ( pl1e )
2683 guest_unmap_l1e(v, pl1e);
2684 pl1e = NULL;
2686 switch ( flags & UVMF_FLUSHTYPE_MASK )
2688 case UVMF_TLB_FLUSH:
2689 switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
2691 case UVMF_LOCAL:
2692 local_flush_tlb();
2693 break;
2694 case UVMF_ALL:
2695 flush_tlb_mask(d->domain_dirty_cpumask);
2696 break;
2697 default:
2698 if ( unlikely(get_user(vmask, (unsigned long *)bmap_ptr)) )
2699 rc = -EFAULT;
2700 pmask = vcpumask_to_pcpumask(d, vmask);
2701 flush_tlb_mask(pmask);
2702 break;
2704 break;
2706 case UVMF_INVLPG:
2707 switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
2709 case UVMF_LOCAL:
2710 if ( !shadow_mode_enabled(d)
2711 || (shadow_invlpg(current, va) != 0) )
2712 local_flush_tlb_one(va);
2713 break;
2714 case UVMF_ALL:
2715 flush_tlb_one_mask(d->domain_dirty_cpumask, va);
2716 break;
2717 default:
2718 if ( unlikely(get_user(vmask, (unsigned long *)bmap_ptr)) )
2719 rc = -EFAULT;
2720 pmask = vcpumask_to_pcpumask(d, vmask);
2721 flush_tlb_one_mask(pmask, va);
2722 break;
2724 break;
2727 process_deferred_ops();
2729 UNLOCK_BIGLOCK(d);
2731 return rc;
2734 int do_update_va_mapping_otherdomain(unsigned long va, u64 val64,
2735 unsigned long flags,
2736 domid_t domid)
2738 int rc;
2740 if ( unlikely(!IS_PRIV(current->domain)) )
2741 return -EPERM;
2743 if ( !set_foreigndom(domid) )
2744 return -ESRCH;
2746 rc = do_update_va_mapping(va, val64, flags);
2748 return rc;
2753 /*************************
2754 * Descriptor Tables
2755 */
2757 void destroy_gdt(struct vcpu *v)
2759 int i;
2760 unsigned long pfn;
2762 v->arch.guest_context.gdt_ents = 0;
2763 for ( i = 0; i < FIRST_RESERVED_GDT_PAGE; i++ )
2765 if ( (pfn = l1e_get_pfn(v->arch.perdomain_ptes[i])) != 0 )
2766 put_page_and_type(mfn_to_page(pfn));
2767 l1e_write(&v->arch.perdomain_ptes[i], l1e_empty());
2768 v->arch.guest_context.gdt_frames[i] = 0;
2773 long set_gdt(struct vcpu *v,
2774 unsigned long *frames,
2775 unsigned int entries)
2777 struct domain *d = v->domain;
2778 /* NB. There are 512 8-byte entries per GDT page. */
2779 int i, nr_pages = (entries + 511) / 512;
2780 unsigned long mfn;
2782 if ( entries > FIRST_RESERVED_GDT_ENTRY )
2783 return -EINVAL;
2785 /* Check the pages in the new GDT. */
2786 for ( i = 0; i < nr_pages; i++ ) {
2787 mfn = frames[i] = gmfn_to_mfn(d, frames[i]);
2788 if ( !mfn_valid(mfn) ||
2789 !get_page_and_type(mfn_to_page(mfn), d, PGT_gdt_page) )
2790 goto fail;
2793 /* Tear down the old GDT. */
2794 destroy_gdt(v);
2796 /* Install the new GDT. */
2797 v->arch.guest_context.gdt_ents = entries;
2798 for ( i = 0; i < nr_pages; i++ )
2800 v->arch.guest_context.gdt_frames[i] = frames[i];
2801 l1e_write(&v->arch.perdomain_ptes[i],
2802 l1e_from_pfn(frames[i], __PAGE_HYPERVISOR));
2805 return 0;
2807 fail:
2808 while ( i-- > 0 )
2809 put_page_and_type(mfn_to_page(frames[i]));
2810 return -EINVAL;
2814 long do_set_gdt(XEN_GUEST_HANDLE(ulong) frame_list, unsigned int entries)
2816 int nr_pages = (entries + 511) / 512;
2817 unsigned long frames[16];
2818 long ret;
2820 /* Rechecked in set_gdt, but ensures a sane limit for copy_from_user(). */
2821 if ( entries > FIRST_RESERVED_GDT_ENTRY )
2822 return -EINVAL;
2824 if ( copy_from_guest((unsigned long *)frames, frame_list, nr_pages) )
2825 return -EFAULT;
2827 LOCK_BIGLOCK(current->domain);
2829 if ( (ret = set_gdt(current, frames, entries)) == 0 )
2830 local_flush_tlb();
2832 UNLOCK_BIGLOCK(current->domain);
2834 return ret;
2838 long do_update_descriptor(u64 pa, u64 desc)
2840 struct domain *dom = current->domain;
2841 unsigned long gmfn = pa >> PAGE_SHIFT;
2842 unsigned long mfn;
2843 unsigned int offset;
2844 struct desc_struct *gdt_pent, d;
2845 struct page_info *page;
2846 long ret = -EINVAL;
2848 offset = ((unsigned int)pa & ~PAGE_MASK) / sizeof(struct desc_struct);
2850 *(u64 *)&d = desc;
2852 LOCK_BIGLOCK(dom);
2854 mfn = gmfn_to_mfn(dom, gmfn);
2855 if ( (((unsigned int)pa % sizeof(struct desc_struct)) != 0) ||
2856 !mfn_valid(mfn) ||
2857 !check_descriptor(&d) )
2859 UNLOCK_BIGLOCK(dom);
2860 return -EINVAL;
2863 page = mfn_to_page(mfn);
2864 if ( unlikely(!get_page(page, dom)) )
2866 UNLOCK_BIGLOCK(dom);
2867 return -EINVAL;
2870 /* Check if the given frame is in use in an unsafe context. */
2871 switch ( page->u.inuse.type_info & PGT_type_mask )
2873 case PGT_gdt_page:
2874 if ( unlikely(!get_page_type(page, PGT_gdt_page)) )
2875 goto out;
2876 break;
2877 case PGT_ldt_page:
2878 if ( unlikely(!get_page_type(page, PGT_ldt_page)) )
2879 goto out;
2880 break;
2881 default:
2882 if ( unlikely(!get_page_type(page, PGT_writable_page)) )
2883 goto out;
2884 break;
2887 mark_dirty(dom, mfn);
2889 /* All is good so make the update. */
2890 gdt_pent = map_domain_page(mfn);
2891 memcpy(&gdt_pent[offset], &d, 8);
2892 unmap_domain_page(gdt_pent);
2894 put_page_type(page);
2896 ret = 0; /* success */
2898 out:
2899 put_page(page);
2901 UNLOCK_BIGLOCK(dom);
2903 return ret;
2906 typedef struct e820entry e820entry_t;
2907 DEFINE_XEN_GUEST_HANDLE(e820entry_t);
2909 long arch_memory_op(int op, XEN_GUEST_HANDLE(void) arg)
2911 switch ( op )
2913 case XENMEM_add_to_physmap:
2915 struct xen_add_to_physmap xatp;
2916 unsigned long prev_mfn, mfn = 0, gpfn;
2917 struct domain *d;
2919 if ( copy_from_guest(&xatp, arg, 1) )
2920 return -EFAULT;
2922 if ( xatp.domid == DOMID_SELF )
2924 d = current->domain;
2925 get_knownalive_domain(d);
2927 else if ( !IS_PRIV(current->domain) )
2928 return -EPERM;
2929 else if ( (d = find_domain_by_id(xatp.domid)) == NULL )
2930 return -ESRCH;
2932 switch ( xatp.space )
2934 case XENMAPSPACE_shared_info:
2935 if ( xatp.idx == 0 )
2936 mfn = virt_to_mfn(d->shared_info);
2937 break;
2938 case XENMAPSPACE_grant_table:
2939 if ( xatp.idx < NR_GRANT_FRAMES )
2940 mfn = virt_to_mfn(d->grant_table->shared) + xatp.idx;
2941 break;
2942 default:
2943 break;
2946 if ( !shadow_mode_translate(d) || (mfn == 0) )
2948 put_domain(d);
2949 return -EINVAL;
2952 LOCK_BIGLOCK(d);
2954 /* Remove previously mapped page if it was present. */
2955 prev_mfn = gmfn_to_mfn(d, xatp.gpfn);
2956 if ( mfn_valid(prev_mfn) )
2958 if ( IS_XEN_HEAP_FRAME(mfn_to_page(prev_mfn)) )
2959 /* Xen heap frames are simply unhooked from this phys slot. */
2960 guest_physmap_remove_page(d, xatp.gpfn, prev_mfn);
2961 else
2962 /* Normal domain memory is freed, to avoid leaking memory. */
2963 guest_remove_page(d, xatp.gpfn);
2966 /* Unmap from old location, if any. */
2967 gpfn = get_gpfn_from_mfn(mfn);
2968 if ( gpfn != INVALID_M2P_ENTRY )
2969 guest_physmap_remove_page(d, gpfn, mfn);
2971 /* Map at new location. */
2972 guest_physmap_add_page(d, xatp.gpfn, mfn);
2974 UNLOCK_BIGLOCK(d);
2976 put_domain(d);
2978 break;
2981 case XENMEM_set_memory_map:
2983 struct xen_foreign_memory_map fmap;
2984 struct domain *d;
2985 int rc;
2987 if ( copy_from_guest(&fmap, arg, 1) )
2988 return -EFAULT;
2990 if ( fmap.map.nr_entries > ARRAY_SIZE(d->arch.e820) )
2991 return -EINVAL;
2993 if ( fmap.domid == DOMID_SELF )
2995 d = current->domain;
2996 get_knownalive_domain(d);
2998 else if ( !IS_PRIV(current->domain) )
2999 return -EPERM;
3000 else if ( (d = find_domain_by_id(fmap.domid)) == NULL )
3001 return -ESRCH;
3003 rc = copy_from_guest(&d->arch.e820[0], fmap.map.buffer,
3004 fmap.map.nr_entries) ? -EFAULT : 0;
3005 d->arch.nr_e820 = fmap.map.nr_entries;
3007 put_domain(d);
3008 return rc;
3011 case XENMEM_memory_map:
3013 struct xen_memory_map map;
3014 struct domain *d = current->domain;
3016 /* Backwards compatibility. */
3017 if ( d->arch.nr_e820 == 0 )
3018 return -ENOSYS;
3020 if ( copy_from_guest(&map, arg, 1) )
3021 return -EFAULT;
3023 map.nr_entries = min(map.nr_entries, d->arch.nr_e820);
3024 if ( copy_to_guest(map.buffer, &d->arch.e820[0], map.nr_entries) ||
3025 copy_to_guest(arg, &map, 1) )
3026 return -EFAULT;
3028 return 0;
3031 case XENMEM_machine_memory_map:
3033 struct xen_memory_map memmap;
3034 XEN_GUEST_HANDLE(e820entry_t) buffer;
3035 int count;
3037 if ( !IS_PRIV(current->domain) )
3038 return -EINVAL;
3040 if ( copy_from_guest(&memmap, arg, 1) )
3041 return -EFAULT;
3042 if ( memmap.nr_entries < e820.nr_map + 1 )
3043 return -EINVAL;
3045 buffer = guest_handle_cast(memmap.buffer, e820entry_t);
3047 count = min((unsigned int)e820.nr_map, memmap.nr_entries);
3048 if ( copy_to_guest(buffer, &e820.map[0], count) < 0 )
3049 return -EFAULT;
3051 memmap.nr_entries = count;
3053 if ( copy_to_guest(arg, &memmap, 1) )
3054 return -EFAULT;
3056 return 0;
3059 case XENMEM_machphys_mapping:
3061 struct xen_machphys_mapping mapping = {
3062 .v_start = MACH2PHYS_VIRT_START,
3063 .v_end = MACH2PHYS_VIRT_END,
3064 .max_mfn = MACH2PHYS_NR_ENTRIES - 1
3065 };
3067 if ( copy_to_guest(arg, &mapping, 1) )
3068 return -EFAULT;
3070 return 0;
3073 default:
3074 return subarch_memory_op(op, arg);
3077 return 0;
3081 /*************************
3082 * Writable Pagetables
3083 */
3085 struct ptwr_emulate_ctxt {
3086 struct x86_emulate_ctxt ctxt;
3087 unsigned long cr2;
3088 l1_pgentry_t pte;
3089 };
3091 static int ptwr_emulated_read(
3092 enum x86_segment seg,
3093 unsigned long offset,
3094 unsigned long *val,
3095 unsigned int bytes,
3096 struct x86_emulate_ctxt *ctxt)
3098 unsigned int rc;
3099 unsigned long addr = offset;
3101 *val = 0;
3102 if ( (rc = copy_from_user((void *)val, (void *)addr, bytes)) != 0 )
3104 propagate_page_fault(addr + bytes - rc, 0); /* read fault */
3105 return X86EMUL_PROPAGATE_FAULT;
3108 return X86EMUL_CONTINUE;
3111 static int ptwr_emulated_update(
3112 unsigned long addr,
3113 paddr_t old,
3114 paddr_t val,
3115 unsigned int bytes,
3116 unsigned int do_cmpxchg,
3117 struct ptwr_emulate_ctxt *ptwr_ctxt)
3119 unsigned long gmfn, mfn;
3120 struct page_info *page;
3121 l1_pgentry_t pte, ol1e, nl1e, *pl1e;
3122 struct vcpu *v = current;
3123 struct domain *d = v->domain;
3125 /* Only allow naturally-aligned stores within the original %cr2 page. */
3126 if ( unlikely(((addr^ptwr_ctxt->cr2) & PAGE_MASK) || (addr & (bytes-1))) )
3128 MEM_LOG("Bad ptwr access (cr2=%lx, addr=%lx, bytes=%u)",
3129 ptwr_ctxt->cr2, addr, bytes);
3130 return X86EMUL_UNHANDLEABLE;
3133 /* Turn a sub-word access into a full-word access. */
3134 if ( bytes != sizeof(paddr_t) )
3136 paddr_t full;
3137 unsigned int rc, offset = addr & (sizeof(paddr_t)-1);
3139 /* Align address; read full word. */
3140 addr &= ~(sizeof(paddr_t)-1);
3141 if ( (rc = copy_from_user(&full, (void *)addr, sizeof(paddr_t))) != 0 )
3143 propagate_page_fault(addr+sizeof(paddr_t)-rc, 0); /* read fault */
3144 return X86EMUL_PROPAGATE_FAULT;
3146 /* Mask out bits provided by caller. */
3147 full &= ~((((paddr_t)1 << (bytes*8)) - 1) << (offset*8));
3148 /* Shift the caller value and OR in the missing bits. */
3149 val &= (((paddr_t)1 << (bytes*8)) - 1);
3150 val <<= (offset)*8;
3151 val |= full;
3152 /* Also fill in missing parts of the cmpxchg old value. */
3153 old &= (((paddr_t)1 << (bytes*8)) - 1);
3154 old <<= (offset)*8;
3155 old |= full;
3158 pte = ptwr_ctxt->pte;
3159 gmfn = l1e_get_pfn(pte);
3160 mfn = gmfn_to_mfn(d, gmfn);
3161 page = mfn_to_page(mfn);
3163 /* We are looking only for read-only mappings of p.t. pages. */
3164 ASSERT((l1e_get_flags(pte) & (_PAGE_RW|_PAGE_PRESENT)) == _PAGE_PRESENT);
3165 ASSERT((page->u.inuse.type_info & PGT_type_mask) == PGT_l1_page_table);
3166 ASSERT((page->u.inuse.type_info & PGT_count_mask) != 0);
3167 ASSERT(page_get_owner(page) == d);
3169 /* Check the new PTE. */
3170 nl1e = l1e_from_intpte(val);
3171 if ( unlikely(!get_page_from_l1e(gl1e_to_ml1e(d, nl1e), d)) )
3173 if ( (CONFIG_PAGING_LEVELS == 3) &&
3174 (bytes == 4) &&
3175 !do_cmpxchg &&
3176 (l1e_get_flags(nl1e) & _PAGE_PRESENT) )
3178 /*
3179 * If this is a half-write to a PAE PTE then we assume that the
3180 * guest has simply got the two writes the wrong way round. We
3181 * zap the PRESENT bit on the assumption the bottom half will be
3182 * written immediately after we return to the guest.
3183 */
3184 MEM_LOG("ptwr_emulate: fixing up invalid PAE PTE %"PRIpte,
3185 l1e_get_intpte(nl1e));
3186 l1e_remove_flags(nl1e, _PAGE_PRESENT);
3188 else
3190 MEM_LOG("ptwr_emulate: could not get_page_from_l1e()");
3191 return X86EMUL_UNHANDLEABLE;
3195 adjust_guest_l1e(nl1e);
3197 /* Checked successfully: do the update (write or cmpxchg). */
3198 pl1e = map_domain_page(page_to_mfn(page));
3199 pl1e = (l1_pgentry_t *)((unsigned long)pl1e + (addr & ~PAGE_MASK));
3200 if ( do_cmpxchg )
3202 if ( shadow_mode_enabled(d) )
3203 shadow_lock(d);
3204 ol1e = l1e_from_intpte(old);
3205 if ( cmpxchg((intpte_t *)pl1e, old, val) != old )
3207 if ( shadow_mode_enabled(d) )
3208 shadow_unlock(d);
3209 unmap_domain_page(pl1e);
3210 put_page_from_l1e(gl1e_to_ml1e(d, nl1e), d);
3211 return X86EMUL_CMPXCHG_FAILED;
3213 if ( unlikely(shadow_mode_enabled(d)) )
3215 shadow_validate_guest_entry(v, _mfn(page_to_mfn(page)), pl1e);
3216 shadow_unlock(d);
3219 else
3221 ol1e = *pl1e;
3222 if ( !update_l1e(pl1e, ol1e, nl1e, page_to_mfn(page), v) )
3223 BUG();
3226 unmap_domain_page(pl1e);
3228 /* Finally, drop the old PTE. */
3229 put_page_from_l1e(gl1e_to_ml1e(d, ol1e), d);
3231 return X86EMUL_CONTINUE;
3234 static int ptwr_emulated_write(
3235 enum x86_segment seg,
3236 unsigned long offset,
3237 unsigned long val,
3238 unsigned int bytes,
3239 struct x86_emulate_ctxt *ctxt)
3241 return ptwr_emulated_update(
3242 offset, 0, val, bytes, 0,
3243 container_of(ctxt, struct ptwr_emulate_ctxt, ctxt));
3246 static int ptwr_emulated_cmpxchg(
3247 enum x86_segment seg,
3248 unsigned long offset,
3249 unsigned long old,
3250 unsigned long new,
3251 unsigned int bytes,
3252 struct x86_emulate_ctxt *ctxt)
3254 return ptwr_emulated_update(
3255 offset, old, new, bytes, 1,
3256 container_of(ctxt, struct ptwr_emulate_ctxt, ctxt));
3259 static int ptwr_emulated_cmpxchg8b(
3260 enum x86_segment seg,
3261 unsigned long offset,
3262 unsigned long old,
3263 unsigned long old_hi,
3264 unsigned long new,
3265 unsigned long new_hi,
3266 struct x86_emulate_ctxt *ctxt)
3268 if ( CONFIG_PAGING_LEVELS == 2 )
3269 return X86EMUL_UNHANDLEABLE;
3270 return ptwr_emulated_update(
3271 offset, ((u64)old_hi << 32) | old, ((u64)new_hi << 32) | new, 8, 1,
3272 container_of(ctxt, struct ptwr_emulate_ctxt, ctxt));
3275 static struct x86_emulate_ops ptwr_emulate_ops = {
3276 .read = ptwr_emulated_read,
3277 .insn_fetch = ptwr_emulated_read,
3278 .write = ptwr_emulated_write,
3279 .cmpxchg = ptwr_emulated_cmpxchg,
3280 .cmpxchg8b = ptwr_emulated_cmpxchg8b
3281 };
3283 /* Write page fault handler: check if guest is trying to modify a PTE. */
3284 int ptwr_do_page_fault(struct vcpu *v, unsigned long addr,
3285 struct cpu_user_regs *regs)
3287 struct domain *d = v->domain;
3288 unsigned long pfn;
3289 struct page_info *page;
3290 l1_pgentry_t pte;
3291 struct ptwr_emulate_ctxt ptwr_ctxt;
3293 LOCK_BIGLOCK(d);
3295 /*
3296 * Attempt to read the PTE that maps the VA being accessed. By checking for
3297 * PDE validity in the L2 we avoid many expensive fixups in __get_user().
3298 */
3299 guest_get_eff_l1e(v, addr, &pte);
3300 if ( !(l1e_get_flags(pte) & _PAGE_PRESENT) )
3301 goto bail;
3302 pfn = l1e_get_pfn(pte);
3303 page = mfn_to_page(pfn);
3305 /* We are looking only for read-only mappings of p.t. pages. */
3306 if ( ((l1e_get_flags(pte) & (_PAGE_PRESENT|_PAGE_RW)) != _PAGE_PRESENT) ||
3307 ((page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table) ||
3308 ((page->u.inuse.type_info & PGT_count_mask) == 0) ||
3309 (page_get_owner(page) != d) )
3310 goto bail;
3312 ptwr_ctxt.ctxt.regs = guest_cpu_user_regs();
3313 ptwr_ctxt.ctxt.mode = X86EMUL_MODE_HOST;
3314 ptwr_ctxt.cr2 = addr;
3315 ptwr_ctxt.pte = pte;
3316 if ( x86_emulate_memop(&ptwr_ctxt.ctxt, &ptwr_emulate_ops) )
3317 goto bail;
3319 UNLOCK_BIGLOCK(d);
3320 perfc_incrc(ptwr_emulations);
3321 return EXCRET_fault_fixed;
3323 bail:
3324 UNLOCK_BIGLOCK(d);
3325 return 0;
3328 int map_pages_to_xen(
3329 unsigned long virt,
3330 unsigned long mfn,
3331 unsigned long nr_mfns,
3332 unsigned long flags)
3334 l2_pgentry_t *pl2e, ol2e;
3335 l1_pgentry_t *pl1e, ol1e;
3336 unsigned int i;
3338 unsigned int map_small_pages = !!(flags & MAP_SMALL_PAGES);
3339 flags &= ~MAP_SMALL_PAGES;
3341 while ( nr_mfns != 0 )
3343 pl2e = virt_to_xen_l2e(virt);
3345 if ( ((((virt>>PAGE_SHIFT) | mfn) & ((1<<PAGETABLE_ORDER)-1)) == 0) &&
3346 (nr_mfns >= (1<<PAGETABLE_ORDER)) &&
3347 !map_small_pages )
3349 /* Super-page mapping. */
3350 ol2e = *pl2e;
3351 l2e_write(pl2e, l2e_from_pfn(mfn, flags|_PAGE_PSE));
3353 if ( (l2e_get_flags(ol2e) & _PAGE_PRESENT) )
3355 local_flush_tlb_pge();
3356 if ( !(l2e_get_flags(ol2e) & _PAGE_PSE) )
3357 free_xen_pagetable(l2e_get_page(ol2e));
3360 virt += 1UL << L2_PAGETABLE_SHIFT;
3361 mfn += 1UL << PAGETABLE_ORDER;
3362 nr_mfns -= 1UL << PAGETABLE_ORDER;
3364 else
3366 /* Normal page mapping. */
3367 if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) )
3369 pl1e = page_to_virt(alloc_xen_pagetable());
3370 clear_page(pl1e);
3371 l2e_write(pl2e, l2e_from_page(virt_to_page(pl1e),
3372 __PAGE_HYPERVISOR));
3374 else if ( l2e_get_flags(*pl2e) & _PAGE_PSE )
3376 pl1e = page_to_virt(alloc_xen_pagetable());
3377 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
3378 l1e_write(&pl1e[i],
3379 l1e_from_pfn(l2e_get_pfn(*pl2e) + i,
3380 l2e_get_flags(*pl2e) & ~_PAGE_PSE));
3381 l2e_write(pl2e, l2e_from_page(virt_to_page(pl1e),
3382 __PAGE_HYPERVISOR));
3383 local_flush_tlb_pge();
3386 pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(virt);
3387 ol1e = *pl1e;
3388 l1e_write(pl1e, l1e_from_pfn(mfn, flags));
3389 if ( (l1e_get_flags(ol1e) & _PAGE_PRESENT) )
3390 local_flush_tlb_one(virt);
3392 virt += 1UL << L1_PAGETABLE_SHIFT;
3393 mfn += 1UL;
3394 nr_mfns -= 1UL;
3398 return 0;
3401 void __set_fixmap(
3402 enum fixed_addresses idx, unsigned long mfn, unsigned long flags)
3404 BUG_ON(idx >= __end_of_fixed_addresses);
3405 map_pages_to_xen(fix_to_virt(idx), mfn, 1, flags);
3408 #ifdef MEMORY_GUARD
3410 void memguard_init(void)
3412 map_pages_to_xen(
3413 PAGE_OFFSET, 0, xenheap_phys_end >> PAGE_SHIFT,
3414 __PAGE_HYPERVISOR|MAP_SMALL_PAGES);
3417 static void __memguard_change_range(void *p, unsigned long l, int guard)
3419 unsigned long _p = (unsigned long)p;
3420 unsigned long _l = (unsigned long)l;
3421 unsigned long flags = __PAGE_HYPERVISOR | MAP_SMALL_PAGES;
3423 /* Ensure we are dealing with a page-aligned whole number of pages. */
3424 ASSERT((_p&PAGE_MASK) != 0);
3425 ASSERT((_l&PAGE_MASK) != 0);
3426 ASSERT((_p&~PAGE_MASK) == 0);
3427 ASSERT((_l&~PAGE_MASK) == 0);
3429 if ( guard )
3430 flags &= ~_PAGE_PRESENT;
3432 map_pages_to_xen(
3433 _p, virt_to_maddr(p) >> PAGE_SHIFT, _l >> PAGE_SHIFT, flags);
3436 void memguard_guard_range(void *p, unsigned long l)
3438 __memguard_change_range(p, l, 1);
3441 void memguard_unguard_range(void *p, unsigned long l)
3443 __memguard_change_range(p, l, 0);
3446 #endif
3448 void memguard_guard_stack(void *p)
3450 BUILD_BUG_ON((DEBUG_STACK_SIZE + PAGE_SIZE) > STACK_SIZE);
3451 p = (void *)((unsigned long)p + STACK_SIZE - DEBUG_STACK_SIZE - PAGE_SIZE);
3452 memguard_guard_range(p, PAGE_SIZE);
3455 /*
3456 * Local variables:
3457 * mode: C
3458 * c-set-style: "BSD"
3459 * c-basic-offset: 4
3460 * tab-width: 4
3461 * indent-tabs-mode: nil
3462 * End:
3463 */