xen/arch/x86/mm.c, changeset 11648:5f42b4824e45 (direct-io.hg repository)

[XEN] Fix interaction between tlbflush timestamp and shadow flags

Author: Tim Deegan <tim.deegan@xensource.com>
Date:   Thu Sep 28 17:09:11 2006 +0100
Signed-off-by: Tim Deegan <Tim.Deegan@xensource.com>
1 /******************************************************************************
2 * arch/x86/mm.c
3 *
4 * Copyright (c) 2002-2005 K A Fraser
5 * Copyright (c) 2004 Christian Limpach
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
22 /*
23 * A description of the x86 page table API:
24 *
25 * Domains trap to do_mmu_update with a list of update requests.
26 * This is a list of (ptr, val) pairs, where the requested operation
27 * is *ptr = val.
28 *
29 * Reference counting of pages:
30 * ----------------------------
31 * Each page has two refcounts: tot_count and type_count.
32 *
33 * TOT_COUNT is the obvious reference count. It counts all uses of a
34 * physical page frame by a domain, including uses as a page directory,
35 * a page table, or simple mappings via a PTE. This count prevents a
36 * domain from releasing a frame back to the free pool when it still holds
37 * a reference to it.
38 *
39 * TYPE_COUNT is more subtle. A frame can be put to one of three
40 * mutually-exclusive uses: it might be used as a page directory, or a
41 * page table, or it may be mapped writable by the domain [of course, a
42 * frame may not be used in any of these three ways!].
43 * So, type_count is a count of the number of times a frame is being
44 * referred to in its current incarnation. Therefore, a page can only
45 * change its type when its type count is zero.
46 *
47 * Pinning the page type:
48 * ----------------------
49 * The type of a page can be pinned/unpinned with the commands
50 * MMUEXT_[UN]PIN_L?_TABLE. Each page can be pinned exactly once (that is,
51 * pinning is not reference counted, so it can't be nested).
52 * This is useful to prevent a page's type count falling to zero, at which
53 * point safety checks would need to be carried out next time the count
54 * is increased again.
55 *
56 * A further note on writable page mappings:
57 * -----------------------------------------
58 * For simplicity, the count of writable mappings for a page may not
59 * correspond to reality. The 'writable count' is incremented for every
60 * PTE which maps the page with the _PAGE_RW flag set. However, for
61 * write access to be possible the page directory entry must also have
62 * its _PAGE_RW bit set. We do not check this as it complicates the
63 * reference counting considerably [consider the case of multiple
64 * directory entries referencing a single page table, some with the RW
65 * bit set, others not -- it starts getting a bit messy].
66 * In normal use, this simplification shouldn't be a problem.
67 * However, the logic can be added if required.
68 *
69 * One more note on read-only page mappings:
70 * -----------------------------------------
71 * We want domains to be able to map pages for read-only access. The
72 * main reason is that page tables and directories should be readable
73 * by a domain, but it would not be safe for them to be writable.
74 * However, domains have free access to rings 1 & 2 of the Intel
75 * privilege model. In terms of page protection, these are considered
76 * to be part of 'supervisor mode'. The WP bit in CR0 controls whether
77 * read-only restrictions are respected in supervisor mode -- if the
78 * bit is clear then any mapped page is writable.
79 *
80 * We get round this by always setting the WP bit and disallowing
81 * updates to it. This is very unlikely to cause a problem for guest
82 * OS's, which will generally use the WP bit to simplify copy-on-write
83 * implementation (in that case, OS wants a fault when it writes to
84 * an application-supplied buffer).
85 */
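/*
 * Editor's illustration (not part of changeset 11648): a minimal sketch of
 * how a paravirtual guest drives the API described above, assuming the
 * public interface definitions (struct mmu_update, MMU_NORMAL_PT_UPDATE,
 * DOMID_SELF) and a guest-side hypercall wrapper named
 * HYPERVISOR_mmu_update() -- the wrapper name and the pte_maddr/new_val
 * variables are illustrative only.
 *
 *     struct mmu_update req[2];
 *     unsigned int done = 0;
 *
 *     // Each request is (ptr, val): the low bits of ptr select the
 *     // command (MMU_NORMAL_PT_UPDATE here) and the remaining bits give
 *     // the machine address of the page-table entry to write.
 *     req[0].ptr = pte_maddr0 | MMU_NORMAL_PT_UPDATE;
 *     req[0].val = new_val0;
 *     req[1].ptr = pte_maddr1 | MMU_NORMAL_PT_UPDATE;
 *     req[1].val = new_val1;
 *
 *     if ( HYPERVISOR_mmu_update(req, 2, &done, DOMID_SELF) != 0 )
 *         ;  // 'done' reports how many requests completed before failure
 *
 * Xen validates each new value against the reference-counting rules above
 * (tot_count/type_count) before performing the write.
 */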
87 #include <xen/config.h>
88 #include <xen/init.h>
89 #include <xen/kernel.h>
90 #include <xen/lib.h>
91 #include <xen/mm.h>
92 #include <xen/domain.h>
93 #include <xen/sched.h>
94 #include <xen/errno.h>
95 #include <xen/perfc.h>
96 #include <xen/irq.h>
97 #include <xen/softirq.h>
98 #include <xen/domain_page.h>
99 #include <xen/event.h>
100 #include <xen/iocap.h>
101 #include <xen/guest_access.h>
102 #include <asm/shadow.h>
103 #include <asm/page.h>
104 #include <asm/flushtlb.h>
105 #include <asm/io.h>
106 #include <asm/ldt.h>
107 #include <asm/x86_emulate.h>
108 #include <asm/e820.h>
109 #include <public/memory.h>
111 #ifdef VERBOSE
112 #define MEM_LOG(_f, _a...) \
113 printk("DOM%u: (file=mm.c, line=%d) " _f "\n", \
114 current->domain->domain_id , __LINE__ , ## _a )
115 #else
116 #define MEM_LOG(_f, _a...) ((void)0)
117 #endif
119 /*
120 * PTE updates can be done with ordinary writes except:
121 * 1. Debug builds get extra checking by using CMPXCHG[8B].
122 * 2. PAE builds perform an atomic 8-byte store with CMPXCHG8B.
123 */
124 #if !defined(NDEBUG) || defined(CONFIG_X86_PAE)
125 #define PTE_UPDATE_WITH_CMPXCHG
126 #endif
128 /*
129 * Both do_mmuext_op() and do_mmu_update():
130 * We steal the m.s.b. of the @count parameter to indicate whether this
131 * invocation of do_mmu_update() is resuming a previously preempted call.
132 */
133 #define MMU_UPDATE_PREEMPTED (~(~0U>>1))
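/*
 * Editor's note (illustrative): a batch of COUNT requests that is preempted
 * after I of them is re-issued as a continuation whose count argument tags
 * the remaining work in the stolen top bit, e.g.
 *
 *     (count - i) | MMU_UPDATE_PREEMPTED
 *
 * and the resuming invocation strips the flag again with
 *
 *     count &= ~MMU_UPDATE_PREEMPTED;
 *
 * as done in do_mmuext_op() and do_mmu_update() below.
 */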
135 static void free_l2_table(struct page_info *page);
136 static void free_l1_table(struct page_info *page);
138 static int mod_l2_entry(l2_pgentry_t *, l2_pgentry_t, unsigned long,
139 unsigned long type);
140 static int mod_l1_entry(l1_pgentry_t *, l1_pgentry_t, unsigned long gl1mfn);
142 /* Used to defer flushing of memory structures. */
143 struct percpu_mm_info {
144 #define DOP_FLUSH_TLB (1<<0) /* Flush the local TLB. */
145 #define DOP_FLUSH_ALL_TLBS (1<<1) /* Flush TLBs of all VCPUs of current dom. */
146 #define DOP_RELOAD_LDT (1<<2) /* Reload the LDT shadow mapping. */
147 unsigned int deferred_ops;
148 /* If non-NULL, specifies a foreign subject domain for some operations. */
149 struct domain *foreign;
150 };
151 static DEFINE_PER_CPU(struct percpu_mm_info, percpu_mm_info);
153 /*
154 * Returns the current foreign domain; defaults to the currently-executing
155 * domain if a foreign override hasn't been specified.
156 */
157 #define FOREIGNDOM (this_cpu(percpu_mm_info).foreign ?: current->domain)
159 /* Private domain structs for DOMID_XEN and DOMID_IO. */
160 static struct domain *dom_xen, *dom_io;
162 /* Frame table and its size in pages. */
163 struct page_info *frame_table;
164 unsigned long max_page;
165 unsigned long total_pages;
167 void __init init_frametable(void)
168 {
169 unsigned long nr_pages, page_step, i, mfn;
171 frame_table = (struct page_info *)FRAMETABLE_VIRT_START;
173 nr_pages = PFN_UP(max_page * sizeof(*frame_table));
174 page_step = (1 << L2_PAGETABLE_SHIFT) >> PAGE_SHIFT;
176 for ( i = 0; i < nr_pages; i += page_step )
177 {
178 mfn = alloc_boot_pages(min(nr_pages - i, page_step), page_step);
179 if ( mfn == 0 )
180 panic("Not enough memory for frame table\n");
181 map_pages_to_xen(
182 FRAMETABLE_VIRT_START + (i << PAGE_SHIFT),
183 mfn, page_step, PAGE_HYPERVISOR);
184 }
186 memset(frame_table, 0, nr_pages << PAGE_SHIFT);
187 }
189 void arch_init_memory(void)
190 {
191 extern void subarch_init_memory(void);
193 unsigned long i, pfn, rstart_pfn, rend_pfn;
195 /*
196 * Initialise our DOMID_XEN domain.
197 * Any Xen-heap pages that we will allow to be mapped will have
198 * their domain field set to dom_xen.
199 */
200 dom_xen = alloc_domain(DOMID_XEN);
201 BUG_ON(dom_xen == NULL);
203 /*
204 * Initialise our DOMID_IO domain.
205 * This domain owns I/O pages that are within the range of the page_info
206 * array. Mappings occur at the privilege level of the caller.
207 */
208 dom_io = alloc_domain(DOMID_IO);
209 BUG_ON(dom_io == NULL);
211 /* First 1MB of RAM is historically marked as I/O. */
212 for ( i = 0; i < 0x100; i++ )
213 share_xen_page_with_guest(mfn_to_page(i), dom_io, XENSHARE_writable);
215 /* Any areas not specified as RAM by the e820 map are considered I/O. */
216 for ( i = 0, pfn = 0; i < e820.nr_map; i++ )
217 {
218 if ( e820.map[i].type != E820_RAM )
219 continue;
220 /* Every page from cursor to start of next RAM region is I/O. */
221 rstart_pfn = PFN_UP(e820.map[i].addr);
222 rend_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
223 for ( ; pfn < rstart_pfn; pfn++ )
224 {
225 BUG_ON(!mfn_valid(pfn));
226 share_xen_page_with_guest(
227 mfn_to_page(pfn), dom_io, XENSHARE_writable);
228 }
229 /* Skip the RAM region. */
230 pfn = rend_pfn;
231 }
232 BUG_ON(pfn != max_page);
234 subarch_init_memory();
235 }
237 int memory_is_conventional_ram(paddr_t p)
238 {
239 int i;
241 for ( i = 0; i < e820.nr_map; i++ )
242 {
243 if ( (e820.map[i].type == E820_RAM) &&
244 (e820.map[i].addr <= p) &&
245 (e820.map[i].addr + e820.map[i].size > p) )
246 return 1;
247 }
249 return 0;
250 }
252 void share_xen_page_with_guest(
253 struct page_info *page, struct domain *d, int readonly)
254 {
255 if ( page_get_owner(page) == d )
256 return;
258 set_gpfn_from_mfn(page_to_mfn(page), INVALID_M2P_ENTRY);
260 spin_lock(&d->page_alloc_lock);
262 /* The incremented type count pins as writable or read-only. */
263 page->u.inuse.type_info = (readonly ? PGT_none : PGT_writable_page);
264 page->u.inuse.type_info |= PGT_validated | 1;
266 page_set_owner(page, d);
267 wmb(); /* install valid domain ptr before updating refcnt. */
268 ASSERT(page->count_info == 0);
269 page->count_info |= PGC_allocated | 1;
271 if ( unlikely(d->xenheap_pages++ == 0) )
272 get_knownalive_domain(d);
273 list_add_tail(&page->list, &d->xenpage_list);
275 spin_unlock(&d->page_alloc_lock);
276 }
278 void share_xen_page_with_privileged_guests(
279 struct page_info *page, int readonly)
280 {
281 share_xen_page_with_guest(page, dom_xen, readonly);
282 }
284 #if defined(CONFIG_X86_PAE)
286 #ifdef NDEBUG
287 /* Only PDPTs above 4GB boundary need to be shadowed in low memory. */
288 #define l3tab_needs_shadow(mfn) ((mfn) >= 0x100000)
289 #else
290 /*
291 * In debug builds we shadow a selection of <4GB PDPTs to exercise code paths.
292 * We cannot safely shadow the idle page table, nor shadow (v1) page tables
293 * (detected by lack of an owning domain). As required for correctness, we
294 * always shadow PDPTs above 4GB.
295 */
296 #define l3tab_needs_shadow(mfn) \
297 (((((mfn) << PAGE_SHIFT) != __pa(idle_pg_table)) && \
298 (page_get_owner(mfn_to_page(mfn)) != NULL) && \
299 ((mfn) & 1)) || /* odd MFNs are shadowed */ \
300 ((mfn) >= 0x100000))
301 #endif
303 static l1_pgentry_t *fix_pae_highmem_pl1e;
305 /* Cache the address of PAE high-memory fixmap page tables. */
306 static int __init cache_pae_fixmap_address(void)
307 {
308 unsigned long fixmap_base = fix_to_virt(FIX_PAE_HIGHMEM_0);
309 l2_pgentry_t *pl2e = virt_to_xen_l2e(fixmap_base);
310 fix_pae_highmem_pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(fixmap_base);
311 return 0;
312 }
313 __initcall(cache_pae_fixmap_address);
315 static DEFINE_PER_CPU(u32, make_cr3_timestamp);
317 void make_cr3(struct vcpu *v, unsigned long mfn)
318 /* Takes the MFN of a PAE l3 table, copies the contents to below 4GB if
319 * necessary, and sets v->arch.cr3 to the value to load in CR3. */
320 {
321 l3_pgentry_t *highmem_l3tab, *lowmem_l3tab;
322 struct pae_l3_cache *cache = &v->arch.pae_l3_cache;
323 unsigned int cpu = smp_processor_id();
325 /* Fast path: does this mfn need a shadow at all? */
326 if ( !l3tab_needs_shadow(mfn) )
327 {
328 v->arch.cr3 = mfn << PAGE_SHIFT;
329 /* Cache is no longer in use or valid */
330 cache->high_mfn = 0;
331 return;
332 }
334 /* Caching logic is not interrupt safe. */
335 ASSERT(!in_irq());
337 /* Protects against pae_flush_pgd(). */
338 spin_lock(&cache->lock);
340 cache->inuse_idx ^= 1;
341 cache->high_mfn = mfn;
343 /* Map the guest L3 table and copy to the chosen low-memory cache. */
344 *(fix_pae_highmem_pl1e - cpu) = l1e_from_pfn(mfn, __PAGE_HYPERVISOR);
345 /* First check the previous high mapping can't be in the TLB.
346 * (i.e. have we loaded CR3 since we last did this?) */
347 if ( unlikely(this_cpu(make_cr3_timestamp) == this_cpu(tlbflush_time)) )
348 local_flush_tlb_one(fix_to_virt(FIX_PAE_HIGHMEM_0 + cpu));
349 highmem_l3tab = (l3_pgentry_t *)fix_to_virt(FIX_PAE_HIGHMEM_0 + cpu);
350 lowmem_l3tab = cache->table[cache->inuse_idx];
351 memcpy(lowmem_l3tab, highmem_l3tab, sizeof(cache->table[0]));
352 *(fix_pae_highmem_pl1e - cpu) = l1e_empty();
353 this_cpu(make_cr3_timestamp) = this_cpu(tlbflush_time);
355 v->arch.cr3 = __pa(lowmem_l3tab);
357 spin_unlock(&cache->lock);
358 }
360 #else /* !CONFIG_X86_PAE */
362 void make_cr3(struct vcpu *v, unsigned long mfn)
363 {
364 v->arch.cr3 = mfn << PAGE_SHIFT;
365 }
367 #endif /* !CONFIG_X86_PAE */
369 void write_ptbase(struct vcpu *v)
370 {
371 write_cr3(v->arch.cr3);
372 }
374 void invalidate_shadow_ldt(struct vcpu *v)
375 {
376 int i;
377 unsigned long pfn;
378 struct page_info *page;
380 if ( v->arch.shadow_ldt_mapcnt == 0 )
381 return;
383 v->arch.shadow_ldt_mapcnt = 0;
385 for ( i = 16; i < 32; i++ )
386 {
387 pfn = l1e_get_pfn(v->arch.perdomain_ptes[i]);
388 if ( pfn == 0 ) continue;
389 v->arch.perdomain_ptes[i] = l1e_empty();
390 page = mfn_to_page(pfn);
391 ASSERT_PAGE_IS_TYPE(page, PGT_ldt_page);
392 ASSERT_PAGE_IS_DOMAIN(page, v->domain);
393 put_page_and_type(page);
394 }
396 /* Dispose of the (now possibly invalid) mappings from the TLB. */
397 ASSERT(v->processor == smp_processor_id());
398 this_cpu(percpu_mm_info).deferred_ops |= DOP_FLUSH_TLB | DOP_RELOAD_LDT;
399 }
402 static int alloc_segdesc_page(struct page_info *page)
403 {
404 struct desc_struct *descs;
405 int i;
407 descs = map_domain_page(page_to_mfn(page));
409 for ( i = 0; i < 512; i++ )
410 if ( unlikely(!check_descriptor(&descs[i])) )
411 goto fail;
413 unmap_domain_page(descs);
414 return 1;
416 fail:
417 unmap_domain_page(descs);
418 return 0;
419 }
422 /* Map shadow page at offset @off. */
423 int map_ldt_shadow_page(unsigned int off)
424 {
425 struct vcpu *v = current;
426 struct domain *d = v->domain;
427 unsigned long gmfn, mfn;
428 l1_pgentry_t l1e, nl1e;
429 unsigned long gva = v->arch.guest_context.ldt_base + (off << PAGE_SHIFT);
430 int res;
432 #if defined(__x86_64__)
433 /* If in user mode, switch to kernel mode just to read LDT mapping. */
434 int user_mode = !(v->arch.flags & TF_kernel_mode);
435 #define TOGGLE_MODE() if ( user_mode ) toggle_guest_mode(v)
436 #elif defined(__i386__)
437 #define TOGGLE_MODE() ((void)0)
438 #endif
440 BUG_ON(unlikely(in_irq()));
442 TOGGLE_MODE();
443 __copy_from_user(&l1e, &linear_pg_table[l1_linear_offset(gva)],
444 sizeof(l1e));
445 TOGGLE_MODE();
447 if ( unlikely(!(l1e_get_flags(l1e) & _PAGE_PRESENT)) )
448 return 0;
450 gmfn = l1e_get_pfn(l1e);
451 mfn = gmfn_to_mfn(d, gmfn);
452 if ( unlikely(!VALID_MFN(mfn)) )
453 return 0;
455 res = get_page_and_type(mfn_to_page(mfn), d, PGT_ldt_page);
457 if ( !res && unlikely(shadow_mode_refcounts(d)) )
458 {
459 shadow_lock(d);
460 shadow_remove_write_access(d->vcpu[0], _mfn(mfn), 0, 0);
461 res = get_page_and_type(mfn_to_page(mfn), d, PGT_ldt_page);
462 shadow_unlock(d);
463 }
465 if ( unlikely(!res) )
466 return 0;
468 nl1e = l1e_from_pfn(mfn, l1e_get_flags(l1e) | _PAGE_RW);
470 v->arch.perdomain_ptes[off + 16] = nl1e;
471 v->arch.shadow_ldt_mapcnt++;
473 return 1;
474 }
477 static int get_page_from_pagenr(unsigned long page_nr, struct domain *d)
478 {
479 struct page_info *page = mfn_to_page(page_nr);
481 if ( unlikely(!mfn_valid(page_nr)) || unlikely(!get_page(page, d)) )
482 {
483 MEM_LOG("Could not get page ref for pfn %lx", page_nr);
484 return 0;
485 }
487 return 1;
488 }
491 static int get_page_and_type_from_pagenr(unsigned long page_nr,
492 unsigned long type,
493 struct domain *d)
494 {
495 struct page_info *page = mfn_to_page(page_nr);
497 if ( unlikely(!get_page_from_pagenr(page_nr, d)) )
498 return 0;
500 if ( unlikely(!get_page_type(page, type)) )
501 {
502 put_page(page);
503 return 0;
504 }
506 return 1;
507 }
509 #ifndef CONFIG_X86_PAE /* We do not support guest linear mappings on PAE. */
510 /*
511 * We allow root tables to map each other (a.k.a. linear page tables). It
512 * needs some special care with reference counts and access permissions:
513 * 1. The mapping entry must be read-only, or the guest may get write access
514 * to its own PTEs.
515 * 2. We must only bump the reference counts for an *already validated*
516 * L2 table, or we can end up in a deadlock in get_page_type() by waiting
517 * on a validation that is required to complete that validation.
518 * 3. We only need to increment the reference counts for the mapped page
519 * frame if it is mapped by a different root table. This is sufficient and
520 * also necessary to allow validation of a root table mapping itself.
521 */
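/*
 * Editor's illustration (not part of changeset 11648), assuming the 32-bit
 * non-PAE layout where the L2 is the root table: a guest that wants a
 * "linear" view of its own page tables at virtual address V submits an
 * ordinary mmu_update whose new root entry points back at the root frame
 * itself and, per rule 1 above, has _PAGE_RW clear:
 *
 *     // l2_mfn is the machine frame holding the guest's own L2 table
 *     new_l2e = (l2_mfn << PAGE_SHIFT) | _PAGE_PRESENT;   // no _PAGE_RW
 *     // request ptr = machine address of l2_table[l2_table_offset(V)]
 *
 * Because the entry maps its own frame (pfn == re_pfn below), rule 3 means
 * no extra reference is taken for this self-map case.
 */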
522 static int
523 get_linear_pagetable(
524 root_pgentry_t re, unsigned long re_pfn, struct domain *d)
525 {
526 unsigned long x, y;
527 struct page_info *page;
528 unsigned long pfn;
530 ASSERT( !shadow_mode_refcounts(d) );
532 if ( (root_get_flags(re) & _PAGE_RW) )
533 {
534 MEM_LOG("Attempt to create linear p.t. with write perms");
535 return 0;
536 }
538 if ( (pfn = root_get_pfn(re)) != re_pfn )
539 {
540 /* Make sure the mapped frame belongs to the correct domain. */
541 if ( unlikely(!get_page_from_pagenr(pfn, d)) )
542 return 0;
544 /*
545 * Make sure that the mapped frame is an already-validated L2 table.
546 * If so, atomically increment the count (checking for overflow).
547 */
548 page = mfn_to_page(pfn);
549 y = page->u.inuse.type_info;
550 do {
551 x = y;
552 if ( unlikely((x & PGT_count_mask) == PGT_count_mask) ||
553 unlikely((x & (PGT_type_mask|PGT_validated)) !=
554 (PGT_root_page_table|PGT_validated)) )
555 {
556 put_page(page);
557 return 0;
558 }
559 }
560 while ( (y = cmpxchg(&page->u.inuse.type_info, x, x + 1)) != x );
561 }
563 return 1;
564 }
565 #endif /* !CONFIG_X86_PAE */
567 int
568 get_page_from_l1e(
569 l1_pgentry_t l1e, struct domain *d)
570 {
571 unsigned long mfn = l1e_get_pfn(l1e);
572 struct page_info *page = mfn_to_page(mfn);
573 int okay;
575 if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) )
576 return 1;
578 if ( unlikely(l1e_get_flags(l1e) & L1_DISALLOW_MASK) )
579 {
580 MEM_LOG("Bad L1 flags %x", l1e_get_flags(l1e) & L1_DISALLOW_MASK);
581 return 0;
582 }
584 if ( unlikely(!mfn_valid(mfn)) ||
585 unlikely(page_get_owner(page) == dom_io) )
586 {
587 /* DOMID_IO reverts to caller for privilege checks. */
588 if ( d == dom_io )
589 d = current->domain;
591 if ( !iomem_access_permitted(d, mfn, mfn) )
592 {
593 MEM_LOG("Non-privileged (%u) attempt to map I/O space %08lx",
594 d->domain_id, mfn);
595 return 0;
596 }
598 /* No reference counting for out-of-range I/O pages. */
599 if ( !mfn_valid(mfn) )
600 return 1;
602 d = dom_io;
603 }
605 /* Foreign mappings into guests in shadow external mode don't
606 * contribute to writeable mapping refcounts. (This allows the
607 * qemu-dm helper process in dom0 to map the domain's memory without
608 * messing up the count of "real" writable mappings.) */
609 okay = (((l1e_get_flags(l1e) & _PAGE_RW) &&
610 !(unlikely(shadow_mode_external(d) && (d != current->domain))))
611 ? get_page_and_type(page, d, PGT_writable_page)
612 : get_page(page, d));
613 if ( !okay )
614 {
615 MEM_LOG("Error getting mfn %lx (pfn %lx) from L1 entry %" PRIpte
616 " for dom%d",
617 mfn, get_gpfn_from_mfn(mfn),
618 l1e_get_intpte(l1e), d->domain_id);
619 }
621 return okay;
622 }
625 /* NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. */
626 static int
627 get_page_from_l2e(
628 l2_pgentry_t l2e, unsigned long pfn, struct domain *d)
629 {
630 int rc;
632 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
633 return 1;
635 if ( unlikely((l2e_get_flags(l2e) & L2_DISALLOW_MASK)) )
636 {
637 MEM_LOG("Bad L2 flags %x", l2e_get_flags(l2e) & L2_DISALLOW_MASK);
638 return 0;
639 }
641 rc = get_page_and_type_from_pagenr(l2e_get_pfn(l2e), PGT_l1_page_table, d);
642 #if CONFIG_PAGING_LEVELS == 2
643 if ( unlikely(!rc) )
644 rc = get_linear_pagetable(l2e, pfn, d);
645 #endif
646 return rc;
647 }
650 #if CONFIG_PAGING_LEVELS >= 3
651 static int
652 get_page_from_l3e(
653 l3_pgentry_t l3e, unsigned long pfn, struct domain *d)
654 {
655 int rc;
657 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
658 return 1;
660 if ( unlikely((l3e_get_flags(l3e) & L3_DISALLOW_MASK)) )
661 {
662 MEM_LOG("Bad L3 flags %x", l3e_get_flags(l3e) & L3_DISALLOW_MASK);
663 return 0;
664 }
666 rc = get_page_and_type_from_pagenr(l3e_get_pfn(l3e), PGT_l2_page_table, d);
667 return rc;
668 }
669 #endif /* 3 level */
671 #if CONFIG_PAGING_LEVELS >= 4
672 static int
673 get_page_from_l4e(
674 l4_pgentry_t l4e, unsigned long pfn, struct domain *d)
675 {
676 int rc;
678 if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
679 return 1;
681 if ( unlikely((l4e_get_flags(l4e) & L4_DISALLOW_MASK)) )
682 {
683 MEM_LOG("Bad L4 flags %x", l4e_get_flags(l4e) & L4_DISALLOW_MASK);
684 return 0;
685 }
687 rc = get_page_and_type_from_pagenr(l4e_get_pfn(l4e), PGT_l3_page_table, d);
689 if ( unlikely(!rc) )
690 rc = get_linear_pagetable(l4e, pfn, d);
692 return rc;
693 }
694 #endif /* 4 level */
696 #ifdef __x86_64__
698 #ifdef USER_MAPPINGS_ARE_GLOBAL
699 #define adjust_guest_l1e(pl1e) \
700 do { \
701 if ( likely(l1e_get_flags((pl1e)) & _PAGE_PRESENT) ) \
702 { \
703 /* _PAGE_GUEST_KERNEL page cannot have the Global bit set. */ \
704 if ( (l1e_get_flags((pl1e)) & (_PAGE_GUEST_KERNEL|_PAGE_GLOBAL)) \
705 == (_PAGE_GUEST_KERNEL|_PAGE_GLOBAL) ) \
706 MEM_LOG("Global bit is set to kernel page %lx", \
707 l1e_get_pfn((pl1e))); \
708 if ( !(l1e_get_flags((pl1e)) & _PAGE_USER) ) \
709 l1e_add_flags((pl1e), (_PAGE_GUEST_KERNEL|_PAGE_USER)); \
710 if ( !(l1e_get_flags((pl1e)) & _PAGE_GUEST_KERNEL) ) \
711 l1e_add_flags((pl1e), (_PAGE_GLOBAL|_PAGE_USER)); \
712 } \
713 } while ( 0 )
714 #else
715 #define adjust_guest_l1e(pl1e) \
716 do { \
717 if ( likely(l1e_get_flags((pl1e)) & _PAGE_PRESENT) ) \
718 l1e_add_flags((pl1e), _PAGE_USER); \
719 } while ( 0 )
720 #endif
722 #define adjust_guest_l2e(pl2e) \
723 do { \
724 if ( likely(l2e_get_flags((pl2e)) & _PAGE_PRESENT) ) \
725 l2e_add_flags((pl2e), _PAGE_USER); \
726 } while ( 0 )
728 #define adjust_guest_l3e(pl3e) \
729 do { \
730 if ( likely(l3e_get_flags((pl3e)) & _PAGE_PRESENT) ) \
731 l3e_add_flags((pl3e), _PAGE_USER); \
732 } while ( 0 )
734 #define adjust_guest_l4e(pl4e) \
735 do { \
736 if ( likely(l4e_get_flags((pl4e)) & _PAGE_PRESENT) ) \
737 l4e_add_flags((pl4e), _PAGE_USER); \
738 } while ( 0 )
740 #else /* !defined(__x86_64__) */
742 #define adjust_guest_l1e(_p) ((void)0)
743 #define adjust_guest_l2e(_p) ((void)0)
744 #define adjust_guest_l3e(_p) ((void)0)
746 #endif
748 void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d)
749 {
750 unsigned long pfn = l1e_get_pfn(l1e);
751 struct page_info *page = mfn_to_page(pfn);
752 struct domain *e;
753 struct vcpu *v;
755 if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) || !mfn_valid(pfn) )
756 return;
758 e = page_get_owner(page);
760 /*
761 * Check if this is a mapping that was established via a grant reference.
762 * If it was then we should not be here: we require that such mappings are
763 * explicitly destroyed via the grant-table interface.
764 *
765 * The upshot of this is that the guest can end up with active grants that
766 * it cannot destroy (because it no longer has a PTE to present to the
767 * grant-table interface). This can lead to subtle hard-to-catch bugs,
768 * hence a special grant PTE flag can be enabled to catch the bug early.
769 *
770 * (Note that the undestroyable active grants are not a security hole in
771 * Xen. All active grants can safely be cleaned up when the domain dies.)
772 */
773 if ( (l1e_get_flags(l1e) & _PAGE_GNTTAB) &&
774 !(d->domain_flags & (DOMF_shutdown|DOMF_dying)) )
775 {
776 MEM_LOG("Attempt to implicitly unmap a granted PTE %" PRIpte,
777 l1e_get_intpte(l1e));
778 domain_crash(d);
779 }
781 /* Remember we didn't take a type-count of foreign writable mappings
782 * to shadow external domains */
783 if ( (l1e_get_flags(l1e) & _PAGE_RW) &&
784 !(unlikely((e != d) && shadow_mode_external(e))) )
785 {
786 put_page_and_type(page);
787 }
788 else
789 {
790 /* We expect this is rare so we blow the entire shadow LDT. */
791 if ( unlikely(((page->u.inuse.type_info & PGT_type_mask) ==
792 PGT_ldt_page)) &&
793 unlikely(((page->u.inuse.type_info & PGT_count_mask) != 0)) &&
794 (d == e) )
795 {
796 for_each_vcpu ( d, v )
797 invalidate_shadow_ldt(v);
798 }
799 put_page(page);
800 }
801 }
804 /*
805 * NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'.
806 * Note also that this automatically deals correctly with linear p.t.'s.
807 */
808 static void put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
809 {
810 if ( (l2e_get_flags(l2e) & _PAGE_PRESENT) &&
811 (l2e_get_pfn(l2e) != pfn) )
812 put_page_and_type(mfn_to_page(l2e_get_pfn(l2e)));
813 }
816 #if CONFIG_PAGING_LEVELS >= 3
817 static void put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn)
818 {
819 if ( (l3e_get_flags(l3e) & _PAGE_PRESENT) &&
820 (l3e_get_pfn(l3e) != pfn) )
821 put_page_and_type(mfn_to_page(l3e_get_pfn(l3e)));
822 }
823 #endif
825 #if CONFIG_PAGING_LEVELS >= 4
826 static void put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn)
827 {
828 if ( (l4e_get_flags(l4e) & _PAGE_PRESENT) &&
829 (l4e_get_pfn(l4e) != pfn) )
830 put_page_and_type(mfn_to_page(l4e_get_pfn(l4e)));
831 }
832 #endif
834 static int alloc_l1_table(struct page_info *page)
835 {
836 struct domain *d = page_get_owner(page);
837 unsigned long pfn = page_to_mfn(page);
838 l1_pgentry_t *pl1e;
839 int i;
841 ASSERT(!shadow_mode_refcounts(d));
843 pl1e = map_domain_page(pfn);
845 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
846 {
847 if ( is_guest_l1_slot(i) &&
848 unlikely(!get_page_from_l1e(pl1e[i], d)) )
849 goto fail;
851 adjust_guest_l1e(pl1e[i]);
852 }
854 unmap_domain_page(pl1e);
855 return 1;
857 fail:
858 MEM_LOG("Failure in alloc_l1_table: entry %d", i);
859 while ( i-- > 0 )
860 if ( is_guest_l1_slot(i) )
861 put_page_from_l1e(pl1e[i], d);
863 unmap_domain_page(pl1e);
864 return 0;
865 }
867 #ifdef CONFIG_X86_PAE
868 static int create_pae_xen_mappings(l3_pgentry_t *pl3e)
869 {
870 struct page_info *page;
871 l2_pgentry_t *pl2e;
872 l3_pgentry_t l3e3;
873 int i;
875 pl3e = (l3_pgentry_t *)((unsigned long)pl3e & PAGE_MASK);
877 /* 3rd L3 slot contains L2 with Xen-private mappings. It *must* exist. */
878 l3e3 = pl3e[3];
879 if ( !(l3e_get_flags(l3e3) & _PAGE_PRESENT) )
880 {
881 MEM_LOG("PAE L3 3rd slot is empty");
882 return 0;
883 }
885 /*
886 * The Xen-private mappings include linear mappings. The L2 thus cannot
887 * be shared by multiple L3 tables. The test here is adequate because:
888 * 1. Cannot appear in slots != 3 because get_page_type() checks the
889 * PGT_pae_xen_l2 flag, which is asserted iff the L2 appears in slot 3
890 * 2. Cannot appear in another page table's L3:
891 * a. alloc_l3_table() calls this function and this check will fail
892 * b. mod_l3_entry() disallows updates to slot 3 in an existing table
893 *
894 * XXX -- this needs revisiting for shadow_mode_refcount()==true...
895 */
896 page = l3e_get_page(l3e3);
897 BUG_ON(page->u.inuse.type_info & PGT_pinned);
898 BUG_ON((page->u.inuse.type_info & PGT_count_mask) == 0);
899 BUG_ON(!(page->u.inuse.type_info & PGT_pae_xen_l2));
900 if ( (page->u.inuse.type_info & PGT_count_mask) != 1 )
901 {
902 MEM_LOG("PAE L3 3rd slot is shared");
903 return 0;
904 }
906 /* Xen private mappings. */
907 pl2e = map_domain_page(l3e_get_pfn(l3e3));
908 memcpy(&pl2e[L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1)],
909 &idle_pg_table_l2[L2_PAGETABLE_FIRST_XEN_SLOT],
910 L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
911 for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
912 pl2e[l2_table_offset(PERDOMAIN_VIRT_START) + i] =
913 l2e_from_page(
914 virt_to_page(page_get_owner(page)->arch.mm_perdomain_pt) + i,
915 __PAGE_HYPERVISOR);
916 for ( i = 0; i < (LINEARPT_MBYTES >> (L2_PAGETABLE_SHIFT - 20)); i++ )
917 pl2e[l2_table_offset(LINEAR_PT_VIRT_START) + i] =
918 (l3e_get_flags(pl3e[i]) & _PAGE_PRESENT) ?
919 l2e_from_pfn(l3e_get_pfn(pl3e[i]), __PAGE_HYPERVISOR) :
920 l2e_empty();
921 unmap_domain_page(pl2e);
923 return 1;
924 }
926 /* Flush a pgdir update into low-memory caches. */
927 static void pae_flush_pgd(
928 unsigned long mfn, unsigned int idx, l3_pgentry_t nl3e)
929 {
930 struct domain *d = page_get_owner(mfn_to_page(mfn));
931 struct vcpu *v;
932 intpte_t _ol3e, _nl3e, _pl3e;
933 l3_pgentry_t *l3tab_ptr;
934 struct pae_l3_cache *cache;
936 /* If below 4GB then the pgdir is not shadowed in low memory. */
937 if ( !l3tab_needs_shadow(mfn) )
938 return;
940 for_each_vcpu ( d, v )
941 {
942 cache = &v->arch.pae_l3_cache;
944 spin_lock(&cache->lock);
946 if ( cache->high_mfn == mfn )
947 {
948 l3tab_ptr = &cache->table[cache->inuse_idx][idx];
949 _ol3e = l3e_get_intpte(*l3tab_ptr);
950 _nl3e = l3e_get_intpte(nl3e);
951 _pl3e = cmpxchg((intpte_t *)l3tab_ptr, _ol3e, _nl3e);
952 BUG_ON(_pl3e != _ol3e);
953 }
955 spin_unlock(&cache->lock);
956 }
958 flush_tlb_mask(d->domain_dirty_cpumask);
959 }
961 #elif CONFIG_X86_64
962 # define create_pae_xen_mappings(pl3e) (1)
963 # define pae_flush_pgd(mfn, idx, nl3e) ((void)0)
964 #else
965 # define create_pae_xen_mappings(pl3e) (1)
966 #endif
968 static int alloc_l2_table(struct page_info *page, unsigned long type)
969 {
970 struct domain *d = page_get_owner(page);
971 unsigned long pfn = page_to_mfn(page);
972 l2_pgentry_t *pl2e;
973 int i;
975 ASSERT(!shadow_mode_refcounts(d));
977 pl2e = map_domain_page(pfn);
979 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
980 {
981 if ( is_guest_l2_slot(type, i) &&
982 unlikely(!get_page_from_l2e(pl2e[i], pfn, d)) )
983 goto fail;
985 adjust_guest_l2e(pl2e[i]);
986 }
988 #if CONFIG_PAGING_LEVELS == 2
989 /* Xen private mappings. */
990 memcpy(&pl2e[L2_PAGETABLE_FIRST_XEN_SLOT],
991 &idle_pg_table[L2_PAGETABLE_FIRST_XEN_SLOT],
992 L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
993 pl2e[l2_table_offset(LINEAR_PT_VIRT_START)] =
994 l2e_from_pfn(pfn, __PAGE_HYPERVISOR);
995 for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
996 pl2e[l2_table_offset(PERDOMAIN_VIRT_START) + i] =
997 l2e_from_page(
998 virt_to_page(page_get_owner(page)->arch.mm_perdomain_pt) + i,
999 __PAGE_HYPERVISOR);
1000 #endif
1002 unmap_domain_page(pl2e);
1003 return 1;
1005 fail:
1006 MEM_LOG("Failure in alloc_l2_table: entry %d", i);
1007 while ( i-- > 0 )
1008 if ( is_guest_l2_slot(type, i) )
1009 put_page_from_l2e(pl2e[i], pfn);
1011 unmap_domain_page(pl2e);
1012 return 0;
1013 }
1016 #if CONFIG_PAGING_LEVELS >= 3
1017 static int alloc_l3_table(struct page_info *page)
1018 {
1019 struct domain *d = page_get_owner(page);
1020 unsigned long pfn = page_to_mfn(page);
1021 l3_pgentry_t *pl3e;
1022 int i;
1024 ASSERT(!shadow_mode_refcounts(d));
1026 #ifdef CONFIG_X86_PAE
1027 /*
1028 * PAE pgdirs above 4GB are unacceptable if the guest does not understand
1029 * the weird 'extended cr3' format for dealing with high-order address
1030 * bits. We cut some slack for control tools (before vcpu0 is initialised).
1031 */
1032 if ( (pfn >= 0x100000) &&
1033 unlikely(!VM_ASSIST(d, VMASST_TYPE_pae_extended_cr3)) &&
1034 d->vcpu[0] && test_bit(_VCPUF_initialised, &d->vcpu[0]->vcpu_flags) )
1035 {
1036 MEM_LOG("PAE pgd must be below 4GB (0x%lx >= 0x100000)", pfn);
1037 return 0;
1038 }
1039 #endif
1041 pl3e = map_domain_page(pfn);
1042 for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
1043 {
1044 #ifdef CONFIG_X86_PAE
1045 if ( i == 3 )
1046 {
1047 if ( !(l3e_get_flags(pl3e[i]) & _PAGE_PRESENT) ||
1048 (l3e_get_flags(pl3e[i]) & L3_DISALLOW_MASK) ||
1049 !get_page_and_type_from_pagenr(l3e_get_pfn(pl3e[i]),
1050 PGT_l2_page_table |
1051 PGT_pae_xen_l2,
1052 d) )
1053 goto fail;
1054 }
1055 else
1056 #endif
1057 if ( is_guest_l3_slot(i) &&
1058 unlikely(!get_page_from_l3e(pl3e[i], pfn, d)) )
1059 goto fail;
1061 adjust_guest_l3e(pl3e[i]);
1062 }
1064 if ( !create_pae_xen_mappings(pl3e) )
1065 goto fail;
1067 unmap_domain_page(pl3e);
1068 return 1;
1070 fail:
1071 MEM_LOG("Failure in alloc_l3_table: entry %d", i);
1072 while ( i-- > 0 )
1073 if ( is_guest_l3_slot(i) )
1074 put_page_from_l3e(pl3e[i], pfn);
1076 unmap_domain_page(pl3e);
1077 return 0;
1078 }
1079 #else
1080 #define alloc_l3_table(page) (0)
1081 #endif
1083 #if CONFIG_PAGING_LEVELS >= 4
1084 static int alloc_l4_table(struct page_info *page)
1085 {
1086 struct domain *d = page_get_owner(page);
1087 unsigned long pfn = page_to_mfn(page);
1088 l4_pgentry_t *pl4e = page_to_virt(page);
1089 int i;
1091 ASSERT(!shadow_mode_refcounts(d));
1093 for ( i = 0; i < L4_PAGETABLE_ENTRIES; i++ )
1094 {
1095 if ( is_guest_l4_slot(i) &&
1096 unlikely(!get_page_from_l4e(pl4e[i], pfn, d)) )
1097 goto fail;
1099 adjust_guest_l4e(pl4e[i]);
1100 }
1102 /* Xen private mappings. */
1103 memcpy(&pl4e[ROOT_PAGETABLE_FIRST_XEN_SLOT],
1104 &idle_pg_table[ROOT_PAGETABLE_FIRST_XEN_SLOT],
1105 ROOT_PAGETABLE_XEN_SLOTS * sizeof(l4_pgentry_t));
1106 pl4e[l4_table_offset(LINEAR_PT_VIRT_START)] =
1107 l4e_from_pfn(pfn, __PAGE_HYPERVISOR);
1108 pl4e[l4_table_offset(PERDOMAIN_VIRT_START)] =
1109 l4e_from_page(
1110 virt_to_page(page_get_owner(page)->arch.mm_perdomain_l3),
1111 __PAGE_HYPERVISOR);
1113 return 1;
1115 fail:
1116 MEM_LOG("Failure in alloc_l4_table: entry %d", i);
1117 while ( i-- > 0 )
1118 if ( is_guest_l4_slot(i) )
1119 put_page_from_l4e(pl4e[i], pfn);
1121 return 0;
1122 }
1123 #else
1124 #define alloc_l4_table(page) (0)
1125 #endif
1128 static void free_l1_table(struct page_info *page)
1129 {
1130 struct domain *d = page_get_owner(page);
1131 unsigned long pfn = page_to_mfn(page);
1132 l1_pgentry_t *pl1e;
1133 int i;
1135 pl1e = map_domain_page(pfn);
1137 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
1138 if ( is_guest_l1_slot(i) )
1139 put_page_from_l1e(pl1e[i], d);
1141 unmap_domain_page(pl1e);
1142 }
1145 static void free_l2_table(struct page_info *page)
1146 {
1147 unsigned long pfn = page_to_mfn(page);
1148 l2_pgentry_t *pl2e;
1149 int i;
1151 pl2e = map_domain_page(pfn);
1153 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
1154 if ( is_guest_l2_slot(page->u.inuse.type_info, i) )
1155 put_page_from_l2e(pl2e[i], pfn);
1157 unmap_domain_page(pl2e);
1159 page->u.inuse.type_info &= ~PGT_pae_xen_l2;
1160 }
1163 #if CONFIG_PAGING_LEVELS >= 3
1165 static void free_l3_table(struct page_info *page)
1166 {
1167 unsigned long pfn = page_to_mfn(page);
1168 l3_pgentry_t *pl3e;
1169 int i;
1171 pl3e = map_domain_page(pfn);
1173 for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
1174 if ( is_guest_l3_slot(i) )
1175 put_page_from_l3e(pl3e[i], pfn);
1177 unmap_domain_page(pl3e);
1178 }
1180 #endif
1182 #if CONFIG_PAGING_LEVELS >= 4
1184 static void free_l4_table(struct page_info *page)
1185 {
1186 unsigned long pfn = page_to_mfn(page);
1187 l4_pgentry_t *pl4e = page_to_virt(page);
1188 int i;
1190 for ( i = 0; i < L4_PAGETABLE_ENTRIES; i++ )
1191 if ( is_guest_l4_slot(i) )
1192 put_page_from_l4e(pl4e[i], pfn);
1193 }
1195 #endif
1197 static inline int update_l1e(l1_pgentry_t *pl1e,
1198 l1_pgentry_t ol1e,
1199 l1_pgentry_t nl1e,
1200 unsigned long gl1mfn,
1201 struct vcpu *v)
1203 int rv = 1;
1204 if ( unlikely(shadow_mode_enabled(v->domain)) )
1205 shadow_lock(v->domain);
1206 #ifndef PTE_UPDATE_WITH_CMPXCHG
1207 rv = (!__copy_to_user(pl1e, &nl1e, sizeof(nl1e)));
1208 #else
1210 intpte_t o = l1e_get_intpte(ol1e);
1211 intpte_t n = l1e_get_intpte(nl1e);
1213 for ( ; ; )
1215 if ( unlikely(cmpxchg_user(pl1e, o, n) != 0) )
1217 MEM_LOG("Failed to update %" PRIpte " -> %" PRIpte
1218 ": saw %" PRIpte,
1219 l1e_get_intpte(ol1e),
1220 l1e_get_intpte(nl1e),
1221 o);
1222 rv = 0;
1223 break;
1226 if ( o == l1e_get_intpte(ol1e) )
1227 break;
1229 /* Allowed to change in Accessed/Dirty flags only. */
1230 BUG_ON((o ^ l1e_get_intpte(ol1e)) &
1231 ~(int)(_PAGE_ACCESSED|_PAGE_DIRTY));
1232 ol1e = l1e_from_intpte(o);
1235 #endif
1236 if ( unlikely(shadow_mode_enabled(v->domain)) )
1238 shadow_validate_guest_entry(v, _mfn(gl1mfn), pl1e);
1239 shadow_unlock(v->domain);
1241 return rv;
1245 /* Update the L1 entry at pl1e to new value nl1e. */
1246 static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e,
1247 unsigned long gl1mfn)
1249 l1_pgentry_t ol1e;
1250 struct domain *d = current->domain;
1252 if ( unlikely(__copy_from_user(&ol1e, pl1e, sizeof(ol1e)) != 0) )
1253 return 0;
1255 if ( l1e_get_flags(nl1e) & _PAGE_PRESENT )
1257 if ( unlikely(l1e_get_flags(nl1e) & L1_DISALLOW_MASK) )
1259 MEM_LOG("Bad L1 flags %x",
1260 l1e_get_flags(nl1e) & L1_DISALLOW_MASK);
1261 return 0;
1264 adjust_guest_l1e(nl1e);
1266 /* Fast path for identical mapping, r/w and presence. */
1267 if ( !l1e_has_changed(ol1e, nl1e, _PAGE_RW | _PAGE_PRESENT) )
1268 return update_l1e(pl1e, ol1e, nl1e, gl1mfn, current);
1270 if ( unlikely(!get_page_from_l1e(nl1e, FOREIGNDOM)) )
1271 return 0;
1273 if ( unlikely(!update_l1e(pl1e, ol1e, nl1e, gl1mfn, current)) )
1275 put_page_from_l1e(nl1e, d);
1276 return 0;
1279 else
1281 if ( unlikely(!update_l1e(pl1e, ol1e, nl1e, gl1mfn, current)) )
1282 return 0;
1285 put_page_from_l1e(ol1e, d);
1286 return 1;
1289 #ifndef PTE_UPDATE_WITH_CMPXCHG
1290 #define _UPDATE_ENTRY(_t,_p,_o,_n) ({ (*(_p) = (_n)); 1; })
1291 #else
1292 #define _UPDATE_ENTRY(_t,_p,_o,_n) ({ \
1293 for ( ; ; ) \
1294 { \
1295 intpte_t __o = cmpxchg((intpte_t *)(_p), \
1296 _t ## e_get_intpte(_o), \
1297 _t ## e_get_intpte(_n)); \
1298 if ( __o == _t ## e_get_intpte(_o) ) \
1299 break; \
1300 /* Allowed to change in Accessed/Dirty flags only. */ \
1301 BUG_ON((__o ^ _t ## e_get_intpte(_o)) & \
1302 ~(int)(_PAGE_ACCESSED|_PAGE_DIRTY)); \
1303 _o = _t ## e_from_intpte(__o); \
1304 } \
1305 1; })
1306 #endif
1307 #define UPDATE_ENTRY(_t,_p,_o,_n,_m) ({ \
1308 int rv; \
1309 if ( unlikely(shadow_mode_enabled(current->domain)) ) \
1310 shadow_lock(current->domain); \
1311 rv = _UPDATE_ENTRY(_t, _p, _o, _n); \
1312 if ( unlikely(shadow_mode_enabled(current->domain)) ) \
1313 { \
1314 shadow_validate_guest_entry(current, _mfn(_m), (_p)); \
1315 shadow_unlock(current->domain); \
1316 } \
1317 rv; \
1318 })
1320 /* Update the L2 entry at pl2e to new value nl2e. pl2e is within frame pfn. */
1321 static int mod_l2_entry(l2_pgentry_t *pl2e,
1322 l2_pgentry_t nl2e,
1323 unsigned long pfn,
1324 unsigned long type)
1326 l2_pgentry_t ol2e;
1328 if ( unlikely(!is_guest_l2_slot(type,pgentry_ptr_to_slot(pl2e))) )
1330 MEM_LOG("Illegal L2 update attempt in Xen-private area %p", pl2e);
1331 return 0;
1334 if ( unlikely(__copy_from_user(&ol2e, pl2e, sizeof(ol2e)) != 0) )
1335 return 0;
1337 if ( l2e_get_flags(nl2e) & _PAGE_PRESENT )
1339 if ( unlikely(l2e_get_flags(nl2e) & L2_DISALLOW_MASK) )
1341 MEM_LOG("Bad L2 flags %x",
1342 l2e_get_flags(nl2e) & L2_DISALLOW_MASK);
1343 return 0;
1346 adjust_guest_l2e(nl2e);
1348 /* Fast path for identical mapping and presence. */
1349 if ( !l2e_has_changed(ol2e, nl2e, _PAGE_PRESENT))
1350 return UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn);
1352 if ( unlikely(!get_page_from_l2e(nl2e, pfn, current->domain)) )
1353 return 0;
1355 if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn)) )
1357 put_page_from_l2e(nl2e, pfn);
1358 return 0;
1361 else if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn)) )
1363 return 0;
1366 put_page_from_l2e(ol2e, pfn);
1367 return 1;
1370 #if CONFIG_PAGING_LEVELS >= 3
1372 /* Update the L3 entry at pl3e to new value nl3e. pl3e is within frame pfn. */
1373 static int mod_l3_entry(l3_pgentry_t *pl3e,
1374 l3_pgentry_t nl3e,
1375 unsigned long pfn)
1377 l3_pgentry_t ol3e;
1378 int okay;
1380 if ( unlikely(!is_guest_l3_slot(pgentry_ptr_to_slot(pl3e))) )
1382 MEM_LOG("Illegal L3 update attempt in Xen-private area %p", pl3e);
1383 return 0;
1386 #ifdef CONFIG_X86_PAE
1387 /*
1388 * Disallow updates to final L3 slot. It contains Xen mappings, and it
1389 * would be a pain to ensure they remain continuously valid throughout.
1390 */
1391 if ( pgentry_ptr_to_slot(pl3e) >= 3 )
1392 return 0;
1393 #endif
1395 if ( unlikely(__copy_from_user(&ol3e, pl3e, sizeof(ol3e)) != 0) )
1396 return 0;
1398 if ( l3e_get_flags(nl3e) & _PAGE_PRESENT )
1400 if ( unlikely(l3e_get_flags(nl3e) & L3_DISALLOW_MASK) )
1402 MEM_LOG("Bad L3 flags %x",
1403 l3e_get_flags(nl3e) & L3_DISALLOW_MASK);
1404 return 0;
1407 adjust_guest_l3e(nl3e);
1409 /* Fast path for identical mapping and presence. */
1410 if (!l3e_has_changed(ol3e, nl3e, _PAGE_PRESENT))
1411 return UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn);
1413 if ( unlikely(!get_page_from_l3e(nl3e, pfn, current->domain)) )
1414 return 0;
1416 if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn)) )
1418 put_page_from_l3e(nl3e, pfn);
1419 return 0;
1422 else if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn)) )
1424 return 0;
1427 okay = create_pae_xen_mappings(pl3e);
1428 BUG_ON(!okay);
1430 pae_flush_pgd(pfn, pgentry_ptr_to_slot(pl3e), nl3e);
1432 put_page_from_l3e(ol3e, pfn);
1433 return 1;
1436 #endif
1438 #if CONFIG_PAGING_LEVELS >= 4
1440 /* Update the L4 entry at pl4e to new value nl4e. pl4e is within frame pfn. */
1441 static int mod_l4_entry(l4_pgentry_t *pl4e,
1442 l4_pgentry_t nl4e,
1443 unsigned long pfn)
1445 l4_pgentry_t ol4e;
1447 if ( unlikely(!is_guest_l4_slot(pgentry_ptr_to_slot(pl4e))) )
1449 MEM_LOG("Illegal L4 update attempt in Xen-private area %p", pl4e);
1450 return 0;
1453 if ( unlikely(__copy_from_user(&ol4e, pl4e, sizeof(ol4e)) != 0) )
1454 return 0;
1456 if ( l4e_get_flags(nl4e) & _PAGE_PRESENT )
1458 if ( unlikely(l4e_get_flags(nl4e) & L4_DISALLOW_MASK) )
1460 MEM_LOG("Bad L4 flags %x",
1461 l4e_get_flags(nl4e) & L4_DISALLOW_MASK);
1462 return 0;
1465 adjust_guest_l4e(nl4e);
1467 /* Fast path for identical mapping and presence. */
1468 if (!l4e_has_changed(ol4e, nl4e, _PAGE_PRESENT))
1469 return UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn);
1471 if ( unlikely(!get_page_from_l4e(nl4e, pfn, current->domain)) )
1472 return 0;
1474 if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn)) )
1476 put_page_from_l4e(nl4e, pfn);
1477 return 0;
1480 else if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn)) )
1482 return 0;
1485 put_page_from_l4e(ol4e, pfn);
1486 return 1;
1489 #endif
1491 int alloc_page_type(struct page_info *page, unsigned long type)
1493 struct domain *owner = page_get_owner(page);
1495 /* A page table is dirtied when its type count becomes non-zero. */
1496 if ( likely(owner != NULL) )
1497 mark_dirty(owner, page_to_mfn(page));
1499 switch ( type & PGT_type_mask )
1501 case PGT_l1_page_table:
1502 return alloc_l1_table(page);
1503 case PGT_l2_page_table:
1504 return alloc_l2_table(page, type);
1505 case PGT_l3_page_table:
1506 return alloc_l3_table(page);
1507 case PGT_l4_page_table:
1508 return alloc_l4_table(page);
1509 case PGT_gdt_page:
1510 case PGT_ldt_page:
1511 return alloc_segdesc_page(page);
1512 default:
1513 printk("Bad type in alloc_page_type %lx t=%" PRtype_info " c=%x\n",
1514 type, page->u.inuse.type_info,
1515 page->count_info);
1516 BUG();
1519 return 0;
1523 void free_page_type(struct page_info *page, unsigned long type)
1525 struct domain *owner = page_get_owner(page);
1526 unsigned long gmfn;
1528 if ( likely(owner != NULL) )
1530 /*
1531 * We have to flush before the next use of the linear mapping
1532 * (e.g., update_va_mapping()) or we could end up modifying a page
1533 * that is no longer a page table (and hence screw up ref counts).
1534 */
1535 this_cpu(percpu_mm_info).deferred_ops |= DOP_FLUSH_ALL_TLBS;
1537 if ( unlikely(shadow_mode_enabled(owner)) )
1539 /* A page table is dirtied when its type count becomes zero. */
1540 mark_dirty(owner, page_to_mfn(page));
1542 if ( shadow_mode_refcounts(owner) )
1543 return;
1545 gmfn = mfn_to_gmfn(owner, page_to_mfn(page));
1546 ASSERT(VALID_M2P(gmfn));
1547 shadow_remove_all_shadows(owner->vcpu[0], _mfn(gmfn));
1551 switch ( type & PGT_type_mask )
1553 case PGT_l1_page_table:
1554 free_l1_table(page);
1555 break;
1557 case PGT_l2_page_table:
1558 free_l2_table(page);
1559 break;
1561 #if CONFIG_PAGING_LEVELS >= 3
1562 case PGT_l3_page_table:
1563 free_l3_table(page);
1564 break;
1565 #endif
1567 #if CONFIG_PAGING_LEVELS >= 4
1568 case PGT_l4_page_table:
1569 free_l4_table(page);
1570 break;
1571 #endif
1573 default:
1574 printk("%s: type %lx pfn %lx\n",__FUNCTION__,
1575 type, page_to_mfn(page));
1576 BUG();
1581 void put_page_type(struct page_info *page)
1583 unsigned long nx, x, y = page->u.inuse.type_info;
1585 again:
1586 do {
1587 x = y;
1588 nx = x - 1;
1590 ASSERT((x & PGT_count_mask) != 0);
1592 if ( unlikely((nx & PGT_count_mask) == 0) )
1594 if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
1595 likely(nx & PGT_validated) )
1597 /*
1598 * Page-table pages must be unvalidated when count is zero. The
1599 * 'free' is safe because the refcnt is non-zero and validated
1600 * bit is clear => other ops will spin or fail.
1601 */
1602 if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x,
1603 x & ~PGT_validated)) != x) )
1604 goto again;
1605 /* We cleared the 'valid bit' so we do the clean up. */
1606 free_page_type(page, x);
1607 /* Carry on, but with the 'valid bit' now clear. */
1608 x &= ~PGT_validated;
1609 nx &= ~PGT_validated;
1612 /*
1613 * Record TLB information for flush later. We do not stamp page
1614 * tables when running in shadow mode:
1615 * 1. Pointless, since it's the shadow pt's which must be tracked.
1616 * 2. Shadow mode reuses this field for shadowed page tables to
1617 * store flags info -- we don't want to conflict with that.
1618 */
1619 if ( !(shadow_mode_enabled(page_get_owner(page)) &&
1620 (page->count_info & PGC_page_table)) )
1621 page->tlbflush_timestamp = tlbflush_current_time();
1624 while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
1628 int get_page_type(struct page_info *page, unsigned long type)
1630 unsigned long nx, x, y = page->u.inuse.type_info;
1632 ASSERT(!(type & ~(PGT_type_mask | PGT_pae_xen_l2)));
1634 again:
1635 do {
1636 x = y;
1637 nx = x + 1;
1638 if ( unlikely((nx & PGT_count_mask) == 0) )
1640 MEM_LOG("Type count overflow on pfn %lx", page_to_mfn(page));
1641 return 0;
1643 else if ( unlikely((x & PGT_count_mask) == 0) )
1645 struct domain *d = page_get_owner(page);
1647 /* Never allow a shadowed frame to go from type count 0 to 1 */
1648 if ( d && shadow_mode_enabled(d) )
1649 shadow_remove_all_shadows(d->vcpu[0], _mfn(page_to_mfn(page)));
1651 ASSERT(!(x & PGT_pae_xen_l2));
1652 if ( (x & PGT_type_mask) != type )
1654 /*
1655 * On type change we check to flush stale TLB entries. This
1656 * may be unnecessary (e.g., page was GDT/LDT) but those
1657 * circumstances should be very rare.
1658 */
1659 cpumask_t mask = d->domain_dirty_cpumask;
1661 /* Don't flush if the timestamp is old enough */
1662 tlbflush_filter(mask, page->tlbflush_timestamp);
1664 if ( unlikely(!cpus_empty(mask)) &&
1665 /* Shadow mode: track only writable pages. */
1666 (!shadow_mode_enabled(page_get_owner(page)) ||
1667 ((nx & PGT_type_mask) == PGT_writable_page)) )
1669 perfc_incrc(need_flush_tlb_flush);
1670 flush_tlb_mask(mask);
1673 /* We lose existing type, back pointer, and validity. */
1674 nx &= ~(PGT_type_mask | PGT_validated);
1675 nx |= type;
1677 /* No special validation needed for writable pages. */
1678 /* Page tables and GDT/LDT need to be scanned for validity. */
1679 if ( type == PGT_writable_page )
1680 nx |= PGT_validated;
1683 else if ( unlikely((x & (PGT_type_mask|PGT_pae_xen_l2)) != type) )
1685 if ( ((x & PGT_type_mask) != PGT_l2_page_table) ||
1686 (type != PGT_l1_page_table) )
1687 MEM_LOG("Bad type (saw %" PRtype_info
1688 " != exp %" PRtype_info ") "
1689 "for mfn %lx (pfn %lx)",
1690 x, type, page_to_mfn(page),
1691 get_gpfn_from_mfn(page_to_mfn(page)));
1692 return 0;
1694 else if ( unlikely(!(x & PGT_validated)) )
1696 /* Someone else is updating validation of this page. Wait... */
1697 while ( (y = page->u.inuse.type_info) == x )
1698 cpu_relax();
1699 goto again;
1702 while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
1704 if ( unlikely(!(nx & PGT_validated)) )
1706 /* Try to validate page type; drop the new reference on failure. */
1707 if ( unlikely(!alloc_page_type(page, type)) )
1709 MEM_LOG("Error while validating mfn %lx (pfn %lx) for type %"
1710 PRtype_info ": caf=%08x taf=%" PRtype_info,
1711 page_to_mfn(page), get_gpfn_from_mfn(page_to_mfn(page)),
1712 type, page->count_info, page->u.inuse.type_info);
1713 /* No one else can get a reference. We hold the only ref. */
1714 page->u.inuse.type_info = 0;
1715 return 0;
1718 /* No one else is updating simultaneously. */
1719 __set_bit(_PGT_validated, &page->u.inuse.type_info);
1722 return 1;
1726 int new_guest_cr3(unsigned long mfn)
1728 struct vcpu *v = current;
1729 struct domain *d = v->domain;
1730 int okay;
1731 unsigned long old_base_mfn;
1733 if ( hvm_guest(v) && !hvm_paging_enabled(v) )
1734 domain_crash_synchronous();
1736 if ( shadow_mode_refcounts(d) )
1738 okay = get_page_from_pagenr(mfn, d);
1739 if ( unlikely(!okay) )
1741 MEM_LOG("Error while installing new baseptr %lx", mfn);
1742 return 0;
1745 else
1747 okay = get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d);
1748 if ( unlikely(!okay) )
1750 /* Switch to idle pagetable: this VCPU has no active p.t. now. */
1751 MEM_LOG("New baseptr %lx: slow path via idle pagetables", mfn);
1752 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
1753 v->arch.guest_table = pagetable_null();
1754 update_cr3(v);
1755 write_cr3(__pa(idle_pg_table));
1756 if ( old_base_mfn != 0 )
1757 put_page_and_type(mfn_to_page(old_base_mfn));
1759 /* Retry the validation with no active p.t. for this VCPU. */
1760 okay = get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d);
1761 if ( !okay )
1763 /* Failure here is unrecoverable: the VCPU has no pagetable! */
1764 MEM_LOG("Fatal error while installing new baseptr %lx", mfn);
1765 domain_crash(d);
1766 ASSERT(v->processor == smp_processor_id());
1767 this_cpu(percpu_mm_info).deferred_ops = 0;
1768 return 0;
1773 invalidate_shadow_ldt(v);
1775 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
1777 v->arch.guest_table = pagetable_from_pfn(mfn);
1778 update_cr3(v); /* update shadow_table and cr3 fields of vcpu struct */
1780 write_ptbase(v);
1782 if ( likely(old_base_mfn != 0) )
1784 if ( shadow_mode_refcounts(d) )
1785 put_page(mfn_to_page(old_base_mfn));
1786 else
1787 put_page_and_type(mfn_to_page(old_base_mfn));
1790 return 1;
1793 static void process_deferred_ops(void)
1795 unsigned int deferred_ops;
1796 struct domain *d = current->domain;
1797 struct percpu_mm_info *info = &this_cpu(percpu_mm_info);
1799 deferred_ops = info->deferred_ops;
1800 info->deferred_ops = 0;
1802 if ( deferred_ops & (DOP_FLUSH_ALL_TLBS|DOP_FLUSH_TLB) )
1804 if ( deferred_ops & DOP_FLUSH_ALL_TLBS )
1805 flush_tlb_mask(d->domain_dirty_cpumask);
1806 else
1807 local_flush_tlb();
1810 if ( deferred_ops & DOP_RELOAD_LDT )
1811 (void)map_ldt_shadow_page(0);
1813 if ( unlikely(info->foreign != NULL) )
1815 put_domain(info->foreign);
1816 info->foreign = NULL;
1820 static int set_foreigndom(domid_t domid)
1822 struct domain *e, *d = current->domain;
1823 struct percpu_mm_info *info = &this_cpu(percpu_mm_info);
1824 int okay = 1;
1826 ASSERT(info->foreign == NULL);
1828 if ( likely(domid == DOMID_SELF) )
1829 goto out;
1831 if ( domid == d->domain_id )
1833 MEM_LOG("Dom %u tried to specify itself as foreign domain",
1834 d->domain_id);
1835 okay = 0;
1837 else if ( !IS_PRIV(d) )
1839 switch ( domid )
1841 case DOMID_IO:
1842 get_knownalive_domain(dom_io);
1843 info->foreign = dom_io;
1844 break;
1845 default:
1846 MEM_LOG("Dom %u cannot set foreign dom", d->domain_id);
1847 okay = 0;
1848 break;
1851 else
1853 info->foreign = e = find_domain_by_id(domid);
1854 if ( e == NULL )
1856 switch ( domid )
1858 case DOMID_XEN:
1859 get_knownalive_domain(dom_xen);
1860 info->foreign = dom_xen;
1861 break;
1862 case DOMID_IO:
1863 get_knownalive_domain(dom_io);
1864 info->foreign = dom_io;
1865 break;
1866 default:
1867 MEM_LOG("Unknown domain '%u'", domid);
1868 okay = 0;
1869 break;
1874 out:
1875 return okay;
1878 static inline cpumask_t vcpumask_to_pcpumask(
1879 struct domain *d, unsigned long vmask)
1881 unsigned int vcpu_id;
1882 cpumask_t pmask = CPU_MASK_NONE;
1883 struct vcpu *v;
1885 while ( vmask != 0 )
1887 vcpu_id = find_first_set_bit(vmask);
1888 vmask &= ~(1UL << vcpu_id);
1889 if ( (vcpu_id < MAX_VIRT_CPUS) &&
1890 ((v = d->vcpu[vcpu_id]) != NULL) )
1891 cpus_or(pmask, pmask, v->vcpu_dirty_cpumask);
1894 return pmask;
1897 int do_mmuext_op(
1898 XEN_GUEST_HANDLE(mmuext_op_t) uops,
1899 unsigned int count,
1900 XEN_GUEST_HANDLE(uint) pdone,
1901 unsigned int foreigndom)
1903 struct mmuext_op op;
1904 int rc = 0, i = 0, okay;
1905 unsigned long mfn, type;
1906 unsigned int done = 0;
1907 struct page_info *page;
1908 struct vcpu *v = current;
1909 struct domain *d = v->domain;
1911 LOCK_BIGLOCK(d);
1913 if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
1915 count &= ~MMU_UPDATE_PREEMPTED;
1916 if ( unlikely(!guest_handle_is_null(pdone)) )
1917 (void)copy_from_guest(&done, pdone, 1);
1920 if ( !set_foreigndom(foreigndom) )
1922 rc = -ESRCH;
1923 goto out;
1926 if ( unlikely(!guest_handle_okay(uops, count)) )
1928 rc = -EFAULT;
1929 goto out;
1932 for ( i = 0; i < count; i++ )
1934 if ( hypercall_preempt_check() )
1936 rc = hypercall_create_continuation(
1937 __HYPERVISOR_mmuext_op, "hihi",
1938 uops, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
1939 break;
1942 if ( unlikely(__copy_from_guest(&op, uops, 1) != 0) )
1944 MEM_LOG("Bad __copy_from_guest");
1945 rc = -EFAULT;
1946 break;
1949 okay = 1;
1950 mfn = op.arg1.mfn;
1951 page = mfn_to_page(mfn);
1953 switch ( op.cmd )
1955 case MMUEXT_PIN_L1_TABLE:
1956 type = PGT_l1_page_table;
1957 goto pin_page;
1959 case MMUEXT_PIN_L2_TABLE:
1960 type = PGT_l2_page_table;
1961 goto pin_page;
1963 case MMUEXT_PIN_L3_TABLE:
1964 type = PGT_l3_page_table;
1965 goto pin_page;
1967 case MMUEXT_PIN_L4_TABLE:
1968 type = PGT_l4_page_table;
1970 pin_page:
1971 /* Ignore pinning of invalid paging levels. */
1972 if ( (op.cmd - MMUEXT_PIN_L1_TABLE) > (CONFIG_PAGING_LEVELS - 1) )
1973 break;
1975 if ( shadow_mode_refcounts(FOREIGNDOM) )
1976 break;
1978 okay = get_page_and_type_from_pagenr(mfn, type, FOREIGNDOM);
1979 if ( unlikely(!okay) )
1981 MEM_LOG("Error while pinning mfn %lx", mfn);
1982 break;
1985 if ( unlikely(test_and_set_bit(_PGT_pinned,
1986 &page->u.inuse.type_info)) )
1988 MEM_LOG("Mfn %lx already pinned", mfn);
1989 put_page_and_type(page);
1990 okay = 0;
1991 break;
1994 /* A page is dirtied when its pin status is set. */
1995 mark_dirty(d, mfn);
1997 break;
1999 case MMUEXT_UNPIN_TABLE:
2000 if ( shadow_mode_refcounts(d) )
2001 break;
2003 if ( unlikely(!(okay = get_page_from_pagenr(mfn, d))) )
2005 MEM_LOG("Mfn %lx bad domain (dom=%p)",
2006 mfn, page_get_owner(page));
2008 else if ( likely(test_and_clear_bit(_PGT_pinned,
2009 &page->u.inuse.type_info)) )
2011 put_page_and_type(page);
2012 put_page(page);
2013 /* A page is dirtied when its pin status is cleared. */
2014 mark_dirty(d, mfn);
2016 else
2018 okay = 0;
2019 put_page(page);
2020 MEM_LOG("Mfn %lx not pinned", mfn);
2022 break;
2024 case MMUEXT_NEW_BASEPTR:
2025 mfn = gmfn_to_mfn(current->domain, mfn);
2026 okay = new_guest_cr3(mfn);
2027 this_cpu(percpu_mm_info).deferred_ops &= ~DOP_FLUSH_TLB;
2028 break;
2030 #ifdef __x86_64__
2031 case MMUEXT_NEW_USER_BASEPTR:
2032 okay = 1;
2033 if (likely(mfn != 0))
2034 okay = get_page_and_type_from_pagenr(
2035 mfn, PGT_root_page_table, d);
2036 if ( unlikely(!okay) )
2038 MEM_LOG("Error while installing new mfn %lx", mfn);
2040 else
2042 unsigned long old_mfn =
2043 pagetable_get_pfn(v->arch.guest_table_user);
2044 v->arch.guest_table_user = pagetable_from_pfn(mfn);
2045 if ( old_mfn != 0 )
2046 put_page_and_type(mfn_to_page(old_mfn));
2048 break;
2049 #endif
2051 case MMUEXT_TLB_FLUSH_LOCAL:
2052 this_cpu(percpu_mm_info).deferred_ops |= DOP_FLUSH_TLB;
2053 break;
2055 case MMUEXT_INVLPG_LOCAL:
2056 if ( !shadow_mode_enabled(d)
2057 || shadow_invlpg(v, op.arg1.linear_addr) != 0 )
2058 local_flush_tlb_one(op.arg1.linear_addr);
2059 break;
2061 case MMUEXT_TLB_FLUSH_MULTI:
2062 case MMUEXT_INVLPG_MULTI:
2064 unsigned long vmask;
2065 cpumask_t pmask;
2066 if ( unlikely(get_user(vmask, (unsigned long *)op.arg2.vcpumask)) )
2068 okay = 0;
2069 break;
2071 pmask = vcpumask_to_pcpumask(d, vmask);
2072 if ( op.cmd == MMUEXT_TLB_FLUSH_MULTI )
2073 flush_tlb_mask(pmask);
2074 else
2075 flush_tlb_one_mask(pmask, op.arg1.linear_addr);
2076 break;
2079 case MMUEXT_TLB_FLUSH_ALL:
2080 flush_tlb_mask(d->domain_dirty_cpumask);
2081 break;
2083 case MMUEXT_INVLPG_ALL:
2084 flush_tlb_one_mask(d->domain_dirty_cpumask, op.arg1.linear_addr);
2085 break;
2087 case MMUEXT_FLUSH_CACHE:
2088 if ( unlikely(!cache_flush_permitted(d)) )
2090 MEM_LOG("Non-physdev domain tried to FLUSH_CACHE.");
2091 okay = 0;
2093 else
2095 wbinvd();
2097 break;
2099 case MMUEXT_SET_LDT:
2101 unsigned long ptr = op.arg1.linear_addr;
2102 unsigned long ents = op.arg2.nr_ents;
2104 if ( shadow_mode_external(d) )
2106 MEM_LOG("ignoring SET_LDT hypercall from external "
2107 "domain %u", d->domain_id);
2108 okay = 0;
2110 else if ( ((ptr & (PAGE_SIZE-1)) != 0) ||
2111 (ents > 8192) ||
2112 !array_access_ok(ptr, ents, LDT_ENTRY_SIZE) )
2114 okay = 0;
2115 MEM_LOG("Bad args to SET_LDT: ptr=%lx, ents=%lx", ptr, ents);
2117 else if ( (v->arch.guest_context.ldt_ents != ents) ||
2118 (v->arch.guest_context.ldt_base != ptr) )
2120 invalidate_shadow_ldt(v);
2121 v->arch.guest_context.ldt_base = ptr;
2122 v->arch.guest_context.ldt_ents = ents;
2123 load_LDT(v);
2124 this_cpu(percpu_mm_info).deferred_ops &= ~DOP_RELOAD_LDT;
2125 if ( ents != 0 )
2126 this_cpu(percpu_mm_info).deferred_ops |= DOP_RELOAD_LDT;
2128 break;
2131 default:
2132 MEM_LOG("Invalid extended pt command 0x%x", op.cmd);
2133 okay = 0;
2134 break;
2137 if ( unlikely(!okay) )
2139 rc = -EINVAL;
2140 break;
2143 guest_handle_add_offset(uops, 1);
2146 out:
2147 process_deferred_ops();
2149 /* Add incremental work we have done to the @done output parameter. */
2150 done += i;
2151 if ( unlikely(!guest_handle_is_null(pdone)) )
2152 copy_to_guest(pdone, &done, 1);
2154 UNLOCK_BIGLOCK(d);
2155 return rc;
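/*
 * Editorial illustration (not part of the original file): a minimal sketch of
 * the preemption encoding used by the continuation logic above.  When the
 * hypercall is preempted, the remaining request count is handed back with a
 * flag bit set on it; the constant below assumes the flag occupies the top
 * bit of the count word, which is how the code masks it off, but the real
 * definition lives in the public headers.
 */
#include <stdio.h>

#define EX_MMU_UPDATE_PREEMPTED (1u << 31)  /* assumed flag position */

static void ex_decode_count(unsigned int count)
{
    if ( count & EX_MMU_UPDATE_PREEMPTED )
        printf("continuation: %u requests left\n",
               count & ~EX_MMU_UPDATE_PREEMPTED);
    else
        printf("fresh call: %u requests\n", count);
}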
2158 int do_mmu_update(
2159 XEN_GUEST_HANDLE(mmu_update_t) ureqs,
2160 unsigned int count,
2161 XEN_GUEST_HANDLE(uint) pdone,
2162 unsigned int foreigndom)
2164 struct mmu_update req;
2165 void *va;
2166 unsigned long gpfn, gmfn, mfn;
2167 struct page_info *page;
2168 int rc = 0, okay = 1, i = 0;
2169 unsigned int cmd, done = 0;
2170 struct vcpu *v = current;
2171 struct domain *d = v->domain;
2172 unsigned long type_info;
2173 struct domain_mmap_cache mapcache, sh_mapcache;
2175 LOCK_BIGLOCK(d);
2177 if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
2179 count &= ~MMU_UPDATE_PREEMPTED;
2180 if ( unlikely(!guest_handle_is_null(pdone)) )
2181 (void)copy_from_guest(&done, pdone, 1);
2184 domain_mmap_cache_init(&mapcache);
2185 domain_mmap_cache_init(&sh_mapcache);
2187 if ( !set_foreigndom(foreigndom) )
2189 rc = -ESRCH;
2190 goto out;
2193 perfc_incrc(calls_to_mmu_update);
2194 perfc_addc(num_page_updates, count);
2196 if ( unlikely(!guest_handle_okay(ureqs, count)) )
2198 rc = -EFAULT;
2199 goto out;
2202 for ( i = 0; i < count; i++ )
2204 if ( hypercall_preempt_check() )
2206 rc = hypercall_create_continuation(
2207 __HYPERVISOR_mmu_update, "hihi",
2208 ureqs, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
2209 break;
2212 if ( unlikely(__copy_from_guest(&req, ureqs, 1) != 0) )
2214 MEM_LOG("Bad __copy_from_guest");
2215 rc = -EFAULT;
2216 break;
2219 cmd = req.ptr & (sizeof(l1_pgentry_t)-1);
2220 okay = 0;
2222 switch ( cmd )
2224 /*
2225 * MMU_NORMAL_PT_UPDATE: Normal update to any level of page table.
2226 */
2227 case MMU_NORMAL_PT_UPDATE:
2229 gmfn = req.ptr >> PAGE_SHIFT;
2230 mfn = gmfn_to_mfn(d, gmfn);
2232 if ( unlikely(!get_page_from_pagenr(mfn, current->domain)) )
2234 MEM_LOG("Could not get page for normal update");
2235 break;
2238 va = map_domain_page_with_cache(mfn, &mapcache);
2239 va = (void *)((unsigned long)va +
2240 (unsigned long)(req.ptr & ~PAGE_MASK));
2241 page = mfn_to_page(mfn);
2243 switch ( (type_info = page->u.inuse.type_info) & PGT_type_mask )
2245 case PGT_l1_page_table:
2246 case PGT_l2_page_table:
2247 case PGT_l3_page_table:
2248 case PGT_l4_page_table:
2250 if ( shadow_mode_refcounts(d) )
2252 DPRINTK("mmu update on shadow-refcounted domain!");
2253 break;
2256 if ( unlikely(!get_page_type(
2257 page, type_info & (PGT_type_mask|PGT_pae_xen_l2))) )
2258 goto not_a_pt;
2260 switch ( type_info & PGT_type_mask )
2262 case PGT_l1_page_table:
2264 l1_pgentry_t l1e = l1e_from_intpte(req.val);
2265 okay = mod_l1_entry(va, l1e, mfn);
2267 break;
2268 case PGT_l2_page_table:
2270 l2_pgentry_t l2e = l2e_from_intpte(req.val);
2271 okay = mod_l2_entry(
2272 (l2_pgentry_t *)va, l2e, mfn, type_info);
2274 break;
2275 #if CONFIG_PAGING_LEVELS >= 3
2276 case PGT_l3_page_table:
2278 l3_pgentry_t l3e = l3e_from_intpte(req.val);
2279 okay = mod_l3_entry(va, l3e, mfn);
2281 break;
2282 #endif
2283 #if CONFIG_PAGING_LEVELS >= 4
2284 case PGT_l4_page_table:
2286 l4_pgentry_t l4e = l4e_from_intpte(req.val);
2287 okay = mod_l4_entry(va, l4e, mfn);
2289 break;
2290 #endif
2293 put_page_type(page);
2295 break;
2297 default:
2298 not_a_pt:
2300 if ( unlikely(!get_page_type(page, PGT_writable_page)) )
2301 break;
2303 if ( unlikely(shadow_mode_enabled(d)) )
2304 shadow_lock(d);
2306 *(intpte_t *)va = req.val;
2307 okay = 1;
2309 if ( unlikely(shadow_mode_enabled(d)) )
2311 shadow_validate_guest_entry(v, _mfn(mfn), va);
2312 shadow_unlock(d);
2315 put_page_type(page);
2317 break;
2320 unmap_domain_page_with_cache(va, &mapcache);
2322 put_page(page);
2323 break;
2325 case MMU_MACHPHYS_UPDATE:
2327 mfn = req.ptr >> PAGE_SHIFT;
2328 gpfn = req.val;
2330 if ( unlikely(!get_page_from_pagenr(mfn, FOREIGNDOM)) )
2332 MEM_LOG("Could not get page for mach->phys update");
2333 break;
2336 if ( shadow_mode_translate(FOREIGNDOM) )
2337 shadow_guest_physmap_add_page(FOREIGNDOM, gpfn, mfn);
2338 else
2339 set_gpfn_from_mfn(mfn, gpfn);
2340 okay = 1;
2342 /* Mark the new gfn dirty. */
2343 mark_dirty(FOREIGNDOM, mfn);
2345 put_page(mfn_to_page(mfn));
2346 break;
2348 default:
2349 MEM_LOG("Invalid page update command %x", cmd);
2350 break;
2353 if ( unlikely(!okay) )
2355 rc = -EINVAL;
2356 break;
2359 guest_handle_add_offset(ureqs, 1);
2362 out:
2363 domain_mmap_cache_destroy(&mapcache);
2364 domain_mmap_cache_destroy(&sh_mapcache);
2366 process_deferred_ops();
2368 /* Add incremental work we have done to the @done output parameter. */
2369 done += i;
2370 if ( unlikely(!guest_handle_is_null(pdone)) )
2371 copy_to_guest(pdone, &done, 1);
2373 UNLOCK_BIGLOCK(d);
2374 return rc;
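/*
 * Editorial illustration (not part of the original file): the update command
 * travels in the low bits of req.ptr, which are free because a page-table
 * entry is naturally aligned.  A sketch of packing and unpacking such a
 * request; the ex_-prefixed names are stand-ins, not the public Xen
 * definitions.
 */
#include <stdint.h>

struct ex_mmu_update {
    uint64_t ptr;   /* machine address of the PTE, low bits = command */
    uint64_t val;   /* new contents for that PTE                      */
};

enum { EX_NORMAL_PT_UPDATE = 0, EX_MACHPHYS_UPDATE = 1 };

static struct ex_mmu_update ex_make_req(uint64_t pte_maddr, uint64_t new_pte)
{
    struct ex_mmu_update req;
    req.ptr = (pte_maddr & ~(uint64_t)(sizeof(uint64_t) - 1)) | EX_NORMAL_PT_UPDATE;
    req.val = new_pte;
    return req;
}

static unsigned int ex_req_cmd(const struct ex_mmu_update *req)
{
    /* Mirrors "cmd = req.ptr & (sizeof(l1_pgentry_t)-1)" above. */
    return (unsigned int)(req->ptr & (sizeof(uint64_t) - 1));
}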
2378 static int create_grant_pte_mapping(
2379 unsigned long pte_addr, l1_pgentry_t nl1e, struct vcpu *v)
2381 int rc = GNTST_okay;
2382 void *va;
2383 unsigned long gmfn, mfn;
2384 struct page_info *page;
2385 u32 type;
2386 l1_pgentry_t ol1e;
2387 struct domain *d = v->domain;
2389 ASSERT(spin_is_locked(&d->big_lock));
2391 adjust_guest_l1e(nl1e);
2393 gmfn = pte_addr >> PAGE_SHIFT;
2394 mfn = gmfn_to_mfn(d, gmfn);
2396 if ( unlikely(!get_page_from_pagenr(mfn, current->domain)) )
2398 MEM_LOG("Could not get page for normal update");
2399 return GNTST_general_error;
2402 va = map_domain_page(mfn);
2403 va = (void *)((unsigned long)va + (pte_addr & ~PAGE_MASK));
2404 page = mfn_to_page(mfn);
2406 type = page->u.inuse.type_info & PGT_type_mask;
2407 if ( (type != PGT_l1_page_table) || !get_page_type(page, type) )
2409 MEM_LOG("Grant map attempted to update a non-L1 page");
2410 rc = GNTST_general_error;
2411 goto failed;
2414 ol1e = *(l1_pgentry_t *)va;
2415 if ( !update_l1e(va, ol1e, nl1e, mfn, v) )
2417 put_page_type(page);
2418 rc = GNTST_general_error;
2419 goto failed;
2422 if ( !shadow_mode_refcounts(d) )
2423 put_page_from_l1e(ol1e, d);
2425 put_page_type(page);
2427 failed:
2428 unmap_domain_page(va);
2429 put_page(page);
2431 return rc;
2434 static int destroy_grant_pte_mapping(
2435 unsigned long addr, unsigned long frame, struct domain *d)
2437 int rc = GNTST_okay;
2438 void *va;
2439 unsigned long gmfn, mfn;
2440 struct page_info *page;
2441 u32 type;
2442 l1_pgentry_t ol1e;
2444 gmfn = addr >> PAGE_SHIFT;
2445 mfn = gmfn_to_mfn(d, gmfn);
2447 if ( unlikely(!get_page_from_pagenr(mfn, current->domain)) )
2449 MEM_LOG("Could not get page for normal update");
2450 return GNTST_general_error;
2453 va = map_domain_page(mfn);
2454 va = (void *)((unsigned long)va + (addr & ~PAGE_MASK));
2455 page = mfn_to_page(mfn);
2457 type = page->u.inuse.type_info & PGT_type_mask;
2458 if ( (type != PGT_l1_page_table) || !get_page_type(page, type) )
2460 MEM_LOG("Grant map attempted to update a non-L1 page");
2461 rc = GNTST_general_error;
2462 goto failed;
2465 if ( __copy_from_user(&ol1e, (l1_pgentry_t *)va, sizeof(ol1e)) )
2467 put_page_type(page);
2468 rc = GNTST_general_error;
2469 goto failed;
2472 /* Check that the virtual address supplied is actually mapped to frame. */
2473 if ( unlikely((l1e_get_intpte(ol1e) >> PAGE_SHIFT) != frame) )
2475 MEM_LOG("PTE entry %lx for address %lx doesn't match frame %lx",
2476 (unsigned long)l1e_get_intpte(ol1e), addr, frame);
2477 put_page_type(page);
2478 rc = GNTST_general_error;
2479 goto failed;
2482 /* Delete pagetable entry. */
2483 if ( unlikely(!update_l1e(
2484 (l1_pgentry_t *)va, ol1e, l1e_empty(), mfn,
2485 d->vcpu[0] /* Change if we go to per-vcpu shadows. */)) )
2487 MEM_LOG("Cannot delete PTE entry at %p", va);
2488 put_page_type(page);
2489 rc = GNTST_general_error;
2490 goto failed;
2493 put_page_type(page);
2495 failed:
2496 unmap_domain_page(va);
2497 put_page(page);
2498 return rc;
2502 static int create_grant_va_mapping(
2503 unsigned long va, l1_pgentry_t nl1e, struct vcpu *v)
2505 l1_pgentry_t *pl1e, ol1e;
2506 struct domain *d = v->domain;
2508 ASSERT(spin_is_locked(&d->big_lock));
2510 adjust_guest_l1e(nl1e);
2512 pl1e = &linear_pg_table[l1_linear_offset(va)];
2514 if ( unlikely(__copy_from_user(&ol1e, pl1e, sizeof(ol1e)) != 0) ||
2515 !update_l1e(pl1e, ol1e, nl1e,
2516 l2e_get_pfn(__linear_l2_table[l2_linear_offset(va)]), v) )
2517 return GNTST_general_error;
2519 if ( !shadow_mode_refcounts(d) )
2520 put_page_from_l1e(ol1e, d);
2522 return GNTST_okay;
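/*
 * Editorial illustration (not part of the original file): the
 * linear_pg_table lookup above relies on a self-referencing top-level entry,
 * so the L1 entry covering any virtual address appears at a fixed virtual
 * offset indexed by that address's frame number.  A 32-bit, non-PAE sketch
 * of the index arithmetic only; the table base itself is set up elsewhere.
 */
#define EX_PAGE_SHIFT 12

static unsigned long ex_l1_linear_offset(unsigned long va)
{
    /* Which slot of the linearly-mapped L1 tables describes 'va'. */
    return va >> EX_PAGE_SHIFT;
}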
2525 static int destroy_grant_va_mapping(
2526 unsigned long addr, unsigned long frame, struct domain *d)
2528 l1_pgentry_t *pl1e, ol1e;
2530 pl1e = &linear_pg_table[l1_linear_offset(addr)];
2532 if ( unlikely(__get_user(ol1e.l1, &pl1e->l1) != 0) )
2534 MEM_LOG("Could not find PTE entry for address %lx", addr);
2535 return GNTST_general_error;
2538 /*
2539 * Check that the virtual address supplied is actually mapped to
2540 * frame.
2541 */
2542 if ( unlikely(l1e_get_pfn(ol1e) != frame) )
2544 MEM_LOG("PTE entry %lx for address %lx doesn't match frame %lx",
2545 l1e_get_pfn(ol1e), addr, frame);
2546 return GNTST_general_error;
2549 /* Delete pagetable entry. */
2550 if ( unlikely(!update_l1e(pl1e, ol1e, l1e_empty(),
2551 l2e_get_pfn(__linear_l2_table[l2_linear_offset(addr)]),
2552 d->vcpu[0] /* Change for per-vcpu shadows */)) )
2554 MEM_LOG("Cannot delete PTE entry at %p", (unsigned long *)pl1e);
2555 return GNTST_general_error;
2558 return 0;
2561 int create_grant_host_mapping(
2562 unsigned long addr, unsigned long frame, unsigned int flags)
2564 l1_pgentry_t pte = l1e_from_pfn(frame, GRANT_PTE_FLAGS);
2566 if ( (flags & GNTMAP_application_map) )
2567 l1e_add_flags(pte,_PAGE_USER);
2568 if ( !(flags & GNTMAP_readonly) )
2569 l1e_add_flags(pte,_PAGE_RW);
2571 if ( flags & GNTMAP_contains_pte )
2572 return create_grant_pte_mapping(addr, pte, current);
2573 return create_grant_va_mapping(addr, pte, current);
2576 int destroy_grant_host_mapping(
2577 unsigned long addr, unsigned long frame, unsigned int flags)
2579 if ( flags & GNTMAP_contains_pte )
2580 return destroy_grant_pte_mapping(addr, frame, current->domain);
2581 return destroy_grant_va_mapping(addr, frame, current->domain);
2584 int steal_page(
2585 struct domain *d, struct page_info *page, unsigned int memflags)
2587 u32 _d, _nd, x, y;
2589 spin_lock(&d->page_alloc_lock);
2591 /*
2592 * The tricky bit: atomically release ownership while there is just one
2593 * benign reference to the page (PGC_allocated). If that reference
2594 * disappears then the deallocation routine will safely spin.
2595 */
2596 _d = pickle_domptr(d);
2597 _nd = page->u.inuse._domain;
2598 y = page->count_info;
2599 do {
2600 x = y;
2601 if (unlikely((x & (PGC_count_mask|PGC_allocated)) !=
2602 (1 | PGC_allocated)) || unlikely(_nd != _d)) {
2603 DPRINTK("gnttab_transfer: Bad page %p: ed=%p(%u), sd=%p,"
2604 " caf=%08x, taf=%" PRtype_info "\n",
2605 (void *) page_to_mfn(page),
2606 d, d->domain_id, unpickle_domptr(_nd), x,
2607 page->u.inuse.type_info);
2608 spin_unlock(&d->page_alloc_lock);
2609 return -1;
2611 __asm__ __volatile__(
2612 LOCK_PREFIX "cmpxchg8b %2"
2613 : "=d" (_nd), "=a" (y),
2614 "=m" (*(volatile u64 *)(&page->count_info))
2615 : "0" (_d), "1" (x), "c" (NULL), "b" (x) );
2616 } while (unlikely(_nd != _d) || unlikely(y != x));
2618 /*
2619 * Unlink from 'd'. At least one reference remains (now anonymous), so
2620 * no one else is spinning to try to delete this page from 'd'.
2621 */
2622 if ( !(memflags & MEMF_no_refcount) )
2623 d->tot_pages--;
2624 list_del(&page->list);
2626 spin_unlock(&d->page_alloc_lock);
2628 return 0;
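/*
 * Editorial illustration (not part of the original file): the cmpxchg8b in
 * steal_page() swaps the owner and the reference count as one 64-bit unit so
 * that no other CPU can take a reference or change ownership in between.  A
 * portable sketch of the same idea under the assumption that the two fields
 * are adjacent 32-bit words; the GCC builtin stands in for the inline asm.
 */
#include <stdint.h>

struct ex_page {
    uint32_t count_info;   /* low word: reference count and flags */
    uint32_t owner;        /* high word: pickled owner pointer    */
};

static int ex_steal(struct ex_page *pg, uint32_t me, uint32_t expected_count)
{
    volatile uint64_t *word = (volatile uint64_t *)pg;
    uint64_t old  = ((uint64_t)me << 32) | expected_count;
    uint64_t newv = (uint64_t)expected_count;   /* owner becomes anonymous */

    /* Succeeds only if both the owner and the count still match. */
    return __sync_bool_compare_and_swap(word, old, newv) ? 0 : -1;
}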
2631 int do_update_va_mapping(unsigned long va, u64 val64,
2632 unsigned long flags)
2634 l1_pgentry_t val = l1e_from_intpte(val64);
2635 struct vcpu *v = current;
2636 struct domain *d = v->domain;
2637 unsigned long vmask, bmap_ptr;
2638 cpumask_t pmask;
2639 int rc = 0;
2641 perfc_incrc(calls_to_update_va);
2643 if ( unlikely(!__addr_ok(va) && !shadow_mode_external(d)) )
2644 return -EINVAL;
2646 if ( unlikely(shadow_mode_refcounts(d)) )
2648 DPRINTK("Grant op on a shadow-refcounted domain\n");
2649 return -EINVAL;
2652 LOCK_BIGLOCK(d);
2654 if ( likely(rc == 0) && unlikely(shadow_mode_enabled(d)) )
2656 if ( unlikely(this_cpu(percpu_mm_info).foreign &&
2657 (shadow_mode_translate(d) ||
2658 shadow_mode_translate(
2659 this_cpu(percpu_mm_info).foreign))) )
2661 /*
2662 * The foreign domain's pfns are in a different namespace. There's
2663 * not enough information in just a gpte to figure out how to
2664 * (re-)shadow this entry.
2665 */
2666 domain_crash(d);
2670 if ( unlikely(!mod_l1_entry(
2671 &linear_pg_table[l1_linear_offset(va)], val,
2672 l2e_get_pfn(__linear_l2_table[l2_linear_offset(va)]))) )
2673 rc = -EINVAL;
2675 switch ( flags & UVMF_FLUSHTYPE_MASK )
2677 case UVMF_TLB_FLUSH:
2678 switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
2680 case UVMF_LOCAL:
2681 local_flush_tlb();
2682 break;
2683 case UVMF_ALL:
2684 flush_tlb_mask(d->domain_dirty_cpumask);
2685 break;
2686 default:
2687 if ( unlikely(get_user(vmask, (unsigned long *)bmap_ptr)) )
2688 rc = -EFAULT;
2689 pmask = vcpumask_to_pcpumask(d, vmask);
2690 flush_tlb_mask(pmask);
2691 break;
2693 break;
2695 case UVMF_INVLPG:
2696 switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
2698 case UVMF_LOCAL:
2699 if ( !shadow_mode_enabled(d)
2700 || (shadow_invlpg(current, va) != 0) )
2701 local_flush_tlb_one(va);
2702 break;
2703 case UVMF_ALL:
2704 flush_tlb_one_mask(d->domain_dirty_cpumask, va);
2705 break;
2706 default:
2707 if ( unlikely(get_user(vmask, (unsigned long *)bmap_ptr)) )
2708 rc = -EFAULT;
2709 pmask = vcpumask_to_pcpumask(d, vmask);
2710 flush_tlb_one_mask(pmask, va);
2711 break;
2713 break;
2716 process_deferred_ops();
2718 UNLOCK_BIGLOCK(d);
2720 return rc;
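/*
 * Editorial illustration (not part of the original file): the 'flags' word
 * above packs the flush type into its low bits and, for the multi-vcpu case,
 * reuses the remaining bits as a guest pointer to a vcpu bitmap, which is
 * why bmap_ptr is recovered with "& ~UVMF_FLUSHTYPE_MASK".  The EX_ constants
 * are stand-ins used only to show the packing.
 */
#define EX_UVMF_TLB_FLUSH      1ul
#define EX_UVMF_INVLPG         2ul
#define EX_UVMF_FLUSHTYPE_MASK 3ul

static unsigned long ex_pack_uvm_flags(unsigned long vcpumask_ptr,
                                       unsigned long flush_type)
{
    /* The bitmap pointer must be word aligned so its low bits stay free. */
    return (vcpumask_ptr & ~EX_UVMF_FLUSHTYPE_MASK) | flush_type;
}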
2723 int do_update_va_mapping_otherdomain(unsigned long va, u64 val64,
2724 unsigned long flags,
2725 domid_t domid)
2727 int rc;
2729 if ( unlikely(!IS_PRIV(current->domain)) )
2730 return -EPERM;
2732 if ( !set_foreigndom(domid) )
2733 return -ESRCH;
2735 rc = do_update_va_mapping(va, val64, flags);
2737 return rc;
2742 /*************************
2743 * Descriptor Tables
2744 */
2746 void destroy_gdt(struct vcpu *v)
2748 int i;
2749 unsigned long pfn;
2751 v->arch.guest_context.gdt_ents = 0;
2752 for ( i = 0; i < FIRST_RESERVED_GDT_PAGE; i++ )
2754 if ( (pfn = l1e_get_pfn(v->arch.perdomain_ptes[i])) != 0 )
2755 put_page_and_type(mfn_to_page(pfn));
2756 v->arch.perdomain_ptes[i] = l1e_empty();
2757 v->arch.guest_context.gdt_frames[i] = 0;
2762 long set_gdt(struct vcpu *v,
2763 unsigned long *frames,
2764 unsigned int entries)
2766 struct domain *d = v->domain;
2767 /* NB. There are 512 8-byte entries per GDT page. */
2768 int i, nr_pages = (entries + 511) / 512;
2769 unsigned long mfn;
2771 if ( entries > FIRST_RESERVED_GDT_ENTRY )
2772 return -EINVAL;
2774 /* Check the pages in the new GDT. */
2775 for ( i = 0; i < nr_pages; i++ ) {
2776 mfn = frames[i] = gmfn_to_mfn(d, frames[i]);
2777 if ( !mfn_valid(mfn) ||
2778 !get_page_and_type(mfn_to_page(mfn), d, PGT_gdt_page) )
2779 goto fail;
2782 /* Tear down the old GDT. */
2783 destroy_gdt(v);
2785 /* Install the new GDT. */
2786 v->arch.guest_context.gdt_ents = entries;
2787 for ( i = 0; i < nr_pages; i++ )
2789 v->arch.guest_context.gdt_frames[i] = frames[i];
2790 v->arch.perdomain_ptes[i] =
2791 l1e_from_pfn(frames[i], __PAGE_HYPERVISOR);
2794 return 0;
2796 fail:
2797 while ( i-- > 0 )
2798 put_page_and_type(mfn_to_page(frames[i]));
2799 return -EINVAL;
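/*
 * Editorial illustration (not part of the original file): with 8-byte
 * descriptors and 4096-byte pages each GDT frame holds 4096/8 = 512 entries,
 * so the frame count above is the ceiling of entries/512.
 */
static unsigned int ex_gdt_pages(unsigned int entries)
{
    return (entries + 511) / 512;   /* e.g. 1..512 -> 1, 513 -> 2 */
}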
2803 long do_set_gdt(XEN_GUEST_HANDLE(ulong) frame_list, unsigned int entries)
2805 int nr_pages = (entries + 511) / 512;
2806 unsigned long frames[16];
2807 long ret;
2809 /* Rechecked in set_gdt, but ensures a sane limit for copy_from_user(). */
2810 if ( entries > FIRST_RESERVED_GDT_ENTRY )
2811 return -EINVAL;
2813 if ( copy_from_guest((unsigned long *)frames, frame_list, nr_pages) )
2814 return -EFAULT;
2816 LOCK_BIGLOCK(current->domain);
2818 if ( (ret = set_gdt(current, frames, entries)) == 0 )
2819 local_flush_tlb();
2821 UNLOCK_BIGLOCK(current->domain);
2823 return ret;
2827 long do_update_descriptor(u64 pa, u64 desc)
2829 struct domain *dom = current->domain;
2830 unsigned long gmfn = pa >> PAGE_SHIFT;
2831 unsigned long mfn;
2832 unsigned int offset;
2833 struct desc_struct *gdt_pent, d;
2834 struct page_info *page;
2835 long ret = -EINVAL;
2837 offset = ((unsigned int)pa & ~PAGE_MASK) / sizeof(struct desc_struct);
2839 *(u64 *)&d = desc;
2841 LOCK_BIGLOCK(dom);
2843 if ( !VALID_MFN(mfn = gmfn_to_mfn(dom, gmfn)) ||
2844 (((unsigned int)pa % sizeof(struct desc_struct)) != 0) ||
2845 !mfn_valid(mfn) ||
2846 !check_descriptor(&d) )
2848 UNLOCK_BIGLOCK(dom);
2849 return -EINVAL;
2852 page = mfn_to_page(mfn);
2853 if ( unlikely(!get_page(page, dom)) )
2855 UNLOCK_BIGLOCK(dom);
2856 return -EINVAL;
2859 /* Check if the given frame is in use in an unsafe context. */
2860 switch ( page->u.inuse.type_info & PGT_type_mask )
2862 case PGT_gdt_page:
2863 if ( unlikely(!get_page_type(page, PGT_gdt_page)) )
2864 goto out;
2865 break;
2866 case PGT_ldt_page:
2867 if ( unlikely(!get_page_type(page, PGT_ldt_page)) )
2868 goto out;
2869 break;
2870 default:
2871 if ( unlikely(!get_page_type(page, PGT_writable_page)) )
2872 goto out;
2873 break;
2876 mark_dirty(dom, mfn);
2878 /* All is good so make the update. */
2879 gdt_pent = map_domain_page(mfn);
2880 memcpy(&gdt_pent[offset], &d, 8);
2881 unmap_domain_page(gdt_pent);
2883 put_page_type(page);
2885 ret = 0; /* success */
2887 out:
2888 put_page(page);
2890 UNLOCK_BIGLOCK(dom);
2892 return ret;
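/*
 * Editorial illustration (not part of the original file): how the physical
 * address of a descriptor is split above into a guest frame number and an
 * index into that frame.  EX_PAGE_SHIFT and the 8-byte descriptor size are
 * spelled out here rather than taken from the real headers.
 */
#include <stdint.h>

#define EX_PAGE_SHIFT 12
#define EX_PAGE_SIZE  (1u << EX_PAGE_SHIFT)

struct ex_desc_loc {
    uint64_t gmfn;        /* which guest frame holds the descriptor */
    unsigned int index;   /* which 8-byte slot within that frame    */
};

static struct ex_desc_loc ex_locate_desc(uint64_t pa)
{
    struct ex_desc_loc loc;
    loc.gmfn  = pa >> EX_PAGE_SHIFT;
    loc.index = (unsigned int)(pa & (EX_PAGE_SIZE - 1)) / 8;
    return loc;
}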
2895 typedef struct e820entry e820entry_t;
2896 DEFINE_XEN_GUEST_HANDLE(e820entry_t);
2898 long arch_memory_op(int op, XEN_GUEST_HANDLE(void) arg)
2900 switch ( op )
2902 case XENMEM_add_to_physmap:
2904 struct xen_add_to_physmap xatp;
2905 unsigned long prev_mfn, mfn = 0, gpfn;
2906 struct domain *d;
2908 if ( copy_from_guest(&xatp, arg, 1) )
2909 return -EFAULT;
2911 if ( xatp.domid == DOMID_SELF )
2913 d = current->domain;
2914 get_knownalive_domain(d);
2916 else if ( !IS_PRIV(current->domain) )
2917 return -EPERM;
2918 else if ( (d = find_domain_by_id(xatp.domid)) == NULL )
2919 return -ESRCH;
2921 switch ( xatp.space )
2923 case XENMAPSPACE_shared_info:
2924 if ( xatp.idx == 0 )
2925 mfn = virt_to_mfn(d->shared_info);
2926 break;
2927 case XENMAPSPACE_grant_table:
2928 if ( xatp.idx < NR_GRANT_FRAMES )
2929 mfn = virt_to_mfn(d->grant_table->shared) + xatp.idx;
2930 break;
2931 default:
2932 break;
2935 if ( !shadow_mode_translate(d) || (mfn == 0) )
2937 put_domain(d);
2938 return -EINVAL;
2941 LOCK_BIGLOCK(d);
2943 /* Remove previously mapped page if it was present. */
2944 prev_mfn = gmfn_to_mfn(d, xatp.gpfn);
2945 if ( mfn_valid(prev_mfn) )
2947 if ( IS_XEN_HEAP_FRAME(mfn_to_page(prev_mfn)) )
2948 /* Xen heap frames are simply unhooked from this phys slot. */
2949 guest_physmap_remove_page(d, xatp.gpfn, prev_mfn);
2950 else
2951 /* Normal domain memory is freed, to avoid leaking memory. */
2952 guest_remove_page(d, xatp.gpfn);
2955 /* Unmap from old location, if any. */
2956 gpfn = get_gpfn_from_mfn(mfn);
2957 if ( gpfn != INVALID_M2P_ENTRY )
2958 guest_physmap_remove_page(d, gpfn, mfn);
2960 /* Map at new location. */
2961 guest_physmap_add_page(d, xatp.gpfn, mfn);
2963 UNLOCK_BIGLOCK(d);
2965 put_domain(d);
2967 break;
2970 case XENMEM_memory_map:
2972 return -ENOSYS;
2975 case XENMEM_machine_memory_map:
2977 struct xen_memory_map memmap;
2978 XEN_GUEST_HANDLE(e820entry_t) buffer;
2979 int count;
2981 if ( !IS_PRIV(current->domain) )
2982 return -EINVAL;
2984 if ( copy_from_guest(&memmap, arg, 1) )
2985 return -EFAULT;
2986 if ( memmap.nr_entries < e820.nr_map + 1 )
2987 return -EINVAL;
2989 buffer = guest_handle_cast(memmap.buffer, e820entry_t);
2991 count = min((unsigned int)e820.nr_map, memmap.nr_entries);
2992 if ( copy_to_guest(buffer, &e820.map[0], count) < 0 )
2993 return -EFAULT;
2995 memmap.nr_entries = count;
2997 if ( copy_to_guest(arg, &memmap, 1) )
2998 return -EFAULT;
3000 return 0;
3003 case XENMEM_machphys_mapping:
3005 struct xen_machphys_mapping mapping = {
3006 .v_start = MACH2PHYS_VIRT_START,
3007 .v_end = MACH2PHYS_VIRT_END,
3008 .max_mfn = MACH2PHYS_NR_ENTRIES - 1
3009 };
3011 if ( copy_to_guest(arg, &mapping, 1) )
3012 return -EFAULT;
3014 return 0;
3017 default:
3018 return subarch_memory_op(op, arg);
3021 return 0;
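/*
 * Editorial illustration (not part of the original file): the shape of an
 * add-to-physmap request as handled above -- the guest names a map space
 * (shared info or grant table), an index within that space, and the guest
 * frame where the page should appear.  The ex_-prefixed names are stand-ins,
 * not the public header definitions.
 */
struct ex_add_to_physmap {
    unsigned short domid;   /* DOMID_SELF or an explicit domain id       */
    unsigned int   space;   /* which map space: shared info, grant table */
    unsigned long  idx;     /* index within that space                   */
    unsigned long  gpfn;    /* guest frame number to map the page at     */
};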
3025 /*************************
3026 * Writable Pagetables
3027 */
3029 static int ptwr_emulated_update(
3030 unsigned long addr,
3031 paddr_t old,
3032 paddr_t val,
3033 unsigned int bytes,
3034 unsigned int do_cmpxchg)
3036 unsigned long pfn;
3037 struct page_info *page;
3038 l1_pgentry_t pte, ol1e, nl1e, *pl1e;
3039 struct vcpu *v = current;
3040 struct domain *d = v->domain;
3042 /* Aligned access only, thank you. */
3043 if ( !access_ok(addr, bytes) || ((addr & (bytes-1)) != 0) )
3045 MEM_LOG("ptwr_emulate: Unaligned or bad size ptwr access (%d, %lx)",
3046 bytes, addr);
3047 return X86EMUL_UNHANDLEABLE;
3050 /* Turn a sub-word access into a full-word access. */
3051 if ( bytes != sizeof(paddr_t) )
3053 paddr_t full;
3054 unsigned int rc, offset = addr & (sizeof(paddr_t)-1);
3056 /* Align address; read full word. */
3057 addr &= ~(sizeof(paddr_t)-1);
3058 if ( (rc = copy_from_user(&full, (void *)addr, sizeof(paddr_t))) != 0 )
3060 propagate_page_fault(addr+sizeof(paddr_t)-rc, 0); /* read fault */
3061 return X86EMUL_PROPAGATE_FAULT;
3063 /* Mask out bits provided by caller. */
3064 full &= ~((((paddr_t)1 << (bytes*8)) - 1) << (offset*8));
3065 /* Shift the caller value and OR in the missing bits. */
3066 val &= (((paddr_t)1 << (bytes*8)) - 1);
3067 val <<= (offset)*8;
3068 val |= full;
3069 /* Also fill in missing parts of the cmpxchg old value. */
3070 old &= (((paddr_t)1 << (bytes*8)) - 1);
3071 old <<= (offset)*8;
3072 old |= full;
3075 /* Read the PTE that maps the page being updated. */
3076 if ( __copy_from_user(&pte, &linear_pg_table[l1_linear_offset(addr)],
3077 sizeof(pte)) )
3079 MEM_LOG("ptwr_emulate: Cannot read thru linear_pg_table");
3080 return X86EMUL_UNHANDLEABLE;
3083 pfn = l1e_get_pfn(pte);
3084 page = mfn_to_page(pfn);
3086 /* We are looking only for read-only mappings of p.t. pages. */
3087 ASSERT((l1e_get_flags(pte) & (_PAGE_RW|_PAGE_PRESENT)) == _PAGE_PRESENT);
3088 ASSERT((page->u.inuse.type_info & PGT_type_mask) == PGT_l1_page_table);
3089 ASSERT((page->u.inuse.type_info & PGT_count_mask) != 0);
3090 ASSERT(page_get_owner(page) == d);
3092 /* Check the new PTE. */
3093 nl1e = l1e_from_intpte(val);
3094 if ( unlikely(!get_page_from_l1e(nl1e, d)) )
3096 if ( (CONFIG_PAGING_LEVELS == 3) &&
3097 (bytes == 4) &&
3098 !do_cmpxchg &&
3099 (l1e_get_flags(nl1e) & _PAGE_PRESENT) )
3101 /*
3102 * If this is a half-write to a PAE PTE then we assume that the
3103 * guest has simply got the two writes the wrong way round. We
3104 * zap the PRESENT bit on the assumption the bottom half will be
3105 * written immediately after we return to the guest.
3106 */
3107 MEM_LOG("ptwr_emulate: fixing up invalid PAE PTE %"PRIpte"\n",
3108 l1e_get_intpte(nl1e));
3109 l1e_remove_flags(nl1e, _PAGE_PRESENT);
3111 else
3113 MEM_LOG("ptwr_emulate: could not get_page_from_l1e()");
3114 return X86EMUL_UNHANDLEABLE;
3118 adjust_guest_l1e(nl1e);
3120 /* Checked successfully: do the update (write or cmpxchg). */
3121 pl1e = map_domain_page(page_to_mfn(page));
3122 pl1e = (l1_pgentry_t *)((unsigned long)pl1e + (addr & ~PAGE_MASK));
3123 if ( do_cmpxchg )
3125 if ( shadow_mode_enabled(d) )
3126 shadow_lock(d);
3127 ol1e = l1e_from_intpte(old);
3128 if ( cmpxchg((intpte_t *)pl1e, old, val) != old )
3130 if ( shadow_mode_enabled(d) )
3131 shadow_unlock(d);
3132 unmap_domain_page(pl1e);
3133 put_page_from_l1e(nl1e, d);
3134 return X86EMUL_CMPXCHG_FAILED;
3136 if ( unlikely(shadow_mode_enabled(v->domain)) )
3138 shadow_validate_guest_entry(v, _mfn(page_to_mfn(page)), pl1e);
3139 shadow_unlock(v->domain);
3142 else
3144 ol1e = *pl1e;
3145 if ( !update_l1e(pl1e, ol1e, nl1e, page_to_mfn(page), v) )
3146 BUG();
3149 unmap_domain_page(pl1e);
3151 /* Finally, drop the old PTE. */
3152 put_page_from_l1e(ol1e, d);
3154 return X86EMUL_CONTINUE;
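/*
 * Editorial illustration (not part of the original file): the "turn a
 * sub-word access into a full-word access" step above, reduced to plain
 * arithmetic.  'full' is the full-width value currently in memory; the
 * caller supplied 'val' covering 'bytes' bytes at byte offset 'offset'.
 */
#include <stdint.h>

static uint64_t ex_widen(uint64_t full, uint64_t val,
                         unsigned int bytes, unsigned int offset)
{
    uint64_t mask = (bytes < 8) ? (((uint64_t)1 << (bytes * 8)) - 1)
                                : ~(uint64_t)0;

    full &= ~(mask << (offset * 8));        /* clear the bytes being written */
    val  &= mask;                           /* keep only the caller's bytes  */
    return full | (val << (offset * 8));    /* merge them back in place      */
}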
3157 static int ptwr_emulated_write(
3158 unsigned long addr,
3159 unsigned long val,
3160 unsigned int bytes,
3161 struct x86_emulate_ctxt *ctxt)
3163 return ptwr_emulated_update(addr, 0, val, bytes, 0);
3166 static int ptwr_emulated_cmpxchg(
3167 unsigned long addr,
3168 unsigned long old,
3169 unsigned long new,
3170 unsigned int bytes,
3171 struct x86_emulate_ctxt *ctxt)
3173 return ptwr_emulated_update(addr, old, new, bytes, 1);
3176 static int ptwr_emulated_cmpxchg8b(
3177 unsigned long addr,
3178 unsigned long old,
3179 unsigned long old_hi,
3180 unsigned long new,
3181 unsigned long new_hi,
3182 struct x86_emulate_ctxt *ctxt)
3184 if ( CONFIG_PAGING_LEVELS == 2 )
3185 return X86EMUL_UNHANDLEABLE;
3186 else
3187 return ptwr_emulated_update(
3188 addr, ((u64)old_hi << 32) | old, ((u64)new_hi << 32) | new, 8, 1);
3191 static struct x86_emulate_ops ptwr_emulate_ops = {
3192 .read_std = x86_emulate_read_std,
3193 .write_std = x86_emulate_write_std,
3194 .read_emulated = x86_emulate_read_std,
3195 .write_emulated = ptwr_emulated_write,
3196 .cmpxchg_emulated = ptwr_emulated_cmpxchg,
3197 .cmpxchg8b_emulated = ptwr_emulated_cmpxchg8b
3198 };
3200 /* Write page fault handler: check if guest is trying to modify a PTE. */
3201 int ptwr_do_page_fault(struct domain *d, unsigned long addr,
3202 struct cpu_user_regs *regs)
3204 unsigned long pfn;
3205 struct page_info *page;
3206 l1_pgentry_t pte;
3207 l2_pgentry_t *pl2e, l2e;
3208 struct x86_emulate_ctxt emul_ctxt;
3210 LOCK_BIGLOCK(d);
3212 /*
3213 * Attempt to read the PTE that maps the VA being accessed. By checking for
3214 * PDE validity in the L2 we avoid many expensive fixups in __get_user().
3215 */
3216 pl2e = &__linear_l2_table[l2_linear_offset(addr)];
3217 if ( __copy_from_user(&l2e, pl2e, sizeof(l2e)) ||
3218 !(l2e_get_flags(l2e) & _PAGE_PRESENT) ||
3219 __copy_from_user(&pte, &linear_pg_table[l1_linear_offset(addr)],
3220 sizeof(pte)) )
3221 goto bail;
3223 pfn = l1e_get_pfn(pte);
3224 page = mfn_to_page(pfn);
3226 /* We are looking only for read-only mappings of p.t. pages. */
3227 if ( ((l1e_get_flags(pte) & (_PAGE_PRESENT|_PAGE_RW)) != _PAGE_PRESENT) ||
3228 ((page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table) ||
3229 ((page->u.inuse.type_info & PGT_count_mask) == 0) ||
3230 (page_get_owner(page) != d) )
3231 goto bail;
3233 emul_ctxt.regs = guest_cpu_user_regs();
3234 emul_ctxt.cr2 = addr;
3235 emul_ctxt.mode = X86EMUL_MODE_HOST;
3236 if ( x86_emulate_memop(&emul_ctxt, &ptwr_emulate_ops) )
3237 goto bail;
3239 UNLOCK_BIGLOCK(d);
3240 perfc_incrc(ptwr_emulations);
3241 return EXCRET_fault_fixed;
3243 bail:
3244 UNLOCK_BIGLOCK(d);
3245 return 0;
3248 int map_pages_to_xen(
3249 unsigned long virt,
3250 unsigned long mfn,
3251 unsigned long nr_mfns,
3252 unsigned long flags)
3254 l2_pgentry_t *pl2e, ol2e;
3255 l1_pgentry_t *pl1e, ol1e;
3256 unsigned int i;
3258 unsigned int map_small_pages = !!(flags & MAP_SMALL_PAGES);
3259 flags &= ~MAP_SMALL_PAGES;
3261 while ( nr_mfns != 0 )
3263 pl2e = virt_to_xen_l2e(virt);
3265 if ( ((((virt>>PAGE_SHIFT) | mfn) & ((1<<PAGETABLE_ORDER)-1)) == 0) &&
3266 (nr_mfns >= (1<<PAGETABLE_ORDER)) &&
3267 !map_small_pages )
3269 /* Super-page mapping. */
3270 ol2e = *pl2e;
3271 *pl2e = l2e_from_pfn(mfn, flags|_PAGE_PSE);
3273 if ( (l2e_get_flags(ol2e) & _PAGE_PRESENT) )
3275 local_flush_tlb_pge();
3276 if ( !(l2e_get_flags(ol2e) & _PAGE_PSE) )
3277 free_xen_pagetable(l2e_get_page(ol2e));
3280 virt += 1UL << L2_PAGETABLE_SHIFT;
3281 mfn += 1UL << PAGETABLE_ORDER;
3282 nr_mfns -= 1UL << PAGETABLE_ORDER;
3284 else
3286 /* Normal page mapping. */
3287 if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) )
3289 pl1e = page_to_virt(alloc_xen_pagetable());
3290 clear_page(pl1e);
3291 *pl2e = l2e_from_page(virt_to_page(pl1e), __PAGE_HYPERVISOR);
3293 else if ( l2e_get_flags(*pl2e) & _PAGE_PSE )
3295 pl1e = page_to_virt(alloc_xen_pagetable());
3296 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
3297 pl1e[i] = l1e_from_pfn(
3298 l2e_get_pfn(*pl2e) + i,
3299 l2e_get_flags(*pl2e) & ~_PAGE_PSE);
3300 *pl2e = l2e_from_page(virt_to_page(pl1e), __PAGE_HYPERVISOR);
3301 local_flush_tlb_pge();
3304 pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(virt);
3305 ol1e = *pl1e;
3306 *pl1e = l1e_from_pfn(mfn, flags);
3307 if ( (l1e_get_flags(ol1e) & _PAGE_PRESENT) )
3308 local_flush_tlb_one(virt);
3310 virt += 1UL << L1_PAGETABLE_SHIFT;
3311 mfn += 1UL;
3312 nr_mfns -= 1UL;
3316 return 0;
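/*
 * Editorial illustration (not part of the original file): the superpage test
 * above requires the virtual frame number and the mfn to share superpage
 * alignment and at least one full superpage of frames to remain.
 * EX_PT_ORDER stands in for PAGETABLE_ORDER (512 entries per table).
 */
#define EX_PT_ORDER 9

static int ex_can_use_superpage(unsigned long vfn, unsigned long mfn,
                                unsigned long nr_mfns)
{
    return (((vfn | mfn) & ((1ul << EX_PT_ORDER) - 1)) == 0) &&
           (nr_mfns >= (1ul << EX_PT_ORDER));
}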
3319 void __set_fixmap(
3320 enum fixed_addresses idx, unsigned long mfn, unsigned long flags)
3322 BUG_ON(idx >= __end_of_fixed_addresses);
3323 map_pages_to_xen(fix_to_virt(idx), mfn, 1, flags);
3326 #ifdef MEMORY_GUARD
3328 void memguard_init(void)
3330 map_pages_to_xen(
3331 PAGE_OFFSET, 0, xenheap_phys_end >> PAGE_SHIFT,
3332 __PAGE_HYPERVISOR|MAP_SMALL_PAGES);
3335 static void __memguard_change_range(void *p, unsigned long l, int guard)
3337 unsigned long _p = (unsigned long)p;
3338 unsigned long _l = (unsigned long)l;
3339 unsigned long flags = __PAGE_HYPERVISOR | MAP_SMALL_PAGES;
3341 /* Ensure we are dealing with a page-aligned whole number of pages. */
3342 ASSERT((_p&PAGE_MASK) != 0);
3343 ASSERT((_l&PAGE_MASK) != 0);
3344 ASSERT((_p&~PAGE_MASK) == 0);
3345 ASSERT((_l&~PAGE_MASK) == 0);
3347 if ( guard )
3348 flags &= ~_PAGE_PRESENT;
3350 map_pages_to_xen(
3351 _p, virt_to_maddr(p) >> PAGE_SHIFT, _l >> PAGE_SHIFT, flags);
3354 void memguard_guard_range(void *p, unsigned long l)
3356 __memguard_change_range(p, l, 1);
3359 void memguard_unguard_range(void *p, unsigned long l)
3361 __memguard_change_range(p, l, 0);
3364 #endif
3366 void memguard_guard_stack(void *p)
3368 BUILD_BUG_ON((DEBUG_STACK_SIZE + PAGE_SIZE) > STACK_SIZE);
3369 p = (void *)((unsigned long)p + STACK_SIZE - DEBUG_STACK_SIZE - PAGE_SIZE);
3370 memguard_guard_range(p, PAGE_SIZE);
3373 /*
3374 * Local variables:
3375 * mode: C
3376 * c-set-style: "BSD"
3377 * c-basic-offset: 4
3378 * tab-width: 4
3379 * indent-tabs-mode: nil
3380 * End:
3381 */