ia64/xen-unstable: xen/arch/x86/mm.c @ 18394:dade7f0bdc8d

hvm: Use main memory for video memory.

When creating an HVM domain, if (for example) another domain is created
before qemu allocates video memory, the extra 8MB of ballooned memory is
no longer available, because it has already been consumed by the other
domain.

This patch fixes that by taking the video memory from main memory:

- make hvmloader use e820_malloc to reserve some of the main memory
and notify ioemu of its address through the Xen platform PCI card.
- add XENMAPSPACE_mfn to the xen_add_to_physmap memory op, to allow
ioemu to move the MFNs between their original position and the PCI
mapping when LFB acceleration is disabled/enabled (see the usage
sketch below).
- add a remove_from_physmap memory op, to allow ioemu to unmap it
completely for the case of old guests with acceleration disabled.
- add xc_domain_memory_translate_gpfn_list to libxc to allow ioemu to
get the MFNs of the video memory.
- have xend save the PCI memory space instead of ioemu: if a memory
page is there, the guest can access it like ordinary memory, so xend
can safely be responsible for saving it. An extra benefit is that
live migration will apply the log-dirty optimization there too.
- handle old saved images, populating the video memory from ioemu if
really needed.

Signed-off-by: Samuel Thibault <samuel.thibault@eu.citrix.com>
author Keir Fraser <keir.fraser@citrix.com>
date Wed Aug 27 14:53:39 2008 +0100 (2008-08-27)
parents 1e99ba540356
children 86b956d8cf04
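
As a concrete illustration of the XENMAPSPACE_mfn flow described above, here
is a minimal sketch of how ioemu might remap one video-RAM frame to the guest
frame backing the PCI LFB BAR. It is written against the public
xen_add_to_physmap interface; the XENMAPSPACE_mfn field encoding (idx =
source frame, gpfn = destination frame) and the use of this era's libxc
xc_memory_op() wrapper are assumptions based on the description above, not
code taken from this changeset.

    #include <xenctrl.h>      /* xc_memory_op() */
    #include <xen/memory.h>   /* struct xen_add_to_physmap, XENMEM_* */

    /* Hypothetical ioemu helper: make frame 'mfn' of the reserved video
     * memory appear at guest frame 'bar_gpfn' (the LFB BAR mapping). */
    static int map_vram_frame_at_bar(int xc_handle, domid_t domid,
                                     unsigned long mfn, unsigned long bar_gpfn)
    {
        struct xen_add_to_physmap xatp = {
            .domid = domid,
            .space = XENMAPSPACE_mfn,   /* added by this changeset */
            .idx   = mfn,               /* assumed: frame to relocate */
            .gpfn  = bar_gpfn,          /* destination guest frame */
        };

        return xc_memory_op(xc_handle, XENMEM_add_to_physmap, &xatp);
    }

When LFB acceleration is turned off again, the reverse move (or, for old
guests, the new remove_from_physmap operation) would be issued the same way.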
line source
1 /******************************************************************************
2 * arch/x86/mm.c
3 *
4 * Copyright (c) 2002-2005 K A Fraser
5 * Copyright (c) 2004 Christian Limpach
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
22 /*
23 * A description of the x86 page table API:
24 *
25 * Domains trap to do_mmu_update with a list of update requests.
26 * This is a list of (ptr, val) pairs, where the requested operation
27 * is *ptr = val.
28 *
29 * Reference counting of pages:
30 * ----------------------------
31 * Each page has two refcounts: tot_count and type_count.
32 *
33 * TOT_COUNT is the obvious reference count. It counts all uses of a
34 * physical page frame by a domain, including uses as a page directory,
35 * a page table, or simple mappings via a PTE. This count prevents a
36 * domain from releasing a frame back to the free pool when it still holds
37 * a reference to it.
38 *
39 * TYPE_COUNT is more subtle. A frame can be put to one of three
40 * mutually-exclusive uses: it might be used as a page directory, or a
41 * page table, or it may be mapped writable by the domain [of course, a
42 * frame need not be used in any of these three ways!].
43 * So, type_count is a count of the number of times a frame is being
44 * referred to in its current incarnation. Therefore, a page can only
45 * change its type when its type count is zero.
46 *
47 * Pinning the page type:
48 * ----------------------
49 * The type of a page can be pinned/unpinned with the commands
50 * MMUEXT_[UN]PIN_L?_TABLE. Each page can be pinned exactly once (that is,
51 * pinning is not reference counted, so it can't be nested).
52 * This is useful to prevent a page's type count falling to zero, at which
53 * point safety checks would need to be carried out next time the count
54 * is increased again.
55 *
56 * A further note on writable page mappings:
57 * -----------------------------------------
58 * For simplicity, the count of writable mappings for a page may not
59 * correspond to reality. The 'writable count' is incremented for every
60 * PTE which maps the page with the _PAGE_RW flag set. However, for
61 * write access to be possible the page directory entry must also have
62 * its _PAGE_RW bit set. We do not check this as it complicates the
63 * reference counting considerably [consider the case of multiple
64 * directory entries referencing a single page table, some with the RW
65 * bit set, others not -- it starts getting a bit messy].
66 * In normal use, this simplification shouldn't be a problem.
67 * However, the logic can be added if required.
68 *
69 * One more note on read-only page mappings:
70 * -----------------------------------------
71 * We want domains to be able to map pages for read-only access. The
72 * main reason is that page tables and directories should be readable
73 * by a domain, but it would not be safe for them to be writable.
74 * However, domains have free access to rings 1 & 2 of the Intel
75 * privilege model. In terms of page protection, these are considered
76 * to be part of 'supervisor mode'. The WP bit in CR0 controls whether
77 * read-only restrictions are respected in supervisor mode -- if the
78 * bit is clear then any mapped page is writable.
79 *
80 * We get round this by always setting the WP bit and disallowing
81 * updates to it. This is very unlikely to cause a problem for guest
82 * OS's, which will generally use the WP bit to simplify copy-on-write
83 * implementation (in that case, OS wants a fault when it writes to
84 * an application-supplied buffer).
85 */
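/*
 * Editorial illustration (not part of the original file): how a PV guest
 * kernel might use the (ptr, val) interface described above, batching two
 * PTE writes into one do_mmu_update hypercall.  The mmu_update structure,
 * the MMU_NORMAL_PT_UPDATE encoding in the low bits of 'ptr' and the
 * HYPERVISOR_mmu_update() wrapper are the public interface of this era;
 * the surrounding guest-side helper names vary by OS.
 *
 *     struct mmu_update req[2];
 *
 *     req[0].ptr = pte0_machine_addr | MMU_NORMAL_PT_UPDATE;
 *     req[0].val = new_pte0;
 *     req[1].ptr = pte1_machine_addr | MMU_NORMAL_PT_UPDATE;
 *     req[1].val = new_pte1;
 *
 *     if ( HYPERVISOR_mmu_update(req, 2, NULL, DOMID_SELF) < 0 )
 *         BUG();
 *
 * Each request is validated against the reference-counting and type rules
 * described above before the write is performed.
 */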
87 #include <xen/config.h>
88 #include <xen/init.h>
89 #include <xen/kernel.h>
90 #include <xen/lib.h>
91 #include <xen/mm.h>
92 #include <xen/domain.h>
93 #include <xen/sched.h>
94 #include <xen/errno.h>
95 #include <xen/perfc.h>
96 #include <xen/irq.h>
97 #include <xen/softirq.h>
98 #include <xen/domain_page.h>
99 #include <xen/event.h>
100 #include <xen/iocap.h>
101 #include <xen/guest_access.h>
102 #include <asm/paging.h>
103 #include <asm/shadow.h>
104 #include <asm/page.h>
105 #include <asm/flushtlb.h>
106 #include <asm/io.h>
107 #include <asm/ldt.h>
108 #include <asm/x86_emulate.h>
109 #include <asm/e820.h>
110 #include <asm/hypercall.h>
111 #include <asm/shared.h>
112 #include <public/memory.h>
113 #include <xsm/xsm.h>
114 #include <xen/trace.h>
116 /*
117 * Mapping of first 2 or 4 megabytes of memory. This is mapped with 4kB
118 * mappings to avoid type conflicts with fixed-range MTRRs covering the
119 * lowest megabyte of physical memory. In any case the VGA hole should be
120 * mapped with type UC.
121 */
122 l1_pgentry_t __attribute__ ((__section__ (".bss.page_aligned")))
123 l1_identmap[L1_PAGETABLE_ENTRIES];
125 #define MEM_LOG(_f, _a...) gdprintk(XENLOG_WARNING , _f "\n" , ## _a)
127 /*
128 * PTE updates can be done with ordinary writes except:
129 * 1. Debug builds get extra checking by using CMPXCHG[8B].
130 * 2. PAE builds perform an atomic 8-byte store with CMPXCHG8B.
131 */
132 #if !defined(NDEBUG) || defined(__i386__)
133 #define PTE_UPDATE_WITH_CMPXCHG
134 #endif
136 /* Used to defer flushing of memory structures. */
137 struct percpu_mm_info {
138 #define DOP_FLUSH_TLB (1<<0) /* Flush the local TLB. */
139 #define DOP_FLUSH_ALL_TLBS (1<<1) /* Flush TLBs of all VCPUs of current dom. */
140 #define DOP_RELOAD_LDT (1<<2) /* Reload the LDT shadow mapping. */
141 unsigned int deferred_ops;
142 /* If non-NULL, specifies a foreign subject domain for some operations. */
143 struct domain *foreign;
144 };
145 static DEFINE_PER_CPU(struct percpu_mm_info, percpu_mm_info);
147 /*
148 * Returns the current foreign domain; defaults to the currently-executing
149 * domain if a foreign override hasn't been specified.
150 */
151 #define FOREIGNDOM (this_cpu(percpu_mm_info).foreign ?: current->domain)
153 /* Private domain structs for DOMID_XEN and DOMID_IO. */
154 struct domain *dom_xen, *dom_io;
156 /* Frame table and its size in pages. */
157 struct page_info *frame_table;
158 unsigned long max_page;
159 unsigned long total_pages;
161 #define PAGE_CACHE_ATTRS (_PAGE_PAT|_PAGE_PCD|_PAGE_PWT)
163 #define l1_disallow_mask(d) \
164 ((d != dom_io) && \
165 (rangeset_is_empty((d)->iomem_caps) && \
166 rangeset_is_empty((d)->arch.ioport_caps) && \
167 !has_arch_pdevs(d)) ? \
168 L1_DISALLOW_MASK : (L1_DISALLOW_MASK & ~PAGE_CACHE_ATTRS))
170 #ifdef CONFIG_COMPAT
171 l2_pgentry_t *compat_idle_pg_table_l2 = NULL;
172 #define l3_disallow_mask(d) (!is_pv_32on64_domain(d) ? \
173 L3_DISALLOW_MASK : \
174 COMPAT_L3_DISALLOW_MASK)
175 #else
176 #define l3_disallow_mask(d) L3_DISALLOW_MASK
177 #endif
179 static void queue_deferred_ops(struct domain *d, unsigned int ops)
180 {
181 ASSERT(d == current->domain);
182 this_cpu(percpu_mm_info).deferred_ops |= ops;
183 }
185 void __init init_frametable(void)
186 {
187 unsigned long nr_pages, page_step, i, mfn;
189 frame_table = (struct page_info *)FRAMETABLE_VIRT_START;
191 nr_pages = PFN_UP(max_page * sizeof(*frame_table));
192 page_step = (1 << L2_PAGETABLE_SHIFT) >> PAGE_SHIFT;
194 for ( i = 0; i < nr_pages; i += page_step )
195 {
196 mfn = alloc_boot_pages(min(nr_pages - i, page_step), page_step);
197 if ( mfn == 0 )
198 panic("Not enough memory for frame table\n");
199 map_pages_to_xen(
200 FRAMETABLE_VIRT_START + (i << PAGE_SHIFT),
201 mfn, page_step, PAGE_HYPERVISOR);
202 }
204 memset(frame_table, 0, nr_pages << PAGE_SHIFT);
206 #if defined(__x86_64__)
207 for ( i = 0; i < max_page; i ++ )
208 spin_lock_init(&frame_table[i].lock);
209 #endif
210 }
212 void __init arch_init_memory(void)
213 {
214 extern void subarch_init_memory(void);
216 unsigned long i, pfn, rstart_pfn, rend_pfn, iostart_pfn, ioend_pfn;
218 /*
219 * Initialise our DOMID_XEN domain.
220 * Any Xen-heap pages that we will allow to be mapped will have
221 * their domain field set to dom_xen.
222 */
223 dom_xen = domain_create(DOMID_XEN, DOMCRF_dummy, 0);
224 BUG_ON(dom_xen == NULL);
226 /*
227 * Initialise our DOMID_IO domain.
228 * This domain owns I/O pages that are within the range of the page_info
229 * array. Mappings occur at the privilege level of the caller.
230 */
231 dom_io = domain_create(DOMID_IO, DOMCRF_dummy, 0);
232 BUG_ON(dom_io == NULL);
234 /* First 1MB of RAM is historically marked as I/O. */
235 for ( i = 0; i < 0x100; i++ )
236 share_xen_page_with_guest(mfn_to_page(i), dom_io, XENSHARE_writable);
238 /* Any areas not specified as RAM by the e820 map are considered I/O. */
239 for ( i = 0, pfn = 0; pfn < max_page; i++ )
240 {
241 while ( (i < e820.nr_map) &&
242 (e820.map[i].type != E820_RAM) &&
243 (e820.map[i].type != E820_UNUSABLE) )
244 i++;
246 if ( i >= e820.nr_map )
247 {
248 /* No more RAM regions: mark as I/O right to end of memory map. */
249 rstart_pfn = rend_pfn = max_page;
250 }
251 else
252 {
253 /* Mark as I/O only up to the next RAM region. */
254 rstart_pfn = min_t(unsigned long, max_page,
255 PFN_UP(e820.map[i].addr));
256 rend_pfn = max_t(unsigned long, rstart_pfn,
257 PFN_DOWN(e820.map[i].addr + e820.map[i].size));
258 }
260 /*
261 * Make sure any Xen mappings of RAM holes above 1MB are blown away.
262 * In particular this ensures that RAM holes are respected even in
263 * the statically-initialised 1-16MB mapping area.
264 */
265 iostart_pfn = max_t(unsigned long, pfn, 1UL << (20 - PAGE_SHIFT));
266 ioend_pfn = rstart_pfn;
267 #if defined(CONFIG_X86_32)
268 ioend_pfn = min_t(unsigned long, ioend_pfn,
269 DIRECTMAP_MBYTES << (20 - PAGE_SHIFT));
270 #endif
271 if ( iostart_pfn < ioend_pfn )
272 destroy_xen_mappings((unsigned long)mfn_to_virt(iostart_pfn),
273 (unsigned long)mfn_to_virt(ioend_pfn));
275 /* Mark as I/O up to next RAM region. */
276 for ( ; pfn < rstart_pfn; pfn++ )
277 {
278 BUG_ON(!mfn_valid(pfn));
279 share_xen_page_with_guest(
280 mfn_to_page(pfn), dom_io, XENSHARE_writable);
281 }
283 /* Skip the RAM region. */
284 pfn = rend_pfn;
285 }
287 subarch_init_memory();
288 }
290 int memory_is_conventional_ram(paddr_t p)
291 {
292 int i;
294 for ( i = 0; i < e820.nr_map; i++ )
295 {
296 if ( (e820.map[i].type == E820_RAM) &&
297 (e820.map[i].addr <= p) &&
298 (e820.map[i].addr + e820.map[i].size > p) )
299 return 1;
300 }
302 return 0;
303 }
305 unsigned long domain_get_maximum_gpfn(struct domain *d)
306 {
307 if ( is_hvm_domain(d) )
308 return d->arch.p2m->max_mapped_pfn;
309 /* NB. PV guests specify nr_pfns rather than max_pfn so we adjust here. */
310 return arch_get_max_pfn(d) - 1;
311 }
313 void share_xen_page_with_guest(
314 struct page_info *page, struct domain *d, int readonly)
315 {
316 if ( page_get_owner(page) == d )
317 return;
319 set_gpfn_from_mfn(page_to_mfn(page), INVALID_M2P_ENTRY);
321 spin_lock(&d->page_alloc_lock);
323 /* The incremented type count pins as writable or read-only. */
324 page->u.inuse.type_info = (readonly ? PGT_none : PGT_writable_page);
325 page->u.inuse.type_info |= PGT_validated | 1;
327 page_set_owner(page, d);
328 wmb(); /* install valid domain ptr before updating refcnt. */
329 ASSERT(page->count_info == 0);
331 /* Only add to the allocation list if the domain isn't dying. */
332 if ( !d->is_dying )
333 {
334 page->count_info |= PGC_allocated | 1;
335 if ( unlikely(d->xenheap_pages++ == 0) )
336 get_knownalive_domain(d);
337 list_add_tail(&page->list, &d->xenpage_list);
338 }
340 spin_unlock(&d->page_alloc_lock);
341 }
343 void share_xen_page_with_privileged_guests(
344 struct page_info *page, int readonly)
345 {
346 share_xen_page_with_guest(page, dom_xen, readonly);
347 }
349 #if defined(__i386__)
351 #ifdef NDEBUG
352 /* Only PDPTs above 4GB boundary need to be shadowed in low memory. */
353 #define l3tab_needs_shadow(mfn) ((mfn) >= 0x100000)
354 #else
355 /*
356 * In debug builds we shadow a selection of <4GB PDPTs to exercise code paths.
357 * We cannot safely shadow the idle page table, nor shadow (v1) page tables
358 * (detected by lack of an owning domain). As required for correctness, we
359 * always shadow PDPTs above 4GB.
360 */
361 #define l3tab_needs_shadow(mfn) \
362 (((((mfn) << PAGE_SHIFT) != __pa(idle_pg_table)) && \
363 (page_get_owner(mfn_to_page(mfn)) != NULL) && \
364 ((mfn) & 1)) || /* odd MFNs are shadowed */ \
365 ((mfn) >= 0x100000))
366 #endif
368 static l1_pgentry_t *fix_pae_highmem_pl1e;
370 /* Cache the address of PAE high-memory fixmap page tables. */
371 static int __init cache_pae_fixmap_address(void)
372 {
373 unsigned long fixmap_base = fix_to_virt(FIX_PAE_HIGHMEM_0);
374 l2_pgentry_t *pl2e = virt_to_xen_l2e(fixmap_base);
375 fix_pae_highmem_pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(fixmap_base);
376 return 0;
377 }
378 __initcall(cache_pae_fixmap_address);
380 static DEFINE_PER_CPU(u32, make_cr3_timestamp);
382 void make_cr3(struct vcpu *v, unsigned long mfn)
383 /* Takes the MFN of a PAE l3 table, copies the contents to below 4GB if
384 * necessary, and sets v->arch.cr3 to the value to load in CR3. */
385 {
386 l3_pgentry_t *highmem_l3tab, *lowmem_l3tab;
387 struct pae_l3_cache *cache = &v->arch.pae_l3_cache;
388 unsigned int cpu = smp_processor_id();
390 /* Fast path: does this mfn need a shadow at all? */
391 if ( !l3tab_needs_shadow(mfn) )
392 {
393 v->arch.cr3 = mfn << PAGE_SHIFT;
394 /* Cache is no longer in use or valid */
395 cache->high_mfn = 0;
396 return;
397 }
399 /* Caching logic is not interrupt safe. */
400 ASSERT(!in_irq());
402 /* Protects against pae_flush_pgd(). */
403 spin_lock(&cache->lock);
405 cache->inuse_idx ^= 1;
406 cache->high_mfn = mfn;
408 /* Map the guest L3 table and copy to the chosen low-memory cache. */
409 l1e_write(fix_pae_highmem_pl1e-cpu, l1e_from_pfn(mfn, __PAGE_HYPERVISOR));
410 /* First check the previous high mapping can't be in the TLB.
411 * (i.e. have we loaded CR3 since we last did this?) */
412 if ( unlikely(this_cpu(make_cr3_timestamp) == this_cpu(tlbflush_time)) )
413 flush_tlb_one_local(fix_to_virt(FIX_PAE_HIGHMEM_0 + cpu));
414 highmem_l3tab = (l3_pgentry_t *)fix_to_virt(FIX_PAE_HIGHMEM_0 + cpu);
415 lowmem_l3tab = cache->table[cache->inuse_idx];
416 memcpy(lowmem_l3tab, highmem_l3tab, sizeof(cache->table[0]));
417 l1e_write(fix_pae_highmem_pl1e-cpu, l1e_empty());
418 this_cpu(make_cr3_timestamp) = this_cpu(tlbflush_time);
420 v->arch.cr3 = __pa(lowmem_l3tab);
422 spin_unlock(&cache->lock);
423 }
425 #else /* !defined(__i386__) */
427 void make_cr3(struct vcpu *v, unsigned long mfn)
428 {
429 v->arch.cr3 = mfn << PAGE_SHIFT;
430 }
432 #endif /* !defined(__i386__) */
434 void write_ptbase(struct vcpu *v)
435 {
436 write_cr3(v->arch.cr3);
437 }
439 /*
440 * Should be called after CR3 is updated.
441 *
442 * Uses values found in vcpu->arch.(guest_table and guest_table_user), and
443 * for HVM guests, arch.monitor_table and hvm's guest CR3.
444 *
445 * Update ref counts to shadow tables appropriately.
446 */
447 void update_cr3(struct vcpu *v)
448 {
449 unsigned long cr3_mfn=0;
451 if ( paging_mode_enabled(v->domain) )
452 {
453 paging_update_cr3(v);
454 return;
455 }
457 #if CONFIG_PAGING_LEVELS == 4
458 if ( !(v->arch.flags & TF_kernel_mode) )
459 cr3_mfn = pagetable_get_pfn(v->arch.guest_table_user);
460 else
461 #endif
462 cr3_mfn = pagetable_get_pfn(v->arch.guest_table);
464 make_cr3(v, cr3_mfn);
465 }
468 static void invalidate_shadow_ldt(struct vcpu *v)
469 {
470 int i;
471 unsigned long pfn;
472 struct page_info *page;
474 if ( v->arch.shadow_ldt_mapcnt == 0 )
475 return;
477 v->arch.shadow_ldt_mapcnt = 0;
479 for ( i = 16; i < 32; i++ )
480 {
481 pfn = l1e_get_pfn(v->arch.perdomain_ptes[i]);
482 if ( pfn == 0 ) continue;
483 l1e_write(&v->arch.perdomain_ptes[i], l1e_empty());
484 page = mfn_to_page(pfn);
485 ASSERT_PAGE_IS_TYPE(page, PGT_seg_desc_page);
486 ASSERT_PAGE_IS_DOMAIN(page, v->domain);
487 put_page_and_type(page);
488 }
490 /* Dispose of the (now possibly invalid) mappings from the TLB. */
491 if ( v == current )
492 queue_deferred_ops(v->domain, DOP_FLUSH_TLB | DOP_RELOAD_LDT);
493 else
494 flush_tlb_mask(v->domain->domain_dirty_cpumask);
495 }
498 static int alloc_segdesc_page(struct page_info *page)
499 {
500 struct desc_struct *descs;
501 int i;
503 descs = map_domain_page(page_to_mfn(page));
505 for ( i = 0; i < 512; i++ )
506 if ( unlikely(!check_descriptor(page_get_owner(page), &descs[i])) )
507 goto fail;
509 unmap_domain_page(descs);
510 return 1;
512 fail:
513 unmap_domain_page(descs);
514 return 0;
515 }
518 /* Map shadow page at offset @off. */
519 int map_ldt_shadow_page(unsigned int off)
520 {
521 struct vcpu *v = current;
522 struct domain *d = v->domain;
523 unsigned long gmfn, mfn;
524 l1_pgentry_t l1e, nl1e;
525 unsigned long gva = v->arch.guest_context.ldt_base + (off << PAGE_SHIFT);
526 int okay;
528 BUG_ON(unlikely(in_irq()));
530 guest_get_eff_kern_l1e(v, gva, &l1e);
531 if ( unlikely(!(l1e_get_flags(l1e) & _PAGE_PRESENT)) )
532 return 0;
534 gmfn = l1e_get_pfn(l1e);
535 mfn = gmfn_to_mfn(d, gmfn);
536 if ( unlikely(!mfn_valid(mfn)) )
537 return 0;
539 okay = get_page_and_type(mfn_to_page(mfn), d, PGT_seg_desc_page);
540 if ( unlikely(!okay) )
541 return 0;
543 nl1e = l1e_from_pfn(mfn, l1e_get_flags(l1e) | _PAGE_RW);
545 l1e_write(&v->arch.perdomain_ptes[off + 16], nl1e);
546 v->arch.shadow_ldt_mapcnt++;
548 return 1;
549 }
552 static int get_page_from_pagenr(unsigned long page_nr, struct domain *d)
553 {
554 struct page_info *page = mfn_to_page(page_nr);
556 if ( unlikely(!mfn_valid(page_nr)) || unlikely(!get_page(page, d)) )
557 {
558 MEM_LOG("Could not get page ref for pfn %lx", page_nr);
559 return 0;
560 }
562 return 1;
563 }
566 static int get_page_and_type_from_pagenr(unsigned long page_nr,
567 unsigned long type,
568 struct domain *d)
569 {
570 struct page_info *page = mfn_to_page(page_nr);
572 if ( unlikely(!get_page_from_pagenr(page_nr, d)) )
573 return 0;
575 if ( unlikely(!get_page_type(page, type)) )
576 {
577 put_page(page);
578 return 0;
579 }
581 return 1;
582 }
584 /*
585 * We allow root tables to map each other (a.k.a. linear page tables). It
586 * needs some special care with reference counts and access permissions:
587 * 1. The mapping entry must be read-only, or the guest may get write access
588 * to its own PTEs.
589 * 2. We must only bump the reference counts for an *already validated*
590 * L2 table, or we can end up in a deadlock in get_page_type() by waiting
591 * on a validation that is required to complete that validation.
592 * 3. We only need to increment the reference counts for the mapped page
593 * frame if it is mapped by a different root table. This is sufficient and
594 * also necessary to allow validation of a root table mapping itself.
595 */
596 #define define_get_linear_pagetable(level) \
597 static int \
598 get_##level##_linear_pagetable( \
599 level##_pgentry_t pde, unsigned long pde_pfn, struct domain *d) \
600 { \
601 unsigned long x, y; \
602 struct page_info *page; \
603 unsigned long pfn; \
604 \
605 if ( (level##e_get_flags(pde) & _PAGE_RW) ) \
606 { \
607 MEM_LOG("Attempt to create linear p.t. with write perms"); \
608 return 0; \
609 } \
610 \
611 if ( (pfn = level##e_get_pfn(pde)) != pde_pfn ) \
612 { \
613 /* Make sure the mapped frame belongs to the correct domain. */ \
614 if ( unlikely(!get_page_from_pagenr(pfn, d)) ) \
615 return 0; \
616 \
617 /* \
618 * Ensure that the mapped frame is an already-validated page table. \
619 * If so, atomically increment the count (checking for overflow). \
620 */ \
621 page = mfn_to_page(pfn); \
622 y = page->u.inuse.type_info; \
623 do { \
624 x = y; \
625 if ( unlikely((x & PGT_count_mask) == PGT_count_mask) || \
626 unlikely((x & (PGT_type_mask|PGT_validated)) != \
627 (PGT_##level##_page_table|PGT_validated)) ) \
628 { \
629 put_page(page); \
630 return 0; \
631 } \
632 } \
633 while ( (y = cmpxchg(&page->u.inuse.type_info, x, x + 1)) != x ); \
634 } \
635 \
636 return 1; \
637 }
640 int is_iomem_page(unsigned long mfn)
641 {
642 return (!mfn_valid(mfn) || (page_get_owner(mfn_to_page(mfn)) == dom_io));
643 }
646 int
647 get_page_from_l1e(
648 l1_pgentry_t l1e, struct domain *d)
649 {
650 unsigned long mfn = l1e_get_pfn(l1e);
651 struct page_info *page = mfn_to_page(mfn);
652 uint32_t l1f = l1e_get_flags(l1e);
653 struct vcpu *curr = current;
654 struct domain *owner;
655 int okay;
657 if ( !(l1f & _PAGE_PRESENT) )
658 return 1;
660 if ( unlikely(l1f & l1_disallow_mask(d)) )
661 {
662 MEM_LOG("Bad L1 flags %x", l1f & l1_disallow_mask(d));
663 return 0;
664 }
666 if ( is_iomem_page(mfn) )
667 {
668 /* DOMID_IO reverts to caller for privilege checks. */
669 if ( d == dom_io )
670 d = curr->domain;
672 if ( !iomem_access_permitted(d, mfn, mfn) )
673 {
674 if ( mfn != (PADDR_MASK >> PAGE_SHIFT) ) /* INVALID_MFN? */
675 MEM_LOG("Non-privileged (%u) attempt to map I/O space %08lx",
676 d->domain_id, mfn);
677 return 0;
678 }
680 return 1;
681 }
683 /*
684 * Let privileged domains transfer the right to map their target
685 * domain's pages. This is used to allow stub-domain pvfb export to dom0,
686 * until pvfb supports granted mappings. At that time this minor hack
687 * can go away.
688 */
689 owner = page_get_owner(page);
690 if ( unlikely(d != owner) && (owner != NULL) &&
691 (d != curr->domain) && IS_PRIV_FOR(d, owner) )
692 d = owner;
694 /* Foreign mappings into guests in shadow external mode don't
695 * contribute to writeable mapping refcounts. (This allows the
696 * qemu-dm helper process in dom0 to map the domain's memory without
697 * messing up the count of "real" writable mappings.) */
698 okay = (((l1f & _PAGE_RW) &&
699 !(unlikely(paging_mode_external(d) && (d != curr->domain))))
700 ? get_page_and_type(page, d, PGT_writable_page)
701 : get_page(page, d));
702 if ( !okay )
703 {
704 MEM_LOG("Error getting mfn %lx (pfn %lx) from L1 entry %" PRIpte
705 " for dom%d",
706 mfn, get_gpfn_from_mfn(mfn),
707 l1e_get_intpte(l1e), d->domain_id);
708 }
709 else if ( pte_flags_to_cacheattr(l1f) !=
710 ((page->count_info >> PGC_cacheattr_base) & 7) )
711 {
712 uint32_t x, nx, y = page->count_info;
713 uint32_t cacheattr = pte_flags_to_cacheattr(l1f);
715 if ( is_xen_heap_page(page) )
716 {
717 if ( (l1f & _PAGE_RW) &&
718 !(unlikely(paging_mode_external(d) &&
719 (d != curr->domain))) )
720 put_page_type(page);
721 put_page(page);
722 MEM_LOG("Attempt to change cache attributes of Xen heap page");
723 return 0;
724 }
726 while ( ((y >> PGC_cacheattr_base) & 7) != cacheattr )
727 {
728 x = y;
729 nx = (x & ~PGC_cacheattr_mask) | (cacheattr << PGC_cacheattr_base);
730 y = cmpxchg(&page->count_info, x, nx);
731 }
733 #ifdef __x86_64__
734 map_pages_to_xen((unsigned long)mfn_to_virt(mfn), mfn, 1,
735 PAGE_HYPERVISOR | cacheattr_to_pte_flags(cacheattr));
736 #endif
737 }
739 return okay;
740 }
743 /* NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. */
744 define_get_linear_pagetable(l2);
745 static int
746 get_page_from_l2e(
747 l2_pgentry_t l2e, unsigned long pfn, struct domain *d)
748 {
749 int rc;
751 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
752 return 1;
754 if ( unlikely((l2e_get_flags(l2e) & L2_DISALLOW_MASK)) )
755 {
756 MEM_LOG("Bad L2 flags %x", l2e_get_flags(l2e) & L2_DISALLOW_MASK);
757 return 0;
758 }
760 rc = get_page_and_type_from_pagenr(l2e_get_pfn(l2e), PGT_l1_page_table, d);
761 if ( unlikely(!rc) )
762 rc = get_l2_linear_pagetable(l2e, pfn, d);
764 return rc;
765 }
768 #if CONFIG_PAGING_LEVELS >= 3
769 define_get_linear_pagetable(l3);
770 static int
771 get_page_from_l3e(
772 l3_pgentry_t l3e, unsigned long pfn, struct domain *d)
773 {
774 int rc;
776 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
777 return 1;
779 if ( unlikely((l3e_get_flags(l3e) & l3_disallow_mask(d))) )
780 {
781 MEM_LOG("Bad L3 flags %x", l3e_get_flags(l3e) & l3_disallow_mask(d));
782 return 0;
783 }
785 rc = get_page_and_type_from_pagenr(l3e_get_pfn(l3e), PGT_l2_page_table, d);
786 if ( unlikely(!rc) )
787 rc = get_l3_linear_pagetable(l3e, pfn, d);
789 return rc;
790 }
791 #endif /* 3 level */
793 #if CONFIG_PAGING_LEVELS >= 4
794 define_get_linear_pagetable(l4);
795 static int
796 get_page_from_l4e(
797 l4_pgentry_t l4e, unsigned long pfn, struct domain *d)
798 {
799 int rc;
801 if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
802 return 1;
804 if ( unlikely((l4e_get_flags(l4e) & L4_DISALLOW_MASK)) )
805 {
806 MEM_LOG("Bad L4 flags %x", l4e_get_flags(l4e) & L4_DISALLOW_MASK);
807 return 0;
808 }
810 rc = get_page_and_type_from_pagenr(l4e_get_pfn(l4e), PGT_l3_page_table, d);
811 if ( unlikely(!rc) )
812 rc = get_l4_linear_pagetable(l4e, pfn, d);
814 return rc;
815 }
816 #endif /* 4 level */
818 #ifdef __x86_64__
820 #ifdef USER_MAPPINGS_ARE_GLOBAL
821 #define adjust_guest_l1e(pl1e, d) \
822 do { \
823 if ( likely(l1e_get_flags((pl1e)) & _PAGE_PRESENT) && \
824 likely(!is_pv_32on64_domain(d)) ) \
825 { \
826 /* _PAGE_GUEST_KERNEL page cannot have the Global bit set. */ \
827 if ( (l1e_get_flags((pl1e)) & (_PAGE_GUEST_KERNEL|_PAGE_GLOBAL)) \
828 == (_PAGE_GUEST_KERNEL|_PAGE_GLOBAL) ) \
829 MEM_LOG("Global bit is set to kernel page %lx", \
830 l1e_get_pfn((pl1e))); \
831 if ( !(l1e_get_flags((pl1e)) & _PAGE_USER) ) \
832 l1e_add_flags((pl1e), (_PAGE_GUEST_KERNEL|_PAGE_USER)); \
833 if ( !(l1e_get_flags((pl1e)) & _PAGE_GUEST_KERNEL) ) \
834 l1e_add_flags((pl1e), (_PAGE_GLOBAL|_PAGE_USER)); \
835 } \
836 } while ( 0 )
837 #else
838 #define adjust_guest_l1e(pl1e, d) \
839 do { \
840 if ( likely(l1e_get_flags((pl1e)) & _PAGE_PRESENT) && \
841 likely(!is_pv_32on64_domain(d)) ) \
842 l1e_add_flags((pl1e), _PAGE_USER); \
843 } while ( 0 )
844 #endif
846 #define adjust_guest_l2e(pl2e, d) \
847 do { \
848 if ( likely(l2e_get_flags((pl2e)) & _PAGE_PRESENT) && \
849 likely(!is_pv_32on64_domain(d)) ) \
850 l2e_add_flags((pl2e), _PAGE_USER); \
851 } while ( 0 )
853 #define adjust_guest_l3e(pl3e, d) \
854 do { \
855 if ( likely(l3e_get_flags((pl3e)) & _PAGE_PRESENT) ) \
856 l3e_add_flags((pl3e), likely(!is_pv_32on64_domain(d)) ? \
857 _PAGE_USER : \
858 _PAGE_USER|_PAGE_RW); \
859 } while ( 0 )
861 #define adjust_guest_l4e(pl4e, d) \
862 do { \
863 if ( likely(l4e_get_flags((pl4e)) & _PAGE_PRESENT) && \
864 likely(!is_pv_32on64_domain(d)) ) \
865 l4e_add_flags((pl4e), _PAGE_USER); \
866 } while ( 0 )
868 #else /* !defined(__x86_64__) */
870 #define adjust_guest_l1e(_p, _d) ((void)(_d))
871 #define adjust_guest_l2e(_p, _d) ((void)(_d))
872 #define adjust_guest_l3e(_p, _d) ((void)(_d))
874 #endif
876 #ifdef CONFIG_COMPAT
877 #define unadjust_guest_l3e(pl3e, d) \
878 do { \
879 if ( unlikely(is_pv_32on64_domain(d)) && \
880 likely(l3e_get_flags((pl3e)) & _PAGE_PRESENT) ) \
881 l3e_remove_flags((pl3e), _PAGE_USER|_PAGE_RW|_PAGE_ACCESSED); \
882 } while ( 0 )
883 #else
884 #define unadjust_guest_l3e(_p, _d) ((void)(_d))
885 #endif
887 void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d)
888 {
889 unsigned long pfn = l1e_get_pfn(l1e);
890 struct page_info *page;
891 struct domain *e;
892 struct vcpu *v;
894 if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) || is_iomem_page(pfn) )
895 return;
897 page = mfn_to_page(pfn);
899 e = page_get_owner(page);
901 /*
902 * Check if this is a mapping that was established via a grant reference.
903 * If it was then we should not be here: we require that such mappings are
904 * explicitly destroyed via the grant-table interface.
905 *
906 * The upshot of this is that the guest can end up with active grants that
907 * it cannot destroy (because it no longer has a PTE to present to the
908 * grant-table interface). This can lead to subtle hard-to-catch bugs,
909 * hence a special grant PTE flag can be enabled to catch the bug early.
910 *
911 * (Note that the undestroyable active grants are not a security hole in
912 * Xen. All active grants can safely be cleaned up when the domain dies.)
913 */
914 if ( (l1e_get_flags(l1e) & _PAGE_GNTTAB) &&
915 !d->is_shutting_down && !d->is_dying )
916 {
917 MEM_LOG("Attempt to implicitly unmap a granted PTE %" PRIpte,
918 l1e_get_intpte(l1e));
919 domain_crash(d);
920 }
922 /* Remember we didn't take a type-count of foreign writable mappings
923 * to paging-external domains */
924 if ( (l1e_get_flags(l1e) & _PAGE_RW) &&
925 !(unlikely((e != d) && paging_mode_external(e))) )
926 {
927 put_page_and_type(page);
928 }
929 else
930 {
931 /* We expect this is rare so we blow the entire shadow LDT. */
932 if ( unlikely(((page->u.inuse.type_info & PGT_type_mask) ==
933 PGT_seg_desc_page)) &&
934 unlikely(((page->u.inuse.type_info & PGT_count_mask) != 0)) &&
935 (d == e) )
936 {
937 for_each_vcpu ( d, v )
938 invalidate_shadow_ldt(v);
939 }
940 put_page(page);
941 }
942 }
945 /*
946 * NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'.
947 * Note also that this automatically deals correctly with linear p.t.'s.
948 */
949 static void put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
950 {
951 if ( (l2e_get_flags(l2e) & _PAGE_PRESENT) &&
952 (l2e_get_pfn(l2e) != pfn) )
953 put_page_and_type(l2e_get_page(l2e));
954 }
957 #if CONFIG_PAGING_LEVELS >= 3
958 static void put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn)
959 {
960 if ( (l3e_get_flags(l3e) & _PAGE_PRESENT) &&
961 (l3e_get_pfn(l3e) != pfn) )
962 put_page_and_type(l3e_get_page(l3e));
963 }
964 #endif
966 #if CONFIG_PAGING_LEVELS >= 4
967 static void put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn)
968 {
969 if ( (l4e_get_flags(l4e) & _PAGE_PRESENT) &&
970 (l4e_get_pfn(l4e) != pfn) )
971 put_page_and_type(l4e_get_page(l4e));
972 }
973 #endif
975 static int alloc_l1_table(struct page_info *page)
976 {
977 struct domain *d = page_get_owner(page);
978 unsigned long pfn = page_to_mfn(page);
979 l1_pgentry_t *pl1e;
980 int i;
982 pl1e = map_domain_page(pfn);
984 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
985 {
986 if ( is_guest_l1_slot(i) &&
987 unlikely(!get_page_from_l1e(pl1e[i], d)) )
988 goto fail;
990 adjust_guest_l1e(pl1e[i], d);
991 }
993 unmap_domain_page(pl1e);
994 return 1;
996 fail:
997 MEM_LOG("Failure in alloc_l1_table: entry %d", i);
998 while ( i-- > 0 )
999 if ( is_guest_l1_slot(i) )
1000 put_page_from_l1e(pl1e[i], d);
1002 unmap_domain_page(pl1e);
1003 return 0;
1006 static int create_pae_xen_mappings(struct domain *d, l3_pgentry_t *pl3e)
1008 struct page_info *page;
1009 l2_pgentry_t *pl2e;
1010 l3_pgentry_t l3e3;
1011 #ifndef CONFIG_COMPAT
1012 l2_pgentry_t l2e;
1013 int i;
1014 #endif
1016 if ( !is_pv_32bit_domain(d) )
1017 return 1;
1019 pl3e = (l3_pgentry_t *)((unsigned long)pl3e & PAGE_MASK);
1021 /* 3rd L3 slot contains L2 with Xen-private mappings. It *must* exist. */
1022 l3e3 = pl3e[3];
1023 if ( !(l3e_get_flags(l3e3) & _PAGE_PRESENT) )
1025 MEM_LOG("PAE L3 3rd slot is empty");
1026 return 0;
1029 /*
1030 * The Xen-private mappings include linear mappings. The L2 thus cannot
1031 * be shared by multiple L3 tables. The test here is adequate because:
1032 * 1. Cannot appear in slots != 3 because get_page_type() checks the
1033 * PGT_pae_xen_l2 flag, which is asserted iff the L2 appears in slot 3
1034 * 2. Cannot appear in another page table's L3:
1035 * a. alloc_l3_table() calls this function and this check will fail
1036 * b. mod_l3_entry() disallows updates to slot 3 in an existing table
1037 */
1038 page = l3e_get_page(l3e3);
1039 BUG_ON(page->u.inuse.type_info & PGT_pinned);
1040 BUG_ON((page->u.inuse.type_info & PGT_count_mask) == 0);
1041 BUG_ON(!(page->u.inuse.type_info & PGT_pae_xen_l2));
1042 if ( (page->u.inuse.type_info & PGT_count_mask) != 1 )
1044 MEM_LOG("PAE L3 3rd slot is shared");
1045 return 0;
1048 /* Xen private mappings. */
1049 pl2e = map_domain_page(l3e_get_pfn(l3e3));
1050 #ifndef CONFIG_COMPAT
1051 memcpy(&pl2e[L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1)],
1052 &idle_pg_table_l2[L2_PAGETABLE_FIRST_XEN_SLOT],
1053 L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
1054 for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
1056 l2e = l2e_from_page(
1057 virt_to_page(page_get_owner(page)->arch.mm_perdomain_pt) + i,
1058 __PAGE_HYPERVISOR);
1059 l2e_write(&pl2e[l2_table_offset(PERDOMAIN_VIRT_START) + i], l2e);
1061 for ( i = 0; i < (LINEARPT_MBYTES >> (L2_PAGETABLE_SHIFT - 20)); i++ )
1063 l2e = l2e_empty();
1064 if ( l3e_get_flags(pl3e[i]) & _PAGE_PRESENT )
1065 l2e = l2e_from_pfn(l3e_get_pfn(pl3e[i]), __PAGE_HYPERVISOR);
1066 l2e_write(&pl2e[l2_table_offset(LINEAR_PT_VIRT_START) + i], l2e);
1068 #else
1069 memcpy(&pl2e[COMPAT_L2_PAGETABLE_FIRST_XEN_SLOT(d)],
1070 &compat_idle_pg_table_l2[
1071 l2_table_offset(HIRO_COMPAT_MPT_VIRT_START)],
1072 COMPAT_L2_PAGETABLE_XEN_SLOTS(d) * sizeof(*pl2e));
1073 #endif
1074 unmap_domain_page(pl2e);
1076 return 1;
1079 #ifdef __i386__
1080 /* Flush a pgdir update into low-memory caches. */
1081 static void pae_flush_pgd(
1082 unsigned long mfn, unsigned int idx, l3_pgentry_t nl3e)
1084 struct domain *d = page_get_owner(mfn_to_page(mfn));
1085 struct vcpu *v;
1086 intpte_t _ol3e, _nl3e, _pl3e;
1087 l3_pgentry_t *l3tab_ptr;
1088 struct pae_l3_cache *cache;
1090 if ( unlikely(shadow_mode_enabled(d)) )
1092 cpumask_t m = CPU_MASK_NONE;
1093 /* Re-shadow this l3 table on any vcpus that are using it */
1094 for_each_vcpu ( d, v )
1095 if ( pagetable_get_pfn(v->arch.guest_table) == mfn )
1097 paging_update_cr3(v);
1098 cpus_or(m, m, v->vcpu_dirty_cpumask);
1100 flush_tlb_mask(m);
1103 /* If below 4GB then the pgdir is not shadowed in low memory. */
1104 if ( !l3tab_needs_shadow(mfn) )
1105 return;
1107 for_each_vcpu ( d, v )
1109 cache = &v->arch.pae_l3_cache;
1111 spin_lock(&cache->lock);
1113 if ( cache->high_mfn == mfn )
1115 l3tab_ptr = &cache->table[cache->inuse_idx][idx];
1116 _ol3e = l3e_get_intpte(*l3tab_ptr);
1117 _nl3e = l3e_get_intpte(nl3e);
1118 _pl3e = cmpxchg(&l3e_get_intpte(*l3tab_ptr), _ol3e, _nl3e);
1119 BUG_ON(_pl3e != _ol3e);
1122 spin_unlock(&cache->lock);
1125 flush_tlb_mask(d->domain_dirty_cpumask);
1127 #else
1128 # define pae_flush_pgd(mfn, idx, nl3e) ((void)0)
1129 #endif
1131 static int alloc_l2_table(struct page_info *page, unsigned long type)
1133 struct domain *d = page_get_owner(page);
1134 unsigned long pfn = page_to_mfn(page);
1135 l2_pgentry_t *pl2e;
1136 int i;
1138 pl2e = map_domain_page(pfn);
1140 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
1142 if ( !is_guest_l2_slot(d, type, i) )
1143 continue;
1145 if ( unlikely(!get_page_from_l2e(pl2e[i], pfn, d)) )
1146 goto fail;
1148 adjust_guest_l2e(pl2e[i], d);
1151 unmap_domain_page(pl2e);
1152 return 1;
1154 fail:
1155 MEM_LOG("Failure in alloc_l2_table: entry %d", i);
1156 while ( i-- > 0 )
1157 if ( is_guest_l2_slot(d, type, i) )
1158 put_page_from_l2e(pl2e[i], pfn);
1160 unmap_domain_page(pl2e);
1161 return 0;
1165 #if CONFIG_PAGING_LEVELS >= 3
1166 static int alloc_l3_table(struct page_info *page)
1168 struct domain *d = page_get_owner(page);
1169 unsigned long pfn = page_to_mfn(page);
1170 l3_pgentry_t *pl3e;
1171 int i;
1173 #if CONFIG_PAGING_LEVELS == 3
1174 /*
1175 * PAE pgdirs above 4GB are unacceptable if the guest does not understand
1176 * the weird 'extended cr3' format for dealing with high-order address
1177 * bits. We cut some slack for control tools (before vcpu0 is initialised).
1178 */
1179 if ( (pfn >= 0x100000) &&
1180 unlikely(!VM_ASSIST(d, VMASST_TYPE_pae_extended_cr3)) &&
1181 d->vcpu[0] && d->vcpu[0]->is_initialised )
1183 MEM_LOG("PAE pgd must be below 4GB (0x%lx >= 0x100000)", pfn);
1184 return 0;
1186 #endif
1188 pl3e = map_domain_page(pfn);
1190 /*
1191 * PAE guests allocate full pages, but aren't required to initialize
1192 * more than the first four entries; when running in compatibility
1193 * mode, however, the full page is visible to the MMU, and hence all
1194 * 512 entries must be valid/verified, which is most easily achieved
1195 * by clearing them out.
1196 */
1197 if ( is_pv_32on64_domain(d) )
1198 memset(pl3e + 4, 0, (L3_PAGETABLE_ENTRIES - 4) * sizeof(*pl3e));
1200 for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
1202 if ( is_pv_32bit_domain(d) && (i == 3) )
1204 if ( !(l3e_get_flags(pl3e[i]) & _PAGE_PRESENT) ||
1205 (l3e_get_flags(pl3e[i]) & l3_disallow_mask(d)) ||
1206 !get_page_and_type_from_pagenr(l3e_get_pfn(pl3e[i]),
1207 PGT_l2_page_table |
1208 PGT_pae_xen_l2,
1209 d) )
1210 goto fail;
1212 else if ( !is_guest_l3_slot(i) )
1213 continue;
1214 else if ( unlikely(!get_page_from_l3e(pl3e[i], pfn, d)) )
1215 goto fail;
1217 adjust_guest_l3e(pl3e[i], d);
1220 if ( !create_pae_xen_mappings(d, pl3e) )
1221 goto fail;
1223 unmap_domain_page(pl3e);
1224 return 1;
1226 fail:
1227 MEM_LOG("Failure in alloc_l3_table: entry %d", i);
1228 while ( i-- > 0 )
1230 if ( !is_guest_l3_slot(i) )
1231 continue;
1232 unadjust_guest_l3e(pl3e[i], d);
1233 put_page_from_l3e(pl3e[i], pfn);
1236 unmap_domain_page(pl3e);
1237 return 0;
1239 #else
1240 #define alloc_l3_table(page) (0)
1241 #endif
1243 #if CONFIG_PAGING_LEVELS >= 4
1244 static int alloc_l4_table(struct page_info *page)
1246 struct domain *d = page_get_owner(page);
1247 unsigned long pfn = page_to_mfn(page);
1248 l4_pgentry_t *pl4e = page_to_virt(page);
1249 int i;
1251 for ( i = 0; i < L4_PAGETABLE_ENTRIES; i++ )
1253 if ( !is_guest_l4_slot(d, i) )
1254 continue;
1256 if ( unlikely(!get_page_from_l4e(pl4e[i], pfn, d)) )
1257 goto fail;
1259 adjust_guest_l4e(pl4e[i], d);
1262 /* Xen private mappings. */
1263 memcpy(&pl4e[ROOT_PAGETABLE_FIRST_XEN_SLOT],
1264 &idle_pg_table[ROOT_PAGETABLE_FIRST_XEN_SLOT],
1265 ROOT_PAGETABLE_XEN_SLOTS * sizeof(l4_pgentry_t));
1266 pl4e[l4_table_offset(LINEAR_PT_VIRT_START)] =
1267 l4e_from_pfn(pfn, __PAGE_HYPERVISOR);
1268 pl4e[l4_table_offset(PERDOMAIN_VIRT_START)] =
1269 l4e_from_page(virt_to_page(d->arch.mm_perdomain_l3),
1270 __PAGE_HYPERVISOR);
1272 return 1;
1274 fail:
1275 MEM_LOG("Failure in alloc_l4_table: entry %d", i);
1276 while ( i-- > 0 )
1277 if ( is_guest_l4_slot(d, i) )
1278 put_page_from_l4e(pl4e[i], pfn);
1280 return 0;
1282 #else
1283 #define alloc_l4_table(page) (0)
1284 #endif
1287 static void free_l1_table(struct page_info *page)
1289 struct domain *d = page_get_owner(page);
1290 unsigned long pfn = page_to_mfn(page);
1291 l1_pgentry_t *pl1e;
1292 int i;
1294 pl1e = map_domain_page(pfn);
1296 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
1297 if ( is_guest_l1_slot(i) )
1298 put_page_from_l1e(pl1e[i], d);
1300 unmap_domain_page(pl1e);
1304 static void free_l2_table(struct page_info *page)
1306 #ifdef CONFIG_COMPAT
1307 struct domain *d = page_get_owner(page);
1308 #endif
1309 unsigned long pfn = page_to_mfn(page);
1310 l2_pgentry_t *pl2e;
1311 int i;
1313 pl2e = map_domain_page(pfn);
1315 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
1316 if ( is_guest_l2_slot(d, page->u.inuse.type_info, i) )
1317 put_page_from_l2e(pl2e[i], pfn);
1319 unmap_domain_page(pl2e);
1321 page->u.inuse.type_info &= ~PGT_pae_xen_l2;
1325 #if CONFIG_PAGING_LEVELS >= 3
1327 static void free_l3_table(struct page_info *page)
1329 struct domain *d = page_get_owner(page);
1330 unsigned long pfn = page_to_mfn(page);
1331 l3_pgentry_t *pl3e;
1332 int i;
1334 #ifdef DOMAIN_DESTRUCT_AVOID_RECURSION
1335 if ( d->arch.relmem == RELMEM_l3 )
1336 return;
1337 #endif
1339 pl3e = map_domain_page(pfn);
1341 for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
1342 if ( is_guest_l3_slot(i) )
1344 put_page_from_l3e(pl3e[i], pfn);
1345 unadjust_guest_l3e(pl3e[i], d);
1348 unmap_domain_page(pl3e);
1351 #endif
1353 #if CONFIG_PAGING_LEVELS >= 4
1355 static void free_l4_table(struct page_info *page)
1357 struct domain *d = page_get_owner(page);
1358 unsigned long pfn = page_to_mfn(page);
1359 l4_pgentry_t *pl4e = page_to_virt(page);
1360 int i;
1362 #ifdef DOMAIN_DESTRUCT_AVOID_RECURSION
1363 if ( d->arch.relmem == RELMEM_l4 )
1364 return;
1365 #endif
1367 for ( i = 0; i < L4_PAGETABLE_ENTRIES; i++ )
1368 if ( is_guest_l4_slot(d, i) )
1369 put_page_from_l4e(pl4e[i], pfn);
1372 #endif
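/*
 * Per-page locking, used below to serialise guest PTE updates within a
 * page-table frame: i386 spins on a lock bit in count_info, while x86_64
 * uses the per-page spinlock initialised in init_frametable().
 */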
1374 static void page_lock(struct page_info *page)
1376 #if defined(__i386__)
1377 while ( unlikely(test_and_set_bit(_PGC_locked, &page->count_info)) )
1378 while ( test_bit(_PGC_locked, &page->count_info) )
1379 cpu_relax();
1380 #else
1381 spin_lock(&page->lock);
1382 #endif
1385 static void page_unlock(struct page_info *page)
1387 #if defined(__i386__)
1388 clear_bit(_PGC_locked, &page->count_info);
1389 #else
1390 spin_unlock(&page->lock);
1391 #endif
1394 /* How to write an entry to the guest pagetables.
1395 * Returns 0 for failure (pointer not valid), 1 for success. */
1396 static inline int update_intpte(intpte_t *p,
1397 intpte_t old,
1398 intpte_t new,
1399 unsigned long mfn,
1400 struct vcpu *v,
1401 int preserve_ad)
1403 int rv = 1;
1404 #ifndef PTE_UPDATE_WITH_CMPXCHG
1405 if ( !preserve_ad )
1407 rv = paging_write_guest_entry(v, p, new, _mfn(mfn));
1409 else
1410 #endif
1412 intpte_t t = old;
1413 for ( ; ; )
1415 intpte_t _new = new;
1416 if ( preserve_ad )
1417 _new |= old & (_PAGE_ACCESSED | _PAGE_DIRTY);
1419 rv = paging_cmpxchg_guest_entry(v, p, &t, _new, _mfn(mfn));
1420 if ( unlikely(rv == 0) )
1422 MEM_LOG("Failed to update %" PRIpte " -> %" PRIpte
1423 ": saw %" PRIpte, old, _new, t);
1424 break;
1427 if ( t == old )
1428 break;
1430 /* Only the Accessed/Dirty flags are allowed to change. */
1431 BUG_ON((t ^ old) & ~(intpte_t)(_PAGE_ACCESSED|_PAGE_DIRTY));
1433 old = t;
1436 return rv;
1439 /* Macro that wraps the appropriate type-changes around update_intpte().
1440 * Arguments are: type, ptr, old, new, mfn, vcpu, preserve_ad */
1441 #define UPDATE_ENTRY(_t,_p,_o,_n,_m,_v,_ad) \
1442 update_intpte(&_t ## e_get_intpte(*(_p)), \
1443 _t ## e_get_intpte(_o), _t ## e_get_intpte(_n), \
1444 (_m), (_v), (_ad))
1446 /* Update the L1 entry at pl1e to new value nl1e. */
1447 static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e,
1448 unsigned long gl1mfn, int preserve_ad)
1450 l1_pgentry_t ol1e;
1451 struct vcpu *curr = current;
1452 struct domain *d = curr->domain;
1453 unsigned long mfn;
1454 struct page_info *l1pg = mfn_to_page(gl1mfn);
1455 int rc = 1;
1457 page_lock(l1pg);
1459 if ( unlikely(__copy_from_user(&ol1e, pl1e, sizeof(ol1e)) != 0) )
1460 return page_unlock(l1pg), 0;
1462 if ( unlikely(paging_mode_refcounts(d)) )
1464 rc = UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, curr, preserve_ad);
1465 page_unlock(l1pg);
1466 return rc;
1469 if ( l1e_get_flags(nl1e) & _PAGE_PRESENT )
1471 /* Translate foreign guest addresses. */
1472 mfn = gmfn_to_mfn(FOREIGNDOM, l1e_get_pfn(nl1e));
1473 if ( unlikely(mfn == INVALID_MFN) )
1474 return page_unlock(l1pg), 0;
1475 ASSERT((mfn & ~(PADDR_MASK >> PAGE_SHIFT)) == 0);
1476 nl1e = l1e_from_pfn(mfn, l1e_get_flags(nl1e));
1478 if ( unlikely(l1e_get_flags(nl1e) & l1_disallow_mask(d)) )
1480 page_unlock(l1pg);
1481 MEM_LOG("Bad L1 flags %x",
1482 l1e_get_flags(nl1e) & l1_disallow_mask(d));
1483 return 0;
1486 /* Fast path for identical mapping, r/w and presence. */
1487 if ( !l1e_has_changed(ol1e, nl1e, _PAGE_RW | _PAGE_PRESENT) )
1489 adjust_guest_l1e(nl1e, d);
1490 rc = UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, curr,
1491 preserve_ad);
1492 page_unlock(l1pg);
1493 return rc;
1496 if ( unlikely(!get_page_from_l1e(nl1e, FOREIGNDOM)) )
1497 return page_unlock(l1pg), 0;
1499 adjust_guest_l1e(nl1e, d);
1500 if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, curr,
1501 preserve_ad)) )
1503 ol1e = nl1e;
1504 rc = 0;
1507 else if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, curr,
1508 preserve_ad)) )
1510 page_unlock(l1pg);
1511 return 0;
1514 page_unlock(l1pg);
1515 put_page_from_l1e(ol1e, d);
1516 return rc;
1520 /* Update the L2 entry at pl2e to new value nl2e. pl2e is within frame pfn. */
1521 static int mod_l2_entry(l2_pgentry_t *pl2e,
1522 l2_pgentry_t nl2e,
1523 unsigned long pfn,
1524 unsigned long type,
1525 int preserve_ad)
1527 l2_pgentry_t ol2e;
1528 struct vcpu *curr = current;
1529 struct domain *d = curr->domain;
1530 struct page_info *l2pg = mfn_to_page(pfn);
1531 int rc = 1;
1533 if ( unlikely(!is_guest_l2_slot(d, type, pgentry_ptr_to_slot(pl2e))) )
1535 MEM_LOG("Illegal L2 update attempt in Xen-private area %p", pl2e);
1536 return 0;
1539 page_lock(l2pg);
1541 if ( unlikely(__copy_from_user(&ol2e, pl2e, sizeof(ol2e)) != 0) )
1542 return page_unlock(l2pg), 0;
1544 if ( l2e_get_flags(nl2e) & _PAGE_PRESENT )
1546 if ( unlikely(l2e_get_flags(nl2e) & L2_DISALLOW_MASK) )
1548 page_unlock(l2pg);
1549 MEM_LOG("Bad L2 flags %x",
1550 l2e_get_flags(nl2e) & L2_DISALLOW_MASK);
1551 return 0;
1554 /* Fast path for identical mapping and presence. */
1555 if ( !l2e_has_changed(ol2e, nl2e, _PAGE_PRESENT) )
1557 adjust_guest_l2e(nl2e, d);
1558 rc = UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, curr, preserve_ad);
1559 page_unlock(l2pg);
1560 return rc;
1563 if ( unlikely(!get_page_from_l2e(nl2e, pfn, d)) )
1564 return page_unlock(l2pg), 0;
1566 adjust_guest_l2e(nl2e, d);
1567 if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, curr,
1568 preserve_ad)) )
1570 ol2e = nl2e;
1571 rc = 0;
1574 else if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, curr,
1575 preserve_ad)) )
1577 page_unlock(l2pg);
1578 return 0;
1581 page_unlock(l2pg);
1582 put_page_from_l2e(ol2e, pfn);
1583 return rc;
1586 #if CONFIG_PAGING_LEVELS >= 3
1588 /* Update the L3 entry at pl3e to new value nl3e. pl3e is within frame pfn. */
1589 static int mod_l3_entry(l3_pgentry_t *pl3e,
1590 l3_pgentry_t nl3e,
1591 unsigned long pfn,
1592 int preserve_ad)
1594 l3_pgentry_t ol3e;
1595 struct vcpu *curr = current;
1596 struct domain *d = curr->domain;
1597 struct page_info *l3pg = mfn_to_page(pfn);
1598 int rc = 1;
1600 if ( unlikely(!is_guest_l3_slot(pgentry_ptr_to_slot(pl3e))) )
1602 MEM_LOG("Illegal L3 update attempt in Xen-private area %p", pl3e);
1603 return 0;
1606 /*
1607 * Disallow updates to final L3 slot. It contains Xen mappings, and it
1608 * would be a pain to ensure they remain continuously valid throughout.
1609 */
1610 if ( is_pv_32bit_domain(d) && (pgentry_ptr_to_slot(pl3e) >= 3) )
1611 return 0;
1613 page_lock(l3pg);
1615 if ( unlikely(__copy_from_user(&ol3e, pl3e, sizeof(ol3e)) != 0) )
1616 return page_unlock(l3pg), 0;
1618 if ( l3e_get_flags(nl3e) & _PAGE_PRESENT )
1620 if ( unlikely(l3e_get_flags(nl3e) & l3_disallow_mask(d)) )
1622 page_unlock(l3pg);
1623 MEM_LOG("Bad L3 flags %x",
1624 l3e_get_flags(nl3e) & l3_disallow_mask(d));
1625 return 0;
1628 /* Fast path for identical mapping and presence. */
1629 if ( !l3e_has_changed(ol3e, nl3e, _PAGE_PRESENT) )
1631 adjust_guest_l3e(nl3e, d);
1632 rc = UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, curr, preserve_ad);
1633 page_unlock(l3pg);
1634 return rc;
1637 if ( unlikely(!get_page_from_l3e(nl3e, pfn, d)) )
1638 return page_unlock(l3pg), 0;
1640 adjust_guest_l3e(nl3e, d);
1641 if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, curr,
1642 preserve_ad)) )
1644 ol3e = nl3e;
1645 rc = 0;
1648 else if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, curr,
1649 preserve_ad)) )
1651 page_unlock(l3pg);
1652 return 0;
1655 if ( likely(rc) )
1657 if ( !create_pae_xen_mappings(d, pl3e) )
1658 BUG();
1660 pae_flush_pgd(pfn, pgentry_ptr_to_slot(pl3e), nl3e);
1663 page_unlock(l3pg);
1664 put_page_from_l3e(ol3e, pfn);
1665 return rc;
1668 #endif
1670 #if CONFIG_PAGING_LEVELS >= 4
1672 /* Update the L4 entry at pl4e to new value nl4e. pl4e is within frame pfn. */
1673 static int mod_l4_entry(l4_pgentry_t *pl4e,
1674 l4_pgentry_t nl4e,
1675 unsigned long pfn,
1676 int preserve_ad)
1678 struct vcpu *curr = current;
1679 struct domain *d = curr->domain;
1680 l4_pgentry_t ol4e;
1681 struct page_info *l4pg = mfn_to_page(pfn);
1682 int rc = 1;
1684 if ( unlikely(!is_guest_l4_slot(d, pgentry_ptr_to_slot(pl4e))) )
1686 MEM_LOG("Illegal L4 update attempt in Xen-private area %p", pl4e);
1687 return 0;
1690 page_lock(l4pg);
1692 if ( unlikely(__copy_from_user(&ol4e, pl4e, sizeof(ol4e)) != 0) )
1693 return page_unlock(l4pg), 0;
1695 if ( l4e_get_flags(nl4e) & _PAGE_PRESENT )
1697 if ( unlikely(l4e_get_flags(nl4e) & L4_DISALLOW_MASK) )
1699 page_unlock(l4pg);
1700 MEM_LOG("Bad L4 flags %x",
1701 l4e_get_flags(nl4e) & L4_DISALLOW_MASK);
1702 return 0;
1705 /* Fast path for identical mapping and presence. */
1706 if ( !l4e_has_changed(ol4e, nl4e, _PAGE_PRESENT) )
1708 adjust_guest_l4e(nl4e, d);
1709 rc = UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, curr, preserve_ad);
1710 page_unlock(l4pg);
1711 return rc;
1714 if ( unlikely(!get_page_from_l4e(nl4e, pfn, d)) )
1715 return page_unlock(l4pg), 0;
1717 adjust_guest_l4e(nl4e, d);
1718 if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, curr,
1719 preserve_ad)) )
1721 ol4e = nl4e;
1722 rc = 0;
1725 else if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, curr,
1726 preserve_ad)) )
1728 page_unlock(l4pg);
1729 return 0;
1732 page_unlock(l4pg);
1733 put_page_from_l4e(ol4e, pfn);
1734 return rc;
1737 #endif
1739 void put_page(struct page_info *page)
1741 u32 nx, x, y = page->count_info;
1743 do {
1744 x = y;
1745 nx = x - 1;
1747 while ( unlikely((y = cmpxchg(&page->count_info, x, nx)) != x) );
1749 if ( unlikely((nx & PGC_count_mask) == 0) )
1751 cleanup_page_cacheattr(page);
1752 free_domheap_page(page);
1757 int get_page(struct page_info *page, struct domain *domain)
1759 u32 x, nx, y = page->count_info;
1760 u32 d, nd = page->u.inuse._domain;
1761 u32 _domain = pickle_domptr(domain);
1763 do {
1764 x = y;
1765 nx = x + 1;
1766 d = nd;
1767 if ( unlikely((x & PGC_count_mask) == 0) || /* Not allocated? */
1768 unlikely((nx & PGC_count_mask) == 0) || /* Count overflow? */
1769 unlikely(d != _domain) ) /* Wrong owner? */
1771 if ( !_shadow_mode_refcounts(domain) && !domain->is_dying )
1772 gdprintk(XENLOG_INFO,
1773 "Error pfn %lx: rd=%p, od=%p, caf=%08x, taf=%"
1774 PRtype_info "\n",
1775 page_to_mfn(page), domain, unpickle_domptr(d),
1776 x, page->u.inuse.type_info);
1777 return 0;
1779 asm volatile (
1780 LOCK_PREFIX "cmpxchg8b %2"
1781 : "=d" (nd), "=a" (y),
1782 "=m" (*(volatile u64 *)(&page->count_info))
1783 : "0" (d), "1" (x), "c" (d), "b" (nx) );
1785 while ( unlikely(nd != d) || unlikely(y != x) );
1787 return 1;
1791 static int alloc_page_type(struct page_info *page, unsigned long type)
1793 struct domain *owner = page_get_owner(page);
1795 /* A page table is dirtied when its type count becomes non-zero. */
1796 if ( likely(owner != NULL) )
1797 paging_mark_dirty(owner, page_to_mfn(page));
1799 switch ( type & PGT_type_mask )
1801 case PGT_l1_page_table:
1802 return alloc_l1_table(page);
1803 case PGT_l2_page_table:
1804 return alloc_l2_table(page, type);
1805 case PGT_l3_page_table:
1806 return alloc_l3_table(page);
1807 case PGT_l4_page_table:
1808 return alloc_l4_table(page);
1809 case PGT_seg_desc_page:
1810 return alloc_segdesc_page(page);
1811 default:
1812 printk("Bad type in alloc_page_type %lx t=%" PRtype_info " c=%x\n",
1813 type, page->u.inuse.type_info,
1814 page->count_info);
1815 BUG();
1818 return 0;
1822 void free_page_type(struct page_info *page, unsigned long type)
1824 struct domain *owner = page_get_owner(page);
1825 unsigned long gmfn;
1827 if ( likely(owner != NULL) )
1829 /*
1830 * We have to flush before the next use of the linear mapping
1831 * (e.g., update_va_mapping()) or we could end up modifying a page
1832 * that is no longer a page table (and hence screw up ref counts).
1833 */
1834 if ( current->domain == owner )
1835 queue_deferred_ops(owner, DOP_FLUSH_ALL_TLBS);
1836 else
1837 flush_tlb_mask(owner->domain_dirty_cpumask);
1839 if ( unlikely(paging_mode_enabled(owner)) )
1841 /* A page table is dirtied when its type count becomes zero. */
1842 paging_mark_dirty(owner, page_to_mfn(page));
1844 if ( shadow_mode_refcounts(owner) )
1845 return;
1847 gmfn = mfn_to_gmfn(owner, page_to_mfn(page));
1848 ASSERT(VALID_M2P(gmfn));
1849 shadow_remove_all_shadows(owner->vcpu[0], _mfn(gmfn));
1853 switch ( type & PGT_type_mask )
1855 case PGT_l1_page_table:
1856 free_l1_table(page);
1857 break;
1859 case PGT_l2_page_table:
1860 free_l2_table(page);
1861 break;
1863 #if CONFIG_PAGING_LEVELS >= 3
1864 case PGT_l3_page_table:
1865 free_l3_table(page);
1866 break;
1867 #endif
1869 #if CONFIG_PAGING_LEVELS >= 4
1870 case PGT_l4_page_table:
1871 free_l4_table(page);
1872 break;
1873 #endif
1875 default:
1876 printk("%s: type %lx pfn %lx\n",__FUNCTION__,
1877 type, page_to_mfn(page));
1878 BUG();
1883 void put_page_type(struct page_info *page)
1885 unsigned long nx, x, y = page->u.inuse.type_info;
1887 again:
1888 do {
1889 x = y;
1890 nx = x - 1;
1892 ASSERT((x & PGT_count_mask) != 0);
1894 if ( unlikely((nx & PGT_count_mask) == 0) )
1896 if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
1897 likely(nx & PGT_validated) )
1899 /*
1900 * Page-table pages must be unvalidated when count is zero. The
1901 * 'free' is safe because the refcnt is non-zero and validated
1902 * bit is clear => other ops will spin or fail.
1903 */
1904 if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x,
1905 x & ~PGT_validated)) != x) )
1906 goto again;
1907 /* We cleared the 'valid bit' so we do the clean up. */
1908 free_page_type(page, x);
1909 /* Carry on, but with the 'valid bit' now clear. */
1910 x &= ~PGT_validated;
1911 nx &= ~PGT_validated;
1914 /*
1915 * Record TLB information for flush later. We do not stamp page
1916 * tables when running in shadow mode:
1917 * 1. Pointless, since it's the shadow pt's which must be tracked.
1918 * 2. Shadow mode reuses this field for shadowed page tables to
1919 * store flags info -- we don't want to conflict with that.
1920 */
1921 if ( !(shadow_mode_enabled(page_get_owner(page)) &&
1922 (page->count_info & PGC_page_table)) )
1923 page->tlbflush_timestamp = tlbflush_current_time();
1926 while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
1930 int get_page_type(struct page_info *page, unsigned long type)
1932 unsigned long nx, x, y = page->u.inuse.type_info;
1934 ASSERT(!(type & ~(PGT_type_mask | PGT_pae_xen_l2)));
1936 again:
1937 do {
1938 x = y;
1939 nx = x + 1;
1940 if ( unlikely((nx & PGT_count_mask) == 0) )
1942 MEM_LOG("Type count overflow on pfn %lx", page_to_mfn(page));
1943 return 0;
1945 else if ( unlikely((x & PGT_count_mask) == 0) )
1947 struct domain *d = page_get_owner(page);
1949 /* Normally we should never let a page go from type count 0
1950 * to type count 1 when it is shadowed. One exception:
1951 * out-of-sync shadowed pages are allowed to become
1952 * writeable. */
1953 if ( d && shadow_mode_enabled(d)
1954 && (page->count_info & PGC_page_table)
1955 && !((page->shadow_flags & (1u<<29))
1956 && type == PGT_writable_page) )
1957 shadow_remove_all_shadows(d->vcpu[0], _mfn(page_to_mfn(page)));
1959 ASSERT(!(x & PGT_pae_xen_l2));
1960 if ( (x & PGT_type_mask) != type )
1962 /*
1963 * On type change we check to flush stale TLB entries. This
1964 * may be unnecessary (e.g., page was GDT/LDT) but those
1965 * circumstances should be very rare.
1966 */
1967 cpumask_t mask = d->domain_dirty_cpumask;
1969 /* Don't flush if the timestamp is old enough */
1970 tlbflush_filter(mask, page->tlbflush_timestamp);
1972 if ( unlikely(!cpus_empty(mask)) &&
1973 /* Shadow mode: track only writable pages. */
1974 (!shadow_mode_enabled(page_get_owner(page)) ||
1975 ((nx & PGT_type_mask) == PGT_writable_page)) )
1977 perfc_incr(need_flush_tlb_flush);
1978 flush_tlb_mask(mask);
1981 /* We lose existing type and validity. */
1982 nx &= ~(PGT_type_mask | PGT_validated);
1983 nx |= type;
1985 /* No special validation needed for writable pages. */
1986 /* Page tables and GDT/LDT need to be scanned for validity. */
1987 if ( type == PGT_writable_page )
1988 nx |= PGT_validated;
1991 else if ( unlikely((x & (PGT_type_mask|PGT_pae_xen_l2)) != type) )
1993 /* Don't log failure if it could be a recursive-mapping attempt. */
1994 if ( ((x & PGT_type_mask) == PGT_l2_page_table) &&
1995 (type == PGT_l1_page_table) )
1996 return 0;
1997 if ( ((x & PGT_type_mask) == PGT_l3_page_table) &&
1998 (type == PGT_l2_page_table) )
1999 return 0;
2000 if ( ((x & PGT_type_mask) == PGT_l4_page_table) &&
2001 (type == PGT_l3_page_table) )
2002 return 0;
2003 MEM_LOG("Bad type (saw %" PRtype_info " != exp %" PRtype_info ") "
2004 "for mfn %lx (pfn %lx)",
2005 x, type, page_to_mfn(page),
2006 get_gpfn_from_mfn(page_to_mfn(page)));
2007 return 0;
2009 else if ( unlikely(!(x & PGT_validated)) )
2011 /* Someone else is updating validation of this page. Wait... */
2012 while ( (y = page->u.inuse.type_info) == x )
2013 cpu_relax();
2014 goto again;
2017 while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
2019 if ( unlikely((x & PGT_type_mask) != type) )
2021 /* Special pages should not be accessible from devices. */
2022 struct domain *d = page_get_owner(page);
2023 if ( d && unlikely(need_iommu(d)) )
2025 if ( (x & PGT_type_mask) == PGT_writable_page )
2026 iommu_unmap_page(d, mfn_to_gmfn(d, page_to_mfn(page)));
2027 else if ( type == PGT_writable_page )
2028 iommu_map_page(d, mfn_to_gmfn(d, page_to_mfn(page)),
2029 page_to_mfn(page));
2033 if ( unlikely(!(nx & PGT_validated)) )
2035 /* Try to validate page type; drop the new reference on failure. */
2036 if ( unlikely(!alloc_page_type(page, type)) )
2038 MEM_LOG("Error while validating mfn %lx (pfn %lx) for type %"
2039 PRtype_info ": caf=%08x taf=%" PRtype_info,
2040 page_to_mfn(page), get_gpfn_from_mfn(page_to_mfn(page)),
2041 type, page->count_info, page->u.inuse.type_info);
2042 /* No one else can get a reference. We hold the only ref. */
2043 page->u.inuse.type_info = 0;
2044 return 0;
2047 /* No one else is updating simultaneously. */
2048 __set_bit(_PGT_validated, &page->u.inuse.type_info);
2051 return 1;
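/*
 * Illustrative usage sketch (not taken verbatim from any one caller): type
 * references are acquired and released in matched pairs, e.g.
 *
 *     if ( !get_page_type(page, PGT_writable_page) )
 *         return 0;              /* type-count overflow or validation failure */
 *     ...touch the frame as ordinary RAM...
 *     put_page_type(page);       /* may trigger free_page_type() at count 0 */
 *
 * The writable-page fallback path of do_mmu_update() below follows exactly
 * this shape.
 */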
2055 void cleanup_page_cacheattr(struct page_info *page)
2057 uint32_t cacheattr = (page->count_info >> PGC_cacheattr_base) & 7;
2059 if ( likely(cacheattr == 0) )
2060 return;
2062 page->count_info &= ~PGC_cacheattr_mask;
2064 BUG_ON(is_xen_heap_page(page));
2066 #ifdef __x86_64__
2067 map_pages_to_xen((unsigned long)page_to_virt(page), page_to_mfn(page),
2068 1, PAGE_HYPERVISOR);
2069 #endif
2073 int new_guest_cr3(unsigned long mfn)
2075 struct vcpu *v = current;
2076 struct domain *d = v->domain;
2077 int okay;
2078 unsigned long old_base_mfn;
2080 #ifdef CONFIG_COMPAT
2081 if ( is_pv_32on64_domain(d) )
2083 okay = paging_mode_refcounts(d)
2084 ? 0 /* Old code was broken, but what should it be? */
2085 : mod_l4_entry(
2086 __va(pagetable_get_paddr(v->arch.guest_table)),
2087 l4e_from_pfn(
2088 mfn,
2089 (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_ACCESSED)),
2090 pagetable_get_pfn(v->arch.guest_table), 0);
2091 if ( unlikely(!okay) )
2093 MEM_LOG("Error while installing new compat baseptr %lx", mfn);
2094 return 0;
2097 invalidate_shadow_ldt(v);
2098 write_ptbase(v);
2100 return 1;
2102 #endif
2103 okay = paging_mode_refcounts(d)
2104 ? get_page_from_pagenr(mfn, d)
2105 : get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d);
2106 if ( unlikely(!okay) )
2108 MEM_LOG("Error while installing new baseptr %lx", mfn);
2109 return 0;
2112 invalidate_shadow_ldt(v);
2114 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
2116 v->arch.guest_table = pagetable_from_pfn(mfn);
2117 update_cr3(v);
2119 write_ptbase(v);
2121 if ( likely(old_base_mfn != 0) )
2123 if ( paging_mode_refcounts(d) )
2124 put_page(mfn_to_page(old_base_mfn));
2125 else
2126 put_page_and_type(mfn_to_page(old_base_mfn));
2129 return 1;
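/*
 * new_guest_cr3() is reached via MMUEXT_NEW_BASEPTR below: it takes a
 * (typed) reference on the new root table, switches v->arch.guest_table,
 * reloads the page-table base via write_ptbase(), and only then drops the
 * reference on the old root, so the previous root stays alive until the
 * switch is complete.
 */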
2132 static void process_deferred_ops(void)
2134 unsigned int deferred_ops;
2135 struct domain *d = current->domain;
2136 struct percpu_mm_info *info = &this_cpu(percpu_mm_info);
2138 deferred_ops = info->deferred_ops;
2139 info->deferred_ops = 0;
2141 if ( deferred_ops & (DOP_FLUSH_ALL_TLBS|DOP_FLUSH_TLB) )
2143 if ( deferred_ops & DOP_FLUSH_ALL_TLBS )
2144 flush_tlb_mask(d->domain_dirty_cpumask);
2145 else
2146 flush_tlb_local();
2149 if ( deferred_ops & DOP_RELOAD_LDT )
2150 (void)map_ldt_shadow_page(0);
2152 if ( unlikely(info->foreign != NULL) )
2154 rcu_unlock_domain(info->foreign);
2155 info->foreign = NULL;
2159 static int set_foreigndom(domid_t domid)
2161 struct domain *e, *d = current->domain;
2162 struct percpu_mm_info *info = &this_cpu(percpu_mm_info);
2163 int okay = 1;
2165 ASSERT(info->foreign == NULL);
2167 if ( likely(domid == DOMID_SELF) )
2168 goto out;
2170 if ( unlikely(domid == d->domain_id) )
2172 MEM_LOG("Cannot specify itself as foreign domain");
2173 okay = 0;
2175 else if ( unlikely(paging_mode_translate(d)) )
2177 MEM_LOG("Cannot mix foreign mappings with translated domains");
2178 okay = 0;
2180 else switch ( domid )
2182 case DOMID_IO:
2183 info->foreign = rcu_lock_domain(dom_io);
2184 break;
2185 case DOMID_XEN:
2186 if (!IS_PRIV(d)) {
2187 MEM_LOG("Cannot set foreign dom");
2188 okay = 0;
2189 break;
2191 info->foreign = rcu_lock_domain(dom_xen);
2192 break;
2193 default:
2194 if ( (e = rcu_lock_domain_by_id(domid)) == NULL )
2196 MEM_LOG("Unknown domain '%u'", domid);
2197 okay = 0;
2198 break;
2200 if ( !IS_PRIV_FOR(d, e) )
2202 MEM_LOG("Cannot set foreign dom");
2203 okay = 0;
2204 rcu_unlock_domain(e);
2205 break;
2207 info->foreign = e;
2208 break;
2211 out:
2212 return okay;
2215 static inline cpumask_t vcpumask_to_pcpumask(
2216 struct domain *d, unsigned long vmask)
2218 unsigned int vcpu_id;
2219 cpumask_t pmask = CPU_MASK_NONE;
2220 struct vcpu *v;
2222 /*
2223 * Callers copy only a single guest-sized longword from the guest.
2224 * This must be wide enough to reference all VCPUs. Worst case is 32 bits.
2225 */
2226 BUILD_BUG_ON(MAX_VIRT_CPUS > 32);
2228 while ( vmask != 0 )
2230 vcpu_id = find_first_set_bit(vmask);
2231 vmask &= ~(1UL << vcpu_id);
2232 if ( (vcpu_id < MAX_VIRT_CPUS) &&
2233 ((v = d->vcpu[vcpu_id]) != NULL) )
2234 cpus_or(pmask, pmask, v->vcpu_dirty_cpumask);
2237 return pmask;
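/*
 * Illustrative sketch: a caller copies one guest-sized word holding a VCPU
 * bitmap and widens it to a physical CPU mask before flushing, as the
 * MMUEXT_TLB_FLUSH_MULTI handler below does:
 *
 *     unsigned long vmask;                     /* bit N set => VCPU N */
 *     cpumask_t pmask;
 *     copy_from_guest(&vmask, op.arg2.vcpumask, 1);
 *     pmask = vcpumask_to_pcpumask(d, vmask);
 *     flush_tlb_mask(pmask);
 *
 * The BUILD_BUG_ON above guarantees that 32 bits are always wide enough to
 * name every VCPU, even for 32-bit guests.
 */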
2240 int do_mmuext_op(
2241 XEN_GUEST_HANDLE(mmuext_op_t) uops,
2242 unsigned int count,
2243 XEN_GUEST_HANDLE(uint) pdone,
2244 unsigned int foreigndom)
2246 struct mmuext_op op;
2247 int rc = 0, i = 0, okay;
2248 unsigned long mfn = 0, gmfn = 0, type;
2249 unsigned int done = 0;
2250 struct page_info *page;
2251 struct vcpu *v = current;
2252 struct domain *d = v->domain;
2254 if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
2256 count &= ~MMU_UPDATE_PREEMPTED;
2257 if ( unlikely(!guest_handle_is_null(pdone)) )
2258 (void)copy_from_guest(&done, pdone, 1);
2260 else
2261 perfc_incr(calls_to_mmuext_op);
2263 if ( unlikely(!guest_handle_okay(uops, count)) )
2265 rc = -EFAULT;
2266 goto out;
2269 if ( !set_foreigndom(foreigndom) )
2271 rc = -ESRCH;
2272 goto out;
2275 for ( i = 0; i < count; i++ )
2277 if ( hypercall_preempt_check() )
2279 rc = hypercall_create_continuation(
2280 __HYPERVISOR_mmuext_op, "hihi",
2281 uops, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
2282 break;
2285 if ( unlikely(__copy_from_guest(&op, uops, 1) != 0) )
2287 MEM_LOG("Bad __copy_from_guest");
2288 rc = -EFAULT;
2289 break;
2292 okay = 1;
2293 gmfn = op.arg1.mfn;
2294 mfn = gmfn_to_mfn(FOREIGNDOM, gmfn);
2295 page = mfn_to_page(mfn);
2297 switch ( op.cmd )
2299 case MMUEXT_PIN_L1_TABLE:
2300 type = PGT_l1_page_table;
2301 goto pin_page;
2303 case MMUEXT_PIN_L2_TABLE:
2304 type = PGT_l2_page_table;
2305 goto pin_page;
2307 case MMUEXT_PIN_L3_TABLE:
2308 type = PGT_l3_page_table;
2309 goto pin_page;
2311 case MMUEXT_PIN_L4_TABLE:
2312 if ( is_pv_32bit_domain(FOREIGNDOM) )
2313 break;
2314 type = PGT_l4_page_table;
2316 pin_page:
2317 rc = xsm_memory_pin_page(d, page);
2318 if ( rc )
2319 break;
2321 /* Ignore pinning of invalid paging levels. */
2322 if ( (op.cmd - MMUEXT_PIN_L1_TABLE) > (CONFIG_PAGING_LEVELS - 1) )
2323 break;
2325 if ( paging_mode_refcounts(FOREIGNDOM) )
2326 break;
2328 okay = get_page_and_type_from_pagenr(mfn, type, FOREIGNDOM);
2329 if ( unlikely(!okay) )
2331 MEM_LOG("Error while pinning mfn %lx", mfn);
2332 break;
2335 if ( unlikely(test_and_set_bit(_PGT_pinned,
2336 &page->u.inuse.type_info)) )
2338 MEM_LOG("Mfn %lx already pinned", mfn);
2339 put_page_and_type(page);
2340 okay = 0;
2341 break;
2344 /* A page is dirtied when its pin status is set. */
2345 paging_mark_dirty(d, mfn);
2347 /* We can race domain destruction (domain_relinquish_resources). */
2348 if ( unlikely(this_cpu(percpu_mm_info).foreign != NULL) )
2350 int drop_ref;
2351 spin_lock(&FOREIGNDOM->page_alloc_lock);
2352 drop_ref = (FOREIGNDOM->is_dying &&
2353 test_and_clear_bit(_PGT_pinned,
2354 &page->u.inuse.type_info));
2355 spin_unlock(&FOREIGNDOM->page_alloc_lock);
2356 if ( drop_ref )
2357 put_page_and_type(page);
2360 break;
2362 case MMUEXT_UNPIN_TABLE:
2363 if ( paging_mode_refcounts(d) )
2364 break;
2366 if ( unlikely(!(okay = get_page_from_pagenr(mfn, d))) )
2368 MEM_LOG("Mfn %lx bad domain (dom=%p)",
2369 mfn, page_get_owner(page));
2371 else if ( likely(test_and_clear_bit(_PGT_pinned,
2372 &page->u.inuse.type_info)) )
2374 put_page_and_type(page);
2375 put_page(page);
2376 /* A page is dirtied when its pin status is cleared. */
2377 paging_mark_dirty(d, mfn);
2379 else
2381 okay = 0;
2382 put_page(page);
2383 MEM_LOG("Mfn %lx not pinned", mfn);
2385 break;
2387 case MMUEXT_NEW_BASEPTR:
2388 okay = new_guest_cr3(mfn);
2389 this_cpu(percpu_mm_info).deferred_ops &= ~DOP_FLUSH_TLB;
2390 break;
2392 #ifdef __x86_64__
2393 case MMUEXT_NEW_USER_BASEPTR: {
2394 unsigned long old_mfn;
2396 if ( mfn != 0 )
2398 if ( paging_mode_refcounts(d) )
2399 okay = get_page_from_pagenr(mfn, d);
2400 else
2401 okay = get_page_and_type_from_pagenr(
2402 mfn, PGT_root_page_table, d);
2403 if ( unlikely(!okay) )
2405 MEM_LOG("Error while installing new mfn %lx", mfn);
2406 break;
2410 old_mfn = pagetable_get_pfn(v->arch.guest_table_user);
2411 v->arch.guest_table_user = pagetable_from_pfn(mfn);
2413 if ( old_mfn != 0 )
2415 if ( paging_mode_refcounts(d) )
2416 put_page(mfn_to_page(old_mfn));
2417 else
2418 put_page_and_type(mfn_to_page(old_mfn));
2421 break;
2423 #endif
2425 case MMUEXT_TLB_FLUSH_LOCAL:
2426 this_cpu(percpu_mm_info).deferred_ops |= DOP_FLUSH_TLB;
2427 break;
2429 case MMUEXT_INVLPG_LOCAL:
2430 if ( !paging_mode_enabled(d)
2431 || paging_invlpg(v, op.arg1.linear_addr) != 0 )
2432 flush_tlb_one_local(op.arg1.linear_addr);
2433 break;
2435 case MMUEXT_TLB_FLUSH_MULTI:
2436 case MMUEXT_INVLPG_MULTI:
2438 unsigned long vmask;
2439 cpumask_t pmask;
2440 if ( unlikely(copy_from_guest(&vmask, op.arg2.vcpumask, 1)) )
2442 okay = 0;
2443 break;
2445 pmask = vcpumask_to_pcpumask(d, vmask);
2446 if ( op.cmd == MMUEXT_TLB_FLUSH_MULTI )
2447 flush_tlb_mask(pmask);
2448 else
2449 flush_tlb_one_mask(pmask, op.arg1.linear_addr);
2450 break;
2453 case MMUEXT_TLB_FLUSH_ALL:
2454 flush_tlb_mask(d->domain_dirty_cpumask);
2455 break;
2457 case MMUEXT_INVLPG_ALL:
2458 flush_tlb_one_mask(d->domain_dirty_cpumask, op.arg1.linear_addr);
2459 break;
2461 case MMUEXT_FLUSH_CACHE:
2462 if ( unlikely(!cache_flush_permitted(d)) )
2464 MEM_LOG("Non-physdev domain tried to FLUSH_CACHE.");
2465 okay = 0;
2467 else
2469 wbinvd();
2471 break;
2473 case MMUEXT_SET_LDT:
2475 unsigned long ptr = op.arg1.linear_addr;
2476 unsigned long ents = op.arg2.nr_ents;
2478 if ( paging_mode_external(d) )
2480 MEM_LOG("ignoring SET_LDT hypercall from external domain");
2481 okay = 0;
2483 else if ( ((ptr & (PAGE_SIZE-1)) != 0) ||
2484 (ents > 8192) ||
2485 !array_access_ok(ptr, ents, LDT_ENTRY_SIZE) )
2487 okay = 0;
2488 MEM_LOG("Bad args to SET_LDT: ptr=%lx, ents=%lx", ptr, ents);
2490 else if ( (v->arch.guest_context.ldt_ents != ents) ||
2491 (v->arch.guest_context.ldt_base != ptr) )
2493 invalidate_shadow_ldt(v);
2494 v->arch.guest_context.ldt_base = ptr;
2495 v->arch.guest_context.ldt_ents = ents;
2496 load_LDT(v);
2497 this_cpu(percpu_mm_info).deferred_ops &= ~DOP_RELOAD_LDT;
2498 if ( ents != 0 )
2499 this_cpu(percpu_mm_info).deferred_ops |= DOP_RELOAD_LDT;
2501 break;
2504 default:
2505 MEM_LOG("Invalid extended pt command 0x%x", op.cmd);
2506 rc = -ENOSYS;
2507 okay = 0;
2508 break;
2511 if ( unlikely(!okay) )
2513 rc = rc ? rc : -EINVAL;
2514 break;
2517 guest_handle_add_offset(uops, 1);
2520 process_deferred_ops();
2522 perfc_add(num_mmuext_ops, i);
2524 out:
2525 /* Add incremental work we have done to the @done output parameter. */
2526 if ( unlikely(!guest_handle_is_null(pdone)) )
2528 done += i;
2529 copy_to_guest(pdone, &done, 1);
2532 return rc;
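/*
 * Preemption sketch (illustrative numbers): when hypercall_preempt_check()
 * fires after i of count ops, the continuation re-enters this function with
 *
 *     count' = (count - i) | MMU_UPDATE_PREEMPTED
 *
 * The prologue strips MMU_UPDATE_PREEMPTED and re-reads the running total
 * from *pdone, so e.g. count=100, i=40 resumes with 60 ops remaining and
 * 'done' already holding the 40 completed earlier. do_mmu_update() below
 * uses the same scheme.
 */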
2535 int do_mmu_update(
2536 XEN_GUEST_HANDLE(mmu_update_t) ureqs,
2537 unsigned int count,
2538 XEN_GUEST_HANDLE(uint) pdone,
2539 unsigned int foreigndom)
2541 struct mmu_update req;
2542 void *va;
2543 unsigned long gpfn, gmfn, mfn;
2544 struct page_info *page;
2545 int rc = 0, okay = 1, i = 0;
2546 unsigned int cmd, done = 0;
2547 struct vcpu *v = current;
2548 struct domain *d = v->domain;
2549 unsigned long type_info;
2550 struct domain_mmap_cache mapcache;
2552 if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
2554 count &= ~MMU_UPDATE_PREEMPTED;
2555 if ( unlikely(!guest_handle_is_null(pdone)) )
2556 (void)copy_from_guest(&done, pdone, 1);
2558 else
2559 perfc_incr(calls_to_mmu_update);
2561 if ( unlikely(!guest_handle_okay(ureqs, count)) )
2563 rc = -EFAULT;
2564 goto out;
2567 if ( !set_foreigndom(foreigndom) )
2569 rc = -ESRCH;
2570 goto out;
2573 domain_mmap_cache_init(&mapcache);
2575 for ( i = 0; i < count; i++ )
2577 if ( hypercall_preempt_check() )
2579 rc = hypercall_create_continuation(
2580 __HYPERVISOR_mmu_update, "hihi",
2581 ureqs, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
2582 break;
2585 if ( unlikely(__copy_from_guest(&req, ureqs, 1) != 0) )
2587 MEM_LOG("Bad __copy_from_guest");
2588 rc = -EFAULT;
2589 break;
2592 cmd = req.ptr & (sizeof(l1_pgentry_t)-1);
2593 okay = 0;
2595 switch ( cmd )
2597 /*
2598 * MMU_NORMAL_PT_UPDATE: Normal update to any level of page table.
2599 * MMU_PT_UPDATE_PRESERVE_AD: As above but also preserve (OR)
2600 * current A/D bits.
2601 */
2602 case MMU_NORMAL_PT_UPDATE:
2603 case MMU_PT_UPDATE_PRESERVE_AD:
2604 rc = xsm_mmu_normal_update(d, req.val);
2605 if ( rc )
2606 break;
2608 req.ptr -= cmd;
2609 gmfn = req.ptr >> PAGE_SHIFT;
2610 mfn = gmfn_to_mfn(d, gmfn);
2612 if ( unlikely(!get_page_from_pagenr(mfn, d)) )
2614 MEM_LOG("Could not get page for normal update");
2615 break;
2618 va = map_domain_page_with_cache(mfn, &mapcache);
2619 va = (void *)((unsigned long)va +
2620 (unsigned long)(req.ptr & ~PAGE_MASK));
2621 page = mfn_to_page(mfn);
2623 switch ( (type_info = page->u.inuse.type_info) & PGT_type_mask )
2625 case PGT_l1_page_table:
2626 case PGT_l2_page_table:
2627 case PGT_l3_page_table:
2628 case PGT_l4_page_table:
2630 if ( paging_mode_refcounts(d) )
2632 MEM_LOG("mmu update on auto-refcounted domain!");
2633 break;
2636 if ( unlikely(!get_page_type(
2637 page, type_info & (PGT_type_mask|PGT_pae_xen_l2))) )
2638 goto not_a_pt;
2640 switch ( type_info & PGT_type_mask )
2642 case PGT_l1_page_table:
2644 l1_pgentry_t l1e = l1e_from_intpte(req.val);
2645 okay = mod_l1_entry(va, l1e, mfn,
2646 cmd == MMU_PT_UPDATE_PRESERVE_AD);
2648 break;
2649 case PGT_l2_page_table:
2651 l2_pgentry_t l2e = l2e_from_intpte(req.val);
2652 okay = mod_l2_entry(va, l2e, mfn, type_info,
2653 cmd == MMU_PT_UPDATE_PRESERVE_AD);
2655 break;
2656 #if CONFIG_PAGING_LEVELS >= 3
2657 case PGT_l3_page_table:
2659 l3_pgentry_t l3e = l3e_from_intpte(req.val);
2660 okay = mod_l3_entry(va, l3e, mfn,
2661 cmd == MMU_PT_UPDATE_PRESERVE_AD);
2663 break;
2664 #endif
2665 #if CONFIG_PAGING_LEVELS >= 4
2666 case PGT_l4_page_table:
2668 l4_pgentry_t l4e = l4e_from_intpte(req.val);
2669 okay = mod_l4_entry(va, l4e, mfn,
2670 cmd == MMU_PT_UPDATE_PRESERVE_AD);
2672 break;
2673 #endif
2676 put_page_type(page);
2678 break;
2680 default:
2681 not_a_pt:
2683 if ( unlikely(!get_page_type(page, PGT_writable_page)) )
2684 break;
2686 perfc_incr(writable_mmu_updates);
2688 okay = paging_write_guest_entry(v, va, req.val, _mfn(mfn));
2690 put_page_type(page);
2692 break;
2695 unmap_domain_page_with_cache(va, &mapcache);
2697 put_page(page);
2698 break;
2700 case MMU_MACHPHYS_UPDATE:
2702 mfn = req.ptr >> PAGE_SHIFT;
2703 gpfn = req.val;
2705 rc = xsm_mmu_machphys_update(d, mfn);
2706 if ( rc )
2707 break;
2709 if ( unlikely(!get_page_from_pagenr(mfn, FOREIGNDOM)) )
2711 MEM_LOG("Could not get page for mach->phys update");
2712 break;
2715 if ( unlikely(paging_mode_translate(FOREIGNDOM)) )
2717 MEM_LOG("Mach-phys update on auto-translate guest");
2718 break;
2721 set_gpfn_from_mfn(mfn, gpfn);
2722 okay = 1;
2724 paging_mark_dirty(FOREIGNDOM, mfn);
2726 put_page(mfn_to_page(mfn));
2727 break;
2729 default:
2730 MEM_LOG("Invalid page update command %x", cmd);
2731 rc = -ENOSYS;
2732 okay = 0;
2733 break;
2736 if ( unlikely(!okay) )
2738 rc = rc ? rc : -EINVAL;
2739 break;
2742 guest_handle_add_offset(ureqs, 1);
2745 process_deferred_ops();
2747 domain_mmap_cache_destroy(&mapcache);
2749 perfc_add(num_page_updates, i);
2751 out:
2752 /* Add incremental work we have done to the @done output parameter. */
2753 if ( unlikely(!guest_handle_is_null(pdone)) )
2755 done += i;
2756 copy_to_guest(pdone, &done, 1);
2759 return rc;
2763 static int create_grant_pte_mapping(
2764 uint64_t pte_addr, l1_pgentry_t nl1e, struct vcpu *v)
2766 int rc = GNTST_okay;
2767 void *va;
2768 unsigned long gmfn, mfn;
2769 struct page_info *page;
2770 u32 type;
2771 l1_pgentry_t ol1e;
2772 struct domain *d = v->domain;
2774 ASSERT(domain_is_locked(d));
2776 adjust_guest_l1e(nl1e, d);
2778 gmfn = pte_addr >> PAGE_SHIFT;
2779 mfn = gmfn_to_mfn(d, gmfn);
2781 if ( unlikely(!get_page_from_pagenr(mfn, current->domain)) )
2783 MEM_LOG("Could not get page for normal update");
2784 return GNTST_general_error;
2787 va = map_domain_page(mfn);
2788 va = (void *)((unsigned long)va + ((unsigned long)pte_addr & ~PAGE_MASK));
2789 page = mfn_to_page(mfn);
2791 type = page->u.inuse.type_info & PGT_type_mask;
2792 if ( (type != PGT_l1_page_table) || !get_page_type(page, type) )
2794 MEM_LOG("Grant map attempted to update a non-L1 page");
2795 rc = GNTST_general_error;
2796 goto failed;
2799 page_lock(page);
2801 ol1e = *(l1_pgentry_t *)va;
2802 if ( !UPDATE_ENTRY(l1, (l1_pgentry_t *)va, ol1e, nl1e, mfn, v, 0) )
2804 page_unlock(page);
2805 put_page_type(page);
2806 rc = GNTST_general_error;
2807 goto failed;
2810 page_unlock(page);
2812 if ( !paging_mode_refcounts(d) )
2813 put_page_from_l1e(ol1e, d);
2815 put_page_type(page);
2817 failed:
2818 unmap_domain_page(va);
2819 put_page(page);
2821 return rc;
2824 static int destroy_grant_pte_mapping(
2825 uint64_t addr, unsigned long frame, struct domain *d)
2827 int rc = GNTST_okay;
2828 void *va;
2829 unsigned long gmfn, mfn;
2830 struct page_info *page;
2831 u32 type;
2832 l1_pgentry_t ol1e;
2834 gmfn = addr >> PAGE_SHIFT;
2835 mfn = gmfn_to_mfn(d, gmfn);
2837 if ( unlikely(!get_page_from_pagenr(mfn, current->domain)) )
2839 MEM_LOG("Could not get page for normal update");
2840 return GNTST_general_error;
2843 va = map_domain_page(mfn);
2844 va = (void *)((unsigned long)va + ((unsigned long)addr & ~PAGE_MASK));
2845 page = mfn_to_page(mfn);
2847 type = page->u.inuse.type_info & PGT_type_mask;
2848 if ( (type != PGT_l1_page_table) || !get_page_type(page, type) )
2850 MEM_LOG("Grant map attempted to update a non-L1 page");
2851 rc = GNTST_general_error;
2852 goto failed;
2855 page_lock(page);
2857 ol1e = *(l1_pgentry_t *)va;
2859 /* Check that the virtual address supplied is actually mapped to frame. */
2860 if ( unlikely((l1e_get_intpte(ol1e) >> PAGE_SHIFT) != frame) )
2862 page_unlock(page);
2863 MEM_LOG("PTE entry %lx for address %"PRIx64" doesn't match frame %lx",
2864 (unsigned long)l1e_get_intpte(ol1e), addr, frame);
2865 put_page_type(page);
2866 rc = GNTST_general_error;
2867 goto failed;
2870 /* Delete pagetable entry. */
2871 if ( unlikely(!UPDATE_ENTRY
2872 (l1,
2873 (l1_pgentry_t *)va, ol1e, l1e_empty(), mfn,
2874 d->vcpu[0] /* Change if we go to per-vcpu shadows. */,
2875 0)) )
2877 page_unlock(page);
2878 MEM_LOG("Cannot delete PTE entry at %p", va);
2879 put_page_type(page);
2880 rc = GNTST_general_error;
2881 goto failed;
2884 page_unlock(page);
2885 put_page_type(page);
2887 failed:
2888 unmap_domain_page(va);
2889 put_page(page);
2890 return rc;
2894 static int create_grant_va_mapping(
2895 unsigned long va, l1_pgentry_t nl1e, struct vcpu *v)
2897 l1_pgentry_t *pl1e, ol1e;
2898 struct domain *d = v->domain;
2899 unsigned long gl1mfn;
2900 struct page_info *l1pg;
2901 int okay;
2903 ASSERT(domain_is_locked(d));
2905 adjust_guest_l1e(nl1e, d);
2907 pl1e = guest_map_l1e(v, va, &gl1mfn);
2908 if ( !pl1e )
2910 MEM_LOG("Could not find L1 PTE for address %lx", va);
2911 return GNTST_general_error;
2913 l1pg = mfn_to_page(gl1mfn);
2914 page_lock(l1pg);
2915 ol1e = *pl1e;
2916 okay = UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, v, 0);
2917 page_unlock(l1pg);
2918 guest_unmap_l1e(v, pl1e);
2919 pl1e = NULL;
2921 if ( !okay )
2922 return GNTST_general_error;
2924 if ( !paging_mode_refcounts(d) )
2925 put_page_from_l1e(ol1e, d);
2927 return GNTST_okay;
2930 static int replace_grant_va_mapping(
2931 unsigned long addr, unsigned long frame, l1_pgentry_t nl1e, struct vcpu *v)
2933 l1_pgentry_t *pl1e, ol1e;
2934 unsigned long gl1mfn;
2935 struct page_info *l1pg;
2936 int rc = 0;
2938 pl1e = guest_map_l1e(v, addr, &gl1mfn);
2939 if ( !pl1e )
2941 MEM_LOG("Could not find L1 PTE for address %lx", addr);
2942 return GNTST_general_error;
2945 l1pg = mfn_to_page(gl1mfn);
2946 page_lock(l1pg);
2947 ol1e = *pl1e;
2949 /* Check that the virtual address supplied is actually mapped to frame. */
2950 if ( unlikely(l1e_get_pfn(ol1e) != frame) )
2952 page_unlock(l1pg);
2953 MEM_LOG("PTE entry %lx for address %lx doesn't match frame %lx",
2954 l1e_get_pfn(ol1e), addr, frame);
2955 rc = GNTST_general_error;
2956 goto out;
2959 /* Delete pagetable entry. */
2960 if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, v, 0)) )
2962 page_unlock(l1pg);
2963 MEM_LOG("Cannot delete PTE entry at %p", (unsigned long *)pl1e);
2964 rc = GNTST_general_error;
2965 goto out;
2968 page_unlock(l1pg);
2970 out:
2971 guest_unmap_l1e(v, pl1e);
2972 return rc;
2975 static int destroy_grant_va_mapping(
2976 unsigned long addr, unsigned long frame, struct vcpu *v)
2978 return replace_grant_va_mapping(addr, frame, l1e_empty(), v);
2981 int create_grant_host_mapping(uint64_t addr, unsigned long frame,
2982 unsigned int flags, unsigned int cache_flags)
2984 l1_pgentry_t pte = l1e_from_pfn(frame, GRANT_PTE_FLAGS);
2986 if ( (flags & GNTMAP_application_map) )
2987 l1e_add_flags(pte,_PAGE_USER);
2988 if ( !(flags & GNTMAP_readonly) )
2989 l1e_add_flags(pte,_PAGE_RW);
2991 l1e_add_flags(pte, cacheattr_to_pte_flags(cache_flags >> 5));
2993 if ( flags & GNTMAP_contains_pte )
2994 return create_grant_pte_mapping(addr, pte, current);
2995 return create_grant_va_mapping(addr, pte, current);
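/*
 * Flag-to-PTE sketch (illustrative combination, not from a particular
 * caller): a read-only grant map with GNTMAP_application_map set becomes
 *
 *     pte = l1e_from_pfn(frame, GRANT_PTE_FLAGS);   /* base grant flags  */
 *     l1e_add_flags(pte, _PAGE_USER);               /* application_map   */
 *     /* GNTMAP_readonly set => _PAGE_RW is NOT added */
 *
 * and is then installed either through the (guest-)physical address of the
 * PTE itself (GNTMAP_contains_pte) or through a guest virtual address.
 */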
2998 int replace_grant_host_mapping(
2999 uint64_t addr, unsigned long frame, uint64_t new_addr, unsigned int flags)
3001 struct vcpu *curr = current;
3002 l1_pgentry_t *pl1e, ol1e;
3003 unsigned long gl1mfn;
3004 struct page_info *l1pg;
3005 int rc;
3007 if ( flags & GNTMAP_contains_pte )
3009 if ( !new_addr )
3010 return destroy_grant_pte_mapping(addr, frame, curr->domain);
3012 MEM_LOG("Unsupported grant table operation");
3013 return GNTST_general_error;
3016 if ( !new_addr )
3017 return destroy_grant_va_mapping(addr, frame, curr);
3019 pl1e = guest_map_l1e(curr, new_addr, &gl1mfn);
3020 if ( !pl1e )
3022 MEM_LOG("Could not find L1 PTE for address %lx",
3023 (unsigned long)new_addr);
3024 return GNTST_general_error;
3027 l1pg = mfn_to_page(gl1mfn);
3028 page_lock(l1pg);
3029 ol1e = *pl1e;
3031 if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, l1e_empty(),
3032 gl1mfn, curr, 0)) )
3034 page_unlock(l1pg);
3035 MEM_LOG("Cannot delete PTE entry at %p", (unsigned long *)pl1e);
3036 guest_unmap_l1e(curr, pl1e);
3037 return GNTST_general_error;
3040 page_unlock(l1pg);
3041 guest_unmap_l1e(curr, pl1e);
3043 rc = replace_grant_va_mapping(addr, frame, ol1e, curr);
3044 if ( rc && !paging_mode_refcounts(curr->domain) )
3045 put_page_from_l1e(ol1e, curr->domain);
3047 return rc;
3050 int steal_page(
3051 struct domain *d, struct page_info *page, unsigned int memflags)
3053 u32 _d, _nd, x, y;
3055 spin_lock(&d->page_alloc_lock);
3057 /*
3058 * The tricky bit: atomically release ownership while there is just one
3059 * benign reference to the page (PGC_allocated). If that reference
3060 * disappears then the deallocation routine will safely spin.
3061 */
3062 _d = pickle_domptr(d);
3063 _nd = page->u.inuse._domain;
3064 y = page->count_info;
3065 do {
3066 x = y;
3067 if ( unlikely((x & (PGC_count_mask|PGC_allocated)) !=
3068 (1 | PGC_allocated)) || unlikely(_nd != _d) )
3070 MEM_LOG("gnttab_transfer: Bad page %p: ed=%p(%u), sd=%p,"
3071 " caf=%08x, taf=%" PRtype_info "\n",
3072 (void *) page_to_mfn(page),
3073 d, d->domain_id, unpickle_domptr(_nd), x,
3074 page->u.inuse.type_info);
3075 spin_unlock(&d->page_alloc_lock);
3076 return -1;
3078 asm volatile (
3079 LOCK_PREFIX "cmpxchg8b %2"
3080 : "=d" (_nd), "=a" (y),
3081 "=m" (*(volatile u64 *)(&page->count_info))
3082 : "0" (_d), "1" (x), "c" (NULL), "b" (x) );
3083 } while (unlikely(_nd != _d) || unlikely(y != x));
3085 /*
3086 * Unlink from 'd'. At least one reference remains (now anonymous), so
3087 * no one else is spinning to try to delete this page from 'd'.
3088 */
3089 if ( !(memflags & MEMF_no_refcount) )
3090 d->tot_pages--;
3091 list_del(&page->list);
3093 spin_unlock(&d->page_alloc_lock);
3095 return 0;
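/*
 * Ownership-transfer sketch: the LOCK cmpxchg8b above treats the
 * (count_info, _domain) pair as a single 64-bit unit. Roughly:
 *
 *     compare: EAX = x (count_info), EDX = pickle_domptr(d)
 *     swap in: EBX = x (unchanged),  ECX = NULL (anonymous owner)
 *
 * i.e. the count is left as-is while the owner field is cleared, and the
 * swap only succeeds while d still owns the page with exactly the single
 * benign PGC_allocated reference checked above.
 */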
3098 int do_update_va_mapping(unsigned long va, u64 val64,
3099 unsigned long flags)
3101 l1_pgentry_t val = l1e_from_intpte(val64);
3102 struct vcpu *v = current;
3103 struct domain *d = v->domain;
3104 l1_pgentry_t *pl1e;
3105 unsigned long vmask, bmap_ptr, gl1mfn;
3106 cpumask_t pmask;
3107 int rc = 0;
3109 perfc_incr(calls_to_update_va);
3111 if ( unlikely(!access_ok(va, 1) && !paging_mode_external(d)) )
3112 return -EINVAL;
3114 rc = xsm_update_va_mapping(d, val);
3115 if ( rc )
3116 return rc;
3118 pl1e = guest_map_l1e(v, va, &gl1mfn);
3120 if ( unlikely(!pl1e || !mod_l1_entry(pl1e, val, gl1mfn, 0)) )
3121 rc = -EINVAL;
3123 if ( pl1e )
3124 guest_unmap_l1e(v, pl1e);
3125 pl1e = NULL;
3127 process_deferred_ops();
3129 switch ( flags & UVMF_FLUSHTYPE_MASK )
3131 case UVMF_TLB_FLUSH:
3132 switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
3134 case UVMF_LOCAL:
3135 flush_tlb_local();
3136 break;
3137 case UVMF_ALL:
3138 flush_tlb_mask(d->domain_dirty_cpumask);
3139 break;
3140 default:
3141 if ( unlikely(!is_pv_32on64_domain(d) ?
3142 get_user(vmask, (unsigned long *)bmap_ptr) :
3143 get_user(vmask, (unsigned int *)bmap_ptr)) )
3144 rc = -EFAULT;
3145 pmask = vcpumask_to_pcpumask(d, vmask);
3146 flush_tlb_mask(pmask);
3147 break;
3149 break;
3151 case UVMF_INVLPG:
3152 switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
3154 case UVMF_LOCAL:
3155 if ( !paging_mode_enabled(d) ||
3156 (paging_invlpg(v, va) != 0) )
3157 flush_tlb_one_local(va);
3158 break;
3159 case UVMF_ALL:
3160 flush_tlb_one_mask(d->domain_dirty_cpumask, va);
3161 break;
3162 default:
3163 if ( unlikely(!is_pv_32on64_domain(d) ?
3164 get_user(vmask, (unsigned long *)bmap_ptr) :
3165 get_user(vmask, (unsigned int *)bmap_ptr)) )
3166 rc = -EFAULT;
3167 pmask = vcpumask_to_pcpumask(d, vmask);
3168 flush_tlb_one_mask(pmask, va);
3169 break;
3171 break;
3174 return rc;
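/*
 * Flag-decoding sketch (illustrative): the low bits of 'flags' select the
 * flush type, and the remaining bits double as either a broadcast selector
 * or a (suitably aligned) guest pointer to a VCPU bitmap, e.g.
 *
 *     flags = UVMF_INVLPG | UVMF_ALL;      /* invalidate va on all dirty CPUs */
 *     flags = UVMF_TLB_FLUSH | (unsigned long)vcpu_bitmap_ptr;
 *                                          /* full flush on the listed VCPUs  */
 *
 * In the pointer form the bitmap is read with get_user() and converted via
 * vcpumask_to_pcpumask() as above.
 */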
3177 int do_update_va_mapping_otherdomain(unsigned long va, u64 val64,
3178 unsigned long flags,
3179 domid_t domid)
3181 int rc;
3183 if ( !set_foreigndom(domid) )
3184 return -ESRCH;
3186 rc = do_update_va_mapping(va, val64, flags);
3188 BUG_ON(this_cpu(percpu_mm_info).deferred_ops);
3189 process_deferred_ops(); /* only to clear foreigndom */
3191 return rc;
3196 /*************************
3197 * Descriptor Tables
3198 */
3200 void destroy_gdt(struct vcpu *v)
3202 int i;
3203 unsigned long pfn;
3205 v->arch.guest_context.gdt_ents = 0;
3206 for ( i = 0; i < FIRST_RESERVED_GDT_PAGE; i++ )
3208 if ( (pfn = l1e_get_pfn(v->arch.perdomain_ptes[i])) != 0 )
3209 put_page_and_type(mfn_to_page(pfn));
3210 l1e_write(&v->arch.perdomain_ptes[i], l1e_empty());
3211 v->arch.guest_context.gdt_frames[i] = 0;
3216 long set_gdt(struct vcpu *v,
3217 unsigned long *frames,
3218 unsigned int entries)
3220 struct domain *d = v->domain;
3221 /* NB. There are 512 8-byte entries per GDT page. */
3222 int i, nr_pages = (entries + 511) / 512;
3223 unsigned long mfn;
3225 if ( entries > FIRST_RESERVED_GDT_ENTRY )
3226 return -EINVAL;
3228 /* Check the pages in the new GDT. */
3229 for ( i = 0; i < nr_pages; i++ )
3231 mfn = frames[i] = gmfn_to_mfn(d, frames[i]);
3232 if ( !mfn_valid(mfn) ||
3233 !get_page_and_type(mfn_to_page(mfn), d, PGT_seg_desc_page) )
3234 goto fail;
3237 /* Tear down the old GDT. */
3238 destroy_gdt(v);
3240 /* Install the new GDT. */
3241 v->arch.guest_context.gdt_ents = entries;
3242 for ( i = 0; i < nr_pages; i++ )
3244 v->arch.guest_context.gdt_frames[i] = frames[i];
3245 l1e_write(&v->arch.perdomain_ptes[i],
3246 l1e_from_pfn(frames[i], __PAGE_HYPERVISOR));
3249 return 0;
3251 fail:
3252 while ( i-- > 0 )
3253 put_page_and_type(mfn_to_page(frames[i]));
3254 return -EINVAL;
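/*
 * Sizing sketch (illustrative numbers): with 512 descriptors per page,
 * entries = 1024 gives nr_pages = (1024 + 511) / 512 = 2, while
 * entries = 1025 would need 3 frames. On failure part-way through the
 * validation loop, the 'while ( i-- > 0 )' unwind above drops the
 * page-and-type references already taken, so no frames are leaked.
 */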
3258 long do_set_gdt(XEN_GUEST_HANDLE(ulong) frame_list, unsigned int entries)
3260 int nr_pages = (entries + 511) / 512;
3261 unsigned long frames[16];
3262 struct vcpu *curr = current;
3263 long ret;
3265 /* Rechecked in set_gdt, but ensures a sane limit for copy_from_guest(). */
3266 if ( entries > FIRST_RESERVED_GDT_ENTRY )
3267 return -EINVAL;
3269 if ( copy_from_guest(frames, frame_list, nr_pages) )
3270 return -EFAULT;
3272 domain_lock(curr->domain);
3274 if ( (ret = set_gdt(curr, frames, entries)) == 0 )
3275 flush_tlb_local();
3277 domain_unlock(curr->domain);
3279 return ret;
3283 long do_update_descriptor(u64 pa, u64 desc)
3285 struct domain *dom = current->domain;
3286 unsigned long gmfn = pa >> PAGE_SHIFT;
3287 unsigned long mfn;
3288 unsigned int offset;
3289 struct desc_struct *gdt_pent, d;
3290 struct page_info *page;
3291 long ret = -EINVAL;
3293 offset = ((unsigned int)pa & ~PAGE_MASK) / sizeof(struct desc_struct);
3295 *(u64 *)&d = desc;
3297 mfn = gmfn_to_mfn(dom, gmfn);
3298 if ( (((unsigned int)pa % sizeof(struct desc_struct)) != 0) ||
3299 !mfn_valid(mfn) ||
3300 !check_descriptor(dom, &d) )
3301 return -EINVAL;
3303 page = mfn_to_page(mfn);
3304 if ( unlikely(!get_page(page, dom)) )
3305 return -EINVAL;
3307 /* Check if the given frame is in use in an unsafe context. */
3308 switch ( page->u.inuse.type_info & PGT_type_mask )
3310 case PGT_seg_desc_page:
3311 if ( unlikely(!get_page_type(page, PGT_seg_desc_page)) )
3312 goto out;
3313 break;
3314 default:
3315 if ( unlikely(!get_page_type(page, PGT_writable_page)) )
3316 goto out;
3317 break;
3320 paging_mark_dirty(dom, mfn);
3322 /* All is good so make the update. */
3323 gdt_pent = map_domain_page(mfn);
3324 atomic_write64((uint64_t *)&gdt_pent[offset], *(uint64_t *)&d);
3325 unmap_domain_page(gdt_pent);
3327 put_page_type(page);
3329 ret = 0; /* success */
3331 out:
3332 put_page(page);
3334 return ret;
3337 typedef struct e820entry e820entry_t;
3338 DEFINE_XEN_GUEST_HANDLE(e820entry_t);
3340 long arch_memory_op(int op, XEN_GUEST_HANDLE(void) arg)
3342 struct page_info *page = NULL;
3343 switch ( op )
3345 case XENMEM_add_to_physmap:
3347 struct xen_add_to_physmap xatp;
3348 unsigned long prev_mfn, mfn = 0, gpfn;
3349 struct domain *d;
3351 if ( copy_from_guest(&xatp, arg, 1) )
3352 return -EFAULT;
3354 if ( xatp.domid == DOMID_SELF )
3356 d = rcu_lock_current_domain();
3358 else
3360 if ( (d = rcu_lock_domain_by_id(xatp.domid)) == NULL )
3361 return -ESRCH;
3362 if ( !IS_PRIV_FOR(current->domain, d) )
3364 rcu_unlock_domain(d);
3365 return -EPERM;
3369 if ( xsm_add_to_physmap(current->domain, d) )
3371 rcu_unlock_domain(d);
3372 return -EPERM;
3375 switch ( xatp.space )
3377 case XENMAPSPACE_shared_info:
3378 if ( xatp.idx == 0 )
3379 mfn = virt_to_mfn(d->shared_info);
3380 break;
3381 case XENMAPSPACE_grant_table:
3382 spin_lock(&d->grant_table->lock);
3384 if ( (xatp.idx >= nr_grant_frames(d->grant_table)) &&
3385 (xatp.idx < max_nr_grant_frames) )
3386 gnttab_grow_table(d, xatp.idx + 1);
3388 if ( xatp.idx < nr_grant_frames(d->grant_table) )
3389 mfn = virt_to_mfn(d->grant_table->shared[xatp.idx]);
3391 spin_unlock(&d->grant_table->lock);
3392 break;
3393 case XENMAPSPACE_mfn:
3395 if ( get_page_from_pagenr(xatp.idx, d) ) {
3396 mfn = xatp.idx;
3397 page = mfn_to_page(mfn);
3399 break;
3401 default:
3402 break;
3405 if ( !paging_mode_translate(d) || (mfn == 0) )
3407 if ( page )
3408 put_page(page);
3409 rcu_unlock_domain(d);
3410 return -EINVAL;
3413 domain_lock(d);
3415 /* Remove previously mapped page if it was present. */
3416 prev_mfn = gmfn_to_mfn(d, xatp.gpfn);
3417 if ( mfn_valid(prev_mfn) )
3419 if ( is_xen_heap_mfn(prev_mfn) )
3420 /* Xen heap frames are simply unhooked from this phys slot. */
3421 guest_physmap_remove_page(d, xatp.gpfn, prev_mfn, 0);
3422 else
3423 /* Normal domain memory is freed, to avoid leaking memory. */
3424 guest_remove_page(d, xatp.gpfn);
3427 /* Unmap from old location, if any. */
3428 gpfn = get_gpfn_from_mfn(mfn);
3429 if ( gpfn != INVALID_M2P_ENTRY )
3430 guest_physmap_remove_page(d, gpfn, mfn, 0);
3432 /* Map at new location. */
3433 guest_physmap_add_page(d, xatp.gpfn, mfn, 0);
3435 domain_unlock(d);
3437 if ( page )
3438 put_page(page);
3440 rcu_unlock_domain(d);
3442 break;
3445 case XENMEM_remove_from_physmap:
3447 struct xen_remove_from_physmap xrfp;
3448 unsigned long mfn;
3449 struct domain *d;
3451 if ( copy_from_guest(&xrfp, arg, 1) )
3452 return -EFAULT;
3454 if ( xrfp.domid == DOMID_SELF )
3456 d = rcu_lock_current_domain();
3458 else
3460 if ( (d = rcu_lock_domain_by_id(xrfp.domid)) == NULL )
3461 return -ESRCH;
3462 if ( !IS_PRIV_FOR(current->domain, d) )
3464 rcu_unlock_domain(d);
3465 return -EPERM;
3469 if ( xsm_remove_from_physmap(current->domain, d) )
3471 rcu_unlock_domain(d);
3472 return -EPERM;
3475 domain_lock(d);
3477 mfn = gmfn_to_mfn(d, xrfp.gpfn);
3479 if ( mfn_valid(mfn) )
3480 guest_physmap_remove_page(d, xrfp.gpfn, mfn, 0);
3482 domain_unlock(d);
3484 rcu_unlock_domain(d);
3486 break;
3489 case XENMEM_set_memory_map:
3491 struct xen_foreign_memory_map fmap;
3492 struct domain *d;
3493 int rc;
3495 if ( copy_from_guest(&fmap, arg, 1) )
3496 return -EFAULT;
3498 if ( fmap.map.nr_entries > ARRAY_SIZE(d->arch.e820) )
3499 return -EINVAL;
3501 if ( fmap.domid == DOMID_SELF )
3503 d = rcu_lock_current_domain();
3505 else
3507 if ( (d = rcu_lock_domain_by_id(fmap.domid)) == NULL )
3508 return -ESRCH;
3509 if ( !IS_PRIV_FOR(current->domain, d) )
3511 rcu_unlock_domain(d);
3512 return -EPERM;
3516 rc = xsm_domain_memory_map(d);
3517 if ( rc )
3519 rcu_unlock_domain(d);
3520 return rc;
3523 rc = copy_from_guest(d->arch.e820, fmap.map.buffer,
3524 fmap.map.nr_entries) ? -EFAULT : 0;
3525 d->arch.nr_e820 = fmap.map.nr_entries;
3527 rcu_unlock_domain(d);
3528 return rc;
3531 case XENMEM_memory_map:
3533 struct xen_memory_map map;
3534 struct domain *d = current->domain;
3536 /* Backwards compatibility. */
3537 if ( d->arch.nr_e820 == 0 )
3538 return -ENOSYS;
3540 if ( copy_from_guest(&map, arg, 1) )
3541 return -EFAULT;
3543 map.nr_entries = min(map.nr_entries, d->arch.nr_e820);
3544 if ( copy_to_guest(map.buffer, d->arch.e820, map.nr_entries) ||
3545 copy_to_guest(arg, &map, 1) )
3546 return -EFAULT;
3548 return 0;
3551 case XENMEM_machine_memory_map:
3553 struct xen_memory_map memmap;
3554 XEN_GUEST_HANDLE(e820entry_t) buffer;
3555 int count;
3556 int rc;
3558 if ( !IS_PRIV(current->domain) )
3559 return -EINVAL;
3561 rc = xsm_machine_memory_map();
3562 if ( rc )
3563 return rc;
3565 if ( copy_from_guest(&memmap, arg, 1) )
3566 return -EFAULT;
3567 if ( memmap.nr_entries < e820.nr_map + 1 )
3568 return -EINVAL;
3570 buffer = guest_handle_cast(memmap.buffer, e820entry_t);
3572 count = min((unsigned int)e820.nr_map, memmap.nr_entries);
3573 if ( copy_to_guest(buffer, e820.map, count) < 0 )
3574 return -EFAULT;
3576 memmap.nr_entries = count;
3578 if ( copy_to_guest(arg, &memmap, 1) )
3579 return -EFAULT;
3581 return 0;
3584 case XENMEM_machphys_mapping:
3586 static const struct xen_machphys_mapping mapping = {
3587 .v_start = MACH2PHYS_VIRT_START,
3588 .v_end = MACH2PHYS_VIRT_END,
3589 .max_mfn = MACH2PHYS_NR_ENTRIES - 1
3590 };
3592 if ( copy_to_guest(arg, &mapping, 1) )
3593 return -EFAULT;
3595 return 0;
3598 default:
3599 return subarch_memory_op(op, arg);
3602 return 0;
3606 /*************************
3607 * Writable Pagetables
3608 */
3610 struct ptwr_emulate_ctxt {
3611 struct x86_emulate_ctxt ctxt;
3612 unsigned long cr2;
3613 l1_pgentry_t pte;
3614 };
3616 static int ptwr_emulated_read(
3617 enum x86_segment seg,
3618 unsigned long offset,
3619 void *p_data,
3620 unsigned int bytes,
3621 struct x86_emulate_ctxt *ctxt)
3623 unsigned int rc;
3624 unsigned long addr = offset;
3626 if ( (rc = copy_from_user(p_data, (void *)addr, bytes)) != 0 )
3628 propagate_page_fault(addr + bytes - rc, 0); /* read fault */
3629 return X86EMUL_EXCEPTION;
3632 return X86EMUL_OKAY;
3635 static int ptwr_emulated_update(
3636 unsigned long addr,
3637 paddr_t old,
3638 paddr_t val,
3639 unsigned int bytes,
3640 unsigned int do_cmpxchg,
3641 struct ptwr_emulate_ctxt *ptwr_ctxt)
3643 unsigned long mfn;
3644 unsigned long unaligned_addr = addr;
3645 struct page_info *page;
3646 l1_pgentry_t pte, ol1e, nl1e, *pl1e;
3647 struct vcpu *v = current;
3648 struct domain *d = v->domain;
3650 /* Only allow naturally-aligned stores within the original %cr2 page. */
3651 if ( unlikely(((addr^ptwr_ctxt->cr2) & PAGE_MASK) || (addr & (bytes-1))) )
3653 MEM_LOG("ptwr_emulate: bad access (cr2=%lx, addr=%lx, bytes=%u)",
3654 ptwr_ctxt->cr2, addr, bytes);
3655 return X86EMUL_UNHANDLEABLE;
3658 /* Turn a sub-word access into a full-word access. */
3659 if ( bytes != sizeof(paddr_t) )
3661 paddr_t full;
3662 unsigned int rc, offset = addr & (sizeof(paddr_t)-1);
3664 /* Align address; read full word. */
3665 addr &= ~(sizeof(paddr_t)-1);
3666 if ( (rc = copy_from_user(&full, (void *)addr, sizeof(paddr_t))) != 0 )
3668 propagate_page_fault(addr+sizeof(paddr_t)-rc, 0); /* read fault */
3669 return X86EMUL_EXCEPTION;
3671 /* Mask out bits provided by caller. */
3672 full &= ~((((paddr_t)1 << (bytes*8)) - 1) << (offset*8));
3673 /* Shift the caller value and OR in the missing bits. */
3674 val &= (((paddr_t)1 << (bytes*8)) - 1);
3675 val <<= (offset)*8;
3676 val |= full;
3677 /* Also fill in missing parts of the cmpxchg old value. */
3678 old &= (((paddr_t)1 << (bytes*8)) - 1);
3679 old <<= (offset)*8;
3680 old |= full;
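/*
 * Worked example (illustrative numbers): a 4-byte write of 0x00000007 at
 * byte offset 4 of an 8-byte PTE whose aligned value reads back as
 * 0x12345678deadb067 becomes, after the widening above,
 *
 *     full &= 0x00000000ffffffff;            -> 0x00000000deadb067
 *     val   = (0x00000007ULL << 32) | full;  -> 0x00000007deadb067
 *
 * ('old' gets the same treatment for the cmpxchg case), so the update
 * below always operates on a full, naturally-aligned PTE.
 */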
3683 pte = ptwr_ctxt->pte;
3684 mfn = l1e_get_pfn(pte);
3685 page = mfn_to_page(mfn);
3687 /* We are looking only for read-only mappings of p.t. pages. */
3688 ASSERT((l1e_get_flags(pte) & (_PAGE_RW|_PAGE_PRESENT)) == _PAGE_PRESENT);
3689 ASSERT(mfn_valid(mfn));
3690 ASSERT((page->u.inuse.type_info & PGT_type_mask) == PGT_l1_page_table);
3691 ASSERT((page->u.inuse.type_info & PGT_count_mask) != 0);
3692 ASSERT(page_get_owner(page) == d);
3694 /* Check the new PTE. */
3695 nl1e = l1e_from_intpte(val);
3696 if ( unlikely(!get_page_from_l1e(nl1e, d)) )
3698 if ( (CONFIG_PAGING_LEVELS >= 3) && is_pv_32bit_domain(d) &&
3699 (bytes == 4) && (unaligned_addr & 4) && !do_cmpxchg &&
3700 (l1e_get_flags(nl1e) & _PAGE_PRESENT) )
3702 /*
3703 * If this is an upper-half write to a PAE PTE then we assume that
3704 * the guest has simply got the two writes the wrong way round. We
3705 * zap the PRESENT bit on the assumption that the bottom half will
3706 * be written immediately after we return to the guest.
3707 */
3708 MEM_LOG("ptwr_emulate: fixing up invalid PAE PTE %"PRIpte,
3709 l1e_get_intpte(nl1e));
3710 l1e_remove_flags(nl1e, _PAGE_PRESENT);
3712 else
3714 MEM_LOG("ptwr_emulate: could not get_page_from_l1e()");
3715 return X86EMUL_UNHANDLEABLE;
3719 adjust_guest_l1e(nl1e, d);
3721 /* Checked successfully: do the update (write or cmpxchg). */
3722 pl1e = map_domain_page(mfn);
3723 pl1e = (l1_pgentry_t *)((unsigned long)pl1e + (addr & ~PAGE_MASK));
3724 if ( do_cmpxchg )
3726 int okay;
3727 intpte_t t = old;
3728 ol1e = l1e_from_intpte(old);
3730 okay = paging_cmpxchg_guest_entry(v, &l1e_get_intpte(*pl1e),
3731 &t, l1e_get_intpte(nl1e), _mfn(mfn));
3732 okay = (okay && t == old);
3734 if ( !okay )
3736 unmap_domain_page(pl1e);
3737 put_page_from_l1e(nl1e, d);
3738 return X86EMUL_CMPXCHG_FAILED;
3741 else
3743 ol1e = *pl1e;
3744 if ( !UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, mfn, v, 0) )
3745 BUG();
3748 trace_ptwr_emulation(addr, nl1e);
3750 unmap_domain_page(pl1e);
3752 /* Finally, drop the old PTE. */
3753 put_page_from_l1e(ol1e, d);
3755 return X86EMUL_OKAY;
3758 static int ptwr_emulated_write(
3759 enum x86_segment seg,
3760 unsigned long offset,
3761 void *p_data,
3762 unsigned int bytes,
3763 struct x86_emulate_ctxt *ctxt)
3765 paddr_t val = 0;
3767 if ( (bytes > sizeof(paddr_t)) || (bytes & (bytes -1)) )
3769 MEM_LOG("ptwr_emulate: bad write size (addr=%lx, bytes=%u)",
3770 offset, bytes);
3771 return X86EMUL_UNHANDLEABLE;
3774 memcpy(&val, p_data, bytes);
3776 return ptwr_emulated_update(
3777 offset, 0, val, bytes, 0,
3778 container_of(ctxt, struct ptwr_emulate_ctxt, ctxt));
3781 static int ptwr_emulated_cmpxchg(
3782 enum x86_segment seg,
3783 unsigned long offset,
3784 void *p_old,
3785 void *p_new,
3786 unsigned int bytes,
3787 struct x86_emulate_ctxt *ctxt)
3789 paddr_t old = 0, new = 0;
3791 if ( (bytes > sizeof(paddr_t)) || (bytes & (bytes -1)) )
3793 MEM_LOG("ptwr_emulate: bad cmpxchg size (addr=%lx, bytes=%u)",
3794 offset, bytes);
3795 return X86EMUL_UNHANDLEABLE;
3798 memcpy(&old, p_old, bytes);
3799 memcpy(&new, p_new, bytes);
3801 return ptwr_emulated_update(
3802 offset, old, new, bytes, 1,
3803 container_of(ctxt, struct ptwr_emulate_ctxt, ctxt));
3806 static struct x86_emulate_ops ptwr_emulate_ops = {
3807 .read = ptwr_emulated_read,
3808 .insn_fetch = ptwr_emulated_read,
3809 .write = ptwr_emulated_write,
3810 .cmpxchg = ptwr_emulated_cmpxchg,
3811 };
3813 /* Write page fault handler: check if guest is trying to modify a PTE. */
3814 int ptwr_do_page_fault(struct vcpu *v, unsigned long addr,
3815 struct cpu_user_regs *regs)
3817 struct domain *d = v->domain;
3818 struct page_info *page;
3819 l1_pgentry_t pte;
3820 struct ptwr_emulate_ctxt ptwr_ctxt;
3821 int rc;
3823 /* Attempt to read the PTE that maps the VA being accessed. */
3824 guest_get_eff_l1e(v, addr, &pte);
3825 page = l1e_get_page(pte);
3827 /* We are looking only for read-only mappings of p.t. pages. */
3828 if ( ((l1e_get_flags(pte) & (_PAGE_PRESENT|_PAGE_RW)) != _PAGE_PRESENT) ||
3829 !mfn_valid(l1e_get_pfn(pte)) ||
3830 ((page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table) ||
3831 ((page->u.inuse.type_info & PGT_count_mask) == 0) ||
3832 (page_get_owner(page) != d) )
3833 goto bail;
3835 ptwr_ctxt.ctxt.regs = regs;
3836 ptwr_ctxt.ctxt.force_writeback = 0;
3837 ptwr_ctxt.ctxt.addr_size = ptwr_ctxt.ctxt.sp_size =
3838 is_pv_32on64_domain(d) ? 32 : BITS_PER_LONG;
3839 ptwr_ctxt.cr2 = addr;
3840 ptwr_ctxt.pte = pte;
3842 page_lock(page);
3843 rc = x86_emulate(&ptwr_ctxt.ctxt, &ptwr_emulate_ops);
3844 page_unlock(page);
3845 if ( rc == X86EMUL_UNHANDLEABLE )
3846 goto bail;
3848 perfc_incr(ptwr_emulations);
3849 return EXCRET_fault_fixed;
3851 bail:
3852 return 0;
3855 void free_xen_pagetable(void *v)
3857 extern int early_boot;
3859 if ( early_boot )
3860 return;
3862 if ( is_xen_heap_page(virt_to_page(v)) )
3863 free_xenheap_page(v);
3864 else
3865 free_domheap_page(virt_to_page(v));
3868 /* Convert to/from superpage-mapping flags for map_pages_to_xen(). */
3869 #define l1f_to_lNf(f) (((f) & _PAGE_PRESENT) ? ((f) | _PAGE_PSE) : (f))
3870 #define lNf_to_l1f(f) (((f) & _PAGE_PRESENT) ? ((f) & ~_PAGE_PSE) : (f))
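/*
 * Example (assuming the usual x86 flag encodings): the conversion just
 * toggles PSE for present entries, e.g.
 *
 *     l1f_to_lNf(PAGE_HYPERVISOR)        == PAGE_HYPERVISOR | _PAGE_PSE
 *     lNf_to_l1f(present_flags | _PAGE_PSE) == present_flags
 *     l1f_to_lNf(0)                      == 0        (non-present: unchanged)
 *
 * which is what lets the 4kB, 2MB and 1GB paths below share one 'flags'
 * argument. (_PAGE_PAT occupies the same bit position as PSE in 4kB
 * entries, which is presumably why the superpage checks below refuse
 * _PAGE_PAT.)
 */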
3872 /*
3873 * map_pages_to_xen() can be called with interrupts disabled:
3874 * * During early bootstrap; or
3875 * * alloc_xenheap_pages() via memguard_guard_range
3876 * In these cases it is safe to use flush_area_local():
3877 * * Because only the local CPU is online; or
3878 * * Because stale TLB entries do not matter for memguard_[un]guard_range().
3879 */
3880 #define flush_area(v,f) (!local_irq_is_enabled() ? \
3881 flush_area_local((const void *)v, f) : \
3882 flush_area_all((const void *)v, f))
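/*
 * Usage sketch (mirrors memguard_init() at the bottom of this file, with
 * 'start' and 'nr' standing in for the caller's range): map a physically
 * contiguous range into Xen's own address space, forcing 4kB mappings so
 * that individual pages can later be guarded:
 *
 *     map_pages_to_xen((unsigned long)__va(start),
 *                      start >> PAGE_SHIFT, nr,
 *                      __PAGE_HYPERVISOR | MAP_SMALL_PAGES);
 *
 * Without MAP_SMALL_PAGES (and with suitable alignment) the code below
 * opportunistically uses 2MB and, on 64-bit with page1gb, 1GB mappings.
 */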
3884 int map_pages_to_xen(
3885 unsigned long virt,
3886 unsigned long mfn,
3887 unsigned long nr_mfns,
3888 unsigned int flags)
3890 l2_pgentry_t *pl2e, ol2e;
3891 l1_pgentry_t *pl1e, ol1e;
3892 unsigned int i;
3894 while ( nr_mfns != 0 )
3896 #ifdef __x86_64__
3897 l3_pgentry_t *pl3e = virt_to_xen_l3e(virt);
3898 l3_pgentry_t ol3e = *pl3e;
3900 if ( cpu_has_page1gb &&
3901 !(((virt >> PAGE_SHIFT) | mfn) &
3902 ((1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1)) &&
3903 nr_mfns >= (1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) &&
3904 !(flags & (_PAGE_PAT | MAP_SMALL_PAGES)) )
3906 /* 1GB-page mapping. */
3907 l3e_write_atomic(pl3e, l3e_from_pfn(mfn, l1f_to_lNf(flags)));
3909 if ( (l3e_get_flags(ol3e) & _PAGE_PRESENT) )
3911 unsigned int flush_flags =
3912 FLUSH_TLB | FLUSH_ORDER(2 * PAGETABLE_ORDER);
3914 if ( l3e_get_flags(ol3e) & _PAGE_PSE )
3916 if ( l3e_get_flags(ol3e) & _PAGE_GLOBAL )
3917 flush_flags |= FLUSH_TLB_GLOBAL;
3918 if ( (l1f_to_lNf(l3e_get_flags(ol3e)) ^ flags) &
3919 PAGE_CACHE_ATTRS )
3920 flush_flags |= FLUSH_CACHE;
3921 flush_area(virt, flush_flags);
3923 else
3925 pl2e = l3e_to_l2e(ol3e);
3926 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
3928 ol2e = pl2e[i];
3929 if ( !(l2e_get_flags(ol2e) & _PAGE_PRESENT) )
3930 continue;
3931 if ( l2e_get_flags(ol2e) & _PAGE_PSE )
3933 if ( l2e_get_flags(ol2e) & _PAGE_GLOBAL )
3934 flush_flags |= FLUSH_TLB_GLOBAL;
3935 if ( (lNf_to_l1f(l2e_get_flags(ol2e)) ^ flags) &
3936 PAGE_CACHE_ATTRS )
3937 flush_flags |= FLUSH_CACHE;
3939 else
3941 unsigned int j;
3943 pl1e = l2e_to_l1e(ol2e);
3944 for ( j = 0; j < L1_PAGETABLE_ENTRIES; j++ )
3946 ol1e = pl1e[j];
3947 if ( l1e_get_flags(ol1e) & _PAGE_GLOBAL )
3948 flush_flags |= FLUSH_TLB_GLOBAL;
3949 if ( (l1e_get_flags(ol1e) ^ flags) &
3950 PAGE_CACHE_ATTRS )
3951 flush_flags |= FLUSH_CACHE;
3955 flush_area(virt, flush_flags);
3956 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
3958 ol2e = pl2e[i];
3959 if ( (l2e_get_flags(ol2e) & _PAGE_PRESENT) &&
3960 !(l2e_get_flags(ol2e) & _PAGE_PSE) )
3961 free_xen_pagetable(l2e_to_l1e(ol2e));
3963 free_xen_pagetable(pl2e);
3967 virt += 1UL << L3_PAGETABLE_SHIFT;
3968 mfn += 1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT);
3969 nr_mfns -= 1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT);
3970 continue;
3973 if ( (l3e_get_flags(ol3e) & _PAGE_PRESENT) &&
3974 (l3e_get_flags(ol3e) & _PAGE_PSE) )
3976 unsigned int flush_flags =
3977 FLUSH_TLB | FLUSH_ORDER(2 * PAGETABLE_ORDER);
3979 /* Skip this PTE if there is no change. */
3980 if ( ((l3e_get_pfn(ol3e) & ~(L2_PAGETABLE_ENTRIES *
3981 L1_PAGETABLE_ENTRIES - 1)) +
3982 (l2_table_offset(virt) << PAGETABLE_ORDER) +
3983 l1_table_offset(virt) == mfn) &&
3984 ((lNf_to_l1f(l3e_get_flags(ol3e)) ^ flags) &
3985 ~(_PAGE_ACCESSED|_PAGE_DIRTY)) == 0 )
3987 /* We can skip to end of L3 superpage if we got a match. */
3988 i = (1 << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) -
3989 (mfn & ((1 << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1));
3990 if ( i > nr_mfns )
3991 i = nr_mfns;
3992 virt += i << PAGE_SHIFT;
3993 mfn += i;
3994 nr_mfns -= i;
3995 continue;
3998 pl2e = alloc_xen_pagetable();
3999 if ( pl2e == NULL )
4000 return -ENOMEM;
4002 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
4003 l2e_write(pl2e + i,
4004 l2e_from_pfn(l3e_get_pfn(ol3e) +
4005 (i << PAGETABLE_ORDER),
4006 l3e_get_flags(ol3e)));
4008 if ( l3e_get_flags(ol3e) & _PAGE_GLOBAL )
4009 flush_flags |= FLUSH_TLB_GLOBAL;
4011 l3e_write_atomic(pl3e, l3e_from_pfn(virt_to_mfn(pl2e),
4012 __PAGE_HYPERVISOR));
4013 flush_area(virt, flush_flags);
4015 #endif
4017 pl2e = virt_to_xen_l2e(virt);
4019 if ( ((((virt>>PAGE_SHIFT) | mfn) & ((1<<PAGETABLE_ORDER)-1)) == 0) &&
4020 (nr_mfns >= (1<<PAGETABLE_ORDER)) &&
4021 !(flags & (_PAGE_PAT|MAP_SMALL_PAGES)) )
4023 /* Super-page mapping. */
4024 ol2e = *pl2e;
4025 l2e_write_atomic(pl2e, l2e_from_pfn(mfn, l1f_to_lNf(flags)));
4027 if ( (l2e_get_flags(ol2e) & _PAGE_PRESENT) )
4029 unsigned int flush_flags =
4030 FLUSH_TLB | FLUSH_ORDER(PAGETABLE_ORDER);
4032 if ( l2e_get_flags(ol2e) & _PAGE_PSE )
4034 if ( l2e_get_flags(ol2e) & _PAGE_GLOBAL )
4035 flush_flags |= FLUSH_TLB_GLOBAL;
4036 if ( (lNf_to_l1f(l2e_get_flags(ol2e)) ^ flags) &
4037 PAGE_CACHE_ATTRS )
4038 flush_flags |= FLUSH_CACHE;
4039 flush_area(virt, flush_flags);
4041 else
4043 pl1e = l2e_to_l1e(ol2e);
4044 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
4046 if ( l1e_get_flags(pl1e[i]) & _PAGE_GLOBAL )
4047 flush_flags |= FLUSH_TLB_GLOBAL;
4048 if ( (l1e_get_flags(pl1e[i]) ^ flags) &
4049 PAGE_CACHE_ATTRS )
4050 flush_flags |= FLUSH_CACHE;
4052 flush_area(virt, flush_flags);
4053 free_xen_pagetable(pl1e);
4057 virt += 1UL << L2_PAGETABLE_SHIFT;
4058 mfn += 1UL << PAGETABLE_ORDER;
4059 nr_mfns -= 1UL << PAGETABLE_ORDER;
4061 else
4063 /* Normal page mapping. */
4064 if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) )
4066 pl1e = alloc_xen_pagetable();
4067 if ( pl1e == NULL )
4068 return -ENOMEM;
4069 clear_page(pl1e);
4070 l2e_write(pl2e, l2e_from_pfn(virt_to_mfn(pl1e),
4071 __PAGE_HYPERVISOR));
4073 else if ( l2e_get_flags(*pl2e) & _PAGE_PSE )
4075 unsigned int flush_flags =
4076 FLUSH_TLB | FLUSH_ORDER(PAGETABLE_ORDER);
4078 /* Skip this PTE if there is no change. */
4079 if ( (((l2e_get_pfn(*pl2e) & ~(L1_PAGETABLE_ENTRIES - 1)) +
4080 l1_table_offset(virt)) == mfn) &&
4081 (((lNf_to_l1f(l2e_get_flags(*pl2e)) ^ flags) &
4082 ~(_PAGE_ACCESSED|_PAGE_DIRTY)) == 0) )
4084 /* We can skip to end of L2 superpage if we got a match. */
4085 i = (1 << (L2_PAGETABLE_SHIFT - PAGE_SHIFT)) -
4086 (mfn & ((1 << (L2_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1));
4087 if ( i > nr_mfns )
4088 i = nr_mfns;
4089 virt += i << L1_PAGETABLE_SHIFT;
4090 mfn += i;
4091 nr_mfns -= i;
4092 goto check_l3;
4095 pl1e = alloc_xen_pagetable();
4096 if ( pl1e == NULL )
4097 return -ENOMEM;
4099 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
4100 l1e_write(&pl1e[i],
4101 l1e_from_pfn(l2e_get_pfn(*pl2e) + i,
4102 lNf_to_l1f(l2e_get_flags(*pl2e))));
4104 if ( l2e_get_flags(*pl2e) & _PAGE_GLOBAL )
4105 flush_flags |= FLUSH_TLB_GLOBAL;
4107 l2e_write_atomic(pl2e, l2e_from_pfn(virt_to_mfn(pl1e),
4108 __PAGE_HYPERVISOR));
4109 flush_area(virt, flush_flags);
4112 pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(virt);
4113 ol1e = *pl1e;
4114 l1e_write_atomic(pl1e, l1e_from_pfn(mfn, flags));
4115 if ( (l1e_get_flags(ol1e) & _PAGE_PRESENT) )
4117 unsigned int flush_flags = FLUSH_TLB | FLUSH_ORDER(0);
4118 if ( l1e_get_flags(ol1e) & _PAGE_GLOBAL )
4119 flush_flags |= FLUSH_TLB_GLOBAL;
4120 if ( (l1e_get_flags(ol1e) ^ flags) & PAGE_CACHE_ATTRS )
4121 flush_flags |= FLUSH_CACHE;
4122 flush_area(virt, flush_flags);
4125 virt += 1UL << L1_PAGETABLE_SHIFT;
4126 mfn += 1UL;
4127 nr_mfns -= 1UL;
4129 if ( (flags == PAGE_HYPERVISOR) &&
4130 ((nr_mfns == 0) ||
4131 ((((virt >> PAGE_SHIFT) | mfn) &
4132 ((1 << PAGETABLE_ORDER) - 1)) == 0)) )
4134 unsigned long base_mfn;
4135 pl1e = l2e_to_l1e(*pl2e);
4136 base_mfn = l1e_get_pfn(*pl1e) & ~(L1_PAGETABLE_ENTRIES - 1);
4137 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++, pl1e++ )
4138 if ( (l1e_get_pfn(*pl1e) != (base_mfn + i)) ||
4139 (l1e_get_flags(*pl1e) != flags) )
4140 break;
4141 if ( i == L1_PAGETABLE_ENTRIES )
4143 ol2e = *pl2e;
4144 l2e_write_atomic(pl2e, l2e_from_pfn(base_mfn,
4145 l1f_to_lNf(flags)));
4146 flush_area(virt - PAGE_SIZE,
4147 FLUSH_TLB_GLOBAL |
4148 FLUSH_ORDER(PAGETABLE_ORDER));
4149 free_xen_pagetable(l2e_to_l1e(ol2e));
4154 check_l3: ;
4155 #ifdef __x86_64__
4156 if ( cpu_has_page1gb &&
4157 (flags == PAGE_HYPERVISOR) &&
4158 ((nr_mfns == 0) ||
4159 !(((virt >> PAGE_SHIFT) | mfn) &
4160 ((1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1))) )
4162 unsigned long base_mfn;
4164 ol3e = *pl3e;
4165 pl2e = l3e_to_l2e(ol3e);
4166 base_mfn = l2e_get_pfn(*pl2e) & ~(L2_PAGETABLE_ENTRIES *
4167 L1_PAGETABLE_ENTRIES - 1);
4168 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++, pl2e++ )
4169 if ( (l2e_get_pfn(*pl2e) !=
4170 (base_mfn + (i << PAGETABLE_ORDER))) ||
4171 (l2e_get_flags(*pl2e) != l1f_to_lNf(flags)) )
4172 break;
4173 if ( i == L2_PAGETABLE_ENTRIES )
4175 l3e_write_atomic(pl3e, l3e_from_pfn(base_mfn,
4176 l1f_to_lNf(flags)));
4177 flush_area(virt - PAGE_SIZE,
4178 FLUSH_TLB_GLOBAL |
4179 FLUSH_ORDER(2*PAGETABLE_ORDER));
4180 free_xen_pagetable(l3e_to_l2e(ol3e));
4183 #endif
4186 return 0;
4189 void destroy_xen_mappings(unsigned long s, unsigned long e)
4191 l2_pgentry_t *pl2e;
4192 l1_pgentry_t *pl1e;
4193 unsigned int i;
4194 unsigned long v = s;
4196 ASSERT((s & ~PAGE_MASK) == 0);
4197 ASSERT((e & ~PAGE_MASK) == 0);
4199 while ( v < e )
4201 #ifdef __x86_64__
4202 l3_pgentry_t *pl3e = virt_to_xen_l3e(v);
4204 if ( !(l3e_get_flags(*pl3e) & _PAGE_PRESENT) )
4206 v += 1UL << L3_PAGETABLE_SHIFT;
4207 v &= ~((1UL << L3_PAGETABLE_SHIFT) - 1);
4208 continue;
4211 if ( l3e_get_flags(*pl3e) & _PAGE_PSE )
4213 if ( l2_table_offset(v) == 0 &&
4214 l1_table_offset(v) == 0 &&
4215 ((e - v) >= (1UL << L3_PAGETABLE_SHIFT)) )
4217 /* PAGE1GB: whole superpage is destroyed. */
4218 l3e_write_atomic(pl3e, l3e_empty());
4219 v += 1UL << L3_PAGETABLE_SHIFT;
4220 continue;
4223 /* PAGE1GB: shatter the superpage and fall through. */
4224 pl2e = alloc_xen_pagetable();
4225 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
4226 l2e_write(pl2e + i,
4227 l2e_from_pfn(l3e_get_pfn(*pl3e) +
4228 (i << PAGETABLE_ORDER),
4229 l3e_get_flags(*pl3e)));
4230 l3e_write_atomic(pl3e, l3e_from_pfn(virt_to_mfn(pl2e),
4231 __PAGE_HYPERVISOR));
4233 #endif
4235 pl2e = virt_to_xen_l2e(v);
4237 if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) )
4239 v += 1UL << L2_PAGETABLE_SHIFT;
4240 v &= ~((1UL << L2_PAGETABLE_SHIFT) - 1);
4241 continue;
4244 if ( l2e_get_flags(*pl2e) & _PAGE_PSE )
4246 if ( (l1_table_offset(v) == 0) &&
4247 ((e-v) >= (1UL << L2_PAGETABLE_SHIFT)) )
4249 /* PSE: whole superpage is destroyed. */
4250 l2e_write_atomic(pl2e, l2e_empty());
4251 v += 1UL << L2_PAGETABLE_SHIFT;
4253 else
4255 /* PSE: shatter the superpage and try again. */
4256 pl1e = alloc_xen_pagetable();
4257 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
4258 l1e_write(&pl1e[i],
4259 l1e_from_pfn(l2e_get_pfn(*pl2e) + i,
4260 l2e_get_flags(*pl2e) & ~_PAGE_PSE));
4261 l2e_write_atomic(pl2e, l2e_from_pfn(virt_to_mfn(pl1e),
4262 __PAGE_HYPERVISOR));
4265 else
4267 /* Ordinary 4kB mapping. */
4268 pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(v);
4269 l1e_write_atomic(pl1e, l1e_empty());
4270 v += PAGE_SIZE;
4272 /* If we are done with the L2E, check if it is now empty. */
4273 if ( (v != e) && (l1_table_offset(v) != 0) )
4274 continue;
4275 pl1e = l2e_to_l1e(*pl2e);
4276 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
4277 if ( l1e_get_intpte(pl1e[i]) != 0 )
4278 break;
4279 if ( i == L1_PAGETABLE_ENTRIES )
4281 /* Empty: zap the L2E and free the L1 page. */
4282 l2e_write_atomic(pl2e, l2e_empty());
4283 flush_area(NULL, FLUSH_TLB_GLOBAL); /* flush before free */
4284 free_xen_pagetable(pl1e);
4288 #ifdef __x86_64__
4289 /* If we are done with the L3E, check if it is now empty. */
4290 if ( (v != e) && (l2_table_offset(v) + l1_table_offset(v) != 0) )
4291 continue;
4292 pl2e = l3e_to_l2e(*pl3e);
4293 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
4294 if ( l2e_get_intpte(pl2e[i]) != 0 )
4295 break;
4296 if ( i == L2_PAGETABLE_ENTRIES )
4298 /* Empty: zap the L3E and free the L2 page. */
4299 l3e_write_atomic(pl3e, l3e_empty());
4300 flush_area(NULL, FLUSH_TLB_GLOBAL); /* flush before free */
4301 free_xen_pagetable(pl2e);
4303 #endif
4306 flush_area(NULL, FLUSH_TLB_GLOBAL);
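/*
 * Teardown summary (descriptive): ranges that only partially cover a 2MB
 * (or, on 64-bit, 1GB) superpage are handled by shattering the superpage
 * into a freshly allocated lower-level table and then zapping individual
 * entries; once an L1 (or L2) table becomes entirely empty it is unhooked,
 * a global TLB flush is issued, and only then is the table freed.
 */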
4309 void __set_fixmap(
4310 enum fixed_addresses idx, unsigned long mfn, unsigned long flags)
4312 BUG_ON(idx >= __end_of_fixed_addresses);
4313 map_pages_to_xen(fix_to_virt(idx), mfn, 1, flags);
4316 #ifdef MEMORY_GUARD
4318 void memguard_init(void)
4320 unsigned long start = max_t(unsigned long, xen_phys_start, 1UL << 20);
4321 map_pages_to_xen(
4322 (unsigned long)__va(start),
4323 start >> PAGE_SHIFT,
4324 (xenheap_phys_end - start) >> PAGE_SHIFT,
4325 __PAGE_HYPERVISOR|MAP_SMALL_PAGES);
4326 #ifdef __x86_64__
4327 BUG_ON(start != xen_phys_start);
4328 map_pages_to_xen(
4329 XEN_VIRT_START,
4330 start >> PAGE_SHIFT,
4331 (__pa(&_end) + PAGE_SIZE - 1 - start) >> PAGE_SHIFT,
4332 __PAGE_HYPERVISOR|MAP_SMALL_PAGES);
4333 #endif
4336 static void __memguard_change_range(void *p, unsigned long l, int guard)
4338 unsigned long _p = (unsigned long)p;
4339 unsigned long _l = (unsigned long)l;
4340 unsigned int flags = __PAGE_HYPERVISOR | MAP_SMALL_PAGES;
4342 /* Ensure we are dealing with a page-aligned whole number of pages. */
4343 ASSERT((_p&~PAGE_MASK) == 0);
4344 ASSERT((_l&~PAGE_MASK) == 0);
4346 if ( guard )
4347 flags &= ~_PAGE_PRESENT;
4349 map_pages_to_xen(
4350 _p, virt_to_maddr(p) >> PAGE_SHIFT, _l >> PAGE_SHIFT, flags);
4353 void memguard_guard_range(void *p, unsigned long l)
4355 __memguard_change_range(p, l, 1);
4358 void memguard_unguard_range(void *p, unsigned long l)
4360 __memguard_change_range(p, l, 0);
4363 #endif
4365 void memguard_guard_stack(void *p)
4367 BUILD_BUG_ON((PRIMARY_STACK_SIZE + PAGE_SIZE) > STACK_SIZE);
4368 p = (void *)((unsigned long)p + STACK_SIZE -
4369 PRIMARY_STACK_SIZE - PAGE_SIZE);
4370 memguard_guard_range(p, PAGE_SIZE);
4373 /*
4374 * Local variables:
4375 * mode: C
4376 * c-set-style: "BSD"
4377 * c-basic-offset: 4
4378 * tab-width: 4
4379 * indent-tabs-mode: nil
4380 * End:
4381 */