direct-io.hg: view of xen/arch/x86/mm.c @ 7794:4f03592bc7f5

Flush writable pagetable state before emulating a PT update. This avoids the
possibility of updating a PTE that is temporarily marked writable by the ptwr
batching logic, which can corrupt page reference counts. Aiee!

Signed-off-by: Keir Fraser <keir@xensource.com>

author    kaf24@firebug.cl.cam.ac.uk
date      Mon Nov 14 18:27:11 2005 +0100 (2005-11-14)
parents   090e44133d40
children  dd754654d427

line source
1 /******************************************************************************
2 * arch/x86/mm.c
3 *
4 * Copyright (c) 2002-2005 K A Fraser
5 * Copyright (c) 2004 Christian Limpach
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
22 /*
23 * A description of the x86 page table API:
24 *
25 * Domains trap to do_mmu_update with a list of update requests.
26 * This is a list of (ptr, val) pairs, where the requested operation
27 * is *ptr = val.
28 *
29 * Reference counting of pages:
30 * ----------------------------
31 * Each page has two refcounts: tot_count and type_count.
32 *
33 * TOT_COUNT is the obvious reference count. It counts all uses of a
34 * physical page frame by a domain, including uses as a page directory,
35 * a page table, or simple mappings via a PTE. This count prevents a
36 * domain from releasing a frame back to the free pool when it still holds
37 * a reference to it.
38 *
39 * TYPE_COUNT is more subtle. A frame can be put to one of three
40 * mutually-exclusive uses: it might be used as a page directory, or a
41 * page table, or it may be mapped writable by the domain [of course, a
42 * frame need not be used in any of these three ways!].
43 * So, type_count is a count of the number of times a frame is being
44 * referred to in its current incarnation. Therefore, a page can only
45 * change its type when its type count is zero.
46 *
47 * Pinning the page type:
48 * ----------------------
49 * The type of a page can be pinned/unpinned with the commands
50 * MMUEXT_[UN]PIN_L?_TABLE. Each page can be pinned exactly once (that is,
51 * pinning is not reference counted, so it can't be nested).
52 * This is useful to prevent a page's type count falling to zero, at which
53 * point safety checks would need to be carried out next time the count
54 * is increased again.
55 *
56 * A further note on writable page mappings:
57 * -----------------------------------------
58 * For simplicity, the count of writable mappings for a page may not
59 * correspond to reality. The 'writable count' is incremented for every
60 * PTE which maps the page with the _PAGE_RW flag set. However, for
61 * write access to be possible the page directory entry must also have
62 * its _PAGE_RW bit set. We do not check this as it complicates the
63 * reference counting considerably [consider the case of multiple
64 * directory entries referencing a single page table, some with the RW
65 * bit set, others not -- it starts getting a bit messy].
66 * In normal use, this simplification shouldn't be a problem.
67 * However, the logic can be added if required.
68 *
69 * One more note on read-only page mappings:
70 * -----------------------------------------
71 * We want domains to be able to map pages for read-only access. The
72 * main reason is that page tables and directories should be readable
73 * by a domain, but it would not be safe for them to be writable.
74 * However, domains have free access to rings 1 & 2 of the Intel
75 * privilege model. In terms of page protection, these are considered
76 * to be part of 'supervisor mode'. The WP bit in CR0 controls whether
77 * read-only restrictions are respected in supervisor mode -- if the
78 * bit is clear then any mapped page is writable.
79 *
80 * We get round this by always setting the WP bit and disallowing
81 * updates to it. This is very unlikely to cause a problem for guest
82 * OSes, which will generally use the WP bit to simplify copy-on-write
83 * implementation (in that case, the OS wants a fault when it writes to
84 * an application-supplied buffer).
85 */
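/*
 * Illustrative guest-side sketch (not part of this file): how a
 * paravirtualised guest might batch PTE updates through the (ptr, val)
 * interface described above. The helpers pte_machine_addr() and
 * new_l1e_for() are hypothetical names for this example; mmu_update_t,
 * DOMID_SELF and the HYPERVISOR_mmu_update() hypercall wrapper are the
 * real guest-facing interfaces.
 *
 *     mmu_update_t req[2];
 *     req[0].ptr = pte_machine_addr(va0);  // machine address of the L1 entry
 *     req[0].val = new_l1e_for(va0);       // desired PTE value
 *     req[1].ptr = pte_machine_addr(va1);
 *     req[1].val = new_l1e_for(va1);
 *     if ( HYPERVISOR_mmu_update(req, 2, NULL, DOMID_SELF) < 0 )
 *         handle_failed_update();          // hypothetical error path
 *
 * Batching several requests into one hypercall amortises the cost of the
 * trap; the hypervisor validates each (ptr, val) pair against the
 * reference-counting rules described above.
 */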
87 #include <xen/config.h>
88 #include <xen/init.h>
89 #include <xen/kernel.h>
90 #include <xen/lib.h>
91 #include <xen/mm.h>
92 #include <xen/sched.h>
93 #include <xen/errno.h>
94 #include <xen/perfc.h>
95 #include <xen/irq.h>
96 #include <xen/softirq.h>
97 #include <xen/domain_page.h>
98 #include <xen/event.h>
99 #include <asm/shadow.h>
100 #include <asm/page.h>
101 #include <asm/flushtlb.h>
102 #include <asm/io.h>
103 #include <asm/uaccess.h>
104 #include <asm/ldt.h>
105 #include <asm/x86_emulate.h>
107 #ifdef VERBOSE
108 #define MEM_LOG(_f, _a...) \
109 printk("DOM%u: (file=mm.c, line=%d) " _f "\n", \
110 current->domain->domain_id , __LINE__ , ## _a )
111 #else
112 #define MEM_LOG(_f, _a...) ((void)0)
113 #endif
115 /*
116 * Both do_mmuext_op() and do_mmu_update():
117 * We steal the m.s.b. of the @count parameter to indicate whether this
118 * invocation is resuming a previously preempted call.
119 */
120 #define MMU_UPDATE_PREEMPTED (~(~0U>>1))
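/*
 * Worked example of the encoding (illustrative): a call preempted with 100
 * requests still outstanding is resumed with a count of
 * (100 | MMU_UPDATE_PREEMPTED); the handler strips the flag with
 * "count &= ~MMU_UPDATE_PREEMPTED" and, if it must yield again, rebuilds the
 * continuation with "(count - i) | MMU_UPDATE_PREEMPTED", exactly as
 * do_mmuext_op() and do_mmu_update() do below.
 */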
122 static void free_l2_table(struct pfn_info *page);
123 static void free_l1_table(struct pfn_info *page);
125 static int mod_l2_entry(l2_pgentry_t *, l2_pgentry_t, unsigned long,
126 unsigned long type);
127 static int mod_l1_entry(l1_pgentry_t *, l1_pgentry_t);
129 /* Used to defer flushing of memory structures. */
130 static struct {
131 #define DOP_FLUSH_TLB (1<<0) /* Flush the TLB. */
132 #define DOP_RELOAD_LDT (1<<1) /* Reload the LDT shadow mapping. */
133 unsigned int deferred_ops;
134 /* If non-NULL, specifies a foreign subject domain for some operations. */
135 struct domain *foreign;
136 } __cacheline_aligned percpu_info[NR_CPUS];
138 /*
139 * Returns the current foreign domain; defaults to the currently-executing
140 * domain if a foreign override hasn't been specified.
141 */
142 #define FOREIGNDOM (percpu_info[smp_processor_id()].foreign ?: current->domain)
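/*
 * Note on the macro above: the GNU "?:" extension evaluates its first operand
 * only once, so FOREIGNDOM yields the per-CPU foreign override if one is set
 * and current->domain otherwise.
 */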
144 /* Private domain structs for DOMID_XEN and DOMID_IO. */
145 static struct domain *dom_xen, *dom_io;
147 /* Frame table and its size in pages. */
148 struct pfn_info *frame_table;
149 unsigned long max_page;
150 unsigned long total_pages;
152 void __init init_frametable(void)
153 {
154 unsigned long nr_pages, page_step, i, pfn;
156 frame_table = (struct pfn_info *)FRAMETABLE_VIRT_START;
158 nr_pages = PFN_UP(max_page * sizeof(*frame_table));
159 page_step = (1 << L2_PAGETABLE_SHIFT) >> PAGE_SHIFT;
161 for ( i = 0; i < nr_pages; i += page_step )
162 {
163 pfn = alloc_boot_pages(min(nr_pages - i, page_step), page_step);
164 if ( pfn == 0 )
165 panic("Not enough memory for frame table\n");
166 map_pages_to_xen(
167 FRAMETABLE_VIRT_START + (i << PAGE_SHIFT),
168 pfn, page_step, PAGE_HYPERVISOR);
169 }
171 memset(frame_table, 0, nr_pages << PAGE_SHIFT);
172 }
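/*
 * Worked sizing example for the loop above (figures are illustrative only):
 * with max_page = 0x100000 (4GB of 4kB frames) and a struct pfn_info of,
 * say, 32 bytes, the table needs 0x100000 * 32 bytes = 32MB, i.e.
 * nr_pages = 8192 frames, mapped in page_step-sized chunks of one L2 entry
 * (2MB or 4MB, depending on the page-table format) at a time.
 */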
174 void arch_init_memory(void)
175 {
176 extern void subarch_init_memory(struct domain *);
178 unsigned long i, pfn, rstart_pfn, rend_pfn;
179 struct pfn_info *page;
181 memset(percpu_info, 0, sizeof(percpu_info));
183 /*
184 * Initialise our DOMID_XEN domain.
185 * Any Xen-heap pages that we will allow to be mapped will have
186 * their domain field set to dom_xen.
187 */
188 dom_xen = alloc_domain();
189 atomic_set(&dom_xen->refcnt, 1);
190 dom_xen->domain_id = DOMID_XEN;
192 /*
193 * Initialise our DOMID_IO domain.
194 * This domain owns I/O pages that are within the range of the pfn_info
195 * array. Mappings occur at the privilege level of the caller.
196 */
197 dom_io = alloc_domain();
198 atomic_set(&dom_io->refcnt, 1);
199 dom_io->domain_id = DOMID_IO;
201 /* First 1MB of RAM is historically marked as I/O. */
202 for ( i = 0; i < 0x100; i++ )
203 {
204 page = &frame_table[i];
205 page->count_info = PGC_allocated | 1;
206 page->u.inuse.type_info = PGT_writable_page | PGT_validated | 1;
207 page_set_owner(page, dom_io);
208 }
210 /* Any areas not specified as RAM by the e820 map are considered I/O. */
211 for ( i = 0, pfn = 0; i < e820.nr_map; i++ )
212 {
213 if ( e820.map[i].type != E820_RAM )
214 continue;
215 /* Every page from cursor to start of next RAM region is I/O. */
216 rstart_pfn = PFN_UP(e820.map[i].addr);
217 rend_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
218 while ( pfn < rstart_pfn )
219 {
220 BUG_ON(!pfn_valid(pfn));
221 page = &frame_table[pfn++];
222 page->count_info = PGC_allocated | 1;
223 page->u.inuse.type_info = PGT_writable_page | PGT_validated | 1;
224 page_set_owner(page, dom_io);
225 }
226 /* Skip the RAM region. */
227 pfn = rend_pfn;
228 }
229 BUG_ON(pfn != max_page);
231 subarch_init_memory(dom_xen);
232 }
234 void write_ptbase(struct vcpu *v)
235 {
236 write_cr3(pagetable_get_paddr(v->arch.monitor_table));
237 }
239 void invalidate_shadow_ldt(struct vcpu *v)
240 {
241 int i;
242 unsigned long pfn;
243 struct pfn_info *page;
245 if ( v->arch.shadow_ldt_mapcnt == 0 )
246 return;
248 v->arch.shadow_ldt_mapcnt = 0;
250 for ( i = 16; i < 32; i++ )
251 {
252 pfn = l1e_get_pfn(v->arch.perdomain_ptes[i]);
253 if ( pfn == 0 ) continue;
254 v->arch.perdomain_ptes[i] = l1e_empty();
255 page = &frame_table[pfn];
256 ASSERT_PAGE_IS_TYPE(page, PGT_ldt_page);
257 ASSERT_PAGE_IS_DOMAIN(page, v->domain);
258 put_page_and_type(page);
259 }
261 /* Dispose of the (now possibly invalid) mappings from the TLB. */
262 percpu_info[v->processor].deferred_ops |= DOP_FLUSH_TLB | DOP_RELOAD_LDT;
263 }
266 static int alloc_segdesc_page(struct pfn_info *page)
267 {
268 struct desc_struct *descs;
269 int i;
271 descs = map_domain_page(page_to_pfn(page));
273 for ( i = 0; i < 512; i++ )
274 if ( unlikely(!check_descriptor(&descs[i])) )
275 goto fail;
277 unmap_domain_page(descs);
278 return 1;
280 fail:
281 unmap_domain_page(descs);
282 return 0;
283 }
286 /* Map shadow page at offset @off. */
287 int map_ldt_shadow_page(unsigned int off)
288 {
289 struct vcpu *v = current;
290 struct domain *d = v->domain;
291 unsigned long gpfn, gmfn;
292 l1_pgentry_t l1e, nl1e;
293 unsigned long gva = v->arch.guest_context.ldt_base + (off << PAGE_SHIFT);
294 int res;
296 #if defined(__x86_64__)
297 /* If in user mode, switch to kernel mode just to read LDT mapping. */
298 extern void toggle_guest_mode(struct vcpu *);
299 int user_mode = !(v->arch.flags & TF_kernel_mode);
300 #define TOGGLE_MODE() if ( user_mode ) toggle_guest_mode(v)
301 #elif defined(__i386__)
302 #define TOGGLE_MODE() ((void)0)
303 #endif
305 BUG_ON(unlikely(in_irq()));
307 shadow_sync_va(v, gva);
309 TOGGLE_MODE();
310 __copy_from_user(&l1e, &linear_pg_table[l1_linear_offset(gva)],
311 sizeof(l1e));
312 TOGGLE_MODE();
314 if ( unlikely(!(l1e_get_flags(l1e) & _PAGE_PRESENT)) )
315 return 0;
317 gpfn = l1e_get_pfn(l1e);
318 gmfn = __gpfn_to_mfn(d, gpfn);
319 if ( unlikely(!VALID_MFN(gmfn)) )
320 return 0;
322 res = get_page_and_type(&frame_table[gmfn], d, PGT_ldt_page);
324 if ( !res && unlikely(shadow_mode_refcounts(d)) )
325 {
326 shadow_lock(d);
327 shadow_remove_all_write_access(d, gpfn, gmfn);
328 res = get_page_and_type(&frame_table[gmfn], d, PGT_ldt_page);
329 shadow_unlock(d);
330 }
332 if ( unlikely(!res) )
333 return 0;
335 nl1e = l1e_from_pfn(gmfn, l1e_get_flags(l1e) | _PAGE_RW);
337 v->arch.perdomain_ptes[off + 16] = nl1e;
338 v->arch.shadow_ldt_mapcnt++;
340 return 1;
341 }
344 static int get_page_from_pagenr(unsigned long page_nr, struct domain *d)
345 {
346 struct pfn_info *page = &frame_table[page_nr];
348 if ( unlikely(!pfn_valid(page_nr)) || unlikely(!get_page(page, d)) )
349 {
350 MEM_LOG("Could not get page ref for pfn %lx", page_nr);
351 return 0;
352 }
354 return 1;
355 }
358 static int get_page_and_type_from_pagenr(unsigned long page_nr,
359 unsigned long type,
360 struct domain *d)
361 {
362 struct pfn_info *page = &frame_table[page_nr];
364 if ( unlikely(!get_page_from_pagenr(page_nr, d)) )
365 return 0;
367 if ( unlikely(!get_page_type(page, type)) )
368 {
369 put_page(page);
370 return 0;
371 }
373 return 1;
374 }
376 /*
377 * We allow root tables to map each other (a.k.a. linear page tables). It
378 * needs some special care with reference counts and access permissions:
379 * 1. The mapping entry must be read-only, or the guest may get write access
380 * to its own PTEs.
381 * 2. We must only bump the reference counts for an *already validated*
382 * L2 table, or we can end up in a deadlock in get_page_type(), waiting on
383 * a validation that cannot finish until the one we are performing does.
384 * 3. We only need to increment the reference counts for the mapped page
385 * frame if it is mapped by a different root table. This is sufficient and
386 * also necessary to allow validation of a root table mapping itself.
387 */
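/*
 * Concrete illustration of rules 1 and 3 (the frame number is made up): if a
 * guest root table lives in frame 0x1234, then installing something like
 *
 *     l4e_from_pfn(0x1234, _PAGE_PRESENT | _PAGE_ACCESSED)
 *
 * (on a 4-level build) maps the table into itself read-only; re_pfn == pfn
 * in get_linear_pagetable() below, so no extra type reference is taken.
 * Pointing the slot at a *different*, already-validated root table instead
 * bumps that table's type count via the cmpxchg loop below.
 */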
388 static int
389 get_linear_pagetable(
390 root_pgentry_t re, unsigned long re_pfn, struct domain *d)
391 {
392 unsigned long x, y;
393 struct pfn_info *page;
394 unsigned long pfn;
396 ASSERT( !shadow_mode_refcounts(d) );
398 if ( (root_get_flags(re) & _PAGE_RW) )
399 {
400 MEM_LOG("Attempt to create linear p.t. with write perms");
401 return 0;
402 }
404 if ( (pfn = root_get_pfn(re)) != re_pfn )
405 {
406 /* Make sure the mapped frame belongs to the correct domain. */
407 if ( unlikely(!get_page_from_pagenr(pfn, d)) )
408 return 0;
410 /*
411 * Make sure that the mapped frame is an already-validated L2 table.
412 * If so, atomically increment the count (checking for overflow).
413 */
414 page = &frame_table[pfn];
415 y = page->u.inuse.type_info;
416 do {
417 x = y;
418 if ( unlikely((x & PGT_count_mask) == PGT_count_mask) ||
419 unlikely((x & (PGT_type_mask|PGT_validated)) !=
420 (PGT_root_page_table|PGT_validated)) )
421 {
422 put_page(page);
423 return 0;
424 }
425 }
426 while ( (y = cmpxchg(&page->u.inuse.type_info, x, x + 1)) != x );
427 }
429 return 1;
430 }
432 int
433 get_page_from_l1e(
434 l1_pgentry_t l1e, struct domain *d)
435 {
436 unsigned long mfn = l1e_get_pfn(l1e);
437 struct pfn_info *page = &frame_table[mfn];
438 int okay;
439 extern int domain_iomem_in_pfn(struct domain *d, unsigned long pfn);
441 if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) )
442 return 1;
444 if ( unlikely(l1e_get_flags(l1e) & L1_DISALLOW_MASK) )
445 {
446 MEM_LOG("Bad L1 flags %x", l1e_get_flags(l1e) & L1_DISALLOW_MASK);
447 return 0;
448 }
450 if ( unlikely(!pfn_valid(mfn)) ||
451 unlikely(page_get_owner(page) == dom_io) )
452 {
453 /* DOMID_IO reverts to caller for privilege checks. */
454 if ( d == dom_io )
455 d = current->domain;
457 if ( (!IS_PRIV(d)) &&
458 (!IS_CAPABLE_PHYSDEV(d) || !domain_iomem_in_pfn(d, mfn)) )
459 {
460 MEM_LOG("Non-privileged attempt to map I/O space %08lx", mfn);
461 return 0;
462 }
464 /* No reference counting for out-of-range I/O pages. */
465 if ( !pfn_valid(mfn) )
466 return 1;
468 d = dom_io;
469 }
471 okay = ((l1e_get_flags(l1e) & _PAGE_RW) ?
472 get_page_and_type(page, d, PGT_writable_page) :
473 get_page(page, d));
474 if ( !okay )
475 {
476 MEM_LOG("Error getting mfn %lx (pfn %lx) from L1 entry %" PRIpte
477 " for dom%d",
478 mfn, get_pfn_from_mfn(mfn), l1e_get_intpte(l1e), d->domain_id);
479 }
481 return okay;
482 }
485 /* NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. */
486 static int
487 get_page_from_l2e(
488 l2_pgentry_t l2e, unsigned long pfn,
489 struct domain *d, unsigned long vaddr)
490 {
491 int rc;
493 ASSERT(!shadow_mode_refcounts(d));
495 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
496 return 1;
498 if ( unlikely((l2e_get_flags(l2e) & L2_DISALLOW_MASK)) )
499 {
500 MEM_LOG("Bad L2 flags %x", l2e_get_flags(l2e) & L2_DISALLOW_MASK);
501 return 0;
502 }
504 vaddr >>= L2_PAGETABLE_SHIFT;
505 vaddr <<= PGT_va_shift;
506 rc = get_page_and_type_from_pagenr(
507 l2e_get_pfn(l2e), PGT_l1_page_table | vaddr, d);
509 #if CONFIG_PAGING_LEVELS == 2
510 if ( unlikely(!rc) )
511 rc = get_linear_pagetable(l2e, pfn, d);
512 #endif
513 return rc;
514 }
517 #if CONFIG_PAGING_LEVELS >= 3
519 static int
520 get_page_from_l3e(
521 l3_pgentry_t l3e, unsigned long pfn,
522 struct domain *d, unsigned long vaddr)
523 {
524 ASSERT( !shadow_mode_refcounts(d) );
526 int rc;
528 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
529 return 1;
531 if ( unlikely((l3e_get_flags(l3e) & L3_DISALLOW_MASK)) )
532 {
533 MEM_LOG("Bad L3 flags %x", l3e_get_flags(l3e) & L3_DISALLOW_MASK);
534 return 0;
535 }
537 vaddr >>= L3_PAGETABLE_SHIFT;
538 vaddr <<= PGT_va_shift;
539 rc = get_page_and_type_from_pagenr(
540 l3e_get_pfn(l3e),
541 PGT_l2_page_table | vaddr, d);
542 #if CONFIG_PAGING_LEVELS == 3
543 if ( unlikely(!rc) )
544 rc = get_linear_pagetable(l3e, pfn, d);
545 #endif
546 return rc;
547 }
549 #endif /* 3 level */
551 #if CONFIG_PAGING_LEVELS >= 4
553 static int
554 get_page_from_l4e(
555 l4_pgentry_t l4e, unsigned long pfn,
556 struct domain *d, unsigned long vaddr)
557 {
558 int rc;
560 ASSERT( !shadow_mode_refcounts(d) );
562 if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
563 return 1;
565 if ( unlikely((l4e_get_flags(l4e) & L4_DISALLOW_MASK)) )
566 {
567 MEM_LOG("Bad L4 flags %x", l4e_get_flags(l4e) & L4_DISALLOW_MASK);
568 return 0;
569 }
571 vaddr >>= L4_PAGETABLE_SHIFT;
572 vaddr <<= PGT_va_shift;
573 rc = get_page_and_type_from_pagenr(
574 l4e_get_pfn(l4e),
575 PGT_l3_page_table | vaddr, d);
577 if ( unlikely(!rc) )
578 rc = get_linear_pagetable(l4e, pfn, d);
580 return rc;
581 }
583 #endif /* 4 level */
586 void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d)
587 {
588 unsigned long pfn = l1e_get_pfn(l1e);
589 struct pfn_info *page = &frame_table[pfn];
590 struct domain *e;
591 struct vcpu *v;
593 if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) || !pfn_valid(pfn) )
594 return;
596 e = page_get_owner(page);
597 if ( unlikely(e != d) )
598 {
599 /*
600 * Unmap a foreign page that may have been mapped via a grant table.
601 * Note that this can fail for a privileged domain that can map foreign
602 * pages via MMUEXT_SET_FOREIGNDOM. Such domains can have some mappings
603 * counted via a grant entry and some counted directly in the page
604 * structure's reference count. Note that reference counts won't get
605 * dangerously confused as long as we always try to decrement the
606 * grant entry first. We may end up with a mismatch between which
607 * mappings and which unmappings are counted via the grant entry, but
608 * really it doesn't matter as privileged domains have carte blanche.
609 */
610 if (likely(gnttab_check_unmap(e, d, pfn,
611 !(l1e_get_flags(l1e) & _PAGE_RW))))
612 return;
613 /* Assume this mapping was made via MMUEXT_SET_FOREIGNDOM... */
614 }
616 if ( l1e_get_flags(l1e) & _PAGE_RW )
617 {
618 put_page_and_type(page);
619 }
620 else
621 {
622 /* We expect this to be rare, so we blow away the entire shadow LDT. */
623 if ( unlikely(((page->u.inuse.type_info & PGT_type_mask) ==
624 PGT_ldt_page)) &&
625 unlikely(((page->u.inuse.type_info & PGT_count_mask) != 0)) &&
626 (d == e) )
627 {
628 for_each_vcpu ( d, v )
629 invalidate_shadow_ldt(v);
630 }
631 put_page(page);
632 }
633 }
636 /*
637 * NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'.
638 * Note also that this automatically deals correctly with linear p.t.'s.
639 */
640 static void put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
641 {
642 if ( (l2e_get_flags(l2e) & _PAGE_PRESENT) &&
643 (l2e_get_pfn(l2e) != pfn) )
644 put_page_and_type(&frame_table[l2e_get_pfn(l2e)]);
645 }
648 #if CONFIG_PAGING_LEVELS >= 3
650 static void put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn)
651 {
652 if ( (l3e_get_flags(l3e) & _PAGE_PRESENT) &&
653 (l3e_get_pfn(l3e) != pfn) )
654 put_page_and_type(&frame_table[l3e_get_pfn(l3e)]);
655 }
657 #endif
659 #if CONFIG_PAGING_LEVELS >= 4
661 static void put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn)
662 {
663 if ( (l4e_get_flags(l4e) & _PAGE_PRESENT) &&
664 (l4e_get_pfn(l4e) != pfn) )
665 put_page_and_type(&frame_table[l4e_get_pfn(l4e)]);
666 }
668 #endif
671 static int alloc_l1_table(struct pfn_info *page)
672 {
673 struct domain *d = page_get_owner(page);
674 unsigned long pfn = page_to_pfn(page);
675 l1_pgentry_t *pl1e;
676 int i;
678 ASSERT(!shadow_mode_refcounts(d));
680 pl1e = map_domain_page(pfn);
682 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
683 if ( is_guest_l1_slot(i) &&
684 unlikely(!get_page_from_l1e(pl1e[i], d)) )
685 goto fail;
687 unmap_domain_page(pl1e);
688 return 1;
690 fail:
691 MEM_LOG("Failure in alloc_l1_table: entry %d", i);
692 while ( i-- > 0 )
693 if ( is_guest_l1_slot(i) )
694 put_page_from_l1e(pl1e[i], d);
696 unmap_domain_page(pl1e);
697 return 0;
698 }
700 #ifdef CONFIG_X86_PAE
701 static int create_pae_xen_mappings(l3_pgentry_t *pl3e)
702 {
703 struct pfn_info *page;
704 l2_pgentry_t *pl2e;
705 l3_pgentry_t l3e3;
706 int i;
708 pl3e = (l3_pgentry_t *)((unsigned long)pl3e & PAGE_MASK);
710 /* 3rd L3 slot contains L2 with Xen-private mappings. It *must* exist. */
711 l3e3 = pl3e[3];
712 if ( !(l3e_get_flags(l3e3) & _PAGE_PRESENT) )
713 {
714 MEM_LOG("PAE L3 3rd slot is empty");
715 return 0;
716 }
718 /*
719 * The Xen-private mappings include linear mappings. The L2 thus cannot
720 * be shared by multiple L3 tables. The test here is adequate because:
721 * 1. Cannot appear in slots != 3 because the page would then have an
722 * unknown va backpointer, which get_page_type() explicitly disallows.
723 * 2. Cannot appear in another page table's L3:
724 * a. alloc_l3_table() calls this function and this check will fail
725 * b. mod_l3_entry() disallows updates to slot 3 in an existing table
726 */
727 page = l3e_get_page(l3e3);
728 BUG_ON(page->u.inuse.type_info & PGT_pinned);
729 BUG_ON((page->u.inuse.type_info & PGT_count_mask) == 0);
730 if ( (page->u.inuse.type_info & PGT_count_mask) != 1 )
731 {
732 MEM_LOG("PAE L3 3rd slot is shared");
733 return 0;
734 }
736 /* Xen private mappings. */
737 pl2e = map_domain_page(l3e_get_pfn(l3e3));
738 memcpy(&pl2e[L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1)],
739 &idle_pg_table_l2[L2_PAGETABLE_FIRST_XEN_SLOT],
740 L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
741 for ( i = 0; i < (PERDOMAIN_MBYTES >> (L2_PAGETABLE_SHIFT - 20)); i++ )
742 pl2e[l2_table_offset(PERDOMAIN_VIRT_START) + i] =
743 l2e_from_page(
744 virt_to_page(page_get_owner(page)->arch.mm_perdomain_pt) + i,
745 __PAGE_HYPERVISOR);
746 for ( i = 0; i < (LINEARPT_MBYTES >> (L2_PAGETABLE_SHIFT - 20)); i++ )
747 pl2e[l2_table_offset(LINEAR_PT_VIRT_START) + i] =
748 (l3e_get_flags(pl3e[i]) & _PAGE_PRESENT) ?
749 l2e_from_pfn(l3e_get_pfn(pl3e[i]), __PAGE_HYPERVISOR) :
750 l2e_empty();
751 unmap_domain_page(pl2e);
753 return 1;
754 }
756 static inline int l1_backptr(
757 unsigned long *backptr, unsigned long offset_in_l2, unsigned long l2_type)
758 {
759 unsigned long l2_backptr = l2_type & PGT_va_mask;
760 BUG_ON(l2_backptr == PGT_va_unknown);
761 if ( l2_backptr == PGT_va_mutable )
762 return 0;
763 *backptr =
764 ((l2_backptr >> PGT_va_shift) << L3_PAGETABLE_SHIFT) |
765 (offset_in_l2 << L2_PAGETABLE_SHIFT);
766 return 1;
767 }
769 #elif CONFIG_X86_64
770 # define create_pae_xen_mappings(pl3e) (1)
772 static inline int l1_backptr(
773 unsigned long *backptr, unsigned long offset_in_l2, unsigned long l2_type)
774 {
775 unsigned long l2_backptr = l2_type & PGT_va_mask;
776 BUG_ON(l2_backptr == PGT_va_unknown);
778 *backptr = ((l2_backptr >> PGT_va_shift) << L3_PAGETABLE_SHIFT) |
779 (offset_in_l2 << L2_PAGETABLE_SHIFT);
780 return 1;
781 }
783 static inline int l2_backptr(
784 unsigned long *backptr, unsigned long offset_in_l3, unsigned long l3_type)
785 {
786 unsigned long l3_backptr = l3_type & PGT_va_mask;
787 BUG_ON(l3_backptr == PGT_va_unknown);
789 *backptr = ((l3_backptr >> PGT_va_shift) << L4_PAGETABLE_SHIFT) |
790 (offset_in_l3 << L3_PAGETABLE_SHIFT);
791 return 1;
792 }
794 static inline int l3_backptr(
795 unsigned long *backptr, unsigned long offset_in_l4, unsigned long l4_type)
796 {
797 unsigned long l4_backptr = l4_type & PGT_va_mask;
798 BUG_ON(l4_backptr == PGT_va_unknown);
800 *backptr = (offset_in_l4 << L4_PAGETABLE_SHIFT);
801 return 1;
802 }
803 #else
804 # define create_pae_xen_mappings(pl3e) (1)
805 # define l1_backptr(bp,l2o,l2t) \
806 ({ *(bp) = (unsigned long)(l2o) << L2_PAGETABLE_SHIFT; 1; })
807 #endif
809 static int alloc_l2_table(struct pfn_info *page, unsigned long type)
810 {
811 struct domain *d = page_get_owner(page);
812 unsigned long pfn = page_to_pfn(page);
813 unsigned long vaddr;
814 l2_pgentry_t *pl2e;
815 int i;
817 /* See the code in shadow_promote() to understand why this is here. */
818 if ( (PGT_base_page_table == PGT_l2_page_table) &&
819 unlikely(shadow_mode_refcounts(d)) )
820 return 1;
821 ASSERT(!shadow_mode_refcounts(d));
823 pl2e = map_domain_page(pfn);
825 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
826 {
827 if ( !l1_backptr(&vaddr, i, type) )
828 goto fail;
829 if ( is_guest_l2_slot(type, i) &&
830 unlikely(!get_page_from_l2e(pl2e[i], pfn, d, vaddr)) )
831 goto fail;
832 }
834 #if CONFIG_PAGING_LEVELS == 2
835 /* Xen private mappings. */
836 memcpy(&pl2e[L2_PAGETABLE_FIRST_XEN_SLOT],
837 &idle_pg_table[L2_PAGETABLE_FIRST_XEN_SLOT],
838 L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
839 pl2e[l2_table_offset(LINEAR_PT_VIRT_START)] =
840 l2e_from_pfn(pfn, __PAGE_HYPERVISOR);
841 pl2e[l2_table_offset(PERDOMAIN_VIRT_START)] =
842 l2e_from_page(
843 virt_to_page(page_get_owner(page)->arch.mm_perdomain_pt),
844 __PAGE_HYPERVISOR);
845 #endif
847 unmap_domain_page(pl2e);
848 return 1;
850 fail:
851 MEM_LOG("Failure in alloc_l2_table: entry %d", i);
852 while ( i-- > 0 )
853 if ( is_guest_l2_slot(type, i) )
854 put_page_from_l2e(pl2e[i], pfn);
856 unmap_domain_page(pl2e);
857 return 0;
858 }
861 #if CONFIG_PAGING_LEVELS >= 3
862 static int alloc_l3_table(struct pfn_info *page, unsigned long type)
863 {
864 struct domain *d = page_get_owner(page);
865 unsigned long pfn = page_to_pfn(page);
866 unsigned long vaddr;
867 l3_pgentry_t *pl3e;
868 int i;
870 ASSERT(!shadow_mode_refcounts(d));
872 #ifdef CONFIG_X86_PAE
873 if ( pfn >= 0x100000 )
874 {
875 MEM_LOG("PAE pgd must be below 4GB (0x%lx >= 0x100000)", pfn);
876 return 0;
877 }
878 #endif
880 pl3e = map_domain_page(pfn);
881 for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
882 {
883 #if CONFIG_PAGING_LEVELS >= 4
884 if ( !l2_backptr(&vaddr, i, type) )
885 goto fail;
886 #else
887 vaddr = (unsigned long)i << L3_PAGETABLE_SHIFT;
888 #endif
889 if ( is_guest_l3_slot(i) &&
890 unlikely(!get_page_from_l3e(pl3e[i], pfn, d, vaddr)) )
891 goto fail;
892 }
894 if ( !create_pae_xen_mappings(pl3e) )
895 goto fail;
897 unmap_domain_page(pl3e);
898 return 1;
900 fail:
901 while ( i-- > 0 )
902 if ( is_guest_l3_slot(i) )
903 put_page_from_l3e(pl3e[i], pfn);
905 unmap_domain_page(pl3e);
906 return 0;
907 }
908 #else
909 #define alloc_l3_table(page, type) (0)
910 #endif
912 #if CONFIG_PAGING_LEVELS >= 4
913 static int alloc_l4_table(struct pfn_info *page, unsigned long type)
914 {
915 struct domain *d = page_get_owner(page);
916 unsigned long pfn = page_to_pfn(page);
917 l4_pgentry_t *pl4e = page_to_virt(page);
918 unsigned long vaddr;
919 int i;
921 /* See the code in shadow_promote() to understand why this is here. */
922 if ( (PGT_base_page_table == PGT_l4_page_table) &&
923 shadow_mode_refcounts(d) )
924 return 1;
925 ASSERT(!shadow_mode_refcounts(d));
927 for ( i = 0; i < L4_PAGETABLE_ENTRIES; i++ )
928 {
929 if ( !l3_backptr(&vaddr, i, type) )
930 goto fail;
932 if ( is_guest_l4_slot(i) &&
933 unlikely(!get_page_from_l4e(pl4e[i], pfn, d, vaddr)) )
934 goto fail;
935 }
937 /* Xen private mappings. */
938 memcpy(&pl4e[ROOT_PAGETABLE_FIRST_XEN_SLOT],
939 &idle_pg_table[ROOT_PAGETABLE_FIRST_XEN_SLOT],
940 ROOT_PAGETABLE_XEN_SLOTS * sizeof(l4_pgentry_t));
941 pl4e[l4_table_offset(LINEAR_PT_VIRT_START)] =
942 l4e_from_pfn(pfn, __PAGE_HYPERVISOR);
943 pl4e[l4_table_offset(PERDOMAIN_VIRT_START)] =
944 l4e_from_page(
945 virt_to_page(page_get_owner(page)->arch.mm_perdomain_l3),
946 __PAGE_HYPERVISOR);
948 return 1;
950 fail:
951 while ( i-- > 0 )
952 if ( is_guest_l4_slot(i) )
953 put_page_from_l4e(pl4e[i], pfn);
955 return 0;
956 }
957 #else
958 #define alloc_l4_table(page, type) (0)
959 #endif
962 static void free_l1_table(struct pfn_info *page)
963 {
964 struct domain *d = page_get_owner(page);
965 unsigned long pfn = page_to_pfn(page);
966 l1_pgentry_t *pl1e;
967 int i;
969 pl1e = map_domain_page(pfn);
971 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
972 if ( is_guest_l1_slot(i) )
973 put_page_from_l1e(pl1e[i], d);
975 unmap_domain_page(pl1e);
976 }
979 static void free_l2_table(struct pfn_info *page)
980 {
981 unsigned long pfn = page_to_pfn(page);
982 l2_pgentry_t *pl2e;
983 int i;
985 pl2e = map_domain_page(pfn);
987 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
988 if ( is_guest_l2_slot(page->u.inuse.type_info, i) )
989 put_page_from_l2e(pl2e[i], pfn);
991 unmap_domain_page(pl2e);
992 }
995 #if CONFIG_PAGING_LEVELS >= 3
997 static void free_l3_table(struct pfn_info *page)
998 {
999 unsigned long pfn = page_to_pfn(page);
1000 l3_pgentry_t *pl3e;
1001 int i;
1003 pl3e = map_domain_page(pfn);
1005 for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
1006 if ( is_guest_l3_slot(i) )
1007 put_page_from_l3e(pl3e[i], pfn);
1009 unmap_domain_page(pl3e);
1010 }
1012 #endif
1014 #if CONFIG_PAGING_LEVELS >= 4
1016 static void free_l4_table(struct pfn_info *page)
1017 {
1018 unsigned long pfn = page_to_pfn(page);
1019 l4_pgentry_t *pl4e = page_to_virt(page);
1020 int i;
1022 for ( i = 0; i < L4_PAGETABLE_ENTRIES; i++ )
1023 if ( is_guest_l4_slot(i) )
1024 put_page_from_l4e(pl4e[i], pfn);
1025 }
1027 #endif
1029 static inline int update_l1e(l1_pgentry_t *pl1e,
1030 l1_pgentry_t ol1e,
1031 l1_pgentry_t nl1e)
1032 {
1033 intpte_t o = l1e_get_intpte(ol1e);
1034 intpte_t n = l1e_get_intpte(nl1e);
1036 if ( unlikely(cmpxchg_user(pl1e, o, n) != 0) ||
1037 unlikely(o != l1e_get_intpte(ol1e)) )
1038 {
1039 MEM_LOG("Failed to update %" PRIpte " -> %" PRIpte
1040 ": saw %" PRIpte,
1041 l1e_get_intpte(ol1e),
1042 l1e_get_intpte(nl1e),
1043 o);
1044 return 0;
1045 }
1046 return 1;
1047 }
1050 /* Update the L1 entry at pl1e to new value nl1e. */
1051 static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e)
1052 {
1053 l1_pgentry_t ol1e;
1054 struct domain *d = current->domain;
1056 if ( unlikely(__copy_from_user(&ol1e, pl1e, sizeof(ol1e)) != 0) )
1057 return 0;
1059 if ( unlikely(shadow_mode_refcounts(d)) )
1060 return update_l1e(pl1e, ol1e, nl1e);
1062 if ( l1e_get_flags(nl1e) & _PAGE_PRESENT )
1063 {
1064 if ( unlikely(l1e_get_flags(nl1e) & L1_DISALLOW_MASK) )
1065 {
1066 MEM_LOG("Bad L1 flags %x",
1067 l1e_get_flags(nl1e) & L1_DISALLOW_MASK);
1068 return 0;
1069 }
1071 /* Fast path for identical mapping, r/w and presence. */
1072 if ( !l1e_has_changed(ol1e, nl1e, _PAGE_RW | _PAGE_PRESENT))
1073 return update_l1e(pl1e, ol1e, nl1e);
1075 if ( unlikely(!get_page_from_l1e(nl1e, FOREIGNDOM)) )
1076 return 0;
1078 if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
1079 {
1080 put_page_from_l1e(nl1e, d);
1081 return 0;
1082 }
1083 }
1084 else
1085 {
1086 if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
1087 return 0;
1088 }
1090 put_page_from_l1e(ol1e, d);
1091 return 1;
1092 }
1094 #define UPDATE_ENTRY(_t,_p,_o,_n) ({ \
1095 intpte_t __o = cmpxchg((intpte_t *)(_p), \
1096 _t ## e_get_intpte(_o), \
1097 _t ## e_get_intpte(_n)); \
1098 if ( __o != _t ## e_get_intpte(_o) ) \
1099 MEM_LOG("Failed to update %" PRIpte " -> %" PRIpte \
1100 ": saw %" PRIpte "", \
1101 (_t ## e_get_intpte(_o)), \
1102 (_t ## e_get_intpte(_n)), \
1103 (__o)); \
1104 (__o == _t ## e_get_intpte(_o)); })
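/*
 * Usage note for the macro above: UPDATE_ENTRY(l2, pl2e, ol2e, nl2e)
 * token-pastes "_t ## e" into l2e_get_intpte() calls, so one macro serves
 * every page-table level; it evaluates to 1 iff the cmpxchg saw the expected
 * old value. update_l1e() above is the odd one out because it goes through
 * cmpxchg_user(), the L1 pointer potentially being a guest-mapped address
 * that can fault.
 */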
1106 /* Update the L2 entry at pl2e to new value nl2e. pl2e is within frame pfn. */
1107 static int mod_l2_entry(l2_pgentry_t *pl2e,
1108 l2_pgentry_t nl2e,
1109 unsigned long pfn,
1110 unsigned long type)
1111 {
1112 l2_pgentry_t ol2e;
1113 unsigned long vaddr = 0;
1115 if ( unlikely(!is_guest_l2_slot(type,pgentry_ptr_to_slot(pl2e))) )
1116 {
1117 MEM_LOG("Illegal L2 update attempt in Xen-private area %p", pl2e);
1118 return 0;
1119 }
1121 if ( unlikely(__copy_from_user(&ol2e, pl2e, sizeof(ol2e)) != 0) )
1122 return 0;
1124 if ( l2e_get_flags(nl2e) & _PAGE_PRESENT )
1125 {
1126 if ( unlikely(l2e_get_flags(nl2e) & L2_DISALLOW_MASK) )
1127 {
1128 MEM_LOG("Bad L2 flags %x",
1129 l2e_get_flags(nl2e) & L2_DISALLOW_MASK);
1130 return 0;
1131 }
1133 /* Fast path for identical mapping and presence. */
1134 if ( !l2e_has_changed(ol2e, nl2e, _PAGE_PRESENT))
1135 return UPDATE_ENTRY(l2, pl2e, ol2e, nl2e);
1137 if ( unlikely(!l1_backptr(&vaddr, pgentry_ptr_to_slot(pl2e), type)) ||
1138 unlikely(!get_page_from_l2e(nl2e, pfn, current->domain, vaddr)) )
1139 return 0;
1141 if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e)) )
1142 {
1143 put_page_from_l2e(nl2e, pfn);
1144 return 0;
1145 }
1146 }
1147 else if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e)) )
1148 {
1149 return 0;
1150 }
1152 put_page_from_l2e(ol2e, pfn);
1153 return 1;
1154 }
1157 #if CONFIG_PAGING_LEVELS >= 3
1159 /* Update the L3 entry at pl3e to new value nl3e. pl3e is within frame pfn. */
1160 static int mod_l3_entry(l3_pgentry_t *pl3e,
1161 l3_pgentry_t nl3e,
1162 unsigned long pfn,
1163 unsigned long type)
1165 l3_pgentry_t ol3e;
1166 unsigned long vaddr;
1167 int okay;
1169 if ( unlikely(!is_guest_l3_slot(pgentry_ptr_to_slot(pl3e))) )
1171 MEM_LOG("Illegal L3 update attempt in Xen-private area %p", pl3e);
1172 return 0;
1175 #ifdef CONFIG_X86_PAE
1176 /*
1177 * Disallow updates to final L3 slot. It contains Xen mappings, and it
1178 * would be a pain to ensure they remain continuously valid throughout.
1179 */
1180 if ( pgentry_ptr_to_slot(pl3e) >= 3 )
1181 return 0;
1182 #endif
1184 if ( unlikely(__copy_from_user(&ol3e, pl3e, sizeof(ol3e)) != 0) )
1185 return 0;
1187 if ( l3e_get_flags(nl3e) & _PAGE_PRESENT )
1189 if ( unlikely(l3e_get_flags(nl3e) & L3_DISALLOW_MASK) )
1191 MEM_LOG("Bad L3 flags %x",
1192 l3e_get_flags(nl3e) & L3_DISALLOW_MASK);
1193 return 0;
1196 /* Fast path for identical mapping and presence. */
1197 if (!l3e_has_changed(ol3e, nl3e, _PAGE_PRESENT))
1198 return UPDATE_ENTRY(l3, pl3e, ol3e, nl3e);
1200 #if CONFIG_PAGING_LEVELS >= 4
1201 if ( unlikely(!l2_backptr(&vaddr, pgentry_ptr_to_slot(pl3e), type)) ||
1202 unlikely(!get_page_from_l3e(nl3e, pfn, current->domain, vaddr)) )
1203 return 0;
1204 #else
1205 vaddr = (((unsigned long)pl3e & ~PAGE_MASK) / sizeof(l3_pgentry_t))
1206 << L3_PAGETABLE_SHIFT;
1207 if ( unlikely(!get_page_from_l3e(nl3e, pfn, current->domain, vaddr)) )
1208 return 0;
1209 #endif
1211 if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e)) )
1213 put_page_from_l3e(nl3e, pfn);
1214 return 0;
1217 else if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e)) )
1219 return 0;
1222 okay = create_pae_xen_mappings(pl3e);
1223 BUG_ON(!okay);
1225 put_page_from_l3e(ol3e, pfn);
1226 return 1;
1229 #endif
1231 #if CONFIG_PAGING_LEVELS >= 4
1233 /* Update the L4 entry at pl4e to new value nl4e. pl4e is within frame pfn. */
1234 static int mod_l4_entry(l4_pgentry_t *pl4e,
1235 l4_pgentry_t nl4e,
1236 unsigned long pfn,
1237 unsigned long type)
1239 l4_pgentry_t ol4e;
1240 unsigned long vaddr;
1242 if ( unlikely(!is_guest_l4_slot(pgentry_ptr_to_slot(pl4e))) )
1244 MEM_LOG("Illegal L4 update attempt in Xen-private area %p", pl4e);
1245 return 0;
1248 if ( unlikely(__copy_from_user(&ol4e, pl4e, sizeof(ol4e)) != 0) )
1249 return 0;
1251 if ( l4e_get_flags(nl4e) & _PAGE_PRESENT )
1253 if ( unlikely(l4e_get_flags(nl4e) & L4_DISALLOW_MASK) )
1255 MEM_LOG("Bad L4 flags %x",
1256 l4e_get_flags(nl4e) & L4_DISALLOW_MASK);
1257 return 0;
1260 /* Fast path for identical mapping and presence. */
1261 if (!l4e_has_changed(ol4e, nl4e, _PAGE_PRESENT))
1262 return UPDATE_ENTRY(l4, pl4e, ol4e, nl4e);
1264 if ( unlikely(!l3_backptr(&vaddr, pgentry_ptr_to_slot(pl4e), type)) ||
1265 unlikely(!get_page_from_l4e(nl4e, pfn, current->domain, vaddr)) )
1266 return 0;
1268 if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e)) )
1270 put_page_from_l4e(nl4e, pfn);
1271 return 0;
1274 else if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e)) )
1276 return 0;
1279 put_page_from_l4e(ol4e, pfn);
1280 return 1;
1283 #endif
1285 int alloc_page_type(struct pfn_info *page, unsigned long type)
1286 {
1287 switch ( type & PGT_type_mask )
1288 {
1289 case PGT_l1_page_table:
1290 return alloc_l1_table(page);
1291 case PGT_l2_page_table:
1292 return alloc_l2_table(page, type);
1293 case PGT_l3_page_table:
1294 return alloc_l3_table(page, type);
1295 case PGT_l4_page_table:
1296 return alloc_l4_table(page, type);
1297 case PGT_gdt_page:
1298 case PGT_ldt_page:
1299 return alloc_segdesc_page(page);
1300 default:
1301 printk("Bad type in alloc_page_type %lx t=%" PRtype_info " c=%x\n",
1302 type, page->u.inuse.type_info,
1303 page->count_info);
1304 BUG();
1305 }
1307 return 0;
1308 }
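/*
 * Example of how these helpers pair up elsewhere in this file: pinning an L1
 * table via MMUEXT_PIN_L1_TABLE does
 *
 *     get_page_and_type_from_pagenr(mfn, PGT_l1_page_table | PGT_va_mutable,
 *                                   FOREIGNDOM);
 *
 * whose 0 -> 1 type-count transition calls alloc_page_type() above to
 * validate every entry, while MMUEXT_UNPIN_TABLE ends in put_page_and_type();
 * the final type-count decrement clears PGT_validated and runs
 * free_page_type() below.
 */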
1311 void free_page_type(struct pfn_info *page, unsigned long type)
1313 struct domain *owner = page_get_owner(page);
1314 unsigned long gpfn;
1316 if ( owner != NULL )
1318 if ( unlikely(shadow_mode_refcounts(owner)) )
1319 return;
1320 if ( unlikely(shadow_mode_enabled(owner)) )
1322 gpfn = __mfn_to_gpfn(owner, page_to_pfn(page));
1323 ASSERT(VALID_M2P(gpfn));
1324 remove_shadow(owner, gpfn, type & PGT_type_mask);
1328 switch ( type & PGT_type_mask )
1330 case PGT_l1_page_table:
1331 free_l1_table(page);
1332 break;
1334 case PGT_l2_page_table:
1335 free_l2_table(page);
1336 break;
1338 #if CONFIG_PAGING_LEVELS >= 3
1339 case PGT_l3_page_table:
1340 free_l3_table(page);
1341 break;
1342 #endif
1344 #if CONFIG_PAGING_LEVELS >= 4
1345 case PGT_l4_page_table:
1346 free_l4_table(page);
1347 break;
1348 #endif
1350 default:
1351 printk("%s: type %lx pfn %lx\n",__FUNCTION__,
1352 type, page_to_pfn(page));
1353 BUG();
1358 void put_page_type(struct pfn_info *page)
1360 unsigned long nx, x, y = page->u.inuse.type_info;
1362 again:
1363 do {
1364 x = y;
1365 nx = x - 1;
1367 ASSERT((x & PGT_count_mask) != 0);
1369 /*
1370 * The page should always be validated while a reference is held. The
1371 * exception is during domain destruction, when we forcibly invalidate
1372 * page-table pages if we detect a referential loop.
1373 * See domain.c:relinquish_list().
1374 */
1375 ASSERT((x & PGT_validated) ||
1376 test_bit(_DOMF_dying, &page_get_owner(page)->domain_flags));
1378 if ( unlikely((nx & PGT_count_mask) == 0) )
1380 /* Record TLB information for flush later. Races are harmless. */
1381 page->tlbflush_timestamp = tlbflush_current_time();
1383 if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
1384 likely(nx & PGT_validated) )
1386 /*
1387 * Page-table pages must be unvalidated when count is zero. The
1388 * 'free' is safe because the refcnt is non-zero and validated
1389 * bit is clear => other ops will spin or fail.
1390 */
1391 if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x,
1392 x & ~PGT_validated)) != x) )
1393 goto again;
1394 /* We cleared the 'valid bit' so we do the clean up. */
1395 free_page_type(page, x);
1396 /* Carry on, but with the 'valid bit' now clear. */
1397 x &= ~PGT_validated;
1398 nx &= ~PGT_validated;
1401 else if ( unlikely(((nx & (PGT_pinned | PGT_count_mask)) ==
1402 (PGT_pinned | 1)) &&
1403 ((nx & PGT_type_mask) != PGT_writable_page)) )
1405 /* Page is now only pinned. Make the back pointer mutable again. */
1406 nx |= PGT_va_mutable;
1409 while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
1413 int get_page_type(struct pfn_info *page, unsigned long type)
1415 unsigned long nx, x, y = page->u.inuse.type_info;
1417 again:
1418 do {
1419 x = y;
1420 nx = x + 1;
1421 if ( unlikely((nx & PGT_count_mask) == 0) )
1423 MEM_LOG("Type count overflow on pfn %lx", page_to_pfn(page));
1424 return 0;
1426 else if ( unlikely((x & PGT_count_mask) == 0) )
1428 if ( (x & (PGT_type_mask|PGT_va_mask)) != type )
1430 if ( (x & PGT_type_mask) != (type & PGT_type_mask) )
1432 /*
1433 * On type change we check to flush stale TLB
1434 * entries. This may be unnecessary (e.g., page
1435 * was GDT/LDT) but those circumstances should be
1436 * very rare.
1437 */
1438 cpumask_t mask = page_get_owner(page)->cpumask;
1439 tlbflush_filter(mask, page->tlbflush_timestamp);
1441 if ( unlikely(!cpus_empty(mask)) )
1443 perfc_incrc(need_flush_tlb_flush);
1444 flush_tlb_mask(mask);
1448 /* We lose existing type, back pointer, and validity. */
1449 nx &= ~(PGT_type_mask | PGT_va_mask | PGT_validated);
1450 nx |= type;
1452 /* No special validation needed for writable pages. */
1453 /* Page tables and GDT/LDT need to be scanned for validity. */
1454 if ( type == PGT_writable_page )
1455 nx |= PGT_validated;
1458 else
1460 if ( unlikely((x & (PGT_type_mask|PGT_va_mask)) != type) )
1462 if ( unlikely((x & PGT_type_mask) != (type & PGT_type_mask) ) )
1464 if ( ((x & PGT_type_mask) != PGT_l2_page_table) ||
1465 ((type & PGT_type_mask) != PGT_l1_page_table) )
1466 MEM_LOG("Bad type (saw %" PRtype_info
1467 " != exp %" PRtype_info ") "
1468 "for mfn %lx (pfn %lx)",
1469 x, type, page_to_pfn(page),
1470 get_pfn_from_mfn(page_to_pfn(page)));
1471 return 0;
1473 else if ( (x & PGT_va_mask) == PGT_va_mutable )
1475 /* The va backpointer is mutable, hence we update it. */
1476 nx &= ~PGT_va_mask;
1477 nx |= type; /* we know the actual type is correct */
1479 else if ( ((type & PGT_va_mask) != PGT_va_mutable) &&
1480 ((type & PGT_va_mask) != (x & PGT_va_mask)) )
1482 #ifdef CONFIG_X86_PAE
1483 /* We use backptr as extra typing. Cannot be unknown. */
1484 if ( (type & PGT_type_mask) == PGT_l2_page_table )
1485 return 0;
1486 #endif
1487 /* This table is possibly mapped at multiple locations. */
1488 nx &= ~PGT_va_mask;
1489 nx |= PGT_va_unknown;
1492 if ( unlikely(!(x & PGT_validated)) )
1494 /* Someone else is updating validation of this page. Wait... */
1495 while ( (y = page->u.inuse.type_info) == x )
1496 cpu_relax();
1497 goto again;
1501 while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
1503 if ( unlikely(!(nx & PGT_validated)) )
1505 /* Try to validate page type; drop the new reference on failure. */
1506 if ( unlikely(!alloc_page_type(page, type)) )
1508 MEM_LOG("Error while validating mfn %lx (pfn %lx) for type %"
1509 PRtype_info ": caf=%08x taf=%" PRtype_info,
1510 page_to_pfn(page), get_pfn_from_mfn(page_to_pfn(page)),
1511 type, page->count_info, page->u.inuse.type_info);
1512 /* No one else can get a reference. We hold the only ref. */
1513 page->u.inuse.type_info = 0;
1514 return 0;
1517 /* No one else is updating simultaneously. */
1518 __set_bit(_PGT_validated, &page->u.inuse.type_info);
1521 return 1;
1525 int new_guest_cr3(unsigned long mfn)
1527 struct vcpu *v = current;
1528 struct domain *d = v->domain;
1529 int okay;
1530 unsigned long old_base_mfn;
1532 if ( shadow_mode_refcounts(d) )
1533 okay = get_page_from_pagenr(mfn, d);
1534 else
1535 okay = get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d);
1537 if ( likely(okay) )
1539 invalidate_shadow_ldt(v);
1541 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
1542 v->arch.guest_table = mk_pagetable(mfn << PAGE_SHIFT);
1543 update_pagetables(v); /* update shadow_table and monitor_table */
1545 write_ptbase(v);
1547 if ( shadow_mode_refcounts(d) )
1548 put_page(&frame_table[old_base_mfn]);
1549 else
1550 put_page_and_type(&frame_table[old_base_mfn]);
1552 /* CR3 also holds a ref to its shadow... */
1553 if ( shadow_mode_enabled(d) )
1555 if ( v->arch.monitor_shadow_ref )
1556 put_shadow_ref(v->arch.monitor_shadow_ref);
1557 v->arch.monitor_shadow_ref =
1558 pagetable_get_pfn(v->arch.monitor_table);
1559 ASSERT(!page_get_owner(&frame_table[v->arch.monitor_shadow_ref]));
1560 get_shadow_ref(v->arch.monitor_shadow_ref);
1563 else
1565 MEM_LOG("Error while installing new baseptr %lx", mfn);
1568 return okay;
1571 static void process_deferred_ops(unsigned int cpu)
1572 {
1573 unsigned int deferred_ops;
1574 struct domain *d = current->domain;
1576 deferred_ops = percpu_info[cpu].deferred_ops;
1577 percpu_info[cpu].deferred_ops = 0;
1579 if ( deferred_ops & DOP_FLUSH_TLB )
1580 {
1581 if ( shadow_mode_enabled(d) )
1582 shadow_sync_all(d);
1583 local_flush_tlb();
1584 }
1586 if ( deferred_ops & DOP_RELOAD_LDT )
1587 (void)map_ldt_shadow_page(0);
1589 if ( unlikely(percpu_info[cpu].foreign != NULL) )
1590 {
1591 put_domain(percpu_info[cpu].foreign);
1592 percpu_info[cpu].foreign = NULL;
1593 }
1594 }
1596 static int set_foreigndom(unsigned int cpu, domid_t domid)
1598 struct domain *e, *d = current->domain;
1599 int okay = 1;
1601 if ( (e = percpu_info[cpu].foreign) != NULL )
1602 put_domain(e);
1603 percpu_info[cpu].foreign = NULL;
1605 if ( domid == DOMID_SELF )
1606 goto out;
1608 if ( !IS_PRIV(d) )
1610 switch ( domid )
1612 case DOMID_IO:
1613 get_knownalive_domain(dom_io);
1614 percpu_info[cpu].foreign = dom_io;
1615 break;
1616 default:
1617 MEM_LOG("Dom %u cannot set foreign dom", d->domain_id);
1618 okay = 0;
1619 break;
1622 else
1624 percpu_info[cpu].foreign = e = find_domain_by_id(domid);
1625 if ( e == NULL )
1627 switch ( domid )
1629 case DOMID_XEN:
1630 get_knownalive_domain(dom_xen);
1631 percpu_info[cpu].foreign = dom_xen;
1632 break;
1633 case DOMID_IO:
1634 get_knownalive_domain(dom_io);
1635 percpu_info[cpu].foreign = dom_io;
1636 break;
1637 default:
1638 MEM_LOG("Unknown domain '%u'", domid);
1639 okay = 0;
1640 break;
1645 out:
1646 return okay;
1649 static inline cpumask_t vcpumask_to_pcpumask(
1650 struct domain *d, unsigned long vmask)
1651 {
1652 unsigned int vcpu_id;
1653 cpumask_t pmask;
1654 struct vcpu *v;
1656 while ( vmask != 0 )
1657 {
1658 vcpu_id = find_first_set_bit(vmask);
1659 vmask &= ~(1UL << vcpu_id);
1660 if ( (vcpu_id < MAX_VIRT_CPUS) &&
1661 ((v = d->vcpu[vcpu_id]) != NULL) )
1662 cpu_set(v->processor, pmask);
1663 }
1665 return pmask;
1666 }
1668 int do_mmuext_op(
1669 struct mmuext_op *uops,
1670 unsigned int count,
1671 unsigned int *pdone,
1672 unsigned int foreigndom)
1674 struct mmuext_op op;
1675 int rc = 0, i = 0, okay, cpu = smp_processor_id();
1676 unsigned long mfn, type, done = 0;
1677 struct pfn_info *page;
1678 struct vcpu *v = current;
1679 struct domain *d = v->domain, *e;
1680 u32 x, y, _d, _nd;
1682 LOCK_BIGLOCK(d);
1684 cleanup_writable_pagetable(d);
1686 if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
1688 count &= ~MMU_UPDATE_PREEMPTED;
1689 if ( unlikely(pdone != NULL) )
1690 (void)get_user(done, pdone);
1693 if ( !set_foreigndom(cpu, foreigndom) )
1695 rc = -EINVAL;
1696 goto out;
1699 if ( unlikely(!array_access_ok(uops, count, sizeof(op))) )
1701 rc = -EFAULT;
1702 goto out;
1705 for ( i = 0; i < count; i++ )
1707 if ( hypercall_preempt_check() )
1709 rc = hypercall4_create_continuation(
1710 __HYPERVISOR_mmuext_op, uops,
1711 (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
1712 break;
1715 if ( unlikely(__copy_from_user(&op, uops, sizeof(op)) != 0) )
1717 MEM_LOG("Bad __copy_from_user");
1718 rc = -EFAULT;
1719 break;
1722 okay = 1;
1723 mfn = op.arg1.mfn;
1724 page = &frame_table[mfn];
1726 switch ( op.cmd )
1728 case MMUEXT_PIN_L1_TABLE:
1729 type = PGT_l1_page_table | PGT_va_mutable;
1731 pin_page:
1732 if ( shadow_mode_refcounts(FOREIGNDOM) )
1733 type = PGT_writable_page;
1735 okay = get_page_and_type_from_pagenr(mfn, type, FOREIGNDOM);
1736 if ( unlikely(!okay) )
1738 MEM_LOG("Error while pinning mfn %lx", mfn);
1739 break;
1742 if ( unlikely(test_and_set_bit(_PGT_pinned,
1743 &page->u.inuse.type_info)) )
1745 MEM_LOG("Mfn %lx already pinned", mfn);
1746 put_page_and_type(page);
1747 okay = 0;
1748 break;
1751 break;
1753 #ifndef CONFIG_X86_PAE /* Unsafe on PAE because of Xen-private mappings. */
1754 case MMUEXT_PIN_L2_TABLE:
1755 type = PGT_l2_page_table | PGT_va_mutable;
1756 goto pin_page;
1757 #endif
1759 case MMUEXT_PIN_L3_TABLE:
1760 type = PGT_l3_page_table | PGT_va_mutable;
1761 goto pin_page;
1763 case MMUEXT_PIN_L4_TABLE:
1764 type = PGT_l4_page_table | PGT_va_mutable;
1765 goto pin_page;
1767 case MMUEXT_UNPIN_TABLE:
1768 if ( unlikely(!(okay = get_page_from_pagenr(mfn, d))) )
1770 MEM_LOG("Mfn %lx bad domain (dom=%p)",
1771 mfn, page_get_owner(page));
1773 else if ( likely(test_and_clear_bit(_PGT_pinned,
1774 &page->u.inuse.type_info)) )
1776 put_page_and_type(page);
1777 put_page(page);
1779 else
1781 okay = 0;
1782 put_page(page);
1783 MEM_LOG("Mfn %lx not pinned", mfn);
1785 break;
1787 case MMUEXT_NEW_BASEPTR:
1788 okay = new_guest_cr3(mfn);
1789 percpu_info[cpu].deferred_ops &= ~DOP_FLUSH_TLB;
1790 break;
1792 #ifdef __x86_64__
1793 case MMUEXT_NEW_USER_BASEPTR:
1794 okay = get_page_and_type_from_pagenr(
1795 mfn, PGT_root_page_table, d);
1796 if ( unlikely(!okay) )
1798 MEM_LOG("Error while installing new mfn %lx", mfn);
1800 else
1802 unsigned long old_mfn =
1803 pagetable_get_pfn(v->arch.guest_table_user);
1804 v->arch.guest_table_user = mk_pagetable(mfn << PAGE_SHIFT);
1805 if ( old_mfn != 0 )
1806 put_page_and_type(&frame_table[old_mfn]);
1808 break;
1809 #endif
1811 case MMUEXT_TLB_FLUSH_LOCAL:
1812 percpu_info[cpu].deferred_ops |= DOP_FLUSH_TLB;
1813 break;
1815 case MMUEXT_INVLPG_LOCAL:
1816 if ( shadow_mode_enabled(d) )
1817 shadow_invlpg(v, op.arg1.linear_addr);
1818 local_flush_tlb_one(op.arg1.linear_addr);
1819 break;
1821 case MMUEXT_TLB_FLUSH_MULTI:
1822 case MMUEXT_INVLPG_MULTI:
1824 unsigned long vmask;
1825 cpumask_t pmask;
1826 if ( unlikely(get_user(vmask, (unsigned long *)op.arg2.vcpumask)) )
1828 okay = 0;
1829 break;
1831 pmask = vcpumask_to_pcpumask(d, vmask);
1832 cpus_and(pmask, pmask, d->cpumask);
1833 if ( op.cmd == MMUEXT_TLB_FLUSH_MULTI )
1834 flush_tlb_mask(pmask);
1835 else
1836 flush_tlb_one_mask(pmask, op.arg1.linear_addr);
1837 break;
1840 case MMUEXT_TLB_FLUSH_ALL:
1841 flush_tlb_mask(d->cpumask);
1842 break;
1844 case MMUEXT_INVLPG_ALL:
1845 flush_tlb_one_mask(d->cpumask, op.arg1.linear_addr);
1846 break;
1848 case MMUEXT_FLUSH_CACHE:
1849 if ( unlikely(!IS_CAPABLE_PHYSDEV(d)) )
1851 MEM_LOG("Non-physdev domain tried to FLUSH_CACHE.");
1852 okay = 0;
1854 else
1856 wbinvd();
1858 break;
1860 case MMUEXT_SET_LDT:
1862 if ( shadow_mode_external(d) )
1864 MEM_LOG("ignoring SET_LDT hypercall from external "
1865 "domain %u", d->domain_id);
1866 okay = 0;
1867 break;
1870 unsigned long ptr = op.arg1.linear_addr;
1871 unsigned long ents = op.arg2.nr_ents;
1872 if ( ((ptr & (PAGE_SIZE-1)) != 0) ||
1873 (ents > 8192) ||
1874 !array_access_ok(ptr, ents, LDT_ENTRY_SIZE) )
1876 okay = 0;
1877 MEM_LOG("Bad args to SET_LDT: ptr=%lx, ents=%lx", ptr, ents);
1879 else if ( (v->arch.guest_context.ldt_ents != ents) ||
1880 (v->arch.guest_context.ldt_base != ptr) )
1882 invalidate_shadow_ldt(v);
1883 v->arch.guest_context.ldt_base = ptr;
1884 v->arch.guest_context.ldt_ents = ents;
1885 load_LDT(v);
1886 percpu_info[cpu].deferred_ops &= ~DOP_RELOAD_LDT;
1887 if ( ents != 0 )
1888 percpu_info[cpu].deferred_ops |= DOP_RELOAD_LDT;
1890 break;
1893 case MMUEXT_REASSIGN_PAGE:
1894 if ( unlikely(!IS_PRIV(d)) )
1896 MEM_LOG("Dom %u has no reassignment priv", d->domain_id);
1897 okay = 0;
1898 break;
1901 e = percpu_info[cpu].foreign;
1902 if ( unlikely(e == NULL) )
1904 MEM_LOG("No FOREIGNDOM to reassign mfn %lx to", mfn);
1905 okay = 0;
1906 break;
1909 /*
1910 * Grab both page_list locks, in order. This prevents the page from
1911 * disappearing elsewhere while we modify the owner, and we'll need
1912 * both locks if we're successful so that we can change lists.
1913 */
1914 if ( d < e )
1916 spin_lock(&d->page_alloc_lock);
1917 spin_lock(&e->page_alloc_lock);
1919 else
1921 spin_lock(&e->page_alloc_lock);
1922 spin_lock(&d->page_alloc_lock);
1925 /*
1926 * Check that 'e' will accept the page and has reservation
1927 * headroom. Also, a domain mustn't have PGC_allocated pages when
1928 * it is dying.
1929 */
1930 ASSERT(e->tot_pages <= e->max_pages);
1931 if ( unlikely(test_bit(_DOMF_dying, &e->domain_flags)) ||
1932 unlikely(e->tot_pages == e->max_pages) ||
1933 unlikely(IS_XEN_HEAP_FRAME(page)) )
1935 MEM_LOG("Transferee has no reservation headroom (%d,%d), or "
1936 "page is in Xen heap (%lx), or dom is dying (%ld).",
1937 e->tot_pages, e->max_pages, mfn, e->domain_flags);
1938 okay = 0;
1939 goto reassign_fail;
1942 /*
1943 * The tricky bit: atomically change owner while there is just one
1944 * benign reference to the page (PGC_allocated). If that reference
1945 * disappears then the deallocation routine will safely spin.
1946 */
1947 _d = pickle_domptr(d);
1948 _nd = page->u.inuse._domain;
1949 y = page->count_info;
1950 do {
1951 x = y;
1952 if ( unlikely((x & (PGC_count_mask|PGC_allocated)) !=
1953 (1|PGC_allocated)) ||
1954 unlikely(_nd != _d) )
1956 MEM_LOG("Bad page values %lx: ed=%p(%u), sd=%p,"
1957 " caf=%08x, taf=%" PRtype_info,
1958 page_to_pfn(page), d, d->domain_id,
1959 unpickle_domptr(_nd), x, page->u.inuse.type_info);
1960 okay = 0;
1961 goto reassign_fail;
1963 __asm__ __volatile__(
1964 LOCK_PREFIX "cmpxchg8b %3"
1965 : "=d" (_nd), "=a" (y), "=c" (e),
1966 "=m" (*(volatile u64 *)(&page->count_info))
1967 : "0" (_d), "1" (x), "c" (e), "b" (x) );
1969 while ( unlikely(_nd != _d) || unlikely(y != x) );
1971 /*
1972 * Unlink from 'd'. We transferred at least one reference to 'e',
1973 * so no one else is spinning to try to delete this page from 'd'.
1974 */
1975 d->tot_pages--;
1976 list_del(&page->list);
1978 /*
1979 * Add the page to 'e'. Someone may already have removed the last
1980 * reference and want to remove the page from 'e'. However, we have
1981 * the lock so they'll spin waiting for us.
1982 */
1983 if ( unlikely(e->tot_pages++ == 0) )
1984 get_knownalive_domain(e);
1985 list_add_tail(&page->list, &e->page_list);
1987 reassign_fail:
1988 spin_unlock(&d->page_alloc_lock);
1989 spin_unlock(&e->page_alloc_lock);
1990 break;
1992 default:
1993 MEM_LOG("Invalid extended pt command 0x%x", op.cmd);
1994 okay = 0;
1995 break;
1998 if ( unlikely(!okay) )
2000 rc = -EINVAL;
2001 break;
2004 uops++;
2007 out:
2008 process_deferred_ops(cpu);
2010 /* Add incremental work we have done to the @done output parameter. */
2011 if ( unlikely(pdone != NULL) )
2012 __put_user(done + i, pdone);
2014 UNLOCK_BIGLOCK(d);
2015 return rc;
2018 int do_mmu_update(
2019 mmu_update_t *ureqs,
2020 unsigned int count,
2021 unsigned int *pdone,
2022 unsigned int foreigndom)
2024 mmu_update_t req;
2025 void *va;
2026 unsigned long gpfn, mfn;
2027 struct pfn_info *page;
2028 int rc = 0, okay = 1, i = 0, cpu = smp_processor_id();
2029 unsigned int cmd, done = 0;
2030 struct vcpu *v = current;
2031 struct domain *d = v->domain;
2032 unsigned long type_info;
2033 struct domain_mmap_cache mapcache, sh_mapcache;
2035 LOCK_BIGLOCK(d);
2037 cleanup_writable_pagetable(d);
2039 if ( unlikely(shadow_mode_enabled(d)) )
2040 check_pagetable(v, "pre-mmu"); /* debug */
2042 if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
2044 count &= ~MMU_UPDATE_PREEMPTED;
2045 if ( unlikely(pdone != NULL) )
2046 (void)get_user(done, pdone);
2049 domain_mmap_cache_init(&mapcache);
2050 domain_mmap_cache_init(&sh_mapcache);
2052 if ( !set_foreigndom(cpu, foreigndom) )
2054 rc = -EINVAL;
2055 goto out;
2058 perfc_incrc(calls_to_mmu_update);
2059 perfc_addc(num_page_updates, count);
2060 perfc_incr_histo(bpt_updates, count, PT_UPDATES);
2062 if ( unlikely(!array_access_ok(ureqs, count, sizeof(req))) )
2064 rc = -EFAULT;
2065 goto out;
2068 for ( i = 0; i < count; i++ )
2070 if ( hypercall_preempt_check() )
2072 rc = hypercall4_create_continuation(
2073 __HYPERVISOR_mmu_update, ureqs,
2074 (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
2075 break;
2078 if ( unlikely(__copy_from_user(&req, ureqs, sizeof(req)) != 0) )
2080 MEM_LOG("Bad __copy_from_user");
2081 rc = -EFAULT;
2082 break;
2085 cmd = req.ptr & (sizeof(l1_pgentry_t)-1);
2086 okay = 0;
2088 switch ( cmd )
2090 /*
2091 * MMU_NORMAL_PT_UPDATE: Normal update to any level of page table.
2092 */
2093 case MMU_NORMAL_PT_UPDATE:
2095 gpfn = req.ptr >> PAGE_SHIFT;
2096 mfn = __gpfn_to_mfn(d, gpfn);
2098 if ( unlikely(!get_page_from_pagenr(mfn, current->domain)) )
2100 MEM_LOG("Could not get page for normal update");
2101 break;
2104 va = map_domain_page_with_cache(mfn, &mapcache);
2105 va = (void *)((unsigned long)va +
2106 (unsigned long)(req.ptr & ~PAGE_MASK));
2107 page = &frame_table[mfn];
2109 switch ( (type_info = page->u.inuse.type_info) & PGT_type_mask )
2111 case PGT_l1_page_table:
2112 ASSERT( !shadow_mode_refcounts(d) );
2113 if ( likely(get_page_type(
2114 page, type_info & (PGT_type_mask|PGT_va_mask))) )
2116 l1_pgentry_t l1e;
2118 /* FIXME: doesn't work with PAE */
2119 l1e = l1e_from_intpte(req.val);
2120 okay = mod_l1_entry(va, l1e);
2121 if ( okay && unlikely(shadow_mode_enabled(d)) )
2122 shadow_l1_normal_pt_update(
2123 d, req.ptr, l1e, &sh_mapcache);
2124 put_page_type(page);
2126 break;
2127 case PGT_l2_page_table:
2128 ASSERT( !shadow_mode_refcounts(d) );
2129 if ( likely(get_page_type(
2130 page, type_info & (PGT_type_mask|PGT_va_mask))) )
2132 l2_pgentry_t l2e;
2134 /* FIXME: doesn't work with PAE */
2135 l2e = l2e_from_intpte(req.val);
2136 okay = mod_l2_entry(
2137 (l2_pgentry_t *)va, l2e, mfn, type_info);
2138 if ( okay && unlikely(shadow_mode_enabled(d)) )
2139 shadow_l2_normal_pt_update(
2140 d, req.ptr, l2e, &sh_mapcache);
2141 put_page_type(page);
2143 break;
2144 #if CONFIG_PAGING_LEVELS >= 3
2145 case PGT_l3_page_table:
2146 ASSERT( !shadow_mode_refcounts(d) );
2147 if ( likely(get_page_type(
2148 page, type_info & (PGT_type_mask|PGT_va_mask))) )
2150 l3_pgentry_t l3e;
2152 /* FIXME: doesn't work with PAE */
2153 l3e = l3e_from_intpte(req.val);
2154 okay = mod_l3_entry(va, l3e, mfn, type_info);
2155 if ( okay && unlikely(shadow_mode_enabled(d)) )
2156 shadow_l3_normal_pt_update(
2157 d, req.ptr, l3e, &sh_mapcache);
2158 put_page_type(page);
2160 break;
2161 #endif
2162 #if CONFIG_PAGING_LEVELS >= 4
2163 case PGT_l4_page_table:
2164 ASSERT( !shadow_mode_refcounts(d) );
2165 if ( likely(get_page_type(
2166 page, type_info & (PGT_type_mask|PGT_va_mask))) )
2168 l4_pgentry_t l4e;
2170 l4e = l4e_from_intpte(req.val);
2171 okay = mod_l4_entry(va, l4e, mfn, type_info);
2172 if ( okay && unlikely(shadow_mode_enabled(d)) )
2173 shadow_l4_normal_pt_update(
2174 d, req.ptr, l4e, &sh_mapcache);
2175 put_page_type(page);
2177 break;
2178 #endif
2179 default:
2180 if ( likely(get_page_type(page, PGT_writable_page)) )
2182 if ( shadow_mode_enabled(d) )
2184 shadow_lock(d);
2186 if ( shadow_mode_log_dirty(d) )
2187 __mark_dirty(d, mfn);
2189 if ( page_is_page_table(page) &&
2190 !page_out_of_sync(page) )
2192 shadow_mark_mfn_out_of_sync(v, gpfn, mfn);
2196 *(intpte_t *)va = req.val;
2197 okay = 1;
2199 if ( shadow_mode_enabled(d) )
2200 shadow_unlock(d);
2202 put_page_type(page);
2204 break;
2207 unmap_domain_page_with_cache(va, &mapcache);
2209 put_page(page);
2210 break;
2212 case MMU_MACHPHYS_UPDATE:
2214 mfn = req.ptr >> PAGE_SHIFT;
2215 gpfn = req.val;
2217 /* HACK ALERT... Need to think about this some more... */
2218 if ( unlikely(shadow_mode_translate(FOREIGNDOM) && IS_PRIV(d)) )
2220 shadow_lock(FOREIGNDOM);
2221 printk("privileged guest dom%d requests pfn=%lx to "
2222 "map mfn=%lx for dom%d\n",
2223 d->domain_id, gpfn, mfn, FOREIGNDOM->domain_id);
2224 set_pfn_from_mfn(mfn, gpfn);
2225 set_p2m_entry(FOREIGNDOM, gpfn, mfn, &sh_mapcache, &mapcache);
2226 okay = 1;
2227 shadow_unlock(FOREIGNDOM);
2228 break;
2231 if ( unlikely(!get_page_from_pagenr(mfn, FOREIGNDOM)) )
2233 MEM_LOG("Could not get page for mach->phys update");
2234 break;
2237 if ( unlikely(shadow_mode_translate(FOREIGNDOM) && !IS_PRIV(d)) )
2239 MEM_LOG("can't mutate the m2p of translated guests");
2240 break;
2243 set_pfn_from_mfn(mfn, gpfn);
2244 okay = 1;
2246 /*
2247 * If in log-dirty mode, mark the corresponding
2248 * page as dirty.
2249 */
2250 if ( unlikely(shadow_mode_log_dirty(FOREIGNDOM)) &&
2251 mark_dirty(FOREIGNDOM, mfn) )
2252 FOREIGNDOM->arch.shadow_dirty_block_count++;
2254 put_page(&frame_table[mfn]);
2255 break;
2257 default:
2258 MEM_LOG("Invalid page update command %x", cmd);
2259 break;
2262 if ( unlikely(!okay) )
2264 rc = -EINVAL;
2265 break;
2268 ureqs++;
2271 out:
2272 domain_mmap_cache_destroy(&mapcache);
2273 domain_mmap_cache_destroy(&sh_mapcache);
2275 process_deferred_ops(cpu);
2277 /* Add incremental work we have done to the @done output parameter. */
2278 if ( unlikely(pdone != NULL) )
2279 __put_user(done + i, pdone);
2281 if ( unlikely(shadow_mode_enabled(d)) )
2282 check_pagetable(v, "post-mmu"); /* debug */
2284 UNLOCK_BIGLOCK(d);
2285 return rc;
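/*
 * Illustrative sketch (not compiled): how a paravirtualised guest might
 * batch page-table writes through do_mmu_update(). Assumes the usual
 * guest-side hypercall wrapper HYPERVISOR_mmu_update() and the public
 * mmu_update_t layout; the low bits of 'ptr' select the command
 * (MMU_NORMAL_PT_UPDATE here).
 */
#if 0
static int set_one_pte(unsigned long pte_machine_addr, unsigned long mfn)
{
    struct mmu_update req;
    unsigned int done = 0;

    /* Low bits of ptr carry the command code. */
    req.ptr = (uint64_t)pte_machine_addr | MMU_NORMAL_PT_UPDATE;
    /* New PTE value: frame number plus present/writable flags. */
    req.val = ((uint64_t)mfn << PAGE_SHIFT) | _PAGE_PRESENT | _PAGE_RW;

    /* One request against our own page tables (DOMID_SELF). */
    return HYPERVISOR_mmu_update(&req, 1, &done, DOMID_SELF);
}
#endif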
2289 int update_grant_pte_mapping(
2290 unsigned long pte_addr, l1_pgentry_t _nl1e, struct vcpu *v)
2292 int rc = GNTST_okay;
2293 void *va;
2294 unsigned long gpfn, mfn;
2295 struct pfn_info *page;
2296 u32 type_info;
2297 l1_pgentry_t ol1e;
2298 struct domain *d = v->domain;
2300 ASSERT(spin_is_locked(&d->big_lock));
2301 ASSERT(!shadow_mode_refcounts(d));
2302 ASSERT((l1e_get_flags(_nl1e) & L1_DISALLOW_MASK) == 0);
2304 gpfn = pte_addr >> PAGE_SHIFT;
2305 mfn = __gpfn_to_mfn(d, gpfn);
2307 if ( unlikely(!get_page_from_pagenr(mfn, current->domain)) )
2309 MEM_LOG("Could not get page for normal update");
2310 return GNTST_general_error;
2313 va = map_domain_page(mfn);
2314 va = (void *)((unsigned long)va + (pte_addr & ~PAGE_MASK));
2315 page = pfn_to_page(mfn);
2317 type_info = page->u.inuse.type_info;
2318 if ( ((type_info & PGT_type_mask) != PGT_l1_page_table) ||
2319 !get_page_type(page, type_info & (PGT_type_mask|PGT_va_mask)) )
2321 MEM_LOG("Grant map attempted to update a non-L1 page");
2322 rc = GNTST_general_error;
2323 goto failed;
2326 if ( __copy_from_user(&ol1e, (l1_pgentry_t *)va, sizeof(ol1e)) ||
2327 !update_l1e(va, ol1e, _nl1e) )
2329 put_page_type(page);
2330 rc = GNTST_general_error;
2331 goto failed;
2334 put_page_from_l1e(ol1e, d);
2336 if ( unlikely(shadow_mode_enabled(d)) )
2338 struct domain_mmap_cache sh_mapcache;
2339 domain_mmap_cache_init(&sh_mapcache);
2340 shadow_l1_normal_pt_update(d, pte_addr, _nl1e, &sh_mapcache);
2341 domain_mmap_cache_destroy(&sh_mapcache);
2344 put_page_type(page);
2346 failed:
2347 unmap_domain_page(va);
2348 put_page(page);
2349 return rc;
2352 int clear_grant_pte_mapping(
2353 unsigned long addr, unsigned long frame, struct domain *d)
2355 int rc = GNTST_okay;
2356 void *va;
2357 unsigned long gpfn, mfn;
2358 struct pfn_info *page;
2359 u32 type_info;
2360 l1_pgentry_t ol1e;
2362 ASSERT(!shadow_mode_refcounts(d));
2364 gpfn = addr >> PAGE_SHIFT;
2365 mfn = __gpfn_to_mfn(d, gpfn);
2367 if ( unlikely(!get_page_from_pagenr(mfn, current->domain)) )
2369 MEM_LOG("Could not get page for normal update");
2370 return GNTST_general_error;
2373 va = map_domain_page(mfn);
2374 va = (void *)((unsigned long)va + (addr & ~PAGE_MASK));
2375 page = pfn_to_page(mfn);
2377 type_info = page->u.inuse.type_info;
2378 if ( ((type_info & PGT_type_mask) != PGT_l1_page_table) ||
2379 !get_page_type(page, type_info & (PGT_type_mask|PGT_va_mask)) )
2381 MEM_LOG("Grant map attempted to update a non-L1 page");
2382 rc = GNTST_general_error;
2383 goto failed;
2386 if ( __copy_from_user(&ol1e, (l1_pgentry_t *)va, sizeof(ol1e)) )
2388 put_page_type(page);
2389 rc = GNTST_general_error;
2390 goto failed;
2393 /* Check that the virtual address supplied is actually mapped to frame. */
2394 if ( unlikely((l1e_get_intpte(ol1e) >> PAGE_SHIFT) != frame) )
2396 MEM_LOG("PTE entry %lx for address %lx doesn't match frame %lx",
2397 (unsigned long)l1e_get_intpte(ol1e), addr, frame);
2398 put_page_type(page);
2399 rc = GNTST_general_error;
2400 goto failed;
2403 /* Delete pagetable entry. */
2404 if ( unlikely(__put_user(0, (intpte_t *)va)))
2406 MEM_LOG("Cannot delete PTE entry at %p", va);
2407 put_page_type(page);
2408 rc = GNTST_general_error;
2409 goto failed;
2412 if ( unlikely(shadow_mode_enabled(d)) )
2414 struct domain_mmap_cache sh_mapcache;
2415 domain_mmap_cache_init(&sh_mapcache);
2416 shadow_l1_normal_pt_update(d, addr, l1e_empty(), &sh_mapcache);
2417 domain_mmap_cache_destroy(&sh_mapcache);
2420 put_page_type(page);
2422 failed:
2423 unmap_domain_page(va);
2424 put_page(page);
2425 return rc;
2429 int update_grant_va_mapping(
2430 unsigned long va, l1_pgentry_t _nl1e, struct vcpu *v)
2432 l1_pgentry_t *pl1e, ol1e;
2433 struct domain *d = v->domain;
2435 ASSERT(spin_is_locked(&d->big_lock));
2436 ASSERT(!shadow_mode_refcounts(d));
2437 ASSERT((l1e_get_flags(_nl1e) & L1_DISALLOW_MASK) == 0);
2439 /*
2440 * This is actually overkill - we don't need to sync the L1 itself,
2441 * just everything involved in getting to this L1 (i.e. we need
2442 * linear_pg_table[l1_linear_offset(va)] to be in sync)...
2443 */
2444 __shadow_sync_va(v, va);
2446 pl1e = &linear_pg_table[l1_linear_offset(va)];
2448 if ( unlikely(__copy_from_user(&ol1e, pl1e, sizeof(ol1e)) != 0) ||
2449 !update_l1e(pl1e, ol1e, _nl1e) )
2450 return GNTST_general_error;
2452 put_page_from_l1e(ol1e, d);
2454 if ( unlikely(shadow_mode_enabled(d)) )
2455 shadow_do_update_va_mapping(va, _nl1e, v);
2457 return GNTST_okay;
2460 int clear_grant_va_mapping(unsigned long addr, unsigned long frame)
2462 l1_pgentry_t *pl1e, ol1e;
2464 pl1e = &linear_pg_table[l1_linear_offset(addr)];
2466 if ( unlikely(__get_user(ol1e.l1, &pl1e->l1) != 0) )
2468 MEM_LOG("Could not find PTE entry for address %lx", addr);
2469 return GNTST_general_error;
2472 /*
2473 * Check that the virtual address supplied is actually mapped to
2474 * frame.
2475 */
2476 if ( unlikely(l1e_get_pfn(ol1e) != frame) )
2478 MEM_LOG("PTE entry %lx for address %lx doesn't match frame %lx",
2479 l1e_get_pfn(ol1e), addr, frame);
2480 return GNTST_general_error;
2483 /* Delete pagetable entry. */
2484 if ( unlikely(__put_user(0, &pl1e->l1)) )
2486 MEM_LOG("Cannot delete PTE entry at %p", (unsigned long *)pl1e);
2487 return GNTST_general_error;
2490 return 0;
2494 int do_update_va_mapping(unsigned long va, u64 val64,
2495 unsigned long flags)
2497 l1_pgentry_t val = l1e_from_intpte(val64);
2498 struct vcpu *v = current;
2499 struct domain *d = v->domain;
2500 unsigned int cpu = v->processor;
2501 unsigned long vmask, bmap_ptr;
2502 cpumask_t pmask;
2503 int rc = 0;
2505 perfc_incrc(calls_to_update_va);
2507 if ( unlikely(!__addr_ok(va) && !shadow_mode_external(d)) )
2508 return -EINVAL;
2510 LOCK_BIGLOCK(d);
2512 cleanup_writable_pagetable(d);
2514 if ( unlikely(shadow_mode_enabled(d)) )
2515 check_pagetable(v, "pre-va"); /* debug */
2517 if ( unlikely(!mod_l1_entry(&linear_pg_table[l1_linear_offset(va)],
2518 val)) )
2519 rc = -EINVAL;
2521 if ( likely(rc == 0) && unlikely(shadow_mode_enabled(d)) )
2523 if ( unlikely(percpu_info[cpu].foreign &&
2524 (shadow_mode_translate(d) ||
2525 shadow_mode_translate(percpu_info[cpu].foreign))) )
2527 /*
2528 * The foreign domain's pfns are in a different namespace. There's
2529 * not enough information in just a gpte to figure out how to
2530 * (re-)shadow this entry.
2531 */
2532 domain_crash(d);
2535 rc = shadow_do_update_va_mapping(va, val, v);
2537 check_pagetable(v, "post-va"); /* debug */
2540 switch ( flags & UVMF_FLUSHTYPE_MASK )
2542 case UVMF_TLB_FLUSH:
2543 switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
2545 case UVMF_LOCAL:
2546 if ( unlikely(shadow_mode_enabled(d)) )
2547 shadow_sync_all(d);
2548 local_flush_tlb();
2549 break;
2550 case UVMF_ALL:
2551 flush_tlb_mask(d->cpumask);
2552 break;
2553 default:
2554 if ( unlikely(get_user(vmask, (unsigned long *)bmap_ptr)) )
2555 rc = -EFAULT;
2556 pmask = vcpumask_to_pcpumask(d, vmask);
2557 cpus_and(pmask, pmask, d->cpumask);
2558 flush_tlb_mask(pmask);
2559 break;
2561 break;
2563 case UVMF_INVLPG:
2564 switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
2566 case UVMF_LOCAL:
2567 if ( unlikely(shadow_mode_enabled(d)) )
2568 shadow_invlpg(current, va);
2569 local_flush_tlb_one(va);
2570 break;
2571 case UVMF_ALL:
2572 flush_tlb_one_mask(d->cpumask, va);
2573 break;
2574 default:
2575 if ( unlikely(get_user(vmask, (unsigned long *)bmap_ptr)) )
2576 rc = -EFAULT;
2577 pmask = vcpumask_to_pcpumask(d, vmask);
2578 cpus_and(pmask, pmask, d->cpumask);
2579 flush_tlb_one_mask(pmask, va);
2580 break;
2582 break;
2585 process_deferred_ops(cpu);
2587 UNLOCK_BIGLOCK(d);
2589 return rc;
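/*
 * Illustrative sketch (not compiled): a guest-side call into the handler
 * above. The flags word combines a flush type (UVMF_TLB_FLUSH or
 * UVMF_INVLPG) with a scope (UVMF_LOCAL, UVMF_ALL, or a vcpu-bitmap
 * pointer). Assumes the usual guest wrapper HYPERVISOR_update_va_mapping().
 */
#if 0
static int remap_readonly(unsigned long va, unsigned long mfn)
{
    /* Map 'va' to 'mfn' read-only, then invalidate just that VA locally. */
    uint64_t val = ((uint64_t)mfn << PAGE_SHIFT) | _PAGE_PRESENT;
    return HYPERVISOR_update_va_mapping(va, val, UVMF_INVLPG | UVMF_LOCAL);
}
#endif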
2592 int do_update_va_mapping_otherdomain(unsigned long va, u64 val64,
2593 unsigned long flags,
2594 domid_t domid)
2596 unsigned int cpu = smp_processor_id();
2597 struct domain *d;
2598 int rc;
2600 if ( unlikely(!IS_PRIV(current->domain)) )
2601 return -EPERM;
2603 percpu_info[cpu].foreign = d = find_domain_by_id(domid);
2604 if ( unlikely(d == NULL) )
2606 MEM_LOG("Unknown domain '%u'", domid);
2607 return -ESRCH;
2610 rc = do_update_va_mapping(va, val64, flags);
2612 return rc;
2617 /*************************
2618 * Descriptor Tables
2619 */
2621 void destroy_gdt(struct vcpu *v)
2623 int i;
2624 unsigned long pfn;
2626 v->arch.guest_context.gdt_ents = 0;
2627 for ( i = 0; i < FIRST_RESERVED_GDT_PAGE; i++ )
2629 if ( (pfn = l1e_get_pfn(v->arch.perdomain_ptes[i])) != 0 )
2630 put_page_and_type(&frame_table[pfn]);
2631 v->arch.perdomain_ptes[i] = l1e_empty();
2632 v->arch.guest_context.gdt_frames[i] = 0;
2637 long set_gdt(struct vcpu *v,
2638 unsigned long *frames,
2639 unsigned int entries)
2641 struct domain *d = v->domain;
2642 /* NB. There are 512 8-byte entries per GDT page. */
2643 int i, nr_pages = (entries + 511) / 512;
2644 unsigned long pfn;
2646 if ( entries > FIRST_RESERVED_GDT_ENTRY )
2647 return -EINVAL;
2649 shadow_sync_all(d);
2651 /* Check the pages in the new GDT. */
2652 for ( i = 0; i < nr_pages; i++ ) {
2653 pfn = frames[i];
2654 if ((pfn >= max_page) ||
2655 !get_page_and_type(&frame_table[pfn], d, PGT_gdt_page) )
2656 goto fail;
2659 /* Tear down the old GDT. */
2660 destroy_gdt(v);
2662 /* Install the new GDT. */
2663 v->arch.guest_context.gdt_ents = entries;
2664 for ( i = 0; i < nr_pages; i++ )
2666 v->arch.guest_context.gdt_frames[i] = frames[i];
2667 v->arch.perdomain_ptes[i] =
2668 l1e_from_pfn(frames[i], __PAGE_HYPERVISOR);
2671 return 0;
2673 fail:
2674 while ( i-- > 0 )
2675 put_page_and_type(&frame_table[frames[i]]);
2676 return -EINVAL;
2680 long do_set_gdt(unsigned long *frame_list, unsigned int entries)
2682 int nr_pages = (entries + 511) / 512;
2683 unsigned long frames[16];
2684 long ret;
2686 /* Rechecked in set_gdt, but ensures a sane limit for copy_from_user(). */
2687 if ( entries > FIRST_RESERVED_GDT_ENTRY )
2688 return -EINVAL;
2690 if ( copy_from_user(frames, frame_list, nr_pages * sizeof(unsigned long)) )
2691 return -EFAULT;
2693 LOCK_BIGLOCK(current->domain);
2695 if ( (ret = set_gdt(current, frames, entries)) == 0 )
2696 local_flush_tlb();
2698 UNLOCK_BIGLOCK(current->domain);
2700 return ret;
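/*
 * Illustrative sketch (not compiled): guest-side GDT installation. A GDT of
 * N descriptors occupies (N + 511) / 512 pages, matching the arithmetic in
 * set_gdt() above. The 16-frame guard is hypothetical, mirroring the
 * frames[16] array in do_set_gdt(). Assumes the usual guest wrapper
 * HYPERVISOR_set_gdt().
 */
#if 0
static int install_gdt(unsigned long *gdt_frames, unsigned int entries)
{
    unsigned int nr_pages = (entries + 511) / 512;
    if ( nr_pages > 16 )
        return -EINVAL;
    return HYPERVISOR_set_gdt(gdt_frames, entries);
}
#endif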
2704 long do_update_descriptor(u64 pa, u64 desc)
2706 struct domain *dom = current->domain;
2707 unsigned long gpfn = pa >> PAGE_SHIFT;
2708 unsigned long mfn;
2709 unsigned int offset;
2710 struct desc_struct *gdt_pent, d;
2711 struct pfn_info *page;
2712 long ret = -EINVAL;
2714 offset = ((unsigned int)pa & ~PAGE_MASK) / sizeof(struct desc_struct);
2716 *(u64 *)&d = desc;
2718 LOCK_BIGLOCK(dom);
2720 if ( !VALID_MFN(mfn = __gpfn_to_mfn(dom, gpfn)) ||
2721 (((unsigned int)pa % sizeof(struct desc_struct)) != 0) ||
2722 (mfn >= max_page) ||
2723 !check_descriptor(&d) )
2725 UNLOCK_BIGLOCK(dom);
2726 return -EINVAL;
2729 page = &frame_table[mfn];
2730 if ( unlikely(!get_page(page, dom)) )
2732 UNLOCK_BIGLOCK(dom);
2733 return -EINVAL;
2736 /* Check if the given frame is in use in an unsafe context. */
2737 switch ( page->u.inuse.type_info & PGT_type_mask )
2739 case PGT_gdt_page:
2740 if ( unlikely(!get_page_type(page, PGT_gdt_page)) )
2741 goto out;
2742 break;
2743 case PGT_ldt_page:
2744 if ( unlikely(!get_page_type(page, PGT_ldt_page)) )
2745 goto out;
2746 break;
2747 default:
2748 if ( unlikely(!get_page_type(page, PGT_writable_page)) )
2749 goto out;
2750 break;
2753 if ( shadow_mode_enabled(dom) )
2755 shadow_lock(dom);
2757 if ( shadow_mode_log_dirty(dom) )
2758 __mark_dirty(dom, mfn);
2760 if ( page_is_page_table(page) && !page_out_of_sync(page) )
2761 shadow_mark_mfn_out_of_sync(current, gpfn, mfn);
2764 /* All is good so make the update. */
2765 gdt_pent = map_domain_page(mfn);
2766 memcpy(&gdt_pent[offset], &d, 8);
2767 unmap_domain_page(gdt_pent);
2769 if ( shadow_mode_enabled(dom) )
2770 shadow_unlock(dom);
2772 put_page_type(page);
2774 ret = 0; /* success */
2776 out:
2777 put_page(page);
2779 UNLOCK_BIGLOCK(dom);
2781 return ret;
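/*
 * Illustrative sketch (not compiled): rewriting one descriptor via the
 * hypercall above. 'pa' addresses the descriptor slot in the guest's pfn
 * space and must be 8-byte aligned, as checked in do_update_descriptor().
 * Assumes the usual guest wrapper HYPERVISOR_update_descriptor().
 */
#if 0
static int write_descriptor(unsigned long gdt_pfn, unsigned int idx, uint64_t desc)
{
    uint64_t pa = ((uint64_t)gdt_pfn << PAGE_SHIFT) + idx * sizeof(uint64_t);
    return HYPERVISOR_update_descriptor(pa, desc);
}
#endif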
2786 /*************************
2787 * Writable Pagetables
2788 */
2790 #ifdef VVERBOSE
2791 int ptwr_debug = 0x0;
2792 #define PTWR_PRINTK(_f, _a...) \
2793 do { if ( unlikely(ptwr_debug) ) printk( _f , ## _a ); } while ( 0 )
2794 #define PTWR_PRINT_WHICH (which ? 'I' : 'A')
2795 #else
2796 #define PTWR_PRINTK(_f, _a...) ((void)0)
2797 #endif
2800 #ifdef PERF_ARRAYS
2802 /**************** writable pagetables profiling functions *****************/
2804 #define ptwr_eip_buckets 256
2806 int ptwr_eip_stat_threshold[] = {1, 10, 50, 100, L1_PAGETABLE_ENTRIES};
2808 #define ptwr_eip_stat_thresholdN (sizeof(ptwr_eip_stat_threshold)/sizeof(int))
2810 typedef struct {
2811 unsigned long eip;
2812 domid_t id;
2813 u32 val[ptwr_eip_stat_thresholdN];
2814 } ptwr_eip_stat_t;
2816 ptwr_eip_stat_t ptwr_eip_stats[ptwr_eip_buckets];
2818 static inline unsigned int ptwr_eip_stat_hash( unsigned long eip, domid_t id )
2820 return (((unsigned long) id) ^ eip ^ (eip>>8) ^ (eip>>16) ^ (eip>>24)) %
2821 ptwr_eip_buckets;
2824 static void ptwr_eip_stat_inc(u32 *n)
2826 int i, j;
2828 if ( ++(*n) != 0 )
2829 return;
2831 *n = ~0;
2833 /* Re-scale all buckets. */
2834 for ( i = 0; i < ptwr_eip_buckets; i++ )
2835 for ( j = 0; j < ptwr_eip_stat_thresholdN; j++ )
2836 ptwr_eip_stats[i].val[j] >>= 1;
2839 static void ptwr_eip_stat_update(unsigned long eip, domid_t id, int modified)
2841 int i, j, b;
2843 i = b = ptwr_eip_stat_hash(eip, id);
2845 do
2847 if ( !ptwr_eip_stats[i].eip )
2849 /* doesn't exist */
2850 ptwr_eip_stats[i].eip = eip;
2851 ptwr_eip_stats[i].id = id;
2852 memset(ptwr_eip_stats[i].val, 0, sizeof(ptwr_eip_stats[i].val));
2855 if ( ptwr_eip_stats[i].eip == eip )
2857 for ( j = 0; j < ptwr_eip_stat_thresholdN; j++ )
2858 if ( modified <= ptwr_eip_stat_threshold[j] )
2859 break;
2860 BUG_ON(j >= ptwr_eip_stat_thresholdN);
2861 ptwr_eip_stat_inc(&ptwr_eip_stats[i].val[j]);
2862 return;
2865 i = (i+1) % ptwr_eip_buckets;
2867 while ( i != b );
2869 printk("ptwr_eip_stat: too many EIPs in use!\n");
2871 ptwr_eip_stat_print();
2872 ptwr_eip_stat_reset();
2875 void ptwr_eip_stat_reset(void)
2877 memset(ptwr_eip_stats, 0, sizeof(ptwr_eip_stats));
2880 void ptwr_eip_stat_print(void)
2882 struct domain *e;
2883 domid_t d;
2884 int i, j;
2886 for_each_domain( e )
2888 d = e->domain_id;
2890 for ( i = 0; i < ptwr_eip_buckets; i++ )
2892 if ( ptwr_eip_stats[i].eip && ptwr_eip_stats[i].id != d )
2893 continue;
2895 printk("D %d eip %08lx ",
2896 ptwr_eip_stats[i].id, ptwr_eip_stats[i].eip);
2898 for ( j = 0; j < ptwr_eip_stat_thresholdN; j++ )
2899 printk("<=%u %4u \t",
2900 ptwr_eip_stat_threshold[j],
2901 ptwr_eip_stats[i].val[j]);
2902 printk("\n");
2907 #else /* PERF_ARRAYS */
2909 #define ptwr_eip_stat_update(eip, id, modified) ((void)0)
2911 #endif
2913 /*******************************************************************/
2915 /* Re-validate a given p.t. page, given its prior snapshot */
2916 int revalidate_l1(
2917 struct domain *d, l1_pgentry_t *l1page, l1_pgentry_t *snapshot)
2919 l1_pgentry_t ol1e, nl1e;
2920 int modified = 0, i;
2922 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
2924 ol1e = snapshot[i];
2925 nl1e = l1page[i];
2927 if ( likely(l1e_get_intpte(ol1e) == l1e_get_intpte(nl1e)) )
2928 continue;
2930 /* Update number of entries modified. */
2931 modified++;
2933 /*
2934 * Fast path for PTEs that have merely been write-protected
2935 * (e.g., during a Unix fork()). A strict reduction in privilege.
2936 */
2937 if ( likely(l1e_get_intpte(ol1e) == (l1e_get_intpte(nl1e)|_PAGE_RW)) )
2939 if ( likely(l1e_get_flags(nl1e) & _PAGE_PRESENT) )
2940 put_page_type(&frame_table[l1e_get_pfn(nl1e)]);
2941 continue;
2944 if ( unlikely(!get_page_from_l1e(nl1e, d)) )
2946 /*
2947 * Make the remaining p.t's consistent before crashing, so the
2948 * reference counts are correct.
2949 */
2950 memcpy(&l1page[i], &snapshot[i],
2951 (L1_PAGETABLE_ENTRIES - i) * sizeof(l1_pgentry_t));
2953 /* Crash the offending domain. */
2954 MEM_LOG("ptwr: Could not revalidate l1 page");
2955 domain_crash(d);
2956 break;
2959 put_page_from_l1e(ol1e, d);
2962 return modified;
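/*
 * Worked example (not compiled) of the fast-path test in revalidate_l1()
 * above, with hypothetical entry values:
 *   ol1e = 0x12345067 -> frame 0x12345, flags PRESENT|RW|USER|ACCESSED|DIRTY
 *   nl1e = 0x12345065 -> same frame, only the RW bit (bit 1) cleared
 * Since ol1e == (nl1e | _PAGE_RW), the guest merely write-protected the
 * entry (e.g. during fork()), so only the writable type count is dropped.
 */
#if 0
#include <assert.h>
static void fast_path_demo(void)
{
    unsigned long ol1e = 0x12345067UL, nl1e = 0x12345065UL, rw = 0x2UL;
    assert( ol1e == (nl1e | rw) );
}
#endif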
2966 /* Flush the given writable p.t. page and write-protect it again. */
2967 void ptwr_flush(struct domain *d, const int which)
2969 unsigned long l1va;
2970 l1_pgentry_t *pl1e, pte, *ptep;
2971 l2_pgentry_t *pl2e;
2972 unsigned int modified;
2974 #ifdef CONFIG_X86_64
2975 struct vcpu *v = current;
2976 extern void toggle_guest_mode(struct vcpu *);
2977 int user_mode = !(v->arch.flags & TF_kernel_mode);
2978 #endif
2980 ASSERT(!shadow_mode_enabled(d));
2982 if ( unlikely(d->arch.ptwr[which].vcpu != current) )
2983 /* Don't use write_ptbase: it may switch to guest_user on x86/64! */
2984 write_cr3(pagetable_get_paddr(
2985 d->arch.ptwr[which].vcpu->arch.guest_table));
2986 else
2987 TOGGLE_MODE();
2989 l1va = d->arch.ptwr[which].l1va;
2990 ptep = (l1_pgentry_t *)&linear_pg_table[l1_linear_offset(l1va)];
2992 /*
2993 * STEP 1. Write-protect the p.t. page so no more updates can occur.
2994 */
2996 if ( unlikely(__get_user(pte.l1, &ptep->l1)) )
2998 MEM_LOG("ptwr: Could not read pte at %p", ptep);
2999 /*
3000 * Really a bug. We could read this PTE during the initial fault,
3001 * and the pagetables can't have changed in the meantime.
3002 */
3003 BUG();
3005 PTWR_PRINTK("[%c] disconnected_l1va at %p is %"PRIpte"\n",
3006 PTWR_PRINT_WHICH, ptep, pte.l1);
3007 l1e_remove_flags(pte, _PAGE_RW);
3009 /* Write-protect the p.t. page in the guest page table. */
3010 if ( unlikely(__put_user(pte, ptep)) )
3012 MEM_LOG("ptwr: Could not update pte at %p", ptep);
3013 /*
3014 * Really a bug. We could write this PTE during the initial fault,
3015 * and the pagetables can't have changed in the meantime.
3016 */
3017 BUG();
3020 /* Ensure that there are no stale writable mappings in any TLB. */
3021 /* NB. INVLPG is a serialising instruction: flushes pending updates. */
3022 flush_tlb_one_mask(d->cpumask, l1va);
3023 PTWR_PRINTK("[%c] disconnected_l1va at %p now %"PRIpte"\n",
3024 PTWR_PRINT_WHICH, ptep, pte.l1);
3026 /*
3027 * STEP 2. Validate any modified PTEs.
3028 */
3030 pl1e = d->arch.ptwr[which].pl1e;
3031 modified = revalidate_l1(d, pl1e, d->arch.ptwr[which].page);
3032 unmap_domain_page(pl1e);
3033 perfc_incr_histo(wpt_updates, modified, PT_UPDATES);
3034 ptwr_eip_stat_update(d->arch.ptwr[which].eip, d->domain_id, modified);
3035 d->arch.ptwr[which].prev_nr_updates = modified;
3037 /*
3038 * STEP 3. Reattach the L1 p.t. page into the current address space.
3039 */
3041 if ( which == PTWR_PT_ACTIVE )
3043 pl2e = &__linear_l2_table[d->arch.ptwr[which].l2_idx];
3044 l2e_add_flags(*pl2e, _PAGE_PRESENT);
3047 /*
3048 * STEP 4. Final tidy-up.
3049 */
3051 d->arch.ptwr[which].l1va = 0;
3053 if ( unlikely(d->arch.ptwr[which].vcpu != current) )
3054 write_ptbase(current);
3055 else
3056 TOGGLE_MODE();
3059 static int ptwr_emulated_update(
3060 unsigned long addr,
3061 physaddr_t old,
3062 physaddr_t val,
3063 unsigned int bytes,
3064 unsigned int do_cmpxchg)
3066 unsigned long pfn;
3067 struct pfn_info *page;
3068 l1_pgentry_t pte, ol1e, nl1e, *pl1e;
3069 struct domain *d = current->domain;
3071 /* Aligned access only, thank you. */
3072 if ( !access_ok(addr, bytes) || ((addr & (bytes-1)) != 0) )
3074 MEM_LOG("ptwr_emulate: Unaligned or bad size ptwr access (%d, %lx)",
3075 bytes, addr);
3076 return X86EMUL_UNHANDLEABLE;
3079 /* Turn a sub-word access into a full-word access. */
3080 if ( bytes != sizeof(physaddr_t) )
3082 int rc;
3083 physaddr_t full;
3084 unsigned int offset = addr & (sizeof(physaddr_t)-1);
3086 /* Align address; read full word. */
3087 addr &= ~(sizeof(physaddr_t)-1);
3088 if ( (rc = x86_emulate_read_std(addr, (unsigned long *)&full,
3089 sizeof(physaddr_t))) )
3090 return rc;
3091 /* Mask out bits provided by caller. */
3092 full &= ~((((physaddr_t)1 << (bytes*8)) - 1) << (offset*8));
3093 /* Shift the caller value and OR in the missing bits. */
3094 val &= (((physaddr_t)1 << (bytes*8)) - 1);
3095 val <<= (offset)*8;
3096 val |= full;
3097 /* Also fill in missing parts of the cmpxchg old value. */
3098 old &= (((physaddr_t)1 << (bytes*8)) - 1);
3099 old <<= (offset)*8;
3100 old |= full;
3103 /* Read the PTE that maps the page being updated. */
3104 if ( __copy_from_user(&pte, &linear_pg_table[l1_linear_offset(addr)],
3105 sizeof(pte)) )
3107 MEM_LOG("ptwr_emulate: Cannot read thru linear_pg_table");
3108 return X86EMUL_UNHANDLEABLE;
3111 pfn = l1e_get_pfn(pte);
3112 page = &frame_table[pfn];
3114 /* We are looking only for read-only mappings of p.t. pages. */
3115 if ( ((l1e_get_flags(pte) & (_PAGE_RW|_PAGE_PRESENT)) != _PAGE_PRESENT) ||
3116 ((page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table) ||
3117 (page_get_owner(page) != d) )
3119 MEM_LOG("ptwr_emulate: Page is mistyped or bad pte "
3120 "(%lx, %" PRtype_info ")",
3121 l1e_get_pfn(pte), page->u.inuse.type_info);
3122 return X86EMUL_UNHANDLEABLE;
3125 /* Check the new PTE. */
3126 nl1e = l1e_from_intpte(val);
3127 if ( unlikely(!get_page_from_l1e(nl1e, d)) )
3129 MEM_LOG("ptwr_emulate: could not get_page_from_l1e()");
3130 return X86EMUL_UNHANDLEABLE;
3133 /* Checked successfully: do the update (write or cmpxchg). */
3134 pl1e = map_domain_page(page_to_pfn(page));
3135 pl1e = (l1_pgentry_t *)((unsigned long)pl1e + (addr & ~PAGE_MASK));
3136 if ( do_cmpxchg )
3138 ol1e = l1e_from_intpte(old);
3139 if ( cmpxchg((intpte_t *)pl1e, old, val) != old )
3141 unmap_domain_page(pl1e);
3142 put_page_from_l1e(nl1e, d);
3143 return X86EMUL_CMPXCHG_FAILED;
3146 else
3148 ol1e = *pl1e;
3149 *pl1e = nl1e;
3151 unmap_domain_page(pl1e);
3153 /* Finally, drop the old PTE. */
3154 put_page_from_l1e(ol1e, d);
3156 return X86EMUL_CONTINUE;
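/*
 * Worked example (not compiled) of the sub-word merge performed above,
 * assuming a 4-byte physaddr_t: a 2-byte guest write of 0x1122 at offset 2
 * into a PTE currently holding 0xAABBCCDD yields 0x1122CCDD.
 */
#if 0
#include <assert.h>
static void subword_merge_demo(void)
{
    unsigned int full = 0xAABBCCDD, val = 0x1122;
    unsigned int bytes = 2, offset = 2;

    full &= ~(((1u << (bytes * 8)) - 1) << (offset * 8)); /* 0x0000CCDD */
    val  &=   (1u << (bytes * 8)) - 1;                    /* caller's bytes */
    val <<= offset * 8;                                   /* 0x11220000 */
    val  |= full;                                         /* merged word */

    assert( val == 0x1122CCDD );
}
#endif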
3159 static int ptwr_emulated_write(
3160 unsigned long addr,
3161 unsigned long val,
3162 unsigned int bytes)
3164 return ptwr_emulated_update(addr, 0, val, bytes, 0);
3167 static int ptwr_emulated_cmpxchg(
3168 unsigned long addr,
3169 unsigned long old,
3170 unsigned long new,
3171 unsigned int bytes)
3173 return ptwr_emulated_update(addr, old, new, bytes, 1);
3176 static int ptwr_emulated_cmpxchg8b(
3177 unsigned long addr,
3178 unsigned long old,
3179 unsigned long old_hi,
3180 unsigned long new,
3181 unsigned long new_hi)
3183 return ptwr_emulated_update(
3184 addr, ((u64)old_hi << 32) | old, ((u64)new_hi << 32) | new, 8, 1);
3187 static struct x86_mem_emulator ptwr_mem_emulator = {
3188 .read_std = x86_emulate_read_std,
3189 .write_std = x86_emulate_write_std,
3190 .read_emulated = x86_emulate_read_std,
3191 .write_emulated = ptwr_emulated_write,
3192 .cmpxchg_emulated = ptwr_emulated_cmpxchg,
3193 .cmpxchg8b_emulated = ptwr_emulated_cmpxchg8b
3194 };
3196 /* Write page fault handler: check if guest is trying to modify a PTE. */
3197 int ptwr_do_page_fault(struct domain *d, unsigned long addr,
3198 struct cpu_user_regs *regs)
3200 unsigned long pfn;
3201 struct pfn_info *page;
3202 l1_pgentry_t pte;
3203 l2_pgentry_t *pl2e, l2e;
3204 int which, flags;
3205 unsigned long l2_idx;
3207 if ( unlikely(shadow_mode_enabled(d)) )
3208 return 0;
3210 /*
3211 * Attempt to read the PTE that maps the VA being accessed. By checking for
3212 * PDE validity in the L2 we avoid many expensive fixups in __get_user().
3213 */
3214 if ( !(l2e_get_flags(__linear_l2_table[l2_linear_offset(addr)]) &
3215 _PAGE_PRESENT) ||
3216 __copy_from_user(&pte, &linear_pg_table[l1_linear_offset(addr)],
3217 sizeof(pte)) )
3219 return 0;
3222 pfn = l1e_get_pfn(pte);
3223 page = &frame_table[pfn];
3225 #ifdef CONFIG_X86_64
3226 #define WRPT_PTE_FLAGS (_PAGE_RW | _PAGE_PRESENT | _PAGE_USER)
3227 #else
3228 #define WRPT_PTE_FLAGS (_PAGE_RW | _PAGE_PRESENT)
3229 #endif
3231 /*
3232 * Check the required flags for a valid wrpt mapping. If the page is
3233 * already writable then we can return straight to the guest (SMP race).
3234 * We decide whether or not to propagate the fault by writing back through
3235 * the linear mapping, which tests for write permission in the page directories.
3236 */
3237 if ( (flags = l1e_get_flags(pte) & WRPT_PTE_FLAGS) == WRPT_PTE_FLAGS )
3238 return !__put_user(
3239 pte.l1, &linear_pg_table[l1_linear_offset(addr)].l1);
3241 /* We are looking only for read-only mappings of p.t. pages. */
3242 if ( ((flags | _PAGE_RW) != WRPT_PTE_FLAGS) ||
3243 ((page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table) ||
3244 ((page->u.inuse.type_info & PGT_count_mask) == 0) ||
3245 (page_get_owner(page) != d) )
3247 return 0;
3250 #if 0 /* Leave this in as useful for debugging */
3251 goto emulate;
3252 #endif
3254 PTWR_PRINTK("ptwr_page_fault on l1 pt at va %lx, pfn %lx, eip %lx\n",
3255 addr, pfn, (unsigned long)regs->eip);
3257 /* Get the L2 index at which this L1 p.t. is always mapped. */
3258 l2_idx = page->u.inuse.type_info & PGT_va_mask;
3259 if ( unlikely(l2_idx >= PGT_va_unknown) )
3260 goto emulate; /* Urk! This L1 is mapped in multiple L2 slots! */
3261 l2_idx >>= PGT_va_shift;
3263 if ( unlikely(l2_idx == l2_linear_offset(addr)) )
3264 goto emulate; /* Urk! Pagetable maps itself! */
3266 /*
3267 * Is the L1 p.t. mapped into the current address space? If so we call it
3268 * an ACTIVE p.t., otherwise it is INACTIVE.
3269 */
3270 pl2e = &__linear_l2_table[l2_idx];
3271 which = PTWR_PT_INACTIVE;
3273 if ( (__get_user(l2e.l2, &pl2e->l2) == 0) && (l2e_get_pfn(l2e) == pfn) )
3275 /*
3276 * Check the PRESENT bit to set ACTIVE mode.
3277 * If the PRESENT bit is clear, we may be conflicting with the current
3278 * ACTIVE p.t. (it may be the same p.t. mapped at another virt addr).
3279 * The ptwr_flush call below will restore the PRESENT bit.
3280 */
3281 if ( likely(l2e_get_flags(l2e) & _PAGE_PRESENT) ||
3282 (d->arch.ptwr[PTWR_PT_ACTIVE].l1va &&
3283 (l2_idx == d->arch.ptwr[PTWR_PT_ACTIVE].l2_idx)) )
3284 which = PTWR_PT_ACTIVE;
3287 /*
3288 * If this is a multi-processor guest then ensure that the page is hooked
3289 * into at most one L2 table, which must be the one running on this VCPU.
3290 */
3291 if ( (d->vcpu[0]->next_in_list != NULL) &&
3292 ((page->u.inuse.type_info & PGT_count_mask) !=
3293 (!!(page->u.inuse.type_info & PGT_pinned) +
3294 (which == PTWR_PT_ACTIVE))) )
3296 /* Could be conflicting writable mappings from other VCPUs. */
3297 cleanup_writable_pagetable(d);
3298 goto emulate;
3301 /*
3302 * We only allow one ACTIVE and one INACTIVE p.t. to be updated at a
3303 * time. If there is already one, we must flush it out.
3304 */
3305 if ( d->arch.ptwr[which].l1va )
3306 ptwr_flush(d, which);
3308 /*
3309 * If last batch made no updates then we are probably stuck. Emulate this
3310 * update to ensure we make progress.
3311 */
3312 if ( d->arch.ptwr[which].prev_nr_updates == 0 )
3314 /* Ensure that we don't get stuck in an emulation-only rut. */
3315 d->arch.ptwr[which].prev_nr_updates = 1;
3316 goto emulate;
3319 PTWR_PRINTK("[%c] batched ptwr_page_fault at va %lx, pt for %08lx, "
3320 "pfn %lx\n", PTWR_PRINT_WHICH, addr,
3321 l2_idx << L2_PAGETABLE_SHIFT, pfn);
3323 d->arch.ptwr[which].l1va = addr | 1;
3324 d->arch.ptwr[which].l2_idx = l2_idx;
3325 d->arch.ptwr[which].vcpu = current;
3327 #ifdef PERF_ARRAYS
3328 d->arch.ptwr[which].eip = regs->eip;
3329 #endif
3331 /* For safety, disconnect the L1 p.t. page from current space. */
3332 if ( which == PTWR_PT_ACTIVE )
3334 l2e_remove_flags(*pl2e, _PAGE_PRESENT);
3335 flush_tlb_mask(d->cpumask);
3338 /* Temporarily map the L1 page, and make a copy of it. */
3339 d->arch.ptwr[which].pl1e = map_domain_page(pfn);
3340 memcpy(d->arch.ptwr[which].page,
3341 d->arch.ptwr[which].pl1e,
3342 L1_PAGETABLE_ENTRIES * sizeof(l1_pgentry_t));
3344 /* Finally, make the p.t. page writable by the guest OS. */
3345 l1e_add_flags(pte, _PAGE_RW);
3346 if ( unlikely(__put_user(pte.l1,
3347 &linear_pg_table[l1_linear_offset(addr)].l1)) )
3349 MEM_LOG("ptwr: Could not update pte at %p", (unsigned long *)
3350 &linear_pg_table[l1_linear_offset(addr)]);
3351 /* Toss the writable pagetable state and crash. */
3352 unmap_domain_page(d->arch.ptwr[which].pl1e);
3353 d->arch.ptwr[which].l1va = 0;
3354 domain_crash(d);
3355 return 0;
3358 return EXCRET_fault_fixed;
3360 emulate:
3361 /*
3362 * Cleaning up avoids emulating an update to a PTE that is temporarily
3363 * marked writable (_PAGE_RW) by the batched ptwr logic. If this were
3364 * performance critical then the check could compare addr against l1va's in
3365 * ptwr_emulated_update(). Without this flush we can corrupt page refcnts!
3366 */
3367 cleanup_writable_pagetable(d);
3368 if ( x86_emulate_memop(guest_cpu_user_regs(), addr,
3369 &ptwr_mem_emulator, BITS_PER_LONG/8) )
3370 return 0;
3371 perfc_incrc(ptwr_emulations);
3372 return EXCRET_fault_fixed;
3375 int ptwr_init(struct domain *d)
3377 void *x = alloc_xenheap_page();
3378 void *y = alloc_xenheap_page();
3380 if ( (x == NULL) || (y == NULL) )
3382 free_xenheap_page(x);
3383 free_xenheap_page(y);
3384 return -ENOMEM;
3387 d->arch.ptwr[PTWR_PT_ACTIVE].page = x;
3388 d->arch.ptwr[PTWR_PT_INACTIVE].page = y;
3390 return 0;
3393 void ptwr_destroy(struct domain *d)
3395 LOCK_BIGLOCK(d);
3396 cleanup_writable_pagetable(d);
3397 UNLOCK_BIGLOCK(d);
3398 free_xenheap_page(d->arch.ptwr[PTWR_PT_ACTIVE].page);
3399 free_xenheap_page(d->arch.ptwr[PTWR_PT_INACTIVE].page);
3402 void cleanup_writable_pagetable(struct domain *d)
3404 if ( unlikely(!VM_ASSIST(d, VMASST_TYPE_writable_pagetables)) )
3405 return;
3407 if ( unlikely(shadow_mode_enabled(d)) )
3409 shadow_sync_all(d);
3411 else
3413 if ( d->arch.ptwr[PTWR_PT_ACTIVE].l1va )
3414 ptwr_flush(d, PTWR_PT_ACTIVE);
3415 if ( d->arch.ptwr[PTWR_PT_INACTIVE].l1va )
3416 ptwr_flush(d, PTWR_PT_INACTIVE);
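/*
 * Illustrative sketch (not compiled): the flush above only matters for
 * guests that opted into the writable-pagetable assist. A guest would
 * typically enable it at boot, assuming the usual guest wrapper
 * HYPERVISOR_vm_assist():
 */
#if 0
static void enable_writable_pagetables(void)
{
    HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables);
}
#endif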
3420 int map_pages_to_xen(
3421 unsigned long virt,
3422 unsigned long pfn,
3423 unsigned long nr_pfns,
3424 unsigned long flags)
3426 l2_pgentry_t *pl2e, ol2e;
3427 l1_pgentry_t *pl1e, ol1e;
3428 unsigned int i;
3430 unsigned int map_small_pages = !!(flags & MAP_SMALL_PAGES);
3431 flags &= ~MAP_SMALL_PAGES;
3433 while ( nr_pfns != 0 )
3435 pl2e = virt_to_xen_l2e(virt);
3437 if ( ((((virt>>PAGE_SHIFT) | pfn) & ((1<<PAGETABLE_ORDER)-1)) == 0) &&
3438 (nr_pfns >= (1<<PAGETABLE_ORDER)) &&
3439 !map_small_pages )
3441 /* Super-page mapping. */
3442 ol2e = *pl2e;
3443 *pl2e = l2e_from_pfn(pfn, flags|_PAGE_PSE);
3445 if ( (l2e_get_flags(ol2e) & _PAGE_PRESENT) )
3447 local_flush_tlb_pge();
3448 if ( !(l2e_get_flags(ol2e) & _PAGE_PSE) )
3449 free_xen_pagetable(l2e_get_page(ol2e));
3452 virt += 1UL << L2_PAGETABLE_SHIFT;
3453 pfn += 1UL << PAGETABLE_ORDER;
3454 nr_pfns -= 1UL << PAGETABLE_ORDER;
3456 else
3458 /* Normal page mapping. */
3459 if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) )
3461 pl1e = page_to_virt(alloc_xen_pagetable());
3462 clear_page(pl1e);
3463 *pl2e = l2e_from_page(virt_to_page(pl1e), __PAGE_HYPERVISOR);
3465 else if ( l2e_get_flags(*pl2e) & _PAGE_PSE )
3467 pl1e = page_to_virt(alloc_xen_pagetable());
3468 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
3469 pl1e[i] = l1e_from_pfn(
3470 l2e_get_pfn(*pl2e) + i,
3471 l2e_get_flags(*pl2e) & ~_PAGE_PSE);
3472 *pl2e = l2e_from_page(virt_to_page(pl1e), __PAGE_HYPERVISOR);
3473 local_flush_tlb_pge();
3476 pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(virt);
3477 ol1e = *pl1e;
3478 *pl1e = l1e_from_pfn(pfn, flags);
3479 if ( (l1e_get_flags(ol1e) & _PAGE_PRESENT) )
3480 local_flush_tlb_one(virt);
3482 virt += 1UL << L1_PAGETABLE_SHIFT;
3483 pfn += 1UL;
3484 nr_pfns -= 1UL;
3488 return 0;
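/*
 * Worked example (not compiled) of the super-page eligibility test above,
 * as a standalone predicate. Assumes PAGETABLE_ORDER 9 and PAGE_SHIFT 12
 * (512 4kB pages per 2MB mapping); non-PAE x86-32 uses order 10 instead.
 */
#if 0
#include <assert.h>
#define DEMO_ORDER      9
#define DEMO_PAGE_SHIFT 12
static int can_use_superpage(unsigned long virt, unsigned long pfn,
                             unsigned long nr_pfns)
{
    return ((((virt >> DEMO_PAGE_SHIFT) | pfn) & ((1 << DEMO_ORDER) - 1)) == 0)
           && (nr_pfns >= (1UL << DEMO_ORDER));
}
static void superpage_demo(void)
{
    assert(  can_use_superpage(0x200000UL, 0x200UL, 512) ); /* both 2MB-aligned */
    assert( !can_use_superpage(0x201000UL, 0x201UL, 512) ); /* misaligned       */
}
#endif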
3491 void __set_fixmap(
3492 enum fixed_addresses idx, unsigned long p, unsigned long flags)
3494 if ( unlikely(idx >= __end_of_fixed_addresses) )
3495 BUG();
3496 map_pages_to_xen(fix_to_virt(idx), p >> PAGE_SHIFT, 1, flags);
3499 #ifdef MEMORY_GUARD
3501 void memguard_init(void)
3503 map_pages_to_xen(
3504 PAGE_OFFSET, 0, xenheap_phys_end >> PAGE_SHIFT,
3505 __PAGE_HYPERVISOR|MAP_SMALL_PAGES);
3508 static void __memguard_change_range(void *p, unsigned long l, int guard)
3510 unsigned long _p = (unsigned long)p;
3511 unsigned long _l = (unsigned long)l;
3512 unsigned long flags = __PAGE_HYPERVISOR | MAP_SMALL_PAGES;
3514 /* Ensure we are dealing with a page-aligned whole number of pages. */
3515 ASSERT((_p&PAGE_MASK) != 0);
3516 ASSERT((_l&PAGE_MASK) != 0);
3517 ASSERT((_p&~PAGE_MASK) == 0);
3518 ASSERT((_l&~PAGE_MASK) == 0);
3520 if ( guard )
3521 flags &= ~_PAGE_PRESENT;
3523 map_pages_to_xen(
3524 _p, virt_to_phys(p) >> PAGE_SHIFT, _l >> PAGE_SHIFT, flags);
3527 void memguard_guard_range(void *p, unsigned long l)
3529 __memguard_change_range(p, l, 1);
3532 void memguard_unguard_range(void *p, unsigned long l)
3534 __memguard_change_range(p, l, 0);
3537 #endif
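/*
 * Illustrative sketch (not compiled): typical use of the guard helpers
 * above on a hypothetical page-aligned, page-sized buffer. Guarding unmaps
 * the range so stray accesses fault; unguarding restores the mapping.
 */
#if 0
static char scratch[PAGE_SIZE] __attribute__((__aligned__(PAGE_SIZE)));
static void memguard_demo(void)
{
    memguard_guard_range(scratch, PAGE_SIZE);    /* accesses now fault */
    /* ... */
    memguard_unguard_range(scratch, PAGE_SIZE);  /* mapping restored */
}
#endif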
3539 /*
3540 * Local variables:
3541 * mode: C
3542 * c-set-style: "BSD"
3543 * c-basic-offset: 4
3544 * tab-width: 4
3545 * indent-tabs-mode: nil
3546 * End:
3547 */