xen/arch/x86/mm.c @ 3742:d1e1c9854420 (ia64/xen-unstable)

bitkeeper revision 1.1159.255.3 (420918d3OV9YNdw3dCaE6e4udrKnDA)

manual merge
author iap10@freefall.cl.cam.ac.uk
date Tue Feb 08 19:53:55 2005 +0000 (2005-02-08)
parents 4412ac39cc85
children e2b4ca470b91
1 /* -*- Mode:C; c-basic-offset:4; tab-width:4; indent-tabs-mode:nil -*- */
2 /******************************************************************************
3 * arch/x86/mm.c
4 *
5 * Copyright (c) 2002-2005 K A Fraser
6 * Copyright (c) 2004 Christian Limpach
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21 */
23 /*
24 * A description of the x86 page table API:
25 *
26 * Domains trap to do_mmu_update with a list of update requests.
27 * This is a list of (ptr, val) pairs, where the requested operation
28 * is *ptr = val.
29 *
30 * Reference counting of pages:
31 * ----------------------------
32 * Each page has two refcounts: tot_count and type_count.
33 *
34 * TOT_COUNT is the obvious reference count. It counts all uses of a
35 * physical page frame by a domain, including uses as a page directory,
36 * a page table, or simple mappings via a PTE. This count prevents a
37 * domain from releasing a frame back to the free pool when it still holds
38 * a reference to it.
39 *
40 * TYPE_COUNT is more subtle. A frame can be put to one of three
41 * mutually-exclusive uses: it might be used as a page directory, or a
42 * page table, or it may be mapped writable by the domain [of course, a
43 * frame may not be used in any of these three ways!].
44 * So, type_count is a count of the number of times a frame is being
45 * referred to in its current incarnation. Therefore, a page can only
46 * change its type when its type count is zero.
47 *
48 * Pinning the page type:
49 * ----------------------
50 * The type of a page can be pinned/unpinned with the commands
51 * MMUEXT_[UN]PIN_L?_TABLE. Each page can be pinned exactly once (that is,
52 * pinning is not reference counted, so it can't be nested).
53 * This is useful to prevent a page's type count falling to zero, at which
54 * point safety checks would need to be carried out next time the count
55 * is increased again.
56 *
57 * A further note on writable page mappings:
58 * -----------------------------------------
59 * For simplicity, the count of writable mappings for a page may not
60 * correspond to reality. The 'writable count' is incremented for every
61 * PTE which maps the page with the _PAGE_RW flag set. However, for
62 * write access to be possible the page directory entry must also have
63 * its _PAGE_RW bit set. We do not check this as it complicates the
64 * reference counting considerably [consider the case of multiple
65 * directory entries referencing a single page table, some with the RW
66 * bit set, others not -- it starts getting a bit messy].
67 * In normal use, this simplification shouldn't be a problem.
68 * However, the logic can be added if required.
69 *
70 * One more note on read-only page mappings:
71 * -----------------------------------------
72 * We want domains to be able to map pages for read-only access. The
73 * main reason is that page tables and directories should be readable
74 * by a domain, but it would not be safe for them to be writable.
75 * However, domains have free access to rings 1 & 2 of the Intel
76 * privilege model. In terms of page protection, these are considered
77 * to be part of 'supervisor mode'. The WP bit in CR0 controls whether
78 * read-only restrictions are respected in supervisor mode -- if the
79 * bit is clear then any mapped page is writable.
80 *
81 * We get round this by always setting the WP bit and disallowing
82 * updates to it. This is very unlikely to cause a problem for guest
83 * OS's, which will generally use the WP bit to simplify copy-on-write
84 * implementation (in that case, the OS wants a fault when it writes to
85 * an application-supplied buffer).
86 */
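/*
 * Editorial note: the fragment below is an illustrative sketch and is not
 * part of the original mm.c. It shows how a guest might batch (ptr, val)
 * requests for the hypercall serviced by do_mmu_update() further down,
 * including the MMUEXT_PIN_L2_TABLE pinning command described above. It
 * assumes a guest-side wrapper HYPERVISOR_mmu_update() with the same
 * signature as do_mmu_update(); that helper name is an assumption, not
 * something defined in this file.
 */
#if 0 /* illustrative only */
static void example_guest_pt_update(unsigned long pte_maddr,
                                    unsigned long new_pte_val,
                                    unsigned long l2_frame_maddr)
{
    mmu_update_t req[2];
    unsigned int done = 0;

    /* Request 0: checked PTE write, i.e. *pte_maddr = new_pte_val. The
     * command is encoded in the low bits of the 4-byte-aligned pointer. */
    req[0].ptr = pte_maddr | MMU_NORMAL_PT_UPDATE;
    req[0].val = new_pte_val;

    /* Request 1: extended command, pinning the given frame as an L2 table.
     * The subcommand is taken from the low bits of 'val'. */
    req[1].ptr = l2_frame_maddr | MMU_EXTENDED_COMMAND;
    req[1].val = MMUEXT_PIN_L2_TABLE;

    (void)HYPERVISOR_mmu_update(req, 2, &done);
}
#endif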
88 #include <xen/config.h>
89 #include <xen/init.h>
90 #include <xen/kernel.h>
91 #include <xen/lib.h>
92 #include <xen/mm.h>
93 #include <xen/sched.h>
94 #include <xen/errno.h>
95 #include <xen/perfc.h>
96 #include <xen/irq.h>
97 #include <xen/softirq.h>
98 #include <asm/shadow.h>
99 #include <asm/page.h>
100 #include <asm/flushtlb.h>
101 #include <asm/io.h>
102 #include <asm/uaccess.h>
103 #include <asm/domain_page.h>
104 #include <asm/ldt.h>
106 #ifdef VERBOSE
107 #define MEM_LOG(_f, _a...) \
108 printk("DOM%u: (file=memory.c, line=%d) " _f "\n", \
109 current->domain->id , __LINE__ , ## _a )
110 #else
111 #define MEM_LOG(_f, _a...) ((void)0)
112 #endif
114 static int alloc_l2_table(struct pfn_info *page);
115 static int alloc_l1_table(struct pfn_info *page);
116 static int get_page_from_pagenr(unsigned long page_nr, struct domain *d);
117 static int get_page_and_type_from_pagenr(unsigned long page_nr,
118 u32 type,
119 struct domain *d);
121 static void free_l2_table(struct pfn_info *page);
122 static void free_l1_table(struct pfn_info *page);
124 static int mod_l2_entry(l2_pgentry_t *, l2_pgentry_t, unsigned long);
125 static int mod_l1_entry(l1_pgentry_t *, l1_pgentry_t);
127 /* Used to defer flushing of memory structures. */
128 static struct {
129 #define DOP_FLUSH_TLB (1<<0) /* Flush the TLB. */
130 #define DOP_RELOAD_LDT (1<<1) /* Reload the LDT shadow mapping. */
131 unsigned long deferred_ops;
132 /* If non-NULL, specifies a foreign subject domain for some operations. */
133 struct domain *foreign;
134 } __cacheline_aligned percpu_info[NR_CPUS];
136 /*
137 * Returns the current foreign domain; defaults to the currently-executing
138 * domain if a foreign override hasn't been specified.
139 */
140 #define FOREIGNDOM (percpu_info[smp_processor_id()].foreign ? : current->domain)
142 /* Private domain structs for DOMID_XEN and DOMID_IO. */
143 static struct domain *dom_xen, *dom_io;
145 /* Frame table and its size in pages. */
146 struct pfn_info *frame_table;
147 unsigned long frame_table_size;
148 unsigned long max_page;
150 void __init init_frametable(void)
151 {
152 unsigned long i, p;
154 frame_table = (struct pfn_info *)FRAMETABLE_VIRT_START;
155 frame_table_size = max_page * sizeof(struct pfn_info);
156 frame_table_size = (frame_table_size + PAGE_SIZE - 1) & PAGE_MASK;
158 for ( i = 0; i < frame_table_size; i += (4UL << 20) )
159 {
160 p = alloc_boot_pages(min(frame_table_size - i, 4UL << 20), 4UL << 20);
161 if ( p == 0 )
162 panic("Not enough memory for frame table\n");
163 map_pages(idle_pg_table, FRAMETABLE_VIRT_START + i, p,
164 4UL << 20, PAGE_HYPERVISOR);
165 }
167 memset(frame_table, 0, frame_table_size);
168 }
170 void arch_init_memory(void)
171 {
172 extern void subarch_init_memory(struct domain *);
174 memset(percpu_info, 0, sizeof(percpu_info));
176 /*
177 * Initialise our DOMID_XEN domain.
178 * Any Xen-heap pages that we will allow to be mapped will have
179 * their domain field set to dom_xen.
180 */
181 dom_xen = alloc_domain_struct();
182 atomic_set(&dom_xen->refcnt, 1);
183 dom_xen->id = DOMID_XEN;
185 /*
186 * Initialise our DOMID_IO domain.
187 * This domain owns no pages but is considered a special case when
188 * mapping I/O pages, as the mappings occur at the privilege level of the caller.
189 */
190 dom_io = alloc_domain_struct();
191 atomic_set(&dom_io->refcnt, 1);
192 dom_io->id = DOMID_IO;
194 subarch_init_memory(dom_xen);
195 }
197 void write_ptbase(struct exec_domain *ed)
198 {
199 struct domain *d = ed->domain;
200 unsigned long pa;
202 #ifdef CONFIG_VMX
203 if ( unlikely(shadow_mode(d)) )
204 pa = ((shadow_mode(d) == SHM_full_32) ?
205 pagetable_val(ed->arch.monitor_table) :
206 pagetable_val(ed->arch.shadow_table));
207 else
208 pa = pagetable_val(ed->arch.pagetable);
209 #else
210 if ( unlikely(shadow_mode(d)) )
211 pa = pagetable_val(ed->arch.shadow_table);
212 else
213 pa = pagetable_val(ed->arch.pagetable);
214 #endif
216 write_cr3(pa);
217 }
219 static void __invalidate_shadow_ldt(struct exec_domain *d)
220 {
221 int i;
222 unsigned long pfn;
223 struct pfn_info *page;
225 d->arch.shadow_ldt_mapcnt = 0;
227 for ( i = 16; i < 32; i++ )
228 {
229 pfn = l1_pgentry_to_pagenr(d->arch.perdomain_ptes[i]);
230 if ( pfn == 0 ) continue;
231 d->arch.perdomain_ptes[i] = mk_l1_pgentry(0);
232 page = &frame_table[pfn];
233 ASSERT_PAGE_IS_TYPE(page, PGT_ldt_page);
234 ASSERT_PAGE_IS_DOMAIN(page, d->domain);
235 put_page_and_type(page);
236 }
238 /* Dispose of the (now possibly invalid) mappings from the TLB. */
239 percpu_info[d->processor].deferred_ops |= DOP_FLUSH_TLB | DOP_RELOAD_LDT;
240 }
243 static inline void invalidate_shadow_ldt(struct exec_domain *d)
244 {
245 if ( d->arch.shadow_ldt_mapcnt != 0 )
246 __invalidate_shadow_ldt(d);
247 }
250 static int alloc_segdesc_page(struct pfn_info *page)
251 {
252 struct desc_struct *descs;
253 int i;
255 descs = map_domain_mem((page-frame_table) << PAGE_SHIFT);
257 for ( i = 0; i < 512; i++ )
258 if ( unlikely(!check_descriptor(&descs[i])) )
259 goto fail;
261 unmap_domain_mem(descs);
262 return 1;
264 fail:
265 unmap_domain_mem(descs);
266 return 0;
267 }
270 /* Map shadow page at offset @off. */
271 int map_ldt_shadow_page(unsigned int off)
272 {
273 struct exec_domain *ed = current;
274 struct domain *d = ed->domain;
275 unsigned long l1e;
277 if ( unlikely(in_irq()) )
278 BUG();
280 __get_user(l1e, (unsigned long *)
281 &linear_pg_table[l1_linear_offset(ed->arch.ldt_base) + off]);
283 if ( unlikely(!(l1e & _PAGE_PRESENT)) ||
284 unlikely(!get_page_and_type(&frame_table[l1e >> PAGE_SHIFT],
285 d, PGT_ldt_page)) )
286 return 0;
288 ed->arch.perdomain_ptes[off + 16] = mk_l1_pgentry(l1e | _PAGE_RW);
289 ed->arch.shadow_ldt_mapcnt++;
291 return 1;
292 }
295 static int get_page_from_pagenr(unsigned long page_nr, struct domain *d)
296 {
297 struct pfn_info *page = &frame_table[page_nr];
299 if ( unlikely(!pfn_is_ram(page_nr)) )
300 {
301 MEM_LOG("Pfn %08lx is not RAM", page_nr);
302 return 0;
303 }
305 if ( unlikely(!get_page(page, d)) )
306 {
307 MEM_LOG("Could not get page ref for pfn %08lx", page_nr);
308 return 0;
309 }
311 return 1;
312 }
315 static int get_page_and_type_from_pagenr(unsigned long page_nr,
316 u32 type,
317 struct domain *d)
318 {
319 struct pfn_info *page = &frame_table[page_nr];
321 if ( unlikely(!get_page_from_pagenr(page_nr, d)) )
322 return 0;
324 if ( unlikely(!get_page_type(page, type)) )
325 {
326 #ifdef VERBOSE
327 if ( (type & PGT_type_mask) != PGT_l1_page_table )
328 MEM_LOG("Bad page type for pfn %08lx (%08x)",
329 page_nr, page->u.inuse.type_info);
330 #endif
331 put_page(page);
332 return 0;
333 }
335 return 1;
336 }
339 /*
340 * We allow L2 tables to map each other (a.k.a. linear page tables). This
341 * needs some special care with reference counts and access permissions:
342 * 1. The mapping entry must be read-only, or the guest may get write access
343 * to its own PTEs.
344 * 2. We must only bump the reference counts for an *already validated*
345 * L2 table, or we can end up in a deadlock in get_page_type() by waiting
346 * on a validation that cannot complete until our own validation does.
347 * 3. We only need to increment the reference counts for the mapped page
348 * frame if it is mapped by a different L2 table. This is sufficient and
349 * also necessary to allow validation of an L2 table mapping itself.
350 */
351 static int
352 get_linear_pagetable(
353 l2_pgentry_t l2e, unsigned long pfn, struct domain *d)
354 {
355 u32 x, y;
356 struct pfn_info *page;
358 if ( (l2_pgentry_val(l2e) & _PAGE_RW) )
359 {
360 MEM_LOG("Attempt to create linear p.t. with write perms");
361 return 0;
362 }
364 if ( (l2_pgentry_val(l2e) >> PAGE_SHIFT) != pfn )
365 {
366 /* Make sure the mapped frame belongs to the correct domain. */
367 if ( unlikely(!get_page_from_pagenr(l2_pgentry_to_pagenr(l2e), d)) )
368 return 0;
370 /*
371 * Make sure that the mapped frame is an already-validated L2 table.
372 * If so, atomically increment the count (checking for overflow).
373 */
374 page = &frame_table[l2_pgentry_to_pagenr(l2e)];
375 y = page->u.inuse.type_info;
376 do {
377 x = y;
378 if ( unlikely((x & PGT_count_mask) == PGT_count_mask) ||
379 unlikely((x & (PGT_type_mask|PGT_validated)) !=
380 (PGT_l2_page_table|PGT_validated)) )
381 {
382 put_page(page);
383 return 0;
384 }
385 }
386 while ( (y = cmpxchg(&page->u.inuse.type_info, x, x + 1)) != x );
387 }
389 return 1;
390 }
393 static int
394 get_page_from_l1e(
395 l1_pgentry_t l1e, struct domain *d)
396 {
397 unsigned long l1v = l1_pgentry_val(l1e);
398 unsigned long pfn = l1_pgentry_to_pagenr(l1e);
399 struct pfn_info *page = &frame_table[pfn];
400 extern int domain_iomem_in_pfn(struct domain *d, unsigned long pfn);
402 if ( !(l1v & _PAGE_PRESENT) )
403 return 1;
405 if ( unlikely(l1v & (_PAGE_GLOBAL|_PAGE_PAT)) )
406 {
407 MEM_LOG("Bad L1 type settings %04lx", l1v & (_PAGE_GLOBAL|_PAGE_PAT));
408 return 0;
409 }
411 if ( unlikely(!pfn_is_ram(pfn)) )
412 {
413 /* Revert to caller privileges if FD == DOMID_IO. */
414 if ( d == dom_io )
415 d = current->domain;
417 if ( IS_PRIV(d) )
418 return 1;
420 if ( IS_CAPABLE_PHYSDEV(d) )
421 return domain_iomem_in_pfn(d, pfn);
423 MEM_LOG("Non-privileged attempt to map I/O space %08lx", pfn);
424 return 0;
425 }
427 return ((l1v & _PAGE_RW) ?
428 get_page_and_type(page, d, PGT_writable_page) :
429 get_page(page, d));
430 }
433 /* NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. */
434 static int
435 get_page_from_l2e(
436 l2_pgentry_t l2e, unsigned long pfn,
437 struct domain *d, unsigned long va_idx)
438 {
439 int rc;
441 if ( !(l2_pgentry_val(l2e) & _PAGE_PRESENT) )
442 return 1;
444 if ( unlikely((l2_pgentry_val(l2e) & (_PAGE_GLOBAL|_PAGE_PSE))) )
445 {
446 MEM_LOG("Bad L2 page type settings %04lx",
447 l2_pgentry_val(l2e) & (_PAGE_GLOBAL|_PAGE_PSE));
448 return 0;
449 }
451 rc = get_page_and_type_from_pagenr(
452 l2_pgentry_to_pagenr(l2e),
453 PGT_l1_page_table | (va_idx<<PGT_va_shift), d);
455 if ( unlikely(!rc) )
456 return get_linear_pagetable(l2e, pfn, d);
458 return 1;
459 }
462 static void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d)
463 {
464 unsigned long l1v = l1_pgentry_val(l1e);
465 unsigned long pfn = l1_pgentry_to_pagenr(l1e);
466 struct pfn_info *page = &frame_table[pfn];
467 struct domain *e;
469 if ( !(l1v & _PAGE_PRESENT) || !pfn_is_ram(pfn) )
470 return;
472 e = page_get_owner(page);
473 if ( unlikely(e != d) )
474 {
475 /*
476 * Unmap a foreign page that may have been mapped via a grant table.
477 * Note that this can fail for a privileged domain that can map foreign
478 * pages via MMUEXT_SET_FOREIGNDOM. Such domains can have some mappings
479 * counted via a grant entry and some counted directly in the page
480 * structure's reference count. Note that reference counts won't get
481 * dangerously confused as long as we always try to decrement the
482 * grant entry first. We may end up with a mismatch between which
483 * mappings and which unmappings are counted via the grant entry, but
484 * really it doesn't matter as privileged domains have carte blanche.
485 */
486 if ( likely(gnttab_check_unmap(e, d, pfn, !(l1v & _PAGE_RW))) )
487 return;
488 /* Assume this mapping was made via MMUEXT_SET_FOREIGNDOM... */
489 }
491 if ( l1v & _PAGE_RW )
492 {
493 put_page_and_type(page);
494 }
495 else
496 {
497 /* We expect this is rare so we blow the entire shadow LDT. */
498 if ( unlikely(((page->u.inuse.type_info & PGT_type_mask) ==
499 PGT_ldt_page)) &&
500 unlikely(((page->u.inuse.type_info & PGT_count_mask) != 0)) )
501 invalidate_shadow_ldt(e->exec_domain[0]);
502 put_page(page);
503 }
504 }
507 /*
508 * NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'.
509 * Note also that this automatically deals correctly with linear p.t.'s.
510 */
511 static void put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
512 {
513 if ( (l2_pgentry_val(l2e) & _PAGE_PRESENT) &&
514 ((l2_pgentry_val(l2e) >> PAGE_SHIFT) != pfn) )
515 put_page_and_type(&frame_table[l2_pgentry_to_pagenr(l2e)]);
516 }
519 static int alloc_l2_table(struct pfn_info *page)
520 {
521 struct domain *d = page_get_owner(page);
522 unsigned long page_nr = page_to_pfn(page);
523 l2_pgentry_t *pl2e;
524 int i;
526 pl2e = map_domain_mem(page_nr << PAGE_SHIFT);
528 for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
529 if ( unlikely(!get_page_from_l2e(pl2e[i], page_nr, d, i)) )
530 goto fail;
532 #if defined(__i386__)
533 /* Now we add our private high mappings. */
534 memcpy(&pl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
535 &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
536 HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
537 pl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] =
538 mk_l2_pgentry((page_nr << PAGE_SHIFT) | __PAGE_HYPERVISOR);
539 pl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] =
540 mk_l2_pgentry(__pa(page_get_owner(page)->arch.mm_perdomain_pt) |
541 __PAGE_HYPERVISOR);
542 #endif
544 unmap_domain_mem(pl2e);
545 return 1;
547 fail:
548 while ( i-- > 0 )
549 put_page_from_l2e(pl2e[i], page_nr);
551 unmap_domain_mem(pl2e);
552 return 0;
553 }
556 static int alloc_l1_table(struct pfn_info *page)
557 {
558 struct domain *d = page_get_owner(page);
559 unsigned long page_nr = page_to_pfn(page);
560 l1_pgentry_t *pl1e;
561 int i;
563 pl1e = map_domain_mem(page_nr << PAGE_SHIFT);
565 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
566 if ( unlikely(!get_page_from_l1e(pl1e[i], d)) )
567 goto fail;
569 unmap_domain_mem(pl1e);
570 return 1;
572 fail:
573 while ( i-- > 0 )
574 put_page_from_l1e(pl1e[i], d);
576 unmap_domain_mem(pl1e);
577 return 0;
578 }
581 static void free_l2_table(struct pfn_info *page)
582 {
583 unsigned long page_nr = page - frame_table;
584 l2_pgentry_t *pl2e;
585 int i;
587 pl2e = map_domain_mem(page_nr << PAGE_SHIFT);
589 for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
590 put_page_from_l2e(pl2e[i], page_nr);
592 unmap_domain_mem(pl2e);
593 }
596 static void free_l1_table(struct pfn_info *page)
597 {
598 struct domain *d = page_get_owner(page);
599 unsigned long page_nr = page - frame_table;
600 l1_pgentry_t *pl1e;
601 int i;
603 pl1e = map_domain_mem(page_nr << PAGE_SHIFT);
605 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
606 put_page_from_l1e(pl1e[i], d);
608 unmap_domain_mem(pl1e);
609 }
612 static inline int update_l2e(l2_pgentry_t *pl2e,
613 l2_pgentry_t ol2e,
614 l2_pgentry_t nl2e)
615 {
616 unsigned long o = cmpxchg((unsigned long *)pl2e,
617 l2_pgentry_val(ol2e),
618 l2_pgentry_val(nl2e));
619 if ( o != l2_pgentry_val(ol2e) )
620 MEM_LOG("Failed to update %08lx -> %08lx: saw %08lx\n",
621 l2_pgentry_val(ol2e), l2_pgentry_val(nl2e), o);
622 return (o == l2_pgentry_val(ol2e));
623 }
626 /* Update the L2 entry at pl2e to new value nl2e. pl2e is within frame pfn. */
627 static int mod_l2_entry(l2_pgentry_t *pl2e,
628 l2_pgentry_t nl2e,
629 unsigned long pfn)
630 {
631 l2_pgentry_t ol2e;
632 unsigned long _ol2e;
634 if ( unlikely((((unsigned long)pl2e & (PAGE_SIZE-1)) >> 2) >=
635 DOMAIN_ENTRIES_PER_L2_PAGETABLE) )
636 {
637 MEM_LOG("Illegal L2 update attempt in Xen-private area %p", pl2e);
638 return 0;
639 }
641 if ( unlikely(__get_user(_ol2e, (unsigned long *)pl2e) != 0) )
642 return 0;
643 ol2e = mk_l2_pgentry(_ol2e);
645 if ( l2_pgentry_val(nl2e) & _PAGE_PRESENT )
646 {
647 /* Differ in mapping (bits 12-31) or presence (bit 0)? */
648 if ( ((l2_pgentry_val(ol2e) ^ l2_pgentry_val(nl2e)) & ~0xffe) == 0 )
649 return update_l2e(pl2e, ol2e, nl2e);
651 if ( unlikely(!get_page_from_l2e(nl2e, pfn, current->domain,
652 ((unsigned long)pl2e &
653 ~PAGE_MASK) >> 2)) )
654 return 0;
656 if ( unlikely(!update_l2e(pl2e, ol2e, nl2e)) )
657 {
658 put_page_from_l2e(nl2e, pfn);
659 return 0;
660 }
662 put_page_from_l2e(ol2e, pfn);
663 return 1;
664 }
666 if ( unlikely(!update_l2e(pl2e, ol2e, nl2e)) )
667 return 0;
669 put_page_from_l2e(ol2e, pfn);
670 return 1;
671 }
674 static inline int update_l1e(l1_pgentry_t *pl1e,
675 l1_pgentry_t ol1e,
676 l1_pgentry_t nl1e)
677 {
678 unsigned long o = l1_pgentry_val(ol1e);
679 unsigned long n = l1_pgentry_val(nl1e);
681 if ( unlikely(cmpxchg_user(pl1e, o, n) != 0) ||
682 unlikely(o != l1_pgentry_val(ol1e)) )
683 {
684 MEM_LOG("Failed to update %08lx -> %08lx: saw %08lx\n",
685 l1_pgentry_val(ol1e), l1_pgentry_val(nl1e), o);
686 return 0;
687 }
689 return 1;
690 }
693 /* Update the L1 entry at pl1e to new value nl1e. */
694 static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e)
695 {
696 l1_pgentry_t ol1e;
697 unsigned long _ol1e;
698 struct domain *d = current->domain;
700 if ( unlikely(__get_user(_ol1e, (unsigned long *)pl1e) != 0) )
701 {
702 MEM_LOG("Bad get_user\n");
703 return 0;
704 }
706 ol1e = mk_l1_pgentry(_ol1e);
708 if ( l1_pgentry_val(nl1e) & _PAGE_PRESENT )
709 {
710 /* Same mapping (bits 12-31), r/w (bit 1), and presence (bit 0)? */
711 if ( ((l1_pgentry_val(ol1e) ^ l1_pgentry_val(nl1e)) & ~0xffc) == 0 )
712 return update_l1e(pl1e, ol1e, nl1e);
714 if ( unlikely(!get_page_from_l1e(nl1e, FOREIGNDOM)) )
715 return 0;
717 if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
718 {
719 put_page_from_l1e(nl1e, d);
720 return 0;
721 }
723 put_page_from_l1e(ol1e, d);
724 return 1;
725 }
727 if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
728 return 0;
730 put_page_from_l1e(ol1e, d);
731 return 1;
732 }
735 int alloc_page_type(struct pfn_info *page, unsigned int type)
736 {
737 switch ( type )
738 {
739 case PGT_l1_page_table:
740 return alloc_l1_table(page);
741 case PGT_l2_page_table:
742 return alloc_l2_table(page);
743 case PGT_gdt_page:
744 case PGT_ldt_page:
745 return alloc_segdesc_page(page);
746 default:
747 printk("Bad type in alloc_page_type %x t=%x c=%x\n",
748 type, page->u.inuse.type_info,
749 page->count_info);
750 BUG();
751 }
753 return 0;
754 }
757 void free_page_type(struct pfn_info *page, unsigned int type)
758 {
759 struct domain *d = page_get_owner(page);
761 switch ( type )
762 {
763 case PGT_l1_page_table:
764 free_l1_table(page);
765 break;
767 case PGT_l2_page_table:
768 free_l2_table(page);
769 break;
771 default:
772 BUG();
773 }
775 if ( unlikely(shadow_mode(d)) &&
776 (get_shadow_status(d, page_to_pfn(page)) & PSH_shadowed) )
777 {
778 unshadow_table(page_to_pfn(page), type);
779 put_shadow_status(d);
780 }
781 }
784 void put_page_type(struct pfn_info *page)
785 {
786 u32 nx, x, y = page->u.inuse.type_info;
788 again:
789 do {
790 x = y;
791 nx = x - 1;
793 ASSERT((x & PGT_count_mask) != 0);
795 /*
796 * The page should always be validated while a reference is held. The
797 * exception is during domain destruction, when we forcibly invalidate
798 * page-table pages if we detect a referential loop.
799 * See domain.c:relinquish_list().
800 */
801 ASSERT((x & PGT_validated) ||
802 test_bit(DF_DYING, &page_get_owner(page)->d_flags));
804 if ( unlikely((nx & PGT_count_mask) == 0) )
805 {
806 /* Record TLB information for flush later. Races are harmless. */
807 page->tlbflush_timestamp = tlbflush_current_time();
809 if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
810 likely(nx & PGT_validated) )
811 {
812 /*
813 * Page-table pages must be unvalidated when count is zero. The
814 * 'free' is safe because the refcnt is non-zero and validated
815 * bit is clear => other ops will spin or fail.
816 */
817 if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x,
818 x & ~PGT_validated)) != x) )
819 goto again;
820 /* We cleared the 'valid bit' so we do the clean-up. */
821 free_page_type(page, x & PGT_type_mask);
822 /* Carry on, but with the 'valid bit' now clear. */
823 x &= ~PGT_validated;
824 nx &= ~PGT_validated;
825 }
826 }
827 else if ( unlikely((nx & (PGT_pinned | PGT_count_mask)) ==
828 (PGT_pinned | 1)) )
829 {
830 /* Page is now only pinned. Make the back pointer mutable again. */
831 nx |= PGT_va_mutable;
832 }
833 }
834 while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
835 }
838 int get_page_type(struct pfn_info *page, u32 type)
839 {
840 u32 nx, x, y = page->u.inuse.type_info;
842 again:
843 do {
844 x = y;
845 nx = x + 1;
846 if ( unlikely((nx & PGT_count_mask) == 0) )
847 {
848 MEM_LOG("Type count overflow on pfn %08lx\n", page_to_pfn(page));
849 return 0;
850 }
851 else if ( unlikely((x & PGT_count_mask) == 0) )
852 {
853 if ( (x & (PGT_type_mask|PGT_va_mask)) != type )
854 {
855 /*
856 * On type change we check to flush stale TLB entries. This
857 * may be unnecessary (e.g., page was GDT/LDT) but those
858 * circumstances should be very rare.
859 */
860 struct domain *d = page_get_owner(page);
861 if ( unlikely(NEED_FLUSH(tlbflush_time[d->exec_domain[0]->processor],
862 page->tlbflush_timestamp)) )
863 {
864 perfc_incr(need_flush_tlb_flush);
865 flush_tlb_cpu(d->exec_domain[0]->processor);
866 }
868 /* We lose existing type, back pointer, and validity. */
869 nx &= ~(PGT_type_mask | PGT_va_mask | PGT_validated);
870 nx |= type;
872 /* No special validation needed for writable pages. */
873 /* Page tables and GDT/LDT need to be scanned for validity. */
874 if ( type == PGT_writable_page )
875 nx |= PGT_validated;
876 }
877 }
878 else if ( unlikely((x & (PGT_type_mask|PGT_va_mask)) != type) )
879 {
880 if ( unlikely((x & PGT_type_mask) != (type & PGT_type_mask) ) )
881 {
882 if ( ((x & PGT_type_mask) != PGT_l2_page_table) ||
883 ((type & PGT_type_mask) != PGT_l1_page_table) )
884 MEM_LOG("Bad type (saw %08x != exp %08x) for pfn %08lx\n",
885 x & PGT_type_mask, type, page_to_pfn(page));
886 return 0;
887 }
888 else if ( (x & PGT_va_mask) == PGT_va_mutable )
889 {
890 /* The va backpointer is mutable, hence we update it. */
891 nx &= ~PGT_va_mask;
892 nx |= type; /* we know the actual type is correct */
893 }
894 else if ( unlikely((x & PGT_va_mask) != (type & PGT_va_mask)) )
895 {
896 /* This table is potentially mapped at multiple locations. */
897 nx &= ~PGT_va_mask;
898 nx |= PGT_va_unknown;
899 }
900 }
901 else if ( unlikely(!(x & PGT_validated)) )
902 {
903 /* Someone else is updating validation of this page. Wait... */
904 while ( (y = page->u.inuse.type_info) == x )
905 {
906 rep_nop();
907 barrier();
908 }
909 goto again;
910 }
911 }
912 while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
914 if ( unlikely(!(nx & PGT_validated)) )
915 {
916 /* Try to validate page type; drop the new reference on failure. */
917 if ( unlikely(!alloc_page_type(page, type & PGT_type_mask)) )
918 {
919 MEM_LOG("Error while validating pfn %08lx for type %08x."
920 " caf=%08x taf=%08x\n",
921 page_to_pfn(page), type,
922 page->count_info,
923 page->u.inuse.type_info);
924 /* No one else can get a reference. We hold the only ref. */
925 page->u.inuse.type_info = 0;
926 return 0;
927 }
929 /* No one else is updating simultaneously. */
930 __set_bit(_PGT_validated, &page->u.inuse.type_info);
931 }
933 return 1;
934 }
937 int new_guest_cr3(unsigned long pfn)
938 {
939 struct exec_domain *ed = current;
940 struct domain *d = ed->domain;
941 int okay, cpu = smp_processor_id();
942 unsigned long old_base_pfn;
944 okay = get_page_and_type_from_pagenr(pfn, PGT_l2_page_table, d);
945 if ( likely(okay) )
946 {
947 invalidate_shadow_ldt(ed);
949 percpu_info[cpu].deferred_ops &= ~DOP_FLUSH_TLB;
950 old_base_pfn = pagetable_val(ed->arch.pagetable) >> PAGE_SHIFT;
951 ed->arch.pagetable = mk_pagetable(pfn << PAGE_SHIFT);
953 shadow_mk_pagetable(ed);
955 write_ptbase(ed);
957 put_page_and_type(&frame_table[old_base_pfn]);
958 }
959 else
960 {
961 MEM_LOG("Error while installing new baseptr %08lx", pfn);
962 }
964 return okay;
965 }
967 static int do_extended_command(unsigned long ptr, unsigned long val)
968 {
969 int okay = 1, cpu = smp_processor_id();
970 unsigned int cmd = val & MMUEXT_CMD_MASK;
971 unsigned long pfn = ptr >> PAGE_SHIFT;
972 struct pfn_info *page = &frame_table[pfn];
973 struct exec_domain *ed = current;
974 struct domain *d = ed->domain, *nd, *e;
975 u32 x, y;
976 domid_t domid;
977 grant_ref_t gntref;
979 switch ( cmd )
980 {
981 case MMUEXT_PIN_L1_TABLE:
982 case MMUEXT_PIN_L2_TABLE:
983 /*
984 * We insist that, if you pin an L1 page, it's the first thing that
985 * you do to it. This is because we require the backptr to still be
986 * mutable. This assumption seems safe.
987 */
988 okay = get_page_and_type_from_pagenr(
989 pfn,
990 ((cmd==MMUEXT_PIN_L2_TABLE) ?
991 PGT_l2_page_table : (PGT_l1_page_table|PGT_va_mutable)),
992 FOREIGNDOM);
994 if ( unlikely(!okay) )
995 {
996 MEM_LOG("Error while pinning pfn %08lx", pfn);
997 break;
998 }
1000 if ( unlikely(test_and_set_bit(_PGT_pinned,
1001 &page->u.inuse.type_info)) )
1003 MEM_LOG("Pfn %08lx already pinned", pfn);
1004 put_page_and_type(page);
1005 okay = 0;
1006 break;
1009 break;
1011 case MMUEXT_UNPIN_TABLE:
1012 if ( unlikely(!(okay = get_page_from_pagenr(pfn, FOREIGNDOM))) )
1014 MEM_LOG("Page %08lx bad domain (dom=%p)",
1015 ptr, page_get_owner(page));
1017 else if ( likely(test_and_clear_bit(_PGT_pinned,
1018 &page->u.inuse.type_info)) )
1020 put_page_and_type(page);
1021 put_page(page);
1023 else
1025 okay = 0;
1026 put_page(page);
1027 MEM_LOG("Pfn %08lx not pinned", pfn);
1029 break;
1031 case MMUEXT_NEW_BASEPTR:
1032 okay = new_guest_cr3(pfn);
1033 break;
1035 case MMUEXT_TLB_FLUSH:
1036 percpu_info[cpu].deferred_ops |= DOP_FLUSH_TLB;
1037 break;
1039 case MMUEXT_INVLPG:
1040 __flush_tlb_one(ptr);
1041 break;
1043 case MMUEXT_FLUSH_CACHE:
1044 if ( unlikely(!IS_CAPABLE_PHYSDEV(d)) )
1046 MEM_LOG("Non-physdev domain tried to FLUSH_CACHE.\n");
1047 okay = 0;
1049 else
1051 wbinvd();
1053 break;
1055 case MMUEXT_SET_LDT:
1057 unsigned long ents = val >> MMUEXT_CMD_SHIFT;
1058 if ( ((ptr & (PAGE_SIZE-1)) != 0) ||
1059 (ents > 8192) ||
1060 ((ptr+ents*LDT_ENTRY_SIZE) < ptr) ||
1061 ((ptr+ents*LDT_ENTRY_SIZE) > PAGE_OFFSET) )
1063 okay = 0;
1064 MEM_LOG("Bad args to SET_LDT: ptr=%08lx, ents=%08lx", ptr, ents);
1066 else if ( (ed->arch.ldt_ents != ents) ||
1067 (ed->arch.ldt_base != ptr) )
1069 invalidate_shadow_ldt(ed);
1070 ed->arch.ldt_base = ptr;
1071 ed->arch.ldt_ents = ents;
1072 load_LDT(ed);
1073 percpu_info[cpu].deferred_ops &= ~DOP_RELOAD_LDT;
1074 if ( ents != 0 )
1075 percpu_info[cpu].deferred_ops |= DOP_RELOAD_LDT;
1077 break;
1080 case MMUEXT_SET_FOREIGNDOM:
1081 domid = (domid_t)(val >> 16);
1083 if ( (e = percpu_info[cpu].foreign) != NULL )
1084 put_domain(e);
1085 percpu_info[cpu].foreign = NULL;
1087 if ( !IS_PRIV(d) )
1089 switch ( domid )
1091 case DOMID_IO:
1092 get_knownalive_domain(dom_io);
1093 percpu_info[cpu].foreign = dom_io;
1094 break;
1095 default:
1096 MEM_LOG("Dom %u cannot set foreign dom\n", d->id);
1097 okay = 0;
1098 break;
1101 else
1103 percpu_info[cpu].foreign = e = find_domain_by_id(domid);
1104 if ( e == NULL )
1106 switch ( domid )
1108 case DOMID_XEN:
1109 get_knownalive_domain(dom_xen);
1110 percpu_info[cpu].foreign = dom_xen;
1111 break;
1112 case DOMID_IO:
1113 get_knownalive_domain(dom_io);
1114 percpu_info[cpu].foreign = dom_io;
1115 break;
1116 default:
1117 MEM_LOG("Unknown domain '%u'", domid);
1118 okay = 0;
1119 break;
1123 break;
1125 case MMUEXT_TRANSFER_PAGE:
1126 domid = (domid_t)(val >> 16);
1127 gntref = (grant_ref_t)((val & 0xFF00) | ((ptr >> 2) & 0x00FF));
1129 if ( unlikely(IS_XEN_HEAP_FRAME(page)) ||
1130 unlikely(!pfn_is_ram(pfn)) ||
1131 unlikely((e = find_domain_by_id(domid)) == NULL) )
1133 MEM_LOG("Bad frame (%08lx) or bad domid (%d).\n", pfn, domid);
1134 okay = 0;
1135 break;
1138 spin_lock(&d->page_alloc_lock);
1140 /*
1141 * The tricky bit: atomically release ownership while there is just one
1142 * benign reference to the page (PGC_allocated). If that reference
1143 * disappears then the deallocation routine will safely spin.
1144 */
1145 nd = page_get_owner(page);
1146 y = page->count_info;
1147 do {
1148 x = y;
1149 if ( unlikely((x & (PGC_count_mask|PGC_allocated)) !=
1150 (1|PGC_allocated)) ||
1151 unlikely(nd != d) )
1153 MEM_LOG("Bad page values %08lx: ed=%p(%u), sd=%p,"
1154 " caf=%08x, taf=%08x\n", page_to_pfn(page),
1155 d, d->id, nd, x, page->u.inuse.type_info);
1156 spin_unlock(&d->page_alloc_lock);
1157 put_domain(e);
1158 return 0;
1160 __asm__ __volatile__(
1161 LOCK_PREFIX "cmpxchg8b %2"
1162 : "=d" (nd), "=a" (y),
1163 "=m" (*(volatile u64 *)(&page->count_info))
1164 : "0" (d), "1" (x), "c" (NULL), "b" (x) );
1166 while ( unlikely(nd != d) || unlikely(y != x) );
1168 /*
1169 * Unlink from 'd'. At least one reference remains (now anonymous), so
1170 * no one else is spinning to try to delete this page from 'd'.
1171 */
1172 d->tot_pages--;
1173 list_del(&page->list);
1175 spin_unlock(&d->page_alloc_lock);
1177 spin_lock(&e->page_alloc_lock);
1179 /*
1180 * Check that 'e' will accept the page and has reservation headroom.
1181 * Also, a domain mustn't have PGC_allocated pages when it is dying.
1182 */
1183 ASSERT(e->tot_pages <= e->max_pages);
1184 if ( unlikely(test_bit(DF_DYING, &e->d_flags)) ||
1185 unlikely(e->tot_pages == e->max_pages) ||
1186 unlikely(!gnttab_prepare_for_transfer(e, d, gntref)) )
1188 MEM_LOG("Transferee has no reservation headroom (%d,%d), or "
1189 "provided a bad grant ref, or is dying (%08lx).\n",
1190 e->tot_pages, e->max_pages, e->d_flags);
1191 spin_unlock(&e->page_alloc_lock);
1192 put_domain(e);
1193 okay = 0;
1194 break;
1197 /* Okay, add the page to 'e'. */
1198 if ( unlikely(e->tot_pages++ == 0) )
1199 get_knownalive_domain(e);
1200 list_add_tail(&page->list, &e->page_list);
1201 page_set_owner(page, e);
1203 spin_unlock(&e->page_alloc_lock);
1205 /* Transfer is all done: tell the guest about its new page frame. */
1206 gnttab_notify_transfer(e, gntref, pfn);
1208 put_domain(e);
1209 break;
1211 case MMUEXT_REASSIGN_PAGE:
1212 if ( unlikely(!IS_PRIV(d)) )
1214 MEM_LOG("Dom %u has no reassignment priv", d->id);
1215 okay = 0;
1216 break;
1219 e = percpu_info[cpu].foreign;
1220 if ( unlikely(e == NULL) )
1222 MEM_LOG("No FOREIGNDOM to reassign pfn %08lx to", pfn);
1223 okay = 0;
1224 break;
1227 /*
1228 * Grab both page_list locks, in order. This prevents the page from
1229 * disappearing elsewhere while we modify the owner, and we'll need
1230 * both locks if we're successful so that we can change lists.
1231 */
1232 if ( d < e )
1234 spin_lock(&d->page_alloc_lock);
1235 spin_lock(&e->page_alloc_lock);
1237 else
1239 spin_lock(&e->page_alloc_lock);
1240 spin_lock(&d->page_alloc_lock);
1243 /* A domain shouldn't have PGC_allocated pages when it is dying. */
1244 if ( unlikely(test_bit(DF_DYING, &e->d_flags)) ||
1245 unlikely(IS_XEN_HEAP_FRAME(page)) )
1247 MEM_LOG("Reassignment page is Xen heap, or dest dom is dying.");
1248 okay = 0;
1249 goto reassign_fail;
1252 /*
1253 * The tricky bit: atomically change owner while there is just one
1254 * benign reference to the page (PGC_allocated). If that reference
1255 * disappears then the deallocation routine will safely spin.
1256 */
1257 nd = page_get_owner(page);
1258 y = page->count_info;
1259 do {
1260 x = y;
1261 if ( unlikely((x & (PGC_count_mask|PGC_allocated)) !=
1262 (1|PGC_allocated)) ||
1263 unlikely(nd != d) )
1265 MEM_LOG("Bad page values %08lx: ed=%p(%u), sd=%p,"
1266 " caf=%08x, taf=%08x\n", page_to_pfn(page),
1267 d, d->id, nd, x, page->u.inuse.type_info);
1268 okay = 0;
1269 goto reassign_fail;
1271 __asm__ __volatile__(
1272 LOCK_PREFIX "cmpxchg8b %3"
1273 : "=d" (nd), "=a" (y), "=c" (e),
1274 "=m" (*(volatile u64 *)(&page->count_info))
1275 : "0" (d), "1" (x), "c" (e), "b" (x) );
1277 while ( unlikely(nd != d) || unlikely(y != x) );
1279 /*
1280 * Unlink from 'd'. We transferred at least one reference to 'e', so
1281 * no one else is spinning to try to delete this page from 'd'.
1282 */
1283 d->tot_pages--;
1284 list_del(&page->list);
1286 /*
1287 * Add the page to 'e'. Someone may already have removed the last
1288 * reference and want to remove the page from 'e'. However, we have
1289 * the lock so they'll spin waiting for us.
1290 */
1291 if ( unlikely(e->tot_pages++ == 0) )
1292 get_knownalive_domain(e);
1293 list_add_tail(&page->list, &e->page_list);
1295 reassign_fail:
1296 spin_unlock(&d->page_alloc_lock);
1297 spin_unlock(&e->page_alloc_lock);
1298 break;
1300 case MMUEXT_CLEAR_FOREIGNDOM:
1301 if ( (e = percpu_info[cpu].foreign) != NULL )
1302 put_domain(e);
1303 percpu_info[cpu].foreign = NULL;
1304 break;
1306 default:
1307 MEM_LOG("Invalid extended pt command 0x%08lx", val & MMUEXT_CMD_MASK);
1308 okay = 0;
1309 break;
1312 return okay;
1315 int do_mmu_update(
1316 mmu_update_t *ureqs, unsigned int count, unsigned int *pdone)
1318 /*
1319 * We steal the m.s.b. of the @count parameter to indicate whether this
1320 * invocation of do_mmu_update() is resuming a previously preempted call.
1321 * We steal the next 15 bits to remember the current FOREIGNDOM.
1322 */
1323 #define MMU_UPDATE_PREEMPTED (~(~0U>>1))
1324 #define MMU_UPDATE_PREEMPT_FDOM_SHIFT ((sizeof(int)*8)-16)
1325 #define MMU_UPDATE_PREEMPT_FDOM_MASK (0x7FFFU<<MMU_UPDATE_PREEMPT_FDOM_SHIFT)
1327 mmu_update_t req;
1328 unsigned long va = 0, deferred_ops, pfn, prev_pfn = 0;
1329 struct pfn_info *page;
1330 int rc = 0, okay = 1, i = 0, cpu = smp_processor_id();
1331 unsigned int cmd, done = 0;
1332 unsigned long prev_smfn = 0;
1333 l1_pgentry_t *prev_spl1e = 0;
1334 struct exec_domain *ed = current;
1335 struct domain *d = ed->domain;
1336 u32 type_info;
1337 domid_t domid;
1339 LOCK_BIGLOCK(d);
1341 cleanup_writable_pagetable(d);
1343 if ( unlikely(shadow_mode(d)) )
1344 check_pagetable(d, ed->arch.pagetable, "pre-mmu"); /* debug */
1346 /*
1347 * If we are resuming after preemption, read how much work we have already
1348 * done. This allows us to set the @done output parameter correctly.
1349 * We also reset FOREIGNDOM here.
1350 */
1351 if ( unlikely(count&(MMU_UPDATE_PREEMPTED|MMU_UPDATE_PREEMPT_FDOM_MASK)) )
1353 if ( !(count & MMU_UPDATE_PREEMPTED) )
1355 /* Count overflow into private FOREIGNDOM field. */
1356 MEM_LOG("do_mmu_update count is too large");
1357 rc = -EINVAL;
1358 goto out;
1360 count &= ~MMU_UPDATE_PREEMPTED;
1361 domid = count >> MMU_UPDATE_PREEMPT_FDOM_SHIFT;
1362 count &= ~MMU_UPDATE_PREEMPT_FDOM_MASK;
1363 if ( unlikely(pdone != NULL) )
1364 (void)get_user(done, pdone);
1365 if ( (domid != current->domain->id) &&
1366 !do_extended_command(0, MMUEXT_SET_FOREIGNDOM | (domid << 16)) )
1368 rc = -EINVAL;
1369 goto out;
1373 perfc_incrc(calls_to_mmu_update);
1374 perfc_addc(num_page_updates, count);
1376 if ( unlikely(!array_access_ok(VERIFY_READ, ureqs, count, sizeof(req))) )
1378 rc = -EFAULT;
1379 goto out;
1382 for ( i = 0; i < count; i++ )
1384 if ( hypercall_preempt_check() )
1386 rc = hypercall3_create_continuation(
1387 __HYPERVISOR_mmu_update, ureqs,
1388 (count - i) |
1389 (FOREIGNDOM->id << MMU_UPDATE_PREEMPT_FDOM_SHIFT) |
1390 MMU_UPDATE_PREEMPTED, pdone);
1391 break;
1394 if ( unlikely(__copy_from_user(&req, ureqs, sizeof(req)) != 0) )
1396 MEM_LOG("Bad __copy_from_user");
1397 rc = -EFAULT;
1398 break;
1401 cmd = req.ptr & (sizeof(l1_pgentry_t)-1);
1402 pfn = req.ptr >> PAGE_SHIFT;
1404 okay = 0;
1406 switch ( cmd )
1408 /*
1409 * MMU_NORMAL_PT_UPDATE: Normal update to any level of page table.
1410 */
1411 case MMU_NORMAL_PT_UPDATE:
1412 if ( unlikely(!get_page_from_pagenr(pfn, current->domain)) )
1414 MEM_LOG("Could not get page for normal update");
1415 break;
1418 if ( likely(prev_pfn == pfn) )
1420 va = (va & PAGE_MASK) | (req.ptr & ~PAGE_MASK);
1422 else
1424 if ( prev_pfn != 0 )
1425 unmap_domain_mem((void *)va);
1426 va = (unsigned long)map_domain_mem(req.ptr);
1427 prev_pfn = pfn;
1430 page = &frame_table[pfn];
1431 switch ( (type_info = page->u.inuse.type_info) & PGT_type_mask )
1433 case PGT_l1_page_table:
1434 if ( likely(get_page_type(
1435 page, type_info & (PGT_type_mask|PGT_va_mask))) )
1437 okay = mod_l1_entry((l1_pgentry_t *)va,
1438 mk_l1_pgentry(req.val));
1440 if ( unlikely(shadow_mode(d)) && okay &&
1441 (get_shadow_status(d, page-frame_table) &
1442 PSH_shadowed) )
1444 shadow_l1_normal_pt_update(
1445 req.ptr, req.val, &prev_smfn, &prev_spl1e);
1446 put_shadow_status(d);
1449 put_page_type(page);
1451 break;
1452 case PGT_l2_page_table:
1453 if ( likely(get_page_type(page, PGT_l2_page_table)) )
1455 okay = mod_l2_entry((l2_pgentry_t *)va,
1456 mk_l2_pgentry(req.val),
1457 pfn);
1459 if ( unlikely(shadow_mode(d)) && okay &&
1460 (get_shadow_status(d, page-frame_table) &
1461 PSH_shadowed) )
1463 shadow_l2_normal_pt_update(req.ptr, req.val);
1464 put_shadow_status(d);
1467 put_page_type(page);
1469 break;
1470 default:
1471 if ( likely(get_page_type(page, PGT_writable_page)) )
1473 *(unsigned long *)va = req.val;
1474 okay = 1;
1475 put_page_type(page);
1477 break;
1480 put_page(page);
1481 break;
1483 case MMU_MACHPHYS_UPDATE:
1484 if ( unlikely(!get_page_from_pagenr(pfn, FOREIGNDOM)) )
1486 MEM_LOG("Could not get page for mach->phys update");
1487 break;
1490 machine_to_phys_mapping[pfn] = req.val;
1491 okay = 1;
1493 /*
1494 * If in log-dirty mode, mark the corresponding pseudo-physical
1495 * page as dirty.
1496 */
1497 if ( unlikely(shadow_mode(d) == SHM_logdirty) &&
1498 mark_dirty(d, pfn) )
1499 d->arch.shadow_dirty_block_count++;
1501 put_page(&frame_table[pfn]);
1502 break;
1504 /*
1505 * MMU_EXTENDED_COMMAND: Extended command is specified
1506 * in the least-significant bits of the 'value' field.
1507 */
1508 case MMU_EXTENDED_COMMAND:
1509 req.ptr &= ~(sizeof(l1_pgentry_t) - 1);
1510 okay = do_extended_command(req.ptr, req.val);
1511 break;
1513 default:
1514 MEM_LOG("Invalid page update command %08lx", req.ptr);
1515 break;
1518 if ( unlikely(!okay) )
1520 rc = -EINVAL;
1521 break;
1524 ureqs++;
1527 out:
1528 if ( prev_pfn != 0 )
1529 unmap_domain_mem((void *)va);
1531 if ( unlikely(prev_spl1e != 0) )
1532 unmap_domain_mem((void *)prev_spl1e);
1534 deferred_ops = percpu_info[cpu].deferred_ops;
1535 percpu_info[cpu].deferred_ops = 0;
1537 if ( deferred_ops & DOP_FLUSH_TLB )
1538 local_flush_tlb();
1540 if ( deferred_ops & DOP_RELOAD_LDT )
1541 (void)map_ldt_shadow_page(0);
1543 if ( unlikely(percpu_info[cpu].foreign != NULL) )
1545 put_domain(percpu_info[cpu].foreign);
1546 percpu_info[cpu].foreign = NULL;
1549 /* Add incremental work we have done to the @done output parameter. */
1550 if ( unlikely(pdone != NULL) )
1551 __put_user(done + i, pdone);
1553 if ( unlikely(shadow_mode(d)) )
1554 check_pagetable(d, ed->arch.pagetable, "post-mmu"); /* debug */
1556 UNLOCK_BIGLOCK(d);
1557 return rc;
1561 int do_update_va_mapping(unsigned long va,
1562 unsigned long val,
1563 unsigned long flags)
1565 struct exec_domain *ed = current;
1566 struct domain *d = ed->domain;
1567 int err = 0;
1568 unsigned int cpu = ed->processor;
1569 unsigned long deferred_ops;
1571 perfc_incrc(calls_to_update_va);
1573 if ( unlikely(!__addr_ok(va)) )
1574 return -EINVAL;
1576 LOCK_BIGLOCK(d);
1578 cleanup_writable_pagetable(d);
1580 /*
1581 * XXX When we make this support 4MB superpages we should also deal with
1582 * the case of updating L2 entries.
1583 */
1585 if ( unlikely(!mod_l1_entry(&linear_pg_table[l1_linear_offset(va)],
1586 mk_l1_pgentry(val))) )
1587 err = -EINVAL;
1589 if ( unlikely(shadow_mode(d)) )
1591 unsigned long sval = 0;
1593 l1pte_propagate_from_guest(d, &val, &sval);
1595 if ( unlikely(__put_user(sval, ((unsigned long *)(
1596 &shadow_linear_pg_table[l1_linear_offset(va)])))) )
1598 /*
1599 * Since L2's are guaranteed RW, failure indicates the page was not
1600 * shadowed, so ignore.
1601 */
1602 perfc_incrc(shadow_update_va_fail);
1605 /*
1606 * If we're in log-dirty mode then we need to note that we've updated
1607 * the PTE in the PT-holding page. We need the machine frame number
1608 * for this.
1609 */
1610 if ( shadow_mode(d) == SHM_logdirty )
1611 mark_dirty(d, va_to_l1mfn(va));
1613 check_pagetable(d, ed->arch.pagetable, "va"); /* debug */
1616 deferred_ops = percpu_info[cpu].deferred_ops;
1617 percpu_info[cpu].deferred_ops = 0;
1619 if ( unlikely(deferred_ops & DOP_FLUSH_TLB) ||
1620 unlikely(flags & UVMF_FLUSH_TLB) )
1621 local_flush_tlb();
1622 else if ( unlikely(flags & UVMF_INVLPG) )
1623 __flush_tlb_one(va);
1625 if ( unlikely(deferred_ops & DOP_RELOAD_LDT) )
1626 (void)map_ldt_shadow_page(0);
1628 UNLOCK_BIGLOCK(d);
1630 return err;
1633 int do_update_va_mapping_otherdomain(unsigned long va,
1634 unsigned long val,
1635 unsigned long flags,
1636 domid_t domid)
1638 unsigned int cpu = smp_processor_id();
1639 struct domain *d;
1640 int rc;
1642 if ( unlikely(!IS_PRIV(current->domain)) )
1643 return -EPERM;
1645 percpu_info[cpu].foreign = d = find_domain_by_id(domid);
1646 if ( unlikely(d == NULL) )
1648 MEM_LOG("Unknown domain '%u'", domid);
1649 return -ESRCH;
1652 rc = do_update_va_mapping(va, val, flags);
1654 put_domain(d);
1655 percpu_info[cpu].foreign = NULL;
1657 return rc;
1662 /*************************
1663 * Descriptor Tables
1664 */
1666 void destroy_gdt(struct exec_domain *ed)
1668 int i;
1669 unsigned long pfn;
1671 for ( i = 0; i < 16; i++ )
1673 if ( (pfn = l1_pgentry_to_pagenr(ed->arch.perdomain_ptes[i])) != 0 )
1674 put_page_and_type(&frame_table[pfn]);
1675 ed->arch.perdomain_ptes[i] = mk_l1_pgentry(0);
1680 long set_gdt(struct exec_domain *ed,
1681 unsigned long *frames,
1682 unsigned int entries)
1684 struct domain *d = ed->domain;
1685 /* NB. There are 512 8-byte entries per GDT page. */
1686 int i = 0, nr_pages = (entries + 511) / 512;
1687 struct desc_struct *vgdt;
1688 unsigned long pfn;
1690 /* Check the first page in the new GDT. */
1691 if ( (pfn = frames[0]) >= max_page )
1692 goto fail;
1694 /* The first page is special because Xen owns a range of entries in it. */
1695 if ( !get_page_and_type(&frame_table[pfn], d, PGT_gdt_page) )
1697 /* GDT checks failed: try zapping the Xen reserved entries. */
1698 if ( !get_page_and_type(&frame_table[pfn], d, PGT_writable_page) )
1699 goto fail;
1700 vgdt = map_domain_mem(pfn << PAGE_SHIFT);
1701 memset(vgdt + FIRST_RESERVED_GDT_ENTRY, 0,
1702 NR_RESERVED_GDT_ENTRIES*8);
1703 unmap_domain_mem(vgdt);
1704 put_page_and_type(&frame_table[pfn]);
1706 /* Okay, we zapped the entries. Now try the GDT checks again. */
1707 if ( !get_page_and_type(&frame_table[pfn], d, PGT_gdt_page) )
1708 goto fail;
1711 /* Check the remaining pages in the new GDT. */
1712 for ( i = 1; i < nr_pages; i++ )
1713 if ( ((pfn = frames[i]) >= max_page) ||
1714 !get_page_and_type(&frame_table[pfn], d, PGT_gdt_page) )
1715 goto fail;
1717 /* Copy reserved GDT entries to the new GDT. */
1718 vgdt = map_domain_mem(frames[0] << PAGE_SHIFT);
1719 memcpy(vgdt + FIRST_RESERVED_GDT_ENTRY,
1720 gdt_table + FIRST_RESERVED_GDT_ENTRY,
1721 NR_RESERVED_GDT_ENTRIES*8);
1722 unmap_domain_mem(vgdt);
1724 /* Tear down the old GDT. */
1725 destroy_gdt(ed);
1727 /* Install the new GDT. */
1728 for ( i = 0; i < nr_pages; i++ )
1729 ed->arch.perdomain_ptes[i] =
1730 mk_l1_pgentry((frames[i] << PAGE_SHIFT) | __PAGE_HYPERVISOR);
1732 SET_GDT_ADDRESS(ed, GDT_VIRT_START(ed));
1733 SET_GDT_ENTRIES(ed, entries);
1735 return 0;
1737 fail:
1738 while ( i-- > 0 )
1739 put_page_and_type(&frame_table[frames[i]]);
1740 return -EINVAL;
1744 long do_set_gdt(unsigned long *frame_list, unsigned int entries)
1746 int nr_pages = (entries + 511) / 512;
1747 unsigned long frames[16];
1748 long ret;
1750 if ( (entries <= LAST_RESERVED_GDT_ENTRY) || (entries > 8192) )
1751 return -EINVAL;
1753 if ( copy_from_user(frames, frame_list, nr_pages * sizeof(unsigned long)) )
1754 return -EFAULT;
1756 LOCK_BIGLOCK(current->domain);
1758 if ( (ret = set_gdt(current, frames, entries)) == 0 )
1760 local_flush_tlb();
1761 __asm__ __volatile__ ("lgdt %0" : "=m" (*current->arch.gdt));
1764 UNLOCK_BIGLOCK(current->domain);
1766 return ret;
1770 long do_update_descriptor(
1771 unsigned long pa, unsigned long word1, unsigned long word2)
1773 unsigned long pfn = pa >> PAGE_SHIFT;
1774 struct desc_struct *gdt_pent, d;
1775 struct pfn_info *page;
1776 struct exec_domain *ed;
1777 long ret = -EINVAL;
1779 d.a = (u32)word1;
1780 d.b = (u32)word2;
1782 LOCK_BIGLOCK(current->domain);
1784 if ( (pa & 7) || (pfn >= max_page) || !check_descriptor(&d) ) {
1785 UNLOCK_BIGLOCK(current->domain);
1786 return -EINVAL;
1789 page = &frame_table[pfn];
1790 if ( unlikely(!get_page(page, current->domain)) ) {
1791 UNLOCK_BIGLOCK(current->domain);
1792 return -EINVAL;
1795 /* Check if the given frame is in use in an unsafe context. */
1796 switch ( page->u.inuse.type_info & PGT_type_mask )
1798 case PGT_gdt_page:
1799 /* Disallow updates of Xen-reserved descriptors in the current GDT. */
1800 for_each_exec_domain(current->domain, ed) {
1801 if ( (l1_pgentry_to_pagenr(ed->arch.perdomain_ptes[0]) == pfn) &&
1802 (((pa&(PAGE_SIZE-1))>>3) >= FIRST_RESERVED_GDT_ENTRY) &&
1803 (((pa&(PAGE_SIZE-1))>>3) <= LAST_RESERVED_GDT_ENTRY) )
1804 goto out;
1806 if ( unlikely(!get_page_type(page, PGT_gdt_page)) )
1807 goto out;
1808 break;
1809 case PGT_ldt_page:
1810 if ( unlikely(!get_page_type(page, PGT_ldt_page)) )
1811 goto out;
1812 break;
1813 default:
1814 if ( unlikely(!get_page_type(page, PGT_writable_page)) )
1815 goto out;
1816 break;
1819 /* All is good so make the update. */
1820 gdt_pent = map_domain_mem(pa);
1821 memcpy(gdt_pent, &d, 8);
1822 unmap_domain_mem(gdt_pent);
1824 put_page_type(page);
1826 ret = 0; /* success */
1828 out:
1829 put_page(page);
1831 UNLOCK_BIGLOCK(current->domain);
1833 return ret;
1838 /*************************
1839 * Writable Pagetables
1840 */
1842 ptwr_info_t ptwr_info[NR_CPUS];
1844 #ifdef VERBOSE
1845 int ptwr_debug = 0x0;
1846 #define PTWR_PRINTK(_f, _a...) \
1847 do { if ( unlikely(ptwr_debug) ) printk( _f , ## _a ); } while ( 0 )
1848 #define PTWR_PRINT_WHICH (which ? 'I' : 'A')
1849 #else
1850 #define PTWR_PRINTK(_f, _a...) ((void)0)
1851 #endif
1853 /* Flush the given writable p.t. page and write-protect it again. */
1854 void ptwr_flush(const int which)
1856 unsigned long sstat, spte, pte, *ptep, l1va;
1857 l1_pgentry_t *sl1e = NULL, *pl1e, ol1e, nl1e;
1858 l2_pgentry_t *pl2e;
1859 int i, cpu = smp_processor_id();
1860 struct exec_domain *ed = current;
1861 struct domain *d = ed->domain;
1863 l1va = ptwr_info[cpu].ptinfo[which].l1va;
1864 ptep = (unsigned long *)&linear_pg_table[l1va>>PAGE_SHIFT];
1866 /*
1867 * STEP 1. Write-protect the p.t. page so no more updates can occur.
1868 */
1870 if ( unlikely(__get_user(pte, ptep)) )
1872 MEM_LOG("ptwr: Could not read pte at %p\n", ptep);
1873 /*
1874 * Really a bug. We could read this PTE during the initial fault,
1875 * and pagetables can't have changed in the meantime. XXX Multi-CPU guests?
1876 */
1877 BUG();
1879 PTWR_PRINTK("[%c] disconnected_l1va at %p is %08lx\n",
1880 PTWR_PRINT_WHICH, ptep, pte);
1881 pte &= ~_PAGE_RW;
1883 if ( unlikely(shadow_mode(d)) )
1885 /* Write-protect the p.t. page in the shadow page table. */
1886 l1pte_propagate_from_guest(d, &pte, &spte);
1887 __put_user(
1888 spte, (unsigned long *)&shadow_linear_pg_table[l1va>>PAGE_SHIFT]);
1890 /* Is the p.t. page itself shadowed? Map it into Xen space if so. */
1891 sstat = get_shadow_status(d, pte >> PAGE_SHIFT);
1892 if ( sstat & PSH_shadowed )
1893 sl1e = map_domain_mem((sstat & PSH_pfn_mask) << PAGE_SHIFT);
1896 /* Write-protect the p.t. page in the guest page table. */
1897 if ( unlikely(__put_user(pte, ptep)) )
1899 MEM_LOG("ptwr: Could not update pte at %p\n", ptep);
1900 /*
1901 * Really a bug. We could write this PTE during the initial fault,
1902 * and pagetables can't have changed in the meantime. XXX Multi-CPU guests?
1903 */
1904 BUG();
1907 /* Ensure that there are no stale writable mappings in any TLB. */
1908 /* NB. INVLPG is a serialising instruction: flushes pending updates. */
1909 #if 1
1910 __flush_tlb_one(l1va); /* XXX Multi-CPU guests? */
1911 #else
1912 flush_tlb_all();
1913 #endif
1914 PTWR_PRINTK("[%c] disconnected_l1va at %p now %08lx\n",
1915 PTWR_PRINT_WHICH, ptep, pte);
1917 /*
1918 * STEP 2. Validate any modified PTEs.
1919 */
1921 pl1e = ptwr_info[cpu].ptinfo[which].pl1e;
1922 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
1924 ol1e = ptwr_info[cpu].ptinfo[which].page[i];
1925 nl1e = pl1e[i];
1927 if ( likely(l1_pgentry_val(ol1e) == l1_pgentry_val(nl1e)) )
1928 continue;
1930 /*
1931 * Fast path for PTEs that have merely been write-protected
1932 * (e.g., during a Unix fork()). A strict reduction in privilege.
1933 */
1934 if ( likely(l1_pgentry_val(ol1e) == (l1_pgentry_val(nl1e)|_PAGE_RW)) )
1936 if ( likely(l1_pgentry_val(nl1e) & _PAGE_PRESENT) )
1938 if ( unlikely(sl1e != NULL) )
1939 l1pte_propagate_from_guest(
1940 d, &l1_pgentry_val(nl1e),
1941 &l1_pgentry_val(sl1e[i]));
1942 put_page_type(&frame_table[l1_pgentry_to_pagenr(nl1e)]);
1944 continue;
1947 if ( unlikely(!get_page_from_l1e(nl1e, d)) )
1949 MEM_LOG("ptwr: Could not re-validate l1 page\n");
1950 /*
1951 * Make the remaining p.t's consistent before crashing, so the
1952 * reference counts are correct.
1953 */
1954 memcpy(&pl1e[i], &ptwr_info[cpu].ptinfo[which].page[i],
1955 (ENTRIES_PER_L1_PAGETABLE - i) * sizeof(l1_pgentry_t));
1956 unmap_domain_mem(pl1e);
1957 ptwr_info[cpu].ptinfo[which].l1va = 0;
1958 UNLOCK_BIGLOCK(d);
1959 domain_crash();
1962 if ( unlikely(sl1e != NULL) )
1963 l1pte_propagate_from_guest(
1964 d, &l1_pgentry_val(nl1e), &l1_pgentry_val(sl1e[i]));
1966 if ( unlikely(l1_pgentry_val(ol1e) & _PAGE_PRESENT) )
1967 put_page_from_l1e(ol1e, d);
1969 unmap_domain_mem(pl1e);
1971 /*
1972 * STEP 3. Reattach the L1 p.t. page into the current address space.
1973 */
1975 if ( (which == PTWR_PT_ACTIVE) && likely(!shadow_mode(d)) )
1977 pl2e = &linear_l2_table[ptwr_info[cpu].ptinfo[which].l2_idx];
1978 *pl2e = mk_l2_pgentry(l2_pgentry_val(*pl2e) | _PAGE_PRESENT);
1981 /*
1982 * STEP 4. Final tidy-up.
1983 */
1985 ptwr_info[cpu].ptinfo[which].l1va = 0;
1987 if ( unlikely(sl1e != NULL) )
1989 unmap_domain_mem(sl1e);
1990 put_shadow_status(d);
1994 /* Write page fault handler: check if guest is trying to modify a PTE. */
1995 int ptwr_do_page_fault(unsigned long addr)
1997 unsigned long pte, pfn, l2e;
1998 struct pfn_info *page;
1999 l2_pgentry_t *pl2e;
2000 int which, cpu = smp_processor_id();
2001 u32 l2_idx;
2003 #ifdef __x86_64__
2004 return 0; /* Writable pagetables need fixing for x86_64. */
2005 #endif
2007 /*
2008 * Attempt to read the PTE that maps the VA being accessed. By checking for
2009 * PDE validity in the L2 we avoid many expensive fixups in __get_user().
2010 */
2011 if ( !(l2_pgentry_val(linear_l2_table[addr>>L2_PAGETABLE_SHIFT]) &
2012 _PAGE_PRESENT) ||
2013 __get_user(pte, (unsigned long *)&linear_pg_table[addr>>PAGE_SHIFT]) )
2015 return 0;
2018 pfn = pte >> PAGE_SHIFT;
2019 page = &frame_table[pfn];
2021 /* We are looking only for read-only mappings of p.t. pages. */
2022 if ( ((pte & (_PAGE_RW | _PAGE_PRESENT)) != _PAGE_PRESENT) ||
2023 ((page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table) )
2025 return 0;
2028 /* Get the L2 index at which this L1 p.t. is always mapped. */
2029 l2_idx = page->u.inuse.type_info & PGT_va_mask;
2030 if ( unlikely(l2_idx >= PGT_va_unknown) )
2032 domain_crash(); /* Urk! This L1 is mapped in multiple L2 slots! */
2034 l2_idx >>= PGT_va_shift;
2036 if ( l2_idx == (addr >> L2_PAGETABLE_SHIFT) )
2038 MEM_LOG("PTWR failure! Pagetable maps itself at %08lx\n", addr);
2039 domain_crash();
2042 /*
2043 * Is the L1 p.t. mapped into the current address space? If so we call it
2044 * an ACTIVE p.t., otherwise it is INACTIVE.
2045 */
2046 pl2e = &linear_l2_table[l2_idx];
2047 l2e = l2_pgentry_val(*pl2e);
2048 which = PTWR_PT_INACTIVE;
2049 if ( (l2e >> PAGE_SHIFT) == pfn )
2051 /* Check the PRESENT bit to set ACTIVE. */
2052 if ( likely(l2e & _PAGE_PRESENT) )
2053 which = PTWR_PT_ACTIVE;
2054 else {
2055 /*
2056 * If the PRESENT bit is clear, we may be conflicting with
2057 * the current ACTIVE p.t. (it may be the same p.t. mapped
2058 * at another virt addr).
2059 * The ptwr_flush call below will restore the PRESENT bit.
2060 */
2061 if ( ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va &&
2062 l2_idx == ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l2_idx )
2063 which = PTWR_PT_ACTIVE;
2067 PTWR_PRINTK("[%c] page_fault on l1 pt at va %08lx, pt for %08x, "
2068 "pfn %08lx\n", PTWR_PRINT_WHICH,
2069 addr, l2_idx << L2_PAGETABLE_SHIFT, pfn);
2071 /*
2072 * We only allow one ACTIVE and one INACTIVE p.t. to be updated at a
2073 * time. If there is already one, we must flush it out.
2074 */
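/*
 * The per-CPU bookkeeping relied on here, as used throughout this file:
 *
 *   ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE]    at most one open ACTIVE L1
 *   ptwr_info[cpu].ptinfo[PTWR_PT_INACTIVE]  at most one open INACTIVE L1
 *
 *   .l1va != 0   the slot is in use and must be flushed before reuse;
 *   .l2_idx      where an ACTIVE p.t. gets reattached on flush;
 *   .pl1e/.page  the live mapping and the pristine snapshot used to
 *                revalidate the guest's writes when the slot is flushed.
 */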
2075 if ( ptwr_info[cpu].ptinfo[which].l1va )
2076 ptwr_flush(which);
2078 ptwr_info[cpu].ptinfo[which].l1va = addr | 1;
2079 ptwr_info[cpu].ptinfo[which].l2_idx = l2_idx;
2081 /* For safety, disconnect the L1 p.t. page from current space. */
2082 if ( (which == PTWR_PT_ACTIVE) &&
2083 likely(!shadow_mode(current->domain)) )
2085 *pl2e = mk_l2_pgentry(l2e & ~_PAGE_PRESENT);
2086 #if 1
2087 flush_tlb(); /* XXX Multi-CPU guests? */
2088 #else
2089 flush_tlb_all();
2090 #endif
2093 /* Temporarily map the L1 page, and make a copy of it. */
2094 ptwr_info[cpu].ptinfo[which].pl1e = map_domain_mem(pfn << PAGE_SHIFT);
2095 memcpy(ptwr_info[cpu].ptinfo[which].page,
2096 ptwr_info[cpu].ptinfo[which].pl1e,
2097 ENTRIES_PER_L1_PAGETABLE * sizeof(l1_pgentry_t));
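/*
 * The snapshot taken into .page gives ptwr_flush() a pristine copy to
 * compare against, so only the entries the guest actually modifies need
 * to be revalidated and have their reference counts updated.
 */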
2099 /* Finally, make the p.t. page writable by the guest OS. */
2100 pte |= _PAGE_RW;
2101 PTWR_PRINTK("[%c] update %p pte to %08lx\n", PTWR_PRINT_WHICH,
2102 &linear_pg_table[addr>>PAGE_SHIFT], pte);
2103 if ( unlikely(__put_user(pte, (unsigned long *)
2104 &linear_pg_table[addr>>PAGE_SHIFT])) )
2106 MEM_LOG("ptwr: Could not update pte at %p\n", (unsigned long *)
2107 &linear_pg_table[addr>>PAGE_SHIFT]);
2108 /* Toss the writable pagetable state and crash. */
2109 unmap_domain_mem(ptwr_info[cpu].ptinfo[which].pl1e);
2110 ptwr_info[cpu].ptinfo[which].l1va = 0;
2111 domain_crash();
2114 return EXCRET_fault_fixed;
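/*
 * A minimal, self-contained sketch of the lifecycle implemented above:
 * snapshot an L1 on the first write fault, let the guest scribble on it,
 * then on flush revalidate only the entries that actually changed. It is
 * deliberately disabled, and the names, sizes and the validate() hook are
 * invented for illustration; they are not Xen interfaces.
 */
#if 0
#define TOY_L1_ENTRIES 8

struct toy_ptwr_slot {
    int           busy;                     /* analogue of l1va != 0     */
    unsigned long snapshot[TOY_L1_ENTRIES]; /* analogue of ptinfo[].page */
    unsigned long *live;                    /* analogue of ptinfo[].pl1e */
};

/* Fault step: remember the mapping and take a pristine copy. */
static void toy_ptwr_open(struct toy_ptwr_slot *s, unsigned long *l1)
{
    int i;
    s->live = l1;
    for ( i = 0; i < TOY_L1_ENTRIES; i++ )
        s->snapshot[i] = l1[i];
    s->busy = 1;
}

/* Flush step: revalidate only the entries the guest changed. */
static int toy_ptwr_flush(struct toy_ptwr_slot *s,
                          int (*validate)(unsigned long new_entry))
{
    int i;
    for ( i = 0; i < TOY_L1_ENTRIES; i++ )
    {
        if ( s->live[i] == s->snapshot[i] )
            continue;                 /* untouched entry: nothing to do */
        if ( !validate(s->live[i]) )
            return 0;                 /* bad update: caller must bail   */
    }
    s->busy = 0;
    return 1;
}
#endif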
2117 static __init int ptwr_init(void)
2119 int i;
2121 for ( i = 0; i < smp_num_cpus; i++ )
2123 ptwr_info[i].ptinfo[PTWR_PT_ACTIVE].page =
2124 (void *)alloc_xenheap_page();
2125 ptwr_info[i].ptinfo[PTWR_PT_INACTIVE].page =
2126 (void *)alloc_xenheap_page();
2129 return 0;
2131 __initcall(ptwr_init);
2136 /************************************************************************/
2137 /************************************************************************/
2138 /************************************************************************/
2140 #ifndef NDEBUG
2142 void ptwr_status(void)
2144 unsigned long pte, *ptep, pfn;
2145 struct pfn_info *page;
2146 int cpu = smp_processor_id();
2148 ptep = (unsigned long *)&linear_pg_table
2149 [ptwr_info[cpu].ptinfo[PTWR_PT_INACTIVE].l1va>>PAGE_SHIFT];
2151 if ( __get_user(pte, ptep) ) {
2152 MEM_LOG("ptwr: Could not read pte at %p\n", ptep);
2153 domain_crash();
2156 pfn = pte >> PAGE_SHIFT;
2157 page = &frame_table[pfn];
2158 printk("need to alloc l1 page %p\n", page);
2159 /* make pt page writable */
2160 printk("need to make read-only l1 page: pte at %p is %08lx\n",
2161 ptep, pte);
2163 if ( ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va == 0 )
2164 return;
2166 if ( __get_user(pte, (unsigned long *)
2167 ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va) ) {
2168 MEM_LOG("ptwr: Could not read pte at %p\n", (unsigned long *)
2169 ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va);
2170 domain_crash();
2172 pfn = pte >> PAGE_SHIFT;
2173 page = &frame_table[pfn];
2176 void audit_domain(struct domain *d)
2178 int ttot=0, ctot=0, io_mappings=0, lowmem_mappings=0;
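/*
 * adjust(): add 'dir' (+1 or -1) to a page's general reference count and,
 * if 'adjtype' is set, to its type count as well, warning if either would
 * go negative. ctot/ttot accumulate the totals reported when the audit
 * completes.
 */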
2180 void adjust (struct pfn_info *page, int dir, int adjtype)
2182 int count = page->count_info & PGC_count_mask;
2184 if ( adjtype )
2186 int tcount = page->u.inuse.type_info & PGT_count_mask;
2188 ttot++;
2190 tcount += dir;
2192 if ( tcount < 0 )
2194 /* This will only come out once. */
2195 printk("Audit %d: type count went below zero pfn=%x "
2196 "taf=%x otaf=%x\n",
2197 d->id, page-frame_table,
2198 page->u.inuse.type_info,
2199 page->tlbflush_timestamp);
2202 page->u.inuse.type_info =
2203 (page->u.inuse.type_info & ~PGT_count_mask) |
2204 (tcount & PGT_count_mask);
2207 ctot++;
2208 count += dir;
2209 if ( count < 0 )
2211 /* This will only come out once. */
2212 printk("Audit %d: general count went below zero pfn=%x "
2213 "taf=%x otaf=%x\n",
2214 d->id, page-frame_table,
2215 page->u.inuse.type_info,
2216 page->tlbflush_timestamp);
2219 page->count_info =
2220 (page->count_info & ~PGC_count_mask) |
2221 (count & PGC_count_mask);
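/*
 * scan_for_pfn(): walk every page owned by 'd' and, for each L1/L2 page
 * table found, report any present entry that maps 'xpfn'. Used below to
 * hunt down unexplained extra references.
 */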
2225 void scan_for_pfn(struct domain *d, unsigned long xpfn)
2227 unsigned long pfn, *pt;
2228 struct list_head *list_ent;
2229 struct pfn_info *page;
2230 int i;
2232 list_ent = d->page_list.next;
2233 for ( i = 0; (list_ent != &d->page_list); i++ )
2235 pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
2236 page = &frame_table[pfn];
2238 switch ( page->u.inuse.type_info & PGT_type_mask )
2240 case PGT_l1_page_table:
2241 case PGT_l2_page_table:
2242 pt = map_domain_mem(pfn<<PAGE_SHIFT);
2243 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
2244 if ( (pt[i] & _PAGE_PRESENT) &&
2245 ((pt[i] >> PAGE_SHIFT) == xpfn) )
2246 printk(" found dom=%d i=%x pfn=%lx t=%x c=%x\n",
2247 d->id, i, pfn, page->u.inuse.type_info,
2248 page->count_info);
2249 unmap_domain_mem(pt);
2252 list_ent = frame_table[pfn].list.next;
2257 void scan_for_pfn_remote(unsigned long xpfn)
2259 struct domain *e;
2260 for_each_domain ( e )
2261 scan_for_pfn( e, xpfn );
2264 int i;
2265 unsigned long pfn;
2266 struct list_head *list_ent;
2267 struct pfn_info *page;
2269 if ( d != current->domain )
2270 domain_pause(d);
2271 synchronise_pagetables(~0UL);
2273 printk("pt base=%lx sh_info=%x\n",
2274 pagetable_val(d->exec_domain[0]->arch.pagetable)>>PAGE_SHIFT,
2275 virt_to_page(d->shared_info)-frame_table);
2277 spin_lock(&d->page_alloc_lock);
2279 /* PHASE 0: sanity-check ownership and counts; snapshot type_info. */
2281 list_ent = d->page_list.next;
2282 for ( i = 0; (list_ent != &d->page_list); i++ )
2284 pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
2285 page = &frame_table[pfn];
2287 if ( page_get_owner(page) != d )
2288 BUG();
2290 if ( (page->u.inuse.type_info & PGT_count_mask) >
2291 (page->count_info & PGC_count_mask) )
2292 printk("taf > caf %x %x pfn=%lx\n",
2293 page->u.inuse.type_info, page->count_info, pfn );
2295 #if 0 /* SYSV shared memory pages plus writeable files. */
2296 if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_writable_page &&
2297 (page->u.inuse.type_info & PGT_count_mask) > 1 )
2299 printk("writeable page with type count >1: pfn=%lx t=%x c=%x\n",
2300 pfn,
2301 page->u.inuse.type_info,
2302 page->count_info );
2303 scan_for_pfn_remote(pfn);
2305 #endif
2306 if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_none &&
2307 (page->u.inuse.type_info & PGT_count_mask) > 1 )
2309 printk("normal page with type count >1: pfn=%lx t=%x c=%x\n",
2310 pfn,
2311 page->u.inuse.type_info,
2312 page->count_info );
2315 /* Use tlbflush_timestamp to store original type_info. */
2316 page->tlbflush_timestamp = page->u.inuse.type_info;
2318 list_ent = frame_table[pfn].list.next;
2322 /* PHASE 1: subtract the references the page tables and pinning account for. */
2323 if ( pagetable_val(d->exec_domain[0]->arch.pagetable) )
2324 adjust(&frame_table[pagetable_val(d->exec_domain[0]->arch.pagetable)
2325 >>PAGE_SHIFT], -1, 1);
2327 list_ent = d->page_list.next;
2328 for ( i = 0; (list_ent != &d->page_list); i++ )
2330 unsigned long *pt;
2331 pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
2332 page = &frame_table[pfn];
2334 if ( page_get_owner(page) != d )
2335 BUG();
2337 switch ( page->u.inuse.type_info & PGT_type_mask )
2339 case PGT_l2_page_table:
2341 if ( (page->u.inuse.type_info & PGT_validated) != PGT_validated )
2342 printk("Audit %d: L2 not validated %x\n",
2343 d->id, page->u.inuse.type_info);
2345 if ( (page->u.inuse.type_info & PGT_pinned) != PGT_pinned )
2346 printk("Audit %d: L2 not pinned %x\n",
2347 d->id, page->u.inuse.type_info);
2348 else
2349 adjust( page, -1, 1 );
2351 pt = map_domain_mem( pfn<<PAGE_SHIFT );
2353 for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
2355 if ( pt[i] & _PAGE_PRESENT )
2357 unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
2358 struct pfn_info *l1page = &frame_table[l1pfn];
2360 if ( page_get_owner(l1page) != d )
2362 printk("L2: Skip bizarre page belonging to other "
2363 "dom %p\n", page_get_owner(l1page));
2364 continue;
2367 if ( (l1page->u.inuse.type_info & PGT_type_mask) ==
2368 PGT_l2_page_table )
2369 printk("Audit %d: [%x] Found %s Linear PT "
2370 "t=%x pfn=%lx\n", d->id, i,
2371 (l1pfn==pfn) ? "Self" : "Other",
2372 l1page->u.inuse.type_info,
2373 l1pfn);
2374 else if ( (l1page->u.inuse.type_info & PGT_type_mask) !=
2375 PGT_l1_page_table )
2376 printk("Audit %d: [%x] Expected L1 t=%x pfn=%lx\n",
2377 d->id, i,
2378 l1page->u.inuse.type_info,
2379 l1pfn);
2381 adjust(l1page, -1, 1);
2385 unmap_domain_mem(pt);
2387 break;
2390 case PGT_l1_page_table:
2392 if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned )
2393 adjust( page, -1, 1 );
2395 if ( (page->u.inuse.type_info & PGT_validated) != PGT_validated )
2396 printk("Audit %d: L1 not validated %x\n",
2397 d->id, page->u.inuse.type_info);
2398 #if 0
2399 if ( (page->u.inuse.type_info & PGT_pinned) != PGT_pinned )
2400 printk("Audit %d: L1 not pinned %x\n",
2401 d->id, page->u.inuse.type_info);
2402 #endif
2403 pt = map_domain_mem( pfn<<PAGE_SHIFT );
2405 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
2407 if ( pt[i] & _PAGE_PRESENT )
2409 unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
2410 struct pfn_info *l1page = &frame_table[l1pfn];
2412 if ( l1pfn < 0x100 )
2414 lowmem_mappings++;
2415 continue;
2418 if ( l1pfn > max_page )
2420 io_mappings++;
2421 continue;
2424 if ( pt[i] & _PAGE_RW )
2427 if ( (l1page->u.inuse.type_info & PGT_type_mask) ==
2428 PGT_l1_page_table ||
2429 (l1page->u.inuse.type_info & PGT_type_mask) ==
2430 PGT_l2_page_table )
2431 printk("Audit %d: [%x] Illegal RW t=%x pfn=%lx\n",
2432 d->id, i,
2433 l1page->u.inuse.type_info,
2434 l1pfn);
2438 if ( page_get_owner(l1page) != d )
2440 printk("Audit %d: [%lx,%x] Skip foreign page dom=%p "
2441 "pfn=%lx c=%08x t=%08x m2p=%lx\n",
2442 d->id, pfn, i,
2443 page_get_owner(l1page),
2444 l1pfn,
2445 l1page->count_info,
2446 l1page->u.inuse.type_info,
2447 machine_to_phys_mapping[l1pfn]);
2448 continue;
2451 adjust(l1page, -1, 0);
2455 unmap_domain_mem(pt);
2457 break;
2460 list_ent = frame_table[pfn].list.next;
2463 if ( (io_mappings > 0) || (lowmem_mappings > 0) )
2464 printk("Audit %d: Found %d lowmem mappings and %d io mappings\n",
2465 d->id, lowmem_mappings, io_mappings);
2467 /* PHASE 2: what remains should match the baseline (type count 0, general count 1). */
2469 ctot = ttot = 0;
2470 list_ent = d->page_list.next;
2471 for ( i = 0; (list_ent != &d->page_list); i++ )
2473 pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
2474 page = &frame_table[pfn];
2476 switch ( page->u.inuse.type_info & PGT_type_mask)
2478 case PGT_l1_page_table:
2479 case PGT_l2_page_table:
2480 if ( (page->u.inuse.type_info & PGT_count_mask) != 0 )
2482 printk("Audit %d: type count!=0 t=%x ot=%x c=%x pfn=%lx\n",
2483 d->id, page->u.inuse.type_info,
2484 page->tlbflush_timestamp,
2485 page->count_info, pfn );
2486 scan_for_pfn_remote(pfn);
2488 default:
2489 if ( (page->count_info & PGC_count_mask) != 1 )
2491 printk("Audit %d: gen count!=1 (c=%x) t=%x ot=%x pfn=%lx\n",
2492 d->id,
2493 page->count_info,
2494 page->u.inuse.type_info,
2495 page->tlbflush_timestamp, pfn );
2496 scan_for_pfn_remote(pfn);
2498 break;
2501 list_ent = frame_table[pfn].list.next;
2504 /* PHASE 3: put the references back and clear the scratch timestamps. */
2505 list_ent = d->page_list.next;
2506 for ( i = 0; (list_ent != &d->page_list); i++ )
2508 unsigned long *pt;
2509 pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
2510 page = &frame_table[pfn];
2512 switch ( page->u.inuse.type_info & PGT_type_mask )
2514 case PGT_l2_page_table:
2515 if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned )
2516 adjust( page, 1, 1 );
2518 pt = map_domain_mem( pfn<<PAGE_SHIFT );
2520 for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
2522 if ( pt[i] & _PAGE_PRESENT )
2524 unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
2525 struct pfn_info *l1page;
2527 if (l1pfn>max_page)
2528 continue;
2530 l1page = &frame_table[l1pfn];
2532 if ( page_get_owner(l1page) == d )
2533 adjust(l1page, 1, 1);
2537 unmap_domain_mem(pt);
2538 break;
2540 case PGT_l1_page_table:
2541 if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned )
2542 adjust( page, 1, 1 );
2544 pt = map_domain_mem( pfn<<PAGE_SHIFT );
2546 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
2548 if ( pt[i] & _PAGE_PRESENT )
2550 unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
2551 struct pfn_info *l1page;
2553 if (l1pfn>max_page)
2554 continue;
2556 l1page = &frame_table[l1pfn];
2558 if ( (page_get_owner(l1page) != d) ||
2559 (l1pfn < 0x100) || (l1pfn > max_page) )
2560 continue;
2562 adjust(l1page, 1, 0);
2566 unmap_domain_mem(pt);
2567 break;
2571 page->tlbflush_timestamp = 0;
2573 list_ent = frame_table[pfn].list.next;
2576 spin_unlock(&d->page_alloc_lock);
2578 if ( pagetable_val(d->exec_domain[0]->arch.pagetable) )
2579 adjust(&frame_table[pagetable_val(
2580 d->exec_domain[0]->arch.pagetable)>>PAGE_SHIFT], 1, 1);
2582 printk("Audit %d: Done. pages=%d ctot=%d ttot=%d\n", d->id, i, ctot, ttot );
2584 if ( d != current->domain )
2585 domain_unpause(d);
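/*
 * A minimal, self-contained model of the counting scheme audit_domain()
 * applies: subtract every reference the page tables (and pinning) account
 * for, check that what remains is the baseline (type count 0 for page
 * table pages, general count 1 for the allocation itself), then put the
 * references back. Deliberately disabled; the structures are invented for
 * illustration and are not the real frame_table layout.
 */
#if 0
struct toy_page {
    int count;       /* analogue of count_info & PGC_count_mask       */
    int type_count;  /* analogue of type_info  & PGT_count_mask       */
    int is_pagetable;
    int refs;        /* references found by walking the page tables   */
};

static void toy_audit(struct toy_page *pg, int npages)
{
    int i;

    /* PHASE 1 analogue: remove the references the page tables explain. */
    for ( i = 0; i < npages; i++ )
    {
        if ( pg[i].is_pagetable )
            pg[i].type_count -= pg[i].refs;
        else
            pg[i].count -= pg[i].refs;
    }

    /* PHASE 2 analogue: anything left over is unexplained. */
    for ( i = 0; i < npages; i++ )
    {
        if ( pg[i].is_pagetable && (pg[i].type_count != 0) )
            printk("toy audit: unexplained type refs on page %d\n", i);
        else if ( !pg[i].is_pagetable && (pg[i].count != 1) )
            printk("toy audit: unexplained general refs on page %d\n", i);
    }

    /* PHASE 3 analogue: restore the counts for normal operation. */
    for ( i = 0; i < npages; i++ )
    {
        if ( pg[i].is_pagetable )
            pg[i].type_count += pg[i].refs;
        else
            pg[i].count += pg[i].refs;
    }
}
#endif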
2588 void audit_domains(void)
2590 struct domain *d;
2591 for_each_domain ( d )
2592 audit_domain(d);
2595 void audit_domains_key(unsigned char key)
2597 audit_domains();
2600 #endif