ia64/xen-unstable

xen/arch/x86/memory.c @ 3105:f7a9de9a462f

bitkeeper revision 1.1159.189.6 (41a4df56fjKgjR75gUVniMEBSnS-9Q)

Unlock biglock on hypercall preemption.
author cl349@arcadians.cl.cam.ac.uk
date Wed Nov 24 19:21:58 2004 +0000 (2004-11-24)
parents 2fae9947de6f
children 75f82adfcc90
1 /******************************************************************************
2 * arch/x86/memory.c
3 *
4 * Copyright (c) 2002-2004 K A Fraser
5 * Copyright (c) 2004 Christian Limpach
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
22 /*
23 * A description of the x86 page table API:
24 *
25 * Domains trap to do_mmu_update with a list of update requests.
26 * This is a list of (ptr, val) pairs, where the requested operation
27 * is *ptr = val.
28 *
29 * Reference counting of pages:
30 * ----------------------------
31 * Each page has two refcounts: tot_count and type_count.
32 *
33 * TOT_COUNT is the obvious reference count. It counts all uses of a
34 * physical page frame by a domain, including uses as a page directory,
35 * a page table, or simple mappings via a PTE. This count prevents a
36 * domain from releasing a frame back to the free pool when it still holds
37 * a reference to it.
38 *
39 * TYPE_COUNT is more subtle. A frame can be put to one of three
40 * mutually-exclusive uses: it might be used as a page directory, or a
41 * page table, or it may be mapped writable by the domain [of course, a
42 * frame may not be used in any of these three ways!].
43 * So, type_count is a count of the number of times a frame is being
44 * referred to in its current incarnation. Therefore, a page can only
45 * change its type when its type count is zero.
46 *
47 * Pinning the page type:
48 * ----------------------
49 * The type of a page can be pinned/unpinned with the commands
50 * MMUEXT_[UN]PIN_L?_TABLE. Each page can be pinned exactly once (that is,
51 * pinning is not reference counted, so it can't be nested).
52 * This is useful to prevent a page's type count falling to zero, at which
53 * point safety checks would need to be carried out next time the count
54 * is increased again.
55 *
56 * A further note on writable page mappings:
57 * -----------------------------------------
58 * For simplicity, the count of writable mappings for a page may not
59 * correspond to reality. The 'writable count' is incremented for every
60 * PTE which maps the page with the _PAGE_RW flag set. However, for
61 * write access to be possible the page directory entry must also have
62 * its _PAGE_RW bit set. We do not check this as it complicates the
63 * reference counting considerably [consider the case of multiple
64 * directory entries referencing a single page table, some with the RW
65 * bit set, others not -- it starts getting a bit messy].
66 * In normal use, this simplification shouldn't be a problem.
67 * However, the logic can be added if required.
68 *
69 * One more note on read-only page mappings:
70 * -----------------------------------------
71 * We want domains to be able to map pages for read-only access. The
72 * main reason is that page tables and directories should be readable
73 * by a domain, but it would not be safe for them to be writable.
74 * However, domains have free access to rings 1 & 2 of the Intel
75 * privilege model. In terms of page protection, these are considered
76 * to be part of 'supervisor mode'. The WP bit in CR0 controls whether
77 * read-only restrictions are respected in supervisor mode -- if the
78 * bit is clear then any mapped page is writable.
79 *
80 * We get round this by always setting the WP bit and disallowing
81 * updates to it. This is very unlikely to cause a problem for guest
82 * OS's, which will generally use the WP bit to simplify copy-on-write
83 * implementation (in that case, OS wants a fault when it writes to
84 * an application-supplied buffer).
85 */
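/*
 * Editorial sketch (not part of the original file): how a guest batches
 * page-table writes through this interface.  Each request is a (ptr, val)
 * pair meaning "*ptr = val", exactly as described above.  The machine
 * addresses and the HYPERVISOR_mmu_update() wrapper name are assumptions
 * about the guest-side bindings, shown for illustration only.
 */
#if 0 /* guest-side example -- never compiled here */
    mmu_update_t req[2];
    int done;

    /* Install a new PTE: the PTE's machine address goes in 'ptr'. */
    req[0].ptr = pte_machine_addr | MMU_NORMAL_PT_UPDATE;
    req[0].val = new_pte_value;

    /* Clear another mapping in the same batch. */
    req[1].ptr = other_pte_machine_addr | MMU_NORMAL_PT_UPDATE;
    req[1].val = 0;

    if ( HYPERVISOR_mmu_update(req, 2, &done) != 0 )
        BUG(); /* 'done' reports how many requests were applied. */
#endif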
87 #include <xen/config.h>
88 #include <xen/init.h>
89 #include <xen/lib.h>
90 #include <xen/mm.h>
91 #include <xen/sched.h>
92 #include <xen/errno.h>
93 #include <xen/perfc.h>
94 #include <xen/irq.h>
95 #include <xen/softirq.h>
96 #include <asm/shadow.h>
97 #include <asm/page.h>
98 #include <asm/flushtlb.h>
99 #include <asm/io.h>
100 #include <asm/uaccess.h>
101 #include <asm/domain_page.h>
102 #include <asm/ldt.h>
104 #ifdef VERBOSE
105 #define MEM_LOG(_f, _a...) \
106 printk("DOM%u: (file=memory.c, line=%d) " _f "\n", \
107 current->domain->id , __LINE__ , ## _a )
108 #else
109 #define MEM_LOG(_f, _a...) ((void)0)
110 #endif
112 static int alloc_l2_table(struct pfn_info *page);
113 static int alloc_l1_table(struct pfn_info *page);
114 static int get_page_from_pagenr(unsigned long page_nr, struct domain *d);
115 static int get_page_and_type_from_pagenr(unsigned long page_nr,
116 u32 type,
117 struct domain *d);
119 static void free_l2_table(struct pfn_info *page);
120 static void free_l1_table(struct pfn_info *page);
122 static int mod_l2_entry(l2_pgentry_t *, l2_pgentry_t, unsigned long);
123 static int mod_l1_entry(l1_pgentry_t *, l1_pgentry_t);
125 /* Used to defer flushing of memory structures. */
126 static struct {
127 #define DOP_FLUSH_TLB (1<<0) /* Flush the TLB. */
128 #define DOP_RELOAD_LDT (1<<1) /* Reload the LDT shadow mapping. */
129 unsigned long deferred_ops;
130 unsigned long cr0;
131 /* If non-NULL, specifies a foreign subject domain for some operations. */
132 struct domain *foreign;
133 } __cacheline_aligned percpu_info[NR_CPUS];
135 /*
136 * Returns the current foreign domain; defaults to the currently-executing
137 * domain if a foreign override hasn't been specified.
138 */
139 #define FOREIGNDOM (percpu_info[smp_processor_id()].foreign ? : current->domain)
141 /* Private domain structs for DOMID_XEN and DOMID_IO. */
142 static struct domain *dom_xen, *dom_io;
144 void arch_init_memory(void)
145 {
146 unsigned long mfn;
148 /*
149 * We are rather picky about the layout of 'struct pfn_info'. The
150 * count_info and domain fields must be adjacent, as we perform atomic
151 * 64-bit operations on them. Also, just for sanity, we assert the size
152 * of the structure here.
153 */
154 if ( (offsetof(struct pfn_info, u.inuse.domain) !=
155 (offsetof(struct pfn_info, count_info) + sizeof(u32))) ||
156 (sizeof(struct pfn_info) != 24) )
157 {
158 printk("Weird pfn_info layout (%ld,%ld,%d)\n",
159 offsetof(struct pfn_info, count_info),
160 offsetof(struct pfn_info, u.inuse.domain),
161 sizeof(struct pfn_info));
162 for ( ; ; ) ;
163 }
165 memset(percpu_info, 0, sizeof(percpu_info));
167 /* Initialise to a magic of 0x55555555 so it is easier to spot bugs later. */
168 memset(machine_to_phys_mapping, 0x55, 4<<20);
170 /*
171 * Initialise our DOMID_XEN domain.
172 * Any Xen-heap pages that we will allow to be mapped will have
173 * their domain field set to dom_xen.
174 */
175 dom_xen = alloc_domain_struct();
176 atomic_set(&dom_xen->refcnt, 1);
177 dom_xen->id = DOMID_XEN;
179 /*
180 * Initialise our DOMID_IO domain.
181 * This domain owns no pages but is considered a special case when
182 * mapping I/O pages, as the mappings occur with the privileges of the caller.
183 */
184 dom_io = alloc_domain_struct();
185 atomic_set(&dom_io->refcnt, 1);
186 dom_io->id = DOMID_IO;
188 /* M2P table is mappable read-only by privileged domains. */
189 for ( mfn = virt_to_phys(&machine_to_phys_mapping[0<<20])>>PAGE_SHIFT;
190 mfn < virt_to_phys(&machine_to_phys_mapping[1<<20])>>PAGE_SHIFT;
191 mfn++ )
192 {
193 frame_table[mfn].count_info = PGC_allocated | 1;
194 frame_table[mfn].u.inuse.type_info = PGT_gdt_page | 1; /* non-RW */
195 frame_table[mfn].u.inuse.domain = dom_xen;
196 }
197 }
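/*
 * Editorial note: the helpers below manage the "shadow LDT".  Per-domain
 * PTE slots 16..31 (see the loop bounds in __invalidate_shadow_ldt() and
 * the 'off + 16' in map_ldt_shadow_page()) hold writable mappings of the
 * guest's LDT pages, and mm.shadow_ldt_mapcnt tracks how many such slots
 * are live so the common "nothing mapped" case stays cheap.
 */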
199 static void __invalidate_shadow_ldt(struct exec_domain *d)
200 {
201 int i;
202 unsigned long pfn;
203 struct pfn_info *page;
205 d->mm.shadow_ldt_mapcnt = 0;
207 for ( i = 16; i < 32; i++ )
208 {
209 pfn = l1_pgentry_to_pagenr(d->mm.perdomain_ptes[i]);
210 if ( pfn == 0 ) continue;
211 d->mm.perdomain_ptes[i] = mk_l1_pgentry(0);
212 page = &frame_table[pfn];
213 ASSERT_PAGE_IS_TYPE(page, PGT_ldt_page);
214 ASSERT_PAGE_IS_DOMAIN(page, d->domain);
215 put_page_and_type(page);
216 }
218 /* Dispose of the (now possibly invalid) mappings from the TLB. */
219 percpu_info[d->processor].deferred_ops |= DOP_FLUSH_TLB | DOP_RELOAD_LDT;
220 }
223 static inline void invalidate_shadow_ldt(struct exec_domain *d)
224 {
225 if ( d->mm.shadow_ldt_mapcnt != 0 )
226 __invalidate_shadow_ldt(d);
227 }
230 static int alloc_segdesc_page(struct pfn_info *page)
231 {
232 unsigned long *descs = map_domain_mem((page-frame_table) << PAGE_SHIFT);
233 int i;
235 for ( i = 0; i < 512; i++ )
236 if ( unlikely(!check_descriptor(&descs[i*2])) )
237 goto fail;
239 unmap_domain_mem(descs);
240 return 1;
242 fail:
243 unmap_domain_mem(descs);
244 return 0;
245 }
248 /* Map shadow page at offset @off. */
249 int map_ldt_shadow_page(unsigned int off)
250 {
251 struct exec_domain *ed = current;
252 struct domain *d = ed->domain;
253 unsigned long l1e;
255 if ( unlikely(in_irq()) )
256 BUG();
258 __get_user(l1e, (unsigned long *)&linear_pg_table[(ed->mm.ldt_base >>
259 PAGE_SHIFT) + off]);
261 if ( unlikely(!(l1e & _PAGE_PRESENT)) ||
262 unlikely(!get_page_and_type(&frame_table[l1e >> PAGE_SHIFT],
263 d, PGT_ldt_page)) )
264 return 0;
266 ed->mm.perdomain_ptes[off + 16] = mk_l1_pgentry(l1e | _PAGE_RW);
267 ed->mm.shadow_ldt_mapcnt++;
269 return 1;
270 }
273 static int get_page_from_pagenr(unsigned long page_nr, struct domain *d)
274 {
275 struct pfn_info *page = &frame_table[page_nr];
277 if ( unlikely(!pfn_is_ram(page_nr)) )
278 {
279 MEM_LOG("Pfn %08lx is not RAM", page_nr);
280 return 0;
281 }
283 if ( unlikely(!get_page(page, d)) )
284 {
285 MEM_LOG("Could not get page ref for pfn %08lx", page_nr);
286 return 0;
287 }
289 return 1;
290 }
293 static int get_page_and_type_from_pagenr(unsigned long page_nr,
294 u32 type,
295 struct domain *d)
296 {
297 struct pfn_info *page = &frame_table[page_nr];
299 if ( unlikely(!get_page_from_pagenr(page_nr, d)) )
300 return 0;
302 if ( unlikely(!get_page_type(page, type)) )
303 {
304 #ifdef VERBOSE
305 if ( (type & PGT_type_mask) != PGT_l1_page_table )
306 MEM_LOG("Bad page type for pfn %08lx (%08x)",
307 page_nr, page->u.inuse.type_info);
308 #endif
309 put_page(page);
310 return 0;
311 }
313 return 1;
314 }
317 /*
318 * We allow L2 tables to map each other (a.k.a. linear page tables). This
319 * needs some special care with reference counts and access permissions:
320 * 1. The mapping entry must be read-only, or the guest may get write access
321 * to its own PTEs.
322 * 2. We must only bump the reference counts for an *already validated*
323 * L2 table, or we can end up in a deadlock in get_page_type() by waiting
324 * on a validation that cannot complete until our own validation does.
325 * 3. We only need to increment the reference counts for the mapped page
326 * frame if it is mapped by a different L2 table. This is sufficient and
327 * also necessary to allow validation of an L2 table mapping itself.
328 */
329 static int
330 get_linear_pagetable(
331 l2_pgentry_t l2e, unsigned long pfn, struct domain *d)
332 {
333 u32 x, y;
334 struct pfn_info *page;
336 if ( (l2_pgentry_val(l2e) & _PAGE_RW) )
337 {
338 MEM_LOG("Attempt to create linear p.t. with write perms");
339 return 0;
340 }
342 if ( (l2_pgentry_val(l2e) >> PAGE_SHIFT) != pfn )
343 {
344 /* Make sure the mapped frame belongs to the correct domain. */
345 if ( unlikely(!get_page_from_pagenr(l2_pgentry_to_pagenr(l2e), d)) )
346 return 0;
348 /*
349 * Make sure that the mapped frame is an already-validated L2 table.
350 * If so, atomically increment the count (checking for overflow).
351 */
352 page = &frame_table[l2_pgentry_to_pagenr(l2e)];
353 y = page->u.inuse.type_info;
354 do {
355 x = y;
356 if ( unlikely((x & PGT_count_mask) == PGT_count_mask) ||
357 unlikely((x & (PGT_type_mask|PGT_validated)) !=
358 (PGT_l2_page_table|PGT_validated)) )
359 {
360 put_page(page);
361 return 0;
362 }
363 }
364 while ( (y = cmpxchg(&page->u.inuse.type_info, x, x + 1)) != x );
365 }
367 return 1;
368 }
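/*
 * Editorial sketch (not in the original file): a guest-side request that
 * would exercise get_linear_pagetable().  LINEAR_SLOT and the
 * HYPERVISOR_mmu_update() wrapper are illustrative assumptions; the key
 * point, per rule 1 in the comment above, is that the self-referential
 * entry must not carry _PAGE_RW.
 */
#if 0 /* guest-side example -- never compiled here */
    mmu_update_t req;

    /* Point one of the guest's own L2 slots back at the L2 page, read-only. */
    req.ptr = l2_machine_addr + LINEAR_SLOT * sizeof(l2_pgentry_t);
    req.ptr |= MMU_NORMAL_PT_UPDATE;
    req.val = (l2_machine_addr & PAGE_MASK) | _PAGE_PRESENT; /* no _PAGE_RW */
    HYPERVISOR_mmu_update(&req, 1, NULL);
#endif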
371 static int
372 get_page_from_l1e(
373 l1_pgentry_t l1e, struct domain *d)
374 {
375 unsigned long l1v = l1_pgentry_val(l1e);
376 unsigned long pfn = l1_pgentry_to_pagenr(l1e);
377 struct pfn_info *page = &frame_table[pfn];
378 extern int domain_iomem_in_pfn(struct domain *d, unsigned long pfn);
380 if ( !(l1v & _PAGE_PRESENT) )
381 return 1;
383 if ( unlikely(l1v & (_PAGE_GLOBAL|_PAGE_PAT)) )
384 {
385 MEM_LOG("Bad L1 type settings %04lx", l1v & (_PAGE_GLOBAL|_PAGE_PAT));
386 return 0;
387 }
389 if ( unlikely(!pfn_is_ram(pfn)) )
390 {
391 /* Revert to caller privileges if FD == DOMID_IO. */
392 if ( d == dom_io )
393 d = current->domain;
395 if ( IS_PRIV(d) )
396 return 1;
398 if ( IS_CAPABLE_PHYSDEV(d) )
399 return domain_iomem_in_pfn(d, pfn);
401 MEM_LOG("Non-privileged attempt to map I/O space %08lx", pfn);
402 return 0;
403 }
405 return ((l1v & _PAGE_RW) ?
406 get_page_and_type(page, d, PGT_writable_page) :
407 get_page(page, d));
408 }
411 /* NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. */
412 static int
413 get_page_from_l2e(
414 l2_pgentry_t l2e, unsigned long pfn,
415 struct domain *d, unsigned long va_idx)
416 {
417 int rc;
419 if ( !(l2_pgentry_val(l2e) & _PAGE_PRESENT) )
420 return 1;
422 if ( unlikely((l2_pgentry_val(l2e) & (_PAGE_GLOBAL|_PAGE_PSE))) )
423 {
424 MEM_LOG("Bad L2 page type settings %04lx",
425 l2_pgentry_val(l2e) & (_PAGE_GLOBAL|_PAGE_PSE));
426 return 0;
427 }
429 rc = get_page_and_type_from_pagenr(
430 l2_pgentry_to_pagenr(l2e),
431 PGT_l1_page_table | (va_idx<<PGT_va_shift), d);
433 if ( unlikely(!rc) )
434 return get_linear_pagetable(l2e, pfn, d);
436 return 1;
437 }
440 static void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d)
441 {
442 unsigned long l1v = l1_pgentry_val(l1e);
443 unsigned long pfn = l1_pgentry_to_pagenr(l1e);
444 struct pfn_info *page = &frame_table[pfn];
445 struct domain *e = page->u.inuse.domain;
447 if ( !(l1v & _PAGE_PRESENT) || !pfn_is_ram(pfn) )
448 return;
450 if ( unlikely(e != d) )
451 {
452 /*
453 * Unmap a foreign page that may have been mapped via a grant table.
454 * Note that this can fail for a privileged domain that can map foreign
455 * pages via MMUEXT_SET_FOREIGNDOM. Such domains can have some mappings
456 * counted via a grant entry and some counted directly in the page
457 * structure's reference count. Note that reference counts won't get
458 * dangerously confused as long as we always try to decrement the
459 * grant entry first. We may end up with a mismatch between which
460 * mappings and which unmappings are counted via the grant entry, but
461 * really it doesn't matter as privileged domains have carte blanche.
462 */
463 if ( likely(gnttab_check_unmap(e, d, pfn, !(l1v & _PAGE_RW))) )
464 return;
465 /* Assume this mapping was made via MMUEXT_SET_FOREIGNDOM... */
466 }
468 if ( l1v & _PAGE_RW )
469 {
470 put_page_and_type(page);
471 }
472 else
473 {
474 /* We expect this to be rare, so we blow away the entire shadow LDT. */
475 if ( unlikely(((page->u.inuse.type_info & PGT_type_mask) ==
476 PGT_ldt_page)) &&
477 unlikely(((page->u.inuse.type_info & PGT_count_mask) != 0)) )
478 invalidate_shadow_ldt(e->exec_domain[0]);
479 put_page(page);
480 }
481 }
484 /*
485 * NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'.
486 * Note also that this automatically deals correctly with linear p.t.'s.
487 */
488 static void put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
489 {
490 if ( (l2_pgentry_val(l2e) & _PAGE_PRESENT) &&
491 ((l2_pgentry_val(l2e) >> PAGE_SHIFT) != pfn) )
492 put_page_and_type(&frame_table[l2_pgentry_to_pagenr(l2e)]);
493 }
496 static int alloc_l2_table(struct pfn_info *page)
497 {
498 struct domain *d = page->u.inuse.domain;
499 unsigned long page_nr = page_to_pfn(page);
500 l2_pgentry_t *pl2e;
501 int i;
503 pl2e = map_domain_mem(page_nr << PAGE_SHIFT);
505 for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ ) {
506 if ( unlikely(!get_page_from_l2e(pl2e[i], page_nr, d, i)) )
507 goto fail;
508 }
510 #if defined(__i386__)
511 /* Now we add our private high mappings. */
512 memcpy(&pl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
513 &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
514 HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
515 pl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] =
516 mk_l2_pgentry((page_nr << PAGE_SHIFT) | __PAGE_HYPERVISOR);
517 pl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] =
518 mk_l2_pgentry(__pa(page->u.inuse.domain->mm_perdomain_pt) |
519 __PAGE_HYPERVISOR);
520 #endif
522 unmap_domain_mem(pl2e);
523 return 1;
525 fail:
526 while ( i-- > 0 )
527 put_page_from_l2e(pl2e[i], page_nr);
529 unmap_domain_mem(pl2e);
530 return 0;
531 }
534 static int alloc_l1_table(struct pfn_info *page)
535 {
536 struct domain *d = page->u.inuse.domain;
537 unsigned long page_nr = page_to_pfn(page);
538 l1_pgentry_t *pl1e;
539 int i;
541 pl1e = map_domain_mem(page_nr << PAGE_SHIFT);
543 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
544 if ( unlikely(!get_page_from_l1e(pl1e[i], d)) )
545 goto fail;
547 unmap_domain_mem(pl1e);
548 return 1;
550 fail:
551 while ( i-- > 0 )
552 put_page_from_l1e(pl1e[i], d);
554 unmap_domain_mem(pl1e);
555 return 0;
556 }
559 static void free_l2_table(struct pfn_info *page)
560 {
561 unsigned long page_nr = page - frame_table;
562 l2_pgentry_t *pl2e;
563 int i;
565 pl2e = map_domain_mem(page_nr << PAGE_SHIFT);
567 for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
568 put_page_from_l2e(pl2e[i], page_nr);
570 unmap_domain_mem(pl2e);
571 }
574 static void free_l1_table(struct pfn_info *page)
575 {
576 struct domain *d = page->u.inuse.domain;
577 unsigned long page_nr = page - frame_table;
578 l1_pgentry_t *pl1e;
579 int i;
581 pl1e = map_domain_mem(page_nr << PAGE_SHIFT);
583 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
584 put_page_from_l1e(pl1e[i], d);
586 unmap_domain_mem(pl1e);
587 }
590 static inline int update_l2e(l2_pgentry_t *pl2e,
591 l2_pgentry_t ol2e,
592 l2_pgentry_t nl2e)
593 {
594 unsigned long o = cmpxchg((unsigned long *)pl2e,
595 l2_pgentry_val(ol2e),
596 l2_pgentry_val(nl2e));
597 if ( o != l2_pgentry_val(ol2e) )
598 MEM_LOG("Failed to update %08lx -> %08lx: saw %08lx\n",
599 l2_pgentry_val(ol2e), l2_pgentry_val(nl2e), o);
600 return (o == l2_pgentry_val(ol2e));
601 }
604 /* Update the L2 entry at pl2e to new value nl2e. pl2e is within frame pfn. */
605 static int mod_l2_entry(l2_pgentry_t *pl2e,
606 l2_pgentry_t nl2e,
607 unsigned long pfn)
608 {
609 l2_pgentry_t ol2e;
610 unsigned long _ol2e;
612 if ( unlikely((((unsigned long)pl2e & (PAGE_SIZE-1)) >> 2) >=
613 DOMAIN_ENTRIES_PER_L2_PAGETABLE) )
614 {
615 MEM_LOG("Illegal L2 update attempt in Xen-private area %p", pl2e);
616 return 0;
617 }
619 if ( unlikely(__get_user(_ol2e, (unsigned long *)pl2e) != 0) )
620 return 0;
621 ol2e = mk_l2_pgentry(_ol2e);
623 if ( l2_pgentry_val(nl2e) & _PAGE_PRESENT )
624 {
625 /* Differ in mapping (bits 12-31) or presence (bit 0)? */
626 if ( ((l2_pgentry_val(ol2e) ^ l2_pgentry_val(nl2e)) & ~0xffe) == 0 )
627 return update_l2e(pl2e, ol2e, nl2e);
629 if ( unlikely(!get_page_from_l2e(nl2e, pfn, current->domain,
630 ((unsigned long)pl2e &
631 ~PAGE_MASK) >> 2)) )
632 return 0;
634 if ( unlikely(!update_l2e(pl2e, ol2e, nl2e)) )
635 {
636 put_page_from_l2e(nl2e, pfn);
637 return 0;
638 }
640 put_page_from_l2e(ol2e, pfn);
641 return 1;
642 }
644 if ( unlikely(!update_l2e(pl2e, ol2e, nl2e)) )
645 return 0;
647 put_page_from_l2e(ol2e, pfn);
648 return 1;
649 }
652 static inline int update_l1e(l1_pgentry_t *pl1e,
653 l1_pgentry_t ol1e,
654 l1_pgentry_t nl1e)
655 {
656 unsigned long o = l1_pgentry_val(ol1e);
657 unsigned long n = l1_pgentry_val(nl1e);
659 if ( unlikely(cmpxchg_user(pl1e, o, n) != 0) ||
660 unlikely(o != l1_pgentry_val(ol1e)) )
661 {
662 MEM_LOG("Failed to update %08lx -> %08lx: saw %08lx\n",
663 l1_pgentry_val(ol1e), l1_pgentry_val(nl1e), o);
664 return 0;
665 }
667 return 1;
668 }
671 /* Update the L1 entry at pl1e to new value nl1e. */
672 static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e)
673 {
674 l1_pgentry_t ol1e;
675 unsigned long _ol1e;
676 struct domain *d = current->domain;
678 if ( unlikely(__get_user(_ol1e, (unsigned long *)pl1e) != 0) )
679 {
680 MEM_LOG("Bad get_user\n");
681 return 0;
682 }
684 ol1e = mk_l1_pgentry(_ol1e);
686 if ( l1_pgentry_val(nl1e) & _PAGE_PRESENT )
687 {
688 /* Differ in mapping (bits 12-31), r/w (bit 1), or presence (bit 0)? */
689 if ( ((l1_pgentry_val(ol1e) ^ l1_pgentry_val(nl1e)) & ~0xffc) == 0 )
690 return update_l1e(pl1e, ol1e, nl1e);
692 if ( unlikely(!get_page_from_l1e(nl1e, FOREIGNDOM)) )
693 return 0;
695 if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
696 {
697 put_page_from_l1e(nl1e, d);
698 return 0;
699 }
701 put_page_from_l1e(ol1e, d);
702 return 1;
703 }
705 if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
706 return 0;
708 put_page_from_l1e(ol1e, d);
709 return 1;
710 }
713 int alloc_page_type(struct pfn_info *page, unsigned int type)
714 {
715 switch ( type )
716 {
717 case PGT_l1_page_table:
718 return alloc_l1_table(page);
719 case PGT_l2_page_table:
720 return alloc_l2_table(page);
721 case PGT_gdt_page:
722 case PGT_ldt_page:
723 return alloc_segdesc_page(page);
724 default:
725 printk("Bad type in alloc_page_type %x t=%x c=%x\n",
726 type, page->u.inuse.type_info,
727 page->count_info);
728 BUG();
729 }
731 return 0;
732 }
735 void free_page_type(struct pfn_info *page, unsigned int type)
736 {
737 struct domain *d = page->u.inuse.domain;
739 switch ( type )
740 {
741 case PGT_l1_page_table:
742 free_l1_table(page);
743 break;
745 case PGT_l2_page_table:
746 free_l2_table(page);
747 break;
749 default:
750 BUG();
751 }
753 if ( unlikely(d->exec_domain[0]->mm.shadow_mode) &&
754 (get_shadow_status(&d->exec_domain[0]->mm, page_to_pfn(page)) & PSH_shadowed) )
755 {
756 unshadow_table(page_to_pfn(page), type);
757 put_shadow_status(&d->exec_domain[0]->mm);
758 }
759 }
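/*
 * Editorial note: put_page_type() and get_page_type() below treat
 * u.inuse.type_info as a single word updated in cmpxchg() loops.  The word
 * packs the type reference count (PGT_count_mask), the current type
 * (PGT_type_mask), the L2 'va backpointer' used by L1 tables (PGT_va_mask),
 * and the PGT_validated and PGT_pinned flags.  Every transition re-reads
 * the word, computes its successor, and retries if another CPU raced us,
 * so no lock is needed around type changes.
 */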
762 void put_page_type(struct pfn_info *page)
763 {
764 u32 nx, x, y = page->u.inuse.type_info;
766 again:
767 do {
768 x = y;
769 nx = x - 1;
771 ASSERT((x & PGT_count_mask) != 0);
773 /*
774 * The page should always be validated while a reference is held. The
775 * exception is during domain destruction, when we forcibly invalidate
776 * page-table pages if we detect a referential loop.
777 * See domain.c:relinquish_list().
778 */
779 ASSERT((x & PGT_validated) ||
780 test_bit(DF_DYING, &page->u.inuse.domain->d_flags));
782 if ( unlikely((nx & PGT_count_mask) == 0) )
783 {
784 /* Record TLB information for flush later. Races are harmless. */
785 page->tlbflush_timestamp = tlbflush_current_time();
787 if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
788 likely(nx & PGT_validated) )
789 {
790 /*
791 * Page-table pages must be unvalidated when count is zero. The
792 * 'free' is safe because the refcnt is non-zero and validated
793 * bit is clear => other ops will spin or fail.
794 */
795 if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x,
796 x & ~PGT_validated)) != x) )
797 goto again;
798 /* We cleared the 'valid bit' so we must do the clean-up ourselves. */
799 free_page_type(page, x & PGT_type_mask);
800 /* Carry on, but with the 'valid bit' now clear. */
801 x &= ~PGT_validated;
802 nx &= ~PGT_validated;
803 }
804 }
805 else if ( unlikely((nx & (PGT_pinned | PGT_count_mask)) ==
806 (PGT_pinned | 1)) )
807 {
808 /* Page is now only pinned. Make the back pointer mutable again. */
809 nx |= PGT_va_mutable;
810 }
811 }
812 while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
813 }
816 int get_page_type(struct pfn_info *page, u32 type)
817 {
818 u32 nx, x, y = page->u.inuse.type_info;
820 again:
821 do {
822 x = y;
823 nx = x + 1;
824 if ( unlikely((nx & PGT_count_mask) == 0) )
825 {
826 MEM_LOG("Type count overflow on pfn %08lx\n", page_to_pfn(page));
827 return 0;
828 }
829 else if ( unlikely((x & PGT_count_mask) == 0) )
830 {
831 if ( (x & (PGT_type_mask|PGT_va_mask)) != type )
832 {
833 /*
834 * On a type change we check whether to flush stale TLB entries. This
835 * may be unnecessary (e.g., page was GDT/LDT) but those
836 * circumstances should be very rare.
837 */
838 struct domain *d = page->u.inuse.domain;
839 if ( unlikely(NEED_FLUSH(tlbflush_time[d->exec_domain[0]->processor],
840 page->tlbflush_timestamp)) )
841 {
842 perfc_incr(need_flush_tlb_flush);
843 flush_tlb_cpu(d->exec_domain[0]->processor);
844 }
846 /* We lose existing type, back pointer, and validity. */
847 nx &= ~(PGT_type_mask | PGT_va_mask | PGT_validated);
848 nx |= type;
850 /* No special validation needed for writable pages. */
851 /* Page tables and GDT/LDT need to be scanned for validity. */
852 if ( type == PGT_writable_page )
853 nx |= PGT_validated;
854 }
855 }
856 else if ( unlikely((x & (PGT_type_mask|PGT_va_mask)) != type) )
857 {
858 if ( unlikely((x & PGT_type_mask) != (type & PGT_type_mask) ) )
859 {
860 if ( ((x & PGT_type_mask) != PGT_l2_page_table) ||
861 ((type & PGT_type_mask) != PGT_l1_page_table) )
862 MEM_LOG("Bad type (saw %08x != exp %08x) for pfn %08lx\n",
863 x & PGT_type_mask, type, page_to_pfn(page));
864 return 0;
865 }
866 else if ( (x & PGT_va_mask) == PGT_va_mutable )
867 {
868 /* The va backpointer is mutable, hence we update it. */
869 nx &= ~PGT_va_mask;
870 nx |= type; /* we know the actual type is correct */
871 }
872 else if ( unlikely((x & PGT_va_mask) != (type & PGT_va_mask)) )
873 {
874 /* This table is potentially mapped at multiple locations. */
875 nx &= ~PGT_va_mask;
876 nx |= PGT_va_unknown;
877 }
878 }
879 else if ( unlikely(!(x & PGT_validated)) )
880 {
881 /* Someone else is updating validation of this page. Wait... */
882 while ( (y = page->u.inuse.type_info) == x )
883 {
884 rep_nop();
885 barrier();
886 }
887 goto again;
888 }
889 }
890 while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
892 if ( unlikely(!(nx & PGT_validated)) )
893 {
894 /* Try to validate page type; drop the new reference on failure. */
895 if ( unlikely(!alloc_page_type(page, type & PGT_type_mask)) )
896 {
897 MEM_LOG("Error while validating pfn %08lx for type %08x."
898 " caf=%08x taf=%08x\n",
899 page_to_pfn(page), type,
900 page->count_info,
901 page->u.inuse.type_info);
902 /* No one else can get a reference. We hold the only ref. */
903 page->u.inuse.type_info = 0;
904 return 0;
905 }
907 /* No one else is updating simultaneously. */
908 __set_bit(_PGT_validated, &page->u.inuse.type_info);
909 }
911 return 1;
912 }
915 static int do_extended_command(unsigned long ptr, unsigned long val)
916 {
917 int okay = 1, cpu = smp_processor_id();
918 unsigned int cmd = val & MMUEXT_CMD_MASK;
919 unsigned long pfn = ptr >> PAGE_SHIFT;
920 unsigned long old_base_pfn;
921 struct pfn_info *page = &frame_table[pfn];
922 struct exec_domain *ed = current;
923 struct domain *d = ed->domain, *nd, *e;
924 u32 x, y;
925 domid_t domid;
926 grant_ref_t gntref;
928 switch ( cmd )
929 {
930 case MMUEXT_PIN_L1_TABLE:
931 case MMUEXT_PIN_L2_TABLE:
932 /*
933 * We insist that, if you pin an L1 page, it's the first thing that
934 * you do to it. This is because we require the backptr to still be
935 * mutable. This assumption seems safe.
936 */
937 okay = get_page_and_type_from_pagenr(
938 pfn,
939 ((cmd==MMUEXT_PIN_L2_TABLE) ?
940 PGT_l2_page_table : (PGT_l1_page_table|PGT_va_mutable)),
941 FOREIGNDOM);
943 if ( unlikely(!okay) )
944 {
945 MEM_LOG("Error while pinning pfn %08lx", pfn);
946 break;
947 }
949 if ( unlikely(test_and_set_bit(_PGT_pinned,
950 &page->u.inuse.type_info)) )
951 {
952 MEM_LOG("Pfn %08lx already pinned", pfn);
953 put_page_and_type(page);
954 okay = 0;
955 break;
956 }
958 break;
960 case MMUEXT_UNPIN_TABLE:
961 if ( unlikely(!(okay = get_page_from_pagenr(pfn, FOREIGNDOM))) )
962 {
963 MEM_LOG("Page %08lx bad domain (dom=%p)",
964 ptr, page->u.inuse.domain);
965 }
966 else if ( likely(test_and_clear_bit(_PGT_pinned,
967 &page->u.inuse.type_info)) )
968 {
969 put_page_and_type(page);
970 put_page(page);
971 }
972 else
973 {
974 okay = 0;
975 put_page(page);
976 MEM_LOG("Pfn %08lx not pinned", pfn);
977 }
978 break;
980 case MMUEXT_NEW_BASEPTR:
981 okay = get_page_and_type_from_pagenr(pfn, PGT_l2_page_table, d);
982 if ( likely(okay) )
983 {
984 invalidate_shadow_ldt(ed);
986 percpu_info[cpu].deferred_ops &= ~DOP_FLUSH_TLB;
987 old_base_pfn = pagetable_val(ed->mm.pagetable) >> PAGE_SHIFT;
988 ed->mm.pagetable = mk_pagetable(pfn << PAGE_SHIFT);
990 shadow_mk_pagetable(&ed->mm);
992 write_ptbase(&ed->mm);
994 put_page_and_type(&frame_table[old_base_pfn]);
995 }
996 else
997 {
998 MEM_LOG("Error while installing new baseptr %08lx", ptr);
999 }
1000 break;
1002 case MMUEXT_TLB_FLUSH:
1003 percpu_info[cpu].deferred_ops |= DOP_FLUSH_TLB;
1004 break;
1006 case MMUEXT_INVLPG:
1007 __flush_tlb_one(ptr);
1008 break;
1010 case MMUEXT_FLUSH_CACHE:
1011 if ( unlikely(!IS_CAPABLE_PHYSDEV(d)) )
1013 MEM_LOG("Non-physdev domain tried to FLUSH_CACHE.\n");
1014 okay = 0;
1016 else
1018 wbinvd();
1020 break;
1022 case MMUEXT_SET_LDT:
1024 unsigned long ents = val >> MMUEXT_CMD_SHIFT;
1025 if ( ((ptr & (PAGE_SIZE-1)) != 0) ||
1026 (ents > 8192) ||
1027 ((ptr+ents*LDT_ENTRY_SIZE) < ptr) ||
1028 ((ptr+ents*LDT_ENTRY_SIZE) > PAGE_OFFSET) )
1030 okay = 0;
1031 MEM_LOG("Bad args to SET_LDT: ptr=%08lx, ents=%08lx", ptr, ents);
1033 else if ( (ed->mm.ldt_ents != ents) ||
1034 (ed->mm.ldt_base != ptr) )
1036 invalidate_shadow_ldt(ed);
1037 ed->mm.ldt_base = ptr;
1038 ed->mm.ldt_ents = ents;
1039 load_LDT(ed);
1040 percpu_info[cpu].deferred_ops &= ~DOP_RELOAD_LDT;
1041 if ( ents != 0 )
1042 percpu_info[cpu].deferred_ops |= DOP_RELOAD_LDT;
1044 break;
1047 case MMUEXT_SET_FOREIGNDOM:
1048 domid = (domid_t)(val >> 16);
1050 if ( (e = percpu_info[cpu].foreign) != NULL )
1051 put_domain(e);
1052 percpu_info[cpu].foreign = NULL;
1054 if ( !IS_PRIV(d) )
1056 switch ( domid )
1058 case DOMID_IO:
1059 get_knownalive_domain(dom_io);
1060 percpu_info[cpu].foreign = dom_io;
1061 break;
1062 default:
1063 MEM_LOG("Dom %u cannot set foreign dom\n", d->id);
1064 okay = 0;
1065 break;
1068 else
1070 percpu_info[cpu].foreign = e = find_domain_by_id(domid);
1071 if ( e == NULL )
1073 switch ( domid )
1075 case DOMID_XEN:
1076 get_knownalive_domain(dom_xen);
1077 percpu_info[cpu].foreign = dom_xen;
1078 break;
1079 case DOMID_IO:
1080 get_knownalive_domain(dom_io);
1081 percpu_info[cpu].foreign = dom_io;
1082 break;
1083 default:
1084 MEM_LOG("Unknown domain '%u'", domid);
1085 okay = 0;
1086 break;
1090 break;
1092 case MMUEXT_TRANSFER_PAGE:
1093 domid = (domid_t)(val >> 16);
1094 gntref = (grant_ref_t)((val & 0xFF00) | ((ptr >> 2) & 0x00FF));
1096 if ( unlikely(IS_XEN_HEAP_FRAME(page)) ||
1097 unlikely(!pfn_is_ram(pfn)) ||
1098 unlikely((e = find_domain_by_id(domid)) == NULL) )
1100 MEM_LOG("Bad frame (%08lx) or bad domid (%d).\n", pfn, domid);
1101 okay = 0;
1102 break;
1105 spin_lock(&d->page_alloc_lock);
1107 /*
1108 * The tricky bit: atomically release ownership while there is just one
1109 * benign reference to the page (PGC_allocated). If that reference
1110 * disappears then the deallocation routine will safely spin.
1111 */
1112 nd = page->u.inuse.domain;
1113 y = page->count_info;
1114 do {
1115 x = y;
1116 if ( unlikely((x & (PGC_count_mask|PGC_allocated)) !=
1117 (1|PGC_allocated)) ||
1118 unlikely(nd != d) )
1120 MEM_LOG("Bad page values %08lx: ed=%p(%u), sd=%p,"
1121 " caf=%08x, taf=%08x\n", page_to_pfn(page),
1122 d, d->id, nd, x, page->u.inuse.type_info);
1123 spin_unlock(&d->page_alloc_lock);
1124 put_domain(e);
1125 return 0;
1127 __asm__ __volatile__(
1128 LOCK_PREFIX "cmpxchg8b %2"
1129 : "=d" (nd), "=a" (y),
1130 "=m" (*(volatile u64 *)(&page->count_info))
1131 : "0" (d), "1" (x), "c" (NULL), "b" (x) );
1133 while ( unlikely(nd != d) || unlikely(y != x) );
1135 /*
1136 * Unlink from 'd'. At least one reference remains (now anonymous), so
1137 * no one else is spinning to try to delete this page from 'd'.
1138 */
1139 d->tot_pages--;
1140 list_del(&page->list);
1142 spin_unlock(&d->page_alloc_lock);
1144 spin_lock(&e->page_alloc_lock);
1146 /*
1147 * Check that 'e' will accept the page and has reservation headroom.
1148 * Also, a domain mustn't have PGC_allocated pages when it is dying.
1149 */
1150 ASSERT(e->tot_pages <= e->max_pages);
1151 if ( unlikely(test_bit(DF_DYING, &e->d_flags)) ||
1152 unlikely(e->tot_pages == e->max_pages) ||
1153 unlikely(!gnttab_prepare_for_transfer(e, d, gntref)) )
1155 MEM_LOG("Transferee has no reservation headroom (%d,%d), or "
1156 "provided a bad grant ref, or is dying (%08lx).\n",
1157 e->tot_pages, e->max_pages, e->d_flags);
1158 spin_unlock(&e->page_alloc_lock);
1159 put_domain(e);
1160 okay = 0;
1161 break;
1164 /* Okay, add the page to 'e'. */
1165 if ( unlikely(e->tot_pages++ == 0) )
1166 get_knownalive_domain(e);
1167 list_add_tail(&page->list, &e->page_list);
1168 page->u.inuse.domain = e;
1170 spin_unlock(&e->page_alloc_lock);
1172 /* Transfer is all done: tell the guest about its new page frame. */
1173 gnttab_notify_transfer(e, gntref, pfn);
1175 put_domain(e);
1176 break;
1178 case MMUEXT_REASSIGN_PAGE:
1179 if ( unlikely(!IS_PRIV(d)) )
1181 MEM_LOG("Dom %u has no reassignment priv", d->id);
1182 okay = 0;
1183 break;
1186 e = percpu_info[cpu].foreign;
1187 if ( unlikely(e == NULL) )
1189 MEM_LOG("No FOREIGNDOM to reassign pfn %08lx to", pfn);
1190 okay = 0;
1191 break;
1194 /*
1195 * Grab both page_list locks, in order. This prevents the page from
1196 * disappearing elsewhere while we modify the owner, and we'll need
1197 * both locks if we're successful so that we can change lists.
1198 */
1199 if ( d < e )
1201 spin_lock(&d->page_alloc_lock);
1202 spin_lock(&e->page_alloc_lock);
1204 else
1206 spin_lock(&e->page_alloc_lock);
1207 spin_lock(&d->page_alloc_lock);
1210 /* A domain shouldn't have PGC_allocated pages when it is dying. */
1211 if ( unlikely(test_bit(DF_DYING, &e->d_flags)) ||
1212 unlikely(IS_XEN_HEAP_FRAME(page)) )
1214 MEM_LOG("Reassignment page is Xen heap, or dest dom is dying.");
1215 okay = 0;
1216 goto reassign_fail;
1219 /*
1220 * The tricky bit: atomically change owner while there is just one
1221 * benign reference to the page (PGC_allocated). If that reference
1222 * disappears then the deallocation routine will safely spin.
1223 */
1224 nd = page->u.inuse.domain;
1225 y = page->count_info;
1226 do {
1227 x = y;
1228 if ( unlikely((x & (PGC_count_mask|PGC_allocated)) !=
1229 (1|PGC_allocated)) ||
1230 unlikely(nd != d) )
1232 MEM_LOG("Bad page values %08lx: ed=%p(%u), sd=%p,"
1233 " caf=%08x, taf=%08x\n", page_to_pfn(page),
1234 d, d->id, nd, x, page->u.inuse.type_info);
1235 okay = 0;
1236 goto reassign_fail;
1238 __asm__ __volatile__(
1239 LOCK_PREFIX "cmpxchg8b %3"
1240 : "=d" (nd), "=a" (y), "=c" (e),
1241 "=m" (*(volatile u64 *)(&page->count_info))
1242 : "0" (d), "1" (x), "c" (e), "b" (x) );
1244 while ( unlikely(nd != d) || unlikely(y != x) );
1246 /*
1247 * Unlink from 'd'. We transferred at least one reference to 'e', so
1248 * no one else is spinning to try to delete this page from 'd'.
1249 */
1250 d->tot_pages--;
1251 list_del(&page->list);
1253 /*
1254 * Add the page to 'e'. Someone may already have removed the last
1255 * reference and want to remove the page from 'e'. However, we have
1256 * the lock so they'll spin waiting for us.
1257 */
1258 if ( unlikely(e->tot_pages++ == 0) )
1259 get_knownalive_domain(e);
1260 list_add_tail(&page->list, &e->page_list);
1262 reassign_fail:
1263 spin_unlock(&d->page_alloc_lock);
1264 spin_unlock(&e->page_alloc_lock);
1265 break;
1267 case MMUEXT_CLEAR_FOREIGNDOM:
1268 if ( (e = percpu_info[cpu].foreign) != NULL )
1269 put_domain(e);
1270 percpu_info[cpu].foreign = NULL;
1271 break;
1273 default:
1274 MEM_LOG("Invalid extended pt command 0x%08lx", val & MMUEXT_CMD_MASK);
1275 okay = 0;
1276 break;
1279 return okay;
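/*
 * Editorial sketch (not in the original file): how a guest reaches the
 * extended-command path above, using table pinning as the example.  The
 * frame is named by the high bits of 'ptr' (pfn = ptr >> PAGE_SHIFT), the
 * MMU_EXTENDED_COMMAND selector lives in its low bits, and the sub-command
 * in the low bits of 'val'.  The l2_mfn variable and the
 * HYPERVISOR_mmu_update() wrapper name are assumptions.
 */
#if 0 /* guest-side example -- never compiled here */
    mmu_update_t req;

    req.ptr = (l2_mfn << PAGE_SHIFT) | MMU_EXTENDED_COMMAND;
    req.val = MMUEXT_PIN_L2_TABLE;
    HYPERVISOR_mmu_update(&req, 1, NULL);
#endif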
1283 int do_mmu_update(mmu_update_t *ureqs, int count, int *success_count)
1285 mmu_update_t req;
1286 unsigned long va = 0, deferred_ops, pfn, prev_pfn = 0;
1287 struct pfn_info *page;
1288 int rc = 0, okay = 1, i, cpu = smp_processor_id();
1289 unsigned int cmd;
1290 unsigned long prev_spfn = 0;
1291 l1_pgentry_t *prev_spl1e = 0;
1292 struct exec_domain *ed = current;
1293 struct domain *d = ed->domain;
1294 u32 type_info;
1296 perfc_incrc(calls_to_mmu_update);
1297 perfc_addc(num_page_updates, count);
1299 LOCK_BIGLOCK(d);
1301 cleanup_writable_pagetable(d, PTWR_CLEANUP_ACTIVE | PTWR_CLEANUP_INACTIVE);
1303 if ( unlikely(!access_ok(VERIFY_READ, ureqs, count * sizeof(req))) ) {
1304 UNLOCK_BIGLOCK(d);
1305 return -EFAULT;
1308 for ( i = 0; i < count; i++ )
1310 locked_hypercall_may_preempt(d,
1311 __HYPERVISOR_mmu_update, 3, ureqs, count-i, success_count);
1313 if ( unlikely(__copy_from_user(&req, ureqs, sizeof(req)) != 0) )
1315 MEM_LOG("Bad __copy_from_user");
1316 rc = -EFAULT;
1317 break;
1320 cmd = req.ptr & (sizeof(l1_pgentry_t)-1);
1321 pfn = req.ptr >> PAGE_SHIFT;
1323 okay = 0;
1325 switch ( cmd )
1327 /*
1328 * MMU_NORMAL_PT_UPDATE: Normal update to any level of page table.
1329 */
1330 case MMU_NORMAL_PT_UPDATE:
1331 if ( unlikely(!get_page_from_pagenr(pfn, current->domain)) )
1333 MEM_LOG("Could not get page for normal update");
1334 break;
1337 if ( likely(prev_pfn == pfn) )
1339 va = (va & PAGE_MASK) | (req.ptr & ~PAGE_MASK);
1341 else
1343 if ( prev_pfn != 0 )
1344 unmap_domain_mem((void *)va);
1345 va = (unsigned long)map_domain_mem(req.ptr);
1346 prev_pfn = pfn;
1349 page = &frame_table[pfn];
1350 switch ( (type_info = page->u.inuse.type_info) & PGT_type_mask )
1352 case PGT_l1_page_table:
1353 if ( likely(get_page_type(
1354 page, type_info & (PGT_type_mask|PGT_va_mask))) )
1356 okay = mod_l1_entry((l1_pgentry_t *)va,
1357 mk_l1_pgentry(req.val));
1359 if ( unlikely(ed->mm.shadow_mode) && okay &&
1360 (get_shadow_status(&ed->mm, page-frame_table) &
1361 PSH_shadowed) )
1363 shadow_l1_normal_pt_update(
1364 req.ptr, req.val, &prev_spfn, &prev_spl1e);
1365 put_shadow_status(&ed->mm);
1368 put_page_type(page);
1370 break;
1371 case PGT_l2_page_table:
1372 if ( likely(get_page_type(page, PGT_l2_page_table)) )
1374 okay = mod_l2_entry((l2_pgentry_t *)va,
1375 mk_l2_pgentry(req.val),
1376 pfn);
1378 if ( unlikely(ed->mm.shadow_mode) && okay &&
1379 (get_shadow_status(&ed->mm, page-frame_table) &
1380 PSH_shadowed) )
1382 shadow_l2_normal_pt_update(req.ptr, req.val);
1383 put_shadow_status(&ed->mm);
1386 put_page_type(page);
1388 break;
1389 default:
1390 if ( likely(get_page_type(page, PGT_writable_page)) )
1392 *(unsigned long *)va = req.val;
1393 okay = 1;
1394 put_page_type(page);
1396 break;
1399 put_page(page);
1400 break;
1402 case MMU_MACHPHYS_UPDATE:
1403 if ( unlikely(!get_page_from_pagenr(pfn, FOREIGNDOM)) )
1405 MEM_LOG("Could not get page for mach->phys update");
1406 break;
1409 machine_to_phys_mapping[pfn] = req.val;
1410 okay = 1;
1412 /*
1413 * If in log-dirty mode, mark the corresponding pseudo-physical
1414 * page as dirty.
1415 */
1416 if ( unlikely(ed->mm.shadow_mode == SHM_logdirty) &&
1417 mark_dirty(&ed->mm, pfn) )
1418 ed->mm.shadow_dirty_block_count++;
1420 put_page(&frame_table[pfn]);
1421 break;
1423 /*
1424 * MMU_EXTENDED_COMMAND: Extended command is specified
1425 * in the least-significant bits of the 'value' field.
1426 */
1427 case MMU_EXTENDED_COMMAND:
1428 req.ptr &= ~(sizeof(l1_pgentry_t) - 1);
1429 okay = do_extended_command(req.ptr, req.val);
1430 break;
1432 default:
1433 MEM_LOG("Invalid page update command %08lx", req.ptr);
1434 break;
1437 if ( unlikely(!okay) )
1439 rc = -EINVAL;
1440 break;
1443 ureqs++;
1446 if ( prev_pfn != 0 )
1447 unmap_domain_mem((void *)va);
1449 if ( unlikely(prev_spl1e != 0) )
1450 unmap_domain_mem((void *)prev_spl1e);
1452 deferred_ops = percpu_info[cpu].deferred_ops;
1453 percpu_info[cpu].deferred_ops = 0;
1455 if ( deferred_ops & DOP_FLUSH_TLB )
1456 local_flush_tlb();
1458 if ( deferred_ops & DOP_RELOAD_LDT )
1459 (void)map_ldt_shadow_page(0);
1461 if ( unlikely(percpu_info[cpu].foreign != NULL) )
1463 put_domain(percpu_info[cpu].foreign);
1464 percpu_info[cpu].foreign = NULL;
1467 if ( unlikely(success_count != NULL) )
1468 put_user(i, success_count);
1470 UNLOCK_BIGLOCK(d);
1471 return rc;
1475 int do_update_va_mapping(unsigned long page_nr,
1476 unsigned long val,
1477 unsigned long flags)
1479 struct exec_domain *ed = current;
1480 struct domain *d = ed->domain;
1481 int err = 0;
1482 unsigned int cpu = ed->processor;
1483 unsigned long deferred_ops;
1485 perfc_incrc(calls_to_update_va);
1487 if ( unlikely(page_nr >= (HYPERVISOR_VIRT_START >> PAGE_SHIFT)) )
1488 return -EINVAL;
1490 LOCK_BIGLOCK(d);
1492 cleanup_writable_pagetable(d, PTWR_CLEANUP_ACTIVE | PTWR_CLEANUP_INACTIVE);
1494 /*
1495 * XXX When we make this support 4MB superpages we should also deal with
1496 * the case of updating L2 entries.
1497 */
1499 if ( unlikely(!mod_l1_entry(&linear_pg_table[page_nr],
1500 mk_l1_pgentry(val))) )
1501 err = -EINVAL;
1503 if ( unlikely(ed->mm.shadow_mode) )
1505 unsigned long sval;
1507 l1pte_propagate_from_guest(&ed->mm, &val, &sval);
1509 if ( unlikely(__put_user(sval, ((unsigned long *)(
1510 &shadow_linear_pg_table[page_nr])))) )
1512 /*
1513 * Since L2's are guaranteed RW, failure indicates the page was not
1514 * shadowed, so ignore.
1515 */
1516 perfc_incrc(shadow_update_va_fail);
1519 /*
1520 * If we're in log-dirty mode then we need to note that we've updated
1521 * the PTE in the PT-holding page. We need the machine frame number
1522 * for this.
1523 */
1524 if ( ed->mm.shadow_mode == SHM_logdirty )
1525 mark_dirty(&current->mm, va_to_l1mfn(page_nr << PAGE_SHIFT));
1527 check_pagetable(&ed->mm, ed->mm.pagetable, "va"); /* debug */
1530 deferred_ops = percpu_info[cpu].deferred_ops;
1531 percpu_info[cpu].deferred_ops = 0;
1533 if ( unlikely(deferred_ops & DOP_FLUSH_TLB) ||
1534 unlikely(flags & UVMF_FLUSH_TLB) )
1535 local_flush_tlb();
1536 else if ( unlikely(flags & UVMF_INVLPG) )
1537 __flush_tlb_one(page_nr << PAGE_SHIFT);
1539 if ( unlikely(deferred_ops & DOP_RELOAD_LDT) )
1540 (void)map_ldt_shadow_page(0);
1542 UNLOCK_BIGLOCK(d);
1544 return err;
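/*
 * Editorial sketch (not in the original file): the single-PTE fast path
 * above as a guest would use it.  The HYPERVISOR_update_va_mapping()
 * wrapper name is an assumption; the argument encoding (va >> PAGE_SHIFT,
 * new PTE value, flush flags) follows the parameters of
 * do_update_va_mapping() itself.
 */
#if 0 /* guest-side example -- never compiled here */
    /* Remap one virtual page and flush just that TLB entry. */
    HYPERVISOR_update_va_mapping(va >> PAGE_SHIFT, new_pte_value, UVMF_INVLPG);
#endif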
1547 int do_update_va_mapping_otherdomain(unsigned long page_nr,
1548 unsigned long val,
1549 unsigned long flags,
1550 domid_t domid)
1552 unsigned int cpu = smp_processor_id();
1553 struct domain *d;
1554 int rc;
1556 if ( unlikely(!IS_PRIV(current->domain)) )
1557 return -EPERM;
1559 percpu_info[cpu].foreign = d = find_domain_by_id(domid);
1560 if ( unlikely(d == NULL) )
1562 MEM_LOG("Unknown domain '%u'", domid);
1563 return -ESRCH;
1566 rc = do_update_va_mapping(page_nr, val, flags);
1568 put_domain(d);
1569 percpu_info[cpu].foreign = NULL;
1571 return rc;
1576 /*************************
1577 * Writable Pagetables
1578 */
1580 ptwr_info_t ptwr_info[NR_CPUS];
1582 #ifdef VERBOSE
1583 int ptwr_debug = 0x0;
1584 #define PTWR_PRINTK(_f, _a...) \
1585 do { if ( unlikely(ptwr_debug) ) printk( _f , ## _a ); } while ( 0 )
1586 #define PTWR_PRINT_WHICH (which ? 'I' : 'A')
1587 #else
1588 #define PTWR_PRINTK(_f, _a...) ((void)0)
1589 #endif
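/*
 * Editorial note: the writable-pagetable machinery below works in two
 * halves.  ptwr_do_page_fault() catches a write to a read-only L1 page,
 * snapshots its contents, disconnects it from the L2 if it is ACTIVE, and
 * grants the guest a temporarily writable mapping.  ptwr_flush() later
 * re-write-protects the page, diffs it against the snapshot, and pushes
 * each modified PTE through get_page_from_l1e()/put_page_from_l1e(), so
 * the reference counts end up exactly as if the guest had issued a
 * do_mmu_update() for every change.
 */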
1591 /* Flush the given writable p.t. page and write-protect it again. */
1592 void ptwr_flush(const int which)
1594 unsigned long sstat, spte, pte, *ptep, l1va;
1595 l1_pgentry_t *sl1e = NULL, *pl1e, ol1e, nl1e;
1596 l2_pgentry_t *pl2e;
1597 int i, cpu = smp_processor_id();
1598 struct exec_domain *ed = current;
1599 struct domain *d = ed->domain;
1601 l1va = ptwr_info[cpu].ptinfo[which].l1va;
1602 ptep = (unsigned long *)&linear_pg_table[l1va>>PAGE_SHIFT];
1604 /*
1605 * STEP 1. Write-protect the p.t. page so no more updates can occur.
1606 */
1608 if ( unlikely(__get_user(pte, ptep)) )
1610 MEM_LOG("ptwr: Could not read pte at %p\n", ptep);
1611 /*
1612 * Really a bug. We could read this PTE during the initial fault,
1613 * and pagetables can't have changed meantime. XXX Multi-CPU guests?
1614 */
1615 BUG();
1617 PTWR_PRINTK("[%c] disconnected_l1va at %p is %08lx\n",
1618 PTWR_PRINT_WHICH, ptep, pte);
1619 pte &= ~_PAGE_RW;
1621 if ( unlikely(ed->mm.shadow_mode) )
1623 /* Write-protect the p.t. page in the shadow page table. */
1624 l1pte_propagate_from_guest(&ed->mm, &pte, &spte);
1625 __put_user(
1626 spte, (unsigned long *)&shadow_linear_pg_table[l1va>>PAGE_SHIFT]);
1628 /* Is the p.t. page itself shadowed? Map it into Xen space if so. */
1629 sstat = get_shadow_status(&ed->mm, pte >> PAGE_SHIFT);
1630 if ( sstat & PSH_shadowed )
1631 sl1e = map_domain_mem((sstat & PSH_pfn_mask) << PAGE_SHIFT);
1634 /* Write-protect the p.t. page in the guest page table. */
1635 if ( unlikely(__put_user(pte, ptep)) )
1637 MEM_LOG("ptwr: Could not update pte at %p\n", ptep);
1638 /*
1639 * Really a bug. We could write this PTE during the initial fault,
1640 * and pagetables can't have changed meantime. XXX Multi-CPU guests?
1641 */
1642 BUG();
1645 /* Ensure that there are no stale writable mappings in any TLB. */
1646 /* NB. INVLPG is a serialising instruction: flushes pending updates. */
1647 #if 0
1648 __flush_tlb_one(l1va); /* XXX Multi-CPU guests? */
1649 #else
1650 flush_tlb_all();
1651 #endif
1652 PTWR_PRINTK("[%c] disconnected_l1va at %p now %08lx\n",
1653 PTWR_PRINT_WHICH, ptep, pte);
1655 /*
1656 * STEP 2. Validate any modified PTEs.
1657 */
1659 pl1e = ptwr_info[cpu].ptinfo[which].pl1e;
1660 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
1662 ol1e = ptwr_info[cpu].ptinfo[which].page[i];
1663 nl1e = pl1e[i];
1665 if ( likely(l1_pgentry_val(ol1e) == l1_pgentry_val(nl1e)) )
1666 continue;
1668 /*
1669 * Fast path for PTEs that have merely been write-protected
1670 * (e.g., during a Unix fork()). A strict reduction in privilege.
1671 */
1672 if ( likely(l1_pgentry_val(ol1e) == (l1_pgentry_val(nl1e)|_PAGE_RW)) )
1674 if ( likely(l1_pgentry_val(nl1e) & _PAGE_PRESENT) )
1676 if ( unlikely(sl1e != NULL) )
1677 l1pte_propagate_from_guest(
1678 &ed->mm, &l1_pgentry_val(nl1e),
1679 &l1_pgentry_val(sl1e[i]));
1680 put_page_type(&frame_table[l1_pgentry_to_pagenr(nl1e)]);
1682 continue;
1685 if ( unlikely(!get_page_from_l1e(nl1e, d)) )
1687 MEM_LOG("ptwr: Could not re-validate l1 page\n");
1688 /*
1689 * Make the remaining p.t's consistent before crashing, so the
1690 * reference counts are correct.
1691 */
1692 memcpy(&pl1e[i], &ptwr_info[cpu].ptinfo[which].page[i],
1693 (ENTRIES_PER_L1_PAGETABLE - i) * sizeof(l1_pgentry_t));
1694 unmap_domain_mem(pl1e);
1695 ptwr_info[cpu].ptinfo[which].l1va = 0;
1696 UNLOCK_BIGLOCK(d);
1697 domain_crash();
1700 if ( unlikely(sl1e != NULL) )
1701 l1pte_propagate_from_guest(
1702 &ed->mm, &l1_pgentry_val(nl1e), &l1_pgentry_val(sl1e[i]));
1704 if ( unlikely(l1_pgentry_val(ol1e) & _PAGE_PRESENT) )
1705 put_page_from_l1e(ol1e, d);
1707 unmap_domain_mem(pl1e);
1709 /*
1710 * STEP 3. Reattach the L1 p.t. page into the current address space.
1711 */
1713 if ( (which == PTWR_PT_ACTIVE) && likely(!ed->mm.shadow_mode) )
1715 pl2e = &linear_l2_table[ptwr_info[cpu].ptinfo[which].l2_idx];
1716 *pl2e = mk_l2_pgentry(l2_pgentry_val(*pl2e) | _PAGE_PRESENT);
1719 /*
1720 * STEP 4. Final tidy-up.
1721 */
1723 ptwr_info[cpu].ptinfo[which].l1va = 0;
1725 if ( unlikely(sl1e != NULL) )
1727 unmap_domain_mem(sl1e);
1728 put_shadow_status(&ed->mm);
1732 /* Write page fault handler: check if guest is trying to modify a PTE. */
1733 int ptwr_do_page_fault(unsigned long addr)
1735 unsigned long pte, pfn, l2e;
1736 struct pfn_info *page;
1737 l2_pgentry_t *pl2e;
1738 int which, cpu = smp_processor_id();
1739 u32 l2_idx;
1740 struct domain *d = current->domain;
1742 LOCK_BIGLOCK(d);
1743 /*
1744 * Attempt to read the PTE that maps the VA being accessed. By checking for
1745 * PDE validity in the L2 we avoid many expensive fixups in __get_user().
1746 */
1747 if ( !(l2_pgentry_val(linear_l2_table[addr>>L2_PAGETABLE_SHIFT]) &
1748 _PAGE_PRESENT) ||
1749 __get_user(pte, (unsigned long *)&linear_pg_table[addr>>PAGE_SHIFT]) )
1751 UNLOCK_BIGLOCK(d);
1752 return 0;
1755 pfn = pte >> PAGE_SHIFT;
1756 page = &frame_table[pfn];
1758 /* We are looking only for read-only mappings of p.t. pages. */
1759 if ( ((pte & (_PAGE_RW | _PAGE_PRESENT)) != _PAGE_PRESENT) ||
1760 ((page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table) )
1762 UNLOCK_BIGLOCK(d);
1763 return 0;
1766 /* Get the L2 index at which this L1 p.t. is always mapped. */
1767 l2_idx = page->u.inuse.type_info & PGT_va_mask;
1768 if ( unlikely(l2_idx >= PGT_va_unknown) )
1770 UNLOCK_BIGLOCK(d);
1771 domain_crash(); /* Urk! This L1 is mapped in multiple L2 slots! */
1773 l2_idx >>= PGT_va_shift;
1775 if ( l2_idx == (addr >> L2_PAGETABLE_SHIFT) )
1777 MEM_LOG("PTWR failure! Pagetable maps itself at %08lx\n", addr);
1778 domain_crash();
1781 /*
1782 * Is the L1 p.t. mapped into the current address space? If so we call it
1783 * an ACTIVE p.t., otherwise it is INACTIVE.
1784 */
1785 pl2e = &linear_l2_table[l2_idx];
1786 l2e = l2_pgentry_val(*pl2e);
1787 which = PTWR_PT_INACTIVE;
1788 if ( (l2e >> PAGE_SHIFT) == pfn )
1790 /*
1791 * If the PRESENT bit is clear, we may be conflicting with the current
1792 * ACTIVE p.t. (it may be the same p.t. mapped at another virt addr).
1793 */
1794 if ( unlikely(!(l2e & _PAGE_PRESENT)) &&
1795 ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va )
1796 ptwr_flush(PTWR_PT_ACTIVE);
1798 /* Now do a final check of the PRESENT bit to set ACTIVE. */
1799 if ( likely(l2e & _PAGE_PRESENT) )
1800 which = PTWR_PT_ACTIVE;
1803 PTWR_PRINTK("[%c] page_fault on l1 pt at va %08lx, pt for %08x, "
1804 "pfn %08lx\n", PTWR_PRINT_WHICH,
1805 addr, l2_idx << L2_PAGETABLE_SHIFT, pfn);
1807 /*
1808 * We only allow one ACTIVE and one INACTIVE p.t. to be updated at a
1809 * time. If there is already one, we must flush it out.
1810 */
1811 if ( ptwr_info[cpu].ptinfo[which].l1va )
1812 ptwr_flush(which);
1814 ptwr_info[cpu].ptinfo[which].l1va = addr | 1;
1815 ptwr_info[cpu].ptinfo[which].l2_idx = l2_idx;
1817 /* For safety, disconnect the L1 p.t. page from current space. */
1818 if ( (which == PTWR_PT_ACTIVE) && likely(!current->mm.shadow_mode) )
1820 *pl2e = mk_l2_pgentry(l2e & ~_PAGE_PRESENT);
1821 #if 0
1822 flush_tlb(); /* XXX Multi-CPU guests? */
1823 #else
1824 flush_tlb_all();
1825 #endif
1828 /* Temporarily map the L1 page, and make a copy of it. */
1829 ptwr_info[cpu].ptinfo[which].pl1e = map_domain_mem(pfn << PAGE_SHIFT);
1830 memcpy(ptwr_info[cpu].ptinfo[which].page,
1831 ptwr_info[cpu].ptinfo[which].pl1e,
1832 ENTRIES_PER_L1_PAGETABLE * sizeof(l1_pgentry_t));
1834 /* Finally, make the p.t. page writable by the guest OS. */
1835 pte |= _PAGE_RW;
1836 PTWR_PRINTK("[%c] update %p pte to %08lx\n", PTWR_PRINT_WHICH,
1837 &linear_pg_table[addr>>PAGE_SHIFT], pte);
1838 if ( unlikely(__put_user(pte, (unsigned long *)
1839 &linear_pg_table[addr>>PAGE_SHIFT])) )
1841 MEM_LOG("ptwr: Could not update pte at %p\n", (unsigned long *)
1842 &linear_pg_table[addr>>PAGE_SHIFT]);
1843 /* Toss the writable pagetable state and crash. */
1844 unmap_domain_mem(ptwr_info[cpu].ptinfo[which].pl1e);
1845 ptwr_info[cpu].ptinfo[which].l1va = 0;
1846 UNLOCK_BIGLOCK(d);
1847 domain_crash();
1850 UNLOCK_BIGLOCK(d);
1852 return EXCRET_fault_fixed;
1855 static __init int ptwr_init(void)
1857 int i;
1859 for ( i = 0; i < smp_num_cpus; i++ )
1861 ptwr_info[i].ptinfo[PTWR_PT_ACTIVE].page =
1862 (void *)alloc_xenheap_page();
1863 ptwr_info[i].ptinfo[PTWR_PT_INACTIVE].page =
1864 (void *)alloc_xenheap_page();
1867 return 0;
1869 __initcall(ptwr_init);
1874 /************************************************************************/
1875 /************************************************************************/
1876 /************************************************************************/
1878 #ifndef NDEBUG
1880 void ptwr_status(void)
1882 unsigned long pte, *ptep, pfn;
1883 struct pfn_info *page;
1884 int cpu = smp_processor_id();
1886 ptep = (unsigned long *)&linear_pg_table
1887 [ptwr_info[cpu].ptinfo[PTWR_PT_INACTIVE].l1va>>PAGE_SHIFT];
1889 if ( __get_user(pte, ptep) ) {
1890 MEM_LOG("ptwr: Could not read pte at %p\n", ptep);
1891 domain_crash();
1894 pfn = pte >> PAGE_SHIFT;
1895 page = &frame_table[pfn];
1896 printk("need to alloc l1 page %p\n", page);
1897 /* make pt page writable */
1898 printk("need to make read-only l1-page at %p is %08lx\n",
1899 ptep, pte);
1901 if ( ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va == 0 )
1902 return;
1904 if ( __get_user(pte, (unsigned long *)
1905 ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va) ) {
1906 MEM_LOG("ptwr: Could not read pte at %p\n", (unsigned long *)
1907 ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va);
1908 domain_crash();
1910 pfn = pte >> PAGE_SHIFT;
1911 page = &frame_table[pfn];
1914 void audit_domain(struct domain *d)
1916 int ttot=0, ctot=0, io_mappings=0, lowmem_mappings=0;
1918 void adjust (struct pfn_info *page, int dir, int adjtype)
1920 int count = page->count_info & PGC_count_mask;
1922 if ( adjtype )
1924 int tcount = page->u.inuse.type_info & PGT_count_mask;
1926 ttot++;
1928 tcount += dir;
1930 if ( tcount < 0 )
1932 /* This will only come out once. */
1933 printk("Audit %d: type count went below zero pfn=%x "
1934 "taf=%x otaf=%x\n",
1935 d->id, page-frame_table,
1936 page->u.inuse.type_info,
1937 page->tlbflush_timestamp);
1940 page->u.inuse.type_info =
1941 (page->u.inuse.type_info & ~PGT_count_mask) |
1942 (tcount & PGT_count_mask);
1945 ctot++;
1946 count += dir;
1947 if ( count < 0 )
1949 /* This will only come out once. */
1950 printk("Audit %d: general count went below zero pfn=%x "
1951 "taf=%x otaf=%x\n",
1952 d->id, page-frame_table,
1953 page->u.inuse.type_info,
1954 page->tlbflush_timestamp);
1957 page->count_info =
1958 (page->count_info & ~PGC_count_mask) |
1959 (count & PGC_count_mask);
1963 void scan_for_pfn(struct domain *d, unsigned long xpfn)
1965 unsigned long pfn, *pt;
1966 struct list_head *list_ent;
1967 struct pfn_info *page;
1968 int i;
1970 list_ent = d->page_list.next;
1971 for ( i = 0; (list_ent != &d->page_list); i++ )
1973 pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
1974 page = &frame_table[pfn];
1976 switch ( page->u.inuse.type_info & PGT_type_mask )
1978 case PGT_l1_page_table:
1979 case PGT_l2_page_table:
1980 pt = map_domain_mem(pfn<<PAGE_SHIFT);
1981 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
1982 if ( (pt[i] & _PAGE_PRESENT) &&
1983 ((pt[i] >> PAGE_SHIFT) == xpfn) )
1984 printk(" found dom=%d i=%x pfn=%lx t=%x c=%x\n",
1985 d->id, i, pfn, page->u.inuse.type_info,
1986 page->count_info);
1987 unmap_domain_mem(pt);
1990 list_ent = frame_table[pfn].list.next;
1995 void scan_for_pfn_remote(unsigned long xpfn)
1997 struct domain *e;
1998 for_each_domain ( e )
1999 scan_for_pfn( e, xpfn );
2002 int i;
2003 unsigned long pfn;
2004 struct list_head *list_ent;
2005 struct pfn_info *page;
2007 if ( d != current->domain )
2008 domain_pause(d);
2009 synchronise_pagetables(~0UL);
2011 printk("pt base=%lx sh_info=%x\n",
2012 pagetable_val(d->exec_domain[0]->mm.pagetable)>>PAGE_SHIFT,
2013 virt_to_page(d->shared_info)-frame_table);
2015 spin_lock(&d->page_alloc_lock);
2017 /* PHASE 0 */
2019 list_ent = d->page_list.next;
2020 for ( i = 0; (list_ent != &d->page_list); i++ )
2022 pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
2023 page = &frame_table[pfn];
2025 if ( page->u.inuse.domain != d )
2026 BUG();
2028 if ( (page->u.inuse.type_info & PGT_count_mask) >
2029 (page->count_info & PGC_count_mask) )
2030 printk("taf > caf %x %x pfn=%lx\n",
2031 page->u.inuse.type_info, page->count_info, pfn );
2033 #if 0 /* SYSV shared memory pages plus writeable files. */
2034 if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_writable_page &&
2035 (page->u.inuse.type_info & PGT_count_mask) > 1 )
2037 printk("writeable page with type count >1: pfn=%lx t=%x c=%x\n",
2038 pfn,
2039 page->u.inuse.type_info,
2040 page->count_info );
2041 scan_for_pfn_remote(pfn);
2043 #endif
2044 if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_none &&
2045 (page->u.inuse.type_info & PGT_count_mask) > 1 )
2047 printk("normal page with type count >1: pfn=%lx t=%x c=%x\n",
2048 pfn,
2049 page->u.inuse.type_info,
2050 page->count_info );
2053 /* Use tlbflush_timestamp to store original type_info. */
2054 page->tlbflush_timestamp = page->u.inuse.type_info;
2056 list_ent = frame_table[pfn].list.next;
2060 /* PHASE 1 */
2062 adjust(&frame_table[pagetable_val(d->exec_domain[0]->mm.pagetable)>>PAGE_SHIFT], -1, 1);
2064 list_ent = d->page_list.next;
2065 for ( i = 0; (list_ent != &d->page_list); i++ )
2067 unsigned long *pt;
2068 pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
2069 page = &frame_table[pfn];
2071 if ( page->u.inuse.domain != d )
2072 BUG();
2074 switch ( page->u.inuse.type_info & PGT_type_mask )
2076 case PGT_l2_page_table:
2078 if ( (page->u.inuse.type_info & PGT_validated) != PGT_validated )
2079 printk("Audit %d: L2 not validated %x\n",
2080 d->id, page->u.inuse.type_info);
2082 if ( (page->u.inuse.type_info & PGT_pinned) != PGT_pinned )
2083 printk("Audit %d: L2 not pinned %x\n",
2084 d->id, page->u.inuse.type_info);
2085 else
2086 adjust( page, -1, 1 );
2088 pt = map_domain_mem( pfn<<PAGE_SHIFT );
2090 for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
2092 if ( pt[i] & _PAGE_PRESENT )
2094 unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
2095 struct pfn_info *l1page = &frame_table[l1pfn];
2097 if ( l1page->u.inuse.domain != d )
2099 printk("L2: Skip bizarre page belonging to other "
2100 "dom %p\n", l1page->u.inuse.domain);
2101 continue;
2104 if ( (l1page->u.inuse.type_info & PGT_type_mask) ==
2105 PGT_l2_page_table )
2106 printk("Audit %d: [%x] Found %s Linear PT "
2107 "t=%x pfn=%lx\n", d->id, i,
2108 (l1pfn==pfn) ? "Self" : "Other",
2109 l1page->u.inuse.type_info,
2110 l1pfn);
2111 else if ( (l1page->u.inuse.type_info & PGT_type_mask) !=
2112 PGT_l1_page_table )
2113 printk("Audit %d: [%x] Expected L1 t=%x pfn=%lx\n",
2114 d->id, i,
2115 l1page->u.inuse.type_info,
2116 l1pfn);
2118 adjust(l1page, -1, 1);
2122 unmap_domain_mem(pt);
2124 break;
2127 case PGT_l1_page_table:
2129 if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned )
2130 adjust( page, -1, 1 );
2132 if ( (page->u.inuse.type_info & PGT_validated) != PGT_validated )
2133 printk("Audit %d: L1 not validated %x\n",
2134 d->id, page->u.inuse.type_info);
2135 #if 0
2136 if ( (page->u.inuse.type_info & PGT_pinned) != PGT_pinned )
2137 printk("Audit %d: L1 not pinned %x\n",
2138 d->id, page->u.inuse.type_info);
2139 #endif
2140 pt = map_domain_mem( pfn<<PAGE_SHIFT );
2142 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
2144 if ( pt[i] & _PAGE_PRESENT )
2146 unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
2147 struct pfn_info *l1page = &frame_table[l1pfn];
2149 if ( l1pfn < 0x100 )
2151 lowmem_mappings++;
2152 continue;
2155 if ( l1pfn > max_page )
2157 io_mappings++;
2158 continue;
2161 if ( pt[i] & _PAGE_RW )
2164 if ( (l1page->u.inuse.type_info & PGT_type_mask) ==
2165 PGT_l1_page_table ||
2166 (l1page->u.inuse.type_info & PGT_type_mask) ==
2167 PGT_l2_page_table )
2168 printk("Audit %d: [%x] Illegal RW t=%x pfn=%lx\n",
2169 d->id, i,
2170 l1page->u.inuse.type_info,
2171 l1pfn);
2175 if ( l1page->u.inuse.domain != d )
2177 printk("Audit %d: [%lx,%x] Skip foreign page dom=%lx "
2178 "pfn=%lx c=%08x t=%08x m2p=%lx\n",
2179 d->id, pfn, i,
2180 (unsigned long)l1page->u.inuse.domain,
2181 l1pfn,
2182 l1page->count_info,
2183 l1page->u.inuse.type_info,
2184 machine_to_phys_mapping[l1pfn]);
2185 continue;
2188 adjust(l1page, -1, 0);
2192 unmap_domain_mem(pt);
2194 break;
2197 list_ent = frame_table[pfn].list.next;
2200 if ( (io_mappings > 0) || (lowmem_mappings > 0) )
2201 printk("Audit %d: Found %d lowmem mappings and %d io mappings\n",
2202 d->id, lowmem_mappings, io_mappings);
2204 /* PHASE 2 */
2206 ctot = ttot = 0;
2207 list_ent = d->page_list.next;
2208 for ( i = 0; (list_ent != &d->page_list); i++ )
2210 pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
2211 page = &frame_table[pfn];
2213 switch ( page->u.inuse.type_info & PGT_type_mask)
2215 case PGT_l1_page_table:
2216 case PGT_l2_page_table:
2217 if ( (page->u.inuse.type_info & PGT_count_mask) != 0 )
2219 printk("Audit %d: type count!=0 t=%x ot=%x c=%x pfn=%lx\n",
2220 d->id, page->u.inuse.type_info,
2221 page->tlbflush_timestamp,
2222 page->count_info, pfn );
2223 scan_for_pfn_remote(pfn);
2225 default:
2226 if ( (page->count_info & PGC_count_mask) != 1 )
2228 printk("Audit %d: gen count!=1 (c=%x) t=%x ot=%x pfn=%lx\n",
2229 d->id,
2230 page->count_info,
2231 page->u.inuse.type_info,
2232 page->tlbflush_timestamp, pfn );
2233 scan_for_pfn_remote(pfn);
2235 break;
2238 list_ent = frame_table[pfn].list.next;
2241 /* PHASE 3 */
2243 list_ent = d->page_list.next;
2244 for ( i = 0; (list_ent != &d->page_list); i++ )
2246 unsigned long *pt;
2247 pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
2248 page = &frame_table[pfn];
2250 switch ( page->u.inuse.type_info & PGT_type_mask )
2252 case PGT_l2_page_table:
2253 if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned )
2254 adjust( page, 1, 1 );
2256 pt = map_domain_mem( pfn<<PAGE_SHIFT );
2258 for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
2260 if ( pt[i] & _PAGE_PRESENT )
2262 unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
2263 struct pfn_info *l1page = &frame_table[l1pfn];
2265 if ( l1page->u.inuse.domain == d)
2266 adjust(l1page, 1, 1);
2270 unmap_domain_mem(pt);
2271 break;
2273 case PGT_l1_page_table:
2274 if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned )
2275 adjust( page, 1, 1 );
2277 pt = map_domain_mem( pfn<<PAGE_SHIFT );
2279 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
2281 if ( pt[i] & _PAGE_PRESENT )
2283 unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
2284 struct pfn_info *l1page = &frame_table[l1pfn];
2286 if ( (l1page->u.inuse.domain != d) ||
2287 (l1pfn < 0x100) || (l1pfn > max_page) )
2288 continue;
2290 adjust(l1page, 1, 0);
2294 unmap_domain_mem(pt);
2295 break;
2299 page->tlbflush_timestamp = 0;
2301 list_ent = frame_table[pfn].list.next;
2304 spin_unlock(&d->page_alloc_lock);
2306 adjust(&frame_table[pagetable_val(d->exec_domain[0]->mm.pagetable)>>PAGE_SHIFT], 1, 1);
2308 printk("Audit %d: Done. ctot=%d ttot=%d\n", d->id, ctot, ttot );
2310 if ( d != current->domain )
2311 domain_unpause(d);
2314 void audit_domains(void)
2316 struct domain *d;
2317 for_each_domain ( d )
2318 audit_domain(d);
2321 void audit_domains_key(unsigned char key)
2323 audit_domains();
2326 #endif