direct-io.hg: view of xen/arch/x86/memory.c @ 3231:9035b6656818

bitkeeper revision 1.1159.187.48 (41adc6420WlNaaoUkvfgNxl44rpYYg)

Export Xen s/w perfctrs to DOM0 via new 'xenperf' utility.

author      kaf24@scramble.cl.cam.ac.uk
date        Wed Dec 01 13:25:22 2004 +0000 (2004-12-01)
parents     aacdccdb52e5
children    f2e12f9f7cc8 a169836882cb
1 /******************************************************************************
2 * arch/x86/memory.c
3 *
4 * Copyright (c) 2002-2004 K A Fraser
5 * Copyright (c) 2004 Christian Limpach
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
22 /*
23 * A description of the x86 page table API:
24 *
25 * Domains trap to do_mmu_update with a list of update requests.
26 * This is a list of (ptr, val) pairs, where the requested operation
27 * is *ptr = val.
28 *
29 * Reference counting of pages:
30 * ----------------------------
31 * Each page has two refcounts: tot_count and type_count.
32 *
33 * TOT_COUNT is the obvious reference count. It counts all uses of a
34 * physical page frame by a domain, including uses as a page directory,
35 * a page table, or simple mappings via a PTE. This count prevents a
36 * domain from releasing a frame back to the free pool when it still holds
37 * a reference to it.
38 *
39 * TYPE_COUNT is more subtle. A frame can be put to one of three
40 * mutually-exclusive uses: it might be used as a page directory, or a
41 * page table, or it may be mapped writable by the domain [of course, a
42 * frame need not be used in any of these three ways!].
43 * So, type_count is a count of the number of times a frame is being
44 * referred to in its current incarnation. Therefore, a page can only
45 * change its type when its type count is zero.
46 *
47 * Pinning the page type:
48 * ----------------------
49 * The type of a page can be pinned/unpinned with the commands
50 * MMUEXT_[UN]PIN_L?_TABLE. Each page can be pinned exactly once (that is,
51 * pinning is not reference counted, so it can't be nested).
52 * This is useful to prevent a page's type count falling to zero, at which
53 * point safety checks would need to be carried out next time the count
54 * is increased again.
55 *
56 * A further note on writable page mappings:
57 * -----------------------------------------
58 * For simplicity, the count of writable mappings for a page may not
59 * correspond to reality. The 'writable count' is incremented for every
60 * PTE which maps the page with the _PAGE_RW flag set. However, for
61 * write access to be possible the page directory entry must also have
62 * its _PAGE_RW bit set. We do not check this as it complicates the
63 * reference counting considerably [consider the case of multiple
64 * directory entries referencing a single page table, some with the RW
65 * bit set, others not -- it starts getting a bit messy].
66 * In normal use, this simplification shouldn't be a problem.
67 * However, the logic can be added if required.
68 *
69 * One more note on read-only page mappings:
70 * -----------------------------------------
71 * We want domains to be able to map pages for read-only access. The
72 * main reason is that page tables and directories should be readable
73 * by a domain, but it would not be safe for them to be writable.
74 * However, domains have free access to rings 1 & 2 of the Intel
75 * privilege model. In terms of page protection, these are considered
76 * to be part of 'supervisor mode'. The WP bit in CR0 controls whether
77 * read-only restrictions are respected in supervisor mode -- if the
78 * bit is clear then any mapped page is writable.
79 *
80 * We get round this by always setting the WP bit and disallowing
81 * updates to it. This is very unlikely to cause a problem for guest
82 * OS's, which will generally use the WP bit to simplify copy-on-write
83 * implementation (in that case, OS wants a fault when it writes to
84 * an application-supplied buffer).
85 */
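/*
 * Illustrative sketch (not part of the original hypervisor source): a
 * guest-side view of the (ptr, val) request list described above, issued
 * through the mmu_update hypercall that do_mmu_update() below services.
 * The wrapper name HYPERVISOR_mmu_update and the pte_machine_addr,
 * new_pte_val and l2_machine_addr variables are placeholders assumed for
 * illustration only; a real guest computes these from its own page tables.
 */
#if 0 /* illustrative only -- never compiled */
static void example_guest_pt_update(void)
{
    mmu_update_t req[2];
    unsigned int done = 0;

    /* Normal PT update: the low bits of 'ptr' select the command; the rest
     * is the machine address of the PTE to be overwritten with 'val'. */
    req[0].ptr = pte_machine_addr | MMU_NORMAL_PT_UPDATE;
    req[0].val = new_pte_val;

    /* Extended command: pin the frame at 'l2_machine_addr' as an L2 table,
     * so its type count cannot fall to zero behind the guest's back. */
    req[1].ptr = l2_machine_addr | MMU_EXTENDED_COMMAND;
    req[1].val = MMUEXT_PIN_L2_TABLE;

    if ( HYPERVISOR_mmu_update(req, 2, &done) != 0 )
        /* 'done' reports how many of the requests were applied. */ ;
}
#endif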
87 #include <xen/config.h>
88 #include <xen/init.h>
89 #include <xen/lib.h>
90 #include <xen/mm.h>
91 #include <xen/sched.h>
92 #include <xen/errno.h>
93 #include <xen/perfc.h>
94 #include <xen/irq.h>
95 #include <xen/softirq.h>
96 #include <asm/shadow.h>
97 #include <asm/page.h>
98 #include <asm/flushtlb.h>
99 #include <asm/io.h>
100 #include <asm/uaccess.h>
101 #include <asm/domain_page.h>
102 #include <asm/ldt.h>
104 #ifdef VERBOSE
105 #define MEM_LOG(_f, _a...) \
106 printk("DOM%u: (file=memory.c, line=%d) " _f "\n", \
107 current->id , __LINE__ , ## _a )
108 #else
109 #define MEM_LOG(_f, _a...) ((void)0)
110 #endif
112 static int alloc_l2_table(struct pfn_info *page);
113 static int alloc_l1_table(struct pfn_info *page);
114 static int get_page_from_pagenr(unsigned long page_nr, struct domain *d);
115 static int get_page_and_type_from_pagenr(unsigned long page_nr,
116 u32 type,
117 struct domain *d);
119 static void free_l2_table(struct pfn_info *page);
120 static void free_l1_table(struct pfn_info *page);
122 static int mod_l2_entry(l2_pgentry_t *, l2_pgentry_t, unsigned long);
123 static int mod_l1_entry(l1_pgentry_t *, l1_pgentry_t);
125 /* Used to defer flushing of memory structures. */
126 static struct {
127 #define DOP_FLUSH_TLB (1<<0) /* Flush the TLB. */
128 #define DOP_RELOAD_LDT (1<<1) /* Reload the LDT shadow mapping. */
129 unsigned long deferred_ops;
130 /* If non-NULL, specifies a foreign subject domain for some operations. */
131 struct domain *foreign;
132 } __cacheline_aligned percpu_info[NR_CPUS];
134 /*
135 * Returns the current foreign domain; defaults to the currently-executing
136 * domain if a foreign override hasn't been specified.
137 */
138 #define FOREIGNDOM (percpu_info[smp_processor_id()].foreign ? : current)
140 /* Private domain structs for DOMID_XEN and DOMID_IO. */
141 static struct domain *dom_xen, *dom_io;
143 void arch_init_memory(void)
144 {
145 unsigned long mfn;
147 /*
148 * We are rather picky about the layout of 'struct pfn_info'. The
149 * count_info and domain fields must be adjacent, as we perform atomic
150 * 64-bit operations on them. Also, just for sanity, we assert the size
151 * of the structure here.
152 */
153 if ( (offsetof(struct pfn_info, u.inuse.domain) !=
154 (offsetof(struct pfn_info, count_info) + sizeof(u32))) ||
155 (sizeof(struct pfn_info) != 24) )
156 {
157 printk("Weird pfn_info layout (%ld,%ld,%d)\n",
158 offsetof(struct pfn_info, count_info),
159 offsetof(struct pfn_info, u.inuse.domain),
160 sizeof(struct pfn_info));
161 for ( ; ; ) ;
162 }
164 memset(percpu_info, 0, sizeof(percpu_info));
166 /* Initialise to a magic of 0x55555555 so easier to spot bugs later. */
167 memset(machine_to_phys_mapping, 0x55, 4<<20);
169 /*
170 * Initialise our DOMID_XEN domain.
171 * Any Xen-heap pages that we will allow to be mapped will have
172 * their domain field set to dom_xen.
173 */
174 dom_xen = alloc_domain_struct();
175 atomic_set(&dom_xen->refcnt, 1);
176 dom_xen->id = DOMID_XEN;
178 /*
179 * Initialise our DOMID_IO domain.
180 * This domain owns no pages but is considered a special case when
181 * mapping I/O pages, as the mappings occur at the priv of the caller.
182 */
183 dom_io = alloc_domain_struct();
184 atomic_set(&dom_io->refcnt, 1);
185 dom_io->id = DOMID_IO;
187 /* M2P table is mappable read-only by privileged domains. */
188 for ( mfn = virt_to_phys(&machine_to_phys_mapping[0<<20])>>PAGE_SHIFT;
189 mfn < virt_to_phys(&machine_to_phys_mapping[1<<20])>>PAGE_SHIFT;
190 mfn++ )
191 {
192 frame_table[mfn].count_info = PGC_allocated | 1;
193 frame_table[mfn].u.inuse.type_info = PGT_gdt_page | 1; /* non-RW */
194 frame_table[mfn].u.inuse.domain = dom_xen;
195 }
196 }
198 static void __invalidate_shadow_ldt(struct domain *d)
199 {
200 int i;
201 unsigned long pfn;
202 struct pfn_info *page;
204 d->mm.shadow_ldt_mapcnt = 0;
206 for ( i = 16; i < 32; i++ )
207 {
208 pfn = l1_pgentry_to_pagenr(d->mm.perdomain_pt[i]);
209 if ( pfn == 0 ) continue;
210 d->mm.perdomain_pt[i] = mk_l1_pgentry(0);
211 page = &frame_table[pfn];
212 ASSERT_PAGE_IS_TYPE(page, PGT_ldt_page);
213 ASSERT_PAGE_IS_DOMAIN(page, d);
214 put_page_and_type(page);
215 }
217 /* Dispose of the (now possibly invalid) mappings from the TLB. */
218 percpu_info[d->processor].deferred_ops |= DOP_FLUSH_TLB | DOP_RELOAD_LDT;
219 }
222 static inline void invalidate_shadow_ldt(struct domain *d)
223 {
224 if ( d->mm.shadow_ldt_mapcnt != 0 )
225 __invalidate_shadow_ldt(d);
226 }
229 static int alloc_segdesc_page(struct pfn_info *page)
230 {
231 unsigned long *descs = map_domain_mem((page-frame_table) << PAGE_SHIFT);
232 int i;
234 for ( i = 0; i < 512; i++ )
235 if ( unlikely(!check_descriptor(&descs[i*2])) )
236 goto fail;
238 unmap_domain_mem(descs);
239 return 1;
241 fail:
242 unmap_domain_mem(descs);
243 return 0;
244 }
247 /* Map shadow page at offset @off. */
248 int map_ldt_shadow_page(unsigned int off)
249 {
250 struct domain *d = current;
251 unsigned long l1e;
253 if ( unlikely(in_irq()) )
254 BUG();
256 __get_user(l1e, (unsigned long *)&linear_pg_table[(d->mm.ldt_base >>
257 PAGE_SHIFT) + off]);
259 if ( unlikely(!(l1e & _PAGE_PRESENT)) ||
260 unlikely(!get_page_and_type(&frame_table[l1e >> PAGE_SHIFT],
261 d, PGT_ldt_page)) )
262 return 0;
264 d->mm.perdomain_pt[off + 16] = mk_l1_pgentry(l1e | _PAGE_RW);
265 d->mm.shadow_ldt_mapcnt++;
267 return 1;
268 }
271 static int get_page_from_pagenr(unsigned long page_nr, struct domain *d)
272 {
273 struct pfn_info *page = &frame_table[page_nr];
275 if ( unlikely(!pfn_is_ram(page_nr)) )
276 {
277 MEM_LOG("Pfn %08lx is not RAM", page_nr);
278 return 0;
279 }
281 if ( unlikely(!get_page(page, d)) )
282 {
283 MEM_LOG("Could not get page ref for pfn %08lx", page_nr);
284 return 0;
285 }
287 return 1;
288 }
291 static int get_page_and_type_from_pagenr(unsigned long page_nr,
292 u32 type,
293 struct domain *d)
294 {
295 struct pfn_info *page = &frame_table[page_nr];
297 if ( unlikely(!get_page_from_pagenr(page_nr, d)) )
298 return 0;
300 if ( unlikely(!get_page_type(page, type)) )
301 {
302 #ifdef VERBOSE
303 if ( (type & PGT_type_mask) != PGT_l1_page_table )
304 MEM_LOG("Bad page type for pfn %08lx (%08x)",
305 page_nr, page->u.inuse.type_info);
306 #endif
307 put_page(page);
308 return 0;
309 }
311 return 1;
312 }
315 /*
316 * We allow L2 tables to map each other (a.k.a. linear page tables). This
317 * needs some special care with reference counts and access permissions:
318 * 1. The mapping entry must be read-only, or the guest may get write access
319 * to its own PTEs.
320 * 2. We must only bump the reference counts for an *already validated*
321 * L2 table, or we can end up in a deadlock in get_page_type() by waiting
322 * on a validation that is required to complete that validation.
323 * 3. We only need to increment the reference counts for the mapped page
324 * frame if it is mapped by a different L2 table. This is sufficient and
325 * also necessary to allow validation of an L2 table mapping itself.
326 */
327 static int
328 get_linear_pagetable(
329 l2_pgentry_t l2e, unsigned long pfn, struct domain *d)
330 {
331 u32 x, y;
332 struct pfn_info *page;
334 if ( (l2_pgentry_val(l2e) & _PAGE_RW) )
335 {
336 MEM_LOG("Attempt to create linear p.t. with write perms");
337 return 0;
338 }
340 if ( (l2_pgentry_val(l2e) >> PAGE_SHIFT) != pfn )
341 {
342 /* Make sure the mapped frame belongs to the correct domain. */
343 if ( unlikely(!get_page_from_pagenr(l2_pgentry_to_pagenr(l2e), d)) )
344 return 0;
346 /*
347 * Make sure that the mapped frame is an already-validated L2 table.
348 * If so, atomically increment the count (checking for overflow).
349 */
350 page = &frame_table[l2_pgentry_to_pagenr(l2e)];
351 y = page->u.inuse.type_info;
352 do {
353 x = y;
354 if ( unlikely((x & PGT_count_mask) == PGT_count_mask) ||
355 unlikely((x & (PGT_type_mask|PGT_validated)) !=
356 (PGT_l2_page_table|PGT_validated)) )
357 {
358 put_page(page);
359 return 0;
360 }
361 }
362 while ( (y = cmpxchg(&page->u.inuse.type_info, x, x + 1)) != x );
363 }
365 return 1;
366 }
369 static int
370 get_page_from_l1e(
371 l1_pgentry_t l1e, struct domain *d)
372 {
373 unsigned long l1v = l1_pgentry_val(l1e);
374 unsigned long pfn = l1_pgentry_to_pagenr(l1e);
375 struct pfn_info *page = &frame_table[pfn];
376 extern int domain_iomem_in_pfn(struct domain *d, unsigned long pfn);
378 if ( !(l1v & _PAGE_PRESENT) )
379 return 1;
381 if ( unlikely(l1v & (_PAGE_GLOBAL|_PAGE_PAT)) )
382 {
383 MEM_LOG("Bad L1 type settings %04lx", l1v & (_PAGE_GLOBAL|_PAGE_PAT));
384 return 0;
385 }
387 if ( unlikely(!pfn_is_ram(pfn)) )
388 {
389 /* Revert to caller privileges if FD == DOMID_IO. */
390 if ( d == dom_io )
391 d = current;
393 if ( IS_PRIV(d) )
394 return 1;
396 if ( IS_CAPABLE_PHYSDEV(d) )
397 return domain_iomem_in_pfn(d, pfn);
399 MEM_LOG("Non-privileged attempt to map I/O space %08lx", pfn);
400 return 0;
401 }
403 return ((l1v & _PAGE_RW) ?
404 get_page_and_type(page, d, PGT_writable_page) :
405 get_page(page, d));
406 }
409 /* NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. */
410 static int
411 get_page_from_l2e(
412 l2_pgentry_t l2e, unsigned long pfn,
413 struct domain *d, unsigned long va_idx)
414 {
415 int rc;
417 if ( !(l2_pgentry_val(l2e) & _PAGE_PRESENT) )
418 return 1;
420 if ( unlikely((l2_pgentry_val(l2e) & (_PAGE_GLOBAL|_PAGE_PSE))) )
421 {
422 MEM_LOG("Bad L2 page type settings %04lx",
423 l2_pgentry_val(l2e) & (_PAGE_GLOBAL|_PAGE_PSE));
424 return 0;
425 }
427 rc = get_page_and_type_from_pagenr(
428 l2_pgentry_to_pagenr(l2e),
429 PGT_l1_page_table | (va_idx<<PGT_va_shift), d);
431 if ( unlikely(!rc) )
432 return get_linear_pagetable(l2e, pfn, d);
434 return 1;
435 }
438 static void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d)
439 {
440 unsigned long l1v = l1_pgentry_val(l1e);
441 unsigned long pfn = l1_pgentry_to_pagenr(l1e);
442 struct pfn_info *page = &frame_table[pfn];
443 struct domain *e = page->u.inuse.domain;
445 if ( !(l1v & _PAGE_PRESENT) || !pfn_is_ram(pfn) )
446 return;
448 if ( unlikely(e != d) )
449 {
450 /*
451 * Unmap a foreign page that may have been mapped via a grant table.
452 * Note that this can fail for a privileged domain that can map foreign
453 * pages via MMUEXT_SET_FOREIGNDOM. Such domains can have some mappings
454 * counted via a grant entry and some counted directly in the page
455 * structure's reference count. Note that reference counts won't get
456 * dangerously confused as long as we always try to decrement the
457 * grant entry first. We may end up with a mismatch between which
458 * mappings and which unmappings are counted via the grant entry, but
459 * really it doesn't matter as privileged domains have carte blanche.
460 */
461 if ( likely(gnttab_check_unmap(e, d, pfn, !(l1v & _PAGE_RW))) )
462 return;
463 /* Assume this mapping was made via MMUEXT_SET_FOREIGNDOM... */
464 }
466 if ( l1v & _PAGE_RW )
467 {
468 put_page_and_type(page);
469 }
470 else
471 {
472 /* We expect this is rare so we blow the entire shadow LDT. */
473 if ( unlikely(((page->u.inuse.type_info & PGT_type_mask) ==
474 PGT_ldt_page)) &&
475 unlikely(((page->u.inuse.type_info & PGT_count_mask) != 0)) )
476 invalidate_shadow_ldt(e);
477 put_page(page);
478 }
479 }
482 /*
483 * NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'.
484 * Note also that this automatically deals correctly with linear p.t.'s.
485 */
486 static void put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
487 {
488 if ( (l2_pgentry_val(l2e) & _PAGE_PRESENT) &&
489 ((l2_pgentry_val(l2e) >> PAGE_SHIFT) != pfn) )
490 put_page_and_type(&frame_table[l2_pgentry_to_pagenr(l2e)]);
491 }
494 static int alloc_l2_table(struct pfn_info *page)
495 {
496 struct domain *d = page->u.inuse.domain;
497 unsigned long page_nr = page_to_pfn(page);
498 l2_pgentry_t *pl2e;
499 int i;
501 pl2e = map_domain_mem(page_nr << PAGE_SHIFT);
503 for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ ) {
504 if ( unlikely(!get_page_from_l2e(pl2e[i], page_nr, d, i)) )
505 goto fail;
506 }
508 #if defined(__i386__)
509 /* Now we add our private high mappings. */
510 memcpy(&pl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
511 &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
512 HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
513 pl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] =
514 mk_l2_pgentry((page_nr << PAGE_SHIFT) | __PAGE_HYPERVISOR);
515 pl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] =
516 mk_l2_pgentry(__pa(page->u.inuse.domain->mm.perdomain_pt) |
517 __PAGE_HYPERVISOR);
518 #endif
520 unmap_domain_mem(pl2e);
521 return 1;
523 fail:
524 while ( i-- > 0 )
525 put_page_from_l2e(pl2e[i], page_nr);
527 unmap_domain_mem(pl2e);
528 return 0;
529 }
532 static int alloc_l1_table(struct pfn_info *page)
533 {
534 struct domain *d = page->u.inuse.domain;
535 unsigned long page_nr = page_to_pfn(page);
536 l1_pgentry_t *pl1e;
537 int i;
539 pl1e = map_domain_mem(page_nr << PAGE_SHIFT);
541 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
542 if ( unlikely(!get_page_from_l1e(pl1e[i], d)) )
543 goto fail;
545 unmap_domain_mem(pl1e);
546 return 1;
548 fail:
549 while ( i-- > 0 )
550 put_page_from_l1e(pl1e[i], d);
552 unmap_domain_mem(pl1e);
553 return 0;
554 }
557 static void free_l2_table(struct pfn_info *page)
558 {
559 unsigned long page_nr = page - frame_table;
560 l2_pgentry_t *pl2e;
561 int i;
563 pl2e = map_domain_mem(page_nr << PAGE_SHIFT);
565 for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
566 put_page_from_l2e(pl2e[i], page_nr);
568 unmap_domain_mem(pl2e);
569 }
572 static void free_l1_table(struct pfn_info *page)
573 {
574 struct domain *d = page->u.inuse.domain;
575 unsigned long page_nr = page - frame_table;
576 l1_pgentry_t *pl1e;
577 int i;
579 pl1e = map_domain_mem(page_nr << PAGE_SHIFT);
581 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
582 put_page_from_l1e(pl1e[i], d);
584 unmap_domain_mem(pl1e);
585 }
588 static inline int update_l2e(l2_pgentry_t *pl2e,
589 l2_pgentry_t ol2e,
590 l2_pgentry_t nl2e)
591 {
592 unsigned long o = cmpxchg((unsigned long *)pl2e,
593 l2_pgentry_val(ol2e),
594 l2_pgentry_val(nl2e));
595 if ( o != l2_pgentry_val(ol2e) )
596 MEM_LOG("Failed to update %08lx -> %08lx: saw %08lx\n",
597 l2_pgentry_val(ol2e), l2_pgentry_val(nl2e), o);
598 return (o == l2_pgentry_val(ol2e));
599 }
602 /* Update the L2 entry at pl2e to new value nl2e. pl2e is within frame pfn. */
603 static int mod_l2_entry(l2_pgentry_t *pl2e,
604 l2_pgentry_t nl2e,
605 unsigned long pfn)
606 {
607 l2_pgentry_t ol2e;
608 unsigned long _ol2e;
610 if ( unlikely((((unsigned long)pl2e & (PAGE_SIZE-1)) >> 2) >=
611 DOMAIN_ENTRIES_PER_L2_PAGETABLE) )
612 {
613 MEM_LOG("Illegal L2 update attempt in Xen-private area %p", pl2e);
614 return 0;
615 }
617 if ( unlikely(__get_user(_ol2e, (unsigned long *)pl2e) != 0) )
618 return 0;
619 ol2e = mk_l2_pgentry(_ol2e);
621 if ( l2_pgentry_val(nl2e) & _PAGE_PRESENT )
622 {
623 /* Differ in mapping (bits 12-31) or presence (bit 0)? */
624 if ( ((l2_pgentry_val(ol2e) ^ l2_pgentry_val(nl2e)) & ~0xffe) == 0 )
625 return update_l2e(pl2e, ol2e, nl2e);
627 if ( unlikely(!get_page_from_l2e(nl2e, pfn, current,
628 ((unsigned long)pl2e &
629 ~PAGE_MASK) >> 2)) )
630 return 0;
632 if ( unlikely(!update_l2e(pl2e, ol2e, nl2e)) )
633 {
634 put_page_from_l2e(nl2e, pfn);
635 return 0;
636 }
638 put_page_from_l2e(ol2e, pfn);
639 return 1;
640 }
642 if ( unlikely(!update_l2e(pl2e, ol2e, nl2e)) )
643 return 0;
645 put_page_from_l2e(ol2e, pfn);
646 return 1;
647 }
650 static inline int update_l1e(l1_pgentry_t *pl1e,
651 l1_pgentry_t ol1e,
652 l1_pgentry_t nl1e)
653 {
654 unsigned long o = l1_pgentry_val(ol1e);
655 unsigned long n = l1_pgentry_val(nl1e);
657 if ( unlikely(cmpxchg_user(pl1e, o, n) != 0) ||
658 unlikely(o != l1_pgentry_val(ol1e)) )
659 {
660 MEM_LOG("Failed to update %08lx -> %08lx: saw %08lx\n",
661 l1_pgentry_val(ol1e), l1_pgentry_val(nl1e), o);
662 return 0;
663 }
665 return 1;
666 }
669 /* Update the L1 entry at pl1e to new value nl1e. */
670 static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e)
671 {
672 l1_pgentry_t ol1e;
673 unsigned long _ol1e;
674 struct domain *d = current;
676 if ( unlikely(__get_user(_ol1e, (unsigned long *)pl1e) != 0) )
677 {
678 MEM_LOG("Bad get_user\n");
679 return 0;
680 }
682 ol1e = mk_l1_pgentry(_ol1e);
684 if ( l1_pgentry_val(nl1e) & _PAGE_PRESENT )
685 {
686 /* Differ in mapping (bits 12-31), r/w (bit 1), or presence (bit 0)? */
687 if ( ((l1_pgentry_val(ol1e) ^ l1_pgentry_val(nl1e)) & ~0xffc) == 0 )
688 return update_l1e(pl1e, ol1e, nl1e);
690 if ( unlikely(!get_page_from_l1e(nl1e, FOREIGNDOM)) )
691 return 0;
693 if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
694 {
695 put_page_from_l1e(nl1e, d);
696 return 0;
697 }
699 put_page_from_l1e(ol1e, d);
700 return 1;
701 }
703 if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
704 return 0;
706 put_page_from_l1e(ol1e, d);
707 return 1;
708 }
711 int alloc_page_type(struct pfn_info *page, unsigned int type)
712 {
713 switch ( type )
714 {
715 case PGT_l1_page_table:
716 return alloc_l1_table(page);
717 case PGT_l2_page_table:
718 return alloc_l2_table(page);
719 case PGT_gdt_page:
720 case PGT_ldt_page:
721 return alloc_segdesc_page(page);
722 default:
723 printk("Bad type in alloc_page_type %x t=%x c=%x\n",
724 type, page->u.inuse.type_info,
725 page->count_info);
726 BUG();
727 }
729 return 0;
730 }
733 void free_page_type(struct pfn_info *page, unsigned int type)
734 {
735 struct domain *d = page->u.inuse.domain;
737 switch ( type )
738 {
739 case PGT_l1_page_table:
740 free_l1_table(page);
741 break;
743 case PGT_l2_page_table:
744 free_l2_table(page);
745 break;
747 default:
748 BUG();
749 }
751 if ( unlikely(d->mm.shadow_mode) &&
752 (get_shadow_status(&d->mm, page_to_pfn(page)) & PSH_shadowed) )
753 {
754 unshadow_table(page_to_pfn(page), type);
755 put_shadow_status(&d->mm);
756 }
757 }
760 void put_page_type(struct pfn_info *page)
761 {
762 u32 nx, x, y = page->u.inuse.type_info;
764 again:
765 do {
766 x = y;
767 nx = x - 1;
769 ASSERT((x & PGT_count_mask) != 0);
771 /*
772 * The page should always be validated while a reference is held. The
773 * exception is during domain destruction, when we forcibly invalidate
774 * page-table pages if we detect a referential loop.
775 * See domain.c:relinquish_list().
776 */
777 ASSERT((x & PGT_validated) ||
778 test_bit(DF_DYING, &page->u.inuse.domain->flags));
780 if ( unlikely((nx & PGT_count_mask) == 0) )
781 {
782 /* Record TLB information for flush later. Races are harmless. */
783 page->tlbflush_timestamp = tlbflush_current_time();
785 if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
786 likely(nx & PGT_validated) )
787 {
788 /*
789 * Page-table pages must be unvalidated when count is zero. The
790 * 'free' is safe because the refcnt is non-zero and validated
791 * bit is clear => other ops will spin or fail.
792 */
793 if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x,
794 x & ~PGT_validated)) != x) )
795 goto again;
796 /* We cleared the 'valid bit' so we do the clear up. */
797 free_page_type(page, x & PGT_type_mask);
798 /* Carry on, but with the 'valid bit' now clear. */
799 x &= ~PGT_validated;
800 nx &= ~PGT_validated;
801 }
802 }
803 else if ( unlikely((nx & (PGT_pinned | PGT_count_mask)) ==
804 (PGT_pinned | 1)) )
805 {
806 /* Page is now only pinned. Make the back pointer mutable again. */
807 nx |= PGT_va_mutable;
808 }
809 }
810 while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
811 }
814 int get_page_type(struct pfn_info *page, u32 type)
815 {
816 u32 nx, x, y = page->u.inuse.type_info;
818 again:
819 do {
820 x = y;
821 nx = x + 1;
822 if ( unlikely((nx & PGT_count_mask) == 0) )
823 {
824 MEM_LOG("Type count overflow on pfn %08lx\n", page_to_pfn(page));
825 return 0;
826 }
827 else if ( unlikely((x & PGT_count_mask) == 0) )
828 {
829 if ( (x & (PGT_type_mask|PGT_va_mask)) != type )
830 {
831 /*
832 * On type change we check to flush stale TLB entries. This
833 * may be unnecessary (e.g., page was GDT/LDT) but those
834 * circumstances should be very rare.
835 */
836 struct domain *d = page->u.inuse.domain;
837 if ( unlikely(NEED_FLUSH(tlbflush_time[d->processor],
838 page->tlbflush_timestamp)) )
839 {
840 perfc_incr(need_flush_tlb_flush);
841 flush_tlb_cpu(d->processor);
842 }
844 /* We lose existing type, back pointer, and validity. */
845 nx &= ~(PGT_type_mask | PGT_va_mask | PGT_validated);
846 nx |= type;
848 /* No special validation needed for writable pages. */
849 /* Page tables and GDT/LDT need to be scanned for validity. */
850 if ( type == PGT_writable_page )
851 nx |= PGT_validated;
852 }
853 }
854 else if ( unlikely((x & (PGT_type_mask|PGT_va_mask)) != type) )
855 {
856 if ( unlikely((x & PGT_type_mask) != (type & PGT_type_mask) ) )
857 {
858 if ( ((x & PGT_type_mask) != PGT_l2_page_table) ||
859 ((type & PGT_type_mask) != PGT_l1_page_table) )
860 MEM_LOG("Bad type (saw %08x != exp %08x) for pfn %08lx\n",
861 x & PGT_type_mask, type, page_to_pfn(page));
862 return 0;
863 }
864 else if ( (x & PGT_va_mask) == PGT_va_mutable )
865 {
866 /* The va backpointer is mutable, hence we update it. */
867 nx &= ~PGT_va_mask;
868 nx |= type; /* we know the actual type is correct */
869 }
870 else if ( unlikely((x & PGT_va_mask) != (type & PGT_va_mask)) )
871 {
872 /* This table is potentially mapped at multiple locations. */
873 nx &= ~PGT_va_mask;
874 nx |= PGT_va_unknown;
875 }
876 }
877 else if ( unlikely(!(x & PGT_validated)) )
878 {
879 /* Someone else is updating validation of this page. Wait... */
880 while ( (y = page->u.inuse.type_info) == x )
881 {
882 rep_nop();
883 barrier();
884 }
885 goto again;
886 }
887 }
888 while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
890 if ( unlikely(!(nx & PGT_validated)) )
891 {
892 /* Try to validate page type; drop the new reference on failure. */
893 if ( unlikely(!alloc_page_type(page, type & PGT_type_mask)) )
894 {
895 MEM_LOG("Error while validating pfn %08lx for type %08x."
896 " caf=%08x taf=%08x\n",
897 page_to_pfn(page), type,
898 page->count_info,
899 page->u.inuse.type_info);
900 /* No one else can get a reference. We hold the only ref. */
901 page->u.inuse.type_info = 0;
902 return 0;
903 }
905 /* No one else is updating simultaneously. */
906 __set_bit(_PGT_validated, &page->u.inuse.type_info);
907 }
909 return 1;
910 }
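/*
 * Illustrative sketch (not part of the original source): the lock-free
 * read-modify-write pattern that get_page_type() and put_page_type() above
 * apply to type_info. A snapshot 'x' is taken, the desired value 'nx' is
 * computed, and cmpxchg() installs 'nx' only if no other CPU changed the
 * word in the meantime; otherwise the loop retries with the fresh value.
 * The helper name below is hypothetical.
 */
#if 0 /* illustrative only -- never compiled */
static u32 example_adjust_type_count(struct pfn_info *page, int delta)
{
    u32 x, nx, y = page->u.inuse.type_info;
    do {
        x  = y;          /* snapshot the current word                    */
        nx = x + delta;  /* desired new word (validity checks omitted)   */
    }
    while ( (y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x );
    return nx;
}
#endif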
913 static int do_extended_command(unsigned long ptr, unsigned long val)
914 {
915 int okay = 1, cpu = smp_processor_id();
915 int okay = 1, cpu = smp_processor_id();
916 unsigned int cmd = val & MMUEXT_CMD_MASK;
917 unsigned long pfn = ptr >> PAGE_SHIFT;
918 unsigned long old_base_pfn;
919 struct pfn_info *page = &frame_table[pfn];
920 struct domain *d = current, *nd, *e;
921 u32 x, y;
922 domid_t domid;
923 grant_ref_t gntref;
925 switch ( cmd )
926 {
927 case MMUEXT_PIN_L1_TABLE:
928 case MMUEXT_PIN_L2_TABLE:
929 /*
930 * We insist that, if you pin an L1 page, it's the first thing that
931 * you do to it. This is because we require the backptr to still be
932 * mutable. This assumption seems safe.
933 */
934 okay = get_page_and_type_from_pagenr(
935 pfn,
936 ((cmd==MMUEXT_PIN_L2_TABLE) ?
937 PGT_l2_page_table : (PGT_l1_page_table|PGT_va_mutable)),
938 FOREIGNDOM);
940 if ( unlikely(!okay) )
941 {
942 MEM_LOG("Error while pinning pfn %08lx", pfn);
943 break;
944 }
946 if ( unlikely(test_and_set_bit(_PGT_pinned,
947 &page->u.inuse.type_info)) )
948 {
949 MEM_LOG("Pfn %08lx already pinned", pfn);
950 put_page_and_type(page);
951 okay = 0;
952 break;
953 }
955 break;
957 case MMUEXT_UNPIN_TABLE:
958 if ( unlikely(!(okay = get_page_from_pagenr(pfn, FOREIGNDOM))) )
959 {
960 MEM_LOG("Page %08lx bad domain (dom=%p)",
961 ptr, page->u.inuse.domain);
962 }
963 else if ( likely(test_and_clear_bit(_PGT_pinned,
964 &page->u.inuse.type_info)) )
965 {
966 put_page_and_type(page);
967 put_page(page);
968 }
969 else
970 {
971 okay = 0;
972 put_page(page);
973 MEM_LOG("Pfn %08lx not pinned", pfn);
974 }
975 break;
977 case MMUEXT_NEW_BASEPTR:
978 okay = get_page_and_type_from_pagenr(pfn, PGT_l2_page_table, d);
979 if ( likely(okay) )
980 {
981 invalidate_shadow_ldt(d);
983 percpu_info[cpu].deferred_ops &= ~DOP_FLUSH_TLB;
984 old_base_pfn = pagetable_val(d->mm.pagetable) >> PAGE_SHIFT;
985 d->mm.pagetable = mk_pagetable(pfn << PAGE_SHIFT);
987 shadow_mk_pagetable(&d->mm);
989 write_ptbase(&d->mm);
991 put_page_and_type(&frame_table[old_base_pfn]);
992 }
993 else
994 {
995 MEM_LOG("Error while installing new baseptr %08lx", ptr);
996 }
997 break;
999 case MMUEXT_TLB_FLUSH:
1000 percpu_info[cpu].deferred_ops |= DOP_FLUSH_TLB;
1001 break;
1003 case MMUEXT_INVLPG:
1004 __flush_tlb_one(ptr);
1005 break;
1007 case MMUEXT_FLUSH_CACHE:
1008 if ( unlikely(!IS_CAPABLE_PHYSDEV(d)) )
1009 {
1010 MEM_LOG("Non-physdev domain tried to FLUSH_CACHE.\n");
1011 okay = 0;
1012 }
1013 else
1014 {
1015 wbinvd();
1016 }
1017 break;
1019 case MMUEXT_SET_LDT:
1020 {
1021 unsigned long ents = val >> MMUEXT_CMD_SHIFT;
1022 if ( ((ptr & (PAGE_SIZE-1)) != 0) ||
1023 (ents > 8192) ||
1024 ((ptr+ents*LDT_ENTRY_SIZE) < ptr) ||
1025 ((ptr+ents*LDT_ENTRY_SIZE) > PAGE_OFFSET) )
1026 {
1027 okay = 0;
1028 MEM_LOG("Bad args to SET_LDT: ptr=%08lx, ents=%08lx", ptr, ents);
1029 }
1030 else if ( (d->mm.ldt_ents != ents) ||
1031 (d->mm.ldt_base != ptr) )
1032 {
1033 invalidate_shadow_ldt(d);
1034 d->mm.ldt_base = ptr;
1035 d->mm.ldt_ents = ents;
1036 load_LDT(d);
1037 percpu_info[cpu].deferred_ops &= ~DOP_RELOAD_LDT;
1038 if ( ents != 0 )
1039 percpu_info[cpu].deferred_ops |= DOP_RELOAD_LDT;
1040 }
1041 break;
1042 }
1044 case MMUEXT_SET_FOREIGNDOM:
1045 domid = (domid_t)(val >> 16);
1047 if ( (e = percpu_info[cpu].foreign) != NULL )
1048 put_domain(e);
1049 percpu_info[cpu].foreign = NULL;
1051 if ( !IS_PRIV(d) )
1052 {
1053 switch ( domid )
1054 {
1055 case DOMID_IO:
1056 get_knownalive_domain(dom_io);
1057 percpu_info[cpu].foreign = dom_io;
1058 break;
1059 default:
1060 MEM_LOG("Dom %u cannot set foreign dom\n", d->id);
1061 okay = 0;
1062 break;
1063 }
1064 }
1065 else
1066 {
1067 percpu_info[cpu].foreign = e = find_domain_by_id(domid);
1068 if ( e == NULL )
1069 {
1070 switch ( domid )
1071 {
1072 case DOMID_XEN:
1073 get_knownalive_domain(dom_xen);
1074 percpu_info[cpu].foreign = dom_xen;
1075 break;
1076 case DOMID_IO:
1077 get_knownalive_domain(dom_io);
1078 percpu_info[cpu].foreign = dom_io;
1079 break;
1080 default:
1081 MEM_LOG("Unknown domain '%u'", domid);
1082 okay = 0;
1083 break;
1084 }
1085 }
1086 }
1087 break;
1089 case MMUEXT_TRANSFER_PAGE:
1090 domid = (domid_t)(val >> 16);
1091 gntref = (grant_ref_t)((val & 0xFF00) | ((ptr >> 2) & 0x00FF));
1093 if ( unlikely(IS_XEN_HEAP_FRAME(page)) ||
1094 unlikely(!pfn_is_ram(pfn)) ||
1095 unlikely((e = find_domain_by_id(domid)) == NULL) )
1097 MEM_LOG("Bad frame (%08lx) or bad domid (%d).\n", pfn, domid);
1098 okay = 0;
1099 break;
1102 spin_lock(&d->page_alloc_lock);
1104 /*
1105 * The tricky bit: atomically release ownership while there is just one
1106 * benign reference to the page (PGC_allocated). If that reference
1107 * disappears then the deallocation routine will safely spin.
1108 */
1109 nd = page->u.inuse.domain;
1110 y = page->count_info;
1111 do {
1112 x = y;
1113 if ( unlikely((x & (PGC_count_mask|PGC_allocated)) !=
1114 (1|PGC_allocated)) ||
1115 unlikely(nd != d) )
1117 MEM_LOG("Bad page values %08lx: ed=%p(%u), sd=%p,"
1118 " caf=%08x, taf=%08x\n", page_to_pfn(page),
1119 d, d->id, nd, x, page->u.inuse.type_info);
1120 spin_unlock(&d->page_alloc_lock);
1121 put_domain(e);
1122 return 0;
1124 __asm__ __volatile__(
1125 LOCK_PREFIX "cmpxchg8b %2"
1126 : "=d" (nd), "=a" (y),
1127 "=m" (*(volatile u64 *)(&page->count_info))
1128 : "0" (d), "1" (x), "c" (NULL), "b" (x) );
1130 while ( unlikely(nd != d) || unlikely(y != x) );
1132 /*
1133 * Unlink from 'd'. At least one reference remains (now anonymous), so
1134 * no one else is spinning to try to delete this page from 'd'.
1135 */
1136 d->tot_pages--;
1137 list_del(&page->list);
1139 spin_unlock(&d->page_alloc_lock);
1141 spin_lock(&e->page_alloc_lock);
1143 /*
1144 * Check that 'e' will accept the page and has reservation headroom.
1145 * Also, a domain mustn't have PGC_allocated pages when it is dying.
1146 */
1147 ASSERT(e->tot_pages <= e->max_pages);
1148 if ( unlikely(test_bit(DF_DYING, &e->flags)) ||
1149 unlikely(e->tot_pages == e->max_pages) ||
1150 unlikely(!gnttab_prepare_for_transfer(e, d, gntref)) )
1152 MEM_LOG("Transferee has no reservation headroom (%d,%d), or "
1153 "provided a bad grant ref, or is dying (%08lx).\n",
1154 e->tot_pages, e->max_pages, e->flags);
1155 spin_unlock(&e->page_alloc_lock);
1156 put_domain(e);
1157 okay = 0;
1158 break;
1161 /* Okay, add the page to 'e'. */
1162 if ( unlikely(e->tot_pages++ == 0) )
1163 get_knownalive_domain(e);
1164 list_add_tail(&page->list, &e->page_list);
1165 page->u.inuse.domain = e;
1167 spin_unlock(&e->page_alloc_lock);
1169 /* Transfer is all done: tell the guest about its new page frame. */
1170 gnttab_notify_transfer(e, gntref, pfn);
1172 put_domain(e);
1173 break;
1175 case MMUEXT_REASSIGN_PAGE:
1176 if ( unlikely(!IS_PRIV(d)) )
1178 MEM_LOG("Dom %u has no reassignment priv", d->id);
1179 okay = 0;
1180 break;
1183 e = percpu_info[cpu].foreign;
1184 if ( unlikely(e == NULL) )
1186 MEM_LOG("No FOREIGNDOM to reassign pfn %08lx to", pfn);
1187 okay = 0;
1188 break;
1191 /*
1192 * Grab both page_list locks, in order. This prevents the page from
1193 * disappearing elsewhere while we modify the owner, and we'll need
1194 * both locks if we're successful so that we can change lists.
1195 */
1196 if ( d < e )
1198 spin_lock(&d->page_alloc_lock);
1199 spin_lock(&e->page_alloc_lock);
1201 else
1203 spin_lock(&e->page_alloc_lock);
1204 spin_lock(&d->page_alloc_lock);
1207 /* A domain shouldn't have PGC_allocated pages when it is dying. */
1208 if ( unlikely(test_bit(DF_DYING, &e->flags)) ||
1209 unlikely(IS_XEN_HEAP_FRAME(page)) )
1211 MEM_LOG("Reassignment page is Xen heap, or dest dom is dying.");
1212 okay = 0;
1213 goto reassign_fail;
1216 /*
1217 * The tricky bit: atomically change owner while there is just one
1218 * benign reference to the page (PGC_allocated). If that reference
1219 * disappears then the deallocation routine will safely spin.
1220 */
1221 nd = page->u.inuse.domain;
1222 y = page->count_info;
1223 do {
1224 x = y;
1225 if ( unlikely((x & (PGC_count_mask|PGC_allocated)) !=
1226 (1|PGC_allocated)) ||
1227 unlikely(nd != d) )
1229 MEM_LOG("Bad page values %08lx: ed=%p(%u), sd=%p,"
1230 " caf=%08x, taf=%08x\n", page_to_pfn(page),
1231 d, d->id, nd, x, page->u.inuse.type_info);
1232 okay = 0;
1233 goto reassign_fail;
1235 __asm__ __volatile__(
1236 LOCK_PREFIX "cmpxchg8b %3"
1237 : "=d" (nd), "=a" (y), "=c" (e),
1238 "=m" (*(volatile u64 *)(&page->count_info))
1239 : "0" (d), "1" (x), "c" (e), "b" (x) );
1241 while ( unlikely(nd != d) || unlikely(y != x) );
1243 /*
1244 * Unlink from 'd'. We transferred at least one reference to 'e', so
1245 * no one else is spinning to try to delete this page from 'd'.
1246 */
1247 d->tot_pages--;
1248 list_del(&page->list);
1250 /*
1251 * Add the page to 'e'. Someone may already have removed the last
1252 * reference and want to remove the page from 'e'. However, we have
1253 * the lock so they'll spin waiting for us.
1254 */
1255 if ( unlikely(e->tot_pages++ == 0) )
1256 get_knownalive_domain(e);
1257 list_add_tail(&page->list, &e->page_list);
1259 reassign_fail:
1260 spin_unlock(&d->page_alloc_lock);
1261 spin_unlock(&e->page_alloc_lock);
1262 break;
1264 case MMUEXT_CLEAR_FOREIGNDOM:
1265 if ( (e = percpu_info[cpu].foreign) != NULL )
1266 put_domain(e);
1267 percpu_info[cpu].foreign = NULL;
1268 break;
1270 default:
1271 MEM_LOG("Invalid extended pt command 0x%08lx", val & MMUEXT_CMD_MASK);
1272 okay = 0;
1273 break;
1274 }
1276 return okay;
1277 }
1279 int do_mmu_update(
1280 mmu_update_t *ureqs, unsigned int count, unsigned int *pdone)
1282 /*
1283 * We steal the m.s.b. of the @count parameter to indicate whether this
1284 * invocation of do_mmu_update() is resuming a previously preempted call.
1285 * We steal the next 15 bits to remember the current FOREIGNDOM.
1286 */
1287 #define MMU_UPDATE_PREEMPTED (~(~0U>>1))
1288 #define MMU_UPDATE_PREEMPT_FDOM_SHIFT ((sizeof(int)*8)-16)
1289 #define MMU_UPDATE_PREEMPT_FDOM_MASK (0x7FFFU<<MMU_UPDATE_PREEMPT_FDOM_SHIFT)
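/*
 * Illustrative note (not part of the original source): on preemption the
 * continuation created further down re-enters with a packed count, e.g.
 *     (count - i)
 *   | (FOREIGNDOM->id << MMU_UPDATE_PREEMPT_FDOM_SHIFT)
 *   | MMU_UPDATE_PREEMPTED
 * and the code below strips MMU_UPDATE_PREEMPTED, recovers the domid from
 * bits 16..30, and masks those bits off to leave the true remaining count.
 */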
1291 mmu_update_t req;
1292 unsigned long va = 0, deferred_ops, pfn, prev_pfn = 0;
1293 struct pfn_info *page;
1294 int rc = 0, okay = 1, i = 0, cpu = smp_processor_id();
1295 unsigned int cmd, done = 0;
1296 unsigned long prev_spfn = 0;
1297 l1_pgentry_t *prev_spl1e = 0;
1298 struct domain *d = current;
1299 u32 type_info;
1300 domid_t domid;
1302 cleanup_writable_pagetable(d, PTWR_CLEANUP_ACTIVE | PTWR_CLEANUP_INACTIVE);
1304 /*
1305 * If we are resuming after preemption, read how much work we have already
1306 * done. This allows us to set the @done output parameter correctly.
1307 * We also reset FOREIGNDOM here.
1308 */
1309 if ( unlikely(count&(MMU_UPDATE_PREEMPTED|MMU_UPDATE_PREEMPT_FDOM_MASK)) )
1311 if ( !(count & MMU_UPDATE_PREEMPTED) )
1313 /* Count overflow into private FOREIGNDOM field. */
1314 MEM_LOG("do_mmu_update count is too large");
1315 rc = -EINVAL;
1316 goto out;
1318 count &= ~MMU_UPDATE_PREEMPTED;
1319 domid = count >> MMU_UPDATE_PREEMPT_FDOM_SHIFT;
1320 count &= ~MMU_UPDATE_PREEMPT_FDOM_MASK;
1321 if ( unlikely(pdone != NULL) )
1322 (void)get_user(done, pdone);
1323 if ( (domid != current->id) &&
1324 !do_extended_command(0, MMUEXT_SET_FOREIGNDOM | (domid << 16)) )
1326 rc = -EINVAL;
1327 goto out;
1331 perfc_incrc(calls_to_mmu_update);
1332 perfc_addc(num_page_updates, count);
1334 if ( unlikely(!array_access_ok(VERIFY_READ, ureqs, count, sizeof(req))) )
1336 rc = -EFAULT;
1337 goto out;
1340 for ( i = 0; i < count; i++ )
1342 if ( hypercall_preempt_check() )
1344 rc = hypercall_create_continuation(
1345 __HYPERVISOR_mmu_update, 3, ureqs,
1346 (count - i) |
1347 (FOREIGNDOM->id << MMU_UPDATE_PREEMPT_FDOM_SHIFT) |
1348 MMU_UPDATE_PREEMPTED, pdone);
1349 break;
1352 if ( unlikely(__copy_from_user(&req, ureqs, sizeof(req)) != 0) )
1354 MEM_LOG("Bad __copy_from_user");
1355 rc = -EFAULT;
1356 break;
1359 cmd = req.ptr & (sizeof(l1_pgentry_t)-1);
1360 pfn = req.ptr >> PAGE_SHIFT;
1362 okay = 0;
1364 switch ( cmd )
1366 /*
1367 * MMU_NORMAL_PT_UPDATE: Normal update to any level of page table.
1368 */
1369 case MMU_NORMAL_PT_UPDATE:
1370 if ( unlikely(!get_page_from_pagenr(pfn, current)) )
1372 MEM_LOG("Could not get page for normal update");
1373 break;
1376 if ( likely(prev_pfn == pfn) )
1378 va = (va & PAGE_MASK) | (req.ptr & ~PAGE_MASK);
1380 else
1382 if ( prev_pfn != 0 )
1383 unmap_domain_mem((void *)va);
1384 va = (unsigned long)map_domain_mem(req.ptr);
1385 prev_pfn = pfn;
1388 page = &frame_table[pfn];
1389 switch ( (type_info = page->u.inuse.type_info) & PGT_type_mask )
1391 case PGT_l1_page_table:
1392 if ( likely(get_page_type(
1393 page, type_info & (PGT_type_mask|PGT_va_mask))) )
1395 okay = mod_l1_entry((l1_pgentry_t *)va,
1396 mk_l1_pgentry(req.val));
1398 if ( unlikely(d->mm.shadow_mode) && okay &&
1399 (get_shadow_status(&d->mm, page-frame_table) &
1400 PSH_shadowed) )
1402 shadow_l1_normal_pt_update(
1403 req.ptr, req.val, &prev_spfn, &prev_spl1e);
1404 put_shadow_status(&d->mm);
1407 put_page_type(page);
1409 break;
1410 case PGT_l2_page_table:
1411 if ( likely(get_page_type(page, PGT_l2_page_table)) )
1413 okay = mod_l2_entry((l2_pgentry_t *)va,
1414 mk_l2_pgentry(req.val),
1415 pfn);
1417 if ( unlikely(d->mm.shadow_mode) && okay &&
1418 (get_shadow_status(&d->mm, page-frame_table) &
1419 PSH_shadowed) )
1421 shadow_l2_normal_pt_update(req.ptr, req.val);
1422 put_shadow_status(&d->mm);
1425 put_page_type(page);
1427 break;
1428 default:
1429 if ( likely(get_page_type(page, PGT_writable_page)) )
1431 *(unsigned long *)va = req.val;
1432 okay = 1;
1433 put_page_type(page);
1435 break;
1438 put_page(page);
1439 break;
1441 case MMU_MACHPHYS_UPDATE:
1442 if ( unlikely(!get_page_from_pagenr(pfn, FOREIGNDOM)) )
1444 MEM_LOG("Could not get page for mach->phys update");
1445 break;
1448 machine_to_phys_mapping[pfn] = req.val;
1449 okay = 1;
1451 /*
1452 * If in log-dirty mode, mark the corresponding pseudo-physical
1453 * page as dirty.
1454 */
1455 if ( unlikely(d->mm.shadow_mode == SHM_logdirty) &&
1456 mark_dirty(&d->mm, pfn) )
1457 d->mm.shadow_dirty_block_count++;
1459 put_page(&frame_table[pfn]);
1460 break;
1462 /*
1463 * MMU_EXTENDED_COMMAND: Extended command is specified
1464 * in the least-significant bits of the 'value' field.
1465 */
1466 case MMU_EXTENDED_COMMAND:
1467 req.ptr &= ~(sizeof(l1_pgentry_t) - 1);
1468 okay = do_extended_command(req.ptr, req.val);
1469 break;
1471 default:
1472 MEM_LOG("Invalid page update command %08lx", req.ptr);
1473 break;
1476 if ( unlikely(!okay) )
1478 rc = -EINVAL;
1479 break;
1482 ureqs++;
1485 out:
1486 if ( prev_pfn != 0 )
1487 unmap_domain_mem((void *)va);
1489 if ( unlikely(prev_spl1e != 0) )
1490 unmap_domain_mem((void *)prev_spl1e);
1492 deferred_ops = percpu_info[cpu].deferred_ops;
1493 percpu_info[cpu].deferred_ops = 0;
1495 if ( deferred_ops & DOP_FLUSH_TLB )
1496 local_flush_tlb();
1498 if ( deferred_ops & DOP_RELOAD_LDT )
1499 (void)map_ldt_shadow_page(0);
1501 if ( unlikely(percpu_info[cpu].foreign != NULL) )
1503 put_domain(percpu_info[cpu].foreign);
1504 percpu_info[cpu].foreign = NULL;
1507 /* Add incremental work we have done to the @done output parameter. */
1508 if ( unlikely(pdone != NULL) )
1509 __put_user(done + i, pdone);
1511 return rc;
1515 int do_update_va_mapping(unsigned long page_nr,
1516 unsigned long val,
1517 unsigned long flags)
1519 struct domain *d = current;
1520 int err = 0;
1521 unsigned int cpu = d->processor;
1522 unsigned long deferred_ops;
1524 perfc_incrc(calls_to_update_va);
1526 if ( unlikely(page_nr >= (HYPERVISOR_VIRT_START >> PAGE_SHIFT)) )
1527 return -EINVAL;
1529 cleanup_writable_pagetable(d, PTWR_CLEANUP_ACTIVE | PTWR_CLEANUP_INACTIVE);
1531 /*
1532 * XXX When we make this support 4MB superpages we should also deal with
1533 * the case of updating L2 entries.
1534 */
1536 if ( unlikely(!mod_l1_entry(&linear_pg_table[page_nr],
1537 mk_l1_pgentry(val))) )
1538 err = -EINVAL;
1540 if ( unlikely(d->mm.shadow_mode) )
1542 unsigned long sval;
1544 l1pte_propagate_from_guest(&d->mm, &val, &sval);
1546 if ( unlikely(__put_user(sval, ((unsigned long *)(
1547 &shadow_linear_pg_table[page_nr])))) )
1549 /*
1550 * Since L2's are guaranteed RW, failure indicates the page was not
1551 * shadowed, so ignore.
1552 */
1553 perfc_incrc(shadow_update_va_fail);
1556 /*
1557 * If we're in log-dirty mode then we need to note that we've updated
1558 * the PTE in the PT-holding page. We need the machine frame number
1559 * for this.
1560 */
1561 if ( d->mm.shadow_mode == SHM_logdirty )
1562 mark_dirty(&current->mm, va_to_l1mfn(page_nr << PAGE_SHIFT));
1564 check_pagetable(&d->mm, d->mm.pagetable, "va"); /* debug */
1567 deferred_ops = percpu_info[cpu].deferred_ops;
1568 percpu_info[cpu].deferred_ops = 0;
1570 if ( unlikely(deferred_ops & DOP_FLUSH_TLB) ||
1571 unlikely(flags & UVMF_FLUSH_TLB) )
1572 local_flush_tlb();
1573 else if ( unlikely(flags & UVMF_INVLPG) )
1574 __flush_tlb_one(page_nr << PAGE_SHIFT);
1576 if ( unlikely(deferred_ops & DOP_RELOAD_LDT) )
1577 (void)map_ldt_shadow_page(0);
1579 return err;
1582 int do_update_va_mapping_otherdomain(unsigned long page_nr,
1583 unsigned long val,
1584 unsigned long flags,
1585 domid_t domid)
1587 unsigned int cpu = smp_processor_id();
1588 struct domain *d;
1589 int rc;
1591 if ( unlikely(!IS_PRIV(current)) )
1592 return -EPERM;
1594 percpu_info[cpu].foreign = d = find_domain_by_id(domid);
1595 if ( unlikely(d == NULL) )
1597 MEM_LOG("Unknown domain '%u'", domid);
1598 return -ESRCH;
1601 rc = do_update_va_mapping(page_nr, val, flags);
1603 put_domain(d);
1604 percpu_info[cpu].foreign = NULL;
1606 return rc;
1611 /*************************
1612 * Writable Pagetables
1613 */
1615 ptwr_info_t ptwr_info[NR_CPUS];
1617 #ifdef VERBOSE
1618 int ptwr_debug = 0x0;
1619 #define PTWR_PRINTK(_f, _a...) \
1620 do { if ( unlikely(ptwr_debug) ) printk( _f , ## _a ); } while ( 0 )
1621 #define PTWR_PRINT_WHICH (which ? 'I' : 'A')
1622 #else
1623 #define PTWR_PRINTK(_f, _a...) ((void)0)
1624 #endif
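/*
 * Illustrative overview (not part of the original source) of the writable
 * pagetable machinery implemented by ptwr_do_page_fault() and ptwr_flush()
 * below, assuming a single-CPU guest:
 *
 *   1. The guest writes to one of its own (read-only) L1 pages and faults
 *      into ptwr_do_page_fault(), which disconnects the L1 from the running
 *      address space if necessary, snapshots its contents, and gives the
 *      guest a temporarily writable mapping of it.
 *   2. When the page must become a live pagetable again, ptwr_flush()
 *      re-write-protects it, compares the snapshot against the current
 *      contents, and revalidates every PTE the guest changed (via
 *      get_page_from_l1e()) before reconnecting the L1.
 */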
1626 /* Flush the given writable p.t. page and write-protect it again. */
1627 void ptwr_flush(const int which)
1629 unsigned long sstat, spte, pte, *ptep, l1va;
1630 l1_pgentry_t *sl1e = NULL, *pl1e, ol1e, nl1e;
1631 l2_pgentry_t *pl2e;
1632 int i, cpu = smp_processor_id();
1633 struct domain *d = current;
1635 l1va = ptwr_info[cpu].ptinfo[which].l1va;
1636 ptep = (unsigned long *)&linear_pg_table[l1va>>PAGE_SHIFT];
1638 /*
1639 * STEP 1. Write-protect the p.t. page so no more updates can occur.
1640 */
1642 if ( unlikely(__get_user(pte, ptep)) )
1644 MEM_LOG("ptwr: Could not read pte at %p\n", ptep);
1645 /*
1646 * Really a bug. We could read this PTE during the initial fault,
1647 * and pagetables can't have changed meantime. XXX Multi-CPU guests?
1648 */
1649 BUG();
1651 PTWR_PRINTK("[%c] disconnected_l1va at %p is %08lx\n",
1652 PTWR_PRINT_WHICH, ptep, pte);
1653 pte &= ~_PAGE_RW;
1655 if ( unlikely(d->mm.shadow_mode) )
1657 /* Write-protect the p.t. page in the shadow page table. */
1658 l1pte_propagate_from_guest(&d->mm, &pte, &spte);
1659 __put_user(
1660 spte, (unsigned long *)&shadow_linear_pg_table[l1va>>PAGE_SHIFT]);
1662 /* Is the p.t. page itself shadowed? Map it into Xen space if so. */
1663 sstat = get_shadow_status(&d->mm, pte >> PAGE_SHIFT);
1664 if ( sstat & PSH_shadowed )
1665 sl1e = map_domain_mem((sstat & PSH_pfn_mask) << PAGE_SHIFT);
1668 /* Write-protect the p.t. page in the guest page table. */
1669 if ( unlikely(__put_user(pte, ptep)) )
1671 MEM_LOG("ptwr: Could not update pte at %p\n", ptep);
1672 /*
1673 * Really a bug. We could write this PTE during the initial fault,
1674 * and pagetables can't have changed meantime. XXX Multi-CPU guests?
1675 */
1676 BUG();
1679 /* Ensure that there are no stale writable mappings in any TLB. */
1680 /* NB. INVLPG is a serialising instruction: flushes pending updates. */
1681 __flush_tlb_one(l1va); /* XXX Multi-CPU guests? */
1682 PTWR_PRINTK("[%c] disconnected_l1va at %p now %08lx\n",
1683 PTWR_PRINT_WHICH, ptep, pte);
1685 /*
1686 * STEP 2. Validate any modified PTEs.
1687 */
1689 pl1e = ptwr_info[cpu].ptinfo[which].pl1e;
1690 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
1692 ol1e = ptwr_info[cpu].ptinfo[which].page[i];
1693 nl1e = pl1e[i];
1695 if ( likely(l1_pgentry_val(ol1e) == l1_pgentry_val(nl1e)) )
1696 continue;
1698 /*
1699 * Fast path for PTEs that have merely been write-protected
1700 * (e.g., during a Unix fork()). A strict reduction in privilege.
1701 */
1702 if ( likely(l1_pgentry_val(ol1e) == (l1_pgentry_val(nl1e)|_PAGE_RW)) )
1704 if ( likely(l1_pgentry_val(nl1e) & _PAGE_PRESENT) )
1706 if ( unlikely(sl1e != NULL) )
1707 l1pte_propagate_from_guest(
1708 &d->mm, &l1_pgentry_val(nl1e),
1709 &l1_pgentry_val(sl1e[i]));
1710 put_page_type(&frame_table[l1_pgentry_to_pagenr(nl1e)]);
1712 continue;
1715 if ( unlikely(!get_page_from_l1e(nl1e, d)) )
1717 MEM_LOG("ptwr: Could not re-validate l1 page\n");
1718 /*
1719 * Make the remaining p.t's consistent before crashing, so the
1720 * reference counts are correct.
1721 */
1722 memcpy(&pl1e[i], &ptwr_info[cpu].ptinfo[which].page[i],
1723 (ENTRIES_PER_L1_PAGETABLE - i) * sizeof(l1_pgentry_t));
1724 unmap_domain_mem(pl1e);
1725 ptwr_info[cpu].ptinfo[which].l1va = 0;
1726 domain_crash();
1729 if ( unlikely(sl1e != NULL) )
1730 l1pte_propagate_from_guest(
1731 &d->mm, &l1_pgentry_val(nl1e), &l1_pgentry_val(sl1e[i]));
1733 if ( unlikely(l1_pgentry_val(ol1e) & _PAGE_PRESENT) )
1734 put_page_from_l1e(ol1e, d);
1736 unmap_domain_mem(pl1e);
1738 /*
1739 * STEP 3. Reattach the L1 p.t. page into the current address space.
1740 */
1742 if ( (which == PTWR_PT_ACTIVE) && likely(!d->mm.shadow_mode) )
1744 pl2e = &linear_l2_table[ptwr_info[cpu].ptinfo[which].l2_idx];
1745 *pl2e = mk_l2_pgentry(l2_pgentry_val(*pl2e) | _PAGE_PRESENT);
1748 /*
1749 * STEP 4. Final tidy-up.
1750 */
1752 ptwr_info[cpu].ptinfo[which].l1va = 0;
1754 if ( unlikely(sl1e != NULL) )
1756 unmap_domain_mem(sl1e);
1757 put_shadow_status(&d->mm);
1761 /* Write page fault handler: check if guest is trying to modify a PTE. */
1762 int ptwr_do_page_fault(unsigned long addr)
1764 unsigned long pte, pfn, l2e;
1765 struct pfn_info *page;
1766 l2_pgentry_t *pl2e;
1767 int which, cpu = smp_processor_id();
1768 u32 l2_idx;
1770 /*
1771 * Attempt to read the PTE that maps the VA being accessed. By checking for
1772 * PDE validity in the L2 we avoid many expensive fixups in __get_user().
1773 */
1774 if ( !(l2_pgentry_val(linear_l2_table[addr>>L2_PAGETABLE_SHIFT]) &
1775 _PAGE_PRESENT) ||
1776 __get_user(pte, (unsigned long *)&linear_pg_table[addr>>PAGE_SHIFT]) )
1777 return 0;
1779 pfn = pte >> PAGE_SHIFT;
1780 page = &frame_table[pfn];
1782 /* We are looking only for read-only mappings of p.t. pages. */
1783 if ( ((pte & (_PAGE_RW | _PAGE_PRESENT)) != _PAGE_PRESENT) ||
1784 ((page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table) )
1785 return 0;
1787 /* Get the L2 index at which this L1 p.t. is always mapped. */
1788 l2_idx = page->u.inuse.type_info & PGT_va_mask;
1789 if ( unlikely(l2_idx >= PGT_va_unknown) )
1790 domain_crash(); /* Urk! This L1 is mapped in multiple L2 slots! */
1791 l2_idx >>= PGT_va_shift;
1793 if ( l2_idx == (addr >> L2_PAGETABLE_SHIFT) )
1795 MEM_LOG("PTWR failure! Pagetable maps itself at %08lx\n", addr);
1796 domain_crash();
1799 /*
1800 * Is the L1 p.t. mapped into the current address space? If so we call it
1801 * an ACTIVE p.t., otherwise it is INACTIVE.
1802 */
1803 pl2e = &linear_l2_table[l2_idx];
1804 l2e = l2_pgentry_val(*pl2e);
1805 which = PTWR_PT_INACTIVE;
1806 if ( (l2e >> PAGE_SHIFT) == pfn )
1808 /* Check the PRESENT bit to set ACTIVE. */
1809 if ( likely(l2e & _PAGE_PRESENT) )
1810 which = PTWR_PT_ACTIVE;
1811 else {
1812 /*
1813 * If the PRESENT bit is clear, we may be conflicting with
1814 * the current ACTIVE p.t. (it may be the same p.t. mapped
1815 * at another virt addr).
1816 * The ptwr_flush call below will restore the PRESENT bit.
1817 */
1818 if ( ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va &&
1819 l2_idx == ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l2_idx )
1820 which = PTWR_PT_ACTIVE;
1824 PTWR_PRINTK("[%c] page_fault on l1 pt at va %08lx, pt for %08x, "
1825 "pfn %08lx\n", PTWR_PRINT_WHICH,
1826 addr, l2_idx << L2_PAGETABLE_SHIFT, pfn);
1828 /*
1829 * We only allow one ACTIVE and one INACTIVE p.t. to be updated at a
1830 * time. If there is already one, we must flush it out.
1831 */
1832 if ( ptwr_info[cpu].ptinfo[which].l1va )
1833 ptwr_flush(which);
1835 ptwr_info[cpu].ptinfo[which].l1va = addr | 1;
1836 ptwr_info[cpu].ptinfo[which].l2_idx = l2_idx;
1838 /* For safety, disconnect the L1 p.t. page from current space. */
1839 if ( (which == PTWR_PT_ACTIVE) && likely(!current->mm.shadow_mode) )
1841 *pl2e = mk_l2_pgentry(l2e & ~_PAGE_PRESENT);
1842 flush_tlb(); /* XXX Multi-CPU guests? */
1845 /* Temporarily map the L1 page, and make a copy of it. */
1846 ptwr_info[cpu].ptinfo[which].pl1e = map_domain_mem(pfn << PAGE_SHIFT);
1847 memcpy(ptwr_info[cpu].ptinfo[which].page,
1848 ptwr_info[cpu].ptinfo[which].pl1e,
1849 ENTRIES_PER_L1_PAGETABLE * sizeof(l1_pgentry_t));
1851 /* Finally, make the p.t. page writable by the guest OS. */
1852 pte |= _PAGE_RW;
1853 PTWR_PRINTK("[%c] update %p pte to %08lx\n", PTWR_PRINT_WHICH,
1854 &linear_pg_table[addr>>PAGE_SHIFT], pte);
1855 if ( unlikely(__put_user(pte, (unsigned long *)
1856 &linear_pg_table[addr>>PAGE_SHIFT])) )
1858 MEM_LOG("ptwr: Could not update pte at %p\n", (unsigned long *)
1859 &linear_pg_table[addr>>PAGE_SHIFT]);
1860 /* Toss the writable pagetable state and crash. */
1861 unmap_domain_mem(ptwr_info[cpu].ptinfo[which].pl1e);
1862 ptwr_info[cpu].ptinfo[which].l1va = 0;
1863 domain_crash();
1866 return EXCRET_fault_fixed;
1869 static __init int ptwr_init(void)
1871 int i;
1873 for ( i = 0; i < smp_num_cpus; i++ )
1875 ptwr_info[i].ptinfo[PTWR_PT_ACTIVE].page =
1876 (void *)alloc_xenheap_page();
1877 ptwr_info[i].ptinfo[PTWR_PT_INACTIVE].page =
1878 (void *)alloc_xenheap_page();
1881 return 0;
1883 __initcall(ptwr_init);
1888 /************************************************************************/
1889 /************************************************************************/
1890 /************************************************************************/
1892 #ifndef NDEBUG
1894 void ptwr_status(void)
1896 unsigned long pte, *ptep, pfn;
1897 struct pfn_info *page;
1898 int cpu = smp_processor_id();
1900 ptep = (unsigned long *)&linear_pg_table
1901 [ptwr_info[cpu].ptinfo[PTWR_PT_INACTIVE].l1va>>PAGE_SHIFT];
1903 if ( __get_user(pte, ptep) ) {
1904 MEM_LOG("ptwr: Could not read pte at %p\n", ptep);
1905 domain_crash();
1908 pfn = pte >> PAGE_SHIFT;
1909 page = &frame_table[pfn];
1910 printk("need to alloc l1 page %p\n", page);
1911 /* make pt page writable */
1912 printk("need to make read-only l1-page at %p is %08lx\n",
1913 ptep, pte);
1915 if ( ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va == 0 )
1916 return;
1918 if ( __get_user(pte, (unsigned long *)
1919 ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va) ) {
1920 MEM_LOG("ptwr: Could not read pte at %p\n", (unsigned long *)
1921 ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va);
1922 domain_crash();
1924 pfn = pte >> PAGE_SHIFT;
1925 page = &frame_table[pfn];
1928 void audit_domain(struct domain *d)
1930 int ttot=0, ctot=0, io_mappings=0, lowmem_mappings=0;
1932 void adjust (struct pfn_info *page, int dir, int adjtype)
1934 int count = page->count_info & PGC_count_mask;
1936 if ( adjtype )
1938 int tcount = page->u.inuse.type_info & PGT_count_mask;
1940 ttot++;
1942 tcount += dir;
1944 if ( tcount < 0 )
1946 /* This will only come out once. */
1947 printk("Audit %d: type count whent below zero pfn=%x "
1948 "taf=%x otaf=%x\n",
1949 d->id, page-frame_table,
1950 page->u.inuse.type_info,
1951 page->tlbflush_timestamp);
1954 page->u.inuse.type_info =
1955 (page->u.inuse.type_info & ~PGT_count_mask) |
1956 (tcount & PGT_count_mask);
1959 ctot++;
1960 count += dir;
1961 if ( count < 0 )
1963 /* This will only come out once. */
1964 printk("Audit %d: general count whent below zero pfn=%x "
1965 "taf=%x otaf=%x\n",
1966 d->id, page-frame_table,
1967 page->u.inuse.type_info,
1968 page->tlbflush_timestamp);
1971 page->count_info =
1972 (page->count_info & ~PGC_count_mask) |
1973 (count & PGC_count_mask);
1977 void scan_for_pfn(struct domain *d, unsigned long xpfn)
1979 unsigned long pfn, *pt;
1980 struct list_head *list_ent;
1981 struct pfn_info *page;
1982 int i;
1984 list_ent = d->page_list.next;
1985 for ( i = 0; (list_ent != &d->page_list); i++ )
1987 pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
1988 page = &frame_table[pfn];
1990 switch ( page->u.inuse.type_info & PGT_type_mask )
1992 case PGT_l1_page_table:
1993 case PGT_l2_page_table:
1994 pt = map_domain_mem(pfn<<PAGE_SHIFT);
1995 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
1996 if ( (pt[i] & _PAGE_PRESENT) &&
1997 ((pt[i] >> PAGE_SHIFT) == xpfn) )
1998 printk(" found dom=%d i=%x pfn=%lx t=%x c=%x\n",
1999 d->id, i, pfn, page->u.inuse.type_info,
2000 page->count_info);
2001 unmap_domain_mem(pt);
2004 list_ent = frame_table[pfn].list.next;
2009 void scan_for_pfn_remote(unsigned long xpfn)
2011 struct domain *e;
2012 for_each_domain ( e )
2013 scan_for_pfn( e, xpfn );
2016 int i;
2017 unsigned long pfn;
2018 struct list_head *list_ent;
2019 struct pfn_info *page;
2021 if ( d != current )
2022 domain_pause(d);
2023 synchronise_pagetables(~0UL);
2025 printk("pt base=%lx sh_info=%x\n",
2026 pagetable_val(d->mm.pagetable)>>PAGE_SHIFT,
2027 virt_to_page(d->shared_info)-frame_table);
2029 spin_lock(&d->page_alloc_lock);
2031 /* PHASE 0 */
2033 list_ent = d->page_list.next;
2034 for ( i = 0; (list_ent != &d->page_list); i++ )
2036 pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
2037 page = &frame_table[pfn];
2039 if ( page->u.inuse.domain != d )
2040 BUG();
2042 if ( (page->u.inuse.type_info & PGT_count_mask) >
2043 (page->count_info & PGC_count_mask) )
2044 printk("taf > caf %x %x pfn=%lx\n",
2045 page->u.inuse.type_info, page->count_info, pfn );
2047 #if 0 /* SYSV shared memory pages plus writeable files. */
2048 if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_writable_page &&
2049 (page->u.inuse.type_info & PGT_count_mask) > 1 )
2051 printk("writeable page with type count >1: pfn=%lx t=%x c=%x\n",
2052 pfn,
2053 page->u.inuse.type_info,
2054 page->count_info );
2055 scan_for_pfn_remote(pfn);
2057 #endif
2058 if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_none &&
2059 (page->u.inuse.type_info & PGT_count_mask) > 1 )
2061 printk("normal page with type count >1: pfn=%lx t=%x c=%x\n",
2062 pfn,
2063 page->u.inuse.type_info,
2064 page->count_info );
2067 /* Use tlbflush_timestamp to store original type_info. */
2068 page->tlbflush_timestamp = page->u.inuse.type_info;
2070 list_ent = frame_table[pfn].list.next;
2074 /* PHASE 1 */
2076 adjust(&frame_table[pagetable_val(d->mm.pagetable)>>PAGE_SHIFT], -1, 1);
2078 list_ent = d->page_list.next;
2079 for ( i = 0; (list_ent != &d->page_list); i++ )
2081 unsigned long *pt;
2082 pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
2083 page = &frame_table[pfn];
2085 if ( page->u.inuse.domain != d )
2086 BUG();
2088 switch ( page->u.inuse.type_info & PGT_type_mask )
2090 case PGT_l2_page_table:
2092 if ( (page->u.inuse.type_info & PGT_validated) != PGT_validated )
2093 printk("Audit %d: L2 not validated %x\n",
2094 d->id, page->u.inuse.type_info);
2096 if ( (page->u.inuse.type_info & PGT_pinned) != PGT_pinned )
2097 printk("Audit %d: L2 not pinned %x\n",
2098 d->id, page->u.inuse.type_info);
2099 else
2100 adjust( page, -1, 1 );
2102 pt = map_domain_mem( pfn<<PAGE_SHIFT );
2104 for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
2106 if ( pt[i] & _PAGE_PRESENT )
2108 unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
2109 struct pfn_info *l1page = &frame_table[l1pfn];
2111 if ( l1page->u.inuse.domain != d )
2113 printk("L2: Skip bizarre page belonging to other "
2114 "dom %p\n", l1page->u.inuse.domain);
2115 continue;
2118 if ( (l1page->u.inuse.type_info & PGT_type_mask) ==
2119 PGT_l2_page_table )
2120 printk("Audit %d: [%x] Found %s Linear PT "
2121 "t=%x pfn=%lx\n", d->id, i,
2122 (l1pfn==pfn) ? "Self" : "Other",
2123 l1page->u.inuse.type_info,
2124 l1pfn);
2125 else if ( (l1page->u.inuse.type_info & PGT_type_mask) !=
2126 PGT_l1_page_table )
2127 printk("Audit %d: [%x] Expected L1 t=%x pfn=%lx\n",
2128 d->id, i,
2129 l1page->u.inuse.type_info,
2130 l1pfn);
2132 adjust(l1page, -1, 1);
2136 unmap_domain_mem(pt);
2138 break;
2141 case PGT_l1_page_table:
2143 if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned )
2144 adjust( page, -1, 1 );
2146 if ( (page->u.inuse.type_info & PGT_validated) != PGT_validated )
2147 printk("Audit %d: L1 not validated %x\n",
2148 d->id, page->u.inuse.type_info);
2149 #if 0
2150 if ( (page->u.inuse.type_info & PGT_pinned) != PGT_pinned )
2151 printk("Audit %d: L1 not pinned %x\n",
2152 d->id, page->u.inuse.type_info);
2153 #endif
2154 pt = map_domain_mem( pfn<<PAGE_SHIFT );
2156 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
2158 if ( pt[i] & _PAGE_PRESENT )
2160 unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
2161 struct pfn_info *l1page = &frame_table[l1pfn];
2163 if ( l1pfn < 0x100 )
2165 lowmem_mappings++;
2166 continue;
2169 if ( l1pfn > max_page )
2171 io_mappings++;
2172 continue;
2175 if ( pt[i] & _PAGE_RW )
2178 if ( (l1page->u.inuse.type_info & PGT_type_mask) ==
2179 PGT_l1_page_table ||
2180 (l1page->u.inuse.type_info & PGT_type_mask) ==
2181 PGT_l2_page_table )
2182 printk("Audit %d: [%x] Ilegal RW t=%x pfn=%lx\n",
2183 d->id, i,
2184 l1page->u.inuse.type_info,
2185 l1pfn);
2189 if ( l1page->u.inuse.domain != d )
2191 printk("Audit %d: [%lx,%x] Skip foreign page dom=%lx "
2192 "pfn=%lx c=%08x t=%08x m2p=%lx\n",
2193 d->id, pfn, i,
2194 (unsigned long)l1page->u.inuse.domain,
2195 l1pfn,
2196 l1page->count_info,
2197 l1page->u.inuse.type_info,
2198 machine_to_phys_mapping[l1pfn]);
2199 continue;
2202 adjust(l1page, -1, 0);
2206 unmap_domain_mem(pt);
2208 break;
2211 list_ent = frame_table[pfn].list.next;
2214 if ( (io_mappings > 0) || (lowmem_mappings > 0) )
2215 printk("Audit %d: Found %d lowmem mappings and %d io mappings\n",
2216 d->id, lowmem_mappings, io_mappings);
2218 /* PHASE 2 */
2220 ctot = ttot = 0;
2221 list_ent = d->page_list.next;
2222 for ( i = 0; (list_ent != &d->page_list); i++ )
2224 pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
2225 page = &frame_table[pfn];
2227 switch ( page->u.inuse.type_info & PGT_type_mask)
2229 case PGT_l1_page_table:
2230 case PGT_l2_page_table:
2231 if ( (page->u.inuse.type_info & PGT_count_mask) != 0 )
2233 printk("Audit %d: type count!=0 t=%x ot=%x c=%x pfn=%lx\n",
2234 d->id, page->u.inuse.type_info,
2235 page->tlbflush_timestamp,
2236 page->count_info, pfn );
2237 scan_for_pfn_remote(pfn);
2239 default:
2240 if ( (page->count_info & PGC_count_mask) != 1 )
2242 printk("Audit %d: gen count!=1 (c=%x) t=%x ot=%x pfn=%lx\n",
2243 d->id,
2244 page->count_info,
2245 page->u.inuse.type_info,
2246 page->tlbflush_timestamp, pfn );
2247 scan_for_pfn_remote(pfn);
2249 break;
2252 list_ent = frame_table[pfn].list.next;
2255 /* PHASE 3 */
2257 list_ent = d->page_list.next;
2258 for ( i = 0; (list_ent != &d->page_list); i++ )
2260 unsigned long *pt;
2261 pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
2262 page = &frame_table[pfn];
2264 switch ( page->u.inuse.type_info & PGT_type_mask )
2266 case PGT_l2_page_table:
2267 if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned )
2268 adjust( page, 1, 1 );
2270 pt = map_domain_mem( pfn<<PAGE_SHIFT );
2272 for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
2274 if ( pt[i] & _PAGE_PRESENT )
2276 unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
2277 struct pfn_info *l1page = &frame_table[l1pfn];
2279 if ( l1page->u.inuse.domain == d)
2280 adjust(l1page, 1, 1);
2284 unmap_domain_mem(pt);
2285 break;
2287 case PGT_l1_page_table:
2288 if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned )
2289 adjust( page, 1, 1 );
2291 pt = map_domain_mem( pfn<<PAGE_SHIFT );
2293 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
2295 if ( pt[i] & _PAGE_PRESENT )
2297 unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
2298 struct pfn_info *l1page = &frame_table[l1pfn];
2300 if ( (l1page->u.inuse.domain != d) ||
2301 (l1pfn < 0x100) || (l1pfn > max_page) )
2302 continue;
2304 adjust(l1page, 1, 0);
2308 unmap_domain_mem(pt);
2309 break;
2313 page->tlbflush_timestamp = 0;
2315 list_ent = frame_table[pfn].list.next;
2318 spin_unlock(&d->page_alloc_lock);
2320 adjust(&frame_table[pagetable_val(d->mm.pagetable)>>PAGE_SHIFT], 1, 1);
2322 printk("Audit %d: Done. ctot=%d ttot=%d\n", d->id, ctot, ttot );
2324 if ( d != current )
2325 domain_unpause(d);
2328 void audit_domains(void)
2330 struct domain *d;
2331 for_each_domain ( d )
2332 audit_domain(d);
2335 void audit_domains_key(unsigned char key)
2337 audit_domains();
2340 #endif